Skip to content

Commit 25f780a

Browse files
committed
Add clickhouse provisioning code
Moved from the pvannierop/cbioportal-clickhouse-pilot repository.
1 parent a317afe commit 25f780a

File tree

3 files changed

+221
-0
lines changed

3 files changed

+221
-0
lines changed

dev/clickhouse/DEV_SETUP.md

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Setup for development on Clickhouse integration
2+
3+
## Pre-requisites
4+
5+
1. cBioPortal database:
6+
7+
- provisioned with one or more study in the _cbioportal_ database.
8+
- user: _cbio_ password: _P@ssword1_ with all permissions on the _cbioportal_ database.
9+
- available on port 3306 on the host system.
10+
11+
3. System with docker and docker compose installed.
12+
13+
## Setup
14+
15+
All commands start from the root repository location.
16+
17+
1. Start cBioPortal database.
18+
2. Create MySQL views in the cBioPortal database by running the commands
19+
in [cbio_database_views.sql](mysql_provisioning/cbio_database_views.sql).
20+
3. Start Clickhouse (provisioned automatically).
21+
22+
```
23+
cd ./dev/clickhouse
24+
docker compose up -d
25+
```
26+
27+
This will start a Clickhouse instance that is available on the host system:
28+
- port: _8123_
29+
- database: _cbioportal_
30+
- username: _cbio_
31+
- password: _P@ssword1_
32+
33+
## TODO
34+
35+
- Extend [cbio_database_views.sql](mysql_provisioning/cbio_database_views.sql)
36+
and [c_import_mysql_data.sql](clickhouse_provisioning/c_import_mysql_data.sql) to cover all clickhouse tables.
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
insert into cbioportal.sample_in_genetic_profile
2+
select * from mysql(
3+
'127.0.0.1:3306',
4+
'cbioportal',
5+
'view_sample_in_genetic_profile',
6+
'cbio',
7+
'P@ssword1'
8+
);
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
# sample
2+
DROP VIEW IF EXISTS view_sample;
3+
CREATE VIEW view_sample AS
4+
SELECT
5+
concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID) as sample_unique_id,
6+
sample.STABLE_ID as sample_stable_id,
7+
concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id,
8+
p.STABLE_ID as patient_stable_id,
9+
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
10+
FROM sample
11+
INNER JOIN patient p on sample.PATIENT_ID = p.INTERNAL_ID
12+
INNER JOIN cancer_study cs on p.CANCER_STUDY_ID = cs.CANCER_STUDY_ID;
13+
14+
# sample list
15+
DROP VIEW IF EXISTS view_sample_list;
16+
CREATE VIEW view_sample_list AS
17+
SELECT
18+
concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
19+
sl.STABLE_ID as sample_list_stable_id,
20+
sl.NAME as name,
21+
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
22+
FROM sample_list as sl
23+
INNER JOIN sample_list_list as sll ON sll.LIST_ID = sl.LIST_ID
24+
INNER JOIN sample as s ON s.INTERNAL_ID = sll.SAMPLE_ID
25+
INNER JOIN cancer_study cs on sl.CANCER_STUDY_ID = cs.CANCER_STUDY_ID;
26+
27+
# genomic_event
28+
DROP TABLE IF EXISTS view_genomic_event;
29+
-- This view takes a long time to materialize. I store the data in a table to prevent repeated recalculations.
30+
# CREATE TABLE view_genomic_event AS
31+
# SELECT
32+
# concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID) as sample_unique_id,
33+
# gene.HUGO_GENE_SYMBOL as hugo_gene_symbol,
34+
# me.PROTEIN_CHANGE as variant,
35+
# gp.STABLE_ID as gene_panel_stable_id,
36+
# cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier,
37+
# g.STABLE_ID as genetic_profile_stable_id
38+
# FROM mutation
39+
# LEFT JOIN mutation_event as me ON mutation.MUTATION_EVENT_ID = me.MUTATION_EVENT_ID
40+
# LEFT JOIN sample_profile sp on mutation.SAMPLE_ID = sp.SAMPLE_ID and mutation.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID
41+
# LEFT JOIN gene_panel gp on sp.PANEL_ID = gp.INTERNAL_ID
42+
# LEFT JOIN genetic_profile g on sp.GENETIC_PROFILE_ID = g.GENETIC_PROFILE_ID
43+
# LEFT JOIN cancer_study cs on g.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
44+
# LEFT JOIN sample on mutation.SAMPLE_ID = sample.INTERNAL_ID
45+
# LEFT JOIN gene ON mutation.ENTREZ_GENE_ID = gene.ENTREZ_GENE_ID;
46+
# INSERT INTO view_genomic_event
47+
# SELECT
48+
# concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID) as sample_unique_id,
49+
# gene.HUGO_GENE_SYMBOL as hugo_gene_symbol,
50+
# convert(ce.ALTERATION, char) as variant,
51+
# gene_panel.STABLE_ID as gene_panel_stable_id,
52+
# cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier,
53+
# gp.STABLE_ID as genetic_profile_stable_id
54+
# FROM sample_cna_event
55+
# LEFT JOIN cna_event ce on sample_cna_event.CNA_EVENT_ID = ce.CNA_EVENT_ID
56+
# LEFT JOIN gene on ce.ENTREZ_GENE_ID = gene.ENTREZ_GENE_ID
57+
# LEFT JOIN genetic_profile gp on sample_cna_event.GENETIC_PROFILE_ID = gp.GENETIC_PROFILE_ID
58+
# LEFT JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID
59+
# LEFT JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
60+
# LEFT JOIN sample on sample_cna_event.SAMPLE_ID = sample.INTERNAL_ID
61+
# LEFT JOIN gene_panel ON sp.PANEL_ID = gene_panel.INTERNAL_ID;
62+
# INSERT INTO view_genomic_event
63+
# SELECT
64+
# concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
65+
# hugo_gene_symbol,
66+
# Event_Info as variant,
67+
# g.STABLE_ID as gene_panel_stable_id,
68+
# cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier,
69+
# gp.STABLE_ID as genetic_profile_stable_id
70+
# FROM structural_variant as sv
71+
# LEFT JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL as hugo_gene_symbol FROM gene) gene1 on gene1.ENTREZ_GENE_ID = sv.SITE1_ENTREZ_GENE_ID
72+
# LEFT OUTER JOIN genetic_profile gp on gp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID
73+
# LEFT JOIN sample s on sv.SAMPLE_ID = s.INTERNAL_ID
74+
# LEFT JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
75+
# LEFT JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID
76+
# LEFT JOIN gene_panel g on sp.PANEL_ID = g.INTERNAL_ID;
77+
# INSERT INTO view_genomic_event
78+
# SELECT
79+
# concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
80+
# hugo_gene_symbol,
81+
# Event_Info as variant,
82+
# g.STABLE_ID as gene_panel_stable_id,
83+
# cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier,
84+
# gp.STABLE_ID as genetic_profile_stable_id
85+
# FROM structural_variant as sv
86+
# LEFT JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL as hugo_gene_symbol FROM gene) gene2 on gene2.ENTREZ_GENE_ID = sv.SITE2_ENTREZ_GENE_ID
87+
# LEFT OUTER JOIN genetic_profile gp on gp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID
88+
# LEFT JOIN sample s on sv.SAMPLE_ID = s.INTERNAL_ID
89+
# LEFT JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
90+
# LEFT JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID
91+
# LEFT JOIN gene_panel g on sp.PANEL_ID = g.INTERNAL_ID;
92+
93+
-- structural variant
94+
DROP VIEW IF EXISTS view_structural_variant;
95+
CREATE VIEW view_structural_variant AS
96+
SELECT
97+
concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
98+
gene1.HUGO_GENE_SYMBOL as hugo_symbol_gene1,
99+
gene2.HUGO_GENE_SYMBOL as hugo_symbol_gene2,
100+
g.STABLE_ID as gene_panel_stable_id,
101+
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier,
102+
gp.STABLE_ID as genetic_profile_stable_id
103+
FROM structural_variant as sv
104+
LEFT JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL FROM gene) gene1 on gene1.ENTREZ_GENE_ID = sv.SITE1_ENTREZ_GENE_ID
105+
LEFT JOIN (SELECT ENTREZ_GENE_ID, HUGO_GENE_SYMBOL FROM gene) gene2 on gene2.ENTREZ_GENE_ID = sv.SITE2_ENTREZ_GENE_ID
106+
LEFT OUTER JOIN genetic_profile gp on gp.GENETIC_PROFILE_ID = sv.GENETIC_PROFILE_ID
107+
LEFT JOIN sample s on sv.SAMPLE_ID = s.INTERNAL_ID
108+
LEFT JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID
109+
LEFT JOIN sample_profile sp on gp.GENETIC_PROFILE_ID = sp.GENETIC_PROFILE_ID
110+
LEFT JOIN gene_panel g on sp.PANEL_ID = g.INTERNAL_ID;
111+
112+
-- sample_clinical_attribute_numeric
113+
DROP VIEW IF EXISTS view_sample_clinical_attribute_numeric;
114+
CREATE VIEW view_sample_clinical_attribute_numeric AS
115+
SELECT
116+
concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
117+
concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id,
118+
ATTR_ID as attribute_name,
119+
ATTR_VALUE as attribute_value,
120+
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
121+
FROM cancer_study cs
122+
INNER JOIN patient p on cs.CANCER_STUDY_ID = p.CANCER_STUDY_ID
123+
INNER JOIN sample s on p.INTERNAL_ID = s.PATIENT_ID
124+
INNER JOIN clinical_sample cs on s.INTERNAL_ID = cs.INTERNAL_ID
125+
WHERE ATTR_VALUE REGEXP '^[0-9.]+$';
126+
127+
-- sample_clinical_attribute_categorical
128+
DROP VIEW IF EXISTS view_sample_clinical_attribute_categorical;
129+
CREATE VIEW view_sample_clinical_attribute_categorical AS
130+
SELECT
131+
concat(cs.CANCER_STUDY_IDENTIFIER, '_', s.STABLE_ID) as sample_unique_id,
132+
concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id,
133+
ATTR_ID as attribute_name,
134+
ATTR_VALUE as attribute_value,
135+
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
136+
FROM cancer_study cs
137+
INNER JOIN patient p on cs.CANCER_STUDY_ID = p.CANCER_STUDY_ID
138+
INNER JOIN sample s on p.INTERNAL_ID = s.PATIENT_ID
139+
INNER JOIN clinical_sample cs on s.INTERNAL_ID = cs.INTERNAL_ID
140+
WHERE ATTR_VALUE NOT REGEXP '^[0-9.]+$';
141+
142+
-- patient_clinical_attribute_numeric
143+
DROP VIEW IF EXISTS view_patient_clinical_attribute_numeric;
144+
CREATE VIEW view_patient_clinical_attribute_numeric AS
145+
SELECT
146+
concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id,
147+
ATTR_ID as attribute_name,
148+
ATTR_VALUE as attribute_value,
149+
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
150+
FROM cancer_study cs
151+
INNER JOIN patient p on cs.CANCER_STUDY_ID = p.CANCER_STUDY_ID
152+
INNER JOIN clinical_patient cp on p.INTERNAL_ID = cp.INTERNAL_ID
153+
WHERE ATTR_VALUE REGEXP '^[0-9.]+$';
154+
155+
-- patient_clinical_attribute_categorical
156+
DROP VIEW IF EXISTS view_patient_clinical_attribute_categorical;
157+
CREATE VIEW view_patient_clinical_attribute_categorical AS
158+
SELECT
159+
concat(cs.CANCER_STUDY_IDENTIFIER, '_', p.STABLE_ID) as patient_unique_id,
160+
ATTR_ID as attribute_name,
161+
ATTR_VALUE as attribute_value,
162+
cs.CANCER_STUDY_IDENTIFIER as cancer_study_identifier
163+
FROM cancer_study cs
164+
INNER JOIN patient p on cs.CANCER_STUDY_ID = p.CANCER_STUDY_ID
165+
INNER JOIN clinical_patient cp on p.INTERNAL_ID = cp.INTERNAL_ID
166+
WHERE ATTR_VALUE NOT REGEXP '^[0-9.]+$';
167+
168+
-- sample_in_genomic_profile
169+
DROP VIEW IF EXISTS view_sample_in_genetic_profile;
170+
CREATE VIEW view_sample_in_genetic_profile AS
171+
SELECT
172+
concat(cs.CANCER_STUDY_IDENTIFIER, '_', sample.STABLE_ID) as sample_unique_id,
173+
replace(gp.STABLE_ID, concat(cs.CANCER_STUDY_IDENTIFIER, '_'), '') as genetic_profile_stable_id_short
174+
FROM sample
175+
INNER JOIN sample_profile sp on sample.INTERNAL_ID = sp.SAMPLE_ID
176+
INNER JOIN genetic_profile gp on sp.GENETIC_PROFILE_ID = gp.GENETIC_PROFILE_ID
177+
INNER JOIN cancer_study cs on gp.CANCER_STUDY_ID = cs.CANCER_STUDY_ID;

0 commit comments

Comments
 (0)