Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: removing symbols from QTL study identifiers #971

Merged
merged 9 commits into from
Jan 31, 2025
26 changes: 26 additions & 0 deletions src/gentropy/common/spark_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -885,3 +885,29 @@ def calculate_harmonic_sum(input_array: Column) -> Column:
/ f.pow(x["pos"], 2)
/ f.lit(sum(1 / ((i + 1) ** 2) for i in range(1000))),
)


def clean_strings_from_symbols(source: Column) -> Column:
    """Make strings URL-safe and consistent by lower-casing them and replacing special characters with underscores.

    The input is lower-cased first. Every run of one or more characters other
    than lower-case ASCII letters, digits, hyphen and underscore is then
    replaced by a single underscore. Hyphens and underscores are preserved.

    Args:
        source (Column): Source string

    Returns:
        Column: Cleaned string

    Examples:
        >>> d = [("AbCd-12.2",),("AaBb..123?",),("cDd!@#$%^&*()",),]
        >>> df = spark.createDataFrame(d).toDF("source")
        >>> df.withColumn("cleaned", clean_strings_from_symbols(f.col("source"))).show(truncate=False)
        +-------------+---------+
        |source       |cleaned  |
        +-------------+---------+
        |AbCd-12.2    |abcd-12_2|
        |AaBb..123?   |aabb_123_|
        |cDd!@#$%^&*()|cdd_     |
        +-------------+---------+
        <BLANKLINE>
    """
    # The hyphen is placed last in the character class so it is unambiguously a
    # literal and can never be parsed as part of a range. The trailing `+`
    # collapses consecutive disallowed characters into one replacement.
    characters_to_replace = r"[^a-z0-9_-]+"
    return f.regexp_replace(f.lower(source), characters_to_replace, "_")
4 changes: 3 additions & 1 deletion src/gentropy/dataset/study_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@

from gentropy.assets import data
from gentropy.common.schemas import parse_spark_schema
from gentropy.common.spark_helpers import convert_from_wide_to_long
from gentropy.common.spark_helpers import (
convert_from_wide_to_long,
)
from gentropy.dataset.dataset import Dataset

if TYPE_CHECKING:
Expand Down
16 changes: 10 additions & 6 deletions src/gentropy/datasource/eqtl_catalogue/finemapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
)

from gentropy.common.session import Session
from gentropy.common.spark_helpers import clean_strings_from_symbols
from gentropy.common.utils import parse_pvalue
from gentropy.dataset.study_locus import FinemappingMethod, StudyLocus
from gentropy.datasource.eqtl_catalogue.study_index import EqtlCatalogueStudyIndex
Expand Down Expand Up @@ -171,12 +172,15 @@ def parse_susie_results(
f.col("molecular_trait_id").alias("traitFromSource"),
f.col("gene_id").alias("geneId"),
f.col("dataset_id"),
f.concat_ws(
"_",
f.col("study_label"),
f.col("quant_method"),
f.col("sample_group"),
f.col("molecular_trait_id"),
# On creation, the studyId is cleaned of symbols:
clean_strings_from_symbols(
f.concat_ws(
"_",
f.col("study_label"),
f.col("quant_method"),
f.col("sample_group"),
f.col("molecular_trait_id"),
)
).alias("studyId"),
f.col("tissue_id").alias("biosampleFromSourceId"),
EqtlCatalogueStudyIndex._identify_study_type().alias("studyType"),
Expand Down
3 changes: 2 additions & 1 deletion src/gentropy/datasource/eqtl_catalogue/study_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ def _identify_study_type(
*[f.lit(x) for x in chain(*cls.method_to_qtl_type_mapping.items())]
)[f.col("quant_method")]
return f.when(
f.col("study_type") == "single-cell", f.concat(f.lit("sc"), qtl_type_mapping)
f.col("study_type") == "single-cell",
f.concat(f.lit("sc"), qtl_type_mapping),
).otherwise(qtl_type_mapping)

@classmethod
Expand Down
40 changes: 20 additions & 20 deletions tests/gentropy/data_samples/QTD000584.credible_sets.tsv
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
molecular_trait_id gene_id cs_id variant rsid cs_size pip pvalue beta se z cs_min_r2 region
RBP7.14208.3.3..1 ENSG00000162444 RBP7.14208.3.3..1_L1 chr1_10000458_T_C rs3003377 22 0.0170703689724805 1.33564e-10 0.139135 0.0215922 6.45120520807609 0.649351082174683 chr1:8997206-10997206
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The sample files were changed to provide a more complete representation of the issues caused by the special characters in the studyId.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice, thank you!

RBP7.14208.3.3..1 ENSG00000162444 RBP7.14208.3.3..1_L1 chr1_10003040_A_G rs3003378 22 0.0346598350987255 5.69374e-11 -0.140285 0.0213409 -6.58178370994891 0.649351082174683 chr1:8997206-10997206
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1004625_A_G rs2799056 14 0.0475804849731317 2.17772e-16 0.155149 0.018795 8.27268578736212 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1005429_C_CA rs58913475 14 0.0821809105318325 1.22244e-16 0.156317 0.018777 8.34295752984943 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1005429_C_CA rs397795410 14 0.0821809105318325 1.22244e-16 0.156317 0.018777 8.34295752984943 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1005904_C_T rs3128116 14 0.082073967987591 1.22412e-16 0.15631 0.0187765 8.34279413645199 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1005954_G_A rs57683598 14 0.0829345886044969 1.20956e-16 0.156326 0.0187752 8.34423990954914 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1006159_C_T rs9778087 14 0.0287755307773455 3.71195e-16 0.154747 0.0188957 8.20709279382202 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1007746_CTTTTTTTTTTTTTT_C rs1196345893 14 0.169853803516783 5.20268e-17 0.176669 0.020963 8.44875667909097 0.617640026801365 chr1:1138-2001138
RBP7:14208.3.3..1 ENSG00000162444 RBP7:14208.3.3..1_L1 chr1_10000458_T_C rs3003377 22 0.0170703689724805 1.33564e-10 0.139135 0.0215922 6.45120520807609 0.649351082174683 chr1:8997206-10997206
RBP7:14208.3.3..1 ENSG00000162444 RBP7:14208.3.3..1_L1 chr1_10003040_A_G rs3003378 22 0.0346598350987255 5.69374e-11 -0.140285 0.0213409 -6.58178370994891 0.649351082174683 chr1:8997206-10997206
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1004625_A_G rs2799056 14 0.0475804849731317 2.17772e-16 0.155149 0.018795 8.27268578736212 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1005429_C_CA rs58913475 14 0.0821809105318325 1.22244e-16 0.156317 0.018777 8.34295752984943 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1005429_C_CA rs397795410 14 0.0821809105318325 1.22244e-16 0.156317 0.018777 8.34295752984943 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1005904_C_T rs3128116 14 0.082073967987591 1.22412e-16 0.15631 0.0187765 8.34279413645199 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1005954_G_A rs57683598 14 0.0829345886044969 1.20956e-16 0.156326 0.0187752 8.34423990954914 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1006159_C_T rs9778087 14 0.0287755307773455 3.71195e-16 0.154747 0.0188957 8.20709279382202 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1007746_CTTTTTTTTTTTTTT_C rs1196345893 14 0.169853803516783 5.20268e-17 0.176669 0.020963 8.44875667909097 0.617640026801365 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1008088_T_C rs3121567 17 0.0408954841106202 5.10581e-17 0.436084 0.0517305 8.44446424543245 0.830135313092393 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1008307_G_C rs2465140 14 0.113047689891171 8.91487e-17 0.156234 0.0186815 8.3812176673309 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1008307_G_C rs2465140 14 0.113047689891171 8.91487e-17 0.156234 0.0186815 8.3812176673309 0.617640026801365 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L1 chr1_1008307_G_C rs2465140 6 0.0340123675250653 6.68955e-36 0.240866 0.0190257 12.6742943381175 0.960143135725474 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1009184_T_C rs3128117 14 0.114792998892356 8.78673e-17 0.156301 0.0186856 8.38296665698105 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1009184_T_C rs3128117 14 0.114792998892356 8.78673e-17 0.156301 0.0186856 8.38296665698105 0.617640026801365 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L1 chr1_1009184_T_C rs3128117 6 0.0368499383801147 5.5694e-36 0.241193 0.0190289 12.689400366898 0.960143135725474 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1009716_C_T rs3135457 17 0.0392096517327741 5.41887e-18 0.443041 0.0509582 8.70818986165733 0.830135313092393 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1009731_C_T rs13303172 17 0.0491133974154425 2.89124e-18 0.455466 0.0519529 8.78131609329298 0.830135313092393 chr1:1138-2001138
Expand All @@ -28,25 +28,25 @@ ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1013312_G_A rs2341
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1013490_C_G rs4615788 17 0.0596684312812006 1.30541e-18 0.434291 0.0490274 8.87346286235471 0.830135313092393 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1013541_T_C rs15842 17 0.0580138201506556 1.34141e-18 0.434388 0.0490555 8.87034155283964 0.830135313092393 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1013855_G_A rs2465124 17 0.0591361376280749 1.32867e-18 0.433762 0.0489788 8.87144043891645 0.830135313092393 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1014228_G_A rs1921 14 0.12433309239898 8.57119e-17 0.15341 0.0183335 8.38560743650197 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1014228_G_A rs1921 14 0.12433309239898 8.57119e-17 0.15341 0.0183335 8.38560743650197 0.617640026801365 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L1 chr1_1014228_G_A rs1921 6 0.819317788540627 1.62353e-36 0.238448 0.0186635 12.7903962671107 0.960143135725474 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1014545_C_T rs2799070 17 0.0932055530070119 6.4825e-18 0.414546 0.0477952 8.68848253629389 0.830135313092393 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1014863_A_C rs1891906 14 0.0193128429509125 5.74201e-16 0.152247 0.0187132 8.15329964672768 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1014863_A_C rs1891906 14 0.0193128429509125 5.74201e-16 0.152247 0.0187132 8.15329964672768 0.617640026801365 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L1 chr1_1014863_A_C rs1891906 6 0.0253214818240058 1.12696e-35 0.240358 0.0190503 12.6312379431961 0.960143135725474 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1015336_A_T rs2799069 17 0.0705760271470166 1.24902e-18 0.447782 0.0505217 8.87870564010476 0.830135313092393 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1015925_A_ATT rs575235313 14 0.0290657333363032 3.71821e-16 0.152991 0.0186817 8.2068585448562 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1015925_A_ATT rs369296755 14 0.0290657333363032 3.71821e-16 0.152991 0.0186817 8.2068585448562 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1015925_A_ATT rs59612205 14 0.0290657333363032 3.71821e-16 0.152991 0.0186817 8.2068585448562 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1015925_A_ATT rs575235313 14 0.0290657333363032 3.71821e-16 0.152991 0.0186817 8.2068585448562 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1015925_A_ATT rs369296755 14 0.0290657333363032 3.71821e-16 0.152991 0.0186817 8.2068585448562 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1015925_A_ATT rs59612205 14 0.0290657333363032 3.71821e-16 0.152991 0.0186817 8.2068585448562 0.617640026801365 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L1 chr1_1015925_A_ATT rs575235313 6 0.0224165593925054 1.03516e-35 0.240113 0.0190203 12.6381934222441 0.960143135725474 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L1 chr1_1015925_A_ATT rs369296755 6 0.0224165593925054 1.03516e-35 0.240113 0.0190203 12.6381934222441 0.960143135725474 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L1 chr1_1015925_A_ATT rs59612205 6 0.0224165593925054 1.03516e-35 0.240113 0.0190203 12.6381934222441 0.960143135725474 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1015950_G_A rs9697717 17 0.0572897456684484 5.86568e-18 0.418525 0.0481893 8.70025128920918 0.830135313092393 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1016623_G_A rs3128118 14 0.0249715969486798 4.40368e-16 0.152839 0.0187107 8.1860030389925 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1017114_A_AT rs59239970 14 0.0258900605545787 4.21998e-16 0.152767 0.0186899 8.19127935909139 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1017114_A_AT rs397687829 14 0.0258900605545787 4.21998e-16 0.152767 0.0186899 8.19127935909139 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1016623_G_A rs3128118 14 0.0249715969486798 4.40368e-16 0.152839 0.0187107 8.1860030389925 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1017114_A_AT rs59239970 14 0.0258900605545787 4.21998e-16 0.152767 0.0186899 8.19127935909139 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1017114_A_AT rs397687829 14 0.0258900605545787 4.21998e-16 0.152767 0.0186899 8.19127935909139 0.617640026801365 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L1 chr1_1017114_A_AT rs59239970 6 0.0248308249271214 9.24949e-36 0.240377 0.0190272 12.6474850406766 0.960143135725474 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L1 chr1_1017114_A_AT rs397687829 6 0.0248308249271214 9.24949e-36 0.240377 0.0190272 12.6474850406766 0.960143135725474 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1022518_G_T rs2799064 14 0.0319029278184674 2.66215e-16 0.155806 0.0189309 8.25406439760094 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1022518_G_T rs2799064 14 0.0319029278184674 2.66215e-16 0.155806 0.0189309 8.25406439760094 0.617640026801365 chr1:1138-2001138
AMY1A.7918.114.3..1 ENSG00000237763 AMY1A.7918.114.3..1_L8 chr1_103440589_TA_T rs1327439088 5 0.418485920864639 1.08665e-07 0.632842 0.118879 5.34468451936006 0.710818444681624 chr1:102655290-104655290
AMY1A.7918.114.3..1 ENSG00000237763 AMY1A.7918.114.3..1_L7 chr1_103534368_A_G rs36133328 1 0.999623621396104 0.282208 0.0493205 0.0458561 1.07174598226751 1 chr1:102655290-104655290
AMY1A.7918.114.3..1 ENSG00000237763 AMY1A.7918.114.3..1_L1 chr1_103537231_T_TA rs553487095 4 0.770652473010321 6.33883e-62 -1.04771 0.0617702 -16.992451463208 0.88938645908643 chr1:102655290-104655290
Expand Down
Loading
Loading