Skip to content

Commit

Permalink
chore: removing symbols from QTL study identifiers (#971)
Browse files Browse the repository at this point in the history
* chore: adding logic to sanitize QTL study identifiers

* fix: instead of using urllib.parse.quote, just use regexp_replace

* refactor: moving sanitizing logic to qtl ingestion
  • Loading branch information
DSuveges authored Jan 31, 2025
1 parent e7f7945 commit ce8afb9
Show file tree
Hide file tree
Showing 7 changed files with 128 additions and 74 deletions.
26 changes: 26 additions & 0 deletions src/gentropy/common/spark_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -886,3 +886,29 @@ def calculate_harmonic_sum(input_array: Column) -> Column:
/ f.pow(x["pos"], 2)
/ f.lit(sum(1 / ((i + 1) ** 2) for i in range(1000))),
)


def clean_strings_from_symbols(source: Column) -> Column:
    """Normalise a string column into a lower-case identifier intended to be URL-safe and consistent.

    The value is lower-cased first; afterwards every run of one or more characters
    other than ``a-z``, ``0-9``, ``-`` and ``_`` is replaced by a single underscore.

    Args:
        source (Column): Source string

    Returns:
        Column: Cleaned string

    Examples:
        >>> d = [("AbCd-12.2",),("AaBb..123?",),("cDd!@#$%^&*()",),]
        >>> df = spark.createDataFrame(d).toDF("source")
        >>> df.withColumn("cleaned", clean_strings_from_symbols(f.col("source"))).show(truncate=False)
        +-------------+---------+
        |source       |cleaned  |
        +-------------+---------+
        |AbCd-12.2    |abcd-12_2|
        |AaBb..123?   |aabb_123_|
        |cDd!@#$%^&*()|cdd_     |
        +-------------+---------+
        <BLANKLINE>
    """
    # Lower-casing happens before matching, so the pattern only needs the lower-case range.
    lowered = f.lower(source)
    # The trailing "+" makes a whole run of disallowed characters yield one underscore.
    return f.regexp_replace(lowered, r"[^a-z0-9-_]+", "_")
4 changes: 3 additions & 1 deletion src/gentropy/dataset/study_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@

from gentropy.assets import data
from gentropy.common.schemas import parse_spark_schema
from gentropy.common.spark_helpers import convert_from_wide_to_long
from gentropy.common.spark_helpers import (
convert_from_wide_to_long,
)
from gentropy.dataset.dataset import Dataset

if TYPE_CHECKING:
Expand Down
16 changes: 10 additions & 6 deletions src/gentropy/datasource/eqtl_catalogue/finemapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
)

from gentropy.common.session import Session
from gentropy.common.spark_helpers import clean_strings_from_symbols
from gentropy.common.utils import parse_pvalue
from gentropy.dataset.study_locus import FinemappingMethod, StudyLocus
from gentropy.datasource.eqtl_catalogue.study_index import EqtlCatalogueStudyIndex
Expand Down Expand Up @@ -171,12 +172,15 @@ def parse_susie_results(
f.col("molecular_trait_id").alias("traitFromSource"),
f.col("gene_id").alias("geneId"),
f.col("dataset_id"),
f.concat_ws(
"_",
f.col("study_label"),
f.col("quant_method"),
f.col("sample_group"),
f.col("molecular_trait_id"),
# Upon creation, the studyId is cleaned of symbols:
clean_strings_from_symbols(
f.concat_ws(
"_",
f.col("study_label"),
f.col("quant_method"),
f.col("sample_group"),
f.col("molecular_trait_id"),
)
).alias("studyId"),
f.col("tissue_id").alias("biosampleFromSourceId"),
EqtlCatalogueStudyIndex._identify_study_type().alias("studyType"),
Expand Down
3 changes: 2 additions & 1 deletion src/gentropy/datasource/eqtl_catalogue/study_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ def _identify_study_type(
*[f.lit(x) for x in chain(*cls.method_to_qtl_type_mapping.items())]
)[f.col("quant_method")]
return f.when(
f.col("study_type") == "single-cell", f.concat(f.lit("sc"), qtl_type_mapping)
f.col("study_type") == "single-cell",
f.concat(f.lit("sc"), qtl_type_mapping),
).otherwise(qtl_type_mapping)

@classmethod
Expand Down
40 changes: 20 additions & 20 deletions tests/gentropy/data_samples/QTD000584.credible_sets.tsv
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
molecular_trait_id gene_id cs_id variant rsid cs_size pip pvalue beta se z cs_min_r2 region
RBP7.14208.3.3..1 ENSG00000162444 RBP7.14208.3.3..1_L1 chr1_10000458_T_C rs3003377 22 0.0170703689724805 1.33564e-10 0.139135 0.0215922 6.45120520807609 0.649351082174683 chr1:8997206-10997206
RBP7.14208.3.3..1 ENSG00000162444 RBP7.14208.3.3..1_L1 chr1_10003040_A_G rs3003378 22 0.0346598350987255 5.69374e-11 -0.140285 0.0213409 -6.58178370994891 0.649351082174683 chr1:8997206-10997206
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1004625_A_G rs2799056 14 0.0475804849731317 2.17772e-16 0.155149 0.018795 8.27268578736212 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1005429_C_CA rs58913475 14 0.0821809105318325 1.22244e-16 0.156317 0.018777 8.34295752984943 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1005429_C_CA rs397795410 14 0.0821809105318325 1.22244e-16 0.156317 0.018777 8.34295752984943 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1005904_C_T rs3128116 14 0.082073967987591 1.22412e-16 0.15631 0.0187765 8.34279413645199 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1005954_G_A rs57683598 14 0.0829345886044969 1.20956e-16 0.156326 0.0187752 8.34423990954914 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1006159_C_T rs9778087 14 0.0287755307773455 3.71195e-16 0.154747 0.0188957 8.20709279382202 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1007746_CTTTTTTTTTTTTTT_C rs1196345893 14 0.169853803516783 5.20268e-17 0.176669 0.020963 8.44875667909097 0.617640026801365 chr1:1138-2001138
RBP7:14208.3.3..1 ENSG00000162444 RBP7:14208.3.3..1_L1 chr1_10000458_T_C rs3003377 22 0.0170703689724805 1.33564e-10 0.139135 0.0215922 6.45120520807609 0.649351082174683 chr1:8997206-10997206
RBP7:14208.3.3..1 ENSG00000162444 RBP7:14208.3.3..1_L1 chr1_10003040_A_G rs3003378 22 0.0346598350987255 5.69374e-11 -0.140285 0.0213409 -6.58178370994891 0.649351082174683 chr1:8997206-10997206
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1004625_A_G rs2799056 14 0.0475804849731317 2.17772e-16 0.155149 0.018795 8.27268578736212 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1005429_C_CA rs58913475 14 0.0821809105318325 1.22244e-16 0.156317 0.018777 8.34295752984943 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1005429_C_CA rs397795410 14 0.0821809105318325 1.22244e-16 0.156317 0.018777 8.34295752984943 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1005904_C_T rs3128116 14 0.082073967987591 1.22412e-16 0.15631 0.0187765 8.34279413645199 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1005954_G_A rs57683598 14 0.0829345886044969 1.20956e-16 0.156326 0.0187752 8.34423990954914 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1006159_C_T rs9778087 14 0.0287755307773455 3.71195e-16 0.154747 0.0188957 8.20709279382202 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1007746_CTTTTTTTTTTTTTT_C rs1196345893 14 0.169853803516783 5.20268e-17 0.176669 0.020963 8.44875667909097 0.617640026801365 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1008088_T_C rs3121567 17 0.0408954841106202 5.10581e-17 0.436084 0.0517305 8.44446424543245 0.830135313092393 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1008307_G_C rs2465140 14 0.113047689891171 8.91487e-17 0.156234 0.0186815 8.3812176673309 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1008307_G_C rs2465140 14 0.113047689891171 8.91487e-17 0.156234 0.0186815 8.3812176673309 0.617640026801365 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L1 chr1_1008307_G_C rs2465140 6 0.0340123675250653 6.68955e-36 0.240866 0.0190257 12.6742943381175 0.960143135725474 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1009184_T_C rs3128117 14 0.114792998892356 8.78673e-17 0.156301 0.0186856 8.38296665698105 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1009184_T_C rs3128117 14 0.114792998892356 8.78673e-17 0.156301 0.0186856 8.38296665698105 0.617640026801365 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L1 chr1_1009184_T_C rs3128117 6 0.0368499383801147 5.5694e-36 0.241193 0.0190289 12.689400366898 0.960143135725474 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1009716_C_T rs3135457 17 0.0392096517327741 5.41887e-18 0.443041 0.0509582 8.70818986165733 0.830135313092393 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1009731_C_T rs13303172 17 0.0491133974154425 2.89124e-18 0.455466 0.0519529 8.78131609329298 0.830135313092393 chr1:1138-2001138
Expand All @@ -28,25 +28,25 @@ ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1013312_G_A rs2341
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1013490_C_G rs4615788 17 0.0596684312812006 1.30541e-18 0.434291 0.0490274 8.87346286235471 0.830135313092393 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1013541_T_C rs15842 17 0.0580138201506556 1.34141e-18 0.434388 0.0490555 8.87034155283964 0.830135313092393 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1013855_G_A rs2465124 17 0.0591361376280749 1.32867e-18 0.433762 0.0489788 8.87144043891645 0.830135313092393 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1014228_G_A rs1921 14 0.12433309239898 8.57119e-17 0.15341 0.0183335 8.38560743650197 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1014228_G_A rs1921 14 0.12433309239898 8.57119e-17 0.15341 0.0183335 8.38560743650197 0.617640026801365 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L1 chr1_1014228_G_A rs1921 6 0.819317788540627 1.62353e-36 0.238448 0.0186635 12.7903962671107 0.960143135725474 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1014545_C_T rs2799070 17 0.0932055530070119 6.4825e-18 0.414546 0.0477952 8.68848253629389 0.830135313092393 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1014863_A_C rs1891906 14 0.0193128429509125 5.74201e-16 0.152247 0.0187132 8.15329964672768 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1014863_A_C rs1891906 14 0.0193128429509125 5.74201e-16 0.152247 0.0187132 8.15329964672768 0.617640026801365 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L1 chr1_1014863_A_C rs1891906 6 0.0253214818240058 1.12696e-35 0.240358 0.0190503 12.6312379431961 0.960143135725474 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1015336_A_T rs2799069 17 0.0705760271470166 1.24902e-18 0.447782 0.0505217 8.87870564010476 0.830135313092393 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1015925_A_ATT rs575235313 14 0.0290657333363032 3.71821e-16 0.152991 0.0186817 8.2068585448562 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1015925_A_ATT rs369296755 14 0.0290657333363032 3.71821e-16 0.152991 0.0186817 8.2068585448562 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1015925_A_ATT rs59612205 14 0.0290657333363032 3.71821e-16 0.152991 0.0186817 8.2068585448562 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1015925_A_ATT rs575235313 14 0.0290657333363032 3.71821e-16 0.152991 0.0186817 8.2068585448562 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1015925_A_ATT rs369296755 14 0.0290657333363032 3.71821e-16 0.152991 0.0186817 8.2068585448562 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1015925_A_ATT rs59612205 14 0.0290657333363032 3.71821e-16 0.152991 0.0186817 8.2068585448562 0.617640026801365 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L1 chr1_1015925_A_ATT rs575235313 6 0.0224165593925054 1.03516e-35 0.240113 0.0190203 12.6381934222441 0.960143135725474 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L1 chr1_1015925_A_ATT rs369296755 6 0.0224165593925054 1.03516e-35 0.240113 0.0190203 12.6381934222441 0.960143135725474 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L1 chr1_1015925_A_ATT rs59612205 6 0.0224165593925054 1.03516e-35 0.240113 0.0190203 12.6381934222441 0.960143135725474 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L2 chr1_1015950_G_A rs9697717 17 0.0572897456684484 5.86568e-18 0.418525 0.0481893 8.70025128920918 0.830135313092393 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1016623_G_A rs3128118 14 0.0249715969486798 4.40368e-16 0.152839 0.0187107 8.1860030389925 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1017114_A_AT rs59239970 14 0.0258900605545787 4.21998e-16 0.152767 0.0186899 8.19127935909139 0.617640026801365 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1017114_A_AT rs397687829 14 0.0258900605545787 4.21998e-16 0.152767 0.0186899 8.19127935909139 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1016623_G_A rs3128118 14 0.0249715969486798 4.40368e-16 0.152839 0.0187107 8.1860030389925 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1017114_A_AT rs59239970 14 0.0258900605545787 4.21998e-16 0.152767 0.0186899 8.19127935909139 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1017114_A_AT rs397687829 14 0.0258900605545787 4.21998e-16 0.152767 0.0186899 8.19127935909139 0.617640026801365 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L1 chr1_1017114_A_AT rs59239970 6 0.0248308249271214 9.24949e-36 0.240377 0.0190272 12.6474850406766 0.960143135725474 chr1:1138-2001138
ISG15.14151.4.3..1 ENSG00000187608 ISG15.14151.4.3..1_L1 chr1_1017114_A_AT rs397687829 6 0.0248308249271214 9.24949e-36 0.240377 0.0190272 12.6474850406766 0.960143135725474 chr1:1138-2001138
ISG15.14148.2.3..1 ENSG00000187608 ISG15.14148.2.3..1_L1 chr1_1022518_G_T rs2799064 14 0.0319029278184674 2.66215e-16 0.155806 0.0189309 8.25406439760094 0.617640026801365 chr1:1138-2001138
ISG15:14148.2.3..1 ENSG00000187608 ISG15:14148.2.3..1_L1 chr1_1022518_G_T rs2799064 14 0.0319029278184674 2.66215e-16 0.155806 0.0189309 8.25406439760094 0.617640026801365 chr1:1138-2001138
AMY1A.7918.114.3..1 ENSG00000237763 AMY1A.7918.114.3..1_L8 chr1_103440589_TA_T rs1327439088 5 0.418485920864639 1.08665e-07 0.632842 0.118879 5.34468451936006 0.710818444681624 chr1:102655290-104655290
AMY1A.7918.114.3..1 ENSG00000237763 AMY1A.7918.114.3..1_L7 chr1_103534368_A_G rs36133328 1 0.999623621396104 0.282208 0.0493205 0.0458561 1.07174598226751 1 chr1:102655290-104655290
AMY1A.7918.114.3..1 ENSG00000237763 AMY1A.7918.114.3..1_L1 chr1_103537231_T_TA rs553487095 4 0.770652473010321 6.33883e-62 -1.04771 0.0617702 -16.992451463208 0.88938645908643 chr1:102655290-104655290
Expand Down
Loading

0 comments on commit ce8afb9

Please sign in to comment.