Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: implement posterior prob filter for COLOC at small overlaps N<10 #977

Merged
merged 17 commits into from
Feb 6, 2025
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 60 additions & 6 deletions src/gentropy/method/colocalisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,8 @@ def colocalise(
.withColumn("colocalisationMethod", f.lit(cls.METHOD_NAME))
.join(
overlapping_signals.calculate_beta_ratio(),
on=["leftStudyLocusId", "rightStudyLocusId","chromosome"],
how="left"
on=["leftStudyLocusId", "rightStudyLocusId", "chromosome"],
how="left",
)
),
_schema=Colocalisation.get_schema(),
Expand All @@ -208,11 +208,15 @@ class Coloc(ColocalisationMethodInterface):

Attributes:
PSEUDOCOUNT (float): Pseudocount to avoid log(0). Defaults to 1e-10.
OVERLAP_SIZE_CUTOFF (int): Minimum number of overlapping variants before filtering. Defaults to 5.
xyg123 marked this conversation as resolved.
Show resolved Hide resolved
POSTERIOR_CUTOFF (float): Minimum posterior probability cutoff applied to overlapping variants when the overlap is small. Defaults to 0.9.
"""

METHOD_NAME: str = "COLOC"
METHOD_METRIC: str = "h4"
PSEUDOCOUNT: float = 1e-10
OVERLAP_SIZE_CUTOFF: int = 5
POSTERIOR_CUTOFF: float = 0.9

@staticmethod
def _get_posteriors(all_bfs: NDArray[np.float64]) -> DenseVector:
Expand Down Expand Up @@ -277,7 +281,15 @@ def colocalise(
)
.select("*", "statistics.*")
# Before summing log_BF columns nulls need to be filled with 0:
.fillna(0, subset=["left_logBF", "right_logBF"])
.fillna(
0,
subset=[
"left_logBF",
"right_logBF",
"left_posteriorProbability",
"right_posteriorProbability",
],
)
# Sum of log_BFs for each pair of signals
.withColumn(
"sum_log_bf",
Expand Down Expand Up @@ -305,9 +317,18 @@ def colocalise(
fml.array_to_vector(f.collect_list(f.col("right_logBF"))).alias(
"right_logBF"
),
fml.array_to_vector(
f.collect_list(f.col("left_posteriorProbability"))
).alias("left_posteriorProbability"),
fml.array_to_vector(
f.collect_list(f.col("right_posteriorProbability"))
).alias("right_posteriorProbability"),
fml.array_to_vector(f.collect_list(f.col("sum_log_bf"))).alias(
"sum_log_bf"
),
f.collect_list(f.col("tagVariantSource")).alias(
"tagVariantSourceList"
),
)
.withColumn("logsum1", logsum(f.col("left_logBF")))
.withColumn("logsum2", logsum(f.col("right_logBF")))
Expand All @@ -327,10 +348,39 @@ def colocalise(
# h3
.withColumn("sumlogsum", f.col("logsum1") + f.col("logsum2"))
.withColumn("max", f.greatest("sumlogsum", "logsum12"))
.withColumn(
"anySnpBothSidesHigh",
f.aggregate(
f.transform(
f.arrays_zip(
fml.vector_to_array(f.col("left_posteriorProbability")),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not so sure that the converting to vectors and back is a correct way to handle the posteriorProbabilibies, I wish to understand the logic, why the VectorUDT was used initially here. @ireneisdoomed do you know the reason behind the vectorization of the logBF values (is it the sparsity or default numpy conversion?)

fml.vector_to_array(
f.col("right_posteriorProbability")
),
f.col("tagVariantSourceList"),
),
# row["0"] = left PP, row["1"] = right PP, row["tagVariantSourceList"]
lambda row: f.when(
(row["tagVariantSourceList"] == "both")
Copy link
Contributor

@project-defiant project-defiant Jan 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think checking for "both" needs to happen before you array_zip, otherwise you will end up with mixed results (not coming from the same variant) - I may be wrong though, can you provide a specific test for this part? You could add it on top of the colocalisation step test. I would like to see test cases for:

  1. Case when left overlap (or right) does not exist, so the algorithm orders the lists (array_zip) correctly just taking into account the both
  2. Case when all PIPs from left side are low and all PIPs from right side are high
  3. Case when at least one PIP from left and one PIP from right is high

To do this you would have to make an overlap example with at least 2 variants on one side and 3 variants on the other side

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @project-defiant , tests added, let me know if it addresses your concerns!

& (row["0"] > Coloc.POSTERIOR_CUTOFF)
& (row["1"] > Coloc.POSTERIOR_CUTOFF),
1.0,
).otherwise(0.0),
Comment on lines +365 to +368
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not 100% sure you are comparing 2 the same variants here (since they can be left or right oriented as well)

),
f.lit(0.0),
lambda acc, x: acc + x,
)
> 0, # True if sum of these 1.0's > 0
)
.filter(
(f.col("numberColocalisingVariants") > Coloc.OVERLAP_SIZE_CUTOFF)
| (f.col("anySnpBothSidesHigh"))
)
.withColumn(
"logdiff",
f.when(
f.col("sumlogsum") == f.col("logsum12"), Coloc.PSEUDOCOUNT
(f.col("sumlogsum") == f.col("logsum12")),
Coloc.PSEUDOCOUNT,
).otherwise(
f.col("max")
+ f.log(
Expand Down Expand Up @@ -382,12 +432,16 @@ def colocalise(
"lH2bf",
"lH3bf",
"lH4bf",
"left_posteriorProbability",
"right_posteriorProbability",
"tagVariantSourceList",
"anySnpBothSidesHigh",
)
.withColumn("colocalisationMethod", f.lit(cls.METHOD_NAME))
.join(
overlapping_signals.calculate_beta_ratio(),
on=["leftStudyLocusId", "rightStudyLocusId","chromosome"],
how="left"
on=["leftStudyLocusId", "rightStudyLocusId", "chromosome"],
how="left",
)
),
_schema=Colocalisation.get_schema(),
Expand Down
14 changes: 10 additions & 4 deletions tests/gentropy/method/test_colocalisation_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ def test_coloc(mock_study_locus_overlap: StudyLocusOverlap) -> None:
"right_logBF": 10.5,
"left_beta": 0.1,
"right_beta": 0.2,
"left_posteriorProbability": 0.91,
"right_posteriorProbability": 0.92,
},
},
],
Expand Down Expand Up @@ -72,6 +74,8 @@ def test_coloc(mock_study_locus_overlap: StudyLocusOverlap) -> None:
"right_logBF": 10.5,
"left_beta": 0.1,
"right_beta": 0.2,
"left_posteriorProbability": 0.91,
"right_posteriorProbability": 0.92,
},
},
{
Expand All @@ -85,6 +89,8 @@ def test_coloc(mock_study_locus_overlap: StudyLocusOverlap) -> None:
"right_logBF": 10.5,
"left_beta": 0.3,
"right_beta": 0.5,
"left_posteriorProbability": 0.91,
"right_posteriorProbability": 0.92,
},
},
],
Expand Down Expand Up @@ -151,8 +157,8 @@ def test_coloc_no_logbf(
"right_logBF": None,
"left_beta": 0.1,
"right_beta": 0.2,
"left_posteriorProbability": None,
"right_posteriorProbability": None,
"left_posteriorProbability": 0.91,
"right_posteriorProbability": 0.92,
}, # irrelevant for COLOC
}
],
Expand Down Expand Up @@ -212,8 +218,8 @@ def test_coloc_no_betas(spark: SparkSession) -> None:
"right_logBF": 10.3,
"left_beta": None,
"right_beta": None,
"left_posteriorProbability": None,
"right_posteriorProbability": None,
"left_posteriorProbability": 0.91,
"right_posteriorProbability": 0.92,
}, # irrelevant for COLOC
}
],
Expand Down
Loading