-
Notifications
You must be signed in to change notification settings - Fork 9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: implement posterior prob filter for COLOC at small overlaps N<10 #977
Changes from all commits
cd5d6af
2f4109c
a8ba427
f231c6c
e24e51d
5273af7
28cebc4
0ec4a05
f15b913
639187a
94de973
7ac3585
8f567ee
76b33b3
ce9608d
17ca54f
f84e6a3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -208,11 +208,15 @@ class Coloc(ColocalisationMethodInterface): | |
|
||
Attributes: | ||
PSEUDOCOUNT (float): Pseudocount to avoid log(0). Defaults to 1e-10. | ||
OVERLAP_SIZE_CUTOFF (int): Minimum number of overlapping variants bfore filtering. Defaults to 5. | ||
POSTERIOR_CUTOFF (float): Minimum overlapping Posterior probability cutoff for small overlaps. Defaults to 0.5. | ||
""" | ||
|
||
METHOD_NAME: str = "COLOC" | ||
METHOD_METRIC: str = "h4" | ||
PSEUDOCOUNT: float = 1e-10 | ||
OVERLAP_SIZE_CUTOFF: int = 5 | ||
POSTERIOR_CUTOFF: float = 0.5 | ||
|
||
@staticmethod | ||
def _get_posteriors(all_bfs: NDArray[np.float64]) -> DenseVector: | ||
|
@@ -277,7 +281,15 @@ def colocalise( | |
) | ||
.select("*", "statistics.*") | ||
# Before summing log_BF columns nulls need to be filled with 0: | ||
.fillna(0, subset=["left_logBF", "right_logBF"]) | ||
.fillna( | ||
0, | ||
subset=[ | ||
"left_logBF", | ||
"right_logBF", | ||
"left_posteriorProbability", | ||
"right_posteriorProbability", | ||
], | ||
) | ||
# Sum of log_BFs for each pair of signals | ||
.withColumn( | ||
"sum_log_bf", | ||
|
@@ -305,9 +317,18 @@ def colocalise( | |
fml.array_to_vector(f.collect_list(f.col("right_logBF"))).alias( | ||
"right_logBF" | ||
), | ||
fml.array_to_vector( | ||
f.collect_list(f.col("left_posteriorProbability")) | ||
).alias("left_posteriorProbability"), | ||
fml.array_to_vector( | ||
f.collect_list(f.col("right_posteriorProbability")) | ||
).alias("right_posteriorProbability"), | ||
fml.array_to_vector(f.collect_list(f.col("sum_log_bf"))).alias( | ||
"sum_log_bf" | ||
), | ||
f.collect_list(f.col("tagVariantSource")).alias( | ||
"tagVariantSourceList" | ||
), | ||
) | ||
.withColumn("logsum1", logsum(f.col("left_logBF"))) | ||
.withColumn("logsum2", logsum(f.col("right_logBF"))) | ||
|
@@ -327,10 +348,39 @@ def colocalise( | |
# h3 | ||
.withColumn("sumlogsum", f.col("logsum1") + f.col("logsum2")) | ||
.withColumn("max", f.greatest("sumlogsum", "logsum12")) | ||
.withColumn( | ||
"anySnpBothSidesHigh", | ||
f.aggregate( | ||
f.transform( | ||
f.arrays_zip( | ||
fml.vector_to_array(f.col("left_posteriorProbability")), | ||
fml.vector_to_array( | ||
f.col("right_posteriorProbability") | ||
), | ||
f.col("tagVariantSourceList"), | ||
), | ||
# row["0"] = left PP, row["1"] = right PP, row["tagVariantSourceList"] | ||
lambda row: f.when( | ||
(row["tagVariantSourceList"] == "both") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think checking for
To do this you would have to make an overlap example with at least 2 variants on one side and 3 variants on the other side There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @project-defiant , tests added, let me know if it addresses your concerns! |
||
& (row["0"] > Coloc.POSTERIOR_CUTOFF) | ||
& (row["1"] > Coloc.POSTERIOR_CUTOFF), | ||
1.0, | ||
).otherwise(0.0), | ||
Comment on lines
+365
to
+368
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not 100% sure you are comparing 2 the same variants here (since they can be left or right oriented as well) |
||
), | ||
f.lit(0.0), | ||
lambda acc, x: acc + x, | ||
) | ||
> 0, # True if sum of these 1.0's > 0 | ||
) | ||
.filter( | ||
(f.col("numberColocalisingVariants") > Coloc.OVERLAP_SIZE_CUTOFF) | ||
| (f.col("anySnpBothSidesHigh")) | ||
) | ||
.withColumn( | ||
"logdiff", | ||
f.when( | ||
f.col("sumlogsum") == f.col("logsum12"), Coloc.PSEUDOCOUNT | ||
(f.col("sumlogsum") == f.col("logsum12")), | ||
Coloc.PSEUDOCOUNT, | ||
).otherwise( | ||
f.col("max") | ||
+ f.log( | ||
|
@@ -382,6 +432,10 @@ def colocalise( | |
"lH2bf", | ||
"lH3bf", | ||
"lH4bf", | ||
"left_posteriorProbability", | ||
"right_posteriorProbability", | ||
"tagVariantSourceList", | ||
"anySnpBothSidesHigh", | ||
) | ||
.withColumn("colocalisationMethod", f.lit(cls.METHOD_NAME)) | ||
.join( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not so sure that the converting to vectors and back is a correct way to handle the posteriorProbabilibies, I wish to understand the logic, why the
VectorUDT
was used initially here. @ireneisdoomed do you know the reason behind the vectorization of the logBF values (is it the sparsity or default numpy conversion?)