Skip to content

Commit

Permalink
fix: empty inSilicoPredictors object in GnomAD variant index (#807)
Browse files Browse the repository at this point in the history
* fix: empty inSilicoPredictors object in variant index

* fix: object filtering expression

* fix(hail): limit removed
  • Loading branch information
DSuveges authored Oct 10, 2024
1 parent 31e217b commit 58333c0
Showing 1 changed file with 17 additions and 9 deletions.
26 changes: 17 additions & 9 deletions src/gentropy/datasource/gnomad/variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,16 +131,24 @@ def as_variant_index(self: GnomADVariants) -> VariantIndex:
.drop("locus", "alleles")
.select_globals()
.to_spark(flatten=False)
.withColumn(
"variantId",
VariantIndex.hash_long_variant_ids(
f.col("variantId"),
f.col("chromosome"),
f.col("position"),
self.lenght_threshold,
),
.withColumns(
{
# Once the parsing is done, we have to drop objects with no score from inSilicoPredictors:
"inSilicoPredictors": f.filter(
f.col("inSilicoPredictors"),
lambda predictor: predictor["score"].isNotNull(),
),
# Generate a variantId that is hashed for long variant ids:
"variantId": VariantIndex.hash_long_variant_ids(
f.col("variantId"),
f.col("chromosome"),
f.col("position"),
self.lenght_threshold,
),
# We are not capturing the most severe consequence from GnomAD, but this column is needed for the schema:
"mostSevereConsequenceId": f.lit(None).cast(t.StringType()),
}
)
.withColumn("mostSevereConsequenceId", f.lit(None).cast(t.StringType()))
),
_schema=VariantIndex.get_schema(),
)

0 comments on commit 58333c0

Please sign in to comment.