
Commit

Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…1_coloc_fix
xyg123 committed Jan 29, 2025
2 parents 5273af7 + a99ca84 commit 28cebc4
Showing 22 changed files with 175 additions and 172 deletions.
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
@@ -5,7 +5,7 @@ ci:
   autofix_commit_msg: "chore: pre-commit auto fixes [...]"
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.7.4
+    rev: v0.9.3
     hooks:
       - id: ruff
         args:
@@ -57,14 +57,14 @@ repos:
         exclude: "CHANGELOG.md"
 
   - repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook
-    rev: v9.18.0
+    rev: v9.20.0
     hooks:
       - id: commitlint
         additional_dependencies: ["@commitlint/[email protected]"]
         stages: [commit-msg]
 
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: "v1.13.0"
+    rev: "v1.14.1"
     hooks:
       - id: mypy
         args:
@@ -97,7 +97,7 @@ repos:
       - id: beautysh
 
   - repo: https://github.com/jsh9/pydoclint
-    rev: 0.5.9
+    rev: 0.6.0
     hooks:
       - id: pydoclint
   - repo: https://github.com/astral-sh/uv-pre-commit
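Every Python hunk below makes the same mechanical change, presumably produced by the ruff formatter bump above (v0.7.4 to v0.9.3): rather than parenthesizing the asserted condition so the line fits, the formatter now keeps the condition on the assert line and parenthesizes the long message. A minimal before/after sketch, with hypothetical names that are not from this commit:

# Old style: the condition is wrapped in parentheses to fit the line,
# and the message trails after the closing paren.
assert (
    observed_value == expected_value
), "Observed value differs from expected value."

# New style: the condition stays on the assert line,
# and the long message gets its own parentheses.
assert observed_value == expected_value, (
    "Observed value differs from expected value."
)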
4 changes: 3 additions & 1 deletion tests/gentropy/dataset/test_colocalisation.py
@@ -63,7 +63,9 @@ def test_append_study_metadata_right(
         assert (
             observed_df.select(f"{colocalisation_side}GeneId").collect()[0][0]
             == expected_geneId
-        ), f"Expected {colocalisation_side}GeneId {expected_geneId}, but got {observed_df.select(f'{colocalisation_side}GeneId').collect()[0][0]}"
+        ), (
+            f"Expected {colocalisation_side}GeneId {expected_geneId}, but got {observed_df.select(f'{colocalisation_side}GeneId').collect()[0][0]}"
+        )
 
     @pytest.fixture(autouse=True)
     def _setup(self: TestAppendStudyMetadata, spark: SparkSession) -> None:
12 changes: 6 additions & 6 deletions tests/gentropy/dataset/test_dataset.py
@@ -42,9 +42,9 @@ def test_initialize_without_schema(self: TestDataset, spark: SparkSession) -> None:
         """Test if Dataset derived class collects the schema from assets if schema is not provided."""
         df = spark.createDataFrame([(1,)], schema=MockDataset.get_schema())
         ds = MockDataset(_df=df)
-        assert (
-            ds.schema == MockDataset.get_schema()
-        ), "Schema should be inferred from df"
+        assert ds.schema == MockDataset.get_schema(), (
+            "Schema should be inferred from df"
+        )
 
     def test_passing_incorrect_types(self: TestDataset, spark: SparkSession) -> None:
         """Test if passing incorrect object types to Dataset raises an error."""
@@ -97,6 +97,6 @@ def test_process_class_params(spark: SparkSession) -> None:
     }
     class_params, spark_params = Dataset._process_class_params(params)
     assert "_df" in class_params, "Class params should contain _df"
-    assert (
-        "recursiveFileLookup" in spark_params
-    ), "Spark params should contain recursiveFileLookup"
+    assert "recursiveFileLookup" in spark_params, (
+        "Spark params should contain recursiveFileLookup"
+    )
24 changes: 12 additions & 12 deletions tests/gentropy/dataset/test_l2g.py
@@ -29,9 +29,9 @@ def test_process_gene_interactions(sample_otp_interactions: DataFrame) -> None:
     """Tests processing of gene interactions from OTP."""
     expected_cols = ["geneIdA", "geneIdB", "score"]
     observed_df = L2GGoldStandard.process_gene_interactions(sample_otp_interactions)
-    assert (
-        observed_df.columns == expected_cols
-    ), "Gene interactions has a different schema."
+    assert observed_df.columns == expected_cols, (
+        "Gene interactions has a different schema."
+    )
 
 
 def test_predictions(mock_l2g_predictions: L2GPrediction) -> None:
@@ -171,9 +171,9 @@ def test_l2g_feature_constructor_with_schema_mismatch(
         ),
         with_gold_standard=False,
     )
-    assert (
-        fm._df.schema["distanceTssMean"].dataType == FloatType()
-    ), "Feature `distanceTssMean` is not being casted to FloatType. Check L2GFeatureMatrix constructor."
+    assert fm._df.schema["distanceTssMean"].dataType == FloatType(), (
+        "Feature `distanceTssMean` is not being casted to FloatType. Check L2GFeatureMatrix constructor."
+    )
 
 
 def test_calculate_feature_missingness_rate(
@@ -185,9 +185,9 @@ def test_calculate_feature_missingness_rate(
     assert isinstance(observed_missingness, dict)
     assert mock_l2g_feature_matrix.features_list is not None and len(
         observed_missingness
-    ) == len(
-        mock_l2g_feature_matrix.features_list
-    ), "Missing features in the missingness rate dictionary."
-    assert (
-        observed_missingness == expected_missingness
-    ), "Missingness rate is incorrect."
+    ) == len(mock_l2g_feature_matrix.features_list), (
+        "Missing features in the missingness rate dictionary."
+    )
+    assert observed_missingness == expected_missingness, (
+        "Missingness rate is incorrect."
+    )
54 changes: 27 additions & 27 deletions tests/gentropy/dataset/test_l2g_feature.py
@@ -295,9 +295,9 @@ def test__common_colocalisation_feature_logic(
                 },
             ],
         ).select("studyLocusId", "geneId", "eQtlColocH4Maximum")
-        assert (
-            observed_df.collect() == expected_df.collect()
-        ), "The feature values are not as expected."
+        assert observed_df.collect() == expected_df.collect(), (
+            "The feature values are not as expected."
+        )
 
     def test_extend_missing_colocalisation_to_neighbourhood_genes(
         self: TestCommonColocalisationFeatureLogic,
@@ -330,9 +330,9 @@ def test_extend_missing_colocalisation_to_neighbourhood_genes(
         expected_df = spark.createDataFrame(
             [{"geneId": "gene3", "studyLocusId": "1", "eQtlColocH4Maximum": 0.0}]
         ).select("studyLocusId", "geneId", "eQtlColocH4Maximum")
-        assert (
-            observed_df.collect() == expected_df.collect()
-        ), "The feature values are not as expected."
+        assert observed_df.collect() == expected_df.collect(), (
+            "The feature values are not as expected."
+        )
 
     def test_common_neighbourhood_colocalisation_feature_logic(
         self: TestCommonColocalisationFeatureLogic,
@@ -369,9 +369,9 @@ def test_common_neighbourhood_colocalisation_feature_logic(
                 },
             ],
         ).select("geneId", "studyLocusId", "eQtlColocH4MaximumNeighbourhood")
-        assert (
-            observed_df.collect() == expected_df.collect()
-        ), "The expected and observed dataframes do not match."
+        assert observed_df.collect() == expected_df.collect(), (
+            "The expected and observed dataframes do not match."
+        )
 
     @pytest.fixture(autouse=True)
     def _setup(self: TestCommonColocalisationFeatureLogic, spark: SparkSession) -> None:
@@ -555,9 +555,9 @@ def test_common_distance_feature_logic(
             .select("studyLocusId", "geneId", feature_name)
             .orderBy(feature_name)
         )
-        assert (
-            observed_df.collect() == expected_df.collect()
-        ), f"Expected and observed dataframes are not equal for feature {feature_name}."
+        assert observed_df.collect() == expected_df.collect(), (
+            f"Expected and observed dataframes are not equal for feature {feature_name}."
+        )
 
     def test_common_neighbourhood_distance_feature_logic(
         self: TestCommonDistanceFeatureLogic,
@@ -584,9 +584,9 @@ def test_common_neighbourhood_distance_feature_logic(
             ), # 0.91/0.91
             ["geneId", "studyLocusId", feature_name],
         ).orderBy(feature_name)
-        assert (
-            observed_df.collect() == expected_df.collect()
-        ), "Output doesn't meet the expectation."
+        assert observed_df.collect() == expected_df.collect(), (
+            "Output doesn't meet the expectation."
+        )
 
     @pytest.fixture(autouse=True)
     def _setup(
@@ -773,9 +773,9 @@ def test_common_vep_feature_logic(
             .orderBy(feature_name)
             .select("studyLocusId", "geneId", feature_name)
         )
-        assert (
-            observed_df.collect() == expected_df.collect()
-        ), f"Expected and observed dataframes are not equal for feature {feature_name}."
+        assert observed_df.collect() == expected_df.collect(), (
+            f"Expected and observed dataframes are not equal for feature {feature_name}."
+        )
 
     def test_common_neighbourhood_vep_feature_logic(
         self: TestCommonVepFeatureLogic,
@@ -807,9 +807,9 @@ def test_common_neighbourhood_vep_feature_logic(
             .orderBy(feature_name)
             .select("studyLocusId", "geneId", feature_name)
         )
-        assert (
-            observed_df.collect() == expected_df.collect()
-        ), "Output doesn't meet the expectation."
+        assert observed_df.collect() == expected_df.collect(), (
+            "Output doesn't meet the expectation."
+        )
 
     @pytest.fixture(autouse=True)
     def _setup(self: TestCommonVepFeatureLogic, spark: SparkSession) -> None:
@@ -890,9 +890,9 @@ def test_common_genecount_feature_logic(
             .orderBy("studyLocusId", "geneId")
         )
 
-        assert (
-            observed_df.collect() == expected_df.collect()
-        ), f"Expected and observed dataframes do not match for feature {feature_name}."
+        assert observed_df.collect() == expected_df.collect(), (
+            f"Expected and observed dataframes do not match for feature {feature_name}."
+        )
 
     @pytest.fixture(autouse=True)
     def _setup(self: TestCommonGeneCountFeatureLogic, spark: SparkSession) -> None:
@@ -981,9 +981,9 @@ def test_is_protein_coding_feature_logic(
            .select("studyLocusId", "geneId", "isProteinCoding")
            .orderBy("studyLocusId", "geneId")
        )
-        assert (
-            observed_df.collect() == expected_df.collect()
-        ), "Expected and observed DataFrames do not match."
+        assert observed_df.collect() == expected_df.collect(), (
+            "Expected and observed DataFrames do not match."
+        )
 
     @pytest.fixture(autouse=True)
     def _setup(
12 changes: 6 additions & 6 deletions tests/gentropy/dataset/test_l2g_feature_matrix.py
@@ -60,9 +60,9 @@ def test_study_locus(
             self.sample_study_locus, features_list, loader
         )
         for feature in features_list:
-            assert (
-                feature in fm._df.columns
-            ), f"Feature {feature} not found in feature matrix."
+            assert feature in fm._df.columns, (
+                f"Feature {feature} not found in feature matrix."
+            )
 
     def test_gold_standard(
         self: TestFromFeaturesList,
@@ -78,9 +78,9 @@ def test_gold_standard(
             self.sample_gold_standard, features_list, loader
         )
         for feature in features_list:
-            assert (
-                feature in fm._df.columns
-            ), f"Feature {feature} not found in feature matrix."
+            assert feature in fm._df.columns, (
+                f"Feature {feature} not found in feature matrix."
+            )
 
     @pytest.fixture(autouse=True)
     def _setup(self: TestFromFeaturesList, spark: SparkSession) -> None:
18 changes: 9 additions & 9 deletions tests/gentropy/dataset/test_study_locus.py
@@ -519,9 +519,9 @@ def test_filter_ld_set(spark: SparkSession) -> None:
         observed_data, ["studyLocusId", "ldSet"]
     ).withColumn("ldSet", StudyLocus.filter_ld_set(f.col("ldSet"), 0.5))
     expected_tags_in_ld = 0
-    assert (
-        observed_df.filter(f.size("ldSet") > 1).count() == expected_tags_in_ld
-    ), "Expected tags in ld set differ from observed."
+    assert observed_df.filter(f.size("ldSet") > 1).count() == expected_tags_in_ld, (
+        "Expected tags in ld set differ from observed."
+    )
 
 
 def test_annotate_locus_statistics_boundaries(
@@ -862,9 +862,9 @@ def test_build_feature_matrix(
         study_locus=mock_study_locus,
     )
     fm = mock_study_locus.build_feature_matrix(features_list, loader)
-    assert isinstance(
-        fm, L2GFeatureMatrix
-    ), "Feature matrix should be of type L2GFeatureMatrix"
+    assert isinstance(fm, L2GFeatureMatrix), (
+        "Feature matrix should be of type L2GFeatureMatrix"
+    )
 
 
 class TestStudyLocusRedundancyFlagging:
@@ -1280,6 +1280,6 @@ def test_correctness_all_qlts_are_flagged(self: TestTransQtlFlagging) -> None:
 
     def test_correctness_found_trans(self: TestTransQtlFlagging) -> None:
         """Make sure trans qtls are flagged."""
-        assert (
-            self.qtl_flagged.df.filter(f.col("isTransQtl")).count() == 2
-        ), "Expected number of rows differ from observed."
+        assert self.qtl_flagged.df.filter(f.col("isTransQtl")).count() == 2, (
+            "Expected number of rows differ from observed."
+        )
@@ -28,15 +28,15 @@ def test_ontology_parser(self: TestOntologyParger, spark: SparkSession) -> None:
             self.SAMPLE_EFO_PATH, spark
         ).retain_rows_with_ancestor_id(["CL_0000000"])
 
-        assert isinstance(
-            cell_ontology, BiosampleIndex
-        ), "Cell ontology subset is not parsed correctly to BiosampleIndex."
-        assert isinstance(
-            uberon, BiosampleIndex
-        ), "Uberon subset is not parsed correctly to BiosampleIndex."
-        assert isinstance(
-            efo_cell_line, BiosampleIndex
-        ), "EFO cell line subset is not parsed correctly to BiosampleIndex."
+        assert isinstance(cell_ontology, BiosampleIndex), (
+            "Cell ontology subset is not parsed correctly to BiosampleIndex."
+        )
+        assert isinstance(uberon, BiosampleIndex), (
+            "Uberon subset is not parsed correctly to BiosampleIndex."
+        )
+        assert isinstance(efo_cell_line, BiosampleIndex), (
+            "EFO cell line subset is not parsed correctly to BiosampleIndex."
+        )
 
     def test_merge_biosample_indices(
         self: TestOntologyParger, spark: SparkSession
@@ -49,6 +49,6 @@ def test_merge_biosample_indices(
         efo = extract_ontology_from_json(self.SAMPLE_EFO_PATH, spark)
 
         merged = cell_ontology.merge_indices([uberon, efo])
-        assert isinstance(
-            merged, BiosampleIndex
-        ), "Merging of biosample indices is not correct."
+        assert isinstance(merged, BiosampleIndex), (
+            "Merging of biosample indices is not correct."
+        )
33 changes: 15 additions & 18 deletions tests/gentropy/datasource/ensembl/test_vep_variants.py
@@ -75,15 +75,12 @@ def test_in_silico_output_missing_value(
             x[0] for x in filter(lambda x: x[2] is None, self.SAMPLE_DATA)
         ]
         # Assert that the correct variants return null:
-        assert (
-            [
-                x["variantId"]
-                for x in self.df.filter(
-                    f.col("in_silico_predictions").isNull()
-                ).collect()
-            ]
-            == variant_with_missing_score
-        ), "Not the right variants got nullified in-silico predictor object."
+        assert [
+            x["variantId"]
+            for x in self.df.filter(f.col("in_silico_predictions").isNull()).collect()
+        ] == variant_with_missing_score, (
+            "Not the right variants got nullified in-silico predictor object."
+        )
 
 
 class TestVEPParser:
@@ -120,18 +117,18 @@ def test_conversion(self: TestVEPParser) -> None:
             _schema=VariantIndex.get_schema(),
         )
 
-        assert isinstance(
-            variant_index, VariantIndex
-        ), "VariantIndex object not created."
+        assert isinstance(variant_index, VariantIndex), (
+            "VariantIndex object not created."
+        )
 
     def test_variant_count(self: TestVEPParser) -> None:
         """Test if the number of variants is correct.
 
         It is expected that all rows from the parsed VEP output are present in the processed VEP output.
         """
-        assert (
-            self.raw_vep_output.count() == self.processed_vep_output.count()
-        ), f"Incorrect number of variants in processed VEP output: expected {self.raw_vep_output.count()}, got {self.processed_vep_output.count()}."
+        assert self.raw_vep_output.count() == self.processed_vep_output.count(), (
+            f"Incorrect number of variants in processed VEP output: expected {self.raw_vep_output.count()}, got {self.processed_vep_output.count()}."
+        )
 
     def test_collection(self: TestVEPParser) -> None:
         """Test if the collection of VEP variantIndex runs without failures."""
@@ -150,6 +147,6 @@ def test_ensembl_transcripts_no_duplicates(self: TestVEPParser) -> None:
         )
 
         asserted_targets = [t["targetId"] for t in targets]
-        assert len(asserted_targets) == len(
-            set(asserted_targets)
-        ), "Duplicate ensembl transcripts in a single row."
+        assert len(asserted_targets) == len(set(asserted_targets)), (
+            "Duplicate ensembl transcripts in a single row."
+        )
6 changes: 3 additions & 3 deletions tests/gentropy/datasource/finngen/test_finngen_study_index.py
@@ -354,9 +354,9 @@ def test_finngen_validate_release_prefix(
 ) -> None:
     """Test validate_release_prefix."""
     if not xfail:
-        assert (
-            FinnGenStudyIndex.validate_release_prefix(prefix) == expected_output
-        ), "Incorrect match object"
+        assert FinnGenStudyIndex.validate_release_prefix(prefix) == expected_output, (
+            "Incorrect match object"
+        )
     else:
         with pytest.raises(ValueError):
             FinnGenStudyIndex.validate_release_prefix(prefix)
