
Commit

Merge branch 'dev' of https://github.com/opentargets/gentropy into xg…1_coloc_fix
xyg123 committed Jan 29, 2025
2 parents 5273af7 + a99ca84 commit 28cebc4
Showing 22 changed files with 175 additions and 172 deletions.
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
@@ -5,7 +5,7 @@ ci:
   autofix_commit_msg: "chore: pre-commit auto fixes [...]"
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.7.4
+    rev: v0.9.3
     hooks:
       - id: ruff
         args:
@@ -57,14 +57,14 @@ repos:
         exclude: "CHANGELOG.md"
 
   - repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook
-    rev: v9.18.0
+    rev: v9.20.0
     hooks:
       - id: commitlint
         additional_dependencies: ["@commitlint/[email protected]"]
         stages: [commit-msg]
 
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: "v1.13.0"
+    rev: "v1.14.1"
     hooks:
       - id: mypy
         args:
@@ -97,7 +97,7 @@ repos:
       - id: beautysh
 
   - repo: https://github.com/jsh9/pydoclint
-    rev: 0.5.9
+    rev: 0.6.0
     hooks:
       - id: pydoclint
   - repo: https://github.com/astral-sh/uv-pre-commit
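Every Python hunk below makes the same mechanical change, presumably produced by the ruff formatter bump above (v0.7.4 to v0.9.3): rather than parenthesizing the asserted condition so the line fits, the formatter now keeps the condition on the assert line and parenthesizes the long message. A minimal before/after sketch, with hypothetical names that are not from this commit:

# Old style: the condition is wrapped in parentheses to fit the line,
# and the message trails after the closing paren.
assert (
    observed_value == expected_value
), "Observed value differs from expected value."

# New style: the condition stays on the assert line,
# and the long message gets its own parentheses.
assert observed_value == expected_value, (
    "Observed value differs from expected value."
)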
4 changes: 3 additions & 1 deletion tests/gentropy/dataset/test_colocalisation.py
@@ -63,7 +63,9 @@ def test_append_study_metadata_right(
         assert (
             observed_df.select(f"{colocalisation_side}GeneId").collect()[0][0]
             == expected_geneId
-        ), f"Expected {colocalisation_side}GeneId {expected_geneId}, but got {observed_df.select(f'{colocalisation_side}GeneId').collect()[0][0]}"
+        ), (
+            f"Expected {colocalisation_side}GeneId {expected_geneId}, but got {observed_df.select(f'{colocalisation_side}GeneId').collect()[0][0]}"
+        )
 
     @pytest.fixture(autouse=True)
     def _setup(self: TestAppendStudyMetadata, spark: SparkSession) -> None:
12 changes: 6 additions & 6 deletions tests/gentropy/dataset/test_dataset.py
@@ -42,9 +42,9 @@ def test_initialize_without_schema(self: TestDataset, spark: SparkSession) -> None:
         """Test if Dataset derived class collects the schema from assets if schema is not provided."""
         df = spark.createDataFrame([(1,)], schema=MockDataset.get_schema())
         ds = MockDataset(_df=df)
-        assert (
-            ds.schema == MockDataset.get_schema()
-        ), "Schema should be inferred from df"
+        assert ds.schema == MockDataset.get_schema(), (
+            "Schema should be inferred from df"
+        )
 
     def test_passing_incorrect_types(self: TestDataset, spark: SparkSession) -> None:
         """Test if passing incorrect object types to Dataset raises an error."""
@@ -97,6 +97,6 @@ def test_process_class_params(spark: SparkSession) -> None:
     }
     class_params, spark_params = Dataset._process_class_params(params)
     assert "_df" in class_params, "Class params should contain _df"
-    assert (
-        "recursiveFileLookup" in spark_params
-    ), "Spark params should contain recursiveFileLookup"
+    assert "recursiveFileLookup" in spark_params, (
+        "Spark params should contain recursiveFileLookup"
+    )
24 changes: 12 additions & 12 deletions tests/gentropy/dataset/test_l2g.py
@@ -29,9 +29,9 @@ def test_process_gene_interactions(sample_otp_interactions: DataFrame) -> None:
     """Tests processing of gene interactions from OTP."""
     expected_cols = ["geneIdA", "geneIdB", "score"]
     observed_df = L2GGoldStandard.process_gene_interactions(sample_otp_interactions)
-    assert (
-        observed_df.columns == expected_cols
-    ), "Gene interactions has a different schema."
+    assert observed_df.columns == expected_cols, (
+        "Gene interactions has a different schema."
+    )
 
 
 def test_predictions(mock_l2g_predictions: L2GPrediction) -> None:
@@ -171,9 +171,9 @@ def test_l2g_feature_constructor_with_schema_mismatch(
         ),
         with_gold_standard=False,
     )
-    assert (
-        fm._df.schema["distanceTssMean"].dataType == FloatType()
-    ), "Feature `distanceTssMean` is not being casted to FloatType. Check L2GFeatureMatrix constructor."
+    assert fm._df.schema["distanceTssMean"].dataType == FloatType(), (
+        "Feature `distanceTssMean` is not being casted to FloatType. Check L2GFeatureMatrix constructor."
+    )
 
 
 def test_calculate_feature_missingness_rate(
@@ -185,9 +185,9 @@ def test_calculate_feature_missingness_rate(
     assert isinstance(observed_missingness, dict)
     assert mock_l2g_feature_matrix.features_list is not None and len(
         observed_missingness
-    ) == len(
-        mock_l2g_feature_matrix.features_list
-    ), "Missing features in the missingness rate dictionary."
-    assert (
-        observed_missingness == expected_missingness
-    ), "Missingness rate is incorrect."
+    ) == len(mock_l2g_feature_matrix.features_list), (
+        "Missing features in the missingness rate dictionary."
+    )
+    assert observed_missingness == expected_missingness, (
+        "Missingness rate is incorrect."
+    )
54 changes: 27 additions & 27 deletions tests/gentropy/dataset/test_l2g_feature.py
@@ -295,9 +295,9 @@ def test__common_colocalisation_feature_logic(
                 },
             ],
         ).select("studyLocusId", "geneId", "eQtlColocH4Maximum")
-        assert (
-            observed_df.collect() == expected_df.collect()
-        ), "The feature values are not as expected."
+        assert observed_df.collect() == expected_df.collect(), (
+            "The feature values are not as expected."
+        )
 
     def test_extend_missing_colocalisation_to_neighbourhood_genes(
         self: TestCommonColocalisationFeatureLogic,
@@ -330,9 +330,9 @@ def test_extend_missing_colocalisation_to_neighbourhood_genes(
         expected_df = spark.createDataFrame(
             [{"geneId": "gene3", "studyLocusId": "1", "eQtlColocH4Maximum": 0.0}]
         ).select("studyLocusId", "geneId", "eQtlColocH4Maximum")
-        assert (
-            observed_df.collect() == expected_df.collect()
-        ), "The feature values are not as expected."
+        assert observed_df.collect() == expected_df.collect(), (
+            "The feature values are not as expected."
+        )
 
     def test_common_neighbourhood_colocalisation_feature_logic(
         self: TestCommonColocalisationFeatureLogic,
@@ -369,9 +369,9 @@ def test_common_neighbourhood_colocalisation_feature_logic(
                 },
             ],
         ).select("geneId", "studyLocusId", "eQtlColocH4MaximumNeighbourhood")
-        assert (
-            observed_df.collect() == expected_df.collect()
-        ), "The expected and observed dataframes do not match."
+        assert observed_df.collect() == expected_df.collect(), (
+            "The expected and observed dataframes do not match."
+        )
 
     @pytest.fixture(autouse=True)
     def _setup(self: TestCommonColocalisationFeatureLogic, spark: SparkSession) -> None:
@@ -555,9 +555,9 @@ def test_common_distance_feature_logic(
             .select("studyLocusId", "geneId", feature_name)
             .orderBy(feature_name)
         )
-        assert (
-            observed_df.collect() == expected_df.collect()
-        ), f"Expected and observed dataframes are not equal for feature {feature_name}."
+        assert observed_df.collect() == expected_df.collect(), (
+            f"Expected and observed dataframes are not equal for feature {feature_name}."
+        )
 
     def test_common_neighbourhood_distance_feature_logic(
         self: TestCommonDistanceFeatureLogic,
@@ -584,9 +584,9 @@ def test_common_neighbourhood_distance_feature_logic(
             ), # 0.91/0.91
             ["geneId", "studyLocusId", feature_name],
         ).orderBy(feature_name)
-        assert (
-            observed_df.collect() == expected_df.collect()
-        ), "Output doesn't meet the expectation."
+        assert observed_df.collect() == expected_df.collect(), (
+            "Output doesn't meet the expectation."
+        )
 
     @pytest.fixture(autouse=True)
     def _setup(
@@ -773,9 +773,9 @@ def test_common_vep_feature_logic(
             .orderBy(feature_name)
             .select("studyLocusId", "geneId", feature_name)
         )
-        assert (
-            observed_df.collect() == expected_df.collect()
-        ), f"Expected and observed dataframes are not equal for feature {feature_name}."
+        assert observed_df.collect() == expected_df.collect(), (
+            f"Expected and observed dataframes are not equal for feature {feature_name}."
+        )
 
     def test_common_neighbourhood_vep_feature_logic(
         self: TestCommonVepFeatureLogic,
@@ -807,9 +807,9 @@ def test_common_neighbourhood_vep_feature_logic(
             .orderBy(feature_name)
             .select("studyLocusId", "geneId", feature_name)
         )
-        assert (
-            observed_df.collect() == expected_df.collect()
-        ), "Output doesn't meet the expectation."
+        assert observed_df.collect() == expected_df.collect(), (
+            "Output doesn't meet the expectation."
+        )
 
     @pytest.fixture(autouse=True)
     def _setup(self: TestCommonVepFeatureLogic, spark: SparkSession) -> None:
@@ -890,9 +890,9 @@ def test_common_genecount_feature_logic(
             .orderBy("studyLocusId", "geneId")
         )
 
-        assert (
-            observed_df.collect() == expected_df.collect()
-        ), f"Expected and observed dataframes do not match for feature {feature_name}."
+        assert observed_df.collect() == expected_df.collect(), (
+            f"Expected and observed dataframes do not match for feature {feature_name}."
+        )
 
     @pytest.fixture(autouse=True)
     def _setup(self: TestCommonGeneCountFeatureLogic, spark: SparkSession) -> None:
@@ -981,9 +981,9 @@ def test_is_protein_coding_feature_logic(
            .select("studyLocusId", "geneId", "isProteinCoding")
            .orderBy("studyLocusId", "geneId")
        )
-        assert (
-            observed_df.collect() == expected_df.collect()
-        ), "Expected and observed DataFrames do not match."
+        assert observed_df.collect() == expected_df.collect(), (
+            "Expected and observed DataFrames do not match."
+        )
 
     @pytest.fixture(autouse=True)
     def _setup(
12 changes: 6 additions & 6 deletions tests/gentropy/dataset/test_l2g_feature_matrix.py
@@ -60,9 +60,9 @@ def test_study_locus(
             self.sample_study_locus, features_list, loader
         )
         for feature in features_list:
-            assert (
-                feature in fm._df.columns
-            ), f"Feature {feature} not found in feature matrix."
+            assert feature in fm._df.columns, (
+                f"Feature {feature} not found in feature matrix."
+            )
 
     def test_gold_standard(
         self: TestFromFeaturesList,
@@ -78,9 +78,9 @@ def test_gold_standard(
             self.sample_gold_standard, features_list, loader
         )
         for feature in features_list:
-            assert (
-                feature in fm._df.columns
-            ), f"Feature {feature} not found in feature matrix."
+            assert feature in fm._df.columns, (
+                f"Feature {feature} not found in feature matrix."
+            )
 
     @pytest.fixture(autouse=True)
     def _setup(self: TestFromFeaturesList, spark: SparkSession) -> None:
18 changes: 9 additions & 9 deletions tests/gentropy/dataset/test_study_locus.py
@@ -519,9 +519,9 @@ def test_filter_ld_set(spark: SparkSession) -> None:
         observed_data, ["studyLocusId", "ldSet"]
     ).withColumn("ldSet", StudyLocus.filter_ld_set(f.col("ldSet"), 0.5))
     expected_tags_in_ld = 0
-    assert (
-        observed_df.filter(f.size("ldSet") > 1).count() == expected_tags_in_ld
-    ), "Expected tags in ld set differ from observed."
+    assert observed_df.filter(f.size("ldSet") > 1).count() == expected_tags_in_ld, (
+        "Expected tags in ld set differ from observed."
+    )
 
 
 def test_annotate_locus_statistics_boundaries(
@@ -862,9 +862,9 @@ def test_build_feature_matrix(
         study_locus=mock_study_locus,
     )
     fm = mock_study_locus.build_feature_matrix(features_list, loader)
-    assert isinstance(
-        fm, L2GFeatureMatrix
-    ), "Feature matrix should be of type L2GFeatureMatrix"
+    assert isinstance(fm, L2GFeatureMatrix), (
+        "Feature matrix should be of type L2GFeatureMatrix"
+    )
 
 
 class TestStudyLocusRedundancyFlagging:
@@ -1280,6 +1280,6 @@ def test_correctness_all_qlts_are_flagged(self: TestTransQtlFlagging) -> None:
 
     def test_correctness_found_trans(self: TestTransQtlFlagging) -> None:
         """Make sure trans qtls are flagged."""
-        assert (
-            self.qtl_flagged.df.filter(f.col("isTransQtl")).count() == 2
-        ), "Expected number of rows differ from observed."
+        assert self.qtl_flagged.df.filter(f.col("isTransQtl")).count() == 2, (
+            "Expected number of rows differ from observed."
+        )
@@ -28,15 +28,15 @@ def test_ontology_parser(self: TestOntologyParger, spark: SparkSession) -> None:
             self.SAMPLE_EFO_PATH, spark
         ).retain_rows_with_ancestor_id(["CL_0000000"])
 
-        assert isinstance(
-            cell_ontology, BiosampleIndex
-        ), "Cell ontology subset is not parsed correctly to BiosampleIndex."
-        assert isinstance(
-            uberon, BiosampleIndex
-        ), "Uberon subset is not parsed correctly to BiosampleIndex."
-        assert isinstance(
-            efo_cell_line, BiosampleIndex
-        ), "EFO cell line subset is not parsed correctly to BiosampleIndex."
+        assert isinstance(cell_ontology, BiosampleIndex), (
+            "Cell ontology subset is not parsed correctly to BiosampleIndex."
+        )
+        assert isinstance(uberon, BiosampleIndex), (
+            "Uberon subset is not parsed correctly to BiosampleIndex."
+        )
+        assert isinstance(efo_cell_line, BiosampleIndex), (
+            "EFO cell line subset is not parsed correctly to BiosampleIndex."
+        )
 
     def test_merge_biosample_indices(
         self: TestOntologyParger, spark: SparkSession
@@ -49,6 +49,6 @@ def test_merge_biosample_indices(
         efo = extract_ontology_from_json(self.SAMPLE_EFO_PATH, spark)
 
         merged = cell_ontology.merge_indices([uberon, efo])
-        assert isinstance(
-            merged, BiosampleIndex
-        ), "Merging of biosample indices is not correct."
+        assert isinstance(merged, BiosampleIndex), (
+            "Merging of biosample indices is not correct."
+        )
33 changes: 15 additions & 18 deletions tests/gentropy/datasource/ensembl/test_vep_variants.py
@@ -75,15 +75,12 @@ def test_in_silico_output_missing_value(
             x[0] for x in filter(lambda x: x[2] is None, self.SAMPLE_DATA)
         ]
         # Assert that the correct variants return null:
-        assert (
-            [
-                x["variantId"]
-                for x in self.df.filter(
-                    f.col("in_silico_predictions").isNull()
-                ).collect()
-            ]
-            == variant_with_missing_score
-        ), "Not the right variants got nullified in-silico predictor object."
+        assert [
+            x["variantId"]
+            for x in self.df.filter(f.col("in_silico_predictions").isNull()).collect()
+        ] == variant_with_missing_score, (
+            "Not the right variants got nullified in-silico predictor object."
+        )
 
 
 class TestVEPParser:
@@ -120,18 +117,18 @@ def test_conversion(self: TestVEPParser) -> None:
             _schema=VariantIndex.get_schema(),
         )
 
-        assert isinstance(
-            variant_index, VariantIndex
-        ), "VariantIndex object not created."
+        assert isinstance(variant_index, VariantIndex), (
+            "VariantIndex object not created."
+        )
 
     def test_variant_count(self: TestVEPParser) -> None:
         """Test if the number of variants is correct.
 
         It is expected that all rows from the parsed VEP output are present in the processed VEP output.
         """
-        assert (
-            self.raw_vep_output.count() == self.processed_vep_output.count()
-        ), f"Incorrect number of variants in processed VEP output: expected {self.raw_vep_output.count()}, got {self.processed_vep_output.count()}."
+        assert self.raw_vep_output.count() == self.processed_vep_output.count(), (
+            f"Incorrect number of variants in processed VEP output: expected {self.raw_vep_output.count()}, got {self.processed_vep_output.count()}."
+        )
 
     def test_collection(self: TestVEPParser) -> None:
         """Test if the collection of VEP variantIndex runs without failures."""
@@ -150,6 +147,6 @@ def test_ensembl_transcripts_no_duplicates(self: TestVEPParser) -> None:
         )
 
         asserted_targets = [t["targetId"] for t in targets]
-        assert len(asserted_targets) == len(
-            set(asserted_targets)
-        ), "Duplicate ensembl transcripts in a single row."
+        assert len(asserted_targets) == len(set(asserted_targets)), (
+            "Duplicate ensembl transcripts in a single row."
+        )
6 changes: 3 additions & 3 deletions tests/gentropy/datasource/finngen/test_finngen_study_index.py
@@ -354,9 +354,9 @@ def test_finngen_validate_release_prefix(
 ) -> None:
     """Test validate_release_prefix."""
     if not xfail:
-        assert (
-            FinnGenStudyIndex.validate_release_prefix(prefix) == expected_output
-        ), "Incorrect match object"
+        assert FinnGenStudyIndex.validate_release_prefix(prefix) == expected_output, (
+            "Incorrect match object"
+        )
     else:
         with pytest.raises(ValueError):
             FinnGenStudyIndex.validate_release_prefix(prefix)
