Auto-identify bulk data as heavy or light chain (#121)

matsengrp · Mar 4, 2025 · 25a3a56 · 25a3a56
1 parent a571cb9
commit 25a3a56
Show file tree

Hide file tree

Showing 2 changed files with 52 additions and 36 deletions.
diff --git a/netam/framework.py b/netam/framework.py
@@ -375,20 +375,11 @@ def standardize_heavy_light_columns(pcp_df):
     """Ensure that heavy and light chain columns are present, and fill missing ones with
     placeholder values.
 
-    If only `parent` and `child` column is present, we assume these are heavy chain sequences.
+    If only `parent` and `child` column is present, we assume this is bulk data and determine heavy/light chain from
+    V gene family.
     """
-    cols = pcp_df.columns
-    # Do some checking first:
-    if "parent_h" in cols:
-        assert "child_h" in cols, "child_h column missing!"
-        assert "v_gene_h" in cols, "v_gene_h column missing!"
-    elif "parent" in cols:
-        assert "child" in cols, "child column missing!"
-        assert "v_gene" in cols, "v_gene column missing!"
-    if "parent_l" in cols:
-        assert "child_l" in cols, "child_l column missing!"
-        assert "v_gene_l" in cols, "v_gene_l column missing!"
-
+    light_names = {"IGK", "IGL"}
+    required_columns = ["parent", "child", "v_gene"]
     differentiated_columns = [
         "parent",
         "child",
@@ -400,33 +391,57 @@ def standardize_heavy_light_columns(pcp_df):
         "cdr3_codon_start",
         "cdr3_codon_end",
     ]
-    for diff_colname in differentiated_columns:
-        diff_colname_h = diff_colname + "_h"
-        diff_colname_l = diff_colname + "_l"
-
-        # Look for heavy chain, assuming undifferentiated name means heavy
-        # chain
-        if diff_colname + "_h" in cols:
-            pass
-        elif diff_colname in cols:
-            pcp_df[diff_colname_h] = pcp_df[diff_colname]
-        else:
-            pcp_df[diff_colname_h] = ""
-
-        # Look for light chain
-        if diff_colname_l in cols:
-            pass
-        else:
-            pcp_df[diff_colname_l] = ""
+    cols = pcp_df.columns
+    # Do some checking first:
+    if "parent_h" in cols:
+        for col in required_columns:
+            assert col + "_h" in cols, f"{col}_h column missing from pcp file!"
+            assert col + "_l" in cols, f"{col}_l column missing from pcp file!"
+        pcp_df["v_family_h"] = pcp_df["v_gene_h"].str.split("-").str[0]
+        pcp_df["v_family_l"] = pcp_df["v_gene_l"].str.split("-").str[0]
+        # Check that V gene families are in the correct columns:
+        if not pcp_df["v_family_l"].str[:3].isin(light_names).all():
+            _non_light_names = pcp_df[~pcp_df["v_family_l"].str[:3].isin(light_names)][
+                "v_family_l"
+            ].unique()
+            raise ValueError(
+                f"Unexpected light chain V gene families: {_non_light_names}"
+            )
+        if not (pcp_df["v_family_h"].str[:3] == "IGH").all():
+            _non_heavy_names = pcp_df[pcp_df["v_family_h"].str[:3] != "IGH"][
+                "v_family_h"
+            ].unique()
+            raise ValueError(
+                f"Unexpected heavy chain V gene families: {_non_heavy_names}"
+            )
+    elif "parent" in cols:
+        for col in required_columns:
+            assert col in cols, f"{col} column missing from pcp file!"
+        pcp_df["v_family"] = pcp_df["v_gene"].str.split("-").str[0]
+        is_heavy_chain = pcp_df["v_family"].str[:3] == "IGH"
+        _non_heavy_names = pcp_df[~is_heavy_chain]["v_family"]
+        if not _non_heavy_names.str[:3].isin(light_names).all():
+            raise ValueError(
+                f"V gene families not recognized: {_non_heavy_names[~_non_heavy_names.str[:3].isin(light_names)].unique()}"
+            )
+        # Make _h and _l versions of all columns and transfer data from
+        # undifferentiated columns to the correct version using is_heavy_chain
+        for col in differentiated_columns + ["v_family"]:
+            if col in pcp_df.columns:
+                pcp_df[col + "_h"] = pcp_df[col]
+                pcp_df[col + "_l"] = pcp_df[col]
+                if pd.api.types.is_string_dtype(pcp_df[col]):
+                    fill_value = ""
+                else:
+                    fill_value = pd.NA
+                pcp_df.loc[is_heavy_chain, col + "_l"] = fill_value
+                pcp_df.loc[~is_heavy_chain, col + "_h"] = fill_value
 
     if (pcp_df["parent_h"].str.len() + pcp_df["parent_l"].str.len()).min() < 3:
         raise ValueError("At least one PCP has fewer than three nucleotides.")
 
-    pcp_df["v_family_h"] = pcp_df["v_gene_h"].str.split("-").str[0]
-    pcp_df["v_family_l"] = pcp_df["v_gene_l"].str.split("-").str[0]
-
     pcp_df.drop(
-        columns=differentiated_columns,
+        columns=differentiated_columns + ["v_family"],
         inplace=True,
         errors="ignore",
     )

diff --git a/netam/sequences.py b/netam/sequences.py
@@ -444,7 +444,8 @@ def assert_pcp_valid(parent, child, aa_mask=None):
         child, aa_mask
     ):
         raise ValueError(
-            "Parent-child pair matches after masking codons containing ambiguities"
+            "Parent-child nucleotide sequence pair matches after masking codons containing ambiguities. "
+            "To avoid this try filtering data using `netam.sequences.assert_pcp_valid`."
         )