Skip to content

Commit

Permalink
Auto-identify bulk data as heavy or light chain (#121)
Browse files Browse the repository at this point in the history
  • Loading branch information
willdumm authored Mar 4, 2025
1 parent a571cb9 commit 25a3a56
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 36 deletions.
85 changes: 50 additions & 35 deletions netam/framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,20 +375,11 @@ def standardize_heavy_light_columns(pcp_df):
"""Ensure that heavy and light chain columns are present, and fill missing ones with
placeholder values.
If only `parent` and `child` column is present, we assume these are heavy chain sequences.
If only `parent` and `child` column is present, we assume this is bulk data and determine heavy/light chain from
V gene family.
"""
cols = pcp_df.columns
# Do some checking first:
if "parent_h" in cols:
assert "child_h" in cols, "child_h column missing!"
assert "v_gene_h" in cols, "v_gene_h column missing!"
elif "parent" in cols:
assert "child" in cols, "child column missing!"
assert "v_gene" in cols, "v_gene column missing!"
if "parent_l" in cols:
assert "child_l" in cols, "child_l column missing!"
assert "v_gene_l" in cols, "v_gene_l column missing!"

light_names = {"IGK", "IGL"}
required_columns = ["parent", "child", "v_gene"]
differentiated_columns = [
"parent",
"child",
Expand All @@ -400,33 +391,57 @@ def standardize_heavy_light_columns(pcp_df):
"cdr3_codon_start",
"cdr3_codon_end",
]
for diff_colname in differentiated_columns:
diff_colname_h = diff_colname + "_h"
diff_colname_l = diff_colname + "_l"

# Look for heavy chain, assuming undifferentiated name means heavy
# chain
if diff_colname + "_h" in cols:
pass
elif diff_colname in cols:
pcp_df[diff_colname_h] = pcp_df[diff_colname]
else:
pcp_df[diff_colname_h] = ""

# Look for light chain
if diff_colname_l in cols:
pass
else:
pcp_df[diff_colname_l] = ""
cols = pcp_df.columns
# Do some checking first:
if "parent_h" in cols:
for col in required_columns:
assert col + "_h" in cols, f"{col}_h column missing from pcp file!"
assert col + "_l" in cols, f"{col}_l column missing from pcp file!"
pcp_df["v_family_h"] = pcp_df["v_gene_h"].str.split("-").str[0]
pcp_df["v_family_l"] = pcp_df["v_gene_l"].str.split("-").str[0]
# Check that V gene families are in the correct columns:
if not pcp_df["v_family_l"].str[:3].isin(light_names).all():
_non_light_names = pcp_df[~pcp_df["v_family_l"].str[:3].isin(light_names)][
"v_family_l"
].unique()
raise ValueError(
f"Unexpected light chain V gene families: {_non_light_names}"
)
if not (pcp_df["v_family_h"].str[:3] == "IGH").all():
_non_heavy_names = pcp_df[pcp_df["v_family_h"].str[:3] != "IGH"][
"v_family_h"
].unique()
raise ValueError(
f"Unexpected heavy chain V gene families: {_non_heavy_names}"
)
elif "parent" in cols:
for col in required_columns:
assert col in cols, f"{col} column missing from pcp file!"
pcp_df["v_family"] = pcp_df["v_gene"].str.split("-").str[0]
is_heavy_chain = pcp_df["v_family"].str[:3] == "IGH"
_non_heavy_names = pcp_df[~is_heavy_chain]["v_family"]
if not _non_heavy_names.str[:3].isin(light_names).all():
raise ValueError(
f"V gene families not recognized: {_non_heavy_names[~_non_heavy_names.str[:3].isin(light_names)].unique()}"
)
# Make _h and _l versions of all columns and transfer data from
# undifferentiated columns to the correct version using is_heavy_chain
for col in differentiated_columns + ["v_family"]:
if col in pcp_df.columns:
pcp_df[col + "_h"] = pcp_df[col]
pcp_df[col + "_l"] = pcp_df[col]
if pd.api.types.is_string_dtype(pcp_df[col]):
fill_value = ""
else:
fill_value = pd.NA
pcp_df.loc[is_heavy_chain, col + "_l"] = fill_value
pcp_df.loc[~is_heavy_chain, col + "_h"] = fill_value

if (pcp_df["parent_h"].str.len() + pcp_df["parent_l"].str.len()).min() < 3:
raise ValueError("At least one PCP has fewer than three nucleotides.")

pcp_df["v_family_h"] = pcp_df["v_gene_h"].str.split("-").str[0]
pcp_df["v_family_l"] = pcp_df["v_gene_l"].str.split("-").str[0]

pcp_df.drop(
columns=differentiated_columns,
columns=differentiated_columns + ["v_family"],
inplace=True,
errors="ignore",
)
Expand Down
3 changes: 2 additions & 1 deletion netam/sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,8 @@ def assert_pcp_valid(parent, child, aa_mask=None):
child, aa_mask
):
raise ValueError(
"Parent-child pair matches after masking codons containing ambiguities"
"Parent-child nucleotide sequence pair matches after masking codons containing ambiguities. "
"To avoid this try filtering data using `netam.sequences.assert_pcp_valid`."
)


Expand Down

0 comments on commit 25a3a56

Please sign in to comment.