Skip to content

Commit

Permalink
src/lineage_processing.jl: Update renaming CDR3_frequency to clone_fr…
Browse files Browse the repository at this point in the history
…equency and explaining what it is.
  • Loading branch information
mashu committed Oct 28, 2024
1 parent c91385f commit 7ef9a54
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 28 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LineageCollapse"
uuid = "e38bdfdf-80f5-4f0c-93e0-53dd02ee37b8"
authors = ["Mateusz Kaduk <[email protected]> and contributors"]
version = "0.0.10"
version = "0.0.11"

[deps]
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ preprocessed_df = preprocess_data(df)
result = process_lineages(preprocessed_df, clustering_method=HierarchicalClustering(0.2))

# Collapse identical CDR3s but only within each cluster
collapsed = combine(groupby(result, [:d_region, :lineage_id, :j_call_first, :v_call_first, :cdr3]), :cdr3 => length => :count)
# collapsed = collapse_lineages(lineages, 0.2, :soft)
collapsed = collapse_lineages(lineages, 0.2, :hardrest)
```

## Input Requirements
Expand Down
8 changes: 4 additions & 4 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@ result1 = process_lineages(preprocessed_df)

# Assign lineages using length Normalized Hamming distance
# with Hierarchical clustering but different CDR3 similarity cutoff
result2 = process_lineages(preprocessed_df,
distance_metric=NormalizedHammingDistance(),
result2 = process_lineages(preprocessed_df,
distance_metric=NormalizedHammingDistance(),
clustering_method=HierarchicalClustering(0.1))
# Collapse identical CDR3s but only within each cluster
collapsed = combine(groupby(result, [:d_region, :lineage_id, :j_call_first, :v_call_first, :cdr3]), :cdr3 => length => :count)
# Collapse
collapsed_df = collapse_lineages(lineages, 0.2, :soft)

# Generate diagnostic plots (requires CairoMakie)
# using CairoMakie
Expand Down
36 changes: 20 additions & 16 deletions src/lineage_processing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ end
Compute pairwise distances between sequences using the specified distance metric.
"""
function compute_pairwise_distance(
metric::M,
metric::M,
sequences::AbstractVector{S}
)::Matrix{Float32} where {M <: Union{DistanceMetric, NormalizedDistanceMetric}, S <: LongSequence{DNAAlphabet{4}}}
n = length(sequences)
Expand All @@ -82,14 +82,14 @@ function perform_clustering(method::HierarchicalClustering, linkage::Symbol, dis
end

"""
process_lineages(df::DataFrame;
process_lineages(df::DataFrame;
distance_metric::Union{DistanceMetric, NormalizedDistanceMetric} = NormalizedHammingDistance(),
clustering_method::ClusteringMethod = HierarchicalClustering(0.1),
linkage::Symbol = :single)::DataFrame
Process lineages from a DataFrame of CDR3 sequences.
"""
function process_lineages(df::DataFrame;
function process_lineages(df::DataFrame;
distance_metric::Union{DistanceMetric, NormalizedDistanceMetric} = NormalizedHammingDistance(),
clustering_method::ClusteringMethod = HierarchicalClustering(0.2),
linkage::Symbol = :single)::DataFrame
Expand Down Expand Up @@ -132,29 +132,33 @@ function process_lineages(df::DataFrame;
end

"""
collapse_lineages(df::DataFrame, cdr3_frequency_threshold::Float64, collapse_strategy::Symbol=:hardest)
collapse_lineages(df::DataFrame, clone_frequency_threshold::Float64, collapse_strategy::Symbol=:hardest)
Collapse lineages in a DataFrame based on CDR3 sequence frequency and a specified collapse strategy.
Collapse lineages in a DataFrame based on clone frequency and a specified collapse strategy.
# Arguments
- `df::DataFrame`: Input DataFrame containing lineage data. Must have columns [:d_region, :lineage_id, :j_call_first, :v_call_first, :cdr3].
- `cdr3_frequency_threshold::Float64`: Minimum frequency threshold for CDR3 sequences (0.0 to 1.0).
- `clone_frequency_threshold::Float64`: Minimum frequency threshold for clones (0.0 to 1.0).
- `collapse_strategy::Symbol=:hardest`: Strategy for collapsing lineages. Options are:
- `:hardest`: Select only the most frequent sequence for each lineage.
- `:soft`: Select all sequences that meet or exceed the `cdr3_frequency_threshold`.
- `:hardest`: Select only the most frequent clone for each lineage.
- `:soft`: Select all clones that meet or exceed the `clone_frequency_threshold`.
# Returns
- `DataFrame`: Collapsed lineage data.
- `DataFrame`: Collapsed lineage data containing a new column:
- `clone_frequency`: Represents the relative frequency of each clone within its lineage,
calculated as (count of specific clone) / (total sequences in lineage).
A clone is defined by unique combination of D region, J call, V call, and CDR3 sequence.
Values range from 0.0 to 1.0, with higher values indicating more abundant clones in the lineage.
# Example
```julia
lineages = DataFrame(...) # Your input data
collapsed = collapse_lineages(lineages, 0.1, :soft)
```
"""
function collapse_lineages(df::DataFrame, cdr3_frequency_threshold::Float64, collapse_strategy::Symbol=:hardest)
if !(0.0 <= cdr3_frequency_threshold <= 1.0)
throw(ArgumentError("cdr3_frequency_threshold must be between 0.0 and 1.0"))
function collapse_lineages(df::DataFrame, clone_frequency_threshold::Float64, collapse_strategy::Symbol=:hardest)
if !(0.0 <= clone_frequency_threshold <= 1.0)
throw(ArgumentError("clone_frequency_threshold must be between 0.0 and 1.0"))
end
if !(collapse_strategy in [:hardest, :soft])
throw(ArgumentError("Invalid collapse strategy. Use :hardest or :soft."))
Expand All @@ -164,22 +168,22 @@ function collapse_lineages(df::DataFrame, cdr3_frequency_threshold::Float64, col
counted = combine(grouped, nrow => :sequence_count)

lineage_grouped = groupby(counted, :lineage_id)
with_frequency = transform(lineage_grouped, :sequence_count => (x -> x ./ sum(x)) => :frequency)
with_frequency = transform(lineage_grouped, :sequence_count => (x -> x ./ sum(x)) => :clone_frequency)

df_with_freq = leftjoin(df, with_frequency,
on=[:d_region, :lineage_id, :j_call_first, :v_call_first, :cdr3],
makeunique=true)

collapsed = if collapse_strategy == :hardest
combine(groupby(df_with_freq, :lineage_id)) do group
row_idx = argmax(group.frequency) # Single highest frequency
row_idx = argmax(group.clone_frequency) # Single highest frequency
group[row_idx:row_idx, :]
end
else
combine(groupby(df_with_freq, :lineage_id)) do group
group[group.frequency .>= cdr3_frequency_threshold, :]
group[group.clone_frequency .>= clone_frequency_threshold, :]
end
end

return collapsed
end
end
12 changes: 6 additions & 6 deletions test/test_lineage_processing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ using LinearAlgebra
if n != length(clustering2)
error("Clusterings must have the same length")
end

a, b = 0, 0
for i in 1:n-1
for j in i+1:n
Expand All @@ -113,7 +113,7 @@ using LinearAlgebra
end
end
end

return a / (a + b)
end

Expand Down Expand Up @@ -150,10 +150,10 @@ using LinearAlgebra
@test "GCTT" in result.cdr3 # Should keep GCTT in lineage 2 (20%, but rounded to 30%)

# Check frequencies
@test result[result.cdr3 .== "ATCG", :frequency][1] 0.6666666666666666 atol=1e-6
@test result[result.cdr3 .== "ATTG", :frequency][1] 0.3333333333333333 atol=1e-6
@test result[result.cdr3 .== "GCTA", :frequency][1] 0.6666666666666666 atol=1e-6
@test result[result.cdr3 .== "GCTT", :frequency][1] 0.3333333333333333 atol=1e-6
@test result[result.cdr3 .== "ATCG", :clone_frequency][1] 0.6666666666666666 atol=1e-6
@test result[result.cdr3 .== "ATTG", :clone_frequency][1] 0.3333333333333333 atol=1e-6
@test result[result.cdr3 .== "GCTA", :clone_frequency][1] 0.6666666666666666 atol=1e-6
@test result[result.cdr3 .== "GCTT", :clone_frequency][1] 0.3333333333333333 atol=1e-6
end

@testset "Edge cases" begin
Expand Down

2 comments on commit 7ef9a54

@mashu
Copy link
Owner Author

@mashu mashu commented on 7ef9a54 Oct 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register

Release notes:

Frequency renamed to clone_frequency and updates to docs, as that's more informative.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/118201

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.0.11 -m "<description of version>" 7ef9a547479464d5322da1ae7d142a8196951054
git push origin v0.0.11

Please sign in to comment.