Skip to content

Commit

Permalink
src/preprocessing.jl: Export deduplicate_data and make it optional, a…
Browse files Browse the repository at this point in the history
…s input can be already deduplicated.
  • Loading branch information
mashu committed Oct 18, 2024
1 parent b0a5da5 commit 7a5c66d
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 26 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LineageCollapse"
uuid = "e38bdfdf-80f5-4f0c-93e0-53dd02ee37b8"
authors = ["Mateusz Kaduk <[email protected]> and contributors"]
version = "0.0.6"
version = "0.0.7"

[deps]
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
Expand Down
2 changes: 1 addition & 1 deletion src/LineageCollapse.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ module LineageCollapse
using BioSequences
using StringDistances

export load_data, preprocess_data, process_lineages, plot_diagnostics
export load_data, preprocess_data, deduplicate_data, process_lineages, plot_diagnostics
export DistanceMetric, ClusteringMethod
export HammingDistance, NormalizedHammingDistance, LevenshteinDistance, HierarchicalClustering
export compute_distance, compute_pairwise_distance, perform_clustering
Expand Down
53 changes: 42 additions & 11 deletions src/preprocessing.jl
Original file line number Diff line number Diff line change
@@ -1,16 +1,44 @@
"""
preprocess_data(df::DataFrame; min_d_region_length::Int=0)::DataFrame
deduplicate_data(df::DataFrame, use_barcode::Bool=false)::DataFrame
Deduplicate the input DataFrame based on sequence or sequence+barcode.
# Arguments
- `df::DataFrame`: Input DataFrame.
- `use_barcode::Bool=false`: Whether to use barcode for deduplication.
# Returns
- `DataFrame`: Deduplicated DataFrame.
"""
function deduplicate_data(df::DataFrame, use_barcode::Bool=false)::DataFrame
    # Barcode-aware deduplication requires both the caller's request and an
    # actual `barcode` column; the column lookup is skipped when not requested.
    barcode_keyed = use_barcode && (:barcode in propertynames(df))

    # Guard clause: without a usable barcode, rows are unique by sequence alone.
    if !barcode_keyed
        df = unique(df, :sequence)
        @info "Deduplicated using sequence only: $(nrow(df))"
        return df
    end

    # Rows are unique by the (sequence, barcode) pair.
    df = unique(df, [:sequence, :barcode])
    @info "Deduplicated using sequence and barcode: $(nrow(df))"
    return df
end

"""
preprocess_data(df::DataFrame; min_d_region_length::Union{Int,Nothing}=nothing, deduplicate::Bool=false, use_barcode::Bool=false)::DataFrame
Preprocess the input DataFrame by performing data cleaning and transformation.
# Arguments
- `df::DataFrame`: Input DataFrame.
- `min_d_region_length::Int=0`: Minimum length of the D region to keep.
- `min_d_region_length::Union{Int,Nothing}=nothing`: Minimum length of the D region to keep. If nothing, no filtering is applied.
- `deduplicate::Bool=false`: Whether to deduplicate the DataFrame.
- `use_barcode::Bool=false`: Whether to use barcode for deduplication (only applicable if deduplicate is true).
# Returns
- `DataFrame`: Preprocessed DataFrame.
"""
function preprocess_data(df::DataFrame; min_d_region_length::Int=0)::DataFrame
function preprocess_data(df::DataFrame;
min_d_region_length::Union{Int,Nothing}=nothing,
deduplicate::Bool=false,
use_barcode::Bool=false)::DataFrame
@info "Processing $(nrow(df)) rows"

# Remove rows with missing CDR3
Expand All @@ -21,19 +49,17 @@ function preprocess_data(df::DataFrame; min_d_region_length::Int=0)::DataFrame
df = filter(row -> row.stop_codon == false, df)
@info "Dropped stop codons: $(nrow(df))"

# Remove duplicate sequences
df = unique(df, :sequence)
@info "Dropped duplicated sequences: $(nrow(df))"

# Calculate D region
transform!(df,
[:sequence, :v_sequence_end, :j_sequence_start] =>
ByRow((seq, v_end, j_start) -> seq[v_end+1:j_start]) => :d_region
)

# Filter based on D region length
df = filter(row -> length(row.d_region) > min_d_region_length, df)
@info "Dropped short (≤$min_d_region_length) D region sequences: $(nrow(df))"
# Filter based on D region length if specified
if !isnothing(min_d_region_length)
df = filter(row -> length(row.d_region) > min_d_region_length, df)
@info "Dropped short (≤$min_d_region_length) D region sequences: $(nrow(df))"
end

# Extract first allele from v_call and j_call
transform!(df, :v_call => ByRow(x -> first(split(x, ","))) => :v_call_first)
Expand All @@ -42,5 +68,10 @@ function preprocess_data(df::DataFrame; min_d_region_length::Int=0)::DataFrame
# Calculate CDR3 length
transform!(df, :cdr3 => ByRow(length) => :cdr3_length)

# Deduplicate if requested
if deduplicate
df = deduplicate_data(df, use_barcode)
end

return df
end
end
68 changes: 55 additions & 13 deletions test/test_preprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,22 +13,40 @@ using LineageCollapse
cdr3 = ["CGAT", "CGAT", missing, "CTAT", "CGAT"],
v_sequence_end = [3, 3, 3, 3, 3],
j_sequence_start = [6, 6, 6, 6, 6],
stop_codon = [false, false, false, true, false]
stop_codon = [false, false, false, true, false],
barcode = ["BC1", "BC2", "BC3", "BC4", "BC5"]
)

result = preprocess_data(df)
@testset "Default parameters" begin
result = preprocess_data(df)

@test result isa DataFrame
@test nrow(result) == 3 # 5 original rows - 1 missing CDR3 - 1 stop codon
@test :d_region in propertynames(result)
@test :v_call_first in propertynames(result)
@test :j_call_first in propertynames(result)
@test :cdr3_length in propertynames(result)

@test all(result.d_region .== "GAT")
@test result.v_call_first == ["IGHV1-1*01", "IGHV1-2*01", "IGHV1-1*01"]
@test result.j_call_first == ["IGHJ1*01", "IGHJ2*01", "IGHJ1*01"]
@test all(result.cdr3_length .== 4)
end

@test result isa DataFrame
@test nrow(result) == 2 # 5 original rows - 1 missing CDR3 - 1 stop codon - 1 duplicate sequence
@test :d_region in propertynames(result)
@test :v_call_first in propertynames(result)
@test :j_call_first in propertynames(result)
@test :cdr3_length in propertynames(result)

@test all(result.d_region .== "GAT")
@test result.v_call_first == ["IGHV1-1*01", "IGHV1-2*01"]
@test result.j_call_first == ["IGHJ1*01", "IGHJ2*01"]
@test all(result.cdr3_length .== 4)
@testset "With min_d_region_length" begin
result = preprocess_data(df, min_d_region_length=4)
@test nrow(result) == 0
end

@testset "With deduplication" begin
result = preprocess_data(df, deduplicate=true)
@test nrow(result) == 2 # 3 rows after initial preprocessing, 2 after deduplication
end

@testset "With deduplication and barcode" begin
result = preprocess_data(df, deduplicate=true, use_barcode=true)
@test nrow(result) == 3 # 3 rows after initial preprocessing, still 3 after deduplication with barcode
end
end

@testset "preprocess_data with custom min_d_region_length" begin
Expand Down Expand Up @@ -66,4 +84,28 @@ using LineageCollapse

@test nrow(result) == 0
end

@testset "deduplicate_data function" begin
    # Fixture: three rows share the sequence "ATCG"; two of those (seq1, seq3)
    # also share the barcode "BC1". The fourth row has a distinct sequence.
    df = DataFrame(
        sequence_id = ["seq1", "seq2", "seq3", "seq4"],
        sequence = ["ATCG", "ATCG", "ATCG", "ATCT"],
        barcode = ["BC1", "BC2", "BC1", "BC4"]
    )

    @testset "Without barcode" begin
        # Only the two distinct sequences remain.
        @test nrow(deduplicate_data(df)) == 2
    end

    @testset "With barcode" begin
        # Distinct (sequence, barcode) pairs: ATCG/BC1, ATCG/BC2, ATCT/BC4.
        @test nrow(deduplicate_data(df, true)) == 3
    end

    @testset "With barcode, but no barcode column" begin
        without_barcode = select(df, Not(:barcode))
        deduped = deduplicate_data(without_barcode, true)
        @test nrow(deduped) == 2 # Falls back to sequence-only deduplication
    end
end
end

2 comments on commit 7a5c66d

@mashu
Copy link
Owner Author

@mashu mashu commented on 7a5c66d Oct 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register

Release notes:

Deduplication is now optional and can use barcodes.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/117550

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v0.0.7 -m "<description of version>" 7a5c66dbf89ece772f7303d1d8ee57786c78df2f
git push origin v0.0.7

Please sign in to comment.