Initial commit.

mashu · Oct 10, 2024 · cb9a9e2 · cb9a9e2
1 parent ce0a9ff
commit cb9a9e2
Show file tree

Hide file tree

Showing 8 changed files with 267 additions and 4 deletions.
diff --git a/Project.toml b/Project.toml
@@ -3,7 +3,21 @@ uuid = "e38bdfdf-80f5-4f0c-93e0-53dd02ee37b8"
 authors = ["Mateusz Kaduk <[email protected]> and contributors"]
 version = "0.0.1"
 
+[deps]
+BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
+
 [compat]
+BioSequences = "3.1.6"
+CSV = "0.10.14"
+Clustering = "0.15.7"
+DataFrames = "1.7.0"
+Plots = "1.40.8"
+ProgressMeter = "1.10.2"
 julia = "1.10"
 
 [extras]

diff --git a/README.md b/README.md
@@ -1,6 +1,92 @@
-# LinageCollapse
-
+# LineageCollapse.jl
 [![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://mashu.github.io/LinageCollapse.jl/stable/)
 [![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://mashu.github.io/LinageCollapse.jl/dev/)
 [![Build Status](https://github.com/mashu/LinageCollapse.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/mashu/LinageCollapse.jl/actions/workflows/CI.yml?query=branch%3Amain)
 [![Coverage](https://codecov.io/gh/mashu/LinageCollapse.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/mashu/LinageCollapse.jl)
+
+LineageCollapse.jl is a high-performance Julia package for performing standardized soft lineage collapsing on immune repertoire sequencing data. This package provides a robust and efficient implementation of lineage collapsing, a common task in many types of immunological analyses, including antibody repertoire analysis, T cell receptor studies, and immune repertoire diversity assessment.
+
+## Features
+
+- Fast and memory-efficient processing of large-scale immune repertoire data
+- Implements a standardized soft lineage collapsing algorithm
+- Supports AIRR-compliant input data formats
+- Flexible parameter configuration for customized analysis
+- Built-in visualization tools for diagnostic plots
+- Multithreaded processing for improved performance on multi-core systems
+
+## Installation
+
+You can install LineageCollapse.jl using Julia's package manager. From the Julia REPL, type `]` to enter the Pkg REPL mode and run:
+
+```
+pkg> add LineageCollapse
+```
+
+## Quick Start
+
+```julia
+using LineageCollapse
+
+# Load and preprocess data
+df = load_data("path/to/your/airr_data.tsv.gz")
+preprocessed_df = preprocess_data(df)
+
+# Perform lineage collapsing
+collapsed_df = process_lineages(preprocessed_df)
+
+# Generate diagnostic plots
+plot_diagnostics(collapsed_df)
+```
+
+## Input Requirements
+
+LineageCollapse.jl requires input data to be in AIRR-C (Adaptive Immune Receptor Repertoire - Community) format, typically obtained from tools like IgBLAST. The following columns are required:
+
+- sequence
+- v_sequence_end
+- j_sequence_start
+- cdr3
+- v_call
+- j_call
+- stop_codon
+
+## Algorithm Overview
+
+1. **Grouping**: Sequences are grouped by `v_call`, `j_call`, and `cdr3_length`.
+2. **Distance Calculation**: Pairwise Hamming distances are computed between CDR3 sequences within each group.
+3. **Clustering**: Average linkage hierarchical clustering is performed on the distance matrix.
+4. **Cluster Formation**: The hierarchical tree is cut at 10% of its height from the tip to form clusters.
+5. **Allelic Ratio Filtering**: To preserve consistent variations in the D-region (between the end of V and start of J), an allelic ratio filter (default 50%) is applied to discard outliers.
+
+## Customization
+
+LineageCollapse.jl offers various parameters for customizing the analysis:
+
+```julia
+collapsed_df = process_lineages(preprocessed_df,
+                                cutoff_ratio=0.1,  # Tree cutting height ratio
+                                allele_ratio=0.5)  # Minimum allelic ratio
+```
+
+## Visualization
+
+The `plot_diagnostics` function generates a set of diagnostic plots to help assess the results of lineage collapsing:
+
+- Cluster size distribution
+- CDR3 length vs. cluster size
+- CDR3 frequency distribution
+- Top 10 V genes by count
+
+## Contributing
+
+Contributions to LineageCollapse.jl are welcome! Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on how to contribute to this project.
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details.
+
+## Acknowledgments
+
+- This package was inspired by the need for standardized lineage collapsing in immune repertoire analysis.
+- We thank the AIRR Community for their efforts in standardizing immune repertoire data formats and analysis methods.
diff --git a/src/LinageCollapse.jl b/src/LinageCollapse.jl
@@ -1,5 +1,15 @@
 module LinageCollapse
+    using CSV
+    using DataFrames
+    using ProgressMeter
+    using Clustering
+    using BioSequences
+    using Plots
 
-# Write your package code here.
+    export load_data, preprocess_data, process_lineages, plot_diagnostics
 
+    include("data_loading.jl")
+    include("preprocessing.jl")
+    include("lineage_processing.jl")
+    include("visualization.jl")
 end
diff --git a/src/data_loading.jl b/src/data_loading.jl
@@ -0,0 +1,18 @@
+"""
+    load_data(filepath::String; delimiter::Char='\t', required_columns=[:sequence, :v_sequence_end, :j_sequence_start, :cdr3, :v_call, :j_call, :stop_codon])::DataFrame
+
+Load data from a file and return a DataFrame.
+
+# Arguments
+- `filepath::String`: Path to the data file.
+- `delimiter::Char='\t'`: Delimiter used in the data file (default: tab).
+- `required_columns::Vector{Symbol}=[:sequence, :v_sequence_end, :j_sequence_start, :cdr3, :v_call, :j_call, :stop_codon]`: Required columns in the data file.
+
+# Returns
+- `DataFrame`: DataFrame containing the loaded data.
+"""
+function load_data(filepath::String; 
+                   delimiter::Char='\t', 
+                   required_columns=[:sequence, :v_sequence_end, :j_sequence_start, :cdr3, :v_call, :j_call, :stop_codon])::DataFrame
+    return CSV.File(filepath, delim=delimiter, select=required_columns) |> DataFrame
+end
diff --git a/src/lineage_processing.jl b/src/lineage_processing.jl
@@ -0,0 +1,81 @@
+"""
+    hamming(x::LongDNA{4}, y::LongDNA{4})::Float64
+
+Calculate the Hamming distance between two `LongDNA{4}` sequences.
+
+# Returns
+- `Float64`: Hamming distance.
+"""
+function hamming(x::LongDNA{4}, y::LongDNA{4})::Float64
+    xor_result = x.data .⊻ y.data
+    return sum(count_ones.(xor_result)) / 2
+end
+
+"""
+    pairwise_hamming(sequences::Vector{LongDNA{4}})::Array{Float64, 2}
+
+Compute the pairwise Hamming distance matrix for a vector of `LongDNA{4}` sequences.
+
+# Returns
+- `Array{Float64, 2}`: Symmetric distance matrix.
+"""
+function pairwise_hamming(sequences::Vector{LongDNA{4}})
+    n = length(sequences)
+    dist_matrix = zeros(Float64, n, n)
+
+    Threads.@threads for i in 1:n
+        for j in i+1:n
+            dist = hamming(sequences[i], sequences[j])
+            dist_matrix[i, j] = dist
+            dist_matrix[j, i] = dist
+        end
+    end
+
+    return dist_matrix
+end
+
+"""
+    process_lineages(df::DataFrame; 
+                     cutoff_ratio::Float64=0.1, 
+                     allele_ratio::Float64=0.5)::DataFrame
+
+Process lineages in the input DataFrame.
+
+# Arguments
+- `df::DataFrame`: Input DataFrame.
+- `cutoff_ratio::Float64=0.1`: Ratio to determine the cutoff height for hierarchical clustering.
+- `allele_ratio::Float64=0.5`: Minimum ratio of CDR3 frequency to consider.
+
+# Returns
+- `DataFrame`: Processed DataFrame with lineage information.
+"""
+function process_lineages(df::DataFrame; 
+                          cutoff_ratio::Float64=0.1, 
+                          allele_ratio::Float64=0.5)::DataFrame
+    grouped = groupby(df, [:v_call_first, :j_call_first, :cdr3_length])
+    processed_groups = Vector{DataFrame}()
+
+    @showprogress "Processing lineages" for group in grouped
+        if nrow(group) > 1
+            dist = pairwise_hamming(LongDNA{4}.(group.cdr3))
+            hclusters = hclust(dist, linkage=:average)
+            maximum_height = maximum(hclusters.height)
+            cutoff = maximum_height * cutoff_ratio
+            group[!, :cluster] = cutree(hclusters, h=cutoff)
+        else
+            group[!, :cluster] .= 1
+        end
+
+        cluster_grouped = groupby(group, :cluster)
+        for cgroup in cluster_grouped
+            cgroup[!, :cluster_size] .= nrow(cgroup)
+            cgroup = combine(groupby(cgroup, [:v_call_first, :j_call_first, :cluster, :cdr3_length, :cdr3, :d_region, :cluster_size]), nrow => :cdr3_count)
+            transform!(groupby(cgroup, :cluster), :cdr3_count => maximum => :max_cdr3_count)
+            transform!(groupby(cgroup, :cluster), [:cdr3_count, :max_cdr3_count] => ((count, max_count) -> count ./ max_count) => :cdr3_frequency)
+            filter!(row -> row.cdr3_frequency > allele_ratio, cgroup)
+            push!(processed_groups, cgroup)
+        end
+    end
+
+    return vcat(processed_groups...)
+end
diff --git a/src/preprocessing.jl b/src/preprocessing.jl
@@ -0,0 +1,35 @@
+"""
+    preprocess_data(df::DataFrame; min_d_region_length::Int=9)::DataFrame
+
+Preprocess the input DataFrame by performing data cleaning and transformation.
+
+# Arguments
+- `df::DataFrame`: Input DataFrame.
+- `min_d_region_length::Int=9`: Minimum length of the D sequence to keep.
+
+# Returns
+- `DataFrame`: Preprocessed DataFrame.
+"""
+function preprocess_data(df::DataFrame; min_d_region_length::Int=9)::DataFrame
+    @info "Processing $(nrow(df)) rows"
+    dropmissing!(df, [:cdr3])
+    @info "Dropped missing CDR3 rows: $(nrow(df))"
+    filter!(x->x.stop_codon == false, df)
+    @info "Dropped stop codons: $(nrow(df))"
+    unique!(df, :sequence)
+    @info "Dropped duplicated sequences: $(nrow(df))"
+
+    transform!(df,
+        [:sequence, :v_sequence_end, :j_sequence_start] =>
+        ByRow((seq, v_end, j_start) -> seq[v_end:j_start]) => :d_region,
+    )
+
+    filter!(row -> length(row.d_region) > min_d_region_length, df)
+    @info "Dropped short (<$min_d_region_length) D region sequences: $(nrow(df))"
+
+    df = transform(df, :v_call => ByRow(x -> first(split(x, ","))) => :v_call_first)
+    df = transform(df, :j_call => ByRow(x -> first(split(x, ","))) => :j_call_first)
+    transform!(df, :cdr3 => ByRow(length) => :cdr3_length)
+
+    return df
+end
diff --git a/src/visualization.jl b/src/visualization.jl
@@ -0,0 +1,19 @@
+"""
+    plot_diagnostics(df::DataFrame)
+
+Generate diagnostic plots for the processed data.
+
+# Arguments
+- `df::DataFrame`: Processed DataFrame with lineage information.
+
+# Returns
+- `Plots.Plot`: A composite plot with multiple diagnostic visualizations.
+"""
+function plot_diagnostics(df::DataFrame)
+    p1 = histogram(df.cluster_size, title="Cluster Size Distribution", xlabel="Cluster Size", ylabel="Frequency")
+    p2 = scatter(df.cdr3_length, df.cluster_size, title="CDR3 Length vs Cluster Size", xlabel="CDR3 Length", ylabel="Cluster Size")
+    p3 = histogram(df.cdr3_frequency, title="CDR3 Frequency Distribution", xlabel="CDR3 Frequency", ylabel="Count")
+    p4 = bar(sort(combine(groupby(df, :v_call_first), nrow), :nrow, rev=true)[1:10, :], x=:v_call_first, y=:nrow, title="Top 10 V Genes", xlabel="V Gene", ylabel="Count")
+
+    return plot(p1, p2, p3, p4, layout=(2,2), size=(1200,800))
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -2,5 +2,5 @@ using LinageCollapse
 using Test
 
 @testset "LinageCollapse.jl" begin
-    # Write your tests here.
+    @test 1 == 1
 end