Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ instance/

# Sphinx documentation
docs/build/
docs/source/auto_examples/
output_files/

# PyBuilder
target/
Expand Down Expand Up @@ -128,6 +130,9 @@ venv.bak/
.dmypy.json
dmypy.json

# ruff
.ruff_cache/

# Pyre type checker
.pyre/

Expand Down
Binary file removed docs/source/.DS_Store
Binary file not shown.
9 changes: 8 additions & 1 deletion docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,14 @@ from scratch, you may experience some differences in terms of retrieved values o
Currently, available features broadly correspond to the default settings of DESeq2 (v1.34.0) for single-factor and
multi-factor analysis (with categorical or continuous factors) using Wald tests, with an
optional `apeGLM <https://academic.oup.com/bioinformatics/article/35/12/2084/5159452>`_ LFC shrinkage step
:cite:p:`zhu2019heavy`. We plan to implement more in the near future. In case there is a feature you would particularly
:cite:p:`zhu2019heavy`.

PyDESeq2 also supports `pytximport <https://pytximport.complextissue.com/en/stable/>`_-derived
normalization factors :cite:p:`kuehlGeneCountEstimation2024b`, enabling accurate differential expression analysis
with transcript-level quantification data from tools like Salmon, Kallisto, or RSEM. When explicitly enabled,
this feature accounts for gene length differences between samples due to differential isoform usage.

We plan to implement more features in the near future. In case there is a feature you would particularly
like to be implemented, feel free to open an issue on GitHub.


Expand Down
14 changes: 13 additions & 1 deletion docs/source/refs.bib
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,16 @@ @article{zhu2019heavy
year={2019},
doi={10.1093/bioinformatics/bty895},
publisher={Oxford University Press}
}
}

@article{kuehlGeneCountEstimation2024b,
title = {Gene Count Estimation with Pytximport Enables Reproducible Analysis of Bulk {{RNA}} Sequencing Data in {{Python}}},
author = {Kuehl, Malte and Wong, Milagros N and Wanner, Nicola and Bonn, Stefan and Puelles, Victor G},
year = {2024},
journal = {Bioinformatics},
volume = {40},
number = {12},
pages = {btae700},
doi = {10.1093/bioinformatics/btae700},
publisher = {Oxford University Press}
}
164 changes: 164 additions & 0 deletions examples/plot_pytximport_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
"""
pytximport Integration Example
==============================

This example demonstrates how to use PyDESeq2 with pytximport-derived data
for differential expression analysis with transcript-level quantification data.

pytximport enables the import of transcript-level abundance estimates from
popular RNA-seq quantification tools like Salmon, Kallisto, or RSEM into
PyDESeq2, while accounting for gene length differences between samples
due to differential isoform usage.
"""

import anndata as ad
import matplotlib.pyplot as plt
import numpy as np

from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

# %%
# Read AnnData object
# -------------------
#
# Make sure that you have rounded the counts in .X to integers, as DESeq2 requires
# integer counts and that pytximport was used with counts_from_abundance=None
# (raw counts) to generate the AnnData object.

adata = ad.read_h5ad("../tests/data/pytximport/test_pytximport.h5ad")

# %%
# Standard usage without pytximport
# ---------------------------------
#
# By default, PyDESeq2 uses standard size factor normalization.

dds_standard = DeseqDataSet(adata=adata, design="~condition")
dds_standard.fit_size_factors()
print(f"Standard normalization: {'normalization_factors' not in dds_standard.obsm}")

# %%
# Explicit pytximport mode
# ------------------------
#
# To use pytximport normalization factors, explicitly set from_pytximport=True.

dds_explicit = DeseqDataSet(adata=adata, design="~condition", from_pytximport=True)
dds_explicit.fit_size_factors()
print(f"Pytximport normalization: {'normalization_factors' in dds_explicit.obsm}")

# %%
# Size factor estimation with normalization factors
# -------------------------------------------------
#
# When pytximport data is used, PyDESeq2 computes normalization factors
# that account for both library size and gene length differences.

dds_explicit.fit_size_factors()

print(f"Size factors computed: {'size_factors' in dds_explicit.obs}")
print(f"Normalization factors computed: {'normalization_factors' in dds_explicit.obsm}")

# Check normalization factors properties
norm_factors = dds_explicit.obsm["normalization_factors"]
size_factors = dds_explicit.obs["size_factors"]

print(f"Normalization factors shape: {norm_factors.shape}")
print(f"Size factors shape: {size_factors.shape}")
print(f"Size factors: {size_factors.values}")
print(f"Norm factors range: {norm_factors.min():.3f} - {norm_factors.max():.3f}")

# %%
# Plot distribution of normalization factors
# ------------------------------------------
#
# Normalization factors should vary around 1, reflecting gene length
# differences between samples.
plt.figure(figsize=(8, 5))
plt.hist(
norm_factors.flatten(),
bins=list(np.arange(0.5, 1.6, 0.1).astype(float)),
color="skyblue",
edgecolor="black",
alpha=0.7,
)
plt.axvline(x=1, color="red", linestyle="--", linewidth=2, label="x = 1")
plt.title("Distribution of Normalization Factors")
plt.xlabel("Normalization Factor")
plt.ylabel("Frequency")
plt.legend()
plt.tight_layout()
plt.show()

# %%
# Full differential expression analysis
# -------------------------------------
#
# The complete DESeq2 pipeline works seamlessly with pytximport data.

# Run the full DESeq2 pipeline
dds_explicit.deseq2()

# Perform statistical testing
stat_res = DeseqStats(dds_explicit, contrast=["condition", "ko", "wt"])
stat_res.summary()
results_df = stat_res.results_df

print("Differential expression results:")
print(f"Total genes analyzed: {len(results_df)}")
print(f"Significant genes (padj < 0.05): {sum(results_df['padj'] < 0.05)}")

# Display top results
print("\nTop 5 most significant genes:")
top_results = results_df.sort_values("padj").head()
print(top_results[["baseMean", "log2FoldChange", "pvalue", "padj"]])
stat_res.plot_MA()

# %%
# Differential expression analysis without normalization factors
# --------------------------------------------------------------
#
# Compare results with standard DESeq2 normalization (without length correction).

# Read standard AnnData (generated with counts_from_abundance="length_scaled_tpm")
adata_standard = ad.read_h5ad(
"../tests/data/pytximport/test_pytximport_length_scaled.h5ad"
)

dds_standard = DeseqDataSet(adata=adata_standard, design="~condition")
dds_standard.deseq2()

stat_res_standard = DeseqStats(dds_standard, contrast=["condition", "ko", "wt"])
stat_res_standard.summary()
results_standard = stat_res_standard.results_df
print("\nTop 5 most significant genes:")
top_results = results_standard.sort_values("padj").head()
print(top_results[["baseMean", "log2FoldChange", "pvalue", "padj"]])
stat_res_standard.plot_MA()

# %%
# Comparison with standard normalization
# --------------------------------------
#
# Compare results with standard DESeq2 normalization (without length correction).
pytximport_sig = sum(results_df["padj"] < 0.05)
standard_sig = sum(results_standard["padj"] < 0.05)

print("\nComparison of significant genes (padj < 0.05):")
print(f"pytximport normalization: {pytximport_sig}")
print(f"Standard normalization: {standard_sig}")

# %%
# Variance Stabilizing Transformation (VST)
# -----------------------------------------
#
# VST also works with pytximport normalization factors.

dds_vst = DeseqDataSet(adata=adata, design="~condition", from_pytximport=True)
dds_vst.vst()

vst_counts = dds_vst.layers["vst_counts"]
print("\nVST transformation completed")
print(f"VST counts shape: {vst_counts.shape}")
print(f"VST counts range: {vst_counts.min():.3f} - {vst_counts.max():.3f}")
Loading
Loading