Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Coverage Statistics Calculation Script for Mosdepth #460

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docker/lr-mosdepth/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,6 @@ RUN curl https://sdk.cloud.google.com | bash

# activate conda environment
RUN echo "source activate lr-mosdepth" > ~/.bashrc

# copy the script
COPY coverage_stats.py /
166 changes: 166 additions & 0 deletions docker/lr-mosdepth/coverage_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import argparse
import gzip
import json
import logging

import numpy as np
import pandas as pd


def parse_arguments():
parser = argparse.ArgumentParser(description="Process coverage values.")
parser.add_argument("mosdepth_regions", type=str,
help="Mosdepth output file containing coverage values.")
parser.add_argument("--cov_col", type=int, default=4,
help="Column holding the coverage values.")
parser.add_argument("--output_prefix", type=str,
help="Prefix to use for output files.")
parser.add_argument("--round", type=int, default=2, help="Rounding precision.")
parser.add_argument("--debug", action="store_true", help="Debug mode.")
return parser.parse_args()


def calculate_summary_statistics(
df: pd.DataFrame, cov_col: int, round_precision: int
) -> dict:
"""
Calculate summary statistics for the coverage values.
@param df: Dataframe containing coverage values
@param cov_col: Column containing coverage values
@param round_precision: Rounding precision
@return:
"""

cov_data: pd.Series = df.iloc[:, cov_col - 1]
mean_val: float = cov_data.mean()
mad_val: float = np.mean(np.abs(cov_data - mean_val))

statistics = {
"cov_mean": round(mean_val, round_precision),
"cov_q1": round(cov_data.quantile(0.25), round_precision),
"cov_median": round(cov_data.median(), round_precision),
"cov_q3": round(cov_data.quantile(0.75), round_precision),
"cov_iqr": round(cov_data.quantile(0.75) - cov_data.quantile(0.25), round_precision),
"cov_stdev": round(cov_data.std(), round_precision),
"cov_mad": round(mad_val, round_precision),
"cov_percent_above_4x": percentage_greater_than_4x(df, cov_col, round_precision),
"cov_evenness_score": calculate_evenness_score(df, cov_col, round_precision)
}

# Replace Nan values with null
for key, value in statistics.items():
if pd.isna(value):
statistics[key] = "null"

return statistics


def percentage_greater_than_4x(
df: pd.DataFrame, cov_col: int, round_precision: int
) -> float:
"""
Calculate the percentage of coverage values greater than 4x.
@param df: Dataframe containing coverage values
@param cov_col: Column containing coverage values
@param round_precision: Rounding precision
@return:
"""
logging.debug("Calculating percentage of coverage values greater than 4x")
total_bases = len(df)
bases_above_4x = df[df.iloc[:, cov_col - 1] > 4].shape[0]
percent_above_4x = round(bases_above_4x / total_bases, round_precision)

logging.info("Percentage of coverage values greater than 4x: %s", percent_above_4x)

return percent_above_4x


def calculate_evenness_score(
df: pd.DataFrame, cov_col: int, round_precision: int) -> float:
"""
Calculate the evenness score.
Konrad Oexle, Journal of Human Genetics 2016, Evaulation of the evenness score in NGS.
https://www.nature.com/articles/jhg201621

@param df: Dataframe containing coverage values
@param cov_col: Column containing coverage values
@param round_precision: Rounding precision
@return:
"""
logging.debug("Calculating evenness score")
mean_coverage = df.iloc[:, cov_col - 1].mean()
# Get the coverage values that are less than or equal to the mean coverage
d2 = df[df.iloc[:, cov_col - 1] <= mean_coverage].iloc[:, cov_col - 1].tolist()
# count of coverage values that are less than or equal to the mean coverage
d2_count = len(d2)
# sum of coverage values that are less than or equal to the mean coverage
d2_sum = sum(d2)
# total number of coverages
coverage_count = len(df)

logging.debug("Mean coverage: %s", mean_coverage)
logging.debug("D2 count: %s", d2_count)
logging.debug("D2 sum: %s", d2_sum)
logging.debug("Coverage count: %s", coverage_count)

if mean_coverage != 0:
evenness_score = round(1 - (d2_count - (d2_sum / mean_coverage)) / coverage_count, round_precision)
else:
logging.warning("Mean coverage is zero. Evenness score will be set to null.")
evenness_score = "null"

logging.info("Evenness score: %s", evenness_score)

return evenness_score


def open_file(file_path: str) -> pd.DataFrame:
"""
Open a file and return a dataframe
@param file_path: Path to the file
@return:
"""

if file_path.endswith(".gz"):
with gzip.open(file_path, "rt") as f:
df = pd.read_csv(f, sep="\t", header=None)
else:
with open(file_path, "rt") as f:
df = pd.read_csv(f, sep="\t", header=None)

return df


def main():
args = parse_arguments()
if args.debug:
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)

logging.info("Arguments: %s", args)
logging.info("Calculating coverage statistics")

prefix = args.output_prefix if args.output_prefix else args.mosdepth_regions.split(".")[0]

df: pd.DataFrame = open_file(args.mosdepth_regions)
logging.info("Opened file: %s", args.mosdepth_regions)

logging.debug("Dataframe shape: %s", df.shape)
logging.debug("Dataframe columns: %s", df.columns)
logging.debug(f"Dataframe Head\n{df.head()}")

statistics = calculate_summary_statistics(df, args.cov_col, args.round)

logging.info("Summary statistics: %s", statistics)

summary_file = f"{prefix}.cov_stat_summary.json"

logging.info("Writing summary statistics to file: %s", summary_file)
with open(summary_file, "w") as f:
json.dump(statistics, f)
f.write("\n")


if __name__ == "__main__":
main()
2 changes: 2 additions & 0 deletions docker/lr-mosdepth/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ dependencies:
- pyvcf=0.6.8=py36_0
- readline=8.1=h27cfd23_0
- samtools=1.11=h6270b1f_0
- numpy=1.19.5
- pandas=1.1.5
- setuptools=52.0.0=py36h06a4308_0
- six=1.15.0=py36h06a4308_0
- sqlite=3.35.4=hdfb4753_0
Expand Down
Loading