Skip to content

Commit

Permalink
fix: cleaned plot_mapping_length script and added logging output, closes
Browse files Browse the repository at this point in the history
  • Loading branch information
Rina Ahmed-Begrich committed Jul 24, 2024
1 parent a8f6e3c commit 5fb31af
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 70 deletions.
2 changes: 1 addition & 1 deletion workflow/rules/preprocessing.smk
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ rule extract_mapping_length:
conda:
"../envs/plot_mapping_length.yml"
log:
"results/{mapping_status}/length_dist/{sample}_length_dist.log",
path="results/{mapping_status}/length_dist/log/{sample}_length_dist.log",
script:
"../scripts/plot_mapping_length.py"

Expand Down
2 changes: 1 addition & 1 deletion workflow/scripts/get_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def check_gff(input_gff, log=[], error=[]):
"Location or format of the supplied genome files was not correct, quit"
)
else:
log += [f"Module finished successfully\n"]
log += ["Module finished successfully\n"]
log = ["GET_GENOME: " + i for i in log]
with open(output_log, "w") as log_file:
log_file.write("\n".join(log))
130 changes: 62 additions & 68 deletions workflow/scripts/plot_mapping_length.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,18 @@
#!/usr/bin/python

import os
import sys
import argparse
# PLOT MAPPING LENGTH
# -----------------------------------------------------------------------------
#
# This script plots the length of all alignments contained in the bam file.
# Input bam file is checked for alignments.

import matplotlib
import pysam
import pandas as pd
from collections import Counter
from matplotlib import pyplot as plt


__authors__ = "Rina Ahmed-Begrich"
__mail__ = "[email protected]"
__program_name__ = "plot_mapping_length.py"
__version_info__ = ('0', '0', '1')
__version__ = '.'.join(__version_info__)


def get_aln_length(bamfile, sample):
lengths = []
for aln in bamfile.fetch():
Expand All @@ -25,64 +21,62 @@ def get_aln_length(bamfile, sample):

cnt_dict = Counter(lengths)
df = pd.DataFrame.from_dict(cnt_dict, orient='index').sort_index()
df.columns=[sample]
df.columns = [sample]
return df


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description=__doc__,
epilog="""
Example Usage:
plot_mapping_length.py -b <bam_file> -s <sample_name> -o <output_folder>
Written by %s (%s),
Max Planck Unit for the Science of Pathogens. Copyright (c) 2024.
Copyright Holder All Rights Reserved.
""" % (__authors__, __mail__),
formatter_class=argparse.RawDescriptionHelpFormatter)

input = str(snakemake.input["bam"])
sample_name = snakemake.wildcards.sample
output_tsv = snakemake.output["tsv"]
output_pdf = snakemake.output["pdf"]

try:
if not os.path.exists(input):
sys.stderr.write(
"Input file %s does not exist. " + "Exit forced\n." % input
)
sys.exit(1)
bamfile = pysam.AlignmentFile(input, "rb")
df = get_aln_length(bamfile=bamfile, sample=sample_name)

# plot pdf
matplotlib.rcParams['axes.grid'] = True

x = df.index
y = list(df[sample_name])
zipped = zip(x, y)
max_index, max_val = max(zipped, key=lambda x: x[1])

fig, ax = plt.subplots()
ax.plot(x, y, marker='o')
ax.set_title('{}, mode at {}'.format(sample_name, max_index))
ax.axvline(max_index, c='r', linestyle='--', alpha=0.3)
ax.axvspan(20, 40, color='k', alpha=0.1)
ax.set_ylabel('Counts')
ax.set_xlabel('Read length')
ax.get_yaxis().get_major_formatter().set_scientific(False)
plt.tight_layout()
plt.savefig(output_pdf, bbox_inches='tight')

# export coount table
df.index.set_names(['Length'], inplace=True)
df.reset_index(inplace=True)
df.to_csv(output_tsv, sep='\t', index=False)

except:
sys.stderr.write("Error ocurred when processing bam file: %s.\n"
% input)
raise RuntimeError
sys.exit(1)
input = str(snakemake.input["bam"])
sample_name = snakemake.wildcards.sample
output_tsv = snakemake.output["tsv"]
output_pdf = snakemake.output["pdf"]
output_log = snakemake.log["path"]
log = []
error = []

bamfile = pysam.AlignmentFile(input, "rb")
num_aln = bamfile.count()

# check if bam file is empty
if num_aln != 0:
df = get_aln_length(bamfile=bamfile, sample=sample_name)

# plot pdf
matplotlib.rcParams['axes.grid'] = True

x = df.index
y = list(df[sample_name])
zipped = zip(x, y)
max_index, max_val = max(zipped, key=lambda x: x[1])

fig, ax = plt.subplots()
ax.plot(x, y, marker='o')
ax.set_title('{}, mode at {}'.format(sample_name, max_index))
ax.axvline(max_index, c='r', linestyle='--', alpha=0.3)
ax.axvspan(20, 40, color='k', alpha=0.1)
ax.set_ylabel('Counts')
ax.set_xlabel('Read length')
ax.get_yaxis().get_major_formatter().set_scientific(False)
plt.tight_layout()
plt.savefig(output_pdf, bbox_inches='tight')

# export count table
df.index.set_names(['Length'], inplace=True)
df.reset_index(inplace=True)
df.to_csv(output_tsv, sep='\t', index=False)

log += [f"{num_aln} alignments processed."]
else:
# no alignment found.
error += ["Bam file does not contain any alignment."]

# print error / log messages
if error:
print("\n".join(error))
raise ValueError(
"Error occurred while processing BAM file, exit forced."
)
else:
log += ["Module finished successfully!\n"]
log = ["PLOT_MAPPING_LENGTH: " + i for i in log]
with open(output_log, "w") as log_file:
log_file.write("\n".join(log))

0 comments on commit 5fb31af

Please sign in to comment.