-
Notifications
You must be signed in to change notification settings - Fork 24
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Pipeline for ingesting high-coverage PB/ONT aligned BAMs and format them for re-processing #418
base: main
Are you sure you want to change the base?
Changes from all commits
e09dc65
d3afc4b
3a2ac5c
29aa964
0262525
7be9309
1ff0912
4224ab4
a13a541
2905c69
b16c619
c96ea8e
39d77d2
cbfed4e
8039c88
b16155f
5bf0b2f
0d61935
989ead1
b8e67ff
e5a79c2
b5dc978
1abcc7a
da5bfb0
7975d6d
5620d74
e910bac
04b82f8
14be4e2
1f8dcae
1ac2a31
ce90a22
3d3a508
60517b4
a3da0a6
6775bd3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
FROM python:3.9.16-slim-bullseye | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These dockerfiles are so long, and have so much repeated content. |
||
|
||
COPY remove_duplicate_ont_aln.py /opt/ | ||
COPY remove_duplicate_ont_namesorted_unaligned.py /opt/ | ||
|
||
RUN pip install pysam==0.21.0 | ||
|
||
# install gcloud and gsutil cli | ||
ARG DEBIAN_FRONTEND=noninteractive | ||
RUN apt-get -qqy update --fix-missing && \ | ||
apt-get -qqy dist-upgrade && \ | ||
apt-get -qqy install --no-install-recommends \ | ||
apt-transport-https \ | ||
ca-certificates \ | ||
gnupg \ | ||
zlib1g-dev \ | ||
curl \ | ||
wget \ | ||
tree \ | ||
tabix && \ | ||
echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ | ||
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ | ||
apt-get -qqy update && \ | ||
apt-get -qqy install --no-install-recommends google-cloud-cli && \ | ||
gcloud config set core/disable_usage_reporting true && \ | ||
gcloud config set component_manager/disable_update_check true && \ | ||
gcloud config set metrics/environment github_docker_image && \ | ||
apt-get -qqy purge gnupg && \ | ||
apt-get -qqy clean && \ | ||
rm -rf /tmp/* \ | ||
/var/tmp/* \ | ||
/var/cache/apt/* \ | ||
/var/lib/apt/lists/* \ | ||
/usr/share/man/?? \ | ||
/usr/share/man/??_* | ||
|
||
# install latest samtools | ||
ARG DEBIAN_FRONTEND=noninteractive | ||
ARG SAMTOOLS_VERSION=1.18 | ||
ARG BCFTOOLS_VERSION=1.18 | ||
RUN apt-get -qqy update --fix-missing && \ | ||
apt-get -qqy dist-upgrade && \ | ||
apt-get -qqy install --no-install-recommends \ | ||
ca-certificates \ | ||
libbz2-dev \ | ||
libcurl4-openssl-dev \ | ||
liblzma-dev \ | ||
libncurses5-dev \ | ||
autoconf \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note that all this stuff -- automake, gcc, wget, etc. -- is left in the final image. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I now see that you remove most of it below. I still think it would be cleaner to remove it via staging. |
||
automake \ | ||
bzip2 \ | ||
gcc \ | ||
make \ | ||
wget \ | ||
zlib1g-dev && \ | ||
wget https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VERSION}/samtools-${SAMTOOLS_VERSION}.tar.bz2 && \ | ||
tar xjf samtools-${SAMTOOLS_VERSION}.tar.bz2 && \ | ||
cd samtools-${SAMTOOLS_VERSION} && ./configure --without-curses --enable-libcurl && make -s all all-htslib && make install install-htslib && cd - && \ | ||
rm -rf samtools-${SAMTOOLS_VERSION}* && \ | ||
wget https://github.com/samtools/bcftools/releases/download/${BCFTOOLS_VERSION}/bcftools-${BCFTOOLS_VERSION}.tar.bz2 && \ | ||
tar xjf bcftools-${BCFTOOLS_VERSION}.tar.bz2 && \ | ||
cd bcftools-${BCFTOOLS_VERSION} && ./configure --without-curses && make -s && make install && cd - && \ | ||
rm -rf bcftools-${BCFTOOLS_VERSION}* && \ | ||
apt-get -qqy purge autoconf automake bzip2 gcc make wget && \ | ||
apt-get -qqy clean && \ | ||
rm -rf /tmp/* \ | ||
/var/tmp/* \ | ||
/var/cache/apt/* \ | ||
/var/lib/apt/lists/* \ | ||
/usr/share/man/?? \ | ||
/usr/share/man/??_* && \ | ||
samtools --help && \ | ||
bcftools --help |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
VERSION = 0.1.2 | ||
TAG1 = us.gcr.io/broad-dsp-lrma/lr-bam-dedup:$(VERSION) | ||
TAG2 = us.gcr.io/broad-dsp-lrma/lr-bam-dedup:latest | ||
|
||
all: build push | ||
|
||
build: | ||
docker build -t $(TAG1) -t $(TAG2) . | ||
|
||
push: | ||
docker push $(TAG1) | ||
docker push $(TAG2) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
import argparse | ||
import pysam | ||
|
||
|
||
def main(): | ||
parser = argparse.ArgumentParser(description='Remove redundant alignment records from ONT BAM file', | ||
prog='remove_redundant_reads') | ||
parser.add_argument('-p', '--prefix', type=str, default="shard", help="Output prefix") | ||
parser.add_argument('-a', '--annotations', type=str, help="Annotations on (potential) duplicate reads") | ||
|
||
parser.add_argument('bam', type=str, help="BAM") | ||
args = parser.parse_args() | ||
|
||
# create a dict of set's, a trick to avoid Hash collisions | ||
guilty_dict_per_chr = dict() | ||
with open(args.annotations) as f: | ||
for line in f: | ||
arr = line.strip().split('\t') | ||
name = arr[0] | ||
chrom = arr[2] | ||
guilty_dict_per_chr.setdefault(chrom, set()) | ||
guilty_dict_per_chr[chrom].add(name) | ||
|
||
print("chromosomes on which there are duplicate records:") | ||
print(f" {guilty_dict_per_chr}") | ||
|
||
# Silence message about the .bai file not being found. | ||
pysam.set_verbosity(0) | ||
|
||
num_alignments, num_dropped_alignments = 0, 0 | ||
duplicate_signatures = [] | ||
bf = pysam.Samfile(args.bam, 'rb', check_sq=False) | ||
with pysam.Samfile(f'{args.prefix}.bam', 'wb', header=bf.header) as out: | ||
# we rely on the observation that for coordinate sorted BAM, | ||
# duplicate records will appear in blocks, hence once we step off a position with duplicates, we start afresh | ||
current_position = -1 | ||
current_signatures = set() | ||
for read in bf: | ||
num_alignments += 1 | ||
|
||
chrom = read.reference_name | ||
n = read.query_name | ||
if chrom in guilty_dict_per_chr and n in guilty_dict_per_chr[chrom]: | ||
|
||
mq = read.mapping_quality | ||
sam_flag = read.flag | ||
pos = read.reference_start | ||
cigar = read.cigarstring | ||
signature = f"{n}-{chrom}-{pos}-{mq}-{sam_flag}-{cigar}" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I note that you've added the (very lengthy!) cigar string to the signature. I believe that the signature is already unnecessarily lengthy. |
||
|
||
if current_position != pos: # new position, let's write and reset | ||
out.write(read) | ||
current_position = pos | ||
current_signatures = set() | ||
current_signatures.add(signature) | ||
elif signature in current_signatures: # You're a duplicate record, and not appearing for the 1st time! | ||
num_dropped_alignments += 1 | ||
duplicate_signatures.append(signature) # same signature may appear more than twice, hence list and append | ||
pass | ||
else: # you are in a new group of duplicates that map to this location | ||
out.write(read) | ||
current_signatures.add(signature) | ||
else: | ||
out.write(read) | ||
|
||
print(f'num_alignments: {num_alignments}') | ||
print(f'num_dropped_alignments: {num_dropped_alignments}') | ||
print(f'num_kept_alignments: {num_alignments - num_dropped_alignments}') | ||
|
||
with open(f'{args.prefix}.duplicate.signatures.txt', 'w') as out: | ||
for sig in duplicate_signatures: | ||
out.write(f"{sig}\n") | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I haven't looked at this in detail. I'm going to assume it's OK.