diff --git a/coretex/qiime2/__init__.py b/coretex/bioinformatics/qiime2/__init__.py similarity index 83% rename from coretex/qiime2/__init__.py rename to coretex/bioinformatics/qiime2/__init__.py index 23c552ab..c6ed6bad 100644 --- a/coretex/qiime2/__init__.py +++ b/coretex/bioinformatics/qiime2/__init__.py @@ -15,52 +15,14 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -from typing import List, Optional -from pathlib import Path +from typing import Optional -import subprocess -import logging +from .utils import compressGzip, createSample, getDemuxSamples, getDenoisedSamples, \ + getFastqDPSamples, getFastqMPSamples, getMetadataSample, getPhylogeneticTreeSamples, \ + isDemultiplexedSample, isDenoisedSample, isFastqDPSample, isFastqMPSample, \ + isMetadataSample, isPhylogeneticTreeSample, sampleNumber - -class QiimeCommandException(Exception): - pass - - -def logProcessOutput(output: bytes, severity: int) -> None: - decoded = output.decode("UTF-8") - - for line in decoded.split("\n"): - # skip empty lines - if line.strip() == "": - continue - - # ignoring type for now, has to be fixed in coretexpylib - logging.getLogger("coretexpylib").log(severity, line) - - -def QiimeCommand(args: List[str]) -> None: - process = subprocess.Popen( - args, - shell = False, - cwd = Path(__file__).parent, - stdout = subprocess.PIPE, - stderr = subprocess.PIPE - ) - - stdout, stderr = process.communicate() - - if len(stdout) > 0: - logProcessOutput(stdout, logging.INFO) - - if len(stderr) > 0: - severity = logging.CRITICAL - if process.returncode == 0: - severity = logging.WARNING - - logProcessOutput(stderr, severity) - - if process.returncode != 0: - raise QiimeCommandException +from ..utils import command def toolsImport( @@ -82,7 +44,7 @@ def toolsImport( "--input-format" , inputFormat ]) - QiimeCommand(args) + command(args) def demuxEmpSingle( @@ -93,7 +55,7 @@ def demuxEmpSingle( errorCorretctionDetailsPath: str ) -> None: - QiimeCommand([ + command([ "qiime", "demux", "emp-single", "--i-seqs", sequencesPath, "--m-barcodes-file", barcodesPath, @@ -105,7 +67,7 @@ def demuxEmpSingle( def demuxSummarize(dataPath: str, visualizationPath: str) -> None: - QiimeCommand([ + command([ "qiime", "demux", "summarize", "--i-data", dataPath, "--o-visualization", visualizationPath @@ -121,7 +83,7 @@ def dada2DenoiseSingle( denoisingStatsPath: str ) -> None: - QiimeCommand([ + command([ "qiime", "dada2", "denoise-single", "--i-demultiplexed-seqs", inputPath, "--p-trim-left", str(trimLeft), @@ -133,7 +95,7 @@ def dada2DenoiseSingle( def metadataTabulate(inputFile: str, visualizationPath: str) -> None: - QiimeCommand([ + command([ "qiime", "metadata", "tabulate", "--m-input-file", inputFile, "--o-visualization", visualizationPath @@ -141,7 +103,7 @@ def metadataTabulate(inputFile: str, visualizationPath: str) -> None: def featureTableSummarize(inputPath: str, visualizationPath: str, metadataPath: str) -> None: - QiimeCommand([ + command([ "qiime", "feature-table", "summarize", "--i-table", inputPath, "--o-visualization", visualizationPath, @@ -150,7 +112,7 @@ def featureTableSummarize(inputPath: str, visualizationPath: str, metadataPath: def featureTableTabulateSeqs(inputPath: str, visualizationPath: str) -> None: - QiimeCommand([ + command([ "qiime", "feature-table", "tabulate-seqs", "--i-data", inputPath, "--o-visualization", visualizationPath @@ -165,7 +127,7 @@ def phylogenyAlignToTreeMafftFasttree( rootedTreePath: str ) -> None: - QiimeCommand([ + command([ "qiime", "phylogeny", "align-to-tree-mafft-fasttree", "--i-sequences", sequencesPath, "--o-alignment", aligmentPath, @@ -183,7 +145,7 @@ def diversityCoreMetricsPhylogenetic( outputDir: str ) -> None: - QiimeCommand([ + command([ "qiime", "diversity", "core-metrics-phylogenetic", "--i-phylogeny", phlogenyPath, "--i-table", tablePath, @@ -199,7 +161,7 @@ def diversityAlphaGroupSignificance( visualizationPath: str ) -> None: - QiimeCommand([ + command([ "qiime", "diversity", "alpha-group-significance", "--i-alpha-diversity", alphaDiversityPath, "--m-metadata-file", metadataPath, @@ -215,7 +177,7 @@ def diversityBetaGroupSignificance( pairwise: bool ) -> None: - QiimeCommand([ + command([ "qiime", "diversity", "beta-group-significance", "--i-distance-matrix", distanceMatrixPath, "--m-metadata-file", metadataPath, @@ -232,7 +194,7 @@ def emperorPlot( visualizationPath: str ) -> None: - QiimeCommand([ + command([ "qiime", "emperor", "plot", "--i-pcoa", pcoaPath, "--m-metadata-file", metadataPath, @@ -249,7 +211,7 @@ def diversityAlphaRarefaction( visualizationPath: str ) -> None: - QiimeCommand([ + command([ "qiime", "diversity", "alpha-rarefaction", "--i-table", tablePath, "--i-phylogeny", phylogenyPath, @@ -265,7 +227,7 @@ def featureClassifierClassifySklearn( classificationPath: str ) -> None: - QiimeCommand([ + command([ "qiime", "feature-classifier", "classify-sklearn", "--i-classifier", classifierPath, "--i-reads", readsPath, @@ -280,7 +242,7 @@ def taxaBarplot( visualizationPath: str ) -> None: - QiimeCommand([ + command([ "qiime", "taxa", "barplot", "--i-table", tablePath, "--i-taxonomy", taxonomyPath, @@ -296,7 +258,7 @@ def featureTableFilterSamples( filteredTablePath: str ) -> None: - QiimeCommand([ + command([ "qiime", "feature-table", "filter-samples", "--i-table", tablePath, "--m-metadata-file", metadataPath, @@ -306,7 +268,7 @@ def featureTableFilterSamples( def compositionAddPseudocount(tablePath: str, compositionTablePath: str) -> None: - QiimeCommand([ + command([ "qiime", "composition", "add-pseudocount", "--i-table", tablePath, "--o-composition-table", compositionTablePath @@ -320,7 +282,7 @@ def compositionAncom( visualizationPath: str ) -> None: - QiimeCommand([ + command([ "qiime", "composition", "ancom", "--i-table", tablePath, "--m-metadata-file", metadataPath, @@ -336,7 +298,7 @@ def taxaCollapse( collapsedTablePath: str ) -> None: - QiimeCommand([ + command([ "qiime", "taxa", "collapse", "--i-table", tablePath, "--i-taxonomy", taxonomyPath, diff --git a/coretex/qiime2/utils.py b/coretex/bioinformatics/qiime2/utils.py similarity index 98% rename from coretex/qiime2/utils.py rename to coretex/bioinformatics/qiime2/utils.py index 3246d11d..59abff50 100644 --- a/coretex/qiime2/utils.py +++ b/coretex/bioinformatics/qiime2/utils.py @@ -21,7 +21,7 @@ import logging import gzip -from ..coretex import Experiment, CustomSample, CustomDataset +from ...coretex import Experiment, CustomSample, CustomDataset def createSample(name: str, datasetId: int, path: Path, experiment: Experiment, stepName: str, retryCount: int = 0) -> CustomSample: diff --git a/coretex/bioinformatics/sequence_alignment/__init__.py b/coretex/bioinformatics/sequence_alignment/__init__.py new file mode 100644 index 00000000..19806730 --- /dev/null +++ b/coretex/bioinformatics/sequence_alignment/__init__.py @@ -0,0 +1,221 @@ +# Copyright (C) 2023 Coretex LLC + +# This file is part of Coretex.ai + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from typing import List, Tuple +from pathlib import Path + +import subprocess +import logging + +from ..utils import command, logProcessOutput, CommandException +from ...coretex import CustomDataset + + +def indexCommand(bwaPath: Path, sequencePath: Path, prefix: Path) -> None: + """ + This function acts as a wrapper for the index command of BWA + (Burrows-Wheeler Aligner). It is necessary to perform indexing + on the reference sequence before alignment. + + Parameters + ---------- + bwaPath : Path + Path pointing to the bwa binary + sequencePath : Path + Path pointing to the reference sequence + prefix : Path + Path serving as the prefix for indexing. This command will + generate 5 new files that are needed for alignment. These files will + have the path "{prefix}.{suffix}", each with a different suffix / extension + + Example + ------- + >>> from pathlib import Path + >>> bwaPath = Path("tools/bwa") + >>> sequencePath = Path("sequences/hg18.fa") + >>> prefix = Path("output/hg18") + >>> indexCommand(bwaPath, sequencePath, prefix) + + Link to BWA: https://bio-bwa.sourceforge.net + """ + + command([ + str(bwaPath.absolute()), "index", + "-p", str(prefix.absolute()), + str(sequencePath.absolute()) + ]) + + +def alignCommand(bwaPath: Path, prefix: Path, sequencePath: Path, outputPath: Path) -> None: + """ + This function acts as a wrapper for the mem command of BWA + (Burrows-Wheeler Aligner). It perfoms alignment of a given sequence read + to the reference sequence and outputs a SAM file. + + Parameters + ---------- + bwaPath : Path + Path pointing to the bwa binary + prefix : Path + Path that serves as the prefix for the 5 files generated during indexing + sequencePath : Path + Path pointing to the sequence read file on which alignment should be performed + outputPath : Path + Path of the output file + + Example + ------- + >>> from pathlib import Path + >>> bwaPath = Path("tools/bwa") + >>> prefix = Path("reference/hg18") + >>> sequencePath = Path("input/R34D.fastq") + >>> outputPath = Path("output/R34D.sam") + >>> alignCommand(bwaPath, prefix, sequencePath, outputPath) + + Link to BWA: https://bio-bwa.sourceforge.net + """ + + args = [ + str(bwaPath.absolute()), "mem", + "-o", str(outputPath.absolute()), + str(prefix.absolute()), + str(sequencePath.absolute()) + ] + + command(args, True) + + +def sam2bamCommand(samtoolsPath: Path, samPath: Path, outputPath: Path) -> None: + """ + This function uses the CLI tool "samtools" to convert SAM files into their binary + version, BAM. + + Parameters + ---------- + samtoolsPath : Path + Path pointing to the samtools binary + samPath : Path + Path pointing to the input .sam file + outputPath : Path + Path pointing to the output, .bam, file + + Example + ------- + >>> from pathlib import Path + >>> samtoolsPath = Path("tools/samtools") + >>> samPath = Path("input/R34D.sam") + >>> outputPath = Path("output/R34D.bam") + >>> sam2bamCommand(samtoolsPath, samPath, outputPath) + + Link to samtools: http://htslib.org/ + """ + + command([ + str(samtoolsPath.absolute()), "view", + "-b", "-S", "-o", + str(outputPath.absolute()), + str(samPath.absolute()) + ]) + + +def extractData(samtoolsPath: Path, file: Path) -> Tuple[List[int], List[int], List[int]]: + """ + Takes an aligned sequence file (SAM/BAM) and returns three lists holding + MAPQ scores, leftmost position and sequence length for each read in the + file. This is done using the samtools view command. + + Parameters + ---------- + samtoolsPath : Path + Path pointing to the samtools binary + samPath : Path + Path pointing to the input .sam or .bam file + + Example + ------- + >>> from pathlib import Path + >>> samtoolsPath = Path("tools/samtools") + >>> file = Path("R34D.bam") + >>> scores, positions, lengths = extractData(samtoolsPath, file) + + Link to samtools: http://htslib.org/ + """ + + scores: list[int] = [] + positions: list[int] = [] + sequenceLengths: list[int] = [] + + args = [ + str(samtoolsPath.absolute()), + "view", + "-F", "256", "-F", "2048", "-F", "512", # Will ignore all duplicate reads + str(file.absolute()) + ] + + process = subprocess.Popen( + args, + shell = False, + cwd = Path(__file__).parent, + stdout = subprocess.PIPE, + stderr = subprocess.PIPE + ) + + while process.poll() is None: + if process.stdout is None: + continue + + stdout = process.stdout.readlines() + + for line in stdout: + if len(line) > 0: + fields = line.decode("UTF-8").strip().split("\t") + scores.append(int(fields[4])) + positions.append(int(fields[3])) + sequenceLengths.append(len(fields[9])) + + if process.stderr is not None: + for stderr in process.stderr: + if len(stderr) > 0: + if process.returncode == 0: + logProcessOutput(stderr, logging.WARNING) + else: + logProcessOutput(stderr, logging.CRITICAL) + + if process.returncode != 0: + raise CommandException(f">> [Coretex] Falied to execute command. Returncode: {process.returncode}") + + return scores, positions, sequenceLengths + + +def chmodX(file: Path) -> None: + command(["chmod", "+x", str(file.absolute())]) + + +def loadFa(dataset: CustomDataset) -> List[Path]: + inputFiles: list[Path] = [] + + for sample in dataset.samples: + sample.unzip() + + for file in Path(sample.path).iterdir(): + if file.suffix.startswith(".fa"): + inputFiles.append(file) + + if len(inputFiles) == 0: + raise ValueError(">> [Coretex] No sequence reads found") + + return inputFiles diff --git a/coretex/bioinformatics/utils.py b/coretex/bioinformatics/utils.py new file mode 100644 index 00000000..22f039fa --- /dev/null +++ b/coretex/bioinformatics/utils.py @@ -0,0 +1,67 @@ +# Copyright (C) 2023 Coretex LLC + +# This file is part of Coretex.ai + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +from typing import List +from pathlib import Path + +import logging +import subprocess + + +def logProcessOutput(output: bytes, severity: int) -> None: + decoded = output.decode("UTF-8") + + for line in decoded.split("\n"): + # skip empty lines + if line.strip() == "": + continue + + # ignoring type for now, has to be fixed in coretexpylib + logging.getLogger("coretexpylib").log(severity, line) + + +class CommandException(Exception): + pass + + +def command(args: List[str], ignoreOutput: bool = False, shell: bool = False) -> None: + process = subprocess.Popen( + args, + shell = shell, + cwd = Path(__file__).parent, + stdout = subprocess.PIPE, + stderr = subprocess.PIPE + ) + + while process.poll() is None: + if process.stdout is None or process.stderr is None: + continue + + stdout = process.stdout.readline() + stderr = process.stderr.readline() + + if len(stdout) > 0 and not ignoreOutput: + logProcessOutput(stdout, logging.INFO) + + if len(stderr) > 0 and not ignoreOutput: + if process.returncode == 0: + logProcessOutput(stderr, logging.INFO) + else: + logProcessOutput(stderr, logging.CRITICAL) + + if process.returncode != 0: + raise CommandException(f">> [Coretex] Falied to execute command. Returncode: {process.returncode}")