diff --git a/coretex/bioinformatics/qiime2/__init__.py b/coretex/bioinformatics/qiime2/__init__.py index 28e6d97d..c6ed6bad 100644 --- a/coretex/bioinformatics/qiime2/__init__.py +++ b/coretex/bioinformatics/qiime2/__init__.py @@ -15,42 +15,14 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -from typing import List, Optional -from pathlib import Path +from typing import Optional -import subprocess -import logging +from .utils import compressGzip, createSample, getDemuxSamples, getDenoisedSamples, \ + getFastqDPSamples, getFastqMPSamples, getMetadataSample, getPhylogeneticTreeSamples, \ + isDemultiplexedSample, isDenoisedSample, isFastqDPSample, isFastqMPSample, \ + isMetadataSample, isPhylogeneticTreeSample, sampleNumber -from ..utils import logProcessOutput - - -class QiimeCommandException(Exception): - pass - - -def command(args: List[str]) -> None: - process = subprocess.Popen( - args, - shell = False, - cwd = Path(__file__).parent, - stdout = subprocess.PIPE, - stderr = subprocess.PIPE - ) - - stdout, stderr = process.communicate() - - if len(stdout) > 0: - logProcessOutput(stdout, logging.INFO) - - if len(stderr) > 0: - severity = logging.CRITICAL - if process.returncode == 0: - severity = logging.WARNING - - logProcessOutput(stderr, severity) - - if process.returncode != 0: - raise QiimeCommandException +from ..utils import command def toolsImport( diff --git a/coretex/bioinformatics/sequence_alignment/__init__.py b/coretex/bioinformatics/sequence_alignment/__init__.py index 59d4da59..bb2f7579 100644 --- a/coretex/bioinformatics/sequence_alignment/__init__.py +++ b/coretex/bioinformatics/sequence_alignment/__init__.py @@ -19,42 +19,39 @@ from pathlib import Path import subprocess -import logging +from ..utils import command from ...coretex import CustomDataset -from ..utils import logProcessOutput - - -def command(args: List[str], ignoreOutput: bool = False) -> None: - process = subprocess.Popen( - args, - shell = False, - cwd = Path(__file__).parent, - stdout = subprocess.PIPE, - stderr = subprocess.PIPE - ) - - while process.poll() is None: - if process.stdout is None or process.stderr is None: - continue - - stdout = process.stdout.readline() - stderr = process.stderr.readline() - - if len(stdout) > 0 and not ignoreOutput: - logProcessOutput(stdout, logging.INFO) - - if len(stderr) > 0 and not ignoreOutput: - if process.returncode == 0: - logProcessOutput(stderr, logging.INFO) - else: - logProcessOutput(stderr, logging.CRITICAL) - - if process.returncode != 0: - raise RuntimeError(f">> [Coretex] Falied to execute command. Returncode: {process.returncode}") def indexCommand(bwaPath: Path, sequencePath: Path, prefix: Path) -> None: + """ + This function acts as a wrapper for the index command of BWA + (Burrows-Wheeler Aligner). It is necessary to perform indexing + on the reference sequence before alignment. + + Parameters + ---------- + bwaPath : Path + Path pointing to the bwa binary + sequencePath : Path + Path pointing to the reference sequence + prefix : Path + Path serving as the prefix for indexing. This command will + generate 5 new files that are needed for alignment. These files will + have the path "{prefix}.{suffix}", each with a different suffix / extension + + Example + ------- + >>> from pathlib import Path + >>> bwaPath = Path("tools/bwa") + >>> sequencePath = Path("sequences/hg18.fa") + >>> prefix = Path("output/hg18") + >>> indexCommand(bwaPath, sequencePath, prefix) + + Link to BWA: https://bio-bwa.sourceforge.net + """ + command([ str(bwaPath.absolute()), "index", "-p", str(prefix.absolute()), @@ -63,17 +60,69 @@ def indexCommand(bwaPath: Path, sequencePath: Path, prefix: Path) -> None: def alignCommand(bwaPath: Path, prefix: Path, sequencePath: Path, outputPath: Path) -> None: + """ + This function acts as a wrapper for the mem command of BWA + (Burrows-Wheeler Aligner). It perfoms alignment of a given sequence read + to the reference sequence and outputs a SAM file. + + Parameters + ---------- + bwaPath : Path + Path pointing to the bwa binary + prefix : Path + Path that serves as the prefix for the 5 files generated during indexing + sequencePath : Path + Path pointing to the sequence read file on which alignment should be performed + outputPath : Path + Path of the output file + + Example + ------- + >>> from pathlib import Path + >>> bwaPath = Path("tools/bwa") + >>> prefix = Path("reference/hg18") + >>> sequencePath = Path("input/R34D.fastq") + >>> outputPath = Path("output/R34D.sam") + >>> alignCommand(bwaPath, prefix, sequencePath, outputPath) + + Link to BWA: https://bio-bwa.sourceforge.net + """ + args = [ str(bwaPath.absolute()), "mem", "-o", str(outputPath.absolute()), str(prefix.absolute()), str(sequencePath.absolute()) - ] + ] command(args, True) def sam2bamCommand(samtoolsPath: Path, samPath: Path, outputPath: Path) -> None: + """ + This function uses the CLI tool "samtools" to convert SAM files into their binary + version, BAM. + + Parameters + ---------- + samtoolsPath : Path + Path pointing to the samtools binary + samPath : Path + Path pointing to the input .sam file + outputPath : Path + Path pointing to the output, .bam, file + + Example + ------- + >>> from pathlib import Path + >>> samtoolsPath = Path("tools/samtools") + >>> samPath = Path("input/R34D.sam") + >>> outputPath = Path("output/R34D.bam") + >>> sam2bamCommand(samtoolsPath, samPath, outputPath) + + Link to samtools: http://htslib.org/ + """ + command([ str(samtoolsPath.absolute()), "view", "-b", "-S", "-o", @@ -83,13 +132,36 @@ def sam2bamCommand(samtoolsPath: Path, samPath: Path, outputPath: Path) -> None: def extractData(samtoolsPath: Path, file: Path) -> Tuple[List[int], List[int], List[int]]: + """ + Takes an aligned sequence file (SAM/BAM) and returns three lists holding + MAPQ scores, leftmost position and sequence length for each read in the + file. This is done using the samtools view command. + + Parameters + ---------- + samtoolsPath : Path + Path pointing to the samtools binary + samPath : Path + Path pointing to the input .sam or .bam file + + Example + ------- + >>> from pathlib import Path + >>> samtoolsPath = Path("tools/samtools") + >>> file = Path("R34D.bam") + >>> scores, positions, lengths = extractData(samtoolsPath, file) + + Link to samtools: http://htslib.org/ + """ + scores: list[int] = [] positions: list[int] = [] sequenceLengths: list[int] = [] args = [ str(samtoolsPath.absolute()), - "view", "-F", "256", "-F", "2048", "-F", "512", + "view", + "-F", "256", "-F", "2048", "-F", "512", # Will ignore all duplicate reads str(file.absolute()) ] diff --git a/coretex/bioinformatics/utils.py b/coretex/bioinformatics/utils.py index 72f55659..ca2d684b 100644 --- a/coretex/bioinformatics/utils.py +++ b/coretex/bioinformatics/utils.py @@ -15,7 +15,11 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . +from typing import List +from pathlib import Path + import logging +import subprocess def logProcessOutput(output: bytes, severity: int) -> None: @@ -28,3 +32,36 @@ def logProcessOutput(output: bytes, severity: int) -> None: # ignoring type for now, has to be fixed in coretexpylib logging.getLogger("coretexpylib").log(severity, line) + + +class CommandException(Exception): + pass + + +def command(args: List[str], ignoreOutput: bool = False) -> None: + process = subprocess.Popen( + args, + shell = False, + cwd = Path(__file__).parent, + stdout = subprocess.PIPE, + stderr = subprocess.PIPE + ) + + while process.poll() is None: + if process.stdout is None or process.stderr is None: + continue + + stdout = process.stdout.readline() + stderr = process.stderr.readline() + + if len(stdout) > 0 and not ignoreOutput: + logProcessOutput(stdout, logging.INFO) + + if len(stderr) > 0 and not ignoreOutput: + if process.returncode == 0: + logProcessOutput(stderr, logging.INFO) + else: + logProcessOutput(stderr, logging.CRITICAL) + + if process.returncode != 0: + raise CommandException(f">> [Coretex] Falied to execute command. Returncode: {process.returncode}")