diff --git a/coretex/bioinformatics/qiime2/__init__.py b/coretex/bioinformatics/qiime2/__init__.py
index 28e6d97d..c6ed6bad 100644
--- a/coretex/bioinformatics/qiime2/__init__.py
+++ b/coretex/bioinformatics/qiime2/__init__.py
@@ -15,42 +15,14 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
-from typing import List, Optional
-from pathlib import Path
+from typing import Optional
-import subprocess
-import logging
+from .utils import compressGzip, createSample, getDemuxSamples, getDenoisedSamples, \
+ getFastqDPSamples, getFastqMPSamples, getMetadataSample, getPhylogeneticTreeSamples, \
+ isDemultiplexedSample, isDenoisedSample, isFastqDPSample, isFastqMPSample, \
+ isMetadataSample, isPhylogeneticTreeSample, sampleNumber
-from ..utils import logProcessOutput
-
-
-class QiimeCommandException(Exception):
- pass
-
-
-def command(args: List[str]) -> None:
- process = subprocess.Popen(
- args,
- shell = False,
- cwd = Path(__file__).parent,
- stdout = subprocess.PIPE,
- stderr = subprocess.PIPE
- )
-
- stdout, stderr = process.communicate()
-
- if len(stdout) > 0:
- logProcessOutput(stdout, logging.INFO)
-
- if len(stderr) > 0:
- severity = logging.CRITICAL
- if process.returncode == 0:
- severity = logging.WARNING
-
- logProcessOutput(stderr, severity)
-
- if process.returncode != 0:
- raise QiimeCommandException
+from ..utils import command
def toolsImport(
diff --git a/coretex/bioinformatics/sequence_alignment/__init__.py b/coretex/bioinformatics/sequence_alignment/__init__.py
index 59d4da59..bb2f7579 100644
--- a/coretex/bioinformatics/sequence_alignment/__init__.py
+++ b/coretex/bioinformatics/sequence_alignment/__init__.py
@@ -19,42 +19,39 @@
from pathlib import Path
import subprocess
-import logging
+from ..utils import command
from ...coretex import CustomDataset
-from ..utils import logProcessOutput
-
-
-def command(args: List[str], ignoreOutput: bool = False) -> None:
- process = subprocess.Popen(
- args,
- shell = False,
- cwd = Path(__file__).parent,
- stdout = subprocess.PIPE,
- stderr = subprocess.PIPE
- )
-
- while process.poll() is None:
- if process.stdout is None or process.stderr is None:
- continue
-
- stdout = process.stdout.readline()
- stderr = process.stderr.readline()
-
- if len(stdout) > 0 and not ignoreOutput:
- logProcessOutput(stdout, logging.INFO)
-
- if len(stderr) > 0 and not ignoreOutput:
- if process.returncode == 0:
- logProcessOutput(stderr, logging.INFO)
- else:
- logProcessOutput(stderr, logging.CRITICAL)
-
- if process.returncode != 0:
- raise RuntimeError(f">> [Coretex] Falied to execute command. Returncode: {process.returncode}")
def indexCommand(bwaPath: Path, sequencePath: Path, prefix: Path) -> None:
+ """
+ This function acts as a wrapper for the index command of BWA
+ (Burrows-Wheeler Aligner). It is necessary to perform indexing
+ on the reference sequence before alignment.
+
+ Parameters
+ ----------
+ bwaPath : Path
+ Path pointing to the bwa binary
+ sequencePath : Path
+ Path pointing to the reference sequence
+ prefix : Path
+ Path serving as the prefix for indexing. This command will
+ generate 5 new files that are needed for alignment. These files will
+ have the path "{prefix}.{suffix}", each with a different suffix / extension
+
+ Example
+ -------
+ >>> from pathlib import Path
+ >>> bwaPath = Path("tools/bwa")
+ >>> sequencePath = Path("sequences/hg18.fa")
+ >>> prefix = Path("output/hg18")
+ >>> indexCommand(bwaPath, sequencePath, prefix)
+
+ Link to BWA: https://bio-bwa.sourceforge.net
+ """
+
command([
str(bwaPath.absolute()), "index",
"-p", str(prefix.absolute()),
@@ -63,17 +60,69 @@ def indexCommand(bwaPath: Path, sequencePath: Path, prefix: Path) -> None:
def alignCommand(bwaPath: Path, prefix: Path, sequencePath: Path, outputPath: Path) -> None:
+ """
+ This function acts as a wrapper for the mem command of BWA
+ (Burrows-Wheeler Aligner). It perfoms alignment of a given sequence read
+ to the reference sequence and outputs a SAM file.
+
+ Parameters
+ ----------
+ bwaPath : Path
+ Path pointing to the bwa binary
+ prefix : Path
+ Path that serves as the prefix for the 5 files generated during indexing
+ sequencePath : Path
+ Path pointing to the sequence read file on which alignment should be performed
+ outputPath : Path
+ Path of the output file
+
+ Example
+ -------
+ >>> from pathlib import Path
+ >>> bwaPath = Path("tools/bwa")
+ >>> prefix = Path("reference/hg18")
+ >>> sequencePath = Path("input/R34D.fastq")
+ >>> outputPath = Path("output/R34D.sam")
+ >>> alignCommand(bwaPath, prefix, sequencePath, outputPath)
+
+ Link to BWA: https://bio-bwa.sourceforge.net
+ """
+
args = [
str(bwaPath.absolute()), "mem",
"-o", str(outputPath.absolute()),
str(prefix.absolute()),
str(sequencePath.absolute())
- ]
+ ]
command(args, True)
def sam2bamCommand(samtoolsPath: Path, samPath: Path, outputPath: Path) -> None:
+ """
+ This function uses the CLI tool "samtools" to convert SAM files into their binary
+ version, BAM.
+
+ Parameters
+ ----------
+ samtoolsPath : Path
+ Path pointing to the samtools binary
+ samPath : Path
+ Path pointing to the input .sam file
+ outputPath : Path
+ Path pointing to the output, .bam, file
+
+ Example
+ -------
+ >>> from pathlib import Path
+ >>> samtoolsPath = Path("tools/samtools")
+ >>> samPath = Path("input/R34D.sam")
+ >>> outputPath = Path("output/R34D.bam")
+ >>> sam2bamCommand(samtoolsPath, samPath, outputPath)
+
+ Link to samtools: http://htslib.org/
+ """
+
command([
str(samtoolsPath.absolute()), "view",
"-b", "-S", "-o",
@@ -83,13 +132,36 @@ def sam2bamCommand(samtoolsPath: Path, samPath: Path, outputPath: Path) -> None:
def extractData(samtoolsPath: Path, file: Path) -> Tuple[List[int], List[int], List[int]]:
+ """
+ Takes an aligned sequence file (SAM/BAM) and returns three lists holding
+ MAPQ scores, leftmost position and sequence length for each read in the
+ file. This is done using the samtools view command.
+
+ Parameters
+ ----------
+ samtoolsPath : Path
+ Path pointing to the samtools binary
+ samPath : Path
+ Path pointing to the input .sam or .bam file
+
+ Example
+ -------
+ >>> from pathlib import Path
+ >>> samtoolsPath = Path("tools/samtools")
+ >>> file = Path("R34D.bam")
+ >>> scores, positions, lengths = extractData(samtoolsPath, file)
+
+ Link to samtools: http://htslib.org/
+ """
+
scores: list[int] = []
positions: list[int] = []
sequenceLengths: list[int] = []
args = [
str(samtoolsPath.absolute()),
- "view", "-F", "256", "-F", "2048", "-F", "512",
+ "view",
+ "-F", "256", "-F", "2048", "-F", "512", # Will ignore all duplicate reads
str(file.absolute())
]
diff --git a/coretex/bioinformatics/utils.py b/coretex/bioinformatics/utils.py
index 72f55659..ca2d684b 100644
--- a/coretex/bioinformatics/utils.py
+++ b/coretex/bioinformatics/utils.py
@@ -15,7 +15,11 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
+from typing import List
+from pathlib import Path
+
import logging
+import subprocess
def logProcessOutput(output: bytes, severity: int) -> None:
@@ -28,3 +32,36 @@ def logProcessOutput(output: bytes, severity: int) -> None:
# ignoring type for now, has to be fixed in coretexpylib
logging.getLogger("coretexpylib").log(severity, line)
+
+
+class CommandException(Exception):
+ pass
+
+
+def command(args: List[str], ignoreOutput: bool = False) -> None:
+ process = subprocess.Popen(
+ args,
+ shell = False,
+ cwd = Path(__file__).parent,
+ stdout = subprocess.PIPE,
+ stderr = subprocess.PIPE
+ )
+
+ while process.poll() is None:
+ if process.stdout is None or process.stderr is None:
+ continue
+
+ stdout = process.stdout.readline()
+ stderr = process.stderr.readline()
+
+ if len(stdout) > 0 and not ignoreOutput:
+ logProcessOutput(stdout, logging.INFO)
+
+ if len(stderr) > 0 and not ignoreOutput:
+ if process.returncode == 0:
+ logProcessOutput(stderr, logging.INFO)
+ else:
+ logProcessOutput(stderr, logging.CRITICAL)
+
+ if process.returncode != 0:
+ raise CommandException(f">> [Coretex] Falied to execute command. Returncode: {process.returncode}")