diff --git a/coretex/qiime2/__init__.py b/coretex/bioinformatics/qiime2/__init__.py
similarity index 83%
rename from coretex/qiime2/__init__.py
rename to coretex/bioinformatics/qiime2/__init__.py
index 23c552ab..c6ed6bad 100644
--- a/coretex/qiime2/__init__.py
+++ b/coretex/bioinformatics/qiime2/__init__.py
@@ -15,52 +15,14 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
-from typing import List, Optional
-from pathlib import Path
+from typing import Optional
-import subprocess
-import logging
+from .utils import compressGzip, createSample, getDemuxSamples, getDenoisedSamples, \
+ getFastqDPSamples, getFastqMPSamples, getMetadataSample, getPhylogeneticTreeSamples, \
+ isDemultiplexedSample, isDenoisedSample, isFastqDPSample, isFastqMPSample, \
+ isMetadataSample, isPhylogeneticTreeSample, sampleNumber
-
-class QiimeCommandException(Exception):
- pass
-
-
-def logProcessOutput(output: bytes, severity: int) -> None:
- decoded = output.decode("UTF-8")
-
- for line in decoded.split("\n"):
- # skip empty lines
- if line.strip() == "":
- continue
-
- # ignoring type for now, has to be fixed in coretexpylib
- logging.getLogger("coretexpylib").log(severity, line)
-
-
-def QiimeCommand(args: List[str]) -> None:
- process = subprocess.Popen(
- args,
- shell = False,
- cwd = Path(__file__).parent,
- stdout = subprocess.PIPE,
- stderr = subprocess.PIPE
- )
-
- stdout, stderr = process.communicate()
-
- if len(stdout) > 0:
- logProcessOutput(stdout, logging.INFO)
-
- if len(stderr) > 0:
- severity = logging.CRITICAL
- if process.returncode == 0:
- severity = logging.WARNING
-
- logProcessOutput(stderr, severity)
-
- if process.returncode != 0:
- raise QiimeCommandException
+from ..utils import command
def toolsImport(
@@ -82,7 +44,7 @@ def toolsImport(
"--input-format" , inputFormat
])
- QiimeCommand(args)
+ command(args)
def demuxEmpSingle(
@@ -93,7 +55,7 @@ def demuxEmpSingle(
errorCorretctionDetailsPath: str
) -> None:
- QiimeCommand([
+ command([
"qiime", "demux", "emp-single",
"--i-seqs", sequencesPath,
"--m-barcodes-file", barcodesPath,
@@ -105,7 +67,7 @@ def demuxEmpSingle(
def demuxSummarize(dataPath: str, visualizationPath: str) -> None:
- QiimeCommand([
+ command([
"qiime", "demux", "summarize",
"--i-data", dataPath,
"--o-visualization", visualizationPath
@@ -121,7 +83,7 @@ def dada2DenoiseSingle(
denoisingStatsPath: str
) -> None:
- QiimeCommand([
+ command([
"qiime", "dada2", "denoise-single",
"--i-demultiplexed-seqs", inputPath,
"--p-trim-left", str(trimLeft),
@@ -133,7 +95,7 @@ def dada2DenoiseSingle(
def metadataTabulate(inputFile: str, visualizationPath: str) -> None:
- QiimeCommand([
+ command([
"qiime", "metadata", "tabulate",
"--m-input-file", inputFile,
"--o-visualization", visualizationPath
@@ -141,7 +103,7 @@ def metadataTabulate(inputFile: str, visualizationPath: str) -> None:
def featureTableSummarize(inputPath: str, visualizationPath: str, metadataPath: str) -> None:
- QiimeCommand([
+ command([
"qiime", "feature-table", "summarize",
"--i-table", inputPath,
"--o-visualization", visualizationPath,
@@ -150,7 +112,7 @@ def featureTableSummarize(inputPath: str, visualizationPath: str, metadataPath:
def featureTableTabulateSeqs(inputPath: str, visualizationPath: str) -> None:
- QiimeCommand([
+ command([
"qiime", "feature-table", "tabulate-seqs",
"--i-data", inputPath,
"--o-visualization", visualizationPath
@@ -165,7 +127,7 @@ def phylogenyAlignToTreeMafftFasttree(
rootedTreePath: str
) -> None:
- QiimeCommand([
+ command([
"qiime", "phylogeny", "align-to-tree-mafft-fasttree",
"--i-sequences", sequencesPath,
"--o-alignment", aligmentPath,
@@ -183,7 +145,7 @@ def diversityCoreMetricsPhylogenetic(
outputDir: str
) -> None:
- QiimeCommand([
+ command([
"qiime", "diversity", "core-metrics-phylogenetic",
"--i-phylogeny", phlogenyPath,
"--i-table", tablePath,
@@ -199,7 +161,7 @@ def diversityAlphaGroupSignificance(
visualizationPath: str
) -> None:
- QiimeCommand([
+ command([
"qiime", "diversity", "alpha-group-significance",
"--i-alpha-diversity", alphaDiversityPath,
"--m-metadata-file", metadataPath,
@@ -215,7 +177,7 @@ def diversityBetaGroupSignificance(
pairwise: bool
) -> None:
- QiimeCommand([
+ command([
"qiime", "diversity", "beta-group-significance",
"--i-distance-matrix", distanceMatrixPath,
"--m-metadata-file", metadataPath,
@@ -232,7 +194,7 @@ def emperorPlot(
visualizationPath: str
) -> None:
- QiimeCommand([
+ command([
"qiime", "emperor", "plot",
"--i-pcoa", pcoaPath,
"--m-metadata-file", metadataPath,
@@ -249,7 +211,7 @@ def diversityAlphaRarefaction(
visualizationPath: str
) -> None:
- QiimeCommand([
+ command([
"qiime", "diversity", "alpha-rarefaction",
"--i-table", tablePath,
"--i-phylogeny", phylogenyPath,
@@ -265,7 +227,7 @@ def featureClassifierClassifySklearn(
classificationPath: str
) -> None:
- QiimeCommand([
+ command([
"qiime", "feature-classifier", "classify-sklearn",
"--i-classifier", classifierPath,
"--i-reads", readsPath,
@@ -280,7 +242,7 @@ def taxaBarplot(
visualizationPath: str
) -> None:
- QiimeCommand([
+ command([
"qiime", "taxa", "barplot",
"--i-table", tablePath,
"--i-taxonomy", taxonomyPath,
@@ -296,7 +258,7 @@ def featureTableFilterSamples(
filteredTablePath: str
) -> None:
- QiimeCommand([
+ command([
"qiime", "feature-table", "filter-samples",
"--i-table", tablePath,
"--m-metadata-file", metadataPath,
@@ -306,7 +268,7 @@ def featureTableFilterSamples(
def compositionAddPseudocount(tablePath: str, compositionTablePath: str) -> None:
- QiimeCommand([
+ command([
"qiime", "composition", "add-pseudocount",
"--i-table", tablePath,
"--o-composition-table", compositionTablePath
@@ -320,7 +282,7 @@ def compositionAncom(
visualizationPath: str
) -> None:
- QiimeCommand([
+ command([
"qiime", "composition", "ancom",
"--i-table", tablePath,
"--m-metadata-file", metadataPath,
@@ -336,7 +298,7 @@ def taxaCollapse(
collapsedTablePath: str
) -> None:
- QiimeCommand([
+ command([
"qiime", "taxa", "collapse",
"--i-table", tablePath,
"--i-taxonomy", taxonomyPath,
diff --git a/coretex/qiime2/utils.py b/coretex/bioinformatics/qiime2/utils.py
similarity index 98%
rename from coretex/qiime2/utils.py
rename to coretex/bioinformatics/qiime2/utils.py
index 3246d11d..59abff50 100644
--- a/coretex/qiime2/utils.py
+++ b/coretex/bioinformatics/qiime2/utils.py
@@ -21,7 +21,7 @@
import logging
import gzip
-from ..coretex import Experiment, CustomSample, CustomDataset
+from ...coretex import Experiment, CustomSample, CustomDataset
def createSample(name: str, datasetId: int, path: Path, experiment: Experiment, stepName: str, retryCount: int = 0) -> CustomSample:
diff --git a/coretex/bioinformatics/sequence_alignment/__init__.py b/coretex/bioinformatics/sequence_alignment/__init__.py
new file mode 100644
index 00000000..19806730
--- /dev/null
+++ b/coretex/bioinformatics/sequence_alignment/__init__.py
@@ -0,0 +1,221 @@
+# Copyright (C) 2023 Coretex LLC
+
+# This file is part of Coretex.ai
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see .
+
+from typing import List, Tuple
+from pathlib import Path
+
+import subprocess
+import logging
+
+from ..utils import command, logProcessOutput, CommandException
+from ...coretex import CustomDataset
+
+
+def indexCommand(bwaPath: Path, sequencePath: Path, prefix: Path) -> None:
+ """
+ This function acts as a wrapper for the index command of BWA
+ (Burrows-Wheeler Aligner). It is necessary to perform indexing
+ on the reference sequence before alignment.
+
+ Parameters
+ ----------
+ bwaPath : Path
+ Path pointing to the bwa binary
+ sequencePath : Path
+ Path pointing to the reference sequence
+ prefix : Path
+ Path serving as the prefix for indexing. This command will
+ generate 5 new files that are needed for alignment. These files will
+ have the path "{prefix}.{suffix}", each with a different suffix / extension
+
+ Example
+ -------
+ >>> from pathlib import Path
+ >>> bwaPath = Path("tools/bwa")
+ >>> sequencePath = Path("sequences/hg18.fa")
+ >>> prefix = Path("output/hg18")
+ >>> indexCommand(bwaPath, sequencePath, prefix)
+
+ Link to BWA: https://bio-bwa.sourceforge.net
+ """
+
+ command([
+ str(bwaPath.absolute()), "index",
+ "-p", str(prefix.absolute()),
+ str(sequencePath.absolute())
+ ])
+
+
+def alignCommand(bwaPath: Path, prefix: Path, sequencePath: Path, outputPath: Path) -> None:
+ """
+ This function acts as a wrapper for the mem command of BWA
+ (Burrows-Wheeler Aligner). It perfoms alignment of a given sequence read
+ to the reference sequence and outputs a SAM file.
+
+ Parameters
+ ----------
+ bwaPath : Path
+ Path pointing to the bwa binary
+ prefix : Path
+ Path that serves as the prefix for the 5 files generated during indexing
+ sequencePath : Path
+ Path pointing to the sequence read file on which alignment should be performed
+ outputPath : Path
+ Path of the output file
+
+ Example
+ -------
+ >>> from pathlib import Path
+ >>> bwaPath = Path("tools/bwa")
+ >>> prefix = Path("reference/hg18")
+ >>> sequencePath = Path("input/R34D.fastq")
+ >>> outputPath = Path("output/R34D.sam")
+ >>> alignCommand(bwaPath, prefix, sequencePath, outputPath)
+
+ Link to BWA: https://bio-bwa.sourceforge.net
+ """
+
+ args = [
+ str(bwaPath.absolute()), "mem",
+ "-o", str(outputPath.absolute()),
+ str(prefix.absolute()),
+ str(sequencePath.absolute())
+ ]
+
+ command(args, True)
+
+
+def sam2bamCommand(samtoolsPath: Path, samPath: Path, outputPath: Path) -> None:
+ """
+ This function uses the CLI tool "samtools" to convert SAM files into their binary
+ version, BAM.
+
+ Parameters
+ ----------
+ samtoolsPath : Path
+ Path pointing to the samtools binary
+ samPath : Path
+ Path pointing to the input .sam file
+ outputPath : Path
+ Path pointing to the output, .bam, file
+
+ Example
+ -------
+ >>> from pathlib import Path
+ >>> samtoolsPath = Path("tools/samtools")
+ >>> samPath = Path("input/R34D.sam")
+ >>> outputPath = Path("output/R34D.bam")
+ >>> sam2bamCommand(samtoolsPath, samPath, outputPath)
+
+ Link to samtools: http://htslib.org/
+ """
+
+ command([
+ str(samtoolsPath.absolute()), "view",
+ "-b", "-S", "-o",
+ str(outputPath.absolute()),
+ str(samPath.absolute())
+ ])
+
+
+def extractData(samtoolsPath: Path, file: Path) -> Tuple[List[int], List[int], List[int]]:
+ """
+ Takes an aligned sequence file (SAM/BAM) and returns three lists holding
+ MAPQ scores, leftmost position and sequence length for each read in the
+ file. This is done using the samtools view command.
+
+ Parameters
+ ----------
+ samtoolsPath : Path
+ Path pointing to the samtools binary
+ samPath : Path
+ Path pointing to the input .sam or .bam file
+
+ Example
+ -------
+ >>> from pathlib import Path
+ >>> samtoolsPath = Path("tools/samtools")
+ >>> file = Path("R34D.bam")
+ >>> scores, positions, lengths = extractData(samtoolsPath, file)
+
+ Link to samtools: http://htslib.org/
+ """
+
+ scores: list[int] = []
+ positions: list[int] = []
+ sequenceLengths: list[int] = []
+
+ args = [
+ str(samtoolsPath.absolute()),
+ "view",
+ "-F", "256", "-F", "2048", "-F", "512", # Will ignore all duplicate reads
+ str(file.absolute())
+ ]
+
+ process = subprocess.Popen(
+ args,
+ shell = False,
+ cwd = Path(__file__).parent,
+ stdout = subprocess.PIPE,
+ stderr = subprocess.PIPE
+ )
+
+ while process.poll() is None:
+ if process.stdout is None:
+ continue
+
+ stdout = process.stdout.readlines()
+
+ for line in stdout:
+ if len(line) > 0:
+ fields = line.decode("UTF-8").strip().split("\t")
+ scores.append(int(fields[4]))
+ positions.append(int(fields[3]))
+ sequenceLengths.append(len(fields[9]))
+
+ if process.stderr is not None:
+ for stderr in process.stderr:
+ if len(stderr) > 0:
+ if process.returncode == 0:
+ logProcessOutput(stderr, logging.WARNING)
+ else:
+ logProcessOutput(stderr, logging.CRITICAL)
+
+ if process.returncode != 0:
+ raise CommandException(f">> [Coretex] Falied to execute command. Returncode: {process.returncode}")
+
+ return scores, positions, sequenceLengths
+
+
+def chmodX(file: Path) -> None:
+ command(["chmod", "+x", str(file.absolute())])
+
+
+def loadFa(dataset: CustomDataset) -> List[Path]:
+ inputFiles: list[Path] = []
+
+ for sample in dataset.samples:
+ sample.unzip()
+
+ for file in Path(sample.path).iterdir():
+ if file.suffix.startswith(".fa"):
+ inputFiles.append(file)
+
+ if len(inputFiles) == 0:
+ raise ValueError(">> [Coretex] No sequence reads found")
+
+ return inputFiles
diff --git a/coretex/bioinformatics/utils.py b/coretex/bioinformatics/utils.py
new file mode 100644
index 00000000..22f039fa
--- /dev/null
+++ b/coretex/bioinformatics/utils.py
@@ -0,0 +1,67 @@
+# Copyright (C) 2023 Coretex LLC
+
+# This file is part of Coretex.ai
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see .
+
+from typing import List
+from pathlib import Path
+
+import logging
+import subprocess
+
+
+def logProcessOutput(output: bytes, severity: int) -> None:
+ decoded = output.decode("UTF-8")
+
+ for line in decoded.split("\n"):
+ # skip empty lines
+ if line.strip() == "":
+ continue
+
+ # ignoring type for now, has to be fixed in coretexpylib
+ logging.getLogger("coretexpylib").log(severity, line)
+
+
+class CommandException(Exception):
+ pass
+
+
+def command(args: List[str], ignoreOutput: bool = False, shell: bool = False) -> None:
+ process = subprocess.Popen(
+ args,
+ shell = shell,
+ cwd = Path(__file__).parent,
+ stdout = subprocess.PIPE,
+ stderr = subprocess.PIPE
+ )
+
+ while process.poll() is None:
+ if process.stdout is None or process.stderr is None:
+ continue
+
+ stdout = process.stdout.readline()
+ stderr = process.stderr.readline()
+
+ if len(stdout) > 0 and not ignoreOutput:
+ logProcessOutput(stdout, logging.INFO)
+
+ if len(stderr) > 0 and not ignoreOutput:
+ if process.returncode == 0:
+ logProcessOutput(stderr, logging.INFO)
+ else:
+ logProcessOutput(stderr, logging.CRITICAL)
+
+ if process.returncode != 0:
+ raise CommandException(f">> [Coretex] Falied to execute command. Returncode: {process.returncode}")