Skip to content

Commit

Permalink
CTX-4247: Improved code based on discussions
Browse files Browse the repository at this point in the history
  • Loading branch information
Vuk Manojlovic committed Jul 18, 2023
1 parent 77dff7e commit d698be9
Show file tree
Hide file tree
Showing 3 changed files with 148 additions and 67 deletions.
40 changes: 6 additions & 34 deletions coretex/bioinformatics/qiime2/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,42 +15,14 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

from typing import List, Optional
from pathlib import Path
from typing import Optional

import subprocess
import logging
from .utils import compressGzip, createSample, getDemuxSamples, getDenoisedSamples, \
getFastqDPSamples, getFastqMPSamples, getMetadataSample, getPhylogeneticTreeSamples, \
isDemultiplexedSample, isDenoisedSample, isFastqDPSample, isFastqMPSample, \
isMetadataSample, isPhylogeneticTreeSample, sampleNumber

from ..utils import logProcessOutput


class QiimeCommandException(Exception):
pass


def command(args: List[str]) -> None:
process = subprocess.Popen(
args,
shell = False,
cwd = Path(__file__).parent,
stdout = subprocess.PIPE,
stderr = subprocess.PIPE
)

stdout, stderr = process.communicate()

if len(stdout) > 0:
logProcessOutput(stdout, logging.INFO)

if len(stderr) > 0:
severity = logging.CRITICAL
if process.returncode == 0:
severity = logging.WARNING

logProcessOutput(stderr, severity)

if process.returncode != 0:
raise QiimeCommandException
from ..utils import command


def toolsImport(
Expand Down
138 changes: 105 additions & 33 deletions coretex/bioinformatics/sequence_alignment/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,42 +19,39 @@
from pathlib import Path

import subprocess
import logging

from ..utils import command
from ...coretex import CustomDataset
from ..utils import logProcessOutput


def command(args: List[str], ignoreOutput: bool = False) -> None:
process = subprocess.Popen(
args,
shell = False,
cwd = Path(__file__).parent,
stdout = subprocess.PIPE,
stderr = subprocess.PIPE
)

while process.poll() is None:
if process.stdout is None or process.stderr is None:
continue

stdout = process.stdout.readline()
stderr = process.stderr.readline()

if len(stdout) > 0 and not ignoreOutput:
logProcessOutput(stdout, logging.INFO)

if len(stderr) > 0 and not ignoreOutput:
if process.returncode == 0:
logProcessOutput(stderr, logging.INFO)
else:
logProcessOutput(stderr, logging.CRITICAL)

if process.returncode != 0:
raise RuntimeError(f">> [Coretex] Falied to execute command. Returncode: {process.returncode}")


def indexCommand(bwaPath: Path, sequencePath: Path, prefix: Path) -> None:
"""
This function acts as a wrapper for the index command of BWA
(Burrows-Wheeler Aligner). It is necessary to perform indexing
on the reference sequence before alignment.
Parameters
----------
bwaPath : Path
Path pointing to the bwa binary
sequencePath : Path
Path pointing to the reference sequence
prefix : Path
Path serving as the prefix for indexing. This command will
generate 5 new files that are needed for alignment. These files will
have the path "{prefix}.{suffix}", each with a different suffix / extension
Example
-------
>>> from pathlib import Path
>>> bwaPath = Path("tools/bwa")
>>> sequencePath = Path("sequences/hg18.fa")
>>> prefix = Path("output/hg18")
>>> indexCommand(bwaPath, sequencePath, prefix)
Link to BWA: https://bio-bwa.sourceforge.net
"""

command([
str(bwaPath.absolute()), "index",
"-p", str(prefix.absolute()),
Expand All @@ -63,17 +60,69 @@ def indexCommand(bwaPath: Path, sequencePath: Path, prefix: Path) -> None:


def alignCommand(bwaPath: Path, prefix: Path, sequencePath: Path, outputPath: Path) -> None:
"""
This function acts as a wrapper for the mem command of BWA
(Burrows-Wheeler Aligner). It perfoms alignment of a given sequence read
to the reference sequence and outputs a SAM file.
Parameters
----------
bwaPath : Path
Path pointing to the bwa binary
prefix : Path
Path that serves as the prefix for the 5 files generated during indexing
sequencePath : Path
Path pointing to the sequence read file on which alignment should be performed
outputPath : Path
Path of the output file
Example
-------
>>> from pathlib import Path
>>> bwaPath = Path("tools/bwa")
>>> prefix = Path("reference/hg18")
>>> sequencePath = Path("input/R34D.fastq")
>>> outputPath = Path("output/R34D.sam")
>>> alignCommand(bwaPath, prefix, sequencePath, outputPath)
Link to BWA: https://bio-bwa.sourceforge.net
"""

args = [
str(bwaPath.absolute()), "mem",
"-o", str(outputPath.absolute()),
str(prefix.absolute()),
str(sequencePath.absolute())
]
]

command(args, True)


def sam2bamCommand(samtoolsPath: Path, samPath: Path, outputPath: Path) -> None:
"""
This function uses the CLI tool "samtools" to convert SAM files into their binary
version, BAM.
Parameters
----------
samtoolsPath : Path
Path pointing to the samtools binary
samPath : Path
Path pointing to the input .sam file
outputPath : Path
Path pointing to the output, .bam, file
Example
-------
>>> from pathlib import Path
>>> samtoolsPath = Path("tools/samtools")
>>> samPath = Path("input/R34D.sam")
>>> outputPath = Path("output/R34D.bam")
>>> sam2bamCommand(samtoolsPath, samPath, outputPath)
Link to samtools: http://htslib.org/
"""

command([
str(samtoolsPath.absolute()), "view",
"-b", "-S", "-o",
Expand All @@ -83,13 +132,36 @@ def sam2bamCommand(samtoolsPath: Path, samPath: Path, outputPath: Path) -> None:


def extractData(samtoolsPath: Path, file: Path) -> Tuple[List[int], List[int], List[int]]:
"""
Takes an aligned sequence file (SAM/BAM) and returns three lists holding
MAPQ scores, leftmost position and sequence length for each read in the
file. This is done using the samtools view command.
Parameters
----------
samtoolsPath : Path
Path pointing to the samtools binary
samPath : Path
Path pointing to the input .sam or .bam file
Example
-------
>>> from pathlib import Path
>>> samtoolsPath = Path("tools/samtools")
>>> file = Path("R34D.bam")
>>> scores, positions, lengths = extractData(samtoolsPath, file)
Link to samtools: http://htslib.org/
"""

scores: list[int] = []
positions: list[int] = []
sequenceLengths: list[int] = []

args = [
str(samtoolsPath.absolute()),
"view", "-F", "256", "-F", "2048", "-F", "512",
"view",
"-F", "256", "-F", "2048", "-F", "512", # Will ignore all duplicate reads
str(file.absolute())
]

Expand Down
37 changes: 37 additions & 0 deletions coretex/bioinformatics/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

from typing import List
from pathlib import Path

import logging
import subprocess


def logProcessOutput(output: bytes, severity: int) -> None:
Expand All @@ -28,3 +32,36 @@ def logProcessOutput(output: bytes, severity: int) -> None:

# ignoring type for now, has to be fixed in coretexpylib
logging.getLogger("coretexpylib").log(severity, line)


class CommandException(Exception):
pass


def command(args: List[str], ignoreOutput: bool = False) -> None:
process = subprocess.Popen(
args,
shell = False,
cwd = Path(__file__).parent,
stdout = subprocess.PIPE,
stderr = subprocess.PIPE
)

while process.poll() is None:
if process.stdout is None or process.stderr is None:
continue

stdout = process.stdout.readline()
stderr = process.stderr.readline()

if len(stdout) > 0 and not ignoreOutput:
logProcessOutput(stdout, logging.INFO)

if len(stderr) > 0 and not ignoreOutput:
if process.returncode == 0:
logProcessOutput(stderr, logging.INFO)
else:
logProcessOutput(stderr, logging.CRITICAL)

if process.returncode != 0:
raise CommandException(f">> [Coretex] Falied to execute command. Returncode: {process.returncode}")

0 comments on commit d698be9

Please sign in to comment.