-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Concatenate fastq files for microsalt deliveries (#2951)
- Loading branch information
Showing
12 changed files
with
299 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
class FastqServiceError(Exception):
    """Base class for the errors of the fastq file service."""


class ConcatenationError(FastqServiceError):
    """Raised when a concatenated fastq file fails its validation."""


class InvalidFastqDirectory(FastqServiceError):
    """Error for an invalid fastq directory.

    NOTE(review): not raised anywhere in the visible code - confirm where it is used.
    """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
from pathlib import Path | ||
|
||
from cg.services.fastq_file_service.utils import ( | ||
concatenate_forward_reads, | ||
concatenate_reverse_reads, | ||
remove_raw_fastqs, | ||
) | ||
|
||
|
||
class FastqFileService:
    """Service that merges the fastq files of a directory into one file per read direction."""

    def concatenate(
        self,
        fastq_directory: Path,
        forward_output: Path,
        reverse_output: Path,
        remove_raw: bool = False,
    ):
        """Concatenate the forward and the reverse reads found in a directory.

        Each read direction is first written to a uniquely named temporary file
        inside fastq_directory and then renamed to its requested output path.
        A direction without any fastq files produces no output file.

        Args:
            fastq_directory: Directory holding the fastq files to concatenate.
            forward_output: Final path of the concatenated forward reads.
            reverse_output: Final path of the concatenated reverse reads.
            remove_raw: Remove the other .gz files in fastq_directory once both
                directions have been concatenated.
        """
        concatenated_forward: Path | None = concatenate_forward_reads(fastq_directory)
        concatenated_reverse: Path | None = concatenate_reverse_reads(fastq_directory)

        if remove_raw:
            # NOTE(review): the clean up removes every .gz file in the directory except
            # the two temporary files, not only the files that were concatenated.
            remove_raw_fastqs(
                fastq_directory=fastq_directory,
                forward_file=concatenated_forward,
                reverse_file=concatenated_reverse,
            )

        # Rename only after the clean up, while the temporary files are still excluded from it.
        for temporary_file, destination in (
            (concatenated_forward, forward_output),
            (concatenated_reverse, reverse_output),
        ):
            if temporary_file:
                temporary_file.rename(destination)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
from pathlib import Path | ||
import re | ||
import shutil | ||
import uuid | ||
|
||
from cg.services.fastq_file_service.exceptions import ConcatenationError | ||
|
||
|
||
def concatenate_forward_reads(directory: Path) -> Path | None:
    """Concatenate all forward read fastq files of a directory into one new file.

    Args:
        directory: Directory holding the fastq files. The concatenated file is
            written to the same directory under a new unique name.

    Returns:
        The path of the concatenated file, or None if the directory holds no
        forward read fastq files.

    Raises:
        ConcatenationError: If the concatenated file fails validation.
    """
    fastqs: list[Path] = get_forward_read_fastqs(directory)
    if not fastqs:
        return None
    output_file: Path = get_new_unique_file(directory)
    concatenate(input_files=fastqs, output_file=output_file)
    try:
        validate_concatenation(input_files=fastqs, output_file=output_file)
    except ConcatenationError:
        # Do not leave a file that failed validation behind in the directory.
        output_file.unlink(missing_ok=True)
        raise
    return output_file
|
||
|
||
def concatenate_reverse_reads(directory: Path) -> Path | None:
    """Concatenate all reverse read fastq files of a directory into one new file.

    Args:
        directory: Directory holding the fastq files. The concatenated file is
            written to the same directory under a new unique name.

    Returns:
        The path of the concatenated file, or None if the directory holds no
        reverse read fastq files.

    Raises:
        ConcatenationError: If the concatenated file fails validation.
    """
    fastqs: list[Path] = get_reverse_read_fastqs(directory)
    if not fastqs:
        return None
    output_file: Path = get_new_unique_file(directory)
    concatenate(input_files=fastqs, output_file=output_file)
    try:
        validate_concatenation(input_files=fastqs, output_file=output_file)
    except ConcatenationError:
        # Do not leave a file that failed validation behind in the directory.
        output_file.unlink(missing_ok=True)
        raise
    return output_file
|
||
|
||
def get_new_unique_file(directory: Path) -> Path:
    """Return a path inside the directory with a random UUID4 based .fastq.gz name.

    Only the path is built; no file is created.
    """
    file_name = f"{uuid.uuid4()}.fastq.gz"
    return Path(directory) / file_name
|
||
|
||
def get_forward_read_fastqs(fastq_directory: Path) -> list[Path]:
    """Return the forward read (R1) fastq files of the directory."""
    forward_direction = 1
    return get_fastqs_by_direction(fastq_directory=fastq_directory, direction=forward_direction)
|
||
|
||
def get_reverse_read_fastqs(fastq_directory: Path) -> list[Path]:
    """Return the reverse read (R2) fastq files of the directory."""
    reverse_direction = 2
    return get_fastqs_by_direction(fastq_directory=fastq_directory, direction=reverse_direction)
|
||
|
||
def get_fastqs_by_direction(fastq_directory: Path, direction: int) -> list[Path]:
    """Return the fastq files of one read direction in a directory, sorted by name.

    A file is selected when its whole name has the form
    <prefix>_R<direction>_<digits>.fastq.gz.

    Args:
        fastq_directory: Directory to search; sub directories are not entered.
        direction: Read direction number used in the file names.

    Returns:
        The matching files ordered by file name.
    """
    # The dots are escaped and the whole name must match, so that files such as
    # "sample_R1_1.fastq.gz.md5" are not picked up for concatenation.
    pattern = re.compile(rf".+_R{direction}_[0-9]+\.fastq\.gz")
    fastqs: list[Path] = [
        file for file in fastq_directory.iterdir() if pattern.fullmatch(file.name)
    ]
    return sort_files_by_name(fastqs)
|
||
|
||
def get_total_size(files: list[Path]) -> int:
    """Return the combined size in bytes of the given files."""
    total_bytes = 0
    for path in files:
        total_bytes += path.stat().st_size
    return total_bytes
|
||
|
||
def concatenate(input_files: list[Path], output_file: Path) -> None:
    """Write the raw bytes of the input files, in the given order, to the output file.

    The output file is created, or truncated if it already exists.
    """
    with open(output_file, "wb") as destination:
        for source_path in input_files:
            with open(source_path, "rb") as source:
                # Copies in chunks, so a large file is never held in memory as a whole.
                shutil.copyfileobj(source, destination)
|
||
|
||
def validate_concatenation(input_files: list[Path], output_file: Path) -> None:
    """Check that the output file is as large as all of its input files together.

    Args:
        input_files: The files that were concatenated.
        output_file: The file the input files were concatenated into.

    Raises:
        ConcatenationError: If the size of the output file differs from the
            combined size of the input files.
    """
    total_size: int = get_total_size(input_files)
    concatenated_size: int = get_total_size([output_file])
    if total_size != concatenated_size:
        # State both sizes so a failed delivery can be diagnosed from the error alone.
        raise ConcatenationError(
            f"Concatenated file {output_file} is {concatenated_size} bytes, "
            f"expected {total_size} bytes from {len(input_files)} input files"
        )
|
||
|
||
def sort_files_by_name(files: list[Path]) -> list[Path]:
    """Return a new list with the files ordered by file name, ignoring their directories."""
    ordered: list[Path] = list(files)
    ordered.sort(key=lambda path: path.name)
    return ordered
|
||
|
||
def file_can_be_removed(file: Path, forward_file: Path, reverse_file: Path) -> bool: | ||
return file.suffix == ".gz" and file != forward_file and file != reverse_file | ||
|
||
|
||
def remove_raw_fastqs(
    fastq_directory: Path, forward_file: Path | None, reverse_file: Path | None
) -> None:
    """Delete the .gz files of a directory, keeping the two concatenated files.

    Every entry directly inside the directory with the suffix ".gz" is removed,
    whether or not it was part of a concatenation.

    Args:
        fastq_directory: Directory to clean up; sub directories are not entered.
        forward_file: Concatenated forward reads to keep, or None if there are none.
        reverse_file: Concatenated reverse reads to keep, or None if there are none.
    """
    for file in fastq_directory.iterdir():
        if file_can_be_removed(file=file, forward_file=forward_file, reverse_file=reverse_file):
            file.unlink()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
from pathlib import Path | ||
import pytest | ||
|
||
from cg.services.fastq_file_service.fastq_file_service import FastqFileService | ||
|
||
|
||
@pytest.fixture
def fastq_file_service():
    """Return a fastq file service."""
    service = FastqFileService()
    return service
|
||
|
||
def create_fastqs_directory(number_forward_reads, number_reverse_reads, tmp_path):
    """Create a "fastqs" directory in tmp_path with the requested number of fastq files.

    Forward reads are named sample_R1_<index>.fastq.gz and reverse reads
    sample_R2_<index>.fastq.gz, with the index starting at zero. Each file holds
    a short plain text placeholder instead of real gzip data.

    Returns the path of the created directory.
    """
    fastq_dir = Path(tmp_path) / "fastqs"
    fastq_dir.mkdir()

    for index in range(number_forward_reads):
        forward_fastq = fastq_dir / f"sample_R1_{index}.fastq.gz"
        forward_fastq.write_text(f"forward read {index}")

    for index in range(number_reverse_reads):
        reverse_fastq = fastq_dir / f"sample_R2_{index}.fastq.gz"
        reverse_fastq.write_text(f"reverse read {index}")

    return fastq_dir
|
||
|
||
@pytest.fixture
def fastqs_dir(tmp_path) -> Path:
    """Return a directory with three forward and three reverse reads."""
    directory = create_fastqs_directory(
        number_forward_reads=3,
        number_reverse_reads=3,
        tmp_path=tmp_path,
    )
    return directory
|
||
|
||
@pytest.fixture
def fastqs_forward(tmp_path) -> Path:
    """Return a directory with three forward reads and no reverse reads."""
    directory = create_fastqs_directory(
        number_forward_reads=3,
        number_reverse_reads=0,
        tmp_path=tmp_path,
    )
    return directory
Oops, something went wrong.