Skip to content

Commit

Permalink
Merge pull request #42 from asapdiscovery/nextstrain
Browse files Browse the repository at this point in the history
Add Nextstrain functionality
  • Loading branch information
hmacdope authored Sep 26, 2024
2 parents 2e591c9 + ce1889b commit 630af43
Show file tree
Hide file tree
Showing 10 changed files with 1,630 additions and 26 deletions.
16 changes: 12 additions & 4 deletions .github/workflows/CI.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,17 @@ on:
# (from https://help.github.com/en/actions/reference/events-that-trigger-workflows#scheduled-events-schedule)
- cron: "0 0 * * 0"

defaults:
run:
shell: bash -l {0}

jobs:
test:
name: Test on ${{ matrix.os }}, Python ${{ matrix.python-version }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [macOS-latest, ubuntu-latest, windows-latest]
os: [macOS-latest, ubuntu-latest]
python-version: ["3.10", "3.11"]
fail-fast: false

Expand All @@ -36,14 +40,18 @@ jobs:
ulimit -a
# More info on options: https://github.com/marketplace/actions/provision-with-micromamba
- name: "Setup Micromamba"
# More info on options: https://github.com/mamba-org/provision-with-micromamba
- name: Setup Conda Environment
uses: mamba-org/setup-micromamba@v1
with:
environment-file: devtools/conda-envs/choppa.yaml
environment-name: test
channels: conda-forge,defaults
cache-environment: true
cache-downloads: true
cache-environment-key: environment-${{ steps.date.outputs.date }}
cache-downloads-key: downloads-${{ steps.date.outputs.date }}
create-args: >-
python=${{ matrix.python-version }}
python==${{ matrix.python-version }}
- name: Install package
# conda setup requires this special shell
Expand Down
102 changes: 101 additions & 1 deletion choppa/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,17 @@
from choppa.IO.input import FitnessFactory, ComplexFactory
from choppa.align import AlignFactory
from choppa.render import PublicationView, InteractiveView


from choppa.data.metadata.resources import (
MERS_REAL_GENE_TO_COMMUNITY_GENE,
MERS_TARGET_TO_REAL_GENE,
)

from choppa.cli.utils import SpecialHelpOrder
from pathlib import Path


@click.group(
cls=SpecialHelpOrder,
context_settings={"max_content_width": shutil.get_terminal_size().columns - 20},
Expand Down Expand Up @@ -114,7 +122,7 @@ def render(
# check extensions
if not Path(outfile_publication).suffix == ".pse":
raise ValueError("--op/--outfile-publication should end in '.pse'.")

if not Path(outfile_interactive).suffix == ".html":
raise ValueError("--oi/--outfile-interactive should end in '.html'.")

Expand Down Expand Up @@ -147,3 +155,95 @@ def render(
fitness_threshold=fitness_threshold,
output_session_file=outfile_interactive,
).render()


@cli.command(
    name="nextstrain",
    help=". ",
    short_help="From the database of NextStrain-maintained pathogen analyses (https://nextstrain.org), generate a data format suitable for choppa.render.",
)
@click.option(
    "-v",
    "--virus",
    type=click.STRING,
    help="Name of the virus to download mutation data for. See https://nextstrain.org/pathogens for a list of available viruses.",
    required=True,
)
@click.option(
    "-g",
    "--gene",
    type=click.STRING,
    help="Name of the gene to download mutation data for. See e.g. https://nextstrain.org/zika for a view of available genes.",
    required=True,
)
@click.option(
    "-o",
    "--outfile",
    type=click.Path(exists=False, file_okay=True, dir_okay=False, writable=True),
    help="Name of output file to write mutation data to in CSV. Should end in '.csv'.",
    required=True,
)
def nextstrain(
    virus: Optional[str] = None,
    gene: Optional[str] = None,
    outfile: Optional[str] = None,
):
    """Build a per-gene mutation-count table from a Nextstrain dataset.

    The command resolves the dataset URLs for ``virus``/``gene``, fetches the
    tree JSON and the root sequence, converts the JSON into a tree, extracts
    the ``mutations`` attribute of every node (internal nodes included),
    counts mutation events for the requested gene and hands the result to
    ``finalize_dataframe`` together with ``outfile``.

    Args:
        virus: Virus name as passed via ``-v/--virus``. The literal value
            ``"MERS-CoV"`` selects the dedicated MERS-CoV fetch path and
            gene-name translation.
        gene: Gene name as passed via ``-g/--gene``.
        outfile: Output path as passed via ``-o/--outfile``; its suffix must
            be ``.csv``.

    Raises:
        ValueError: If ``outfile`` does not end in ``.csv``, or if no root
            sequence is returned by the root-sequence fetch and the tree JSON
            has no ``"root_sequence"`` key either.
    """
    # Imported lazily inside the command, as in the rest of this function's
    # original form, so the module is only loaded when this command runs.
    from choppa.nextstrain import (
        get_url,
        fetch_nextstrain_json,
        fetch_nextstrain_root_sequence,
        fetch_nextstrain_json_mers_cov,
        nextstrain_json_to_tree,
        extract_tree_data,
        count_mutations_events,
        finalize_dataframe,
    )

    # Validate the output extension before doing any fetching.
    if Path(outfile).suffix != ".csv":
        raise ValueError("-o/--outfile should end in '.csv'.")

    download_url, nextstrain_tree_url = get_url(virus, gene)

    # MERS-CoV needs its own fetch path and gene naming; decide once.
    is_mers = virus == "MERS-CoV"

    # Fetch the tree JSON first, then the root sequence.
    if is_mers:
        tree_json = fetch_nextstrain_json_mers_cov()
        root_sequence_json = fetch_nextstrain_root_sequence(
            nextstrain_tree_url, MERS_COV=True
        )
    else:
        tree_json = fetch_nextstrain_json(download_url)
        root_sequence_json = fetch_nextstrain_root_sequence(nextstrain_tree_url)

    # When the dedicated fetch yields nothing, fall back to a root sequence
    # embedded in the tree JSON; without either one we cannot continue.
    if root_sequence_json is None:
        if "root_sequence" not in tree_json:
            raise ValueError(
                "Root sequence is missing from the Nextstrain API and the main tree data."
            )
        root_sequence_json = tree_json["root_sequence"]

    # Turn the JSON into a tree and pull the mutations off every node.
    tree = nextstrain_json_to_tree(tree_json)
    metadata_df = extract_tree_data(
        tree, attributes=["mutations"], include_internal_nodes=True
    )

    if is_mers:
        # rename to match inconsistent gene naming in this community contribution NextStrain
        gene = MERS_REAL_GENE_TO_COMMUNITY_GENE[MERS_TARGET_TO_REAL_GENE[gene]]

    mutation_count_df = count_mutations_events(metadata_df, gene)

    # Combine mutation counts with the root sequence; the return value is
    # not used here (presumably the helper writes ``outfile`` - confirm).
    finalize_dataframe(
        mutation_count_df, root_sequence_json, gene, outfile, mers=is_mers
    )
Loading

0 comments on commit 630af43

Please sign in to comment.