diff --git a/.travis.yml b/.travis.yml index 31678c6..9225791 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,6 +16,7 @@ install: script: - pytest -vv dryadic/tests/test_mtypes.py - pytest -vv dryadic/tests/test_mcombs.py + - pytest -vv dryadic/tests/test_mtrees.py - python dryadic/tests/test_cohorts.py - python dryadic/tests/test_pipes.py - python dryadic/tests/test_kbtl.py diff --git a/README.md b/README.md index 193491a..aa26505 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,13 @@ -# Dryad - -Dryad is a Python module for representing and predicting -omic features. - -The following submodules are available in Dryad: - -### `dryadic.features` ### -Loading -omic datasets and representing them in formats accessible to machine -learning methods. - -### `dryadic.learning` ### -Algorithms and pipelines for using -omic datasets to predict -omic and -phenotypic features. +# Dryad # +`dryad` is a Python module for making predictions using -omic data. It +consists of two main parts: `dryadic.features`, containing tools for loading +-omic datasets and representing them in formats accessible to machine learning +methods, and `dryadic.learning`, which incorporates these tools into pipelines +designed to facilitate drawing biological insights from training prediction +tasks on -omic data. + +## Installation ## +The easiest way to install `dryad` is via PyPI: + +```pip install -i https://test.pypi.org/simple/ dryad==0.4.2``` diff --git a/dryadic/features/cohorts/mut.py b/dryadic/features/cohorts/mut.py index d122188..a148758 100644 --- a/dryadic/features/cohorts/mut.py +++ b/dryadic/features/cohorts/mut.py @@ -18,7 +18,7 @@ class BaseMutationCohort(PresenceCohort, UniCohort): hierarchical representations of mutation data. Default is to initialize with one tree that only sorts mutations by gene. - mut_genes (set or list-like), optional + mut_genes (set or list-like of :obj:`str`), optional Set of genes whose mutation data should be considered. Recommended for reducing the size of large mutation datasets, as default is to use all mutations present in `mut_df`. diff --git a/dryadic/features/cohorts/utils.py b/dryadic/features/cohorts/utils.py index 0d07401..39a673d 100644 --- a/dryadic/features/cohorts/utils.py +++ b/dryadic/features/cohorts/utils.py @@ -81,6 +81,10 @@ def get_gencode(annot_file, include_types=None): annot_file (str): A .gtf file, downloaded from eg. www.gencodegenes.org/releases/22.html + include_types (list-like or set of :obj:`str`), optional + Which annotation fields to include in the returned object. + The default only loads gene-level data. + Returns: gn_annot (dict): Dictionary with keys corresponding to Ensembl gene IDs and values consisting of dicts with @@ -96,10 +100,10 @@ def get_gencode(annot_file, include_types=None): if include_types: use_types |= set(include_types) - if 'exon' in include_types: + if 'exon' in use_types: use_types |= {'UTR'} - # remove annotation records that are non-relevant or on sex chromosomes + # remove annotation records that are irrelevant or on sex chromosomes chroms_use = {'chr' + str(i+1) for i in range(22)} annot = annot.loc[annot['Type'].isin(use_types) & annot['Chr'].isin(chroms_use), :] @@ -127,17 +131,20 @@ def get_gencode(annot_file, include_types=None): info_flds.reset_index(drop=True)], axis=1) + # find records corresponding to protein-coding genes gene_df = info_df[(info_df.Type == 'gene') & (info_df.gene_type == 'protein_coding')] gene_df = gene_df.set_index('gene_id') - gn_annot = {gn: dict(recs[['Chr', 'Start', 'End', 'Strand', 'gene_name']]) + # create dictionary using these records with Ensembl gene ids as keys + gn_annot = {gn: {'Chr': recs.Chr, 'Start': recs.Start, 'End': recs.End, + 'Strand': recs.Strand, 'gene_name': recs.gene_name} for gn, recs in gene_df.iterrows()} + if len(use_types) > 1: info_df = info_df[info_df.transcript_type == 'protein_coding'] - # group transcript records according to parent gene, transform gene - # records into a dictionary + # group transcript records according to parent gene if 'transcript' in use_types: tx_groups = info_df[(info_df.Type == 'transcript') & info_df.gene_id.isin(gene_df.index)].groupby( @@ -146,8 +153,11 @@ def get_gencode(annot_file, include_types=None): # insert the transcripts for each gene into the gene record dictionary for gn, tx_df in tx_groups: gn_annot[gn]['Transcripts'] = { - tx: dict(recs[['Start', 'End', 'transcript_name']]) - for tx, recs in tx_df.set_index('transcript_id').iterrows() + tx_df.transcript_id.iloc[i]: { + 'Start': tx_df.Start.iloc[i], 'End': tx_df.End.iloc[i], + 'transcript_name': tx_df.transcript_name.iloc[i] + } + for i in range(tx_df.shape[0]) } if 'exon' in use_types: @@ -155,26 +165,26 @@ def get_gencode(annot_file, include_types=None): raise ValueError("Cannot load gene exon information without also " "loading transcript information!") - # likewise, group exon records according to parent gene - regn_groups = info_df[info_df.Type.isin(['exon', 'UTR']) - & info_df.gene_id.isin(gene_df.index)].groupby( - ['gene_id', 'transcript_id']) + # likewise, group exon and UTR records according to parent gene + use_df = info_df[info_df.gene_id.isin(gene_df.index)] + exn_groups = use_df[use_df.Type == 'exon'].groupby( + ['gene_id', 'transcript_id']) + utr_groups = use_df[use_df.Type == 'UTR'].groupby( + ['gene_id', 'transcript_id']) - for (gn, tx), regn_df in regn_groups: - exn_df = regn_df[regn_df.Type == 'exon'] - utr_df = regn_df[regn_df.Type == 'UTR'] + for (gn, tx), exn_df in exn_groups: + gn_annot[gn]['Transcripts'][tx]['UTRs'] = [] + gn_annot[gn]['Transcripts'][tx]['Exons'] = exn_df[[ + 'Start', 'End', 'exon_id']].to_dict(orient='records') - gn_annot[gn]['Transcripts'][tx]['Exons'] = exn_df.sort_values( - by='exon_number')[['Start', 'End', 'exon_id']].apply( - dict, axis=1).tolist() exn_count = len(gn_annot[gn]['Transcripts'][tx]['Exons']) - - for i in range(len(gn_annot[gn]['Transcripts'][tx]['Exons'])): + for i in range(exn_count): gn_annot[gn]['Transcripts'][tx]['Exons'][i][ 'number'] = "{}/{}".format(i + 1, exn_count) + for (gn, tx), utr_df in utr_groups: gn_annot[gn]['Transcripts'][tx]['UTRs'] = utr_df.sort_values( - by='Start')[['Start', 'End']].apply(dict, axis=1).tolist() + by='Start')[['Start', 'End']].to_dict(orient='records') return gn_annot diff --git a/dryadic/tests/test_cohorts.py b/dryadic/tests/test_cohorts.py index ab3751e..5b51b00 100644 --- a/dryadic/tests/test_cohorts.py +++ b/dryadic/tests/test_cohorts.py @@ -6,7 +6,7 @@ sys.path.extend([os.path.join(base_dir, '../..')]) from dryadic.features.cohorts.base import UniCohort -from dryadic.features.cohorts import * +from dryadic.features.cohorts import BaseMutationCohort from dryadic.features.mutations import MuType import numpy as np @@ -19,6 +19,16 @@ def load_omic_data(data_lbl): sep='\t', index_col=0) +def load_muts(muts_lbl): + return pd.read_csv( + os.path.join(os.path.dirname(__file__), 'resources', + "muts_{}.tsv".format(muts_lbl)), + engine='python', sep='\t', comment='#', + names=['Gene', 'Form', 'Sample', 'Protein', 'Transcript', 'Exon', + 'ref_count', 'alt_count', 'PolyPhen'] + ) + + def check_samp_split(cdata, expr_samps): assert ((set(cdata.get_train_samples()) | set(cdata.get_test_samples())) == set(expr_samps)), ( diff --git a/dryadic/tests/test_kbtl.py b/dryadic/tests/test_kbtl.py index ae40441..ef246e4 100644 --- a/dryadic/tests/test_kbtl.py +++ b/dryadic/tests/test_kbtl.py @@ -105,7 +105,6 @@ def main(): cv_seed=None, test_prop=0.3) trs_cdata.update_split(new_seed=101) - import pdb; pdb.set_trace() mult_clf.tune_coh(trs_cdata, sing_mtype, test_count=4, tune_splits=2, parallel_jobs=1) print(mult_clf) diff --git a/dryadic/tests/test_mtrees.py b/dryadic/tests/test_mtrees.py index 161edfa..1780bc0 100644 --- a/dryadic/tests/test_mtrees.py +++ b/dryadic/tests/test_mtrees.py @@ -10,24 +10,14 @@ from ..features.mutations import MuType, MuTree from .test_mtypes import mtype_tester +from .test_cohorts import load_muts import pytest -import os import pandas as pd from itertools import product, chain from itertools import combinations as combn -def load_muts(muts_lbl): - return pd.read_csv( - os.path.join(os.path.dirname(__file__), 'resources', - "muts_{}.tsv".format(muts_lbl)), - engine='python', sep='\t', comment='#', - names=['Gene', 'Form', 'Sample', 'Protein', 'Transcript', 'Exon', - 'ref_count', 'alt_count', 'PolyPhen'] - ) - - def pytest_generate_tests(metafunc): if metafunc.function.__code__.co_argcount == 1: pass diff --git a/setup.py b/setup.py index 8d6aa4b..37cba2b 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ import setuptools setuptools.setup(name='dryad', - version='0.4.2a3', + version='0.4.2', description='Prediction of Cancer Phenotypes Using Mutation Trees', author='Michal Radoslaw Grzadkowski', author_email='grzadkow@ohsu.edu', @@ -9,7 +9,7 @@ exclude=["dryadic.tests.*", "dryadic.tests"]), url = 'https://github.com/ohsu-comp-bio/dryad', download_url = ('https://github.com/ohsu-comp-bio/' - 'dryad/archive/v0.4.2a3.tar.gz'), + 'dryad/archive/v0.4.2.tar.gz'), install_requires=[ 'numpy>=1.16', 'pandas>=0.25',