Skip to content

Commit

Permalink
Faster processing of GENCODE datasets.
Browse files Browse the repository at this point in the history
  • Loading branch information
michal-g committed Mar 2, 2020
1 parent 04786d5 commit 866d2ec
Show file tree
Hide file tree
Showing 8 changed files with 58 additions and 49 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ install:
script:
- pytest -vv dryadic/tests/test_mtypes.py
- pytest -vv dryadic/tests/test_mcombs.py
- pytest -vv dryadic/tests/test_mtrees.py
- python dryadic/tests/test_cohorts.py
- python dryadic/tests/test_pipes.py
- python dryadic/tests/test_kbtl.py
Expand Down
25 changes: 12 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
# Dryad

Dryad is a Python module for representing and predicting -omic features.

The following submodules are available in Dryad:

### `dryadic.features` ###
Loading -omic datasets and representing them in formats accessible to machine
learning methods.

### `dryadic.learning` ###
Algorithms and pipelines for using -omic datasets to predict -omic and
phenotypic features.
# Dryad #
`dryad` is a Python module for making predictions using -omic data. It
consists of two main parts: `dryadic.features`, containing tools for loading
-omic datasets and representing them in formats accessible to machine learning
methods, and `dryadic.learning`, which incorporates these tools into pipelines
designed to facilitate drawing biological insights from training prediction
tasks on -omic data.

## Installation ##
The easiest way to install `dryad` is with pip, from the TestPyPI index:

```pip install -i https://test.pypi.org/simple/ dryad==0.4.2```

2 changes: 1 addition & 1 deletion dryadic/features/cohorts/mut.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class BaseMutationCohort(PresenceCohort, UniCohort):
hierarchical representations of mutation data. Default is to
initialize with one tree that only sorts mutations by gene.
mut_genes (set or list-like), optional
mut_genes (set or list-like of :obj:`str`), optional
Set of genes whose mutation data should be considered. Recommended
for reducing the size of large mutation datasets, as default is to
use all mutations present in `mut_df`.
Expand Down
50 changes: 30 additions & 20 deletions dryadic/features/cohorts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@ def get_gencode(annot_file, include_types=None):
annot_file (str): A .gtf file, downloaded from e.g.
www.gencodegenes.org/releases/22.html
include_types (list-like or set of :obj:`str`), optional
Which annotation fields to include in the returned object.
The default only loads gene-level data.
Returns:
gn_annot (dict): Dictionary with keys corresponding to Ensembl gene
IDs and values consisting of dicts with
Expand All @@ -96,10 +100,10 @@ def get_gencode(annot_file, include_types=None):
if include_types:
use_types |= set(include_types)

if 'exon' in include_types:
if 'exon' in use_types:
use_types |= {'UTR'}

# remove annotation records that are non-relevant or on sex chromosomes
# remove annotation records that are irrelevant or on sex chromosomes
chroms_use = {'chr' + str(i+1) for i in range(22)}
annot = annot.loc[annot['Type'].isin(use_types)
& annot['Chr'].isin(chroms_use), :]
Expand Down Expand Up @@ -127,17 +131,20 @@ def get_gencode(annot_file, include_types=None):
info_flds.reset_index(drop=True)],
axis=1)

# find records corresponding to protein-coding genes
gene_df = info_df[(info_df.Type == 'gene')
& (info_df.gene_type == 'protein_coding')]
gene_df = gene_df.set_index('gene_id')

gn_annot = {gn: dict(recs[['Chr', 'Start', 'End', 'Strand', 'gene_name']])
# create dictionary using these records with Ensembl gene ids as keys
gn_annot = {gn: {'Chr': recs.Chr, 'Start': recs.Start, 'End': recs.End,
'Strand': recs.Strand, 'gene_name': recs.gene_name}
for gn, recs in gene_df.iterrows()}

if len(use_types) > 1:
info_df = info_df[info_df.transcript_type == 'protein_coding']

# group transcript records according to parent gene, transform gene
# records into a dictionary
# group transcript records according to parent gene
if 'transcript' in use_types:
tx_groups = info_df[(info_df.Type == 'transcript')
& info_df.gene_id.isin(gene_df.index)].groupby(
Expand All @@ -146,35 +153,38 @@ def get_gencode(annot_file, include_types=None):
# insert the transcripts for each gene into the gene record dictionary
for gn, tx_df in tx_groups:
gn_annot[gn]['Transcripts'] = {
tx: dict(recs[['Start', 'End', 'transcript_name']])
for tx, recs in tx_df.set_index('transcript_id').iterrows()
tx_df.transcript_id.iloc[i]: {
'Start': tx_df.Start.iloc[i], 'End': tx_df.End.iloc[i],
'transcript_name': tx_df.transcript_name.iloc[i]
}
for i in range(tx_df.shape[0])
}

if 'exon' in use_types:
if 'transcript' not in use_types:
raise ValueError("Cannot load gene exon information without also "
"loading transcript information!")

# likewise, group exon records according to parent gene
regn_groups = info_df[info_df.Type.isin(['exon', 'UTR'])
& info_df.gene_id.isin(gene_df.index)].groupby(
['gene_id', 'transcript_id'])
# likewise, group exon and UTR records according to parent gene
use_df = info_df[info_df.gene_id.isin(gene_df.index)]
exn_groups = use_df[use_df.Type == 'exon'].groupby(
['gene_id', 'transcript_id'])
utr_groups = use_df[use_df.Type == 'UTR'].groupby(
['gene_id', 'transcript_id'])

for (gn, tx), regn_df in regn_groups:
exn_df = regn_df[regn_df.Type == 'exon']
utr_df = regn_df[regn_df.Type == 'UTR']
for (gn, tx), exn_df in exn_groups:
gn_annot[gn]['Transcripts'][tx]['UTRs'] = []
gn_annot[gn]['Transcripts'][tx]['Exons'] = exn_df[[
'Start', 'End', 'exon_id']].to_dict(orient='records')

gn_annot[gn]['Transcripts'][tx]['Exons'] = exn_df.sort_values(
by='exon_number')[['Start', 'End', 'exon_id']].apply(
dict, axis=1).tolist()
exn_count = len(gn_annot[gn]['Transcripts'][tx]['Exons'])

for i in range(len(gn_annot[gn]['Transcripts'][tx]['Exons'])):
for i in range(exn_count):
gn_annot[gn]['Transcripts'][tx]['Exons'][i][
'number'] = "{}/{}".format(i + 1, exn_count)

for (gn, tx), utr_df in utr_groups:
gn_annot[gn]['Transcripts'][tx]['UTRs'] = utr_df.sort_values(
by='Start')[['Start', 'End']].apply(dict, axis=1).tolist()
by='Start')[['Start', 'End']].to_dict(orient='records')

return gn_annot

Expand Down
12 changes: 11 additions & 1 deletion dryadic/tests/test_cohorts.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
sys.path.extend([os.path.join(base_dir, '../..')])

from dryadic.features.cohorts.base import UniCohort
from dryadic.features.cohorts import *
from dryadic.features.cohorts import BaseMutationCohort
from dryadic.features.mutations import MuType

import numpy as np
Expand All @@ -19,6 +19,16 @@ def load_omic_data(data_lbl):
sep='\t', index_col=0)


def load_muts(muts_lbl):
    """Load a mutation table from the `resources` directory beside this file.

    Args:
        muts_lbl (str): Label of the table to load; the file that is read is
            `resources/muts_<muts_lbl>.tsv`.

    Returns:
        pandas.DataFrame: The tab-separated records, labelled with the nine
            mutation column names assigned below.
    """
    mut_cols = ['Gene', 'Form', 'Sample', 'Protein', 'Transcript', 'Exon',
                'ref_count', 'alt_count', 'PolyPhen']

    resource_dir = os.path.join(os.path.dirname(__file__), 'resources')
    muts_file = os.path.join(resource_dir, "muts_{}.tsv".format(muts_lbl))

    # column names are supplied here rather than read from the file, and
    # `#` is treated as the comment character when parsing
    return pd.read_csv(muts_file, sep='\t', comment='#',
                       names=mut_cols, engine='python')


def check_samp_split(cdata, expr_samps):
assert ((set(cdata.get_train_samples()) | set(cdata.get_test_samples()))
== set(expr_samps)), (
Expand Down
1 change: 0 additions & 1 deletion dryadic/tests/test_kbtl.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,6 @@ def main():
cv_seed=None, test_prop=0.3)
trs_cdata.update_split(new_seed=101)

import pdb; pdb.set_trace()
mult_clf.tune_coh(trs_cdata, sing_mtype,
test_count=4, tune_splits=2, parallel_jobs=1)
print(mult_clf)
Expand Down
12 changes: 1 addition & 11 deletions dryadic/tests/test_mtrees.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,14 @@

from ..features.mutations import MuType, MuTree
from .test_mtypes import mtype_tester
from .test_cohorts import load_muts
import pytest

import os
import pandas as pd
from itertools import product, chain
from itertools import combinations as combn


def load_muts(muts_lbl):
    """Load a mutation table from the `resources` directory beside this file.

    Args:
        muts_lbl (str): Label of the table to load; the file that is read is
            `resources/muts_<muts_lbl>.tsv`.

    Returns:
        pandas.DataFrame: The tab-separated records, labelled with the nine
            mutation column names assigned below.
    """
    mut_cols = ['Gene', 'Form', 'Sample', 'Protein', 'Transcript', 'Exon',
                'ref_count', 'alt_count', 'PolyPhen']

    resource_dir = os.path.join(os.path.dirname(__file__), 'resources')
    muts_file = os.path.join(resource_dir, "muts_{}.tsv".format(muts_lbl))

    # column names are supplied here rather than read from the file, and
    # `#` is treated as the comment character when parsing
    return pd.read_csv(muts_file, sep='\t', comment='#',
                       names=mut_cols, engine='python')


def pytest_generate_tests(metafunc):
if metafunc.function.__code__.co_argcount == 1:
pass
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import setuptools

setuptools.setup(name='dryad',
version='0.4.2a3',
version='0.4.2',
description='Prediction of Cancer Phenotypes Using Mutation Trees',
author='Michal Radoslaw Grzadkowski',
author_email='[email protected]',
packages=setuptools.find_packages(
exclude=["dryadic.tests.*", "dryadic.tests"]),
url = 'https://github.com/ohsu-comp-bio/dryad',
download_url = ('https://github.com/ohsu-comp-bio/'
'dryad/archive/v0.4.2a3.tar.gz'),
'dryad/archive/v0.4.2.tar.gz'),
install_requires=[
'numpy>=1.16',
'pandas>=0.25',
Expand Down

0 comments on commit 866d2ec

Please sign in to comment.