Skip to content

Commit

Permalink
Final tweaks and updates to documentation for v0.4.2a3
Browse files Browse the repository at this point in the history
  • Loading branch information
michal-g committed Feb 20, 2020
1 parent e7ed029 commit 04786d5
Show file tree
Hide file tree
Showing 10 changed files with 90 additions and 19 deletions.
8 changes: 6 additions & 2 deletions dryadic/features/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ Organization of genomic perturbation features into tree-like structures
defined by hierarchical properties such as genomic location and mutation form.

## cohorts
Interfaces for pairing the datasets used as input with the features and phenotypes
used as labels that define -omic machine learning tasks.
Interfaces for pairing the datasets used as input with the features and
phenotypes used as labels that define -omic machine learning tasks.

## data
Utilities for loading datasets often used to augment -omic data, such
as protein binding domains.

6 changes: 4 additions & 2 deletions dryadic/features/cohorts/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ class Cohort(object):
omic_data : An -omic dataset or collection thereof.
cv_seed (int): A seed used for random sampling from the datasets.
test_prop: The proportion of samples in each dataset used for testing.
test_samps: The samples in each dataset used for testing.
Should be `None` if `test_prop` is already specified.
"""

Expand Down Expand Up @@ -82,7 +84,7 @@ def get_features(self, include_feats=None, exclude_feats=None):
"""Retrieves features over which -omic measurements were made."""

def train_data(self,
pheno=None,
pheno,
include_samps=None, exclude_samps=None,
include_feats=None, exclude_feats=None):
"""Retrieval of the training cohort from the -omic dataset."""
Expand All @@ -101,7 +103,7 @@ def train_data(self,
return self.get_omic_data(samps, feats), pheno_data

def test_data(self,
pheno=None,
pheno,
include_samps=None, exclude_samps=None,
include_feats=None, exclude_feats=None):
"""Retrieval of the testing cohort from the -omic dataset."""
Expand Down
40 changes: 38 additions & 2 deletions dryadic/features/cohorts/mut.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,39 @@ class BaseMutationCohort(PresenceCohort, UniCohort):
Args:
expr_mat (pandas.DataFrame, shape = [n_samps, n_features])
-Omic dataset that will be used as input features for prediction.
mut_df (pandas.DataFrame, shape = [n_muts, n_fields])
A list of mutations present in the samples, with various fields
corresponding to mutation attributes.
mut_levels (iterable of list-like), optional
Which combinations of mutation attributes to use when creating
hierarchical representations of mutation data. Default is to
initialize with one tree that only sorts mutations by gene.
mut_genes (set or list-like), optional
Set of genes whose mutation data should be considered. Recommended
for reducing the size of large mutation datasets, as default is to
use all mutations present in `mut_df`.
cv_seed (int), optional: Seed used for random sampling.
test_prop (float), optional: Proportion of cohort's samples that will
be used for testing. Default is to not
have a testing sub-cohort.
Attributes:
mtree (MuTree): A hierarchical representation of the mutations present
in the dataset.
mtrees (:obj:`dict` of :obj:`MuTree`)
Hierarchical representations of the mutations present in the
dataset, ordered according to combinations of mutation attributes.
"""

def __init__(self,
expr_mat, mut_df, mut_levels=None, mut_genes=None,
domain_dir=None, leaf_annot=('PolyPhen', ),
cv_seed=None, test_prop=0):

# if a gene set is specified remove mutation data from other genes
if mut_genes is not None:
mut_df = mut_df.loc[mut_df.Gene.isin(mut_genes)]

Expand All @@ -33,6 +50,8 @@ def __init__(self,
self.leaf_annot = leaf_annot
self.mtrees = dict()

# initialize mutation tree(s) according to specified mutation
# attribute combinations
if mut_levels is None:
self.add_mut_lvls(('Gene', ))

Expand All @@ -43,11 +62,28 @@ def __init__(self,
super().__init__(expr_mat, cv_seed, test_prop)

def add_mut_lvls(self, lvls):
"""Adds a hierarchical representation of mutations.
This method adds (or replaces an existing) tree of mutations based
on a given combination of mutation attributes.
Args:
lvls (list-like of :obj:`str`)
The combination of mutation attributes used to build the tree.
"""
self.mtrees[tuple(lvls)] = MuTree(self.muts, levels=lvls,
domain_dir=self.domain_dir,
leaf_annot=self.leaf_annot)

def choose_mtree(self, pheno):
"""Finds (or adds) the tree that matches a given mutation object.
Args:
pheno (MuType or MutComb): An abstract representation of a set of
mutations.
"""
if isinstance(pheno, MuType):
phn_lvls = pheno.get_sorted_levels()

Expand Down
17 changes: 15 additions & 2 deletions dryadic/features/cohorts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,14 +207,27 @@ def log_norm(data_mat):


def drop_duplicate_genes(expr_mat):
"""Removes duplicate entries for genes that appear more than once in a
matrix of -omic data, keeping the entry with the greatest total value.
Args:
expr_mat (:obj:`pd.DataFrame`)
Returns:
expr_mat (:obj:`pd.DataFrame`)
"""
gene_counts = expr_mat.columns.value_counts()
dup_genes = gene_counts.index[gene_counts > 1]
new_expr = expr_mat.copy()


# for each gene which appears more than once, find the set of -omic
# measurements with the greatest total value
for dup_gene in dup_genes:
gn_indx = np.argwhere(new_expr.columns.get_loc(dup_gene)).flatten()
use_indx = new_expr.iloc[:, gn_indx].sum().values.argmax()


# remove the entries in the matrix for this gene that are not the
# entry with the greatest total value
rmv_indxs = gn_indx[:use_indx].tolist()
rmv_indxs += gn_indx[(use_indx + 1):].tolist()
new_expr = new_expr.iloc[:, [i for i in range(new_expr.shape[1])
Expand Down
1 change: 1 addition & 0 deletions dryadic/features/mutations/branches.py
Original file line number Diff line number Diff line change
Expand Up @@ -691,6 +691,7 @@ def invert(self, mtree):
inv_mtype (MuType)
"""
#TODO: implement this in MuTrees
return mtree.get_diff(MuType(mtree.allkey()), self)

def subkeys(self):
Expand Down
3 changes: 3 additions & 0 deletions dryadic/features/mutations/trees.py
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,8 @@ def __len__(self):
return len(self.get_samples())

def sort_iter(self):
"""Iterates through the branches of the tree, ordering mutation
attributes where possible."""

if self.mut_level in ['Exon', 'Location']:
return iter(sorted(
Expand Down Expand Up @@ -890,6 +892,7 @@ def combtypes(self,
"""
branch_mtypes = set()
comb_mtypes = set()
#TODO: more error-checking for input values, e.g. sizes can't be zero

if not isinstance(min_branch_size, str):
branch_mtypes = self.branchtypes(
Expand Down
14 changes: 8 additions & 6 deletions dryadic/learning/pipelines/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,23 +132,25 @@ def _fit(self, X, y=None, **fit_params):
return Xt, final_params

def predict_train(self,
cohort, pheno,
cohort, lbl_type='prob',
include_samps=None, exclude_samps=None,
include_feats=None, exclude_feats=None):
return self.predict_omic(
cohort.train_data(pheno,
cohort.train_data(None,
include_samps, exclude_samps,
include_feats, exclude_feats)[0]
include_feats, exclude_feats)[0],
lbl_type
)

def predict_test(self,
cohort, pheno,
cohort, lbl_type='prob',
include_samps=None, exclude_samps=None,
include_feats=None, exclude_feats=None):
return self.predict_omic(
cohort.test_data(pheno,
cohort.test_data(None,
include_samps, exclude_samps,
include_feats, exclude_feats)[0]
include_feats, exclude_feats)[0],
lbl_type
)

def predict_omic(self, omic_data):
Expand Down
6 changes: 3 additions & 3 deletions dryadic/tests/test_cohorts.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,12 @@ def main():
cdata.update_split(new_seed=551, test_prop=1./3)
assert cdata.get_seed() == 551
assert len(cdata.get_samples()) == expr_data.shape[0]
assert cdata.train_data()[0].shape == (expr_data.shape[0] * 2/3,
expr_data.shape[1])
assert cdata.train_data(None)[0].shape == (expr_data.shape[0] * 2/3,
expr_data.shape[1])
check_samp_split(cdata, expr_data.index)

cdata.update_split(new_seed=551, test_samps=expr_data.index[:20])
assert cdata.test_data()[0].shape == (20, expr_data.shape[1])
assert cdata.test_data(None)[0].shape == (20, expr_data.shape[1])
check_samp_split(cdata, expr_data.index)

mut_data = load_omic_data('variants')
Expand Down
10 changes: 10 additions & 0 deletions dryadic/tests/test_pipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,17 @@ def main():
for param, _ in clf.tune_priors:
assert clf.get_params()[param] == cvs['params'][best_indx][param]

use_feats = cdata.get_features()[::3]
clf.fit_coh(cdata, test_mtype, include_feats=use_feats)
train_preds = clf.predict_train(cdata, include_feats=use_feats)
test_preds = clf.predict_test(cdata, include_feats=use_feats)
test_preds = clf.predict_test(cdata,
include_feats=use_feats, lbl_type='prob')

clf.fit_coh(cdata, test_mtype)
train_preds = clf.predict_train(cdata)
test_preds = clf.predict_test(cdata, lbl_type='raw')

tuned_coefs = np.floor(expr_data.shape[1]
* (clf.named_steps['feat'].mean_perc / 100))
assert tuned_coefs == len(clf.named_steps['fit'].coef_[0]), (
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import setuptools

setuptools.setup(name='dryad',
version='0.4.1',
version='0.4.2a3',
description='Prediction of Cancer Phenotypes Using Mutation Trees',
author='Michal Radoslaw Grzadkowski',
author_email='[email protected]',
packages=setuptools.find_packages(
exclude=["dryadic.tests.*", "dryadic.tests"]),
url = 'https://github.com/ohsu-comp-bio/dryad',
download_url = ('https://github.com/ohsu-comp-bio/'
'dryad/archive/v0.4.tar.gz'),
'dryad/archive/v0.4.2a3.tar.gz'),
install_requires=[
'numpy>=1.16',
'pandas>=0.25',
Expand Down

0 comments on commit 04786d5

Please sign in to comment.