Final tweaks and updates to documentation for v0.4.2a3

ohsu-comp-bio · Feb 20, 2020 · 04786d5 · 04786d5
1 parent e7ed029
commit 04786d5
Show file tree

Hide file tree

Showing 10 changed files with 90 additions and 19 deletions.
diff --git a/dryadic/features/README.md b/dryadic/features/README.md
@@ -9,6 +9,10 @@ Organization of genomic perturbation features into tree-like structures
 defined by hierarchical properties such as genomic location and mutation form.
 
 ## cohorts
-Interfaces for pairing the datasets used as input with the features and phenotypes
-used as labels that define -omic machine learning tasks.
+Interfaces for pairing the datasets used as input with the features and
+phenotypes used as labels that define -omic machine learning tasks.
+
+## data
+Utilities for loading datasets often used to augment -omic data, such
+as protein binding domains.
 
diff --git a/dryadic/features/cohorts/base.py b/dryadic/features/cohorts/base.py
@@ -25,6 +25,8 @@ class Cohort(object):
         omic_data : An -omic dataset or collection thereof.
         cv_seed (int): A seed used for random sampling from the datasets.
         test_prop: The proportion of samples in each dataset used for testing.
+        test_samps: The samples in each dataset used for testing.
+                    Should be `None` if `test_prop` is already specified.
 
     """
 
@@ -82,7 +84,7 @@ def get_features(self, include_feats=None, exclude_feats=None):
         """Retrieves features over which -omic measurements were made."""
 
     def train_data(self,
-                   pheno=None,
+                   pheno,
                    include_samps=None, exclude_samps=None,
                    include_feats=None, exclude_feats=None):
         """Retrieval of the training cohort from the -omic dataset."""
@@ -101,7 +103,7 @@ def train_data(self,
         return self.get_omic_data(samps, feats), pheno_data
 
     def test_data(self,
-                  pheno=None,
+                  pheno,
                   include_samps=None, exclude_samps=None,
                   include_feats=None, exclude_feats=None):
         """Retrieval of the testing cohort from the -omic dataset."""

diff --git a/dryadic/features/cohorts/mut.py b/dryadic/features/cohorts/mut.py
@@ -8,22 +8,39 @@ class BaseMutationCohort(PresenceCohort, UniCohort):
 
     Args:
         expr_mat (pandas.DataFrame, shape = [n_samps, n_features])
+            -Omic dataset that will be used as input features for prediction.
         mut_df (pandas.DataFrame, shape = [n_muts, n_fields])
+            A list of mutations present in the samples, with various fields
+            corresponding to mutation attributes.
+
+        mut_levels (iterable of list-like), optional
+            Which combinations of mutation attributes to use when creating
+            hierarchical representations of mutation data. Default is to
+            initialize with one tree that only sorts mutations by gene.
+
+        mut_genes (set or list-like), optional
+            Set of genes whose mutation data should be considered. Recommended
+            for reducing the size of large mutation datasets, as default is to
+            use all mutations present in `mut_df`.
+
         cv_seed (int), optional: Seed used for random sampling.
         test_prop (float), optional: Proportion of cohort's samples that will
                                      be used for testing. Default is to not
                                      have a testing sub-cohort.
 
     Attributes:
-        mtree (MuTree): A hierarchical representation of the mutations present
-                        in the dataset.
+        mtrees (:obj:`dict` of :obj:`MuTree`)
+            Hierarchical representations of the mutations present in the
+            dataset, keyed by the tuple of mutation attributes each uses.
 
     """
 
     def __init__(self,
                  expr_mat, mut_df, mut_levels=None, mut_genes=None,
                  domain_dir=None, leaf_annot=('PolyPhen', ),
                  cv_seed=None, test_prop=0):
+
+        # if a gene set is specified remove mutation data from other genes
         if mut_genes is not None:
             mut_df = mut_df.loc[mut_df.Gene.isin(mut_genes)]
 
@@ -33,6 +50,8 @@ def __init__(self,
         self.leaf_annot = leaf_annot
         self.mtrees = dict()
 
+        # initialize mutation tree(s) according to specified mutation
+        # attribute combinations
         if mut_levels is None:
             self.add_mut_lvls(('Gene', ))
 
@@ -43,11 +62,28 @@ def __init__(self,
         super().__init__(expr_mat, cv_seed, test_prop)
 
     def add_mut_lvls(self, lvls):
+        """Adds a hierarchical representation of mutations.
+
+        This method adds (or replaces an existing) tree of mutations based
+        on a given combination of mutation attributes.
+
+        Args:
+            lvls (list-like of :obj:`str`)
+                Mutation attributes to use as the levels of the tree.
+
+        """
         self.mtrees[tuple(lvls)] = MuTree(self.muts, levels=lvls,
                                           domain_dir=self.domain_dir,
                                           leaf_annot=self.leaf_annot)
 
     def choose_mtree(self, pheno):
+        """Finds (or adds) the tree that matches a given mutation object.
+
+        Args:
+            pheno (MuType or MutComb): An abstract representation of a set of
+                                       mutations.
+        
+        """
         if isinstance(pheno, MuType):
             phn_lvls = pheno.get_sorted_levels()
 

diff --git a/dryadic/features/cohorts/utils.py b/dryadic/features/cohorts/utils.py
@@ -207,14 +207,27 @@ def log_norm(data_mat):
 
 
 def drop_duplicate_genes(expr_mat):
+    """Removes genes that appear more than once in a matrix of -omic data.
+
+    Args:
+        expr_mat (:obj:`pd.DataFrame`): -omic data with genes as columns.
+
+    Returns:
+        expr_mat (:obj:`pd.DataFrame`)
+
+    """
     gene_counts = expr_mat.columns.value_counts()
     dup_genes = gene_counts.index[gene_counts > 1]
     new_expr = expr_mat.copy()
-
+
+    # for each gene which appears more than once, find the set of -omic
+    # measurements with the greatest total value
     for dup_gene in dup_genes:
         gn_indx = np.argwhere(new_expr.columns.get_loc(dup_gene)).flatten()
         use_indx = new_expr.iloc[:, gn_indx].sum().values.argmax()
-
+
+        # remove the entries in the matrix for this gene that are not the
+        # entry with the greatest total value
         rmv_indxs = gn_indx[:use_indx].tolist()
         rmv_indxs += gn_indx[(use_indx + 1):].tolist()
         new_expr = new_expr.iloc[:, [i for i in range(new_expr.shape[1])

diff --git a/dryadic/features/mutations/branches.py b/dryadic/features/mutations/branches.py
@@ -691,6 +691,7 @@ def invert(self, mtree):
             inv_mtype (MuType)
 
         """
+        #TODO: implement this in MuTrees
         return mtree.get_diff(MuType(mtree.allkey()), self)
 
     def subkeys(self):

diff --git a/dryadic/features/mutations/trees.py b/dryadic/features/mutations/trees.py
@@ -543,6 +543,8 @@ def __len__(self):
         return len(self.get_samples())
 
     def sort_iter(self):
+        """Iterates through the branches of the tree, ordering mutation 
+           attributes where possible."""
 
         if self.mut_level in ['Exon', 'Location']:
             return iter(sorted(
@@ -890,6 +892,7 @@ def combtypes(self,
         """
         branch_mtypes = set()
         comb_mtypes = set()
+        #TODO: more error-checking for input values, e.g. sizes can't be zero
 
         if not isinstance(min_branch_size, str):
             branch_mtypes = self.branchtypes(

diff --git a/dryadic/learning/pipelines/base.py b/dryadic/learning/pipelines/base.py
@@ -132,23 +132,25 @@ def _fit(self, X, y=None, **fit_params):
         return Xt, final_params
 
     def predict_train(self,
-                      cohort, pheno,
+                      cohort, lbl_type='prob',
                       include_samps=None, exclude_samps=None,
                       include_feats=None, exclude_feats=None):
         return self.predict_omic(
-            cohort.train_data(pheno,
+            cohort.train_data(None,
                               include_samps, exclude_samps,
-                              include_feats, exclude_feats)[0]
+                              include_feats, exclude_feats)[0],
+            lbl_type
             )
 
     def predict_test(self,
-                     cohort, pheno,
+                     cohort, lbl_type='prob',
                      include_samps=None, exclude_samps=None,
                      include_feats=None, exclude_feats=None):
         return self.predict_omic(
-            cohort.test_data(pheno,
+            cohort.test_data(None,
                              include_samps, exclude_samps,
-                             include_feats, exclude_feats)[0]
+                             include_feats, exclude_feats)[0],
+            lbl_type
             )
 
     def predict_omic(self, omic_data):

diff --git a/dryadic/tests/test_cohorts.py b/dryadic/tests/test_cohorts.py
@@ -49,12 +49,12 @@ def main():
     cdata.update_split(new_seed=551, test_prop=1./3)
     assert cdata.get_seed() == 551
     assert len(cdata.get_samples()) == expr_data.shape[0]
-    assert cdata.train_data()[0].shape == (expr_data.shape[0] * 2/3,
-                                           expr_data.shape[1])
+    assert cdata.train_data(None)[0].shape == (expr_data.shape[0] * 2/3,
+                                               expr_data.shape[1])
     check_samp_split(cdata, expr_data.index)
 
     cdata.update_split(new_seed=551, test_samps=expr_data.index[:20])
-    assert cdata.test_data()[0].shape == (20, expr_data.shape[1])
+    assert cdata.test_data(None)[0].shape == (20, expr_data.shape[1])
     check_samp_split(cdata, expr_data.index)
 
     mut_data = load_omic_data('variants')

diff --git a/dryadic/tests/test_pipes.py b/dryadic/tests/test_pipes.py
@@ -34,7 +34,17 @@ def main():
     for param, _ in clf.tune_priors:
         assert clf.get_params()[param] == cvs['params'][best_indx][param]
 
+    use_feats = cdata.get_features()[::3]
+    clf.fit_coh(cdata, test_mtype, include_feats=use_feats)
+    train_preds = clf.predict_train(cdata, include_feats=use_feats)
+    test_preds = clf.predict_test(cdata, include_feats=use_feats)
+    test_preds = clf.predict_test(cdata,
+                                  include_feats=use_feats, lbl_type='prob')
+
     clf.fit_coh(cdata, test_mtype)
+    train_preds = clf.predict_train(cdata)
+    test_preds = clf.predict_test(cdata, lbl_type='raw')
+
     tuned_coefs = np.floor(expr_data.shape[1]
                            * (clf.named_steps['feat'].mean_perc / 100))
     assert tuned_coefs == len(clf.named_steps['fit'].coef_[0]), (

diff --git a/setup.py b/setup.py
@@ -1,15 +1,15 @@
 import setuptools
 
 setuptools.setup(name='dryad',
-      version='0.4.1',
+      version='0.4.2a3',
       description='Prediction of Cancer Phenotypes Using Mutation Trees',
       author='Michal Radoslaw Grzadkowski',
       author_email='[email protected]',
       packages=setuptools.find_packages(
           exclude=["dryadic.tests.*", "dryadic.tests"]),
       url = 'https://github.com/ohsu-comp-bio/dryad',
       download_url = ('https://github.com/ohsu-comp-bio/'
-                      'dryad/archive/v0.4.tar.gz'),
+                      'dryad/archive/v0.4.2a3.tar.gz'),
       install_requires=[
           'numpy>=1.16',
           'pandas>=0.25',