euxhenh · May 11, 2022
diff --git a/‎.github/workflows/build.yml
+69 b/‎.github/workflows/build.yml
+69
diff --git a/‎LICENSE.txt
+19 b/‎LICENSE.txt
+19
diff --git a/‎MANIFEST.in
+9 b/‎MANIFEST.in
+9
diff --git a/‎README.rst
+35 b/‎README.rst
+35
diff --git a/‎requirements.txt
+4 b/‎requirements.txt
+4
diff --git a/‎setup.py
+40 b/‎setup.py
+40
diff --git a/‎src/phenotype_cover/__init__.py
+3 b/‎src/phenotype_cover/__init__.py
+3
diff --git a/‎src/phenotype_cover/_base.py
+84 b/‎src/phenotype_cover/_base.py
+84
diff --git a/‎src/phenotype_cover/_gci_wrapper.py
+410 b/‎src/phenotype_cover/_gci_wrapper.py
+410
diff --git a/‎src/phenotype_cover/_logger.py
+13 b/‎src/phenotype_cover/_logger.py
+13
diff --git a/‎src/phenotype_cover/_operations.py
+110 b/‎src/phenotype_cover/_operations.py
+110
diff --git a/‎src/phenotype_cover/_phenotype_cover.py
+443 b/‎src/phenotype_cover/_phenotype_cover.py
+443
diff --git a/‎tests/test_pairmat_construction.py
+106 b/‎tests/test_pairmat_construction.py
+106
diff --git a/‎tests/test_phenotype_cover.py
+94 b/‎tests/test_phenotype_cover.py
+94
@@ -0,0 +1,69 @@
+name: Build and upload to PyPI
+
+# Build on every branch push, tag push, and pull request change:
+on: [push, pull_request]
+
+env:
+  CIBW_SKIP: "cp36-* pp*"
+
+jobs:
+  build_wheels:
+    # Runs once per entry of the `os` matrix below.
+    name: Build wheels on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-latest]
+
+    steps:
+      # Stop older runs of this workflow before doing any work.
+      - name: Cancel previous runs
+        uses: styfle/cancel-workflow-action@0.9.1
+        with:
+          access_token: ${{ github.token }}
+
+      - uses: actions/checkout@v3
+
+      # The pip cache is keyed on this workflow file itself.
+      - uses: actions/setup-python@v3
+        with:
+          python-version: 3.8
+          cache: pip
+          cache-dependency-path: .github/workflows/build.yml
+
+      - name: Install cibuildwheel
+        run: pip install cibuildwheel
+
+      # NOTE(review): both cibuildwheel invocations below are commented out,
+      # so no step in this job writes anything into ./wheelhouse and the
+      # upload step at the end has no wheels to collect.
+      # - name: Run tests
+      #   run: cibuildwheel .
+
+      # - name: Build wheels
+      #   uses: pypa/cibuildwheel@v2.5.0
+
+      - uses: actions/upload-artifact@v3
+        with:
+          path: ./wheelhouse/*.whl
+
+  build_sdist:
+    # Builds the source distribution once, on a single Linux runner.
+    name: Build source distribution
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Build sdist
+        run: pipx run build --sdist
+
+      # No artifact name is set here; the upload_pypi job downloads an
+      # artifact called `artifact` (presumably the action's default name --
+      # TODO confirm against the upload-artifact documentation).
+      - uses: actions/upload-artifact@v3
+        with:
+          path: dist/*.tar.gz
+
+  upload_pypi:
+    # Publishing must not happen on ordinary branch pushes or pull requests:
+    # the workflow itself is triggered by both, so restrict this job to
+    # pushed tags. Builds still run for every event.
+    needs: [build_wheels, build_sdist]
+    runs-on: ubuntu-latest
+    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
+    steps:
+      # Collect everything the two build jobs uploaded into ./dist.
+      - uses: actions/download-artifact@v3
+        with:
+          name: artifact
+          path: dist
+
+      # Token-based upload; the API token is read from the repository secrets.
+      - uses: pypa/gh-action-pypi-publish@v1.4.2
+        with:
+          user: __token__
+          password: ${{ secrets.PYPI_API_TOKEN }}
@@ -0,0 +1,19 @@
+Copyright 2022 Euxhen Hasanaj, Carnegie Mellon University
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,9 @@
+# Files shipped in the source distribution.
+# The license file in this repository is LICENSE.txt and the test suite
+# lives in tests/, so the patterns must use those exact names.
+include LICENSE.txt
+include MANIFEST.in
+include pyproject.toml
+include README.rst
+include tests/*.py
+include includes/*
+
+# Keep VCS and CI metadata out of the sdist.
+exclude .git*
+prune .github
@@ -0,0 +1,35 @@
+This repository provides two algorithms for the phenotype cover (PC)
+biomarker selection problem. GreedyPC is based on the extended greedy
+algorithm to set cover, and CEM-PC is based on the cross-entropy-method.
+
+Install via `pip install phenotype-cover`
+
+Import `GreedyPC` or `CEMPC` from `phenotype_cover`.
+
+Example
+
+    >>> import numpy as np
+    >>> from phenotype_cover import GreedyPC
+
+    >>> # Multiply data matrix by 100 to avoid a zero-matrix after the "rounding" step
+    >>> X = np.random.random((1000, 200)) * 100  # data matrix
+    >>> y = np.random.randint(0, 5, 1000)  # class labels
+
+    >>> gpc = GreedyPC()
+    >>> gpc.fit(X, y)
+    >>> features = gpc.select(10)  # coverage of 10
+
+Some other functionality implemented in GreedyPC
+
+    >>> # Number of elements remaining and coverage attained after every iteration
+    >>> gpc.plot_progress()
+    >>> gpc.n_elements_remaining_per_iter_
+    >>> gpc.coverage_per_iter_
+
+    >>> # Heatmap of the coverage provided by some feature i
+    >>> gpc.feature_coverage(i)
+
+    >>> # Maximum possible coverage for every class pair
+    >>> gpc.max_coverage()
+
+    >>> # Pairs that could not be covered to the desired `coverage`
+    >>> gpc.pairs_with_incomplete_cover_
@@ -0,0 +1,4 @@
+numpy
+matplotlib
+scikit-learn
+multiset_multicover
@@ -0,0 +1,40 @@
+import os
+from setuptools import setup, Command, find_packages
+
+
+class CleanCommand(Command):
+    """Custom ``clean`` command that deletes build artifacts.
+
+    Removes ``build``, ``dist`` and any ``*.pyc``, ``*.tgz`` and
+    ``*.egg-info`` entries found in the current working directory.
+    """
+    user_options = []
+
+    def initialize_options(self):
+        """This command takes no options."""
+
+    def finalize_options(self):
+        """This command takes no options."""
+
+    def run(self):
+        """Delete the artifacts and print each removed path.
+
+        Implemented with the standard library instead of shelling out to
+        ``rm`` so that the command also works where no POSIX shell is
+        available. Missing entries and failed removals are ignored, in the
+        spirit of ``rm -f``.
+        """
+        import glob
+        import shutil
+
+        patterns = ('build', 'dist', '*.pyc', '*.tgz', '*.egg-info')
+        for pattern in patterns:
+            for path in glob.glob(os.path.join('.', pattern)):
+                try:
+                    if os.path.isdir(path) and not os.path.islink(path):
+                        shutil.rmtree(path)
+                    else:
+                        os.remove(path)
+                except OSError:
+                    # Best effort: keep going with the remaining entries.
+                    continue
+                print(f"removed '{path}'")
+
+
+cmdclass = {'clean': CleanCommand}
+
+
+# Keyword arguments forwarded verbatim to `setuptools.setup` below.
+options = {
+    'name': 'phenotype_cover',
+    'description': 'phenotype_cover is a package for biomarker discovery using multiset multicover.',
+    'long_description': 'Implements the greedy and cross-entropy-method phenotype cover algorithms.',
+    'license': 'MIT',
+    'version': '0.1',
+    'author': 'Euxhen Hasanaj',
+    'author_email': 'ehasanaj@cs.cmu.edu',
+    'url': 'https://github.com/euxhenh/phenotype-cover',
+    'provides': ['phenotype_cover'],
+    # The package sources live under src/ rather than next to setup.py.
+    'package_dir': {'phenotype_cover': 'src/phenotype_cover'},
+    'packages': find_packages(where='src'),
+    # Registers the custom `clean` command defined above.
+    'cmdclass': cmdclass,
+    'platforms': 'ALL',
+    'keywords': ['biomarker', 'marker', 'phenotype', 'scRNA-seq', 'set', 'cover', 'multiset', 'multicover'],
+    'install_requires': ['numpy', 'matplotlib', 'scikit-learn', 'multiset-multicover'],
+    'python_requires': ">=3.7"
+}
+
+setup(**options)
@@ -0,0 +1,3 @@
+from ._phenotype_cover import GreedyPC, CEMPC
+from ._gci_wrapper import GCIWrapper
+from ._operations import pairwise_differences, group_by
@@ -0,0 +1,84 @@
+from abc import abstractmethod
+
+from sklearn.base import BaseEstimator
+
+
+class FeatureSelector(BaseEstimator):
+    """Common interface shared by the feature selection estimators.
+
+    Concrete selectors are expected to provide `fit` and `select`; this base
+    class contributes `fit_select`, `get_scores` and the two hooks that name
+    the keyword arguments each step accepts. It is not meant to be used
+    directly.
+    """
+    @abstractmethod
+    def fit(self, X, y=None, **fit_params):
+        """Fit the selector on X and y.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data with one row per sample and one column per
+            feature.
+        y : array-like of shape (n_samples,)
+            Targets associated with the rows of X.
+        fit_params : dict
+            Extra keyword arguments understood by the concrete selector.
+
+        Returns
+        -------
+        self : object
+            Returns the instance itself.
+        """
+
+    @abstractmethod
+    def select(self, **select_params):
+        """Run the feature selection step.
+
+        Parameters
+        ----------
+        select_params : dict
+            Keyword arguments understood by the concrete selector.
+
+        Returns
+        -------
+        self : object
+            Returns the instance itself.
+        """
+
+    def fit_select(self, X, y=None, **params):
+        """Call `fit` followed by `select` and return the result of `select`.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data with one row per sample and one column per
+            feature.
+        y : array-like of shape (n_samples,)
+            Targets associated with the rows of X.
+        params : dict
+            Keyword arguments for both steps. A key is forwarded to `fit`
+            if `_get_fit_params` lists it and to `select` if
+            `_get_select_params` lists it; keys listed by neither hook are
+            dropped.
+
+        Returns
+        -------
+        Whatever `self.select` returns.
+        """
+        # Keep only the keywords that the fit step declares.
+        fit_params = {}
+        for name in self._get_fit_params():
+            if name in params:
+                fit_params[name] = params[name]
+
+        # Same filtering for the select step.
+        select_params = {}
+        for name in self._get_select_params():
+            if name in params:
+                select_params[name] = params[name]
+
+        self.fit(X, y, **fit_params)
+        return self.select(**select_params)
+
+    def get_scores(self):
+        """Return per-feature scores stored on the estimator.
+
+        `feature_importances_` takes precedence over `coef_`.
+
+        Raises
+        ------
+        ValueError
+            If the estimator has neither attribute.
+        """
+        for attribute in ('feature_importances_', 'coef_'):
+            if hasattr(self, attribute):
+                return getattr(self, attribute)
+        raise ValueError("No scores found in estimator.")
+
+    def _get_fit_params(self):
+        """Names of the keyword arguments accepted by `fit`.
+
+        The base implementation declares none.
+        """
+        return []
+
+    def _get_select_params(self):
+        """Names of the keyword arguments accepted by `select`.
+
+        The base implementation declares none.
+        """
+        return []
@@ -0,0 +1,13 @@
+import logging
+
+FORMAT = '%(levelname)s: %(message)s'
+logging.basicConfig(format=FORMAT)
+
+
+def setup_logger(name, lvl='INFO'):
+    """Return the logger called `name` with its level set to `lvl`.
+
+    Parameters
+    ----------
+    name : str
+        Name passed to `logging.getLogger`.
+    lvl : str or int
+        Either the name of a level constant defined in the `logging`
+        module (e.g. 'INFO', 'DEBUG') or a numeric level.
+
+    Returns
+    -------
+    logging.Logger
+        The configured logger.
+
+    Raises
+    ------
+    AttributeError
+        If `lvl` is a string that is not an attribute of `logging`.
+    """
+    logger = logging.getLogger(name)
+    # Level names are resolved on the module; numeric levels are used as-is.
+    level = getattr(logging, lvl) if isinstance(lvl, str) else lvl
+    logger.setLevel(level)
+    return logger
+
+
+logger = setup_logger('Feature Selector')
@@ -0,0 +1,110 @@
+from itertools import combinations
+
+import numpy as np
+from sklearn.utils.validation import (as_float_array, assert_all_finite,
+                                      check_X_y, column_or_1d, indexable)
+
+
+def group_by(X, y, *, category_orders=None, operation=lambda x: x.mean(axis=0)):
+    """Group the samples in X by the labels in y and apply `operation`
+    to each group.
+
+    Parameters
+    ----------
+    X: array-like of shape (n_samples, n_features)
+        The data matrix. Dense input or a CSR sparse matrix.
+    y: array-like of shape (n_samples,)
+        The class labels.
+    category_orders: array-like of shape (np.unique(y).size,)
+        Order of class labels to use when constructing the matrix.
+        If None, the labels are used in the sorted order returned by
+        `np.unique(y)`.
+    operation: callable
+        The function to apply to the rows of each group. It must return
+        one value per feature.
+
+    Returns
+    -------
+    M: np.ndarray of shape (len(category_orders), n_features)
+        Row i holds the result of `operation` for `category_orders[i]`.
+
+    Raises
+    ------
+    ValueError
+        If `category_orders` contains a label that does not occur in `y`,
+        if `operation` is not callable, or if `operation` does not return
+        exactly `X.shape[1]` values.
+    """
+    X, y = check_X_y(X, y, accept_sparse=["csr"])
+    X = indexable(X)[0]
+
+    if category_orders is None:
+        category_orders = np.unique(y)
+    elif not set(category_orders).issubset(set(y)):
+        # To avoid getting nan values
+        raise ValueError("Found categories not present in `y`.")
+    else:
+        category_orders = column_or_1d(category_orders)
+
+    if not callable(operation):
+        raise ValueError("Please pass a callable operation.")
+
+    n_features = X.shape[1]
+    M = np.zeros((len(category_orders), n_features))
+
+    for i, category in enumerate(category_orders):
+        _agg_values = operation(X[y == category])
+        # Reductions over scipy sparse matrices return np.matrix, which
+        # recent scikit-learn validators reject; convert to a plain
+        # ndarray before the float conversion.
+        _agg_values = as_float_array(np.asarray(_agg_values)).flatten()
+        if len(_agg_values) != n_features:
+            raise ValueError(
+                "Operation must return a vector of size X.shape[1] "
+                f"but instead found vector of size {len(_agg_values)}."
+            )
+        assert_all_finite(_agg_values)
+        M[i] = _agg_values
+
+    return M
+
+
+def pairwise_differences(
+        X, y,
+        *,
+        classes=None,
+        ordered=False,
+        operation=lambda x: x.mean(axis=0)):
+    """
+    Build a matrix of differences between per-class summary vectors.
+
+    Every class in `classes` is first reduced to a single vector with
+    `group_by(X, y, operation=operation)`. With n classes:
+
+    - if `ordered` is False, P has n * (n-1) / 2 rows, one per unordered
+      pair (i, j) with i < j, holding |vector_i - vector_j|;
+    - if `ordered` is True, P has n * (n-1) rows. The first half holds
+      vector_i - vector_j clipped at 0 and the second half holds
+      vector_j - vector_i clipped at 0, in the same pair order.
+
+    Returns P and a dictionary mapping each row index of P to the pair of
+    class positions (positions in `classes`, not the labels themselves)
+    that the row describes.
+
+    Parameters
+    ----------
+    X: np.ndarray of shape (n_samples, n_features)
+    y: np.ndarray of shape (n_samples,)
+    classes: np.ndarray or None, class labels to use. Defaults to
+        `np.unique(y)`.
+    ordered: bool, whether to build the ordered (clipped) differences.
+    operation: callable, reduction applied to the samples of each class.
+    """
+    labels = np.unique(y) if classes is None else classes
+
+    n_classes = len(labels)
+    # Number of unordered pairs
+    n_class_pairs = n_classes * (n_classes - 1) // 2
+
+    # One summary vector per class, in the order given by `labels`
+    class_vectors = group_by(
+        X, y, category_orders=labels, operation=operation)
+
+    n_rows = 2 * n_class_pairs if ordered else n_class_pairs
+    P = np.zeros((n_rows, X.shape[1]))
+    index_to_pair_dict = {}
+
+    # Iterate over positions rather than labels so that arbitrary label
+    # values are supported.
+    for row, (a, b) in enumerate(combinations(range(n_classes), 2)):
+        delta = class_vectors[a] - class_vectors[b]
+        if not ordered:
+            P[row] = np.abs(delta)
+            index_to_pair_dict[row] = (a, b)
+            continue
+
+        # Positive part of a - b goes to `row`, positive part of b - a
+        # goes to the matching row in the second half.
+        mirror = row + n_class_pairs
+        P[row] = np.clip(delta, 0, None)
+        index_to_pair_dict[row] = (a, b)
+        P[mirror] = np.clip(-delta, 0, None)
+        index_to_pair_dict[mirror] = (b, a)
+
+    return P, index_to_pair_dict
@@ -0,0 +1,106 @@
+import unittest
+
+import numpy as np
+from scipy.sparse import csr_matrix
+from numpy.testing import assert_allclose
+from src.phenotype_cover import pairwise_differences, group_by
+
+
+class TestSelectors(unittest.TestCase):
+    """Checks `group_by` and `pairwise_differences` on small fixed inputs."""
+
+    def test1(self):
+        """`group_by` with its default operation on a dense matrix."""
+        a = np.array([
+            [1, 2, 6, 1],
+            [3, 6, 7, 1],
+            [3, 4, 1, 6]
+        ])
+        y = [0, 1, 0]
+        M = group_by(a, y)
+        gt = np.array([
+            [2, 3, 3.5, 3.5],
+            [3, 6, 7, 1]
+        ])
+
+        assert_allclose(M, gt)
+
+    def test2(self):
+        """`group_by` with a column-sum operation on a dense matrix."""
+        a = np.array([
+            [1, 2, 6, 1],
+            [3, 6, 7, 1],
+            [3, 4, 1, 6]
+        ])
+        y = [0, 1, 0]
+        M = group_by(a, y, operation=lambda x: x.sum(axis=0))
+        gt = np.array([
+            [4, 6, 7, 7],
+            [3, 6, 7, 1]
+        ])
+
+        assert_allclose(M, gt)
+
+    def test3(self):
+        """`group_by` with a column-sum operation on a CSR matrix."""
+        row = np.array([0, 0, 1, 2, 2, 2])
+        col = np.array([0, 2, 2, 0, 1, 2])
+        data = np.array([1, 2, 3, 4, 5, 6])
+        a = csr_matrix((data, (row, col)), shape=(3, 3))
+        y = [0, 1, 0]
+
+        M = group_by(a, y, operation=lambda x: x.sum(axis=0))
+        gt = np.array([
+            [5, 5, 8],
+            [0, 0, 3]
+        ])
+
+        assert_allclose(M, gt)
+
+    def test4(self):
+        """Same input as test3 with an explicit, reversed `category_orders`."""
+        row = np.array([0, 0, 1, 2, 2, 2])
+        col = np.array([0, 2, 2, 0, 1, 2])
+        data = np.array([1, 2, 3, 4, 5, 6])
+        a = csr_matrix((data, (row, col)), shape=(3, 3))
+        y = [0, 1, 0]
+
+        M = group_by(a, y,
+                     category_orders=[1, 0], operation=lambda x: x.sum(axis=0))
+        gt = np.array([
+            [0, 0, 3],
+            [5, 5, 8]
+        ])
+
+        assert_allclose(M, gt)
+
+    def test5(self):
+        """`pairwise_differences` with the default `ordered=False`.
+
+        Only the matrix is checked; the returned `mapping` is not asserted.
+        """
+        X = np.array([
+            [0, 1, 2, 4, 1, 4],
+            [2, 5, 1, 3, 4, 1],
+            [3, 4, 1, 2, 1, 1],
+            [4, 6, 7, 1, 3, 5],
+            [6, 1, 3, 4, 5, 1]
+        ], dtype=float)
+        labels = np.array([0, 0, 1, 1, 2], dtype=int)
+        pairmat, mapping = pairwise_differences(X, labels)
+
+        assert_allclose(pairmat, np.array([
+            [2.5, 2, 2.5, 2, 0.5, 0.5],  # 0 - 1
+            [5, 2, 1.5, 0.5, 2.5, 1.5],  # 0 - 2
+            [2.5, 4, 1, 2.5, 3, 2]  # 1 - 2
+        ]))
+
+    def test6(self):
+        """`pairwise_differences` with `ordered=True`.
+
+        Only the matrix is checked; the returned `mapping` is not asserted.
+        """
+        X = np.array([
+            [0, 1, 2, 4, 1, 4],
+            [2, 5, 1, 3, 4, 1],
+            [3, 4, 1, 2, 1, 1],
+            [4, 6, 7, 1, 3, 5],
+            [6, 1, 3, 4, 5, 1]
+        ], dtype=float)
+        labels = np.array([0, 0, 1, 1, 2], dtype=int)
+        pairmat, mapping = pairwise_differences(X, labels, ordered=True)
+
+        assert_allclose(pairmat, np.array([
+            [0, 0, 0, 2, 0.5, 0],  # 0 - 1
+            [0, 2, 0, 0, 0, 1.5],  # 0 - 2
+            [0, 4, 1, 0, 0, 2],  # 1 - 2
+            [2.5, 2, 2.5, 0, 0, 0.5],  # 1 - 0
+            [5, 0, 1.5, 0.5, 2.5, 0],  # 2 - 0
+            [2.5, 0, 0, 2.5, 3, 0]  # 2 - 1
+        ]))
@@ -0,0 +1,94 @@
+import unittest
+
+import numpy as np
+from numpy.testing import assert_allclose
+from sklearn.feature_selection import SelectFromModel
+
+from src.phenotype_cover import GreedyPC
+from src.phenotype_cover._gci_wrapper import GCIPython, GCIWrapper
+
+
+def wrap(gci, x):
+    """Fit `gci` on `x` and check its results against fixed expectations.
+
+    Shared by the tests that exercise `GCIPython` and `GCIWrapper`; the
+    expected values are hard-coded for the 4 x 5 matrix those tests pass in.
+    """
+    gci.fit(x)
+
+    assert gci.n_elements == 4
+    assert gci.n_multisets_ == 5
+
+    assert_allclose(gci.max_coverage_, [14, 8, 7, 2])
+
+    # Requested coverage of 2.
+    solution = gci.predict(2)
+    assert_allclose(solution, [3, 1, 4])
+    n_elements_rem = gci.n_elements_remaining_
+    assert_allclose(n_elements_rem, [2, 1, 0])
+    coverage_until = gci.coverage_until_
+    assert_allclose(coverage_until, [0, 1, 2])
+
+    # Requested coverage of 3; element 3 is expected to stay incompletely
+    # covered.
+    solution = gci.predict(3)
+    assert_allclose(solution, [3, 4, 1])
+    elements_incomplete_cover_ = gci.elements_incomplete_cover_
+    assert_allclose(elements_incomplete_cover_, [3])
+    coverage_until = gci.coverage_until_
+    assert_allclose(coverage_until, [1, 2, 3])
+
+
+class TestSelectors(unittest.TestCase):
+    """Tests for `GreedyPC` and for the two GCI implementations."""
+
+    def test_greedy_cover_selector(self):
+        """`GreedyPC` can be wrapped by scikit-learn's `SelectFromModel`."""
+        gcs = GreedyPC()
+
+        X = np.array([
+            [1, 0, 0, 1, 2],
+            [0, 2, 1, 0, 3],
+            [1, 2, 0, 3, 0],
+            [0, 2, 1, 3, 0],
+            [1, 0, 0, 4, 0]
+        ])
+        y = np.array([0, 0, 1, 1, 1])
+
+        gcs.fit_select(X, y, coverage=2)
+        # prefit=True: the already fitted selector is used as-is.
+        sfm = SelectFromModel(gcs, threshold=0.5, prefit=True)
+
+        assert_allclose(sfm.get_support(indices=True), np.array([3, 4]))
+        assert_allclose(sfm.transform(X), X[:, np.array([3, 4])])
+
+    def test1(self):
+        """Run the shared `wrap` checks against `GCIPython`."""
+        x = np.array([
+            [2, 1, 5, 6, 0],
+            [1, 3, 1, 3, 0],
+            [0, 1, 3, 1, 2],
+            [0, 1, 0, 0, 1]
+        ])
+        wrap(GCIPython(), x)
+
+    def test2(self):
+        """Run the shared `wrap` checks against `GCIWrapper`."""
+        x = np.array([
+            [2, 1, 5, 6, 0],
+            [1, 3, 1, 3, 0],
+            [0, 1, 3, 1, 2],
+            [0, 1, 0, 0, 1]
+        ])
+        wrap(GCIWrapper(x.shape[0]), x)
+
+    def test3(self):
+        """`GCIPython` and `GCIWrapper` agree on a random 1000 x 5000 input.
+
+        NOTE(review): the random generator is not seeded, so the input
+        differs on every run and a failure may not be reproducible.
+        """
+        N = 1000
+        xx = np.random.randint(0, 10, (N, 5000))
+        gcip = GCIPython()
+        gciw = GCIWrapper(N)
+
+        gcip.fit(xx)
+        gciw.fit(xx)
+
+        assert_allclose(gcip.max_coverage_, gciw.max_coverage_)
+        # Compare the two implementations at coverage 5.
+        assert_allclose(gcip.predict(5), gciw.predict(5))
+        assert_allclose(gcip.n_elements_remaining_, gciw.n_elements_remaining_)
+        assert_allclose(gcip.coverage_until_, gciw.coverage_until_)
+        assert_allclose(gcip.elements_incomplete_cover_, gciw.elements_incomplete_cover_)
+
+        # Same comparison at coverage 10.
+        assert_allclose(gcip.predict(10), gciw.predict(10))
+        assert_allclose(gcip.n_elements_remaining_, gciw.n_elements_remaining_)
+        assert_allclose(gcip.coverage_until_, gciw.coverage_until_)
+        assert_allclose(gcip.elements_incomplete_cover_, gciw.elements_incomplete_cover_)
+
+        # Same comparison at coverage 1000.
+        assert_allclose(gcip.predict(1000), gciw.predict(1000))
+        assert_allclose(gcip.n_elements_remaining_, gciw.n_elements_remaining_)
+        assert_allclose(gcip.coverage_until_, gciw.coverage_until_)
+        assert_allclose(gcip.elements_incomplete_cover_, gciw.elements_incomplete_cover_)
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from ._phenotype_cover import GreedyPC, CEMPC`
	`2`	`+from ._gci_wrapper import GCIWrapper`
	`3`	`+from ._operations import pairwise_differences, group_by`