Commit 88a608a
committed May 11, 2022

First commit

0 parents, commit 88a608a
14 files changed: +1439 -0 lines
 

.github/workflows/build.yml

(+69 lines)

name: Build and upload to PyPI

# Build on every branch push, tag push, and pull request change:
on: [push, pull_request]

env:
  CIBW_SKIP: "cp36-* pp*"

jobs:
  build_wheels:
    name: Build wheels on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]

    steps:
      - name: Cancel previous runs
        uses: styfle/cancel-workflow-action@0.9.1
        with:
          access_token: ${{ github.token }}

      - uses: actions/checkout@v3

      - uses: actions/setup-python@v3
        with:
          python-version: 3.8
          cache: pip
          cache-dependency-path: .github/workflows/build.yml

      - name: Install cibuildwheel
        run: pip install cibuildwheel

      # - name: Run tests
      #   run: cibuildwheel .

      # - name: Build wheels
      #   uses: pypa/cibuildwheel@v2.5.0

      - uses: actions/upload-artifact@v3
        with:
          path: ./wheelhouse/*.whl

  build_sdist:
    name: Build source distribution
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Build sdist
        run: pipx run build --sdist

      - uses: actions/upload-artifact@v3
        with:
          path: dist/*.tar.gz

  upload_pypi:
    needs: [build_wheels, build_sdist]
    runs-on: ubuntu-latest
    steps:
      - uses: actions/download-artifact@v3
        with:
          name: artifact
          path: dist

      - uses: pypa/gh-action-pypi-publish@v1.4.2
        with:
          user: __token__
          password: ${{ secrets.PYPI_API_TOKEN }}

LICENSE.txt

(+19 lines)

Copyright 2022 Euxhen Hasanaj, Carnegie Mellon University

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

MANIFEST.in

(+9 lines)

include LICENSE.txt
include MANIFEST.in
include pyproject.toml
include README.rst
include tests/*.py
include includes/*

exclude .git*
prune .github

README.rst

(+35 lines)

This repository provides two algorithms for the phenotype cover (PC)
biomarker selection problem. GreedyPC is based on the extended greedy
algorithm for set cover, and CEM-PC is based on the cross-entropy method.

Install via `pip install phenotype-cover`.

Import `GreedyPC` or `CEMPC` from `phenotype_cover`.

Example

>>> import numpy as np
>>> from phenotype_cover import GreedyPC

>>> # Multiply the data matrix by 100 to avoid a zero matrix after the "rounding" step
>>> X = np.random.random((1000, 200)) * 100  # data matrix
>>> y = np.random.randint(0, 5, 1000)  # class labels

>>> gpc = GreedyPC()
>>> gpc.fit(X, y)
>>> features = gpc.select(10)  # coverage of 10

Some other functionality implemented in GreedyPC:

>>> # Number of elements remaining and coverage attained after every iteration
>>> gpc.plot_progress()
>>> gpc.n_elements_remaining_per_iter_
>>> gpc.coverage_per_iter_

>>> # Heatmap of the coverage provided by some feature i
>>> gpc.feature_coverage(i)

>>> # Maximum possible coverage for every class pair
>>> gpc.max_coverage()

>>> # Pairs that could not be covered to the desired `coverage`
>>> gpc.pairs_with_incomplete_cover_
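
CEM-PC is exposed through the `CEMPC` class. The sketch below assumes
`CEMPC` mirrors the `fit`/`select` interface shown above for `GreedyPC`;
its own options are not shown in this commit.

>>> from phenotype_cover import CEMPC

>>> cem = CEMPC()
>>> cem.fit(X, y)
>>> features = cem.select(10)  # assuming the same coverage argument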

requirements.txt

(+4 lines)

numpy
matplotlib
scikit-learn
multiset_multicover

setup.py

(+40 lines)

import os
from setuptools import setup, Command, find_packages


class CleanCommand(Command):
    user_options = []

    def initialize_options(self):
        pass

    def finalize_options(self):
        pass

    def run(self):
        os.system('rm -vrf ./build ./dist ./*.pyc ./*.tgz ./*.egg-info')


cmdclass = {'clean': CleanCommand}


options = {
    'name': 'phenotype_cover',
    'description': 'phenotype_cover is a package for biomarker discovery using multiset multicover.',
    'long_description': 'Implements the greedy and cross-entropy-method phenotype cover algorithms.',
    'license': 'MIT',
    'version': '0.1',
    'author': 'Euxhen Hasanaj',
    'author_email': 'ehasanaj@cs.cmu.edu',
    'url': 'https://github.com/euxhenh/phenotype-cover',
    'provides': ['phenotype_cover'],
    'package_dir': {'phenotype_cover': 'src/phenotype_cover'},
    'packages': find_packages(where='src'),
    'cmdclass': cmdclass,
    'platforms': 'ALL',
    'keywords': ['biomarker', 'marker', 'phenotype', 'scRNA-seq', 'set',
                 'cover', 'multiset', 'multicover'],
    'install_requires': ['numpy', 'matplotlib', 'scikit-learn', 'multiset-multicover'],
    'python_requires': '>=3.7',
}

setup(**options)

src/phenotype_cover/__init__.py

(+3 lines)

from ._phenotype_cover import GreedyPC, CEMPC
from ._gci_wrapper import GCIWrapper
from ._operations import pairwise_differences, group_by

src/phenotype_cover/_base.py

(+84 lines)

from abc import abstractmethod

from sklearn.base import BaseEstimator


class FeatureSelector(BaseEstimator):
    """Base class of feature selector methods.

    This class should not be used directly.
    """
    @abstractmethod
    def fit(self, X, y=None, **fit_params):
        """Fit the model with X and y.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples and
            `n_features` is the number of features.
        y : array-like of shape (n_samples,)
            Class labels.

        Returns
        -------
        self : object
            Returns the instance itself.
        """

    @abstractmethod
    def select(self, **select_params):
        """Perform feature selection.

        Returns
        -------
        features : array-like
            The selected features.
        """

    def fit_select(self, X, y=None, **params):
        """Fit the model with X and y, then perform feature selection.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples and
            `n_features` is the number of features.
        y : array-like of shape (n_samples,)
            Class labels.
        params : dict
            Parameters to be used for the fit and select steps.

        Returns
        -------
        The output of `select`.
        """
        fit_params = {
            key: params[key]
            for key in self._get_fit_params()
            if key in params
        }
        select_params = {
            key: params[key]
            for key in self._get_select_params()
            if key in params
        }

        self.fit(X, y, **fit_params)
        return self.select(**select_params)

    def get_scores(self):
        if hasattr(self, 'feature_importances_'):
            return self.feature_importances_
        elif hasattr(self, 'coef_'):
            return self.coef_
        else:
            raise ValueError("No scores found in estimator.")

    def _get_fit_params(self):
        """Returns a list of parameter names used during `fit`."""
        return []

    def _get_select_params(self):
        """Returns a list of parameter names used during `select`."""
        return []
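
To illustrate how this base class is meant to be extended, here is a minimal,
hypothetical subclass; `VarianceSelector` and its `n_features` parameter are
illustrative and not part of the package. Declaring `n_features` in
`_get_select_params` is what lets `fit_select` route that keyword to `select`.

# Hypothetical subclass, for illustration only (not part of the package).
import numpy as np

from phenotype_cover._base import FeatureSelector


class VarianceSelector(FeatureSelector):
    """Select the features with the largest variance."""

    def fit(self, X, y=None, **fit_params):
        # Scores stored as `feature_importances_` are what `get_scores`
        # looks up.
        self.feature_importances_ = np.asarray(X).var(axis=0)
        return self

    def select(self, n_features=10):
        # Indices of the `n_features` highest-variance features.
        order = np.argsort(self.feature_importances_)[::-1]
        return order[:n_features]

    def _get_select_params(self):
        # Lets `fit_select` route the `n_features` keyword to `select`.
        return ['n_features']


# Usage: features = VarianceSelector().fit_select(X, y, n_features=5)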

src/phenotype_cover/_gci_wrapper.py

(+410 lines; large diff not rendered)
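
The test files at the end of this commit show the contract of this module's
`GCIPython` and `GCIWrapper` classes: after `fit(X)`, rows of X are elements
and columns are multisets, and `predict(coverage)` greedily picks columns
until every row is covered `coverage` times. Since the diff itself is not
rendered, here is a rough, hypothetical sketch of such a greedy loop; the
module's actual data structures and tie-breaking rule may differ.

# Illustrative sketch only; not this module's actual implementation.
import numpy as np


def greedy_multiset_multicover(X, coverage):
    """Greedily pick columns of X until every row is covered
    `coverage` times, or no remaining column adds coverage."""
    X = np.asarray(X, dtype=float)
    remaining = np.full(X.shape[0], float(coverage))
    totals = X.sum(axis=0)  # guessed tie-breaker: prefer larger multisets
    chosen = []
    while remaining.sum() > 0:
        # Useful (capped) coverage each column would add right now
        gains = np.minimum(X, remaining[:, None]).sum(axis=0)
        if chosen:
            gains[chosen] = -1  # never re-pick a column
        best = max(range(X.shape[1]), key=lambda c: (gains[c], totals[c]))
        if gains[best] <= 0:
            break  # the remaining rows cannot be covered any further
        chosen.append(best)
        remaining = np.maximum(remaining - X[:, best], 0)
    return chosen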

src/phenotype_cover/_logger.py

(+13 lines)

import logging

FORMAT = '%(levelname)s: %(message)s'
logging.basicConfig(format=FORMAT)


def setup_logger(name, lvl='INFO'):
    logger = logging.getLogger(name)
    logger.setLevel(getattr(logging, lvl))
    return logger


logger = setup_logger('Feature Selector')
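
Other modules in the package can then import this preconfigured logger. A
quick usage sketch:

# Usage sketch: messages print as "LEVEL: message".
from phenotype_cover._logger import logger

logger.info("Selected 10 features")    # INFO: Selected 10 features
logger.warning("Coverage incomplete")  # WARNING: Coverage incomplete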

src/phenotype_cover/_operations.py

(+110 lines)

from itertools import combinations

import numpy as np
from sklearn.utils.validation import (as_float_array, assert_all_finite,
                                      check_X_y, column_or_1d, indexable)


def group_by(X, y, *, category_orders=None, operation=lambda x: x.mean(axis=0)):
    """Groups the samples in X by labels in y and applies `operation`
    to the aggregated groups.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data matrix.
    y : array-like of shape (n_samples,)
        The class labels.
    category_orders : array-like of shape (np.unique(y).size,)
        Order of class labels to use when constructing the matrix.
        If None, will sort the class labels alphabetically.
    operation : callable
        The function to apply to the aggregated groups.
    """
    X, y = check_X_y(X, y, accept_sparse=["csr"])
    X = indexable(X)[0]

    if category_orders is None:
        category_orders = np.unique(y)
    elif not set(category_orders).issubset(set(y)):
        # To avoid getting nan values
        raise ValueError("Found categories not present in `y`.")
    else:
        category_orders = column_or_1d(category_orders)

    if not callable(operation):
        raise ValueError("Please pass a callable operation.")

    M = np.zeros((len(category_orders), X.shape[1]))

    for i, category in enumerate(category_orders):
        _agg_values = operation(X[y == category])
        _agg_values = as_float_array(_agg_values).flatten()
        if len(_agg_values) != X.shape[1]:
            raise ValueError(
                "Operation must return a vector of size X.shape[1] "
                f"but instead found vector of size {len(_agg_values)}."
            )
        assert_all_finite(_agg_values)
        M[i] = _agg_values

    return M


def pairwise_differences(
        X, y,
        *,
        classes=None,
        ordered=False,
        operation=lambda x: x.mean(axis=0)):
    """
    Given a data matrix X, if `ordered` is False, construct a matrix P of
    shape (n * (n-1) / 2, X.shape[1]) where n is the number of classes in y.
    The entry of P at row p, where p indexes the class pair (i, j), and
    column g equals the absolute difference between the average expression
    of feature g in group i and the average expression of feature g in
    group j. If `ordered` is True, the shape of P is (n * (n-1), X.shape[1])
    and the pairwise differences are clipped at 0 instead of taken in
    absolute value.

    Returns P and a dictionary mapping each row index of P to its
    (i, j) class pair.

    Parameters
    ----------
    X : np.ndarray of shape (n_samples, n_features)
    y : np.ndarray of shape (n_samples,)
    classes : np.ndarray or None, unique class labels in y
    ordered : bool, if True will construct a matrix of ordered
        pairwise differences. In this case the shape of P is
        (n * (n-1), X.shape[1]).
    operation : callable, operation to use when constructing the class vector.
    """
    if classes is None:
        classes = np.unique(y)

    n_classes = len(classes)
    # All pairwise combinations
    n_class_pairs = n_classes * (n_classes - 1) // 2

    # Cache the average vector of each class
    class_averages = group_by(
        X, y, category_orders=classes, operation=operation)

    # Compute the actual pairwise differences
    P = np.zeros((n_class_pairs * (1 if not ordered else 2), X.shape[1]))
    index_to_pair_dict = {}

    # Make sure to use range(n_classes) when indexing instead of classes,
    # to allow for arbitrary class labels.
    for index, (i, j) in enumerate(combinations(range(n_classes), 2)):
        difference = class_averages[i] - class_averages[j]
        if ordered:
            # Clip negative values to 0.
            # Assign i - j to index and j - i to index + n_class_pairs.
            P[index] = np.clip(difference, 0, None)
            index_to_pair_dict[index] = (i, j)
            P[index + n_class_pairs] = np.clip(-difference, 0, None)
            index_to_pair_dict[index + n_class_pairs] = (j, i)
        else:
            P[index] = np.abs(difference)
            index_to_pair_dict[index] = (i, j)

    return P, index_to_pair_dict
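
A small worked example of the two helpers above (values chosen for
illustration):

import numpy as np

from phenotype_cover import group_by, pairwise_differences

X = np.array([[1., 4.], [5., 2.], [3., 6.]])
y = np.array(['a', 'b', 'a'])

M = group_by(X, y)
# Class means, one row per class: [[2., 5.], [5., 2.]]

P, idx = pairwise_differences(X, y)
# Absolute differences of class means: P == [[3., 3.]], idx == {0: (0, 1)}

P2, idx2 = pairwise_differences(X, y, ordered=True)
# Clipped signed differences:
# P2 == [[0., 3.], [3., 0.]], idx2 == {0: (0, 1), 1: (1, 0)}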

src/phenotype_cover/_phenotype_cover.py

(+443 lines; large diff not rendered)

tests/test_pairmat_construction.py

(+106 lines)

import unittest

import numpy as np
from scipy.sparse import csr_matrix
from numpy.testing import assert_allclose
from src.phenotype_cover import pairwise_differences, group_by


class TestSelectors(unittest.TestCase):
    def test1(self):
        a = np.array([
            [1, 2, 6, 1],
            [3, 6, 7, 1],
            [3, 4, 1, 6]
        ])
        y = [0, 1, 0]
        M = group_by(a, y)
        gt = np.array([
            [2, 3, 3.5, 3.5],
            [3, 6, 7, 1]
        ])

        assert_allclose(M, gt)

    def test2(self):
        a = np.array([
            [1, 2, 6, 1],
            [3, 6, 7, 1],
            [3, 4, 1, 6]
        ])
        y = [0, 1, 0]
        M = group_by(a, y, operation=lambda x: x.sum(axis=0))
        gt = np.array([
            [4, 6, 7, 7],
            [3, 6, 7, 1]
        ])

        assert_allclose(M, gt)

    def test3(self):
        row = np.array([0, 0, 1, 2, 2, 2])
        col = np.array([0, 2, 2, 0, 1, 2])
        data = np.array([1, 2, 3, 4, 5, 6])
        a = csr_matrix((data, (row, col)), shape=(3, 3))
        y = [0, 1, 0]

        M = group_by(a, y, operation=lambda x: x.sum(axis=0))
        gt = np.array([
            [5, 5, 8],
            [0, 0, 3]
        ])

        assert_allclose(M, gt)

    def test4(self):
        row = np.array([0, 0, 1, 2, 2, 2])
        col = np.array([0, 2, 2, 0, 1, 2])
        data = np.array([1, 2, 3, 4, 5, 6])
        a = csr_matrix((data, (row, col)), shape=(3, 3))
        y = [0, 1, 0]

        M = group_by(a, y,
                     category_orders=[1, 0], operation=lambda x: x.sum(axis=0))
        gt = np.array([
            [0, 0, 3],
            [5, 5, 8]
        ])

        assert_allclose(M, gt)

    def test5(self):
        X = np.array([
            [0, 1, 2, 4, 1, 4],
            [2, 5, 1, 3, 4, 1],
            [3, 4, 1, 2, 1, 1],
            [4, 6, 7, 1, 3, 5],
            [6, 1, 3, 4, 5, 1]
        ], dtype=float)
        labels = np.array([0, 0, 1, 1, 2], dtype=int)
        pairmat, mapping = pairwise_differences(X, labels)

        assert_allclose(pairmat, np.array([
            [2.5, 2, 2.5, 2, 0.5, 0.5],  # 0 - 1
            [5, 2, 1.5, 0.5, 2.5, 1.5],  # 0 - 2
            [2.5, 4, 1, 2.5, 3, 2]       # 1 - 2
        ]))

    def test6(self):
        X = np.array([
            [0, 1, 2, 4, 1, 4],
            [2, 5, 1, 3, 4, 1],
            [3, 4, 1, 2, 1, 1],
            [4, 6, 7, 1, 3, 5],
            [6, 1, 3, 4, 5, 1]
        ], dtype=float)
        labels = np.array([0, 0, 1, 1, 2], dtype=int)
        pairmat, mapping = pairwise_differences(X, labels, ordered=True)

        assert_allclose(pairmat, np.array([
            [0, 0, 0, 2, 0.5, 0],        # 0 - 1
            [0, 2, 0, 0, 0, 1.5],        # 0 - 2
            [0, 4, 1, 0, 0, 2],          # 1 - 2
            [2.5, 2, 2.5, 0, 0, 0.5],    # 1 - 0
            [5, 0, 1.5, 0.5, 2.5, 0],    # 2 - 0
            [2.5, 0, 0, 2.5, 3, 0]       # 2 - 1
        ]))

tests/test_phenotype_cover.py

(+94 lines)

import unittest

import numpy as np
from numpy.testing import assert_allclose
from sklearn.feature_selection import SelectFromModel

from src.phenotype_cover import GreedyPC
from src.phenotype_cover._gci_wrapper import GCIPython, GCIWrapper


def wrap(gci, x):
    gci.fit(x)

    assert gci.n_elements == 4
    assert gci.n_multisets_ == 5

    assert_allclose(gci.max_coverage_, [14, 8, 7, 2])

    solution = gci.predict(2)
    assert_allclose(solution, [3, 1, 4])
    n_elements_rem = gci.n_elements_remaining_
    assert_allclose(n_elements_rem, [2, 1, 0])
    coverage_until = gci.coverage_until_
    assert_allclose(coverage_until, [0, 1, 2])

    solution = gci.predict(3)
    assert_allclose(solution, [3, 4, 1])
    elements_incomplete_cover_ = gci.elements_incomplete_cover_
    assert_allclose(elements_incomplete_cover_, [3])
    coverage_until = gci.coverage_until_
    assert_allclose(coverage_until, [1, 2, 3])


class TestSelectors(unittest.TestCase):
    def test_greedy_cover_selector(self):
        gcs = GreedyPC()

        X = np.array([
            [1, 0, 0, 1, 2],
            [0, 2, 1, 0, 3],
            [1, 2, 0, 3, 0],
            [0, 2, 1, 3, 0],
            [1, 0, 0, 4, 0]
        ])
        y = np.array([0, 0, 1, 1, 1])

        gcs.fit_select(X, y, coverage=2)
        sfm = SelectFromModel(gcs, threshold=0.5, prefit=True)

        assert_allclose(sfm.get_support(indices=True), np.array([3, 4]))
        assert_allclose(sfm.transform(X), X[:, np.array([3, 4])])

    def test1(self):
        x = np.array([
            [2, 1, 5, 6, 0],
            [1, 3, 1, 3, 0],
            [0, 1, 3, 1, 2],
            [0, 1, 0, 0, 1]
        ])
        wrap(GCIPython(), x)

    def test2(self):
        x = np.array([
            [2, 1, 5, 6, 0],
            [1, 3, 1, 3, 0],
            [0, 1, 3, 1, 2],
            [0, 1, 0, 0, 1]
        ])
        wrap(GCIWrapper(x.shape[0]), x)

    def test3(self):
        N = 1000
        xx = np.random.randint(0, 10, (N, 5000))
        gcip = GCIPython()
        gciw = GCIWrapper(N)

        gcip.fit(xx)
        gciw.fit(xx)

        assert_allclose(gcip.max_coverage_, gciw.max_coverage_)
        assert_allclose(gcip.predict(5), gciw.predict(5))
        assert_allclose(gcip.n_elements_remaining_, gciw.n_elements_remaining_)
        assert_allclose(gcip.coverage_until_, gciw.coverage_until_)
        assert_allclose(gcip.elements_incomplete_cover_,
                        gciw.elements_incomplete_cover_)

        assert_allclose(gcip.predict(10), gciw.predict(10))
        assert_allclose(gcip.n_elements_remaining_, gciw.n_elements_remaining_)
        assert_allclose(gcip.coverage_until_, gciw.coverage_until_)
        assert_allclose(gcip.elements_incomplete_cover_,
                        gciw.elements_incomplete_cover_)

        assert_allclose(gcip.predict(1000), gciw.predict(1000))
        assert_allclose(gcip.n_elements_remaining_, gciw.n_elements_remaining_)
        assert_allclose(gcip.coverage_until_, gciw.coverage_until_)
        assert_allclose(gcip.elements_incomplete_cover_,
                        gciw.elements_incomplete_cover_)
