Changes from all commits
2 changes: 1 addition & 1 deletion .gitignore
@@ -4,4 +4,4 @@ __pycache__
docs/build
.coverage
poetry.lock
coverage.xml
coverage.xml
16 changes: 9 additions & 7 deletions README.rst
@@ -1,22 +1,24 @@
|Tests|_ |Coverage|_ |ReadTheDocs|_ |PythonVersion|_ |Black|_ |License|_
|Tests| |Coverage| |ReadTheDocs| |PythonVersion| |PyPI| |Black| |License|

.. |Tests| image:: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml/badge.svg
.. _Tests: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml
:target: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml

.. |Coverage| image:: https://codecov.io/gh/GauravPandeyLab/eipy/graph/badge.svg?token=M2AU2XWJB8
.. _Coverage: https://codecov.io/gh/GauravPandeyLab/eipy
:target: https://codecov.io/gh/GauravPandeyLab/eipy

.. |ReadTheDocs| image:: https://readthedocs.org/projects/eipy/badge/?version=latest
.. _ReadTheDocs: https://eipy.readthedocs.io/en/latest/
:target: https://eipy.readthedocs.io/en/latest/

.. |PyPI| image:: https://img.shields.io/pypi/v/ensemble-integration
:target: https://pypi.org/project/ensemble-integration/

.. |PythonVersion| image:: https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue
.. _PythonVersion: https://github.com/GauravPandeyLab/eipy

.. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg
.. _Black: https://github.com/psf/black
:target: https://github.com/psf/black

.. |License| image:: https://img.shields.io/badge/License-GPLv3-blue
.. _License: https://github.com/GauravPandeyLab/eipy/blob/main/COPYING
:target: https://github.com/GauravPandeyLab/eipy/blob/main/COPYING


``ensemble-integration``: Integrating multi-modal data for predictive modeling
6 changes: 3 additions & 3 deletions docs/source/development.rst
@@ -1,7 +1,7 @@
Development
===========

We welcome contributions to the development of ``eipy``. To contribute follow the below instructions to submit a pull request:
We welcome contributions to the development of ``ensemble-integration``. To contribute, follow the instructions below to submit a pull request:

1. **Install Python**. First of all make sure you have a supported version of Python on your local machine (see `GitHub <https://github.com/GauravPandeyLab/eipy>`__ for supported versions).
2. **Install Poetry**. ``eipy`` uses Poetry to manage dependencies. To install Poetry follow the instructions on their `website <https://python-poetry.org/docs/>`__.
@@ -49,9 +49,9 @@ Note that new test file names must have the prefix `test_`.
9. **Submit pull request**. Updates must be made via a pull request. Internal users should note that pushing
to the main branch has been disabled.

10. **Publishing new versions to PyPI** (internal only). We now use `poetry-dynamic-versioning <https://github.com/mtkennerly/poetry-dynamic-versioning>`
10. **Publishing new versions to PyPI** (internal only). We now use `poetry-dynamic-versioning <https://github.com/mtkennerly/poetry-dynamic-versioning>`__
to increment version numbers in pyproject.toml automatically. You can publish to
PyPI by creating a new `release <https://github.com/GauravPandeyLab/eipy/releases>`__,
which will run the "Publish to PyPI" workflow. This workflow determines the PyPI version number from the
GitHub release tag, which you should increment manually.
Note: to test things out first, you can try manually running the "Publish to test PyPI" workflow.
Note: to test things out first, you can try manually running the "Publish to test PyPI" workflow.
44 changes: 44 additions & 0 deletions eipy/additional_ensembles.py
@@ -14,7 +14,18 @@

class MeanAggregation(BaseEstimator, ClassifierMixin):
"""
Mean Aggregation
Trivially takes the mean of X.
Attributes
----------
classes : array
Ordered array of unique class labels.
X_ : array of (n_samples, n_features)
Base predictor data for computing mean.
y_ : array of (n_samples,)
True labels of X_.
"""

def __init__(self):
@@ -36,7 +47,18 @@ def predict_proba(self, X):

class MedianAggregation(BaseEstimator, ClassifierMixin):
"""
Median Aggregation
Trivially takes the median of X.
Attributes
----------
classes : array
Ordered array of unique class labels.
X_ : array of (n_samples, n_features)
Base predictor data for computing the median.
y_ : array of (n_samples,)
True labels of X_.
"""

def __init__(self):
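For illustration, a minimal usage sketch of these trivial aggregators (not part of the diff; the data and expected outputs are illustrative, and predict_proba is assumed to reflect the row-wise mean or median of the base-predictor columns):

    import numpy as np
    from eipy.additional_ensembles import MeanAggregation, MedianAggregation

    # Each column holds one base predictor's probabilities (illustrative data)
    X = np.array([[0.9, 0.8, 0.7],
                  [0.2, 0.1, 0.3]])
    y = np.array([1, 0])

    # Expected scores here: 0.8 and 0.2 for the mean, and likewise
    # 0.8 and 0.2 for the median of each row
    mean_probs = MeanAggregation().fit(X, y).predict_proba(X)
    median_probs = MedianAggregation().fit(X, y).predict_proba(X)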
@@ -63,6 +85,28 @@ class CES(BaseEstimator, ClassifierMixin):
Caruana R. et al. (2006) Getting the most out of ensemble selection.
In: Sixth International Conference on Data
Mining (ICDM'06), 2006 IEEE, Piscataway, NJ, USA, pp. 828-833.
Parameters
----------
scoring : callable
Scoring function used to evaluate candidate ensembles.
max_ensemble_size : int
Maximum number of base models to ensemble.
random_state : int
Random seed for reproducibility.
greater_is_better : bool
Whether larger metric values indicate better performance; used to rank models.
Attributes
----------
selected_ensemble : list
List of models selected for ensemble.
train_performance : list
Record of model performances.
argbest : bool
True if metric of interest is to be maximized. Used for model selection.
best : bool
True if metric of interest is to be maximized. Used for selecting maximum scorers.
"""

def __init__(
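For reference, a standalone sketch of the Caruana-style greedy selection loop that CES implements (a sketch only: AUC stands in for the configurable scoring, greater-is-better is assumed, and all names are illustrative):

    import numpy as np
    from sklearn.metrics import roc_auc_score

    def greedy_ensemble_selection(pred_matrix, y, max_ensemble_size=10):
        """Repeatedly add (with replacement) the base model whose inclusion
        most improves the mean-aggregated ensemble's score."""
        selected, best_score = [], -np.inf
        for _ in range(max_ensemble_size):
            scores = [
                roc_auc_score(y, pred_matrix[:, selected + [j]].mean(axis=1))
                for j in range(pred_matrix.shape[1])
            ]
            j_best = int(np.argmax(scores))
            if scores[j_best] <= best_score:
                break  # no improvement: stop early
            best_score = scores[j_best]
            selected.append(j_best)
        return selected, best_score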
4 changes: 2 additions & 2 deletions eipy/datasets.py
@@ -25,7 +25,7 @@ def load_diabetes():
"""
zenodo_link = "https://zenodo.org/records/10035422/files/diabetes.zip?download=1"
# Get data path
data_path = get_data_home()
data_path = _get_data_home()
folder_ext = "diabetes"
data_ext_path = join(data_path, folder_ext)
# check data downloaded before
@@ -66,7 +66,7 @@ def _load_csv(file_path, fn, suffix):
return pd.read_csv(join(file_path, f"{fn}_{suffix}.csv"), index_col=0)


def get_data_home(data_home=None):
def _get_data_home(data_home=None):
"""Return the path of the eipy data directory.
This function is adapted from scikit-learn.
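For illustration, a usage sketch of the loader after the rename (the first call downloads the archive from Zenodo into the directory resolved by the now-private _get_data_home(); later calls reuse the cached copy):

    from eipy.datasets import load_diabetes

    # Downloads and extracts the Zenodo archive on first call,
    # then loads from the local eipy data directory.
    data = load_diabetes()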
70 changes: 34 additions & 36 deletions eipy/ei.py
@@ -17,21 +17,21 @@
from joblib import Parallel, delayed
import warnings
from eipy.utils import (
X_is_dict,
X_to_numpy,
y_to_numpy,
set_predictor_seeds,
random_integers,
sample,
retrieve_X_y,
append_modality,
safe_predict_proba,
_X_is_dict,
_X_to_numpy,
_y_to_numpy,
_set_predictor_seeds,
_random_integers,
_sample,
_retrieve_X_y,
_append_modality,
_safe_predict_proba,
dummy_cv,
bar_format,
)
from eipy.metrics import (
base_summary,
ensemble_summary,
_base_summary,
_ensemble_summary,
)

warnings.filterwarnings("ignore", category=DeprecationWarning)
@@ -181,7 +181,7 @@ def __init__(
self.modality_names = []
self.n_features_per_modality = []

self.random_numbers_for_samples = random_integers(
self.random_numbers_for_samples = _random_integers(
n_integers=n_samples, seed=self.random_state
)
self.feature_names = {}
@@ -210,17 +210,17 @@ def fit_base(self, X, y, base_predictors=None, modality_name=None):
\n... for ensemble performance analysis..."""
)
# convert y to a numpy array
y = y_to_numpy(y)
y = _y_to_numpy(y)

# check if base_predictors are passed here
if base_predictors is not None:
self.base_predictors = base_predictors # update base predictors

# set random_states in base_predictors
set_predictor_seeds(self.base_predictors, self.random_state)
_set_predictor_seeds(self.base_predictors, self.random_state)

# check data format and train accordingly
if X_is_dict(X):
if _X_is_dict(X):
for modality_name, modality in X.items():
self._fit_base(
X=modality,
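A minimal sketch of the dict dispatch above (not part of the diff; constructor arguments and predictor choices are illustrative): fit_base accepts a dict mapping modality names to feature arrays and fits base predictors once per modality.

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from eipy.ei import EnsembleIntegration

    rng = np.random.default_rng(0)
    X = {
        "clinical": rng.normal(size=(100, 5)),   # one array per modality
        "genomic": rng.normal(size=(100, 20)),
    }
    y = rng.integers(0, 2, size=100)

    ei = EnsembleIntegration(base_predictors={"LR": LogisticRegression()})
    ei.fit_base(X, y)  # _X_is_dict(X) is True, so _fit_base runs per modality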
@@ -252,12 +252,12 @@ def fit_ensemble(self, ensemble_predictors=None):
if ensemble_predictors is not None:
self.ensemble_predictors = ensemble_predictors

set_predictor_seeds(self.ensemble_predictors, self.random_state)
_set_predictor_seeds(self.ensemble_predictors, self.random_state)

y_test_combined = []

for fold_id in range(self.k_outer):
_, y_test = retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id])
_, y_test = _retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id])
y_test_combined.extend(y_test)

ensemble_predictions = {}
@@ -270,25 +270,25 @@ def fit_ensemble(self, ensemble_predictors=None):
y_pred_combined = []

for fold_id in range(self.k_outer):
X_train, y_train = retrieve_X_y(
X_train, y_train = _retrieve_X_y(
labelled_data=self.ensemble_training_data[fold_id]
)
X_test, _ = retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id])
X_test, _ = _retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id])

if self.sampling_aggregation == "mean":
X_train = X_train.T.groupby(level=[0, 1]).mean().T
X_test = X_test.T.groupby(level=[0, 1]).mean().T

model.fit(X_train, y_train)
y_pred = safe_predict_proba(model, X_test)
y_pred = _safe_predict_proba(model, X_test)
y_pred_combined.extend(y_pred)

ensemble_predictions[model_name] = y_pred_combined

ensemble_predictions["labels"] = y_test_combined

self.ensemble_predictions = pd.DataFrame.from_dict(ensemble_predictions)
self.ensemble_summary = ensemble_summary(
self.ensemble_summary = _ensemble_summary(
self.ensemble_predictions, self.metrics
)
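Earlier in this hunk, the sampling_aggregation == "mean" branch collapses a three-level column MultiIndex by averaging over its last level. A small demonstration of that pandas idiom, assuming the columns are (modality, model, sample); the names are illustrative:

    import numpy as np
    import pandas as pd

    # Assumed column layout: (modality, model, sample)
    cols = pd.MultiIndex.from_tuples([
        ("clinical", "LR", 0), ("clinical", "LR", 1),
        ("clinical", "RF", 0), ("clinical", "RF", 1),
    ])
    X = pd.DataFrame(np.arange(8.0).reshape(2, 4), columns=cols)

    # Average away the sample level: columns sharing (modality, model) merge
    X_agg = X.T.groupby(level=[0, 1]).mean().T
    # X_agg columns: ("clinical", "LR"), ("clinical", "RF")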

@@ -298,7 +298,7 @@ def fit_ensemble(self, ensemble_predictors=None):
desc="Training final ensemble models",
bar_format=bar_format,
):
X_train, y_train = retrieve_X_y(
X_train, y_train = _retrieve_X_y(
labelled_data=self.ensemble_training_data_final[0]
)

@@ -314,7 +314,7 @@ def fit_ensemble(self, ensemble_predictors=None):

def predict(self, X_dict, ensemble_model_key):
"""
Predict class labels for samples in X
Predict class labels for samples in X.
Parameters
----------
@@ -336,7 +336,7 @@ def predict(self, X_dict, ensemble_model_key):
modality_name = self.modality_names[i]
X = X_dict[modality_name]

X, _ = X_to_numpy(X)
X, _ = _X_to_numpy(X)

base_models = copy.deepcopy(self.final_models["base models"][modality_name])
self.base_predictors = {}
@@ -345,15 +345,15 @@ def predict(self, X_dict, ensemble_model_key):
self.base_predictors[base_model_dict["model name"]] = 0

base_model = pickle.loads(base_model_dict["pickled model"])
y_pred = safe_predict_proba(base_model, X)
y_pred = _safe_predict_proba(base_model, X)

base_model_dict["fold id"] = 0
base_model_dict["y_pred"] = y_pred

combined_predictions = self._combine_predictions_outer(
base_models, modality_name, model_building=True
)
ensemble_prediction_data = append_modality(
ensemble_prediction_data = _append_modality(
ensemble_prediction_data, combined_predictions, model_building=True
)
ensemble_prediction_data = ensemble_prediction_data[0]
@@ -367,12 +367,12 @@ def predict(self, X_dict, ensemble_model_key):
self.final_models["ensemble models"][ensemble_model_key]
)

y_pred = safe_predict_proba(ensemble_model, ensemble_prediction_data)
y_pred = _safe_predict_proba(ensemble_model, ensemble_prediction_data)
return y_pred
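Continuing the fit sketch shown after fit_base above, prediction reuses the same dict-of-modalities layout (the ensemble_model_key value is illustrative; it selects one of the trained ensemble models):

    X_new = {
        "clinical": rng.normal(size=(10, 5)),
        "genomic": rng.normal(size=(10, 20)),
    }
    y_scores = ei.predict(X_dict=X_new, ensemble_model_key="LR")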

@ignore_warnings(category=ConvergenceWarning)
def _fit_base(self, X, y, base_predictors=None, modality_name=None):
X, feature_names = X_to_numpy(X)
X, feature_names = _X_to_numpy(X)

self.modality_names.append(modality_name)
self.feature_names[modality_name] = feature_names
@@ -387,7 +387,7 @@ def _fit_base(self, X, y, base_predictors=None, modality_name=None):
modality_name=modality_name,
)

self.ensemble_training_data = append_modality(
self.ensemble_training_data = _append_modality(
self.ensemble_training_data, ensemble_training_data_modality
)

@@ -399,12 +399,12 @@ def _fit_base(self, X, y, base_predictors=None, modality_name=None):
modality_name=modality_name,
)

self.ensemble_test_data = append_modality(
self.ensemble_test_data = _append_modality(
self.ensemble_test_data, ensemble_test_data_modality
) # append data to dataframe

# create a summary of base predictor performance
self.base_summary = base_summary(self.ensemble_test_data, self.metrics)
self.base_summary = _base_summary(self.ensemble_test_data, self.metrics)

if self.model_building:
self._fit_base_final(X=X, y=y, modality_name=modality_name)
@@ -428,7 +428,7 @@ def _fit_base_final(self, X, y, modality_name=None):
modality_name=modality_name,
)

self.ensemble_training_data_final = append_modality(
self.ensemble_training_data_final = _append_modality(
self.ensemble_training_data_final, ensemble_training_data_modality
)

@@ -562,7 +562,7 @@ def _train_predict_single_base_predictor(

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
X_sample, y_sample = sample(
X_sample, y_sample = _sample(
X_train,
y_train,
strategy=self.sampling_strategy,
@@ -581,7 +581,7 @@ def _train_predict_single_base_predictor(
}

else:
y_pred = safe_predict_proba(model, X_test)
y_pred = _safe_predict_proba(model, X_test)

results_dict = {
"model name": model_name,
@@ -677,7 +677,6 @@ def save(self, path=None):
Parameters
----------
path : optional, default=None
Path to save the EnsembleIntegration class object.
"""
@@ -695,7 +694,6 @@ def load(cls, path):
Parameters
----------
path : str
Path to load the EnsembleIntegration class object.
"""
14 changes: 7 additions & 7 deletions eipy/interpretation.py
@@ -1,5 +1,5 @@
from sklearn.inspection import permutation_importance
from eipy.utils import X_to_numpy, retrieve_X_y, bar_format, y_to_numpy
from eipy.utils import _X_to_numpy, _retrieve_X_y, bar_format, _y_to_numpy
import pandas as pd
from tqdm import tqdm
import numpy as np
@@ -102,10 +102,10 @@ def rank_product_score(self, X_dict, y):
ensemble_predictor_keys = self.ensemble_predictor_keys

if self.LFR is None:
self.local_feature_rank(X_dict, y_to_numpy(y))
self._local_feature_rank(X_dict, _y_to_numpy(y))

if self.LMR is None:
self.local_model_rank(ensemble_predictor_keys=ensemble_predictor_keys)
self._local_model_rank(ensemble_predictor_keys=ensemble_predictor_keys)

print("Calculating combined rank product score...")

@@ -151,7 +151,7 @@ def rank_product_score(self, X_dict, y):

return self

def local_feature_rank(self, X_dict, y):
def _local_feature_rank(self, X_dict, y):
"""
Local Feature Ranks (LFRs) for each base predictor
@@ -177,7 +177,7 @@ def local_feature_rank(self, X_dict, y):
bar_format=bar_format,
):
X = X_dict[modality_name]
X, feature_names = X_to_numpy(X)
X, feature_names = _X_to_numpy(X)

# check feature names were seen during training
if len(self.EI.feature_names[modality_name]) > 1:
@@ -285,7 +285,7 @@ def local_feature_rank(self, X_dict, y):

return self

def local_model_rank(self, ensemble_predictor_keys):
def _local_model_rank(self, ensemble_predictor_keys):
"""
Local Model Ranks (LMRs)
@@ -302,7 +302,7 @@ def local_model_rank(self, ensemble_predictor_keys):
"""
# load ensemble training data from EI training

ensemble_X_train, ensemble_y_train = retrieve_X_y(
ensemble_X_train, ensemble_y_train = _retrieve_X_y(
labelled_data=self.EI.ensemble_training_data_final[0]
)

57 changes: 39 additions & 18 deletions eipy/metrics.py
@@ -1,18 +1,39 @@
import numpy as np
import pandas as pd
import inspect
from eipy.utils import minority_class
from eipy.utils import _minority_class
from sklearn.metrics import roc_auc_score, precision_recall_curve


def fmax_score(y_test, y_score, beta=1.0, pos_label=1):
fmax_score, _, _, threshold_fmax = fmax_precision_recall_threshold(
"""
Computes the maximum F-score (the F-beta score, a weighted harmonic mean of precision and recall) and the corresponding threshold.
Parameters
----------
y_test : array of shape (n_samples,)
Array of test labels.
y_score : array of shape (n_samples,)
Array of predicted probabilities on test data.
beta : float
Weight of recall relative to precision in the F-beta calculation.
pos_label : int
Label of the positive class when computing F scores.
Returns
-------
fmax_score : float64
Calculated fmax.
threshold_fmax : float64
Threshold corresponding to the returned fmax.
"""
fmax_score, _, _, threshold_fmax = _fmax_precision_recall_threshold(
y_test, y_score, beta=beta, pos_label=pos_label
)
return fmax_score, threshold_fmax
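A self-contained sketch of the same computation, scanning the precision-recall curve for the threshold that maximizes the F-beta score (the package's _fmax_precision_recall_threshold may differ in detail):

    import numpy as np
    from sklearn.metrics import precision_recall_curve

    def fmax_sketch(y_true, y_score, beta=1.0, pos_label=1):
        precision, recall, thresholds = precision_recall_curve(
            y_true, y_score, pos_label=pos_label
        )
        with np.errstate(divide="ignore", invalid="ignore"):
            f = (1 + beta**2) * precision * recall / (beta**2 * precision + recall)
        f = np.nan_to_num(f)  # 0/0 -> 0 where precision = recall = 0
        i = int(np.argmax(f[:-1]))  # final PR point has no threshold
        return f[i], thresholds[i]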


def fmax_precision_recall_threshold(labels, y_score, beta=1.0, pos_label=1):
def _fmax_precision_recall_threshold(labels, y_score, beta=1.0, pos_label=1):
"""
Radivojac, P. et al. (2013). A Large-Scale Evaluation of Computational Protein
Function Prediction. Nature Methods, 10(3), 221-227.
@@ -44,7 +65,7 @@ def fmax_precision_recall_threshold(labels, y_score, beta=1.0, pos_label=1):
return fmax_score, precision_fmax, recall_fmax, threshold_fmax


def try_metric_with_pos_label(y_true, y_pred, metric, pos_label):
def _try_metric_with_pos_label(y_true, y_pred, metric, pos_label):
"""
Compute score for a given metric.
"""
@@ -55,7 +76,7 @@ def try_metric_with_pos_label(y_true, y_pred, metric, pos_label):
return score


def scores(y_true, y_pred, metrics):
def _scores(y_true, y_pred, metrics):
"""
Compute all metrics for a single set of predictions. Returns a dictionary
containing metric keys, each paired to a tuple (score, threshold).
@@ -65,7 +86,7 @@ def scores(y_true, y_pred, metrics):
if metrics is None:
metrics = {"fmax (minority)": fmax_score, "auc": roc_auc_score}

pos_label = minority_class(y_true) # gives value 1 or 0
pos_label = _minority_class(y_true) # gives value 1 or 0

metric_threshold_dict = {}

@@ -75,14 +96,14 @@ def scores(y_true, y_pred, metrics):
if "y_pred" in inspect.signature(metric).parameters:
# calculate metric for target vector with threshold=0.5
metric_threshold_dict[metric_key] = (
try_metric_with_pos_label(
_try_metric_with_pos_label(
y_true, (np.array(y_pred) >= 0.5).astype(int), metric, pos_label
),
0.5,
)
# if y_score parameter exists in metric function then y should be probability vector
elif "y_score" in inspect.signature(metric).parameters:
metric_results = try_metric_with_pos_label(
metric_results = _try_metric_with_pos_label(
y_true, y_pred, metric, pos_label
)
if isinstance(
@@ -95,7 +116,7 @@ def scores(y_true, y_pred, metrics):
return metric_threshold_dict
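The dispatch above keys off each metric's signature. The idea in isolation, using standard scikit-learn metric signatures:

    import inspect
    from sklearn.metrics import f1_score, roc_auc_score

    def expects_probabilities(metric):
        # Metrics declaring y_score take raw probabilities; metrics
        # declaring y_pred get labels thresholded at 0.5 (as above)
        return "y_score" in inspect.signature(metric).parameters

    print(expects_probabilities(roc_auc_score))  # True
    print(expects_probabilities(f1_score))       # False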


def scores_matrix(X, labels, metrics):
def _scores_matrix(X, labels, metrics):
"""
Calculate metrics and threshold (if applicable) for each column
(set of predictions) in matrix X
@@ -104,7 +125,7 @@ def scores_matrix(X, labels, metrics):
scores_dict = {}
for column in X.columns:
column_temp = X[column]
metrics_per_column = scores(labels, column_temp, metrics)
metrics_per_column = _scores(labels, column_temp, metrics)
# metric_names = list(metrics.keys())
for metric_key in metrics_per_column.keys():
if not (metric_key in scores_dict):
@@ -115,13 +136,13 @@ def scores_matrix(X, labels, metrics):
return scores_dict


def create_metric_threshold_dataframes(X, labels, metrics):
def _create_metric_threshold_dataframes(X, labels, metrics):
"""
Create separate dataframes for metrics and thresholds. thresholds_df contains
NaN where a threshold is not applicable.
"""

scores_dict = scores_matrix(X, labels, metrics)
scores_dict = _scores_matrix(X, labels, metrics)

metrics_df = pd.DataFrame(columns=X.columns)
thresholds_df = pd.DataFrame(columns=X.columns)
@@ -130,15 +151,15 @@ def create_metric_threshold_dataframes(X, labels, metrics):
return metrics_df, thresholds_df


def create_metric_threshold_dict(X, labels, metrics):
def _create_metric_threshold_dict(X, labels, metrics):
df_dict = {}
df_dict["metrics"], df_dict["thresholds"] = create_metric_threshold_dataframes(
df_dict["metrics"], df_dict["thresholds"] = _create_metric_threshold_dataframes(
X, labels, metrics
)
return df_dict


def base_summary(ensemble_test_dataframes, metrics):
def _base_summary(ensemble_test_dataframes, metrics):
"""
Create a base predictor performance summary by concatenating data across test folds
"""
@@ -149,13 +170,13 @@ def base_summary(ensemble_test_dataframes, metrics):
for df in ensemble_test_dataframes
]
)
return create_metric_threshold_dict(ensemble_test_averaged_samples, labels, metrics)
return _create_metric_threshold_dict(ensemble_test_averaged_samples, labels, metrics)


def ensemble_summary(ensemble_predictions, metrics):
def _ensemble_summary(ensemble_predictions, metrics):
X = ensemble_predictions.drop(["labels"], axis=1)
labels = ensemble_predictions["labels"]
return create_metric_threshold_dict(X, labels, metrics)
return _create_metric_threshold_dict(X, labels, metrics)


# These two functions are an attempt at maximizing/minimizing any metric
28 changes: 14 additions & 14 deletions eipy/utils.py
@@ -15,15 +15,15 @@
bar_format = "{desc}: |{bar}|{percentage:3.0f}%"


def minority_class(y_true):
def _minority_class(y_true):
if np.bincount(y_true)[0] < np.bincount(y_true)[1]:
minority_class = 0
else:
minority_class = 1
return minority_class
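For example, a quick check of the helper's behavior:

    import numpy as np
    from eipy.utils import _minority_class

    _minority_class(np.array([0, 0, 0, 1]))  # -> 1, the rarer label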


def set_predictor_seeds(base_predictors, random_state):
def _set_predictor_seeds(base_predictors, random_state):
for _, v in base_predictors.items():
if type(v) == Pipeline:
est_ = list(v.named_steps)[-1]
@@ -33,25 +33,25 @@ def set_predictor_seeds(base_predictors, random_state):
v.set_params(**{"random_state": random_state})


def X_is_dict(X):
def _X_is_dict(X):
if isinstance(X, dict):
return True
else:
return False


def X_dict_to_numpy(X_dict):
def _X_dict_to_numpy(X_dict):
"""
Retrieve feature names and convert arrays to numpy.
"""
X_dict_numpy = {}
feature_names = {}
for key, X in X_dict.items():
X_dict_numpy[key], feature_names[key] = X_to_numpy(X)
X_dict_numpy[key], feature_names[key] = _X_to_numpy(X)
return X_dict_numpy, feature_names


def X_to_numpy(X):
def _X_to_numpy(X):
"""
Return X as a numpy array, with feature names if applicable.
"""
@@ -66,7 +66,7 @@ def X_to_numpy(X):
)


def y_to_numpy(y):
def _y_to_numpy(y):
"""
Check y is numpy array and convert if not.
"""
@@ -85,13 +85,13 @@ def y_to_numpy(y):
or pandas Series."""
)

if not is_binary_array(_y):
if not _is_binary_array(_y):
raise ValueError("y must contain binary values.")

return _y


def is_binary_array(arr):
def _is_binary_array(arr):
if all(x == 0 or x == 1 or x == 0.0 or x == 1.0 for x in arr):
return True
else:
@@ -110,20 +110,20 @@ def get_n_splits(self, X, y, groups=None):
return self.n_splits


def safe_predict_proba(model, X): # uses predict_proba method where possible
def _safe_predict_proba(model, X): # uses predict_proba method where possible
if hasattr(model, "predict_proba"):
y_pred = model.predict_proba(X)[:, 1]
else:
y_pred = model.predict(X)
return y_pred
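For example, the fallback matters for estimators without probability outputs (a sketch using standard scikit-learn models):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import LinearSVC
    from eipy.utils import _safe_predict_proba

    X, y = make_classification(random_state=0)
    probs = _safe_predict_proba(LogisticRegression().fit(X, y), X)  # predict_proba path
    labels = _safe_predict_proba(LinearSVC().fit(X, y), X)          # predict() fallback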


def random_integers(n_integers=1, seed=42):
def _random_integers(n_integers=1, seed=42):
random.seed(seed)
return random.sample(range(0, 10000), n_integers)


def sample(X, y, strategy, random_state):
def _sample(X, y, strategy, random_state):
if strategy is None:
X_resampled, y_resampled = X, y
elif strategy == "undersampling": # define sampler
@@ -161,13 +161,13 @@ def sample(X, y, strategy, random_state):
return X_resampled, y_resampled


def retrieve_X_y(labelled_data):
def _retrieve_X_y(labelled_data):
X = labelled_data.drop(columns=["labels"], level=0)
y = np.ravel(labelled_data["labels"])
return X, y


def append_modality(current_data, modality_data, model_building=False):
def _append_modality(current_data, modality_data, model_building=False):
if current_data is None:
combined_dataframe = modality_data
else: