WIP: make the feature filter consider all the feature types
bryant1410 committed Apr 28, 2023
1 parent 87eba8b commit 587c16c
Showing 3 changed files with 54 additions and 38 deletions.
39 changes: 17 additions & 22 deletions features.py
@@ -3,7 +3,6 @@
 import ast
 import itertools
 import json
-import re
 import string
 import warnings
 from collections import Counter, defaultdict
@@ -20,15 +19,14 @@
 from pandas._typing import FilePath
 from pandas.core.dtypes.inference import is_bool, is_float
 from sentence_transformers import SentenceTransformer, util
-from sklearn.pipeline import Pipeline
 from sklearn.compose import make_column_selector, make_column_transformer
-from sklearn.feature_selection import VarianceThreshold
 from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder, StandardScaler
 from statsmodels.stats.outliers_influence import variance_inflation_factor
 from tqdm.auto import tqdm, trange
 
-from sklearn_util import BoolImputer, MultiHotEncoder, SelectMinBinaryUniqueValues
+from sklearn_util import BoolImputer, MultiHotEncoder, SelectMinNonMostFrequentValues
 from spacy_features import create_model, get_first_sentence, get_noun_chunk_count, get_root_pos, get_root_tag, \
     get_sentence_count, get_subject_number, get_subject_person, get_tense, has_any_adjective, has_any_adverb, \
     has_any_gerund, is_continuous, is_passive_voice, is_perfect
@@ -476,9 +474,12 @@ def _compute_features(clip_results: pd.DataFrame, feature_deny_list: Collection[
 
 def _transform_features_to_numbers(
         df: pd.DataFrame, dependent_variable_name: str, standardize_dependent_variable: bool = True,
-        standardize_binary_features: bool = True, binary_feature_min_unique_values: int = 50,
+        standardize_binary_features: bool = True, min_non_most_frequent_values: int = 50,
         compute_neg_features: bool = True, merge_original_and_replacement_features: bool = True,
         add_constant_feature: bool = False, verbose: bool = True) -> Tuple[pd.DataFrame, pd.Series]:
+    if df[dependent_variable_name].isna().any():
+        raise ValueError("The dependent variable contains NaN values.")
+
     df = df.copy()
 
     if not standardize_dependent_variable:
@@ -496,32 +497,26 @@ def _transform_features_to_numbers(
         feature_count -= 1
     print("Number of features before the transformation:", feature_count)
 
-    common_column_transformer_kwargs = {"remainder": "passthrough", "n_jobs": -1, "verbose_feature_names_out": False}
+    common_column_transformer_kwargs = {"n_jobs": -1, "verbose_feature_names_out": False}
 
     new_df = Pipeline([
         ("encoder", make_column_transformer(
             # Sparse outputs are not supported by Pandas. It also complicates standardization if applied.
             (OneHotEncoder(dtype=bool, sparse_output=False), [f for f in df.columns if is_feature_string(df[f])]),
             (MultiHotEncoder(dtype=bool), [f for f in df.columns if is_feature_multi_label(df[f])]),
+            remainder="passthrough",
             **common_column_transformer_kwargs,
         )),
-        # TODO: remove more generally: those that have the most common value more than N - F times.
-        ("filter", make_column_transformer(
-            (SelectMinBinaryUniqueValues(binary_feature_min_unique_values), make_column_selector(dtype_include=bool)),
-            (VarianceThreshold(), make_column_selector(dtype_exclude=bool)),
-            **common_column_transformer_kwargs,
-        )),
+        ("filter", SelectMinNonMostFrequentValues(min_non_most_frequent_values)),
        ("scaler", make_column_transformer(
             (StandardScaler(), make_column_selector(dtype_exclude=None if standardize_binary_features else bool)),
+            remainder="passthrough",
             **common_column_transformer_kwargs,
         )),
         ("imputer", make_column_transformer(
-            (SimpleImputer(strategy="mean"), make_column_selector(rf"^(?!{re.escape(dependent_variable_name)}$).*",
-                                                                  dtype_include=np.number)),
-            (BoolImputer(strategy="most_frequent"),
-             make_column_selector(rf"^(?!{re.escape(dependent_variable_name)}$).*", dtype_include=bool)),
-            (SimpleImputer(strategy="most_frequent"),
-             make_column_selector(rf"^(?!{re.escape(dependent_variable_name)}$).*", dtype_exclude=[bool, np.number])),
+            (SimpleImputer(strategy="mean"), make_column_selector(dtype_include=np.number)),
+            (BoolImputer(strategy="most_frequent"), make_column_selector(dtype_include=bool)),
+            remainder=SimpleImputer(strategy="most_frequent"),
             **common_column_transformer_kwargs,
         )),
     ], verbose=verbose).set_output(transform="pandas").fit_transform(df)
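
To see the stage layout in isolation, here is a minimal runnable sketch (not part of the commit) with hypothetical columns. It reproduces only the encoder and scaler stages, showing how `remainder="passthrough"` keeps the untouched columns in each `make_column_transformer` and how `set_output(transform="pandas")` preserves column names between stages:

```python
import numpy as np
import pandas as pd
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

df = pd.DataFrame({
    "color": ["red", "blue", "red", "blue"],  # string feature -> one-hot encoded
    "score": [1.0, 2.0, 3.0, 4.0],            # numeric feature -> standardized
    "flag": [True, False, True, True],        # boolean feature -> passed through
})

pipeline = Pipeline([
    ("encoder", make_column_transformer(
        (OneHotEncoder(dtype=bool, sparse_output=False), ["color"]),
        remainder="passthrough", verbose_feature_names_out=False,
    )),
    ("scaler", make_column_transformer(
        # `np.number` does not match bool columns, so only "score" is scaled.
        (StandardScaler(), make_column_selector(dtype_include=np.number)),
        remainder="passthrough", verbose_feature_names_out=False,
    )),
]).set_output(transform="pandas")

print(pipeline.fit_transform(df))
```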
@@ -578,7 +573,7 @@ def _compute_numeric_features(clip_results: pd.DataFrame, dependent_variable_nam
                               compute_neg_features: bool = True, levin_return_mode: LevinReturnMode = "all",
                               compute_similarity_features: bool = True,
                               merge_original_and_replacement_features: bool = True,
-                              add_constant_feature: bool = False, binary_feature_min_unique_values: int = 50,
+                              add_constant_feature: bool = False, min_non_most_frequent_values: int = 50,
                               standardize_dependent_variable: bool = True, standardize_binary_features: bool = True,
                               verbose: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series]:
     raw_features = _compute_features(clip_results, feature_deny_list=feature_deny_list,
@@ -588,7 +583,7 @@
     features, dependent_variable = _transform_features_to_numbers(
         raw_features, dependent_variable_name, standardize_dependent_variable=standardize_dependent_variable,
         standardize_binary_features=standardize_binary_features,
-        binary_feature_min_unique_values=binary_feature_min_unique_values, compute_neg_features=compute_neg_features,
+        min_non_most_frequent_values=min_non_most_frequent_values, compute_neg_features=compute_neg_features,
         merge_original_and_replacement_features=merge_original_and_replacement_features,
         add_constant_feature=add_constant_feature, verbose=verbose)
 
@@ -601,7 +596,7 @@ def load_features(path: FilePath, dependent_variable_name: str, max_data_count:
                   levin_return_mode: LevinReturnMode = "all", compute_similarity_features: bool = True,
                   merge_original_and_replacement_features: bool = True, add_constant_feature: bool = False,
                   remove_correlated_features: bool = True, feature_correlation_keep_threshold: float = .8,
-                  do_vif: bool = False, binary_feature_min_unique_values: int = 50,
+                  do_vif: bool = False, min_non_most_frequent_values: int = 50,
                   verbose: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series]:
     clip_results = _load_clip_results(path)
     raw_features, features, dependent_variable = _compute_numeric_features(
@@ -610,7 +605,7 @@ def load_features(path: FilePath, dependent_variable_name: str, max_data_count:
         standardize_binary_features=standardize_binary_features, compute_neg_features=compute_neg_features,
         levin_return_mode=levin_return_mode, compute_similarity_features=compute_similarity_features,
         merge_original_and_replacement_features=merge_original_and_replacement_features,
-        add_constant_feature=add_constant_feature, binary_feature_min_unique_values=binary_feature_min_unique_values,
+        add_constant_feature=add_constant_feature, min_non_most_frequent_values=min_non_most_frequent_values,
         verbose=verbose)
 
     if remove_correlated_features:
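As a usage sketch of the renamed keyword end to end (the results path and the dependent-variable name below are hypothetical; the keyword matches the new signature above):

```python
raw_features, features, dependent_variable = load_features(
    "clip_results.csv",               # hypothetical path
    "clip_score_diff",                # hypothetical dependent variable
    min_non_most_frequent_values=50,
)
```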
7 changes: 4 additions & 3 deletions main.py
@@ -278,7 +278,8 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument("--dependent-variable-name")
     parser.add_argument("-r", "--remove-features", dest="feature_deny_list", nargs="+",
                         default={"wup-similarity", "lch-similarity", "path-similarity"})
-    parser.add_argument("--binary-feature-min-unique-values", type=int, default=100)
+    parser.add_argument("--min-non-most-frequent-values", type=int, default=100,
+                        help="The minimum number of values that have to be different from the most frequent one.")
     parser.add_argument("--no-neg-features", dest="compute_neg_features", action="store_false")
     parser.add_argument("--levin-return-mode", choices=VALID_LEVIN_RETURN_MODES, default="semantic_fine_grained")
     parser.add_argument("--merge-original-and-replacement-features", action="store_true")
@@ -302,7 +303,7 @@ def parse_args() -> argparse.Namespace:
     assert args.max_data_count is None or not args.debug, "Cannot specify max data count in debug mode."
     args.max_data_count = 1000 if args.debug else args.max_data_count
 
-    args.binary_feature_min_unique_values = 10 if args.debug else args.binary_feature_min_unique_values
+    args.min_non_most_frequent_values = 10 if args.debug else args.min_non_most_frequent_values
 
     args.dependent_variable_name = (args.dependent_variable_name
                                     or ("clip_score_diff" if args.model in REGRESSION_MODELS else "clip prediction"))
@@ -332,7 +333,7 @@ def main() -> None:
         merge_original_and_replacement_features=args.merge_original_and_replacement_features,
         remove_correlated_features=args.remove_correlated_features,
         feature_correlation_keep_threshold=args.feature_correlation_keep_threshold, do_vif=args.do_vif,
-        binary_feature_min_unique_values=args.binary_feature_min_unique_values)
+        min_non_most_frequent_values=args.min_non_most_frequent_values)
 
     if args.model in {"ols", "ridge", "lasso"}:
         regularization = {"ols": None}.get(args.model, args.model)
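A minimal sketch of the debug override above (only the two relevant flags are reproduced; the rest of the parser is assumed):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--min-non-most-frequent-values", type=int, default=100)
parser.add_argument("--debug", action="store_true")

args = parser.parse_args(["--debug"])
# Debug mode caps the data at 1000 rows, so the filter threshold is relaxed to match.
args.min_non_most_frequent_values = 10 if args.debug else args.min_non_most_frequent_values
print(args.min_non_most_frequent_values)  # -> 10
```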
46 changes: 33 additions & 13 deletions sklearn_util.py
@@ -1,10 +1,12 @@
from __future__ import annotations

from collections import Counter
from typing import Any, Callable, Sequence

import numpy as np
import numpy.typing as npt
import pandas as pd
from overrides import overrides
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectorMixin
from sklearn.impute import SimpleImputer
@@ -14,31 +16,48 @@
 from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
 
 
-class SelectMinBinaryUniqueValues(SelectorMixin, BaseEstimator):
-    def __init__(self, binary_feature_min_unique_values: int = 50, leave_at_least_one: bool = True) -> None:
-        self.binary_feature_min_unique_values = binary_feature_min_unique_values
+class SelectMinNonMostFrequentValues(SelectorMixin, BaseEstimator):
+    """Select features with at least `min_non_most_frequent_values` values different from the most frequent value."""
+
+    def __init__(self, min_non_most_frequent_values: int = 50, leave_at_least_one: bool = True) -> None:
+        self.min_non_most_frequent_values = min_non_most_frequent_values
         self.leave_at_least_one = leave_at_least_one
 
-    def fit(self, X: np.ndarray, y: np.ndarray | None = None) -> SelectMinBinaryUniqueValues:  # noqa
-        assert np.unique(X).size <= 2  # Only binary.
+    def fit(self, X: np.ndarray, y: np.ndarray | None = None) -> SelectMinNonMostFrequentValues:  # noqa
+        X = self._validate_data(X, ensure_2d=True, force_all_finite="allow-nan")
+
+        self.non_most_frequent_counts_ = np.empty(X.shape[1], dtype=np.int64)  # noqa
+
+        for i, column in enumerate(X.transpose()):
+            two_most_common_list = Counter(column).most_common(2)
 
-        X = self._validate_data(X, ensure_2d=True)
+            if len(two_most_common_list) <= 1:
+                self.non_most_frequent_counts_[i] = 0
+            else:
+                if np.isnan(most_freq_value := two_most_common_list[0][0]):
+                    most_freq_value = two_most_common_list[1][0]
 
-        non_zero = (X != 0).sum(axis=0)  # For sparse arrays, it's efficient to check the non-zero elements.
-        self.min_counts_ = np.minimum(non_zero, X.shape[0] - non_zero)
-        if isinstance(self.min_counts_, np.matrix):  # Can happen with CSR matrices.
-            self.min_counts_ = self.min_counts_.A1  # noqa
+                self.non_most_frequent_counts_[i] = ((column != most_freq_value) & ~np.isnan(column)).sum()
         return self
 
+    @overrides
+    def transform(self, X: np.ndarray) -> np.ndarray:
+        # Validate `X` for checking only, without reassigning it, since validation may change the data types.
+        _ = self._validate_data(X, ensure_2d=True, dtype=None, accept_sparse="csr", force_all_finite="allow-nan",
+                                reset=False)
+        return self._transform(X)
+
+    @overrides
     def _get_support_mask(self) -> np.ndarray:
         check_is_fitted(self)
-        mask = self.min_counts_ >= self.binary_feature_min_unique_values
+        mask = self.non_most_frequent_counts_ >= self.min_non_most_frequent_values
 
         if self.leave_at_least_one and not mask.any():
             # We do this because, with sklearn-pandas, when we use a `MultiLabelBinarizer` (the features are
             # transformed one by one), there may be no features left afterward, and the next transformers in the
             # pipeline may fail for that multi-label feature.
-            mask[self.min_counts_.argmax()] = True
+            mask[self.non_most_frequent_counts_.argmax()] = True
 
         return mask
 
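A small, self-contained illustration (not from the repository) of what the new selector keeps and drops:

```python
import numpy as np

from sklearn_util import SelectMinNonMostFrequentValues

X = np.array([
    [1.0, 0.0],
    [1.0, 1.0],
    [1.0, 2.0],
    [2.0, 3.0],
])

selector = SelectMinNonMostFrequentValues(min_non_most_frequent_values=2)
print(selector.fit_transform(X))
# Column 0 has only one value differing from its most frequent value (1.0),
# so it is dropped; column 1 has three differing values and is kept.
```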
@@ -98,10 +117,11 @@ class BoolImputer(SimpleImputer):
     It doesn't crash when there aren't any missing values. See https://github.com/scikit-learn/scikit-learn/issues/26292
     """
 
+    @overrides
     def _validate_input(self, X, in_fit):
         if self.strategy in ("most_frequent", "constant"):
             # If input is a list of strings, dtype = object.
-            # Otherwise ValueError is raised in SimpleImputer
+            # Otherwise, ValueError is raised in SimpleImputer
             # with strategy='most_frequent' or 'constant'
             # because the list is converted to Unicode numpy array
             if isinstance(X, list) and any(
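And a hedged sketch of the case the docstring calls out (behavior inferred from the docstring, not verified against the repository):

```python
import numpy as np

from sklearn_util import BoolImputer

# An all-boolean column with no missing values: per the docstring above,
# BoolImputer is meant to pass it through instead of crashing.
X = np.array([[True], [False], [True], [True]])

print(BoolImputer(strategy="most_frequent").fit_transform(X))
```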
