Feature/feature1 #26

Open
wants to merge 12 commits into base: master
8 changes: 7 additions & 1 deletion README.md
@@ -233,8 +233,14 @@ df.predict(pmu.Model.load("/tmp/burrito.model")).tail()
 * add whatever you need for yourself and share it with us

 ## Change Log

+### 0.0.27
+* I made my life easier, now I just do "from pandas_ml_utils import pd, np, FeaturesAndLabels, ..."
+* features and labels can now handle multiple dimensions, as a cell may contain another numpy array
+* introduced sample weights which can be passed to the fit function, e.g. for a Keras fit
+
 ### 0.0.25 / 26
-* refactored how traing and test data sets are split
+* refactored how training and test data sets are split
 * allow to control the amount of young test data being used (useful for time series)
 * add sample weights i.e. to penalize loss per sample in a keras model

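The new sample_weights feature from the 0.0.27 notes can be sketched as follows. This is a hypothetical usage example, not code from the PR; the frame, column names and data are invented, while pmu.pd, pmu.FeaturesAndLabels and the sample_weights argument come from the changes below:

import numpy as np
import pandas_ml_utils as pmu

# toy frame with one feature, one label and a per-sample weight column (all names invented)
df = pmu.pd.DataFrame({
    "feature_a": np.random.rand(100),
    "label": np.random.randint(0, 2, 100),
    "weight": np.random.rand(100),
})

fal = pmu.FeaturesAndLabels(
    features=["feature_a"],
    labels=["label"],
    sample_weights="weight",  # new in 0.0.27; forwarded to the model's fit, e.g. a Keras fit
)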
62 changes: 31 additions & 31 deletions pandas_ml_utils/__init__.py
@@ -1,56 +1,56 @@
 """Augment pandas DataFrame with methods for machine learning"""
-__version__ = '0.0.26'
+__version__ = '0.0.27'

 import logging
-import pandas as pd

 # imports to provide functionality via root import like import pandas_ml_utils as pmu; pmu.XY
+from pandas.core.base import PandasObject as _PandasObject
+
+import numpy as np
+import pandas as pd
+import pandas_ml_utils.pandas_utils_extension as _df_ext
+from pandas_ml_utils.analysis.correlation_analysis import plot_correlation_matrix as _plot_correlation_matrix
+from pandas_ml_utils.analysis.selection import feature_selection as _feature_selection
+from pandas_ml_utils.datafetching import fetch_cryptocompare_hourly as _fetch_cryptocompare_hourly, \
+    fetch_cryptocompare_daily as _fetch_cryptocompare_daily, fetch_yahoo as _fetch_yahoo
+from pandas_ml_utils.model.features_and_labels.features_and_labels import FeaturesAndLabels
+from pandas_ml_utils.model.fitting.fitter import fit as _fit, predict as _predict, backtest as _backtest, \
+    features_and_label_extractor as _features_and_label_extractor
 from pandas_ml_utils.model.models import Model, SkModel, KerasModel, MultiModel
 from pandas_ml_utils.wrappers.lazy_dataframe import LazyDataFrame
-from pandas_ml_utils.model.features_and_labels.features_and_labels import FeaturesAndLabels

-# imports only used to augment pandas classes
-from pandas_ml_utils.pandas_utils_extension import inner_join, drop_re, drop_zero_or_nan, add_apply, shift_inplace, \
-    extend_forecast, cloc2
-from pandas_ml_utils.analysis.correlation_analysis import plot_correlation_matrix
-from pandas_ml_utils.datafetching.fetch_yahoo import fetch_yahoo
-from pandas_ml_utils.model.fitting.fitter import fit, predict, backtest, features_and_label_extractor
-from pandas_ml_utils.analysis.selection import feature_selection
-from pandas.core.base import PandasObject
-from pandas_ml_utils.datafetching.fetch_cryptocompare import fetch_cryptocompare_daily, fetch_cryptocompare_hourly


 # log provided classes
 _log = logging.getLogger(__name__)
 _log.debug(f"available {Model} classes {[SkModel, KerasModel, MultiModel]}")
 _log.debug(f"available other classes {[LazyDataFrame, FeaturesAndLabels]}")
+_log.debug(f"numpy version {np.__version__}")
+_log.debug(f"pandas version {pd.__version__}")

-# add functions to pandas
+# general utility functions
-PandasObject.cloc2 = cloc2
-PandasObject.inner_join = inner_join
-PandasObject.drop_re = drop_re
-PandasObject.drop_zero_or_nan = drop_zero_or_nan
-PandasObject.add_apply = add_apply
-PandasObject.shift_inplace = shift_inplace
-PandasObject.extend_forecast = extend_forecast
+_PandasObject.cloc2 = _df_ext.cloc2
+_PandasObject.inner_join = _df_ext.inner_join
+_PandasObject.drop_re = _df_ext.drop_re
+_PandasObject.drop_zero_or_nan = _df_ext.drop_zero_or_nan
+_PandasObject.add_apply = _df_ext.add_apply
+_PandasObject.shift_inplace = _df_ext.shift_inplace
+_PandasObject.extend_forecast = _df_ext.extend_forecast

 # feature selection
-PandasObject.plot_correlation_matrix = plot_correlation_matrix
-PandasObject.feature_selection = feature_selection
+_PandasObject.plot_correlation_matrix = _plot_correlation_matrix
+_PandasObject.feature_selection = _feature_selection

 # provide fit, predict and backtest method
-PandasObject.fit = fit
-PandasObject.predict = predict
-PandasObject.backtest = backtest
+_PandasObject.fit = _fit
+_PandasObject.predict = _predict
+_PandasObject.backtest = _backtest

 # also provide the plain features and labels extractor
-PandasObject.features_and_label_extractor = features_and_label_extractor
+_PandasObject.features_and_label_extractor = _features_and_label_extractor

 # data fetcher
-setattr(pd, 'fetch_yahoo', fetch_yahoo)
-setattr(pd, 'fetch_cryptocompare_daily', fetch_cryptocompare_daily)
-setattr(pd, 'fetch_cryptocompare_hourly', fetch_cryptocompare_hourly)
+setattr(pd, 'fetch_yahoo', _fetch_yahoo)
+setattr(pd, 'fetch_cryptocompare_daily', _fetch_cryptocompare_daily)
+setattr(pd, 'fetch_cryptocompare_hourly', _fetch_cryptocompare_hourly)

 __doc__ = """
 The main concept is to extend pandas DataFrame objects such that you can apply any statistical or machine learning
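To make the augmentation pattern above concrete: assigning a function to pandas' PandasObject exposes it as a method on every DataFrame and Series, which is exactly how fit, predict and friends land on data frames. A minimal self-contained sketch of the same idiom (demean is an invented example, not part of the library):

import pandas as pd
from pandas.core.base import PandasObject

def demean(df):
    # subtract each column's mean from the column
    return df - df.mean()

# after this assignment every DataFrame and Series exposes .demean()
PandasObject.demean = demean

print(pd.DataFrame({"a": [1.0, 2.0, 3.0]}).demean())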
2 changes: 1 addition & 1 deletion pandas_ml_utils/analysis/correlation_analysis.py
@@ -1,4 +1,4 @@
-import pandas as pd
+import pandas_ml_utils.monkey_patched_dataframe as pd
 from typing import Tuple


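From here on the PR replaces `import pandas as pd` with `import pandas_ml_utils.monkey_patched_dataframe as pd` throughout. That module's contents are not shown in this diff; presumably it re-exports the pandas namespace so the pd alias remains a drop-in replacement. A purely hypothetical sketch of such a shim:

# hypothetical monkey_patched_dataframe.py (NOT shown in this PR)
# re-export the pandas API so that `import ... as pd` keeps working unchanged
from pandas import *                  # noqa: F401,F403
from pandas import DataFrame, Series  # explicit names, e.g. for type hints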
2 changes: 1 addition & 1 deletion pandas_ml_utils/analysis/selection.py
@@ -1,6 +1,6 @@
 import logging
 import numpy as np
-import pandas as pd
+import pandas_ml_utils.monkey_patched_dataframe as pd
 from typing import List, Iterable, Union, Tuple
 from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
 from pandas_ml_utils.analysis.correlation_analysis import plot_correlation_matrix, _sort_correlation, _plot_heatmap
2 changes: 2 additions & 0 deletions pandas_ml_utils/datafetching/__init__.py
@@ -0,0 +1,2 @@
+from .fetch_cryptocompare import *
+from .fetch_yahoo import *
2 changes: 1 addition & 1 deletion pandas_ml_utils/datafetching/fetch_cryptocompare.py
@@ -1,7 +1,7 @@
 import datetime

 import cachetools
-import pandas as pd
+import pandas_ml_utils.monkey_patched_dataframe as pd
 import pytz

 from pandas_ml_utils.extern.cryptocompare import CURR, LIMIT, TIME, get_historical_price_day, get_historical_price_hour
2 changes: 1 addition & 1 deletion pandas_ml_utils/datafetching/fetch_yahoo.py
@@ -2,7 +2,7 @@
 import traceback

 import cachetools.func
-import pandas as pd
+import pandas_ml_utils.monkey_patched_dataframe as pd

 from ..pandas_utils_extension import inner_join
 from ..utils.functions import join_kwargs
2 changes: 1 addition & 1 deletion pandas_ml_utils/extern/cryptocompare.py
@@ -4,7 +4,7 @@
 import logging
 import time

-import cachetools
+import cachetools.func
 import requests

 _log = logging.getLogger(__name__)
@@ -4,7 +4,7 @@
 from typing import List, Callable, Iterable, Dict, Type, Tuple, Union, Any

 import numpy as np
-import pandas as pd
+import pandas_ml_utils.monkey_patched_dataframe as pd

 from pandas_ml_utils.model.features_and_labels.target_encoder import TargetLabelEncoder
 from pandas_ml_utils.utils.functions import join_kwargs
@@ -27,6 +27,7 @@ def __init__(self,
                  features: List[str],
                  labels: _LABELS,
                  label_type: Type = None,
+                 sample_weights: Union[Dict[str, str], str] = None,
                  gross_loss: Callable[[str, pd.DataFrame], Union[pd.Series, pd.DataFrame]] = None,
                  targets: Callable[[str, pd.DataFrame], Union[pd.Series, pd.DataFrame]] = None,
                  feature_lags: Iterable[int] = None,
@@ -43,6 +44,8 @@
                            the average was. It is also possible to provide a Callable[[df, ...magic], labels] which returns
                            the expected data structure.
         :param label_type: whether to treat a label as int, float, bool
+        :param sample_weights: sample weights get passed to the model.fit function. In Keras, for example, this can
+                           be used for imbalanced classes
         :param gross_loss: expects a callable[[df, target, ...magic], df] which receives the source data frame and a
                            target (or None) and should return a series or data frame. Let's say you want to classify
                            whether a printer is jamming the next page or not. Halting and servicing the printer costs
@@ -68,6 +71,7 @@
"""
self._features = features
self._labels = labels
self._weights = sample_weights
self._targets = targets
self._gross_loss = gross_loss
self.label_type = label_type
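Since the docstring calls out Keras class imbalance as the motivating case, here is how per-sample weights behave in plain Keras, independent of this library (standard keras API; data and weighting scheme invented):

import numpy as np
from tensorflow import keras

model = keras.Sequential([keras.Input(shape=(3,)), keras.layers.Dense(1, activation="sigmoid")])
model.compile(loss="binary_crossentropy", optimizer="adam")

x = np.random.rand(100, 3)
y = np.random.randint(0, 2, 100)
w = np.where(y == 1, 5.0, 1.0)  # up-weight the rarer positive class

# each sample's loss contribution is scaled by its weight
model.fit(x, y, sample_weight=w, epochs=1, verbose=0)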
@@ -89,6 +93,10 @@ def features(self):
     def labels(self):
         return self._labels

+    @property
+    def weights(self):
+        return self._weights
+
     @property
     def targets(self):
         return self._targets
@@ -4,7 +4,7 @@
 from typing import Tuple, Dict, Union, List

 import numpy as np
-import pandas as pd
+import pandas_ml_utils.monkey_patched_dataframe as pd
 from sortedcontainers import SortedDict

 from pandas_ml_utils.constants import *
@@ -24,6 +24,7 @@ class FeatureTargetLabelExtractor(object):
     def __init__(self, df: pd.DataFrame, features_and_labels: FeaturesAndLabels, **kwargs):
         # prepare fields
         labels = features_and_labels.labels
+        weights = features_and_labels.weights
         encoder = lambda frame, **kwargs: frame
         label_columns = None
         joined_kwargs = join_kwargs(features_and_labels.kwargs, kwargs)
@@ -49,12 +50,18 @@ def __init__(self, df: pd.DataFrame, features_and_labels: FeaturesAndLabels, **k
             t: l if isinstance(l, TargetLabelEncoder) else IdentityEncoder(l) for t, l in labels.items()
         }).encode

+        # flatten weights for multi models
+        if isinstance(weights, Dict):
+            weights = [l for t in labels.keys() for l in weights[t]]
+
         # assign all fields
         self._features_and_labels = features_and_labels  # deprecated: copy all fields here
         self._features = features_and_labels.features
         self._labels_columns = label_columns
+
         self._labels = labels
         self._label_type = features_and_labels.label_type
+        self._weight_columns = weights
         self._targets = features_and_labels.targets
         self._gross_loss = features_and_labels.gross_loss
         self._encoder = encoder
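For clarity, the weight flattening added above behaves like this toy example (all names invented):

labels = {"target_a": ["label_a1", "label_a2"], "target_b": ["label_b1"]}
weights = {"target_a": ["w_a1", "w_a2"], "target_b": ["w_b1"]}

flat = [w for t in labels.keys() for w in weights[t]]
# -> ['w_a1', 'w_a2', 'w_b1'], weight columns ordered to match the flattened label columns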
@@ -133,19 +140,17 @@ def prediction_to_frame(self,
     def training_and_test_data(self,
                                test_size: float = 0.4,
                                youngest_size: float = None,
-                               seed: int = 42) -> Tuple[Tuple[np.ndarray,...], Tuple[np.ndarray,...]]:
+                               seed: int = 42) -> Tuple[Tuple[pd.DataFrame,...], Tuple[pd.DataFrame,...]]:
         features, labels, weights = self.features_labels_weights_df
         train_ix, test_ix = train_test_split(features.index, test_size, youngest_size, seed=seed)

         return (
-            (train_ix,
-             features.loc[train_ix].values,
-             integrate_nested_arrays(labels.loc[train_ix].values),
-             weights.loc[train_ix].values if weights is not None else None),
-            (test_ix,
-             features.loc[test_ix].values,
-             integrate_nested_arrays(labels.loc[test_ix].values),
-             weights.loc[test_ix].values if weights is not None else None)
+            (features.loc[train_ix],
+             labels.loc[train_ix],
+             weights.loc[train_ix] if weights is not None else None),
+            (features.loc[test_ix],
+             labels.loc[test_ix],
+             weights.loc[test_ix] if weights is not None else None)
         )

     @property
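With the new signature the split returns two (features, labels, weights) DataFrame triples instead of index/ndarray tuples. A hedged usage sketch, assuming an already constructed FeatureTargetLabelExtractor named extractor:

(train_f, train_l, train_w), (test_f, test_l, test_w) = extractor.training_and_test_data(test_size=0.4)

train_x = train_f.values  # materialize numpy arrays only where the model needs them
sample_weight = None if train_w is None else train_w.values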
@@ -155,11 +160,15 @@ def features_labels_weights_df(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Dat
         df_labels = self.labels_df
         index_intersect = df_features.index.intersection(df_labels.index)

+        # engineer sample weights
+        df_weights = self.weights_df
+        if df_weights is not None:
+            index_intersect = index_intersect.intersection(df_weights.index)
+
         # select only joining index values
         df_features = df_features.loc[index_intersect]
         df_labels = df_labels.loc[index_intersect]
-        # TODO add proper label weights
-        df_weights = None #pd.DataFrame(np.ones(len(df_labels)), index=df_labels.index)
+        df_weights = None if df_weights is None else df_weights.loc[index_intersect]

         # sanity check
         if not len(df_features) == len(df_labels):
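The alignment above keeps only rows where features, labels and (if configured) weights all survived their respective dropna. The same index-intersection logic in isolation (toy data):

import numpy as np
import pandas as pd

df = pd.DataFrame({"f": [1, 2, np.nan, 4],
                   "l": [1, np.nan, 3, 4],
                   "w": [1, 1, 1, np.nan]})

ix = df["f"].dropna().index.intersection(df["l"].dropna().index)
ix = ix.intersection(df["w"].dropna().index)
print(df.loc[ix])  # only row 0 has feature, label and weight all present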
@@ -177,7 +186,10 @@ def features_df(self) -> pd.DataFrame:
         feature_rescaling = self._features_and_labels.feature_rescaling

         # drop nan's and copy frame
-        df = self._df[features].dropna().copy()
+        try:
+            df = self._df[features].dropna().copy()
+        except KeyError:
+            raise KeyError(f'one of the keys >{features}< is not in: {self._df.columns}')

         # generate feature matrix
         if feature_lags is None:
@@ -221,9 +233,6 @@ def features_df(self) -> pd.DataFrame:
                 dff[col] = tmp[col]

         _log.info(f" make features ... done in {pc() - start_pc: .2f} sec!")
-
-        # finally patch the "values" property for features data frame and return
-        dff.__class__ = _RNNShapedValuesDataFrame
         return dff

     @property
@@ -245,12 +254,29 @@ def label_names(self, level_above=None) -> List[Union[Tuple[str, ...],str]]:
         return labels if level_above is None else [(level_above, col) for col in labels]

     @property
+    @lru_cache(maxsize=1)
     def labels_df(self) -> pd.DataFrame:
         # here we can do all sorts of tricks and encodings ...
         # joined_kwargs(self._features_and_labels.kwargs, self.)
-        df = self._encoder(self._df[self._labels_columns], **self._joined_kwargs).dropna().copy()
+        try:
+            df = self._df[self._labels_columns].dropna()
+        except KeyError:
+            raise KeyError(f'one of the keys >{self._labels_columns}< is not in: {self._df.columns}')
+
+        df = self._encoder(df, **self._joined_kwargs).dropna().copy()
         return df if self._label_type is None else df.astype(self._label_type)
+
+    @property
+    def weights_df(self) -> pd.DataFrame:
+        if self._weight_columns is not None:
+            try:
+                return self._df[self._weight_columns].dropna().copy()
+            except KeyError:
+                raise KeyError(f'one of the keys >{self._weight_columns}< is not in: {self._df.columns}')
+
+        else:
+            return None

     @property
     def source_df(self):
         df = self._df.copy()
@@ -306,51 +332,6 @@ def target_df(self):

         return df

-    def _fix_shape(self, df_features):
-        # features eventually are in [feature, row, time_step]
-        # but need to be in RNN shape which is [row, time_step, feature]
-        feature_arr = df_features.values if self._features_and_labels.feature_lags is None else \
-            np.array([df_features[cols].values for cols in self.feature_names], ndmin=3).swapaxes(0, 1).swapaxes(1, 2)
-
-        if len(feature_arr) <= 0:
-            _log.warning("empty feature array!")
-
-        return feature_arr
-
     def __str__(self):
         return f'min required data = {self.min_required_samples}'

-
-class _RNNShapedValuesDataFrame(pd.DataFrame):
-
-    class Loc():
-        def __init__(self, df):
-            self.df = df
-
-        def __getitem__(self, item):
-            res = self.df.loc[item]
-            res.__class__ = _RNNShapedValuesDataFrame
-            return res
-
-    @property
-    def loc(self):
-        return _RNNShapedValuesDataFrame.Loc(super(pd.DataFrame, self))
-
-    @property
-    def values(self):
-        top_level_columns = unique_top_level_columns(self)
-
-        # we need to do a sneaky trick here to get a proper "super" object as super() does not work as expected
-        # so we simply rename with an empty dict
-        df = self.rename({})
-
-        # features eventually are in [feature, row, time_step]
-        # but need to be in RNN shape which is [row, time_step, feature]
-        feature_arr = df.values if top_level_columns is None else \
-            np.array([df[feature].values for feature in top_level_columns],
-                     ndmin=3).swapaxes(0, 1).swapaxes(1, 2)
-
-        if len(feature_arr) <= 0:
-            _log.warning("empty feature array!")
-
-        return feature_arr
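The axis shuffle the removed code performed, shown in isolation: lagged features arrive as [feature, row, time_step] and are reshaped into the [row, time_step, feature] layout RNNs expect. A toy numpy sketch:

import numpy as np

n_features, n_rows, n_time_steps = 4, 100, 10
arr = np.random.rand(n_features, n_rows, n_time_steps)  # [feature, row, time_step]

rnn_shaped = arr.swapaxes(0, 1).swapaxes(1, 2)  # [row, time_step, feature]
assert rnn_shaped.shape == (n_rows, n_time_steps, n_features)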
@@ -1,6 +1,6 @@
 from copy import deepcopy

-import pandas as pd
+import pandas_ml_utils.monkey_patched_dataframe as pd
 import numpy as np
 from typing import Iterable, List, Dict, Union, Callable

2 changes: 1 addition & 1 deletion pandas_ml_utils/model/fitting/fit.py
@@ -1,6 +1,6 @@
 from typing import Any

-import pandas as pd
+import pandas_ml_utils.monkey_patched_dataframe as pd
 import os
 from pandas_ml_utils.model.models import Model
 from pandas_ml_utils.summary.summary import Summary