Feature/feature1 #26

Open
wants to merge 12 commits into base: master
8 changes: 7 additions & 1 deletion README.md
@@ -233,8 +233,14 @@ df.predict(pmu.Model.load("/tmp/burrito.model")).tail()
 * add whatever you need for yourself and share it with us

 ## Change Log

+### 0.0.27
+* I made my life easier, now I just do "from pandas_ml_utils import pd, np, FeaturesAndLabels, ..."
+* features and labels can now handle multiple dimensions, as a cell may contain another numpy array
+* introduced sample weights which can be passed to the fit function, e.g. for a Keras fit
+
 ### 0.0.25 / 26
-* refactored how traing and test data sets are split
+* refactored how training and test data sets are split
 * allow to control the amount of young test data being used (useful for time series)
 * add sample weights i.e. to penalize loss per sample in a keras model

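The new sample_weights feature from the 0.0.27 notes can be sketched as follows. This is a hypothetical usage example, not code from the PR; the frame, column names and data are invented, while pmu.pd, pmu.FeaturesAndLabels and the sample_weights argument come from the changes below:

import numpy as np
import pandas_ml_utils as pmu

# toy frame with one feature, one label and a per-sample weight column (all names invented)
df = pmu.pd.DataFrame({
    "feature_a": np.random.rand(100),
    "label": np.random.randint(0, 2, 100),
    "weight": np.random.rand(100),
})

fal = pmu.FeaturesAndLabels(
    features=["feature_a"],
    labels=["label"],
    sample_weights="weight",  # new in 0.0.27; forwarded to the model's fit, e.g. a Keras fit
)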
62 changes: 31 additions & 31 deletions pandas_ml_utils/__init__.py
@@ -1,56 +1,56 @@
 """Augment pandas DataFrame with methods for machine learning"""
-__version__ = '0.0.26'
+__version__ = '0.0.27'

 import logging
-import pandas as pd

 # imports to provide functionality via root import like import pandas_ml_utils as pmu; pmu.XY
+from pandas.core.base import PandasObject as _PandasObject
+
+import numpy as np
+import pandas as pd
+import pandas_ml_utils.pandas_utils_extension as _df_ext
+from pandas_ml_utils.analysis.correlation_analysis import plot_correlation_matrix as _plot_correlation_matrix
+from pandas_ml_utils.analysis.selection import feature_selection as _feature_selection
+from pandas_ml_utils.datafetching import fetch_cryptocompare_hourly as _fetch_cryptocompare_hourly, \
+    fetch_cryptocompare_daily as _fetch_cryptocompare_daily, fetch_yahoo as _fetch_yahoo
+from pandas_ml_utils.model.features_and_labels.features_and_labels import FeaturesAndLabels
+from pandas_ml_utils.model.fitting.fitter import fit as _fit, predict as _predict, backtest as _backtest, \
+    features_and_label_extractor as _features_and_label_extractor
 from pandas_ml_utils.model.models import Model, SkModel, KerasModel, MultiModel
 from pandas_ml_utils.wrappers.lazy_dataframe import LazyDataFrame
-from pandas_ml_utils.model.features_and_labels.features_and_labels import FeaturesAndLabels

-# imports only used to augment pandas classes
-from pandas_ml_utils.pandas_utils_extension import inner_join, drop_re, drop_zero_or_nan, add_apply, shift_inplace, \
-    extend_forecast, cloc2
-from pandas_ml_utils.analysis.correlation_analysis import plot_correlation_matrix
-from pandas_ml_utils.datafetching.fetch_yahoo import fetch_yahoo
-from pandas_ml_utils.model.fitting.fitter import fit, predict, backtest, features_and_label_extractor
-from pandas_ml_utils.analysis.selection import feature_selection
-from pandas.core.base import PandasObject
-from pandas_ml_utils.datafetching.fetch_cryptocompare import fetch_cryptocompare_daily, fetch_cryptocompare_hourly


 # log provided classes
 _log = logging.getLogger(__name__)
 _log.debug(f"available {Model} classes {[SkModel, KerasModel, MultiModel]}")
 _log.debug(f"available other classes {[LazyDataFrame, FeaturesAndLabels]}")
+_log.debug(f"numpy version {np.__version__}")
+_log.debug(f"pandas version {pd.__version__}")

-# add functions to pandas
+# general utility functions
-PandasObject.cloc2 = cloc2
-PandasObject.inner_join = inner_join
-PandasObject.drop_re = drop_re
-PandasObject.drop_zero_or_nan = drop_zero_or_nan
-PandasObject.add_apply = add_apply
-PandasObject.shift_inplace = shift_inplace
-PandasObject.extend_forecast = extend_forecast
+_PandasObject.cloc2 = _df_ext.cloc2
+_PandasObject.inner_join = _df_ext.inner_join
+_PandasObject.drop_re = _df_ext.drop_re
+_PandasObject.drop_zero_or_nan = _df_ext.drop_zero_or_nan
+_PandasObject.add_apply = _df_ext.add_apply
+_PandasObject.shift_inplace = _df_ext.shift_inplace
+_PandasObject.extend_forecast = _df_ext.extend_forecast

 # feature selection
-PandasObject.plot_correlation_matrix = plot_correlation_matrix
-PandasObject.feature_selection = feature_selection
+_PandasObject.plot_correlation_matrix = _plot_correlation_matrix
+_PandasObject.feature_selection = _feature_selection

 # provide fit, predict and backtest method
-PandasObject.fit = fit
-PandasObject.predict = predict
-PandasObject.backtest = backtest
+_PandasObject.fit = _fit
+_PandasObject.predict = _predict
+_PandasObject.backtest = _backtest

 # also provide the plain features and labels extractor
-PandasObject.features_and_label_extractor = features_and_label_extractor
+_PandasObject.features_and_label_extractor = _features_and_label_extractor

 # data fetcher
-setattr(pd, 'fetch_yahoo', fetch_yahoo)
-setattr(pd, 'fetch_cryptocompare_daily', fetch_cryptocompare_daily)
-setattr(pd, 'fetch_cryptocompare_hourly', fetch_cryptocompare_hourly)
+setattr(pd, 'fetch_yahoo', _fetch_yahoo)
+setattr(pd, 'fetch_cryptocompare_daily', _fetch_cryptocompare_daily)
+setattr(pd, 'fetch_cryptocompare_hourly', _fetch_cryptocompare_hourly)

 __doc__ = """
 The main concept is to extend pandas DataFrame objects such that you can apply any statistical or machine learning
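To make the augmentation pattern above concrete: assigning a function to pandas' PandasObject exposes it as a method on every DataFrame and Series, which is exactly how fit, predict and friends land on data frames. A minimal self-contained sketch of the same idiom (demean is an invented example, not part of the library):

import pandas as pd
from pandas.core.base import PandasObject

def demean(df):
    # subtract each column's mean from the column
    return df - df.mean()

# after this assignment every DataFrame and Series exposes .demean()
PandasObject.demean = demean

print(pd.DataFrame({"a": [1.0, 2.0, 3.0]}).demean())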
2 changes: 1 addition & 1 deletion pandas_ml_utils/analysis/correlation_analysis.py
@@ -1,4 +1,4 @@
-import pandas as pd
+import pandas_ml_utils.monkey_patched_dataframe as pd
 from typing import Tuple


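From here on the PR replaces `import pandas as pd` with `import pandas_ml_utils.monkey_patched_dataframe as pd` throughout. That module's contents are not shown in this diff; presumably it re-exports the pandas namespace so the pd alias remains a drop-in replacement. A purely hypothetical sketch of such a shim:

# hypothetical monkey_patched_dataframe.py (NOT shown in this PR)
# re-export the pandas API so that `import ... as pd` keeps working unchanged
from pandas import *                  # noqa: F401,F403
from pandas import DataFrame, Series  # explicit names, e.g. for type hints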
2 changes: 1 addition & 1 deletion pandas_ml_utils/analysis/selection.py
@@ -1,6 +1,6 @@
 import logging
 import numpy as np
-import pandas as pd
+import pandas_ml_utils.monkey_patched_dataframe as pd
 from typing import List, Iterable, Union, Tuple
 from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
 from pandas_ml_utils.analysis.correlation_analysis import plot_correlation_matrix, _sort_correlation, _plot_heatmap
2 changes: 2 additions & 0 deletions pandas_ml_utils/datafetching/__init__.py
@@ -0,0 +1,2 @@
+from .fetch_cryptocompare import *
+from .fetch_yahoo import *
2 changes: 1 addition & 1 deletion pandas_ml_utils/datafetching/fetch_cryptocompare.py
@@ -1,7 +1,7 @@
 import datetime

 import cachetools
-import pandas as pd
+import pandas_ml_utils.monkey_patched_dataframe as pd
 import pytz

 from pandas_ml_utils.extern.cryptocompare import CURR, LIMIT, TIME, get_historical_price_day, get_historical_price_hour
2 changes: 1 addition & 1 deletion pandas_ml_utils/datafetching/fetch_yahoo.py
@@ -2,7 +2,7 @@
 import traceback

 import cachetools.func
-import pandas as pd
+import pandas_ml_utils.monkey_patched_dataframe as pd

 from ..pandas_utils_extension import inner_join
 from ..utils.functions import join_kwargs
2 changes: 1 addition & 1 deletion pandas_ml_utils/extern/cryptocompare.py
@@ -4,7 +4,7 @@
 import logging
 import time

-import cachetools
+import cachetools.func
 import requests

 _log = logging.getLogger(__name__)
@@ -4,7 +4,7 @@
 from typing import List, Callable, Iterable, Dict, Type, Tuple, Union, Any

 import numpy as np
-import pandas as pd
+import pandas_ml_utils.monkey_patched_dataframe as pd

 from pandas_ml_utils.model.features_and_labels.target_encoder import TargetLabelEncoder
 from pandas_ml_utils.utils.functions import join_kwargs
@@ -27,6 +27,7 @@ def __init__(self,
                  features: List[str],
                  labels: _LABELS,
                  label_type: Type = None,
+                 sample_weights: Union[Dict[str, str], str] = None,
                  gross_loss: Callable[[str, pd.DataFrame], Union[pd.Series, pd.DataFrame]] = None,
                  targets: Callable[[str, pd.DataFrame], Union[pd.Series, pd.DataFrame]] = None,
                  feature_lags: Iterable[int] = None,
@@ -43,6 +44,8 @@
                            the average was. It is also possible to provide a Callable[[df, ...magic], labels] which returns
                            the expected data structure.
         :param label_type: whether to treat a label as int, float, bool
+        :param sample_weights: sample weights get passed to the model.fit function. In Keras, for example, this can
+                           be used for imbalanced classes
         :param gross_loss: expects a callable[[df, target, ...magic], df] which receives the source data frame and a
                            target (or None) and should return a series or data frame. Let's say you want to classify
                            whether a printer is jamming the next page or not. Halting and servicing the printer costs
@@ -68,6 +71,7 @@
"""
self._features = features
self._labels = labels
self._weights = sample_weights
self._targets = targets
self._gross_loss = gross_loss
self.label_type = label_type
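Since the docstring calls out Keras class imbalance as the motivating case, here is how per-sample weights behave in plain Keras, independent of this library (standard keras API; data and weighting scheme invented):

import numpy as np
from tensorflow import keras

model = keras.Sequential([keras.Input(shape=(3,)), keras.layers.Dense(1, activation="sigmoid")])
model.compile(loss="binary_crossentropy", optimizer="adam")

x = np.random.rand(100, 3)
y = np.random.randint(0, 2, 100)
w = np.where(y == 1, 5.0, 1.0)  # up-weight the rarer positive class

# each sample's loss contribution is scaled by its weight
model.fit(x, y, sample_weight=w, epochs=1, verbose=0)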
@@ -89,6 +93,10 @@ def features(self):
     def labels(self):
         return self._labels

+    @property
+    def weights(self):
+        return self._weights
+
     @property
     def targets(self):
         return self._targets
@@ -4,7 +4,7 @@
 from typing import Tuple, Dict, Union, List

 import numpy as np
-import pandas as pd
+import pandas_ml_utils.monkey_patched_dataframe as pd
 from sortedcontainers import SortedDict

 from pandas_ml_utils.constants import *
@@ -24,6 +24,7 @@ class FeatureTargetLabelExtractor(object):
     def __init__(self, df: pd.DataFrame, features_and_labels: FeaturesAndLabels, **kwargs):
         # prepare fields
         labels = features_and_labels.labels
+        weights = features_and_labels.weights
         encoder = lambda frame, **kwargs: frame
         label_columns = None
         joined_kwargs = join_kwargs(features_and_labels.kwargs, kwargs)
@@ -49,12 +50,18 @@ def __init__(self, df: pd.DataFrame, features_and_labels: FeaturesAndLabels, **k
             t: l if isinstance(l, TargetLabelEncoder) else IdentityEncoder(l) for t, l in labels.items()
         }).encode

+        # flatten weights for multi models
+        if isinstance(weights, Dict):
+            weights = [l for t in labels.keys() for l in weights[t]]
+
         # assign all fields
         self._features_and_labels = features_and_labels  # deprecated: copy all fields here
         self._features = features_and_labels.features
         self._labels_columns = label_columns
+
         self._labels = labels
         self._label_type = features_and_labels.label_type
+        self._weight_columns = weights
         self._targets = features_and_labels.targets
         self._gross_loss = features_and_labels.gross_loss
         self._encoder = encoder
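For clarity, the weight flattening added above behaves like this toy example (all names invented):

labels = {"target_a": ["label_a1", "label_a2"], "target_b": ["label_b1"]}
weights = {"target_a": ["w_a1", "w_a2"], "target_b": ["w_b1"]}

flat = [w for t in labels.keys() for w in weights[t]]
# -> ['w_a1', 'w_a2', 'w_b1'], weight columns ordered to match the flattened label columns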
@@ -133,19 +140,17 @@ def prediction_to_frame(self,
     def training_and_test_data(self,
                                test_size: float = 0.4,
                                youngest_size: float = None,
-                               seed: int = 42) -> Tuple[Tuple[np.ndarray,...], Tuple[np.ndarray,...]]:
+                               seed: int = 42) -> Tuple[Tuple[pd.DataFrame,...], Tuple[pd.DataFrame,...]]:
         features, labels, weights = self.features_labels_weights_df
         train_ix, test_ix = train_test_split(features.index, test_size, youngest_size, seed=seed)

         return (
-            (train_ix,
-             features.loc[train_ix].values,
-             integrate_nested_arrays(labels.loc[train_ix].values),
-             weights.loc[train_ix].values if weights is not None else None),
-            (test_ix,
-             features.loc[test_ix].values,
-             integrate_nested_arrays(labels.loc[test_ix].values),
-             weights.loc[test_ix].values if weights is not None else None)
+            (features.loc[train_ix],
+             labels.loc[train_ix],
+             weights.loc[train_ix] if weights is not None else None),
+            (features.loc[test_ix],
+             labels.loc[test_ix],
+             weights.loc[test_ix] if weights is not None else None)
         )

     @property
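With the new signature the split returns two (features, labels, weights) DataFrame triples instead of index/ndarray tuples. A hedged usage sketch, assuming an already constructed FeatureTargetLabelExtractor named extractor:

(train_f, train_l, train_w), (test_f, test_l, test_w) = extractor.training_and_test_data(test_size=0.4)

train_x = train_f.values  # materialize numpy arrays only where the model needs them
sample_weight = None if train_w is None else train_w.values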
@@ -155,11 +160,15 @@ def features_labels_weights_df(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Dat
         df_labels = self.labels_df
         index_intersect = df_features.index.intersection(df_labels.index)

+        # engineer sample weights
+        df_weights = self.weights_df
+        if df_weights is not None:
+            index_intersect = index_intersect.intersection(df_weights.index)
+
         # select only joining index values
         df_features = df_features.loc[index_intersect]
         df_labels = df_labels.loc[index_intersect]
-        # TODO add proper label weights
-        df_weights = None #pd.DataFrame(np.ones(len(df_labels)), index=df_labels.index)
+        df_weights = None if df_weights is None else df_weights.loc[index_intersect]

         # sanity check
         if not len(df_features) == len(df_labels):
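The alignment above keeps only rows where features, labels and (if configured) weights all survived their respective dropna. The same index-intersection logic in isolation (toy data):

import numpy as np
import pandas as pd

df = pd.DataFrame({"f": [1, 2, np.nan, 4],
                   "l": [1, np.nan, 3, 4],
                   "w": [1, 1, 1, np.nan]})

ix = df["f"].dropna().index.intersection(df["l"].dropna().index)
ix = ix.intersection(df["w"].dropna().index)
print(df.loc[ix])  # only row 0 has feature, label and weight all present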
@@ -177,7 +186,10 @@ def features_df(self) -> pd.DataFrame:
         feature_rescaling = self._features_and_labels.feature_rescaling

         # drop nan's and copy frame
-        df = self._df[features].dropna().copy()
+        try:
+            df = self._df[features].dropna().copy()
+        except KeyError:
+            raise KeyError(f'one of the keys >{features}< is not in: {self._df.columns}')

         # generate feature matrix
         if feature_lags is None:
@@ -221,9 +233,6 @@ def features_df(self) -> pd.DataFrame:
                 dff[col] = tmp[col]

         _log.info(f" make features ... done in {pc() - start_pc: .2f} sec!")
-
-        # finally patch the "values" property for features data frame and return
-        dff.__class__ = _RNNShapedValuesDataFrame
         return dff

     @property
@@ -245,12 +254,29 @@ def label_names(self, level_above=None) -> List[Union[Tuple[str, ...],str]]:
         return labels if level_above is None else [(level_above, col) for col in labels]

     @property
+    @lru_cache(maxsize=1)
     def labels_df(self) -> pd.DataFrame:
         # here we can do all sorts of tricks and encodings ...
         # joined_kwargs(self._features_and_labels.kwargs, self.)
-        df = self._encoder(self._df[self._labels_columns], **self._joined_kwargs).dropna().copy()
+        try:
+            df = self._df[self._labels_columns].dropna()
+        except KeyError:
+            raise KeyError(f'one of the keys >{self._labels_columns}< is not in: {self._df.columns}')
+
+        df = self._encoder(df, **self._joined_kwargs).dropna().copy()
         return df if self._label_type is None else df.astype(self._label_type)
+
+    @property
+    def weights_df(self) -> pd.DataFrame:
+        if self._weight_columns is not None:
+            try:
+                return self._df[self._weight_columns].dropna().copy()
+            except KeyError:
+                raise KeyError(f'one of the keys >{self._weight_columns}< is not in: {self._df.columns}')
+
+        else:
+            return None

     @property
     def source_df(self):
         df = self._df.copy()
@@ -306,51 +332,6 @@ def target_df(self):

         return df

-    def _fix_shape(self, df_features):
-        # features eventually are in [feature, row, time_step]
-        # but need to be in RNN shape which is [row, time_step, feature]
-        feature_arr = df_features.values if self._features_and_labels.feature_lags is None else \
-            np.array([df_features[cols].values for cols in self.feature_names], ndmin=3).swapaxes(0, 1).swapaxes(1, 2)
-
-        if len(feature_arr) <= 0:
-            _log.warning("empty feature array!")
-
-        return feature_arr
-
     def __str__(self):
         return f'min required data = {self.min_required_samples}'

-
-class _RNNShapedValuesDataFrame(pd.DataFrame):
-
-    class Loc():
-        def __init__(self, df):
-            self.df = df
-
-        def __getitem__(self, item):
-            res = self.df.loc[item]
-            res.__class__ = _RNNShapedValuesDataFrame
-            return res
-
-    @property
-    def loc(self):
-        return _RNNShapedValuesDataFrame.Loc(super(pd.DataFrame, self))
-
-    @property
-    def values(self):
-        top_level_columns = unique_top_level_columns(self)
-
-        # we need to do a sneaky trick here to get a proper "super" object as super() does not work as expected
-        # so we simply rename with an empty dict
-        df = self.rename({})
-
-        # features eventually are in [feature, row, time_step]
-        # but need to be in RNN shape which is [row, time_step, feature]
-        feature_arr = df.values if top_level_columns is None else \
-            np.array([df[feature].values for feature in top_level_columns],
-                     ndmin=3).swapaxes(0, 1).swapaxes(1, 2)
-
-        if len(feature_arr) <= 0:
-            _log.warning("empty feature array!")
-
-        return feature_arr
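The axis shuffle the removed code performed, shown in isolation: lagged features arrive as [feature, row, time_step] and are reshaped into the [row, time_step, feature] layout RNNs expect. A toy numpy sketch:

import numpy as np

n_features, n_rows, n_time_steps = 4, 100, 10
arr = np.random.rand(n_features, n_rows, n_time_steps)  # [feature, row, time_step]

rnn_shaped = arr.swapaxes(0, 1).swapaxes(1, 2)  # [row, time_step, feature]
assert rnn_shaped.shape == (n_rows, n_time_steps, n_features)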
@@ -1,6 +1,6 @@
 from copy import deepcopy

-import pandas as pd
+import pandas_ml_utils.monkey_patched_dataframe as pd
 import numpy as np
 from typing import Iterable, List, Dict, Union, Callable

2 changes: 1 addition & 1 deletion pandas_ml_utils/model/fitting/fit.py
@@ -1,6 +1,6 @@
 from typing import Any

-import pandas as pd
+import pandas_ml_utils.monkey_patched_dataframe as pd
 import os
 from pandas_ml_utils.model.models import Model
 from pandas_ml_utils.summary.summary import Summary