Merge pull request #22 from KIC/feature/0.2.7

Feature/0.2.7
KIC · Aug 23, 2021 · 14fdef8 · 14fdef8
2 parents 59b7023 + cb111e7
commit 14fdef8
Show file tree

Hide file tree

Showing 74 changed files with 14,883 additions and 185 deletions.
diff --git a/Readme.md b/Readme.md
@@ -122,7 +122,10 @@ NOTE! This module is currently stalled as I mainly use pytorch at the moment.
 ...
 
 ### [pandas-ta-quant](pandas-ta-quant/Readme.md) 
-Technical analysis library
+Technical analysis library. 
+
+It is a pure Python re-implementation of the famous TA-Lib, with some custom indicators on top, e.g. GARCH. 
+Use `df.ta.help` to see all possible indicators. 
 ![Ta Plot](./.readme/images/multi_index.png)
 
 ### [pandas-ta-quant-plot](pandas-ta-quant-plot/Readme.md) 

diff --git a/bump_version.sh b/bump_version.sh
@@ -10,7 +10,7 @@ OLD_VERSION=$1
 NEW_VERSION=$2
 
 set -e
-for f in `fgrep $OLD_VERSION */* -d skip | sed -e's/\s*=\s*/:/g'`
+for f in `(fgrep $OLD_VERSION */* -d skip ; fgrep $OLD_VERSION */*/* -d skip) | sed -e's/\s*=\s*/:/g'`
 do
   array=(${f//:/ })
   echo "sed -i -E \"s/(__version__)(\s*=\s*)(${array[2]})/\1\2'$NEW_VERSION'/\" ${array[0]}"

diff --git a/notebooks/blogs/probabilistic.ipynb b/notebooks/blogs/probabilistic.ipynb
diff --git a/one-liners.sh b/one-liners.sh
@@ -0,0 +1,5 @@
+echo "script is not meant to run"
+exit 0
+
+# freeze version of a dedicated package
+find -name requirements.txt -exec sed -i -e 's/numpy.*/numpy==1.20.*/g' {} \;
diff --git a/pandas-ml-1ntegration-test-private b/pandas-ml-1ntegration-test-private
diff --git a/pandas-ml-1ntegration-test/noxfile.py b/pandas-ml-1ntegration-test/noxfile.py
@@ -1,4 +1,4 @@
-__version__ = '0.2.6'
+__version__ = '0.2.7'
 
 import os
 import shutil

diff --git a/pandas-ml-airflow/setup.py b/pandas-ml-airflow/setup.py
@@ -1,5 +1,5 @@
 """Augment pandas DataFrame with methods to fetch time series data for quant finance"""
-__version__ = '0.2.6'
+__version__ = '0.2.7'
 
 import os
 

diff --git a/pandas-ml-common/dev-requirements.frozen.txt b/pandas-ml-common/dev-requirements.frozen.txt
@@ -1,8 +1,8 @@
-ipykernel==5.5.5
+ipykernel==6.2.0
 lxml==4.6.3
 Markdown==3.3.4
-nbconvert==6.0.7
+nbconvert==6.1.0
 nbformat==5.1.3
-requests==2.25.1
+requests==2.26.0
 scikit-learn==0.24.2
-twine==3.4.1
+twine==3.4.2
diff --git a/pandas-ml-common/noxfile.py b/pandas-ml-common/noxfile.py
@@ -1,4 +1,4 @@
-__version__ = '0.2.6'
+__version__ = '0.2.7'
 
 import os
 import shutil

diff --git a/pandas-ml-common/pandas_ml_common/__init__.py b/pandas-ml-common/pandas_ml_common/__init__.py
@@ -1,5 +1,5 @@
 """Augment pandas DataFrame with methods for machine learning"""
-__version__ = '0.2.0'
+__version__ = '0.2.7'
 
 import logging
 from typing import Union, List, Callable, Any

diff --git a/pandas-ml-common/pandas_ml_common/utils/serialization_utils.py b/pandas-ml-common/pandas_ml_common/utils/serialization_utils.py
@@ -4,6 +4,7 @@
 import traceback
 
 import dill as pickle
+import pandas as pd
 
 
 def serialize(obj, filename):
@@ -75,3 +76,15 @@ def dict_to_str(d):
         from sortedcontainers import SortedDict
         return ",".join([f"{k}={v}" for k, v in SortedDict(d).items()])
 
+
+def df_to_nested_dict(df: pd.DataFrame):
+    """Convert a (possibly MultiIndex) DataFrame into nested dictionaries.
+
+    For an index with more than one level, the values of the first level
+    become the dictionary keys (visited in sorted order) and the function
+    recurses into ``df.loc[key]`` for each of them. Otherwise the frame is
+    returned as ``df.to_dict('records')``.
+
+    :param df: the data frame to convert
+    :return: a nested dict keyed by the outer index levels, or the result of
+        ``df.to_dict('records')`` when there is at most one index level
+    """
+    if isinstance(df.index, pd.MultiIndex) and df.index.nlevels > 1:
+        res = {}
+
+        # de-duplicate the first level values so that every key is handled once
+        keys = set(df.index.get_level_values(0))
+        for key in sorted(keys):
+            # NOTE(review): relies on df.loc[key] returning the sub frame without the first level -- confirm
+            res[key] = df_to_nested_dict(df.loc[key])
+
+        return res
+    else:
+        # NOTE: the 'records' orientation holds column -> value pairs only, the index values are not included
+        return df.to_dict('records')
diff --git a/pandas-ml-common/pandas_ml_common_test/unit/utils/test__serialization_utils.py b/pandas-ml-common/pandas_ml_common_test/unit/utils/test__serialization_utils.py
@@ -0,0 +1,14 @@
+from unittest import TestCase
+
+import pandas as pd
+import numpy as np
+
+from pandas_ml_common.utils.serialization_utils import df_to_nested_dict
+
+
+class TestSerializationUtils(TestCase):
+    """Tests for the helpers in serialization_utils."""
+
+    def test_nested_dict(self):
+        # 27 random values on a 3-level index (3 x 3 x 3 combinations)
+        df = pd.DataFrame({"a": np.random.rand(27)})
+        df.index = pd.MultiIndex.from_tuples([(a, b, c) for a in range(3) for b in range(3) for c in range(3)])
+        # NOTE(review): the result is only printed, nothing is asserted -- this
+        # test can only fail if df_to_nested_dict raises
+        print(df_to_nested_dict(df))
diff --git a/pandas-ml-common/requirements.frozen.txt b/pandas-ml-common/requirements.frozen.txt
@@ -1,5 +1,5 @@
 dill==0.3.4
 numpy==1.20.3
-pandas==1.2.4
+pandas==1.3.2
 scikit-learn==0.24.2
 sortedcontainers==2.4.0
diff --git a/pandas-ml-common/requirements.txt b/pandas-ml-common/requirements.txt
@@ -1,5 +1,5 @@
 dill
-numpy>=1.20.2
+numpy==1.20.*
 pandas
 scikit-learn
 sortedcontainers
diff --git a/pandas-ml-common/setup.py b/pandas-ml-common/setup.py
@@ -1,5 +1,5 @@
 """Augment pandas DataFrame with methods for machine learning"""
-__version__ = '0.2.6'
+__version__ = '0.2.7'
 
 import os
 import re

diff --git a/pandas-ml-quant-rl/pandas_ml_quant_rl/__init__.py b/pandas-ml-quant-rl/pandas_ml_quant_rl/__init__.py
@@ -1,2 +1,4 @@
+__version__ = '0.2.7'
+
 from .environments import *
 from .model.agent import Agent
diff --git a/pandas-ml-quant-rl/setup.py b/pandas-ml-quant-rl/setup.py
@@ -1,5 +1,5 @@
 """Augment pandas DataFrame with methods to fetch time series data for quant finance"""
-__version__ = '0.2.6'
+__version__ = '0.2.7'
 
 import os
 

diff --git a/pandas-ml-quant/dev-requirements.frozen.txt b/pandas-ml-quant/dev-requirements.frozen.txt
@@ -1,11 +1,11 @@
-ipykernel==5.5.5
-jupyter-client==6.1.12
+ipykernel==6.2.0
+jupyter-client==7.0.1
 lxml==4.6.3
 Markdown==3.3.4
-matplotlib==3.4.2
+matplotlib==3.4.3
 mplfinance==0.12.7a17
-nbconvert==6.0.7
-requests==2.25.1
-TA-Lib==0.4.20
-tox==3.23.1
-twine==3.4.1
+nbconvert==6.1.0
+requests==2.26.0
+TA-Lib==0.4.21
+tox==3.24.3
+twine==3.4.2
diff --git a/pandas-ml-quant/noxfile.py b/pandas-ml-quant/noxfile.py
@@ -1,4 +1,4 @@
-__version__ = '0.2.6'
+__version__ = '0.2.7'
 
 import os
 import shutil

diff --git a/pandas-ml-quant/pandas_ml_quant/__init__.py b/pandas-ml-quant/pandas_ml_quant/__init__.py
@@ -1,5 +1,5 @@
 """Augment pandas DataFrame with methods for quant analysis"""
-__version__ = '0.2.0'
+__version__ = '0.2.7'
 
 import importlib
 

diff --git a/pandas-ml-quant/pandas_ml_quant/empirical.py b/pandas-ml-quant/pandas_ml_quant/empirical.py
@@ -14,6 +14,14 @@ def __init__(self, x):
     def hist(self, bins='sqrt'):
         return np.histogram(self.x, bins=bins, density=True)
 
+    def extreme(self):
+        """Return the center of the histogram bin with the highest density.
+
+        Uses the default binning of ``hist()``.
+        """
+        hist, edges = self.hist()
+        # index of the fullest bin; np.argmax returns the first one on ties
+        iext = np.argmax(hist)
+        # bin iext spans edges[iext] .. edges[iext + 1], return its mid point
+        return (edges[iext] + edges[iext + 1]) / 2
+
+    def std(self):
+        """Return ``np.std`` of the samples ``self.x`` (numpy default arguments)."""
+        return np.std(self.x)
+
     def confidence_interval(self, lower: float, upper: float) -> Tuple[float, float]:
         # a[i-1] < v <= a[i]
         li = np.searchsorted(self.probs, lower, side='right')
@@ -29,6 +37,10 @@ def get_val(idx, prob):
 
         return get_val(li, lower), get_val(ri, upper)
 
+    def confidence_band_width(self, lower: float, upper: float) -> float:
+        """Return the width of the confidence interval relative to its upper bound.
+
+        :param lower: passed as lower bound to ``confidence_interval``
+        :param upper: passed as upper bound to ``confidence_interval``
+        :return: ``(u - l) / u`` for the interval ``(l, u)`` returned by ``confidence_interval``
+        """
+        l, u = self.confidence_interval(lower, upper)
+        # NOTE(review): not defined for u == 0 and the sign flips for u < 0 -- confirm callers never hit this
+        return (u - l) / u
+
     def heat_bar(self, bins=21) -> Tuple[np.ndarray, np.ndarray]:
         # return a 2D array [value, mass]
         mass, edges = np.histogram(self.x, bins=bins, density=True)

diff --git a/pandas-ml-quant/pandas_ml_quant/model/summary/price_prediction_summary.py b/pandas-ml-quant/pandas_ml_quant/model/summary/price_prediction_summary.py
@@ -306,8 +306,16 @@ def calc_scores(self, *args, **kwargs):
         dfcdf = self.cdf.to_frame().dropna()
         idx = dfcdf.index.intersection(self.label_returns.index)
         dfl = self.label_returns.loc[idx]
+        dfpp = self.cdf[idx].apply(lambda cdf: cdf.extreme())
+        mean_std = self.cdf[idx].apply(lambda cdf: cdf.std()).mean()
         nr_events = len(dfcdf)
 
+        direction_correct_ratio = \
+            (((dfl.values.squeeze() > 0) & (dfpp.values.squeeze() > 0)) | ((dfl.values.squeeze() < 0) & (dfpp.values.squeeze() < 0))).sum() / len(dfl)
+
+        corr, _ = pearsonr(dfl.values.flatten(), dfpp.values.flatten())
+        r2 = r2_score(dfl, dfpp)
+
         tail_events = dfcdf.join(dfl).apply(
             lambda x: x.iloc[0].is_tail_event(x.iloc[1], self.left_confidence, self.right_confidence),
             axis=1,
@@ -324,11 +332,22 @@ def calc_scores(self, *args, **kwargs):
             axis=1,
             result_type='expand').mean()
 
+        # how wide is the confidence interval, the smaller the better
+        band_width = dfcdf.apply(
+            lambda cdf: cdf.iloc[0].confidence_band_width(self.left_confidence, self.right_confidence),
+            axis=1,
+            result_type='expand').mean()
+
         return pd.DataFrame({
             "first date": [dfcdf.index[0]],
             "last date": [dfcdf.index[-1]],
             "events": [nr_events],
+            "direction correct ratio of extreme": [direction_correct_ratio],
+            "correlation of extreme": [corr],
+            "r^2 of extreme": [r2],
+            "mean(σ)": [mean_std],
             f"confidence (exp: {self.expected_confidence:.2f} %)": [1 - tail_events.values.sum().item() / nr_events],
+            "conf width": [band_width],
             "left tail avg. distance %": [np.abs(distance.iloc[0])],
             "right tail avg. distance %": [distance.iloc[1]],
             "left tail events %": [tail_events.iloc[:, 0].sum() / nr_events],

diff --git a/pandas-ml-quant/pandas_ml_quant_test/model/test_fnl_postprocessor.py b/pandas-ml-quant/pandas_ml_quant_test/model/test_fnl_postprocessor.py
@@ -1,5 +1,6 @@
 from unittest import TestCase
 
+from pandas_ml_common.decorator import MultiFrameDecorator
 from pandas_ml_quant_test.config import DF_TEST
 from pandas_ml_utils import PostProcessedFeaturesAndLabels, Constant, np
 from pandas_ml_utils.ml.data.extraction.features_and_labels_extractor import FeaturesWithLabels
@@ -61,6 +62,24 @@ def test_feature_and_label_post_processing(self):
         self.assertEqual((6672, 4), fl.labels.shape)
         # FIXME implement chained lagging: self.assertEqual((6674, 3, 5, 2), fl.features_with_required_samples.features._.values.shape)
 
+    def test_empty_post_prcessor(self):
+        # extraction with an empty feature_post_processor list
+        # NOTE(review): "prcessor" in the test name looks like a typo of "processor"
+        df = DF_TEST.copy()
+
+        fl: FeaturesWithLabels = df._.extract(
+            PostProcessedFeaturesAndLabels(
+                features=[
+                    lambda df: df["Close"].ta.log_returns(),
+                ],
+                feature_post_processor=[],
+                labels=[Constant(0)],
+            )
+        )
+
+        f = fl.features_with_required_samples.features
+
+        self.assertEqual((6762, 1), f.shape)
+        # the extracted features have to equal the plain log returns of the Close column
+        np.testing.assert_array_almost_equal(df[["Close"]].ta.log_returns().dropna().values, f.values)
+
     def test_post_row_standardisation(self):
         df = DF_TEST.copy()
 
@@ -81,11 +100,52 @@ def test_post_row_standardisation(self):
 
         f = fl.features_with_required_samples.features
 
+        self.assertEqual((6659, 20 * 3), f.shape)
         self.assertAlmostEqual(1, f.max(axis=1).values.max())
         self.assertAlmostEqual(0, f.min(axis=1).values.max())
         self.assertEqual(
             {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
              29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
              56, 57, 58, 59},
             set(f.apply(np.argmax, axis=1).values)
-        )
+        )
+
+    def test_multiple_features_post_processing(self):
+        # two feature sets passed as a tuple, together with a tuple of two post processor lists
+        # NOTE(review): presumably each post processor list is applied to the feature list at the same position -- confirm
+        df = DF_TEST.copy()
+
+        fl: FeaturesWithLabels = df._.extract(
+            PostProcessedFeaturesAndLabels(
+                features=(
+                    [lambda df: df["Close"].ta.log_returns(), lambda df: df["Close"].ta.trix(), lambda df: df["Close"].ta.rsi()],
+                    [lambda df: df["Close"].ta.rsi()],
+                ),
+                feature_post_processor=(
+                    [lambda df: df.ta.rnn(20), lambda df: df.ta.normalize_row('minmax01', level=1)],
+                    [lambda df: df.ta.rnn(10)],
+                ),
+                labels=[Constant(0)],
+            )
+        )
+
+        f = fl.features_with_required_samples.features
+        # the features are expected as a MultiFrameDecorator which unpacks into two frames
+        self.assertIsInstance(f, MultiFrameDecorator)
+
+        a, b = f.frames()
+
+        # test a
+        # same expectations as in test_post_row_standardisation: 3 features x 20 columns, every row within [0, 1]
+        self.assertEqual((6659, 20 * 3), a.shape)
+        self.assertAlmostEqual(1, a.max(axis=1).values.max())
+        self.assertAlmostEqual(0, a.min(axis=1).values.max())
+        # every one of the 60 columns holds the row maximum at least once
+        self.assertEqual(
+            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
+             29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+             56, 57, 58, 59},
+            set(a.apply(np.argmax, axis=1).values)
+        )
+
+        # test b
+        # 10 columns from rnn(10); the last row is pinned against hard coded reference values
+        self.assertEqual((6659, 10), b.shape)
+        np.testing.assert_array_almost_equal(
+            b.values[-1],
+            [0.580079, 0.561797, 0.501477, 0.58181 , 0.716154, 0.789371, 0.762768, 0.74797 , 0.687273, 0.666173]
+        )
diff --git a/pandas-ml-quant/pandas_ml_quant_test/test__empirical.py b/pandas-ml-quant/pandas_ml_quant_test/test__empirical.py
@@ -16,3 +16,5 @@ def test_empirical_probs(self):
 
         np.testing.assert_array_almost_equal(ecdf.heat_bar(6)[0], np.array([1., 6.]))
         np.testing.assert_array_almost_equal(ecdf.heat_bar(6)[1], np.array([0.12, 0.24, 0.36, 0.24, 0.12, 0.12]))
+
+        self.assertEqual(ecdf.confidence_band_width(0.1, 0.9), (6. - 2.) / 6.)
diff --git a/pandas-ml-quant/requirements.frozen.txt b/pandas-ml-quant/requirements.frozen.txt
@@ -1,19 +1,19 @@
-arch==4.19
+arch==5.0.1
 bintrees==2.2.0
 colorama==0.4.4
 cvxopt==1.2.6
-cvxpy==1.1.13
+cvxpy==1.1.15
 ipywidgets==7.6.3
-matplotlib==3.4.2
+matplotlib==3.4.3
 mgarch-setup-fix==0.2.0
 MiniSom==2.2.9
 mlxtend==0.18.0
 moviepy==1.0.3
-numba==0.53.1
+numba==0.54.0
 numpy==1.20.3
-pandas==1.2.4
+pandas==1.3.2
 pyts==0.11.0
 PyWavelets==1.1.1
 qpsolvers==1.6.1
-seaborn==0.11.1
+seaborn==0.11.2
 sortedcontainers==2.4.0
diff --git a/pandas-ml-quant/requirements.txt b/pandas-ml-quant/requirements.txt
@@ -11,7 +11,7 @@ mgarch-setup-fix
 mlxtend
 moviepy
 numba
-numpy
+numpy==1.20.*
 pandas
 pyts
 PyWavelets

diff --git a/pandas-ml-quant/setup.py b/pandas-ml-quant/setup.py
@@ -1,5 +1,5 @@
 """Augment pandas DataFrame with methods for quant analysis"""
-__version__ = '0.2.6'
+__version__ = '0.2.7'
 import os
 import re
 

diff --git a/pandas-ml-utils-tf/setup.py b/pandas-ml-utils-tf/setup.py
@@ -1,5 +1,5 @@
 """Augment pandas DataFrame with methods for machine learning"""
-__version__ = '0.2.6'
+__version__ = '0.2.7'
 
 import os
 from setuptools import setup, find_packages