Add support for python 3.10 (#35)

* Replace the scipy dependency for the quantile function with the built-in one (`statistics.quantiles`)
* Add support for Python 3.10
* Prepare for new release
delftdata · Oct 18, 2021 · c5cfe9c · c5cfe9c
1 parent dfb5db3
commit c5cfe9c
Show file tree

Hide file tree

Showing 6 changed files with 17 additions and 15 deletions.
diff --git a/.github/workflows/build_all_os.yml b/.github/workflows/build_all_os.yml
@@ -12,7 +12,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
-        python-version: ['3.8', '3.9']
+        python-version: ['3.8', '3.9', '3.10']
     steps:
       - uses: actions/checkout@v2
       - name: Set up Python

diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 [![build](https://github.com/delftdata/valentine/actions/workflows/build.yml/badge.svg)](https://github.com/delftdata/valentine/actions/workflows/build.yml)
 [![codecov](https://codecov.io/gh/delftdata/valentine/branch/master/graph/badge.svg?token=4QR0X315CL)](https://codecov.io/gh/delftdata/valentine)
 [![PyPI version](https://badge.fury.io/py/valentine.svg)](https://badge.fury.io/py/valentine)
-[![Python 3.8+](https://img.shields.io/badge/python-3.8|3.9-blue.svg)](https://www.python.org/downloads/release/python-360/)
+[![Python 3.8+](https://img.shields.io/badge/python-3.8|3.9|3.10-blue.svg)](https://www.python.org/downloads/release/python-380/)
 
 A python package for capturing potential relationships among columns of different tabular datasets, which are given in the form of pandas DataFrames. Valentine is based on [Valentine: Evaluating Matching Techniques for Dataset Discovery](https://ieeexplore.ieee.org/abstract/document/9458921)
 
@@ -17,7 +17,7 @@ pip install valentine
 
 ## Installation requirements
 
-* Python 3.8, 3.9
+* Python>=3.8
 
 
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,8 +1,7 @@
 # algorithms
 numpy==1.21.2
-scipy==1.7.1
-pandas==1.3.3
-nltk==3.6.4
+pandas==1.3.4
+nltk==3.6.5
 snakecase==1.0.1
 anytree==2.8.0
 six==1.16.0

diff --git a/setup.py b/setup.py
@@ -6,19 +6,18 @@
 
 setuptools.setup(
     name='valentine',
-    version='0.1.1',
+    version='0.1.2',
     description='Valentine Matcher',
     license_files=('LICENSE',),
     author='Delft Data',
     author_email='[email protected]',
     maintainer='Delft Data',
     maintainer_email='[email protected]',
     url='https://delftdata.github.io/valentine/',
-    download_url='https://github.com/delftdata/valentine/archive/refs/tags/v0.1.1.tar.gz',
-    packages=setuptools.find_packages(exclude=('tests*',)),
+    download_url='https://github.com/delftdata/valentine/archive/refs/tags/v0.1.2.tar.gz',
+    packages=setuptools.find_packages(exclude=('tests*', 'examples*')),
     install_requires=[
         'numpy>=1.21,<2.0',
-        'scipy>=1.7,<1.8',
         'pandas>=1.3,<1.4',
         'nltk>=3.6,<3.7',
         'snakecase>=1.0,<2.0',

diff --git a/valentine/algorithms/__init__.py b/valentine/algorithms/__init__.py
@@ -11,6 +11,10 @@
 all_matchers = schema_only_algorithms + instance_only_algorithms + schema_instance_algorithms
 
 __all__ = [
+    "schema_only_algorithms",
+    "instance_only_algorithms",
+    "schema_instance_algorithms",
+    "all_matchers",
     "Coma",
     "Cupid",
     "DistributionBased",

diff --git a/valentine/algorithms/distribution_based/quantile_histogram.py b/valentine/algorithms/distribution_based/quantile_histogram.py
@@ -1,6 +1,6 @@
+from statistics import quantiles
 from numpy import ndarray
 import numpy as np
-import scipy.stats as ss
 import math
 
 
@@ -49,7 +49,7 @@ def __init__(self,
                  name: tuple,
                  ranks: ndarray,
                  normalization: int,
-                 quantiles: int,
+                 n_quantiles: int,
                  reference_hist=None):
         """
         Parameters
@@ -60,7 +60,7 @@ def __init__(self,
             The column's ranked data
         normalization : int
             The number that normalizes the histogram values
-        quantiles : int
+        n_quantiles : int
             The number of quantiles
         reference_hist : QuantileHistogram, optional
             The reference histogram that provides the bucket boundaries
@@ -69,11 +69,11 @@ def __init__(self,
         self.bucket_values = {}
         self.name = name
         self.normalization_factor = normalization
-        self.quantiles = quantiles
+        self.quantiles = n_quantiles
         self.dist_matrix = self.calc_dist_matrix()
         if reference_hist is None:
             self.add_buckets(ranks.min(initial=math.inf),
-                             ss.mstats.mquantiles(ranks, np.array(list(range(1, quantiles + 1))) / quantiles))
+                             [round(q, 3) for q in quantiles(ranks, n=self.quantiles + 1, method='inclusive')])
             self.add_values(ranks)
         else:
             self.bucket_boundaries = reference_hist.bucket_boundaries