diff --git a/.github/workflows/build_all_os.yml b/.github/workflows/build_all_os.yml index 175ea19..fd0388c 100644 --- a/.github/workflows/build_all_os.yml +++ b/.github/workflows/build_all_os.yml @@ -12,7 +12,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ['3.8', '3.9'] + python-version: ['3.8', '3.9', '3.10'] steps: - uses: actions/checkout@v2 - name: Set up Python diff --git a/README.md b/README.md index 240ee30..5c6cf95 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![build](https://github.com/delftdata/valentine/actions/workflows/build.yml/badge.svg)](https://github.com/delftdata/valentine/actions/workflows/build.yml) [![codecov](https://codecov.io/gh/delftdata/valentine/branch/master/graph/badge.svg?token=4QR0X315CL)](https://codecov.io/gh/delftdata/valentine) [![PyPI version](https://badge.fury.io/py/valentine.svg)](https://badge.fury.io/py/valentine) -[![Python 3.8+](https://img.shields.io/badge/python-3.8|3.9-blue.svg)](https://www.python.org/downloads/release/python-360/) +[![Python 3.8+](https://img.shields.io/badge/python-3.8|3.9|3.10-blue.svg)](https://www.python.org/downloads/release/python-380/) A python package for capturing potential relationships among columns of different tabular datasets, which are given in the form of pandas DataFrames. Valentine is based on [Valentine: Evaluating Matching Techniques for Dataset Discovery](https://ieeexplore.ieee.org/abstract/document/9458921) @@ -17,7 +17,7 @@ pip install valentine ## Installation requirements -* Python 3.8, 3.9 +* Python>=3.8 diff --git a/requirements.txt b/requirements.txt index c51ca10..b55e97d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,7 @@ # algorithms numpy==1.21.2 -scipy==1.7.1 -pandas==1.3.3 -nltk==3.6.4 +pandas==1.3.4 +nltk==3.6.5 snakecase==1.0.1 anytree==2.8.0 six==1.16.0 diff --git a/setup.py b/setup.py index 8083b9e..a6fa60e 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setuptools.setup( name='valentine', - version='0.1.1', + version='0.1.2', description='Valentine Matcher', license_files=('LICENSE',), author='Delft Data', @@ -14,11 +14,10 @@ maintainer='Delft Data', maintainer_email='delftdatasystems@gmail.com', url='https://delftdata.github.io/valentine/', - download_url='https://github.com/delftdata/valentine/archive/refs/tags/v0.1.1.tar.gz', - packages=setuptools.find_packages(exclude=('tests*',)), + download_url='https://github.com/delftdata/valentine/archive/refs/tags/v0.1.2.tar.gz', + packages=setuptools.find_packages(exclude=('tests*', 'examples*')), install_requires=[ 'numpy>=1.21,<2.0', - 'scipy>=1.7,<1.8', 'pandas>=1.3,<1.4', 'nltk>=3.6,<3.7', 'snakecase>=1.0,<2.0', diff --git a/valentine/algorithms/__init__.py b/valentine/algorithms/__init__.py index fa335fe..34c7bd3 100644 --- a/valentine/algorithms/__init__.py +++ b/valentine/algorithms/__init__.py @@ -11,6 +11,10 @@ all_matchers = schema_only_algorithms + instance_only_algorithms + schema_instance_algorithms __all__ = [ + "schema_only_algorithms", + "instance_only_algorithms", + "schema_instance_algorithms", + "all_matchers", "Coma", "Cupid", "DistributionBased", diff --git a/valentine/algorithms/distribution_based/quantile_histogram.py b/valentine/algorithms/distribution_based/quantile_histogram.py index 0f20dd3..1df0802 100644 --- a/valentine/algorithms/distribution_based/quantile_histogram.py +++ b/valentine/algorithms/distribution_based/quantile_histogram.py @@ -1,6 +1,6 @@ +from statistics import quantiles from numpy import ndarray import numpy as np -import scipy.stats as ss import math @@ -49,7 +49,7 @@ def __init__(self, name: tuple, ranks: ndarray, normalization: int, - quantiles: int, + n_quantiles: int, reference_hist=None): """ Parameters @@ -60,7 +60,7 @@ def __init__(self, The column's ranked data normalization : int The number that normalizes the histogram values - quantiles : int + n_quantiles : int The number of quantiles reference_hist : QuantileHistogram, optional The reference histogram that provides the bucket boundaries @@ -69,11 +69,11 @@ def __init__(self, self.bucket_values = {} self.name = name self.normalization_factor = normalization - self.quantiles = quantiles + self.quantiles = n_quantiles self.dist_matrix = self.calc_dist_matrix() if reference_hist is None: self.add_buckets(ranks.min(initial=math.inf), - ss.mstats.mquantiles(ranks, np.array(list(range(1, quantiles + 1))) / quantiles)) + [round(q, 3) for q in quantiles(ranks, n=self.quantiles + 1, method='inclusive')]) self.add_values(ranks) else: self.bucket_boundaries = reference_hist.bucket_boundaries