From 3d9b5ffdef05cbf9507f9493233e5f0a47b2dfd6 Mon Sep 17 00:00:00 2001 From: Taylor Turner Date: Tue, 16 Jan 2024 11:19:35 -0500 Subject: [PATCH 01/17] add downloads tile (#1085) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3ba4ee51b..1df9a2ea3 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/DataProfiler) ![GitHub](https://img.shields.io/github/license/CapitalOne/DataProfiler) ![GitHub last commit](https://img.shields.io/github/last-commit/CapitalOne/DataProfiler) +[![Downloads](https://static.pepy.tech/badge/dataprofiler)](https://pepy.tech/project/dataprofiler)

From 516c6f52b74e13786abbd429c4141b2163c2d58d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Fri, 2 Feb 2024 13:06:06 -0500 Subject: [PATCH 02/17] Add Python 3.11 to GHA --- .github/workflows/publish-python-package.yml | 2 +- .github/workflows/test-python-package.yml | 2 +- tox.ini | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/publish-python-package.yml b/.github/workflows/publish-python-package.yml index 75b9a41e2..4ed9e1bf3 100644 --- a/.github/workflows/publish-python-package.yml +++ b/.github/workflows/publish-python-package.yml @@ -20,7 +20,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.10' + python-version: '3.11' - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.github/workflows/test-python-package.yml b/.github/workflows/test-python-package.yml index fa84b3d3a..5b7d6e5ab 100644 --- a/.github/workflows/test-python-package.yml +++ b/.github/workflows/test-python-package.yml @@ -16,7 +16,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8, 3.9, "3.10"] + python-version: [3.8, 3.9, "3.10", "3.11"] steps: - uses: actions/checkout@v4 diff --git a/tox.ini b/tox.ini index 55fa50147..18c327525 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py37, py38, py39, py310, docs, pypi-description, manifest, precom +envlist = py38, py39, py310, py311, docs, pypi-description, manifest, precom [testenv] From f41111034b415c600387a1006f541e928cf9db49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Thu, 14 Mar 2024 09:12:10 -0400 Subject: [PATCH 03/17] Replace snappy with cramjam (#1091) * add downloads tile (#1085) * Replace snappy with cramjam * Delete test_no_snappy --------- Co-authored-by: Taylor Turner --- .pre-commit-config.yaml | 2 +- dataprofiler/__init__.py | 16 ---------- dataprofiler/tests/test_data_profiler.py | 40 ------------------------ requirements.txt | 2 +- 4 files changed, 2 insertions(+), 58 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 203e62b1f..b1d3ca62a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -55,7 +55,7 @@ repos: pyarrow>=1.0.1, chardet>=3.0.4, fastavro>=1.0.0.post1, - python-snappy>=0.5.4, + cramjam>=2.7.0, charset-normalizer>=1.3.6, psutil>=4.0.0, scipy>=1.4.1, diff --git a/dataprofiler/__init__.py b/dataprofiler/__init__.py index 2e89d3e2b..5f218bd85 100644 --- a/dataprofiler/__init__.py +++ b/dataprofiler/__init__.py @@ -20,22 +20,6 @@ from .validators.base_validators import Validator from .version import __version__ -try: - import snappy -except ImportError: - import warnings - - warnings.warn( - "Snappy must be installed to use parquet/avro datasets." - "\n\n" - "For macOS use Homebrew:\n" - "\t`brew install snappy`" - "\n\n" - "For linux use apt-get:\n`" - "\tsudo apt-get -y install libsnappy-dev`\n", - ImportWarning, - ) - def set_seed(seed=None): # also check it's an integer diff --git a/dataprofiler/tests/test_data_profiler.py b/dataprofiler/tests/test_data_profiler.py index ef7664cea..9ebdfa039 100644 --- a/dataprofiler/tests/test_data_profiler.py +++ b/dataprofiler/tests/test_data_profiler.py @@ -56,46 +56,6 @@ def test_data_profiling(self): self.assertIsNotNone(profile.profile) self.assertIsNotNone(profile.report()) - def test_no_snappy(self): - import importlib - import sys - import types - - orig_import = __import__ - # necessary for any wrapper around the library to test if snappy caught - # as an issue - - def reload_data_profiler(): - """Recursively reload modules.""" - sys_modules = sys.modules.copy() - for module_name, module in sys_modules.items(): - # Only reload top level of the dataprofiler - if "dataprofiler" in module_name and len(module_name.split(".")) < 3: - if isinstance(module, types.ModuleType): - importlib.reload(module) - - def import_mock(name, *args, **kwargs): - if name == "snappy": - raise ImportError("test") - return orig_import(name, *args, **kwargs) - - with mock.patch("builtins.__import__", side_effect=import_mock): - with self.assertWarns(ImportWarning) as w: - import dataprofiler - - reload_data_profiler() - - self.assertEqual( - str(w.warning), - "Snappy must be installed to use parquet/avro datasets." - "\n\n" - "For macOS use Homebrew:\n" - "\t`brew install snappy`" - "\n\n" - "For linux use apt-get:\n`" - "\tsudo apt-get -y install libsnappy-dev`\n", - ) - def test_no_tensorflow(self): import sys diff --git a/requirements.txt b/requirements.txt index a45dc34ae..405f808b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ pytz>=2020.1 pyarrow>=1.0.1 chardet>=3.0.4 fastavro>=1.0.0.post1 -python-snappy>=0.5.4 +cramjam>=2.7.0 charset-normalizer>=1.3.6 psutil>=4.0.0 scipy>=1.10.0 From f814ab7e9e0030a847bf5c7cd0d19765c60b7e1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Thu, 14 Mar 2024 11:01:49 -0400 Subject: [PATCH 04/17] Update dask modules --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index df4be852e..073628420 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,5 +1,5 @@ coverage>=5.0.1 -dask>=2.29.0 +dask[dask-expr]>=2020.12.0 fsspec>=0.3.3 pytest>=6.0.1 pytest-cov>=2.8.1 From eb9d89ef0ba43041f6745e70cfab92dac6f4325a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Thu, 14 Mar 2024 11:14:09 -0400 Subject: [PATCH 05/17] Install dask dataframe --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 073628420..4281e6060 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,5 +1,5 @@ coverage>=5.0.1 -dask[dask-expr]>=2020.12.0 +dask[dask-expr,dataframe]>=2020.12.0 fsspec>=0.3.3 pytest>=6.0.1 pytest-cov>=2.8.1 From d23c4851c5a38f27af7fdeb2c3b8c3851e8a3420 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Thu, 14 Mar 2024 11:39:21 -0400 Subject: [PATCH 06/17] Update dask modules in precommit --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b1d3ca62a..c3ecf7f5b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -93,7 +93,7 @@ repos: # requirements-test.txt coverage>=5.0.1, - dask>=2.29.0, + dask[dask-expr,dataframe]>=2020.12.0, fsspec>=0.3.3, pytest>=6.0.1, pytest-cov>=2.8.1, From 63acf45c08ad8514f9eee9014e5858e44d58efbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Thu, 14 Mar 2024 11:44:41 -0400 Subject: [PATCH 07/17] Correct copy/paste error From bd1874e101c0b8d11ad02f1b6c8dfc69e3e2deb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Thu, 14 Mar 2024 11:48:36 -0400 Subject: [PATCH 08/17] Try again to clear Unicode From b9f7a8a154c357a54edab248e9591388602dc261 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Mon, 18 Mar 2024 13:53:09 -0400 Subject: [PATCH 09/17] Rolled back pre-commit dask version --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c3ecf7f5b..b1d3ca62a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -93,7 +93,7 @@ repos: # requirements-test.txt coverage>=5.0.1, - dask[dask-expr,dataframe]>=2020.12.0, + dask>=2.29.0, fsspec>=0.3.3, pytest>=6.0.1, pytest-cov>=2.8.1, From 95bba8289e5e27b7f93e3d91bce73881654fbeae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Sun, 9 Jun 2024 14:04:24 -0400 Subject: [PATCH 10/17] Add py311 to tox --- tox.ini | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index dc3a7c6c6..90d06af06 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,5 @@ [tox] -envlist = py39, py310, pypi-description, manifest, precom - +envlist = py39, py310, py311, pypi-description, manifest, precom [testenv] From c507ade53436fc82d763bdd81713667b7c03d64b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Sun, 9 Jun 2024 14:05:44 -0400 Subject: [PATCH 11/17] Bump dask to 2024.4.1 --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 60ef71bc4..725b23849 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,5 +1,5 @@ coverage>=5.0.1 -dask[dask-expr,dataframe]>=2.29.0,<2024.2.0 +dask[dask-expr,dataframe]>=2024.4.1 fsspec>=0.3.3 pytest>=6.0.1 pytest-cov>=2.8.1 From 56b8b2cf1043044a29dd9e8e5d1a4feabcbb47e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Mon, 10 Jun 2024 10:51:24 -0400 Subject: [PATCH 12/17] Bump python-snappy 0.7.1 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 152b5eb36..b3df933ef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ pytz>=2020.1 pyarrow>=1.0.1 chardet>=3.0.4 fastavro>=1.1.0 -python-snappy>=0.5.4 +python-snappy>=0.7.1 charset-normalizer>=1.3.6 psutil>=4.0.0 scipy>=1.10.0 From e118ee84ad6130b26efead02946f5eb1736c6b09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Mon, 10 Jun 2024 16:51:10 -0400 Subject: [PATCH 13/17] Rewrite labeler test --- .../tests/labelers/test_labeler_utils.py | 43 ++++++++----------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/dataprofiler/tests/labelers/test_labeler_utils.py b/dataprofiler/tests/labelers/test_labeler_utils.py index f59a43e3f..361959449 100644 --- a/dataprofiler/tests/labelers/test_labeler_utils.py +++ b/dataprofiler/tests/labelers/test_labeler_utils.py @@ -1,6 +1,7 @@ import logging import unittest from unittest import mock +import tempfile import numpy as np import pandas as pd @@ -235,9 +236,7 @@ def test_verbose(self): self.assertIn("f1-score ", log_output) self.assertIn("F1 Score: ", log_output) - @mock.patch("dataprofiler.labelers.labeler_utils.classification_report") - @mock.patch("pandas.DataFrame") - def test_save_conf_mat(self, mock_dataframe, mock_report): + def test_save_conf_mat(self): # ideally mock out the actual contents written to file, but # would be difficult to get this completely worked out. @@ -248,29 +247,25 @@ def test_save_conf_mat(self, mock_dataframe, mock_report): [0, 1, 2], ] ) - expected_row_col_names = dict( - columns=["pred:PAD", "pred:UNKNOWN", "pred:OTHER"], - index=["true:PAD", "true:UNKNOWN", "true:OTHER"], - ) - mock_instance_df = mock.Mock(spec=pd.DataFrame)() - mock_dataframe.return_value = mock_instance_df + expected_columns=["pred:PAD", "pred:UNKNOWN", "pred:OTHER"] + expected_index=["true:PAD", "true:UNKNOWN", "true:OTHER"] - # still omit bc confusion mat should include all despite omit - f1, f1_report = labeler_utils.evaluate_accuracy( - self.y_pred, - self.y_true, - self.num_labels, - self.reverse_label_mapping, - omitted_labels=["PAD"], - verbose=False, - confusion_matrix_file="test.csv", - ) - - self.assertTrue((mock_dataframe.call_args[0][0] == expected_conf_mat).all()) - self.assertDictEqual(expected_row_col_names, mock_dataframe.call_args[1]) - - mock_instance_df.to_csv.assert_called() + with tempfile.NamedTemporaryFile() as tmpFile: + # still omit bc confusion mat should include all despite omit + f1, f1_report = labeler_utils.evaluate_accuracy( + self.y_pred, + self.y_true, + self.num_labels, + self.reverse_label_mapping, + omitted_labels=["PAD"], + verbose=False, + confusion_matrix_file=tmpFile.name, + ) + df1 = pd.read_csv(tmpFile.name, index_col=0) + self.assertListEqual(list(df1.columns), expected_columns) + self.assertListEqual(list(df1.index), expected_index) + np.testing.assert_array_equal(df1.values, expected_conf_mat) class TestTFFunctions(unittest.TestCase): def test_get_tf_layer_index_from_name(self): From 9021a7e5e21b76ef48f60c38251130ba8add3efc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Mon, 10 Jun 2024 17:03:51 -0400 Subject: [PATCH 14/17] Correct isort --- dataprofiler/tests/labelers/test_labeler_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/tests/labelers/test_labeler_utils.py b/dataprofiler/tests/labelers/test_labeler_utils.py index 361959449..cbac15c9b 100644 --- a/dataprofiler/tests/labelers/test_labeler_utils.py +++ b/dataprofiler/tests/labelers/test_labeler_utils.py @@ -1,7 +1,7 @@ import logging +import tempfile import unittest from unittest import mock -import tempfile import numpy as np import pandas as pd From c59403466350dfc54064abb92caa332bb74d5619 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Mon, 10 Jun 2024 17:09:37 -0400 Subject: [PATCH 15/17] Satisfy black --- dataprofiler/tests/labelers/test_labeler_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dataprofiler/tests/labelers/test_labeler_utils.py b/dataprofiler/tests/labelers/test_labeler_utils.py index cbac15c9b..3a3b56ea2 100644 --- a/dataprofiler/tests/labelers/test_labeler_utils.py +++ b/dataprofiler/tests/labelers/test_labeler_utils.py @@ -247,8 +247,8 @@ def test_save_conf_mat(self): [0, 1, 2], ] ) - expected_columns=["pred:PAD", "pred:UNKNOWN", "pred:OTHER"] - expected_index=["true:PAD", "true:UNKNOWN", "true:OTHER"] + expected_columns = ["pred:PAD", "pred:UNKNOWN", "pred:OTHER"] + expected_index = ["true:PAD", "true:UNKNOWN", "true:OTHER"] with tempfile.NamedTemporaryFile() as tmpFile: # still omit bc confusion mat should include all despite omit @@ -267,6 +267,7 @@ def test_save_conf_mat(self): self.assertListEqual(list(df1.index), expected_index) np.testing.assert_array_equal(df1.values, expected_conf_mat) + class TestTFFunctions(unittest.TestCase): def test_get_tf_layer_index_from_name(self): model = tf.keras.Sequential() From 94236da642dd1b1b71f79e4f5f73b33ba453f4c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Mon, 10 Jun 2024 17:19:41 -0400 Subject: [PATCH 16/17] And flake8 --- dataprofiler/tests/labelers/test_labeler_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dataprofiler/tests/labelers/test_labeler_utils.py b/dataprofiler/tests/labelers/test_labeler_utils.py index 3a3b56ea2..c14fca54f 100644 --- a/dataprofiler/tests/labelers/test_labeler_utils.py +++ b/dataprofiler/tests/labelers/test_labeler_utils.py @@ -1,7 +1,6 @@ import logging import tempfile import unittest -from unittest import mock import numpy as np import pandas as pd From 6ca2c875408e4f08faf78fda858799df45cea8b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Tue, 11 Jun 2024 09:39:46 -0400 Subject: [PATCH 17/17] Synced with requirements --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f36c52663..ee9bddf6a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -55,7 +55,7 @@ repos: pyarrow>=1.0.1, chardet>=3.0.4, fastavro>=1.0.0.post1, - cramjam>=2.7.0, + python-snappy>=0.7.1, charset-normalizer>=1.3.6, psutil>=4.0.0, scipy>=1.4.1,