capitalone · carlsonp · Mar 14, 2024 · Mar 14, 2024
@@ -55,7 +55,7 @@ repos:
             pyarrow>=1.0.1,
             chardet>=3.0.4,
             fastavro>=1.0.0.post1,
-            python-snappy>=0.5.4,
+            cramjam>=2.7.0,
             charset-normalizer>=1.3.6,
             psutil>=4.0.0,
             scipy>=1.4.1,

@@ -20,22 +20,6 @@
 from .validators.base_validators import Validator
 from .version import __version__
 
-try:
-    import snappy
-except ImportError:
-    import warnings
-
-    warnings.warn(
-        "Snappy must be installed to use parquet/avro datasets."
-        "\n\n"
-        "For macOS use Homebrew:\n"
-        "\t`brew install snappy`"
-        "\n\n"
-        "For linux use apt-get:\n`"
-        "\tsudo apt-get -y install libsnappy-dev`\n",
-        ImportWarning,
-    )
-
 
 def set_seed(seed=None):
     # also check it's an integer

@@ -3003,7 +3003,7 @@ def tqdm(level: set[int]) -> Generator[int, None, None]:
         notification_str = "Calculating the statistics... "
         pool = None
         if auto_multiprocess_toggle:
-            pool, pool_size = profiler_utils.generate_pool(4, est_data_size)
+            pool, pool_size = profiler_utils.generate_pool(profiler_utils.suggest_pool_size(), est_data_size)
             if pool:
                 notification_str += " (with " + str(pool_size) + " processes)"
 

@@ -56,46 +56,6 @@ def test_data_profiling(self):
             self.assertIsNotNone(profile.profile)
             self.assertIsNotNone(profile.report())
 
-    def test_no_snappy(self):
-        import importlib
-        import sys
-        import types
-
-        orig_import = __import__
-        # necessary for any wrapper around the library to test if snappy caught
-        # as an issue
-
-        def reload_data_profiler():
-            """Recursively reload modules."""
-            sys_modules = sys.modules.copy()
-            for module_name, module in sys_modules.items():
-                # Only reload top level of the dataprofiler
-                if "dataprofiler" in module_name and len(module_name.split(".")) < 3:
-                    if isinstance(module, types.ModuleType):
-                        importlib.reload(module)
-
-        def import_mock(name, *args, **kwargs):
-            if name == "snappy":
-                raise ImportError("test")
-            return orig_import(name, *args, **kwargs)
-
-        with mock.patch("builtins.__import__", side_effect=import_mock):
-            with self.assertWarns(ImportWarning) as w:
-                import dataprofiler
-
-                reload_data_profiler()
-
-        self.assertEqual(
-            str(w.warning),
-            "Snappy must be installed to use parquet/avro datasets."
-            "\n\n"
-            "For macOS use Homebrew:\n"
-            "\t`brew install snappy`"
-            "\n\n"
-            "For linux use apt-get:\n`"
-            "\tsudo apt-get -y install libsnappy-dev`\n",
-        )
-
     def test_no_tensorflow(self):
         import sys
 

@@ -7,7 +7,7 @@ pytz>=2020.1
 pyarrow>=1.0.1
 chardet>=3.0.4
 fastavro>=1.0.0.post1
-python-snappy>=0.5.4
+cramjam>=2.7.0
 charset-normalizer>=1.3.6
 psutil>=4.0.0
 scipy>=1.10.0