From 0e244b780676bd73322b82e8a70ed825863e1059 Mon Sep 17 00:00:00 2001
From: jacob-buehler <jacobbuehler@college.harvard.edu>
Date: Thu, 22 Jun 2023 11:21:11 -0400
Subject: [PATCH 1/2] Update null stats tests to use data initialized in
 setUpClass

---
 .../tests/profilers/test_profile_builder.py   | 330 +++++++++++-------
 1 file changed, 199 insertions(+), 131 deletions(-)

diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py
index 3a5505192..1ac37176f 100644
--- a/dataprofiler/tests/profilers/test_profile_builder.py
+++ b/dataprofiler/tests/profilers/test_profile_builder.py
@@ -33,8 +33,7 @@
     StructuredOptions,
     UnstructuredOptions,
 )
-
-from . import utils as test_utils
+from dataprofiler.tests.profilers import utils as test_utils
 
 test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
 
@@ -3560,6 +3559,17 @@ def test_correct_rows_ingested(self):
             "1": ["nan", "null", None, None, ""],
             1: ["nan", "None", "null", None, ""],
         }
+
+        # Although I know the point of this ticket was to use the data initialized in setUpClass,
+        # this function does not work properly, and cannot be made to work properly, with that data,
+        # so I think we should keep the original data for this function. The reason for this
+        # is that there are four types of null values in the 'test_dict' dataset, but just one
+        # type of null value in the setUpClass dataset. A major part of this test is verifying
+        # that multiple kinds of null values can be detected, so the setUpClass data
+        # is inappropriate.
+
+        # test_dict = self.data
+
         test_dataset = pd.DataFrame(data=test_dict)
         profiler_options = ProfilerOptions()
         profiler_options.set(
@@ -3592,8 +3602,10 @@ def test_correct_rows_ingested(self):
         )
 
     def test_correct_null_row_counts(self):
-        file_path = os.path.join(test_root_path, "data", "csv/empty_rows.txt")
-        data = pd.read_csv(file_path)
+        # file_path = os.path.join(test_root_path, "data", "csv/empty_rows.txt")
+        # data = pd.read_csv(file_path)
+        data = self.data
+
         profiler_options = ProfilerOptions()
         profiler_options.set(
             {
@@ -3602,18 +3614,25 @@ def test_correct_null_row_counts(self):
             }
         )
         profile = dp.StructuredProfiler(data, options=profiler_options)
-        self.assertEqual(2, profile.row_has_null_count)
-        self.assertEqual(0.25, profile._get_row_has_null_ratio())
-        self.assertEqual(2, profile.row_is_null_count)
-        self.assertEqual(0.25, profile._get_row_is_null_ratio())
+        self.assertEqual(10, profile.row_has_null_count)
+        self.assertEqual(0.5, profile._get_row_has_null_ratio())
+        self.assertEqual(0, profile.row_is_null_count)
+        self.assertEqual(0, profile._get_row_is_null_ratio())
 
-        file_path = os.path.join(test_root_path, "data", "csv/iris-with-null-rows.csv")
-        data = pd.read_csv(file_path)
-        profile = dp.StructuredProfiler(data, options=profiler_options)
-        self.assertEqual(13, profile.row_has_null_count)
-        self.assertEqual(13 / 24, profile._get_row_has_null_ratio())
-        self.assertEqual(3, profile.row_is_null_count)
-        self.assertEqual(3 / 24, profile._get_row_is_null_ratio())
+        # I commented out these lines of code, because they are a second
+        # test of the functions tested in the last four lines of code.
+        # Since we intend to use only the setUpClass data, there is no
+        # reason to keep these in, or test those functions a second time.
+
+        # file_path = os.path.join(test_root_path, "data", "csv/iris-with-null-rows.csv")
+        # data = pd.read_csv(file_path)
+        # data = self.data
+
+        # profile = dp.StructuredProfiler(data, options=profiler_options)
+        # self.assertEqual(13, profile.row_has_null_count)
+        # self.assertEqual(13 / 24, profile._get_row_has_null_ratio())
+        # self.assertEqual(3, profile.row_is_null_count)
+        # self.assertEqual(3 / 24, profile._get_row_is_null_ratio())
 
     def test_row_is_null_ratio_row_stats_disabled(self):
         profiler_options_1 = ProfilerOptions()
@@ -3646,35 +3665,39 @@ def test_null_in_file(self):
                 "row_statistics.is_enabled": True,
             }
         )
-        data = dp.Data(filename_null_in_file)
+        # data = dp.Data(filename_null_in_file)
+        data = self.data
+
         profile = dp.StructuredProfiler(data, options=profiler_options)
 
         report = profile.report(report_options={"output_format": "pretty"})
-        count_idx = report["global_stats"]["profile_schema"]["COUNT"][0]
-        numbers_idx = report["global_stats"]["profile_schema"][" NUMBERS"][0]
+        names_idx = report["global_stats"]["profile_schema"]["names"][0]
+        numbers_idx = report["global_stats"]["profile_schema"]["numbers"][0]
 
         self.assertEqual(
-            report["data_stats"][count_idx]["statistics"]["null_types_index"],
-            {"": "[2, 3, 4, 5, 7, 8]"},
+            report["data_stats"][names_idx]["statistics"]["null_types_index"],
+            {},
         )
 
         self.assertEqual(
             report["data_stats"][numbers_idx]["statistics"]["null_types_index"],
-            {"": "[5, 6, 8]", " ": "[2, 4]"},
+            {},
         )
 
     def test_correct_total_sample_size_and_counts_and_mutability(self):
-        data = [
-            ["test1", 1.0],
-            ["test2", 2.0],
-            ["test3", 3.0],
-            [None, None],
-            ["test5", 5.0],
-            ["test6", 6.0],
-            [None, None],
-            ["test7", 7.0],
-        ]
-        data = pd.DataFrame(data, columns=["NAME", "VALUE"])
+        # data = [
+        #     ["test1", 1.0],
+        #     ["test2", 2.0],
+        #     ["test3", 3.0],
+        #     [None, None],
+        #     ["test5", 5.0],
+        #     ["test6", 6.0],
+        #     [None, None],
+        #     ["test7", 7.0],
+        # ]
+        # data = pd.DataFrame(data, columns=["NAME", "VALUE"])
+        data = self.data
+
         profiler_options = ProfilerOptions()
         profiler_options.set(
             {
@@ -3683,33 +3706,34 @@ def test_correct_total_sample_size_and_counts_and_mutability(self):
             }
         )
 
-        col_one_len = len(data["NAME"])
-        col_two_len = len(data["VALUE"])
+        col_one_len = len(data["names"])
+        col_two_len = len(data["numbers"])
 
         # Test reloading data, ensuring immutable
         for i in range(2):
             # Profile Once
-            data.index = pd.RangeIndex(0, 8)
+            data.index = pd.RangeIndex(0, 20)
+
             profile = dp.StructuredProfiler(
                 data, options=profiler_options, samples_per_update=2
             )
 
             # Profile Twice
-            data.index = pd.RangeIndex(8, 16)
+            data.index = pd.RangeIndex(20, 40)
             profile.update_profile(data)
 
             # rows sampled are [5, 6], [13, 14] (0 index)
-            self.assertEqual(16, profile.total_samples)
+            self.assertEqual(40, profile.total_samples)
             self.assertEqual(4, profile._max_col_samples_used)
-            self.assertEqual(2, profile.row_has_null_count)
-            self.assertEqual(0.5, profile._get_row_has_null_ratio())
-            self.assertEqual(2, profile.row_is_null_count)
-            self.assertEqual(0.5, profile._get_row_is_null_ratio())
-            self.assertEqual(0.4375, profile._get_unique_row_ratio())
-            self.assertEqual(9, profile._get_duplicate_row_count())
+            self.assertEqual(0, profile.row_has_null_count)
+            self.assertEqual(0, profile._get_row_has_null_ratio())
+            self.assertEqual(0, profile.row_is_null_count)
+            self.assertEqual(0, profile._get_row_is_null_ratio())
+            self.assertEqual(0.375, profile._get_unique_row_ratio())
+            self.assertEqual(25, profile._get_duplicate_row_count())
 
-        self.assertEqual(col_one_len, len(data["NAME"]))
-        self.assertEqual(col_two_len, len(data["VALUE"]))
+        self.assertEqual(col_one_len, len(data["names"]))
+        self.assertEqual(col_two_len, len(data["numbers"]))
 
     def test_null_calculation_with_differently_sampled_cols(self):
         opts = ProfilerOptions()
@@ -3719,48 +3743,56 @@ def test_null_calculation_with_differently_sampled_cols(self):
                 "row_statistics.is_enabled": True,
             }
         )
-        data = pd.DataFrame(
-            {
-                "full": [1, 2, 3, 4, 5, 6, 7, 8, 9],
-                "sparse": [1, None, 3, None, 5, None, 7, None, 9],
-            }
-        )
+        # data = pd.DataFrame(
+        #     {
+        #         "full": [1, 2, 3, 4, 5, 6, 7, 8, 9],
+        #         "sparse": [1, None, 3, None, 5, None, 7, None, 9],
+        #     }
+        # )
+        data = pd.DataFrame(self.data)
+
+        data1 = data[0:10]
+        data2 = data[11:]
+
         profile = dp.StructuredProfiler(
-            data, samples_per_update=5, min_true_samples=5, options=opts
+            data1, samples_per_update=5, min_true_samples=5, options=opts
         )
+
         # Rows 2, 4, 5, 6, 7 are sampled in first column
         # Therefore only those rows should be considered for null calculations
         # The only null in those rows in second column in that subset are 5, 7
         # Therefore only 2 rows have null according to row_has_null_count
         self.assertEqual(0, profile.row_is_null_count)
-        self.assertEqual(2, profile.row_has_null_count)
+        self.assertEqual(3, profile.row_has_null_count)
         # Accordingly, make sure ratio of null rows accounts for the fact that
         # Only 5 total rows were sampled (5 in col 1, 9 in col 2)
         self.assertEqual(0, profile._get_row_is_null_ratio())
-        self.assertEqual(0.4, profile._get_row_has_null_ratio())
+        self.assertEqual(0.6, profile._get_row_has_null_ratio())
+
+        # data2 = pd.DataFrame(
+        #     {
+        #         "sparse": [1, None, 3, None, 5, None, 7, None],
+        #         "sparser": [1, None, None, None, None, None, None, 8],
+        #     }
+        # )
 
-        data2 = pd.DataFrame(
-            {
-                "sparse": [1, None, 3, None, 5, None, 7, None],
-                "sparser": [1, None, None, None, None, None, None, 8],
-            }
-        )
         profile2 = dp.StructuredProfiler(
             data2, samples_per_update=2, min_true_samples=2, options=opts
         )
+
         # Rows are sampled as follows: [6, 5], [1, 4], [2, 3], [0, 7]
         # First column gets min true samples from ids 1, 4, 5, 6
         # Second column gets completely sampled (has a null in 1, 4, 5, 6)
         # rows 1 and 5 are completely null, 4 and 6 only null in col 2
-        self.assertEqual(2, profile2.row_is_null_count)
-        self.assertEqual(4, profile2.row_has_null_count)
+        self.assertEqual(0, profile2.row_is_null_count)
+        self.assertEqual(1, profile2.row_has_null_count)
         # Only 4 total rows sampled, ratio accordingly
-        self.assertEqual(0.5, profile2._get_row_is_null_ratio())
-        self.assertEqual(1, profile2._get_row_has_null_ratio())
+        self.assertEqual(0, profile2._get_row_is_null_ratio())
+        self.assertEqual(0.5, profile2._get_row_has_null_ratio())
 
     def test_null_row_stats_correct_after_updates(self, *mocks):
-        data1 = pd.DataFrame([[1, None], [1, 1], [None, None], [None, 1]])
-        data2 = pd.DataFrame([[None, None], [1, None], [None, None], [None, 1]])
+        new_data, update_data = self.data[0:10], self.data[11:]
+
         opts = ProfilerOptions()
         opts.set(
             {
@@ -3771,28 +3803,36 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
 
         # When setting min true samples/samples per update
         profile = dp.StructuredProfiler(
-            data1, min_true_samples=2, samples_per_update=2, options=opts
+            new_data, min_true_samples=2, samples_per_update=2, options=opts
         )
+
+        self.assertEqual(2, profile.row_has_null_count)
+        self.assertEqual(0, profile.row_is_null_count)
+        self.assertEqual(1.0, profile._get_row_has_null_ratio())
+        self.assertEqual(0, profile._get_row_is_null_ratio())
+        self.assertEqual(2, profile._min_sampled_from_batch)
+        # self.assertSetEqual({}, profile._profile[0].null_types_index)
+        # self.assertSetEqual({}, profile._profile[1].null_types_index)
+        self.assertEqual({}, profile._profile[0].null_types_index)
+        self.assertEqual({}, profile._profile[1].null_types_index)
+
+        profile.update_profile(update_data, min_true_samples=2, sample_size=2)
         self.assertEqual(3, profile.row_has_null_count)
-        self.assertEqual(1, profile.row_is_null_count)
+        self.assertEqual(0, profile.row_is_null_count)
         self.assertEqual(0.75, profile._get_row_has_null_ratio())
-        self.assertEqual(0.25, profile._get_row_is_null_ratio())
-        self.assertEqual(4, profile._min_sampled_from_batch)
-        self.assertSetEqual({2, 3}, profile._profile[0].null_types_index["nan"])
-        self.assertSetEqual({0, 2}, profile._profile[1].null_types_index["nan"])
+        self.assertEqual(0, profile._get_row_is_null_ratio())
+        self.assertEqual(2, profile._min_sampled_from_batch)
+        # self.assertSetEqual(
+        #     {2, 3, 4, 6, 7}, profile._profile[0].null_types_index["nan"]
+        # )
+        # self.assertSetEqual(
+        #     {0, 2, 4, 5, 6}, profile._profile[1].null_types_index["nan"]
+        # )
 
-        profile.update_profile(data2, min_true_samples=2, sample_size=2)
-        self.assertEqual(7, profile.row_has_null_count)
-        self.assertEqual(3, profile.row_is_null_count)
-        self.assertEqual(0.875, profile._get_row_has_null_ratio())
-        self.assertEqual(0.375, profile._get_row_is_null_ratio())
-        self.assertEqual(4, profile._min_sampled_from_batch)
-        self.assertSetEqual(
-            {2, 3, 4, 6, 7}, profile._profile[0].null_types_index["nan"]
-        )
-        self.assertSetEqual(
-            {0, 2, 4, 5, 6}, profile._profile[1].null_types_index["nan"]
+        self.assertEqual(
+            {36, 38, 24, 26, 28}, profile._profile[2].null_types_index["nan"]
         )
+        self.assertEqual({}, profile._profile[1].null_types_index)
 
         # When not setting min true samples/samples per update
         opts = ProfilerOptions()
@@ -3802,58 +3842,83 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
                 "row_statistics.is_enabled": True,
             }
         )
-        profile = dp.StructuredProfiler(data1, options=opts)
-        self.assertEqual(3, profile.row_has_null_count)
-        self.assertEqual(1, profile.row_is_null_count)
-        self.assertEqual(0.75, profile._get_row_has_null_ratio())
-        self.assertEqual(0.25, profile._get_row_is_null_ratio())
-        self.assertEqual(4, profile._min_sampled_from_batch)
-        self.assertSetEqual({2, 3}, profile._profile[0].null_types_index["nan"])
-        self.assertSetEqual({0, 2}, profile._profile[1].null_types_index["nan"])
-
-        profile.update_profile(data2)
-        self.assertEqual(7, profile.row_has_null_count)
-        self.assertEqual(3, profile.row_is_null_count)
-        self.assertEqual(0.875, profile._get_row_has_null_ratio())
-        self.assertEqual(0.375, profile._get_row_is_null_ratio())
-        self.assertEqual(4, profile._min_sampled_from_batch)
-        self.assertSetEqual(
-            {2, 3, 4, 6, 7}, profile._profile[0].null_types_index["nan"]
+        profile = dp.StructuredProfiler(new_data, options=opts)
+        self.assertEqual(5, profile.row_has_null_count)
+        self.assertEqual(0, profile.row_is_null_count)
+        self.assertEqual(0.5, profile._get_row_has_null_ratio())
+        self.assertEqual(0, profile._get_row_is_null_ratio())
+        self.assertEqual(10, profile._min_sampled_from_batch)
+        # self.assertSetEqual({20, 22, 24, 26, 28}, profile._profile[2].null_types_index["nan"])
+        # self.assertSetEqual({}, profile._profile[1].null_types_index)
+        self.assertEqual(
+            {20, 22, 24, 26, 28}, profile._profile[2].null_types_index["nan"]
         )
-        self.assertSetEqual(
-            {0, 2, 4, 5, 6}, profile._profile[1].null_types_index["nan"]
+        self.assertEqual({}, profile._profile[1].null_types_index)
+
+        profile.update_profile(update_data)
+        self.assertEqual(9, profile.row_has_null_count)
+        self.assertEqual(0, profile.row_is_null_count)
+        self.assertEqual(0.47368421052631576, profile._get_row_has_null_ratio())
+        self.assertEqual(0, profile._get_row_is_null_ratio())
+        self.assertEqual(9, profile._min_sampled_from_batch)
+        # self.assertSetEqual(
+        #     {2, 3, 4, 6, 7}, profile._profile[0].null_types_index["nan"]
+        # )
+        # self.assertSetEqual(
+        #     {0, 2, 4, 5, 6}, profile._profile[1].null_types_index["nan"]
+        # )
+        self.assertEqual(
+            {32, 34, 36, 38, 20, 22, 24, 26, 28},
+            profile._profile[2].null_types_index["nan"],
         )
+        self.assertEqual({}, profile._profile[1].null_types_index)
 
         # Test that update with emtpy data doesn't change stats
         profile.update_profile(pd.DataFrame([]))
-        self.assertEqual(7, profile.row_has_null_count)
-        self.assertEqual(3, profile.row_is_null_count)
-        self.assertEqual(0.875, profile._get_row_has_null_ratio())
-        self.assertEqual(0.375, profile._get_row_is_null_ratio())
-        self.assertEqual(4, profile._min_sampled_from_batch)
-        self.assertSetEqual(
-            {2, 3, 4, 6, 7}, profile._profile[0].null_types_index["nan"]
-        )
-        self.assertSetEqual(
-            {0, 2, 4, 5, 6}, profile._profile[1].null_types_index["nan"]
+        self.assertEqual(9, profile.row_has_null_count)
+        self.assertEqual(0, profile.row_is_null_count)
+        self.assertEqual(0.47368421052631576, profile._get_row_has_null_ratio())
+        self.assertEqual(0, profile._get_row_is_null_ratio())
+        self.assertEqual(9, profile._min_sampled_from_batch)
+        # self.assertSetEqual(
+        #     {2, 3, 4, 6, 7}, profile._profile[0].null_types_index["nan"]
+        # )
+        # self.assertSetEqual(
+        #     {0, 2, 4, 5, 6}, profile._profile[1].null_types_index["nan"]
+        # )
+        self.assertEqual(
+            {32, 34, 36, 38, 20, 22, 24, 26, 28},
+            profile._profile[2].null_types_index["nan"],
         )
+        self.assertEqual({}, profile._profile[1].null_types_index)
 
         # Test one row update
-        profile.update_profile(pd.DataFrame([[1, None]]))
-        self.assertEqual(8, profile.row_has_null_count)
-        self.assertEqual(3, profile.row_is_null_count)
-        self.assertEqual(8 / 9, profile._get_row_has_null_ratio())
-        self.assertEqual(3 / 9, profile._get_row_is_null_ratio())
-        self.assertEqual(1, profile._min_sampled_from_batch)
-        self.assertSetEqual(
-            {2, 3, 4, 6, 7}, profile._profile[0].null_types_index["nan"]
+        # profile.update_profile(pd.DataFrame([['apple',5,2.0]]))
+        profile.update_profile(
+            pd.DataFrame({"names": ["jeremy"], "numbers": [4], "tf_null": [None]})
         )
-        self.assertSetEqual(
-            {0, 2, 4, 5, 6}, profile._profile[1].null_types_index["nan"]
+
+        self.assertEqual(10, profile.row_has_null_count)
+        self.assertEqual(0, profile.row_is_null_count)
+        self.assertEqual(0.5, profile._get_row_has_null_ratio())
+        self.assertEqual(0, profile._get_row_is_null_ratio())
+        self.assertEqual(1, profile._min_sampled_from_batch)
+        # self.assertSetEqual(
+        #     {2, 3, 4, 6, 7}, profile._profile[0].null_types_index["nan"]
+        # )
+        # self.assertSetEqual(
+        #     {0, 2, 4, 5, 6}, profile._profile[1].null_types_index["nan"]
+        # )
+        self.assertEqual(
+            {32, 34, 36, 38, 20, 22, 24, 26, 28},
+            profile._profile[2].null_types_index["nan"],
         )
+        self.assertEqual({}, profile._profile[1].null_types_index)
+
         # Weird pandas behavior makes this None since this column will be
         # recognized as object, not float64
-        self.assertSetEqual({8}, profile._profile[1].null_types_index["None"])
+        # self.assertSetEqual({}, profile._profile[2].null_types_index["None"])
+        self.assertEqual({0}, profile._profile[2].null_types_index["None"])
 
         # Tests row stats disabled
         options = StructuredOptions()
@@ -3863,15 +3928,17 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
                 "row_statistics.is_enabled": False,
             }
         )
-        profile2 = StructuredProfiler(data1, options=options)
+        profile2 = StructuredProfiler(new_data, options=options)
         self.assertEqual(0, profile2.row_is_null_count)
         self.assertEqual(0, profile2.row_has_null_count)
 
     def test_list_data_with_hll(self):
 
-        data = pd.DataFrame(
-            {"a": [1, 1, 4, 4, 3, 1, None], "b": [1, None, 3, 4, 4, None, 1]}
-        )
+        # data = pd.DataFrame(
+        #     {"a": [1, 1, 4, 4, 3, 1, None], "b": [1, None, 3, 4, 4, None, 1]}
+        # )
+        data = pd.DataFrame(self.data)
+
         # test hll_row_hashing
         profiler_options = ProfilerOptions()
         profiler_options.set(
@@ -3885,10 +3952,11 @@ def test_list_data_with_hll(self):
         with test_utils.mock_timeit():
             profiler = dp.StructuredProfiler(data, options=profiler_options)
 
-        self.assertEqual(6, profiler.hashed_row_object.cardinality())
+        self.assertEqual(15, profiler.hashed_row_object.cardinality())
 
     def test_add_profilers_row_statistics_options(self):
-        data = pd.DataFrame([1, None, 3, 4, 5, None, 1])
+        # data = pd.DataFrame([1, None, 3, 4, 5, None, 1])
+        data = pd.DataFrame(self.data)
 
         default_options = ProfilerOptions()
         default_options.set(
@@ -4044,13 +4112,13 @@ def test_add_profilers_row_statistics_options(self):
                 + full_hashing_ignore_reg_count_mismatch_profiler_2
             )
 
-        self.assertEqual(5, len(merged_profile.hashed_row_object))
+        self.assertEqual(15, len(merged_profile.hashed_row_object))
 
         # test successful merge
         with test_utils.mock_timeit():
             merged_profile = default_profiler_1 + default_profiler_2
 
-        self.assertEqual(5, merged_profile.hashed_row_object.cardinality())
+        self.assertEqual(15, merged_profile.hashed_row_object.cardinality())
 
     def test_correct_unique_row_ratio_full_row_hashing(self):
         self.assertEqual(15, len(self.trained_schema_full.hashed_row_object))

From 83f3688b9626ee794affd3c1d26cfd7354346c47 Mon Sep 17 00:00:00 2001
From: jacob-buehler <jacobbuehler@college.harvard.edu>
Date: Thu, 22 Jun 2023 11:53:49 -0400
Subject: [PATCH 2/2] Remove commented-out code from null stats tests

---
 .../tests/profilers/test_profile_builder.py   | 88 +------------------
 1 file changed, 2 insertions(+), 86 deletions(-)

diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py
index 1ac37176f..1e6decf22 100644
--- a/dataprofiler/tests/profilers/test_profile_builder.py
+++ b/dataprofiler/tests/profilers/test_profile_builder.py
@@ -3560,16 +3560,6 @@ def test_correct_rows_ingested(self):
             1: ["nan", "None", "null", None, ""],
         }
 
-        # Although I know the point of this ticket was to use the data initialized in setUpClass,
-        # this function does not work properly, and cannot be made to work properly, with that data,
-        # so I think we should keep the original data for this function. The reason for this
-        # is that there are four types of null values in the 'test_dict' dataset, but just one
-        # type of null value in the setUpClass dataset. A major part of this test is verifying
-        # that multiple kinds of null values can be detected, so the setUpClass data
-        # is inappropriate.
-
-        # test_dict = self.data
-
         test_dataset = pd.DataFrame(data=test_dict)
         profiler_options = ProfilerOptions()
         profiler_options.set(
@@ -3602,8 +3592,6 @@ def test_correct_rows_ingested(self):
         )
 
     def test_correct_null_row_counts(self):
-        # file_path = os.path.join(test_root_path, "data", "csv/empty_rows.txt")
-        # data = pd.read_csv(file_path)
         data = self.data
 
         profiler_options = ProfilerOptions()
@@ -3619,21 +3607,6 @@ def test_correct_null_row_counts(self):
         self.assertEqual(0, profile.row_is_null_count)
         self.assertEqual(0, profile._get_row_is_null_ratio())
 
-        # I commented out these lines of code, because they are a second
-        # test of the functions tested in the last four lines of code.
-        # Since we intend to use only the setUpClass data, there is no
-        # reason to keep these in, or test those functions a second time.
-
-        # file_path = os.path.join(test_root_path, "data", "csv/iris-with-null-rows.csv")
-        # data = pd.read_csv(file_path)
-        # data = self.data
-
-        # profile = dp.StructuredProfiler(data, options=profiler_options)
-        # self.assertEqual(13, profile.row_has_null_count)
-        # self.assertEqual(13 / 24, profile._get_row_has_null_ratio())
-        # self.assertEqual(3, profile.row_is_null_count)
-        # self.assertEqual(3 / 24, profile._get_row_is_null_ratio())
-
     def test_row_is_null_ratio_row_stats_disabled(self):
         profiler_options_1 = ProfilerOptions()
         profiler_options_1.set(
@@ -3665,7 +3638,7 @@ def test_null_in_file(self):
                 "row_statistics.is_enabled": True,
             }
         )
-        # data = dp.Data(filename_null_in_file)
+
         data = self.data
 
         profile = dp.StructuredProfiler(data, options=profiler_options)
@@ -3685,17 +3658,6 @@ def test_null_in_file(self):
         )
 
     def test_correct_total_sample_size_and_counts_and_mutability(self):
-        # data = [
-        #     ["test1", 1.0],
-        #     ["test2", 2.0],
-        #     ["test3", 3.0],
-        #     [None, None],
-        #     ["test5", 5.0],
-        #     ["test6", 6.0],
-        #     [None, None],
-        #     ["test7", 7.0],
-        # ]
-        # data = pd.DataFrame(data, columns=["NAME", "VALUE"])
         data = self.data
 
         profiler_options = ProfilerOptions()
@@ -3743,12 +3705,7 @@ def test_null_calculation_with_differently_sampled_cols(self):
                 "row_statistics.is_enabled": True,
             }
         )
-        # data = pd.DataFrame(
-        #     {
-        #         "full": [1, 2, 3, 4, 5, 6, 7, 8, 9],
-        #         "sparse": [1, None, 3, None, 5, None, 7, None, 9],
-        #     }
-        # )
+
         data = pd.DataFrame(self.data)
 
         data1 = data[0:10]
@@ -3769,13 +3726,6 @@ def test_null_calculation_with_differently_sampled_cols(self):
         self.assertEqual(0, profile._get_row_is_null_ratio())
         self.assertEqual(0.6, profile._get_row_has_null_ratio())
 
-        # data2 = pd.DataFrame(
-        #     {
-        #         "sparse": [1, None, 3, None, 5, None, 7, None],
-        #         "sparser": [1, None, None, None, None, None, None, 8],
-        #     }
-        # )
-
         profile2 = dp.StructuredProfiler(
             data2, samples_per_update=2, min_true_samples=2, options=opts
         )
@@ -3811,8 +3761,6 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
         self.assertEqual(1.0, profile._get_row_has_null_ratio())
         self.assertEqual(0, profile._get_row_is_null_ratio())
         self.assertEqual(2, profile._min_sampled_from_batch)
-        # self.assertSetEqual({}, profile._profile[0].null_types_index)
-        # self.assertSetEqual({}, profile._profile[1].null_types_index)
         self.assertEqual({}, profile._profile[0].null_types_index)
         self.assertEqual({}, profile._profile[1].null_types_index)
 
@@ -3822,12 +3770,6 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
         self.assertEqual(0.75, profile._get_row_has_null_ratio())
         self.assertEqual(0, profile._get_row_is_null_ratio())
         self.assertEqual(2, profile._min_sampled_from_batch)
-        # self.assertSetEqual(
-        #     {2, 3, 4, 6, 7}, profile._profile[0].null_types_index["nan"]
-        # )
-        # self.assertSetEqual(
-        #     {0, 2, 4, 5, 6}, profile._profile[1].null_types_index["nan"]
-        # )
 
         self.assertEqual(
             {36, 38, 24, 26, 28}, profile._profile[2].null_types_index["nan"]
@@ -3848,8 +3790,6 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
         self.assertEqual(0.5, profile._get_row_has_null_ratio())
         self.assertEqual(0, profile._get_row_is_null_ratio())
         self.assertEqual(10, profile._min_sampled_from_batch)
-        # self.assertSetEqual({20, 22, 24, 26, 28}, profile._profile[2].null_types_index["nan"])
-        # self.assertSetEqual({}, profile._profile[1].null_types_index)
         self.assertEqual(
             {20, 22, 24, 26, 28}, profile._profile[2].null_types_index["nan"]
         )
@@ -3861,12 +3801,6 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
         self.assertEqual(0.47368421052631576, profile._get_row_has_null_ratio())
         self.assertEqual(0, profile._get_row_is_null_ratio())
         self.assertEqual(9, profile._min_sampled_from_batch)
-        # self.assertSetEqual(
-        #     {2, 3, 4, 6, 7}, profile._profile[0].null_types_index["nan"]
-        # )
-        # self.assertSetEqual(
-        #     {0, 2, 4, 5, 6}, profile._profile[1].null_types_index["nan"]
-        # )
         self.assertEqual(
             {32, 34, 36, 38, 20, 22, 24, 26, 28},
             profile._profile[2].null_types_index["nan"],
@@ -3880,12 +3814,6 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
         self.assertEqual(0.47368421052631576, profile._get_row_has_null_ratio())
         self.assertEqual(0, profile._get_row_is_null_ratio())
         self.assertEqual(9, profile._min_sampled_from_batch)
-        # self.assertSetEqual(
-        #     {2, 3, 4, 6, 7}, profile._profile[0].null_types_index["nan"]
-        # )
-        # self.assertSetEqual(
-        #     {0, 2, 4, 5, 6}, profile._profile[1].null_types_index["nan"]
-        # )
         self.assertEqual(
             {32, 34, 36, 38, 20, 22, 24, 26, 28},
             profile._profile[2].null_types_index["nan"],
@@ -3893,7 +3821,6 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
         self.assertEqual({}, profile._profile[1].null_types_index)
 
         # Test one row update
-        # profile.update_profile(pd.DataFrame([['apple',5,2.0]]))
         profile.update_profile(
             pd.DataFrame({"names": ["jeremy"], "numbers": [4], "tf_null": [None]})
         )
@@ -3903,12 +3830,6 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
         self.assertEqual(0.5, profile._get_row_has_null_ratio())
         self.assertEqual(0, profile._get_row_is_null_ratio())
         self.assertEqual(1, profile._min_sampled_from_batch)
-        # self.assertSetEqual(
-        #     {2, 3, 4, 6, 7}, profile._profile[0].null_types_index["nan"]
-        # )
-        # self.assertSetEqual(
-        #     {0, 2, 4, 5, 6}, profile._profile[1].null_types_index["nan"]
-        # )
         self.assertEqual(
             {32, 34, 36, 38, 20, 22, 24, 26, 28},
             profile._profile[2].null_types_index["nan"],
@@ -3917,7 +3838,6 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
 
         # Weird pandas behavior makes this None since this column will be
         # recognized as object, not float64
-        # self.assertSetEqual({}, profile._profile[2].null_types_index["None"])
         self.assertEqual({0}, profile._profile[2].null_types_index["None"])
 
         # Tests row stats disabled
@@ -3934,9 +3854,6 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
 
     def test_list_data_with_hll(self):
 
-        # data = pd.DataFrame(
-        #     {"a": [1, 1, 4, 4, 3, 1, None], "b": [1, None, 3, 4, 4, None, 1]}
-        # )
         data = pd.DataFrame(self.data)
 
         # test hll_row_hashing
@@ -3955,7 +3872,6 @@ def test_list_data_with_hll(self):
         self.assertEqual(15, profiler.hashed_row_object.cardinality())
 
     def test_add_profilers_row_statistics_options(self):
-        # data = pd.DataFrame([1, None, 3, 4, 5, None, 1])
         data = pd.DataFrame(self.data)
 
         default_options = ProfilerOptions()