Skip to content

Commit

Permalink
Tuning for multiple columns part 1: Computing Sum histograms for multiple columns (#523)
Browse files Browse the repository at this point in the history
  • Loading branch information
dvadym committed Sep 9, 2024
1 parent fcfef44 commit 71875ea
Show file tree
Hide file tree
Showing 12 changed files with 932 additions and 557 deletions.
33 changes: 30 additions & 3 deletions analysis/contribution_bounders.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""ContributionBounder for utility analysis."""

import numpy as np
from pipeline_dp import contribution_bounders
from pipeline_dp import sampling_utils
from typing import Iterable


class AnalysisContributionBounder(contribution_bounders.ContributionBounder):
Expand Down Expand Up @@ -65,9 +68,33 @@ def rekey_per_privacy_id_per_partition_key_and_unnest(pid_pk_v_values):
for partition_key, values in partition_values:
if sampler is not None and not sampler.keep(partition_key):
continue
yield (privacy_id, partition_key), (len(values), sum(values),
num_partitions_contributed,
num_contributions)
# Sum values.
# values can contain multi-columns, the format is the following
# 1 column:
# input: values = [v_0:float, ... ]
# output: v_0 + ....
# k columns (k > 1):
# input: values = [v_0=(v_00, ... v_0(k-1)), ...]
# output: (v_00+v_10+..., ...)
if not values:
# Empty public partitions
sum_values = 0
elif len(values) == 1:
# No need to sum, return 0th value
sum_values = values[0]
elif not isinstance(values[0], Iterable):
# 1 column
sum_values = sum(values)
else:
# multiple value columns, sum each column independently
sum_values = tuple(np.array(values).sum(axis=0).tolist())

yield (privacy_id, partition_key), (
len(values),
sum_values,
num_partitions_contributed,
num_contributions,
)

# Unnest the list per privacy id.
col = backend.flat_map(
Expand Down
26 changes: 23 additions & 3 deletions analysis/tests/contribution_bounders_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ def _run_contribution_bounding(self,
input,
max_partitions_contributed,
max_contributions_per_partition,
partitions_sampling_prob: float = 1.0):
partitions_sampling_prob: float = 1.0,
aggregate_fn=count_aggregate_fn):
params = CrossAndPerPartitionContributionParams(
max_partitions_contributed, max_contributions_per_partition)

Expand All @@ -50,7 +51,7 @@ def _run_contribution_bounding(self,
bounder.bound_contributions(input, params,
pipeline_dp.LocalBackend(),
_create_report_generator(),
count_aggregate_fn))
aggregate_fn))

def test_contribution_bounding_empty_col(self):
input = []
Expand Down Expand Up @@ -117,8 +118,27 @@ def test_contribution_bounding_cross_partition_bounding_and_sampling(self):
# Check per- and cross-partition contribution limits are not enforced.
self.assertEqual(set(expected_result), set(bound_result))

def test_contribution_bounding_cross_partition_bounding_and_2_column_values(
self):
input = [("pid1", 'pk1', (1, 2)), ("pid1", 'pk1', (3, 4)),
("pid1", 'pk2', (-1, 0)), ("pid2", 'pk1', (5, 5))]
max_partitions_contributed = 3
max_contributions_per_partition = 5

class SamplingL0LinfContributionBounderTest(parameterized.TestCase):
bound_result = self._run_contribution_bounding(
input,
max_partitions_contributed,
max_contributions_per_partition,
aggregate_fn=lambda x: x)

expected_result = [(('pid1', 'pk2'), (1, (-1, 0), 2, 3)),
(('pid1', 'pk1'), (2, (4, 6), 2, 3)),
(('pid2', 'pk1'), (1, (5, 5), 1, 1))]
# Check per- and cross-partition contribution limits are not enforced.
self.assertEqual(set(expected_result), set(bound_result))


class NoOpContributionBounderTest(parameterized.TestCase):

def test_contribution_bounding_doesnt_drop_contributions(self):
# Arrange.
Expand Down
31 changes: 30 additions & 1 deletion pipeline_dp/data_extractors.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,20 @@
# Copyright 2022 OpenMined.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classes for keeping data (privacy unit, partition etc) extractors."""

import dataclasses
from typing import Callable
from typing import Callable, List, Optional


@dataclasses.dataclass
Expand All @@ -15,6 +30,20 @@ class DataExtractors:
value_extractor: Callable = None


@dataclasses.dataclass
class MultiValueDataExtactors(DataExtractors):
    """Data extractors supporting multiple value columns.

    Each callable in `value_extractors` maps an input row to one value
    column to aggregate. After construction, `value_extractor` returns the
    tuple of all per-column values, so downstream code can treat a
    multi-column input the same way as a single-column one.
    """
    # One extractor per value column; None means "behave like the base class".
    value_extractors: Optional[List[Callable]] = None

    def __post_init__(self):
        # NOTE(review): the public class name keeps the original (misspelled)
        # identifier for backward compatibility with existing callers.
        if self.value_extractors is None:
            return

        def _extract_all_columns(row):
            # Read self.value_extractors at call time (not bound here), so
            # later reassignment of the field is still honored.
            return tuple(extract(row) for extract in self.value_extractors)

        self.value_extractor = _extract_all_columns


@dataclasses.dataclass
class PreAggregateExtractors:
"""Data extractors for pre-aggregated data.
Expand Down
Loading

0 comments on commit 71875ea

Please sign in to comment.