diff --git a/examples/boundary tests/README.md b/examples/boundary tests/README.md new file mode 100644 index 0000000..1d453c9 --- /dev/null +++ b/examples/boundary tests/README.md @@ -0,0 +1,218 @@ +# Boundary Tests on SIGO + +This document presents the limits of `sigo` with respect to dataset size. +We generate datasets with varying numbers of rows and columns, +and test these parameters against the different anonymization methods. + +## Number of rows + +We generate datasets of different sizes using `pimo`. +Below is the `masking.yml` file used to generate a stream of JSON lines containing random floats. + +```yaml +version: "1" +seed: 42 +masking: + - selector: + jsonpath: "A" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "B" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 +``` + +We change the size of the datasets with the `--repeat,-r` flag of `pimo` (***N = [100, 1000, 10000, 100000, 1000000]***), +and we anonymize the data with `sigo`, selecting the anonymization method with the `--anonymizer,-a` flag. + +```console +pimo < test.json -c masking.yml -r 100 > test1_1.json +sigo -q A,B -a general < test1_1.json > output.json +``` + +The bash script `rows.sh` automates these tests. + +```console +cd rows +sudo chmod u+x rows.sh +. ./rows.sh +``` + +The results are listed in the `log.txt` file. + +| NoAnonymizer | Size | Execution time (sec) | Results | +|-------|:----------:|:----------------------:|:-------:| +| Test1 | 100 | 0.00 | SUCCESS | +| Test2 | 1 000 | 0.00 | SUCCESS | +| Test3 | 10 000 | 2.00 | SUCCESS | +| Test4 | 100 000 | 27.00 | SUCCESS | +| Test5 | 1 000 000 | 418.00 | SUCCESS | +| Test6 | 10 000 000 | | FAILED | +
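+For sizes beyond 10 million rows, an alternative (used by `rows/test.sh`) is to stream a single pre-generated JSON array record by record through `jq` into `sigo`; a minimal sketch, assuming a pre-generated `test_2.json` containing a JSON array of records:
+
+```console
+# emit one JSON line per record, anonymize, then re-wrap the output into an array
+jq -c '.[]' < test_2.json | sigo -q A,B | jq -s . > output.json
+```
+
+Timings for these larger runs are recorded in `rows/log_test.txt` and plotted in `rows/rows2.png`.
+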
+| Generalization | Size | Execution time (sec) | Results | +|-------|:----------:|:----------------------:|:-------:| +| Test1 | 100 | 1.00 | SUCCESS | +| Test2 | 1 000 | 0.00 | SUCCESS | +| Test3 | 10 000 | 3.00 | SUCCESS | +| Test4 | 100 000 | 30.00 | SUCCESS | +| Test5 | 1 000 000 | 395.00 | SUCCESS | + +| Aggregation | Size | Execution time (sec) | Results | +|-------|:----------:|:----------------------:|:-------:| +| Test1 | 100 | 0.00 | SUCCESS | +| Test2 | 1 000 | 0.00 | SUCCESS | +| Test3 | 10 000 | 3.00 | SUCCESS | +| Test4 | 100 000 | 29.00 | SUCCESS | +| Test5 | 1 000 000 | 386.00 | SUCCESS | + +| Top Bottom Coding | Size | Execution time (sec) | Results | +|-------|:----------:|:----------------------:|:-------:| +| Test1 | 100 | 0.00 | SUCCESS | +| Test2 | 1 000 | 0.00 | SUCCESS | +| Test3 | 10 000 | 3.00 | SUCCESS | +| Test4 | 100 000 | 28.00 | SUCCESS | +| Test5 | 1 000 000 | 398.00 | SUCCESS | + +| Random Noise | Size | Execution time (sec) | Results | +|-------|:----------:|:----------------------:|:-------:| +| Test1 | 100 | 0.00 | SUCCESS | +| Test2 | 1 000 | 1.00 | SUCCESS | +| Test3 | 10 000 | 3.00 | SUCCESS | +| Test4 | 100 000 | 37.00 | SUCCESS | +| Test5 | 1 000 000 | 420.00 | SUCCESS | +
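+The chart below is exported from `rows/plot.vg.json` (the columns section uses `columns/plot.vg.json` in the same way). Assuming the Vega-Lite command-line tools are available, the image can be regenerated with something like:
+
+```console
+npm install -g vega-lite vega-cli
+vl2png plot.vg.json > rows.png
+```
+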
+ +![rows](rows/rows.png) + +## Number of columns + +Now we generate datasets of 1000 rows using `pimo` and vary the number of attributes. +To do this, we take the `masking.yml` file from the **rows** folder and add an additional mask for each new attribute. +Here is an example for the test with 4 attributes: + +```yaml +version: "1" +seed: 42 +masking: + - selector: + jsonpath: "A" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "B" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "C" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "D" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 +``` + +We therefore have 5 masking files (these repetitive files could also be generated by a short shell loop; a sketch is given after the results below): + +- `masking1.yml` for 2 attributes. +- `masking2.yml` for 4 attributes. +- `masking3.yml` for 8 attributes. +- `masking4.yml` for 16 attributes. +- `masking5.yml` for 32 attributes. + +```console +pimo < test2.json -c masking2.yml -r 1000 > test2_2.json +sigo -q A,B,C,D -a general < test2_2.json > output.json +``` + +The bash script for test automation is `columns.sh`, and the results are in the `log.txt` file. + +```console +cd columns +sudo chmod u+x columns.sh +. ./columns.sh +``` + +| NoAnonymizer | Attributes | Execution time (sec) | Results | +|-------|:----------:|:----------------------:|:-------:| +| Test1 | 2 | 0.00 | SUCCESS | +| Test2 | 4 | 1.00 | SUCCESS | +| Test3 | 8 | 0.00 | SUCCESS | +| Test4 | 16 | 1.00 | SUCCESS | +| Test5 | 32 | 3.00 | SUCCESS | +
+| Generalization | Attributes | Execution time (sec) | Results | +|-------|:----------:|:----------------------:|:-------:| +| Test1 | 2 | 0.00 | SUCCESS | +| Test2 | 4 | 1.00 | SUCCESS | +| Test3 | 8 | 1.00 | SUCCESS | +| Test4 | 16 | 2.00 | SUCCESS | +| Test5 | 32 | 4.00 | SUCCESS | + +| Aggregation | Attributes | Execution time (sec) | Results | +|-------|:----------:|:----------------------:|:-------:| +| Test1 | 2 | 0.00 | SUCCESS | +| Test2 | 4 | 1.00 | SUCCESS | +| Test3 | 8 | 1.00 | SUCCESS | +| Test4 | 16 | 2.00 | SUCCESS | +| Test5 | 32 | 4.00 | SUCCESS | + +| Top Bottom Coding | Attributes | Execution time (sec) | Results | +|-------|:----------:|:----------------------:|:-------:| +| Test1 | 2 | 0.00 | SUCCESS | +| Test2 | 4 | 0.00 | SUCCESS | +| Test3 | 8 | 0.00 | SUCCESS | +| Test4 | 16 | 1.00 | SUCCESS | +| Test5 | 32 | 4.00 | SUCCESS | + +| Random Noise | Attributes | Execution time (sec) | Results | +|-------|:----------:|:----------------------:|:-------:| +| Test1 | 2 | 0.00 | SUCCESS | +| Test2 | 4 | 0.00 | SUCCESS | +| Test3 | 8 | 0.00 | SUCCESS | +| Test4 | 16 | 2.00 | SUCCESS | +| Test5 | 32 | 4.00 | SUCCESS | +
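+The five masking configurations (`masking1.yml` to `masking5.yml`) repeat the same `randomDecimal` block for every attribute. As mentioned above, they could be generated with a short shell loop instead of being written by hand; a minimal sketch (the column list, output file name and indentation are illustrative):
+
+```console
+columns="A B C D"
+{
+  echo 'version: "1"'
+  echo 'seed: 42'
+  echo 'masking:'
+  for c in $columns; do
+    printf '  - selector:\n      jsonpath: "%s"\n    mask:\n      randomDecimal:\n        min: 0\n        max: 100.00\n        precision: 2\n' "$c"
+  done
+} > masking2.yml
+```
+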
+ +![columns](columns/columns.png) diff --git a/examples/boundary tests/columns/columns.png b/examples/boundary tests/columns/columns.png new file mode 100644 index 0000000..5df0ccc Binary files /dev/null and b/examples/boundary tests/columns/columns.png differ diff --git a/examples/boundary tests/columns/columns.sh b/examples/boundary tests/columns/columns.sh new file mode 100644 index 0000000..f0adba6 --- /dev/null +++ b/examples/boundary tests/columns/columns.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +rm -f log.txt + +size_array=("2" "4" "8" "16" "32") # bash arrays are 0-indexed: the loops below use $((i-1)) because i also numbers the test/masking files +qi_array=("A,B" "A,B,C,D" "A,B,C,D,E,F,G,H" "A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P" "A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,AA,BB,CC,DD,EE,FF") + +echo "NOANONYMIZER" >> log.txt +echo "--------------------------------------------------------------------------------------------------------------" >> log.txt + +for i in 1 2 3 4 5 +do + pimo < test${i}.json -c masking${i}.yml -r 1000 > test${i}_2.json + START=$(date +%s) + sigo -q ${qi_array[$((i-1))]} < test${i}_2.json > output.json + END=$(date +%s) + DIFF=$(( $END - $START )) + echo "Test2_$i: (n=1000, x=${size_array[$((i-1))]}, method=NoAnonymizer) Execution time was $DIFF seconds." >> log.txt + rm -f test${i}_2.json + rm -f output.json +done + +echo "" >> log.txt +echo "GENERALIZATION" >> log.txt +echo "--------------------------------------------------------------------------------------------------------------" >> log.txt + +for i in 1 2 3 4 5 +do + pimo < test${i}.json -c masking${i}.yml -r 1000 > test${i}_2.json + START=$(date +%s) + sigo -q ${qi_array[$((i-1))]} -a general < test${i}_2.json > output.json + END=$(date +%s) + DIFF=$(( $END - $START )) + echo "Test2_$i: (n=1000, x=${size_array[$((i-1))]}, method=Generalization) Execution time was $DIFF seconds." >> log.txt + rm -f test${i}_2.json + rm -f output.json +done + +echo "" >> log.txt +echo "AGGREGATION" >> log.txt +echo "--------------------------------------------------------------------------------------------------------------" >> log.txt + +for i in 1 2 3 4 5 +do + pimo < test${i}.json -c masking${i}.yml -r 1000 > test${i}_2.json + START=$(date +%s) + sigo -q ${qi_array[$((i-1))]} -a meanAggregation < test${i}_2.json > output.json + END=$(date +%s) + DIFF=$(( $END - $START )) + echo "Test2_$i: (n=1000, x=${size_array[$((i-1))]}, method=Aggregation) Execution time was $DIFF seconds." >> log.txt + rm -f test${i}_2.json + rm -f output.json +done + +echo "" >> log.txt +echo "TOPBOTTOMCODING" >> log.txt +echo "--------------------------------------------------------------------------------------------------------------" >> log.txt + +for i in 1 2 3 4 5 +do + pimo < test${i}.json -c masking${i}.yml -r 1000 > test${i}_2.json + START=$(date +%s) + sigo -q ${qi_array[$((i-1))]} -a outlier < test${i}_2.json > output.json + END=$(date +%s) + DIFF=$(( $END - $START )) + echo "Test2_$i: (n=1000, x=${size_array[$((i-1))]}, method=TopBottomCoding) Execution time was $DIFF seconds." >> log.txt + rm -f test${i}_2.json + rm -f output.json +done + +echo "" >> log.txt +echo "RANDOMNOISE" >> log.txt +echo "--------------------------------------------------------------------------------------------------------------" >> log.txt + +for i in 1 2 3 4 5 +do + pimo < test${i}.json -c masking${i}.yml -r 1000 > test${i}_2.json + START=$(date +%s) + sigo -q ${qi_array[$((i-1))]} -a laplaceNoise < test${i}_2.json > output.json + END=$(date +%s) + DIFF=$(( $END - $START )) + echo "Test2_$i: (n=1000, x=${size_array[$((i-1))]}, method=RandomNoise) Execution time was $DIFF seconds."
>> log.txt + rm -f test${i}_2.json + rm -f output.json +done + diff --git a/examples/boundary tests/columns/log.txt b/examples/boundary tests/columns/log.txt new file mode 100644 index 0000000..7aff548 --- /dev/null +++ b/examples/boundary tests/columns/log.txt @@ -0,0 +1,39 @@ +NOANONYMIZER +-------------------------------------------------------------------------------------------------------------- +Test2_1: (n=1000, x=2, method=NoAnomymizer) Execution time was 0 seconds. +Test2_2: (n=1000, x=4, method=NoAnomymizer) Execution time was 1 seconds. +Test2_3: (n=1000, x=8, method=NoAnomymizer) Execution time was 0 seconds. +Test2_4: (n=1000, x=16, method=NoAnomymizer) Execution time was 1 seconds. +Test2_5: (n=1000, x=32, method=NoAnomymizer) Execution time was 3 seconds. + +GENERALIZATION +-------------------------------------------------------------------------------------------------------------- +Test2_1: (n=1000, x=2, method=Generalization) Execution time was 0 seconds. +Test2_2: (n=1000, x=4, method=Generalization) Execution time was 1 seconds. +Test2_3: (n=1000, x=8, method=Generalization) Execution time was 1 seconds. +Test2_4: (n=1000, x=16, method=Generalization) Execution time was 2 seconds. +Test2_5: (n=1000, x=32, method=Generalization) Execution time was 4 seconds. + +AGGREGATION +-------------------------------------------------------------------------------------------------------------- +Test2_1: (n=1000, x=2, method=Aggregation) Execution time was 0 seconds. +Test2_2: (n=1000, x=4, method=Aggregation) Execution time was 1 seconds. +Test2_3: (n=1000, x=8, method=Aggregation) Execution time was 1 seconds. +Test2_4: (n=1000, x=16, method=Aggregation) Execution time was 2 seconds. +Test2_5: (n=1000, x=32, method=Aggregation) Execution time was 4 seconds. + +TOPBOTTOMCODING +-------------------------------------------------------------------------------------------------------------- +Test2_1: (n=1000, x=2, method=TopBottomCoding) Execution time was 0 seconds. +Test2_2: (n=1000, x=4, method=TopBottomCoding) Execution time was 0 seconds. +Test2_3: (n=1000, x=8, method=TopBottomCoding) Execution time was 0 seconds. +Test2_4: (n=1000, x=16, method=TopBottomCoding) Execution time was 1 seconds. +Test2_5: (n=1000, x=32, method=TopBottomCoding) Execution time was 4 seconds. + +RANDOMNOISE +-------------------------------------------------------------------------------------------------------------- +Test2_1: (n=1000, x=2, method=RandomNoise) Execution time was 0 seconds. +Test2_2: (n=1000, x=4, method=RandomNoise) Execution time was 0 seconds. +Test2_3: (n=1000, x=8, method=RandomNoise) Execution time was 0 seconds. +Test2_4: (n=1000, x=16, method=RandomNoise) Execution time was 2 seconds. +Test2_5: (n=1000, x=32, method=RandomNoise) Execution time was 4 seconds. 
diff --git a/examples/boundary tests/columns/masking1.yml b/examples/boundary tests/columns/masking1.yml new file mode 100644 index 0000000..af902e2 --- /dev/null +++ b/examples/boundary tests/columns/masking1.yml @@ -0,0 +1,17 @@ +version: "1" +seed: 42 +masking: + - selector: + jsonpath: "A" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "B" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 diff --git a/examples/boundary tests/columns/masking2.yml b/examples/boundary tests/columns/masking2.yml new file mode 100644 index 0000000..d1475db --- /dev/null +++ b/examples/boundary tests/columns/masking2.yml @@ -0,0 +1,31 @@ +version: "1" +seed: 42 +masking: + - selector: + jsonpath: "A" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "B" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "C" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "D" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 diff --git a/examples/boundary tests/columns/masking3.yml b/examples/boundary tests/columns/masking3.yml new file mode 100644 index 0000000..6b330d2 --- /dev/null +++ b/examples/boundary tests/columns/masking3.yml @@ -0,0 +1,59 @@ +version: "1" +seed: 42 +masking: + - selector: + jsonpath: "A" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "B" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "C" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "D" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "E" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "F" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "G" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "H" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 diff --git a/examples/boundary tests/columns/masking4.yml b/examples/boundary tests/columns/masking4.yml new file mode 100644 index 0000000..e3156ec --- /dev/null +++ b/examples/boundary tests/columns/masking4.yml @@ -0,0 +1,115 @@ +version: "1" +seed: 42 +masking: + - selector: + jsonpath: "A" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "B" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "C" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "D" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "E" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "F" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "G" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "H" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "I" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "J" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "K" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "L" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "M" + mask: + randomDecimal: + min: 0 + max: 
100.00 + precision: 2 + - selector: + jsonpath: "N" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "O" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "P" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 diff --git a/examples/boundary tests/columns/masking5.yml b/examples/boundary tests/columns/masking5.yml new file mode 100644 index 0000000..06a844e --- /dev/null +++ b/examples/boundary tests/columns/masking5.yml @@ -0,0 +1,227 @@ +version: "1" +seed: 42 +masking: + - selector: + jsonpath: "A" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "B" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "C" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "D" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "E" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "F" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "G" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "H" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "I" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "J" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "K" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "L" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "M" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "N" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "O" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "P" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "Q" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "R" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "S" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "T" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "U" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "V" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "W" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "X" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "Y" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "Z" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "AA" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "BB" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "CC" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "DD" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "EE" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "FF" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 diff 
--git a/examples/boundary tests/columns/plot.vg.json b/examples/boundary tests/columns/plot.vg.json new file mode 100644 index 0000000..7cf87a4 --- /dev/null +++ b/examples/boundary tests/columns/plot.vg.json @@ -0,0 +1,39 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v5.json", + "data": { + "values": [ + {"attributes":2, "time (sec)": "27 Janv 2022 00:00:00", "method": "NoAnonymizer"}, + {"attributes":2, "time (sec)": "27 Janv 2022 00:00:00", "method": "Generalization"}, + {"attributes":2, "time (sec)": "27 Janv 2022 00:00:00", "method": "Aggregation"}, + {"attributes":2, "time (sec)": "27 Janv 2022 00:00:00", "method": "TopBottomCoding"}, + {"attributes":2, "time (sec)": "27 Janv 2022 00:00:00", "method": "RandomNoise"}, + {"attributes":4, "time (sec)": "27 Janv 2022 00:00:01", "method": "NoAnonymizer"}, + {"attributes":4, "time (sec)": "27 Janv 2022 00:00:01", "method": "Generalization"}, + {"attributes":4, "time (sec)": "27 Janv 2022 00:00:01", "method": "Aggregation"}, + {"attributes":4, "time (sec)": "27 Janv 2022 00:00:00", "method": "TopBottomCoding"}, + {"attributes":4, "time (sec)": "27 Janv 2022 00:00:00", "method": "RandomNoise"}, + {"attributes":8, "time (sec)": "27 Janv 2022 00:00:00", "method": "NoAnonymizer"}, + {"attributes":8, "time (sec)": "27 Janv 2022 00:00:01", "method": "Generalization"}, + {"attributes":8, "time (sec)": "27 Janv 2022 00:00:01", "method": "Aggregation"}, + {"attributes":8, "time (sec)": "27 Janv 2022 00:00:00", "method": "TopBottomCoding"}, + {"attributes":8, "time (sec)": "27 Janv 2022 00:00:00", "method": "RandomNoise"}, + {"attributes":16, "time (sec)": "27 Janv 2022 00:00:01", "method": "NoAnonymizer"}, + {"attributes":16, "time (sec)": "27 Janv 2022 00:00:02", "method": "Generalization"}, + {"attributes":16, "time (sec)": "27 Janv 2022 00:00:02", "method": "Aggregation"}, + {"attributes":16, "time (sec)": "27 Janv 2022 00:00:01", "method": "TopBottomCoding"}, + {"attributes":16, "time (sec)": "27 Janv 2022 00:00:02", "method": "RandomNoise"}, + {"attributes":32, "time (sec)": "27 Janv 2022 00:00:04", "method": "NoAnonymizer"}, + {"attributes":32, "time (sec)": "27 Janv 2022 00:00:04", "method": "Generalization"}, + {"attributes":32, "time (sec)": "27 Janv 2022 00:00:04", "method": "Aggregation"}, + {"attributes":32, "time (sec)": "27 Janv 2022 00:00:04", "method": "TopBottomCoding"}, + {"attributes":32, "time (sec)": "27 Janv 2022 00:00:04", "method": "RandomNoise"} + ] + }, + "mark": "bar", + "encoding": { + "x": {"field": "attributes"}, + "y": {"field": "time (sec)", "type": "temporal"}, + "xOffset": {"field": "method"}, + "color": {"field": "method"} + } + } diff --git a/examples/boundary tests/columns/test1.json b/examples/boundary tests/columns/test1.json new file mode 100644 index 0000000..8357c55 --- /dev/null +++ b/examples/boundary tests/columns/test1.json @@ -0,0 +1 @@ +{"A":10,"B":10} diff --git a/examples/boundary tests/columns/test2.json b/examples/boundary tests/columns/test2.json new file mode 100644 index 0000000..389b5b6 --- /dev/null +++ b/examples/boundary tests/columns/test2.json @@ -0,0 +1 @@ +{"A":10,"B":10,"C":10,"D":10} diff --git a/examples/boundary tests/columns/test3.json b/examples/boundary tests/columns/test3.json new file mode 100644 index 0000000..37ea52e --- /dev/null +++ b/examples/boundary tests/columns/test3.json @@ -0,0 +1 @@ +{"A":10,"B":10,"C":10,"D":10,"E":10,"F":10,"G":10,"H":10} diff --git a/examples/boundary tests/columns/test4.json b/examples/boundary tests/columns/test4.json new file mode 
100644 index 0000000..e57835c --- /dev/null +++ b/examples/boundary tests/columns/test4.json @@ -0,0 +1 @@ +{"A":10,"B":10,"C":10,"D":10,"E":10,"F":10,"G":10,"H":10,"I":10,"J":10,"K":10,"L":10,"M":10,"N":10,"O":10,"P":10} diff --git a/examples/boundary tests/columns/test5.json b/examples/boundary tests/columns/test5.json new file mode 100644 index 0000000..5c2a70b --- /dev/null +++ b/examples/boundary tests/columns/test5.json @@ -0,0 +1 @@ +{"A":10,"B":10,"C":10,"D":10,"E":10,"F":10,"G":10,"H":10,"I":10,"J":10,"K":10,"L":10,"M":10,"N":10,"O":10,"P":10,"Q":10,"R":10,"S":10,"T":10,"U":10,"V":10,"W":10,"X":10,"Y":10,"Z":10,"AA":10,"BB":10,"CC":10,"DD":10,"EE":10,"FF":10} diff --git a/examples/boundary tests/rows/log.txt b/examples/boundary tests/rows/log.txt new file mode 100644 index 0000000..7f03b80 --- /dev/null +++ b/examples/boundary tests/rows/log.txt @@ -0,0 +1,39 @@ +NOANONYMIZER +-------------------------------------------------------------------------------------------------------------- +Test1_1: (n=100, x=2, method=NoAnomymizer) Execution time was 0 seconds. +Test1_2: (n=1000, x=2, method=NoAnomymizer) Execution time was 0 seconds. +Test1_3: (n=10000, x=2, method=NoAnomymizer) Execution time was 2 seconds. +Test1_4: (n=100000, x=2, method=NoAnomymizer) Execution time was 27 seconds. +Test1_5: (n=1000000, x=2, method=NoAnomymizer) Execution time was 418 seconds. + +GENERALIZATION +-------------------------------------------------------------------------------------------------------------- +Test1_1: (n=100, x=2, method=Generalization) Execution time was 1 seconds. +Test1_2: (n=1000, x=2, method=Generalization) Execution time was 0 seconds. +Test1_3: (n=10000, x=2, method=Generalization) Execution time was 3 seconds. +Test1_4: (n=100000, x=2, method=Generalization) Execution time was 30 seconds. +Test1_5: (n=1000000, x=2, method=Generalization) Execution time was 395 seconds. + +AGGREGATION +-------------------------------------------------------------------------------------------------------------- +Test1_1: (n=100, x=2, method=Aggregation) Execution time was 0 seconds. +Test1_2: (n=1000, x=2, method=Aggregation) Execution time was 0 seconds. +Test1_3: (n=10000, x=2, method=Aggregation) Execution time was 3 seconds. +Test1_4: (n=100000, x=2, method=Aggregation) Execution time was 29 seconds. +Test1_5: (n=1000000, x=2, method=Aggregation) Execution time was 386 seconds. + +TOPBOTTOMCODING +-------------------------------------------------------------------------------------------------------------- +Test1_1: (n=100, x=2, method=TopBottomCoding) Execution time was 0 seconds. +Test1_2: (n=1000, x=2, method=TopBottomCoding) Execution time was 0 seconds. +Test1_3: (n=10000, x=2, method=TopBottomCoding) Execution time was 3 seconds. +Test1_4: (n=100000, x=2, method=TopBottomCoding) Execution time was 28 seconds. +Test1_5: (n=1000000, x=2, method=TopBottomCoding) Execution time was 398 seconds. + +RANDOMNOISE +-------------------------------------------------------------------------------------------------------------- +Test1_1: (n=100, x=2, method=RandomNoise) Execution time was 0 seconds. +Test1_2: (n=1000, x=2, method=RandomNoise) Execution time was 1 seconds. +Test1_3: (n=10000, x=2, method=RandomNoise) Execution time was 3 seconds. +Test1_4: (n=100000, x=2, method=RandomNoise) Execution time was 37 seconds. +Test1_5: (n=1000000, x=2, method=RandomNoise) Execution time was 420 seconds. 
diff --git a/examples/boundary tests/rows/log_test.txt b/examples/boundary tests/rows/log_test.txt new file mode 100644 index 0000000..49d5969 --- /dev/null +++ b/examples/boundary tests/rows/log_test.txt @@ -0,0 +1,4 @@ +Test: (n=2000000000, x=2, method=NoAnomymizer) Execution time was 928 seconds. +Test: (n=3000000000, x=2, method=NoAnomymizer) Execution time was 1300 seconds. +Test: (n=4000000000, x=2, method=NoAnomymizer) Execution time was 2064 seconds. +Test: (n=5000000000, x=2, method=NoAnomymizer) Execution time was 4294 seconds. diff --git a/examples/boundary tests/rows/masking.yml b/examples/boundary tests/rows/masking.yml new file mode 100644 index 0000000..af902e2 --- /dev/null +++ b/examples/boundary tests/rows/masking.yml @@ -0,0 +1,17 @@ +version: "1" +seed: 42 +masking: + - selector: + jsonpath: "A" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 + - selector: + jsonpath: "B" + mask: + randomDecimal: + min: 0 + max: 100.00 + precision: 2 diff --git a/examples/boundary tests/rows/plot.vg.json b/examples/boundary tests/rows/plot.vg.json new file mode 100644 index 0000000..1e01650 --- /dev/null +++ b/examples/boundary tests/rows/plot.vg.json @@ -0,0 +1,41 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v5.json", + "width":600, + "height":300, + "data": {"values":[ + {"rows":100, "time": "27 Janv 2022 16:00:00", "method": "NoAnonymizer"}, + {"rows":1000, "time": "27 Janv 2022 16:00:00", "method": "NoAnonymizer"}, + {"rows":10000, "time": "27 Janv 2022 16:00:02", "method": "NoAnonymizer"}, + {"rows":100000, "time": "27 Janv 2022 16:00:27", "method": "NoAnonymizer"}, + {"rows":1000000, "time": "27 Janv 2022 16:06:58", "method": "NoAnonymizer"}, + {"rows":100, "time": "27 Janv 2022 16:00:01", "method": "Generalization"}, + {"rows":1000, "time": "27 Janv 2022 16:00:00", "method": "Generalization"}, + {"rows":10000, "time": "27 Janv 2022 16:00:03", "method": "Generalization"}, + {"rows":100000, "time": "27 Janv 2022 16:00:30", "method": "Generalization"}, + {"rows":1000000, "time": "27 Janv 2022 16:06:35", "method": "Generalization"}, + {"rows":100, "time": "27 Janv 2022 16:00:00", "method": "Aggregation"}, + {"rows":1000, "time": "27 Janv 2022 16:00:00", "method": "Aggregation"}, + {"rows":10000, "time": "27 Janv 2022 16:00:03", "method": "Aggregation"}, + {"rows":100000, "time": "27 Janv 2022 16:00:29", "method": "Aggregation"}, + {"rows":1000000, "time": "27 Janv 2022 16:06:26", "method": "Aggregation"}, + {"rows":100, "time": "27 Janv 2022 16:00:00", "method": "TopBottomCoding"}, + {"rows":1000, "time": "27 Janv 2022 16:00:00", "method": "TopBottomCoding"}, + {"rows":10000, "time": "27 Janv 2022 16:00:03", "method": "TopBottomCoding"}, + {"rows":100000, "time": "27 Janv 2022 16:00:28", "method": "TopBottomCoding"}, + {"rows":1000000, "time": "27 Janv 2022 16:06:38", "method": "TopBottomCoding"}, + {"rows":100, "time": "27 Janv 2022 16:00:00", "method": "RandomNoise"}, + {"rows":1000, "time": "27 Janv 2022 16:00:01", "method": "RandomNoise"}, + {"rows":10000, "time": "27 Janv 2022 16:00:03", "method": "RandomNoise"}, + {"rows":100000, "time": "27 Janv 2022 16:00:37", "method": "RandomNoise"}, + {"rows":1000000, "time": "27 Janv 2022 16:07:00", "method": "RandomNoise"} + ]}, + "mark": { + "type": "line", + "point": true + }, + "encoding": { + "y": {"field": "time", "timeUnit":"minutesseconds"}, + "x": {"field": "rows", "type": "quantitative", "scale": {"type": "log"}}, + "color": {"field": "method", "type": "nominal"} + } + } diff --git 
a/examples/boundary tests/rows/plot2.vg.json b/examples/boundary tests/rows/plot2.vg.json new file mode 100644 index 0000000..af8089a --- /dev/null +++ b/examples/boundary tests/rows/plot2.vg.json @@ -0,0 +1,23 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v5.json", + "width":600, + "height":300, + "data": {"values":[ + {"rows":1000000, "time": "27 Janv 2022 00:05:31", "method": "NoAnonymizer"}, + {"rows":2000000, "time": "27 Janv 2022 00:15:27", "method": "NoAnonymizer"}, + {"rows":3000000, "time": "27 Janv 2022 00:21:40", "method": "NoAnonymizer"}, + {"rows":4000000, "time": "27 Janv 2022 00:34:24", "method": "NoAnonymizer"}, + {"rows":5000000, "time": "27 Janv 2022 01:11:33", "method": "NoAnonymizer"} + ]}, + "mark": { + "type": "line", + "point": true + }, + "encoding": { + "y": {"field": "time", "timeUnit":"hoursminutesseconds"}, + "x": {"field": "rows", "type": "quantitative"}, + "color": {"field": "method", "type": "nominal"} + } + } + + diff --git a/examples/boundary tests/rows/rows.png b/examples/boundary tests/rows/rows.png new file mode 100644 index 0000000..d344ce5 Binary files /dev/null and b/examples/boundary tests/rows/rows.png differ diff --git a/examples/boundary tests/rows/rows.sh b/examples/boundary tests/rows/rows.sh new file mode 100644 index 0000000..fd48588 --- /dev/null +++ b/examples/boundary tests/rows/rows.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +rm -f log.txt + +size_array=("100" "1000" "10000" "100000" "1000000") # bash arrays are 0-indexed: the loops below use $((i-1)) because i also numbers the output files + +echo "NOANONYMIZER" >> log.txt +echo "--------------------------------------------------------------------------------------------------------------" >> log.txt + +for i in 1 2 3 4 5 +do + pimo < test.json -c masking.yml -r ${size_array[$((i-1))]} > test${i}_1.json + START=$(date +%s) + sigo -q A,B < test${i}_1.json > output.json + END=$(date +%s) + DIFF=$(( $END - $START )) + echo "Test1_$i: (n=${size_array[$((i-1))]}, x=2, method=NoAnonymizer) Execution time was $DIFF seconds." >> log.txt + rm -f test${i}_1.json + rm -f output.json +done + +echo "" >> log.txt +echo "GENERALIZATION" >> log.txt +echo "--------------------------------------------------------------------------------------------------------------" >> log.txt + +for i in 1 2 3 4 5 +do + pimo < test.json -c masking.yml -r ${size_array[$((i-1))]} > test${i}_1.json + START=$(date +%s) + sigo -q A,B -a general < test${i}_1.json > output.json + END=$(date +%s) + DIFF=$(( $END - $START )) + echo "Test1_$i: (n=${size_array[$((i-1))]}, x=2, method=Generalization) Execution time was $DIFF seconds." >> log.txt + rm -f test${i}_1.json + rm -f output.json +done + +echo "" >> log.txt +echo "AGGREGATION" >> log.txt +echo "--------------------------------------------------------------------------------------------------------------" >> log.txt + +for i in 1 2 3 4 5 +do + pimo < test.json -c masking.yml -r ${size_array[$((i-1))]} > test${i}_1.json + START=$(date +%s) + sigo -q A,B -a meanAggregation < test${i}_1.json > output.json + END=$(date +%s) + DIFF=$(( $END - $START )) + echo "Test1_$i: (n=${size_array[$((i-1))]}, x=2, method=Aggregation) Execution time was $DIFF seconds." >> log.txt + rm -f test${i}_1.json + rm -f output.json +done + +echo "" >> log.txt +echo "TOPBOTTOMCODING" >> log.txt +echo "--------------------------------------------------------------------------------------------------------------" >> log.txt + +for i in 1 2 3 4 5 +do + pimo < test.json -c masking.yml -r ${size_array[$((i-1))]} > test${i}_1.json + START=$(date +%s) + sigo -q A,B -a outlier < test${i}_1.json > output.json + END=$(date +%s) + DIFF=$(( $END - $START )) + echo "Test1_$i: (n=${size_array[$((i-1))]}, x=2, method=TopBottomCoding) Execution time was $DIFF seconds." >> log.txt + rm -f test${i}_1.json + rm -f output.json +done + +echo "" >> log.txt +echo "RANDOMNOISE" >> log.txt +echo "--------------------------------------------------------------------------------------------------------------" >> log.txt + +for i in 1 2 3 4 5 +do + pimo < test.json -c masking.yml -r ${size_array[$((i-1))]} > test${i}_1.json + START=$(date +%s) + sigo -q A,B -a laplaceNoise < test${i}_1.json > output.json + END=$(date +%s) + DIFF=$(( $END - $START )) + echo "Test1_$i: (n=${size_array[$((i-1))]}, x=2, method=RandomNoise) Execution time was $DIFF seconds." >> log.txt + rm -f test${i}_1.json + rm -f output.json +done + diff --git a/examples/boundary tests/rows/rows2.png b/examples/boundary tests/rows/rows2.png new file mode 100644 index 0000000..1f8a36e Binary files /dev/null and b/examples/boundary tests/rows/rows2.png differ diff --git a/examples/boundary tests/rows/test.json b/examples/boundary tests/rows/test.json new file mode 100644 index 0000000..8357c55 --- /dev/null +++ b/examples/boundary tests/rows/test.json @@ -0,0 +1 @@ +{"A":10,"B":10} diff --git a/examples/boundary tests/rows/test.sh b/examples/boundary tests/rows/test.sh new file mode 100644 index 0000000..9e27803 --- /dev/null +++ b/examples/boundary tests/rows/test.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +rm -f log_test.txt + +START=$(date +%s) +jq -c '.[]' < test_2.json | sigo -q A,B | jq -s . > output.json +END=$(date +%s) +DIFF=$(( $END - $START )) +echo "Test: (n=2000000000, x=2, method=NoAnonymizer) Execution time was $DIFF seconds." >> log_test.txt + +START=$(date +%s) +jq -c '.[]' < test_3.json | sigo -q A,B | jq -s . > output.json +END=$(date +%s) +DIFF=$(( $END - $START )) +echo "Test: (n=3000000000, x=2, method=NoAnonymizer) Execution time was $DIFF seconds." >> log_test.txt + +START=$(date +%s) +jq -c '.[]' < test_4.json | sigo -q A,B | jq -s . > output.json +END=$(date +%s) +DIFF=$(( $END - $START )) +echo "Test: (n=4000000000, x=2, method=NoAnonymizer) Execution time was $DIFF seconds." >> log_test.txt + +START=$(date +%s) +jq -c '.[]' < test_5.json | sigo -q A,B | jq -s . > output.json +END=$(date +%s) +DIFF=$(( $END - $START )) +echo "Test: (n=5000000000, x=2, method=NoAnonymizer) Execution time was $DIFF seconds."
>> log_test.txt + +rm -f output.json diff --git a/pkg/sigo/driver_test.go b/pkg/sigo/driver_test.go index cf3116e..a74242d 100644 --- a/pkg/sigo/driver_test.go +++ b/pkg/sigo/driver_test.go @@ -19,6 +19,9 @@ package sigo_test import ( "encoding/json" + "fmt" + "math/rand" + "strconv" "strings" "testing" @@ -89,3 +92,55 @@ func TestGeneralizedClustering(t *testing.T) { assert.Equal(t, 2, result[4]["clusterID"]) assert.Equal(t, 2, result[5]["clusterID"]) } + +//nolint: gochecknoglobals +var tests = []struct { + n int + a sigo.Anonymizer + s string +}{ + {n: 1000, a: sigo.NewNoAnonymizer(), s: "NoAnonymizer"}, + {n: 1000, a: sigo.NewGeneralAnonymizer(), s: "Generalization"}, + {n: 1000, a: sigo.NewAggregationAnonymizer("mean"), s: "MeanAggregation"}, + {n: 1000, a: sigo.NewCodingAnonymizer(), s: "TopBottomCoding"}, + {n: 1000, a: sigo.NewNoiseAnonymizer("gaussian"), s: "GaussianNoise"}, + {n: 100000, a: sigo.NewNoAnonymizer(), s: "NoAnonymizer"}, + {n: 100000, a: sigo.NewGeneralAnonymizer(), s: "Generalization"}, + {n: 100000, a: sigo.NewAggregationAnonymizer("mean"), s: "MeanAggregation"}, + {n: 100000, a: sigo.NewCodingAnonymizer(), s: "TopBottomCoding"}, + {n: 100000, a: sigo.NewNoiseAnonymizer("gaussian"), s: "GaussianNoise"}, + {n: 1000000, a: sigo.NewNoAnonymizer(), s: "NoAnonymizer"}, + {n: 1000000, a: sigo.NewGeneralAnonymizer(), s: "Generalization"}, + {n: 1000000, a: sigo.NewAggregationAnonymizer("mean"), s: "MeanAggregation"}, + {n: 1000000, a: sigo.NewCodingAnonymizer(), s: "TopBottomCoding"}, + {n: 1000000, a: sigo.NewNoiseAnonymizer("gaussian"), s: "GaussianNoise"}, +} + +func BenchmarkAnonymize(b *testing.B) { + for _, test := range tests { + b.Run(fmt.Sprintf("input_size_%d, anonymizer_%s", test.n, test.s), func(b *testing.B) { + sourceText := []string{} + + for i := 0; i < test.n; i++ { + // nolint: gosec + x := rand.Intn(test.n) + // nolint: gosec + y := rand.Intn(test.n) + // build one valid JSON line per record + sourceText = append(sourceText, `{"x":`+strconv.Itoa(x)+`, "y":`+strconv.Itoa(y)+`}`) + } + + data := strings.Join(sourceText, "\n") + + source, _ := infra.NewJSONLineSource(strings.NewReader(data), []string{"x", "y"}, []string{"foo"}) + + result := []map[string]interface{}{} + sink := infra.NewSliceDictionariesSink(&result) + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + _ = sigo.Anonymize(source, sigo.NewKDTreeFactory(), 3, 1, 2, test.a, sink, sigo.NewNoDebugger()) + } + }) + } +} diff --git a/pkg/sigo/kdtree_test.go b/pkg/sigo/kdtree_test.go index 43272f7..15e7125 100644 --- a/pkg/sigo/kdtree_test.go +++ b/pkg/sigo/kdtree_test.go @@ -22,6 +22,7 @@ import ( "math/rand" "reflect" "testing" + "time" "github.com/cgi-fr/sigo/internal/infra" "github.com/cgi-fr/sigo/pkg/sigo" @@ -200,3 +201,39 @@ func TestAddClusterInfos(t *testing.T) { } } } + +func BenchmarkClustering(b *testing.B) { + N := []int{1000, 10000, 25000, 50000, 100000} + + for _, n := range N { + b.Run(fmt.Sprintf("input_size_%d", n), func(b *testing.B) { + rand.Seed(time.Now().UnixNano()) + + kdtree := sigo.NewKDTreeFactory().New(3, 1, 1) + rows := []jsonline.Row{} + + for i := 0; i < n; i++ { + row := jsonline.NewRow() + // nolint: gosec + x := rand.Intn(n) + row.Set("x", x) + // nolint: gosec + y := rand.Intn(n) + row.Set("y", y) + rows = append(rows, row) + } + + for j := 0; j < n; j++ { + record := infra.NewJSONLineRecord(&rows[j], &[]string{"x", "y"}, &[]string{}) + + kdtree.Add(record) + } + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + kdtree.Build() + } + }) + } +}
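+
+// The benchmarks above (and BenchmarkAnonymize in driver_test.go) can be run with, for example:
+//
+//	go test -run=^$ -bench=Benchmark -benchmem ./pkg/sigo/
+//
+// -run=^$ skips the regular unit tests so that only the benchmarks are executed.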