From 3a6e285b25073015bbe2a0c2fe507a40981a97a4 Mon Sep 17 00:00:00 2001 From: Nabil Miri Date: Wed, 11 May 2022 15:10:08 -0230 Subject: [PATCH] Add Hamming Distance knn similarity metric for long property + tests --- .../knn/metrics/HammingDistance.java | 52 +++++++++++++++++++ .../LongPropertySimilarityComputer.java | 10 ++-- .../metrics/LongPropertySimilarityMetric.java | 23 ++++++++ .../metrics/NormalizedAbsoluteDifference.java | 32 ++++++++++++ .../knn/metrics/SimilarityComputer.java | 17 ++++-- .../knn/metrics/SimilarityMetric.java | 6 ++- .../FilteredGenerateRandomNeighborsTest.java | 2 +- .../knn/GenerateRandomNeighborsTest.java | 2 +- .../knn/metrics/HammingDistanceTest.java | 40 ++++++++++++++ .../NormalizedAbsoluteDifferenceTest.java | 33 ++++++++++++ .../knn/metrics/SimilarityComputerTest.java | 19 +++++-- 11 files changed, 219 insertions(+), 17 deletions(-) create mode 100644 algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/HammingDistance.java create mode 100644 algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/LongPropertySimilarityMetric.java create mode 100644 algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/NormalizedAbsoluteDifference.java create mode 100644 algo/src/test/java/org/neo4j/gds/similarity/knn/metrics/HammingDistanceTest.java create mode 100644 algo/src/test/java/org/neo4j/gds/similarity/knn/metrics/NormalizedAbsoluteDifferenceTest.java diff --git a/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/HammingDistance.java b/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/HammingDistance.java new file mode 100644 index 00000000000..d22e44b46a1 --- /dev/null +++ b/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/HammingDistance.java @@ -0,0 +1,52 @@ +/* + * Copyright (c) "Neo4j" + * Neo4j Sweden AB [http://neo4j.com] + * + * This file is part of Neo4j. 
+ * + * Neo4j is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package org.neo4j.gds.similarity.knn.metrics; + +/** + * We compute the Hamming distance + * (https://en.wikipedia.org/wiki/Hamming_distance) and turn it into + * a similarity metric by rescaling it into the 0..1 range using a linear + * transformation. + */ +public final class HammingDistance { + private HammingDistance() {} + + public static double longMetric(long left, long right) { + return normalizeBitCount( + Long.bitCount(left ^ right) + ); + } + + /** + * We use unity-based normalization to scale the bit + * count to the [0, 1] range: + * y = (x_i - min(x)) / (max(x) - min(x)). See + * https://stats.stackexchange.com/a/70807 for example. + * In our case, min(x) = 0 since you cannot have a negative + * bit count, and max(x) = 64 since in Java, a long is + * 64 bits in size. + * + * We then subtract the normalized value from 1.0 so that + * 1.0 means most similar and 0.0 means least similar. 
+ */ + private static double normalizeBitCount(long bitCount) { + return 1.0 - (bitCount / 64.0); + } +} diff --git a/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/LongPropertySimilarityComputer.java b/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/LongPropertySimilarityComputer.java index 34f814d9481..fc402030ec7 100644 --- a/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/LongPropertySimilarityComputer.java +++ b/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/LongPropertySimilarityComputer.java @@ -24,22 +24,20 @@ final class LongPropertySimilarityComputer implements SimilarityComputer { private final NodePropertyValues nodePropertyValues; + private final LongPropertySimilarityMetric metric; - LongPropertySimilarityComputer(NodePropertyValues nodePropertyValues) { + LongPropertySimilarityComputer(NodePropertyValues nodePropertyValues, LongPropertySimilarityMetric metric) { if (nodePropertyValues.valueType() != ValueType.LONG) { throw new IllegalArgumentException("The property is not of type LONG"); } this.nodePropertyValues = nodePropertyValues; + this.metric = metric; } @Override public double similarity(long firstNodeId, long secondNodeId) { var left = nodePropertyValues.longValue(firstNodeId); var right = nodePropertyValues.longValue(secondNodeId); - var abs = Math.abs(left - right); - if (abs == Long.MIN_VALUE) { - abs = Long.MAX_VALUE; - } - return 1.0 / (1.0 + abs); + return metric.compute(left, right); } } diff --git a/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/LongPropertySimilarityMetric.java b/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/LongPropertySimilarityMetric.java new file mode 100644 index 00000000000..035c773fd1e --- /dev/null +++ b/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/LongPropertySimilarityMetric.java @@ -0,0 +1,23 @@ +/* + * Copyright (c) "Neo4j" + * Neo4j Sweden AB [http://neo4j.com] + * + * This file is part of Neo4j. 
+ * + * Neo4j is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package org.neo4j.gds.similarity.knn.metrics; +interface LongPropertySimilarityMetric { + double compute(long left, long right); +} diff --git a/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/NormalizedAbsoluteDifference.java b/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/NormalizedAbsoluteDifference.java new file mode 100644 index 00000000000..4f3dc6dc5ca --- /dev/null +++ b/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/NormalizedAbsoluteDifference.java @@ -0,0 +1,32 @@ +/* + * Copyright (c) "Neo4j" + * Neo4j Sweden AB [http://neo4j.com] + * + * This file is part of Neo4j. + * + * Neo4j is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +package org.neo4j.gds.similarity.knn.metrics; + +public final class NormalizedAbsoluteDifference { + private NormalizedAbsoluteDifference() {} + + public static double longMetric(long left, long right) { + var abs = Math.abs(left - right); + if (abs < 0 || ((left ^ right) & (left ^ (left - right))) < 0) { /* difference is Long.MIN_VALUE or the subtraction overflowed: saturate */ + abs = Long.MAX_VALUE; + } + return 1.0 / (1.0 + abs); + } +} diff --git a/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/SimilarityComputer.java b/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/SimilarityComputer.java index e43d0e6f4c5..9a5b4b9dd59 100644 --- a/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/SimilarityComputer.java +++ b/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/SimilarityComputer.java @@ -73,7 +73,11 @@ static SimilarityComputer ofProperty( ) { switch (properties.valueType()) { case LONG: - return ofLongProperty(properties); + return ofLongProperty( + name, + properties, + defaultSimilarityMetric + ); case DOUBLE: return ofDoubleProperty(properties); case DOUBLE_ARRAY: @@ -107,8 +111,15 @@ static SimilarityComputer ofDoubleProperty(NodePropertyValues nodePropertyValues return new DoublePropertySimilarityComputer(nodePropertyValues); } - static SimilarityComputer ofLongProperty(NodePropertyValues nodePropertyValues) { - return new LongPropertySimilarityComputer(nodePropertyValues); + static SimilarityComputer ofLongProperty(String name, NodePropertyValues properties, SimilarityMetric metric) { + switch (metric) { + case HAMMING_DISTANCE: + return new LongPropertySimilarityComputer(properties, HammingDistance::longMetric); + case NORMALIZED_ABSOLUTE_DIFFERENCE: + return new LongPropertySimilarityComputer(properties, NormalizedAbsoluteDifference::longMetric); + default: + throw unsupportedSimilarityMetric(name, properties.valueType(), metric); + } } static SimilarityComputer ofFloatArrayProperty(String name, NodePropertyValues properties, SimilarityMetric metric) { diff --git 
a/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/SimilarityMetric.java b/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/SimilarityMetric.java index 74ceb2da6ca..e0d14b67f0c 100644 --- a/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/SimilarityMetric.java +++ b/algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/SimilarityMetric.java @@ -25,7 +25,9 @@ import static org.neo4j.gds.utils.StringFormatting.toUpperCaseWithLocale; public enum SimilarityMetric { - JACCARD, OVERLAP, COSINE, EUCLIDEAN, PEARSON, LONG_PROPERTY_METRIC, DOUBLE_PROPERTY_METRIC, DEFAULT; + JACCARD, OVERLAP, COSINE, EUCLIDEAN, PEARSON, + NORMALIZED_ABSOLUTE_DIFFERENCE, DOUBLE_PROPERTY_METRIC, + HAMMING_DISTANCE, DEFAULT; public static SimilarityMetric parse(String value) { return SimilarityMetric.valueOf(toUpperCaseWithLocale(value)); @@ -34,7 +36,7 @@ public static SimilarityMetric parse(String value) { public static SimilarityMetric defaultMetricForType(ValueType valueType) { switch (valueType) { case LONG: - return LONG_PROPERTY_METRIC; + return NORMALIZED_ABSOLUTE_DIFFERENCE; case DOUBLE: return DOUBLE_PROPERTY_METRIC; case DOUBLE_ARRAY: diff --git a/algo/src/test/java/org/neo4j/gds/similarity/filteredknn/FilteredGenerateRandomNeighborsTest.java b/algo/src/test/java/org/neo4j/gds/similarity/filteredknn/FilteredGenerateRandomNeighborsTest.java index ae133135e22..6f19ca5b571 100644 --- a/algo/src/test/java/org/neo4j/gds/similarity/filteredknn/FilteredGenerateRandomNeighborsTest.java +++ b/algo/src/test/java/org/neo4j/gds/similarity/filteredknn/FilteredGenerateRandomNeighborsTest.java @@ -68,7 +68,7 @@ public long size() { idMap, "myProperty", nodeProperties, - SimilarityMetric.LONG_PROPERTY_METRIC + SimilarityMetric.NORMALIZED_ABSOLUTE_DIFFERENCE ); var random = new SplittableRandom(); diff --git a/algo/src/test/java/org/neo4j/gds/similarity/knn/GenerateRandomNeighborsTest.java b/algo/src/test/java/org/neo4j/gds/similarity/knn/GenerateRandomNeighborsTest.java index 
b71807b7c21..b20f686e586 100644 --- a/algo/src/test/java/org/neo4j/gds/similarity/knn/GenerateRandomNeighborsTest.java +++ b/algo/src/test/java/org/neo4j/gds/similarity/knn/GenerateRandomNeighborsTest.java @@ -67,7 +67,7 @@ public long size() { idMap, "myProperty", nodeProperties, - SimilarityMetric.LONG_PROPERTY_METRIC + SimilarityMetric.NORMALIZED_ABSOLUTE_DIFFERENCE ); var random = new SplittableRandom(); diff --git a/algo/src/test/java/org/neo4j/gds/similarity/knn/metrics/HammingDistanceTest.java b/algo/src/test/java/org/neo4j/gds/similarity/knn/metrics/HammingDistanceTest.java new file mode 100644 index 00000000000..ee334ac7ced --- /dev/null +++ b/algo/src/test/java/org/neo4j/gds/similarity/knn/metrics/HammingDistanceTest.java @@ -0,0 +1,40 @@ +/* + * Copyright (c) "Neo4j" + * Neo4j Sweden AB [http://neo4j.com] + * + * This file is part of Neo4j. + * + * Neo4j is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +package org.neo4j.gds.similarity.knn.metrics; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class HammingDistanceTest { + @Test + void shouldReturnFullCorrelationWhenArgsAreIdentical() { + double dist = HammingDistance.longMetric(12345L, 12345L); + + assertEquals(1.0, dist); + } + + @Test + void shouldReturnCorrectCorrelation() { + double dist = HammingDistance.longMetric(12345L, 54321L); + + assertEquals(0.921875, dist); + } +} diff --git a/algo/src/test/java/org/neo4j/gds/similarity/knn/metrics/NormalizedAbsoluteDifferenceTest.java b/algo/src/test/java/org/neo4j/gds/similarity/knn/metrics/NormalizedAbsoluteDifferenceTest.java new file mode 100644 index 00000000000..6c4a9b05a3b --- /dev/null +++ b/algo/src/test/java/org/neo4j/gds/similarity/knn/metrics/NormalizedAbsoluteDifferenceTest.java @@ -0,0 +1,33 @@ +/* + * Copyright (c) "Neo4j" + * Neo4j Sweden AB [http://neo4j.com] + * + * This file is part of Neo4j. + * + * Neo4j is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +package org.neo4j.gds.similarity.knn.metrics; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class NormalizedAbsoluteDifferenceTest { + @Test + void shouldComputeNormalizedAbsoluteDifference() { + double diff = NormalizedAbsoluteDifference.longMetric(1L, 2L); + + assertEquals(0.5, diff); /* 1.0 / (1.0 + |1 - 2|) */ + } +} diff --git a/algo/src/test/java/org/neo4j/gds/similarity/knn/metrics/SimilarityComputerTest.java b/algo/src/test/java/org/neo4j/gds/similarity/knn/metrics/SimilarityComputerTest.java index 7bec193e48a..3fa1ea8b033 100644 --- a/algo/src/test/java/org/neo4j/gds/similarity/knn/metrics/SimilarityComputerTest.java +++ b/algo/src/test/java/org/neo4j/gds/similarity/knn/metrics/SimilarityComputerTest.java @@ -66,16 +66,22 @@ void doublePropertySimilarityReturnsValuesBetween0And1(@ForAll @From("differentV } @Property - void longPropertySimilarityReturns1ForEqualValues(@ForAll @Positive long id) { + void longPropertySimilarityReturns1ForEqualValues( + @ForAll @Positive long id, + @ForAll @From("longMetrics") SimilarityMetric similarityMetric + ) { NodePropertyValues props = new LongTestPropertyValues(nodeId -> nodeId); - var sim = SimilarityComputer.ofLongProperty(props); + var sim = SimilarityComputer.ofLongProperty("", props, similarityMetric); assertThat(sim.similarity(id, id)).isEqualTo(1.0); } @Property - void longPropertySimilarityReturnsValuesBetween0And1(@ForAll @From("differentValues") LongLongPair ids) { + void longPropertySimilarityReturnsValuesBetween0And1( + @ForAll @From("differentValues") LongLongPair ids, + @ForAll @From("longMetrics") SimilarityMetric similarityMetric + ) { NodePropertyValues props = new LongTestPropertyValues(nodeId -> nodeId); - var sim = SimilarityComputer.ofLongProperty(props); + var sim = SimilarityComputer.ofLongProperty("", props, similarityMetric); assertThat(sim.similarity(ids.getOne(), ids.getTwo())).isStrictlyBetween(0.0, 1.0); } @@ -305,6 +311,11 @@ final Arbitrary 
differentValues() { .map(n2 -> PrimitiveTuples.pair((long) n1, (long) n2))); } + @Provide("longMetrics") + final Arbitrary<SimilarityMetric> longMetrics() { + return Arbitraries.of(SimilarityMetric.NORMALIZED_ABSOLUTE_DIFFERENCE, SimilarityMetric.HAMMING_DISTANCE); + } + @Provide("longArrayMetrics") final Arbitrary<SimilarityMetric> longArrayMetrics() { return Arbitraries.of(SimilarityMetric.JACCARD, SimilarityMetric.OVERLAP);