Skip to content

Commit

Permalink
Add Hamming Distance knn similarity metric for long property
Browse files Browse the repository at this point in the history
+ tests
  • Loading branch information
htmlboss committed May 29, 2022
1 parent 255e450 commit 3a6e285
Show file tree
Hide file tree
Showing 11 changed files with 219 additions and 17 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Copyright (c) "Neo4j"
* Neo4j Sweden AB [http://neo4j.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.gds.similarity.knn.metrics;

/**
 * Similarity metric for {@code long} values derived from the Hamming distance
 * (https://en.wikipedia.org/wiki/Hamming_distance).
 *
 * The distance is the number of bit positions in which the two values differ.
 * It is scaled linearly into the range [0, 1] and inverted, so that
 * 1.0 means "identical bit patterns" and 0.0 means "every bit differs".
 */
public final class HammingDistance {
    /** Number of bits in a Java {@code long}; the largest possible Hamming distance. */
    private static final double BITS_PER_LONG = Long.SIZE;

    private HammingDistance() {}

    /**
     * Computes the Hamming-distance based similarity of two {@code long} values.
     *
     * Unity-based normalization {@code (x - min) / (max - min)} is applied to the
     * number of differing bits with {@code min = 0} and {@code max = 64}; the
     * normalized distance is then subtracted from 1.0 to turn it into a similarity.
     *
     * @param left  first value
     * @param right second value
     * @return a similarity in [0, 1]; 1.0 when both values are equal
     */
    public static double longMetric(long left, long right) {
        // XOR leaves a 1 in exactly those positions where the operands disagree.
        int differingBits = Long.bitCount(left ^ right);
        return 1.0 - differingBits / BITS_PER_LONG;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,22 +24,20 @@

/**
 * {@link SimilarityComputer} for a node property of type LONG: reads the property value
 * of both nodes and passes the pair to the configured {@link LongPropertySimilarityMetric}.
 */
final class LongPropertySimilarityComputer implements SimilarityComputer {
private final NodePropertyValues nodePropertyValues;
private final LongPropertySimilarityMetric metric;

// NOTE(review): this listing looks like a rendered diff without +/- markers; the
// one-argument constructor header appears to be the removed line and the two-argument
// header its replacement — confirm against the actual file.
LongPropertySimilarityComputer(NodePropertyValues nodePropertyValues) {
LongPropertySimilarityComputer(NodePropertyValues nodePropertyValues, LongPropertySimilarityMetric metric) {
// Refuse property stores whose declared value type is anything other than LONG.
if (nodePropertyValues.valueType() != ValueType.LONG) {
throw new IllegalArgumentException("The property is not of type LONG");
}
this.nodePropertyValues = nodePropertyValues;
this.metric = metric;
}

/**
 * Looks up the long property value of each node and returns the metric's result for the pair.
 */
@Override
public double similarity(long firstNodeId, long secondNodeId) {
var left = nodePropertyValues.longValue(firstNodeId);
var right = nodePropertyValues.longValue(secondNodeId);
// NOTE(review): the inline 1 / (1 + |left - right|) computation below appears to be the
// removed pre-change code, superseded by the metric.compute call — confirm.
var abs = Math.abs(left - right);
if (abs == Long.MIN_VALUE) {
abs = Long.MAX_VALUE;
}
return 1.0 / (1.0 + abs);
return metric.compute(left, right);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/*
* Copyright (c) "Neo4j"
* Neo4j Sweden AB [http://neo4j.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.gds.similarity.knn.metrics;
/**
 * A similarity function over two primitive {@code long} property values.
 *
 * Kept as a dedicated primitive interface (rather than a generic function type)
 * so implementations can be supplied as lambdas or method references without boxing.
 */
@FunctionalInterface
interface LongPropertySimilarityMetric {
    /**
     * Computes the similarity of two property values.
     *
     * @param left  property value of the first node
     * @param right property value of the second node
     * @return the similarity score of the two values
     */
    double compute(long left, long right);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
* Copyright (c) "Neo4j"
* Neo4j Sweden AB [http://neo4j.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.gds.similarity.knn.metrics;

/**
 * Similarity metric for {@code long} values based on their absolute difference:
 * {@code 1 / (1 + |left - right|)}. Equal values yield 1.0 and the result
 * approaches 0.0 as the values move apart.
 */
public final class NormalizedAbsoluteDifference {
    private NormalizedAbsoluteDifference() {}

    /**
     * Computes {@code 1 / (1 + |left - right|)}.
     *
     * The subtraction is checked for overflow: whenever the true absolute
     * difference does not fit into a {@code long}, it is saturated to
     * {@link Long#MAX_VALUE} instead of silently wrapping around (which would
     * make far-apart values such as {@code Long.MAX_VALUE} and
     * {@code Long.MIN_VALUE} look almost identical).
     *
     * @param left  first value
     * @param right second value
     * @return a similarity in (0, 1]; 1.0 when both values are equal
     */
    public static double longMetric(long left, long right) {
        long difference = left - right;
        // Same overflow test as Math.subtractExact: the subtraction overflowed iff the
        // operands have different signs and the result's sign differs from the left operand's.
        boolean overflowed = ((left ^ right) & (left ^ difference)) < 0;
        // Math.abs(Long.MIN_VALUE) is still negative, so that value must be saturated as well.
        long abs = (overflowed || difference == Long.MIN_VALUE)
            ? Long.MAX_VALUE
            : Math.abs(difference);
        return 1.0 / (1.0 + abs);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,11 @@ static SimilarityComputer ofProperty(
) {
switch (properties.valueType()) {
case LONG:
return ofLongProperty(properties);
return ofLongProperty(
name,
properties,
defaultSimilarityMetric
);
case DOUBLE:
return ofDoubleProperty(properties);
case DOUBLE_ARRAY:
Expand Down Expand Up @@ -107,8 +111,15 @@ static SimilarityComputer ofDoubleProperty(NodePropertyValues nodePropertyValues
return new DoublePropertySimilarityComputer(nodePropertyValues);
}

static SimilarityComputer ofLongProperty(NodePropertyValues nodePropertyValues) {
return new LongPropertySimilarityComputer(nodePropertyValues);
/**
 * Creates the similarity computer for a long-valued property.
 *
 * Two metrics are accepted: NORMALIZED_ABSOLUTE_DIFFERENCE and HAMMING_DISTANCE.
 * Any other metric is rejected with the exception built by
 * {@code unsupportedSimilarityMetric}.
 *
 * @param name       property name, only used when reporting an unsupported metric
 * @param properties the property values to compare
 * @param metric     the requested similarity metric
 */
static SimilarityComputer ofLongProperty(String name, NodePropertyValues properties, SimilarityMetric metric) {
    switch (metric) {
        case NORMALIZED_ABSOLUTE_DIFFERENCE:
            return new LongPropertySimilarityComputer(properties, NormalizedAbsoluteDifference::longMetric);
        case HAMMING_DISTANCE:
            return new LongPropertySimilarityComputer(properties, HammingDistance::longMetric);
        default:
            // Every remaining metric has no definition for a single long value.
            throw unsupportedSimilarityMetric(name, properties.valueType(), metric);
    }
}

static SimilarityComputer ofFloatArrayProperty(String name, NodePropertyValues properties, SimilarityMetric metric) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
import static org.neo4j.gds.utils.StringFormatting.toUpperCaseWithLocale;

public enum SimilarityMetric {
JACCARD, OVERLAP, COSINE, EUCLIDEAN, PEARSON, LONG_PROPERTY_METRIC, DOUBLE_PROPERTY_METRIC, DEFAULT;
JACCARD, OVERLAP, COSINE, EUCLIDEAN, PEARSON,
NORMALIZED_ABSOLUTE_DIFFERENCE, DOUBLE_PROPERTY_METRIC,
HAMMING_DISTANCE, DEFAULT;

public static SimilarityMetric parse(String value) {
return SimilarityMetric.valueOf(toUpperCaseWithLocale(value));
Expand All @@ -34,7 +36,7 @@ public static SimilarityMetric parse(String value) {
public static SimilarityMetric defaultMetricForType(ValueType valueType) {
switch (valueType) {
case LONG:
return LONG_PROPERTY_METRIC;
return NORMALIZED_ABSOLUTE_DIFFERENCE;
case DOUBLE:
return DOUBLE_PROPERTY_METRIC;
case DOUBLE_ARRAY:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ public long size() {
idMap,
"myProperty",
nodeProperties,
SimilarityMetric.LONG_PROPERTY_METRIC
SimilarityMetric.NORMALIZED_ABSOLUTE_DIFFERENCE
);

var random = new SplittableRandom();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ public long size() {
idMap,
"myProperty",
nodeProperties,
SimilarityMetric.LONG_PROPERTY_METRIC
SimilarityMetric.NORMALIZED_ABSOLUTE_DIFFERENCE
);

var random = new SplittableRandom();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Copyright (c) "Neo4j"
* Neo4j Sweden AB [http://neo4j.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.gds.similarity.knn.metrics;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Checks the similarity values produced by {@link HammingDistance#longMetric(long, long)}.
 */
class HammingDistanceTest {
    // Identical inputs agree in every bit, so the similarity must be exactly 1.0.
    @Test
    void shouldReturnFullCorrelationWhenArgsAreIdentical() {
        double similarity = HammingDistance.longMetric(12345L, 12345L);

        assertEquals(1.0, similarity);
    }

    // 12345 and 54321 differ in 5 of 64 bits: 1 - 5/64 = 0.921875.
    @Test
    void shouldReturnCorrectCorrelation() {
        double similarity = HammingDistance.longMetric(12345L, 54321L);

        assertEquals(0.921875, similarity);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* Copyright (c) "Neo4j"
* Neo4j Sweden AB [http://neo4j.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.gds.similarity.knn.metrics;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Unit test for {@link NormalizedAbsoluteDifference#longMetric(long, long)}.
 */
class NormalizedAbsoluteDifferenceTest {
    // |1 - 2| = 1, hence the expected similarity is 1 / (1 + 1) = 0.5
    // (1.0 is only reached for identical values).
    @Test
    void shouldComputeNormalizedAbsoluteDifference() {
        double diff = NormalizedAbsoluteDifference.longMetric(1L, 2L);

        assertEquals(0.5, diff);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -66,16 +66,22 @@ void doublePropertySimilarityReturnsValuesBetween0And1(@ForAll @From("differentV
}

// Property: comparing a node id with itself must yield a similarity of exactly 1.0
// for every metric drawn from the "longMetrics" provider.
// NOTE(review): this listing looks like a rendered diff without +/- markers; the
// single-parameter header and the one-argument ofLongProperty call appear to be the
// removed lines — confirm against the actual file.
@Property
void longPropertySimilarityReturns1ForEqualValues(@ForAll @Positive long id) {
void longPropertySimilarityReturns1ForEqualValues(
@ForAll @Positive long id,
@ForAll @From("longMetrics") SimilarityMetric similarityMetric
) {
// presumably exposes each node id as its own property value — verify LongTestPropertyValues
NodePropertyValues props = new LongTestPropertyValues(nodeId -> nodeId);
var sim = SimilarityComputer.ofLongProperty(props);
var sim = SimilarityComputer.ofLongProperty("", props, similarityMetric);
assertThat(sim.similarity(id, id)).isEqualTo(1.0);
}

// Property: for a pair from the "differentValues" provider, the similarity must lie
// strictly between 0.0 and 1.0 for every metric drawn from the "longMetrics" provider.
// NOTE(review): HammingDistance.longMetric returns exactly 0.0 when all 64 bits differ
// (e.g. 0 and -1), which the strict lower bound would reject; presumably the
// "differentValues" provider never produces such a pair — verify.
// NOTE(review): this listing looks like a rendered diff without +/- markers; the
// single-parameter header and the one-argument ofLongProperty call appear to be the
// removed lines — confirm against the actual file.
@Property
void longPropertySimilarityReturnsValuesBetween0And1(@ForAll @From("differentValues") LongLongPair ids) {
void longPropertySimilarityReturnsValuesBetween0And1(
@ForAll @From("differentValues") LongLongPair ids,
@ForAll @From("longMetrics") SimilarityMetric similarityMetric
) {
// presumably exposes each node id as its own property value — verify LongTestPropertyValues
NodePropertyValues props = new LongTestPropertyValues(nodeId -> nodeId);
var sim = SimilarityComputer.ofLongProperty(props);
var sim = SimilarityComputer.ofLongProperty("", props, similarityMetric);
assertThat(sim.similarity(ids.getOne(), ids.getTwo())).isStrictlyBetween(0.0, 1.0);
}

Expand Down Expand Up @@ -305,6 +311,11 @@ final Arbitrary<LongLongPair> differentValues() {
.map(n2 -> PrimitiveTuples.pair((long) n1, (long) n2)));
}

// Supplies the similarity metrics exercised by the long-property tests.
@Provide("longMetrics")
final Arbitrary<SimilarityMetric> longMetrics() {
    final Arbitrary<SimilarityMetric> metricsForLongProperties = Arbitraries.of(
        SimilarityMetric.NORMALIZED_ABSOLUTE_DIFFERENCE,
        SimilarityMetric.HAMMING_DISTANCE
    );
    return metricsForLongProperties;
}

@Provide("longArrayMetrics")
final Arbitrary<SimilarityMetric> longArrayMetrics() {
return Arbitraries.of(SimilarityMetric.JACCARD, SimilarityMetric.OVERLAP);
Expand Down

0 comments on commit 3a6e285

Please sign in to comment.