From 9cbabcb81d22f60e531e1559810936b560c5108f Mon Sep 17 00:00:00 2001 From: Fabian Paul Date: Thu, 26 Sep 2019 18:25:22 +0200 Subject: [PATCH 1/4] Fix right candidate selection for large cluster refinement --- .../main/java/com/bakdata/dedupe/clustering/RefineCluster.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/src/main/java/com/bakdata/dedupe/clustering/RefineCluster.java b/common/src/main/java/com/bakdata/dedupe/clustering/RefineCluster.java index e6b20a5..86e0f30 100644 --- a/common/src/main/java/com/bakdata/dedupe/clustering/RefineCluster.java +++ b/common/src/main/java/com/bakdata/dedupe/clustering/RefineCluster.java @@ -139,7 +139,7 @@ private static List getRandomEdges(final int potentialNumEdges, fi .mapToObj(i -> { // reverse of Gaussian int leftIndex = (int) (Math.sqrt(i + 0.25) - 0.5); - int rightIndex = i - getNumEdges(leftIndex) + leftIndex; + int rightIndex = (int) (0.5 * (i - getNumEdges(leftIndex) + leftIndex)); return WeightedEdge.of(leftIndex, rightIndex, Double.NaN); }) .collect(Collectors.toList()); From d21a39c18ff1b16ca6d3b90b95f1d37d0bd16c1e Mon Sep 17 00:00:00 2001 From: Fabian Paul Date: Fri, 27 Sep 2019 09:15:12 +0200 Subject: [PATCH 2/4] Fix formula --- .../java/com/bakdata/dedupe/clustering/RefineCluster.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/src/main/java/com/bakdata/dedupe/clustering/RefineCluster.java b/common/src/main/java/com/bakdata/dedupe/clustering/RefineCluster.java index 86e0f30..79a65cf 100644 --- a/common/src/main/java/com/bakdata/dedupe/clustering/RefineCluster.java +++ b/common/src/main/java/com/bakdata/dedupe/clustering/RefineCluster.java @@ -138,8 +138,8 @@ private static List getRandomEdges(final int potentialNumEdges, fi .limit(desiredNumEdges) .mapToObj(i -> { // reverse of Gaussian - int leftIndex = (int) (Math.sqrt(i + 0.25) - 0.5); - int rightIndex = (int) (0.5 * (i - getNumEdges(leftIndex) + leftIndex)); + int leftIndex = (int) (Math.sqrt(2 * i + 0.25) - 0.5); + int rightIndex = i - getNumEdges(leftIndex); return WeightedEdge.of(leftIndex, rightIndex, Double.NaN); }) .collect(Collectors.toList()); From 2e41aaa3517a618f7bc6fd16c22955252a66f131 Mon Sep 17 00:00:00 2001 From: Philipp Schirmer Date: Fri, 27 Sep 2019 10:58:42 +0200 Subject: [PATCH 3/4] Fix random edge computation --- build.gradle.kts | 2 +- .../dedupe/clustering/RefineCluster.java | 33 ++++++++----- .../dedupe/clustering/RefineClusterTest.java | 47 +++++++++++++++++++ 3 files changed, 68 insertions(+), 14 deletions(-) create mode 100644 common/src/test/java/com/bakdata/dedupe/clustering/RefineClusterTest.java diff --git a/build.gradle.kts b/build.gradle.kts index eb60eea..5f0f053 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -81,8 +81,8 @@ subprojects { dependencies { "testImplementation"("org.junit.jupiter:junit-jupiter-api:5.3.0") + "testImplementation"("org.junit.jupiter:junit-jupiter-params:5.3.0") "testRuntimeOnly"("org.junit.jupiter:junit-jupiter-engine:5.3.0") - "testImplementation"(group = "org.assertj", name = "assertj-core", version = "3.11.1") "compileOnly"("org.projectlombok:lombok:1.18.6") "annotationProcessor"("org.projectlombok:lombok:1.18.6") diff --git a/common/src/main/java/com/bakdata/dedupe/clustering/RefineCluster.java b/common/src/main/java/com/bakdata/dedupe/clustering/RefineCluster.java index 79a65cf..63c0ef3 100644 --- a/common/src/main/java/com/bakdata/dedupe/clustering/RefineCluster.java +++ b/common/src/main/java/com/bakdata/dedupe/clustering/RefineCluster.java @@ -55,6 +55,7 @@ import lombok.Value; import lombok.experimental.FieldDefaults; import lombok.experimental.Wither; +import org.apache.commons.lang3.tuple.Pair; /** @@ -131,19 +132,25 @@ private static double scoreClustering(final byte[] partitions, final double[][] return score; } - private static List getRandomEdges(final int potentialNumEdges, final int desiredNumEdges) { - final List weightedEdges; - weightedEdges = RANDOM.ints(0, potentialNumEdges) - .distinct() - .limit(desiredNumEdges) - .mapToObj(i -> { - // reverse of Gaussian - int leftIndex = (int) (Math.sqrt(2 * i + 0.25) - 0.5); - int rightIndex = i - getNumEdges(leftIndex); - return WeightedEdge.of(leftIndex, rightIndex, Double.NaN); - }) - .collect(Collectors.toList()); - return weightedEdges; + static List getRandomEdges(final int potentialNumEdges, final int desiredNumEdges) { + return RANDOM.ints(0, potentialNumEdges) + .distinct() + .mapToObj(RefineCluster::createGaussPair) + .filter(RefineCluster::isNotSelfPair) + .map(p -> WeightedEdge.of(p.getLeft(), p.getRight(), Double.NaN)) + .limit(desiredNumEdges) + .collect(Collectors.toList()); + } + + private static boolean isNotSelfPair(final Pair pair) { + return !pair.getLeft().equals(pair.getRight()); + } + + static Pair createGaussPair(final int i) { + // reverse of Gaussian + final int leftIndex = (int) (Math.sqrt(2 * i + 0.25) - 0.5); + final int rightIndex = i - getNumEdges(leftIndex + 1); + return Pair.of(leftIndex, rightIndex); } private List> getRelevantClassifications(final Cluster cluster, diff --git a/common/src/test/java/com/bakdata/dedupe/clustering/RefineClusterTest.java b/common/src/test/java/com/bakdata/dedupe/clustering/RefineClusterTest.java new file mode 100644 index 0000000..c16aa99 --- /dev/null +++ b/common/src/test/java/com/bakdata/dedupe/clustering/RefineClusterTest.java @@ -0,0 +1,47 @@ +package com.bakdata.dedupe.clustering; + +import org.apache.commons.lang3.tuple.Pair; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +import static com.bakdata.dedupe.clustering.RefineCluster.createGaussPair; +import static com.bakdata.dedupe.clustering.RefineCluster.getRandomEdges; +import static org.assertj.core.api.Assertions.assertThat; + +class RefineClusterTest { + + static Stream generateGaussPairs() { + final int n = 7; + final AtomicInteger i = new AtomicInteger(); + return IntStream.range(0, n) + .boxed() + .flatMap(leftIndex -> IntStream.rangeClosed(0, leftIndex) + .boxed() + .map(rightIndex -> Pair.of(leftIndex, rightIndex))) + .map(p -> Arguments.of(i.getAndIncrement(), p.getLeft(), p.getRight())); + } + + @ParameterizedTest + @MethodSource("generateGaussPairs") + void shouldCreateCorrectGaussPair(final int i, final int leftIndex, final int rightIndex) { + assertThat(createGaussPair(i)) + .as("%d should generate %d and %d", i, leftIndex, rightIndex) + .satisfies(edge -> assertThat(edge.getLeft()).isEqualTo(leftIndex)) + .satisfies(edge -> assertThat(edge.getRight()).isEqualTo(rightIndex)); + } + + @Test + void shouldCreateCorrectNumberOfRandomEdges() { + final int potentialNumEdges = 55; // gaussian sum of 11 + final int desiredNumEdges = 45; // gaussian sum of 10 + assertThat(getRandomEdges(potentialNumEdges, desiredNumEdges)) + .hasSize(desiredNumEdges); + } + +} \ No newline at end of file From b8bae1d0c26f3d1ca0eda82b98a5eb80d8cfb642 Mon Sep 17 00:00:00 2001 From: Philipp Schirmer Date: Fri, 27 Sep 2019 11:01:28 +0200 Subject: [PATCH 4/4] Revert assertj --- build.gradle.kts | 1 + 1 file changed, 1 insertion(+) diff --git a/build.gradle.kts b/build.gradle.kts index 5f0f053..2645589 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -83,6 +83,7 @@ subprojects { "testImplementation"("org.junit.jupiter:junit-jupiter-api:5.3.0") "testImplementation"("org.junit.jupiter:junit-jupiter-params:5.3.0") "testRuntimeOnly"("org.junit.jupiter:junit-jupiter-engine:5.3.0") + "testImplementation"(group = "org.assertj", name = "assertj-core", version = "3.11.1") "compileOnly"("org.projectlombok:lombok:1.18.6") "annotationProcessor"("org.projectlombok:lombok:1.18.6")