Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix right candidate selection for large cluster refinement #20

Merged
merged 4 commits into from
Sep 27, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ subprojects {

dependencies {
"testImplementation"("org.junit.jupiter:junit-jupiter-api:5.3.0")
"testImplementation"("org.junit.jupiter:junit-jupiter-params:5.3.0")
"testRuntimeOnly"("org.junit.jupiter:junit-jupiter-engine:5.3.0")
"testImplementation"(group = "org.assertj", name = "assertj-core", version = "3.11.1")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
import lombok.Value;
import lombok.experimental.FieldDefaults;
import lombok.experimental.Wither;
import org.apache.commons.lang3.tuple.Pair;


/**
Expand Down Expand Up @@ -131,19 +132,25 @@ private static double scoreClustering(final byte[] partitions, final double[][]
return score;
}

private static List<WeightedEdge> getRandomEdges(final int potentialNumEdges, final int desiredNumEdges) {
final List<WeightedEdge> weightedEdges;
weightedEdges = RANDOM.ints(0, potentialNumEdges)
.distinct()
.limit(desiredNumEdges)
.mapToObj(i -> {
// reverse of Gaussian
int leftIndex = (int) (Math.sqrt(i + 0.25) - 0.5);
int rightIndex = i - getNumEdges(leftIndex) + leftIndex;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Second error, it should just be int rightIndex = i - getNumEdges(leftIndex);

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Explanation: leftIndex and rightIndex should take all possible values where leftIndex >= rightIndex, leftIndex < numNodes, and rightIndex < numNodes.
i is a random edge index with i < numEdges.
leftIndex is the largest node such that leftIndex * (leftIndex + 1) / 2 <= i.
rightIndex is then the remainder, i.e. i - leftIndex * (leftIndex + 1) / 2.

Examples:

i leftIndex rightIndex
0 0 0
1 1 0
2 1 1
3 2 0
4 2 1
5 2 2
6 3 0
7 3 1
8 3 2
9 3 3
10 4 0
11 4 1
12 4 2
13 4 3
14 4 4
15 5 0
16 5 1
17 5 2
18 5 3
19 5 4
20 5 5
21 6 0
22 6 1
23 6 2
24 6 3
25 6 4
26 6 5
27 6 6

return WeightedEdge.of(leftIndex, rightIndex, Double.NaN);
})
.collect(Collectors.toList());
return weightedEdges;
/**
 * Draws random, pairwise distinct edges that do not connect a node with itself.
 *
 * <p>Distinct edge indices are sampled from {@code [0, potentialNumEdges)}, translated into index pairs with
 * {@link #createGaussPair(int)}, and pairs whose left and right index are equal are discarded. Every returned edge
 * carries {@link Double#NaN} as its weight.
 *
 * @param potentialNumEdges exclusive upper bound of the edge indices to sample from
 * @param desiredNumEdges maximum number of edges to return
 * @return at most {@code desiredNumEdges} edges; fewer if the index range does not contain that many non-self
 * pairs, and an empty list if {@code potentialNumEdges} is not positive
 */
static List<WeightedEdge> getRandomEdges(final int potentialNumEdges, final int desiredNumEdges) {
    if (potentialNumEdges <= 0) {
        // ints(0, bound) rejects an empty range with an IllegalArgumentException; there is nothing to draw from.
        return new java.util.ArrayList<>();
    }
    return RANDOM.ints(0, potentialNumEdges)
            .distinct()
            // Only potentialNumEdges distinct indices exist. Without this cap the infinite source would be
            // polled forever whenever fewer than desiredNumEdges pairs survive the self-pair filter below.
            .limit(potentialNumEdges)
            .mapToObj(RefineCluster::createGaussPair)
            .filter(RefineCluster::isNotSelfPair)
            .map(p -> WeightedEdge.of(p.getLeft(), p.getRight(), Double.NaN))
            .limit(desiredNumEdges)
            .collect(Collectors.toList());
}

/**
 * Tells whether the two components of a pair differ.
 *
 * @param pair the pair to inspect
 * @param <T> the common type of both components
 * @return {@code false} if the left component {@code equals} the right component, {@code true} otherwise
 */
private static <T> boolean isNotSelfPair(final Pair<T, T> pair) {
    final T left = pair.getLeft();
    final T right = pair.getRight();
    final boolean selfPair = left.equals(right);
    return !selfPair;
}

/**
 * Translates a linear edge index into a pair of indices by inverting the Gaussian sum.
 *
 * <p>The left index is the largest value for which {@code leftIndex * (leftIndex + 1) / 2 <= i}; the right index
 * is what remains of {@code i} after subtracting {@code getNumEdges(leftIndex + 1)}. For example, the indices
 * 0, 1, 2, 3 are expected to map to (0, 0), (1, 0), (1, 1), (2, 0). Pairs with equal components are produced as
 * well; callers that do not want them have to filter them out.
 *
 * @param i the non-negative linear edge index
 * @return the pair of left and right index
 */
static Pair<Integer, Integer> createGaussPair(final int i) {
    // reverse of Gaussian; the double literal keeps 2 * i from wrapping around in int arithmetic
    // for i > Integer.MAX_VALUE / 2, which would feed a negative value into sqrt
    final int leftIndex = (int) (Math.sqrt(2.0 * i + 0.25) - 0.5);
    // NOTE(review): getNumEdges is defined outside this view; presumably getNumEdges(n) == n * (n - 1) / 2 - confirm
    final int rightIndex = i - getNumEdges(leftIndex + 1);
    return Pair.of(leftIndex, rightIndex);
}

private List<ClassifiedCandidate<T>> getRelevantClassifications(final Cluster<C, ? super T> cluster,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package com.bakdata.dedupe.clustering;

import org.apache.commons.lang3.tuple.Pair;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;

import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.IntStream;
import java.util.stream.Stream;

import static com.bakdata.dedupe.clustering.RefineCluster.createGaussPair;
import static com.bakdata.dedupe.clustering.RefineCluster.getRandomEdges;
import static org.assertj.core.api.Assertions.assertThat;

/**
 * Tests for the edge sampling helpers of {@link RefineCluster}.
 */
class RefineClusterTest {

    /**
     * Enumerates every pair {@code (leftIndex, rightIndex)} with {@code 0 <= rightIndex <= leftIndex < 7} in
     * ascending order and numbers the pairs consecutively, starting at 0.
     *
     * @return arguments of the form (running index, left index, right index)
     */
    static Stream<Arguments> generateGaussPairs() {
        final int n = 7;
        final Stream.Builder<Arguments> cases = Stream.builder();
        int i = 0;
        for (int leftIndex = 0; leftIndex < n; leftIndex++) {
            for (int rightIndex = 0; rightIndex <= leftIndex; rightIndex++) {
                cases.add(Arguments.of(i, leftIndex, rightIndex));
                i++;
            }
        }
        return cases.build();
    }

    /**
     * The pair created for a running index has to match the enumerated left and right index.
     */
    @ParameterizedTest
    @MethodSource("generateGaussPairs")
    void shouldCreateCorrectGaussPair(final int i, final int leftIndex, final int rightIndex) {
        final Pair<Integer, Integer> actual = createGaussPair(i);
        assertThat(actual)
                .as("%d should generate %d and %d", i, leftIndex, rightIndex)
                .satisfies(edge -> {
                    assertThat(edge.getLeft()).isEqualTo(leftIndex);
                    assertThat(edge.getRight()).isEqualTo(rightIndex);
                });
    }

    /**
     * Requesting 45 edges out of 55 potential edge indices has to yield exactly 45 edges.
     */
    @Test
    void shouldCreateCorrectNumberOfRandomEdges() {
        final int desiredNumEdges = 45; // gaussian sum of 10
        final int potentialNumEdges = 55; // gaussian sum of 11
        assertThat(getRandomEdges(potentialNumEdges, desiredNumEdges).size())
                .isEqualTo(desiredNumEdges);
    }

}