Skip to content

Commit

Permalink
Merge pull request #281 from SimDing/readd-clustering
Browse files Browse the repository at this point in the history
Concerning issue #116. This adds the clustering that was removed in #89 again, not including any user interfaces.
The clustering is implemented in the de.jplag.clustering package. It includes two clustering algorithms (spectral and agglomerative), preprocessing, decoupling logic, clustering options and a factory class which can be used to run the clustering in just two statements.
  • Loading branch information
tsaglam authored Feb 24, 2022
2 parents ea7970d + 6d82ee7 commit bffc5c9
Show file tree
Hide file tree
Showing 44 changed files with 3,569 additions and 48 deletions.
11 changes: 11 additions & 0 deletions jplag/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,17 @@
<artifactId>system-rules</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
<version>3.6.1</version>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>4.0.0</version>
<scope>test</scope>
</dependency>

<dependency>
<groupId>de.jplag</groupId>
Expand Down
59 changes: 45 additions & 14 deletions jplag/src/main/java/de/jplag/CLI.java
Original file line number Diff line number Diff line change
@@ -1,30 +1,24 @@
package de.jplag;

import static de.jplag.CommandLineArgument.BASE_CODE;
import static de.jplag.CommandLineArgument.COMPARISON_MODE;
import static de.jplag.CommandLineArgument.DEBUG;
import static de.jplag.CommandLineArgument.EXCLUDE_FILE;
import static de.jplag.CommandLineArgument.LANGUAGE;
import static de.jplag.CommandLineArgument.MIN_TOKEN_MATCH;
import static de.jplag.CommandLineArgument.RESULT_FOLDER;
import static de.jplag.CommandLineArgument.ROOT_DIRECTORY;
import static de.jplag.CommandLineArgument.SHOWN_COMPARISONS;
import static de.jplag.CommandLineArgument.SIMILARITY_THRESHOLD;
import static de.jplag.CommandLineArgument.SUBDIRECTORY;
import static de.jplag.CommandLineArgument.SUFFIXES;
import static de.jplag.CommandLineArgument.VERBOSITY;
import static de.jplag.CommandLineArgument.*;

import java.io.File;
import java.util.Optional;
import java.util.Random;

import net.sourceforge.argparse4j.ArgumentParsers;
import net.sourceforge.argparse4j.inf.ArgumentParser;
import net.sourceforge.argparse4j.inf.ArgumentParserException;
import net.sourceforge.argparse4j.inf.Namespace;

import de.jplag.clustering.ClusteringAlgorithm;
import de.jplag.clustering.ClusteringOptions;
import de.jplag.clustering.Preprocessing;
import de.jplag.clustering.algorithm.InterClusterSimilarity;
import de.jplag.exceptions.ExitException;
import de.jplag.options.JPlagOptions;
import de.jplag.options.LanguageOption;
import de.jplag.options.SimilarityMetric;
import de.jplag.options.Verbosity;
import de.jplag.reporting.Report;
import de.jplag.strategy.ComparisonMode;
Expand All @@ -42,6 +36,8 @@ public class CLI {
"More Abstract than Tree", "Students Nightmare", "No, changing variable names does not work", "The tech is out there!"};

private static final String PROGRAM_NAME = "jplag";
static final String CLUSTERING_GROUP_NAME = "Clustering";
static final String CLUSTERING_PREPROCESSING_GROUP_NAME = "Clustering - Preprocessing";

private final ArgumentParser parser;

Expand Down Expand Up @@ -71,8 +67,9 @@ public static void main(String[] args) {
*/
public CLI() {
    parser = ArgumentParsers.newFor(PROGRAM_NAME).build().defaultHelp(true).description(generateDescription());
    // A single helper is shared by all arguments so that arguments naming the same group
    // are added to the same group instance instead of each creating their own.
    CliGroupHelper groupHelper = new CliGroupHelper(parser);
    for (CommandLineArgument argument : CommandLineArgument.values()) {
        // Only the two-argument variant is called; the former parseWith(parser) call was stale.
        argument.parseWith(parser, groupHelper);
    }
}

Expand Down Expand Up @@ -115,6 +112,40 @@ public JPlagOptions buildOptionsFromArguments(Namespace namespace) {
options.setMaximumNumberOfComparisons(SHOWN_COMPARISONS.getFrom(namespace));
ComparisonMode.fromName(COMPARISON_MODE.getFrom(namespace)).ifPresentOrElse(it -> options.setComparisonMode(it),
() -> System.out.println("Unknown comparison mode, using default mode!"));

ClusteringOptions.Builder clusteringBuilder = new ClusteringOptions.Builder();
// The flag behind CLUSTER_ENABLE is "--cluster-skip" (a store-true switch), so a set flag has to
// disable clustering. Passing the raw value to enabled(...) would invert the meaning of the flag.
Optional.ofNullable((Boolean) CLUSTER_ENABLE.getFrom(namespace)).ifPresent(skip -> clusteringBuilder.enabled(!skip));
Optional.ofNullable((ClusteringAlgorithm) CLUSTER_ALGORITHM.getFrom(namespace)).ifPresent(clusteringBuilder::algorithm);
Optional.ofNullable((SimilarityMetric) CLUSTER_METRIC.getFrom(namespace)).ifPresent(clusteringBuilder::similarityMetric);
Optional.ofNullable((Float) CLUSTER_SPECTRAL_BANDWIDTH.getFrom(namespace)).ifPresent(clusteringBuilder::spectralKernelBandwidth);
Optional.ofNullable((Float) CLUSTER_SPECTRAL_NOISE.getFrom(namespace)).ifPresent(clusteringBuilder::spectralGaussianProcessVariance);
Optional.ofNullable((Integer) CLUSTER_SPECTRAL_MIN_RUNS.getFrom(namespace)).ifPresent(clusteringBuilder::spectralMinRuns);
Optional.ofNullable((Integer) CLUSTER_SPECTRAL_MAX_RUNS.getFrom(namespace)).ifPresent(clusteringBuilder::spectralMaxRuns);
Optional.ofNullable((Integer) CLUSTER_SPECTRAL_KMEANS_ITERATIONS.getFrom(namespace))
.ifPresent(clusteringBuilder::spectralMaxKMeansIterationPerRun);
Optional.ofNullable((Float) CLUSTER_AGGLOMERATIVE_THRESHOLD.getFrom(namespace)).ifPresent(clusteringBuilder::agglomerativeThreshold);
Optional.ofNullable((InterClusterSimilarity) CLUSTER_AGGLOMERATIVE_INTER_CLUSTER_SIMILARITY.getFrom(namespace))
.ifPresent(clusteringBuilder::agglomerativeInterClusterSimilarity);
Optional.ofNullable((Boolean) CLUSTER_PREPROCESSING_NONE.getFrom(namespace)).ifPresent(none -> {
if (none) {
clusteringBuilder.preprocessor(Preprocessing.NONE);
}
});
Optional.ofNullable((Boolean) CLUSTER_PREPROCESSING_CDF.getFrom(namespace)).ifPresent(cdf -> {
if (cdf) {
clusteringBuilder.preprocessor(Preprocessing.CUMULATIVE_DISTRIBUTION_FUNCTION);
}
});
Optional.ofNullable((Float) CLUSTER_PREPROCESSING_PERCENTILE.getFrom(namespace)).ifPresent(percentile -> {
clusteringBuilder.preprocessor(Preprocessing.PERCENTILE);
clusteringBuilder.preprocessorPercentile(percentile);
});
Optional.ofNullable((Float) CLUSTER_PREPROCESSING_THRESHOLD.getFrom(namespace)).ifPresent(threshold -> {
    clusteringBuilder.preprocessor(Preprocessing.THRESHOLD);
    // The value belongs to the threshold preprocessor; it was previously stored as the percentile.
    // NOTE(review): assumes ClusteringOptions.Builder offers preprocessorThreshold(float) - confirm.
    clusteringBuilder.preprocessorThreshold(threshold);
});
options.setClusteringOptions(clusteringBuilder.build());

return options;
}

Expand Down
33 changes: 33 additions & 0 deletions jplag/src/main/java/de/jplag/CliGroupHelper.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package de.jplag;

import java.util.HashMap;
import java.util.Map;

import net.sourceforge.argparse4j.inf.ArgumentContainer;
import net.sourceforge.argparse4j.inf.ArgumentGroup;
import net.sourceforge.argparse4j.inf.ArgumentParser;
import net.sourceforge.argparse4j.inf.MutuallyExclusiveGroup;

/**
 * Can be used to automatically create and reuse {@link ArgumentGroup}s and {@link MutuallyExclusiveGroup}s through
 * their names only. This is useful when an {@link ArgumentParser} is not configured in an imperative fashion.
 * <p>
 * Groups are created on the wrapped parser the first time a name is requested and are cached per name afterwards, so
 * repeated requests with the same name return the same group.
 */
public class CliGroupHelper {

    private final ArgumentParser parser;
    // Caches are only ever filled, never replaced, hence final.
    private final Map<String, MutuallyExclusiveGroup> mutuallyExclusiveGroups = new HashMap<>();
    private final Map<String, ArgumentGroup> argumentGroups = new HashMap<>();

    /**
     * Creates a helper that registers its groups at the given parser.
     * @param parser is the parser new groups are added to.
     */
    public CliGroupHelper(ArgumentParser parser) {
        this.parser = parser;
    }

    /**
     * Returns the mutually exclusive group with the given name, creating it at the parser on first use.
     * @param name is the name identifying the group; it is also passed to the parser when the group is created.
     * @return the (possibly newly created) group as a container to add arguments to.
     */
    public ArgumentContainer getMutuallyExclusiveGroup(String name) {
        return mutuallyExclusiveGroups.computeIfAbsent(name, parser::addMutuallyExclusiveGroup);
    }

    /**
     * Returns the argument group with the given name, creating it at the parser on first use.
     * @param name is the name identifying the group; it is also passed to the parser when the group is created.
     * @return the (possibly newly created) group as a container to add arguments to.
     */
    public ArgumentContainer getArgumentGroup(String name) {
        return argumentGroups.computeIfAbsent(name, parser::addArgumentGroup);
    }

}
162 changes: 131 additions & 31 deletions jplag/src/main/java/de/jplag/CommandLineArgument.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package de.jplag;

import static de.jplag.CLI.CLUSTERING_GROUP_NAME;
import static de.jplag.CLI.CLUSTERING_PREPROCESSING_GROUP_NAME;
import static de.jplag.options.JPlagOptions.DEFAULT_COMPARISON_MODE;
import static de.jplag.options.JPlagOptions.DEFAULT_SHOWN_COMPARISONS;
import static de.jplag.options.JPlagOptions.DEFAULT_SIMILARITY_THRESHOLD;
Expand All @@ -9,62 +11,103 @@
import java.util.List;
import java.util.Optional;

import net.sourceforge.argparse4j.impl.Arguments;
import net.sourceforge.argparse4j.inf.Argument;
import net.sourceforge.argparse4j.inf.ArgumentAction;
import net.sourceforge.argparse4j.inf.ArgumentContainer;
import net.sourceforge.argparse4j.inf.ArgumentParser;
import net.sourceforge.argparse4j.inf.Namespace;

import de.jplag.clustering.ClusteringAlgorithm;
import de.jplag.clustering.ClusteringOptions;
import de.jplag.clustering.algorithm.InterClusterSimilarity;
import de.jplag.options.LanguageOption;
import de.jplag.options.SimilarityMetric;
import de.jplag.strategy.ComparisonMode;

/**
* Command line arguments for the JPlag CLI. Each argument is defined through an enum constant.
* @author Timur Saglam
*/
public enum CommandLineArgument {
ROOT_DIRECTORY("rootDir", NumberOfArgumentValues.ONE_OR_MORE_VALUES, String.class),
LANGUAGE("-l", String.class, LanguageOption.getDefault().getDisplayName(), LanguageOption.getAllDisplayNames()),

ROOT_DIRECTORY(new Builder("rootDir", String.class).nargs(NumberOfArgumentValues.ONE_OR_MORE_VALUES)),
LANGUAGE(new Builder("-l", String.class).defaultsTo(LanguageOption.getDefault().getDisplayName()).choices(LanguageOption.getAllDisplayNames())),
BASE_CODE("-bc", String.class),
VERBOSITY("-v", String.class, "quiet", List.of("quiet", "long")), // TODO SH: Replace verbosity when integrating a real logging library
VERBOSITY(new Builder("-v", String.class).defaultsTo("quiet").choices(List.of("quiet", "long"))), // TODO SH: Replace verbosity when integrating a
// real logging library
DEBUG("-d", Boolean.class),
SUBDIRECTORY("-S", String.class),
SUFFIXES("-p", String.class),
EXCLUDE_FILE("-x", String.class),
MIN_TOKEN_MATCH("-t", Integer.class),
SIMILARITY_THRESHOLD("-m", Float.class, DEFAULT_SIMILARITY_THRESHOLD),
SHOWN_COMPARISONS("-n", Integer.class, DEFAULT_SHOWN_COMPARISONS),
RESULT_FOLDER("-r", String.class, "result"),
COMPARISON_MODE("-c", String.class, DEFAULT_COMPARISON_MODE.getName(), ComparisonMode.allNames());
SIMILARITY_THRESHOLD(new Builder("-m", Float.class).defaultsTo(DEFAULT_SIMILARITY_THRESHOLD)),
SHOWN_COMPARISONS(new Builder("-n", Integer.class).defaultsTo(DEFAULT_SHOWN_COMPARISONS)),
RESULT_FOLDER(new Builder("-r", String.class).defaultsTo("result")),
COMPARISON_MODE(new Builder("-c", String.class).defaultsTo(DEFAULT_COMPARISON_MODE.getName()).choices(ComparisonMode.allNames())),
CLUSTER_ENABLE(new Builder("--cluster-skip", Boolean.class).argumentGroup(CLUSTERING_GROUP_NAME).action(Arguments.storeTrue())),
CLUSTER_ALGORITHM(
new Builder("--cluster-alg", ClusteringAlgorithm.class).argumentGroup(CLUSTERING_GROUP_NAME)
.defaultsTo(ClusteringOptions.DEFAULTS.getAlgorithm())),
CLUSTER_METRIC(
new Builder("--cluster-metric", SimilarityMetric.class).argumentGroup(CLUSTERING_GROUP_NAME)
.defaultsTo(ClusteringOptions.DEFAULTS.getSimilarityMetric())),
CLUSTER_SPECTRAL_BANDWIDTH(
new Builder("--cluster-spectral-bandwidth", Float.class).argumentGroup(CLUSTERING_GROUP_NAME).metaVar("bandwidth")
.defaultsTo(ClusteringOptions.DEFAULTS.getSpectralKernelBandwidth())),
CLUSTER_SPECTRAL_NOISE(
new Builder("--cluster-spectral-noise", Float.class).argumentGroup(CLUSTERING_GROUP_NAME).metaVar("noise")
.defaultsTo(ClusteringOptions.DEFAULTS.getSpectralGaussianProcessVariance())),
CLUSTER_SPECTRAL_MIN_RUNS(
new Builder("--cluster-spectral-min-runs", Integer.class).argumentGroup(CLUSTERING_GROUP_NAME).metaVar("min")
.defaultsTo(ClusteringOptions.DEFAULTS.getSpectralMinRuns())),
CLUSTER_SPECTRAL_MAX_RUNS(
new Builder("--cluster-spectral-max-runs", Integer.class).argumentGroup(CLUSTERING_GROUP_NAME).metaVar("max")
.defaultsTo(ClusteringOptions.DEFAULTS.getSpectralMaxRuns())),
CLUSTER_SPECTRAL_KMEANS_ITERATIONS(
new Builder("--cluster-spectral-kmeans-interations", Integer.class).argumentGroup(CLUSTERING_GROUP_NAME).metaVar("iterations")
.defaultsTo(ClusteringOptions.DEFAULTS.getSpectralMaxKMeansIterationPerRun())),
CLUSTER_AGGLOMERATIVE_THRESHOLD(
new Builder("--cluster-agglomerative-threshold", Float.class).argumentGroup(CLUSTERING_GROUP_NAME).metaVar("threshold")
.defaultsTo(ClusteringOptions.DEFAULTS.getAgglomerativeThreshold())),
CLUSTER_AGGLOMERATIVE_INTER_CLUSTER_SIMILARITY(
new Builder("--cluster-agglomerative-inter-cluster-similarity", InterClusterSimilarity.class).argumentGroup(CLUSTERING_GROUP_NAME)
.defaultsTo(ClusteringOptions.DEFAULTS.getAgglomerativeInterClusterSimilarity())),
CLUSTER_PREPROCESSING_NONE(
new Builder("--cluster-pp-none", Boolean.class).mutuallyExclusiveGroup(CLUSTERING_PREPROCESSING_GROUP_NAME)
.action(Arguments.storeTrue())),
CLUSTER_PREPROCESSING_CDF(
new Builder("--cluster-pp-cdf", Boolean.class).mutuallyExclusiveGroup(CLUSTERING_PREPROCESSING_GROUP_NAME).action(Arguments.storeTrue())),
CLUSTER_PREPROCESSING_PERCENTILE(
new Builder("--cluster-pp-percentile", Float.class).mutuallyExclusiveGroup(CLUSTERING_PREPROCESSING_GROUP_NAME).metaVar("percentile")),
CLUSTER_PREPROCESSING_THRESHOLD(
new Builder("--cluster-pp-threshold", Float.class).mutuallyExclusiveGroup(CLUSTERING_PREPROCESSING_GROUP_NAME).metaVar("threshold"));

private final String flag;
private final NumberOfArgumentValues numberOfValues;
private final String description;
private final Optional<Object> defaultValue;
private final Optional<Collection<String>> choices;
private final Optional<String> argumentGroup;
private final Optional<String> mutuallyExclusiveGroup;
private final Optional<ArgumentAction> action;
private final Optional<String> metaVar;
private final Class<?> type;

private CommandLineArgument(String flag, Class<?> type) {
this(flag, NumberOfArgumentValues.SINGLE_VALUE, type, Optional.empty(), Optional.empty());
}

private CommandLineArgument(String flag, NumberOfArgumentValues numberOfValues, Class<?> type) {
this(flag, numberOfValues, type, Optional.empty(), Optional.empty());
}

private CommandLineArgument(String flag, Class<?> type, Object defaultValue) {
this(flag, NumberOfArgumentValues.SINGLE_VALUE, type, Optional.of(defaultValue), Optional.empty());
}

private CommandLineArgument(String flag, Class<?> type, Object defaultValue, Collection<String> choices) {
this(flag, NumberOfArgumentValues.SINGLE_VALUE, type, Optional.of(defaultValue), Optional.of(choices));
this(new Builder(flag, type));
}

private CommandLineArgument(String flag, NumberOfArgumentValues numberOfValues, Class<?> type, Optional<Object> defaultValue,
Optional<Collection<String>> choices) {
this.flag = flag;
this.numberOfValues = numberOfValues;
this.type = type;
this.defaultValue = defaultValue;
this.choices = choices;
private CommandLineArgument(Builder builder) {
this.flag = builder.flag;
this.type = builder.type;
this.defaultValue = builder.defaultValue;
this.choices = builder.choices;
this.argumentGroup = builder.argumentGroup;
this.mutuallyExclusiveGroup = builder.mutuallyExclusiveGroup;
this.action = builder.action;
this.metaVar = builder.metaVar;
this.numberOfValues = builder.nargs.orElse(NumberOfArgumentValues.SINGLE_VALUE);
this.description = retrieveDescriptionFromMessages();
}

Expand All @@ -76,10 +119,10 @@ public String flag() {
}

/**
 * Example: {@code "--cluster-pp-none"} becomes {@code "cluster_pp_none"}.
 * @return the flag name of the command line argument without leading dashes and inner dashes replaced with underscores.
 */
public String flagWithoutDash() {
    // Only stripping the leading dashes needs a regex; the inner dashes are a plain character replacement.
    // NOTE(review): presumably mirrors how argparse4j derives the namespace key from a flag - confirm.
    return flag.replaceAll("^-+", "").replace('-', '_');
}

/**
Expand Down Expand Up @@ -111,10 +154,15 @@ public <T> List<T> getListFrom(Namespace namespace) {
* Parses the command line argument with a specific parser.
* @param parser is that parser.
*/
public void parseWith(ArgumentParser parser) {
Argument argument = parser.addArgument(flag).help(description);
public void parseWith(ArgumentParser parser, CliGroupHelper groupHelper) {
ArgumentContainer argContainer = mutuallyExclusiveGroup.map(groupHelper::getMutuallyExclusiveGroup)
.or(() -> argumentGroup.map(groupHelper::getArgumentGroup)).orElse(parser);

Argument argument = argContainer.addArgument(flag).help(description);
choices.ifPresent(it -> argument.choices(it));
defaultValue.ifPresent(it -> argument.setDefault(it));
action.ifPresent(argument::action);
metaVar.ifPresent(argument::metavar);
argument.type(type);
if (type == Boolean.class) {
argument.action(storeTrue());
Expand All @@ -136,4 +184,56 @@ private String retrieveDescriptionFromMessages() {
}
return Messages.getString(getClass().getSimpleName() + "." + builder.toString());
}

private static class Builder {
    // Mandatory properties, fixed at construction time.
    private final String flag;
    private final Class<?> type;

    // Optional properties, empty until the matching builder method is called.
    private Optional<Object> defaultValue = Optional.empty();
    private Optional<Collection<String>> choices = Optional.empty();
    private Optional<String> argumentGroup = Optional.empty();
    private Optional<String> mutuallyExclusiveGroup = Optional.empty();
    private Optional<ArgumentAction> action = Optional.empty();
    private Optional<String> metaVar = Optional.empty();
    private Optional<NumberOfArgumentValues> nargs = Optional.empty();

    /**
     * Starts the definition of a command line argument.
     * @param flag is the flag of the argument as typed on the command line.
     * @param type is the type of the argument value.
     */
    public Builder(String flag, Class<?> type) {
        this.flag = flag;
        this.type = type;
    }

    /**
     * Sets the default value of the argument.
     * @param value is the default value, must not be null.
     * @return this builder.
     */
    public Builder defaultsTo(Object value) {
        defaultValue = Optional.of(value);
        return this;
    }

    /**
     * Sets the allowed values of the argument.
     * @param allowedValues are the allowed values, must not be null.
     * @return this builder.
     */
    public Builder choices(Collection<String> allowedValues) {
        choices = Optional.of(allowedValues);
        return this;
    }

    /**
     * Sets the name of the argument group the argument belongs to.
     * @param groupName is the name of the group, must not be null.
     * @return this builder.
     */
    public Builder argumentGroup(String groupName) {
        argumentGroup = Optional.of(groupName);
        return this;
    }

    /**
     * Sets the name of the mutually exclusive group the argument belongs to.
     * @param groupName is the name of the group, must not be null.
     * @return this builder.
     */
    public Builder mutuallyExclusiveGroup(String groupName) {
        mutuallyExclusiveGroup = Optional.of(groupName);
        return this;
    }

    /**
     * Sets the action of the argument.
     * @param argumentAction is the action, must not be null.
     * @return this builder.
     */
    public Builder action(ArgumentAction argumentAction) {
        action = Optional.of(argumentAction);
        return this;
    }

    /**
     * Sets the meta variable of the argument.
     * @param name is the meta variable, must not be null.
     * @return this builder.
     */
    public Builder metaVar(String name) {
        metaVar = Optional.of(name);
        return this;
    }

    /**
     * Sets how many values the argument takes.
     * @param numberOfValues is the number of values, must not be null.
     * @return this builder.
     */
    public Builder nargs(NumberOfArgumentValues numberOfValues) {
        nargs = Optional.of(numberOfValues);
        return this;
    }
}
}
4 changes: 4 additions & 0 deletions jplag/src/main/java/de/jplag/JPlag.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import java.util.Set;
import java.util.stream.Collectors;

import de.jplag.clustering.ClusteringFactory;
import de.jplag.exceptions.ExitException;
import de.jplag.exceptions.SubmissionException;
import de.jplag.options.JPlagOptions;
Expand Down Expand Up @@ -90,6 +91,9 @@ public JPlagResult run() throws ExitException {
// Compare valid submissions.
JPlagResult result = comparisonStrategy.compareSubmissions(submissionSet);
errorCollector.print("\nTotal time for comparing submissions: " + TimeUtil.formatDuration(result.getDuration()), null);

result.setClusteringResult(ClusteringFactory.getClusterings(result.getComparisons(), options.getClusteringOptions()));

return result;
}

Expand Down
Loading

0 comments on commit bffc5c9

Please sign in to comment.