Skip to content

Commit

Permalink
Merge pull request #2179 from jplag/feature/GstOutOfBounds
Browse files Browse the repository at this point in the history
Greedy String Tiling out of bounds
  • Loading branch information
tsaglam authored Feb 26, 2025
2 parents 83db2c7 + d22302c commit 0ddf389
Show file tree
Hide file tree
Showing 10 changed files with 133 additions and 142 deletions.
15 changes: 8 additions & 7 deletions core/src/main/java/de/jplag/JPlag.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,13 @@
import org.slf4j.LoggerFactory;

import de.jplag.clustering.ClusteringFactory;
import de.jplag.comparison.LongestCommonSubsquenceSearch;
import de.jplag.exceptions.ExitException;
import de.jplag.exceptions.RootDirectoryException;
import de.jplag.exceptions.SubmissionException;
import de.jplag.merging.MatchMerging;
import de.jplag.options.JPlagOptions;
import de.jplag.reporting.reportobject.model.Version;
import de.jplag.strategy.ComparisonStrategy;
import de.jplag.strategy.ParallelComparisonStrategy;

/**
* This class coordinates the whole program flow.
Expand All @@ -39,8 +38,8 @@ private static Version loadVersion() {

/**
* Creates and initializes a JPlag instance, parameterized by a set of options.
* @deprecated in favor of static {@link #run(JPlagOptions)}.
* @param options determines the parameterization.
* @deprecated in favor of static {@link #run(JPlagOptions)}.
*/
@Deprecated(since = "4.3.0")
public JPlag(JPlagOptions options) {
Expand All @@ -49,9 +48,9 @@ public JPlag(JPlagOptions options) {

/**
* Main procedure, executes the comparison of source code submissions.
* @deprecated in favor of static {@link #run(JPlagOptions)}.
* @return the results of the comparison, specifically the submissions whose similarity exceeds a set threshold.
* @throws ExitException if JPlag exits preemptively.
* @deprecated in favor of static {@link #run(JPlagOptions)}.
*/
@Deprecated(since = "4.3.0")
public JPlagResult run() throws ExitException {
Expand All @@ -66,11 +65,13 @@ public JPlagResult run() throws ExitException {
*/
public static JPlagResult run(JPlagOptions options) throws ExitException {
checkForConfigurationConsistency(options);
GreedyStringTiling coreAlgorithm = new GreedyStringTiling(options);
ComparisonStrategy comparisonStrategy = new ParallelComparisonStrategy(options, coreAlgorithm);

// Parse and validate submissions.
SubmissionSetBuilder builder = new SubmissionSetBuilder(options);
SubmissionSet submissionSet = builder.buildSubmissionSet();

LongestCommonSubsquenceSearch comparisonStrategy = new LongestCommonSubsquenceSearch(options);

if (options.normalize() && options.language().supportsNormalization() && options.language().requiresCoreNormalization()) {
submissionSet.normalizeSubmissions();
}
Expand Down Expand Up @@ -113,7 +114,7 @@ private static void checkForConfigurationConsistency(JPlagOptions options) throw
}

List<String> duplicateNames = getDuplicateSubmissionFolderNames(options);
if (duplicateNames.size() > 0) {
if (!duplicateNames.isEmpty()) {
throw new RootDirectoryException(String.format("Duplicate root directory names found: %s", String.join(", ", duplicateNames)));
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package de.jplag;
package de.jplag.comparison;

import java.util.ArrayList;
import java.util.Comparator;
Expand All @@ -7,10 +7,12 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.stream.Collectors;

import de.jplag.JPlagComparison;
import de.jplag.Match;
import de.jplag.Submission;
import de.jplag.Token;
import de.jplag.options.JPlagOptions;

/**
Expand All @@ -22,15 +24,14 @@
* String Similarity via Greedy String Tiling and Running Karp−Rabin Matching </a>
*/
public class GreedyStringTiling {

private final int minimumMatchLength;
private final JPlagOptions options;
private final ConcurrentMap<TokenType, Integer> tokenTypeValues;
private final Map<Submission, Set<Token>> baseCodeMarkings = new IdentityHashMap<>();

private final Map<Submission, int[]> cachedTokenValueLists = new IdentityHashMap<>();
private final Map<Submission, SubsequenceHashLookupTable> cachedHashLookupTables = new IdentityHashMap<>();

private final TokenValueMapper tokenValueMapper;

private static final String ERROR_INDEX_OUT_OF_BOUNDS = """
GST index out of bounds. This is probably a random issue caused by multithreading issues.
Length of the list that caused the exception (the list of marks for the relevant submission): %s, Index in that list: %s
Expand All @@ -40,13 +41,14 @@ Length of the list that caused the exception (the list of marks for the relevant
Submission (other): %s
""".trim().stripIndent();

public GreedyStringTiling(JPlagOptions options) {
public GreedyStringTiling(JPlagOptions options, TokenValueMapper tokenValueMapper) {
this.options = options;
// Ensures 1 <= neighborLength <= minimumTokenMatch
int minimumNeighborLength = Math.min(Math.max(options.mergingOptions().minimumNeighborLength(), 1), options.minimumTokenMatch());
int minimumNeighborLength = Math.clamp(options.mergingOptions().minimumNeighborLength(), 1, options.minimumTokenMatch());

this.minimumMatchLength = options.mergingOptions().enabled() ? minimumNeighborLength : options.minimumTokenMatch();
this.tokenTypeValues = new ConcurrentHashMap<>();
this.tokenTypeValues.put(SharedTokenType.FILE_END, 0);

this.tokenValueMapper = tokenValueMapper;
}

/**
Expand Down Expand Up @@ -108,8 +110,8 @@ public final JPlagComparison compare(Submission firstSubmission, Submission seco
* @return the comparison results.
*/
private JPlagComparison compareInternal(Submission leftSubmission, Submission rightSubmission) {
int[] leftValues = tokenValueListFromSubmission(leftSubmission);
int[] rightValues = tokenValueListFromSubmission(rightSubmission);
int[] leftValues = this.tokenValueMapper.getTokenValuesFor(leftSubmission);
int[] rightValues = this.tokenValueMapper.getTokenValuesFor(rightSubmission);

boolean[] leftMarked = calculateInitiallyMarked(leftSubmission);
boolean[] rightMarked = calculateInitiallyMarked(rightSubmission);
Expand Down Expand Up @@ -219,33 +221,14 @@ private boolean[] calculateInitiallyMarked(Submission submission) {

private SubsequenceHashLookupTable subsequenceHashLookupTableForSubmission(Submission submission, boolean[] marked) {
return cachedHashLookupTables.computeIfAbsent(submission,
key -> new SubsequenceHashLookupTable(minimumMatchLength, tokenValueListFromSubmission(key), marked));
}

/**
* Converts the tokens of the submission to a list of values.
* @param submission The submission from which to convert the tokens.
*/
private int[] tokenValueListFromSubmission(Submission submission) {
return cachedTokenValueLists.computeIfAbsent(submission, key -> {
List<Token> tokens = key.getTokenList();
int[] tokenValueList = new int[tokens.size()];
for (int i = 0; i < tokens.size(); i++) {
TokenType type = tokens.get(i).getType();
synchronized (tokenTypeValues) {
tokenTypeValues.putIfAbsent(type, tokenTypeValues.size());
}
tokenValueList[i] = tokenTypeValues.get(type);
}
return tokenValueList;
});
key -> new SubsequenceHashLookupTable(minimumMatchLength, this.tokenValueMapper.getTokenValuesFor(submission), marked));
}

private boolean checkMark(boolean[] marks, int index, Submission submission, Submission otherSubmission) {
if (index >= marks.length) {
throw new IllegalStateException(String.format(ERROR_INDEX_OUT_OF_BOUNDS, marks.length, index, submission.getTokenList().size(),
submission.getTokenList().stream().map(it -> it.getType().getDescription()).collect(Collectors.joining(", ")),
cachedTokenValueLists.get(submission).length, submission.getName(), otherSubmission.getName()));
this.tokenValueMapper.getTokenValuesFor(submission).length, submission.getName(), otherSubmission.getName()));
}

return marks[index];
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
package de.jplag.strategy;
package de.jplag.comparison;

import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Stream;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import de.jplag.GreedyStringTiling;
import de.jplag.JPlagComparison;
import de.jplag.JPlagResult;
import de.jplag.Submission;
Expand All @@ -18,16 +16,21 @@
import de.jplag.logging.ProgressBarType;
import de.jplag.options.JPlagOptions;

public abstract class AbstractComparisonStrategy implements ComparisonStrategy {
/**
* Implements a parallelized token-based longest common subsequence search for all pairs of programs in a given set of
* programs.
*/
public class LongestCommonSubsquenceSearch {

private final Logger logger = LoggerFactory.getLogger(ComparisonStrategy.class);

private final GreedyStringTiling greedyStringTiling;
private final Logger logger = LoggerFactory.getLogger(LongestCommonSubsquenceSearch.class);

private final JPlagOptions options;

protected AbstractComparisonStrategy(JPlagOptions options, GreedyStringTiling greedyStringTiling) {
this.greedyStringTiling = greedyStringTiling;
/**
* Creates an instance of the subsequence search algorithm.
* @param options specifies relevant parameters for the comparison.
*/
public LongestCommonSubsquenceSearch(JPlagOptions options) {
this.options = options;
}

Expand All @@ -36,21 +39,26 @@ protected AbstractComparisonStrategy(JPlagOptions options, GreedyStringTiling gr
* <p>
* Caller must ensure that the provided set does have a basecode submission before calling.
* </p>
* @param comparisonAlgorithm is the algorithm implementation for the token-based longest common subsequence search.
* @param submissionSet Submissions and basecode to compare.
*/
protected void compareSubmissionsToBaseCode(SubmissionSet submissionSet) {
private void compareSubmissionsToBaseCode(GreedyStringTiling comparisonAlgorithm, SubmissionSet submissionSet) {
Submission baseCodeSubmission = submissionSet.getBaseCode();
for (Submission currentSubmission : submissionSet.getSubmissions()) {
JPlagComparison baseCodeComparison = greedyStringTiling.generateBaseCodeMarking(currentSubmission, baseCodeSubmission);
JPlagComparison baseCodeComparison = comparisonAlgorithm.generateBaseCodeMarking(currentSubmission, baseCodeSubmission);
currentSubmission.setBaseCodeComparison(baseCodeComparison);
}
}

/**
* Compares two submissions and optionally returns the results if similarity is high enough.
* @param comparisonAlgorithm is the algorithm implementation for the token-based longest common subsequence search.
* @param first is the first submission.
* @param second is the second submission.
* @return the comparison results, if the similarity is above the threshold specified via the options.
*/
protected Optional<JPlagComparison> compareSubmissions(Submission first, Submission second) {
JPlagComparison comparison = greedyStringTiling.compare(first, second);
private Optional<JPlagComparison> compareSubmissions(GreedyStringTiling comparisonAlgorithm, Submission first, Submission second) {
JPlagComparison comparison = comparisonAlgorithm.compare(first, second);
logger.trace("Comparing {}-{}: {}", first.getName(), second.getName(), comparison.similarity());

if (options.similarityMetric().isAboveThreshold(comparison, options.similarityThreshold())) {
Expand All @@ -77,16 +85,26 @@ private List<SubmissionTuple> buildComparisonTuples(List<Submission> submissions
return tuples;
}

@Override
/**
* Compares submissions from a set of submissions while considering a given base code.
* @param submissionSet Collection of submissions with optional basecode to compare.
* @return the comparison results.
*/
public JPlagResult compareSubmissions(SubmissionSet submissionSet) {
long timeBeforeStartInMillis = System.currentTimeMillis();

handleBaseCode(submissionSet);
TokenValueMapper tokenValueMapper = new TokenValueMapper(submissionSet);
GreedyStringTiling coreAlgorithm = new GreedyStringTiling(options, tokenValueMapper);

boolean withBaseCode = submissionSet.hasBaseCode();
if (withBaseCode) {
compareSubmissionsToBaseCode(coreAlgorithm, submissionSet);
}

List<SubmissionTuple> tuples = buildComparisonTuples(submissionSet.getSubmissions());
ProgressBar progressBar = ProgressBarLogger.createProgressBar(ProgressBarType.COMPARING, tuples.size());
List<JPlagComparison> comparisons = prepareStream(tuples).flatMap(tuple -> {
Optional<JPlagComparison> result = compareTuple(tuple);
List<JPlagComparison> comparisons = tuples.stream().parallel().flatMap(tuple -> {
Optional<JPlagComparison> result = compareSubmissions(coreAlgorithm, tuple.left(), tuple.right());
progressBar.step();
return result.stream();
}).toList();
Expand All @@ -96,24 +114,4 @@ public JPlagResult compareSubmissions(SubmissionSet submissionSet) {

return new JPlagResult(comparisons, submissionSet, durationInMillis, options);
}

/**
* Handle the parsing of the base code.
* @param submissionSet The submission set to parse
*/
protected abstract void handleBaseCode(SubmissionSet submissionSet);

/**
* Prepare a stream for parsing the tuples. Here you can modify the tuples or the stream as necessary.
* @param tuples The tuples to stream
* @return The Stream of tuples
*/
protected abstract Stream<SubmissionTuple> prepareStream(List<SubmissionTuple> tuples);

/**
* Compares a single tuple. Returns nothing, if the similarity is not high enough.
* @param tuple The Tuple to compare
* @return The comparison
*/
protected abstract Optional<JPlagComparison> compareTuple(SubmissionTuple tuple);
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package de.jplag.strategy;
package de.jplag.comparison;

import de.jplag.Submission;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package de.jplag;
package de.jplag.comparison;

import java.util.ArrayList;
import java.util.HashMap;
Expand Down Expand Up @@ -40,7 +40,7 @@ class SubsequenceHashLookupTable {
}

subsequenceHashes = new int[values.length - windowSize];
startIndexToSubsequenceHashesMap = new HashMap<>(subsequenceHashes.length);
startIndexToSubsequenceHashesMap = HashMap.newHashMap(subsequenceHashes.length);
computeSubsequenceHashes(marked);
}

Expand Down
66 changes: 66 additions & 0 deletions core/src/main/java/de/jplag/comparison/TokenValueMapper.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package de.jplag.comparison;

import java.util.HashMap;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;

import de.jplag.SharedTokenType;
import de.jplag.Submission;
import de.jplag.SubmissionSet;
import de.jplag.Token;
import de.jplag.TokenType;
import de.jplag.logging.ProgressBarLogger;
import de.jplag.logging.ProgressBarType;

/**
 * Translates the token sequence of every submission into an array of integer values for use by the
 * {@link GreedyStringTiling} algorithm. Every distinct token type receives its own number, handed out in order of first
 * occurrence. All value arrays are computed during construction and can afterwards be obtained via
 * {@link TokenValueMapper#getTokenValuesFor(Submission)}.
 */
public class TokenValueMapper {
    private final Map<TokenType, Integer> tokenTypeValues = new HashMap<>();
    // Keyed by object identity, so a lookup needs the very same Submission instance that was registered.
    private final Map<Submission, int[]> tokenValueMap = new IdentityHashMap<>();

    /**
     * Builds the token value arrays for all submissions of the given set and, if present, for its base code. The iteration
     * over the regular submissions is delegated to the {@link ProgressBarLogger}; the base code is processed afterwards
     * outside of that iteration.
     * @param submissionSet is the set of submissions to process.
     */
    public TokenValueMapper(SubmissionSet submissionSet) {
        // Reserve the value 0 for the file end token before any other token type is encountered.
        tokenTypeValues.put(SharedTokenType.FILE_END, 0);

        ProgressBarLogger.iterate(ProgressBarType.TOKEN_VALUE_CREATION, submissionSet.getSubmissions(), this::registerSubmission);
        if (submissionSet.hasBaseCode()) {
            registerSubmission(submissionSet.getBaseCode());
        }
    }

    /**
     * Computes the value array for one submission and stores it for later lookup.
     * @param submission is the submission whose tokens are converted.
     */
    private void registerSubmission(Submission submission) {
        List<Token> tokens = submission.getTokenList();
        int[] values = new int[tokens.size()];
        int position = 0;
        for (Token token : tokens) {
            TokenType type = token.getType();
            Integer value = tokenTypeValues.get(type);
            if (value == null) {
                // First occurrence of this type: the next free number is the current number of known types.
                value = tokenTypeValues.size();
                tokenTypeValues.put(type, value);
            }
            values[position++] = value;
        }
        tokenValueMap.put(submission, values);
    }

    /**
     * Looks up the token values of a single submission. The array holds one value per token of the submission, in token
     * order. {@link SharedTokenType#FILE_END} is always represented by 0. The stored array itself is returned, not a copy.
     * @param submission The submission to query.
     * @return the token values of the submission, or {@code null} if the submission was not registered during construction.
     */
    public int[] getTokenValuesFor(Submission submission) {
        return tokenValueMap.get(submission);
    }
}
1 change: 1 addition & 0 deletions core/src/main/java/de/jplag/logging/ProgressBarType.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
public enum ProgressBarType {
LOADING("Loading Submissions ", false),
PARSING("Parsing Submissions ", false),
TOKEN_VALUE_CREATION("Preparing Submissions", false),
COMPARING("Comparing Submissions", false),
MATCH_MERGING("Merging matched subsequences ", false),
TOKEN_STRING_NORMALIZATION("Normalizing Token Sequence", false),
Expand Down
Loading

0 comments on commit 0ddf389

Please sign in to comment.