Skip to content

Commit

Permalink
Ia fix 377 core (#389)
Browse files Browse the repository at this point in the history
Add method for using a test/training set
  • Loading branch information
idhamari authored Jun 16, 2022
1 parent 18c3089 commit 479a1ba
Show file tree
Hide file tree
Showing 10 changed files with 930 additions and 207 deletions.
94 changes: 86 additions & 8 deletions src/example/org/deidentifier/arx/examples/Example39.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,14 @@

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOError;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.security.SecureRandom;
import java.text.ParseException;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

Expand All @@ -32,16 +37,24 @@
import org.deidentifier.arx.AttributeType;
import org.deidentifier.arx.AttributeType.Hierarchy;
import org.deidentifier.arx.Data;
import org.deidentifier.arx.DataSubset;
import org.deidentifier.arx.DataType;
import org.deidentifier.arx.aggregates.ClassificationConfigurationLogisticRegression;
import org.deidentifier.arx.aggregates.ClassificationConfigurationNaiveBayes;
import org.deidentifier.arx.aggregates.ClassificationConfigurationRandomForest;
import org.deidentifier.arx.criteria.Inclusion;
import org.deidentifier.arx.criteria.KAnonymity;
import org.deidentifier.arx.io.CSVHierarchyInput;
import org.deidentifier.arx.metric.Metric;

/**
* This class implements an example of how to compare data mining performance.
*
* The evaluation can be performed either with k-fold cross validation (default) or with
* one subset for training and a different subset for testing.
*
* @author Fabian Prasser
* @author Florian Kohlmayer
* @author Ibraheem Al-Dhamari
*/
public class Example39 extends Example {

Expand Down Expand Up @@ -83,7 +96,25 @@ public boolean accept(File dir, String name) {

return data;
}

/**
 * Draws a random sample of record indices. Every index in
 * {@code [0, numRecords)} is selected independently with probability
 * {@code dataSize}, so the size of the returned set is only approximately
 * {@code dataSize * numRecords}.
 *
 * @param dataSize   fraction of records to select, must be within [0, 1]
 * @param inputData  the input data; currently not read by this method
 * @param numRecords number of record indices to sample from; a non-positive
 *                   value yields an empty set
 * @return a new, mutable set containing the selected record indices
 * @throws IOError if {@code dataSize} is NaN or lies outside of [0, 1]; the
 *                 cause is an {@link IllegalArgumentException} describing the
 *                 offending value
 */
public static Set<Integer> getRandomDataSubsetIndices(double dataSize, Data inputData, int numRecords) {

    // The negated form also rejects NaN: every comparison with NaN is false,
    // so a check of the form "x < 0 || x > 1" would let it pass silently.
    if (!(dataSize >= 0d && dataSize <= 1d)) {
        System.out.println(" data size ratio is out of range");
        throw new IOError(new IllegalArgumentException("dataSize must be within [0, 1] but was " + dataSize));
    }

    // Select each index independently with probability dataSize.
    // nextDouble() returns values in [0, 1), hence a ratio of 1 selects
    // every index and a ratio of 0 selects none.
    Set<Integer> subsetIndices = new HashSet<Integer>();
    Random random = new SecureRandom();
    for (int i = 0; i < numRecords; ++i) {
        if (random.nextDouble() < dataSize) {
            subsetIndices.add(i);
        }
    }
    return subsetIndices;
}

/**
* Entry point.
*
Expand Down Expand Up @@ -111,18 +142,65 @@ public static void main(String[] args) throws ParseException, IOException {
data.getDefinition().setDataType("age", DataType.INTEGER);
data.getDefinition().setResponseVariable("marital-status", true);


ARXAnonymizer anonymizer = new ARXAnonymizer();

ARXConfiguration config = ARXConfiguration.create();
config.addPrivacyModel(new KAnonymity(5));
config.setSuppressionLimit(1d);
config.setQualityModel(Metric.createClassificationMetric());

// Create a training subset containing a specific percentage of the original data, e.g. 80%
double dataSize = 0.80;

// Creating a view from the original dataset
Set<Integer> subsetIndicesTrain = getRandomDataSubsetIndices(dataSize, data, data.getHandle().getNumRows()) ;
DataSubset datasubTrain = DataSubset.create(data.getHandle().getNumRows(), subsetIndicesTrain);

// Add the data subset to the current configuration.
// This subset will be used for the anonymization;
// other records will be transformed but only suppressed.
// In the training, only the subset will be used.
config.addPrivacyModel(new Inclusion (datasubTrain) );

config.setSuppressionLimit(1d);
config.setQualityModel(Metric.createClassificationMetric());

// Start anonymization process
ARXResult result = anonymizer.anonymize(data, config);
System.out.println("5-anonymous dataset (logistic regression)");
System.out.println(result.getOutput().getStatistics().getClassificationPerformance(features, clazz, ARXClassificationConfiguration.createLogisticRegression()));
System.out.println("5-anonymous dataset (naive bayes)");
System.out.println(result.getOutput().getStatistics().getClassificationPerformance(features, clazz, ARXClassificationConfiguration.createNaiveBayes()));
System.out.println("5-anonymous dataset (random forest)");
System.out.println(result.getOutput().getStatistics().getClassificationPerformance(features, clazz, ARXClassificationConfiguration.createRandomForest()));

System.out.println("===============================================");
System.out.println(" 5-anonymous dataset (logistic regression)");
System.out.println("===============================================");
ClassificationConfigurationLogisticRegression logisticClassifier = ARXClassificationConfiguration.createLogisticRegression();
System.out.println("Evaluation using K-fold cross validation: ...............");
logisticClassifier.setEvaluateWithKfold(true);
System.out.println(result.getOutput().getStatistics().getClassificationPerformance(features, clazz, logisticClassifier));
System.out.println("Evaluation using testing subset: ........................");
logisticClassifier.setEvaluateWithKfold(false);
System.out.println(result.getOutput().getStatistics().getClassificationPerformance(features, clazz, logisticClassifier));

System.out.println("===============================================");
System.out.println(" 5-anonymous dataset (naive bayes)");
System.out.println("===============================================");
System.out.println("Evaluation using K-fold cross validation: ...............");
logisticClassifier.setEvaluateWithKfold(true);
ClassificationConfigurationNaiveBayes naiveBayesClassifier = ARXClassificationConfiguration.createNaiveBayes();
System.out.println(result.getOutput().getStatistics().getClassificationPerformance(features, clazz, naiveBayesClassifier));
System.out.println("Evaluation using testing subset: ........................");
logisticClassifier.setEvaluateWithKfold(false);
System.out.println(result.getOutput().getStatistics().getClassificationPerformance(features, clazz, naiveBayesClassifier));

System.out.println("===============================================");
System.out.println(" 5-anonymous dataset (random forest)");
System.out.println("===============================================");
System.out.println("Evaluation using K-fold cross validation: ...............");
logisticClassifier.setEvaluateWithKfold(true);
ClassificationConfigurationRandomForest randomForestClassifier = ARXClassificationConfiguration.createRandomForest();
System.out.println(result.getOutput().getStatistics().getClassificationPerformance(features, clazz, randomForestClassifier));
System.out.println("Evaluation using testing subset: ........................");
logisticClassifier.setEvaluateWithKfold(false);
System.out.println(result.getOutput().getStatistics().getClassificationPerformance(features, clazz, randomForestClassifier));

}
}
}
Loading

0 comments on commit 479a1ba

Please sign in to comment.