forked from fau-is/coppa-matlab
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcoppa.m
152 lines (140 loc) · 7.13 KB
/
coppa.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
%% Import required files/functions/libraries
addpath(genpath('./libs/'));
addpath(genpath('./examples/')); % NOTE(review): data files below live under './example/' (singular) -- confirm this path is correct
addpath(genpath('./coppa/'));
%% User Input
% Model selection: each entry is learned and evaluated per dataset
model = {'dbn'}; %Options: 'hmm','pfa','dbn', 'dbn_new'
analysis = {'impact_of_evidence'}; % Options: 'impact_of_evidence', 'identifiy_min_max_beliefs', 'impact_of_findings', 'what_if'. NOTE: the misspelling 'identifiy' is intentional -- it must match the strcmp checks in the main loop below
num_iter = [15 1]; %number of times EM is iterated | number of times the model will be initialized with different random values to avoid local optimum
% State range explored by the grid search over the number of hidden states
min_state = 10; %Minimum number of states
max_state = 20; %Maximum number of states
grid_steps = 5; %Size of increment between states
% Data
dataset = {'bpi2013'}; %Options: 'sap','sap-small','sap-context','bpi2013','bpi2012a','test-sametrace'; any other value falls back to './example/data.csv'
splitPercentage = 70; % Percentage of cases that go into the training set
splitStable = 'yes'; %Options: 'yes','no'. Determines if data and test set is always identical or random
blow_up_test = 'yes'; %Options: 'yes','no'. If to add new cases for each partial trace of the test log or not
blow_up_train = 'no'; %Options: 'yes','no'. If to add new cases for each partial trace of the train log or not
% Context
max_num_context = 1; %Options: any number > 0. Determines how many context attributes will be considered, number should equal the number of background and symptom variables
background_variables = []; symptom_variables = [3]; %Options: numbers > 2 starting with 3. Number should equal position in csv-import plus 1 for the timestamp column
% Learning & Prediction
learn_new_model = 'no'; %Options: 'yes','no'. Learn new model or load from disk (expects a previously saved 'bestbnet_<model>_<dataset>.mat').
prediction_mode = 'distribution'; %Options: 'simple','distribution'. 'simple' not working at the moment
ngram_length = 3; %Options: any number > 0. Determines maximum length of ngrams for benchmark.
% Others
draw_model = 'no'; %Options: 'yes', 'no'. Shows model of bayesian network
%Initialize result storage: one cell per dataset, filled inside the main loop
num_models = numel(model);
num_datasets = numel(dataset);
result = cell(1,num_datasets);
for j=1:num_datasets
    % One row per model plus one extra row for the n-gram benchmark;
    % the three columns hold accuracy, sensitivity and specificity.
    result{j} = cell(num_models+1, 3);
    %% Load data set
    %Input Required: only discrete attributes (except timestamp)
    disp(['Loading dataset ' dataset{j}]);
    % Every supported log uses the same delimiter and column layout;
    % only the file location and the timestamp format differ.
    delimiter = ';';
    CaseID = 1; Activity = 2; Timestamp = 3;
    switch dataset{j}
        case 'sap'
            filename = './example/sap/SAP_P2P_COPPA_FULL.csv';
            timestamp_format = 'yyyy-MM-dd HH:mm:ss.SSSSSSS';
        case 'sap-small'
            filename = './example/sap/SAP_P2P_COPPA_SMALL.csv';
            timestamp_format = 'yyyy-MM-dd HH:mm:ss.SSSSSSS';
        case 'sap-context'
            filename = './example/sap/SAP_P2P_COPPA_CONTEXT.csv';
            timestamp_format = 'yyyy-MM-dd HH:mm:ss.SSSSSSS';
        case 'bpi2013'
            filename = './example/bpi2013/VINST cases closed problems_COPPA.csv';
            timestamp_format = 'yyyy-MM-dd''T''HH:mm:ssXXX';
        case 'bpi2012a'
            filename = './example/bpi2012/financial_log_application_process_ressourceContext.csv';
            timestamp_format = 'yyyy-MM-dd''T''HH:mm:ssXXX';
        case 'test-sametrace'
            filename = './example/data_sametrace.csv';
            timestamp_format = 'yyyy-MM-dd''T''HH:mm:ssXXX';
        otherwise
            % Fallback for any unrecognized dataset name (e.g. 'test')
            filename = './example/data.csv';
            timestamp_format = 'yyyy-MM-dd''T''HH:mm:ssXXX';
    end
    for i=1:num_models
        %Load and split the event log for the current model type
        [dataTraining dataTesting unique_values N mapping] = prepare_data(filename, delimiter, timestamp_format,CaseID,Timestamp,Activity,splitPercentage, splitStable, model{i}, blow_up_train, blow_up_test, max_num_context);
        %% Define model and start learning
        % One file per (model, dataset) pair so learned networks can be reused
        model_file = ['bestbnet_' model{i} '_' dataset{j} '.mat'];
        if strcmp(learn_new_model,'yes')
            % Grid search over the state range, keeping the best network
            [bestoverallbnet bestoverallstate] = stategrid_learning(model{i}, N ,dataTraining,num_iter,min_state, max_state,grid_steps, unique_values, background_variables, symptom_variables);
            %persist the best model so later runs can skip learning
            save(model_file,'bestoverallbnet');
        else
            % Load a previously learned model from disk
            disp('Loading saved model');
            load(model_file, 'bestoverallbnet');
        end
        %% Draw Model
        if strcmp(draw_model,'yes')
            draw_graph(bestoverallbnet.dag);
        end
        %% Prediction
        if strcmp(prediction_mode,'simple')
            [predicted actual acc] = prediction_simple(bestoverallbnet, dataTesting);
        else
            [predicted actual pred_prob] = prediction(bestoverallbnet, dataTesting, model{i}, symptom_variables);
            %cell2csv('test.csv',pred_prob,";");
        end
        % Score predictions against the actual next activities
        [acc sens spec] = score_model(predicted, actual);
        result{j}{i,1} = acc;
        result{j}{i,2} = sens;
        result{j}{i,3} = spec;
        %% Start analysis
        % Each analysis runs only if its name appears in 'analysis'
        if strcmp(analysis,'impact_of_evidence')
            impact = impact_of_evidence(bestoverallbnet, dataTesting, model{i}, symptom_variables, mapping);
        end
        if strcmp(analysis,'identifiy_min_max_beliefs')
            [minimum, normal, maximum]= identify_minimum_maximum_beliefs(bestoverallbnet, dataTesting, model{i}, symptom_variables, mapping);
        end
        if strcmp(analysis,'impact_of_findings')
            impact_of_findings(bestoverallbnet, dataTesting, model{i}, symptom_variables, mapping);
        end
        if strcmp(analysis,'what_if')
            what_if(bestoverallbnet, dataTesting, model{i}, symptom_variables, mapping, max_num_context);
        end
    end
    %% N-Gram prediction for benchmark
    % Stored in the extra (num_models+1) row of the result table
    [pred_n rv_n] = prediction_ngram(dataTraining,dataTesting,unique_values,ngram_length);
    [acc_n sens_n spec_n] = score_model(pred_n, rv_n);
    result{j}{num_models + 1,1} = acc_n;
    result{j}{num_models + 1,2} = sens_n;
    result{j}{num_models + 1,3} = spec_n;
end
%% Print summary of results for every dataset and model
disp('Results:');
for d = 1:num_datasets
    disp([' - Dataset: ' dataset{d}]);
    % Per-model scores (rows 1..num_models of the result table)
    for m = 1:num_models
        disp([' - Model: ' model{m}]);
        disp([' - Accuracy: ' num2str(result{d}{m,1}*100) '%']);
        disp([' - Sensitivity: ' num2str(result{d}{m,2}*100) '%']);
        disp([' - Specificity: ' num2str(result{d}{m,3}*100) '%']);
    end
    % N-gram benchmark scores (stored in the extra row)
    disp([' - Model: ' num2str(ngram_length) '-gram' ]);
    disp([' - Accuracy: ' num2str(result{d}{num_models+1,1}*100) '%']);
    disp([' - Sensitivity: ' num2str(result{d}{num_models+1,2}*100) '%']);
    disp([' - Specificity: ' num2str(result{d}{num_models+1,3}*100) '%']);
end