-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFit DAG.R
1027 lines (902 loc) · 37.4 KB
/
Fit DAG.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
library(bnlearn)
library(parallel)
library(stringr)
library(prodlim)
library(dplyr)
source("Scoring.R")
#' ** MAIN FITTING FUNCTIONS **
#' Learns BN structure from data using tabu search with random restarts
#'
#' Starts from a given DAG or from a DAG with only the whitelisted arcs. Uses a
#' hill climber with arc addition, deletion and reversal operations to find
#' locally optimal graph structures. Score function is the likelihood minus a
#' term penalizing graph complexity. Uses tabu search and random restart meta
#' heuristics to decrease the probability of ending up in a local optimum that
#' is far from the global optimum in score. Exploits score decomposability to
#' efficiently evaluate the score difference of each neighbouring graph. Can do
#' most costly computations in parallel (works well for large n, as computing
#' scores becomes costly in this case).
#'
#' Records number of queries involved in computation. Each query involves
#' computing the score for a single node.
#'
#' @param dat dataset to learn BN from
#' @param penalty determines penalization term on score function. Must be "bic",
#' "aic" or a number larger than 0
#' @param parallel if TRUE, computations are done in parallel where possible,
#' using all but one of the available cores on the machine
#' @param tabuSteps maximum number of steps to do in tabu search
#' @param tabuLength maximum length of tabu list
#' @param restarts number of random restarts to do
#' @param randomMoves number of moves to do on DAG when doing random restart
#' @param blacklist dataframe indicating which directed arcs are not allowed in
#' the DAG that is being learned. Consists of a 'from' and 'to' column
#' @param whitelist dataframe indicating which directed arcs must be in the DAG
#' @param dag.start DAG to start search from if necessary
#'
#' @return list containing best DAG found and number of queries involved in the
#' computation
#' @export
#'
#' @examples
#' fit.dag(learning.test, penalty = "bic")
fit.dag <- function(dat, penalty, parallel = TRUE, cl = NULL,
tabuSteps = 10, tabuLength = tabuSteps,
restarts = 0, randomMoves = ncol(dat),
blacklist = NULL, whitelist = NULL,
dag.start = NULL){
#If no graph is provided, initialize to empty graph + whitelisted edges
if(is.null(dag.start)){
dag.current <- initializeDag(names(dat), whitelist)
no.queries <- 0
}
else{
dag.current <- dag.start
no.queries <- dag.start$learning$ntests
}
#Initialize cluster and load relevant data, or set cluster to null
if(parallel && is.null(cl)){
cl <- initializeCluster(dat)
on.exit(stopCluster(cl))
}
#Initialize dataframe of moves and compute current score per node and change
#in score from each move
initialization <- recompute(dag.current, dat, penalty, cl = cl,
blacklist = blacklist, whitelist = whitelist)
moves <- initialization$moves
node.scores <- initialization$nodeScores
no.queries <- no.queries + initialization$queries
restart <- 0
#Track which DAG was the best over all restarts and what its score was
dag.best <- dag.current
score.best <- sum(node.scores)
while(restart <= restarts){
converged <- FALSE
while(!converged){
#Find best move (highest score, not blacklisted, preserves acyclicity)
bestMove <- findBestAllowedMove(moves, dag.current)
if(as.numeric(bestMove[4]) > 0){
#Execute best move. Update DAG, scores for nodes, score differences
#for moves and number of contingency tables computed
postMove <- updateHC(bestMove,
dag = dag.current,
dat = dat,
moves = moves,
node.scores = node.scores,
penalty = penalty,
cl = cl)
dag.current <- postMove$dag
moves <- postMove$moves
node.scores <- postMove$nodeScores
no.queries <- no.queries + postMove$queries
}
#If there is no neighbor with higher score, enter tabu search phase
else if (tabuSteps > 0){
tabuResult <- tabuSearch(tabuSteps = tabuSteps,
tabuLength = tabuLength,
dag.start = dag.current,
dat = dat,
moves = moves,
node.scores = node.scores,
penalty = penalty,
cl = cl)
#If tabu search returned null, it did not find a DAG with higher
#score than the previous best. In this case the search terminates
#(or does a random restart) from the previous best graph.
#Otherwise, we continue searching from the new best graph.
if(is.null(tabuResult$dag)){converged <- TRUE}
else {
dag.current <- tabuResult$dag
moves <- tabuResult$moves
node.scores <- tabuResult$nodeScores
}
no.queries <- no.queries + tabuResult$queries
}
#If no score-increasing move exists and tabu search is not active,
#terminate current phase of search
else{converged <- TRUE}
}
#Store DAG it is better than the previous best
#Ensure validity of current score by fully recomputing. Associated queries
#do not count towards total number of queries in search procedure
score.current <- computeScore(dag.current, dat, penalty, cl = cl)
if(score.current > score.best){
dag.best <- dag.current
score.best <- score.current
#If a better graph is found, reset number of restarts
restart <- 0
}
#Only go through perturbation procedure if max number of restarts
#has not been reached yet.
if(restart < restarts){
#Perturb graph
perturbed <- perturbGraph(randomMoves,
dag = dag.best,
dat = dat,
moves = moves,
penalty = penalty,
cl = cl)
#Retrieve perturbed graph and updated moves and node scores
dag.current <- perturbed$dag
moves <- perturbed$moves
node.scores <- perturbed$nodeScores
no.queries <- no.queries + perturbed$queries
}
restart <- restart + 1
}
#Store number of queries
dag.best$learning$ntests <- no.queries
return(dag = dag.best)
}
#' Fits DAG with known node ordering and restricted size of parental sets
#'
#' Uses that if the node ordering is known, the score maximization problem
#' decomposes into finding the best scoring parental set for each node. For
#' a given maximum number of parents, this can be done in polynomial time.
#'
#' @param node.names character vector of ordered(!) node names
#' @param max.parents maximum number of parents per node
#' @inheritParams fit.dag
#'
#' @return Maximum scoring DAG for given restrictions on node order and parental
#' sets
#'
#' @export
fit.dag.ordered <- function(node.names, max.parents, dat, penalty,
parallel = TRUE){
#Initialize DAG and cluster
dag.fitted <- empty.graph(node.names)
no.nodes <- length(node.names)
if(parallel){
cl <- initializeCluster(dat)
on.exit(stopCluster(cl))
}
#Computes score for given parent configuration of a node. Returns -Inf if
#there is no data for some parental configuration
wrapperScore <- function(parents.node, node, dat, penalty, no.nodes){
dag <- empty.graph(c(node, parents.node))
if(length(parents.node) > 0) {parents(dag, node) <- parents.node}
score.node <- computeScore.node(node, dag, dat, penalty,
no.nodes = no.nodes)
#If any parent configuration has no occurences, we deem it to be
#too complex to learn from our dataset, and give it -Inf score
if(is.na(score.node)){score.node <- -Inf}
return(score.node)
}
#Determine optimal parental sets for each node separately (possible due to
#known node ordering, which guarantees acyclicity when only nodes earlier in
#the ordering are considered as candidate parents)
for(i in 2:no.nodes){
#Generate list of potential parental sets
node <- node.names[i]
possible.parents <- lapply(0:min(max.parents, i-1),
combn, x = node.names[1:(i-1)],
simplify = FALSE) %>%
unlist(., recursive = FALSE)
#Compute score for each possible parental configuration.
#Only worth incurring paralellization overhead if set of possible
#parents is large enough
if(parallel && length(possible.parents) > 100){
scores <- parSapply(cl, possible.parents,
FUN = wrapperScore,
node = node,
dat = dat,
penalty = penalty,
no.nodes = length(node.names))
}
else{
scores <- sapply(possible.parents,
FUN = wrapperScore,
node = node,
dat = dat,
penalty = penalty,
no.nodes = length(node.names))
}
#Store best parental set
bestParents <- possible.parents[[which.max(scores)]]
if(length(bestParents) > 0)
{parents(dag.fitted, node) <- bestParents}
}
#Compute and store number of parental configurations that needed to be
#recorded
no.queries <- sum(sapply(0:max.parents, function(x){choose(1:no.nodes, x)}))
dag.fitted$learning$ntests <- no.queries
return(dag.fitted)
}
#' Uses a fixed number of EM iterations to learn parameters of a DAG
#'
#' Initializes by learning parameters using locally complete data, then imputes
#' missing values (E-step) and refits parameters (M-step) a given number of
#' times using likelihood weighted sampling. Allows setting a fixed maximum
#' number of observations per parameter for computational reasons. These
#' observations are randomly sampled from the full dataset.
#'
#' @param dag DAG structure to learn parameters for
#' @param dat Dataframe with missing values to learn parameters from
#' @param max.iter Maximum number of iterations (default is 1)
#' @param particles Number of samples to use in simulation
#' @param obs.perParam Number of observations per parameter to learn. Defaults
#' to full dataset
#' @param parallel If TRUE, do data imputation in parallel
#'
#' @return DAG with fitted parameters and imputed data after last E-step
#'
#' @export
em.parametric <- function(dag, dat, max.iter = 1, particles = 500,
obs.perParam = NULL, parallel = FALSE){
#Initialize by fitting parameters using locally complete data
if(class(dag)[1] != "bn.fit"){
if(class(dag)[1] != "bn"){stop("DAG should be either of type bn or type bn.fit")}
dag.fit <- bn.fit(dag, dat, replace.unidentifiable = TRUE)
}
else{
dag.fit <- dag
}
dat.imputed <- NULL
converged <- FALSE
iteration <- 1
#Take subsample of observations if desired
if(!is.null(obs.perParam)){
no.obs <- min(obs.perParam*nparams(dag.fit), nrow(dat))
dat <- sample_n(dat, size = no.obs)
}
#Iterate until maximum number of iterations is reached
while(iteration <= max.iter){
iteration <- iteration + 1
if(parallel){
dat.imputed <- parImpute(dag.fit, dat, particles = particles)
} else {
dat.imputed <- impute(dag.fit, dat, method = "bayes-lw", n = particles)
}
dag.fit <- bn.fit(dag, dat.imputed)
}
return(list(dag = dag.fit, dat = dat.imputed))
}
#' Uses Structural EM to learn DAG structure from incomplete data
#'
#' Iteratively imputes data (E-step) and learns DAG structure from fitted data
#' (M-step). Continuous until a maximum number of iterations is reached or the
#' SHD between the new and the previous graph is zero.
#'
#' @param dat Incomplete dataframe to learn DAG from
#' @param max.iter Optional maximum number of iterations
#' @param penalty Penalization factor to use for scoring, default is "bic"
#' @inheritParams fit.dag
#' @inheritParams em.parametric
#'
#' @return Fitted DAG
#'
#' @export
em.structural <- function(dat, tabuSteps = 10, penalty = "bic",
tabuLength = tabuSteps, blacklist = NULL,
whitelist = NULL, max.iter = Inf, particles = 500,
parallel = FALSE){
#Initialize to graph with only whitelisted edges and
dag.current <- initializeDag(names(dat), whitelist)
dat.imputed <- dat
fitted.current <- bn.fit(dag.current, dat.imputed)
converged <- FALSE
iteration <- 1
while(!converged && iteration <= max.iter){
iteration <- iteration + 1
#E-step: fit parameters of current network using previously imputed data
#and impute data based on new network
if(parallel){
dat.imputed <- parImpute(fitted.current, dat, particles = particles)
} else {
dat.imputed <- impute(fitted.current, dat, method = "bayes-lw", n = particles)
}
#M-step: optimize using completed data
dag.new <- fit.dag(dat.imputed, penalty = penalty,
parallel = parallel,
tabuSteps = tabuSteps,
tabuLength = tabuLength,
blacklist = blacklist,
whitelist = whitelist,
dag.start = dag.current)
#Use Bayesian fitting if all data is discrete to avoid unidentiability.
#If continuous data is also present, use MLE
if(all(sapply(dat, is.factor))){
fitted.new <- bn.fit(dag.new, dat.imputed, method = "bayes", iss = 1)
}
else{
fitted.new <- bn.fit(dag.new, dat.imputed, replace.unidentifiable = TRUE)
}
if(shd(dag.current, dag.new) == 0){converged <- TRUE}
else{dag.current <- dag.new; fitted.current <- fitted.new}
}
return(dag.new)
}
#' Learns DAGs using the non-parametric bootstrap
#'
#' Repeatedly sample with replacement from original dataset and learn DAGs from
#' these samples.
#'
#' @param samples Number of DAGs to learn
#' @param dat Data to obtain bootstrap samples from
#' @param sampleSize Number of observations to sample. Default is nrow(dat)
#' @param parallel Bootstrap in parallel if TRUE (default)
#' @inheritParams fit.dag
#'
#' @return List of bn objects learned from bootstrap samples
#'
#' @export
bootstrap.dag <- function(samples, dat, penalty,
tabuSteps = 10, tabuLength = tabuSteps,
blacklist = NULL, whitelist = NULL,
sampleSize = nrow(dat), parallel = FALSE){
require(dplyr)
#Obtains bootstrap sample and learns and returns DAG
bootstrap.single <- function(dummy, dat, penalty, tabuSteps, tabuLength,
blacklist, whitelist, sampleSize){
dat.sample <- sample_n(dat, sampleSize, replace = TRUE)
dag.fitted <- fit.dag(dat.sample, penalty = penalty,
parallel = FALSE,
tabuSteps = tabuSteps,
tabuLength = tabuLength,
blacklist = blacklist,
whitelist = whitelist)
return(dag.fitted)
}
#Run in parallel if requested
if(parallel){
cl <- initializeCluster(dat)
on.exit(stopCluster(cl))
dag.samples <- parLapplyLB(cl, 1:samples, bootstrap.single, dat = dat,
penalty = penalty,
tabuSteps = tabuSteps,
tabuLength = tabuLength,
blacklist = blacklist,
whitelist = whitelist,
sampleSize = sampleSize)
} else {
dag.samples <- lapply(1:samples, bootstrap.single, dat = dat,
penalty = penalty,
tabuSteps = tabuSteps,
tabuLength = tabuLength,
blacklist = blacklist,
whitelist = whitelist,
sampleSize = sampleSize)
}
return(dag.samples)
}
#' ** SUPPORTING FUNCTIONS **
#' Initialize dataframe of possible moves for local search
#'
#' The local search algorithm represents possible moves as ordered pairs of
#' distinct nodes. Whether such a pair represents an arc addition, deletion or
#' reversal depends on whether the arc is absent (addition), present (deletion)
#' or reversed (reverse) in the DAG that the move acts on.
#'
#' Moves are stored in a dataframe with as columns the from-node, the to-node, a
#' logical for whether the score difference for this move is still correct for
#' the current DAG, and the score difference for the move
#'
#' @param node.names character vectors of node names
#' @param blacklist dataframe indicating which directed arcs are not allowed in
#' the DAG that is being learned. Consists of a 'from' and 'to' column
#' @param whitelist dataframe indicating which directed arcs must be in the DAG
#'
#' @return dataframe containing each move, its score and whether the score is
#' up-to-date
#' @export
#'
#' @examples
initializeMoves <- function(node.names, blacklist, whitelist){
#Get all pairs of distinct nodes
moves <- expand.grid(from = node.names, to = node.names, stringsAsFactors = F)
moves <- with(moves, moves[from != to,])
#Efficiently enforce black- and whitelist by removing moves
require(prodlim)
if(!is.null(blacklist)){
#Prevent arc additions
names(blacklist) <- c("from", "to")
moves <- moves[-na.omit(row.match(data.frame(blacklist), moves)),]
}
if(!is.null(whitelist)){
#Prevent arc deletions and reversals
whitelistReverse <- whitelist[, c(2,1)]
names(whitelist) <- c("from", "to")
names(whitelistReverse) <- c("from", "to")
moves <- moves[-na.omit(row.match(rbind(whitelist, whitelistReverse), moves)),]
}
#Indicates that none of the scores for the moves has been computed, hence
#all should be evaluated before proceeding. If a score is not changed by
#some local move, recompute is FALSE and it is not computed on the next step
moves$recompute <- TRUE
moves$score <- NA
return(moves)
}
#' Initialize a graph with whitelisted arcs
#'
#' @inheritParams initializeMoves
#'
#' @return bn object with specified nodes and whitelisted arcs, otherwise empty
#'
#' @export
#'
#' @examples
initializeDag <- function(node.names, whitelist){
dag <- empty.graph(node.names)
if(!is.null(whitelist)){
for(i in 1:nrow(whitelist))
{
dag <- set.arc(dag.current,
as.character(whitelist[i,1]),
as.character(whitelist[i,2]))
}
}
return(dag)
}
#' Initialize cluster for DAG fitting
#'
#' Starts parallel cluster and loads bnlearn library, functions in Scoring and
#' Fit DAG files and data
#'
#' @param dat dataframe to load to clusters
#'
#' @return parallel cluster with all necessary information preloaded
#' @export
#'
#' @examples
initializeCluster <- function(dat){
cl <- makeCluster(detectCores() - 1)
invisible(clusterEvalQ(cl, library(bnlearn)))
invisible(clusterEvalQ(cl, source("Scoring.R")))
invisible(clusterEvalQ(cl, source("Fit DAG.R")))
invisible(clusterExport(cl, "dat", envir = environment()))
return(cl)
}
#' Executes move and updates scores
#'
#' Applies a given move to a DAG and updates the node and move scores
#' (exploiting score decomposability)
#'
#' @param move character vector containing to and from node (in that order)
#' @param dag DAG to apply move to
#' @param dat dataset to use
#' @param moves dataframe of moves with scores up to date for pre-move DAG
#' @param node.scores vector of scores per node
#' @param penalty string specifying the growth rate of the complexity penalty.
#' Either "bic", "aic" or a number between 0 and 1
#' @param cl cluster to use if computations should be done in parallel
#' @param recomputeMoves logical indicating whether scores should be recomputed
#' If false, scores are marked for recomputing but computation is not done
#'
#' @return Returns the new dag and the new moves dataframe. If scores are
#' recomputed, also returns the score for each node and the number of queries
#' @export
#'
#' @examples
updateHC <- function(move, dag, dat, moves, node.scores, penalty,
cl = NULL, recomputeMoves = TRUE){
node.from <- move[1]
node.to <- move[2]
#Update DAG
moveType <- findMoveType(node.from, node.to, dag)
dag.postMove <- makeMove(node.from, node.to, dag, moveType)
#Mark nodes whose parental set has changed for updating
nodesToRescore <- ifelse(moveType == "reverse",
c(node.to, node.from), c(node.to))
#Mark moves operating on arcs towards nodes whose parental sets have
#changed for updating
moves$recompute[moves$to == node.to] <- TRUE
if(moveType == "reverse")
{moves$recompute[moves$to == node.from] <- TRUE}
#Mark this move for updating because its meaning has changed (e.g. if
#move was adding an arc, it is now an arc reversal)
moves$recompute[moves$from == node.to & moves$to == node.from] <- TRUE
#Some operations that lead to a Markov equivalent graph might no longer do
#so after the current operation was performed. (e.g. an edge reversal might
#now lead to a new v-structure, while it previously did not). This statement
#ensures that moves that used to lead to Markov equivalent graphs are
#checked. The converse effect (move now leading to equivalent graph) is less
#problematic, as it will just lead to a graph with the same score. It will
#not cycle as the reverse move will be recomputed after that iteration and
#its score difference will be valued at 0
moves$recompute[(moves$from == node.to | moves$from == node.from) &
abs(moves$score) < 1E-10] <- TRUE
#Update node scores and move score differences if required
if(recomputeMoves){
result <- recompute(dag = dag.postMove,
dat = dat,
penalty = penalty,
moves = moves,
node.scores = node.scores,
nodesToRescore = nodesToRescore,
cl = cl)
return(list(dag = dag.postMove, moves = result$moves,
nodeScores = result$nodeScores, queries = result$queries))
}
return(list(dag = dag.postMove, moves = moves))
}
#' Updates local scores and score differences marked for updating
#'
#' Recomputes score per node for a specified subset of nodes and score
#' difference due to moves for a marked subset of moves. If no moves dataframe
#' and node scores are provided, they are initialized and computed from scratch
#'
#' @param nodesToRescore character vector specifying which nodes should have
#' their scores updated. If not provided, scores are updated for all nodes
#' @inheritParams updateHC
#' @return Updated moves, node scores and number of queries used in computing
#' them
#' @export
#'
#' @examples
recompute <- function(dag, dat, penalty, moves = NULL, node.scores = NULL,
nodesToRescore = nodes(dag), cl, blacklist, whitelist){
#If moves and node scores have not been provided, initialize them
if(is.null(moves)){
moves <- initializeMoves(nodes(dag), blacklist, whitelist)
}
if(is.null(node.scores)){
node.scores <- vector("numeric", length(nodes(dag)))
names(node.scores) <- nodes(dag)
}
#Convert moves dataframe to matrix to ensure no whitespaces are added
#when apply does the conversion internally
moveMat <- sapply(moves, format, trim = TRUE)
#If no cluster is provided, recompute scores sequentially
if(is.null(cl)){
#Compute score per node
newScores <- sapply(nodesToRescore, computeScore.node, dag = dag,
dat = dat, penalty = penalty)
node.scores[nodesToRescore] <- newScores
#Update score difference for each move marked for recomputation.
moves$score[moves$recompute] <- apply(moveMat[moves$recompute,], 1,
evaluateScore,
dat = dat,
node.scores = node.scores,
dag = dag,
penalty = penalty)
}
#If a cluster is provided, do same as above but in parallel
else{
newScores <- parSapply(cl, nodesToRescore, computeScore.node, dag = dag,
dat = dat, penalty = penalty)
node.scores[nodesToRescore] <- newScores
moves$score[moves$recompute] <- parApply(cl, moveMat[moves$recompute,], 1,
evaluateScore,
dat = dat,
node.scores = node.scores,
dag = dag,
penalty = penalty)
}
no.queries <- length(nodesToRescore) + computeNoQueries(moves, dag)
#Mark all moves as updated
moves$recompute <- FALSE
return(list(moves = moves, nodeScores = node.scores, queries = no.queries))
}
#Computes number of queries involved in rescoring the moves dataframe for a
#given DAG. Each arc addition or deletion can be scored in one query, each
#reversal can be scored in two
computeNoQueries <- function(moves, dag){
return(sum(apply(moves[moves$recompute,], 1, function(move){
ifelse(findMoveType(move[1], move[2], dag) == "reverse", 2, 1)})))
}
#' Computes score change due to a move
#'
#' Compares the pre- and post-move score for nodes of which the parental set
#' changes due to the move. For arc addition or deletion this is the to node,
#' for arc reversal this is the to and the from node.
#'
#' @param move character vector containing to and from node for move
#' @param node.scores pre-move node scores
#' @inheritParams evaluateHC
#'
#' @return score difference due to move if move is valid and data is available
#' for each parental configuration. If new graph is cyclic, returns -Inf.
#' If there is no data available for some parental configuration, returns NaN
#' and prints a warning
#' @export
#'
evaluateScore <- function(move, dat, node.scores, dag, penalty){
#Obtain move type and DAG after applying move
node.from <- move[1]
if(!node.from %in% nodes(dag)) {node.from <- str_trim(node.from)}
node.to <- move[2]
if(!node.to %in% nodes(dag)) {node.to <- str_trim(node.to)}
moveType <- findMoveType(node.from, node.to, dag)
dag.postMove <- makeMove(node.from, node.to, dag, moveType)
#If move is illegal or leads to cycle in graph, score with negative infinity
if(class(dag.postMove) == "try-error"){return(-Inf)}
#Compute difference in score from changing parent set of to-node
newScore.to <- computeScore.node(node.to, dag = dag.postMove, dat = dat,
penalty = penalty)
scoreDiff <- newScore.to - node.scores[[node.to]]
#If the move is a reversal, also take into account change at from-node
if(moveType == "reverse"){
newScore.from <- computeScore.node(node.from, dag = dag.postMove, dat = dat,
penalty = penalty)
scoreDiff <- scoreDiff + newScore.from - node.scores[node.from]
}
return(scoreDiff)
}
#' Applies move to a DAG and returns new graph
#'
#' Whether the move is an addition, deletion or reversal depends on whether the
#' arc or its reverse is already present in the DAG
#'
#' @param node.from string specifying first node in move
#' @param node.to string specifying second node in move
#' @param moveType optional string specifying whether the move is an arc
#' addition ("add"), deletion ("delete") or reversal ("reverse")
#'
#' @return If move gives a DAG, it is returned, otherwise a "try-error" is
#' returned
#' @export
#'
#' @examples
makeMove <- function(node.from, node.to, dag,
moveType = findMoveType(node.from, node.to, dag)){
if(moveType == "add")
{try(set.arc(dag, node.from, node.to), silent = TRUE) %>% return(.)}
else if(moveType == "delete")
{drop.arc(dag, node.from, node.to) %>% return(.)}
else if(moveType == "reverse")
{try(reverse.arc(dag, node.from, node.to), silent = TRUE) %>% return(.)}
}
#' Determines whether a move is an arc addition, deletion or reversal for a
#' given DAG
#'
#' @inheritParams makeMove
#'
#' @return If no arc is present from node.from to node.to, returns "add". If an
#' arc is present, returns "delete". If an arc from node.to to node.from is
#' present, returns "reverse"
#' @export
#'
#' @examples
findMoveType <- function(node.from, node.to, dag){
if(node.from %in% parents(dag, node.to)){return("delete")}
else if(node.to %in% parents(dag, node.from)){return("reverse")}
else {return("add")}
}
#' Determines whether a given move leads to a valid DAG
#'
#' Checks whether a move does not lead to directed cycles. If a tabu list is
#' provided, it checks whether the new graph is not Markov equivalent to any
#' graph in the tabu list. If a dataset over nodes is provided, it checks
#' whether data is available for each parental configuration in the new graph.
#'
#' The last is not relevant during hill climbing or tabu search, as such moves
#' will have score -Inf, but it is relevant during random restarts to ensure
#' that we do not start with a graph with NaN score because data is unavailable
#' for some parental configuration for a node.
#'
#' @param tabuList optional tabu list of DAGs
#' @param dat optional dataframe to be used when checking whether data is
#' available for each parental configuration
#' @inheritParams evaluateScore
#'
#' @return TRUE if new graph meets conditions, FALSE otherwise
#'
#' @export
#'
#' @examples
checkIfAllowed <- function(move, dag, tabuList = list(), dat = NULL){
node.from <- move[1]
node.to <- move[2]
moveType <- findMoveType(node.from, node.to, dag)
proposedDag <- makeMove(node.from, node.to, dag, moveType)
#Check that the proposed graph is acyclic
if(class(proposedDag) == "try-error"){return(FALSE)}
#Check that data is available for all of the configurations of the proposed
#parental set. Otherwise, the likelikhood is undefined
if(!is.null(dat)){
proposedScore <- computeNAL.discrete(node.to, proposedDag, dat)
if(length(proposedScore) == 1 && is.nan(proposedScore)){return(FALSE)}
}
#Check that proposed graph is not Markov equivalent to any graph in the
#tabu list
return(!graphInList(proposedDag, tabuList))
}
#' Finds allowed move with largest associated score change
#'
#' @param moves dataframe of moves with updated scores
#' @inheritParams checkIfAllowed
#'
#' @return allowed move with largest associated score change
#'
#' @export
#'
#' @examples
findBestAllowedMove <- function(moves, dag, tabuList = list()){
if(any(moves$recompute))
{warning("Not all scores are up-to-date: found move potentially suboptimal")}
#Order score differences from largest to smallest
scoreOrder <- order(moves$score, decreasing = TRUE)
#Iterate from largest to smallest score and return first allowed move
for(moveIndex in scoreOrder)
{
currentMove <- as.character(moves[moveIndex, ])
if(checkIfAllowed(currentMove, dag, tabuList))
{return(currentMove)}
}
#This shouldn't occur for sane tabu list lengths
stop("No allowed moves found")
}
#' Checks whether a DAG is Markov equivalent to any graph in a given list
#'
#' @param dag DAG to compare to graphs in list
#' @param graphs list of graphs
#'
#' @return TRUE if given DAG is Markov equivalent to any graph in list, FALSE
#' otherwise
#' @export
#'
#' @examples
graphInList <- function(dag, graphs){
#Check whether two DAGs are in the same Markov equivalence class, allowing
#for either to be null (in which case the comparison returns false)
compareDAGs <- function(dag1 , dag2){
if(is.null(dag1)||is.null(dag2)){return(FALSE)}
else {return(all.equal(cpdag(dag1), cpdag(dag2)) == TRUE)}
}
#If list is empty, return false by default
if(length(graphs) < 1){return(FALSE)}
else{
comparison <- sapply(graphs, compareDAGs, dag2 = dag)
return(any(comparison))
}
}
#' Performs tabu search from some initial graph
#'
#' Given some initial graph, makes at most tabuSteps moves to graphs that are
#' not in a tabu list of length at most tabuLength. Each move is the best
#' allowed move. If any of the obtained graphs has strictly higher score than
#' the initial graph, the algorithm terminates and the new graph is returned
#'
#' If no better graph is found, NULL is returned in place of a graph. Any
#' function calling this routine should cope gracefully with this scenario
#'
#' @param tabuSteps maximum number of tabu search steps to perform
#' @param tabuLength maximum number of previously seen graphs to store
#' @param dag.start initial DAG
#' @inheritParams evaluateHC
#'
#' @return If a better DAG than the initial one is found, returns a list
#' containing the DAG, an updated moves dataframe and node scores vector and
#' the number of queries performed during the search procedure. If no better
#' graph is found, returns a list containing a NULL graph and the number of
#' queries
#'
#' @export
#'
#' @examples
tabuSearch <- function(tabuSteps, tabuLength, dag.start, dat, moves,
node.scores, penalty, cl){
tabuList <- vector(mode = "list", length = tabuLength)
no.queries.local <- 0
dag <- dag.start
#Record score of starting graph
bestScore <- sum(node.scores)
for(tabuStep in 1:tabuSteps){
#Add previous graph to tabu list
tabuList[[(tabuStep - 1)%%tabuLength + 1]] <- dag
#Find best scoring neighboring DAG that is not in tabu list
bestMove <- findBestAllowedMove(moves, dag, tabuList)
#Execute best move
postMove <- updateHC(bestMove,
dag = dag,
dat = dat,
moves = moves,
node.scores = node.scores,
penalty = penalty,
cl = cl)
dag <- postMove$dag
moves <- postMove$moves
node.scores <- postMove$nodeScores
no.queries.local <- no.queries.local + postMove$queries
#If score of new graph exceeds score of starting graph,
#return new graph
newScore <- sum(node.scores)
if(newScore > bestScore)
{return(list(dag = dag, moves = moves, nodeScores = node.scores,
queries = no.queries.local))}
}
#If no improvement has been found, signal this by returning
#null and the number of queries
return(list(dag = NULL, queries = no.queries.local))
}
#' Performs a series of random moves on a given DAG
#'
#' Given a DAG, performs a given number of random moves. Each random move is
#' checked to give a valid DAG with data available for each parental
#' configuration. After performing random moves, recomputes those scores that
#' need updating.
#'
#' As the initial DAG is often sparse and the algorithm randomly generates pairs
#' of nodes to determine the moves, it tends to favor arc additions.
#'
#' @param randomMoves number of random moves to do
#' @inheritParams updateHC
#'
#' @return list containing DAG after random moves, updated moves dataframe and
#' node score vector and number of queries used for score recomputation
#'
#' @export
#'
#' @examples
perturbGraph <- function(randomMoves, dag, dat, moves, penalty, cl){
node.names <- nodes(dag)
no.moves <- 0
dag.current <- dag
while(no.moves < randomMoves){
#Generate random move
move <- sample(node.names, 2, replace = FALSE)
#Check that move gives a valid graph
if(checkIfAllowed(move, dag.current, dat = dat)){
no.moves <- no.moves + 1
#Adapt graph and record which moves will need to be recomputed,
#but do not recompute at each iteration (do this once at the end)
postMove <- updateHC(move,
dag = dag.current,
dat = dat,
moves = moves,
node.scores = NULL,
penalty = penalty,
recomputeMoves = FALSE)
dag.current <- postMove$dag
moves <- postMove$moves
}
}
recomputed <- recompute(dag.current, dat, penalty, moves, cl = cl )
moves <- recomputed$moves
node.scores <- recomputed$nodeScores
no.queries <- recomputed$queries
return(list(dag = dag.current, moves = moves, nodeScores = node.scores,
queries = no.queries))
}
#' Impute data in parallel using a fitted Bayesian Network
#'
#' Distributes a dataframe with missing values over all but one of the clusters
#' on the current machine and uses likelihood weighted sampling from the
#' mutilated network to impute missing values by their posterior mean
#' (continuous) or mode (discrete).
#'
#' @param dag.fit Fitted Bayesian Network to use for data imputation