hcoset3.w
\def\mod{\mathop{mod}}
@s cubepos int
@s moveseq int
@s kocsymm int
@s permcube int
@s lookup_type int
@s map template
@s vector template
@s set template
@* Introduction.
This is a coset solver for Rubik's Cube, using the cosets generated
by the group $\{U,F2,R2,D,B2,L2\}$. If you have not read {\tt cubepos},
{\tt kocsymm}, and {\tt phase1prune2}, you should read those before you try
to understand this program.
@(hcoset3.cpp@>=
const char *BANNER =
"This is hcoset3 1.0, (C) 2010 Tomas Rokicki. All Rights Reserved." ;
#include "phase1prune2.h"
#include <cstdio>
#include <iostream>
#include <map>
#include <set>
#ifdef _WIN32
#include <xmmintrin.h>
inline void prefetch(void *p) { _mm_prefetch((const char *)p, _MM_HINT_T1); }
#else
inline void prefetch(void *p) { __builtin_prefetch(p); }
#endif
using namespace std ;
@<Data declarations@> @;
@<Utility functions@> @;
@<Threading objects@> @;
void docoset(int seq, const char *movestring) {
@<Handle one coset from |movestring|@> @;
}
int main(int argc, char *argv[]) {
double progstart = walltime() ;
duration() ;
@<Parse arguments@> @;
@<Initialize the program@> @;
@<Handle the work@> @;
@<Cleanup@> @;
phase1prune2::check_integrity() ;
cout << "Completed in " << (walltime() - progstart) << endl ;
}
@ The first thing we take up is argument parsing. Two arguments we
know we need up front are a verbosity level (the default is 1, but
the |-q| option makes it 0 and the |-v| option makes it 2) and a
thread count.
@<Data decl...@>=
int verbose = 1 ;
int numthreads = 1 ;
@ Parsing the arguments is boilerplate code.
@<Parse arguments@>=
int oargc = argc ;
char **oargv = argv ;
while (argc > 1 && argv[1][0] == '-') {
argc-- ;
argv++ ;
switch (argv[0][1]) {
case 'v': verbose++ ; break ;
case 'q': verbose = 0 ; break ;
case 't':
if (argc < 2)
error("! not enough arguments to -t") ;
if (sscanf(argv[1], "%d", &numthreads) != 1)
error("! bad thread count argument") ;
if (numthreads < 1 || numthreads > MAX_THREADS)
error("! bad value for thread count") ;
argc-- ;
argv++ ;
break ;
@<More arguments@> @;
default:
error("! bad argument") ;
}
}
@ Usually the pruning tables are read from disk; if they don't exist,
they are created, and then written to disk. If for some reason
you do not want to write the pruning tables to disk, you can use
the |-W| option to inhibit this.
@<Data...@>=
int skipwrite = 0 ;
@ Parsing this argument is easy.
@<More arguments@>=
case 'W': skipwrite++ ; break ;
@ This coset solver works by finding as many phase 1 solutions to
a particular group position as it can; each phase 1 solution leads
to a particular element of the coset. As soon as we have
discovered all possible positions, we are done.
We start by writing the simplest possible coset solver; we can use
this to verify results from faster, more optimized solvers.
The simplest possible solver just keeps the element information
in a |set|.
@<Data...@>=
moveseq repseq ;
set<permcube> world ;
kocsymm repkc ;
permcube reppc ;
cubepos repcp ;
#ifdef QUARTER
int qparity ;
#endif
@ We always start by printing the arguments and the banner.
@<Initialize the program@>=
if (verbose)
cout << BANNER << endl << flush ;
for (int i=0; i<oargc; i++)
cout << " " << oargv[i] ;
cout << endl ;
@ Handling a coset starts by parsing the move string. Note that
|phase1prune2| is automatically protected against multiple
initialization.
@<Handle one coset...@>=
int oldsingcount = singcount ;
const char *tmp = movestring ;
repseq = cubepos::parse_moveseq(tmp) ;
#ifdef QUARTER
qparity = repseq.size() & 1 ;
#endif
if (*tmp)
error("! extra stuff at end of input moveseq") ;
cout << "Coset representative " << seq << " " << movestring << endl ;
phase1prune2::init(skipwrite) ;
double cosetstart = walltime() ;
@ Next, we execute the move sequence on our representative |kocsymm|, |permcube|, and |cubepos|.
@<Handle one coset...@>=
repkc = identity_kc ;
reppc = identity_pc ;
repcp = identity_cube ;
for (unsigned int i=0; i<repseq.size(); i++) {
repkc.move(repseq[i]) ;
reppc.move(repseq[i]) ;
repcp.move(repseq[i]) ;
}
#ifdef LEVELCOUNTS
setup_levmul(repkc, repseq) ;
#endif
@ We want to support different levels of slowness. Level 0 is the
slowest, level 1 is faster, and level 2, the default, is the
fastest. In addition, we want to limit the maximum search depth
with an option; this gives us a program that actually terminates.
We also have a separate overall maximum depth, which we will
discuss later. Finally, we maintain a global variable that indicates
the depth we are currently exploring; this is only used by the
prepass.
@<Data...@>=
int slow = 2 ;
int maxsearchdepth = 35 ;
int maxdepth = 35 ;
int global_depth ;
@ We use the |-s| option to set the slowness.
@<More arguments@>=
case 's':
if (argc < 2)
error("! not enough arguments to -s") ;
if (sscanf(argv[1], "%d", &slow) != 1)
error("! bad -s argument") ;
if (slow < 0 || slow > 2)
error("! bad value for -s") ;
argc-- ;
argv++ ;
break ;
case 'd':
if (argc < 2)
error("! not enough arguments to -d") ;
if (sscanf(argv[1], "%d", &maxdepth) != 1)
error("! bad -d argument") ;
if (maxdepth < 0)
error("! bad value for -d") ;
if (maxdepth < maxsearchdepth)
maxsearchdepth = maxdepth ;
argc-- ;
argv++ ;
break ;
case 'S':
if (argc < 2)
error("! not enough arguments to -S") ;
if (sscanf(argv[1], "%d", &maxsearchdepth) != 1)
error("! bad -S argument") ;
if (maxsearchdepth < 0)
error("! bad value for -S") ;
argc-- ;
argv++ ;
break ;
@ If the |slow| value is zero, we use the slowest search.
@<Handle one coset...@>=
if (slow == 0)
slowsearch1(repkc, reppc) ;
@ The slow search routine is much like the one in the two-phase
solver.
@<Utility...@>=
void slowsearch1(const kocsymm &kc, const permcube &pc, int togo,
int movemask, int canon) {
if (togo == 0) {
if (kc == identity_kc) {
probes++ ;
world.insert(pc) ;
}
return ;
}
togo-- ;
kocsymm kc2 ;
permcube pc2 ;
int newmovemask ;
while (movemask) {
int mv = ffs1(movemask) ;
movemask &= movemask - 1 ;
kc2 = kc ;
kc2.move(mv) ;
int nd = phase1prune2::lookup(kc2, togo, newmovemask) ;
if (nd <= togo) {
pc2 = pc ;
pc2.move(mv) ;
int new_canon = cubepos::next_cs(canon, mv) ;
slowsearch1(kc2, pc2, togo,
newmovemask & cubepos::cs_mask(new_canon),
new_canon) ;
}
}
}
@ Finally, we do the search. We continue up to the maximum search depth
in this slow mode (with just this logic, it will effectively never finish). We put
the slow search logic in a subroutine.
@<Utility...@>=
void slowsearch1(const kocsymm &kc, const permcube &pc) {
duration() ;
for (int d=phase1prune2::lookup(repkc); d < maxsearchdepth; d++) {
probes = 0 ;
long long prevlev = uniq ;
slowsearch1(kc, pc, d, ALLMOVEMASK, CANONSEQSTART) ;
uniq = world.size() ;
long long thislev = uniq - prevlev ;
if (verbose)
cout << "Tests at " << d << " " << probes << " in " << duration()
<< " uniq " << uniq << " lev " << thislev << endl ;
}
}
@ Congratulations! At this point we have a coset solver!
The slow search runs slowly, but more importantly, it typically
runs out of memory rather quickly. To solve this, rather than using
a |set|, we can use a |vector| and sort it to calculate uniqueness.
But this would probably not gain more than a factor of three,
so we go ahead and take the next step---a big bitmap, one
bit per element of the coset. With this data structure, memory
usage, though large, is fixed.
The size of these cosets is 19,508,428,800 elements, so the bitmap
needs to be about 2.5GB in size. On a 32-bit platform, it is unlikely
we will be able to allocate that much memory contiguously.
Furthermore, because of some optimizations we will make a little
later, we will want two bitmaps.
The bitmaps will be indexed by corner permutation and edge
permutation. The corner permutation has a smaller range of $8!$, so
we use it to select a memory page. Each memory page must handle all
possible edge permutations; these have a range of $8!\cdot4!/2$ (the
division by two is because the corner and edge permutations must
have the same parity).
@<Data...@>=
const int FACT8 = 40320 ;
const int PAGESIZE = (FACT8 * FACT4 / 2 / 8) ;
const int PAGEGAP = ((PAGESIZE + 4095) & ~0xfff) ;
unsigned char *bitp1, *bitp2 ;
#define BITINDEX(a,b) (a+PAGEGAP*(long long)(b))
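As a quick check on the sizes above (a standalone sketch, not part of the
literate program; the names simply mirror |FACT8|, |FACT4|, and |PAGESIZE|):

#include <cstdio>
int main() {
  const long long FACT8 = 40320 ;                     // 8!
  const long long FACT4 = 24 ;                        // 4!
  long long cosetsize = FACT8 * FACT8 * FACT4 / 2 ;   // 19,508,428,800 elements
  long long bitmapbytes = cosetsize / 8 ;             // 2,438,553,600 bytes, about 2.4GB before page padding
  long long pagesize = FACT8 * FACT4 / 2 / 8 ;        // 60,480 bytes per corner-permutation page
  printf("%lld %lld %lld\n", cosetsize, bitmapbytes, pagesize) ;
  return 0 ;
}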
@ We must allocate these arrays at the start.
@<Handle one coset...@>=
if (bitp1 == 0 || bitp2 == 0) {
bitp1 = (unsigned char *)calloc(FACT8, PAGEGAP) ;
bitp2 = (unsigned char *)calloc(FACT8, PAGEGAP) ;
if (bitp1 == 0 || bitp2 == 0)
error("! no memory") ;
} else {
memset(bitp1, 0, FACT8*(size_t)PAGEGAP) ;
memset(bitp2, 0, FACT8*(size_t)PAGEGAP) ;
}
@ When we set a bit in the large bitmap, we need to ensure that it is
actually clear before we set it, in order to maintain the appropriate
count. We need to use a 64-bit value to hold this count. We also
define the target---how large each coset is.
@<Data...@>=
long long uniq = 0 ;
#ifdef QUARTER
long long puniq[2] ;
#endif
long long probes = 0 ;
const long long TARGET = FACT8 * (long long)FACT8 * (long long)FACT4/2 ;
#ifdef LEVELCOUNTS
long long uniq_ulev = 0 ;
long long sum_ulev[40] ;
#ifdef QUARTER
long long puniq_ulev[2] ;
#endif
#endif
#ifdef FASTCLEAN
unsigned char touched[FACT8] ;
int did_a_prepass ;
#endif
@ At the start of each coset, we initialize our counters and some
other variables. If we already have a page, we assume it is
cleared (our cleanup routine will ensure this).
@<Handle one coset...@>=
uniq = 0 ;
#ifdef QUARTER
puniq[0] = 0 ;
puniq[1] = 0 ;
#endif
#ifdef FASTCLEAN
memset(touched, 0, sizeof(touched)) ;
did_a_prepass = 0 ;
#endif
#ifdef LEVELCOUNTS
uniq_ulev = 0 ;
#ifdef QUARTER
puniq_ulev[0] = 0 ;
puniq_ulev[1] = 0 ;
#endif
#endif
@ The bit order of the lowest order bits will turn out to be
absolutely critical later on. To support this we introduce a pair of
arrays that translate a |FACT4| value to a bit offset and back.
@<Data...@>=
unsigned char permtobit[FACT4] ;
unsigned char bittoperm[FACT4] ;
@ For now we initialize them to just sequential order.
@<Initialize the program...@>=
for (int i=0; i<FACT4; i++) {
permtobit[i] = i ;
bittoperm[i] = i ;
}
@ Modern computers have processors so fast they can execute
hundreds of instructions during the time it takes to fetch a
single memory value. Here, in this one case, we can actually
defer the checking of the new bit that might be set until the
next time this routine is called, and issue a prefetch to the
memory address that it will need. Doing this makes a substantial
improvement in the overall performance of the program.
@<Data...@>=
unsigned char saveb ;
unsigned char *savep ;
@ We need a routine that takes a |permcube| and sets the appropriate
bit in the bitmap. We already have code that handles most of the
work for us. We actually check the previous bit, and set up for
the next bit. Note how we do as much work as possible before the
|flushbit()| call, to give the memory system as much time as possible
to fetch the data. This is one of the major keys to our excellent
search performance.
@<Utility...@>=
void flushbit() {
if (savep != 0) {
if (0 == (*savep & saveb)) {
*savep |= saveb ;
uniq++ ;
}
savep = 0 ;
}
}
void setonebit(const permcube &pc) {
// if (pc.em) error("! failed in setonebit with a bad pc em") ;
int cindex = (pc.c8_4*FACT4+pc.ctp)*FACT4+pc.cbp ;
unsigned int eindex = (((permcube::c12_8[pc.et]*FACT4) + pc.etp) * FACT4/2 +
(pc.ebp >> 1))*FACT4 + permtobit[pc.emp] ;
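// cindex selects one of the 8! corner-permutation pages; eindex selects one of the 8!*4!/2 bits within that page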
/*
if (cindex < 0 || cindex >= 40320)
error("! bad cindex") ;
if (eindex < 0 || eindex >= 483840)
error("! bad eindex") ;
*/
#ifdef FASTCLEAN
touched[cindex] = 1 ;
#endif
probes++ ;
flushbit() ;
savep = BITINDEX(bitp1, cindex) + (eindex >> 3) ;
prefetch(savep) ;
/*
static int sum = 0 ;
sum += *savep ;
*/
saveb = 1 << (eindex & 7) ;
}
@ With this written, the second search routine is very close to the
first. We also introduce a small optimization; we do not do the
final two lookups when there's only one more move to go because
we know they are unnecessary. The unrolling of the loop in this
fashion is another major key to our performance. Note that we
count on |ffs()| performing well; that's something to check on your
architecture.
@<Utility...@>=
void slowsearch2(const kocsymm &kc, const permcube &pc, int togo,
int movemask, int canon) {
if (togo == 0) {
if (kc == identity_kc)
setonebit(pc) ;
return ;
}
togo-- ;
kocsymm kc2 ;
permcube pc2 ;
int newmovemask ;
while (movemask) {
int mv = ffs1(movemask) ;
movemask &= movemask - 1 ;
kc2 = kc ;
kc2.move(mv) ;
int nd = phase1prune2::lookup(kc2, togo, newmovemask) ;
if (nd <= togo) {
pc2 = pc ;
pc2.move(mv) ;
int new_canon = cubepos::next_cs(canon, mv) ;
int movemask3 = newmovemask & cubepos::cs_mask(new_canon) ;
if (togo == 1) { // just do the moves.
permcube pc3 ;
while (movemask3) {
int mv2 = ffs1(movemask3) ;
movemask3 &= movemask3 - 1 ;
pc3 = pc2 ;
pc3.move(mv2) ;
setonebit(pc3) ;
}
} else {
slowsearch2(kc2, pc2, togo, movemask3, new_canon) ;
}
}
}
}
@ If the |slow| value is one, we use the next fastest search.
@<Handle one coset...@>=
if (slow == 1)
slowsearch2(repkc, reppc) ;
@ Our outer routine for the second level search is here.
@<Utility...@>=
void slowsearch2(const kocsymm &kc, const permcube &pc) {
duration() ;
for (int d=phase1prune2::lookup(repkc); d < maxsearchdepth; d++) {
probes = 0 ;
long long prevlev = uniq ;
slowsearch2(kc, pc, d, ALLMOVEMASK, CANONSEQSTART) ;
flushbit() ;
long long thislev = uniq - prevlev ;
if (verbose)
cout << "Tests at " << d << " " << probes << " in " << duration()
<< " uniq " << uniq << " lev " << thislev << endl ;
}
}
@ Many times coming up, we will need to store an edge coordinate
into a |permcube|. This routine makes it easy.
@<Utility...@>=
void unpack_edgecoord(permcube &pc, int e8_4, int epp1, int epp2) {
pc.et = permcube::c8_12[e8_4] ;
pc.etp = epp1 ;
pc.ebp = epp2 ;
pc.eb = kocsymm::epsymm_compress[0xf0f-kocsymm::epsymm_expand[pc.et]] ;
pc.em = 0 ;
}
void unpack_edgecoord(permcube &pc, int coord) {
unpack_edgecoord(pc, coord/(FACT4*FACT4), coord/FACT4%FACT4, coord%FACT4) ;
}
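Going the other way, the packing that |unpack_edgecoord| inverts would look
like this (a hypothetical helper, not used by the program; it assumes
|permcube::c12_8| undoes the |c8_12| mapping used above, as its other uses
here suggest):

int pack_edgecoord(const permcube &pc) {
  // rebuild coord = (e8_4*FACT4 + epp1)*FACT4 + epp2 from the fields set by unpack_edgecoord
  return (permcube::c12_8[pc.et] * FACT4 + pc.etp) * FACT4 + pc.ebp ;
}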
@* Prepass.
We've got a tremendously fast search routine at this point, and efficient
storage for the coset. There is one more major trick in our arsenal,
one that expands the capabilities of this coset solver tremendously.
That technique is the prepass.
Once a position has a phase 1 solution (that is, it is in the group $H$),
it has a tendency to stay there; 10 of the 18 possible moves leave it in
the coset. Most canonical sequences that are a solution to phase one
end in a move in $H$. The prepass allows us to handle all sequences
ending in $H$, in a pass over the bitmap, without needing to explicitly
search. This speeds up our search by a factor of two, typically, and
for some cosets, like the trivial coset, speeds it up by a much larger
factor. For instance, for the trivial coset, at distance 12, there are
16,019,916,192 canonical sequences that solve phase 1, but only
329,352,128 of those, or about one in fifty, end in a move not in $H$.
This means a huge speedup for some cosets, and a good speedup for other
cosets.
Use of the prepass also permits us to calculate bounds on the overall
distance of the coset without finding optimal solutions for every single
position.
The prepass works by taking the current set of found solutions, and
extending each one by all ten moves in $H$, and adding those to the
set. That's why we declared |bitp2| above; this is to hold a second
copy of the bitmap, so we can maintain a separate previous and
next bitmap during the prepass.
For proving a bound of 20, the prepass is the key operation in the
coset solver, and the one that will take up the bulk of the time.
The trick to doing this efficiently is to lay out the bits in memory
strategically, so that almost all of the work can be done with
simple logical operations and a lookup table or two.
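Expressed against the slow |set<permcube>| solver, the idea is simply this
(a sketch only; the real prepass below works on the bitmap and never
materializes such a set):

set<permcube> prepass_sketch(const set<permcube> &cur) {
  set<permcube> next = cur ;
  for (set<permcube>::const_iterator it=cur.begin(); it != cur.end(); it++)
    for (int mv=0; mv<NMOVES_EXT; mv++) {
      if (!kocsymm::in_Kociemba_group(mv))
        continue ;               // only extend by the moves that stay in H
      permcube pc = *it ;
      pc.move(mv) ;
      next.insert(pc) ;
    }
  return next ;
}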
For checking, we may want to disable the prepass.
@<Data...@>=
int disable_prepass = 0 ;
@ The |-U| option disables the prepass.
@<More arguments@>=
case 'U': disable_prepass++ ; break ;
@ For the first time, it makes logical sense to define symbolic names
for some of the moves, so we do that here.
@<Utility...@>=
const int U1 = 0 ;
#ifndef QUARTER
const int U2 = 1 ;
#endif
const int U3 = TWISTS - 1 ;
const int F1 = TWISTS ;
const int R1 = 2 * TWISTS ;
const int D1 = 3 * TWISTS ;
const int D3 = 4 * TWISTS - 1 ;
const int B1 = 4 * TWISTS ;
const int L1 = 5 * TWISTS ;
#ifdef SLICE
const int J1 = 7 * TWISTS ;
const int K1 = 8 * TWISTS ;
#endif
@ The layout is already implicit in the |setonebit| routine above, although
we have not finished all the code we need, nor have we set up the
|bittoperm| and |permtobit| arrays appropriately.
Let's start with the least significant bits, the |emp| bits; these
give the mapping of middle edge permutations to bit locations.
For a given set of corner and up/down edge permutations, there are
twelve possible middle edge permutations that preserve parity.
Six of the ten moves in $H$ do not affect the middle edge
permutation, so for those six moves, we can just copy the middle
edge bits over; the assignment of bits to middle edge permutations
is not affected by those six moves. For the remaining four
moves (F2, R2, B2, L2), every move jumbles the bits in a specific way.
We assign the bits to the permutations and vice versa in a way such
that all the even permutations are in the low order bits, and such
that the odd permutation bit assignment of the 12 is that from
the corresponding even permutation after the move F2. The following
code does the work for us.
To make this work properly in the quarter-turn metric too, we actually
use a pair of quarter-turn twists in place of the F2 move. Because all
of this only happens during initialization, performance is not an
issue.
@<Initialize the program...@>=
permcube pc ;
for (int i=0; i<FACT4/2; i++) {
permtobit[2*i] = i ;
pc.emp = 2 * i ;
pc.move(F1) ;
pc.move(F1) ;
permtobit[pc.emp] = 12 + i ;
}
for (int i=0; i<FACT4; i++)
bittoperm[permtobit[i]] = i ;
@ For the remaining moves, we have to calculate the rearrangements
of the 12 bits that can happen (for both even and odd values).
@<Data...@>=
#ifdef SLICE
const int SQMOVES = 5 ;
#else
const int SQMOVES = 3 ;
#endif
short rearrange[2][SQMOVES][1<<12] ;
@ Initializing these is just a slog through the possibilities. We
first set all the single-bit values, and then we combine them.
@<Initialize the program...@>=
#ifdef SLICE
const int mvs[] = { R1, B1, L1, J1, K1 } ;
#else
const int mvs[] = { R1, B1, L1 } ;
#endif
for (int mvi=0; mvi<SQMOVES; mvi++)
for (int p=0; p<FACT4; p++) {
pc.emp = p ;
pc.move(mvs[mvi]) ;
pc.move(mvs[mvi]) ;
rearrange[p&1][mvi][1<<(permtobit[p]%12)] =
1<<(permtobit[pc.emp]%12) ;
}
for (int p=0; p<2; p++)
for (int mvi=0; mvi<SQMOVES; mvi++)
for (int i=1; i<(1<<12); i++) {
int lowb = i & - i ;
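// lowb is the lowest set bit of i; the table entry for i is the OR of its single-bit entries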
rearrange[p][mvi][i] =
rearrange[p][mvi][lowb] | rearrange[p][mvi][i-lowb] ;
}
@ It turns out that, with all the choices we have made (all of which were
deterministic), the values for B2 are the same going forwards or
backwards. We take advantage of that to reduce cache misses in the
inner loop, but we check this still holds here. In the slice turn
metric, this is also true for J2.
@<Initialize...@>=
for (int i=0; i<(1<<12); i++)
if (rearrange[0][1][i] != rearrange[1][1][i])
error("! mismatch in rearrange") ;
#ifdef SLICE
for (int i=0; i<(1<<12); i++)
if (rearrange[0][3][i] != rearrange[1][3][i])
error("! mismatch in rearrange") ;
#endif
@ For the next most significant bits, we use the up/down edge
permutation. We use a lookup array to perform this mapping. For
indexing, we drop the least significant bit of the up/down
edge mapping, since it can be reconstructed from the parity of
the other permutation components. We take advantage of the fact
that consecutive pairs of permutations indexed by $(2k,2k+1)$,
when right-multiplied by any element of $S_4$, yield another
pair, either $(2j,2j+1)$ or $(2j+1,2j)$.
(See |kocsymm| for more information.) (This is also true for
groups of 6, which help make this algorithm particularly
cache-friendly.) Note that this array is reasonably large, but
we access it sequentially. We premultiply by 3 to get an actual
page offset. Our pages are $8!*3/2$ or 60,480 bytes long, so
these indices barely fit into an unsigned short.
@<Data...@>=
#ifdef SLICE
const int PREPASS_MOVES = 15 ;
#endif
#ifdef HALF
const int PREPASS_MOVES = 10 ;
#endif
#ifdef QUARTER
const int PREPASS_MOVES = 8 ;
#endif
unsigned short eperm_map[FACT8/2][PREPASS_MOVES] ;
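A quick check of the claim that these premultiplied indices fit in an
|unsigned short| (a standalone sketch, not part of the program):

const int CELLS = 40320 / 2 ;            // 8!/2 up/down edge coordinates per page
const int BYTES = CELLS * 3 ;            // each owns 24 middle-edge bits: 60,480 bytes, matching PAGESIZE
const int MAXINDEX = (CELLS - 1) * 3 ;   // 60,477, which indeed fits in an unsigned short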
@ We iterate through the moves in sequential order. For the
slice turn metric and the half turn metric this is the same
order, but for the quarter turn metric we move the "faked"
half moves to the end of the array, and thus they show up in a
different order. These constants are used in the inner
loop to help us manage this issue.
@<Data...@>=
#ifdef QUARTER
const int ILU1 = 0 ;
const int ILU3 = 1 ;
const int ILF2 = 4 ;
const int ILR2 = 5 ;
const int ILD1 = 2 ;
const int ILD3 = 3 ;
const int ILB2 = 6 ;
const int ILL2 = 7 ;
#else
const int ILU1 = 0 ;
const int ILU2 = 1 ;
const int ILU3 = 2 ;
const int ILF2 = 3 ;
const int ILR2 = 4 ;
const int ILD1 = 5 ;
const int ILD2 = 6 ;
const int ILD3 = 7 ;
const int ILB2 = 8 ;
const int ILL2 = 9 ;
#ifdef SLICE
const int ILI1 = 10 ;
const int ILI2 = 11 ;
const int ILI3 = 12 ;
const int ILJ2 = 13 ;
const int ILK2 = 14 ;
#endif
#endif
@ The initialization is long but straightforward.
@<Initialize...@>=
int ind = 0 ;
for (int e8_4=0; e8_4<C8_4; e8_4++)
for (int epp1=0; epp1<FACT4; epp1++)
for (int epp2=0; epp2<FACT4; epp2 += 2, ind++) {
int mvi = 0 ;
for (int mv=0; mv<NMOVES_EXT; mv++) {
if (!kocsymm::in_Kociemba_group(mv))
continue ;
unpack_edgecoord(pc, e8_4, epp1, epp2) ;
pc.move(mv) ;
eperm_map[ind][mvi] =
((permcube::c12_8[pc.et]*FACT4+pc.etp)*FACT4+pc.ebp)/2*3 ;
mvi++ ;
}
}
@* The inner loop.
We are ready now for the innermost loop of the program, the one that
accounts for probably two-thirds of the runtime in our effort to
prove 20. Examine the assembly generated by your compiler for this
routine very carefully. The inner loop should have about 50
instructions, straight line code; if you see anything that could
be improved, change it here.
Input to this function is a bit complicated. We have a destination
page we are going to write (but surprisingly, not read). We have
a set of ten source pages, one for each of the moves in $H$,
from which we will read bits, transform them into the new locations
in our destination page, and write them back. Finally, we have
the base, which is the portion of the pages to do. This routine
does $12\cdot 24\cdot 24$ bits from each page, which corresponds to
864 bytes from each page. One full execution of this routine
does 69,120 group multiplies. The inner body is executed 288 times
on each call, and each inner body execution does 240 group
multiplies.
The code below uses unaligned reads and writes. When I was
originally writing this code, I used a bunch of byte reads and
writes, but it turned out to be significantly faster to just
use unaligned memory accesses. On modern Intel processors,
there is almost no penalty for unaligned reads and writes. On
other processors, this code may perform substantially
suboptimally.
Note that this routine smashes one byte past the end of each page.
For this reason, we allocate each page one byte larger than it
really needs to be. The code restores the smashed value at the
end.
This routine does not read the source. With the exception of the
empty sequence, every sequence that ends in the trivial $H$ group
either ends with a move in the $H$ group, or ends with one of the
eight moves F1, F3, R1, R3, B1, B3, L1, L3. These eight moves always
occur in pairs, as do the sequences that use them; any sequence ending
in F1 whose result lies in $H$ has a matching sequence ending in F3
whose result also lies in $H$. What this means is that for every position that has
already been found, there is a matching position also already found
that is either F2, R2, B2, or L2 away. So, except for the trivial
coset and the empty sequence, we never need to consider the source
bitmap in the inner loop; just considering the ten adjacent bitmaps
always suffices and never loses any bits.
@<Utility...@>=
void innerloop3(unsigned char *dst, unsigned char **srcs, int base) {
dst += 3 * base ;
unsigned char *end = dst + 12*24*3 ;
unsigned char tval = *end ; // save this byte
unsigned short *cpp = eperm_map[base] ;
for (; dst<end; dst += 3, cpp+=PREPASS_MOVES) {
int wf2 = *(int *)(srcs[ILF2]+cpp[ILF2]) ;
int wr2 = *(int *)(srcs[ILR2]+cpp[ILR2]) ;
int wb2 = *(int *)(srcs[ILB2]+cpp[ILB2]) ;
int wl2 = *(int *)(srcs[ILL2]+cpp[ILL2]) ;
#ifdef SLICE
int wj2 = *(int *)(srcs[ILJ2]+cpp[ILJ2]) ;
int wk2 = *(int *)(srcs[ILK2]+cpp[ILK2]) ;
#endif
*(int *)dst =
(( (wf2 & 0xfff) | // F2, even to odd
rearrange[0][0][wr2 & 0xfff] | // R2, even to odd
rearrange[0][1][wb2 & 0xfff] | // B2, even to odd
#ifdef SLICE
rearrange[0][3][(wj2 >> 12) & 0xfff] | // J2, odd to odd
rearrange[1][4][(wk2 >> 12) & 0xfff] | // K2, odd to odd
#endif
rearrange[0][2][wl2 & 0xfff]) << 12) | // L2, even to odd
((wf2 >> 12) & 0xfff) | // F2, odd to even
rearrange[1][0][(wr2 >> 12) & 0xfff] | // R2, odd to even
rearrange[0][1][(wb2 >> 12) & 0xfff] | // B2, odd to even
#ifdef SLICE
rearrange[0][3][wj2 & 0xfff] | // J2, even to even
rearrange[0][4][wk2 & 0xfff] | // K2, even to even
#endif
rearrange[1][2][(wl2 >> 12) & 0xfff] | // L2, odd to even
#ifdef SLICE
*(int *)(srcs[ILI1]+cpp[ILI1]) | // I1
*(int *)(srcs[ILI2]+cpp[ILI2]) | // I2
*(int *)(srcs[ILI3]+cpp[ILI3]) | // I3
#endif
*(int *)(srcs[ILU1]+cpp[ILU1]) | // U1
#ifndef QUARTER
*(int *)(srcs[ILU2]+cpp[ILU2]) | // U2
#endif
*(int *)(srcs[ILU3]+cpp[ILU3]) | // U3
*(int *)(srcs[ILD1]+cpp[ILD1]) | // D1
#ifndef QUARTER
*(int *)(srcs[ILD2]+cpp[ILD2]) | // D2
#endif
*(int *)(srcs[ILD3]+cpp[ILD3]) ; // D3
}
*end = tval ; // restore smashed value
}
@ After updating a page we want to count the bits set. This
routine does a fairly good job. This can probably be replaced
with some intrinsics that might do a better job.
@<Utility...@>=
int countbits(unsigned int *a) {
int r = 0 ;
const unsigned int mask1 = 0x55555555 ;
const unsigned int mask2 = 0x33333333 ;
const unsigned int mask3 = 0x0f0f0f0f ;
for (int i=0; i<PAGESIZE; i += 24) {
unsigned int w1 = *a++ ;
unsigned int w2 = *a++ ;
unsigned int w3 = *a++ ;
w1 = (w1 & mask1) + ((w1 >> 1) & mask1) + (w2 & mask1) ;
w2 = ((w2 >> 1) & mask1) + (w3 & mask1) + ((w3 >> 1) & mask1) ;
unsigned int s1 = (w1 & mask2) + ((w1 >> 2) & mask2) +
(w2 & mask2) + ((w2 >> 2) & mask2) ;
s1 = (s1 & mask3) + ((s1 >> 4) & mask3) ;
w1 = *a++ ;
w2 = *a++ ;
w3 = *a++ ;
w1 = (w1 & mask1) + ((w1 >> 1) & mask1) + (w2 & mask1) ;
w2 = ((w2 >> 1) & mask1) + (w3 & mask1) + ((w3 >> 1) & mask1) ;
unsigned int s2 = (w1 & mask2) + ((w1 >> 2) & mask2) +
(w2 & mask2) + ((w2 >> 2) & mask2) ;
s1 += (s2 & mask3) + ((s2 >> 4) & mask3) ;
r += 255 & ((s1 >> 24) + (s1 >> 16) + (s1 >> 8) + s1) ;
}
return r ;
}
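As noted above, intrinsics may do better; here is a minimal sketch assuming
a GCC- or Clang-style |__builtin_popcount| (not part of the program, and untuned):

int countbits_builtin(unsigned int *a) {
  int r = 0 ;
  for (int i=0; i<PAGESIZE; i += 4)   // PAGESIZE is in bytes; consume one 32-bit word per step
    r += __builtin_popcount(*a++) ;
  return r ;
}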
@ Sometimes, like when we are doing level counting, we need to
do a bit count as above, but we also need to calculate a weighted
sum. This routine handles that; it is slower than the above
routine, but it should only very rarely be called. It depends
on an array containing bit counts.
@<Data...@>=
#ifdef LEVELCOUNTS
unsigned char bc[1<<12] ;
#endif
@ Setting up the bit count array is easy.
@<Initialize...@>=
#ifdef LEVELCOUNTS
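// i&(i-1) clears the lowest set bit, so bc[i] ends up holding the population count of i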
for (int i=1; i<(1<<12); i++)
bc[i] = 1+bc[i&(i-1)] ;
#endif
@ Finally, we have our alternative bitcount function. This does
nasty things with pointers; it will, for instance, only work on a
little-endian machine (just like our prepass).
@<Utility...@>=
int parity(int coord) {
return (permcube::c8_4_parity[coord/(FACT4*FACT4)]^(coord/24)^coord) & 1 ;
}
#ifdef QUARTER
int thispagestatic(int coperm) {
return (parity(coperm) ^ global_depth ^ qparity) & 1 ;
}
#endif
#ifdef LEVELCOUNTS
int countbits2(int cperm, int *a, int &rv2) {
int coparity = parity(cperm) ;
int r = 0, r2 = 0 ;
int ind = 0 ;
for (int e8_4=0; e8_4<C8_4; e8_4++) {
int p2 = coparity ^ permcube::c8_4_parity[e8_4] ;
for (int epp1=0; epp1<FACT4; epp1++) {
int off1 = (p2 ^ epp1) & 1 ;
int off2 = 1 - off1 ;
for (int epp2=0; epp2<FACT4; epp2 += 2, ind += 2) {
int w = *a ;
int v1 = bc[(w & 0xfff)] ;
int v2 = bc[((w >> 12) & 0xfff)] ;
r += v1 + v2 ;
r2 += v1 * levmul[ind+off1] + v2 * levmul[ind+off2] ;
a = (int *)(((char *)a) + 3) ;
}
}
}
rv2 = r2 ;
return r ;
}
#endif
@ For even more cache-friendliness, we do sets of related pages as a
group. These pages are typically related by the up and down face
moves. To store data about every collection of pages, we use the
following structure, which contains pointers to the pages, as well
as a mapping giving the order in which to process the various offsets.
@<Utility...@>=
struct elemdata {
unsigned char *dst ;
unsigned char *from[PREPASS_MOVES] ;
unsigned char e84map[70] ;
} ;
@ The number of pages in a collection is defined by this constant.
@<Data...@>=
const int STRIDE = 16 ;
@ The ordering of pages to solve was defined by an external
optimizer, which we do not go into here. Suffice it to say that
|cornerorder| contains a permutation of the integers $0\ldots 40319$;
any order will work, but some orders may cause this program to
use a fair bit less memory. The output of that optimizer is in an
include file named |corner_order.h|, and its values are stored
in an array called |cornerorder|.
@<Data...@>=
#include "corner_order.h"
@ The following routine computes the neighbors of a corner.
@<Utility...@>=
void calcneighbors(int cperm, int *a) {
permcube pc, pc2 ;
pc.c8_4 = cperm / (FACT4 * FACT4) ;
pc.ctp = cperm / FACT4 % FACT4 ;
pc.cbp = cperm % FACT4 ;
for (int mv=0; mv<NMOVES_EXT; mv++) {
if (!kocsymm::in_Kociemba_group(mv))
continue ;
pc2 = pc ;
pc2.move(mv) ;
*a++ = (pc2.c8_4 * FACT4 + pc2.ctp) * FACT4 + pc2.cbp ;
}
}
@ Our main routine to do a collection of |STRIDE| pages is below. The
magic array |moveseq16| is the sequence of moves that generates the
relevant values in a single group of |cornerorder|; we check that
this holds.
@<Utility...@>=
int moveseq16[] = { U3, D1, U1, D1, U3, D1, U1, U1,
U1, D3, U3, D3, U1, D3, U3, U1 } ;
unsigned char initorder[70] ;
void doouter(int r) {
elemdata edata[16] ;
int neighbors[PREPASS_MOVES] ;
int tcperm = cornerorder[r] ;
permcube pc, pc2 ;
#ifdef QUARTER
int startparity = thispagestatic(cornerorder[r]) ;
#endif
for (int i=0; i<STRIDE; i++) {