-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathautomaton_e.txt
648 lines (557 loc) · 22.9 KB
/
automaton_e.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
CODE TRANSFORMATION AND FINITE AUTOMATONS
-----------------------------------------
There are such interesting object as finite automaton.
This automaton has some internal states (just such variables),
so when they changed, some action is performed.
So, there is set of actions, and each action is performed
when some automaton's state is changed to some specified value,
and there are matrix(es), which defines state changes.
Here is simple task:
to convert string like "1,3-7,9-100,105" into binary form.
Lets consider implementation, based on finite automaton.
Lets introduce single internal state S, and its values will be:
0 - before first (before '-') number
1 - inside of first number
2 - before second (after '-') number
3 - inside of second number
4 - end of string (after last char)
5 - syntax error
so, program will look as follows:
c - pointer to current character
*c - current character's value
A - action #
change char A action: situation:
0-->1 0..9 1 l=*c-'0'; new pair, got digit
0-->5 other new pair, got NOT a digit
1-->0 , 2 store(l,l); single number, ends with ','
1-->1 0..9 3 l=l*10+*c-'0' adding new digit to first number
1-->2 - two numbers, 1st one ends with '-'
1-->4 eoln 2 store(l,l); single number, ends with eoln
1-->5 other unavailable char within first number
2-->3 0..9 4 h=*c-'0'; two numbers, got digit after '-'
2-->5 other two numbers, got NOT a digit after '-'
3-->0 , 5 store(l,h); two numbers, ends with ','
3-->3 0..9 6 h=h*10+*c-'0'; adding new digit to second number
3-->4 eoln 5 store(l,h); two number, ends with eoln
3-->5 other unavailable char within first number
4--> 7 exit(); eoln, all ok
5--> 8 error(); syntax error
so, matrix which describes state changes and corresponding actions,
will look as follows:
next state:
c 0 1 2 3 4 5
u s action #:
r t 0 - 1 - - - 2
r a 1 3 4 5 - 3 2
e t 2 - - - 6 - 2
n e 3 7 - - 8 7 2
t 4 - - - - - -
5 - - - - - -
Simple source in C will look as follows:
int S = 0;
char*c = "1,3-7,9-100,105";
for(;;)
{
switch(S)
{
case 0:
if ( isdigit(*c) ) { S=1; l=*c-'0'; } else
{ S=5; };
break;
case 1:
if ( *c == ',' ) { S=0; store(l,l); } else
if ( isdigit(*c) ) { S=1; l=l*10+*c-'0'; } else
if ( *c == '-' ) { S=2; } else
if ( *c == 0 ) { S=4; store(l,l); } else
{ S=5; };
break;
case 2:
if ( isdigit(*c) ) { S=3; h=*c-'0'; } else
{ S=5; };
break;
case 3:
if ( *c == ',' ) { S=0; store(l,h); } else
if ( isdigit(*c) ) { S=3; h=h*10+*c-'0'; } else
if ( *c == 0 ) { S=4; store(l,h); } else
{ S=5; };
break;
case 4:
exit();
break;
case 5:
error();
break;
}
c++;
}
Now, if we will make source listed above a bit more low-level,
and remove state-variables, we will get the following:
x:
x0:
if ( isdigit(*c) ) { l=*c-'0'; c++; goto x1; } else
{ c++; goto x5; };
x1:
if ( *c == ',' ) { store(l,l); c++; goto x0; } else
if ( isdigit(*c) ) { l=l*10+*c-'0'; c++; goto x1; } else
if ( *c == '-' ) { goto x2; } else
if ( *c == 0 ) { store(l,l); c++; goto x4; } else
{ goto x5; };
x2:
if ( isdigit(*c) ) { h=*c-'0'; c++; goto x3; } else
{ goto x5; };
x3:
if ( *c == ',' ) { store(l,h); c++; goto x0; } else
if ( isdigit(*c) ) { h=h*10+*c-'0'; c++; goto x3; } else
if ( *c == 0 ) { store(l,h); c++; goto x4; } else
{ goto x5; };
x4: exit();
x5: error();
goto x
Now, if somebody will try to reverse this code into classical C
constructions, such as for, while, if-else, there will remain many
redundand goto(jmp) commands, and if number of them will be big enough,
it will be very hard to understand what this code does.
Execution graph for such code will differ from execution graph
for classical code by larger amount of crossed links.
To reverse this code into its source form,
you should select constant code blocks, enum them, then introduce
states, and only after that it will be possible to continue
understanding this code. But this looks as a hard task, because
while optimization, original code blocks can be divided into parts,
mixed and merged.
Now, lets consider a bit more complicated source:
int T[256] =
{
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
// , - 0 1 2 3 4 5 6 7 8 9
0,0,0,0,0,0,0,0,0,0,0,0,2,3,0,0, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
for(;;)
{
int A;
switch( S )
{
case 0:
switch( T[*c] )
{
case 1: { S=1; A=1; }; break;
default: { S=5; A=2; }; break;
}
break;
case 1:
switch( T[*c] )
{
case 2: { S=0; A=2; }; break;
case 1: { S=1; A=3; }; break;
case 3: { S=2; }; break;
case 4: { S=4; A=2; }; break;
case 0: { S=5; }; break;
}
break;
case 2:
switch( T[*c] )
{
case 1: { S=3; A=4; }; break;
default: { S=5; }; break;
}
break;
case 3:
switch( T[*c] )
{
case 2: { S=0; A=5; }; break;
case 1: { S=3; A=6; }; break;
case 4: { S=4; A=5; }; break;
default: { S=5; }; break;
}
break;
case 4:
A = 7;
break;
case 5:
A = 8;
break;
}
switch( A )
{
case 1: { l=*c-'0'; c++; }; break;
case 2: { store(l,l); c++; }; break;
case 3: { l=l*10+*c-'0'; c++; }; break;
case 4: { h=*c-'0'; c++; }; break;
case 5: { store(l,h); c++; }; break;
case 6: { h=h*10+*c-'0'; c++; }; break;
case 7: { exit(); }; break;
case 8: { error(); }; break;
}
}
As you can see, this sample is written without comparison commands,
and conditional jmps; everything is based on switch'es which is
equivalent to tables.
In such form our program is a cycle, consisting of three parts:
in first part, external data is analyzed (here it is string of chars),
and converted into some internal variables (here it is shown
indirectly using matrix T[]), then state changes are calculated
(here it is first switch), and at the same time action # is
calculated (here it is once again first switch and variable A),
and in third part, some action is performed (here it is 2nd switch).
And here is interesting moment: now, instead of program, we have
action #'s (indexes of code blocks),
and sequence of these indexes is the core of our program.
Now lets consider the following question: how next action # is generated.
It is generated depending on current automaton state values and some
other data.
This means that we can use tables, which will tell us which state is
next, and which action should be executed on such state change.
Such tables can be converted into a function.
Requirements for this function are simple:
argument of this function is a number, which contains current state values,
and probably some identifier, and result of this function is a number,
which contains next state values, probably another identifier,
and action index.
As such, our program will just iterate the following cycle:
1. Call main function, passing state values as argument;
then extract new state values and action index from the result.
2. Execute action by given index,
which will probably change state values too.
Lets consider the following sample:
States are register values, from EAX to EDI, and action number
is an instruction value, padded with NOP's or RETN.
Then, ALL main part our program will be a ...function.
Lets initially pass to function:
EAX = 2, ECX = 3, other regs = 0, command is NOP.
So, argument will be:
00000000...00000003000000020000C390
<--EDI->...<--ECX-><--EAX-><--cmd->
Lets function result then will be:
00000000...00000003000000020000C341
<--EDI->...<--ECX-><--EAX-><--cmd->
which means same state values, and command = inc ecx.
So, we execute given command, and ecx state is changed,
and then we pass into a function this argument:
00000000...00000004000000020000C341
<--EDI->...<--ECX-><--EAX-><--cmd->
and so on.
For sure, argument and result values can also contain
unique instruction id, in case when some opcodes are duplicated.
Interesting fact is that our function can indirectly encode
many instructions, such as jumps and arithmetic commands.
The only commands can not be encoded are commands modifying
memory, working with other external devices and api-calls.
So, the question appears: how to build such function.
There can be many variants, from simple tables up to neural networks.
In the sample given above, it is impossible, for sure,
because there are many registers, and they can have many
values, so function will be so big so it will not fit into
existing computers.
So, lets consider simplified sample.
Argument of function is just a current state number.
Result is WORD.
High result byte is first opcode byte.
Low result byte is state number,
and low state bit is value of ZF flag.
As such, we can indirectly encode JZ/JNZ commands.
Also, state number is equal to action index.
Program will just encrypt/decrypt some asciiz text string.
Source:
ACTIONS: STATES/ACTIONS AND STATE CHANGES:
NZ ZR
xormsg: -1 --> 00 1ST BYTE:
xor edx, edx 00 --> 02 01 --> 02 33
cycle1: lea esi, msg 02 --> 04 03 --> 04 BE
mov ecx, msg_size 04 --> 06 05 --> 06 B9
cycle2: xor [esi], dh 06 --> 08 07 --> 08 30
sub dh, dl 08 --> 10 09 --> 10 2A
inc esi 10 --> 12 11 --> 12 46
dec ecx 12 --> 06 13 --> 14 49
jnz cycle2
inc dl 14 --> 16 15 --> 16 FE
cmp dl, 'Z' 16 --> 02 17 --> 18 80
jne cycle1
retn 18
Here is a function, defined by arguments and results:
Argument Result
f( -1 ) = 0x3300
f(0x00) = 0xBE02
f(0x01) = 0xBE02
f(0x02) = 0xB904
f(0x03) = 0xB904
f(0x04) = 0x3006
f(0x05) = 0x3006
f(0x06) = 0x2A08
f(0x07) = 0x2A08
f(0x08) = 0x460A
f(0x09) = 0x460A
f(0x0A) = 0x490C
f(0x0B) = 0x490C
f(0x0C) = 0x3006
f(0x0D) = 0xFE0E
f(0x0E) = 0x8010
f(0x0F) = 0x8010
f(0x10) = 0xBE02
f(0x11) = 0x0012
Now lets generate a function.
Let it be polynomial.
f(X) = SUMi( C[i]*X^i ), i=0..18
Here X is function argument, which, as we decided, is current state,
and function result contains next state and its first opcode byte.
Simple program to calculate coefficients is here: see (1).
So, now we have everything to implement function:
Here it is:
float calc(float X)
{
float y = 0;
for(int j=0; j<N; j++)
y += pow(X,j) * C[j];
return y;
}
Or, in more nice form:
N equ 19
go_next_state: pushf ; IN/OUT: EBX=current/next state
pusha
finit
fild dword ptr [esp+4*4] ; pusha.ebx
push N
pop ecx
lea edx, C_table
fldz ; st(1)
fld1 ; st(0)
__c1: fld st(0)
fld tbyte ptr [edx]
fmulp
faddp st(2),st(0)
fmul st(0),st(2)
add edx, 10
loop __c1
fistp dword ptr [esp+4*4] ; pusha.ebx
fistp dword ptr [esp+4*4] ; pusha.ebx
popa
popf
retn
C_table label tbyte
dt 4.864199999999999986e+04 ; C[ 0]
dt 1.052028658321440037e+06 ; C[ 1]
dt -2.226893544084362929e+06 ; C[ 2]
dt 1.054917653361728822e+06 ; C[ 3]
dt 1.030581898921544049e+06 ; C[ 4]
dt -1.641889337850409245e+06 ; C[ 5]
dt 1.056816179457771135e+06 ; C[ 6]
dt -4.226269487577827341e+05 ; C[ 7]
dt 1.179126264336835917e+05 ; C[ 8]
dt -2.418954943314347526e+04 ; C[ 9]
dt 3.749247680199179089e+03 ; C[10]
dt -4.446691826539333110e+02 ; C[11]
dt 4.044639078866551414e+01 ; C[12]
dt -2.801020107772053989e+00 ; C[13]
dt 1.450610807381378830e-01 ; C[14]
dt -5.436999378694686739e-03 ; C[15]
dt 1.391774067561251339e-04 ; C[16]
dt -2.174850123595773034e-06 ; C[17]
dt 1.563443629925163634e-08 ; C[18]
Since first opcode byte is encoded within high byte of function result,
it will be defined in our program as ?:
.data
msg db 'Hello world!',0
msg_size equ $-msg
start:
call xormsg
push 0
push offset msg
push offset msg
push 0
callW MessageBoxA
call xormsg
push 0
push offset msg
push offset msg
push 0
callW MessageBoxA
push -1
callW ExitProcess
xormsg:
xor ebx, ebx ; initial state = -1
dec ebx
jmp begin
cycle: pushf ; copy ZF into bit 0 of EBX
push eax ;
lahf ;
shr ah, 6 ; ZF ;
and ah, 1 ;
and bl, 0FEh ;
or bl, ah ;
pop eax ;
popf ;
begin:
call go_next_state ; get next state
cmp bl, 18 ; exit if last state is reached
jae quit
push eax ecx ; take 1st opcode byte from
mov cl, bh ; function result, and patch code
mov bh, 0
mov eax, jtab[ebx*4]
mov [eax], cl
pop ecx eax
jmp jtab[ebx*4] ; go to action
quit: retn
jtab label dword ; table of actions
; index = action # = state #
dd s00,s01
dd s02,s03
dd s04,s05
dd s06,s07
dd s08,s09
dd s10,s11
dd s12,s13
dd s14,s15
dd s16,s17
s00:
s01: ; xor edx, edx
db ?,0D2h
jmp cycle
s02:
s03: ; lea esi, msg
db ?
dd offset msg
jmp cycle
s04:
s05: ; mov ecx, msg_size
db ?
dd msg_size
jmp cycle
s06:
s07: ; xor [esi], dh
db ?,36h
jmp cycle
s08:
s09: ; sub dh, dl
db ?,0F2h
jmp cycle
s10:
s11: ; inc esi
db ?
jmp cycle
s12:
s13: ; dec ecx
db ?
jmp cycle
s14:
s15: ; inc dl
db ?,0C2h
jmp cycle
s16:
s17: ; cmp dl, 'Z'
db ?,0FAh,'Z'
jmp cycle
Now, to understate logic of this compiled program, you should calculate
all function results for all state values, then build table of state
changes, and insert JZ/JNZ into corresponding program places.
After that, if program is NOT builded using method shown in the
1st part of this article, i.e. not based on finite automaton conception,
it will be possible to reverse it into classical C constructions, such as
if, for, while.
Now, a bit of theory.
All these code transformation can be applied to mostly all
blocks of existing code, i.e. any linear code can be converted
into finite automaton, and then state change table can be
converted into a function.
Lets assume, that variable state of automaton's state is not ZF flag,
as it show above, but AL register value.
This means, that to extract all information encoded in our
function, it is required to iterate 256 variants.
If this would be AX register, then, for example on checking
two first bytes of some file for MZ sign, current state will be
changed into one state in 65534 cases, and into another state
in 2 cases. The bigger argument size is, the more hardly
is to iterate it; and in some cases some state changes will be
lost, and part of code, which, for example, follows MZ-check
will not be extracted.
Ideally, such function is a black box, which has some value
on input, and outputs some other value, but extract all the
information is impossible.
Well, enough words has been said, and i'll go drink some beer,
while you will think about all that crap. I hope, it will help
you somewhere.
--------------------------------------------------------------------------------
Appendix (1) - program to calculate polynomial coefficients.
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <io.h>
#include <math.h>
#pragma hdrstop
#define float long double
int N = 19;
float X[100] = {-1,0,2,4,6, 8,10,12,14,16,1,3,5,7, 9,11,13,15,17 };
float Y[100] = {
0+0x3300,
2+0xBE00,
4+0xB900,
6+0x3000,
8+0x2A00,
10+0x4600,
12+0x4900,
6+0x3000,
16+0x8000,
2+0xBE00,
2+0xBE00,
4+0xB900,
6+0x3000,
8+0x2A00,
10+0x4600,
12+0x4900,
14+0xFE00,
16+0x8000,
18+0x0000 };
float C[100];
float calc(float X)
{
float y = 0;
for(int j=0; j<N; j++)
y += pow(X,j) * C[j];
return y;
}
void init()
{
float* Z = new float[ N*(N+1) ];
assert(Z);
#define Zyx(y,x) Z[y*(N+1)+x]
for(int y=0; y<N; y++)
{
for(int x=0; x<N; x++)
Zyx(y,x) = pow(X[y], x);
Zyx(y,N) = Y[y];
}
for(int n=0; n<N-1; n++)
for(int y=n+1; y<N; y++)
{
float t = Zyx(y,n) / Zyx(n,n);
for(int x=0; x<=N; x++)
Zyx(y,x) -= Zyx(n,x) * t;
}
for(int n=N-1; n>=0; n--)
{
float s = 0;
for(int t=n+1; t<=N-1; t++)
s += Zyx(n,t) * C[t];
C[n] = (Zyx(n,N) - s) / Zyx(n,n);
}
delete Z;
for(int i=0; i<N; i++)
assert(abs(calc(X[i]) - Y[i]) < 0.0001);
}
void main()
{
init();
for(int n=0; n<N; n++)
printf("f(%5.2Lf) (%02X) = %5.2Lf (%04X)\n", X[n],
(int)ceil(X[n]), Y[n], (int)ceil(Y[n]));
printf("f(X) = SUMi( C[i]*X^i ), i=0..%i\n", N-1);
for(int n=0; n<N; n++)
printf("dt %30.19Le ; C[%2i]\n", C[n], n);
}
--------------------------------------------------------------------------------