diff --git a/build.rs b/build.rs index c7b01355..60f369bf 100644 --- a/build.rs +++ b/build.rs @@ -12,7 +12,6 @@ fn main() { link_args_basm.push("/NODEFAULTLIB"); link_args_basm.push("/DYNAMICBASE"); link_args_basm.push("/ENTRY:_start"); - link_args_basm.push("/BASE:0x0"); link_args_basm.push("/NXCOMPAT:NO"); link_args_basm.push("/STACK:268435456"); link_args_basm.push("/EMITTOOLVERSIONINFO:NO"); diff --git a/scripts/build-and-judge.py b/scripts/build-and-judge.py index e5f81b9a..f828b600 100644 --- a/scripts/build-and-judge.py +++ b/scripts/build-and-judge.py @@ -67,7 +67,7 @@ def test_equal(x, y): # Build the project to generate the source code try: - p = subprocess.run([build_cmd], shell=True, capture_output=True, text=True) + p = subprocess.run([build_cmd], shell=True, capture_output=True, text=True, encoding="utf8") if p.returncode != 0: raise Exception("Build failed. The stderr:\n{0}".format(p.stderr)) source_code = p.stdout diff --git a/scripts/static-pie-elf2bin.py b/scripts/static-pie-elf2bin.py index 550684cc..7cb885d3 100644 --- a/scripts/static-pie-elf2bin.py +++ b/scripts/static-pie-elf2bin.py @@ -232,13 +232,18 @@ def load_elf32(elf): memory_bin = memory_bin[pos_begin:] entrypoint_offset -= pos_begin + # Patch the entrypoint + # We look for: + # 0: f8 clc + # and replace it with: + # 0: f9 stc + # This works for both i686 and amd64. + assert memory_bin[entrypoint_offset:entrypoint_offset+1] == b"\xf8" + memory_bin[entrypoint_offset:entrypoint_offset+1] = b"\xf9" + with open(binary_path, "wb") as f: f.write(bytes(memory_bin)) fdict = {} - fdict['leading_unused_bytes'] = pos_begin fdict['entrypoint_offset'] = entrypoint_offset - fdict['pe_image_base'] = 0 - fdict['pe_off_reloc'] = 0 - fdict['pe_size_reloc'] = 0 print(json.dumps(fdict)) # callers of this script can capture stdout to get this value \ No newline at end of file diff --git a/scripts/static-pie-gen.py b/scripts/static-pie-gen.py index e49430cc..c4aadcae 100644 --- a/scripts/static-pie-gen.py +++ b/scripts/static-pie-gen.py @@ -27,28 +27,33 @@ compressed_binary_path = binary_path + ".lzma" elf2bin = subprocess.check_output([sys.executable, "scripts/static-pie-elf2bin.py", elf_path, binary_path]).decode("utf-8") loader_fdict = json.loads(elf2bin) -assert 'leading_unused_bytes' in loader_fdict assert 'entrypoint_offset' in loader_fdict -assert 'pe_image_base' in loader_fdict -assert 'pe_off_reloc' in loader_fdict -assert 'pe_size_reloc' in loader_fdict # Please refer to the following link for the lzma file format: # https://svn.python.org/projects/external/xz-5.0.3/doc/lzma-file-format.txt +# However, we use a different format: +# [ 0, 1) = (1 << pb) - 1 +# [ 1, 2) = (1 << lp) - 1 +# [ 2, 3) = lc +# [ 3, 4) = lp + lc + 8 +# [ 4, 8) = Uncompressed size +# [ 8, ..) = Compressed data without the leading byte with open(binary_path, "rb") as f: memory_bin = f.read() # Embed these information into the LZMA file to reduce the generated code length - x = loader_fdict['pe_image_base'].to_bytes(8, byteorder='little') + \ - loader_fdict['pe_off_reloc'].to_bytes(8, byteorder='little') + \ - loader_fdict['pe_size_reloc'].to_bytes(8, byteorder='little') + \ - loader_fdict['entrypoint_offset'].to_bytes(8, byteorder='little') + x = loader_fdict['entrypoint_offset'].to_bytes(8, byteorder='little') memory_bin += x -lzma_filter = {'id': lzma.FILTER_LZMA1, 'preset': lzma.PRESET_EXTREME, 'lp': 0, 'lc': 0, 'pb': 2, 'dict_size': 1 << 22} -compressed_memory_bin = lzma.compress(memory_bin, format=lzma.FORMAT_RAW, filters=[lzma_filter]) -lzma_header_properties = ((lzma_filter['pb'] * 5 + lzma_filter['lp']) * 9 + lzma_filter['lc']).to_bytes(1, byteorder='little') -lzma_header_dictionary_size = lzma_filter['dict_size'].to_bytes(4, byteorder='little') -lzma_header_uncompressed_size = len(memory_bin).to_bytes(8, byteorder='little') -compressed_memory_bin = lzma_header_properties + lzma_header_dictionary_size + lzma_header_uncompressed_size + bytes(compressed_memory_bin) +lzma_filter = {'id': lzma.FILTER_LZMA1, 'preset': lzma.PRESET_EXTREME, 'lp': 0, 'lc': 0, 'pb': 0, 'dict_size': 1 << 22, 'depth': 200} +compressed_memory_bin = bytearray(lzma.compress(memory_bin, format=lzma.FORMAT_RAW, filters=[lzma_filter])) +while len(compressed_memory_bin) < 4: + compressed_memory_bin += b'\x00' # append zeros for byte order swap (this won't happen in almost all cases, though) +compressed_memory_bin = compressed_memory_bin[1:] # strip the (redundant) leading zero byte of the LZMA stream +compressed_memory_bin[:4] = reversed(compressed_memory_bin[:4]) # perform byte order swap in advance + +pb, lp, lc = lzma_filter['pb'], lzma_filter['lp'], lzma_filter['lc'] +lzma_header_properties = ((((1 << pb) - 1) + ((1 << lp) - 1) << 8) + (lc << 16) + ((lp + lc + 8) << 24)).to_bytes(4, byteorder='little') +lzma_header_uncompressed_size = len(memory_bin).to_bytes(4, byteorder='little') +compressed_memory_bin = lzma_header_properties + lzma_header_uncompressed_size + bytes(compressed_memory_bin) with open(compressed_binary_path, "wb") as f: f.write(compressed_memory_bin) @@ -90,6 +95,10 @@ # stub with open(stub_path, "rb") as f: stub = f.read() +if lang_name == "Rust" and "x86_64" in target_name: + with open(stub_path.replace("stub-amd64", "prestub-amd64-2"), "rb") as f: + prestub2 = f.read() + stub = prestub2 + stub stub_b91 = base91.encode(stub).decode('ascii') stub_b91_len = len(stub_b91) @@ -129,10 +138,6 @@ def multiple_replace(string, rep_dict): "$$$$binary_base91$$$$": code_b91, "$$$$binary_base91_len$$$$": str(code_b91_len), "$$$$min_len_4096$$$$": str(min(len(code_b85)+1, 4096)), - "$$$$leading_unused_bytes$$$$": str(loader_fdict['leading_unused_bytes']), "$$$$entrypoint_offset$$$$": str(loader_fdict['entrypoint_offset']), - "$$$$pe_image_base$$$$": str(loader_fdict['pe_image_base']), - "$$$$pe_off_reloc$$$$": str(loader_fdict['pe_off_reloc']), - "$$$$pe_size_reloc$$$$": str(loader_fdict['pe_size_reloc']), }) print(out) \ No newline at end of file diff --git a/scripts/static-pie-pe2bin.py b/scripts/static-pie-pe2bin.py index 682540bd..00c6b666 100644 --- a/scripts/static-pie-pe2bin.py +++ b/scripts/static-pie-pe2bin.py @@ -15,8 +15,13 @@ print(f"Usage: {sys.argv[0]} pe_path binary_path", file=sys.stderr) sys.exit(1) + ''' + We relocate the PE image to the base address 0 (ImageBase=0), + regardless of the original ImageBase (which is usually 0x1_4000_0000) + This simplifies the loader code. + ''' pe = pefile.PE(pe_path) - memory_bin = bytearray(pe.get_memory_mapped_image()) + memory_bin = bytearray(pe.get_memory_mapped_image(ImageBase=0)) needed = bytearray(len(memory_bin)) pos_begin = len(memory_bin) pos_end = 0 @@ -40,16 +45,39 @@ if needed[i] == 0: memory_bin[i] = 0 memory_bin = memory_bin[pos_begin:pos_end] + entrypoint_offset = pe.OPTIONAL_HEADER.AddressOfEntryPoint - pos_begin if reloc_sz > 0: reloc_off = len(memory_bin) memory_bin += reloc_bin + reloc_off = 0 if reloc_sz == 0 else pos_begin + reloc_off + + # Patch the entrypoint + # We look for: + # 0: f8 clc + # and replace it with: + # 0: f9 stc + # This works for both i686 and amd64. + assert memory_bin[entrypoint_offset:entrypoint_offset+1] == b"\xf8" + memory_bin[entrypoint_offset:entrypoint_offset+1] = b"\xf9" + + # Patch the relocation offset and size (which is in _start) + # We look for: + # 0: be 78 56 34 12 mov esi,0x12345678 <- replaced with reloc_off + # 5: ba 78 56 34 12 mov edx,0x12345678 <- replaced with reloc_sz + template = b"\xbe\x78\x56\x34\x12\xba\x78\x56\x34\x12" + reloc_patched = False + for i in range(entrypoint_offset, len(memory_bin) - len(template)): + if memory_bin[i:i+len(template)] == template: + memory_bin[i+1:i+5] = reloc_off.to_bytes(4, byteorder='little') + memory_bin[i+6:i+10] = reloc_sz.to_bytes(4, byteorder='little') + reloc_patched = True + break + assert reloc_patched, "Failed to incorporate the relocation information into the binary. Please report this error." + + # Write to file with open(binary_path, "wb") as f: f.write(bytes(memory_bin)) fdict = {} - fdict['leading_unused_bytes'] = pos_begin - fdict['entrypoint_offset'] = pe.OPTIONAL_HEADER.AddressOfEntryPoint - pos_begin - fdict['pe_image_base'] = pe.OPTIONAL_HEADER.ImageBase - fdict['pe_off_reloc'] = 0 if reloc_sz == 0 else pos_begin + reloc_off - fdict['pe_size_reloc'] = reloc_sz + fdict['entrypoint_offset'] = entrypoint_offset print(json.dumps(fdict)) # callers of this script can capture stdout to get this value \ No newline at end of file diff --git a/scripts/static-pie-prestub-amd64-2.asm b/scripts/static-pie-prestub-amd64-2.asm new file mode 100644 index 00000000..37d6d242 --- /dev/null +++ b/scripts/static-pie-prestub-amd64-2.asm @@ -0,0 +1,24 @@ +; -*- tab-width: 4 -*- +; +; The prestub 2 for amd64-rust target +; (prestub 2: the code that runs after the prestub but before the stub and sets the stage) +; +; build: nasm -f bin -O9 static-pie-prestub-amd64-2.asm -o static-pie-prestub-amd64-2.bin + +BITS 64 +ORG 0 +section .text + +; Decode binary (rsi -> rdi) + push rdi + push r14 + pop rsi ; rsi = BINARY_BASE91 + push rsi + pop rdi ; rdi = BINARY_BASE91 (in-place decoding) + push rdi + call rbx + +; Prepare for stub + pop rdx ; rdx = LZMA-compressed binary + pop rdi + lea rcx, qword [rsp+40] ; rcx = PLATFORM_DATA table \ No newline at end of file diff --git a/scripts/static-pie-prestub-amd64-2.bin b/scripts/static-pie-prestub-amd64-2.bin new file mode 100644 index 00000000..bb864151 --- /dev/null +++ b/scripts/static-pie-prestub-amd64-2.bin @@ -0,0 +1 @@ +WAV^V_WZ_HL$( \ No newline at end of file diff --git a/scripts/static-pie-prestub-amd64-print.py b/scripts/static-pie-prestub-amd64-print.py index 84adaeb8..de11e5eb 100644 --- a/scripts/static-pie-prestub-amd64-print.py +++ b/scripts/static-pie-prestub-amd64-print.py @@ -63,7 +63,8 @@ def to_hex_short(y): # convert the table part table_part = table_part.decode('ascii') table_part = table_part.replace('{', '{{').replace('}', '}}').replace('$', '\\\\x24').replace('\0','\\\\0') -out.append(" \"{0}\\\"{1}\\\"\",\n".format(".asciz" if asciz else ".ascii", table_part)) +if asciz or len(table_part) > 0: + out.append(" \"{0}\\\"{1}\\\"\",\n".format(".asciz" if asciz else ".ascii", table_part)) # print the result print("".join(out)) \ No newline at end of file diff --git a/scripts/static-pie-prestub-amd64.asm b/scripts/static-pie-prestub-amd64.asm index d0f91ab1..e2fa7cba 100644 --- a/scripts/static-pie-prestub-amd64.asm +++ b/scripts/static-pie-prestub-amd64.asm @@ -12,100 +12,40 @@ ORG 0 section .text ; Align stack to 16 byte boundary -; [rsp+ 32, rsp+144): PLATFORM_DATA +; [rsp+ 32, rsp+120): PLATFORM_DATA ; [rsp+ 0, rsp+ 32): (shadow space for win64 calling convention) - push rbx - enter 80, 0 - and rsp, 0xFFFFFFFFFFFFFFF0 - -; PLATFORM_DATA - xchg rax, rcx - push rax ; PLATFORM_DATA[24..31] = win_GetProcAddress - push rcx ; PLATFORM_DATA[16..23] = win_kernel32 - xor edx, edx - test rax, rax - sete dl ; Enable ENV_FLAGS_LINUX_STYLE_CHKSTK outside Windows - push rdx ; PLATFORM_DATA[ 8..15] = env_flags (0=None, 1=ENV_FLAGS_LINUX_STYLE_CHKSTK) - inc edx - push rdx ; PLATFORM_DATA[ 0.. 7] = env_id (1=Windows, 2=Linux) - sub rsp, 32 ; shadow space - -; Allocate memory for stub - lea rsi, [rel _svc_alloc_rwx] ; Register svc_alloc_rwx - test rax, rax - jz _u - lea rdx, [rsi + _VirtualAlloc - _svc_alloc_rwx] - call rax ; after the call, rax = pointer to VirtualAlloc -_u: - push rax - pop rbx ; rbx = pointer to VirtualAlloc - push 1 - pop rcx ; rcx = 1 -> will be rounded up to the nearest page size, which is 0x1000 (4K) - call rsi ; svc_alloc_rwx - -; Copy svc_alloc_rwx to the new buffer -; Current state: rax = new buffer, rbx = pointer to VirtualAlloc, rsi = svc_alloc_rwx - mov qword [rsp+56+32], rax ; PLATFORM_DATA[56..63] = ptr_alloc_rwx (on the new buffer) - xchg rax, rdi ; rdi = new buffer - mov ax, 0xB848 ; mov rax, STRICT QWORD imm64 - stosw - xchg rax, rbx ; rax = pointer to VirtualAlloc - stosq - push _svc_alloc_rwx_end - _svc_alloc_rwx - pop rcx - rep movsb ; this progresses rsi to _decode - push rsi - pop rbx ; rbx = _decode - push rdi - push r14 - -; Decode stub (rsi -> rdi) -; Current state: rdi = stub memory - mov rsi, r13 ; rsi = STUB_BASE91 - call rbx - -; Decode binary (rsi -> rdi) - pop rsi ; rsi = BINARY_BASE91 - push rsi - pop rdi ; rdi = BINARY_BASE91 (in-place decoding) - push rdi - call rbx - -; Call stub - pop rdx ; rdx = LZMA-compressed binary - pop rax ; rax = stub entrypoint - lea rcx, qword [rsp+32] ; rcx = PLATFORM_DATA table - call rax - leave - pop rbx - jmp _end_of_everything + enter 48, 0 + jnc _s + push rcx ; Linux: align stack on 16-byte boundary +_s: sbb ecx, ecx + neg ecx ; Enable ENV_FLAGS_LINUX_STYLE_CHKSTK outside Windows + call _t ; svc_alloc_rwx for Windows and Linux ; rcx = size -; rax = pointer to VirtualAlloc (must be supplied before prepending the mov instruction) +; rdi = pointer to VirtualAlloc (must be supplied before prepending the mov instruction) _svc_alloc_rwx: - test rax, rax + push 9 + pop rax ; syscall id of x64 mmap + jecxz _decode + cdq ; rdx=0 + xor r9d, r9d ; offset + test rdi, rdi jz _svc_alloc_rwx_linux _svc_alloc_rwx_windows: - push rcx - pop rdx ; size - xor ecx, ecx + xchg ecx, edx ; rcx=0 / rdx=tsize mov r8d, 0x3000 ; MEM_COMMIT | MEM_RESERVE - push 0x40 - pop r9 ; PAGE_EXECUTE_READWRITE - jmp rax ; kernel32!VirtualAlloc + mov r9b, 0x40 ; PAGE_EXECUTE_READWRITE + jmp rdi ; kernel32!VirtualAlloc _svc_alloc_rwx_linux: push rsi ; save rsi - mov al, 9 ; syscall id of x64 mmap (safe since we have ensured rax=0) - xor edi, edi +; xor edi, edi ; rdi=0 (already ensured) mov esi, ecx ; size - push 7 - pop rdx ; protect + mov dl, 7 ; protect (safe since we have ensured rdx=0) push 0x22 pop r10 ; flags push -1 pop r8 ; fd - xor r9d, r9d ; offset syscall pop rsi ; restore rsi _ret: @@ -114,8 +54,7 @@ _svc_alloc_rwx_end: ; Base91 decoder _decode: - push 0x1f - pop rax + mov al, 0x1f _decode_loop: shl eax, 13 lodsb @@ -134,12 +73,33 @@ _decode_output: jnz _decode_output jmp _decode_loop -align 8, db 0 +; PLATFORM_DATA +_t: ; PLATFORM_DATA[32..39] = ptr_alloc_rwx + pop rbx + push rbx + push rdx ; PLATFORM_DATA[24..31] = win_GetProcAddress + push rax ; PLATFORM_DATA[16..23] = win_kernel32 + push rcx ; PLATFORM_DATA[ 8..15] = env_flags (0=None, 1=ENV_FLAGS_LINUX_STYLE_CHKSTK) + inc ecx + push rcx ; PLATFORM_DATA[ 0.. 7] = env_id (1=Windows, 2=Linux) + sub rsp, 32 ; shadow space + call rbx ; svc_alloc_rwx + +; Current state: rax = new buffer, rdi = pointer to VirtualAlloc + push rax + push rdi + xchg rax, rdi ; rdi = new buffer + +; Decode stub (rsi -> rdi) +; Current state: rdi = stub memory (by the previous instruction) +; rsi = STUB_BASE91 (by the Rust template) + xor ecx, ecx + call rbx + +; Call stub (it will perform the below operations) + pop rdi ; rdi = pointer to VirtualAlloc + pop rax + call rax ; This will jump to the start of the new buffer (stub) + leave -_VirtualAlloc: - db "VirtualAlloc" - db 0 -_kernel32: - db "kernel32" - db 0 -_end_of_everything: \ No newline at end of file +align 8, db 0x90 ; nop \ No newline at end of file diff --git a/scripts/static-pie-prestub-amd64.bin b/scripts/static-pie-prestub-amd64.bin index 67fd2cfb..4977d25a 100644 Binary files a/scripts/static-pie-prestub-amd64.bin and b/scripts/static-pie-prestub-amd64.bin differ diff --git a/scripts/static-pie-stub-amd64.asm b/scripts/static-pie-stub-amd64.asm index ae8549e2..c7b23fdd 100644 --- a/scripts/static-pie-stub-amd64.asm +++ b/scripts/static-pie-stub-amd64.asm @@ -51,48 +51,44 @@ LOC _state, 8 %define Temp rbp +; Does not touch rdi until we call svc_alloc_rwx _start: - push rbp + enter 32, 0 ; shadow space push rdx ; LZMA binary - pop rbp ; "mov rbp, rdx": rbp is preserved upon function calls + pop rsi ; "mov rsi, rdx": rsi is preserved upon function calls push rcx ; PLATFORM_DATA table pop rbx ; "mov rbx, rcx": rbx is preserved upon function calls - sub rsp, 32 ; shadow space - movzx eax, byte [rdx + 0] ; al = pb*45 + lp*9 + lc - cdq ; edx = 0 - push 45 - pop rcx - div ecx ; eax = pb, edx = lp*9 + lc - xor edi, edi - bts edi, eax - lea r13, [rdi-1] ; r13 = (1 << pb) - 1 - xchg eax, edx ; eax = lp*9 + lc - cdq ; edx = 0 - mov cl, 9 - div ecx ; eax = lp, edx = lc - lea ecx, [rax + rdx + 8] - mov r15, rdx ; r15 = lc - cdq ; edx = 0 - bts edx, eax - lea r14, [rdx-1] ; r14 = (1 << lp) - 1 + xor eax, eax + lodsb + mov r13, rax ; r13 = (1 << pb) - 1 + lodsb + mov r14, rax ; r14 = (1 << lp) - 1 + lodsb + mov r15, rax ; r15 = lc + lodsb + mov ecx, eax ; rcx = lp + lc + 8 mov al, 3 shl eax, cl add eax, 2048 - xchg rax, r12 ; r12 = tsize (always a multiple of 256) + xchg rax, r12 ; r12 = tsize (always a multiple of 256) - mov rcx, qword [rbp + 5] ; svc_alloc_rwx: size of memory - call qword [rbx + 56] ; allocate the Dest memory + lodsd + xchg eax, ecx ; svc_alloc_rwx: size of memory + call qword [rbx + 32] ; allocate the Dest memory push rax ; Save rax = Dst xchg rax, r9 ; r9 = Dst - lea r8, [rbp + 18] ; r8 = Src + 18 - mov esi, dword [rbp + 14] - bswap esi ; esi = initial 32 bits of the stream - ; Note: the first byte of the LZMA stream is always the zero byte (ignored) + lodsd + mov r8, rsi ; r8 = Src + 12 + xchg eax, esi ; esi = initial 32 bits of the stream + ; Note: the first byte of the LZMA stream is always the zero byte (ignored), + ; but it is stripped by the packager and does not exist here. + ; Also, the byte swap is also done by the packager. push rbx ; Save rbx + push rbp ; Save rbp lea rdi, [rsp - 2] sub rsp, r12 sub rsp, r12 @@ -354,17 +350,10 @@ _copy: _end: _code_end: lea rsp, [rsp + 2*r12 + 48] ; Restore rsp - pop rax ; rax = PLATFORM_DATA table - lea rsi, [Dest - 32] - lea rdi, [rax + 32] - push 24 - pop rcx - rep movsb - xchg rax, rcx ; rcx = PLATFORM_DATA table - pop rax ; Restore rax = Dst - add rax, qword [rsi] ; Add entrypoint offset - inc byte [rax + 1] ; Change 'push 0' to 'push 1' - add rsp, 32 - pop rbp + pop rbp ; Restore rbp + pop rcx ; rcx = PLATFORM_DATA table + pop rax ; rax = start of the binary + add rax, qword [Dest - 8] ; add entrypoint offset + leave jmp rax ; Jump to the entrypoint of the binary ; (it will inherit the current stackframe) \ No newline at end of file diff --git a/scripts/static-pie-stub-amd64.bin b/scripts/static-pie-stub-amd64.bin index b1f07cdf..085deb0d 100644 Binary files a/scripts/static-pie-stub-amd64.bin and b/scripts/static-pie-stub-amd64.bin differ diff --git a/scripts/static-pie-stub-i686.asm b/scripts/static-pie-stub-i686.asm index 461a97c2..789ff067 100644 --- a/scripts/static-pie-stub-i686.asm +++ b/scripts/static-pie-stub-i686.asm @@ -67,25 +67,12 @@ _start: sub esp, 40 mov esi, dword [ebp + 12] movzx eax, byte [esi + 0] - xor edx, edx - xor ecx, ecx - mov cl, 45 - div ecx - xor ebx, ebx - bts ebx, eax - dec ebx - mov dword [esp + 20], ebx ; [esp + 20] = (1 << pb) - 1 - mov eax, edx - xor edx, edx - mov cl, 9 - div ecx ; eax = lp, edx = lc - lea ecx, [eax + edx + 8] - xor ebx, ebx - bts ebx, eax - dec ebx - mov dword [esp + 16], ebx ; [esp + 16] = (1 << lp) - 1 - mov dword [esp + 12], edx ; [esp + 12] = lc - + mov dword [esp + 20], eax + movzx eax, byte [esi + 1] + mov dword [esp + 16], eax + movzx eax, byte [esi + 2] + mov dword [esp + 12], eax + movzx ecx, byte [esi + 3] mov al, 3 shl eax, cl add eax, 2048 @@ -93,10 +80,10 @@ _start: mov ebx, dword [ebp + 8] ; ebx = PLATFORM_DATA table mov esi, dword [ebp + 12] - mov edi, dword [esi + 5] ; edi = decompressed size of payload + mov edi, dword [esi + 4] ; edi = decompressed size of payload sub esp, 12 push edi ; svc_alloc_rwx: size of memory - call dword [ebx + 56] ; allocate the Dest memory + call dword [ebx + 32] ; allocate the Dest memory add esp, 16 mov dword [esp + 4], eax ; [esp + 4] = Dest mov dword [esp + 32], eax ; [esp + 32] = Dest @@ -106,17 +93,18 @@ _start: sub esp, 8 push 1 ; svc_alloc: alignment (required by Rust) push edi ; svc_alloc: size of memory - call dword [ebx + 60] ; allocate the Temp memory + call dword [ebx + 36] ; allocate the Temp memory add esp, 16 mov dword [esp + 0], eax ; [esp + 0] = Temp mov esi, dword [ebp + 12] - mov edi, dword [esi + 14] - bswap edi ; edi = initial 32 bits of the stream - ; Note: the first byte of the LZMA stream is always the zero byte (ignored) + mov edi, dword [esi + 8] ; edi = initial 32 bits of the stream + ; Note: the first byte of the LZMA stream is always the zero byte (ignored), + ; but it is stripped by the packager and does not exist here. + ; Also, the byte swap is also done by the packager. mov dword [esp + 28], edi ; [esp + 28] = initial 32 bits of the stream - add esi, 18 ; esi = Src + 18 - mov dword [esp + 8], esi ; [esp + 8] = Src + 18 + add esi, 12 ; esi = Src + 12 + mov dword [esp + 8], esi ; [esp + 8] = Src + 12 call _lzma_dec @@ -128,18 +116,12 @@ _start: push eax ; svc_free: size of memory to be freed (required by Rust) push edi ; svc_free: ptr to be freed mov ebx, dword [ebp + 8] ; ebx = PLATFORM_DATA table (since _lzma_dec clobbers ebx) - call dword [ebx + 68] ; free the Temp memory + call dword [ebx + 44] ; free the Temp memory add esp, 16 mov edx, dword [esp + 4] ; edx = (End of the decompressed data) - lea esi, [edx - 32] - lea edi, [ebx + 32] - push 24 - pop ecx - rep movsb mov ecx, dword [esp + 32] ; ecx = Dest - add ecx, dword [esi] ; add entrypoint_offset - mov byte [ecx + 1], 1 ; Change 'push 0' to 'push 1' + add ecx, dword [edx - 8] ; add entrypoint_offset mov dword [esp + 0], ebx ; the PLATFORM_DATA table call ecx ; call the entrypoint of the binary add esp, 40 diff --git a/scripts/static-pie-stub-i686.bin b/scripts/static-pie-stub-i686.bin index 4772e973..b395907c 100644 Binary files a/scripts/static-pie-stub-i686.bin and b/scripts/static-pie-stub-i686.bin differ diff --git a/scripts/static-pie-template-amd64.c b/scripts/static-pie-template-amd64.c index 9303d9ef..af91c4aa 100644 --- a/scripts/static-pie-template-amd64.c +++ b/scripts/static-pie-template-amd64.c @@ -69,9 +69,6 @@ typedef struct { uint64_t env_flags; uint64_t win_kernel32; // handle of kernel32.dll uint64_t win_GetProcAddress; // pointer to kernel32!GetProcAddress - uint64_t pe_image_base; - uint64_t pe_off_reloc; - uint64_t pe_size_reloc; void *ptr_alloc_rwx; // pointer to function void *ptr_alloc; // pointer to function void *ptr_alloc_zeroed; // pointer to function diff --git a/scripts/static-pie-template-amd64.rs b/scripts/static-pie-template-amd64.rs index c4a660a6..886f695d 100644 --- a/scripts/static-pie-template-amd64.rs +++ b/scripts/static-pie-template-amd64.rs @@ -1,7 +1,7 @@ // Generated with https://github.com/kiwiyou/basm-rs // Learn rust (https://doc.rust-lang.org/book/) and get high performance out of the box! -#![crate_type = "cdylib"] // On Windows, omit this line or pass '--crate-type=bin' to rustc to avoid DLL creation. +#![crate_type = "cdylib"] // On Windows, omit this or pass '--crate-type=bin' to rustc to avoid DLL creation. #![cfg_attr(not(windows), no_std)]#![allow(unused)]#[no_link]extern crate std as s; // SOLUTION BEGIN @@ -11,25 +11,21 @@ $$$$solution_src$$$$ // SOLUTION END // LOADER BEGIN -#[cfg(not(target_arch = "x86_64"))] compile_error!("Unsupported target architecture."); -#[cfg(not(any(windows, target_os = "linux")))] compile_error!("Unsupported target operating system."); - +#[cfg(not(all(target_arch = "x86_64", any(windows, target_os = "linux"))))] +compile_error!("Unsupported target architecture or operating system."); +macro_rules! p { () => { "stc" } } #[cfg(windows)] -macro_rules! p { () => { "lea rcx,[rip+209];call LoadLibraryA;lea rcx,[rip+GetProcAddress]" } } -#[cfg(not(windows))] -macro_rules! p { () => { "mov rcx,0" } } +macro_rules! p { () => { "call LoadLibraryA;lea rdx,[rip+GetProcAddress];lea rdi,[rip+VirtualAlloc];clc" } } static mut PAYLOAD: [u8; $$$$binary_base91_len$$$$] = *br$$$$binary_base91$$$$; #[no_mangle] unsafe fn _start() { s::arch::asm!(p!(), - ".quad 0e48348000050c853h,48d23151509148f0h,0c2ff52c2940fc085h,358d4820ec834852h,\ - 74c0854800000045h,50d0ff4f568d4806h,8948d6ff59016a5bh,48b8669748582444h,\ - 6aab489348ab66b8h,41575b56a4f3592eh,565ed3ffee894c56h,8d48585ad3ff575fh,\ - 0eb5bc9d0ff20244ch,5a511074c0854865h,3000b841c931h,0b056e0ff5941406ah,\ - 5a076ace89ff3109h,5841ff6a5a41226ah,6ac35e050fc93145h,242cac0de0c1581fh,\ - 6b242cac9299f472h,8e8c1aad0015bc0h,0e3ebf77510c4f6h;.asciz\"VirtualAlloc\\0kernel32\"", - in("r14") PAYLOAD.as_mut_ptr(), in("r13") r$$$$stub_base91$$$$.as_ptr() + ".quad 19510173000030c8h,4ce8d9f7c9h,459927e36758096ah,870d74ff8548c931h,\ + 4100003000b841cah,0b2ce8956e7ff40b1h,41ff6a5a41226a07h,0c11fb0c35e050f58h,\ + 99f572242cac0de0h,15bc06b242cac92h,10c4f608e8c1aad0h,5052535be3ebf775h,\ + 20ec834851c1ff51h,0c93197485750d3ffh,90c9d0ff585fd3ffh", + in("rcx") "KERNEL32\0".as_ptr(), in("r14") PAYLOAD.as_mut_ptr(), in("rsi") r$$$$stub_base91$$$$.as_ptr() ) } fn main() { unsafe { _start() } } diff --git a/scripts/static-pie-template-i686.c b/scripts/static-pie-template-i686.c index 17ed4739..e901a860 100644 --- a/scripts/static-pie-template-i686.c +++ b/scripts/static-pie-template-i686.c @@ -45,9 +45,6 @@ typedef struct { uint64_t env_flags; uint64_t win_kernel32; // handle of kernel32.dll uint64_t win_GetProcAddress; // pointer to kernel32!GetProcAddress - uint64_t pe_image_base; - uint64_t pe_off_reloc; - uint64_t pe_size_reloc; void *ptr_alloc_rwx; // pointer to function void *ptr_alloc; // pointer to function void *ptr_alloc_zeroed; // pointer to function diff --git a/src/bin/codegen.rs b/src/bin/codegen.rs index de0d8cb1..e8033ddd 100644 --- a/src/bin/codegen.rs +++ b/src/bin/codegen.rs @@ -57,17 +57,15 @@ unsafe extern "win64" fn _start() -> ! { // However, when called as the entrypoint by the Linux OS, // RSP will be 16-byte aligned AFTER `call` instruction. asm!( - "push 0", // rax=0 (running without loader) / rax=1 (running with loader) - "pop rax", - "push rcx", // short form of "sub rsp, 8" + "clc", // CF=0 (running without loader) / CF=1 (running with loader) "mov rbx, rcx", // Save PLATFORM_DATA table - "test eax, eax", - "jnz 1f", - "sub rsp, 88", // 16 + 88 + 8 = 112 = 16*7 -> stack alignment preserved + "jc 1f", + "sub rsp, 72", // 16 + 72 + 8 = 96 = 16*6 -> stack alignment preserved "push 3", // env_flags = 3 (ENV_FLAGS_LINUX_STYLE_CHKSTK | ENV_FLAGS_NATIVE) "push 2", // env_id = 2 (ENV_ID_LINUX) "lea rbx, [rsp]", // rbx = PLATFORM_DATA table "1:", + "push rcx", // short form of "sub rsp, 8" "lea rdi, [rip + __ehdr_start]", "lea rsi, [rip + _DYNAMIC]", "call {0}", @@ -104,31 +102,26 @@ unsafe extern "win64" fn _start() -> ! { // RSP will be 16-byte aligned BEFORE `call` instruction. // In addition, we need to provide a `shadow space` of 32 bytes. asm!( - "push 0", // rax=0 (running without loader) / rax=1 (running with loader) - "pop rax", - "push rbp", - "mov rbp, rsp", - "sub rsp, 80", // 80 = 112 - 32 (tables) + "clc", // CF=0 (running without loader) / CF=1 (running with loader) + "enter 64, 0", // 64 = 88 - 32 (tables) + 8 (alignment) "mov rbx, rcx", // save rcx as rbx is non-volatile (callee-saved) - "test eax, eax", - "jnz 1f", + "jc 1f", "call {3}", "lea rdi, [rip+{4}]", "push rdi", // GetProcAddress "push rax", // handle to kernel32 "push 2", // env_flags = 2 (ENV_FLAGS_NATIVE) "push 1", // env_id = 1 (ENV_ID_WINDOWS) - "lea rbx, [rsp]", // rbx = PLATFORM_DATA table + "mov rbx, rsp", // rbx = PLATFORM_DATA table "sub rsp, 32", "jmp 2f", "1:", - "mov rdi, QWORD PTR [rbx + 32]", // Preferred ImageBase - "lea rsi, [rip + __ImageBase]", // In-memory ImageBase - "mov rdx, QWORD PTR [rbx + 40]", // Offset of relocation table (relative to the in-memory ImageBase) - "mov rcx, QWORD PTR [rbx + 48]", // Size of relocation table (relative to the in-memory ImageBase) + "lea rdi, [rip + __ImageBase]", // In-memory ImageBase (cf. Preferred ImageBase is set to 0x0 by static-pie-pe2bin.py) + "mov esi, 0x12345678", // [replaced by static-pie-pe2bin.py] Offset of relocation table (relative to the in-memory ImageBase) + "mov edx, 0x12345678", // [replaced by static-pie-pe2bin.py] Size of relocation table (relative to the in-memory ImageBase) "call {0}", "2:", - "bt DWORD PTR [rbx + 8], 0", + "bt QWORD PTR [rbx + 8], 0", "jnc 3f", // BEGIN Linux patch // Linux ABI requires us to actually move the stack pointer @@ -143,8 +136,7 @@ unsafe extern "win64" fn _start() -> ! { "3:", "mov rcx, rbx", "call {1}", - "mov rsp, rbp", - "pop rbp", + "leave", "ret", sym loader::amd64_pe::relocate, sym _start_rust, @@ -186,11 +178,9 @@ unsafe extern "cdecl" fn _start() -> ! { // i386 System V ABI requires ESP to be aligned // on the 16-byte boundary BEFORE `call` instruction asm!( - "push 0", // eax=0 (running without loader) / eax=1 (running with loader) - "pop eax", - "test eax, eax", - "jnz 1f", - "sub esp, 76", // 76 = 68 + 12; PLATFORM_DATA pointer (4 bytes) + PLATFORM_DATA (68 (+ 16 = 84 bytes)) + alignment (8 bytes wasted) + "clc", // CF=0 (running without loader) / CF=1 (running with loader) + "jc 1f", + "sub esp, 44", // 44 = 40 + 4; PLATFORM_DATA ptr (4 bytes, pushed later) + PLATFORM_DATA (40 (+ 16 = 56 bytes)) + alignment (4 bytes wasted) "push 0", // zero upper dword "push 3", // env_flags = 3 (ENV_FLAGS_LINUX_STYLE_CHKSTK | ENV_FLAGS_NATIVE) "push 0", // zero upper dword @@ -206,11 +196,11 @@ unsafe extern "cdecl" fn _start() -> ! { "2:", "call 3f", "3:", - "pop ecx", // ecx = _start + 40 (obtained by counting the opcode size in bytes) + "pop ecx", // ecx = _start + 36 (obtained by counting the opcode size in bytes) "push edx", // [esp + 0] = PLATFORM_DATA table "call {2}", // eax = offset of _start from the image base "sub ecx, eax", - "sub ecx, 40", // ecx = the in-memory image base (i.e., __ehdr_start) + "sub ecx, 36", // ecx = the in-memory image base (i.e., __ehdr_start) "call {3}", // eax = offset of _DYNAMIC table from the image base "add eax, ecx", // eax = _DYNAMIC table "sub esp, 8", // For stack alignment diff --git a/src/platform/loader/amd64_pe.rs b/src/platform/loader/amd64_pe.rs index dd9550f8..e13c5e4c 100644 --- a/src/platform/loader/amd64_pe.rs +++ b/src/platform/loader/amd64_pe.rs @@ -17,8 +17,14 @@ const IMAGE_REL_BASED_DIR64: u16 = 10; // The base relocation applies t //} +/* This function assumes the original ImageBase is 0x0, + * which is ensured by `static-pie-pe2bin.py`. + * Note that when the executable runs natively, + * this assumption breaks; but in that case, + * the Windows PE loader handles relocation for us, + * and thus this function is not run; hence no problem. + */ pub unsafe extern "sysv64" fn relocate( - orig_image_base: u64, addr_image_base: u64, off_reloc: u64, size_reloc: u64, @@ -30,7 +36,7 @@ pub unsafe extern "sysv64" fn relocate( let size_of_block: u32 = ptr::read((off + 4) as *const u32); let end_of_block: u64 = off + size_of_block as u64; off += 8; - let reloc_delta: u64 = addr_image_base - orig_image_base; + let reloc_delta: u64 = addr_image_base; while off < end_of_block { let w_val: u16 = ptr::read(off as *const u16); off += 2; diff --git a/src/platform/services.rs b/src/platform/services.rs index 27471280..9c435c16 100644 --- a/src/platform/services.rs +++ b/src/platform/services.rs @@ -35,9 +35,6 @@ pub struct PlatformData { pub env_flags: u64, pub win_kernel32: u64, // handle of kernel32.dll pub win_GetProcAddress: u64, // pointer to kernel32::GetProcAddress - pub pe_image_base: u64, - pub pe_off_reloc: u64, - pub pe_size_reloc: u64, pub fn_table: [usize; 7], } @@ -49,11 +46,11 @@ pub fn install(platform_data_by_loader: usize) { } #[inline(always)] unsafe fn addr(fn_id: usize) -> usize { - core::ptr::read((PLATFORM_DATA + 56 + fn_id * core::mem::size_of::()) as *mut usize) + core::ptr::read((PLATFORM_DATA + 32 + fn_id * core::mem::size_of::()) as *mut usize) } #[inline(always)] pub unsafe fn install_single_service(fn_id: usize, fn_ptr: usize) { - core::ptr::write((PLATFORM_DATA + 56 + fn_id * core::mem::size_of::()) as *mut usize, fn_ptr) + core::ptr::write((PLATFORM_DATA + 32 + fn_id * core::mem::size_of::()) as *mut usize, fn_ptr) } //#[inline(always)] pub unsafe fn alloc(size: usize, align: usize) -> *mut u8 {