Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Shorten the generated code #37

Merged
merged 18 commits into from
Nov 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ fn main() {
link_args_basm.push("/NODEFAULTLIB");
link_args_basm.push("/DYNAMICBASE");
link_args_basm.push("/ENTRY:_start");
link_args_basm.push("/BASE:0x0");
link_args_basm.push("/NXCOMPAT:NO");
link_args_basm.push("/STACK:268435456");
link_args_basm.push("/EMITTOOLVERSIONINFO:NO");
Expand Down
2 changes: 1 addition & 1 deletion scripts/build-and-judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def test_equal(x, y):

# Build the project to generate the source code
try:
p = subprocess.run([build_cmd], shell=True, capture_output=True, text=True)
p = subprocess.run([build_cmd], shell=True, capture_output=True, text=True, encoding="utf8")
if p.returncode != 0:
raise Exception("Build failed. The stderr:\n{0}".format(p.stderr))
source_code = p.stdout
Expand Down
13 changes: 9 additions & 4 deletions scripts/static-pie-elf2bin.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,13 +232,18 @@ def load_elf32(elf):
memory_bin = memory_bin[pos_begin:]
entrypoint_offset -= pos_begin

# Patch the entrypoint
# We look for:
# 0: f8 clc
# and replace it with:
# 0: f9 stc
# This works for both i686 and amd64.
assert memory_bin[entrypoint_offset:entrypoint_offset+1] == b"\xf8"
memory_bin[entrypoint_offset:entrypoint_offset+1] = b"\xf9"

with open(binary_path, "wb") as f:
f.write(bytes(memory_bin))

fdict = {}
fdict['leading_unused_bytes'] = pos_begin
fdict['entrypoint_offset'] = entrypoint_offset
fdict['pe_image_base'] = 0
fdict['pe_off_reloc'] = 0
fdict['pe_size_reloc'] = 0
print(json.dumps(fdict)) # callers of this script can capture stdout to get this value
41 changes: 23 additions & 18 deletions scripts/static-pie-gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,28 +27,33 @@
compressed_binary_path = binary_path + ".lzma"
elf2bin = subprocess.check_output([sys.executable, "scripts/static-pie-elf2bin.py", elf_path, binary_path]).decode("utf-8")
loader_fdict = json.loads(elf2bin)
assert 'leading_unused_bytes' in loader_fdict
assert 'entrypoint_offset' in loader_fdict
assert 'pe_image_base' in loader_fdict
assert 'pe_off_reloc' in loader_fdict
assert 'pe_size_reloc' in loader_fdict

# Please refer to the following link for the lzma file format:
# https://svn.python.org/projects/external/xz-5.0.3/doc/lzma-file-format.txt
# However, we use a different format:
# [ 0, 1) = (1 << pb) - 1
# [ 1, 2) = (1 << lp) - 1
# [ 2, 3) = lc
# [ 3, 4) = lp + lc + 8
# [ 4, 8) = Uncompressed size
# [ 8, ..) = Compressed data without the leading byte
with open(binary_path, "rb") as f:
memory_bin = f.read()
# Embed this information into the LZMA file to reduce the generated code length
x = loader_fdict['pe_image_base'].to_bytes(8, byteorder='little') + \
loader_fdict['pe_off_reloc'].to_bytes(8, byteorder='little') + \
loader_fdict['pe_size_reloc'].to_bytes(8, byteorder='little') + \
loader_fdict['entrypoint_offset'].to_bytes(8, byteorder='little')
x = loader_fdict['entrypoint_offset'].to_bytes(8, byteorder='little')
memory_bin += x
lzma_filter = {'id': lzma.FILTER_LZMA1, 'preset': lzma.PRESET_EXTREME, 'lp': 0, 'lc': 0, 'pb': 2, 'dict_size': 1 << 22}
compressed_memory_bin = lzma.compress(memory_bin, format=lzma.FORMAT_RAW, filters=[lzma_filter])
lzma_header_properties = ((lzma_filter['pb'] * 5 + lzma_filter['lp']) * 9 + lzma_filter['lc']).to_bytes(1, byteorder='little')
lzma_header_dictionary_size = lzma_filter['dict_size'].to_bytes(4, byteorder='little')
lzma_header_uncompressed_size = len(memory_bin).to_bytes(8, byteorder='little')
compressed_memory_bin = lzma_header_properties + lzma_header_dictionary_size + lzma_header_uncompressed_size + bytes(compressed_memory_bin)
lzma_filter = {'id': lzma.FILTER_LZMA1, 'preset': lzma.PRESET_EXTREME, 'lp': 0, 'lc': 0, 'pb': 0, 'dict_size': 1 << 22, 'depth': 200}
compressed_memory_bin = bytearray(lzma.compress(memory_bin, format=lzma.FORMAT_RAW, filters=[lzma_filter]))
while len(compressed_memory_bin) < 4:
compressed_memory_bin += b'\x00' # append zeros for byte order swap (this won't happen in almost all cases, though)
compressed_memory_bin = compressed_memory_bin[1:] # strip the (redundant) leading zero byte of the LZMA stream
compressed_memory_bin[:4] = reversed(compressed_memory_bin[:4]) # perform byte order swap in advance

# Build the custom 8-byte header that the loader stub expects (this is NOT the
# standard .lzma header):
#   byte 0     = (1 << pb) - 1
#   byte 1     = (1 << lp) - 1
#   byte 2     = lc
#   byte 3     = lp + lc + 8
#   bytes 4..7 = uncompressed size, little-endian
pb, lp, lc = lzma_filter['pb'], lzma_filter['lp'], lzma_filter['lc']
# NOTE: in Python `+` binds tighter than `<<`, so the lp mask must be shifted
# inside its own parentheses; otherwise the pb mask is shifted into byte 1 too
# and byte 0 is always zero (harmless only while pb == lp == 0).
lzma_header_properties = (
    ((1 << pb) - 1)
    + (((1 << lp) - 1) << 8)
    + (lc << 16)
    + ((lp + lc + 8) << 24)
).to_bytes(4, byteorder='little')
lzma_header_uncompressed_size = len(memory_bin).to_bytes(4, byteorder='little')
compressed_memory_bin = lzma_header_properties + lzma_header_uncompressed_size + bytes(compressed_memory_bin)
with open(compressed_binary_path, "wb") as f:
f.write(compressed_memory_bin)

Expand Down Expand Up @@ -90,6 +95,10 @@
# stub
with open(stub_path, "rb") as f:
stub = f.read()
if lang_name == "Rust" and "x86_64" in target_name:
with open(stub_path.replace("stub-amd64", "prestub-amd64-2"), "rb") as f:
prestub2 = f.read()
stub = prestub2 + stub

stub_b91 = base91.encode(stub).decode('ascii')
stub_b91_len = len(stub_b91)
Expand Down Expand Up @@ -129,10 +138,6 @@ def multiple_replace(string, rep_dict):
"$$$$binary_base91$$$$": code_b91,
"$$$$binary_base91_len$$$$": str(code_b91_len),
"$$$$min_len_4096$$$$": str(min(len(code_b85)+1, 4096)),
"$$$$leading_unused_bytes$$$$": str(loader_fdict['leading_unused_bytes']),
"$$$$entrypoint_offset$$$$": str(loader_fdict['entrypoint_offset']),
"$$$$pe_image_base$$$$": str(loader_fdict['pe_image_base']),
"$$$$pe_off_reloc$$$$": str(loader_fdict['pe_off_reloc']),
"$$$$pe_size_reloc$$$$": str(loader_fdict['pe_size_reloc']),
})
print(out)
40 changes: 34 additions & 6 deletions scripts/static-pie-pe2bin.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,13 @@
print(f"Usage: {sys.argv[0]} pe_path binary_path", file=sys.stderr)
sys.exit(1)

'''
We relocate the PE image to the base address 0 (ImageBase=0),
regardless of the original ImageBase (which is usually 0x1_4000_0000)
This simplifies the loader code.
'''
pe = pefile.PE(pe_path)
memory_bin = bytearray(pe.get_memory_mapped_image())
memory_bin = bytearray(pe.get_memory_mapped_image(ImageBase=0))
needed = bytearray(len(memory_bin))
pos_begin = len(memory_bin)
pos_end = 0
Expand All @@ -40,16 +45,39 @@
if needed[i] == 0:
memory_bin[i] = 0
memory_bin = memory_bin[pos_begin:pos_end]
entrypoint_offset = pe.OPTIONAL_HEADER.AddressOfEntryPoint - pos_begin
if reloc_sz > 0:
reloc_off = len(memory_bin)
memory_bin += reloc_bin
reloc_off = 0 if reloc_sz == 0 else pos_begin + reloc_off

# Patch the entrypoint
# We look for:
# 0: f8 clc
# and replace it with:
# 0: f9 stc
# This works for both i686 and amd64.
assert memory_bin[entrypoint_offset:entrypoint_offset+1] == b"\xf8"
memory_bin[entrypoint_offset:entrypoint_offset+1] = b"\xf9"

# Patch the relocation offset and size (which is in _start)
# We look for:
# 0: be 78 56 34 12 mov esi,0x12345678 <- replaced with reloc_off
# 5: ba 78 56 34 12 mov edx,0x12345678 <- replaced with reloc_sz
template = b"\xbe\x78\x56\x34\x12\xba\x78\x56\x34\x12"
reloc_patched = False
for i in range(entrypoint_offset, len(memory_bin) - len(template)):
if memory_bin[i:i+len(template)] == template:
memory_bin[i+1:i+5] = reloc_off.to_bytes(4, byteorder='little')
memory_bin[i+6:i+10] = reloc_sz.to_bytes(4, byteorder='little')
reloc_patched = True
break
assert reloc_patched, "Failed to incorporate the relocation information into the binary. Please report this error."

# Write to file
with open(binary_path, "wb") as f:
f.write(bytes(memory_bin))

fdict = {}
fdict['leading_unused_bytes'] = pos_begin
fdict['entrypoint_offset'] = pe.OPTIONAL_HEADER.AddressOfEntryPoint - pos_begin
fdict['pe_image_base'] = pe.OPTIONAL_HEADER.ImageBase
fdict['pe_off_reloc'] = 0 if reloc_sz == 0 else pos_begin + reloc_off
fdict['pe_size_reloc'] = reloc_sz
fdict['entrypoint_offset'] = entrypoint_offset
print(json.dumps(fdict)) # callers of this script can capture stdout to get this value
24 changes: 24 additions & 0 deletions scripts/static-pie-prestub-amd64-2.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
; -*- tab-width: 4 -*-
;
; The prestub 2 for amd64-rust target
; (prestub 2: the code that runs after the prestub but before the stub and sets the stage)
;
; build: nasm -f bin -O9 static-pie-prestub-amd64-2.asm -o static-pie-prestub-amd64-2.bin

BITS 64
ORG 0
section .text

; Decode binary (rsi -> rdi)
push rdi
push r14
pop rsi ; rsi = BINARY_BASE91
push rsi
pop rdi ; rdi = BINARY_BASE91 (in-place decoding)
push rdi
call rbx

; Prepare for stub
pop rdx ; rdx = LZMA-compressed binary
pop rdi
lea rcx, qword [rsp+40] ; rcx = PLATFORM_DATA table
1 change: 1 addition & 0 deletions scripts/static-pie-prestub-amd64-2.bin
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
WAV^V_WÿÓZ_HL$(
3 changes: 2 additions & 1 deletion scripts/static-pie-prestub-amd64-print.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ def to_hex_short(y):
# convert the table part
table_part = table_part.decode('ascii')
table_part = table_part.replace('{', '{{').replace('}', '}}').replace('$', '\\\\x24').replace('\0','\\\\0')
out.append(" \"{0}\\\"{1}\\\"\",\n".format(".asciz" if asciz else ".ascii", table_part))
if asciz or len(table_part) > 0:
out.append(" \"{0}\\\"{1}\\\"\",\n".format(".asciz" if asciz else ".ascii", table_part))

# print the result
print("".join(out))
138 changes: 49 additions & 89 deletions scripts/static-pie-prestub-amd64.asm
Original file line number Diff line number Diff line change
Expand Up @@ -12,100 +12,40 @@ ORG 0
section .text

; Align stack to 16 byte boundary
; [rsp+ 32, rsp+144): PLATFORM_DATA
; [rsp+ 32, rsp+120): PLATFORM_DATA
; [rsp+ 0, rsp+ 32): (shadow space for win64 calling convention)
push rbx
enter 80, 0
and rsp, 0xFFFFFFFFFFFFFFF0

; PLATFORM_DATA
xchg rax, rcx
push rax ; PLATFORM_DATA[24..31] = win_GetProcAddress
push rcx ; PLATFORM_DATA[16..23] = win_kernel32
xor edx, edx
test rax, rax
sete dl ; Enable ENV_FLAGS_LINUX_STYLE_CHKSTK outside Windows
push rdx ; PLATFORM_DATA[ 8..15] = env_flags (0=None, 1=ENV_FLAGS_LINUX_STYLE_CHKSTK)
inc edx
push rdx ; PLATFORM_DATA[ 0.. 7] = env_id (1=Windows, 2=Linux)
sub rsp, 32 ; shadow space

; Allocate memory for stub
lea rsi, [rel _svc_alloc_rwx] ; Register svc_alloc_rwx
test rax, rax
jz _u
lea rdx, [rsi + _VirtualAlloc - _svc_alloc_rwx]
call rax ; after the call, rax = pointer to VirtualAlloc
_u:
push rax
pop rbx ; rbx = pointer to VirtualAlloc
push 1
pop rcx ; rcx = 1 -> will be rounded up to the nearest page size, which is 0x1000 (4K)
call rsi ; svc_alloc_rwx

; Copy svc_alloc_rwx to the new buffer
; Current state: rax = new buffer, rbx = pointer to VirtualAlloc, rsi = svc_alloc_rwx
mov qword [rsp+56+32], rax ; PLATFORM_DATA[56..63] = ptr_alloc_rwx (on the new buffer)
xchg rax, rdi ; rdi = new buffer
mov ax, 0xB848 ; mov rax, STRICT QWORD imm64
stosw
xchg rax, rbx ; rax = pointer to VirtualAlloc
stosq
push _svc_alloc_rwx_end - _svc_alloc_rwx
pop rcx
rep movsb ; this progresses rsi to _decode
push rsi
pop rbx ; rbx = _decode
push rdi
push r14

; Decode stub (rsi -> rdi)
; Current state: rdi = stub memory
mov rsi, r13 ; rsi = STUB_BASE91
call rbx

; Decode binary (rsi -> rdi)
pop rsi ; rsi = BINARY_BASE91
push rsi
pop rdi ; rdi = BINARY_BASE91 (in-place decoding)
push rdi
call rbx

; Call stub
pop rdx ; rdx = LZMA-compressed binary
pop rax ; rax = stub entrypoint
lea rcx, qword [rsp+32] ; rcx = PLATFORM_DATA table
call rax
leave
pop rbx
jmp _end_of_everything
enter 48, 0
jnc _s
push rcx ; Linux: align stack on 16-byte boundary
_s: sbb ecx, ecx
neg ecx ; Enable ENV_FLAGS_LINUX_STYLE_CHKSTK outside Windows
call _t

; svc_alloc_rwx for Windows and Linux
; rcx = size
; rax = pointer to VirtualAlloc (must be supplied before prepending the mov instruction)
; rdi = pointer to VirtualAlloc (must be supplied before prepending the mov instruction)
_svc_alloc_rwx:
test rax, rax
push 9
pop rax ; syscall id of x64 mmap
jecxz _decode
cdq ; rdx=0
xor r9d, r9d ; offset
test rdi, rdi
jz _svc_alloc_rwx_linux
_svc_alloc_rwx_windows:
push rcx
pop rdx ; size
xor ecx, ecx
xchg ecx, edx ; rcx=0 / rdx=tsize
mov r8d, 0x3000 ; MEM_COMMIT | MEM_RESERVE
push 0x40
pop r9 ; PAGE_EXECUTE_READWRITE
jmp rax ; kernel32!VirtualAlloc
mov r9b, 0x40 ; PAGE_EXECUTE_READWRITE
jmp rdi ; kernel32!VirtualAlloc
_svc_alloc_rwx_linux:
push rsi ; save rsi
mov al, 9 ; syscall id of x64 mmap (safe since we have ensured rax=0)
xor edi, edi
; xor edi, edi ; rdi=0 (already ensured)
mov esi, ecx ; size
push 7
pop rdx ; protect
mov dl, 7 ; protect (safe since we have ensured rdx=0)
push 0x22
pop r10 ; flags
push -1
pop r8 ; fd
xor r9d, r9d ; offset
syscall
pop rsi ; restore rsi
_ret:
Expand All @@ -114,8 +54,7 @@ _svc_alloc_rwx_end:

; Base91 decoder
_decode:
push 0x1f
pop rax
mov al, 0x1f
_decode_loop:
shl eax, 13
lodsb
Expand All @@ -134,12 +73,33 @@ _decode_output:
jnz _decode_output
jmp _decode_loop

align 8, db 0
; PLATFORM_DATA
_t: ; PLATFORM_DATA[32..39] = ptr_alloc_rwx
pop rbx
push rbx
push rdx ; PLATFORM_DATA[24..31] = win_GetProcAddress
push rax ; PLATFORM_DATA[16..23] = win_kernel32
push rcx ; PLATFORM_DATA[ 8..15] = env_flags (0=None, 1=ENV_FLAGS_LINUX_STYLE_CHKSTK)
inc ecx
push rcx ; PLATFORM_DATA[ 0.. 7] = env_id (1=Windows, 2=Linux)
sub rsp, 32 ; shadow space
call rbx ; svc_alloc_rwx

; Current state: rax = new buffer, rdi = pointer to VirtualAlloc
push rax
push rdi
xchg rax, rdi ; rdi = new buffer

; Decode stub (rsi -> rdi)
; Current state: rdi = stub memory (by the previous instruction)
; rsi = STUB_BASE91 (by the Rust template)
xor ecx, ecx
call rbx

; Call stub (it will perform the below operations)
pop rdi ; rdi = pointer to VirtualAlloc
pop rax
call rax ; This will jump to the start of the new buffer (stub)
leave

_VirtualAlloc:
db "VirtualAlloc"
db 0
_kernel32:
db "kernel32"
db 0
_end_of_everything:
align 8, db 0x90 ; nop
Binary file modified scripts/static-pie-prestub-amd64.bin
Binary file not shown.
Loading
Loading