mcfx's blog


TCTF 2023 Writeup


In this challenge, we are given a huge binary, and the first step is to reverse engineer it.

But unfortunately, the stack frame is too big, and IDA can't decompile it. Some of my teammates started looking on the assembly, and I found that Binary Ninja demo version can decompile it, and we started respectively.

After some time, I found some QEMU strings, and that's why the binary is so huge. But I didn't figure out how the binary is interacted with QEMU. Finally Riatre figured out some unicorn functions, and we successfully get the details of the binary.

(Here should be an image of the reverse engineering result, but I'm too lazy to find it now)

We need to write some shellcode satisfying the following requirements:

And the 5 levels have their own requirements:

A program can read something at the end of its code, and construct the last block, then it can calculate the hash. The hash state of the previous blocks should be stored at the last block. Since the size of one SHA256 block is 64 while the size of result is 32, it's totally achievable. This solves level 3.

In level 2, we can use a similar approach, but this time we can only execute some code at the last block. That code will load some hash state into register/stack, and then the main program will handle them.

If we bind the two restrictions together, it seems impossible. But Riatre found that our code is mapped rwx, then we can construct such an approach:

AddressCode usage
0x1000000Write some code to END+1.
END-50 (%64==0)Load the state before last block into r8-r15.
END-2jmp 1
ENDWhen the program gets here, the simulator will stop it.
END+1Actual code to compute hash of last block (Extracted by the initial one)
END+njmp END
from pwn import *
# with modifications
import sha256
import hashlib
import base64

context.arch = 'amd64'

clang sha256.c -o sha256 -Os


typedef unsigned char BYTE;             // 8-bit byte
typedef unsigned int  WORD;             // 32-bit word, change to "long" for 16-bit machines

#define ROTLEFT(a,b) (((a) << (b)) | ((a) >> (32-(b))))
#define ROTRIGHT(a,b) (((a) >> (b)) | ((a) << (32-(b))))

#define CH(x,y,z) (((x) & (y)) ^ (~(x) & (z)))
#define MAJ(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#define EP0(x) (ROTRIGHT(x,2) ^ ROTRIGHT(x,13) ^ ROTRIGHT(x,22))
#define EP1(x) (ROTRIGHT(x,6) ^ ROTRIGHT(x,11) ^ ROTRIGHT(x,25))
#define SIG0(x) (ROTRIGHT(x,7) ^ ROTRIGHT(x,18) ^ ((x) >> 3))
#define SIG1(x) (ROTRIGHT(x,17) ^ ROTRIGHT(x,19) ^ ((x) >> 10))

static const WORD k[64] = {

__attribute__((noinline))void sha256_transform_state(WORD state[8], WORD m[64])
    WORD a, b, c, d, e, f, g, h, i, t1, t2;

#pragma clang loop unroll(full)
    for ( i=16 ; i < 64; ++i)
        m[i] = SIG1(m[i - 2]) + m[i - 7] + SIG0(m[i - 15]) + m[i - 16];

    a = state[0];
    b = state[1];
    c = state[2];
    d = state[3];
    e = state[4];
    f = state[5];
    g = state[6];
    h = state[7];

#pragma clang loop unroll(full)
    for (i = 0; i < 64; ++i) {
        t1 = h + EP1(e) + CH(e,f,g) + k[i] + m[i];
        t2 = EP0(a) + MAJ(a,b,c);
        h = g;
        g = f;
        f = e;
        e = d + t1;
        d = c;
        c = b;
        b = a;
        a = t1 + t2;

    state[0] += a;
    state[1] += b;
    state[2] += c;
    state[3] += d;
    state[4] += e;
    state[5] += f;
    state[6] += g;
    state[7] += h;

void _start(){}

elf = ELF('sha256')
func = elf.functions['sha256_transform_state']
func_body =, func.size)
print(len(func_body), type(func_body))
func_body = func_body[:-1]

code = []
mov rsp,0x2000ff0;
mov rbx,0x2000000;
mov rcx,0x2000020;
mov rax,0x2000300;
mov rdi,0x2000400;

for i in range(8):
mov [rax+{i*4}],r{i+8}d

for i in range(0, 8, 2):
mov esi,r{i+8}d
shl rsi,16
add esi,0xb{hex(8+i)[2:]}41
mov [rdi+{i*6}],esi
mov esi,r{i+8}d
shr rsi,16
add esi,0xb{hex(9+i)[2:]}410000
mov [rdi+{i*6+4}],esi
mov [rdi+{i*6+8}],r{i+9}d
mov esi,0x8001eb
mov [rdi+48],esi
xor esi,esi
mov [rdi+52],esi
mov [rdi+56],esi
mov rsi,0x90010300
mov [rdi+60],esi

prefix_len = 0x6000
tot_len = 0x6000 + 0x32
# tot_len*8==0x30190

for i in range(8):
mov edx,[rax+{i*4}]
mov [rbx+{i*4}],edx
for i in range(16):
mov edx,[rdi+{i*4}]
bswap edx
mov [rcx+{i*4}],edx
mov rdi,rbx
mov rsi,rcx
# here the main func
code2 = []
code2.append('mov rbx,0x2000000;')
for i in range(8):
mov edx,[rbx+{i*4}]
bswap edx
mov [rbx+{i*4}],edx
mov rax, 0
push rax

ac = asm('\n'.join(code)) + func_body + asm('\n'.join(code2))
while len(ac) % 8:
    ac += b'\0'

code = [f'''
mov rbx,{hex(0x1000000+tot_len+1)}

for i in range(0, len(ac), 8):
    v = int.from_bytes(ac[i:i + 8], 'little')
mov rax,{v}
mov [rbx],rax
add rbx,8

# code.append('mov rcx,0x2000000;mov [rcx],rbx')

ac2 = asm('\n'.join(code))
assert len(ac2) < prefix_len
ac2 = ac2.ljust(prefix_len, b'\x90')

state = sha256.init_state()
for i in range(0, len(ac2), 64):
    state = sha256.compute_round(state, ac2[i:i + 64])
for i in range(8):
    ac2 += bytes([0x41, 0xb8 + i]) + state[i].to_bytes(4, 'little')
ac2 += b'\xeb\x01'
state = sha256.compute_round(state, ac2[prefix_len:] + b'\x80' + b'\0' * 10 + b'\x03\x01\x90')
open('f4_code.bin', 'wb').write(ac2)
# open('f4_code.txt', 'w').write(disasm(ac2))

open('in.txt', 'wb').write(b'4\n' + base64.b64encode(ac2))

Economical Sort

We need to write some shellcode in x86 (32 bit) to sort an array, while

With 1-byte operators, we can only:

It easy to see that we can compare values and save the result to registers. However, how to branch based on the conditions? If we can add 0x1000000 to register, we can do something like:

; eax=comparison result
add eax, 0x1000000
add eax, 0x1000000
push eax

It stores a jump table inside the code, and loads the jump table by the comparison result.

But how to add 0x1000000 to a register? That's x86 magic - unaligned load/store. Consider the following code:

push eax
inc esp
inc esp
pop eax
inc eax
push eax
dec esp
dec esp
pop eax

It increases eax by 0x1000000.

With this sort of code, we can finally branch, and then it's almost done. The rest is only heavy implementing/debugging work.

from pwn import *

# stack layout
DATA = 0
CODE_100 = 1
CNT = 2
LEN = 3

# useful variables

LOOP1_I = 'ecx'
TMP1 = 'edx'
TMP2 = 'ebp'

LOOP2_I = 'ecx'
TMP3 = 'edx'
RES_PTR = 'ebp'

# temp regs
TEMP_REGS = ['eax', 'ebx', 'edi', 'esi']

def push(x):
    return ['push ' + x]

def pop(x):
    return ['pop ' + x]

def mov(dst_reg, src_reg):
    if dst_reg == src_reg:
        return []
    return [

def loadmemoffset(arr_reg, index_reg, value_reg):
    return [
        *mov('ebx', arr_reg),
        *mov('eax', index_reg),
        *mov(value_reg, 'eax'),

def loadmem(src_ptr_reg, value_reg):
    return [
        *mov('esi', src_ptr_reg),
        *mov(value_reg, 'eax'),

def storemem(dst_ptr_reg, value_reg):
    return [
        *mov('edi', dst_ptr_reg),
        *mov('eax', value_reg),

def addconst(reg, x):
    if x >= 0:
        return ['inc ' + reg] * x
    return ['dec ' + reg] * (-x)

def incstackval(n=1):
    return [
        *addconst('eax', n),

def add_code(reg, add_0x100):
    return [
        *addconst('esp', -1),
        *addconst('esp', -2),
        *(incstackval() if add_0x100 else []),
        *addconst('esp', -1),

def add_100(reg, n=1):
    return [
        *addconst('esp', 1),
        *addconst('esp', -1),

def movstack(reg, stack_offset, tr=None):
    if tr is None:
        tr = TEMP_REGS[:stack_offset + 1]
        if reg in tr:
            for i in range(len(tr)):
                if tr[i] == reg:
                    tr[i], tr[-1] = tr[-1], tr[i]
    res = []
    for x in tr:
    for x in tr[::-1]:
    res.extend(mov(reg, tr[stack_offset]))
    return res

def gen_if_(reg, offset):
    return [
        *movstack('ebx', CODE_100, tr=['eax', 'ebx']),
        *mov('edi', 'esp'),
        # *mov('eax', 'ebx'),
        '.byte 0xd6',
        *addconst('eax', offset),
        # *add_code('eax', False),

        *addconst('esp', 1),
        *addconst('ebx', -1),
        *addconst('esp', -1),
        *addconst('esp', 1),
        *addconst('esp', -1),

        # *add_100('eax', -1),

last_labels = []

def gen_if(reg, le0_label, else_label):
    p = len(last_labels)
    return gen_if_(reg, p + 1)

def goto(label, is_last):
    assert label in last_labels
    p = 0
    while last_labels[p] != label:
        p += 1
    return [
        *movstack('ebx', CODE_100),
        *mov('eax', 'ebx'),
        *addconst('eax', p),
        # *add_code('eax', is_last),
        *([]if is_last else add_100('eax', -1)),


code = [
    *push('esi'),  # LEN
    *mov('eax', 'edi'),
    *push('eax'),  # CNT (=DATA_100)
    *add_code('ebx', True),
    *push('ebx'),  # CODE_100
    *push('edi'),  # DATA

    *mov(LOOP1_I, 'esi'),
    *addconst(LOOP1_I, -1),
    *movstack(TMP2, CNT, tr=['ebx', 'eax', 'edi']),
    *loadmemoffset('ebx', LOOP1_I, TMP1),

    *addconst(TMP2, -1),
    *addconst(TMP1, 1),

    *addconst(TMP2, 1),
    *addconst(TMP1, -1),
    *gen_if(TMP1, 'loop2_end', 'loop2'),

    *loadmem(TMP2, 'eax'),
    *addconst('eax', 1),
    *storemem(TMP2, 'eax'),

    *gen_if(LOOP1_I, 'loop1_end', 'loop1'),

    # set RES_PTR = DATA + LEN
    *movstack(RES_PTR, LEN),
    *addconst('esp', 1),
    *addconst('esp', 1),
    *addconst('esp', -1),
    *addconst('esp', -1),

    *addconst(LOOP2_I, -1),
    *movstack('ebx', CNT),
    *loadmemoffset('ebx', LOOP2_I, TMP3),

    *gen_if(TMP3, 'loop4_end', 'loop4'),
    *addconst(RES_PTR, -1),
    *storemem(RES_PTR, LOOP2_I),
    *addconst(TMP3, -1),
    *goto('loop4_start', False),

    *gen_if(RES_PTR, 'loop3_end', 'loop3'),
    *goto('final', True),

# print(code)
rc1 = asm('\n'.join(code))
assert len(rc1) < 256
for i in range(256 - len(rc1)):

for lb in last_labels:
    if lb == 'final':
        code.append(f'.byte {lb}-start-256')
        code.append(f'.byte {lb}-start')

rc = asm('\n'.join(code))
open('code.bin', 'wb').write(rc)
open('code.txt', 'w').write(disasm(rc))

日期: 2024-04-12

标签: CTF Writeup