mcfx's blog

题解、Writeup、游记和碎碎念

TCTF 2023 Writeup

HashMaster

In this challenge, we are given a huge binary, and the first step is to reverse engineer it.

But unfortunately, the stack frame is too big, and IDA can't decompile it. Some of my teammates started looking at the assembly, while I found that the Binary Ninja demo version can decompile it, so we each continued with our own approach.

After some time, I found some QEMU strings, which explains why the binary is so huge. But I couldn't figure out how the binary interacts with QEMU. Finally, Riatre identified some Unicorn functions, and we successfully recovered the details of the binary.

(Here should be an image of the reverse engineering result, but I'm too lazy to find it now)

We need to write some shellcode satisfying the following requirements:

The code is mapped at 0x1000000, and there is some writable memory at 0x2000000.
The code should calculate the SHA256 of itself, and put the result at 0x2000000.

And the 5 levels have their own requirements:

In level 0, we should output 0xffff....ff as the result.
In level 3 and 4, the rip should only increase. (i.e. the code can't have loops)
In level 2 and 4, the code can only read memory at [0x2000000, 0x2001000). (The code can't read itself)

A program can read something at the end of its code, and construct the last block, then it can calculate the hash. The hash state of the previous blocks should be stored at the last block. Since the size of one SHA256 block is 64 while the size of result is 32, it's totally achievable. This solves level 3.

In level 2, we can use a similar approach, but this time we can only execute some code at the last block. That code will load some hash state into register/stack, and then the main program will handle them.

If we combine the two restrictions, it seems impossible. But Riatre found that our code is mapped rwx, so we can construct the following approach:

Address	Code usage
0x1000000	Write some code to END+1.
END-50 (%64==0)	Load the state before last block into r8-r15.
END-2	jmp 1
END	When the program gets here, the simulator will stop it.
END+1	Actual code to compute hash of last block (Extracted by the initial one)
END+n	jmp END

from pwn import *
# https://github.com/keanemind/python-sha-256/blob/master/sha256.py with modifications
import sha256
import hashlib
import base64

context.arch = 'amd64'

'''
clang sha256.c -o sha256 -Os

sha256.c:

typedef unsigned char BYTE;             // 8-bit byte
typedef unsigned int  WORD;             // 32-bit word, change to "long" for 16-bit machines


#define ROTLEFT(a,b) (((a) << (b)) | ((a) >> (32-(b))))
#define ROTRIGHT(a,b) (((a) >> (b)) | ((a) << (32-(b))))

#define CH(x,y,z) (((x) & (y)) ^ (~(x) & (z)))
#define MAJ(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#define EP0(x) (ROTRIGHT(x,2) ^ ROTRIGHT(x,13) ^ ROTRIGHT(x,22))
#define EP1(x) (ROTRIGHT(x,6) ^ ROTRIGHT(x,11) ^ ROTRIGHT(x,25))
#define SIG0(x) (ROTRIGHT(x,7) ^ ROTRIGHT(x,18) ^ ((x) >> 3))
#define SIG1(x) (ROTRIGHT(x,17) ^ ROTRIGHT(x,19) ^ ((x) >> 10))

static const WORD k[64] = {
    0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
    0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
    0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
    0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
    0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
    0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
    0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
    0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
};

__attribute__((noinline))void sha256_transform_state(WORD state[8], WORD m[64])
{
    WORD a, b, c, d, e, f, g, h, i, t1, t2;

#pragma clang loop unroll(full)
    for ( i=16 ; i < 64; ++i)
        m[i] = SIG1(m[i - 2]) + m[i - 7] + SIG0(m[i - 15]) + m[i - 16];

    a = state[0];
    b = state[1];
    c = state[2];
    d = state[3];
    e = state[4];
    f = state[5];
    g = state[6];
    h = state[7];

#pragma clang loop unroll(full)
    for (i = 0; i < 64; ++i) {
        t1 = h + EP1(e) + CH(e,f,g) + k[i] + m[i];
        t2 = EP0(a) + MAJ(a,b,c);
        h = g;
        g = f;
        f = e;
        e = d + t1;
        d = c;
        c = b;
        b = a;
        a = t1 + t2;
    }

    state[0] += a;
    state[1] += b;
    state[2] += c;
    state[3] += d;
    state[4] += e;
    state[5] += f;
    state[6] += g;
    state[7] += h;
}

void _start(){}
'''

# Open the helper binary built from the C source above and copy out the raw
# machine code of the SHA-256 compression routine.
elf = ELF('sha256')
func = elf.functions['sha256_transform_state']
raw_body = elf.read(func.address, func.size)
print(len(raw_body), type(raw_body))
# Drop the routine's last byte (presumably its trailing `ret` -- verify against
# the compiled binary) so execution falls through into whatever is appended
# after the routine.
func_body = raw_body[:-1]

# Assembly for the prologue that runs immediately before the embedded SHA-256
# compression routine. On entry r8d..r15d are expected to hold the hash state
# computed over everything before the final 64-byte block (they are loaded by
# the `mov r{8+i}d, imm32` instructions generated further down in this script).
code = []
# Scratch layout inside the writable region at 0x2000000:
#   rsp = 0x2000ff0  stack
#   rbx = 0x2000000  state / final digest
#   rcx = 0x2000020  message-word buffer handed to the routine
#   rax = 0x2000300  saved copy of the incoming state
#   rdi = 0x2000400  reconstructed raw bytes of the final block
code.append(
    '''
mov rsp,0x2000ff0;
mov rbx,0x2000000;
mov rcx,0x2000020;
mov rax,0x2000300;
mov rdi,0x2000400;
''')

# Save the eight incoming state words r8d..r15d to [rax].
for i in range(8):
    code.append(
        f'''
mov [rax+{i*4}],r{i+8}d
''')

# Rebuild, from register contents only, the bytes of the eight 6-byte
# `41 b8+i imm32` instructions that make up the start of the final block.
# Two instructions (12 bytes) are emitted per iteration as three dwords:
#   dword 0: 41 b(8+i) | low 16 bits of r{i+8}d
#   dword 1: high 16 bits of r{i+8}d | 41 b(9+i)
#   dword 2: r{i+9}d
for i in range(0, 8, 2):
    code.append(
        f'''
mov esi,r{i+8}d
shl rsi,16
add esi,0xb{hex(8+i)[2:]}41
mov [rdi+{i*6}],esi
mov esi,r{i+8}d
shr rsi,16
add esi,0xb{hex(9+i)[2:]}410000
mov [rdi+{i*6+4}],esi
mov [rdi+{i*6+8}],r{i+9}d
''')
# Remaining 16 bytes of the final block: `eb 01`, the 0x80 padding marker,
# zero fill, and the bytes 03 01 90 at the very end (the low bytes of the
# big-endian bit length 0x30190 noted below).
code.append(
    '''
mov esi,0x8001eb
mov [rdi+48],esi
xor esi,esi
mov [rdi+52],esi
mov [rdi+56],esi
mov rsi,0x90010300
mov [rdi+60],esi
''')


# Size of the NOP-padded first stage, and of the whole submission (prefix plus
# the 0x32-byte tail of register loads and the short jump).
prefix_len = 0x6000
tot_len = 0x6000 + 0x32
# tot_len*8==0x30190

# Copy the saved state from [rax] into the state buffer at [rbx].
for i in range(8):
    code.append(
        f'''
mov edx,[rax+{i*4}]
mov [rbx+{i*4}],edx
'''
    )
# Byte-swap each of the 16 dwords of the reconstructed block into the message
# buffer at [rcx].
for i in range(16):
    code.append(
        f'''
mov edx,[rdi+{i*4}]
bswap edx
mov [rcx+{i*4}],edx
'''
    )
# Argument registers for sha256_transform_state(state, m): rdi = state buffer,
# rsi = message buffer.
code.append(
    '''
mov rdi,rbx
mov rsi,rcx
''')
# The raw bytes of the compression routine are concatenated right after this
# prologue when the blob is assembled.
# Epilogue executed right after the embedded compression routine: byte-swap
# each of the eight 32-bit state words at 0x2000000 in place, then transfer
# control to address 0 through push/ret.
bswap_words = [
    f'''
mov edx,[rbx+{i*4}]
bswap edx
mov [rbx+{i*4}],edx
'''
    for i in range(8)
]
code2 = [
    'mov rbx,0x2000000;',
    *bswap_words,
    '''
mov rax, 0
push rax
ret
''',
]

# Second-stage blob = assembled prologue + raw routine bytes + assembled epilogue.
ac = asm('\n'.join(code)) + func_body + asm('\n'.join(code2))
print(len(ac))
# Zero-pad to a multiple of 8 so the blob can be emitted as whole qwords.
ac += b'\0' * (-len(ac) % 8)

# First stage: a straight-line stub that stores the second-stage blob, one
# qword at a time, starting at the address one byte past the end of the
# submitted code (0x1000000 + tot_len + 1).
dest = hex(0x1000000 + tot_len + 1)
code = [f'''
mov rbx,{dest}
''']
qwords = [
    int.from_bytes(ac[off:off + 8], 'little')
    for off in range(0, len(ac), 8)
]
code.extend(
    f'''
mov rax,{q}
mov [rbx],rax
add rbx,8
'''
    for q in qwords
)

stub = asm('\n'.join(code))
print(len(stub))
# The stub has to fit inside the fixed-size prefix.
assert len(stub) < prefix_len
# Fill the rest of the prefix with NOPs (0x90) so execution slides into the
# bytes appended after it.
ac2 = stub.ljust(prefix_len, b'\x90')

# Replay the hash over the fixed-size prefix, 64 bytes at a time, to obtain the
# chaining state that the final block has to load into r8d..r15d.
# NOTE(review): init_state/compute_round come from the locally modified sha256
# module; their exact contract is not visible here.
state = sha256.init_state()
for i in range(0, len(ac2), 64):
    state = sha256.compute_round(state, ac2[i:i + 64])
print(hex(state[0]))
# Final block: eight `mov r{8+i}d, imm32` instructions (bytes 41 b8+i imm32)
# carrying the state words, followed by the two-byte short jump `eb 01`.
for i in range(8):
    ac2 += bytes([0x41, 0xb8 + i]) + state[i].to_bytes(4, 'little')
ac2 += b'\xeb\x01'
# Sanity check: hash the final block with the same hand-written padding the
# shellcode reconstructs, and print it next to hashlib's digest for comparison.
state = sha256.compute_round(state, ac2[prefix_len:] + b'\x80' + b'\0' * 10 + b'\x03\x01\x90')
print(hex(state[0]))
print(hashlib.sha256(ac2).hexdigest())
# Use context managers so the output files are flushed and closed
# deterministically instead of relying on garbage collection.
with open('f4_code.bin', 'wb') as f:
    f.write(ac2)
# open('f4_code.txt', 'w').write(disasm(ac2))

# Submission input: the level number on the first line, then the base64 payload.
with open('in.txt', 'wb') as f:
    f.write(b'4\n' + base64.b64encode(ac2))

Economical Sort

We need to write some shellcode in x86 (32 bit) to sort an array, while

The length of the array is 100, each element is one byte, and it's located at 0x400000.
Stack is located at 0x800000.
Our code is located at 0x1000000.
Some registers contain 100, 0x400000, 0x8001000.
Each instruction should be 1-byte.
The length of the shellcode should be at most 400, and it can execute at most 1000000 steps.

With 1-byte operators, we can only:

Increase/decrease register.
Push/pop register.
Exchange register with eax.
With movs, we can move one byte from one address to another.
With cmps, we can compare one byte in memory with another byte in memory.
With stos, we can store a register to some address.
With lods, we can load a register from some address.
With scas, we can compare the byte in register with another byte in memory.
Return to some address in stack.
With xlat, we can load a register from some address with an offset.
With salc (undocumented 0xd6), we can set al according to cf.
With daa series instructions, we can check if one value matches certain values.

It is easy to see that we can compare values and save the result to registers. However, how do we branch based on the conditions? If we could add 0x1000000 to a register, we could do something like:

; eax=comparison result
add eax, 0x1000000
xlat
add eax, 0x1000000
push eax
ret

It stores a jump table inside the code, and loads the jump table by the comparison result.

But how to add 0x1000000 to a register? That's x86 magic - unaligned load/store. Consider the following code:

push eax
inc esp
inc esp
inc esp
pop eax
inc eax
push eax
dec esp
dec esp
dec esp
pop eax

It increases eax by 0x1000000.

With this sort of code, we can finally branch, and then it's almost done. The rest is only heavy implementation/debugging work.

from pwn import *

# stack layout
# Index of each value, counted in 4-byte slots from the top of the stack, as
# established by the pushes in the `start:` prologue (the last push is slot 0).
DATA = 0  # edi as it was at entry; used as the base address of the array
CODE_100 = 1  # ebx after add_code(..., True); used as the xlat base of the jump table
CNT = 2  # entry edi + 0x100; used as the base of the per-value counters
LEN = 3  # esi as it was at entry


# useful variables
# Registers used by the first pass (loop1/loop2).

LOOP1_I = 'ecx'
TMP1 = 'edx'
TMP2 = 'ebp'

# Registers used by the second pass (loop3/loop4); they deliberately reuse the
# same physical registers as the first pass.
LOOP2_I = 'ecx'
TMP3 = 'edx'
RES_PTR = 'ebp'

# temp regs
# Scratch registers that movstack() pops stack slots into by default.
TEMP_REGS = ['eax', 'ebx', 'edi', 'esi']


def push(x):
    """Return a one-element instruction list that pushes register ``x``."""
    return [f'push {x}']


def pop(x):
    """Return a one-element instruction list that pops into register ``x``."""
    return [f'pop {x}']


def mov(dst_reg, src_reg):
    """Emit a register-to-register copy built from ``push src`` / ``pop dst``.

    Returns an empty list when source and destination are the same register.
    """
    if dst_reg != src_reg:
        return push(src_reg) + pop(dst_reg)
    return []


def loadmemoffset(arr_reg, index_reg, value_reg):
    """Emit a table lookup via ``xlat``: al <- byte at [arr_reg + low byte of index_reg].

    The sequence copies ``arr_reg`` into ebx and ``index_reg`` into eax, runs
    ``xlat``, then copies the whole of eax into ``value_reg``. Only al is
    replaced by ``xlat``, so the upper bytes of the result are those of
    ``index_reg``. Clobbers eax and ebx.
    """
    insns = mov('ebx', arr_reg)
    insns += mov('eax', index_reg)
    insns.append('xlat')
    insns += mov(value_reg, 'eax')
    return insns


def loadmem(src_ptr_reg, value_reg):
    """Emit a byte load through ``lodsb``.

    Copies ``src_ptr_reg`` into esi, executes ``lodsb`` (which loads al from
    the address in esi), then copies eax into ``value_reg``. Clobbers esi and
    eax.
    """
    insns = mov('esi', src_ptr_reg)
    insns.append('lodsb')
    insns += mov(value_reg, 'eax')
    return insns


def storemem(dst_ptr_reg, value_reg):
    """Emit a byte store through ``stosb``.

    Copies ``dst_ptr_reg`` into edi and ``value_reg`` into eax, then executes
    ``stosb`` (which stores al at the address in edi). Clobbers edi and eax.
    """
    insns = mov('edi', dst_ptr_reg)
    insns += mov('eax', value_reg)
    insns.append('stosb')
    return insns


def addconst(reg, x):
    """Emit ``|x|`` copies of ``inc reg`` (x >= 0) or ``dec reg`` (x < 0)."""
    opcode = 'inc ' if x >= 0 else 'dec '
    return [opcode + reg] * abs(x)


def incstackval(n=1):
    """Emit code that adds ``n`` to the dword at the current stack pointer.

    The value is popped into eax, adjusted with inc/dec, and pushed back, so
    esp is unchanged afterwards and eax is clobbered.
    """
    return pop('eax') + addconst('eax', n) + push('eax')


def add_code(reg, add_0x100):
    """Emit code that bumps individual bytes of ``reg`` via misaligned stack access.

    ``reg`` is pushed and popped so that its bytes sit just below esp; esp is
    then lowered one byte and the dword there is incremented, which increments
    the top byte of the stored value (i.e. adds 0x1000000). When ``add_0x100``
    is true, a second increment is applied two bytes lower, which adds 0x100.
    The modified value is finally popped back into ``reg``. Clobbers eax.
    """
    seq = []
    seq += push(reg)
    seq += pop('eax')
    # Step back one byte: the low byte of the dword at esp is reg's top byte.
    seq += addconst('esp', -1)
    seq += incstackval()
    # Two more bytes down: the low byte of the dword at esp is reg's byte 1.
    seq += addconst('esp', -2)
    if add_0x100:
        seq += incstackval()
    # Re-align on the stored value and reload it.
    seq += addconst('esp', -1)
    seq += pop(reg)
    return seq


def add_100(reg, n=1):
    """Emit code that adds ``n * 0x100`` to ``reg`` via a misaligned stack access.

    ``reg`` is pushed, esp is advanced by one byte so that the dword at esp
    starts at byte 1 of the stored value, that dword is adjusted by ``n``, and
    the value is popped back into ``reg``. Clobbers eax (unless ``reg`` is eax,
    which receives the result).
    """
    seq = []
    seq += push(reg)
    seq += addconst('esp', 1)
    seq += incstackval(n)
    seq += addconst('esp', -1)
    seq += pop(reg)
    return seq


def movstack(reg, stack_offset, tr=None):
    """Emit code that copies stack slot ``stack_offset`` into ``reg``.

    The top ``len(tr)`` slots are popped into the registers listed in ``tr``
    (slot 0 into ``tr[0]`` and so on) and pushed back in reverse order, leaving
    the stack unchanged; ``tr[stack_offset]`` is then copied into ``reg``.

    When ``tr`` is omitted, the first ``stack_offset + 1`` entries of
    ``TEMP_REGS`` are used. If ``reg`` is one of them it is swapped into the
    last position so that it receives the wanted slot directly and no extra
    copy is emitted. All registers in ``tr`` are clobbered.
    """
    if tr is None:
        tr = TEMP_REGS[:stack_offset + 1]
        if reg in tr:
            pos = tr.index(reg)
            tr[pos], tr[-1] = tr[-1], tr[pos]
    res = [insn for scratch in tr for insn in pop(scratch)]
    res += [insn for scratch in reversed(tr) for insn in push(scratch)]
    res += mov(reg, tr[stack_offset])
    return res


def gen_if_(reg, offset):
    """Emit a two-way branch through the byte jump table addressed by CODE_100.

    The low byte of ``reg`` is compared against al with ``scasb``; ``salc``
    turns the resulting carry flag into 0xff/0x00, and after ``offset``
    increments al selects table entry ``offset - 1`` (carry set) or ``offset``
    (carry clear). The byte fetched by ``xlat`` is then spliced into the low
    byte of an address whose upper three bytes are derived from CODE_100, and
    that address is jumped to with push/ret.

    NOTE(review): al at the time of ``scasb`` is the low byte of stack slot 0
    (DATA); the branch only means "low byte of reg is non-zero" if that byte
    is 0 -- confirm against the challenge's memory layout.

    Clobbers eax, ebx and edi.
    """
    return [
        # ebx <- stack slot CODE_100 (jump-table base); eax <- stack slot 0.
        *movstack('ebx', CODE_100, tr=['eax', 'ebx']),
        # Put the tested value on the stack and point edi at it.
        *push(reg),
        *mov('edi', 'esp'),
        # *mov('eax', 'ebx'),
        # Compare al with the byte at [edi]; carry is set when al is below it.
        'scasb',
        # salc (undocumented opcode 0xd6): al <- 0xff if carry else 0x00.
        '.byte 0xd6',
        # al becomes offset-1 (carry set) or offset (carry clear).
        *addconst('eax', offset),
        # al <- table byte at [ebx + al].
        'xlat',
        # *add_code('eax', False),

        # Misaligned push/pop: shift CODE_100 down by one byte into ebx and
        # decrement it, so its low three bytes become the upper three bytes of
        # the target address (CODE_100 with 0x100 removed).
        *push('ebx'),
        *addconst('esp', 1),
        *pop('ebx'),
        *addconst('ebx', -1),
        *addconst('esp', -1),
        # Store eax (table byte in its low byte) just below esp, then overlay
        # ebx one byte higher so that the dword reads: table byte + upper bytes.
        *push('eax'),
        *pop('eax'),
        *addconst('esp', 1),
        *push('ebx'),
        *addconst('esp', -1),
        # eax <- assembled jump target.
        *pop('eax'),

        # *add_100('eax', -1),
        # Discard the pushed copy of reg, then jump via push/ret.
        *pop('ebx'),
        *push('eax'),
        'ret',
    ]


# Labels in jump-table order; entry i of the emitted table refers to
# last_labels[i].
last_labels = []


def gen_if(reg, le0_label, else_label):
    """Register two jump-table entries and emit a branch on ``reg``.

    ``else_label`` is appended first and ``le0_label`` directly after it; the
    branch code is generated by ``gen_if_`` with the index of ``le0_label``.
    """
    base = len(last_labels)
    last_labels.extend((else_label, le0_label))
    return gen_if_(reg, base + 1)


def goto(label, is_last):
    """Emit an unconditional jump to ``label`` through the jump table.

    ``label`` must already be present in ``last_labels``; its first index is
    used as the table offset. The table byte is fetched with ``xlat`` into the
    low byte of a copy of CODE_100. Unless ``is_last`` is true, 0x100 is then
    subtracted from the address before jumping via push/ret. Clobbers eax and
    ebx.
    """
    assert label in last_labels
    slot = last_labels.index(label)
    rebase = [] if is_last else add_100('eax', -1)
    return (
        movstack('ebx', CODE_100)
        + mov('eax', 'ebx')
        + addconst('eax', slot)
        + ['xlat']
        # *add_code('eax', is_last),
        + rebase
        + push('eax')
        + ['ret']
    )


# Seed the jump table with the two labels that are only reached through
# goto(); the labels used by gen_if() are appended as the list below is built.
last_labels.append('loop4_start')
last_labels.append('final')

# The generated program, as a list of assembler lines. It first tallies how
# often each byte value occurs (loop1/loop2), then rewrites the array from its
# end backwards using those tallies (loop3/loop4).
code = [
    'start:',
    # Build the stack frame described by DATA / CODE_100 / CNT / LEN.
    *push('eax'),
    *push('esi'),  # LEN
    *mov('eax', 'edi'),
    *add_100('eax'),
    *push('eax'),  # CNT (=DATA_100)
    # ebx += 0x1000100 (presumably ebx is 0 at entry, giving code base + 0x100).
    *add_code('ebx', True),
    *push('ebx'),  # CODE_100
    *push('edi'),  # DATA

    # First pass: walk the index down from LEN and bump the counter of each
    # element's value.
    *mov(LOOP1_I, 'esi'),
    'loop1:',
    *addconst(LOOP1_I, -1),
    # ebx <- DATA, TMP2 <- CNT.
    *movstack(TMP2, CNT, tr=['ebx', 'eax', 'edi']),
    # TMP1 <- element at DATA[LOOP1_I].
    *loadmemoffset('ebx', LOOP1_I, TMP1),

    # Pre-adjust so the first loop2 iteration restores both values.
    *addconst(TMP2, -1),
    *addconst(TMP1, 1),

    # Advance TMP2 by one per unit of TMP1, i.e. TMP2 <- CNT + element value.
    'loop2:',
    *addconst(TMP2, 1),
    *addconst(TMP1, -1),
    *gen_if(TMP1, 'loop2_end', 'loop2'),
    'loop2_end:',

    # Increment the counter byte that TMP2 points at.
    *loadmem(TMP2, 'eax'),
    *addconst('eax', 1),
    *storemem(TMP2, 'eax'),

    *gen_if(LOOP1_I, 'loop1_end', 'loop1'),
    'loop1_end:',

    # set RES_PTR = DATA + LEN
    # Done by byte splicing on the stack: the low byte of LEN is combined with
    # the upper three bytes of DATA (equal to DATA + LEN when DATA's low byte
    # is 0 -- NOTE(review): confirm).
    *movstack(RES_PTR, LEN),
    *push('eax'),
    *push('eax'),
    *addconst('esp', 1),
    *pop(TMP3),
    *push(RES_PTR),
    *pop(RES_PTR),
    *addconst('esp', 1),
    *push(TMP3),
    *addconst('esp', -1),
    *pop(RES_PTR),
    *addconst('esp', -1),
    *pop('eax'),

    # Second pass: step the value in LOOP2_I downwards and fetch its counter.
    'loop3:',
    *addconst(LOOP2_I, -1),
    *movstack('ebx', CNT),
    *loadmemoffset('ebx', LOOP2_I, TMP3),

    # Write the current value once per remaining count, moving RES_PTR down.
    'loop4_start:',
    *gen_if(TMP3, 'loop4_end', 'loop4'),
    'loop4:',
    *addconst(RES_PTR, -1),
    *storemem(RES_PTR, LOOP2_I),
    *addconst(TMP3, -1),
    *goto('loop4_start', False),
    'loop4_end:',

    # Continue with the next value until the branch on RES_PTR falls through.
    *gen_if(RES_PTR, 'loop3_end', 'loop3'),
    'loop3_end:',
    *goto('final', True),
]

# Assemble once without the table to learn the size of the code, which has to
# stay below 256 bytes so that every label offset fits in one table byte.
print(len(code))
rc1 = asm('\n'.join(code))
print(len(rc1))
assert len(rc1) < 256
# Pad with single-byte NOPs so the jump table starts exactly 256 bytes after
# `start`, the address the generated code uses as its xlat base.
code.extend(['nop'] * (256 - len(rc1)))

# One table byte per label, in last_labels order. 'final' sits after the table
# (beyond offset 255), so it is stored relative to start+256; goto() skips its
# 0x100 correction for that label.
for lb in last_labels:
    if lb == 'final':
        code.append(f'.byte {lb}-start-256')
    else:
        code.append(f'.byte {lb}-start')
code.append('final:')

rc = asm('\n'.join(code))
print(len(rc))
# Context managers make sure both output files are flushed and closed.
with open('code.bin', 'wb') as f:
    f.write(rc)
with open('code.txt', 'w') as f:
    f.write(disasm(rc))

日期： 2024-04-12
标签： CTF Writeup