From e3d5ca8419ab0e831e8ce30f6aac97ff882c126a Mon Sep 17 00:00:00 2001
From: David Guillen Fandos
Date: Sat, 6 Nov 2021 12:17:50 +0100
Subject: [PATCH] [x86/x64] Add support for x86-64 and improve 32 bit mode too.

This adds support for the x86-64 dynarec, both on Windows and Linux. Since
they have different requirements there's some macro magic in the stubs file.

This also fixes x86 support in some cases: stack alignment requirements were
violated all over. This allows the usage of clang as a compiler (which has a
tendency to use SSE instructions more often than gcc does). To support this I
also reworked the mmap/VirtualAlloc magic to make sure the JIT arena stays
close to .text.

Fixed some other minor issues and removed some unnecessary JIT code here and
there. clang tends to make some (wrong?) assumptions about global symbol
alignment.
---
 Makefile       |  10 +-
 cpu_threaded.c |   4 +-
 memmap.c       |  52 +++++-
 x86/x86_emit.h |  45 +++---
 x86/x86_stub.S | 427 ++++++++++++++++++++++++++-----------------------
 5 files changed, 312 insertions(+), 226 deletions(-)

diff --git a/Makefile b/Makefile
index 734b71e..6d47bf6 100644
--- a/Makefile
+++ b/Makefile
@@ -21,19 +21,21 @@ endif
 
 ifeq ($(firstword $(filter x86_64,$(UNAME))),x86_64)
-
+  HAVE_DYNAREC := 1
+  CPU_ARCH := x86_32
 else ifeq ($(firstword $(filter amd64,$(UNAME))),amd64)
-
+  HAVE_DYNAREC := 1
+  CPU_ARCH := x86_32
 else ifeq ($(firstword $(filter x86,$(UNAME))),x86)
   FORCE_32BIT_ARCH = 1
+  HAVE_DYNAREC := 1
+  CPU_ARCH := x86_32
 endif
 
 FORCE_32BIT :=
 ifeq ($(FORCE_32BIT_ARCH),1)
-  HAVE_DYNAREC := 1
   FORCE_32BIT := -m32
-  CPU_ARCH := x86_32
 endif
 
 # system platform
diff --git a/cpu_threaded.c b/cpu_threaded.c
index 0f500dc..0d3a989 100644
--- a/cpu_threaded.c
+++ b/cpu_threaded.c
@@ -2937,7 +2937,7 @@ block_data_type block_data[MAX_BLOCK_SIZE];
 block_exit_type block_exits[MAX_EXITS];
 
 #define smc_write_arm_yes() {                                                \
-  int offset = (pc < 0x03000000) ? 0x40000 : -0x8000;                        \
+  intptr_t offset = (pc < 0x03000000) ? 0x40000 : -0x8000;                   \
   if(address32(pc_address_block, (block_end_pc & 0x7FFF) + offset) == 0)     \
   {                                                                          \
     address32(pc_address_block, (block_end_pc & 0x7FFF) + offset) =          \
@@ -2946,7 +2946,7 @@ block_exit_type block_exits[MAX_EXITS];
 }
 
 #define smc_write_thumb_yes() {                                              \
-  int offset = (pc < 0x03000000) ? 0x40000 : -0x8000;                        \
+  intptr_t offset = (pc < 0x03000000) ? 0x40000 : -0x8000;                   \
   if(address16(pc_address_block, (block_end_pc & 0x7FFF) + offset) == 0)     \
   {                                                                          \
     address16(pc_address_block, (block_end_pc & 0x7FFF) + offset) =          \
diff --git a/memmap.c b/memmap.c
index 84b86b4..63448da 100644
--- a/memmap.c
+++ b/memmap.c
@@ -1,6 +1,25 @@
+#include <stdint.h>
+
 #include "memmap.h"
 
+// The JIT cache buffer is allocated via mmap (or the Windows equivalent) so
+// that it can be RWX. On top of that, we need the buffer to be "close" to the
+// text segment, so that we can perform jumps between the two code blocks.
+// Android and some other platforms discourage the usage of sections in the
+// binary (i.e. the on-disk ELF) that are marked as executable and writable
+// for security reasons. Therefore we prefer to use mmap even though it can be
+// tricky to map correctly.
+
+// To map a block close to the code, we take the function address as a proxy
+// for the text section address, and try to map the cache next to it. This is
+// an iterative process of trial and error that is hopefully successful.
+
+// x86-64 has a +/- 2GB offset requirement.
+// ARM64 has a +/- 128MB offset requirement.
+// ARM32 has a +/- 32MB offset requirement (gpsp does not require this).
+// MIPS requires blocks to be within the same 256MB boundary (identical 4 MSB)
 
 #ifdef MMAP_JIT_CACHE
 
 #ifdef WIN32
@@ -9,7 +28,21 @@
 #include <windows.h>
 
 void *map_jit_block(unsigned size)
 {
-  return VirtualAlloc(0, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
+  unsigned i;
+  uintptr_t base = (uintptr_t)(map_jit_block) & (~0xFFFFFULL);
+  for (i = 0; i < 256; i++) {
+    int offset = ((i & 1) ? 1 : -1) * (i >> 1) * 1024 * 1024;
+    uintptr_t baddr = base + (intptr_t)offset;
+    if (!baddr)
+      continue;     // Do not map NULL, bad things happen :)
+
+    void *p = VirtualAlloc((void*)baddr, size, MEM_COMMIT|MEM_RESERVE, PAGE_EXECUTE_READWRITE);
+    if (p == (void*)baddr)
+      return p;
+    if (p)
+      VirtualFree(p, 0, MEM_RELEASE);
+  }
+  return 0;
 }
 
 void unmap_jit_block(void *bufptr, unsigned size)
 {
@@ -22,7 +55,22 @@
 // Posix implementation
 void *map_jit_block(unsigned size)
 {
-  return mmap(0, size, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON | MAP_PRIVATE, -1, 0);
+  unsigned i;
+  uintptr_t base = (uintptr_t)(map_jit_block) & (~0xFFFFFULL);
+  for (i = 0; i < 256; i++) {
+    int offset = ((i & 1) ? 1 : -1) * (i >> 1) * 1024 * 1024;
+    uintptr_t baddr = base + (intptr_t)offset;
+    if (!baddr)
+      continue;     // Do not map NULL, bad things happen :)
+
+    void *p = mmap((void*)baddr, size, PROT_READ|PROT_WRITE|PROT_EXEC,
+                   MAP_ANON|MAP_PRIVATE, -1, 0);
+    if (p == (void*)baddr)
+      return p;
+    if (p)
+      munmap(p, size);
+  }
+  return 0;
 }
 
 void unmap_jit_block(void *bufptr, unsigned size)
 {
diff --git a/x86/x86_emit.h b/x86/x86_emit.h
index b8b8532..88394d7 100644
--- a/x86/x86_emit.h
+++ b/x86/x86_emit.h
@@ -180,7 +180,7 @@ typedef enum
 } x86_condition_codes;
 
 #define x86_relative_offset(source, offset, next) \
-  ((u32)offset - ((u32)source + next)) \
+  ((u32)((uintptr_t)offset - ((uintptr_t)source + next))) \
 
 #define x86_unequal_operands(op_a, op_b) \
   (x86_reg_number_##op_a != x86_reg_number_##op_b) \
 
@@ -404,6 +404,17 @@ typedef enum
 #define reg_t0 esi
 #define reg_rv eax
 
+#if defined(_WIN64)
+  #define reg_arg0 ecx
+  #define reg_arg1 edx
+#elif defined(__x86_64__) || defined(__amd64__)
+  #define reg_arg0 edi
+  #define reg_arg1 esi
+#else
+  #define reg_arg0 eax
+  #define reg_arg1 edx
+#endif
+
 /* Offsets from reg_base, see stub.S */
 #define SPSR_BASE_OFF 0xA9100
 
@@ -588,11 +599,11 @@ typedef enum
 #define generate_indirect_branch_cycle_update(type) \
   generate_cycle_update(); \
-  x86_emit_jmp_offset(x86_relative_offset(translation_ptr, \
+  x86_emit_call_offset(x86_relative_offset(translation_ptr, \
   x86_indirect_branch_##type, 4)) \
 
 #define generate_indirect_branch_no_cycle_update(type) \
-  x86_emit_jmp_offset(x86_relative_offset(translation_ptr, \
+  x86_emit_call_offset(x86_relative_offset(translation_ptr, \
   x86_indirect_branch_##type, 4)) \
 
 #define block_prologue_size 0
 
@@ -663,8 +674,8 @@ typedef enum
 }
 
 #define emit_trace_instruction(pc, mode) \
-  x86_emit_mov_reg_imm(reg_a0, pc); \
-  x86_emit_mov_reg_imm(reg_a1, mode); \
+  x86_emit_mov_reg_imm(reg_arg0, pc); \
+  x86_emit_mov_reg_imm(reg_arg1, mode); \
   generate_function_call(trace_instruction);
 #define emit_trace_arm_instruction(pc) \
   emit_trace_instruction(pc, 1)
 
@@ -1062,9 +1073,8 @@ u32 function_cc execute_spsr_restore(u32 address)
   generate_store_reg(ireg, reg_index); \
   if(reg_index == 15) \
   { \
-    generate_mov(a0, ireg); \
+    generate_mov(arg0, ireg); \
     generate_function_call(execute_spsr_restore); \
-    generate_mov(a0, rv); \
     generate_indirect_branch_dual(); \
   } \
 
@@ -1355,18 +1365,17 @@ u32 function_cc execute_spsr_restore(u32 address)
 // store_mask and address are stored in the SAVE slots, since there's no real
 // register space to nicely pass them.
-u32 function_cc execute_store_cpsr_body(u32 _cpsr)
+u32 execute_store_cpsr_body()
 {
-  reg[REG_CPSR] = _cpsr;
   if(reg[REG_SAVE] & 0xFF)
   {
-    set_cpu_mode(cpu_modes[_cpsr & 0x1F]);
+    set_cpu_mode(cpu_modes[reg[REG_CPSR] & 0x1F]);
     if((io_registers[REG_IE] & io_registers[REG_IF]) &&
-       io_registers[REG_IME] && ((_cpsr & 0x80) == 0))
+       io_registers[REG_IME] && ((reg[REG_CPSR] & 0x80) == 0))
     {
       reg_mode[MODE_IRQ][6] = reg[REG_SAVE2] + 4;
-      spsr[MODE_IRQ] = _cpsr;
-      reg[REG_CPSR] = (_cpsr & 0xFFFFFF00) | 0xD2;
+      spsr[MODE_IRQ] = reg[REG_CPSR];
+      reg[REG_CPSR] = (reg[REG_CPSR] & 0xFFFFFF00) | 0xD2;
       set_cpu_mode(MODE_IRQ);
       return 0x00000018;
     }
@@ -1518,7 +1527,6 @@ u32 function_cc execute_store_cpsr_body(u32 _cpsr)
 #define arm_block_memory_adjust_pc_load() \
   if(reg_list & 0x8000) \
   { \
-    generate_mov(a0, rv); \
     generate_indirect_branch_arm(); \
   } \
 
@@ -1865,7 +1873,6 @@ u32 function_cc execute_store_cpsr_body(u32 _cpsr)
   generate_load_pc(a1, pc); \
   generate_function_call(execute_load_u32); \
   generate_store_reg(rv, REG_PC); \
-  generate_mov(a0, rv); \
   generate_indirect_branch_cycle_update(thumb) \
 
 #define thumb_block_memory_extra_push_lr(base_reg) \
 
@@ -2138,7 +2145,7 @@ static void function_cc execute_swi(u32 pc)
 #define arm_swi() \
   collapse_flags(a0, a1); \
-  generate_load_pc(a0, (pc + 4)); \
+  generate_load_pc(arg0, (pc + 4)); \
   generate_function_call(execute_swi); \
   generate_branch() \
 
@@ -2182,7 +2189,7 @@ static void function_cc execute_swi(u32 pc)
 #define thumb_swi() \
   collapse_flags(a0, a1); \
-  generate_load_pc(a0, (pc + 2)); \
+  generate_load_pc(arg0, (pc + 2)); \
   generate_function_call(execute_swi); \
   generate_branch_cycle_update( \
     block_exits[block_exit_position].branch_source, \
 
@@ -2233,8 +2240,8 @@ static void function_cc execute_swi(u32 pc)
   generate_load_pc(a0, pc); \
   generate_indirect_branch_no_cycle_update(type) \
 
-extern u32 x86_table_data[9][16];
-extern u32 x86_table_info[9][16];
+extern void* x86_table_data[9][16];
+extern void* x86_table_info[9][16];
 
 void init_emitter(void) {
   memcpy(x86_table_info, x86_table_data, sizeof(x86_table_data));
diff --git a/x86/x86_stub.S b/x86/x86_stub.S
index 8b6741c..8681510 100644
--- a/x86/x86_stub.S
+++ b/x86/x86_stub.S
@@ -26,28 +26,56 @@ symbol: \
 _##symbol:
 
-#ifndef _WIN32
-# External symbols (data + functions)
-#define _update_gba update_gba
-#define _block_lookup_address_arm block_lookup_address_arm
-#define _block_lookup_address_thumb block_lookup_address_thumb
-#define _block_lookup_address_dual block_lookup_address_dual
-#define _write_io_register8 write_io_register8
-#define _write_io_register16 write_io_register16
-#define _write_io_register32 write_io_register32
-#define _flush_translation_cache_ram flush_translation_cache_ram
-#define _write_eeprom write_eeprom
-#define _write_backup write_backup
-#define _write_rtc write_rtc
-#define _read_memory8 read_memory8
-#define _read_memory8s read_memory8s
-#define _read_memory16 read_memory16
-#define _read_memory16s read_memory16s
-#define _read_memory32 read_memory32
-#define _execute_store_cpsr_body execute_store_cpsr_body
-#endif
-
-.extern _spsr
+// Windows 32 bit ABI prefixes functions with underscore
+#if defined(_WIN32) && !defined(_WIN64)
+  #define fnm(name) _##name
+#else
+  #define fnm(name) name
+#endif
 
+// Calling conventions (and register allocations) differ, which makes this tricky.
+// All functions in this file are called manually from the JIT arena (unless
+// stated otherwise), where we use our own calling convention.
+// However, calls to C code must follow the platform calling convention. x86 is
+// built with regparm=2 to avoid stack usage.
+#if defined(__x86_64__) || defined(__amd64__)
+  #define ADDR_TYPE .quad
+  #define ADDR_SIZE_BYTES 8
+  #define STACK_REG %rsp
+  #define FULLREG(rn) %r##rn
+  #define SAVE_REGISTERS push %rbx; push %rsi; push %rdi; push %rbp
+  #define REST_REGISTERS pop %rbp; pop %rdi; pop %rsi; pop %rbx
+  #define REG_BASE %rbx
+  #ifdef _WIN64
+    #define CARG1_REG %ecx    // Windows x64 ABI, of course different :D
+    #define CARG2_REG %edx
+    #define CARG2_REGPTR %rdx
+    #define CALL_FUNC(name) \
+      sub $32, %rsp; \
+      call fnm(name); \
+      add $32, %rsp
+  #else
+    #define CARG1_REG %edi    // SystemV AMD64 ABI
+    #define CARG2_REG %esi
+    #define CARG2_REGPTR %rsi
+    #define CALL_FUNC(name) \
+      call fnm(name)
+  #endif
+  #define SETUP_ARGS mov %eax, CARG1_REG; mov %edx, CARG2_REG;
+#else
+  #define ADDR_TYPE .long
+  #define ADDR_SIZE_BYTES 4
+  #define STACK_REG %esp
+  #define FULLREG(rn) %e##rn
+  #define SAVE_REGISTERS sub $8, %esp; push %ebx; push %esi; push %edi; push %ebp
+  #define REST_REGISTERS pop %ebp; pop %edi; pop %esi; pop %ebx; add $8, %esp;
+  #define REG_BASE %ebx
+  #define CARG1_REG %eax
+  #define CARG2_REG %edx
+  #define CARG2_REGPTR %edx
+  #define SETUP_ARGS
+  #define CALL_FUNC(name) \
+    call fnm(name)
+#endif
 
 .equ REG_SP, (13 * 4)
 .equ REG_LR, (14 * 4)
@@ -69,15 +97,16 @@ _##symbol:
 .equ REG_SAVE4, (30 * 4)
 .equ REG_SAVE5, (31 * 4)
 
-.equ load_u8_tbl, -(9 * 16 * 4)
-.equ load_s8_tbl, -(8 * 16 * 4)
-.equ load_u16_tbl, -(7 * 16 * 4)
-.equ load_s16_tbl, -(6 * 16 * 4)
-.equ load_u32_tbl, -(5 * 16 * 4)
-.equ store_u8_tbl, -(4 * 16 * 4)
-.equ store_u16_tbl, -(3 * 16 * 4)
-.equ store_u32_tbl, -(2 * 16 * 4)
-.equ store_aligned_u32_tbl, -(1 * 16 * 4)
+.equ load_u8_tbl, -(9 * 16 * ADDR_SIZE_BYTES)
+.equ load_s8_tbl, -(8 * 16 * ADDR_SIZE_BYTES)
+.equ load_u16_tbl, -(7 * 16 * ADDR_SIZE_BYTES)
+.equ load_s16_tbl, -(6 * 16 * ADDR_SIZE_BYTES)
+.equ load_u32_tbl, -(5 * 16 * ADDR_SIZE_BYTES)
+.equ store_u8_tbl, -(4 * 16 * ADDR_SIZE_BYTES)
+.equ store_u16_tbl, -(3 * 16 * ADDR_SIZE_BYTES)
+.equ store_u32_tbl, -(2 * 16 * ADDR_SIZE_BYTES)
+.equ store_aligned_u32_tbl, -(1 * 16 * ADDR_SIZE_BYTES)
+
 .equ PALETTE_RAM_OFF, 0x0100
 .equ PALETTE_RAM_CNV_OFF, 0x0500
 .equ OAM_RAM_OFF, 0x0900
@@ -93,7 +122,7 @@ _##symbol:
 # destroys ecx and edx
 
 .macro collapse_flag offset, shift
-  mov \offset(%ebx), %ecx
+  mov \offset(REG_BASE), %ecx
   shl $\shift, %ecx
   or %ecx, %edx
 .endm
 
@@ -104,7 +133,7 @@ _##symbol:
   collapse_flag REG_Z_FLAG, 30
   collapse_flag REG_C_FLAG, 29
   collapse_flag REG_V_FLAG, 28
-  mov REG_CPSR(%ebx), %ecx
+  mov REG_CPSR(REG_BASE), %ecx
   and $0xFF, %ecx
   or %ecx, %edx
 .endm
 
@@ -112,14 +141,14 @@ _##symbol:
 .macro collapse_flags
   collapse_flags_no_update
-  mov %edx, REG_CPSR(%ebx)
+  mov %edx, REG_CPSR(REG_BASE)
 .endm
 
 .macro extract_flag shift, offset
-  mov REG_CPSR(%ebx), %edx
+  mov REG_CPSR(REG_BASE), %edx
   shr $\shift, %edx
   and $0x01, %edx
-  mov %edx, \offset(%ebx)
+  mov %edx, \offset(REG_BASE)
 .endm
 
 .macro extract_flags
 
@@ -132,25 +161,21 @@ _##symbol:
 # Process a hardware event. Since an interrupt might be
 # raised we have to check if the PC has changed.
-# eax: current address
-
-st:
-  .asciz "u\n"
-
+# arg0 (always in eax): current PC address
 defsymbl(x86_update_gba)
-  mov %eax, REG_PC(%ebx)                # current PC = eax
-  collapse_flags                        # update cpsr, trashes ecx and edx
+  mov %eax, REG_PC(REG_BASE)            # current PC = eax
+  collapse_flags                        # update cpsr, trashes ecx and edx
 
-  call _update_gba                      # process the next event
+  CALL_FUNC(update_gba)                 # process the next event
 
-  mov %eax, REG_CYCLES                  # new cycle count
+  mov %eax, REG_CYCLES                  # new cycle count
 
   # did we just complete a frame? go back to main then
-  cmpl $0, COMPLETED_FRAME(%ebx)
+  cmpl $0, COMPLETED_FRAME(REG_BASE)
   jne return_to_main
 
   # did the PC change?
-  cmpl $1, CHANGED_PC_STATUS(%ebx)
+  cmpl $1, CHANGED_PC_STATUS(REG_BASE)
   je lookup_pc
   ret                                   # if not, go back to caller
 
@@ -158,26 +183,33 @@ defsymbl(x86_update_gba)
 # ARM code, IE anything that changes the PC in ARM mode except
 # for BX and data processing to PC with the S bit set.
-# eax: GBA address to branch to
-
+# arg0 (always in eax): GBA address to branch to
 defsymbl(x86_indirect_branch_arm)
-  call _block_lookup_address_arm
-  jmp *%eax
+  mov %eax, CARG1_REG
+  CALL_FUNC(block_lookup_address_arm)
+  add $ADDR_SIZE_BYTES, STACK_REG       # remove current return addr
+  jmp *FULLREG(ax)
 
 # For indirect branches that'll definitely go to Thumb. In
 # Thumb mode any indirect branches except for BX.
+# arg0 (always in eax): GBA address to branch to
 defsymbl(x86_indirect_branch_thumb)
-  call _block_lookup_address_thumb
-  jmp *%eax
+  mov %eax, CARG1_REG
+  CALL_FUNC(block_lookup_address_thumb)
+  add $ADDR_SIZE_BYTES, STACK_REG       # remove current return addr
+  jmp *FULLREG(ax)
 
 # For indirect branches that can go to either Thumb or ARM,
 # mainly BX (also data processing to PC with S bit set, be
 # sure to adjust the target with a 1 in the lowest bit for this)
+# arg0 (always in eax): GBA address to branch to
 defsymbl(x86_indirect_branch_dual)
-  call _block_lookup_address_dual
-  jmp *%eax
+  mov %eax, CARG1_REG
+  CALL_FUNC(block_lookup_address_dual)
+  add $ADDR_SIZE_BYTES, STACK_REG       # remove current return addr
+  jmp *FULLREG(ax)
 
 # General ext memory routines
 
 ext_store_ignore:
   ret
 
 ext_store_rtc16:
   and $0xFFFF, %edx                     # make value 16bit
   and $0xFF, %eax                       # mask address
-  jmp _write_rtc                        # write out RTC register
+  SETUP_ARGS                            # Setup addr, value
+  CALL_FUNC(write_rtc)                  # write out RTC register
+  ret
 
 ext_store_backup8:
   and $0xFF, %edx                       # make value 8bit
   and $0xFFFF, %eax                     # mask address
-  jmp _write_backup                     # perform backup write
+  SETUP_ARGS                            # Setup addr, value
+  CALL_FUNC(write_backup)               # perform backup write
+  ret
 
@@ -210,14 +246,14 @@ write_epilogue:
   je smc_write
 
 alert_loop:
-  call _update_gba                      # process the next event
+  CALL_FUNC(update_gba)                 # process the next event
 
   # did we just complete a frame? go back to main then
-  cmpl $0, COMPLETED_FRAME(%ebx)
+  cmpl $0, COMPLETED_FRAME(REG_BASE)
   jne return_to_main
 
   # see if the halt status has changed
-  mov CPU_HALT_STATE(%ebx), %edx
+  mov CPU_HALT_STATE(REG_BASE), %edx
   cmp $0, %edx                          # 0 means it has
   jnz alert_loop                        # if not go again
 
@@ -229,7 +265,8 @@ no_alert:
   ret
 
 ext_store_eeprom:
-  jmp _write_eeprom                     # perform eeprom write
+  CALL_FUNC(write_eeprom)               # perform eeprom write
+  ret
 
 # Register wrapping for various sizes
 
@@ -260,40 +297,41 @@ defsymbl(execute_##fname##_u##wsize)                    ;\
   cmp $15, %ecx                                                             ;\
   ja ext_store_ignore                                                       ;\
   /* ecx = ext_store_u*_jtable[address >> 24] */                            ;\
-  jmp *fname##_u##wsize##_tbl(%ebx, %ecx, 4)                                ;\
+  jmp *fname##_u##wsize##_tbl(REG_BASE, FULLREG(cx), ADDR_SIZE_BYTES)       ;\
                                                                             ;\
 ext_##fname##_iwram##wsize:                                                 ;\
-  and $(0x7FFF & addrm), %eax                  /* Addr wrap */              ;\
-  mov regfn(d), (IWRAM_OFF+0x8000)(%ebx, %eax) /* Actual write */           ;\
-  smc_check_##fname(opsuf, IWRAM_OFF(%ebx, %eax))                           ;\
+  and $(0x7FFF & addrm), %eax                  /* Addr wrap */              ;\
+  mov regfn(d), (IWRAM_OFF+0x8000)(REG_BASE, FULLREG(ax)) /* Actual write */;\
+  smc_check_##fname(opsuf, IWRAM_OFF(REG_BASE, FULLREG(ax)))                ;\
   ret                                                                       ;\
                                                                             ;\
ext_##fname##_ewram##wsize:                                                  ;\
-  and $(0x3FFFF & addrm), %eax                 /* Addr wrap */              ;\
-  mov regfn(d), EWRAM_OFF(%ebx, %eax)          /* Actual write */           ;\
-  smc_check_##fname(opsuf, (EWRAM_OFF+0x40000)(%ebx, %eax))                 ;\
+  and $(0x3FFFF & addrm), %eax                 /* Addr wrap */              ;\
+  mov regfn(d), EWRAM_OFF(REG_BASE, FULLREG(ax)) /* Actual write */         ;\
+  smc_check_##fname(opsuf, (EWRAM_OFF+0x40000)(REG_BASE, FULLREG(ax)))      ;\
   ret                                                                       ;\
                                                                             ;\
ext_##fname##_vram##wsize:                                                   ;\
-  and $(0x1FFFE & addrm), %eax                 /* Addr wrap */              ;\
-  dup8fn()                                     /* Double byte for 8b access */;\
-  cmp $0x18000, %eax                           /* Weird 96KB mirror */      ;\
+  and $(0x1FFFE & addrm), %eax                 /* Addr wrap */              ;\
+  dup8fn()                                     /* Double byte for 8b access */;\
+  cmp $0x18000, %eax                           /* Weird 96KB mirror */      ;\
   jb 1f                                                                     ;\
-  sub $0x8000, %eax                            /* Mirror last bank */       ;\
+  sub $0x8000, %eax                            /* Mirror last bank */       ;\
 1:                                                                          ;\
-  mov regfn16(d), VRAM_OFF(%ebx, %eax)         /* Actual write */           ;\
+  mov regfn16(d), VRAM_OFF(REG_BASE, FULLREG(ax)) /* Actual write */        ;\
   ret                                                                       ;\
                                                                             ;\
ext_##fname##_oam##wsize:                                                    ;\
-  and $(0x3FE & addrm), %eax                   /* Addr wrap */              ;\
-  movl $1, OAM_UPDATED(%ebx)                   /* flag OAM update */        ;\
-  dup8fn()                                     /* Double byte for 8b access */;\
-  mov regfn16(d), OAM_RAM_OFF(%ebx, %eax)      /* Actual write */           ;\
+  and $(0x3FE & addrm), %eax                   /* Addr wrap */              ;\
+  movl $1, OAM_UPDATED(REG_BASE)               /* flag OAM update */        ;\
+  dup8fn()                                     /* Double byte for 8b access */;\
+  mov regfn16(d), OAM_RAM_OFF(REG_BASE, FULLREG(ax)) /* Actual write */     ;\
   ret                                                                       ;\
                                                                             ;\
ext_##fname##_io##wsize:                                                     ;\
   and $(0x3FF & addrm), %eax                   /* Addr wrap */              ;\
-  call _write_io_register##wsize               /* Call C code */            ;\
+  SETUP_ARGS                                                                ;\
+  CALL_FUNC(write_io_register##wsize)          /* Call C code */            ;\
   jmp write_epilogue                           /* Might need an update */   ;\
 
@@ -312,7 +350,7 @@ ext_store_palette8:
 ext_store_palette16:
   and $0x3FF, %eax                      # wrap around address
 ext_store_palette16b:                   # entry point for 8bit write
-  mov %dx, PALETTE_RAM_OFF(%ebx, %eax)  # write out palette value
+  mov %dx, PALETTE_RAM_OFF(REG_BASE, FULLREG(ax))   # write out palette value
   mov %edx, %ecx                        # cx = dx
   shl $11, %ecx                         # cx <<= 11 (red component is in high bits)
   mov %dh, %cl                          # bottom bits of cx = top bits of dx
@@ -321,7 +359,7 @@ ext_store_palette16b:                   # entry point for 8bit write
   shl $1, %dx                           # make green component 6bits
   or %edx, %ecx                         # combine green component into ecx
   # write out the freshly converted palette value
-  mov %cx, PALETTE_RAM_CNV_OFF(%ebx, %eax)
+  mov %cx, PALETTE_RAM_CNV_OFF(REG_BASE, FULLREG(ax))
   ret                                   # done
 
 ext_store_palette32:
 
@@ -345,20 +383,20 @@ defsymbl(execute_load_##rtype)                          ;\
   and $((1<<(8+albits))-1), %ecx               /* preserve align+msb */     ;\
   cmp $15, %ecx                                                             ;\
   ja ext_load_slow##rtype                                                   ;\
-  jmp *load_##rtype##_tbl(%ebx, %ecx, 4)                                    ;\
+  jmp *load_##rtype##_tbl(REG_BASE, FULLREG(cx), ADDR_SIZE_BYTES)           ;\
                                                                             ;\
ext_load_bios##rtype:                                                        ;\
-  mov %edx, REG_PC(%ebx)                       /* Store current PC */       ;\
+  mov %edx, REG_PC(REG_BASE)                   /* Store current PC */       ;\
   jmp ext_load_slow##rtype                                                  ;\
                                                                             ;\
ext_load_iwram##rtype:                                                       ;\
   and $(0x7FFF & addrm), %eax                  /* Addr wrap */              ;\
-  movop (IWRAM_OFF+0x8000)(%ebx, %eax), %eax   /* Read mem */               ;\
+  movop (IWRAM_OFF+0x8000)(REG_BASE, FULLREG(ax)), %eax  /* Read mem */     ;\
   ret                                                                       ;\
                                                                             ;\
ext_load_ewram##rtype:                                                       ;\
   and $(0x3FFFF & addrm), %eax                 /* Addr wrap */              ;\
-  movop EWRAM_OFF(%ebx, %eax), %eax            /* Read mem */               ;\
+  movop EWRAM_OFF(REG_BASE, FULLREG(ax)), %eax /* Read mem */               ;\
   ret                                                                       ;\
                                                                             ;\
ext_load_vram##rtype:                                                        ;\
@@ -367,165 +405,155 @@ ext_load_vram##rtype:                                   ;\
   jb 1f                                                                     ;\
   sub $0x8000, %eax                            /* Mirror last bank */       ;\
 1:                                                                          ;\
-  movop VRAM_OFF(%ebx, %eax), %eax             /* Read mem */               ;\
+  movop VRAM_OFF(REG_BASE, FULLREG(ax)), %eax  /* Read mem */               ;\
   ret                                                                       ;\
                                                                             ;\
ext_load_oam##rtype:                                                         ;\
   and $(0x3FF & addrm), %eax                   /* Addr wrap */              ;\
-  movop OAM_RAM_OFF(%ebx, %eax), %eax          /* Read mem */               ;\
+  movop OAM_RAM_OFF(REG_BASE, FULLREG(ax)), %eax  /* Read mem */            ;\
   ret                                                                       ;\
                                                                             ;\
ext_load_palette##rtype:                                                     ;\
   and $(0x3FF & addrm), %eax                   /* Addr wrap */              ;\
-  movop PALETTE_RAM_OFF(%ebx, %eax), %eax      /* Read mem */               ;\
+  movop PALETTE_RAM_OFF(REG_BASE, FULLREG(ax)), %eax  /* Read mem */        ;\
   ret                                                                       ;\
                                                                             ;\
ext_load_io##rtype:                                                          ;\
   and $(0x3FF & addrm), %eax                   /* Addr wrap */              ;\
-  movop IORAM_OFF(%ebx, %eax), %eax            /* Read mem */               ;\
+  movop IORAM_OFF(REG_BASE, FULLREG(ax)), %eax /* Read mem */               ;\
   ret                                                                       ;\
                                                                             ;\
ext_load_rom##rtype:                                                         ;\
   mov %eax, %ecx                               /* ecx = address */          ;\
   shr $15, %ecx                                /* ecx = address >> 15 */    ;\
-  mov RDMAP_OFF(%ebx, %ecx, 4), %edx           /* Read rdmap pointer */     ;\
+  /* Read rdmap pointer */                                                  ;\
+  mov RDMAP_OFF(REG_BASE, FULLREG(cx), ADDR_SIZE_BYTES), FULLREG(dx)        ;\
   mov %eax, %ecx                               /* ecx = address */          ;\
   and $0x7FFF, %ecx                            /* ecx = address LSB */      ;\
-  movop (%edx, %ecx), %eax                     /* Read mem */               ;\
+  movop (FULLREG(dx), FULLREG(cx)), %eax       /* Read mem */               ;\
   ret                                                                       ;\
                                                                             ;\
ext_load_slow##rtype:                                                        ;\
-  jmp slowfn                                                                ;\
+  SETUP_ARGS                                                                ;\
+  CALL_FUNC(slowfn)                                                         ;\
+  ret                                                                       ;\
 
-load_stubs(u32, mov, ~3, 2, _read_memory32)
-load_stubs(u16, movzwl, ~1, 1, _read_memory16)
-load_stubs(s16, movswl, ~1, 1, _read_memory16s)
-load_stubs( u8, movzbl, ~0, 0, _read_memory8)
-load_stubs( s8, movsbl, ~0, 0, _read_memory8s)
+load_stubs(u32, mov, ~3, 2, read_memory32)
+load_stubs(u16, movzwl, ~1, 1, read_memory16)
+load_stubs(s16, movswl, ~1, 1, read_memory16s)
+load_stubs( u8, movzbl, ~0, 0, read_memory8)
+load_stubs( s8, movsbl, ~0, 0, read_memory8s)
 
-# %eax = new_cpsr
-# %edx = store_mask
-
+# arg0 (%eax) = new_cpsr
+# arg1 (%edx) = store_mask
 defsymbl(execute_store_cpsr)
-  mov %edx, REG_SAVE(%ebx)              # save store_mask
+  mov %edx, REG_SAVE(REG_BASE)          # save store_mask
   mov %eax, %ecx                        # ecx = new_cpsr
   and %edx, %ecx                        # ecx = new_cpsr & store_mask
-  mov REG_CPSR(%ebx), %eax              # eax = cpsr
+  mov REG_CPSR(REG_BASE), %eax          # eax = cpsr
   not %edx                              # edx = ~store_mask
   and %edx, %eax                        # eax = cpsr & ~store_mask
   or %ecx, %eax                         # eax = new cpsr combined with old
+  mov %eax, REG_CPSR(REG_BASE)          # save new cpsr to register
 
-  call _execute_store_cpsr_body         # do the dirty work in this C function
-
+  CALL_FUNC(execute_store_cpsr_body)    # do the dirty work in this C function
   extract_flags                         # pull out flag vars from new CPSR
 
   cmp $0, %eax                          # see if return value is 0
-  jnz changed_pc_cpsr                   # might have changed the PC
-
+  jnz 1f                                # might have changed the PC
   ret                                   # return
 
+1: # PC has changed, due to IRQ triggered
+  mov %eax, CARG1_REG                   # Returned addr from C function
+  CALL_FUNC(block_lookup_address_arm)   # lookup new PC
+  add $ADDR_SIZE_BYTES, STACK_REG       # get rid of current return address
+  jmp *FULLREG(ax)
 
-changed_pc_cpsr:
-  add $4, %esp                          # get rid of current return address
-  call _block_lookup_address_arm        # lookup new PC
-  jmp *%eax
+# On writes that overwrite code, cache is flushed and execution re-started
 
 smc_write:
-  call _flush_translation_cache_ram
-
+  CALL_FUNC(flush_translation_cache_ram)
 
 lookup_pc:
-  add $4, %esp                          # Can't return, discard addr
-  movl $0, CHANGED_PC_STATUS(%ebx)      # Lookup new block and jump to it
-  mov REG_PC(%ebx), %eax
-  testl $0x20, REG_CPSR(%ebx)
-  jz lookup_pc_arm
+  movl $0, CHANGED_PC_STATUS(REG_BASE)  # Lookup new block and jump to it
+  mov REG_PC(REG_BASE), CARG1_REG       # Load PC as argument0
+  testl $0x20, REG_CPSR(REG_BASE)
+  jz 1f
+### Thumb mode
+  CALL_FUNC(block_lookup_address_thumb)
+  add $ADDR_SIZE_BYTES, STACK_REG       # Can't return, discard addr
+  jmp *FULLREG(ax)
+1: # ARM mode
+  CALL_FUNC(block_lookup_address_arm)
+  add $ADDR_SIZE_BYTES, STACK_REG       # Can't return, discard addr
+  jmp *FULLREG(ax)
 
-lookup_pc_thumb:
-  call _block_lookup_address_thumb
-  jmp *%eax
-
-lookup_pc_arm:
-  call _block_lookup_address_arm
-  jmp *%eax
 
-# eax: cycle counter
+# Called from C, args are platform dependent :/
+# arg0 (eax/edi/ecx): cycle counter
+# arg1 (edx/rsi/rdx): reg base pointer
 defsymbl(execute_arm_translate_internal)
 
   # Save main context, since we need to return gracefully
-  pushl %ebx
-  pushl %esi
-  pushl %edi
-  pushl %ebp
+  SAVE_REGISTERS                        # Pushes 16 or 32 bytes
+  # The stack here is aligned to 16 bytes minus 4 or 8 bytes.
-  movl %edx, %ebx                       # load base register (arg1)
-  extract_flags                         # load flag variables
-  movl %eax, REG_CYCLES                 # load cycle counter (arg0)
+  mov CARG1_REG, REG_CYCLES             # load cycle counter (arg0)
+  mov CARG2_REGPTR, REG_BASE            # load base register (arg1)
 
-  movl REG_PC(%ebx), %eax               # load PC
+  extract_flags                         # load flag variables
 
   # (if the CPU is halted, do not start executing but
   # loop in the alert loop until it wakes up)
-  cmpl $0, CPU_HALT_STATE(%ebx)
+  cmpl $0, CPU_HALT_STATE(REG_BASE)
   je 1f
 
-  call alert_loop                       # Need to push something to the stack
-
+  call alert_loop                       # Need to push something to the stack
+
 1:
-  testl $0x20, REG_CPSR(%ebx)
-  jnz 2f
-
-  call _block_lookup_address_arm
-  jmp *%eax                             # jump to it
-
-2:
-  call _block_lookup_address_thumb
-  jmp *%eax
+  call lookup_pc                        # Go fetch and execute PC
 
 return_to_main:
-  add $4, %esp                          # remove current return addr
-  popl %ebp
-  popl %edi
-  popl %esi
-  popl %ebx
+  add $ADDR_SIZE_BYTES, STACK_REG       # remove current return addr
+  REST_REGISTERS                        # Restore saved registers
   ret
 
 #define load_table(atype)                                                   ;\
-  .long ext_load_bios##atype          /* 0x00 BIOS          */             ;\
-  .long ext_load_slow##atype          /* 0x01 open read     */             ;\
-  .long ext_load_ewram##atype         /* 0x02 EWRAM         */             ;\
-  .long ext_load_iwram##atype         /* 0x03 IWRAM         */             ;\
-  .long ext_load_io##atype            /* 0x04 I/O registers */             ;\
-  .long ext_load_palette##atype       /* 0x05 Palette RAM   */             ;\
-  .long ext_load_vram##atype          /* 0x06 VRAM          */             ;\
-  .long ext_load_oam##atype           /* 0x07 OAM RAM       */             ;\
-  .long ext_load_rom##atype           /* 0x08 gamepak (or RTC) */          ;\
-  .long ext_load_rom##atype           /* 0x09 gamepak       */             ;\
-  .long ext_load_rom##atype           /* 0x0A gamepak       */             ;\
-  .long ext_load_rom##atype           /* 0x0B gamepak       */             ;\
-  .long ext_load_rom##atype           /* 0x0C gamepak       */             ;\
-  .long ext_load_slow##atype          /* 0x0D EEPROM (possibly) */         ;\
-  .long ext_load_slow##atype          /* 0x0E Flash ROM/SRAM */            ;\
-  .long ext_load_slow##atype          /* 0x0F open read     */             ;\
+  ADDR_TYPE ext_load_bios##atype       /* 0x00 BIOS          */             ;\
+  ADDR_TYPE ext_load_slow##atype       /* 0x01 open read     */             ;\
+  ADDR_TYPE ext_load_ewram##atype      /* 0x02 EWRAM         */             ;\
+  ADDR_TYPE ext_load_iwram##atype      /* 0x03 IWRAM         */             ;\
+  ADDR_TYPE ext_load_io##atype         /* 0x04 I/O registers */             ;\
+  ADDR_TYPE ext_load_palette##atype    /* 0x05 Palette RAM   */             ;\
+  ADDR_TYPE ext_load_vram##atype       /* 0x06 VRAM          */             ;\
+  ADDR_TYPE ext_load_oam##atype        /* 0x07 OAM RAM       */             ;\
+  ADDR_TYPE ext_load_rom##atype        /* 0x08 gamepak (or RTC) */          ;\
+  ADDR_TYPE ext_load_rom##atype        /* 0x09 gamepak       */             ;\
+  ADDR_TYPE ext_load_rom##atype        /* 0x0A gamepak       */             ;\
+  ADDR_TYPE ext_load_rom##atype        /* 0x0B gamepak       */             ;\
+  ADDR_TYPE ext_load_rom##atype        /* 0x0C gamepak       */             ;\
+  ADDR_TYPE ext_load_slow##atype       /* 0x0D EEPROM (possibly) */         ;\
+  ADDR_TYPE ext_load_slow##atype       /* 0x0E Flash ROM/SRAM */            ;\
+  ADDR_TYPE ext_load_slow##atype       /* 0x0F open read     */             ;\
 
 #define store_table(asize)                                                  ;\
-  .long ext_store_ignore              /* 0x00 BIOS, ignore  */             ;\
-  .long ext_store_ignore              /* 0x01 invalid, ignore */           ;\
-  .long ext_store_ewram##asize        /* 0x02 EWRAM         */             ;\
-  .long ext_store_iwram##asize        /* 0x03 IWRAM         */             ;\
-  .long ext_store_io##asize           /* 0x04 I/O registers */             ;\
-  .long ext_store_palette##asize      /* 0x05 Palette RAM   */             ;\
-  .long ext_store_vram##asize         /* 0x06 VRAM          */             ;\
-  .long ext_store_oam##asize          /* 0x07 OAM RAM       */             ;\
-  .long ext_store_rtc##asize          /* 0x08 gamepak (RTC or ignore) */   ;\
-  .long ext_store_ignore              /* 0x09 gamepak, ignore */           ;\
-  .long ext_store_ignore              /* 0x0A gamepak, ignore */           ;\
-  .long ext_store_ignore              /* 0x0B gamepak, ignore */           ;\
-  .long ext_store_ignore              /* 0x0C gamepak, ignore */           ;\
-  .long ext_store_eeprom              /* 0x0D EEPROM (possibly) */         ;\
-  .long ext_store_backup##asize       /* 0x0E Flash ROM/SRAM */            ;\
-  .long ext_store_ignore              /* 0x0F ignore        */             ;\
+  ADDR_TYPE ext_store_ignore           /* 0x00 BIOS, ignore  */             ;\
+  ADDR_TYPE ext_store_ignore           /* 0x01 invalid, ignore */           ;\
+  ADDR_TYPE ext_store_ewram##asize     /* 0x02 EWRAM         */             ;\
+  ADDR_TYPE ext_store_iwram##asize     /* 0x03 IWRAM         */             ;\
+  ADDR_TYPE ext_store_io##asize        /* 0x04 I/O registers */             ;\
+  ADDR_TYPE ext_store_palette##asize   /* 0x05 Palette RAM   */             ;\
+  ADDR_TYPE ext_store_vram##asize      /* 0x06 VRAM          */             ;\
+  ADDR_TYPE ext_store_oam##asize       /* 0x07 OAM RAM       */             ;\
+  ADDR_TYPE ext_store_rtc##asize       /* 0x08 gamepak (RTC or ignore) */   ;\
+  ADDR_TYPE ext_store_ignore           /* 0x09 gamepak, ignore */           ;\
+  ADDR_TYPE ext_store_ignore           /* 0x0A gamepak, ignore */           ;\
+  ADDR_TYPE ext_store_ignore           /* 0x0B gamepak, ignore */           ;\
+  ADDR_TYPE ext_store_ignore           /* 0x0C gamepak, ignore */           ;\
+  ADDR_TYPE ext_store_eeprom           /* 0x0D EEPROM (possibly) */         ;\
+  ADDR_TYPE ext_store_backup##asize    /* 0x0E Flash ROM/SRAM */            ;\
+  ADDR_TYPE ext_store_ignore           /* 0x0F ignore        */             ;\
 
 .data
+.align 16
 defsymbl(x86_table_data)
   load_table(u8)
 
@@ -538,29 +566,29 @@ defsymbl(x86_table_data)
   store_table(32)
 
 # aligned word writes (non SMC signaling)
-  .long ext_store_ignore              # 0x00 BIOS, ignore
-  .long ext_store_ignore              # 0x01 invalid, ignore
-  .long ext_store_aligned_ewram32     # 0x02 EWRAM
-  .long ext_store_aligned_iwram32     # 0x03 IWRAM
-  .long ext_store_io32                # 0x04 I/O registers
-  .long ext_store_palette32           # 0x05 Palette RAM
-  .long ext_store_vram32              # 0x06 VRAM
-  .long ext_store_oam32               # 0x07 OAM RAM
-  .long ext_store_ignore              # 0x08 gamepak, ignore (no RTC in 32bit)
-  .long ext_store_ignore              # 0x09 gamepak, ignore
-  .long ext_store_ignore              # 0x0A gamepak, ignore
-  .long ext_store_ignore              # 0x0B gamepak, ignore
-  .long ext_store_ignore              # 0x0C gamepak, ignore
-  .long ext_store_eeprom              # 0x0D EEPROM (possibly)
-  .long ext_store_ignore              # 0x0E Flash ROM/SRAM must be 8bit
-  .long ext_store_ignore              # 0x0F ignore
+  ADDR_TYPE ext_store_ignore           # 0x00 BIOS, ignore
+  ADDR_TYPE ext_store_ignore           # 0x01 invalid, ignore
+  ADDR_TYPE ext_store_aligned_ewram32  # 0x02 EWRAM
+  ADDR_TYPE ext_store_aligned_iwram32  # 0x03 IWRAM
+  ADDR_TYPE ext_store_io32             # 0x04 I/O registers
+  ADDR_TYPE ext_store_palette32        # 0x05 Palette RAM
+  ADDR_TYPE ext_store_vram32           # 0x06 VRAM
+  ADDR_TYPE ext_store_oam32            # 0x07 OAM RAM
+  ADDR_TYPE ext_store_ignore           # 0x08 gamepak, ignore (no RTC in 32bit)
+  ADDR_TYPE ext_store_ignore           # 0x09 gamepak, ignore
+  ADDR_TYPE ext_store_ignore           # 0x0A gamepak, ignore
+  ADDR_TYPE ext_store_ignore           # 0x0B gamepak, ignore
+  ADDR_TYPE ext_store_ignore           # 0x0C gamepak, ignore
+  ADDR_TYPE ext_store_eeprom           # 0x0D EEPROM (possibly)
+  ADDR_TYPE ext_store_ignore           # 0x0E Flash ROM/SRAM must be 8bit
+  ADDR_TYPE ext_store_ignore           # 0x0F ignore
 
 .bss
 .align 64
 
 defsymbl(x86_table_info)
-  .space 9*4*16
+  .space 9*16*ADDR_SIZE_BYTES
 defsymbl(reg)
   .space 0x100
 defsymbl(palette_ram)
@@ -579,11 +607,12 @@ defsymbl(io_registers)
   .space 0x400
 defsymbl(spsr)
   .space 24
+  .space 8   # padding
 defsymbl(reg_mode)
   .space 196
-  .space 36  # padding
+  .space 28  # padding
 defsymbl(memory_map_read)
-  .space 0x8000
+  .space 8*1024*ADDR_SIZE_BYTES
 
 #ifndef MMAP_JIT_CACHE
 #error "x86 dynarec builds *require* MMAP_JIT_CACHE"
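
For reference, a minimal usage sketch of the new JIT-cache helpers follows. It
is illustrative only: the cache size, the caller function names and the error
handling are assumptions made for this example, not part of the patch itself.
map_jit_block() and unmap_jit_block() have the signatures shown in memmap.c
above.

    /* Hypothetical caller of the JIT cache mapping helpers (not in the patch).
     * map_jit_block probes addresses near this binary's .text so that rel32
     * (+/- 2GB) jumps/calls between the JIT arena and the stubs keep working. */
    #include <stdio.h>

    #include "memmap.h"

    #define JIT_CACHE_SIZE (8 * 1024 * 1024)   /* assumed size for this sketch */

    static void *jit_cache;

    int init_jit_cache(void)
    {
      jit_cache = map_jit_block(JIT_CACHE_SIZE);   /* RWX, hopefully near .text */
      if (!jit_cache) {
        fprintf(stderr, "Could not map the JIT cache\n");
        return -1;
      }
      return 0;
    }

    void shutdown_jit_cache(void)
    {
      if (jit_cache)
        unmap_jit_block(jit_cache, JIT_CACHE_SIZE);
      jit_cache = 0;
    }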