[x86/x64] Add support for x86-64 and improve 32-bit mode too.

This adds support for the x86-64 dynarec, on both Windows and Linux.
Since the two platforms have different requirements, there's some macro
magic in the stubs file.
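
As a rough, hedged illustration of what the macros have to absorb (the
helper name below is hypothetical and the attribute is only inferred from
the regparm=2 note in the stubs file):

#if defined(__i386__)
  /* 32-bit builds: first two arguments travel in eax/edx (regparm=2) */
  #define function_cc __attribute__((regparm(2)))
#else
  /* 64-bit builds: the native ABI already passes arguments in registers
     (rdi/rsi on SysV, rcx/rdx on Win64) */
  #define function_cc
#endif

/* Hypothetical helper signature showing where the attribute goes. */
unsigned function_cc example_helper(unsigned address, unsigned value);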

This also fixes x86 support in some cases: stack alignment requirements
were violated all over the place. Fixing this allows clang to be used as
the compiler (clang tends to use SSE instructions more often than gcc
does).
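
A minimal sketch (not part of this commit) of why the misalignment bit
clang harder than gcc: the SysV i386 ABI promises a 16-byte aligned stack
at call boundaries, and clang readily emits aligned SSE stores (movaps)
for stack temporaries under that promise, so entering a C helper from the
JIT with a misaligned %esp faults.

typedef struct { float v[4]; } vec4_t;

vec4_t splat4(float x)
{
  /* clang may build this with a single 16-byte movaps into a stack slot
     whose alignment is computed relative to the incoming stack pointer. */
  vec4_t t = { { x, x, x, x } };
  return t;
}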

To support this I also reworked the mmap/VirtualAlloc magic to make sure
the JIT arena stays close to .text.
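
A hedged sketch (not from the patch) of the constraint this enforces:
near calls/jumps between the JIT arena and .text encode a signed 32-bit
displacement, so the two must sit within roughly +/-2GB of each other.
The probing loops in the diff below simply try addresses around the code
until a mapping that satisfies this succeeds.

#include <stdbool.h>
#include <stdint.h>

static bool rel32_reachable(const void *from, const void *to)
{
  /* Ignores the few bytes of instruction-length fudge; good enough as a
     sanity check for arena placement. */
  intptr_t delta = (const char *)to - (const char *)from;
  return delta >= INT32_MIN && delta <= INT32_MAX;
}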

Fixed some other minor issues and removed some unnecessary JIT code here
and there. clang tends to make some (possibly wrong) assumptions about
global symbol alignment.
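
Illustrative only (the extern shapes below are assumptions, not copied
from the sources): globals such as spsr and reg_mode are laid out by hand
in the assembly file's .bss, and clang may touch them with vector stores
that expect more alignment than the old layout guaranteed, which is what
the added padding in the diff appears to address.

#include <assert.h>
#include <stdint.h>

extern uint32_t spsr[6];         /* assumed to match ".space 24" */
extern uint32_t reg_mode[7][7];  /* assumed to match ".space 196" */

static void check_asm_layout(void)
{
  /* The kind of invariant the moved ".space 8  # padding" seems meant to keep. */
  assert(((uintptr_t)reg_mode & 15) == 0);
}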
David Guillen Fandos 2021-11-06 12:17:50 +01:00
parent d63fea580c
commit e3d5ca8419
5 changed files with 312 additions and 226 deletions

View File

@ -21,19 +21,21 @@ endif
ifeq ($(firstword $(filter x86_64,$(UNAME))),x86_64)
HAVE_DYNAREC := 1
CPU_ARCH := x86_32
else ifeq ($(firstword $(filter amd64,$(UNAME))),amd64)
HAVE_DYNAREC := 1
CPU_ARCH := x86_32
else ifeq ($(firstword $(filter x86,$(UNAME))),x86)
FORCE_32BIT_ARCH = 1
HAVE_DYNAREC := 1
CPU_ARCH := x86_32
endif
FORCE_32BIT :=
ifeq ($(FORCE_32BIT_ARCH),1)
HAVE_DYNAREC := 1
FORCE_32BIT := -m32
CPU_ARCH := x86_32
endif
# system platform

View File

@ -2937,7 +2937,7 @@ block_data_type block_data[MAX_BLOCK_SIZE];
block_exit_type block_exits[MAX_EXITS];
#define smc_write_arm_yes() { \
int offset = (pc < 0x03000000) ? 0x40000 : -0x8000; \
intptr_t offset = (pc < 0x03000000) ? 0x40000 : -0x8000; \
if(address32(pc_address_block, (block_end_pc & 0x7FFF) + offset) == 0) \
{ \
address32(pc_address_block, (block_end_pc & 0x7FFF) + offset) = \
@ -2946,7 +2946,7 @@ block_exit_type block_exits[MAX_EXITS];
}
#define smc_write_thumb_yes() { \
int offset = (pc < 0x03000000) ? 0x40000 : -0x8000; \
intptr_t offset = (pc < 0x03000000) ? 0x40000 : -0x8000; \
if(address16(pc_address_block, (block_end_pc & 0x7FFF) + offset) == 0) \
{ \
address16(pc_address_block, (block_end_pc & 0x7FFF) + offset) = \

View File

@ -1,6 +1,25 @@
#include <stdint.h>
#include "memmap.h"
// The JIT cache buffer is allocated via mmap (or win equivalent) so that it
// can be RWX. On top of that, we need the buffer to be "close" to the text
// segment, so that we can perform jumps between the two code blocks.
// Android and some other platforms discourage the usage of sections in the
// binary (i.e. on-disk ELF) that are marked as executable and writable for
// security reasons. Therefore we prefer to use mmap even though it can be
// tricky to map correctly.
// To map a block close to the code, we take the function address as a proxy
// of the text section address, and try to map the cache next to it. This is
// an iterative process of trial and error that is hopefully successful.
// x86-64 has a +/- 2GB offset requirement.
// ARM64 has a +/-128MB offset requirement.
// ARM32 has a +/- 32MB offset requirement (gpsp does not require this).
// MIPS requires blocks to be in the same 256MB region (identical 4 MSBs)
#ifdef MMAP_JIT_CACHE
#ifdef WIN32
@ -9,7 +28,21 @@
#include <io.h>
void *map_jit_block(unsigned size) {
return VirtualAlloc(0, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
unsigned i;
uintptr_t base = (uintptr_t)(map_jit_block) & (~0xFFFFFULL);
for (i = 0; i < 256; i++) {
int offset = ((i & 1) ? 1 : -1) * (i >> 1) * 1024 * 1024;
uintptr_t baddr = base + (intptr_t)offset;
if (!baddr)
continue; // Do not map NULL, bad things happen :)
void *p = VirtualAlloc((void*)baddr, size, MEM_COMMIT|MEM_RESERVE, PAGE_EXECUTE_READWRITE);
if (p == (void*)baddr)
return p;
if (p)
VirtualFree(p, 0, MEM_RELEASE);
}
return 0;
}
void unmap_jit_block(void *bufptr, unsigned size) {
@ -22,7 +55,22 @@
// Posix implementation
void *map_jit_block(unsigned size) {
return mmap(0, size, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON | MAP_PRIVATE, -1, 0);
unsigned i;
uintptr_t base = (uintptr_t)(map_jit_block) & (~0xFFFFFULL);
for (i = 0; i < 256; i++) {
int offset = ((i & 1) ? 1 : -1) * (i >> 1) * 1024 * 1024;
uintptr_t baddr = base + (intptr_t)offset;
if (!baddr)
continue; // Do not map NULL, bad things happen :)
void *p = mmap((void*)baddr, size, PROT_READ|PROT_WRITE|PROT_EXEC,
MAP_ANON|MAP_PRIVATE, -1, 0);
if (p == (void*)baddr)
return p;
if (p)
munmap(p, size);
}
return 0;
}
void unmap_jit_block(void *bufptr, unsigned size) {
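
A hedged usage sketch (not part of the diff) for the probing allocator
above; the cache pointer and function names here are illustrative.

#include <stddef.h>

extern void *map_jit_block(unsigned size);
extern void unmap_jit_block(void *bufptr, unsigned size);

static void *jit_cache_base;

int jit_cache_init(unsigned size)
{
  jit_cache_base = map_jit_block(size);
  return (jit_cache_base != NULL) ? 0 : -1;  /* -1: no nearby RWX block found */
}

void jit_cache_shutdown(unsigned size)
{
  if (jit_cache_base)
    unmap_jit_block(jit_cache_base, size);
  jit_cache_base = NULL;
}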

View File

@ -180,7 +180,7 @@ typedef enum
} x86_condition_codes;
#define x86_relative_offset(source, offset, next) \
((u32)offset - ((u32)source + next)) \
((u32)((uintptr_t)offset - ((uintptr_t)source + next))) \
#define x86_unequal_operands(op_a, op_b) \
(x86_reg_number_##op_a != x86_reg_number_##op_b) \
@ -404,6 +404,17 @@ typedef enum
#define reg_t0 esi
#define reg_rv eax
#if defined(_WIN64)
#define reg_arg0 ecx
#define reg_arg1 edx
#elif defined(__x86_64__) || defined(__amd64__)
#define reg_arg0 edi
#define reg_arg1 esi
#else
#define reg_arg0 eax
#define reg_arg1 edx
#endif
/* Offsets from reg_base, see stub.S */
#define SPSR_BASE_OFF 0xA9100
@ -588,11 +599,11 @@ typedef enum
#define generate_indirect_branch_cycle_update(type) \
generate_cycle_update(); \
x86_emit_jmp_offset(x86_relative_offset(translation_ptr, \
x86_emit_call_offset(x86_relative_offset(translation_ptr, \
x86_indirect_branch_##type, 4)) \
#define generate_indirect_branch_no_cycle_update(type) \
x86_emit_jmp_offset(x86_relative_offset(translation_ptr, \
x86_emit_call_offset(x86_relative_offset(translation_ptr, \
x86_indirect_branch_##type, 4)) \
#define block_prologue_size 0
@ -663,8 +674,8 @@ typedef enum
}
#define emit_trace_instruction(pc, mode) \
x86_emit_mov_reg_imm(reg_a0, pc); \
x86_emit_mov_reg_imm(reg_a1, mode); \
x86_emit_mov_reg_imm(reg_arg0, pc); \
x86_emit_mov_reg_imm(reg_arg1, mode); \
generate_function_call(trace_instruction);
#define emit_trace_arm_instruction(pc) \
emit_trace_instruction(pc, 1)
@ -1062,9 +1073,8 @@ u32 function_cc execute_spsr_restore(u32 address)
generate_store_reg(ireg, reg_index); \
if(reg_index == 15) \
{ \
generate_mov(a0, ireg); \
generate_mov(arg0, ireg); \
generate_function_call(execute_spsr_restore); \
generate_mov(a0, rv); \
generate_indirect_branch_dual(); \
} \
@ -1355,18 +1365,17 @@ u32 function_cc execute_spsr_restore(u32 address)
// store_mask and address are stored in the SAVE slots, since there's no real
// register space to nicely pass them.
u32 function_cc execute_store_cpsr_body(u32 _cpsr)
u32 execute_store_cpsr_body()
{
reg[REG_CPSR] = _cpsr;
if(reg[REG_SAVE] & 0xFF)
{
set_cpu_mode(cpu_modes[_cpsr & 0x1F]);
set_cpu_mode(cpu_modes[reg[REG_CPSR] & 0x1F]);
if((io_registers[REG_IE] & io_registers[REG_IF]) &&
io_registers[REG_IME] && ((_cpsr & 0x80) == 0))
io_registers[REG_IME] && ((reg[REG_CPSR] & 0x80) == 0))
{
reg_mode[MODE_IRQ][6] = reg[REG_SAVE2] + 4;
spsr[MODE_IRQ] = _cpsr;
reg[REG_CPSR] = (_cpsr & 0xFFFFFF00) | 0xD2;
spsr[MODE_IRQ] = reg[REG_CPSR];
reg[REG_CPSR] = (reg[REG_CPSR] & 0xFFFFFF00) | 0xD2;
set_cpu_mode(MODE_IRQ);
return 0x00000018;
}
@ -1518,7 +1527,6 @@ u32 function_cc execute_store_cpsr_body(u32 _cpsr)
#define arm_block_memory_adjust_pc_load() \
if(reg_list & 0x8000) \
{ \
generate_mov(a0, rv); \
generate_indirect_branch_arm(); \
} \
@ -1865,7 +1873,6 @@ u32 function_cc execute_store_cpsr_body(u32 _cpsr)
generate_load_pc(a1, pc); \
generate_function_call(execute_load_u32); \
generate_store_reg(rv, REG_PC); \
generate_mov(a0, rv); \
generate_indirect_branch_cycle_update(thumb) \
#define thumb_block_memory_extra_push_lr(base_reg) \
@ -2138,7 +2145,7 @@ static void function_cc execute_swi(u32 pc)
#define arm_swi() \
collapse_flags(a0, a1); \
generate_load_pc(a0, (pc + 4)); \
generate_load_pc(arg0, (pc + 4)); \
generate_function_call(execute_swi); \
generate_branch() \
@ -2182,7 +2189,7 @@ static void function_cc execute_swi(u32 pc)
#define thumb_swi() \
collapse_flags(a0, a1); \
generate_load_pc(a0, (pc + 2)); \
generate_load_pc(arg0, (pc + 2)); \
generate_function_call(execute_swi); \
generate_branch_cycle_update( \
block_exits[block_exit_position].branch_source, \
@ -2233,8 +2240,8 @@ static void function_cc execute_swi(u32 pc)
generate_load_pc(a0, pc); \
generate_indirect_branch_no_cycle_update(type) \
extern u32 x86_table_data[9][16];
extern u32 x86_table_info[9][16];
extern void* x86_table_data[9][16];
extern void* x86_table_info[9][16];
void init_emitter(void) {
memcpy(x86_table_info, x86_table_data, sizeof(x86_table_data));

View File

@ -26,28 +26,56 @@
symbol: \
_##symbol:
#ifndef _WIN32
# External symbols (data + functions)
#define _update_gba update_gba
#define _block_lookup_address_arm block_lookup_address_arm
#define _block_lookup_address_thumb block_lookup_address_thumb
#define _block_lookup_address_dual block_lookup_address_dual
#define _write_io_register8 write_io_register8
#define _write_io_register16 write_io_register16
#define _write_io_register32 write_io_register32
#define _flush_translation_cache_ram flush_translation_cache_ram
#define _write_eeprom write_eeprom
#define _write_backup write_backup
#define _write_rtc write_rtc
#define _read_memory8 read_memory8
#define _read_memory8s read_memory8s
#define _read_memory16 read_memory16
#define _read_memory16s read_memory16s
#define _read_memory32 read_memory32
#define _execute_store_cpsr_body execute_store_cpsr_body
// The Windows 32-bit ABI prefixes C function symbols with an underscore
#if defined(_WIN32) && !defined(_WIN64)
#define fnm(name) _##name
#else
#define fnm(name) name
#endif
.extern _spsr
// Calling conventions (and register allocations) differ across platforms, which makes this tricky.
// All functions in this file are called manually from the JIT arena (unless stated
// otherwise), where we use our own calling convention. However, calls to C code must
// follow the platform's calling convention. x86 is built with regparm=2 to avoid stack usage.
#if defined(__x86_64__) || defined(__amd64__)
#define ADDR_TYPE .quad
#define ADDR_SIZE_BYTES 8
#define STACK_REG %rsp
#define FULLREG(rn) %r##rn
#define SAVE_REGISTERS push %rbx; push %rsi; push %rdi; push %rbp
#define REST_REGISTERS pop %rbp; pop %rdi; pop %rsi; pop %rbx
#define REG_BASE %rbx
#ifdef _WIN64
#define CARG1_REG %ecx // Windows x64 ABI, of course different :D
#define CARG2_REG %edx
#define CARG2_REGPTR %rdx
#define CALL_FUNC(name) \
sub $32, %rsp; \
call fnm(name); \
add $32, %rsp
#else
#define CARG1_REG %edi // SystemV AMD64 ABI
#define CARG2_REG %esi
#define CARG2_REGPTR %rsi
#define CALL_FUNC(name) \
call fnm(name)
#endif
#define SETUP_ARGS mov %eax, CARG1_REG; mov %edx, CARG2_REG;
#else
#define ADDR_TYPE .long
#define ADDR_SIZE_BYTES 4
#define STACK_REG %esp
#define FULLREG(rn) %e##rn
#define SAVE_REGISTERS sub $8, %esp; push %ebx; push %esi; push %edi; push %ebp
#define REST_REGISTERS pop %ebp; pop %edi; pop %esi; pop %ebx; add $8, %esp;
#define REG_BASE %ebx
#define CARG1_REG %eax
#define CARG2_REG %edx
#define CARG2_REGPTR %edx
#define SETUP_ARGS
#define CALL_FUNC(name) \
call fnm(name)
#endif
.equ REG_SP, (13 * 4)
.equ REG_LR, (14 * 4)
@ -69,15 +97,16 @@ _##symbol:
.equ REG_SAVE4, (30 * 4)
.equ REG_SAVE5, (31 * 4)
.equ load_u8_tbl, -(9 * 16 * 4)
.equ load_s8_tbl, -(8 * 16 * 4)
.equ load_u16_tbl, -(7 * 16 * 4)
.equ load_s16_tbl, -(6 * 16 * 4)
.equ load_u32_tbl, -(5 * 16 * 4)
.equ store_u8_tbl, -(4 * 16 * 4)
.equ store_u16_tbl, -(3 * 16 * 4)
.equ store_u32_tbl, -(2 * 16 * 4)
.equ store_aligned_u32_tbl, -(1 * 16 * 4)
.equ load_u8_tbl, -(9 * 16 * ADDR_SIZE_BYTES)
.equ load_s8_tbl, -(8 * 16 * ADDR_SIZE_BYTES)
.equ load_u16_tbl, -(7 * 16 * ADDR_SIZE_BYTES)
.equ load_s16_tbl, -(6 * 16 * ADDR_SIZE_BYTES)
.equ load_u32_tbl, -(5 * 16 * ADDR_SIZE_BYTES)
.equ store_u8_tbl, -(4 * 16 * ADDR_SIZE_BYTES)
.equ store_u16_tbl, -(3 * 16 * ADDR_SIZE_BYTES)
.equ store_u32_tbl, -(2 * 16 * ADDR_SIZE_BYTES)
.equ store_aligned_u32_tbl, -(1 * 16 * ADDR_SIZE_BYTES)
.equ PALETTE_RAM_OFF, 0x0100
.equ PALETTE_RAM_CNV_OFF, 0x0500
.equ OAM_RAM_OFF, 0x0900
@ -93,7 +122,7 @@ _##symbol:
# destroys ecx and edx
.macro collapse_flag offset, shift
mov \offset(%ebx), %ecx
mov \offset(REG_BASE), %ecx
shl $\shift, %ecx
or %ecx, %edx
.endm
@ -104,7 +133,7 @@ _##symbol:
collapse_flag REG_Z_FLAG, 30
collapse_flag REG_C_FLAG, 29
collapse_flag REG_V_FLAG, 28
mov REG_CPSR(%ebx), %ecx
mov REG_CPSR(REG_BASE), %ecx
and $0xFF, %ecx
or %ecx, %edx
.endm
@ -112,14 +141,14 @@ _##symbol:
.macro collapse_flags
collapse_flags_no_update
mov %edx, REG_CPSR(%ebx)
mov %edx, REG_CPSR(REG_BASE)
.endm
.macro extract_flag shift, offset
mov REG_CPSR(%ebx), %edx
mov REG_CPSR(REG_BASE), %edx
shr $\shift, %edx
and $0x01, %edx
mov %edx, \offset(%ebx)
mov %edx, \offset(REG_BASE)
.endm
.macro extract_flags
@ -132,25 +161,21 @@ _##symbol:
# Process a hardware event. Since an interrupt might be
# raised we have to check if the PC has changed.
# eax: current address
st:
.asciz "u\n"
# arg0 (always in eax): current PC address
defsymbl(x86_update_gba)
mov %eax, REG_PC(%ebx) # current PC = eax
collapse_flags # update cpsr, trashes ecx and edx
mov %eax, REG_PC(REG_BASE) # current PC = eax
collapse_flags # update cpsr, trashes ecx and edx
call _update_gba # process the next event
CALL_FUNC(update_gba) # process the next event
mov %eax, REG_CYCLES # new cycle count
mov %eax, REG_CYCLES # new cycle count
# did we just complete a frame? go back to main then
cmpl $0, COMPLETED_FRAME(%ebx)
cmpl $0, COMPLETED_FRAME(REG_BASE)
jne return_to_main
# did the PC change?
cmpl $1, CHANGED_PC_STATUS(%ebx)
cmpl $1, CHANGED_PC_STATUS(REG_BASE)
je lookup_pc
ret # if not, go back to caller
@ -158,26 +183,33 @@ defsymbl(x86_update_gba)
# ARM code, IE anything that changes the PC in ARM mode except
# for BX and data processing to PC with the S bit set.
# eax: GBA address to branch to
# arg0 (always in eax): GBA address to branch to
defsymbl(x86_indirect_branch_arm)
call _block_lookup_address_arm
jmp *%eax
mov %eax, CARG1_REG
CALL_FUNC(block_lookup_address_arm)
add $ADDR_SIZE_BYTES, STACK_REG # remove current return addr
jmp *FULLREG(ax)
# For indirect branches that'll definitely go to Thumb. In
# Thumb mode any indirect branches except for BX.
# arg0 (always in eax): GBA address to branch to
defsymbl(x86_indirect_branch_thumb)
call _block_lookup_address_thumb
jmp *%eax
mov %eax, CARG1_REG
CALL_FUNC(block_lookup_address_thumb)
add $ADDR_SIZE_BYTES, STACK_REG # remove current return addr
jmp *FULLREG(ax)
# For indirect branches that can go to either Thumb or ARM,
# mainly BX (also data processing to PC with S bit set, be
# sure to adjust the target with a 1 in the lowest bit for this)
# arg0 (always in eax): GBA address to branch to
defsymbl(x86_indirect_branch_dual)
call _block_lookup_address_dual
jmp *%eax
mov %eax, CARG1_REG
CALL_FUNC(block_lookup_address_dual)
add $ADDR_SIZE_BYTES, STACK_REG # remove current return addr
jmp *FULLREG(ax)
# General ext memory routines
@ -192,12 +224,16 @@ ext_store_ignore:
ext_store_rtc16:
and $0xFFFF, %edx # make value 16bit
and $0xFF, %eax # mask address
jmp _write_rtc # write out RTC register
SETUP_ARGS # Setup addr, value
CALL_FUNC(write_rtc) # write out RTC register
ret
ext_store_backup8:
and $0xFF, %edx # make value 8bit
and $0xFFFF, %eax # mask address
jmp _write_backup # perform backup write
SETUP_ARGS # Setup addr, value
CALL_FUNC(write_backup) # perform backup write
ret
@ -210,14 +246,14 @@ write_epilogue:
je smc_write
alert_loop:
call _update_gba # process the next event
CALL_FUNC(update_gba) # process the next event
# did we just complete a frame? go back to main then
cmpl $0, COMPLETED_FRAME(%ebx)
cmpl $0, COMPLETED_FRAME(REG_BASE)
jne return_to_main
# see if the halt status has changed
mov CPU_HALT_STATE(%ebx), %edx
mov CPU_HALT_STATE(REG_BASE), %edx
cmp $0, %edx # 0 means it has
jnz alert_loop # if not go again
@ -229,7 +265,8 @@ no_alert:
ret
ext_store_eeprom:
jmp _write_eeprom # perform eeprom write
CALL_FUNC(write_eeprom) # perform eeprom write
ret
# Register wrapping for various sizes
@ -260,40 +297,41 @@ defsymbl(execute_##fname##_u##wsize) ;\
cmp $15, %ecx ;\
ja ext_store_ignore ;\
/* ecx = ext_store_u*_jtable[address >> 24] */ ;\
jmp *fname##_u##wsize##_tbl(%ebx, %ecx, 4) ;\
jmp *fname##_u##wsize##_tbl(REG_BASE, FULLREG(cx), ADDR_SIZE_BYTES) ;\
;\
ext_##fname##_iwram##wsize: ;\
and $(0x7FFF & addrm), %eax /* Addr wrap */ ;\
mov regfn(d), (IWRAM_OFF+0x8000)(%ebx, %eax) /* Actual write */ ;\
smc_check_##fname(opsuf, IWRAM_OFF(%ebx, %eax)) ;\
and $(0x7FFF & addrm), %eax /* Addr wrap */ ;\
mov regfn(d), (IWRAM_OFF+0x8000)(REG_BASE, FULLREG(ax)) /* Actual write */ ;\
smc_check_##fname(opsuf, IWRAM_OFF(REG_BASE, FULLREG(ax))) ;\
ret ;\
;\
ext_##fname##_ewram##wsize: ;\
and $(0x3FFFF & addrm), %eax /* Addr wrap */ ;\
mov regfn(d), EWRAM_OFF(%ebx, %eax) /* Actual write */ ;\
smc_check_##fname(opsuf, (EWRAM_OFF+0x40000)(%ebx, %eax)) ;\
and $(0x3FFFF & addrm), %eax /* Addr wrap */ ;\
mov regfn(d), EWRAM_OFF(REG_BASE, FULLREG(ax)) /* Actual write */ ;\
smc_check_##fname(opsuf, (EWRAM_OFF+0x40000)(REG_BASE, FULLREG(ax))) ;\
ret ;\
;\
ext_##fname##_vram##wsize: ;\
and $(0x1FFFE & addrm), %eax /* Addr wrap */ ;\
dup8fn() /* Double byte for 8b access */ ;\
cmp $0x18000, %eax /* Weird 96KB mirror */ ;\
and $(0x1FFFE & addrm), %eax /* Addr wrap */ ;\
dup8fn() /* Double byte for 8b access */ ;\
cmp $0x18000, %eax /* Weird 96KB mirror */ ;\
jb 1f ;\
sub $0x8000, %eax /* Mirror last bank */ ;\
sub $0x8000, %eax /* Mirror last bank */ ;\
1: ;\
mov regfn16(d), VRAM_OFF(%ebx, %eax) /* Actual write */ ;\
mov regfn16(d), VRAM_OFF(REG_BASE, FULLREG(ax)) /* Actual write */ ;\
ret ;\
;\
ext_##fname##_oam##wsize: ;\
and $(0x3FE & addrm), %eax /* Addr wrap */ ;\
movl $1, OAM_UPDATED(%ebx) /* flag OAM update */ ;\
dup8fn() /* Double byte for 8b access */ ;\
mov regfn16(d), OAM_RAM_OFF(%ebx, %eax) /* Actual write */ ;\
and $(0x3FE & addrm), %eax /* Addr wrap */ ;\
movl $1, OAM_UPDATED(REG_BASE) /* flag OAM update */ ;\
dup8fn() /* Double byte for 8b access */ ;\
mov regfn16(d), OAM_RAM_OFF(REG_BASE, FULLREG(ax)) /* Actual write */ ;\
ret ;\
;\
ext_##fname##_io##wsize: ;\
and $(0x3FF & addrm), %eax /* Addr wrap */ ;\
call _write_io_register##wsize /* Call C code */ ;\
SETUP_ARGS ;\
CALL_FUNC(write_io_register##wsize) /* Call C code */ ;\
jmp write_epilogue /* Might need an update */ ;\
@ -312,7 +350,7 @@ ext_store_palette8:
ext_store_palette16:
and $0x3FF, %eax # wrap around address
ext_store_palette16b: # entry point for 8bit write
mov %dx, PALETTE_RAM_OFF(%ebx, %eax) # write out palette value
mov %dx, PALETTE_RAM_OFF(REG_BASE, FULLREG(ax)) # write out palette value
mov %edx, %ecx # cx = dx
shl $11, %ecx # cx <<= 11 (red component is in high bits)
mov %dh, %cl # bottom bits of cx = top bits of dx
@ -321,7 +359,7 @@ ext_store_palette16b: # entry point for 8bit write
shl $1, %dx # make green component 6bits
or %edx, %ecx # combine green component into ecx
# write out the freshly converted palette value
mov %cx, PALETTE_RAM_CNV_OFF(%ebx, %eax)
mov %cx, PALETTE_RAM_CNV_OFF(REG_BASE, FULLREG(ax))
ret # done
ext_store_palette32:
@ -345,20 +383,20 @@ defsymbl(execute_load_##rtype) ;\
and $((1<<(8+albits))-1), %ecx /* preserve align+msb */ ;\
cmp $15, %ecx ;\
ja ext_load_slow##rtype ;\
jmp *load_##rtype##_tbl(%ebx, %ecx, 4) ;\
jmp *load_##rtype##_tbl(REG_BASE, FULLREG(cx), ADDR_SIZE_BYTES) ;\
;\
ext_load_bios##rtype: ;\
mov %edx, REG_PC(%ebx) /* Store current PC */ ;\
mov %edx, REG_PC(REG_BASE) /* Store current PC */ ;\
jmp ext_load_slow##rtype ;\
;\
ext_load_iwram##rtype: ;\
and $(0x7FFF & addrm), %eax /* Addr wrap */ ;\
movop (IWRAM_OFF+0x8000)(%ebx, %eax), %eax /* Read mem */ ;\
movop (IWRAM_OFF+0x8000)(REG_BASE, FULLREG(ax)), %eax /* Read mem */ ;\
ret ;\
;\
ext_load_ewram##rtype: ;\
and $(0x3FFFF & addrm), %eax /* Addr wrap */ ;\
movop EWRAM_OFF(%ebx, %eax), %eax /* Read mem */ ;\
movop EWRAM_OFF(REG_BASE, FULLREG(ax)), %eax /* Read mem */ ;\
ret ;\
;\
ext_load_vram##rtype: ;\
@ -367,165 +405,155 @@ ext_load_vram##rtype: ;\
jb 1f ;\
sub $0x8000, %eax /* Mirror last bank */ ;\
1: ;\
movop VRAM_OFF(%ebx, %eax), %eax /* Read mem */ ;\
movop VRAM_OFF(REG_BASE, FULLREG(ax)), %eax /* Read mem */ ;\
ret ;\
;\
ext_load_oam##rtype: ;\
and $(0x3FF & addrm), %eax /* Addr wrap */ ;\
movop OAM_RAM_OFF(%ebx, %eax), %eax /* Read mem */ ;\
movop OAM_RAM_OFF(REG_BASE, FULLREG(ax)), %eax /* Read mem */ ;\
ret ;\
;\
ext_load_palette##rtype: ;\
and $(0x3FF & addrm), %eax /* Addr wrap */ ;\
movop PALETTE_RAM_OFF(%ebx, %eax), %eax /* Read mem */ ;\
movop PALETTE_RAM_OFF(REG_BASE, FULLREG(ax)), %eax /* Read mem */ ;\
ret ;\
;\
ext_load_io##rtype: ;\
and $(0x3FF & addrm), %eax /* Addr wrap */ ;\
movop IORAM_OFF(%ebx, %eax), %eax /* Read mem */ ;\
movop IORAM_OFF(REG_BASE, FULLREG(ax)), %eax /* Read mem */ ;\
ret ;\
;\
ext_load_rom##rtype: ;\
mov %eax, %ecx /* ecx = address */ ;\
shr $15, %ecx /* ecx = address >> 15 */ ;\
mov RDMAP_OFF(%ebx, %ecx, 4), %edx /* Read rdmap pointer */ ;\
/* Read rdmap pointer */ ;\
mov RDMAP_OFF(REG_BASE, FULLREG(cx), ADDR_SIZE_BYTES), FULLREG(dx) ;\
mov %eax, %ecx /* ecx = address */ ;\
and $0x7FFF, %ecx /* ecx = address LSB */ ;\
movop (%edx, %ecx), %eax /* Read mem */ ;\
movop (FULLREG(dx), FULLREG(cx)), %eax /* Read mem */ ;\
ret ;\
;\
ext_load_slow##rtype: ;\
jmp slowfn ;\
SETUP_ARGS ;\
CALL_FUNC(slowfn) ;\
ret ;\
load_stubs(u32, mov, ~3, 2, _read_memory32)
load_stubs(u16, movzwl, ~1, 1, _read_memory16)
load_stubs(s16, movswl, ~1, 1, _read_memory16s)
load_stubs( u8, movzbl, ~0, 0, _read_memory8)
load_stubs( s8, movsbl, ~0, 0, _read_memory8s)
load_stubs(u32, mov, ~3, 2, read_memory32)
load_stubs(u16, movzwl, ~1, 1, read_memory16)
load_stubs(s16, movswl, ~1, 1, read_memory16s)
load_stubs( u8, movzbl, ~0, 0, read_memory8)
load_stubs( s8, movsbl, ~0, 0, read_memory8s)
# %eax = new_cpsr
# %edx = store_mask
# arg0 (%eax) = new_cpsr
# arg1 (%edx) = store_mask
defsymbl(execute_store_cpsr)
mov %edx, REG_SAVE(%ebx) # save store_mask
mov %edx, REG_SAVE(REG_BASE) # save store_mask
mov %eax, %ecx # ecx = new_cpsr
and %edx, %ecx # ecx = new_cpsr & store_mask
mov REG_CPSR(%ebx), %eax # eax = cpsr
mov REG_CPSR(REG_BASE), %eax # eax = cpsr
not %edx # edx = ~store_mask
and %edx, %eax # eax = cpsr & ~store_mask
or %ecx, %eax # eax = new cpsr combined with old
mov %eax, REG_CPSR(REG_BASE) # save new cpsr to register
call _execute_store_cpsr_body # do the dirty work in this C function
CALL_FUNC(execute_store_cpsr_body) # do the dirty work in this C function
extract_flags # pull out flag vars from new CPSR
cmp $0, %eax # see if return value is 0
jnz changed_pc_cpsr # might have changed the PC
jnz 1f # might have changed the PC
ret # return
1: # PC has changed, due to IRQ triggered
mov %eax, CARG1_REG # Returned addr from C function
CALL_FUNC(block_lookup_address_arm) # lookup new PC
add $ADDR_SIZE_BYTES, STACK_REG # get rid of current return address
jmp *FULLREG(ax)
changed_pc_cpsr:
add $4, %esp # get rid of current return address
call _block_lookup_address_arm # lookup new PC
jmp *%eax
# On writes that overwrite code, cache is flushed and execution re-started
smc_write:
call _flush_translation_cache_ram
CALL_FUNC(flush_translation_cache_ram)
lookup_pc:
add $4, %esp # Can't return, discard addr
movl $0, CHANGED_PC_STATUS(%ebx) # Lookup new block and jump to it
mov REG_PC(%ebx), %eax
testl $0x20, REG_CPSR(%ebx)
jz lookup_pc_arm
movl $0, CHANGED_PC_STATUS(REG_BASE) # Lookup new block and jump to it
mov REG_PC(REG_BASE), CARG1_REG # Load PC as argument0
testl $0x20, REG_CPSR(REG_BASE)
jz 1f
### Thumb mode
CALL_FUNC(block_lookup_address_thumb)
add $ADDR_SIZE_BYTES, STACK_REG # Can't return, discard addr
jmp *FULLREG(ax)
1:# ARM mode
CALL_FUNC(block_lookup_address_arm)
add $ADDR_SIZE_BYTES, STACK_REG # Can't return, discard addr
jmp *FULLREG(ax)
lookup_pc_thumb:
call _block_lookup_address_thumb
jmp *%eax
lookup_pc_arm:
call _block_lookup_address_arm
jmp *%eax
# eax: cycle counter
# Called from C, args are platform dependent :/
# arg0 (eax/edi/ecx): cycle counter
# arg1 (edx/rsi/rdx): reg base pointer
defsymbl(execute_arm_translate_internal)
# Save main context, since we need to return gracefully
pushl %ebx
pushl %esi
pushl %edi
pushl %ebp
SAVE_REGISTERS # Pushes 16 or 32 bytes
# The stack here is aligned to 16 bytes minus 4 or 8 bytes.
movl %edx, %ebx # load base register (arg1)
extract_flags # load flag variables
movl %eax, REG_CYCLES # load cycle counter (arg0)
mov CARG1_REG, REG_CYCLES # load cycle counter (arg0)
mov CARG2_REGPTR, REG_BASE # load base register (arg1)
movl REG_PC(%ebx), %eax # load PC
extract_flags # load flag variables
# (if the CPU is halted, do not start executing but
# loop in the alert loop until it wakes up)
cmpl $0, CPU_HALT_STATE(%ebx)
cmpl $0, CPU_HALT_STATE(REG_BASE)
je 1f
call alert_loop # Need to push something to the stack
call alert_loop # Need to push something to the stack
1:
testl $0x20, REG_CPSR(%ebx)
jnz 2f
call _block_lookup_address_arm
jmp *%eax # jump to it
2:
call _block_lookup_address_thumb
jmp *%eax
call lookup_pc # Go fetch and execute PC
return_to_main:
add $4, %esp # remove current return addr
popl %ebp
popl %edi
popl %esi
popl %ebx
add $ADDR_SIZE_BYTES, STACK_REG # remove current return addr
REST_REGISTERS # Restore saved registers
ret
#define load_table(atype) ;\
.long ext_load_bios##atype /* 0x00 BIOS */;\
.long ext_load_slow##atype /* 0x01 open read */;\
.long ext_load_ewram##atype /* 0x02 EWRAM */;\
.long ext_load_iwram##atype /* 0x03 IWRAM */;\
.long ext_load_io##atype /* 0x04 I/O registers */;\
.long ext_load_palette##atype /* 0x05 Palette RAM */;\
.long ext_load_vram##atype /* 0x06 VRAM */;\
.long ext_load_oam##atype /* 0x07 OAM RAM */;\
.long ext_load_rom##atype /* 0x08 gamepak (or RTC) */;\
.long ext_load_rom##atype /* 0x09 gamepak */;\
.long ext_load_rom##atype /* 0x0A gamepak */;\
.long ext_load_rom##atype /* 0x0B gamepak */;\
.long ext_load_rom##atype /* 0x0C gamepak */;\
.long ext_load_slow##atype /* 0x0D EEPROM (possibly) */;\
.long ext_load_slow##atype /* 0x0E Flash ROM/SRAM */;\
.long ext_load_slow##atype /* 0x0F open read */;\
ADDR_TYPE ext_load_bios##atype /* 0x00 BIOS */;\
ADDR_TYPE ext_load_slow##atype /* 0x01 open read */;\
ADDR_TYPE ext_load_ewram##atype /* 0x02 EWRAM */;\
ADDR_TYPE ext_load_iwram##atype /* 0x03 IWRAM */;\
ADDR_TYPE ext_load_io##atype /* 0x04 I/O registers */;\
ADDR_TYPE ext_load_palette##atype /* 0x05 Palette RAM */;\
ADDR_TYPE ext_load_vram##atype /* 0x06 VRAM */;\
ADDR_TYPE ext_load_oam##atype /* 0x07 OAM RAM */;\
ADDR_TYPE ext_load_rom##atype /* 0x08 gamepak (or RTC) */;\
ADDR_TYPE ext_load_rom##atype /* 0x09 gamepak */;\
ADDR_TYPE ext_load_rom##atype /* 0x0A gamepak */;\
ADDR_TYPE ext_load_rom##atype /* 0x0B gamepak */;\
ADDR_TYPE ext_load_rom##atype /* 0x0C gamepak */;\
ADDR_TYPE ext_load_slow##atype /* 0x0D EEPROM (possibly) */;\
ADDR_TYPE ext_load_slow##atype /* 0x0E Flash ROM/SRAM */;\
ADDR_TYPE ext_load_slow##atype /* 0x0F open read */;\
#define store_table(asize) ;\
.long ext_store_ignore /* 0x00 BIOS, ignore */;\
.long ext_store_ignore /* 0x01 invalid, ignore */;\
.long ext_store_ewram##asize /* 0x02 EWRAM */;\
.long ext_store_iwram##asize /* 0x03 IWRAM */;\
.long ext_store_io##asize /* 0x04 I/O registers */;\
.long ext_store_palette##asize /* 0x05 Palette RAM */;\
.long ext_store_vram##asize /* 0x06 VRAM */;\
.long ext_store_oam##asize /* 0x07 OAM RAM */;\
.long ext_store_rtc##asize /* 0x08 gamepak (RTC or ignore) */;\
.long ext_store_ignore /* 0x09 gamepak, ignore */;\
.long ext_store_ignore /* 0x0A gamepak, ignore */;\
.long ext_store_ignore /* 0x0B gamepak, ignore */;\
.long ext_store_ignore /* 0x0C gamepak, ignore */;\
.long ext_store_eeprom /* 0x0D EEPROM (possibly) */;\
.long ext_store_backup##asize /* 0x0E Flash ROM/SRAM */;\
.long ext_store_ignore /* 0x0F ignore */;\
ADDR_TYPE ext_store_ignore /* 0x00 BIOS, ignore */;\
ADDR_TYPE ext_store_ignore /* 0x01 invalid, ignore */;\
ADDR_TYPE ext_store_ewram##asize /* 0x02 EWRAM */;\
ADDR_TYPE ext_store_iwram##asize /* 0x03 IWRAM */;\
ADDR_TYPE ext_store_io##asize /* 0x04 I/O registers */;\
ADDR_TYPE ext_store_palette##asize /* 0x05 Palette RAM */;\
ADDR_TYPE ext_store_vram##asize /* 0x06 VRAM */;\
ADDR_TYPE ext_store_oam##asize /* 0x07 OAM RAM */;\
ADDR_TYPE ext_store_rtc##asize /* 0x08 gamepak (RTC or ignore) */;\
ADDR_TYPE ext_store_ignore /* 0x09 gamepak, ignore */;\
ADDR_TYPE ext_store_ignore /* 0x0A gamepak, ignore */;\
ADDR_TYPE ext_store_ignore /* 0x0B gamepak, ignore */;\
ADDR_TYPE ext_store_ignore /* 0x0C gamepak, ignore */;\
ADDR_TYPE ext_store_eeprom /* 0x0D EEPROM (possibly) */;\
ADDR_TYPE ext_store_backup##asize /* 0x0E Flash ROM/SRAM */;\
ADDR_TYPE ext_store_ignore /* 0x0F ignore */;\
.data
.align 16
defsymbl(x86_table_data)
load_table(u8)
@ -538,29 +566,29 @@ defsymbl(x86_table_data)
store_table(32)
# aligned word writes (non SMC signaling)
.long ext_store_ignore # 0x00 BIOS, ignore
.long ext_store_ignore # 0x01 invalid, ignore
.long ext_store_aligned_ewram32 # 0x02 EWRAM
.long ext_store_aligned_iwram32 # 0x03 IWRAM
.long ext_store_io32 # 0x04 I/O registers
.long ext_store_palette32 # 0x05 Palette RAM
.long ext_store_vram32 # 0x06 VRAM
.long ext_store_oam32 # 0x07 OAM RAM
.long ext_store_ignore # 0x08 gamepak, ignore (no RTC in 32bit)
.long ext_store_ignore # 0x09 gamepak, ignore
.long ext_store_ignore # 0x0A gamepak, ignore
.long ext_store_ignore # 0x0B gamepak, ignore
.long ext_store_ignore # 0x0C gamepak, ignore
.long ext_store_eeprom # 0x0D EEPROM (possibly)
.long ext_store_ignore # 0x0E Flash ROM/SRAM must be 8bit
.long ext_store_ignore # 0x0F ignore
ADDR_TYPE ext_store_ignore # 0x00 BIOS, ignore
ADDR_TYPE ext_store_ignore # 0x01 invalid, ignore
ADDR_TYPE ext_store_aligned_ewram32 # 0x02 EWRAM
ADDR_TYPE ext_store_aligned_iwram32 # 0x03 IWRAM
ADDR_TYPE ext_store_io32 # 0x04 I/O registers
ADDR_TYPE ext_store_palette32 # 0x05 Palette RAM
ADDR_TYPE ext_store_vram32 # 0x06 VRAM
ADDR_TYPE ext_store_oam32 # 0x07 OAM RAM
ADDR_TYPE ext_store_ignore # 0x08 gamepak, ignore (no RTC in 32bit)
ADDR_TYPE ext_store_ignore # 0x09 gamepak, ignore
ADDR_TYPE ext_store_ignore # 0x0A gamepak, ignore
ADDR_TYPE ext_store_ignore # 0x0B gamepak, ignore
ADDR_TYPE ext_store_ignore # 0x0C gamepak, ignore
ADDR_TYPE ext_store_eeprom # 0x0D EEPROM (possibly)
ADDR_TYPE ext_store_ignore # 0x0E Flash ROM/SRAM must be 8bit
ADDR_TYPE ext_store_ignore # 0x0F ignore
.bss
.align 64
defsymbl(x86_table_info)
.space 9*4*16
.space 9*16*ADDR_SIZE_BYTES
defsymbl(reg)
.space 0x100
defsymbl(palette_ram)
@ -579,11 +607,12 @@ defsymbl(io_registers)
.space 0x400
defsymbl(spsr)
.space 24
.space 8 # padding
defsymbl(reg_mode)
.space 196
.space 36 # padding
.space 28 # padding
defsymbl(memory_map_read)
.space 0x8000
.space 8*1024*ADDR_SIZE_BYTES
#ifndef MMAP_JIT_CACHE
#error "x86 dynarec builds *require* MMAP_JIT_CACHE"