[ARM] Rework memory handlers for speed and simplicity

This removes one branch and emits the region-selection code directly in
the JIT cache, trading memory for speed (although it is not a big
improvement).

This is a step towards enabling MMAP caches on ARM (needed due to the
32MB offset limitation of ARM branches).
David Guillen Fandos 2021-10-23 23:33:15 +02:00
parent 4bebb6135d
commit d558fb4fc4
3 changed files with 53 additions and 55 deletions
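
In rough terms: previously every memory access branched to a shared stub
(execute_load_u8 and friends) that performed the region dispatch, while the
JIT now emits that dispatch inline at each call site, so one branch
disappears. A minimal C model of the usat-based dispatch, assuming a
16-entry handler table per access type (names hypothetical):

  #include <stdint.h>

  typedef uint32_t (*load_handler)(uint32_t address);

  /* One 16-entry handler table per access type, reachable from reg_base. */
  extern load_handler load_u8_handlers[16];

  static uint32_t inline_load_u8(uint32_t address)
  {
      /* Models "usat r1, #4, r0, asr #24": the arithmetic shift exposes
         the region nibble (address bits 24-27) and the saturation clamps
         the result to [0, 15], so every address selects a valid slot. */
      int32_t region = (int32_t)address >> 24;
      if (region < 0)  region = 0;
      if (region > 15) region = 15;
      return load_u8_handlers[region](address);
  }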


@@ -1378,6 +1378,12 @@ typedef struct {
(1 << 25) | \
ARM_DEF_COND(cond)
#define ARM_USAT_ASR(p, rd, sat, rm, sa, cond) \
ARM_EMIT(p, ARM_DEF_DPI_REG_IMMSHIFT_COND((rm) | 0x10, 2, sa, rd, sat, 0, 0x37, cond))
#define ARM_USAT_LSL(p, rd, sat, rm, sa, cond) \
ARM_EMIT(p, ARM_DEF_DPI_REG_IMMSHIFT_COND((rm) | 0x10, 0, sa, rd, sat, 0, 0x37, cond))
typedef union {
ARMInstrBR br;


@@ -51,8 +51,6 @@ u32 execute_spsr_restore(u32 address);
void execute_swi_arm(u32 pc);
void execute_swi_thumb(u32 pc);
void execute_store_u32_safe(u32 address, u32 source);
#define STORE_TBL_OFF 0x1DC
#define SPSR_RAM_OFF 0x100
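
The dispatch below assumes the handler tables live inside the CPU context
block at STORE_TBL_OFF, 16 pointers (64 bytes) per access type; a sketch of
the layout implied by the tblnum arguments further down (enum names are
hypothetical):

  /* Entry address: reg_base + STORE_TBL_OFF + 64*tblnum + 4*region */
  enum handler_table {
    TBL_STORE_U8 = 0, TBL_STORE_U16, TBL_STORE_U32, TBL_STORE_U32_SAFE,
    TBL_LOAD_U8,  TBL_LOAD_S8,
    TBL_LOAD_U16, TBL_LOAD_S16, TBL_LOAD_U32
  };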
@@ -1271,16 +1269,46 @@ u32 execute_store_cpsr_body(u32 _cpsr, u32 store_mask, u32 address)
* (same with the stores)
*/
#define generate_load_call_byte(tblnum) \
ARM_USAT_ASR(0, reg_a1, 4, reg_a0, 24, ARMCOND_AL); \
generate_add_imm(reg_a1, (STORE_TBL_OFF + 64*tblnum) >> 2, 0); \
ARM_LDR_REG_REG_SHIFT(0, reg_a1, reg_base, reg_a1, 0, 2); \
ARM_BLX(0, reg_a1); \
#define generate_load_call_mbyte(tblnum, abits) \
ARM_MOV_REG_IMMSHIFT(0, reg_a1, reg_a0, ARMSHIFT_ROR, abits) \
ARM_USAT_ASR(0, reg_a1, 4, reg_a1, 24-abits, ARMCOND_AL); \
generate_add_imm(reg_a1, (STORE_TBL_OFF + 64*tblnum) >> 2, 0); \
ARM_LDR_REG_REG_SHIFT(0, reg_a1, reg_base, reg_a1, 0, 2); \
ARM_BLX(0, reg_a1); \
#define generate_store_call(tblnum) \
ARM_USAT_ASR(0, reg_a2, 4, reg_a0, 24, ARMCOND_AL); \
generate_add_imm(reg_a2, (STORE_TBL_OFF + 64*tblnum) >> 2, 0); \
ARM_LDR_REG_REG_SHIFT(0, reg_a2, reg_base, reg_a2, 0, 2); \
ARM_BLX(0, reg_a2); \
#define generate_store_call_u8() generate_store_call(0)
#define generate_store_call_u16() generate_store_call(1)
#define generate_store_call_u32() generate_store_call(2)
#define generate_store_call_u32_safe() generate_store_call(3)
#define generate_load_call_u8() generate_load_call_byte(4)
#define generate_load_call_s8() generate_load_call_byte(5)
#define generate_load_call_u16() generate_load_call_mbyte(6, 1)
#define generate_load_call_s16() generate_load_call_mbyte(7, 1)
#define generate_load_call_u32() generate_load_call_mbyte(8, 2)
#define arm_access_memory_load(mem_type) \
cycle_count += 2; \
generate_function_call(execute_load_##mem_type); \
generate_load_call_##mem_type(); \
write32((pc + 8)); \
arm_generate_store_reg_pc_no_flags(reg_rv, rd) \
#define arm_access_memory_store(mem_type) \
cycle_count++; \
arm_generate_load_reg_pc(reg_a1, rd, 12); \
generate_function_call(execute_store_##mem_type); \
generate_store_call_##mem_type(); \
write32((pc + 4)) \
/* Calculate the address into a0 from _rn, _rm */
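
The mbyte variant above first rotates the low alignment bits to the top of
the word so that a single usat still produces the region index. A C model of
the index computation (how the boundary entries then handle misaligned
addresses is my reading, not verified):

  #include <stdint.h>

  /* Index computed by generate_load_call_mbyte(): ror #abits, then
     asr #(24-abits) and usat #4. abits is 1 for u16/s16, 2 for u32. */
  static unsigned region_index(uint32_t address, unsigned abits)
  {
      uint32_t r = (address >> abits) | (address << (32 - abits)); /* ror */
      int32_t  s = (int32_t)r >> (24 - abits);                     /* asr */
      if (s < 0)  s = 0;   /* usat #4: negative values clamp to 0...     */
      if (s > 15) s = 15;  /* ...and large ones to 15, so set alignment
                              bits land on the table's boundary entries. */
      return (unsigned)s;
  }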
@@ -1384,20 +1412,20 @@ u32 execute_store_cpsr_body(u32 _cpsr, u32 store_mask, u32 address)
/* TODO: Make these use cached registers. Implement iwram_stack_optimize. */
#define arm_block_memory_load() \
generate_function_call(execute_load_u32); \
generate_load_call_u32(); \
write32((pc + 8)); \
arm_generate_store_reg(reg_rv, i) \
#define arm_block_memory_store() \
arm_generate_load_reg_pc(reg_a1, i, 8); \
generate_function_call(execute_store_u32_safe) \
generate_store_call_u32_safe() \
#define arm_block_memory_final_load() \
arm_block_memory_load() \
#define arm_block_memory_final_store() \
arm_generate_load_reg_pc(reg_a1, i, 12); \
generate_function_call(execute_store_u32); \
generate_store_call_u32(); \
write32((pc + 4)) \
#define arm_block_memory_adjust_pc_store() \
@@ -1482,13 +1510,13 @@ u32 execute_store_cpsr_body(u32 _cpsr, u32 store_mask, u32 address)
arm_decode_swap(); \
cycle_count += 3; \
arm_generate_load_reg(reg_a0, rn); \
generate_function_call(execute_load_##type); \
generate_load_call_##type(); \
write32((pc + 8)); \
generate_mov(reg_a2, reg_rv); \
arm_generate_load_reg(reg_a0, rn); \
arm_generate_load_reg(reg_a1, rm); \
arm_generate_store_reg(reg_a2, rd); \
generate_function_call(execute_store_##type); \
generate_store_call_##type(); \
write32((pc + 4)); \
} \
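
For context, the macro above emulates ARM's SWP/SWPB: a load of the old
value followed by a store of the new one through the same address, with the
loaded value parked in reg_a2 across the store. A C model (accessor names
hypothetical):

  #include <stdint.h>

  extern uint32_t read_mem(uint32_t address);              /* load handler  */
  extern void     write_mem(uint32_t address, uint32_t v); /* store handler */

  /* rd = [rn]; [rn] = rm, performed as two dispatched calls back to back. */
  static uint32_t swap_model(uint32_t rn_addr, uint32_t rm_val)
  {
      uint32_t old = read_mem(rn_addr);
      write_mem(rn_addr, rm_val);
      return old; /* written back to rd */
  }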
@@ -1651,14 +1679,14 @@ u32 execute_store_cpsr_body(u32 _cpsr, u32 store_mask, u32 address)
#define thumb_access_memory_load(mem_type, _rd) \
cycle_count += 2; \
generate_function_call(execute_load_##mem_type); \
generate_load_call_##mem_type(); \
write32((pc + 4)); \
thumb_generate_store_reg(reg_rv, _rd) \
#define thumb_access_memory_store(mem_type, _rd) \
cycle_count++; \
thumb_generate_load_reg(reg_a1, _rd); \
generate_function_call(execute_store_##mem_type); \
generate_store_call_##mem_type(); \
write32((pc + 2)) \
#define thumb_access_memory_generate_address_pc_relative(offset, _rb, _ro) \
@@ -1727,7 +1755,7 @@ u32 execute_store_cpsr_body(u32 _cpsr, u32 store_mask, u32 address)
#define thumb_block_memory_extra_pop_pc() \
thumb_generate_load_reg(reg_s0, REG_SAVE); \
generate_add_reg_reg_imm(reg_a0, reg_s0, (bit_count[reg_list] * 4), 0); \
generate_function_call(execute_load_u32); \
generate_load_call_u32(); \
write32((pc + 4)); \
generate_indirect_branch_cycle_update(thumb) \
@@ -1735,23 +1763,23 @@ u32 execute_store_cpsr_body(u32 _cpsr, u32 store_mask, u32 address)
thumb_generate_load_reg(reg_s0, REG_SAVE); \
generate_add_reg_reg_imm(reg_a0, reg_s0, (bit_count[reg_list] * 4), 0); \
thumb_generate_load_reg(reg_a1, REG_LR); \
generate_function_call(execute_store_u32_safe) \
generate_store_call_u32_safe()
#define thumb_block_memory_load() \
generate_function_call(execute_load_u32); \
generate_load_call_u32(); \
write32((pc + 4)); \
thumb_generate_store_reg(reg_rv, i) \
#define thumb_block_memory_store() \
thumb_generate_load_reg(reg_a1, i); \
generate_function_call(execute_store_u32_safe) \
generate_store_call_u32_safe()
#define thumb_block_memory_final_load() \
thumb_block_memory_load() \
#define thumb_block_memory_final_store() \
thumb_generate_load_reg(reg_a1, i); \
generate_function_call(execute_store_u32); \
generate_store_call_u32(); \
write32((pc + 2)) \
#define thumb_block_memory_final_no(access_type) \


@@ -94,12 +94,7 @@ _##symbol:
ldr reg_x5, [reg_base, #REG_R14] ;\
#define load_registers_thumb() ;\
ldr reg_x0, [reg_base, #REG_R0] ;\
ldr reg_x1, [reg_base, #REG_R1] ;\
ldr reg_x2, [reg_base, #REG_R2] ;\
ldr reg_x3, [reg_base, #REG_R3] ;\
ldr reg_x4, [reg_base, #REG_R4] ;\
ldr reg_x5, [reg_base, #REG_R5] ;\
ldm reg_base, {reg_x0, reg_x1, reg_x2, reg_x3, reg_x4, reg_x5}
@ Will store the register set from cached registers back to memory.
@@ -113,12 +108,7 @@ _##symbol:
str reg_x5, [reg_base, #REG_R14] ;\
#define store_registers_thumb() ;\
str reg_x0, [reg_base, #REG_R0] ;\
str reg_x1, [reg_base, #REG_R1] ;\
str reg_x2, [reg_base, #REG_R2] ;\
str reg_x3, [reg_base, #REG_R3] ;\
str reg_x4, [reg_base, #REG_R4] ;\
str reg_x5, [reg_base, #REG_R5] ;\
stm reg_base, {reg_x0, reg_x1, reg_x2, reg_x3, reg_x4, reg_x5}
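
The six per-register transfers collapse into a single ldm/stm, which is
valid assuming reg_x0..reg_x5 name ascending host registers and REG_R0 sits
at offset 0 of reg_base; in C terms the change amounts to:

  #include <stdint.h>
  #include <string.h>

  /* One block transfer instead of six word transfers; correct only under
     the register/offset layout assumptions stated above. */
  static void load_registers_thumb_model(uint32_t *host_regs,
                                         const uint32_t *reg_base)
  {
      memcpy(host_regs, reg_base, 6 * sizeof(uint32_t)); /* ldm {r0-r5} */
  }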
@ Returns an updated persistent cpsr with the cached flags register.
@@ -512,12 +502,6 @@ return_to_main:
#define execute_store_builder(store_type, str_op, str_op16, load_op, tnum) ;\
;\
defsymbl(execute_store_u##store_type) ;\
usat r2, #4, r0, asr #24 /* r2 contains [0-15] */;\
add r2, r2, #((STORE_TBL_OFF + 16*4*tnum) >> 2) /* add table offset */;\
ldr pc, [reg_base, r2, lsl #2] /* load handler addr */;\
nop ;\
;\
ext_store_u##store_type: ;\
save_flags() ;\
ldr r2, [lr] /* load PC */;\
@@ -571,7 +555,6 @@ ext_store_oam_ram_u##store_type: ;\
ldr r0, [lr] /* load PC */;\
str r0, [reg_base, #REG_PC] /* write out PC */;\
b smc_write /* perform smc write */;\
.size execute_store_u##store_type, .-execute_store_u##store_type
@ for ignored areas, just return
ext_store_ignore:
@@ -601,12 +584,6 @@ execute_store_builder(32, str, str, ldr, 2)
@ This is a store that is executed in a strm case (so no SMC checks in-between)
defsymbl(execute_store_u32_safe)
usat r2, #4, r0, asr #24
add r2, r2, #((STORE_TBL_OFF + 16*4*3) >> 2)
ldr pc, [reg_base, r2, lsl #2]
nop
ext_store_u32_safe:
str lr, [reg_base, #REG_SAVE3] @ Restore lr
save_flags()
@@ -642,7 +619,6 @@ ext_store_oam_ram_u32_safe:
str r1, [r0, r2] @ store data
str r2, [reg_base, #OAM_UPDATED] @ store anything non zero here
bx lr @ Return
.size execute_store_u32_safe, .-execute_store_u32_safe
write_epilogue:
@@ -744,17 +720,6 @@ lookup_pc_arm:
#define execute_load_builder(load_type, albits, load_function, tnum) ;\
;\
defsymbl(execute_load_##load_type) ;\
.if albits >= 1 ;\
ror r1, r0, #(albits) /* move alignment bits to MSB */;\
usat r1, #4, r1, asr #(24-albits) /* r1 contains [0-15] */;\
.else ;\
usat r1, #4, r0, asr #24 /* r1 contains [0-15] */;\
.endif ;\
add r1, r1, #((STORE_TBL_OFF + 16*4*tnum) >> 2) /* add table offset */;\
ldr pc, [reg_base, r1, lsl #2] /* load handler addr */;\
nop ;\
;\
ld_bios_##load_type: /* BIOS area, need to verify PC */;\
save_flags() ;\
ldr r1, [lr] /* r1 = PC */;\
@@ -809,8 +774,7 @@ ld_slow_##load_type: ;\
call_c_function(load_function) ;\
restore_flags() ;\
add pc, lr, #4 /* return */;\
;\
.size execute_load_##load_type, .-execute_load_##load_type
#define load_table_gen(load_type) ;\
.long ld_bios_##load_type /* 0 BIOS */;\
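
Presumably each access type instantiates one such table, in an order
matching the tblnum layout above (store tables first, then the loads); for
example:

  load_table_gen(u8)   /* consumed via generate_load_call_u8(),  table 4 */
  load_table_gen(u32)  /* consumed via generate_load_call_u32(), table 8 */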