Fix ROM swapping capabilities
This fixes ROM swapping for x86/x86-64, arm32 and arm64. On top of that, it improves speed on arm32 and mips by removing unnecessary slow paths for small ROMs: if the ROM fits in RAM, more efficient code is emitted that assumes the ROM is fully loaded. For low-memory Linux platforms it would be better to use an mmap'ed ROM, so that the OS transparently handles page swapping, which may well be faster. Will investigate and follow up on this in a separate commit.
This commit is contained in:
parent
42c6bb2761
commit
541adc9e1c
|
@ -1869,7 +1869,7 @@ extern void* ldst_handler_functions[16*4 + 17*6];
|
|||
extern void* ldst_lookup_tables[16*4 + 17*6];
|
||||
|
||||
|
||||
void init_emitter() {
|
||||
void init_emitter(bool must_swap) {
|
||||
rom_cache_watermark = INITIAL_ROM_WATERMARK;
|
||||
init_bios_hooks();
|
||||
|
||||
|
|
|
@ -443,6 +443,7 @@ ld_rdmap_##load_type: ;\
|
|||
lsr w4, w0, #15 /* Each block is 32KB */;\
|
||||
add x3, reg_base, #(RDMAP_OFF) ;\
|
||||
ldr x4, [x3, x4, lsl #3] /* x4 = table pointer */;\
|
||||
cbz x4, ld_slow_##load_type /* not mapped, go slow */ ;\
|
||||
and w0, w0, #(0x7fff) /* 32KB pages */;\
|
||||
ldop w0, [x4, x0] /* load actual value */;\
|
||||
ret ;\
|
||||
|
|
|
@ -1981,14 +1981,24 @@ void *div6, *divarm7;
|
|||
generate_indirect_branch_no_cycle_update(type) \
|
||||
|
||||
|
||||
extern u32 ldst_handler_functions[9][17];
|
||||
extern u32 ldst_lookup_tables[9][17];
|
||||
extern u32 st_handler_functions[4][17];
|
||||
extern u32 ld_handler_functions[5][17];
|
||||
extern u32 ld_swap_handler_functions[5][17];
|
||||
|
||||
void init_emitter(void) {
|
||||
// Tables used by the memory handlers (placed near reg_base)
|
||||
extern u32 ld_lookup_tables[5][17];
|
||||
extern u32 st_lookup_tables[4][17];
|
||||
|
||||
void init_emitter(bool must_swap) {
|
||||
int i;
|
||||
|
||||
// Generate handler table
|
||||
memcpy(ldst_lookup_tables, ldst_handler_functions, sizeof(ldst_lookup_tables));
|
||||
memcpy(st_lookup_tables, st_handler_functions, sizeof(st_lookup_tables));
|
||||
// Issue faster paths if swapping is not required
|
||||
if (must_swap)
|
||||
memcpy(ld_lookup_tables, ld_swap_handler_functions, sizeof(ld_lookup_tables));
|
||||
else
|
||||
memcpy(ld_lookup_tables, ld_handler_functions, sizeof(ld_lookup_tables));
|
||||
|
||||
rom_cache_watermark = INITIAL_ROM_WATERMARK;
|
||||
u8 *translation_ptr = (u8*)&rom_translation_cache[0];
|
||||
|
|
|
@ -803,7 +803,20 @@ ld_oamram_##load_type: /* OAM RAM area */;\
|
|||
exec_ld_op_##load_type(10) /* Clear upper bits (10 LSB) */;\
|
||||
add pc, lr, #4 ;\
|
||||
;\
|
||||
/* ROM area (or VRAM): uses generic memory handlers */ ;\
|
||||
/* ROM area: uses generic memory handlers to handle swapping */ ;\
|
||||
ld_rdmap_slow_##load_type: ;\
|
||||
save_flags() ;\
|
||||
add r2, reg_base, #RDMAP_OFF /* r2 = memory_map_read */;\
|
||||
mov r1, r0, lsr #15 /* r1 = page index of address */;\
|
||||
ldr r2, [r2, r1, lsl #2] /* r2 = base addr */;\
|
||||
cmp r2, #0 ;\
|
||||
beq 9f /* Page miss, go slow */;\
|
||||
;\
|
||||
exec_ld_op_##load_type(15) /* Pages are 32KB big */;\
|
||||
restore_flags() ;\
|
||||
add pc, lr, #4 ;\
|
||||
;\
|
||||
/* ROM/VRAM area: uses generic memory handlers, assumes is mapped */ ;\
|
||||
ld_rdmap_##load_type: ;\
|
||||
add r2, reg_base, #RDMAP_OFF /* r2 = memory_map_read */;\
|
||||
mov r1, r0, lsr #15 /* r1 = page index of address */;\
|
||||
|
@ -815,6 +828,7 @@ ld_rdmap_##load_type: ;\
|
|||
/* Slow load path, for open/unmapped loads */;\
|
||||
ld_slow_##load_type: ;\
|
||||
save_flags() ;\
|
||||
9: ;\
|
||||
ldr r1, [lr] /* r1 = PC */;\
|
||||
10: ;\
|
||||
str r1, [reg_base, #REG_PC] /* update PC */;\
|
||||
|
@ -823,7 +837,7 @@ ld_slow_##load_type: ;\
|
|||
add pc, lr, #4 /* return */;\
|
||||
|
||||
|
||||
#define load_table_gen(load_type) ;\
|
||||
#define load_table_gen(load_type, rdmapfn) ;\
|
||||
.long ld_slow_##load_type /* -1 (for regions above F) */;\
|
||||
.long ld_bios_##load_type /* 0 BIOS */;\
|
||||
.long ld_slow_##load_type /* 1 Bad region */;\
|
||||
|
@ -833,11 +847,11 @@ ld_slow_##load_type: ;\
|
|||
.long ld_palram_##load_type /* 5 Palette RAM, via map */;\
|
||||
.long ld_rdmap_##load_type /* 6 VRAM area */;\
|
||||
.long ld_oamram_##load_type /* 7 OAM RAM */;\
|
||||
.long ld_rdmap_##load_type /* 8 ROM, via map */;\
|
||||
.long ld_rdmap_##load_type /* 9 ROM, via map */;\
|
||||
.long ld_rdmap_##load_type /* A ROM, via map */;\
|
||||
.long ld_rdmap_##load_type /* B ROM, via map */;\
|
||||
.long ld_rdmap_##load_type /* C ROM, via map */;\
|
||||
.long ld_##rdmapfn##_##load_type /* 8 ROM, via map */;\
|
||||
.long ld_##rdmapfn##_##load_type /* 9 ROM, via map */;\
|
||||
.long ld_##rdmapfn##_##load_type /* A ROM, via map */;\
|
||||
.long ld_##rdmapfn##_##load_type /* B ROM, via map */;\
|
||||
.long ld_##rdmapfn##_##load_type /* C ROM, via map */;\
|
||||
.long ld_slow_##load_type /* D ROM or EEPROM/FLASH */;\
|
||||
.long ld_slow_##load_type /* E EEPROM/FLASH */;\
|
||||
.long ld_slow_##load_type /* F Bad region */;\
|
||||
|
@ -853,16 +867,23 @@ execute_load_builder(u32, 2, read_memory32 )
|
|||
.data
|
||||
.align 4
|
||||
|
||||
defsymbl(ldst_handler_functions)
|
||||
defsymbl(st_handler_functions)
|
||||
store_lookup_table(8)
|
||||
store_lookup_table(16)
|
||||
store_lookup_table(32)
|
||||
store_lookup_table(32_safe)
|
||||
load_table_gen(u8)
|
||||
load_table_gen(s8)
|
||||
load_table_gen(u16)
|
||||
load_table_gen(s16)
|
||||
load_table_gen(u32)
|
||||
defsymbl(ld_handler_functions)
|
||||
load_table_gen(u8, rdmap)
|
||||
load_table_gen(s8, rdmap)
|
||||
load_table_gen(u16, rdmap)
|
||||
load_table_gen(s16, rdmap)
|
||||
load_table_gen(u32, rdmap)
|
||||
defsymbl(ld_swap_handler_functions)
|
||||
load_table_gen(u8, rdmap_slow)
|
||||
load_table_gen(s8, rdmap_slow)
|
||||
load_table_gen(u16, rdmap_slow)
|
||||
load_table_gen(s16, rdmap_slow)
|
||||
load_table_gen(u32, rdmap_slow)
|
||||
|
||||
.bss
|
||||
.align 4
|
||||
|
@ -878,8 +899,9 @@ defsymbl(reg)
|
|||
defsymbl(spsr)
|
||||
.space 24
|
||||
@ Place lookup tables here for easy access via base_reg too
|
||||
defsymbl(ldst_lookup_tables)
|
||||
defsymbl(st_lookup_tables)
|
||||
.space 4*17*4 @ store
|
||||
defsymbl(ld_lookup_tables)
|
||||
.space 5*17*4 @ loads
|
||||
.space 132 @ Padding for alignment
|
||||
defsymbl(reg_mode)
|
||||
|
|
2
cpu.h
2
cpu.h
|
@ -164,7 +164,7 @@ void flush_translation_cache_rom(void);
|
|||
void flush_translation_cache_ram(void);
|
||||
void dump_translation_cache(void);
|
||||
void init_caches(void);
|
||||
void init_emitter(void);
|
||||
void init_emitter(bool);
|
||||
void init_bios_hooks(void);
|
||||
|
||||
extern u32 reg_mode[7][7];
|
||||
|
|
15
gba_memory.c
15
gba_memory.c
|
@ -318,6 +318,8 @@ dma_transfer_type dma[4];
|
|||
u8 *gamepak_buffers[32]; /* Pointers to malloc'ed blocks */
|
||||
u32 gamepak_buffer_count; /* Value between 1 and 32 */
|
||||
u32 gamepak_size; /* Size of the ROM in bytes */
|
||||
// We allocate in 1MB chunks.
|
||||
const unsigned gamepak_buffer_blocksize = 1024*1024;
|
||||
|
||||
// LRU queue with the loaded blocks and what they map to
|
||||
struct {
|
||||
|
@ -2965,7 +2967,7 @@ void init_gamepak_buffer(void)
|
|||
gamepak_buffer_count = 0;
|
||||
while (gamepak_buffer_count < ROM_BUFFER_SIZE)
|
||||
{
|
||||
void *ptr = malloc(1024*1024);
|
||||
void *ptr = malloc(gamepak_buffer_blocksize);
|
||||
if (!ptr)
|
||||
break;
|
||||
gamepak_buffers[gamepak_buffer_count++] = (u8*)ptr;
|
||||
|
@ -2982,6 +2984,13 @@ void init_gamepak_buffer(void)
|
|||
gamepak_lru_tail = 32 * gamepak_buffer_count - 1;
|
||||
}
|
||||
|
||||
bool gamepak_must_swap(void)
|
||||
{
|
||||
// Returns true when the allocated buffers (gamepak_buffer_count blocks of
|
||||
// gamepak_buffer_blocksize bytes) cannot hold gamepak_size bytes of ROM, so the device must swap.
|
||||
return gamepak_buffer_count * gamepak_buffer_blocksize < gamepak_size;
|
||||
}
|
||||
|
||||
void init_memory(void)
|
||||
{
|
||||
u32 map_offset = 0;
|
||||
|
@ -3182,7 +3191,7 @@ static s32 load_gamepak_raw(const char *name)
|
|||
gamepak_size = (gamepak_size + 0x7FFF) & ~0x7FFF;
|
||||
|
||||
// Load stuff in 1MB chunks
|
||||
u32 buf_blocks = (gamepak_size + 1024*1024-1) / (1024*1024);
|
||||
u32 buf_blocks = (gamepak_size + gamepak_buffer_blocksize-1) / (gamepak_buffer_blocksize);
|
||||
u32 rom_blocks = gamepak_size >> 15;
|
||||
u32 ldblks = buf_blocks < gamepak_buffer_count ?
|
||||
buf_blocks : gamepak_buffer_count;
|
||||
|
@ -3194,7 +3203,7 @@ static s32 load_gamepak_raw(const char *name)
|
|||
for (i = 0; i < ldblks; i++)
|
||||
{
|
||||
// Load 1MB chunk and map it
|
||||
filestream_read(gamepak_file_large, gamepak_buffers[i], 1024*1024);
|
||||
filestream_read(gamepak_file_large, gamepak_buffers[i], gamepak_buffer_blocksize);
|
||||
for (j = 0; j < 32 && i*32 + j < rom_blocks; j++)
|
||||
{
|
||||
u32 phyn = i*32 + j;
|
||||
|
|
|
@ -200,6 +200,7 @@ s32 load_bios(char *name);
|
|||
void update_backup(void);
|
||||
void init_memory(void);
|
||||
void init_gamepak_buffer(void);
|
||||
bool gamepak_must_swap(void);
|
||||
void memory_term(void);
|
||||
u8 *load_gamepak_page(u32 physical_index);
|
||||
|
||||
|
|
2
main.c
2
main.c
|
@ -102,7 +102,7 @@ void init_main(void)
|
|||
video_count = 960;
|
||||
|
||||
#ifdef HAVE_DYNAREC
|
||||
init_emitter();
|
||||
init_emitter(gamepak_must_swap());
|
||||
init_caches();
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -2099,7 +2099,7 @@ typedef struct {
|
|||
static void emit_pmemld_stub(
|
||||
unsigned memop_number, const t_stub_meminfo *meminfo,
|
||||
bool signext, unsigned size,
|
||||
unsigned alignment, bool aligned,
|
||||
unsigned alignment, bool aligned, bool must_swap,
|
||||
u8 **tr_ptr)
|
||||
{
|
||||
u8 *translation_ptr = *tr_ptr;
|
||||
|
@ -2158,28 +2158,31 @@ static void emit_pmemld_stub(
|
|||
}
|
||||
|
||||
if (region >= 8 && region <= 12) {
|
||||
u8 *jmppatch;
|
||||
// ROM area: might need to load the ROM on-demand
|
||||
mips_emit_srl(reg_rv, reg_a0, 15); // 32KB page number
|
||||
mips_emit_sll(reg_rv, reg_rv, 2); // (word indexed)
|
||||
mips_emit_addu(reg_rv, reg_rv, reg_base); // base + offset
|
||||
mips_emit_addu(reg_rv, reg_rv, reg_base); // base + offset
|
||||
mips_emit_lw(reg_rv, reg_rv, 0x8000); // base[offset-0x8000] is readmap ptr
|
||||
mips_emit_andi(reg_temp, reg_a0, memmask); // Get the lowest 15 bits [can go in delay slot]
|
||||
|
||||
mips_emit_lw(reg_rv, reg_rv, 0x8000); // base[offset-0x8000]
|
||||
mips_emit_b_filler(bne, reg_rv, reg_zero, jmppatch); // if not null, can skip load page
|
||||
mips_emit_andi(reg_temp, reg_a0, memmask); // Get the lowest 15 bits [delay]
|
||||
if (must_swap) { // Do not emit if the ROM is fully loaded, save some cycles
|
||||
u8 *jmppatch;
|
||||
mips_emit_b_filler(bne, reg_rv, reg_zero, jmppatch); // if not null, can skip load page
|
||||
generate_swap_delay();
|
||||
|
||||
// This code call the C routine to map the relevant ROM page
|
||||
emit_save_regs(aligned);
|
||||
mips_emit_sw(mips_reg_ra, reg_base, ReOff_SaveR3);
|
||||
extract_bits(reg_a0, reg_a0, 15, 10); // a0 = (addr >> 15) & 0x3ff
|
||||
genccall(&load_gamepak_page);
|
||||
mips_emit_sw(reg_temp, reg_base, ReOff_SaveR1);
|
||||
// This code call the C routine to map the relevant ROM page
|
||||
emit_save_regs(aligned);
|
||||
mips_emit_sw(mips_reg_ra, reg_base, ReOff_SaveR3);
|
||||
extract_bits(reg_a0, reg_a0, 15, 10); // a0 = (addr >> 15) & 0x3ff
|
||||
genccall(&load_gamepak_page); // Returns valid pointer in rv
|
||||
mips_emit_sw(reg_temp, reg_base, ReOff_SaveR1);
|
||||
|
||||
mips_emit_lw(reg_temp, reg_base, ReOff_SaveR1);
|
||||
emit_restore_regs(aligned);
|
||||
mips_emit_lw(mips_reg_ra, reg_base, ReOff_SaveR3);
|
||||
mips_emit_lw(reg_temp, reg_base, ReOff_SaveR1);
|
||||
emit_restore_regs(aligned);
|
||||
mips_emit_lw(mips_reg_ra, reg_base, ReOff_SaveR3);
|
||||
|
||||
generate_branch_patch_conditional(jmppatch, translation_ptr);
|
||||
generate_branch_patch_conditional(jmppatch - 4, translation_ptr);
|
||||
}
|
||||
// Now we can proceed to load, place addr in the right register
|
||||
mips_emit_addu(reg_rv, reg_rv, reg_temp);
|
||||
} else if (region == 14) {
|
||||
|
@ -2620,7 +2623,7 @@ typedef void (*sthldr_t)(
|
|||
typedef void (*ldhldr_t)(
|
||||
unsigned memop_number, const t_stub_meminfo *meminfo,
|
||||
bool signext, unsigned size,
|
||||
unsigned alignment, bool aligned,
|
||||
unsigned alignment, bool aligned, bool must_swap,
|
||||
u8 **tr_ptr);
|
||||
|
||||
// Generates a patch handler for a given access size
|
||||
|
@ -2684,7 +2687,7 @@ static void emit_phand(
|
|||
// - memop patcher: Patches a memop whenever it accesses the wrong mem region
|
||||
// - mem stubs: There's stubs for load & store, and every memory region
|
||||
// and possible operand size and misaligment (+sign extensions)
|
||||
void init_emitter() {
|
||||
void init_emitter(bool must_swap) {
|
||||
int i;
|
||||
// Initialize memory to a debuggable state
|
||||
rom_cache_watermark = INITIAL_ROM_WATERMARK;
|
||||
|
@ -2756,20 +2759,20 @@ void init_emitter() {
|
|||
for (i = 0; i < sizeof(ldinfo)/sizeof(ldinfo[0]); i++) {
|
||||
ldhldr_t handler = (ldhldr_t)ldinfo[i].emitter;
|
||||
/* region info signext sz al isaligned */
|
||||
handler(0, &ldinfo[i], false, 0, 0, false, &translation_ptr); // ld u8
|
||||
handler(1, &ldinfo[i], true, 0, 0, false, &translation_ptr); // ld s8
|
||||
handler(0, &ldinfo[i], false, 0, 0, false, must_swap, &translation_ptr); // ld u8
|
||||
handler(1, &ldinfo[i], true, 0, 0, false, must_swap, &translation_ptr); // ld s8
|
||||
|
||||
handler(2, &ldinfo[i], false, 1, 0, false, &translation_ptr); // ld u16
|
||||
handler(3, &ldinfo[i], false, 1, 1, false, &translation_ptr); // ld u16u1
|
||||
handler(4, &ldinfo[i], true, 1, 0, false, &translation_ptr); // ld s16
|
||||
handler(5, &ldinfo[i], true, 1, 1, false, &translation_ptr); // ld s16u1
|
||||
handler(2, &ldinfo[i], false, 1, 0, false, must_swap, &translation_ptr); // ld u16
|
||||
handler(3, &ldinfo[i], false, 1, 1, false, must_swap, &translation_ptr); // ld u16u1
|
||||
handler(4, &ldinfo[i], true, 1, 0, false, must_swap, &translation_ptr); // ld s16
|
||||
handler(5, &ldinfo[i], true, 1, 1, false, must_swap, &translation_ptr); // ld s16u1
|
||||
|
||||
handler(6, &ldinfo[i], false, 2, 0, false, &translation_ptr); // ld u32
|
||||
handler(7, &ldinfo[i], false, 2, 1, false, &translation_ptr); // ld u32u1
|
||||
handler(8, &ldinfo[i], false, 2, 2, false, &translation_ptr); // ld u32u2
|
||||
handler(9, &ldinfo[i], false, 2, 3, false, &translation_ptr); // ld u32u3
|
||||
handler(6, &ldinfo[i], false, 2, 0, false, must_swap, &translation_ptr); // ld u32
|
||||
handler(7, &ldinfo[i], false, 2, 1, false, must_swap, &translation_ptr); // ld u32u1
|
||||
handler(8, &ldinfo[i], false, 2, 2, false, must_swap, &translation_ptr); // ld u32u2
|
||||
handler(9, &ldinfo[i], false, 2, 3, false, must_swap, &translation_ptr); // ld u32u3
|
||||
|
||||
handler(10,&ldinfo[i], false, 2, 0, true, &translation_ptr); // aligned ld u32
|
||||
handler(10,&ldinfo[i], false, 2, 0, true, must_swap, &translation_ptr); // aligned ld u32
|
||||
}
|
||||
|
||||
const t_stub_meminfo stinfo [] = {
|
||||
|
|
|
@ -2260,7 +2260,7 @@ static void function_cc execute_swi(u32 pc)
|
|||
extern void* x86_table_data[9][16];
|
||||
extern void* x86_table_info[9][16];
|
||||
|
||||
void init_emitter(void) {
|
||||
void init_emitter(bool must_swap) {
|
||||
memcpy(x86_table_info, x86_table_data, sizeof(x86_table_data));
|
||||
|
||||
rom_cache_watermark = INITIAL_ROM_WATERMARK;
|
||||
|
|
|
@ -38,6 +38,7 @@ _##symbol:
|
|||
// otherwise), where we use our own convention call. However calls to C code must
|
||||
// follow the calling convention. x86 is built with regparm=2 to avoid stack usage.
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
#define JUMP_CX_ZERO jrcxz
|
||||
#define ADDR_TYPE .quad
|
||||
#define ADDR_SIZE_BYTES 8
|
||||
#define STACK_REG %rsp
|
||||
|
@ -62,6 +63,7 @@ _##symbol:
|
|||
#endif
|
||||
#define SETUP_ARGS mov %eax, CARG1_REG; mov %edx, CARG2_REG;
|
||||
#else
|
||||
#define JUMP_CX_ZERO jecxz
|
||||
#define ADDR_TYPE .long
|
||||
#define ADDR_SIZE_BYTES 4
|
||||
#define STACK_REG %esp
|
||||
|
@ -422,13 +424,14 @@ ext_load_io##rtype: ;\
|
|||
ret ;\
|
||||
;\
|
||||
ext_load_rom##rtype: ;\
|
||||
mov %eax, %ecx /* ecx = address */ ;\
|
||||
shr $15, %ecx /* ecx = address >> 15 */ ;\
|
||||
mov %eax, %edx /* edx = address */ ;\
|
||||
shr $15, %edx /* edx = address >> 15 */ ;\
|
||||
/* Read rdmap pointer */ ;\
|
||||
mov RDMAP_OFF(REG_BASE, FULLREG(cx), ADDR_SIZE_BYTES), FULLREG(dx) ;\
|
||||
mov %eax, %ecx /* ecx = address */ ;\
|
||||
and $0x7FFF, %ecx /* ecx = address LSB */ ;\
|
||||
movop (FULLREG(dx), FULLREG(cx)), %eax /* Read mem */ ;\
|
||||
mov RDMAP_OFF(REG_BASE, FULLREG(dx), ADDR_SIZE_BYTES), FULLREG(cx) ;\
|
||||
JUMP_CX_ZERO ext_load_slow##rtype /* page not loaded, slow */ ;\
|
||||
mov %eax, %edx /* edx = address */ ;\
|
||||
and $0x7FFF, %edx /* edx = address LSB */ ;\
|
||||
movop (FULLREG(cx), FULLREG(dx)), %eax /* Read mem */ ;\
|
||||
ret ;\
|
||||
;\
|
||||
ext_load_slow##rtype: ;\
|
||||
|
|
Loading…
Reference in New Issue