Fix ROM swapping capabilities

This fixes ROM swapping for x86/64, arm32 and arm64. On top of that it
improves speed by removing unnecessary slow paths on small ROMs for
arm32 and mips. If the ROM can fit in RAM, it will emit more efficient
code that assumes the ROM is fully loaded.

For low-memory Linux platforms it would be better to mmap the ROM file
instead: that way the OS would transparently handle page swapping, which
is perhaps faster. I will investigate and follow up on this in a separate
commit.
This commit is contained in:
David Guillen Fandos 2023-03-03 20:59:08 +01:00
parent 42c6bb2761
commit 541adc9e1c
11 changed files with 109 additions and 60 deletions

View File

@ -1869,7 +1869,7 @@ extern void* ldst_handler_functions[16*4 + 17*6];
extern void* ldst_lookup_tables[16*4 + 17*6];
void init_emitter() {
void init_emitter(bool must_swap) {
rom_cache_watermark = INITIAL_ROM_WATERMARK;
init_bios_hooks();

View File

@ -443,6 +443,7 @@ ld_rdmap_##load_type: ;\
lsr w4, w0, #15 /* Each block is 32KB */;\
add x3, reg_base, #(RDMAP_OFF) ;\
ldr x4, [x3, x4, lsl #3] /* x4 = table pointer */;\
cbz x4, ld_slow_##load_type /* not mapped, go slow */ ;\
and w0, w0, #(0x7fff) /* 32KB pages */;\
ldop w0, [x4, x0] /* load actual value */;\
ret ;\

View File

@ -1981,14 +1981,24 @@ void *div6, *divarm7;
generate_indirect_branch_no_cycle_update(type) \
extern u32 ldst_handler_functions[9][17];
extern u32 ldst_lookup_tables[9][17];
extern u32 st_handler_functions[4][17];
extern u32 ld_handler_functions[5][17];
extern u32 ld_swap_handler_functions[5][17];
void init_emitter(void) {
// Tables used by the memory handlers (placed near reg_base)
extern u32 ld_lookup_tables[5][17];
extern u32 st_lookup_tables[4][17];
void init_emitter(bool must_swap) {
int i;
// Generate handler table
memcpy(ldst_lookup_tables, ldst_handler_functions, sizeof(ldst_lookup_tables));
memcpy(st_lookup_tables, st_handler_functions, sizeof(st_lookup_tables));
// Issue faster paths if swapping is not required
if (must_swap)
memcpy(ld_lookup_tables, ld_swap_handler_functions, sizeof(ld_lookup_tables));
else
memcpy(ld_lookup_tables, ld_handler_functions, sizeof(ld_lookup_tables));
rom_cache_watermark = INITIAL_ROM_WATERMARK;
u8 *translation_ptr = (u8*)&rom_translation_cache[0];

View File

@ -803,7 +803,20 @@ ld_oamram_##load_type: /* OAM RAM area */;\
exec_ld_op_##load_type(10) /* Clear upper bits (10 LSB) */;\
add pc, lr, #4 ;\
;\
/* ROM area (or VRAM): uses generic memory handlers */ ;\
/* ROM area: uses generic memory handlers to handle swapping */ ;\
ld_rdmap_slow_##load_type: ;\
save_flags() ;\
add r2, reg_base, #RDMAP_OFF /* r2 = memory_map_read */;\
mov r1, r0, lsr #15 /* r1 = page index of address */;\
ldr r2, [r2, r1, lsl #2] /* r2 = base addr */;\
cmp r2, #0 ;\
beq 9f /* Page miss, go slow */;\
;\
exec_ld_op_##load_type(15) /* Pages are 32KB big */;\
restore_flags() ;\
add pc, lr, #4 ;\
;\
/* ROM/VRAM area: uses generic memory handlers, assumes is mapped */ ;\
ld_rdmap_##load_type: ;\
add r2, reg_base, #RDMAP_OFF /* r2 = memory_map_read */;\
mov r1, r0, lsr #15 /* r1 = page index of address */;\
@ -815,6 +828,7 @@ ld_rdmap_##load_type: ;\
/* Slow load path, for open/unmapped loads */;\
ld_slow_##load_type: ;\
save_flags() ;\
9: ;\
ldr r1, [lr] /* r1 = PC */;\
10: ;\
str r1, [reg_base, #REG_PC] /* update PC */;\
@ -823,7 +837,7 @@ ld_slow_##load_type: ;\
add pc, lr, #4 /* return */;\
#define load_table_gen(load_type) ;\
#define load_table_gen(load_type, rdmapfn) ;\
.long ld_slow_##load_type /* -1 (for regions above F) */;\
.long ld_bios_##load_type /* 0 BIOS */;\
.long ld_slow_##load_type /* 1 Bad region */;\
@ -833,11 +847,11 @@ ld_slow_##load_type: ;\
.long ld_palram_##load_type /* 5 Palette RAM, via map */;\
.long ld_rdmap_##load_type /* 6 VRAM area */;\
.long ld_oamram_##load_type /* 7 OAM RAM */;\
.long ld_rdmap_##load_type /* 8 ROM, via map */;\
.long ld_rdmap_##load_type /* 9 ROM, via map */;\
.long ld_rdmap_##load_type /* A ROM, via map */;\
.long ld_rdmap_##load_type /* B ROM, via map */;\
.long ld_rdmap_##load_type /* C ROM, via map */;\
.long ld_##rdmapfn##_##load_type /* 8 ROM, via map */;\
.long ld_##rdmapfn##_##load_type /* 9 ROM, via map */;\
.long ld_##rdmapfn##_##load_type /* A ROM, via map */;\
.long ld_##rdmapfn##_##load_type /* B ROM, via map */;\
.long ld_##rdmapfn##_##load_type /* C ROM, via map */;\
.long ld_slow_##load_type /* D ROM or EEPROM/FLASH */;\
.long ld_slow_##load_type /* E EEPROM/FLASH */;\
.long ld_slow_##load_type /* F Bad region */;\
@ -853,16 +867,23 @@ execute_load_builder(u32, 2, read_memory32 )
.data
.align 4
defsymbl(ldst_handler_functions)
defsymbl(st_handler_functions)
store_lookup_table(8)
store_lookup_table(16)
store_lookup_table(32)
store_lookup_table(32_safe)
load_table_gen(u8)
load_table_gen(s8)
load_table_gen(u16)
load_table_gen(s16)
load_table_gen(u32)
defsymbl(ld_handler_functions)
load_table_gen(u8, rdmap)
load_table_gen(s8, rdmap)
load_table_gen(u16, rdmap)
load_table_gen(s16, rdmap)
load_table_gen(u32, rdmap)
defsymbl(ld_swap_handler_functions)
load_table_gen(u8, rdmap_slow)
load_table_gen(s8, rdmap_slow)
load_table_gen(u16, rdmap_slow)
load_table_gen(s16, rdmap_slow)
load_table_gen(u32, rdmap_slow)
.bss
.align 4
@ -878,8 +899,9 @@ defsymbl(reg)
defsymbl(spsr)
.space 24
@ Place lookup tables here for easy access via base_reg too
defsymbl(ldst_lookup_tables)
defsymbl(st_lookup_tables)
.space 4*17*4 @ store
defsymbl(ld_lookup_tables)
.space 5*17*4 @ loads
.space 132 @ Padding for alignment
defsymbl(reg_mode)

2
cpu.h
View File

@ -164,7 +164,7 @@ void flush_translation_cache_rom(void);
void flush_translation_cache_ram(void);
void dump_translation_cache(void);
void init_caches(void);
void init_emitter(void);
void init_emitter(bool);
void init_bios_hooks(void);
extern u32 reg_mode[7][7];

View File

@ -318,6 +318,8 @@ dma_transfer_type dma[4];
u8 *gamepak_buffers[32]; /* Pointers to malloc'ed blocks, each gamepak_buffer_blocksize bytes */
u32 gamepak_buffer_count; /* Number of allocated blocks in gamepak_buffers (value between 1 and 32) */
u32 gamepak_size; /* Size of the ROM in bytes */
// Size in bytes of one gamepak buffer block. Buffers are allocated, and the
// ROM file is read, in chunks of this size (1MB).
const unsigned gamepak_buffer_blocksize = 1024*1024;
// LRU queue with the loaded blocks and what they map to
struct {
@ -2965,7 +2967,7 @@ void init_gamepak_buffer(void)
gamepak_buffer_count = 0;
while (gamepak_buffer_count < ROM_BUFFER_SIZE)
{
void *ptr = malloc(1024*1024);
void *ptr = malloc(gamepak_buffer_blocksize);
if (!ptr)
break;
gamepak_buffers[gamepak_buffer_count++] = (u8*)ptr;
@ -2982,6 +2984,13 @@ void init_gamepak_buffer(void)
gamepak_lru_tail = 32 * gamepak_buffer_count - 1;
}
bool gamepak_must_swap(void)
{
  /* True when the ROM is larger than the combined capacity of the allocated
   * gamepak buffers (buffer count times block size). In that case the whole
   * ROM cannot be resident at once and the device must swap.
   * NOTE(review): the result depends on gamepak_size at call time; confirm
   * that callers only rely on it once the ROM size is known. */
  return gamepak_size > gamepak_buffer_count * gamepak_buffer_blocksize;
}
void init_memory(void)
{
u32 map_offset = 0;
@ -3182,7 +3191,7 @@ static s32 load_gamepak_raw(const char *name)
gamepak_size = (gamepak_size + 0x7FFF) & ~0x7FFF;
// Load stuff in 1MB chunks
u32 buf_blocks = (gamepak_size + 1024*1024-1) / (1024*1024);
u32 buf_blocks = (gamepak_size + gamepak_buffer_blocksize-1) / (gamepak_buffer_blocksize);
u32 rom_blocks = gamepak_size >> 15;
u32 ldblks = buf_blocks < gamepak_buffer_count ?
buf_blocks : gamepak_buffer_count;
@ -3194,7 +3203,7 @@ static s32 load_gamepak_raw(const char *name)
for (i = 0; i < ldblks; i++)
{
// Load 1MB chunk and map it
filestream_read(gamepak_file_large, gamepak_buffers[i], 1024*1024);
filestream_read(gamepak_file_large, gamepak_buffers[i], gamepak_buffer_blocksize);
for (j = 0; j < 32 && i*32 + j < rom_blocks; j++)
{
u32 phyn = i*32 + j;

View File

@ -200,6 +200,7 @@ s32 load_bios(char *name);
void update_backup(void);
void init_memory(void);
void init_gamepak_buffer(void);
bool gamepak_must_swap(void);
void memory_term(void);
u8 *load_gamepak_page(u32 physical_index);

2
main.c
View File

@ -102,7 +102,7 @@ void init_main(void)
video_count = 960;
#ifdef HAVE_DYNAREC
init_emitter();
init_emitter(gamepak_must_swap());
init_caches();
#endif
}

View File

@ -2099,7 +2099,7 @@ typedef struct {
static void emit_pmemld_stub(
unsigned memop_number, const t_stub_meminfo *meminfo,
bool signext, unsigned size,
unsigned alignment, bool aligned,
unsigned alignment, bool aligned, bool must_swap,
u8 **tr_ptr)
{
u8 *translation_ptr = *tr_ptr;
@ -2158,28 +2158,31 @@ static void emit_pmemld_stub(
}
if (region >= 8 && region <= 12) {
u8 *jmppatch;
// ROM area: might need to load the ROM on-demand
mips_emit_srl(reg_rv, reg_a0, 15); // 32KB page number
mips_emit_sll(reg_rv, reg_rv, 2); // (word indexed)
mips_emit_addu(reg_rv, reg_rv, reg_base); // base + offset
mips_emit_addu(reg_rv, reg_rv, reg_base); // base + offset
mips_emit_lw(reg_rv, reg_rv, 0x8000); // base[offset-0x8000] is readmap ptr
mips_emit_andi(reg_temp, reg_a0, memmask); // Get the lowest 15 bits [can go in delay slot]
mips_emit_lw(reg_rv, reg_rv, 0x8000); // base[offset-0x8000]
mips_emit_b_filler(bne, reg_rv, reg_zero, jmppatch); // if not null, can skip load page
mips_emit_andi(reg_temp, reg_a0, memmask); // Get the lowest 15 bits [delay]
if (must_swap) { // Do not emit if the ROM is fully loaded, save some cycles
u8 *jmppatch;
mips_emit_b_filler(bne, reg_rv, reg_zero, jmppatch); // if not null, can skip load page
generate_swap_delay();
// This code call the C routine to map the relevant ROM page
emit_save_regs(aligned);
mips_emit_sw(mips_reg_ra, reg_base, ReOff_SaveR3);
extract_bits(reg_a0, reg_a0, 15, 10); // a0 = (addr >> 15) & 0x3ff
genccall(&load_gamepak_page);
mips_emit_sw(reg_temp, reg_base, ReOff_SaveR1);
// This code call the C routine to map the relevant ROM page
emit_save_regs(aligned);
mips_emit_sw(mips_reg_ra, reg_base, ReOff_SaveR3);
extract_bits(reg_a0, reg_a0, 15, 10); // a0 = (addr >> 15) & 0x3ff
genccall(&load_gamepak_page); // Returns valid pointer in rv
mips_emit_sw(reg_temp, reg_base, ReOff_SaveR1);
mips_emit_lw(reg_temp, reg_base, ReOff_SaveR1);
emit_restore_regs(aligned);
mips_emit_lw(mips_reg_ra, reg_base, ReOff_SaveR3);
mips_emit_lw(reg_temp, reg_base, ReOff_SaveR1);
emit_restore_regs(aligned);
mips_emit_lw(mips_reg_ra, reg_base, ReOff_SaveR3);
generate_branch_patch_conditional(jmppatch, translation_ptr);
generate_branch_patch_conditional(jmppatch - 4, translation_ptr);
}
// Now we can proceed to load, place addr in the right register
mips_emit_addu(reg_rv, reg_rv, reg_temp);
} else if (region == 14) {
@ -2620,7 +2623,7 @@ typedef void (*sthldr_t)(
typedef void (*ldhldr_t)(
unsigned memop_number, const t_stub_meminfo *meminfo,
bool signext, unsigned size,
unsigned alignment, bool aligned,
unsigned alignment, bool aligned, bool must_swap,
u8 **tr_ptr);
// Generates a patch handler for a given access size
@ -2684,7 +2687,7 @@ static void emit_phand(
// - memop patcher: Patches a memop whenever it accesses the wrong mem region
// - mem stubs: There's stubs for load & store, and every memory region
// and possible operand size and misaligment (+sign extensions)
void init_emitter() {
void init_emitter(bool must_swap) {
int i;
// Initialize memory to a debuggable state
rom_cache_watermark = INITIAL_ROM_WATERMARK;
@ -2756,20 +2759,20 @@ void init_emitter() {
for (i = 0; i < sizeof(ldinfo)/sizeof(ldinfo[0]); i++) {
ldhldr_t handler = (ldhldr_t)ldinfo[i].emitter;
/* region info signext sz al isaligned */
handler(0, &ldinfo[i], false, 0, 0, false, &translation_ptr); // ld u8
handler(1, &ldinfo[i], true, 0, 0, false, &translation_ptr); // ld s8
handler(0, &ldinfo[i], false, 0, 0, false, must_swap, &translation_ptr); // ld u8
handler(1, &ldinfo[i], true, 0, 0, false, must_swap, &translation_ptr); // ld s8
handler(2, &ldinfo[i], false, 1, 0, false, &translation_ptr); // ld u16
handler(3, &ldinfo[i], false, 1, 1, false, &translation_ptr); // ld u16u1
handler(4, &ldinfo[i], true, 1, 0, false, &translation_ptr); // ld s16
handler(5, &ldinfo[i], true, 1, 1, false, &translation_ptr); // ld s16u1
handler(2, &ldinfo[i], false, 1, 0, false, must_swap, &translation_ptr); // ld u16
handler(3, &ldinfo[i], false, 1, 1, false, must_swap, &translation_ptr); // ld u16u1
handler(4, &ldinfo[i], true, 1, 0, false, must_swap, &translation_ptr); // ld s16
handler(5, &ldinfo[i], true, 1, 1, false, must_swap, &translation_ptr); // ld s16u1
handler(6, &ldinfo[i], false, 2, 0, false, &translation_ptr); // ld u32
handler(7, &ldinfo[i], false, 2, 1, false, &translation_ptr); // ld u32u1
handler(8, &ldinfo[i], false, 2, 2, false, &translation_ptr); // ld u32u2
handler(9, &ldinfo[i], false, 2, 3, false, &translation_ptr); // ld u32u3
handler(6, &ldinfo[i], false, 2, 0, false, must_swap, &translation_ptr); // ld u32
handler(7, &ldinfo[i], false, 2, 1, false, must_swap, &translation_ptr); // ld u32u1
handler(8, &ldinfo[i], false, 2, 2, false, must_swap, &translation_ptr); // ld u32u2
handler(9, &ldinfo[i], false, 2, 3, false, must_swap, &translation_ptr); // ld u32u3
handler(10,&ldinfo[i], false, 2, 0, true, &translation_ptr); // aligned ld u32
handler(10,&ldinfo[i], false, 2, 0, true, must_swap, &translation_ptr); // aligned ld u32
}
const t_stub_meminfo stinfo [] = {

View File

@ -2260,7 +2260,7 @@ static void function_cc execute_swi(u32 pc)
extern void* x86_table_data[9][16];
extern void* x86_table_info[9][16];
void init_emitter(void) {
void init_emitter(bool must_swap) {
memcpy(x86_table_info, x86_table_data, sizeof(x86_table_data));
rom_cache_watermark = INITIAL_ROM_WATERMARK;

View File

@ -38,6 +38,7 @@ _##symbol:
// otherwise), where we use our own convention call. However calls to C code must
// follow the calling convention. x86 is built with regparm=2 to avoid stack usage.
#if defined(__x86_64__) || defined(__amd64__)
#define JUMP_CX_ZERO jrcxz
#define ADDR_TYPE .quad
#define ADDR_SIZE_BYTES 8
#define STACK_REG %rsp
@ -62,6 +63,7 @@ _##symbol:
#endif
#define SETUP_ARGS mov %eax, CARG1_REG; mov %edx, CARG2_REG;
#else
#define JUMP_CX_ZERO jecxz
#define ADDR_TYPE .long
#define ADDR_SIZE_BYTES 4
#define STACK_REG %esp
@ -422,13 +424,14 @@ ext_load_io##rtype: ;\
ret ;\
;\
ext_load_rom##rtype: ;\
mov %eax, %ecx /* ecx = address */ ;\
shr $15, %ecx /* ecx = address >> 15 */ ;\
mov %eax, %edx /* edx = address */ ;\
shr $15, %edx /* edx = address >> 15 */ ;\
/* Read rdmap pointer */ ;\
mov RDMAP_OFF(REG_BASE, FULLREG(cx), ADDR_SIZE_BYTES), FULLREG(dx) ;\
mov %eax, %ecx /* ecx = address */ ;\
and $0x7FFF, %ecx /* ecx = address LSB */ ;\
movop (FULLREG(dx), FULLREG(cx)), %eax /* Read mem */ ;\
mov RDMAP_OFF(REG_BASE, FULLREG(dx), ADDR_SIZE_BYTES), FULLREG(cx) ;\
JUMP_CX_ZERO ext_load_slow##rtype /* page not loaded, slow */ ;\
mov %eax, %edx /* edx = address */ ;\
and $0x7FFF, %edx /* edx = address LSB */ ;\
movop (FULLREG(cx), FULLREG(dx)), %eax /* Read mem */ ;\
ret ;\
;\
ext_load_slow##rtype: ;\