diff --git a/arm/arm64_emit.h b/arm/arm64_emit.h index 4b4cb7a..6b4137c 100644 --- a/arm/arm64_emit.h +++ b/arm/arm64_emit.h @@ -1869,7 +1869,7 @@ extern void* ldst_handler_functions[16*4 + 17*6]; extern void* ldst_lookup_tables[16*4 + 17*6]; -void init_emitter() { +void init_emitter(bool must_swap) { rom_cache_watermark = INITIAL_ROM_WATERMARK; init_bios_hooks(); diff --git a/arm/arm64_stub.S b/arm/arm64_stub.S index fb66ece..38414fc 100644 --- a/arm/arm64_stub.S +++ b/arm/arm64_stub.S @@ -443,6 +443,7 @@ ld_rdmap_##load_type: ;\ lsr w4, w0, #15 /* Each block is 32KB */;\ add x3, reg_base, #(RDMAP_OFF) ;\ ldr x4, [x3, x4, lsl #3] /* x4 = table pointer */;\ + cbz x4, ld_slow_##load_type /* not mapped, go slow */ ;\ and w0, w0, #(0x7fff) /* 32KB pages */;\ ldop w0, [x4, x0] /* load actual value */;\ ret ;\ diff --git a/arm/arm_emit.h b/arm/arm_emit.h index 0b08a8d..ab242d0 100644 --- a/arm/arm_emit.h +++ b/arm/arm_emit.h @@ -1981,14 +1981,24 @@ void *div6, *divarm7; generate_indirect_branch_no_cycle_update(type) \ -extern u32 ldst_handler_functions[9][17]; -extern u32 ldst_lookup_tables[9][17]; +extern u32 st_handler_functions[4][17]; +extern u32 ld_handler_functions[5][17]; +extern u32 ld_swap_handler_functions[5][17]; -void init_emitter(void) { +// Tables used by the memory handlers (placed near reg_base) +extern u32 ld_lookup_tables[5][17]; +extern u32 st_lookup_tables[4][17]; + +void init_emitter(bool must_swap) { int i; // Generate handler table - memcpy(ldst_lookup_tables, ldst_handler_functions, sizeof(ldst_lookup_tables)); + memcpy(st_lookup_tables, st_handler_functions, sizeof(st_lookup_tables)); + // Issue faster paths if swapping is not required + if (must_swap) + memcpy(ld_lookup_tables, ld_swap_handler_functions, sizeof(ld_lookup_tables)); + else + memcpy(ld_lookup_tables, ld_handler_functions, sizeof(ld_lookup_tables)); rom_cache_watermark = INITIAL_ROM_WATERMARK; u8 *translation_ptr = (u8*)&rom_translation_cache[0]; diff --git a/arm/arm_stub.S 
b/arm/arm_stub.S index 2248bdb..a5beeda 100644 --- a/arm/arm_stub.S +++ b/arm/arm_stub.S @@ -803,7 +803,20 @@ ld_oamram_##load_type: /* OAM RAM area */;\ exec_ld_op_##load_type(10) /* Clear upper bits (10 LSB) */;\ add pc, lr, #4 ;\ ;\ -/* ROM area (or VRAM): uses generic memory handlers */ ;\ +/* ROM area: uses generic memory handlers to handle swapping */ ;\ +ld_rdmap_slow_##load_type: ;\ + save_flags() ;\ + add r2, reg_base, #RDMAP_OFF /* r2 = memory_map_read */;\ + mov r1, r0, lsr #15 /* r1 = page index of address */;\ + ldr r2, [r2, r1, lsl #2] /* r2 = base addr */;\ + cmp r2, #0 ;\ + beq 9f /* Page miss, go slow */;\ + ;\ + exec_ld_op_##load_type(15) /* Pages are 32KB big */;\ + restore_flags() ;\ + add pc, lr, #4 ;\ + ;\ +/* ROM/VRAM area: uses generic memory handlers, assumes it is mapped */ ;\ ld_rdmap_##load_type: ;\ add r2, reg_base, #RDMAP_OFF /* r2 = memory_map_read */;\ mov r1, r0, lsr #15 /* r1 = page index of address */;\ @@ -815,6 +828,7 @@ ld_rdmap_##load_type: ;\ /* Slow load path, for open/unmapped loads */;\ ld_slow_##load_type: ;\ save_flags() ;\ +9: ;\ ldr r1, [lr] /* r1 = PC */;\ 10: ;\ str r1, [reg_base, #REG_PC] /* update PC */;\ @@ -823,7 +837,7 @@ ld_slow_##load_type: ;\ add pc, lr, #4 /* return */;\ -#define load_table_gen(load_type) ;\ +#define load_table_gen(load_type, rdmapfn) ;\ .long ld_slow_##load_type /* -1 (for regions above F) */;\ .long ld_bios_##load_type /* 0 BIOS */;\ .long ld_slow_##load_type /* 1 Bad region */;\ @@ -833,11 +847,11 @@ ld_slow_##load_type: ;\ .long ld_palram_##load_type /* 5 Palette RAM, via map */;\ .long ld_rdmap_##load_type /* 6 VRAM area */;\ .long ld_oamram_##load_type /* 7 OAM RAM */;\ - .long ld_rdmap_##load_type /* 8 ROM, via map */;\ - .long ld_rdmap_##load_type /* 9 ROM, via map */;\ - .long ld_rdmap_##load_type /* A ROM, via map */;\ - .long ld_rdmap_##load_type /* B ROM, via map */;\ - .long ld_rdmap_##load_type /* C ROM, via map */;\ + .long ld_##rdmapfn##_##load_type /* 8 ROM, via map */;\ + 
.long ld_##rdmapfn##_##load_type /* 9 ROM, via map */;\ + .long ld_##rdmapfn##_##load_type /* A ROM, via map */;\ + .long ld_##rdmapfn##_##load_type /* B ROM, via map */;\ + .long ld_##rdmapfn##_##load_type /* C ROM, via map */;\ .long ld_slow_##load_type /* D ROM or EEPROM/FLASH */;\ .long ld_slow_##load_type /* E EEPROM/FLASH */;\ .long ld_slow_##load_type /* F Bad region */;\ @@ -853,16 +867,23 @@ execute_load_builder(u32, 2, read_memory32 ) .data .align 4 -defsymbl(ldst_handler_functions) +defsymbl(st_handler_functions) store_lookup_table(8) store_lookup_table(16) store_lookup_table(32) store_lookup_table(32_safe) - load_table_gen(u8) - load_table_gen(s8) - load_table_gen(u16) - load_table_gen(s16) - load_table_gen(u32) +defsymbl(ld_handler_functions) + load_table_gen(u8, rdmap) + load_table_gen(s8, rdmap) + load_table_gen(u16, rdmap) + load_table_gen(s16, rdmap) + load_table_gen(u32, rdmap) +defsymbl(ld_swap_handler_functions) + load_table_gen(u8, rdmap_slow) + load_table_gen(s8, rdmap_slow) + load_table_gen(u16, rdmap_slow) + load_table_gen(s16, rdmap_slow) + load_table_gen(u32, rdmap_slow) .bss .align 4 @@ -878,8 +899,9 @@ defsymbl(reg) defsymbl(spsr) .space 24 @ Place lookup tables here for easy access via base_reg too -defsymbl(ldst_lookup_tables) +defsymbl(st_lookup_tables) .space 4*17*4 @ store +defsymbl(ld_lookup_tables) .space 5*17*4 @ loads .space 132 @ Padding for alignment defsymbl(reg_mode) diff --git a/cpu.h b/cpu.h index 1c641c3..00aeddf 100644 --- a/cpu.h +++ b/cpu.h @@ -164,7 +164,7 @@ void flush_translation_cache_rom(void); void flush_translation_cache_ram(void); void dump_translation_cache(void); void init_caches(void); -void init_emitter(void); +void init_emitter(bool); void init_bios_hooks(void); extern u32 reg_mode[7][7]; diff --git a/gba_memory.c b/gba_memory.c index e8e9bda..10b3c7d 100644 --- a/gba_memory.c +++ b/gba_memory.c @@ -318,6 +318,8 @@ dma_transfer_type dma[4]; u8 *gamepak_buffers[32]; /* Pointers to malloc'ed blocks */ u32 
gamepak_buffer_count; /* Value between 1 and 32 */ u32 gamepak_size; /* Size of the ROM in bytes */ +// We allocate in 1MB chunks. +const unsigned gamepak_buffer_blocksize = 1024*1024; // LRU queue with the loaded blocks and what they map to struct { @@ -2965,7 +2967,7 @@ void init_gamepak_buffer(void) gamepak_buffer_count = 0; while (gamepak_buffer_count < ROM_BUFFER_SIZE) { - void *ptr = malloc(1024*1024); + void *ptr = malloc(gamepak_buffer_blocksize); if (!ptr) break; gamepak_buffers[gamepak_buffer_count++] = (u8*)ptr; @@ -2982,6 +2984,13 @@ void init_gamepak_buffer(void) gamepak_lru_tail = 32 * gamepak_buffer_count - 1; } +bool gamepak_must_swap(void) +{ + // Returns whether the current gamepak buffer is not big enough to hold + // the full gamepak ROM. In these cases the device must swap. + return gamepak_buffer_count * gamepak_buffer_blocksize < gamepak_size; +} + void init_memory(void) { u32 map_offset = 0; @@ -3182,7 +3191,7 @@ static s32 load_gamepak_raw(const char *name) gamepak_size = (gamepak_size + 0x7FFF) & ~0x7FFF; // Load stuff in 1MB chunks - u32 buf_blocks = (gamepak_size + 1024*1024-1) / (1024*1024); + u32 buf_blocks = (gamepak_size + gamepak_buffer_blocksize-1) / (gamepak_buffer_blocksize); u32 rom_blocks = gamepak_size >> 15; u32 ldblks = buf_blocks < gamepak_buffer_count ? 
buf_blocks : gamepak_buffer_count; @@ -3194,7 +3203,7 @@ static s32 load_gamepak_raw(const char *name) for (i = 0; i < ldblks; i++) { // Load 1MB chunk and map it - filestream_read(gamepak_file_large, gamepak_buffers[i], 1024*1024); + filestream_read(gamepak_file_large, gamepak_buffers[i], gamepak_buffer_blocksize); for (j = 0; j < 32 && i*32 + j < rom_blocks; j++) { u32 phyn = i*32 + j; diff --git a/gba_memory.h b/gba_memory.h index 8e5cf7e..865711a 100644 --- a/gba_memory.h +++ b/gba_memory.h @@ -200,6 +200,7 @@ s32 load_bios(char *name); void update_backup(void); void init_memory(void); void init_gamepak_buffer(void); +bool gamepak_must_swap(void); void memory_term(void); u8 *load_gamepak_page(u32 physical_index); diff --git a/main.c b/main.c index ac738b8..5b84953 100644 --- a/main.c +++ b/main.c @@ -102,7 +102,7 @@ void init_main(void) video_count = 960; #ifdef HAVE_DYNAREC - init_emitter(); + init_emitter(gamepak_must_swap()); init_caches(); #endif } diff --git a/mips/mips_emit.h b/mips/mips_emit.h index f9ce27c..4c7da99 100644 --- a/mips/mips_emit.h +++ b/mips/mips_emit.h @@ -2099,7 +2099,7 @@ typedef struct { static void emit_pmemld_stub( unsigned memop_number, const t_stub_meminfo *meminfo, bool signext, unsigned size, - unsigned alignment, bool aligned, + unsigned alignment, bool aligned, bool must_swap, u8 **tr_ptr) { u8 *translation_ptr = *tr_ptr; @@ -2158,28 +2158,31 @@ static void emit_pmemld_stub( } if (region >= 8 && region <= 12) { - u8 *jmppatch; // ROM area: might need to load the ROM on-demand mips_emit_srl(reg_rv, reg_a0, 15); // 32KB page number mips_emit_sll(reg_rv, reg_rv, 2); // (word indexed) - mips_emit_addu(reg_rv, reg_rv, reg_base); // base + offset + mips_emit_addu(reg_rv, reg_rv, reg_base); // base + offset + mips_emit_lw(reg_rv, reg_rv, 0x8000); // base[offset-0x8000] is readmap ptr + mips_emit_andi(reg_temp, reg_a0, memmask); // Get the lowest 15 bits [can go in delay slot] - mips_emit_lw(reg_rv, reg_rv, 0x8000); // 
base[offset-0x8000] - mips_emit_b_filler(bne, reg_rv, reg_zero, jmppatch); // if not null, can skip load page - mips_emit_andi(reg_temp, reg_a0, memmask); // Get the lowest 15 bits [delay] + if (must_swap) { // Do not emit if the ROM is fully loaded, save some cycles + u8 *jmppatch; + mips_emit_b_filler(bne, reg_rv, reg_zero, jmppatch); // if not null, can skip load page + generate_swap_delay(); - // This code call the C routine to map the relevant ROM page - emit_save_regs(aligned); - mips_emit_sw(mips_reg_ra, reg_base, ReOff_SaveR3); - extract_bits(reg_a0, reg_a0, 15, 10); // a0 = (addr >> 15) & 0x3ff - genccall(&load_gamepak_page); - mips_emit_sw(reg_temp, reg_base, ReOff_SaveR1); + // This code calls the C routine to map the relevant ROM page + emit_save_regs(aligned); + mips_emit_sw(mips_reg_ra, reg_base, ReOff_SaveR3); + extract_bits(reg_a0, reg_a0, 15, 10); // a0 = (addr >> 15) & 0x3ff + genccall(&load_gamepak_page); // Returns valid pointer in rv + mips_emit_sw(reg_temp, reg_base, ReOff_SaveR1); - mips_emit_lw(reg_temp, reg_base, ReOff_SaveR1); - emit_restore_regs(aligned); - mips_emit_lw(mips_reg_ra, reg_base, ReOff_SaveR3); + mips_emit_lw(reg_temp, reg_base, ReOff_SaveR1); + emit_restore_regs(aligned); + mips_emit_lw(mips_reg_ra, reg_base, ReOff_SaveR3); - generate_branch_patch_conditional(jmppatch, translation_ptr); + generate_branch_patch_conditional(jmppatch - 4, translation_ptr); + } // Now we can proceed to load, place addr in the right register mips_emit_addu(reg_rv, reg_rv, reg_temp); } else if (region == 14) { @@ -2620,7 +2623,7 @@ typedef void (*sthldr_t)( typedef void (*ldhldr_t)( unsigned memop_number, const t_stub_meminfo *meminfo, bool signext, unsigned size, - unsigned alignment, bool aligned, + unsigned alignment, bool aligned, bool must_swap, u8 **tr_ptr); // Generates a patch handler for a given access size @@ -2684,7 +2687,7 @@ static void emit_phand( // - memop patcher: Patches a memop whenever it accesses the wrong mem region // - mem 
stubs: There's stubs for load & store, and every memory region // and possible operand size and misaligment (+sign extensions) -void init_emitter() { +void init_emitter(bool must_swap) { int i; // Initialize memory to a debuggable state rom_cache_watermark = INITIAL_ROM_WATERMARK; @@ -2756,20 +2759,20 @@ void init_emitter() { for (i = 0; i < sizeof(ldinfo)/sizeof(ldinfo[0]); i++) { ldhldr_t handler = (ldhldr_t)ldinfo[i].emitter; /* region info signext sz al isaligned */ - handler(0, &ldinfo[i], false, 0, 0, false, &translation_ptr); // ld u8 - handler(1, &ldinfo[i], true, 0, 0, false, &translation_ptr); // ld s8 + handler(0, &ldinfo[i], false, 0, 0, false, must_swap, &translation_ptr); // ld u8 + handler(1, &ldinfo[i], true, 0, 0, false, must_swap, &translation_ptr); // ld s8 - handler(2, &ldinfo[i], false, 1, 0, false, &translation_ptr); // ld u16 - handler(3, &ldinfo[i], false, 1, 1, false, &translation_ptr); // ld u16u1 - handler(4, &ldinfo[i], true, 1, 0, false, &translation_ptr); // ld s16 - handler(5, &ldinfo[i], true, 1, 1, false, &translation_ptr); // ld s16u1 + handler(2, &ldinfo[i], false, 1, 0, false, must_swap, &translation_ptr); // ld u16 + handler(3, &ldinfo[i], false, 1, 1, false, must_swap, &translation_ptr); // ld u16u1 + handler(4, &ldinfo[i], true, 1, 0, false, must_swap, &translation_ptr); // ld s16 + handler(5, &ldinfo[i], true, 1, 1, false, must_swap, &translation_ptr); // ld s16u1 - handler(6, &ldinfo[i], false, 2, 0, false, &translation_ptr); // ld u32 - handler(7, &ldinfo[i], false, 2, 1, false, &translation_ptr); // ld u32u1 - handler(8, &ldinfo[i], false, 2, 2, false, &translation_ptr); // ld u32u2 - handler(9, &ldinfo[i], false, 2, 3, false, &translation_ptr); // ld u32u3 + handler(6, &ldinfo[i], false, 2, 0, false, must_swap, &translation_ptr); // ld u32 + handler(7, &ldinfo[i], false, 2, 1, false, must_swap, &translation_ptr); // ld u32u1 + handler(8, &ldinfo[i], false, 2, 2, false, must_swap, &translation_ptr); // ld u32u2 + 
handler(9, &ldinfo[i], false, 2, 3, false, must_swap, &translation_ptr); // ld u32u3 - handler(10,&ldinfo[i], false, 2, 0, true, &translation_ptr); // aligned ld u32 + handler(10,&ldinfo[i], false, 2, 0, true, must_swap, &translation_ptr); // aligned ld u32 } const t_stub_meminfo stinfo [] = { diff --git a/x86/x86_emit.h b/x86/x86_emit.h index 0df117e..ebe4b28 100644 --- a/x86/x86_emit.h +++ b/x86/x86_emit.h @@ -2260,7 +2260,7 @@ static void function_cc execute_swi(u32 pc) extern void* x86_table_data[9][16]; extern void* x86_table_info[9][16]; -void init_emitter(void) { +void init_emitter(bool must_swap) { memcpy(x86_table_info, x86_table_data, sizeof(x86_table_data)); rom_cache_watermark = INITIAL_ROM_WATERMARK; diff --git a/x86/x86_stub.S b/x86/x86_stub.S index 57c4eab..90a156b 100644 --- a/x86/x86_stub.S +++ b/x86/x86_stub.S @@ -38,6 +38,7 @@ _##symbol: // otherwise), where we use our own convention call. However calls to C code must // follow the calling convention. x86 is built with regparm=2 to avoid stack usage. 
#if defined(__x86_64__) || defined(__amd64__) + #define JUMP_CX_ZERO jrcxz #define ADDR_TYPE .quad #define ADDR_SIZE_BYTES 8 #define STACK_REG %rsp @@ -62,6 +63,7 @@ _##symbol: #endif #define SETUP_ARGS mov %eax, CARG1_REG; mov %edx, CARG2_REG; #else + #define JUMP_CX_ZERO jecxz #define ADDR_TYPE .long #define ADDR_SIZE_BYTES 4 #define STACK_REG %esp @@ -422,13 +424,14 @@ ext_load_io##rtype: ;\ ret ;\ ;\ ext_load_rom##rtype: ;\ - mov %eax, %ecx /* ecx = address */ ;\ - shr $15, %ecx /* ecx = address >> 15 */ ;\ + mov %eax, %edx /* edx = address */ ;\ + shr $15, %edx /* edx = address >> 15 */ ;\ /* Read rdmap pointer */ ;\ - mov RDMAP_OFF(REG_BASE, FULLREG(cx), ADDR_SIZE_BYTES), FULLREG(dx) ;\ - mov %eax, %ecx /* ecx = address */ ;\ - and $0x7FFF, %ecx /* ecx = address LSB */ ;\ - movop (FULLREG(dx), FULLREG(cx)), %eax /* Read mem */ ;\ + mov RDMAP_OFF(REG_BASE, FULLREG(dx), ADDR_SIZE_BYTES), FULLREG(cx) ;\ + JUMP_CX_ZERO ext_load_slow##rtype /* page not loaded, slow */ ;\ + mov %eax, %edx /* edx = address */ ;\ + and $0x7FFF, %edx /* edx = address LSB */ ;\ + movop (FULLREG(cx), FULLREG(dx)), %eax /* Read mem */ ;\ ret ;\ ;\ ext_load_slow##rtype: ;\