Rework ram block ptrs to remove second indirection table.

This removes ram_block_ptrs and encodes the pointer directly in the
block tag. Saves ~256KB at no performance cost.
The drawback is that this limits the RAM cache size to 512KB (we were
using 768KB before). This should not be a problem, since most games use
less than 32KB of cache anyway.

Fixed ARM routines accordingly.
This commit is contained in:
David Guillen Fandos 2021-07-08 21:29:48 +02:00
parent aded681de2
commit 3144d9e277
3 changed files with 66 additions and 48 deletions

View File

@ -179,6 +179,7 @@ execute_pc_##mode: ;\
.long 3f /* F Bad region */;\
;\
3: ;\
/* r0 already contains the PC to jump to */ ;\
call_c_function(block_lookup_address_##mode) ;\
restore_flags() ;\
bx r0 ;\
@ -187,23 +188,23 @@ execute_pc_##mode: ;\
mov r2, r0, lsl #14 /* addr &= 0x3ffff */;\
mov r2, r2, lsr #14 ;\
ldrh r2, [r1, r2] /* Load half word there */;\
ldr r1, =(ram_block_ptrs) ;\
ldr r1, [r1, r2, lsl #2] /* Pointer to the cache */;\
cmp r1, #0 /* NULL means not translated */;\
beq 3b /* Need to translate */;\
ldr r1, =(ram_translation_cache) ;\
subs r2, #0x0104 /* Check valid tag + rebase */;\
ble 3b /* Data/non-entry code -> transl.*/;\
mov r2, r2, lsr #1 /* Ignores LSB */;\
restore_flags() ;\
bx r1 ;\
add pc, r1, r2, lsl #4 /* Offset is 16 byte aligned */;\
2: ;\
ldr r1, =(iwram) /* Load base addr */;\
mov r2, r0, lsl #17 /* addr &= 0x7fff */;\
mov r2, r2, lsr #17 ;\
ldrh r2, [r1, r2] /* Load half word there */;\
ldr r1, =(ram_block_ptrs) ;\
ldr r1, [r1, r2, lsl #2] /* Pointer to the cache */;\
cmp r1, #0 /* NULL means not translated */;\
beq 3b /* Need to translate */;\
ldr r1, =(ram_translation_cache) ;\
subs r2, #0x0104 /* Check valid tag + rebase */;\
ble 3b /* Data/non-entry code -> transl.*/;\
mov r2, r2, lsr #1 /* Ignores LSB */;\
restore_flags() ;\
bx r1 ;\
add pc, r1, r2, lsl #4 /* Offset is 16 byte aligned */;\
.size arm_indirect_branch_##mode, .-arm_indirect_branch_##mode
execute_pc_builder(arm, 0x3)

View File

@ -2440,8 +2440,27 @@ void translate_icache_sync() {
block_data[block_data_position].flag_data = flag_status; \
} \
u8 *ram_block_ptrs[1024 * 64];
u32 ram_block_tag_top = 0x0101;
// I/EWRAM memory tagging
// Code emitted in the RAM cache has tags (16 bit values) in the mirror tag ram
// that indicate that the address contains code. The following values are used:
// 0x0000 : this is just data (never translated)
// 0x00XX : not used (since first byte is zero)
// 0x0101 : this is code that is not the start of a translated block
// 0x0105 : first possible tag (0x0103 is not used due to ARM asm immediate constraints)
// 0xXXXX : this is the start of a translated block
//
// The tag value is a pointer to the ram cache where the translated block starts
// The tag value must have both bytes set to non-zero, therefore the LSB is
// always 1 and starts from 0x0101 value.
// The remaining 15 bits are an index to 16-byte blocks in the RAM cache,
// therefore this cache is limited to ~ 2^15 * 16 bytes ~= 512KB
// Tag stored over RAM that holds translated code but is not a block entry
// point (16 bit and 32 bit fill patterns). It compares below INITIAL_TOP_TAG,
// so VALIDTAG() rejects it.
#define CODE_TAG_BLOCK16 0x0101
#define CODE_TAG_BLOCK32 0x01010101
// Tag that maps to offset 0 of ram_translation_cache.
#define INITIAL_TOP_TAG 0x0105
// Tag -> block start: remove the bias, drop the low bit and scale the
// resulting index by the 16 byte block granularity.
#define TAG2ADDR(tag) (&ram_translation_cache[((((tag) - INITIAL_TOP_TAG) >> 1) << 4)])
// Block start -> tag: inverse of TAG2ADDR for addresses that sit on a 16 byte
// boundary relative to the start of ram_translation_cache.
#define ADDR2TAG(addr) ((((((u8*)(addr)) - &ram_translation_cache[0]) >> 4) << 1) + INITIAL_TOP_TAG)
// Non-zero when the tag designates the start of a translated block.
// The argument is parenthesized so expression arguments bind as expected.
#define VALIDTAG(tag) ((tag) >= INITIAL_TOP_TAG)
// This function will return a pointer to a translated block of code. If it
// doesn't exist it will translate it, if it does it will pass it back.
@ -2493,41 +2512,38 @@ u32 ram_block_tag_top = 0x0101;
mem_type##_translation_region, smc_enable); \
} \
// 0x0101 is the smallest tag that can be used. 0xFFFF is marked
// in the middle of blocks and used for write guarding, it doesn't
// indicate a valid block either (it's okay to compile a new block
// that overlaps the earlier one, although this should be relatively
// uncommon)
#define fill_tag_arm(tag) \
location[0] = (tag); \
location[1] = CODE_TAG_BLOCK16
#define fill_tag_arm(mem_type) \
location[0] = mem_type##_block_tag_top; \
location[1] = 0xFFFF \
#define fill_tag_thumb(tag) \
*location = (tag)
#define fill_tag_thumb(mem_type) \
*location = mem_type##_block_tag_top \
#define fill_tag_dual(mem_type) \
#define fill_tag_dual(tag) \
if(thumb) \
fill_tag_thumb(mem_type); \
else \
fill_tag_arm(mem_type) \
fill_tag_thumb(tag); \
else { \
fill_tag_arm(tag); \
} \
#define block_lookup_translate(instruction_type, mem_type, smc_enable) \
#define block_lookup_translate_ram(instruction_type) \
block_tag = *location; \
if((block_tag < 0x0101) || (block_tag == 0xFFFF)) \
if(!VALIDTAG(block_tag)) \
{ \
__label__ redo; \
s32 translation_result; \
\
redo: \
\
translation_recursion_level++; \
block_address = mem_type##_translation_ptr + block_prologue_size; \
mem_type##_block_ptrs[mem_type##_block_tag_top] = block_address; \
fill_tag_##instruction_type(mem_type); \
mem_type##_block_tag_top++; \
/* Pad the start of the block to 16 bytes, see "memory tagging" above */ \
while ((((uintptr_t)ram_translation_ptr) % 16) != block_prologue_size) \
ram_translation_ptr++; \
\
block_lookup_translate_##instruction_type(mem_type, smc_enable); \
translation_recursion_level++; \
block_address = ram_translation_ptr + block_prologue_size; \
fill_tag_##instruction_type(ADDR2TAG(block_address)); \
\
block_lookup_translate_##instruction_type(ram, 1); \
translation_recursion_level--; \
\
/* If the translation failed then pass that failure on if we're in \
@ -2545,7 +2561,7 @@ u32 ram_block_tag_top = 0x0101;
} \
else \
{ \
block_address = mem_type##_block_ptrs[block_tag]; \
block_address = (u8*)TAG2ADDR(block_tag); \
} \
u32 translation_recursion_level = 0;
@ -2569,12 +2585,12 @@ u8 function_cc *block_lookup_address_##type(u32 pc) \
{ \
case 0x2: \
location = (u16 *)(ewram + (pc & 0x3FFFF) + 0x40000); \
block_lookup_translate(type, ram, 1); \
block_lookup_translate_ram(type); \
break; \
\
case 0x3: \
location = (u16 *)(iwram + (pc & 0x7FFF)); \
block_lookup_translate(type, ram, 1); \
block_lookup_translate_ram(type); \
break; \
\
case 0x0: \
@ -2879,7 +2895,7 @@ block_exit_type block_exits[MAX_EXITS];
if(address32(pc_address_block, (block_end_pc & 0x7FFF) + offset) == 0) \
{ \
address32(pc_address_block, (block_end_pc & 0x7FFF) + offset) = \
0xFFFFFFFF; \
CODE_TAG_BLOCK32; \
} \
}
@ -2887,7 +2903,8 @@ block_exit_type block_exits[MAX_EXITS];
int offset = (pc < 0x03000000) ? 0x40000 : -0x8000; \
if(address16(pc_address_block, (block_end_pc & 0x7FFF) + offset) == 0) \
{ \
address16(pc_address_block, (block_end_pc & 0x7FFF) + offset) = 0xFFFF; \
address16(pc_address_block, (block_end_pc & 0x7FFF) + offset) = \
CODE_TAG_BLOCK16; \
} \
}
@ -3405,7 +3422,6 @@ void flush_translation_cache_ram(void)
last_ram_translation_ptr = ram_translation_cache;
ram_translation_ptr = ram_translation_cache;
ram_block_tag_top = 0x0101;
// Proceed to clean the SMC area if needed
// (also try to memset as little as possible for performance)
@ -3446,8 +3462,6 @@ void init_caches(void)
iwram_code_min = 0;
iwram_code_max = 0x7FFF;
flush_translation_cache_ram();
/* Ensure 0 and FFFF get zeroed out */
memset(ram_block_ptrs, 0, sizeof(ram_block_ptrs));
}
#define cache_dump_prefix ""

View File

@ -4,15 +4,18 @@
/* Cache sizes and their config knobs */
#if defined(PSP)
#define ROM_TRANSLATION_CACHE_SIZE (1024 * 512 * 4)
#define ROM_TRANSLATION_CACHE_SIZE (1024 * 1024 * 2)
#define RAM_TRANSLATION_CACHE_SIZE (1024 * 384)
#define TRANSLATION_CACHE_LIMIT_THRESHOLD (1024)
#define TRANSLATION_CACHE_LIMIT_THRESHOLD (1024 * 2)
#else
#define ROM_TRANSLATION_CACHE_SIZE (1024 * 512 * 4 * 5)
#define RAM_TRANSLATION_CACHE_SIZE (1024 * 384 * 2)
#define TRANSLATION_CACHE_LIMIT_THRESHOLD (1024 * 32)
#define ROM_TRANSLATION_CACHE_SIZE (1024 * 1024 * 10)
#define RAM_TRANSLATION_CACHE_SIZE (1024 * 512)
#define TRANSLATION_CACHE_LIMIT_THRESHOLD (1024 * 8)
#endif
/* Please note that RAM_TRANSLATION_CACHE_SIZE is limited to 512KB.
See "memory tagging" in cpu_threaded.c for more info. */
/* This is MIPS specific for now */
#define STUB_ARENA_SIZE (16*1024)