Rework ram block ptrs to remove second indirection table.
This removes ram_block_ptrs and encodes the pointer directly in the block tag. It saves ~256KB at no performance cost. The drawback is that it limits the RAM cache size to 512KB (we were using 768KB before), which should not be a problem since most games use less than 32KB of cache anyway. The ARM routines have been updated accordingly.
This commit is contained in:
parent
aded681de2
commit
3144d9e277
|
@ -179,6 +179,7 @@ execute_pc_##mode: ;\
|
||||||
.long 3f /* F Bad region */;\
|
.long 3f /* F Bad region */;\
|
||||||
;\
|
;\
|
||||||
3: ;\
|
3: ;\
|
||||||
|
/* r0 already contains the PC to jump to */ ;\
|
||||||
call_c_function(block_lookup_address_##mode) ;\
|
call_c_function(block_lookup_address_##mode) ;\
|
||||||
restore_flags() ;\
|
restore_flags() ;\
|
||||||
bx r0 ;\
|
bx r0 ;\
|
||||||
|
@ -187,23 +188,23 @@ execute_pc_##mode: ;\
|
||||||
mov r2, r0, lsl #14 /* addr &= 0x3ffff */;\
|
mov r2, r0, lsl #14 /* addr &= 0x3ffff */;\
|
||||||
mov r2, r2, lsr #14 ;\
|
mov r2, r2, lsr #14 ;\
|
||||||
ldrh r2, [r1, r2] /* Load half word there */;\
|
ldrh r2, [r1, r2] /* Load half word there */;\
|
||||||
ldr r1, =(ram_block_ptrs) ;\
|
ldr r1, =(ram_translation_cache) ;\
|
||||||
ldr r1, [r1, r2, lsl #2] /* Pointer to the cache */;\
|
subs r2, #0x0104 /* Check valid tag + rebase */;\
|
||||||
cmp r1, #0 /* NULL means not translated */;\
|
ble 3b /* Data/non-entry code -> transl.*/;\
|
||||||
beq 3b /* Need to translate */;\
|
mov r2, r2, lsr #1 /* Ignores LSB */;\
|
||||||
restore_flags() ;\
|
restore_flags() ;\
|
||||||
bx r1 ;\
|
add pc, r1, r2, lsl #4 /* Offset is 16 byte aligned */;\
|
||||||
2: ;\
|
2: ;\
|
||||||
ldr r1, =(iwram) /* Load base addr */;\
|
ldr r1, =(iwram) /* Load base addr */;\
|
||||||
mov r2, r0, lsl #17 /* addr &= 0x7fff */;\
|
mov r2, r0, lsl #17 /* addr &= 0x7fff */;\
|
||||||
mov r2, r2, lsr #17 ;\
|
mov r2, r2, lsr #17 ;\
|
||||||
ldrh r2, [r1, r2] /* Load half word there */;\
|
ldrh r2, [r1, r2] /* Load half word there */;\
|
||||||
ldr r1, =(ram_block_ptrs) ;\
|
ldr r1, =(ram_translation_cache) ;\
|
||||||
ldr r1, [r1, r2, lsl #2] /* Pointer to the cache */;\
|
subs r2, #0x0104 /* Check valid tag + rebase */;\
|
||||||
cmp r1, #0 /* NULL means not translated */;\
|
ble 3b /* Data/non-entry code -> transl.*/;\
|
||||||
beq 3b /* Need to translate */;\
|
mov r2, r2, lsr #1 /* Ignores LSB */;\
|
||||||
restore_flags() ;\
|
restore_flags() ;\
|
||||||
bx r1 ;\
|
add pc, r1, r2, lsl #4 /* Offset is 16 byte aligned */;\
|
||||||
.size arm_indirect_branch_##mode, .-arm_indirect_branch_##mode
|
.size arm_indirect_branch_##mode, .-arm_indirect_branch_##mode
|
||||||
|
|
||||||
execute_pc_builder(arm, 0x3)
|
execute_pc_builder(arm, 0x3)
|
||||||
|
|
|
@ -2440,8 +2440,27 @@ void translate_icache_sync() {
|
||||||
block_data[block_data_position].flag_data = flag_status; \
|
block_data[block_data_position].flag_data = flag_status; \
|
||||||
} \
|
} \
|
||||||
|
|
||||||
u8 *ram_block_ptrs[1024 * 64];
|
// I/EWRAM memory tagging
|
||||||
u32 ram_block_tag_top = 0x0101;
|
// Code emitted in the RAM cache has tags (16 bit values) in the mirror tag ram
|
||||||
|
// that indicate that the address contains code. The following values are used:
|
||||||
|
// 0x0000 : this is just data (never translated)
|
||||||
|
// 0x00XX : not used (since first byte is zero)
|
||||||
|
// 0x0101 : this is code that is not the start of a translated block
|
||||||
|
// 0x0105 : first possible tag (we do not use 0x0103 due to ARM asm immediates)
|
||||||
|
// 0xXXXX : this is the start of a translated block
|
||||||
|
//
|
||||||
|
// The tag value is a pointer to the ram cache where the translated block starts
|
||||||
|
// The tag value must have both bytes set to non-zero, therefore the LSB is
|
||||||
|
// always 1 and starts from 0x0101 value.
|
||||||
|
// The remaining 15 bits are an index to 16-byte blocks in the RAM cache, therefore this
|
||||||
|
// cache is limited to ~ 2^15 * 16 bytes ~= 512KB
|
||||||
|
|
||||||
|
// 16-bit tag for code that is not the start of a translated block.
#define CODE_TAG_BLOCK16 0x0101
|
||||||
|
// The 0x0101 "code, not a block start" tag repeated in both halfwords, for
// tagging 32 bits at once.
#define CODE_TAG_BLOCK32 0x01010101
|
||||||
|
// Lowest tag value that marks the start of a translated block.
#define INITIAL_TOP_TAG 0x0105
|
||||||
|
// Maps a block tag to the address inside ram_translation_cache where the
// translated block starts: rebase the tag on INITIAL_TOP_TAG, drop its LSB
// and scale the resulting index by 16 bytes.
#define TAG2ADDR(tag) (ram_translation_cache + ((((tag) - INITIAL_TOP_TAG) >> 1) << 4))
|
||||||
|
// Maps an address inside ram_translation_cache to its block tag: the byte
// offset from the cache base becomes a 16-byte block index, which is shifted
// left by one and rebased on INITIAL_TOP_TAG.
// The argument is parenthesized so that any expression passed in is cast to
// u8* as a whole, not just its first operand.
#define ADDR2TAG(addr) ((((((u8*)(addr)) - &ram_translation_cache[0]) >> 4) << 1) + INITIAL_TOP_TAG)
|
||||||
|
// Evaluates to non-zero when tag is at least INITIAL_TOP_TAG, i.e. when it
// marks the start of a translated block rather than data or mid-block code.
// The argument is parenthesized so that lower-precedence operators inside it
// (e.g. `x | 1`, `a ? b : c`) are evaluated before the comparison.
#define VALIDTAG(tag) ((tag) >= INITIAL_TOP_TAG)
|
||||||
|
|
||||||
// This function will return a pointer to a translated block of code. If it
|
// This function will return a pointer to a translated block of code. If it
|
||||||
// doesn't exist it will translate it, if it does it will pass it back.
|
// doesn't exist it will translate it, if it does it will pass it back.
|
||||||
|
@ -2493,41 +2512,38 @@ u32 ram_block_tag_top = 0x0101;
|
||||||
mem_type##_translation_region, smc_enable); \
|
mem_type##_translation_region, smc_enable); \
|
||||||
} \
|
} \
|
||||||
|
|
||||||
// 0x0101 is the smallest tag that can be used. 0xFFFF is marked
|
#define fill_tag_arm(tag) \
|
||||||
// in the middle of blocks and used for write guarding, it doesn't
|
location[0] = (tag); \
|
||||||
// indicate a valid block either (it's okay to compile a new block
|
location[1] = CODE_TAG_BLOCK16
|
||||||
// that overlaps the earlier one, although this should be relatively
|
|
||||||
// uncommon)
|
|
||||||
|
|
||||||
#define fill_tag_arm(mem_type) \
|
#define fill_tag_thumb(tag) \
|
||||||
location[0] = mem_type##_block_tag_top; \
|
*location = (tag)
|
||||||
location[1] = 0xFFFF \
|
|
||||||
|
|
||||||
#define fill_tag_thumb(mem_type) \
|
#define fill_tag_dual(tag) \
|
||||||
*location = mem_type##_block_tag_top \
|
|
||||||
|
|
||||||
#define fill_tag_dual(mem_type) \
|
|
||||||
if(thumb) \
|
if(thumb) \
|
||||||
fill_tag_thumb(mem_type); \
|
fill_tag_thumb(tag); \
|
||||||
else \
|
else { \
|
||||||
fill_tag_arm(mem_type) \
|
fill_tag_arm(tag); \
|
||||||
|
} \
|
||||||
|
|
||||||
#define block_lookup_translate(instruction_type, mem_type, smc_enable) \
|
#define block_lookup_translate_ram(instruction_type) \
|
||||||
block_tag = *location; \
|
block_tag = *location; \
|
||||||
if((block_tag < 0x0101) || (block_tag == 0xFFFF)) \
|
if(!VALIDTAG(block_tag)) \
|
||||||
{ \
|
{ \
|
||||||
__label__ redo; \
|
__label__ redo; \
|
||||||
s32 translation_result; \
|
s32 translation_result; \
|
||||||
\
|
\
|
||||||
redo: \
|
redo: \
|
||||||
\
|
\
|
||||||
translation_recursion_level++; \
|
/* Pad the start of the block to 16 bytes, see "memory tagging" above */ \
|
||||||
block_address = mem_type##_translation_ptr + block_prologue_size; \
|
while ((((uintptr_t)ram_translation_ptr) % 16) != block_prologue_size) \
|
||||||
mem_type##_block_ptrs[mem_type##_block_tag_top] = block_address; \
|
ram_translation_ptr++; \
|
||||||
fill_tag_##instruction_type(mem_type); \
|
|
||||||
mem_type##_block_tag_top++; \
|
|
||||||
\
|
\
|
||||||
block_lookup_translate_##instruction_type(mem_type, smc_enable); \
|
translation_recursion_level++; \
|
||||||
|
block_address = ram_translation_ptr + block_prologue_size; \
|
||||||
|
fill_tag_##instruction_type(ADDR2TAG(block_address)); \
|
||||||
|
\
|
||||||
|
block_lookup_translate_##instruction_type(ram, 1); \
|
||||||
translation_recursion_level--; \
|
translation_recursion_level--; \
|
||||||
\
|
\
|
||||||
/* If the translation failed then pass that failure on if we're in \
|
/* If the translation failed then pass that failure on if we're in \
|
||||||
|
@ -2545,7 +2561,7 @@ u32 ram_block_tag_top = 0x0101;
|
||||||
} \
|
} \
|
||||||
else \
|
else \
|
||||||
{ \
|
{ \
|
||||||
block_address = mem_type##_block_ptrs[block_tag]; \
|
block_address = (u8*)TAG2ADDR(block_tag); \
|
||||||
} \
|
} \
|
||||||
|
|
||||||
u32 translation_recursion_level = 0;
|
u32 translation_recursion_level = 0;
|
||||||
|
@ -2569,12 +2585,12 @@ u8 function_cc *block_lookup_address_##type(u32 pc) \
|
||||||
{ \
|
{ \
|
||||||
case 0x2: \
|
case 0x2: \
|
||||||
location = (u16 *)(ewram + (pc & 0x3FFFF) + 0x40000); \
|
location = (u16 *)(ewram + (pc & 0x3FFFF) + 0x40000); \
|
||||||
block_lookup_translate(type, ram, 1); \
|
block_lookup_translate_ram(type); \
|
||||||
break; \
|
break; \
|
||||||
\
|
\
|
||||||
case 0x3: \
|
case 0x3: \
|
||||||
location = (u16 *)(iwram + (pc & 0x7FFF)); \
|
location = (u16 *)(iwram + (pc & 0x7FFF)); \
|
||||||
block_lookup_translate(type, ram, 1); \
|
block_lookup_translate_ram(type); \
|
||||||
break; \
|
break; \
|
||||||
\
|
\
|
||||||
case 0x0: \
|
case 0x0: \
|
||||||
|
@ -2879,7 +2895,7 @@ block_exit_type block_exits[MAX_EXITS];
|
||||||
if(address32(pc_address_block, (block_end_pc & 0x7FFF) + offset) == 0) \
|
if(address32(pc_address_block, (block_end_pc & 0x7FFF) + offset) == 0) \
|
||||||
{ \
|
{ \
|
||||||
address32(pc_address_block, (block_end_pc & 0x7FFF) + offset) = \
|
address32(pc_address_block, (block_end_pc & 0x7FFF) + offset) = \
|
||||||
0xFFFFFFFF; \
|
CODE_TAG_BLOCK32; \
|
||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2887,7 +2903,8 @@ block_exit_type block_exits[MAX_EXITS];
|
||||||
int offset = (pc < 0x03000000) ? 0x40000 : -0x8000; \
|
int offset = (pc < 0x03000000) ? 0x40000 : -0x8000; \
|
||||||
if(address16(pc_address_block, (block_end_pc & 0x7FFF) + offset) == 0) \
|
if(address16(pc_address_block, (block_end_pc & 0x7FFF) + offset) == 0) \
|
||||||
{ \
|
{ \
|
||||||
address16(pc_address_block, (block_end_pc & 0x7FFF) + offset) = 0xFFFF; \
|
address16(pc_address_block, (block_end_pc & 0x7FFF) + offset) = \
|
||||||
|
CODE_TAG_BLOCK16; \
|
||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3405,7 +3422,6 @@ void flush_translation_cache_ram(void)
|
||||||
|
|
||||||
last_ram_translation_ptr = ram_translation_cache;
|
last_ram_translation_ptr = ram_translation_cache;
|
||||||
ram_translation_ptr = ram_translation_cache;
|
ram_translation_ptr = ram_translation_cache;
|
||||||
ram_block_tag_top = 0x0101;
|
|
||||||
|
|
||||||
// Proceed to clean the SMC area if needed
|
// Proceed to clean the SMC area if needed
|
||||||
// (also try to memset as little as possible for performance)
|
// (also try to memset as little as possible for performance)
|
||||||
|
@ -3446,8 +3462,6 @@ void init_caches(void)
|
||||||
iwram_code_min = 0;
|
iwram_code_min = 0;
|
||||||
iwram_code_max = 0x7FFF;
|
iwram_code_max = 0x7FFF;
|
||||||
flush_translation_cache_ram();
|
flush_translation_cache_ram();
|
||||||
/* Ensure 0 and FFFF get zeroed out */
|
|
||||||
memset(ram_block_ptrs, 0, sizeof(ram_block_ptrs));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define cache_dump_prefix ""
|
#define cache_dump_prefix ""
|
||||||
|
|
|
@ -4,15 +4,18 @@
|
||||||
|
|
||||||
/* Cache sizes and their config knobs */
|
/* Cache sizes and their config knobs */
|
||||||
#if defined(PSP)
|
#if defined(PSP)
|
||||||
#define ROM_TRANSLATION_CACHE_SIZE (1024 * 512 * 4)
|
#define ROM_TRANSLATION_CACHE_SIZE (1024 * 1024 * 2)
|
||||||
#define RAM_TRANSLATION_CACHE_SIZE (1024 * 384)
|
#define RAM_TRANSLATION_CACHE_SIZE (1024 * 384)
|
||||||
#define TRANSLATION_CACHE_LIMIT_THRESHOLD (1024)
|
#define TRANSLATION_CACHE_LIMIT_THRESHOLD (1024 * 2)
|
||||||
#else
|
#else
|
||||||
#define ROM_TRANSLATION_CACHE_SIZE (1024 * 512 * 4 * 5)
|
#define ROM_TRANSLATION_CACHE_SIZE (1024 * 1024 * 10)
|
||||||
#define RAM_TRANSLATION_CACHE_SIZE (1024 * 384 * 2)
|
#define RAM_TRANSLATION_CACHE_SIZE (1024 * 512)
|
||||||
#define TRANSLATION_CACHE_LIMIT_THRESHOLD (1024 * 32)
|
#define TRANSLATION_CACHE_LIMIT_THRESHOLD (1024 * 8)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* Please note that RAM_TRANSLATION_CACHE_SIZE is limited to 512KB
|
||||||
|
Check cpu_threaded.c for "memory tagging" for more info. */
|
||||||
|
|
||||||
/* This is MIPS specific for now */
|
/* This is MIPS specific for now */
|
||||||
#define STUB_ARENA_SIZE (16*1024)
|
#define STUB_ARENA_SIZE (16*1024)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue