Rework ram block ptrs to remove second indirection table.
This removes ram_block_ptrs and encodes the pointer directly in the block tag, saving ~256KB at no performance cost. The drawback is that it limits the RAM cache size to 512KB (we were using 768KB before). This should not be a problem, since most games use less than 32KB of cache anyway. The ARM routines have been fixed accordingly.
This commit is contained in:
		
							parent
							
								
									aded681de2
								
							
						
					
					
						commit
						3144d9e277
					
				
					 3 changed files with 66 additions and 48 deletions
				
			
		| 
						 | 
					@ -179,6 +179,7 @@ execute_pc_##mode:                                                           ;\
 | 
				
			||||||
  .long 3f                                /* F Bad region                  */;\
 | 
					  .long 3f                                /* F Bad region                  */;\
 | 
				
			||||||
                                                                             ;\
 | 
					                                                                             ;\
 | 
				
			||||||
3:                                                                           ;\
 | 
					3:                                                                           ;\
 | 
				
			||||||
 | 
					  /* r0 already contains the PC to jump to */                                ;\
 | 
				
			||||||
  call_c_function(block_lookup_address_##mode)                               ;\
 | 
					  call_c_function(block_lookup_address_##mode)                               ;\
 | 
				
			||||||
  restore_flags()                                                            ;\
 | 
					  restore_flags()                                                            ;\
 | 
				
			||||||
  bx r0                                                                      ;\
 | 
					  bx r0                                                                      ;\
 | 
				
			||||||
| 
						 | 
					@ -187,23 +188,23 @@ execute_pc_##mode:                                                           ;\
 | 
				
			||||||
  mov r2, r0, lsl #14                     /* addr &= 0x3ffff               */;\
 | 
					  mov r2, r0, lsl #14                     /* addr &= 0x3ffff               */;\
 | 
				
			||||||
  mov r2, r2, lsr #14                                                        ;\
 | 
					  mov r2, r2, lsr #14                                                        ;\
 | 
				
			||||||
  ldrh r2, [r1, r2]                       /* Load half word there          */;\
 | 
					  ldrh r2, [r1, r2]                       /* Load half word there          */;\
 | 
				
			||||||
  ldr r1, =(ram_block_ptrs)                                                  ;\
 | 
					  ldr r1, =(ram_translation_cache)                                           ;\
 | 
				
			||||||
  ldr r1, [r1, r2, lsl #2]                /* Pointer to the cache          */;\
 | 
					  subs r2, #0x0104                        /* Check valid tag + rebase      */;\
 | 
				
			||||||
  cmp r1, #0                              /* NULL means not translated     */;\
 | 
					  ble 3b                                  /* Data/non-entry code -> transl.*/;\
 | 
				
			||||||
  beq 3b                                  /* Need to translate             */;\
 | 
					  mov r2, r2, lsr #1                      /* Ignores LSB                   */;\
 | 
				
			||||||
  restore_flags()                                                            ;\
 | 
					  restore_flags()                                                            ;\
 | 
				
			||||||
  bx r1                                                                      ;\
 | 
					  add pc, r1, r2, lsl #4                  /* Offset is 16 byte aligned     */;\
 | 
				
			||||||
2:                                                                           ;\
 | 
					2:                                                                           ;\
 | 
				
			||||||
  ldr r1, =(iwram)                        /* Load base addr                */;\
 | 
					  ldr r1, =(iwram)                        /* Load base addr                */;\
 | 
				
			||||||
  mov r2, r0, lsl #17                     /* addr &= 0x7fff                */;\
 | 
					  mov r2, r0, lsl #17                     /* addr &= 0x7fff                */;\
 | 
				
			||||||
  mov r2, r2, lsr #17                                                        ;\
 | 
					  mov r2, r2, lsr #17                                                        ;\
 | 
				
			||||||
  ldrh r2, [r1, r2]                       /* Load half word there          */;\
 | 
					  ldrh r2, [r1, r2]                       /* Load half word there          */;\
 | 
				
			||||||
  ldr r1, =(ram_block_ptrs)                                                  ;\
 | 
					  ldr r1, =(ram_translation_cache)                                           ;\
 | 
				
			||||||
  ldr r1, [r1, r2, lsl #2]                /* Pointer to the cache          */;\
 | 
					  subs r2, #0x0104                        /* Check valid tag + rebase      */;\
 | 
				
			||||||
  cmp r1, #0                              /* NULL means not translated     */;\
 | 
					  ble 3b                                  /* Data/non-entry code -> transl.*/;\
 | 
				
			||||||
  beq 3b                                  /* Need to translate             */;\
 | 
					  mov r2, r2, lsr #1                      /* Ignores LSB                   */;\
 | 
				
			||||||
  restore_flags()                                                            ;\
 | 
					  restore_flags()                                                            ;\
 | 
				
			||||||
  bx r1                                                                      ;\
 | 
					  add pc, r1, r2, lsl #4                  /* Offset is 16 byte aligned     */;\
 | 
				
			||||||
.size arm_indirect_branch_##mode, .-arm_indirect_branch_##mode
 | 
					.size arm_indirect_branch_##mode, .-arm_indirect_branch_##mode
 | 
				
			||||||
 | 
					
 | 
				
			||||||
execute_pc_builder(arm, 0x3)
 | 
					execute_pc_builder(arm, 0x3)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2440,8 +2440,27 @@ void translate_icache_sync() {
 | 
				
			||||||
  block_data[block_data_position].flag_data = flag_status;                    \
 | 
					  block_data[block_data_position].flag_data = flag_status;                    \
 | 
				
			||||||
}                                                                             \
 | 
					}                                                                             \
 | 
				
			||||||
 | 
					
 | 
				
			||||||
u8 *ram_block_ptrs[1024 * 64];
 | 
					// I/EWRAM memory tagging
 | 
				
			||||||
u32 ram_block_tag_top = 0x0101;
 | 
					// Code emitted in the RAM cache has tags (16 bit values) in the mirror tag ram
 | 
				
			||||||
 | 
					// that indicate that the address contains code. The following values are used:
 | 
				
			||||||
 | 
					// 0x0000 : this is just data (never translated)
 | 
				
			||||||
 | 
					// 0x00XX : not used (since first byte is zero)
 | 
				
			||||||
 | 
					// 0x0101 : this is code that is not the start of a translated block
 | 
				
			||||||
 | 
					// 0x0105 : first possible tag (we do not use 0x0103 due to ARM asm immediate encoding limits)
 | 
				
			||||||
 | 
					// 0xXXXX : this is the start of a translated block
 | 
				
			||||||
 | 
					//
 | 
				
			||||||
 | 
					// The tag value is a pointer to the ram cache where the translated block starts
 | 
				
			||||||
 | 
					// The tag value must have both bytes set to non-zero, therefore the LSB is
 | 
				
			||||||
 | 
					// always 1 and starts from 0x0101 value.
 | 
				
			||||||
 | 
					// The 15-bit value is an index to 16 byte blocks in the RAM CACHE, therefore this
 | 
				
			||||||
 | 
					// cache is limited to ~ 2^15 * 16byte ~= 512KB
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define CODE_TAG_BLOCK16   0x0101
 | 
				
			||||||
 | 
					#define CODE_TAG_BLOCK32   0x01010101
 | 
				
			||||||
 | 
					#define INITIAL_TOP_TAG    0x0105
 | 
				
			||||||
 | 
					#define TAG2ADDR(tag)      (&ram_translation_cache[((((tag) - INITIAL_TOP_TAG) >> 1) << 4)])
 | 
				
			||||||
 | 
					#define ADDR2TAG(addr)     ((((((u8*)addr) - &ram_translation_cache[0]) >> 4) << 1) + INITIAL_TOP_TAG)
 | 
				
			||||||
 | 
					#define VALIDTAG(tag)      (tag >= INITIAL_TOP_TAG)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// This function will return a pointer to a translated block of code. If it
 | 
					// This function will return a pointer to a translated block of code. If it
 | 
				
			||||||
// doesn't exist it will translate it, if it does it will pass it back.
 | 
					// doesn't exist it will translate it, if it does it will pass it back.
 | 
				
			||||||
| 
						 | 
					@ -2493,41 +2512,38 @@ u32 ram_block_tag_top = 0x0101;
 | 
				
			||||||
     mem_type##_translation_region, smc_enable);                              \
 | 
					     mem_type##_translation_region, smc_enable);                              \
 | 
				
			||||||
  }                                                                           \
 | 
					  }                                                                           \
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// 0x0101 is the smallest tag that can be used. 0xFFFF is marked
 | 
					#define fill_tag_arm(tag)                                                     \
 | 
				
			||||||
// in the middle of blocks and used for write guarding, it doesn't
 | 
					  location[0] = (tag);                                                        \
 | 
				
			||||||
// indicate a valid block either (it's okay to compile a new block
 | 
					  location[1] = CODE_TAG_BLOCK16
 | 
				
			||||||
// that overlaps the earlier one, although this should be relatively
 | 
					 | 
				
			||||||
// uncommon)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define fill_tag_arm(mem_type)                                                \
 | 
					#define fill_tag_thumb(tag)                                                   \
 | 
				
			||||||
  location[0] = mem_type##_block_tag_top;                                     \
 | 
					  *location = (tag)
 | 
				
			||||||
  location[1] = 0xFFFF                                                        \
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define fill_tag_thumb(mem_type)                                              \
 | 
					#define fill_tag_dual(tag)                                                    \
 | 
				
			||||||
  *location = mem_type##_block_tag_top                                        \
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define fill_tag_dual(mem_type)                                               \
 | 
					 | 
				
			||||||
  if(thumb)                                                                   \
 | 
					  if(thumb)                                                                   \
 | 
				
			||||||
    fill_tag_thumb(mem_type);                                                 \
 | 
					    fill_tag_thumb(tag);                                                      \
 | 
				
			||||||
  else                                                                        \
 | 
					  else {                                                                      \
 | 
				
			||||||
    fill_tag_arm(mem_type)                                                    \
 | 
					    fill_tag_arm(tag);                                                        \
 | 
				
			||||||
 | 
					  }                                                                           \
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define block_lookup_translate(instruction_type, mem_type, smc_enable)        \
 | 
					#define block_lookup_translate_ram(instruction_type)                          \
 | 
				
			||||||
  block_tag = *location;                                                      \
 | 
					  block_tag = *location;                                                      \
 | 
				
			||||||
  if((block_tag < 0x0101) || (block_tag == 0xFFFF))                           \
 | 
					  if(!VALIDTAG(block_tag))                                                    \
 | 
				
			||||||
  {                                                                           \
 | 
					  {                                                                           \
 | 
				
			||||||
    __label__ redo;                                                           \
 | 
					    __label__ redo;                                                           \
 | 
				
			||||||
    s32 translation_result;                                                   \
 | 
					    s32 translation_result;                                                   \
 | 
				
			||||||
                                                                              \
 | 
					                                                                              \
 | 
				
			||||||
    redo:                                                                     \
 | 
					    redo:                                                                     \
 | 
				
			||||||
                                                                              \
 | 
					                                                                              \
 | 
				
			||||||
    translation_recursion_level++;                                            \
 | 
					    /* Pad the start of the block to 16 bytes, see "memory tagging" above */  \
 | 
				
			||||||
    block_address = mem_type##_translation_ptr + block_prologue_size;         \
 | 
					    while ((((uintptr_t)ram_translation_ptr) % 16) != block_prologue_size)    \
 | 
				
			||||||
    mem_type##_block_ptrs[mem_type##_block_tag_top] = block_address;          \
 | 
					      ram_translation_ptr++;                                                  \
 | 
				
			||||||
    fill_tag_##instruction_type(mem_type);                                    \
 | 
					 | 
				
			||||||
    mem_type##_block_tag_top++;                                               \
 | 
					 | 
				
			||||||
                                                                              \
 | 
					                                                                              \
 | 
				
			||||||
    block_lookup_translate_##instruction_type(mem_type, smc_enable);          \
 | 
					    translation_recursion_level++;                                            \
 | 
				
			||||||
 | 
					    block_address = ram_translation_ptr + block_prologue_size;                \
 | 
				
			||||||
 | 
					    fill_tag_##instruction_type(ADDR2TAG(block_address));                     \
 | 
				
			||||||
 | 
					                                                                              \
 | 
				
			||||||
 | 
					    block_lookup_translate_##instruction_type(ram, 1);                        \
 | 
				
			||||||
    translation_recursion_level--;                                            \
 | 
					    translation_recursion_level--;                                            \
 | 
				
			||||||
                                                                              \
 | 
					                                                                              \
 | 
				
			||||||
    /* If the translation failed then pass that failure on if we're in        \
 | 
					    /* If the translation failed then pass that failure on if we're in        \
 | 
				
			||||||
| 
						 | 
					@ -2545,7 +2561,7 @@ u32 ram_block_tag_top = 0x0101;
 | 
				
			||||||
  }                                                                           \
 | 
					  }                                                                           \
 | 
				
			||||||
  else                                                                        \
 | 
					  else                                                                        \
 | 
				
			||||||
  {                                                                           \
 | 
					  {                                                                           \
 | 
				
			||||||
    block_address = mem_type##_block_ptrs[block_tag];                         \
 | 
					    block_address = (u8*)TAG2ADDR(block_tag);                                 \
 | 
				
			||||||
  }                                                                           \
 | 
					  }                                                                           \
 | 
				
			||||||
 | 
					
 | 
				
			||||||
u32 translation_recursion_level = 0;
 | 
					u32 translation_recursion_level = 0;
 | 
				
			||||||
| 
						 | 
					@ -2569,12 +2585,12 @@ u8 function_cc *block_lookup_address_##type(u32 pc)                           \
 | 
				
			||||||
  {                                                                           \
 | 
					  {                                                                           \
 | 
				
			||||||
    case 0x2:                                                                 \
 | 
					    case 0x2:                                                                 \
 | 
				
			||||||
      location = (u16 *)(ewram + (pc & 0x3FFFF) + 0x40000);                   \
 | 
					      location = (u16 *)(ewram + (pc & 0x3FFFF) + 0x40000);                   \
 | 
				
			||||||
      block_lookup_translate(type, ram, 1);                                   \
 | 
					      block_lookup_translate_ram(type);                                       \
 | 
				
			||||||
      break;                                                                  \
 | 
					      break;                                                                  \
 | 
				
			||||||
                                                                              \
 | 
					                                                                              \
 | 
				
			||||||
    case 0x3:                                                                 \
 | 
					    case 0x3:                                                                 \
 | 
				
			||||||
      location = (u16 *)(iwram + (pc & 0x7FFF));                              \
 | 
					      location = (u16 *)(iwram + (pc & 0x7FFF));                              \
 | 
				
			||||||
      block_lookup_translate(type, ram, 1);                                   \
 | 
					      block_lookup_translate_ram(type);                                       \
 | 
				
			||||||
      break;                                                                  \
 | 
					      break;                                                                  \
 | 
				
			||||||
                                                                              \
 | 
					                                                                              \
 | 
				
			||||||
    case 0x0:                                                                 \
 | 
					    case 0x0:                                                                 \
 | 
				
			||||||
| 
						 | 
					@ -2879,7 +2895,7 @@ block_exit_type block_exits[MAX_EXITS];
 | 
				
			||||||
  if(address32(pc_address_block, (block_end_pc & 0x7FFF) + offset) == 0)      \
 | 
					  if(address32(pc_address_block, (block_end_pc & 0x7FFF) + offset) == 0)      \
 | 
				
			||||||
  {                                                                           \
 | 
					  {                                                                           \
 | 
				
			||||||
    address32(pc_address_block, (block_end_pc & 0x7FFF) + offset) =           \
 | 
					    address32(pc_address_block, (block_end_pc & 0x7FFF) + offset) =           \
 | 
				
			||||||
     0xFFFFFFFF;                                                              \
 | 
					      CODE_TAG_BLOCK32;                                                       \
 | 
				
			||||||
  }                                                                           \
 | 
					  }                                                                           \
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2887,7 +2903,8 @@ block_exit_type block_exits[MAX_EXITS];
 | 
				
			||||||
  int offset = (pc < 0x03000000) ? 0x40000 : -0x8000;                         \
 | 
					  int offset = (pc < 0x03000000) ? 0x40000 : -0x8000;                         \
 | 
				
			||||||
  if(address16(pc_address_block, (block_end_pc & 0x7FFF) + offset) == 0)      \
 | 
					  if(address16(pc_address_block, (block_end_pc & 0x7FFF) + offset) == 0)      \
 | 
				
			||||||
  {                                                                           \
 | 
					  {                                                                           \
 | 
				
			||||||
    address16(pc_address_block, (block_end_pc & 0x7FFF) + offset) = 0xFFFF;   \
 | 
					    address16(pc_address_block, (block_end_pc & 0x7FFF) + offset) =           \
 | 
				
			||||||
 | 
					      CODE_TAG_BLOCK16;                                                       \
 | 
				
			||||||
  }                                                                           \
 | 
					  }                                                                           \
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -3405,7 +3422,6 @@ void flush_translation_cache_ram(void)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  last_ram_translation_ptr = ram_translation_cache;
 | 
					  last_ram_translation_ptr = ram_translation_cache;
 | 
				
			||||||
  ram_translation_ptr = ram_translation_cache;
 | 
					  ram_translation_ptr = ram_translation_cache;
 | 
				
			||||||
  ram_block_tag_top = 0x0101;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Proceed to clean the SMC area if needed
 | 
					  // Proceed to clean the SMC area if needed
 | 
				
			||||||
  // (also try to memset as little as possible for performance)
 | 
					  // (also try to memset as little as possible for performance)
 | 
				
			||||||
| 
						 | 
					@ -3446,8 +3462,6 @@ void init_caches(void)
 | 
				
			||||||
  iwram_code_min = 0;
 | 
					  iwram_code_min = 0;
 | 
				
			||||||
  iwram_code_max = 0x7FFF;
 | 
					  iwram_code_max = 0x7FFF;
 | 
				
			||||||
  flush_translation_cache_ram();
 | 
					  flush_translation_cache_ram();
 | 
				
			||||||
  /* Ensure 0 and FFFF get zeroed out */
 | 
					 | 
				
			||||||
  memset(ram_block_ptrs, 0, sizeof(ram_block_ptrs));
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define cache_dump_prefix ""
 | 
					#define cache_dump_prefix ""
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -4,15 +4,18 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/* Cache sizes and their config knobs */
 | 
					/* Cache sizes and their config knobs */
 | 
				
			||||||
#if defined(PSP)
 | 
					#if defined(PSP)
 | 
				
			||||||
  #define ROM_TRANSLATION_CACHE_SIZE (1024 * 512 * 4)
 | 
					  #define ROM_TRANSLATION_CACHE_SIZE (1024 * 1024 * 2)
 | 
				
			||||||
  #define RAM_TRANSLATION_CACHE_SIZE (1024 * 384)
 | 
					  #define RAM_TRANSLATION_CACHE_SIZE (1024 * 384)
 | 
				
			||||||
  #define TRANSLATION_CACHE_LIMIT_THRESHOLD (1024)
 | 
					  #define TRANSLATION_CACHE_LIMIT_THRESHOLD (1024 * 2)
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
  #define ROM_TRANSLATION_CACHE_SIZE (1024 * 512 * 4 * 5)
 | 
					  #define ROM_TRANSLATION_CACHE_SIZE (1024 * 1024 * 10)
 | 
				
			||||||
  #define RAM_TRANSLATION_CACHE_SIZE (1024 * 384 * 2)
 | 
					  #define RAM_TRANSLATION_CACHE_SIZE (1024 * 512)
 | 
				
			||||||
  #define TRANSLATION_CACHE_LIMIT_THRESHOLD (1024 * 32)
 | 
					  #define TRANSLATION_CACHE_LIMIT_THRESHOLD (1024 * 8)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Please note that RAM_TRANSLATION_CACHE_SIZE is limited to 512KB.
 | 
				
			||||||
 | 
					   See the "memory tagging" notes in cpu_threaded.c for more info. */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/* This is MIPS specific for now */
 | 
					/* This is MIPS specific for now */
 | 
				
			||||||
#define STUB_ARENA_SIZE  (16*1024)
 | 
					#define STUB_ARENA_SIZE  (16*1024)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		
		Reference in a new issue