From cde8a90ff80bd3f0f6c4ae83d4ae4600cd7cf200 Mon Sep 17 00:00:00 2001 From: David Guillen Fandos Date: Tue, 1 Aug 2023 01:20:29 +0200 Subject: [PATCH] Move pixel blitting from single mem access to byte-level --- common.h | 2 -- video.cc | 27 ++++++++++++--------------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/common.h b/common.h index 63beb36..d83a44a 100644 --- a/common.h +++ b/common.h @@ -138,11 +138,9 @@ typedef u32 fixed8_24; #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #define eswap16(value) __builtin_bswap16(value) #define eswap32(value) __builtin_bswap32(value) - #define eswap64(value) __builtin_bswap64(value) #else #define eswap16(value) (value) #define eswap32(value) (value) - #define eswap64(value) (value) #endif #define readaddress8(base, offset) eswap8( address8( base, offset)) diff --git a/video.cc b/video.cc index 5181f5e..467e745 100644 --- a/video.cc +++ b/video.cc @@ -154,11 +154,10 @@ static inline void render_tile_Nbpp(u32 bg_comb, u32 px_comb, if (is8bpp) { // Each byte is a color, mapped to a palete. 8 bytes can be read as 64bit - u64 tilepix = eswap64(*(u64*)tile_ptr); for (u32 i = start; i < end; i++, dest_ptr++) { // Honor hflip by selecting bytes in the correct order u32 sel = hflip ? (7-i) : i; - u8 pval = (tilepix >> (sel*8)) & 0xFF; + u8 pval = tile_ptr[sel]; // Alhpa mode stacks previous value (unless rendering the first layer) if (pval) { if (rdtype == FULLCOLOR) @@ -180,18 +179,17 @@ static inline void render_tile_Nbpp(u32 bg_comb, u32 px_comb, // In 4bpp mode, the tile[15..12] bits contain the sub-palette number. u16 tilepal = (tile >> 12) << 4; // Only 32 bits (8 pixels * 4 bits) - u32 tilepix = eswap32(*(u32*)tile_ptr); for (u32 i = start; i < end; i++, dest_ptr++) { - u32 sel = hflip ? (7-i) : i; - u8 pval = (tilepix >> (sel*4)) & 0xF; + u32 selb = hflip ? (3-i/2) : i/2; + u32 seln = hflip ? ((i & 1) ^ 1) : (i & 1); + u8 pval = (tile_ptr[selb] >> (seln * 4)) & 0xF; if (pval) { - u8 colidx = pval | tilepal; if (rdtype == FULLCOLOR) - *dest_ptr = palette_ram_converted[colidx]; + *dest_ptr = palette_ram_converted[tilepal | pval]; else if (rdtype == INDXCOLOR) - *dest_ptr = colidx | px_comb; + *dest_ptr = px_comb | tilepal | pval; else if (rdtype == STCKCOLOR) - *dest_ptr = colidx | px_comb | ((isbase ? bg_comb : *dest_ptr) << 16); // Stack pixels + *dest_ptr = px_comb | tilepal | pval | ((isbase ? bg_comb : *dest_ptr) << 16); // Stack pixels } else if (isbase) { if (rdtype == FULLCOLOR) @@ -677,12 +675,11 @@ static inline void render_obj_tile_Nbpp(u32 px_comb, const u8* tile_ptr = &vram[0x10000 + (tile_offset & 0x7FFF)]; if (is8bpp) { - // Each byte is a color, mapped to a palete. 8 bytes can be read as 64bit - u64 tilepix = eswap64(*(u64*)tile_ptr); + // Each byte is a color, mapped to a palete. for (u32 i = start; i < end; i++, dest_ptr++) { // Honor hflip by selecting bytes in the correct order u32 sel = hflip ? (7-i) : i; - u8 pval = (tilepix >> (sel*8)) & 0xFF; + u8 pval = tile_ptr[sel]; // Alhpa mode stacks previous value if (pval) { if (rdtype == FULLCOLOR) @@ -703,10 +700,10 @@ static inline void render_obj_tile_Nbpp(u32 px_comb, } } else { // Only 32 bits (8 pixels * 4 bits) - u32 tilepix = eswap32(*(u32*)tile_ptr); for (u32 i = start; i < end; i++, dest_ptr++) { - u32 sel = hflip ? (7-i) : i; - u8 pval = (tilepix >> (sel*4)) & 0xF; + u32 selb = hflip ? (3-i/2) : i/2; + u32 seln = hflip ? ((i & 1) ^ 1) : (i & 1); + u8 pval = (tile_ptr[selb] >> (seln * 4)) & 0xF; if (pval) { u8 colidx = pval | palette; if (rdtype == FULLCOLOR)