From b4f394459dc3165d6332c1db0a73c43bfe1cd016 Mon Sep 17 00:00:00 2001 From: David Guillen Fandos Date: Tue, 1 Aug 2023 00:29:49 +0200 Subject: [PATCH] Rewrite pixel blit functions for speed Expand some conditions to make it clear what values are required in each path. --- video.cc | 187 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 100 insertions(+), 87 deletions(-) diff --git a/video.cc b/video.cc index 0bde3a9..5181f5e 100644 --- a/video.cc +++ b/video.cc @@ -135,7 +135,7 @@ void video_reload_counters() // it in either 8 or 4 bpp mode. Honors vertical and horizontal flip. template -static inline void render_tile_Nbpp(u32 layer, +static inline void render_tile_Nbpp(u32 bg_comb, u32 px_comb, dsttype *dest_ptr, u32 start, u32 end, u16 tile, const u8 *tile_base, int vertical_pixel_flip ) { @@ -148,11 +148,6 @@ static inline void render_tile_Nbpp(u32 layer, // tile_base already points to the right tile-line vertical offset const u8 *tile_ptr = &tile_base[(tile & 0x3FF) * (is8bpp ? 64 : 32)]; - // Calculate combine masks. These store 2 bits of info: 1st and 2nd target. - // If set, the current pixel belongs to a layer that is 1st or 2nd target. - u32 bg_comb = color_flags(5); - u32 px_comb = color_flags(layer); - // On vertical flip, apply the mirror offset if (tile & 0x800) tile_ptr += vertical_pixel_flip; @@ -164,17 +159,21 @@ static inline void render_tile_Nbpp(u32 layer, // Honor hflip by selecting bytes in the correct order u32 sel = hflip ? (7-i) : i; u8 pval = (tilepix >> (sel*8)) & 0xFF; - // Combine mask is different if we are rendering the backdrop color - u16 combflg = pval ? px_comb : bg_comb; // Alhpa mode stacks previous value (unless rendering the first layer) - if (isbase || pval) { + if (pval) { if (rdtype == FULLCOLOR) *dest_ptr = palette_ram_converted[pval]; else if (rdtype == INDXCOLOR) - *dest_ptr = pval | combflg; // Add combine flags + *dest_ptr = pval | px_comb; // Add combine flags else if (rdtype == STCKCOLOR) // Stack pixels on top of the pixel value and combine flags - *dest_ptr = pval | combflg | ((isbase ? bg_comb : *dest_ptr) << 16); + *dest_ptr = pval | px_comb | ((isbase ? bg_comb : *dest_ptr) << 16); + } + else if (isbase) { + if (rdtype == FULLCOLOR) + *dest_ptr = palette_ram_converted[0]; + else + *dest_ptr = 0 | bg_comb; // Add combine flags } } } else { @@ -185,15 +184,20 @@ static inline void render_tile_Nbpp(u32 layer, for (u32 i = start; i < end; i++, dest_ptr++) { u32 sel = hflip ? (7-i) : i; u8 pval = (tilepix >> (sel*4)) & 0xF; - u16 combflg = pval ? px_comb : bg_comb; - if (isbase || pval) { - u8 colidx = pval ? (pval | tilepal) : 0; + if (pval) { + u8 colidx = pval | tilepal; if (rdtype == FULLCOLOR) *dest_ptr = palette_ram_converted[colidx]; else if (rdtype == INDXCOLOR) - *dest_ptr = colidx | combflg; + *dest_ptr = colidx | px_comb; else if (rdtype == STCKCOLOR) - *dest_ptr = colidx | combflg | ((isbase ? bg_comb : *dest_ptr) << 16); // Stack pixels + *dest_ptr = colidx | px_comb | ((isbase ? bg_comb : *dest_ptr) << 16); // Stack pixels + } + else if (isbase) { + if (rdtype == FULLCOLOR) + *dest_ptr = palette_ram_converted[0]; + else + *dest_ptr = 0 | bg_comb; } } } @@ -212,6 +216,11 @@ static void render_scanline_text(u32 layer, stype *dest_ptr = ((stype*)scanline) + start; u32 i; + // Calculate combine masks. These store 2 bits of info: 1st and 2nd target. + // If set, the current pixel belongs to a layer that is 1st or 2nd target. + u32 bg_comb = color_flags(5); + u32 px_comb = color_flags(layer); + // Background map data is in vram, at an offset specified in 2K blocks. // (each map data block is 32x32 tiles, at 16bpp, so 2KB) u32 base_block = (bg_control >> 8) & 0x1F; @@ -279,9 +288,11 @@ static void render_scanline_text(u32 layer, u16 tile = eswap16(*map_ptr++); if (tile & 0x400) // Tile horizontal flip - render_tile_Nbpp(layer, dest_ptr, tile_hoff, stop, tile, tile_base, vflip_off); + render_tile_Nbpp( + bg_comb, px_comb, dest_ptr, tile_hoff, stop, tile, tile_base, vflip_off); else - render_tile_Nbpp(layer, dest_ptr, tile_hoff, stop, tile, tile_base, vflip_off); + render_tile_Nbpp( + bg_comb, px_comb, dest_ptr, tile_hoff, stop, tile, tile_base, vflip_off); dest_ptr += todraw; end -= todraw; @@ -297,9 +308,11 @@ static void render_scanline_text(u32 layer, for (i = 0; i < todraw; i++) { u16 tile = eswap16(*map_ptr++); if (tile & 0x400) // Tile horizontal flip - render_tile_Nbpp(layer, &dest_ptr[i * 8], 0, 8, tile, tile_base, vflip_off); + render_tile_Nbpp( + bg_comb, px_comb, &dest_ptr[i * 8], 0, 8, tile, tile_base, vflip_off); else - render_tile_Nbpp(layer, &dest_ptr[i * 8], 0, 8, tile, tile_base, vflip_off); + render_tile_Nbpp( + bg_comb, px_comb, &dest_ptr[i * 8], 0, 8, tile, tile_base, vflip_off); } end -= todraw * 8; @@ -318,9 +331,11 @@ static void render_scanline_text(u32 layer, for (i = 0; i < todraw; i++) { u16 tile = eswap16(*map_ptr++); if (tile & 0x400) // Tile horizontal flip - render_tile_Nbpp(layer, &dest_ptr[i * 8], 0, 8, tile, tile_base, vflip_off); + render_tile_Nbpp( + bg_comb, px_comb, &dest_ptr[i * 8], 0, 8, tile, tile_base, vflip_off); else - render_tile_Nbpp(layer, &dest_ptr[i * 8], 0, 8, tile, tile_base, vflip_off); + render_tile_Nbpp( + bg_comb, px_comb, &dest_ptr[i * 8], 0, 8, tile, tile_base, vflip_off); } end -= todraw * 8; @@ -331,16 +346,19 @@ static void render_scanline_text(u32 layer, if (end) { u16 tile = eswap16(*map_ptr++); if (tile & 0x400) // Tile horizontal flip - render_tile_Nbpp(layer, dest_ptr, 0, end, tile, tile_base, vflip_off); + render_tile_Nbpp( + bg_comb, px_comb, dest_ptr, 0, end, tile, tile_base, vflip_off); else - render_tile_Nbpp(layer, dest_ptr, 0, end, tile, tile_base, vflip_off); + render_tile_Nbpp( + bg_comb, px_comb, dest_ptr, 0, end, tile, tile_base, vflip_off); } } } template static inline void render_pixel_8bpp(u32 layer, - dsttype *dest_ptr, u32 px, u32 py, const u8 *tile_base, const u8 *map_base, u32 map_size + dsttype *dest_ptr, u32 px, u32 py, u32 bg_comb, u32 px_comb, + const u8 *tile_base, const u8 *map_base, u32 map_size ) { // Pitch represents the log2(number of tiles per row) (from 16 to 128) u32 map_pitch = map_size + 4; @@ -351,40 +369,32 @@ static inline void render_pixel_8bpp(u32 layer, // Read the 8bit color within the tile. u8 pval = tile_ptr[(px % 8) + ((py % 8) * 8)]; - // Calculate combine masks. These store 2 bits of info: 1st and 2nd target. - // If set, the current pixel belongs to a layer that is 1st or 2nd target. - u32 bg_comb = color_flags(5); - u32 px_comb = color_flags(layer); - - // Combine mask is different if we are rendering the backdrop color - u16 combflg = pval ? px_comb : bg_comb; // Alhpa mode stacks previous value (unless rendering the first layer) - if (isbase || pval) { + if (pval) { if (rdtype == FULLCOLOR) *dest_ptr = palette_ram_converted[pval]; else if (rdtype == INDXCOLOR) - *dest_ptr = pval | combflg; // Add combine flags + *dest_ptr = pval | px_comb; // Add combine flags else if (rdtype == STCKCOLOR) - // Stack pixels on top of the pixel value and combine flags - *dest_ptr = pval | combflg | ((isbase ? bg_comb : *dest_ptr) << 16); + // Stack pixels. If base, stack the base pixel. + *dest_ptr = pval | px_comb | ((isbase ? bg_comb : *dest_ptr) << 16); + } + else if (isbase) { + // Transparent pixel, but we are base layer, so render background. + if (rdtype == FULLCOLOR) + *dest_ptr = palette_ram_converted[0]; + else + *dest_ptr = 0 | bg_comb; // Just backdrop color and combine flags } } template -static inline void render_bdrop_pixel_8bpp(dsttype *dest_ptr) { - // Calculate combine masks. These store 2 bits of info: 1st and 2nd target. - // If set, the current pixel belongs to a layer that is 1st or 2nd target. - u32 bg_comb = color_flags(5); - u32 pval = 0; - +static inline void render_bdrop_pixel_8bpp(dsttype *dest_ptr, u32 bg_comb) { // Alhpa mode stacks previous value (unless rendering the first layer) if (rdtype == FULLCOLOR) - *dest_ptr = palette_ram_converted[pval]; - else if (rdtype == INDXCOLOR) - *dest_ptr = pval | bg_comb; // Add combine flags - else if (rdtype == STCKCOLOR) - // Stack pixels on top of the pixel value and combine flags - *dest_ptr = pval | bg_comb; + *dest_ptr = palette_ram_converted[0]; + else + *dest_ptr = 0 | bg_comb; } // Affine background rendering logic. @@ -395,6 +405,10 @@ static inline void render_affine_background( u32 layer, u32 start, u32 cnt, const u8 *map_base, u32 map_size, const u8 *tile_base, dsttype *dst_ptr) { + // Backdrop and current layer combine bits. + u32 bg_comb = color_flags(5); + u32 px_comb = color_flags(layer); + s32 dx = (s16)read_ioreg(REG_BGxPA(layer)); s32 dy = (s16)read_ioreg(REG_BGxPC(layer)); @@ -412,7 +426,8 @@ static inline void render_affine_background( // Lookup pixel and draw it. render_pixel_8bpp( - layer, dst_ptr++, pixel_x, pixel_y, tile_base, map_base, map_size); + layer, dst_ptr++, pixel_x, pixel_y, bg_comb, px_comb, + tile_base, map_base, map_size); // Move to the next pixel, update coords accordingly source_x += dx; @@ -438,7 +453,7 @@ static inline void render_affine_background( // Draw a backdrop pixel if we are the base layer. if (isbase) - render_bdrop_pixel_8bpp(dst_ptr); + render_bdrop_pixel_8bpp(dst_ptr, bg_comb); dst_ptr++; source_x += dx; @@ -457,7 +472,8 @@ static inline void render_affine_background( // Lookup pixel and draw it. render_pixel_8bpp( - layer, dst_ptr++, pixel_x, pixel_y, tile_base, map_base, map_size); + layer, dst_ptr++, pixel_x, pixel_y, bg_comb, px_comb, + tile_base, map_base, map_size); // Move to the next pixel, update coords accordingly cnt--; @@ -471,7 +487,7 @@ static inline void render_affine_background( // Only necessary for the base layer, otherwise we can safely finish. if (isbase) while (cnt--) - render_bdrop_pixel_8bpp(dst_ptr++); + render_bdrop_pixel_8bpp(dst_ptr++, bg_comb); } } @@ -651,7 +667,7 @@ typedef struct { // Renders a tile row (8 pixels) for a regular (non-affine) object/sprite. template -static inline void render_obj_tile_Nbpp(bool forcebld, +static inline void render_obj_tile_Nbpp(u32 px_comb, dsttype *dest_ptr, u32 start, u32 end, u32 tile_offset, u16 palette ) { // tile_ptr points to the tile row (32 or 64 bits depending on bpp). @@ -660,10 +676,6 @@ static inline void render_obj_tile_Nbpp(bool forcebld, // Note that the last VRAM bank wrap around, hence the offset aliasing const u8* tile_ptr = &vram[0x10000 + (tile_offset & 0x7FFF)]; - // Calculate combine masks. These store 2 bits of info: 1st and 2nd target. - // If set, the current pixel belongs to a layer that is 1st or 2nd target. - u32 px_comb = (forcebld ? 0x800 : 0) | color_flags(4); - if (is8bpp) { // Each byte is a color, mapped to a palete. 8 bytes can be read as 64bit u64 tilepix = eswap64(*(u64*)tile_ptr); @@ -717,9 +729,9 @@ static inline void render_obj_tile_Nbpp(bool forcebld, // Renders a regular sprite (non-affine) row to screen. // delta_x is the object X coordinate referenced from the window start. // cnt is the maximum number of pixels to draw, honoring window, obj width, etc. -template +template static void render_object( - s32 delta_x, u32 cnt, stype *dst_ptr, u32 tile_offset, u16 palette + s32 delta_x, u32 cnt, stype *dst_ptr, u32 tile_offset, u32 px_comb, u16 palette ) { // Tile size in bytes for each mode u32 tile_bsize = is8bpp ? tile_size_8bpp : tile_size_4bpp; @@ -738,7 +750,7 @@ static void render_object( if (tile_off) { u32 residual = 8 - tile_off; // Pixel count to complete the first tile u32 maxpix = MIN(residual, cnt); - render_obj_tile_Nbpp(forcebld, dst_ptr, tile_off, tile_off + maxpix, tile_offset, palette); + render_obj_tile_Nbpp(px_comb, dst_ptr, tile_off, tile_off + maxpix, tile_offset, palette); // Move to the next tile tile_offset += tile_size_off; @@ -755,7 +767,7 @@ static void render_object( s32 num_tiles = cnt / 8; while (num_tiles--) { // Render full tiles - render_obj_tile_Nbpp(forcebld, dst_ptr, 0, 8, tile_offset, palette); + render_obj_tile_Nbpp(px_comb, dst_ptr, 0, 8, tile_offset, palette); tile_offset += tile_size_off; dst_ptr += 8; } @@ -763,15 +775,15 @@ static void render_object( // Render any partial tile on the end cnt = cnt % 8; if (cnt) - render_obj_tile_Nbpp(forcebld, dst_ptr, 0, cnt, tile_offset, palette); + render_obj_tile_Nbpp(px_comb, dst_ptr, 0, cnt, tile_offset, palette); } // Renders an affine sprite row to screen. -template +template static void render_affine_object( const t_sprite *obji, const t_affp *affp, bool is_double, - u32 start, u32 end, stype *dst_ptr, u32 base_tile, u16 palette + u32 start, u32 end, stype *dst_ptr, u32 base_tile, u32 pxcomb, u16 palette ) { // Tile size in bytes for each mode const u32 tile_bsize = is8bpp ? tile_size_8bpp : tile_size_4bpp; @@ -836,8 +848,6 @@ static void render_affine_object( return; // Lookup pixel and draw it. - //render_pixel_8bpp( - // layer, dst_ptr++, pixel_x, pixel_y, tile_base, map_base, map_size); u8 pixval; if (is8bpp) { // We lookup the byte directly and render it. @@ -862,18 +872,17 @@ static void render_affine_object( } // Render the pixel value - u32 comb = (forcebld ? 0x800 : 0) | color_flags(4); if (pixval) { if (rdtype == FULLCOLOR) *dst_ptr = palette_ram_converted[pixval | palette | 0x100]; else if (rdtype == INDXCOLOR) - *dst_ptr = pixval | palette | 0x100 | comb; // Add combine flags + *dst_ptr = pixval | palette | 0x100 | pxcomb; // Add combine flags else if (rdtype == STCKCOLOR) { // Stack pixels on top of the pixel value and combine flags if (*dst_ptr & 0x100) - *dst_ptr = pixval | palette | 0x100 | comb | ((*dst_ptr) & 0xFFFF0000); + *dst_ptr = pixval | palette | 0x100 | pxcomb | ((*dst_ptr) & 0xFFFF0000); else - *dst_ptr = pixval | palette | 0x100 | comb | ((*dst_ptr) << 16); // Stack pixels + *dst_ptr = pixval | palette | 0x100 | pxcomb | ((*dst_ptr) << 16); // Stack pixels } else if (rdtype == PIXCOPY) *dst_ptr = dst_ptr[240]; @@ -889,9 +898,9 @@ static void render_affine_object( } // Renders a single sprite on the current scanline -template +template inline static void render_sprite( - const t_sprite *obji, bool is_affine, u32 start, u32 end, stype *scanline + const t_sprite *obji, bool is_affine, u32 start, u32 end, stype *scanline, u32 pxcomb ) { s32 vcount = read_ioreg(REG_VCOUNT); bool obj1dmap = read_ioreg(REG_DISPCNT) & 0x40; @@ -905,9 +914,11 @@ inline static void render_sprite( u16 pal = is8bpp ? 0 : ((obji->attr2 >> 8) & 0xF0); if (affp->dy == 0) // No rotation happening (just scale) - render_affine_object(obji, affp, obji->is_double, start, end, scanline, base_tile, pal); + render_affine_object( + obji, affp, obji->is_double, start, end, scanline, base_tile, pxcomb, pal); else // Full rotation and scaling - render_affine_object(obji, affp, obji->is_double, start, end, scanline, base_tile, pal); + render_affine_object( + obji, affp, obji->is_double, start, end, scanline, base_tile, pxcomb, pal); } else { // The object could be out of the window, check and skip. if (obji->obj_x >= (signed)end || obji->obj_x + obji->obj_w <= (signed)start) @@ -944,9 +955,11 @@ inline static void render_sprite( u16 pal = is8bpp ? 0 : ((obji->attr2 >> 8) & 0xF0); if (hflip) - render_object(obj_x_offset, max_draw, &scanline[start], tile_offset, pal); + render_object( + obj_x_offset, max_draw, &scanline[start], tile_offset, pxcomb, pal); else - render_object(obj_x_offset, max_draw, &scanline[start], tile_offset, pal); + render_object( + obj_x_offset, max_draw, &scanline[start], tile_offset, pxcomb, pal); } } @@ -1014,17 +1027,17 @@ static void render_scanline_objs( render_conditional_bitmap(sec_start, sec_end, tmp_ptr, obj_enable); } - if (is_8bpp) { - if (forcebld) - render_sprite(&obji, is_affine, start, end, scanline); - else - render_sprite(&obji, is_affine, start, end, scanline); - } else { - if (forcebld) - render_sprite(&obji, is_affine, start, end, scanline); - else - render_sprite(&obji, is_affine, start, end, scanline); - } + // Calculate combine masks. These store 2 bits of info: 1st and 2nd target. + // If set, the current pixel belongs to a layer that is 1st or 2nd target. + // For ST-objs, we set an extra bit, for later blending. + u32 pxcomb = (forcebld ? 0x800 : 0) | color_flags(4); + + if (is_8bpp) + render_sprite( + &obji, is_affine, start, end, scanline, pxcomb); + else + render_sprite( + &obji, is_affine, start, end, scanline, pxcomb); } }