350 lines
12 KiB
ArmAsm
350 lines
12 KiB
ArmAsm
|
@@
|
||
|
@@ Copyright (C) 2012 Roman Pauer
|
||
|
@@
|
||
|
@@ Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||
|
@@ this software and associated documentation files (the "Software"), to deal in
|
||
|
@@ the Software without restriction, including without limitation the rights to
|
||
|
@@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||
|
@@ of the Software, and to permit persons to whom the Software is furnished to do
|
||
|
@@ so, subject to the following conditions:
|
||
|
@@
|
||
|
@@ The above copyright notice and this permission notice shall be included in all
|
||
|
@@ copies or substantial portions of the Software.
|
||
|
@@
|
||
|
@@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||
|
@@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||
|
@@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||
|
@@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||
|
@@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||
|
@@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||
|
@@ SOFTWARE.
|
||
|
@@
|
||
|
|
||
|
.arm
|
||
|
|
||
|
#include "neon_scale3x.Sinc"
|
||
|
#include "neon_normalxx.Sinc"
|
||
|
|
||
|
@ Entry points exported from this file; each label is defined further down.
.global neon_scale3x_8_8
|
||
|
.global neon_scale3x_16_16
|
||
|
.global neon_scale3x_8_16
|
||
|
|
||
|
@ On ARM targets GNU as treats the .align operand as a power of two,
@ so this requests a 2^4 = 16-byte boundary for the code that follows.
.align 4
|
||
|
neon_scale3x_8_8:

@ Scale3x upscale of an image made of 8-bit pixels.  The per-row work is
@ done by the neon_scale3x_8_8_line macro (from neon_scale3x.Sinc); this
@ routine sets up the row pointers and drives it over first / middle / last
@ source rows.  Each source row feeds three destination rows
@ (dst, dst + dststride, dst + 2 * dststride).
@
@ In:
@   r0     = const uint8_t *src
@   r1     = uint8_t *dst
@   r2     = unsigned int width (pixels)
@   r3     = unsigned int srcstride (bytes)
@   [sp]   = unsigned int dststride (bytes)
@   [sp+4] = unsigned int height
@   lr     = return address
@
@ Preserved: r4-r11 (push/pop) and q4-q7 (spilled to a 32-byte aligned
@ 64-byte area on the stack).
@
@ Precondition: height >= 3.  The middle-row loop below is entered
@ unconditionally and counts (height - 2) down to zero, so a smaller height
@ makes the counter wrap instead of terminating.

    ldr     ip, [sp]                @ ip = dststride
    push    {r4-r11}
    ldr     r9, [sp, #(9*4)]        @ r9 = height (8 pushed words + dststride sit below it)
    sub     r4, r0, r3              @ r4 = src - srcstride
    mov     r11, sp                 @ oldsp = sp
    add     r5, r0, r3              @ r5 = src + srcstride
    bic     sp, sp, #31             @ align sp to 32 bytes
    add     r6, r1, ip              @ r6 = dst + dststride
    sub     sp, sp, #64             @ sp -= 64 (save area for q4-q7)
    sub     r3, r3, r2              @ r3 = srcstride - width
    vst1.64 {d8-d11}, [sp:256]      @ save q4,q5
    add     r7, r1, ip, lsl #1      @ r7 = dst + 2 * dststride
    add     r8, sp, #32             @ r8 = sp + 32
    sub     ip, ip, r2              @ ip = dststride - width
    vst1.64 {d12-d15}, [r8:256]     @ save q6,q7
    add     ip, ip, ip, lsl #1      @ ip = 3 * dststride - 3 * width
    mov     r8, r2                  @ r8 = width
    sub     r9, r9, #2              @ r9 = height - 2

@ Register roles from here on:
@   r0  = src (current row)
@   r1  = dst
@   r2  = width
@   r3  = srcdiff (srcstride - width)
@   r4  = src - srcstride (row above)
@   r5  = src + srcstride (row below)
@   r6  = dst + dststride
@   r7  = dst + 2 * dststride
@   r8  = counter (reloaded with width before every line)
@   r9  = remaining middle rows (starts at height - 2)
@   r10 = tmpreg
@   r11 = oldsp
@   ip  = dstdiff (3 * dststride - 3 * width)

@ first line
    neon_scale3x_8_8_line first, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0

    @ Step all row pointers to the start of the next row; the diffs are
    @ stride minus the bytes the line macro is expected to have consumed.
    add     r0, r0, r3
    add     r4, r4, r3
    add     r5, r5, r3
    add     r1, r1, ip
    add     r6, r6, ip
    add     r7, r7, ip

@ middle lines
101:
    mov     r8, r2                  @ counter = width

    neon_scale3x_8_8_line middle, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0

    subs    r9, r9, #1              @ one middle row done; flags feed the bne below
    add     r0, r0, r3
    add     r4, r4, r3
    add     r5, r5, r3
    add     r1, r1, ip
    add     r6, r6, ip
    add     r7, r7, ip
    bne     101b

@ last line
    mov     r8, r2                  @ counter = width

    neon_scale3x_8_8_line last, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0

    @ Reload q4-q7 while the save area is still at/above sp.  Restoring sp
    @ first would leave the area below sp, where a signal or interrupt frame
    @ may overwrite it before the load executes.
    add     ip, sp, #32             @ ip = sp + 32
    vld1.64 {d8-d11}, [sp:256]      @ restore q4,q5
    vld1.64 {d12-d15}, [ip:256]     @ restore q6,q7
    mov     sp, r11                 @ sp = oldsp
    pop     {r4-r11}
    bx      lr

@ end procedure neon_scale3x_8_8
|
||
|
|
||
|
|
||
|
neon_scale3x_16_16:

@ Scale3x upscale of an image made of 16-bit pixels.  The per-row work is
@ done by the neon_scale3x_16_16_line macro (from neon_scale3x.Sinc); this
@ routine sets up the row pointers and drives it over first / middle / last
@ source rows.  Each source row feeds three destination rows
@ (dst, dst + dststride, dst + 2 * dststride).
@
@ In:
@   r0     = const uint16_t *src
@   r1     = uint16_t *dst
@   r2     = unsigned int width (pixels)
@   r3     = unsigned int srcstride (bytes)
@   [sp]   = unsigned int dststride (bytes)
@   [sp+4] = unsigned int height
@   lr     = return address
@
@ Preserved: r4-r11 (push/pop) and q4-q7 (spilled to a 32-byte aligned
@ 64-byte area on the stack).
@
@ Precondition: height >= 3.  The middle-row loop below is entered
@ unconditionally and counts (height - 2) down to zero, so a smaller height
@ makes the counter wrap instead of terminating.

    ldr     ip, [sp]                @ ip = dststride
    push    {r4-r11}
    ldr     r9, [sp, #(9*4)]        @ r9 = height (8 pushed words + dststride sit below it)
    sub     r4, r0, r3              @ r4 = src - srcstride
    mov     r11, sp                 @ oldsp = sp
    add     r5, r0, r3              @ r5 = src + srcstride
    bic     sp, sp, #31             @ align sp to 32 bytes
    add     r6, r1, ip              @ r6 = dst + dststride
    sub     sp, sp, #64             @ sp -= 64 (save area for q4-q7)
    sub     r3, r3, r2, lsl #1      @ r3 = srcstride - 2 * width
    vst1.64 {d8-d11}, [sp:256]      @ save q4,q5
    add     r7, r1, ip, lsl #1      @ r7 = dst + 2 * dststride
    add     r8, sp, #32             @ r8 = sp + 32
    sub     ip, ip, r2, lsl #1      @ ip = dststride - 2 * width
    vst1.64 {d12-d15}, [r8:256]     @ save q6,q7
    add     ip, ip, ip, lsl #1      @ ip = 3 * dststride - 6 * width
    mov     r8, r2                  @ r8 = width
    sub     r9, r9, #2              @ r9 = height - 2

@ Register roles from here on:
@   r0  = src (current row)
@   r1  = dst
@   r2  = width
@   r3  = srcdiff (srcstride - 2 * width)
@   r4  = src - srcstride (row above)
@   r5  = src + srcstride (row below)
@   r6  = dst + dststride
@   r7  = dst + 2 * dststride
@   r8  = counter (reloaded with width before every line)
@   r9  = remaining middle rows (starts at height - 2)
@   r10 = tmpreg
@   r11 = oldsp
@   ip  = dstdiff (3 * dststride - 6 * width)

@ first line
    neon_scale3x_16_16_line first, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0

    @ Step all row pointers to the start of the next row; the diffs are
    @ stride minus the bytes the line macro is expected to have consumed.
    add     r0, r0, r3
    add     r4, r4, r3
    add     r5, r5, r3
    add     r1, r1, ip
    add     r6, r6, ip
    add     r7, r7, ip

@ middle lines
101:
    mov     r8, r2                  @ counter = width

    neon_scale3x_16_16_line middle, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0

    subs    r9, r9, #1              @ one middle row done; flags feed the bne below
    add     r0, r0, r3
    add     r4, r4, r3
    add     r5, r5, r3
    add     r1, r1, ip
    add     r6, r6, ip
    add     r7, r7, ip
    bne     101b

@ last line
    mov     r8, r2                  @ counter = width

    neon_scale3x_16_16_line last, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0

    @ Reload q4-q7 while the save area is still at/above sp.  Restoring sp
    @ first would leave the area below sp, where a signal or interrupt frame
    @ may overwrite it before the load executes.
    add     ip, sp, #32             @ ip = sp + 32
    vld1.64 {d8-d11}, [sp:256]      @ restore q4,q5
    vld1.64 {d12-d15}, [ip:256]     @ restore q6,q7
    mov     sp, r11                 @ sp = oldsp
    pop     {r4-r11}
    bx      lr

@ end procedure neon_scale3x_16_16
|
||
|
|
||
|
|
||
|
neon_scale3x_8_16:

@ Scale3x upscale of an 8-bit source through a palette into a 16-bit
@ destination.  Source rows are first expanded into temporary rows on the
@ stack with neon_normal1x_8_16_line (neon_normalxx.Sinc), then
@ neon_scale3x_16_16_line (neon_scale3x.Sinc) is run over three consecutive
@ temporary rows.
@
@ In:
@   r0     = const uint8_t *src
@   r1     = uint8_t *dst  (rows are advanced as 2 bytes per output pixel:
@                           dstdiff = 3 * dststride - 6 * width)
@   r2     = const uint32_t *palette
@   r3     = unsigned int width (pixels)
@   [sp]   = unsigned int srcstride (bytes)
@   [sp+4] = unsigned int dststride (bytes)
@   [sp+8] = unsigned int height
@   lr     = return address
@
@ Preserved: r4-r11, lr (push/pop) and q4-q7 (spilled to the stack frame).
@
@ Stack use: three temporary rows of 2 * width bytes each (plus alignment
@ padding) and a 100-byte frame (36 bytes of locals + 64 bytes for q4-q7).
@
@ Precondition: height >= 3.  The main loop is entered unconditionally and
@ counts (height - 2) down to zero.
@
@ NOTE(review): the scale3x line macro is invoked here with 1 as its
@ second-to-last argument (the other entry points pass 0).  Each temporary
@ row starts at (32-byte aligned address) - 2 * width; confirm against
@ neon_scale3x.Sinc which widths satisfy the alignment that argument implies.

@ three temporary lines

    ldr     ip, [sp]                @ ip = srcstride
    push    {r4-r11,lr}
    ldr     r4, [sp, #(4*10)]       @ r4 = dststride (9 pushed words + srcstride sit below it)
    ldr     r5, [sp, #(4*11)]       @ r5 = height
    mov     r6, sp                  @ r6 = sp
    sub     ip, ip, r3              @ ip = srcstride - width
    bic     sp, sp, #31             @ align sp to 32 bytes
    sub     r7, r4, r3, lsl #1      @ r7 = dststride - 2 * width
    sub     sp, sp, r3, lsl #1      @ sp -= 2 * width
    sub     r5, r5, #2              @ height -= 2
    mov     r10, sp                 @ tmpline3 = sp
    add     r7, r7, r7, lsl #1      @ r7 = 3 * dststride - 6 * width
    bic     sp, sp, #31             @ align sp to 32 bytes
    sub     sp, sp, r3, lsl #1      @ sp -= 2 * width
    mov     r11, sp                 @ tmpline2 = sp
    bic     sp, sp, #31             @ align sp to 32 bytes
    sub     sp, sp, r3, lsl #1      @ sp -= 2 * width
    mov     lr, sp                  @ tmpline1 = sp
    bic     sp, sp, #31             @ align sp to 32 bytes

    @ Allocate the frame before touching it, so the q4-q7 save area is never
    @ below sp (memory below sp may be overwritten by a signal or interrupt
    @ frame).  sp + 36 and sp + 68 are the 32-byte aligned addresses 64 and
    @ 32 bytes below the aligned sp computed just above.
    sub     sp, sp, #(36 + 64)      @ sp -= (36 + 64)
    add     r8, sp, #36             @ r8 = sp + 36
    vst1.64 {d8-d11}, [r8:256]      @ save q4,q5
    add     r9, sp, #(36 + 32)      @ r9 = sp + 68
    vst1.64 {d12-d15}, [r9:256]     @ save q6,q7

    str     r6, [sp]                @ oldsp = r6
    str     r5, [sp, #4]            @ height = r5
    str     ip, [sp, #8]            @ srcdiff = ip
    str     r7, [sp, #12]           @ dstdiff = r7
    str     r4, [sp, #16]           @ dststride = r4
    str     lr, [sp, #20]           @ tmpline1 = lr
    str     r11, [sp, #24]          @ tmpline2 = r11
    str     r10, [sp, #28]          @ tmpline3 = r10
    str     r3, [sp, #32]           @ width = r3

@ Register roles from here on:
@   r0  = src
@   r1  = dst
@   r2  = palette
@   r3  = counter
@   r4  = dst2
@   r5  = dst3
@
@   r11 = bufptr1
@   ip  = bufptr2
@   lr  = bufptr3
@
@ Frame layout:
@   [sp]      = oldsp
@   [sp, #4]  = height (remaining loop iterations, starts at height - 2)
@   [sp, #8]  = srcdiff (srcstride - width)
@   [sp, #12] = dstdiff (3 * dststride - 6 * width)
@   [sp, #16] = dststride
@   [sp, #20] = tmpline1
@   [sp, #24] = tmpline2
@   [sp, #28] = tmpline3
@   [sp, #32] = width
@   [sp, #36] = saved q4,q5
@   [sp, #68] = saved q6,q7

@ At this point: lr = tmpline1, r3 = counter (= width)

@ first line
    neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

    ldr     r7, [sp, #8]            @ r7 = srcdiff
    ldr     r3, [sp, #32]           @ counter = width
    ldr     lr, [sp, #24]           @ bufptr3 = tmpline2
    add     r0, r0, r7              @ src += srcdiff

@ second line
    neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

    ldr     r9, [sp, #16]           @ r9 = dststride
    ldr     r3, [sp, #32]           @ counter = width
    ldr     ip, [sp, #20]           @ bufptr2 = tmpline1
    ldr     lr, [sp, #24]           @ bufptr3 = tmpline2
    add     r4, r1, r9              @ dst2 = dst + dststride
    add     r5, r1, r9, lsl #1      @ dst3 = dst + 2 * dststride

@ first temporary line
    @ r11 (bufptr1) is not loaded here: there is no row above the first one.
    neon_scale3x_16_16_line first, r11, ip, lr, r3, r1, r4, r5, r6, 1, 0

    ldr     r7, [sp, #8]            @ r7 = srcdiff
    ldr     r8, [sp, #12]           @ r8 = dstdiff
    ldr     r3, [sp, #32]           @ counter = width
    ldr     lr, [sp, #28]           @ bufptr3 = tmpline3
    add     r0, r0, r7              @ src += srcdiff
    add     r1, r1, r8              @ dst += dstdiff

100:

@ line n+1
    neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip

    ldr     r9, [sp, #16]           @ r9 = dststride
    ldr     r11, [sp, #20]          @ bufptr1 = tmpline1
    ldr     ip, [sp, #24]           @ bufptr2 = tmpline2
    ldr     lr, [sp, #28]           @ bufptr3 = tmpline3
    add     r4, r1, r9              @ dst2 = dst + dststride
    ldr     r3, [sp, #32]           @ counter = width
    add     r5, r1, r9, lsl #1      @ dst3 = dst + 2 * dststride
    @ Rotate the three row buffers for the next iteration: the oldest row
    @ (bufptr1) becomes the slot the next converted source row is written to.
    str     r11, [sp, #28]          @ tmpline3 = bufptr1
    str     ip, [sp, #20]           @ tmpline1 = bufptr2
    str     lr, [sp, #24]           @ tmpline2 = bufptr3

@ temporary line n
    neon_scale3x_16_16_line middle, r11, ip, lr, r3, r1, r4, r5, r6, 1, 0

    ldr     r6, [sp, #4]            @ r6 = height
    ldr     r7, [sp, #8]            @ r7 = srcdiff
    ldr     r8, [sp, #12]           @ r8 = dstdiff
    ldr     r3, [sp, #32]           @ counter = width
    subs    r6, r6, #1              @ height-- ; flags feed the bne below
    ldr     lr, [sp, #28]           @ bufptr3 = tmpline3
    add     r0, r0, r7              @ src += srcdiff
    add     r1, r1, r8              @ dst += dstdiff
    str     r6, [sp, #4]            @ height = r6
    bne     100b

    ldr     r9, [sp, #16]           @ r9 = dststride
    ldr     r11, [sp, #20]          @ bufptr1 = tmpline1
    ldr     ip, [sp, #24]           @ bufptr2 = tmpline2
    add     r4, r1, r9              @ dst2 = dst + dststride
    add     r5, r1, r9, lsl #1      @ dst3 = dst + 2 * dststride

@ last temporary line
    neon_scale3x_16_16_line last, r11, ip, lr, r3, r1, r4, r5, r6, 1, 0

    @ Reload q4-q7 before sp is switched back to the caller's value; once sp
    @ is restored the save area lies below it and may be overwritten by a
    @ signal or interrupt frame.
    add     r6, sp, #36             @ r6 = sp + 36
    vld1.64 {d8-d11}, [r6:256]      @ restore q4,q5
    add     ip, r6, #32             @ ip = r6 + 32
    vld1.64 {d12-d15}, [ip:256]     @ restore q6,q7
    ldr     sp, [sp]                @ sp = oldsp
    pop     {r4-r11,lr}
    bx      lr

@ end procedure neon_scale3x_8_16
|
||
|
|