Take out NEON CPU filters

This commit is contained in:
twinaphex 2014-07-14 04:59:32 +02:00
parent 30b95e686b
commit a8ff99c68b
11 changed files with 0 additions and 4019 deletions

View File

@ -1,337 +0,0 @@
@@
@@ Copyright (C) 2012 Roman Pauer
@@
@@ Permission is hereby granted, free of charge, to any person obtaining a copy of
@@ this software and associated documentation files (the "Software"), to deal in
@@ the Software without restriction, including without limitation the rights to
@@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
@@ of the Software, and to permit persons to whom the Software is furnished to do
@@ so, subject to the following conditions:
@@
@@ The above copyright notice and this permission notice shall be included in all
@@ copies or substantial portions of the Software.
@@
@@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
@@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
@@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
@@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
@@ SOFTWARE.
@@
@ Assemble everything below as 32-bit ARM (not Thumb) instructions.
.arm
@ Macro libraries pulled in through the C preprocessor. The routines in this
@ file expand neon_eagle2x_8_8_line, neon_eagle2x_16_16_line and
@ neon_normal1x_8_16_line, which are expected to come from these includes.
#include "neon_eagle2x.Sinc"
#include "neon_normalxx.Sinc"
@ Entry points exported to the linker.
.global neon_eagle2x_8_8
.global neon_eagle2x_16_16
.global neon_eagle2x_8_16
@ On ARM targets GNU as reads the .align operand as a power of two, so the
@ first routine starts on a 16-byte boundary.
.align 4
neon_eagle2x_8_8:
@ void neon_eagle2x_8_8(const uint8_t *src, uint8_t *dst, unsigned int width,
@                       unsigned int srcstride, unsigned int dststride,
@                       unsigned int height)
@
@ Eagle 2x upscale of an 8-bit image. Every source row produces two
@ destination rows of 2 * width bytes each. The first and the last row are
@ handled by dedicated macro variants, all rows in between by the loop at 101.
@
@ r0 = const uint8_t *src
@ r1 = uint8_t *dst
@ r2 = unsigned int width (pixels)
@ r3 = unsigned int srcstride (bytes)
@ [sp] = unsigned int dststride (bytes)
@ [sp+4] = unsigned int height
@ lr = return address
@
@ Preconditions: height >= 3. The middle-row loop is bottom-tested with a
@ counter of height - 2, so a smaller height makes the counter wrap around.
@ The line macro imposes its own constraints on width.
@ Preserved: r4-r10 (pushed) and q4-q7 (spilled to a 32-byte aligned scratch
@ area below the pushed registers). Everything else used here is scratch.
    ldr ip, [sp] @ ip = dststride
    push {r4-r10} @ 7 words, so height moves from [sp+4] to [sp+32]
    ldr r9, [sp, #(8*4)] @ r9 = height
    sub r4, r0, r3 @ r4 = src - srcstride
    mov r10, sp @ oldsp = sp
    add r5, r0, r3 @ r5 = src + srcstride
    bic sp, sp, #31 @ align sp to 32 bytes
    add r6, r1, ip @ r6 = dst + dststride
    sub sp, sp, #64 @ sp -= 64
    sub r3, r3, r2 @ r3 = srcstride - width
    vst1.64 {d8-d11}, [sp:256] @ save q4,q5
    add r7, sp, #32 @ r7 = sp + 32
    sub ip, ip, r2 @ ip = dststride - width
    vst1.64 {d12-d15}, [r7:256] @ save q6,q7
    lsl ip, #1 @ ip = 2 * dststride - 2 * width
    mov r7, r2 @ r7 = width
    sub r9, r9, #2 @ r9 = height - 2
@ r0 = src
@ r1 = dst
@ r2 = width
@ r3 = srcdiff (srcstride - width)
@ r4 = src - srcstride
@ r5 = src + srcstride
@ r6 = dst + dststride
@ r7 = counter
@ r8 = tmpreg
@ r9 = height
@ r10 = oldsp
@ ip = dstdiff (2 * dststride - 2 * width)
@ first line
    neon_eagle2x_8_8_line first, r4, r0, r5, r7, r1, r6, r8, 0, 0
@ the macro advanced every pointer by one row of pixels, so adding the
@ diffs moves each of them to the start of its next row
    add r0, r0, r3
    add r4, r4, r3
    add r5, r5, r3
    add r1, r1, ip
    add r6, r6, ip
@ middle lines
101:
    mov r7, r2 @ counter = width
    neon_eagle2x_8_8_line middle, r4, r0, r5, r7, r1, r6, r8, 0, 0
    subS r9, r9, #1 @ flags consumed by the bne below, the adds keep them
    add r0, r0, r3
    add r4, r4, r3
    add r5, r5, r3
    add r1, r1, ip
    add r6, r6, ip
    bne 101b
@ last line
    mov r7, r2 @ counter = width
    neon_eagle2x_8_8_line last, r4, r0, r5, r7, r1, r6, r8, 0, 0
@ Reload q4-q7 while sp still covers the scratch area. Restoring sp first
@ would leave the second load reading below sp, where an asynchronous signal
@ frame may already have overwritten the saved d12-d15.
    add ip, sp, #32 @ ip = sp + 32
    vld1.64 {d8-d11}, [sp:256] @ restore q4,q5
    vld1.64 {d12-d15}, [ip:256] @ restore q6,q7
    mov sp, r10 @ sp = oldsp
    pop {r4-r10}
    bx lr
@ end procedure neon_eagle2x_8_8
neon_eagle2x_16_16:
@ void neon_eagle2x_16_16(const uint16_t *src, uint16_t *dst,
@                         unsigned int width, unsigned int srcstride,
@                         unsigned int dststride, unsigned int height)
@
@ Eagle 2x upscale of a 16-bit image. Every source row produces two
@ destination rows of 2 * width pixels (4 * width bytes) each. The first and
@ the last row are handled by dedicated macro variants, all rows in between
@ by the loop at 101.
@
@ r0 = const uint16_t *src
@ r1 = uint16_t *dst
@ r2 = unsigned int width (pixels)
@ r3 = unsigned int srcstride (bytes)
@ [sp] = unsigned int dststride (bytes)
@ [sp+4] = unsigned int height
@ lr = return address
@
@ Preconditions: height >= 3. The middle-row loop is bottom-tested with a
@ counter of height - 2, so a smaller height makes the counter wrap around.
@ The line macro imposes its own constraints on width.
@ Preserved: r4-r10 (pushed) and q4-q7 (spilled to a 32-byte aligned scratch
@ area below the pushed registers). Everything else used here is scratch.
    ldr ip, [sp] @ ip = dststride
    push {r4-r10} @ 7 words, so height moves from [sp+4] to [sp+32]
    ldr r9, [sp, #(8*4)] @ r9 = height
    sub r4, r0, r3 @ r4 = src - srcstride
    mov r10, sp @ oldsp = sp
    add r5, r0, r3 @ r5 = src + srcstride
    bic sp, sp, #31 @ align sp to 32 bytes
    add r6, r1, ip @ r6 = dst + dststride
    sub sp, sp, #64 @ sp -= 64
    sub r3, r3, r2, lsl #1 @ r3 = srcstride - 2 * width
    vst1.64 {d8-d11}, [sp:256] @ save q4,q5
    add r7, sp, #32 @ r7 = sp + 32
    sub ip, ip, r2, lsl #1 @ ip = dststride - 2 * width
    vst1.64 {d12-d15}, [r7:256] @ save q6,q7
    lsl ip, #1 @ ip = 2 * dststride - 4 * width
    mov r7, r2 @ r7 = width
    sub r9, r9, #2 @ r9 = height - 2
@ r0 = src
@ r1 = dst
@ r2 = width
@ r3 = srcdiff (srcstride - 2 * width)
@ r4 = src - srcstride
@ r5 = src + srcstride
@ r6 = dst + dststride
@ r7 = counter
@ r8 = tmpreg
@ r9 = height
@ r10 = oldsp
@ ip = dstdiff (2 * dststride - 4 * width)
@ first line
    neon_eagle2x_16_16_line first, r4, r0, r5, r7, r1, r6, r8, 0, 0
@ the macro advanced every pointer by one row of pixels, so adding the
@ diffs moves each of them to the start of its next row
    add r0, r0, r3
    add r4, r4, r3
    add r5, r5, r3
    add r1, r1, ip
    add r6, r6, ip
@ middle lines
101:
    mov r7, r2 @ counter = width
    neon_eagle2x_16_16_line middle, r4, r0, r5, r7, r1, r6, r8, 0, 0
    subS r9, r9, #1 @ flags consumed by the bne below, the adds keep them
    add r0, r0, r3
    add r4, r4, r3
    add r5, r5, r3
    add r1, r1, ip
    add r6, r6, ip
    bne 101b
@ last line
    mov r7, r2 @ counter = width
    neon_eagle2x_16_16_line last, r4, r0, r5, r7, r1, r6, r8, 0, 0
@ Reload q4-q7 while sp still covers the scratch area. Restoring sp first
@ would leave the second load reading below sp, where an asynchronous signal
@ frame may already have overwritten the saved d12-d15.
    add ip, sp, #32 @ ip = sp + 32
    vld1.64 {d8-d11}, [sp:256] @ restore q4,q5
    vld1.64 {d12-d15}, [ip:256] @ restore q6,q7
    mov sp, r10 @ sp = oldsp
    pop {r4-r10}
    bx lr
@ end procedure neon_eagle2x_16_16
neon_eagle2x_8_16:
@ void neon_eagle2x_8_16(const uint8_t *src, uint16_t *dst,
@                        const uint32_t *palette, unsigned int width,
@                        unsigned int srcstride, unsigned int dststride,
@                        unsigned int height)
@
@ Eagle 2x upscale of an 8-bit paletted image into 16-bit pixels. Each source
@ row is first converted through the palette into one of three temporary
@ 16-bit rows on the stack. The 16-bit Eagle line macro then runs on those
@ temporary rows, which are rotated after every output row.
@
@ r0 = const uint8_t *src
@ r1 = uint16_t *dst
@ r2 = const uint32_t *palette
@ r3 = unsigned int width (pixels)
@ [sp] = unsigned int srcstride (bytes)
@ [sp+4] = unsigned int dststride (bytes)
@ [sp+8] = unsigned int height
@ lr = return address
@
@ Preconditions: height >= 3. The loop at 100 is bottom-tested with a counter
@ of height - 2, so a smaller height makes the counter wrap around.
@ The line macros impose their own constraints on width.
@ Preserved: r4-r11, lr (pushed) and q4-q7 (spilled into the local frame).
@ three temporary lines
    ldr ip, [sp] @ ip = srcstride
    push {r4-r11,lr} @ 9 words, so [sp+4] and [sp+8] move to [sp+40] and [sp+44]
    ldr r4, [sp, #(4*10)] @ r4 = dststride
    ldr r5, [sp, #(4*11)] @ r5 = height
    mov r6, sp @ r6 = sp
    sub ip, ip, r3 @ ip = srcstride - width
    bic sp, sp, #31 @ align sp to 32 bytes
    sub r7, r4, r3, lsl #1 @ r7 = dststride - 2 * width
    sub sp, sp, r3, lsl #1 @ sp -= 2 * width
    sub r5, r5, #2 @ height -= 2
    mov r10, sp @ tmpline3 = sp
    lsl r7, #1 @ r7 = 2 * dststride - 4 * width
    bic sp, sp, #31 @ align sp to 32 bytes
    sub sp, sp, r3, lsl #1 @ sp -= 2 * width
    mov r11, sp @ tmpline2 = sp
    bic sp, sp, #31 @ align sp to 32 bytes
    sub sp, sp, r3, lsl #1 @ sp -= 2 * width
    mov lr, sp @ tmpline1 = sp
    bic sp, sp, #31 @ align sp to 32 bytes
@ Reserve the whole frame before touching it. Storing q4-q7 below sp and
@ lowering sp afterwards would leave a window in which an asynchronous signal
@ frame could overwrite the freshly saved registers.
    sub sp, sp, #(36 + 64) @ sp -= (36 + 64)
    add r8, sp, #36 @ r8 = sp + 36 (32-byte aligned)
    vst1.64 {d8-d11}, [r8:256] @ save q4,q5
    add r9, sp, #(36 + 32) @ r9 = sp + 68 (32-byte aligned)
    vst1.64 {d12-d15}, [r9:256] @ save q6,q7
    str r6, [sp] @ oldsp = r6
    str r5, [sp, #4] @ height = r5
    str ip, [sp, #8] @ srcdiff = ip
    str r7, [sp, #12] @ dstdiff = r7
    str r4, [sp, #16] @ dststride = r4
    str lr, [sp, #20] @ tmpline1 = lr
    str r11, [sp, #24] @ tmpline2 = r11
    str r10, [sp, #28] @ tmpline3 = r10
    str r3, [sp, #32] @ width = r3
@ r0 = src
@ r1 = dst
@ r2 = palette
@ r3 = counter
@ r4 = dst2
@ r11 = bufptr1
@ ip = bufptr2
@ lr = bufptr3
@ [sp] = oldsp
@ [sp, #4] = height
@ [sp, #8] = srcdiff (srcstride - width)
@ [sp, #12] = dstdiff (2 * dststride - 4 * width)
@ [sp, #16] = dststride
@ [sp, #20] = tmpline1
@ [sp, #24] = tmpline2
@ [sp, #28] = tmpline3
@ [sp, #32] = width
@ [sp, #36] .. [sp, #99] = saved q4-q7
@ lr = tmpline1
@ r3 = counter
@ first line
    neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip
    ldr r7, [sp, #8] @ r7 = srcdiff
    ldr r3, [sp, #32] @ counter = width
    ldr lr, [sp, #24] @ bufptr3 = tmpline2
    add r0, r0, r7 @ src += srcdiff
@ second line
    neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip
    ldr r9, [sp, #16] @ r9 = dststride
    ldr r3, [sp, #32] @ counter = width
    ldr ip, [sp, #20] @ bufptr2 = tmpline1
    ldr lr, [sp, #24] @ bufptr3 = tmpline2
    add r4, r1, r9 @ dst2 = dst + dststride
@ first temporary line
    neon_eagle2x_16_16_line first, r11, ip, lr, r3, r1, r4, r5, 1, 0
    ldr r7, [sp, #8] @ r7 = srcdiff
    ldr r8, [sp, #12] @ r8 = dstdiff
    ldr r3, [sp, #32] @ counter = width
    ldr lr, [sp, #28] @ bufptr3 = tmpline3
    add r0, r0, r7 @ src += srcdiff
    add r1, r1, r8 @ dst += dstdiff
100:
@ line n+1
    neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip
    ldr r9, [sp, #16] @ r9 = dststride
    ldr r11, [sp, #20] @ bufptr1 = tmpline1
    ldr ip, [sp, #24] @ bufptr2 = tmpline2
    ldr lr, [sp, #28] @ bufptr3 = tmpline3
    add r4, r1, r9 @ dst2 = dst + dststride
    ldr r3, [sp, #32] @ counter = width
@ rotate the three temporary rows for the next iteration
    str r11, [sp, #28] @ tmpline3 = bufptr1
    str ip, [sp, #20] @ tmpline1 = bufptr2
    str lr, [sp, #24] @ tmpline2 = bufptr3
@ temporary line n
    neon_eagle2x_16_16_line middle, r11, ip, lr, r3, r1, r4, r5, 1, 0
    ldr r6, [sp, #4] @ r6 = height
    ldr r7, [sp, #8] @ r7 = srcdiff
    ldr r8, [sp, #12] @ r8 = dstdiff
    ldr r3, [sp, #32] @ counter = width
    subS r6, r6, #1 @ height--
    ldr lr, [sp, #28] @ bufptr3 = tmpline3
    add r0, r0, r7 @ src += srcdiff
    add r1, r1, r8 @ dst += dstdiff
    str r6, [sp, #4] @ height = r6
    bne 100b @ flags still come from the subS above
    ldr r9, [sp, #16] @ r9 = dststride
    ldr r11, [sp, #20] @ bufptr1 = tmpline1
    ldr ip, [sp, #24] @ bufptr2 = tmpline2
    add r4, r1, r9 @ dst2 = dst + dststride
@ last temporary line
    neon_eagle2x_16_16_line last, r11, ip, lr, r3, r1, r4, r5, 1, 0
@ Reload q4-q7 before sp is restored, so that neither load reads memory
@ below the stack pointer.
    add r6, sp, #36 @ r6 = sp + 36
    vld1.64 {d8-d11}, [r6:256] @ restore q4,q5
    add ip, r6, #32 @ ip = r6 + 32
    vld1.64 {d12-d15}, [ip:256] @ restore q6,q7
    ldr sp, [sp] @ sp = oldsp
    pop {r4-r11,lr}
    bx lr
@ end procedure neon_eagle2x_8_16

View File

@ -1,761 +0,0 @@
@@
@@ Copyright (C) 2012 Roman Pauer
@@
@@ Permission is hereby granted, free of charge, to any person obtaining a copy of
@@ this software and associated documentation files (the "Software"), to deal in
@@ the Software without restriction, including without limitation the rights to
@@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
@@ of the Software, and to permit persons to whom the Software is furnished to do
@@ so, subject to the following conditions:
@@
@@ The above copyright notice and this permission notice shall be included in all
@@ copies or substantial portions of the Software.
@@
@@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
@@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
@@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
@@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
@@ SOFTWARE.
@@
@ S T U --\ E1 E2
@ V C W --/ E3 E4
@ X Y Z
@ q0 = S1sl < S >
@ q1 = S2sl < V >
@ q2 = S3sl < X >
@ q3 = S1sr < U >
@ q4 = S2sr < W >
@ q5 = S3sr < Z >
@ q6 = E3
@ q7 = E4
@ q8 = S1
@ q9 = S2
@ q10 = S3
@ q11 = S1prev < T >
@ q12 = S2prev < C >
@ q13 = S3prev < Y >
@ q14 = E1
@ q15 = E2
@ __neon_eagle2x_8_8_line: Eagle 2x kernel for one row of 8-bit pixels.
@
@ src1, src2, src3 = row above, current row, row below (all advanced by the
@                    row length in pixels)
@ counter          = pixels in the row (destroyed)
@ dst1, dst2       = the two output rows (advanced by 2 * counter bytes)
@ reg1             = scratch core register
@ qT, qY           = registers that hold T and Y. q11 and q13 select the real
@                    neighbour rows. Passing q12 instead makes the current
@                    row stand in for a missing row above or below.
@ alsrc*, aldst*   = the same pointers, optionally carrying alignment
@                    qualifiers, used by the 16-pixel aligned part
@
@ The row is processed as an optional leading part of counter & 15 pixels
@ followed by whole blocks of 16 pixels. The leading part always loads and
@ stores one full 16-pixel vector and then advances by counter & 15 only, so
@ the first aligned block rewrites the overlap.
@ Uses local labels 1 to 4 and clobbers q0-q15, reg1 and the flags.
.macro __neon_eagle2x_8_8_line src1, src2, src3, counter, dst1, dst2, reg1, qT, qY, alsrc1, alsrc2, alsrc3, aldst1, aldst2
.ifeqs "\qT", "q11"
    vld1.8 {d23[7]}, [\src1] @ S1prev[15] = src[-srcstride]
.endif
    vld1.8 {d25[7]}, [\src2] @ S2prev[15] = src[0]
.ifeqs "\qY", "q13"
    vld1.8 {d27[7]}, [\src3] @ S3prev[15] = src[srcstride]
.endif
    andS \reg1, \counter, #15 @ reg1 = counter & 15
.ifnes "\qT", "q11"
    add \src1, \src1, \counter @ src1 += counter
.endif
.ifnes "\qY", "q13"
    add \src3, \src3, \counter @ src3 += counter
.endif
    beq 1f
@ first 1-15 pixels - align counter to 16 bytes
@ q0 = S1sl < S >
@ q2 = S3sl < X >
@ q7 = tmp2
@ q15 = tmp1
.ifeqs "\qT", "q11"
    vld1.8 {q8}, [\src1], \reg1 @ S1 = [src - srcstride]; src1 += counter & 15
.endif
    vld1.8 {q9}, [\src2], \reg1 @ S2 = [src ]; src2 += counter & 15
.ifeqs "\qY", "q13"
    vld1.8 {q10}, [\src3], \reg1 @ S3 = [src + srcstride]; src3 += counter & 15
.endif
.ifeqs "\qT", "q11"
    vext.8 q0, \qT, q8, #15 @ S1sl = S1prev[15] | (S1 << 8) < S >
    vmov \qT, q8 @ S1prev = S1 < T >
.endif
    vext.8 q1, q12, q9, #15 @ S2sl = S2prev[15] | (S2 << 8) < V >
    vmov q12, q9 @ S2prev = S2 < C >
.ifeqs "\qY", "q13"
    vext.8 q2, \qY, q10, #15 @ S3sl = S3prev[15] | (S3 << 8) < X >
    vmov \qY, q10 @ S3prev = S3 < Y >
.endif
.ifeqs "\qT", "q11"
    vext.8 q3, \qT, q8, #1 @ S1sr = (S1prev >> 8) | ... < U >
.endif
    vext.8 q4, q12, q9, #1 @ S2sr = (S2prev >> 8) | ... < W >
.ifeqs "\qY", "q13"
    vext.8 q5, \qY, q10, #1 @ S3sr = (S3prev >> 8) | ... < Z >
.else
    vmov q2, q1 @ S3sl = S2sl < X >
    vmov q5, q4 @ S3sr = S2sr < Z >
.endif
.ifnes "\qT", "q11"
    vmov q0, q1 @ S1sl = S2sl < S >
    vmov q3, q4 @ S1sr = S2sr < U >
.endif
    vceq.i8 q14, q0, \qT @ E1 = < S == T >
    vceq.i8 q15, q0, q1 @ tmp1 = < S == V >
    vceq.i8 q6, q2, \qY @ E3 = < X == Y >
    vceq.i8 q7, q2, q1 @ tmp2 = < X == V >
    vand q14, q14, q15 @ E1 = < S == T && S == V >
@ q0 = tmp3
@ q15 = E2
    vceq.i8 q15, q3, \qT @ E2 = < U == T >
    vceq.i8 q0, q3, q4 @ tmp3 = < U == W >
    vand q6, q6, q7 @ E3 = < X == Y && X == V >
@ q2 = tmp4
@ q7 = E4
    vceq.i8 q7, q5, \qY @ E4 = < Z == Y >
    vceq.i8 q2, q5, q4 @ tmp4 = < Z == W >
    vand q15, q15, q0 @ E2 = < U == T && U == W >
    vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >
    vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >
    vand q7, q7, q2 @ E4 = < Z == Y && Z == W >
    vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >
.ifeqs "\qT", "q11"
    sub \reg1, \src1, #1
.else
    sub \reg1, \src2, #1
.endif
    vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >
@ reload the left neighbours for the aligned part: the pixel just before the
@ new position of each source pointer
.ifeqs "\qT", "q11"
    vld1.8 {d23[7]}, [\reg1] @ S1prev[15] = src[counter & 15 - 1 - srcstride]
    sub \reg1, \src2, #1
.endif
    vld1.8 {d25[7]}, [\reg1] @ S2prev[15] = src[counter & 15 - 1]
.ifeqs "\qY", "q13"
    sub \reg1, \src3, #1
    vld1.8 {d27[7]}, [\reg1] @ S3prev[15] = src[counter & 15 - 1 + srcstride]
.endif
    ubfx \reg1, \counter, #0, #4 @ reg1 = counter & 15
    lsl \reg1, #1
    vst2.8 {q14-q15}, [\dst1], \reg1 @ [dst] = E1,E2; dst1 += reg1
    bic \counter, \counter, #15
    vst2.8 {q6-q7}, [\dst2], \reg1 @ [dst + dststride] = E3,E4; dst2 += reg1
@ counter is aligned to 16 bytes
1:
@ The pipelined loop at 2 plus the tail at 3 always handle at least two
@ blocks. Guard the short cases so that no block past the end of the row is
@ read or written: no block left -> done, exactly one block -> tail only.
    cmp \counter, #16
    blo 4f @ counter == 0: nothing left for the aligned part
.ifeqs "\qT", "q11"
    vld1.8 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 16
.endif
    vld1.8 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 16
.ifeqs "\qY", "q13"
    vld1.8 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 16
.endif
    beq 3f @ counter == 16 (flags from the cmp, the loads keep them)
@ inner loop (16 pixels per iteration)
2:
@ q0 = S1sl < S >
@ q2 = S3sl < X >
@ q7 = tmp2
@ q15 = tmp1
.ifeqs "\qT", "q11"
    vext.8 q0, \qT, q8, #15 @ S1sl = S1prev[15] | (S1 << 8) < S >
    vmov \qT, q8 @ S1prev = S1 < T >
.endif
    vext.8 q1, q12, q9, #15 @ S2sl = S2prev[15] | (S2 << 8) < V >
    vmov q12, q9 @ S2prev = S2 < C >
.ifeqs "\qY", "q13"
    vext.8 q2, \qY, q10, #15 @ S3sl = S3prev[15] | (S3 << 8) < X >
    vmov \qY, q10 @ S3prev = S3 < Y >
.endif
@ the next block is loaded here because its first pixel is the right
@ neighbour of the last pixel of the current block
.ifeqs "\qT", "q11"
    vld1.8 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 16
    vext.8 q3, \qT, q8, #1 @ S1sr = (S1prev >> 8) | S1[0] < U >
.endif
    vld1.8 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 16
    vext.8 q4, q12, q9, #1 @ S2sr = (S2prev >> 8) | S2[0] < W >
.ifeqs "\qY", "q13"
    vld1.8 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 16
    vext.8 q5, \qY, q10, #1 @ S3sr = (S3prev >> 8) | S3[0] < Z >
.else
    vmov q2, q1 @ S3sl = S2sl < X >
    vmov q5, q4 @ S3sr = S2sr < Z >
.endif
.ifnes "\qT", "q11"
    vmov q0, q1 @ S1sl = S2sl < S >
    vmov q3, q4 @ S1sr = S2sr < U >
.endif
    sub \counter, \counter, #16 @ counter -= 16
    vceq.i8 q14, q0, \qT @ E1 = < S == T >
    vceq.i8 q15, q0, q1 @ tmp1 = < S == V >
    vceq.i8 q6, q2, \qY @ E3 = < X == Y >
    vceq.i8 q7, q2, q1 @ tmp2 = < X == V >
    vand q14, q14, q15 @ E1 = < S == T && S == V >
@ q0 = tmp3
@ q15 = E2
    vceq.i8 q15, q3, \qT @ E2 = < U == T >
    vceq.i8 q0, q3, q4 @ tmp3 = < U == W >
    vand q6, q6, q7 @ E3 = < X == Y && X == V >
@ q2 = tmp4
@ q7 = E4
    vceq.i8 q7, q5, \qY @ E4 = < Z == Y >
    vceq.i8 q2, q5, q4 @ tmp4 = < Z == W >
    vand q15, q15, q0 @ E2 = < U == T && U == W >
    vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >
    vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >
    vand q7, q7, q2 @ E4 = < Z == Y && Z == W >
    vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >
    vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >
    vst2.8 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*16
    cmp \counter, #16
    vst2.8 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*16
    bhi 2b
@ last 16 pixels
3:
@ q0 = S1sl < S >
@ q2 = S3sl < X >
@ q7 = tmp2
@ q15 = tmp1
.ifeqs "\qT", "q11"
    vext.8 q0, \qT, q8, #15 @ S1sl = S1prev[15] | (S1 << 8) < S >
    vmov \qT, q8 @ S1prev = S1 < T >
.endif
    vext.8 q1, q12, q9, #15 @ S2sl = S2prev[15] | (S2 << 8) < V >
    vmov q12, q9 @ S2prev = S2 < C >
.ifeqs "\qY", "q13"
    vext.8 q2, \qY, q10, #15 @ S3sl = S3prev[15] | (S3 << 8) < X >
    vmov \qY, q10 @ S3prev = S3 < Y >
.endif
@ there is no block to the right any more: replicate the last pixel of the
@ row so that it acts as its own right neighbour
.ifeqs "\qT", "q11"
    vshr.u64 d16, d17, #(64-8) @ S1[0] = S1[15] | ...
.endif
    vshr.u64 d18, d19, #(64-8) @ S2[0] = S2[15] | ...
.ifeqs "\qY", "q13"
    vshr.u64 d20, d21, #(64-8) @ S3[0] = S3[15] | ...
.endif
.ifeqs "\qT", "q11"
    vext.8 q3, \qT, q8, #1 @ S1sr = (S1prev >> 8) | S1[0] < U >
.endif
    vext.8 q4, q12, q9, #1 @ S2sr = (S2prev >> 8) | S2[0] < W >
.ifeqs "\qY", "q13"
    vext.8 q5, \qY, q10, #1 @ S3sr = (S3prev >> 8) | S3[0] < Z >
.else
    vmov q2, q1 @ S3sl = S2sl < X >
    vmov q5, q4 @ S3sr = S2sr < Z >
.endif
.ifnes "\qT", "q11"
    vmov q0, q1 @ S1sl = S2sl < S >
    vmov q3, q4 @ S1sr = S2sr < U >
.endif
    vceq.i8 q14, q0, \qT @ E1 = < S == T >
    vceq.i8 q15, q0, q1 @ tmp1 = < S == V >
    vceq.i8 q6, q2, \qY @ E3 = < X == Y >
    vceq.i8 q7, q2, q1 @ tmp2 = < X == V >
    vand q14, q14, q15 @ E1 = < S == T && S == V >
@ q0 = tmp3
@ q15 = E2
    vceq.i8 q15, q3, \qT @ E2 = < U == T >
    vceq.i8 q0, q3, q4 @ tmp3 = < U == W >
    vand q6, q6, q7 @ E3 = < X == Y && X == V >
@ q2 = tmp4
@ q7 = E4
    vceq.i8 q7, q5, \qY @ E4 = < Z == Y >
    vceq.i8 q2, q5, q4 @ tmp4 = < Z == W >
    vand q15, q15, q0 @ E2 = < U == T && U == W >
    vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >
    vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >
    vand q7, q7, q2 @ E4 = < Z == Y && Z == W >
    vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >
    vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >
    vst2.8 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*16
    vst2.8 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*16
4:
.endm
@ Row-position variants of the 8-bit kernel. They differ only in the
@ registers passed for T (row above) and Y (row below):
@   first  : q12, q13 - no row above, the current row stands in for it
@   middle : q11, q13 - both neighbour rows are real
@   last   : q11, q12 - no row below, the current row stands in for it
.macro _neon_eagle2x_8_8_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
    __neon_eagle2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm

.macro _neon_eagle2x_8_8_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
    __neon_eagle2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm

.macro _neon_eagle2x_8_8_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
    __neon_eagle2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm

@ Public entry macro for one row of 8-bit pixels.
@ part       = first, middle or last (selects the variant above)
@ srcalign16 = non-zero: the aligned loads carry a :128 qualifier
@ dstalign32 = non-zero: the aligned stores carry a :256 qualifier
@ With a zero flag the corresponding pointers are passed without qualifier.
.macro neon_eagle2x_8_8_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32
.if \srcalign16
  .if \dstalign32
    @ sources and destinations both qualified
    _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1:256, \dst2:256
  .else
    @ only the sources qualified
    _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1, \dst2
  .endif
.else
  .if \dstalign32
    @ only the destinations qualified
    _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1:256, \dst2:256
  .else
    @ nothing qualified
    _neon_eagle2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2
  .endif
.endif
.endm
@ __neon_eagle2x_16_16_line: Eagle 2x kernel for one row of 16-bit pixels.
@
@ src1, src2, src3 = row above, current row, row below (all advanced by the
@                    row length)
@ counter          = pixels in the row (destroyed)
@ dst1, dst2       = the two output rows (advanced by 4 * counter bytes)
@ reg1             = scratch core register
@ qT, qY           = registers that hold T and Y. q11 and q13 select the real
@                    neighbour rows. Passing q12 instead makes the current
@                    row stand in for a missing row above or below.
@ alsrc*, aldst*   = the same pointers, optionally carrying alignment
@                    qualifiers, used by the 8-pixel aligned part
@
@ The row is processed as an optional leading part of counter & 7 pixels
@ followed by whole blocks of 8 pixels. The leading part always loads and
@ stores one full 8-pixel vector and then advances by counter & 7 only, so
@ the first aligned block rewrites the overlap.
@ Uses local labels 1 to 4 and clobbers q0-q15, reg1 and the flags.
.macro __neon_eagle2x_16_16_line src1, src2, src3, counter, dst1, dst2, reg1, qT, qY, alsrc1, alsrc2, alsrc3, aldst1, aldst2
.ifeqs "\qT", "q11"
    vld1.16 {d23[3]}, [\src1] @ S1prev[7] = src[-srcstride]
.endif
    vld1.16 {d25[3]}, [\src2] @ S2prev[7] = src[0]
.ifeqs "\qY", "q13"
    vld1.16 {d27[3]}, [\src3] @ S3prev[7] = src[srcstride]
.endif
    andS \reg1, \counter, #7 @ reg1 = counter & 7
.ifnes "\qT", "q11"
    add \src1, \src1, \counter, lsl #1 @ src1 += 2 * counter
.endif
.ifnes "\qY", "q13"
    add \src3, \src3, \counter, lsl #1 @ src3 += 2 * counter
.endif
    beq 1f
@ first 1-7 pixels - align counter to 16 bytes
@ q0 = S1sl < S >
@ q2 = S3sl < X >
@ q7 = tmp2
@ q15 = tmp1
.ifeqs "\qT", "q11"
    vld1.16 {q8}, [\src1] @ S1 = [src - srcstride]
    add \src1, \src1, \reg1, lsl #1 @ src1 += 2 * (counter & 7)
.endif
    vld1.16 {q9}, [\src2] @ S2 = [src ]
    add \src2, \src2, \reg1, lsl #1 @ src2 += 2 * (counter & 7)
.ifeqs "\qY", "q13"
    vld1.16 {q10}, [\src3] @ S3 = [src + srcstride]
    add \src3, \src3, \reg1, lsl #1 @ src3 += 2 * (counter & 7)
.endif
.ifeqs "\qT", "q11"
    vext.8 q0, \qT, q8, #14 @ S1sl = S1prev[7] | (S1 << 16) < S >
    vmov \qT, q8 @ S1prev = S1 < T >
.endif
    vext.8 q1, q12, q9, #14 @ S2sl = S2prev[7] | (S2 << 16) < V >
    vmov q12, q9 @ S2prev = S2 < C >
.ifeqs "\qY", "q13"
    vext.8 q2, \qY, q10, #14 @ S3sl = S3prev[7] | (S3 << 16) < X >
    vmov \qY, q10 @ S3prev = S3 < Y >
.endif
.ifeqs "\qT", "q11"
    vext.8 q3, \qT, q8, #2 @ S1sr = (S1prev >> 16) | ... < U >
.endif
    vext.8 q4, q12, q9, #2 @ S2sr = (S2prev >> 16) | ... < W >
.ifeqs "\qY", "q13"
    vext.8 q5, \qY, q10, #2 @ S3sr = (S3prev >> 16) | ... < Z >
.else
    vmov q2, q1 @ S3sl = S2sl < X >
    vmov q5, q4 @ S3sr = S2sr < Z >
.endif
.ifnes "\qT", "q11"
    vmov q0, q1 @ S1sl = S2sl < S >
    vmov q3, q4 @ S1sr = S2sr < U >
.endif
    vceq.i16 q14, q0, \qT @ E1 = < S == T >
    vceq.i16 q15, q0, q1 @ tmp1 = < S == V >
    vceq.i16 q6, q2, \qY @ E3 = < X == Y >
    vceq.i16 q7, q2, q1 @ tmp2 = < X == V >
    vand q14, q14, q15 @ E1 = < S == T && S == V >
@ q0 = tmp3
@ q15 = E2
    vceq.i16 q15, q3, \qT @ E2 = < U == T >
    vceq.i16 q0, q3, q4 @ tmp3 = < U == W >
    vand q6, q6, q7 @ E3 = < X == Y && X == V >
@ q2 = tmp4
@ q7 = E4
    vceq.i16 q7, q5, \qY @ E4 = < Z == Y >
    vceq.i16 q2, q5, q4 @ tmp4 = < Z == W >
    vand q15, q15, q0 @ E2 = < U == T && U == W >
    vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >
    vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >
    vand q7, q7, q2 @ E4 = < Z == Y && Z == W >
    vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >
.ifeqs "\qT", "q11"
    sub \reg1, \src1, #2
.else
    sub \reg1, \src2, #2
.endif
    vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >
@ reload the left neighbours for the aligned part: the pixel just before the
@ new position of each source pointer
.ifeqs "\qT", "q11"
    vld1.16 {d23[3]}, [\reg1] @ S1prev[7] = src[2 * (counter & 7) - 2 - srcstride]
    sub \reg1, \src2, #2
.endif
    vld1.16 {d25[3]}, [\reg1] @ S2prev[7] = src[2 * (counter & 7) - 2]
.ifeqs "\qY", "q13"
    sub \reg1, \src3, #2
    vld1.16 {d27[3]}, [\reg1] @ S3prev[7] = src[2 * (counter & 7) - 2 + srcstride]
.endif
    ubfx \reg1, \counter, #0, #3 @ reg1 = counter & 7
    lsl \reg1, #2
    vst2.16 {q14-q15}, [\dst1], \reg1 @ [dst] = E1,E2; dst1 += reg1
    bic \counter, \counter, #7
    vst2.16 {q6-q7}, [\dst2], \reg1 @ [dst + dststride] = E3,E4; dst2 += reg1
@ counter is aligned to 16 bytes
1:
@ The pipelined loop at 2 plus the tail at 3 always handle at least two
@ blocks. Guard the short cases so that no block past the end of the row is
@ read or written: no block left -> done, exactly one block -> tail only.
    cmp \counter, #8
    blo 4f @ counter == 0: nothing left for the aligned part
.ifeqs "\qT", "q11"
    vld1.16 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 2*8
.endif
    vld1.16 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 2*8
.ifeqs "\qY", "q13"
    vld1.16 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 2*8
.endif
    beq 3f @ counter == 8 (flags from the cmp, the loads keep them)
@ inner loop (8 pixels per iteration)
2:
@ q0 = S1sl < S >
@ q2 = S3sl < X >
@ q7 = tmp2
@ q15 = tmp1
.ifeqs "\qT", "q11"
    vext.8 q0, \qT, q8, #14 @ S1sl = S1prev[7] | (S1 << 16) < S >
    vmov \qT, q8 @ S1prev = S1 < T >
.endif
    vext.8 q1, q12, q9, #14 @ S2sl = S2prev[7] | (S2 << 16) < V >
    vmov q12, q9 @ S2prev = S2 < C >
.ifeqs "\qY", "q13"
    vext.8 q2, \qY, q10, #14 @ S3sl = S3prev[7] | (S3 << 16) < X >
    vmov \qY, q10 @ S3prev = S3 < Y >
.endif
@ the next block is loaded here because its first pixel is the right
@ neighbour of the last pixel of the current block
.ifeqs "\qT", "q11"
    vld1.16 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 2*8
    vext.8 q3, \qT, q8, #2 @ S1sr = (S1prev >> 16) | S1[0] < U >
.endif
    vld1.16 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 2*8
    vext.8 q4, q12, q9, #2 @ S2sr = (S2prev >> 16) | S2[0] < W >
.ifeqs "\qY", "q13"
    vld1.16 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 2*8
    vext.8 q5, \qY, q10, #2 @ S3sr = (S3prev >> 16) | S3[0] < Z >
.else
    vmov q2, q1 @ S3sl = S2sl < X >
    vmov q5, q4 @ S3sr = S2sr < Z >
.endif
.ifnes "\qT", "q11"
    vmov q0, q1 @ S1sl = S2sl < S >
    vmov q3, q4 @ S1sr = S2sr < U >
.endif
    sub \counter, \counter, #8 @ counter -= 8
    vceq.i16 q14, q0, \qT @ E1 = < S == T >
    vceq.i16 q15, q0, q1 @ tmp1 = < S == V >
    vceq.i16 q6, q2, \qY @ E3 = < X == Y >
    vceq.i16 q7, q2, q1 @ tmp2 = < X == V >
    vand q14, q14, q15 @ E1 = < S == T && S == V >
@ q0 = tmp3
@ q15 = E2
    vceq.i16 q15, q3, \qT @ E2 = < U == T >
    vceq.i16 q0, q3, q4 @ tmp3 = < U == W >
    vand q6, q6, q7 @ E3 = < X == Y && X == V >
@ q2 = tmp4
@ q7 = E4
    vceq.i16 q7, q5, \qY @ E4 = < Z == Y >
    vceq.i16 q2, q5, q4 @ tmp4 = < Z == W >
    vand q15, q15, q0 @ E2 = < U == T && U == W >
    vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >
    vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >
    vand q7, q7, q2 @ E4 = < Z == Y && Z == W >
    vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >
    vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >
    vst2.16 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*2*8
    cmp \counter, #8
    vst2.16 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*2*8
    bhi 2b
@ last 8 pixels
3:
@ q0 = S1sl < S >
@ q2 = S3sl < X >
@ q7 = tmp2
@ q15 = tmp1
.ifeqs "\qT", "q11"
    vext.8 q0, \qT, q8, #14 @ S1sl = S1prev[7] | (S1 << 16) < S >
    vmov \qT, q8 @ S1prev = S1 < T >
.endif
    vext.8 q1, q12, q9, #14 @ S2sl = S2prev[7] | (S2 << 16) < V >
    vmov q12, q9 @ S2prev = S2 < C >
.ifeqs "\qY", "q13"
    vext.8 q2, \qY, q10, #14 @ S3sl = S3prev[7] | (S3 << 16) < X >
    vmov \qY, q10 @ S3prev = S3 < Y >
.endif
@ there is no block to the right any more: replicate the last pixel of the
@ row so that it acts as its own right neighbour
.ifeqs "\qT", "q11"
    vshr.u64 d16, d17, #(64-16) @ S1[0] = S1[7] | ...
.endif
    vshr.u64 d18, d19, #(64-16) @ S2[0] = S2[7] | ...
.ifeqs "\qY", "q13"
    vshr.u64 d20, d21, #(64-16) @ S3[0] = S3[7] | ...
.endif
.ifeqs "\qT", "q11"
    vext.8 q3, \qT, q8, #2 @ S1sr = (S1prev >> 16) | S1[0] < U >
.endif
    vext.8 q4, q12, q9, #2 @ S2sr = (S2prev >> 16) | S2[0] < W >
.ifeqs "\qY", "q13"
    vext.8 q5, \qY, q10, #2 @ S3sr = (S3prev >> 16) | S3[0] < Z >
.else
    vmov q2, q1 @ S3sl = S2sl < X >
    vmov q5, q4 @ S3sr = S2sr < Z >
.endif
.ifnes "\qT", "q11"
    vmov q0, q1 @ S1sl = S2sl < S >
    vmov q3, q4 @ S1sr = S2sr < U >
.endif
    vceq.i16 q14, q0, \qT @ E1 = < S == T >
    vceq.i16 q15, q0, q1 @ tmp1 = < S == V >
    vceq.i16 q6, q2, \qY @ E3 = < X == Y >
    vceq.i16 q7, q2, q1 @ tmp2 = < X == V >
    vand q14, q14, q15 @ E1 = < S == T && S == V >
@ q0 = tmp3
@ q15 = E2
    vceq.i16 q15, q3, \qT @ E2 = < U == T >
    vceq.i16 q0, q3, q4 @ tmp3 = < U == W >
    vand q6, q6, q7 @ E3 = < X == Y && X == V >
@ q2 = tmp4
@ q7 = E4
    vceq.i16 q7, q5, \qY @ E4 = < Z == Y >
    vceq.i16 q2, q5, q4 @ tmp4 = < Z == W >
    vand q15, q15, q0 @ E2 = < U == T && U == W >
    vbsl q14, \qT, q12 @ E1 = < (S == T && S == V) ? T : C >
    vbsl q15, \qT, q12 @ E2 = < (U == T && U == W) ? T : C >
    vand q7, q7, q2 @ E4 = < Z == Y && Z == W >
    vbsl q6, \qY, q12 @ E3 = < (X == Y && X == V) ? Y : C >
    vbsl q7, \qY, q12 @ E4 = < (Z == Y && Z == W) ? Y : C >
    vst2.16 {q14-q15}, [\aldst1]! @ [dst] = E1,E2; dst1 += 2*2*8
    vst2.16 {q6-q7}, [\aldst2]! @ [dst + dststride] = E3,E4; dst2 += 2*2*8
4:
.endm
@ Row-position variants of the 16-bit kernel. They differ only in the
@ registers passed for T (row above) and Y (row below):
@   first  : q12, q13 - no row above, the current row stands in for it
@   middle : q11, q13 - both neighbour rows are real
@   last   : q11, q12 - no row below, the current row stands in for it
.macro _neon_eagle2x_16_16_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
    __neon_eagle2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm

.macro _neon_eagle2x_16_16_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
    __neon_eagle2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm

.macro _neon_eagle2x_16_16_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
    __neon_eagle2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm

@ Public entry macro for one row of 16-bit pixels.
@ part       = first, middle or last (selects the variant above)
@ srcalign16 = non-zero: the aligned loads carry a :128 qualifier
@ dstalign32 = non-zero: the aligned stores carry a :256 qualifier
@ With a zero flag the corresponding pointers are passed without qualifier.
.macro neon_eagle2x_16_16_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32
.if \srcalign16
  .if \dstalign32
    @ sources and destinations both qualified
    _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1:256, \dst2:256
  .else
    @ only the sources qualified
    _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1, \dst2
  .endif
.else
  .if \dstalign32
    @ only the destinations qualified
    _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1:256, \dst2:256
  .else
    @ nothing qualified
    _neon_eagle2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2
  .endif
.endif
.endm

View File

@ -1,44 +0,0 @@
/**
*
* Copyright (C) 2012 Roman Pauer
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished to do
* so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#ifndef _NEON_EAGLE2X_H_INCLUDED_
#define _NEON_EAGLE2X_H_INCLUDED_

#include <inttypes.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Eagle 2x scalers implemented in ARM NEON assembly.
 *
 * Common parameters: width and height are given in source pixels,
 * srcstride and dststride in bytes.
 */

/* 8-bit source pixels to 8-bit destination pixels. */
void neon_eagle2x_8_8(const uint8_t *src,
                      uint8_t *dst,
                      unsigned int width,
                      unsigned int srcstride,
                      unsigned int dststride,
                      unsigned int height);

/* 16-bit source pixels to 16-bit destination pixels. */
void neon_eagle2x_16_16(const uint16_t *src,
                        uint16_t *dst,
                        unsigned int width,
                        unsigned int srcstride,
                        unsigned int dststride,
                        unsigned int height);

/* 8-bit paletted source to 16-bit destination, looked up through palette. */
void neon_eagle2x_8_16(const uint8_t *src,
                       uint16_t *dst,
                       const uint32_t *palette,
                       unsigned int width,
                       unsigned int srcstride,
                       unsigned int dststride,
                       unsigned int height);

#ifdef __cplusplus
}
#endif

#endif /* _NEON_EAGLE2X_H_INCLUDED_ */

View File

@ -1,665 +0,0 @@
@@
@@ Copyright (C) 2012 Roman Pauer
@@
@@ Permission is hereby granted, free of charge, to any person obtaining a copy of
@@ this software and associated documentation files (the "Software"), to deal in
@@ the Software without restriction, including without limitation the rights to
@@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
@@ of the Software, and to permit persons to whom the Software is furnished to do
@@ so, subject to the following conditions:
@@
@@ The above copyright notice and this permission notice shall be included in all
@@ copies or substantial portions of the Software.
@@
@@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
@@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
@@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
@@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
@@ SOFTWARE.
@@
@ _neon_normalxx_8_16_line_middle: convert 16 paletted 8-bit pixels into
@ 16-bit pixels held in NEON registers.
@
@ src       = source pointer, read as four 32-bit words and advanced by 16
@ pal       = table of 32-bit entries, indexed by pixel value
@ counter   = remaining pixels, decremented by 16
@ reg1-reg9 = scratch core registers
@ dA, dB    = D registers that receive pixels 8-11 and 12-15
@ dst and dststride are accepted but not referenced inside this macro.
@
@ Pixels 0-3 and 4-7 always land in d16 and d17. Each palette entry is loaded
@ as a full word, but bfi keeps only its low 16 bits when two neighbouring
@ pixels are packed into one register.
@ The macro ends with cmp counter, #16, so the flags are left for the caller
@ to branch on.
.macro _neon_normalxx_8_16_line_middle src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride, dA, dB
@ fetch all 16 source bytes up front
ldr \reg1, [\src] @ reg1 = src[0-3]
ldr \reg2, [\src, #4] @ reg2 = src[4-7]
ldr \reg3, [\src, #8] @ reg3 = src[8-11]
ldr \reg4, [\src, #12] @ reg4 = src[12-15]
@ pixels 0-5: extract each byte, then replace it with its palette entry
ubfx \reg5, \reg1, #0, #8 @ reg5 = src[0]
ldr \reg5, [\pal, \reg5, lsl #2] @ reg5 = pal[src[0]]
ubfx \reg6, \reg1, #8, #8 @ reg6 = src[1]
ldr \reg6, [\pal, \reg6, lsl #2] @ reg6 = pal[src[1]]
ubfx \reg7, \reg1, #16, #8 @ reg7 = src[2]
ldr \reg7, [\pal, \reg7, lsl #2] @ reg7 = pal[src[2]]
lsr \reg1, \reg1, #24 @ reg1 = src[3]
ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]
ubfx \reg8, \reg2, #0, #8 @ reg8 = src[4]
ldr \reg8, [\pal, \reg8, lsl #2] @ reg8 = pal[src[4]]
ubfx \reg9, \reg2, #8, #8 @ reg9 = src[5]
ldr \reg9, [\pal, \reg9, lsl #2] @ reg9 = pal[src[5]]
@ pack pairs of 16-bit pixels and move them to NEON, interleaved with the
@ lookups for the following pixels
bfi \reg5, \reg6, #16, #16 @ reg5 = pal[src[0]] | pal[src[1]] << 16
bfi \reg7, \reg1, #16, #16 @ reg7 = pal[src[2]] | pal[src[3]] << 16
ubfx \reg6, \reg2, #16, #8 @ reg6 = src[6]
vmov d16, \reg5, \reg7 @ d16 = pal[src[0-3]]
lsr \reg2, \reg2, #24 @ reg2 = src[7]
ldr \reg6, [\pal, \reg6, lsl #2] @ reg6 = pal[src[6]]
bfi \reg8, \reg9, #16, #16 @ reg8 = pal[src[4]] | pal[src[5]] << 16
ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[7]]
@ reg1, reg5 and reg7 were consumed by the vmov to d16 and are free again
ubfx \reg1, \reg3, #0, #8 @ reg1 = src[8]
ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[8]]
ubfx \reg5, \reg3, #8, #8 @ reg5 = src[9]
ldr \reg5, [\pal, \reg5, lsl #2] @ reg5 = pal[src[9]]
ubfx \reg7, \reg3, #16, #8 @ reg7 = src[10]
ldr \reg7, [\pal, \reg7, lsl #2] @ reg7 = pal[src[10]]
bfi \reg6, \reg2, #16, #16 @ reg6 = pal[src[6]] | pal[src[7]] << 16
vmov d17, \reg8, \reg6 @ d17 = pal[src[4-7]]
lsr \reg3, \reg3, #24 @ reg3 = src[11]
ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[11]]
@ reg2, reg6 and reg8 were consumed by the vmov to d17 and are free again
ubfx \reg2, \reg4, #0, #8 @ reg2 = src[12]
ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[12]]
ubfx \reg6, \reg4, #8, #8 @ reg6 = src[13]
ldr \reg6, [\pal, \reg6, lsl #2] @ reg6 = pal[src[13]]
ubfx \reg8, \reg4, #16, #8 @ reg8 = src[14]
ldr \reg8, [\pal, \reg8, lsl #2] @ reg8 = pal[src[14]]
lsr \reg4, \reg4, #24 @ reg4 = src[15]
ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[15]]
bfi \reg1, \reg5, #16, #16 @ reg1 = pal[src[8]] | pal[src[9]] << 16
bfi \reg7, \reg3, #16, #16 @ reg7 = pal[src[10]] | pal[src[11]] << 16
bfi \reg2, \reg6, #16, #16 @ reg2 = pal[src[12]] | pal[src[13]] << 16
vmov \dA, \reg1, \reg7 @ dA = pal[src[8-11]]
sub \counter, \counter, #16 @ counter -= 16
bfi \reg8, \reg4, #16, #16 @ reg8 = pal[src[14]] | pal[src[15]] << 16
add \src, \src, #16 @ src += 16
vmov \dB, \reg2, \reg8 @ dB = pal[src[12-15]]
@ leave the flags for the caller: at least 16 pixels remaining or not
cmp \counter, #16
.endm
.macro neon_normal1x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9
@ Expand one line of 8-bit palette indices into 16-bit pixels at 1x scale.
@ src = pointer to the 8-bit source pixels (advanced past the line)
@ dst = pointer to the 16-bit destination pixels (advanced past the line)
@ pal = palette base; entries are indexed with lsl 2 (32 bits apart) and
@ only the low halfword of each loaded entry is stored
@ counter = number of pixels to convert (counted down)
@ reg1-reg9 = scratch registers, clobbered
@ Also clobbers the condition flags and d16-d19.
@ Local labels used here: 10, 20, 30, 40.
@ NOTE(review): the opening lines of _neon_normalxx_8_16_line_middle are
@ outside this view, so how it behaves when fewer than 16 pixels remain
@ on entry cannot be stated here - confirm against the helper.
@ align src to 4 bytes
andS \reg5, \src, #3 @ reg5 = src & 3
beq 10f
@ first 1-3 pixels
@ One word is loaded from the not yet aligned src and only the bytes that
@ precede the next 4 byte boundary are used.
@ NOTE(review): relies on unaligned word loads being permitted - confirm.
ldr \reg1, [\src] @ reg1 = src[0-3]
rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)
add \src, \src, \reg5 @ src += reg5
sub \counter, \counter, \reg5 @ counter -= reg5
subS \reg5, \reg5, #1 @ reg5--; Z set when only one pixel is needed
ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]
ubfxne \reg3, \reg1, #8, #8 @ reg3 = src[1]
ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[reg2]
ldrne \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[reg3]
strh \reg2, [\dst] @ dst[0] = reg2
strneh \reg3, [\dst, #2]! @ dst[1] = reg3; dst++
subneS \reg5, \reg5, #1 @ reg5--; Z set when only two pixels are needed
ubfxne \reg4, \reg1, #16, #8 @ reg4 = src[2]
add \dst, \dst, #2 @ dst++
ldrne \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[reg4]
strneh \reg4, [\dst], #2 @ dst[2] = reg4; dst++
@ middle pixels (16 per iteration)
10:
@ The helper leaves pal[src[0-15]] in d16-d19 and ends with cmp counter, #16;
@ the vst1 below does not touch the flags, so bhs tests that compare.
_neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, , d18, d19
vst1.16 {d16-d19}, [\dst]! @ dst[0-15] = d16-d19; dst += 2*16
bhs 10b
@ last 0-15 bytes
cmp \counter, #0
beq 40f
cmp \counter, #4
blo 30f
@ 4-12 pixels (4 per iteration)
20:
ldr \reg1, [\src] @ reg1 = src[0-3]
sub \counter, \counter, #4 @ counter -= 4
add \src, \src, #4 @ src += 4
add \dst, \dst, #(2*4) @ dst += 4
ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]
cmp \counter, #4 @ flags for the bhs at the end of this iteration
ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]
ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]
ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]
ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]
ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]
lsr \reg1, \reg1, #24 @ reg1 = src[3]
ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]
strh \reg2, [\dst, #-8] @ dst[0] = reg2
strh \reg3, [\dst, #-6] @ dst[1] = reg3
strh \reg4, [\dst, #-4] @ dst[2] = reg4
strh \reg1, [\dst, #-2] @ dst[3] = reg1
bhs 20b
cmp \counter, #0
beq 40f
@ last 1-3 pixels
30:
ldrb \reg1, [\src] @ reg1 = src[0]
subS \counter, \counter, #1 @ counter--; Z set when this was the last pixel
ldrneb \reg2, [\src, #1]! @ reg2 = src[1]; src++
add \src, \src, #1 @ src++
ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]
ldrne \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[1]]
strh \reg1, [\dst] @ dst[0] = reg1
strneh \reg2, [\dst, #2]! @ dst[1] = reg2; dst++
subneS \counter, \counter, #1 @ counter--; Z set when two pixels were left
ldrneb \reg3, [\src], #1 @ reg3 = src[2]; src++
add \dst, \dst, #2 @ dst++
ldrne \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[2]]
strneh \reg3, [\dst], #2 @ dst[2] = reg3; dst++
40:
.endm
.macro neon_normal2x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride
@ Expand one line of 8-bit palette indices into 16-bit pixels at 2x scale:
@ every source pixel becomes a 2x2 block spread over two output lines.
@ src = pointer to the 8-bit source pixels (advanced past the line)
@ dst = pointer to the first output line (advanced by 2 pixels per
@ source pixel); the second line is addressed as dst + dststride
@ pal = palette base, indexed with lsl 2; low halfword of the entry is used
@ counter = number of source pixels (counted down)
@ reg1-reg9 = scratch registers, clobbered
@ dststride = register holding the destination stride in bytes
@ Also clobbers the condition flags and q8-q11.
@ Local labels used here: 1, 10, 20, 30, 40.
@ NOTE(review): the opening lines of _neon_normalxx_8_16_line_middle are
@ outside this view - confirm its behaviour for fewer than 16 pixels.
@ align src to 4 bytes
andS \reg5, \src, #3 @ reg5 = src & 3
beq 10f
@ first 1-3 pixels
rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)
1:
ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++
add \reg2, \dst, \dststride @ reg2 = dst1 (second output line)
add \dst, \dst, #4 @ dst += 2*2
sub \counter, \counter, #1 @ counter--
ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]
subS \reg5, \reg5, #1 @ reg5--; the stores below leave the flags alone
strh \reg1, [\dst, #-4] @ dst[0] = reg1
strh \reg1, [\dst, #-2] @ dst[1] = reg1
strh \reg1, [\reg2] @ dst1[0] = reg1
strh \reg1, [\reg2, #2] @ dst1[1] = reg1
bne 1b
@ middle pixels (16 per iteration)
10:
@ The helper leaves pal[src[0-15]] in q8 and q10 and ends with
@ cmp counter, #16; nothing below changes the flags before bhs.
_neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d20, d21
vmov q9, q8 @ duplicate so that vst2 writes each pixel twice
add \reg1, \dst, \dststride @ reg1 = dst + dststride
vmov q11, q10
vst2.16 {q8,q9}, [\dst]! @ dst[0-7] = q8-q9; dst += 2*2*8
vst2.16 {q10,q11}, [\dst]! @ dst[8-15] = q10-q11; dst += 2*2*8
vst2.16 {q8,q9}, [\reg1]! @ dst1[0-7] = q8-q9; dst1 += 2*2*8
vst2.16 {q10,q11}, [\reg1]! @ dst1[8-15] = q10-q11; dst1 += 2*2*8
bhs 10b
@ last 0-15 bytes
cmp \counter, #0
beq 40f
cmp \counter, #4
blo 30f
@ 4-12 pixels (4 per iteration)
20:
ldr \reg1, [\src] @ reg1 = src[0-3]
sub \counter, \counter, #4 @ counter -= 4
add \src, \src, #4 @ src += 4
ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]
cmp \counter, #4 @ flags for the bhs at the end of this iteration
ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]
ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]
ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]
ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]
ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]
lsr \reg1, \reg1, #24 @ reg1 = src[3]
ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]
add \reg5, \dst, \dststride @ reg5 = dst1 (second output line)
bfi \reg2, \reg3, #16, #16 @ reg2 = reg2 | reg3 << 16
vmov.32 d16[0], \reg2
bfi \reg4, \reg1, #16, #16 @ reg4 = reg4 | reg1 << 16
vmov.32 d16[1], \reg4
vmov d17, d16
vst2.16 {d16,d17}, [\dst]! @ dst[0-7] = d16-d17; dst += 2*2*4
vst2.16 {d16,d17}, [\reg5] @ dst1[0-7] = d16-d17
bhs 20b
cmp \counter, #0
beq 40f
@ last 1-3 pixels
30:
ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++
add \reg2, \dst, \dststride @ reg2 = dst1 (second output line)
add \dst, \dst, #4 @ dst += 2*2
ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]
subS \counter, \counter, #1 @ counter--
strh \reg1, [\dst, #-4] @ dst[0] = reg1
strh \reg1, [\dst, #-2] @ dst[1] = reg1
strh \reg1, [\reg2] @ dst1[0] = reg1
strh \reg1, [\reg2, #2] @ dst1[1] = reg1
bne 30b
40:
.endm
.macro neon_normal3x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride
@ Expand one line of 8-bit palette indices into 16-bit pixels at 3x scale:
@ every source pixel becomes a 3x3 block spread over three output lines.
@ src = pointer to the 8-bit source pixels (advanced past the line)
@ dst = pointer to the first output line (advanced by 3 pixels per
@ source pixel); the other lines are dst + dststride and
@ dst + 2 * dststride
@ pal = palette base, indexed with lsl 2; low halfword of the entry is used
@ counter = number of source pixels (counted down)
@ reg1-reg9 = scratch registers, clobbered
@ dststride = register holding the destination stride in bytes
@ Also clobbers the condition flags and q8-q13.
@ Local labels used here: 1, 10, 20, 30, 40.
@ NOTE(review): the single pixel paths store a word at a halfword offset
@ (str at +2); this presumably relies on unaligned word stores being
@ permitted - confirm for the target.
@ NOTE(review): the opening lines of _neon_normalxx_8_16_line_middle are
@ outside this view - confirm its behaviour for fewer than 16 pixels.
@ align src to 4 bytes
andS \reg5, \src, #3 @ reg5 = src & 3
beq 10f
@ first 1-3 pixels
rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)
1:
ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++
add \reg2, \dst, \dststride @ reg2 = dst1 (second output line)
add \reg3, \reg2, \dststride @ reg3 = dst2 (third output line)
add \dst, \dst, #6 @ dst += 3*2
sub \counter, \counter, #1 @ counter--
ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]
subS \reg5, \reg5, #1 @ reg5--; nothing below changes the flags before bne
strh \reg1, [\dst, #-6] @ dst[0] = reg1
strh \reg1, [\dst, #-4] @ dst[1] = reg1
strh \reg1, [\dst, #-2] @ dst[2] = reg1
bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16
strh \reg1, [\reg2] @ dst1[0] = reg1
str \reg1, [\reg2, #2] @ dst1[1-2] = reg1
strh \reg1, [\reg3] @ dst2[0] = reg1
str \reg1, [\reg3, #2] @ dst2[1-2] = reg1
bne 1b
@ middle pixels (16 per iteration)
10:
@ The helper leaves pal[src[0-15]] in q8 and q11 and ends with
@ cmp counter, #16; nothing below changes the flags before bhs.
_neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d22, d23
vmov q9, q8 @ two extra copies so that vst3 writes each pixel three times
add \reg1, \dst, \dststride @ reg1 = dst + dststride
vmov q10, q8
add \reg2, \dst, \dststride, lsl #1 @ reg2 = dst + 2 * dststride
vmov q12, q11
vst3.16 {d16,d18,d20}, [\dst]! @ dst[0-3] = q8-q10[0]; dst += 3*2*4
vmov q13, q11
vst3.16 {d17,d19,d21}, [\dst]! @ dst[4-7] = q8-q10[1]; dst += 3*2*4
vst3.16 {d22,d24,d26}, [\dst]! @ dst[8-11] = q11-q13[0]; dst += 3*2*4
vst3.16 {d23,d25,d27}, [\dst]! @ dst[12-15] = q11-q13[1]; dst += 3*2*4
vst3.16 {d16,d18,d20}, [\reg1]! @ dst1[0-3] = q8-q10[0]; dst1 += 3*2*4
vst3.16 {d17,d19,d21}, [\reg1]! @ dst1[4-7] = q8-q10[1]; dst1 += 3*2*4
vst3.16 {d22,d24,d26}, [\reg1]! @ dst1[8-11] = q11-q13[0]; dst1 += 3*2*4
vst3.16 {d23,d25,d27}, [\reg1]! @ dst1[12-15] = q11-q13[1]; dst1 += 3*2*4
vst3.16 {d16,d18,d20}, [\reg2]! @ dst2[0-3] = q8-q10[0]; dst2 += 3*2*4
vst3.16 {d17,d19,d21}, [\reg2]! @ dst2[4-7] = q8-q10[1]; dst2 += 3*2*4
vst3.16 {d22,d24,d26}, [\reg2]! @ dst2[8-11] = q11-q13[0]; dst2 += 3*2*4
vst3.16 {d23,d25,d27}, [\reg2]! @ dst2[12-15] = q11-q13[1]; dst2 += 3*2*4
bhs 10b
@ last 0-15 bytes
cmp \counter, #0
beq 40f
cmp \counter, #4
blo 30f
@ 4-12 pixels (4 per iteration)
20:
ldr \reg1, [\src] @ reg1 = src[0-3]
sub \counter, \counter, #4 @ counter -= 4
add \src, \src, #4 @ src += 4
ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]
cmp \counter, #4 @ flags for the bhs at the end of this iteration
ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]
ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]
ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]
ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]
ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]
lsr \reg1, \reg1, #24 @ reg1 = src[3]
ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]
add \reg5, \dst, \dststride @ reg5 = dst1 (second output line)
bfi \reg2, \reg3, #16, #16 @ reg2 = reg2 | reg3 << 16
vmov.32 d16[0], \reg2
add \reg6, \reg5, \dststride @ reg6 = dst2 (third output line)
bfi \reg4, \reg1, #16, #16 @ reg4 = reg4 | reg1 << 16
vmov.32 d16[1], \reg4
vmov d17, d16
vmov d18, d16
vst3.16 {d16,d17,d18}, [\dst]! @ dst[0-11] = d16-d18; dst += 3*2*4
vst3.16 {d16,d17,d18}, [\reg5] @ dst1[0-11] = d16-d18
vst3.16 {d16,d17,d18}, [\reg6] @ dst2[0-11] = d16-d18
bhs 20b
cmp \counter, #0
beq 40f
@ last 1-3 pixels
30:
ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++
add \reg2, \dst, \dststride @ reg2 = dst1 (second output line)
add \reg3, \reg2, \dststride @ reg3 = dst2 (third output line)
add \dst, \dst, #6 @ dst += 3*2
ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]
subS \counter, \counter, #1 @ counter--
strh \reg1, [\dst, #-6] @ dst[0] = reg1
strh \reg1, [\dst, #-4] @ dst[1] = reg1
strh \reg1, [\dst, #-2] @ dst[2] = reg1
bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16
strh \reg1, [\reg2] @ dst1[0] = reg1
str \reg1, [\reg2, #2] @ dst1[1-2] = reg1
strh \reg1, [\reg3] @ dst2[0] = reg1
str \reg1, [\reg3, #2] @ dst2[1-2] = reg1
bne 30b
40:
.endm
.macro neon_normal4x_8_16_line src, dst, pal, counter, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, dststride
@ Expand one line of 8-bit palette indices into 16-bit pixels at 4x scale:
@ every source pixel becomes a 4x4 block spread over four output lines.
@ src = pointer to the 8-bit source pixels (advanced past the line)
@ dst = pointer to the first output line (advanced by 4 pixels per
@ source pixel); the other lines are dst + k * dststride, k = 1-3
@ pal = palette base, indexed with lsl 2; low halfword of the entry is used
@ counter = number of source pixels (counted down)
@ reg1-reg9 = scratch registers, clobbered
@ dststride = register holding the destination stride in bytes
@ Also clobbers the condition flags and q8-q15.
@ Local labels used here: 1, 10, 20, 30, 40.
@ NOTE(review): the opening lines of _neon_normalxx_8_16_line_middle are
@ outside this view - confirm its behaviour for fewer than 16 pixels.
@ align src to 4 bytes
andS \reg5, \src, #3 @ reg5 = src & 3
beq 10f
@ first 1-3 pixels
rsb \reg5, \reg5, #4 @ reg5 = 4 - (src & 3)
1:
ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++
add \reg2, \dst, \dststride @ reg2 = dst1 (second output line)
add \reg3, \reg2, \dststride @ reg3 = dst2 (third output line)
add \dst, \dst, #8 @ dst += 4*2
sub \counter, \counter, #1 @ counter--
ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]
add \reg4, \reg3, \dststride @ reg4 = dst3 (fourth output line)
strh \reg1, [\dst, #-8] @ dst[0] = reg1
subS \reg5, \reg5, #1 @ reg5--; nothing below changes the flags before bne
strh \reg1, [\dst, #-6] @ dst[1] = reg1
bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16
str \reg1, [\dst, #-4] @ dst[2-3] = reg1
str \reg1, [\reg2] @ dst1[0-1] = reg1
str \reg1, [\reg2, #4] @ dst1[2-3] = reg1
str \reg1, [\reg3] @ dst2[0-1] = reg1
str \reg1, [\reg3, #4] @ dst2[2-3] = reg1
str \reg1, [\reg4] @ dst3[0-1] = reg1
str \reg1, [\reg4, #4] @ dst3[2-3] = reg1
bne 1b
@ middle pixels (16 per iteration)
10:
@ The helper leaves pal[src[0-15]] in q8 and q12 and ends with
@ cmp counter, #16; nothing below changes the flags before bhs.
_neon_normalxx_8_16_line_middle \src, \dst, \pal, \counter, \reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8, \reg9, \dststride, d24, d25
vmov q9, q8 @ three extra copies so that vst4 writes each pixel four times
add \reg1, \dst, \dststride @ reg1 = dst + dststride
vmov q10, q8
add \reg2, \dst, \dststride, lsl #1 @ reg2 = dst + 2 * dststride
vmov q11, q8
add \reg3, \reg1, \dststride,lsl #1 @ reg3 = dst + 3 * dststride
vmov q13, q12
vst4.16 {d16,d18,d20,d22}, [\dst]! @ dst[0-3] = q8-q11[0]; dst += 4*2*4
vmov q14, q12
vmov q15, q12
vst4.16 {d17,d19,d21,d23}, [\dst]! @ dst[4-7] = q8-q11[1]; dst += 4*2*4
vst4.16 {d24,d26,d28,d30}, [\dst]! @ dst[8-11] = q12-q15[0]; dst += 4*2*4
vst4.16 {d25,d27,d29,d31}, [\dst]! @ dst[12-15] = q12-q15[1]; dst += 4*2*4
vst4.16 {d16,d18,d20,d22}, [\reg1]! @ dst1[0-3] = q8-q11[0]; dst1 += 4*2*4
vst4.16 {d17,d19,d21,d23}, [\reg1]! @ dst1[4-7] = q8-q11[1]; dst1 += 4*2*4
vst4.16 {d24,d26,d28,d30}, [\reg1]! @ dst1[8-11] = q12-q15[0]; dst1 += 4*2*4
vst4.16 {d25,d27,d29,d31}, [\reg1]! @ dst1[12-15] = q12-q15[1]; dst1 += 4*2*4
vst4.16 {d16,d18,d20,d22}, [\reg2]! @ dst2[0-3] = q8-q11[0]; dst2 += 4*2*4
vst4.16 {d17,d19,d21,d23}, [\reg2]! @ dst2[4-7] = q8-q11[1]; dst2 += 4*2*4
vst4.16 {d24,d26,d28,d30}, [\reg2]! @ dst2[8-11] = q12-q15[0]; dst2 += 4*2*4
vst4.16 {d25,d27,d29,d31}, [\reg2]! @ dst2[12-15] = q12-q15[1]; dst2 += 4*2*4
vst4.16 {d16,d18,d20,d22}, [\reg3]! @ dst3[0-3] = q8-q11[0]; dst3 += 4*2*4
vst4.16 {d17,d19,d21,d23}, [\reg3]! @ dst3[4-7] = q8-q11[1]; dst3 += 4*2*4
vst4.16 {d24,d26,d28,d30}, [\reg3]! @ dst3[8-11] = q12-q15[0]; dst3 += 4*2*4
vst4.16 {d25,d27,d29,d31}, [\reg3]! @ dst3[12-15] = q12-q15[1]; dst3 += 4*2*4
bhs 10b
@ last 0-15 bytes
cmp \counter, #0
beq 40f
cmp \counter, #4
blo 30f
@ 4-12 pixels (4 per iteration)
20:
ldr \reg1, [\src] @ reg1 = src[0-3]
sub \counter, \counter, #4 @ counter -= 4
add \src, \src, #4 @ src += 4
ubfx \reg2, \reg1, #0, #8 @ reg2 = src[0]
cmp \counter, #4 @ flags for the bhs at the end of this iteration
ldr \reg2, [\pal, \reg2, lsl #2] @ reg2 = pal[src[0]]
ubfx \reg3, \reg1, #8, #8 @ reg3 = src[1]
ldr \reg3, [\pal, \reg3, lsl #2] @ reg3 = pal[src[1]]
ubfx \reg4, \reg1, #16, #8 @ reg4 = src[2]
ldr \reg4, [\pal, \reg4, lsl #2] @ reg4 = pal[src[2]]
lsr \reg1, \reg1, #24 @ reg1 = src[3]
ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[3]]
add \reg5, \dst, \dststride @ reg5 = dst1 (second output line)
bfi \reg2, \reg3, #16, #16 @ reg2 = reg2 | reg3 << 16
vmov.32 d16[0], \reg2
add \reg6, \reg5, \dststride @ reg6 = dst2 (third output line)
bfi \reg4, \reg1, #16, #16 @ reg4 = reg4 | reg1 << 16
add \reg7, \reg6, \dststride @ reg7 = dst3 (fourth output line)
vmov.32 d16[1], \reg4
vmov d17, d16
vmov d18, d16
vmov d19, d16
vst4.16 {d16,d17,d18,d19}, [\dst]! @ dst[0-15] = d16-d19; dst += 4*2*4
vst4.16 {d16,d17,d18,d19}, [\reg5] @ dst1[0-15] = d16-d19
vst4.16 {d16,d17,d18,d19}, [\reg6] @ dst2[0-15] = d16-d19
vst4.16 {d16,d17,d18,d19}, [\reg7] @ dst3[0-15] = d16-d19
bhs 20b
cmp \counter, #0
beq 40f
@ last 1-3 pixels
30:
ldrb \reg1, [\src], #1 @ reg1 = src[0]; src++
add \reg2, \dst, \dststride @ reg2 = dst1 (second output line)
add \reg3, \reg2, \dststride @ reg3 = dst2 (third output line)
add \dst, \dst, #8 @ dst += 4*2
ldr \reg1, [\pal, \reg1, lsl #2] @ reg1 = pal[src[0]]
add \reg4, \reg3, \dststride @ reg4 = dst3 (fourth output line)
strh \reg1, [\dst, #-8] @ dst[0] = reg1
subS \counter, \counter, #1 @ counter--; nothing below changes the flags before bne
strh \reg1, [\dst, #-6] @ dst[1] = reg1
bfi \reg1, \reg1, #16, #16 @ reg1 = reg1 | reg1 << 16
str \reg1, [\dst, #-4] @ dst[2-3] = reg1
str \reg1, [\reg2] @ dst1[0-1] = reg1
str \reg1, [\reg2, #4] @ dst1[2-3] = reg1
str \reg1, [\reg3] @ dst2[0-1] = reg1
str \reg1, [\reg3, #4] @ dst2[2-3] = reg1
str \reg1, [\reg4] @ dst3[0-1] = reg1
str \reg1, [\reg4, #4] @ dst3[2-3] = reg1
bne 30b
40:
.endm

View File

@ -1,306 +0,0 @@
@@
@@ Copyright (C) 2012 Roman Pauer
@@
@@ Permission is hereby granted, free of charge, to any person obtaining a copy of
@@ this software and associated documentation files (the "Software"), to deal in
@@ the Software without restriction, including without limitation the rights to
@@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
@@ of the Software, and to permit persons to whom the Software is furnished to do
@@ so, subject to the following conditions:
@@
@@ The above copyright notice and this permission notice shall be included in all
@@ copies or substantial portions of the Software.
@@
@@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
@@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
@@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
@@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
@@ SOFTWARE.
@@
.arm
#include "neon_scale2x.Sinc"
#include "neon_normalxx.Sinc"
.global neon_scale2x_8_8
.global neon_scale2x_16_16
.global neon_scale2x_8_16
.align 4
neon_scale2x_8_8:
@ Scale2x of an 8-bit image into an 8-bit image twice as wide and high.
@ r0 = const uint8_t *src
@ r1 = uint8_t *dst
@ r2 = unsigned int width (pixels)
@ r3 = unsigned int srcstride (bytes)
@ [sp] = unsigned int dststride (bytes)
@ [sp+4] = unsigned int height
@ lr = return address
@ Saves and restores r4-r9; ip is used as scratch.
@ height must be at least 2: the first and the last source line are always
@ processed by their own code paths. Width limits are those of the
@ neon_scale2x_8_8_line macro.
ldr ip, [sp] @ ip = dststride
push {r4-r9}
ldr r9, [sp, #(7*4)] @ r9 = height (6 pushed words + 1 stack argument)
sub r4, r0, r3 @ r4 = src - srcstride
add r5, r0, r3 @ r5 = src + srcstride
add r6, r1, ip @ r6 = dst + dststride
sub r3, r3, r2 @ r3 = srcstride - width
sub ip, ip, r2 @ ip = dststride - width
lsl ip, #1 @ ip = 2 * dststride - 2 * width
mov r7, r2 @ r7 = width
sub r9, r9, #2 @ r9 = height - 2 = number of middle lines
@ r0 = src
@ r1 = dst
@ r2 = width
@ r3 = srcdiff (srcstride - width)
@ r4 = src - srcstride
@ r5 = src + srcstride
@ r6 = dst + dststride
@ r7 = counter
@ r8 = tmpreg
@ r9 = height
@ ip = dstdiff (2 * dststride - 2 * width)
@ first line
neon_scale2x_8_8_line first, r4, r0, r5, r7, r1, r6, r8, 0, 0
add r0, r0, r3
add r4, r4, r3
add r5, r5, r3
add r1, r1, ip
add r6, r6, ip
@ The middle loop below tests its counter only after the body, so with
@ height == 2 (r9 == 0) it would wrap around and run about 2^32 times.
@ Go straight to the last line in that case.
cmp r9, #0
beq 102f
@ middle lines
101:
mov r7, r2
neon_scale2x_8_8_line middle, r4, r0, r5, r7, r1, r6, r8, 0, 0
subS r9, r9, #1 @ the adds below leave the flags for bne
add r0, r0, r3
add r4, r4, r3
add r5, r5, r3
add r1, r1, ip
add r6, r6, ip
bne 101b
@ last line
102:
mov r7, r2
neon_scale2x_8_8_line last, r4, r0, r5, r7, r1, r6, r8, 0, 0
pop {r4-r9}
bx lr
@ end procedure neon_scale2x_8_8
neon_scale2x_16_16:
@ Scale2x of a 16-bit image into a 16-bit image twice as wide and high.
@ r0 = const uint16_t *src
@ r1 = uint16_t *dst
@ r2 = unsigned int width (pixels)
@ r3 = unsigned int srcstride (bytes)
@ [sp] = unsigned int dststride (bytes)
@ [sp+4] = unsigned int height
@ lr = return address
@ Saves and restores r4-r9; ip is used as scratch.
@ height must be at least 2: the first and the last source line are always
@ processed by their own code paths. Width limits are those of the
@ neon_scale2x_16_16_line macro.
ldr ip, [sp] @ ip = dststride
push {r4-r9}
ldr r9, [sp, #(7*4)] @ r9 = height (6 pushed words + 1 stack argument)
sub r4, r0, r3 @ r4 = src - srcstride
add r5, r0, r3 @ r5 = src + srcstride
add r6, r1, ip @ r6 = dst + dststride
sub r3, r3, r2, lsl #1 @ r3 = srcstride - 2 * width
sub ip, ip, r2, lsl #1 @ ip = dststride - 2 * width
lsl ip, #1 @ ip = 2 * dststride - 4 * width
mov r7, r2 @ r7 = width
sub r9, r9, #2 @ r9 = height - 2 = number of middle lines
@ r0 = src
@ r1 = dst
@ r2 = width
@ r3 = srcdiff (srcstride - 2 * width)
@ r4 = src - srcstride
@ r5 = src + srcstride
@ r6 = dst + dststride
@ r7 = counter
@ r8 = tmpreg
@ r9 = height
@ ip = dstdiff (2 * dststride - 4 * width)
@ first line
neon_scale2x_16_16_line first, r4, r0, r5, r7, r1, r6, r8, 0, 0
add r0, r0, r3
add r4, r4, r3
add r5, r5, r3
add r1, r1, ip
add r6, r6, ip
@ The middle loop below tests its counter only after the body, so with
@ height == 2 (r9 == 0) it would wrap around and run about 2^32 times.
@ Go straight to the last line in that case.
cmp r9, #0
beq 102f
@ middle lines
101:
mov r7, r2
neon_scale2x_16_16_line middle, r4, r0, r5, r7, r1, r6, r8, 0, 0
subS r9, r9, #1 @ the adds below leave the flags for bne
add r0, r0, r3
add r4, r4, r3
add r5, r5, r3
add r1, r1, ip
add r6, r6, ip
bne 101b
@ last line
102:
mov r7, r2
neon_scale2x_16_16_line last, r4, r0, r5, r7, r1, r6, r8, 0, 0
pop {r4-r9}
bx lr
@ end procedure neon_scale2x_16_16
neon_scale2x_8_16:
@ Scale2x of an 8-bit paletted image into a 16-bit image twice as wide and
@ high. Each source line is first expanded through the palette into one of
@ three temporary 16-bit lines on the stack; the 16-bit Scale2x line kernel
@ then runs on those temporary lines.
@ r0 = const uint8_t *src
@ r1 = uint8_t *dst
@ NOTE(review): 16-bit pixels are written through dst; the C prototype
@ presumably declares it as uint16_t * - confirm.
@ r2 = const uint32_t *palette
@ r3 = unsigned int width (pixels)
@ [sp] = unsigned int srcstride (bytes)
@ [sp+4] = unsigned int dststride (bytes)
@ [sp+8] = unsigned int height
@ lr = return address
@ Saves and restores r4-r11 and lr; ip is used as scratch.
@ height must be at least 2: the first and the last source line are always
@ processed by their own code paths.
@ three temporary lines
ldr ip, [sp] @ ip = srcstride
push {r4-r11,lr}
ldr r4, [sp, #(4*10)] @ r4 = dststride (9 pushed words + 1 stack argument)
ldr r5, [sp, #(4*11)] @ r5 = height
mov r6, sp @ r6 = sp
sub ip, ip, r3 @ ip = srcstride - width
@ Each temporary line is carved out below a 32 byte boundary, so it ends on
@ that boundary and starts 2 * width bytes below it.
bic sp, sp, #31 @ align sp to 32 bytes
sub r7, r4, r3, lsl #1 @ r7 = dststride - 2 * width
sub sp, sp, r3, lsl #1 @ sp -= 2 * width
sub r5, r5, #2 @ height -= 2 (number of middle lines)
mov r10, sp @ tmpline3 = sp
lsl r7, #1 @ r7 = 2 * dststride - 4 * width
bic sp, sp, #31 @ align sp to 32 bytes
sub sp, sp, r3, lsl #1 @ sp -= 2 * width
mov r11, sp @ tmpline2 = sp
bic sp, sp, #31 @ align sp to 32 bytes
sub sp, sp, r3, lsl #1 @ sp -= 2 * width
mov lr, sp @ tmpline1 = sp
bic sp, sp, #31 @ align sp to 32 bytes
sub sp, sp, #36 @ room for the nine local words stored below
str r6, [sp] @ oldsp = r6
str r5, [sp, #4] @ height = r5
str ip, [sp, #8] @ srcdiff = ip
str r7, [sp, #12] @ dstdiff = r7
str r4, [sp, #16] @ dststride = r4
str lr, [sp, #20] @ tmpline1 = lr
str r11, [sp, #24] @ tmpline2 = r11
str r10, [sp, #28] @ tmpline3 = r10
str r3, [sp, #32] @ width = r3
@ r0 = src
@ r1 = dst
@ r2 = palette
@ r3 = counter
@ r4 = dst2
@ r11 = bufptr1
@ ip = bufptr2
@ lr = bufptr3
@ [sp] = oldsp
@ [sp, #4] = height
@ [sp, #8] = srcdiff (srcstride - width)
@ [sp, #12] = dstdiff (2 * dststride - 4 * width)
@ [sp, #16] = dststride
@ [sp, #20] = tmpline1
@ [sp, #24] = tmpline2
@ [sp, #28] = tmpline3
@ [sp, #32] = width
@ lr = tmpline1
@ r3 = counter
@ first line
neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip
ldr r7, [sp, #8] @ r7 = srcdiff
ldr r3, [sp, #32] @ counter = width
ldr lr, [sp, #24] @ bufptr3 = tmpline2
add r0, r0, r7 @ src += srcdiff
@ second line
neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip
ldr r9, [sp, #16] @ r9 = dststride
ldr r3, [sp, #32] @ counter = width
ldr ip, [sp, #20] @ bufptr2 = tmpline1
ldr lr, [sp, #24] @ bufptr3 = tmpline2
add r4, r1, r9 @ dst2 = dst + dststride
@ first temporary line
@ r11 is not reloaded here: the first variant never loads through its
@ src1 argument, it only advances it.
neon_scale2x_16_16_line first, r11, ip, lr, r3, r1, r4, r5, 1, 0
ldr r7, [sp, #8] @ r7 = srcdiff
ldr r8, [sp, #12] @ r8 = dstdiff
ldr r3, [sp, #32] @ counter = width
ldr lr, [sp, #28] @ bufptr3 = tmpline3
add r0, r0, r7 @ src += srcdiff
add r1, r1, r8 @ dst += dstdiff
@ The loop below tests the stored line counter only after the body, so with
@ height == 2 (counter == 0) it would wrap around and run about 2^32 times.
@ Go straight to the last temporary line in that case; tmpline1/tmpline2
@ already hold the two source lines and r3 already holds the width.
ldr r6, [sp, #4] @ r6 = height - 2
cmp r6, #0
beq 102f
100:
@ line n+1
neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip
ldr r9, [sp, #16] @ r9 = dststride
ldr r11, [sp, #20] @ bufptr1 = tmpline1
ldr ip, [sp, #24] @ bufptr2 = tmpline2
ldr lr, [sp, #28] @ bufptr3 = tmpline3
add r4, r1, r9 @ dst2 = dst + dststride
ldr r3, [sp, #32] @ counter = width
@ rotate the three buffers for the next iteration
str r11, [sp, #28] @ tmpline3 = bufptr1
str ip, [sp, #20] @ tmpline1 = bufptr2
str lr, [sp, #24] @ tmpline2 = bufptr3
@ temporary line n
neon_scale2x_16_16_line middle, r11, ip, lr, r3, r1, r4, r5, 1, 0
ldr r6, [sp, #4] @ r6 = height
ldr r7, [sp, #8] @ r7 = srcdiff
ldr r8, [sp, #12] @ r8 = dstdiff
ldr r3, [sp, #32] @ counter = width
subS r6, r6, #1 @ height--; nothing below changes the flags before bne
ldr lr, [sp, #28] @ bufptr3 = tmpline3
add r0, r0, r7 @ src += srcdiff
add r1, r1, r8 @ dst += dstdiff
str r6, [sp, #4] @ height = r6
bne 100b
102:
ldr r9, [sp, #16] @ r9 = dststride
ldr r11, [sp, #20] @ bufptr1 = tmpline1
ldr ip, [sp, #24] @ bufptr2 = tmpline2
add r4, r1, r9 @ dst2 = dst + dststride
@ last temporary line
neon_scale2x_16_16_line last, r11, ip, lr, r3, r1, r4, r5, 1, 0
ldr sp, [sp] @ sp = oldsp
pop {r4-r11,lr}
bx lr
@ end procedure neon_scale2x_8_16

View File

@ -1,474 +0,0 @@
@@
@@ Copyright (C) 2012 Roman Pauer
@@
@@ Permission is hereby granted, free of charge, to any person obtaining a copy of
@@ this software and associated documentation files (the "Software"), to deal in
@@ the Software without restriction, including without limitation the rights to
@@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
@@ of the Software, and to permit persons to whom the Software is furnished to do
@@ so, subject to the following conditions:
@@
@@ The above copyright notice and this permission notice shall be included in all
@@ copies or substantial portions of the Software.
@@
@@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
@@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
@@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
@@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
@@ SOFTWARE.
@@
@ A B C --\ E0 E1
@ D E F --/ E2 E3
@ G H I
@ q0 = E0 (tmp0)
@ q1 = E1 (tmp1)
@ q2 = E2 (tmp2)
@ q3 = E3 (tmp3)
@ q8 = S2prev
@ q9 = S2next
@ q10 = C0 < B == H || D == F >
@ q11 = S1 < B >
@ q12 = S2 < E >
@ q13 = S3 < H >
@ q14 = S2sl < D >
@ q15 = S2sr < F >
.macro __neon_scale2x_8_8_line src1, src2, src3, counter, dst1, dst2, reg1, qB, qH, alsrc1, alsrc2, alsrc3, aldst1, aldst2
@ Scale2x kernel for one line of 8-bit pixels; writes two output lines.
@ src1, src2, src3 = pointers to the previous, current and next source line
@ counter = number of pixels in the line
@ dst1, dst2 = pointers to the upper and lower output line
@ reg1 = scratch register
@ qB, qH = registers that hold the B (above) and H (below) rows.
@ q11 as qB / q13 as qH means the row is loaded from
@ src1 / src3; any other register is used as it is (the
@ callers pass q12, the current row) and the pointer that
@ is not read is simply advanced by counter.
@ alsrc1-3, aldst1-2 = the same pointer registers, optionally with an
@ alignment qualifier, used after the unaligned head
@ Processing order: a head of counter & 15 pixels, then the inner loop,
@ then a final block of 16 pixels. The inner loop body runs at least once
@ before its test and the final block always runs, so the counter & ~15
@ pixels left after the head must amount to at least 32.
@ Clobbers q0-q3, q8-q10, q12, q14, q15, the flags, and q11 / q13 when they
@ are loaded. Local labels used here: 1, 2.
vld1.8 {d17[7]}, [\src2] @ S2prev[15] = src[0]
andS \reg1, \counter, #15 @ reg1 = counter & 15
.ifnes "\qB", "q11"
add \src1, \src1, \counter @ src1 += counter
.endif
.ifnes "\qH", "q13"
add \src3, \src3, \counter @ src3 += counter
.endif
beq 1f
@ first 1-15 pixels - align counter to 16 bytes
@ A full 16 pixel block is computed and stored, but the pointers move on by
@ reg1 pixels only; the following block rewrites the surplus output.
vld1.8 {q12}, [\src2], \reg1 @ S2 = [src] < E >; src2 += counter & 15
.ifeqs "\qB", "q11"
vld1.8 {\qB}, [\src1], \reg1 @ S1 = [src - srcstride] < B >; src1 += counter & 15
.endif
.ifeqs "\qH", "q13"
vld1.8 {\qH}, [\src3], \reg1 @ S3 = [src + srcstride] < H >; src3 += counter & 15
.endif
vext.8 q14, q8, q12, #15 @ S2sl = S2prev[15] | (S2 << 8) < D >
vceq.i8 q2, \qB, \qH @ tmp2 = < B == H >
vmov.8 d17[7], \reg1 @ S2prev[15] = reg1 (table index for the vtbl below)
vext.8 q15, q12, q9, #1 @ S2sr = (S2 >> 8) | ... < F >
vceq.i8 q0, q14, \qB @ tmp0 = < D == B >
vceq.i8 q3, q14, q15 @ tmp3 = < D == F >
vceq.i8 q1, \qB, q15 @ tmp1 = < B == F >
vtbl.8 d17, {d28, d29}, d17 @ S2prev[15] = src[reg1 - 1]
lsl \reg1, #1 @ reg1 = output bytes per line for the head
vorr q10, q2, q3 @ C0 = < B == H || D == F >
vceq.i8 q2, q14, \qH @ tmp2 = < D == H >
vceq.i8 q3, \qH, q15 @ tmp3 = < H == F >
vorn q0, q10, q0 @ tmp0 = < C0 || !(D == B) >
vorn q1, q10, q1 @ tmp1 = < C0 || !(B == F) >
vbsl q0, q12, q14 @ E0 = < (C0 || !(D == B)) ? E : D >
vbsl q1, q12, q15 @ E1 = < (C0 || !(B == F)) ? E : F >
vorn q2, q10, q2 @ tmp2 = < C0 || !(D == H) >
vorn q3, q10, q3 @ tmp3 = < C0 || !(H == F) >
vbsl q2, q12, q14 @ E2 = < (C0 || !(D == H)) ? E : D >
vst2.8 {q0-q1}, [\dst1], \reg1 @ [dst] = E0,E1; dst1 += reg1
vbsl q3, q12, q15 @ E3 = < (C0 || !(H == F)) ? E : F >
bic \counter, \counter, #15
vst2.8 {q2-q3}, [\dst2], \reg1 @ [dst + dststride] = E2,E3; dst2 += reg1
@ counter is aligned to 16 bytes
1:
vld1.8 {q9}, [\alsrc2]! @ S2next = [src]; src2 += 16
@ inner loop (16 pixels per iteration)
2:
vmov q12, q9 @ S2 = S2next < E >
.ifeqs "\qB", "q11"
vld1.8 {\qB}, [\alsrc1]! @ S1 = [src - srcstride] < B >; src1 += 16
.endif
.ifeqs "\qH", "q13"
vld1.8 {\qH}, [\alsrc3]! @ S3 = [src + srcstride] < H >; src3 += 16
.endif
vext.8 q14, q8, q12, #15 @ S2sl = S2prev[15] | (S2 << 8) < D >
vld1.8 {q9}, [\alsrc2]! @ S2next = [src]; src2 += 16
vceq.i8 q2, \qB, \qH @ tmp2 = < B == H >
vmov q8, q12 @ S2prev = S2
vext.8 q15, q12, q9, #1 @ S2sr = (S2 >> 8) | S2next[0] < F >
vceq.i8 q0, q14, \qB @ tmp0 = < D == B >
vceq.i8 q3, q14, q15 @ tmp3 = < D == F >
vceq.i8 q1, \qB, q15 @ tmp1 = < B == F >
sub \counter, \counter, #16 @ counter -= 16
vorr q10, q2, q3 @ C0 = < B == H || D == F >
vceq.i8 q2, q14, \qH @ tmp2 = < D == H >
vceq.i8 q3, \qH, q15 @ tmp3 = < H == F >
vorn q0, q10, q0 @ tmp0 = < C0 || !(D == B) >
vorn q1, q10, q1 @ tmp1 = < C0 || !(B == F) >
vbsl q0, q12, q14 @ E0 = < (C0 || !(D == B)) ? E : D >
vbsl q1, q12, q15 @ E1 = < (C0 || !(B == F)) ? E : F >
vorn q2, q10, q2 @ tmp2 = < C0 || !(D == H) >
vorn q3, q10, q3 @ tmp3 = < C0 || !(H == F) >
vbsl q2, q12, q14 @ E2 = < (C0 || !(D == H)) ? E : D >
vst2.8 {q0-q1}, [\aldst1]! @ [dst] = E0,E1; dst1 += 2*16
cmp \counter, #16 @ loop while more than one block is left
vbsl q3, q12, q15 @ E3 = < (C0 || !(H == F)) ? E : F >
vst2.8 {q2-q3}, [\aldst2]! @ [dst + dststride] = E2,E3; dst2 += 2*16
bhi 2b
@ last 16 pixels
@ There is no pixel to the right of the line, so the last pixel of the
@ block is shifted into S2next[0] and serves as its own right neighbour.
vmov q12, q9 @ S2 = S2next < E >
vshr.u64 d18, d19, #(64-8) @ S2next[0] = S2[15] | ...
.ifeqs "\qB", "q11"
vld1.8 {\qB}, [\alsrc1]! @ S1 = [src - srcstride] < B >; src1 += 16
.endif
vext.8 q14, q8, q12, #15 @ S2sl = S2prev[15] | (S2 << 8) < D >
vext.8 q15, q12, q9, #1 @ S2sr = (S2 >> 8) | S2next[0] < F >
.ifeqs "\qH", "q13"
vld1.8 {\qH}, [\alsrc3]! @ S3 = [src + srcstride] < H >; src3 += 16
.endif
vceq.i8 q0, q14, \qB @ tmp0 = < D == B >
vceq.i8 q2, \qB, \qH @ tmp2 = < B == H >
vceq.i8 q3, q14, q15 @ tmp3 = < D == F >
vceq.i8 q1, \qB, q15 @ tmp1 = < B == F >
vorr q10, q2, q3 @ C0 = < B == H || D == F >
vceq.i8 q2, q14, \qH @ tmp2 = < D == H >
vceq.i8 q3, \qH, q15 @ tmp3 = < H == F >
vorn q0, q10, q0 @ tmp0 = < C0 || !(D == B) >
vorn q1, q10, q1 @ tmp1 = < C0 || !(B == F) >
vbsl q0, q12, q14 @ E0 = < (C0 || !(D == B)) ? E : D >
vbsl q1, q12, q15 @ E1 = < (C0 || !(B == F)) ? E : F >
vorn q2, q10, q2 @ tmp2 = < C0 || !(D == H) >
vorn q3, q10, q3 @ tmp3 = < C0 || !(H == F) >
vbsl q2, q12, q14 @ E2 = < (C0 || !(D == H)) ? E : D >
vst2.8 {q0-q1}, [\aldst1]! @ [dst] = E0,E1; dst1 += 2*16
vbsl q3, q12, q15 @ E3 = < (C0 || !(H == F)) ? E : F >
vst2.8 {q2-q3}, [\aldst2]! @ [dst + dststride] = E2,E3; dst2 += 2*16
.endm
.macro _neon_scale2x_8_8_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
@ First image line: there is no line above, so the current row (q12) is
@ passed as B; H (q13) is loaded from src3.
__neon_scale2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
.macro _neon_scale2x_8_8_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
@ Interior image line: B (q11) is loaded from src1 and H (q13) from src3.
__neon_scale2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
.macro _neon_scale2x_8_8_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
@ Last image line: B (q11) is loaded from src1; there is no line below, so
@ the current row (q12) is passed as H.
__neon_scale2x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
.macro neon_scale2x_8_8_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32
@ Entry point for one 8-bit Scale2x line.
@ part = first, middle or last; selects _neon_scale2x_8_8_line_<part>
@ src1, src2, src3 = pointers to the previous, current and next source line
@ counter = number of pixels in the line
@ dst1, dst2 = pointers to the two output lines
@ reg1 = scratch register
@ srcalign16 = 0 passes the source pointers unchanged for the block loads
@ after the head; any other value appends the :128 alignment
@ qualifier to them
@ dstalign32 = 0 passes the destination pointers unchanged; any other value
@ appends the :256 alignment qualifier to them
.ifeq \srcalign16
.ifeq \dstalign32
_neon_scale2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2
.else
_neon_scale2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1:256, \dst2:256
.endif
.else
.ifeq \dstalign32
_neon_scale2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1, \dst2
.else
_neon_scale2x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1:256, \dst2:256
.endif
.endif
.endm
.macro __neon_scale2x_16_16_line src1, src2, src3, counter, dst1, dst2, reg1, qB, qH, alsrc1, alsrc2, alsrc3, aldst1, aldst2
@ Scale2x kernel for one line of 16-bit pixels; writes two output lines.
@ src1, src2, src3 = pointers to the previous, current and next source line
@ counter = number of pixels in the line
@ dst1, dst2 = pointers to the upper and lower output line
@ reg1 = scratch register
@ qB, qH = registers that hold the B (above) and H (below) rows.
@ q11 as qB / q13 as qH means the row is loaded from
@ src1 / src3; any other register is used as it is (the
@ callers pass q12, the current row) and the pointer that
@ is not read is simply advanced by 2 * counter bytes.
@ alsrc1-3, aldst1-2 = the same pointer registers, optionally with an
@ alignment qualifier, used after the unaligned head
@ Processing order: a head of counter & 7 pixels, then the inner loop,
@ then a final block of 8 pixels. The inner loop body runs at least once
@ before its test and the final block always runs, so the counter & ~7
@ pixels left after the head must amount to at least 16.
@ Clobbers q0-q3, q8-q10, q12, q14, q15, the flags, and q11 / q13 when they
@ are loaded. Local labels used here: 1, 2.
vld1.16 {d17[3]}, [\src2] @ S2prev[7] = src[0]
andS \reg1, \counter, #7 @ reg1 = counter & 7
.ifnes "\qB", "q11"
add \src1, \src1, \counter, lsl #1 @ src1 += 2 * counter
.endif
.ifnes "\qH", "q13"
add \src3, \src3, \counter, lsl #1 @ src3 += 2 * counter
.endif
beq 1f
@ first 1-7 pixels - align counter to 16 bytes
@ A full 8 pixel block is computed and stored, but the pointers move on by
@ reg1 pixels only; the following block rewrites the surplus output.
vld1.16 {q12}, [\src2] @ S2 = [src] < E >
lsl \reg1, #1 @ reg1 = 2 * n, byte offset of halfword n
.ifeqs "\qB", "q11"
vld1.16 {\qB}, [\src1] @ S1 = [src - srcstride] < B >
.endif
bfi \reg1, \reg1, #8, #8 @ reg1 = 2 * n in both low bytes
.ifeqs "\qH", "q13"
vld1.16 {\qH}, [\src3] @ S3 = [src + srcstride] < H >
.endif
vext.8 q14, q8, q12, #14 @ S2sl = S2prev[7] | (S2 << 16) < D >
add \reg1, \reg1, #256 @ second byte = 2 * n + 1: byte pair index for vtbl
vceq.i16 q2, \qB, \qH @ tmp2 = < B == H >
vmov.16 d17[3], \reg1 @ S2prev[7] = reg1
vext.8 q15, q12, q9, #2 @ S2sr = (S2 >> 16) | ... < F >
vceq.i16 q0, q14, \qB @ tmp0 = < D == B >
vceq.i16 q3, q14, q15 @ tmp3 = < D == F >
vceq.i16 q1, \qB, q15 @ tmp1 = < B == F >
vtbl.8 d17, {d28, d29}, d17 @ S2prev[7] = src[reg1 - 1]
vorr q10, q2, q3 @ C0 = < B == H || D == F >
and \reg1, \counter, #7 @ reg1 = counter & 7 again, for the pointer updates
vceq.i16 q2, q14, \qH @ tmp2 = < D == H >
vceq.i16 q3, \qH, q15 @ tmp3 = < H == F >
vorn q0, q10, q0 @ tmp0 = < C0 || !(D == B) >
vorn q1, q10, q1 @ tmp1 = < C0 || !(B == F) >
vbsl q0, q12, q14 @ E0 = < (C0 || !(D == B)) ? E : D >
vbsl q1, q12, q15 @ E1 = < (C0 || !(B == F)) ? E : F >
vorn q2, q10, q2 @ tmp2 = < C0 || !(D == H) >
vorn q3, q10, q3 @ tmp3 = < C0 || !(H == F) >
vbsl q2, q12, q14 @ E2 = < (C0 || !(D == H)) ? E : D >
vst2.16 {q0-q1}, [\dst1] @ [dst] = E0,E1
vbsl q3, q12, q15 @ E3 = < (C0 || !(H == F)) ? E : F >
bic \counter, \counter, #7
.ifeqs "\qB", "q11"
add \src1, \src1, \reg1, lsl #1 @ src1 += 2 * (counter & 7)
.endif
add \src2, \src2, \reg1, lsl #1 @ src2 += 2 * (counter & 7)
.ifeqs "\qH", "q13"
add \src3, \src3, \reg1, lsl #1 @ src3 += 2 * (counter & 7)
.endif
vst2.16 {q2-q3}, [\dst2] @ [dst + dststride] = E2,E3
add \dst1, \dst1, \reg1, lsl #2 @ dst1 += 4 * (counter & 7)
add \dst2, \dst2, \reg1, lsl #2 @ dst2 += 4 * (counter & 7)
@ counter is aligned to 16 bytes
1:
vld1.16 {q9}, [\alsrc2]! @ S2next = [src]; src2 += 2*8
@ inner loop (8 pixels per iteration)
2:
vmov q12, q9 @ S2 = S2next < E >
.ifeqs "\qB", "q11"
vld1.16 {\qB}, [\alsrc1]! @ S1 = [src - srcstride] < B >; src1 += 2*8
.endif
.ifeqs "\qH", "q13"
vld1.16 {\qH}, [\alsrc3]! @ S3 = [src + srcstride] < H >; src3 += 2*8
.endif
vext.8 q14, q8, q12, #14 @ S2sl = S2prev[7] | (S2 << 16) < D >
vld1.16 {q9}, [\alsrc2]! @ S2next = [src]; src2 += 2*8
vceq.i16 q2, \qB, \qH @ tmp2 = < B == H >
vmov q8, q12 @ S2prev = S2
vext.8 q15, q12, q9, #2 @ S2sr = (S2 >> 16) | S2next[0] < F >
vceq.i16 q0, q14, \qB @ tmp0 = < D == B >
vceq.i16 q3, q14, q15 @ tmp3 = < D == F >
vceq.i16 q1, \qB, q15 @ tmp1 = < B == F >
sub \counter, \counter, #8 @ counter -= 8
vorr q10, q2, q3 @ C0 = < B == H || D == F >
vceq.i16 q2, q14, \qH @ tmp2 = < D == H >
vceq.i16 q3, \qH, q15 @ tmp3 = < H == F >
vorn q0, q10, q0 @ tmp0 = < C0 || !(D == B) >
vorn q1, q10, q1 @ tmp1 = < C0 || !(B == F) >
vbsl q0, q12, q14 @ E0 = < (C0 || !(D == B)) ? E : D >
vbsl q1, q12, q15 @ E1 = < (C0 || !(B == F)) ? E : F >
vorn q2, q10, q2 @ tmp2 = < C0 || !(D == H) >
vorn q3, q10, q3 @ tmp3 = < C0 || !(H == F) >
vbsl q2, q12, q14 @ E2 = < (C0 || !(D == H)) ? E : D >
vst2.16 {q0-q1}, [\aldst1]! @ [dst] = E0,E1; dst1 += 2*2*8
cmp \counter, #8 @ loop while more than one block is left
vbsl q3, q12, q15 @ E3 = < (C0 || !(H == F)) ? E : F >
vst2.16 {q2-q3}, [\aldst2]! @ [dst + dststride] = E2,E3; dst2 += 2*2*8
bhi 2b
@ last 8 pixels
@ There is no pixel to the right of the line, so the last pixel of the
@ block is shifted into S2next[0] and serves as its own right neighbour.
vmov q12, q9 @ S2 = S2next < E >
vshr.u64 d18, d19, #(64-16) @ S2next[0] = S2[7] | ...
.ifeqs "\qB", "q11"
vld1.16 {\qB}, [\alsrc1]! @ S1 = [src - srcstride] < B >; src1 += 2*8
.endif
vext.8 q14, q8, q12, #14 @ S2sl = S2prev[7] | (S2 << 16) < D >
vext.8 q15, q12, q9, #2 @ S2sr = (S2 >> 16) | S2next[0] < F >
.ifeqs "\qH", "q13"
vld1.16 {\qH}, [\alsrc3]! @ S3 = [src + srcstride] < H >; src3 += 2*8
.endif
vceq.i16 q0, q14, \qB @ tmp0 = < D == B >
vceq.i16 q2, \qB, \qH @ tmp2 = < B == H >
vceq.i16 q3, q14, q15 @ tmp3 = < D == F >
vceq.i16 q1, \qB, q15 @ tmp1 = < B == F >
vorr q10, q2, q3 @ C0 = < B == H || D == F >
vceq.i16 q2, q14, \qH @ tmp2 = < D == H >
vceq.i16 q3, \qH, q15 @ tmp3 = < H == F >
vorn q0, q10, q0 @ tmp0 = < C0 || !(D == B) >
vorn q1, q10, q1 @ tmp1 = < C0 || !(B == F) >
vbsl q0, q12, q14 @ E0 = < (C0 || !(D == B)) ? E : D >
vbsl q1, q12, q15 @ E1 = < (C0 || !(B == F)) ? E : F >
vorn q2, q10, q2 @ tmp2 = < C0 || !(D == H) >
vorn q3, q10, q3 @ tmp3 = < C0 || !(H == F) >
vbsl q2, q12, q14 @ E2 = < (C0 || !(D == H)) ? E : D >
vst2.16 {q0-q1}, [\aldst1]! @ [dst] = E0,E1; dst1 += 2*2*8
vbsl q3, q12, q15 @ E3 = < (C0 || !(H == F)) ? E : F >
vst2.16 {q2-q3}, [\aldst2]! @ [dst + dststride] = E2,E3; dst2 += 2*2*8
.endm
@ First-row variant of the 16-bit Scale2x line macro.
@ Forwards every argument unchanged and selects q12 / q13 for the two register
@ arguments that follow reg1 (presumably qB and qH, the rows above and below;
@ the callee signature is outside this view - TODO confirm).
@ The callee loads the row above only when that argument is q11, so with q12
@ the current row ( E ) also serves as B and src1 is never read from.
.macro _neon_scale2x_16_16_line_first src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
__neon_scale2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q12, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
@ Interior-row variant of the 16-bit Scale2x line macro.
@ Forwards every argument unchanged and selects q11 / q13 for the two register
@ arguments that follow reg1 (presumably qB and qH - TODO confirm against the
@ callee signature). With q11 and q13 the callee loads both the row above
@ (from src1) and the row below (from src3).
.macro _neon_scale2x_16_16_line_middle src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
__neon_scale2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q13, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
@ Last-row variant of the 16-bit Scale2x line macro.
@ Forwards every argument unchanged and selects q11 / q12 for the two register
@ arguments that follow reg1 (presumably qB and qH - TODO confirm against the
@ callee signature). The callee loads the row below only when that argument is
@ q13, so with q12 the current row ( E ) also serves as H and src3 is never
@ read from.
.macro _neon_scale2x_16_16_line_last src1, src2, src3, counter, dst1, dst2, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2
__neon_scale2x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, q11, q12, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2
.endm
@ Public entry macro for one 16-bit Scale2x line.
@   part        first | middle | last - pasted into the helper macro name
@   src1..src3  registers pointing at the row above, the current row, the row below
@   counter     register holding the pixel count
@   dst1, dst2  registers pointing at the two output rows
@   reg1        scratch register
@   srcalign16  non-zero: pass the source pointers with a :128 alignment qualifier
@   dstalign32  non-zero: pass the destination pointers with a :256 alignment qualifier
@ The qualified operands are handed over as the alsrc* / aldst* arguments; the
@ plain registers are always passed as well.
.macro neon_scale2x_16_16_line part, src1, src2, src3, counter, dst1, dst2, reg1, srcalign16, dstalign32
    .if \srcalign16
        @ aligned source rows
        .if \dstalign32
            _neon_scale2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1:256, \dst2:256
        .else
            _neon_scale2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1:128, \src2:128, \src3:128, \dst1, \dst2
        .endif
    .else
        @ source rows without an alignment qualifier
        .if \dstalign32
            _neon_scale2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1:256, \dst2:256
        .else
            _neon_scale2x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \reg1, \src1, \src2, \src3, \dst1, \dst2
        .endif
    .endif
.endm

View File

@ -1,44 +0,0 @@
/**
*
* Copyright (C) 2012 Roman Pauer
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished to do
* so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#ifndef _NEON_SCALE2X_H_INCLUDED_
#define _NEON_SCALE2X_H_INCLUDED_

#include <inttypes.h>

#if defined(__cplusplus)
extern "C" {
#endif

/* Scale2x, 8-bit source pixels to 8-bit destination pixels.
 * width is in pixels; srcstride and dststride are in bytes. */
void neon_scale2x_8_8(const uint8_t *src,
                      uint8_t *dst,
                      unsigned int width,
                      unsigned int srcstride,
                      unsigned int dststride,
                      unsigned int height);

/* Scale2x, 16-bit source pixels to 16-bit destination pixels.
 * width is in pixels; srcstride and dststride are in bytes. */
void neon_scale2x_16_16(const uint16_t *src,
                        uint16_t *dst,
                        unsigned int width,
                        unsigned int srcstride,
                        unsigned int dststride,
                        unsigned int height);

/* Scale2x, 8-bit source pixels looked up through palette to 16-bit
 * destination pixels. width is in pixels; strides are in bytes. */
void neon_scale2x_8_16(const uint8_t *src,
                       uint16_t *dst,
                       const uint32_t *palette,
                       unsigned int width,
                       unsigned int srcstride,
                       unsigned int dststride,
                       unsigned int height);

#if defined(__cplusplus)
}
#endif

#endif /* _NEON_SCALE2X_H_INCLUDED_ */

View File

@ -1,349 +0,0 @@
@@
@@ Copyright (C) 2012 Roman Pauer
@@
@@ Permission is hereby granted, free of charge, to any person obtaining a copy of
@@ this software and associated documentation files (the "Software"), to deal in
@@ the Software without restriction, including without limitation the rights to
@@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
@@ of the Software, and to permit persons to whom the Software is furnished to do
@@ so, subject to the following conditions:
@@
@@ The above copyright notice and this permission notice shall be included in all
@@ copies or substantial portions of the Software.
@@
@@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
@@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
@@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
@@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
@@ SOFTWARE.
@@
@ Assemble as ARM (A32) instructions.
.arm
@ Macro libraries pulled in through the C preprocessor; the routines below
@ invoke the neon_scale3x_*_line and neon_normal1x_8_16_line macros.
#include "neon_scale3x.Sinc"
#include "neon_normalxx.Sinc"
@ Exported entry points.
.global neon_scale3x_8_8
.global neon_scale3x_16_16
.global neon_scale3x_8_16
@ On ARM targets the .align argument is a power of two: 2^4 = 16-byte boundary.
.align 4
@ neon_scale3x_8_8: Scale3x, 8-bit source pixels to 8-bit destination pixels.
@ Each source row produces three destination rows with three bytes per pixel.
@ Saves and restores r4-r11 and q4-q7 (d8-d15).
@ The middle-line loop below is a do-while counting height - 2 down to zero,
@ so height must be at least 3.
neon_scale3x_8_8:
@ r0 = const uint8_t *src
@ r1 = uint8_t *dst
@ r2 = unsigned int width (pixels)
@ r3 = unsigned int srcstride (bytes)
@ [sp] = unsigned int dststride (bytes)
@ [sp+4] = unsigned int height
@ lr = return address
ldr ip, [sp] @ ip = dststride
push {r4-r11}
ldr r9, [sp, #(9*4)] @ r9 = height (8 pushed words + dststride slot)
sub r4, r0, r3 @ r4 = src - srcstride
mov r11, sp @ oldsp = sp
add r5, r0, r3 @ r5 = src + srcstride
bic sp, sp, #31 @ align sp to 32 bytes
add r6, r1, ip @ r6 = dst + dststride
sub sp, sp, #64 @ sp -= 64 (save area for q4-q7)
sub r3, r3, r2 @ r3 = srcstride - width
vst1.64 {d8-d11}, [sp:256] @ save q4,q5
add r7, r1, ip, lsl #1 @ r7 = dst + 2 * dststride
add r8, sp, #32 @ r8 = sp + 32
sub ip, ip, r2 @ ip = dststride - width
vst1.64 {d12-d15}, [r8:256] @ save q6,q7
add ip, ip, ip, lsl #1 @ ip = 3 * dststride - 3 * width
mov r8, r2 @ r8 = width
sub r9, r9, #2 @ r9 = height - 2
@ r0 = src
@ r1 = dst
@ r2 = width
@ r3 = srcdiff (srcstride - width)
@ r4 = src - srcstride
@ r5 = src + srcstride
@ r6 = dst + dststride
@ r7 = dst + 2 * dststride
@ r8 = counter
@ r9 = remaining middle lines (height - 2)
@ r10 = tmpreg
@ r11 = oldsp
@ ip = dstdiff (3 * dststride - 3 * width)
@ first line
neon_scale3x_8_8_line first, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0
add r0, r0, r3
add r4, r4, r3
add r5, r5, r3
add r1, r1, ip
add r6, r6, ip
add r7, r7, ip
@ middle lines
101:
mov r8, r2 @ counter = width
neon_scale3x_8_8_line middle, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0
subS r9, r9, #1 @ one middle line done; flags feed the bne below
add r0, r0, r3
add r4, r4, r3
add r5, r5, r3
add r1, r1, ip
add r6, r6, ip
add r7, r7, ip
bne 101b
@ last line
mov r8, r2 @ counter = width
neon_scale3x_8_8_line last, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0
add ip, sp, #32 @ ip = sp + 32
vld1.64 {d8-d11}, [sp:256] @ restore q4,q5
vld1.64 {d12-d15}, [ip:256] @ restore q6,q7
@ Release the frame only after both reloads: once sp is back at oldsp the save
@ area lies below sp, where an asynchronous signal frame may overwrite it.
mov sp, r11 @ sp = oldsp
pop {r4-r11}
bx lr
@ end procedure neon_scale3x_8_8
@ neon_scale3x_16_16: Scale3x, 16-bit source pixels to 16-bit destination pixels.
@ Each source row produces three destination rows with three pixels per pixel.
@ Saves and restores r4-r11 and q4-q7 (d8-d15).
@ The middle-line loop below is a do-while counting height - 2 down to zero,
@ so height must be at least 3.
neon_scale3x_16_16:
@ r0 = const uint16_t *src
@ r1 = uint16_t *dst
@ r2 = unsigned int width (pixels)
@ r3 = unsigned int srcstride (bytes)
@ [sp] = unsigned int dststride (bytes)
@ [sp+4] = unsigned int height
@ lr = return address
ldr ip, [sp] @ ip = dststride
push {r4-r11}
ldr r9, [sp, #(9*4)] @ r9 = height (8 pushed words + dststride slot)
sub r4, r0, r3 @ r4 = src - srcstride
mov r11, sp @ oldsp = sp
add r5, r0, r3 @ r5 = src + srcstride
bic sp, sp, #31 @ align sp to 32 bytes
add r6, r1, ip @ r6 = dst + dststride
sub sp, sp, #64 @ sp -= 64 (save area for q4-q7)
sub r3, r3, r2, lsl #1 @ r3 = srcstride - 2 * width
vst1.64 {d8-d11}, [sp:256] @ save q4,q5
add r7, r1, ip, lsl #1 @ r7 = dst + 2 * dststride
add r8, sp, #32 @ r8 = sp + 32
sub ip, ip, r2, lsl #1 @ ip = dststride - 2 * width
vst1.64 {d12-d15}, [r8:256] @ save q6,q7
add ip, ip, ip, lsl #1 @ ip = 3 * dststride - 6 * width
mov r8, r2 @ r8 = width
sub r9, r9, #2 @ r9 = height - 2
@ r0 = src
@ r1 = dst
@ r2 = width
@ r3 = srcdiff (srcstride - 2 * width)
@ r4 = src - srcstride
@ r5 = src + srcstride
@ r6 = dst + dststride
@ r7 = dst + 2 * dststride
@ r8 = counter
@ r9 = remaining middle lines (height - 2)
@ r10 = tmpreg
@ r11 = oldsp
@ ip = dstdiff (3 * dststride - 6 * width)
@ first line
neon_scale3x_16_16_line first, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0
add r0, r0, r3
add r4, r4, r3
add r5, r5, r3
add r1, r1, ip
add r6, r6, ip
add r7, r7, ip
@ middle lines
101:
mov r8, r2 @ counter = width
neon_scale3x_16_16_line middle, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0
subS r9, r9, #1 @ one middle line done; flags feed the bne below
add r0, r0, r3
add r4, r4, r3
add r5, r5, r3
add r1, r1, ip
add r6, r6, ip
add r7, r7, ip
bne 101b
@ last line
mov r8, r2 @ counter = width
neon_scale3x_16_16_line last, r4, r0, r5, r8, r1, r6, r7, r10, 0, 0
add ip, sp, #32 @ ip = sp + 32
vld1.64 {d8-d11}, [sp:256] @ restore q4,q5
vld1.64 {d12-d15}, [ip:256] @ restore q6,q7
@ Release the frame only after both reloads: once sp is back at oldsp the save
@ area lies below sp, where an asynchronous signal frame may overwrite it.
mov sp, r11 @ sp = oldsp
pop {r4-r11}
bx lr
@ end procedure neon_scale3x_16_16
@ neon_scale3x_8_16: Scale3x from 8-bit source pixels, converted through
@ palette, to 16-bit destination pixels.
@ Each source row is first expanded by neon_normal1x_8_16_line into one of three
@ temporary 16-bit lines on the stack; the 16-bit Scale3x line macro then runs
@ on those temporary lines. The three buffers are rotated every row.
@ Saves and restores r4-r11, lr and q4-q7 (d8-d15).
@ The row loop is a do-while counting height - 2 down to zero, so height must
@ be at least 3.
neon_scale3x_8_16:
@ r0 = const uint8_t *src
@ r1 = uint16_t *dst (dstdiff below is computed with 2 bytes per pixel)
@ r2 = const uint32_t *palette
@ r3 = unsigned int width (pixels)
@ [sp] = unsigned int srcstride (bytes)
@ [sp+4] = unsigned int dststride (bytes)
@ [sp+8] = unsigned int height
@ lr = return address
@ three temporary lines
ldr ip, [sp] @ ip = srcstride
push {r4-r11,lr}
ldr r4, [sp, #(4*10)] @ r4 = dststride (9 pushed words + srcstride slot)
ldr r5, [sp, #(4*11)] @ r5 = height
mov r6, sp @ r6 = sp
sub ip, ip, r3 @ ip = srcstride - width
bic sp, sp, #31 @ align sp to 32 bytes
sub r7, r4, r3, lsl #1 @ r7 = dststride - 2 * width
sub sp, sp, r3, lsl #1 @ sp -= 2 * width
sub r5, r5, #2 @ height -= 2
mov r10, sp @ tmpline3 = sp
add r7, r7, r7, lsl #1 @ r7 = 3 * dststride - 6 * width
bic sp, sp, #31 @ align sp to 32 bytes
sub sp, sp, r3, lsl #1 @ sp -= 2 * width
mov r11, sp @ tmpline2 = sp
bic sp, sp, #31 @ align sp to 32 bytes
sub sp, sp, r3, lsl #1 @ sp -= 2 * width
mov lr, sp @ tmpline1 = sp
bic sp, sp, #31 @ align sp to 32 bytes
@ Lower sp before writing the save area so that q4-q7 are never stored below
@ sp, where an asynchronous signal frame may overwrite them.
sub sp, sp, #(36 + 64) @ sp -= (36 + 64): 9 local words + q4-q7 save area
add r8, sp, #36 @ r8 = sp + 36 (32-byte aligned)
vst1.64 {d8-d11}, [r8:256] @ save q4,q5
add r9, sp, #(36 + 32) @ r9 = sp + 68 (32-byte aligned)
vst1.64 {d12-d15}, [r9:256] @ save q6,q7
str r6, [sp] @ oldsp = r6
str r5, [sp, #4] @ height = r5
str ip, [sp, #8] @ srcdiff = ip
str r7, [sp, #12] @ dstdiff = r7
str r4, [sp, #16] @ dststride = r4
str lr, [sp, #20] @ tmpline1 = lr
str r11, [sp, #24] @ tmpline2 = r11
str r10, [sp, #28] @ tmpline3 = r10
str r3, [sp, #32] @ width = r3
@ r0 = src
@ r1 = dst
@ r2 = palette
@ r3 = counter
@ r4 = dst2
@ r5 = dst3
@ r11 = bufptr1
@ ip = bufptr2
@ lr = bufptr3
@ [sp] = oldsp
@ [sp, #4] = remaining middle lines (height - 2)
@ [sp, #8] = srcdiff (srcstride - width)
@ [sp, #12] = dstdiff (3 * dststride - 6 * width)
@ [sp, #16] = dststride
@ [sp, #20] = tmpline1
@ [sp, #24] = tmpline2
@ [sp, #28] = tmpline3
@ [sp, #32] = width
@ [sp, #36] = saved q4,q5
@ [sp, #68] = saved q6,q7
@ lr = tmpline1
@ r3 = counter
@ first line
neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip
ldr r7, [sp, #8] @ r7 = srcdiff
ldr r3, [sp, #32] @ counter = width
ldr lr, [sp, #24] @ bufptr3 = tmpline2
add r0, r0, r7 @ src += srcdiff
@ second line
neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip
ldr r9, [sp, #16] @ r9 = dststride
ldr r3, [sp, #32] @ counter = width
ldr ip, [sp, #20] @ bufptr2 = tmpline1
ldr lr, [sp, #24] @ bufptr3 = tmpline2
add r4, r1, r9 @ dst2 = dst + dststride
add r5, r1, r9, lsl #1 @ dst3 = dst + 2 * dststride
@ first temporary line
neon_scale3x_16_16_line first, r11, ip, lr, r3, r1, r4, r5, r6, 1, 0
ldr r7, [sp, #8] @ r7 = srcdiff
ldr r8, [sp, #12] @ r8 = dstdiff
ldr r3, [sp, #32] @ counter = width
ldr lr, [sp, #28] @ bufptr3 = tmpline3
add r0, r0, r7 @ src += srcdiff
add r1, r1, r8 @ dst += dstdiff
100:
@ line n+1
neon_normal1x_8_16_line r0, lr, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, ip
ldr r9, [sp, #16] @ r9 = dststride
ldr r11, [sp, #20] @ bufptr1 = tmpline1
ldr ip, [sp, #24] @ bufptr2 = tmpline2
ldr lr, [sp, #28] @ bufptr3 = tmpline3
add r4, r1, r9 @ dst2 = dst + dststride
ldr r3, [sp, #32] @ counter = width
add r5, r1, r9, lsl #1 @ dst3 = dst + 2 * dststride
@ rotate the three temporary lines for the next iteration
str r11, [sp, #28] @ tmpline3 = bufptr1
str ip, [sp, #20] @ tmpline1 = bufptr2
str lr, [sp, #24] @ tmpline2 = bufptr3
@ temporary line n
neon_scale3x_16_16_line middle, r11, ip, lr, r3, r1, r4, r5, r6, 1, 0
ldr r6, [sp, #4] @ r6 = height
ldr r7, [sp, #8] @ r7 = srcdiff
ldr r8, [sp, #12] @ r8 = dstdiff
ldr r3, [sp, #32] @ counter = width
subS r6, r6, #1 @ height--
ldr lr, [sp, #28] @ bufptr3 = tmpline3
add r0, r0, r7 @ src += srcdiff
add r1, r1, r8 @ dst += dstdiff
str r6, [sp, #4] @ height = r6
bne 100b
ldr r9, [sp, #16] @ r9 = dststride
ldr r11, [sp, #20] @ bufptr1 = tmpline1
ldr ip, [sp, #24] @ bufptr2 = tmpline2
add r4, r1, r9 @ dst2 = dst + dststride
add r5, r1, r9, lsl #1 @ dst3 = dst + 2 * dststride
@ last temporary line
neon_scale3x_16_16_line last, r11, ip, lr, r3, r1, r4, r5, r6, 1, 0
add r6, sp, #36 @ r6 = sp + 36
vld1.64 {d8-d11}, [r6:256] @ restore q4,q5
add ip, r6, #32 @ ip = r6 + 32
vld1.64 {d12-d15}, [ip:256] @ restore q6,q7
@ Release the frame only after both reloads: once sp is back at oldsp the save
@ area lies below sp, where an asynchronous signal frame may overwrite it.
ldr sp, [sp] @ sp = oldsp
pop {r4-r11,lr}
bx lr
@ end procedure neon_scale3x_8_16

View File

@ -1,976 +0,0 @@
@@
@@ Copyright (C) 2012 Roman Pauer
@@
@@ Permission is hereby granted, free of charge, to any person obtaining a copy of
@@ this software and associated documentation files (the "Software"), to deal in
@@ the Software without restriction, including without limitation the rights to
@@ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
@@ of the Software, and to permit persons to whom the Software is furnished to do
@@ so, subject to the following conditions:
@@
@@ The above copyright notice and this permission notice shall be included in all
@@ copies or substantial portions of the Software.
@@
@@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
@@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
@@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
@@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
@@ SOFTWARE.
@@
@ A B C --\ E0 E1 E2
@ D E F --/ E3 E4 E5
@ G H I E6 E7 E8
@ q0 = S1sl < A >
@ q1 = S2sl < D >
@ q2 = S3sl < G >
@ q3 = S1sr < C >
@ q4 = S2sr < F >
@ q5 = S3sr < I >
@ q6 =
@ q7 =
@ q8 = S1
@ q9 = S2
@ q10 = S3
@ q11 =
@ q12 = S2prev < E >
@ q13 =
@ q14 = S1prev < B >
@ q15 = S3prev < H >
@ ___neon_scale3x_8_8_slice: apply the Scale3x rules to one block of 16 8-bit
@ pixels and store the three output rows with vst3.8 (3 bytes per input pixel
@ in each row). The block is handled as two halves of 8 pixels.
@   counter          register with the remaining pixel count
@   dst1, dst2, dst3 store operands for the three output rows
@   reg1             on entry with first15 set: number of valid pixels
@                    (counter & 15); rewritten to 3 * reg1 - 3*8 and used as
@                    the pointer fix-up, so each dst advances by 3 * reg1 total
@   dB0, dB1         low / high d register of the row above ( B )
@   dH0, dH1         low / high d register of the row below ( H )
@   first15          non-zero for the partial first block: when reg1 <= 8 the
@                    second half is skipped through local label 0
@   last16           non-zero for the final block; only when first15 and last16
@                    are both zero is counter decremented by 16 and compared
@                    with 16, leaving the flags for the bhi in the caller
@ Inputs: q0 = A, q1 = D, q2 = G, q3 = C, q4 = F, q5 = I, q12 = E.
@ q0, q2, q3, q5 are overwritten with comparison masks; d12-d15, d22, d23, d26
@ and d27 are scratch.
.macro ___neon_scale3x_8_8_slice counter, dst1, dst2, dst3, reg1, dB0, dB1, dH0, dH1, first15, last16
@ d12 = E0[0]
@ d13 = E1[0]
@ d14 = E2[0]
@ d15 = tmp0[0]
@ d22 = E3[0]
@ d23 = tmp1[0]
@ d24 = E4[0]
@ d25 = E4[1]
@ d26 = E5[0]
@ d27 = C0[0]
@ q0 = tmp2
@ q3 = tmp3
@ q2 = tmp4
@ q5 = tmp5
vceq.i8 d15, \dB0, \dH0 @ tmp0[0] = < B == H >
vceq.i8 d23, d2, d8 @ tmp1[0] = < D == F >
.if \first15
@ flags set here stay live until the addls / bls below: the NEON instructions
@ and the flag-less add / sub in between do not update them
cmp \reg1, #8
.endif
vceq.i8 q0, q12, q0 @ tmp2 = < E == A >
vceq.i8 q3, q12, q3 @ tmp3 = < E == C >
.if \first15
add \reg1, \reg1, \reg1, lsl #1 @ reg1 = 3 * (counter & 15)
.endif
vorr d27, d15, d23 @ C0[0] = < B == H || D == F >
vceq.i8 d22, d2, \dB0 @ E3[0] = < D == B >
vceq.i8 d26, \dB0, d8 @ E5[0] = < B == F >
vceq.i8 q2, q12, q2 @ tmp4 = < E == G >
vceq.i8 q5, q12, q5 @ tmp5 = < E == I >
.if \first15
sub \reg1, \reg1, #(3*8) @ reg1 -= 3*8
.endif
vorn d15, d6, d22 @ tmp0[0] = < (E == C) || !(D == B) >
vorn d23, d0, d26 @ tmp1[0] = < (E == A) || !(B == F) >
vorn d12, d27, d22 @ E0[0] = < C0 || !(D == B) >
vand d13, d15, d23 @ E1[0] = < ((E == C) || !(D == B)) && ((E == A) || !(B == F)) >
vorn d14, d27, d26 @ E2[0] = < C0 || !(B == F) >
vorr d13, d27, d13 @ E1[0] = < C0 || (((E == C) || !(D == B)) && ((E == A) || !(B == F))) >
vbsl d12, d24, d2 @ E0[0] = < (C0 || !(D == B)) ? E : D >
vbsl d14, d24, d8 @ E2[0] = < (C0 || !(B == F)) ? E : F >
vbsl d13, d24, \dB0 @ E1[0] = < (C0 || (((E == C) || !(D == B)) && ((E == A) || !(B == F)))) ? E : B >
vceq.i8 d15, d2, \dH0 @ tmp0[0] = < D == H >
vceq.i8 d23, \dH0, d8 @ tmp1[0] = < H == F >
vorn d22, d4, d22 @ E3[0] = < (E == G) || !(D == B) >
vst3.8 {d12-d14}, [\dst1]! @ [dst] = E0,E1,E2; dst1 += 3*8
.if \first15
addls \dst1, \dst1, \reg1 @ dst1 += reg1
.endif
vorn d26, d10, d26 @ E5[0] = < (E == I) || !(B == F) >
@ d12 = tmp6[0]
@ d13 = tmp7[0]
vorn d12, d0, d15 @ tmp6[0] = < (E == A) || !(D == H) >
vorn d13, d6, d23 @ tmp7[0] = < (E == C) || !(H == F) >
vand d22, d22, d12 @ E3[0] = < ((E == G) || !(D == B)) && ((E == A) || !(D == H)) >
vand d26, d26, d13 @ E5[0] = < ((E == I) || !(B == F)) && ((E == C) || !(H == F)) >
@ d12 = E6[0]
@ d13 = E7[0]
@ d14 = E8[0]
vorr d22, d27, d22 @ E3[0] = < C0 || (((E == G) || !(D == B)) && ((E == A) || !(D == H))) >
vorr d26, d27, d26 @ E5[0] = < C0 || (((E == I) || !(B == F)) && ((E == C) || !(H == F))) >
vbsl d22, d24, d2 @ E3[0] = < (C0 || (((E == G) || !(D == B)) && ((E == A) || !(D == H)))) ? E : D >
vbsl d26, d24, d8 @ E5[0] = < (C0 || (((E == I) || !(B == F)) && ((E == C) || !(H == F)))) ? E : F >
vorn d13, d10, d15 @ E7[0] = < (E == I) || !(D == H) >
vorn d12, d27, d15 @ E6[0] = < C0 || !(D == H) >
vst3.8 {d22,d24,d26}, [\dst2]! @ [dst + dststride] = E3,E4,E5; dst2 += 3*8
.if \first15
addls \dst2, \dst2, \reg1 @ dst2 += reg1
.endif
vorn d15, d4, d23 @ tmp0[0] = < (E == G) || !(H == F) >
vorn d14, d27, d23 @ E8[0] = < C0 || !(H == F) >
vand d13, d13, d15 @ E7[0] = < ((E == I) || !(D == H)) && ((E == G) || !(H == F)) >
vbsl d12, d24, d2 @ E6[0] = < (C0 || !(D == H)) ? E : D >
vorr d13, d27, d13 @ E7[0] = < C0 || (((E == I) || !(D == H)) && ((E == G) || !(H == F))) >
vbsl d14, d24, d8 @ E8[0] = < (C0 || !(H == F)) ? E : F >
vbsl d13, d24, \dH0 @ E7[0] = < (C0 || (((E == I) || !(D == H)) && ((E == G) || !(H == F)))) ? E : H >
@ second half of the block (pixels 8-15); register roles change as listed
@ d15 = tmp0[1]
@ d22 = tmp1[1]
@ d23 = E3[1]
@ d24 = E4[0]
@ d25 = E4[1]
@ d26 = C0[1]
@ d27 = E5[1]
vceq.i8 d15, \dB1, \dH1 @ tmp0[1] = < B == H >
vceq.i8 d22, d3, d9 @ tmp1[1] = < D == F >
vceq.i8 d23, d3, \dB1 @ E3[1] = < D == B >
vst3.8 {d12-d14}, [\dst3]! @ [dst + 2 * dststride] = E6,E7,E8; dst3 += 3*8
.if \first15
addls \dst3, \dst3, \reg1 @ dst3 += reg1
bls 0f
.endif
@ d12 = E0[1]
@ d13 = E1[1]
@ d14 = E2[1]
vorr d26, d15, d22 @ C0[1] = < B == H || D == F >
vceq.i8 d27, \dB1, d9 @ E5[1] = < B == F >
vorn d15, d7, d23 @ tmp0[1] = < (E == C) || !(D == B) >
vorn d22, d1, d27 @ tmp1[1] = < (E == A) || !(B == F) >
vorn d12, d26, d23 @ E0[1] = < C0 || !(D == B) >
vand d13, d15, d22 @ E1[1] = < ((E == C) || !(D == B)) && ((E == A) || !(B == F)) >
vorn d14, d26, d27 @ E2[1] = < C0 || !(B == F) >
vorr d13, d26, d13 @ E1[1] = < C0 || (((E == C) || !(D == B)) && ((E == A) || !(B == F))) >
vbsl d12, d25, d3 @ E0[1] = < (C0 || !(D == B)) ? E : D >
vbsl d14, d25, d9 @ E2[1] = < (C0 || !(B == F)) ? E : F >
vbsl d13, d25, \dB1 @ E1[1] = < (C0 || (((E == C) || !(D == B)) && ((E == A) || !(B == F)))) ? E : B >
vceq.i8 d15, d3, \dH1 @ tmp0[1] = < D == H >
vceq.i8 d22, \dH1, d9 @ tmp1[1] = < H == F >
vorn d23, d5, d23 @ E3[1] = < (E == G) || !(D == B) >
.ifeq \first15
vst3.8 {d12-d14}, [\dst1]! @ [dst] = E0,E1,E2; dst1 += 3*8
.else
vst3.8 {d12-d14}, [\dst1],\reg1 @ [dst] = E0,E1,E2; dst1 += reg1
.endif
vorn d27, d11, d27 @ E5[1] = < (E == I) || !(B == F) >
@ d12 = tmp6[1]
@ d13 = tmp7[1]
vorn d12, d1, d15 @ tmp6[1] = < (E == A) || !(D == H) >
vorn d13, d7, d22 @ tmp7[1] = < (E == C) || !(H == F) >
vand d23, d23, d12 @ E3[1] = < ((E == G) || !(D == B)) && ((E == A) || !(D == H)) >
vand d27, d27, d13 @ E5[1] = < ((E == I) || !(B == F)) && ((E == C) || !(H == F)) >
@ d12 = E6[1]
@ d13 = E7[1]
@ d14 = E8[1]
vorr d23, d26, d23 @ E3[1] = < C0 || (((E == G) || !(D == B)) && ((E == A) || !(D == H))) >
vorr d27, d26, d27 @ E5[1] = < C0 || (((E == I) || !(B == F)) && ((E == C) || !(H == F))) >
vbsl d23, d25, d3 @ E3[1] = < (C0 || (((E == G) || !(D == B)) && ((E == A) || !(D == H)))) ? E : D >
vbsl d27, d25, d9 @ E5[1] = < (C0 || (((E == I) || !(B == F)) && ((E == C) || !(H == F)))) ? E : F >
vorn d13, d11, d15 @ E7[1] = < (E == I) || !(D == H) >
vorn d12, d26, d15 @ E6[1] = < C0 || !(D == H) >
.ifeq \first15
vst3.8 {d23,d25,d27}, [\dst2]! @ [dst + dststride] = E3,E4,E5; dst2 += 3*8
.else
vst3.8 {d23,d25,d27}, [\dst2],\reg1 @ [dst + dststride] = E3,E4,E5; dst2 += reg1
.endif
vorn d15, d5, d22 @ tmp0[1] = < (E == G) || !(H == F) >
vorn d14, d26, d22 @ E8[1] = < C0 || !(H == F) >
vand d13, d13, d15 @ E7[1] = < ((E == I) || !(D == H)) && ((E == G) || !(H == F)) >
vbsl d12, d25, d3 @ E6[1] = < (C0 || !(D == H)) ? E : D >
vorr d13, d26, d13 @ E7[1] = < C0 || (((E == I) || !(D == H)) && ((E == G) || !(H == F))) >
vbsl d14, d25, d9 @ E8[1] = < (C0 || !(H == F)) ? E : F >
vbsl d13, d25, \dH1 @ E7[1] = < (C0 || (((E == I) || !(D == H)) && ((E == G) || !(H == F)))) ? E : H >
.ifeq \first15
.ifeq \last16
@ loop bookkeeping for the caller: the flags from this cmp feed its bhi
sub \counter, \counter, #16 @ counter -= 16
cmp \counter, #16
.endif
vst3.8 {d12-d14}, [\dst3]! @ [dst + 2 * dststride] = E6,E7,E8; dst3 += 3*8
.else
vst3.8 {d12-d14}, [\dst3],\reg1 @ [dst + 2 * dststride] = E6,E7,E8; dst3 += reg1
0:
.endif
.endm
@ __neon_scale3x_8_8_line: Scale3x one row of 8-bit pixels into three output rows.
@   src1, src2, src3    registers pointing at the row above, the current row,
@                       the row below
@   counter             register with the pixel count for this row
@   dst1, dst2, dst3    registers pointing at the three output rows
@   reg1                scratch register
@   qB, qH              q registers used as B (row above) and H (row below);
@                       the row above is loaded only when qB is q14 and the row
@                       below only when qH is q15, otherwise A/C are copied
@                       from D/F (resp. G/I from D/F) and the src pointer is
@                       just advanced by counter
@   dB0, dB1, dH0, dH1  the d register halves of qB and qH
@   alsrc*, aldst*      the same pointers, optionally with alignment
@                       qualifiers, used once the row position is a multiple
@                       of 16 pixels
@ Flow: seed lane 15 of the prev registers with pixel 0 so that the first pixel
@ is its own left neighbour; process a partial block of (counter & 15) pixels
@ if there is one; run the 16-pixel loop; finish with a final block whose
@ right neighbour is the last pixel itself.
@ NOTE(review): the loop body runs at least once and the tail always handles
@ one more block, so this appears to need at least 32 pixels left after the
@ partial block - confirm against the callers.
.macro __neon_scale3x_8_8_line src1, src2, src3, counter, dst1, dst2, dst3, reg1, qB, qH, dB0, dB1, dH0, dH1, alsrc1, alsrc2, alsrc3, aldst1, aldst2, aldst3
.ifeqs "\qB", "q14"
vld1.8 {d29[7]}, [\src1] @ S1prev[15] = src[-srcstride]
.endif
vld1.8 {d25[7]}, [\src2] @ S2prev[15] = src[0]
.ifeqs "\qH", "q15"
vld1.8 {d31[7]}, [\src3] @ S3prev[15] = src[srcstride]
.endif
andS \reg1, \counter, #15 @ reg1 = counter & 15
.ifnes "\qB", "q14"
add \src1, \src1, \counter @ src1 += counter
.endif
.ifnes "\qH", "q15"
add \src3, \src3, \counter @ src3 += counter
.endif
@ flags still come from the andS above (add without S leaves them alone)
beq 1f
@ first 1-15 pixels - align counter to 16 bytes
sub \reg1, \reg1, #1 @ reg1 = (counter & 15) - 1
.ifeqs "\qB", "q14"
vld1.8 {q8}, [\src1] @ S1 = [src - srcstride]
add \src1, \src1, \reg1 @ src1 += (counter & 15) - 1
.endif
vld1.8 {q9}, [\src2] @ S2 = [src ]
add \src2, \src2, \reg1 @ src2 += (counter & 15) - 1
.ifeqs "\qH", "q15"
vld1.8 {q10}, [\src3] @ S3 = [src + srcstride]
add \src3, \src3, \reg1 @ src3 += (counter & 15) - 1
.endif
.ifeqs "\qB", "q14"
vext.8 q0, \qB, q8, #15 @ S1sl = S1prev[15] | (S1 << 8) < S >
vmov \qB, q8 @ S1prev = S1 < T >
.endif
vext.8 q1, q12, q9, #15 @ S2sl = S2prev[15] | (S2 << 8) < V >
vmov q12, q9 @ S2prev = S2 < C >
.ifeqs "\qH", "q15"
vext.8 q2, \qH, q10, #15 @ S3sl = S3prev[15] | (S3 << 8) < X >
vmov \qH, q10 @ S3prev = S3 < Y >
.endif
.ifeqs "\qB", "q14"
vext.8 q3, \qB, q8, #1 @ S1sr = (S1prev >> 8) | ... < U >
.endif
vext.8 q4, q12, q9, #1 @ S2sr = (S2prev >> 8) | ... < W >
.ifeqs "\qH", "q15"
vext.8 q5, \qH, q10, #1 @ S3sr = (S3prev >> 8) | ... < Z >
.else
vmov q2, q1 @ S3sl = S2sl < X >
vmov q5, q4 @ S3sr = S2sr < Z >
.endif
.ifnes "\qB", "q14"
vmov q0, q1 @ S1sl = S2sl < S >
vmov q3, q4 @ S1sr = S2sr < U >
.else
vld1.8 {d29[7]}, [\src1]! @ S1prev[15] = src[counter & 15 - 1 - srcstride]; src1++
.endif
add \reg1, \reg1, #1 @ reg1 = counter & 15
vld1.8 {d25[7]}, [\src2]! @ S2prev[15] = src[counter & 15 - 1]; src2++
bic \counter, \counter, #15
.ifeqs "\qH", "q15"
vld1.8 {d31[7]}, [\src3]! @ S3prev[15] = src[counter & 15 - 1 + srcstride]; src3++
.endif
@ partial block: the slice stores with the plain dst registers and advances
@ them by 3 * (counter & 15) bytes
___neon_scale3x_8_8_slice \counter, \dst1, \dst2, \dst3, \reg1, \dB0, \dB1, \dH0, \dH1, 1, 0
@ counter is aligned to 16 bytes
1:
.ifeqs "\qB", "q14"
vld1.8 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 16
.endif
vld1.8 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 16
.ifeqs "\qH", "q15"
vld1.8 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 16
.endif
@ inner loop (16 pixels per iteration)
@ each iteration processes the block loaded previously and loads the next one,
@ whose first pixel provides the right-hand neighbour
2:
.ifeqs "\qB", "q14"
vext.8 q0, \qB, q8, #15 @ S1sl = S1prev[15] | (S1 << 8) < A >
vmov \qB, q8 @ S1prev = S1 < B >
.endif
vext.8 q1, q12, q9, #15 @ S2sl = S2prev[15] | (S2 << 8) < D >
vmov q12, q9 @ S2prev = S2 < E >
.ifeqs "\qH", "q15"
vext.8 q2, \qH, q10, #15 @ S3sl = S3prev[15] | (S3 << 8) < G >
vmov \qH, q10 @ S3prev = S3 < H >
.endif
.ifeqs "\qB", "q14"
vld1.8 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 16
vext.8 q3, \qB, q8, #1 @ S1sr = (S1prev >> 8) | S1[0] < C >
.endif
vld1.8 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 16
vext.8 q4, q12, q9, #1 @ S2sr = (S2prev >> 8) | S2[0] < F >
.ifeqs "\qH", "q15"
vld1.8 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 16
vext.8 q5, \qH, q10, #1 @ S3sr = (S3prev >> 8) | S3[0] < I >
.else
vmov q2, q1 @ S3sl = S2sl < G >
vmov q5, q4 @ S3sr = S2sr < I >
.endif
.ifnes "\qB", "q14"
vmov q0, q1 @ S1sl = S2sl < A >
vmov q3, q4 @ S1sr = S2sr < C >
.endif
___neon_scale3x_8_8_slice \counter, \aldst1, \aldst2, \aldst3, \reg1, \dB0, \dB1, \dH0, \dH1, 0, 0
@ the slice above did counter -= 16 and cmp counter, #16
bhi 2b
@ last 16 pixels
.ifeqs "\qB", "q14"
vext.8 q0, \qB, q8, #15 @ S1sl = S1prev[15] | (S1 << 8) < A >
vmov \qB, q8 @ S1prev = S1 < B >
.endif
vext.8 q1, q12, q9, #15 @ S2sl = S2prev[15] | (S2 << 8) < D >
vmov q12, q9 @ S2prev = S2 < E >
.ifeqs "\qH", "q15"
vext.8 q2, \qH, q10, #15 @ S3sl = S3prev[15] | (S3 << 8) < G >
vmov \qH, q10 @ S3prev = S3 < H >
.endif
@ no further load here: the top byte of each row is shifted down to byte 0 so
@ that the last pixel becomes its own right-hand neighbour
.ifeqs "\qB", "q14"
vshr.u64 d16, d17, #(64-8) @ S1[0] = S1[15] | ...
.endif
vshr.u64 d18, d19, #(64-8) @ S2[0] = S2[15] | ...
.ifeqs "\qH", "q15"
vshr.u64 d20, d21, #(64-8) @ S3[0] = S3[15] | ...
.endif
.ifeqs "\qB", "q14"
vext.8 q3, \qB, q8, #1 @ S1sr = (S1prev >> 8) | S1[0] < C >
.endif
vext.8 q4, q12, q9, #1 @ S2sr = (S2prev >> 8) | S2[0] < F >
.ifeqs "\qH", "q15"
vext.8 q5, \qH, q10, #1 @ S3sr = (S3prev >> 8) | S3[0] < I >
.else
vmov q2, q1 @ S3sl = S2sl < G >
vmov q5, q4 @ S3sr = S2sr < I >
.endif
.ifnes "\qB", "q14"
vmov q0, q1 @ S1sl = S2sl < A >
vmov q3, q4 @ S1sr = S2sr < C >
.endif
___neon_scale3x_8_8_slice \counter, \aldst1, \aldst2, \aldst3, \reg1, \dB0, \dB1, \dH0, \dH1, 0, 1
.endm
@ First-row variant of the 8-bit Scale3x line macro: qB = q12 (d24, d25) and
@ qH = q15 (d30, d31). Because qB is not q14 the row above is never loaded;
@ the current row in q12 stands in for B and src1 is only advanced.
.macro _neon_scale3x_8_8_line_first src1, src2, src3, counter, dst1, dst2, dst3, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2, aldst3
__neon_scale3x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \dst3, \reg1, q12, q15, d24, d25, d30, d31, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2, \aldst3
.endm
@ Interior-row variant of the 8-bit Scale3x line macro: qB = q14 (d28, d29) and
@ qH = q15 (d30, d31), so both the row above and the row below are loaded.
.macro _neon_scale3x_8_8_line_middle src1, src2, src3, counter, dst1, dst2, dst3, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2, aldst3
__neon_scale3x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \dst3, \reg1, q14, q15, d28, d29, d30, d31, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2, \aldst3
.endm
@ Last-row variant of the 8-bit Scale3x line macro: qB = q14 (d28, d29) and
@ qH = q12 (d24, d25). Because qH is not q15 the row below is never loaded;
@ the current row in q12 stands in for H and src3 is only advanced.
.macro _neon_scale3x_8_8_line_last src1, src2, src3, counter, dst1, dst2, dst3, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2, aldst3
__neon_scale3x_8_8_line \src1, \src2, \src3, \counter, \dst1, \dst2, \dst3, \reg1, q14, q12, d28, d29, d24, d25, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2, \aldst3
.endm
@ Public entry macro for one 8-bit Scale3x line.
@   part             first | middle | last - pasted into the helper macro name
@   src1..src3       registers pointing at the row above, the current row, the row below
@   counter          register holding the pixel count
@   dst1..dst3       registers pointing at the three output rows
@   reg1             scratch register
@   srcalign16       non-zero: pass the source pointers with a :128 alignment qualifier
@   dstalign8        non-zero: pass the destination pointers with a :64 alignment qualifier
@ The qualified operands are handed over as the alsrc* / aldst* arguments; the
@ plain registers are always passed as well.
.macro neon_scale3x_8_8_line part, src1, src2, src3, counter, dst1, dst2, dst3, reg1, srcalign16, dstalign8
    .if \srcalign16
        @ aligned source rows
        .if \dstalign8
            _neon_scale3x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \dst3, \reg1, \src1:128, \src2:128, \src3:128, \dst1:64, \dst2:64, \dst3:64
        .else
            _neon_scale3x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \dst3, \reg1, \src1:128, \src2:128, \src3:128, \dst1, \dst2, \dst3
        .endif
    .else
        @ source rows without an alignment qualifier
        .if \dstalign8
            _neon_scale3x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \dst3, \reg1, \src1, \src2, \src3, \dst1:64, \dst2:64, \dst3:64
        .else
            _neon_scale3x_8_8_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \dst3, \reg1, \src1, \src2, \src3, \dst1, \dst2, \dst3
        .endif
    .endif
.endm
.macro ___neon_scale3x_16_16_slice counter, dst1, dst2, dst3, reg1, dB0, dB1, dH0, dH1, first7, last8
@ d12 = E0[0]
@ d13 = E1[0]
@ d14 = E2[0]
@ d15 = tmp0[0]
@ d22 = E3[0]
@ d23 = tmp1[0]
@ d24 = E4[0]
@ d25 = E4[1]
@ d26 = E5[0]
@ d27 = C0[0]
@ q0 = tmp2
@ q3 = tmp3
@ q2 = tmp4
@ q5 = tmp5
vceq.i16 d15, \dB0, \dH0 @ tmp0[0] = < B == H >
vceq.i16 d23, d2, d8 @ tmp1[0] = < D == F >
.if \first7
cmp \reg1, #4
.endif
vceq.i16 q0, q12, q0 @ tmp2 = < E == A >
vceq.i16 q3, q12, q3 @ tmp3 = < E == C >
.if \first7
lsl \reg1, #1 @ reg1 = 2 * (counter & 7)
.endif
vorr d27, d15, d23 @ C0[0] = < B == H || D == F >
vceq.i16 d22, d2, \dB0 @ E3[0] = < D == B >
vceq.i16 d26, \dB0, d8 @ E5[0] = < B == F >
vceq.i16 q2, q12, q2 @ tmp4 = < E == G >
vceq.i16 q5, q12, q5 @ tmp5 = < E == I >
.if \first7
add \reg1, \reg1, \reg1, lsl #1 @ reg1 = 2 * 3 * (counter & 7)
.endif
vorn d15, d6, d22 @ tmp0[0] = < (E == C) || !(D == B) >
vorn d23, d0, d26 @ tmp1[0] = < (E == A) || !(B == F) >
vorn d12, d27, d22 @ E0[0] = < C0 || !(D == B) >
.if \first7
sub \reg1, \reg1, #(3*2*4) @ reg1 -= 3*2*4
.endif
vand d13, d15, d23 @ E1[0] = < ((E == C) || !(D == B)) && ((E == A) || !(B == F)) >
vorn d14, d27, d26 @ E2[0] = < C0 || !(B == F) >
vorr d13, d27, d13 @ E1[0] = < C0 || (((E == C) || !(D == B)) && ((E == A) || !(B == F))) >
vbsl d12, d24, d2 @ E0[0] = < (C0 || !(D == B)) ? E : D >
vbsl d14, d24, d8 @ E2[0] = < (C0 || !(B == F)) ? E : F >
vbsl d13, d24, \dB0 @ E1[0] = < (C0 || (((E == C) || !(D == B)) && ((E == A) || !(B == F)))) ? E : B >
vceq.i16 d15, d2, \dH0 @ tmp0[0] = < D == H >
vceq.i16 d23, \dH0, d8 @ tmp1[0] = < H == F >
vorn d22, d4, d22 @ E3[0] = < (E == G) || !(D == B) >
vst3.16 {d12-d14}, [\dst1]! @ [dst] = E0,E1,E2; dst1 += 3*2*4
.if \first7
addls \dst1, \dst1, \reg1 @ dst1 += reg1
.endif
vorn d26, d10, d26 @ E5[0] = < (E == I) || !(B == F) >
@ d12 = tmp6[0]
@ d13 = tmp7[0]
vorn d12, d0, d15 @ tmp6[0] = < (E == A) || !(D == H) >
vorn d13, d6, d23 @ tmp7[0] = < (E == C) || !(H == F) >
vand d22, d22, d12 @ E3[0] = < ((E == G) || !(D == B)) && ((E == A) || !(D == H)) >
vand d26, d26, d13 @ E5[0] = < ((E == I) || !(B == F)) && ((E == C) || !(H == F)) >
@ d12 = E6[0]
@ d13 = E7[0]
@ d14 = E8[0]
vorr d22, d27, d22 @ E3[0] = < C0 || (((E == G) || !(D == B)) && ((E == A) || !(D == H))) >
vorr d26, d27, d26 @ E5[0] = < C0 || (((E == I) || !(B == F)) && ((E == C) || !(H == F))) >
vbsl d22, d24, d2 @ E3[0] = < (C0 || (((E == G) || !(D == B)) && ((E == A) || !(D == H)))) ? E : D >
vbsl d26, d24, d8 @ E5[0] = < (C0 || (((E == I) || !(B == F)) && ((E == C) || !(H == F)))) ? E : F >
vorn d13, d10, d15 @ E7[0] = < (E == I) || !(D == H) >
vorn d12, d27, d15 @ E6[0] = < C0 || !(D == H) >
vst3.16 {d22,d24,d26}, [\dst2]! @ [dst + dststride] = E3,E4,E5; dst2 += 3*2*4
.if \first7
addls \dst2, \dst2, \reg1 @ dst2 += reg1
.endif
vorn d15, d4, d23 @ tmp0[0] = < (E == G) || !(H == F) >
vorn d14, d27, d23 @ E8[0] = < C0 || !(H == F) >
vand d13, d13, d15 @ E7[0] = < ((E == I) || !(D == H)) && ((E == G) || !(H == F)) >
vbsl d12, d24, d2 @ E6[0] = < (C0 || !(D == H)) ? E : D >
vorr d13, d27, d13 @ E7[0] = < C0 || (((E == I) || !(D == H)) && ((E == G) || !(H == F))) >
vbsl d14, d24, d8 @ E8[0] = < (C0 || !(H == F)) ? E : F >
vbsl d13, d24, \dH0 @ E7[0] = < (C0 || (((E == I) || !(D == H)) && ((E == G) || !(H == F)))) ? E : H >
@ d15 = tmp0[1]
@ d22 = tmp1[1]
@ d23 = E3[1]
@ d24 = E4[0]
@ d25 = E4[1]
@ d26 = C0[1]
@ d27 = E5[1]
vceq.i16 d15, \dB1, \dH1 @ tmp0[1] = < B == H >
vceq.i16 d22, d3, d9 @ tmp1[1] = < D == F >
vceq.i16 d23, d3, \dB1 @ E3[1] = < D == B >
vst3.16 {d12-d14}, [\dst3]! @ [dst + 2 * dststride] = E6,E7,E8; dst3 += 3*2*4
.if \first7
addls \dst3, \dst3, \reg1 @ dst3 += reg1
bls 0f
.endif
@ d12 = E0[1]
@ d13 = E1[1]
@ d14 = E2[1]
vorr d26, d15, d22 @ C0[1] = < B == H || D == F >
vceq.i16 d27, \dB1, d9 @ E5[1] = < B == F >
vorn d15, d7, d23 @ tmp0[1] = < (E == C) || !(D == B) >
vorn d22, d1, d27 @ tmp1[1] = < (E == A) || !(B == F) >
vorn d12, d26, d23 @ E0[1] = < C0 || !(D == B) >
vand d13, d15, d22 @ E1[1] = < ((E == C) || !(D == B)) && ((E == A) || !(B == F)) >
vorn d14, d26, d27 @ E2[1] = < C0 || !(B == F) >
vorr d13, d26, d13 @ E1[1] = < C0 || (((E == C) || !(D == B)) && ((E == A) || !(B == F))) >
vbsl d12, d25, d3 @ E0[1] = < (C0 || !(D == B)) ? E : D >
vbsl d14, d25, d9 @ E2[1] = < (C0 || !(B == F)) ? E : F >
vbsl d13, d25, \dB1 @ E1[1] = < (C0 || (((E == C) || !(D == B)) && ((E == A) || !(B == F)))) ? E : B >
vceq.i16 d15, d3, \dH1 @ tmp0[1] = < D == H >
vceq.i16 d22, \dH1, d9 @ tmp1[1] = < H == F >
vorn d23, d5, d23 @ E3[1] = < (E == G) || !(D == B) >
.ifeq \first7
vst3.16 {d12-d14}, [\dst1]! @ [dst] = E0,E1,E2; dst1 += 3*2*4
.else
vst3.16 {d12-d14}, [\dst1], \reg1 @ [dst] = E0,E1,E2; dst1 += reg1
.endif
vorn d27, d11, d27 @ E5[1] = < (E == I) || !(B == F) >
@ d12 = tmp6[1]
@ d13 = tmp7[1]
vorn d12, d1, d15 @ tmp6[1] = < (E == A) || !(D == H) >
vorn d13, d7, d22 @ tmp7[1] = < (E == C) || !(H == F) >
vand d23, d23, d12 @ E3[1] = < ((E == G) || !(D == B)) && ((E == A) || !(D == H)) >
vand d27, d27, d13 @ E5[1] = < ((E == I) || !(B == F)) && ((E == C) || !(H == F)) >
@ d12 = E6[1]
@ d13 = E7[1]
@ d14 = E8[1]
vorr d23, d26, d23 @ E3[1] = < C0 || (((E == G) || !(D == B)) && ((E == A) || !(D == H))) >
vorr d27, d26, d27 @ E5[1] = < C0 || (((E == I) || !(B == F)) && ((E == C) || !(H == F))) >
vbsl d23, d25, d3 @ E3[1] = < (C0 || (((E == G) || !(D == B)) && ((E == A) || !(D == H)))) ? E : D >
vbsl d27, d25, d9 @ E5[1] = < (C0 || (((E == I) || !(B == F)) && ((E == C) || !(H == F)))) ? E : F >
vorn d13, d11, d15 @ E7[1] = < (E == I) || !(D == H) >
vorn d12, d26, d15 @ E6[1] = < C0 || !(D == H) >
.ifeq \first7
vst3.16 {d23,d25,d27}, [\dst2]! @ [dst + dststride] = E3,E4,E5; dst2 += 3*2*4
.else
vst3.16 {d23,d25,d27}, [\dst2], \reg1 @ [dst + dststride] = E3,E4,E5; dst2 += reg1
.endif
vorn d15, d5, d22 @ tmp0[1] = < (E == G) || !(H == F) >
vorn d14, d26, d22 @ E8[1] = < C0 || !(H == F) >
vand d13, d13, d15 @ E7[1] = < ((E == I) || !(D == H)) && ((E == G) || !(H == F)) >
vbsl d12, d25, d3 @ E6[1] = < (C0 || !(D == H)) ? E : D >
vorr d13, d26, d13 @ E7[1] = < C0 || (((E == I) || !(D == H)) && ((E == G) || !(H == F))) >
vbsl d14, d25, d9 @ E8[1] = < (C0 || !(H == F)) ? E : F >
vbsl d13, d25, \dH1 @ E7[1] = < (C0 || (((E == I) || !(D == H)) && ((E == G) || !(H == F)))) ? E : H >
.ifeq \first7
.ifeq \last8
sub \counter, \counter, #8 @ counter -= 8
cmp \counter, #8
.endif
vst3.16 {d12-d14}, [\dst3]! @ [dst + 2 * dststride] = E6,E7,E8; dst3 += 3*2*4
.else
vst3.16 {d12-d14}, [\dst3], \reg1 @ [dst + 2 * dststride] = E6,E7,E8; dst3 += reg1
0:
.endif
.endm
@ __neon_scale3x_16_16_line: run the 16bpp scale3x slice macro over one source row.
@
@ Macro arguments:
@   src1, src2, src3      registers addressing the row above, the current row and
@                         the row below (S1, S2 and S3 in the comments)
@   counter               register holding the number of pixels left in the row
@   dst1, dst2, dst3      registers addressing the three output rows
@   reg1                  scratch register, receives counter & 7
@   qB, qH                q registers that act as pixel B (above) and pixel H (below).
@                         The row above is only loaded when qB is q14 and the row
@                         below only when qH is q15. Otherwise the matching source
@                         pointer is merely advanced past the row and the shifted
@                         copies of the current row stand in for the missing
@                         neighbours.
@   dB0, dB1, dH0, dH1    d register halves of qB and qH, forwarded to the slice macro
@   alsrc1..3, aldst1..3  address operands used by the 8 pixel loop and the tail
@                         (the dispatcher macro passes either the plain register or
@                         the register with an alignment qualifier)
@
@ Register roles set up here for the slice macro:
@   q8, q9, q10  = freshly loaded S1, S2, S3
@   q12          = S2prev, the centre pixels E
@   q0, q1, q2   = left neighbours  (A, D, G)
@   q3, q4, q5   = right neighbours (C, F, I)
@
@ Structure: optional head of 1-7 pixels, then a loop handling 8 pixels per
@ iteration, then a tail that handles the final 8 pixels without loading more.
@
@ NOTE(review): the loop body always runs once before its exit test and is then
@ followed by the tail, so this looks like it needs at least 16 pixels once the
@ head has been removed - presumably guaranteed by the caller, please confirm.
.macro __neon_scale3x_16_16_line src1, src2, src3, counter, dst1, dst2, dst3, reg1, qB, qH, dB0, dB1, dH0, dH1, alsrc1, alsrc2, alsrc3, aldst1, aldst2, aldst3
@ Seed lane 7 of every prev vector with the first pixel of its row, so that the
@ left neighbour of pixel 0 is pixel 0 itself.
.ifeqs "\qB", "q14"
vld1.16 {d29[3]}, [\src1] @ S1prev[7] = src[-srcstride]
.endif
vld1.16 {d25[3]}, [\src2] @ S2prev[7] = src[0]
.ifeqs "\qH", "q15"
vld1.16 {d31[3]}, [\src3] @ S3prev[7] = src[srcstride]
.endif
@ andS sets the flags tested by the beq below; the add instructions in between
@ have no S suffix and therefore leave the flags untouched.
andS \reg1, \counter, #7 @ reg1 = counter & 7
@ A row that is not read (first or last line) only has its pointer moved forward
@ by the whole row.
.ifnes "\qB", "q14"
add \src1, \src1, \counter, lsl #1 @ src1 += 2 * counter
.endif
.ifnes "\qH", "q15"
add \src3, \src3, \counter, lsl #1 @ src3 += 2 * counter
.endif
beq 1f @ counter is already a multiple of 8: no head
@ first 1-7 pixels - align counter to 16 bytes
@ Full 8 pixel vectors are loaded through the plain (unqualified) pointers, and
@ each pointer is then moved to the last pixel of the head.
sub \reg1, \reg1, #1 @ reg1 = (counter & 7) - 1
.ifeqs "\qB", "q14"
vld1.16 {q8}, [\src1] @ S1 = [src - srcstride]
add \src1, \src1, \reg1, lsl #1 @ src1 += 2 * ((counter & 7) - 1)
.endif
vld1.16 {q9}, [\src2] @ S2 = [src ]
add \src2, \src2, \reg1, lsl #1 @ src2 += 2 * ((counter & 7) - 1)
.ifeqs "\qH", "q15"
vld1.16 {q10}, [\src3] @ S3 = [src + srcstride]
add \src3, \src3, \reg1, lsl #1 @ src3 += 2 * ((counter & 7) - 1)
.endif
@ Build the left neighbour vectors, then make the loaded vectors the new prev.
.ifeqs "\qB", "q14"
vext.8 q0, \qB, q8, #14 @ S1sl = S1prev[7] | (S1 << 16) < S >
vmov \qB, q8 @ S1prev = S1 < T >
.endif
vext.8 q1, q12, q9, #14 @ S2sl = S2prev[7] | (S2 << 16) < V >
vmov q12, q9 @ S2prev = S2 < C >
.ifeqs "\qH", "q15"
vext.8 q2, \qH, q10, #14 @ S3sl = S3prev[7] | (S3 << 16) < X >
vmov \qH, q10 @ S3prev = S3 < Y >
.endif
@ Build the right neighbour vectors from the same data (prev now equals the
@ loaded vector, so the lane shifted in at the top is lane 0 of that vector).
.ifeqs "\qB", "q14"
vext.8 q3, \qB, q8, #2 @ S1sr = (S1prev >> 16) | ... < U >
.endif
vext.8 q4, q12, q9, #2 @ S2sr = (S2prev >> 16) | ... < W >
.ifeqs "\qH", "q15"
vext.8 q5, \qH, q10, #2 @ S3sr = (S3prev >> 16) | ... < Z >
.else
@ no row below: reuse the neighbours of the current row
vmov q2, q1 @ S3sl = S2sl < X >
vmov q5, q4 @ S3sr = S2sr < Z >
.endif
.ifnes "\qB", "q14"
@ no row above: reuse the neighbours of the current row
vmov q0, q1 @ S1sl = S2sl < S >
vmov q3, q4 @ S1sr = S2sr < U >
.else
vld1.16 {d29[3]}, [\src1]! @ S1prev[7] = src[counter & 7 - 1 - srcstride]; src1 += 2
.endif
@ Lane 7 of each prev vector is reloaded with the last head pixel, which becomes
@ the left neighbour of the first pixel handled after the head. The post
@ increment leaves each pointer on that first pixel.
add \reg1, \reg1, #1 @ reg1 = counter & 7
vld1.16 {d25[3]}, [\src2]! @ S2prev[7] = src[counter & 7 - 1]; src2 += 2
bic \counter, \counter, #7 @ counter = counter rounded down to a multiple of 8
.ifeqs "\qH", "q15"
vld1.16 {d31[3]}, [\src3]! @ S3prev[7] = src[counter & 7 - 1 + srcstride]; src3 += 2
.endif
@ Head slice; the two trailing flags are presumably first7 = 1, last8 = 0.
___neon_scale3x_16_16_slice \counter, \dst1, \dst2, \dst3, \reg1, \dB0, \dB1, \dH0, \dH1, 1, 0
@ counter is aligned to 16 bytes
1:
@ Preload the first group of 8 pixels through the alsrc operands.
.ifeqs "\qB", "q14"
vld1.16 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 2*8
.endif
vld1.16 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 2*8
.ifeqs "\qH", "q15"
vld1.16 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 2*8
.endif
@ inner loop (8 pixels per iteration)
@ Each pass turns the preloaded group into the current one and loads the next
@ group, whose lane 0 supplies the right neighbour of the current lane 7.
2:
.ifeqs "\qB", "q14"
vext.8 q0, \qB, q8, #14 @ S1sl = S1prev[7] | (S1 << 16) < A >
vmov \qB, q8 @ S1prev = S1 < B >
.endif
vext.8 q1, q12, q9, #14 @ S2sl = S2prev[7] | (S2 << 16) < D >
vmov q12, q9 @ S2prev = S2 < E >
.ifeqs "\qH", "q15"
vext.8 q2, \qH, q10, #14 @ S3sl = S3prev[7] | (S3 << 16) < G >
vmov \qH, q10 @ S3prev = S3 < H >
.endif
.ifeqs "\qB", "q14"
vld1.16 {q8}, [\alsrc1]! @ S1 = [src - srcstride]; src1 += 2*8
vext.8 q3, \qB, q8, #2 @ S1sr = (S1prev >> 16) | S1[0] < C >
.endif
vld1.16 {q9}, [\alsrc2]! @ S2 = [src ]; src2 += 2*8
vext.8 q4, q12, q9, #2 @ S2sr = (S2prev >> 16) | S2[0] < F >
.ifeqs "\qH", "q15"
vld1.16 {q10}, [\alsrc3]! @ S3 = [src + srcstride]; src3 += 2*8
vext.8 q5, \qH, q10, #2 @ S3sr = (S3prev >> 16) | S3[0] < I >
.else
vmov q2, q1 @ S3sl = S2sl < G >
vmov q5, q4 @ S3sr = S2sr < I >
.endif
.ifnes "\qB", "q14"
vmov q0, q1 @ S1sl = S2sl < A >
vmov q3, q4 @ S1sr = S2sr < C >
.endif
@ With both trailing flags 0 the slice macro subtracts 8 from counter and
@ compares it with 8; the bhi below consumes those flags.
___neon_scale3x_16_16_slice \counter, \aldst1, \aldst2, \aldst3, \reg1, \dB0, \dB1, \dH0, \dH1, 0, 0
bhi 2b @ loop while more than 8 pixels remain
@ last 8 pixels
@ Nothing more is loaded here. Lane 7 of each row is copied into lane 0 of the
@ same register so that the right neighbour of the final pixel is that pixel.
.ifeqs "\qB", "q14"
vext.8 q0, \qB, q8, #14 @ S1sl = S1prev[7] | (S1 << 16) < A >
vmov \qB, q8 @ S1prev = S1 < B >
.endif
vext.8 q1, q12, q9, #14 @ S2sl = S2prev[7] | (S2 << 16) < D >
vmov q12, q9 @ S2prev = S2 < E >
.ifeqs "\qH", "q15"
vext.8 q2, \qH, q10, #14 @ S3sl = S3prev[7] | (S3 << 16) < G >
vmov \qH, q10 @ S3prev = S3 < H >
.endif
.ifeqs "\qB", "q14"
vshr.u64 d16, d17, #(64-16) @ S1[0] = S1[7] | ...
.endif
vshr.u64 d18, d19, #(64-16) @ S2[0] = S2[7] | ...
.ifeqs "\qH", "q15"
vshr.u64 d20, d21, #(64-16) @ S3[0] = S3[7] | ...
.endif
.ifeqs "\qB", "q14"
vext.8 q3, \qB, q8, #2 @ S1sr = (S1prev >> 16) | S1[0] < C >
.endif
vext.8 q4, q12, q9, #2 @ S2sr = (S2prev >> 16) | S2[0] < F >
.ifeqs "\qH", "q15"
vext.8 q5, \qH, q10, #2 @ S3sr = (S3prev >> 16) | S3[0] < I >
.else
vmov q2, q1 @ S3sl = S2sl < G >
vmov q5, q4 @ S3sr = S2sr < I >
.endif
.ifnes "\qB", "q14"
vmov q0, q1 @ S1sl = S2sl < A >
vmov q3, q4 @ S1sr = S2sr < C >
.endif
@ Tail slice; the two trailing flags are presumably first7 = 0, last8 = 1.
___neon_scale3x_16_16_slice \counter, \aldst1, \aldst2, \aldst3, \reg1, \dB0, \dB1, \dH0, \dH1, 0, 1
.endm
@ _neon_scale3x_16_16_line_first: row variant without a row above.
@ Forwards every argument to the generic line macro and selects q12 (d24, d25)
@ as the B register and q15 (d30, d31) as the H register. Since qB is not q14,
@ the line macro never loads through src1; q12 is the register it also uses for
@ the current row, so the current row takes the place of the row above.
.macro _neon_scale3x_16_16_line_first src1, src2, src3, counter, dst1, dst2, dst3, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2, aldst3
__neon_scale3x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \dst3, \reg1, q12, q15, d24, d25, d30, d31, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2, \aldst3
.endm
@ _neon_scale3x_16_16_line_middle: row variant with both neighbouring rows.
@ Forwards every argument to the generic line macro and selects q14 (d28, d29)
@ as the B register and q15 (d30, d31) as the H register, which are the two
@ values that make the line macro load the row above and the row below.
.macro _neon_scale3x_16_16_line_middle src1, src2, src3, counter, dst1, dst2, dst3, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2, aldst3
__neon_scale3x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \dst3, \reg1, q14, q15, d28, d29, d30, d31, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2, \aldst3
.endm
@ _neon_scale3x_16_16_line_last: row variant without a row below.
@ Forwards every argument to the generic line macro and selects q14 (d28, d29)
@ as the B register and q12 (d24, d25) as the H register. Since qH is not q15,
@ the line macro never loads through src3; q12 is the register it also uses for
@ the current row, so the current row takes the place of the row below.
.macro _neon_scale3x_16_16_line_last src1, src2, src3, counter, dst1, dst2, dst3, reg1, alsrc1, alsrc2, alsrc3, aldst1, aldst2, aldst3
__neon_scale3x_16_16_line \src1, \src2, \src3, \counter, \dst1, \dst2, \dst3, \reg1, q14, q12, d28, d29, d24, d25, \alsrc1, \alsrc2, \alsrc3, \aldst1, \aldst2, \aldst3
.endm
@ neon_scale3x_16_16_line: entry point for one row of the 16bpp scale3x filter.
@
@   part         first, middle or last; pasted onto the name of the row variant
@                macro that gets invoked
@   src1..src3   source row pointer registers
@   counter      register with the pixel count
@   dst1..dst3   destination row pointer registers
@   reg1         scratch register
@   srcalign16   assemble time flag: when nonzero the source address operands
@                handed on for the 8 pixel loop carry the :128 qualifier
@   dstalign8    assemble time flag: when nonzero the destination address
@                operands handed on for the 8 pixel loop carry the :64 qualifier
@
@ The first eight register arguments are always forwarded unchanged; only the
@ six trailing address operands depend on the two flags.
.macro neon_scale3x_16_16_line part, src1, src2, src3, counter, dst1, dst2, dst3, reg1, srcalign16, dstalign8
.if \srcalign16
    .if \dstalign8
        @ qualified sources, qualified destinations
        _neon_scale3x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \dst3, \reg1, \src1:128, \src2:128, \src3:128, \dst1:64, \dst2:64, \dst3:64
    .else
        @ qualified sources, plain destinations
        _neon_scale3x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \dst3, \reg1, \src1:128, \src2:128, \src3:128, \dst1, \dst2, \dst3
    .endif
.else
    .if \dstalign8
        @ plain sources, qualified destinations
        _neon_scale3x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \dst3, \reg1, \src1, \src2, \src3, \dst1:64, \dst2:64, \dst3:64
    .else
        @ plain sources, plain destinations
        _neon_scale3x_16_16_line_\part \src1, \src2, \src3, \counter, \dst1, \dst2, \dst3, \reg1, \src1, \src2, \src3, \dst1, \dst2, \dst3
    .endif
.endif
.endm

View File

@ -1,44 +0,0 @@
/**
*
* Copyright (C) 2012 Roman Pauer
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished to do
* so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#ifndef _NEON_SCALE3X_H_INCLUDED_
#define _NEON_SCALE3X_H_INCLUDED_

#include <inttypes.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Entry points of the scale3x filter. The two numbers in each name match the
 * element widths of the src and dst pointer types (8 or 16 bit).
 * NOTE(review): width is presumably counted in pixels and both strides in
 * bytes, as documented for the sibling NEON filters - confirm against the
 * assembly implementation.
 */

/* 8-bit source pixels, 8-bit destination pixels. */
extern void neon_scale3x_8_8(const uint8_t *src,
                             uint8_t *dst,
                             unsigned int width,
                             unsigned int srcstride,
                             unsigned int dststride,
                             unsigned int height);

/* 16-bit source pixels, 16-bit destination pixels. */
extern void neon_scale3x_16_16(const uint16_t *src,
                               uint16_t *dst,
                               unsigned int width,
                               unsigned int srcstride,
                               unsigned int dststride,
                               unsigned int height);

/* 8-bit source pixels, 16-bit destination pixels; takes an extra palette table. */
extern void neon_scale3x_8_16(const uint8_t *src,
                              uint16_t *dst,
                              const uint32_t *palette,
                              unsigned int width,
                              unsigned int srcstride,
                              unsigned int dststride,
                              unsigned int height);

#ifdef __cplusplus
}
#endif

#endif /* _NEON_SCALE3X_H_INCLUDED_ */

View File

@ -1,19 +0,0 @@
Copyright (C) 2012 Roman Pauer
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.