/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2019, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
// void dav1d_sgr_box3_row_v_neon(int32_t **sumsq, int16_t **sum,
// int32_t *sumsq_out, int16_t *sum_out,
// const int w);
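// Vertically sums three input rows, given as arrays of row pointers
// (sumsq[0..2], sum[0..2]): 32 bit sums of squares go to sumsq_out,
// 16 bit sums to sum_out. Processes 8 elements per iteration,
// rounding w + 2 up to a multiple of 8.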
function sgr_box3_row_v_neon, export=1
push {r4-r9,lr}
ldr r4, [sp, #28] // w
ldrd r6, r7, [r0] // sumsq[0], sumsq[1]
ldr r0, [r0, #8] // sumsq[2]
add r4, r4, #2 // w += 2
ldrd r8, r9, [r1] // sum[0], sum[1]
ldr r1, [r1, #8] // sum[2]
1:
vld1.32 {q8, q9}, [r6]!
vld1.32 {q10, q11}, [r7]!
vld1.16 {q14}, [r8]!
vld1.16 {q15}, [r9]!
subs r4, r4, #8
vadd.i32 q8, q8, q10
vadd.i32 q9, q9, q11
vld1.32 {q12, q13}, [r0]!
vadd.i16 q14, q14, q15
vld1.16 {q15}, [r1]!
vadd.i32 q8, q8, q12
vadd.i32 q9, q9, q13
vadd.i16 q14, q14, q15
vst1.32 {q8, q9}, [r2]!
vst1.16 {q14}, [r3]!
bgt 1b
pop {r4-r9,pc}
endfunc
// void dav1d_sgr_box5_row_v_neon(int32_t **sumsq, int16_t **sum,
// int32_t *sumsq_out, int16_t *sum_out,
// const int w);
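// As sgr_box3_row_v_neon, but vertically sums five input rows
// (sumsq[0..4], sum[0..4]) for the 5x5 box filter.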
function sgr_box5_row_v_neon, export=1
push {r4-r11,lr}
ldr lr, [sp, #36] // w
ldrd r4, r5, [r0] // sumsq[0], sumsq[1]
ldrd r6, r7, [r0, #8] // sumsq[2], sumsq[3]
ldr r0, [r0, #16] // sumsq[4]
add lr, lr, #2 // w += 2
ldrd r8, r9, [r1] // sum[0], sum[1]
ldrd r10, r11, [r1, #8] // sum[2], sum[3]
ldr r1, [r1, #16] // sum[4]
1:
vld1.32 {q8, q9}, [r4]!
vld1.32 {q10, q11}, [r5]!
vld1.32 {q12, q13}, [r6]!
vld1.32 {q14, q15}, [r7]!
vld1.16 {q0}, [r8]!
vld1.16 {q1}, [r9]!
vld1.16 {q2}, [r10]!
vld1.16 {q3}, [r11]!
subs lr, lr, #8
vadd.i32 q8, q8, q10
vadd.i32 q9, q9, q11
vadd.i32 q12, q12, q14
vadd.i32 q13, q13, q15
vld1.32 {q14, q15}, [r0]!
vadd.i16 q0, q0, q1
vadd.i16 q2, q2, q3
vld1.16 {q3}, [r1]!
vadd.i32 q8, q8, q12
vadd.i32 q9, q9, q13
vadd.i16 q0, q0, q2
vadd.i32 q8, q8, q14
vadd.i32 q9, q9, q15
vadd.i16 q0, q0, q3
vst1.32 {q8, q9}, [r2]!
vst1.16 {q0}, [r3]!
bgt 1b
pop {r4-r11,pc}
endfunc
// void dav1d_sgr_calc_row_ab1_neon(int32_t *a, int16_t *b,
// const int w, const int strength,
// const int bitdepth_max);
// void dav1d_sgr_calc_row_ab2_neon(int32_t *a, int16_t *b,
// const int w, const int strength,
// const int bitdepth_max);
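// Both entry points load bitdepth_max, set up n (the box area: 9 or 25)
// and the reciprocal one_by_x (roughly 2^12/n: 455 for n = 9, 164 for
// n = 25), then continue in sgr_calc_ab_neon below.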
function sgr_calc_row_ab1_neon, export=1
push {r4-r7,lr}
vpush {q4-q7}
ldr r4, [sp, #84] // bitdepth_max
clz r6, r4
vmov.i32 q15, #9 // n
movw r5, #455 // one_by_x
b sgr_calc_ab_neon
endfunc
function sgr_calc_row_ab2_neon, export=1
push {r4-r7,lr}
vpush {q4-q7}
ldr r4, [sp, #84] // bitdepth_max
clz r6, r4
vmov.i32 q15, #25 // n
mov r5, #164 // one_by_x
b sgr_calc_ab_neon
endfunc
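// Shared tail of sgr_calc_row_ab1/ab2. Per element (cf. the inline
// comments below), with a and b first scaled down to 8 bit precision
// according to bitdepth_max:
//   p     = imax(a*n - b*b, 0)
//   z     = imin((p*strength) >> 20, 255)   (rounding, saturating shifts)
//   x     = sgr_x_by_x[z]
//   AA[i] = (x*BB[i]*one_by_x + (1 << 11)) >> 12
//   BB[i] = x
// AA is stored back over the a row, BB (= x) over the b row.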
function sgr_calc_ab_neon
movrel r12, X(sgr_x_by_x)
sub r6, r6, #24 // -bitdepth_min_8
vld1.8 {q8, q9}, [r12, :128]! // sgr_x_by_x[0..31]
add r7, r6, r6 // -2*bitdepth_min_8
vmov.i8 q11, #5 // bias subtracted from the table entries below
vmov.i8 d10, #55 // idx of last 5
vld1.8 {q10}, [r12, :128] // sgr_x_by_x[32..47]
vmov.i8 d11, #72 // idx of last 4
vmov.i8 d12, #101 // idx of last 3
vmov.i8 d13, #169 // idx of last 2
vmov.i8 d14, #254 // idx of last 1
vmov.i8 d15, #32 // elements consumed in first vtbl
add r2, r2, #2 // w += 2
vdup.32 q12, r3
vsub.i8 q8, q8, q11
vsub.i8 q9, q9, q11
vsub.i8 q10, q10, q11
vdup.32 q13, r7 // -2*bitdepth_min_8
1:
vld1.32 {q0, q1}, [r0, :128] // a
vld1.16 {q2}, [r1, :128] // b
vdup.16 q14, r6 // -bitdepth_min_8
subs r2, r2, #8
vrshl.s32 q0, q0, q13
vrshl.s32 q1, q1, q13
vrshl.s16 q4, q2, q14
vmul.i32 q0, q0, q15 // a * n
vmul.i32 q1, q1, q15 // a * n
vmull.u16 q3, d8, d8 // b * b
vmull.u16 q4, d9, d9 // b * b
vqsub.u32 q0, q0, q3 // imax(a * n - b * b, 0)
vqsub.u32 q1, q1, q4 // imax(a * n - b * b, 0)
vmul.i32 q0, q0, q12 // p * s
vmul.i32 q1, q1, q12 // p * s
vqshrn.u32 d0, q0, #16
vqshrn.u32 d1, q1, #16
vqrshrn.u16 d0, q0, #4 // imin(z, 255)
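// Look up sgr_x_by_x[z]: only the first 48 table entries (loaded above,
// biased by -5) are indexed directly via vtbl/vtbx; out of range lanes
// yield 0. Entries past 48 only take the values 0..5, so they are
// reconstructed by comparing z against the last index holding each
// value (d10-d14) and accumulating the resulting -1 masks onto the
// constant 5 (d22).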
vcgt.u8 d2, d0, d10 // = -1 if sgr_x_by_x[d0] < 5
vcgt.u8 d3, d0, d11 // = -1 if sgr_x_by_x[d0] < 4
vtbl.8 d1, {q8, q9}, d0
vcgt.u8 d6, d0, d12 // = -1 if sgr_x_by_x[d0] < 3
vsub.i8 d9, d0, d15 // indices for vtbx
vcgt.u8 d7, d0, d13 // = -1 if sgr_x_by_x[d0] < 2
vadd.i8 d2, d2, d3
vtbx.8 d1, {q10}, d9
vcgt.u8 d8, d0, d14 // = -1 if sgr_x_by_x[d0] < 1
vadd.i8 d6, d6, d7
vadd.i8 d8, d8, d22
vadd.i8 d2, d2, d6
vadd.i8 d1, d1, d8
vadd.i8 d1, d1, d2
vmovl.u8 q0, d1 // x
vdup.32 q14, r5 // one_by_x
vmull.u16 q1, d0, d4 // x * BB[i]
vmull.u16 q2, d1, d5 // x * BB[i]
vmul.i32 q1, q1, q14 // x * BB[i] * sgr_one_by_x
vmul.i32 q2, q2, q14 // x * BB[i] * sgr_one_by_x
vrshr.s32 q1, q1, #12 // AA[i]
vrshr.s32 q2, q2, #12 // AA[i]
vst1.32 {q1, q2}, [r0, :128]!
vst1.16 {q0}, [r1, :128]!
bgt 1b
vpop {q4-q7}
pop {r4-r7,pc}
endfunc