/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/loongarch/loongson_asm.S"
#define REST_UNIT_STRIDE (400)
.macro MADD_HU_BU in0, in1, out0, out1
vsllwil.hu.bu vr12, \in0, 0
vexth.hu.bu vr13, \in0
vmadd.h \out0, vr12, \in1
vmadd.h \out1, vr13, \in1
.endm
const wiener_shuf
.byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
endconst
/*
void wiener_filter_h_lsx(int32_t *hor_ptr,
uint8_t *tmp_ptr,
const int16_t filterh[8],
const int w, const int h)
*/
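/*
 A rough C sketch (not taken from the dav1d C sources) of the per-pixel math
 this routine implements for 8bpc, assuming the padded REST_UNIT_STRIDE row
 layout and dav1d's iclip() helper from common/intops.h:

     for (int y = 0; y < h; y++) {
         for (int x = 0; x < w; x++) {
             int sum = (1 << 14) + (tmp_ptr[x + 3] << 7); // DC bias + centre tap
             for (int k = 0; k < 7; k++)
                 sum += filterh[k] * tmp_ptr[x + k];
             hor_ptr[x] = iclip((sum + 4) >> 3, 0, (1 << 13) - 1);
         }
         tmp_ptr += REST_UNIT_STRIDE;
         hor_ptr += REST_UNIT_STRIDE;
     }
*/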
function wiener_filter_h_8bpc_lsx
addi.d sp, sp, -40
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
fst.d f28, sp, 32
li.w t7, 1<<14 // clip_limit
la.local t1, wiener_shuf
vld vr4, t1, 0
vld vr14, a2, 0 // filter[0][k]
vreplvei.h vr21, vr14, 0
vreplvei.h vr22, vr14, 1
vreplvei.h vr23, vr14, 2
vreplvei.h vr24, vr14, 3
vreplvei.h vr25, vr14, 4
vreplvei.h vr26, vr14, 5
vreplvei.h vr27, vr14, 6
vreplgr2vr.w vr0, t7
.WIENER_FILTER_H_H:
addi.w a4, a4, -1 // h
addi.w t0, a3, 0 // w
addi.d t1, a1, 0 // tmp_ptr
addi.d t2, a0, 0 // hor_ptr
.WIENER_FILTER_H_W:
addi.w t0, t0, -16
vld vr5, t1, 0
vld vr13, t1, 16
vsubi.bu vr14, vr4, 2
vsubi.bu vr15, vr4, 1
vshuf.b vr6, vr13, vr5, vr14 // 1 ... 8, 9 ... 16
vshuf.b vr7, vr13, vr5, vr15 // 2 ... 9, 10 ... 17
vshuf.b vr8, vr13, vr5, vr4 // 3 ... 10, 11 ... 18
vaddi.bu vr14, vr4, 1
vaddi.bu vr15, vr4, 2
vshuf.b vr9, vr13, vr5, vr14 // 4 ... 11, 12 ... 19
vshuf.b vr10, vr13, vr5, vr15 // 5 ... 12, 13 ... 20
vaddi.bu vr14, vr4, 3
vshuf.b vr11, vr13, vr5, vr14 // 6 ... 13, 14 ... 21
vsllwil.hu.bu vr15, vr8, 0 // 3 4 5 6 7 8 9 10
vexth.hu.bu vr16, vr8 // 11 12 13 14 15 16 17 18
vsllwil.wu.hu vr17, vr15, 7 // 3 4 5 6
vexth.wu.hu vr18, vr15 // 7 8 9 10
vsllwil.wu.hu vr19, vr16, 7 // 11 12 13 14
vexth.wu.hu vr20, vr16 // 15 16 17 18
vslli.w vr18, vr18, 7
vslli.w vr20, vr20, 7
vxor.v vr15, vr15, vr15
vxor.v vr14, vr14, vr14
MADD_HU_BU vr5, vr21, vr14, vr15
MADD_HU_BU vr6, vr22, vr14, vr15
MADD_HU_BU vr7, vr23, vr14, vr15
MADD_HU_BU vr8, vr24, vr14, vr15
MADD_HU_BU vr9, vr25, vr14, vr15
MADD_HU_BU vr10, vr26, vr14, vr15
MADD_HU_BU vr11, vr27, vr14, vr15
vsllwil.w.h vr5, vr14, 0 // 0 1 2 3
vexth.w.h vr6, vr14 // 4 5 6 7
vsllwil.w.h vr7, vr15, 0 // 8 9 10 11
vexth.w.h vr8, vr15 // 12 13 14 15
vadd.w vr17, vr17, vr5
vadd.w vr18, vr18, vr6
vadd.w vr19, vr19, vr7
vadd.w vr20, vr20, vr8
vadd.w vr17, vr17, vr0
vadd.w vr18, vr18, vr0
vadd.w vr19, vr19, vr0
vadd.w vr20, vr20, vr0
vsrli.w vr1, vr0, 1
vsubi.wu vr1, vr1, 1
vxor.v vr3, vr3, vr3
vsrari.w vr17, vr17, 3
vsrari.w vr18, vr18, 3
vsrari.w vr19, vr19, 3
vsrari.w vr20, vr20, 3
vclip.w vr17, vr17, vr3, vr1
vclip.w vr18, vr18, vr3, vr1
vclip.w vr19, vr19, vr3, vr1
vclip.w vr20, vr20, vr3, vr1
vst vr17, t2, 0
vst vr18, t2, 16
vst vr19, t2, 32
vst vr20, t2, 48
addi.d t1, t1, 16
addi.d t2, t2, 64
blt zero, t0, .WIENER_FILTER_H_W
addi.d a1, a1, REST_UNIT_STRIDE
addi.d a0, a0, (REST_UNIT_STRIDE << 2)
bnez a4, .WIENER_FILTER_H_H
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
fld.d f28, sp, 32
addi.d sp, sp, 40
endfunc
.macro APPLY_FILTER in0, in1, in2
alsl.d t7, \in0, \in1, 2
vld vr10, t7, 0
vld vr11, t7, 16
vld vr12, t7, 32
vld vr13, t7, 48
vmadd.w vr14, vr10, \in2
vmadd.w vr15, vr11, \in2
vmadd.w vr16, vr12, \in2
vmadd.w vr17, vr13, \in2
.endm
.macro wiener_filter_v_8bpc_core_lsx
vreplgr2vr.w vr14, t6
vreplgr2vr.w vr15, t6
vreplgr2vr.w vr16, t6
vreplgr2vr.w vr17, t6
addi.w t7, t2, 0 // j + index k
mul.w t7, t7, t8 // (j + index) * REST_UNIT_STRIDE
add.w t7, t7, t4 // (j + index) * REST_UNIT_STRIDE + i
APPLY_FILTER t7, a2, vr2
APPLY_FILTER t8, t7, vr3
APPLY_FILTER t8, t7, vr4
APPLY_FILTER t8, t7, vr5
APPLY_FILTER t8, t7, vr6
APPLY_FILTER t8, t7, vr7
APPLY_FILTER t8, t7, vr8
vssrarni.hu.w vr15, vr14, 11
vssrarni.hu.w vr17, vr16, 11
vssrlni.bu.h vr17, vr15, 0
.endm
/*
void wiener_filter_v_lsx(uint8_t *p,
const ptrdiff_t p_stride,
const int32_t *hor,
const int16_t filterv[8],
const int w, const int h)
*/
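/*
 A rough C sketch of the matching vertical pass (8bpc): a 7-tap filter over
 the intermediate hor buffer, biased by -(1 << 18), rounded down by 11 bits
 and clamped to a byte.  iclip_u8() is dav1d's helper from common/intops.h:

     for (int j = 0; j < h; j++)
         for (int i = 0; i < w; i++) {
             int sum = -(1 << 18);
             for (int k = 0; k < 7; k++)
                 sum += filterv[k] * hor[(j + k) * REST_UNIT_STRIDE + i];
             p[j * p_stride + i] = iclip_u8((sum + (1 << 10)) >> 11);
         }
*/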
function wiener_filter_v_8bpc_lsx
li.w t6, -(1 << 18)
li.w t8, REST_UNIT_STRIDE
ld.h t0, a3, 0
ld.h t1, a3, 2
vreplgr2vr.w vr2, t0
vreplgr2vr.w vr3, t1
ld.h t0, a3, 4
ld.h t1, a3, 6
vreplgr2vr.w vr4, t0
vreplgr2vr.w vr5, t1
ld.h t0, a3, 8
ld.h t1, a3, 10
vreplgr2vr.w vr6, t0
vreplgr2vr.w vr7, t1
ld.h t0, a3, 12
vreplgr2vr.w vr8, t0
andi t1, a4, 0xf
sub.w t0, a4, t1 // w-w%16
or t2, zero, zero // j
or t4, zero, zero
beqz t0, .WIENER_FILTER_V_W_LT16
.WIENER_FILTER_V_H:
andi t1, a4, 0xf
add.d t3, zero, a0 // p
or t4, zero, zero // i
.WIENER_FILTER_V_W:
wiener_filter_v_8bpc_core_lsx
mul.w t5, t2, a1 // j * stride
add.w t5, t5, t4 // j * stride + i
add.d t3, a0, t5
addi.w t4, t4, 16
vst vr17, t3, 0
bne t0, t4, .WIENER_FILTER_V_W
beqz t1, .WIENER_FILTER_V_W_EQ16
wiener_filter_v_8bpc_core_lsx
addi.d t3, t3, 16
andi t1, a4, 0xf
.WIENER_FILTER_V_ST_REM:
vstelm.b vr17, t3, 0, 0
vbsrl.v vr17, vr17, 1
addi.d t3, t3, 1
addi.w t1, t1, -1
bnez t1, .WIENER_FILTER_V_ST_REM
.WIENER_FILTER_V_W_EQ16:
addi.w t2, t2, 1
blt t2, a5, .WIENER_FILTER_V_H
b .WIENER_FILTER_V_END
.WIENER_FILTER_V_W_LT16:
andi t1, a4, 0xf
add.d t3, zero, a0
wiener_filter_v_8bpc_core_lsx
mul.w t5, t2, a1 // j * stride
add.d t3, a0, t5
.WIENER_FILTER_V_ST_REM_1:
vstelm.b vr17, t3, 0, 0
vbsrl.v vr17, vr17, 1
addi.d t3, t3, 1
addi.w t1, t1, -1
bnez t1, .WIENER_FILTER_V_ST_REM_1
addi.w t2, t2, 1
blt t2, a5, .WIENER_FILTER_V_W_LT16
.WIENER_FILTER_V_END:
endfunc
/*
void boxsum3_h(int32_t *sumsq, coef *sum, const pixel *src,
const int w, const int h)
*/
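/*
 Rough sketch of this column pass (the "_h" half of the split boxsum3): every
 output row holds the sum, and the sum of squares, of three vertically
 adjacent source rows.  Indexing is inferred from the vector code below;
 pixel/coef are assumed to be uint8_t/int16_t at 8bpc:

     src += REST_UNIT_STRIDE;                  // the first row is never used
     for (int y = 0; y < h - 4; y++) {
         const pixel *s   = src   + y * REST_UNIT_STRIDE + 1;
         coef *sum_v      = sum   + (y + 1) * REST_UNIT_STRIDE + 1;
         int32_t *sumsq_v = sumsq + (y + 1) * REST_UNIT_STRIDE + 1;
         for (int x = 0; x < w - 2; x++, s++, sum_v++, sumsq_v++) {
             const int a = s[0];
             const int b = s[REST_UNIT_STRIDE];
             const int c = s[2 * REST_UNIT_STRIDE];
             *sum_v   = a + b + c;
             *sumsq_v = a * a + b * b + c * c;
         }
     }
*/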
function boxsum3_h_8bpc_lsx
addi.d a2, a2, REST_UNIT_STRIDE
li.w t0, 1
addi.w a3, a3, -2
addi.w a4, a4, -4
.LBS3_H_H:
alsl.d t1, t0, a1, 1 // coef *sum_v = sum + x
alsl.d t2, t0, a0, 2 // int32_t *sumsq_v = sumsq + x
add.d t3, t0, a2 // s
addi.w t5, a3, 0
.LBS3_H_W:
vld vr0, t3, 0
vld vr1, t3, REST_UNIT_STRIDE
vld vr2, t3, (REST_UNIT_STRIDE<<1)
vilvl.b vr3, vr1, vr0
vhaddw.hu.bu vr4, vr3, vr3
vilvh.b vr5, vr1, vr0
vhaddw.hu.bu vr6, vr5, vr5
vsllwil.hu.bu vr7, vr2, 0
vexth.hu.bu vr8, vr2
// sum_v
vadd.h vr4, vr4, vr7
vadd.h vr6, vr6, vr8
vst vr4, t1, REST_UNIT_STRIDE<<1
vst vr6, t1, (REST_UNIT_STRIDE<<1)+16
addi.d t1, t1, 32
// sumsq
vmulwev.h.bu vr9, vr3, vr3
vmulwod.h.bu vr10, vr3, vr3
vmulwev.h.bu vr11, vr5, vr5
vmulwod.h.bu vr12, vr5, vr5
vaddwev.w.hu vr13, vr10, vr9
vaddwod.w.hu vr14, vr10, vr9
vaddwev.w.hu vr15, vr12, vr11
vaddwod.w.hu vr16, vr12, vr11
vmaddwev.w.hu vr13, vr7, vr7
vmaddwod.w.hu vr14, vr7, vr7
vmaddwev.w.hu vr15, vr8, vr8
vmaddwod.w.hu vr16, vr8, vr8
vilvl.w vr9, vr14, vr13
vilvh.w vr10, vr14, vr13
vilvl.w vr11, vr16, vr15
vilvh.w vr12, vr16, vr15
vst vr9, t2, REST_UNIT_STRIDE<<2
vst vr10, t2, (REST_UNIT_STRIDE<<2)+16
vst vr11, t2, (REST_UNIT_STRIDE<<2)+32
vst vr12, t2, (REST_UNIT_STRIDE<<2)+48
addi.d t2, t2, 64
addi.w t5, t5, -16
addi.d t3, t3, 16
blt zero, t5, .LBS3_H_W
addi.d a0, a0, REST_UNIT_STRIDE<<2
addi.d a1, a1, REST_UNIT_STRIDE<<1
addi.d a2, a2, REST_UNIT_STRIDE
addi.d a4, a4, -1
blt zero, a4, .LBS3_H_H
endfunc
/*
void boxsum3_v(int32_t *sumsq, coef *sum,
const int w, const int h)
*/
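/*
 Rough sketch of the matching row pass: each entry becomes the sum of three
 horizontally adjacent entries of the previous pass.  It works in place, so
 the original neighbours are carried forward (the vector code below carries
 them in t7/t8):

     for (int y = 1; y < h - 3; y++) {
         coef    *s = sum   + y * REST_UNIT_STRIDE;
         int32_t *q = sumsq + y * REST_UNIT_STRIDE;
         int a = s[1], b = s[2], a2 = q[1], b2 = q[2];
         for (int x = 2; x < w - 2; x++) {
             const int c = s[x + 1], c2 = q[x + 1];
             s[x] = a + b + c;
             q[x] = a2 + b2 + c2;
             a = b; b = c;
             a2 = b2; b2 = c2;
         }
     }
*/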
function boxsum3_v_8bpc_lsx
addi.d a0, a0, (REST_UNIT_STRIDE<<2)
addi.d a1, a1, (REST_UNIT_STRIDE<<1)
addi.w a3, a3, -4
addi.w a2, a2, -4
.LBS3_V_H:
sub.w t3, a2, zero
addi.d t0, a0, 4
addi.d t1, a1, 2
addi.d t5, a0, 8
addi.d t6, a1, 4
vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
vld vr2, t1, 4 // c 2 3 4 5 6 7 8 9
vld vr3, t0, 0 // a2 0 1 2 3
vld vr4, t0, 4 // b2 1 2 3 4
vld vr5, t0, 8 // c2 2 3 4 5
vld vr6, t0, 16 // 3 4 5 6
vld vr7, t0, 20 // 4 5 6 7
vld vr8, t0, 24 // 5 6 7 8
vadd.h vr9, vr0, vr1
vadd.w vr10, vr3, vr4
vadd.w vr11, vr6, vr7
vadd.h vr9, vr9, vr2
vadd.w vr10, vr10, vr5
vadd.w vr11, vr11, vr8
vpickve2gr.h t7, vr2, 6
vpickve2gr.w t8, vr8, 2
vst vr9, t6, 0
vst vr10, t5, 0
vst vr11, t5, 16
addi.d t1, t1, 16
addi.d t0, t0, 32
addi.d t5, t5, 32
addi.d t6, t6, 16
addi.d t3, t3, -8
ble t3, zero, .LBS3_V_H0
.LBS3_V_W8:
vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
vld vr2, t1, 4 // c 2 3 4 5 6 7 8 9
vld vr3, t0, 0 // a2 0 1 2 3
vld vr4, t0, 4 // b2 1 2 3 4
vld vr5, t0, 8 // c2 2 3 4 5
vld vr6, t0, 16 // 3 4 5 6
vld vr7, t0, 20 // 4 5 6 7
vld vr8, t0, 24 // 5 6 7 8
vinsgr2vr.h vr0, t7, 0
vinsgr2vr.w vr3, t8, 0
vpickve2gr.h t7, vr2, 6
vpickve2gr.w t8, vr8, 2
vadd.h vr9, vr0, vr1
vadd.w vr10, vr3, vr4
vadd.w vr11, vr6, vr7
vadd.h vr9, vr9, vr2
vadd.w vr10, vr10, vr5
vadd.w vr11, vr11, vr8
vst vr9, t6, 0
vst vr10, t5, 0
vst vr11, t5, 16
addi.d t3, t3, -8
addi.d t1, t1, 16
addi.d t0, t0, 32
addi.d t5, t5, 32
addi.d t6, t6, 16
blt zero, t3, .LBS3_V_W8
.LBS3_V_H0:
addi.d a1, a1, REST_UNIT_STRIDE<<1
addi.d a0, a0, REST_UNIT_STRIDE<<2
addi.w a3, a3, -1
bnez a3, .LBS3_V_H
endfunc
/*
boxsum3_selfguided_filter(int32_t *sumsq, coef *sum,
const int w, const int h,
const unsigned s)
*/
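/*
 Rough C sketch of the per-element math, with the constants read off the code
 below (n = 9 and one_by_x = 455 for the 3x3 box, s is the SGR strength,
 dav1d_sgr_x_by_x the shared lookup table, imax()/umin() dav1d's helpers from
 common/intops.h).  Both surfaces are rewritten in place:

     for (int y = 0; y < h + 2; y++) {
         int32_t *AA = sumsq + (y + 1) * REST_UNIT_STRIDE + 2;
         coef    *BB = sum   + (y + 1) * REST_UNIT_STRIDE + 2;
         for (int i = 0; i < w + 2; i++) {
             const int a = AA[i];                       // 3x3 sum of squares
             const int b = BB[i];                       // 3x3 sum
             const unsigned p = imax(a * 9 - b * b, 0);
             const unsigned z = (p * s + (1 << 19)) >> 20;
             const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)];
             AA[i] = (x * b * 455 + (1 << 11)) >> 12;
             BB[i] = 256 - x;
         }
     }
*/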
function boxsum3_sgf_h_8bpc_lsx
addi.d a0, a0, REST_UNIT_STRIDE<<2
addi.d a0, a0, 12 // AA
addi.d a1, a1, REST_UNIT_STRIDE<<1
addi.d a1, a1, 6 // BB
la.local t8, dav1d_sgr_x_by_x
li.w t6, 455
vreplgr2vr.w vr20, t6
li.w t6, 255
vreplgr2vr.w vr22, t6
vaddi.wu vr21, vr22, 1 // 256
vreplgr2vr.w vr6, a4
vldi vr19, 0x809
addi.w a2, a2, 2 // w + 2
addi.w a3, a3, 2 // h + 2
.LBS3SGF_H_H:
addi.w t2, a2, 0
addi.d t0, a0, -4
addi.d t1, a1, -2
.LBS3SGF_H_W:
addi.w t2, t2, -8
vld vr0, t0, 0 // AA[i]
vld vr1, t0, 16
vld vr2, t1, 0 // BB[i]
vmul.w vr4, vr0, vr19 // a * n
vmul.w vr5, vr1, vr19 // a * n
vsllwil.w.h vr9, vr2, 0
vexth.w.h vr10, vr2
vmsub.w vr4, vr9, vr9 // p
vmsub.w vr5, vr10, vr10 // p
vmaxi.w vr4, vr4, 0
vmaxi.w vr5, vr5, 0 // p
vmul.w vr4, vr4, vr6 // p * s
vmul.w vr5, vr5, vr6 // p * s
vsrlri.w vr4, vr4, 20
vsrlri.w vr5, vr5, 20 // z
vmin.w vr4, vr4, vr22
vmin.w vr5, vr5, vr22
vpickve2gr.w t6, vr4, 0
ldx.bu t7, t8, t6
vinsgr2vr.w vr7, t7, 0
vpickve2gr.w t6, vr4, 1
ldx.bu t7, t8, t6
vinsgr2vr.w vr7, t7, 1
vpickve2gr.w t6, vr4, 2
ldx.bu t7, t8, t6
vinsgr2vr.w vr7, t7, 2
vpickve2gr.w t6, vr4, 3
ldx.bu t7, t8, t6
vinsgr2vr.w vr7, t7, 3
vpickve2gr.w t6, vr5, 0
ldx.bu t7, t8, t6
vinsgr2vr.w vr8, t7, 0
vpickve2gr.w t6, vr5, 1
ldx.bu t7, t8, t6
vinsgr2vr.w vr8, t7, 1
vpickve2gr.w t6, vr5, 2
ldx.bu t7, t8, t6
vinsgr2vr.w vr8, t7, 2
vpickve2gr.w t6, vr5, 3
ldx.bu t7, t8, t6
vinsgr2vr.w vr8, t7, 3 // x
vmul.w vr9, vr7, vr9 // x * BB[i]
vmul.w vr10, vr8, vr10
vmul.w vr9, vr9, vr20 // x * BB[i] * sgr_one_by_x
vmul.w vr10, vr10, vr20
vsrlri.w vr9, vr9, 12
vsrlri.w vr10, vr10, 12
vsub.w vr7, vr21, vr7
vsub.w vr8, vr21, vr8
vpickev.h vr8, vr8, vr7
vst vr9, t0, 0
vst vr10, t0, 16
vst vr8, t1, 0
addi.d t0, t0, 32
addi.d t1, t1, 16
blt zero, t2, .LBS3SGF_H_W
addi.d a0, a0, REST_UNIT_STRIDE<<2
addi.d a1, a1, REST_UNIT_STRIDE<<1
addi.w a3, a3, -1
bnez a3, .LBS3SGF_H_H
endfunc
/*
boxsum3_selfguided_filter(coef *dst, pixel *src,
int32_t *sumsq, coef *sum,
const int w, const int h)
*/
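/*
 Rough sketch of the 3x3 neighbourhood weighting done here.  A is the sumsq
 buffer and B the sum buffer as rewritten by boxsum3_sgf_h above, dst is the
 coef output with a stride of 384, src the padded source; the 4/3 weights and
 the rounded shift by 9 are read off the vector code below:

     for (int j = 0; j < h; j++) {
         for (int i = 0; i < w; i++) {
             const int a =
                 4 * (B[i] + B[i - 1] + B[i + 1] +
                      B[i - REST_UNIT_STRIDE] + B[i + REST_UNIT_STRIDE]) +
                 3 * (B[i - 1 - REST_UNIT_STRIDE] + B[i + 1 - REST_UNIT_STRIDE] +
                      B[i - 1 + REST_UNIT_STRIDE] + B[i + 1 + REST_UNIT_STRIDE]);
             const int b =
                 4 * (A[i] + A[i - 1] + A[i + 1] +
                      A[i - REST_UNIT_STRIDE] + A[i + REST_UNIT_STRIDE]) +
                 3 * (A[i - 1 - REST_UNIT_STRIDE] + A[i + 1 - REST_UNIT_STRIDE] +
                      A[i - 1 + REST_UNIT_STRIDE] + A[i + 1 + REST_UNIT_STRIDE]);
             dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
         }
         dst += 384; src += REST_UNIT_STRIDE;
         A += REST_UNIT_STRIDE; B += REST_UNIT_STRIDE;
     }
*/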
function boxsum3_sgf_v_8bpc_lsx
addi.d a1, a1, (3*REST_UNIT_STRIDE+3) // src
addi.d a2, a2, REST_UNIT_STRIDE<<2
addi.d a2, a2, (REST_UNIT_STRIDE<<2)+12
addi.d a3, a3, REST_UNIT_STRIDE<<2
addi.d a3, a3, 6
.LBS3SGF_V_H:
// A int32_t *sumsq
addi.d t0, a2, -(REST_UNIT_STRIDE<<2) // -stride
addi.d t1, a2, 0 // sumsq
addi.d t2, a2, REST_UNIT_STRIDE<<2 // +stride
addi.d t6, a1, 0
addi.w t7, a4, 0
addi.d t8, a0, 0
// B coef *sum
addi.d t3, a3, -(REST_UNIT_STRIDE<<1) // -stride
addi.d t4, a3, 0
addi.d t5, a3, REST_UNIT_STRIDE<<1
.LBS3SGF_V_W:
vld vr0, t0, 0 // P[i - REST_UNIT_STRIDE]
vld vr1, t0, 16
vld vr2, t1, -4 // P[i-1] -1 0 1 2
vld vr3, t1, 12 // 3 4 5 6
vld vr4, t2, 0 // P[i + REST_UNIT_STRIDE]
vld vr5, t2, 16
vld vr6, t1, 0 // p[i] 0 1 2 3
vld vr7, t1, 16 // 4 5 6 7
vld vr8, t1, 4 // p[i+1] 1 2 3 4
vld vr9, t1, 20 // 5 6 7 8
vld vr10, t0, -4 // P[i - 1 - REST_UNIT_STRIDE]
vld vr11, t0, 12
vld vr12, t2, -4 // P[i - 1 + REST_UNIT_STRIDE]
vld vr13, t2, 12
vld vr14, t0, 4 // P[i + 1 - REST_UNIT_STRIDE]
vld vr15, t0, 20
vld vr16, t2, 4 // P[i + 1 + REST_UNIT_STRIDE]
vld vr17, t2, 20
vadd.w vr0, vr2, vr0
vadd.w vr4, vr6, vr4
vadd.w vr0, vr0, vr8
vadd.w vr20, vr0, vr4
vslli.w vr20, vr20, 2 // 0 1 2 3
vadd.w vr0, vr1, vr3
vadd.w vr4, vr5, vr7
vadd.w vr0, vr0, vr9
vadd.w vr21, vr0, vr4
vslli.w vr21, vr21, 2 // 4 5 6 7
vadd.w vr12, vr10, vr12
vadd.w vr16, vr14, vr16
vadd.w vr22, vr12, vr16
vslli.w vr23, vr22, 1
vadd.w vr22, vr23, vr22
vadd.w vr11, vr11, vr13
vadd.w vr15, vr15, vr17
vadd.w vr0, vr11, vr15
vslli.w vr23, vr0, 1
vadd.w vr23, vr23, vr0
vadd.w vr20, vr20, vr22 // b
vadd.w vr21, vr21, vr23
// B coef *sum
vld vr0, t3, 0 // P[i - REST_UNIT_STRIDE]
vld vr1, t4, -2 // p[i - 1]
vld vr2, t4, 0 // p[i]
vld vr3, t4, 2 // p[i + 1]
vld vr4, t5, 0 // P[i + REST_UNIT_STRIDE]
vld vr5, t3, -2 // P[i - 1 - REST_UNIT_STRIDE]
vld vr6, t5, -2 // P[i - 1 + REST_UNIT_STRIDE]
vld vr7, t3, 2 // P[i + 1 - REST_UNIT_STRIDE]
vld vr8, t5, 2 // P[i + 1 + REST_UNIT_STRIDE]
vaddwev.w.h vr9, vr0, vr1
vaddwod.w.h vr10, vr0, vr1
vaddwev.w.h vr11, vr2, vr3
vaddwod.w.h vr12, vr2, vr3
vadd.w vr9, vr11, vr9
vadd.w vr10, vr12, vr10
vilvl.w vr11, vr10, vr9 // 0 1 2 3
vilvh.w vr12, vr10, vr9 // 4 5 6 7
vsllwil.w.h vr0, vr4, 0
vexth.w.h vr1, vr4
vadd.w vr0, vr11, vr0
vadd.w vr1, vr12, vr1
vslli.w vr0, vr0, 2
vslli.w vr1, vr1, 2
vaddwev.w.h vr9, vr5, vr6
vaddwod.w.h vr10, vr5, vr6
vaddwev.w.h vr11, vr7, vr8
vaddwod.w.h vr12, vr7, vr8
vadd.w vr9, vr11, vr9
vadd.w vr10, vr12, vr10
vilvl.w vr13, vr10, vr9
vilvh.w vr14, vr10, vr9
vslli.w vr15, vr13, 1
vslli.w vr16, vr14, 1
vadd.w vr15, vr13, vr15 // a
vadd.w vr16, vr14, vr16
vadd.w vr22, vr0, vr15
vadd.w vr23, vr1, vr16
vld vr0, t6, 0 // src
vsllwil.hu.bu vr0, vr0, 0
vsllwil.wu.hu vr1, vr0, 0
vexth.wu.hu vr2, vr0
vmadd.w vr20, vr22, vr1
vmadd.w vr21, vr23, vr2
vssrlrni.h.w vr21, vr20, 9
vst vr21, t8, 0
addi.d t8, t8, 16
addi.d t0, t0, 32
addi.d t1, t1, 32
addi.d t2, t2, 32
addi.d t3, t3, 16
addi.d t4, t4, 16
addi.d t5, t5, 16
addi.d t6, t6, 8
addi.w t7, t7, -8
blt zero, t7, .LBS3SGF_V_W
addi.w a5, a5, -1
addi.d a0, a0, 384*2
addi.d a1, a1, REST_UNIT_STRIDE
addi.d a3, a3, REST_UNIT_STRIDE<<1
addi.d a2, a2, REST_UNIT_STRIDE<<2
bnez a5, .LBS3SGF_V_H
endfunc
function boxsum3_sgf_v_8bpc_lasx
addi.d a1, a1, (3*REST_UNIT_STRIDE+3) // src
addi.d a2, a2, REST_UNIT_STRIDE<<2
addi.d a2, a2, (REST_UNIT_STRIDE<<2)+12
addi.d a3, a3, REST_UNIT_STRIDE<<2
addi.d a3, a3, 6
.LBS3SGF_V_H_LASX:
// A int32_t *sumsq
addi.d t0, a2, -(REST_UNIT_STRIDE<<2) // -stride
addi.d t1, a2, 0 // sumsq
addi.d t2, a2, REST_UNIT_STRIDE<<2 // +stride
addi.d t6, a1, 0
addi.w t7, a4, 0
addi.d t8, a0, 0
// B coef *sum
addi.d t3, a3, -(REST_UNIT_STRIDE<<1) // -stride
addi.d t4, a3, 0
addi.d t5, a3, REST_UNIT_STRIDE<<1
.LBS3SGF_V_W_LASX:
xvld xr0, t0, 0 // P[i - REST_UNIT_STRIDE]
xvld xr1, t0, 32
xvld xr2, t1, -4 // P[i-1] -1 0 1 2
xvld xr3, t1, 28 // 3 4 5 6
xvld xr4, t2, 0 // P[i + REST_UNIT_STRIDE]
xvld xr5, t2, 32
xvld xr6, t1, 0 // p[i] 0 1 2 3
xvld xr7, t1, 32 // 4 5 6 7
xvld xr8, t1, 4 // p[i+1] 1 2 3 4
xvld xr9, t1, 36 // 5 6 7 8
xvld xr10, t0, -4 // P[i - 1 - REST_UNIT_STRIDE]
xvld xr11, t0, 28
xvld xr12, t2, -4 // P[i - 1 + REST_UNIT_STRIDE]
xvld xr13, t2, 28
xvld xr14, t0, 4 // P[i + 1 - REST_UNIT_STRIDE]
xvld xr15, t0, 36
xvld xr16, t2, 4 // P[i + 1 + REST_UNIT_STRIDE]
xvld xr17, t2, 36
xvadd.w xr0, xr2, xr0
xvadd.w xr4, xr6, xr4
xvadd.w xr0, xr0, xr8
xvadd.w xr20, xr0, xr4
xvslli.w xr20, xr20, 2 // 0 1 2 3
xvadd.w xr0, xr1, xr3
xvadd.w xr4, xr5, xr7
xvadd.w xr0, xr0, xr9
xvadd.w xr21, xr0, xr4
xvslli.w xr21, xr21, 2 // 4 5 6 7
xvadd.w xr12, xr10, xr12
xvadd.w xr16, xr14, xr16
xvadd.w xr22, xr12, xr16
xvslli.w xr23, xr22, 1
xvadd.w xr22, xr23, xr22
xvadd.w xr11, xr11, xr13
xvadd.w xr15, xr15, xr17
xvadd.w xr0, xr11, xr15
xvslli.w xr23, xr0, 1
xvadd.w xr23, xr23, xr0
xvadd.w xr20, xr20, xr22 // b
xvadd.w xr21, xr21, xr23
// B coef *sum
xvld xr0, t3, 0 // P[i - REST_UNIT_STRIDE]
xvld xr1, t4, -2 // p[i - 1]
xvld xr2, t4, 0 // p[i]
xvld xr3, t4, 2 // p[i + 1]
xvld xr4, t5, 0 // P[i + REST_UNIT_STRIDE]
xvld xr5, t3, -2 // P[i - 1 - REST_UNIT_STRIDE]
xvld xr6, t5, -2 // P[i - 1 + REST_UNIT_STRIDE]
xvld xr7, t3, 2 // P[i + 1 - REST_UNIT_STRIDE]
xvld xr8, t5, 2 // P[i + 1 + REST_UNIT_STRIDE]
xvaddwev.w.h xr9, xr0, xr1
xvaddwod.w.h xr10, xr0, xr1
xvaddwev.w.h xr11, xr2, xr3
xvaddwod.w.h xr12, xr2, xr3
xvadd.w xr9, xr11, xr9 // 0 2 4 6 8 10 12 14
xvadd.w xr10, xr12, xr10 // 1 3 5 7 9 11 13 15
xvilvl.w xr11, xr10, xr9 // 0 1 2 3 8 9 10 11
xvilvh.w xr12, xr10, xr9 // 4 5 6 7 12 13 14 15
xvsllwil.w.h xr0, xr4, 0 // 0 1 2 3 8 9 10 11
xvexth.w.h xr1, xr4 // 4 5 6 7 12 13 14 15
xvadd.w xr0, xr11, xr0
xvadd.w xr1, xr12, xr1
xvslli.w xr0, xr0, 2
xvslli.w xr1, xr1, 2
xvaddwev.w.h xr9, xr5, xr6
xvaddwod.w.h xr10, xr5, xr6
xvaddwev.w.h xr11, xr7, xr8
xvaddwod.w.h xr12, xr7, xr8
xvadd.w xr9, xr11, xr9
xvadd.w xr10, xr12, xr10
xvilvl.w xr13, xr10, xr9 // 0 1 2 3 8 9 10 11
xvilvh.w xr14, xr10, xr9 // 4 5 6 7 12 13 14 15
xvslli.w xr15, xr13, 1
xvslli.w xr16, xr14, 1
xvadd.w xr15, xr13, xr15 // a
xvadd.w xr16, xr14, xr16
xvadd.w xr22, xr0, xr15 // A B
xvadd.w xr23, xr1, xr16 // C D
vld vr0, t6, 0 // src
vilvh.d vr2, vr0, vr0
vext2xv.wu.bu xr1, xr0
vext2xv.wu.bu xr2, xr2
xvor.v xr15, xr22, xr22 // A B
xvpermi.q xr22, xr23, 0b00000010 // A C
xvpermi.q xr23, xr15, 0b00110001
xvmadd.w xr20, xr22, xr1
xvmadd.w xr21, xr23, xr2
xvssrlrni.h.w xr21, xr20, 9
xvpermi.d xr22, xr21, 0b11011000
xvst xr22, t8, 0
addi.d t8, t8, 32
addi.d t0, t0, 64
addi.d t1, t1, 64
addi.d t2, t2, 64
addi.d t3, t3, 32
addi.d t4, t4, 32
addi.d t5, t5, 32
addi.d t6, t6, 16
addi.w t7, t7, -16
blt zero, t7, .LBS3SGF_V_W_LASX
addi.w a5, a5, -1
addi.d a0, a0, 384*2
addi.d a1, a1, REST_UNIT_STRIDE
addi.d a3, a3, REST_UNIT_STRIDE<<1
addi.d a2, a2, REST_UNIT_STRIDE<<2
bnez a5, .LBS3SGF_V_H_LASX
endfunc
#define FILTER_OUT_STRIDE (384)
/*
sgr_3x3_finish_c(const pixel *p, const ptrdiff_t stride,
const int16_t *dst, const int w1,
const int w, const int h);
*/
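/*
 Rough sketch of the per-pixel blend performed here (w1 is the SGR weight,
 dst the int16_t output of the selfguided filter); iclip_u8() is dav1d's
 helper from common/intops.h:

     for (int y = 0; y < h; y++) {
         for (int i = 0; i < w; i++) {
             const int u = p[i] << 4;
             const int v = (u << 7) + w1 * (dst[i] - u);
             p[i] = iclip_u8((v + (1 << 10)) >> 11);
         }
         p   += stride;
         dst += FILTER_OUT_STRIDE;
     }
*/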
function sgr_3x3_finish_8bpc_lsx
vreplgr2vr.w vr3, a3 // w1
andi t4, a4, 0x7
sub.w t5, a4, t4
beq zero, t5, .LSGR3X3_REM
.LSGR3X3_H:
addi.d t0, a0, 0
addi.d t1, a2, 0
addi.w t2, t5, 0
andi t4, a4, 0x7
.LSGR3X3_W:
vld vr0, t0, 0
vld vr1, t1, 0
vsllwil.hu.bu vr2, vr0, 4 // u 8 h
vsllwil.wu.hu vr4, vr2, 0 // p
vexth.wu.hu vr5, vr2 // p
vslli.w vr6, vr4, 7
vslli.w vr7, vr5, 7
vsllwil.w.h vr8, vr1, 0 // dst
vexth.w.h vr9, vr1 // dst
vsub.w vr8, vr8, vr4
vsub.w vr9, vr9, vr5
vmadd.w vr6, vr8, vr3 // v 0 - 3
vmadd.w vr7, vr9, vr3 // v 4 - 7
vssrarni.hu.w vr7, vr6, 11
vssrlni.bu.h vr7, vr7, 0
vstelm.d vr7, t0, 0, 0
addi.d t0, t0, 8
addi.d t1, t1, 16
addi.d t2, t2, -8
bne zero, t2, .LSGR3X3_W
beq t4, zero, .LSGR3X3_NOREM
vld vr0, t0, 0
vld vr1, t1, 0
vsllwil.hu.bu vr2, vr0, 4 // u 8 h
vsllwil.wu.hu vr4, vr2, 0 // p
vexth.wu.hu vr5, vr2 // p
vslli.w vr6, vr4, 7
vslli.w vr7, vr5, 7
vsllwil.w.h vr8, vr1, 0 // dst
vexth.w.h vr9, vr1 // dst
vsub.w vr8, vr8, vr4
vsub.w vr9, vr9, vr5
vmadd.w vr6, vr8, vr3 // v 0 - 3
vmadd.w vr7, vr9, vr3 // v 4 - 7
vssrarni.hu.w vr7, vr6, 11
vssrlni.bu.h vr7, vr7, 0
.LSGR3X3_ST:
vstelm.b vr7, t0, 0, 0
addi.d t0, t0, 1
vbsrl.v vr7, vr7, 1
addi.w t4, t4, -1
bnez t4, .LSGR3X3_ST
.LSGR3X3_NOREM:
addi.w a5, a5, -1
add.d a0, a0, a1
addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
bnez a5, .LSGR3X3_H
b .LSGR3X3_END
.LSGR3X3_REM:
andi t4, a4, 0x7
addi.d t0, a0, 0
vld vr0, t0, 0
vld vr1, a2, 0
vsllwil.hu.bu vr2, vr0, 4 // u 8 h
vsllwil.wu.hu vr4, vr2, 0 // p
vexth.wu.hu vr5, vr2 // p
vslli.w vr6, vr4, 7
vslli.w vr7, vr5, 7
vsllwil.w.h vr8, vr1, 0 // dst
vexth.w.h vr9, vr1 // dst
vsub.w vr8, vr8, vr4
vsub.w vr9, vr9, vr5
vmadd.w vr6, vr8, vr3 // v 0 - 3
vmadd.w vr7, vr9, vr3 // v 4 - 7
vssrarni.hu.w vr7, vr6, 11
vssrlni.bu.h vr7, vr7, 0
.LSGR3X3_REM_ST:
vstelm.b vr7, t0, 0, 0
addi.d t0, t0, 1
vbsrl.v vr7, vr7, 1
addi.w t4, t4, -1
bnez t4, .LSGR3X3_REM_ST
addi.w a5, a5, -1
add.d a0, a0, a1
addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
bnez a5, .LSGR3X3_REM
.LSGR3X3_END:
endfunc
/*
void boxsum5(int32_t *sumsq, coef *sum,
const pixel *const src,
const int w, const int h)
*/
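/*
 Rough sketch of this column pass for the 5x5 box: each output row is the sum
 (and sum of squares) of five vertically adjacent source rows, written one
 row below the first source row.  Indexing is inferred from the code below:

     for (int y = 0; y < h - 4; y++)
         for (int x = 0; x < w; x++) {
             int s = 0, q = 0;
             for (int k = 0; k < 5; k++) {
                 const int v = src[(y + k) * REST_UNIT_STRIDE + x];
                 s += v;
                 q += v * v;
             }
             sum  [(y + 1) * REST_UNIT_STRIDE + x] = s;
             sumsq[(y + 1) * REST_UNIT_STRIDE + x] = q;
         }
*/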
function boxsum5_h_8bpc_lsx
addi.w a4, a4, -4
addi.d a0, a0, REST_UNIT_STRIDE<<2
addi.d a1, a1, REST_UNIT_STRIDE<<1
li.w t6, 1
.LBOXSUM5_H_H:
addi.w t3, a3, 0
addi.d t2, a2, 0
addi.d t0, a0, 0
addi.d t1, a1, 0
.LBOXSUM5_H_W:
vld vr0, t2, 0 // a
vld vr1, t2, REST_UNIT_STRIDE // b
vld vr2, t2, REST_UNIT_STRIDE<<1 // c
vld vr3, t2, REST_UNIT_STRIDE*3 // d
vld vr4, t2, REST_UNIT_STRIDE<<2 // e
vilvl.b vr5, vr1, vr0
vilvh.b vr6, vr1, vr0
vilvl.b vr7, vr3, vr2
vilvh.b vr8, vr3, vr2
//sum_v
vhaddw.hu.bu vr9, vr5, vr5 // 0 1 2 3 4 5 6 7
vhaddw.hu.bu vr10, vr6, vr6 // 8 9 10 11 12 13 14 15 a+b
vhaddw.hu.bu vr11, vr7, vr7
vhaddw.hu.bu vr12, vr8, vr8
vadd.h vr9, vr9, vr11
vadd.h vr10, vr10, vr12 // a + b + c + d
vsllwil.hu.bu vr11, vr4, 0
vexth.hu.bu vr12, vr4
vadd.h vr9, vr9, vr11
vadd.h vr10, vr10, vr12
vst vr9, t1, 0
vst vr10, t1, 16
addi.d t1, t1, 32
// sumsq
vmulwev.h.bu vr9, vr5, vr5 // a*a 0 1 2 3 4 5 6 7
vmulwev.h.bu vr10, vr6, vr6 // a*a 8 9 10 11 12 13 14 15
vmulwod.h.bu vr13, vr5, vr5 // b*b 0 1 2 3 4 5 6 7
vmulwod.h.bu vr14, vr6, vr6 // b*b 8 9 10 11 12 13 14 15
vmulwev.h.bu vr15, vr7, vr7 // c*c 0 1 2 3 4 5 6 7
vmulwev.h.bu vr16, vr8, vr8 // c*c 8 9 10 11 12 13 14 15
vmulwod.h.bu vr17, vr7, vr7 // d*d 0 1 2 3 4 5 6 7
vmulwod.h.bu vr18, vr8, vr8 // d*d 8 9 10 11 12 13 14 15
vaddwev.w.hu vr5, vr9, vr13 // 0 2 4 6
vaddwod.w.hu vr6, vr9, vr13 // 1 3 5 7
vaddwev.w.hu vr7, vr10, vr14 // 8 10 12 14
vaddwod.w.hu vr8, vr10, vr14 // 9 11 13 15 a + b
vaddwev.w.hu vr19, vr15, vr17 // 0 2 4 6
vaddwod.w.hu vr20, vr15, vr17 // 1 3 5 7
vaddwev.w.hu vr21, vr16, vr18 // 8 10 12 14
vaddwod.w.hu vr22, vr16, vr18 // 9 11 13 15 c + d
vadd.w vr5, vr5, vr19
vadd.w vr6, vr6, vr20
vadd.w vr7, vr7, vr21
vadd.w vr8, vr8, vr22
vmaddwev.w.hu vr5, vr11, vr11
vmaddwod.w.hu vr6, vr11, vr11
vmaddwev.w.hu vr7, vr12, vr12
vmaddwod.w.hu vr8, vr12, vr12
vilvl.w vr19, vr6, vr5
vilvh.w vr20, vr6, vr5
vilvl.w vr21, vr8, vr7
vilvh.w vr22, vr8, vr7
vst vr19, t0, 0
vst vr20, t0, 16
vst vr21, t0, 32
vst vr22, t0, 48
addi.d t0, t0, 64
addi.d t2, t2, 16
addi.w t3, t3, -16
blt zero, t3, .LBOXSUM5_H_W
addi.d a0, a0, REST_UNIT_STRIDE<<2
addi.d a1, a1, REST_UNIT_STRIDE<<1
addi.d a2, a2, REST_UNIT_STRIDE
addi.d a4, a4, -1
bnez a4, .LBOXSUM5_H_H
endfunc
/*
void boxsum5_v(int32_t *sumsq, coef *sum,
const int w, const int h)
*/
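/*
 Rough sketch of the matching row pass (in place, as in boxsum3_v above, so
 the original neighbours are carried forward in scalars):

     for (int y = 1; y < h - 3; y++) {
         coef    *s = sum   + y * REST_UNIT_STRIDE;
         int32_t *q = sumsq + y * REST_UNIT_STRIDE;
         int a  = s[0], b  = s[1], c  = s[2], d  = s[3];
         int a2 = q[0], b2 = q[1], c2 = q[2], d2 = q[3];
         for (int x = 2; x < w - 2; x++) {
             const int e = s[x + 2], e2 = q[x + 2];
             s[x] = a + b + c + d + e;
             q[x] = a2 + b2 + c2 + d2 + e2;
             a = b; b = c; c = d; d = e;
             a2 = b2; b2 = c2; c2 = d2; d2 = e2;
         }
     }
*/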
function boxsum5_v_8bpc_lsx
addi.d a0, a0, (REST_UNIT_STRIDE<<2)
addi.d a1, a1, (REST_UNIT_STRIDE<<1)
addi.w a3, a3, -4
addi.w a2, a2, -4
.LBOXSUM5_V_H:
addi.w t3, a2, 0
addi.d t0, a0, 0
addi.d t1, a1, 0
addi.d t2, a0, 8
addi.d t3, a1, 4
addi.d t4, a2, 0
vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
vld vr2, t1, 4 // c 2
vld vr3, t1, 6 // d 3
vld vr4, t1, 8 // e 4 5 6 7 8 9 10 11
vadd.h vr5, vr0, vr1
vadd.h vr6, vr2, vr3
vpickve2gr.w t5, vr4, 2
vadd.h vr5, vr5, vr6
vadd.h vr5, vr5, vr4
vst vr5, t3, 0
vld vr0, t0, 0 // 0 1 2 3 a
vld vr1, t0, 4 // 1 2 3 4 b
vld vr2, t0, 8 // 2 3 4 5 c
vld vr3, t0, 12 // 3 4 5 6 d
vld vr4, t0, 16 // 4 5 6 7 e a
vld vr5, t0, 20 // 5 6 7 8 b
vld vr6, t0, 24 // 6 7 8 9 c
vld vr7, t0, 28 // 7 8 9 10 d
vld vr8, t0, 32 // 8 9 10 11 e
vadd.w vr9, vr0, vr1
vadd.w vr10, vr2, vr3
vadd.w vr9, vr9, vr10
vadd.w vr9, vr9, vr4
vadd.w vr10, vr4, vr5
vadd.w vr11, vr6, vr7
vadd.w vr10, vr10, vr8
vadd.w vr10, vr10, vr11
vst vr9, t2, 0
vst vr10, t2, 16
addi.d t3, t3, 16
addi.d t1, t1, 16
addi.d t0, t0, 32
addi.d t2, t2, 32
addi.w t4, t4, -8
ble t4, zero, .LBOXSUM5_V_H1
.LBOXSUM5_V_W:
vld vr0, t1, 0 // a 0 1 2 3 4 5 6 7
vld vr1, t1, 2 // b 1 2 3 4 5 6 7 8
vld vr2, t1, 4 // c 2
vld vr3, t1, 6 // d 3
vld vr4, t1, 8 // e 4 5 6 7 8 9 10 11
vinsgr2vr.w vr0, t5, 0
vpickve2gr.w t5, vr4, 2
vextrins.h vr1, vr0, 0x01
vadd.h vr5, vr0, vr1
vadd.h vr6, vr2, vr3
vadd.h vr5, vr5, vr6
vadd.h vr5, vr5, vr4
vst vr5, t3, 0
vaddi.hu vr0, vr8, 0 // 8 9 10 11 a
vld vr1, t0, 4 // 9 10 11 12 b
vld vr2, t0, 8 // 10 11 12 13 c
vld vr3, t0, 12 // 14 15 16 17 d
vld vr4, t0, 16 // 15 16 17 18 e a
vld vr5, t0, 20 // 16 17 18 19 b
vld vr6, t0, 24 // 17 18 19 20 c
vld vr7, t0, 28 // 18 19 20 21 d
vld vr8, t0, 32 // 19 20 21 22 e
vextrins.w vr1, vr0, 0x01
vadd.w vr9, vr0, vr1
vadd.w vr10, vr2, vr3
vadd.w vr9, vr9, vr10
vadd.w vr9, vr9, vr4
vadd.w vr10, vr4, vr5
vadd.w vr11, vr6, vr7
vadd.w vr10, vr10, vr8
vadd.w vr10, vr10, vr11
vst vr9, t2, 0
vst vr10, t2, 16
addi.d t3, t3, 16
addi.d t1, t1, 16
addi.d t0, t0, 32
addi.d t2, t2, 32
addi.w t4, t4, -8
blt zero, t4, .LBOXSUM5_V_W
.LBOXSUM5_V_H1:
addi.d a1, a1, REST_UNIT_STRIDE<<1
addi.d a0, a0, REST_UNIT_STRIDE<<2
addi.w a3, a3, -1
bnez a3, .LBOXSUM5_V_H
endfunc
/*
selfguided_filter(int32_t *sumsq, coef *sum,
const int w, const int h,
const unsigned s)
*/
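/*
 Same per-element math as the 3x3 variant above, with the 5x5 constants read
 off the code below (n = 25, one_by_x = 164):

     const unsigned p = imax(a * 25 - b * b, 0);
     const unsigned z = (p * s + (1 << 19)) >> 20;
     const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)];
     AA[i] = (x * b * 164 + (1 << 11)) >> 12;
     BB[i] = 256 - x;

 The outer loop advances two rows of sumsq/sum per iteration, which matches
 the 5x5 weighting below reading these surfaces only on every other row.
*/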
function boxsum5_sgf_h_8bpc_lsx
addi.d a0, a0, REST_UNIT_STRIDE<<2
addi.d a0, a0, 12 // AA
addi.d a1, a1, REST_UNIT_STRIDE<<1
addi.d a1, a1, 6 // BB
la.local t8, dav1d_sgr_x_by_x
li.w t6, 164
vreplgr2vr.w vr20, t6
li.w t6, 255
vreplgr2vr.w vr22, t6
vaddi.wu vr21, vr22, 1 // 256
vreplgr2vr.w vr6, a4
vldi vr19, 0x819
addi.w a2, a2, 2 // w + 2
addi.w a3, a3, 2 // h + 2
.LBS5SGF_H_H:
addi.w t2, a2, 0
addi.d t0, a0, -4
addi.d t1, a1, -2
.LBS5SGF_H_W:
vld vr0, t0, 0 // AA[i]
vld vr1, t0, 16
vld vr2, t1, 0 // BB[i]
vmul.w vr4, vr0, vr19 // a * n
vmul.w vr5, vr1, vr19 // a * n
vsllwil.w.h vr9, vr2, 0
vexth.w.h vr10, vr2
vmsub.w vr4, vr9, vr9 // p
vmsub.w vr5, vr10, vr10 // p
vmaxi.w vr4, vr4, 0
vmaxi.w vr5, vr5, 0 // p
vmul.w vr4, vr4, vr6 // p * s
vmul.w vr5, vr5, vr6 // p * s
vsrlri.w vr4, vr4, 20
vsrlri.w vr5, vr5, 20 // z
vmin.w vr4, vr4, vr22
vmin.w vr5, vr5, vr22
// load table data
vpickve2gr.w t6, vr4, 0
ldx.bu t7, t8, t6
vinsgr2vr.w vr7, t7, 0
vpickve2gr.w t6, vr4, 1
ldx.bu t7, t8, t6
vinsgr2vr.w vr7, t7, 1
vpickve2gr.w t6, vr4, 2
ldx.bu t7, t8, t6
vinsgr2vr.w vr7, t7, 2
vpickve2gr.w t6, vr4, 3
ldx.bu t7, t8, t6
vinsgr2vr.w vr7, t7, 3
vpickve2gr.w t6, vr5, 0
ldx.bu t7, t8, t6
vinsgr2vr.w vr8, t7, 0
vpickve2gr.w t6, vr5, 1
ldx.bu t7, t8, t6
vinsgr2vr.w vr8, t7, 1
vpickve2gr.w t6, vr5, 2
ldx.bu t7, t8, t6
vinsgr2vr.w vr8, t7, 2
vpickve2gr.w t6, vr5, 3
ldx.bu t7, t8, t6
vinsgr2vr.w vr8, t7, 3 // x
vmul.w vr9, vr7, vr9 // x * BB[i]
vmul.w vr10, vr8, vr10
vmul.w vr9, vr9, vr20 // x * BB[i] * sgr_one_by_x
vmul.w vr10, vr10, vr20
vsrlri.w vr9, vr9, 12
vsrlri.w vr10, vr10, 12
vsub.w vr7, vr21, vr7
vsub.w vr8, vr21, vr8
vpickev.h vr8, vr8, vr7
vst vr9, t0, 0
vst vr10, t0, 16
vst vr8, t1, 0
addi.d t0, t0, 32
addi.d t1, t1, 16
addi.w t2, t2, -8
blt zero, t2, .LBS5SGF_H_W
addi.d a0, a0, REST_UNIT_STRIDE<<2
addi.d a0, a0, REST_UNIT_STRIDE<<2
addi.d a1, a1, REST_UNIT_STRIDE<<2
addi.w a3, a3, -2
blt zero, a3, .LBS5SGF_H_H
endfunc
/*
selfguided_filter(coef *dst, pixel *src,
int32_t *sumsq, coef *sum,
const int w, const int h)
*/
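/*
 Rough sketch of the 5x5 neighbourhood weighting: two output rows per
 iteration.  The first row blends the A/B rows above and below with weights
 6 (vertical) and 5 (diagonal); the second row only uses its own A/B row with
 weights 6 (centre) and 5 (left/right).  A is the sumsq buffer, B the sum
 buffer, dst the coef output with a stride of 384; weights and shifts are
 read off the vector code below:

     for (int j = 0; j < h - 1; j += 2) {
         for (int i = 0; i < w; i++) {              // even row, shift by 9
             const int a =
                 6 * (B[i - REST_UNIT_STRIDE] + B[i + REST_UNIT_STRIDE]) +
                 5 * (B[i - 1 - REST_UNIT_STRIDE] + B[i + 1 - REST_UNIT_STRIDE] +
                      B[i - 1 + REST_UNIT_STRIDE] + B[i + 1 + REST_UNIT_STRIDE]);
             const int b =
                 6 * (A[i - REST_UNIT_STRIDE] + A[i + REST_UNIT_STRIDE]) +
                 5 * (A[i - 1 - REST_UNIT_STRIDE] + A[i + 1 - REST_UNIT_STRIDE] +
                      A[i - 1 + REST_UNIT_STRIDE] + A[i + 1 + REST_UNIT_STRIDE]);
             dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
         }
         for (int i = 0; i < w; i++) {              // odd row, shift by 8
             const int a = 6 * B[REST_UNIT_STRIDE + i] +
                           5 * (B[REST_UNIT_STRIDE + i - 1] + B[REST_UNIT_STRIDE + i + 1]);
             const int b = 6 * A[REST_UNIT_STRIDE + i] +
                           5 * (A[REST_UNIT_STRIDE + i - 1] + A[REST_UNIT_STRIDE + i + 1]);
             dst[384 + i] = (a * src[REST_UNIT_STRIDE + i] + b + (1 << 7)) >> 8;
         }
         dst += 2 * 384;              src += 2 * REST_UNIT_STRIDE;
         A   += 2 * REST_UNIT_STRIDE; B   += 2 * REST_UNIT_STRIDE;
     }

 The .LBS5SGF_V_W1 tail below appears to handle the final row when h is odd,
 using the even-row weighting.
*/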
function boxsum5_sgf_v_8bpc_lsx
addi.d a1, a1, 3*REST_UNIT_STRIDE+3 // src
addi.d a2, a2, (2*REST_UNIT_STRIDE+3)<<1 // A
addi.d a2, a2, (2*REST_UNIT_STRIDE+3)<<1
addi.d a3, a3, (2*REST_UNIT_STRIDE+3)<<1 // B
addi.w a5, a5, -1
vldi vr10, 0x806
vldi vr11, 0x805
vldi vr22, 0x406
.LBS5SGF_V_H:
addi.d t0, a0, 0
addi.d t1, a1, 0
addi.d t2, a2, 0
addi.d t3, a3, 0
addi.w t4, a4, 0
addi.d t5, a0, 384*2
addi.d t6, a1, REST_UNIT_STRIDE
addi.d t7, a2, REST_UNIT_STRIDE<<2
addi.d t8, a3, REST_UNIT_STRIDE<<1 // B
.LBS5SGF_V_W:
// a
vld vr0, t3, -REST_UNIT_STRIDE*2
vld vr1, t3, REST_UNIT_STRIDE*2
vld vr2, t3, (-REST_UNIT_STRIDE-1)*2
vld vr3, t3, (REST_UNIT_STRIDE-1)*2
vld vr4, t3, (1-REST_UNIT_STRIDE)*2
vld vr5, t3, (1+REST_UNIT_STRIDE)*2
vaddwev.w.h vr6, vr0, vr1
vaddwod.w.h vr7, vr0, vr1
vmul.w vr6, vr6, vr10
vmul.w vr7, vr7, vr10
vaddwev.w.h vr8, vr2, vr3
vaddwod.w.h vr9, vr2, vr3
vaddwev.w.h vr12, vr4, vr5
vaddwod.w.h vr13, vr4, vr5
vadd.w vr8, vr8, vr12
vadd.w vr9, vr9, vr13
vmadd.w vr6, vr8, vr11
vmadd.w vr7, vr9, vr11
vilvl.w vr18, vr7, vr6
vilvh.w vr19, vr7, vr6
// b
vld vr0, t2, -REST_UNIT_STRIDE*4
vld vr1, t2, -REST_UNIT_STRIDE*4+16
vld vr2, t2, REST_UNIT_STRIDE*4
vld vr3, t2, REST_UNIT_STRIDE*4+16
vld vr4, t2, (-REST_UNIT_STRIDE-1)*4
vld vr5, t2, (-REST_UNIT_STRIDE-1)*4+16
vld vr8, t2, (REST_UNIT_STRIDE-1)*4
vld vr9, t2, (REST_UNIT_STRIDE-1)*4+16
vld vr12, t2, (1-REST_UNIT_STRIDE)*4
vld vr13, t2, (1-REST_UNIT_STRIDE)*4+16
vld vr14, t2, (1+REST_UNIT_STRIDE)*4
vld vr15, t2, (1+REST_UNIT_STRIDE)*4+16
vadd.w vr0, vr0, vr2 // 0 1 2 3
vadd.w vr1, vr1, vr3 // 4 5 6 7
vmul.w vr20, vr0, vr10
vmul.w vr21, vr1, vr10
vadd.w vr4, vr4, vr8 // 0 1 2 3
vadd.w vr5, vr5, vr9 // 4 5 6 7
vadd.w vr12, vr12, vr14
vadd.w vr13, vr13, vr15
vadd.w vr12, vr12, vr4
vadd.w vr13, vr13, vr5
vmadd.w vr20, vr12, vr11
vmadd.w vr21, vr13, vr11
vld vr2, t1, 0
vsllwil.hu.bu vr2, vr2, 0
vsllwil.wu.hu vr3, vr2, 0
vexth.wu.hu vr4, vr2
vmadd.w vr20, vr18, vr3
vmadd.w vr21, vr19, vr4
vssrlrni.h.w vr21, vr20, 9
vst vr21, t0, 0
addi.d t1, t1, 8
addi.d t2, t2, 32
addi.d t3, t3, 16
// a
vld vr0, t8, 0
vld vr1, t8, -2
vld vr2, t8, 2
vmulwev.w.h vr3, vr0, vr22
vmulwod.w.h vr4, vr0, vr22
vaddwev.w.h vr5, vr1, vr2
vaddwod.w.h vr6, vr1, vr2
vmadd.w vr3, vr5, vr11
vmadd.w vr4, vr6, vr11
vilvl.w vr19, vr4, vr3
vilvh.w vr20, vr4, vr3
// b
vld vr0, t7, 0
vld vr1, t7, -4
vld vr2, t7, 4
vld vr5, t7, 16
vld vr6, t7, 12
vld vr7, t7, 20
vmul.w vr8, vr0, vr10
vmul.w vr9, vr5, vr10
vadd.w vr12, vr1, vr2
vadd.w vr13, vr6, vr7
vmadd.w vr8, vr12, vr11
vmadd.w vr9, vr13, vr11
vld vr2, t6, 0
vsllwil.hu.bu vr2, vr2, 0
vsllwil.wu.hu vr3, vr2, 0
vexth.wu.hu vr4, vr2
vmadd.w vr8, vr19, vr3
vmadd.w vr9, vr20, vr4
vssrlrni.h.w vr9, vr8, 8
vst vr9, t0, 384*2
addi.d t0, t0, 16
addi.d t8, t8, 16
addi.d t7, t7, 32
addi.d t6, t6, 8
addi.w t4, t4, -8
blt zero, t4, .LBS5SGF_V_W
addi.w a5, a5, -2
addi.d a0, a0, 384*4 // dst
addi.d a1, a1, REST_UNIT_STRIDE<<1 // src
addi.d a2, a2, REST_UNIT_STRIDE<<2 //
addi.d a2, a2, REST_UNIT_STRIDE<<2
addi.d a3, a3, REST_UNIT_STRIDE<<2 //
blt zero, a5, .LBS5SGF_V_H
bnez a5, .LBS5SGF_END
.LBS5SGF_V_W1:
// a
vld vr0, a3, -REST_UNIT_STRIDE*2
vld vr1, a3, REST_UNIT_STRIDE*2
vld vr2, a3, (-REST_UNIT_STRIDE-1)*2
vld vr3, a3, (REST_UNIT_STRIDE-1)*2
vld vr4, a3, (1-REST_UNIT_STRIDE)*2
vld vr5, a3, (1+REST_UNIT_STRIDE)*2
vaddwev.w.h vr6, vr0, vr1
vaddwod.w.h vr7, vr0, vr1
vmul.w vr6, vr6, vr10
vmul.w vr7, vr7, vr10
vaddwev.w.h vr8, vr2, vr3
vaddwod.w.h vr9, vr2, vr3
vaddwev.w.h vr12, vr4, vr5
vaddwod.w.h vr13, vr4, vr5
vadd.w vr8, vr8, vr12
vadd.w vr9, vr9, vr13
vmadd.w vr6, vr8, vr11
vmadd.w vr7, vr9, vr11
vilvl.w vr18, vr7, vr6
vilvh.w vr19, vr7, vr6
// b
vld vr0, a2, -REST_UNIT_STRIDE*4
vld vr1, a2, -REST_UNIT_STRIDE*4+16
vld vr2, a2, REST_UNIT_STRIDE*4
vld vr3, a2, REST_UNIT_STRIDE*4+16
vld vr4, a2, (-REST_UNIT_STRIDE-1)*4
vld vr5, a2, (-REST_UNIT_STRIDE-1)*4+16
vld vr8, a2, (REST_UNIT_STRIDE-1)*4
vld vr9, a2, (REST_UNIT_STRIDE-1)*4+16
vld vr12, a2, (1-REST_UNIT_STRIDE)*4
vld vr13, a2, (1-REST_UNIT_STRIDE)*4+16
vld vr14, a2, (1+REST_UNIT_STRIDE)*4
vld vr15, a2, (1+REST_UNIT_STRIDE)*4+16
vadd.w vr0, vr0, vr2 // 0 1 2 3
vadd.w vr1, vr1, vr3 // 4 5 6 7
vmul.w vr20, vr0, vr10
vmul.w vr21, vr1, vr10
vadd.w vr4, vr4, vr8 // 0 1 2 3
vadd.w vr5, vr5, vr9 // 4 5 6 7
vadd.w vr12, vr12, vr14
vadd.w vr13, vr13, vr15
vadd.w vr12, vr12, vr4
vadd.w vr13, vr13, vr5
vmadd.w vr20, vr12, vr11
vmadd.w vr21, vr13, vr11
vld vr2, a1, 0
vsllwil.hu.bu vr2, vr2, 0
vsllwil.wu.hu vr3, vr2, 0
vexth.wu.hu vr4, vr2
vmadd.w vr20, vr18, vr3
vmadd.w vr21, vr19, vr4
vssrlrni.h.w vr21, vr20, 9
vst vr21, a0, 0
addi.d a3, a3, 16
addi.d a2, a2, 32
addi.d a1, a1, 8
addi.d a0, a0, 16
addi.w a4, a4, -8
blt zero, a4, .LBS5SGF_V_W1
.LBS5SGF_END:
endfunc
/*
void dav1d_sgr_mix_finish_lsx(uint8_t *p, const ptrdiff_t stride,
const int16_t *dst0, const int16_t *dst1,
const int w0, const int w1,
const int w, const int h);
*/
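/*
 Rough sketch of the per-pixel blend of the two selfguided outputs; iclip_u8()
 is dav1d's helper from common/intops.h:

     for (int y = 0; y < h; y++) {
         for (int i = 0; i < w; i++) {
             const int u = p[i] << 4;
             const int v = (u << 7) + w0 * (dst0[i] - u) + w1 * (dst1[i] - u);
             p[i] = iclip_u8((v + (1 << 10)) >> 11);
         }
         p    += stride;
         dst0 += FILTER_OUT_STRIDE;
         dst1 += FILTER_OUT_STRIDE;
     }
*/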
function sgr_mix_finish_8bpc_lsx
vreplgr2vr.w vr3, a4 // w0
vreplgr2vr.w vr13, a5 // w1
andi t4, a6, 0x7
sub.w t5, a6, t4
beq zero, t5, .LSGRMIX_REM
.LSGRMIX_H:
addi.d t0, a0, 0
addi.d t1, a2, 0 // dst0
addi.d t3, a3, 0 // dst1
addi.w t2, t5, 0
andi t4, a6, 0x7
.LSGRMIX_W:
vld vr0, t0, 0
vld vr1, t1, 0
vld vr10, t3, 0
vsllwil.hu.bu vr2, vr0, 4 // u 8 h
vsllwil.wu.hu vr4, vr2, 0 // u 0 1 2 3
vexth.wu.hu vr5, vr2 // u 4 5 6 7
vslli.w vr6, vr4, 7
vslli.w vr7, vr5, 7
vsllwil.w.h vr8, vr1, 0 // dst0
vexth.w.h vr9, vr1 // dst0
vsub.w vr8, vr8, vr4
vsub.w vr9, vr9, vr5
vmadd.w vr6, vr8, vr3 // v 0 - 3
vmadd.w vr7, vr9, vr3 // v 4 - 7
vsllwil.w.h vr11, vr10, 0 // dst1
vexth.w.h vr12, vr10 // dst1
vsub.w vr11, vr11, vr4
vsub.w vr12, vr12, vr5
vmadd.w vr6, vr11, vr13
vmadd.w vr7, vr12, vr13
vssrarni.hu.w vr7, vr6, 11
vssrlni.bu.h vr7, vr7, 0
vstelm.d vr7, t0, 0, 0
addi.d t0, t0, 8
addi.d t1, t1, 16
addi.d t3, t3, 16
addi.d t2, t2, -8
bne zero, t2, .LSGRMIX_W
beq t4, zero, .LSGRMIX_W8
vld vr0, t0, 0
vld vr1, t1, 0
vld vr10, t3, 0
vsllwil.hu.bu vr2, vr0, 4 // u 8 h
vsllwil.wu.hu vr4, vr2, 0 // p
vexth.wu.hu vr5, vr2 // p
vslli.w vr6, vr4, 7
vslli.w vr7, vr5, 7
vsllwil.w.h vr8, vr1, 0 // dst
vexth.w.h vr9, vr1 // dst
vsub.w vr8, vr8, vr4
vsub.w vr9, vr9, vr5
vmadd.w vr6, vr8, vr3 // v 0 - 3
vmadd.w vr7, vr9, vr3 // v 4 - 7
vsllwil.w.h vr11, vr10, 0 // dst1
vexth.w.h vr12, vr10 // dst1
vsub.w vr11, vr11, vr4
vsub.w vr12, vr12, vr5
vmadd.w vr6, vr11, vr13
vmadd.w vr7, vr12, vr13
vssrarni.hu.w vr7, vr6, 11
vssrlni.bu.h vr7, vr7, 0
.LSGRMIX_ST:
vstelm.b vr7, t0, 0, 0
addi.d t0, t0, 1
vbsrl.v vr7, vr7, 1
addi.w t4, t4, -1
bnez t4, .LSGRMIX_ST
.LSGRMIX_W8:
addi.w a7, a7, -1
add.d a0, a0, a1
addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
addi.d a3, a3, (FILTER_OUT_STRIDE<<1)
bnez a7, .LSGRMIX_H
b .LSGR_MIX_END
.LSGRMIX_REM:
andi t4, a6, 0x7
vld vr0, a0, 0
vld vr1, a2, 0
vld vr10, a3, 0
vsllwil.hu.bu vr2, vr0, 4 // u 8 h
vsllwil.wu.hu vr4, vr2, 0 // p
vexth.wu.hu vr5, vr2 // p
vslli.w vr6, vr4, 7
vslli.w vr7, vr5, 7
vsllwil.w.h vr8, vr1, 0 // dst
vexth.w.h vr9, vr1 // dst
vsub.w vr8, vr8, vr4
vsub.w vr9, vr9, vr5
vmadd.w vr6, vr8, vr3 // v 0 - 3
vmadd.w vr7, vr9, vr3 // v 4 - 7
vsllwil.w.h vr11, vr10, 0 // dst1
vexth.w.h vr12, vr10 // dst1
vsub.w vr11, vr11, vr4
vsub.w vr12, vr12, vr5
vmadd.w vr6, vr11, vr13
vmadd.w vr7, vr12, vr13
vssrarni.hu.w vr7, vr6, 11
vssrlni.bu.h vr7, vr7, 0
addi.d t0, a0, 0
.LSGRMIX_REM_ST:
vstelm.b vr7, t0, 0, 0
addi.d t0, t0, 1
vbsrl.v vr7, vr7, 1
addi.w t4, t4, -1
bnez t4, .LSGRMIX_REM_ST
addi.w a7, a7, -1
add.d a0, a0, a1
addi.d a2, a2, (FILTER_OUT_STRIDE<<1)
addi.d a3, a3, (FILTER_OUT_STRIDE<<1)
bnez a7, .LSGRMIX_REM
.LSGR_MIX_END:
endfunc
.macro MADD_HU_BU_LASX in0, in1, out0, out1
xvsllwil.hu.bu xr12, \in0, 0
xvexth.hu.bu xr13, \in0
xvmadd.h \out0, xr12, \in1
xvmadd.h \out1, xr13, \in1
.endm
const wiener_shuf_lasx
.byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
.byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
endconst
function wiener_filter_h_8bpc_lasx
addi.d sp, sp, -40
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
fst.d f28, sp, 32
li.w t7, 1<<14 // clip_limit
la.local t1, wiener_shuf_lasx
xvld xr4, t1, 0
vld vr27, a2, 0 // filter[0][k]
xvpermi.q xr14, xr27, 0b00000000
xvrepl128vei.h xr21, xr14, 0
xvrepl128vei.h xr22, xr14, 1
xvrepl128vei.h xr23, xr14, 2
xvrepl128vei.h xr24, xr14, 3
xvrepl128vei.h xr25, xr14, 4
xvrepl128vei.h xr26, xr14, 5
xvrepl128vei.h xr27, xr14, 6
xvreplgr2vr.w xr0, t7
.WIENER_FILTER_H_H_LASX:
addi.w a4, a4, -1 // h
addi.w t0, a3, 0 // w
addi.d t1, a1, 0 // tmp_ptr
addi.d t2, a0, 0 // hor_ptr
.WIENER_FILTER_H_W_LASX:
addi.w t0, t0, -32
xvld xr5, t1, 0
xvld xr13, t1, 16
xvsubi.bu xr14, xr4, 2
xvsubi.bu xr15, xr4, 1
xvshuf.b xr6, xr13, xr5, xr14 // 1 ... 8, 9 ... 16
xvshuf.b xr7, xr13, xr5, xr15 // 2 ... 9, 10 ... 17
xvshuf.b xr8, xr13, xr5, xr4 // 3 ... 10, 11 ... 18
xvaddi.bu xr14, xr4, 1
xvaddi.bu xr15, xr4, 2
xvshuf.b xr9, xr13, xr5, xr14 // 4 ... 11, 12 ... 19
xvshuf.b xr10, xr13, xr5, xr15 // 5 ... 12, 13 ... 20
xvaddi.bu xr14, xr4, 3
xvshuf.b xr11, xr13, xr5, xr14 // 6 ... 13, 14 ... 21
xvsllwil.hu.bu xr15, xr8, 0 // 3 4 5 6 7 8 9 10
xvexth.hu.bu xr16, xr8 // 11 12 13 14 15 16 17 18
xvsllwil.wu.hu xr17, xr15, 7 // 3 4 5 6
xvexth.wu.hu xr18, xr15 // 7 8 9 10
xvsllwil.wu.hu xr19, xr16, 7 // 11 12 13 14
xvexth.wu.hu xr20, xr16 // 15 16 17 18
xvslli.w xr18, xr18, 7
xvslli.w xr20, xr20, 7
xvxor.v xr15, xr15, xr15
xvxor.v xr14, xr14, xr14
MADD_HU_BU_LASX xr5, xr21, xr14, xr15
MADD_HU_BU_LASX xr6, xr22, xr14, xr15
MADD_HU_BU_LASX xr7, xr23, xr14, xr15
MADD_HU_BU_LASX xr8, xr24, xr14, xr15
MADD_HU_BU_LASX xr9, xr25, xr14, xr15
MADD_HU_BU_LASX xr10, xr26, xr14, xr15
MADD_HU_BU_LASX xr11, xr27, xr14, xr15
xvsllwil.w.h xr5, xr14, 0 // 0 1 2 3
xvexth.w.h xr6, xr14 // 4 5 6 7
xvsllwil.w.h xr7, xr15, 0 // 8 9 10 11
xvexth.w.h xr8, xr15 // 12 13 14 15
xvadd.w xr17, xr17, xr5
xvadd.w xr18, xr18, xr6
xvadd.w xr19, xr19, xr7
xvadd.w xr20, xr20, xr8
xvadd.w xr17, xr17, xr0
xvadd.w xr18, xr18, xr0
xvadd.w xr19, xr19, xr0
xvadd.w xr20, xr20, xr0
xvsrli.w xr1, xr0, 1
xvsubi.wu xr1, xr1, 1
xvxor.v xr3, xr3, xr3
xvsrari.w xr17, xr17, 3
xvsrari.w xr18, xr18, 3
xvsrari.w xr19, xr19, 3
xvsrari.w xr20, xr20, 3
xvclip.w xr17, xr17, xr3, xr1
xvclip.w xr18, xr18, xr3, xr1
xvclip.w xr19, xr19, xr3, xr1
xvclip.w xr20, xr20, xr3, xr1
xvor.v xr5, xr17, xr17
xvor.v xr6, xr19, xr19
xvpermi.q xr17, xr18, 0b00000010
xvpermi.q xr19, xr20, 0b00000010
xvst xr17, t2, 0
xvst xr19, t2, 32
xvpermi.q xr18, xr5, 0b00110001
xvpermi.q xr20, xr6, 0b00110001
xvst xr18, t2, 64
xvst xr20, t2, 96
addi.d t1, t1, 32
addi.d t2, t2, 128
blt zero, t0, .WIENER_FILTER_H_W_LASX
addi.d a1, a1, REST_UNIT_STRIDE
addi.d a0, a0, (REST_UNIT_STRIDE << 2)
bnez a4, .WIENER_FILTER_H_H_LASX
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
fld.d f28, sp, 32
addi.d sp, sp, 40
endfunc
.macro APPLY_FILTER_LASX in0, in1, in2
alsl.d t7, \in0, \in1, 2
xvld xr10, t7, 0
xvld xr12, t7, 32
xvmadd.w xr14, xr10, \in2
xvmadd.w xr16, xr12, \in2
.endm
.macro wiener_filter_v_8bpc_core_lasx
xvreplgr2vr.w xr14, t6
xvreplgr2vr.w xr16, t6
addi.w t7, t2, 0 // j + index k
mul.w t7, t7, t8 // (j + index) * REST_UNIT_STRIDE
add.w t7, t7, t4 // (j + index) * REST_UNIT_STRIDE + i
APPLY_FILTER_LASX t7, a2, xr2
APPLY_FILTER_LASX t8, t7, xr3
APPLY_FILTER_LASX t8, t7, xr4
APPLY_FILTER_LASX t8, t7, xr5
APPLY_FILTER_LASX t8, t7, xr6
APPLY_FILTER_LASX t8, t7, xr7
APPLY_FILTER_LASX t8, t7, xr8
xvssrarni.hu.w xr16, xr14, 11
xvpermi.d xr17, xr16, 0b11011000
xvssrlni.bu.h xr17, xr17, 0
xvpermi.d xr17, xr17, 0b00001000
.endm
function wiener_filter_v_8bpc_lasx
li.w t6, -(1 << 18)
li.w t8, REST_UNIT_STRIDE
ld.h t0, a3, 0
ld.h t1, a3, 2
xvreplgr2vr.w xr2, t0
xvreplgr2vr.w xr3, t1
ld.h t0, a3, 4
ld.h t1, a3, 6
xvreplgr2vr.w xr4, t0
xvreplgr2vr.w xr5, t1
ld.h t0, a3, 8
ld.h t1, a3, 10
xvreplgr2vr.w xr6, t0
xvreplgr2vr.w xr7, t1
ld.h t0, a3, 12
xvreplgr2vr.w xr8, t0
andi t1, a4, 0xf
sub.w t0, a4, t1 // w-w%16
or t2, zero, zero // j
or t4, zero, zero
beqz t0, .WIENER_FILTER_V_W_LT16_LASX
.WIENER_FILTER_V_H_LASX:
andi t1, a4, 0xf
add.d t3, zero, a0 // p
or t4, zero, zero // i
.WIENER_FILTER_V_W_LASX:
wiener_filter_v_8bpc_core_lasx
mul.w t5, t2, a1 // j * stride
add.w t5, t5, t4 // j * stride + i
add.d t3, a0, t5
addi.w t4, t4, 16
vst vr17, t3, 0
bne t0, t4, .WIENER_FILTER_V_W_LASX
beqz t1, .WIENER_FILTER_V_W_EQ16_LASX
wiener_filter_v_8bpc_core_lsx
addi.d t3, t3, 16
andi t1, a4, 0xf
.WIENER_FILTER_V_ST_REM_LASX:
vstelm.b vr17, t3, 0, 0
vbsrl.v vr17, vr17, 1
addi.d t3, t3, 1
addi.w t1, t1, -1
bnez t1, .WIENER_FILTER_V_ST_REM_LASX
.WIENER_FILTER_V_W_EQ16_LASX:
addi.w t2, t2, 1
blt t2, a5, .WIENER_FILTER_V_H_LASX
b .WIENER_FILTER_V_LASX_END
.WIENER_FILTER_V_W_LT16_LASX:
andi t1, a4, 0xf
add.d t3, zero, a0
wiener_filter_v_8bpc_core_lsx
mul.w t5, t2, a1 // j * stride
add.d t3, a0, t5
.WIENER_FILTER_V_ST_REM_1_LASX:
vstelm.b vr17, t3, 0, 0
vbsrl.v vr17, vr17, 1
addi.d t3, t3, 1
addi.w t1, t1, -1
bnez t1, .WIENER_FILTER_V_ST_REM_1_LASX
addi.w t2, t2, 1
blt t2, a5, .WIENER_FILTER_V_W_LT16_LASX
.WIENER_FILTER_V_LASX_END:
endfunc
function boxsum3_sgf_h_8bpc_lasx
addi.d a0, a0, (REST_UNIT_STRIDE<<2)+12 // AA
//addi.d a0, a0, 12 // AA
addi.d a1, a1, (REST_UNIT_STRIDE<<1)+6 // BB
//addi.d a1, a1, 6 // BB
la.local t8, dav1d_sgr_x_by_x
li.w t6, 455
xvreplgr2vr.w xr20, t6
li.w t6, 255
xvreplgr2vr.w xr22, t6
xvaddi.wu xr21, xr22, 1 // 256
xvreplgr2vr.w xr6, a4
xvldi xr19, 0x809
addi.w a2, a2, 2 // w + 2
addi.w a3, a3, 2 // h + 2
.LBS3SGF_H_H_LASX:
addi.w t2, a2, 0
addi.d t0, a0, -4
addi.d t1, a1, -2
.LBS3SGF_H_W_LASX:
addi.w t2, t2, -16
xvld xr0, t0, 0 // AA[i]
xvld xr1, t0, 32
xvld xr2, t1, 0 // BB[i]
xvmul.w xr4, xr0, xr19 // a * n
xvmul.w xr5, xr1, xr19
vext2xv.w.h xr9, xr2
xvpermi.q xr10, xr2, 0b00000001
vext2xv.w.h xr10, xr10
xvmsub.w xr4, xr9, xr9 // p
xvmsub.w xr5, xr10, xr10
xvmaxi.w xr4, xr4, 0
xvmaxi.w xr5, xr5, 0
xvmul.w xr4, xr4, xr6 // p * s
xvmul.w xr5, xr5, xr6
xvsrlri.w xr4, xr4, 20
xvsrlri.w xr5, xr5, 20
xvmin.w xr4, xr4, xr22
xvmin.w xr5, xr5, xr22
vpickve2gr.w t6, vr4, 0
ldx.bu t7, t8, t6
vinsgr2vr.w vr7, t7, 0
vpickve2gr.w t6, vr4, 1
ldx.bu t7, t8, t6
vinsgr2vr.w vr7, t7, 1
vpickve2gr.w t6, vr4, 2
ldx.bu t7, t8, t6
vinsgr2vr.w vr7, t7, 2
vpickve2gr.w t6, vr4, 3
ldx.bu t7, t8, t6
vinsgr2vr.w vr7, t7, 3
xvpickve2gr.w t6, xr4, 4
ldx.bu t7, t8, t6
xvinsgr2vr.w xr7, t7, 4
xvpickve2gr.w t6, xr4, 5
ldx.bu t7, t8, t6
xvinsgr2vr.w xr7, t7, 5
xvpickve2gr.w t6, xr4, 6
ldx.bu t7, t8, t6
xvinsgr2vr.w xr7, t7, 6
xvpickve2gr.w t6, xr4, 7
ldx.bu t7, t8, t6
xvinsgr2vr.w xr7, t7, 7 // x
vpickve2gr.w t6, vr5, 0
ldx.bu t7, t8, t6
vinsgr2vr.w vr8, t7, 0
vpickve2gr.w t6, vr5, 1
ldx.bu t7, t8, t6
vinsgr2vr.w vr8, t7, 1
vpickve2gr.w t6, vr5, 2
ldx.bu t7, t8, t6
vinsgr2vr.w vr8, t7, 2
vpickve2gr.w t6, vr5, 3
ldx.bu t7, t8, t6
vinsgr2vr.w vr8, t7, 3
xvpickve2gr.w t6, xr5, 4
ldx.bu t7, t8, t6
xvinsgr2vr.w xr8, t7, 4
xvpickve2gr.w t6, xr5, 5
ldx.bu t7, t8, t6
xvinsgr2vr.w xr8, t7, 5
xvpickve2gr.w t6, xr5, 6
ldx.bu t7, t8, t6
xvinsgr2vr.w xr8, t7, 6
xvpickve2gr.w t6, xr5, 7
ldx.bu t7, t8, t6
xvinsgr2vr.w xr8, t7, 7 // x
xvmul.w xr9, xr7, xr9 // x * BB[i]
xvmul.w xr10, xr8, xr10
xvmul.w xr9, xr9, xr20 // x * BB[i] * sgr_one_by_x
xvmul.w xr10, xr10, xr20
xvsrlri.w xr9, xr9, 12
xvsrlri.w xr10, xr10, 12
xvsub.w xr7, xr21, xr7
xvsub.w xr8, xr21, xr8
xvpickev.h xr12, xr8, xr7
xvpermi.d xr11, xr12, 0b11011000
xvst xr9, t0, 0
xvst xr10, t0, 32
xvst xr11, t1, 0
addi.d t0, t0, 64
addi.d t1, t1, 32
blt zero, t2, .LBS3SGF_H_W_LASX
addi.d a0, a0, REST_UNIT_STRIDE<<2
addi.d a1, a1, REST_UNIT_STRIDE<<1
addi.w a3, a3, -1
bnez a3, .LBS3SGF_H_H_LASX
endfunc
function boxsum3_h_8bpc_lasx
addi.d a2, a2, REST_UNIT_STRIDE
li.w t0, 1
addi.w a3, a3, -2
addi.w a4, a4, -4
.LBS3_H_H_LASX:
alsl.d t1, t0, a1, 1 // coef *sum_v = sum + x
alsl.d t2, t0, a0, 2 // int32_t *sumsq_v = sumsq + x
add.d t3, t0, a2 // s
addi.w t5, a3, 0
.LBS3_H_W_LASX:
xvld xr0, t3, 0
xvld xr1, t3, REST_UNIT_STRIDE
xvld xr2, t3, (REST_UNIT_STRIDE<<1)
xvilvl.b xr3, xr1, xr0
xvhaddw.hu.bu xr4, xr3, xr3
xvilvh.b xr5, xr1, xr0
xvhaddw.hu.bu xr6, xr5, xr5
xvsllwil.hu.bu xr7, xr2, 0
xvexth.hu.bu xr8, xr2
// sum_v
xvadd.h xr4, xr4, xr7 // 0 2
xvadd.h xr6, xr6, xr8 // 1 3
xvor.v xr9, xr4, xr4
xvpermi.q xr4, xr6, 0b00000010
xvpermi.q xr6, xr9, 0b00110001
xvst xr4, t1, REST_UNIT_STRIDE<<1
xvst xr6, t1, (REST_UNIT_STRIDE<<1)+32
addi.d t1, t1, 64
// sumsq
xvmulwev.h.bu xr9, xr3, xr3
xvmulwod.h.bu xr10, xr3, xr3
xvmulwev.h.bu xr11, xr5, xr5
xvmulwod.h.bu xr12, xr5, xr5
xvaddwev.w.hu xr13, xr10, xr9
xvaddwod.w.hu xr14, xr10, xr9
xvaddwev.w.hu xr15, xr12, xr11
xvaddwod.w.hu xr16, xr12, xr11
xvmaddwev.w.hu xr13, xr7, xr7
xvmaddwod.w.hu xr14, xr7, xr7
xvmaddwev.w.hu xr15, xr8, xr8
xvmaddwod.w.hu xr16, xr8, xr8
xvilvl.w xr9, xr14, xr13
xvilvh.w xr10, xr14, xr13
xvilvl.w xr11, xr16, xr15
xvilvh.w xr12, xr16, xr15
xvor.v xr7, xr9, xr9
xvor.v xr8, xr11, xr11
xvpermi.q xr9, xr10, 0b00000010
xvpermi.q xr10, xr7, 0b00110001
xvpermi.q xr11, xr12, 0b00000010
xvpermi.q xr12, xr8, 0b00110001
xvst xr9, t2, REST_UNIT_STRIDE<<2
xvst xr11, t2, (REST_UNIT_STRIDE<<2)+32
xvst xr10, t2, (REST_UNIT_STRIDE<<2)+64
xvst xr12, t2, (REST_UNIT_STRIDE<<2)+96
addi.d t2, t2, 128
addi.w t5, t5, -32
addi.d t3, t3, 32
blt zero, t5, .LBS3_H_W_LASX
addi.d a0, a0, REST_UNIT_STRIDE<<2
addi.d a1, a1, REST_UNIT_STRIDE<<1
addi.d a2, a2, REST_UNIT_STRIDE
addi.d a4, a4, -1
blt zero, a4, .LBS3_H_H_LASX
endfunc