/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/loongarch/loongson_asm.S"
/*
static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
const pixel *src, const ptrdiff_t src_stride,
const int16_t *const abcd, int mx, int my
HIGHBD_DECL_SUFFIX)
*/
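/*
A rough C sketch of the two-pass filtering that the LSX/LASX code below
vectorizes (8 bpc, so intermediate_bits = 4; constants are taken from the
shifts used in the assembly, and dav1d_mc_warp_filter stores 8 signed taps per
entry). The prep ("t") variant only differs in the final shift (7 instead of
11) and in writing 16-bit instead of clipped 8-bit output:

    int16_t mid[15 * 8], *mid_ptr = mid;
    src -= 3 * src_stride + 3;
    for (int y = 0; y < 15; y++, mx += abcd[1], src += src_stride, mid_ptr += 8)
        for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
            const int8_t *const f = dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
            int sum = 0;
            for (int k = 0; k < 8; k++) sum += src[x + k] * f[k];
            mid_ptr[x] = (sum + 4) >> 3;          // horizontal pass
        }
    for (int y = 0; y < 8; y++, my += abcd[3], dst += dst_stride)
        for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
            const int8_t *const f = dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
            int sum = 0;
            for (int k = 0; k < 8; k++) sum += mid[(y + k) * 8 + x] * f[k];
            sum = (sum + 1024) >> 11;             // put; prep uses (sum + 64) >> 7
            dst[x] = sum < 0 ? 0 : sum > 255 ? 255 : sum;
        }

The vld_filter_row macro below performs the
dav1d_mc_warp_filter[64 + ((ofs + 512) >> 10)] lookup (8 bytes per entry, t4
holds the table base) and then advances the fractional position by \inc.
*/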
.macro vld_filter_row dst, src, inc
addi.w t3, \src, 512
srai.w t3, t3, 10
add.w \src, \src, \inc
addi.w t3, t3, 64
slli.w t3, t3, 3
fldx.d \dst, t4, t3
.endm
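// warp_filter_horz_lsx computes one row of the horizontal pass: the 8-tap
// filter at 8 adjacent x positions. The source bytes are XORed with 0x80
// (vr20) so that unsigned pixels can go through signed byte multiplies; as
// the 8 taps sum to 128 this only adds a constant bias, which is compensated
// by the later vr21 addition in the vertical pass (128 for put, 2048 for
// prep). The macro also advances the source pointer (a2) by src_stride and
// mx (a5) by abcd[1].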
.macro warp_filter_horz_lsx
addi.w t5, a5, 0
vld vr10, a2, 0
add.d a2, a2, a3
vld_filter_row f0, t5, t0
vld_filter_row f1, t5, t0
vld_filter_row f2, t5, t0
vld_filter_row f3, t5, t0
vld_filter_row f4, t5, t0
vld_filter_row f5, t5, t0
vld_filter_row f6, t5, t0
vld_filter_row f7, t5, t0
vxor.v vr10, vr10, vr20
vbsrl.v vr8, vr10, 1
vbsrl.v vr9, vr10, 2
vilvl.d vr8, vr8, vr10
vilvl.d vr0, vr1, vr0
vmulwev.h.b vr11, vr8, vr0
vmulwod.h.b vr12, vr8, vr0
vbsrl.v vr8, vr10, 3
vbsrl.v vr19, vr10, 4
vilvl.d vr8, vr8, vr9
vilvl.d vr2, vr3, vr2
vmulwev.h.b vr13, vr8, vr2
vmulwod.h.b vr14, vr8, vr2
vbsrl.v vr8, vr10, 5
vbsrl.v vr9, vr10, 6
vilvl.d vr8, vr8, vr19
vilvl.d vr4, vr5, vr4
vmulwev.h.b vr15, vr8, vr4
vmulwod.h.b vr16, vr8, vr4
vbsrl.v vr8, vr10, 7
vilvl.d vr8, vr8, vr9
vilvl.d vr6, vr7, vr6
vmulwev.h.b vr17, vr8, vr6
vmulwod.h.b vr18, vr8, vr6
vadd.h vr11, vr11, vr12
vadd.h vr13, vr13, vr14
vadd.h vr15, vr15, vr16
vadd.h vr17, vr17, vr18
vpickev.h vr12, vr13, vr11
vpickod.h vr14, vr13, vr11
vpickev.h vr16, vr17, vr15
vpickod.h vr18, vr17, vr15
vadd.h vr11, vr12, vr14
vadd.h vr15, vr16, vr18
vpickev.h vr12, vr15, vr11
vpickod.h vr14, vr15, vr11
vadd.h vr11, vr12, vr14
add.d a5, a5, t1
.endm
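// Transpose an 8x8 byte matrix held in \in0..\in7 and sign-extend the result
// to 16-bit lanes; the warp kernels use this to turn the eight per-column
// vertical filters into per-tap vectors.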
.macro transpose_8x8b_extend_lsx in0, in1, in2, in3, in4, in5, in6, in7
vilvl.b \in0, \in1, \in0
vilvl.b \in2, \in3, \in2
vilvl.b \in4, \in5, \in4
vilvl.b \in6, \in7, \in6
vpackev.h \in1, \in2, \in0
vpackod.h \in3, \in2, \in0
vpackev.h \in5, \in6, \in4
vpackod.h \in7, \in6, \in4
vpackev.w \in0, \in5, \in1
vpackod.w \in2, \in5, \in1
vpackev.w \in1, \in7, \in3
vpackod.w \in3, \in7, \in3
vexth.h.b \in4, \in0
vsllwil.h.b \in0, \in0, 0
vexth.h.b \in5, \in1
vsllwil.h.b \in1, \in1, 0
vexth.h.b \in6, \in2
vsllwil.h.b \in2, \in2, 0
vexth.h.b \in7, \in3
vsllwil.h.b \in3, \in3, 0
.endm
.macro warp t, shift
function warp_affine_8x8\t\()_8bpc_lsx
addi.d sp, sp, -64
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
fst.d f28, sp, 32
fst.d f29, sp, 40
fst.d f30, sp, 48
fst.d f31, sp, 56
ld.h t0, a4, 0
ld.h t1, a4, 2
ld.h t2, a4, 4
ld.h a4, a4, 6
li.d t7, 8
alsl.w t3, a3, a3, 1
sub.d a2, a2, t3
addi.d a2, a2, -3
la.local t4, dav1d_mc_warp_filter
.ifnb \t
slli.d a1, a1, 1
.endif
li.w t3, 128
vreplgr2vr.b vr20, t3
.ifb \t
vreplgr2vr.h vr21, t3
.else
li.w t3, 2048
vreplgr2vr.h vr21, t3
.endif
warp_filter_horz_lsx
vsrari.h vr24, vr11, 3
warp_filter_horz_lsx
vsrari.h vr25, vr11, 3
warp_filter_horz_lsx
vsrari.h vr26, vr11, 3
warp_filter_horz_lsx
vsrari.h vr27, vr11, 3
warp_filter_horz_lsx
vsrari.h vr28, vr11, 3
warp_filter_horz_lsx
vsrari.h vr29, vr11, 3
warp_filter_horz_lsx
vsrari.h vr30, vr11, 3
1:
addi.d t6, a6, 0
warp_filter_horz_lsx
vsrari.h vr31, vr11, 3
vld_filter_row f0, t6, t2
vld_filter_row f1, t6, t2
vld_filter_row f2, t6, t2
vld_filter_row f3, t6, t2
vld_filter_row f4, t6, t2
vld_filter_row f5, t6, t2
vld_filter_row f6, t6, t2
vld_filter_row f7, t6, t2
transpose_8x8b_extend_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
vmulwev.w.h vr16, vr24, vr0
vmulwod.w.h vr17, vr24, vr0
vmaddwev.w.h vr16, vr25, vr1
vmaddwod.w.h vr17, vr25, vr1
vmaddwev.w.h vr16, vr26, vr2
vmaddwod.w.h vr17, vr26, vr2
vmaddwev.w.h vr16, vr27, vr3
vmaddwod.w.h vr17, vr27, vr3
vmaddwev.w.h vr16, vr28, vr4
vmaddwod.w.h vr17, vr28, vr4
vmaddwev.w.h vr16, vr29, vr5
vmaddwod.w.h vr17, vr29, vr5
vmaddwev.w.h vr16, vr30, vr6
vmaddwod.w.h vr17, vr30, vr6
vmaddwev.w.h vr16, vr31, vr7
vmaddwod.w.h vr17, vr31, vr7
vssrarni.h.w vr16, vr16, \shift
vssrarni.h.w vr17, vr17, \shift
vilvl.h vr16, vr17, vr16
vadd.h vr16, vr16, vr21
vor.v vr24, vr25, vr25
vor.v vr25, vr26, vr26
vor.v vr26, vr27, vr27
vor.v vr27, vr28, vr28
vor.v vr28, vr29, vr29
vor.v vr29, vr30, vr30
vor.v vr30, vr31, vr31
.ifb \t
vssrarni.bu.h vr16, vr16, 0
.endif
addi.d t7, t7, -1
.ifnb \t
vst vr16, a0, 0
.else
vstelm.d vr16, a0, 0, 0
.endif
add.d a0, a1, a0
add.d a6, a6, a4
blt zero, t7, 1b
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
fld.d f28, sp, 32
fld.d f29, sp, 40
fld.d f30, sp, 48
fld.d f31, sp, 56
addi.d sp, sp, 64
endfunc
.endm
warp , 11
warp t, 7
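// "warp , 11" instantiates the put variant (warp_affine_8x8_8bpc_lsx: final
// shift 11, clipped 8-bit output) and "warp t, 7" the prep variant
// (warp_affine_8x8t_8bpc_lsx: final shift 7, 16-bit output, dst stride given
// in elements and doubled to bytes). The LASX version below follows the same
// two-pass structure: FILTER_WARP_RND_P_LASX evaluates the horizontal filter
// for several x positions per call and FILTER_WARP_CLIP_LASX applies the
// vertical filter.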
.macro FILTER_WARP_RND_P_LASX in0, in1, in2, out0, out1, out2, out3
xvshuf.b xr2, \in0, \in0, \in2
addi.w t4, \in1, 512
srai.w t4, t4, 10
addi.w t4, t4, 64
slli.w t4, t4, 3
vldx vr3, t5, t4
add.w t3, t3, t0 // tmx += abcd[0]
addi.w t4, t3, 512
srai.w t4, t4, 10
addi.w t4, t4, 64
slli.w t4, t4, 3
vldx vr4, t5, t4
add.w t3, t3, t0 // tmx += abcd[0]
addi.w t4, t3, 512
srai.w t4, t4, 10
addi.w t4, t4, 64
slli.w t4, t4, 3
vldx vr5, t5, t4
add.w t3, t3, t0 // tmx += abcd[0]
addi.w t4, t3, 512
srai.w t4, t4, 10
addi.w t4, t4, 64
slli.w t4, t4, 3
vldx vr6, t5, t4
add.w t3, t3, t0 // tmx += abcd[0]
xvinsve0.d xr3, xr5, 1
xvinsve0.d xr3, xr4, 2
xvinsve0.d xr3, xr6, 3
xvmulwev.h.bu.b xr4, xr2, xr3
xvmulwod.h.bu.b xr5, xr2, xr3
xvilvl.d xr2, xr5, xr4
xvilvh.d xr3, xr5, xr4
xvhaddw.w.h xr2, xr2, xr2
xvhaddw.w.h xr3, xr3, xr3
xvhaddw.d.w xr2, xr2, xr2
xvhaddw.d.w xr3, xr3, xr3
xvhaddw.q.d xr2, xr2, xr2
xvhaddw.q.d xr3, xr3, xr3
xvextrins.w \out0, xr2, \out1
xvextrins.w \out2, xr3, \out3
.endm
.macro FILTER_WARP_CLIP_LASX in0, in1, in2, out0, out1
add.w \in0, \in0, \in1
addi.w t6, \in0, 512
srai.w t6, t6, 10
addi.w t6, t6, 64
slli.w t6, t6, 3
fldx.d f1, t5, t6
add.w t2, t2, t7
addi.w t6, t2, 512
srai.w t6, t6, 10
addi.w t6, t6, 64
slli.w t6, t6, 3
fldx.d f2, t5, t6
vilvl.d vr0, vr2, vr1
vext2xv.h.b xr0, xr0
xvmulwev.w.h xr3, \in2, xr0
xvmaddwod.w.h xr3, \in2, xr0
xvhaddw.d.w xr3, xr3, xr3
xvhaddw.q.d xr3, xr3, xr3
xvextrins.w \out0, xr3, \out1
.endm
const shuf0
.byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
.byte 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10
endconst
const warp_sh
.rept 2
.byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
.endr
.rept 2
.byte 18, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.endr
endconst
.macro warp_lasx t, shift
function warp_affine_8x8\t\()_8bpc_lasx
addi.d sp, sp, -16
ld.h t0, a4, 0 // abcd[0]
ld.h t1, a4, 2 // abcd[1]
fst.d f24, sp, 0
fst.d f25, sp, 8
alsl.w t2, a3, a3, 1
addi.w t3, a5, 0
la.local t4, warp_sh
la.local t5, dav1d_mc_warp_filter
sub.d a2, a2, t2
addi.d a2, a2, -3
vld vr0, a2, 0
xvld xr24, t4, 0
xvld xr25, t4, 32
la.local t2, shuf0
xvld xr1, t2, 0
xvpermi.q xr0, xr0, 0x00
xvaddi.bu xr9, xr1, 4
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x00, xr13, 0x00
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x00, xr15, 0x00
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x10, xr13, 0x10
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x10, xr15, 0x10
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x20, xr13, 0x20
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x20, xr15, 0x20
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x30, xr13, 0x30
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x30, xr15, 0x30
xvsrarni.h.w xr12, xr7, 3
xvsrarni.h.w xr13, xr8, 3
xvsrarni.h.w xr14, xr10, 3
xvsrarni.h.w xr15, xr11, 3
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x00, xr17, 0x00
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x00, xr19, 0x00
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x10, xr17, 0x10
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x10, xr19, 0x10
add.w a5, a5, t1
or t3, a5, a5
add.d a2, a2, a3
vld vr0, a2, 0
xvpermi.q xr0, xr0, 0x00
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x20, xr17, 0x20
FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x20, xr19, 0x20
xvsrarni.h.w xr16, xr7, 3
xvsrarni.h.w xr17, xr8, 3
xvsrarni.h.w xr18, xr10, 3
xvsrarni.h.w xr19, xr11, 3
addi.w t2, a6, 0 // my
ld.h t7, a4, 4 // abcd[2]
ld.h t8, a4, 6 // abcd[3]
.ifnb \t
slli.d a1, a1, 1
.endif
// y = 0
FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
xvshuf.b xr12, xr16, xr12, xr24
xvshuf.b xr13, xr17, xr13, xr24
xvshuf.b xr14, xr18, xr14, xr24
xvshuf.b xr15, xr19, xr15, xr24
xvextrins.h xr24, xr25, 0x70
add.w a6, a6, t8
addi.w t2, a6, 0
FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
.ifnb \t
xvssrarni.h.w xr21, xr20, \shift
xvpermi.q xr22, xr21, 0x01
vilvl.h vr23, vr22, vr21
vilvh.h vr21, vr22, vr21
vst vr23, a0, 0
vstx vr21, a0, a1
.else
xvssrarni.hu.w xr21, xr20, \shift
xvssrlni.bu.h xr22, xr21, 0
xvpermi.q xr23, xr22, 0x01
vilvl.b vr21, vr23, vr22
fst.d f21, a0, 0
add.d a0, a0, a1
vstelm.d vr21, a0, 0, 1
.endif
xvaddi.bu xr25, xr25, 2
xvshuf.b xr12, xr16, xr12, xr24
xvshuf.b xr13, xr17, xr13, xr24
xvshuf.b xr14, xr18, xr14, xr24
xvshuf.b xr15, xr19, xr15, xr24
xvextrins.h xr24, xr25, 0x70
add.w a6, a6, t8
addi.w t2, a6, 0
FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
xvaddi.bu xr25, xr25, 2
xvshuf.b xr12, xr16, xr12, xr24
xvshuf.b xr13, xr17, xr13, xr24
xvshuf.b xr14, xr18, xr14, xr24
xvshuf.b xr15, xr19, xr15, xr24
xvextrins.h xr24, xr25, 0x70
add.w a6, a6, t8
addi.w t2, a6, 0
FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
.ifnb \t
xvssrarni.h.w xr21, xr20, \shift
alsl.d a0, a1, a0, 1
xvpermi.q xr22, xr21, 0x01
vilvl.h vr23, vr22, vr21
vilvh.h vr21, vr22, vr21
vst vr23, a0, 0
vstx vr21, a0, a1
.else
xvssrarni.hu.w xr21, xr20, \shift
xvssrlni.bu.h xr22, xr21, 0
xvpermi.q xr23, xr22, 0x01
vilvl.b vr21, vr23, vr22
add.d a0, a0, a1
fst.d f21, a0, 0
add.d a0, a0, a1
vstelm.d vr21, a0, 0, 1
.endif
xvaddi.bu xr25, xr25, 2
xvshuf.b xr12, xr16, xr12, xr24
xvshuf.b xr13, xr17, xr13, xr24
xvshuf.b xr14, xr18, xr14, xr24
xvshuf.b xr15, xr19, xr15, xr24
xvextrins.h xr24, xr25, 0x70
add.w a6, a6, t8
addi.w t2, a6, 0
FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
xvaddi.bu xr25, xr25, 2
xvshuf.b xr12, xr16, xr12, xr24
xvshuf.b xr13, xr17, xr13, xr24
xvshuf.b xr14, xr18, xr14, xr24
xvshuf.b xr15, xr19, xr15, xr24
xvextrins.h xr24, xr25, 0x70
add.w a6, a6, t8
addi.w t2, a6, 0
FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
.ifnb \t
xvssrarni.h.w xr21, xr20, \shift
alsl.d a0, a1, a0, 1
xvpermi.q xr22, xr21, 0x01
vilvl.h vr23, vr22, vr21
vilvh.h vr21, vr22, vr21
vst vr23, a0, 0
vstx vr21, a0, a1
.else
xvssrarni.hu.w xr21, xr20, \shift
xvssrlni.bu.h xr22, xr21, 0
xvpermi.q xr23, xr22, 0x01
vilvl.b vr21, vr23, vr22
add.d a0, a0, a1
fst.d f21, a0, 0
add.d a0, a0, a1
vstelm.d vr21, a0, 0, 1
.endif
xvaddi.bu xr25, xr25, 2
xvshuf.b xr12, xr16, xr12, xr24
xvshuf.b xr13, xr17, xr13, xr24
xvshuf.b xr14, xr18, xr14, xr24
xvshuf.b xr15, xr19, xr15, xr24
xvextrins.h xr24, xr25, 0x70
add.w a6, a6, t8
addi.w t2, a6, 0
FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00
FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10
FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20
FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30
xvshuf.b xr12, xr16, xr12, xr24
xvshuf.b xr13, xr17, xr13, xr24
xvshuf.b xr14, xr18, xr14, xr24
xvshuf.b xr15, xr19, xr15, xr24
add.w a6, a6, t8
addi.w t2, a6, 0
FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00
FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10
FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20
FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30
.ifnb \t
xvssrarni.h.w xr21, xr20, \shift
alsl.d a0, a1, a0, 1
xvpermi.q xr22, xr21, 0x01
vilvl.h vr23, vr22, vr21
vilvh.h vr21, vr22, vr21
vst vr23, a0, 0
vstx vr21, a0, a1
.else
xvssrarni.hu.w xr21, xr20, \shift
xvssrlni.bu.h xr22, xr21, 0
xvpermi.q xr23, xr22, 0x01
vilvl.b vr21, vr23, vr22
add.d a0, a0, a1
fst.d f21, a0, 0
add.d a0, a0, a1
vstelm.d vr21, a0, 0, 1
.endif
fld.d f24, sp, 0
fld.d f25, sp, 8
addi.d sp, sp, 16
endfunc
.endm
warp_lasx , 11
warp_lasx t, 7
/*
static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
const int16_t *tmp1, const int16_t *tmp2,
const int w, int h,
const int weight HIGHBD_DECL_SUFFIX)
*/
#define bpc8_sh 5 // sh = intermediate_bits + 1
#define bpcw8_sh 8 // sh = intermediate_bits + 4
#define bpc_sh bpc8_sh
#define bpcw_sh bpcw8_sh
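/*
Both avg_c (same arguments, minus the weight) and w_avg_c are implemented
below in LSX and LASX versions. A minimal C sketch of the 8 bpc math, assuming
intermediate_bits = 4 (so bpc_sh = 5 and bpcw_sh = 8; the rounding constants
16 and 128 are what the rounding shifts in vssrarni add implicitly, and
clip_u8 and w_avg_ref are illustrative names; needs <stdint.h>/<stddef.h>):

    static inline uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    static void w_avg_ref(uint8_t *dst, ptrdiff_t dst_stride,
                          const int16_t *tmp1, const int16_t *tmp2,
                          int w, int h, int weight)
    {
        do {
            // weight == 8 reduces to the plain avg: (tmp1 + tmp2 + 16) >> 5
            for (int x = 0; x < w; x++)
                dst[x] = clip_u8((tmp1[x] * weight + tmp2[x] * (16 - weight) + 128) >> 8);
            tmp1 += w; tmp2 += w; dst += dst_stride;
        } while (--h);
    }

The width is dispatched through a jump table indexed by clz(w) - 24, which
maps w = 128, 64, 32, 16, 8, 4 to table entries 0..5.
*/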
function avg_8bpc_lsx
addi.d t8, a0, 0
clz.w t0, a4
li.w t1, 24
sub.w t0, t0, t1
la.local t1, .AVG_LSX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t2, t0, 0 // The jump addresses are relative to AVG_LSX_JRTABLE
add.d t1, t1, t2 // Get absolute address
jirl $r0, t1, 0
.align 3
.AVG_LSX_JRTABLE:
.hword .AVG_W128_LSX - .AVG_LSX_JRTABLE
.hword .AVG_W64_LSX - .AVG_LSX_JRTABLE
.hword .AVG_W32_LSX - .AVG_LSX_JRTABLE
.hword .AVG_W16_LSX - .AVG_LSX_JRTABLE
.hword .AVG_W8_LSX - .AVG_LSX_JRTABLE
.hword .AVG_W4_LSX - .AVG_LSX_JRTABLE
.AVG_W4_LSX:
vld vr0, a2, 0
vld vr1, a3, 0
vadd.h vr2, vr0, vr1
vssrarni.bu.h vr3, vr2, bpc_sh
vstelm.w vr3, a0, 0, 0
add.d a0, a0, a1
vstelm.w vr3, a0, 0, 1
addi.w a5, a5, -2
addi.d a2, a2, 16
addi.d a3, a3, 16
add.d a0, a0, a1
blt zero, a5, .AVG_W4_LSX
b .AVG_END_LSX
.AVG_W8_LSX:
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vadd.h vr4, vr0, vr1
vadd.h vr5, vr2, vr3
vssrarni.bu.h vr5, vr4, bpc_sh
addi.w a5, a5, -2
addi.d a2, a2, 32
vstelm.d vr5, a0, 0, 0
add.d a0, a0, a1
vstelm.d vr5, a0, 0, 1
addi.d a3, a3, 32
add.d a0, a0, a1
blt zero, a5, .AVG_W8_LSX
b .AVG_END_LSX
.AVG_W16_LSX:
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vadd.h vr4, vr0, vr1
vadd.h vr5, vr2, vr3
vssrarni.bu.h vr5, vr4, bpc_sh
addi.w a5, a5, -1
addi.d a2, a2, 32
vst vr5, a0, 0
addi.d a3, a3, 32
add.d a0, a0, a1
blt zero, a5, .AVG_W16_LSX
b .AVG_END_LSX
.AVG_W32_LSX:
vld vr0, a2, 0
vld vr2, a2, 16
vld vr4, a2, 32
vld vr6, a2, 48
vld vr1, a3, 0
vld vr3, a3, 16
vld vr5, a3, 32
vld vr7, a3, 48
vadd.h vr0, vr0, vr1
vadd.h vr2, vr2, vr3
vadd.h vr4, vr4, vr5
vadd.h vr6, vr6, vr7
vssrarni.bu.h vr2, vr0, bpc_sh
vssrarni.bu.h vr6, vr4, bpc_sh
addi.w a5, a5, -1
addi.d a2, a2, 64
vst vr2, a0, 0
vst vr6, a0, 16
addi.d a3, a3, 64
add.d a0, a0, a1
blt zero, a5, .AVG_W32_LSX
b .AVG_END_LSX
.AVG_W64_LSX:
.rept 4
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vadd.h vr0, vr0, vr1
vadd.h vr2, vr2, vr3
vssrarni.bu.h vr2, vr0, bpc_sh
addi.d a2, a2, 32
addi.d a3, a3, 32
vst vr2, a0, 0
addi.d a0, a0, 16
.endr
addi.w a5, a5, -1
add.d t8, t8, a1
add.d a0, t8, zero
blt zero, a5, .AVG_W64_LSX
b .AVG_END_LSX
.AVG_W128_LSX:
.rept 8
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vadd.h vr0, vr0, vr1
vadd.h vr2, vr2, vr3
vssrarni.bu.h vr2, vr0, bpc_sh
addi.d a2, a2, 32
addi.d a3, a3, 32
vst vr2, a0, 0
addi.d a0, a0, 16
.endr
addi.w a5, a5, -1
add.d t8, t8, a1
add.d a0, t8, zero
blt zero, a5, .AVG_W128_LSX
.AVG_END_LSX:
endfunc
function avg_8bpc_lasx
clz.w t0, a4
li.w t1, 24
sub.w t0, t0, t1
la.local t1, .AVG_LASX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t2, t0, 0
add.d t1, t1, t2
jirl $r0, t1, 0
.align 3
.AVG_LASX_JRTABLE:
.hword .AVG_W128_LASX - .AVG_LASX_JRTABLE
.hword .AVG_W64_LASX - .AVG_LASX_JRTABLE
.hword .AVG_W32_LASX - .AVG_LASX_JRTABLE
.hword .AVG_W16_LASX - .AVG_LASX_JRTABLE
.hword .AVG_W8_LASX - .AVG_LASX_JRTABLE
.hword .AVG_W4_LASX - .AVG_LASX_JRTABLE
.AVG_W4_LASX:
vld vr0, a2, 0
vld vr1, a3, 0
vadd.h vr0, vr0, vr1
vssrarni.bu.h vr1, vr0, bpc_sh
vstelm.w vr1, a0, 0, 0
add.d a0, a0, a1
vstelm.w vr1, a0, 0, 1
addi.w a5, a5, -2
addi.d a2, a2, 16
addi.d a3, a3, 16
add.d a0, a0, a1
blt zero, a5, .AVG_W4_LASX
b .AVG_END_LASX
.AVG_W8_LASX:
xvld xr0, a2, 0
xvld xr1, a3, 0
xvadd.h xr2, xr0, xr1
xvssrarni.bu.h xr1, xr2, bpc_sh
xvstelm.d xr1, a0, 0, 0
add.d a0, a0, a1
xvstelm.d xr1, a0, 0, 2
addi.w a5, a5, -2
addi.d a2, a2, 32
addi.d a3, a3, 32
add.d a0, a1, a0
blt zero, a5, .AVG_W8_LASX
b .AVG_END_LASX
.AVG_W16_LASX:
xvld xr0, a2, 0
xvld xr2, a2, 32
xvld xr1, a3, 0
xvld xr3, a3, 32
xvadd.h xr4, xr0, xr1
xvadd.h xr5, xr2, xr3
xvssrarni.bu.h xr5, xr4, bpc_sh
xvpermi.d xr2, xr5, 0xd8
xvpermi.d xr3, xr5, 0x8d
vst vr2, a0, 0
vstx vr3, a0, a1
addi.w a5, a5, -2
addi.d a2, a2, 64
addi.d a3, a3, 64
alsl.d a0, a1, a0, 1
blt zero, a5, .AVG_W16_LASX
b .AVG_END_LASX
.AVG_W32_LASX:
xvld xr0, a2, 0
xvld xr2, a2, 32
xvld xr1, a3, 0
xvld xr3, a3, 32
xvadd.h xr4, xr0, xr1
xvadd.h xr5, xr2, xr3
xvssrarni.bu.h xr5, xr4, bpc_sh
xvpermi.d xr6, xr5, 0xd8
xvst xr6, a0, 0
addi.w a5, a5, -1
addi.d a2, a2, 64
addi.d a3, a3, 64
add.d a0, a0, a1
blt zero, a5, .AVG_W32_LASX
b .AVG_END_LASX
.AVG_W64_LASX:
xvld xr0, a2, 0
xvld xr2, a2, 32
xvld xr4, a2, 64
xvld xr6, a2, 96
xvld xr1, a3, 0
xvld xr3, a3, 32
xvld xr5, a3, 64
xvld xr7, a3, 96
xvadd.h xr0, xr0, xr1
xvadd.h xr2, xr2, xr3
xvadd.h xr4, xr4, xr5
xvadd.h xr6, xr6, xr7
xvssrarni.bu.h xr2, xr0, bpc_sh
xvssrarni.bu.h xr6, xr4, bpc_sh
xvpermi.d xr1, xr2, 0xd8
xvpermi.d xr3, xr6, 0xd8
xvst xr1, a0, 0
xvst xr3, a0, 32
addi.w a5, a5, -1
addi.d a2, a2, 128
addi.d a3, a3, 128
add.d a0, a0, a1
blt zero, a5, .AVG_W64_LASX
b .AVG_END_LASX
.AVG_W128_LASX:
xvld xr0, a2, 0
xvld xr2, a2, 32
xvld xr4, a2, 64
xvld xr6, a2, 96
xvld xr8, a2, 128
xvld xr10, a2, 160
xvld xr12, a2, 192
xvld xr14, a2, 224
xvld xr1, a3, 0
xvld xr3, a3, 32
xvld xr5, a3, 64
xvld xr7, a3, 96
xvld xr9, a3, 128
xvld xr11, a3, 160
xvld xr13, a3, 192
xvld xr15, a3, 224
xvadd.h xr0, xr0, xr1
xvadd.h xr2, xr2, xr3
xvadd.h xr4, xr4, xr5
xvadd.h xr6, xr6, xr7
xvadd.h xr8, xr8, xr9
xvadd.h xr10, xr10, xr11
xvadd.h xr12, xr12, xr13
xvadd.h xr14, xr14, xr15
xvssrarni.bu.h xr2, xr0, bpc_sh
xvssrarni.bu.h xr6, xr4, bpc_sh
xvssrarni.bu.h xr10, xr8, bpc_sh
xvssrarni.bu.h xr14, xr12, bpc_sh
xvpermi.d xr1, xr2, 0xd8
xvpermi.d xr3, xr6, 0xd8
xvpermi.d xr5, xr10, 0xd8
xvpermi.d xr7, xr14, 0xd8
xvst xr1, a0, 0
xvst xr3, a0, 32
xvst xr5, a0, 64
xvst xr7, a0, 96
addi.w a5, a5, -1
addi.d a2, a2, 256
addi.d a3, a3, 256
add.d a0, a0, a1
blt zero, a5, .AVG_W128_LASX
.AVG_END_LASX:
endfunc
function w_avg_8bpc_lsx
addi.d t8, a0, 0
li.w t2, 16
sub.w t2, t2, a6 // 16 - weight
vreplgr2vr.h vr21, a6
vreplgr2vr.h vr22, t2
clz.w t0, a4
li.w t1, 24
sub.w t0, t0, t1
la.local t1, .W_AVG_LSX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t2, t0, 0
add.d t1, t1, t2
jirl $r0, t1, 0
.align 3
.W_AVG_LSX_JRTABLE:
.hword .W_AVG_W128_LSX - .W_AVG_LSX_JRTABLE
.hword .W_AVG_W64_LSX - .W_AVG_LSX_JRTABLE
.hword .W_AVG_W32_LSX - .W_AVG_LSX_JRTABLE
.hword .W_AVG_W16_LSX - .W_AVG_LSX_JRTABLE
.hword .W_AVG_W8_LSX - .W_AVG_LSX_JRTABLE
.hword .W_AVG_W4_LSX - .W_AVG_LSX_JRTABLE
.W_AVG_W4_LSX:
vld vr0, a2, 0
vld vr1, a3, 0
vmulwev.w.h vr2, vr0, vr21
vmulwod.w.h vr3, vr0, vr21
vmaddwev.w.h vr2, vr1, vr22
vmaddwod.w.h vr3, vr1, vr22
vssrarni.hu.w vr3, vr2, bpcw_sh
vssrlni.bu.h vr1, vr3, 0
vpickod.w vr4, vr2, vr1
vilvl.b vr0, vr4, vr1
fst.s f0, a0, 0
add.d a0, a0, a1
vstelm.w vr0, a0, 0, 1
addi.w a5, a5, -2
addi.d a2, a2, 16
addi.d a3, a3, 16
add.d a0, a1, a0
blt zero, a5, .W_AVG_W4_LSX
b .W_AVG_END_LSX
.W_AVG_W8_LSX:
vld vr0, a2, 0
vld vr1, a3, 0
vmulwev.w.h vr2, vr0, vr21
vmulwod.w.h vr3, vr0, vr21
vmaddwev.w.h vr2, vr1, vr22
vmaddwod.w.h vr3, vr1, vr22
vssrarni.hu.w vr3, vr2, bpcw_sh
vssrlni.bu.h vr1, vr3, 0
vpickod.w vr4, vr2, vr1
vilvl.b vr0, vr4, vr1
fst.d f0, a0, 0
addi.w a5, a5, -1
addi.d a2, a2, 16
addi.d a3, a3, 16
add.d a0, a0, a1
blt zero, a5, .W_AVG_W8_LSX
b .W_AVG_END_LSX
.W_AVG_W16_LSX:
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vmulwev.w.h vr4, vr0, vr21
vmulwod.w.h vr5, vr0, vr21
vmulwev.w.h vr6, vr2, vr21
vmulwod.w.h vr7, vr2, vr21
vmaddwev.w.h vr4, vr1, vr22
vmaddwod.w.h vr5, vr1, vr22
vmaddwev.w.h vr6, vr3, vr22
vmaddwod.w.h vr7, vr3, vr22
vssrarni.hu.w vr6, vr4, bpcw_sh
vssrarni.hu.w vr7, vr5, bpcw_sh
vssrlrni.bu.h vr7, vr6, 0
vshuf4i.w vr8, vr7, 0x4E
vilvl.b vr0, vr8, vr7
vst vr0, a0, 0
addi.w a5, a5, -1
addi.d a2, a2, 32
addi.d a3, a3, 32
add.d a0, a0, a1
blt zero, a5, .W_AVG_W16_LSX
b .W_AVG_END_LSX
.W_AVG_W32_LSX:
.rept 2
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vmulwev.w.h vr4, vr0, vr21
vmulwod.w.h vr5, vr0, vr21
vmulwev.w.h vr6, vr2, vr21
vmulwod.w.h vr7, vr2, vr21
vmaddwev.w.h vr4, vr1, vr22
vmaddwod.w.h vr5, vr1, vr22
vmaddwev.w.h vr6, vr3, vr22
vmaddwod.w.h vr7, vr3, vr22
vssrarni.hu.w vr6, vr4, bpcw_sh
vssrarni.hu.w vr7, vr5, bpcw_sh
vssrlrni.bu.h vr7, vr6, 0
vshuf4i.w vr8, vr7, 0x4E
vilvl.b vr0, vr8, vr7
vst vr0, a0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a0, a0, 16
.endr
addi.w a5, a5, -1
add.d t8, t8, a1
add.d a0, t8, zero
blt zero, a5, .W_AVG_W32_LSX
b .W_AVG_END_LSX
.W_AVG_W64_LSX:
.rept 4
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vmulwev.w.h vr4, vr0, vr21
vmulwod.w.h vr5, vr0, vr21
vmulwev.w.h vr6, vr2, vr21
vmulwod.w.h vr7, vr2, vr21
vmaddwev.w.h vr4, vr1, vr22
vmaddwod.w.h vr5, vr1, vr22
vmaddwev.w.h vr6, vr3, vr22
vmaddwod.w.h vr7, vr3, vr22
vssrarni.hu.w vr6, vr4, bpcw_sh
vssrarni.hu.w vr7, vr5, bpcw_sh
vssrlrni.bu.h vr7, vr6, 0
vshuf4i.w vr8, vr7, 0x4E
vilvl.b vr0, vr8, vr7
vst vr0, a0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a0, a0, 16
.endr
addi.w a5, a5, -1
add.d t8, t8, a1
add.d a0, t8, zero
blt zero, a5, .W_AVG_W64_LSX
b .W_AVG_END_LSX
.W_AVG_W128_LSX:
.rept 8
vld vr0, a2, 0
vld vr2, a2, 16
vld vr1, a3, 0
vld vr3, a3, 16
vmulwev.w.h vr4, vr0, vr21
vmulwod.w.h vr5, vr0, vr21
vmulwev.w.h vr6, vr2, vr21
vmulwod.w.h vr7, vr2, vr21
vmaddwev.w.h vr4, vr1, vr22
vmaddwod.w.h vr5, vr1, vr22
vmaddwev.w.h vr6, vr3, vr22
vmaddwod.w.h vr7, vr3, vr22
vssrarni.hu.w vr6, vr4, bpcw_sh
vssrarni.hu.w vr7, vr5, bpcw_sh
vssrlrni.bu.h vr7, vr6, 0
vshuf4i.w vr8, vr7, 0x4E
vilvl.b vr0, vr8, vr7
vst vr0, a0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a0, a0, 16
.endr
addi.w a5, a5, -1
add.d t8, t8, a1
add.d a0, t8, zero
blt zero, a5, .W_AVG_W128_LSX
.W_AVG_END_LSX:
endfunc
function w_avg_8bpc_lasx
addi.d t8, a0, 0
li.w t2, 16
sub.w t2, t2, a6 // 16 - weight
xvreplgr2vr.h xr21, a6
xvreplgr2vr.h xr22, t2
clz.w t0, a4
li.w t1, 24
sub.w t0, t0, t1
la.local t1, .W_AVG_LASX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t2, t0, 0
add.d t1, t1, t2
jirl $r0, t1, 0
.align 3
.W_AVG_LASX_JRTABLE:
.hword .W_AVG_W128_LASX - .W_AVG_LASX_JRTABLE
.hword .W_AVG_W64_LASX - .W_AVG_LASX_JRTABLE
.hword .W_AVG_W32_LASX - .W_AVG_LASX_JRTABLE
.hword .W_AVG_W16_LASX - .W_AVG_LASX_JRTABLE
.hword .W_AVG_W8_LASX - .W_AVG_LASX_JRTABLE
.hword .W_AVG_W4_LASX - .W_AVG_LASX_JRTABLE
.W_AVG_W4_LASX:
vld vr0, a2, 0
vld vr1, a3, 0
xvpermi.d xr2, xr0, 0xD8
xvpermi.d xr3, xr1, 0xD8
xvilvl.h xr4, xr3, xr2
xvmulwev.w.h xr0, xr4, xr21
xvmaddwod.w.h xr0, xr4, xr22
xvssrarni.hu.w xr1, xr0, bpcw_sh
xvssrlni.bu.h xr0, xr1, 0
fst.s f0, a0, 0
add.d a0, a0, a1
xvstelm.w xr0, a0, 0, 4
addi.w a5, a5, -2
addi.d a2, a2, 16
addi.d a3, a3, 16
add.d a0, a1, a0
blt zero, a5, .W_AVG_W4_LASX
b .W_AVG_END_LASX
.W_AVG_W8_LASX:
xvld xr0, a2, 0
xvld xr1, a3, 0
xvmulwev.w.h xr2, xr0, xr21
xvmulwod.w.h xr3, xr0, xr21
xvmaddwev.w.h xr2, xr1, xr22
xvmaddwod.w.h xr3, xr1, xr22
xvssrarni.hu.w xr3, xr2, bpcw_sh
xvssrlni.bu.h xr1, xr3, 0
xvpickod.w xr4, xr2, xr1
xvilvl.b xr0, xr4, xr1
xvstelm.d xr0, a0, 0, 0
add.d a0, a0, a1
xvstelm.d xr0, a0, 0, 2
addi.w a5, a5, -2
addi.d a2, a2, 32
addi.d a3, a3, 32
add.d a0, a0, a1
blt zero, a5, .W_AVG_W8_LASX
b .W_AVG_END_LASX
.W_AVG_W16_LASX:
xvld xr0, a2, 0
xvld xr1, a3, 0
xvmulwev.w.h xr2, xr0, xr21
xvmulwod.w.h xr3, xr0, xr21
xvmaddwev.w.h xr2, xr1, xr22
xvmaddwod.w.h xr3, xr1, xr22
xvssrarni.hu.w xr3, xr2, bpcw_sh
xvssrlni.bu.h xr1, xr3, 0
xvpickod.w xr4, xr2, xr1
xvilvl.b xr0, xr4, xr1
xvpermi.d xr1, xr0, 0xD8
vst vr1, a0, 0
addi.w a5, a5, -1
addi.d a2, a2, 32
addi.d a3, a3, 32
add.d a0, a0, a1
blt zero, a5, .W_AVG_W16_LASX
b .W_AVG_END_LASX
.W_AVG_W32_LASX:
xvld xr0, a2, 0
xvld xr2, a2, 32
xvld xr1, a3, 0
xvld xr3, a3, 32
xvmulwev.w.h xr4, xr0, xr21
xvmulwod.w.h xr5, xr0, xr21
xvmulwev.w.h xr6, xr2, xr21
xvmulwod.w.h xr7, xr2, xr21
xvmaddwev.w.h xr4, xr1, xr22
xvmaddwod.w.h xr5, xr1, xr22
xvmaddwev.w.h xr6, xr3, xr22
xvmaddwod.w.h xr7, xr3, xr22
xvssrarni.hu.w xr6, xr4, bpcw_sh
xvssrarni.hu.w xr7, xr5, bpcw_sh
xvssrlni.bu.h xr7, xr6, 0
xvshuf4i.w xr8, xr7, 0x4E
xvilvl.b xr9, xr8, xr7
xvpermi.d xr0, xr9, 0xD8
xvst xr0, a0, 0
addi.w a5, a5, -1
addi.d a2, a2, 64
addi.d a3, a3, 64
add.d a0, a0, a1
blt zero, a5, .W_AVG_W32_LASX
b .W_AVG_END_LASX
.W_AVG_W64_LASX:
.rept 2
xvld xr0, a2, 0
xvld xr2, a2, 32
xvld xr1, a3, 0
xvld xr3, a3, 32
xvmulwev.w.h xr4, xr0, xr21
xvmulwod.w.h xr5, xr0, xr21
xvmulwev.w.h xr6, xr2, xr21
xvmulwod.w.h xr7, xr2, xr21
xvmaddwev.w.h xr4, xr1, xr22
xvmaddwod.w.h xr5, xr1, xr22
xvmaddwev.w.h xr6, xr3, xr22
xvmaddwod.w.h xr7, xr3, xr22
xvssrarni.hu.w xr6, xr4, bpcw_sh
xvssrarni.hu.w xr7, xr5, bpcw_sh
xvssrlni.bu.h xr7, xr6, 0
xvshuf4i.w xr8, xr7, 0x4E
xvilvl.b xr9, xr8, xr7
xvpermi.d xr0, xr9, 0xD8
xvst xr0, a0, 0
addi.d a2, a2, 64
addi.d a3, a3, 64
addi.d a0, a0, 32
.endr
addi.w a5, a5, -1
add.d t8, t8, a1
add.d a0, t8, zero
blt zero, a5, .W_AVG_W64_LASX
b .W_AVG_END_LASX
.W_AVG_W128_LASX:
.rept 4
xvld xr0, a2, 0
xvld xr2, a2, 32
xvld xr1, a3, 0
xvld xr3, a3, 32
xvmulwev.w.h xr4, xr0, xr21
xvmulwod.w.h xr5, xr0, xr21
xvmulwev.w.h xr6, xr2, xr21
xvmulwod.w.h xr7, xr2, xr21
xvmaddwev.w.h xr4, xr1, xr22
xvmaddwod.w.h xr5, xr1, xr22
xvmaddwev.w.h xr6, xr3, xr22
xvmaddwod.w.h xr7, xr3, xr22
xvssrarni.hu.w xr6, xr4, bpcw_sh
xvssrarni.hu.w xr7, xr5, bpcw_sh
xvssrlni.bu.h xr7, xr6, 0
xvshuf4i.w xr8, xr7, 0x4E
xvilvl.b xr9, xr8, xr7
xvpermi.d xr0, xr9, 0xD8
xvst xr0, a0, 0
addi.d a2, a2, 64
addi.d a3, a3, 64
addi.d a0, a0, 32
.endr
addi.w a5, a5, -1
add.d t8, t8, a1
add.d a0, t8, zero
blt zero, a5, .W_AVG_W128_LASX
.W_AVG_END_LASX:
endfunc
#undef bpc_sh
#undef bpcw_sh
#define mask_sh 10
/*
static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
const uint8_t *mask HIGHBD_DECL_SUFFIX)
*/
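/*
A minimal C sketch of the 8 bpc blend computed below (mask_sh = 10; the +512
rounding is what the rounding shift in vssrarni adds; per-pixel mask weights
are 0..64, with the constant 64 kept in vr21/xr21, and clip_u8 as in the avg
sketch above):

    for (int x = 0; x < w; x++) {
        const int m = mask[x];
        dst[x] = clip_u8((tmp1[x] * m + tmp2[x] * (64 - m) + 512) >> 10);
    }
*/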
function mask_8bpc_lsx
vldi vr21, 0x440 // 64
vxor.v vr19, vr19, vr19
addi.d t8, a0, 0
clz.w t0, a4
li.w t1, 24
sub.w t0, t0, t1
la.local t1, .MASK_LSX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t2, t0, 0
add.d t1, t1, t2
jirl $r0, t1, 0
.align 3
.MASK_LSX_JRTABLE:
.hword .MASK_W128_LSX - .MASK_LSX_JRTABLE
.hword .MASK_W64_LSX - .MASK_LSX_JRTABLE
.hword .MASK_W32_LSX - .MASK_LSX_JRTABLE
.hword .MASK_W16_LSX - .MASK_LSX_JRTABLE
.hword .MASK_W8_LSX - .MASK_LSX_JRTABLE
.hword .MASK_W4_LSX - .MASK_LSX_JRTABLE
.MASK_W4_LSX:
vld vr0, a2, 0
vld vr1, a3, 0
fld.d f22, a6, 0
vilvl.b vr2, vr19, vr22
vsub.h vr3, vr21, vr2
vmulwev.w.h vr4, vr0, vr2
vmulwod.w.h vr5, vr0, vr2
vmaddwev.w.h vr4, vr1, vr3
vmaddwod.w.h vr5, vr1, vr3
vssrarni.hu.w vr5, vr4, mask_sh
vssrlrni.bu.h vr1, vr5, 0
vpickod.w vr4, vr2, vr1
vilvl.b vr0, vr4, vr1
fst.s f0, a0, 0
add.d a0, a0, a1
vstelm.w vr0, a0, 0, 1
addi.d a2, a2, 16
addi.d a3, a3, 16
addi.d a6, a6, 8
add.d a0, a0, a1
addi.w a5, a5, -2
blt zero, a5, .MASK_W4_LSX
b .MASK_END_LSX
.MASK_W8_LSX:
vld vr0, a2, 0
vld vr10, a2, 16
vld vr1, a3, 0
vld vr11, a3, 16
vld vr22, a6, 0
vilvl.b vr2, vr19, vr22
vilvh.b vr12, vr19, vr22
vsub.h vr3, vr21, vr2
vsub.h vr13, vr21, vr12
vmulwev.w.h vr4, vr0, vr2
vmulwod.w.h vr5, vr0, vr2
vmulwev.w.h vr14, vr10, vr12
vmulwod.w.h vr15, vr10, vr12
vmaddwev.w.h vr4, vr1, vr3
vmaddwod.w.h vr5, vr1, vr3
vmaddwev.w.h vr14, vr11, vr13
vmaddwod.w.h vr15, vr11, vr13
vssrarni.hu.w vr14, vr4, mask_sh
vssrarni.hu.w vr15, vr5, mask_sh
vssrlrni.bu.h vr15, vr14, 0
vshuf4i.w vr6, vr15, 0x4E
vilvl.b vr0, vr6, vr15
fst.d f0, a0, 0
add.d a0, a0, a1
vstelm.d vr0, a0, 0, 1
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a6, a6, 16
add.d a0, a0, a1
addi.w a5, a5, -2
blt zero, a5, .MASK_W8_LSX
b .MASK_END_LSX
.MASK_W16_LSX:
vld vr0, a2, 0
vld vr10, a2, 16
vld vr1, a3, 0
vld vr11, a3, 16
vld vr22, a6, 0
vilvl.b vr2, vr19, vr22
vilvh.b vr12, vr19, vr22
vsub.h vr3, vr21, vr2
vsub.h vr13, vr21, vr12
vmulwev.w.h vr4, vr0, vr2
vmulwod.w.h vr5, vr0, vr2
vmulwev.w.h vr14, vr10, vr12
vmulwod.w.h vr15, vr10, vr12
vmaddwev.w.h vr4, vr1, vr3
vmaddwod.w.h vr5, vr1, vr3
vmaddwev.w.h vr14, vr11, vr13
vmaddwod.w.h vr15, vr11, vr13
vssrarni.hu.w vr14, vr4, mask_sh
vssrarni.hu.w vr15, vr5, mask_sh
vssrlrni.bu.h vr15, vr14, 0
vshuf4i.w vr6, vr15, 0x4E
vilvl.b vr0, vr6, vr15
vst vr0, a0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a6, a6, 16
add.d a0, a0, a1
addi.w a5, a5, -1
blt zero, a5, .MASK_W16_LSX
b .MASK_END_LSX
.MASK_W32_LSX:
.rept 2
vld vr0, a2, 0
vld vr10, a2, 16
vld vr1, a3, 0
vld vr11, a3, 16
vld vr22, a6, 0
vilvl.b vr2, vr19, vr22
vilvh.b vr12, vr19, vr22
vsub.h vr3, vr21, vr2
vsub.h vr13, vr21, vr12
vmulwev.w.h vr4, vr0, vr2
vmulwod.w.h vr5, vr0, vr2
vmulwev.w.h vr14, vr10, vr12
vmulwod.w.h vr15, vr10, vr12
vmaddwev.w.h vr4, vr1, vr3
vmaddwod.w.h vr5, vr1, vr3
vmaddwev.w.h vr14, vr11, vr13
vmaddwod.w.h vr15, vr11, vr13
vssrarni.hu.w vr14, vr4, mask_sh
vssrarni.hu.w vr15, vr5, mask_sh
vssrlrni.bu.h vr15, vr14, 0
vshuf4i.w vr6, vr15, 0x4E
vilvl.b vr0, vr6, vr15
vst vr0, a0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a6, a6, 16
addi.d a0, a0, 16
.endr
add.d t8, t8, a1
add.d a0, t8, zero
addi.w a5, a5, -1
blt zero, a5, .MASK_W32_LSX
b .MASK_END_LSX
.MASK_W64_LSX:
.rept 4
vld vr0, a2, 0
vld vr10, a2, 16
vld vr1, a3, 0
vld vr11, a3, 16
vld vr22, a6, 0
vilvl.b vr2, vr19, vr22
vilvh.b vr12, vr19, vr22
vsub.h vr3, vr21, vr2
vsub.h vr13, vr21, vr12
vmulwev.w.h vr4, vr0, vr2
vmulwod.w.h vr5, vr0, vr2
vmulwev.w.h vr14, vr10, vr12
vmulwod.w.h vr15, vr10, vr12
vmaddwev.w.h vr4, vr1, vr3
vmaddwod.w.h vr5, vr1, vr3
vmaddwev.w.h vr14, vr11, vr13
vmaddwod.w.h vr15, vr11, vr13
vssrarni.hu.w vr14, vr4, mask_sh
vssrarni.hu.w vr15, vr5, mask_sh
vssrlrni.bu.h vr15, vr14, 0
vshuf4i.w vr6, vr15, 0x4E
vilvl.b vr0, vr6, vr15
vst vr0, a0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a6, a6, 16
addi.d a0, a0, 16
.endr
add.d t8, t8, a1
add.d a0, t8, zero
addi.w a5, a5, -1
blt zero, a5, .MASK_W64_LSX
b .MASK_END_LSX
.MASK_W128_LSX:
.rept 8
vld vr0, a2, 0
vld vr10, a2, 16
vld vr1, a3, 0
vld vr11, a3, 16
vld vr22, a6, 0
vilvl.b vr2, vr19, vr22
vilvh.b vr12, vr19, vr22
vsub.h vr3, vr21, vr2
vsub.h vr13, vr21, vr12
vmulwev.w.h vr4, vr0, vr2
vmulwod.w.h vr5, vr0, vr2
vmulwev.w.h vr14, vr10, vr12
vmulwod.w.h vr15, vr10, vr12
vmaddwev.w.h vr4, vr1, vr3
vmaddwod.w.h vr5, vr1, vr3
vmaddwev.w.h vr14, vr11, vr13
vmaddwod.w.h vr15, vr11, vr13
vssrarni.hu.w vr14, vr4, mask_sh
vssrarni.hu.w vr15, vr5, mask_sh
vssrlrni.bu.h vr15, vr14, 0
vshuf4i.w vr6, vr15, 0x4E
vilvl.b vr0, vr6, vr15
vst vr0, a0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a6, a6, 16
addi.d a0, a0, 16
.endr
add.d t8, t8, a1
add.d a0, t8, zero
addi.w a5, a5, -1
blt zero, a5, .MASK_W128_LSX
.MASK_END_LSX:
endfunc
function mask_8bpc_lasx
xvldi xr21, 0x440 // 64
xvxor.v xr19, xr19, xr19
addi.d t8, a0, 0
clz.w t0, a4
li.w t1, 24
sub.w t0, t0, t1
la.local t1, .MASK_LASX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t2, t0, 0
add.d t1, t1, t2
jirl $r0, t1, 0
.align 3
.MASK_LASX_JRTABLE:
.hword .MASK_W128_LASX - .MASK_LASX_JRTABLE
.hword .MASK_W64_LASX - .MASK_LASX_JRTABLE
.hword .MASK_W32_LASX - .MASK_LASX_JRTABLE
.hword .MASK_W16_LASX - .MASK_LASX_JRTABLE
.hword .MASK_W8_LASX - .MASK_LASX_JRTABLE
.hword .MASK_W4_LASX - .MASK_LASX_JRTABLE
.MASK_W4_LASX:
vld vr0, a2, 0
vld vr1, a3, 0
fld.d f22, a6, 0
vilvl.h vr4, vr1, vr0
vilvh.h vr14, vr1, vr0
vilvl.b vr2, vr19, vr22
vsub.h vr3, vr21, vr2
xvpermi.q xr14, xr4, 0x20
vilvl.h vr5, vr3, vr2
vilvh.h vr15, vr3, vr2
xvpermi.q xr15, xr5, 0x20
xvmulwev.w.h xr0, xr14, xr15
xvmaddwod.w.h xr0, xr14, xr15
xvssrarni.hu.w xr1, xr0, mask_sh
xvssrlni.bu.h xr2, xr1, 0
fst.s f2, a0, 0
add.d a0, a0, a1
xvstelm.w xr2, a0, 0, 4
addi.d a2, a2, 16
addi.d a3, a3, 16
addi.d a6, a6, 8
add.d a0, a0, a1
addi.w a5, a5, -2
blt zero, a5, .MASK_W4_LASX
b .MASK_END_LASX
.MASK_W8_LASX:
xvld xr0, a2, 0
xvld xr1, a3, 0
vld vr22, a6, 0
vext2xv.hu.bu xr2, xr22
xvsub.h xr3, xr21, xr2
xvmulwev.w.h xr4, xr0, xr2
xvmulwod.w.h xr5, xr0, xr2
xvmaddwev.w.h xr4, xr1, xr3
xvmaddwod.w.h xr5, xr1, xr3
xvssrarni.hu.w xr5, xr4, mask_sh
xvssrlni.bu.h xr1, xr5, 0
xvpickod.w xr4, xr2, xr1
xvilvl.b xr0, xr4, xr1
fst.d f0, a0, 0
add.d a0, a0, a1
xvstelm.d xr0, a0, 0, 2
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a6, a6, 16
add.d a0, a0, a1
addi.w a5, a5, -2
blt zero, a5, .MASK_W8_LASX
b .MASK_END_LASX
.MASK_W16_LASX:
xvld xr0, a2, 0
xvld xr1, a3, 0
vld vr22, a6, 0
vext2xv.hu.bu xr2, xr22
xvsub.h xr3, xr21, xr2
xvmulwev.w.h xr4, xr0, xr2
xvmulwod.w.h xr5, xr0, xr2
xvmaddwev.w.h xr4, xr1, xr3
xvmaddwod.w.h xr5, xr1, xr3
xvssrarni.hu.w xr5, xr4, mask_sh
xvssrlni.bu.h xr1, xr5, 0
xvpickod.w xr4, xr2, xr1
xvilvl.b xr0, xr4, xr1
xvpermi.d xr1, xr0, 0xD8
vst vr1, a0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a6, a6, 16
add.d a0, a0, a1
addi.w a5, a5, -1
blt zero, a5, .MASK_W16_LASX
b .MASK_END_LASX
.MASK_W32_LASX:
xvld xr0, a2, 0
xvld xr10, a2, 32
xvld xr1, a3, 0
xvld xr11, a3, 32
xvld xr22, a6, 0
vext2xv.hu.bu xr2, xr22
xvpermi.q xr4, xr22, 0x01
vext2xv.hu.bu xr12, xr4
xvsub.h xr3, xr21, xr2
xvsub.h xr13, xr21, xr12
xvmulwev.w.h xr4, xr0, xr2
xvmulwod.w.h xr5, xr0, xr2
xvmulwev.w.h xr14, xr10, xr12
xvmulwod.w.h xr15, xr10, xr12
xvmaddwev.w.h xr4, xr1, xr3
xvmaddwod.w.h xr5, xr1, xr3
xvmaddwev.w.h xr14, xr11, xr13
xvmaddwod.w.h xr15, xr11, xr13
xvssrarni.hu.w xr14, xr4, mask_sh
xvssrarni.hu.w xr15, xr5, mask_sh
xvssrlni.bu.h xr15, xr14, 0
xvshuf4i.w xr6, xr15, 0x4E
xvilvl.b xr1, xr6, xr15
xvpermi.d xr0, xr1, 0xD8
xvst xr0, a0, 0
addi.d a2, a2, 64
addi.d a3, a3, 64
addi.d a6, a6, 32
add.d a0, a0, a1
addi.w a5, a5, -1
blt zero, a5, .MASK_W32_LASX
b .MASK_END_LASX
.MASK_W64_LASX:
.rept 2
xvld xr0, a2, 0
xvld xr10, a2, 32
xvld xr1, a3, 0
xvld xr11, a3, 32
xvld xr22, a6, 0
vext2xv.hu.bu xr2, xr22
xvpermi.q xr4, xr22, 0x01
vext2xv.hu.bu xr12, xr4
xvsub.h xr3, xr21, xr2
xvsub.h xr13, xr21, xr12
xvmulwev.w.h xr4, xr0, xr2
xvmulwod.w.h xr5, xr0, xr2
xvmulwev.w.h xr14, xr10, xr12
xvmulwod.w.h xr15, xr10, xr12
xvmaddwev.w.h xr4, xr1, xr3
xvmaddwod.w.h xr5, xr1, xr3
xvmaddwev.w.h xr14, xr11, xr13
xvmaddwod.w.h xr15, xr11, xr13
xvssrarni.hu.w xr14, xr4, mask_sh
xvssrarni.hu.w xr15, xr5, mask_sh
xvssrlni.bu.h xr15, xr14, 0
xvshuf4i.w xr6, xr15, 0x4E
xvilvl.b xr1, xr6, xr15
xvpermi.d xr0, xr1, 0xD8
xvst xr0, a0, 0
addi.d a2, a2, 64
addi.d a3, a3, 64
addi.d a6, a6, 32
addi.d a0, a0, 32
.endr
add.d t8, t8, a1
add.d a0, t8, zero
addi.w a5, a5, -1
blt zero, a5, .MASK_W64_LASX
b .MASK_END_LASX
.MASK_W128_LASX:
.rept 4
xvld xr0, a2, 0
xvld xr10, a2, 32
xvld xr1, a3, 0
xvld xr11, a3, 32
xvld xr22, a6, 0
vext2xv.hu.bu xr2, xr22
xvpermi.q xr4, xr22, 0x01
vext2xv.hu.bu xr12, xr4
xvsub.h xr3, xr21, xr2
xvsub.h xr13, xr21, xr12
xvmulwev.w.h xr4, xr0, xr2
xvmulwod.w.h xr5, xr0, xr2
xvmulwev.w.h xr14, xr10, xr12
xvmulwod.w.h xr15, xr10, xr12
xvmaddwev.w.h xr4, xr1, xr3
xvmaddwod.w.h xr5, xr1, xr3
xvmaddwev.w.h xr14, xr11, xr13
xvmaddwod.w.h xr15, xr11, xr13
xvssrarni.hu.w xr14, xr4, mask_sh
xvssrarni.hu.w xr15, xr5, mask_sh
xvssrlni.bu.h xr15, xr14, 0
xvshuf4i.w xr6, xr15, 0x4E
xvilvl.b xr1, xr6, xr15
xvpermi.d xr0, xr1, 0xD8
xvst xr0, a0, 0
addi.d a2, a2, 64
addi.d a3, a3, 64
addi.d a6, a6, 32
addi.d a0, a0, 32
.endr
add.d t8, t8, a1
add.d a0, t8, zero
addi.w a5, a5, -1
blt zero, a5, .MASK_W128_LASX
.MASK_END_LASX:
endfunc
/*
static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
uint8_t *mask, const int sign,
const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX)
*/
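/*
Only the 420 case (ss_hor = ss_ver = 1) is implemented here. A rough 8 bpc
sketch of the per-pixel math carried out below; the constants 38, 64 and 8 and
the shifts 8 and 10 correspond to vr22, vr20, the vaddi/vsrli immediates and
the vssrarni shift, and "sign" is the a7 argument:

    // derive the blend weight from the difference of the two intermediate
    // buffers, then blend exactly as in mask_c
    m      = min(38 + ((abs(tmp1[x] - tmp2[x]) + 8) >> 8), 64);
    dst[x] = clip_u8((tmp1[x] * m + tmp2[x] * (64 - m) + 512) >> 10);
    // one mask byte per 2x2 luma block (420 subsampling)
    mask[i] = (m00 + m01 + m10 + m11 + 2 - sign) >> 2;
*/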
function w_mask_420_8bpc_lsx
addi.d sp, sp, -24
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
vldi vr20, 0x440
vreplgr2vr.h vr21, a7
vldi vr22, 0x426
clz.w t0, a4
li.w t1, 24
sub.w t0, t0, t1
la.local t1, .WMASK420_LSX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t8, t0, 0
add.d t1, t1, t8
jirl $r0, t1, 0
.align 3
.WMASK420_LSX_JRTABLE:
.hword .WMASK420_W128_LSX - .WMASK420_LSX_JRTABLE
.hword .WMASK420_W64_LSX - .WMASK420_LSX_JRTABLE
.hword .WMASK420_W32_LSX - .WMASK420_LSX_JRTABLE
.hword .WMASK420_W16_LSX - .WMASK420_LSX_JRTABLE
.hword .WMASK420_W8_LSX - .WMASK420_LSX_JRTABLE
.hword .WMASK420_W4_LSX - .WMASK420_LSX_JRTABLE
.WMASK420_W4_LSX:
vld vr0, a2, 0
vld vr1, a2, 16
vld vr2, a3, 0
vld vr3, a3, 16
addi.w a5, a5, -4
vabsd.h vr4, vr0, vr2
vabsd.h vr5, vr1, vr3
vaddi.hu vr4, vr4, 8
vaddi.hu vr5, vr5, 8
vsrli.h vr4, vr4, 8
vsrli.h vr5, vr5, 8
vadd.h vr4, vr4, vr22
vadd.h vr5, vr5, vr22
vmin.hu vr6, vr4, vr20
vmin.hu vr7, vr5, vr20
vsub.h vr8, vr20, vr6
vsub.h vr9, vr20, vr7
vmulwev.w.h vr4, vr6, vr0
vmulwod.w.h vr5, vr6, vr0
vmulwev.w.h vr10, vr7, vr1
vmulwod.w.h vr11, vr7, vr1
vmaddwev.w.h vr4, vr8, vr2
vmaddwod.w.h vr5, vr8, vr2
vmaddwev.w.h vr10, vr9, vr3
vmaddwod.w.h vr11, vr9, vr3
vilvl.w vr0, vr5, vr4
vilvh.w vr1, vr5, vr4
vilvl.w vr2, vr11, vr10
vilvh.w vr3, vr11, vr10
vssrarni.hu.w vr1, vr0, 10
vssrarni.hu.w vr3, vr2, 10
vssrlni.bu.h vr3, vr1, 0
vstelm.w vr3, a0, 0, 0
add.d a0, a0, a1
vstelm.w vr3, a0, 0, 1
add.d a0, a0, a1
vstelm.w vr3, a0, 0, 2
add.d a0, a0, a1
vstelm.w vr3, a0, 0, 3
add.d a0, a0, a1
vpickev.h vr0, vr7, vr6
vpickod.h vr1, vr7, vr6
vadd.h vr0, vr0, vr1
vshuf4i.h vr0, vr0, 0xd8
vhaddw.w.h vr2, vr0, vr0
vpickev.h vr2, vr2, vr2
vsub.h vr2, vr2, vr21
vaddi.hu vr2, vr2, 2
vssrani.bu.h vr2, vr2, 2
vstelm.w vr2, a6, 0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a6, a6, 4
blt zero, a5, .WMASK420_W4_LSX
b .END_W420
.WMASK420_W8_LSX:
vld vr0, a2, 0
vld vr1, a2, 16
vld vr2, a3, 0
vld vr3, a3, 16
addi.w a5, a5, -2
vabsd.h vr4, vr0, vr2
vabsd.h vr5, vr1, vr3
vaddi.hu vr4, vr4, 8
vaddi.hu vr5, vr5, 8
vsrli.h vr4, vr4, 8
vsrli.h vr5, vr5, 8
vadd.h vr4, vr4, vr22
vadd.h vr5, vr5, vr22
vmin.hu vr6, vr4, vr20
vmin.hu vr7, vr5, vr20
vsub.h vr8, vr20, vr6
vsub.h vr9, vr20, vr7
vmulwev.w.h vr4, vr6, vr0
vmulwod.w.h vr5, vr6, vr0
vmulwev.w.h vr10, vr7, vr1
vmulwod.w.h vr11, vr7, vr1
vmaddwev.w.h vr4, vr8, vr2
vmaddwod.w.h vr5, vr8, vr2
vmaddwev.w.h vr10, vr9, vr3
vmaddwod.w.h vr11, vr9, vr3
vssrarni.hu.w vr10, vr4, 10
vssrarni.hu.w vr11, vr5, 10
vssrlni.bu.h vr11, vr10, 0
vshuf4i.w vr0, vr11, 0x4E
vilvl.b vr3, vr0, vr11
vstelm.d vr3, a0, 0, 0
add.d a0, a0, a1
vstelm.d vr3, a0, 0, 1
add.d a0, a0, a1
vpickev.h vr0, vr7, vr6
vpickod.h vr1, vr7, vr6
vadd.h vr0, vr0, vr1
vilvh.d vr2, vr0, vr0
vadd.h vr2, vr2, vr0
vsub.h vr2, vr2, vr21
vaddi.hu vr2, vr2, 2
vssrani.bu.h vr2, vr2, 2
vstelm.w vr2, a6, 0, 0
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a6, a6, 4
blt zero, a5, .WMASK420_W8_LSX
b .END_W420
.WMASK420_W16_LSX:
vld vr0, a2, 0
vld vr1, a2, 16
alsl.d a2, a4, a2, 1
vld vr2, a2, 0
vld vr3, a2, 16
vld vr4, a3, 0
vld vr5, a3, 16
alsl.d a3, a4, a3, 1
vld vr6, a3, 0
vld vr7, a3, 16
vabsd.h vr8, vr0, vr4
vabsd.h vr9, vr1, vr5
vabsd.h vr10, vr2, vr6
vabsd.h vr11, vr3, vr7
vaddi.hu vr8, vr8, 8
vaddi.hu vr9, vr9, 8
vaddi.hu vr10, vr10, 8
vaddi.hu vr11, vr11, 8
vsrli.h vr8, vr8, 8
vsrli.h vr9, vr9, 8
vsrli.h vr10, vr10, 8
vsrli.h vr11, vr11, 8
vadd.h vr8, vr8, vr22
vadd.h vr9, vr9, vr22
vadd.h vr10, vr10, vr22
vadd.h vr11, vr11, vr22
vmin.hu vr12, vr8, vr20
vmin.hu vr13, vr9, vr20
vmin.hu vr14, vr10, vr20
vmin.hu vr15, vr11, vr20
vsub.h vr16, vr20, vr12
vsub.h vr17, vr20, vr13
vsub.h vr18, vr20, vr14
vsub.h vr19, vr20, vr15
vmulwev.w.h vr8, vr12, vr0
vmulwod.w.h vr9, vr12, vr0
vmulwev.w.h vr10, vr13, vr1
vmulwod.w.h vr11, vr13, vr1
vmulwev.w.h vr23, vr14, vr2
vmulwod.w.h vr24, vr14, vr2
vmulwev.w.h vr25, vr15, vr3
vmulwod.w.h vr26, vr15, vr3
vmaddwev.w.h vr8, vr16, vr4
vmaddwod.w.h vr9, vr16, vr4
vmaddwev.w.h vr10, vr17, vr5
vmaddwod.w.h vr11, vr17, vr5
vmaddwev.w.h vr23, vr18, vr6
vmaddwod.w.h vr24, vr18, vr6
vmaddwev.w.h vr25, vr19, vr7
vmaddwod.w.h vr26, vr19, vr7
vssrarni.hu.w vr10, vr8, 10
vssrarni.hu.w vr11, vr9, 10
vssrarni.hu.w vr25, vr23, 10
vssrarni.hu.w vr26, vr24, 10
vssrlni.bu.h vr11, vr10, 0
vssrlni.bu.h vr26, vr25, 0
vshuf4i.w vr0, vr11, 0x4E
vshuf4i.w vr1, vr26, 0x4E
vilvl.b vr3, vr0, vr11
vilvl.b vr7, vr1, vr26
vst vr3, a0, 0
vstx vr7, a0, a1
vpickev.h vr0, vr13, vr12
vpickod.h vr1, vr13, vr12
vpickev.h vr2, vr15, vr14
vpickod.h vr3, vr15, vr14
vadd.h vr4, vr0, vr1
vadd.h vr5, vr2, vr3
vadd.h vr4, vr4, vr5
vsub.h vr4, vr4, vr21
vssrarni.bu.h vr4, vr4, 2
vstelm.d vr4, a6, 0, 0
alsl.d a2, a4, a2, 1
alsl.d a3, a4, a3, 1
alsl.d a0, a1, a0, 1
addi.d a6, a6, 8
addi.w a5, a5, -2
blt zero, a5, .WMASK420_W16_LSX
b .END_W420
.WMASK420_W32_LSX:
.WMASK420_W64_LSX:
.WMASK420_W128_LSX:
.LOOP_W32_420_LSX:
add.d t1, a2, zero
add.d t2, a3, zero
add.d t3, a0, zero
add.d t4, a6, zero
alsl.d t5, a4, t1, 1
alsl.d t6, a4, t2, 1
or t7, a4, a4
.W32_420_LSX:
vld vr0, t1, 0
vld vr1, t1, 16
vld vr2, t2, 0
vld vr3, t2, 16
vld vr4, t5, 0
vld vr5, t5, 16
vld vr6, t6, 0
vld vr7, t6, 16
addi.d t1, t1, 32
addi.d t2, t2, 32
addi.d t5, t5, 32
addi.d t6, t6, 32
addi.w t7, t7, -16
vabsd.h vr8, vr0, vr2
vabsd.h vr9, vr1, vr3
vabsd.h vr10, vr4, vr6
vabsd.h vr11, vr5, vr7
vaddi.hu vr8, vr8, 8
vaddi.hu vr9, vr9, 8
vaddi.hu vr10, vr10, 8
vaddi.hu vr11, vr11, 8
vsrli.h vr8, vr8, 8
vsrli.h vr9, vr9, 8
vsrli.h vr10, vr10, 8
vsrli.h vr11, vr11, 8
vadd.h vr8, vr8, vr22
vadd.h vr9, vr9, vr22
vadd.h vr10, vr10, vr22
vadd.h vr11, vr11, vr22
vmin.hu vr12, vr8, vr20
vmin.hu vr13, vr9, vr20
vmin.hu vr14, vr10, vr20
vmin.hu vr15, vr11, vr20
vsub.h vr16, vr20, vr12
vsub.h vr17, vr20, vr13
vsub.h vr18, vr20, vr14
vsub.h vr19, vr20, vr15
vmulwev.w.h vr8, vr12, vr0
vmulwod.w.h vr9, vr12, vr0
vmulwev.w.h vr10, vr13, vr1
vmulwod.w.h vr11, vr13, vr1
vmulwev.w.h vr23, vr14, vr4
vmulwod.w.h vr24, vr14, vr4
vmulwev.w.h vr25, vr15, vr5
vmulwod.w.h vr26, vr15, vr5
vmaddwev.w.h vr8, vr16, vr2
vmaddwod.w.h vr9, vr16, vr2
vmaddwev.w.h vr10, vr17, vr3
vmaddwod.w.h vr11, vr17, vr3
vmaddwev.w.h vr23, vr18, vr6
vmaddwod.w.h vr24, vr18, vr6
vmaddwev.w.h vr25, vr19, vr7
vmaddwod.w.h vr26, vr19, vr7
vssrarni.hu.w vr10, vr8, 10
vssrarni.hu.w vr11, vr9, 10
vssrarni.hu.w vr25, vr23, 10
vssrarni.hu.w vr26, vr24, 10
vssrlni.bu.h vr11, vr10, 0
vssrlni.bu.h vr26, vr25, 0
vshuf4i.w vr8, vr11, 0x4E
vshuf4i.w vr9, vr26, 0x4E
vilvl.b vr3, vr8, vr11
vilvl.b vr7, vr9, vr26
vst vr3, t3, 0
vstx vr7, a1, t3
addi.d t3, t3, 16
vpickev.h vr8, vr13, vr12
vpickod.h vr9, vr13, vr12
vpickev.h vr10, vr15, vr14
vpickod.h vr11, vr15, vr14
vadd.h vr8, vr8, vr9
vadd.h vr10, vr10, vr11
vadd.h vr12, vr8, vr10
vsub.h vr12, vr12, vr21
vssrarni.bu.h vr12, vr12, 2
vstelm.d vr12, t4, 0, 0
addi.d t4, t4, 8
bne t7, zero, .W32_420_LSX
alsl.d a2, a4, a2, 2
alsl.d a3, a4, a3, 2
alsl.d a0, a1, a0, 1
srai.w t8, a4, 1
add.d a6, a6, t8
addi.w a5, a5, -2
blt zero, a5, .LOOP_W32_420_LSX
.END_W420:
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
addi.d sp, sp, 24
endfunc
function w_mask_420_8bpc_lasx
xvldi xr20, 0x440
xvreplgr2vr.h xr21, a7
xvldi xr22, 0x426
clz.w t0, a4
li.w t1, 24
sub.w t0, t0, t1
la.local t1, .WMASK420_LASX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t8, t0, 0
add.d t1, t1, t8
jirl $r0, t1, 0
.align 3
.WMASK420_LASX_JRTABLE:
.hword .WMASK420_W128_LASX - .WMASK420_LASX_JRTABLE
.hword .WMASK420_W64_LASX - .WMASK420_LASX_JRTABLE
.hword .WMASK420_W32_LASX - .WMASK420_LASX_JRTABLE
.hword .WMASK420_W16_LASX - .WMASK420_LASX_JRTABLE
.hword .WMASK420_W8_LASX - .WMASK420_LASX_JRTABLE
.hword .WMASK420_W4_LASX - .WMASK420_LASX_JRTABLE
.WMASK420_W4_LASX:
xvld xr0, a2, 0
xvld xr1, a3, 0
addi.w a5, a5, -4
xvabsd.h xr2, xr0, xr1
xvaddi.hu xr2, xr2, 8
xvsrli.h xr2, xr2, 8
xvadd.h xr2, xr2, xr22
xvmin.hu xr3, xr2, xr20
xvsub.h xr4, xr20, xr3
xvmulwev.w.h xr5, xr3, xr0
xvmulwod.w.h xr6, xr3, xr0
xvmaddwev.w.h xr5, xr4, xr1
xvmaddwod.w.h xr6, xr4, xr1
xvilvl.w xr7, xr6, xr5
xvilvh.w xr8, xr6, xr5
xvssrarni.hu.w xr8, xr7, 10
xvssrlni.bu.h xr9, xr8, 0
vstelm.w vr9, a0, 0, 0
add.d a0, a0, a1
vstelm.w vr9, a0, 0, 1
add.d a0, a0, a1
xvstelm.w xr9, a0, 0, 4
add.d a0, a0, a1
xvstelm.w xr9, a0, 0, 5
add.d a0, a0, a1
xvhaddw.w.h xr3, xr3, xr3
xvpermi.d xr4, xr3, 0xb1
xvadd.h xr3, xr3, xr4
xvpickev.h xr3, xr3, xr3
xvsub.h xr3, xr3, xr21
xvssrarni.bu.h xr3, xr3, 2
vstelm.h vr3, a6, 0, 0
xvstelm.h xr3, a6, 2, 8
addi.d a2, a2, 32
addi.d a3, a3, 32
addi.d a6, a6, 4
blt zero, a5, .WMASK420_W4_LASX
b .END_W420_LASX
.WMASK420_W8_LASX:
xvld xr0, a2, 0
xvld xr1, a2, 32
xvld xr2, a3, 0
xvld xr3, a3, 32
addi.w a5, a5, -4
xvabsd.h xr4, xr0, xr2
xvabsd.h xr5, xr1, xr3
xvaddi.hu xr4, xr4, 8
xvaddi.hu xr5, xr5, 8
xvsrli.h xr4, xr4, 8
xvsrli.h xr5, xr5, 8
xvadd.h xr4, xr4, xr22
xvadd.h xr5, xr5, xr22
xvmin.hu xr6, xr4, xr20
xvmin.hu xr7, xr5, xr20
xvsub.h xr8, xr20, xr6
xvsub.h xr9, xr20, xr7
xvmulwev.w.h xr10, xr6, xr0
xvmulwod.w.h xr11, xr6, xr0
xvmulwev.w.h xr12, xr7, xr1
xvmulwod.w.h xr13, xr7, xr1
xvmaddwev.w.h xr10, xr8, xr2
xvmaddwod.w.h xr11, xr8, xr2
xvmaddwev.w.h xr12, xr9, xr3
xvmaddwod.w.h xr13, xr9, xr3
xvssrarni.hu.w xr12, xr10, 10
xvssrarni.hu.w xr13, xr11, 10
xvssrlni.bu.h xr13, xr12, 0
xvshuf4i.w xr1, xr13, 0x4E
xvilvl.b xr17, xr1, xr13
vstelm.d vr17, a0, 0, 0
add.d a0, a0, a1
xvstelm.d xr17, a0, 0, 2
add.d a0, a0, a1
xvstelm.d xr17, a0, 0, 1
add.d a0, a0, a1
xvstelm.d xr17, a0, 0, 3
add.d a0, a0, a1
xvhaddw.w.h xr6, xr6, xr6
xvhaddw.w.h xr7, xr7, xr7
xvpickev.h xr8, xr7, xr6
xvpermi.q xr9, xr8, 0x01
vadd.h vr8, vr8, vr9
vsub.h vr8, vr8, vr21
vssrarni.bu.h vr8, vr8, 2
vstelm.d vr8, a6, 0, 0
addi.d a2, a2, 64
addi.d a3, a3, 64
addi.d a6, a6, 8
blt zero, a5, .WMASK420_W8_LASX
b .END_W420_LASX
.WMASK420_W16_LASX:
xvld xr0, a2, 0
xvld xr1, a2, 32
xvld xr2, a3, 0
xvld xr3, a3, 32
addi.w a5, a5, -2
xvabsd.h xr4, xr0, xr2
xvabsd.h xr5, xr1, xr3
xvaddi.hu xr4, xr4, 8
xvaddi.hu xr5, xr5, 8
xvsrli.h xr4, xr4, 8
xvsrli.h xr5, xr5, 8
xvadd.h xr4, xr4, xr22
xvadd.h xr5, xr5, xr22
xvmin.hu xr4, xr4, xr20
xvmin.hu xr5, xr5, xr20
xvsub.h xr6, xr20, xr4
xvsub.h xr7, xr20, xr5
xvmulwev.w.h xr8, xr4, xr0
xvmulwod.w.h xr9, xr4, xr0
xvmulwev.w.h xr10, xr5, xr1
xvmulwod.w.h xr11, xr5, xr1
xvmaddwev.w.h xr8, xr6, xr2
xvmaddwod.w.h xr9, xr6, xr2
xvmaddwev.w.h xr10, xr7, xr3
xvmaddwod.w.h xr11, xr7, xr3
xvssrarni.hu.w xr10, xr8, 10
xvssrarni.hu.w xr11, xr9, 10
xvssrlni.bu.h xr11, xr10, 0
xvshuf4i.w xr8, xr11, 0x4E
xvilvl.b xr15, xr8, xr11
xvpermi.d xr16, xr15, 0xd8
vst vr16, a0, 0
add.d a0, a0, a1
xvpermi.q xr16, xr16, 0x01
vst vr16, a0, 0
add.d a0, a0, a1
xvhaddw.w.h xr4, xr4, xr4
xvhaddw.w.h xr5, xr5, xr5
xvadd.h xr4, xr5, xr4
xvpickev.h xr6, xr4, xr4
xvpermi.d xr7, xr6, 0x08
vsub.h vr7, vr7, vr21
vssrarni.bu.h vr7, vr7, 2
vstelm.d vr7, a6, 0, 0
addi.d a2, a2, 64
addi.d a3, a3, 64
addi.d a6, a6, 8
blt zero, a5, .WMASK420_W16_LASX
b .END_W420_LASX
.WMASK420_W32_LASX:
.WMASK420_W64_LASX:
.WMASK420_W128_LASX:
.LOOP_W32_420_LASX:
add.d t1, a2, zero
add.d t2, a3, zero
add.d t3, a0, zero
add.d t4, a6, zero
alsl.d t5, a4, t1, 1
alsl.d t6, a4, t2, 1
or t7, a4, a4
.W32_420_LASX:
xvld xr0, t1, 0
xvld xr1, t2, 0
xvld xr2, t5, 0
xvld xr3, t6, 0
addi.d t1, t1, 32
addi.d t2, t2, 32
addi.d t5, t5, 32
addi.d t6, t6, 32
addi.w t7, t7, -16
xvabsd.h xr4, xr0, xr1
xvabsd.h xr5, xr2, xr3
xvaddi.hu xr4, xr4, 8
xvaddi.hu xr5, xr5, 8
xvsrli.h xr4, xr4, 8
xvsrli.h xr5, xr5, 8
xvadd.h xr4, xr4, xr22
xvadd.h xr5, xr5, xr22
xvmin.hu xr6, xr4, xr20
xvmin.hu xr7, xr5, xr20
xvsub.h xr8, xr20, xr6
xvsub.h xr9, xr20, xr7
xvmulwev.w.h xr10, xr6, xr0
xvmulwod.w.h xr11, xr6, xr0
xvmulwev.w.h xr12, xr7, xr2
xvmulwod.w.h xr13, xr7, xr2
xvmaddwev.w.h xr10, xr8, xr1
xvmaddwod.w.h xr11, xr8, xr1
xvmaddwev.w.h xr12, xr9, xr3
xvmaddwod.w.h xr13, xr9, xr3
xvssrarni.hu.w xr12, xr10, 10
xvssrarni.hu.w xr13, xr11, 10
xvssrlni.bu.h xr13, xr12, 0
xvshuf4i.w xr10, xr13, 0x4E
xvilvl.b xr17, xr10, xr13
xvpermi.d xr18, xr17, 0x08
xvpermi.d xr19, xr17, 0x0d
vst vr18, t3, 0
vstx vr19, t3, a1
addi.d t3, t3, 16
xvhaddw.w.h xr6, xr6, xr6
xvhaddw.w.h xr7, xr7, xr7
xvadd.h xr6, xr7, xr6
xvpickev.h xr7, xr6, xr6
xvpermi.d xr8, xr7, 0x08
vsub.h vr9, vr8, vr21
vssrarni.bu.h vr9, vr9, 2
vstelm.d vr9, t4, 0, 0
addi.d t4, t4, 8
bne t7, zero, .W32_420_LASX
alsl.d a2, a4, a2, 2
alsl.d a3, a4, a3, 2
alsl.d a0, a1, a0, 1
srai.w t8, a4, 1
add.d a6, a6, t8
addi.w a5, a5, -2
blt zero, a5, .LOOP_W32_420_LASX
.END_W420_LASX:
endfunc
#undef bpc_sh
#undef bpcw_sh
.macro vhaddw.d.h in0
vhaddw.w.h \in0, \in0, \in0
vhaddw.d.w \in0, \in0, \in0
.endm
.macro vhaddw.q.w in0
vhaddw.d.w \in0, \in0, \in0
vhaddw.q.d \in0, \in0, \in0
.endm
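// Helper pseudo-ops: vhaddw.d.h reduces groups of four 16-bit lanes to 64-bit
// horizontal sums, vhaddw.q.w reduces groups of four 32-bit lanes to a single
// 128-bit sum, each in two widening steps. PUT_H_8W below evaluates the 8-tap
// horizontal filter for 8 output pixels of one row and adds the rounding bias
// kept in vr9.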
.macro PUT_H_8W in0
vshuf.b vr2, \in0, \in0, vr6
vshuf.b vr3, \in0, \in0, vr7
vshuf.b vr4, \in0, \in0, vr8
vmulwev.h.bu.b vr12, vr2, vr10
vmulwev.h.bu.b vr13, vr3, vr11
vmulwev.h.bu.b vr14, vr3, vr10
vmulwev.h.bu.b vr15, vr4, vr11
vmaddwod.h.bu.b vr12, vr2, vr10
vmaddwod.h.bu.b vr13, vr3, vr11
vmaddwod.h.bu.b vr14, vr3, vr10
vmaddwod.h.bu.b vr15, vr4, vr11
vadd.h vr12, vr12, vr13
vadd.h vr14, vr14, vr15
vhaddw.w.h vr12, vr12, vr12
vhaddw.w.h vr14, vr14, vr14
vpickev.h \in0, vr14, vr12
vadd.h \in0, \in0, vr9
.endm
const subpel_h_shuf0
.byte 0, 1, 2, 3, 1, 2, 3, 4, 16, 17, 18, 19, 17, 18, 19, 20
endconst
const subpel_h_shuf1
.byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
endconst
const subpel_h_shuf2
.byte 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
.byte 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
endconst
const subpel_h_shuf3
.byte 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
.byte 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
endconst
.macro FILTER_8TAP_8W in0
vshuf.b vr13, \in0, \in0, vr7
vshuf.b vr14, \in0, \in0, vr11
vshuf.b vr15, \in0, \in0, vr12
vmulwev.h.bu.b vr16, vr13, vr8
vmulwev.h.bu.b vr17, vr14, vr10
vmulwev.h.bu.b vr18, vr14, vr8
vmulwev.h.bu.b vr19, vr15, vr10
vmaddwod.h.bu.b vr16, vr13, vr8
vmaddwod.h.bu.b vr17, vr14, vr10
vmaddwod.h.bu.b vr18, vr14, vr8
vmaddwod.h.bu.b vr19, vr15, vr10
vadd.h vr16, vr16, vr17
vadd.h vr18, vr18, vr19
vhaddw.w.h vr16, vr16, vr16
vhaddw.w.h \in0, vr18, vr18
vssrarni.h.w \in0, vr16, 2
.endm
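/*
A minimal C sketch of the horizontal-only put path generated by the macro
below (8 bpc; fh denotes the 8 selected horizontal taps and clip_u8 an
illustrative clamp). The +34 bias loaded into vr9 is 32 + 2, i.e. the 6-bit
rounding plus the folded rounding of the 2-bit intermediate shift, and the
final vssrani shift is 6:

    for (int x = 0; x < w; x++) {
        int sum = 0;
        for (int k = 0; k < 8; k++)
            sum += src[x + k - 3] * fh[k];
        dst[x] = clip_u8((sum + 34) >> 6);
    }

The vertical-only and combined h+v paths follow the same pattern with the
shifts split across the two passes (FILTER_8TAP_8W above keeps a rounded
>> 2 intermediate).
*/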
.macro PUT_8TAP_8BPC_LSX lable
li.w t0, 4
la.local t6, dav1d_mc_subpel_filters
slli.d t2, a3, 1 //src_stride*2
add.d t3, t2, a3 //src_stride*3
slli.d t4, t2, 1 //src_stride*4
bnez a6, .l_\lable\()put_h //mx
bnez a7, .l_\lable\()put_v //my
clz.w t1, a4
li.w t5, 24
sub.w t1, t1, t5
la.local t5, .l_\lable\()put_hv0_jtable
alsl.d t1, t1, t5, 3
ld.d t6, t1, 0
add.d t5, t5, t6
jirl $r0, t5, 0
.align 3
.l_\lable\()put_hv0_jtable:
.dword .l_\lable\()put_hv0_128w - .l_\lable\()put_hv0_jtable
.dword .l_\lable\()put_hv0_64w - .l_\lable\()put_hv0_jtable
.dword .l_\lable\()put_hv0_32w - .l_\lable\()put_hv0_jtable
.dword .l_\lable\()put_hv0_16w - .l_\lable\()put_hv0_jtable
.dword .l_\lable\()put_hv0_8w - .l_\lable\()put_hv0_jtable
.dword .l_\lable\()put_hv0_4w - .l_\lable\()put_hv0_jtable
.dword .l_\lable\()put_hv0_2w - .l_\lable\()put_hv0_jtable
.l_\lable\()put_hv0_2w:
vldrepl.h vr0, a2, 0
add.d a2, a2, a3
vldrepl.h vr1, a2, 0
vstelm.h vr0, a0, 0, 0
add.d a0, a0, a1
vstelm.h vr1, a0, 0, 0
add.d a2, a2, a3
add.d a0, a0, a1
addi.w a5, a5, -2
bnez a5, .l_\lable\()put_hv0_2w
b .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_4w:
fld.s f0, a2, 0
fldx.s f1, a2, a3
fst.s f0, a0, 0
fstx.s f1, a0, a1
alsl.d a2, a3, a2, 1
alsl.d a0, a1, a0, 1
addi.w a5, a5, -2
bnez a5, .l_\lable\()put_hv0_4w
b .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_8w:
fld.d f0, a2, 0
fldx.d f1, a2, a3
fst.d f0, a0, 0
fstx.d f1, a0, a1
alsl.d a2, a3, a2, 1
alsl.d a0, a1, a0, 1
addi.w a5, a5, -2
bnez a5, .l_\lable\()put_hv0_8w
b .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_16w:
vld vr0, a2, 0
vldx vr1, a2, a3
vst vr0, a0, 0
vstx vr1, a0, a1
alsl.d a2, a3, a2, 1
alsl.d a0, a1, a0, 1
addi.w a5, a5, -2
bnez a5, .l_\lable\()put_hv0_16w
b .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_32w:
vld vr0, a2, 0
vld vr1, a2, 16
add.d a2, a2, a3
vld vr2, a2, 0
vld vr3, a2, 16
vst vr0, a0, 0
vst vr1, a0, 16
add.d a0, a0, a1
vst vr2, a0, 0
vst vr3, a0, 16
add.d a2, a2, a3
add.d a0, a0, a1
addi.w a5, a5, -2
bnez a5, .l_\lable\()put_hv0_32w
b .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_64w:
vld vr0, a2, 0
vld vr1, a2, 16
vld vr2, a2, 32
vld vr3, a2, 48
add.d a2, a2, a3
vld vr4, a2, 0
vld vr5, a2, 16
vld vr6, a2, 32
vld vr7, a2, 48
add.d a2, a2, a3
vst vr0, a0, 0
vst vr1, a0, 16
vst vr2, a0, 32
vst vr3, a0, 48
add.d a0, a0, a1
vst vr4, a0, 0
vst vr5, a0, 16
vst vr6, a0, 32
vst vr7, a0, 48
add.d a0, a0, a1
addi.w a5, a5, -2
bnez a5, .l_\lable\()put_hv0_64w
b .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_128w:
vld vr0, a2, 0
vld vr1, a2, 16
vld vr2, a2, 32
vld vr3, a2, 48
vld vr4, a2, 64
vld vr5, a2, 80
vld vr6, a2, 96
vld vr7, a2, 112
add.d a2, a2, a3
vld vr8, a2, 0
vld vr9, a2, 16
vld vr10, a2, 32
vld vr11, a2, 48
vld vr12, a2, 64
vld vr13, a2, 80
vld vr14, a2, 96
vld vr15, a2, 112
add.d a2, a2, a3
vst vr0, a0, 0
vst vr1, a0, 16
vst vr2, a0, 32
vst vr3, a0, 48
vst vr4, a0, 64
vst vr5, a0, 80
vst vr6, a0, 96
vst vr7, a0, 112
add.d a0, a0, a1
vst vr8, a0, 0
vst vr9, a0, 16
vst vr10, a0, 32
vst vr11, a0, 48
vst vr12, a0, 64
vst vr13, a0, 80
vst vr14, a0, 96
vst vr15, a0, 112
add.d a0, a0, a1
addi.w a5, a5, -2
bnez a5, .l_\lable\()put_hv0_128w
b .l_\lable\()end_put_8tap
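// Horizontal/vertical filter selection: the filter row is looked up in
// dav1d_mc_subpel_filters at filter_set * 120 + (frac - 1) * 8 bytes
// (15 fractional positions x 8 taps per set). When the filtered dimension is
// 4 or smaller, the 4-tap sets (indices 3 and 4) are used instead of the
// regular/smooth/sharp 8-tap sets.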
.l_\lable\()put_h:
bnez a7, .l_\lable\()put_hv // both fh (mx) and fv (my) present
ld.d t5, sp, 0 //filter_type
andi t1, t5, 3
blt t0, a4, .l_\lable\()put_h_idx_fh
andi t1, t5, 1
addi.w t1, t1, 3
.l_\lable\()put_h_idx_fh:
addi.w t5, zero, 120
mul.w t1, t1, t5
addi.w t5, a6, -1
slli.w t5, t5, 3
add.w t1, t1, t5
add.d t7, t6, t1 //fh's offset
li.w t1, 34
vreplgr2vr.h vr9, t1
clz.w t1, a4
li.w t5, 24
sub.w t1, t1, t5
la.local t5, .l_\lable\()put_h_jtable
alsl.d t1, t1, t5, 3
ld.d t6, t1, 0
add.d t5, t5, t6
jirl $r0, t5, 0
.align 3
.l_\lable\()put_h_jtable:
.dword .l_\lable\()put_h_128w - .l_\lable\()put_h_jtable
.dword .l_\lable\()put_h_64w - .l_\lable\()put_h_jtable
.dword .l_\lable\()put_h_32w - .l_\lable\()put_h_jtable
.dword .l_\lable\()put_h_16w - .l_\lable\()put_h_jtable
.dword .l_\lable\()put_h_8w - .l_\lable\()put_h_jtable
.dword .l_\lable\()put_h_4w - .l_\lable\()put_h_jtable
.dword .l_\lable\()put_h_2w - .l_\lable\()put_h_jtable
.l_\lable\()put_h_2w:
addi.d t7, t7, 2
addi.d a2, a2, -1
vldrepl.w vr8, t7, 0
la.local t7, subpel_h_shuf0
vld vr7, t7, 0
.l_\lable\()put_h_2w_loop:
vld vr0, a2, 0
vldx vr1, a2, a3
add.d a2, a2, t2
vshuf.b vr0, vr1, vr0, vr7
vdp2.h.bu.b vr1, vr0, vr8
vhaddw.w.h vr0, vr1, vr1
vpickev.h vr0, vr0, vr0
vadd.h vr0, vr0, vr9
vssrani.bu.h vr0, vr0, 6
vstelm.h vr0, a0, 0, 0
add.d a0, a0, a1
vstelm.h vr0, a0, 0, 1
add.d a0, a0, a1
addi.w a5, a5, -2
bnez a5, .l_\lable\()put_h_2w_loop
b .l_\lable\()end_put_8tap
.l_\lable\()put_h_4w:
addi.d t7, t7, 2
addi.d a2, a2, -1
vldrepl.w vr8, t7, 0
la.local t7, subpel_h_shuf1
vld vr7, t7, 0
.l_\lable\()put_h_4w_loop:
vld vr0, a2, 0
vldx vr1, a2, a3
add.d a2, a2, t2
vshuf.b vr0, vr0, vr0, vr7
vshuf.b vr1, vr1, vr1, vr7
vmulwev.h.bu.b vr2, vr0, vr8
vmulwev.h.bu.b vr3, vr1, vr8
vmaddwod.h.bu.b vr2, vr0, vr8
vmaddwod.h.bu.b vr3, vr1, vr8
vhaddw.w.h vr0, vr2, vr2
vhaddw.w.h vr1, vr3, vr3
vpickev.h vr0, vr1, vr0
vadd.h vr0, vr0, vr9
vssrani.bu.h vr0, vr0, 6
vstelm.w vr0, a0, 0, 0
add.d a0, a0, a1
vstelm.w vr0, a0, 0, 1
add.d a0, a0, a1
addi.d a5, a5, -2
bnez a5, .l_\lable\()put_h_4w_loop
b .l_\lable\()end_put_8tap
.l_\lable\()put_h_8w:
fld.d f10, t7, 0
vreplvei.w vr11, vr10, 1
vreplvei.w vr10, vr10, 0
la.local t7, subpel_h_shuf1
vld vr6, t7, 0
vaddi.bu vr7, vr6, 4
vaddi.bu vr8, vr6, 8
addi.d a2, a2, -3
.l_\lable\()put_h_8w_loop:
vld vr0, a2, 0
vldx vr1, a2, a3
add.d a2, a2, t2
PUT_H_8W vr0
PUT_H_8W vr1
vssrani.bu.h vr1, vr0, 6
vstelm.d vr1, a0, 0, 0
add.d a0, a0, a1
vstelm.d vr1, a0, 0, 1
add.d a0, a0, a1
addi.w a5, a5, -2
bnez a5, .l_\lable\()put_h_8w_loop
b .l_\lable\()end_put_8tap
.l_\lable\()put_h_16w:
.l_\lable\()put_h_32w:
.l_\lable\()put_h_64w:
.l_\lable\()put_h_128w:
fld.d f10, t7, 0
vreplvei.w vr11, vr10, 1
vreplvei.w vr10, vr10, 0
la.local t7, subpel_h_shuf1
vld vr6, t7, 0
vaddi.bu vr7, vr6, 4
vaddi.bu vr8, vr6, 8
addi.d a2, a2, -3
addi.d t0, a2, 0 //src
addi.w t5, a5, 0 //h
addi.d t8, a0, 0 //dst
.l_\lable\()put_h_16w_loop:
vld vr0, a2, 0
vld vr1, a2, 8
add.d a2, a2, a3
PUT_H_8W vr0
PUT_H_8W vr1
vssrani.bu.h vr1, vr0, 6
vst vr1, a0, 0
add.d a0, a0, a1
addi.d a5, a5, -1
bnez a5, .l_\lable\()put_h_16w_loop
addi.d a2, t0, 16
addi.d t0, t0, 16
addi.d a0, t8, 16
addi.d t8, t8, 16
addi.w a5, t5, 0
addi.w a4, a4, -16
bnez a4, .l_\lable\()put_h_16w_loop
b .l_\lable\()end_put_8tap
.l_\lable\()put_v:
ld.d t1, sp, 0 //filter_type
srli.w t1, t1, 2
blt t0, a5, .l_\lable\()put_v_idx_fv
andi t1, t1, 1
addi.w t1, t1, 3
.l_\lable\()put_v_idx_fv:
addi.w t5, zero, 120
mul.w t1, t1, t5
addi.w t5, a7, -1
slli.w t5, t5, 3
add.w t1, t1, t5
add.d t1, t6, t1 //fv's offset
vldrepl.d vr8, t1, 0
sub.d a2, a2, t3
vilvl.h vr8, vr8, vr8
vreplvei.w vr9, vr8, 1
vreplvei.w vr10, vr8, 2
vreplvei.w vr11, vr8, 3
vreplvei.w vr8, vr8, 0
clz.w t1, a4
li.w t5, 24
sub.w t1, t1, t5
la.local t5, .l_\lable\()put_v_jtable
alsl.d t1, t1, t5, 3
ld.d t6, t1, 0
add.d t5, t5, t6
jirl $r0, t5, 0
.align 3
.l_\lable\()put_v_jtable:
.dword .l_\lable\()put_v_128w - .l_\lable\()put_v_jtable
.dword .l_\lable\()put_v_64w - .l_\lable\()put_v_jtable
.dword .l_\lable\()put_v_32w - .l_\lable\()put_v_jtable
.dword .l_\lable\()put_v_16w - .l_\lable\()put_v_jtable
.dword .l_\lable\()put_v_8w - .l_\lable\()put_v_jtable
.dword .l_\lable\()put_v_4w - .l_\lable\()put_v_jtable
.dword .l_\lable\()put_v_2w - .l_\lable\()put_v_jtable
.l_\lable\()put_v_2w:
fld.s f0, a2, 0
fldx.s f1, a2, a3
fldx.s f2, a2, t2
add.d a2, a2, t3
fld.s f3, a2, 0
fldx.s f4, a2, a3
fldx.s f5, a2, t2
fldx.s f6, a2, t3
add.d a2, a2, t4
vilvl.h vr0, vr1, vr0 //0 1
vilvl.h vr1, vr2, vr1 //1 2
vilvl.b vr0, vr1, vr0 //01 12
vilvl.h vr2, vr3, vr2 //2 3
vilvl.h vr3, vr4, vr3 //3 4
vilvl.b vr1, vr3, vr2 //23 34
vilvl.h vr2, vr5, vr4 //4 5
vilvl.h vr3, vr6, vr5 //5 6
vilvl.b vr2, vr3, vr2 //45 56
.l_\lable\()put_v_2w_loop:
fld.s f7, a2, 0
vilvl.h vr3, vr7, vr6 //6 7
fldx.s f6, a2, a3
add.d a2, a2, t2
vilvl.h vr4, vr6, vr7 //7 8
vilvl.b vr3, vr4, vr3 //67 78
vmulwev.h.bu.b vr12, vr0, vr8
vmulwev.h.bu.b vr13, vr1, vr9
vmulwev.h.bu.b vr14, vr2, vr10
vmulwev.h.bu.b vr15, vr3, vr11
vmaddwod.h.bu.b vr12, vr0, vr8
vmaddwod.h.bu.b vr13, vr1, vr9
vmaddwod.h.bu.b vr14, vr2, vr10
vmaddwod.h.bu.b vr15, vr3, vr11
vaddi.hu vr0, vr1, 0
vaddi.hu vr1, vr2, 0
vaddi.hu vr2, vr3, 0
vadd.h vr12, vr12, vr13
vadd.h vr12, vr12, vr14
vadd.h vr12, vr12, vr15
vssrarni.bu.h vr12, vr12, 6
vstelm.h vr12, a0, 0, 0
add.d a0, a0, a1
vstelm.h vr12, a0, 0, 1
add.d a0, a0, a1
addi.w a5, a5, -2
bnez a5, .l_\lable\()put_v_2w_loop
b .l_\lable\()end_put_8tap
.l_\lable\()put_v_4w:
fld.s f0, a2, 0
fldx.s f1, a2, a3
fldx.s f2, a2, t2
add.d a2, a2, t3
fld.s f3, a2, 0
fldx.s f4, a2, a3
fldx.s f5, a2, t2
fldx.s f6, a2, t3
add.d a2, a2, t4
vilvl.w vr0, vr1, vr0
vilvl.w vr1, vr2, vr1
vilvl.b vr0, vr1, vr0
vilvl.w vr1, vr3, vr2
vilvl.w vr2, vr4, vr3
vilvl.b vr1, vr2, vr1
vilvl.w vr2, vr5, vr4
vilvl.w vr3, vr6, vr5
vilvl.b vr2, vr3, vr2
.l_\lable\()put_v_4w_loop:
fld.s f7, a2, 0
vilvl.w vr3, vr7, vr6
fldx.s f6, a2, a3
add.d a2, a2, t2
vilvl.w vr4, vr6, vr7
vilvl.b vr3, vr4, vr3
vmulwev.h.bu.b vr12, vr0, vr8
vmulwev.h.bu.b vr13, vr1, vr9
vmulwev.h.bu.b vr14, vr2, vr10
vmulwev.h.bu.b vr15, vr3, vr11
vmaddwod.h.bu.b vr12, vr0, vr8
vmaddwod.h.bu.b vr13, vr1, vr9
vmaddwod.h.bu.b vr14, vr2, vr10
vmaddwod.h.bu.b vr15, vr3, vr11
vaddi.hu vr0, vr1, 0
vaddi.hu vr1, vr2, 0
vaddi.hu vr2, vr3, 0
vadd.h vr12, vr12, vr13
vadd.h vr12, vr12, vr14
vadd.h vr12, vr12, vr15
vssrarni.bu.h vr12, vr12, 6
vstelm.w vr12, a0, 0, 0
add.d a0, a0, a1
vstelm.w vr12, a0, 0, 1
add.d a0, a0, a1
addi.w a5, a5, -2
bnez a5, .l_\lable\()put_v_4w_loop
b .l_\lable\()end_put_8tap
.l_\lable\()put_v_8w:
.l_\lable\()put_v_16w:
.l_\lable\()put_v_32w:
.l_\lable\()put_v_64w:
.l_\lable\()put_v_128w:
addi.d t0, a2, 0 //src
addi.d t5, a5, 0 //h
addi.d t8, a0, 0 //dst
.l_\lable\()put_v_8w_loop0:
fld.d f0, a2, 0
fldx.d f1, a2, a3
fldx.d f2, a2, t2
add.d a2, a2, t3
fld.d f3, a2, 0
fldx.d f4, a2, a3
fldx.d f5, a2, t2
fldx.d f6, a2, t3
add.d a2, a2, t4
vilvl.b vr0, vr1, vr0 //0 1
vilvl.b vr1, vr2, vr1 //1 2
vilvl.b vr2, vr3, vr2 //2 3
vilvl.b vr3, vr4, vr3 //3 4
vilvl.b vr4, vr5, vr4 //4 5
vilvl.b vr5, vr6, vr5 //5 6
.l_\lable\()put_v_8w_loop:
fld.d f7, a2, 0
vilvl.b vr12, vr7, vr6 //6 7
fldx.d f6, a2, a3
add.d a2, a2, t2
vilvl.b vr13, vr6, vr7 //7 8
vmulwev.h.bu.b vr14, vr0, vr8
vmulwev.h.bu.b vr15, vr1, vr8
vmulwev.h.bu.b vr16, vr2, vr9
vmulwev.h.bu.b vr17, vr3, vr9
vmulwev.h.bu.b vr18, vr4, vr10
vmulwev.h.bu.b vr19, vr5, vr10
vmulwev.h.bu.b vr20, vr12, vr11
vmulwev.h.bu.b vr21, vr13, vr11
vmaddwod.h.bu.b vr14, vr0, vr8
vmaddwod.h.bu.b vr15, vr1, vr8
vmaddwod.h.bu.b vr16, vr2, vr9
vmaddwod.h.bu.b vr17, vr3, vr9
vmaddwod.h.bu.b vr18, vr4, vr10
vmaddwod.h.bu.b vr19, vr5, vr10
vmaddwod.h.bu.b vr20, vr12, vr11
vmaddwod.h.bu.b vr21, vr13, vr11
vaddi.hu vr0, vr2, 0
vaddi.hu vr1, vr3, 0
vaddi.hu vr2, vr4, 0
vaddi.hu vr3, vr5, 0
vaddi.hu vr4, vr12, 0
vaddi.hu vr5, vr13, 0
vadd.h vr14, vr14, vr16
vadd.h vr14, vr14, vr18
vadd.h vr14, vr14, vr20
vadd.h vr15, vr15, vr17
vadd.h vr15, vr15, vr19
vadd.h vr15, vr15, vr21
vssrarni.bu.h vr15, vr14, 6
vstelm.d vr15, a0, 0, 0
add.d a0, a0, a1
vstelm.d vr15, a0, 0, 1
add.d a0, a0, a1
addi.w a5, a5, -2
bnez a5, .l_\lable\()put_v_8w_loop
addi.d a2, t0, 8
addi.d t0, t0, 8
addi.d a0, t8, 8
addi.d t8, t8, 8
addi.d a5, t5, 0
addi.w a4, a4, -8
bnez a4, .l_\lable\()put_v_8w_loop0
b .l_\lable\()end_put_8tap
.l_\lable\()put_hv:
ld.d t5, sp, 0 //filter_type
andi t1, t5, 3
blt t0, a4, .l_\lable\()put_hv_idx_fh
andi t1, t5, 1
addi.w t1, t1, 3
.l_\lable\()put_hv_idx_fh:
addi.w t5, zero, 120
mul.w t1, t1, t5
addi.w t5, a6, -1
slli.w t5, t5, 3
add.w t1, t1, t5
add.d t1, t6, t1 //fh's offset
vldrepl.d vr8, t1, 0
ld.d t1, sp, 0 //filter_type
srli.w t1, t1, 2
blt t0, a5, .l_\lable\()put_hv_idx_fv
andi t1, t1, 1
addi.w t1, t1, 3
.l_\lable\()put_hv_idx_fv:
addi.w t5, zero, 120
mul.w t1, t1, t5
addi.w t5, a7, -1
slli.w t5, t5, 3
add.w t1, t1, t5
add.d t1, t6, t1 //fv's offset
vldrepl.d vr9, t1, 0
vexth.h.b vr9, vr9
sub.d a2, a2, t3
addi.d a2, a2, -3
clz.w t1, a4
li.w t5, 24
sub.w t1, t1, t5
la.local t5, .l_\lable\()put_hv_jtable
alsl.d t1, t1, t5, 3
ld.d t6, t1, 0
add.d t5, t5, t6
jirl $r0, t5, 0
.align 3
.l_\lable\()put_hv_jtable:
.dword .l_\lable\()put_hv_128w - .l_\lable\()put_hv_jtable
.dword .l_\lable\()put_hv_64w - .l_\lable\()put_hv_jtable
.dword .l_\lable\()put_hv_32w - .l_\lable\()put_hv_jtable
.dword .l_\lable\()put_hv_16w - .l_\lable\()put_hv_jtable
.dword .l_\lable\()put_hv_8w - .l_\lable\()put_hv_jtable
.dword .l_\lable\()put_hv_4w - .l_\lable\()put_hv_jtable
.dword .l_\lable\()put_hv_2w - .l_\lable\()put_hv_jtable
.l_\lable\()put_hv_2w:
addi.d a2, a2, 2
vld vr0, a2, 0
vldx vr1, a2, a3
vldx vr2, a2, t2
add.d a2, a2, t3
vld vr3, a2, 0
vldx vr4, a2, a3
vldx vr5, a2, t2
vldx vr6, a2, t3
add.d a2, a2, t4
la.local t1, subpel_h_shuf0
vld vr7, t1, 0
vbsrl.v vr8, vr8, 2
vreplvei.w vr8, vr8, 0
//fv
vreplvei.w vr14, vr9, 1
vreplvei.w vr15, vr9, 2
vreplvei.w vr16, vr9, 3
vreplvei.w vr9, vr9, 0
vshuf.b vr0, vr1, vr0, vr7
vshuf.b vr1, vr3, vr2, vr7
vshuf.b vr2, vr5, vr4, vr7
vshuf.b vr3, vr6, vr6, vr7
vmulwev.h.bu.b vr10, vr0, vr8
vmulwev.h.bu.b vr11, vr1, vr8
vmulwev.h.bu.b vr12, vr2, vr8
vmulwev.h.bu.b vr13, vr3, vr8
vmaddwod.h.bu.b vr10, vr0, vr8
vmaddwod.h.bu.b vr11, vr1, vr8
vmaddwod.h.bu.b vr12, vr2, vr8
vmaddwod.h.bu.b vr13, vr3, vr8
vhaddw.w.h vr0, vr10, vr10
vhaddw.w.h vr1, vr11, vr11
vssrarni.h.w vr1, vr0, 2 //h0 h1 h2 h3
vhaddw.w.h vr2, vr12, vr12
vhaddw.w.h vr3, vr13, vr13
vssrarni.h.w vr3, vr2, 2 //h4 h5 h6 ~
vbsrl.v vr2, vr1, 4
vextrins.w vr2, vr3, 0x30 //h1 h2 h3 h4
vilvl.h vr4, vr2, vr1 //h0 h1 h1 h2 --
vilvh.h vr5, vr2, vr1 //h2 h3 h3 h4 --
vbsrl.v vr6, vr3, 4
vilvl.h vr6, vr6, vr3 //h4 h5 h5 h6 --
vbsrl.v vr3, vr3, 8 //h6 ~
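// Scaling in the h+v put paths (here and below): the horizontal pass keeps
// 16-bit intermediates rounded by >> 2, the vertical pass accumulates 32-bit
// sums and packs with a rounding >> 10 (2 + 10 = 12 = log2(64 * 64), two
// passes of a 64-scaled filter), then saturates to 8-bit pixels.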
.l_\lable\()put_hv_2w_loop:
vld vr0, a2, 0
vldx vr2, a2, a3
add.d a2, a2, t2
vshuf.b vr0, vr2, vr0, vr7
vdp2.h.bu.b vr17, vr0, vr8
vhaddw.w.h vr17, vr17, vr17
vssrarni.h.w vr17, vr17, 2 //h7 h8
vextrins.w vr3, vr17, 0x10 //h6 h7
vilvl.h vr3, vr17, vr3 //h6 h7 h7 h8 --
vmulwev.w.h vr18, vr4, vr9
vmulwev.w.h vr19, vr5, vr14
vmulwev.w.h vr20, vr6, vr15
vmulwev.w.h vr21, vr3, vr16
vmaddwod.w.h vr18, vr4, vr9
vmaddwod.w.h vr19, vr5, vr14
vmaddwod.w.h vr20, vr6, vr15
vmaddwod.w.h vr21, vr3, vr16
vaddi.hu vr4, vr5, 0
vaddi.hu vr5, vr6, 0
vaddi.hu vr6, vr3, 0
vbsrl.v vr3, vr17, 4 //h8 ~
vadd.w vr18, vr18, vr19
vadd.w vr18, vr18, vr20
vadd.w vr18, vr18, vr21
vssrarni.hu.w vr0, vr18, 10
vssrani.bu.h vr0, vr0, 0
vstelm.h vr0, a0, 0, 0
add.d a0, a0, a1
vstelm.h vr0, a0, 0, 1
add.d a0, a0, a1
addi.d a5, a5, -2
bnez a5, .l_\lable\()put_hv_2w_loop
b .l_\lable\()end_put_8tap
.l_\lable\()put_hv_4w:
addi.d a2, a2, 2 //skip the 4-tap filter's two leading zero taps
vld vr0, a2, 0
vldx vr1, a2, a3
vldx vr2, a2, t2
add.d a2, a2, t3
vld vr3, a2, 0
vldx vr4, a2, a3
vldx vr5, a2, t2
vldx vr6, a2, t3
add.d a2, a2, t4
la.local t1, subpel_h_shuf1
vld vr7, t1, 0
vbsrl.v vr8, vr8, 2
vreplvei.w vr8, vr8, 0
//fv
vreplvei.w vr17, vr9, 0
vreplvei.w vr18, vr9, 1
vreplvei.w vr19, vr9, 2
vreplvei.w vr20, vr9, 3
//DAV1D_FILTER_8TAP_RND
vshuf.b vr0, vr0, vr0, vr7
vshuf.b vr1, vr1, vr1, vr7
vshuf.b vr2, vr2, vr2, vr7
vshuf.b vr3, vr3, vr3, vr7
vshuf.b vr4, vr4, vr4, vr7
vshuf.b vr5, vr5, vr5, vr7
vshuf.b vr6, vr6, vr6, vr7
vmulwev.h.bu.b vr10, vr0, vr8
vmulwev.h.bu.b vr11, vr1, vr8
vmulwev.h.bu.b vr12, vr2, vr8
vmulwev.h.bu.b vr13, vr3, vr8
vmulwev.h.bu.b vr14, vr4, vr8
vmulwev.h.bu.b vr15, vr5, vr8
vmulwev.h.bu.b vr16, vr6, vr8
vmaddwod.h.bu.b vr10, vr0, vr8
vmaddwod.h.bu.b vr11, vr1, vr8
vmaddwod.h.bu.b vr12, vr2, vr8
vmaddwod.h.bu.b vr13, vr3, vr8
vmaddwod.h.bu.b vr14, vr4, vr8
vmaddwod.h.bu.b vr15, vr5, vr8
vmaddwod.h.bu.b vr16, vr6, vr8
vhaddw.w.h vr10, vr10, vr10
vhaddw.w.h vr11, vr11, vr11
vhaddw.w.h vr12, vr12, vr12
vhaddw.w.h vr13, vr13, vr13
vhaddw.w.h vr14, vr14, vr14
vhaddw.w.h vr15, vr15, vr15
vhaddw.w.h vr16, vr16, vr16
vssrarni.h.w vr10, vr10, 2 //h0
vssrarni.h.w vr11, vr11, 2 //h1
vssrarni.h.w vr12, vr12, 2 //h2
vssrarni.h.w vr13, vr13, 2 //h3
vssrarni.h.w vr14, vr14, 2 //h4
vssrarni.h.w vr15, vr15, 2 //h5
vssrarni.h.w vr16, vr16, 2 //h6
//h0
vilvl.h vr0, vr11, vr10 //01
vilvl.h vr1, vr13, vr12 //23
vilvl.h vr2, vr15, vr14 //45
//h1
vilvl.h vr4, vr12, vr11 //12
vilvl.h vr5, vr14, vr13 //34
vilvl.h vr6, vr16, vr15 //56
.l_\lable\()put_hv_4w_loop:
vld vr9, a2, 0
vldx vr10, a2, a3
add.d a2, a2, t2
//DAV1D_FILTER_8TAP_CLIP
vshuf.b vr9, vr9, vr9, vr7
vshuf.b vr10, vr10, vr10, vr7
vmulwev.h.bu.b vr11, vr9, vr8
vmulwev.h.bu.b vr12, vr10, vr8
vmaddwod.h.bu.b vr11, vr9, vr8
vmaddwod.h.bu.b vr12, vr10, vr8
vhaddw.w.h vr11, vr11, vr11
vhaddw.w.h vr12, vr12, vr12
vssrarni.h.w vr11, vr11, 2 //h7
vssrarni.h.w vr12, vr12, 2 //h8
vilvl.h vr3, vr11, vr16 //67
vilvl.h vr13, vr12, vr11 //78
vmulwev.w.h vr9, vr0, vr17
vmulwev.w.h vr10, vr1, vr18
vmulwev.w.h vr14, vr2, vr19
vmulwev.w.h vr15, vr3, vr20
vmaddwod.w.h vr9, vr0, vr17
vmaddwod.w.h vr10, vr1, vr18
vmaddwod.w.h vr14, vr2, vr19
vmaddwod.w.h vr15, vr3, vr20
vadd.w vr16, vr9, vr10
vadd.w vr16, vr16, vr14
vadd.w vr16, vr16, vr15
vmulwev.w.h vr9, vr4, vr17
vmulwev.w.h vr10, vr5, vr18
vmulwev.w.h vr14, vr6, vr19
vmulwev.w.h vr15, vr13, vr20
vmaddwod.w.h vr9, vr4, vr17
vmaddwod.w.h vr10, vr5, vr18
vmaddwod.w.h vr14, vr6, vr19
vmaddwod.w.h vr15, vr13, vr20
vadd.w vr21, vr9, vr10
vadd.w vr21, vr21, vr14
vadd.w vr21, vr21, vr15
vssrarni.hu.w vr21, vr16, 10
vssrani.bu.h vr21, vr21, 0
//cache
vaddi.hu vr0, vr1, 0
vaddi.hu vr1, vr2, 0
vaddi.hu vr2, vr3, 0
vaddi.hu vr4, vr5, 0
vaddi.hu vr5, vr6, 0
vaddi.hu vr6, vr13, 0
vaddi.hu vr16, vr12, 0
vstelm.w vr21, a0, 0, 0
add.d a0, a0, a1
vstelm.w vr21, a0, 0, 1
add.d a0, a0, a1
addi.w a5, a5, -2
bnez a5, .l_\lable\()put_hv_4w_loop
b .l_\lable\()end_put_8tap
.l_\lable\()put_hv_8w:
.l_\lable\()put_hv_16w:
.l_\lable\()put_hv_32w:
.l_\lable\()put_hv_64w:
.l_\lable\()put_hv_128w:
addi.d sp, sp, -8*8
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
fst.d f28, sp, 32
fst.d f29, sp, 40
fst.d f30, sp, 48
fst.d f31, sp, 56
addi.d t0, a2, 0 //src
addi.d t5, a5, 0 //h
addi.d t8, a0, 0 //dst
la.local t1, subpel_h_shuf1
vld vr7, t1, 0
vaddi.bu vr11, vr7, 4
vaddi.bu vr12, vr7, 8
vreplvei.w vr10, vr8, 1
vreplvei.w vr8, vr8, 0
vreplvei.w vr20, vr9, 1
vreplvei.w vr21, vr9, 2
vreplvei.w vr22, vr9, 3
vreplvei.w vr9, vr9, 0
.l_\lable\()put_hv_8w_loop0:
vld vr0, a2, 0
vldx vr1, a2, a3
vldx vr2, a2, t2
add.d a2, a2, t3
vld vr3, a2, 0
vldx vr4, a2, a3
vldx vr5, a2, t2
vldx vr6, a2, t3
add.d a2, a2, t4
FILTER_8TAP_8W vr0 //h0
FILTER_8TAP_8W vr1 //h1
FILTER_8TAP_8W vr2 //h2
FILTER_8TAP_8W vr3 //h3
FILTER_8TAP_8W vr4 //h4
FILTER_8TAP_8W vr5 //h5
FILTER_8TAP_8W vr6 //h6
//h0' low part
vilvl.h vr23, vr1, vr0 //01
vilvl.h vr24, vr3, vr2 //23
vilvl.h vr25, vr5, vr4 //45
//h0' high part
vilvh.h vr26, vr1, vr0 //01
vilvh.h vr27, vr3, vr2 //23
vilvh.h vr28, vr5, vr4 //45
//h1' low part
vilvl.h vr29, vr2, vr1 //12
vilvl.h vr30, vr4, vr3 //34
vilvl.h vr31, vr6, vr5 //56
//h1' high part
vilvh.h vr0, vr2, vr1 //12
vilvh.h vr1, vr4, vr3 //34
vilvh.h vr2, vr6, vr5 //56
.l_\lable\()put_hv_8w_loop:
vld vr3, a2, 0
vldx vr4, a2, a3
add.d a2, a2, t2
FILTER_8TAP_8W vr3 //h7
FILTER_8TAP_8W vr4 //h8
//h0' low part
vilvl.h vr16, vr3, vr6 //67 ~low
vmulwev.w.h vr13, vr23, vr9
vmulwev.w.h vr14, vr24, vr20
vmulwev.w.h vr15, vr25, vr21
vmulwev.w.h vr17, vr16, vr22
vmaddwod.w.h vr13, vr23, vr9
vmaddwod.w.h vr14, vr24, vr20
vmaddwod.w.h vr15, vr25, vr21
vmaddwod.w.h vr17, vr16, vr22
vadd.w vr13, vr13, vr14
vadd.w vr13, vr13, vr15
vadd.w vr13, vr13, vr17
//cache
vaddi.hu vr23, vr24, 0
vaddi.hu vr24, vr25, 0
vaddi.hu vr25, vr16, 0
//h0' high part
vilvh.h vr17, vr3, vr6 //67 ~high
vmulwev.w.h vr14, vr26, vr9
vmulwev.w.h vr15, vr27, vr20
vmulwev.w.h vr16, vr28, vr21
vmulwev.w.h vr18, vr17, vr22
vmaddwod.w.h vr14, vr26, vr9
vmaddwod.w.h vr15, vr27, vr20
vmaddwod.w.h vr16, vr28, vr21
vmaddwod.w.h vr18, vr17, vr22
vadd.w vr14, vr14, vr15
vadd.w vr14, vr14, vr16
vadd.w vr14, vr14, vr18
vssrarni.hu.w vr14, vr13, 10
vssrarni.bu.h vr5, vr14, 0
vstelm.d vr5, a0, 0, 0
add.d a0, a0, a1
//cache
vaddi.hu vr26, vr27, 0
vaddi.hu vr27, vr28, 0
vaddi.hu vr28, vr17, 0
vaddi.hu vr6, vr4, 0
vilvl.h vr5, vr4, vr3 //78 ~low
vilvh.h vr4, vr4, vr3 //78 ~high
//h1' low part
vmulwev.w.h vr13, vr29, vr9
vmulwev.w.h vr14, vr30, vr20
vmulwev.w.h vr15, vr31, vr21
vmulwev.w.h vr16, vr5, vr22
vmaddwod.w.h vr13, vr29, vr9
vmaddwod.w.h vr14, vr30, vr20
vmaddwod.w.h vr15, vr31, vr21
vmaddwod.w.h vr16, vr5, vr22
vadd.w vr13, vr13, vr14
vadd.w vr13, vr13, vr15
vadd.w vr13, vr13, vr16
//cache
vaddi.hu vr29, vr30, 0
vaddi.hu vr30, vr31, 0
vaddi.hu vr31, vr5, 0
//h1' high part
vmulwev.w.h vr14, vr0, vr9
vmulwev.w.h vr15, vr1, vr20
vmulwev.w.h vr16, vr2, vr21
vmulwev.w.h vr17, vr4, vr22
vmaddwod.w.h vr14, vr0, vr9
vmaddwod.w.h vr15, vr1, vr20
vmaddwod.w.h vr16, vr2, vr21
vmaddwod.w.h vr17, vr4, vr22
vadd.w vr14, vr14, vr15
vadd.w vr14, vr14, vr16
vadd.w vr14, vr14, vr17
vssrarni.hu.w vr14, vr13, 10
vssrarni.bu.h vr5, vr14, 0
vstelm.d vr5, a0, 0, 0
add.d a0, a0, a1
//cache
vaddi.hu vr0, vr1, 0
vaddi.hu vr1, vr2, 0
vaddi.hu vr2, vr4, 0
addi.w a5, a5, -2
bnez a5, .l_\lable\()put_hv_8w_loop
addi.d a2, t0, 8
addi.d t0, t0, 8
addi.d a0, t8, 8
addi.d t8, t8, 8
addi.d a5, t5, 0
addi.w a4, a4, -8
bnez a4, .l_\lable\()put_hv_8w_loop0
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
fld.d f28, sp, 32
fld.d f29, sp, 40
fld.d f30, sp, 48
fld.d f31, sp, 56
addi.d sp, sp, 8*8
.l_\lable\()end_put_8tap:
.endm
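// The wrappers below pass their filter-type code to the macro on the stack
// (read back at sp + 0): the low 2 bits select the horizontal filter family
// and bits 2-3 the vertical one, matching the andi/srli decoding above.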
function put_8tap_regular_8bpc_lsx
addi.d sp, sp, -16
st.d zero, sp, 0
PUT_8TAP_8BPC_LSX 0
addi.d sp, sp, 16
endfunc
function put_8tap_smooth_regular_8bpc_lsx
addi.d sp, sp, -16
li.w t0, 1
st.d t0, sp, 0
PUT_8TAP_8BPC_LSX 1
addi.d sp, sp, 16
endfunc
function put_8tap_sharp_regular_8bpc_lsx
addi.d sp, sp, -16
li.w t0, 2
st.d t0, sp, 0
PUT_8TAP_8BPC_LSX 2
addi.d sp, sp, 16
endfunc
function put_8tap_regular_smooth_8bpc_lsx
addi.d sp, sp, -16
li.w t0, 4
st.d t0, sp, 0
PUT_8TAP_8BPC_LSX 4
addi.d sp, sp, 16
endfunc
function put_8tap_smooth_8bpc_lsx
addi.d sp, sp, -16
li.w t0, 5
st.d t0, sp, 0
PUT_8TAP_8BPC_LSX 5
addi.d sp, sp, 16
endfunc
function put_8tap_sharp_smooth_8bpc_lsx
addi.d sp, sp, -16
li.w t0, 6
st.d t0, sp, 0
PUT_8TAP_8BPC_LSX 6
addi.d sp, sp, 16
endfunc
function put_8tap_regular_sharp_8bpc_lsx
addi.d sp, sp, -16
li.w t0, 8
st.d t0, sp, 0
PUT_8TAP_8BPC_LSX 8
addi.d sp, sp, 16
endfunc
function put_8tap_smooth_sharp_8bpc_lsx
addi.d sp, sp, -16
li.w t0, 9
st.d t0, sp, 0
PUT_8TAP_8BPC_LSX 9
addi.d sp, sp, 16
endfunc
function put_8tap_sharp_8bpc_lsx
addi.d sp, sp, -16
li.w t0, 10
st.d t0, sp, 0
PUT_8TAP_8BPC_LSX 10
addi.d sp, sp, 16
endfunc
const shufb1
.byte 0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8
endconst
.macro PREP_H_8W in0
vshuf.b vr2, \in0, \in0, vr6
vshuf.b vr3, \in0, \in0, vr7
vshuf.b vr4, \in0, \in0, vr8
vmulwev.h.bu.b vr12, vr2, vr22
vmulwev.h.bu.b vr13, vr3, vr23
vmulwev.h.bu.b vr14, vr3, vr22
vmulwev.h.bu.b vr15, vr4, vr23
vmaddwod.h.bu.b vr12, vr2, vr22
vmaddwod.h.bu.b vr13, vr3, vr23
vmaddwod.h.bu.b vr14, vr3, vr22
vmaddwod.h.bu.b vr15, vr4, vr23
vadd.h vr12, vr12, vr13
vadd.h vr14, vr14, vr15
vhaddw.w.h vr12, vr12, vr12
vhaddw.w.h \in0, vr14, vr14
vssrarni.h.w \in0, vr12, 2
.endm
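/*
 * Scalar reference for PREP_H_8W (and its 256-bit twin PREP_HV_8W_LASX below):
 * a sketch of our own, assuming dav1d's usual 8 bpc prep conventions (filter
 * taps summing to 64, 16-bit intermediates scaled by 1 << 4); the helper name
 * is ours, not dav1d API.
 *
 *     #include <stdint.h>
 *
 *     static void prep_h_row_ref(int16_t *tmp, const uint8_t *src,
 *                                const int8_t fh[8], int w)
 *     {
 *         for (int x = 0; x < w; x++) {
 *             int sum = 0;
 *             for (int k = 0; k < 8; k++)
 *                 sum += fh[k] * src[x - 3 + k];  // 8-tap horizontal filter
 *             tmp[x] = (int16_t)((sum + 2) >> 2); // round to 16-bit intermediate
 *         }
 *     }
 *
 * The macros reach the same result with even/odd widening multiplies
 * (vmulwev/vmaddwod), horizontal adds, and a saturating rounding pack
 * (vssrarni.h.w ..., 2).
 */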
.macro PREP_HV_8W_LASX in0
xvshuf.b xr4, \in0, \in0, xr19
xvshuf.b xr5, \in0, \in0, xr20
xvshuf.b xr6, \in0, \in0, xr21
xvmulwev.h.bu.b xr7, xr4, xr22
xvmulwev.h.bu.b xr9, xr5, xr23
xvmulwev.h.bu.b xr10, xr5, xr22
xvmulwev.h.bu.b xr11, xr6, xr23
xvmaddwod.h.bu.b xr7, xr4, xr22
xvmaddwod.h.bu.b xr9, xr5, xr23
xvmaddwod.h.bu.b xr10, xr5, xr22
xvmaddwod.h.bu.b xr11, xr6, xr23
xvadd.h xr7, xr7, xr9
xvadd.h xr9, xr10, xr11
xvhaddw.w.h xr7, xr7, xr7
xvhaddw.w.h \in0, xr9, xr9
xvssrarni.h.w \in0, xr7, 2
.endm
.macro PREP_8TAP_8BPC_LASX lable
li.w t0, 4
la.local t6, dav1d_mc_subpel_filters
slli.d t2, a2, 1 //src_stride*2
add.d t3, t2, a2 //src_stride*3
slli.d t4, t2, 1 //src_stride*4
bnez a5, .l_\lable\()h_lasx //mx
bnez a6, .l_\lable\()v_lasx //my
clz.w t1, a3
li.w t5, 24
sub.w t1, t1, t5
la.local t5, .l_\lable\()prep_hv0_jtable_lasx
alsl.d t1, t1, t5, 1
ld.h t8, t1, 0
add.d t5, t5, t8
jirl $r0, t5, 0
.align 3
.l_\lable\()prep_hv0_jtable_lasx:
.hword .l_\lable\()hv0_128w_lasx - .l_\lable\()prep_hv0_jtable_lasx
.hword .l_\lable\()hv0_64w_lasx - .l_\lable\()prep_hv0_jtable_lasx
.hword .l_\lable\()hv0_32w_lasx - .l_\lable\()prep_hv0_jtable_lasx
.hword .l_\lable\()hv0_16w_lasx - .l_\lable\()prep_hv0_jtable_lasx
.hword .l_\lable\()hv0_8w_lasx - .l_\lable\()prep_hv0_jtable_lasx
.hword .l_\lable\()hv0_4w_lasx - .l_\lable\()prep_hv0_jtable_lasx
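// No-fraction prep (mx == my == 0): pixels are only widened to 16 bits and
// scaled by 1 << 4 (dav1d's 8 bpc intermediate scaling, assumed here), which
// is what the widen-and-shift-by-4 sequences below do for each width class.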
.l_\lable\()hv0_4w_lasx:
fld.s f0, a1, 0
fldx.s f1, a1, a2
fldx.s f2, a1, t2
fldx.s f3, a1, t3
add.d a1, a1, t4
xvpackev.w xr0, xr1, xr0
xvpackev.w xr1, xr3, xr2
xvpermi.q xr0, xr1, 0x02
xvsllwil.hu.bu xr0, xr0, 4
xvst xr0, a0, 0
addi.d a0, a0, 32
addi.d a4, a4, -4
bnez a4, .l_\lable\()hv0_4w_lasx
b .l_\lable\()end_pre_8tap_lasx
.l_\lable\()hv0_8w_lasx:
fld.d f0, a1, 0
fldx.d f1, a1, a2
fldx.d f2, a1, t2
fldx.d f3, a1, t3
add.d a1, a1, t4
xvpermi.q xr0, xr1, 0x02
xvpermi.q xr2, xr3, 0x02
xvsllwil.hu.bu xr0, xr0, 4
xvsllwil.hu.bu xr2, xr2, 4
xvst xr0, a0, 0
xvst xr2, a0, 32
addi.d a0, a0, 64
addi.d a4, a4, -4
bnez a4, .l_\lable\()hv0_8w_lasx
b .l_\lable\()end_pre_8tap_lasx
.l_\lable\()hv0_16w_lasx:
vld vr0, a1, 0
vldx vr1, a1, a2
vldx vr2, a1, t2
vldx vr3, a1, t3
add.d a1, a1, t4
vext2xv.hu.bu xr0, xr0
vext2xv.hu.bu xr1, xr1
vext2xv.hu.bu xr2, xr2
vext2xv.hu.bu xr3, xr3
xvslli.h xr0, xr0, 4
xvslli.h xr1, xr1, 4
xvslli.h xr2, xr2, 4
xvslli.h xr3, xr3, 4
xvst xr0, a0, 0
xvst xr1, a0, 32
xvst xr2, a0, 64
xvst xr3, a0, 96
addi.d a0, a0, 128
addi.d a4, a4, -4
bnez a4, .l_\lable\()hv0_16w_lasx
b .l_\lable\()end_pre_8tap_lasx
.l_\lable\()hv0_32w_lasx:
xvld xr0, a1, 0
xvldx xr1, a1, a2
xvldx xr2, a1, t2
xvldx xr3, a1, t3
add.d a1, a1, t4
xvpermi.d xr4, xr0, 0xD8
xvpermi.d xr5, xr1, 0xD8
xvpermi.d xr6, xr2, 0xD8
xvpermi.d xr7, xr3, 0xD8
xvpermi.d xr10, xr0, 0x32
xvpermi.d xr11, xr1, 0x32
xvpermi.d xr12, xr2, 0x32
xvpermi.d xr13, xr3, 0x32
xvsllwil.hu.bu xr0, xr4, 4
xvsllwil.hu.bu xr1, xr5, 4
xvsllwil.hu.bu xr2, xr6, 4
xvsllwil.hu.bu xr3, xr7, 4
xvsllwil.hu.bu xr4, xr10, 4
xvsllwil.hu.bu xr5, xr11, 4
xvsllwil.hu.bu xr6, xr12, 4
xvsllwil.hu.bu xr7, xr13, 4
xvst xr0, a0, 0
xvst xr4, a0, 32
xvst xr1, a0, 64
xvst xr5, a0, 96
xvst xr2, a0, 128
xvst xr6, a0, 160
xvst xr3, a0, 192
xvst xr7, a0, 224
addi.d a0, a0, 256
addi.d a4, a4, -4
bnez a4, .l_\lable\()hv0_32w_lasx
b .l_\lable\()end_pre_8tap_lasx
.l_\lable\()hv0_64w_lasx:
.l_\lable\()hv0_128w_lasx:
addi.d t0, a1, 0
addi.d t5, a4, 0
srli.w t7, a3, 5
slli.w t7, t7, 6
addi.d t8, a0, 0
.l_\lable\()hv0_32_loop_lasx:
xvld xr0, a1, 0
xvldx xr1, a1, a2
xvldx xr2, a1, t2
xvldx xr3, a1, t3
add.d a1, a1, t4
xvpermi.d xr4, xr0, 0xD8
xvpermi.d xr5, xr1, 0xD8
xvpermi.d xr6, xr2, 0xD8
xvpermi.d xr7, xr3, 0xD8
xvpermi.d xr10, xr0, 0x32
xvpermi.d xr11, xr1, 0x32
xvpermi.d xr12, xr2, 0x32
xvpermi.d xr13, xr3, 0x32
xvsllwil.hu.bu xr0, xr4, 4
xvsllwil.hu.bu xr1, xr5, 4
xvsllwil.hu.bu xr2, xr6, 4
xvsllwil.hu.bu xr3, xr7, 4
xvsllwil.hu.bu xr4, xr10, 4
xvsllwil.hu.bu xr5, xr11, 4
xvsllwil.hu.bu xr6, xr12, 4
xvsllwil.hu.bu xr7, xr13, 4
xvst xr0, a0, 0
xvst xr4, a0, 32
add.d t1, a0, t7
xvst xr1, t1, 0
xvst xr5, t1, 32
add.d t1, t1, t7
xvst xr2, t1, 0
xvst xr6, t1, 32
add.d t1, t1, t7
xvst xr3, t1, 0
xvst xr7, t1, 32
add.d a0, t1, t7
addi.d a4, a4, -4
bnez a4, .l_\lable\()hv0_32_loop_lasx
addi.d a1, t0, 32
addi.d t0, t0, 32
addi.d a0, t8, 64
addi.d t8, t8, 64
addi.d a4, t5, 0
addi.d a3, a3, -32
bnez a3, .l_\lable\()hv0_32_loop_lasx
b .l_\lable\()end_pre_8tap_lasx
.l_\lable\()h_lasx:
bnez a6, .l_\lable\()hv_lasx //if (fh && fv)
andi t1, a7, 3
blt t0, a3, .l_\lable\()h_idx_fh_lasx
andi t1, a7, 1
addi.w t1, t1, 3
.l_\lable\()h_idx_fh_lasx:
addi.w t5, zero, 120
mul.w t1, t1, t5
addi.w t5, a5, -1
slli.w t5, t5, 3
add.w t1, t1, t5
add.d t1, t6, t1 //fh's offset
xvldrepl.d xr22, t1, 0
addi.d a1, a1, -3
clz.w t1, a3
li.w t5, 24
sub.w t1, t1, t5
la.local t5, .l_\lable\()prep_h_jtable_lasx
alsl.d t1, t1, t5, 1
ld.h t8, t1, 0
add.d t5, t5, t8
jirl $r0, t5, 0
.align 3
.l_\lable\()prep_h_jtable_lasx:
.hword .l_\lable\()h_128w_lasx - .l_\lable\()prep_h_jtable_lasx
.hword .l_\lable\()h_64w_lasx - .l_\lable\()prep_h_jtable_lasx
.hword .l_\lable\()h_32w_lasx - .l_\lable\()prep_h_jtable_lasx
.hword .l_\lable\()h_16w_lasx - .l_\lable\()prep_h_jtable_lasx
.hword .l_\lable\()h_8w_lasx - .l_\lable\()prep_h_jtable_lasx
.hword .l_\lable\()h_4w_lasx - .l_\lable\()prep_h_jtable_lasx
.l_\lable\()h_4w_lasx:
addi.d a1, a1, 2
la.local t7, subpel_h_shuf1
vld vr7, t7, 0
xvreplve0.q xr7, xr7
xvbsrl.v xr22, xr22, 2
xvreplve0.w xr22, xr22
.l_\lable\()h_4w_loop_lasx:
vld vr0, a1, 0
vldx vr1, a1, a2
vldx vr2, a1, t2
vldx vr3, a1, t3
add.d a1, a1, t4
xvpermi.q xr1, xr0, 0x20
xvpermi.q xr3, xr2, 0x20
xvshuf.b xr1, xr1, xr1, xr7
xvshuf.b xr3, xr3, xr3, xr7
xvmulwev.h.bu.b xr0, xr1, xr22
xvmulwev.h.bu.b xr2, xr3, xr22
xvmaddwod.h.bu.b xr0, xr1, xr22
xvmaddwod.h.bu.b xr2, xr3, xr22
xvhaddw.w.h xr0, xr0, xr0
xvhaddw.w.h xr2, xr2, xr2
xvssrarni.h.w xr2, xr0, 2
xvpermi.d xr2, xr2, 0xd8
xvst xr2, a0, 0
addi.d a0, a0, 32
addi.w a4, a4, -4
bnez a4, .l_\lable\()h_4w_loop_lasx
b .l_\lable\()end_pre_8tap_lasx
.l_\lable\()h_8w_lasx:
la.local t7, subpel_h_shuf1
vld vr6, t7, 0
vbsrl.v vr23, vr22, 4 //fh
xvreplve0.w xr23, xr23
xvreplve0.w xr22, xr22
xvreplve0.q xr19, xr6
xvaddi.bu xr20, xr19, 4
xvaddi.bu xr21, xr19, 8
.l_\lable\()h_8w_loop_lasx:
xvld xr0, a1, 0
xvldx xr1, a1, a2
add.d a1, a1, t2
xvpermi.q xr0, xr1, 0x02
PREP_HV_8W_LASX xr0
xvst xr0, a0, 0
addi.d a0, a0, 32
addi.d a4, a4, -2
bnez a4, .l_\lable\()h_8w_loop_lasx
b .l_\lable\()end_pre_8tap_lasx
.l_\lable\()h_16w_lasx:
la.local t7, subpel_h_shuf1
vld vr6, t7, 0
vbsrl.v vr23, vr22, 4 //fh
xvreplve0.w xr23, xr23
xvreplve0.w xr22, xr22
xvreplve0.q xr19, xr6
xvaddi.bu xr20, xr19, 4
xvaddi.bu xr21, xr19, 8
.l_\lable\()h_16w_loop_lasx:
xvld xr0, a1, 0
xvld xr1, a1, 8
add.d a1, a1, a2
xvpermi.q xr0, xr1, 0x02
PREP_HV_8W_LASX xr0
xvst xr0, a0, 0
xvld xr0, a1, 0
xvld xr1, a1, 8
add.d a1, a1, a2
xvpermi.q xr0, xr1, 0x02
PREP_HV_8W_LASX xr0
xvst xr0, a0, 32
addi.d a0, a0, 64
addi.w a4, a4, -2
bnez a4, .l_\lable\()h_16w_loop_lasx
b .l_\lable\()end_pre_8tap_lasx
.l_\lable\()h_32w_lasx:
.l_\lable\()h_64w_lasx:
.l_\lable\()h_128w_lasx:
la.local t7, subpel_h_shuf1
vld vr6, t7, 0
vbsrl.v vr23, vr22, 4 //fh
xvreplve0.w xr23, xr23
xvreplve0.w xr22, xr22
xvreplve0.q xr19, xr6
xvaddi.bu xr20, xr19, 4
xvaddi.bu xr21, xr19, 8
addi.d t5, a1, 0 //src
addi.d t6, a3, 0 //w
slli.w t7, a3, 1 //store offset
addi.d t8, a0, 0 //dst
.l_\lable\()h_16_loop_lasx:
xvld xr0, a1, 0
xvld xr1, a1, 8
xvpermi.q xr0, xr1, 0x02
PREP_HV_8W_LASX xr0
xvst xr0, a0, 0
xvld xr0, a1, 16
xvld xr1, a1, 24
xvpermi.q xr0, xr1, 0x02
PREP_HV_8W_LASX xr0
xvst xr0, a0, 32
addi.d a0, a0, 64
addi.d a1, a1, 32
addi.d a3, a3, -32
bnez a3, .l_\lable\()h_16_loop_lasx
add.d a1, t5, a2
add.d t5, t5, a2
add.d a0, t8, t7
add.d t8, t8, t7
addi.d a3, t6, 0
addi.d a4, a4, -1
bnez a4, .l_\lable\()h_16_loop_lasx
b .l_\lable\()end_pre_8tap_lasx
.l_\lable\()hv_lasx:
andi t1, a7, 3
blt t0, a3, .l_\lable\()hv_idx_fh_lasx
andi t1, a7, 1
addi.w t1, t1, 3
.l_\lable\()hv_idx_fh_lasx:
addi.w t5, zero, 120
mul.w t1, t1, t5
addi.w t5, a5, -1
slli.w t5, t5, 3
add.w t1, t1, t5
add.d t1, t6, t1 //fh's offset
xvldrepl.d xr22, t1, 0
srli.w a7, a7, 2
blt t0, a4, .l_\lable\()hv_idx_fv_lasx
andi a7, a7, 1
addi.w a7, a7, 3
.l_\lable\()hv_idx_fv_lasx:
addi.w t5, zero, 120
mul.w a7, a7, t5
addi.w t5, a6, -1
slli.w t5, t5, 3
add.w a7, a7, t5
add.d a7, t6, a7 //fv's offset
xvldrepl.d xr8, a7, 0
xvsllwil.h.b xr8, xr8, 0
sub.d a1, a1, t3
addi.d a1, a1, -1 //only -1 for w == 4: the 4-tap filter's two leading zero taps are skipped
beq a3, t0, .l_\lable\()hv_4w_lasx
addi.d a1, a1, -2
b .l_\lable\()hv_8w_lasx
.l_\lable\()hv_4w_lasx:
xvld xr0, a1, 0
xvldx xr1, a1, a2
xvldx xr2, a1, t2
xvldx xr3, a1, t3
add.d a1, a1, t4
xvld xr4, a1, 0
xvldx xr5, a1, a2
xvldx xr6, a1, t2
la.local t1, subpel_h_shuf2
xvld xr7, t1, 0
vbsrl.v vr22, vr22, 2
xvreplve0.w xr22, xr22
xvreplve0.q xr8, xr8
xvrepl128vei.w xr12, xr8, 0
xvrepl128vei.w xr13, xr8, 1
xvrepl128vei.w xr14, xr8, 2
xvrepl128vei.w xr15, xr8, 3
xvilvl.d xr0, xr1, xr0
xvilvl.d xr2, xr3, xr2
xvilvl.d xr4, xr5, xr4
xvreplve0.q xr0, xr0
xvreplve0.q xr2, xr2
xvreplve0.q xr4, xr4
xvreplve0.q xr6, xr6
xvshuf.b xr0, xr0, xr0, xr7
xvshuf.b xr2, xr2, xr2, xr7
xvshuf.b xr4, xr4, xr4, xr7
xvshuf.b xr6, xr6, xr6, xr7
xvmulwev.h.bu.b xr1, xr0, xr22
xvmulwev.h.bu.b xr3, xr2, xr22
xvmulwev.h.bu.b xr5, xr4, xr22
xvmulwev.h.bu.b xr9, xr6, xr22
xvmaddwod.h.bu.b xr1, xr0, xr22
xvmaddwod.h.bu.b xr3, xr2, xr22
xvmaddwod.h.bu.b xr5, xr4, xr22
xvmaddwod.h.bu.b xr9, xr6, xr22
xvhaddw.w.h xr1, xr1, xr1 // a0 b0 a1 b1 c0 d0 c1 d1
xvhaddw.w.h xr3, xr3, xr3 // a2 b2 a3 b3 c2 d2 c3 d3
xvhaddw.w.h xr5, xr5, xr5 // a4 b4 a5 b5 c4 d4 c5 d5
xvhaddw.w.h xr9, xr9, xr9 // a6 b6 - - c6 d6 - -
xvssrarni.h.w xr3, xr1, 2 // a0 b0 a1 b1 a2 b2 a3 b3 c0 d0 c1 d1 c2 d2 c3 d3
xvssrarni.h.w xr9, xr5, 2 // a4 b4 a5 b5 a6 b6 - - c4 d4 c5 d5 c6 d6 - -
xvbsrl.v xr4, xr3, 4
xvextrins.w xr4, xr9, 0x30 // a1 b1 a2 b2 a3 b3 a4 b4 c1 d1 c2 d2 c3 d3 c4 d4
xvilvl.h xr5, xr4, xr3 // a0 a1 b0 b1 a1 a2 b1 b2 c0 c1 d0 d1 c1 c2 d1 d2
xvilvh.h xr6, xr4, xr3 // a2 a3 b2 b3 a3 a4 b3 b4 c2 c3 d2 d3 c3 c4 d3 d4
xvbsrl.v xr10, xr9, 4 // a5 b5 a6 b6 - - - - c5 d5 c6 d6 - - - -
xvilvl.h xr11, xr10, xr9 // a4 a5 b4 b5 a5 a6 b5 b6 c4 c5 d4 d5 c5 c6 d5 d6
.l_\lable\()hv_w4_loop_lasx:
xvmulwev.w.h xr16, xr5, xr12 //a0 a1 (h0)
xvmulwev.w.h xr17, xr6, xr12 //a2 a3 (h1)
xvmulwev.w.h xr18, xr6, xr13 //a2 a3 (h0)
xvmulwev.w.h xr19, xr11, xr13 //a4 a5 (h1)
xvmulwev.w.h xr20, xr11, xr14 //a4 a5 (h0)
xvmaddwod.w.h xr16, xr5, xr12 //
xvmaddwod.w.h xr17, xr6, xr12 //
xvmaddwod.w.h xr18, xr6, xr13 //
xvmaddwod.w.h xr19, xr11, xr13 //
xvmaddwod.w.h xr20, xr11, xr14 //
xvaddi.wu xr5, xr11, 0
xvadd.w xr16, xr16, xr18 //a0 a1 + a2 a3
xvldx xr18, a1, t3 //a7 b7 c7 d7
add.d a1, a1, t4
xvadd.w xr17, xr17, xr19 //a2 a3 + a4 a5
xvld xr19, a1, 0 //a8 b8 c8 d8
xvadd.w xr16, xr16, xr20 //a0 a1 + a2 a3 + a4 a5
xvldx xr20, a1, a2 //a9 b9 c9 d9
xvilvl.d xr18, xr19, xr18
xvreplve0.q xr18, xr18
xvldx xr19, a1, t2 //aa ba ca da
xvilvl.d xr20, xr19, xr20
xvreplve0.q xr20, xr20
xvshuf.b xr18, xr18, xr18, xr7
xvshuf.b xr20, xr20, xr20, xr7
xvmulwev.h.bu.b xr21, xr18, xr22
xvmulwev.h.bu.b xr23, xr20, xr22
xvmaddwod.h.bu.b xr21, xr18, xr22
xvmaddwod.h.bu.b xr23, xr20, xr22
xvhaddw.w.h xr21, xr21, xr21 //a7 b7 a8 b8 c7 d7 c8 d8
xvhaddw.w.h xr23, xr23, xr23 //a9 b9 aa ba c9 d9 ca da
xvssrarni.h.w xr23, xr21, 2 //a7 b7 a8 b8 a9 b9 aa ba c7 d7 c8 d8 c9 d9 ca da
xvbsll.v xr0, xr23, 4
xvextrins.w xr0, xr9, 0x02 //a6 b6 a7 b7 a8 b8 a9 b9 c6 d6 c7 d7 c8 d8 c9 d9
xvilvl.h xr6, xr23, xr0 //a6 a7 b6 b7 a7 a8 b7 b8 c6 c7 d6 d7 c7 c8 d7 d8
xvilvh.h xr11, xr23, xr0 //a8 a9 b8 b9 a9 aa b9 ba c8 c9 d8 d9 c9 ca d9 da
xvbsrl.v xr9, xr23, 4
xvmulwev.w.h xr1 , xr6, xr14 //a6 a7 (h0)
xvmulwev.w.h xr2 , xr6, xr15 //a6 a7 (h1)
xvmulwev.w.h xr3 , xr11, xr15 //a8 a9 (h1)
xvmaddwod.w.h xr1 , xr6, xr14
xvmaddwod.w.h xr2 , xr6, xr15
xvmaddwod.w.h xr3 , xr11, xr15
xvadd.w xr17, xr17, xr1 //a2 a3 + a4 a5 + a6 a7
xvadd.w xr16, xr16, xr2 //a0 a1 + a2 a3 + a4 a5 + a6 a7
xvadd.w xr17, xr17, xr3 //a2 a3 + a4 a5 + a6 a7 + a8 a9
xvssrarni.h.w xr17, xr16, 6 //a01 b01 a12 b12 a23 b23 a34 b34 c01 d01 c12 d12 c23 d23 c34 d34
xvpermi.d xr17, xr17, 0xd8 //a01 b01 a12 b12 c01 d01 c12 d12 a23 b23 a34 b34 c23 d23 c34 d34
xvshuf4i.w xr17, xr17, 0xd8
xvst xr17, a0, 0
addi.d a0, a0, 32
addi.d a4, a4, -4
bnez a4, .l_\lable\()hv_w4_loop_lasx
b .l_\lable\()end_pre_8tap_lasx
.l_\lable\()hv_8w_lasx:
addi.d sp, sp, -4*8
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
la.local t1, subpel_h_shuf1
vld vr19, t1, 0
addi.d t0, a1, 0
addi.d t5, a4, 0
slli.w t7, a3, 1 // store offset
addi.d t8, a0, 0
xvreplve0.q xr19, xr19
xvaddi.bu xr20, xr19, 4
xvaddi.bu xr21, xr19, 8
vbsrl.v vr23, vr22, 4
xvreplve0.w xr22, xr22 //f0f1f2f3
xvreplve0.w xr23, xr23 //f4f5f6f7
xvreplve0.q xr8, xr8
xvrepl128vei.w xr24, xr8, 0
xvrepl128vei.w xr25, xr8, 1
xvrepl128vei.w xr26, xr8, 2
xvrepl128vei.w xr27, xr8, 3
.l_\lable\()hv_8w_loop0_lasx:
xvld xr0, a1, 0
xvldx xr1, a1, a2
xvldx xr2, a1, t2
add.d a1, a1, t3
xvld xr3, a1, 0
xvldx xr4, a1, a2
xvldx xr5, a1, t2
xvldx xr6, a1, t3
add.d a1, a1, t4
xvpermi.q xr0, xr3, 0x02 //0 3
xvpermi.q xr1, xr4, 0x02 //1 4
xvpermi.q xr2, xr5, 0x02 //2 5
xvpermi.q xr3, xr6, 0x02 //3 6
PREP_HV_8W_LASX xr0 //a0b0c0d0 e0f0g0h0 a3b3c3d3 e3f3g3h3
PREP_HV_8W_LASX xr1 //a1b1c1d1 e1f1g1h1 a4b4c4d4 e4f4g4h4
PREP_HV_8W_LASX xr2 //a2b2c2d2 e2f2g2h2 a5b5c5d5 e5f5g5h5
PREP_HV_8W_LASX xr3 //a3b3c3d3 e3f3g3h3 a6b6c6d6 e6f6g6h6
xvpermi.d xr0, xr0, 0xd8
xvpermi.d xr1, xr1, 0xd8
xvpermi.d xr2, xr2, 0xd8
xvpermi.d xr18, xr3, 0xd8
xvilvl.h xr12, xr1, xr0 //a0a1b0b1c0c1d0d1 e0e1f0f1g0g1h0h1
xvilvh.h xr13, xr1, xr0 //a3a4b3b4c3c4d3d4 e3e4f3f4g3g4h3h4
xvilvl.h xr14, xr2, xr1 //a1a2b1b2c1c2d1d2 e1e2f1f2g1g2h1h2
xvilvh.h xr15, xr2, xr1 //a4a5b4b5c4c5d4d5 e4e5f4f5g4g5h4h5
xvilvl.h xr16, xr18, xr2 //a2a3b2b3c2c3d2d3 e2e3f2f3g2g3h2h3
xvilvh.h xr17, xr18, xr2 //a5a6b5b6c5c6d5d6 e5e6f5f6g5g6h5h6
.l_\lable\()hv_8w_loop_lasx:
xvld xr0, a1, 0
xvldx xr1, a1, a2
add.d a1, a1, t2
xvpermi.q xr0, xr1, 0x02 //7 8
PREP_HV_8W_LASX xr0 //a7b7c7d7e7f7g7h7 a8b8c8d8e8f8g8h8
xvpermi.q xr3, xr0, 0x03 //a6b6c6d6e6f6g6h6 a7b7c7d7e7f7g7h7
xvpermi.d xr3, xr3, 0xd8 //a6b6c6d6a7b7c7d7 e6f6g6h6e7f7g7h7
xvpermi.d xr1, xr0, 0xd8 //a7b7c7d7a8b8c8d8 e7f7g7h7e8f8g8h8
xvilvl.h xr18, xr1, xr3 //a6a7b6b7c6c7d6d7 e6e7f6f7g6g7h6h7
xvilvh.h xr2, xr1, xr3 //a7a8b7b8c7c8d7d8 e7e8f7f8g7g8h7h8
xvaddi.hu xr3, xr0, 0
xvmulwev.w.h xr4, xr12, xr24 //01
xvmulwev.w.h xr5, xr14, xr24 //12
xvmulwev.w.h xr6, xr16, xr25 //23
xvmulwev.w.h xr7, xr13, xr25 //34
xvmulwev.w.h xr8, xr15, xr26 //45
xvmulwev.w.h xr9, xr17, xr26 //56
xvmulwev.w.h xr10, xr18, xr27 //67
xvmulwev.w.h xr11, xr2, xr27 //78
xvmaddwod.w.h xr4, xr12, xr24 //01
xvmaddwod.w.h xr5, xr14, xr24 //12
xvmaddwod.w.h xr6, xr16, xr25 //23
xvmaddwod.w.h xr7, xr13, xr25 //34
xvmaddwod.w.h xr8, xr15, xr26 //45
xvmaddwod.w.h xr9, xr17, xr26 //56
xvmaddwod.w.h xr10, xr18, xr27 //67
xvmaddwod.w.h xr11, xr2, xr27 //78
xvadd.w xr4, xr4, xr6
xvadd.w xr5, xr5, xr7
xvadd.w xr4, xr4, xr8
xvadd.w xr5, xr5, xr9
xvadd.w xr4, xr4, xr10
xvadd.w xr5, xr5, xr11
xvaddi.hu xr12, xr16, 0 //01 <-- 23
xvaddi.hu xr14, xr13, 0 //12 <-- 34
xvaddi.hu xr16, xr15, 0 //23 <-- 45
xvaddi.hu xr13, xr17, 0 //34 <-- 56
xvaddi.hu xr15, xr18, 0 //45 <-- 67
xvaddi.hu xr17, xr2, 0 //56 <-- 78
xvssrarni.h.w xr5, xr4, 6
xvpermi.d xr5, xr5, 0xd8
vst vr5, a0, 0
xvpermi.q xr5, xr5, 0x11
vstx vr5, a0, t7
alsl.d a0, t7, a0, 1
addi.d a4, a4, -2
bnez a4, .l_\lable\()hv_8w_loop_lasx
addi.d a1, t0, 8
addi.d t0, t0, 8
addi.d a0, t8, 16
addi.d t8, t8, 16
addi.d a4, t5, 0
addi.d a3, a3, -8
bnez a3, .l_\lable\()hv_8w_loop0_lasx
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
addi.d sp, sp, 4*8
b .l_\lable\()end_pre_8tap_lasx
.l_\lable\()v_lasx:
srli.w a7, a7, 2
blt t0, a4, .l_\lable\()v_idx_fv_lasx
andi a7, a7, 1
addi.w a7, a7, 3
.l_\lable\()v_idx_fv_lasx:
addi.w t5, zero, 120
mul.w a7, a7, t5
addi.w t5, a6, -1
slli.w t5, t5, 3
add.w a7, a7, t5
add.d a7, t6, a7 //fv's offset
xvldrepl.d xr8, a7, 0
xvrepl128vei.h xr12, xr8, 0
xvrepl128vei.h xr13, xr8, 1
xvrepl128vei.h xr14, xr8, 2
xvrepl128vei.h xr15, xr8, 3
sub.d a1, a1, t3
beq a3, t0, .l_\lable\()v_4w_lasx
addi.w t0, t0, 4
beq a3, t0, .l_\lable\()v_8w_lasx
blt t0, a3, .l_\lable\()v_16w_lasx
.l_\lable\()v_4w_lasx:
la.local t6, subpel_h_shuf3
xvld xr11, t6, 0
fld.s f0, a1, 0 //a0b0c0d0
fldx.s f1, a1, a2 //a1b1c1d1
fldx.s f2, a1, t2 //a2b2c2d2
add.d a1, a1, t3
fld.s f3, a1, 0 //a3b3c3d3
fldx.s f4, a1, a2 //a4b4c4d4
fldx.s f5, a1, t2 //a5b5c5d5
fldx.s f6, a1, t3 //a6b6c6d6
vilvl.w vr0, vr1, vr0 //01
vilvl.w vr1, vr3, vr2 //23
vilvl.d vr0, vr1, vr0 //0123
vilvl.w vr2, vr5, vr4 //45
vilvl.d vr1, vr2, vr1 //2345
xvpermi.q xr0, xr1, 0x02 //0123 2345
xvbsrl.v xr1, xr0, 4 //123- 345-
xvpermi.q xr4, xr6, 0x02
xvextrins.w xr1, xr4, 0x30 //1234 3456
xvilvl.b xr2, xr1, xr0 //0112 2334 //a0a1b0b1c0c1d0d1 a1a2b1b2c1c2d1d2 a2a3b2b3c2c3d2d3 a3a4b3b4c3c4d3d4
xvilvh.b xr3, xr1, xr0 //2334 4556 //a2a3b2b3c2c3d2d3 a3a4b3b4c3c4d3d4 a4a5b4b5c4c5d4d5 a5a6b5b6c5c6d5d6
.l_\lable\()v_4w_loop_lasx:
add.d a1, a1, t4
fld.s f0, a1, 0 //a7b7c7d7
fldx.s f1, a1, a2 //a8b8c8d8
fldx.s f4, a1, t2 //a9b9c9d9
fldx.s f5, a1, t3 //aabacada
vilvl.w vr7, vr0, vr6 //67
vilvl.w vr10, vr4, vr1 //89
vextrins.w vr7, vr1, 0x20//678-
vextrins.w vr10, vr5, 0x20//89a-
xvpermi.q xr7, xr10, 0x02//678- 89a-
xvshuf.b xr4, xr7, xr7, xr11 //67 78 89 9a //a6a7b6b7c6c7d6d7 a7a8b7b8c7c8d7d8 a8a9b8b9c8c9d8d9 a9aab9bac9cad9da
xvpermi.q xr7, xr3, 0x11 //4556
xvpermi.q xr7, xr4, 0x02 //45 56 67 78 //a4a5b4b5c4c5d4d5 a5a6b5b6c5c6d5d6 a6a7b6b7c6c7d6d7 a7a8b7b8c7c8d7d8
xvmulwev.h.bu.b xr16, xr2, xr12
xvmulwev.h.bu.b xr17, xr3, xr13
xvmulwev.h.bu.b xr18, xr7, xr14
xvmulwev.h.bu.b xr19, xr4, xr15
xvmaddwod.h.bu.b xr16, xr2, xr12
xvmaddwod.h.bu.b xr17, xr3, xr13
xvmaddwod.h.bu.b xr18, xr7, xr14
xvmaddwod.h.bu.b xr19, xr4, xr15
xvadd.h xr16, xr16, xr17
xvadd.h xr16, xr16, xr18
xvadd.h xr16, xr16, xr19
xvsrari.h xr16, xr16, 2
xvaddi.bu xr2, xr7, 0
xvaddi.bu xr3, xr4, 0
xvaddi.bu xr6, xr5, 0
xvst xr16, a0, 0
addi.d a0, a0, 32
addi.w a4, a4, -4
bnez a4, .l_\lable\()v_4w_loop_lasx
b .l_\lable\()end_pre_8tap_lasx
.l_\lable\()v_8w_lasx:
fld.d f0, a1, 0
fldx.d f1, a1, a2
fldx.d f2, a1, t2
add.d a1, a1, t3
fld.d f3, a1, 0
fldx.d f4, a1, a2
fldx.d f5, a1, t2
fldx.d f6, a1, t3
xvpermi.q xr0, xr1, 0x02
xvpermi.q xr1, xr2, 0x02
xvilvl.b xr0, xr1, xr0 //01 12
xvpermi.q xr2, xr3, 0x02
xvpermi.q xr3, xr4, 0x02
xvilvl.b xr2, xr3, xr2 //23 34
xvpermi.q xr4, xr5, 0x02
xvpermi.q xr5, xr6, 0x02
xvilvl.b xr4, xr5, xr4 //45 56
.l_\lable\()v_8w_loop_lasx:
add.d a1, a1, t4
fld.d f7, a1, 0 //7
fldx.d f10, a1, a2 //8
fldx.d f11, a1, t2 //9
fldx.d f18, a1, t3 //a
xvpermi.q xr6, xr7, 0x02
xvpermi.q xr7, xr10, 0x02
xvilvl.b xr6, xr7, xr6 //67 78
xvpermi.q xr10, xr11, 0x02
xvpermi.q xr11, xr18, 0x02
xvilvl.b xr10, xr11, xr10 //89 9a
xvmulwev.h.bu.b xr1, xr0, xr12
xvmulwev.h.bu.b xr3, xr2, xr13
xvmulwev.h.bu.b xr5, xr4, xr14
xvmulwev.h.bu.b xr7, xr6, xr15
xvmulwev.h.bu.b xr9, xr2, xr12
xvmulwev.h.bu.b xr11, xr4, xr13
xvmulwev.h.bu.b xr16, xr6, xr14
xvmulwev.h.bu.b xr17, xr10, xr15
xvmaddwod.h.bu.b xr1, xr0, xr12
xvmaddwod.h.bu.b xr3, xr2, xr13
xvmaddwod.h.bu.b xr5, xr4, xr14
xvmaddwod.h.bu.b xr7, xr6, xr15
xvmaddwod.h.bu.b xr9, xr2, xr12
xvmaddwod.h.bu.b xr11, xr4, xr13
xvmaddwod.h.bu.b xr16, xr6, xr14
xvmaddwod.h.bu.b xr17, xr10, xr15
xvadd.h xr1, xr1, xr3
xvadd.h xr1, xr1, xr5
xvadd.h xr1, xr1, xr7
xvadd.h xr9, xr9, xr11
xvadd.h xr9, xr9, xr16
xvadd.h xr9, xr9, xr17
xvaddi.bu xr0, xr4, 0
xvaddi.bu xr2, xr6, 0
xvaddi.bu xr4, xr10, 0
xvaddi.bu xr6, xr18, 0
xvsrari.h xr1, xr1, 2
xvsrari.h xr9, xr9, 2
xvst xr1, a0, 0
xvst xr9, a0, 32
addi.d a0, a0, 64
addi.w a4, a4, -4
bnez a4, .l_\lable\()v_8w_loop_lasx
b .l_\lable\()end_pre_8tap_lasx
.l_\lable\()v_16w_lasx:
addi.d t0, a0, 0 //dst
addi.d t5, a1, 0 //src
slli.w t7, a3, 1 //tmp row stride in bytes (w * 2)
addi.d t8, a4, 0 //h
.l_\lable\()v_16w_loop0_lasx:
vld vr0, a1, 0
vldx vr1, a1, a2
vldx vr2, a1, t2
add.d a1, a1, t3
vld vr3, a1, 0
vldx vr4, a1, a2
vldx vr5, a1, t2
vldx vr6, a1, t3
add.d a1, a1, t4
xvpermi.d xr0, xr0, 0xd8
xvpermi.d xr1, xr1, 0xd8
xvpermi.d xr2, xr2, 0xd8
xvpermi.d xr3, xr3, 0xd8
xvpermi.d xr4, xr4, 0xd8
xvpermi.d xr5, xr5, 0xd8
xvpermi.d xr6, xr6, 0xd8
xvilvl.b xr0, xr1, xr0 //01
xvilvl.b xr1, xr2, xr1 //12
xvilvl.b xr2, xr3, xr2 //23
xvilvl.b xr3, xr4, xr3 //34
xvilvl.b xr4, xr5, xr4 //45
xvilvl.b xr5, xr6, xr5 //56
.l_\lable\()v_16w_loop_lasx:
vld vr7, a1, 0 //7
vldx vr10, a1, a2 //8
add.d a1, a1, t2
xvpermi.d xr7, xr7, 0xd8
xvpermi.d xr10, xr10, 0xd8
xvilvl.b xr6, xr7, xr6 //67
xvilvl.b xr7, xr10, xr7 //78
xvmulwev.h.bu.b xr9, xr0, xr12
xvmulwev.h.bu.b xr11, xr2, xr13
xvmulwev.h.bu.b xr16, xr4, xr14
xvmulwev.h.bu.b xr17, xr6, xr15
xvmulwev.h.bu.b xr18, xr1, xr12
xvmulwev.h.bu.b xr19, xr3, xr13
xvmulwev.h.bu.b xr20, xr5, xr14
xvmulwev.h.bu.b xr21, xr7, xr15
xvmaddwod.h.bu.b xr9, xr0, xr12
xvmaddwod.h.bu.b xr11, xr2, xr13
xvmaddwod.h.bu.b xr16, xr4, xr14
xvmaddwod.h.bu.b xr17, xr6, xr15
xvmaddwod.h.bu.b xr18, xr1, xr12
xvmaddwod.h.bu.b xr19, xr3, xr13
xvmaddwod.h.bu.b xr20, xr5, xr14
xvmaddwod.h.bu.b xr21, xr7, xr15
xvadd.h xr9, xr9, xr11
xvadd.h xr9, xr9, xr16
xvadd.h xr9, xr9, xr17
xvadd.h xr11, xr18, xr19
xvadd.h xr11, xr11, xr20
xvadd.h xr11, xr11, xr21
xvsrari.h xr9, xr9, 2
xvsrari.h xr11, xr11, 2
xvaddi.bu xr0, xr2, 0
xvaddi.bu xr1, xr3, 0
xvaddi.bu xr2, xr4, 0
xvaddi.bu xr3, xr5, 0
xvaddi.bu xr4, xr6, 0
xvaddi.bu xr5, xr7, 0
xvaddi.bu xr6, xr10, 0
xvst xr9, a0, 0
xvstx xr11, a0, t7
alsl.d a0, t7, a0, 1
addi.d a4, a4, -2
bnez a4, .l_\lable\()v_16w_loop_lasx
addi.d a3, a3, -16
addi.d a0, t0, 32
addi.d t0, t0, 32
addi.d a1, t5, 16
addi.d t5, t5, 16
addi.d a4, t8, 0
bnez a3, .l_\lable\()v_16w_loop0_lasx
.l_\lable\()end_pre_8tap_lasx:
.endm
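// The prep wrappers below pass their filter-type code in a7: the low 2 bits
// select the horizontal filter family and bits 2-3 the vertical one, matching
// the andi/srli decoding inside the macro.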
function prep_8tap_regular_8bpc_lasx
addi.w a7, zero, 0
PREP_8TAP_8BPC_LASX 0
endfunc
function prep_8tap_smooth_regular_8bpc_lasx
addi.w a7, zero, 1
PREP_8TAP_8BPC_LASX 1
endfunc
function prep_8tap_sharp_regular_8bpc_lasx
addi.w a7, zero, 2
PREP_8TAP_8BPC_LASX 2
endfunc
function prep_8tap_regular_smooth_8bpc_lasx
addi.w a7, zero, 4
PREP_8TAP_8BPC_LASX 4
endfunc
function prep_8tap_smooth_8bpc_lasx
addi.w a7, zero, 5
PREP_8TAP_8BPC_LASX 5
endfunc
function prep_8tap_sharp_smooth_8bpc_lasx
addi.w a7, zero, 6
PREP_8TAP_8BPC_LASX 6
endfunc
function prep_8tap_regular_sharp_8bpc_lasx
addi.w a7, zero, 8
PREP_8TAP_8BPC_LASX 8
endfunc
function prep_8tap_smooth_sharp_8bpc_lasx
addi.w a7, zero, 9
PREP_8TAP_8BPC_LASX 9
endfunc
function prep_8tap_sharp_8bpc_lasx
addi.w a7, zero, 10
PREP_8TAP_8BPC_LASX 10
endfunc
.macro PREP_8TAP_8BPC_LSX lable
li.w t0, 4
la.local t6, dav1d_mc_subpel_filters
la.local t7, shufb1
vld vr23, t7, 0
slli.d t2, a2, 1 //src_stride*2
add.d t3, t2, a2 //src_stride*3
slli.d t4, t2, 1 //src_stride*4
bnez a5, .l_\lable\()h_lsx //mx
bnez a6, .l_\lable\()v_lsx //my
clz.w t1, a3
li.w t5, 24
sub.w t1, t1, t5
la.local t5, .l_\lable\()prep_hv0_jtable_lsx
alsl.d t1, t1, t5, 1
ld.h t8, t1, 0
add.d t5, t5, t8
jirl $r0, t5, 0
.align 3
.l_\lable\()prep_hv0_jtable_lsx:
.hword .l_\lable\()hv0_128w_lsx - .l_\lable\()prep_hv0_jtable_lsx
.hword .l_\lable\()hv0_64w_lsx - .l_\lable\()prep_hv0_jtable_lsx
.hword .l_\lable\()hv0_32w_lsx - .l_\lable\()prep_hv0_jtable_lsx
.hword .l_\lable\()hv0_16w_lsx - .l_\lable\()prep_hv0_jtable_lsx
.hword .l_\lable\()hv0_8w_lsx - .l_\lable\()prep_hv0_jtable_lsx
.hword .l_\lable\()hv0_4w_lsx - .l_\lable\()prep_hv0_jtable_lsx
.l_\lable\()hv0_4w_lsx:
fld.s f0, a1, 0
fldx.s f1, a1, a2
add.d a1, a1, t2
vilvl.w vr0, vr1, vr0
vsllwil.hu.bu vr0, vr0, 4
vst vr0, a0, 0
addi.d a0, a0, 16
addi.d a4, a4, -2
bnez a4, .l_\lable\()hv0_4w_lsx
b .l_\lable\()end_pre_8tap_lsx
.l_\lable\()hv0_8w_lsx:
fld.d f0, a1, 0
fldx.d f1, a1, a2
add.d a1, a1, t2
vsllwil.hu.bu vr0, vr0, 4
vsllwil.hu.bu vr1, vr1, 4
vst vr0, a0, 0
vst vr1, a0, 16
addi.d a0, a0, 32
addi.d a4, a4, -2
bnez a4, .l_\lable\()hv0_8w_lsx
b .l_\lable\()end_pre_8tap_lsx
.l_\lable\()hv0_16w_lsx:
vld vr0, a1, 0
vldx vr1, a1, a2
add.d a1, a1, t2
vsllwil.hu.bu vr2, vr0, 4
vsllwil.hu.bu vr4, vr1, 4
vexth.hu.bu vr3, vr0
vexth.hu.bu vr5, vr1
vslli.h vr3, vr3, 4
vslli.h vr5, vr5, 4
vst vr2, a0, 0
vst vr3, a0, 16
vst vr4, a0, 32
vst vr5, a0, 48
addi.d a0, a0, 64
addi.d a4, a4, -2
bnez a4, .l_\lable\()hv0_16w_lsx
b .l_\lable\()end_pre_8tap_lsx
.l_\lable\()hv0_32w_lsx:
.l_\lable\()hv0_64w_lsx:
.l_\lable\()hv0_128w_lsx:
addi.d t0, a1, 0
addi.d t5, a4, 0
srli.w t7, a3, 4
slli.w t7, t7, 5
addi.d t8, a0, 0
.l_\lable\()hv0_16_loop_lsx:
vld vr0, a1, 0
vldx vr1, a1, a2
add.d a1, a1, t2
vsllwil.hu.bu vr2, vr0, 4
vsllwil.hu.bu vr3, vr1, 4
vexth.hu.bu vr0, vr0
vexth.hu.bu vr1, vr1
vslli.h vr0, vr0, 4
vslli.h vr1, vr1, 4
vst vr2, a0, 0
vst vr0, a0, 16
add.d a0, a0, t7
vst vr3, a0, 0
vst vr1, a0, 16
add.d a0, a0, t7
addi.d a4, a4, -2
bnez a4, .l_\lable\()hv0_16_loop_lsx
addi.d a1, t0, 16
addi.d t0, t0, 16
addi.d a0, t8, 32
addi.d t8, t8, 32
addi.d a4, t5, 0
addi.d a3, a3, -16
bnez a3, .l_\lable\()hv0_16_loop_lsx
b .l_\lable\()end_pre_8tap_lsx
.l_\lable\()h_lsx:
bnez a6, .l_\lable\()hv_lsx //if (fh && fv)
andi t1, a7, 3
blt t0, a3, .l_\lable\()h_idx_fh_lsx
andi t1, a7, 1
addi.w t1, t1, 3
.l_\lable\()h_idx_fh_lsx:
addi.w t5, zero, 120
mul.w t1, t1, t5
addi.w t5, a5, -1
slli.w t5, t5, 3
add.w t1, t1, t5
add.d t1, t6, t1 //fh's offset
vldrepl.d vr23, t1, 0
addi.d a1, a1, -3
clz.w t1, a3
li.w t5, 24
sub.w t1, t1, t5
la.local t5, .l_\lable\()prep_h_jtable_lsx
alsl.d t1, t1, t5, 1
ld.h t8, t1, 0
add.d t5, t5, t8
jirl $r0, t5, 0
.align 3
.l_\lable\()prep_h_jtable_lsx:
.hword .l_\lable\()h_128w_lsx - .l_\lable\()prep_h_jtable_lsx
.hword .l_\lable\()h_64w_lsx - .l_\lable\()prep_h_jtable_lsx
.hword .l_\lable\()h_32w_lsx - .l_\lable\()prep_h_jtable_lsx
.hword .l_\lable\()h_16w_lsx - .l_\lable\()prep_h_jtable_lsx
.hword .l_\lable\()h_8w_lsx - .l_\lable\()prep_h_jtable_lsx
.hword .l_\lable\()h_4w_lsx - .l_\lable\()prep_h_jtable_lsx
.l_\lable\()h_4w_lsx:
addi.d a1, a1, 2
la.local t7, subpel_h_shuf1
vld vr7, t7, 0
vbsrl.v vr23, vr23, 2
vreplvei.w vr23, vr23, 0
.l_\lable\()h_4w_loop_lsx:
vld vr0, a1, 0
vldx vr1, a1, a2
add.d a1, a1, t2
vshuf.b vr0, vr0, vr0, vr7
vshuf.b vr1, vr1, vr1, vr7
vmulwev.h.bu.b vr2, vr0, vr23
vmulwev.h.bu.b vr3, vr1, vr23
vmaddwod.h.bu.b vr2, vr0, vr23
vmaddwod.h.bu.b vr3, vr1, vr23
vhaddw.w.h vr0, vr2, vr2
vhaddw.w.h vr1, vr3, vr3
vssrarni.h.w vr1, vr0, 2
vst vr1, a0, 0
addi.d a0, a0, 16
addi.w a4, a4, -2
bnez a4, .l_\lable\()h_4w_loop_lsx
b .l_\lable\()end_pre_8tap_lsx
.l_\lable\()h_8w_lsx:
vreplvei.w vr22, vr23, 0 //fh
vreplvei.w vr23, vr23, 1
la.local t7, subpel_h_shuf1
vld vr6, t7, 0
vaddi.bu vr7, vr6, 4
vaddi.bu vr8, vr6, 8
.l_\lable\()h_8w_loop_lsx:
vld vr0, a1, 0
vldx vr1, a1, a2
add.d a1, a1, t2
PREP_H_8W vr0
PREP_H_8W vr1
vst vr0, a0, 0
vst vr1, a0, 16
addi.d a0, a0, 32
addi.d a4, a4, -2
bnez a4, .l_\lable\()h_8w_loop_lsx
b .l_\lable\()end_pre_8tap_lsx
.l_\lable\()h_16w_lsx:
.l_\lable\()h_32w_lsx:
.l_\lable\()h_64w_lsx:
.l_\lable\()h_128w_lsx:
vreplvei.w vr22, vr23, 0 //fh
vreplvei.w vr23, vr23, 1
la.local t7, subpel_h_shuf1
vld vr6, t7, 0
vaddi.bu vr7, vr6, 4
vaddi.bu vr8, vr6, 8
srli.w t7, a3, 4
slli.w t6, t7, 5
.l_\lable\()h_16w_loop0_lsx:
addi.d t0, a1, 0 //src
addi.d t5, a4, 0 //h
addi.d t8, a0, 0 //dst
.l_\lable\()h_16w_loop_lsx:
vld vr0, a1, 0
vld vr1, a1, 8
add.d a1, a1, a2
PREP_H_8W vr0
PREP_H_8W vr1
vst vr0, a0, 0
vst vr1, a0, 16
add.d a0, a0, t6
addi.d t5, t5, -1
bnez t5, .l_\lable\()h_16w_loop_lsx
addi.d a1, t0, 16
addi.d a0, t8, 32
addi.w t7, t7, -1
bnez t7, .l_\lable\()h_16w_loop0_lsx
b .l_\lable\()end_pre_8tap_lsx
.l_\lable\()hv_lsx:
andi t1, a7, 3
blt t0, a3, .l_\lable\()hv_idx_fh_lsx
andi t1, a7, 1
addi.w t1, t1, 3
.l_\lable\()hv_idx_fh_lsx:
addi.w t5, zero, 120
mul.w t1, t1, t5
addi.w t5, a5, -1
slli.w t5, t5, 3
add.w t1, t1, t5
add.d t1, t6, t1 //fh's offset
vldrepl.d vr8, t1, 0
srli.w a7, a7, 2
blt t0, a4, .l_\lable\()hv_idx_fv_lsx
andi a7, a7, 1
addi.w a7, a7, 3
.l_\lable\()hv_idx_fv_lsx:
addi.w t5, zero, 120
mul.w a7, a7, t5
addi.w t5, a6, -1
slli.w t5, t5, 3
add.w a7, a7, t5
add.d a7, t6, a7 //fv's offset
vldrepl.d vr9, a7, 0
vsllwil.h.b vr9, vr9, 0
sub.d a1, a1, t3
addi.d a1, a1, -3
beq a3, t0, .l_\lable\()hv_4w_lsx
b .l_\lable\()hv_8w_lsx
.l_\lable\()hv_4w_lsx:
addi.d a1, a1, 2 //skip the 4-tap filter's two leading zero taps
vld vr0, a1, 0
vldx vr1, a1, a2
vldx vr2, a1, t2
add.d a1, a1, t3
vld vr3, a1, 0
vldx vr4, a1, a2
vldx vr5, a1, t2
vldx vr6, a1, t3
add.d a1, a1, t4
la.local t1, subpel_h_shuf1
vld vr7, t1, 0
vbsrl.v vr8, vr8, 2
vreplvei.w vr8, vr8, 0
//fv
vreplvei.w vr17, vr9, 0
vreplvei.w vr18, vr9, 1
vreplvei.w vr19, vr9, 2
vreplvei.w vr20, vr9, 3
//DAV1D_FILTER_8TAP_RND
vshuf.b vr0, vr0, vr0, vr7
vshuf.b vr1, vr1, vr1, vr7
vshuf.b vr2, vr2, vr2, vr7
vshuf.b vr3, vr3, vr3, vr7
vshuf.b vr4, vr4, vr4, vr7
vshuf.b vr5, vr5, vr5, vr7
vshuf.b vr6, vr6, vr6, vr7
vmulwev.h.bu.b vr10, vr0, vr8
vmulwev.h.bu.b vr11, vr1, vr8
vmulwev.h.bu.b vr12, vr2, vr8
vmulwev.h.bu.b vr13, vr3, vr8
vmulwev.h.bu.b vr14, vr4, vr8
vmulwev.h.bu.b vr15, vr5, vr8
vmulwev.h.bu.b vr16, vr6, vr8
vmaddwod.h.bu.b vr10, vr0, vr8
vmaddwod.h.bu.b vr11, vr1, vr8
vmaddwod.h.bu.b vr12, vr2, vr8
vmaddwod.h.bu.b vr13, vr3, vr8
vmaddwod.h.bu.b vr14, vr4, vr8
vmaddwod.h.bu.b vr15, vr5, vr8
vmaddwod.h.bu.b vr16, vr6, vr8
vhaddw.w.h vr10, vr10, vr10
vhaddw.w.h vr11, vr11, vr11
vhaddw.w.h vr12, vr12, vr12
vhaddw.w.h vr13, vr13, vr13
vhaddw.w.h vr14, vr14, vr14
vhaddw.w.h vr15, vr15, vr15
vhaddw.w.h vr16, vr16, vr16
vssrarni.h.w vr10, vr10, 2 //h0
vssrarni.h.w vr11, vr11, 2 //h1
vssrarni.h.w vr12, vr12, 2 //h2
vssrarni.h.w vr13, vr13, 2 //h3
vssrarni.h.w vr14, vr14, 2 //h4
vssrarni.h.w vr15, vr15, 2 //h5
vssrarni.h.w vr16, vr16, 2 //h6
//h0
vilvl.h vr0, vr11, vr10 //01
vilvl.h vr1, vr13, vr12 //23
vilvl.h vr2, vr15, vr14 //45
//h1
vilvl.h vr4, vr12, vr11 //12
vilvl.h vr5, vr14, vr13 //34
vilvl.h vr6, vr16, vr15 //56
.l_\lable\()hv_w4_loop_lsx:
vld vr9, a1, 0
vldx vr10, a1, a2
add.d a1, a1, t2
//DAV1D_FILTER_8TAP_CLIP
vshuf.b vr9, vr9, vr9, vr7
vshuf.b vr10, vr10, vr10, vr7
vmulwev.h.bu.b vr11, vr9, vr8
vmulwev.h.bu.b vr12, vr10, vr8
vmaddwod.h.bu.b vr11, vr9, vr8
vmaddwod.h.bu.b vr12, vr10, vr8
vhaddw.w.h vr11, vr11, vr11
vhaddw.w.h vr12, vr12, vr12
vssrarni.h.w vr11, vr11, 2 //h7
vssrarni.h.w vr12, vr12, 2 //h8
vilvl.h vr3, vr11, vr16 //67
vilvl.h vr13, vr12, vr11 //78
vmulwev.w.h vr9, vr0, vr17
vmulwev.w.h vr10, vr1, vr18
vmulwev.w.h vr14, vr2, vr19
vmulwev.w.h vr15, vr3, vr20
vmaddwod.w.h vr9, vr0, vr17
vmaddwod.w.h vr10, vr1, vr18
vmaddwod.w.h vr14, vr2, vr19
vmaddwod.w.h vr15, vr3, vr20
vadd.w vr16, vr9, vr10
vadd.w vr16, vr16, vr14
vadd.w vr16, vr16, vr15
vmulwev.w.h vr9, vr4, vr17
vmulwev.w.h vr10, vr5, vr18
vmulwev.w.h vr14, vr6, vr19
vmulwev.w.h vr15, vr13, vr20
vmaddwod.w.h vr9, vr4, vr17
vmaddwod.w.h vr10, vr5, vr18
vmaddwod.w.h vr14, vr6, vr19
vmaddwod.w.h vr15, vr13, vr20
vadd.w vr21, vr9, vr10
vadd.w vr21, vr21, vr14
vadd.w vr21, vr21, vr15
vssrarni.h.w vr21, vr16, 6
//cache
vaddi.hu vr0, vr1, 0
vaddi.hu vr1, vr2, 0
vaddi.hu vr2, vr3, 0
vaddi.hu vr4, vr5, 0
vaddi.hu vr5, vr6, 0
vaddi.hu vr6, vr13, 0
vaddi.hu vr16, vr12, 0
vst vr21, a0, 0
addi.d a0, a0, 16
addi.d a4, a4, -2
bnez a4, .l_\lable\()hv_w4_loop_lsx
b .l_\lable\()end_pre_8tap_lsx
.l_\lable\()hv_8w_lsx:
.l_\lable\()hv_16w_lsx:
.l_\lable\()hv_32w_lsx:
.l_\lable\()hv_64w_lsx:
.l_\lable\()hv_128w_lsx:
addi.d sp, sp, -8*8
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
fst.d f28, sp, 32
fst.d f29, sp, 40
fst.d f30, sp, 48
fst.d f31, sp, 56
addi.d t0, a1, 0 //src
addi.d t5, a4, 0 //h
addi.d t8, a0, 0 //dst
slli.w t6, a3, 1
la.local t1, subpel_h_shuf1
vld vr7, t1, 0
vaddi.bu vr11, vr7, 4
vaddi.bu vr12, vr7, 8
vreplvei.w vr10, vr8, 1
vreplvei.w vr8, vr8, 0
vreplvei.w vr20, vr9, 1
vreplvei.w vr21, vr9, 2
vreplvei.w vr22, vr9, 3
vreplvei.w vr9, vr9, 0
.l_\lable\()prep_hv_8w_loop0_lsx:
vld vr0, a1, 0
vldx vr1, a1, a2
vldx vr2, a1, t2
add.d a1, a1, t3
vld vr3, a1, 0
vldx vr4, a1, a2
vldx vr5, a1, t2
vldx vr6, a1, t3
add.d a1, a1, t4
FILTER_8TAP_8W vr0 //h0
FILTER_8TAP_8W vr1 //h1
FILTER_8TAP_8W vr2 //h2
FILTER_8TAP_8W vr3 //h3
FILTER_8TAP_8W vr4 //h4
FILTER_8TAP_8W vr5 //h5
FILTER_8TAP_8W vr6 //h6
//h0' low part
vilvl.h vr23, vr1, vr0 //01
vilvl.h vr24, vr3, vr2 //23
vilvl.h vr25, vr5, vr4 //45
//h0' high part
vilvh.h vr26, vr1, vr0 //01
vilvh.h vr27, vr3, vr2 //23
vilvh.h vr28, vr5, vr4 //45
//h1' low part
vilvl.h vr29, vr2, vr1 //12
vilvl.h vr30, vr4, vr3 //34
vilvl.h vr31, vr6, vr5 //56
//h1' high part
vilvh.h vr0, vr2, vr1 //12
vilvh.h vr1, vr4, vr3 //34
vilvh.h vr2, vr6, vr5 //56
.l_\lable\()prep_hv_8w_loop_lsx:
vld vr3, a1, 0
vldx vr4, a1, a2
add.d a1, a1, t2
FILTER_8TAP_8W vr3 //h7
FILTER_8TAP_8W vr4 //h8
//h0' low part
vilvl.h vr16, vr3, vr6 //67 ~low
vmulwev.w.h vr13, vr23, vr9
vmulwev.w.h vr14, vr24, vr20
vmulwev.w.h vr15, vr25, vr21
vmulwev.w.h vr17, vr16, vr22
vmaddwod.w.h vr13, vr23, vr9
vmaddwod.w.h vr14, vr24, vr20
vmaddwod.w.h vr15, vr25, vr21
vmaddwod.w.h vr17, vr16, vr22
vadd.w vr13, vr13, vr14
vadd.w vr13, vr13, vr15
vadd.w vr13, vr13, vr17
//cache
vaddi.hu vr23, vr24, 0
vaddi.hu vr24, vr25, 0
vaddi.hu vr25, vr16, 0
//h0' high part
vilvh.h vr17, vr3, vr6 //67 ~high
vmulwev.w.h vr14, vr26, vr9
vmulwev.w.h vr15, vr27, vr20
vmulwev.w.h vr16, vr28, vr21
vmulwev.w.h vr18, vr17, vr22
vmaddwod.w.h vr14, vr26, vr9
vmaddwod.w.h vr15, vr27, vr20
vmaddwod.w.h vr16, vr28, vr21
vmaddwod.w.h vr18, vr17, vr22
vadd.w vr14, vr14, vr15
vadd.w vr14, vr14, vr16
vadd.w vr14, vr14, vr18
vssrarni.h.w vr14, vr13, 6
vst vr14, a0, 0
add.d a0, a0, t6
//cache
vaddi.hu vr26, vr27, 0
vaddi.hu vr27, vr28, 0
vaddi.hu vr28, vr17, 0
vaddi.hu vr6, vr4, 0
vilvl.h vr5, vr4, vr3 //78 ~low
vilvh.h vr4, vr4, vr3 //78 ~high
//h1' low part
vmulwev.w.h vr13, vr29, vr9
vmulwev.w.h vr14, vr30, vr20
vmulwev.w.h vr15, vr31, vr21
vmulwev.w.h vr16, vr5, vr22
vmaddwod.w.h vr13, vr29, vr9
vmaddwod.w.h vr14, vr30, vr20
vmaddwod.w.h vr15, vr31, vr21
vmaddwod.w.h vr16, vr5, vr22
vadd.w vr13, vr13, vr14
vadd.w vr13, vr13, vr15
vadd.w vr13, vr13, vr16
//cache
vaddi.hu vr29, vr30, 0
vaddi.hu vr30, vr31, 0
vaddi.hu vr31, vr5, 0
//h1' high part
vmulwev.w.h vr14, vr0, vr9
vmulwev.w.h vr15, vr1, vr20
vmulwev.w.h vr16, vr2, vr21
vmulwev.w.h vr17, vr4, vr22
vmaddwod.w.h vr14, vr0, vr9
vmaddwod.w.h vr15, vr1, vr20
vmaddwod.w.h vr16, vr2, vr21
vmaddwod.w.h vr17, vr4, vr22
vadd.w vr14, vr14, vr15
vadd.w vr14, vr14, vr16
vadd.w vr14, vr14, vr17
vssrarni.h.w vr14, vr13, 6
vst vr14, a0, 0
add.d a0, a0, t6
//cache
vaddi.hu vr0, vr1, 0
vaddi.hu vr1, vr2, 0
vaddi.hu vr2, vr4, 0
addi.w a4, a4, -2
bnez a4, .l_\lable\()prep_hv_8w_loop_lsx
addi.d a1, t0, 8
addi.d t0, t0, 8
addi.d a0, t8, 16
addi.d t8, t8, 16
addi.d a4, t5, 0
addi.w a3, a3, -8
bnez a3, .l_\lable\()prep_hv_8w_loop0_lsx
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
fld.d f28, sp, 32
fld.d f29, sp, 40
fld.d f30, sp, 48
fld.d f31, sp, 56
addi.d sp, sp, 8*8
b .l_\lable\()end_pre_8tap_lsx
.l_\lable\()v_lsx:
srli.w a7, a7, 2
blt t0, a4, .l_\lable\()v_idx_fv_lsx
andi a7, a7, 1
addi.w a7, a7, 3
.l_\lable\()v_idx_fv_lsx:
addi.w t5, zero, 120
mul.w a7, a7, t5
addi.w t5, a6, -1
slli.w t5, t5, 3
add.w a7, a7, t5
add.d a7, t6, a7 //fv's offset
vldrepl.d vr8, a7, 0
vilvl.h vr8, vr8, vr8
vreplvei.w vr9, vr8, 1
vreplvei.w vr10, vr8, 2
vreplvei.w vr11, vr8, 3
vreplvei.w vr8, vr8, 0
sub.d a1, a1, t3
beq a3, t0, .l_\lable\()v_4w_lsx
blt t0, a3, .l_\lable\()v_8w_lsx
.l_\lable\()v_4w_lsx:
fld.s f0, a1, 0
fldx.s f1, a1, a2
fldx.s f2, a1, t2
add.d a1, a1, t3
fld.s f3, a1, 0
fldx.s f4, a1, a2
fldx.s f5, a1, t2
fldx.s f6, a1, t3
add.d a1, a1, t4
vilvl.w vr0, vr1, vr0
vilvl.w vr1, vr2, vr1
vilvl.b vr0, vr1, vr0 //0 1 1 2
vilvl.w vr1, vr3, vr2
vilvl.w vr2, vr4, vr3
vilvl.b vr1, vr2, vr1 //2 3 3 4
vilvl.w vr2, vr5, vr4
vilvl.w vr3, vr6, vr5
vilvl.b vr2, vr3, vr2 //4 5 5 6
.l_\lable\()v_4w_loop_lsx:
fld.s f7, a1, 0
vilvl.w vr3, vr7, vr6
fldx.s f6, a1, a2
add.d a1, a1, t2
vilvl.w vr4, vr6, vr7
vilvl.b vr3, vr4, vr3 //6 7 7 8
vmulwev.h.bu.b vr12, vr0, vr8
vmulwev.h.bu.b vr13, vr1, vr9
vmulwev.h.bu.b vr14, vr2, vr10
vmulwev.h.bu.b vr15, vr3, vr11
vmaddwod.h.bu.b vr12, vr0, vr8
vmaddwod.h.bu.b vr13, vr1, vr9
vmaddwod.h.bu.b vr14, vr2, vr10
vmaddwod.h.bu.b vr15, vr3, vr11
vaddi.hu vr0, vr1, 0
vaddi.hu vr1, vr2, 0
vaddi.hu vr2, vr3, 0
vadd.h vr12, vr12, vr13
vadd.h vr12, vr12, vr14
vadd.h vr12, vr12, vr15
vsrari.h vr12, vr12, 2
vst vr12, a0, 0
addi.d a0, a0, 16
addi.w a4, a4, -2
bnez a4, .l_\lable\()v_4w_loop_lsx
b .l_\lable\()end_pre_8tap_lsx
.l_\lable\()v_8w_lsx:
addi.d t0, a1, 0
addi.d t5, a4, 0
addi.d t8, a0, 0
slli.w t6, a3, 1
.l_\lable\()v_8w_loop0_lsx:
fld.d f0, a1, 0
fldx.d f1, a1, a2
fldx.d f2, a1, t2
add.d a1, a1, t3
fld.d f3, a1, 0
fldx.d f4, a1, a2
fldx.d f5, a1, t2
fldx.d f6, a1, t3
add.d a1, a1, t4
vilvl.b vr0, vr1, vr0 //0 1
vilvl.b vr1, vr2, vr1 //1 2
vilvl.b vr2, vr3, vr2 //2 3
vilvl.b vr3, vr4, vr3 //3 4
vilvl.b vr4, vr5, vr4 //4 5
vilvl.b vr5, vr6, vr5 //5 6
.l_\lable\()v_8w_loop_lsx:
fld.d f7, a1, 0
vilvl.b vr12, vr7, vr6 //6 7
fldx.d f6, a1, a2
add.d a1, a1, t2
vilvl.b vr13, vr6, vr7 //7 8
vmulwev.h.bu.b vr14, vr0, vr8
vmulwev.h.bu.b vr15, vr1, vr8
vmulwev.h.bu.b vr16, vr2, vr9
vmulwev.h.bu.b vr17, vr3, vr9
vmulwev.h.bu.b vr18, vr4, vr10
vmulwev.h.bu.b vr19, vr5, vr10
vmulwev.h.bu.b vr20, vr12, vr11
vmulwev.h.bu.b vr21, vr13, vr11
vmaddwod.h.bu.b vr14, vr0, vr8
vmaddwod.h.bu.b vr15, vr1, vr8
vmaddwod.h.bu.b vr16, vr2, vr9
vmaddwod.h.bu.b vr17, vr3, vr9
vmaddwod.h.bu.b vr18, vr4, vr10
vmaddwod.h.bu.b vr19, vr5, vr10
vmaddwod.h.bu.b vr20, vr12, vr11
vmaddwod.h.bu.b vr21, vr13, vr11
vaddi.hu vr0, vr2, 0
vaddi.hu vr1, vr3, 0
vaddi.hu vr2, vr4, 0
vaddi.hu vr3, vr5, 0
vaddi.hu vr4, vr12, 0
vaddi.hu vr5, vr13, 0
vadd.h vr14, vr14, vr16
vadd.h vr14, vr14, vr18
vadd.h vr14, vr14, vr20
vadd.h vr15, vr15, vr17
vadd.h vr15, vr15, vr19
vadd.h vr15, vr15, vr21
vsrari.h vr14, vr14, 2
vsrari.h vr15, vr15, 2
vst vr14, a0, 0
add.d a0, a0, t6
vst vr15, a0, 0
add.d a0, a0, t6
addi.w a4, a4, -2
bnez a4, .l_\lable\()v_8w_loop_lsx
addi.d a1, t0, 8
addi.d t0, t0, 8
addi.d a0, t8, 16
addi.d t8, t8, 16
addi.d a4, t5, 0
addi.d a3, a3, -8
bnez a3, .l_\lable\()v_8w_loop0_lsx
.l_\lable\()end_pre_8tap_lsx:
.endm
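/*
* Each wrapper below seeds a7 with the combined filter-type code consumed by
* PREP_8TAP_8BPC_LSX: the low two bits appear to select the horizontal filter
* and bits [3:2] the vertical one (regular = 0, smooth = 1, sharp = 2).
*/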
function prep_8tap_regular_8bpc_lsx
addi.w a7, zero, 0
PREP_8TAP_8BPC_LSX 0
endfunc
function prep_8tap_smooth_regular_8bpc_lsx
addi.w a7, zero, 1
PREP_8TAP_8BPC_LSX 1
endfunc
function prep_8tap_sharp_regular_8bpc_lsx
addi.w a7, zero, 2
PREP_8TAP_8BPC_LSX 2
endfunc
function prep_8tap_regular_smooth_8bpc_lsx
addi.w a7, zero, 4
PREP_8TAP_8BPC_LSX 4
endfunc
function prep_8tap_smooth_8bpc_lsx
addi.w a7, zero, 5
PREP_8TAP_8BPC_LSX 5
endfunc
function prep_8tap_sharp_smooth_8bpc_lsx
addi.w a7, zero, 6
PREP_8TAP_8BPC_LSX 6
endfunc
function prep_8tap_regular_sharp_8bpc_lsx
addi.w a7, zero, 8
PREP_8TAP_8BPC_LSX 8
endfunc
function prep_8tap_smooth_sharp_8bpc_lsx
addi.w a7, zero, 9
PREP_8TAP_8BPC_LSX 9
endfunc
function prep_8tap_sharp_8bpc_lsx
addi.w a7, zero, 10
PREP_8TAP_8BPC_LSX 10
endfunc
/*
* static void blend_lsx(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
const int w, int h, const uint8_t *mask)
*/
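/*
* Rough scalar sketch of what the vector code below computes per pixel
* (illustrative only, assuming 8-bit pixels and <stdint.h>/<stddef.h>; not
* taken verbatim from the C reference):
*
*   // vr23 holds 64; tmp*m is accumulated with dst*(64 - m), then
*   // vssrarni.bu.h performs the rounding shift by 6 with saturation.
*   static void blend_ref(uint8_t *dst, const ptrdiff_t dst_stride,
*                         const uint8_t *tmp, const int w, int h,
*                         const uint8_t *mask)
*   {
*       do {
*           for (int x = 0; x < w; x++)
*               dst[x] = (dst[x] * (64 - mask[x]) + tmp[x] * mask[x] + 32) >> 6;
*           dst += dst_stride;
*           tmp += w;
*           mask += w;
*       } while (--h);
*   }
*/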
function blend_8bpc_lsx
addi.d t8, zero, 64
vreplgr2vr.b vr23, t8
clz.w t0, a3
li.w t1, 26
sub.w t0, t0, t1 // clz(w) - 26 maps w = 32/16/8/4 to jump-table index 0..3
la.local t1, .BLEND_LSX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t2, t0, 0 // The jump addresses are relative to JRTABLE
add.d t1, t1, t2 // Get absolute address
jirl $r0, t1, 0
.align 3
.BLEND_LSX_JRTABLE:
.hword .BLEND_W32_LSX - .BLEND_LSX_JRTABLE
.hword .BLEND_W16_LSX - .BLEND_LSX_JRTABLE
.hword .BLEND_W8_LSX - .BLEND_LSX_JRTABLE
.hword .BLEND_W4_LSX - .BLEND_LSX_JRTABLE
.BLEND_W4_LSX:
vld vr0, a0, 0
vld vr1, a2, 0
vld vr2, a5, 0
vsllwil.hu.bu vr1, vr1, 0
vsllwil.hu.bu vr4, vr2, 0
vmul.h vr1, vr1, vr4 //b*m
vsub.b vr3, vr23, vr2
vsllwil.hu.bu vr0, vr0, 0
vsllwil.hu.bu vr3, vr3, 0
vmadd.h vr1, vr0, vr3
vssrarni.bu.h vr1, vr1, 6
vstelm.w vr1, a0, 0, 0
addi.w a4, a4, -1
add.d a0, a0, a1
addi.d a2, a2, 4
addi.d a5, a5, 4
blt zero, a4, .BLEND_W4_LSX
b .BLEND_END_LSX
.BLEND_W8_LSX:
vld vr0, a0, 0
vld vr1, a2, 0
vld vr2, a5, 0
vsllwil.hu.bu vr1, vr1, 0
vsllwil.hu.bu vr4, vr2, 0
vmul.h vr1, vr1, vr4 //b*m
vsub.b vr3, vr23, vr2
vsllwil.hu.bu vr0, vr0, 0
vsllwil.hu.bu vr3, vr3, 0
vmadd.h vr1, vr0, vr3
vssrarni.bu.h vr1, vr1, 6
vstelm.d vr1, a0, 0, 0
addi.w a4, a4, -1
add.d a0, a0, a1
addi.d a2, a2, 8
addi.d a5, a5, 8
blt zero, a4, .BLEND_W8_LSX
b .BLEND_END_LSX
.BLEND_W16_LSX:
vld vr0, a0, 0
vld vr1, a2, 0
vld vr2, a5, 0
vexth.hu.bu vr5, vr1
vsllwil.hu.bu vr1, vr1, 0
vexth.hu.bu vr6, vr2
vsllwil.hu.bu vr4, vr2, 0
vmul.h vr1, vr1, vr4 //b*m
vmul.h vr5, vr5, vr6 //b*m
vsub.b vr3, vr23, vr2
vexth.hu.bu vr7, vr0
vexth.hu.bu vr8, vr3
vmadd.h vr5, vr7, vr8
vsllwil.hu.bu vr0, vr0, 0
vsllwil.hu.bu vr3, vr3, 0
vmadd.h vr1, vr0, vr3
vssrarni.bu.h vr5, vr1, 6
vst vr5, a0, 0
addi.w a4, a4, -1
add.d a0, a0, a1
addi.d a2, a2, 16
addi.d a5, a5, 16
blt zero, a4, .BLEND_W16_LSX
b .BLEND_END_LSX
.BLEND_W32_LSX:
vld vr0, a0, 0
vld vr1, a2, 0
vld vr2, a5, 0
vexth.hu.bu vr5, vr1
vsllwil.hu.bu vr1, vr1, 0
vexth.hu.bu vr6, vr2
vsllwil.hu.bu vr4, vr2, 0
vmul.h vr1, vr1, vr4 //b*m
vmul.h vr5, vr5, vr6 //b*m
vsub.b vr3, vr23, vr2
vexth.hu.bu vr7, vr0
vexth.hu.bu vr8, vr3
vmadd.h vr5, vr7, vr8
vsllwil.hu.bu vr0, vr0, 0
vsllwil.hu.bu vr3, vr3, 0
vmadd.h vr1, vr0, vr3
vssrarni.bu.h vr5, vr1, 6
vst vr5, a0, 0
/* second 16 bytes */
vld vr0, a0, 16
vld vr1, a2, 16
vld vr2, a5, 16
vexth.hu.bu vr5, vr1
vsllwil.hu.bu vr1, vr1, 0
vexth.hu.bu vr6, vr2
vsllwil.hu.bu vr4, vr2, 0
vmul.h vr1, vr1, vr4 //b*m
vmul.h vr5, vr5, vr6 //b*m
vsub.b vr3, vr23, vr2
vexth.hu.bu vr7, vr0
vexth.hu.bu vr8, vr3
vmadd.h vr5, vr7, vr8
vsllwil.hu.bu vr0, vr0, 0
vsllwil.hu.bu vr3, vr3, 0
vmadd.h vr1, vr0, vr3
vssrarni.bu.h vr5, vr1, 6
vst vr5, a0, 16
addi.w a4, a4, -1
add.d a0, a0, a1
addi.d a2, a2, 32
addi.d a5, a5, 32
blt zero, a4, .BLEND_W32_LSX
.BLEND_END_LSX:
endfunc
const obmc_masks_la
/* Unused */
.byte 0, 0, 0, 0
/* 2 */
.byte 45, 19, 64, 0
/* 4 */
.byte 39, 25, 50, 14, 59, 5, 64, 0
/* 8 */
.byte 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
/* 16 */
.byte 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
.byte 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
/* 32 */
.byte 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
.byte 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
.byte 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
endconst
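/*
* Each byte pair above is (64 - m, m), i.e. (dst weight, tmp weight) summing
* to 64, stored pre-interleaved so that a single vdp2.h.bu against interleaved
* (dst, tmp) bytes yields dst*(64 - m) + tmp*m directly.
*/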
/*
* static void blend_v_lsx(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
const int w, int h)
*/
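/*
* Scalar sketch (illustrative only, assuming <stdint.h>/<stddef.h>): only the
* left (w * 3) >> 2 columns are blended, with per-column weight pairs taken
* from the obmc_masks_la table above; the remaining columns of dst are left
* untouched.
*
*   static void blend_v_ref(uint8_t *dst, const ptrdiff_t dst_stride,
*                           const uint8_t *tmp, const int w, int h)
*   {
*       const uint8_t *masks = &obmc_masks_la[2 * w]; // (dst, tmp) weight pairs
*       do {
*           for (int x = 0; x < (w * 3) >> 2; x++)
*               dst[x] = (dst[x] * masks[2 * x] + tmp[x] * masks[2 * x + 1] + 32) >> 6;
*           dst += dst_stride;
*           tmp += w;
*       } while (--h);
*   }
*/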
function blend_v_8bpc_lsx
la.local t8, obmc_masks_la
clz.w t0, a3
li.w t1, 26
sub.w t0, t0, t1
la.local t1, .BLEND_V_LSX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t2, t0, 0 // The jump addresses are relative to JRTABLE
add.d t1, t1, t2 // Get absolute address
jirl $r0, t1, 0
.align 3
.BLEND_V_LSX_JRTABLE:
.hword .BLEND_V_W32_LSX - .BLEND_V_LSX_JRTABLE
.hword .BLEND_V_W16_LSX - .BLEND_V_LSX_JRTABLE
.hword .BLEND_V_W8_LSX - .BLEND_V_LSX_JRTABLE
.hword .BLEND_V_W4_LSX - .BLEND_V_LSX_JRTABLE
.hword .BLEND_V_W2_LSX - .BLEND_V_LSX_JRTABLE
.hword .BLEND_V_W2_LSX_1 - .BLEND_V_LSX_JRTABLE // padding entry: keeps the following instructions 4-byte aligned
.BLEND_V_W2_LSX:
ld.bu t6, t8, 4
ld.bu t7, t8, 5
.BLEND_V_W2_LSX_1:
ld.bu t0, a0, 0
ld.bu t1, a2, 0
mul.d t0, t0, t6
mul.d t1, t1, t7
addi.d t0, t0, 32
add.d t0, t0, t1
srli.d t0, t0, 6
st.b t0, a0, 0
addi.w a4, a4, -1
add.d a0, a0, a1
addi.d a2, a2, 2
addi.d a5, a5, 2
blt zero, a4, .BLEND_V_W2_LSX_1
b .BLEND_V_END_LSX
.BLEND_V_W4_LSX:
vld vr20, t8, 8
.BLEND_V_W4_LSX_1:
vld vr0, a0, 0
vld vr1, a2, 0
vilvl.b vr0, vr1, vr0
vdp2.h.bu vr1, vr0, vr20
vssrarni.bu.h vr1, vr1, 6
vstelm.h vr1, a0, 0, 0
vstelm.b vr1, a0, 2, 2
addi.w a4, a4, -1
add.d a0, a0, a1
addi.d a2, a2, 4
blt zero, a4, .BLEND_V_W4_LSX_1
b .BLEND_V_END_LSX
.BLEND_V_W8_LSX:
vld vr20, t8, 16
.BLEND_V_W8_LSX_1:
vld vr0, a0, 0
vld vr1, a2, 0
vilvl.b vr0, vr1, vr0
vdp2.h.bu vr1, vr0, vr20
vssrarni.bu.h vr1, vr1, 6
vstelm.w vr1, a0, 0, 0
vstelm.h vr1, a0, 4, 2
addi.w a4, a4, -1
add.d a0, a0, a1
addi.d a2, a2, 8
blt zero, a4, .BLEND_V_W8_LSX_1
b .BLEND_V_END_LSX
.BLEND_V_W16_LSX:
vld vr20, t8, 32
vld vr21, t8, 48
.BLEND_V_W16_LSX_1:
vld vr0, a0, 0
vld vr1, a2, 0
vilvl.b vr2, vr1, vr0
vilvh.b vr3, vr1, vr0
vmulwev.h.bu vr4, vr2, vr20
vmulwev.h.bu vr5, vr3, vr21
vmaddwod.h.bu vr4, vr2, vr20
vmaddwod.h.bu vr5, vr3, vr21
vssrarni.bu.h vr5, vr4, 6
vstelm.d vr5, a0, 0, 0
vstelm.w vr5, a0, 8, 2
addi.w a4, a4, -1
add.d a0, a0, a1
addi.d a2, a2, 16
blt zero, a4, .BLEND_V_W16_LSX_1
b .BLEND_V_END_LSX
.BLEND_V_W32_LSX:
vld vr20, t8, 64
vld vr21, t8, 80
vld vr22, t8, 96
.BLEND_V_W32_LSX_1:
vld vr0, a0, 0
vld vr1, a0, 16
vld vr2, a2, 0
vld vr3, a2, 16
vilvl.b vr4, vr2, vr0
vmulwev.h.bu vr7, vr4, vr20
vilvh.b vr5, vr2, vr0
vmulwev.h.bu vr8, vr5, vr21
vilvl.b vr6, vr3, vr1
vmulwev.h.bu vr9, vr6, vr22
vmaddwod.h.bu vr7, vr4, vr20
vmaddwod.h.bu vr8, vr5, vr21
vmaddwod.h.bu vr9, vr6, vr22
vssrarni.bu.h vr8, vr7, 6
vssrarni.bu.h vr9, vr9, 6
vst vr8, a0, 0
vstelm.d vr9, a0, 16, 0
addi.w a4, a4, -1
add.d a0, a0, a1
addi.d a2, a2, 32
blt zero, a4, .BLEND_V_W32_LSX_1
.BLEND_V_END_LSX:
endfunc
/*
* static void blend_h_lsx(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
const int w, int h)
*/
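/*
* Scalar sketch (illustrative only): the full width is blended, but only the
* top (h * 3) >> 2 rows, with one (dst, tmp) weight pair per row taken from
* the obmc_masks_la table above.
*
*   static void blend_h_ref(uint8_t *dst, const ptrdiff_t dst_stride,
*                           const uint8_t *tmp, const int w, const int h)
*   {
*       const uint8_t *masks = &obmc_masks_la[2 * h];
*       for (int y = 0; y < (h * 3) >> 2; y++, masks += 2) {
*           for (int x = 0; x < w; x++)
*               dst[x] = (dst[x] * masks[0] + tmp[x] * masks[1] + 32) >> 6;
*           dst += dst_stride;
*           tmp += w;
*       }
*   }
*/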
function blend_h_8bpc_lsx
la.local t8, obmc_masks_la
alsl.d t8, a4, t8, 1
srli.d t0, a4, 1
srli.d t1, a4, 2
add.d a4, t0, t1 // h = (h * 3) >> 2;
slli.d a4, a4, 1
add.d a4, a4, t8
clz.w t0, a3
li.w t1, 24
sub.w t0, t0, t1
la.local t1, .BLEND_H_LSX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t2, t0, 0 // The jump addresses are relative to JRTABLE
add.d t1, t1, t2 // Get absolute address
jirl $r0, t1, 0
.align 3
.BLEND_H_LSX_JRTABLE:
.hword .BLEND_H_W128_LSX - .BLEND_H_LSX_JRTABLE
.hword .BLEND_H_W64_LSX - .BLEND_H_LSX_JRTABLE
.hword .BLEND_H_W32_LSX - .BLEND_H_LSX_JRTABLE
.hword .BLEND_H_W16_LSX - .BLEND_H_LSX_JRTABLE
.hword .BLEND_H_W8_LSX - .BLEND_H_LSX_JRTABLE
.hword .BLEND_H_W4_LSX - .BLEND_H_LSX_JRTABLE
.hword .BLEND_H_W2_LSX - .BLEND_H_LSX_JRTABLE
.hword .BLEND_H_END_LSX - .BLEND_H_LSX_JRTABLE // padding entry: keeps the following instructions 4-byte aligned
.BLEND_H_W2_LSX:
vldrepl.h vr20, t8, 0
vld vr0, a0, 0
vld vr1, a2, 0
vilvl.b vr0, vr1, vr0
vdp2.h.bu vr1, vr0, vr20
vssrarni.bu.h vr1, vr1, 6
vstelm.h vr1, a0, 0, 0
addi.d t8, t8, 2
add.d a0, a0, a1
addi.d a2, a2, 2
blt t8, a4, .BLEND_H_W2_LSX
b .BLEND_H_END_LSX
.BLEND_H_W4_LSX:
vldrepl.h vr20, t8, 0
vld vr0, a0, 0
vld vr1, a2, 0
vilvl.b vr0, vr1, vr0
vdp2.h.bu vr1, vr0, vr20
vssrarni.bu.h vr1, vr1, 6
vstelm.w vr1, a0, 0, 0
addi.d t8, t8, 2
add.d a0, a0, a1
addi.d a2, a2, 4
blt t8, a4, .BLEND_H_W4_LSX
b .BLEND_H_END_LSX
.BLEND_H_W8_LSX:
vldrepl.h vr20, t8, 0
vld vr0, a0, 0
vld vr1, a2, 0
vilvl.b vr0, vr1, vr0
vdp2.h.bu vr1, vr0, vr20
vssrarni.bu.h vr1, vr1, 6
vstelm.d vr1, a0, 0, 0
addi.d t8, t8, 2
add.d a0, a0, a1
addi.d a2, a2, 8
blt t8, a4, .BLEND_H_W8_LSX
b .BLEND_H_END_LSX
.BLEND_H_W16_LSX:
vldrepl.h vr20, t8, 0
vld vr0, a0, 0
vld vr1, a2, 0
vilvl.b vr2, vr1, vr0
vilvh.b vr3, vr1, vr0
vmulwev.h.bu vr4, vr2, vr20
vmulwev.h.bu vr5, vr3, vr20
vmaddwod.h.bu vr4, vr2, vr20
vmaddwod.h.bu vr5, vr3, vr20
vssrarni.bu.h vr5, vr4, 6
vst vr5, a0, 0
addi.d t8, t8, 2
add.d a0, a0, a1
addi.d a2, a2, 16
blt t8, a4, .BLEND_H_W16_LSX
b .BLEND_H_END_LSX
.BLEND_H_W32_LSX:
vldrepl.h vr20, t8, 0
vld vr0, a0, 0
vld vr1, a0, 16
vld vr2, a2, 0
vld vr3, a2, 16
vilvl.b vr4, vr2, vr0
vilvh.b vr5, vr2, vr0
vilvl.b vr6, vr3, vr1
vilvh.b vr3, vr3, vr1
vmulwev.h.bu vr7, vr4, vr20
vmulwev.h.bu vr8, vr5, vr20
vmulwev.h.bu vr9, vr6, vr20
vmulwev.h.bu vr0, vr3, vr20
vmaddwod.h.bu vr7, vr4, vr20
vmaddwod.h.bu vr8, vr5, vr20
vmaddwod.h.bu vr9, vr6, vr20
vmaddwod.h.bu vr0, vr3, vr20
vssrarni.bu.h vr8, vr7, 6
vssrarni.bu.h vr0, vr9, 6
vst vr8, a0, 0
vst vr0, a0, 16
addi.d t8, t8, 2
add.d a0, a0, a1
addi.d a2, a2, 32
blt t8, a4, .BLEND_H_W32_LSX
b .BLEND_H_END_LSX
.BLEND_H_W64_LSX:
vldrepl.h vr20, t8, 0
vld vr0, a0, 0
vld vr1, a0, 16
vld vr2, a0, 32
vld vr3, a0, 48
vld vr4, a2, 0
vld vr5, a2, 16
vld vr6, a2, 32
vld vr7, a2, 48
vilvl.b vr8, vr4, vr0
vilvh.b vr9, vr4, vr0
vilvl.b vr10, vr5, vr1
vilvh.b vr11, vr5, vr1
vilvl.b vr12, vr6, vr2
vilvh.b vr13, vr6, vr2
vilvl.b vr14, vr7, vr3
vilvh.b vr15, vr7, vr3
vmulwev.h.bu vr0, vr8, vr20
vmulwev.h.bu vr1, vr9, vr20
vmulwev.h.bu vr2, vr10, vr20
vmulwev.h.bu vr3, vr11, vr20
vmulwev.h.bu vr4, vr12, vr20
vmulwev.h.bu vr5, vr13, vr20
vmulwev.h.bu vr6, vr14, vr20
vmulwev.h.bu vr7, vr15, vr20
vmaddwod.h.bu vr0, vr8, vr20
vmaddwod.h.bu vr1, vr9, vr20
vmaddwod.h.bu vr2, vr10, vr20
vmaddwod.h.bu vr3, vr11, vr20
vmaddwod.h.bu vr4, vr12, vr20
vmaddwod.h.bu vr5, vr13, vr20
vmaddwod.h.bu vr6, vr14, vr20
vmaddwod.h.bu vr7, vr15, vr20
vssrarni.bu.h vr1, vr0, 6
vssrarni.bu.h vr3, vr2, 6
vssrarni.bu.h vr5, vr4, 6
vssrarni.bu.h vr7, vr6, 6
vst vr1, a0, 0
vst vr3, a0, 16
vst vr5, a0, 32
vst vr7, a0, 48
addi.d t8, t8, 2
add.d a0, a0, a1
addi.d a2, a2, 64
blt t8, a4, .BLEND_H_W64_LSX
b .BLEND_H_END_LSX
.BLEND_H_W128_LSX:
vldrepl.h vr20, t8, 0
vld vr0, a0, 0
vld vr1, a0, 16
vld vr2, a0, 32
vld vr3, a0, 48
vld vr4, a2, 0
vld vr5, a2, 16
vld vr6, a2, 32
vld vr7, a2, 48
vilvl.b vr8, vr4, vr0
vilvh.b vr9, vr4, vr0
vilvl.b vr10, vr5, vr1
vilvh.b vr11, vr5, vr1
vilvl.b vr12, vr6, vr2
vilvh.b vr13, vr6, vr2
vilvl.b vr14, vr7, vr3
vilvh.b vr15, vr7, vr3
vmulwev.h.bu vr0, vr8, vr20
vmulwev.h.bu vr1, vr9, vr20
vmulwev.h.bu vr2, vr10, vr20
vmulwev.h.bu vr3, vr11, vr20
vmulwev.h.bu vr4, vr12, vr20
vmulwev.h.bu vr5, vr13, vr20
vmulwev.h.bu vr6, vr14, vr20
vmulwev.h.bu vr7, vr15, vr20
vmaddwod.h.bu vr0, vr8, vr20
vmaddwod.h.bu vr1, vr9, vr20
vmaddwod.h.bu vr2, vr10, vr20
vmaddwod.h.bu vr3, vr11, vr20
vmaddwod.h.bu vr4, vr12, vr20
vmaddwod.h.bu vr5, vr13, vr20
vmaddwod.h.bu vr6, vr14, vr20
vmaddwod.h.bu vr7, vr15, vr20
vssrarni.bu.h vr1, vr0, 6
vssrarni.bu.h vr3, vr2, 6
vssrarni.bu.h vr5, vr4, 6
vssrarni.bu.h vr7, vr6, 6
vst vr1, a0, 0
vst vr3, a0, 16
vst vr5, a0, 32
vst vr7, a0, 48
/* second */
vld vr0, a0, 64
vld vr1, a0, 80
vld vr2, a0, 96
vld vr3, a0, 112
vld vr4, a2, 64
vld vr5, a2, 80
vld vr6, a2, 96
vld vr7, a2, 112
vilvl.b vr8, vr4, vr0
vilvh.b vr9, vr4, vr0
vilvl.b vr10, vr5, vr1
vilvh.b vr11, vr5, vr1
vilvl.b vr12, vr6, vr2
vilvh.b vr13, vr6, vr2
vilvl.b vr14, vr7, vr3
vilvh.b vr15, vr7, vr3
vmulwev.h.bu vr0, vr8, vr20
vmulwev.h.bu vr1, vr9, vr20
vmulwev.h.bu vr2, vr10, vr20
vmulwev.h.bu vr3, vr11, vr20
vmulwev.h.bu vr4, vr12, vr20
vmulwev.h.bu vr5, vr13, vr20
vmulwev.h.bu vr6, vr14, vr20
vmulwev.h.bu vr7, vr15, vr20
vmaddwod.h.bu vr0, vr8, vr20
vmaddwod.h.bu vr1, vr9, vr20
vmaddwod.h.bu vr2, vr10, vr20
vmaddwod.h.bu vr3, vr11, vr20
vmaddwod.h.bu vr4, vr12, vr20
vmaddwod.h.bu vr5, vr13, vr20
vmaddwod.h.bu vr6, vr14, vr20
vmaddwod.h.bu vr7, vr15, vr20
vssrarni.bu.h vr1, vr0, 6
vssrarni.bu.h vr3, vr2, 6
vssrarni.bu.h vr5, vr4, 6
vssrarni.bu.h vr7, vr6, 6
vst vr1, a0, 64
vst vr3, a0, 80
vst vr5, a0, 96
vst vr7, a0, 112
addi.d t8, t8, 2
add.d a0, a0, a1
addi.d a2, a2, 128
blt t8, a4, .BLEND_H_W128_LSX
b .BLEND_H_END_LSX
.BLEND_H_END_LSX:
endfunc
/*
* static void blend_h_lasx(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
const int w, int h)
*/
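/*
* Same operation as blend_h_8bpc_lsx above; widths of 32 and up switch to
* 256-bit LASX (xr) registers, while the narrower cases keep the 128-bit LSX
* instruction sequences.
*/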
function blend_h_8bpc_lasx
la.local t8, obmc_masks_la
alsl.d t8, a4, t8, 1
srli.d t0, a4, 1
srli.d t1, a4, 2
add.d a4, t0, t1 // h = (h * 3) >> 2;
slli.d a4, a4, 1
add.d a4, a4, t8
clz.w t0, a3
li.w t1, 24
sub.w t0, t0, t1
la.local t1, .BLEND_H_LASX_JRTABLE
alsl.d t0, t0, t1, 1
ld.h t2, t0, 0 // The jump addresses are relative to JRTABLE
add.d t1, t1, t2 // Get absolute address
jirl $r0, t1, 0
.align 3
.BLEND_H_LASX_JRTABLE:
.hword .BLEND_H_W128_LASX - .BLEND_H_LASX_JRTABLE
.hword .BLEND_H_W64_LASX - .BLEND_H_LASX_JRTABLE
.hword .BLEND_H_W32_LASX - .BLEND_H_LASX_JRTABLE
.hword .BLEND_H_W16_LASX - .BLEND_H_LASX_JRTABLE
.hword .BLEND_H_W8_LASX - .BLEND_H_LASX_JRTABLE
.hword .BLEND_H_W4_LASX - .BLEND_H_LASX_JRTABLE
.hword .BLEND_H_W2_LASX - .BLEND_H_LASX_JRTABLE
.hword .BLEND_H_END_LASX - .BLEND_H_LASX_JRTABLE // padding entry: keeps the following instructions 4-byte aligned
.BLEND_H_W2_LASX:
vldrepl.h vr20, t8, 0
vld vr0, a0, 0
vld vr1, a2, 0
vilvl.b vr0, vr1, vr0
vdp2.h.bu vr1, vr0, vr20
vssrarni.bu.h vr1, vr1, 6
vstelm.h vr1, a0, 0, 0
addi.d t8, t8, 2
add.d a0, a0, a1
addi.d a2, a2, 2
blt t8, a4, .BLEND_H_W2_LASX
b .BLEND_H_END_LASX
.BLEND_H_W4_LASX:
vldrepl.h vr20, t8, 0
vld vr0, a0, 0
vld vr1, a2, 0
vilvl.b vr0, vr1, vr0
vdp2.h.bu vr1, vr0, vr20
vssrarni.bu.h vr1, vr1, 6
vstelm.w vr1, a0, 0, 0
addi.d t8, t8, 2
add.d a0, a0, a1
addi.d a2, a2, 4
blt t8, a4, .BLEND_H_W4_LASX
b .BLEND_H_END_LASX
.BLEND_H_W8_LASX:
vldrepl.h vr20, t8, 0
vld vr0, a0, 0
vld vr1, a2, 0
vilvl.b vr0, vr1, vr0
vdp2.h.bu vr1, vr0, vr20
vssrarni.bu.h vr1, vr1, 6
vstelm.d vr1, a0, 0, 0
addi.d t8, t8, 2
add.d a0, a0, a1
addi.d a2, a2, 8
blt t8, a4, .BLEND_H_W8_LASX
b .BLEND_H_END_LASX
.BLEND_H_W16_LASX:
vldrepl.h vr20, t8, 0
vld vr0, a0, 0
vld vr1, a2, 0
vilvl.b vr2, vr1, vr0
vilvh.b vr3, vr1, vr0
vmulwev.h.bu vr4, vr2, vr20
vmulwev.h.bu vr5, vr3, vr20
vmaddwod.h.bu vr4, vr2, vr20
vmaddwod.h.bu vr5, vr3, vr20
vssrarni.bu.h vr5, vr4, 6
vst vr5, a0, 0
addi.d t8, t8, 2
add.d a0, a0, a1
addi.d a2, a2, 16
blt t8, a4, .BLEND_H_W16_LASX
b .BLEND_H_END_LASX
.BLEND_H_W32_LASX:
xvldrepl.h xr20, t8, 0
xvld xr0, a0, 0
xvld xr1, a2, 0
xvilvl.b xr2, xr1, xr0
xvilvh.b xr3, xr1, xr0
xvmulwev.h.bu xr4, xr2, xr20
xvmulwev.h.bu xr5, xr3, xr20
xvmaddwod.h.bu xr4, xr2, xr20
xvmaddwod.h.bu xr5, xr3, xr20
xvssrarni.bu.h xr5, xr4, 6
xvst xr5, a0, 0
addi.d t8, t8, 2
add.d a0, a0, a1
addi.d a2, a2, 32
blt t8, a4, .BLEND_H_W32_LASX
b .BLEND_H_END_LASX
.BLEND_H_W64_LASX:
xvldrepl.h xr20, t8, 0
xvld xr0, a0, 0
xvld xr1, a0, 32
xvld xr2, a2, 0
xvld xr3, a2, 32
xvilvl.b xr4, xr2, xr0
xvilvh.b xr5, xr2, xr0
xvilvl.b xr6, xr3, xr1
xvilvh.b xr7, xr3, xr1
xvmulwev.h.bu xr0, xr4, xr20
xvmulwev.h.bu xr1, xr5, xr20
xvmulwev.h.bu xr2, xr6, xr20
xvmulwev.h.bu xr3, xr7, xr20
xvmaddwod.h.bu xr0, xr4, xr20
xvmaddwod.h.bu xr1, xr5, xr20
xvmaddwod.h.bu xr2, xr6, xr20
xvmaddwod.h.bu xr3, xr7, xr20
xvssrarni.bu.h xr1, xr0, 6
xvssrarni.bu.h xr3, xr2, 6
xvst xr1, a0, 0
xvst xr3, a0, 32
addi.d t8, t8, 2
add.d a0, a0, a1
addi.d a2, a2, 64
blt t8, a4, .BLEND_H_W64_LASX
b .BLEND_H_END_LASX
.BLEND_H_W128_LASX:
xvldrepl.h xr20, t8, 0
xvld xr0, a0, 0
xvld xr1, a0, 32
xvld xr2, a0, 64
xvld xr3, a0, 96
xvld xr4, a2, 0
xvld xr5, a2, 32
xvld xr6, a2, 64
xvld xr7, a2, 96
xvilvl.b xr8, xr4, xr0
xvilvh.b xr9, xr4, xr0
xvilvl.b xr10, xr5, xr1
xvilvh.b xr11, xr5, xr1
xvilvl.b xr12, xr6, xr2
xvilvh.b xr13, xr6, xr2
xvilvl.b xr14, xr7, xr3
xvilvh.b xr15, xr7, xr3
xvmulwev.h.bu xr0, xr8, xr20
xvmulwev.h.bu xr1, xr9, xr20
xvmulwev.h.bu xr2, xr10, xr20
xvmulwev.h.bu xr3, xr11, xr20
xvmulwev.h.bu xr4, xr12, xr20
xvmulwev.h.bu xr5, xr13, xr20
xvmulwev.h.bu xr6, xr14, xr20
xvmulwev.h.bu xr7, xr15, xr20
xvmaddwod.h.bu xr0, xr8, xr20
xvmaddwod.h.bu xr1, xr9, xr20
xvmaddwod.h.bu xr2, xr10, xr20
xvmaddwod.h.bu xr3, xr11, xr20
xvmaddwod.h.bu xr4, xr12, xr20
xvmaddwod.h.bu xr5, xr13, xr20
xvmaddwod.h.bu xr6, xr14, xr20
xvmaddwod.h.bu xr7, xr15, xr20
xvssrarni.bu.h xr1, xr0, 6
xvssrarni.bu.h xr3, xr2, 6
xvssrarni.bu.h xr5, xr4, 6
xvssrarni.bu.h xr7, xr6, 6
xvst xr1, a0, 0
xvst xr3, a0, 32
xvst xr5, a0, 64
xvst xr7, a0, 96
addi.d t8, t8, 2
add.d a0, a0, a1
addi.d a2, a2, 128
blt t8, a4, .BLEND_H_W128_LASX
b .BLEND_H_END_LASX
.BLEND_H_END_LASX:
endfunc
/*
* expects a1 = 16, a2 = 8, a3 = 4 (chunk-size constants)
* clobbers: a4
*/
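/*
* Rough C equivalent of the macro below (illustrative, assuming <string.h>,
* <stdint.h> and <stddef.h>):
*
*   static void pixel_copy(uint8_t *dst, const uint8_t *src, ptrdiff_t size)
*   {
*       if (size >= 16) // vld/vst chunks
*           do { memcpy(dst, src, 16); dst += 16; src += 16; size -= 16; } while (size > 16);
*       if (size >= 8)  { memcpy(dst, src, 8); dst += 8; src += 8; size -= 8; }
*       if (size >= 4)  { memcpy(dst, src, 4); dst += 4; src += 4; size -= 4; }
*       while (size-- > 0) *dst++ = *src++;
*   }
*/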
.macro PIXEL_COPY_LSX _dst, _src, _size
blt \_size, a1, 8f
16:
vld vr0, \_src, 0
vst vr0, \_dst, 0
addi.d \_size, \_size, -16
addi.d \_dst, \_dst, 16
addi.d \_src, \_src, 16
blt a1, \_size, 16b
8:
blt \_size, a2, 14f
ld.d a4, \_src, 0
st.d a4, \_dst, 0
addi.d \_size, \_size, -8
addi.d \_dst, \_dst, 8
addi.d \_src, \_src, 8
14:
blt \_size, a3, 11f
ld.w a4, \_src, 0
st.w a4, \_dst, 0
addi.d \_size, \_size, -4
addi.d \_dst, \_dst, 4
addi.d \_src, \_src, 4
11:
beqz \_size, 110f
111:
ld.b a4, \_src, 0
st.b a4, \_dst, 0
addi.d \_size, \_size, -1
addi.d \_dst, \_dst, 1
addi.d \_src, \_src, 1
bnez \_size, 111b
110:
.endm
/*
* expects a1 = 16, a2 = 8, a3 = 4 (chunk-size constants)
*/
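/*
* Fill counterpart of PIXEL_COPY_LSX: stores \_vsrc (a byte pattern already
* replicated by the caller) in 16/8/4/1-byte chunks until \_size reaches zero.
*/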
.macro PIXEL_SET_LSX _dst, _vsrc, _size
blt \_size, a1, 8f
16:
vst \_vsrc, \_dst, 0
addi.d \_size, \_size, -16
addi.d \_dst, \_dst, 16
blt a1, \_size, 16b
8:
blt \_size, a2, 14f
vstelm.d \_vsrc, \_dst, 0, 0
addi.d \_size, \_size, -8
addi.d \_dst, \_dst, 8
14:
blt \_size, a3, 11f
vstelm.w \_vsrc, \_dst, 0, 0
addi.d \_size, \_size, -4
addi.d \_dst, \_dst, 4
11:
beqz \_size, 110f
111:
vstelm.b \_vsrc, \_dst, 0, 0
addi.d \_size, \_size, -1
addi.d \_dst, \_dst, 1
bnez \_size, 111b
110:
.endm
/*
* clobbers: a4, a5, t2, t3, vr0
*/
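/*
* Per-row sketch of the loop (illustrative; the extension widths, pointers and
* strides live in t0/t1/t4/t5/t6/t7/a7/t8 as set up by emu_edge below):
*
*   for (int y = 0; y < center_h; y++) {
*       uint8_t *d = dst;
*       const uint8_t *s = src;
*       if (need_left)  { memset(d, s[0], left_ext); d += left_ext; }
*       memcpy(d, s, center_w); d += center_w; s += center_w;
*       if (need_right) memset(d, s[-1], right_ext);
*       dst += dst_stride;
*       src += ref_stride;
*   }
*/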
.macro DEGE_LOOP need_left, need_right
0:
addi.d t2, t6, 0 // dst
addi.d t3, t7, 0 // src
.if \need_left
vldrepl.b vr0, t3, 0
addi.d a5, t0, 0
PIXEL_SET_LSX t2, vr0, a5
.endif
addi.d a5, t4, 0
PIXEL_COPY_LSX t2, t3, a5
.if \need_right
vldrepl.b vr0, t3, -1
addi.d a5, t1, 0
PIXEL_SET_LSX t2, vr0, a5
.endif
addi.d t5, t5, -1
add.d t7, t7, t8
add.d t6, t6, a7
bnez t5, 0b
.endm
/*
* static void emu_edge_c(const intptr_t bw, const intptr_t bh,
* const intptr_t iw, const intptr_t ih,
* const intptr_t x, const intptr_t y,
* pixel *dst, const ptrdiff_t dst_stride,
* const pixel *ref, const ptrdiff_t ref_stride)
*/
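/*
* Sketch of the edge-extension logic below (illustrative only; clamp(v, lo, hi)
* clips v to [lo, hi]):
*
*   left_ext   = clamp(-x, 0, bw - 1);  right_ext  = clamp(x + bw - iw, 0, bw - 1);
*   top_ext    = clamp(-y, 0, bh - 1);  bottom_ext = clamp(y + bh - ih, 0, bh - 1);
*   center_w   = bw - left_ext - right_ext;
*   center_h   = bh - top_ext - bottom_ext;
*   ref += clamp(y, 0, ih - 1) * ref_stride + clamp(x, 0, iw - 1);
*   blk  = dst + top_ext * dst_stride;
*
* DEGE_LOOP writes the center_h middle rows into blk (see the sketch above);
* afterwards the last middle row is repeated bottom_ext times below it, and
* the first middle row is repeated top_ext times above it, starting at dst.
*/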
function emu_edge_8bpc_lsx
vxor.v vr23, vr23, vr23 // zero
addi.d t0, a3, -1 // ih - 1
addi.d t1, a2, -1 // iw - 1
vreplgr2vr.w vr22, t0
vinsgr2vr.w vr22, t1, 1
vreplgr2vr.w vr0, a5
vinsgr2vr.w vr0, a4, 1 // [0] - h | [1] - w
vclip.w vr2, vr0, vr23, vr22
vpickve2gr.w t0, vr2, 0
ld.d t2, sp, 0 // ref
ld.d t8, sp, 8 // ref_stride
mul.w t0, t0, t8
vpickve2gr.w t1, vr2, 1
add.d t2, t2, t1
add.d t7, t0, t2 // ref
addi.d t0, a0, -1 // bw - 1
addi.d t1, a1, -1 // bh - 1
vreplgr2vr.w vr21, t0
vreplgr2vr.w vr22, t1
vilvl.d vr21, vr22, vr21
sub.d t2, zero, a4 // -x
add.d t3, a0, a4
sub.d t3, t3, a2 // x + bw - iw
sub.d t4, zero, a5 // -y
add.d t5, a1, a5
sub.d t5, t5, a3 // y + bh - ih
vreplgr2vr.w vr0, t2
vinsgr2vr.w vr0, t3, 1
vinsgr2vr.w vr0, t4, 2
vinsgr2vr.w vr0, t5, 3
vclip.w vr2, vr0, vr23, vr21
vpickve2gr.w t0, vr2, 0 // left_ext
vpickve2gr.w t1, vr2, 1 // right_ext
vpickve2gr.w t2, vr2, 2 // top_ext
vpickve2gr.w t3, vr2, 3 // bottom_ext
mul.w t6, t2, a7
add.d t4, t0, t1
add.d t5, t2, t3
sub.d t4, a0, t4 // center_w
sub.d t5, a1, t5 // center_h
addi.d a1, zero, 16 // chunk-size constants expected by PIXEL_COPY/PIXEL_SET_LSX
addi.d a2, zero, 8
addi.d a3, zero, 4
add.d t6, t6, a6 // blk
beqz t0, 2f
// need_left
beqz t1, 3f
// need_left + need_right
DEGE_LOOP 1, 1
b 5f
2:
// !need_left
beqz t1, 4f
// !need_left + need_right
DEGE_LOOP 0, 1
b 5f
3:
// need_left + !need_right
DEGE_LOOP 1, 0
b 5f
4:
// !need_left + !need_right
DEGE_LOOP 0, 0
5:
vpickve2gr.w t2, vr2, 2 // top_ext
vpickve2gr.w t3, vr2, 3 // bottom_ext
sub.d t7, a7, a0 // dst_stride - bw
mul.w t8, t2, a7
beqz t3, 2f
// need_bottom
sub.d t0, t6, a7 // &dst[-PXSTRIDE(dst_stride)]
1:
addi.d t1, t0, 0
addi.d a5, a0, 0
PIXEL_COPY_LSX t6, t1, a5
add.d t6, t6, t7
addi.d t3, t3, -1
bnez t3, 1b
2:
beqz t2, 3f
// need_top
add.d t8, t8, a6 // blk
1:
addi.d t1, t8, 0
addi.d a5, a0, 0
PIXEL_COPY_LSX a6, t1, a5
add.d a6, a6, t7
addi.d t2, t2, -1
bnez t2, 1b
3:
endfunc