/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/loongarch/loongson_asm.S"
#include "src/loongarch/loongson_util.S"
// depending on how many pixels need to be stored, returns:
// t4 = (1 << 0) : 0 pixels
// t4 = (1 << 4) : inner 4 pixels
// t4 = (1 << 6) : inner 6 pixels
// t4 = 0 : all pixels
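// Register conventions for the shared filter core (set up by the
// lpf_[vh]_*_16_lsx callers below):
//   vr17..vr30 = p6..q6, centered on vr23 = p0 and vr24 = q0
//   vr10 = E, vr11 = I, vr12 = H (per-lane thresholds)
//   vr13 / vr14 / vr15 = lane masks selecting wd >= 4 / wd > 4 / wd == 16
//   t4 = return code (see above), t6 = scratch
// Filtered pixels are written back into the p/q registers, except for the
// full wd == 16 path, which leaves its outputs in vr0..vr11 (p5..q5).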
.macro FILTER wd
functionl lpf_16_wd\wd\()_lsx
vabsd.bu vr0, vr22, vr23 // abs(p1 - p0)
vabsd.bu vr1, vr25, vr24 // abs(q1 - q0)
vabsd.bu vr2, vr23, vr24 // abs(p0 - q0)
vabsd.bu vr3, vr22, vr25 // abs(p1 - q1)
.if \wd >= 6
vabsd.bu vr4, vr21, vr22 // abs(p2 - p1)
vabsd.bu vr5, vr26, vr25 // abs(q2 - q1)
.endif
.if \wd >= 8
vabsd.bu vr6, vr20, vr21 // abs(p3 - p2)
vabsd.bu vr7, vr27, vr26 // abs(q3 - q2)
.endif
.if \wd >= 6
vmax.bu vr4, vr4, vr5
.endif
vsadd.bu vr2, vr2, vr2 // abs(p0 - q0) * 2
.if \wd >= 8
vmax.bu vr6, vr6, vr7
.endif
vsrli.b vr3, vr3, 1 // abs(p1 - q1) >> 1
.if \wd >= 8
vmax.bu vr4, vr4, vr6
.endif
.if \wd >= 6
vand.v vr4, vr4, vr14
.endif
vmax.bu vr0, vr0, vr1 // max(abs(p1 - p0), abs(q1 - q0))
vsadd.bu vr2, vr2, vr3 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
.if \wd >= 6
vmax.bu vr4, vr0, vr4
vsle.bu vr1, vr4, vr11 // max(abs(p1 - p0), abs(q1 - q0), abs(p2 - p1), abs(q2 - q1), ...) <= I
.else
vsle.bu vr1, vr0, vr11 // max(abs(p1 - p0), abs(q1 - q0)) <= I
.endif
vsle.bu vr2, vr2, vr10 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
vand.v vr1, vr1, vr2 // fm
vand.v vr1, vr1, vr13 // fm && wd >= 4
.if \wd >= 6
vand.v vr14, vr14, vr1 // fm && wd > 4
.endif
.if \wd >= 16
vand.v vr15, vr15, vr1 // fm && wd == 16
.endif
vhaddw.qu.du vr8, vr1, vr1
vpickve2gr.du t6, vr8, 0
bnez t6, 9f // if (!fm || wd < 4) return;
li.w t4, 1 << 0
jirl zero, ra, 0x00
9:
.if \wd >= 6
vabsd.bu vr2, vr21, vr23 // abs(p2 - p0)
vabsd.bu vr3, vr22, vr23 // abs(p1 - p0)
vabsd.bu vr4, vr25, vr24 // abs(q1 - q0)
vabsd.bu vr5, vr26, vr24 // abs(q2 - q0)
.if \wd >= 8
vabsd.bu vr6, vr20, vr23 // abs(p3 - p0)
vabsd.bu vr7, vr27, vr24 // abs(q3 - q0)
.endif
vmax.bu vr2, vr2, vr3
vmax.bu vr4, vr4, vr5
.if \wd >= 8
vmax.bu vr6, vr6, vr7
.endif
vmax.bu vr2, vr2, vr4
.if \wd >= 8
vmax.bu vr2, vr2, vr6
.endif
.if \wd == 16
vabsd.bu vr3, vr17, vr23 // abs(p6 - p0)
vabsd.bu vr4, vr18, vr23 // abs(p5 - p0)
vabsd.bu vr5, vr19, vr23 // abs(p4 - p0)
.endif
vslei.bu vr2, vr2, 1 // flat8in
.if \wd == 16
vabsd.bu vr6, vr28, vr24 // abs(q4 - q0)
vabsd.bu vr7, vr29, vr24 // abs(q5 - q0)
vabsd.bu vr8, vr30, vr24 // abs(q6 - q0)
.endif
vand.v vr14, vr2, vr14 // flat8in && fm && wd > 4
vandn.v vr1, vr14, vr1 // fm && wd >= 4 && !flat8in
.if \wd == 16
vmax.bu vr3, vr3, vr4
vmax.bu vr5, vr5, vr6
.endif
vhaddw.qu.du vr9, vr1, vr1
.if \wd == 16
vmax.bu vr7, vr7, vr8
vmax.bu vr3, vr3, vr5
vmax.bu vr3, vr3, vr7
vslei.bu vr3, vr3, 1 // flat8out
.endif
vpickve2gr.du t6, vr9, 0
.if \wd == 16
vand.v vr15, vr15, vr3 // flat8out && fm && wd == 16
vand.v vr15, vr15, vr14 // flat8out && flat8in && fm && wd == 16
vandn.v vr14, vr15, vr14 // flat8in && fm && wd >= 4 && !flat8out
.endif
beqz t6, 1f // skip wd == 4 case
.endif
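// Narrow filter (the wd == 4 case; also applied to lanes where flat8in is
// not set). Scalar sketch of what the vector code below computes, using
// dav1d-style helper names (iclip_diff/iclip_u8, assumed):
//     hev = max(abs(p1 - p0), abs(q1 - q0)) > H
//     f   = iclip_diff(3 * (q0 - p0) + (hev ? iclip_diff(p1 - q1) : 0))
//     f1  = imin(f + 4, 127) >> 3
//     f2  = imin(f + 3, 127) >> 3
//     p0' = iclip_u8(p0 + f2),  q0' = iclip_u8(q0 - f1)
//     if (!hev) p1' = iclip_u8(p1 + ((f1 + 1) >> 1)),
//               q1' = iclip_u8(q1 - ((f1 + 1) >> 1))
// The bytes are xor'ed with 128 so that saturating signed adds/subtracts
// perform the clipping.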
vxori.b vr2, vr22, 128 // p1 - 128
vxori.b vr3, vr25, 128 // q1 - 128
vslt.bu vr0, vr12, vr0 // hev
vssub.b vr2, vr2, vr3 // iclip_diff(p1 - q1)
vand.v vr4, vr2, vr0 // if (hev) iclip_diff(p1 - q1)
vandn.v vr0, vr0, vr1 // (fm && wd >= 4 && !hev)
vxor.v vr5, vr5, vr5
vaddi.hu vr5, vr5, 3
vsubwev.h.bu vr2, vr24, vr23
vsubwod.h.bu vr3, vr24, vr23
vmul.h vr2, vr2, vr5
vmul.h vr3, vr3, vr5
vxor.v vr6, vr6, vr6
vaddwev.h.b vr7, vr4, vr6
vaddwod.h.b vr6, vr4, vr6
vadd.h vr2, vr2, vr7
vadd.h vr3, vr3, vr6
vssrani.b.h vr2, vr2, 0
vssrani.b.h vr3, vr3, 0
vilvl.b vr2, vr3, vr2 // f
vxor.v vr6, vr6, vr6
vaddi.bu vr5, vr6, 3
vaddi.bu vr6, vr6, 4 // 4
vsadd.b vr4, vr6, vr2 // imin(f + 4, 127)
vsadd.b vr5, vr5, vr2 // imin(f + 3, 127)
vsrai.b vr4, vr4, 3 // f1
vsrai.b vr5, vr5, 3 // f2
vaddi.bu vr2, vr23, 0 // p0
vaddi.bu vr3, vr24, 0 // q0
vxori.b vr2, vr2, 128
vxori.b vr3, vr3, 128
vsadd.b vr2, vr2, vr5 // p0 + f2 out p0
vssub.b vr3, vr3, vr4 // q0 - f1 out q0
vxori.b vr2, vr2, 128
vxori.b vr3, vr3, 128
vsrari.b vr4, vr4, 1 // (f1 + 1) >> 1
vbitsel.v vr23, vr23, vr2, vr1 // if (fm && wd >= 4)
vbitsel.v vr24, vr24, vr3, vr1 // if (fm && wd >= 4)
vaddi.bu vr2, vr22, 0 // p1
vaddi.bu vr3, vr25, 0 // q1
vxori.b vr2, vr2, 128
vxori.b vr3, vr3, 128
vsadd.b vr2, vr2, vr4 // out p1
vssub.b vr3, vr3, vr4 // out q1
vxori.b vr2, vr2, 128
vxori.b vr3, vr3, 128
vbitsel.v vr22, vr22, vr2, vr0 // if (fm && wd >= 4 && !hev)
vbitsel.v vr25, vr25, vr3, vr0 // if (fm && wd >= 4 && !hev)
1:
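// wd == 6: 6-tap smoothing over p2..q2 for lanes with flat8in set. The
// running sum built below starts as
//     out p1 = (p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4) >> 3
// and each following output (p0, q0, q1) reuses it, adding the next pixel
// pair and subtracting the oldest one.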
.if \wd == 6
vhaddw.qu.du vr0, vr14, vr14
vpickve2gr.du t6, vr0, 0
beqz t6, 2f // skip if there's no flat8in
vaddwev.h.bu vr0, vr21, vr21
vaddwod.h.bu vr1, vr21, vr21 // p2 * 2
vaddwev.h.bu vr2, vr21, vr22
vaddwod.h.bu vr3, vr21, vr22 // p2 + p1
vaddwev.h.bu vr4, vr22, vr23
vaddwod.h.bu vr5, vr22, vr23 // p1 + p0
vaddwev.h.bu vr6, vr23, vr24
vaddwod.h.bu vr7, vr23, vr24 // p0 + q0
vadd.h vr8, vr0, vr2
vadd.h vr9, vr1, vr3
vadd.h vr10, vr4, vr6
vadd.h vr11, vr5, vr7
vaddwev.h.bu vr12, vr24, vr25
vaddwod.h.bu vr13, vr24, vr25 // q0 + q1
vadd.h vr8, vr8, vr10
vadd.h vr9, vr9, vr11
vsub.h vr12, vr12, vr0
vsub.h vr13, vr13, vr1
vaddwev.h.bu vr10, vr25, vr26
vaddwod.h.bu vr11, vr25, vr26 // q1 + q2
vssrlrni.bu.h vr0, vr8, 3
vssrlrni.bu.h vr1, vr9, 3
vilvl.b vr0, vr1, vr0 // out p1
vadd.h vr8, vr8, vr12
vadd.h vr9, vr9, vr13
vsub.h vr10, vr10, vr2
vsub.h vr11, vr11, vr3
vaddwev.h.bu vr12, vr26, vr26 // q2 + q2
vaddwod.h.bu vr13, vr26, vr26
vssrlrni.bu.h vr1, vr8, 3
vssrlrni.bu.h vr2, vr9, 3
vilvl.b vr1, vr2, vr1 // out p0
vadd.h vr8, vr8, vr10
vadd.h vr9, vr9, vr11
vsub.h vr12, vr12, vr4
vsub.h vr13, vr13, vr5
vssrlrni.bu.h vr2, vr8, 3
vssrlrni.bu.h vr3, vr9, 3
vilvl.b vr2, vr3, vr2 // out q0
vbitsel.v vr22, vr22, vr0, vr14
vadd.h vr8, vr8, vr12
vadd.h vr9, vr9, vr13
vbitsel.v vr23, vr23, vr1, vr14
vssrlrni.bu.h vr3, vr8, 3
vssrlrni.bu.h vr4, vr9, 3
vilvl.b vr3, vr4, vr3
vbitsel.v vr24, vr24, vr2, vr14
vbitsel.v vr25, vr25, vr3, vr14
.elseif \wd >= 8
vhaddw.qu.du vr0, vr14, vr14
vpickve2gr.du t6, vr0, 0
.if \wd == 8
beqz t6, 8f // skip if there's no flat8in
.else
beqz t6, 2f // skip if there's no flat8in
.endif
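// wd >= 8: 8-tap smoothing over p3..q3 for lanes with flat8in set. The
// running sum built below starts as
//     out p2 = (p3 * 3 + p2 * 2 + p1 + p0 + q0 + 4) >> 3
// and later outputs slide that window one pixel pair at a time.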
vaddwev.h.bu vr0, vr20, vr21
vaddwod.h.bu vr1, vr20, vr21 // p3 + p2
vaddwev.h.bu vr2, vr22, vr25
vaddwod.h.bu vr3, vr22, vr25 // p1 + q1
vaddwev.h.bu vr4, vr20, vr22
vaddwod.h.bu vr5, vr20, vr22 // p3 + p1
vaddwev.h.bu vr6, vr23, vr26
vaddwod.h.bu vr7, vr23, vr26 // p0 + q2
vadd.h vr8, vr0, vr0
vadd.h vr9, vr1, vr1 // 2 * (p3 + p2)
vxor.v vr10, vr10, vr10
vaddwev.h.bu vr11, vr23, vr10
vaddwod.h.bu vr12, vr23, vr10
vaddwev.h.bu vr13, vr24, vr10
vaddwod.h.bu vr10, vr24, vr10
vadd.h vr8, vr8, vr11 // + p0
vadd.h vr9, vr9, vr12
vadd.h vr8, vr8, vr13 // + q0
vadd.h vr9, vr9, vr10
vadd.h vr8, vr8, vr4
vadd.h vr9, vr9, vr5 // + p3 + p1
vsub.h vr2, vr2, vr0
vsub.h vr3, vr3, vr1 // p1 + q1 - p3 - p2
vsub.h vr6, vr6, vr4
vsub.h vr7, vr7, vr5 // p0 + q2 - p3 - p1
vssrlrni.bu.h vr10, vr8, 3
vssrlrni.bu.h vr11, vr9, 3
vilvl.b vr10, vr11, vr10 // out p2
vadd.h vr8, vr8, vr2
vadd.h vr9, vr9, vr3
vaddwev.h.bu vr0, vr20, vr23
vaddwod.h.bu vr1, vr20, vr23 // p3 + p0
vaddwev.h.bu vr2, vr24, vr27
vaddwod.h.bu vr3, vr24, vr27 // q0 + q3
vssrlrni.bu.h vr11, vr8, 3
vssrlrni.bu.h vr12, vr9, 3
vilvl.b vr11, vr12, vr11 // out p1
vadd.h vr8, vr8, vr6
vadd.h vr9, vr9, vr7
vsub.h vr2, vr2, vr0 // q0 + q3 - p3 - p0
vsub.h vr3, vr3, vr1
vaddwev.h.bu vr4, vr21, vr24 // p2 + q0
vaddwod.h.bu vr5, vr21, vr24
vaddwev.h.bu vr6, vr25, vr27 // q1 + q3
vaddwod.h.bu vr7, vr25, vr27
vssrlrni.bu.h vr12, vr8, 3
vssrlrni.bu.h vr13, vr9, 3
vilvl.b vr12, vr13, vr12 // out p0
vadd.h vr8, vr8, vr2
vadd.h vr9, vr9, vr3
vsub.h vr6, vr6, vr4 // q1 + q3 - p2 - q0
vsub.h vr7, vr7, vr5
vaddwev.h.bu vr0, vr22, vr25 // p1 + q1
vaddwod.h.bu vr1, vr22, vr25
vaddwev.h.bu vr2, vr26, vr27
vaddwod.h.bu vr3, vr26, vr27 // q2 + q3
vssrlrni.bu.h vr13, vr8, 3
vssrlrni.bu.h vr4, vr9, 3
vilvl.b vr13, vr4, vr13 // out q0
vadd.h vr8, vr8, vr6
vadd.h vr9, vr9, vr7
vsub.h vr2, vr2, vr0 // q2 + q3 - p1 - q1
vsub.h vr3, vr3, vr1
vssrlrni.bu.h vr0, vr8, 3
vssrlrni.bu.h vr1, vr9, 3
vilvl.b vr0, vr1, vr0 // out q1
vadd.h vr8, vr8, vr2
vadd.h vr9, vr9, vr3
vbitsel.v vr21, vr21, vr10, vr14
vbitsel.v vr22, vr22, vr11, vr14
vbitsel.v vr23, vr23, vr12, vr14
vbitsel.v vr24, vr24, vr13, vr14
vssrlrni.bu.h vr1, vr8, 3
vssrlrni.bu.h vr2, vr9, 3
vilvl.b vr1, vr2, vr1 // out q2
vbitsel.v vr25, vr25, vr0, vr14
vbitsel.v vr26, vr26, vr1, vr14
.endif
2:
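// wd == 16: wide filter over p6..q6 for lanes with flat8out set; the
// outputs end up in vr0..vr11 (p5..q5). The running sum starts as
//     out p5 = (p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
// and subsequent outputs slide the 16-weight window one pixel pair at a
// time.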
.if \wd == 16
vhaddw.qu.du vr2, vr15, vr15
vpickve2gr.du t6, vr2, 0
bnez t6, 1f // check if flat8out is needed
vhaddw.qu.du vr2, vr14, vr14
vpickve2gr.du t6, vr2, 0
beqz t6, 8f // if there was no flat8in, just write the inner 4 pixels
b 7f // if flat8in was used, write the inner 6 pixels
1:
vaddwev.h.bu vr2, vr17, vr17 // p6 + p6
vaddwod.h.bu vr3, vr17, vr17
vaddwev.h.bu vr4, vr17, vr18
vaddwod.h.bu vr5, vr17, vr18 // p6 + p5
vaddwev.h.bu vr6, vr17, vr19
vaddwod.h.bu vr7, vr17, vr19 // p6 + p4
vaddwev.h.bu vr8, vr17, vr20
vaddwod.h.bu vr9, vr17, vr20 // p6 + p3
vadd.h vr12, vr2, vr4
vadd.h vr13, vr3, vr5
vadd.h vr10, vr6, vr8
vadd.h vr11, vr7, vr9
vaddwev.h.bu vr6, vr17, vr21
vaddwod.h.bu vr7, vr17, vr21 // p6 + p2
vadd.h vr12, vr12, vr10
vadd.h vr13, vr13, vr11
vaddwev.h.bu vr8, vr17, vr22
vaddwod.h.bu vr9, vr17, vr22 // p6 + p1
vaddwev.h.bu vr10, vr18, vr23
vaddwod.h.bu vr11, vr18, vr23 // p5 + p0
vadd.h vr6, vr6, vr8
vadd.h vr7, vr7, vr9
vaddwev.h.bu vr8, vr19, vr24
vaddwod.h.bu vr9, vr19, vr24 // p4 + q0
vadd.h vr12, vr12, vr6
vadd.h vr13, vr13, vr7
vadd.h vr10, vr10, vr8
vadd.h vr11, vr11, vr9
vaddwev.h.bu vr6, vr20, vr25
vaddwod.h.bu vr7, vr20, vr25 // p3 + q1
vadd.h vr12, vr12, vr10
vadd.h vr13, vr13, vr11
vsub.h vr6, vr6, vr2
vsub.h vr7, vr7, vr3
vaddwev.h.bu vr2, vr21, vr26
vaddwod.h.bu vr3, vr21, vr26 // p2 + q2
vssrlrni.bu.h vr0, vr12, 4
vssrlrni.bu.h vr1, vr13, 4
vilvl.b vr0, vr1, vr0 // out p5
vadd.h vr12, vr12, vr6
vadd.h vr13, vr13, vr7 // - (p6 + p6) + (p3 + q1)
vsub.h vr2, vr2, vr4
vsub.h vr3, vr3, vr5
vaddwev.h.bu vr4, vr22, vr27
vaddwod.h.bu vr5, vr22, vr27 // p1 + q3
vaddwev.h.bu vr6, vr17, vr19
vaddwod.h.bu vr7, vr17, vr19 // p6 + p4
vssrlrni.bu.h vr1, vr12, 4
vssrlrni.bu.h vr8, vr13, 4
vilvl.b vr1, vr8, vr1 // out p4
vadd.h vr12, vr12, vr2
vadd.h vr13, vr13, vr3 // - (p6 + p5) + (p2 + q2)
vsub.h vr4, vr4, vr6
vsub.h vr5, vr5, vr7
vaddwev.h.bu vr6, vr23, vr28
vaddwod.h.bu vr7, vr23, vr28 // p0 + q4
vaddwev.h.bu vr8, vr17, vr20
vaddwod.h.bu vr9, vr17, vr20 // p6 + p3
vssrlrni.bu.h vr2, vr12, 4
vssrlrni.bu.h vr10, vr13, 4
vilvl.b vr2, vr10, vr2 // out p3
vadd.h vr12, vr12, vr4
vadd.h vr13, vr13, vr5 // - (p6 + p4) + (p1 + q3)
vsub.h vr6, vr6, vr8
vsub.h vr7, vr7, vr9
vaddwev.h.bu vr8, vr24, vr29
vaddwod.h.bu vr9, vr24, vr29 // q0 + q5
vaddwev.h.bu vr4, vr17, vr21
vaddwod.h.bu vr5, vr17, vr21 // p6 + p2
vssrlrni.bu.h vr3, vr12, 4
vssrlrni.bu.h vr11, vr13, 4
vilvl.b vr3, vr11, vr3 // out p2
vadd.h vr12, vr12, vr6
vadd.h vr13, vr13, vr7 // - (p6 + p3) + (p0 + q4)
vsub.h vr8, vr8, vr4
vsub.h vr9, vr9, vr5
vaddwev.h.bu vr6, vr25, vr30
vaddwod.h.bu vr7, vr25, vr30 // q1 + q6
vaddwev.h.bu vr10, vr17, vr22
vaddwod.h.bu vr11, vr17, vr22 // p6 + p1
vssrlrni.bu.h vr4, vr12, 4
vssrlrni.bu.h vr5, vr13, 4
vilvl.b vr4, vr5, vr4 // out p1
vadd.h vr12, vr12, vr8
vadd.h vr13, vr13, vr9 // - (p6 + p2) + (q0 + q5)
vsub.h vr6, vr6, vr10
vsub.h vr7, vr7, vr11
vaddwev.h.bu vr8, vr26, vr30
vaddwod.h.bu vr9, vr26, vr30 // q2 + q6
vbitsel.v vr0, vr18, vr0, vr15 // out p5
vaddwev.h.bu vr10, vr18, vr23
vaddwod.h.bu vr11, vr18, vr23 // p5 + p0
vssrlrni.bu.h vr5, vr12, 4
vssrlrni.bu.h vr18, vr13, 4
vilvl.b vr5, vr18, vr5 // out p0
vadd.h vr12, vr12, vr6
vadd.h vr13, vr13, vr7 // - (p6 + p1) + (q1 + q6)
vsub.h vr8, vr8, vr10
vsub.h vr9, vr9, vr11
vaddwev.h.bu vr10, vr27, vr30
vaddwod.h.bu vr11, vr27, vr30 // q3 + q6
vbitsel.v vr1, vr19, vr1, vr15 // out p4
vaddwev.h.bu vr18, vr19, vr24
vaddwod.h.bu vr19, vr19, vr24 // p4 + q0
vssrlrni.bu.h vr6, vr12, 4
vssrlrni.bu.h vr7, vr13, 4
vilvl.b vr6, vr7, vr6 // out q0
vadd.h vr12, vr12, vr8
vadd.h vr13, vr13, vr9 // - (p5 + p0) + (q2 + q6)
vsub.h vr10, vr10, vr18
vsub.h vr11, vr11, vr19
vaddwev.h.bu vr8, vr28, vr30
vaddwod.h.bu vr9, vr28, vr30 // q4 + q6
vbitsel.v vr2, vr20, vr2, vr15 // out p3
vaddwev.h.bu vr18, vr20, vr25
vaddwod.h.bu vr19, vr20, vr25 // p3 + q1
vssrlrni.bu.h vr7, vr12, 4
vssrlrni.bu.h vr20, vr13, 4
vilvl.b vr7, vr20, vr7 // out q1
vadd.h vr12, vr12, vr10
vadd.h vr13, vr13, vr11 // - (p4 + q0) + (q3 + q6)
vsub.h vr18, vr8, vr18
vsub.h vr19, vr9, vr19
vaddwev.h.bu vr10, vr29, vr30
vaddwod.h.bu vr11, vr29, vr30 // q5 + q6
vbitsel.v vr3, vr21, vr3, vr15 // out p2
vaddwev.h.bu vr20, vr21, vr26
vaddwod.h.bu vr21, vr21, vr26 // p2 + q2
vssrlrni.bu.h vr8, vr12, 4
vssrlrni.bu.h vr9, vr13, 4
vilvl.b vr8, vr9, vr8 // out q2
vadd.h vr12, vr12, vr18
vadd.h vr13, vr13, vr19 // - (p3 + q1) + (q4 + q6)
vsub.h vr10, vr10, vr20
vsub.h vr11, vr11, vr21
vaddwev.h.bu vr18, vr30, vr30
vaddwod.h.bu vr19, vr30, vr30 // q6 + q6
vbitsel.v vr4, vr22, vr4, vr15 // out p1
vaddwev.h.bu vr20, vr22, vr27
vaddwod.h.bu vr21, vr22, vr27 // p1 + q3
vssrlrni.bu.h vr9, vr12, 4
vssrlrni.bu.h vr22, vr13, 4
vilvl.b vr9, vr22, vr9 // out q3
vadd.h vr12, vr12, vr10
vadd.h vr13, vr13, vr11 // - (p2 + q2) + (q5 + q6)
vsub.h vr18, vr18, vr20
vsub.h vr19, vr19, vr21
vbitsel.v vr5, vr23, vr5, vr15 // out p0
vssrlrni.bu.h vr10, vr12, 4
vssrlrni.bu.h vr23, vr13, 4
vilvl.b vr10, vr23, vr10 // out q4
vadd.h vr12, vr12, vr18
vadd.h vr13, vr13, vr19 // - (p1 + q3) + (q6 + q6)
vssrlrni.bu.h vr11, vr12, 4
vssrlrni.bu.h vr12, vr13, 4
vilvl.b vr11, vr12, vr11 // out q5
vbitsel.v vr6, vr24, vr6, vr15
vbitsel.v vr7, vr25, vr7, vr15
vbitsel.v vr8, vr26, vr8, vr15
vbitsel.v vr9, vr27, vr9, vr15
vbitsel.v vr10, vr28, vr10, vr15
vbitsel.v vr11, vr29, vr11, vr15
.endif
li.w t4, 0
jirl zero, ra, 0x00
.if \wd == 16
7:
// Return to a shorter epilogue, writing only the inner 6 pixels
li.w t4, 1 << 6
jirl zero, ra, 0x00
.endif
.if \wd >= 8
8:
// Return to a shorter epilogue, writing only the inner 4 pixels
li.w t4, 1 << 4
jirl zero, ra, 0x00
.endif
endfuncl
.endm
FILTER 16
FILTER 8
FILTER 6
FILTER 4
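// Call wrappers around the shared cores: save ra, call the core, then
// dispatch on its t4 return code. t4 == 1 returns without storing,
// 1 << 6 / 1 << 4 jump to the caller's local 7: / 8: labels (inner 6 / 4
// pixel stores), and 0 falls through to the caller's full store code that
// follows the macro invocation.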
.macro LPF_16_WD16
move t7, ra
bl lpf_16_wd16_lsx
move ra, t7
beqz t4, 1f
andi t5, t4, 1 << 6
bnez t5, 7f
andi t5, t4, 1 << 4
bnez t5, 8f
jirl zero, ra, 0x00
1:
.endm
.macro LPF_16_WD8
move t7, ra
bl lpf_16_wd8_lsx
move ra, t7
beqz t4, 1f
andi t5, t4, 1 << 4
bnez t5, 8f
jirl zero, ra, 0x00
1:
.endm
.macro LPF_16_WD6
move t7, ra
bl lpf_16_wd6_lsx
move ra, t7
beqz t4, 1f
jirl zero, ra, 0x00
1:
.endm
.macro LPF_16_WD4
move t7, ra
bl lpf_16_wd4_lsx
move ra, t7
beqz t4, 1f
jirl zero, ra, 0x00
1:
.endm
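// Per-edge filters for one run of 16 pixels. The _v_ variants load and
// store whole rows with vld/vldx at stride a1; the _h_ variants gather the
// bytes across the vertical edge from 16 rows, transpose them into the p/q
// registers, filter, then transpose back and store. a0 points at q0 (the
// first pixel past the edge); the _h_ variants return with a0 advanced by
// 16 rows.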
functionl lpf_v_4_16_lsx
slli.d t3, a1, 1
sub.d t3, a0, t3
vld vr22, t3, 0 // p1
vldx vr23, t3, a1 // p0
vld vr24, a0, 0 // q0
vldx vr25, a0, a1 // q1
LPF_16_WD4
vst vr22, t3, 0 // p1
vstx vr23, t3, a1 // p0
vst vr24, a0, 0 // q0
vstx vr25, a0, a1 // q1
endfuncl
functionl lpf_h_4_16_lsx
addi.d t3, a0, -2
fld.s f22, t3, 0
fldx.s f23, t3, a1
alsl.d t3, a1, t3, 1
fld.s f24, t3, 0
fldx.s f25, t3, a1
alsl.d t3, a1, t3, 1
fld.s f17, t3, 0
fldx.s f18, t3, a1
alsl.d t3, a1, t3, 1
fld.s f19, t3, 0
fldx.s f20, t3, a1
alsl.d t3, a1, t3, 1
vilvl.w vr22, vr17, vr22
vilvl.w vr23, vr18, vr23
vilvl.w vr24, vr19, vr24
vilvl.w vr25, vr20, vr25
fld.s f17, t3, 0
fldx.s f18, t3, a1
alsl.d t3, a1, t3, 1
fld.s f19, t3, 0
fldx.s f20, t3, a1
alsl.d t3, a1, t3, 1
fld.s f26, t3, 0
fldx.s f27, t3, a1
alsl.d t3, a1, t3, 1
fld.s f28, t3, 0
fldx.s f29, t3, a1
alsl.d t3, a1, t3, 1
vilvl.w vr17, vr26, vr17
vilvl.w vr18, vr27, vr18
vilvl.w vr19, vr28, vr19
vilvl.w vr20, vr29, vr20
vilvl.d vr22, vr17, vr22
vilvl.d vr23, vr18, vr23
vilvl.d vr24, vr19, vr24
vilvl.d vr25, vr20, vr25
addi.d a0, t3, 2
TRANSPOSE_4x16B vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29
LPF_16_WD4
slli.d t3, a1, 4
sub.d a0, a0, t3
TRANSPOSE_4x16B vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29
addi.d a0, a0, -2
.irp i, vr22, vr23, vr24, vr25
vstelm.w \i, a0, 0, 0
add.d a0, a0, a1
.endr
.irp i, vr22, vr23, vr24, vr25
vstelm.w \i, a0, 0, 1
add.d a0, a0, a1
.endr
.irp i, vr22, vr23, vr24, vr25
vstelm.w \i, a0, 0, 2
add.d a0, a0, a1
.endr
.irp i, vr22, vr23, vr24, vr25
vstelm.w \i, a0, 0, 3
add.d a0, a0, a1
.endr
addi.d a0, a0, 2
endfuncl
functionl lpf_v_6_16_lsx
slli.d t3, a1, 1
sub.d t3, a0, t3
sub.d s0, t3, a1
vld vr21, s0, 0 // p2
vldx vr22, s0, a1 // p1
alsl.d s0, a1, s0, 1
vld vr23, s0, 0 // p0
vldx vr24, s0, a1 // q0
alsl.d s0, a1, s0, 1
vld vr25, s0, 0 // q1
vldx vr26, s0, a1 // q2
LPF_16_WD6
vst vr22, t3, 0 // p1
vstx vr23, t3, a1 // p0
vst vr24, a0, 0 // q0
vstx vr25, a0, a1 // q1
endfuncl
functionl lpf_h_6_16_lsx
addi.d t3, a0, -4
fld.d f20, t3, 0
fldx.d f21, t3, a1
alsl.d t3, a1, t3, 1
fld.d f22, t3, 0
fldx.d f23, t3, a1
alsl.d t3, a1, t3, 1
fld.d f24, t3, 0
fldx.d f25, t3, a1
alsl.d t3, a1, t3, 1
fld.d f26, t3, 0
fldx.d f27, t3, a1
alsl.d t3, a1, t3, 1
fld.d f16, t3, 0
fldx.d f17, t3, a1
alsl.d t3, a1, t3, 1
fld.d f18, t3, 0
fldx.d f19, t3, a1
alsl.d t3, a1, t3, 1
fld.d f28, t3, 0
fldx.d f29, t3, a1
alsl.d t3, a1, t3, 1
fld.d f30, t3, 0
fldx.d f31, t3, a1
alsl.d t3, a1, t3, 1
vilvl.d vr20, vr16, vr20
vilvl.d vr21, vr17, vr21
vilvl.d vr22, vr18, vr22
vilvl.d vr23, vr19, vr23
vilvl.d vr24, vr28, vr24
vilvl.d vr25, vr29, vr25
vilvl.d vr26, vr30, vr26
vilvl.d vr27, vr31, vr27
addi.d a0, t3, 4
TRANSPOSE_8x16B vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29
LPF_16_WD6
slli.d t3, a1, 4
sub.d a0, a0, t3
TRANSPOSE_4x16B vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29
addi.d a0, a0, -2
.irp i, vr22, vr23, vr24, vr25
vstelm.w \i, a0, 0, 0
add.d a0, a0, a1
.endr
.irp i, vr22, vr23, vr24, vr25
vstelm.w \i, a0, 0, 1
add.d a0, a0, a1
.endr
.irp i, vr22, vr23, vr24, vr25
vstelm.w \i, a0, 0, 2
add.d a0, a0, a1
.endr
.irp i, vr22, vr23, vr24, vr25
vstelm.w \i, a0, 0, 3
add.d a0, a0, a1
.endr
addi.d a0, a0, 2
endfuncl
functionl lpf_v_8_16_lsx
slli.d t3, a1, 2
sub.d s0, a0, t3
vld vr20, s0, 0 // p3
vldx vr21, s0, a1 // p2
alsl.d s0, a1, s0, 1
vld vr22, s0, 0 // p1
vldx vr23, s0, a1 // p0
alsl.d s0, a1, s0, 1
vld vr24, s0, 0 // q0
vldx vr25, s0, a1 // q1
alsl.d s0, a1, s0, 1
vld vr26, s0, 0 // q2
vldx vr27, s0, a1 // q3
LPF_16_WD8
sub.d t3, a0, t3
add.d t3, t3, a1 // a0 - 3 * stride
vst vr21, t3, 0 // p2
vstx vr22, t3, a1 // p1
alsl.d t3, a1, t3, 1
vst vr23, t3, 0 // p0
vstx vr24, t3, a1 // q0
alsl.d t3, a1, t3, 1
vst vr25, t3, 0 // q1
vstx vr26, t3, a1 // q2
jirl zero, ra, 0x00
8:
slli.d t3, a1, 1
sub.d t3, a0, t3
vst vr22, t3, 0 // p1
vstx vr23, t3, a1 // p0
alsl.d t3, a1, t3, 1
vst vr24, t3, 0 // q0
vstx vr25, t3, a1 // q1
endfuncl
functionl lpf_h_8_16_lsx
addi.d t3, a0, -4
fld.d f20, t3, 0
fldx.d f21, t3, a1
alsl.d t3, a1, t3, 1
fld.d f22, t3, 0
fldx.d f23, t3, a1
alsl.d t3, a1, t3, 1
fld.d f24, t3, 0
fldx.d f25, t3, a1
alsl.d t3, a1, t3, 1
fld.d f26, t3, 0
fldx.d f27, t3, a1
alsl.d t3, a1, t3, 1
fld.d f16, t3, 0
fldx.d f17, t3, a1
alsl.d t3, a1, t3, 1
fld.d f18, t3, 0
fldx.d f19, t3, a1
alsl.d t3, a1, t3, 1
fld.d f28, t3, 0
fldx.d f29, t3, a1
alsl.d t3, a1, t3, 1
fld.d f30, t3, 0
fldx.d f31, t3, a1
alsl.d t3, a1, t3, 1
vilvl.d vr20, vr16, vr20
vilvl.d vr21, vr17, vr21
vilvl.d vr22, vr18, vr22
vilvl.d vr23, vr19, vr23
vilvl.d vr24, vr28, vr24
vilvl.d vr25, vr29, vr25
vilvl.d vr26, vr30, vr26
vilvl.d vr27, vr31, vr27
addi.d a0, t3, 4
TRANSPOSE_8x16B vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29
LPF_16_WD8
slli.d t3, a1, 4
sub.d a0, a0, t3
TRANSPOSE_8x16B vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29
addi.d a0, a0, -4
.irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
vstelm.d \i, a0, 0, 0
add.d a0, a0, a1
.endr
.irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
vstelm.d \i, a0, 0, 1
add.d a0, a0, a1
.endr
addi.d a0, a0, 4
jirl zero, ra, 0x00
8:
slli.d t3, a1, 4
sub.d a0, a0, t3
TRANSPOSE_4x16B vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29
addi.d a0, a0, -2
.irp i, vr22, vr23, vr24, vr25
vstelm.w \i, a0, 0, 0
add.d a0, a0, a1
.endr
.irp i, vr22, vr23, vr24, vr25
vstelm.w \i, a0, 0, 1
add.d a0, a0, a1
.endr
.irp i, vr22, vr23, vr24, vr25
vstelm.w \i, a0, 0, 2
add.d a0, a0, a1
.endr
.irp i, vr22, vr23, vr24, vr25
vstelm.w \i, a0, 0, 3
add.d a0, a0, a1
.endr
addi.d a0, a0, 2
endfuncl
functionl lpf_v_16_16_lsx
slli.d t3, a1, 3
sub.d s0, a0, t3
add.d s0, s0, a1
vld vr17, s0, 0 // p6
vldx vr18, s0, a1 // p5
alsl.d s0, a1, s0, 1
vld vr19, s0, 0 // p4
vldx vr20, s0, a1 // p3
alsl.d s0, a1, s0, 1
vld vr21, s0, 0 // p2
vldx vr22, s0, a1 // p1
alsl.d s0, a1, s0, 1
vld vr23, s0, 0 // p0
vldx vr24, s0, a1 // q0
alsl.d s0, a1, s0, 1
vld vr25, s0, 0 // q1
vldx vr26, s0, a1 // q2
alsl.d s0, a1, s0, 1
vld vr27, s0, 0 // q3
vldx vr28, s0, a1 // q4
alsl.d s0, a1, s0, 1
vld vr29, s0, 0 // q5
vldx vr30, s0, a1 // q6
LPF_16_WD16
sub.d s0, a0, t3
alsl.d s0, a1, s0, 1
vst vr0, s0, 0 // p5
vstx vr1, s0, a1 // p4
alsl.d s0, a1, s0, 1
vst vr2, s0, 0 // p3
vstx vr3, s0, a1 // p2
alsl.d s0, a1, s0, 1
vst vr4, s0, 0 // p1
vstx vr5, s0, a1 // p0
alsl.d s0, a1, s0, 1
vst vr6, s0, 0 // q0
vstx vr7, s0, a1 // q1
alsl.d s0, a1, s0, 1
vst vr8, s0, 0 // q2
vstx vr9, s0, a1 // q3
alsl.d s0, a1, s0, 1
vst vr10, s0, 0 // q4
vstx vr11, s0, a1 // q5
jirl zero, ra, 0x00
7:
slli.d t3, a1, 1
add.d t3, t3, a1
sub.d s0, a0, t3
vst vr21, s0, 0 // p2
vstx vr22, s0, a1 // p1
alsl.d s0, a1, s0, 1
vst vr23, s0, 0 // p0
vstx vr24, s0, a1 // q0
alsl.d s0, a1, s0, 1
vst vr25, s0, 0 // q1
vstx vr26, s0, a1 // q2
jirl zero, ra, 0x00
8:
slli.d t3, a1, 1
sub.d s0, a0, t3
vst vr22, s0, 0 // p1
vstx vr23, s0, a1 // p0
alsl.d s0, a1, s0, 1
vst vr24, s0, 0 // q0
vstx vr25, s0, a1 // q1
endfuncl
functionl lpf_h_16_16_lsx
addi.d t3, a0, -8
vld vr16, t3, 0
vldx vr17, t3, a1
alsl.d t3, a1, t3, 1
vld vr18, t3, 0
vldx vr19, t3, a1
alsl.d t3, a1, t3, 1
vld vr20, t3, 0
vldx vr21, t3, a1
alsl.d t3, a1, t3, 1
vld vr22, t3, 0
vldx vr23, t3, a1
alsl.d t3, a1, t3, 1
vld vr24, t3, 0
vldx vr25, t3, a1
alsl.d t3, a1, t3, 1
vld vr26, t3, 0
vldx vr27, t3, a1
alsl.d t3, a1, t3, 1
vld vr28, t3, 0
vldx vr29, t3, a1
alsl.d t3, a1, t3, 1
vld vr30, t3, 0
vldx vr31, t3, a1
alsl.d t3, a1, t3, 1
.macro SWAPD in0, in1
vaddi.bu vr0, \in0, 0
vilvl.d \in0, \in1, \in0
vilvh.d \in1, \in1, vr0
.endm
SWAPD vr16, vr24
SWAPD vr17, vr25
SWAPD vr18, vr26
SWAPD vr19, vr27
SWAPD vr20, vr28
SWAPD vr21, vr29
SWAPD vr22, vr30
SWAPD vr23, vr31
addi.d a0, t3, 8
TRANSPOSE_8x16B vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, vr0, vr1
TRANSPOSE_8x16B vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, vr0, vr1
LPF_16_WD16
slli.d t3, a1, 4
sub.d a0, a0, t3
TRANSPOSE_8x16B vr16, vr17, vr0, vr1, vr2, vr3, vr4, vr5, vr18, vr19
TRANSPOSE_8x16B vr6, vr7, vr8, vr9, vr10, vr11, vr30, vr31, vr18, vr19
addi.d t3, a0, -8
.irp i, vr16, vr17, vr0, vr1, vr2, vr3, vr4, vr5
vstelm.d \i, t3, 0, 0
add.d t3, t3, a1
.endr
.irp i, vr16, vr17, vr0, vr1, vr2, vr3, vr4, vr5
vstelm.d \i, t3, 0, 1
add.d t3, t3, a1
.endr
.irp i, vr6, vr7, vr8, vr9, vr10, vr11, vr30, vr31
vstelm.d \i, a0, 0, 0
add.d a0, a0, a1
.endr
.irp i, vr6, vr7, vr8, vr9, vr10, vr11, vr30, vr31
vstelm.d \i, a0, 0, 1
add.d a0, a0, a1
.endr
jirl zero, ra, 0x00
7:
slli.d t3, a1, 4
sub.d a0, a0, t3
TRANSPOSE_8x16B vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29
addi.d a0, a0, -4
.irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
vstelm.d \i, a0, 0, 0
add.d a0, a0, a1
.endr
.irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
vstelm.d \i, a0, 0, 1
add.d a0, a0, a1
.endr
addi.d a0, a0, 4
jirl zero, ra, 0x00
8:
slli.d t3, a1, 4
sub.d a0, a0, t3
TRANSPOSE_4x16B vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29
addi.d a0, a0, -2
.irp i, 0, 1, 2, 3
vstelm.w vr22, a0, 0, \i
add.d a0, a0, a1
vstelm.w vr23, a0, 0, \i
add.d a0, a0, a1
vstelm.w vr24, a0, 0, \i
add.d a0, a0, a1
vstelm.w vr25, a0, 0, \i
add.d a0, a0, a1
.endr
addi.d a0, a0, 2
endfuncl
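// Save/restore the callee-saved registers clobbered above: f24-f31 (the
// low halves of vr24-vr31) and s0.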
.macro PUSH_REG
addi.d sp, sp, -64-8
fst.d f24, sp, 0
fst.d f25, sp, 8
fst.d f26, sp, 16
fst.d f27, sp, 24
fst.d f28, sp, 32
fst.d f29, sp, 40
fst.d f30, sp, 48
fst.d f31, sp, 56
st.d s0, sp, 64
.endm
.macro POP_REG
fld.d f24, sp, 0
fld.d f25, sp, 8
fld.d f26, sp, 16
fld.d f27, sp, 24
fld.d f28, sp, 32
fld.d f29, sp, 40
fld.d f30, sp, 48
fld.d f31, sp, 56
ld.d s0, sp, 64
addi.d sp, sp, 64+8
.endm
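// One bit per 4-pixel unit, used to test the vmask bits for each of the
// four units handled per loop iteration below.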
const mask_1248
.word 1, 2, 4, 8
endconst
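// Top-level superblock edge filters. Arguments as used below:
//   a0 = dst, a1 = stride, a2 = vmask[], a3 = filter level array l[][4],
//   a4 = b4_stride (scaled to bytes by the slli below),
//   a5 = lut (dav1d's Av1FilterLUT; +128 points at its sharp[] member)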
.macro LPF_FUNC DIR, TYPE
function lpf_\DIR\()_sb_\TYPE\()_8bpc_lsx
PUSH_REG
move t8, ra
vld vr0, a2, 0 //vmask
vpickve2gr.wu t0, vr0, 0
vpickve2gr.wu t1, vr0, 1
.ifc \TYPE, y
vpickve2gr.wu t2, vr0, 2
.endif
addi.d a5, a5, 128 // Move to sharp part of lut
.ifc \TYPE, y
or t1, t1, t2 // vmask[1] |= vmask[2]
.endif
slli.d a4, a4, 2
.ifc \DIR, v
sub.d a4, a3, a4
.else
addi.d a3, a3, -4
.endif
or t0, t0, t1 // vmask[0] |= vmask[1]
1:
andi t3, t0, 0x0f
.ifc \DIR, v
vld vr0, a4, 0 // l[-b4_stride][]
addi.d a4, a4, 16
vld vr1, a3, 0 // l[0][]
addi.d a3, a3, 16
.else
fld.d f0, a3, 0
fldx.d f1, a3, a4
alsl.d a3, a4, a3, 1
fld.d f2, a3, 0
fldx.d f3, a3, a4
alsl.d a3, a4, a3, 1
vilvl.w vr1, vr1, vr0
vilvl.w vr2, vr3, vr2
vilvl.d vr0, vr2, vr1
vilvh.d vr1, vr2, vr1
.endif
beqz t3, 7f
//l[0][] ? l[0][] : l[-b4_stride][]
vseqi.b vr2, vr1, 0
vbitsel.v vr1, vr1, vr0, vr2
li.w t3, 0xff
vreplgr2vr.w vr3, t3
vand.v vr1, vr1, vr3
vshuf4i.b vr1, vr1, 0x00 // L -- 1 0 2 0
vseqi.w vr2, vr1, 0 // 0 -1 0 -1
vseqi.w vr2, vr2, 0 // L != 0 -- -1 0 -1 0
vhaddw.qu.du vr3, vr2, vr2
vpickve2gr.du t4, vr3, 0
beqz t4, 7f // if (!L) continue
la.local t3, mask_1248 // bits x
vld vr16, t3, 0
vreplgr2vr.w vr13, t0 // vmask[0]
vreplgr2vr.w vr14, t1 // vmask[1]
vand.v vr13, vr13, vr16
vseqi.w vr13, vr13, 0
vseqi.w vr13, vr13, 0 // if (vmask[0] & x)
vand.v vr13, vr13, vr2 // vmask[0] &= L != 0
vand.v vr14, vr14, vr16
vseqi.w vr14, vr14, 0
vseqi.w vr14, vr14, 0 // if (vmask[1] & x)
.ifc \TYPE, y
vreplgr2vr.w vr15, t2 // vmask[2]
vand.v vr15, vr15, vr16
vseqi.w vr15, vr15, 0
vseqi.w vr15, vr15, 0 // if (vmask[2] & x)
.endif
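// Per-edge thresholds, derived from L for all four units at once; in
// scalar form:
//     H = L >> 4
//     I = imax(imin(L >> sharp[0], sharp[1]), 1)
//     E = 2 * (L + 2) + I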
vldrepl.b vr5, a5, 0 // sharp[0]
addi.d t5, a5, 8
vldrepl.b vr6, t5, 0 // sharp[1]
vsrl.b vr3, vr1, vr5 // L >> sharp[0]
vsrli.b vr12, vr1, 4 // H
vmin.bu vr3, vr3, vr6 // imin(L >> sharp[0], sharp[1])
vaddi.bu vr0, vr1, 2 // L + 2
vmaxi.bu vr11, vr3, 1 // imax(imin(), 1) = limit = I
vslli.b vr0, vr0, 1 // 2*(L + 2)
vadd.b vr10, vr0, vr11 // 2*(L + 2) + limit = E
.ifc \TYPE, y
andi t3, t2, 0x0f
beqz t3, 2f
//wd16
bl lpf_\DIR\()_16_16_lsx
b 8f
2:
.endif
andi t3, t1, 0x0f
beqz t3, 3f
.ifc \TYPE, y
// wd8
bl lpf_\DIR\()_8_16_lsx
.else
// wd6
bl lpf_\DIR\()_6_16_lsx
.endif
b 8f
3:
// wd4
bl lpf_\DIR\()_4_16_lsx
.ifc \DIR, h
b 8f
7:
// For dir h, the functions above increment a0.
// If the whole function is skipped, increment it here instead.
alsl.d a0, a1, a0, 4
.else
7:
.endif
8:
srli.d t0, t0, 4
srli.d t1, t1, 4
.ifc \TYPE, y
srli.d t2, t2, 4
.endif
.ifc \DIR, v
addi.d a0, a0, 16
.else
// For dir h, a0 is returned incremented
.endif
bnez t0, 1b
move ra, t8
POP_REG
endfunc
.endm
LPF_FUNC h, y
LPF_FUNC v, y
LPF_FUNC h, uv
LPF_FUNC v, uv