/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "loongson_asm.S"
// min_prob[i] = EC_MIN_PROB (4) * (15 - i)
const min_prob
.short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
endconst
const ph_0xff00
.rept 8
.short 0xff00
.endr
endconst
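/*
 * decode_symbol_adapt below vectorizes, roughly, the following scalar
 * reference from dav1d's msac.c (a hedged sketch; EC_PROB_SHIFT == 6,
 * EC_MIN_PROB == 4, EC_WIN_SIZE == 64 and ctx_norm() come from that C
 * file, not from here; n_symbols is the highest symbol index, and
 * cdf[n_symbols] doubles as the adaptation counter):
 *
 *   unsigned decode_symbol_adapt(MsacContext *const s, uint16_t *const cdf,
 *                                const size_t n_symbols)
 *   {
 *       const unsigned c = s->dif >> (EC_WIN_SIZE - 16), r = s->rng >> 8;
 *       unsigned u, v = s->rng, val = -1;
 *       do {
 *           u = v;
 *           v = r * (cdf[++val] >> EC_PROB_SHIFT);
 *           v >>= 7 - EC_PROB_SHIFT;
 *           v += EC_MIN_PROB * (n_symbols - val); // the min_prob table above
 *       } while (c < v);
 *       ctx_norm(s, s->dif - ((ec_win)v << (EC_WIN_SIZE - 16)), u - v);
 *       if (s->allow_update_cdf) {
 *           const unsigned count = cdf[n_symbols];
 *           const unsigned rate = 4 + (count >> 4) + (n_symbols > 2);
 *           for (unsigned i = 0; i < n_symbols; i++)
 *               if (i < val) cdf[i] += (32768 - cdf[i]) >> rate;
 *               else         cdf[i] -= cdf[i] >> rate;
 *           cdf[n_symbols] = count + (count < 32);
 *       }
 *       return val;
 *   }
 *
 * The vector code computes all candidate v in parallel, compares them
 * against c with vsle.hu, and recovers val as a count-trailing-zeros of
 * the resulting mask; the u/v table is spilled to the stack so u and v
 * can be reloaded by index during renormalization.
 */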
.macro decode_symbol_adapt w
addi.d sp, sp, -48
vldrepl.h vr0, a0, 24 //rng
fst.s f0, sp, 0 //val==0
vld vr1, a1, 0 //cdf
.if \w == 16
vld vr11, a1, 16
.endif
vldrepl.d vr2, a0, 16 //dif
ld.w t1, a0, 32 //allow_update_cdf
la.local t2, min_prob
addi.d t2, t2, 30
slli.w t3, a2, 1
sub.d t2, t2, t3
vld vr3, t2, 0 //min_prob
.if \w == 16
vld vr13, t2, 16
.endif
vsrli.h vr4, vr0, 8 //r = s->rng >> 8
vslli.h vr4, vr4, 8 //r << 8
vsrli.h vr5, vr1, 6
vslli.h vr5, vr5, 7
.if \w == 16
vsrli.h vr15, vr11, 6
vslli.h vr15, vr15, 7
.endif
vmuh.hu vr5, vr4, vr5
vadd.h vr5, vr5, vr3 //v
.if \w == 16
vmuh.hu vr15, vr4, vr15
vadd.h vr15, vr15, vr13
.endif
addi.d t8, sp, 2
vst vr5, t8, 0 //store v
.if \w == 16
vst vr15, t8, 16
.endif
vreplvei.h vr20, vr2, 3 //c
vsle.hu vr6, vr5, vr20
.if \w == 16
vsle.hu vr16, vr15, vr20
vpickev.b vr21, vr16, vr6
.endif
.if \w <= 8
vmskltz.h vr10, vr6
.else
vmskltz.b vr10, vr21
.endif
beqz t1, .renorm\()\w
// update_cdf
alsl.d t1, a2, a1, 1
ld.h t2, t1, 0 //count
srli.w t3, t2, 4 //count >> 4
.if \w == 16
addi.w t3, t3, 5 //rate
.else
addi.w t3, t3, 4
li.w t5, 2
sltu t5, t5, a2
add.w t3, t3, t5 //rate
.endif
sltui t5, t2, 32
add.w t2, t2, t5 //count + (count < 32)
vreplgr2vr.h vr9, t3
vseq.h vr7, vr7, vr7 // vr7 = all ones
vavgr.hu vr5, vr6, vr7 //i >= val ? -1 : 32768
vsub.h vr5, vr5, vr1
vsub.h vr8, vr1, vr6
.if \w == 16
vavgr.hu vr15, vr16, vr7
vsub.h vr15, vr15, vr11
vsub.h vr18, vr11, vr16
.endif
vsra.h vr5, vr5, vr9
vadd.h vr8, vr8, vr5
.if \w == 4
fst.d f8, a1, 0
.else
vst vr8, a1, 0
.endif
.if \w == 16
vsra.h vr15, vr15, vr9
vadd.h vr18, vr18, vr15
vst vr18, a1, 16
.endif
st.h t2, t1, 0
.renorm\()\w:
vpickve2gr.h t3, vr10, 0
ctz.w a7, t3 // ret
alsl.d t3, a7, t8, 1
ld.hu t4, t3, 0 // v
ld.hu t5, t3, -2 // u
sub.w t5, t5, t4 // rng
slli.d t4, t4, 48
vpickve2gr.d t6, vr2, 0
sub.d t6, t6, t4 // dif
clz.w t4, t5 // d
xori t4, t4, 16 // d
sll.d t6, t6, t4
ld.w t0, a0, 28 //cnt
sll.w t5, t5, t4
sub.w t7, t0, t4 // cnt-d
st.w t5, a0, 24 // store rng
bgeu t0, t4, 9f
// refill
ld.d t0, a0, 0 // buf_pos
ld.d t1, a0, 8 // buf_end
addi.d t2, t0, 8
bltu t1, t2, 2f
ld.d t3, t0, 0 // next_bits
addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64)
nor t3, t3, t3
sub.w t2, zero, t1
revb.d t3, t3 // next_bits = bswap(next_bits)
srli.w t2, t2, 3 // num_bytes_read
srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63)
b 3f
1:
addi.w t3, t7, -48
srl.d t3, t3, t3 // pad with ones
b 4f
2:
bgeu t0, t1, 1b
ld.d t3, t1, -8 // next_bits
sub.w t2, t2, t1
sub.w t1, t1, t0 // num_bytes_left
slli.w t2, t2, 3
srl.d t3, t3, t2
addi.w t2, t7, -48
nor t3, t3, t3
sub.w t4, zero, t2
revb.d t3, t3
srli.w t4, t4, 3
srl.d t3, t3, t2
sltu t2, t1, t4
maskeqz t1, t1, t2
masknez t2, t4, t2
or t2, t2, t1 // num_bytes_read
3:
slli.w t1, t2, 3
add.d t0, t0, t2
add.w t7, t7, t1 // cnt += num_bits_read
st.d t0, a0, 0
4:
or t6, t6, t3 // dif |= next_bits
9:
st.w t7, a0, 28 // store cnt
st.d t6, a0, 16 // store dif
move a0, a7
addi.d sp, sp, 48
.endm
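/*
 * The renorm/refill tail above (labels .renorm\w and 1..4, 9) matches,
 * roughly, ctx_norm()/ctx_refill() from dav1d's msac.c. A hedged C sketch
 * (ec_win is the 64-bit window; bytes enter the window inverted so the
 * zeros shifted in by ctx_norm behave as ones):
 *
 *   static void ctx_refill(MsacContext *const s) {
 *       const uint8_t *buf_pos = s->buf_pos;
 *       const uint8_t *const buf_end = s->buf_end;
 *       int c = EC_WIN_SIZE - s->cnt - 24;
 *       ec_win dif = s->dif;
 *       do {
 *           if (buf_pos >= buf_end) {
 *               dif |= ~(~(ec_win)0xff << c); // pad with ones
 *               break;
 *           }
 *           dif |= (ec_win)(*buf_pos++ ^ 0xff) << c;
 *           c -= 8;
 *       } while (c >= 0);
 *       s->dif = dif;
 *       s->cnt = EC_WIN_SIZE - c - 24;
 *       s->buf_pos = buf_pos;
 *   }
 *
 *   static void ctx_norm(MsacContext *const s, const ec_win dif,
 *                        const unsigned rng) {
 *       const int d = 15 ^ (31 ^ clz(rng)); // i.e. 16 - bits(rng)
 *       s->cnt -= d;
 *       s->dif = dif << d;
 *       s->rng = rng << d;
 *       if (s->cnt < 0)
 *           ctx_refill(s);
 *   }
 *
 * The assembly replaces the byte loop with a single 8-byte load, an
 * invert (nor), a byte swap (revb.d) and one shift, with separate paths
 * for fewer than 8 bytes remaining and for end-of-buffer.
 */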
function msac_decode_symbol_adapt4_lsx
decode_symbol_adapt 4
endfunc
function msac_decode_symbol_adapt8_lsx
decode_symbol_adapt 8
endfunc
function msac_decode_symbol_adapt16_lsx
decode_symbol_adapt 16
endfunc
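/*
 * A hedged C sketch of the scalar logic implemented below (modeled on
 * dav1d's msac.c; f is the probability operand arriving in a1):
 *
 *   unsigned decode_bool(MsacContext *const s, const unsigned f) {
 *       ec_win vw, dif = s->dif;
 *       const uint16_t r = s->rng;
 *       unsigned ret, v;
 *       v = ((r >> 8) * (f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT))
 *           + EC_MIN_PROB;
 *       vw   = (ec_win)v << (EC_WIN_SIZE - 16);
 *       ret  = dif >= vw;
 *       dif -= ret * vw;
 *       v   += ret * (r - 2 * v);
 *       ctx_norm(s, dif, v);
 *       return !ret;
 *   }
 *
 * The two conditional updates become branchless maskeqz selects below,
 * and the return value is computed directly as (dif < vw).
 */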
function msac_decode_bool_lsx
ld.w t0, a0, 24 // rng
srli.w a1, a1, 6
ld.d t1, a0, 16 // dif
srli.w t2, t0, 8 // r >> 8
mul.w t2, t2, a1
ld.w a5, a0, 28 // cnt
srli.w t2, t2, 1
addi.w t2, t2, 4 // v
slli.d t3, t2, 48 // vw
sltu t4, t1, t3
move t8, t4 // ret
xori t4, t4, 1
maskeqz t6, t3, t4 // if (ret) vw
sub.d t6, t1, t6 // dif
slli.w t5, t2, 1
sub.w t5, t0, t5 // r - 2v
maskeqz t7, t5, t4 // if (ret) r - 2v
add.w t5, t2, t7 // v(rng)
// renorm
clz.w t4, t5 // d
xori t4, t4, 16 // d
sll.d t6, t6, t4
sll.w t5, t5, t4
sub.w t7, a5, t4 // cnt-d
st.w t5, a0, 24 // store rng
bgeu a5, t4, 9f
// refill
ld.d t0, a0, 0 // buf_pos
ld.d t1, a0, 8 // buf_end
addi.d t2, t0, 8
bltu t1, t2, 2f
ld.d t3, t0, 0 // next_bits
addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64)
nor t3, t3, t3
sub.w t2, zero, t1
revb.d t3, t3 // next_bits = bswap(next_bits)
srli.w t2, t2, 3 // num_bytes_read
srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63)
b 3f
1:
addi.w t3, t7, -48
srl.d t3, t3, t3 // pad with ones
b 4f
2:
bgeu t0, t1, 1b
ld.d t3, t1, -8 // next_bits
sub.w t2, t2, t1
sub.w t1, t1, t0 // num_bytes_left
slli.w t2, t2, 3
srl.d t3, t3, t2
addi.w t2, t7, -48
nor t3, t3, t3
sub.w t4, zero, t2
revb.d t3, t3
srli.w t4, t4, 3
srl.d t3, t3, t2
sltu t2, t1, t4
maskeqz t1, t1, t2
masknez t2, t4, t2
or t2, t2, t1 // num_bytes_read
3:
slli.w t1, t2, 3
add.d t0, t0, t2
add.w t7, t7, t1 // cnt += num_bits_read
st.d t0, a0, 0
4:
or t6, t6, t3 // dif |= next_bits
9:
st.w t7, a0, 28 // store cnt
st.d t6, a0, 16 // store dif
move a0, t8
endfunc
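/*
 * Equiprobable variant (hedged sketch, after dav1d's msac.c): with the
 * probability fixed at one half (f == 1 << 14), the multiply in
 * decode_bool reduces to a shift and the rest of the flow is identical:
 *
 *   unsigned decode_bool_equi(MsacContext *const s) {
 *       ec_win vw, dif = s->dif;
 *       const uint16_t r = s->rng;
 *       unsigned ret, v;
 *       v = ((r >> 8) << 7) + EC_MIN_PROB;
 *       vw   = (ec_win)v << (EC_WIN_SIZE - 16);
 *       ret  = dif >= vw;
 *       dif -= ret * vw;
 *       v   += ret * (r - 2 * v);
 *       ctx_norm(s, dif, v);
 *       return !ret;
 *   }
 */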
function msac_decode_bool_equi_lsx
ld.w t0, a0, 24 // rng
ld.d t1, a0, 16 // dif
ld.w a5, a0, 28 // cnt
srli.w t2, t0, 8 // r >> 8
slli.w t2, t2, 7
addi.w t2, t2, 4 // v
slli.d t3, t2, 48 // vw
sltu t4, t1, t3
move t8, t4 // ret
xori t4, t4, 1
maskeqz t6, t3, t4 // if (ret) vw
sub.d t6, t1, t6 // dif
slli.w t5, t2, 1
sub.w t5, t0, t5 // r - 2v
maskeqz t7, t5, t4 // if (ret) r - 2v
add.w t5, t2, t7 // v(rng)
// renorm
clz.w t4, t5 // d
xori t4, t4, 16 // d
sll.d t6, t6, t4
sll.w t5, t5, t4
sub.w t7, a5, t4 // cnt-d
st.w t5, a0, 24 // store rng
bgeu a5, t4, 9f
// refill
ld.d t0, a0, 0 // buf_pos
ld.d t1, a0, 8 // buf_end
addi.d t2, t0, 8
bltu t1, t2, 2f
ld.d t3, t0, 0 // next_bits
addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64)
nor t3, t3, t3
sub.w t2, zero, t1
revb.d t3, t3 // next_bits = bswap(next_bits)
srli.w t2, t2, 3 // num_bytes_read
srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63)
b 3f
1:
addi.w t3, t7, -48
srl.d t3, t3, t3 // pad with ones
b 4f
2:
bgeu t0, t1, 1b
ld.d t3, t1, -8 // next_bits
sub.w t2, t2, t1
sub.w t1, t1, t0 // num_bytes_left
slli.w t2, t2, 3
srl.d t3, t3, t2
addi.w t2, t7, -48
nor t3, t3, t3
sub.w t4, zero, t2
revb.d t3, t3
srli.w t4, t4, 3
srl.d t3, t3, t2
sltu t2, t1, t4
maskeqz t1, t1, t2
masknez t2, t4, t2
or t2, t2, t1 // num_bytes_read
3:
slli.w t1, t2, 3
add.d t0, t0, t2
add.w t7, t7, t1 // cnt += num_bits_read
st.d t0, a0, 0
4:
or t6, t6, t3 // dif |= next_bits
9:
st.w t7, a0, 28 // store cnt
st.d t6, a0, 16 // store dif
move a0, t8
endfunc
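/*
 * Adaptive boolean (hedged sketch, after dav1d's msac.c): decode with
 * cdf[0] as the probability, then pull cdf[0] toward the decoded bit;
 * cdf[1] is the adaptation counter. The assembly folds the if/else into
 * one branchless expression,
 * cdf[0] - bit - ((cdf[0] - bit - (bit << 15)) >> rate):
 *
 *   unsigned decode_bool_adapt(MsacContext *const s, uint16_t *const cdf) {
 *       const unsigned bit = decode_bool(s, cdf[0]);
 *       if (s->allow_update_cdf) {
 *           const unsigned count = cdf[1];
 *           const unsigned rate = 4 + (count >> 4);
 *           if (bit) cdf[0] += (32768 - cdf[0]) >> rate;
 *           else     cdf[0] -= cdf[0] >> rate;
 *           cdf[1] = count + (count < 32);
 *       }
 *       return bit;
 *   }
 */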
function msac_decode_bool_adapt_lsx
ld.hu a3, a1, 0 // cdf[0], the probability f
ld.w t0, a0, 24 // rng
ld.d t1, a0, 16 // dif
srli.w t2, t0, 8 // r >> 8
srli.w a7, a3, 6
mul.w t2, t2, a7
ld.w a4, a0, 32 // allow_update_cdf
ld.w a5, a0, 28 // cnt
srli.w t2, t2, 1
addi.w t2, t2, 4 // v
slli.d t3, t2, 48 // vw
sltu t4, t1, t3
move t8, t4 // bit
xori t4, t4, 1
maskeqz t6, t3, t4 // if (ret) vw
sub.d t6, t1, t6 // dif
slli.w t5, t2, 1
sub.w t5, t0, t5 // r - 2v
maskeqz t7, t5, t4 // if (ret) r - 2v
add.w t5, t2, t7 // v(rng)
beqz a4, .renorm
// update_cdf
ld.hu t0, a1, 2 // cdf[1]
srli.w t1, t0, 4
addi.w t1, t1, 4 // rate
sltui t2, t0, 32 // count < 32
add.w t0, t0, t2 // count + (count < 32)
sub.w a3, a3, t8 // cdf[0] -= bit
slli.w t4, t8, 15
sub.w t7, a3, t4 // cdf[0] - bit - 32768
sra.w t7, t7, t1 // (cdf[0] - bit - 32768) >> rate
sub.w t7, a3, t7 // cdf[0]
st.h t7, a1, 0
st.h t0, a1, 2
.renorm:
clz.w t4, t5 // d
xori t4, t4, 16 // d
sll.d t6, t6, t4
sll.w t5, t5, t4
sub.w t7, a5, t4 // cnt-d
st.w t5, a0, 24 // store rng
bgeu a5, t4, 9f
// refill
ld.d t0, a0, 0 // buf_pos
ld.d t1, a0, 8 // buf_end
addi.d t2, t0, 8
bltu t1, t2, 2f
ld.d t3, t0, 0 // next_bits
addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64)
nor t3, t3, t3
sub.w t2, zero, t1
revb.d t3, t3 // next_bits = bswap(next_bits)
srli.w t2, t2, 3 // num_bytes_read
srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63)
b 3f
1:
addi.w t3, t7, -48
srl.d t3, t3, t3 // pad with ones
b 4f
2:
bgeu t0, t1, 1b
ld.d t3, t1, -8 // next_bits
sub.w t2, t2, t1
sub.w t1, t1, t0 // num_bytes_left
slli.w t2, t2, 3
srl.d t3, t3, t2
addi.w t2, t7, -48
nor t3, t3, t3
sub.w t4, zero, t2
revb.d t3, t3
srli.w t4, t4, 3
srl.d t3, t3, t2
sltu t2, t1, t4
maskeqz t1, t1, t2
masknez t2, t4, t2
or t2, t2, t1 // num_bytes_read
3:
slli.w t1, t2, 3
add.d t0, t0, t2
add.w t7, t7, t1 // cnt += num_bits_read
st.d t0, a0, 0
4:
or t6, t6, t3 // dif |= next_bits
9:
st.w t7, a0, 28 // store cnt
st.d t6, a0, 16 // store dif
move a0, t8
endfunc
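/*
 * hi_tok decodes up to four 3-ary symbols with the same 4-entry cdf,
 * continuing only while the decoded value is 3 (hedged sketch, after
 * dav1d's msac.c):
 *
 *   unsigned decode_hi_tok(MsacContext *const s, uint16_t *const cdf) {
 *       unsigned tok_br = decode_symbol_adapt4(s, cdf, 3);
 *       unsigned tok = 3 + tok_br;
 *       if (tok_br == 3) {
 *           tok_br = decode_symbol_adapt4(s, cdf, 3);
 *           tok = 6 + tok_br;
 *           if (tok_br == 3) {
 *               tok_br = decode_symbol_adapt4(s, cdf, 3);
 *               tok = 9 + tok_br;
 *               if (tok_br == 3)
 *                   tok = 12 + decode_symbol_adapt4(s, cdf, 3);
 *           }
 *       }
 *       return tok;
 *   }
 *
 * The HI_TOK macro below tracks this with t3: it starts at -24
 * (0xffffffe8), gains 2 * val per round, stops after four val == 3 rounds
 * (when t3 reaches 0) or at the first val < 3, and the result is
 * recovered as tok = (t3 + 30) >> 1.
 */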
.macro HI_TOK allow_update_cdf
.\allow_update_cdf\()_hi_tok_lsx_start:
.if \allow_update_cdf == 1
ld.hu a4, a1, 0x06 // cdf[3]
.endif
vor.v vr1, vr0, vr0
vsrli.h vr1, vr1, 0x06 // cdf[val] >> EC_PROB_SHIFT
vstelm.h vr2, sp, 0, 0 // store u (= rng) at sp + 0 (sp was moved down 0x1a)
vand.v vr2, vr2, vr4 // (8 x rng) & 0xff00
vslli.h vr1, vr1, 0x07
vmuh.hu vr1, vr1, vr2
vadd.h vr1, vr1, vr5 // v += EC_MIN_PROB/* 4 */ * ((unsigned)n_symbols/* 3 */ - val);
vst vr1, sp, 0x02 // store the 8 candidate v at sp + 2
vssub.hu vr1, vr1, vr3 // v - c
vseqi.h vr1, vr1, 0
.if \allow_update_cdf == 1
addi.d t4, a4, 0x50
srli.d t4, t4, 0x04
sltui t7, a4, 32
add.w a4, a4, t7
vreplgr2vr.h vr7, t4
vavgr.hu vr9, vr8, vr1
vsub.h vr9, vr9, vr0
vsub.h vr0, vr0, vr1
vsra.h vr9, vr9, vr7
vadd.h vr0, vr0, vr9
vstelm.d vr0, a1, 0, 0
st.h a4, a1, 0x06
.endif
vmsknz.b vr7, vr1
movfr2gr.s t4, f7
ctz.w t4, t4 // 2 * val, byte offset into the u/v table
addi.d t7, t4, 2
ldx.hu t6, sp, t4 // u
ldx.hu t5, sp, t7 // v
addi.w t3, t3, 0x05 // paired with the -5 below: net t3 += 2 * val
addi.w t4, t4, -0x05 // 2 * val - 5, positive iff val == 3 (continue)
sub.w t6, t6, t5 // u - v , rng for ctx_norm
slli.d t5, t5, 0x30 // (ec_win)v << (EC_WIN_SIZE - 16)
sub.d t1, t1, t5 // s->dif - ((ec_win)v << (EC_WIN_SIZE - 16))
// Init ctx_norm param
clz.w t7, t6
xori t7, t7, 0x1f
xori t7, t7, 0x0f // d = 15 ^ (31 ^ clz(rng));
sll.d t1, t1, t7 // dif << d
sll.d t6, t6, t7 // rng << d
// update vr2 8 x rng
vreplgr2vr.h vr2, t6
vreplvei.h vr2, vr2, 0
st.w t6, a0, 0x18 // store rng
move t0, t2
sub.w t2, t2, t7 // cnt - d
bgeu t0, t7, .\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end // if ((unsigned)cnt < (unsigned)d) goto ctx_norm_end
// Step into ctx_fill
ld.d t5, a0, 0x00 // buf_pos
ld.d t6, a0, 0x08 // end_pos
addi.d t7, t5, 0x08 // buf_pos + 8
sub.d t7, t7, t6 // (buf_pos + 8) - end_pos
blt zero, t7, .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_eob
// (end_pos - buf_pos) >= 8
ld.d t6, t5, 0x00 // load buf_pos[0]~buf_pos[7]
addi.w t7, t2, -0x30 // cnt - 0x30
nor t6, t6, t6 // not buf data
revb.d t6, t6 // Byte reversal
srl.d t6, t6, t7 // Replace left shift with right shift
sub.w t7, zero, t7 // neg
srli.w t7, t7, 0x03 // Loop times
or t1, t1, t6 // dif |= (ec_win)(*buf_pos++ ^ 0xff) << c
b .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end
.\allow_update_cdf\()_hi_tok_lsx_ctx_refill_eob:
bge t5, t6, .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_one
// end_pos - buf_pos < 8 && buf_pos < end_pos
ld.d t0, t6, -0x08
slli.d t7, t7, 0x03
srl.d t6, t0, t7 // Retrieve the buf data and remove the excess data
addi.w t7, t2, -0x30 // cnt - 0x30
nor t6, t6, t6 // not
revb.d t6, t6 // Byte reversal
srl.d t6, t6, t7 // Replace left shift with right shift
sub.w t7, zero, t7 // neg
or t1, t1, t6 // dif |= (ec_win)(*buf_pos++ ^ 0xff) << c
ld.d t6, a0, 0x08 // end_pos
srli.w t7, t7, 0x03 // Loop times
sub.d t6, t6, t5 // end_pos - buf_pos
slt t0, t6, t7
maskeqz a3, t6, t0 // min(loop_times, end_pos - buf_pos)
masknez t0, t7, t0
or t7, a3, t0
b .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end
.\allow_update_cdf\()_hi_tok_lsx_ctx_refill_one:
// buf_pos >= end_pos
addi.w t7, t2, -0x10
andi t7, t7, 0xf
nor t0, zero, zero
srl.d t0, t0, t7
or t1, t1, t0 // dif |= ~(~(ec_win)0xff << c);
b .\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end
.\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end:
add.d t5, t5, t7 // buf_pos + Loop_times
st.d t5, a0, 0x00 // Store buf_pos
alsl.w t2, t7, t2, 0x03 // update cnt
.\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end:
srli.d t7, t1, 0x30
vreplgr2vr.h vr3, t7 // broadcast the high 16 bits of dif
add.w t3, t4, t3 // update control parameter
beqz t3, .\allow_update_cdf\()_hi_tok_lsx_end // control loop for at most 4 times.
blt zero, t4, .\allow_update_cdf\()_hi_tok_lsx_start // tok_br == 3
.\allow_update_cdf\()_hi_tok_lsx_end:
addi.d t3, t3, 0x1e // t3 += 30, so that tok = t3 >> 1 below
st.d t1, a0, 0x10 // store dif
st.w t2, a0, 0x1c // store cnt
srli.w a0, t3, 0x01 // tok
addi.d sp, sp, 0x1a
.endm
/**
 * Implements: unsigned dav1d_msac_decode_hi_tok_c(MsacContext *const s, uint16_t *const cdf)
 * Register allocation:
 * vr0: cdf
 * vr1: temp
 * vr2: rng
 * vr3: dif
 * vr4: const 0xff00ff00...ff00ff00
 * vr5: const 0x0004080c
 * vr6: const 0
 * t0: allow_update_cdf, temp
 * t1: dif
 * t2: cnt
 * t3: 0xffffffe8, outermost control parameter
 * t4: loop count
 * t5: v, buf_pos, temp
 * t6: u, rng, end_pos, buf, temp
 * t7: temp
 */
function msac_decode_hi_tok_lsx
fld.d f0, a1, 0 // Load cdf[0]~cdf[3]
vldrepl.h vr2, a0, 0x18 // 8 x rng, assert(rng <= 65535U), only the lower 16 bits are valid
vldrepl.h vr3, a0, 0x16 // broadcast the high 16 bits of dif, c = s->dif >> (EC_WIN_SIZE - 16)
ld.w t0, a0, 0x20 // allow_update_cdf
la.local t7, ph_0xff00
vld vr4, t7, 0x00 // 0xff00ff00...ff00ff00
la.local t7, min_prob
vld vr5, t7, 12 * 2 // 0x0004080c
vxor.v vr6, vr6, vr6 // const 0
ld.d t1, a0, 0x10 // dif
ld.w t2, a0, 0x1c // cnt
orn t3, t3, t3 // all ones
srli.d t3, t3, 32 // 0xffffffff
addi.d t3, t3, -0x17 // 0xffffffe8, i.e. -24 (see loop-control note above)
vseq.h vr8, vr8, vr8 // all ones
addi.d sp, sp, -0x1a // alloc stack
beqz t0, .hi_tok_lsx_no_update_cdf
HI_TOK 1
jirl zero, ra, 0x0
.hi_tok_lsx_no_update_cdf:
HI_TOK 0
endfunc