/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "loongson_asm.S"
// min_prob[i] = EC_MIN_PROB (4) * (15 - i)
const min_prob
.short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
endconst
const ph_0xff00
.rept 8
.short 0xff00
.endr
endconst
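/*
 * decode_symbol_adapt below vectorizes, roughly, the following scalar
 * reference from dav1d's msac.c (a hedged sketch; EC_PROB_SHIFT == 6,
 * EC_MIN_PROB == 4, EC_WIN_SIZE == 64 and ctx_norm() come from that C
 * file, not from here; n_symbols is the highest symbol index, and
 * cdf[n_symbols] doubles as the adaptation counter):
 *
 *   unsigned decode_symbol_adapt(MsacContext *const s, uint16_t *const cdf,
 *                                const size_t n_symbols)
 *   {
 *       const unsigned c = s->dif >> (EC_WIN_SIZE - 16), r = s->rng >> 8;
 *       unsigned u, v = s->rng, val = -1;
 *       do {
 *           u = v;
 *           v = r * (cdf[++val] >> EC_PROB_SHIFT);
 *           v >>= 7 - EC_PROB_SHIFT;
 *           v += EC_MIN_PROB * (n_symbols - val); // the min_prob table above
 *       } while (c < v);
 *       ctx_norm(s, s->dif - ((ec_win)v << (EC_WIN_SIZE - 16)), u - v);
 *       if (s->allow_update_cdf) {
 *           const unsigned count = cdf[n_symbols];
 *           const unsigned rate = 4 + (count >> 4) + (n_symbols > 2);
 *           for (unsigned i = 0; i < n_symbols; i++)
 *               if (i < val) cdf[i] += (32768 - cdf[i]) >> rate;
 *               else         cdf[i] -= cdf[i] >> rate;
 *           cdf[n_symbols] = count + (count < 32);
 *       }
 *       return val;
 *   }
 *
 * The vector code computes all candidate v in parallel, compares them
 * against c with vsle.hu, and recovers val as a count-trailing-zeros of
 * the resulting mask; the u/v table is spilled to the stack so u and v
 * can be reloaded by index during renormalization.
 */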
.macro decode_symbol_adapt w
addi.d sp, sp, -48
vldrepl.h vr0, a0, 24 //rng
fst.s f0, sp, 0 //val==0
vld vr1, a1, 0 //cdf
.if \w == 16
vld vr11, a1, 16
.endif
vldrepl.d vr2, a0, 16 //dif
ld.w t1, a0, 32 //allow_update_cdf
la.local t2, min_prob
addi.d t2, t2, 30
slli.w t3, a2, 1
sub.d t2, t2, t3
vld vr3, t2, 0 //min_prob
.if \w == 16
vld vr13, t2, 16
.endif
vsrli.h vr4, vr0, 8 //r = s->rng >> 8
vslli.h vr4, vr4, 8 //r << 8
vsrli.h vr5, vr1, 6
vslli.h vr5, vr5, 7
.if \w == 16
vsrli.h vr15, vr11, 6
vslli.h vr15, vr15, 7
.endif
vmuh.hu vr5, vr4, vr5
vadd.h vr5, vr5, vr3 //v
.if \w == 16
vmuh.hu vr15, vr4, vr15
vadd.h vr15, vr15, vr13
.endif
addi.d t8, sp, 2
vst vr5, t8, 0 //store v
.if \w == 16
vst vr15, t8, 16
.endif
vreplvei.h vr20, vr2, 3 //c
vsle.hu vr6, vr5, vr20
.if \w == 16
vsle.hu vr16, vr15, vr20
vpickev.b vr21, vr16, vr6
.endif
.if \w <= 8
vmskltz.h vr10, vr6
.else
vmskltz.b vr10, vr21
.endif
beqz t1, .renorm\()\w
// update_cdf
alsl.d t1, a2, a1, 1
ld.h t2, t1, 0 //count
srli.w t3, t2, 4 //count >> 4
.if \w == 16
addi.w t3, t3, 5 //rate
.else
addi.w t3, t3, 4
li.w t5, 2
sltu t5, t5, a2
add.w t3, t3, t5 //rate
.endif
sltui t5, t2, 32
add.w t2, t2, t5 //count + (count < 32)
vreplgr2vr.h vr9, t3
vseq.h vr7, vr7, vr7 // vr7 = all ones
vavgr.hu vr5, vr6, vr7 //i >= val ? -1 : 32768
vsub.h vr5, vr5, vr1
vsub.h vr8, vr1, vr6
.if \w == 16
vavgr.hu vr15, vr16, vr7
vsub.h vr15, vr15, vr11
vsub.h vr18, vr11, vr16
.endif
vsra.h vr5, vr5, vr9
vadd.h vr8, vr8, vr5
.if \w == 4
fst.d f8, a1, 0
.else
vst vr8, a1, 0
.endif
.if \w == 16
vsra.h vr15, vr15, vr9
vadd.h vr18, vr18, vr15
vst vr18, a1, 16
.endif
st.h t2, t1, 0
.renorm\()\w:
vpickve2gr.h t3, vr10, 0
ctz.w a7, t3 // ret
alsl.d t3, a7, t8, 1
ld.hu t4, t3, 0 // v
ld.hu t5, t3, -2 // u
sub.w t5, t5, t4 // rng
slli.d t4, t4, 48
vpickve2gr.d t6, vr2, 0
sub.d t6, t6, t4 // dif
clz.w t4, t5 // d
xori t4, t4, 16 // d
sll.d t6, t6, t4
ld.w t0, a0, 28 //cnt
sll.w t5, t5, t4
sub.w t7, t0, t4 // cnt-d
st.w t5, a0, 24 // store rng
bgeu t0, t4, 9f
// refill
ld.d t0, a0, 0 // buf_pos
ld.d t1, a0, 8 // buf_end
addi.d t2, t0, 8
bltu t1, t2, 2f
ld.d t3, t0, 0 // next_bits
addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64)
nor t3, t3, t3
sub.w t2, zero, t1
revb.d t3, t3 // next_bits = bswap(next_bits)
srli.w t2, t2, 3 // num_bytes_read
srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63)
b 3f
1:
addi.w t3, t7, -48
srl.d t3, t3, t3 // pad with ones
b 4f
2:
bgeu t0, t1, 1b
ld.d t3, t1, -8 // next_bits
sub.w t2, t2, t1
sub.w t1, t1, t0 // num_bytes_left
slli.w t2, t2, 3
srl.d t3, t3, t2
addi.w t2, t7, -48
nor t3, t3, t3
sub.w t4, zero, t2
revb.d t3, t3
srli.w t4, t4, 3
srl.d t3, t3, t2
sltu t2, t1, t4
maskeqz t1, t1, t2
masknez t2, t4, t2
or t2, t2, t1 // num_bytes_read
3:
slli.w t1, t2, 3
add.d t0, t0, t2
add.w t7, t7, t1 // cnt += num_bits_read
st.d t0, a0, 0
4:
or t6, t6, t3 // dif |= next_bits
9:
st.w t7, a0, 28 // store cnt
st.d t6, a0, 16 // store dif
move a0, a7
addi.d sp, sp, 48
.endm
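/*
 * The renorm/refill tail above (labels .renorm\w and 1..4, 9) matches,
 * roughly, ctx_norm()/ctx_refill() from dav1d's msac.c. A hedged C sketch
 * (ec_win is the 64-bit window; bytes enter the window inverted so the
 * zeros shifted in by ctx_norm behave as ones):
 *
 *   static void ctx_refill(MsacContext *const s) {
 *       const uint8_t *buf_pos = s->buf_pos;
 *       const uint8_t *const buf_end = s->buf_end;
 *       int c = EC_WIN_SIZE - s->cnt - 24;
 *       ec_win dif = s->dif;
 *       do {
 *           if (buf_pos >= buf_end) {
 *               dif |= ~(~(ec_win)0xff << c); // pad with ones
 *               break;
 *           }
 *           dif |= (ec_win)(*buf_pos++ ^ 0xff) << c;
 *           c -= 8;
 *       } while (c >= 0);
 *       s->dif = dif;
 *       s->cnt = EC_WIN_SIZE - c - 24;
 *       s->buf_pos = buf_pos;
 *   }
 *
 *   static void ctx_norm(MsacContext *const s, const ec_win dif,
 *                        const unsigned rng) {
 *       const int d = 15 ^ (31 ^ clz(rng)); // i.e. 16 - bits(rng)
 *       s->cnt -= d;
 *       s->dif = dif << d;
 *       s->rng = rng << d;
 *       if (s->cnt < 0)
 *           ctx_refill(s);
 *   }
 *
 * The assembly replaces the byte loop with a single 8-byte load, an
 * invert (nor), a byte swap (revb.d) and one shift, with separate paths
 * for fewer than 8 bytes remaining and for end-of-buffer.
 */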
function msac_decode_symbol_adapt4_lsx
decode_symbol_adapt 4
endfunc
function msac_decode_symbol_adapt8_lsx
decode_symbol_adapt 8
endfunc
function msac_decode_symbol_adapt16_lsx
decode_symbol_adapt 16
endfunc
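/*
 * A hedged C sketch of the scalar logic implemented below (modeled on
 * dav1d's msac.c; f is the probability operand arriving in a1):
 *
 *   unsigned decode_bool(MsacContext *const s, const unsigned f) {
 *       ec_win vw, dif = s->dif;
 *       const uint16_t r = s->rng;
 *       unsigned ret, v;
 *       v = ((r >> 8) * (f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT))
 *           + EC_MIN_PROB;
 *       vw   = (ec_win)v << (EC_WIN_SIZE - 16);
 *       ret  = dif >= vw;
 *       dif -= ret * vw;
 *       v   += ret * (r - 2 * v);
 *       ctx_norm(s, dif, v);
 *       return !ret;
 *   }
 *
 * The two conditional updates become branchless maskeqz selects below,
 * and the return value is computed directly as (dif < vw).
 */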
function msac_decode_bool_lsx
ld.w t0, a0, 24 // rng
srli.w a1, a1, 6
ld.d t1, a0, 16 // dif
srli.w t2, t0, 8 // r >> 8
mul.w t2, t2, a1
ld.w a5, a0, 28 // cnt
srli.w t2, t2, 1
addi.w t2, t2, 4 // v
slli.d t3, t2, 48 // vw
sltu t4, t1, t3
move t8, t4 // ret
xori t4, t4, 1
maskeqz t6, t3, t4 // if (ret) vw
sub.d t6, t1, t6 // dif
slli.w t5, t2, 1
sub.w t5, t0, t5 // r - 2v
maskeqz t7, t5, t4 // if (ret) r - 2v
add.w t5, t2, t7 // v(rng)
// renorm
clz.w t4, t5 // d
xori t4, t4, 16 // d
sll.d t6, t6, t4
sll.w t5, t5, t4
sub.w t7, a5, t4 // cnt-d
st.w t5, a0, 24 // store rng
bgeu a5, t4, 9f
// refill
ld.d t0, a0, 0 // buf_pos
ld.d t1, a0, 8 // buf_end
addi.d t2, t0, 8
bltu t1, t2, 2f
ld.d t3, t0, 0 // next_bits
addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64)
nor t3, t3, t3
sub.w t2, zero, t1
revb.d t3, t3 // next_bits = bswap(next_bits)
srli.w t2, t2, 3 // num_bytes_read
srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63)
b 3f
1:
addi.w t3, t7, -48
srl.d t3, t3, t3 // pad with ones
b 4f
2:
bgeu t0, t1, 1b
ld.d t3, t1, -8 // next_bits
sub.w t2, t2, t1
sub.w t1, t1, t0 // num_bytes_left
slli.w t2, t2, 3
srl.d t3, t3, t2
addi.w t2, t7, -48
nor t3, t3, t3
sub.w t4, zero, t2
revb.d t3, t3
srli.w t4, t4, 3
srl.d t3, t3, t2
sltu t2, t1, t4
maskeqz t1, t1, t2
masknez t2, t4, t2
or t2, t2, t1 // num_bytes_read
3:
slli.w t1, t2, 3
add.d t0, t0, t2
add.w t7, t7, t1 // cnt += num_bits_read
st.d t0, a0, 0
4:
or t6, t6, t3 // dif |= next_bits
9:
st.w t7, a0, 28 // store cnt
st.d t6, a0, 16 // store dif
move a0, t8
endfunc
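/*
 * Equiprobable variant (hedged sketch, after dav1d's msac.c): with the
 * probability fixed at one half (f == 1 << 14), the multiply in
 * decode_bool reduces to a shift and the rest of the flow is identical:
 *
 *   unsigned decode_bool_equi(MsacContext *const s) {
 *       ec_win vw, dif = s->dif;
 *       const uint16_t r = s->rng;
 *       unsigned ret, v;
 *       v = ((r >> 8) << 7) + EC_MIN_PROB;
 *       vw   = (ec_win)v << (EC_WIN_SIZE - 16);
 *       ret  = dif >= vw;
 *       dif -= ret * vw;
 *       v   += ret * (r - 2 * v);
 *       ctx_norm(s, dif, v);
 *       return !ret;
 *   }
 */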
function msac_decode_bool_equi_lsx
ld.w t0, a0, 24 // rng
ld.d t1, a0, 16 // dif
ld.w a5, a0, 28 // cnt
srli.w t2, t0, 8 // r >> 8
slli.w t2, t2, 7
addi.w t2, t2, 4 // v
slli.d t3, t2, 48 // vw
sltu t4, t1, t3
move t8, t4 // ret
xori t4, t4, 1
maskeqz t6, t3, t4 // if (ret) vw
sub.d t6, t1, t6 // dif
slli.w t5, t2, 1
sub.w t5, t0, t5 // r - 2v
maskeqz t7, t5, t4 // if (ret) r - 2v
add.w t5, t2, t7 // v(rng)
// renorm
clz.w t4, t5 // d
xori t4, t4, 16 // d
sll.d t6, t6, t4
sll.w t5, t5, t4
sub.w t7, a5, t4 // cnt-d
st.w t5, a0, 24 // store rng
bgeu a5, t4, 9f
// refill
ld.d t0, a0, 0 // buf_pos
ld.d t1, a0, 8 // buf_end
addi.d t2, t0, 8
bltu t1, t2, 2f
ld.d t3, t0, 0 // next_bits
addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64)
nor t3, t3, t3
sub.w t2, zero, t1
revb.d t3, t3 // next_bits = bswap(next_bits)
srli.w t2, t2, 3 // num_bytes_read
srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63)
b 3f
1:
addi.w t3, t7, -48
srl.d t3, t3, t3 // pad with ones
b 4f
2:
bgeu t0, t1, 1b
ld.d t3, t1, -8 // next_bits
sub.w t2, t2, t1
sub.w t1, t1, t0 // num_bytes_left
slli.w t2, t2, 3
srl.d t3, t3, t2
addi.w t2, t7, -48
nor t3, t3, t3
sub.w t4, zero, t2
revb.d t3, t3
srli.w t4, t4, 3
srl.d t3, t3, t2
sltu t2, t1, t4
maskeqz t1, t1, t2
masknez t2, t4, t2
or t2, t2, t1 // num_bytes_read
3:
slli.w t1, t2, 3
add.d t0, t0, t2
add.w t7, t7, t1 // cnt += num_bits_read
st.d t0, a0, 0
4:
or t6, t6, t3 // dif |= next_bits
9:
st.w t7, a0, 28 // store cnt
st.d t6, a0, 16 // store dif
move a0, t8
endfunc
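/*
 * Adaptive boolean (hedged sketch, after dav1d's msac.c): decode with
 * cdf[0] as the probability, then pull cdf[0] toward the decoded bit;
 * cdf[1] is the adaptation counter. The assembly folds the if/else into
 * one branchless expression,
 * cdf[0] - bit - ((cdf[0] - bit - (bit << 15)) >> rate):
 *
 *   unsigned decode_bool_adapt(MsacContext *const s, uint16_t *const cdf) {
 *       const unsigned bit = decode_bool(s, cdf[0]);
 *       if (s->allow_update_cdf) {
 *           const unsigned count = cdf[1];
 *           const unsigned rate = 4 + (count >> 4);
 *           if (bit) cdf[0] += (32768 - cdf[0]) >> rate;
 *           else     cdf[0] -= cdf[0] >> rate;
 *           cdf[1] = count + (count < 32);
 *       }
 *       return bit;
 *   }
 */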
function msac_decode_bool_adapt_lsx
ld.hu a3, a1, 0 // cdf[0], the probability f
ld.w t0, a0, 24 // rng
ld.d t1, a0, 16 // dif
srli.w t2, t0, 8 // r >> 8
srli.w a7, a3, 6
mul.w t2, t2, a7
ld.w a4, a0, 32 // allow_update_cdf
ld.w a5, a0, 28 // cnt
srli.w t2, t2, 1
addi.w t2, t2, 4 // v
slli.d t3, t2, 48 // vw
sltu t4, t1, t3
move t8, t4 // bit
xori t4, t4, 1
maskeqz t6, t3, t4 // if (ret) vw
sub.d t6, t1, t6 // dif
slli.w t5, t2, 1
sub.w t5, t0, t5 // r - 2v
maskeqz t7, t5, t4 // if (ret) r - 2v
add.w t5, t2, t7 // v(rng)
beqz a4, .renorm
// update_cdf
ld.hu t0, a1, 2 // cdf[1]
srli.w t1, t0, 4
addi.w t1, t1, 4 // rate
sltui t2, t0, 32 // count < 32
add.w t0, t0, t2 // count + (count < 32)
sub.w a3, a3, t8 // cdf[0] -= bit
slli.w t4, t8, 15
sub.w t7, a3, t4 // cdf[0] - bit - 32768
sra.w t7, t7, t1 // (cdf[0] - bit - 32768) >> rate
sub.w t7, a3, t7 // cdf[0]
st.h t7, a1, 0
st.h t0, a1, 2
.renorm:
clz.w t4, t5 // d
xori t4, t4, 16 // d
sll.d t6, t6, t4
sll.w t5, t5, t4
sub.w t7, a5, t4 // cnt-d
st.w t5, a0, 24 // store rng
bgeu a5, t4, 9f
// refill
ld.d t0, a0, 0 // buf_pos
ld.d t1, a0, 8 // buf_end
addi.d t2, t0, 8
bltu t1, t2, 2f
ld.d t3, t0, 0 // next_bits
addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64)
nor t3, t3, t3
sub.w t2, zero, t1
revb.d t3, t3 // next_bits = bswap(next_bits)
srli.w t2, t2, 3 // num_bytes_read
srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63)
b 3f
1:
addi.w t3, t7, -48
srl.d t3, t3, t3 // pad with ones
b 4f
2:
bgeu t0, t1, 1b
ld.d t3, t1, -8 // next_bits
sub.w t2, t2, t1
sub.w t1, t1, t0 // num_bytes_left
slli.w t2, t2, 3
srl.d t3, t3, t2
addi.w t2, t7, -48
nor t3, t3, t3
sub.w t4, zero, t2
revb.d t3, t3
srli.w t4, t4, 3
srl.d t3, t3, t2
sltu t2, t1, t4
maskeqz t1, t1, t2
masknez t2, t4, t2
or t2, t2, t1 // num_bytes_read
3:
slli.w t1, t2, 3
add.d t0, t0, t2
add.w t7, t7, t1 // cnt += num_bits_read
st.d t0, a0, 0
4:
or t6, t6, t3 // dif |= next_bits
9:
st.w t7, a0, 28 // store cnt
st.d t6, a0, 16 // store dif
move a0, t8
endfunc
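/*
 * hi_tok decodes up to four 3-ary symbols with the same 4-entry cdf,
 * continuing only while the decoded value is 3 (hedged sketch, after
 * dav1d's msac.c):
 *
 *   unsigned decode_hi_tok(MsacContext *const s, uint16_t *const cdf) {
 *       unsigned tok_br = decode_symbol_adapt4(s, cdf, 3);
 *       unsigned tok = 3 + tok_br;
 *       if (tok_br == 3) {
 *           tok_br = decode_symbol_adapt4(s, cdf, 3);
 *           tok = 6 + tok_br;
 *           if (tok_br == 3) {
 *               tok_br = decode_symbol_adapt4(s, cdf, 3);
 *               tok = 9 + tok_br;
 *               if (tok_br == 3)
 *                   tok = 12 + decode_symbol_adapt4(s, cdf, 3);
 *           }
 *       }
 *       return tok;
 *   }
 *
 * The HI_TOK macro below tracks this with t3: it starts at -24
 * (0xffffffe8), gains 2 * val per round, stops after four val == 3 rounds
 * (when t3 reaches 0) or at the first val < 3, and the result is
 * recovered as tok = (t3 + 30) >> 1.
 */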
.macro HI_TOK allow_update_cdf
.\allow_update_cdf\()_hi_tok_lsx_start:
.if \allow_update_cdf == 1
ld.hu a4, a1, 0x06 // cdf[3]
.endif
vor.v vr1, vr0, vr0
vsrli.h vr1, vr1, 0x06 // cdf[val] >> EC_PROB_SHIFT
vstelm.h vr2, sp, 0, 0 // store u (= rng) at sp + 0 (sp was moved down 0x1a)
vand.v vr2, vr2, vr4 // (8 x rng) & 0xff00
vslli.h vr1, vr1, 0x07
vmuh.hu vr1, vr1, vr2
vadd.h vr1, vr1, vr5 // v += EC_MIN_PROB/* 4 */ * ((unsigned)n_symbols/* 3 */ - val);
vst vr1, sp, 0x02 // store the 8 candidate v at sp + 2
vssub.hu vr1, vr1, vr3 // v - c
vseqi.h vr1, vr1, 0
.if \allow_update_cdf == 1
addi.d t4, a4, 0x50
srli.d t4, t4, 0x04
sltui t7, a4, 32
add.w a4, a4, t7
vreplgr2vr.h vr7, t4
vavgr.hu vr9, vr8, vr1
vsub.h vr9, vr9, vr0
vsub.h vr0, vr0, vr1
vsra.h vr9, vr9, vr7
vadd.h vr0, vr0, vr9
vstelm.d vr0, a1, 0, 0
st.h a4, a1, 0x06
.endif
vmsknz.b vr7, vr1
movfr2gr.s t4, f7
ctz.w t4, t4 // 2 * val, byte offset into the u/v table
addi.d t7, t4, 2
ldx.hu t6, sp, t4 // u
ldx.hu t5, sp, t7 // v
addi.w t3, t3, 0x05 // paired with the -5 below: net t3 += 2 * val
addi.w t4, t4, -0x05 // 2 * val - 5, positive iff val == 3 (continue)
sub.w t6, t6, t5 // u - v , rng for ctx_norm
slli.d t5, t5, 0x30 // (ec_win)v << (EC_WIN_SIZE - 16)
sub.d t1, t1, t5 // s->dif - ((ec_win)v << (EC_WIN_SIZE - 16))
// Init ctx_norm param
clz.w t7, t6
xori t7, t7, 0x1f
xori t7, t7, 0x0f // d = 15 ^ (31 ^ clz(rng));
sll.d t1, t1, t7 // dif << d
sll.d t6, t6, t7 // rng << d
// update vr2 8 x rng
vreplgr2vr.h vr2, t6
vreplvei.h vr2, vr2, 0
st.w t6, a0, 0x18 // store rng
move t0, t2
sub.w t2, t2, t7 // cnt - d
bgeu t0, t7, .\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end // if ((unsigned)cnt < (unsigned)d) goto ctx_norm_end
// Step into ctx_fill
ld.d t5, a0, 0x00 // buf_pos
ld.d t6, a0, 0x08 // end_pos
addi.d t7, t5, 0x08 // buf_pos + 8
sub.d t7, t7, t6 // (buf_pos + 8) - end_pos
blt zero, t7, .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_eob
// (end_pos - buf_pos) >= 8
ld.d t6, t5, 0x00 // load buf_pos[0]~buf_pos[7]
addi.w t7, t2, -0x30 // cnt - 0x30
nor t6, t6, t6 // not buf data
revb.d t6, t6 // Byte reversal
srl.d t6, t6, t7 // Replace left shift with right shift
sub.w t7, zero, t7 // neg
srli.w t7, t7, 0x03 // Loop times
or t1, t1, t6 // dif |= (ec_win)(*buf_pos++ ^ 0xff) << c
b .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end
.\allow_update_cdf\()_hi_tok_lsx_ctx_refill_eob:
bge t5, t6, .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_one
// end_pos - buf_pos < 8 && buf_pos < end_pos
ld.d t0, t6, -0x08
slli.d t7, t7, 0x03
srl.d t6, t0, t7 // Retrieve the buf data and remove the excess data
addi.w t7, t2, -0x30 // cnt - 0x30
nor t6, t6, t6 // not
revb.d t6, t6 // Byte reversal
srl.d t6, t6, t7 // Replace left shift with right shift
sub.w t7, zero, t7 // neg
or t1, t1, t6 // dif |= (ec_win)(*buf_pos++ ^ 0xff) << c
ld.d t6, a0, 0x08 // end_pos
srli.w t7, t7, 0x03 // Loop times
sub.d t6, t6, t5 // end_pos - buf_pos
slt t0, t6, t7
maskeqz a3, t6, t0 // min(loop_times, end_pos - buf_pos)
masknez t0, t7, t0
or t7, a3, t0
b .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end
.\allow_update_cdf\()_hi_tok_lsx_ctx_refill_one:
// buf_pos >= end_pos
addi.w t7, t2, -0x10
andi t7, t7, 0xf
nor t0, zero, zero
srl.d t0, t0, t7
or t1, t1, t0 // dif |= ~(~(ec_win)0xff << c);
b .\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end
.\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end:
add.d t5, t5, t7 // buf_pos + Loop_times
st.d t5, a0, 0x00 // Store buf_pos
alsl.w t2, t7, t2, 0x03 // update cnt
.\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end:
srli.d t7, t1, 0x30
vreplgr2vr.h vr3, t7 // broadcast the high 16 bits of dif
add.w t3, t4, t3 // update control parameter
beqz t3, .\allow_update_cdf\()_hi_tok_lsx_end // control loop for at most 4 times.
blt zero, t4, .\allow_update_cdf\()_hi_tok_lsx_start // tok_br == 3
.\allow_update_cdf\()_hi_tok_lsx_end:
addi.d t3, t3, 0x1e // t3 += 30, so that tok = t3 >> 1 below
st.d t1, a0, 0x10 // store dif
st.w t2, a0, 0x1c // store cnt
srli.w a0, t3, 0x01 // tok
addi.d sp, sp, 0x1a
.endm
/**
 * Implements: unsigned dav1d_msac_decode_hi_tok_c(MsacContext *const s, uint16_t *const cdf)
 * Register allocation:
 * vr0: cdf
 * vr1: temp
 * vr2: rng
 * vr3: dif
 * vr4: const 0xff00ff00...ff00ff00
 * vr5: const 0x0004080c
 * vr6: const 0
 * t0: allow_update_cdf, temp
 * t1: dif
 * t2: cnt
 * t3: 0xffffffe8, outermost control parameter
 * t4: loop count
 * t5: v, buf_pos, temp
 * t6: u, rng, end_pos, buf, temp
 * t7: temp
 */
function msac_decode_hi_tok_lsx
fld.d f0, a1, 0 // Load cdf[0]~cdf[3]
vldrepl.h vr2, a0, 0x18 // 8 x rng, assert(rng <= 65535U), only the lower 16 bits are valid
vldrepl.h vr3, a0, 0x16 // broadcast the high 16 bits of dif, c = s->dif >> (EC_WIN_SIZE - 16)
ld.w t0, a0, 0x20 // allow_update_cdf
la.local t7, ph_0xff00
vld vr4, t7, 0x00 // 0xff00ff00...ff00ff00
la.local t7, min_prob
vld vr5, t7, 12 * 2 // 0x0004080c
vxor.v vr6, vr6, vr6 // const 0
ld.d t1, a0, 0x10 // dif
ld.w t2, a0, 0x1c // cnt
orn t3, t3, t3 // all ones
srli.d t3, t3, 32 // 0xffffffff
addi.d t3, t3, -0x17 // 0xffffffe8, i.e. -24 (see loop-control note above)
vseq.h vr8, vr8, vr8 // all ones
addi.d sp, sp, -0x1a // alloc stack
beqz t0, .hi_tok_lsx_no_update_cdf
HI_TOK 1
jirl zero, ra, 0x0
.hi_tok_lsx_no_update_cdf:
HI_TOK 0
endfunc