cdef.S - mozsearch

mozilla-central/third_party/dav1d/src/arm/64/cdef.S

Enable keyboard shortcuts

Source code

File a bug in Core :: Audio/Video: Playback

Revision control

Copy as Markdown

Other Tools

/*

 * Copyright © 2018, VideoLAN and dav1d authors

 * Copyright © 2019, Martin Storsjo

 * All rights reserved.

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 * 1. Redistributions of source code must retain the above copyright notice, this

 *    list of conditions and the following disclaimer.

 * 2. Redistributions in binary form must reproduce the above copyright notice,

 *    this list of conditions and the following disclaimer in the documentation

 *    and/or other materials provided with the distribution.

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR

 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "src/arm/asm.S"

#include "util.S"

#include "cdef_tmpl.S"

// pad_top_bottom: widen two rows of 8-bit pixels to 16 bits and store them
// into the padded buffer at x0, honouring the left/right edge flags in w7
// (bit 0 = CDEF_HAVE_LEFT, bit 1 = CDEF_HAVE_RIGHT).
//
// Macro arguments:
//   s1, s2  - X registers pointing at the first and second source row.
//             On the CDEF_HAVE_LEFT paths both are decremented by 2.
//   w       - block width in pixels. Used as a byte offset into the source
//             row and, doubled, as a byte offset into the 16-bit output row.
//   stride  - output row stride in 16-bit elements (rows are 2*stride
//             bytes apart).
//   rn      - register prefix used to load w source bytes
//             (instantiated in this file as d for w=8 and s for w=4).
//   rw      - register prefix used to store w widened pixels
//             (instantiated in this file as q for w=8 and d for w=4).
//   ret     - nonzero: emit a ret once the second row has been stored.
//             zero: advance x0 past the second row and continue at the
//             local label 3 that closes the macro.
//
// Expects v31 to hold the fill pattern. s31 (two 16-bit lanes) is stored
// wherever the two left or the two right padding pixels are unavailable.
// Clobbers v0-v3 and the condition flags, and advances x0.
//
// Note: the bare s1/s3 operands below are SIMD registers (the low 32 bits
// of v1/v3). They are not the macro arguments of the same name, which are
// always written with a leading backslash.
.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret

        tst             w7,  #1 // CDEF_HAVE_LEFT

        b.eq            2f

        // CDEF_HAVE_LEFT
        // Step both row pointers back over the two pixels left of the block.

        sub             \s1,  \s1,  #2

        sub             \s2,  \s2,  #2

        tst             w7,  #2 // CDEF_HAVE_RIGHT

        b.eq            1f

        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
        // Every output pixel comes from the source: w bytes plus 4 more
        // per row (2 left + w + 2 right), widened to w+4 halfwords.

        ldr             \rn\()0, [\s1]

        ldr             s1,      [\s1, #\w]

        ldr             \rn\()2, [\s2]

        ldr             s3,      [\s2, #\w]

        uxtl            v0.8h,   v0.8b

        uxtl            v1.8h,   v1.8b

        uxtl            v2.8h,   v2.8b

        uxtl            v3.8h,   v3.8b

        str             \rw\()0, [x0]

        str             d1,      [x0, #2*\w]

        add             x0,  x0,  #2*\stride

        str             \rw\()2, [x0]

        str             d3,      [x0, #2*\w]

.if \ret

ret

.else

        add             x0,  x0,  #2*\stride

        b               3f

.endif

1:

        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        // Only w+2 source bytes per row are read (h1/h3 hold the last two).
        // The two rightmost output pixels are taken from the fill in s31.

        ldr             \rn\()0, [\s1]

        ldr             h1,      [\s1, #\w]

        ldr             \rn\()2, [\s2]

        ldr             h3,      [\s2, #\w]

        uxtl            v0.8h,   v0.8b

        uxtl            v1.8h,   v1.8b

        uxtl            v2.8h,   v2.8b

        uxtl            v3.8h,   v3.8b

        str             \rw\()0, [x0]

        str             s1,      [x0, #2*\w]

        str             s31,     [x0, #2*\w+4]

        add             x0,  x0,  #2*\stride

        str             \rw\()2, [x0]

        str             s3,      [x0, #2*\w]

        str             s31,     [x0, #2*\w+4]

.if \ret

ret

.else

        add             x0,  x0,  #2*\stride

        b               3f

.endif

2:

        // !CDEF_HAVE_LEFT
        // The row pointers were not moved, so they address the first block
        // pixel. The output row still starts with two padding pixels.

        tst             w7,  #2 // CDEF_HAVE_RIGHT

        b.eq            1f

        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
        // Fill (s31) on the left, w block pixels at byte offset 4, then the
        // two right pixels read from source byte offset w.

        ldr             \rn\()0, [\s1]

        ldr             h1,      [\s1, #\w]

        ldr             \rn\()2, [\s2]

        ldr             h3,      [\s2, #\w]

        uxtl            v0.8h,  v0.8b

        uxtl            v1.8h,  v1.8b

        uxtl            v2.8h,  v2.8b

        uxtl            v3.8h,  v3.8b

        str             s31, [x0]

        // stur: byte offset 4 is not a multiple of the q/d access size.

        stur            \rw\()0, [x0, #4]

        str             s1,      [x0, #4+2*\w]

        add             x0,  x0,  #2*\stride

        str             s31, [x0]

        stur            \rw\()2, [x0, #4]

        str             s3,      [x0, #4+2*\w]

.if \ret

ret

.else

        add             x0,  x0,  #2*\stride

        b               3f

.endif

1:

        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        // Only the w block pixels are read. Both sides are filled from s31.
        // The two rows use v0 and v1 here (not v0 and v2 as above).

        ldr             \rn\()0, [\s1]

        ldr             \rn\()1, [\s2]

        uxtl            v0.8h,  v0.8b

        uxtl            v1.8h,  v1.8b

        str             s31,     [x0]

        stur            \rw\()0, [x0, #4]

        str             s31,     [x0, #4+2*\w]

        add             x0,  x0,  #2*\stride

        str             s31,     [x0]

        stur            \rw\()1, [x0, #4]

        str             s31,     [x0, #4+2*\w]

.if \ret

ret

.else

        add             x0,  x0,  #2*\stride

.endif

        // Common exit for the ret=0 paths. With ret=1 every path has
        // already returned and nothing in this macro branches here.

3:

.endm

// load_n_incr: load one row of w 8-bit pixels into dst, then post-increment
// the source pointer.
//   dst  - vector register name without an arrangement suffix (e.g. v1)
//   src  - X register holding the row address, advanced by incr afterwards
//   incr - post-index increment (the callers in this file pass x2, the
//          source stride)
//   w    - 4: load 4 bytes into 32-bit lane 0 of dst
//          any other value: load 8 bytes using the 8b arrangement
.macro load_n_incr dst, src, incr, w

.if \w == 4

        ld1             {\dst\().s}[0], [\src], \incr

.else

        ld1             {\dst\().8b},   [\src], \incr

.endif

.endm

// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,

//                                    ptrdiff_t src_stride, const pixel (*left)[2],

//                                    const pixel *const top,

//                                    const pixel *const bottom, int h,

//                                    enum CdefEdgeFlags edges);

//
// Register use, following the prototype above:
//   x0 = tmp, x1 = src, x2 = src_stride, x3 = left, x4 = top,
//   x5 = bottom, w6 = h, w7 = edges
// Edge flag bits tested below: 1 = CDEF_HAVE_LEFT, 2 = CDEF_HAVE_RIGHT,
// 4 = CDEF_HAVE_TOP, 8 = CDEF_HAVE_BOTTOM.
//
// Builds a 16-bit copy of the block in tmp, surrounded by two rows above
// and below and two pixels on each side. Pixels on the side of a missing
// edge are set to 0x8000.
//
// Macro arguments:
//   w       - block width in pixels (8 or 4 as instantiated below)
//   stride  - tmp row stride in 16-bit elements
//   rn, rw  - register prefixes for loading w bytes and for storing w
//             halfwords (forwarded to pad_top_bottom and used by the
//             stores of the middle section)
// Scratch: x9, v0-v3, v30, v31.

.macro padding_func w, stride, rn, rw

function cdef_padding\w\()_8bpc_neon, export=1

        // All four edges present: branch to the edged variant defined
        // further down. The argument registers are still untouched, and a
        // plain branch means that function returns straight to our caller.

        cmp             w7,  #0xf // fully edged

        b.eq            cdef_padding\w\()_edged_8bpc_neon

        // v30 = v31 = 0x8000 in every 16-bit lane (the fill value).

        movi            v30.8h,  #0x80, lsl #8

        mov             v31.16b, v30.16b

        // Step tmp back by two rows and two pixels (16-bit elements), so
        // that the first stores below write the two rows above the block.

        sub             x0,  x0,  #2*(2*\stride+2)

        tst             w7,  #4 // CDEF_HAVE_TOP

        b.ne            1f

        // !CDEF_HAVE_TOP
        // Fill both top rows. One st1 writes 32 bytes, which covers two
        // rows at stride 8 and only one row at stride 16.

        st1             {v30.8h, v31.8h}, [x0], #32

.if \w == 8

        st1             {v30.8h, v31.8h}, [x0], #32

.endif

        b               3f

1:

        // CDEF_HAVE_TOP
        // x9 = second top row. ret=0, so every path of the macro continues
        // into the middle section with x0 advanced by two rows.

        add             x9,  x4,  x2

        pad_top_bottom  x4,  x9, \w, \stride, \rn, \rw, 0

        // Middle section
        // Four row loops, one per left/right combination. In each of them
        // one iteration handles one row: w6 counts h down, x1 advances by
        // x2 and x0 advances by one tmp row.

3:

        tst             w7,  #1 // CDEF_HAVE_LEFT

        b.eq            2f

        // CDEF_HAVE_LEFT

        tst             w7,  #2 // CDEF_HAVE_RIGHT

        b.eq            1f

        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT

0:

        // Two left pixels come from left[] (x3 advances 2 bytes per row).
        // The two right pixels are read before load_n_incr moves x1 on.

        ld1             {v0.h}[0], [x3], #2

        ldr             h2,      [x1, #\w]

        load_n_incr     v1,  x1,  x2,  \w

        subs            w6,  w6,  #1

        uxtl            v0.8h,  v0.8b

        uxtl            v1.8h,  v1.8b

        uxtl            v2.8h,  v2.8b

        str             s0,      [x0]

        stur            \rw\()1, [x0, #4]

        str             s2,      [x0, #4+2*\w]

        add             x0,  x0,  #2*\stride

        // Flags are still those of the subs above.

        b.gt            0b

        b               3f

1:

        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        // Right padding comes from the fill in s31.

        ld1             {v0.h}[0], [x3], #2

        load_n_incr     v1,  x1,  x2,  \w

        subs            w6,  w6,  #1

        uxtl            v0.8h,  v0.8b

        uxtl            v1.8h,  v1.8b

        str             s0,      [x0]

        stur            \rw\()1, [x0, #4]

        str             s31,     [x0, #4+2*\w]

        add             x0,  x0,  #2*\stride

        b.gt            1b

        b               3f

2:

        tst             w7,  #2 // CDEF_HAVE_RIGHT

        b.eq            1f

        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
        // Left padding comes from the fill in s31. x3 is not read.

0:

        ldr             h1,      [x1, #\w]

        load_n_incr     v0,  x1,  x2,  \w

        subs            w6,  w6,  #1

        uxtl            v0.8h,  v0.8b

        uxtl            v1.8h,  v1.8b

        str             s31,     [x0]

        stur            \rw\()0, [x0, #4]

        str             s1,      [x0, #4+2*\w]

        add             x0,  x0,  #2*\stride

        b.gt            0b

        b               3f

1:

        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        // Both sides come from the fill. The loop falls through to the
        // bottom section when done.

        load_n_incr     v0,  x1,  x2,  \w

        subs            w6,  w6,  #1

        uxtl            v0.8h,  v0.8b

        str             s31,     [x0]

        stur            \rw\()0, [x0, #4]

        str             s31,     [x0, #4+2*\w]

        add             x0,  x0,  #2*\stride

        b.gt            1b

3:

        tst             w7,  #8 // CDEF_HAVE_BOTTOM

        b.ne            1f

        // !CDEF_HAVE_BOTTOM
        // Same fill pattern as the missing-top case, then return.

        st1             {v30.8h, v31.8h}, [x0], #32

.if \w == 8

        st1             {v30.8h, v31.8h}, [x0], #32

.endif

ret

1:

        // CDEF_HAVE_BOTTOM
        // ret=1: every path of pad_top_bottom ends in its own ret, so
        // nothing falls through to endfunc.

        add             x9,  x5,  x2

        pad_top_bottom  x5,  x9, \w, \stride, \rn, \rw, 1

endfunc

.endm

// 8 pixels wide: tmp stride 16, load 8 bytes with dN, store 8 halfwords
// with qN.

padding_func 8, 16, d, q

// 4 pixels wide: tmp stride 8, load 4 bytes with sN, store 4 halfwords
// with dN.

padding_func 4, 8,  s, d

// void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src,

//                                    ptrdiff_t src_stride, const pixel (*left)[2],

//                                    const pixel *const top,

//                                    const pixel *const bottom, int h,

//                                    enum CdefEdgeFlags edges);

//
// Register use, following the prototype above:
//   x0 = tmp, x1 = src, x2 = src_stride, x3 = left, x4 = top,
//   x5 = bottom, w6 = h. w7 (edges) is not read here.
// The 16-bit padding functions above branch here when all four edge
// flags are set.
//
// Copies the block and its surroundings (two rows above and below, two
// pixels on each side) into tmp as plain bytes. There is no widening and
// no fill value, and every surrounding pixel is read from memory.
//
// Macro arguments:
//   w       - block width in pixels (8 or 4 as instantiated below)
//   stride  - tmp row stride in bytes
//   reg     - register prefix that moves w bytes (d for 8, s for 4)
// Scratch: x9, v0-v3.

.macro padding_func_edged w, stride, reg

function cdef_padding\w\()_edged_8bpc_neon, export=1

        // Start two pixels left of the block in both extra-row sources,
        // and two rows plus two pixels earlier in tmp.

        sub             x4,  x4,  #2

        sub             x5,  x5,  #2

        sub             x0,  x0,  #(2*\stride+2)

.if \w == 4

        // A padded row is 2+4+2 = 8 bytes, which equals the stride, so
        // both top rows are written by a single 16-byte store.

        ldr             d0, [x4]

        ldr             d1, [x4, x2]

        st1             {v0.8b, v1.8b}, [x0], #16

.else

        // A padded row is 2+8+2 = 12 bytes, moved as 8 + 4. Bytes 12 and
        // up of each tmp row are not written.

        add             x9,  x4,  x2

        ldr             d0, [x4]

        ldr             s1, [x4, #8]

        ldr             d2, [x9]

        ldr             s3, [x9, #8]

        str             d0, [x0]

        str             s1, [x0, #8]

        str             d2, [x0, #\stride]

        str             s3, [x0, #\stride+8]

        add             x0,  x0,  #2*\stride

.endif

0:

        // One block row per iteration: two bytes from left[], w bytes from
        // src, and two bytes from just right of the row. The right pixels
        // are read before load_n_incr advances x1.

        ld1             {v0.h}[0], [x3], #2

        ldr             h2,      [x1, #\w]

        load_n_incr     v1,  x1,  x2,  \w

        subs            w6,  w6,  #1

        str             h0,      [x0]

        // stur: byte offset 2 is not a multiple of the access size.

        stur            \reg\()1, [x0, #2]

        str             h2,      [x0, #2+\w]

        add             x0,  x0,  #\stride

        // Flags are still those of the subs above.

        b.gt            0b

        // Two bottom rows, copied the same way as the two top rows.

.if \w == 4

        ldr             d0, [x5]

        ldr             d1, [x5, x2]

        st1             {v0.8b, v1.8b}, [x0], #16

.else

        add             x9,  x5,  x2

        ldr             d0, [x5]

        ldr             s1, [x5, #8]

        ldr             d2, [x9]

        ldr             s3, [x9, #8]

        str             d0, [x0]

        str             s1, [x0, #8]

        str             d2, [x0, #\stride]

        str             s3, [x0, #\stride+8]

.endif

ret

endfunc

.endm

// 8 pixels wide: byte stride 16, rows moved with dN.

padding_func_edged 8, 16, d

// 4 pixels wide: byte stride 8, rows moved with sN.

padding_func_edged 4, 8,  s

// The macros invoked below are not defined in this file. They presumably
// come from the cdef_tmpl.S include near the top - TODO confirm.
// The 8-bit filter code further down references the symbols pri_taps,
// directions8 and directions4, presumably emitted by the tables macro -
// verify against the template.

tables

// NOTE(review): the arguments look like (block width, bit depth) - confirm
// against the filter macro definition.

filter 8, 8

filter 4, 8

// NOTE(review): the argument looks like the bit depth - confirm against
// the find_dir macro definition.

find_dir 8

// load_px_8: fetch the pair of taps at +off and -off for 16 pixels.
//
// Inputs:
//   x2 - address of the first pixel of the current row group in the
//        8-bit tmp buffer
//   w9 - tap offset in bytes. Only the low byte is used, and it is
//        sign-extended (sxtb).
// Macro arguments:
//   d1 - vector register that receives the pixels at x2 + off (p0)
//   d2 - vector register that receives the pixels at x2 - off (p1)
//   w  - 8: two rows of 8 pixels, 16 bytes apart, one per 64-bit half
//        otherwise: four rows of 4 pixels, 8 bytes apart, one per
//        32-bit lane
// Clobbers x6 and x9. Note that w9 is overwritten while it is still
// the input, so the offset is gone afterwards.
.macro load_px_8 d1, d2, w

.if \w == 8

        add             x6,  x2,  w9, sxtb          // x + off

        sub             x9,  x2,  w9, sxtb          // x - off

        ld1             {\d1\().d}[0], [x6]         // p0

        add             x6,  x6,  #16               // += stride

        ld1             {\d2\().d}[0], [x9]         // p1

        add             x9,  x9,  #16               // += stride

        ld1             {\d1\().d}[1], [x6]         // p0

        ld1             {\d2\().d}[1], [x9]         // p1

.else

        add             x6,  x2,  w9, sxtb          // x + off

        sub             x9,  x2,  w9, sxtb          // x - off

        ld1             {\d1\().s}[0], [x6]         // p0

        add             x6,  x6,  #8                // += stride

        ld1             {\d2\().s}[0], [x9]         // p1

        add             x9,  x9,  #8                // += stride

        ld1             {\d1\().s}[1], [x6]         // p0

        add             x6,  x6,  #8                // += stride

        ld1             {\d2\().s}[1], [x9]         // p1

        add             x9,  x9,  #8                // += stride

        ld1             {\d1\().s}[2], [x6]         // p0

        add             x6,  x6,  #8                // += stride

        ld1             {\d2\().s}[2], [x9]         // p1

        add             x9,  x9,  #8                // += stride

        ld1             {\d1\().s}[3], [x6]         // p0

        ld1             {\d2\().s}[3], [x9]         // p1

.endif

.endm

// handle_pixel_8: accumulate one tap pair into the running sums for
// 16 pixels at once, entirely in 8-bit lanes.
//
// Fixed register contract (set up by filter_func_8):
//   v0     - px, the pixels being filtered
//   v1     - sum accumulator fed by s1 (the p0 taps)
//   v2     - sum accumulator fed by s2 (the p1 taps)
//   v3, v4 - running min and max, only touched when min is nonzero
// Macro arguments:
//   s1, s2     - vector registers holding the p0 and p1 tap pixels
//   thresh_vec - vector operand with the strength in every byte
//   shift      - vector operand with the negated shift in every byte.
//                ushl with a negative count shifts right, which is how
//                the right shift below is done.
//   tap        - W register holding the tap weight
//   min        - nonzero to also update v3/v4
// Clobbers v16-v22.
.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min

.if \min

        umin            v3.16b,  v3.16b,  \s1\().16b

        umax            v4.16b,  v4.16b,  \s1\().16b

        umin            v3.16b,  v3.16b,  \s2\().16b

        umax            v4.16b,  v4.16b,  \s2\().16b

.endif

        uabd            v16.16b, v0.16b,  \s1\().16b  // abs(diff)

        uabd            v20.16b, v0.16b,  \s2\().16b  // abs(diff)

        ushl            v17.16b, v16.16b, \shift      // abs(diff) >> shift

        ushl            v21.16b, v20.16b, \shift      // abs(diff) >> shift

        uqsub           v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift))

        uqsub           v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift))

        // v18/v22 become all-ones masks where px is above the tap pixel,
        // i.e. where the signed difference (tap - px) is negative.

        cmhi            v18.16b, v0.16b,  \s1\().16b  // px > p0

        cmhi            v22.16b, v0.16b,  \s2\().16b  // px > p1

        umin            v17.16b, v17.16b, v16.16b     // imin(abs(diff), clip)

        umin            v21.16b, v21.16b, v20.16b     // imin(abs(diff), clip)

        dup             v19.16b, \tap                 // taps[k]

        neg             v16.16b, v17.16b              // -imin()

        neg             v20.16b, v21.16b              // -imin()

        // bsl picks the negated magnitude where the mask is set and the
        // plain magnitude elsewhere. The result replaces the mask.

        bsl             v18.16b, v16.16b, v17.16b     // constrain() = apply_sign()

        bsl             v22.16b, v20.16b, v21.16b     // constrain() = apply_sign()

        mla             v1.16b,  v18.16b, v19.16b     // sum += taps[k] * constrain()

        mla             v2.16b,  v22.16b, v19.16b     // sum += taps[k] * constrain()

.endm

// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,

//                                   const uint8_t *tmp, int pri_strength,

//                                   int sec_strength, int dir, int damping,

//                                   int h);

//
// Register use, following the prototype above:
//   x0 = dst, x1 = dst_stride, x2 = tmp, w3 = pri_strength,
//   w4 = sec_strength, w5 = dir, w6 = damping, w7 = h
//
// Macro arguments:
//   w      - block width (8 or 4 as instantiated in this file). tmp rows
//            are 16 bytes apart for w=8 and 8 bytes apart for w=4.
//   pri    - nonzero to apply the primary taps
//   sec    - nonzero to apply the secondary taps
//   min    - nonzero to clamp the result to the min/max of px and all
//            tap pixels
//   suffix - appended to the function name after the width
//
// Each pass of the outer loop filters 16 pixels: two rows for w=8 and
// four rows for w=4.
// Scratch: x5, x6, x8-x14, v0-v6, v16-v22, v24-v30.
// The function is declared without export=1.

.macro filter_func_8 w, pri, sec, min, suffix

function cdef_filter\w\suffix\()_edged_8bpc_neon

.if \pri

        // x8 = pri_taps + 2 * (pri_strength & 1)

        movrel          x8,  pri_taps

        and             w9,  w3,  #1

        add             x8,  x8,  w9, uxtw #1

.endif

        // x5 = directions table + 2 * dir. w5 (dir) is consumed here.

        movrel          x9,  directions\w

        add             x5,  x9,  w5, uxtw #1

        movi            v30.8b,  #7

        dup             v28.8b,  w6                 // damping

.if \pri

        dup             v25.16b, w3                 // threshold

.endif

.if \sec

        dup             v27.16b, w4                 // threshold

.endif

        // Interleave so that byte 0 holds the primary and byte 1 the
        // secondary threshold, and compute both shifts in one go. When
        // only one of pri/sec is enabled the other source register was
        // never initialised, but only the lane of the enabled one is
        // read by the dups below.

        trn1            v24.8b,  v25.8b, v27.8b

        clz             v24.8b,  v24.8b             // clz(threshold)

        sub             v24.8b,  v30.8b, v24.8b     // ulog2(threshold)

        uqsub           v24.8b,  v28.8b, v24.8b     // shift = imax(0, damping - ulog2(threshold))

        neg             v24.8b,  v24.8b             // -shift

.if \sec

        dup             v26.16b, v24.b[1]

.endif

.if \pri

        dup             v24.16b, v24.b[0]

.endif

1:

        // Outer loop: gather the 16 centre pixels (px) into v0.

.if \w == 8

        add             x12, x2,  #16

        ld1             {v0.d}[0], [x2]             // px

        ld1             {v0.d}[1], [x12]            // px

.else

        add             x12, x2,  #1*8

        add             x13, x2,  #2*8

        add             x14, x2,  #3*8

        ld1             {v0.s}[0], [x2]             // px

        ld1             {v0.s}[1], [x12]            // px

        ld1             {v0.s}[2], [x13]            // px

        ld1             {v0.s}[3], [x14]            // px

.endif

        // We need 9-bits or two 8-bit accumulators to fit the sum.

        // Max of |sum| > 15*2*6(pri) + 4*4*3(sec) = 228.

        // Start sum at -1 instead of 0 to help handle rounding later.

        // v1 collects the p0 (x + off) contributions and v2 the p1
        // (x - off) contributions. See handle_pixel_8.

        movi            v1.16b, #255                // sum

        movi            v2.16b, #0                  // sum

.if \min

        mov             v3.16b, v0.16b              // min

        mov             v4.16b, v0.16b              // max

.endif

        // Instead of loading sec_taps 2, 1 from memory, just set it

        // to 2 initially and decrease for the second round.

        // This is also used as loop counter.

        mov             w11, #2                     // sec_taps[0]

2:

        // Inner loop, executed twice (k = 0, 1). x5 walks the direction
        // table one byte per iteration. The secondary offsets sit 4 and
        // 12 bytes after the primary one.

.if \pri

        ldrb            w9,  [x5]                   // off1

        load_px_8       v5,  v6, \w

.endif

.if \sec

        add             x5,  x5,  #4                // +2*2

        ldrb            w9,  [x5]                   // off2

        load_px_8       v28, v29, \w

.endif

.if \pri

        ldrb            w10, [x8]                   // *pri_taps

        handle_pixel_8  v5,  v6,  v25.16b, v24.16b, w10, \min

.endif

.if \sec

        add             x5,  x5,  #8                // +2*4

        ldrb            w9,  [x5]                   // off3

        // v5/v6 are free again once the primary taps have been consumed.

        load_px_8       v5,  v6,  \w

        handle_pixel_8  v28, v29, v27.16b, v26.16b, w11, \min

        handle_pixel_8  v5,  v6,  v27.16b, v26.16b, w11, \min

        sub             x5,  x5,  #11               // x5 -= 2*(2+4); x5 += 1;

.else

        add             x5,  x5,  #1                // x5 += 1

.endif

        subs            w11, w11, #1                // sec_tap-- (value)

.if \pri

        // add does not alter the flags, so b.ne still sees the subs above.

        add             x8,  x8,  #1                // pri_taps++ (pointer)

.endif

        b.ne            2b

        // Perform halving adds since the value won't fit otherwise.

        // To handle the offset for negative values, use both halving w/ and w/o rounding.

        srhadd          v5.16b,  v1.16b,  v2.16b    // sum >> 1

        shadd           v6.16b,  v1.16b,  v2.16b    // (sum - 1) >> 1

        cmlt            v1.16b,  v5.16b,  #0        // sum < 0

        bsl             v1.16b,  v6.16b,  v5.16b    // (sum - (sum < 0)) >> 1

        srshr           v1.16b,  v1.16b,  #3        // (8 + sum - (sum < 0)) >> 4

        // usqadd adds the signed correction to the unsigned pixels with
        // unsigned saturation.

        usqadd          v0.16b,  v1.16b             // px + (8 + sum ...) >> 4

.if \min

        umin            v0.16b,  v0.16b,  v4.16b

        umax            v0.16b,  v0.16b,  v3.16b    // iclip(px + .., min, max)

.endif

        // Write the rows back, advancing dst by dst_stride after each one.
        // The subs in the middle of the stores sets the flags used by the
        // b.gt at the bottom. Nothing after it changes them.

.if \w == 8

        st1             {v0.d}[0], [x0], x1

        add             x2,  x2,  #2*16             // tmp += 2*tmp_stride

        subs            w7,  w7,  #2                // h -= 2

        st1             {v0.d}[1], [x0], x1

.else

        st1             {v0.s}[0], [x0], x1

        add             x2,  x2,  #4*8              // tmp += 4*tmp_stride

        st1             {v0.s}[1], [x0], x1

        subs            w7,  w7,  #4                // h -= 4

        st1             {v0.s}[2], [x0], x1

        st1             {v0.s}[3], [x0], x1

.endif

        // Reset pri_taps and directions back to the original point

        // (each was advanced by one in both inner-loop iterations).

        sub             x5,  x5,  #2

.if \pri

        sub             x8,  x8,  #2

.endif

        b.gt            1b

ret

endfunc

.endm

// filter_8: emit the three 8-bit edged filter functions for block width w:
//   cdef_filter<w>_pri_edged_8bpc_neon      primary taps only
//   cdef_filter<w>_sec_edged_8bpc_neon      secondary taps only
//   cdef_filter<w>_pri_sec_edged_8bpc_neon  both, with min/max clamping
.macro filter_8 w

filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri

filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec

filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec

.endm

// Instantiate the filters for 8-wide and 4-wide blocks.

filter_8 8

filter_8 4