/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#define FILTER_OUT_STRIDE 384
.macro sgr_funcs bpc
// void dav1d_sgr_finish_filter1_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t src_stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
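// A rough C sketch of what this computes, as read from the code below (row
// and index names are illustrative, not dav1d's; following the comments in
// the loop, "a" is the weighted sum of the int16_t *b rows and "b" the
// weighted sum of the int32_t *a rows):
//   for (int j = 0; j < w; j++) {
//       int a = (b0[j+1] + b1[j] + b1[j+1] + b1[j+2] + b2[j+1]) * 4 +
//               (b0[j] + b0[j+2] + b2[j] + b2[j+2]) * 3;
//       int b = (a0[j+1] + a1[j] + a1[j+1] + a1[j+2] + a2[j+1]) * 4 +
//               (a0[j] + a0[j+2] + a2[j] + a2[j+2]) * 3;
//       tmp[j] = (b - a * src[j] + (1 << 8)) >> 9;
//   }
// The second output row (tmp + FILTER_OUT_STRIDE) repeats this with rows
// 1..3 of *a/*b and the second source row.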
function sgr_finish_filter1_2rows_\bpc\()bpc_neon, export=1
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]
ldp x7, x8, [x3]
ldp x9, x3, [x3, #16]
ldp x10, x11, [x4]
ldp x12, x4, [x4, #16]
mov x13, #FILTER_OUT_STRIDE
cmp w6, #1
add x2, x1, x2 // src + stride
csel x2, x1, x2, le // if (h <= 1) x2 = x1
add x13, x0, x13, lsl #1
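// x0 writes the first output row and x13 the second, FILTER_OUT_STRIDE
// apart; x1/x2 read the two source rows (x2 aliases x1 when h <= 1).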
movi v30.8h, #3
movi v31.4s, #3
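// 3 is the corner weight; the centre cross gets weight 4 via shl #2 below.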
1:
ld1 {v0.8h, v1.8h}, [x10], #32
ld1 {v2.8h, v3.8h}, [x11], #32
ld1 {v4.8h, v5.8h}, [x12], #32
ld1 {v6.8h, v7.8h}, [x4], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48
ld1 {v22.4s, v23.4s, v24.4s}, [x9], #48
ld1 {v25.4s, v26.4s, v27.4s}, [x3], #48
2:
ext v8.16b, v0.16b, v1.16b, #2 // [0][1]
ext v9.16b, v2.16b, v3.16b, #2 // [1][1]
ext v10.16b, v4.16b, v5.16b, #2 // [2][1]
ext v11.16b, v0.16b, v1.16b, #4 // [0][2]
ext v12.16b, v2.16b, v3.16b, #4 // [1][2]
ext v13.16b, v4.16b, v5.16b, #4 // [2][2]
add v14.8h, v2.8h, v8.8h // [1][0] + [0][1]
add v15.8h, v9.8h, v10.8h // [1][1] + [2][1]
add v28.8h, v0.8h, v11.8h // [0][0] + [0][2]
add v14.8h, v14.8h, v12.8h // () + [1][2]
add v29.8h, v4.8h, v13.8h // [2][0] + [2][2]
ext v8.16b, v6.16b, v7.16b, #2 // [3][1]
ext v11.16b, v6.16b, v7.16b, #4 // [3][2]
add v14.8h, v14.8h, v15.8h // mid
add v15.8h, v28.8h, v29.8h // corners
add v28.8h, v4.8h, v9.8h // [2][0] + [1][1]
add v29.8h, v10.8h, v8.8h // [2][1] + [3][1]
add v2.8h, v2.8h, v12.8h // [1][0] + [1][2]
add v28.8h, v28.8h, v13.8h // () + [2][2]
add v4.8h, v6.8h, v11.8h // [3][0] + [3][2]
add v0.8h, v28.8h, v29.8h // mid
add v2.8h, v2.8h, v4.8h // corners
shl v4.8h, v14.8h, #2
mla v4.8h, v15.8h, v30.8h // * 3 -> a
shl v0.8h, v0.8h, #2
mla v0.8h, v2.8h, v30.8h // * 3 -> a
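// v4/v0 now hold the weighted 16-bit sums ("a") for the two output rows; the
// block below repeats the same 3x3 pattern on the 32-bit rows to build "b".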
ext v8.16b, v16.16b, v17.16b, #4 // [0][1]
ext v9.16b, v17.16b, v18.16b, #4
ext v10.16b, v16.16b, v17.16b, #8 // [0][2]
ext v11.16b, v17.16b, v18.16b, #8
ext v12.16b, v19.16b, v20.16b, #4 // [1][1]
ext v13.16b, v20.16b, v21.16b, #4
add v8.4s, v8.4s, v19.4s // [0][1] + [1][0]
add v9.4s, v9.4s, v20.4s
add v16.4s, v16.4s, v10.4s // [0][0] + [0][2]
add v17.4s, v17.4s, v11.4s
ext v14.16b, v19.16b, v20.16b, #8 // [1][2]
ext v15.16b, v20.16b, v21.16b, #8
add v16.4s, v16.4s, v22.4s // () + [2][0]
add v17.4s, v17.4s, v23.4s
add v28.4s, v12.4s, v14.4s // [1][1] + [1][2]
add v29.4s, v13.4s, v15.4s
ext v10.16b, v22.16b, v23.16b, #4 // [2][1]
ext v11.16b, v23.16b, v24.16b, #4
add v8.4s, v8.4s, v28.4s // mid (incomplete)
add v9.4s, v9.4s, v29.4s
add v19.4s, v19.4s, v14.4s // [1][0] + [1][2]
add v20.4s, v20.4s, v15.4s
add v14.4s, v22.4s, v12.4s // [2][0] + [1][1]
add v15.4s, v23.4s, v13.4s
ext v12.16b, v22.16b, v23.16b, #8 // [2][2]
ext v13.16b, v23.16b, v24.16b, #8
ext v28.16b, v25.16b, v26.16b, #4 // [3][1]
ext v29.16b, v26.16b, v27.16b, #4
add v8.4s, v8.4s, v10.4s // () + [2][1] = mid
add v9.4s, v9.4s, v11.4s
add v14.4s, v14.4s, v10.4s // () + [2][1]
add v15.4s, v15.4s, v11.4s
ext v10.16b, v25.16b, v26.16b, #8 // [3][2]
ext v11.16b, v26.16b, v27.16b, #8
add v16.4s, v16.4s, v12.4s // () + [2][2] = corner
add v17.4s, v17.4s, v13.4s
add v12.4s, v12.4s, v28.4s // [2][2] + [3][1]
add v13.4s, v13.4s, v29.4s
add v25.4s, v25.4s, v10.4s // [3][0] + [3][2]
add v26.4s, v26.4s, v11.4s
add v14.4s, v14.4s, v12.4s // mid
add v15.4s, v15.4s, v13.4s
add v19.4s, v19.4s, v25.4s // corner
add v20.4s, v20.4s, v26.4s
.if \bpc == 8
ld1 {v25.8b}, [x1], #8 // src
ld1 {v26.8b}, [x2], #8
.else
ld1 {v25.8h}, [x1], #16 // src
ld1 {v26.8h}, [x2], #16
.endif
shl v8.4s, v8.4s, #2
shl v9.4s, v9.4s, #2
mla v8.4s, v16.4s, v31.4s // * 3 -> b
mla v9.4s, v17.4s, v31.4s
.if \bpc == 8
uxtl v25.8h, v25.8b // src
uxtl v26.8h, v26.8b
.endif
shl v14.4s, v14.4s, #2
shl v15.4s, v15.4s, #2
mla v14.4s, v19.4s, v31.4s // * 3 -> b
mla v15.4s, v20.4s, v31.4s
umlsl v8.4s, v4.4h, v25.4h // b - a * src
umlsl2 v9.4s, v4.8h, v25.8h
umlsl v14.4s, v0.4h, v26.4h // b - a * src
umlsl2 v15.4s, v0.8h, v26.8h
mov v0.16b, v1.16b
rshrn v8.4h, v8.4s, #9
rshrn2 v8.8h, v9.4s, #9
mov v2.16b, v3.16b
rshrn v14.4h, v14.4s, #9
rshrn2 v14.8h, v15.4s, #9
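// The rounded narrowing shifts produce (b - a * src + (1 << 8)) >> 9 as int16.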
subs w5, w5, #8
mov v4.16b, v5.16b
st1 {v8.8h}, [x0], #16
mov v6.16b, v7.16b
st1 {v14.8h}, [x13], #16
b.le 3f
mov v16.16b, v18.16b
mov v19.16b, v21.16b
mov v22.16b, v24.16b
mov v25.16b, v27.16b
ld1 {v1.8h}, [x10], #16
ld1 {v3.8h}, [x11], #16
ld1 {v5.8h}, [x12], #16
ld1 {v7.8h}, [x4], #16
ld1 {v17.4s, v18.4s}, [x7], #32
ld1 {v20.4s, v21.4s}, [x8], #32
ld1 {v23.4s, v24.4s}, [x9], #32
ld1 {v26.4s, v27.4s}, [x3], #32
b 2b
3:
ldp d14, d15, [sp, #0x30]
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x40
ret
endfunc
// void dav1d_sgr_finish_weighted1_Xbpc_neon(pixel *dst,
//                                           const int32_t **a, const int16_t **b,
//                                           const int w, const int w1,
//                                           const int bitdepth_max);
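// This fuses the 3x3 filter above with the final w1 weighting, reading and
// writing dst in place. A rough C sketch (names illustrative; iclip stands
// for clamping to [0, bitdepth_max], and t is the value filter1 would have
// written to tmp):
//   for (int j = 0; j < w; j++) {
//       int t = (b - a * dst[j] + (1 << 8)) >> 9;
//       dst[j] = iclip(dst[j] + ((t * w1 + (1 << 10)) >> 11), 0, bitdepth_max);
//   }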
function sgr_finish_weighted1_\bpc\()bpc_neon, export=1
ldp x7, x8, [x1]
ldr x1, [x1, #16]
ldp x9, x10, [x2]
ldr x2, [x2, #16]
dup v31.8h, w4
dup v30.8h, w5
movi v6.8h, #3
movi v7.4s, #3
1:
ld1 {v0.8h, v1.8h}, [x9], #32
ld1 {v2.8h, v3.8h}, [x10], #32
ld1 {v4.8h, v5.8h}, [x2], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48
ld1 {v22.4s, v23.4s, v24.4s}, [x1], #48
2:
ext v25.16b, v0.16b, v1.16b, #2 // -stride
ext v26.16b, v2.16b, v3.16b, #2 // 0
ext v27.16b, v4.16b, v5.16b, #2 // +stride
ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
ext v29.16b, v2.16b, v3.16b, #4 // +1
add v2.8h, v2.8h, v25.8h // -1, -stride
ext v25.16b, v4.16b, v5.16b, #4 // +1+stride
add v26.8h, v26.8h, v27.8h // 0, +stride
add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
add v2.8h, v2.8h, v26.8h
add v4.8h, v4.8h, v25.8h // -1+stride, +1+stride
add v2.8h, v2.8h, v29.8h // +1
add v0.8h, v0.8h, v4.8h
ext v25.16b, v16.16b, v17.16b, #4 // -stride
ext v26.16b, v17.16b, v18.16b, #4
shl v2.8h, v2.8h, #2
ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
ext v28.16b, v17.16b, v18.16b, #8
ext v29.16b, v19.16b, v20.16b, #4 // 0
ext v4.16b, v20.16b, v21.16b, #4
mla v2.8h, v0.8h, v6.8h // * 3 -> a
add v25.4s, v25.4s, v19.4s // -stride, -1
add v26.4s, v26.4s, v20.4s
add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride
add v17.4s, v17.4s, v28.4s
ext v27.16b, v19.16b, v20.16b, #8 // +1
ext v28.16b, v20.16b, v21.16b, #8
add v16.4s, v16.4s, v22.4s // -1+stride
add v17.4s, v17.4s, v23.4s
add v29.4s, v29.4s, v27.4s // 0, +1
add v4.4s, v4.4s, v28.4s
add v25.4s, v25.4s, v29.4s
add v26.4s, v26.4s, v4.4s
ext v27.16b, v22.16b, v23.16b, #4 // +stride
ext v28.16b, v23.16b, v24.16b, #4
ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
ext v4.16b, v23.16b, v24.16b, #8
.if \bpc == 8
ld1 {v19.8b}, [x0] // src
.else
ld1 {v19.8h}, [x0] // src
.endif
add v25.4s, v25.4s, v27.4s // +stride
add v26.4s, v26.4s, v28.4s
add v16.4s, v16.4s, v29.4s // +1+stride
add v17.4s, v17.4s, v4.4s
shl v25.4s, v25.4s, #2
shl v26.4s, v26.4s, #2
mla v25.4s, v16.4s, v7.4s // * 3 -> b
mla v26.4s, v17.4s, v7.4s
.if \bpc == 8
uxtl v19.8h, v19.8b // src
.endif
mov v0.16b, v1.16b
umlsl v25.4s, v2.4h, v19.4h // b - a * src
umlsl2 v26.4s, v2.8h, v19.8h
mov v2.16b, v3.16b
rshrn v25.4h, v25.4s, #9
rshrn2 v25.8h, v26.4s, #9
subs w3, w3, #8
// weighted1
mov v4.16b, v5.16b
ld1 {v1.8h}, [x9], #16
ld1 {v3.8h}, [x10], #16
smull v26.4s, v25.4h, v31.4h // v = t1 * w1
smull2 v27.4s, v25.8h, v31.8h
ld1 {v5.8h}, [x2], #16
rshrn v26.4h, v26.4s, #11
rshrn2 v26.8h, v27.4s, #11
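// v26 = (t * w1 + (1 << 10)) >> 11, with t the filtered value in v25; usqadd
// adds this signed correction to the unsigned source pixels with saturation,
// before the bitdepth handling below.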
usqadd v19.8h, v26.8h
.if \bpc == 8
mov v16.16b, v18.16b
sqxtun v26.8b, v19.8h
mov v19.16b, v21.16b
mov v22.16b, v24.16b
st1 {v26.8b}, [x0], #8
.else
mov v16.16b, v18.16b
umin v26.8h, v19.8h, v30.8h
mov v19.16b, v21.16b
mov v22.16b, v24.16b
st1 {v26.8h}, [x0], #16
.endif
b.le 3f
ld1 {v17.4s, v18.4s}, [x7], #32
ld1 {v20.4s, v21.4s}, [x8], #32
ld1 {v23.4s, v24.4s}, [x1], #32
b 2b
3:
ret
endfunc
// void dav1d_sgr_finish_filter2_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
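// In the 5x5 variant the four diagonal samples get weight 5 and the vertical
// neighbours weight 6. A rough C sketch of the two output rows, as read from
// the code below (names illustrative; a0/a1 and b0/b1 are the two rows of
// *a/*b, src2 the second source row):
//   even row:
//     a = (b0[j] + b0[j+2] + b1[j] + b1[j+2]) * 5 + (b0[j+1] + b1[j+1]) * 6;
//     b = (a0[j] + a0[j+2] + a1[j] + a1[j+2]) * 5 + (a0[j+1] + a1[j+1]) * 6;
//     tmp[j] = (b - a * src[j] + (1 << 8)) >> 9;
//   odd row (tmp + FILTER_OUT_STRIDE), using only the lower a/b row:
//     a = (b1[j] + b1[j+2]) * 5 + b1[j+1] * 6;
//     b = (a1[j] + a1[j+2]) * 5 + a1[j+1] * 6;
//     tmp[j] = (b - a * src2[j] + (1 << 7)) >> 8;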
function sgr_finish_filter2_2rows_\bpc\()bpc_neon, export=1
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]
ldp x3, x7, [x3]
ldp x4, x8, [x4]
mov x10, #FILTER_OUT_STRIDE
cmp w6, #1
add x2, x1, x2 // src + stride
csel x2, x1, x2, le // if (h <= 1) x2 = x1
add x10, x0, x10, lsl #1
movi v4.8h, #5
movi v5.4s, #5
movi v6.8h, #6
movi v7.4s, #6
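// 5 is the weight for the diagonal sums, 6 for the vertical neighbours.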
1:
ld1 {v0.8h, v1.8h}, [x4], #32
ld1 {v2.8h, v3.8h}, [x8], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
2:
ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
ext v22.16b, v0.16b, v1.16b, #2 // -stride
ext v23.16b, v2.16b, v3.16b, #2 // +stride
add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
add v2.8h, v22.8h, v23.8h // -stride, +stride
add v0.8h, v0.8h, v25.8h
mul v8.8h, v25.8h, v4.8h // * 5
mla v8.8h, v23.8h, v6.8h // * 6
ext v22.16b, v16.16b, v17.16b, #4 // -stride
ext v23.16b, v17.16b, v18.16b, #4
ext v24.16b, v19.16b, v20.16b, #4 // +stride
ext v25.16b, v20.16b, v21.16b, #4
ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
ext v27.16b, v17.16b, v18.16b, #8
ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
ext v29.16b, v20.16b, v21.16b, #8
mul v0.8h, v0.8h, v4.8h // * 5
mla v0.8h, v2.8h, v6.8h // * 6
.if \bpc == 8
ld1 {v31.8b}, [x1], #8
ld1 {v30.8b}, [x2], #8
.else
ld1 {v31.8h}, [x1], #16
ld1 {v30.8h}, [x2], #16
.endif
add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
add v17.4s, v17.4s, v27.4s
add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
add v20.4s, v20.4s, v29.4s
add v16.4s, v16.4s, v19.4s
add v17.4s, v17.4s, v20.4s
mul v9.4s, v19.4s, v5.4s // * 5
mla v9.4s, v24.4s, v7.4s // * 6
mul v10.4s, v20.4s, v5.4s // * 5
mla v10.4s, v25.4s, v7.4s // * 6
add v22.4s, v22.4s, v24.4s // -stride, +stride
add v23.4s, v23.4s, v25.4s
// This is, surprisingly, faster than other variants where the
// mul+mla pairs are further apart, on Cortex A53.
mul v16.4s, v16.4s, v5.4s // * 5
mla v16.4s, v22.4s, v7.4s // * 6
mul v17.4s, v17.4s, v5.4s // * 5
mla v17.4s, v23.4s, v7.4s // * 6
.if \bpc == 8
uxtl v31.8h, v31.8b
uxtl v30.8h, v30.8b
.endif
umlsl v16.4s, v0.4h, v31.4h // b - a * src
umlsl2 v17.4s, v0.8h, v31.8h
umlsl v9.4s, v8.4h, v30.4h // b - a * src
umlsl2 v10.4s, v8.8h, v30.8h
mov v0.16b, v1.16b
rshrn v16.4h, v16.4s, #9
rshrn2 v16.8h, v17.4s, #9
rshrn v9.4h, v9.4s, #8
rshrn2 v9.8h, v10.4s, #8
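// The odd-row sums carry half the total weight of the even row
// (2*5 + 6 = 16 vs 4*5 + 2*6 = 32), hence the smaller shift.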
subs w5, w5, #8
mov v2.16b, v3.16b
st1 {v16.8h}, [x0], #16
st1 {v9.8h}, [x10], #16
b.le 9f
mov v16.16b, v18.16b
mov v19.16b, v21.16b
ld1 {v1.8h}, [x4], #16
ld1 {v3.8h}, [x8], #16
ld1 {v17.4s, v18.4s}, [x3], #32
ld1 {v20.4s, v21.4s}, [x7], #32
b 2b
9:
ldp d14, d15, [sp, #0x30]
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x40
ret
endfunc
// void dav1d_sgr_finish_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                           const int32_t **a,
//                                           const int16_t **b,
//                                           const int w, const int h,
//                                           const int w1,
//                                           const int bitdepth_max);
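// Like finish_weighted1, but fusing the 5x5 filter from finish_filter2_2rows
// above with the w1 weighting, reading and writing two rows of dst in place.
// Roughly (sketch; iclip = clamp to [0, bitdepth_max], t the value filter2
// would have written to tmp):
//   dst[j] = iclip(dst[j] + ((t * w1 + (1 << 10)) >> 11), 0, bitdepth_max);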
function sgr_finish_weighted2_\bpc\()bpc_neon, export=1
stp d8, d9, [sp, #-0x30]!
str d10, [sp, #0x10]
stp d14, d15, [sp, #0x20]
dup v14.8h, w6
dup v15.8h, w7
ldp x2, x7, [x2]
ldp x3, x8, [x3]
cmp w5, #1
add x1, x0, x1 // src + stride
// if (h <= 1), set the pointer to the second row to any dummy buffer
// we can clobber (x2 in this case)
csel x1, x2, x1, le
movi v4.8h, #5
movi v5.4s, #5
movi v6.8h, #6
movi v7.4s, #6
1:
ld1 {v0.8h, v1.8h}, [x3], #32
ld1 {v2.8h, v3.8h}, [x8], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x2], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
2:
ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
ext v22.16b, v0.16b, v1.16b, #2 // -stride
ext v23.16b, v2.16b, v3.16b, #2 // +stride
add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
add v2.8h, v22.8h, v23.8h // -stride, +stride
add v0.8h, v0.8h, v25.8h
mul v8.8h, v25.8h, v4.8h // * 5
mla v8.8h, v23.8h, v6.8h // * 6
ext v22.16b, v16.16b, v17.16b, #4 // -stride
ext v23.16b, v17.16b, v18.16b, #4
ext v24.16b, v19.16b, v20.16b, #4 // +stride
ext v25.16b, v20.16b, v21.16b, #4
ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
ext v27.16b, v17.16b, v18.16b, #8
ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
ext v29.16b, v20.16b, v21.16b, #8
mul v0.8h, v0.8h, v4.8h // * 5
mla v0.8h, v2.8h, v6.8h // * 6
.if \bpc == 8
ld1 {v31.8b}, [x0]
ld1 {v30.8b}, [x1]
.else
ld1 {v31.8h}, [x0]
ld1 {v30.8h}, [x1]
.endif
add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
add v17.4s, v17.4s, v27.4s
add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
add v20.4s, v20.4s, v29.4s
add v16.4s, v16.4s, v19.4s
add v17.4s, v17.4s, v20.4s
mul v9.4s, v19.4s, v5.4s // * 5
mla v9.4s, v24.4s, v7.4s // * 6
mul v10.4s, v20.4s, v5.4s // * 5
mla v10.4s, v25.4s, v7.4s // * 6
add v22.4s, v22.4s, v24.4s // -stride, +stride
add v23.4s, v23.4s, v25.4s
// This is, surprisingly, faster than other variants where the
// mul+mla pairs are further apart, on Cortex A53.
mul v16.4s, v16.4s, v5.4s // * 5
mla v16.4s, v22.4s, v7.4s // * 6
mul v17.4s, v17.4s, v5.4s // * 5
mla v17.4s, v23.4s, v7.4s // * 6
.if \bpc == 8
uxtl v31.8h, v31.8b
uxtl v30.8h, v30.8b
.endif
umlsl v16.4s, v0.4h, v31.4h // b - a * src
umlsl2 v17.4s, v0.8h, v31.8h
umlsl v9.4s, v8.4h, v30.4h // b - a * src
umlsl2 v10.4s, v8.8h, v30.8h
mov v0.16b, v1.16b
rshrn v16.4h, v16.4s, #9
rshrn2 v16.8h, v17.4s, #9
rshrn v9.4h, v9.4s, #8
rshrn2 v9.8h, v10.4s, #8
subs w4, w4, #8
// weighted1
mov v2.16b, v3.16b
ld1 {v1.8h}, [x3], #16
ld1 {v3.8h}, [x8], #16
smull v22.4s, v16.4h, v14.4h // v
smull2 v23.4s, v16.8h, v14.8h
mov v16.16b, v18.16b
smull v24.4s, v9.4h, v14.4h
smull2 v25.4s, v9.8h, v14.8h
mov v19.16b, v21.16b
rshrn v22.4h, v22.4s, #11
rshrn2 v22.8h, v23.4s, #11
rshrn v23.4h, v24.4s, #11
rshrn2 v23.8h, v25.4s, #11
usqadd v31.8h, v22.8h
usqadd v30.8h, v23.8h
.if \bpc == 8
sqxtun v22.8b, v31.8h
sqxtun v23.8b, v30.8h
st1 {v22.8b}, [x0], #8
st1 {v23.8b}, [x1], #8
.else
umin v22.8h, v31.8h, v15.8h
umin v23.8h, v30.8h, v15.8h
st1 {v22.8h}, [x0], #16
st1 {v23.8h}, [x1], #16
.endif
b.le 3f
ld1 {v17.4s, v18.4s}, [x2], #32
ld1 {v20.4s, v21.4s}, [x7], #32
b 2b
3:
ldp d14, d15, [sp, #0x20]
ldr d10, [sp, #0x10]
ldp d8, d9, [sp], 0x30
ret
endfunc
// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                    const int16_t *t1, const int16_t *t2,
//                                    const int w, const int h,
//                                    const int16_t wt[2], const int bitdepth_max);
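// A rough C sketch of what this computes (names as in the prototype above;
// iclip = clamp to [0, bitdepth_max]; t1/t2 have a row stride of
// FILTER_OUT_STRIDE and two rows are handled per iteration):
//   for (int j = 0; j < w; j++) {
//       int v = t1[j] * wt[0] + t2[j] * wt[1];
//       dst[j] = iclip(dst[j] + ((v + (1 << 10)) >> 11), 0, bitdepth_max);
//   }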
function sgr_weighted2_\bpc\()bpc_neon, export=1
cmp w5, #2
add x10, x0, x1
add x12, x2, #2*FILTER_OUT_STRIDE
add x13, x3, #2*FILTER_OUT_STRIDE
ld2r {v30.8h, v31.8h}, [x6] // wt[0], wt[1]
.if \bpc == 16
dup v29.8h, w7
.endif
mov x8, #4*FILTER_OUT_STRIDE
lsl x1, x1, #1
add w9, w4, #7
bic x9, x9, #7 // Aligned width
.if \bpc == 8
sub x1, x1, x9
.else
sub x1, x1, x9, lsl #1
.endif
sub x8, x8, x9, lsl #1
mov w9, w4
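// x1 and x8 are now the increments that step dst and t1/t2 on to the next
// pair of rows once a full row (width rounded up to 8) has been consumed.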
b.lt 2f
1:
.if \bpc == 8
ld1 {v0.8b}, [x0]
ld1 {v16.8b}, [x10]
.else
ld1 {v0.8h}, [x0]
ld1 {v16.8h}, [x10]
.endif
ld1 {v1.8h}, [x2], #16
ld1 {v17.8h}, [x12], #16
ld1 {v2.8h}, [x3], #16
ld1 {v18.8h}, [x13], #16
subs w4, w4, #8
.if \bpc == 8
uxtl v0.8h, v0.8b
uxtl v16.8h, v16.8b
.endif
smull v3.4s, v1.4h, v30.4h // wt[0] * t1
smlal v3.4s, v2.4h, v31.4h // wt[1] * t2
smull2 v4.4s, v1.8h, v30.8h // wt[0] * t1
smlal2 v4.4s, v2.8h, v31.8h // wt[1] * t2
smull v19.4s, v17.4h, v30.4h // wt[0] * t1
smlal v19.4s, v18.4h, v31.4h // wt[1] * t2
smull2 v20.4s, v17.8h, v30.8h // wt[0] * t1
smlal2 v20.4s, v18.8h, v31.8h // wt[1] * t2
rshrn v3.4h, v3.4s, #11
rshrn2 v3.8h, v4.4s, #11
rshrn v19.4h, v19.4s, #11
rshrn2 v19.8h, v20.4s, #11
usqadd v0.8h, v3.8h
usqadd v16.8h, v19.8h
.if \bpc == 8
sqxtun v3.8b, v0.8h
sqxtun v19.8b, v16.8h
st1 {v3.8b}, [x0], #8
st1 {v19.8b}, [x10], #8
.else
umin v3.8h, v0.8h, v29.8h
umin v19.8h, v16.8h, v29.8h
st1 {v3.8h}, [x0], #16
st1 {v19.8h}, [x10], #16
.endif
b.gt 1b
subs w5, w5, #2
cmp w5, #1
b.lt 0f
mov w4, w9
add x0, x0, x1
add x10, x10, x1
add x2, x2, x8
add x12, x12, x8
add x3, x3, x8
add x13, x13, x8
b.eq 2f
b 1b
2:
.if \bpc == 8
ld1 {v0.8b}, [x0]
.else
ld1 {v0.8h}, [x0]
.endif
ld1 {v1.8h}, [x2], #16
ld1 {v2.8h}, [x3], #16
subs w4, w4, #8
.if \bpc == 8
uxtl v0.8h, v0.8b
.endif
smull v3.4s, v1.4h, v30.4h // wt[0] * t1
smlal v3.4s, v2.4h, v31.4h // wt[1] * t2
smull2 v4.4s, v1.8h, v30.8h // wt[0] * t1
smlal2 v4.4s, v2.8h, v31.8h // wt[1] * t2
rshrn v3.4h, v3.4s, #11
rshrn2 v3.8h, v4.4s, #11
usqadd v0.8h, v3.8h
.if \bpc == 8
sqxtun v3.8b, v0.8h
st1 {v3.8b}, [x0], #8
.else
umin v3.8h, v0.8h, v29.8h
st1 {v3.8h}, [x0], #16
.endif
b.gt 2b
0:
ret
endfunc
.endm