/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#define FILTER_OUT_STRIDE 384
.macro sgr_funcs bpc
// void dav1d_sgr_finish_filter1_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t src_stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
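// A rough C sketch of what this computes, as read from the code below (row
// and index names are illustrative, not dav1d's; following the comments in
// the loop, "a" is the weighted sum of the int16_t *b rows and "b" the
// weighted sum of the int32_t *a rows):
//   for (int j = 0; j < w; j++) {
//       int a = (b0[j+1] + b1[j] + b1[j+1] + b1[j+2] + b2[j+1]) * 4 +
//               (b0[j] + b0[j+2] + b2[j] + b2[j+2]) * 3;
//       int b = (a0[j+1] + a1[j] + a1[j+1] + a1[j+2] + a2[j+1]) * 4 +
//               (a0[j] + a0[j+2] + a2[j] + a2[j+2]) * 3;
//       tmp[j] = (b - a * src[j] + (1 << 8)) >> 9;
//   }
// The second output row (tmp + FILTER_OUT_STRIDE) repeats this with rows
// 1..3 of *a/*b and the second source row.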
function sgr_finish_filter1_2rows_\bpc\()bpc_neon, export=1
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]
ldp x7, x8, [x3]
ldp x9, x3, [x3, #16]
ldp x10, x11, [x4]
ldp x12, x4, [x4, #16]
mov x13, #FILTER_OUT_STRIDE
cmp w6, #1
add x2, x1, x2 // src + stride
csel x2, x1, x2, le // if (h <= 1) x2 = x1
add x13, x0, x13, lsl #1
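// x0 writes the first output row and x13 the second, FILTER_OUT_STRIDE
// apart; x1/x2 read the two source rows (x2 aliases x1 when h <= 1).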
movi v30.8h, #3
movi v31.4s, #3
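// 3 is the corner weight; the centre cross gets weight 4 via shl #2 below.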
1:
ld1 {v0.8h, v1.8h}, [x10], #32
ld1 {v2.8h, v3.8h}, [x11], #32
ld1 {v4.8h, v5.8h}, [x12], #32
ld1 {v6.8h, v7.8h}, [x4], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48
ld1 {v22.4s, v23.4s, v24.4s}, [x9], #48
ld1 {v25.4s, v26.4s, v27.4s}, [x3], #48
2:
ext v8.16b, v0.16b, v1.16b, #2 // [0][1]
ext v9.16b, v2.16b, v3.16b, #2 // [1][1]
ext v10.16b, v4.16b, v5.16b, #2 // [2][1]
ext v11.16b, v0.16b, v1.16b, #4 // [0][2]
ext v12.16b, v2.16b, v3.16b, #4 // [1][2]
ext v13.16b, v4.16b, v5.16b, #4 // [2][2]
add v14.8h, v2.8h, v8.8h // [1][0] + [0][1]
add v15.8h, v9.8h, v10.8h // [1][1] + [2][1]
add v28.8h, v0.8h, v11.8h // [0][0] + [0][2]
add v14.8h, v14.8h, v12.8h // () + [1][2]
add v29.8h, v4.8h, v13.8h // [2][0] + [2][2]
ext v8.16b, v6.16b, v7.16b, #2 // [3][1]
ext v11.16b, v6.16b, v7.16b, #4 // [3][2]
add v14.8h, v14.8h, v15.8h // mid
add v15.8h, v28.8h, v29.8h // corners
add v28.8h, v4.8h, v9.8h // [2][0] + [1][1]
add v29.8h, v10.8h, v8.8h // [2][1] + [3][1]
add v2.8h, v2.8h, v12.8h // [1][0] + [1][2]
add v28.8h, v28.8h, v13.8h // () + [2][2]
add v4.8h, v6.8h, v11.8h // [3][0] + [3][2]
add v0.8h, v28.8h, v29.8h // mid
add v2.8h, v2.8h, v4.8h // corners
shl v4.8h, v14.8h, #2
mla v4.8h, v15.8h, v30.8h // * 3 -> a
shl v0.8h, v0.8h, #2
mla v0.8h, v2.8h, v30.8h // * 3 -> a
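// v4/v0 now hold the weighted 16-bit sums ("a") for the two output rows; the
// block below repeats the same 3x3 pattern on the 32-bit rows to build "b".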
ext v8.16b, v16.16b, v17.16b, #4 // [0][1]
ext v9.16b, v17.16b, v18.16b, #4
ext v10.16b, v16.16b, v17.16b, #8 // [0][2]
ext v11.16b, v17.16b, v18.16b, #8
ext v12.16b, v19.16b, v20.16b, #4 // [1][1]
ext v13.16b, v20.16b, v21.16b, #4
add v8.4s, v8.4s, v19.4s // [0][1] + [1][0]
add v9.4s, v9.4s, v20.4s
add v16.4s, v16.4s, v10.4s // [0][0] + [0][2]
add v17.4s, v17.4s, v11.4s
ext v14.16b, v19.16b, v20.16b, #8 // [1][2]
ext v15.16b, v20.16b, v21.16b, #8
add v16.4s, v16.4s, v22.4s // () + [2][0]
add v17.4s, v17.4s, v23.4s
add v28.4s, v12.4s, v14.4s // [1][1] + [1][2]
add v29.4s, v13.4s, v15.4s
ext v10.16b, v22.16b, v23.16b, #4 // [2][1]
ext v11.16b, v23.16b, v24.16b, #4
add v8.4s, v8.4s, v28.4s // mid (incomplete)
add v9.4s, v9.4s, v29.4s
add v19.4s, v19.4s, v14.4s // [1][0] + [1][2]
add v20.4s, v20.4s, v15.4s
add v14.4s, v22.4s, v12.4s // [2][0] + [1][1]
add v15.4s, v23.4s, v13.4s
ext v12.16b, v22.16b, v23.16b, #8 // [2][2]
ext v13.16b, v23.16b, v24.16b, #8
ext v28.16b, v25.16b, v26.16b, #4 // [3][1]
ext v29.16b, v26.16b, v27.16b, #4
add v8.4s, v8.4s, v10.4s // () + [2][1] = mid
add v9.4s, v9.4s, v11.4s
add v14.4s, v14.4s, v10.4s // () + [2][1]
add v15.4s, v15.4s, v11.4s
ext v10.16b, v25.16b, v26.16b, #8 // [3][2]
ext v11.16b, v26.16b, v27.16b, #8
add v16.4s, v16.4s, v12.4s // () + [2][2] = corner
add v17.4s, v17.4s, v13.4s
add v12.4s, v12.4s, v28.4s // [2][2] + [3][1]
add v13.4s, v13.4s, v29.4s
add v25.4s, v25.4s, v10.4s // [3][0] + [3][2]
add v26.4s, v26.4s, v11.4s
add v14.4s, v14.4s, v12.4s // mid
add v15.4s, v15.4s, v13.4s
add v19.4s, v19.4s, v25.4s // corner
add v20.4s, v20.4s, v26.4s
.if \bpc == 8
ld1 {v25.8b}, [x1], #8 // src
ld1 {v26.8b}, [x2], #8
.else
ld1 {v25.8h}, [x1], #16 // src
ld1 {v26.8h}, [x2], #16
.endif
shl v8.4s, v8.4s, #2
shl v9.4s, v9.4s, #2
mla v8.4s, v16.4s, v31.4s // * 3 -> b
mla v9.4s, v17.4s, v31.4s
.if \bpc == 8
uxtl v25.8h, v25.8b // src
uxtl v26.8h, v26.8b
.endif
shl v14.4s, v14.4s, #2
shl v15.4s, v15.4s, #2
mla v14.4s, v19.4s, v31.4s // * 3 -> b
mla v15.4s, v20.4s, v31.4s
umlsl v8.4s, v4.4h, v25.4h // b - a * src
umlsl2 v9.4s, v4.8h, v25.8h
umlsl v14.4s, v0.4h, v26.4h // b - a * src
umlsl2 v15.4s, v0.8h, v26.8h
mov v0.16b, v1.16b
rshrn v8.4h, v8.4s, #9
rshrn2 v8.8h, v9.4s, #9
mov v2.16b, v3.16b
rshrn v14.4h, v14.4s, #9
rshrn2 v14.8h, v15.4s, #9
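// The rounded narrowing shifts produce (b - a * src + (1 << 8)) >> 9 as int16.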
subs w5, w5, #8
mov v4.16b, v5.16b
st1 {v8.8h}, [x0], #16
mov v6.16b, v7.16b
st1 {v14.8h}, [x13], #16
b.le 3f
mov v16.16b, v18.16b
mov v19.16b, v21.16b
mov v22.16b, v24.16b
mov v25.16b, v27.16b
ld1 {v1.8h}, [x10], #16
ld1 {v3.8h}, [x11], #16
ld1 {v5.8h}, [x12], #16
ld1 {v7.8h}, [x4], #16
ld1 {v17.4s, v18.4s}, [x7], #32
ld1 {v20.4s, v21.4s}, [x8], #32
ld1 {v23.4s, v24.4s}, [x9], #32
ld1 {v26.4s, v27.4s}, [x3], #32
b 2b
3:
ldp d14, d15, [sp, #0x30]
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x40
ret
endfunc
// void dav1d_sgr_finish_weighted1_Xbpc_neon(pixel *dst,
//                                           const int32_t **a, const int16_t **b,
//                                           const int w, const int w1,
//                                           const int bitdepth_max);
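// This fuses the 3x3 filter above with the final w1 weighting, reading and
// writing dst in place. A rough C sketch (names illustrative; iclip stands
// for clamping to [0, bitdepth_max], and t is the value filter1 would have
// written to tmp):
//   for (int j = 0; j < w; j++) {
//       int t = (b - a * dst[j] + (1 << 8)) >> 9;
//       dst[j] = iclip(dst[j] + ((t * w1 + (1 << 10)) >> 11), 0, bitdepth_max);
//   }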
function sgr_finish_weighted1_\bpc\()bpc_neon, export=1
ldp x7, x8, [x1]
ldr x1, [x1, #16]
ldp x9, x10, [x2]
ldr x2, [x2, #16]
dup v31.8h, w4
dup v30.8h, w5
movi v6.8h, #3
movi v7.4s, #3
1:
ld1 {v0.8h, v1.8h}, [x9], #32
ld1 {v2.8h, v3.8h}, [x10], #32
ld1 {v4.8h, v5.8h}, [x2], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48
ld1 {v22.4s, v23.4s, v24.4s}, [x1], #48
2:
ext v25.16b, v0.16b, v1.16b, #2 // -stride
ext v26.16b, v2.16b, v3.16b, #2 // 0
ext v27.16b, v4.16b, v5.16b, #2 // +stride
ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
ext v29.16b, v2.16b, v3.16b, #4 // +1
add v2.8h, v2.8h, v25.8h // -1, -stride
ext v25.16b, v4.16b, v5.16b, #4 // +1+stride
add v26.8h, v26.8h, v27.8h // 0, +stride
add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
add v2.8h, v2.8h, v26.8h
add v4.8h, v4.8h, v25.8h // -1+stride, +1+stride
add v2.8h, v2.8h, v29.8h // +1
add v0.8h, v0.8h, v4.8h
ext v25.16b, v16.16b, v17.16b, #4 // -stride
ext v26.16b, v17.16b, v18.16b, #4
shl v2.8h, v2.8h, #2
ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
ext v28.16b, v17.16b, v18.16b, #8
ext v29.16b, v19.16b, v20.16b, #4 // 0
ext v4.16b, v20.16b, v21.16b, #4
mla v2.8h, v0.8h, v6.8h // * 3 -> a
add v25.4s, v25.4s, v19.4s // -stride, -1
add v26.4s, v26.4s, v20.4s
add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride
add v17.4s, v17.4s, v28.4s
ext v27.16b, v19.16b, v20.16b, #8 // +1
ext v28.16b, v20.16b, v21.16b, #8
add v16.4s, v16.4s, v22.4s // -1+stride
add v17.4s, v17.4s, v23.4s
add v29.4s, v29.4s, v27.4s // 0, +1
add v4.4s, v4.4s, v28.4s
add v25.4s, v25.4s, v29.4s
add v26.4s, v26.4s, v4.4s
ext v27.16b, v22.16b, v23.16b, #4 // +stride
ext v28.16b, v23.16b, v24.16b, #4
ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
ext v4.16b, v23.16b, v24.16b, #8
.if \bpc == 8
ld1 {v19.8b}, [x0] // src
.else
ld1 {v19.8h}, [x0] // src
.endif
add v25.4s, v25.4s, v27.4s // +stride
add v26.4s, v26.4s, v28.4s
add v16.4s, v16.4s, v29.4s // +1+stride
add v17.4s, v17.4s, v4.4s
shl v25.4s, v25.4s, #2
shl v26.4s, v26.4s, #2
mla v25.4s, v16.4s, v7.4s // * 3 -> b
mla v26.4s, v17.4s, v7.4s
.if \bpc == 8
uxtl v19.8h, v19.8b // src
.endif
mov v0.16b, v1.16b
umlsl v25.4s, v2.4h, v19.4h // b - a * src
umlsl2 v26.4s, v2.8h, v19.8h
mov v2.16b, v3.16b
rshrn v25.4h, v25.4s, #9
rshrn2 v25.8h, v26.4s, #9
subs w3, w3, #8
// weighted1
mov v4.16b, v5.16b
ld1 {v1.8h}, [x9], #16
ld1 {v3.8h}, [x10], #16
smull v26.4s, v25.4h, v31.4h // v = t1 * w1
smull2 v27.4s, v25.8h, v31.8h
ld1 {v5.8h}, [x2], #16
rshrn v26.4h, v26.4s, #11
rshrn2 v26.8h, v27.4s, #11
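// v26 = (t * w1 + (1 << 10)) >> 11, with t the filtered value in v25; usqadd
// adds this signed correction to the unsigned source pixels with saturation,
// before the bitdepth handling below.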
usqadd v19.8h, v26.8h
.if \bpc == 8
mov v16.16b, v18.16b
sqxtun v26.8b, v19.8h
mov v19.16b, v21.16b
mov v22.16b, v24.16b
st1 {v26.8b}, [x0], #8
.else
mov v16.16b, v18.16b
umin v26.8h, v19.8h, v30.8h
mov v19.16b, v21.16b
mov v22.16b, v24.16b
st1 {v26.8h}, [x0], #16
.endif
b.le 3f
ld1 {v17.4s, v18.4s}, [x7], #32
ld1 {v20.4s, v21.4s}, [x8], #32
ld1 {v23.4s, v24.4s}, [x1], #32
b 2b
3:
ret
endfunc
// void dav1d_sgr_finish_filter2_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
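// In the 5x5 variant the four diagonal samples get weight 5 and the vertical
// neighbours weight 6. A rough C sketch of the two output rows, as read from
// the code below (names illustrative; a0/a1 and b0/b1 are the two rows of
// *a/*b, src2 the second source row):
//   even row:
//     a = (b0[j] + b0[j+2] + b1[j] + b1[j+2]) * 5 + (b0[j+1] + b1[j+1]) * 6;
//     b = (a0[j] + a0[j+2] + a1[j] + a1[j+2]) * 5 + (a0[j+1] + a1[j+1]) * 6;
//     tmp[j] = (b - a * src[j] + (1 << 8)) >> 9;
//   odd row (tmp + FILTER_OUT_STRIDE), using only the lower a/b row:
//     a = (b1[j] + b1[j+2]) * 5 + b1[j+1] * 6;
//     b = (a1[j] + a1[j+2]) * 5 + a1[j+1] * 6;
//     tmp[j] = (b - a * src2[j] + (1 << 7)) >> 8;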
function sgr_finish_filter2_2rows_\bpc\()bpc_neon, export=1
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]
ldp x3, x7, [x3]
ldp x4, x8, [x4]
mov x10, #FILTER_OUT_STRIDE
cmp w6, #1
add x2, x1, x2 // src + stride
csel x2, x1, x2, le // if (h <= 1) x2 = x1
add x10, x0, x10, lsl #1
movi v4.8h, #5
movi v5.4s, #5
movi v6.8h, #6
movi v7.4s, #6
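// 5 is the weight for the diagonal sums, 6 for the vertical neighbours.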
1:
ld1 {v0.8h, v1.8h}, [x4], #32
ld1 {v2.8h, v3.8h}, [x8], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
2:
ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
ext v22.16b, v0.16b, v1.16b, #2 // -stride
ext v23.16b, v2.16b, v3.16b, #2 // +stride
add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
add v2.8h, v22.8h, v23.8h // -stride, +stride
add v0.8h, v0.8h, v25.8h
mul v8.8h, v25.8h, v4.8h // * 5
mla v8.8h, v23.8h, v6.8h // * 6
ext v22.16b, v16.16b, v17.16b, #4 // -stride
ext v23.16b, v17.16b, v18.16b, #4
ext v24.16b, v19.16b, v20.16b, #4 // +stride
ext v25.16b, v20.16b, v21.16b, #4
ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
ext v27.16b, v17.16b, v18.16b, #8
ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
ext v29.16b, v20.16b, v21.16b, #8
mul v0.8h, v0.8h, v4.8h // * 5
mla v0.8h, v2.8h, v6.8h // * 6
.if \bpc == 8
ld1 {v31.8b}, [x1], #8
ld1 {v30.8b}, [x2], #8
.else
ld1 {v31.8h}, [x1], #16
ld1 {v30.8h}, [x2], #16
.endif
add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
add v17.4s, v17.4s, v27.4s
add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
add v20.4s, v20.4s, v29.4s
add v16.4s, v16.4s, v19.4s
add v17.4s, v17.4s, v20.4s
mul v9.4s, v19.4s, v5.4s // * 5
mla v9.4s, v24.4s, v7.4s // * 6
mul v10.4s, v20.4s, v5.4s // * 5
mla v10.4s, v25.4s, v7.4s // * 6
add v22.4s, v22.4s, v24.4s // -stride, +stride
add v23.4s, v23.4s, v25.4s
// This is, surprisingly, faster than other variants where the
// mul+mla pairs are further apart, on Cortex A53.
mul v16.4s, v16.4s, v5.4s // * 5
mla v16.4s, v22.4s, v7.4s // * 6
mul v17.4s, v17.4s, v5.4s // * 5
mla v17.4s, v23.4s, v7.4s // * 6
.if \bpc == 8
uxtl v31.8h, v31.8b
uxtl v30.8h, v30.8b
.endif
umlsl v16.4s, v0.4h, v31.4h // b - a * src
umlsl2 v17.4s, v0.8h, v31.8h
umlsl v9.4s, v8.4h, v30.4h // b - a * src
umlsl2 v10.4s, v8.8h, v30.8h
mov v0.16b, v1.16b
rshrn v16.4h, v16.4s, #9
rshrn2 v16.8h, v17.4s, #9
rshrn v9.4h, v9.4s, #8
rshrn2 v9.8h, v10.4s, #8
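// The odd-row sums carry half the total weight of the even row
// (2*5 + 6 = 16 vs 4*5 + 2*6 = 32), hence the smaller shift.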
subs w5, w5, #8
mov v2.16b, v3.16b
st1 {v16.8h}, [x0], #16
st1 {v9.8h}, [x10], #16
b.le 9f
mov v16.16b, v18.16b
mov v19.16b, v21.16b
ld1 {v1.8h}, [x4], #16
ld1 {v3.8h}, [x8], #16
ld1 {v17.4s, v18.4s}, [x3], #32
ld1 {v20.4s, v21.4s}, [x7], #32
b 2b
9:
ldp d14, d15, [sp, #0x30]
ldp d12, d13, [sp, #0x20]
ldp d10, d11, [sp, #0x10]
ldp d8, d9, [sp], 0x40
ret
endfunc
// void dav1d_sgr_finish_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                           const int32_t **a,
//                                           const int16_t **b,
//                                           const int w, const int h,
//                                           const int w1,
//                                           const int bitdepth_max);
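// Like finish_weighted1, but fusing the 5x5 filter from finish_filter2_2rows
// above with the w1 weighting, reading and writing two rows of dst in place.
// Roughly (sketch; iclip = clamp to [0, bitdepth_max], t the value filter2
// would have written to tmp):
//   dst[j] = iclip(dst[j] + ((t * w1 + (1 << 10)) >> 11), 0, bitdepth_max);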
function sgr_finish_weighted2_\bpc\()bpc_neon, export=1
stp d8, d9, [sp, #-0x30]!
str d10, [sp, #0x10]
stp d14, d15, [sp, #0x20]
dup v14.8h, w6
dup v15.8h, w7
ldp x2, x7, [x2]
ldp x3, x8, [x3]
cmp w5, #1
add x1, x0, x1 // src + stride
// if (h <= 1), set the pointer to the second row to any dummy buffer
// we can clobber (x2 in this case)
csel x1, x2, x1, le
movi v4.8h, #5
movi v5.4s, #5
movi v6.8h, #6
movi v7.4s, #6
1:
ld1 {v0.8h, v1.8h}, [x3], #32
ld1 {v2.8h, v3.8h}, [x8], #32
ld1 {v16.4s, v17.4s, v18.4s}, [x2], #48
ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
2:
ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
ext v22.16b, v0.16b, v1.16b, #2 // -stride
ext v23.16b, v2.16b, v3.16b, #2 // +stride
add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
add v2.8h, v22.8h, v23.8h // -stride, +stride
add v0.8h, v0.8h, v25.8h
mul v8.8h, v25.8h, v4.8h // * 5
mla v8.8h, v23.8h, v6.8h // * 6
ext v22.16b, v16.16b, v17.16b, #4 // -stride
ext v23.16b, v17.16b, v18.16b, #4
ext v24.16b, v19.16b, v20.16b, #4 // +stride
ext v25.16b, v20.16b, v21.16b, #4
ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
ext v27.16b, v17.16b, v18.16b, #8
ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
ext v29.16b, v20.16b, v21.16b, #8
mul v0.8h, v0.8h, v4.8h // * 5
mla v0.8h, v2.8h, v6.8h // * 6
.if \bpc == 8
ld1 {v31.8b}, [x0]
ld1 {v30.8b}, [x1]
.else
ld1 {v31.8h}, [x0]
ld1 {v30.8h}, [x1]
.endif
add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
add v17.4s, v17.4s, v27.4s
add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
add v20.4s, v20.4s, v29.4s
add v16.4s, v16.4s, v19.4s
add v17.4s, v17.4s, v20.4s
mul v9.4s, v19.4s, v5.4s // * 5
mla v9.4s, v24.4s, v7.4s // * 6
mul v10.4s, v20.4s, v5.4s // * 5
mla v10.4s, v25.4s, v7.4s // * 6
add v22.4s, v22.4s, v24.4s // -stride, +stride
add v23.4s, v23.4s, v25.4s
// This is, surprisingly, faster than other variants where the
// mul+mla pairs are further apart, on Cortex A53.
mul v16.4s, v16.4s, v5.4s // * 5
mla v16.4s, v22.4s, v7.4s // * 6
mul v17.4s, v17.4s, v5.4s // * 5
mla v17.4s, v23.4s, v7.4s // * 6
.if \bpc == 8
uxtl v31.8h, v31.8b
uxtl v30.8h, v30.8b
.endif
umlsl v16.4s, v0.4h, v31.4h // b - a * src
umlsl2 v17.4s, v0.8h, v31.8h
umlsl v9.4s, v8.4h, v30.4h // b - a * src
umlsl2 v10.4s, v8.8h, v30.8h
mov v0.16b, v1.16b
rshrn v16.4h, v16.4s, #9
rshrn2 v16.8h, v17.4s, #9
rshrn v9.4h, v9.4s, #8
rshrn2 v9.8h, v10.4s, #8
subs w4, w4, #8
// weighted1
mov v2.16b, v3.16b
ld1 {v1.8h}, [x3], #16
ld1 {v3.8h}, [x8], #16
smull v22.4s, v16.4h, v14.4h // v
smull2 v23.4s, v16.8h, v14.8h
mov v16.16b, v18.16b
smull v24.4s, v9.4h, v14.4h
smull2 v25.4s, v9.8h, v14.8h
mov v19.16b, v21.16b
rshrn v22.4h, v22.4s, #11
rshrn2 v22.8h, v23.4s, #11
rshrn v23.4h, v24.4s, #11
rshrn2 v23.8h, v25.4s, #11
usqadd v31.8h, v22.8h
usqadd v30.8h, v23.8h
.if \bpc == 8
sqxtun v22.8b, v31.8h
sqxtun v23.8b, v30.8h
st1 {v22.8b}, [x0], #8
st1 {v23.8b}, [x1], #8
.else
umin v22.8h, v31.8h, v15.8h
umin v23.8h, v30.8h, v15.8h
st1 {v22.8h}, [x0], #16
st1 {v23.8h}, [x1], #16
.endif
b.le 3f
ld1 {v17.4s, v18.4s}, [x2], #32
ld1 {v20.4s, v21.4s}, [x7], #32
b 2b
3:
ldp d14, d15, [sp, #0x20]
ldr d10, [sp, #0x10]
ldp d8, d9, [sp], 0x30
ret
endfunc
// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                    const int16_t *t1, const int16_t *t2,
//                                    const int w, const int h,
//                                    const int16_t wt[2], const int bitdepth_max);
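// A rough C sketch of what this computes (names as in the prototype above;
// iclip = clamp to [0, bitdepth_max]; t1/t2 have a row stride of
// FILTER_OUT_STRIDE and two rows are handled per iteration):
//   for (int j = 0; j < w; j++) {
//       int v = t1[j] * wt[0] + t2[j] * wt[1];
//       dst[j] = iclip(dst[j] + ((v + (1 << 10)) >> 11), 0, bitdepth_max);
//   }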
function sgr_weighted2_\bpc\()bpc_neon, export=1
cmp w5, #2
add x10, x0, x1
add x12, x2, #2*FILTER_OUT_STRIDE
add x13, x3, #2*FILTER_OUT_STRIDE
ld2r {v30.8h, v31.8h}, [x6] // wt[0], wt[1]
.if \bpc == 16
dup v29.8h, w7
.endif
mov x8, #4*FILTER_OUT_STRIDE
lsl x1, x1, #1
add w9, w4, #7
bic x9, x9, #7 // Aligned width
.if \bpc == 8
sub x1, x1, x9
.else
sub x1, x1, x9, lsl #1
.endif
sub x8, x8, x9, lsl #1
mov w9, w4
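// x1 and x8 are now the increments that step dst and t1/t2 on to the next
// pair of rows once a full row (width rounded up to 8) has been consumed.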
b.lt 2f
1:
.if \bpc == 8
ld1 {v0.8b}, [x0]
ld1 {v16.8b}, [x10]
.else
ld1 {v0.8h}, [x0]
ld1 {v16.8h}, [x10]
.endif
ld1 {v1.8h}, [x2], #16
ld1 {v17.8h}, [x12], #16
ld1 {v2.8h}, [x3], #16
ld1 {v18.8h}, [x13], #16
subs w4, w4, #8
.if \bpc == 8
uxtl v0.8h, v0.8b
uxtl v16.8h, v16.8b
.endif
smull v3.4s, v1.4h, v30.4h // wt[0] * t1
smlal v3.4s, v2.4h, v31.4h // wt[1] * t2
smull2 v4.4s, v1.8h, v30.8h // wt[0] * t1
smlal2 v4.4s, v2.8h, v31.8h // wt[1] * t2
smull v19.4s, v17.4h, v30.4h // wt[0] * t1
smlal v19.4s, v18.4h, v31.4h // wt[1] * t2
smull2 v20.4s, v17.8h, v30.8h // wt[0] * t1
smlal2 v20.4s, v18.8h, v31.8h // wt[1] * t2
rshrn v3.4h, v3.4s, #11
rshrn2 v3.8h, v4.4s, #11
rshrn v19.4h, v19.4s, #11
rshrn2 v19.8h, v20.4s, #11
usqadd v0.8h, v3.8h
usqadd v16.8h, v19.8h
.if \bpc == 8
sqxtun v3.8b, v0.8h
sqxtun v19.8b, v16.8h
st1 {v3.8b}, [x0], #8
st1 {v19.8b}, [x10], #8
.else
umin v3.8h, v0.8h, v29.8h
umin v19.8h, v16.8h, v29.8h
st1 {v3.8h}, [x0], #16
st1 {v19.8h}, [x10], #16
.endif
b.gt 1b
subs w5, w5, #2
cmp w5, #1
b.lt 0f
mov w4, w9
add x0, x0, x1
add x10, x10, x1
add x2, x2, x8
add x12, x12, x8
add x3, x3, x8
add x13, x13, x8
b.eq 2f
b 1b
2:
.if \bpc == 8
ld1 {v0.8b}, [x0]
.else
ld1 {v0.8h}, [x0]
.endif
ld1 {v1.8h}, [x2], #16
ld1 {v2.8h}, [x3], #16
subs w4, w4, #8
.if \bpc == 8
uxtl v0.8h, v0.8b
.endif
smull v3.4s, v1.4h, v30.4h // wt[0] * t1
smlal v3.4s, v2.4h, v31.4h // wt[1] * t2
smull2 v4.4s, v1.8h, v30.8h // wt[0] * t1
smlal2 v4.4s, v2.8h, v31.8h // wt[1] * t2
rshrn v3.4h, v3.4s, #11
rshrn2 v3.8h, v4.4s, #11
usqadd v0.8h, v3.8h
.if \bpc == 8
sqxtun v3.8b, v0.8h
st1 {v3.8b}, [x0], #8
.else
umin v3.8h, v0.8h, v29.8h
st1 {v3.8h}, [x0], #16
.endif
b.gt 2b
0:
ret
endfunc
.endm