; Copyright © 2020, VideoLAN and dav1d authors
; Copyright © 2020, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 64
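; note: the spel_h_shuf* masks below gather overlapping pairs of adjacent
; pixels (words) so that vpdpwssd can apply two horizontal filter taps per
; dword accumulate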
spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41
spel_h_shufC: db 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17
db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49
db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25
db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57
spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45
spel_h_shufD: db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21
db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53
db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29
db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61
spel_v_shuf8: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
spel_v_shuf16: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
prep_endA: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
prep_endB: db 1, 2, 5, 6, 9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46
db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62
db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110
db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126
prep_endC: db 1, 2, 5, 6, 9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78
db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94
db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110
db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126
spel_shuf4a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46
db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
db 49, 50, 65, 66, 53, 54, 69, 70, 57, 58, 73, 74, 61, 62, 77, 78
spel_shuf4b: db 50, 51, 65, 66, 54, 55, 69, 70, 58, 59, 73, 74, 62, 63, 77, 78
db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
db 81, 82, 97, 98, 85, 86,101,102, 89, 90,105,106, 93, 94,109,110
db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
spel_shuf8a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78
db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110
spel_shuf8b: db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78
db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110
db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
spel_shuf16: db 1, 2, 33, 34, 5, 6, 37, 38, 9, 10, 41, 42, 13, 14, 45, 46
db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62
db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110
db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126
spel_shuf32: db 1, 2, 65, 66, 5, 6, 69, 70, 9, 10, 73, 74, 13, 14, 77, 78
db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94
db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110
db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126
spel_h_shuf2b: db 1, 2, 17, 18, 5, 6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38
db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50, 9, 10, 53, 54, 13, 14
db 9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46
spel_shuf2: db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30
spel_h_shuf2a: db 0, 1, 2, 3, 2, 3, 4, 5, 16, 17, 18, 19, 18, 19, 20, 21
db 4, 5, 6, 7, 6, 7, 8, 9, 20, 21, 22, 23, 22, 23, 24, 25
w_mask_end42x: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
w_mask_end444: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
w_mask_shuf4: db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94
db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126
w_mask_shuf8: db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30
db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94
db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126
w_mask_shuf16: db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46
db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110
db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126
warp8x8_permA: db 0, 1, 2, 3, 32, 33, 34, 35, 2, 3, 4, 5, 34, 35, 36, 37
db 4, 5, 6, 7, 36, 37, 38, 39, 6, 7, 8, 9, 38, 39, 40, 41
db 8, 9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45
db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53
db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57
db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61
warp8x8_end: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
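; deint_q_shuf carries no data of its own: the commented-out dq values are
; realized by the constants that follow, since vpermq only reads the low 3
; bits of each qword index (the stray dd 1 / dd 3 entries exist so that
; qwords 4 and 5 come out right)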
deint_q_shuf: ;dq 0, 2, 4, 6, 1, 3, 5, 7
pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7
dd 1
pw_2048: times 2 dw 2048
dd 3
pw_8192: times 2 dw 8192
avg_shift: dw 5, 5, 3, 3
pw_27615: times 2 dw 27615
pw_32766: times 2 dw 32766
warp8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13
warp8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15
warp_shift_h: db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
resize_permA: dd 0, 4, 8, 12, 1, 5, 9, 13, 16, 20, 24, 28, 17, 21, 25, 29
resize_permB: dd 2, 6, 10, 14, 3, 7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31
resize_permC: dq 0, 1, 4, 5, 8, 9, 12, 13
resize_permD: dq 2, 3, 6, 7, 10, 11, 14, 15
resize_permE: dq 0, 2, 4, 6
resize_shufA: db -1, 0, -1, 1, -1, 4, -1, 5, -1, 8, -1, 9, -1, 12, -1, 13
resize_shufB: db -1, 2, -1, 3, -1, 6, -1, 7, -1, 10, -1, 11, -1, 14, -1, 15
rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
prep_hv_shift: dq 6, 4
put_bilin_h_rnd: dw 8, 8, 10, 10
prep_mul: dw 16, 16, 4, 4
put_8tap_h_rnd: dd 34, 40
prep_8tap_rnd: dd 128 - (8192 << 8)
warp_8x8_rnd_h: dd 512, 2048
warp_8x8_rnd_v: dd 262144, 65536
warp_8x8t_rnd_v: dd 16384 - (8192 << 15)
avg_round: dw -16400, -16400, -16388, -16388
w_avg_round: dd 128 + (8192 << 4), 32 + (8192 << 4)
mask_round: dd 512 + (8192 << 6), 128 + (8192 << 6)
w_mask_round: dd 128, 64
bidir_shift: dw 6, 6, 4, 4
pb_64: times 4 db 64
pw_m512: times 2 dw -512
pw_2: times 2 dw 2
pw_64: times 2 dw 64
pd_32: dd 32
pd_63: dd 63
pd_128: dd 128
pd_640: dd 640
pd_2176: dd 2176
pd_16384: dd 16384
pd_0_4: dd 0, 4
%define pw_16 prep_mul
%define pd_512 warp_8x8_rnd_h
%macro BASE_JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - %3)
%xdefine %%base %1_%2
%%table:
%rep %0 - 2
dw %%base %+ _w%3 - %%base
%rotate 1
%endrep
%endmacro
%macro HV_JMP_TABLE 5-*
%xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
%xdefine %%base %1_%3
%assign %%types %4
%if %%types & 1
%xdefine %1_%2_h_%3_table (%%h - %5)
%%h:
%rep %0 - 4
dw %%prefix %+ .h_w%5 - %%base
%rotate 1
%endrep
%rotate 4
%endif
%if %%types & 2
%xdefine %1_%2_v_%3_table (%%v - %5)
%%v:
%rep %0 - 4
dw %%prefix %+ .v_w%5 - %%base
%rotate 1
%endrep
%rotate 4
%endif
%if %%types & 4
%xdefine %1_%2_hv_%3_table (%%hv - %5)
%%hv:
%rep %0 - 4
dw %%prefix %+ .hv_w%5 - %%base
%rotate 1
%endrep
%endif
%endmacro
%macro BIDIR_JMP_TABLE 2-*
%xdefine %1_%2_table (%%table - 2*%3)
%xdefine %%base %1_%2_table
%xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
%rep %0 - 2
dd %%prefix %+ .w%3 - %%base
%rotate 1
%endrep
%endmacro
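; the jump tables store per-width entry-point offsets (dw for BASE/HV, dd for
; BIDIR) relative to a base symbol; the dispatchers index them with tzcnt of
; the block width and add the base pointer back before jumping (see .put/.prep)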
%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put)
%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep)
BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 6tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 8tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 6tap, avx512icl, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, avx512icl, 2, 4, 8, 16, 32, 64, 128
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
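; mx/my fractional positions are 1..15 while each 15-entry filter set is
; 0-based, so the -8 bias (one 8-byte filter) makes [subpel_filters+mxq*8]
; address the correct entry directly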
cextern mc_warp_filter
cextern obmc_masks_avx2
cextern resize_filter
SECTION .text
%if WIN64
DECLARE_REG_TMP 4
%else
DECLARE_REG_TMP 8
%endif
INIT_ZMM avx512icl
cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy
mov mxyd, r6m ; mx
lea r7, [put_avx512icl]
tzcnt t0d, wm
movifnidn hd, hm
test mxyd, mxyd
jnz .h
mov mxyd, r7m ; my
test mxyd, mxyd
jnz .v
.put:
movzx t0d, word [r7+t0*2+table_offset(put,)]
add t0, r7
jmp t0
.put_w2:
mov r6d, [srcq+ssq*0]
mov r7d, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mov [dstq+dsq*0], r6d
mov [dstq+dsq*1], r7d
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w2
RET
.put_w4:
mov r6, [srcq+ssq*0]
mov r7, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mov [dstq+dsq*0], r6
mov [dstq+dsq*1], r7
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w4
RET
.put_w8:
movu xmm0, [srcq+ssq*0]
movu xmm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], xmm0
mova [dstq+dsq*1], xmm1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w8
RET
.put_w16:
movu ym0, [srcq+ssq*0]
movu ym1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], ym0
mova [dstq+dsq*1], ym1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w16
RET
.put_w32:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w32
RET
.put_w64:
movu m0, [srcq+ssq*0+64*0]
movu m1, [srcq+ssq*0+64*1]
movu m2, [srcq+ssq*1+64*0]
movu m3, [srcq+ssq*1+64*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0+64*0], m0
mova [dstq+dsq*0+64*1], m1
mova [dstq+dsq*1+64*0], m2
mova [dstq+dsq*1+64*1], m3
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w64
RET
.put_w128:
movu m0, [srcq+64*0]
movu m1, [srcq+64*1]
movu m2, [srcq+64*2]
movu m3, [srcq+64*3]
add srcq, ssq
mova [dstq+64*0], m0
mova [dstq+64*1], m1
mova [dstq+64*2], m2
mova [dstq+64*3], m3
add dstq, dsq
dec hd
jg .put_w128
RET
.h:
vpbroadcastw m5, mxyd
mov mxyd, r7m ; my
vpbroadcastd m4, [pw_16]
psubw m4, m5
test mxyd, mxyd
jnz .hv
; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
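    ; roughly: out = (src[x]*(16-mx) + src[x+1]*mx + rnd) >> 4, with rnd
    ; loaded from put_bilin_h_rnd per bitdepth; the .v path instead scales
    ; my by 2048 and lets pmulhrsw handle the rounding in a single step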
movzx t0d, word [r7+t0*2+table_offset(put, _bilin_h)]
mov r6d, r8m ; bitdepth_max
add t0, r7
shr r6d, 11
vpbroadcastd m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4]
jmp t0
.h_w2:
movq xmm1, [srcq+ssq*0]
movhps xmm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pmullw xmm0, xmm1, xm4
psrlq xmm1, 16
pmullw xmm1, xm5
paddw xmm0, xm6
paddw xmm0, xmm1
psrlw xmm0, 4
movd [dstq+dsq*0], xmm0
pextrd [dstq+dsq*1], xmm0, 2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w2
RET
.h_w4:
movq xmm0, [srcq+ssq*0+0]
movhps xmm0, [srcq+ssq*1+0]
movq xmm1, [srcq+ssq*0+2]
movhps xmm1, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
pmullw xmm0, xm4
pmullw xmm1, xm5
paddw xmm0, xm6
paddw xmm0, xmm1
psrlw xmm0, 4
movq [dstq+dsq*0], xmm0
movhps [dstq+dsq*1], xmm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w4
RET
.h_w8:
movu xm0, [srcq+ssq*0+0]
vinserti32x4 ym0, [srcq+ssq*1+0], 1
movu xm1, [srcq+ssq*0+2]
vinserti32x4 ym1, [srcq+ssq*1+2], 1
lea srcq, [srcq+ssq*2]
pmullw ym0, ym4
pmullw ym1, ym5
paddw ym0, ym6
paddw ym0, ym1
psrlw ym0, 4
mova [dstq+dsq*0], xm0
vextracti32x4 [dstq+dsq*1], ym0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w8
RET
.h_w16:
movu ym0, [srcq+ssq*0+0]
vinserti32x8 m0, [srcq+ssq*1+0], 1
movu ym1, [srcq+ssq*0+2]
vinserti32x8 m1, [srcq+ssq*1+2], 1
lea srcq, [srcq+ssq*2]
pmullw m0, m4
pmullw m1, m5
paddw m0, m6
paddw m0, m1
psrlw m0, 4
mova [dstq+dsq*0], ym0
vextracti32x8 [dstq+dsq*1], m0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w16
RET
.h_w32:
pmullw m0, m4, [srcq+ssq*0+0]
pmullw m2, m5, [srcq+ssq*0+2]
pmullw m1, m4, [srcq+ssq*1+0]
pmullw m3, m5, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
paddw m0, m6
paddw m1, m6
paddw m0, m2
paddw m1, m3
psrlw m0, 4
psrlw m1, 4
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w32
RET
.h_w64:
pmullw m0, m4, [srcq+64*0+0]
pmullw m2, m5, [srcq+64*0+2]
pmullw m1, m4, [srcq+64*1+0]
pmullw m3, m5, [srcq+64*1+2]
add srcq, ssq
paddw m0, m6
paddw m1, m6
paddw m0, m2
paddw m1, m3
psrlw m0, 4
psrlw m1, 4
mova [dstq+64*0], m0
mova [dstq+64*1], m1
add dstq, dsq
dec hd
jg .h_w64
RET
.h_w128:
pmullw m0, m4, [srcq+64*0+0]
pmullw m7, m5, [srcq+64*0+2]
pmullw m1, m4, [srcq+64*1+0]
pmullw m8, m5, [srcq+64*1+2]
pmullw m2, m4, [srcq+64*2+0]
pmullw m9, m5, [srcq+64*2+2]
pmullw m3, m4, [srcq+64*3+0]
pmullw m10, m5, [srcq+64*3+2]
add srcq, ssq
REPX {paddw x, m6}, m0, m1, m2, m3
paddw m0, m7
paddw m1, m8
paddw m2, m9
paddw m3, m10
REPX {psrlw x, 4}, m0, m1, m2, m3
mova [dstq+64*0], m0
mova [dstq+64*1], m1
mova [dstq+64*2], m2
mova [dstq+64*3], m3
add dstq, dsq
dec hd
jg .h_w128
RET
.v:
movzx t0d, word [r7+t0*2+table_offset(put, _bilin_v)]
shl mxyd, 11
vpbroadcastw m8, mxyd
add t0, r7
jmp t0
.v_w2:
movd xmm0, [srcq+ssq*0]
.v_w2_loop:
movd xmm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpckldq xmm2, xmm0, xmm1
movd xmm0, [srcq+ssq*0]
punpckldq xmm1, xmm0
psubw xmm1, xmm2
pmulhrsw xmm1, xm8
paddw xmm1, xmm2
movd [dstq+dsq*0], xmm1
pextrd [dstq+dsq*1], xmm1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w2_loop
RET
.v_w4:
movq xmm0, [srcq+ssq*0]
.v_w4_loop:
movq xmm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklqdq xmm2, xmm0, xmm1
movq xmm0, [srcq+ssq*0]
punpcklqdq xmm1, xmm0
psubw xmm1, xmm2
pmulhrsw xmm1, xm8
paddw xmm1, xmm2
movq [dstq+dsq*0], xmm1
movhps [dstq+dsq*1], xmm1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w4_loop
RET
.v_w8:
movu xmm0, [srcq+ssq*0]
.v_w8_loop:
vbroadcasti128 ymm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vpblendd ymm2, ymm0, ymm1, 0xf0
vbroadcasti128 ymm0, [srcq+ssq*0]
vpblendd ymm1, ymm0, 0xf0
psubw ymm1, ymm2
pmulhrsw ymm1, ym8
paddw ymm1, ymm2
mova [dstq+dsq*0], xmm1
vextracti128 [dstq+dsq*1], ymm1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
vzeroupper
RET
.v_w16:
movu ym0, [srcq+ssq*0]
.v_w16_loop:
movu ym3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
psubw ym1, ym3, ym0
pmulhrsw ym1, ym8
paddw ym1, ym0
movu ym0, [srcq+ssq*0]
psubw ym2, ym0, ym3
pmulhrsw ym2, ym8
paddw ym2, ym3
mova [dstq+dsq*0], ym1
mova [dstq+dsq*1], ym2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w16_loop
RET
.v_w32:
movu m0, [srcq+ssq*0]
.v_w32_loop:
movu m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
psubw m1, m3, m0
pmulhrsw m1, m8
paddw m1, m0
movu m0, [srcq+ssq*0]
psubw m2, m0, m3
pmulhrsw m2, m8
paddw m2, m3
mova [dstq+dsq*0], m1
mova [dstq+dsq*1], m2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w32_loop
RET
.v_w64:
movu m0, [srcq+ssq*0+64*0]
movu m1, [srcq+ssq*0+64*1]
.v_w64_loop:
movu m2, [srcq+ssq*1+64*0]
movu m3, [srcq+ssq*1+64*1]
lea srcq, [srcq+ssq*2]
psubw m4, m2, m0
pmulhrsw m4, m8
paddw m4, m0
movu m0, [srcq+ssq*0+64*0]
psubw m5, m3, m1
pmulhrsw m5, m8
paddw m5, m1
movu m1, [srcq+ssq*0+64*1]
psubw m6, m0, m2
pmulhrsw m6, m8
psubw m7, m1, m3
pmulhrsw m7, m8
mova [dstq+dsq*0+64*0], m4
mova [dstq+dsq*0+64*1], m5
paddw m6, m2
paddw m7, m3
mova [dstq+dsq*1+64*0], m6
mova [dstq+dsq*1+64*1], m7
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w64_loop
RET
.v_w128:
movu m0, [srcq+ssq*0+64*0]
movu m1, [srcq+ssq*0+64*1]
movu m2, [srcq+ssq*0+64*2]
movu m3, [srcq+ssq*0+64*3]
.v_w128_loop:
movu m4, [srcq+ssq*1+64*0]
movu m5, [srcq+ssq*1+64*1]
movu m6, [srcq+ssq*1+64*2]
movu m7, [srcq+ssq*1+64*3]
lea srcq, [srcq+ssq*2]
psubw m9, m4, m0
pmulhrsw m9, m8
paddw m9, m0
movu m0, [srcq+ssq*0+64*0]
psubw m10, m5, m1
pmulhrsw m10, m8
paddw m10, m1
movu m1, [srcq+ssq*0+64*1]
psubw m11, m6, m2
pmulhrsw m11, m8
paddw m11, m2
movu m2, [srcq+ssq*0+64*2]
psubw m12, m7, m3
pmulhrsw m12, m8
paddw m12, m3
movu m3, [srcq+ssq*0+64*3]
mova [dstq+dsq*0+64*0], m9
psubw m9, m0, m4
pmulhrsw m9, m8
mova [dstq+dsq*0+64*1], m10
psubw m10, m1, m5
pmulhrsw m10, m8
mova [dstq+dsq*0+64*2], m11
psubw m11, m2, m6
pmulhrsw m11, m8
mova [dstq+dsq*0+64*3], m12
psubw m12, m3, m7
pmulhrsw m12, m8
paddw m9, m4
paddw m10, m5
mova [dstq+dsq*1+64*0], m9
mova [dstq+dsq*1+64*1], m10
paddw m11, m6
paddw m12, m7
mova [dstq+dsq*1+64*2], m11
mova [dstq+dsq*1+64*3], m12
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w128_loop
RET
.hv:
movzx t0d, word [r7+t0*2+table_offset(put, _bilin_hv)]
shl mxyd, 11
vpbroadcastd m6, [pw_2]
vpbroadcastw m7, mxyd
vpbroadcastd m8, [pw_8192]
add t0, r7
test dword r8m, 0x800
jnz .hv_12bpc
psllw m4, 2
psllw m5, 2
vpbroadcastd m8, [pw_2048]
.hv_12bpc:
jmp t0
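    ; the .hv path keeps extra fractional precision through both passes (the
    ; horizontal result is only shifted by 2) and normalizes once at the end
    ; with pmulhrsw: pw_2048 for 10-bit, where the weights are also pre-scaled
    ; by 4 above, and pw_8192 for 12-bit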
.hv_w2:
vpbroadcastq xmm1, [srcq+ssq*0]
pmullw xmm0, xmm1, xm4
psrlq xmm1, 16
pmullw xmm1, xm5
paddw xmm0, xm6
paddw xmm0, xmm1
psrlw xmm0, 2
.hv_w2_loop:
movq xmm2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movhps xmm2, [srcq+ssq*0]
pmullw xmm1, xmm2, xm4
psrlq xmm2, 16
pmullw xmm2, xm5
paddw xmm1, xm6
paddw xmm1, xmm2
psrlw xmm1, 2 ; 1 _ 2 _
shufpd xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _
mova xmm0, xmm1
psubw xmm1, xmm2
paddw xmm1, xmm1
pmulhw xmm1, xm7
paddw xmm1, xmm2
pmulhrsw xmm1, xm8
movd [dstq+dsq*0], xmm1
pextrd [dstq+dsq*1], xmm1, 2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w2_loop
RET
.hv_w4:
pmullw xmm0, xm4, [srcq+ssq*0-8]
pmullw xmm1, xm5, [srcq+ssq*0-6]
paddw xmm0, xm6
paddw xmm0, xmm1
psrlw xmm0, 2
.hv_w4_loop:
movq xmm1, [srcq+ssq*1+0]
movq xmm2, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
movhps xmm1, [srcq+ssq*0+0]
movhps xmm2, [srcq+ssq*0+2]
pmullw xmm1, xm4
pmullw xmm2, xm5
paddw xmm1, xm6
paddw xmm1, xmm2
psrlw xmm1, 2 ; 1 2
shufpd xmm2, xmm0, xmm1, 0x01 ; 0 1
mova xmm0, xmm1
psubw xmm1, xmm2
paddw xmm1, xmm1
pmulhw xmm1, xm7
paddw xmm1, xmm2
pmulhrsw xmm1, xm8
movq [dstq+dsq*0], xmm1
movhps [dstq+dsq*1], xmm1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w4_loop
RET
.hv_w8:
pmullw xmm0, xm4, [srcq+ssq*0+0]
pmullw xmm1, xm5, [srcq+ssq*0+2]
paddw xmm0, xm6
paddw xmm0, xmm1
psrlw xmm0, 2
vinserti32x4 ym0, xmm0, 1
.hv_w8_loop:
movu xm1, [srcq+ssq*1+0]
movu xm2, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
vinserti32x4 ym1, [srcq+ssq*0+0], 1
vinserti32x4 ym2, [srcq+ssq*0+2], 1
pmullw ym1, ym4
pmullw ym2, ym5
paddw ym1, ym6
paddw ym1, ym2
psrlw ym1, 2 ; 1 2
vshufi32x4 ym2, ym0, ym1, 0x01 ; 0 1
mova ym0, ym1
psubw ym1, ym2
paddw ym1, ym1
pmulhw ym1, ym7
paddw ym1, ym2
pmulhrsw ym1, ym8
mova [dstq+dsq*0], xm1
vextracti32x4 [dstq+dsq*1], ym1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w8_loop
RET
.hv_w16:
pmullw ym0, ym4, [srcq+ssq*0+0]
pmullw ym1, ym5, [srcq+ssq*0+2]
paddw ym0, ym6
paddw ym0, ym1
psrlw ym0, 2
vinserti32x8 m0, ym0, 1
.hv_w16_loop:
movu ym1, [srcq+ssq*1+0]
movu ym2, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
vinserti32x8 m1, [srcq+ssq*0+0], 1
vinserti32x8 m2, [srcq+ssq*0+2], 1
pmullw m1, m4
pmullw m2, m5
paddw m1, m6
paddw m1, m2
psrlw m1, 2 ; 1 2
vshufi32x4 m2, m0, m1, q1032 ; 0 1
mova m0, m1
psubw m1, m2
paddw m1, m1
pmulhw m1, m7
paddw m1, m2
pmulhrsw m1, m8
mova [dstq+dsq*0], ym1
vextracti32x8 [dstq+dsq*1], m1, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w16_loop
RET
.hv_w32:
.hv_w64:
.hv_w128:
movifnidn wd, wm
lea r6d, [hq+wq*8-256]
mov r4, srcq
mov r7, dstq
.hv_w32_loop0:
pmullw m0, m4, [srcq+ssq*0+0]
pmullw m1, m5, [srcq+ssq*0+2]
paddw m0, m6
paddw m0, m1
psrlw m0, 2
.hv_w32_loop:
pmullw m3, m4, [srcq+ssq*1+0]
pmullw m1, m5, [srcq+ssq*1+2]
lea srcq, [srcq+ssq*2]
paddw m3, m6
paddw m3, m1
psrlw m3, 2
psubw m1, m3, m0
paddw m1, m1
pmulhw m1, m7
paddw m1, m0
pmullw m0, m4, [srcq+ssq*0+0]
pmullw m2, m5, [srcq+ssq*0+2]
paddw m0, m6
paddw m0, m2
psrlw m0, 2
psubw m2, m0, m3
paddw m2, m2
pmulhw m2, m7
paddw m2, m3
pmulhrsw m1, m8
pmulhrsw m2, m8
mova [dstq+dsq*0], m1
mova [dstq+dsq*1], m2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w32_loop
add r4, 64
add r7, 64
movzx hd, r6b
mov srcq, r4
mov dstq, r7
sub r6d, 1<<8
jg .hv_w32_loop0
RET
cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3
movifnidn mxyd, r5m ; mx
lea r6, [prep_avx512icl]
tzcnt wd, wm
movifnidn hd, hm
test mxyd, mxyd
jnz .h
mov mxyd, r6m ; my
test mxyd, mxyd
jnz .v
.prep:
movzx wd, word [r6+wq*2+table_offset(prep,)]
mov r5d, r7m ; bitdepth_max
vpbroadcastd m5, [r6-prep_avx512icl+pw_8192]
add wq, r6
shr r5d, 11
vpbroadcastd m4, [r6-prep_avx512icl+prep_mul+r5*4]
lea stride3q, [strideq*3]
jmp wq
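    ; prep stores intermediates as signed 16-bit: the input is scaled by 16
    ; (10-bit) or 4 (12-bit) via prep_mul and offset by -8192 (pw_8192) so
    ; the values are roughly centered around zero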
.prep_w4:
mov r3d, 0x0c
kmovb k1, r3d
.prep_w4_loop:
movq xm0, [srcq+strideq*0]
movhps xm0, [srcq+strideq*1]
vpbroadcastq ym1, [srcq+strideq*2]
vpunpcklqdq ym0{k1}, ym1, [srcq+stride3q] {1to4}
lea srcq, [srcq+strideq*4]
pmullw ym0, ym4
psubw ym0, ym5
mova [tmpq], ym0
add tmpq, 32
sub hd, 4
jg .prep_w4_loop
RET
.prep_w8:
movu xm0, [srcq+strideq*0]
vinserti32x4 ym0, [srcq+strideq*1], 1
vinserti32x4 m0, [srcq+strideq*2], 2
vinserti32x4 m0, [srcq+stride3q ], 3
lea srcq, [srcq+strideq*4]
pmullw m0, m4
psubw m0, m5
mova [tmpq], m0
add tmpq, 64
sub hd, 4
jg .prep_w8
RET
.prep_w16:
movu ym0, [srcq+strideq*0]
vinserti32x8 m0, [srcq+strideq*1], 1
movu ym1, [srcq+strideq*2]
vinserti32x8 m1, [srcq+stride3q ], 1
lea srcq, [srcq+strideq*4]
pmullw m0, m4
pmullw m1, m4
psubw m0, m5
psubw m1, m5
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
add tmpq, 64*2
sub hd, 4
jg .prep_w16
RET
.prep_w32:
pmullw m0, m4, [srcq+strideq*0]
pmullw m1, m4, [srcq+strideq*1]
pmullw m2, m4, [srcq+strideq*2]
pmullw m3, m4, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
mova [tmpq+64*2], m2
mova [tmpq+64*3], m3
add tmpq, 64*4
sub hd, 4
jg .prep_w32
RET
.prep_w64:
pmullw m0, m4, [srcq+strideq*0+64*0]
pmullw m1, m4, [srcq+strideq*0+64*1]
pmullw m2, m4, [srcq+strideq*1+64*0]
pmullw m3, m4, [srcq+strideq*1+64*1]
lea srcq, [srcq+strideq*2]
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
mova [tmpq+64*2], m2
mova [tmpq+64*3], m3
add tmpq, 64*4
sub hd, 2
jg .prep_w64
RET
.prep_w128:
pmullw m0, m4, [srcq+64*0]
pmullw m1, m4, [srcq+64*1]
pmullw m2, m4, [srcq+64*2]
pmullw m3, m4, [srcq+64*3]
add srcq, strideq
REPX {psubw x, m5}, m0, m1, m2, m3
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
mova [tmpq+64*2], m2
mova [tmpq+64*3], m3
add tmpq, 64*4
dec hd
jg .prep_w128
RET
.h:
vpbroadcastw m5, mxyd
mov mxyd, r6m ; my
vpbroadcastd m4, [pw_16]
vpbroadcastd m6, [pw_32766]
psubw m4, m5
test dword r7m, 0x800
jnz .h_12bpc
psllw m4, 2
psllw m5, 2
.h_12bpc:
test mxyd, mxyd
jnz .hv
movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
add wq, r6
lea stride3q, [strideq*3]
jmp wq
.h_w4:
movu xm1, [srcq+strideq*0]
vinserti32x4 ym1, [srcq+strideq*2], 1
movu xm2, [srcq+strideq*1]
vinserti32x4 ym2, [srcq+stride3q ], 1
lea srcq, [srcq+strideq*4]
punpcklqdq ym0, ym1, ym2
psrldq ym1, 2
psrldq ym2, 2
pmullw ym0, ym4
punpcklqdq ym1, ym2
pmullw ym1, ym5
psubw ym0, ym6
paddw ym0, ym1
psraw ym0, 2
mova [tmpq], ym0
add tmpq, 32
sub hd, 4
jg .h_w4
RET
.h_w8:
movu xm0, [srcq+strideq*0+0]
movu xm1, [srcq+strideq*0+2]
vinserti32x4 ym0, [srcq+strideq*1+0], 1
vinserti32x4 ym1, [srcq+strideq*1+2], 1
vinserti32x4 m0, [srcq+strideq*2+0], 2
vinserti32x4 m1, [srcq+strideq*2+2], 2
vinserti32x4 m0, [srcq+stride3q +0], 3
vinserti32x4 m1, [srcq+stride3q +2], 3
lea srcq, [srcq+strideq*4]
pmullw m0, m4
pmullw m1, m5
psubw m0, m6
paddw m0, m1
psraw m0, 2
mova [tmpq], m0
add tmpq, 64
sub hd, 4
jg .h_w8
RET
.h_w16:
movu ym0, [srcq+strideq*0+0]
vinserti32x8 m0, [srcq+strideq*1+0], 1
movu ym1, [srcq+strideq*0+2]
vinserti32x8 m1, [srcq+strideq*1+2], 1
lea srcq, [srcq+strideq*2]
pmullw m0, m4
pmullw m1, m5
psubw m0, m6
paddw m0, m1
psraw m0, 2
mova [tmpq], m0
add tmpq, 64
sub hd, 2
jg .h_w16
RET
.h_w32:
pmullw m0, m4, [srcq+strideq*0+0]
pmullw m2, m5, [srcq+strideq*0+2]
pmullw m1, m4, [srcq+strideq*1+0]
pmullw m3, m5, [srcq+strideq*1+2]
lea srcq, [srcq+strideq*2]
psubw m0, m6
psubw m1, m6
paddw m0, m2
paddw m1, m3
psraw m0, 2
psraw m1, 2
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
add tmpq, 64*2
sub hd, 2
jg .h_w32
RET
.h_w64:
pmullw m0, m4, [srcq+ 0]
pmullw m2, m5, [srcq+ 2]
pmullw m1, m4, [srcq+64]
pmullw m3, m5, [srcq+66]
add srcq, strideq
psubw m0, m6
psubw m1, m6
paddw m0, m2
paddw m1, m3
psraw m0, 2
psraw m1, 2
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
add tmpq, 64*2
dec hd
jg .h_w64
RET
.h_w128:
pmullw m0, m4, [srcq+ 0]
pmullw m7, m5, [srcq+ 2]
pmullw m1, m4, [srcq+ 64]
pmullw m8, m5, [srcq+ 66]
pmullw m2, m4, [srcq+128]
pmullw m9, m5, [srcq+130]
pmullw m3, m4, [srcq+192]
pmullw m10, m5, [srcq+194]
add srcq, strideq
REPX {psubw x, m6}, m0, m1, m2, m3
paddw m0, m7
paddw m1, m8
paddw m2, m9
paddw m3, m10
REPX {psraw x, 2}, m0, m1, m2, m3
mova [tmpq+64*0], m0
mova [tmpq+64*1], m1
mova [tmpq+64*2], m2
mova [tmpq+64*3], m3
add tmpq, 64*4
dec hd
jg .h_w128
RET
.v:
movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
vpbroadcastw m9, mxyd
vpbroadcastd m8, [pw_16]
vpbroadcastd m10, [pw_32766]
add wq, r6
lea stride3q, [strideq*3]
psubw m8, m9
test dword r7m, 0x800
jnz .v_12bpc
psllw m8, 2
psllw m9, 2
.v_12bpc:
jmp wq
.v_w4:
movq xmm0, [srcq+strideq*0]
.v_w4_loop:
vpbroadcastq xmm2, [srcq+strideq*1]
vpbroadcastq ymm1, [srcq+strideq*2]
vpbroadcastq ymm3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
vpblendd ymm2, ymm1, 0x30
vpblendd ymm2, ymm3, 0xc0
vpblendd ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3
movq xmm0, [srcq+strideq*0]
valignq ymm2, ymm0, ymm2, 1 ; 1 2 3 4
pmullw ymm1, ym8
pmullw ymm2, ym9
psubw ymm1, ym10
paddw ymm1, ymm2
psraw ymm1, 2
mova [tmpq], ymm1
add tmpq, 32
sub hd, 4
jg .v_w4_loop
vzeroupper
RET
.v_w8:
movu xm0, [srcq+strideq*0]
.v_w8_loop:
vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
vinserti32x4 m1, [srcq+strideq*2], 2
vinserti32x4 m1, [srcq+stride3q ], 3 ; 0 1 2 3
lea srcq, [srcq+strideq*4]
movu xm0, [srcq+strideq*0]
valignq m2, m0, m1, 2 ; 1 2 3 4
pmullw m1, m8
pmullw m2, m9
psubw m1, m10
paddw m1, m2
psraw m1, 2
mova [tmpq], m1
add tmpq, 64
sub hd, 4
jg .v_w8_loop
RET
.v_w16:
movu ym0, [srcq+strideq*0]
.v_w16_loop:
vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1
movu ym3, [srcq+strideq*2]
vinserti32x8 m2, m3, [srcq+stride3q ], 1 ; 2 3
lea srcq, [srcq+strideq*4]
movu ym0, [srcq+strideq*0]
vshufi32x4 m3, m1, m3, q1032 ; 1 2
vshufi32x4 m4, m2, m0, q1032 ; 3 4
pmullw m1, m8
pmullw m2, m8
pmullw m3, m9
pmullw m4, m9
psubw m1, m10
psubw m2, m10
paddw m1, m3
paddw m2, m4
psraw m1, 2
psraw m2, 2
mova [tmpq+64*0], m1
mova [tmpq+64*1], m2
add tmpq, 64*2
sub hd, 4
jg .v_w16_loop
RET
.v_w32:
movu m0, [srcq+strideq*0]
.v_w32_loop:
movu m3, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
pmullw m1, m8, m0
movu m0, [srcq+strideq*0]
pmullw m2, m8, m3
pmullw m3, m9
pmullw m4, m9, m0
psubw m1, m10
psubw m2, m10
paddw m1, m3
paddw m2, m4
psraw m1, 2
psraw m2, 2
mova [tmpq+64*0], m1
mova [tmpq+64*1], m2
add tmpq, 64*2
sub hd, 2
jg .v_w32_loop
RET
.v_w64:
movu m0, [srcq+64*0]
movu m1, [srcq+64*1]
.v_w64_loop:
add srcq, strideq
pmullw m2, m8, m0
movu m0, [srcq+64*0]
pmullw m3, m8, m1
movu m1, [srcq+64*1]
pmullw m4, m9, m0
pmullw m5, m9, m1
psubw m2, m10
psubw m3, m10
paddw m2, m4
paddw m3, m5
psraw m2, 2
psraw m3, 2
mova [tmpq+64*0], m2
mova [tmpq+64*1], m3
add tmpq, 64*2
dec hd
jg .v_w64_loop
RET
.v_w128:
movu m0, [srcq+64*0]
movu m1, [srcq+64*1]
movu m2, [srcq+64*2]
movu m3, [srcq+64*3]
.v_w128_loop:
add srcq, strideq
pmullw m4, m8, m0
movu m0, [srcq+64*0]
pmullw m5, m8, m1
movu m1, [srcq+64*1]
pmullw m6, m8, m2
movu m2, [srcq+64*2]
pmullw m7, m8, m3
movu m3, [srcq+64*3]
pmullw m11, m9, m0
pmullw m12, m9, m1
pmullw m13, m9, m2
pmullw m14, m9, m3
REPX {psubw x, m10}, m4, m5, m6, m7
paddw m4, m11
paddw m5, m12
paddw m6, m13
paddw m7, m14
REPX {psraw x, 2}, m4, m5, m6, m7
mova [tmpq+64*0], m4
mova [tmpq+64*1], m5
mova [tmpq+64*2], m6
mova [tmpq+64*3], m7
add tmpq, 64*4
dec hd
jg .v_w128_loop
RET
.hv:
movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
shl mxyd, 11
vpbroadcastw m7, mxyd
add wq, r6
lea stride3q, [strideq*3]
jmp wq
.hv_w4:
movq xmm0, [srcq+strideq*0+0]
movq xmm1, [srcq+strideq*0+2]
pmullw xmm0, xm4
pmullw xmm1, xm5
psubw xmm0, xm6
paddw xmm0, xmm1
psraw xmm0, 2
vpbroadcastq ym0, xmm0
.hv_w4_loop:
movu xm1, [srcq+strideq*1]
vinserti128 ym1, [srcq+stride3q ], 1
movu xm2, [srcq+strideq*2]
lea srcq, [srcq+strideq*4]
vinserti128 ym2, [srcq+strideq*0], 1
punpcklqdq ym3, ym1, ym2
psrldq ym1, 2
psrldq ym2, 2
pmullw ym3, ym4
punpcklqdq ym1, ym2
pmullw ym1, ym5
psubw ym3, ym6
paddw ym1, ym3
psraw ym1, 2 ; 1 2 3 4
valignq ym2, ym1, ym0, 3 ; 0 1 2 3
mova ym0, ym1
psubw ym1, ym2
pmulhrsw ym1, ym7
paddw ym1, ym2
mova [tmpq], ym1
add tmpq, 32
sub hd, 4
jg .hv_w4_loop
RET
.hv_w8:
pmullw xm0, xm4, [srcq+strideq*0+0]
pmullw xm1, xm5, [srcq+strideq*0+2]
psubw xm0, xm6
paddw xm0, xm1
psraw xm0, 2
vinserti32x4 m0, xm0, 3
.hv_w8_loop:
movu xm1, [srcq+strideq*1+0]
movu xm2, [srcq+strideq*1+2]
vinserti32x4 ym1, [srcq+strideq*2+0], 1
vinserti32x4 ym2, [srcq+strideq*2+2], 1
vinserti32x4 m1, [srcq+stride3q +0], 2
vinserti32x4 m2, [srcq+stride3q +2], 2
lea srcq, [srcq+strideq*4]
vinserti32x4 m1, [srcq+strideq*0+0], 3
vinserti32x4 m2, [srcq+strideq*0+2], 3
pmullw m1, m4
pmullw m2, m5
psubw m1, m6
paddw m1, m2
psraw m1, 2 ; 1 2 3 4
valignq m2, m1, m0, 6 ; 0 1 2 3
mova m0, m1
psubw m1, m2
pmulhrsw m1, m7
paddw m1, m2
mova [tmpq], m1
add tmpq, 64
sub hd, 4
jg .hv_w8_loop
RET
.hv_w16:
pmullw ym0, ym4, [srcq+strideq*0+0]
pmullw ym1, ym5, [srcq+strideq*0+2]
psubw ym0, ym6
paddw ym0, ym1
psraw ym0, 2
vinserti32x8 m0, ym0, 1
.hv_w16_loop:
movu ym1, [srcq+strideq*1+0]
movu ym2, [srcq+strideq*1+2]
lea srcq, [srcq+strideq*2]
vinserti32x8 m1, [srcq+strideq*0+0], 1
vinserti32x8 m2, [srcq+strideq*0+2], 1
pmullw m1, m4
pmullw m2, m5
psubw m1, m6
paddw m1, m2
psraw m1, 2 ; 1 2
vshufi32x4 m2, m0, m1, q1032 ; 0 1
mova m0, m1
psubw m1, m2
pmulhrsw m1, m7
paddw m1, m2
mova [tmpq], m1
add tmpq, 64
sub hd, 2
jg .hv_w16_loop
RET
.hv_w32:
pmullw m0, m4, [srcq+strideq*0+0]
pmullw m1, m5, [srcq+strideq*0+2]
psubw m0, m6
paddw m0, m1
psraw m0, 2
.hv_w32_loop:
pmullw m3, m4, [srcq+strideq*1+0]
pmullw m1, m5, [srcq+strideq*1+2]
lea srcq, [srcq+strideq*2]
psubw m3, m6
paddw m3, m1
psraw m3, 2
psubw m1, m3, m0
pmulhrsw m1, m7
paddw m1, m0
pmullw m0, m4, [srcq+strideq*0+0]
pmullw m2, m5, [srcq+strideq*0+2]
psubw m0, m6
paddw m0, m2
psraw m0, 2
psubw m2, m0, m3
pmulhrsw m2, m7
paddw m2, m3
mova [tmpq+64*0], m1
mova [tmpq+64*1], m2
add tmpq, 64*2
sub hd, 2
jg .hv_w32_loop
RET
.hv_w64:
pmullw m0, m4, [srcq+ 0]
pmullw m2, m5, [srcq+ 2]
pmullw m1, m4, [srcq+64]
pmullw m3, m5, [srcq+66]
psubw m0, m6
psubw m1, m6
paddw m0, m2
paddw m1, m3
psraw m0, 2
psraw m1, 2
.hv_w64_loop:
add srcq, strideq
pmullw m2, m4, [srcq+ 0]
pmullw m8, m5, [srcq+ 2]
pmullw m3, m4, [srcq+64]
pmullw m9, m5, [srcq+66]
psubw m2, m6
psubw m3, m6
paddw m2, m8
paddw m3, m9
psraw m2, 2
psraw m3, 2
psubw m8, m2, m0
psubw m9, m3, m1
pmulhrsw m8, m7
pmulhrsw m9, m7
paddw m8, m0
mova m0, m2
paddw m9, m1
mova m1, m3
mova [tmpq+64*0], m8
mova [tmpq+64*1], m9
add tmpq, 64*2
dec hd
jg .hv_w64_loop
RET
.hv_w128:
pmullw m0, m4, [srcq+ 0]
pmullw m8, m5, [srcq+ 2]
pmullw m1, m4, [srcq+ 64]
pmullw m9, m5, [srcq+ 66]
pmullw m2, m4, [srcq+128]
pmullw m10, m5, [srcq+130]
pmullw m3, m4, [srcq+192]
pmullw m11, m5, [srcq+194]
REPX {psubw x, m6}, m0, m1, m2, m3
paddw m0, m8
paddw m1, m9
paddw m2, m10
paddw m3, m11
REPX {psraw x, 2}, m0, m1, m2, m3
.hv_w128_loop:
add srcq, strideq
pmullw m8, m4, [srcq+ 0]
pmullw m12, m5, [srcq+ 2]
pmullw m9, m4, [srcq+ 64]
pmullw m13, m5, [srcq+ 66]
pmullw m10, m4, [srcq+128]
pmullw m14, m5, [srcq+130]
pmullw m11, m4, [srcq+192]
pmullw m15, m5, [srcq+194]
REPX {psubw x, m6}, m8, m9, m10, m11
paddw m8, m12
paddw m9, m13
paddw m10, m14
paddw m11, m15
REPX {psraw x, 2}, m8, m9, m10, m11
psubw m12, m8, m0
psubw m13, m9, m1
psubw m14, m10, m2
psubw m15, m11, m3
REPX {pmulhrsw x, m7}, m12, m13, m14, m15
paddw m12, m0
mova m0, m8
paddw m13, m1
mova m1, m9
mova [tmpq+64*0], m12
mova [tmpq+64*1], m13
paddw m14, m2
mova m2, m10
paddw m15, m3
mova m3, m11
mova [tmpq+64*2], m14
mova [tmpq+64*3], m15
add tmpq, 64*4
dec hd
jg .hv_w128_loop
RET
; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH (1*15 << 16) | 4*15
%assign FILTER_SHARP (2*15 << 16) | 3*15
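; each FILTER_* constant packs two byte offsets into subpel_filters: the full
; 8-tap filter set in the high 16 bits and the 4-tap set used for small block
; sizes in the low 16 bits; the 0x010101 multiplies in put_*tap add the mx/my
; fraction to both before one of them is selected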
%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
cglobal %1_%2_16bpc
mov t0d, FILTER_%3
%ifidn %3, %4
mov t1d, t0d
%else
mov t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro
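; FN only materializes a small named entry point per filter combination: it
; sets t0d/t1d to the packed filter offsets, then jumps to (or, for the last
; declaration, falls through into) the shared put_6tap/put_8tap body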
%if WIN64
DECLARE_REG_TMP 4, 5
%define buf rsp+stack_offset+8 ; shadow space
%else
DECLARE_REG_TMP 7, 8
%define buf rsp-40 ; red zone
%endif
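; buf is a small scratch area (Win64 shadow space / SysV red zone) used to
; spill the filter coefficients so that individual dword pairs can be
; re-broadcast with vpbroadcastd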
%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_16bpc
PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_16bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_16bpc
PUT_8TAP_FN regular, REGULAR, REGULAR
cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
%define base r8-put_avx512icl
imul mxd, mxm, 0x010101
add mxd, t0d ; 6tap_h, mx, 4tap_h
imul myd, mym, 0x010101
add myd, t1d ; 6tap_v, my, 4tap_v
lea r8, [put_avx512icl]
movifnidn wd, wm
movifnidn hd, hm
test mxd, 0xf00
jnz .h
test myd, 0xf00
jnz .v
.put:
tzcnt wd, wd
movzx wd, word [r8+wq*2+table_offset(put,)]
add wq, r8
%if WIN64
pop r8
%endif
jmp wq
.h_w8:
mova m4, [spel_h_shufA]
movu m5, [spel_h_shufB]
movu m6, [spel_h_shufC]
.h_w8_loop:
movu ym2, [srcq+ssq*0]
vinserti32x8 m2, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
mova m0, m8
vpermb m1, m4, m2
vpdpwssd m0, m10, m1
vpermb m1, m5, m2
vpdpwssd m0, m11, m1
vpermb m1, m6, m2
vpdpwssd m0, m12, m1
psrad m0, 6
vextracti32x8 ym1, m0, 1
packusdw ym0, ym1
pminsw ym0, ym15
mova [dstq+dsq*0], xm0
vextracti32x4 [dstq+dsq*1], ym0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w8_loop
RET
.h:
vpbroadcastw m15, r8m
test myd, 0xf00
jnz .hv
mov r7d, r8m
shr r7d, 11
vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4]
cmp wd, 4
jle mangle(private_prefix %+ _put_8tap_16bpc_avx512icl).h_w4
shr mxd, 16
sub srcq, 4
pmovsxbw xmm0, [base+subpel_filters+1+mxq*8]
mova [buf], xmm0
vpbroadcastd m10, xmm0
vpbroadcastd m12, [buf+8]
vpbroadcastd m11, [buf+4]
sub wd, 16
jl .h_w8
vbroadcasti32x4 m6, [spel_h_shufA]
vbroadcasti32x4 m7, [spel_h_shufB]
jg .h_w32
.h_w16_loop:
movu ym2, [srcq+ssq*0+ 0]
vinserti32x8 m2, [srcq+ssq*1+ 0], 1
movu ym3, [srcq+ssq*0+12]
vinserti32x8 m3, [srcq+ssq*1+12], 1
lea srcq, [srcq+ssq*2]
mova m0, m8
mova m1, m8
pshufb m4, m2, m6
vpdpwssd m0, m10, m4 ; a0 b0
pshufb m4, m3, m7
vpdpwssd m1, m12, m4 ; a2' b2'
pshufb m2, m7
pshufb m3, m6
vpdpwssd m0, m11, m2 ; a1 b1
vpdpwssd m1, m11, m3 ; a1' b1'
shufpd m2, m3, 0x55
vpdpwssd m0, m12, m2 ; a2 b2
vpdpwssd m1, m10, m2 ; a0' b0'
psrad m0, 6
psrad m1, 6
packusdw m0, m1
pminsw m0, m15
mova [dstq+dsq*0], ym0
vextracti32x8 [dstq+dsq*1], m0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w16_loop
RET
.h_w32:
lea srcq, [srcq+wq*2]
lea dstq, [dstq+wq*2]
neg wq
.h_w32_loop0:
mov r6, wq
.h_w32_loop:
movu m2, [srcq+r6*2+ 0]
movu m3, [srcq+r6*2+12]
mova m0, m8
mova m1, m8
pshufb m4, m2, m6
vpdpwssd m0, m10, m4 ; a0
pshufb m4, m3, m7
vpdpwssd m1, m12, m4 ; b2
pshufb m2, m7
pshufb m3, m6
vpdpwssd m0, m11, m2 ; a1
vpdpwssd m1, m11, m3 ; b1
shufpd m2, m3, 0x55
vpdpwssd m0, m12, m2 ; a2
vpdpwssd m1, m10, m2 ; b0
psrad m0, 6
psrad m1, 6
packusdw m0, m1
pminsw m0, m15
mova [dstq+r6*2], m0
add r6, 32
jl .h_w32_loop
add srcq, ssq
add dstq, dsq
dec hd
jg .h_w32_loop0
RET
.v:
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
vpbroadcastd m11, [pd_32]
pmovsxbw xmm0, [base+subpel_filters+1+myq*8]
tzcnt r7d, wd
vpbroadcastw m15, r8m
mov r6, ssq
movzx r7d, word [r8+r7*2+table_offset(put, _6tap_v)]
neg r6
mova [rsp+stack_offset+8], xmm0
vpbroadcastd m12, xmm0
add r7, r8
vpbroadcastd m13, [rsp+stack_offset+12]
vpbroadcastd m14, [rsp+stack_offset+16]
jmp r7
.v_w2:
movd xmm2, [srcq+r6 *2]
pinsrd xmm2, [srcq+r6 *1], 1
pinsrd xmm2, [srcq+ssq*0], 2
pinsrd xmm2, [srcq+ssq*1], 3 ; 0 1 2 3
lea srcq, [srcq+ssq*2]
movd xmm0, [srcq+ssq*0]
palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4
punpcklwd xmm1, xmm2, xmm3 ; 01 12
punpckhwd xmm2, xmm3 ; 23 34
.v_w2_loop:
movd xmm3, [srcq+ssq*1]
mova xmm4, xm11
vpdpwssd xmm4, xmm1, xm12 ; a0 b0
lea srcq, [srcq+ssq*2]
mova xmm1, xmm2
vpdpwssd xmm4, xmm2, xm13 ; a1 b1
punpckldq xmm2, xmm0, xmm3 ; 4 5
movd xmm0, [srcq+ssq*0]
punpckldq xmm3, xmm0 ; 5 6
punpcklwd xmm2, xmm3 ; 45 56
vpdpwssd xmm4, xmm2, xm14 ; a2 b2
psrad xmm4, 6
packusdw xmm4, xmm4
pminsw xmm4, xm15
movd [dstq+dsq*0], xmm4
pextrd [dstq+dsq*1], xmm4, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w2_loop
RET
.v_w4:
movq xmm1, [srcq+r6 *2]
vpbroadcastq ymm3, [srcq+r6 *1]
vpbroadcastq ymm2, [srcq+ssq*0]
vpbroadcastq ymm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vpbroadcastq ymm0, [srcq+ssq*0]
vpblendd ymm1, ymm3, 0x30
vpblendd ymm3, ymm2, 0x30
punpcklwd ymm1, ymm3 ; 01 12
vpblendd ymm2, ymm4, 0x30
vpblendd ymm4, ymm0, 0x30
punpcklwd ymm2, ymm4 ; 23 34
.v_w4_loop:
vpbroadcastq ymm3, [srcq+ssq*1]
mova ymm4, ym11
vpdpwssd ymm4, ymm1, ym12 ; a0 b0
lea srcq, [srcq+ssq*2]
mova ymm1, ymm2
vpdpwssd ymm4, ymm2, ym13 ; a1 b1
vpblendd ymm2, ymm0, ymm3, 0x30
vpbroadcastq ymm0, [srcq+ssq*0]
vpblendd ymm3, ymm0, 0x30
punpcklwd ymm2, ymm3 ; 45 56
vpdpwssd ymm4, ymm2, ym14 ; a2 b2
psrad ymm4, 6
vextracti128 xmm3, ymm4, 1
packusdw xmm4, xmm3
pminsw xmm4, xm15
movq [dstq+dsq*0], xmm4
movhps [dstq+dsq*1], xmm4
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w4_loop
vzeroupper
RET
.v_w8:
vbroadcasti32x4 m0, [srcq+ssq*0]
vinserti32x4 m1, m0, [srcq+r6 *2], 0
vinserti32x4 m1, [srcq+r6 *1], 1 ; 0 1 2
vinserti32x4 ym0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
mova m5, [spel_v_shuf8]
vinserti32x4 m0, [srcq+ssq*0], 2 ; 2 3 4
vpermb m1, m5, m1 ; 01 12
vpermb m2, m5, m0 ; 23 34
.v_w8_loop:
vinserti32x4 m0, [srcq+ssq*1], 3
lea srcq, [srcq+ssq*2]
movu xm3, [srcq+ssq*0]
mova m4, m11
vpdpwssd m4, m12, m1 ; a0 b0
vshufi32x4 m0, m3, q1032 ; 4 5 6
mova m1, m2
vpdpwssd m4, m13, m2 ; a1 b1
vpermb m2, m5, m0 ; 45 56
vpdpwssd m4, m14, m2 ; a2 b2
psrad m4, 6
vextracti32x8 ym3, m4, 1
packusdw ym4, ym3
pminsw ym4, ym15
mova [dstq+dsq*0], xm4
vextracti32x4 [dstq+dsq*1], ym4, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
RET
.v_w16:
vbroadcasti32x8 m0, [srcq+r6 *1]
vinserti32x8 m1, m0, [srcq+ssq*0], 1
vinserti32x8 m0, [srcq+r6*2], 0
mova m6, [spel_v_shuf16]
movu ym3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti32x8 m3, [srcq+ssq*0], 1
vpermb m1, m6, m1 ; 12
vpermb m0, m6, m0 ; 01
vpermb m3, m6, m3 ; 34
mova m7, [deint_q_shuf]
vpshrdd m2, m1, m3, 16 ; 23
.v_w16_loop:
mova m5, m11
vpdpwssd m5, m12, m1 ; b0
mova m4, m11
vpdpwssd m4, m12, m0 ; a0
mova m1, m3
vpdpwssd m5, m13, m3 ; b1
mova m0, m2
vpdpwssd m4, m13, m2 ; a1
movu ym3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti32x8 m3, [srcq+ssq*0], 1
vpermb m3, m6, m3 ; 56
vpshrdd m2, m1, m3, 16 ; 45
vpdpwssd m5, m14, m3 ; b2
vpdpwssd m4, m14, m2 ; a2
psrad m5, 6
psrad m4, 6
packusdw m4, m5
pminsw m4, m15
vpermq m4, m7, m4
mova [dstq+dsq*0], ym4
vextracti32x8 [dstq+dsq*1], m4, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w16_loop
RET
.v_w32:
.v_w64:
.v_w128:
lea wd, [hq+wq*8-256]
.v_w32_loop0:
movu m16, [srcq+r6 *2]
movu m17, [srcq+r6 *1]
lea r7, [srcq+ssq*2]
movu m18, [srcq+ssq*0]
movu m19, [srcq+ssq*1]
mov r8, dstq
movu m20, [r7 +ssq*0]
punpcklwd m0, m16, m17 ; 01
punpckhwd m16, m17
punpcklwd m1, m17, m18 ; 12
punpckhwd m17, m18
punpcklwd m2, m18, m19 ; 23
punpckhwd m18, m19
punpcklwd m3, m19, m20 ; 34
punpckhwd m19, m20
.v_w32_loop:
mova m4, m11
vpdpwssd m4, m12, m0 ; a0
mova m6, m11
vpdpwssd m6, m12, m16
mova m5, m11
vpdpwssd m5, m12, m1 ; b0
mova m7, m11
vpdpwssd m7, m12, m17
mova m0, m2
vpdpwssd m4, m13, m2 ; a1
mova m16, m18
vpdpwssd m6, m13, m18
mova m1, m3
vpdpwssd m5, m13, m3 ; b1
mova m17, m19
vpdpwssd m7, m13, m19
movu m19, [r7+ssq*1]
lea r7, [r7+ssq*2]
punpcklwd m2, m20, m19 ; 45
punpckhwd m18, m20, m19
movu m20, [r7+ssq*0]
vpdpwssd m4, m14, m2 ; a2
vpdpwssd m6, m14, m18
punpcklwd m3, m19, m20 ; 56
punpckhwd m19, m20
vpdpwssd m5, m14, m3 ; b2
vpdpwssd m7, m14, m19
REPX {psrad x, 6}, m4, m6, m5, m7
packusdw m4, m6
packusdw m5, m7
pminsw m4, m15
pminsw m5, m15
mova [r8+dsq*0], m4
mova [r8+dsq*1], m5
lea r8, [r8+dsq*2]
sub hd, 2
jg .v_w32_loop
add srcq, 64
add dstq, 64
movzx hd, wb
sub wd, 1<<8
jg .v_w32_loop0
vzeroupper
RET
.hv:
cmp wd, 4
jg .hv_w8
movzx mxd, mxb
pmovsxbw xmm0, [base+subpel_filters+mxq*8]
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
pmovsxbw xmm1, [base+subpel_filters+1+myq*8]
mov r6, ssq
sub srcq, 2
neg r6
test dword r8m, 0x800
jnz .hv_12bit
vpbroadcastd m10, [pd_2176]
psllw xmm0, 6
jmp .hv_main
.hv_12bit:
vpbroadcastd m10, [pd_640]
psllw xmm0, 4
psllw xmm1, 2
.hv_main:
movu xm4, [srcq+r6 *2]
vinserti32x4 ym4, [srcq+r6 *1], 1
vinserti32x4 m4, [srcq+ssq*0], 2
vbroadcasti32x4 m6, [spel_h_shufA]
vinserti32x4 m4, [srcq+ssq*1], 3 ; 0 1 2 3
lea srcq, [srcq+ssq*2]
movu xm5, [srcq+ssq*0] ; 4
mova [buf+ 0], xmm0
mova [buf+16], xmm1
vpbroadcastd m8, [buf+ 4]
vpbroadcastd m9, [buf+ 8]
vpbroadcastd ym12, xmm1
vpbroadcastd ym13, [buf+20]
vpbroadcastd ym14, [buf+24]
cmp wd, 4
je .hv_w4
vbroadcasti32x4 m2, [spel_h_shufA]
mova m3, [spel_h_shuf2b]
mova m1, m10
pshufb m4, m6
pshufb xm5, xm6
punpcklqdq m2, m4, m5
vpdpwssd m1, m8, m2 ; 04 1_ 2_ 3_
mova ym6, [spel_h_shuf2a]
punpckhqdq m4, m5
mova xm5, [spel_shuf2]
vpdpwssd m1, m9, m4
vpermb m1, m3, m1 ; 01 12
vextracti32x4 xm2, ym1, 1 ; 23 34
.hv_w2_loop:
movu xm3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti32x4 ym3, [srcq+ssq*0], 1
vpermb ym3, ym6, ym3
pmaddwd xmm0, xm12, xm1 ; a0 b0
mova xm4, xm10
vpdpwssd xm4, xm8, xm3
vextracti32x4 xm3, ym3, 1
mova xm1, xm2
vpdpwssd xmm0, xm13, xm2 ; a1 b1
vpdpwssd xm4, xm9, xm3 ; 5 6
vpermt2b xm2, xm5, xm4 ; 45 56
vpdpwssd xmm0, xm14, xm2 ; a2 b2
psrad xmm0, 10
packusdw xmm0, xmm0
pminsw xmm0, xm15
movd [dstq+dsq*0], xmm0
pextrd [dstq+dsq*1], xmm0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w2_loop
RET
.hv_w4:
vbroadcasti32x4 m7, [spel_h_shufB]
mova ym0, [spel_shuf4a]
pshufb m1, m4, m6
mova m2, m10
vpdpwssd m2, m8, m1
pshufb xm1, xm5, xm6
mova xm3, xm10
vpdpwssd xm3, xm8, xm1
pshufb m4, m7
pshufb xm5, xm7
vpdpwssd m2, m9, m4 ; 0 1 2 3
vpdpwssd xm3, xm9, xm5 ; 4
mova ym5, [spel_shuf4b]
vpermb m1, m0, m2 ; 01 12
vshufi32x4 m2, m3, q1032 ; 2 3 4
vpermb m2, m0, m2 ; 23 34
.hv_w4_loop:
movu xm3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti32x4 ym3, [srcq+ssq*0], 1
pmaddwd ym0, ym12, ym1 ; a0 b0
mova ym1, ym2
pshufb ym4, ym3, ym6
mova ym2, ym10
vpdpwssd ym2, ym8, ym4
pshufb ym3, ym7
vpdpwssd ym0, ym13, ym1 ; a1 b1
vpdpwssd ym2, ym9, ym3 ; 5 6
vpermt2b ym2, ym5, ym1 ; 45 56
vpdpwssd ym0, ym14, ym2 ; a2 b2
psrad ym0, 10
vextracti32x4 xm4, ym0, 1
packusdw xm0, xm4
pminsw xmm0, xm0, xm15
movq [dstq+dsq*0], xmm0
movhps [dstq+dsq*1], xmm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w4_loop
RET
.hv_w8:
shr mxd, 16
pmovsxbw xmm0, [base+subpel_filters+1+mxq*8]
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
pmovsxbw xmm1, [base+subpel_filters+1+myq*8]
mov r6, ssq
sub srcq, 4
neg r6
test dword r8m, 0x800
jnz .hv_w8_12bit
vpbroadcastd m8, [pd_2176]
psllw xmm0, 6
jmp .hv_w8_main
.hv_w8_12bit:
vpbroadcastd m8, [pd_640]
psllw xmm0, 4
psllw xmm1, 2
.hv_w8_main:
mova [buf+ 0], xmm0
mova [buf+16], xmm1
vpbroadcastd m9, xmm0
vpbroadcastd m10, [buf+ 4]
vpbroadcastd m11, [buf+ 8]
vpbroadcastd m12, xmm1
vpbroadcastd m13, [buf+20]
vpbroadcastd m14, [buf+24]
cmp wd, 16
jge .hv_w16
mova m6, [spel_h_shufA]
movu ym16, [srcq+r6 *2]
vinserti32x8 m16, [srcq+r6 *1], 1 ; 0 1
movu ym17, [srcq+ssq*0]
vinserti32x8 m17, [srcq+ssq*1], 1 ; 2 3
lea srcq, [srcq+ssq*2]
movu ym18, [srcq+ssq*0] ; 4
movu m7, [spel_h_shufC]
vpermb m3, m6, m16
mova m1, m8
vpermb m4, m6, m17
vpdpwssd m1, m9, m3 ; a0 b0
mova m2, m8
vpermb m5, m6, m18
vpdpwssd m2, m9, m4 ; c0 d0
mova m0, m8
vpermb m16, m7, m16
vpdpwssd m0, m9, m5 ; e0
vpermb m17, m7, m17
vpdpwssd m1, m11, m16 ; a2 b2
vpermb m18, m7, m18
vpdpwssd m2, m11, m17 ; c2 d2
shufpd m3, m16, 0x55
vpdpwssd m0, m11, m18 ; e2
mova m16, [spel_shuf8a]
shufpd m4, m17, 0x55
vpdpwssd m1, m10, m3 ; a1 b1
shufpd m5, m18, 0x55
vpdpwssd m2, m10, m4 ; c1 d1
vpdpwssd m0, m10, m5 ; e1
mova m5, [spel_shuf8b]
vpermt2b m1, m16, m2 ; 01 12
vpermt2b m2, m16, m0 ; 23 34
.hv_w8_loop:
movu ym18, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti32x8 m18, [srcq+ssq*0], 1
mova m0, m8
vpermb m17, m6, m18
vpdpwssd m0, m9, m17 ; f0 g0
vpermb m18, m7, m18
pmaddwd m16, m12, m1 ; A0 B0
vpdpwssd m0, m11, m18 ; f2 g2
shufpd m17, m18, 0x55
mova m1, m2
vpdpwssd m16, m13, m2 ; A1 B1
vpdpwssd m0, m10, m17 ; f1 g1
vpermt2b m2, m5, m0 ; 45 56
vpdpwssd m16, m14, m2 ; A2 B2
psrad m16, 10
vextracti32x8 ym17, m16, 1
packusdw ym16, ym17
pminsw ym16, ym15
mova [dstq+dsq*0], xm16
vextracti128 [dstq+dsq*1], ym16, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w8_loop
vzeroupper
RET
.hv_w16:
vbroadcasti32x4 m20, [spel_h_shufA]
vbroadcasti32x4 m21, [spel_h_shufB]
jg .hv_w32
vbroadcasti32x8 m6, [srcq+r6 *2+ 8]
vinserti32x8 m2, m6, [srcq+r6 *2+16], 1
vinserti32x8 m6, [srcq+r6 *2+ 0], 0 ; 0
movu ym16, [srcq+r6 *1+ 0]
movu ym17, [srcq+r6 *1+12]
vinserti32x8 m16, [srcq+ssq*0+ 0], 1
vinserti32x8 m17, [srcq+ssq*0+12], 1 ; 1 2
movu ym18, [srcq+ssq*1+ 0]
movu ym19, [srcq+ssq*1+12]
lea srcq, [srcq+ssq*2]
vinserti32x8 m18, [srcq+ssq*0+ 0], 1
vinserti32x8 m19, [srcq+ssq*0+12], 1 ; 3 4
pshufb m2, m20
mova m1, m8
pshufb m3, m16, m20
vpdpwssd m1, m11, m2 ; a2
mova m2, m8
pshufb m4, m17, m21
vpdpwssd m2, m9, m3 ; b0 c0
mova m3, m8
pshufb m5, m18, m20
vpdpwssd m3, m11, m4 ; b2' c2'
mova m4, m8
pshufb m7, m19, m21
vpdpwssd m4, m9, m5 ; d0 e0
mova m5, m8
pshufb m0, m6, m20
vpdpwssd m5, m11, m7 ; d2' e2'
mova m7, [spel_shuf16]
pshufb m16, m21
vpdpwssd m1, m9, m0 ; a0
pshufb m17, m20
vpdpwssd m2, m10, m16 ; b1 c1
pshufb m18, m21
vpdpwssd m3, m10, m17 ; b1' c1'
pshufb m19, m20
vpdpwssd m4, m10, m18 ; d1 e1
pshufb m6, m21
vpdpwssd m5, m10, m19 ; d1' e1'
shufpd m16, m17, 0x55
vpdpwssd m1, m10, m6 ; a1
shufpd m18, m19, 0x55
vpdpwssd m2, m11, m16 ; b2 c2
vpdpwssd m3, m9, m16 ; b0' c0'
vpdpwssd m4, m11, m18 ; d2 e2
vpdpwssd m5, m9, m18 ; d0' e0'
pslldq m1, 1
vpermt2b m2, m7, m3 ; 12
vpermt2b m4, m7, m5 ; 34
vpshrdd m1, m2, 16 ; 01
vpshrdd m3, m2, m4, 16 ; 23
.hv_w16_loop:
movu ym18, [srcq+ssq*1+ 0]
movu ym19, [srcq+ssq*1+12]
lea srcq, [srcq+ssq*2]
vinserti32x8 m18, [srcq+ssq*0+ 0], 1
vinserti32x8 m19, [srcq+ssq*0+12], 1
mova m5, m8
mova m6, m8
pshufb m17, m18, m20
vpdpwssd m5, m9, m17 ; f0 g0
pshufb m16, m19, m21
vpdpwssd m6, m11, m16 ; f2' g2'
pmaddwd m17, m12, m2 ; B0
mova m2, m4
pmaddwd m16, m12, m1 ; A0
mova m1, m3
pshufb m18, m21
vpdpwssd m5, m10, m18 ; f1 g1
pshufb m19, m20
vpdpwssd m6, m10, m19 ; f1' g1'
vpdpwssd m17, m13, m4 ; B1
vpdpwssd m16, m13, m3 ; A1
shufpd m18, m19, 0x55
vpdpwssd m5, m11, m18 ; f2 g2
vpdpwssd m6, m9, m18 ; f0' g0'
mova m4, m7
vpermi2b m4, m5, m6 ; 56
vpshrdd m3, m2, m4, 16 ; 45
vpdpwssd m17, m14, m4 ; B2
vpdpwssd m16, m14, m3 ; A2
psrad m16, 10
psrad m17, 10
vshufi32x4 m18, m16, m17, q3232
vinserti32x8 m16, ym17, 1
packusdw m16, m18
pminsw m16, m15
mova [dstq+dsq*0], ym16
vextracti32x8 [dstq+dsq*1], m16, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w16_loop
vzeroupper
RET
.hv_w32:
WIN64_SPILL_XMM 28
mova m27, [spel_shuf32]
lea wd, [hq+wq*8-256]
.hv_w32_loop0:
movu m16, [srcq+r6 *2+ 0]
movu m7, [srcq+r6 *2+12]
movu m6, [srcq+r6 *1+ 0]
movu m18, [srcq+r6 *1+12]
lea r7, [srcq+ssq*2]
movu m17, [srcq+ssq*0+ 0]
movu m19, [srcq+ssq*0+12]
movu m22, [srcq+ssq*1+ 0]
movu m24, [srcq+ssq*1+12]
mov r8, dstq
movu m23, [r7 +ssq*0+ 0]
movu m25, [r7 +ssq*0+12]
pshufb m1, m16, m20
mova m0, m8
pshufb m2, m7, m21
vpdpwssd m0, m9, m1 ; a0
mova m1, m8
pshufb m4, m6, m20
vpdpwssd m1, m11, m2 ; a2'
mova m2, m8
pshufb m3, m17, m20
vpdpwssd m2, m9, m4 ; b0
mova m4, m8
pshufb m5, m18, m21
vpdpwssd m4, m9, m3 ; c0
mova m3, m8
pshufb m26, m19, m21
vpdpwssd m3, m11, m5 ; b2'
mova m5, m8
pshufb m16, m21
vpdpwssd m5, m11, m26 ; c2'
pshufb m7, m20
vpdpwssd m0, m10, m16 ; a1
pshufb m6, m21
vpdpwssd m1, m10, m7 ; a1'
pshufb m17, m21
vpdpwssd m2, m10, m6 ; b1
pshufb m18, m20
vpdpwssd m4, m10, m17 ; c1
pshufb m19, m20
vpdpwssd m3, m10, m18 ; b1'
shufpd m16, m7, 0x55
vpdpwssd m5, m10, m19 ; c1'
shufpd m6, m18, 0x55
vpdpwssd m0, m11, m16 ; a2
shufpd m17, m19, 0x55
vpdpwssd m1, m9, m16 ; a0'
pshufb m16, m22, m20
vpdpwssd m2, m11, m6 ; b2
pshufb m7, m23, m20
vpdpwssd m4, m11, m17 ; c2
vpdpwssd m3, m9, m6 ; b0'
mova m6, m8
vpdpwssd m5, m9, m17 ; c0'
pshufb m17, m24, m21
vpdpwssd m6, m9, m16 ; d0
mova m16, m8
pshufb m26, m25, m21
vpdpwssd m16, m9, m7 ; e0
mova m7, m8
pshufb m22, m21
vpdpwssd m7, m11, m17 ; d2'
mova m17, m8
pshufb m23, m21
vpdpwssd m17, m11, m26 ; e2'
pshufb m24, m20
vpdpwssd m6, m10, m22 ; d1
pshufb m25, m20
vpdpwssd m16, m10, m23 ; e1
shufpd m22, m24, 0x55
vpdpwssd m7, m10, m24 ; d1'
shufpd m23, m25, 0x55
vpdpwssd m17, m10, m25 ; e1'
pslldq m0, 1
vpdpwssd m6, m11, m22 ; d2
pslldq m1, 1
vpdpwssd m16, m11, m23 ; e2
vpermt2b m2, m27, m4 ; 12
vpdpwssd m7, m9, m22 ; d0'
vpermt2b m3, m27, m5 ; 12'
vpdpwssd m17, m9, m23 ; e0'
vpshrdd m0, m2, 16 ; 01
vpermt2b m6, m27, m16 ; 34
vpshrdd m1, m3, 16 ; 01'
vpermt2b m7, m27, m17 ; 34'
vpshrdd m4, m2, m6, 16 ; 23
vpshrdd m5, m3, m7, 16 ; 23'
.hv_w32_loop:
movu m22, [r7+ssq*1+ 0]
movu m24, [r7+ssq*1+12]
lea r7, [r7+ssq*2]
movu m23, [r7+ssq*0+ 0]
movu m25, [r7+ssq*0+12]
pmaddwd m17, m12, m2 ; B0
mova m2, m6
pmaddwd m19, m12, m3 ; B0'
mova m3, m7
pmaddwd m16, m12, m0 ; A0
mova m0, m4
pmaddwd m18, m12, m1 ; A0'
mova m1, m5
vpdpwssd m17, m13, m6 ; B1
vpdpwssd m19, m13, m7 ; B1'
mova m6, m8
vpdpwssd m16, m13, m4 ; A1
pshufb m4, m22, m20
vpdpwssd m18, m13, m5 ; A1'
pshufb m7, m23, m20
vpdpwssd m6, m9, m4 ; f0
mova m4, m8
pshufb m5, m24, m21
vpdpwssd m4, m9, m7 ; g0
mova m7, m8
pshufb m26, m25, m21
vpdpwssd m7, m11, m5 ; f2'
mova m5, m8
pshufb m22, m21
vpdpwssd m5, m11, m26 ; g2'
pshufb m23, m21
vpdpwssd m6, m10, m22 ; f1
pshufb m24, m20
vpdpwssd m4, m10, m23 ; g1
pshufb m25, m20
vpdpwssd m7, m10, m24 ; f1'
shufpd m22, m24, 0x55
vpdpwssd m5, m10, m25 ; g1'
shufpd m23, m25, 0x55
vpdpwssd m6, m11, m22 ; f2
vpdpwssd m4, m11, m23 ; g2
vpdpwssd m7, m9, m22 ; f0'
vpdpwssd m5, m9, m23 ; g0'
vpermt2b m6, m27, m4 ; 56
vpermt2b m7, m27, m5 ; 56'
vpdpwssd m17, m14, m6 ; B2
vpshrdd m4, m2, m6, 16 ; 45
vpdpwssd m19, m14, m7 ; B2'
vpshrdd m5, m3, m7, 16 ; 45'
vpdpwssd m16, m14, m4 ; A2
vpdpwssd m18, m14, m5 ; A2'
REPX {psrad x, 10}, m17, m19, m16, m18
packusdw m17, m19
packusdw m16, m18
pminsw m17, m15
pminsw m16, m15
mova [r8+dsq*0], m16
mova [r8+dsq*1], m17
lea r8, [r8+dsq*2]
sub hd, 2
jg .hv_w32_loop
add srcq, 64
add dstq, 64
movzx hd, wb
sub wd, 1<<8
jg .hv_w32_loop0
RET
PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_16bpc
PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_16bpc
PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_16bpc
PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_16bpc
PUT_8TAP_FN sharp, SHARP, SHARP
cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
imul mxd, mxm, 0x010101
add mxd, t0d ; 8tap_h, mx, 4tap_h
imul myd, mym, 0x010101
add myd, t1d ; 8tap_v, my, 4tap_v
lea r8, [put_avx512icl]
movifnidn wd, wm
movifnidn hd, hm
test mxd, 0xf00
jnz .h
test myd, 0xf00
jz mangle(private_prefix %+ _put_6tap_16bpc_avx512icl).put
.v:
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
vpbroadcastd m10, [pd_32]
pmovsxbw xmm0, [base+subpel_filters+myq*8]
tzcnt r7d, wd
vpbroadcastw m11, r8m
lea r6, [ssq*3]
movzx r7d, word [r8+r7*2+table_offset(put, _8tap_v)]
sub srcq, r6
mova [rsp+stack_offset+8], xmm0
vpbroadcastd m12, xmm0
add r7, r8
vpbroadcastd m13, [rsp+stack_offset+12]
vpbroadcastd m14, [rsp+stack_offset+16]
vpbroadcastd m15, [rsp+stack_offset+20]
jmp r7
.v_w2:
movd xmm2, [srcq+ssq*0]
pinsrd xmm2, [srcq+ssq*1], 1
pinsrd xmm2, [srcq+ssq*2], 2
add srcq, r6
pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3
movd xmm3, [srcq+ssq*1]
vpbroadcastd xmm1, [srcq+ssq*2]
add srcq, r6
vpbroadcastd xmm0, [srcq+ssq*0]
vpblendd xmm3, xmm1, 0x02 ; 4 5
vpblendd xmm1, xmm0, 0x02 ; 5 6
palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4
punpcklwd xmm3, xmm1 ; 45 56
punpcklwd xmm1, xmm2, xmm4 ; 01 12
punpckhwd xmm2, xmm4 ; 23 34
.v_w2_loop:
vpbroadcastd xmm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova xmm5, xm10
vpdpwssd xmm5, xm12, xmm1 ; a0 b0
mova xmm1, xmm2
vpdpwssd xmm5, xm13, xmm2 ; a1 b1
mova xmm2, xmm3
vpdpwssd xmm5, xm14, xmm3 ; a2 b2
vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7
vpbroadcastd xmm0, [srcq+ssq*0]
vpblendd xmm4, xmm0, 0x02 ; 7 8
punpcklwd xmm3, xmm4 ; 67 78
vpdpwssd xmm5, xm15, xmm3 ; a3 b3
psrad xmm5, 6
packusdw xmm5, xmm5
pminsw xmm5, xm11
movd [dstq+dsq*0], xmm5
pextrd [dstq+dsq*1], xmm5, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w2_loop
RET
.v_w4:
movq xmm1, [srcq+ssq*0]
vpbroadcastq ymm0, [srcq+ssq*1]
vpbroadcastq ymm2, [srcq+ssq*2]
add srcq, r6
vpbroadcastq ymm4, [srcq+ssq*0]
vpbroadcastq ymm3, [srcq+ssq*1]
vpbroadcastq ymm5, [srcq+ssq*2]
add srcq, r6
vpblendd ymm1, ymm0, 0x30
vpblendd ymm0, ymm2, 0x30
punpcklwd ymm1, ymm0 ; 01 12
vpbroadcastq ymm0, [srcq+ssq*0]
vpblendd ymm2, ymm4, 0x30
vpblendd ymm4, ymm3, 0x30
punpcklwd ymm2, ymm4 ; 23 34
vpblendd ymm3, ymm5, 0x30
vpblendd ymm5, ymm0, 0x30
punpcklwd ymm3, ymm5 ; 45 56
.v_w4_loop:
vpbroadcastq ymm5, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mova ymm4, ym10
vpdpwssd ymm4, ym12, ymm1 ; a0 b0
mova ymm1, ymm2
vpdpwssd ymm4, ym13, ymm2 ; a1 b1
mova ymm2, ymm3
vpdpwssd ymm4, ym14, ymm3 ; a2 b2
vpblendd ymm3, ymm0, ymm5, 0x30
vpbroadcastq ymm0, [srcq+ssq*0]
vpblendd ymm5, ymm0, 0x30
punpcklwd ymm3, ymm5 ; 67 78
vpdpwssd ymm4, ym15, ymm3 ; a3 b3
psrad ymm4, 6
vextracti128 xmm5, ymm4, 1
packusdw xmm4, xmm5
pminsw xmm4, xm11
movq [dstq+dsq*0], xmm4
movhps [dstq+dsq*1], xmm4
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w4_loop
vzeroupper
RET
.v_w8:
vbroadcasti32x4 m2, [srcq+ssq*2]
vinserti32x4 m1, m2, [srcq+ssq*0], 0
vinserti32x4 m1, [srcq+ssq*1], 1 ; 0 1 2
add srcq, r6
vinserti32x4 ym2, [srcq+ssq*0], 1
vinserti32x4 m2, [srcq+ssq*1], 2 ; 2 3 4
mova m6, [spel_v_shuf8]
movu xm0, [srcq+ssq*1]
vinserti32x4 ym0, [srcq+ssq*2], 1
add srcq, r6
vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6
vpermb m1, m6, m1 ; 01 12
vpermb m2, m6, m2 ; 23 34
vpermb m3, m6, m0 ; 45 56
.v_w8_loop:
vinserti32x4 m0, [srcq+ssq*1], 3
lea srcq, [srcq+ssq*2]
movu xm5, [srcq+ssq*0]
mova m4, m10
vpdpwssd m4, m12, m1 ; a0 b0
mova m1, m2
vshufi32x4 m0, m5, q1032 ; 6 7 8
vpdpwssd m4, m13, m2 ; a1 b1
mova m2, m3
vpdpwssd m4, m14, m3 ; a2 b2
vpermb m3, m6, m0 ; 67 78
vpdpwssd m4, m15, m3 ; a3 b3
psrad m4, 6
vextracti32x8 ym5, m4, 1
packusdw ym4, ym5
pminsw ym4, ym11
mova [dstq+dsq*0], xm4
vextracti32x4 [dstq+dsq*1], ym4, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
RET
.v_w16:
vbroadcasti32x8 m0, [srcq+ssq*1]
vinserti32x8 m1, m0, [srcq+ssq*2], 1
vinserti32x8 m0, [srcq+ssq*0], 0
mova m8, [spel_v_shuf16]
add srcq, r6
movu ym3, [srcq+ssq*0]
vinserti32x8 m3, [srcq+ssq*1], 1
movu ym5, [srcq+ssq*2]
add srcq, r6
vinserti32x8 m5, [srcq+ssq*0], 1
vpermb m1, m8, m1 ; 12
vpermb m0, m8, m0 ; 01
vpermb m3, m8, m3 ; 34
vpermb m5, m8, m5 ; 56
mova m9, [deint_q_shuf]
vpshrdd m2, m1, m3, 16 ; 23
vpshrdd m4, m3, m5, 16 ; 45
.v_w16_loop:
mova m7, m10
vpdpwssd m7, m12, m1 ; b0
mova m6, m10
vpdpwssd m6, m12, m0 ; a0
mova m1, m3
vpdpwssd m7, m13, m3 ; b1
mova m0, m2
vpdpwssd m6, m13, m2 ; a1
mova m3, m5
vpdpwssd m7, m14, m5 ; b2
mova m2, m4
vpdpwssd m6, m14, m4 ; a2
movu ym5, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti32x8 m5, [srcq+ssq*0], 1
vpermb m5, m8, m5 ; 78
vpshrdd m4, m3, m5, 16 ; 67
vpdpwssd m7, m15, m5 ; b3
vpdpwssd m6, m15, m4 ; a3
psrad m7, 6
psrad m6, 6
packusdw m6, m7
pminsw m6, m11
vpermq m6, m9, m6
mova [dstq+dsq*0], ym6
vextracti32x8 [dstq+dsq*1], m6, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w16_loop
RET
.v_w32:
.v_w64:
.v_w128:
WIN64_SPILL_XMM 23
lea wd, [hq+wq*8-256]
.v_w32_loop0:
movu m16, [srcq+ssq*0]
movu m17, [srcq+ssq*1]
lea r7, [srcq+r6 ]
movu m18, [srcq+ssq*2]
movu m19, [r7 +ssq*0]
mov r8, dstq
movu m20, [r7 +ssq*1]
movu m21, [r7 +ssq*2]
add r7, r6
movu m22, [r7 +ssq*0]
punpcklwd m0, m16, m17 ; 01l
punpckhwd m16, m17 ; 01h
punpcklwd m1, m17, m18 ; 12l
punpckhwd m17, m18 ; 12h
punpcklwd m2, m18, m19 ; 23l
punpckhwd m18, m19 ; 23h
punpcklwd m3, m19, m20 ; 34l
punpckhwd m19, m20 ; 34h
punpcklwd m4, m20, m21 ; 45l
punpckhwd m20, m21 ; 45h
punpcklwd m5, m21, m22 ; 56l
punpckhwd m21, m22 ; 56h
.v_w32_loop:
mova m6, m10
vpdpwssd m6, m12, m0 ; a0l
mova m8, m10
vpdpwssd m8, m12, m16 ; a0h
mova m7, m10
vpdpwssd m7, m12, m1 ; b0l
mova m9, m10
vpdpwssd m9, m12, m17 ; b0h
mova m0, m2
vpdpwssd m6, m13, m2 ; a1l
mova m16, m18
vpdpwssd m8, m13, m18 ; a1h
mova m1, m3
vpdpwssd m7, m13, m3 ; b1l
mova m17, m19
vpdpwssd m9, m13, m19 ; b1h
mova m2, m4
vpdpwssd m6, m14, m4 ; a2l
mova m18, m20
vpdpwssd m8, m14, m20 ; a2h
mova m3, m5
vpdpwssd m7, m14, m5 ; b2l
mova m19, m21
vpdpwssd m9, m14, m21 ; b2h
movu m21, [r7+ssq*1]
lea r7, [r7+ssq*2]
punpcklwd m4, m22, m21 ; 67l
punpckhwd m20, m22, m21 ; 67h
movu m22, [r7+ssq*0]
vpdpwssd m6, m15, m4 ; a3l
vpdpwssd m8, m15, m20 ; a3h
punpcklwd m5, m21, m22 ; 78l
punpckhwd m21, m22 ; 78h
vpdpwssd m7, m15, m5 ; b3l
vpdpwssd m9, m15, m21 ; b3h
REPX {psrad x, 6}, m6, m8, m7, m9
packusdw m6, m8
packusdw m7, m9
pminsw m6, m11
pminsw m7, m11
mova [r8+dsq*0], m6
mova [r8+dsq*1], m7
lea r8, [r8+dsq*2]
sub hd, 2
jg .v_w32_loop
add srcq, 64
add dstq, 64
movzx hd, wb
sub wd, 1<<8
jg .v_w32_loop0
RET
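; .h_w2/.h_w4/.h_w8 are the width-specialized horizontal loops; they are
; entered from .h further down once the rounding constant is set up.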
.h_w2:
RESET_STACK_STATE
mova ym2, [spel_h_shuf2a]
sub srcq, 2
pshufd xmm3, xmm0, q1111
pshufd xmm4, xmm0, q2222
.h_w2_loop:
movu xm1, [srcq+ssq*0]
vinserti32x4 ym1, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
mova xmm0, xm8
vpermb ym1, ym2, ym1
vpdpwssd xmm0, xmm3, xm1
vextracti32x4 xm1, ym1, 1
vpdpwssd xmm0, xmm4, xm1
psrad xmm0, 6
packusdw xmm0, xmm0
pminsw xmm0, xm15
movd [dstq+dsq*0], xmm0
pextrd [dstq+dsq*1], xmm0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w2_loop
RET
.h_w4:
movzx mxd, mxb
pmovsxbw xmm0, [base+subpel_filters+mxq*8]
jl .h_w2
vbroadcasti32x4 ym4, [spel_h_shufA]
vbroadcasti32x4 ym5, [spel_h_shufB]
sub srcq, 2
pshufd xmm0, xmm0, q2211
vpbroadcastq ym6, xmm0
vpermq ym7, ymm0, q1111
.h_w4_loop:
movu xm2, [srcq+ssq*0]
vinserti32x4 ym2, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
mova ym0, ym8
pshufb ym1, ym2, ym4
vpdpwssd ym0, ym6, ym1
pshufb ym2, ym5
vpdpwssd ym0, ym7, ym2
psrad ym0, 6
vextracti32x4 xm1, ym0, 1
packusdw xm0, xm1
pminsw xmm0, xm0, xm15
movq [dstq+dsq*0], xmm0
movhps [dstq+dsq*1], xmm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w4_loop
RET
.h_w8:
mova m4, [spel_h_shufA]
movu m5, [spel_h_shufB]
movu m6, [spel_h_shufC]
mova m7, [spel_h_shufD]
.h_w8_loop:
movu ym2, [srcq+ssq*0]
vinserti32x8 m2, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
mova m0, m8
vpermb m1, m4, m2
vpdpwssd m0, m10, m1
vpermb m1, m5, m2
vpdpwssd m0, m11, m1
vpermb m1, m6, m2
vpdpwssd m0, m12, m1
vpermb m1, m7, m2
vpdpwssd m0, m13, m1
psrad m0, 6
vextracti32x8 ym1, m0, 1
packusdw ym0, ym1
pminsw ym0, ym15
mova [dstq+dsq*0], xm0
vextracti32x4 [dstq+dsq*1], ym0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w8_loop
RET
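; horizontal-only filtering: m15 = clip max, m8 = per-bitdepth rounding.
; Widths up to 4 branch to the small loops above, w == 16 is filtered in the
; loop below, larger widths use .h_w32; for w > 4 the coefficient pairs are
; broadcast into m10-m13.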
.h:
vpbroadcastw m15, r8m
test myd, 0xf00
jnz .hv
mov r7d, r8m
shr r7d, 11
vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4]
cmp wd, 4
jle .h_w4
shr mxd, 16
sub srcq, 6
pmovsxbw xmm0, [base+subpel_filters+mxq*8]
mova [buf], xmm0
vpbroadcastd m10, xmm0
vpbroadcastd m11, [buf+ 4]
vpbroadcastd m12, [buf+ 8]
vpbroadcastd m13, [buf+12]
sub wd, 16
jl .h_w8
vbroadcasti32x4 m6, [spel_h_shufA]
vbroadcasti32x4 m7, [spel_h_shufB]
jg .h_w32
.h_w16_loop:
movu ym2, [srcq+ssq*0+ 0]
vinserti32x8 m2, [srcq+ssq*1+ 0], 1
movu ym3, [srcq+ssq*0+16]
vinserti32x8 m3, [srcq+ssq*1+16], 1
lea srcq, [srcq+ssq*2]
mova m0, m8
mova m1, m8
pshufb m4, m2, m6
vpdpwssd m0, m10, m4 ; a0
pshufb m4, m3, m6
vpdpwssd m1, m12, m4 ; b2
pshufb m4, m2, m7
vpdpwssd m0, m11, m4 ; a1
pshufb m4, m3, m7
vpdpwssd m1, m13, m4 ; b3
shufpd m2, m3, 0x55
pshufb m4, m2, m6
vpdpwssd m0, m12, m4 ; a2
vpdpwssd m1, m10, m4 ; b0
pshufb m2, m7
vpdpwssd m0, m13, m2 ; a3
vpdpwssd m1, m11, m2 ; b1
psrad m0, 6
psrad m1, 6
packusdw m0, m1
pminsw m0, m15
mova [dstq+dsq*0], ym0
vextracti32x8 [dstq+dsq*1], m0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w16_loop
RET
.h_w32:
lea srcq, [srcq+wq*2]
lea dstq, [dstq+wq*2]
neg wq
.h_w32_loop0:
mov r6, wq
.h_w32_loop:
movu m2, [srcq+r6*2+ 0]
movu m3, [srcq+r6*2+ 8]
mova m0, m8
mova m1, m8
pshufb m4, m2, m6
vpdpwssd m0, m10, m4 ; a0
pshufb m4, m3, m6
vpdpwssd m1, m10, m4 ; b0
vpdpwssd m0, m12, m4 ; a2
movu m4, [srcq+r6*2+16]
pshufb m3, m7
vpdpwssd m1, m11, m3 ; b1
vpdpwssd m0, m13, m3 ; a3
pshufb m3, m4, m6
vpdpwssd m1, m12, m3 ; b2
pshufb m2, m7
vpdpwssd m0, m11, m2 ; a1
pshufb m4, m7
vpdpwssd m1, m13, m4 ; b3
psrad m0, 6
psrad m1, 6
packusdw m0, m1
pminsw m0, m15
mova [dstq+r6*2], m0
add r6, 32
jl .h_w32_loop
add srcq, ssq
add dstq, dsq
dec hd
jg .h_w32_loop0
RET
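; combined h+v filtering: the horizontal pass keeps extra precision (rounding
; constant pd_2176 or pd_640 depending on bitdepth) and the vertical pass
; shifts the final sum right by 10 before clipping.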
.hv:
cmp wd, 4
jg .hv_w8
movzx mxd, mxb
pmovsxbw xmm0, [base+subpel_filters+mxq*8]
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
pmovsxbw xmm1, [base+subpel_filters+myq*8]
lea r6, [ssq*3]
sub srcq, 2
sub srcq, r6
test dword r8m, 0x800
jnz .hv_12bit
vpbroadcastd m10, [pd_2176]
psllw xmm0, 6
jmp .hv_main
.hv_12bit:
vpbroadcastd m10, [pd_640]
psllw xmm0, 4
psllw xmm1, 2
.hv_main:
mova [buf+ 0], xmm0
mova [buf+16], xmm1
vpbroadcastd m8, [buf+ 4]
vpbroadcastd m9, [buf+ 8]
vpbroadcastd ym11, xmm1
vpbroadcastd ym12, [buf+20]
vpbroadcastd ym13, [buf+24]
vpbroadcastd ym14, [buf+28]
movu xm4, [srcq+ssq*0]
vinserti32x4 ym4, [srcq+ssq*1], 1
vinserti32x4 m4, [srcq+ssq*2], 2
add srcq, r6
vinserti32x4 m4, [srcq+ssq*0], 3 ; 0 1 2 3
movu xm0, [srcq+ssq*1]
vinserti32x4 ym0, [srcq+ssq*2], 1
add srcq, r6
vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6
cmp wd, 4
je .hv_w4
vbroadcasti32x4 m2, [spel_h_shufA]
mova m3, [spel_h_shuf2b]
mova ym6, [spel_h_shuf2a]
mova xm7, [spel_shuf2]
mova m1, m10
pshufb m4, m2
pshufb m0, m2
punpcklqdq m2, m4, m0
vpdpwssd m1, m8, m2 ; 04 15 26 3_
punpckhqdq m4, m0
vpdpwssd m1, m9, m4
vpermb m1, m3, m1 ; 01 12
vextracti32x4 xm2, ym1, 1 ; 23 34
vextracti32x4 xm3, m1, 2 ; 45 56
.hv_w2_loop:
movu xm5, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti32x4 ym5, [srcq+ssq*0], 1
mova xm4, xm10
vpermb ym5, ym6, ym5
pmaddwd xmm0, xm11, xm1 ; a0 b0
vpdpwssd xm4, xm8, xm5
vextracti32x4 xm5, ym5, 1
mova xm1, xm2
vpdpwssd xmm0, xm12, xm2 ; a1 b1
vpdpwssd xm4, xm9, xm5 ; 7 8
mova xm2, xm3
vpdpwssd xmm0, xm13, xm3 ; a2 b2
vpermt2b xm3, xm7, xm4 ; 67 78
vpdpwssd xmm0, xm14, xm3 ; a3 b3
psrad xmm0, 10
packusdw xmm0, xmm0
pminsw xmm0, xm15
movd [dstq+dsq*0], xmm0
pextrd [dstq+dsq*1], xmm0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w2_loop
RET
.hv_w4:
vbroadcasti32x4 m19, [spel_h_shufA]
vbroadcasti32x4 m20, [spel_h_shufB]
mova ym6, [spel_shuf4a]
mova ym7, [spel_shuf4b]
mova m2, m10
mova m3, m10
pshufb m1, m4, m19
vpdpwssd m2, m8, m1
pshufb m1, m0, m19
vpdpwssd m3, m8, m1
pshufb m4, m20
vpdpwssd m2, m9, m4
pshufb m0, m20
vpdpwssd m3, m9, m0
vpermb m1, m6, m2 ; 01 12
vshufi32x4 m2, m3, q1032
vpermb m3, m6, m3 ; 45 56
vpermb m2, m6, m2 ; 23 34
.hv_w4_loop:
movu xm18, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti128 ym18, [srcq+ssq*0], 1
pmaddwd ym16, ym11, ym1 ; a0 b0
mova ym1, ym2
mova ym2, ym3
pshufb ym17, ym18, ym19
mova ym3, ym10
vpdpwssd ym3, ym8, ym17
pshufb ym18, ym20
vpdpwssd ym16, ym12, ym1 ; a1 b1
vpdpwssd ym3, ym9, ym18 ; 7 8
vpdpwssd ym16, ym13, ym2 ; a2 b2
vpermt2b ym3, ym7, ym2 ; 67 78
vpdpwssd ym16, ym14, ym3 ; a3 b3
psrad ym16, 10
vextracti128 xm17, ym16, 1
packusdw xm16, xm17
pminsw xm16, xm15
movq [dstq+dsq*0], xm16
movhps [dstq+dsq*1], xm16
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w4_loop
vzeroupper
RET
.hv_w8:
shr mxd, 16
pmovsxbw xmm0, [base+subpel_filters+mxq*8]
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
pmovsxbw xmm1, [base+subpel_filters+myq*8]
lea r6, [ssq*3]
sub srcq, 6
sub srcq, r6
test dword r8m, 0x800
jnz .hv_w8_12bit
vpbroadcastd m10, [pd_2176]
psllw xmm0, 6
jmp .hv_w8_main
.hv_w8_12bit:
vpbroadcastd m10, [pd_640]
psllw xmm0, 4
psllw xmm1, 2
.hv_w8_main:
mova [buf+ 0], xmm0
mova [buf+16], xmm1
vpbroadcastd m11, xmm0
vpbroadcastd m12, [buf+ 4]
vpbroadcastd m13, [buf+ 8]
vpbroadcastd m14, [buf+12]
vpbroadcastd m16, xmm1
vpbroadcastd m17, [buf+20]
vpbroadcastd m18, [buf+24]
vpbroadcastd m19, [buf+28]
cmp wd, 8
jg .hv_w16
mova m5, [spel_h_shufA]
movu ym0, [srcq+ssq*0]
vinserti32x8 m0, [srcq+ssq*1], 1 ; 0 1
movu ym9, [srcq+ssq*2]
add srcq, r6
vinserti32x8 m9, [srcq+ssq*0], 1 ; 2 3
movu ym20, [srcq+ssq*1]
vinserti32x8 m20, [srcq+ssq*2], 1 ; 4 5
add srcq, r6
movu ym21, [srcq+ssq*0] ; 6
movu m6, [spel_h_shufB]
movu m7, [spel_h_shufC]
vpermb m8, m5, m0
mova m1, m10
vpdpwssd m1, m11, m8 ; a0 b0
vpermb m8, m5, m9
mova m2, m10
vpdpwssd m2, m11, m8 ; c0 d0
vpermb m8, m5, m20
mova m3, m10
vpdpwssd m3, m11, m8 ; e0 f0
vpermb m8, m5, m21
mova m4, m10
vpdpwssd m4, m11, m8 ; g0
vpermb m8, m6, m0
vpdpwssd m1, m12, m8 ; a1 b1
vpermb m8, m6, m9
vpdpwssd m2, m12, m8 ; c1 d1
vpermb m8, m6, m20
vpdpwssd m3, m12, m8 ; e1 f1
vpermb m8, m6, m21
vpdpwssd m4, m12, m8 ; g1
vpermb m8, m7, m0
vpdpwssd m1, m13, m8 ; a2 b2
vpermb m8, m7, m9
vpdpwssd m2, m13, m8 ; c2 d2
vpermb m8, m7, m20
vpdpwssd m3, m13, m8 ; e2 f2
vpermb m8, m7, m21
vpdpwssd m4, m13, m8 ; g2
mova m8, [spel_h_shufD]
vpermb m0, m8, m0
vpdpwssd m1, m14, m0 ; a3 b3
mova m0, [spel_shuf8a]
vpermb m9, m8, m9
vpdpwssd m2, m14, m9 ; c3 d3
mova m9, [spel_shuf8b]
vpermb m20, m8, m20
vpdpwssd m3, m14, m20 ; e3 f3
vpermb m21, m8, m21
vpdpwssd m4, m14, m21 ; g3
vpermt2b m1, m0, m2 ; 01 12
vpermt2b m2, m0, m3 ; 23 34
vpermt2b m3, m0, m4 ; 45 56
.hv_w8_loop:
movu ym0, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti32x8 m0, [srcq+ssq*0], 1
mova m4, m10
vpermb m21, m5, m0
vpdpwssd m4, m11, m21 ; h0 i0
vpermb m21, m6, m0
pmaddwd m20, m16, m1 ; A0 B0
vpdpwssd m4, m12, m21 ; h1 i1
vpermb m21, m7, m0
mova m1, m2
vpdpwssd m20, m17, m2 ; A1 B1
vpdpwssd m4, m13, m21 ; h2 i2
vpermb m21, m8, m0
mova m2, m3
vpdpwssd m20, m18, m3 ; A2 B2
vpdpwssd m4, m14, m21 ; h3 i3
vpermt2b m3, m9, m4 ; 67 78
vpdpwssd m20, m19, m3 ; A3 B3
psrad m20, 10
vextracti32x8 ym21, m20, 1
packusdw ym20, ym21
pminsw ym20, ym15
mova [dstq+dsq*0], xm20
vextracti128 [dstq+dsq*1], ym20, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w8_loop
vzeroupper
RET
.hv_w16:
WIN64_SPILL_XMM 26
vbroadcasti32x4 m20, [spel_h_shufA]
vbroadcasti32x4 m21, [spel_h_shufB]
add wd, wd
mova m9, [spel_shuf16]
lea wd, [hq+wq*8-256]
.hv_w16_loop0:
vbroadcasti32x8 m5, [srcq+ssq*0+ 8]
vinserti32x8 m4, m5, [srcq+ssq*0+ 0], 0
vinserti32x8 m5, [srcq+ssq*0+16], 1 ; 0
movu ym6, [srcq+ssq*1+ 0]
movu ym7, [srcq+ssq*1+16]
lea r7, [srcq+r6]
vinserti32x8 m6, [srcq+ssq*2+ 0], 1
vinserti32x8 m7, [srcq+ssq*2+16], 1 ; 1 2
movu ym22, [r7 +ssq*0+ 0]
movu ym23, [r7 +ssq*0+16]
mov r8, dstq
vinserti32x8 m22, [r7 +ssq*1+ 0], 1
vinserti32x8 m23, [r7 +ssq*1+16], 1 ; 3 4
movu ym24, [r7 +ssq*2+ 0]
movu ym25, [r7 +ssq*2+16]
add r7, r6
vinserti32x8 m24, [r7 +ssq*0+ 0], 1
vinserti32x8 m25, [r7 +ssq*0+16], 1 ; 5 6
pshufb m0, m4, m20
mova m1, m10
vpdpwssd m1, m11, m0 ; a0
pshufb m0, m6, m20
mova m2, m10
vpdpwssd m2, m11, m0 ; b0
pshufb m0, m7, m20
mova m3, m10
vpdpwssd m3, m13, m0 ; c2
pshufb m0, m4, m21
vpdpwssd m1, m12, m0 ; a1
pshufb m0, m6, m21
vpdpwssd m2, m12, m0 ; b1
pshufb m0, m7, m21
vpdpwssd m3, m14, m0 ; c3
pshufb m0, m5, m20
vpdpwssd m1, m13, m0 ; a2
shufpd m6, m7, 0x55
pshufb m7, m6, m20
vpdpwssd m2, m13, m7 ; b2
vpdpwssd m3, m11, m7 ; c0
pshufb m5, m21
vpdpwssd m1, m14, m5 ; a3
pshufb m6, m21
vpdpwssd m2, m14, m6 ; b3
vpdpwssd m3, m12, m6 ; c1
pshufb m0, m22, m20
mova m4, m10
vpdpwssd m4, m11, m0 ; d0
pshufb m0, m23, m20
mova m5, m10
vpdpwssd m5, m13, m0 ; e2
pshufb m0, m24, m20
mova m6, m10
vpdpwssd m6, m11, m0 ; f0
pshufb m0, m25, m20
mova m7, m10
vpdpwssd m7, m13, m0 ; g2
pshufb m0, m22, m21
vpdpwssd m4, m12, m0 ; d1
pshufb m0, m23, m21
vpdpwssd m5, m14, m0 ; e3
pshufb m0, m24, m21
vpdpwssd m6, m12, m0 ; f1
pshufb m0, m25, m21
vpdpwssd m7, m14, m0 ; g3
shufpd m22, m23, 0x55
pshufb m23, m22, m20
vpdpwssd m4, m13, m23 ; d2
vpdpwssd m5, m11, m23 ; e0
shufpd m24, m25, 0x55
pshufb m25, m24, m20
vpdpwssd m6, m13, m25 ; f2
vpdpwssd m7, m11, m25 ; g0
pshufb m22, m21
vpdpwssd m4, m14, m22 ; d3
vpdpwssd m5, m12, m22 ; e1
pshufb m24, m21
vpdpwssd m6, m14, m24 ; f3
vpdpwssd m7, m12, m24 ; g1
pslldq m1, 1
vpermt2b m2, m9, m3 ; 12
vpermt2b m4, m9, m5 ; 34
vpermt2b m6, m9, m7 ; 56
vpshrdd m1, m2, 16 ; 01
vpshrdd m3, m2, m4, 16 ; 23
vpshrdd m5, m4, m6, 16 ; 45
.hv_w16_loop:
movu ym24, [r7+ssq*1+ 0]
movu ym25, [r7+ssq*1+16]
lea r7, [r7+ssq*2]
vinserti32x8 m24, [r7+ssq*0+ 0], 1
vinserti32x8 m25, [r7+ssq*0+16], 1
mova m7, m10
mova m8, m10
pshufb m0, m24, m20
vpdpwssd m7, m11, m0 ; h0
pshufb m0, m25, m20
vpdpwssd m8, m13, m0 ; i2
pmaddwd m22, m16, m1 ; A0
mova m1, m3
pmaddwd m23, m16, m2 ; B0
mova m2, m4
pshufb m0, m24, m21
vpdpwssd m7, m12, m0 ; h1
pshufb m0, m25, m21
vpdpwssd m8, m14, m0 ; i3
vpdpwssd m22, m17, m3 ; A1
mova m3, m5
vpdpwssd m23, m17, m4 ; B1
mova m4, m6
shufpd m24, m25, 0x55
pshufb m25, m24, m20
vpdpwssd m7, m13, m25 ; h2
vpdpwssd m8, m11, m25 ; i0
vpdpwssd m22, m18, m5 ; A2
vpdpwssd m23, m18, m6 ; B2
pshufb m24, m21
vpdpwssd m7, m14, m24 ; h3
vpdpwssd m8, m12, m24 ; i1
vpermt2b m7, m9, m8 ; 78
vpshrdd m5, m6, m7, 16 ; 67
vpdpwssd m22, m19, m5 ; A3
vpdpwssd m23, m19, m7 ; B3
mova m6, m7
psrad m22, 10
psrad m23, 10
vshufi32x4 m0, m22, m23, q3232
vinserti32x8 m22, ym23, 1
packusdw m22, m0
pminsw m22, m15
mova [r8+dsq*0], ym22
vextracti32x8 [r8+dsq*1], m22, 1
lea r8, [r8+dsq*2]
sub hd, 2
jg .hv_w16_loop
add srcq, 32
add dstq, 32
movzx hd, wb
sub wd, 1<<8
jg .hv_w16_loop0
RET
%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif
%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_16bpc
PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_16bpc
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_16bpc
PREP_8TAP_FN regular, REGULAR, REGULAR
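; prep_6tap writes intermediate, unclipped predictions to the tmp buffer for
; the compound functions further down. It covers the filter combinations
; without SHARP, whose outermost taps are zero (note the +1 offset into
; subpel_filters), so only 6 taps are applied.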
cglobal prep_6tap_16bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my
%define base r7-prep_avx512icl
imul mxd, mxm, 0x010101
add mxd, t0d ; 6tap_h, mx, 4tap_h
imul myd, mym, 0x010101
add myd, t1d ; 6tap_v, my, 4tap_v
lea r7, [prep_avx512icl]
mov wd, wm
movifnidn hd, hm
test mxd, 0xf00
jnz .h
test myd, 0xf00
jnz .v
.prep:
tzcnt wd, wd
mov r5d, r7m ; bitdepth_max
vpbroadcastd m5, [pw_8192]
movzx wd, word [r7+wq*2+table_offset(prep,)]
shr r5d, 11
vpbroadcastd m4, [r7-prep_avx512icl+prep_mul+r5*4]
add wq, r7
lea r6, [ssq*3]
%if WIN64
pop r7
%endif
jmp wq
.h_w8:
mova m6, [spel_h_shufA]
movu m7, [spel_h_shufC]
mova m8, [prep_endB]
.h_w8_loop:
movu ym4, [srcq+ssq*0]
vinserti32x8 m4, [srcq+ssq*1], 1
movu ym5, [srcq+ssq*2]
vinserti32x8 m5, [srcq+r6 ], 1
lea srcq, [srcq+ssq*4]
mova m0, m10
mova m1, m10
vpermb m2, m6, m4
vpermb m3, m6, m5
vpdpwssd m0, m12, m2 ; a0 b0
vpdpwssd m1, m12, m3 ; c0 d0
vpermb m4, m7, m4
vpermb m5, m7, m5
vpdpwssd m0, m14, m4 ; a2 b2
vpdpwssd m1, m14, m5 ; c2 d2
shufpd m2, m4, 0x55
shufpd m3, m5, 0x55
vpdpwssd m0, m13, m2 ; a1 b1
vpdpwssd m1, m13, m3 ; c1 d1
vpermt2b m0, m8, m1
mova [tmpq], m0
add tmpq, 64
sub hd, 4
jg .h_w8_loop
RET
.h:
vpbroadcastd m10, [prep_8tap_rnd]
test myd, 0xf00
jnz .hv
lea r6, [ssq*3]
cmp wd, 4
je mangle(private_prefix %+ _prep_8tap_16bpc_avx512icl).h_w4
shr mxd, 16
pmovsxbw xmm0, [base+subpel_filters+1+mxq*8]
mov r5d, r7m
sub srcq, 4
shr r5d, 11
psllw xmm0, [base+prep_hv_shift+r5*8]
mova [tmpq], xmm0
vpbroadcastd m12, xmm0
vpbroadcastd m13, [tmpq+ 4]
vpbroadcastd m14, [tmpq+ 8]
cmp wd, 16
jl .h_w8
vbroadcasti32x4 m5, [spel_h_shufA]
vbroadcasti32x4 m6, [spel_h_shufB]
mova m7, [prep_endC]
jg .h_w32
.h_w16_loop:
movu ym2, [srcq+ssq*0+ 0]
vinserti32x8 m2, [srcq+ssq*1+ 0], 1
movu ym3, [srcq+ssq*0+12]
vinserti32x8 m3, [srcq+ssq*1+12], 1
lea srcq, [srcq+ssq*2]
mova m0, m10
mova m1, m10
pshufb m4, m2, m5 ; 01
vpdpwssd m0, m12, m4 ; a0 b0
pshufb m4, m3, m6 ; 89
vpdpwssd m1, m14, m4 ; a2' b2'
pshufb m2, m6 ; 23
pshufb m3, m5 ; 67
vpdpwssd m0, m13, m2 ; a1 b1
vpdpwssd m1, m13, m3 ; a1' b1'
shufpd m2, m3, 0x55 ; 45
vpdpwssd m0, m14, m2 ; a2 b2
vpdpwssd m1, m12, m2 ; a0' b0'
vpermt2b m0, m7, m1
mova [tmpq], m0
add tmpq, 64
sub hd, 2
jg .h_w16_loop
RET
.h_w32:
lea srcq, [srcq+wq*2]
neg wq
.h_w32_loop0:
mov r6, wq
.h_w32_loop:
movu m2, [srcq+r6*2+ 0]
movu m3, [srcq+r6*2+12]
mova m0, m10
mova m1, m10
pshufb m4, m2, m5
vpdpwssd m0, m12, m4
pshufb m4, m3, m6
vpdpwssd m1, m14, m4
pshufb m2, m6
pshufb m3, m5
vpdpwssd m0, m13, m2
vpdpwssd m1, m13, m3
shufpd m2, m3, 0x55
vpdpwssd m0, m14, m2
vpdpwssd m1, m12, m2
vpermt2b m0, m7, m1
mova [tmpq], m0
add tmpq, 64
add r6, 32
jl .h_w32_loop
add srcq, ssq
dec hd
jg .h_w32_loop0
RET
.v:
movzx mxd, myb
shr myd, 16
cmp hd, 4
cmove myd, mxd
mov r5d, r7m
vpbroadcastd m10, [prep_8tap_rnd]
pmovsxbw xmm0, [base+subpel_filters+1+myq*8]
tzcnt r6d, wd
shr r5d, 11
movzx r6d, word [r7+r6*2+table_offset(prep, _6tap_v)]
psllw xmm0, [base+prep_hv_shift+r5*8]
add r7, r6
mova [tmpq], xmm0
vpbroadcastd m12, xmm0
mov r6, ssq
vpbroadcastd m13, [tmpq+ 4]
neg r6
vpbroadcastd m14, [tmpq+ 8]
jmp r7
.v_w4:
mov r3d, 0x330c
movq xm1, [srcq+r6 *2]
kmovw k1, r3d
vpbroadcastq ym1{k1}, [srcq+r6 *1]
vpbroadcastq m2, [srcq+ssq*0]
vinserti32x4 m1{k1}, m2, [srcq+ssq*1], 3
movq xm0, [srcq+ssq*2]
mova ym4, [prep_endA]
valignq m0, m1, 2
punpcklwd m1, m0 ; 01 12 23 34
.v_w4_loop:
lea srcq, [srcq+ssq*4]
movq xm2, [srcq+r6 *1]
vpbroadcastq ym2{k1}, [srcq+ssq*0]
vpbroadcastq m3, [srcq+ssq*1]
vinserti32x4 m2{k1}, m3, [srcq+ssq*2], 3
mova m3, m10
vpdpwssd m3, m12, m1 ; a0 b0 c0 d0
valignq m0, m2, m0, 6 ; 4 5 6 7
punpcklwd m0, m2 ; 45 56 67 78
vpdpwssd m3, m14, m0 ; a2 b2 c2 d2
vshufi32x4 m1, m0, q1032 ; 23 34 45 56
vpdpwssd m3, m13, m1 ; a1 b1 c1 d1
mova m1, m0
mova m0, m2
vpermb m3, m4, m3
mova [tmpq], ym3
add tmpq, 32
sub hd, 4
jg .v_w4_loop
RET
.v_w8:
vbroadcasti32x4 ym1, [srcq+r6 *1]
mov r3d, 0x33
vbroadcasti32x4 m2, [srcq+ssq*0]
kmovb k1, r3d
mova m6, [spel_v_shuf8]
vinserti64x2 m1{k1}, m2, [srcq+r6 *2], 0 ; 0 1 2
vbroadcasti32x4 ym0, [srcq+ssq*1]
vinserti64x2 m0{k1}, m2, [srcq+ssq*2], 2 ; 2 3 4
mova m7, [prep_endB]
vpermb m1, m6, m1 ; 01 12
vpermb m2, m6, m0 ; 23 34
.v_w8_loop:
lea srcq, [srcq+ssq*4]
vbroadcasti32x4 ym3, [srcq+r6 *1]
movu xm4, [srcq+ssq*0]
vshufi64x2 m3{k1}, m0, m4, q1032 ; 4 5 6
vbroadcasti32x4 ym0, [srcq+ssq*1]
vinserti64x2 m0{k1}, m4, [srcq+ssq*2], 2 ; 6 7 8
mova m4, m10
vpdpwssd m4, m12, m1 ; a0 b0
mova m5, m10
vpdpwssd m5, m12, m2 ; c0 d0
vpermb m1, m6, m3 ; 45 56
vpdpwssd m4, m13, m2 ; a1 b1
vpermb m2, m6, m0 ; 67 78
vpdpwssd m5, m13, m1 ; c1 d1
vpdpwssd m4, m14, m1 ; a2 b2
vpdpwssd m5, m14, m2 ; c2 d2
vpermt2b m4, m7, m5
mova [tmpq], m4
add tmpq, 64
sub hd, 4
jg .v_w8_loop
RET
.v_w16:
vbroadcasti32x8 m0, [srcq+r6 *1]
vinserti32x8 m1, m0, [srcq+ssq*0], 1 ; 1 2
vinserti32x8 m0, [srcq+r6 *2], 0 ; 0 1
mova m6, [spel_v_shuf16]
movu ym3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vinserti32x8 m3, [srcq+ssq*0], 1 ; 3 4
mova m7, [prep_endA]
vpermb m1, m6, m1 ; 12
vpermb m0, m6, m0 ; 01
vpermb m3, m6, m3 ; 34
vpshrdd m2, m1, m3, 16 ; 23
.v_w16_loop:
mova m5, m10
vpdpwssd m5, m12, m1 ; b0
mova m4, m10
vpdpwssd m4, m12, m0 ; a0
mova m1, m3
vpdpwssd m5, m13, m3 ; b1
movu ym3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vpdpwssd m4, m13, m2 ; a1
vinserti32x8 m3, [srcq+ssq*0], 1
mova m0, m2
vpermb m3, m6, m3 ; 56
vpshrdd m2, m1, m3, 16 ; 45
vpdpwssd m5, m14, m3 ; b2
vpdpwssd m4, m14, m2 ; a2
vpermt2b m4, m7, m5
mova [tmpq], m4
add tmpq, 64
sub hd, 2
jg .v_w16_loop
RET
.v_w32:
.v_w64:
.v_w128:
%if WIN64
push r8
%endif
mova m11, [prep_endC]
lea r5, [hq+wq*8-256]
.v_w32_loop0:
movu m4, [srcq+r6 *2]
movu m5, [srcq+r6 *1]
lea r7, [srcq+ssq*2]
movu m6, [srcq+ssq*0]
movu m7, [srcq+ssq*1]
mov r8, tmpq
movu m8, [r7 +ssq*0]
punpcklwd m0, m4, m5 ; 01
punpckhwd m4, m5
punpcklwd m1, m5, m6 ; 12
punpckhwd m5, m6
punpcklwd m2, m6, m7 ; 23
punpckhwd m6, m7
punpcklwd m3, m7, m8 ; 34
punpckhwd m7, m8
.v_w32_loop:
mova m16, m10
movu m9, [r7+ssq*1]
mova m18, m10
vpdpwssd m16, m12, m0 ; a0
mova m17, m10
vpdpwssd m18, m12, m4
mova m19, m10
vpdpwssd m17, m12, m1 ; b0
lea r7, [r7+ssq*2]
vpdpwssd m19, m12, m5
mova m0, m2
vpdpwssd m16, m13, m2 ; a1
punpcklwd m2, m8, m9 ; 45
mova m4, m6
vpdpwssd m18, m13, m6
punpckhwd m6, m8, m9
movu m8, [r7+ssq*0]
vpdpwssd m17, m13, m3 ; b1
mova m1, m3
vpdpwssd m19, m13, m7
mova m5, m7
vpdpwssd m16, m14, m2 ; a2
punpcklwd m3, m9, m8 ; 56
vpdpwssd m18, m14, m6
punpckhwd m7, m9, m8
vpdpwssd m17, m14, m3 ; b2
vpdpwssd m19, m14, m7
vpermt2b m16, m11, m18
vpermt2b m17, m11, m19
mova [r8+wq*0], m16
mova [r8+wq*2], m17
lea r8, [r8+wq*4]
sub hd, 2
jg .v_w32_loop
add srcq, 64
add tmpq, 64
movzx hd, r5b
sub r5d, 1<<8
jg .v_w32_loop0
%if WIN64
pop r8
%endif
vzeroupper
RET
.hv_w4:
movzx mxd, mxb
pmovsxbw xmm0, [base+subpel_filters+mxq*8]
movzx mxd, myb
shr myd, 16
cmp hd, 4
cmove myd, mxd
mov r5d, r7m
pmovsxbw xmm1, [base+subpel_filters+1+myq*8]
mov r6, ssq
sub srcq, 2
shr r5d, 11
neg r6
psllw xmm0, [base+prep_hv_shift+r5*8]
psllw xmm1, 2
mova [tmpq+ 0], xmm0
mova [tmpq+16], xmm1
vpbroadcastd m8, [tmpq+ 4]
mov r3d, 0xf0
vpbroadcastd m9, [tmpq+ 8]
vpbroadcastd m12, xmm1
movu xm3, [srcq+r6 *2]
kmovb k1, r3d
vinserti32x4 ym3, [srcq+r6 *1], 1
vbroadcasti32x4 m2, [srcq+ssq*0]
vinserti64x2 m3{k1}, m2, [srcq+ssq*1], 3
movu xm4, [srcq+ssq*2]
vbroadcasti32x4 m5, [spel_h_shufA]
vbroadcasti32x4 m6, [spel_h_shufB]
mova m1, m11
mova m15, [spel_shuf4a]
mova xm2, xm11
pshufb m0, m3, m5
vpdpwssd m1, m8, m0
pshufb xm0, xm4, xm5
vpdpwssd xm2, xm8, xm0
vpbroadcastd m13, [tmpq+20]
pshufb m3, m6
vpbroadcastd m14, [tmpq+24]
pshufb xm4, xm6
mova m7, [spel_shuf4b]
vpdpwssd m1, m9, m3 ; 0 1 2 3
vpdpwssd xm2, xm9, xm4 ; 4
vpermt2b m1, m15, m2 ; 01 12 23 34
mova ym15, [prep_endA]
.hv_w4_loop:
lea srcq, [srcq+ssq*4]
movu xm4, [srcq+r6 *1]
vinserti32x4 ym4, [srcq+ssq*0], 1
vbroadcasti32x4 m3, [srcq+ssq*1]
vinserti64x2 m4{k1}, m3, [srcq+ssq*2], 3
mova m2, m11
pshufb m3, m4, m5
vpdpwssd m2, m8, m3
mova m3, m10
vpdpwssd m3, m12, m1 ; a0 b0 c0 d0
pshufb m4, m6
vpdpwssd m2, m9, m4 ; 5 6 7 8
mova m4, m1
vpermt2b m1, m7, m2 ; 45 56 67 78
vpdpwssd m3, m14, m1 ; a2 b2 c2 d2
vshufi32x4 m4, m1, q1032 ; 23 34 45 56
vpdpwssd m3, m13, m4 ; a1 b1 c1 d1
vpermb m3, m15, m3
mova [tmpq], ym3
add tmpq, 32
sub hd, 4
jg .hv_w4_loop
RET
.hv_w8:
mova m8, [spel_h_shufA]
movu ym18, [srcq+r6 *2]
vinserti32x8 m18, [srcq+r6 *1], 1 ; 0 1
movu ym19, [srcq+ssq*0]
vinserti32x8 m19, [srcq+ssq*1], 1 ; 2 3
movu ym20, [srcq+ssq*2] ; 4
movu m9, [spel_h_shufC]
mova m21, [spel_shuf8a]
mova m0, [spel_shuf8b]
vpermb m4, m8, m18
mova m1, m10
vpermb m5, m8, m19
vpdpwssd m1, m12, m4 ; a0 b0
mova m2, m10
vpermb m6, m8, m20
vpdpwssd m2, m12, m5 ; c0 d0
mova m3, m10
vpermb m18, m9, m18
vpdpwssd m3, m12, m6 ; e0
mova m7, [prep_endB]
vpermb m19, m9, m19
vpdpwssd m1, m14, m18 ; a2 b2
vpermb m20, m9, m20
vpdpwssd m2, m14, m19 ; c2 d2
shufpd m4, m18, 0x55
vpdpwssd m3, m14, m20 ; e2
shufpd m5, m19, 0x55
vpdpwssd m1, m13, m4 ; a1 b1
shufpd m6, m20, 0x55
vpdpwssd m2, m13, m5 ; c1 d1
vpdpwssd m3, m13, m6 ; e1
vpermt2b m1, m21, m2 ; 01 12
vpermt2b m2, m21, m3 ; 23 34
.hv_w8_loop:
lea srcq, [srcq+ssq*4]
movu ym18, [srcq+r6 *1]
vinserti32x8 m18, [srcq+ssq*0], 1
movu ym19, [srcq+ssq*1]
vinserti32x8 m19, [srcq+ssq*2], 1
mova m3, m10
vpermb m5, m8, m18
mova m4, m10
vpermb m6, m8, m19
vpdpwssd m3, m12, m5 ; f0 g0
mova m20, m11
vpdpwssd m4, m12, m6 ; h0 i0
mova m21, m11
vpdpwssd m20, m15, m1 ; A0 B0
vpermb m18, m9, m18
vpdpwssd m21, m15, m2 ; C0 D0
vpermb m19, m9, m19
vpdpwssd m3, m14, m18 ; f2 g2
vpdpwssd m4, m14, m19 ; h2 i2
shufpd m5, m18, 0x55
vpdpwssd m20, m16, m2 ; A1 B1
shufpd m6, m19, 0x55
vpdpwssd m3, m13, m5 ; f1 g1
vpdpwssd m4, m13, m6 ; h1 i1
vpermt2b m2, m0, m3 ; 45 56
vpdpwssd m21, m16, m2 ; C1 D1
mova m1, m2
vpermt2b m2, m0, m4 ; 67 78
vpdpwssd m20, m17, m1 ; A2 B2
vpdpwssd m21, m17, m2 ; C2 D2
vpermt2b m20, m7, m21
mova [tmpq], m20
add tmpq, 64
sub hd, 4
jg .hv_w8_loop
vzeroupper
RET
.hv:
vpbroadcastd m11, [pd_128]
cmp wd, 4
je .hv_w4
shr mxd, 16
pmovsxbw xmm0, [base+subpel_filters+1+mxq*8]
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
mov r5d, r7m
pmovsxbw xmm1, [base+subpel_filters+1+myq*8]
mov r6, ssq
sub srcq, 4
shr r5d, 11
neg r6
psllw xmm0, [base+prep_hv_shift+r5*8]
psllw xmm1, 2
mova [tmpq+ 0], xmm0
mova [tmpq+16], xmm1
vpbroadcastd m12, xmm0
vpbroadcastd m13, [tmpq+ 4]
vpbroadcastd m14, [tmpq+ 8]
vpbroadcastd m15, xmm1
vpbroadcastd m16, [tmpq+20]
vpbroadcastd m17, [tmpq+24]
cmp wd, 16
jl .hv_w8
vbroadcasti32x4 m8, [spel_h_shufA]
vbroadcasti32x4 m9, [spel_h_shufB]
jg .hv_w32
vbroadcasti32x8 m6, [srcq+r6 *2+ 8]
vinserti32x8 m2, m6, [srcq+r6 *2+16], 1
vinserti32x8 m6, [srcq+r6 *2+ 0], 0 ; 0
movu ym18, [srcq+r6 *1+ 0]
movu ym19, [srcq+r6 *1+12]
vinserti32x8 m18, [srcq+ssq*0+ 0], 1
vinserti32x8 m19, [srcq+ssq*0+12], 1 ; 1 2
movu ym20, [srcq+ssq*1+ 0]
movu ym21, [srcq+ssq*1+12]
lea srcq, [srcq+ssq*2]
vinserti32x8 m20, [srcq+ssq*0+ 0], 1
vinserti32x8 m21, [srcq+ssq*0+12], 1 ; 3 4
pshufb m2, m8
mova m1, m10
pshufb m3, m18, m8
vpdpwssd m1, m14, m2 ; a2
mova m2, m10
pshufb m4, m19, m9
vpdpwssd m2, m12, m3 ; b0 c0
mova m3, m10
pshufb m5, m20, m8
vpdpwssd m3, m14, m4 ; b2' c2'
mova m4, m10
pshufb m7, m21, m9
vpdpwssd m4, m12, m5 ; d0 e0
mova m5, m10
pshufb m0, m6, m8
vpdpwssd m5, m14, m7 ; d2' e2'
mova m7, [spel_shuf16]
pshufb m18, m9
vpdpwssd m1, m12, m0 ; a0
pshufb m19, m8
vpdpwssd m2, m13, m18 ; b1 c1
pshufb m20, m9
vpdpwssd m3, m13, m19 ; b1' c1'
pshufb m21, m8
vpdpwssd m4, m13, m20 ; d1 e1
pshufb m6, m9
vpdpwssd m5, m13, m21 ; d1' e1'
mova m0, [prep_endB]
shufpd m18, m19, 0x55
vpdpwssd m1, m13, m6 ; a1
shufpd m20, m21, 0x55
vpdpwssd m2, m14, m18 ; b2 c2
vpdpwssd m3, m12, m18 ; b0' c0'
vpdpwssd m4, m14, m20 ; d2 e2
vpdpwssd m5, m12, m20 ; d0' e0'
pslldq m1, 1
vpermt2b m2, m7, m3 ; 12
vpermt2b m4, m7, m5 ; 34
vpshrdd m1, m2, 16 ; 01
vpshrdd m3, m2, m4, 16 ; 23
.hv_w16_loop:
movu ym18, [srcq+ssq*1+ 0]
movu ym19, [srcq+ssq*1+12]
lea srcq, [srcq+ssq*2]
vinserti32x8 m18, [srcq+ssq*0+ 0], 1
vinserti32x8 m19, [srcq+ssq*0+12], 1
mova m5, m10
mova m6, m10
pshufb m21, m18, m8
vpdpwssd m5, m12, m21 ; f0 g0
pshufb m20, m19, m9
mova m21, m11
vpdpwssd m6, m14, m20 ; f2' g2'
mova m20, m11
vpdpwssd m21, m15, m2 ; B0
mova m2, m4
vpdpwssd m20, m15, m1 ; A0
mova m1, m3
pshufb m18, m9
vpdpwssd m5, m13, m18 ; f1 g1
pshufb m19, m8
vpdpwssd m6, m13, m19 ; f1' g1'
vpdpwssd m21, m16, m4 ; B1
vpdpwssd m20, m16, m3 ; A1
shufpd m18, m19, 0x55
vpdpwssd m5, m14, m18 ; f2 g2
vpdpwssd m6, m12, m18 ; f0' g0'
mova m4, m7
vpermi2b m4, m5, m6 ; 56
vpshrdd m3, m2, m4, 16 ; 45
vpdpwssd m21, m17, m4 ; B2
vpdpwssd m20, m17, m3 ; A2
vpermt2b m20, m0, m21
mova [tmpq], m20
add tmpq, 64
sub hd, 2
jg .hv_w16_loop
vzeroupper
RET
.hv_w32:
WIN64_SPILL_XMM 29
%if WIN64
push r8
%endif
mova m27, [spel_shuf32]
lea r5d, [hq+wq*8-256]
mova m28, [prep_endC]
.hv_w32_loop0:
movu m18, [srcq+r6 *2+ 0]
movu m7, [srcq+r6 *2+12]
movu m6, [srcq+r6 *1+ 0]
movu m20, [srcq+r6 *1+12]
lea r7, [srcq+ssq*2]
movu m19, [srcq+ssq*0+ 0]
movu m21, [srcq+ssq*0+12]
movu m22, [srcq+ssq*1+ 0]
movu m24, [srcq+ssq*1+12]
mov r8, tmpq
movu m23, [r7 +ssq*0+ 0]
movu m25, [r7 +ssq*0+12]
pshufb m1, m18, m8
mova m0, m10
pshufb m2, m7, m9
vpdpwssd m0, m12, m1 ; a0
mova m1, m10
pshufb m4, m6, m8
vpdpwssd m1, m14, m2 ; a2'
mova m2, m10
pshufb m3, m19, m8
vpdpwssd m2, m12, m4 ; b0
mova m4, m10
pshufb m5, m20, m9
vpdpwssd m4, m12, m3 ; c0
mova m3, m10
pshufb m26, m21, m9
vpdpwssd m3, m14, m5 ; b2'
mova m5, m10
pshufb m18, m9
vpdpwssd m5, m14, m26 ; c2'
pshufb m7, m8
vpdpwssd m0, m13, m18 ; a1
pshufb m6, m9
vpdpwssd m1, m13, m7 ; a1'
pshufb m19, m9
vpdpwssd m2, m13, m6 ; b1
pshufb m20, m8
vpdpwssd m4, m13, m19 ; c1
pshufb m21, m8
vpdpwssd m3, m13, m20 ; b1'
shufpd m18, m7, 0x55
vpdpwssd m5, m13, m21 ; c1'
shufpd m6, m20, 0x55
vpdpwssd m0, m14, m18 ; a2
shufpd m19, m21, 0x55
vpdpwssd m1, m12, m18 ; a0'
pshufb m18, m22, m8
vpdpwssd m2, m14, m6 ; b2
pshufb m7, m23, m8
vpdpwssd m4, m14, m19 ; c2
vpdpwssd m3, m12, m6 ; b0'
mova m6, m10
vpdpwssd m5, m12, m19 ; c0'
pshufb m19, m24, m9
vpdpwssd m6, m12, m18 ; d0
mova m18, m10
pshufb m26, m25, m9
vpdpwssd m18, m12, m7 ; e0
mova m7, m10
pshufb m22, m9
vpdpwssd m7, m14, m19 ; d2'
mova m19, m10
pshufb m23, m9
vpdpwssd m19, m14, m26 ; e2'
pshufb m24, m8
vpdpwssd m6, m13, m22 ; d1
pshufb m25, m8
vpdpwssd m18, m13, m23 ; e1
shufpd m22, m24, 0x55
vpdpwssd m7, m13, m24 ; d1'
shufpd m23, m25, 0x55
vpdpwssd m19, m13, m25 ; e1'
pslldq m0, 1
vpdpwssd m6, m14, m22 ; d2
pslldq m1, 1
vpdpwssd m18, m14, m23 ; e2
vpermt2b m2, m27, m4 ; 12
vpdpwssd m7, m12, m22 ; d0'
vpermt2b m3, m27, m5 ; 12'
vpdpwssd m19, m12, m23 ; e0'
vpshrdd m0, m2, 16 ; 01
vpermt2b m6, m27, m18 ; 34
vpshrdd m1, m3, 16 ; 01'
vpermt2b m7, m27, m19 ; 34'
vpshrdd m4, m2, m6, 16 ; 23
vpshrdd m5, m3, m7, 16 ; 23'
.hv_w32_loop:
movu m22, [r7+ssq*1+ 0]
movu m24, [r7+ssq*1+12]
lea r7, [r7+ssq*2]
movu m23, [r7+ssq*0+ 0]
movu m25, [r7+ssq*0+12]
mova m19, m11
vpdpwssd m19, m15, m2 ; B0
mova m21, m11
vpdpwssd m21, m15, m3 ; B0'
mova m18, m11
vpdpwssd m18, m15, m0 ; A0
mova m20, m11
vpdpwssd m20, m15, m1 ; A0'
mova m2, m6
vpdpwssd m19, m16, m6 ; B1
mova m3, m7
vpdpwssd m21, m16, m7 ; B1'
mova m0, m4
vpdpwssd m18, m16, m4 ; A1
mova m1, m5
pshufb m4, m22, m8
vpdpwssd m20, m16, m5 ; A1'
mova m6, m10
pshufb m7, m23, m8
vpdpwssd m6, m12, m4 ; f0
mova m4, m10
pshufb m5, m24, m9
vpdpwssd m4, m12, m7 ; g0
mova m7, m10
pshufb m26, m25, m9
vpdpwssd m7, m14, m5 ; f2'
mova m5, m10
pshufb m22, m9
vpdpwssd m5, m14, m26 ; g2'
pshufb m23, m9
vpdpwssd m6, m13, m22 ; f1
pshufb m24, m8
vpdpwssd m4, m13, m23 ; g1
pshufb m25, m8
vpdpwssd m7, m13, m24 ; f1'
shufpd m22, m24, 0x55
vpdpwssd m5, m13, m25 ; g1'
shufpd m23, m25, 0x55
vpdpwssd m6, m14, m22 ; f2
vpdpwssd m4, m14, m23 ; g2
vpdpwssd m7, m12, m22 ; f0'
vpdpwssd m5, m12, m23 ; g0'
vpermt2b m6, m27, m4 ; 56
vpermt2b m7, m27, m5 ; 56'
vpdpwssd m19, m17, m6 ; B2
vpshrdd m4, m2, m6, 16 ; 45
vpdpwssd m21, m17, m7 ; B2'
vpshrdd m5, m3, m7, 16 ; 45'
vpdpwssd m18, m17, m4 ; A2
vpdpwssd m20, m17, m5 ; A2'
vpermt2b m19, m28, m21
vpermt2b m18, m28, m20
mova [r8+wq*0], m18
mova [r8+wq*2], m19
lea r8, [r8+wq*4]
sub hd, 2
jg .hv_w32_loop
add srcq, 64
add tmpq, 64
movzx hd, r5b
sub r5d, 1<<8
jg .hv_w32_loop0
%if WIN64
pop r8
%endif
RET
PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_16bpc
PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_16bpc
PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_16bpc
PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_16bpc
PREP_8TAP_FN sharp, SHARP, SHARP
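; prep_8tap: same output format as prep_6tap, but for the SHARP filter
; combinations which need all 8 taps; the no-filter case reuses prep_6tap's
; copy path.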
cglobal prep_8tap_16bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my
%define base r7-prep_avx512icl
imul mxd, mxm, 0x010101
add mxd, t0d ; 8tap_h, mx, 4tap_h
imul myd, mym, 0x010101
add myd, t1d ; 8tap_v, my, 4tap_v
lea r7, [prep_avx512icl]
mov wd, wm
movifnidn hd, hm
test mxd, 0xf00
jnz .h
test myd, 0xf00
jz mangle(private_prefix %+ _prep_6tap_16bpc_avx512icl).prep
.v:
movzx mxd, myb
shr myd, 16
cmp hd, 4
cmove myd, mxd
mov r5d, r7m
vpbroadcastd m10, [prep_8tap_rnd]
pmovsxbw xmm0, [base+subpel_filters+myq*8]
tzcnt r6d, wd
shr r5d, 11
movzx r6d, word [r7+r6*2+table_offset(prep, _8tap_v)]
psllw xmm0, [base+prep_hv_shift+r5*8]
add r7, r6
lea r6, [strideq*3]
sub srcq, r6
mova [tmpq], xmm0
vpbroadcastd m12, xmm0
vpbroadcastd m13, [tmpq+ 4]
vpbroadcastd m14, [tmpq+ 8]
vpbroadcastd m15, [tmpq+12]
jmp r7
.v_w4:
mov r3d, 0x330c
movq xm1, [srcq+strideq*0]
kmovw k1, r3d
vpbroadcastq ym1{k1}, [srcq+strideq*1]
vpbroadcastq m0, [srcq+r6 ]
vinserti32x4 m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2 3
lea srcq, [srcq+strideq*4]
vpbroadcastq ym0{k1}, [srcq+strideq*0]
vpbroadcastq m2, [srcq+strideq*1]
vinserti32x4 m0{k1}, m2, [srcq+strideq*2], 3 ; 3 4 5 6
mova ym5, [prep_endA]
vshufi32x4 m3, m1, m0, q1021 ; 1 2 3 4
vshufi32x4 m2, m1, m0, q2132 ; 2 3 4 5
punpcklwd m1, m3 ; 01 12 23 34
punpcklwd m2, m0 ; 23 34 45 56
.v_w4_loop:
movq xm4, [srcq+r6 ]
lea srcq, [srcq+strideq*4]
vpbroadcastq ym4{k1}, [srcq+strideq*0]
vpbroadcastq m3, [srcq+strideq*1]
vinserti32x4 m4{k1}, m3, [srcq+strideq*2], 3 ; 7 8 9 a
mova m3, m10
vpdpwssd m3, m12, m1 ; a0 b0 c0 d0
valignq m1, m4, m0, 6 ; 6 7 8 9
vpdpwssd m3, m13, m2 ; a1 b1 c1 d1
mova m0, m4
punpcklwd m4, m1, m4 ; 67 78 89 9a
vpdpwssd m3, m15, m4 ; a3 b3 c3 d3
vshufi32x4 m1, m2, m4, q1032 ; 45 56 67 78
vpdpwssd m3, m14, m1 ; a2 b2 c2 d2
mova m2, m4
vpermb m3, m5, m3
mova [tmpq], ym3
add tmpq, 32
sub hd, 4
jg .v_w4_loop
RET
.v_w8:
movu xm0, [srcq+strideq*0]
mov r3d, 0x33
vbroadcasti32x4 ym1, [srcq+strideq*1]
kmovb k1, r3d
mova m7, [spel_v_shuf8]
vinserti64x2 m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2
add srcq, r6
vbroadcasti32x4 ym2, [srcq+strideq*0]
vbroadcasti32x4 m3, [srcq+strideq*1]
vbroadcasti32x4 ym0, [srcq+strideq*2]
vshufi64x2 m2{k1}, m1, m3, q1032 ; 2 3 4
vinserti64x2 m0{k1}, m3, [srcq+r6], 2 ; 4 5 6
mova m8, [prep_endB]
vpermb m1, m7, m1 ; 01 12
vpermb m2, m7, m2 ; 23 34
vpermb m3, m7, m0 ; 45 56
.v_w8_loop:
lea srcq, [srcq+strideq*4]
vbroadcasti32x4 ym4, [srcq+strideq*0]
movu xm5, [srcq+strideq*1]
vshufi64x2 m4{k1}, m0, m5, q1032 ; 6 7 8
vbroadcasti32x4 ym0, [srcq+strideq*2]
vinserti64x2 m0{k1}, m5, [srcq+r6], 2 ; 8 9 a
mova m5, m10
vpdpwssd m5, m12, m1 ; a0 b0
mova m6, m10
vpdpwssd m6, m12, m2 ; c0 d0
mova m1, m3
vpdpwssd m5, m13, m2 ; a1 b1
vpdpwssd m6, m13, m3 ; c1 d1
vpermb m2, m7, m4 ; 67 78
vpdpwssd m5, m14, m3 ; a2 b2
vpermb m3, m7, m0 ; 89 9a
vpdpwssd m6, m14, m2 ; c2 d2
vpdpwssd m5, m15, m2 ; a3 b3
vpdpwssd m6, m15, m3 ; c3 d3
vpermt2b m5, m8, m6
mova [tmpq], m5
add tmpq, 64
sub hd, 4
jg .v_w8_loop
RET
.v_w16:
vbroadcasti32x8 m0, [srcq+strideq*1]
vinserti32x8 m1, m0, [srcq+strideq*2], 1
vinserti32x8 m0, [srcq+strideq*0], 0
mova m8, [spel_v_shuf16]
add srcq, r6
movu ym3, [srcq+strideq*0]
vinserti32x8 m3, [srcq+strideq*1], 1
movu ym5, [srcq+strideq*2]
add srcq, r6
vinserti32x8 m5, [srcq+strideq*0], 1
mova m11, [prep_endA]
vpermb m1, m8, m1 ; 12
vpermb m0, m8, m0 ; 01
vpermb m3, m8, m3 ; 34
vpermb m5, m8, m5 ; 56
vpshrdd m2, m1, m3, 16 ; 23
vpshrdd m4, m3, m5, 16 ; 45
.v_w16_loop:
mova m7, m10
vpdpwssd m7, m12, m1 ; b0
mova m6, m10
vpdpwssd m6, m12, m0 ; a0
mova m1, m3
vpdpwssd m7, m13, m3 ; b1
mova m0, m2
vpdpwssd m6, m13, m2 ; a1
mova m3, m5
vpdpwssd m7, m14, m5 ; b2
mova m2, m4
vpdpwssd m6, m14, m4 ; a2
movu ym5, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
vinserti32x8 m5, [srcq+strideq*0], 1
vpermb m5, m8, m5 ; 78
vpshrdd m4, m3, m5, 16 ; 67
vpdpwssd m7, m15, m5 ; b3
vpdpwssd m6, m15, m4 ; a3
vpermt2b m6, m11, m7
mova [tmpq], m6
add tmpq, 64
sub hd, 2
jg .v_w16_loop
RET
.v_w32:
.v_w64:
.v_w128:
WIN64_PUSH_XMM 23
%if WIN64
push r8
%endif
mova m11, [prep_endC]
lea r5, [hq+wq*8-256]
.v_w32_loop0:
movu m16, [srcq+strideq*0]
movu m17, [srcq+strideq*1]
lea r7, [srcq+r6]
movu m18, [srcq+strideq*2]
movu m19, [r7 +strideq*0]
mov r8, tmpq
movu m20, [r7 +strideq*1]
movu m21, [r7 +strideq*2]
add r7, r6
movu m22, [r7 +strideq*0]
punpcklwd m0, m16, m17 ; 01l
punpckhwd m16, m17 ; 01h
punpcklwd m1, m17, m18 ; 12l
punpckhwd m17, m18 ; 12h
punpcklwd m2, m18, m19 ; 23l
punpckhwd m18, m19 ; 23h
punpcklwd m3, m19, m20 ; 34l
punpckhwd m19, m20 ; 34h
punpcklwd m4, m20, m21 ; 45l
punpckhwd m20, m21 ; 45h
punpcklwd m5, m21, m22 ; 56l
punpckhwd m21, m22 ; 56h
.v_w32_loop:
mova m6, m10
vpdpwssd m6, m12, m0 ; a0l
mova m8, m10
vpdpwssd m8, m12, m16 ; a0h
mova m7, m10
vpdpwssd m7, m12, m1 ; b0l
mova m9, m10
vpdpwssd m9, m12, m17 ; b0h
mova m0, m2
vpdpwssd m6, m13, m2 ; a1l
mova m16, m18
vpdpwssd m8, m13, m18 ; a1h
mova m1, m3
vpdpwssd m7, m13, m3 ; b1l
mova m17, m19
vpdpwssd m9, m13, m19 ; b1h
mova m2, m4
vpdpwssd m6, m14, m4 ; a2l
mova m18, m20
vpdpwssd m8, m14, m20 ; a2h
mova m3, m5
vpdpwssd m7, m14, m5 ; b2l
mova m19, m21
vpdpwssd m9, m14, m21 ; b2h
movu m21, [r7+strideq*1]
lea r7, [r7+strideq*2]
punpcklwd m4, m22, m21 ; 67l
punpckhwd m20, m22, m21 ; 67h
movu m22, [r7+strideq*0]
vpdpwssd m6, m15, m4 ; a3l
vpdpwssd m8, m15, m20 ; a3h
punpcklwd m5, m21, m22 ; 78l
punpckhwd m21, m22 ; 78h
vpdpwssd m7, m15, m5 ; b3l
vpdpwssd m9, m15, m21 ; b3h
vpermt2b m6, m11, m8
vpermt2b m7, m11, m9
mova [r8+wq*0], m6
mova [r8+wq*2], m7
lea r8, [r8+wq*4]
sub hd, 2
jg .v_w32_loop
add srcq, 64
add tmpq, 64
movzx hd, r5b
sub r5d, 1<<8
jg .v_w32_loop0
%if WIN64
pop r8
%endif
RET
.h_w4:
RESET_STACK_STATE
movzx mxd, mxb
sub srcq, 2
pmovsxbw xmm0, [base+subpel_filters+mxq*8]
mov r5d, r7m
vbroadcasti32x4 m4, [spel_h_shufA]
vbroadcasti32x4 m5, [spel_h_shufB]
shr r5d, 11
mova ym9, [prep_endA]
psllw xmm0, [base+prep_hv_shift+r5*8]
mova [tmpq], xmm0
vpbroadcastd m6, [tmpq+4]
vpbroadcastd m7, [tmpq+8]
.h_w4_loop:
movu xm2, [srcq+strideq*0]
vinserti32x4 ym2, [srcq+strideq*1], 1
vinserti32x4 m2, [srcq+strideq*2], 2
vinserti32x4 m2, [srcq+r6 ], 3
lea srcq, [srcq+strideq*4]
mova m0, m10
pshufb m1, m2, m4
vpdpwssd m0, m6, m1
pshufb m2, m5
vpdpwssd m0, m7, m2
vpermb m0, m9, m0
mova [tmpq], ym0
add tmpq, 32
sub hd, 4
jg .h_w4_loop
RET
.h_w8:
mova m6, [spel_h_shufA]
movu m7, [spel_h_shufB]
movu m8, [spel_h_shufC]
mova m9, [spel_h_shufD]
mova m11, [prep_endB]
.h_w8_loop:
movu ym4, [srcq+strideq*0]
vinserti32x8 m4, [srcq+strideq*1], 1
movu ym5, [srcq+strideq*2]
vinserti32x8 m5, [srcq+r6 ], 1
lea srcq, [srcq+strideq*4]
mova m0, m10
mova m1, m10
vpermb m2, m6, m4
vpermb m3, m6, m5
vpdpwssd m0, m12, m2
vpdpwssd m1, m12, m3
vpermb m2, m7, m4
vpermb m3, m7, m5
vpdpwssd m0, m13, m2
vpdpwssd m1, m13, m3
vpermb m2, m8, m4
vpermb m3, m8, m5
vpdpwssd m0, m14, m2
vpdpwssd m1, m14, m3
vpermb m2, m9, m4
vpermb m3, m9, m5
vpdpwssd m0, m15, m2
vpdpwssd m1, m15, m3
vpermt2b m0, m11, m1
mova [tmpq], m0
add tmpq, 64
sub hd, 4
jg .h_w8_loop
RET
.h:
vpbroadcastd m10, [prep_8tap_rnd]
test myd, 0xf00
jnz .hv
lea r6, [strideq*3]
cmp wd, 4
je .h_w4
shr mxd, 16
pmovsxbw xmm0, [base+subpel_filters+mxq*8]
mov r5d, r7m
sub srcq, 6
shr r5d, 11
psllw xmm0, [base+prep_hv_shift+r5*8]
mova [tmpq], xmm0
vpbroadcastd m12, xmm0
vpbroadcastd m13, [tmpq+ 4]
vpbroadcastd m14, [tmpq+ 8]
vpbroadcastd m15, [tmpq+12]
cmp wd, 16
jl .h_w8
vbroadcasti32x4 m6, [spel_h_shufA]
vbroadcasti32x4 m7, [spel_h_shufB]
mova m11, [prep_endC]
jg .h_w32
.h_w16_loop:
movu ym2, [srcq+strideq*0+ 0]
vinserti32x8 m2, [srcq+strideq*1+ 0], 1
movu ym3, [srcq+strideq*0+16]
vinserti32x8 m3, [srcq+strideq*1+16], 1
lea srcq, [srcq+strideq*2]
mova m0, m10
mova m1, m10
pshufb m4, m2, m6
vpdpwssd m0, m12, m4 ; a0
pshufb m4, m3, m6
vpdpwssd m1, m14, m4 ; b2
pshufb m4, m2, m7
vpdpwssd m0, m13, m4 ; a1
pshufb m4, m3, m7
vpdpwssd m1, m15, m4 ; b3
shufpd m2, m3, 0x55
pshufb m4, m2, m6
vpdpwssd m0, m14, m4 ; a2
vpdpwssd m1, m12, m4 ; b0
pshufb m2, m7
vpdpwssd m0, m15, m2 ; a3
vpdpwssd m1, m13, m2 ; b1
vpermt2b m0, m11, m1
mova [tmpq], m0
add tmpq, 64
sub hd, 2
jg .h_w16_loop
RET
.h_w32:
lea srcq, [srcq+wq*2]
neg wq
.h_w32_loop0:
mov r6, wq
.h_w32_loop:
movu m2, [srcq+r6*2+ 0]
movu m3, [srcq+r6*2+ 8]
mova m0, m10
mova m1, m10
pshufb m4, m2, m6
vpdpwssd m0, m12, m4 ; a0
pshufb m4, m3, m6
vpdpwssd m1, m12, m4 ; b0
vpdpwssd m0, m14, m4 ; a2
movu m4, [srcq+r6*2+16]
pshufb m3, m7
vpdpwssd m1, m13, m3 ; b1
vpdpwssd m0, m15, m3 ; a3
pshufb m3, m4, m6
vpdpwssd m1, m14, m3 ; b2
pshufb m2, m7
vpdpwssd m0, m13, m2 ; a1
pshufb m4, m7
vpdpwssd m1, m15, m4 ; b3
vpermt2b m0, m11, m1
mova [tmpq], m0
add tmpq, 64
add r6, 32
jl .h_w32_loop
add srcq, strideq
dec hd
jg .h_w32_loop0
RET
.hv:
vpbroadcastd m11, [pd_128]
cmp wd, 4
jg .hv_w8
movzx mxd, mxb
pmovsxbw xmm0, [base+subpel_filters+mxq*8]
movzx mxd, myb
shr myd, 16
cmp hd, 4
cmove myd, mxd
mov r5d, r7m
pmovsxbw xmm1, [base+subpel_filters+myq*8]
lea r6, [strideq*3]
sub srcq, 2
shr r5d, 11
sub srcq, r6
psllw xmm0, [base+prep_hv_shift+r5*8]
psllw xmm1, 2
mova [tmpq+ 0], xmm0
mova [tmpq+16], xmm1
vpbroadcastd m12, xmm1
movu xm16, [srcq+strideq*0]
mov r3d, 0xff0
vinserti128 ym16, [srcq+strideq*1], 1
kmovw k1, r3d
vbroadcasti32x4 m18, [srcq+strideq*2]
add srcq, r6
vinserti64x2 m16{k1}, m18, [srcq+strideq*0], 3
movu xm17, [srcq+strideq*1]
vbroadcasti32x4 ym18, [srcq+strideq*2]
add srcq, r6
vinserti32x4 m17{k1}, m18, [srcq+strideq*0], 2
vbroadcasti32x4 m5, [spel_h_shufA]
vbroadcasti32x4 m6, [spel_h_shufB]
vpbroadcastd m8, [tmpq+ 4]
vpbroadcastd m9, [tmpq+ 8]
mova m1, m10
mova m19, [spel_shuf4a]
mova m2, m10
pshufb m0, m16, m5
vpdpwssd m1, m8, m0
pshufb m0, m17, m5
vpdpwssd m2, m8, m0
vpbroadcastd m13, [tmpq+20]
pshufb m16, m6
vpbroadcastd m14, [tmpq+24]
pshufb m17, m6
vpbroadcastd m15, [tmpq+28]
vpdpwssd m1, m9, m16 ; 0 1 2 3
vpdpwssd m2, m9, m17 ; 4 5 6
mova m7, [spel_shuf4b]
vpermt2b m1, m19, m2 ; 01 12 23 34
vpermb m2, m19, m2 ; 45 56
mova ym19, [prep_endA]
vshufi32x4 m2, m1, m2, q1032 ; 23 34 45 56
.hv_w4_loop:
movu xm17, [srcq+strideq*1]
vinserti128 ym17, [srcq+strideq*2], 1
vbroadcasti32x4 m16, [srcq+r6 ]
lea srcq, [srcq+strideq*4]
vinserti64x2 m17{k1}, m16, [srcq+strideq*0], 3
mova m18, m10
pshufb m16, m17, m5
vpdpwssd m18, m8, m16
mova m16, m11
vpdpwssd m16, m12, m1 ; a0 b0 c0 d0
pshufb m17, m6
vpdpwssd m18, m9, m17 ; 7 8 9 a
mova m1, m2
vpdpwssd m16, m13, m2 ; a1 b1 c1 d1
vpermt2b m2, m7, m18 ; 67 78 89 9a
vpdpwssd m16, m15, m2 ; a3 b3 c3 d3
vshufi32x4 m1, m2, q1032 ; 45 56 67 78
vpdpwssd m16, m14, m1 ; a2 b2 c2 d2
vpermb m16, m19, m16
mova [tmpq], ym16
add tmpq, 32
sub hd, 4
jg .hv_w4_loop
vzeroupper
RET
.hv_w8:
shr mxd, 16
pmovsxbw xmm0, [base+subpel_filters+mxq*8]
movzx mxd, myb
shr myd, 16
cmp hd, 6
cmovs myd, mxd
mov r5d, r7m
pmovsxbw xmm1, [base+subpel_filters+myq*8]
lea r6, [strideq*3]
sub srcq, 6
shr r5d, 11
sub srcq, r6
psllw xmm0, [base+prep_hv_shift+r5*8]
psllw xmm1, 2
mova [tmpq+ 0], xmm0
mova [tmpq+16], xmm1
vpbroadcastd m12, xmm0
vpbroadcastd m13, [tmpq+ 4]
vpbroadcastd m14, [tmpq+ 8]
vpbroadcastd m15, [tmpq+12]
vpbroadcastd m16, xmm1
vpbroadcastd m17, [tmpq+20]
vpbroadcastd m18, [tmpq+24]
vpbroadcastd m19, [tmpq+28]
cmp wd, 8
jg .hv_w16
WIN64_SPILL_XMM 23
mova m5, [spel_h_shufA]
movu ym0, [srcq+strideq*0]
vinserti32x8 m0, [srcq+strideq*1], 1 ; 0 1
movu ym9, [srcq+strideq*2]
add srcq, r6
vinserti32x8 m9, [srcq+strideq*0], 1 ; 2 3
movu ym20, [srcq+strideq*1]
vinserti32x8 m20, [srcq+strideq*2], 1 ; 4 5
add srcq, r6
movu ym21, [srcq+strideq*0] ; 6
movu m6, [spel_h_shufB]
movu m7, [spel_h_shufC]
mova ym22, [prep_endB]
vpermb m8, m5, m0
mova m1, m10
vpdpwssd m1, m12, m8 ; a0 b0
vpermb m8, m5, m9
mova m2, m10
vpdpwssd m2, m12, m8 ; c0 d0
vpermb m8, m5, m20
mova m3, m10
vpdpwssd m3, m12, m8 ; e0 f0
vpermb m8, m5, m21
mova m4, m10
vpdpwssd m4, m12, m8 ; g0
vpermb m8, m6, m0
vpdpwssd m1, m13, m8 ; a1 b1
vpermb m8, m6, m9
vpdpwssd m2, m13, m8 ; c1 d1
vpermb m8, m6, m20
vpdpwssd m3, m13, m8 ; e1 f1
vpermb m8, m6, m21
vpdpwssd m4, m13, m8 ; g1
vpermb m8, m7, m0
vpdpwssd m1, m14, m8 ; a2 b2
vpermb m8, m7, m9
vpdpwssd m2, m14, m8 ; c2 d2
vpermb m8, m7, m20
vpdpwssd m3, m14, m8 ; e2 f2
vpermb m8, m7, m21
vpdpwssd m4, m14, m8 ; g2
mova m8, [spel_h_shufD]
vpermb m0, m8, m0
vpdpwssd m1, m15, m0 ; a3 b3
mova m0, [spel_shuf8a]
vpermb m9, m8, m9
vpdpwssd m2, m15, m9 ; c3 d3
mova m9, [spel_shuf8b]
vpermb m20, m8, m20
vpdpwssd m3, m15, m20 ; e3 f3
vpermb m21, m8, m21
vpdpwssd m4, m15, m21 ; g3
vpermt2b m1, m0, m2 ; 01 12
vpermt2b m2, m0, m3 ; 23 34
vpermt2b m3, m0, m4 ; 45 56
.hv_w8_loop:
movu ym0, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
vinserti32x8 m0, [srcq+strideq*0], 1
mova m4, m10
mova m20, m11
vpermb m21, m5, m0
vpdpwssd m4, m12, m21 ; h0 i0
vpermb m21, m6, m0
vpdpwssd m20, m16, m1 ; A0 B0
vpdpwssd m4, m13, m21 ; h1 i1
vpermb m21, m7, m0
mova m1, m2
vpdpwssd m20, m17, m2 ; A1 B1
vpdpwssd m4, m14, m21 ; h2 i2
vpermb m21, m8, m0
mova m2, m3
vpdpwssd m20, m18, m3 ; A2 B2
vpdpwssd m4, m15, m21 ; h3 i3
vpermt2b m3, m9, m4 ; 67 78
vpdpwssd m20, m19, m3 ; A3 B3
vpermb m20, m22, m20
mova [tmpq], ym20
add tmpq, 32
sub hd, 2
jg .hv_w8_loop
RET
.hv_w16:
WIN64_SPILL_XMM 27
%if WIN64
push r8
%endif
vbroadcasti32x4 m20, [spel_h_shufA]
vbroadcasti32x4 m21, [spel_h_shufB]
add wd, wd
mova m9, [spel_shuf16]
mova m26, [prep_endB]
lea r5d, [hq+wq*8-256]
.hv_w16_loop0:
vbroadcasti32x8 m5, [srcq+strideq*0+ 8]
vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0
vinserti32x8 m5, [srcq+strideq*0+16], 1 ; 0
movu ym6, [srcq+strideq*1+ 0]
movu ym7, [srcq+strideq*1+16]
lea r7, [srcq+r6]
vinserti32x8 m6, [srcq+strideq*2+ 0], 1
vinserti32x8 m7, [srcq+strideq*2+16], 1 ; 1 2
movu ym22, [r7 +strideq*0+ 0]
movu ym23, [r7 +strideq*0+16]
mov r8, tmpq
vinserti32x8 m22, [r7 +strideq*1+ 0], 1
vinserti32x8 m23, [r7 +strideq*1+16], 1 ; 3 4
movu ym24, [r7 +strideq*2+ 0]
movu ym25, [r7 +strideq*2+16]
add r7, r6
vinserti32x8 m24, [r7 +strideq*0+ 0], 1
vinserti32x8 m25, [r7 +strideq*0+16], 1 ; 5 6
pshufb m0, m4, m20
mova m1, m10
vpdpwssd m1, m12, m0 ; a0
pshufb m0, m6, m20
mova m2, m10
vpdpwssd m2, m12, m0 ; b0
pshufb m0, m7, m20
mova m3, m10
vpdpwssd m3, m14, m0 ; c2
pshufb m0, m4, m21
vpdpwssd m1, m13, m0 ; a1
pshufb m0, m6, m21
vpdpwssd m2, m13, m0 ; b1
pshufb m0, m7, m21
vpdpwssd m3, m15, m0 ; c3
pshufb m0, m5, m20
vpdpwssd m1, m14, m0 ; a2
shufpd m6, m7, 0x55
pshufb m7, m6, m20
vpdpwssd m2, m14, m7 ; b2
vpdpwssd m3, m12, m7 ; c0
pshufb m5, m21
vpdpwssd m1, m15, m5 ; a3
pshufb m6, m21
vpdpwssd m2, m15, m6 ; b3
vpdpwssd m3, m13, m6 ; c1
pshufb m0, m22, m20
mova m4, m10
vpdpwssd m4, m12, m0 ; d0
pshufb m0, m23, m20
mova m5, m10
vpdpwssd m5, m14, m0 ; e2
pshufb m0, m24, m20
mova m6, m10
vpdpwssd m6, m12, m0 ; f0
pshufb m0, m25, m20
mova m7, m10
vpdpwssd m7, m14, m0 ; g2
pshufb m0, m22, m21
vpdpwssd m4, m13, m0 ; d1
pshufb m0, m23, m21
vpdpwssd m5, m15, m0 ; e3
pshufb m0, m24, m21
vpdpwssd m6, m13, m0 ; f1
pshufb m0, m25, m21
vpdpwssd m7, m15, m0 ; g3
shufpd m22, m23, 0x55
pshufb m23, m22, m20
vpdpwssd m4, m14, m23 ; d2
vpdpwssd m5, m12, m23 ; e0
shufpd m24, m25, 0x55
pshufb m25, m24, m20
vpdpwssd m6, m14, m25 ; f2
vpdpwssd m7, m12, m25 ; g0
pshufb m22, m21
vpdpwssd m4, m15, m22 ; d3
vpdpwssd m5, m13, m22 ; e1
pshufb m24, m21
vpdpwssd m6, m15, m24 ; f3
vpdpwssd m7, m13, m24 ; g1
pslldq m1, 1
vpermt2b m2, m9, m3 ; 12
vpermt2b m4, m9, m5 ; 34
vpermt2b m6, m9, m7 ; 56
vpshrdd m1, m2, 16 ; 01
vpshrdd m3, m2, m4, 16 ; 23
vpshrdd m5, m4, m6, 16 ; 45
.hv_w16_loop:
movu ym24, [r7+strideq*1+ 0]
movu ym25, [r7+strideq*1+16]
lea r7, [r7+strideq*2]
vinserti32x8 m24, [r7+strideq*0+ 0], 1
vinserti32x8 m25, [r7+strideq*0+16], 1
mova m7, m10
mova m8, m10
pshufb m0, m24, m20
vpdpwssd m7, m12, m0 ; h0
mova m22, m11
pshufb m0, m25, m20
vpdpwssd m8, m14, m0 ; i2
mova m23, m11
vpdpwssd m22, m16, m1 ; A0
mova m1, m3
vpdpwssd m23, m16, m2 ; B0
mova m2, m4
pshufb m0, m24, m21
vpdpwssd m7, m13, m0 ; h1
pshufb m0, m25, m21
vpdpwssd m8, m15, m0 ; i3
vpdpwssd m22, m17, m3 ; A1
mova m3, m5
vpdpwssd m23, m17, m4 ; B1
mova m4, m6
shufpd m24, m25, 0x55
pshufb m25, m24, m20
vpdpwssd m7, m14, m25 ; h2
vpdpwssd m8, m12, m25 ; i0
vpdpwssd m22, m18, m5 ; A2
vpdpwssd m23, m18, m6 ; B2
pshufb m24, m21
vpdpwssd m7, m15, m24 ; h3
vpdpwssd m8, m13, m24 ; i1
vpermt2b m7, m9, m8 ; 78
vpshrdd m5, m6, m7, 16 ; 67
vpdpwssd m22, m19, m5 ; A3
vpdpwssd m23, m19, m7 ; B3
mova m6, m7
vpermt2b m22, m26, m23
mova [r8+wq*0], ym22
vextracti32x8 [r8+wq*1], m22, 1
lea r8, [r8+wq*2]
sub hd, 2
jg .hv_w16_loop
add srcq, 32
add tmpq, 32
movzx hd, r5b
sub r5d, 1<<8
jg .hv_w16_loop0
%if WIN64
pop r8
%endif
RET
%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif
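; warp_affine_8x8t: affine warp of an 8x8 block into the compound tmp buffer
; at intermediate precision (>>15 instead of >>13); shares .main/.main2/.h
; with warp_affine_8x8 below.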
cglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts
%define base r6-pd_0to7
mov t0d, r7m
lea r6, [pd_0to7]
shr t0d, 11
vpbroadcastd m8, [base+warp_8x8t_rnd_v]
vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4]
call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main
psrad m14, m16, 15
call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
psrad m16, 15
packssdw m14, m16
call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
psrad m15, m16, 15
call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
add tsq, tsq
psrad m16, 15
packssdw m15, m16
jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end
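; warp_affine_8x8: .main sets up the per-column filter positions and produces
; the first two output rows, .main2 continues two rows at a time, and .h
; horizontally filters two source rows, gathering coefficients from
; mc_warp_filter as mx/my are stepped by alpha/beta/gamma/delta.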
cglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd
mov t0d, r7m ; pixel_max
lea r6, [pd_0to7]
shr t0d, 11
vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4]
vpbroadcastd m8, [base+warp_8x8_rnd_v+t0*4]
call .main
psrad m14, m16, 13
call .main2
psrad m16, 13
packusdw m14, m16
call .main2
psrad m15, m16, 13
call .main2
vpbroadcastd m0, [base+bidir_shift+t0*4]
vpsrlvw m14, m0
psrad m16, 13
packusdw m15, m16
vpsrlvw m15, m0
.end:
mova m0, [base+warp8x8_end]
vpermb m16, m0, m14
lea r2, [dsq*3]
mova [dstq+dsq*0], xm16
vextracti128 [dstq+dsq*1], ym16, 1
vextracti32x4 [dstq+dsq*2], m16, 2
vextracti32x4 [dstq+r2 ], m16, 3
vpermb m16, m0, m15
lea dstq, [dstq+dsq*4]
mova [dstq+dsq*0], xm16
vextracti128 [dstq+dsq*1], ym16, 1
vextracti32x4 [dstq+dsq*2], m16, 2
vextracti32x4 [dstq+r2 ], m16, 3
RET
.main:
vpbroadcastd ym3, [base+pd_512]
%if WIN64
mov abcdq, r5mp
vpaddd ym18, ym3, r6m {1to8} ; mx
%else
add r5d, 512
vpbroadcastd ym18, r5d
%endif
vpaddd ym20, ym3, r7m {1to8} ; my
mova ym16, [base+pd_0to7]
vpbroadcastd ym19, [abcdq+4*0] ; alpha
vpbroadcastd ym21, [abcdq+4*1] ; gamma
lea r4, [ssq*3+6]
vpdpwssd ym18, ym19, ym16 ; tmx
vpdpwssd ym20, ym21, ym16 ; tmy
sub srcq, r4
mova m10, [base+warp8x8_permA]
lea r4, [mc_warp_filter+64*8]
vbroadcasti32x4 m12, [base+warp8x8_permC]
kxnorb k1, k1, k1
vbroadcasti32x4 m13, [base+warp8x8_permD]
movu ym5, [srcq+0]
vinserti32x8 m5, [srcq+8], 1
psrad ym17, ym18, 10
mova m11, [base+warp8x8_permB]
kmovb k2, k1
vpgatherdq m3{k1}, [r4+ym17*8] ; filter_x0
psrad ym19, 16 ; beta
psrad ym21, 16 ; delta
paddd ym18, ym19
vpermb m4, m10, m5
vpbroadcastq m9, [base+warp_shift_h+t0*8]
pshufd m3, m3, q3120
paddd m7, m1, m1
pshufb m2, m3, m12
vpdpwssd m1, m4, m2
vpermb m5, m11, m5
vshufi32x4 m4, m5, q1021
pshufb m3, m13
vpdpwssd m1, m4, m3
call .h
psllq m2, m1, 32
paddd m1, m2
vpmultishiftqb m1, m9, m1
vpshrdq m1, m0, 48 ; 01 12
call .h
vpshrdq m2, m1, m0, 48 ; 23 34
call .h
vpshrdq m3, m2, m0, 48 ; 45 56
.main2:
call .h
psrad ym6, ym20, 10
kmovb k1, k2
paddd ym17, ym20, ym21 ; my += delta
vpgatherdq m20{k2}, [r4+ym6*8] ; filter_y0
psrad ym16, ym17, 10
kmovb k2, k1
vpgatherdq m6{k1}, [r4+ym16*8] ; filter_y1
shufps m5, m20, m6, q2020
mova m16, m8
pshufb m4, m5, m12
vpdpwssd m16, m1, m4 ; a0 b0
pshufb m5, m13
mova m1, m2
vpdpwssd m16, m2, m5 ; a1 b1
shufps m6, m20, m6, q3131
paddd ym20, ym17, ym21
pshufb m4, m6, m12
mova m2, m3
vpdpwssd m16, m3, m4 ; a2 b2
vpshrdq m3, m0, 48 ; 67 78
pshufb m6, m13
vpdpwssd m16, m3, m6 ; a3 b3
ret
ALIGN function_align
.h:
movu ym16, [srcq+ssq*1]
psrad ym6, ym18, 10
lea srcq, [srcq+ssq*2]
vinserti32x8 m5, m16, [srcq+ssq*0], 1
kmovb k1, k2
paddd ym17, ym18, ym19 ; mx += beta
vpgatherdq m18{k2}, [r4+ym6*8] ; filter_x1
psrad ym16, ym17, 10
kmovb k2, k1
vpgatherdq m6{k1}, [r4+ym16*8] ; filter_x2
vpermb m4, m10, m5
shufps m16, m18, m6, q2020
shufps m6, m18, m6, q3131
mova m0, m7
pshufb m18, m16, m12
vpdpwssd m0, m4, m18 ; a0 b0
vpermb m5, m11, m5
pshufb m18, m6, m13
vpdpwssd m0, m5, m18 ; a3 b3
paddd ym18, ym17, ym19
vshufi32x4 m17, m4, m5, q1021
pshufb m16, m13
vpdpwssd m0, m17, m16 ; a1 b1
vshufi32x4 m4, m5, q2132
pshufb m6, m12
vpdpwssd m0, m4, m6 ; a2 b2
vpmultishiftqb m0, m9, m0 ; a a b b
ret
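; BIDIR_FN: store loop shared by the bidirectional compound functions below.
; Each function provides a .main that consumes the next chunk of the tmp
; buffers and returns two registers of pixels (m0/m1), stored here according
; to the block width selected via the jump table.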
%macro BIDIR_FN 0
call .main
lea stride3q, [strideq*3]
jmp wq
.w4:
movq [dstq ], xm0
movhps [dstq+strideq*1], xm0
vextracti32x4 xm2, ym0, 1
movq [dstq+strideq*2], xm2
movhps [dstq+stride3q ], xm2
cmp hd, 8
jl .w4_end
vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
movq [dstq ], xm2
movhps [dstq+strideq*1], xm2
vextracti32x4 xm0, m0, 3
movq [dstq+strideq*2], xm0
movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq ], xm1
movhps [dstq+strideq*1], xm1
vextracti32x4 xm0, ym1, 1
movq [dstq+strideq*2], xm0
movhps [dstq+stride3q ], xm0
vextracti32x4 xm0, m1, 2
lea dstq, [dstq+strideq*4]
movq [dstq ], xm0
movhps [dstq+strideq*1], xm0
vextracti32x4 xm1, m1, 3
movq [dstq+strideq*2], xm1
movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8_loop:
call .main
lea dstq, [dstq+strideq*4]
.w8:
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], ym0, 1
vextracti32x4 [dstq+strideq*2], m0, 2
vextracti32x4 [dstq+stride3q ], m0, 3
sub hd, 8
jl .w8_end
lea dstq, [dstq+strideq*4]
mova [dstq+strideq*0], xm1
vextracti32x4 [dstq+strideq*1], ym1, 1
vextracti32x4 [dstq+strideq*2], m1, 2
vextracti32x4 [dstq+stride3q ], m1, 3
jg .w8_loop
.w8_end:
RET
.w16_loop:
call .main
lea dstq, [dstq+strideq*4]
.w16:
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], ym1
vextracti32x8 [dstq+stride3q ], m1, 1
sub hd, 4
jg .w16_loop
RET
.w32_loop:
call .main
lea dstq, [dstq+strideq*2]
.w32:
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
sub hd, 2
jg .w32_loop
RET
.w64_loop:
call .main
add dstq, strideq
.w64:
mova [dstq+64*0], m0
mova [dstq+64*1], m1
dec hd
jg .w64_loop
RET
.w128_loop:
call .main
add dstq, strideq
.w128:
mova [dstq+64*0], m0
mova [dstq+64*1], m1
call .main
mova [dstq+64*2], m0
mova [dstq+64*3], m1
dec hd
jg .w128_loop
RET
%endmacro
%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif
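; avg: saturating add of the two prep buffers, then bias removal and a
; per-bitdepth shift back to pixel range.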
cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg_avx512icl_table
lea r6, [avg_avx512icl_table]
tzcnt wd, wm
mov t0d, r6m ; pixel_max
movsxd wq, [r6+wq*4]
shr t0d, 11
vpbroadcastd m2, [base+avg_round+t0*4]
vpbroadcastd m3, [base+avg_shift+t0*4]
movifnidn hd, hm
add wq, r6
BIDIR_FN
ALIGN function_align
.main:
mova m0, [tmp1q+64*0]
paddsw m0, [tmp2q+64*0]
mova m1, [tmp1q+64*1]
paddsw m1, [tmp2q+64*1]
add tmp1q, 64*2
add tmp2q, 64*2
pmaxsw m0, m2
pmaxsw m1, m2
psubsw m0, m2
psubsw m1, m2
vpsrlvw m0, m3
vpsrlvw m1, m3
ret
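; w_avg: weighted average; the weight is packed as (16-weight, weight) word
; pairs so one vpdpwssd per pixel pair blends both inputs.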
cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-w_avg_avx512icl_table
lea r6, [w_avg_avx512icl_table]
tzcnt wd, wm
mov t0d, r7m ; pixel_max
shr t0d, 11
movsxd wq, [r6+wq*4]
vpbroadcastd m5, [base+w_avg_round+t0*4]
vpbroadcastd m7, [base+bidir_shift+t0*4]
add wq, r6
mov r6d, r6m ; weight
lea t0d, [r6-16]
shl r6d, 16
sub r6d, t0d ; 16-weight, weight
movifnidn hd, hm
vpbroadcastd m6, r6d
BIDIR_FN
ALIGN function_align
.main:
mova m3, [tmp1q+64*0]
mova m1, [tmp2q+64*0]
mova m0, [tmp1q+64*1]
mova m4, [tmp2q+64*1]
add tmp1q, 64*2
add tmp2q, 64*2
punpcklwd m2, m1, m3
punpckhwd m1, m3
punpcklwd m3, m4, m0
punpckhwd m4, m0
mova m0, m5
vpdpwssd m0, m6, m2
mova m2, m5
vpdpwssd m2, m6, m1
mova m1, m5
vpdpwssd m1, m6, m3
mova m3, m5
vpdpwssd m3, m6, m4
REPX {psrad x, 2}, m0, m2, m1, m3
packusdw m0, m2
packusdw m1, m3
vpsrlvw m0, m7
vpsrlvw m1, m7
ret
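; mask: blend with an explicit 6-bit mask read from maskq:
; dst = (tmp1*m + tmp2*(64-m) + rounding) scaled back to pixel range.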
cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-mask_avx512icl_table
lea r7, [mask_avx512icl_table]
tzcnt wd, wm
mov r6d, r7m ; pixel_max
movifnidn hd, hm
shr r6d, 11
movsxd wq, [r7+wq*4]
vpbroadcastd m8, [base+pw_64]
vpbroadcastd m9, [base+mask_round+r6*4]
vpbroadcastd m10, [base+bidir_shift+r6*4]
mov maskq, maskmp
add wq, r7
BIDIR_FN
ALIGN function_align
.main:
pmovzxbw m1, [maskq+32*0]
mova m4, [tmp1q+64*0]
mova m2, [tmp2q+64*0]
pmovzxbw m6, [maskq+32*1]
mova m5, [tmp1q+64*1]
mova m3, [tmp2q+64*1]
add maskq, 32*2
add tmp1q, 64*2
add tmp2q, 64*2
punpcklwd m7, m4, m2
punpckhwd m4, m2
psubw m0, m8, m1
punpcklwd m2, m1, m0 ; m, 64-m
punpckhwd m1, m0
mova m0, m9
vpdpwssd m0, m7, m2
mova m2, m9
vpdpwssd m2, m4, m1 ; tmp1 * m + tmp2 * (64-m)
punpcklwd m7, m5, m3
punpckhwd m5, m3
psubw m1, m8, m6
punpcklwd m3, m6, m1
punpckhwd m6, m1
mova m1, m9
vpdpwssd m1, m7, m3
mova m3, m9
vpdpwssd m3, m5, m6
REPX {psrad x, 4}, m0, m2, m1, m3
packusdw m0, m2
packusdw m1, m3
vpsrlvw m0, m10
vpsrlvw m1, m10
ret
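; w_mask_420: blend two intermediate buffers with a mask derived from their
; difference; the mask is also written out, subsampled 2x2 for 4:2:0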
cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx512icl_table
lea r7, [w_mask_420_avx512icl_table]
tzcnt wd, wm
mov r6d, r8m ; pixel_max
movifnidn hd, hm
shr r6d, 11
movsxd wq, [r7+wq*4]
vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
vpbroadcastd m11, [base+pw_64]
vpbroadcastd m12, [base+mask_round+r6*4]
vpbroadcastd m13, [base+bidir_shift+r6*4]
mov r6d, r7m ; sign
vpbroadcastd m14, [base+w_mask_round+r6*4]
mova ym15, [w_mask_end42x]
mov maskq, maskmp
add wq, r7
call .main
lea stride3q, [strideq*3]
jmp wq
.w4:
mova m4, [w_mask_shuf4]
vpermt2b m2, m4, m3
mova m3, m14
vpdpbusd m3, m2, [pb_64] {1to16}
vpermb m3, m15, m3
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
vextracti32x4 xm2, ym0, 1
movq [dstq+strideq*2], xm2
movhps [dstq+stride3q ], xm2
mova [maskq], xm3
cmp hd, 8
jl .w4_end
vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm2
movhps [dstq+strideq*1], xm2
vextracti32x4 xm0, m0, 3
movq [dstq+strideq*2], xm0
movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
vextracti32x4 xm2, ym1, 1
movq [dstq+strideq*2], xm2
movhps [dstq+stride3q ], xm2
vextracti32x4 xm2, m1, 2
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm2
movhps [dstq+strideq*1], xm2
vextracti32x4 xm1, m1, 3
movq [dstq+strideq*2], xm1
movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8:
mova m8, [w_mask_shuf8]
vpbroadcastd m9, [pb_64]
jmp .w8_start
.w8_loop:
call .main
lea dstq, [dstq+strideq*4]
add maskq, 16
.w8_start:
vpermt2b m2, m8, m3
mova m3, m14
vpdpbusd m3, m2, m9
vpermb m3, m15, m3
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], ym0, 1
vextracti32x4 [dstq+strideq*2], m0, 2
vextracti32x4 [dstq+stride3q ], m0, 3
mova [maskq], xm3
sub hd, 8
jl .w8_end
lea dstq, [dstq+strideq*4]
mova [dstq+strideq*0], xm1
vextracti32x4 [dstq+strideq*1], ym1, 1
vextracti32x4 [dstq+strideq*2], m1, 2
vextracti32x4 [dstq+stride3q ], m1, 3
jg .w8_loop
.w8_end:
RET
.w16:
mova m8, [w_mask_shuf16]
vpbroadcastd m9, [pb_64]
jmp .w16_start
.w16_loop:
call .main
lea dstq, [dstq+strideq*4]
add maskq, 16
.w16_start:
vpermt2b m2, m8, m3
mova m3, m14
vpdpbusd m3, m2, m9
vpermb m3, m15, m3
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], ym1
vextracti32x8 [dstq+stride3q ], m1, 1
mova [maskq], xm3
sub hd, 4
jg .w16_loop
RET
.w32_loop:
call .main
lea dstq, [dstq+strideq*4]
add maskq, 32
.w32:
paddw m2, m3
mova m8, m14
vpdpwssd m8, m11, m2
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
call .main
paddw m2, m3
mova m3, m14
vpdpwssd m3, m11, m2
vpermt2b m8, m15, m3
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m1
mova [maskq], ym8
sub hd, 4
jg .w32_loop
RET
.w64_loop:
call .main
lea dstq, [dstq+strideq*2]
add maskq, 32
.w64:
mova m8, m2
mova m9, m3
mova [dstq+strideq*0+64*0], m0
mova [dstq+strideq*0+64*1], m1
call .main
paddw m8, m2
paddw m9, m3
mova m2, m14
vpdpwssd m2, m11, m8
mova m3, m14
vpdpwssd m3, m11, m9
vpermt2b m2, m15, m3
mova [dstq+strideq*1+64*0], m0
mova [dstq+strideq*1+64*1], m1
mova [maskq], ym2
sub hd, 2
jg .w64_loop
RET
.w128_loop:
call .main
lea dstq, [dstq+strideq*2]
add maskq, 64
.w128:
mova m16, m2
mova m8, m3
mova [dstq+strideq*0+64*0], m0
mova [dstq+strideq*0+64*1], m1
call .main
mova m17, m2
mova m9, m3
mova [dstq+strideq*0+64*2], m0
mova [dstq+strideq*0+64*3], m1
call .main
paddw m2, m16
paddw m3, m8
mova m16, m14
vpdpwssd m16, m11, m2
mova m8, m14
vpdpwssd m8, m11, m3
mova [dstq+strideq*1+64*0], m0
mova [dstq+strideq*1+64*1], m1
call .main
paddw m2, m17
paddw m3, m9
mova m17, m14
vpdpwssd m17, m11, m2
mova m9, m14
vpdpwssd m9, m11, m3
vpermt2b m16, m15, m8
vpermt2b m17, m15, m9
mova [dstq+strideq*1+64*2], m0
mova [dstq+strideq*1+64*3], m1
mova [maskq+32*0], ym16
mova [maskq+32*1], ym17
sub hd, 2
jg .w128_loop
vzeroupper
RET
ALIGN function_align
.main:
mova m1, [tmp1q+64*0]
mova m3, [tmp2q+64*0]
mova m4, [tmp1q+64*1]
mova m7, [tmp2q+64*1]
add tmp1q, 64*2
add tmp2q, 64*2
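; 64-m = satsub(pw_27615, |tmp1-tmp2|) >> 10, so m ends up in the [38, 64] range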
psubsw m6, m1, m3
punpcklwd m5, m3, m1
pabsw m6, m6
punpckhwd m3, m1
psubusw m6, m10, m6
psrlw m6, 10 ; 64-m
psubw m2, m11, m6 ; m
punpcklwd m1, m6, m2
punpckhwd m6, m2
mova m0, m12
vpdpwssd m0, m5, m1
mova m1, m12
vpdpwssd m1, m3, m6
psubsw m5, m4, m7
punpcklwd m6, m7, m4
pabsw m5, m5
punpckhwd m7, m4
psubusw m5, m10, m5
psrlw m5, 10
psubw m3, m11, m5
punpcklwd m4, m5, m3
psrad m0, 4
punpckhwd m5, m3
psrad m1, 4
packusdw m0, m1
mova m1, m12
vpdpwssd m1, m6, m4
mova m4, m12
vpdpwssd m4, m7, m5
psrad m1, 4
psrad m4, 4
packusdw m1, m4
vpsrlvw m0, m13
vpsrlvw m1, m13
ret
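; w_mask_422: as w_mask_420, but the stored mask is only subsampled horizontally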
cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_422_avx512icl_table
lea r7, [w_mask_422_avx512icl_table]
tzcnt wd, wm
mov r6d, r8m ; pixel_max
movifnidn hd, hm
shr r6d, 11
movsxd wq, [r7+wq*4]
vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
vpbroadcastd m9, [base+pw_64]
vpbroadcastd m10, [base+mask_round+r6*4]
vpbroadcastd m11, [base+bidir_shift+r6*4]
mov r6d, r7m ; sign
vpbroadcastd m12, [base+w_mask_round+r6*4]
mova ym13, [w_mask_end42x]
mov maskq, maskmp
add wq, r7
paddw m14, m9, m9 ; pw_128
call .main
lea stride3q, [strideq*3]
jmp wq
.w4:
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
vextracti32x4 xm2, ym0, 1
movq [dstq+strideq*2], xm2
movhps [dstq+stride3q ], xm2
cmp hd, 8
jl .w4_end
vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm2
movhps [dstq+strideq*1], xm2
vextracti32x4 xm0, m0, 3
movq [dstq+strideq*2], xm0
movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
vextracti32x4 xm2, ym1, 1
movq [dstq+strideq*2], xm2
movhps [dstq+stride3q ], xm2
vextracti32x4 xm2, m1, 2
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm2
movhps [dstq+strideq*1], xm2
vextracti32x4 xm1, m1, 3
movq [dstq+strideq*2], xm1
movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8_loop:
call .main
lea dstq, [dstq+strideq*4]
.w8:
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], ym0, 1
vextracti32x4 [dstq+strideq*2], m0, 2
vextracti32x4 [dstq+stride3q ], m0, 3
sub hd, 8
jl .w8_end
lea dstq, [dstq+strideq*4]
mova [dstq+strideq*0], xm1
vextracti32x4 [dstq+strideq*1], ym1, 1
vextracti32x4 [dstq+strideq*2], m1, 2
vextracti32x4 [dstq+stride3q ], m1, 3
jg .w8_loop
.w8_end:
RET
.w16_loop:
call .main
lea dstq, [dstq+strideq*4]
.w16:
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], ym1
vextracti32x8 [dstq+stride3q ], m1, 1
sub hd, 4
jg .w16_loop
RET
.w32_loop:
call .main
lea dstq, [dstq+strideq*2]
.w32:
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
sub hd, 2
jg .w32_loop
RET
.w64_loop:
call .main
add dstq, strideq
.w64:
mova [dstq+64*0], m0
mova [dstq+64*1], m1
dec hd
jg .w64_loop
RET
.w128_loop:
call .main
add dstq, strideq
.w128:
mova [dstq+64*0], m0
mova [dstq+64*1], m1
call .main
mova [dstq+64*2], m0
mova [dstq+64*3], m1
dec hd
jg .w128_loop
RET
ALIGN function_align
.main:
mova m1, [tmp1q+64*0]
mova m3, [tmp2q+64*0]
mova m4, [tmp1q+64*1]
mova m7, [tmp2q+64*1]
add tmp1q, 64*2
add tmp2q, 64*2
psubsw m6, m1, m3
punpcklwd m5, m3, m1
pabsw m6, m6
punpckhwd m3, m1
psubusw m6, m8, m6
psrlw m6, 10
psubw m2, m9, m6
punpcklwd m1, m6, m2
punpckhwd m6, m2
mova m0, m10
vpdpwssd m0, m5, m1
mova m1, m10
vpdpwssd m1, m3, m6
psubsw m5, m4, m7
punpcklwd m6, m7, m4
pabsw m5, m5
punpckhwd m7, m4
psubusw m5, m8, m5
psrlw m5, 10
psubw m3, m9, m5
punpcklwd m4, m5, m3
psrad m0, 4
punpckhwd m5, m3
psrad m1, 4
packusdw m0, m1
mova m1, m10
vpdpwssd m1, m6, m4
mova m4, m10
vpdpwssd m4, m7, m5
mova m5, m12
vpdpwssd m5, m14, m2
mova m2, m12
vpdpwssd m2, m14, m3
psrad m1, 4
psrad m4, 4
packusdw m1, m4
vpermt2b m5, m13, m2
vpsrlvw m0, m11
vpsrlvw m1, m11
mova [maskq], ym5
add maskq, 32
ret
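; w_mask_444: as w_mask_420, but the mask is stored at full resolution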
cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_444_avx512icl_table
lea r7, [w_mask_444_avx512icl_table]
tzcnt wd, wm
mov r6d, r8m ; pixel_max
movifnidn hd, hm
shr r6d, 11
movsxd wq, [r7+wq*4]
vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
vpbroadcastd m9, [base+pw_64]
vpbroadcastd m10, [base+mask_round+r6*4]
mova m11, [w_mask_end444]
vpbroadcastd m12, [base+bidir_shift+r6*4]
mov maskq, maskmp
add wq, r7
call .main
lea stride3q, [strideq*3]
jmp wq
.w4:
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
vextracti32x4 xm2, ym0, 1
movq [dstq+strideq*2], xm2
movhps [dstq+stride3q ], xm2
cmp hd, 8
jl .w4_end
vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm2
movhps [dstq+strideq*1], xm2
vextracti32x4 xm0, m0, 3
movq [dstq+strideq*2], xm0
movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
vextracti32x4 xm2, ym1, 1
movq [dstq+strideq*2], xm2
movhps [dstq+stride3q ], xm2
vextracti32x4 xm2, m1, 2
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm2
movhps [dstq+strideq*1], xm2
vextracti32x4 xm1, m1, 3
movq [dstq+strideq*2], xm1
movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8_loop:
call .main
lea dstq, [dstq+strideq*4]
.w8:
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], ym0, 1
vextracti32x4 [dstq+strideq*2], m0, 2
vextracti32x4 [dstq+stride3q ], m0, 3
sub hd, 8
jl .w8_end
lea dstq, [dstq+strideq*4]
mova [dstq+strideq*0], xm1
vextracti32x4 [dstq+strideq*1], ym1, 1
vextracti32x4 [dstq+strideq*2], m1, 2
vextracti32x4 [dstq+stride3q ], m1, 3
jg .w8_loop
.w8_end:
RET
.w16_loop:
call .main
lea dstq, [dstq+strideq*4]
.w16:
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], ym1
vextracti32x8 [dstq+stride3q ], m1, 1
sub hd, 4
jg .w16_loop
RET
.w32_loop:
call .main
lea dstq, [dstq+strideq*2]
.w32:
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
sub hd, 2
jg .w32_loop
RET
.w64_loop:
call .main
add dstq, strideq
.w64:
mova [dstq+64*0], m0
mova [dstq+64*1], m1
dec hd
jg .w64_loop
RET
.w128_loop:
call .main
add dstq, strideq
.w128:
mova [dstq+64*0], m0
mova [dstq+64*1], m1
call .main
mova [dstq+64*2], m0
mova [dstq+64*3], m1
dec hd
jg .w128_loop
RET
ALIGN function_align
.main:
mova m1, [tmp1q+64*0]
mova m3, [tmp2q+64*0]
mova m4, [tmp1q+64*1]
mova m7, [tmp2q+64*1]
add tmp1q, 64*2
add tmp2q, 64*2
psubsw m6, m1, m3
punpcklwd m5, m3, m1
pabsw m6, m6
punpckhwd m3, m1
psubusw m6, m8, m6
psrlw m6, 10
psubw m2, m9, m6
punpcklwd m1, m6, m2
punpckhwd m6, m2
mova m0, m10
vpdpwssd m0, m5, m1
mova m1, m10
vpdpwssd m1, m3, m6
psubsw m5, m4, m7
punpcklwd m6, m7, m4
pabsw m5, m5
punpckhwd m7, m4
psubusw m5, m8, m5
psrlw m5, 10
psubw m3, m9, m5
punpcklwd m4, m5, m3
psrad m0, 4
punpckhwd m5, m3
psrad m1, 4
packusdw m0, m1
mova m1, m10
vpdpwssd m1, m6, m4
mova m4, m10
vpdpwssd m4, m7, m5
vpermt2b m2, m11, m3
psrad m1, 4
psrad m4, 4
packusdw m1, m4
vpsrlvw m0, m12
vpsrlvw m1, m12
mova [maskq], m2
add maskq, 64
ret
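; blend: dst += ((tmp - dst) * mask) >> 6, implemented as pmulhrsw
; of (dst - tmp) with the mask premultiplied by -512 (pw_m512)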
cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
%define base r6-blend_avx512icl_table
lea r6, [blend_avx512icl_table]
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, [r6+wq*4]
movifnidn maskq, maskmp
vpbroadcastd m6, [base+pw_m512]
add wq, r6
lea r6, [dsq*3]
jmp wq
.w4:
pmovzxbw ym19, [maskq]
movq xm16, [dstq+dsq*0]
movhps xm16, [dstq+dsq*1]
vpbroadcastq ym17, [dstq+dsq*2]
vpbroadcastq ym18, [dstq+r6 ]
pmullw ym19, ym6
vpblendd ym16, ym17, 0x30
vpblendd ym16, ym18, 0xc0
psubw ym17, ym16, [tmpq]
add maskq, 16
add tmpq, 32
pmulhrsw ym17, ym19
paddw ym16, ym17
vextracti128 xm17, ym16, 1
movq [dstq+dsq*0], xm16
movhps [dstq+dsq*1], xm16
movq [dstq+dsq*2], xm17
movhps [dstq+r6 ], xm17
lea dstq, [dstq+dsq*4]
sub hd, 4
jg .w4
vzeroupper
RET
.w8:
pmovzxbw m2, [maskq]
mova xm0, [dstq+dsq*0]
vinserti32x4 ym0, [dstq+dsq*1], 1
vinserti32x4 m0, [dstq+dsq*2], 2
vinserti32x4 m0, [dstq+r6 ], 3
pmullw m2, m6
psubw m1, m0, [tmpq]
add maskq, 32
add tmpq, 64
pmulhrsw m1, m2
paddw m0, m1
mova [dstq+dsq*0], xm0
vextracti32x4 [dstq+dsq*1], ym0, 1
vextracti32x4 [dstq+dsq*2], m0, 2
vextracti32x4 [dstq+r6 ], m0, 3
lea dstq, [dstq+dsq*4]
sub hd, 4
jg .w8
RET
.w16:
pmovzxbw m4, [maskq+32*0]
pmovzxbw m5, [maskq+32*1]
mova ym0, [dstq+dsq*0]
vinserti32x8 m0, [dstq+dsq*1], 1
mova ym1, [dstq+dsq*2]
vinserti32x8 m1, [dstq+r6 ], 1
pmullw m4, m6
pmullw m5, m6
psubw m2, m0, [tmpq+64*0]
psubw m3, m1, [tmpq+64*1]
add maskq, 32*2
add tmpq, 64*2
pmulhrsw m2, m4
pmulhrsw m3, m5
paddw m0, m2
paddw m1, m3
mova [dstq+dsq*0], ym0
vextracti32x8 [dstq+dsq*1], m0, 1
mova [dstq+dsq*2], ym1
vextracti32x8 [dstq+r6 ], m1, 1
lea dstq, [dstq+dsq*4]
sub hd, 4
jg .w16
RET
.w32:
pmovzxbw m4, [maskq+32*0]
pmovzxbw m5, [maskq+32*1]
mova m0, [dstq+dsq*0]
mova m1, [dstq+dsq*1]
pmullw m4, m6
pmullw m5, m6
psubw m2, m0, [tmpq+ 64*0]
psubw m3, m1, [tmpq+ 64*1]
add maskq, 32*2
add tmpq, 64*2
pmulhrsw m2, m4
pmulhrsw m3, m5
paddw m0, m2
paddw m1, m3
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w32
RET
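; blend_v: blend dst with tmp using per-column weights from obmc_masks
; (the weight vector is constant over all rows)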
cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h
lea r5, [blend_v_avx512icl_table]
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, [r5+wq*4]
add wq, r5
jmp wq
.w2:
vpbroadcastd xmm2, [obmc_masks_avx2+2*2]
.w2_loop:
movd xmm0, [dstq+dsq*0]
pinsrd xmm0, [dstq+dsq*1], 1
movq xmm1, [tmpq]
add tmpq, 4*2
psubw xmm1, xmm0, xmm1
pmulhrsw xmm1, xmm2
paddw xmm0, xmm1
movd [dstq+dsq*0], xmm0
pextrd [dstq+dsq*1], xmm0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w2_loop
RET
.w4:
vpbroadcastq xmm2, [obmc_masks_avx2+4*2]
.w4_loop:
movq xmm0, [dstq+dsq*0]
movhps xmm0, [dstq+dsq*1]
psubw xmm1, xmm0, [tmpq]
add tmpq, 8*2
pmulhrsw xmm1, xmm2
paddw xmm0, xmm1
movq [dstq+dsq*0], xmm0
movhps [dstq+dsq*1], xmm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w4_loop
RET
.w8:
vbroadcasti32x4 ym2, [obmc_masks_avx2+8*2]
.w8_loop:
mova xm0, [dstq+dsq*0]
vinserti32x4 ym0, [dstq+dsq*1], 1
psubw ym1, ym0, [tmpq]
add tmpq, 16*2
pmulhrsw ym1, ym2
paddw ym0, ym1
mova [dstq+dsq*0], xm0
vextracti32x4 [dstq+dsq*1], ym0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w8_loop
RET
.w16:
vbroadcasti32x8 m2, [obmc_masks_avx2+16*2]
.w16_loop:
mova ym0, [dstq+dsq*0]
vinserti32x8 m0, [dstq+dsq*1], 1
psubw m1, m0, [tmpq]
add tmpq, 32*2
pmulhrsw m1, m2
paddw m0, m1
mova [dstq+dsq*0], ym0
vextracti32x8 [dstq+dsq*1], m0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w16_loop
RET
.w32:
mova m4, [obmc_masks_avx2+32*2]
.w32_loop:
mova m0, [dstq+dsq*0]
psubw m2, m0, [tmpq+ 64*0]
mova m1, [dstq+dsq*1]
psubw m3, m1, [tmpq+ 64*1]
add tmpq, 64*2
pmulhrsw m2, m4
pmulhrsw m3, m4
paddw m0, m2
paddw m1, m3
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w32_loop
RET
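; blend_h: blend dst with tmp using per-row weights from obmc_masks;
; only the first h*3/4 rows are blended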
cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask
%define base r6-$$
lea r6, [$$]
tzcnt wd, wm
mov hd, hm
movsxd wq, [base+blend_h_avx512icl_table+wq*4]
lea maskq, [base+obmc_masks_avx2+hq*2]
lea hd, [hq*3]
lea wq, [base+blend_h_avx512icl_table+wq]
shr hd, 2 ; h * 3/4
lea maskq, [maskq+hq*2]
neg hq
jmp wq
.w2:
movd xmm0, [dstq+dsq*0]
pinsrd xmm0, [dstq+dsq*1], 1
movd xmm2, [maskq+hq*2]
movq xmm1, [tmpq]
add tmpq, 4*2
punpcklwd xmm2, xmm2
psubw xmm1, xmm0, xmm1
pmulhrsw xmm1, xmm2
paddw xmm0, xmm1
movd [dstq+dsq*0], xmm0
pextrd [dstq+dsq*1], xmm0, 1
lea dstq, [dstq+dsq*2]
add hq, 2
jl .w2
RET
.w4:
mova xmm3, [blend_shuf]
.w4_loop:
movq xmm0, [dstq+dsq*0]
movhps xmm0, [dstq+dsq*1]
movd xmm2, [maskq+hq*2]
psubw xmm1, xmm0, [tmpq]
add tmpq, 8*2
pshufb xmm2, xmm3
pmulhrsw xmm1, xmm2
paddw xmm0, xmm1
movq [dstq+dsq*0], xmm0
movhps [dstq+dsq*1], xmm0
lea dstq, [dstq+dsq*2]
add hq, 2
jl .w4_loop
RET
.w8:
vbroadcasti32x4 ym3, [blend_shuf]
shufpd ym3, ym3, 0x0c
.w8_loop:
mova xm0, [dstq+dsq*0]
vinserti32x4 ym0, [dstq+dsq*1], 1
vpbroadcastd ym2, [maskq+hq*2]
psubw ym1, ym0, [tmpq]
add tmpq, 16*2
pshufb ym2, ym3
pmulhrsw ym1, ym2
paddw ym0, ym1
mova [dstq+dsq*0], xm0
vextracti32x4 [dstq+dsq*1], ym0, 1
lea dstq, [dstq+dsq*2]
add hq, 2
jl .w8_loop
RET
.w16:
vbroadcasti32x4 m3, [blend_shuf]
shufpd m3, m3, 0xf0
.w16_loop:
mova ym0, [dstq+dsq*0]
vinserti32x8 m0, [dstq+dsq*1], 1
vpbroadcastd m2, [maskq+hq*2]
psubw m1, m0, [tmpq]
add tmpq, 32*2
pshufb m2, m3
pmulhrsw m1, m2
paddw m0, m1
mova [dstq+dsq*0], ym0
vextracti32x8 [dstq+dsq*1], m0, 1
lea dstq, [dstq+dsq*2]
add hq, 2
jl .w16_loop
RET
.w32:
vpbroadcastw m4, [maskq+hq*2]
vpbroadcastw m5, [maskq+hq*2+2]
mova m0, [dstq+dsq*0]
psubw m2, m0, [tmpq+ 64*0]
mova m1, [dstq+dsq*1]
psubw m3, m1, [tmpq+ 64*1]
add tmpq, 64*2
pmulhrsw m2, m4
pmulhrsw m3, m5
paddw m0, m2
paddw m1, m3
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
add hq, 2
jl .w32
RET
.w64:
vpbroadcastw m4, [maskq+hq*2]
mova m0, [dstq+64*0]
psubw m2, m0, [tmpq+64*0]
mova m1, [dstq+64*1]
psubw m3, m1, [tmpq+64*1]
add tmpq, 64*2
pmulhrsw m2, m4
pmulhrsw m3, m4
paddw m0, m2
paddw m1, m3
mova [dstq+64*0], m0
mova [dstq+64*1], m1
add dstq, dsq
inc hq
jl .w64
RET
.w128:
vpbroadcastw m8, [maskq+hq*2]
mova m0, [dstq+64*0]
psubw m4, m0, [tmpq+64*0]
mova m1, [dstq+64*1]
psubw m5, m1, [tmpq+64*1]
mova m2, [dstq+64*2]
psubw m6, m2, [tmpq+64*2]
mova m3, [dstq+64*3]
psubw m7, m3, [tmpq+64*3]
add tmpq, 64*4
REPX {pmulhrsw x, m8}, m4, m5, m6, m7
paddw m0, m4
paddw m1, m5
paddw m2, m6
paddw m3, m7
mova [dstq+64*0], m0
mova [dstq+64*1], m1
mova [dstq+64*2], m2
mova [dstq+64*3], m3
add dstq, dsq
inc hq
jl .w128
RET
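; resize: horizontal scaling; each output pixel uses an 8-tap filter chosen
; from 64 fractional phases (resize_filter), with a slower gather+pshufb
; path to clamp source reads near the left/right edges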
cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \
dst_w, h, src_w, dx, mx0, pxmax
sub dword mx0m, 4<<14
sub dword src_wm, 8
mov r6, ~0
vpbroadcastd m5, dxm
vpbroadcastd m8, mx0m
vpbroadcastd m6, src_wm
kmovq k6, r6
DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
LEA r7, $$
%define base r7-$$
vpbroadcastd m3, [base+pd_16384]
vpbroadcastd m7, [base+pd_63]
mova m24, [base+resize_permA]
mova m25, [base+resize_permB]
mova m26, [base+resize_permC]
mova m27, [base+resize_permD]
vbroadcasti32x4 m28, [base+resize_shufA]
vbroadcasti32x4 m29, [base+resize_shufB]
mova m30, [base+resize_permE]
vpbroadcastw ym31, pxmaxm
vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
pslld m5, 4 ; dx*16
pslld m6, 14
pxor m2, m2
.loop_y:
xor xd, xd
mova m4, m8 ; per-line working version of mx
.loop_x:
pmaxsd m0, m4, m2
psrad m9, m4, 8 ; filter offset (unmasked)
pminsd m0, m6 ; iclip(mx, 0, src_w-8)
psubd m1, m4, m0 ; pshufb offset
psrad m0, 14 ; clipped src_x offset
psrad m1, 14 ; pshufb edge_emu offset
vptestmd k5, m1, m1
pand m9, m7 ; filter offset (masked)
ktestw k5, k5
jz .load
vpbroadcastq m14, [base+pd_0_4]
vpermq m10, m0, q1100
vpermq m11, m0, q3322
vpermq m20, m1, q1100
vpermq m21, m1, q3322
punpckldq m10, m10
punpckldq m11, m11
punpckldq m20, m20
punpckldq m21, m21
paddd m10, m14
paddd m11, m14
paddd m20, m14
paddd m21, m14
vextracti32x8 ym12, m10, 1
vextracti32x8 ym13, m11, 1
vextracti32x8 ym22, m20, 1
vextracti32x8 ym23, m21, 1
kmovq k1, k6
kmovq k2, k6
kmovq k3, k6
kmovq k4, k6
vpgatherdq m16{k1}, [srcq+ym10*2] ; 0 1 2 3
vpgatherdq m17{k2}, [srcq+ym11*2] ; 4 5 6 7
vpgatherdq m18{k3}, [srcq+ym12*2] ; 8 9 A B
vpgatherdq m19{k4}, [srcq+ym13*2] ; C D E F
kmovq k1, k6
kmovq k2, k6
kmovq k3, k6
kmovq k4, k6
vpgatherdq m0{k1}, [base+resize_shuf+8+ym20*2]
vpgatherdq m1{k2}, [base+resize_shuf+8+ym21*2]
vpgatherdq m14{k3}, [base+resize_shuf+8+ym22*2]
vpgatherdq m15{k4}, [base+resize_shuf+8+ym23*2]
pshufb m16, m0
pshufb m17, m1
pshufb m18, m14
pshufb m19, m15
mova m20, m24
mova m22, m24
mova m21, m25
mova m23, m25
vpermi2d m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b
vpermi2d m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d
vpermi2d m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb
vpermi2d m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd
mova m15, m26
mova m17, m26
mova m16, m27
mova m18, m27
vpermi2q m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa
vpermi2q m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb
vpermi2q m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc
vpermi2q m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd
kmovq k1, k6
kmovq k2, k6
vpgatherdd m11{k1}, [base+resize_filter+m9*8+0]
vpgatherdd m13{k2}, [base+resize_filter+m9*8+4]
pshufb m10, m11, m28
pshufb m11, m11, m29
pshufb m12, m13, m28
pshufb m13, m13, m29
jmp .filter
.load:
kmovq k1, k6
kmovq k2, k6
kmovq k3, k6
kmovq k4, k6
vpgatherdd m11{k1}, [base+resize_filter+m9*8+0]
vpgatherdd m13{k2}, [base+resize_filter+m9*8+4]
pshufb m10, m11, m28
pshufb m11, m11, m29
pshufb m12, m13, m28
pshufb m13, m13, m29
vpgatherdd m15{k3}, [srcq+m0*2+ 0]
vpgatherdd m16{k4}, [srcq+m0*2+ 4]
kmovq k1, k6
kmovq k2, k6
vpgatherdd m17{k1}, [srcq+m0*2+ 8]
vpgatherdd m18{k2}, [srcq+m0*2+12]
.filter:
mova m14, m2
vpdpwssd m14, m15, m10
vpdpwssd m14, m16, m11
vpdpwssd m14, m17, m12
vpdpwssd m14, m18, m13
psubd m14, m3, m14
psrad m14, 15
packusdw m14, m14
vpermq m14, m30, m14
pminsw ym14, ym31
mova [dstq+xq*2], ym14
paddd m4, m5
add xd, 16
cmp xd, dst_wd
jl .loop_x
add dstq, dst_strideq
add srcq, src_strideq
dec hd
jg .loop_y
RET
%endif ; ARCH_X86_64