Source code
Revision control
Copy as Markdown
Other Tools
; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
SECTION_RODATA 16
; SMOOTH_WEIGHT_TABLE w0, w1, ...
; Emits two bytes for every weight w passed in: (w - 128) followed by
; (127 - w). The list is walked by rotating the macro arguments, so %1 is
; always the weight currently being emitted.
%macro SMOOTH_WEIGHT_TABLE 1-*
%rep %0
db %1-128, 127-%1
%rotate 1
%endrep
%endmacro
; sm_weights[], but modified to precalculate the byte pair (x-128, 127-x) for
; every weight x, which enables efficient use of pmaddubsw (which requires
; signed values for one of its operands). The callers add the missing
; 128*a + 129*b term separately (see the SMOOTH macro).
; Every weight occupies 2 bytes, so the weights for a block dimension N start
; at byte offset N*2 (the smooth predictors index it as smooth_weights+N*2).
smooth_weights: SMOOTH_WEIGHT_TABLE \
0, 0, 255, 128, 255, 149, 85, 64, \
255, 197, 146, 105, 73, 50, 37, 32, \
255, 225, 196, 170, 145, 123, 102, 84, \
68, 54, 43, 33, 26, 20, 17, 16, \
255, 240, 225, 210, 196, 182, 169, 157, \
145, 133, 122, 111, 101, 92, 83, 74, \
66, 59, 52, 45, 39, 34, 29, 25, \
21, 17, 14, 12, 10, 9, 8, 8, \
255, 248, 240, 233, 225, 218, 210, 203, \
196, 189, 182, 176, 169, 163, 156, 150, \
144, 138, 133, 127, 121, 116, 111, 106, \
101, 96, 91, 86, 82, 77, 73, 69, \
65, 61, 57, 54, 50, 47, 44, 41, \
38, 35, 32, 29, 27, 25, 22, 20, \
18, 16, 15, 13, 12, 10, 9, 8, \
7, 6, 6, 5, 5, 4, 4, 4
; Read-only constants shared by the predictors in this file.
; Naming: pb_* are byte patterns, pw_* word patterns, pd_* dword patterns.
; The order of the entries matters: several of them are read with 16-byte
; memory operands, so do not reorder or resize entries without checking the
; alignment and the neighbours of every user.

; 16-byte index tables. The ones referenced by the visible code
; (ipred_v_shuf, ipred_h_shuf, z_upsample1/2, z1_shuf_w4, z_filter_s) are used
; as pshufb control masks.
ipred_v_shuf: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7
ipred_h_shuf: db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0
ipred_paeth_shuf: db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8
z_transpose4: db 8, 12, 0, 4, 9, 13, 1, 5, 10, 14, 2, 6, 11, 15, 3, 7
z3_shuf: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
z3_shuf_h4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8
filter_shuf1: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1
filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1
; z_filter_wh4 is only 4 bytes long but ipred_z1 compares against it with a
; 16-byte pcmpeqb memory operand, so that load also covers the first 12 bytes
; of z_filter_wh8. Keep the two tables adjacent.
; NOTE(review): the trailing comma after the last byte looks unintentional.
z_filter_wh4: db 7, 7, 19, 7,
z_filter_wh8: db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39
; 32768 >> log2(n) is used as a pmulhrsw multiplier by dc_left/dc_top to get
; a rounded division by n.
pd_32768: dd 32768
z3_filter_k_tail: db 64, 0, 64, 0, 64, 0, 56, 8
z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
; Per-lane position increments in 1/64 pixel units (lane index << 6).
z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64
z3_base_inc: dw 7*64, 6*64, 5*64, 4*64, 3*64, 2*64, 1*64, 0*64
z_filter_wh16: db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1
; Thresholds compared with pcmpgtb in ipred_z1; indexed there in 8-byte steps.
z_filter_t_w48: db 55,127, 7,127, 15, 31, 39, 31,127, 39,127, 39, 7, 15, 31, 15
db 39, 63, 3, 63, 3, 3, 19, 3, 47, 19, 47, 19, 3, 3, 3, 3
z_filter_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, 0
; 24-byte shuffle table; ipred_z1 reads it at offsets +0, +6 and +8.
z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
db 7, 8, 8, 9, 9, 10, 10, 11
z_filter_k_tail: db 0, 64, 0, 64, 8, 56, 0, 64
z2_h_shuf: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11
z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8
z2_dy_offset: dw 88*64, 88*64, 87*64, 87*64
pw_m1to4: dw -1, -2, -3, -4
; Edge filter kernels as 8-byte rows of repeated coefficient pairs. ipred_z1
; addresses them as z_filter_k-8 + strength*8 + 24*n (n = 0..2), i.e. three
; groups of three rows; the two all-zero rows at the end are not indexed by
; that expression in the visible code.
z_filter_k: times 4 db 0, 16
times 4 db 0, 20
times 4 db 8, 16
times 4 db 32, 16
times 4 db 24, 20
times 4 db 16, 16
times 4 db 0, 0
times 4 db 0, 0
; Splat constants. pw_8 is written as byte pairs (8, 0), which is the
; little-endian encoding of the word 8.
pw_8: times 8 db 8, 0
pb_3: times 16 db 3
pb_16: times 16 db 16
pw_62: times 8 dw 62
pw_64: times 8 dw 64
pw_256: times 8 dw 256
pw_512: times 8 dw 512
pw_m256: times 8 dw -256
; The remaining constants are only 8 bytes long; the visible users load them
; with movddup (8-byte load duplicated into both halves).
pb_2: times 8 db 2
pb_4: times 8 db 4
pb_8: times 8 db 8
pb_128: times 8 db 128
pb_m16: times 8 db -16
pw_128: times 4 dw 128
pw_255: times 4 dw 255
pb_36_m4: times 4 db 36, -4
pb_127_m127: times 4 db 127, -127
; JMP_TABLE function, isa, label0, label1, ...
; Emits a table of 32-bit offsets, one per listed local label of the function
; <private_prefix>_<function>_8bpc_<isa>, and defines the symbol
; <function>_<isa>_table for it.
; The symbol is biased by -2*4 bytes (two entries before the first one), so a
; table can be indexed directly with tzcnt(size) where the smallest size is 4
; (tzcnt == 2). Every entry is stored relative to that same biased address;
; users load the entry with movsxd and add the table address back to obtain
; the jump target.
; Both %xdefine lines are expanded before the argument list is rotated, so
; they keep referring to the original function/isa names inside the loop.
%macro JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - 2*4)
%xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
%rep %0 - 2
dd %%base %+ .%3 - (%%table - 2*4)
%rotate 1
%endrep
%endmacro
; The "splat" tables are views into the tail of the ipred_dc / ipred_cfl
; tables: they skip the h* and w* entries (10 resp. 8 dwords) and start at the
; s* (store loop) entries. Because every entry is stored relative to the
; original table symbol, the s* entries are declared with a matching negative
; bias (-10*4 / -8*4) so that adding the splat table address yields the label.
%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4)
%define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4)
; One table per function; entries are ordered by increasing size so that
; tzcnt(size) selects the matching label.
JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left, ssse3, h4, h8, h16, h32, h64
JMP_TABLE ipred_smooth, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3, ssse3, h4, h8, h16, h32, h64
JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32
JMP_TABLE ipred_filter, ssse3, w4, w8, w16, w32
; Tables defined outside this file (cextern is provided by x86inc).
cextern dr_intra_derivative
cextern filter_intra_taps
SECTION .text
;---------------------------------------------------------------------------------------
;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; IPRED_SET width, row_offset, imm8
; Fills one output row of `width` (16, 32 or 64) pixels with a single value.
;   %1  row width in pixels
;   %2  byte offset of the row relative to dstq (0, strideq, strideq*2, ...)
;   %3  pshuflw immediate selecting which word of m0 holds the row's pixel
; In:  m0 = words made of the doubled left-edge pixels. Clobbers m1.
%macro IPRED_SET 3
pshuflw m1, m0, %3 ; broadcast the selected word across the low 8 bytes
punpcklqdq m1, m1 ; ... and on to all 16 bytes
mova [dstq + %2], m1 ; pixels 0-15
%if %1 > 16
mova [dstq + %2 + 16], m1 ; pixels 16-31
%if %1 > 32
mova [dstq + %2 + 32], m1 ; pixels 32-47
mova [dstq + %2 + 48], m1 ; pixels 48-63
%endif
%endif
%endmacro
; IPRED_H width
; Body of one width case of ipred_h: writes 4 rows per iteration, each row
; filled with the left-edge pixel of that row, and loops until h rows are
; done. It must be placed directly after the label .w<width>, because the
; loop branches back to that label (so the `sub tlq, 4` is re-executed and the
; next 4 edge pixels are fetched).
; In: dstq, strideq, stride3q (= 3*stride), tlq (topleft pointer), hd (rows
; left). Clobbers m0, m1.
%macro IPRED_H 1 ; width
sub tlq, 4
movd m0, [tlq] ; get 4 bytes of topleft data
; Byte 3 of the load is the pixel for the first row, byte 0 for the fourth.
punpcklbw m0, m0 ; double every byte: one word per row
%if %1 == 4
pshuflw m1, m0, q2233 ; low dword = row 0 pixel x4, next dword = row 1
movd [dstq+strideq*0], m1
psrlq m1, 32
movd [dstq+strideq*1], m1
pshuflw m0, m0, q0011 ; low dword = row 2 pixel x4, next dword = row 3
movd [dstq+strideq*2], m0
psrlq m0, 32
movd [dstq+stride3q ], m0
%elif %1 == 8
punpcklwd m0, m0 ; one dword (pixel x4) per row
punpckhdq m1, m0, m0 ; low qword = row 1 x8, high qword = row 0 x8
punpckldq m0, m0 ; low qword = row 3 x8, high qword = row 2 x8
movq [dstq+strideq*1], m1
movhps [dstq+strideq*0], m1
movq [dstq+stride3q ], m0
movhps [dstq+strideq*2], m0
%else
; Widths 16/32/64: splat one word of m0 per row (word 3 = first row).
IPRED_SET %1, 0, q3333
IPRED_SET %1, strideq, q2222
IPRED_SET %1, strideq*2, q1111
IPRED_SET %1, stride3q, q0000
%endif
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w%1
RET
%endmacro
; All functions below are assembled for 128-bit XMM registers with the ssse3
; name suffix.
INIT_XMM ssse3
; Horizontal intra prediction, 8 bits per pixel: every output row is filled
; with the left-edge pixel of that row, read downwards from tl[-1].
; Arguments (see the prototype comment above): dst, stride, topleft pointer,
; width, height. Only the first three are loaded by cglobal; width and height
; are fetched below. stride3 shares r5 with the table pointer, which is why
; it is only computed after the jump target has been resolved.
cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3
LEA r5, ipred_h_ssse3_table
tzcnt wd, wm ; log2(width) = jump table index
movifnidn hd, hm ; load height unless it is already in its register
movsxd wq, [r5+wq*4] ; table entry is relative to the table address
add wq, r5
lea stride3q, [strideq*3]
jmp wq
; Each case below loops on its own label and ends with RET (see IPRED_H).
.w4:
IPRED_H 4
.w8:
IPRED_H 8
.w16:
IPRED_H 16
.w32:
IPRED_H 32
.w64:
IPRED_H 64
;---------------------------------------------------------------------------------------
;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; Vertical intra prediction, 8 bits per pixel: every output row is a copy of
; the top edge (tl[1..]).
; The function only loads the top row and then tail-jumps into the .s4-.s64
; store loops of ipred_dc through the splat jump table; those loops write
; m0 (and m1-m3 for the wider cases) to every row and execute the RET.
; Note that 64 bytes of the top edge are loaded regardless of the width.
cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
LEA r5, ipred_dc_splat_ssse3_table
tzcnt wd, wm ; log2(width) = jump table index
movu m0, [tlq+ 1] ; top pixels 0-15
movu m1, [tlq+17] ; top pixels 16-31
movu m2, [tlq+33] ; top pixels 32-47
movu m3, [tlq+49] ; top pixels 48-63
movifnidn hd, hm
movsxd wq, [r5+wq*4]
add wq, r5
lea stride3q, [strideq*3] ; r5 is reused: the table pointer is dead here
jmp wq
;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; DC intra prediction, 8 bits per pixel: fills the block with the rounded
; average of the h left-edge pixels (tl[-h..-1]) and the w top-edge pixels
; (tl[1..w]).
;
; Control flow: the function jumps to .h<height>, which sums the left edge and
; jumps to .w<width>; that adds the top edge, divides, broadcasts the result
; and falls into the store loop .s<width>. The .s* loops are also entered
; from other functions through ipred_dc_splat_ssse3_table, so their register
; contract must stay stable: m0 (plus m1 for .s32, m1-m3 for .s64) holds the
; row data, hd the row count, stride3q = 3*stride.
;
; Register roles while summing:
;   m3 = all ones. As pmaddubsw operand it is a vector of -1 bytes (yields
;        negated pair sums), as pmaddwd operand a vector of -1 words (negates
;        again while adding adjacent words, giving a positive total).
;   m4 = (w + h) >> 1 in the lowest word (rounding term)
;   m5 = ctz(w + h) (shift count for the power-of-two part of the division)
; When w != h the sum w + h is 3 or 5 times a power of two; the remaining
; division is done with pmulhuw by 0x5556 (~65536/3) or 0x3334 (~65536/5).
; r2 (tlq) is reused as scratch for that constant once the edges are loaded.
cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
movifnidn hd, hm
movifnidn wd, wm
tzcnt r6d, hd ; log2(height) = index of the .h* entry
lea r5d, [wq+hq]
movd m4, r5d
tzcnt r5d, r5d
movd m5, r5d
LEA r5, ipred_dc_ssse3_table
tzcnt wd, wd ; log2(width)
movsxd r6, [r5+r6*4]
movsxd wq, [r5+wq*4+20] ; the .w* entries follow the five .h* entries
pcmpeqd m3, m3
psrlw m4, 1 ; dc = (width + height) >> 1;
add r6, r5
add wq, r5
lea stride3q, [strideq*3]
jmp r6
.h4:
movd m0, [tlq-4]
pmaddubsw m0, m3 ; m0 = -(pair sums of the left edge)
jmp wq
.w4:
movd m1, [tlq+1]
pmaddubsw m1, m3
psubw m0, m4
paddw m0, m1
pmaddwd m0, m3 ; negate and add word pairs: left + top + rounding
cmp hd, 4
jg .w4_mul
psrlw m0, 3 ; dc >>= ctz(width + height);
jmp .w4_end
.w4_mul:
; h is 8 or 16: fold the remaining dwords of the left-edge sum into dword 0.
punpckhqdq m1, m0, m0
paddw m0, m1
psrlq m1, m0, 32
paddw m0, m1
psrlw m0, 2
mov r6d, 0x5556
mov r2d, 0x3334
test hd, 8 ; h == 8: w + h = 12 (keep 1/3), h == 16: w + h = 20 (use 1/5)
cmovz r6d, r2d
movd m5, r6d
pmulhuw m0, m5
.w4_end:
pxor m1, m1
pshufb m0, m1 ; broadcast the dc byte
.s4:
movd [dstq+strideq*0], m0
movd [dstq+strideq*1], m0
movd [dstq+strideq*2], m0
movd [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s4
RET
ALIGN function_align
.h8:
movq m0, [tlq-8]
pmaddubsw m0, m3
jmp wq
.w8:
movq m1, [tlq+1]
pmaddubsw m1, m3
; The two psubw steps fold the high half of the (negated) left-edge sums onto
; the low half and subtract the rounding term, keeping the total negated.
psubw m4, m0
punpckhqdq m0, m0
psubw m0, m4
paddw m0, m1
pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
paddw m0, m1
pmaddwd m0, m3
psrlw m0, m5
cmp hd, 8
je .w8_end
mov r6d, 0x5556
mov r2d, 0x3334
cmp hd, 32 ; h == 32: w + h = 40 (use 1/5), otherwise 12 or 24 (1/3)
cmovz r6d, r2d
movd m1, r6d
pmulhuw m0, m1
.w8_end:
pxor m1, m1
pshufb m0, m1
.s8:
movq [dstq+strideq*0], m0
movq [dstq+strideq*1], m0
movq [dstq+strideq*2], m0
movq [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s8
RET
ALIGN function_align
.h16:
mova m0, [tlq-16]
pmaddubsw m0, m3
jmp wq
.w16:
movu m1, [tlq+1]
pmaddubsw m1, m3
paddw m0, m1
psubw m4, m0
punpckhqdq m0, m0
psubw m0, m4
pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
paddw m0, m1
pmaddwd m0, m3
psrlw m0, m5
cmp hd, 16
je .w16_end
mov r6d, 0x5556
mov r2d, 0x3334
test hd, 8|32 ; h is 4 or 64: w + h = 20 or 80 (1/5), otherwise 24 or 48 (1/3)
cmovz r6d, r2d
movd m1, r6d
pmulhuw m0, m1
.w16_end:
pxor m1, m1
pshufb m0, m1
.s16:
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s16
RET
ALIGN function_align
.h32:
mova m0, [tlq-32]
pmaddubsw m0, m3
mova m2, [tlq-16]
pmaddubsw m2, m3
paddw m0, m2
jmp wq
.w32:
movu m1, [tlq+1]
pmaddubsw m1, m3
movu m2, [tlq+17]
pmaddubsw m2, m3
paddw m1, m2
paddw m0, m1
psubw m4, m0
punpckhqdq m0, m0
psubw m0, m4
pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
paddw m0, m1
pmaddwd m0, m3
psrlw m0, m5
cmp hd, 32
je .w32_end
mov r6d, 0x5556
mov r2d, 0x3334
test hd, 64|16 ; h == 8: w + h = 40 (1/5), otherwise 48 or 96 (1/3)
cmovz r6d, r2d
movd m1, r6d
pmulhuw m0, m1
.w32_end:
pxor m1, m1
pshufb m0, m1
mova m1, m0
.s32:
mova [dstq], m0
mova [dstq+16], m1
mova [dstq+strideq], m0
mova [dstq+strideq+16], m1
mova [dstq+strideq*2], m0
mova [dstq+strideq*2+16], m1
mova [dstq+stride3q], m0
mova [dstq+stride3q+16], m1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s32
RET
ALIGN function_align
.h64:
mova m0, [tlq-64]
mova m1, [tlq-48]
pmaddubsw m0, m3
pmaddubsw m1, m3
paddw m0, m1
mova m1, [tlq-32]
pmaddubsw m1, m3
paddw m0, m1
mova m1, [tlq-16]
pmaddubsw m1, m3
paddw m0, m1
jmp wq
.w64:
movu m1, [tlq+ 1]
movu m2, [tlq+17]
pmaddubsw m1, m3
pmaddubsw m2, m3
paddw m1, m2
movu m2, [tlq+33]
pmaddubsw m2, m3
paddw m1, m2
movu m2, [tlq+49]
pmaddubsw m2, m3
paddw m1, m2
paddw m0, m1
psubw m4, m0
punpckhqdq m0, m0
psubw m0, m4
pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
paddw m0, m1
pmaddwd m0, m3
psrlw m0, m5
cmp hd, 64
je .w64_end
mov r6d, 0x5556
mov r2d, 0x3334
test hd, 32 ; h == 16: w + h = 80 (1/5), h == 32: w + h = 96 (1/3)
cmovz r6d, r2d
movd m1, r6d
pmulhuw m0, m1
.w64_end:
pxor m1, m1
pshufb m0, m1
mova m1, m0
mova m2, m0
mova m3, m0
.s64:
; Only two rows per iteration here (the other store loops write four).
mova [dstq], m0
mova [dstq+16], m1
mova [dstq+32], m2
mova [dstq+48], m3
mova [dstq+strideq], m0
mova [dstq+strideq+16], m1
mova [dstq+strideq+32], m2
mova [dstq+strideq+48], m3
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .s64
RET
;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; DC prediction from the left edge only, 8 bits per pixel: fills the block
; with the rounded average of the h pixels tl[-h..-1].
;
; The .h* labels form a fall-through reduction chain: the entry point is
; selected by height, and every stage folds the running sum further until
; .h4 finishes it. ipred_dc_top jumps into the same chain (selected by width,
; with tlq pointing at the top edge), which is why all loads are unaligned.
; After the reduction the dc byte is broadcast into m0-m3 and control
; transfers to the .s<width> store loop of ipred_dc via the splat table.
;
;   m2 = all ones (-1 bytes for pmaddubsw, -1 words for pmaddwd)
;   m3 = 32768 >> log2(n); pmulhrsw by it is a rounded division by n
cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
LEA r5, ipred_dc_left_ssse3_table
mov hd, hm ; zero upper half
tzcnt r6d, hd ; log2(height) = index of the .h* entry and shift count
sub tlq, hq ; tlq = lowest left-edge pixel
tzcnt wd, wm
movu m0, [tlq]
movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
movd m2, r6d
psrld m3, m2
movsxd r6, [r5+r6*4]
pcmpeqd m2, m2
pmaddubsw m0, m2 ; negated pair sums of the first 16 pixels
add r6, r5
add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
movsxd wq, [r5+wq*4] ; store loop for this width
add wq, r5
jmp r6
.h64:
movu m1, [tlq+48] ; unaligned when jumping here from dc_top
pmaddubsw m1, m2
paddw m0, m1
movu m1, [tlq+32] ; unaligned when jumping here from dc_top
pmaddubsw m1, m2
paddw m0, m1
.h32:
movu m1, [tlq+16] ; unaligned when jumping here from dc_top
pmaddubsw m1, m2
paddw m0, m1
.h16:
pshufd m1, m0, q3232 ; high qword -> low qword (like psrldq m1, m0, 8)
paddw m0, m1
.h8:
pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
paddw m0, m1
.h4:
pmaddwd m0, m2 ; add the last two words and undo the negation
pmulhrsw m0, m3 ; rounded division by the number of pixels summed
lea stride3q, [strideq*3] ; r5 is free again: wq already holds the target
pxor m1, m1
pshufb m0, m1 ; broadcast the dc byte
mova m1, m0
mova m2, m0
mova m3, m0
jmp wq
;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; DC prediction with a constant value, 8 bits per pixel: fills the block with
; the byte value 128. The edge pixels are not read (only dst and stride are
; loaded by cglobal). The stores are done by the .s<width> loops of ipred_dc,
; reached through the splat jump table, with m0-m3 all holding the fill
; pattern.
cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
LEA r5, ipred_dc_splat_ssse3_table
tzcnt wd, wm ; log2(width) = jump table index
movifnidn hd, hm
movsxd wq, [r5+wq*4]
movddup m0, [r5-ipred_dc_splat_ssse3_table+pb_128] ; 16 x 128
mova m1, m0
mova m2, m0
mova m3, m0
add wq, r5
lea stride3q, [strideq*3] ; r5 is reused: the table pointer is dead here
jmp wq
;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; DC prediction from the top edge only, 8 bits per pixel: fills the block with
; the rounded average of the w pixels tl[1..w].
; The setup mirrors ipred_dc_left with width in place of height, then jumps
; into ipred_dc_left's .h* reduction chain (selected by width). That chain
; finishes the sum, broadcasts it and continues to the .s<width> store loop
; whose address is prepared in wq here.
;   m2 = all ones, m3 = 32768 >> log2(width) (pmulhrsw multiplier)
cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
LEA r5, ipred_dc_left_ssse3_table
tzcnt wd, wm ; log2(width): reduction entry, shift count and store index
inc tlq ; tlq = first top-edge pixel
movu m0, [tlq]
movifnidn hd, hm
movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
movd m2, wd
psrld m3, m2
movsxd r6, [r5+wq*4]
pcmpeqd m2, m2
pmaddubsw m0, m2 ; negated pair sums of the first 16 pixels
add r6, r5
add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
movsxd wq, [r5+wq*4]
add wq, r5
jmp r6
;---------------------------------------------------------------------------------------
;int dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; SMOOTH src1, src2, mul1, mul2, add1, add2 (all register numbers)
; Computes 16 blended pixels w * a + (256 - w) * b, rounded and shifted down
; by 8, as two halves of 8 words each:
;   src1/src2  weight byte pairs (w - 128, 127 - w) from smooth_weights
;   mul1/mul2  interleaved pixel byte pairs (a, b)
;   add1/add2  precomputed words 128 * a + 129 * b + 128
; Out: m6 = packed bytes (half 1 in the low qword, half 2 in the high qword).
; Clobbers m0, which is written before add1/add2 are read, so those two must
; not be m0.
%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
; w * a = (w - 128) * a + 128 * a
; (256 - w) * b = (127 - w) * b + 129 * b
; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b]
pmaddubsw m6, m%3, m%1
pmaddubsw m0, m%4, m%2 ; (w - 128) * a + (127 - w) * b
paddw m6, m%5
paddw m0, m%6 ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128]
psrlw m6, 8
psrlw m0, 8
packuswb m6, m0
%endmacro
; Vertical smooth intra prediction, 8 bits per pixel: every output pixel is a
; blend of the top-edge pixel of its column and the bottom-left pixel
; tl[-h], weighted by a per-row weight from smooth_weights.
;
; Common setup:
;   m0 = (127, -127) byte pairs, m1 = words of 128 (rounding)
;   m5 = bottom pixel tl[-h], broadcast to all bytes
;   hq = -height, counting up to 0
;   weightsq is biased so that [weightsq+hq*2] is the weight pair of the
;   current row (2 bytes per row, starting at smooth_weights + height*2)
cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights
%define base r6-ipred_smooth_v_ssse3_table
LEA r6, ipred_smooth_v_ssse3_table
tzcnt wd, wm ; log2(width) = jump table index
mov hd, hm
movsxd wq, [r6+wq*4]
movddup m0, [base+pb_127_m127]
movddup m1, [base+pw_128]
lea weightsq, [base+smooth_weights+hq*4]
neg hq
movd m5, [tlq+hq]
pxor m2, m2
pshufb m5, m2 ; broadcast the bottom pixel
add wq, r6
jmp wq
.w4:
; 4 rows per iteration; the 4 top pixels are duplicated so that one register
; covers two rows.
movd m2, [tlq+1]
punpckldq m2, m2
punpcklbw m2, m5 ; top, bottom
lea r3, [strideq*3]
mova m4, [base+ipred_v_shuf]
mova m5, m4
punpckldq m4, m4 ; shuffle: weight pair of row 0 | row 1
punpckhdq m5, m5 ; shuffle: weight pair of row 2 | row 3
pmaddubsw m3, m2, m0 ; m3: 127 * top - 127 * bottom
paddw m1, m2 ; m1: 1 * top + 256 * bottom + 128, overflow is ok
paddw m3, m1 ; m3: 128 * top + 129 * bottom + 128
.w4_loop:
movu m1, [weightsq+hq*2]
pshufb m0, m1, m4 ;m2, m3, m4 and m5 should be stable in loop
pshufb m1, m5
SMOOTH 0, 1, 2, 2, 3, 3
; m6 holds rows 0-3 as four consecutive dwords.
movd [dstq+strideq*0], m6
pshuflw m1, m6, q1032
movd [dstq+strideq*1], m1
punpckhqdq m6, m6
movd [dstq+strideq*2], m6
psrlq m6, 32
movd [dstq+r3 ], m6
lea dstq, [dstq+strideq*4]
add hq, 4
jl .w4_loop
RET
ALIGN function_align
.w8:
; 2 rows per iteration.
movq m2, [tlq+1]
punpcklbw m2, m5
mova m5, [base+ipred_v_shuf]
lea r3, [strideq*3]
pshufd m4, m5, q0000 ; shuffle: weight pair of row 0 in every word
pshufd m5, m5, q1111 ; shuffle: weight pair of row 1 in every word
pmaddubsw m3, m2, m0
paddw m1, m2
paddw m3, m1 ; m3 is output for loop
.w8_loop:
movq m1, [weightsq+hq*2]
pshufb m0, m1, m4
pshufb m1, m5
SMOOTH 0, 1, 2, 2, 3, 3
movq [dstq+strideq*0], m6
movhps [dstq+strideq*1], m6
lea dstq, [dstq+strideq*2]
add hq, 2
jl .w8_loop
RET
ALIGN function_align
.w16:
; 1 row per iteration; m2/m3 = (top, bottom) pairs of the low/high 8 columns.
movu m3, [tlq+1]
punpcklbw m2, m3, m5
punpckhbw m3, m5
pmaddubsw m4, m2, m0
pmaddubsw m5, m3, m0
paddw m0, m1, m2
paddw m1, m3
paddw m4, m0
paddw m5, m1 ; m4 and m5 is output for loop
.w16_loop:
movd m1, [weightsq+hq*2]
pshuflw m1, m1, q0000
punpcklqdq m1, m1 ; weight pair of this row in every word
SMOOTH 1, 1, 2, 3, 4, 5
mova [dstq], m6
add dstq, strideq
add hq, 1
jl .w16_loop
RET
ALIGN function_align
.w32:
; Rows are produced in two 16-pixel chunks. All constants and the per-column
; terms are recomputed for every chunk, so the bottom pixel is parked in m7.
; NOTE(review): WIN64_PUSH_XMM comes from x86inc; presumably it preserves the
; extra xmm register on Win64, since cglobal above only declares 7 - confirm.
WIN64_PUSH_XMM 8, 7
mova m7, m5
.w32_loop_init:
mov r3d, 2 ; chunks left in this row
.w32_loop:
movddup m0, [base+pb_127_m127]
movddup m1, [base+pw_128]
movu m3, [tlq+1]
punpcklbw m2, m3, m7
punpckhbw m3, m7
pmaddubsw m4, m2, m0
pmaddubsw m5, m3, m0
paddw m0, m1, m2
paddw m1, m3
paddw m4, m0
paddw m5, m1
movd m1, [weightsq+hq*2]
pshuflw m1, m1, q0000
punpcklqdq m1, m1
SMOOTH 1, 1, 2, 3, 4, 5
mova [dstq], m6
add tlq, 16
add dstq, 16
dec r3d
jg .w32_loop
lea dstq, [dstq-32+strideq] ; back to column 0 of the next row
sub tlq, 32
add hq, 1
jl .w32_loop_init
RET
ALIGN function_align
.w64:
; Same as .w32 with four 16-pixel chunks per row.
WIN64_PUSH_XMM 8, 7
mova m7, m5
.w64_loop_init:
mov r3d, 4 ; chunks left in this row
.w64_loop:
movddup m0, [base+pb_127_m127]
movddup m1, [base+pw_128]
movu m3, [tlq+1]
punpcklbw m2, m3, m7
punpckhbw m3, m7
pmaddubsw m4, m2, m0
pmaddubsw m5, m3, m0
paddw m0, m1, m2
paddw m1, m3
paddw m4, m0
paddw m5, m1
movd m1, [weightsq+hq*2]
pshuflw m1, m1, q0000
punpcklqdq m1, m1
SMOOTH 1, 1, 2, 3, 4, 5
mova [dstq], m6
add tlq, 16
add dstq, 16
dec r3d
jg .w64_loop
lea dstq, [dstq-64+strideq] ; back to column 0 of the next row
sub tlq, 64
add hq, 1
jl .w64_loop_init
RET
;---------------------------------------------------------------------------------------
;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; Horizontal smooth intra prediction, 8 bits per pixel: every output pixel is
; a blend of the left-edge pixel of its row and the top-right pixel tl[w],
; weighted by a per-column weight from smooth_weights.
;
; Common setup:
;   m3 = right pixel tl[w], broadcast to all bytes
;   m4 = (127, -127) byte pairs, m5 = words of 128 (rounding)
; Per 16-bit lane the loops compute
;   [127*l - 127*r] + [l + 256*r] + [(w-128)*l + (127-w)*r + 128]
;   = w*l + (256-w)*r + 128, then shift right by 8.
; In every width case tlq is rebased so that [tlq+hq] addresses the left-edge
; pixels of the next rows while hd counts down.
cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h
%define base r6-ipred_smooth_h_ssse3_table
LEA r6, ipred_smooth_h_ssse3_table
mov wd, wm
movd m3, [tlq+wq]
pxor m1, m1
pshufb m3, m1 ; right
tzcnt wd, wd ; log2(width) = jump table index
mov hd, hm
movsxd wq, [r6+wq*4]
movddup m4, [base+pb_127_m127]
movddup m5, [base+pw_128]
add wq, r6
jmp wq
.w4:
; 4 rows per iteration.
movddup m6, [base+smooth_weights+4*2] ; weights of the 4 columns, twice
mova m7, [base+ipred_h_shuf] ; replicates each left pixel 4 times, first row first
sub tlq, 4
sub tlq, hq
lea r3, [strideq*3]
.w4_loop:
movd m2, [tlq+hq] ; left
pshufb m2, m7
punpcklbw m1, m2, m3 ; left, right
punpckhbw m2, m3
pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
paddw m0, m1 ; 128 * left + 129 * right
pmaddubsw m1, m6
paddw m1, m5
paddw m0, m1
pmaddubsw m1, m2, m4
paddw m1, m2
pmaddubsw m2, m6
paddw m2, m5
paddw m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1 ; rows 0-3 as four consecutive dwords
movd [dstq+strideq*0], m0
pshuflw m1, m0, q1032
movd [dstq+strideq*1], m1
punpckhqdq m0, m0
movd [dstq+strideq*2], m0
psrlq m0, 32
movd [dstq+r3 ], m0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w4_loop
RET
ALIGN function_align
.w8:
; 2 rows per iteration.
mova m6, [base+smooth_weights+8*2] ; weights of the 8 columns
mova m7, [base+ipred_h_shuf]
sub tlq, 4
sub tlq, hq
punpckldq m7, m7 ; shuffle: first row's pixel x8 | second row's pixel x8
.w8_loop:
movd m2, [tlq+hq] ; left
pshufb m2, m7
punpcklbw m1, m2, m3 ; left, right
punpckhbw m2, m3
pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
paddw m0, m1 ; 128 * left + 129 * right
pmaddubsw m1, m6
paddw m1, m5
paddw m0, m1
pmaddubsw m1, m2, m4
paddw m1, m2
pmaddubsw m2, m6
paddw m2, m5
paddw m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w8_loop
RET
ALIGN function_align
.w16:
; 1 row per iteration; m6/m7 = weights of columns 0-7 / 8-15.
mova m6, [base+smooth_weights+16*2]
mova m7, [base+smooth_weights+16*3]
sub tlq, 1
sub tlq, hq
.w16_loop:
pxor m1, m1
movd m2, [tlq+hq] ; left
pshufb m2, m1 ; broadcast this row's left pixel
punpcklbw m1, m2, m3 ; left, right
punpckhbw m2, m3
pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
paddw m0, m1 ; 128 * left + 129 * right
pmaddubsw m1, m6
paddw m1, m5
paddw m0, m1
pmaddubsw m1, m2, m4
paddw m1, m2
pmaddubsw m2, m7
paddw m2, m5
paddw m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
mova [dstq], m0
lea dstq, [dstq+strideq]
sub hd, 1
jg .w16_loop
RET
ALIGN function_align
.w32:
; Rows are produced in two 16-pixel chunks; r3 walks the weight table (two
; 16-byte loads per chunk) and is reset at the start of every row.
sub tlq, 1
sub tlq, hq
pxor m6, m6 ; zero shuffle mask: broadcast byte 0
.w32_loop_init:
mov r5, 2 ; chunks left in this row
lea r3, [base+smooth_weights+16*4]
.w32_loop:
mova m7, [r3]
add r3, 16
movd m2, [tlq+hq] ; left
pshufb m2, m6
punpcklbw m1, m2, m3 ; left, right
punpckhbw m2, m3
pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
paddw m0, m1 ; 128 * left + 129 * right
pmaddubsw m1, m7
paddw m1, m5
paddw m0, m1
pmaddubsw m1, m2, m4
paddw m1, m2
mova m7, [r3]
add r3, 16
pmaddubsw m2, m7
paddw m2, m5
paddw m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
mova [dstq], m0
add dstq, 16
dec r5
jg .w32_loop
lea dstq, [dstq-32+strideq] ; back to column 0 of the next row
sub hd, 1
jg .w32_loop_init
RET
ALIGN function_align
.w64:
; Same as .w32 with four 16-pixel chunks per row.
sub tlq, 1
sub tlq, hq
pxor m6, m6
.w64_loop_init:
mov r5, 4 ; chunks left in this row
lea r3, [base+smooth_weights+16*8]
.w64_loop:
mova m7, [r3]
add r3, 16
movd m2, [tlq+hq] ; left
pshufb m2, m6
punpcklbw m1, m2, m3 ; left, right
punpckhbw m2, m3
pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
paddw m0, m1 ; 128 * left + 129 * right
pmaddubsw m1, m7
paddw m1, m5
paddw m0, m1
pmaddubsw m1, m2, m4
paddw m1, m2
mova m7, [r3]
add r3, 16
pmaddubsw m2, m7
paddw m2, m5
paddw m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
mova [dstq], m0
add dstq, 16
dec r5
jg .w64_loop
lea dstq, [dstq-64+strideq] ; back to column 0 of the next row
sub hd, 1
jg .w64_loop_init
RET
;---------------------------------------------------------------------------------------
;int dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
; SMOOTH_2D_END src1, src2, mul1, mul2, add1, add2, hor2
; Final step of the 2D smooth predictor for 16 pixels (two halves of 8 words):
; computes the vertical blend, averages it with the horizontal blend and packs
; the result to bytes.
;   src1/src2  registers with the vertical weight byte pairs
;   mul1/mul2  registers with the interleaved (top, bottom) byte pairs
;   add1/add2  the 128 * top + 129 * bottom + 255 terms; a number selects a
;              register, anything else is used as the operand as written
;   hor2       where the second horizontal half comes from: a number means it
;              is already in m3, otherwise the operand is loaded into m3
; In:  m2 = first horizontal half (m3 = second half unless hor2 is an operand)
; Out: m0 = 16 packed result bytes. Clobbers m1, m6 (and m3 when loaded).
%macro SMOOTH_2D_END 7 ; src[1-2], mul[1-2], add[1-2], m3
pmaddubsw m6, m%3, m%1
mova m0, m6
pmaddubsw m6, m%4, m%2
mova m1, m6
%ifnum %5
paddw m0, m%5
%else
paddw m0, %5
%endif
%ifnum %6
paddw m1, m%6
%else
paddw m1, %6
%endif
%ifnum %7
%else
mova m3, %7
%endif
; pavgw is (a + b + 1) >> 1, so together with the shift by 8 below the two
; blends are summed, rounded and divided by 512.
pavgw m0, m2
pavgw m1, m3
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
%endmacro
; SMOOTH_OUTPUT_16B - computes and stores one 16-pixel chunk of one row of the
; 2D smooth predictor (used for widths 32 and 64). All numeric arguments are
; indices of 16-byte stack slots ([rsp+16*n]):
;   %1      slot holding the 16 top pixels of this chunk
;   %2-%5   scratch slots: (top, bottom) pairs and 128*top + 129*bottom + 255
;           for the low (%2, %3) and high (%4, %5) 8 columns
;   %6, %7  memory operands with the horizontal weights of the low/high
;           8 columns
;   %8      scratch slot for the horizontal blend of the high 8 columns
;   %9      slot holding the pshufb mask that broadcasts the vertical weight
;   %10-%12 slots from which m0 (bottom), m4 (right) and m5 ((127, -127)
;           pairs) are restored afterwards
; In:  m0 = bottom, m3 = words of 255, m4 = right, m5 = (127, -127) pairs,
;      [tlq+hq] = left pixel of this row in byte 3 of the dword,
;      [v_weightsq] = vertical weight pair of this row.
; Out: 16 pixels stored at [dstq]; m0, m3, m4, m5 restored as on entry.
; Clobbers m1, m2, m6, m7.
%macro SMOOTH_OUTPUT_16B 12 ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5]
mova m1, [rsp+16*%1] ; top
punpckhbw m6, m1, m0 ; top, bottom
punpcklbw m1, m0 ; top, bottom
pmaddubsw m2, m1, m5
mova [rsp+16*%2], m1
paddw m1, m3 ; 1 * top + 256 * bottom + 255
paddw m2, m1 ; 128 * top + 129 * bottom + 255
mova [rsp+16*%3], m2
pmaddubsw m2, m6, m5
mova [rsp+16*%4], m6
paddw m6, m3 ; 1 * top + 256 * bottom + 255
paddw m2, m6 ; 128 * top + 129 * bottom + 255
mova [rsp+16*%5], m2
movd m1, [tlq+hq] ; left
pshufb m1, [base+pb_3] ; topleft[-(1 + y)]
punpcklbw m1, m4 ; left, right
pmaddubsw m2, m1, m5 ; 127 * left - 127 * right
paddw m2, m1 ; 128 * left + 129 * right
mova m3, m2
pmaddubsw m0, m1, %6 ; weights_hor = &dav1d_sm_weights[width];
pmaddubsw m1, %7
paddw m2, m3, m0 ; horizontal blend, low 8 columns
paddw m3, m1 ; horizontal blend, high 8 columns
movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
mova m7, [rsp+16*%9]
pshufb m1, m7
mova [rsp+16*%8], m3
mova m4, [rsp+16*%2]
mova m5, [rsp+16*%3]
mova m3, [rsp+16*%4]
mova m7, [rsp+16*%5]
SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*%8]
mova [dstq], m0
movddup m3, [base+pw_255] ; recovery
mova m0, [rsp+16*%10] ; recovery
mova m4, [rsp+16*%11] ; recovery
mova m5, [rsp+16*%12] ; recovery
%endmacro
; 2D smooth intra prediction, 8 bits per pixel: every output pixel is the
; rounded average of a vertical blend (top pixel of the column vs. the
; bottom-left pixel tl[-h], weighted per row) and a horizontal blend (left
; pixel of the row vs. the top-right pixel tl[w], weighted per column).
;
; Common setup:
;   m0 = bottom pixel tl[-h], m4 = right pixel tl[w] (both broadcast)
;   m3 = words of 255, m5 = (127, -127) byte pairs
;   v_weightsq = smooth_weights + height*2, advanced as rows are produced
; Only 8 xmm registers are used, so loop invariants are kept in 16-byte stack
; slots [rsp+16*n] (13 slots are reserved) and reloaded in every iteration.
; In every width case tlq is rebased so that [tlq+hq] addresses the left-edge
; pixels of the next rows while hd counts down.
cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_ssse3_table
mov wd, wm
mov hd, hm
LEA r6, ipred_smooth_ssse3_table
movd m4, [tlq+wq] ; right
pxor m2, m2
pshufb m4, m2
tzcnt wd, wd ; log2(width) = jump table index
mov r5, tlq
sub r5, hq
movsxd wq, [r6+wq*4]
movddup m5, [base+pb_127_m127]
movd m0, [r5]
pshufb m0, m2 ; bottom
movddup m3, [base+pw_255]
add wq, r6
lea v_weightsq, [base+smooth_weights+hq*2] ; weights_ver = &dav1d_sm_weights[height]
jmp wq
.w4:
; 4 rows per iteration.
; Slots: 0 = (top, bottom) pairs, 1 = 128*top + 129*bottom + 255,
;        2 = horizontal weights, 3 = right, 4 = row 0/1 weight shuffle,
;        5 = (127, -127) pairs
mova m7, [base+ipred_v_shuf]
movd m1, [tlq+1] ; top
pshufd m1, m1, q0000
sub tlq, 4
lea r3, [strideq*3]
sub tlq, hq
punpcklbw m1, m0 ; top, bottom
pshufd m6, m7, q1100 ; shuffle: weight pair of row 0 | row 1
pshufd m7, m7, q3322 ; shuffle: weight pair of row 2 | row 3
pmaddubsw m2, m1, m5
paddw m3, m1 ; 1 * top + 256 * bottom + 255
paddw m2, m3 ; 128 * top + 129 * bottom + 255
mova [rsp+16*0], m1
mova [rsp+16*1], m2
movq m1, [base+smooth_weights+4*2] ; weights_hor = &dav1d_sm_weights[width];
punpcklqdq m1, m1
mova [rsp+16*2], m1
mova [rsp+16*3], m4
mova [rsp+16*4], m6
mova [rsp+16*5], m5
.w4_loop:
movd m1, [tlq+hq] ; left
pshufb m1, [base+ipred_h_shuf]
punpcklbw m0, m1, m4 ; left, right
punpckhbw m1, m4
pmaddubsw m2, m0, m5 ; 127 * left - 127 * right
pmaddubsw m3, m1, m5
paddw m2, m0 ; 128 * left + 129 * right
paddw m3, m1
mova m4, [rsp+16*2]
pmaddubsw m0, m4
pmaddubsw m1, m4
paddw m2, m0 ; horizontal blend, rows 0-1
paddw m3, m1 ; horizontal blend, rows 2-3
movq m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
add v_weightsq, 8
pshufb m0, m1, m6
pshufb m1, m7
mova m4, [rsp+16*0]
mova m5, [rsp+16*1]
SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3
; Restore the loop invariants that were replaced for SMOOTH_2D_END.
mova m4, [rsp+16*3]
mova m6, [rsp+16*4]
mova m5, [rsp+16*5]
movd [dstq+strideq*0], m0
pshuflw m1, m0, q1032
movd [dstq+strideq*1], m1
punpckhqdq m0, m0
movd [dstq+strideq*2], m0
psrlq m0, 32
movd [dstq+r3 ], m0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w4_loop
RET
ALIGN function_align
.w8:
; 2 rows per iteration; same stack slot layout as .w4.
mova m7, [base+ipred_v_shuf]
movq m1, [tlq+1] ; top
punpcklqdq m1, m1
sub tlq, 4
sub tlq, hq
punpcklbw m1, m0
pshufd m6, m7, q0000 ; shuffle: weight pair of row 0 in every word
pshufd m7, m7, q1111 ; shuffle: weight pair of row 1 in every word
pmaddubsw m2, m1, m5
paddw m3, m1
paddw m2, m3
mova [rsp+16*0], m1
mova [rsp+16*1], m2
mova m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width];
mova [rsp+16*2], m1
mova [rsp+16*3], m4
mova [rsp+16*4], m6
mova [rsp+16*5], m5
.w8_loop:
movd m1, [tlq+hq] ; left
pshufb m1, [base+ipred_h_shuf]
pshufd m1, m1, q1100 ; first row's pixel x8 | second row's pixel x8
punpcklbw m0, m1, m4
punpckhbw m1, m4
pmaddubsw m2, m0, m5
pmaddubsw m3, m1, m5
paddw m2, m0
paddw m3, m1
mova m4, [rsp+16*2]
pmaddubsw m0, m4
pmaddubsw m1, m4
paddw m2, m0
paddw m3, m1
movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
add v_weightsq, 4
pshufb m0, m1, m6
pshufb m1, m7
mova m4, [rsp+16*0]
mova m5, [rsp+16*1]
SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3
mova m4, [rsp+16*3]
mova m6, [rsp+16*4]
mova m5, [rsp+16*5]
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w8_loop
RET
ALIGN function_align
.w16:
; 1 row per iteration.
; Slots: 0/1 = pairs and 128*top + 129*bottom + 255 of columns 0-7,
;        5/6 = the same for columns 8-15, 2 = vertical weight shuffle,
;        3 = right, 4 = (127, -127) pairs, 7 = scratch
mova m7, [base+ipred_v_shuf]
movu m1, [tlq+1] ; top
sub tlq, 4
sub tlq, hq
punpckhbw m6, m1, m0 ; top, bottom
punpcklbw m1, m0 ; top, bottom
pshufd m7, m7, q0000
mova [rsp+16*2], m7
pmaddubsw m2, m6, m5
mova [rsp+16*5], m6
paddw m6, m3 ; 1 * top + 256 * bottom + 255
paddw m2, m6 ; 128 * top + 129 * bottom + 255
mova [rsp+16*6], m2
pmaddubsw m2, m1, m5
paddw m3, m1 ; 1 * top + 256 * bottom + 255
mova [rsp+16*0], m1
paddw m2, m3 ; 128 * top + 129 * bottom + 255
mova [rsp+16*1], m2
mova [rsp+16*3], m4
mova [rsp+16*4], m5
.w16_loop:
movd m1, [tlq+hq] ; left
pshufb m1, [base+pb_3] ; topleft[-(1 + y)]
punpcklbw m1, m4 ; left, right
pmaddubsw m2, m1, m5 ; 127 * left - 127 * right
paddw m2, m1 ; 128 * left + 129 * right
mova m0, m1
mova m3, m2
pmaddubsw m0, [base+smooth_weights+16*2] ; weights_hor = &dav1d_sm_weights[width];
pmaddubsw m1, [base+smooth_weights+16*3]
paddw m2, m0
paddw m3, m1
movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
add v_weightsq, 2
mova m7, [rsp+16*2]
pshufb m1, m7
mova [rsp+16*7], m3
mova m4, [rsp+16*0]
mova m5, [rsp+16*1]
mova m3, [rsp+16*5]
mova m7, [rsp+16*6]
SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7]
mova m4, [rsp+16*3]
mova m5, [rsp+16*4]
mova [dstq], m0
lea dstq, [dstq+strideq]
sub hd, 1
jg .w16_loop
RET
ALIGN function_align
.w32:
; Rows are produced in two 16-pixel chunks by SMOOTH_OUTPUT_16B.
; Slots: 0, 1 = top pixels, 2 = vertical weight shuffle, 3 = bottom,
;        4 = right, 5 = (127, -127) pairs, 6-10 = macro scratch
movu m1, [tlq+1] ; top topleft[1 + x]
movu m2, [tlq+17] ; top
mova [rsp+16*0], m1
mova [rsp+16*1], m2
sub tlq, 4
sub tlq, hq
mova m7, [base+ipred_v_shuf]
pshufd m7, m7, q0000
mova [rsp+16*2], m7
mova [rsp+16*3], m0
mova [rsp+16*4], m4
mova [rsp+16*5], m5
.w32_loop:
SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5
add dstq, 16
SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5
lea dstq, [dstq-16+strideq] ; back to column 0 of the next row
add v_weightsq, 2
sub hd, 1
jg .w32_loop
RET
ALIGN function_align
.w64:
; Same as .w32 with four chunks; the extra top pixels live in slots 11, 12.
movu m1, [tlq+1] ; top topleft[1 + x]
movu m2, [tlq+17] ; top
mova [rsp+16*0], m1
mova [rsp+16*1], m2
movu m1, [tlq+33] ; top
movu m2, [tlq+49] ; top
mova [rsp+16*11], m1
mova [rsp+16*12], m2
sub tlq, 4
sub tlq, hq
mova m7, [base+ipred_v_shuf]
pshufd m7, m7, q0000
mova [rsp+16*2], m7
mova [rsp+16*3], m0
mova [rsp+16*4], m4
mova [rsp+16*5], m5
.w64_loop:
SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*8], [base+smooth_weights+16*9], 10, 2, 3, 4, 5
add dstq, 16
SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5
add dstq, 16
SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5
add dstq, 16
SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5
lea dstq, [dstq-48+strideq] ; back to column 0 of the next row
add v_weightsq, 2
sub hd, 1
jg .w64_loop
RET
%if ARCH_X86_64
cglobal ipred_z1_8bpc, 3, 8, 11, 16*12, dst, stride, tl, w, h, angle, dx
%define base r7-$$
lea r7, [$$]
mova m8, [base+pw_62]
mova m9, [base+pw_64]
mova m10, [base+pw_512]
%else
cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w, h, angle, dx
%define base r1-$$
%define m8 [base+pw_62]
%define m9 [base+pw_64]
%define m10 [base+pw_512]
%define strideq r3
%define stridemp dword [rsp+16*12]
mov stridemp, r1
LEA r1, $$
%endif
tzcnt wd, wm
movifnidn angled, anglem
movifnidn hd, hm
inc tlq
movsxd wq, [base+ipred_z1_ssse3_table+wq*4]
mov dxd, angled
and dxd, 0x7e
add angled, 165 ; ~90
lea wq, [base+wq+ipred_z1_ssse3_table]
movzx dxd, word [base+dr_intra_derivative+dxq]
xor angled, 0x4ff ; d = 90 - angle
jmp wq
.w4:
lea r3d, [angleq+88]
test r3d, 0x480
jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40
sar r3d, 9
add r3d, hd
cmp r3d, 8
jg .w4_no_upsample ; h > 8 || (w == h && is_sm)
mova m1, [tlq-1]
pshufb m0, m1, [base+z_upsample1]
pshufb m1, [base+z_upsample2]
movddup m2, [base+pb_36_m4]
add dxd, dxd
pmaddubsw m0, m2
pshufd m7, m1, q3333
movd [rsp+16], m7 ; top[max_base_x]
pmaddubsw m1, m2
movd m6, dxd
mov r5d, dxd ; xpos
pshufb m6, [base+pw_256]
paddw m1, m0
movq m0, [tlq]
pmulhrsw m1, m10
paddw m7, m6, m6
punpcklqdq m6, m7 ; xpos0 xpos1
packuswb m1, m1
punpcklbw m0, m1
movifnidn strideq, stridemp
mova [rsp], m0
.w4_upsample_loop:
lea r2d, [r5+dxq]
shr r5d, 6 ; base0
movq m0, [rsp+r5]
lea r5d, [r2+dxq]
shr r2d, 6 ; base1
movhps m0, [rsp+r2]
pand m2, m8, m6 ; frac
psubw m1, m9, m2 ; 64-frac
psllw m2, 8
por m1, m2 ; 64-frac, frac
pmaddubsw m0, m1
paddw m6, m7 ; xpos += dx
pmulhrsw m0, m10
packuswb m0, m0
movd [dstq+strideq*0], m0
pshuflw m0, m0, q1032
movd [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w4_upsample_loop
RET
.w4_no_upsample:
mov r3d, 7 ; max_base
test angled, 0x400 ; !enable_intra_edge_filter
jnz .w4_main
lea r3d, [hq+3]
movd m0, r3d
movd m2, angled
shr angled, 8 ; is_sm << 1
pxor m1, m1
pshufb m0, m1
pshufb m2, m1
pcmpeqb m1, m0, [base+z_filter_wh4]
pand m1, m2
pcmpgtb m1, [base+z_filter_t_w48+angleq*8]
pmovmskb r5d, m1
mov r3d, 7
test r5d, r5d
jz .w4_main ; filter_strength == 0
mova m3, [tlq-1]
imul r5d, 0x55555555
movu m7, [base+z_filter_s+8]
shr r5d, 30 ; filter_strength
movddup m0, [base+pb_8]
pminub m7, m0
pshufb m0, m3, [base+z_filter_s]
movddup m4, [base+z_filter_k-8+r5*8+24*0]
pshufb m3, m7
movddup m5, [base+z_filter_k-8+r5*8+24*1]
shufps m2, m0, m3, q2121
movddup m6, [base+z_filter_k-8+r5*8+24*2]
pmaddubsw m0, m4
pmaddubsw m1, m2, m4
pmaddubsw m2, m5
paddd m5, m6
pmaddubsw m4, m3, m5
pmaddubsw m3, m6
paddw m0, m2
paddw m1, m4
paddw m0, m3
pshufd m1, m1, q3333
pmulhrsw m0, m10
pmulhrsw m1, m10
mov r5d, 9
mov tlq, rsp
cmp hd, 4
cmovne r3d, r5d
packuswb m0, m1
mova [tlq], m0
.w4_main:
add tlq, r3
movd m5, dxd
movddup m0, [base+z_base_inc] ; base_inc << 6
movd m7, [tlq] ; top[max_base_x]
shl r3d, 6
movd m4, r3d
pshufb m5, [base+pw_256]
mov r5d, dxd ; xpos
pshufb m7, [base+pw_m256]
sub r5, r3
pshufb m4, [base+pw_256]
mova m3, [base+z1_shuf_w4]
paddw m6, m5, m5
psubw m4, m0 ; max_base_x
punpcklqdq m5, m6 ; xpos0 xpos1
.w4_loop:
lea r3, [r5+dxq]
sar r5, 6 ; base0
movq m0, [tlq+r5]
lea r5, [r3+dxq]
sar r3, 6 ; base1
movhps m0, [tlq+r3]
pand m2, m8, m5 ; frac
psubw m1, m9, m2 ; 64-frac
psllw m2, 8
pshufb m0, m3
por m1, m2 ; 64-frac, frac
pmaddubsw m0, m1
movifnidn strideq, stridemp
pcmpgtw m1, m4, m5 ; base < max_base_x
pmulhrsw m0, m10
paddw m5, m6 ; xpos += dx
pand m0, m1
pandn m1, m7
por m0, m1
packuswb m0, m0
movd [dstq+strideq*0], m0
pshuflw m0, m0, q1032
movd [dstq+strideq*1], m0
sub hd, 2
jz .w4_end
lea dstq, [dstq+strideq*2]
test r5d, r5d
jl .w4_loop
packuswb m7, m7
.w4_end_loop:
movd [dstq+strideq*0], m7
movd [dstq+strideq*1], m7
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w4_end_loop
.w4_end:
RET
.w8:
lea r3d, [angleq+88]
and r3d, ~0x7f
or r3d, hd
cmp r3d, 8
ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
mova m5, [base+z_upsample1]
movu m3, [base+z_filter_s+6]
movd m4, hd
mova m0, [tlq-1]
movu m1, [tlq+7]
pxor m7, m7
pshufb m4, m7
movddup m7, [base+pb_36_m4]
pminub m4, m3
add dxd, dxd
pshufb m2, m0, m5
pmaddubsw m2, m7
pshufb m0, m3
pmaddubsw m0, m7
movd m6, dxd
pshufb m3, m1, m5
pmaddubsw m3, m7
pshufb m1, m4
pmaddubsw m1, m7
pshufb m6, [base+pw_256]
mov r5d, dxd
paddw m2, m0
paddw m7, m6, m6
paddw m3, m1
punpcklqdq m6, m7 ; xpos0 xpos1
movu m1, [tlq]
pmulhrsw m2, m10
pmulhrsw m3, m10
packuswb m2, m3
punpcklbw m0, m1, m2
punpckhbw m1, m2
movifnidn strideq, stridemp
mova [rsp+16*0], m0
mova [rsp+16*1], m1
.w8_upsample_loop:
lea r2d, [r5+dxq]
shr r5d, 6 ; base0
movu m0, [rsp+r5]
lea r5d, [r2+dxq]
shr r2d, 6 ; base1
movu m1, [rsp+r2]
pand m2, m8, m6
psubw m3, m9, m2
psllw m2, 8
por m3, m2
punpcklqdq m2, m3, m3 ; frac0
pmaddubsw m0, m2
punpckhqdq m3, m3 ; frac1
pmaddubsw m1, m3
paddw m6, m7
pmulhrsw m0, m10
pmulhrsw m1, m10
packuswb m0, m1
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w8_upsample_loop
RET
.w8_no_upsample:
lea r3d, [hq+7]
movd m0, r3d
and r3d, 7
or r3d, 8 ; imin(h+7, 15)
test angled, 0x400
jnz .w8_main
movd m2, angled
shr angled, 8 ; is_sm << 1
pxor m1, m1
pshufb m0, m1
pshufb m2, m1
movu m1, [base+z_filter_wh8]
psrldq m3, [base+z_filter_t_w48+angleq*8], 4
pcmpeqb m1, m0
pand m1, m2
pcmpgtb m1, m3
pmovmskb r5d, m1
test r5d, r5d
jz .w8_main ; filter_strength == 0
movd m3, [tlq-1]
movu m0, [tlq+16*0]
imul r5d, 0x55555555
movu m1, [tlq+16*1]
shr r5d, 30 ; filter_strength
movd m2, [tlq+r3]
lea tlq, [rsp+16*4]
sub r5, 3
mova [tlq-16*1], m0
pxor m7, m7
mova [tlq+16*0], m1
pshufb m3, m7
pshufb m2, m7
mova [tlq-16*2], m3
movq [tlq+r3-15], m2
call .filter_edge
sar r5d, 1
add r5d, 17
cmp hd, 8
cmova r3d, r5d
.w8_main:
add tlq, r3
movd m5, dxd
movd m7, [tlq]
shl r3d, 6
movu m3, [base+z_filter_s+2]
movd m4, r3d
pshufb m5, [base+pw_256]
mov r5d, dxd
pshufb m7, [base+pw_m256]
sub r5, r3
pshufb m4, [base+pw_256]
psubw m4, [base+z_base_inc]
mova m6, m5
.w8_loop:
mov r3, r5
sar r3, 6
movu m0, [tlq+r3]
pand m1, m8, m5
psubw m2, m9, m1
psllw m1, 8
pshufb m0, m3
por m1, m2
pmaddubsw m0, m1
pcmpgtw m1, m4, m5
paddw m5, m6
pmulhrsw m0, m10
pand m0, m1
pandn m1, m7
por m0, m1
packuswb m0, m0
movq [dstq], m0
dec hd
jz .w8_end
movifnidn strideq, stridemp
add dstq, strideq
add r5, dxq
jl .w8_loop
packuswb m7, m7
.w8_end_loop:
movq [dstq], m7
add dstq, strideq
dec hd
jg .w8_end_loop
.w8_end:
RET
.w16:
lea r3d, [hq+15]
movd m0, r3d
and r3d, 15
or r3d, 16 ; imin(h+15, 31)
test angled, 0x400
jnz .w16_main
movd m2, angled
shr angled, 8 ; is_sm << 1
pxor m1, m1
pshufb m0, m1
pshufb m2, m1
movq m3, [base+z_filter_t_w16+angleq*4]
pcmpeqb m0, [base+z_filter_wh16]
pand m0, m2
pcmpgtb m0, m3
pmovmskb r5d, m0
test r5d, r5d
jz .w16_main ; filter_strength == 0
movd m4, [tlq-1]
movu m0, [tlq+16*0]
imul r5d, 0x24924924
movu m1, [tlq+16*1]
shr r5d, 30
movd m2, [tlq+30]
adc r5, -4 ; filter_strength-3
movd m3, [tlq+r3]
lea tlq, [rsp+16*4]
mova [tlq-16*1], m0
pxor m7, m7
mova [tlq+16*0], m1
pshufb m4, m7
movd [rsp], m2
pshufb m3, m7
mova [tlq-16*2], m4
movd [tlq+r3-16], m3
call .filter_edge
cmp hd, 16
jle .w16_main
pshuflw m0, [rsp], q0000
sar r5, 1
movd m1, [base+z_filter_k_tail+4+r5*4]
lea r3d, [r5+33]
pmaddubsw m0, m1
%if ARCH_X86_64
pmulhrsw m0, m10
%else
pmulhrsw m0, m4
%endif
packuswb m0, m0
movd [tlq+32], m0
.w16_main:
add tlq, r3
movd m5, dxd
movd m7, [tlq]
movd m4, r3d
shl r3d, 6
pshufb m5, [base+pw_256]
pxor m6, m6
pshufb m7, m6
mov r5d, dxd
pshufb m4, m6
sub r5, r3
psubb m4, [base+pb_0to15]
mova m6, m5
.w16_loop:
mov r3, r5
sar r3, 6
movu m1, [tlq+r3+0]
pand m0, m8, m5
movu m2, [tlq+r3+1]
psubw m3, m9, m0
psllw m0, 8
por m3, m0
punpcklbw m0, m1, m2
pmaddubsw m0, m3
punpckhbw m1, m2
pmaddubsw m1, m3
psrlw m3, m5, 6
packsswb m3, m3
pmulhrsw m0, m10
pmulhrsw m1, m10
paddw m5, m6
pcmpgtb m2, m4, m3
packuswb m0, m1
pand m0, m2
pandn m2, m7
por m0, m2
mova [dstq], m0
dec hd
jz .w16_end
movifnidn strideq, stridemp
add dstq, strideq
add r5, dxq
jl .w16_loop
.w16_end_loop:
mova [dstq], m7
add dstq, strideq
dec hd
jg .w16_end_loop
.w16_end:
RET
.w32:
lea r3d, [hq+31]
and r3d, 31
or r3d, 32 ; imin(h+31, 63)
test angled, 0x400 ; !enable_intra_edge_filter
jnz .w32_main
movd m6, [tlq-1]
movu m0, [tlq+16*0]
movu m1, [tlq+16*1]
movu m2, [tlq+16*2]
movu m3, [tlq+16*3]
movd m4, [tlq+62]
movd m5, [tlq+r3]
lea tlq, [rsp+16*6]
mova [tlq-16*3], m0
pxor m7, m7
mova [tlq-16*2], m1
pshufb m6, m7
mova [tlq-16*1], m2
xor r5d, r5d ; filter_strength = 3
mova [tlq+16*0], m3
movd [rsp], m4
pshufb m5, m7
mova [tlq-16*4], m6
movd [tlq+r3-48], m5
call .filter_edge
sub tlq, 16*2
call .filter_edge
cmp hd, 32
jle .w32_main
pshuflw m0, [rsp], q0000
movd m1, [base+z_filter_k_tail+4]
add r3d, 2
pmaddubsw m0, m1
%if ARCH_X86_64
pmulhrsw m0, m10
%else
pmulhrsw m0, m4
%endif
packuswb m0, m0
movd [tlq+64], m0
.w32_main:
add tlq, r3
movd m0, r3d
movd m7, [tlq]
shl r3d, 6
movd m5, dxd
pxor m6, m6
mov r5d, dxd
pshufb m0, m6
pshufb m5, [base+pw_256]
sub r5, r3
pshufb m7, m6
psubb m0, [base+pb_0to15]
movddup m1, [base+pb_m16]
mova [rsp+16*0], m0
paddb m0, m1
mova [rsp+16*1], m0
mova m6, m5
.w32_loop:
mov r3, r5
sar r3, 6
movu m1, [tlq+r3+16*0+0]
pand m0, m8, m5
movu m2, [tlq+r3+16*0+1]
psubw m3, m9, m0
psllw m0, 8
por m3, m0
punpcklbw m0, m1, m2
pmaddubsw m0, m3
punpckhbw m1, m2
pmaddubsw m1, m3
psrlw m4, m5, 6
pmulhrsw m0, m10
pmulhrsw m1, m10
packsswb m4, m4
pcmpgtb m2, [rsp+16*0], m4
packuswb m0, m1
pand m0, m2
pandn m2, m7
por m0, m2
movu m1, [tlq+r3+16*1+0]
movu m2, [tlq+r3+16*1+1]
mova [dstq+16*0], m0
punpcklbw m0, m1, m2
pmaddubsw m0, m3
punpckhbw m1, m2
pmaddubsw m1, m3
paddw m5, m6
pmulhrsw m0, m10
pmulhrsw m1, m10
pcmpgtb m2, [rsp+16*1], m4
packuswb m0, m1
pand m0, m2
pandn m2, m7
por m0, m2
mova [dstq+16*1], m0
dec hd
jz .w32_end
movifnidn strideq, stridemp
add dstq, strideq
add r5, dxq
jl .w32_loop
.w32_end_loop:
mova [dstq+16*0], m7
mova [dstq+16*1], m7
add dstq, strideq
dec hd
jg .w32_end_loop
.w32_end:
RET
.w64:
lea r3d, [hq+63]
test angled, 0x400 ; !enable_intra_edge_filter
jnz .w64_main
movd m4, [tlq-1]
movu m0, [tlq+16*0]
movu m1, [tlq+16*1]
movu m2, [tlq+16*2]
movu m3, [tlq+16*3]
mova [rsp+16*3], m0
pxor m7, m7
mova [rsp+16*4], m1
pshufb m4, m7
mova [rsp+16*5], m2
mova [rsp+16*6], m3
mova [rsp+16*2], m4
movu m0, [tlq+16*4]
movu m1, [tlq+16*5]
movu m2, [tlq+16*6]
movu m3, [tlq+16*7]
movd m4, [tlq+r3]
lea tlq, [rsp+16*10]
mova [tlq-16*3], m0
xor r5d, r5d ; filter_strength = 3
mova [tlq-16*2], m1
pshufb m4, m7
mova [tlq-16*1], m2
mova [tlq+16*0], m3
movd [tlq+r3-16*7], m4
cmp hd, 64
jl .w64_filter96 ; skip one call if the last 32 bytes aren't used
call .filter_edge
.w64_filter96:
sub tlq, 16*2
call .filter_edge
sub tlq, 16*2
call .filter_edge
sub tlq, 16*2
call .filter_edge
.w64_main:
add tlq, r3
movd m0, r3d
movd m7, [tlq]
shl r3d, 6
movd m5, dxd
pxor m6, m6
mov r5d, dxd
pshufb m0, m6
sub r5, r3
pshufb m5, [base+pw_256]
pshufb m7, m6
psubb m0, [base+pb_0to15]
movddup m1, [base+pb_m16]
mova [rsp+16*0], m0
paddb m0, m1
mova [rsp+16*1], m0
paddb m0, m1
mova [rsp+16*2], m0
paddb m0, m1
mova [rsp+16*3], m0
mova m6, m5
.w64_loop:
mov r3, r5
sar r3, 6
movu m1, [tlq+r3+16*0+0]
pand m0, m8, m5
movu m2, [tlq+r3+16*0+1]
psubw m3, m9, m0
psllw m0, 8
por m3, m0
punpcklbw m0, m1, m2
pmaddubsw m0, m3
punpckhbw m1, m2
pmaddubsw m1, m3
psrlw m4, m5, 6
pmulhrsw m0, m10
pmulhrsw m1, m10
packsswb m4, m4
pcmpgtb m2, [rsp+16*0], m4
packuswb m0, m1
pand m0, m2
pandn m2, m7
por m0, m2
movu m1, [tlq+r3+16*1+0]
movu m2, [tlq+r3+16*1+1]
mova [dstq+16*0], m0
punpcklbw m0, m1, m2
pmaddubsw m0, m3
punpckhbw m1, m2
pmaddubsw m1, m3
pmulhrsw m0, m10
pmulhrsw m1, m10
pcmpgtb m2, [rsp+16*1], m4
packuswb m0, m1
pand m0, m2
pandn m2, m7
por m0, m2
movu m1, [tlq+r3+16*2+0]
movu m2, [tlq+r3+16*2+1]
mova [dstq+16*1], m0
punpcklbw m0, m1, m2
pmaddubsw m0, m3
punpckhbw m1, m2
pmaddubsw m1, m3
pmulhrsw m0, m10
pmulhrsw m1, m10
pcmpgtb m2, [rsp+16*2], m4
packuswb m0, m1
pand m0, m2
pandn m2, m7
por m0, m2
movu m1, [tlq+r3+16*3+0]
movu m2, [tlq+r3+16*3+1]
mova [dstq+16*2], m0
punpcklbw m0, m1, m2
pmaddubsw m0, m3
punpckhbw m1, m2
pmaddubsw m1, m3
paddw m5, m6
pmulhrsw m0, m10
pmulhrsw m1, m10
pcmpgtb m2, [rsp+16*3], m4
packuswb m0, m1
pand m0, m2
pandn m2, m7
por m0, m2
mova [dstq+16*3], m0
dec hd
jz .w64_end
movifnidn strideq, stridemp
add dstq, strideq
add r5, dxq
jl .w64_loop
.w64_end_loop:
mova [dstq+16*0], m7
mova [dstq+16*1], m7
mova [dstq+16*2], m7
mova [dstq+16*3], m7
add dstq, strideq
dec hd
jg .w64_end_loop
.w64_end:
RET
;-----------------------------------------------------------------------
; .filter_edge - smooth 32 edge pixels with a 3- or 5-tap kernel.
; Local call/ret subroutine; other functions in this file reach it via
; mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge.
;
; In:    tlq = edge buffer pointer. The 32 input pixels being filtered are
;              [tlq-16, tlq+16); neighbours are read from [tlq-18, tlq+18).
;              [tlq-16], [tlq+0] are accessed with mova and the results are
;              stored with mova, so tlq must be 16-byte aligned.
;        r5  = kernel selector used to index z_filter_k (callers in this
;              file pass "filter_strength-3"); r5 == 0 takes the extra
;              fifth-tap path below.
;        m10 = pmulhrsw rounding multiplier (register on x86-64; copied
;              through m4 on x86-32).
; Out:   32 filtered pixels stored at [tlq, tlq+32), i.e. shifted up by 16
;        bytes relative to the input positions. The inputs below tlq are
;        left untouched, which is what lets callers step tlq down by 32
;        and call again.
; Clobb: m0-m7, flags. r5 and tlq are preserved.
;-----------------------------------------------------------------------
ALIGN function_align
.filter_edge: ; 32 pixels/iteration
; First coefficient pair: weights for the (p[-2], p[-1]) neighbours.
movddup m7, [base+z_filter_k+8*2+r5*8+24*0]
movu m2, [tlq-18]
movu m1, [tlq-17]
movu m3, [tlq- 2]
movu m4, [tlq- 1]
; Interleave adjacent pixels so one pmaddubsw applies two taps per output.
; m0/m2 accumulate outputs for input positions tlq-16..tlq-1,
; m1/m3 accumulate outputs for input positions tlq+0..tlq+15.
punpcklbw m0, m2, m1
pmaddubsw m0, m7
punpckhbw m2, m1
pmaddubsw m2, m7
punpcklbw m1, m3, m4
pmaddubsw m1, m7
punpckhbw m3, m4
pmaddubsw m3, m7
; Second coefficient pair: weights for the (p[0], p[+1]) neighbours.
movddup m7, [base+z_filter_k+8*2+r5*8+24*1]
mova m5, [tlq-16]
movu m6, [tlq-15]
punpcklbw m4, m5, m6
pmaddubsw m4, m7
punpckhbw m5, m6
pmaddubsw m5, m7
paddw m0, m4
paddw m2, m5
mova m5, [tlq+ 0]
movu m6, [tlq+ 1]
punpcklbw m4, m5, m6
pmaddubsw m4, m7
punpckhbw m5, m6
pmaddubsw m5, m7
paddw m1, m4
paddw m3, m5
; Only r5 == 0 adds the p[+2] term; any other selector is done here.
test r5d, r5d
jnz .filter_end ; 3-tap
; z_filter_k+8*8 is the same address as z_filter_k+8*2+r5*8+24*2 for r5 == 0.
movddup m7, [base+z_filter_k+8*8]
movu m5, [tlq-14]
movu m6, [tlq+ 2]
; The p[+2] pixel is duplicated into both bytes of each pair, so the two
; coefficients of the pair are both applied to the same pixel.
punpcklbw m4, m5, m5
pmaddubsw m4, m7
punpckhbw m5, m5
pmaddubsw m5, m7
paddw m0, m4
paddw m2, m5
punpcklbw m5, m6, m6
pmaddubsw m5, m7
punpckhbw m6, m6
pmaddubsw m6, m7
paddw m1, m5
paddw m3, m6
.filter_end:
; Round and scale the 16-bit sums back to pixel range.
%if ARCH_X86_64
REPX {pmulhrsw x, m10}, m0, m2, m1, m3
%else
; On x86-32 m10 is not a real register here, so stage it in m4.
mova m4, m10
REPX {pmulhrsw x, m4 }, m0, m2, m1, m3
%endif
packuswb m0, m2
packuswb m1, m3
; Output lands 16 bytes above the inputs it was computed from.
mova [tlq+16*0], m0
mova [tlq+16*1], m1
ret
;-----------------------------------------------------------------------
; ipred_z2_8bpc(dst, stride, tl, w, h, angle, maxw, maxh)
; Directional intra predictor that interpolates from both the top edge
; (tl[1..]) and the left edge (tl[-1..]); tl[0] is the top-left pixel.
; maxw/maxh are read from stack args r6m/r7m.
;
; Stack layout built by this prologue (offsets from rsp):
;   16*2..16*5  : left edge, tl[-64..-1]
;   16*6, 16*7  : top-left pixel broadcast to all bytes (byte 127 is the
;                 "topleft" position the main loop compares against)
;   16*8..16*9  : top edge, tl[1..32]  (so offset 128 == top[0])
;   16*14       : 4 words from z_base_inc+2 (per-column x offsets)
;   16*15+8*0   : 4 words derived from z2_dy_offset (y offsets)
;   16*15+8*1   : 4 words of 4<<6 (x step between 4-column strips)
; Loop state:
;   r9d  = ((w-4)<<6) | h   : low byte = h, bits 8+ = extra 4-col strips
;   r10d = xpos of the current strip, starts at dx + (128<<6)
;   r11d = (128-4)<<6       : xpos threshold for the left-only fast path
; x86-64 keeps constants in m8-m12 and r9-r11 in registers; x86-32 maps
; them to memory operands / stack slots via the %defines below.
;-----------------------------------------------------------------------
%if ARCH_X86_64
cglobal ipred_z2_8bpc, 4, 12, 13, 16*16, dst, stride, tl, w, h, angle, dx, _, dy
%define base r7-$$
%define maxwm r6m
%define maxhm r7m
lea r7, [$$]
mov hd, hm
mova m8, [base+pw_62]
mova m9, [base+pw_64]
lea r9d, [wq-4]
mova m10, [base+pw_512]
shl r9d, 6
mova m11, [base+z1_shuf_w4]
or r9d, hd ; r9d = ((w-4)<<6) | h
mova m12, [base+z2_h_shuf]
%else
cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w, h, angle, dx
; Only 8 xmm and 7 GPRs here: m8-m12 and r9-r11 become memory operands.
%define base r1-$$
%define m8 [base+pw_62]
%define m9 [base+pw_64]
%define m10 [base+pw_512]
%define m11 [rsp+16*16]
%define m12 [rsp+16*17]
%define r9b byte [rsp+16*18+4*0]
%define r9d dword [rsp+16*18+4*0]
%define r10d dword [rsp+16*18+4*1]
%define r11d dword [rsp+16*18+4*2]
%define maxwm [rsp+16*18+4*3]
%define maxhm [rsp+16*19+4*0]
%define stridemp [rsp+16*19+4*1]
%define strideq r3
%define dyd r4
%define dyq r4
; r1 (stride) is about to be reused as the PIC base, so spill stride and
; copy maxw/maxh into local slots first.
mov stridemp, r1
mov r1d, r6m
mov r4d, r7m
mov maxwm, r1d
mov maxhm, r4d
LEA r1, $$
; hd is used as scratch to build r9d; h is reloaded from r9b further down.
lea hd, [wq-4]
mova m0, [base+z1_shuf_w4]
shl hd, 6
mova m1, [base+z2_h_shuf]
or hd, hm
mova m11, m0
mov r9d, hd ; r9d = ((w-4)<<6) | h
mova m12, m1
%endif
; Width-specialised entry point is selected via a table indexed by tzcnt(w).
tzcnt wd, wd
movifnidn angled, anglem
movsxd wq, [base+ipred_z2_ssse3_table+wq*4]
; dx starts out as the low 8 bits of the angle argument.
%if ARCH_X86_64
movzx dxd, angleb
%else
movzx dxd, byte anglem
%endif
; Flip bit 10 so that the later "test angled, 0x400 / jnz .w4_main" skips
; all edge filtering (presumably bit 10 of the argument is
; enable_intra_edge_filter - confirm against the caller).
xor angled, 0x400
; Edge copies are interleaved with the dx/dy derivation for scheduling.
mova m0, [tlq-16*4]
mov dyd, dxd
mova m1, [tlq-16*3]
neg dxq
mova m2, [tlq-16*2]
and dyd, ~1
mova m3, [tlq-16*1]
and dxq, ~1
movd m4, [tlq]
movu m5, [tlq+16*0+1]
movu m6, [tlq+16*1+1]
movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90
movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle
mova [rsp+16*2], m0
pxor m7, m7 ; m7 = 0, reused below and by several width paths as a broadcast mask
mova [rsp+16*3], m1
pshufb m4, m7 ; broadcast the top-left pixel
mova [rsp+16*4], m2
lea wq, [base+ipred_z2_ssse3_table+wq]
mova [rsp+16*5], m3
neg dxd ; x positions move left (towards top-left) as rows advance
mova [rsp+16*6], m4
or dyd, 4<<16 ; word 1 = 4; .w4_main broadcasts it as the per-4-row base_y step
mova [rsp+16*7], m4
mova [rsp+16*8], m5
mova [rsp+16*9], m6
movq m0, [base+z_base_inc+2]
movsldup m1, [base+z2_dy_offset]
movq m2, [base+pw_256] ; 4<<6
movq [rsp+16*14+8*0], m0
movq [rsp+16*15+8*0], m1
movq [rsp+16*15+8*1], m2
%if ARCH_X86_64
lea r10d, [dxq+(128<<6)] ; xpos
%else
; dy lives in r4 here, which is needed as scratch: park it in a stack slot.
mov [rsp+16*7+4*1], dyd
lea r4d, [dxq+(128<<6)]
mov r10d, r4d
movzx hd, r9b ; restore h after hd was used as scratch above
%endif
mov r11d, (128-4)<<6
jmp wq
; ---- w == 4: decide between upsampling and filtering the top edge ----
; Expects m7 == 0 from the prologue (used as a byte-broadcast mask).
.w4:
test angled, 0x400
jnz .w4_main ; edge filtering disabled: predict from the raw edges
; Pad the top edge: replicate tl[4] into the 4 bytes following top[0..3].
movd m5, [tlq+4]
lea r3d, [hq+2]
add angled, 1022
pshufb m5, m7
shl r3d, 6
movd [rsp+16*8+4], m5
; One bit test covers all three conditions listed on the branch below.
test r3d, angled
jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
call .upsample_above
; Top edge was upsampled; now build the left-edge filter mask in m6.
sub angled, 1075 ; angle - 53
lea r3d, [hq+3]
xor angled, 0x7f ; 180 - angle
movd m0, r3d
movd m6, angled
shr angled, 8 ; is_sm << 1
pshufb m0, m7
pshufb m6, m7
; Select the table entries matching this block size, then compare the
; angle delta against the per-strength thresholds.
pcmpeqb m0, [base+z_filter_wh4]
pand m6, m0
pcmpgtb m6, [base+z_filter_t_w48+angleq*8]
jmp .w8_filter_left
;-----------------------------------------------------------------------
; .upsample_above - 2x upsample the first 8 top-edge pixels (w4/w8).
; Call/ret subroutine, hence the extra +gprsize on every rsp offset.
; In:    top edge at [rsp+gprsize+16*8-2 ...], dxd, xpos state (r10d).
; Out:   [rsp+gprsize+16*8] = 16 bytes: interpolated and original pixels
;        interleaved (interpolated first);
;        dx doubled; base_inc words at 16*14 doubled;
;        m11 (shuffle for top pixels) replaced by pb_0to15;
;        xpos += dx + (1<<6); r11d threshold = (128-7)<<6.
; Clobb: m0-m5 (and r3d on x86-32), flags.
;-----------------------------------------------------------------------
.upsample_above: ; w4/w8
; Four 8-pixel windows at offsets -2, -1, 0, +1 around each output.
movq m3, [rsp+gprsize+16*8-2]
movq m1, [rsp+gprsize+16*8-1]
movq m0, [rsp+gprsize+16*8+0]
movq m4, [rsp+gprsize+16*8+1]
movddup m5, [base+pb_36_m4]
punpcklbw m1, m3
punpcklbw m2, m0, m4
pmaddubsw m1, m5
pmaddubsw m2, m5
%if ARCH_X86_64
; The upsampled edge is already interleaved, so m11 is switched to pb_0to15.
mova m11, [base+pb_0to15]
lea r10d, [r10+dxq+(1<<6)]
mov r11d, (128-7)<<6
%else
; Same updates as above, applied to the stack-backed r10d/r11d/m11 slots
; (addressed explicitly because rsp is offset by the return address).
mova m3, [base+pb_0to15]
mov r3d, [rsp+gprsize+16*18+4*1]
mov dword [rsp+gprsize+16*18+4*2], (128-7)<<6
lea r3d, [r3+dxq+(1<<6)]
mov [rsp+gprsize+16*18+4*1], r3d
mova [rsp+gprsize+16*16], m3
%endif
add dxd, dxd ; edge has twice the resolution now, so step twice as far
paddw m1, m2
pmulhrsw m1, m10
; Double the per-column x offsets to match the upsampled edge.
movq m2, [rsp+gprsize+16*14]
paddw m2, m2
movq [rsp+gprsize+16*14], m2
packuswb m1, m1
punpcklbw m1, m0 ; interleave: interpolated, original, interpolated, ...
mova [rsp+gprsize+16*8], m1
ret
; ---- w == 4, top edge not upsampled: filter top, then decide on left ----
; Sets up the arguments of .w8_filter_top (m0 = h+3, m1 = angle-90,
; r3d = 180-angle, m3/m4 = size and threshold tables) and afterwards
; either falls through into .upsample_left or jumps to .w8_filter_left.
.w4_no_upsample_above:
lea r3d, [hq+3]
mov [rsp], angled ; angled is modified below; keep a copy for the left-edge test
sub angled, 1112 ; angle - 90
movd m0, r3d
mov r3d, 90
movd m1, angled
sub r3d, angled ; 180 - angle
shr angled, 8 ; is_sm << 1
movu m3, [base+z_filter_wh4]
mova m4, [base+z_filter_t_w48+angleq*8]
call .w8_filter_top
mov angled, [rsp]
lea r3d, [hq+2]
sub angled, 139
shl r3d, 6
; Same single-test trick as in .w4, this time for the left edge.
test r3d, angled
jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
; ---- 2x upsample of the left edge (w4/w8); falls through to .w4_main ----
; Out: [rsp+16*5] = 16 bytes of interleaved original/interpolated left
;      pixels; dy doubled; m12 replaced by z2_upsample; the y-offset words
;      at rsp+16*15 replaced by the odd dwords of z2_dy_offset.
; hq is left negated; .w4_loop0 reloads h from r9b.
; NOTE(review): m7 is used as a byte-broadcast mask below, but when this
; label is reached after .w8_filter_top has applied a filter, m7 holds the
; last z_filter_k coefficient pair rather than the prologue's zero.
; Presumably that table row is all-zero for every strength reachable here
; - confirm against z_filter_k.
.upsample_left: ; w4/w8
neg hq
; Pad below the left edge: replicate tl[-h] into the 4 bytes beneath it.
movd m0, [tlq+hq]
pshufb m0, m7
movd [rsp+16*6+hq-4], m0
; Four 8-pixel windows over the end of the left edge (nearest top-left).
movq m3, [rsp+16*5+7]
movq m0, [rsp+16*5+8]
movq m2, [rsp+16*5+9]
movq m4, [rsp+16*5+10]
movddup m5, [base+pb_36_m4]
punpcklbw m1, m0, m3
punpcklbw m2, m4
pmaddubsw m1, m5
pmaddubsw m2, m5
movshdup m3, [base+z2_dy_offset]
%if ARCH_X86_64
mova m12, [base+z2_upsample]
add dyd, dyd ; doubles dy and the step constant packed in the upper word
%else
; x86-32: dy and m12 live in stack slots.
mova m4, [base+z2_upsample]
shl dword [rsp+16*7+4*1], 1
mova m12, m4
%endif
paddw m1, m2
pmulhrsw m1, m10
movq [rsp+16*15], m3
packuswb m1, m1
punpcklbw m0, m1 ; interleave: original, interpolated, original, ...
mova [rsp+16*5], m0
;-----------------------------------------------------------------------
; .w4_main - main prediction loop, shared by every width.
; The block is produced in 4-column strips (outer loop .w4_loop0), each
; strip 4 rows at a time (inner loop .w4_loop / .w4_leftonly_loop).
;
; Scratch slots used here (offsets from rsp):
;   8*0         : ypos words for the 4 columns of the current strip
;   8*1         : ypos increment applied per strip ("dy*4")
;   8*2         : current base_y as stack byte offsets (low byte of each word)
;   8*3         : (64-frac_y, frac_y) byte pairs for the current strip
;   16*6+8*1    : base_y step applied after every 4 rows
;   16*12       : xpos vector at the start of the current strip
;   16*15+8*1   : xpos step between strips
; Registers: r2d/r3d = scalar xpos / base scratch, r5 = dst of the current
; strip, m6 = xpos rows 0-1, m5 = xpos rows 2-3, m7 = 2*dx.
;-----------------------------------------------------------------------
.w4_main:
movd m6, dxd
%if ARCH_X86_64
movd m3, dyd
%else
movd m3, [rsp+16*7+4*1]
%endif
movddup m0, [rsp+16*14+8*0]
pshufb m6, [base+pw_256]
paddw m7, m6, m6 ; 2*dx
movq m5, [base+pw_m1to4]
pshuflw m4, m3, q0000 ; dy in all 4 words
punpcklqdq m6, m7 ; low qword = dx (row 0), high qword = 2*dx (row 1)
pmullw m4, m5 ; per-column multiples of dy
pshuflw m3, m3, q1111 ; upper word of dyd: the per-4-row base_y step
paddw m6, m0 ; add per-column x offsets
mov r2d, r10d
pshuflw m0, m4, q3333
psubw m4, [rsp+16*15]
movq [rsp+16*6+8*1], m3
movq [rsp+8*1], m0 ; dy*4
mov r5, dstq
.w4_loop0:
; Per-strip setup: split ypos into fraction weights and base offsets.
mova [rsp+16*12], m6
movq [rsp+8*0], m4
pand m0, m4, m8
psraw m4, 6
psubw m1, m9, m0
psllw m0, 8
por m0, m1 ; 64-frac_y, frac_y
movq [rsp+8*3], m0
pabsw m4, m4 ; magnitude is used as a byte offset into the stack copy
movq [rsp+8*2], m4
movzx hd, r9b ; reload row count for this strip
.w4_loop:
; Gather top-edge pixels for 4 rows; each row's xpos is one dx further.
lea r3d, [r2+dxq]
shr r2d, 6 ; base_x0
movq m0, [rsp+r2]
lea r2d, [r3+dxq]
shr r3d, 6 ; base_x1
movhps m0, [rsp+r3]
lea r3d, [r2+dxq]
shr r2d, 6 ; base_x2
movq m1, [rsp+r2]
lea r2d, [r3+dxq]
shr r3d, 6 ; base_x3
movhps m1, [rsp+r3]
; Build (64-frac_x, frac_x) weights and interpolate rows 0-1 (m0), 2-3 (m1).
pand m2, m8, m6
paddsw m5, m6, m7 ; xpos for rows 2-3
psubw m3, m9, m2
psllw m2, 8
pshufb m0, m11
por m2, m3
pmaddubsw m0, m2
pand m2, m8, m5
psubw m3, m9, m2
psllw m2, 8
pshufb m1, m11
por m2, m3
pmaddubsw m1, m2
; r3d is base_x of the last (leftmost-reaching) row: if even that one is
; at or right of the top-left slot, no left-edge pixels are needed.
cmp r3d, 127 ; topleft
jge .w4_toponly
; Gather left-edge pixels for the 4 columns and interpolate vertically.
movzx r3d, byte [rsp+8*2+0] ; base_y0
movq m3, [rsp+r3]
movzx r3d, byte [rsp+8*2+2] ; base_y1
movhps m3, [rsp+r3]
movzx r3d, byte [rsp+8*2+4] ; base_y2
movq m4, [rsp+r3]
movzx r3d, byte [rsp+8*2+6] ; base_y3
movhps m4, [rsp+r3]
pshufb m3, m12
pshufb m4, m12
punpckldq m2, m3, m4
punpckhdq m3, m4
movddup m4, [rsp+8*3]
pmaddubsw m2, m4
pmaddubsw m3, m4
; Per-pixel select: negative xpos means the sample comes from the left edge.
psraw m6, 15 ; base_x < topleft
pand m2, m6
pandn m6, m0
por m0, m2, m6
psraw m6, m5, 15
pand m3, m6
pandn m6, m1
por m1, m3, m6
.w4_toponly:
; Round, pack and store 4 rows of 4 pixels.
pmulhrsw m0, m10
pmulhrsw m1, m10
movifnidn strideq, stridemp
packuswb m0, m1
movd [dstq+strideq*0], m0
pshuflw m1, m0, q1032
movd [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
punpckhqdq m0, m0
movd [dstq+strideq*0], m0
psrlq m0, 32
movd [dstq+strideq*1], m0
sub hd, 4
jz .w4_end
; Advance state by 4 rows.
movq m4, [rsp+8*2]
movq m3, [rsp+16*6+8*1]
paddw m6, m5, m7 ; xpos += dx
psubw m4, m3 ; base_y moves 4 rows down the left edge
movq [rsp+8*2], m4
lea dstq, [dstq+strideq*2]
; While xpos is still at or above the r11d threshold, keep using the mixed loop.
cmp r2d, r11d
jge .w4_loop
movddup m5, [rsp+8*3]
.w4_leftonly_loop:
; Remaining rows of this strip only need left-edge samples.
movzx r2d, byte [rsp+8*2+0] ; base_y0
movq m1, [rsp+r2]
movzx r2d, byte [rsp+8*2+2] ; base_y1
movhps m1, [rsp+r2]
movzx r2d, byte [rsp+8*2+4] ; base_y2
movq m2, [rsp+r2]
movzx r2d, byte [rsp+8*2+6] ; base_y3
movhps m2, [rsp+r2]
psubw m4, m3
pshufb m1, m12
pshufb m2, m12
movq [rsp+8*2], m4
punpckldq m0, m1, m2
punpckhdq m1, m2
pmaddubsw m0, m5
pmaddubsw m1, m5
pmulhrsw m0, m10
pmulhrsw m1, m10
packuswb m0, m1
movd [dstq+strideq*0], m0
pshuflw m1, m0, q1032
movd [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
punpckhqdq m0, m0
movd [dstq+strideq*0], m0
psrlq m0, 32
movd [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
sub hd, 4
jg .w4_leftonly_loop
.w4_end:
; Strip finished: bits 8+ of r9d count the strips still to do.
sub r9d, 1<<8
jl .w4_ret
movq m4, [rsp+8*1]
add r5, 4 ; next strip starts 4 pixels to the right
mov dstq, r5
paddw m4, [rsp+8*0] ; base_y += 4*dy
movzx r2d, word [rsp+16*15+8*1]
movddup m6, [rsp+16*15+8*1]
paddw m6, [rsp+16*12] ; base_x += (4 << upsample_above)
add r2d, r10d
mov r10d, r2d
jmp .w4_loop0
.w4_ret:
RET
; ---- w == 8: decide between upsampling and filtering the top edge ----
; Expects m7 == 0 from the prologue (used as a byte-broadcast mask).
.w8:
test angled, 0x400
jnz .w4_main ; edge filtering disabled: predict from the raw edges
; Pad the top edge: replicate tl[8] into the 4 bytes following top[0..7].
movd m5, [tlq+8]
lea r3d, [angleq+126]
pshufb m5, m7
; Replace the low byte of r3d with h so that a single unsigned compare
; against 8 checks the angle/is_sm bits and the height together.
%if ARCH_X86_64
mov r3b, hb
%else
xor r3b, r3b
or r3d, hd
%endif
movd [rsp+16*8+8], m5
cmp r3d, 8
ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
call .upsample_above
; Top edge was upsampled; now build the left-edge filter mask in m6.
sub angled, 53
lea r3d, [hq+7]
xor angled, 0x7f ; 180 - angle
movu m1, [base+z_filter_wh8]
movd m0, r3d
movd m6, angled
shr angled, 8 ; is_sm << 1
; The thresholds for this width sit 4 bytes into the shared w4/w8 table row.
psrldq m2, [base+z_filter_t_w48+angleq*8], 4
pshufb m0, m7
pshufb m6, m7
pcmpeqb m0, m1
pand m6, m0
pcmpgtb m6, m2
; The upsampled top edge has double resolution, so the second 4-column
; strip starts 8 edge positions further on: strip x step becomes 8<<6.
%if ARCH_X86_64
movq [rsp+16*15+8*1], m10 ; 8<<6
%else
movq m0, m10
movq [rsp+16*15+8*1], m0
%endif
jmp .w8_filter_left
; ---- w == 8, top edge not upsampled: filter top, then decide on left ----
; Sets up the arguments of .w8_filter_top (m0 = h+7, m1 = angle-90,
; r3d = 180-angle, m3/m4 = size and threshold tables) and afterwards
; either jumps to .upsample_left or falls through into .w8_filter_left.
.w8_no_upsample_above:
lea r3d, [hq+7]
mov [rsp], angled ; angled is modified below; keep a copy for the left-edge test
sub angled, 90
movd m0, r3d
mov r3d, 90
movd m1, angled
sub r3d, angled ; 180 - angle
shr angled, 8 ; is_sm << 1
movu m3, [base+z_filter_wh8]
; The thresholds for this width sit 4 bytes into the shared w4/w8 table row.
psrldq m4, [base+z_filter_t_w48+angleq*8], 4
call .w8_filter_top
mov r3d, [rsp]
sub r3d, 141
; Replace the low byte with h so one unsigned compare checks everything.
%if ARCH_X86_64
mov r3b, hb
%else
xor r3b, r3b
or r3d, hd
%endif
cmp r3d, 8
jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm
; ---- w4/w8: turn the left-edge filter mask (m6) into a strength ----
; In:  m6 = byte mask produced by the threshold compares.
; Out: jumps to .w4_main if the mask is empty, otherwise to .filter_left
;      with r5 = filter_strength-3 and r3 = original tl pointer.
.w8_filter_left:
pmovmskb r5d, m6
test r5d, r5d
jz .w4_main ; filter_strength == 0
; Multiply-and-shift folds the bit mask into a small integer in the top 2 bits.
imul r5d, 0x55555555
mov r3, tlq ; .filter_left repurposes tlq, so keep the source pointer in r3
shr r5d, 30
sub r5, 3 ; filter_strength-3
jmp .filter_left
;-----------------------------------------------------------------------
; .w8_filter_top - compute top/left filter masks and filter the first 8
; top-edge pixels in place (w4/w8). Call/ret subroutine, hence +gprsize.
; In:    m0  = block size code in byte 0 (callers pass h+3 or h+7)
;        m1  = top angle delta in byte 0, r3d = left angle delta
;        m3  = size table, m4 = threshold row
;        m7  = 0 (byte-broadcast mask)
; Out:   m6  = left-edge filter mask (consumed by .w8_filter_left)
;        [rsp+gprsize+16*8, +8) = filtered top pixels; pixels at index
;        >= maxw are restored from the unfiltered source when maxw < 8.
; Clobb: m0-m2, m7 (overwritten with coefficients when filtering), r3d, r5,
;        flags.
;-----------------------------------------------------------------------
.w8_filter_top:
movd m6, r3d
REPX {pshufb x, m7}, m0, m1, m6
pcmpeqb m0, m3
pand m1, m0
pand m6, m0
pcmpgtb m1, m4
pcmpgtb m6, m4
pmovmskb r5d, m1
test r5d, r5d
jz .w8_filter_top_end ; filter_strength == 0
imul r5d, 0x55555555
movq m0, [rsp+gprsize+16*8-2]
shr r5d, 30
movq m1, [rsp+gprsize+16*8-1]
sub r5, 3 ; filter_strength-3
; Three coefficient pairs cover neighbours at -2/-1, 0/+1 and +2.
movddup m7, [base+z_filter_k+8*2+r5*8+24*0]
punpcklbw m0, m1
pmaddubsw m0, m7
movq m1, [rsp+gprsize+16*8+0]
movq m2, [rsp+gprsize+16*8+1]
movddup m7, [base+z_filter_k+8*2+r5*8+24*1]
punpcklbw m1, m2
pmaddubsw m1, m7
movq m2, [rsp+gprsize+16*8+2]
movddup m7, [base+z_filter_k+8*2+r5*8+24*2]
punpcklbw m2, m2
pmaddubsw m2, m7
paddw m0, m1
paddw m0, m2
%if ARCH_X86_64
mov r3d, r7m ; maxw, offset due to call
%else
mov r3d, [rsp+gprsize+16*18+4*3]
%endif
pmulhrsw m0, m10
; NOTE(review): only the low 8 bytes of the pack below are stored, so the
; m1 half of the result appears to be discarded.
pmulhrsw m1, m10
packuswb m0, m1
movq [rsp+gprsize+16*8], m0
cmp r3d, 8
jge .w8_filter_top_end
; maxw < 8: pixels from index maxw onwards keep their unfiltered values.
movq m0, [tlq+r3+1]
movq [rsp+gprsize+r3+16*8], m0
.w8_filter_top_end:
ret
; ---- w == 16: filter the 16 top pixels inline, then the left edge ----
; Expects m5 = tl[1..16] and m7 == 0 from the prologue.
; Leaves r3 = original tl pointer for .filter_left.
.w16:
test angled, 0x400
jnz .w4_main ; edge filtering disabled: predict from the raw edges
lea r3d, [hq+15]
sub angled, 90
movd m0, r3d
mov r3d, 90
movd m1, angled
sub r3d, angled ; 180 - angle
shr angled, 8 ; is_sm << 1
movd m6, r3d
REPX {pshufb x, m7}, m0, m1, m6
; m1 becomes the top-edge mask, m6 the left-edge mask.
movq m3, [base+z_filter_t_w16+angleq*4]
pcmpeqb m0, [base+z_filter_wh16]
pand m1, m0
pand m6, m0
pcmpgtb m1, m3
pcmpgtb m6, m3
pmovmskb r5d, m1
mov r3, tlq
test r5d, r5d
jz .w16_filter_left ; filter_strength == 0
imul r5d, 0x24924924
pshufb m5, [base+z_filter_t_w16] ; tlq[16]
shr r5d, 30
; adc also consumes the carry flag left by the shr above.
adc r5, -4 ; filter_strength-3
; Pad the slot after top[15] so the +1/+2 taps have data to read.
movd [rsp+16*9], m5
movddup m7, [base+z_filter_k+8*2+r5*8+24*0]
movu m1, [rsp+16*8-2]
movu m2, [rsp+16*8-1]
punpcklbw m0, m1, m2
pmaddubsw m0, m7
punpckhbw m1, m2
pmaddubsw m1, m7
movddup m7, [base+z_filter_k+8*2+r5*8+24*1]
mova m3, [rsp+16*8+0]
movu m4, [rsp+16*8+1]
punpcklbw m2, m3, m4
pmaddubsw m2, m7
punpckhbw m3, m4
pmaddubsw m3, m7
paddw m0, m2
paddw m1, m3
; Only r5 == 0 adds the +2 neighbour term.
test r5d, r5d
jnz .w16_filter_end ; 3-tap
movddup m7, [base+z_filter_k+8*8]
movu m3, [rsp+16*8+2]
punpcklbw m2, m3, m3
pmaddubsw m2, m7
punpckhbw m3, m3
pmaddubsw m3, m7
paddw m0, m2
paddw m1, m3
.w16_filter_end:
mov r2d, maxwm
pmulhrsw m0, m10
pmulhrsw m1, m10
packuswb m0, m1
mova [rsp+16*8], m0
cmp r2d, 16
jge .w16_filter_left
; maxw < 16: pixels from index maxw onwards keep their unfiltered values.
movu m0, [r3+r2+1]
movu [rsp+r2+16*8], m0
.w16_filter_left:
pmovmskb r5d, m6
test r5d, r5d
jz .w4_main ; left filter_strength == 0
imul r5d, 0x24924924
shr r5d, 30
adc r5, -4 ; filter_strength-3
jmp .filter_left
; ---- w == 32: filter the 32 top pixels via the shared .filter_edge ----
; Expects m6 = tl[17..32] from the prologue.
; Leaves r3 = original tl pointer for .filter_left.
.w32:
test angled, 0x400
jnz .w4_main ; edge filtering disabled: predict from the raw edges
pshufb m6, [base+z_filter_t_w16] ; tlq[32]
mov r3, tlq
; .filter_edge filters [tlq-16, tlq+16) and writes to [tlq, tlq+32), so
; point tlq at the middle of the 32 top pixels stored at rsp+16*8.
lea tlq, [rsp+16*9]
movd [tlq+16*1], m6 ; padding after top[31] for the +1/+2 taps
xor r5d, r5d ; filter_strength = 3
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
; Move the (16-byte shifted) output back to where the top edge lives.
mova m0, [tlq+16*0]
mova m1, [tlq+16*1]
mov r2d, maxwm
mova [rsp+16*8], m0
mova [rsp+16*9], m1
cmp r2d, 32
jge .filter_left
; maxw < 32: pixels from index maxw onwards keep their unfiltered values.
movu m0, [r3+r2+16*0+1]
movu m1, [r3+r2+16*1+1]
movu [rsp+r2+16*8], m0
movu [rsp+r2+16*9], m1
jmp .filter_left
; ---- w == 64: load the second half of the top edge, then filter it ----
; The prologue only copied tl[1..32]; tl[33..64] are added here.
; Falls through into .filter_left with r3 = original tl pointer.
.w64:
movu m0, [tlq+16*2+1]
movu m1, [tlq+16*3+1]
mova [rsp+16*10], m0
mova [rsp+16*11], m1
test angled, 0x400
jnz .w4_main ; edge filtering disabled: predict from the raw edges
pshufb m1, [base+z_filter_t_w16] ; tlq[64]
mov r3, tlq
; Two .filter_edge passes of 32 pixels each, upper half first; each pass
; writes its output 16 bytes above the inputs it read.
lea tlq, [rsp+16*11]
movd [tlq+16*1], m1 ; padding after top[63] for the +1/+2 taps
xor r5d, r5d ; filter_strength = 3
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
sub tlq, 16*2
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
; Move the (16-byte shifted) output back to where the top edge lives.
mova m0, [tlq+16*0]
mova m1, [tlq+16*1]
mova m2, [tlq+16*2]
mova m3, [tlq+16*3]
mov r2d, maxwm
mova [rsp+16* 8], m0
mova [rsp+16* 9], m1
mova [rsp+16*10], m2
mova [rsp+16*11], m3
cmp r2d, 64
jge .filter_left
; maxw < 64: pixels from index maxw onwards keep their unfiltered values;
; 32 bytes are restored, and another 32 when maxw < 32.
movu m0, [r3+r2+16*0+1]
movu m1, [r3+r2+16*1+1]
movu [rsp+r2+16* 8], m0
movu [rsp+r2+16* 9], m1
cmp r2d, 32
jge .filter_left
movu m0, [r3+r2+16*2+1]
movu m1, [r3+r2+16*3+1]
movu [rsp+r2+16*10], m0
movu [rsp+r2+16*11], m1
;-----------------------------------------------------------------------
; .filter_left - filter the left edge copy on the stack, then predict.
; In:    r3 = original tl pointer, r5 = filter_strength-3, hq = h.
; The left edge occupies rsp+16*2..16*5 (tl[-64..-1]) with the top-left
; pixel at rsp+16*6. .filter_edge writes its output 16 bytes above the
; inputs, so results are moved back down afterwards.
; hq is left negated; .w4_loop0 reloads h from r9b.
; Exits by jumping to .w4_main.
;-----------------------------------------------------------------------
.filter_left:
neg hq
; Pad below the left edge: replicate tl[-h] into the 4 bytes beneath it.
; A fresh zero register is used for the broadcast because m7 may have
; been overwritten by the top-edge filtering.
movd m0, [r3+hq]
pxor m1, m1
pshufb m0, m1
movd [rsp+16*6+hq-4], m0
; First pass covers the 32 left pixels nearest the top-left corner.
lea tlq, [rsp+16*5]
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
cmp hd, -32
jge .filter_left_end ; h <= 32: one pass is enough
; Second pass covers the far 32 pixels; move its output back into place.
sub tlq, 16*2
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
mova m0, [tlq+16*0]
mova m1, [tlq+16*1]
mova [rsp+16*2], m0
mova [rsp+16*3], m1
.filter_left_end:
mov r2d, maxhm
; Undo the 16-byte output shift of the first pass (and carry the
; top-left slot down with it).
mova m0, [rsp+16*5]
mova m1, [rsp+16*6]
mova m2, [rsp+16*7]
neg r2
mova [rsp+16*4], m0
mova [rsp+16*5], m1
mova [rsp+16*6], m2
; hd is -h and r2 is -maxh here: nothing to restore when maxh >= h.
cmp r2d, hd
jle .w4_main
; maxh < h: left pixels beyond maxh keep their unfiltered values;
; 32 bytes are restored, and another 32 when maxh < 32.
movu m0, [r3+r2-16*2]
movu m1, [r3+r2-16*1]
movu [rsp+r2+16*4], m0
movu [rsp+r2+16*5], m1
cmp r2d, -32
jle .w4_main
movu m0, [r3+r2-16*4]
movu m1, [r3+r2-16*3]
movu [rsp+r2+16*2], m0
movu [rsp+r2+16*3], m1
jmp .w4_main
%if ARCH_X86_64
cglobal ipred_z3_8bpc, 4, 9, 11, 16*10, dst, stride, tl, w, h, angle, dy, _, org_w
%define base r7-$$
lea r7, [$$]
mova m8, [base+pw_62]
mova m9, [base+pw_64]
mova m10, [base+pw_512]
mov org_wd, wd
%else
cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, tl, w, h, angle, dy
%define base r1-$$
%define m8 [base+pw_62]
%define m9 [base+pw_64]
%define m10 [base+pw_512]
%define org_wd r5
%define org_wq r5
mov [dstq+strideq*0], strideq
mov [dstq+strideq*1], wd
LEA r1, $$
%endif
tzcnt hd, hm
movifnidn angled, anglem
dec tlq
movsxd hq, [base+ipred_z3_ssse3_table+hq*4]
sub angled, 180
mov dyd, angled
neg dyd
xor angled, 0x400
or dyq, ~0x7e
lea hq, [base+ipred_z3_ssse3_table+hq]
movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq]
jmp hq
.h4:
lea r4d, [angleq+88]
test r4d, 0x480
jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40
sar r4d, 9
add r4d, wd
cmp r4d, 8
jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm)
movu m3, [tlq-7]
movu m1, [base+z_upsample1-4]
movu m4, [base+z_filter_s+2]
pshufb m0, m3, m1
pxor m1, m1
pshufb m2, m3, m1
pshufb m1, m3, m4
mova [rsp+16], m2 ; top[max_base_y]
movddup m2, [base+pb_36_m4]
add dyd, dyd
pmaddubsw m0, m2
pmaddubsw m1, m2
movd m5, dyd
mov r5d, dyd
pshufb m5, [base+pw_256]
paddw m0, m1
pmulhrsw m0, m10
shl wd, 2
mov tlq, rsp
sub rsp, wq
packuswb m0, m0
punpcklbw m0, m3
paddw m6, m5, m5
punpcklqdq m5, m6
pshufb m0, [base+pb_15to0]
mova [tlq], m0
.h4_upsample_loop:
lea r4d, [r5+dyq]
shr r5d, 6
movq m0, [tlq+r5]
lea r5d, [r4+dyq]
shr r4d, 6
movhps m0, [tlq+r4]
pand m2, m8, m5
psubw m1, m9, m2
psllw m2, 8
por m1, m2
pmaddubsw m0, m1
paddw m5, m6
pmulhrsw m0, m10
packuswb m0, m0
movq [rsp+wq-8], m0
sub wd, 8
jg .h4_upsample_loop
jmp .h4_transpose
.h4_no_upsample:
mov r4d, 7
test angled, 0x400 ; !enable_intra_edge_filter
jnz .h4_main
lea r4d, [wq+3]
movd m0, r4d
movd m2, angled
shr angled, 8 ; is_sm << 1
pxor m1, m1
pshufb m0, m1
pshufb m2, m1
pcmpeqb m1, m0, [base+z_filter_wh4]
pand m1, m2
pcmpgtb m1, [base+z_filter_t_w48+angleq*8]
pmovmskb r5d, m1
mov r4d, 7
test r5d, r5d
jz .h4_main ; filter_strength == 0
movu m2, [tlq-7]
imul r5d, 0x55555555
movu m3, [base+z_filter_s-2]
shr r5d, 30 ; filter_strength
mova m4, [base+z_upsample2]
movddup m5, [base+z_filter_k-8+r5*8+24*0]
movddup m6, [base+z_filter_k-8+r5*8+24*1]
movddup m7, [base+z_filter_k-8+r5*8+24*2]
pshufb m0, m2, m3
shufps m3, m4, q2121
pmaddubsw m1, m0, m5
pmaddubsw m0, m6
pshufb m5, m2, m3
pmaddubsw m3, m5, m6
pmaddubsw m5, m7
pshufb m2, m4
pmaddubsw m2, m7
paddw m0, m1
paddw m1, m3
paddw m0, m5
paddw m1, m2
pmulhrsw m0, m10
pmulhrsw m1, m10
lea r2d, [r4+2]
cmp wd, 4
cmovne r4d, r2d
pshufd m0, m0, q0000
lea tlq, [rsp+15]
packuswb m0, m1
mova [rsp], m0
.h4_main:
movd m5, dyd
movddup m0, [base+z_base_inc] ; base_inc << 6
sub tlq, r4
shl r4d, 6
movd m7, [tlq]
movd m4, r4d
pshufb m5, [base+pw_256]
neg dyq
pshufb m7, [base+pw_m256]
mova m3, [base+z3_shuf_h4]
lea r5, [dyq+r4+63] ; ypos
pshufb m4, [base+pw_256]
psubw m4, m0 ; max_base_y
shl wd, 2
paddw m6, m5, m5
sub rsp, wq
punpcklqdq m5, m6
.h4_loop:
lea r4, [r5+dyq]
sar r5, 6
movq m0, [tlq+r5-4]
lea r5, [r4+dyq]
sar r4, 6
movhps m0, [tlq+r4-4]
pand m2, m8, m5
psubw m1, m9, m2
psllw m2, 8
pshufb m0, m3
por m1, m2
pmaddubsw m0, m1
pcmpgtw m1, m4, m5
paddw m5, m6
pmulhrsw m0, m10
pand m0, m1
pandn m1, m7
por m0, m1
packuswb m0, m0
movq [rsp+wq-8], m0
sub wd, 8
jz .h4_transpose
test r5d, r5d
jg .h4_loop
packuswb m7, m7
.h4_end_loop:
movq [rsp+wq-8], m7
sub wd, 8
jg .h4_end_loop
.h4_transpose:
mova m1, [base+z_transpose4]
%if ARCH_X86_32
mov strideq, [dstq]
mov org_wd, [dstq+strideq]
%endif
lea r2, [strideq*3]
lea dstq, [dstq+org_wq-4]
.h4_transpose_loop:
mova m0, [rsp]
add rsp, 16
pshufb m0, m1
movd [dstq+strideq*0], m0
pshuflw m2, m0, q1032
movd [dstq+strideq*1], m2
punpckhqdq m0, m0
movd [dstq+strideq*2], m0
psrlq m0, 32
movd [dstq+r2 ], m0
sub dstq, 4
sub org_wd, 4
jg .h4_transpose_loop
RET
.h8:
lea r4d, [angleq+88]
and r4d, ~0x7f
or r4d, wd
cmp r4d, 8
ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
mova m4, [tlq-15]
and r4d, 4
movu m3, [tlq- 9]
movd m1, r4d
movu m2, [base+z_filter_s+2]
pxor m0, m0
movu m5, [base+z_filter_s+6]
movddup m7, [base+pb_36_m4]
pshufb m1, m0 ; w & 4
movu m0, [base+z_upsample1-4]
pmaxub m1, m0 ; clip 4x8
add dyd, dyd
pshufb m0, m4, m1
pmaddubsw m0, m7
pshufb m1, m4, m2
pmaddubsw m1, m7
pshufb m2, m3, [base+z_upsample1]
pmaddubsw m2, m7
pshufb m3, m5
pmaddubsw m3, m7
movd m5, dyd
neg dyq
paddw m1, m0
paddw m2, m3
pmulhrsw m1, m10
pmulhrsw m2, m10
shl wd, 3
lea tlq, [rsp+16]
pshufb m5, [base+pw_256]
sub rsp, wq
packuswb m1, m2
lea r5, [dyq+63]
punpcklbw m0, m1, m4
punpckhbw m1, m4
mova [tlq-16*1], m0
mova [tlq-16*0], m1
paddw m6, m5, m5
punpcklqdq m5, m6
.h8_upsample_loop:
lea r4, [r5+dyq]
sar r5, 6
movu m0, [tlq+r5]
lea r5, [r4+dyq]
sar r4, 6
movu m1, [tlq+r4]
pand m3, m8, m5
psubw m2, m9, m3
psllw m2, 8
por m3, m2
pshufd m2, m3, q1010
pmaddubsw m0, m2
punpckhqdq m3, m3
pmaddubsw m1, m3
paddw m5, m6
pmulhrsw m0, m10
pmulhrsw m1, m10
packuswb m1, m0
mova [rsp+wq-16], m1
sub wd, 16
jg .h8_upsample_loop
jmp .h8_transpose
.h8_no_upsample:
lea r4d, [wq+7]
movd m0, r4d
and r4d, 7
or r4d, 8 ; imin(w+7, 15)
test angled, 0x400
jnz .h8_main
movd m2, angled
shr angled, 8 ; is_sm << 1
pxor m1, m1
pshufb m0, m1
pshufb m2, m1
movu m1, [base+z_filter_wh8]
psrldq m3, [base+z_filter_t_w48+angleq*8], 4
pcmpeqb m1, m0
pand m1, m2
pcmpgtb m1, m3
pmovmskb r5d, m1
test r5d, r5d
jz .h8_main ; filter_strength == 0
mova m0, [tlq-15]
imul r5d, 0x55555555
movd m1, [tlq+1]
neg r4
movd m2, [tlq+r4]
shr r5d, 30
pxor m7, m7
lea tlq, [rsp+16*2]
sub r5, 3 ; filter_strength-3
mova [tlq+16*0], m0
pshufb m1, m7
mova [tlq+16*1], m1
pshufb m2, m7
movq [tlq+r4+8], m2
neg r4d
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
sar r5d, 1
add tlq, 31
add r5d, 17
cmp wd, 8
cmova r4d, r5d
.h8_main:
movd m5, dyd
sub tlq, r4
shl r4d, 6
movd m7, [tlq]
movd m4, r4d
pshufb m5, [base+pw_256]
neg dyq
pshufb m7, [base+pw_m256]
mova m3, [base+z3_shuf]
lea r5, [dyq+r4+63]
pshufb m4, [base+pw_256]
psubw m4, [base+z3_base_inc]
shl wd, 3
mova m6, m5
sub rsp, wq
.h8_loop:
mov r4, r5
sar r4, 6
movu m0, [tlq+r4-8]
pand m2, m8, m5
psubw m1, m9, m2
psllw m2, 8
pshufb m0, m3
por m1, m2
pmaddubsw m0, m1
pcmpgtw m1, m4, m5
paddw m5, m6
pmulhrsw m0, m10
pand m0, m1
pandn m1, m7
por m0, m1
packuswb m0, m0
movq [rsp+wq-8], m0
sub wd, 8
jz .h8_transpose
add r5, dyq
jg .h8_loop
packuswb m7, m7
.h8_end_loop:
movq [rsp+wq-8], m7
sub wd, 8
jg .h8_end_loop
.h8_transpose:
%if ARCH_X86_32
mov strideq, [dstq]
mov org_wd, [dstq+strideq]
%endif
or r3d, 8
cmp org_wd, 4
%if ARCH_X86_64
jne .end_transpose_main
%else
jne .end_transpose_loop
%endif
mova m1, [rsp+16*1]
mova m0, [rsp+16*0]
lea r2, [strideq*3]
add rsp, 16*2
punpcklbw m2, m1, m0
punpckhbw m1, m0
punpckhbw m0, m1, m2
punpcklbw m1, m2
.write_4x8_end:
call .write_4x8
RET
;-----------------------------------------------------------------------
; .write_4x8 - store a 4-pixel-wide, 8-row column block. Call/ret helper.
; In:    m0 = rows 0-3 and m1 = rows 4-7, 4 bytes per row, packed in
;             reverse row order (dword 0 is the last row of each group)
;        dstq = address of row 0, strideq = row pitch
;        r2   = strideq*3 (set by the visible callers before the call)
; Out:   dstq advanced by 4 rows.
; Clobb: m0, m1, m4.
;-----------------------------------------------------------------------
.write_4x8:
; Rows 3, 2, 1, 0 from dwords 0, 1, 2, 3 of m0.
movd [dstq+r2 ], m0
pshuflw m4, m0, q1032
movd [dstq+strideq*2], m4
punpckhqdq m0, m0
movd [dstq+strideq*1], m0
psrlq m0, 32
movd [dstq+strideq*0], m0
lea dstq, [dstq+strideq*4]
; Rows 7, 6, 5, 4 from dwords 0, 1, 2, 3 of m1.
movd [dstq+r2 ], m1
pshuflw m4, m1, q1032
movd [dstq+strideq*2], m4
punpckhqdq m1, m1
movd [dstq+strideq*1], m1
psrlq m1, 32
movd [dstq+strideq*0], m1
ret
.h16:
lea r4d, [wq+15]
movd m0, r4d
and r4d, 15
or r4d, 16 ; imin(w+15, 31)
test angled, 0x400
jnz .h16_main
movd m2, angled
shr angled, 8 ; is_sm << 1
pxor m1, m1
pshufb m0, m1
pshufb m2, m1
movq m3, [base+z_filter_t_w16+angleq*4]
pcmpeqb m1, m0, [base+z_filter_wh16]
pand m1, m2
pcmpgtb m1, m3
pmovmskb r5d, m1
test r5d, r5d
jz .h16_main ; filter_strength == 0
mova m0, [tlq-16*2+1]
imul r5d, 0x24924924
mova m1, [tlq-16*1+1]
neg r4
movd m2, [tlq-16*0+1]
shr r5d, 30
movd m3, [tlq+r4]
adc r5, -4 ; filter_strength-3
pxor m7, m7
lea tlq, [rsp+16*2]
mova [tlq-16*1], m0
pshufb m2, m7
mova [tlq+16*0], m1
pshufb m3, m7
mova [tlq+16*1], m2
movq [tlq+r4+8], m3
neg r4d
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
add tlq, 31
cmp wd, 16
jle .h16_main
pshuflw m0, [tlq-47], q0000
sar r5, 1
movq m1, [base+z3_filter_k_tail+r5*4]
lea r4d, [r5+33]
pmaddubsw m0, m1
%if ARCH_X86_64
pmulhrsw m0, m10
%else
pmulhrsw m0, m4
%endif
packuswb m0, m0
movd [tlq-35], m0
.h16_main:
movd m5, dyd
sub tlq, r4
movd m4, r4d
shl r4d, 6
movd m7, [tlq]
pxor m6, m6
pshufb m5, [base+pw_256]
neg dyq
pshufb m7, m6
mova m3, [base+z3_shuf]
lea r5, [dyq+r4+63]
pshufb m4, m6
psubb m4, [base+pb_15to0]
shl wd, 4
mova m6, m5
sub rsp, wq
.h16_loop:
mov r4, r5
pand m2, m8, m5
sar r4, 6
psubw m1, m9, m2
psllw m2, 8
movu m0, [tlq+r4-8*2]
por m2, m1
movu m1, [tlq+r4-8*1]
pshufb m0, m3
pmaddubsw m0, m2
pshufb m1, m3
pmaddubsw m1, m2
psrlw m2, m5, 6
paddw m5, m6
pmulhrsw m0, m10
pmulhrsw m1, m10
packsswb m2, m2
packuswb m0, m1
pcmpgtb m1, m4, m2
pand m0, m1
pandn m1, m7
por m0, m1
mova [rsp+wq-16], m0
sub wd, 16
jz .h16_transpose
add r5, dyq
jg .h16_loop
.h16_end_loop:
mova [rsp+wq-16], m7
sub wd, 16
jg .h16_end_loop
.h16_transpose:
%if ARCH_X86_32
mov strideq, [dstq]
mov org_wd, [dstq+strideq]
%endif
or r3d, 16
cmp org_wd, 4
%if ARCH_X86_64
jne .end_transpose_main
%else
jne .end_transpose_loop
%endif
.h16_transpose_w4:
mova m2, [rsp+16*3]
mova m4, [rsp+16*2]
mova m3, [rsp+16*1]
mova m0, [rsp+16*0]
lea r2, [strideq*3]
add rsp, 16*4
punpckhbw m1, m2, m4
punpcklbw m2, m4
punpckhbw m4, m3, m0
punpcklbw m3, m0
punpckhwd m0, m1, m4
punpcklwd m1, m4
call .write_4x8
lea dstq, [dstq+strideq*4]
punpckhwd m0, m2, m3
punpcklwd m1, m2, m3
jmp .write_4x8_end
.h32:
lea r4d, [wq+31]
and r4d, 31
or r4d, 32 ; imin(w+31, 63)
test angled, 0x400 ; !enable_intra_edge_filter
jnz .h32_main
mova m0, [tlq-16*4+1]
mova m1, [tlq-16*3+1]
mova m2, [tlq-16*2+1]
mova m3, [tlq-16*1+1]
movd m4, [tlq-16*0+1]
neg r4
movd m5, [tlq+r4]
pxor m7, m7
lea tlq, [rsp+16*4]
mova [tlq-16*3], m0
mova [tlq-16*2], m1
xor r5d, r5d ; filter_strength = 3
mova [tlq-16*1], m2
pshufb m4, m7
mova [tlq+16*0], m3
pshufb m5, m7
mova [tlq+16*1], m4
movq [tlq+r4+8], m5
neg r4d
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
sub tlq, 16*2
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
add tlq, 63
cmp wd, 32
jle .h32_main
pshuflw m0, [tlq-79], q0000
movq m1, [base+z3_filter_k_tail]
add r4d, 2
pmaddubsw m0, m1
%if ARCH_X86_64
pmulhrsw m0, m10
%else
pmulhrsw m0, m4
%endif
packuswb m0, m0
movd [tlq-67], m0
.h32_main:
movd m5, dyd
sub tlq, r4
movd m4, r4d
shl r4d, 6
movd m7, [tlq]
pxor m6, m6
pshufb m5, [base+pw_256]
neg dyq
pshufb m7, m6
mova m3, [base+z3_shuf]
lea r5, [dyq+r4+63]
pshufb m4, m6
psubb m4, [base+pb_15to0]
mova m6, m5
.h32_loop:
mov r4, r5
pand m2, m8, m5
sar r4, 6
psubw m1, m9, m2
psllw m2, 8
movu m0, [tlq+r4-8*4]
por m2, m1
movu m1, [tlq+r4-8*3]
pshufb m0, m3
pmaddubsw m0, m2
pshufb m1, m3
pmaddubsw m1, m2
pmulhrsw m0, m10
pmulhrsw m1, m10
sub rsp, 32
packuswb m0, m1
mova [rsp+16*0], m0
movu m0, [tlq+r4-8*2]
movu m1, [tlq+r4-8*1]
pshufb m0, m3
pshufb m1, m3
pmaddubsw m0, m2
pmaddubsw m1, m2
pmulhrsw m0, m10
pmulhrsw m1, m10
psrlw m2, m5, 6
paddw m5, m6
packsswb m2, m2
packuswb m0, m1
pcmpgtb m1, m4, m2
paddsb m2, [base+pb_16]
pand m0, m1
pandn m1, m7
por m0, m1
pcmpgtb m1, m4, m2
mova [rsp+16*1], m0
pand m0, m1, [rsp+16*0]
pandn m1, m7
por m0, m1
mova [rsp+16*0], m0
dec wd
jz .h32_transpose
add r5, dyq
jg .h32_loop
.h32_end_loop:
sub rsp, 32
mova [rsp+16*1], m7
mova [rsp+16*0], m7
dec wd
jg .h32_end_loop
.h32_transpose:
or r3d, 32
jmp .end_transpose_main
.h64:
lea r4d, [wq+63]
test angled, 0x400 ; !enable_intra_edge_filter
jnz .h64_main
mova m0, [tlq-16*8+1]
mova m1, [tlq-16*7+1]
mova m2, [tlq-16*6+1]
mova m3, [tlq-16*5+1]
mova [rsp+16*1], m0
mova [rsp+16*2], m1
mova [rsp+16*3], m2
mova [rsp+16*4], m3
mova m0, [tlq-16*4+1]
mova m1, [tlq-16*3+1]
mova m2, [tlq-16*2+1]
mova m3, [tlq-16*1+1]
movd m4, [tlq-16*0+1]
neg r4
movd m5, [tlq+r4]
pxor m7, m7
lea tlq, [rsp+16*8]
mova [tlq-16*3], m0
mova [tlq-16*2], m1
xor r5d, r5d ; filter_strength = 3
mova [tlq-16*1], m2
pshufb m4, m7
mova [tlq+16*0], m3
pshufb m5, m7
mova [tlq+16*1], m4
movq [tlq+r4+8], m5
neg r4d
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
sub tlq, 16*2
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
sub tlq, 16*2
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
sub tlq, 16*2
cmp wd, 64
jl .h64_filter96 ; skip one call if the last 32 bytes aren't used
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
.h64_filter96:
add tlq, 127
.h64_main:
movd m5, dyd
sub tlq, r4
movd m4, r4d
shl r4d, 6
movd m7, [tlq]
pxor m6, m6
pshufb m5, [base+pw_256]
neg dyq
pshufb m7, m6
mova m3, [base+z3_shuf]
lea r5, [dyq+r4+63]
pshufb m4, m6
psubb m4, [base+pb_15to0]
mova m6, m5
.h64_loop:
mov r4, r5
pand m2, m8, m5
sar r4, 6
psubw m1, m9, m2
psllw m2, 8
movu m0, [tlq+r4-8*8]
por m2, m1
movu m1, [tlq+r4-8*7]
pshufb m0, m3
pmaddubsw m0, m2
pshufb m1, m3
pmaddubsw m1, m2
pmulhrsw m0, m10
pmulhrsw m1, m10
sub rsp, 64
packuswb m0, m1
mova [rsp+16*0], m0
movu m0, [tlq+r4-8*6]
movu m1, [tlq+r4-8*5]
pshufb m0, m3
pshufb m1, m3
pmaddubsw m0, m2
pmaddubsw m1, m2
pmulhrsw m0, m10
pmulhrsw m1, m10
packuswb m0, m1
mova [rsp+16*1], m0
movu m0, [tlq+r4-8*4]
movu m1, [tlq+r4-8*3]
pshufb m0, m3
pshufb m1, m3
pmaddubsw m0, m2
pmaddubsw m1, m2
pmulhrsw m0, m10
pmulhrsw m1, m10
packuswb m0, m1
mova [rsp+16*2], m0
movu m0, [tlq+r4-8*2]
movu m1, [tlq+r4-8*1]
pshufb m0, m3
pshufb m1, m3
pmaddubsw m0, m2
pmaddubsw m1, m2
pmulhrsw m0, m10
pmulhrsw m1, m10
psrlw m2, m5, 6
paddw m5, m6
packsswb m2, m2
packuswb m0, m1
pcmpgtb m1, m4, m2
paddsb m2, [base+pb_16]
pand m0, m1
pandn m1, m7
por m0, m1
pcmpgtb m1, m4, m2
paddsb m2, [base+pb_16]
mova [rsp+16*3], m0
pand m0, m1, [rsp+16*2]
pandn m1, m7
por m0, m1
pcmpgtb m1, m4, m2
paddsb m2, [base+pb_16]
mova [rsp+16*2], m0
pand m0, m1, [rsp+16*1]
pandn m1, m7
por m0, m1
pcmpgtb m1, m4, m2
mova [rsp+16*1], m0
pand m0, m1, [rsp+16*0]
pandn m1, m7
por m0, m1
mova [rsp+16*0], m0
dec wd
jz .h64_transpose
add r5, dyq
jg .h64_loop
.h64_end_loop:
sub rsp, 64
mova [rsp+16*3], m7
mova [rsp+16*2], m7
mova [rsp+16*1], m7
mova [rsp+16*0], m7
dec wd
jg .h64_end_loop
.h64_transpose:
or r3d, 64
.end_transpose_main:
%if ARCH_X86_64
lea r5, [r3*3]
lea r7, [strideq*3]
%else
mov strideq, [dstq]
mov org_wd, [dstq+strideq]
%endif
.end_transpose_loop:
lea r4, [rsp+r3-8]
lea r6, [dstq+org_wq-8]
.end_transpose_loop_y:
movq m0, [r4+r3*1]
movq m4, [r4+r3*0]
%if ARCH_X86_64
movq m1, [r4+r5 ]
movq m5, [r4+r3*2]
lea r2, [r4+r3*4]
%else
lea r2, [r4+r3*2]
movq m1, [r2+r3*1]
movq m5, [r2+r3*0]
lea r2, [r2+r3*2]
%endif
movq m2, [r2+r3*1]
movq m6, [r2+r3*0]
%if ARCH_X86_64
movq m3, [r2+r5 ]
movq m7, [r2+r3*2]
%else
lea r2, [r2+r3*2]
movq m3, [r2+r3*1]
movq m7, [r2+r3*0]
%endif
sub r4, 8
punpcklbw m0, m4
punpcklbw m1, m5
punpcklbw m2, m6
punpcklbw m3, m7
punpckhwd m4, m1, m0
punpcklwd m1, m0
punpckhwd m0, m3, m2
punpcklwd m3, m2
punpckhdq m2, m3, m1
punpckldq m3, m1
punpckldq m1, m0, m4
punpckhdq m0, m4
movhps [r6+strideq*0], m0
movq [r6+strideq*1], m0
%if ARCH_X86_64
movhps [r6+strideq*2], m1
movq [r6+r7 ], m1
lea r6, [r6+strideq*4]
%else
lea r6, [r6+strideq*2]
movhps [r6+strideq*0], m1
movq [r6+strideq*1], m1
lea r6, [r6+strideq*2]
%endif
movhps [r6+strideq*0], m2
movq [r6+strideq*1], m2
%if ARCH_X86_64
movhps [r6+strideq*2], m3
movq [r6+r7 ], m3
lea r6, [r6+strideq*4]
%else
lea r6, [r6+strideq*2]
movhps [r6+strideq*0], m3
movq [r6+strideq*1], m3
lea r6, [r6+strideq*2]
%endif
cmp r4, rsp
jae .end_transpose_loop_y
lea rsp, [rsp+r3*8]
sub org_wd, 8
jg .end_transpose_loop
RET
;-------------------------------------------------------------------------------
;int dav1d_pal_pred_ssse3(pixel *dst, ptrdiff_t stride, const pixel *pal,
; const uint8_t *idx, int w, int h);
;-------------------------------------------------------------------------------
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
    ; Palette prediction. Every idx byte carries two 4-bit palette indices:
    ; the low nibble selects the even pixel and the high nibble the odd one.
    ; The lookup itself is a pshufb against the 8 palette bytes held in m4.
    ; NOTE(review): pshufb zeroes a lane whose selector has bit 7 set, so this
    ; presumably relies on every nibble being < 8 - confirm with the caller.
    movq            m4, [palq]              ; read the palette before r2 (= pal) is reused
    LEA             r2, pal_pred_ssse3_table
    tzcnt           wd, wm                  ; log2(w) indexes the jump table
    movifnidn       hd, hm
    movsxd          wq, [r2+wq*4]
    add             wq, r2                  ; entry is an offset from the table base
    lea             r2, [strideq*3]         ; r2 = 3*stride for the 4-row stores
    jmp             wq
.w4:
    ; 8 idx bytes -> 16 pixels = 4 rows of 4
    movq            m1, [idxq]
    psrlw           m0, m1, 4               ; high nibbles moved down
    punpcklbw       m1, m0                  ; selectors in pixel order (even, odd, ...)
    pshufb          m0, m4, m1
    add             idxq, 8
    movd            [dstq+strideq*0], m0
    pshuflw         m1, m0, q1032           ; dword 1 -> dword 0
    movd            [dstq+strideq*1], m1
    punpckhqdq      m0, m0                  ; high qword -> low qword
    movd            [dstq+strideq*2], m0
    psrlq           m0, 32
    movd            [dstq+r2], m0
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg              .w4
    RET
.w8:
    ; 16 idx bytes -> 32 pixels = 4 rows of 8
    movu            m0, [idxq]
    pshufb          m1, m4, m0              ; even pixels (low nibbles)
    psrlw           m0, 4
    pshufb          m2, m4, m0              ; odd pixels (high nibbles)
    punpcklbw       m0, m1, m2              ; rows 0-1
    punpckhbw       m1, m2                  ; rows 2-3
    add             idxq, 16
    movq            [dstq+strideq*0], m0
    movhps          [dstq+strideq*1], m0
    movq            [dstq+strideq*2], m1
    movhps          [dstq+r2], m1
    lea             dstq, [dstq+strideq*4]
    sub             hd, 4
    jg              .w8
    RET
.w16:
    ; 16 idx bytes -> 32 pixels = 2 rows of 16
    movu            m0, [idxq]
    pshufb          m1, m4, m0
    psrlw           m0, 4
    pshufb          m2, m4, m0
    punpcklbw       m0, m1, m2              ; row 0
    punpckhbw       m1, m2                  ; row 1
    add             idxq, 16
    mova            [dstq+strideq*0], m0
    mova            [dstq+strideq*1], m1
    lea             dstq, [dstq+strideq*2]
    sub             hd, 2
    jg              .w16
    RET
.w32:
    ; 16 idx bytes -> one row of 32 pixels
    movu            m0, [idxq]
    pshufb          m1, m4, m0
    psrlw           m0, 4
    pshufb          m2, m4, m0
    punpcklbw       m0, m1, m2
    punpckhbw       m1, m2
    add             idxq, 16
    mova            [dstq+16*0], m0
    mova            [dstq+16*1], m1
    add             dstq, strideq
    sub             hd, 1
    jg              .w32
    RET
.w64:
    ; 32 idx bytes -> one row of 64 pixels, handled as two 32-pixel halves
    movu            m0, [idxq+16*0]
    movu            m2, [idxq+16*1]
    add             idxq, 32
    pshufb          m1, m4, m0
    psrlw           m0, 4
    pshufb          m3, m4, m0
    punpcklbw       m0, m1, m3
    punpckhbw       m1, m3
    mova            [dstq+16*0], m0
    mova            [dstq+16*1], m1
    pshufb          m1, m4, m2
    psrlw           m2, 4
    pshufb          m3, m4, m2
    punpcklbw       m0, m1, m3
    punpckhbw       m1, m3
    mova            [dstq+16*2], m0
    mova            [dstq+16*3], m1
    add             dstq, strideq
    sub             hd, 1
    jg              .w64
    RET
;---------------------------------------------------------------------------------------
;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
%macro IPRED_CFL 1 ; ac in, unpacked pixels out
; Transforms m%1 in place; reads m0, m1 and m2 and clobbers m3.
; The .sN loops of ipred_cfl_8bpc invoke it with m0 = splatted dc,
; m1 = splatted alpha and m2 = abs(alpha) << 9, which makes the result
; dc + sign(alpha*ac) * ((abs(alpha)*abs(ac) + 32) >> 6) per 16-bit lane.
psignw m3, m%1, m1 ; m3 = ac carrying the sign of alpha*ac (must precede pabsw)
pabsw m%1, m%1 ; abs(ac)
pmulhrsw m%1, m2 ; rounded multiply by the pre-shifted abs(alpha)
psignw m%1, m3 ; put the sign of alpha*ac back
paddw m%1, m0 ; add dc
%endmacro
%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif
; CfL prediction with the DC value taken from both the left and top edges.
;
; Flow: the prologue dispatches on log2(h) to a .hN label that sums the left
; edge, which jumps on log2(w) to a .wN label that adds the top edge and
; divides by (w + h); execution then falls into the matching .sN store loop.
;
; Register roles up to the .sN loops:
;   m3 = all ones; used as -1 with pmaddubsw/pmaddwd to build (negated) sums
;   m4 = (w + h) >> 1 in word 0 (rounding term)
;   m5 = ctz(w + h) in the low dword (shift count)
; The .sN loops expect m0 = dc splatted over all words and read alpha from its
; stack slot; outputs are saturated to 8 bits by packuswb.
; NOTE(review): t0 is r5 (the ac register) on non-UNIX64 targets, which is
; presumably why ac is only loaded after the last use of t0 - confirm.
cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    movifnidn       wd, wm
    movifnidn       hd, hm
    tzcnt           r6d, hd                 ; log2(h) -> .hN index
    lea             t0d, [wq+hq]
    movd            m4, t0d
    tzcnt           t0d, t0d
    movd            m5, t0d                 ; ctz(w + h)
    LEA             t0, ipred_cfl_ssse3_table
    tzcnt           wd, wd                  ; log2(w) -> .wN index
    movsxd          r6, [t0+r6*4]
    ; +16 = four table entries further on; presumably the .wN entries follow
    ; the four .hN entries - confirm against the table definition.
    movsxd          wq, [t0+wq*4+16]
    pcmpeqd         m3, m3
    psrlw           m4, 1                   ; (w + h) >> 1
    add             r6, t0
    add             wq, t0
    movifnidn       acq, acmp
    jmp             r6
.h4:
    movd            m0, [tlq-4]
    pmaddubsw       m0, m3                  ; negated sums of left pixel pairs
    jmp             wq
.w4:
    movd            m1, [tlq+1]
    pmaddubsw       m1, m3
    psubw           m0, m4                  ; fold in the rounding term
    paddw           m0, m1
    pmaddwd         m0, m3                  ; negate again -> positive dword sums
    cmp             hd, 4
    jg              .w4_mul
    psrlw           m0, 3 ; dc >>= ctz(width + height);
    jmp             .w4_end
.w4_mul:
    ; h is 8 or 16: finish the horizontal sum, shift by 2, then multiply by
    ; ~2^16/3 (0x5556) or ~2^16/5 (0x3334) and keep the high word.
    punpckhqdq      m1, m0, m0
    paddw           m0, m1
    pshuflw         m1, m0, q1032 ; psrlq m1, m0, 32
    paddw           m0, m1
    psrlw           m0, 2
    mov             r6d, 0x5556
    mov             r2d, 0x3334             ; tl (r2) is no longer needed
    test            hd, 8
    cmovz           r6d, r2d                ; h without bit 3 -> 0x3334
    movd            m5, r6d
    pmulhuw         m0, m5
.w4_end:
    pshuflw         m0, m0, q0000
    punpcklqdq      m0, m0                  ; splat dc to all 8 words
.s4:
    movd            m1, alpham
    pshuflw         m1, m1, q0000
    punpcklqdq      m1, m1                  ; m1 = alpha in every word
    lea             r6, [strideq*3]
    pabsw           m2, m1
    psllw           m2, 9                   ; m2 = abs(alpha) << 9 for pmulhrsw
.s4_loop:
    ; 16 ac values -> 4 rows of 4 pixels
    mova            m4, [acq]
    mova            m5, [acq+16]
    IPRED_CFL       4
    IPRED_CFL       5
    packuswb        m4, m5
    movd            [dstq+strideq*0], m4
    pshuflw         m4, m4, q1032
    movd            [dstq+strideq*1], m4
    punpckhqdq      m4, m4
    movd            [dstq+strideq*2], m4
    psrlq           m4, 32
    movd            [dstq+r6], m4
    lea             dstq, [dstq+strideq*4]
    add             acq, 32
    sub             hd, 4
    jg              .s4_loop
    RET
ALIGN function_align
.h8:
    movq            m0, [tlq-8]
    pmaddubsw       m0, m3
    jmp             wq
.w8:
    movq            m1, [tlq+1]
    pmaddubsw       m1, m3
    psubw           m4, m0                  ; m4 = rounding + left sums (positive)
    punpckhqdq      m0, m0
    psubw           m0, m4                  ; low qword = -(all left sums + rounding)
    paddw           m0, m1
    pshuflw         m1, m0, q1032 ; psrlq m1, m0, 32
    paddw           m0, m1
    pmaddwd         m0, m3                  ; positive total in dword 0
    psrlw           m0, m5                  ; >>= ctz(w + h)
    cmp             hd, 8
    je              .w8_end
    mov             r6d, 0x5556             ; ~2^16/3
    mov             r2d, 0x3334             ; ~2^16/5
    cmp             hd, 32
    cmovz           r6d, r2d                ; h == 32 -> 0x3334
    movd            m1, r6d
    pmulhuw         m0, m1
.w8_end:
    pshuflw         m0, m0, q0000
    punpcklqdq      m0, m0
.s8:
    movd            m1, alpham
    pshuflw         m1, m1, q0000
    punpcklqdq      m1, m1
    lea             r6, [strideq*3]
    pabsw           m2, m1
    psllw           m2, 9
.s8_loop:
    ; 32 ac values -> 4 rows of 8 pixels
    mova            m4, [acq]
    mova            m5, [acq+16]
    IPRED_CFL       4
    IPRED_CFL       5
    packuswb        m4, m5
    movq            [dstq], m4
    movhps          [dstq+strideq], m4
    mova            m4, [acq+32]
    mova            m5, [acq+48]
    IPRED_CFL       4
    IPRED_CFL       5
    packuswb        m4, m5
    movq            [dstq+strideq*2], m4
    movhps          [dstq+r6], m4
    lea             dstq, [dstq+strideq*4]
    add             acq, 64
    sub             hd, 4
    jg              .s8_loop
    RET
ALIGN function_align
.h16:
    mova            m0, [tlq-16]
    pmaddubsw       m0, m3
    jmp             wq
.w16:
    movu            m1, [tlq+1]
    pmaddubsw       m1, m3
    paddw           m0, m1                  ; left + top, still negated
    psubw           m4, m0
    punpckhqdq      m0, m0
    psubw           m0, m4
    pshuflw         m1, m0, q1032 ; psrlq m1, m0, 32
    paddw           m0, m1
    pmaddwd         m0, m3
    psrlw           m0, m5
    cmp             hd, 16
    je              .w16_end
    mov             r6d, 0x5556
    mov             r2d, 0x3334
    test            hd, 8|32
    cmovz           r6d, r2d                ; h is neither 8 nor 32 -> 0x3334
    movd            m1, r6d
    pmulhuw         m0, m1
.w16_end:
    pshuflw         m0, m0, q0000
    punpcklqdq      m0, m0
.s16:
    movd            m1, alpham
    pshuflw         m1, m1, q0000
    punpcklqdq      m1, m1
    pabsw           m2, m1
    psllw           m2, 9
.s16_loop:
    ; 32 ac values -> 2 rows of 16 pixels
    mova            m4, [acq]
    mova            m5, [acq+16]
    IPRED_CFL       4
    IPRED_CFL       5
    packuswb        m4, m5
    mova            [dstq], m4
    mova            m4, [acq+32]
    mova            m5, [acq+48]
    IPRED_CFL       4
    IPRED_CFL       5
    packuswb        m4, m5
    mova            [dstq+strideq], m4
    lea             dstq, [dstq+strideq*2]
    add             acq, 64
    sub             hd, 2
    jg              .s16_loop
    RET
ALIGN function_align
.h32:
    mova            m0, [tlq-32]
    pmaddubsw       m0, m3
    mova            m2, [tlq-16]
    pmaddubsw       m2, m3
    paddw           m0, m2
    jmp             wq
.w32:
    movu            m1, [tlq+1]
    pmaddubsw       m1, m3
    movu            m2, [tlq+17]
    pmaddubsw       m2, m3
    paddw           m1, m2
    paddw           m0, m1
    psubw           m4, m0
    punpckhqdq      m0, m0
    psubw           m0, m4
    pshuflw         m1, m0, q1032 ; psrlq m1, m0, 32
    paddw           m0, m1
    pmaddwd         m0, m3
    psrlw           m0, m5
    cmp             hd, 32
    je              .w32_end
    mov             r6d, 0x5556
    mov             r2d, 0x3334
    test            hd, 64|16
    cmovz           r6d, r2d                ; h is neither 16 nor 64 -> 0x3334
    movd            m1, r6d
    pmulhuw         m0, m1
.w32_end:
    pshuflw         m0, m0, q0000
    punpcklqdq      m0, m0
.s32:
    movd            m1, alpham
    pshuflw         m1, m1, q0000
    punpcklqdq      m1, m1
    pabsw           m2, m1
    psllw           m2, 9
.s32_loop:
    ; 32 ac values -> one row of 32 pixels
    mova            m4, [acq]
    mova            m5, [acq+16]
    IPRED_CFL       4
    IPRED_CFL       5
    packuswb        m4, m5
    mova            [dstq], m4
    mova            m4, [acq+32]
    mova            m5, [acq+48]
    IPRED_CFL       4
    IPRED_CFL       5
    packuswb        m4, m5
    mova            [dstq+16], m4
    add             dstq, strideq
    add             acq, 64
    dec             hd
    jg              .s32_loop
    RET
;---------------------------------------------------------------------------------------
;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    ; CfL prediction with dc = rounded average of the h pixels stored just
    ; below tl. The .hN labels reduce the pixel sums, scale them with pmulhrsw
    ; by 0x8000 >> log2(h) (a rounded division by h) and leave the dc splatted
    ; in m0 before jumping to the entry picked from ipred_cfl_splat_ssse3_table
    ; by log2(w).
    mov             hd, hm ; zero upper half
    tzcnt           wd, wm                  ; log2(w) -> splat table index
    tzcnt           r6d, hd                 ; log2(h) -> .hN index and shift count
    sub             tlq, hq                 ; tl -= h
    mov             t0d, 0x8000
    movd            m2, r6d
    movd            m3, t0d
    psrld           m3, m2                  ; m3 = 0x8000 >> log2(h)
    movu            m0, [tlq]
    pcmpeqd         m2, m2                  ; m2 = all ones (-1)
    pmaddubsw       m0, m2                  ; negated sums of pixel pairs
    LEA             t0, ipred_cfl_left_ssse3_table
    movsxd          r6, [t0+r6*4]
    add             r6, t0
    add             t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
    movsxd          wq, [t0+wq*4]
    add             wq, t0
    movifnidn       acq, acmp               ; only after t0 is finished with
    jmp             r6
.h32:
    ; movu: tl may be unaligned here, since ipred_cfl_top_8bpc dispatches
    ; through the same table with tl + 1
    movu            m1, [tlq+16]
    pmaddubsw       m1, m2
    paddw           m0, m1
.h16:
    pshufd          m1, m0, q3232           ; high qword -> low qword
    paddw           m0, m1
.h8:
    pshuflw         m1, m0, q1032           ; dword 1 -> dword 0 (psrlq m1, m0, 32)
    paddw           m0, m1
.h4:
    pmaddwd         m0, m2                  ; negate again: positive sum in dword 0
    pmulhrsw        m0, m3                  ; (sum + h/2) >> log2(h)
    pshuflw         m0, m0, q0000
    punpcklqdq      m0, m0                  ; splat dc to all 8 words
    jmp             wq
;---------------------------------------------------------------------------------------
;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    ; CfL prediction with dc = rounded average of the w pixels starting at
    ; tl + 1. The reduction is shared: this function only sets up m0/m2/m3, tl
    ; and wq, then jumps into ipred_cfl_left_ssse3_table indexed by log2(w).
    tzcnt           wd, wm                  ; log2(w)
    movifnidn       hd, hm
    mov             r6d, 0x8000
    movd            m2, wd
    movd            m3, r6d
    psrld           m3, m2                  ; m3 = 0x8000 >> log2(w) for pmulhrsw
    add             tlq, 1                  ; the shared code keeps reading from tl
    movu            m0, [tlq]
    pcmpeqd         m2, m2                  ; m2 = all ones (-1)
    pmaddubsw       m0, m2                  ; negated sums of pixel pairs
    LEA             t0, ipred_cfl_left_ssse3_table
    movsxd          r6, [t0+wq*4]           ; reduction entry chosen by width
    add             r6, t0
    add             t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
    movsxd          wq, [t0+wq*4]
    add             wq, t0                  ; splat/store entry for this width
    movifnidn       acq, acmp               ; only after t0 is finished with
    jmp             r6
;---------------------------------------------------------------------------------------
;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    ; CfL prediction with a constant dc: m0 is filled from pw_128 and control
    ; goes straight to the entry of ipred_cfl_splat_ssse3_table selected by
    ; log2(w). tl is not read.
    LEA             r6, ipred_cfl_splat_ssse3_table
    tzcnt           wd, wm
    movddup         m0, [r6-ipred_cfl_splat_ssse3_table+pw_128] ; addressed relative to r6
    movsxd          wq, [r6+wq*4]
    add             wq, r6
    movifnidn       hd, hm
    movifnidn       acq, acmp
    jmp             wq
%macro RELOAD_ACQ_32 1
; Reloads acq from ac_bakq, the saved copy of the ac pointer (a register on
; x86-64, a memory operand defined via %define on x86-32).
; The macro takes one parameter but never references it.
mov acq, ac_bakq ; restore acq
%endmacro
; CfL ac generation for 4:2:0 luma. Each output word is 2 * (sum of a 2x2
; block of y pixels); the rounded average of all w*h outputs is subtracted
; from every value at the end.
;
;   w, h  - output size in ac values; hpad is multiplied by 4 and then
;           treated as a count of output rows that are filled by repeating
;           the last computed row instead of reading y
;   wpad  - non-zero selects the paths that read fewer y columns and repeat
;           the last computed value to the right (in steps of 4 outputs)
;   m2    - pb_2 multiplier for pmaddubsw, later shifted down to pw_1
;   m4    - running 16-bit column sums, m5 - w*h
;
; t0 shares a register with ac_bak (x86-64) or hpad (x86-32), so both are
; only written once t0 is no longer needed.
%if ARCH_X86_64
cglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
DECLARE_REG_TMP 7
    movddup         m2, [pb_2]
%else
cglobal ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
DECLARE_REG_TMP 4
%define ac_bakq acmp
    mov             t0d, 0x02020202
    movd            m2, t0d
    pshufd          m2, m2, q0000
%endif
    movifnidn       wd, wm
    mov             t0d, hm
    mov             hd, t0d
    imul            t0d, wd
    movd            m5, t0d                 ; m5 = w * h
    movifnidn       hpadd, hpadm
%if ARCH_X86_64
    mov             ac_bakq, acq            ; keep the start of ac for .sub_loop
%endif
    shl             hpadd, 2
    sub             hd, hpadd               ; rows actually computed from y
    pxor            m4, m4
    cmp             wd, 8
    jg              .w16
    je              .w8
    ; fall-through
%if ARCH_X86_64
    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
%else
    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
%endif
.w4:
    lea             stride3q, [strideq*3]
.w4_loop:
    ; 4 y rows -> 2 output rows of 4 (one 16-byte store)
    movq            m0, [yq]
    movq            m1, [yq+strideq]
    movhps          m0, [yq+strideq*2]
    movhps          m1, [yq+stride3q]
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    paddw           m0, m1
    mova            [acq], m0
    paddw           m4, m0
    lea             yq, [yq+strideq*4]
    add             acq, 16
    sub             hd, 2
    jg              .w4_loop
    test            hpadd, hpadd
    jz              .calc_avg_4_8
    punpckhqdq      m0, m0                  ; last row in both halves
.w4_hpad_loop:
    mova            [acq], m0
    paddw           m4, m0
    add             acq, 16
    sub             hpadd, 2
    jg              .w4_hpad_loop
    jmp             .calc_avg_4_8
.w8:
    lea             stride3q, [strideq*3]
    test            wpadd, wpadd
    jnz             .w8_wpad
.w8_loop:
    ; 4 y rows -> 2 output rows of 8
    mova            m0, [yq]
    mova            m1, [yq+strideq]
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    paddw           m0, m1
    mova            [acq], m0
    paddw           m4, m0
    mova            m0, [yq+strideq*2]
    mova            m1, [yq+stride3q]
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    paddw           m0, m1
    mova            [acq+16], m0
    paddw           m4, m0
    lea             yq, [yq+strideq*4]
    add             acq, 32
    sub             hd, 2
    jg              .w8_loop
    test            hpadd, hpadd
    jz              .calc_avg_4_8
    jmp             .w8_hpad
.w8_wpad: ; wpadd=1
    ; only 8 y bytes per row are read; the 4th output is repeated to the right
    movddup         m0, [yq]
    movddup         m1, [yq+strideq]
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    paddw           m0, m1
    pshufhw         m0, m0, q3333
    mova            [acq], m0
    paddw           m4, m0
    lea             yq, [yq+strideq*2]
    add             acq, 16
    sub             hd, 1
    jg              .w8_wpad
    test            hpadd, hpadd
    jz              .calc_avg_4_8
.w8_hpad:
    ; m0 still holds the last computed row
    mova            [acq], m0
    paddw           m4, m0
    add             acq, 16
    sub             hpadd, 1
    jg              .w8_hpad
    jmp             .calc_avg_4_8
.w16:
    test            wpadd, wpadd
    jnz             .w16_wpad
.w16_loop:
    ; 2 y rows -> 1 output row of 16 (m0 = left half, m6 = right half)
    mova            m0, [yq]
    mova            m1, [yq+strideq]
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    paddw           m0, m1
    mova            [acq], m0
    paddw           m4, m0
    mova            m6, [yq+16]
    mova            m1, [yq+strideq+16]
    pmaddubsw       m6, m2
    pmaddubsw       m1, m2
    paddw           m6, m1
    mova            [acq+16], m6
    paddw           m4, m6
    lea             yq, [yq+strideq*2]
    add             acq, 32
    dec             hd
    jg              .w16_loop
    test            hpadd, hpadd
    jz              .calc_avg16
    jmp             .w16_hpad_loop
.w16_wpad:
    cmp             wpadd, 2
    jl              .w16_pad1
    je              .w16_pad2
.w16_pad3:
    ; 4 computed outputs, 12 repeated
    movddup         m0, [yq]
    movddup         m1, [yq+strideq]
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    paddw           m0, m1
    pshufhw         m0, m0, q3333
    mova            [acq], m0
    paddw           m4, m0
    punpckhqdq      m6, m0, m0              ; right half = repeated value only
    mova            [acq+16], m6
    paddw           m4, m6
    lea             yq, [yq+strideq*2]
    add             acq, 32
    dec             hd
    jg              .w16_pad3
    jmp             .w16_wpad_done
.w16_pad2:
    ; 8 computed outputs, 8 repeated
    mova            m0, [yq]
    mova            m1, [yq+strideq]
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    paddw           m0, m1
    mova            [acq], m0
    paddw           m4, m0
    pshufhw         m6, m0, q3333
    punpckhqdq      m6, m6
    mova            [acq+16], m6
    paddw           m4, m6
    lea             yq, [yq+strideq*2]
    add             acq, 32
    dec             hd
    jg              .w16_pad2
    jmp             .w16_wpad_done
.w16_pad1:
    ; 12 computed outputs, 4 repeated
    mova            m0, [yq]
    mova            m1, [yq+strideq]
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    paddw           m0, m1
    mova            [acq], m0
    paddw           m4, m0
    movddup         m6, [yq+16]
    movddup         m1, [yq+strideq+16]
    pmaddubsw       m6, m2
    pmaddubsw       m1, m2
    paddw           m6, m1
    pshufhw         m6, m6, q3333
    mova            [acq+16], m6
    paddw           m4, m6
    lea             yq, [yq+strideq*2]
    add             acq, 32
    dec             hd
    jg              .w16_pad1
.w16_wpad_done:
    test            hpadd, hpadd
    jz              .calc_avg16
.w16_hpad_loop:
    ; m0/m6 still hold the two halves of the last computed row
    mova            [acq], m0
    paddw           m4, m0
    mova            [acq+16], m6
    paddw           m4, m6
    add             acq, 32
    dec             hpadd
    jg              .w16_hpad_loop
    jmp             .calc_avg16
%if ARCH_X86_64
    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
%else
    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
%endif
.calc_avg_4_8:
    psrlw           m2, 9                   ; 0x0202 >> 9 = 1 in every word
    pmaddwd         m4, m2                  ; widen: add adjacent words to dwords
    jmp             .calc_avg
.calc_avg16:
    ; widen with the words treated as unsigned: high word + low word per dword
    psrld           m0, m4, 16
    pslld           m4, 16
    psrld           m4, 16
    paddd           m4, m0
.calc_avg:
    movd            szd, m5                 ; sz = w * h
    psrad           m5, 1                   ; rounding term sz / 2
    tzcnt           r1d, szd                ; log2(sz); y (r1) is no longer needed
    paddd           m4, m5
    movd            m1, r1d
    pshufd          m0, m4, q2301
    paddd           m0, m4
    pshufd          m4, m0, q1032
    paddd           m0, m4                  ; total in every dword
    psrad           m0, m1 ; sum >>= log2sz;
    packssdw        m0, m0
    RELOAD_ACQ_32   acq
.sub_loop:
    ; 8 values per iteration
    mova            m1, [acq]
    psubw           m1, m0 ; ac[x] -= sum;
    mova            [acq], m1
    add             acq, 16
    sub             szd, 8
    jg              .sub_loop
    RET
; CfL ac generation for 4:2:2 luma. Each output word is 4 * (sum of two
; horizontally adjacent y pixels); the rounded average of all w*h outputs is
; subtracted from every value at the end.
;
;   w, h  - output size in ac values; hpad is multiplied by 4 and then
;           treated as a count of output rows that are filled by repeating
;           the last computed row instead of reading y
;   wpad  - non-zero selects the paths that read fewer y columns and repeat
;           the last computed value to the right (in steps of 4 outputs)
;   m2    - pb_4 multiplier for pmaddubsw, later shifted down to pw_1
;   m4/m5 - two running 16-bit sum accumulators, m6 - w*h
;
; t0 and, on x86-32, ac_bakq are the ones declared for
; ipred_cfl_ac_420_8bpc earlier in the file.
%if ARCH_X86_64
cglobal ipred_cfl_ac_422_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
    movddup         m2, [pb_4]
%else
cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
    mov             t0d, 0x04040404
    movd            m2, t0d
    pshufd          m2, m2, q0000
%endif
    movifnidn       wd, wm
    mov             t0d, hm
    mov             hd, t0d
    imul            t0d, wd
    movd            m6, t0d                 ; m6 = w * h
    movifnidn       hpadd, hpadm
%if ARCH_X86_64
    mov             ac_bakq, acq            ; keep the start of ac for .sub_loop
%endif
    shl             hpadd, 2
    sub             hd, hpadd               ; rows actually computed from y
    pxor            m4, m4
    pxor            m5, m5
    cmp             wd, 8
    jg              .w16
    je              .w8
    ; fall-through
%if ARCH_X86_64
    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
%else
    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
%endif
.w4:
    lea             stride3q, [strideq*3]
.w4_loop:
    ; 4 y rows -> 4 output rows of 4 (m1 = rows 0-1, m0 = rows 2-3)
    movq            m1, [yq]
    movhps          m1, [yq+strideq]
    movq            m0, [yq+strideq*2]
    movhps          m0, [yq+stride3q]
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    mova            [acq], m1
    mova            [acq+16], m0
    paddw           m4, m0
    paddw           m5, m1
    lea             yq, [yq+strideq*4]
    add             acq, 32
    sub             hd, 4
    jg              .w4_loop
    test            hpadd, hpadd
    jz              .calc_avg_4
    punpckhqdq      m0, m0                  ; last row in both halves
.w4_hpad_loop:
    mova            [acq], m0
    paddw           m4, m0
    add             acq, 16
    sub             hpadd, 2
    jg              .w4_hpad_loop
    jmp             .calc_avg_4
.w8:
    lea             stride3q, [strideq*3]
    test            wpadd, wpadd
    jnz             .w8_wpad
.w8_loop:
    ; 4 y rows -> 4 output rows of 8
    mova            m1, [yq]
    mova            m0, [yq+strideq]
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    mova            [acq], m1
    mova            [acq+16], m0
    paddw           m4, m0
    paddw           m5, m1
    mova            m1, [yq+strideq*2]
    mova            m0, [yq+stride3q]
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    mova            [acq+32], m1
    mova            [acq+48], m0
    paddw           m4, m0
    paddw           m5, m1
    lea             yq, [yq+strideq*4]
    add             acq, 64
    sub             hd, 4
    jg              .w8_loop
    test            hpadd, hpadd
    jz              .calc_avg_8_16
    jmp             .w8_hpad
.w8_wpad:
    ; only 8 y bytes per row are read; the 4th output is repeated to the right
    movddup         m1, [yq]
    pmaddubsw       m1, m2
    pshufhw         m1, m1, q3333
    mova            [acq], m1
    paddw           m5, m1
    movddup         m0, [yq+strideq]
    pmaddubsw       m0, m2
    pshufhw         m0, m0, q3333
    mova            [acq+16], m0
    paddw           m4, m0
    lea             yq, [yq+strideq*2]
    add             acq, 32
    sub             hd, 2
    jg              .w8_wpad
    test            hpadd, hpadd
    jz              .calc_avg_8_16
.w8_hpad:
    ; m0 still holds the last computed row; two rows per iteration
    mova            [acq], m0
    paddw           m4, m0
    mova            [acq+16], m0
    paddw           m4, m0
    add             acq, 32
    sub             hpadd, 2
    jg              .w8_hpad
    jmp             .calc_avg_8_16
.w16:
    test            wpadd, wpadd
    jnz             .w16_wpad
.w16_loop:
    ; 2 y rows -> 2 output rows of 16 (m1 = left half, m0 = right half)
    mova            m1, [yq]
    mova            m0, [yq+16]
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    mova            [acq], m1
    mova            [acq+16], m0
    paddw           m5, m0
    paddw           m5, m1
    mova            m1, [yq+strideq]
    mova            m0, [yq+strideq+16]
    pmaddubsw       m0, m2
    pmaddubsw       m1, m2
    mova            [acq+32], m1
    mova            [acq+48], m0
    paddw           m4, m0
    paddw           m4, m1
    lea             yq, [yq+strideq*2]
    add             acq, 64
    sub             hd, 2
    jg              .w16_loop
    test            hpadd, hpadd
    jz              .calc_avg_8_16
    jmp             .w16_hpad_loop
.w16_wpad:
    cmp             wpadd, 2
    jl              .w16_pad1
    je              .w16_pad2
.w16_pad3:
    ; 4 computed outputs, 12 repeated
    movddup         m1, [yq]
    pmaddubsw       m1, m2
    pshufhw         m1, m1, q3333
    mova            [acq], m1
    paddw           m5, m1
    punpckhqdq      m1, m1
    mova            [acq+16], m1
    paddw           m5, m1
    movddup         m1, [yq+strideq]
    pmaddubsw       m1, m2
    pshufhw         m1, m1, q3333
    mova            [acq+32], m1
    paddw           m4, m1
    punpckhqdq      m0, m1, m1
    mova            [acq+48], m0
    paddw           m4, m0
    lea             yq, [yq+strideq*2]
    add             acq, 64
    sub             hd, 2
    jg              .w16_pad3
    jmp             .w16_wpad_done
.w16_pad2:
    ; 8 computed outputs, 8 repeated
    mova            m1, [yq]
    pmaddubsw       m1, m2
    mova            [acq], m1
    paddw           m5, m1
    pshufhw         m1, m1, q3333
    punpckhqdq      m1, m1
    mova            [acq+16], m1
    paddw           m5, m1
    mova            m1, [yq+strideq]
    pmaddubsw       m1, m2
    mova            [acq+32], m1
    paddw           m4, m1
    pshufhw         m0, m1, q3333           ; m1 is kept intact for the hpad loop
    punpckhqdq      m0, m0
    mova            [acq+48], m0
    paddw           m4, m0
    lea             yq, [yq+strideq*2]
    add             acq, 64
    sub             hd, 2
    jg              .w16_pad2
    jmp             .w16_wpad_done
.w16_pad1:
    ; 12 computed outputs, 4 repeated
    mova            m1, [yq]
    pmaddubsw       m1, m2
    mova            [acq], m1
    paddw           m5, m1
    movddup         m0, [yq+16]
    pmaddubsw       m0, m2
    pshufhw         m0, m0, q3333
    mova            [acq+16], m0
    paddw           m5, m0
    mova            m1, [yq+strideq]
    pmaddubsw       m1, m2
    mova            [acq+32], m1
    paddw           m4, m1
    movddup         m0, [yq+strideq+16]
    pmaddubsw       m0, m2
    pshufhw         m0, m0, q3333
    mova            [acq+48], m0
    paddw           m4, m0
    lea             yq, [yq+strideq*2]
    add             acq, 64
    sub             hd, 2
    jg              .w16_pad1
.w16_wpad_done:
    test            hpadd, hpadd
    jz              .calc_avg_8_16
.w16_hpad_loop:
    ; m1/m0 still hold the two halves of the last computed row;
    ; two rows per iteration
    mova            [acq], m1
    mova            [acq+16], m0
    paddw           m4, m1
    paddw           m5, m0
    mova            [acq+32], m1
    mova            [acq+48], m0
    paddw           m4, m1
    paddw           m5, m0
    add             acq, 64
    sub             hpadd, 2
    jg              .w16_hpad_loop
    jmp             .calc_avg_8_16
%if ARCH_X86_64
    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
%else
    DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
%endif
.calc_avg_4:
    psrlw           m2, 10                  ; 0x0404 >> 10 = 1 in every word
    pmaddwd         m5, m2                  ; widen both accumulators to dwords
    pmaddwd         m0, m4, m2
    jmp             .calc_avg
.calc_avg_8_16:
    ; widen with the words treated as unsigned: high word + low word per dword
    mova            m0, m5
    psrld           m5, 16
    pslld           m0, 16
    psrld           m0, 16
    paddd           m5, m0
    mova            m0, m4
    psrld           m0, 16
    pslld           m4, 16
    psrld           m4, 16
    paddd           m0, m4
.calc_avg:
    paddd           m5, m0                  ; merge the two accumulators
    movd            szd, m6                 ; sz = w * h
    psrad           m6, 1                   ; rounding term sz / 2
    tzcnt           r1d, szd ; const int log2sz = ctz(width) + ctz(height);
    paddd           m5, m6
    movd            m1, r1d
    pshufd          m0, m5, q2301
    paddd           m0, m5
    pshufd          m5, m0, q1032
    paddd           m0, m5                  ; total in every dword
    psrad           m0, m1 ; sum >>= log2sz;
    packssdw        m0, m0
    RELOAD_ACQ_32   acq ; ac = ac_orig
.sub_loop:
    ; 8 values per iteration
    mova            m1, [acq]
    psubw           m1, m0
    mova            [acq], m1
    add             acq, 16
    sub             szd, 8
    jg              .sub_loop
    RET
%if ARCH_X86_64
cglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak
movddup m2, [pb_4]
%else
cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h
%define ac_bakq [rsp+16*4]
mov t0d, 0x04040404
movd m2, t0d
pshufd m2, m2, q0000
%endif
movifnidn wd, wm
movifnidn hpadd, hpadm
movd m0, hpadd
mov t0d, hm
mov hd, t0d
imul t0d, wd
movd m6, t0d
movd hpadd, m0
mov ac_bakq, acq
shl hpadd, 2
sub hd, hpadd
pxor m5, m5
pxor m4, m4
cmp wd, 16
jg .w32
cmp wd, 8
jg .w16
je .w8
; fall-through
%if ARCH_X86_64
DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
%else
DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
%endif
.w4:
lea stride3q, [strideq*3]
.w4_loop:
movd m1, [yq]
movd m3, [yq+strideq]
punpckldq m1, m3
punpcklbw m1, m1
movd m0, [yq+strideq*2]
movd m3, [yq+stride3q]
punpckldq m0, m3
punpcklbw m0, m0
pmaddubsw m1, m2
pmaddubsw m0, m2
mova [acq], m1
mova [acq+16], m0
paddw m5, m0
paddw m5, m1
lea yq, [yq+strideq*4]
add acq, 32
sub hd, 4
jg .w4_loop
test hpadd, hpadd
jz .calc_avg_4
punpckhqdq m0, m0
.w4_hpad_loop:
mova [acq], m0
paddw m5, m0
add acq, 16
sub hpadd, 2
jg .w4_hpad_loop
.calc_avg_4:
psrlw m2, 10
pmaddwd m5, m2
jmp .calc_avg
.w8:
lea stride3q, [strideq*3]
test wpadd, wpadd
jnz .w8_wpad
.w8_loop:
movq m1, [yq]
punpcklbw m1, m1
pmaddubsw m1, m2
mova [acq], m1
paddw m5, m1
movq m0, [yq+strideq]
punpcklbw m0, m0
pmaddubsw m0, m2
mova [acq+16], m0
paddw m5, m0
movq m1, [yq+strideq*2]
punpcklbw m1, m1
pmaddubsw m1, m2
mova [acq+32], m1
paddw m4, m1
movq m0, [yq+stride3q]
punpcklbw m0, m0
pmaddubsw m0, m2
mova [acq+48], m0
paddw m4, m0
lea yq, [yq+strideq*4]
add acq, 64
sub hd, 4
jg .w8_loop
test hpadd, hpadd
jz .calc_avg_8_16
jmp .w8_hpad
.w8_wpad:
movd m1, [yq]
punpcklbw m1, m1
punpcklqdq m1, m1
pmaddubsw m1, m2
pshufhw m1, m1, q3333
mova [acq], m1
paddw m5, m1
movd m0, [yq+strideq]
punpcklbw m0, m0
punpcklqdq m0, m0
pmaddubsw m0, m2
pshufhw m0, m0, q3333
mova [acq+16], m0
paddw m4, m0
lea yq, [yq+strideq*2]
add acq, 32
sub hd, 2
jg .w8_wpad
test hpadd, hpadd
jz .calc_avg_8_16
.w8_hpad:
mova [acq], m0
paddw m5, m0
mova [acq+16], m0
paddw m4, m0
add acq, 32
sub hpadd, 2
jg .w8_hpad
jmp .calc_avg_8_16
.w16:
test wpadd, wpadd
jnz .w16_wpad
.w16_loop:
mova m0, [yq]
mova m1, m0
punpcklbw m1, m1
pmaddubsw m1, m2
mova [acq], m1
paddw m5, m1
punpckhbw m0, m0
pmaddubsw m0, m2
mova [acq+16], m0
paddw m5, m0
mova m0, [yq+strideq]
mova m1, m0
punpcklbw m1, m1
pmaddubsw m1, m2
mova [acq+32], m1
paddw m4, m1
punpckhbw m0, m0
pmaddubsw m0, m2
mova [acq+48], m0
paddw m4, m0
lea yq, [yq+strideq*2]
add acq, 64
sub hd, 2
jg .w16_loop
test hpadd, hpadd
jz .calc_avg_8_16
jmp .w16_hpad_loop
.w16_wpad:
cmp wpadd, 2
jl .w16_pad1
je .w16_pad2
.w16_pad3:
movd m1, [yq]
punpcklbw m1, m1
punpcklqdq m1, m1
pshufhw m1, m1, q3333
pmaddubsw m1, m2
mova [acq], m1
paddw m5, m1
punpckhqdq m1, m1
mova [acq+16], m1
paddw m5, m1
movd m1, [yq+strideq]
punpcklbw m1, m1
punpcklqdq m1, m1
pshufhw m1, m1, q3333
pmaddubsw m1, m2
mova [acq+32], m1
paddw m4, m1
punpckhqdq m0, m1, m1
mova [acq+48], m0
paddw m4, m0
lea yq, [yq+strideq*2]
add acq, 64
sub hd, 2
jg .w16_pad3
jmp .w16_wpad_done
.w16_pad2:
movq m1, [yq]
punpcklbw m1, m1
pmaddubsw m1, m2
mova [acq], m1
paddw m5, m1
pshufhw m1, m1, q3333
punpckhqdq m1, m1
mova [acq+16], m1
paddw m5, m1
movq m1, [yq+strideq]
punpcklbw m1, m1
pmaddubsw m1, m2
mova [acq+32], m1
paddw m4, m1
mova m0, m1
pshufhw m0, m0, q3333
punpckhqdq m0, m0
mova [acq+48], m0
paddw m4, m0
lea yq, [yq+strideq*2]
add acq, 64
sub hd, 2
jg .w16_pad2
jmp .w16_wpad_done
.w16_pad1:
mova m0, [yq]
mova m1, m0
punpcklbw m1, m1
pmaddubsw m1, m2
mova [acq], m1
paddw m5, m1
punpckhbw m0, m0
punpcklqdq m0, m0
pshufhw m0, m0, q3333
pmaddubsw m0, m2
mova [acq+16], m0
paddw m5, m0
mova m0, [yq+strideq]
mova m1, m0
punpcklbw m1, m1
pmaddubsw m1, m2
mova [acq+32], m1
paddw m4, m1
punpckhbw m0, m0
punpcklqdq m0, m0
pshufhw m0, m0, q3333
pmaddubsw m0, m2
mova [acq+48], m0
paddw m4, m0
lea yq, [yq+strideq*2]
add acq, 64
sub hd, 2
jg .w16_pad1
.w16_wpad_done:
test hpadd, hpadd
jz .calc_avg_8_16
.w16_hpad_loop:
mova [acq], m1
mova [acq+16], m0
paddw m4, m1
paddw m5, m0
mova [acq+32], m1
mova [acq+48], m0
paddw m4, m1
paddw m5, m0
add acq, 64
sub hpadd, 2
jg .w16_hpad_loop
.calc_avg_8_16:
mova m0, m5
psrld m5, 16
pslld m0, 16
psrld m0, 16
paddd m5, m0
mova m0, m4
psrld m0, 16
pslld m4, 16
psrld m4, 16
paddd m0, m4
paddd m5, m0
jmp .calc_avg
.w32:
pxor m0, m0
mova [rsp ], m0
mova [rsp+16], m0
mova [rsp+32], m0
mova [rsp+48], m0
test wpadd, wpadd
jnz .w32_wpad
.w32_loop:
mova m0, [yq]
mova m1, m0
punpcklbw m1, m1
pmaddubsw m1, m2
mova [acq], m1
paddw m5, m1, [rsp]
mova [rsp ], m5
punpckhbw m0, m0
pmaddubsw m0, m2
mova [acq+16], m0
paddw m5, m0, [rsp+16]
mova [rsp+16], m5
mova m4, [yq+16]
mova m3, m4
punpcklbw m3, m3
pmaddubsw m3, m2
mova [acq+32], m3
paddw m5, m3, [rsp+32]
mova [rsp+32], m5
punpckhbw m4, m4
pmaddubsw m4, m2
mova [acq+48], m4
paddw m5, m4, [rsp+48]
mova [rsp+48], m5
lea yq, [yq+strideq]
add acq, 64
sub hd, 1
jg .w32_loop
test hpadd, hpadd
jz .calc_avg_32
jmp .w32_hpad_loop
.w32_wpad:
cmp wpadd, 2
jl .w32_pad1
je .w32_pad2
cmp wpadd, 4
jl .w32_pad3
je .w32_pad4
cmp wpadd, 6
jl .w32_pad5
je .w32_pad6
.w32_pad7:
movd m1, [yq]
punpcklbw m1, m1
punpcklqdq m1, m1
pshufhw m1, m1, q3333
pmaddubsw m1, m2
mova [acq], m1
paddw m5, m1, [rsp]
mova [rsp ], m5
mova m0, m1
punpckhqdq m0, m0
mova [acq+16], m0
paddw m5, m0, [rsp+16]
mova [rsp+16], m5
mova m3, m0
mova [acq+32], m3
paddw m5, m3, [rsp+32]
mova [rsp+32], m5
mova m4, m3
mova [acq+48], m4
paddw m5, m4, [rsp+48]
mova [rsp+48], m5
lea yq, [yq+strideq]
add acq, 64
sub hd, 1
jg .w32_pad7
jmp .w32_wpad_done
.w32_pad6:
mova m0, [yq]
mova m1, m0
punpcklbw m1, m1
pmaddubsw m1, m2
mova [acq], m1
paddw m5, m1, [rsp]
mova [rsp ], m5
pshufhw m0, m1, q3333
punpckhqdq m0, m0
mova [acq+16], m0
paddw m5, m0, [rsp+16]
mova [rsp+16], m5
mova m3, m0
mova [acq+32], m3
paddw m5, m3, [rsp+32]
mova [rsp+32], m5
mova m4, m3
mova [acq+48], m4
paddw m5, m4, [rsp+48]
mova [rsp+48], m5
lea yq, [yq+strideq]
add acq, 64
sub hd, 1
jg .w32_pad6
jmp .w32_wpad_done
.w32_pad5:
mova m0, [yq]
mova m1, m0
punpcklbw m1, m1
pmaddubsw m1, m2
mova [acq], m1
mova m5, [rsp]
paddw m5, m1
mova [rsp ], m5
punpckhbw m0, m0
punpcklqdq m0, m0
pshufhw m0, m0, q3333
pmaddubsw m0, m2
mova [acq+16], m0
paddw m5, m0, [rsp+16]
mova [rsp+16], m5
mova m3, m0
punpckhqdq m3, m3
mova [acq+32], m3
paddw m5, m3, [rsp+32]
mova [rsp+32], m5
mova m4, m3
mova [acq+48], m4
paddw m5, m4, [rsp+48]
mova [rsp+48], m5
lea yq, [yq+strideq]
add acq, 64
sub hd, 1
jg .w32_pad5
jmp .w32_wpad_done
.w32_pad4:
mova m0, [yq]
mova m1, m0
punpcklbw m1, m1
pmaddubsw m1, m2
mova [acq], m1
paddw m5, m1, [rsp]
mova [rsp ], m5
punpckhbw m0, m0
pmaddubsw m0, m2
mova [acq+16], m0
paddw m5, m0, [rsp+16]
mova [rsp+16], m5
mova m3, m0
pshufhw m3, m3, q3333
punpckhqdq m3, m3
mova [acq+32], m3
paddw m5, m3, [rsp+32]
mova [rsp+32], m5
mova m4, m3
mova [acq+48], m4
paddw m5, m4, [rsp+48]
mova [rsp+48], m5
lea yq, [yq+strideq]
add acq, 64
sub hd, 1
jg .w32_pad4
jmp .w32_wpad_done
.w32_pad3:
mova m0, [yq]
mova m1, m0
punpcklbw m1, m1
pmaddubsw m1, m2
mova [acq], m1
paddw m5, m1, [rsp]
mova [rsp ], m5
punpckhbw m0, m0
pmaddubsw m0, m2
mova [acq+16], m0
paddw m5, m0, [rsp+16]
mova [rsp+16], m5
movd m3, [yq+16]
punpcklbw m3, m3
punpcklqdq m3, m3
pshufhw m3, m3, q3333
pmaddubsw m3, m2
mova [acq+32], m3
paddw m5, m3, [rsp+32]
mova [rsp+32], m5
mova m4, m3
punpckhqdq m4, m4
mova [acq+48], m4
paddw m5, m4, [rsp+48]
mova [rsp+48], m5
lea yq, [yq+strideq]
add acq, 64
sub hd, 1
jg .w32_pad3
jmp .w32_wpad_done
.w32_pad2:
mova m0, [yq]
mova m1, m0
punpcklbw m1, m1
pmaddubsw m1, m2
mova [acq], m1
paddw m5, m1, [rsp]
mova [rsp ], m5
punpckhbw m0, m0
pmaddubsw m0, m2
mova [acq+16], m0
paddw m5, m0, [rsp+16]
mova [rsp+16], m5
mova m3, [yq+16]
punpcklbw m3, m3
pmaddubsw m3, m2
mova [acq+32], m3
paddw m5, m3, [rsp+32]
mova [rsp+32], m5
pshufhw m4, m3, q3333
punpckhqdq m4, m4
mova [acq+48], m4
paddw m5, m4, [rsp+48]
mova [rsp+48], m5
lea yq, [yq+strideq]
add acq, 64
sub hd, 1
jg .w32_pad2
jmp .w32_wpad_done
.w32_pad1:
mova m0, [yq]
mova m1, m0
punpcklbw m1, m1
pmaddubsw m1, m2
mova [acq], m1
paddw m5, m1, [rsp]
mova [rsp ], m5
punpckhbw m0, m0
pmaddubsw m0, m2
mova [acq+16], m0
paddw m5, m0, [rsp+16]
mova [rsp+16], m5
mova m4, [yq+16]
mova m3, m4
punpcklbw m3, m3
pmaddubsw m3, m2
mova [acq+32], m3
paddw m5, m3, [rsp+32]
mova [rsp+32], m5
punpckhbw m4, m4
punpcklqdq m4, m4
pshufhw m4, m4, q3333
pmaddubsw m4, m2
mova [acq+48], m4
paddw m5, m4, [rsp+48]
mova [rsp+48], m5
lea yq, [yq+strideq]
add acq, 64
sub hd, 1
jg .w32_pad1
.w32_wpad_done:
test hpadd, hpadd
jz .calc_avg_32
.w32_hpad_loop:
mova [acq], m1
mova [acq+16], m0
paddw m5, m1, [rsp]
mova [rsp ], m5
paddw m5, m0, [rsp+16]
mova [rsp+16], m5
mova [acq+32], m3
mova [acq+48], m4
paddw m5, m3, [rsp+32]
mova [rsp+32], m5
paddw m5, m4, [rsp+48]
mova [rsp+48], m5
add acq, 64
sub hpadd, 1
jg .w32_hpad_loop
%if ARCH_X86_64
DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
%else
DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
%endif
.calc_avg_32:
mova m5, [rsp]
mova m0, m5
psrld m5, 16
pslld m0, 16
psrld m0, 16
paddd m5, m0
mova m0, [rsp+16]
mova m3, m0
psrld m0, 16
pslld m3, 16
psrld m3, 16
paddd m0, m3
paddd m5, m0
mova m0, [rsp+32]
mova m3, m0
psrld m0, 16
pslld m3, 16
psrld m3, 16
paddd m0, m3
mova m1, [rsp+48]
mova m3, m1
psrld m1, 16
pslld m3, 16
psrld m3, 16
paddd m1, m3
paddd m1, m0
paddd m5, m1
.calc_avg:
movd szd, m6
psrad m6, 1
tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height);
paddd m5, m6
movd m1, r1d
pshufd m0, m5, q2301
paddd m0, m5
pshufd m5, m0, q1032
paddd m0, m5
psrad m0, m1 ; sum >>= log2sz;
packssdw m0, m0
RELOAD_ACQ_32 acq ; ac = ac_orig
.sub_loop:
mova m1, [acq]
psubw m1, m0
mova [acq], m1
add acq, 16
sub szd, 8
jg .sub_loop
RET
; BLEND mask, true, false
; Bitwise select: result = (true & mask) | (false & ~mask).
; %1 simd register that holds the mask and will hold the result
; %2 simd register that holds the "true" values; it is overwritten with
;    (true & mask), so the caller must not rely on its contents afterwards
; %3 location of the "false" values (simd register/memory); only read
%macro BLEND 3 ; mask, true, false
pand %2, %1
pandn %1, %3
por %1, %2
%endmacro
; PAETH top, ldiff
; Computes one vector of Paeth-predicted bytes and leaves it in m1.
; %1 number of the simd register holding the top pixels (only read)
; %2 ldiff, i.e. |top - topleft| per byte as the callers in this file set it
;    up; either a simd register number or a memory operand
; Fixed-register inputs (all only read here):
;   m3 = left pixels
;   m4 = byte mask applied to (top ^ left); presumably 0x01 in every byte so
;        that it isolates the rounding bit of pavgb - TODO confirm against
;        the ipred_paeth_shuf constant, which is defined outside this chunk
;   m5 = topleft pixel in every byte
; Clobbers m0 and m2.
%macro PAETH 2 ; top, ldiff
; m1 = rounded-up average of top and left
pavgb m1, m%1, m3
; m0 = (top ^ left) & m4
pxor m0, m%1, m3
pand m0, m4
; m2 = saturating (topleft - avg); m1 = saturating ((avg - m0) - topleft).
; At most one of the two is non-zero, so the por below yields an absolute
; difference.
psubusb m2, m5, m1
psubb m1, m0
psubusb m1, m5
por m1, m2
; double with unsigned saturation, then OR the masked bit back in
paddusb m1, m1
por m1, m0 ; min(tldiff, 255)
; m2 = |topleft - left| built from two saturating subtractions
psubusb m2, m5, m3
psubusb m0, m3, m5
por m2, m0 ; tdiff
%ifnum %2
; ldiff lives in a register: m2 = min(tdiff, ldiff), m0 = (ldiff == m2)
pminub m2, m%2
pcmpeqb m0, m%2, m2 ; ldiff <= tdiff
%else
; ldiff lives in memory: load it into m0 first, then same min/compare
mova m0, %2
pminub m2, m0
pcmpeqb m0, m2
%endif
; m1 = (min(tldiff, m2) == m2), i.e. min(ldiff, tdiff) <= tldiff
pminub m1, m2
pcmpeqb m1, m2 ; ldiff <= tldiff && tdiff <= tldiff
; BLEND destroys its "true" operand, so select from a copy of left
mova m2, m3
; m0 = (ldiff <= tdiff) ? left : top
BLEND m0, m2, m%1
; m1 = m1-mask ? m0 : topleft (m0 is consumed as the "true" operand)
BLEND m1, m0, m5
%endmacro
; ipred_paeth_8bpc(dst, stride, tl, w, h)
; Paeth intra prediction on byte pixels.
;   dst    - output pointer; rows are strideq bytes apart
;   tl     - pointer to the top-left edge pixel: [tl] is broadcast as topleft,
;            [tl+1...] is read as the top row, and the bytes below tl are read
;            as the left column (tlq is decremented as rows are produced)
;   w, h   - block size; tzcnt(w) indexes ipred_paeth_ssse3_table (defined
;            outside this chunk) to select the .w4 ... .w64 entry point
; Register roles throughout: m4 = PAETH mask constant, m5 = topleft in every
; byte, m6 = top pixels, m7 = ldiff (|topleft - top|), m3 = left pixels for
; the current row(s), m1 = PAETH result.
; Stack: 7*16 bytes requested through cglobal; .w32/.w64 use it as 16-byte
; slots for the top/ldiff vectors that do not fit in registers. (The meaning
; of the negative sign is defined by x86inc's cglobal - see its header.)
; NOTE(review): 16/32/64-wide rows are stored with mova, so dst rows are
; assumed to be 16-byte aligned - confirm against the callers.
cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h
%define base r5-ipred_paeth_ssse3_table
; wd = log2(w), used below as the jump-table index
tzcnt wd, wm
movifnidn hd, hm
; pshufb with an all-zero control copies byte 0 to every lane:
; m5 = topleft pixel broadcast
pxor m0, m0
movd m5, [tlq]
pshufb m5, m0
LEA r5, ipred_paeth_ssse3_table
; table entries are 32-bit offsets relative to the table itself
movsxd wq, [r5+wq*4]
; low 8 bytes of ipred_paeth_shuf duplicated into both halves -> PAETH mask
movddup m4, [base+ipred_paeth_shuf]
add wq, r5
jmp wq
; ---- 4 pixels wide: 4 rows per iteration, one row per dword of m1 ----
.w4:
movd m6, [tlq+1] ; top
; replicate the 4 top pixels into all four dwords
pshufd m6, m6, q0000
lea r3, [strideq*3]
; two saturating subtractions OR-ed together give |topleft - top|
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0 ; ldiff
.w4_loop:
; step 4 bytes down the left edge and fetch 4 left pixels
sub tlq, 4
movd m3, [tlq]
; the shuffle control is reloaded every iteration because PAETH overwrites m1
; (ipred_h_shuf is defined outside this chunk; presumably it spreads each of
; the 4 left pixels over one dword in row order - TODO confirm)
mova m1, [base+ipred_h_shuf]
pshufb m3, m1 ; left
PAETH 6, 7
; dword 0 -> row 0
movd [dstq ], m1
; swap the two low dwords: dword 1 -> row 1
pshuflw m0, m1, q1032
movd [dstq+strideq ], m0
; bring the high qword down: dword 2 -> row 2, then dword 3 -> row 3
punpckhqdq m1, m1
movd [dstq+strideq*2], m1
psrlq m1, 32
movd [dstq+r3 ], m1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w4_loop
RET
ALIGN function_align
; ---- 8 pixels wide: 2 rows per iteration, one row per qword of m1 ----
.w8:
; 8 top pixels duplicated into both halves
movddup m6, [tlq+1]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
.w8_loop:
; step 2 bytes down the left edge
sub tlq, 2
movd m3, [tlq]
; ipred_paeth_shuf is defined outside this chunk; presumably it broadcasts
; one left pixel per 8-byte half - TODO confirm
pshufb m3, [base+ipred_paeth_shuf]
PAETH 6, 7
; low half -> row 0, high half -> row 1
movq [dstq ], m1
movhps [dstq+strideq], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w8_loop
RET
ALIGN function_align
; ---- 16 pixels wide: 1 row per iteration ----
.w16:
movu m6, [tlq+1]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
.w16_loop:
sub tlq, 1
; movd loads 4 bytes but only byte 0 (this row's left pixel) is used:
; the zero shuffle control broadcasts it to all 16 lanes
movd m3, [tlq]
pxor m1, m1
pshufb m3, m1
PAETH 6, 7
mova [dstq], m1
add dstq, strideq
sub hd, 1
jg .w16_loop
RET
ALIGN function_align
; ---- 32 pixels wide: 1 row per iteration, two 16-byte halves ----
; Stack layout: [rsp] = top 0-15, [rsp+16] = ldiff 0-15, [rsp+32] = top 16-31.
; ldiff for pixels 16-31 stays in m7 for the whole loop.
.w32:
movu m6, [tlq+1]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
mova [rsp ], m6
mova [rsp+16], m7
movu m6, [tlq+17]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
mova [rsp+32], m6
.w32_loop:
dec tlq
; broadcast this row's left pixel (see .w16_loop)
movd m3, [tlq]
pxor m1, m1
pshufb m3, m1
; m6 is reloaded as the "top" operand for each half
mova m6, [rsp]
PAETH 6, [rsp+16]
mova [dstq ], m1
mova m6, [rsp+32]
PAETH 6, 7
mova [dstq+16], m1
add dstq, strideq
dec hd
jg .w32_loop
RET
ALIGN function_align
; ---- 64 pixels wide: 1 row per iteration, four 16-byte segments ----
; Stack layout: top/ldiff pairs at [rsp+0]/[rsp+16], [rsp+32]/[rsp+48],
; [rsp+64]/[rsp+80], then top 48-63 at [rsp+96]; the last ldiff stays in m7.
; This uses all seven 16-byte stack slots.
.w64:
movu m6, [tlq+1]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
mova [rsp ], m6
mova [rsp+16], m7
movu m6, [tlq+17]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
mova [rsp+32], m6
mova [rsp+48], m7
movu m6, [tlq+33]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
mova [rsp+64], m6
mova [rsp+80], m7
movu m6, [tlq+49]
psubusb m7, m5, m6
psubusb m0, m6, m5
por m7, m0
mova [rsp+96], m6
.w64_loop:
dec tlq
; broadcast this row's left pixel (see .w16_loop)
movd m3, [tlq]
pxor m1, m1
pshufb m3, m1
mova m6, [rsp]
PAETH 6, [rsp+16]
mova [dstq ], m1
mova m6, [rsp+32]
PAETH 6, [rsp+48]
mova [dstq+16], m1
mova m6, [rsp+64]
PAETH 6, [rsp+80]
mova [dstq+32], m1
mova m6, [rsp+96]
PAETH 6, 7
mova [dstq+48], m1
add dstq, strideq
dec hd
jg .w64_loop
RET
; FILTER dst, src, tmp, shuf
; Produces 8 filtered output bytes from one packed set of neighbour pixels.
; %1 number of the destination simd register; receives the 8 result bytes,
;    repeated in both 64-bit halves (packuswb of the register with itself)
; %2 number of the simd register holding the neighbour pixels; it is
;    overwritten in place by its shuffled form
; %3 number of a scratch simd register (clobbered)
; %4 shuffle control for pshufb: a simd register number or a memory operand
; Fixed inputs: m2, m3, m4, m5 hold the tap vectors that multiply the pixel
; pairs p0p1, p2p3, p4p5 and p6_ respectively; [base+pw_8] is added before
; the shift by 4 (presumably 8 in every word, i.e. the rounding term - the
; constant is defined outside this chunk). Requires "base" to be defined.
%macro FILTER 4 ;dst, src, tmp, shuf
%ifnum %4
pshufb m%2, m%4
%else
pshufb m%2, %4
%endif
; Each pshufd below broadcasts one dword of the shuffled pixels to all four
; dwords; pmaddubsw then multiplies the (unsigned) pixel bytes by the (signed)
; tap bytes and adds adjacent products into 8 words.
pshufd m%1, m%2, q0000 ;p0 p1
pmaddubsw m%1, m2
pshufd m%3, m%2, q1111 ;p2 p3
pmaddubsw m%3, m3
paddw m%1, [base+pw_8]
paddw m%1, m%3
pshufd m%3, m%2, q2222 ;p4 p5
pmaddubsw m%3, m4
paddw m%1, m%3
pshufd m%3, m%2, q3333 ;p6 __
pmaddubsw m%3, m5
paddw m%1, m%3
; arithmetic shift right by 4, then pack to bytes with unsigned saturation
psraw m%1, 4
packuswb m%1, m%1
%endmacro
; ipred_filter_8bpc(dst, stride, tl, w, h, filter)
; Filter intra prediction on byte pixels. Output is generated by the FILTER
; macro in units of 4 pixels x 2 rows (8 bytes: row 0 in the low dword, row 1
; in the next dword); every loop iteration below emits two rows.
;   dst    - output pointer; rows are strideq bytes apart
;   tl     - pointer to the top-left edge pixel; [tl+1...] is the top row and
;            the bytes below tl are the left column
;   w, h   - block size; tzcnt(w) indexes ipred_filter_ssse3_table (defined
;            outside this chunk) to select .w4 / .w8 / .w16 / .w32
;   filter - filter index; only its low byte is used, and each index selects
;            a 64-byte group (four 16-byte tap vectors) in filter_intra_taps
; Register roles: m2-m5 = tap vectors (loaded once), m0 = packed neighbours
; for the next FILTER call in the layout "_ 6 5 0 1 2 3 4" (see the lane
; comments), m6/m7 = block results and the rolling row buffer described at
; .w16_loop.
; NOTE(review): the 16/32-wide paths store and reload row 1 with mova, so dst
; rows are assumed to be 16-byte aligned - confirm against the callers.
cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter
; r6 holds the address of the section start ($$), so base+symbol addresses
; any constant in this section
%define base r6-$$
LEA r6, $$
; wd = log2(w), the jump-table index
tzcnt wd, wm
; zero-extend the low byte of the filter argument, whether it arrived in a
; register or has to be read from its memory slot
%ifidn filterd, filterm
movzx filterd, filterb
%else
movzx filterd, byte filterm
%endif
; 64 bytes of taps per filter index
shl filterd, 6
lea filterq, [base+filter_intra_taps+filterq]
movq m0, [tlq-3] ;_ 6 5 0 1 2 3 4
; table entries are 32-bit offsets relative to the table itself
movsxd wq, [base+ipred_filter_ssse3_table+wq*4]
mova m2, [filterq+16*0]
mova m3, [filterq+16*1]
mova m4, [filterq+16*2]
mova m5, [filterq+16*3]
lea wq, [base+ipred_filter_ssse3_table+wq]
mov hd, hm
jmp wq
; ---- 4 pixels wide: one 4x2 block per iteration ----
.w4:
; the shuffle control can stay in m1 because FILTER uses m7 as scratch here
mova m1, [base+filter_shuf1]
; Bias tlq by -h so that [tlq+hq] walks down the left edge as hd counts
; down. hd has already been reduced by 2 when .w4_loop first runs, so the
; first reload reads from (original tl) - 5.
sub tlq, 3
sub tlq, hq
jmp .w4_loop_start
.w4_loop:
; low dword = next left-edge bytes, second dword = row 1 of the previous
; block (the new "top" pixels), giving the "_ 6 5 0 1 2 3 4" layout again
movd m0, [tlq+hq]
punpckldq m0, m6
lea dstq, [dstq+strideq*2]
.w4_loop_start:
FILTER 6, 0, 7, 1
movd [dstq+strideq*0], m6
; swap the two low dwords so row 1 can be stored and is in place for the
; punpckldq at the top of the loop
pshuflw m6, m6, q1032
movd [dstq+strideq*1], m6
sub hd, 2
jg .w4_loop
RET
ALIGN function_align
; ---- 8 pixels wide: two 4x2 blocks per iteration ----
.w8:
movq m6, [tlq+1] ;_ _ _ 0 1 2 3 4
; The bias is 5 rather than 3 because the reload at the bottom of the loop
; happens before hd is decremented.
sub tlq, 5
sub tlq, hq
.w8_loop:
; left block -> m7
FILTER 7, 0, 1, [base+filter_shuf1]
; low qword = top row, high qword = left block; filter_shuf2 picks the right
; block's neighbours out of this combination
punpcklqdq m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
; right block -> m0
FILTER 0, 6, 1, [base+filter_shuf2]
; interleave dwords of the two blocks: low qword = full row 0,
; high qword = full row 1
punpckldq m6, m7, m0
movq [dstq+strideq*0], m6
punpckhqdq m6, m6
movq [dstq+strideq*1], m6
; m6 now holds row 1 in its low qword and acts as the next top row
movd m0, [tlq+hq] ;_ 6 5 0
punpckldq m0, m6 ;_ 6 5 0 1 2 3 4
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w8_loop
RET
ALIGN function_align
; ---- 16 pixels wide: four 4x2 blocks per iteration ----
.w16:
movu m6, [tlq+1] ;top row
sub tlq, 5
sub tlq, hq
; m6/m7 alternate as a rolling buffer. After each block, row 0 is stored
; directly, row 1 is moved to the low dword (psrlq 32), and palignr by 4
; bytes shifts the previous buffer right by 4 pixels while appending the
; new row-1 pixels at the top. The low qword therefore always exposes the
; top pixels needed by the next block, and after the fourth block the
; buffer holds the complete 16-pixel row 1.
.w16_loop:
FILTER 7, 0, 1, [base+filter_shuf1]
punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
movd [dstq+strideq*0], m7
psrlq m7, 32
palignr m7, m6, 4
FILTER 6, 0, 1, [base+filter_shuf2]
punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
movd [dstq+4+strideq*0], m6
psrlq m6, 32
palignr m6, m7, 4
FILTER 7, 0, 1, [base+filter_shuf2]
punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
movd [dstq+8+strideq*0], m7
psrlq m7, 32
palignr m7, m6, 4
FILTER 6, 0, 1, [base+filter_shuf2]
movd [dstq+12+strideq*0], m6
psrlq m6, 32
palignr m6, m7, 4
; m6 = complete row 1; it is also the top row for the next iteration
mova [dstq+strideq*1], m6
movd m0, [tlq+hq] ;_ 6 5 0
punpckldq m0, m6 ;_ 6 5 0 1 2 3 4
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w16_loop
RET
ALIGN function_align
; ---- 32 pixels wide: eight 4x2 blocks per iteration ----
.w32:
movu m6, [tlq+1] ;top row
; The taps are already in m2-m5, so filterq is reused as a pointer to the
; 16 "top" pixels of the right half: first the edge row, later row 1 of the
; previous iteration inside dst.
lea filterq, [tlq+17]
sub tlq, 5
sub tlq, hq
.w32_loop:
; left half: same rolling-buffer scheme as .w16_loop
FILTER 7, 0, 1, [base+filter_shuf1]
punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
movd [dstq+strideq*0], m7
psrlq m7, 32
palignr m7, m6, 4
FILTER 6, 0, 1, [base+filter_shuf2]
punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
movd [dstq+4+strideq*0], m6
psrlq m6, 32
palignr m6, m7, 4
FILTER 7, 0, 1, [base+filter_shuf2]
punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
movd [dstq+8+strideq*0], m7
psrlq m7, 32
palignr m7, m6, 4
FILTER 6, 0, 1, [base+filter_shuf2]
; m1 was FILTER's scratch until here; now load the right half's top pixels
movu m1, [filterq]
; splice the last 4 top pixels of the left half (low dword of m7) with the
; first 4 of the right half to form the neighbours of block 4
punpckldq m0, m7, m1 ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _
punpcklqdq m0, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
movd [dstq+12+strideq*0], m6
psrlq m6, 32
palignr m6, m7, 4
; row 1, pixels 0-15
mova [dstq+strideq*1], m6
; NOTE(review): this copy looks redundant - the FILTER call below uses m6 as
; its scratch register and overwrites it before reading it. Confirm before
; removing.
mova m6, m1
; scratch is m6 (not m1) here because m1 still holds the right-half top row
FILTER 7, 0, 6, [base+filter_shuf2]
punpcklqdq m0, m1, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
movd [dstq+16+strideq*0], m7
psrlq m7, 32
palignr m7, m1, 4
FILTER 6, 0, 1, [base+filter_shuf2]
punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
movd [dstq+20+strideq*0], m6
psrlq m6, 32
palignr m6, m7, 4
FILTER 7, 0, 1, [base+filter_shuf2]
punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
movd [dstq+24+strideq*0], m7
psrlq m7, 32
palignr m7, m6, 4
FILTER 6, 0, 1, [base+filter_shuf2]
movd [dstq+28+strideq*0], m6
psrlq m6, 32
palignr m6, m7, 4
; row 1, pixels 16-31
mova [dstq+16+strideq*1], m6
; reload row 1 pixels 0-15 from dst as the next iteration's top row
mova m6, [dstq+strideq*1]
movd m0, [tlq+hq] ;_ 6 5 0
punpckldq m0, m6 ;_ 6 5 0 1 2 3 4
; the next iteration reads the right half's top pixels from the row just
; written
lea filterq, [dstq+16+strideq*1]
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w32_loop
RET