; subpel_variance_ssse3.asm
; mozilla-central/third_party/aom/aom_dsp/x86/subpel_variance_ssse3.asm

; Copyright (c) 2016, Alliance for Open Media. All rights reserved.

; This source code is subject to the terms of the BSD 2 Clause License and

; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License

; was not distributed with this source code in the LICENSE file, you can

; obtain it at www.aomedia.org/license/software. If the Alliance for Open

; Media Patent License 1.0 was not distributed with this source code in the

; PATENTS file, you can obtain it at www.aomedia.org/license/patent.

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA

; Rounding bias for the bilinear filters below. The two taps of every filter
; sum to 16, so 8 (half of 16) is added to each 16-bit lane before the
; `psraw 4` that normalizes the filtered value.
pw_8:
  times 8 dw 8

; Bilinear tap table laid out for pmaddubsw. There is one 16-byte row per
; 1/8-pel offset k (k = 0..7); each row is the byte pair (16 - 2k, 2k)
; repeated 8 times, matching source bytes that were interleaved with
; punpck{l,h}bw. A row is addressed as offset << filter_idx_shift, i.e.
; offset * 16 bytes from the start of the table.
bilin_filter_m_ssse3:
  times 8 db 16,  0                     ; k = 0 (no interpolation)
  times 8 db 14,  2                     ; k = 1
  times 8 db 12,  4                     ; k = 2
  times 8 db 10,  6                     ; k = 3
  times 8 db  8,  8                     ; k = 4 (half-pel)
  times 8 db  6, 10                     ; k = 5
  times 8 db  4, 12                     ; k = 6
  times 8 db  2, 14                     ; k = 7

SECTION .text

; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,

;                               int x_offset, int y_offset,

;                               const uint8_t *dst, ptrdiff_t dst_stride,

;                               int height, unsigned int *sse);

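; This function returns the sum of errors (SE), i.e. the signed sum of the
; (filtered src - dst) differences, and stores the sum of squared errors (SSE)
; through the given sse pointer.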

; SUM_SSE src1, dst1, src2, dst2, sum, sse
;
; Accumulates the differences (src - dst) of two vectors of 16-bit samples
; into a running sum and a running sum of squares.
;
;   %1, %3  source words; overwritten (first with the difference, then with
;           the pairwise-added squares of the difference)
;   %2, %4  destination words; only read
;   %5      running sum, 16-bit lanes (paddw, so each lane wraps on overflow;
;           SUBPEL_VARIANCE asserts W <= 16 because of this)
;   %6      running sum of squares, 32-bit lanes
;
; The two (src, dst) pairs form independent dependency chains whose
; instructions are interleaved below.
%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse

  psubw                %3, %4           ; %3 = src2 - dst2

  psubw                %1, %2           ; %1 = src1 - dst1

  paddw                %5, %3           ; sum += diff2 (before %3 is squared)

  pmaddwd              %3, %3           ; %3 = diff2^2, adjacent words added -> dwords

  paddw                %5, %1           ; sum += diff1 (before %1 is squared)

  pmaddwd              %1, %1           ; %1 = diff1^2, adjacent words added -> dwords

  paddd                %6, %3           ; sse += squares of diff2

  paddd                %6, %1           ; sse += squares of diff1

%endmacro

; STORE_AND_RET width
;
; Shared function epilogue. Horizontally reduces the two accumulators built
; up by SUM_SSE and returns:
;
;   m6  per-lane 16-bit sums        -> total returned in eax
;   m7  per-lane 32-bit sums of sq. -> total stored through the sse pointer
;   m5  must be zero on entry; it is turned into the sign mask of m6
;
; m3 and m4 are scratch, and r1 is overwritten with the sse pointer. The sse
; and sum reductions touch disjoint registers, so they are written as two
; separate phases.
%macro STORE_AND_RET 1
%if %1 <= 4 ; 4xh: only the low 64 bits of m6/m7 carry data

  ; ---- sse: add the two low dwords of m7 and store the result ----
  pshuflw              m3, m7, 0xe      ; m3[dword 0] = m7[dword 1]
  paddd                m7, m3
  mov                  r1, ssem         ; r1 = unsigned int *sse
  movd               [r1], m7           ; store sse

  ; ---- sum: 4 words -> 2 words -> sign-extend -> 1 dword ----
  pshuflw              m4, m6, 0xe      ; m4[words 0,1] = m6[words 2,3]
  paddw                m6, m4
  pcmpgtw              m5, m6           ; mask for 0 > x
  punpcklwd            m6, m5           ; sign-extend m6 word->dword
  pshuflw              m4, m6, 0xe      ; m4[dword 0] = m6[dword 1]
  paddd                m6, m4
  movd               raxd, m6           ; store sum as return value

%else ; width > 4: all 128 bits of m6/m7 carry data

  ; ---- sse: 4 dwords -> 2 dwords -> 1 dword, then store ----
  movhlps              m3, m7           ; m3[low qword] = m7[high qword]
  paddd                m7, m3
  pshufd               m3, m7, 0x1      ; m3[dword 0] = m7[dword 1]
  paddd                m7, m3
  mov                  r1, ssem         ; r1 = unsigned int *sse
  movd               [r1], m7           ; store sse

  ; ---- sum ----
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputting to a dword.
  pcmpgtw              m5, m6           ; mask for 0 > x
  punpcklwd            m4, m6, m5       ; m4 = low 4 words of m6 as dwords
  punpckhwd            m6, m5           ; sign-extend m6 word->dword
  paddd                m6, m4           ; 8 dwords -> 4 dwords
  movhlps              m4, m6           ; m4[low qword] = m6[high qword]
  paddd                m6, m4           ; 4 dwords -> 2 dwords
  pshufd               m4, m6, 0x1      ; m4[dword 0] = m6[dword 1]
  paddd                m6, m4           ; 2 dwords -> 1 dword
  movd               raxd, m6           ; store sum as return value

%endif
  RET
%endmacro

; INC_SRC_BY_SRC_STRIDE
;
; Advances srcq by one source row. Every build except 32-bit x86 with PIC
; adds the stride from its register (src_strideq); 32-bit PIC builds add it
; from the src_stridemp operand instead.
; NOTE(review): presumably the stride register is not reliably live in 32-bit
; PIC builds because it doubles as a scratch register there - confirm against
; the callers of this macro.
%macro INC_SRC_BY_SRC_STRIDE 0
%if AOM_ARCH_X86 != 1 || CONFIG_PIC != 1
  add                srcq, src_strideq  ; stride taken from its register
%else
  add                srcq, src_stridemp ; stride taken via the "mp" operand
%endif
%endmacro

%macro SUBPEL_VARIANCE 1-2 0 ; W

%if cpuflag(ssse3)

%define bilin_filter_m bilin_filter_m_ssse3

%define filter_idx_shift 4

%endif

; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses

; 11, not 13, if the registers are ordered correctly. May make a minor speed

; difference on Win64

%if AOM_ARCH_X86_64

  %if %2 == 1 ; avg

    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \

                                        x_offset, y_offset, dst, dst_stride, \

                                        sec, sec_stride, height, sse

    %define sec_str sec_strideq

  %else

    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \

                                    x_offset, y_offset, dst, dst_stride, \

                                    height, sse

  %endif

  %define block_height heightd

  %define bilin_filter sseq

%else

  %if CONFIG_PIC=1

    %if %2 == 1 ; avg

      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \

                                          x_offset, y_offset, dst, dst_stride, \

                                          sec, sec_stride, height, sse

      %define block_height dword heightm

      %define sec_str sec_stridemp

    %else

      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \

                                      x_offset, y_offset, dst, dst_stride, \

                                      height, sse

      %define block_height heightd

    %endif

    ; reuse argument stack space

    %define g_bilin_filterm x_offsetm

    %define g_pw_8m y_offsetm

    ; Store the addresses of bilin_filter_m and pw_8 in the reused argument
    ; stack slots defined above.

    %if GET_GOT_DEFINED == 1

      GET_GOT eax

      add esp, 4                ; restore esp

    %endif

    lea ecx, [GLOBAL(bilin_filter_m)]

    mov g_bilin_filterm, ecx

    lea ecx, [GLOBAL(pw_8)]

    mov g_pw_8m, ecx

    LOAD_IF_USED 0, 1         ; load eax, ecx back

  %else

    %if %2 == 1 ; avg

      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \

                                          x_offset, y_offset, \

                                          dst, dst_stride, sec, sec_stride, \

                                          height, sse

      %define block_height dword heightm

      %define sec_str sec_stridemp

    %else

      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \

                                      x_offset, y_offset, dst, dst_stride, \

                                      height, sse

      %define block_height heightd

    %endif

    %define bilin_filter bilin_filter_m

  %endif

%endif

%if %1 == 4

  %define movx movd

%else

  %define movx movh

%endif

  ASSERT               %1 <= 16         ; m6 overflows if w > 16

  pxor                 m6, m6           ; sum

  pxor                 m7, m7           ; sse

  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we

  ; could perhaps use it for something more productive then

  pxor                 m5, m5           ; dedicated zero register

%if %1 < 16

  sar                   block_height, 1

%if %2 == 1 ; avg

  shl             sec_str, 1

%endif

%endif

  ; FIXME(rbultje) replace by jumptable?

  test          x_offsetd, x_offsetd

  jnz .x_nonzero

  ; x_offset == 0

  test          y_offsetd, y_offsetd

  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0

.x_zero_y_zero_loop:

%if %1 == 16

  movu                 m0, [srcq]

  mova                 m1, [dstq]

%if %2 == 1 ; avg

  pavgb                m0, [secq]

  punpckhbw            m3, m1, m5

  punpcklbw            m1, m5

%endif

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

%if %2 == 0 ; !avg

  punpckhbw            m3, m1, m5

  punpcklbw            m1, m5

%endif

  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq

  add                dstq, dst_strideq

%else ; %1 < 16

  movx                 m0, [srcq]

%if %2 == 1 ; avg

%if %1 > 4

  movhps               m0, [srcq+src_strideq]

%else ; 4xh

  movx                 m1, [srcq+src_strideq]

  punpckldq            m0, m1

%endif

%else ; !avg

  movx                 m2, [srcq+src_strideq]

%endif

  movx                 m1, [dstq]

  movx                 m3, [dstq+dst_strideq]

%if %2 == 1 ; avg

%if %1 > 4

  pavgb                m0, [secq]

%else

  movh                 m2, [secq]

  pavgb                m0, m2

%endif

  punpcklbw            m3, m5

  punpcklbw            m1, m5

%if %1 > 4

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

%else ; 4xh

  punpcklbw            m0, m5

  movhlps              m2, m0

%endif

%else ; !avg

  punpcklbw            m0, m5

  punpcklbw            m2, m5

  punpcklbw            m3, m5

  punpcklbw            m1, m5

%endif

  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]

  lea                dstq, [dstq+dst_strideq*2]

%endif

%if %2 == 1 ; avg

  add                secq, sec_str

%endif

  dec                   block_height

  jg .x_zero_y_zero_loop

  STORE_AND_RET %1

.x_zero_y_nonzero:

  cmp           y_offsetd, 4

  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5

.x_zero_y_half_loop:

%if %1 == 16

  movu                 m0, [srcq]

  movu                 m4, [srcq+src_strideq]

  mova                 m1, [dstq]

  pavgb                m0, m4

  punpckhbw            m3, m1, m5

%if %2 == 1 ; avg

  pavgb                m0, [secq]

%endif

  punpcklbw            m1, m5

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq

  add                dstq, dst_strideq

%else ; %1 < 16

  movx                 m0, [srcq]

  movx                 m2, [srcq+src_strideq]

%if %2 == 1 ; avg

%if %1 > 4

  movhps               m2, [srcq+src_strideq*2]

%else ; 4xh

  movx                 m1, [srcq+src_strideq*2]

  punpckldq            m2, m1

%endif

  movx                 m1, [dstq]

%if %1 > 4

  movlhps              m0, m2

%else ; 4xh

  punpckldq            m0, m2

%endif

  movx                 m3, [dstq+dst_strideq]

  pavgb                m0, m2

  punpcklbw            m1, m5

%if %1 > 4

  pavgb                m0, [secq]

  punpcklbw            m3, m5

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

%else ; 4xh

  movh                 m4, [secq]

  pavgb                m0, m4

  punpcklbw            m3, m5

  punpcklbw            m0, m5

  movhlps              m2, m0

%endif

%else ; !avg

  movx                 m4, [srcq+src_strideq*2]

  movx                 m1, [dstq]

  pavgb                m0, m2

  movx                 m3, [dstq+dst_strideq]

  pavgb                m2, m4

  punpcklbw            m0, m5

  punpcklbw            m2, m5

  punpcklbw            m3, m5

  punpcklbw            m1, m5

%endif

  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]

  lea                dstq, [dstq+dst_strideq*2]

%endif

%if %2 == 1 ; avg

  add                secq, sec_str

%endif

  dec                   block_height

  jg .x_zero_y_half_loop

  STORE_AND_RET %1

.x_zero_y_nonhalf:

  ; x_offset == 0 && y_offset == bilin interpolation

%if AOM_ARCH_X86_64

  lea        bilin_filter, [GLOBAL(bilin_filter_m)]

%endif

  shl           y_offsetd, filter_idx_shift

%if AOM_ARCH_X86_64 && %1 > 4

  mova                 m8, [bilin_filter+y_offsetq]

%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64

  mova                 m9, [bilin_filter+y_offsetq+16]

%endif

  mova                m10, [GLOBAL(pw_8)]

%define filter_y_a m8

%define filter_y_b m9

%define filter_rnd m10

%else ; x86-32 or mmx

%if AOM_ARCH_X86=1 && CONFIG_PIC=1

; x_offset == 0, reuse x_offset reg

%define tempq x_offsetq

  add y_offsetq, g_bilin_filterm

%define filter_y_a [y_offsetq]

%define filter_y_b [y_offsetq+16]

  mov tempq, g_pw_8m

%define filter_rnd [tempq]

%else

  add           y_offsetq, bilin_filter

%define filter_y_a [y_offsetq]

%define filter_y_b [y_offsetq+16]

%define filter_rnd [GLOBAL(pw_8)]

%endif

%endif

.x_zero_y_other_loop:

%if %1 == 16

  movu                 m0, [srcq]

  movu                 m4, [srcq+src_strideq]

  mova                 m1, [dstq]

%if cpuflag(ssse3)

  punpckhbw            m2, m0, m4

  punpcklbw            m0, m4

  pmaddubsw            m2, filter_y_a

  pmaddubsw            m0, filter_y_a

  paddw                m2, filter_rnd

  paddw                m0, filter_rnd

%else

  punpckhbw            m2, m0, m5

  punpckhbw            m3, m4, m5

  punpcklbw            m0, m5

  punpcklbw            m4, m5

  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can

  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of

  ; instructions is the same (5), but it is 1 mul instead of 2, so might be

  ; slightly faster because of pmullw latency. It would also cut our rodata

  ; tables in half for this function, and save 1-2 registers on x86-64.

  pmullw               m2, filter_y_a

  pmullw               m3, filter_y_b

  paddw                m2, filter_rnd

  pmullw               m0, filter_y_a

  pmullw               m4, filter_y_b

  paddw                m0, filter_rnd

  paddw                m2, m3

  paddw                m0, m4

%endif

  psraw                m2, 4

  psraw                m0, 4

%if %2 == 1 ; avg

  ; FIXME(rbultje) pipeline

  packuswb             m0, m2

  pavgb                m0, [secq]

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

%endif

  punpckhbw            m3, m1, m5

  punpcklbw            m1, m5

  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq

  add                dstq, dst_strideq

%else ; %1 < 16

  movx                 m0, [srcq]

  movx                 m2, [srcq+src_strideq]

  movx                 m4, [srcq+src_strideq*2]

  movx                 m3, [dstq+dst_strideq]

%if cpuflag(ssse3)

  movx                 m1, [dstq]

  punpcklbw            m0, m2

  punpcklbw            m2, m4

  pmaddubsw            m0, filter_y_a

  pmaddubsw            m2, filter_y_a

  punpcklbw            m3, m5

  paddw                m2, filter_rnd

  paddw                m0, filter_rnd

%else

  punpcklbw            m0, m5

  punpcklbw            m2, m5

  punpcklbw            m4, m5

  pmullw               m0, filter_y_a

  pmullw               m1, m2, filter_y_b

  punpcklbw            m3, m5

  paddw                m0, filter_rnd

  pmullw               m2, filter_y_a

  pmullw               m4, filter_y_b

  paddw                m0, m1

  paddw                m2, filter_rnd

  movx                 m1, [dstq]

  paddw                m2, m4

%endif

  psraw                m0, 4

  psraw                m2, 4

%if %2 == 1 ; avg

  ; FIXME(rbultje) pipeline

%if %1 == 4

  movlhps              m0, m2

%endif

  packuswb             m0, m2

%if %1 > 4

  pavgb                m0, [secq]

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

%else ; 4xh

  movh                 m2, [secq]

  pavgb                m0, m2

  punpcklbw            m0, m5

  movhlps              m2, m0

%endif

%endif

  punpcklbw            m1, m5

  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]

  lea                dstq, [dstq+dst_strideq*2]

%endif

%if %2 == 1 ; avg

  add                secq, sec_str

%endif

  dec                   block_height

  jg .x_zero_y_other_loop

%undef filter_y_a

%undef filter_y_b

%undef filter_rnd

  STORE_AND_RET %1

.x_nonzero:

  cmp           x_offsetd, 4

  jne .x_nonhalf

  ; x_offset == 0.5

  test          y_offsetd, y_offsetd

  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0

.x_half_y_zero_loop:

%if %1 == 16

  movu                 m0, [srcq]

  movu                 m4, [srcq+1]

  mova                 m1, [dstq]

  pavgb                m0, m4

  punpckhbw            m3, m1, m5

%if %2 == 1 ; avg

  pavgb                m0, [secq]

%endif

  punpcklbw            m1, m5

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq

  add                dstq, dst_strideq

%else ; %1 < 16

  movx                 m0, [srcq]

  movx                 m4, [srcq+1]

%if %2 == 1 ; avg

%if %1 > 4

  movhps               m0, [srcq+src_strideq]

  movhps               m4, [srcq+src_strideq+1]

%else ; 4xh

  movx                 m1, [srcq+src_strideq]

  punpckldq            m0, m1

  movx                 m2, [srcq+src_strideq+1]

  punpckldq            m4, m2

%endif

  movx                 m1, [dstq]

  movx                 m3, [dstq+dst_strideq]

  pavgb                m0, m4

  punpcklbw            m3, m5

%if %1 > 4

  pavgb                m0, [secq]

  punpcklbw            m1, m5

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

%else ; 4xh

  movh                 m2, [secq]

  pavgb                m0, m2

  punpcklbw            m1, m5

  punpcklbw            m0, m5

  movhlps              m2, m0

%endif

%else ; !avg

  movx                 m2, [srcq+src_strideq]

  movx                 m1, [dstq]

  pavgb                m0, m4

  movx                 m4, [srcq+src_strideq+1]

  movx                 m3, [dstq+dst_strideq]

  pavgb                m2, m4

  punpcklbw            m0, m5

  punpcklbw            m2, m5

  punpcklbw            m3, m5

  punpcklbw            m1, m5

%endif

  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]

  lea                dstq, [dstq+dst_strideq*2]

%endif

%if %2 == 1 ; avg

  add                secq, sec_str

%endif

  dec                   block_height

  jg .x_half_y_zero_loop

  STORE_AND_RET %1

.x_half_y_nonzero:

  cmp           y_offsetd, 4

  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5

%if %1 == 16

  movu                 m0, [srcq]

  movu                 m3, [srcq+1]

  add                srcq, src_strideq

  pavgb                m0, m3

.x_half_y_half_loop:

  movu                 m4, [srcq]

  movu                 m3, [srcq+1]

  mova                 m1, [dstq]

  pavgb                m4, m3

  punpckhbw            m3, m1, m5

  pavgb                m0, m4

%if %2 == 1 ; avg

  punpcklbw            m1, m5

  pavgb                m0, [secq]

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

%else

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

  punpcklbw            m1, m5

%endif

  SUM_SSE              m0, m1, m2, m3, m6, m7

  mova                 m0, m4

  add                srcq, src_strideq

  add                dstq, dst_strideq

%else ; %1 < 16

  movx                 m0, [srcq]

  movx                 m3, [srcq+1]

  add                srcq, src_strideq

  pavgb                m0, m3

.x_half_y_half_loop:

  movx                 m2, [srcq]

  movx                 m3, [srcq+1]

%if %2 == 1 ; avg

%if %1 > 4

  movhps               m2, [srcq+src_strideq]

  movhps               m3, [srcq+src_strideq+1]

%else

  movx                 m1, [srcq+src_strideq]

  punpckldq            m2, m1

  movx                 m1, [srcq+src_strideq+1]

  punpckldq            m3, m1

%endif

  pavgb                m2, m3

%if %1 > 4

  movlhps              m0, m2

  movhlps              m4, m2

%else ; 4xh

  punpckldq            m0, m2

  pshuflw              m4, m2, 0xe

%endif

  movx                 m1, [dstq]

  pavgb                m0, m2

  movx                 m3, [dstq+dst_strideq]

%if %1 > 4

  pavgb                m0, [secq]

%else

  movh                 m2, [secq]

  pavgb                m0, m2

%endif

  punpcklbw            m3, m5

  punpcklbw            m1, m5

%if %1 > 4

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

%else

  punpcklbw            m0, m5

  movhlps              m2, m0

%endif

%else ; !avg

  movx                 m4, [srcq+src_strideq]

  movx                 m1, [srcq+src_strideq+1]

  pavgb                m2, m3

  pavgb                m4, m1

  pavgb                m0, m2

  pavgb                m2, m4

  movx                 m1, [dstq]

  movx                 m3, [dstq+dst_strideq]

  punpcklbw            m0, m5

  punpcklbw            m2, m5

  punpcklbw            m3, m5

  punpcklbw            m1, m5

%endif

  SUM_SSE              m0, m1, m2, m3, m6, m7

  mova                 m0, m4

  lea                srcq, [srcq+src_strideq*2]

  lea                dstq, [dstq+dst_strideq*2]

%endif

%if %2 == 1 ; avg

  add                secq, sec_str

%endif

  dec                   block_height

  jg .x_half_y_half_loop

  STORE_AND_RET %1

.x_half_y_nonhalf:

  ; x_offset == 0.5 && y_offset == bilin interpolation

%if AOM_ARCH_X86_64

  lea        bilin_filter, [GLOBAL(bilin_filter_m)]

%endif

  shl           y_offsetd, filter_idx_shift

%if AOM_ARCH_X86_64 && %1 > 4

  mova                 m8, [bilin_filter+y_offsetq]

%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64

  mova                 m9, [bilin_filter+y_offsetq+16]

%endif

  mova                m10, [GLOBAL(pw_8)]

%define filter_y_a m8

%define filter_y_b m9

%define filter_rnd m10

%else  ;x86_32

%if AOM_ARCH_X86=1 && CONFIG_PIC=1

; x_offset == 0.5. We can reuse x_offset reg

%define tempq x_offsetq

  add y_offsetq, g_bilin_filterm

%define filter_y_a [y_offsetq]

%define filter_y_b [y_offsetq+16]

  mov tempq, g_pw_8m

%define filter_rnd [tempq]

%else

  add           y_offsetq, bilin_filter

%define filter_y_a [y_offsetq]

%define filter_y_b [y_offsetq+16]

%define filter_rnd [GLOBAL(pw_8)]

%endif

%endif

%if %1 == 16

  movu                 m0, [srcq]

  movu                 m3, [srcq+1]

  add                srcq, src_strideq

  pavgb                m0, m3

.x_half_y_other_loop:

  movu                 m4, [srcq]

  movu                 m2, [srcq+1]

  mova                 m1, [dstq]

  pavgb                m4, m2

%if cpuflag(ssse3)

  punpckhbw            m2, m0, m4

  punpcklbw            m0, m4

  pmaddubsw            m2, filter_y_a

  pmaddubsw            m0, filter_y_a

  paddw                m2, filter_rnd

  paddw                m0, filter_rnd

  psraw                m2, 4

%else

  punpckhbw            m2, m0, m5

  punpckhbw            m3, m4, m5

  pmullw               m2, filter_y_a

  pmullw               m3, filter_y_b

  paddw                m2, filter_rnd

  punpcklbw            m0, m5

  paddw                m2, m3

  punpcklbw            m3, m4, m5

  pmullw               m0, filter_y_a

  pmullw               m3, filter_y_b

  paddw                m0, filter_rnd

  psraw                m2, 4

  paddw                m0, m3

%endif

  punpckhbw            m3, m1, m5

  psraw                m0, 4

%if %2 == 1 ; avg

  ; FIXME(rbultje) pipeline

  packuswb             m0, m2

  pavgb                m0, [secq]

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

%endif

  punpcklbw            m1, m5

  SUM_SSE              m0, m1, m2, m3, m6, m7

  mova                 m0, m4

  add                srcq, src_strideq

  add                dstq, dst_strideq

%else ; %1 < 16

  movx                 m0, [srcq]

  movx                 m3, [srcq+1]

  add                srcq, src_strideq

  pavgb                m0, m3

%if notcpuflag(ssse3)

  punpcklbw            m0, m5

%endif

.x_half_y_other_loop:

  movx                 m2, [srcq]

  movx                 m1, [srcq+1]

  movx                 m4, [srcq+src_strideq]

  movx                 m3, [srcq+src_strideq+1]

  pavgb                m2, m1

  pavgb                m4, m3

  movx                 m3, [dstq+dst_strideq]

%if cpuflag(ssse3)

  movx                 m1, [dstq]

  punpcklbw            m0, m2

  punpcklbw            m2, m4

  pmaddubsw            m0, filter_y_a

  pmaddubsw            m2, filter_y_a

  punpcklbw            m3, m5

  paddw                m0, filter_rnd

  paddw                m2, filter_rnd

%else

  punpcklbw            m2, m5

  punpcklbw            m4, m5

  pmullw               m0, filter_y_a

  pmullw               m1, m2, filter_y_b

  punpcklbw            m3, m5

  paddw                m0, filter_rnd

  pmullw               m2, filter_y_a

  paddw                m0, m1

  pmullw               m1, m4, filter_y_b

  paddw                m2, filter_rnd

  paddw                m2, m1

  movx                 m1, [dstq]

%endif

  psraw                m0, 4

  psraw                m2, 4

%if %2 == 1 ; avg

  ; FIXME(rbultje) pipeline

%if %1 == 4

  movlhps              m0, m2

%endif

  packuswb             m0, m2

%if %1 > 4

  pavgb                m0, [secq]

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

%else

  movh                 m2, [secq]

  pavgb                m0, m2

  punpcklbw            m0, m5

  movhlps              m2, m0

%endif

%endif

  punpcklbw            m1, m5

  SUM_SSE              m0, m1, m2, m3, m6, m7

  mova                 m0, m4

  lea                srcq, [srcq+src_strideq*2]

  lea                dstq, [dstq+dst_strideq*2]

%endif

%if %2 == 1 ; avg

  add                secq, sec_str

%endif

  dec                   block_height

  jg .x_half_y_other_loop

%undef filter_y_a

%undef filter_y_b

%undef filter_rnd

  STORE_AND_RET %1

.x_nonhalf:

  test          y_offsetd, y_offsetd

  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0

%if AOM_ARCH_X86_64

  lea        bilin_filter, [GLOBAL(bilin_filter_m)]

%endif

  shl           x_offsetd, filter_idx_shift

%if AOM_ARCH_X86_64 && %1 > 4

  mova                 m8, [bilin_filter+x_offsetq]

%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64

  mova                 m9, [bilin_filter+x_offsetq+16]

%endif

  mova                m10, [GLOBAL(pw_8)]

%define filter_x_a m8

%define filter_x_b m9

%define filter_rnd m10

%else    ; x86-32

%if AOM_ARCH_X86=1 && CONFIG_PIC=1

;y_offset == 0. We can reuse y_offset reg.

%define tempq y_offsetq

  add x_offsetq, g_bilin_filterm

%define filter_x_a [x_offsetq]

%define filter_x_b [x_offsetq+16]

  mov tempq, g_pw_8m

%define filter_rnd [tempq]

%else

  add           x_offsetq, bilin_filter

%define filter_x_a [x_offsetq]

%define filter_x_b [x_offsetq+16]

%define filter_rnd [GLOBAL(pw_8)]

%endif

%endif

.x_other_y_zero_loop:

%if %1 == 16

  movu                 m0, [srcq]

  movu                 m4, [srcq+1]

  mova                 m1, [dstq]

%if cpuflag(ssse3)

  punpckhbw            m2, m0, m4

  punpcklbw            m0, m4

  pmaddubsw            m2, filter_x_a

  pmaddubsw            m0, filter_x_a

  paddw                m2, filter_rnd

  paddw                m0, filter_rnd

%else

  punpckhbw            m2, m0, m5

  punpckhbw            m3, m4, m5

  punpcklbw            m0, m5

  punpcklbw            m4, m5

  pmullw               m2, filter_x_a

  pmullw               m3, filter_x_b

  paddw                m2, filter_rnd

  pmullw               m0, filter_x_a

  pmullw               m4, filter_x_b

  paddw                m0, filter_rnd

  paddw                m2, m3

  paddw                m0, m4

%endif

  psraw                m2, 4

  psraw                m0, 4

%if %2 == 1 ; avg

  ; FIXME(rbultje) pipeline

  packuswb             m0, m2

  pavgb                m0, [secq]

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

%endif

  punpckhbw            m3, m1, m5

  punpcklbw            m1, m5

  SUM_SSE              m0, m1, m2, m3, m6, m7

  add                srcq, src_strideq

  add                dstq, dst_strideq

%else ; %1 < 16

  movx                 m0, [srcq]

  movx                 m1, [srcq+1]

  movx                 m2, [srcq+src_strideq]

  movx                 m4, [srcq+src_strideq+1]

  movx                 m3, [dstq+dst_strideq]

%if cpuflag(ssse3)

  punpcklbw            m0, m1

  movx                 m1, [dstq]

  punpcklbw            m2, m4

  pmaddubsw            m0, filter_x_a

  pmaddubsw            m2, filter_x_a

  punpcklbw            m3, m5

  paddw                m0, filter_rnd

  paddw                m2, filter_rnd

%else

  punpcklbw            m0, m5

  punpcklbw            m1, m5

  punpcklbw            m2, m5

  punpcklbw            m4, m5

  pmullw               m0, filter_x_a

  pmullw               m1, filter_x_b

  punpcklbw            m3, m5

  paddw                m0, filter_rnd

  pmullw               m2, filter_x_a

  pmullw               m4, filter_x_b

  paddw                m0, m1

  paddw                m2, filter_rnd

  movx                 m1, [dstq]

  paddw                m2, m4

%endif

  psraw                m0, 4

  psraw                m2, 4

%if %2 == 1 ; avg

  ; FIXME(rbultje) pipeline

%if %1 == 4

  movlhps              m0, m2

%endif

  packuswb             m0, m2

%if %1 > 4

  pavgb                m0, [secq]

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

%else

  movh                 m2, [secq]

  pavgb                m0, m2

  punpcklbw            m0, m5

  movhlps              m2, m0

%endif

%endif

  punpcklbw            m1, m5

  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq+src_strideq*2]

  lea                dstq, [dstq+dst_strideq*2]

%endif

%if %2 == 1 ; avg

  add                secq, sec_str

%endif

  dec                   block_height

  jg .x_other_y_zero_loop

%undef filter_x_a

%undef filter_x_b

%undef filter_rnd

  STORE_AND_RET %1

.x_nonhalf_y_nonzero:

  cmp           y_offsetd, 4

  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5

%if AOM_ARCH_X86_64

  lea        bilin_filter, [GLOBAL(bilin_filter_m)]

%endif

  shl           x_offsetd, filter_idx_shift

%if AOM_ARCH_X86_64 && %1 > 4

  mova                 m8, [bilin_filter+x_offsetq]

%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64

  mova                 m9, [bilin_filter+x_offsetq+16]

%endif

  mova                m10, [GLOBAL(pw_8)]

%define filter_x_a m8

%define filter_x_b m9

%define filter_rnd m10

%else    ; x86-32

%if AOM_ARCH_X86=1 && CONFIG_PIC=1

; y_offset == 0.5. We can reuse y_offset reg.

%define tempq y_offsetq

  add x_offsetq, g_bilin_filterm

%define filter_x_a [x_offsetq]

%define filter_x_b [x_offsetq+16]

  mov tempq, g_pw_8m

%define filter_rnd [tempq]

%else

  add           x_offsetq, bilin_filter

%define filter_x_a [x_offsetq]

%define filter_x_b [x_offsetq+16]

%define filter_rnd [GLOBAL(pw_8)]

%endif

%endif

%if %1 == 16

  movu                 m0, [srcq]

  movu                 m1, [srcq+1]

%if cpuflag(ssse3)

  punpckhbw            m2, m0, m1

  punpcklbw            m0, m1

  pmaddubsw            m2, filter_x_a

  pmaddubsw            m0, filter_x_a

  paddw                m2, filter_rnd

  paddw                m0, filter_rnd

%else

  punpckhbw            m2, m0, m5

  punpckhbw            m3, m1, m5

  punpcklbw            m0, m5

  punpcklbw            m1, m5

  pmullw               m0, filter_x_a

  pmullw               m1, filter_x_b

  paddw                m0, filter_rnd

  pmullw               m2, filter_x_a

  pmullw               m3, filter_x_b

  paddw                m2, filter_rnd

  paddw                m0, m1

  paddw                m2, m3

%endif

  psraw                m0, 4

  psraw                m2, 4

  add                srcq, src_strideq

  packuswb             m0, m2

.x_other_y_half_loop:

  movu                 m4, [srcq]

  movu                 m3, [srcq+1]

%if cpuflag(ssse3)

  mova                 m1, [dstq]

  punpckhbw            m2, m4, m3

  punpcklbw            m4, m3

  pmaddubsw            m2, filter_x_a

  pmaddubsw            m4, filter_x_a

  paddw                m2, filter_rnd

  paddw                m4, filter_rnd

  psraw                m2, 4

  psraw                m4, 4

  packuswb             m4, m2

  pavgb                m0, m4

  punpckhbw            m3, m1, m5

  punpcklbw            m1, m5

%else

  punpckhbw            m2, m4, m5

  punpckhbw            m1, m3, m5

  punpcklbw            m4, m5

  punpcklbw            m3, m5

  pmullw               m4, filter_x_a

  pmullw               m3, filter_x_b

  paddw                m4, filter_rnd

  pmullw               m2, filter_x_a

  pmullw               m1, filter_x_b

  paddw                m2, filter_rnd

  paddw                m4, m3

  paddw                m2, m1

  mova                 m1, [dstq]

  psraw                m4, 4

  psraw                m2, 4

  punpckhbw            m3, m1, m5

  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we

  ; have a 1-register shortage to be able to store the backup of the bilin

  ; filtered second line as words as cache for the next line. Packing into

  ; a byte costs 1 pack and 2 unpacks, but saves a register.

  packuswb             m4, m2

  punpcklbw            m1, m5

  pavgb                m0, m4

%endif

%if %2 == 1 ; avg

  ; FIXME(rbultje) pipeline

  pavgb                m0, [secq]

%endif

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

  SUM_SSE              m0, m1, m2, m3, m6, m7

  mova                 m0, m4

  add                srcq, src_strideq

  add                dstq, dst_strideq

%else ; %1 < 16

  movx                 m0, [srcq]

  movx                 m1, [srcq+1]

%if cpuflag(ssse3)

  punpcklbw            m0, m1

  pmaddubsw            m0, filter_x_a

  paddw                m0, filter_rnd

%else

  punpcklbw            m0, m5

  punpcklbw            m1, m5

  pmullw               m0, filter_x_a

  pmullw               m1, filter_x_b

  paddw                m0, filter_rnd

  paddw                m0, m1

%endif

  add                srcq, src_strideq

  psraw                m0, 4

.x_other_y_half_loop:

  movx                 m2, [srcq]

  movx                 m1, [srcq+1]

  movx                 m4, [srcq+src_strideq]

  movx                 m3, [srcq+src_strideq+1]

%if cpuflag(ssse3)

  punpcklbw            m2, m1

  punpcklbw            m4, m3

  pmaddubsw            m2, filter_x_a

  pmaddubsw            m4, filter_x_a

  movx                 m1, [dstq]

  movx                 m3, [dstq+dst_strideq]

  paddw                m2, filter_rnd

  paddw                m4, filter_rnd

%else

  punpcklbw            m2, m5

  punpcklbw            m1, m5

  punpcklbw            m4, m5

  punpcklbw            m3, m5

  pmullw               m2, filter_x_a

  pmullw               m1, filter_x_b

  paddw                m2, filter_rnd

  pmullw               m4, filter_x_a

  pmullw               m3, filter_x_b

  paddw                m4, filter_rnd

  paddw                m2, m1

  movx                 m1, [dstq]

  paddw                m4, m3

  movx                 m3, [dstq+dst_strideq]

%endif

  psraw                m2, 4

  psraw                m4, 4

  pavgw                m0, m2

  pavgw                m2, m4

%if %2 == 1 ; avg

  ; FIXME(rbultje) pipeline - also consider going to bytes here

%if %1 == 4

  movlhps              m0, m2

%endif

  packuswb             m0, m2

%if %1 > 4

  pavgb                m0, [secq]

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

%else

  movh                 m2, [secq]

  pavgb                m0, m2

  punpcklbw            m0, m5

  movhlps              m2, m0

%endif

%endif

  punpcklbw            m3, m5

  punpcklbw            m1, m5

  SUM_SSE              m0, m1, m2, m3, m6, m7

  mova                 m0, m4

  lea                srcq, [srcq+src_strideq*2]

  lea                dstq, [dstq+dst_strideq*2]

%endif

%if %2 == 1 ; avg

  add                secq, sec_str

%endif

  dec                   block_height

  jg .x_other_y_half_loop

%undef filter_x_a

%undef filter_x_b

%undef filter_rnd

  STORE_AND_RET %1

.x_nonhalf_y_nonhalf:

%if AOM_ARCH_X86_64

  lea        bilin_filter, [GLOBAL(bilin_filter_m)]

%endif

  shl           x_offsetd, filter_idx_shift

  shl           y_offsetd, filter_idx_shift

%if AOM_ARCH_X86_64 && %1 > 4

  mova                 m8, [bilin_filter+x_offsetq]

%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64

  mova                 m9, [bilin_filter+x_offsetq+16]

%endif

  mova                m10, [bilin_filter+y_offsetq]

%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64

  mova                m11, [bilin_filter+y_offsetq+16]

%endif

  mova                m12, [GLOBAL(pw_8)]

%define filter_x_a m8

%define filter_x_b m9

%define filter_y_a m10

%define filter_y_b m11

%define filter_rnd m12

%else   ; x86-32

%if AOM_ARCH_X86=1 && CONFIG_PIC=1

; In this case there is NO unused register, so the src_stride register is

; reused as a temporary (tempq). Later, src_stride has to be reloaded from the

; stack whenever it is needed.

%define tempq src_strideq

  mov tempq, g_bilin_filterm

  add           x_offsetq, tempq

  add           y_offsetq, tempq

%define filter_x_a [x_offsetq]

%define filter_x_b [x_offsetq+16]

%define filter_y_a [y_offsetq]

%define filter_y_b [y_offsetq+16]

  mov tempq, g_pw_8m

%define filter_rnd [tempq]

%else

  add           x_offsetq, bilin_filter

  add           y_offsetq, bilin_filter

%define filter_x_a [x_offsetq]

%define filter_x_b [x_offsetq+16]

%define filter_y_a [y_offsetq]

%define filter_y_b [y_offsetq+16]

%define filter_rnd [GLOBAL(pw_8)]

%endif

%endif

  ; x_offset == bilin interpolation && y_offset == bilin interpolation

%if %1 == 16

  movu                 m0, [srcq]

  movu                 m1, [srcq+1]

%if cpuflag(ssse3)

  punpckhbw            m2, m0, m1

  punpcklbw            m0, m1

  pmaddubsw            m2, filter_x_a

  pmaddubsw            m0, filter_x_a

  paddw                m2, filter_rnd

  paddw                m0, filter_rnd

%else

  punpckhbw            m2, m0, m5

  punpckhbw            m3, m1, m5

  punpcklbw            m0, m5

  punpcklbw            m1, m5

  pmullw               m0, filter_x_a

  pmullw               m1, filter_x_b

  paddw                m0, filter_rnd

  pmullw               m2, filter_x_a

  pmullw               m3, filter_x_b

  paddw                m2, filter_rnd

  paddw                m0, m1

  paddw                m2, m3

%endif

  psraw                m0, 4

  psraw                m2, 4

  INC_SRC_BY_SRC_STRIDE

  packuswb             m0, m2

.x_other_y_other_loop:

%if cpuflag(ssse3)

  movu                 m4, [srcq]

  movu                 m3, [srcq+1]

  mova                 m1, [dstq]

  punpckhbw            m2, m4, m3

  punpcklbw            m4, m3

  pmaddubsw            m2, filter_x_a

  pmaddubsw            m4, filter_x_a

  punpckhbw            m3, m1, m5

  paddw                m2, filter_rnd

  paddw                m4, filter_rnd

  psraw                m2, 4

  psraw                m4, 4

  packuswb             m4, m2

  punpckhbw            m2, m0, m4

  punpcklbw            m0, m4

  pmaddubsw            m2, filter_y_a

  pmaddubsw            m0, filter_y_a

  punpcklbw            m1, m5

  paddw                m2, filter_rnd

  paddw                m0, filter_rnd

  psraw                m2, 4

  psraw                m0, 4

%else

  movu                 m3, [srcq]

  movu                 m4, [srcq+1]

  punpckhbw            m1, m3, m5

  punpckhbw            m2, m4, m5

  punpcklbw            m3, m5

  punpcklbw            m4, m5

  pmullw               m3, filter_x_a

  pmullw               m4, filter_x_b

  paddw                m3, filter_rnd

  pmullw               m1, filter_x_a

  pmullw               m2, filter_x_b

  paddw                m1, filter_rnd

  paddw                m3, m4

  paddw                m1, m2

  psraw                m3, 4

  psraw                m1, 4

  packuswb             m4, m3, m1

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

  pmullw               m2, filter_y_a

  pmullw               m1, filter_y_b

  paddw                m2, filter_rnd

  pmullw               m0, filter_y_a

  pmullw               m3, filter_y_b

  paddw                m2, m1

  mova                 m1, [dstq]

  paddw                m0, filter_rnd

  psraw                m2, 4

  paddw                m0, m3

  punpckhbw            m3, m1, m5

  psraw                m0, 4

  punpcklbw            m1, m5

%endif

%if %2 == 1 ; avg

  ; FIXME(rbultje) pipeline

  packuswb             m0, m2

  pavgb                m0, [secq]

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

%endif

  SUM_SSE              m0, m1, m2, m3, m6, m7

  mova                 m0, m4

  INC_SRC_BY_SRC_STRIDE

  add                dstq, dst_strideq

%else ; %1 < 16

  movx                 m0, [srcq]

  movx                 m1, [srcq+1]

%if cpuflag(ssse3)

  punpcklbw            m0, m1

  pmaddubsw            m0, filter_x_a

  paddw                m0, filter_rnd

%else

  punpcklbw            m0, m5

  punpcklbw            m1, m5

  pmullw               m0, filter_x_a

  pmullw               m1, filter_x_b

  paddw                m0, filter_rnd

  paddw                m0, m1

%endif

  psraw                m0, 4

%if cpuflag(ssse3)

  packuswb             m0, m0

%endif

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:

  movx                 m2, [srcq]

  movx                 m1, [srcq+1]

  INC_SRC_BY_SRC_STRIDE

  movx                 m4, [srcq]

  movx                 m3, [srcq+1]

%if cpuflag(ssse3)

  punpcklbw            m2, m1

  punpcklbw            m4, m3

  pmaddubsw            m2, filter_x_a

  pmaddubsw            m4, filter_x_a

  movx                 m3, [dstq+dst_strideq]

  movx                 m1, [dstq]

  paddw                m2, filter_rnd

  paddw                m4, filter_rnd

  psraw                m2, 4

  psraw                m4, 4

  packuswb             m2, m2

  packuswb             m4, m4

  punpcklbw            m0, m2

  punpcklbw            m2, m4

  pmaddubsw            m0, filter_y_a

  pmaddubsw            m2, filter_y_a

  punpcklbw            m3, m5

  paddw                m0, filter_rnd

  paddw                m2, filter_rnd

  psraw                m0, 4

  psraw                m2, 4

  punpcklbw            m1, m5

%else

  punpcklbw            m2, m5

  punpcklbw            m1, m5

  punpcklbw            m4, m5

  punpcklbw            m3, m5

  pmullw               m2, filter_x_a

  pmullw               m1, filter_x_b

  paddw                m2, filter_rnd

  pmullw               m4, filter_x_a

  pmullw               m3, filter_x_b

  paddw                m4, filter_rnd

  paddw                m2, m1

  paddw                m4, m3

  psraw                m2, 4

  psraw                m4, 4

  pmullw               m0, filter_y_a

  pmullw               m3, m2, filter_y_b

  paddw                m0, filter_rnd

  pmullw               m2, filter_y_a

  pmullw               m1, m4, filter_y_b

  paddw                m2, filter_rnd

  paddw                m0, m3

  movx                 m3, [dstq+dst_strideq]

  paddw                m2, m1

  movx                 m1, [dstq]

  psraw                m0, 4

  psraw                m2, 4

  punpcklbw            m3, m5

  punpcklbw            m1, m5

%endif

%if %2 == 1 ; avg

  ; FIXME(rbultje) pipeline

%if %1 == 4

  movlhps              m0, m2

%endif

  packuswb             m0, m2

%if %1 > 4

  pavgb                m0, [secq]

  punpckhbw            m2, m0, m5

  punpcklbw            m0, m5

%else

  movh                 m2, [secq]

  pavgb                m0, m2

  punpcklbw            m0, m5

  movhlps              m2, m0

%endif

%endif

  SUM_SSE              m0, m1, m2, m3, m6, m7

  mova                 m0, m4

  INC_SRC_BY_SRC_STRIDE

  lea                dstq, [dstq+dst_strideq*2]

%endif

%if %2 == 1 ; avg

  add                secq, sec_str

%endif

  dec                   block_height

  jg .x_other_y_other_loop

%undef filter_x_a

%undef filter_x_b

%undef filter_y_a

%undef filter_y_b

%undef filter_rnd

%undef movx

  STORE_AND_RET %1

%endmacro

; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8 && y=0,8) are identical

; between the ssse3 and non-ssse3 versions. It may make sense to merge their

; code, so that the ssse3 version jumps to the appropriate location in the

; SSE/SSE2 version rather than duplicating that code in the binary.

; Instantiate the sub-pixel variance kernels for block widths 4, 8 and 16.
; INIT_XMM ssse3 is the x86inc setup that makes the cpuflag(ssse3) branches of
; the macro the ones that get assembled. Only the width is passed here, so the
; macro's second parameter takes its default (presumably 0, i.e. the
; "%if %2 == 1 ; avg" blocks are left out -- TODO confirm against the %macro
; header).
INIT_XMM ssse3

SUBPEL_VARIANCE  4

SUBPEL_VARIANCE  8

SUBPEL_VARIANCE 16

; Same three widths with the second argument set to 1. This enables the
; "%if %2 == 1 ; avg" blocks, which pavgb the filtered prediction with the data
; at [secq] and advance secq by sec_str each loop iteration.
; NOTE(review): this INIT_XMM ssse3 repeats the one above and looks redundant;
; it is kept unchanged -- confirm before removing.
INIT_XMM ssse3

SUBPEL_VARIANCE  4, 1

SUBPEL_VARIANCE  8, 1

SUBPEL_VARIANCE 16, 1