pixman-arm-simd-asm.h

mozilla-central/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h

Enable keyboard shortcuts

Source code

File a bug in Core :: Graphics

Revision control

Copy as Markdown

Other Tools

/*

 * Copyright © 2012 Raspberry Pi Foundation

 * Copyright © 2012 RISC OS Open Ltd

 * Permission to use, copy, modify, distribute, and sell this software and its

 * documentation for any purpose is hereby granted without fee, provided that

 * the above copyright notice appear in all copies and that both that

 * copyright notice and this permission notice appear in supporting

 * documentation, and that the name of the copyright holders not be used in

 * advertising or publicity pertaining to distribution of the software without

 * specific, written prior permission.  The copyright holders make no

 * representations about the suitability of this software for any purpose.  It

 * is provided "as is" without express or implied warranty.

 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS

 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND

 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY

 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES

 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN

 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING

 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS

 * SOFTWARE.

 * Author:  Ben Avison (bavison@riscosopen.org)

*/

/*

 * Because the alignment of pixel data to cachelines, and even the number of

 * cachelines per row can vary from row to row, and because of the need to

 * preload each scanline once and only once, this prefetch strategy treats

 * each row of pixels independently. When a pixel row is long enough, there

 * are three distinct phases of prefetch:

 * * an inner loop section, where each time a cacheline of data is

 *    processed, another cacheline is preloaded (the exact distance ahead is

 *    determined empirically using profiling results from lowlevel-blt-bench)

 * * a leading section, where enough cachelines are preloaded to ensure no

 *    cachelines escape being preloaded when the inner loop starts

 * * a trailing section, where a limited number (0 or more) of cachelines

 *    are preloaded to deal with data (if any) that hangs off the end of the

 *    last iteration of the inner loop, plus any trailing bytes that were not

 *    enough to make up one whole iteration of the inner loop

 * There are (in general) three distinct code paths, selected between

 * depending upon how long the pixel row is. If it is long enough that there

 * is at least one iteration of the inner loop (as described above) then

 * this is described as the "wide" case. If it is shorter than that, but

 * there are still enough bytes output that there is at least one 16-byte-

 * long, 16-byte-aligned write to the destination (the optimum type of

 * write), then this is the "medium" case. If it is not even this long, then

 * this is the "narrow" case, and there is no attempt to align writes to

 * 16-byte boundaries. In the "medium" and "narrow" cases, all the

 * cachelines containing data from the pixel row are prefetched up-front.

*/

/*

 * Determine whether we put the arguments on the stack for debugging.

*/

#undef DEBUG_PARAMS

/*

 * Bit flags for 'generate_composite_function' macro which are used

 * to tune generated functions behavior.

*/

.set FLAG_DST_WRITEONLY,         0

.set FLAG_DST_READWRITE,         1

.set FLAG_COND_EXEC,             0

.set FLAG_BRANCH_OVER,           2

.set FLAG_PROCESS_PRESERVES_PSR, 0

.set FLAG_PROCESS_CORRUPTS_PSR,  4

.set FLAG_PROCESS_DOESNT_STORE,  0

.set FLAG_PROCESS_DOES_STORE,    8 /* usually because it needs to conditionally skip it */

.set FLAG_NO_SPILL_LINE_VARS,        0

.set FLAG_SPILL_LINE_VARS_WIDE,      16

.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32

.set FLAG_SPILL_LINE_VARS,           48

.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0

.set FLAG_PROCESS_PRESERVES_SCRATCH, 64

.set FLAG_PROCESS_PRESERVES_WK0,     0

.set FLAG_PROCESS_CORRUPTS_WK0,      128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */

.set FLAG_PRELOAD_DST,               0

.set FLAG_NO_PRELOAD_DST,            256

/*

 * Number of bytes by which to adjust preload offset of destination

 * buffer (allows preload instruction to be moved before the load(s))

*/

.set DST_PRELOAD_BIAS, 0

/*

 * Offset into stack where mask and source pointer/stride can be accessed.

*/

#ifdef DEBUG_PARAMS

.set ARGS_STACK_OFFSET,        (9*4+9*4)

#else

.set ARGS_STACK_OFFSET,        (9*4)

#endif

/*

 * Offset into stack where space allocated during init macro can be accessed.

*/

.set LOCALS_STACK_OFFSET,     0

/*

 * Constants for selecting preferable prefetch type.

*/

.set PREFETCH_TYPE_NONE,       0

.set PREFETCH_TYPE_STANDARD,   1

/*

 * Definitions of macros for load/store of pixel data.

*/

.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0

 .if \numbytes == 16

  .if \unaligned == 1

        \op\()r\()\cond    WK\()\reg0, [\base], #4

        \op\()r\()\cond    WK\()\reg1, [\base], #4

        \op\()r\()\cond    WK\()\reg2, [\base], #4

        \op\()r\()\cond    WK\()\reg3, [\base], #4

  .else

        \op\()mia\()\cond  \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}

  .endif

 .elseif \numbytes == 8

  .if \unaligned == 1

        \op\()r\()\cond    WK\()\reg0, [\base], #4

        \op\()r\()\cond    WK\()\reg1, [\base], #4

  .else

        \op\()mia\()\cond  \base!, {WK\()\reg0,WK\()\reg1}

  .endif

 .elseif \numbytes == 4

        \op\()r\()\cond    WK\()\reg0, [\base], #4

 .elseif \numbytes == 2

        \op\()rh\()\cond   WK\()\reg0, [\base], #2

 .elseif \numbytes == 1

        \op\()rb\()\cond   WK\()\reg0, [\base], #1

 .else

  .error "unsupported size: \numbytes"

 .endif

.endm

.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base

 .if \numbytes == 16

        stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}

 .elseif \numbytes == 8

        stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1}

 .elseif \numbytes == 4

        str\()\cond    WK\()\reg0, [\base, #-4]

 .elseif \numbytes == 2

        strh\()\cond   WK\()\reg0, [\base, #-2]

 .elseif \numbytes == 1

        strb\()\cond   WK\()\reg0, [\base, #-1]

 .else

  .error "unsupported size: \numbytes"

 .endif

.endm

.macro pixld cond, numbytes, firstreg, base, unaligned

        pixldst ld, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base, \unaligned

.endm

.macro pixst cond, numbytes, firstreg, base

 .if (flags) & FLAG_DST_READWRITE

        pixst_baseupdated \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base

 .else

        pixldst st, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base

 .endif

.endm

.macro PF a, x:vararg

 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)

        \a \x

 .endif

.endm

.macro preload_leading_step1  bpp, ptr, base

/* If the destination is already 16-byte aligned, then we need to preload

 * between 0 and prefetch_distance (inclusive) cache lines ahead so there

 * are no gaps when the inner loop starts.

*/

 .if \bpp > 0

        PF  bic,    \ptr, \base, #31

  .set OFFSET, 0

  .rept prefetch_distance+1

        PF  pld,    [\ptr, #OFFSET]

   .set OFFSET, OFFSET+32

  .endr

 .endif

.endm

.macro preload_leading_step2  bpp, bpp_shift, ptr, base

/* However, if the destination is not 16-byte aligned, we may need to

 * preload more cache lines than that. The question we need to ask is:

 * are the bytes corresponding to the leading pixels more than the amount

 * by which the source pointer will be rounded down for preloading, and if

 * so, by how many cache lines? Effectively, we want to calculate

 *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp

 *     inner_loop_offset = (src+leading_bytes)&31

 *     extra_needed = leading_bytes - inner_loop_offset

 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only

 * possible when there are 4 src bytes for every 1 dst byte).

*/

 .if \bpp > 0

  .ifc \base,DST

        /* The test can be simplified further when preloading the destination */

        PF  tst,    \base, #16

        PF  beq,    61f

  .else

   .if \bpp/dst_w_bpp == 4

        PF  add,    SCRATCH, \base, WK0, lsl #\bpp_shift-dst_bpp_shift

        PF  and,    SCRATCH, SCRATCH, #31

        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #\bpp_shift-dst_bpp_shift

        PF  sub,    SCRATCH, SCRATCH, #1        /* so now ranges are -16..-1 / 0..31 / 32..63 */

        PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets         NC   /  nc   /   Nc   */

        PF  bcs,    61f

        PF  bpl,    60f

        PF  pld,    [ptr, #32*(prefetch_distance+2)]

   .else

        PF  mov,    SCRATCH, \base, lsl #32-5

        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift

        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift

        PF  bls,    61f

   .endif

  .endif

60:     PF  pld,    [\ptr, #32*(prefetch_distance+1)]

61:

 .endif

.endm

#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))

.macro preload_middle   bpp, base, scratch_holds_offset

 .if \bpp > 0

        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */

  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/\bpp)

   .if \scratch_holds_offset

        PF  pld,    [\base, SCRATCH]

   .else

        PF  bic,    SCRATCH, \base, #31

        PF  pld,    [SCRATCH, #32*prefetch_distance]

   .endif

  .endif

 .endif

.endm

.macro preload_trailing  bpp, bpp_shift, base

 .if \bpp > 0

  .if \bpp*pix_per_block > 256

        /* Calculations are more complex if more than one fetch per block */

        PF  and,    WK1, \base, #31

        PF  add,    WK1, WK1, WK0, lsl #\bpp_shift

        PF  add,    WK1, WK1, #32*(\bpp*pix_per_block/256-1)*(prefetch_distance+1)

        PF  bic,    SCRATCH, \base, #31

80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]

        PF  add,    SCRATCH, SCRATCH, #32

        PF  subs,   WK1, WK1, #32

        PF  bhi,    80b

  .else

        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */

        PF  mov,    SCRATCH, \base, lsl #32-5

        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+\bpp_shift

        PF  adcseq, SCRATCH, SCRATCH, #0

        /* The instruction above has two effects: ensures Z is only

         * set if C was clear (so Z indicates that both shifted quantities

         * were 0), and clears C if Z was set (so C indicates that the sum

         * of the shifted quantities was greater and not equal to 32) */

        PF  beq,    82f

        PF  bic,    SCRATCH, \base, #31

        PF  bcc,    81f

        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]

81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]

82:

  .endif

 .endif

.endm

.macro preload_line    narrow_case, bpp, bpp_shift, base

/* "narrow_case" - just means that the macro was invoked from the "narrow"

 *    code path rather than the "medium" one - because in the narrow case,

 *    the row of pixels is known to output no more than 30 bytes, then

 *    (assuming the source pixels are no wider than the the destination

 *    pixels) they cannot possibly straddle more than 2 32-byte cachelines,

 *    meaning there's no need for a loop.

 * "bpp" - number of bits per pixel in the channel (source, mask or

 *    destination) that's being preloaded, or 0 if this channel is not used

 *    for reading

 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)

 * "base" - base address register of channel to preload (SRC, MASK or DST)

*/

 .if \bpp > 0

  .if \narrow_case && (\bpp <= dst_w_bpp)

        /* In these cases, each line for each channel is in either 1 or 2 cache lines */

        PF  bic,    WK0, \base, #31

        PF  pld,    [WK0]

        PF  add,    WK1, \base, X, LSL #\bpp_shift

        PF  sub,    WK1, WK1, #1

        PF  bic,    WK1, WK1, #31

        PF  cmp,    WK1, WK0

        PF  beq,    90f

        PF  pld,    [WK1]

90:

  .else

        PF  bic,    WK0, \base, #31

        PF  pld,    [WK0]

        PF  add,    WK1, \base, X, lsl #\bpp_shift

        PF  sub,    WK1, WK1, #1

        PF  bic,    WK1, WK1, #31

        PF  cmp,    WK1, WK0

        PF  beq,    92f

91:     PF  add,    WK0, WK0, #32

        PF  cmp,    WK0, WK1

        PF  pld,    [WK0]

        PF  bne,    91b

92:

  .endif

 .endif

.endm

.macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx

        \process_head  \cond, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, 0

 .if \decrementx

        sub\()\cond X, X, #8*\numbytes/dst_w_bpp

 .endif

        \process_tail  \cond, \numbytes, \firstreg

 .if !((flags) & FLAG_PROCESS_DOES_STORE)

        pixst   \cond, \numbytes, \firstreg, DST

 .endif

.endm

.macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx

 .if (flags) & FLAG_BRANCH_OVER

  .ifc \cond,mi

        bpl     100f

  .endif

  .ifc \cond,cs

        bcc     100f

  .endif

  .ifc \cond,ne

        beq     100f

  .endif

        conditional_process1_helper  , \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx

100:

 .else

        conditional_process1_helper  \cond, \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx

 .endif

.endm

.macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx

 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)

        /* Can't interleave reads and writes */

        \test

        conditional_process1  \cond1, \process_head, \process_tail, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, \decrementx

  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR

        \test

  .endif

        conditional_process1  \cond2, \process_head, \process_tail, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, \decrementx

 .else

        /* Can interleave reads and writes for better scheduling */

        \test

        \process_head  \cond1, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, 0

        \process_head  \cond2, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, 0

  .if \decrementx

        sub\()\cond1 X, X, #8*\numbytes1/dst_w_bpp

        sub\()\cond2 X, X, #8*\numbytes2/dst_w_bpp

  .endif

        \process_tail  \cond1, \numbytes1, \firstreg1

        \process_tail  \cond2, \numbytes2, \firstreg2

        pixst   \cond1, \numbytes1, \firstreg1, DST

        pixst   \cond2, \numbytes2, \firstreg2, DST

 .endif

.endm

.macro test_bits_1_0_ptr

 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0

        movs    SCRATCH, X, lsl #32-1  /* C,N = bits 1,0 of DST */

 .else

        movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */

 .endif

.endm

.macro test_bits_3_2_ptr

 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0

        movs    SCRATCH, X, lsl #32-3  /* C,N = bits 3, 2 of DST */

 .else

        movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3, 2 of DST */

 .endif

.endm

.macro leading_15bytes  process_head, process_tail

        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */

 .set DECREMENT_X, 1

 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0

  .set DECREMENT_X, 0

        sub     X, X, WK0, lsr #dst_bpp_shift

        str     X, [sp, #LINE_SAVED_REG_COUNT*4]

        mov     X, WK0

 .endif

        /* Use unaligned loads in all cases for simplicity */

 .if dst_w_bpp == 8

        conditional_process2  test_bits_1_0_ptr, mi, cs, \process_head, \process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X

 .elseif dst_w_bpp == 16

        test_bits_1_0_ptr

        conditional_process1  cs, \process_head, \process_tail, 2, 2, 1, 1, DECREMENT_X

 .endif

        conditional_process2  test_bits_3_2_ptr, mi, cs, \process_head, \process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X

 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0

        ldr     X, [sp, #LINE_SAVED_REG_COUNT*4]

 .endif

.endm

.macro test_bits_3_2_pix

        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3

.endm

.macro test_bits_1_0_pix

 .if dst_w_bpp == 8

        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1

 .else

        movs    SCRATCH, X, lsr #1

 .endif

.endm

.macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask

        conditional_process2  test_bits_3_2_pix, cs, mi, \process_head, \process_tail, 8, 4, 0, 2, \unaligned_src, \unaligned_mask, 0

 .if dst_w_bpp == 16

        test_bits_1_0_pix

        conditional_process1  cs, \process_head, \process_tail, 2, 0, \unaligned_src, \unaligned_mask, 0

 .elseif dst_w_bpp == 8

        conditional_process2  test_bits_1_0_pix, cs, mi, \process_head, \process_tail, 2, 1, 0, 1, \unaligned_src, \unaligned_mask, 0

 .endif

.endm

.macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment

110:

 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */

 .rept pix_per_block*dst_w_bpp/128

        \process_head  , 16, 0, \unaligned_src, \unaligned_mask, 1

  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)

        preload_middle  src_bpp, SRC, 1

  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)

        preload_middle  mask_bpp, MASK, 1

  .else

        preload_middle  src_bpp, SRC, 0

        preload_middle  mask_bpp, MASK, 0

  .endif

  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)

        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that

         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset

         * preloads for, to achieve staggered prefetches for multiple channels, because there are

         * always two STMs per prefetch, so there is always an opposite STM on which to put the

         * preload. Note, no need to BIC the base register here */

        PF  pld,    [DST, #32*prefetch_distance - \dst_alignment]

  .endif

        \process_tail  , 16, 0

  .if !((flags) & FLAG_PROCESS_DOES_STORE)

        pixst   , 16, 0, DST

  .endif

  .set SUBBLOCK, SUBBLOCK+1

 .endr

        subs    X, X, #pix_per_block

        bhs     110b

.endm

.macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask

        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */

 .if dst_r_bpp > 0

        tst     DST, #16

        bne     111f

        \process_inner_loop  \process_head, \process_tail, \unaligned_src, \unaligned_mask, 16 + DST_PRELOAD_BIAS

        b       112f

111:

 .endif

        \process_inner_loop  \process_head, \process_tail, \unaligned_src, \unaligned_mask, 0 + DST_PRELOAD_BIAS

112:

        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */

 .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)

        PF  and,    WK0, X, #pix_per_block-1

 .endif

        preload_trailing  src_bpp, src_bpp_shift, SRC

        preload_trailing  mask_bpp, mask_bpp_shift, MASK

 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0

        preload_trailing  dst_r_bpp, dst_bpp_shift, DST

 .endif

        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp

        /* The remainder of the line is handled identically to the medium case */

        medium_case_inner_loop_and_trailing_pixels  \process_head, \process_tail,, \exit_label, \unaligned_src, \unaligned_mask

.endm

.macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask

120:

        \process_head  , 16, 0, \unaligned_src, \unaligned_mask, 0

        \process_tail  , 16, 0

 .if !((flags) & FLAG_PROCESS_DOES_STORE)

        pixst   , 16, 0, DST

 .endif

        subs    X, X, #128/dst_w_bpp

        bhs     120b

        /* Trailing pixels */

        tst     X, #128/dst_w_bpp - 1

        beq     \exit_label

        trailing_15bytes  \process_head, \process_tail, \unaligned_src, \unaligned_mask

.endm

.macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask

        tst     X, #16*8/dst_w_bpp

        conditional_process1  ne, \process_head, \process_tail, 16, 0, \unaligned_src, \unaligned_mask, 0

        /* Trailing pixels */

        /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */

        trailing_15bytes  \process_head, \process_tail, \unaligned_src, \unaligned_mask

.endm

.macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label

 /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */

 .if mask_bpp == 8 || mask_bpp == 16

        tst     MASK, #3

        bne     141f

 .endif

  .if src_bpp == 8 || src_bpp == 16

        tst     SRC, #3

        bne     140f

  .endif

        \action  \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 0

  .if src_bpp == 8 || src_bpp == 16

        b       \exit_label

140:

        \action  \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 0

  .endif

 .if mask_bpp == 8 || mask_bpp == 16

        b       \exit_label

141:

  .if src_bpp == 8 || src_bpp == 16

        tst     SRC, #3

        bne     142f

  .endif

        \action  \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 1

  .if src_bpp == 8 || src_bpp == 16

        b       \exit_label

142:

        \action  \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 1

  .endif

 .endif

.endm

.macro end_of_line      restore_x, vars_spilled, loop_label, last_one

 .if \vars_spilled

        /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */

        /* This is ldmia sp,{} */

        .word   0xE89D0000 | LINE_SAVED_REGS

 .endif

        subs    Y, Y, #1

 .if \vars_spilled

  .if (LINE_SAVED_REGS) & (1<<1)

        str     Y, [sp]

  .endif

 .endif

        add     DST, DST, STRIDE_D

 .if src_bpp > 0

        add     SRC, SRC, STRIDE_S

 .endif

 .if mask_bpp > 0

        add     MASK, MASK, STRIDE_M

 .endif

 .if \restore_x

        mov     X, ORIG_W

 .endif

        bhs     \loop_label

 .ifc "\last_one",""

  .if \vars_spilled

        b       197f

  .else

        b       198f

  .endif

 .else

  .if (!\vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)

        b       198f

  .endif

 .endif

.endm

.macro generate_composite_function fname, \

                                   src_bpp_, \

                                   mask_bpp_, \

                                   dst_w_bpp_, \

                                   flags_, \

                                   prefetch_distance_, \

                                   init, \

                                   newline, \

                                   cleanup, \

                                   process_head, \

                                   process_tail, \

                                   process_inner_loop

    pixman_asm_function \fname

/*

 * Make some macro arguments globally visible and accessible

 * from other macros

*/

 .set src_bpp, \src_bpp_

 .set mask_bpp, \mask_bpp_

 .set dst_w_bpp, \dst_w_bpp_

 .set flags, \flags_

 .set prefetch_distance, \prefetch_distance_

/*

 * Select prefetch type for this function.

*/

 .if prefetch_distance == 0

  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE

 .else

  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD

 .endif

 .if src_bpp == 32

  .set src_bpp_shift, 2

 .elseif src_bpp == 24

  .set src_bpp_shift, 0

 .elseif src_bpp == 16

  .set src_bpp_shift, 1

 .elseif src_bpp == 8

  .set src_bpp_shift, 0

 .elseif src_bpp == 0

  .set src_bpp_shift, -1

 .else

  .error "requested src bpp (src_bpp) is not supported"

 .endif

 .if mask_bpp == 32

  .set mask_bpp_shift, 2

 .elseif mask_bpp == 24

  .set mask_bpp_shift, 0

 .elseif mask_bpp == 8

  .set mask_bpp_shift, 0

 .elseif mask_bpp == 0

  .set mask_bpp_shift, -1

 .else

  .error "requested mask bpp (mask_bpp) is not supported"

 .endif

 .if dst_w_bpp == 32

  .set dst_bpp_shift, 2

 .elseif dst_w_bpp == 24

  .set dst_bpp_shift, 0

 .elseif dst_w_bpp == 16

  .set dst_bpp_shift, 1

 .elseif dst_w_bpp == 8

  .set dst_bpp_shift, 0

 .else

  .error "requested dst bpp (dst_w_bpp) is not supported"

 .endif

 .if (((flags) & FLAG_DST_READWRITE) != 0)

  .set dst_r_bpp, dst_w_bpp

 .else

  .set dst_r_bpp, 0

 .endif

 .set pix_per_block, 16*8/dst_w_bpp

 .if src_bpp != 0

  .if 32*8/src_bpp > pix_per_block

   .set pix_per_block, 32*8/src_bpp

  .endif

 .endif

 .if mask_bpp != 0

  .if 32*8/mask_bpp > pix_per_block

   .set pix_per_block, 32*8/mask_bpp

  .endif

 .endif

 .if dst_r_bpp != 0

  .if 32*8/dst_r_bpp > pix_per_block

   .set pix_per_block, 32*8/dst_r_bpp

  .endif

 .endif

/* The standard entry conditions set up by pixman-arm-common.h are:

 * r0 = width (pixels)

 * r1 = height (rows)

 * r2 = pointer to top-left pixel of destination

 * r3 = destination stride (pixels)

 * [sp] = source pixel value, or pointer to top-left pixel of source

 * [sp,#4] = 0 or source stride (pixels)

 * The following arguments are unused for non-mask operations

 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask

 * [sp,#12] = 0 or mask stride (pixels)

*/

/*

 * Assign symbolic names to registers

*/

    X           .req    r0  /* pixels to go on this line */

    Y           .req    r1  /* lines to go */

    DST         .req    r2  /* destination pixel pointer */

    STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */

    SRC         .req    r4  /* source pixel pointer */

    STRIDE_S    .req    r5  /* source stride (bytes, minus width) */

    MASK        .req    r6  /* mask pixel pointer (if applicable) */

    STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */

    WK0         .req    r8  /* pixel data registers */

    WK1         .req    r9

    WK2         .req    r10

    WK3         .req    r11

    SCRATCH     .req    r12

    ORIG_W      .req    r14 /* width (pixels) */

        push    {r4-r11, lr}        /* save all registers */

        subs    Y, Y, #1

        blo     199f

#ifdef DEBUG_PARAMS

        sub     sp, sp, #9*4

#endif

 .if src_bpp > 0

        ldr     SRC, [sp, #ARGS_STACK_OFFSET]

        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]

 .endif

 .if mask_bpp > 0

        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]

        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]

 .endif

#ifdef DEBUG_PARAMS

        add     Y, Y, #1

        stmia   sp, {r0-r7,pc}

        sub     Y, Y, #1

#endif

        \init

 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0

        /* Reserve a word in which to store X during leading pixels */

        sub     sp, sp, #4

  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4

  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4

 .endif

        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */

        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift

 .if src_bpp > 0

        lsl     STRIDE_S, #src_bpp_shift

        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift

 .endif

 .if mask_bpp > 0

        lsl     STRIDE_M, #mask_bpp_shift

        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift

 .endif

        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */

        cmp     X, #2*16*8/dst_w_bpp - 1

        blo     170f

 .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */

        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */

        cmp     X, #(prefetch_distance+3)*pix_per_block - 1

        blo     160f

        /* Wide case */

        /* Adjust X so that the decrement instruction can also test for

         * inner loop termination. We want it to stop when there are

         * (prefetch_distance+1) complete blocks to go. */

        sub     X, X, #(prefetch_distance+2)*pix_per_block

        mov     ORIG_W, X

  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE

        /* This is stmdb sp!,{} */

        .word   0xE92D0000 | LINE_SAVED_REGS

   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4

   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4

  .endif

151:    /* New line */

        \newline

        preload_leading_step1  src_bpp, WK1, SRC

        preload_leading_step1  mask_bpp, WK2, MASK

  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0

        preload_leading_step1  dst_r_bpp, WK3, DST

  .endif

        ands    WK0, DST, #15

        beq     154f

        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */

        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC

        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK

  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0

        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST

  .endif

        leading_15bytes  \process_head, \process_tail

154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */

  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)

        and     SCRATCH, SRC, #31

        rsb     SCRATCH, SCRATCH, #32*prefetch_distance

  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)

        and     SCRATCH, MASK, #31

        rsb     SCRATCH, SCRATCH, #32*prefetch_distance

  .endif

  .ifc "\process_inner_loop",""

        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, wide_case_inner_loop, 157f

  .else

        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, \process_inner_loop, 157f

  .endif

157:    /* Check for another line */

        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b

  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE

   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4

   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4

  .endif

 .endif

 .ltorg

160:    /* Medium case */

        mov     ORIG_W, X

 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE

        /* This is stmdb sp!,{} */

        .word   0xE92D0000 | LINE_SAVED_REGS

  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4

  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4

 .endif

161:    /* New line */

        \newline

        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */

        preload_line 0, mask_bpp, mask_bpp_shift, MASK

 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0

        preload_line 0, dst_r_bpp, dst_bpp_shift, DST

 .endif

        sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */

        ands    WK0, DST, #15

        beq     164f

        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */

        leading_15bytes  \process_head, \process_tail

164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */

        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 167f

167:    /* Check for another line */

        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b

 .ltorg

170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */

 .if dst_w_bpp < 32

        mov     ORIG_W, X

 .endif

 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE

        /* This is stmdb sp!,{} */

        .word   0xE92D0000 | LINE_SAVED_REGS

 .endif

171:    /* New line */

        \newline

        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */

        preload_line 1, mask_bpp, mask_bpp_shift, MASK

 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0

        preload_line 1, dst_r_bpp, dst_bpp_shift, DST

 .endif

 .if dst_w_bpp == 8

        tst     DST, #3

        beq     174f

172:    subs    X, X, #1

        blo     177f

        \process_head  , 1, 0, 1, 1, 0

        \process_tail  , 1, 0

  .if !((flags) & FLAG_PROCESS_DOES_STORE)

        pixst   , 1, 0, DST

  .endif

        tst     DST, #3

        bne     172b

 .elseif dst_w_bpp == 16

        tst     DST, #2

        beq     174f

        subs    X, X, #1

        blo     177f

        \process_head  , 2, 0, 1, 1, 0

        \process_tail  , 2, 0

  .if !((flags) & FLAG_PROCESS_DOES_STORE)

        pixst   , 2, 0, DST

  .endif

 .endif

174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */

        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 177f

177:    /* Check for another line */

        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one

 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE

  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4

  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4

 .endif

197:

 .if (flags) & FLAG_SPILL_LINE_VARS

        add     sp, sp, #LINE_SAVED_REG_COUNT*4

 .endif

198:

 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0

  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4

  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4

        add     sp, sp, #4

 .endif

        \cleanup

#ifdef DEBUG_PARAMS

        add     sp, sp, #9*4 /* junk the debug copy of arguments */

#endif

199:

        pop     {r4-r11, pc}  /* exit */

 .ltorg

    .unreq  X

    .unreq  Y

    .unreq  DST

    .unreq  STRIDE_D

    .unreq  SRC

    .unreq  STRIDE_S

    .unreq  MASK

    .unreq  STRIDE_M

    .unreq  WK0

    .unreq  WK1

    .unreq  WK2

    .unreq  WK3

    .unreq  SCRATCH

    .unreq  ORIG_W

    pixman_end_asm_function

.endm

.macro line_saved_regs  x:vararg

 .set LINE_SAVED_REGS, 0

 .set LINE_SAVED_REG_COUNT, 0

 .irp SAVED_REG,\x

  .ifc "SAVED_REG","Y"

   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)

   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1

  .endif

  .ifc "SAVED_REG","STRIDE_D"

   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)

   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1

  .endif

  .ifc "SAVED_REG","STRIDE_S"

   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)

   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1

  .endif

  .ifc "SAVED_REG","STRIDE_M"

   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)

   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1

  .endif

  .ifc "SAVED_REG","ORIG_W"

   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)

   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1

  .endif

 .endr

.endm

.macro nop_macro x:vararg

.endm