Blame pixman/pixman-arm-simd-asm.h

Packit 030a23
/*
Packit 030a23
 * Copyright © 2012 Raspberry Pi Foundation
Packit 030a23
 * Copyright © 2012 RISC OS Open Ltd
Packit 030a23
 *
Packit 030a23
 * Permission to use, copy, modify, distribute, and sell this software and its
Packit 030a23
 * documentation for any purpose is hereby granted without fee, provided that
Packit 030a23
 * the above copyright notice appear in all copies and that both that
Packit 030a23
 * copyright notice and this permission notice appear in supporting
Packit 030a23
 * documentation, and that the name of the copyright holders not be used in
Packit 030a23
 * advertising or publicity pertaining to distribution of the software without
Packit 030a23
 * specific, written prior permission.  The copyright holders make no
Packit 030a23
 * representations about the suitability of this software for any purpose.  It
Packit 030a23
 * is provided "as is" without express or implied warranty.
Packit 030a23
 *
Packit 030a23
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
Packit 030a23
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
Packit 030a23
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
Packit 030a23
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
Packit 030a23
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
Packit 030a23
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
Packit 030a23
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
Packit 030a23
 * SOFTWARE.
Packit 030a23
 *
Packit 030a23
 * Author:  Ben Avison (bavison@riscosopen.org)
Packit 030a23
 *
Packit 030a23
 */
Packit 030a23
Packit 030a23
/*
Packit 030a23
 * Because the alignment of pixel data to cachelines, and even the number of
Packit 030a23
 * cachelines per row can vary from row to row, and because of the need to
Packit 030a23
 * preload each scanline once and only once, this prefetch strategy treats
Packit 030a23
 * each row of pixels independently. When a pixel row is long enough, there
Packit 030a23
 * are three distinct phases of prefetch:
Packit 030a23
 * * an inner loop section, where each time a cacheline of data is
Packit 030a23
 *    processed, another cacheline is preloaded (the exact distance ahead is
Packit 030a23
 *    determined empirically using profiling results from lowlevel-blt-bench)
Packit 030a23
 * * a leading section, where enough cachelines are preloaded to ensure no
Packit 030a23
 *    cachelines escape being preloaded when the inner loop starts
Packit 030a23
 * * a trailing section, where a limited number (0 or more) of cachelines
Packit 030a23
 *    are preloaded to deal with data (if any) that hangs off the end of the
Packit 030a23
 *    last iteration of the inner loop, plus any trailing bytes that were not
Packit 030a23
 *    enough to make up one whole iteration of the inner loop
Packit 030a23
 * 
Packit 030a23
 * There are (in general) three distinct code paths, selected between
Packit 030a23
 * depending upon how long the pixel row is. If it is long enough that there
Packit 030a23
 * is at least one iteration of the inner loop (as described above) then
Packit 030a23
 * this is described as the "wide" case. If it is shorter than that, but
Packit 030a23
 * there are still enough bytes output that there is at least one 16-byte-
Packit 030a23
 * long, 16-byte-aligned write to the destination (the optimum type of
Packit 030a23
 * write), then this is the "medium" case. If it is not even this long, then
Packit 030a23
 * this is the "narrow" case, and there is no attempt to align writes to
Packit 030a23
 * 16-byte boundaries. In the "medium" and "narrow" cases, all the
Packit 030a23
 * cachelines containing data from the pixel row are prefetched up-front.
Packit 030a23
 */
Packit 030a23
Packit 030a23
/*
Packit 030a23
 * Determine whether we put the arguments on the stack for debugging.
Packit 030a23
 */
Packit 030a23
/* Parameter dumping is off by default. When DEBUG_PARAMS is defined instead,
 * the function prologue reserves nine extra stack words, stores r0-r7 and pc
 * into them, and ARGS_STACK_OFFSET grows by 9*4 to compensate. */
#undef DEBUG_PARAMS
Packit 030a23
Packit 030a23
/*
Packit 030a23
 * Bit flags for 'generate_composite_function' macro which are used
Packit 030a23
 * to tune the behavior of the generated functions.
Packit 030a23
 */
Packit 030a23
/* Bit 0: set when the destination is read as well as written. In that case
 * dst_r_bpp is made equal to dst_w_bpp and pixst stores through
 * pixst_baseupdated instead of post-incrementing DST itself. */
.set FLAG_DST_WRITEONLY,         0
Packit 030a23
.set FLAG_DST_READWRITE,         1
Packit 030a23
/* Bit 1: how conditional_process1 skips a step: by conditionally executing
 * each instruction (COND_EXEC) or by branching over unconditional code. */
.set FLAG_COND_EXEC,             0
Packit 030a23
.set FLAG_BRANCH_OVER,           2
Packit 030a23
/* Bit 2: set when the process macros clobber the condition flags, so that
 * conditional_process2 repeats its test before the second step. */
.set FLAG_PROCESS_PRESERVES_PSR, 0
Packit 030a23
.set FLAG_PROCESS_CORRUPTS_PSR,  4
Packit 030a23
/* Bit 3: set when the process macros write the destination themselves; the
 * generic code then omits its own pixst. */
.set FLAG_PROCESS_DOESNT_STORE,  0
Packit 030a23
.set FLAG_PROCESS_DOES_STORE,    8 /* usually because it needs to conditionally skip it */
Packit 030a23
/* Bits 4-5: judging by the names, these select the code paths (wide,
 * non-wide, or both) on which per-line variables are spilled to the stack;
 * the spilling code itself is outside this part of the file.
 * FLAG_SPILL_LINE_VARS is both bits together and is what end_of_line tests. */
.set FLAG_NO_SPILL_LINE_VARS,        0
Packit 030a23
.set FLAG_SPILL_LINE_VARS_WIDE,      16
Packit 030a23
.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
Packit 030a23
.set FLAG_SPILL_LINE_VARS,           48
Packit 030a23
/* Bit 6: set when the process macros leave SCRATCH intact, which lets
 * wide_case_inner_loop preload with SCRATCH as a ready-made offset. */
.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
Packit 030a23
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
Packit 030a23
/* Bit 7: set when the process macros overwrite WK0; the leading-pixel code
 * then keeps the leading byte count in X and parks X on the stack. */
.set FLAG_PROCESS_PRESERVES_WK0,     0
Packit 030a23
.set FLAG_PROCESS_CORRUPTS_WK0,      128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
Packit 030a23
/* Bit 8: set to suppress the destination preloads in the wide case. */
.set FLAG_PRELOAD_DST,               0
Packit 030a23
.set FLAG_NO_PRELOAD_DST,            256
Packit 030a23
Packit 030a23
/*
Packit 030a23
 * Number of bytes by which to adjust preload offset of destination
Packit 030a23
 * buffer (allows preload instruction to be moved before the load(s))
Packit 030a23
 */
Packit 030a23
/* Added to the dst_alignment argument that
 * wide_case_inner_loop_and_trailing_pixels passes to the inner loop, where it
 * is subtracted from the destination preload offset. 0 means no bias. */
.set DST_PRELOAD_BIAS, 0
Packit 030a23
Packit 030a23
/*
Packit 030a23
 * Offset into stack where mask and source pointer/stride can be accessed.
Packit 030a23
 */
Packit 030a23
/* 9*4 skips the nine registers pushed on entry (r4-r11 and lr). With
 * DEBUG_PARAMS a further 9*4 bytes are reserved for the parameter dump.
 * generate_composite_function adds 4 more when FLAG_PROCESS_CORRUPTS_WK0
 * makes it reserve an extra stack word. */
#ifdef DEBUG_PARAMS
Packit 030a23
.set ARGS_STACK_OFFSET,        (9*4+9*4)
Packit 030a23
#else
Packit 030a23
.set ARGS_STACK_OFFSET,        (9*4)
Packit 030a23
#endif
Packit 030a23
Packit 030a23
/*
Packit 030a23
 * Offset into stack where space allocated during init macro can be accessed.
Packit 030a23
 */
Packit 030a23
/* Starts at 0; generate_composite_function adds 4 when
 * FLAG_PROCESS_CORRUPTS_WK0 makes it reserve a stack word for X. */
.set LOCALS_STACK_OFFSET,     0
Packit 030a23
Packit 030a23
/*
Packit 030a23
 * Constants for selecting preferable prefetch type.
Packit 030a23
 */
Packit 030a23
/* generate_composite_function sets PREFETCH_TYPE_CURRENT to NONE when the
 * requested prefetch distance is 0 and to STANDARD otherwise; the PF macro
 * only emits its instruction when the current type is STANDARD. */
.set PREFETCH_TYPE_NONE,       0
Packit 030a23
.set PREFETCH_TYPE_STANDARD,   1
Packit 030a23
Packit 030a23
/*
Packit 030a23
 * Definitions of macros for load/store of pixel data.
Packit 030a23
 */
Packit 030a23
Packit 030a23
/*
 * pixldst - emit a load or store of "numbytes" bytes between memory at
 * [base] and working registers, advancing "base" by "numbytes".
 *
 * "op"        - mnemonic stem; "r"/"m" plus the condition are pasted onto it
 *               (pixld passes "ld", pixst passes "st")
 * "cond"      - condition code suffix, "al" by default
 * "numbytes"  - 16, 8, 4, 2 or 1; anything else is an assembly-time error
 * "reg0".."reg3" - numeric suffixes appended to "WK" to name the registers;
 *               only as many as the size needs are used
 * "base"      - pointer register, updated by post-indexing / writeback
 * "unaligned" - when 1, the 16- and 8-byte cases use one word transfer per
 *               register instead of a single multiple-register transfer
 *               (presumably because the pointer may not be word aligned -
 *               confirm against callers)
 */
.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
Packit 030a23
 .if numbytes == 16
Packit 030a23
  .if unaligned == 1
Packit 030a23
        op&r&cond    WK&reg0, [base], #4
Packit 030a23
        op&r&cond    WK&reg1, [base], #4
Packit 030a23
        op&r&cond    WK&reg2, [base], #4
Packit 030a23
        op&r&cond    WK&reg3, [base], #4
Packit 030a23
  .else
Packit 030a23
        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
Packit 030a23
  .endif
Packit 030a23
 .elseif numbytes == 8
Packit 030a23
  .if unaligned == 1
Packit 030a23
        op&r&cond    WK&reg0, [base], #4
Packit 030a23
        op&r&cond    WK&reg1, [base], #4
Packit 030a23
  .else
Packit 030a23
        op&m&cond&ia base!, {WK&reg0,WK&reg1}
Packit 030a23
  .endif
Packit 030a23
 .elseif numbytes == 4
Packit 030a23
        op&r&cond    WK&reg0, [base], #4
Packit 030a23
 .elseif numbytes == 2
Packit 030a23
        /* Halfword transfer: the "h" suffix follows the condition code */
        op&r&cond&h  WK&reg0, [base], #2
Packit 030a23
 .elseif numbytes == 1
Packit 030a23
        /* Byte transfer: the "b" suffix follows the condition code */
        op&r&cond&b  WK&reg0, [base], #1
Packit 030a23
 .else
Packit 030a23
  .error "unsupported size: numbytes"
Packit 030a23
 .endif
Packit 030a23
.endm
Packit 030a23
Packit 030a23
/*
 * pixst_baseupdated - store "numbytes" bytes from working registers into
 * the "numbytes" bytes that end at "base", leaving "base" unchanged (no
 * writeback). pixst selects this form when FLAG_DST_READWRITE is set.
 * NOTE(review): this looks like it relies on "base" having already been
 * advanced past the pixels by an earlier load - confirm in the process_head
 * implementations.
 *
 * "cond"      - condition code suffix (may be empty)
 * "numbytes"  - 16, 8, 4, 2 or 1; anything else is an assembly-time error
 * "reg0".."reg3" - numeric suffixes appended to "WK" to name the registers
 * "base"      - pointer register marking the end of the stored region
 */
.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
Packit 030a23
 .if numbytes == 16
Packit 030a23
        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
Packit 030a23
 .elseif numbytes == 8
Packit 030a23
        stm&cond&db base, {WK&reg0,WK&reg1}
Packit 030a23
 .elseif numbytes == 4
Packit 030a23
        str&cond    WK&reg0, [base, #-4]
Packit 030a23
 .elseif numbytes == 2
Packit 030a23
        str&cond&h  WK&reg0, [base, #-2]
Packit 030a23
 .elseif numbytes == 1
Packit 030a23
        str&cond&b  WK&reg0, [base, #-1]
Packit 030a23
 .else
Packit 030a23
  .error "unsupported size: numbytes"
Packit 030a23
 .endif
Packit 030a23
.endm
Packit 030a23
Packit 030a23
/*
 * pixld - load "numbytes" bytes from [base] into consecutive working
 * registers starting at WK<firstreg>, advancing "base". The %(...) forms
 * make the assembler pass the evaluated register numbers to pixldst.
 * "unaligned" is forwarded unchanged.
 */
.macro pixld cond, numbytes, firstreg, base, unaligned
Packit 030a23
        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
Packit 030a23
.endm
Packit 030a23
Packit 030a23
/*
 * pixst - store "numbytes" bytes from consecutive working registers
 * starting at WK<firstreg>.
 * With FLAG_DST_READWRITE the store goes to the bytes ending at "base"
 * without modifying it (pixst_baseupdated); otherwise it is a
 * post-incrementing store through pixldst with "unaligned" left at its
 * default of 0.
 */
.macro pixst cond, numbytes, firstreg, base
Packit 030a23
 .if (flags) & FLAG_DST_READWRITE
Packit 030a23
        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
Packit 030a23
 .else
Packit 030a23
        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
Packit 030a23
 .endif
Packit 030a23
.endm
Packit 030a23
Packit 030a23
/*
 * PF - prefix for instructions that exist only to support prefetching.
 * "a" is the mnemonic and "x" the rest of the operand list; the instruction
 * is emitted only when PREFETCH_TYPE_CURRENT is PREFETCH_TYPE_STANDARD and
 * otherwise expands to nothing.
 */
.macro PF a, x:vararg
Packit 030a23
 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
Packit 030a23
        a x
Packit 030a23
 .endif
Packit 030a23
.endm
Packit 030a23
Packit 030a23
Packit 030a23
/*
 * preload_leading_step1 - first part of the leading preload for one channel.
 * Sets "ptr" to "base" rounded down to a 32-byte boundary and issues
 * prefetch_distance+1 preloads at ptr, ptr+32, ... Expands to nothing when
 * "bpp" is 0. Uses the assembler symbol OFFSET as a loop counter.
 */
.macro preload_leading_step1  bpp, ptr, base
Packit 030a23
/* If the destination is already 16-byte aligned, then we need to preload
Packit 030a23
 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
Packit 030a23
 * are no gaps when the inner loop starts.
Packit 030a23
 */
Packit 030a23
 .if bpp > 0
Packit 030a23
        PF  bic,    ptr, base, #31
Packit 030a23
  .set OFFSET, 0
Packit 030a23
  .rept prefetch_distance+1
Packit 030a23
        PF  pld,    [ptr, #OFFSET]
Packit 030a23
   .set OFFSET, OFFSET+32
Packit 030a23
  .endr
Packit 030a23
 .endif
Packit 030a23
.endm
Packit 030a23
Packit 030a23
/*
 * preload_leading_step2 - second part of the leading preload for one
 * channel: issues zero, one or (only when the channel is four times as wide
 * as the destination) two further preloads beyond those of step 1.
 *
 * "bpp"/"bpp_shift" - bits per pixel of the channel and log2 of its bytes
 *                     per pixel; nothing is emitted when "bpp" is 0
 * "ptr"  - register used as the base of the preload addresses (presumably
 *          the 32-byte-aligned pointer left by preload_leading_step1 -
 *          confirm at the call site)
 * "base" - channel pointer register (SRC, MASK or DST)
 *
 * WK0 is read as the number of leading destination bytes. SCRATCH and the
 * condition flags are overwritten. Local labels 60 and 61 are defined.
 */
.macro preload_leading_step2  bpp, bpp_shift, ptr, base
Packit 030a23
/* However, if the destination is not 16-byte aligned, we may need to
Packit 030a23
 * preload more cache lines than that. The question we need to ask is:
Packit 030a23
 * are the bytes corresponding to the leading pixels more than the amount
Packit 030a23
 * by which the source pointer will be rounded down for preloading, and if
Packit 030a23
 * so, by how many cache lines? Effectively, we want to calculate
Packit 030a23
 *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
Packit 030a23
 *     inner_loop_offset = (src+leading_bytes)&31
Packit 030a23
 *     extra_needed = leading_bytes - inner_loop_offset
Packit 030a23
 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
Packit 030a23
 * possible when there are 4 src bytes for every 1 dst byte).
Packit 030a23
 */
Packit 030a23
 .if bpp > 0
Packit 030a23
  .ifc base,DST
Packit 030a23
        /* The test can be simplified further when preloading the destination */
Packit 030a23
        PF  tst,    base, #16
Packit 030a23
        PF  beq,    61f
Packit 030a23
  .else
Packit 030a23
   .if bpp/dst_w_bpp == 4
Packit 030a23
        /* Shift converts destination bytes in WK0 into bytes of this channel */
        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
Packit 030a23
        PF  and,    SCRATCH, SCRATCH, #31
Packit 030a23
        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
Packit 030a23
        PF  sub,    SCRATCH, SCRATCH, #1        /* so now ranges are -16..-1 / 0..31 / 32..63 */
Packit 030a23
        PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets         NC   /  nc   /   Nc   */
Packit 030a23
        PF  bcs,    61f
Packit 030a23
        PF  bpl,    60f
Packit 030a23
        /* Two extra lines needed: this one, then fall through to label 60 */
        PF  pld,    [ptr, #32*(prefetch_distance+2)]
Packit 030a23
   .else
Packit 030a23
        /* Work in the top five bits so only the values modulo 32 survive */
        PF  mov,    SCRATCH, base, lsl #32-5
Packit 030a23
        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
Packit 030a23
        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
Packit 030a23
        PF  bls,    61f
Packit 030a23
   .endif
Packit 030a23
  .endif
Packit 030a23
60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
Packit 030a23
61:
Packit 030a23
 .endif
Packit 030a23
.endm
Packit 030a23
Packit 030a23
/*
 * IS_END_OF_GROUP(INDEX,SIZE) is non-zero when SIZE is less than 2, or when
 * bit SIZE/2 belongs to the run of trailing 1 bits of INDEX. For a
 * power-of-two SIZE that means INDEX is the last member of its group of
 * SIZE consecutive indices.
 */
#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
Packit 030a23
/*
 * preload_middle - inner-loop preload for one channel, emitted only on the
 * sub-blocks (as counted by the assembler symbol SUBBLOCK) selected by
 * IS_END_OF_GROUP. Expands to nothing when "bpp" is 0.
 *
 * "base" - channel pointer register
 * "scratch_holds_offset" - when non-zero, SCRATCH is used as-is as the
 *     register offset from "base" (its value must be set up elsewhere);
 *     when zero, SCRATCH is overwritten with "base" rounded down to 32
 *     bytes and the preload is 32*prefetch_distance bytes beyond that.
 */
.macro preload_middle   bpp, base, scratch_holds_offset
Packit 030a23
 .if bpp > 0
Packit 030a23
        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
Packit 030a23
  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
Packit 030a23
   .if scratch_holds_offset
Packit 030a23
        PF  pld,    [base, SCRATCH]
Packit 030a23
   .else
Packit 030a23
        PF  bic,    SCRATCH, base, #31
Packit 030a23
        PF  pld,    [SCRATCH, #32*prefetch_distance]
Packit 030a23
   .endif
Packit 030a23
  .endif
Packit 030a23
 .endif
Packit 030a23
.endm
Packit 030a23
Packit 030a23
/*
 * preload_trailing - issue the final preloads for one channel once the wide
 * inner loop has finished. Expands to nothing when "bpp" is 0.
 *
 * "bpp"/"bpp_shift" - bits per pixel of the channel and log2 of its bytes
 *                     per pixel
 * "base" - channel pointer register
 *
 * When the channel needs more than one 32-byte fetch per block, a loop
 * (label 80) is emitted; it reads WK0 (set by the caller to X modulo
 * pix_per_block) and overwrites WK1 and SCRATCH. Otherwise 0, 1 or 2
 * preloads are issued from X and the low bits of "base", overwriting
 * SCRATCH only. The condition flags are overwritten in both variants.
 */
.macro preload_trailing  bpp, bpp_shift, base
Packit 030a23
 .if bpp > 0
Packit 030a23
  .if bpp*pix_per_block > 256
Packit 030a23
        /* Calculations are more complex if more than one fetch per block */
Packit 030a23
        PF  and,    WK1, base, #31
Packit 030a23
        PF  add,    WK1, WK1, WK0, lsl #bpp_shift
Packit 030a23
        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
Packit 030a23
        PF  bic,    SCRATCH, base, #31
Packit 030a23
        /* One preload per 32 bytes of the count accumulated in WK1 */
80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
Packit 030a23
        PF  add,    SCRATCH, SCRATCH, #32
Packit 030a23
        PF  subs,   WK1, WK1, #32
Packit 030a23
        PF  bhi,    80b
Packit 030a23
  .else
Packit 030a23
        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
Packit 030a23
        PF  mov,    SCRATCH, base, lsl #32-5
Packit 030a23
        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
Packit 030a23
        PF  adceqs, SCRATCH, SCRATCH, #0
Packit 030a23
        /* The instruction above has two effects: ensures Z is only
Packit 030a23
         * set if C was clear (so Z indicates that both shifted quantities
Packit 030a23
         * were 0), and clears C if Z was set (so C indicates that the sum
Packit 030a23
         * of the shifted quantities was greater and not equal to 32) */
Packit 030a23
        PF  beq,    82f
Packit 030a23
        PF  bic,    SCRATCH, base, #31
Packit 030a23
        PF  bcc,    81f
Packit 030a23
        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
Packit 030a23
81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
Packit 030a23
82:
Packit 030a23
  .endif
Packit 030a23
 .endif
Packit 030a23
.endm
Packit 030a23
Packit 030a23
Packit 030a23
/*
 * preload_line - preload every 32-byte cache line touched by the X pixels
 * of one channel that start at "base". Overwrites WK0, WK1 and the
 * condition flags; expands to nothing when "bpp" is 0.
 */
.macro preload_line    narrow_case, bpp, bpp_shift, base
Packit 030a23
/* "narrow_case" - just means that the macro was invoked from the "narrow"
Packit 030a23
 *    code path rather than the "medium" one - because in the narrow case,
Packit 030a23
 *    the row of pixels is known to output no more than 30 bytes, then
Packit 030a23
 *    (assuming the source pixels are no wider than the destination
Packit 030a23
 *    pixels) they cannot possibly straddle more than 2 32-byte cachelines,
Packit 030a23
 *    meaning there's no need for a loop.
Packit 030a23
 * "bpp" - number of bits per pixel in the channel (source, mask or
Packit 030a23
 *    destination) that's being preloaded, or 0 if this channel is not used
Packit 030a23
 *    for reading
Packit 030a23
 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
Packit 030a23
 * "base" - base address register of channel to preload (SRC, MASK or DST)
Packit 030a23
 */
Packit 030a23
 .if bpp > 0
Packit 030a23
  .if narrow_case && (bpp <= dst_w_bpp)
Packit 030a23
        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
Packit 030a23
        PF  bic,    WK0, base, #31
Packit 030a23
        PF  pld,    [WK0]
Packit 030a23
        /* WK1 = cache line holding the last byte of the row */
        PF  add,    WK1, base, X, LSL #bpp_shift
Packit 030a23
        PF  sub,    WK1, WK1, #1
Packit 030a23
        PF  bic,    WK1, WK1, #31
Packit 030a23
        PF  cmp,    WK1, WK0
Packit 030a23
        PF  beq,    90f
Packit 030a23
        PF  pld,    [WK1]
Packit 030a23
90:
Packit 030a23
  .else
Packit 030a23
        PF  bic,    WK0, base, #31
Packit 030a23
        PF  pld,    [WK0]
Packit 030a23
        /* WK1 = cache line holding the last byte of the row */
        PF  add,    WK1, base, X, lsl #bpp_shift
Packit 030a23
        PF  sub,    WK1, WK1, #1
Packit 030a23
        PF  bic,    WK1, WK1, #31
Packit 030a23
        PF  cmp,    WK1, WK0
Packit 030a23
        PF  beq,    92f
Packit 030a23
        /* Step WK0 one line at a time until it reaches the last line;
         * pld does not alter the flags set by the cmp before it */
91:     PF  add,    WK0, WK0, #32
Packit 030a23
        PF  cmp,    WK0, WK1
Packit 030a23
        PF  pld,    [WK0]
Packit 030a23
        PF  bne,    91b
Packit 030a23
92:
Packit 030a23
  .endif
Packit 030a23
 .endif
Packit 030a23
.endm
Packit 030a23
Packit 030a23
Packit 030a23
/*
 * conditional_process1_helper - emit one head / tail / store sequence for
 * "numbytes" bytes of destination, with every generated instruction
 * carrying the condition suffix "cond" (which may be empty).
 *
 * "process_head", "process_tail" - names of the operation-specific macros
 * "firstreg"    - index of the first WK register to use
 * "unaligned_src", "unaligned_mask" - forwarded to process_head
 * "decrementx"  - when non-zero, X is reduced by the number of pixels in
 *                 "numbytes" bytes between the head and the tail
 * The generic store is omitted when FLAG_PROCESS_DOES_STORE is set.
 */
.macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
Packit 030a23
        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
Packit 030a23
 .if decrementx
Packit 030a23
        sub&cond X, X, #8*numbytes/dst_w_bpp
Packit 030a23
 .endif
Packit 030a23
        process_tail  cond, numbytes, firstreg
Packit 030a23
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
Packit 030a23
        pixst   cond, numbytes, firstreg, DST
Packit 030a23
 .endif
Packit 030a23
.endm
Packit 030a23
Packit 030a23
/*
 * conditional_process1 - process "numbytes" bytes only if condition "cond"
 * holds on entry.
 *
 * With FLAG_BRANCH_OVER, a branch on the opposite condition to local label
 * 100 is emitted and the body is generated without a condition suffix. Only
 * "mi", "cs" and "ne" are recognised here; any other value of "cond"
 * produces no branch at all. Without the flag, the body is generated with
 * "cond" applied to every instruction. The remaining arguments are passed
 * straight to conditional_process1_helper.
 */
.macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
Packit 030a23
 .if (flags) & FLAG_BRANCH_OVER
Packit 030a23
  .ifc cond,mi
Packit 030a23
        bpl     100f
Packit 030a23
  .endif
Packit 030a23
  .ifc cond,cs
Packit 030a23
        bcc     100f
Packit 030a23
  .endif
Packit 030a23
  .ifc cond,ne
Packit 030a23
        beq     100f
Packit 030a23
  .endif
Packit 030a23
        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
Packit 030a23
100:
Packit 030a23
 .else
Packit 030a23
        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
Packit 030a23
 .endif
Packit 030a23
.endm
Packit 030a23
Packit 030a23
/*
 * conditional_process2 - run the flag-setting macro "test", then process
 * "numbytes1" bytes under "cond1" and "numbytes2" bytes under "cond2".
 *
 * If any of FLAG_DST_READWRITE, FLAG_BRANCH_OVER, FLAG_PROCESS_CORRUPTS_PSR
 * or FLAG_PROCESS_DOES_STORE is set, the two steps are emitted one after
 * the other through conditional_process1, and "test" is repeated before the
 * second step when the process macros corrupt the flags. Otherwise the two
 * heads, the two X adjustments, the two tails and the two stores are
 * interleaved.
 *
 * "firstreg1"/"firstreg2" - first WK register index for each step
 * "decrementx" - when non-zero, X is reduced by the pixels of each step
 */
.macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
Packit 030a23
 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
Packit 030a23
        /* Can't interleave reads and writes */
Packit 030a23
        test
Packit 030a23
        conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
Packit 030a23
  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
Packit 030a23
        test
Packit 030a23
  .endif
Packit 030a23
        conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
Packit 030a23
 .else
Packit 030a23
        /* Can interleave reads and writes for better scheduling */
Packit 030a23
        test
Packit 030a23
        process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
Packit 030a23
        process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
Packit 030a23
  .if decrementx
Packit 030a23
        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
Packit 030a23
        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
Packit 030a23
  .endif
Packit 030a23
        process_tail  cond1, numbytes1, firstreg1
Packit 030a23
        process_tail  cond2, numbytes2, firstreg2
Packit 030a23
        pixst   cond1, numbytes1, firstreg1, DST
Packit 030a23
        pixst   cond2, numbytes2, firstreg2, DST
Packit 030a23
 .endif
Packit 030a23
.endm
Packit 030a23
Packit 030a23
Packit 030a23
/*
 * test_bits_1_0_ptr - set C to bit 1 and N to bit 0 of the leading byte
 * count. The count is read from X when FLAG_PROCESS_CORRUPTS_WK0 is set
 * (leading_15bytes copies it there) and from WK0 otherwise. SCRATCH is
 * overwritten with the shifted value.
 */
.macro test_bits_1_0_ptr
Packit 030a23
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
Packit 030a23
        movs    SCRATCH, X, lsl #32-1  /* C,N = bits 1,0 of DST */
Packit 030a23
 .else
Packit 030a23
        movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
Packit 030a23
 .endif
Packit 030a23
.endm
Packit 030a23
Packit 030a23
/*
 * test_bits_3_2_ptr - set C to bit 3 and N to bit 2 of the leading byte
 * count. The count is read from X when FLAG_PROCESS_CORRUPTS_WK0 is set
 * (leading_15bytes copies it there) and from WK0 otherwise. SCRATCH is
 * overwritten with the shifted value.
 */
.macro test_bits_3_2_ptr
Packit 030a23
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
Packit 030a23
        movs    SCRATCH, X, lsl #32-3  /* C,N = bits 3, 2 of DST */
Packit 030a23
 .else
Packit 030a23
        movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3, 2 of DST */
Packit 030a23
 .endif
Packit 030a23
.endm
Packit 030a23
Packit 030a23
/*
 * leading_15bytes - process the up-to-15 destination bytes that precede the
 * first 16-byte boundary, in steps of 1, 2, 4 and 8 bytes chosen by the bits
 * of the leading byte count (the 1- and 2-byte steps exist only for the
 * destination depths that need them).
 *
 * Normally each step decrements X itself (DECREMENT_X = 1). With
 * FLAG_PROCESS_CORRUPTS_WK0, X is reduced by the whole leading pixel count
 * up front, saved at [sp, #LINE_SAVED_REG_COUNT*4], temporarily replaced by
 * the byte count from WK0 for the bit tests, and reloaded at the end.
 */
.macro leading_15bytes  process_head, process_tail
Packit 030a23
        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
Packit 030a23
 .set DECREMENT_X, 1
Packit 030a23
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
Packit 030a23
  .set DECREMENT_X, 0
Packit 030a23
        sub     X, X, WK0, lsr #dst_bpp_shift
Packit 030a23
        str     X, [sp, #LINE_SAVED_REG_COUNT*4]
Packit 030a23
        mov     X, WK0
Packit 030a23
 .endif
Packit 030a23
        /* Use unaligned loads in all cases for simplicity */
Packit 030a23
 .if dst_w_bpp == 8
Packit 030a23
        /* bit 0 -> 1 byte in WK1, bit 1 -> 2 bytes in WK2 */
        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
Packit 030a23
 .elseif dst_w_bpp == 16
Packit 030a23
        /* only bit 1 can be set: 2 bytes in WK2 */
        test_bits_1_0_ptr
Packit 030a23
        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
Packit 030a23
 .endif
Packit 030a23
        /* bit 2 -> 4 bytes in WK1, bit 3 -> 8 bytes in WK2-WK3 */
        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
Packit 030a23
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
Packit 030a23
        ldr     X, [sp, #LINE_SAVED_REG_COUNT*4]
Packit 030a23
 .endif
Packit 030a23
.endm
Packit 030a23
Packit 030a23
/*
 * test_bits_3_2_pix - the extra dst_bpp_shift in the shift amount scales the
 * pixel count in X to bytes, so C receives bit 3 and N bit 2 of the
 * remaining byte count. SCRATCH is overwritten.
 */
.macro test_bits_3_2_pix
Packit 030a23
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
Packit 030a23
.endm
Packit 030a23
Packit 030a23
/*
 * test_bits_1_0_pix - test the lowest bits of the pixel count in X.
 * For an 8 bpp destination, C receives bit 1 and N bit 0. For any other
 * depth, X is shifted right by one so C receives bit 0 (trailing_15bytes
 * only uses this for 16 bpp, where that bit stands for one 2-byte pixel).
 * SCRATCH is overwritten.
 */
.macro test_bits_1_0_pix
Packit 030a23
 .if dst_w_bpp == 8
Packit 030a23
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
Packit 030a23
 .else
Packit 030a23
        movs    SCRATCH, X, lsr #1
Packit 030a23
 .endif
Packit 030a23
.endm
Packit 030a23
Packit 030a23
/*
 * trailing_15bytes - process the final up-to-15 destination bytes of a row
 * in steps of 8, 4, 2 and 1 bytes, selected by the low bits of the pixel
 * count in X. X is not decremented (the last argument of each step is 0).
 * "unaligned_src" and "unaligned_mask" are forwarded to process_head.
 */
.macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
Packit 030a23
        /* 8 bytes in WK0-WK1 if C set, 4 bytes in WK2 if N set */
        conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
Packit 030a23
 .if dst_w_bpp == 16
Packit 030a23
        test_bits_1_0_pix
Packit 030a23
        conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
Packit 030a23
 .elseif dst_w_bpp == 8
Packit 030a23
        conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
Packit 030a23
 .endif
Packit 030a23
.endm
Packit 030a23
Packit 030a23
Packit 030a23
/*
 * wide_case_inner_loop - default inner loop for the "wide" case. Each
 * iteration (label 110) handles pix_per_block pixels as a series of
 * 16-byte destination writes, interleaving the channel preloads between
 * process_head and process_tail, and repeats while the subtraction of
 * pix_per_block from X does not borrow.
 *
 * "unaligned_src", "unaligned_mask" - forwarded to process_head
 * "dst_alignment" - subtracted from the destination preload offset
 *
 * If exactly one of source and mask is in use and the process macros
 * preserve SCRATCH, the preload uses SCRATCH as a precomputed offset;
 * otherwise preload_middle recomputes the address (overwriting SCRATCH)
 * for both channels.
 */
.macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
Packit 030a23
110:
Packit 030a23
 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
Packit 030a23
 .rept pix_per_block*dst_w_bpp/128
Packit 030a23
        process_head  , 16, 0, unaligned_src, unaligned_mask, 1
Packit 030a23
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
Packit 030a23
        preload_middle  src_bpp, SRC, 1
Packit 030a23
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
Packit 030a23
        preload_middle  mask_bpp, MASK, 1
Packit 030a23
  .else
Packit 030a23
        preload_middle  src_bpp, SRC, 0
Packit 030a23
        preload_middle  mask_bpp, MASK, 0
Packit 030a23
  .endif
Packit 030a23
  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)
Packit 030a23
        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
Packit 030a23
         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
Packit 030a23
         * preloads for, to achieve staggered prefetches for multiple channels, because there are
Packit 030a23
         * always two STMs per prefetch, so there is always an opposite STM on which to put the
Packit 030a23
         * preload. Note, no need to BIC the base register here */
Packit 030a23
        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
Packit 030a23
  .endif
Packit 030a23
        process_tail  , 16, 0
Packit 030a23
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
Packit 030a23
        pixst   , 16, 0, DST
Packit 030a23
  .endif
Packit 030a23
  .set SUBBLOCK, SUBBLOCK+1
Packit 030a23
 .endr
Packit 030a23
        subs    X, X, #pix_per_block
Packit 030a23
        bhs     110b
Packit 030a23
.endm
Packit 030a23
Packit 030a23
/*
 * wide_case_inner_loop_and_trailing_pixels - body of the "wide" case after
 * the leading pixels: the preloading inner loop, the trailing preloads, and
 * then the rest of the row via the medium-case code.
 *
 * "process_inner_loop" - name of the inner-loop macro to expand
 * "exit_label" - where to branch when no trailing pixels remain
 *
 * When the destination is read, two copies of the inner loop are emitted
 * and chosen between by bit 4 of DST; they differ only in the dst_alignment
 * value passed (16 or 0, plus DST_PRELOAD_BIAS).
 */
.macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
Packit 030a23
        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
Packit 030a23
 .if dst_r_bpp > 0
Packit 030a23
        tst     DST, #16
Packit 030a23
        bne     111f
Packit 030a23
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
Packit 030a23
        b       112f
Packit 030a23
111:
Packit 030a23
 .endif
Packit 030a23
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
Packit 030a23
112:
Packit 030a23
        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
Packit 030a23
 .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
Packit 030a23
        /* WK0 = X modulo pix_per_block, consumed by the looping form of preload_trailing */
        PF  and,    WK0, X, #pix_per_block-1
Packit 030a23
 .endif
Packit 030a23
        preload_trailing  src_bpp, src_bpp_shift, SRC
Packit 030a23
        preload_trailing  mask_bpp, mask_bpp_shift, MASK
Packit 030a23
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
Packit 030a23
        preload_trailing  dst_r_bpp, dst_bpp_shift, DST
Packit 030a23
 .endif
Packit 030a23
        /* Re-bias X for the 16-byte loop below. NOTE(review): this implies X
         * was biased downwards before the inner loop; that code is not in
         * this macro - confirm at the call site. */
        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
Packit 030a23
        /* The remainder of the line is handled identically to the medium case */
Packit 030a23
        medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
Packit 030a23
.endm
Packit 030a23
Packit 030a23
/*
 * medium_case_inner_loop_and_trailing_pixels - process the row 16
 * destination bytes at a time with no preloading (label 120), looping while
 * the subtraction from X does not borrow, then handle any trailing pixels.
 *
 * "unused"     - placeholder so the argument list matches the wide-case macro
 * "exit_label" - branched to when no partial block remains
 * "unaligned_src", "unaligned_mask" - forwarded to process_head
 */
.macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
Packit 030a23
120:
Packit 030a23
        process_head  , 16, 0, unaligned_src, unaligned_mask, 0
Packit 030a23
        process_tail  , 16, 0
Packit 030a23
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
Packit 030a23
        pixst   , 16, 0, DST
Packit 030a23
 .endif
Packit 030a23
        subs    X, X, #128/dst_w_bpp
Packit 030a23
        bhs     120b
Packit 030a23
        /* Trailing pixels */
Packit 030a23
        /* Only the low bits of X (the pixels short of 16 bytes) matter here */
        tst     X, #128/dst_w_bpp - 1
Packit 030a23
        beq     exit_label
Packit 030a23
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
Packit 030a23
.endm
Packit 030a23
Packit 030a23
/*
 * narrow_case_inner_loop_and_trailing_pixels - process a short row without
 * aligning writes: one 16-byte step if the corresponding bit of X is set,
 * then the trailing steps unconditionally. "unused" and "exit_label" are
 * accepted for signature compatibility with the other cases but are not
 * referenced.
 */
.macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
Packit 030a23
        /* Test the bit of X that stands for one 16-byte group of pixels */
        tst     X, #16*8/dst_w_bpp
Packit 030a23
        conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
Packit 030a23
        /* Trailing pixels */
Packit 030a23
        /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
Packit 030a23
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
Packit 030a23
.endm
Packit 030a23
Packit 030a23
/*
 * switch_on_alignment - expand "action" once for each combination of
 * source / mask word alignment that can occur, and dispatch to the right
 * copy at run time.
 *
 * Alignment is only tested for channels of 8 or 16 bpp, by checking the low
 * two bits of SRC / MASK. "action" receives the four leading arguments
 * unchanged, followed by unaligned_src and unaligned_mask (each 0 or 1).
 * Every copy but the last is followed by a branch to "exit_label"; the last
 * one falls through. Local labels 140, 141 and 142 are defined as needed.
 */
.macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label
Packit 030a23
 /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
Packit 030a23
 .if mask_bpp == 8 || mask_bpp == 16
Packit 030a23
        tst     MASK, #3
Packit 030a23
        bne     141f
Packit 030a23
 .endif
Packit 030a23
  .if src_bpp == 8 || src_bpp == 16
Packit 030a23
        tst     SRC, #3
Packit 030a23
        bne     140f
Packit 030a23
  .endif
Packit 030a23
        /* source aligned, mask aligned */
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
Packit 030a23
  .if src_bpp == 8 || src_bpp == 16
Packit 030a23
        b       exit_label
Packit 030a23
140:
Packit 030a23
        /* source unaligned, mask aligned */
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
Packit 030a23
  .endif
Packit 030a23
 .if mask_bpp == 8 || mask_bpp == 16
Packit 030a23
        b       exit_label
Packit 030a23
141:
Packit 030a23
  .if src_bpp == 8 || src_bpp == 16
Packit 030a23
        tst     SRC, #3
Packit 030a23
        bne     142f
Packit 030a23
  .endif
Packit 030a23
        /* source aligned, mask unaligned */
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
Packit 030a23
  .if src_bpp == 8 || src_bpp == 16
Packit 030a23
        b       exit_label
Packit 030a23
142:
Packit 030a23
        /* source unaligned, mask unaligned */
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
Packit 030a23
  .endif
Packit 030a23
 .endif
Packit 030a23
.endm
Packit 030a23
Packit 030a23
Packit 030a23
/*
 * end_of_line - per-row epilogue: count down Y, step the channel pointers
 * to the next row and loop back to "loop_label" while rows remain.
 *
 * "restore_x"    - when non-zero, X is reloaded from ORIG_W
 * "vars_spilled" - when non-zero, the registers in LINE_SAVED_REGS are first
 *                  reloaded from the stack with a hand-encoded LDMIA, and the
 *                  decremented Y is written back to [sp] if r1 is in the set
 * "loop_label"   - start of the per-row code
 * "last_one"     - if blank, a branch to label 197 (vars spilled) or 198
 *                  (not spilled) follows the loop branch; if non-blank, a
 *                  branch to 198 is emitted only when vars are not spilled
 *                  here but the function has some FLAG_SPILL_LINE_VARS bit
 *                  set. Labels 197 and 198 are defined outside this macro.
 *
 * Only the SUBS updates the condition flags, so the BHS at the end still
 * sees its result.
 */
.macro end_of_line      restore_x, vars_spilled, loop_label, last_one
Packit 030a23
 .if vars_spilled
Packit 030a23
        /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive? */
Packit 030a23
        /* This is ldmia sp,{} */
Packit 030a23
        .word   0xE89D0000 | LINE_SAVED_REGS
Packit 030a23
 .endif
Packit 030a23
        subs    Y, Y, #1
Packit 030a23
 .if vars_spilled
Packit 030a23
  .if (LINE_SAVED_REGS) & (1<<1)
Packit 030a23
        str     Y, [sp]
Packit 030a23
  .endif
Packit 030a23
 .endif
Packit 030a23
        add     DST, DST, STRIDE_D
Packit 030a23
 .if src_bpp > 0
Packit 030a23
        add     SRC, SRC, STRIDE_S
Packit 030a23
 .endif
Packit 030a23
 .if mask_bpp > 0
Packit 030a23
        add     MASK, MASK, STRIDE_M
Packit 030a23
 .endif
Packit 030a23
 .if restore_x
Packit 030a23
        mov     X, ORIG_W
Packit 030a23
 .endif
Packit 030a23
        bhs     loop_label
Packit 030a23
 .ifc "last_one",""
Packit 030a23
  .if vars_spilled
Packit 030a23
        b       197f
Packit 030a23
  .else
Packit 030a23
        b       198f
Packit 030a23
  .endif
Packit 030a23
 .else
Packit 030a23
  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
Packit 030a23
        b       198f
Packit 030a23
  .endif
Packit 030a23
 .endif
Packit 030a23
.endm
Packit 030a23
Packit 030a23
Packit 030a23
.macro generate_composite_function fname, \
Packit 030a23
                                   src_bpp_, \
Packit 030a23
                                   mask_bpp_, \
Packit 030a23
                                   dst_w_bpp_, \
Packit 030a23
                                   flags_, \
Packit 030a23
                                   prefetch_distance_, \
Packit 030a23
                                   init, \
Packit 030a23
                                   newline, \
Packit 030a23
                                   cleanup, \
Packit 030a23
                                   process_head, \
Packit 030a23
                                   process_tail, \
Packit 030a23
                                   process_inner_loop
Packit 030a23
Packit 030a23
    pixman_asm_function fname
Packit 030a23
Packit 030a23
/*
Packit 030a23
 * Make some macro arguments globally visible and accessible
Packit 030a23
 * from other macros
Packit 030a23
 */
Packit 030a23
 .set src_bpp, src_bpp_
Packit 030a23
 .set mask_bpp, mask_bpp_
Packit 030a23
 .set dst_w_bpp, dst_w_bpp_
Packit 030a23
 .set flags, flags_
Packit 030a23
 .set prefetch_distance, prefetch_distance_
Packit 030a23
Packit 030a23
/*
Packit 030a23
 * Select prefetch type for this function.
Packit 030a23
 */
Packit 030a23
 .if prefetch_distance == 0
Packit 030a23
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
Packit 030a23
 .else
Packit 030a23
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
Packit 030a23
 .endif
Packit 030a23
Packit 030a23
 .if src_bpp == 32
Packit 030a23
  .set src_bpp_shift, 2
Packit 030a23
 .elseif src_bpp == 24
Packit 030a23
  .set src_bpp_shift, 0
Packit 030a23
 .elseif src_bpp == 16
Packit 030a23
  .set src_bpp_shift, 1
Packit 030a23
 .elseif src_bpp == 8
Packit 030a23
  .set src_bpp_shift, 0
Packit 030a23
 .elseif src_bpp == 0
Packit 030a23
  .set src_bpp_shift, -1
Packit 030a23
 .else
Packit 030a23
  .error "requested src bpp (src_bpp) is not supported"
Packit 030a23
 .endif
Packit 030a23
Packit 030a23
 .if mask_bpp == 32
Packit 030a23
  .set mask_bpp_shift, 2
Packit 030a23
 .elseif mask_bpp == 24
Packit 030a23
  .set mask_bpp_shift, 0
Packit 030a23
 .elseif mask_bpp == 8
Packit 030a23
  .set mask_bpp_shift, 0
Packit 030a23
 .elseif mask_bpp == 0
Packit 030a23
  .set mask_bpp_shift, -1
Packit 030a23
 .else
Packit 030a23
  .error "requested mask bpp (mask_bpp) is not supported"
Packit 030a23
 .endif
Packit 030a23
Packit 030a23
 .if dst_w_bpp == 32
Packit 030a23
  .set dst_bpp_shift, 2
Packit 030a23
 .elseif dst_w_bpp == 24
Packit 030a23
  .set dst_bpp_shift, 0
Packit 030a23
 .elseif dst_w_bpp == 16
Packit 030a23
  .set dst_bpp_shift, 1
Packit 030a23
 .elseif dst_w_bpp == 8
Packit 030a23
  .set dst_bpp_shift, 0
Packit 030a23
 .else
Packit 030a23
  .error "requested dst bpp (dst_w_bpp) is not supported"
Packit 030a23
 .endif
Packit 030a23
Packit 030a23
 .if (((flags) & FLAG_DST_READWRITE) != 0)
Packit 030a23
  .set dst_r_bpp, dst_w_bpp
Packit 030a23
 .else
Packit 030a23
  .set dst_r_bpp, 0
Packit 030a23
 .endif
Packit 030a23
Packit 030a23
 .set pix_per_block, 16*8/dst_w_bpp
Packit 030a23
 .if src_bpp != 0
Packit 030a23
  .if 32*8/src_bpp > pix_per_block
Packit 030a23
   .set pix_per_block, 32*8/src_bpp
Packit 030a23
  .endif
Packit 030a23
 .endif
Packit 030a23
 .if mask_bpp != 0
Packit 030a23
  .if 32*8/mask_bpp > pix_per_block
Packit 030a23
   .set pix_per_block, 32*8/mask_bpp
Packit 030a23
  .endif
Packit 030a23
 .endif
Packit 030a23
 .if dst_r_bpp != 0
Packit 030a23
  .if 32*8/dst_r_bpp > pix_per_block
Packit 030a23
   .set pix_per_block, 32*8/dst_r_bpp
Packit 030a23
  .endif
Packit 030a23
 .endif
Packit 030a23
Packit 030a23
/* The standard entry conditions set up by pixman-arm-common.h are:
Packit 030a23
 * r0 = width (pixels)
Packit 030a23
 * r1 = height (rows)
Packit 030a23
 * r2 = pointer to top-left pixel of destination
Packit 030a23
 * r3 = destination stride (pixels)
Packit 030a23
 * [sp] = source pixel value, or pointer to top-left pixel of source
Packit 030a23
 * [sp,#4] = 0 or source stride (pixels)
Packit 030a23
 * The following arguments are unused for non-mask operations
Packit 030a23
 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
Packit 030a23
 * [sp,#12] = 0 or mask stride (pixels)
Packit 030a23
 */
Packit 030a23
Packit 030a23
/*
Packit 030a23
 * Assign symbolic names to registers
Packit 030a23
 */
Packit 030a23
    X           .req    r0  /* pixels to go on this line */
Packit 030a23
    Y           .req    r1  /* lines to go */
Packit 030a23
    DST         .req    r2  /* destination pixel pointer */
Packit 030a23
    STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
Packit 030a23
    SRC         .req    r4  /* source pixel pointer */
Packit 030a23
    STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
Packit 030a23
    MASK        .req    r6  /* mask pixel pointer (if applicable) */
Packit 030a23
    STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
Packit 030a23
    WK0         .req    r8  /* pixel data registers */
Packit 030a23
    WK1         .req    r9
Packit 030a23
    WK2         .req    r10
Packit 030a23
    WK3         .req    r11
Packit 030a23
    SCRATCH     .req    r12
Packit 030a23
    ORIG_W      .req    r14 /* width (pixels) */
Packit 030a23
Packit 030a23
        push    {r4-r11, lr}        /* save all registers */
Packit 030a23
Packit 030a23
        subs    Y, Y, #1
Packit 030a23
        blo     199f
Packit 030a23
Packit 030a23
#ifdef DEBUG_PARAMS
Packit 030a23
        sub     sp, sp, #9*4
Packit 030a23
#endif
Packit 030a23
Packit 030a23
 .if src_bpp > 0
Packit 030a23
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
Packit 030a23
        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
Packit 030a23
 .endif
Packit 030a23
 .if mask_bpp > 0
Packit 030a23
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
Packit 030a23
        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
Packit 030a23
 .endif
Packit 030a23
        
Packit 030a23
#ifdef DEBUG_PARAMS
Packit 030a23
        add     Y, Y, #1
Packit 030a23
        stmia   sp, {r0-r7,pc}
Packit 030a23
        sub     Y, Y, #1
Packit 030a23
#endif
Packit 030a23
Packit 030a23
        init
Packit 030a23
Packit 030a23
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
Packit 030a23
        /* Reserve a word in which to store X during leading pixels */
Packit 030a23
        sub     sp, sp, #4
Packit 030a23
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
Packit 030a23
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
Packit 030a23
 .endif
Packit 030a23
        
Packit 030a23
        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
Packit 030a23
        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
Packit 030a23
 .if src_bpp > 0
Packit 030a23
        lsl     STRIDE_S, #src_bpp_shift
Packit 030a23
        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
Packit 030a23
 .endif
Packit 030a23
 .if mask_bpp > 0
Packit 030a23
        lsl     STRIDE_M, #mask_bpp_shift
Packit 030a23
        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
Packit 030a23
 .endif
Packit 030a23
 
Packit 030a23
        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
Packit 030a23
        cmp     X, #2*16*8/dst_w_bpp - 1
Packit 030a23
        blo     170f
Packit 030a23
 .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
Packit 030a23
        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
Packit 030a23
        cmp     X, #(prefetch_distance+3)*pix_per_block - 1
Packit 030a23
        blo     160f
Packit 030a23
Packit 030a23
        /* Wide case */
Packit 030a23
        /* Adjust X so that the decrement instruction can also test for
Packit 030a23
         * inner loop termination. We want it to stop when there are
Packit 030a23
         * (prefetch_distance+1) complete blocks to go. */
Packit 030a23
        sub     X, X, #(prefetch_distance+2)*pix_per_block
Packit 030a23
        mov     ORIG_W, X
Packit 030a23
  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
Packit 030a23
        /* This is stmdb sp!,{} */
Packit 030a23
        .word   0xE92D0000 | LINE_SAVED_REGS
Packit 030a23
   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
Packit 030a23
   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
Packit 030a23
  .endif
Packit 030a23
151:    /* New line */
Packit 030a23
        newline
Packit 030a23
        preload_leading_step1  src_bpp, WK1, SRC
Packit 030a23
        preload_leading_step1  mask_bpp, WK2, MASK
Packit 030a23
  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
Packit 030a23
        preload_leading_step1  dst_r_bpp, WK3, DST
Packit 030a23
  .endif
Packit 030a23
        
Packit 030a23
        ands    WK0, DST, #15
Packit 030a23
        beq     154f
Packit 030a23
        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */
Packit 030a23
Packit 030a23
        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
Packit 030a23
        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
Packit 030a23
  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
Packit 030a23
        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST
Packit 030a23
  .endif
Packit 030a23
Packit 030a23
        leading_15bytes  process_head, process_tail
Packit 030a23
        
Packit 030a23
154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
Packit 030a23
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
Packit 030a23
        and     SCRATCH, SRC, #31
Packit 030a23
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
Packit 030a23
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
Packit 030a23
        and     SCRATCH, MASK, #31
Packit 030a23
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
Packit 030a23
  .endif
Packit 030a23
  .ifc "process_inner_loop",""
Packit 030a23
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
Packit 030a23
  .else
Packit 030a23
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
Packit 030a23
  .endif
Packit 030a23
Packit 030a23
157:    /* Check for another line */
Packit 030a23
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
Packit 030a23
  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
Packit 030a23
   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
Packit 030a23
   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
Packit 030a23
  .endif
Packit 030a23
 .endif
Packit 030a23
Packit 030a23
 .ltorg
Packit 030a23
Packit 030a23
160:    /* Medium case */
Packit 030a23
        mov     ORIG_W, X
Packit 030a23
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
Packit 030a23
        /* This is stmdb sp!,{} */
Packit 030a23
        .word   0xE92D0000 | LINE_SAVED_REGS
Packit 030a23
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
Packit 030a23
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
Packit 030a23
 .endif
Packit 030a23
161:    /* New line */
Packit 030a23
        newline
Packit 030a23
        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
Packit 030a23
        preload_line 0, mask_bpp, mask_bpp_shift, MASK
Packit 030a23
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
Packit 030a23
        preload_line 0, dst_r_bpp, dst_bpp_shift, DST
Packit 030a23
 .endif
Packit 030a23
        
Packit 030a23
        sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */
Packit 030a23
        ands    WK0, DST, #15
Packit 030a23
        beq     164f
Packit 030a23
        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */
Packit 030a23
        
Packit 030a23
        leading_15bytes  process_head, process_tail
Packit 030a23
        
Packit 030a23
164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
Packit 030a23
        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
Packit 030a23
        
Packit 030a23
167:    /* Check for another line */
Packit 030a23
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
Packit 030a23
Packit 030a23
 .ltorg
Packit 030a23
Packit 030a23
170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
Packit 030a23
 .if dst_w_bpp < 32
Packit 030a23
        mov     ORIG_W, X
Packit 030a23
 .endif
Packit 030a23
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
Packit 030a23
        /* This is stmdb sp!,{} */
Packit 030a23
        .word   0xE92D0000 | LINE_SAVED_REGS
Packit 030a23
 .endif
Packit 030a23
171:    /* New line */
Packit 030a23
        newline
Packit 030a23
        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
Packit 030a23
        preload_line 1, mask_bpp, mask_bpp_shift, MASK
Packit 030a23
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
Packit 030a23
        preload_line 1, dst_r_bpp, dst_bpp_shift, DST
Packit 030a23
 .endif
Packit 030a23
        
Packit 030a23
 .if dst_w_bpp == 8
Packit 030a23
        tst     DST, #3
Packit 030a23
        beq     174f
Packit 030a23
172:    subs    X, X, #1
Packit 030a23
        blo     177f
Packit 030a23
        process_head  , 1, 0, 1, 1, 0
Packit 030a23
        process_tail  , 1, 0
Packit 030a23
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
Packit 030a23
        pixst   , 1, 0, DST
Packit 030a23
  .endif
Packit 030a23
        tst     DST, #3
Packit 030a23
        bne     172b
Packit 030a23
 .elseif dst_w_bpp == 16
Packit 030a23
        tst     DST, #2
Packit 030a23
        beq     174f
Packit 030a23
        subs    X, X, #1
Packit 030a23
        blo     177f
Packit 030a23
        process_head  , 2, 0, 1, 1, 0
Packit 030a23
        process_tail  , 2, 0
Packit 030a23
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
Packit 030a23
        pixst   , 2, 0, DST
Packit 030a23
  .endif
Packit 030a23
 .endif
Packit 030a23
Packit 030a23
174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
Packit 030a23
        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
Packit 030a23
Packit 030a23
177:    /* Check for another line */
Packit 030a23
        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
Packit 030a23
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
Packit 030a23
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
Packit 030a23
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
Packit 030a23
 .endif
Packit 030a23
Packit 030a23
197:
Packit 030a23
 .if (flags) & FLAG_SPILL_LINE_VARS
Packit 030a23
        add     sp, sp, #LINE_SAVED_REG_COUNT*4
Packit 030a23
 .endif
Packit 030a23
198:
Packit 030a23
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
Packit 030a23
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4
Packit 030a23
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4
Packit 030a23
        add     sp, sp, #4
Packit 030a23
 .endif
Packit 030a23
Packit 030a23
        cleanup
Packit 030a23
Packit 030a23
#ifdef DEBUG_PARAMS
Packit 030a23
        add     sp, sp, #9*4 /* junk the debug copy of arguments */
Packit 030a23
#endif
Packit 030a23
199:
Packit 030a23
        pop     {r4-r11, pc}  /* exit */
Packit 030a23
Packit 030a23
 .ltorg
Packit 030a23
Packit 030a23
    .unreq  X
Packit 030a23
    .unreq  Y
Packit 030a23
    .unreq  DST
Packit 030a23
    .unreq  STRIDE_D
Packit 030a23
    .unreq  SRC
Packit 030a23
    .unreq  STRIDE_S
Packit 030a23
    .unreq  MASK
Packit 030a23
    .unreq  STRIDE_M
Packit 030a23
    .unreq  WK0
Packit 030a23
    .unreq  WK1
Packit 030a23
    .unreq  WK2
Packit 030a23
    .unreq  WK3
Packit 030a23
    .unreq  SCRATCH
Packit 030a23
    .unreq  ORIG_W
Packit 030a23
    .endfunc
Packit 030a23
.endm
Packit 030a23
Packit 030a23
/*
 * line_saved_regs - declare which registers are spilled around each line
 *
 * Argument: a comma-separated list of symbolic register names.
 *
 * Results (assembler symbols, both reset to 0 on every invocation):
 *   LINE_SAVED_REGS      - register-list bitmask; bit N is set when the
 *                          name that aliases rN appears in the list. It is
 *                          ORed into the hand-encoded "stmdb sp!,{}" word
 *                          (0xE92D0000) elsewhere in this file.
 *   LINE_SAVED_REG_COUNT - incremented once for every recognised name in
 *                          the list (a name given twice is counted twice);
 *                          multiplied by 4 elsewhere to adjust stack
 *                          offsets.
 *
 * Recognised names and their bits follow the .req assignments made by the
 * function-generating macro: Y = r1, STRIDE_D = r3, STRIDE_S = r5,
 * STRIDE_M = r7, ORIG_W = r14. Any other name is silently ignored. The
 * match is a textual .ifc comparison, so spelling and case must be exact.
 *
 * The macro argument and the .irp loop variable are referenced with a
 * leading backslash. Without it the documented behaviour is that .irp
 * iterates over the literal text "x" and .ifc compares the literal text
 * "SAVED_REG", so no register would ever be recorded.
 */
.macro line_saved_regs  x:vararg
 .set LINE_SAVED_REGS, 0
 .set LINE_SAVED_REG_COUNT, 0
 .irp SAVED_REG,\x
  .ifc "\SAVED_REG","Y"                 /* r1 */
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "\SAVED_REG","STRIDE_D"          /* r3 */
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "\SAVED_REG","STRIDE_S"          /* r5 */
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "\SAVED_REG","STRIDE_M"          /* r7 */
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "\SAVED_REG","ORIG_W"            /* r14 */
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
 .endr
.endm
Packit 030a23
Packit 030a23
/*
 * nop_macro - placeholder macro that accepts any arguments and expands to
 * nothing.
 *
 * The body is deliberately empty: no instructions, data or symbol
 * assignments are produced, and the arguments are discarded. A stray
 * non-assembly text line that had ended up between .macro and .endm was
 * removed, since it would have been emitted as a bogus statement on every
 * expansion.
 */
.macro nop_macro x:vararg
.endm