- /*
- * Copyright (C) 2015 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- #define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
- #define END(f) .size f, .-f;
- /* Fixed-point precision after vertical pass -- 16-bit data minus 1 sign and 1
- * integer (bicubic has a little overshoot). It would also be possible to add
- * a temporary DC bias to eliminate the sign bit for more precision, but that's
- * extra arithmetic.
- */
- .set VERTBITS, 14
- /* The size of the scratch buffer in which we store our vertically convolved
- * intermediates.
- */
- .set CHUNKSHIFT, 7 /* 5 tests better for uchar4, but 7 is necessary for ridiculous (10:1) scale factors */
- .set CHUNKSIZE, (1 << CHUNKSHIFT)
- /* The number of components processed in a single iteration of the innermost
- * loop.
- */
- .set VECSHIFT, 3
- .set VECSIZE, (1 << VECSHIFT)
- /* Read four different lines (except at edges where addresses may be clamped,
- * which is why we don't simply take base and stride registers), and multiply
- * and accumulate them by the coefficients in v3[0..3], leaving the results in
- * v12. This gives eight 16-bit results representing a horizontal line of 2-8
- * input pixels (depending on number of components per pixel) to be fed into
- * the horizontal scaling pass.
- *
- * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
- * known to represent negative values, and a multiply-and-subtract (umlsl) is
- * used to implement this). Output is VERTBITS signed fixed-point, which must
- * leave room for a little overshoot, since bicubic filtering can exceed the
- * input range.
- */
- .macro vert8, dstlo=v12.4h, dsthi=v12.8h
- ld1 {v8.8b}, [x4], #8
- ld1 {v9.8b}, [x5], #8
- ld1 {v10.8b}, [x6], #8
- ld1 {v11.8b}, [x7], #8
- uxtl v8.8h, v8.8b
- uxtl v9.8h, v9.8b
- uxtl v10.8h, v10.8b
- uxtl v11.8h, v11.8b
- umull v12.4s, v9.4h, v3.h[1]
- umull2 v13.4s, v9.8h, v3.h[1]
- umlsl v12.4s, v8.4h, v3.h[0]
- umlsl2 v13.4s, v8.8h, v3.h[0]
- umlal v12.4s, v10.4h, v3.h[2]
- umlal2 v13.4s, v10.8h, v3.h[2]
- umlsl v12.4s, v11.4h, v3.h[3]
- umlsl2 v13.4s, v11.8h, v3.h[3]
- /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
- * minus VERTBITS (the number of fraction bits we want to keep from
- * here on).
- */
- sqshrn \dstlo, v12.4s, #8 + (16 - VERTBITS)
- sqshrn2 \dsthi, v13.4s, #8 + (16 - VERTBITS)
- .endm
- /* As above, but producing only four 16-bit results, written into the
- * high half of v12 by default (or into another destination via dst).
- */
- .macro vert4, dst=v12.8h
- ld1 {v8.s}[0], [x4], #4
- ld1 {v9.s}[0], [x5], #4
- ld1 {v10.s}[0], [x6], #4
- ld1 {v11.s}[0], [x7], #4
- uxtl v8.8h, v8.8b
- uxtl v9.8h, v9.8b
- uxtl v10.8h, v10.8b
- uxtl v11.8h, v11.8b
- umull v12.4s, v9.4h, v3.h[1]
- umlsl v12.4s, v8.4h, v3.h[0]
- umlal v12.4s, v10.4h, v3.h[2]
- umlsl v12.4s, v11.4h, v3.h[3]
- .ifc \dst,v12.8h
- sqshrn2 \dst, v12.4s, #8 + (16 - VERTBITS)
- .else
- sqshrn \dst, v12.4s, #8 + (16 - VERTBITS)
- .endif
- .endm
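- /* For reference, a rough C model of what one element of the vert8/vert4
- * macros above computes, given the four vertically clamped source rows in
- * x4-x7 and the narrowed 16-bit y coefficients in v3.h[0..3]. The helper
- * name and types are illustrative only; this is a sketch, not build code.
- *
- *   static inline int16_t vert_sample(const uint8_t *srcn, const uint8_t *src0,
- *                                     const uint8_t *src1, const uint8_t *src2,
- *                                     const uint16_t yr[4], int i)
- *   {
- *       // outer taps are negative, hence the umlsl forms above
- *       int32_t acc = -(int32_t)srcn[i] * yr[0]
- *                   +  (int32_t)src0[i] * yr[1]
- *                   +  (int32_t)src1[i] * yr[2]
- *                   -  (int32_t)src2[i] * yr[3];
- *       // drop 8 pixel bits plus 16 coefficient bits, keep VERTBITS of
- *       // fraction, saturating to 16 bits (the sqshrn step)
- *       acc >>= 8 + 16 - VERTBITS;
- *       if (acc >  32767) acc =  32767;
- *       if (acc < -32768) acc = -32768;
- *       return (int16_t)acc;
- *   }
- */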
- /* During horizontal resize having CHUNKSIZE input available means being able
- * to produce a varying amount of output, depending on the phase of the data.
- * This function calculates the minimum number of VECSIZE chunks extracted from
- * a CHUNKSIZE window (x1), and the threshold value for when the count will be
- * one higher than that (x0).
- * These work out, conveniently, to be the quotient and remainder from:
- * (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE)
- *
- * The two values are packed together in a uint64_t for convenience; and
- * they are, in fact, used this way as an arithmetic short-cut later on.
- */
- /* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc) */
- ENTRY(rsdIntrinsicResize_oscctl_K)
- lsl x2, x0, #VECSHIFT
- mov x0, #(CHUNKSIZE << 16) - 1
- add x0, x0, x2
- udiv x1, x0, x2
- msub x0, x1, x2, x0
- add x0, x0, x1, LSL #32
- ret
- END(rsdIntrinsicResize_oscctl_K)
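- /* For reference, an equivalent C sketch of the function above (illustrative
- * only; CHUNKSIZE is promoted to 16.16 fixed point to match xinc):
- *
- *   uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc)
- *   {
- *       uint64_t step = (uint64_t)xinc << VECSHIFT;          // xinc * VECSIZE
- *       uint64_t x = ((uint64_t)CHUNKSIZE << 16) - 1 + step;
- *       uint64_t quot = x / step;                            // min vectors per chunk
- *       uint64_t rem  = x % step;                            // threshold for one more
- *       return (quot << 32) + rem;                           // packed as described
- *   }
- */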
- /* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
- * For the most part the vertical pass (the outer loop) is the same for all
- * versions. Exceptions are handled in-line with conditional assembly.
- */
- .irp comp, 1, 2, 4
- .if \comp == 1
- .set COMPONENT_SHIFT, 0
- .elseif \comp == 2
- .set COMPONENT_SHIFT, 1
- .elseif \comp == 4
- .set COMPONENT_SHIFT, 2
- .else
- .error "Unknown component count"
- .endif
- .set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
- .set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)
- .set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2
- /* void rsdIntrinsicResizeB1_K(
- * uint8_t * restrict dst, // x0
- * size_t count, // x1
- * uint32_t xf, // x2
- * uint32_t xinc, // x3
- * uint8_t const * restrict srcn, // x4
- * uint8_t const * restrict src0, // x5
- * uint8_t const * restrict src1, // x6
- * uint8_t const * restrict src2, // x7
- * size_t xclip, // [sp,#0] -> [sp,#80] -> x13
- * size_t avail, // [sp,#8] -> [sp,#88] -> x11
- * uint64_t osc_ctl, // [sp,#16] -> [sp,#96] -> x10
- * int32_t const *yr, // [sp,#24] -> [sp,#104] -> v4 (copied to v3 for scalar access)
- */
- ENTRY(rsdIntrinsicResizeB\comp\()_K)
- sub x8, sp, #48
- sub sp, sp, #80
- st1 {v8.1d - v11.1d}, [sp]
- st1 {v12.1d - v15.1d}, [x8]
- str x19, [x8, #32]
- /* align the working buffer on the stack to make it easy to use bit
- * twiddling for address calculations.
- */
- sub x12, sp, #BUFFER_SIZE
- bic x12, x12, #(1 << (CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1)) - 1
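- /* In C terms (illustrative only), the two instructions above carve an
- * aligned scratch buffer out of the stack:
- *
- *   size_t align = (size_t)1 << (CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1);
- *   uint8_t *buf = (uint8_t *)(((uintptr_t)sp - BUFFER_SIZE) & ~(align - 1));
- *
- * so that the wraparound address calculations below reduce to bit masking.
- */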
- ldr x8, [sp,#104] // yr
- adrp x9, intrinsic_resize_consts
- add x9, x9, :lo12:intrinsic_resize_consts
- ld1 {v4.4s}, [x8]
- ld1 {v5.8h}, [x9]
- sqxtun v4.4h, v4.4s // yr
- dup v6.8h, w2
- dup v7.8h, w3
- mla v6.8h, v5.8h, v7.8h // vxf
- shl v7.8h, v7.8h, #VECSHIFT // vxinc
- /* Compute starting condition for oscillator used to compute ahead
- * of time how many iterations are possible before needing to
- * refill the working buffer. This is based on the fixed-point
- * index of the last element in the vector of pixels processed in
- * each iteration, counting up until it would overflow.
- */
- sub x8, x2, x3
- lsl x9, x3, #VECSHIFT
- add x8, x8, x9
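- /* That is, x8 = xf + xinc * (VECSIZE - 1): the 16.16 position of the
- * last element of the first vector of output pixels.
- */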
- ldr x10, [sp,#96] // osc_ctl
- ldp x13,x11, [sp,#80] // xclip, avail
- mov x19, sp
- mov sp, x12
- /* x4-x7 contain pointers to the four lines of input to be
- * convolved. These pointers have been clamped vertically and
- * horizontally (which is why it's not a simple row/stride pair),
- * and the xclip argument (now in x13) indicates how many pixels
- * the x position of each pointer has been clamped away from its
- * true position. This value can only be 0, 1, or 2.
- *
- * Start by placing four pixels worth of input at the far end of
- * the buffer. As many as two of these may be clipped, so four
- * pixels are fetched, and then the first pixel is duplicated and
- * the data shifted according to xclip. The source pointers are
- * then also adjusted according to xclip so that subsequent fetches
- * match.
- */
- mov v3.8b, v4.8b /* make y coeffs available for vert4 and vert8 macros */
- sub x14, x12, x13, LSL #(COMPONENT_SHIFT + 1)
- add x15, x12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
- add x14, x14, #4 * COMPONENT_COUNT * 2
- .if \comp == 1
- vert4 v12.4h
- dup v11.4h, v12.h[0]
- st1 {v11.4h,v12.4h}, [x12]
- ld1 {v12.4h}, [x14]
- st1 {v12.4h}, [x15]
- .elseif \comp == 2
- vert8
- dup v11.4s, v12.s[0]
- st1 {v11.8h,v12.8h}, [x12]
- ld1 {v12.8h}, [x14]
- st1 {v12.8h}, [x15]
- .elseif \comp == 4
- vert8 v14.4h, v14.8h
- vert8 v15.4h, v15.8h
- dup v12.2d, v14.d[0]
- dup v13.2d, v14.d[0]
- st1 {v12.8h,v13.8h}, [x12], #32
- st1 {v14.8h,v15.8h}, [x12]
- sub x12, x12, #32
- ld1 {v11.8h,v12.8h}, [x14]
- st1 {v11.8h,v12.8h}, [x15]
- .endif
- /* Count off four pixels into the working buffer.
- */
- sub x11, x11, #4
- /* Incoming pointers were to the first _legal_ pixel. Four pixels
- * were read unconditionally, but some may have been discarded by
- * xclip, so we rewind the pointers to compensate.
- */
- sub x4, x4, x13, LSL #(COMPONENT_SHIFT)
- sub x5, x5, x13, LSL #(COMPONENT_SHIFT)
- sub x6, x6, x13, LSL #(COMPONENT_SHIFT)
- sub x7, x7, x13, LSL #(COMPONENT_SHIFT)
- /* First tap starts where we just pre-filled, at the end of the
- * buffer.
- */
- add x2, x2, #(CHUNKSIZE * 2 - 4) << 16
- /* Use overflowing arithmetic to implement wraparound array
- * indexing.
- */
- lsl x2, x2, #(47 - CHUNKSHIFT)
- lsl x3, x3, #(47 - CHUNKSHIFT)
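- /* An illustrative C model of this wraparound trick: the 16.16 x position
- * and increment are shifted so that the scratch-buffer index (modulo
- * 2 * CHUNKSIZE) occupies the top bits of a 64-bit register, and ordinary
- * unsigned overflow then performs the modulo for free:
- *
- *   uint64_t xpos  = (uint64_t)biased_xf << (47 - CHUNKSHIFT);
- *   uint64_t xstep = (uint64_t)xinc      << (47 - CHUNKSHIFT);
- *   // per output pixel:
- *   size_t idx = (size_t)(xpos >> (63 - CHUNKSHIFT));  // 0 .. 2*CHUNKSIZE - 1
- *   xpos += xstep;                                      // wraps at 2^64
- *
- * (biased_xf being xf plus the offset added just above; names illustrative.)
- */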
- /* Start of outermost loop.
- * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
- * number of iterations of the inner loop that can be performed and
- * get into that.
- *
- * The fill is complicated by the possibility of running out of
- * input before the scratch buffer is filled. If this isn't a risk
- * then it's handled by the simple loop at 2:, otherwise the
- * horrible loop at 3:.
- */
- 1: mov v3.8b, v4.8b /* put y scaling coefficients somewhere handy */
- subs x11, x11, #CHUNKSIZE
- bge 2f /* if at least CHUNKSIZE are available... */
- add x11, x11, #CHUNKSIZE /* if they're not... */
- b 4f
- /* basic fill loop, processing 8 bytes at a time until there are
- * fewer than eight bytes available.
- */
- 3: vert8
- sub x11, x11, #8 / COMPONENT_COUNT
- st1 {v12.8h}, [x12], #16
- 4: cmp x11, #8 / COMPONENT_COUNT - 1
- bgt 3b
- .if \comp == 4
- blt 3f
- /* The last pixel (four bytes) if necessary */
- vert4
- .else
- cmp x11, #1
- blt 3f
- /* The last pixels if necessary */
- sub x4, x4, #8
- sub x5, x5, #8
- sub x6, x6, #8
- sub x7, x7, #8
- add x4, x4, x11, LSL #(COMPONENT_SHIFT)
- add x5, x5, x11, LSL #(COMPONENT_SHIFT)
- add x6, x6, x11, LSL #(COMPONENT_SHIFT)
- add x7, x7, x11, LSL #(COMPONENT_SHIFT)
- vert8
- sub x11, sp, x11, LSL #(COMPONENT_SHIFT + 1)
- sub sp, sp, #32
- sub x11, x11, #16
- .if \comp == 1
- dup v13.8h, v12.h[7]
- .elseif \comp == 2
- dup v13.4s, v12.s[3]
- .endif
- st1 {v12.8h,v13.8h}, [sp]
- ld1 {v12.8h}, [x11]
- add sp, sp, #32
- b 4f
- .endif
- /* Keep filling until we get to the end of this chunk of the buffer */
- 3:
- .if \comp == 1
- dup v12.8h, v12.h[7]
- .elseif \comp == 2
- dup v12.4s, v12.s[3]
- .elseif \comp == 4
- dup v12.2d, v12.d[1]
- .endif
- 4: st1 {v12.8h}, [x12], #16
- tst x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
- bne 3b
- b 4f
- .align 4
- 2: /* Quickly pull a chunk of data into the working buffer.
- */
- vert8
- st1 {v12.8h}, [x12], #16
- vert8
- st1 {v12.8h}, [x12], #16
- tst x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
- bne 2b
- cmp x11, #0
- bne 3f
- 4: /* if we end with 0 pixels left we'll have nothing handy to spread
- * across to the right, so we rewind a bit.
- */
- mov x11, #1
- sub x4, x4, #COMPONENT_COUNT
- sub x5, x5, #COMPONENT_COUNT
- sub x6, x6, #COMPONENT_COUNT
- sub x7, x7, #COMPONENT_COUNT
- 3: /* copy four taps (width of cubic window) to far end for overflow
- * address handling
- */
- sub x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
- eor x12, x13, #CHUNKSIZE * COMPONENT_COUNT * 2
- .if \comp == 1
- ld1 {v14.4h}, [x13]
- .elseif \comp == 2
- ld1 {v14.8h}, [x13]
- .elseif \comp == 4
- ld1 {v14.8h,v15.8h}, [x13]
- .endif
- add x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
- .if \comp == 1
- st1 {v14.4h}, [x13]
- .elseif \comp == 2
- st1 {v14.8h}, [x13]
- .elseif \comp == 4
- st1 {v14.8h,v15.8h}, [x13]
- .endif
- /* The high 32 bits of x10 contain the maximum possible iteration
- * count, but if x8 is greater than the low 32 bits of x10 then
- * this indicates that the count must be reduced by one for this
- * iteration to avoid reading past the end of the available data.
- */
- sub x13, x10, x8
- lsr x13, x13, #32
- madd x8, x13, x9, x8
- sub x8, x8, #(CHUNKSIZE << 16)
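- /* Illustratively, in C (using the register/argument names directly):
- *
- *   uint64_t count = (osc_ctl - x8) >> 32;       // x13: quot, or quot - 1 via the borrow
- *   x8 += count * ((uint64_t)xinc << VECSHIFT);  // advance the oscillator
- *   x8 -= (uint64_t)CHUNKSIZE << 16;             // consume one chunk of input
- */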
- /* prefer to count pixels, rather than vectors, to clarify the tail
- * store case on exit.
- */
- lsl x13, x13, #VECSHIFT
- cmp x13, x1
- csel x13, x1, x13, gt
- sub x1, x1, x13
- lsl x13, x13, #COMPONENT_SHIFT
- mov w14, #0x8000
- movi v30.8h, #3
- dup v31.8h, w14
- cmp x13, #0
- bgt 3f
- cmp x1, #0
- bgt 1b /* an extreme case where we shouldn't use code in this structure */
- b 9f
- .align 4
- 2: /* Inner loop continues here, but starts at 3:, see end of loop
- * below for explanation. */
- .if LOOP_OUTPUT_SIZE == 4
- st1 {v8.s}[0], [x0], #4
- .elseif LOOP_OUTPUT_SIZE == 8
- st1 {v8.8b}, [x0], #8
- .elseif LOOP_OUTPUT_SIZE == 16
- st1 {v8.16b}, [x0], #16
- .elseif LOOP_OUTPUT_SIZE == 32
- st1 {v8.16b,v9.16b}, [x0], #32
- .endif
- /* Inner loop: here the four x coefficients for each tap are
- * calculated in vector code, and the addresses are calculated in
- * scalar code, and these calculations are interleaved.
- */
- 3: ushr v8.8h, v6.8h, #1 // sxf
- lsr x14, x2, #(63 - CHUNKSHIFT)
- sqrdmulh v9.8h, v8.8h, v8.8h // sxf**2
- add x2, x2, x3
- sqrdmulh v10.8h, v9.8h, v8.8h // sxf**3
- lsr x15, x2, #(63 - CHUNKSHIFT)
- sshll v11.4s, v9.4h, #2
- sshll2 v12.4s, v9.8h, #2
- add x2, x2, x3
- smlsl v11.4s, v10.4h, v30.4h
- smlsl2 v12.4s, v10.8h, v30.8h
- lsr x16, x2, #(63 - CHUNKSHIFT)
- shadd v0.8h, v10.8h, v8.8h
- add x2, x2, x3
- sub v0.8h, v9.8h, v0.8h
- lsr x17, x2, #(63 - CHUNKSHIFT)
- saddw v1.4s, v11.4s, v9.4h
- saddw2 v13.4s, v12.4s, v9.8h
- add x2, x2, x3
- shrn v1.4h, v1.4s, #1
- shrn2 v1.8h, v13.4s, #1
- add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
- sub v1.8h, v1.8h, v31.8h
- add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
- saddw v2.4s, v11.4s, v8.4h
- saddw2 v13.4s, v12.4s, v8.8h
- add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
- shrn v2.4h, v2.4s, #1
- shrn2 v2.8h, v13.4s, #1
- add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
- neg v2.8h, v2.8h
- shsub v3.8h, v10.8h, v9.8h
- /* increment the x fractional parts (overflow is ignored, as the
- * scalar arithmetic shadows this addition with full precision).
- */
- add v6.8h, v6.8h, v7.8h
- /* At this point we have four pointers in x8-x11, pointing to the
- * four taps in the scratch buffer that must be convolved together
- * to produce an output pixel (one output pixel per pointer).
- * These pointers usually overlap, but their spacing is irregular
- * so resolving the redundancy through L1 is a pragmatic solution.
- *
- * The scratch buffer is made of signed 16-bit data, holding over
- * some extra precision, and overshoot, from the vertical pass.
- *
- * We also have the 16-bit unsigned fixed-point weights for each
- * of the four taps in v0 - v3. That's eight pixels worth of
- * coefficients when we have only four pointers, so calculations
- * for four more pixels are interleaved with the fetch and permute
- * code for each variant in the following code.
- *
- * The data arrangement is less than ideal for any pixel format,
- * but permuting loads help to mitigate most of the problems.
- *
- * Note also that the two outside taps of a bicubic are negative,
- * but these coefficients are unsigned. The sign is hard-coded by
- * use of multiply-and-subtract operations.
- */
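- /* For reference (a sketch; the original does not name the filter): in
- * floating-point terms, the four tap weights evaluated by the vector code
- * above, as polynomials in the pixel phase x (held halved in Q15 as sxf),
- * work out to the familiar Catmull-Rom cubic weights:
- *
- *   float w0 = -0.5f*x*x*x +      x*x - 0.5f*x;   // outside tap (negative)
- *   float w1 =  1.5f*x*x*x - 2.5f*x*x + 1.0f;
- *   float w2 = -1.5f*x*x*x + 2.0f*x*x + 0.5f*x;
- *   float w3 =  0.5f*x*x*x - 0.5f*x*x;            // outside tap (negative)
- *
- * with the Q15 scaling and the signs folded into the choice of
- * smull/smlsl/smlal accumulation above.
- */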
- .if \comp == 1
- /* The uchar1 case.
- * Issue one lanewise ld4.h to load the four consecutive s16 taps
- * for one output pixel from its pointer into four different
- * registers; then load the four taps from the next pointer (the
- * next output pixel) into the next lane of those registers, and
- * so on, so that we finish with v12 - v15 holding the four taps,
- * with each lane representing a separate output pixel.
- *
- * The first ld4 uses a splat to avoid any false dependency on
- * the previous state of the register.
- */
- ld4r {v12.8h,v13.8h,v14.8h,v15.8h}, [x14]
- lsr x14, x2, #(63 - CHUNKSHIFT)
- add x2, x2, x3
- ld4 {v12.h,v13.h,v14.h,v15.h}[1], [x15]
- add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
- lsr x15, x2, #(63 - CHUNKSHIFT)
- add x2, x2, x3
- ld4 {v12.h,v13.h,v14.h,v15.h}[2], [x16]
- add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
- lsr x16, x2, #(63 - CHUNKSHIFT)
- add x2, x2, x3
- ld4 {v12.h,v13.h,v14.h,v15.h}[3], [x17]
- add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
- lsr x17, x2, #(63 - CHUNKSHIFT)
- add x2, x2, x3
- ld4 {v12.h,v13.h,v14.h,v15.h}[4], [x14]
- add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
- ld4 {v12.h,v13.h,v14.h,v15.h}[5], [x15]
- ld4 {v12.h,v13.h,v14.h,v15.h}[6], [x16]
- ld4 {v12.h,v13.h,v14.h,v15.h}[7], [x17]
- smull v8.4s, v12.4h, v0.4h
- smull2 v9.4s, v12.8h, v0.8h
- smlsl v8.4s, v13.4h, v1.4h
- smlsl2 v9.4s, v13.8h, v1.8h
- smlsl v8.4s, v14.4h, v2.4h
- smlsl2 v9.4s, v14.8h, v2.8h
- smlal v8.4s, v15.4h, v3.4h
- smlal2 v9.4s, v15.8h, v3.8h
- subs x13, x13, #LOOP_OUTPUT_SIZE
- sqrshrn v8.4h, v8.4s, #15
- sqrshrn2 v8.8h, v9.4s, #15
- sqrshrun v8.8b, v8.8h, #VERTBITS - 8
- .elseif \comp == 2
- /* The uchar2 case:
- * This time load pairs of values into adjacent lanes in v12 - v15
- * by aliasing them as u32 data; this leaves room for only four
- * pixels, so the process has to be done twice. It also means that
- * the coefficient registers no longer align with the data (which
- * covers eight separate pixels), so the coefficients have to be
- * doubled up to match.
- */
- ld4r {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
- lsr x14, x2, #(63 - CHUNKSHIFT)
- add x2, x2, x3
- ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x15]
- add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
- lsr x15, x2, #(63 - CHUNKSHIFT)
- add x2, x2, x3
- ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x16]
- add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
- lsr x16, x2, #(63 - CHUNKSHIFT)
- add x2, x2, x3
- ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x17]
- add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
- lsr x17, x2, #(63 - CHUNKSHIFT)
- add x2, x2, x3
- /* double-up coefficients to align with component pairs */
- zip1 v16.8h, v0.8h, v0.8h
- add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
- zip1 v17.8h, v1.8h, v1.8h
- zip1 v18.8h, v2.8h, v2.8h
- zip1 v19.8h, v3.8h, v3.8h
- smull v8.4s, v12.4h, v16.4h
- smull2 v9.4s, v12.8h, v16.8h
- smlsl v8.4s, v13.4h, v17.4h
- smlsl2 v9.4s, v13.8h, v17.8h
- smlsl v8.4s, v14.4h, v18.4h
- smlsl2 v9.4s, v14.8h, v18.8h
- smlal v8.4s, v15.4h, v19.4h
- smlal2 v9.4s, v15.8h, v19.8h
- sqrshrn v8.4h, v8.4s, #15
- sqrshrn2 v8.8h, v9.4s, #15
- ld4r {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
- ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x15]
- ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x16]
- ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x17]
- /* double-up coefficients to align with component pairs */
- zip2 v16.8h, v0.8h, v0.8h
- zip2 v17.8h, v1.8h, v1.8h
- zip2 v18.8h, v2.8h, v2.8h
- zip2 v19.8h, v3.8h, v3.8h
- smull v10.4s, v12.4h, v16.4h
- smull2 v11.4s, v12.8h, v16.8h
- smlsl v10.4s, v13.4h, v17.4h
- smlsl2 v11.4s, v13.8h, v17.8h
- smlsl v10.4s, v14.4h, v18.4h
- smlsl2 v11.4s, v14.8h, v18.8h
- smlal v10.4s, v15.4h, v19.4h
- smlal2 v11.4s, v15.8h, v19.8h
- subs x13, x13, #LOOP_OUTPUT_SIZE
- sqrshrn v9.4h, v10.4s, #15
- sqrshrn2 v9.8h, v11.4s, #15
- sqrshrun v8.8b, v8.8h, #VERTBITS - 8
- sqrshrun2 v8.16b, v9.8h, #VERTBITS - 8
- .elseif \comp == 4
- /* The uchar4 case.
- * This case is comparatively painless because four s16s are the
- * smallest unit addressable by a multiply-by-element. Rather than
- * permute the data, simply arrange the multiplies to suit the way
- * the data comes in. That's a lot of data, though, so things
- * progress in pairs of pixels at a time.
- */
- ld1 {v12.8h,v13.8h}, [x14]
- lsr x14, x2, #(63 - CHUNKSHIFT)
- add x2, x2, x3
- ld1 {v14.8h,v15.8h}, [x15]
- add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
- lsr x15, x2, #(63 - CHUNKSHIFT)
- add x2, x2, x3
- smull v8.4s, v12.4h, v0.h[0]
- smull v9.4s, v14.4h, v0.h[1]
- smlsl2 v8.4s, v12.8h, v1.h[0]
- smlsl2 v9.4s, v14.8h, v1.h[1]
- smlsl v8.4s, v13.4h, v2.h[0]
- smlsl v9.4s, v15.4h, v2.h[1]
- smlal2 v8.4s, v13.8h, v3.h[0]
- smlal2 v9.4s, v15.8h, v3.h[1]
- /* And two more... */
- ld1 {v12.8h,v13.8h}, [x16]
- add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
- lsr x16, x2, #(63 - CHUNKSHIFT)
- add x2, x2, x3
- ld1 {v14.8h,v15.8h}, [x17]
- add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
- lsr x17, x2, #(63 - CHUNKSHIFT)
- add x2, x2, x3
- sqrshrn v8.4h, v8.4s, #15
- add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
- sqrshrn2 v8.8h, v9.4s, #15
- smull v10.4s, v12.4h, v0.h[2]
- smull v11.4s, v14.4h, v0.h[3]
- smlsl2 v10.4s, v12.8h, v1.h[2]
- smlsl2 v11.4s, v14.8h, v1.h[3]
- smlsl v10.4s, v13.4h, v2.h[2]
- smlsl v11.4s, v15.4h, v2.h[3]
- smlal2 v10.4s, v13.8h, v3.h[2]
- smlal2 v11.4s, v15.8h, v3.h[3]
- sqrshrn v9.4h, v10.4s, #15
- sqrshrn2 v9.8h, v11.4s, #15
- sqrshrun v8.8b, v8.8h, #VERTBITS - 8
- sqrshrun2 v8.16b, v9.8h, #VERTBITS - 8
- /* And two more... */
- ld1 {v12.8h,v13.8h}, [x14]
- ld1 {v14.8h,v15.8h}, [x15]
- smull v10.4s, v12.4h, v0.h[4]
- smull v11.4s, v14.4h, v0.h[5]
- smlsl2 v10.4s, v12.8h, v1.h[4]
- smlsl2 v11.4s, v14.8h, v1.h[5]
- smlsl v10.4s, v13.4h, v2.h[4]
- smlsl v11.4s, v15.4h, v2.h[5]
- smlal2 v10.4s, v13.8h, v3.h[4]
- smlal2 v11.4s, v15.8h, v3.h[5]
- /* And two more... */
- ld1 {v12.8h,v13.8h}, [x16]
- ld1 {v14.8h,v15.8h}, [x17]
- subs x13, x13, #LOOP_OUTPUT_SIZE
- sqrshrn v9.4h, v10.4s, #15
- sqrshrn2 v9.8h, v11.4s, #15
- smull v10.4s, v12.4h, v0.h[6]
- smull v11.4s, v14.4h, v0.h[7]
- smlsl2 v10.4s, v12.8h, v1.h[6]
- smlsl2 v11.4s, v14.8h, v1.h[7]
- smlsl v10.4s, v13.4h, v2.h[6]
- smlsl v11.4s, v15.4h, v2.h[7]
- smlal2 v10.4s, v13.8h, v3.h[6]
- smlal2 v11.4s, v15.8h, v3.h[7]
- sqrshrn v10.4h, v10.4s, #15
- sqrshrn2 v10.8h, v11.4s, #15
- sqrshrun v9.8b, v9.8h, #VERTBITS - 8
- sqrshrun2 v9.16b, v10.8h, #VERTBITS - 8
- .endif
- bgt 2b /* continue inner loop */
- /* The inner loop has already been limited to ensure that none of
- * the earlier iterations could overfill the output, so the store
- * appears within the loop but after the conditional branch (at the
- * top). At the end, provided it won't overfill, perform the final
- * store here. If it would, then break out to the tricky tail case
- * instead.
- */
- blt 1f
- /* Store the amount of data appropriate to the configuration of the
- * instance being assembled.
- */
- .if LOOP_OUTPUT_SIZE == 4
- st1 {v8.s}[0], [x0], #4
- .elseif LOOP_OUTPUT_SIZE == 8
- st1 {v8.8b}, [x0], #8
- .elseif LOOP_OUTPUT_SIZE == 16
- st1 {v8.16b}, [x0], #16
- .elseif LOOP_OUTPUT_SIZE == 32
- st1 {v8.16b,v9.16b}, [x0], #32
- .endif
- b 1b /* resume outer loop */
- /* Partial tail store case:
- * Different versions of the code need different subsets of the
- * following partial stores. Here the number of components and the
- * size of the chunk of data produced by each inner loop iteration
- * are tested to figure out whether or not each phrase is relevant.
- */
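- /* Illustratively, in C, the chain of conditional stores below behaves like
- * the following (dst, remaining, and pixels[] standing in for x0, x13, and
- * the byte lanes of v8:v9; memcpy used for the stores):
- *
- *   size_t off = 0;
- *   if (remaining & 16) { memcpy(dst, pixels + off, 16); dst += 16; off += 16; }
- *   if (remaining &  8) { memcpy(dst, pixels + off,  8); dst +=  8; off +=  8; }
- *   if (remaining &  4) { memcpy(dst, pixels + off,  4); dst +=  4; off +=  4; }
- *   if (remaining &  2) { memcpy(dst, pixels + off,  2); dst +=  2; off +=  2; }
- *   if (remaining &  1) { memcpy(dst, pixels + off,  1); dst +=  1;            }
- *
- * with each variant assembling only the tests that its LOOP_OUTPUT_SIZE and
- * COMPONENT_COUNT can actually produce.
- */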
- .if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
- 1: tst x13, #16
- beq 1f
- st1 {v8.16b}, [x0], #16
- mov v8.16b, v9.16b
- .endif
- .if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
- 1: tst x13, #8
- beq 1f
- st1 {v8.8b}, [x0], #8
- ext v8.16b, v8.16b, v8.16b, #8
- .endif
- .if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
- 1: tst x13, #4
- beq 1f
- st1 {v8.s}[0], [x0], #4
- ext v8.8b, v8.8b, v8.8b, #4
- .endif
- .if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
- 1: tst x13, #2
- beq 1f
- st1 {v8.h}[0], [x0], #2
- ext v8.8b, v8.8b, v8.8b, #2
- .endif
- .if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
- 1: tst x13, #1
- beq 1f
- st1 {v8.b}[0], [x0], #1
- .endif
- 1:
- 9: mov sp, x19
- ld1 {v8.1d - v11.1d}, [sp], #32
- ld1 {v12.1d - v15.1d}, [sp], #32
- ldr x19, [sp], #16
- ret
- END(rsdIntrinsicResizeB\comp\()_K)
- .endr
- .rodata
- intrinsic_resize_consts: .hword 0, 1, 2, 3, 4, 5, 6, 7