/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;
/* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1
 * integer (bicubic has a little overshoot). It would also be possible to add
 * a temporary DC bias to eliminate the sign bit for more precision, but that's
 * extra arithmetic.
 */
.set VERTBITS, 14
/* The size of the scratch buffer in which we store our vertically convolved
 * intermediates.
 */
.set CHUNKSHIFT, 7 /* 5 tests better for uchar4, but 7 is necessary for ridiculous (10:1) scale factors */
.set CHUNKSIZE, (1 << CHUNKSHIFT)
/* The number of components processed in a single iteration of the innermost
 * loop.
 */
.set VECSHIFT, 3
.set VECSIZE, (1<<VECSHIFT)
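/* For reference, a sketch of the intermediate format implied by VERTBITS
 * (illustrative only): vertically convolved samples are stored as signed
 * Q1.14, i.e.
 *
 *     float value = (float)stored / (1 << VERTBITS);
 *
 * which keeps one integer bit of headroom for the bicubic overshoot noted
 * above.
 */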
/* Read four different lines (except at edges where addresses may be clamped,
 * which is why we don't simply take base and stride registers), and multiply
 * and accumulate them by the coefficients in v3[0..3], leaving the results in
 * v12. This gives eight 16-bit results representing a horizontal line of 2-8
 * input pixels (depending on number of components per pixel) to be fed into
 * the horizontal scaling pass.
 *
 * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
 * known to represent negative values and VMLS is used to implement this).
 * Output is VERTBITS signed fixed-point, which must leave room for a little
 * bit of overshoot beyond the nominal [0,1) range.
 */
.macro vert8, dstlo=v12.4h, dsthi=v12.8h
ld1 {v8.8b}, [x4], #8
ld1 {v9.8b}, [x5], #8
ld1 {v10.8b}, [x6], #8
ld1 {v11.8b}, [x7], #8
uxtl v8.8h, v8.8b
uxtl v9.8h, v9.8b
uxtl v10.8h, v10.8b
uxtl v11.8h, v11.8b
umull v12.4s, v9.4h, v3.h[1]
umull2 v13.4s, v9.8h, v3.h[1]
umlsl v12.4s, v8.4h, v3.h[0]
umlsl2 v13.4s, v8.8h, v3.h[0]
umlal v12.4s, v10.4h, v3.h[2]
umlal2 v13.4s, v10.8h, v3.h[2]
umlsl v12.4s, v11.4h, v3.h[3]
umlsl2 v13.4s, v11.8h, v3.h[3]
/* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
 * minus VERTBITS (the number of fraction bits we want to keep from
 * here on).
 */
sqshrn \dstlo, v12.4s, #8 + (16 - VERTBITS)
sqshrn2 \dsthi, v13.4s, #8 + (16 - VERTBITS)
.endm
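/* A scalar sketch of what vert8 computes for one output component, assuming
 * p0..p3 are four vertically adjacent input bytes and c0..c3 are the 16-bit
 * coefficients in v3 (c0 and c3 applied negatively, matching the umlsl
 * instructions above):
 *
 *     int32_t acc = -(int32_t)c0 * p0 + (int32_t)c1 * p1
 *                 + (int32_t)c2 * p2 - (int32_t)c3 * p3;
 *     int16_t out = (int16_t)(acc >> (8 + 16 - VERTBITS));  // == >> 10; saturated in the asm
 *
 * i.e. 8-bit data times 16-bit coefficients, narrowed back to VERTBITS
 * fraction bits.
 */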
/* As above, but only four 16-bit results into v12hi.
 */
.macro vert4, dst=v12.8h
ld1 {v8.s}[0], [x4], #4
ld1 {v9.s}[0], [x5], #4
ld1 {v10.s}[0], [x6], #4
ld1 {v11.s}[0], [x7], #4
uxtl v8.8h, v8.8b
uxtl v9.8h, v9.8b
uxtl v10.8h, v10.8b
uxtl v11.8h, v11.8b
umull v12.4s, v9.4h, v3.h[1]
umlsl v12.4s, v8.4h, v3.h[0]
umlal v12.4s, v10.4h, v3.h[2]
umlsl v12.4s, v11.4h, v3.h[3]
.ifc \dst,v12.8h
sqshrn2 \dst, v12.4s, #8 + (16 - VERTBITS)
.else
sqshrn \dst, v12.4s, #8 + (16 - VERTBITS)
.endif
.endm
/* During horizontal resize having CHUNKSIZE input available means being able
 * to produce a varying amount of output, depending on the phase of the data.
 * This function calculates the minimum number of VECSIZE chunks extracted from
 * a CHUNKSIZE window (x1), and the threshold value for when the count will be
 * one higher than that (x0).
 * These work out, conveniently, to be the quotient and remainder from:
 * (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE)
 *
 * The two values are packed together in a uint64_t for convenience; and
 * they are, in fact, used this way as an arithmetic short-cut later on.
 */
/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc) */
ENTRY(rsdIntrinsicResize_oscctl_K)
lsl x2, x0, #VECSHIFT
mov x0, #(CHUNKSIZE << 16) - 1
add x0, x0, x2
udiv x1, x0, x2
msub x0, x1, x2, x0
add x0, x0, x1, LSL #32
ret
END(rsdIntrinsicResize_oscctl_K)
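/* A C sketch of the calculation above (names illustrative, not part of the
 * build), with xinc as a 16.16 fixed-point step:
 *
 *     uint64_t oscctl(uint32_t xinc)
 *     {
 *         uint64_t d = (uint64_t)xinc << VECSHIFT;           // xinc * VECSIZE
 *         uint64_t n = ((uint64_t)CHUNKSIZE << 16) - 1 + d;  // CHUNKSIZE + xinc * VECSIZE - 1
 *         uint64_t q = n / d;                                // minimum VECSIZE chunks per window
 *         uint64_t r = n - q * d;                            // threshold for one extra chunk
 *         return (q << 32) + r;
 *     }
 */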
/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
 * For the most part the vertical pass (the outer loop) is the same for all
 * versions. Exceptions are handled in-line with conditional assembly.
 */
.irp comp, 1, 2, 4
.if \comp == 1
.set COMPONENT_SHIFT, 0
.elseif \comp == 2
.set COMPONENT_SHIFT, 1
.elseif \comp == 4
.set COMPONENT_SHIFT, 2
.else
.error "Unknown component count"
.endif
.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)
.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2
/* void rsdIntrinsicResizeB1_K(
 * uint8_t * restrict dst, // x0
 * size_t count, // x1
 * uint32_t xf, // x2
 * uint32_t xinc, // x3
 * uint8_t const * restrict srcn, // x4
 * uint8_t const * restrict src0, // x5
 * uint8_t const * restrict src1, // x6
 * uint8_t const * restrict src2, // x7
 * size_t xclip, // [sp,#0] -> [sp,#80] -> x13
 * size_t avail, // [sp,#8] -> [sp,#88] -> x11
 * uint64_t osc_ctl, // [sp,#16] -> [sp,#96] -> x10
 * int32 const *yr, // [sp,#24] -> [sp,#104] -> v4 (copied to v3 for scalar access)
 */
ENTRY(rsdIntrinsicResizeB\comp\()_K)
sub x8, sp, #48
sub sp, sp, #80
st1 {v8.1d - v11.1d}, [sp]
st1 {v12.1d - v15.1d}, [x8]
str x19, [x8, #32]
/* align the working buffer on the stack to make it easy to use bit
 * twiddling for address calculations.
 */
sub x12, sp, #BUFFER_SIZE
bic x12, x12, #(1 << (CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1)) - 1
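/* In C terms (illustrative only) the two instructions above amount to
 *
 *     buf = (sp - BUFFER_SIZE) & ~(uintptr_t)(CHUNKSIZE * 2 * COMPONENT_COUNT * 2 - 1);
 *
 * so the two-CHUNKSIZE ring of s16 components starts on a power-of-two
 * boundary and the tst/eor wraparound tricks used later work.
 */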
ldr x8, [sp,#104] // yr
adrp x9, intrinsic_resize_consts
add x9, x9, :lo12:intrinsic_resize_consts
ld1 {v4.4s}, [x8]
ld1 {v5.8h}, [x9]
sqxtun v4.4h, v4.4s // yr
dup v6.8h, w2
dup v7.8h, w3
mla v6.8h, v5.8h, v7.8h // vxf
shl v7.8h, v7.8h, #VECSHIFT // vxinc
/* Compute starting condition for oscillator used to compute ahead
 * of time how many iterations are possible before needing to
 * refill the working buffer. This is based on the fixed-point
 * index of the last element in the vector of pixels processed in
 * each iteration, counting up until it would overflow.
 */
sub x8, x2, x3
lsl x9, x3, #VECSHIFT
add x8, x8, x9
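/* i.e. x8 = xf + xinc * (VECSIZE - 1): the 16.16 x coordinate of the last
 * lane of the first vector to be processed.
 */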
ldr x10, [sp,#96] // osc_ctl
ldp x13,x11, [sp,#80] // xclip, avail
mov x19, sp
mov sp, x12
/* x4-x7 contain pointers to the four lines of input to be
 * convolved. These pointers have been clamped vertically and
 * horizontally (which is why it's not a simple row/stride pair),
 * and the xclip argument (now in x13) indicates how many pixels the
 * x position of the pointers is offset from the true position. This
 * value should be 0, 1, or 2 only.
 *
 * Start by placing four pixels worth of input at the far end of
 * the buffer. As many as two of these may be clipped, so four
 * pixels are fetched, and then the first pixel is duplicated and
 * the data shifted according to xclip. The source pointers are
 * then also adjusted according to xclip so that subsequent fetches
 * match.
 */
mov v3.8b, v4.8b /* make y coeffs available for vert4 and vert8 macros */
sub x14, x12, x13, LSL #(COMPONENT_SHIFT + 1)
add x15, x12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
add x14, x14, #4 * COMPONENT_COUNT * 2
.if \comp == 1
vert4 v12.4h
dup v11.4h, v12.h[0]
st1 {v11.4h,v12.4h}, [x12]
ld1 {v12.4h}, [x14]
st1 {v12.4h}, [x15]
.elseif \comp == 2
vert8
dup v11.4s, v12.s[0]
st1 {v11.8h,v12.8h}, [x12]
ld1 {v12.8h}, [x14]
st1 {v12.8h}, [x15]
.elseif \comp == 4
vert8 v14.4h, v14.8h
vert8 v15.4h, v15.8h
dup v12.2d, v14.d[0]
dup v13.2d, v14.d[0]
st1 {v12.8h,v13.8h}, [x12], #32
st1 {v14.8h,v15.8h}, [x12]
sub x12, x12, #32
ld1 {v11.8h,v12.8h}, [x14]
st1 {v11.8h,v12.8h}, [x15]
.endif
/* Count off four pixels into the working buffer.
 */
sub x11, x11, #4
/* Incoming pointers were to the first _legal_ pixel. Four pixels
 * were read unconditionally, but some may have been discarded by
 * xclip, so we rewind the pointers to compensate.
 */
sub x4, x4, x13, LSL #(COMPONENT_SHIFT)
sub x5, x5, x13, LSL #(COMPONENT_SHIFT)
sub x6, x6, x13, LSL #(COMPONENT_SHIFT)
sub x7, x7, x13, LSL #(COMPONENT_SHIFT)
/* First tap starts where we just pre-filled, at the end of the
 * buffer.
 */
add x2, x2, #(CHUNKSIZE * 2 - 4) << 16
/* Use overflowing arithmetic to implement wraparound array
 * indexing.
 */
lsl x2, x2, #(47 - CHUNKSHIFT)
lsl x3, x3, #(47 - CHUNKSHIFT)
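/* Roughly (a sketch, names illustrative): the 16.16 x position is kept in
 * the top bits of a 64-bit register so that the ring-buffer index is just a
 * shift and the wrap at 2*CHUNKSIZE falls out of 64-bit overflow:
 *
 *     uint64_t osc  = (uint64_t)x_fixed << (47 - CHUNKSHIFT);
 *     uint64_t step = (uint64_t)xinc_fixed << (47 - CHUNKSHIFT);
 *     size_t   idx  = osc >> (63 - CHUNKSHIFT);  // pixel index mod 2*CHUNKSIZE
 *     osc += step;                               // wraps for free
 */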
/* Start of outermost loop.
 * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
 * number of iterations of the inner loop that can be performed and
 * get into that.
 *
 * The fill is complicated by the possibility of running out of
 * input before the scratch buffer is filled. If this isn't a risk
 * then it's handled by the simple loop at 2:, otherwise the
 * horrible loop at 3:.
 */
1: mov v3.8b, v4.8b /* put y scaling coefficients somewhere handy */
subs x11, x11, #CHUNKSIZE
bge 2f /* if at least CHUNKSIZE are available... */
add x11, x11, #CHUNKSIZE /* if they're not... */
b 4f
/* basic fill loop, processing 8 bytes at a time until there are
 * fewer than eight bytes available.
 */
3: vert8
sub x11, x11, #8 / COMPONENT_COUNT
st1 {v12.8h}, [x12], #16
4: cmp x11, #8 / COMPONENT_COUNT - 1
bgt 3b
.if \comp == 4
blt 3f
/* The last pixel (four bytes) if necessary */
vert4
.else
cmp x11, #1
blt 3f
/* The last pixels if necessary */
sub x4, x4, #8
sub x5, x5, #8
sub x6, x6, #8
sub x7, x7, #8
add x4, x4, x11, LSL #(COMPONENT_SHIFT)
add x5, x5, x11, LSL #(COMPONENT_SHIFT)
add x6, x6, x11, LSL #(COMPONENT_SHIFT)
add x7, x7, x11, LSL #(COMPONENT_SHIFT)
vert8
sub x11, sp, x11, LSL #(COMPONENT_SHIFT + 1)
sub sp, sp, #32
sub x11, x11, #16
.if \comp == 1
dup v13.8h, v12.h[7]
.elseif \comp == 2
dup v13.4s, v12.s[3]
.endif
st1 {v12.8h,v13.8h}, [sp]
ld1 {v12.8h}, [x11]
add sp, sp, #32
b 4f
.endif
/* Keep filling until we get to the end of this chunk of the buffer */
3:
.if \comp == 1
dup v12.8h, v12.h[7]
.elseif \comp == 2
dup v12.4s, v12.s[3]
.elseif \comp == 4
dup v12.2d, v12.d[1]
.endif
4: st1 {v12.8h}, [x12], #16
tst x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
bne 3b
b 4f
.align 4
2: /* Quickly pull a chunk of data into the working buffer.
 */
vert8
st1 {v12.8h}, [x12], #16
vert8
st1 {v12.8h}, [x12], #16
tst x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
bne 2b
cmp x11, #0
bne 3f
4: /* if we end with 0 pixels left we'll have nothing handy to spread
 * across to the right, so we rewind a bit.
 */
mov x11, #1
sub x4, x4, #COMPONENT_COUNT
sub x5, x5, #COMPONENT_COUNT
sub x6, x6, #COMPONENT_COUNT
sub x7, x7, #COMPONENT_COUNT
3: /* copy four taps (width of cubic window) to far end for overflow
 * address handling
 */
sub x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
eor x12, x13, #CHUNKSIZE * COMPONENT_COUNT * 2
.if \comp == 1
ld1 {v14.4h}, [x13]
.elseif \comp == 2
ld1 {v14.8h}, [x13]
.elseif \comp == 4
ld1 {v14.8h,v15.8h}, [x13]
.endif
add x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
.if \comp == 1
st1 {v14.4h}, [x13]
.elseif \comp == 2
st1 {v14.8h}, [x13]
.elseif \comp == 4
st1 {v14.8h,v15.8h}, [x13]
.endif
/* The high 32 bits of x10 contain the maximum possible iteration
 * count; if x8 is greater than the low 32 bits of x10 then the
 * count must be reduced by one for this iteration to avoid reading
 * past the end of the available data.
 */
sub x13, x10, x8
lsr x13, x13, #32
madd x8, x13, x9, x8
sub x8, x8, #(CHUNKSIZE << 16)
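/* Sketch of the arithmetic short-cut above: with osc_ctl == (q << 32) + r
 * from rsdIntrinsicResize_oscctl_K,
 *
 *     iters = (osc_ctl - x8) >> 32;  // q, or q-1 when the phase x8 exceeds r
 *     x8   += iters * (xinc << VECSHIFT) - (CHUNKSIZE << 16);
 *
 * i.e. the subtraction borrows out of the high word exactly when one fewer
 * inner-loop iteration fits before the freshly filled chunk runs out.
 */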
/* prefer to count pixels, rather than vectors, to clarify the tail
 * store case on exit.
 */
lsl x13, x13, #VECSHIFT
cmp x13, x1
csel x13, x1, x13, gt
sub x1, x1, x13
lsl x13, x13, #COMPONENT_SHIFT
mov w14, #0x8000
movi v30.8h, #3
dup v31.8h, w14
cmp x13, #0
bgt 3f
cmp x1, #0
bgt 1b /* an extreme case where we shouldn't use code in this structure */
b 9f
.align 4
2: /* Inner loop continues here, but starts at 3:, see end of loop
 * below for explanation. */
.if LOOP_OUTPUT_SIZE == 4
st1 {v8.s}[0], [x0], #4
.elseif LOOP_OUTPUT_SIZE == 8
st1 {v8.8b}, [x0], #8
.elseif LOOP_OUTPUT_SIZE == 16
st1 {v8.16b}, [x0], #16
.elseif LOOP_OUTPUT_SIZE == 32
st1 {v8.16b,v9.16b}, [x0], #32
.endif
/* Inner loop: here the four x coefficients for each tap are
 * calculated in vector code, and the addresses are calculated in
 * scalar code, and these calculations are interleaved.
 */
3: ushr v8.8h, v6.8h, #1 // sxf
lsr x14, x2, #(63 - CHUNKSHIFT)
sqrdmulh v9.8h, v8.8h, v8.8h // sxf**2
add x2, x2, x3
sqrdmulh v10.8h, v9.8h, v8.8h // sxf**3
lsr x15, x2, #(63 - CHUNKSHIFT)
sshll v11.4s, v9.4h, #2
sshll2 v12.4s, v9.8h, #2
add x2, x2, x3
smlsl v11.4s, v10.4h, v30.4h
smlsl2 v12.4s, v10.8h, v30.8h
lsr x16, x2, #(63 - CHUNKSHIFT)
shadd v0.8h, v10.8h, v8.8h
add x2, x2, x3
sub v0.8h, v9.8h, v0.8h
lsr x17, x2, #(63 - CHUNKSHIFT)
saddw v1.4s, v11.4s, v9.4h
saddw2 v13.4s, v12.4s, v9.8h
add x2, x2, x3
shrn v1.4h, v1.4s, #1
shrn2 v1.8h, v13.4s, #1
add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
sub v1.8h, v1.8h, v31.8h
add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
saddw v2.4s, v11.4s, v8.4h
saddw2 v13.4s, v12.4s, v8.8h
add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
shrn v2.4h, v2.4s, #1
shrn2 v2.8h, v13.4s, #1
add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
neg v2.8h, v2.8h
shsub v3.8h, v10.8h, v9.8h
/* increment the x fractional parts (overflow is ignored, as the
 * scalar arithmetic shadows this addition with full precision).
 */
add v6.8h, v6.8h, v7.8h
/* At this point we have four pointers in x14-x17, pointing to the
 * four taps in the scratch buffer that must be convolved together
 * to produce an output pixel (one output pixel per pointer).
 * These pointers usually overlap, but their spacing is irregular
 * so resolving the redundancy through L1 is a pragmatic solution.
 *
 * The scratch buffer is made of signed 16-bit data, holding over
 * some extra precision, and overshoot, from the vertical pass.
 *
 * We also have the 16-bit unsigned fixed-point weights for each
 * of the four taps in v0 - v3. That's eight pixels' worth of
 * coefficients when we have only four pointers, so calculations
 * for four more pixels are interleaved with the fetch and permute
 * code for each variant in the following code.
 *
 * The data arrangement is less than ideal for any pixel format,
 * but permuting loads help to mitigate most of the problems.
 *
 * Note also that the two outside taps of a bicubic are negative,
 * but these coefficients are unsigned. The sign is hard-coded by
 * use of multiply-and-subtract operations.
 */
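/* For reference, the taps computed above work out to the usual Catmull-Rom
 * bicubic weights; a float sketch (not the exact fixed-point evaluation
 * used here), for fractional position x in [0,1):
 *
 *     w0 = 0.5f * (-x*x*x + 2*x*x - x);
 *     w1 = 0.5f * ( 3*x*x*x - 5*x*x + 2);
 *     w2 = 0.5f * (-3*x*x*x + 4*x*x + x);
 *     w3 = 0.5f * ( x*x*x - x*x);
 *
 * The registers hold these up to sign and Q15 scaling, with the signs
 * folded into the choice of multiply-accumulate or multiply-subtract below.
 */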
.if \comp == 1
/* The uchar1 case.
 * Issue one lanewise ld4.h to load four consecutive pixels from
 * one pointer (one pixel) into four different registers; then load
 * four consecutive s16 values from the next pointer (pixel) into
 * the next lane of those four registers, etc., so that we finish
 * with v12 - v15 representing the four taps, and each lane
 * representing a separate pixel.
 *
 * The first ld4 uses a splat to avoid any false dependency on
 * the previous state of the register.
 */
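/* After the eight ld4 steps below, the layout is (sketch):
 *
 *     v12.h[i] = tap0 of pixel i,  v13.h[i] = tap1,
 *     v14.h[i] = tap2,             v15.h[i] = tap3,
 *
 * i.e. an 8x4 transpose performed by the load unit.
 */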
ld4r {v12.8h,v13.8h,v14.8h,v15.8h}, [x14]
lsr x14, x2, #(63 - CHUNKSHIFT)
add x2, x2, x3
ld4 {v12.h,v13.h,v14.h,v15.h}[1], [x15]
add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
lsr x15, x2, #(63 - CHUNKSHIFT)
add x2, x2, x3
ld4 {v12.h,v13.h,v14.h,v15.h}[2], [x16]
add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
lsr x16, x2, #(63 - CHUNKSHIFT)
add x2, x2, x3
ld4 {v12.h,v13.h,v14.h,v15.h}[3], [x17]
add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
lsr x17, x2, #(63 - CHUNKSHIFT)
add x2, x2, x3
ld4 {v12.h,v13.h,v14.h,v15.h}[4], [x14]
add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
ld4 {v12.h,v13.h,v14.h,v15.h}[5], [x15]
ld4 {v12.h,v13.h,v14.h,v15.h}[6], [x16]
ld4 {v12.h,v13.h,v14.h,v15.h}[7], [x17]
smull v8.4s, v12.4h, v0.4h
smull2 v9.4s, v12.8h, v0.8h
smlsl v8.4s, v13.4h, v1.4h
smlsl2 v9.4s, v13.8h, v1.8h
smlsl v8.4s, v14.4h, v2.4h
smlsl2 v9.4s, v14.8h, v2.8h
smlal v8.4s, v15.4h, v3.4h
smlal2 v9.4s, v15.8h, v3.8h
subs x13, x13, #LOOP_OUTPUT_SIZE
sqrshrn v8.4h, v8.4s, #15
sqrshrn2 v8.8h, v9.4s, #15
sqrshrun v8.8b, v8.8h, #VERTBITS - 8
.elseif \comp == 2
/* The uchar2 case:
 * This time load pairs of values into adjacent lanes in v12 - v15
 * by aliasing them as u32 data, leaving room for only four pixels,
 * so the process has to be done twice. This also means that the
 * coefficient registers fail to align with the coefficient data
 * (eight separate pixels), so that has to be doubled-up to match.
 */
ld4r {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
lsr x14, x2, #(63 - CHUNKSHIFT)
add x2, x2, x3
ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x15]
add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
lsr x15, x2, #(63 - CHUNKSHIFT)
add x2, x2, x3
ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x16]
add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
lsr x16, x2, #(63 - CHUNKSHIFT)
add x2, x2, x3
ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x17]
add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
lsr x17, x2, #(63 - CHUNKSHIFT)
add x2, x2, x3
/* double-up coefficients to align with component pairs */
zip1 v16.8h, v0.8h, v0.8h
add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
zip1 v17.8h, v1.8h, v1.8h
zip1 v18.8h, v2.8h, v2.8h
zip1 v19.8h, v3.8h, v3.8h
smull v8.4s, v12.4h, v16.4h
smull2 v9.4s, v12.8h, v16.8h
smlsl v8.4s, v13.4h, v17.4h
smlsl2 v9.4s, v13.8h, v17.8h
smlsl v8.4s, v14.4h, v18.4h
smlsl2 v9.4s, v14.8h, v18.8h
smlal v8.4s, v15.4h, v19.4h
smlal2 v9.4s, v15.8h, v19.8h
sqrshrn v8.4h, v8.4s, #15
sqrshrn2 v8.8h, v9.4s, #15
ld4r {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x15]
ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x16]
ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x17]
/* double-up coefficients to align with component pairs */
zip2 v16.8h, v0.8h, v0.8h
zip2 v17.8h, v1.8h, v1.8h
zip2 v18.8h, v2.8h, v2.8h
zip2 v19.8h, v3.8h, v3.8h
smull v10.4s, v12.4h, v16.4h
smull2 v11.4s, v12.8h, v16.8h
smlsl v10.4s, v13.4h, v17.4h
smlsl2 v11.4s, v13.8h, v17.8h
smlsl v10.4s, v14.4h, v18.4h
smlsl2 v11.4s, v14.8h, v18.8h
smlal v10.4s, v15.4h, v19.4h
smlal2 v11.4s, v15.8h, v19.8h
subs x13, x13, #LOOP_OUTPUT_SIZE
sqrshrn v9.4h, v10.4s, #15
sqrshrn2 v9.8h, v11.4s, #15
sqrshrun v8.8b, v8.8h, #VERTBITS - 8
sqrshrun2 v8.16b, v9.8h, #VERTBITS - 8
.elseif \comp == 4
/* The uchar4 case.
 * This case is comparatively painless because four s16s are the
 * smallest addressable unit for a vmul-by-scalar. Rather than
 * permute the data, simply arrange the multiplies to suit the way
 * the data comes in. That's a lot of data, though, so things
 * progress in pairs of pixels at a time.
 */
ld1 {v12.8h,v13.8h}, [x14]
lsr x14, x2, #(63 - CHUNKSHIFT)
add x2, x2, x3
ld1 {v14.8h,v15.8h}, [x15]
add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
lsr x15, x2, #(63 - CHUNKSHIFT)
add x2, x2, x3
smull v8.4s, v12.4h, v0.h[0]
smull v9.4s, v14.4h, v0.h[1]
smlsl2 v8.4s, v12.8h, v1.h[0]
smlsl2 v9.4s, v14.8h, v1.h[1]
smlsl v8.4s, v13.4h, v2.h[0]
smlsl v9.4s, v15.4h, v2.h[1]
smlal2 v8.4s, v13.8h, v3.h[0]
smlal2 v9.4s, v15.8h, v3.h[1]
/* And two more... */
ld1 {v12.8h,v13.8h}, [x16]
add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
lsr x16, x2, #(63 - CHUNKSHIFT)
add x2, x2, x3
ld1 {v14.8h,v15.8h}, [x17]
add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
lsr x17, x2, #(63 - CHUNKSHIFT)
add x2, x2, x3
sqrshrn v8.4h, v8.4s, #15
add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
sqrshrn2 v8.8h, v9.4s, #15
smull v10.4s, v12.4h, v0.h[2]
smull v11.4s, v14.4h, v0.h[3]
smlsl2 v10.4s, v12.8h, v1.h[2]
smlsl2 v11.4s, v14.8h, v1.h[3]
smlsl v10.4s, v13.4h, v2.h[2]
smlsl v11.4s, v15.4h, v2.h[3]
smlal2 v10.4s, v13.8h, v3.h[2]
smlal2 v11.4s, v15.8h, v3.h[3]
sqrshrn v9.4h, v10.4s, #15
sqrshrn2 v9.8h, v11.4s, #15
sqrshrun v8.8b, v8.8h, #VERTBITS - 8
sqrshrun2 v8.16b, v9.8h, #VERTBITS - 8
/* And two more... */
ld1 {v12.8h,v13.8h}, [x14]
ld1 {v14.8h,v15.8h}, [x15]
smull v10.4s, v12.4h, v0.h[4]
smull v11.4s, v14.4h, v0.h[5]
smlsl2 v10.4s, v12.8h, v1.h[4]
smlsl2 v11.4s, v14.8h, v1.h[5]
smlsl v10.4s, v13.4h, v2.h[4]
smlsl v11.4s, v15.4h, v2.h[5]
smlal2 v10.4s, v13.8h, v3.h[4]
smlal2 v11.4s, v15.8h, v3.h[5]
/* And two more... */
ld1 {v12.8h,v13.8h}, [x16]
ld1 {v14.8h,v15.8h}, [x17]
subs x13, x13, #LOOP_OUTPUT_SIZE
sqrshrn v9.4h, v10.4s, #15
sqrshrn2 v9.8h, v11.4s, #15
smull v10.4s, v12.4h, v0.h[6]
smull v11.4s, v14.4h, v0.h[7]
smlsl2 v10.4s, v12.8h, v1.h[6]
smlsl2 v11.4s, v14.8h, v1.h[7]
smlsl v10.4s, v13.4h, v2.h[6]
smlsl v11.4s, v15.4h, v2.h[7]
smlal2 v10.4s, v13.8h, v3.h[6]
smlal2 v11.4s, v15.8h, v3.h[7]
sqrshrn v10.4h, v10.4s, #15
sqrshrn2 v10.8h, v11.4s, #15
sqrshrun v9.8b, v9.8h, #VERTBITS - 8
sqrshrun2 v9.16b, v10.8h, #VERTBITS - 8
.endif
bgt 2b /* continue inner loop */
/* The inner loop has already been limited to ensure that none of
 * the earlier iterations could overfill the output, so the store
 * appears within the loop but after the conditional branch (at the
 * top). At the end, provided it won't overfill, perform the final
 * store here. If it would, then break out to the tricky tail case
 * instead.
 */
blt 1f
/* Store the amount of data appropriate to the configuration of the
 * instance being assembled.
 */
.if LOOP_OUTPUT_SIZE == 4
st1 {v8.s}[0], [x0], #4
.elseif LOOP_OUTPUT_SIZE == 8
st1 {v8.8b}, [x0], #8
.elseif LOOP_OUTPUT_SIZE == 16
st1 {v8.16b}, [x0], #16
.elseif LOOP_OUTPUT_SIZE == 32
st1 {v8.16b,v9.16b}, [x0], #32
.endif
b 1b /* resume outer loop */
/* Partial tail store case:
 * Different versions of the code need different subsets of the
 * following partial stores. Here the number of components and the
 * size of the chunk of data produced by each inner loop iteration
 * is tested to figure out whether or not each phrase is relevant.
 */
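/* As an outline (a C-ish sketch with hypothetical helpers, not literal
 * code), for the uchar1 variant the cascade below behaves like:
 *
 *     if (remaining & 4) { store4(dst, v); dst += 4; v = shift_down(v, 4); }
 *     if (remaining & 2) { store2(dst, v); dst += 2; v = shift_down(v, 2); }
 *     if (remaining & 1) { store1(dst, v); }
 *
 * with the larger power-of-two stores enabled for the wider variants.
 */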
.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
1: tst x13, #16
beq 1f
st1 {v8.16b}, [x0], #16
mov v8.16b, v9.16b
.endif
.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
1: tst x13, #8
beq 1f
st1 {v8.8b}, [x0], #8
ext v8.16b, v8.16b, v8.16b, #8
.endif
.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
1: tst x13, #4
beq 1f
st1 {v8.s}[0], [x0], #4
ext v8.8b, v8.8b, v8.8b, #4
.endif
.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
1: tst x13, #2
beq 1f
st1 {v8.h}[0], [x0], #2
ext v8.8b, v8.8b, v8.8b, #2
.endif
.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
1: tst x13, #1
beq 1f
st1 {v8.b}[0], [x0], #1
.endif
1:
9: mov sp, x19
ld1 {v8.1d - v11.1d}, [sp], #32
ld1 {v12.1d - v15.1d}, [sp], #32
ldr x19, [sp], #16
ret
END(rsdIntrinsicResizeB\comp\()_K)
.endr
.rodata
intrinsic_resize_consts: .hword 0, 1, 2, 3, 4, 5, 6, 7