rsCpuIntrinsics_neon_YuvToRGB.S

/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;

.eabi_attribute 25,1 @Tag_ABI_align8_preserved
.arm

/* Perform the actual YuvToRGB conversion in a macro, from register to
 * register.  This macro will be called from within several different wrapper
 * variants for different data layouts.  Y data starts in q8, but with the even
 * and odd bytes split into d16 and d17 respectively.  U and V are in d20
 * and d21.  Working constants are pre-loaded into q13-q15, and q3 is
 * pre-loaded with a constant 0xff alpha channel.
 *
 * The complicated arithmetic is the result of refactoring the original
 * equations to avoid 16-bit overflow without losing any precision.
 */
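/* For reference (added for clarity, not part of the original source), a
 * scalar sketch of the conversion the macro performs.  The constants used
 * below are the usual BT.601 video-range coefficients halved to keep the
 * 16-bit intermediates in range: 149 = 298/2, 50 = 100/2, 104 = 208/2,
 * 204 ~= 409/2, and 254 + (1 << 2) = 258 = 516/2.
 *
 *   r = clamp((298 * (y - 16)                   + 409 * (v - 128) + 128) >> 8)
 *   g = clamp((298 * (y - 16) - 100 * (u - 128) - 208 * (v - 128) + 128) >> 8)
 *   b = clamp((298 * (y - 16) + 516 * (u - 128)                   + 128) >> 8)
 *
 * The -16 and -128 biases are folded into the q13-q15 constants set up by
 * wrap_line, and the final shift and clamp are done by vqrshrn, so the
 * results match the above up to truncation in the halving steps.
 */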
.macro yuvkern
        vmov.i8     d15, #149

        vmull.u8    q1, d16, d15        // g0 = y0 * 149
        vmull.u8    q5, d17, d15        // g1 = y1 * 149

        vmov.i8     d14, #50
        vmov.i8     d15, #104

        vmull.u8    q8, d20, d14        // g2 = u * 50 + v * 104
        vmlal.u8    q8, d21, d15

        vshr.u8     d14, d21, #1
        vaddw.u8    q0, q1, d14         // r0 = y0 * 149 + (v >> 1)
        vaddw.u8    q4, q5, d14         // r1 = y1 * 149 + (v >> 1)

        vshll.u8    q7, d20, #2
        vadd.u16    q2, q1, q7          // b0 = y0 * 149 + (u << 2)
        vadd.u16    q6, q5, q7          // b1 = y1 * 149 + (u << 2)

        vmov.i8     d14, #204
        vmov.i8     d15, #254

        vmull.u8    q11, d21, d14       // r2 = v * 204
        vmull.u8    q12, d20, d15       // b2 = u * 254

        vhadd.u16   q0, q11             // r0 = (r0 + r2) >> 1
        vhadd.u16   q4, q11             // r1 = (r1 + r2) >> 1
        vqadd.u16   q1, q14             // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        vqadd.u16   q5, q14             // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        vhadd.u16   q2, q12             // b0 = (b0 + b2) >> 1
        vhadd.u16   q6, q12             // b1 = (b1 + b2) >> 1

        vqsub.u16   q0, q13             // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        vqsub.u16   q4, q13             // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        vqsub.u16   q1, q8              // g0 = satu16(g0 - g2)
        vqsub.u16   q5, q8              // g1 = satu16(g1 - g2)
        vqsub.u16   q2, q15             // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        vqsub.u16   q6, q15             // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        vqrshrn.u16 d0, q0, #6          // r0 = clamp(r0 >> 6)
        vqrshrn.u16 d1, q1, #7          // g0 = clamp(g0 >> 7)
        vqrshrn.u16 d2, q4, #6          // r1 = clamp(r1 >> 6)
        vqrshrn.u16 d3, q5, #7          // g1 = clamp(g1 >> 7)

        vqrshrn.u16 d4, q2, #6          // b0 = clamp(b0 >> 6)
        vqrshrn.u16 d5, q6, #6          // b1 = clamp(b1 >> 6)

        vzip.u8     q0, q1              // restore pixel order: q0 = r, q1 = g
        vzip.u8     d4, d5              // q2 = b
.endm

/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop.  Some sections of code are switched out depending on the data packing
 * being handled.
 */
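/* Rough C-like sketch of the control flow below (added for clarity, not part
 * of the original source), with r0 = out, r1 = y, r3/r4 = u/v (or r3 = the
 * interleaved uv pointer) and r2 = the pixel count:
 *
 *   while (count >= 16) {
 *       load 16 y bytes and 8 u/v pairs;  yuvkern;  store 16 rgba pixels;
 *       count -= 16;
 *   }
 *   if (count > 0) {
 *       load the remaining pixels in 8/4/2/1-sized pieces;  yuvkern;
 *       store the result in matching 8/4/2/1-sized pieces;
 *   }
 */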
.macro wrap_line kernel, interleaved=0, swapuv=0

        movw        r5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        vdup.i16    q13, r5
        movw        r5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        vdup.i16    q14, r5
        movw        r5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
        vdup.i16    q15, r5

        vmov.i8     q3, #0xff

        subs        r2, #16
        bhs         1f
        b           2f

        .align 4
1:      vld2.u8     {d16,d17}, [r1]!
        pld         [r1, #256]
  .if \interleaved
        vld2.u8     {d20,d21}, [r3]!
    .if \swapuv
        vswp        d20, d21
    .endif
        pld         [r3, #256]
  .else
        vld1.u8     d20, [r3]!
        vld1.u8     d21, [r4]!
        pld         [r3, #128]
        pld         [r4, #128]
  .endif

        \kernel

        subs        r2, #16

        vst4.u8     {d0,d2,d4,d6}, [r0]!
        vst4.u8     {d1,d3,d5,d7}, [r0]!

        bhs         1b
2:      adds        r2, #16
        beq         2f

        /* To handle the tail portion of the data (something less than 16
         * bytes) load small power-of-two chunks into working registers.  It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions and the
         * interaction between neighbouring pixels is constrained to odd
         * boundaries where the load operations don't interfere.
         */
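        /* Worked example (added for clarity, not part of the original
         * source): a remainder of 13 pixels has bits 8, 4 and 1 set in r2,
         * so the tests below load 8 y bytes, then 4, then 1, together with
         * the matching amounts of u and v data, and the kernel is run once
         * more on the partially-filled registers.
         */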
        vmov.i8     q8, #0
        vmov.i8     q10, #0

        tst         r2, #8
        beq         1f
        vld1.u8     d17, [r1]!
  .if \interleaved
        vld1.u8     d21, [r3]!
  .else
        vld1.u32    d20[1], [r3]!
        vld1.u32    d21[1], [r4]!
  .endif

1:      tst         r2, #4
        beq         1f
        vld1.u32    d16[1], [r1]!
  .if \interleaved
        vld1.u32    d20[1], [r3]!
  .else
        vld1.u16    d20[1], [r3]!
        vld1.u16    d21[1], [r4]!
  .endif
1:      tst         r2, #2
        beq         1f
        vld1.u16    d16[1], [r1]!
  .if \interleaved
        vld1.u16    d20[1], [r3]!
  .else
        vld1.u8     d20[1], [r3]!
        vld1.u8     d21[1], [r4]!
  .endif
1:      tst         r2, #1
        beq         1f
        vld1.u8     d16[1], [r1]!
  .if \interleaved
        vld1.u16    d20[0], [r3]!
  .else
        vld1.u8     d20[0], [r3]!
        vld1.u8     d21[0], [r4]!
  .endif
        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register.  So the data is loaded
         * linearly and unpacked manually at this point if necessary.
         */
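        /* For example (added for clarity, not part of the original source):
         * in the interleaved case the tail loads above leave u and v bytes
         * alternating within d20/d21; vuzp.8 separates them into all-u (d20)
         * and all-v (d21), matching what vld2.u8 produces in the main loop.
         * The same applies to the even/odd y bytes in d16/d17.
         */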
1:      vuzp.8      d16, d17
  .if \interleaved
        vuzp.8      d20, d21
    .if \swapuv
        vswp        d20, d21
    .endif
  .endif

        \kernel
        /* As above but with the output; structured stores for partial vectors
         * aren't available, so the data is re-packed first and stored linearly.
         */
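        /* After the kernel, q0-q3 hold the r, g, b and a bytes as separate
         * planes; the four vzips below interleave them into rgba order so
         * that the linear partial stores give the same byte layout as the
         * vst4.u8 in the main loop.  (Comment added for clarity, not part of
         * the original source.)
         */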
        vzip.8      q0, q2
        vzip.8      q1, q3
        vzip.8      q0, q1
        vzip.8      q2, q3

1:      tst         r2, #8
        beq         1f
        vst1.u8     {d4,d5,d6,d7}, [r0]!
1:      tst         r2, #4
        beq         1f
        vst1.u8     {d2,d3}, [r0]!
1:      tst         r2, #2
        beq         1f
        vst1.u8     d1, [r0]!
1:      tst         r2, #1
        beq         2f
        vst1.u32    d0[1], [r0]!
2:
.endm

/* void rsdIntrinsicYuv2_K(
 *          void *out,          // r0
 *          void const *yin,    // r1
 *          void const *uin,    // r2
 *          void const *vin,    // r3
 *          size_t xstart,      // [sp]
 *          size_t xend);       // [sp+#4]
 */
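/* The prologue below rearranges the arguments into the registers wrap_line
 * expects; roughly (sketch added for clarity, not part of the original
 * source):
 *
 *   out += xstart * 4;         // 4 bytes per output pixel
 *   yin += xstart;
 *   uin += xstart / 2;         // chroma is subsampled by 2 horizontally
 *   vin += xstart / 2;
 *   count = xend - xstart;
 */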
ENTRY(rsdIntrinsicYuv2_K)
        push        {r4,r5}
        ldr         r5, [sp, #8]
        mov         r4, r3
        mov         r3, r2
        ldr         r2, [sp, #12]

        add         r0, r5, LSL #2
        add         r1, r5
        add         r3, r5, LSR #1
        add         r4, r5, LSR #1
        sub         r2, r5

        vpush       {d8-d15}

        wrap_line yuvkern, 0

        vpop        {d8-d15}
        pop         {r4,r5}
        bx lr
END(rsdIntrinsicYuv2_K)

/* void rsdIntrinsicYuv_K(
 *          void *out,          // r0
 *          void const *yin,    // r1
 *          void const *uvin,   // r2
 *          size_t xstart,      // r3
 *          size_t xend);       // [sp]
 */
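/* Sketch of the prologue below (added for clarity, not part of the original
 * source).  xstart is first rounded down to an even value, since each
 * interleaved chroma pair covers two pixels, and the two chroma bytes are
 * swapped after loading (swapuv=1):
 *
 *   xstart &= ~1;
 *   out  += xstart * 4;
 *   yin  += xstart;
 *   uvin += xstart;            // one u byte and one v byte per two pixels
 *   count = xend - xstart;
 */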
ENTRY(rsdIntrinsicYuv_K)
        push        {r4,r5}
        bic         r4, r3, #1
        add         r3, r2, r4
        ldr         r2, [sp, #8]

        add         r0, r4, LSL #2
        add         r1, r4
        sub         r2, r4

        vpush       {d8-d15}

        wrap_line yuvkern, 1, 1

        vpop        {d8-d15}
        pop         {r4,r5}
        bx lr
END(rsdIntrinsicYuv_K)

/* void rsdIntrinsicYuvR_K(
 *          void *out,          // r0
 *          void const *yin,    // r1
 *          void const *uvin,   // r2
 *          size_t xstart,      // r3
 *          size_t xend);       // [sp]
 */
ENTRY(rsdIntrinsicYuvR_K)
        push        {r4,r5}
        bic         r4, r3, #1
        add         r3, r2, r4
        ldr         r2, [sp, #8]

        add         r0, r4, LSL #2
        add         r1, r4
        sub         r2, r4

        vpush       {d8-d15}

        wrap_line yuvkern, 1

        vpop        {d8-d15}
        pop         {r4,r5}
        bx lr
END(rsdIntrinsicYuvR_K)