/* libs/pixelflinger/t32cb16blend.S
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/
  // Unified (UAL) instruction syntax is used throughout this file.
  .syntax unified

  // Everything below is placed in the code section on a 4-byte boundary.
  .text
  .p2align 2

  // Exported entry point.
  .globl scanline_t32cb16blend_arm
  /*
   * .macro pixel DREG, SRC, FB, ODD
   *
   * Blends one pre-multiplied 32-bit source pixel over one RGB565
   * destination pixel. For each channel c (R: 5 bits, G: 6 bits, B: 5 bits):
   *
   *     f     = 0x100 - (sA + (sA >> 7))            inverse alpha, 0..0x100
   *     out_c = min(max_c, src_c + ((dst_c * f) >> 8))
   *
   * \DREG is a 32-bit register containing *two* original destination RGB565
   *       pixels, with the even one in the low 16 bits, and the odd one in
   *       the high 16 bits.
   *
   * \SRC  is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
   *
   * \FB   is a target register that will contain the blended pixel values.
   *       With \ODD == 0 it is overwritten: the result lands in bits 0..15
   *       and bits 16..31 end up zero. With \ODD != 0 the result is OR-ed
   *       into bits 16..31, so \FB must already hold the even pixel with a
   *       clear upper half (i.e. expand the even pixel first).
   *
   * \ODD  is either 0 or 1 and indicates if we are blending the lower (0)
   *       or upper (1) 16-bit pixel of \DREG into \FB.
   *
   * clobbered: r6, r7, lr, condition flags
   *
   * Because of the clobbers, \DREG, \SRC and \FB must not be r6, r7 or lr.
   *
   * The macro defines no labels, so numeric local labels used by the caller
   * may safely span an invocation.
   */
  .macro pixel, DREG, SRC, FB, ODD

          // SRC = 0xAABBGGRR
          mov     r7, \SRC, lsr #24               // sA
          add     r7, r7, r7, lsr #7              // sA + (sA >> 7)
          rsb     r7, r7, #0x100                  // r7 = f = 0x100 - (sA+(sA>>7))

  .if \ODD

          // ---- odd pixel: destination lives in bits 16..31 ----

          // red
          mov     lr, \DREG, lsr #(16 + 11)       // dR is the top field, no mask needed
          smulbb  lr, r7, lr                      // f * dR
          mov     r6, \SRC, lsr #3
          and     r6, r6, #0x1F                   // sR reduced to 5 bits
          add     lr, r6, lr, lsr #8              // sR + ((f * dR) >> 8)
          cmp     lr, #0x1F
          orrhs   \FB, \FB, #(0x1F<<(16 + 11))    // saturate
          orrlo   \FB, \FB, lr, lsl #(16 + 11)

          // green
          and     r6, \DREG, #(0x3F<<(16 + 5))    // dG kept in place
          smulbt  r6, r7, r6                      // f * top halfword = f * (dG << 5)
          mov     lr, \SRC, lsr #(8+2)
          and     lr, lr, #0x3F                   // sG reduced to 6 bits
          add     r6, lr, r6, lsr #(5+8)          // shift drops field offset and scale
          cmp     r6, #0x3F
          orrhs   \FB, \FB, #(0x3F<<(16 + 5))     // saturate
          orrlo   \FB, \FB, r6, lsl #(16 + 5)

          // blue
          and     lr, \DREG, #(0x1F << 16)        // dB kept in place
          smulbt  lr, r7, lr                      // f * top halfword = f * dB
          mov     r6, \SRC, lsr #(8+8+3)
          and     r6, r6, #0x1F                   // sB reduced to 5 bits
          add     lr, r6, lr, lsr #8              // sB + ((f * dB) >> 8)
          cmp     lr, #0x1F
          orrhs   \FB, \FB, #(0x1F << 16)         // saturate
          orrlo   \FB, \FB, lr, lsl #16

  .else

          // ---- even pixel: destination lives in bits 0..15 ----

          // red (this channel initialises FB, the others OR into it)
          mov     lr, \DREG, lsr #11
          and     lr, lr, #0x1F                   // dR; mask drops the odd pixel bits
          smulbb  lr, r7, lr                      // f * dR
          mov     r6, \SRC, lsr #3
          and     r6, r6, #0x1F                   // sR reduced to 5 bits
          add     lr, r6, lr, lsr #8              // sR + ((f * dR) >> 8)
          cmp     lr, #0x1F
          movhs   \FB, #(0x1F<<11)                // saturate
          movlo   \FB, lr, lsl #11

          // green
          and     r6, \DREG, #(0x3F<<5)           // dG kept in place
          smulbb  r6, r7, r6                      // f * (dG << 5)
          mov     lr, \SRC, lsr #(8+2)
          and     lr, lr, #0x3F                   // sG reduced to 6 bits
          add     r6, lr, r6, lsr #(5+8)          // shift drops field offset and scale
          cmp     r6, #0x3F
          orrhs   \FB, \FB, #(0x3F<<5)            // saturate
          orrlo   \FB, \FB, r6, lsl #5

          // blue
          and     lr, \DREG, #0x1F                // dB
          smulbb  lr, r7, lr                      // f * dB
          mov     r6, \SRC, lsr #(8+8+3)
          and     r6, r6, #0x1F                   // sB reduced to 5 bits
          add     lr, r6, lr, lsr #8              // sB + ((f * dB) >> 8)
          cmp     lr, #0x1F
          orrhs   \FB, \FB, #0x1F                 // saturate
          orrlo   \FB, \FB, lr

  .endif

  .endm
  /*
   * scanline_t32cb16blend_arm
   *
   * Blends a run of 32-bit pre-multiplied 0xAABBGGRR source pixels over
   * 16-bit RGB565 destination pixels, in place, using the pixel macro.
   *
   * NOTE(review): presumably called from C as
   *     void scanline_t32cb16blend_arm(uint16_t* dst, uint32_t* src, size_t count);
   * confirm against the caller.
   *
   * In:
   *   r0: dst ptr  (16-bit pixels; advanced as pixels are written)
   *   r1: src ptr  (32-bit pixels; read with ldmia/ldr, advanced as consumed)
   *   r2: count    (number of pixels)
   *
   * Register usage:
   *   r3:      d   (destination pixel or pixel pair)
   *   r4:      s0  (source pixel, even)
   *   r5:      s1  (source pixel, odd)
   *   r6, r7:  clobbered by the pixel macro
   *   r8-r11:  free (not touched)
   *   r12:     scratch, receives the blended pixel(s)
   *   r14:     clobbered by the pixel macro
   *
   * r4-r7 and lr are saved on entry and restored on every return path.
   *
   * Flow: if dst is not 32-bit aligned, one pixel is blended on its own so
   * that the main loop can load and store the destination two pixels (one
   * word) at a time. A single trailing pixel is handled by the same
   * one-pixel code at "last".
   */
  scanline_t32cb16blend_arm:
          stmfd   sp!, {r4-r7, lr}

          pld     [r0]
          pld     [r1]

          // align DST to 32 bits
          tst     r0, #0x3
          beq     aligned
          subs    r2, r2, #1                  // borrow means count was 0
          ldmfdlo sp!, {r4-r7, lr}            // return
          bxlo    lr

          // blend exactly one pixel (alignment fix-up or trailing pixel)
  last:
          ldr     r4, [r1], #4
          ldrh    r3, [r0]
          pixel   r3, r4, r12, 0
          strh    r12, [r0], #2

  aligned:
          subs    r2, r2, #2
          blo     9f                          // fewer than 2 pixels left

          // The main loop is unrolled twice and processes 4 pixels
  8:      ldmia   r1!, {r4, r5}
          // stream the source
          pld     [r1, #32]
          add     r0, r0, #4
          // both source pixels are all zero: nothing to blend, skip the pair
          orrs    r3, r4, r5
          beq     7f

          // load the destination
          ldr     r3, [r0, #-4]
          // stream the destination
          pld     [r0, #32]
          pixel   r3, r4, r12, 0
          pixel   r3, r5, r12, 1
          // effectively, we are getting write-combining by virtue of the
          // write-back cache of the cpu.
          str     r12, [r0, #-4]

          // 2nd iteration of the loop, do not stream anything
          subs    r2, r2, #2
          blt     9f                          // fewer than 2 pixels left
          ldmia   r1!, {r4, r5}
          add     r0, r0, #4
          orrs    r3, r4, r5
          beq     7f
          ldr     r3, [r0, #-4]
          pixel   r3, r4, r12, 0
          pixel   r3, r5, r12, 1
          str     r12, [r0, #-4]

  7:      subs    r2, r2, #2
          bhs     8b

          // Here r2 = (pixels left) - 2. r4 does not need to carry anything
          // across this point: it is either restored from the stack on
          // return or reloaded from [r1] at "last".
  9:      adds    r2, r2, #1                  // no carry means no pixel left
          ldmfdlo sp!, {r4-r7, lr}            // return
          bxlo    lr
          b       last                        // exactly one pixel left