123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203 |
- /* libs/pixelflinger/t32cb16blend.S
- **
- ** Copyright 2006, The Android Open Source Project
- **
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- **
- ** http://www.apache.org/licenses/LICENSE-2.0
- **
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- */
- .text
- .syntax unified
- .balign 4
-
- .global scanline_t32cb16blend_arm
/*
 * .macro pixel
 *
 * Blends one 32-bit source pixel over one RGB565 destination pixel.
 * For each colour channel c (5, 6 and 5 bits wide):
 *
 *      f     = 0x100 - (sA + (sA >> 7))        // sA = SRC bits 31..24
 *      out_c = min(src_c + ((dst_c * f) >> 8), max_c)
 *
 * where src_c is the top 5 (red, blue) or 6 (green) bits of the matching
 * source byte and max_c is 0x1F or 0x3F.
 *
 * \DREG is a 32-bit register containing *two* original destination RGB565
 *       pixels, with the even one in the low 16 bits and the odd one in
 *       the high 16 bits. It is only read.
 *
 * \SRC  is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
 *       It is only read.
 *
 * \FB   is the target register receiving the blended pixel.
 *         ODD == 0: FB is overwritten; the result lands in bits 15..0 and
 *                   bits 31..16 are left clear.
 *         ODD != 0: the result is ORed into bits 31..16, so those bits
 *                   must already be clear (which is the case right after
 *                   an ODD == 0 invocation on the same FB).
 *
 * \ODD  is either 0 or 1 and selects whether the lower (0) or upper (1)
 *       16-bit pixel of DREG is blended into FB.
 *
 * Constraints: DREG, SRC and FB must be three distinct registers and none
 * of them may be r6, r7 or lr, because FB is written before the last read
 * of DREG/SRC and r6/r7/lr are used as temporaries.
 *
 * clobbered: r6, r7, lr, condition flags
 *
 * The macro defines no labels, so numeric local labels used around an
 * invocation keep their meaning.
 */
        .macro  pixel, DREG, SRC, FB, ODD
        // r7 = f = 0x100 - (sA + (sA >> 7)), the destination weight
        mov     r7, \SRC, lsr #24               // sA
        add     r7, r7, r7, lsr #7              // sA + (sA >> 7)
        rsb     r7, r7, #0x100                  // 0x100 - (sA + (sA >> 7))
    .if \ODD
        // ---- odd pixel: DREG bits 31..16 -> FB bits 31..16 ----
        // red: the shift leaves only the 5 red bits, no mask needed
        mov     lr, \DREG, lsr #(16 + 11)
        smulbb  lr, r7, lr                      // f * dR
        mov     r6, \SRC, lsr #3
        and     r6, r6, #0x1F                   // sR, top 5 bits
        add     lr, r6, lr, lsr #8
        cmp     lr, #0x1F                       // saturate
        orrhs   \FB, \FB, #(0x1F<<(16 + 11))
        orrlo   \FB, \FB, lr, lsl #(16 + 11)
        // green: multiply the top halfword, still shifted left by 5
        and     r6, \DREG, #(0x3F<<(16 + 5))
        smulbt  r6, r7, r6                      // f * (dG << 5)
        mov     lr, \SRC, lsr #(8+2)
        and     lr, lr, #0x3F                   // sG, top 6 bits
        add     r6, lr, r6, lsr #(5+8)
        cmp     r6, #0x3F                       // saturate
        orrhs   \FB, \FB, #(0x3F<<(16 + 5))
        orrlo   \FB, \FB, r6, lsl #(16 + 5)
        // blue
        and     lr, \DREG, #(0x1F << 16)
        smulbt  lr, r7, lr                      // f * dB
        mov     r6, \SRC, lsr #(8+8+3)
        and     r6, r6, #0x1F                   // sB, top 5 bits
        add     lr, r6, lr, lsr #8
        cmp     lr, #0x1F                       // saturate
        orrhs   \FB, \FB, #(0x1F << 16)
        orrlo   \FB, \FB, lr, lsl #16
    .else
        // ---- even pixel: DREG bits 15..0 -> FB bits 15..0 ----
        // red: mask needed, the shift also brings down odd-pixel bits
        mov     lr, \DREG, lsr #11
        and     lr, lr, #0x1F
        smulbb  lr, r7, lr                      // f * dR
        mov     r6, \SRC, lsr #3
        and     r6, r6, #0x1F                   // sR, top 5 bits
        add     lr, r6, lr, lsr #8
        cmp     lr, #0x1F                       // saturate
        movhs   \FB, #(0x1F<<11)                // first write: initialises FB
        movlo   \FB, lr, lsl #11
        // green
        and     r6, \DREG, #(0x3F<<5)
        smulbb  r6, r7, r6                      // f * (dG << 5)
        mov     lr, \SRC, lsr #(8+2)
        and     lr, lr, #0x3F                   // sG, top 6 bits
        add     r6, lr, r6, lsr #(5+8)
        cmp     r6, #0x3F                       // saturate
        orrhs   \FB, \FB, #(0x3F<<5)
        orrlo   \FB, \FB, r6, lsl #5
        // blue
        and     lr, \DREG, #0x1F
        smulbb  lr, r7, lr                      // f * dB
        mov     r6, \SRC, lsr #(8+8+3)
        and     r6, r6, #0x1F                   // sB, top 5 bits
        add     lr, r6, lr, lsr #8
        cmp     lr, #0x1F                       // saturate
        orrhs   \FB, \FB, #0x1F
        orrlo   \FB, \FB, lr
    .endif
        .endm
-
/*
 * scanline_t32cb16blend_arm
 *
 * Blends a run of 32-bit 0xAABBGGRR source pixels onto a run of 16-bit
 * RGB565 destination pixels using the pixel macro.
 *
 * In:   r0 = dst ptr (16-bit pixels, advanced as pixels are written)
 *       r1 = src ptr (32-bit pixels, advanced as pixels are read)
 *       r2 = count of pixels, treated as unsigned
 *
 * Register use:
 *       r3      = destination pixel(s) loaded from dst
 *       r4, r5  = source pixels s0, s1
 *       r6, r7  = temporaries of the pixel macro
 *       r12     = blended result (scratch)
 *       r14/lr  = temporary of the pixel macro
 *       r8-r11  = unused
 *
 * r4-r7 and lr are saved on entry and restored before returning.
 *
 * NOTE(review): a dst pointer that is not 32-bit aligned is assumed to be
 * 16-bit aligned (one halfword store is expected to reach 32-bit
 * alignment) -- confirm against callers.
 */
scanline_t32cb16blend_arm:
        stmfd   sp!, {r4-r7, lr}
        pld     [r0]
        pld     [r1]

        // align DST to 32 bits
        tst     r0, #0x3
        beq     aligned
        subs    r2, r2, #1
        ldmfdlo sp!, {r4-r7, lr}                // count was 0: return
        bxlo    lr

        // Blend exactly one pixel with halfword accesses. Used for the
        // unaligned head and, via label 9 below, for the odd tail pixel.
last:
        ldr     r4, [r1], #4
        ldrh    r3, [r0]
        pixel   r3, r4, r12, 0
        strh    r12, [r0], #2

aligned:
        subs    r2, r2, #2                      // need a full pair to go on
        blo     9f

        // The main loop is unrolled twice and processes 4 pixels
8:      ldmia   r1!, {r4, r5}
        // stream the source
        pld     [r1, #32]
        add     r0, r0, #4
        // both source pixels are all zero: destination is unchanged,
        // skip the pair
        orrs    r3, r4, r5
        beq     7f

        // load the destination
        ldr     r3, [r0, #-4]
        // stream the destination
        pld     [r0, #32]
        pixel   r3, r4, r12, 0
        pixel   r3, r5, r12, 1
        // effectively, we are getting write-combining by virtue of the
        // cpu write-back cache.
        str     r12, [r0, #-4]

        // 2nd iteration of the loop, do not stream anything
        subs    r2, r2, #2
        blo     9f                              // unsigned, like every other count test
        ldmia   r1!, {r4, r5}
        add     r0, r0, #4
        orrs    r3, r4, r5
        beq     7f
        ldr     r3, [r0, #-4]
        pixel   r3, r4, r12, 0
        pixel   r3, r5, r12, 1
        str     r12, [r0, #-4]

7:      subs    r2, r2, #2
        bhs     8b

        // Here r2 is -2 (nothing left) or -1 (one pixel left).
        // r4/r5 need no fix-up: `last` reloads r4 from [r1].
9:      adds    r2, r2, #1
        ldmfdlo sp!, {r4-r7, lr}                // nothing left: return
        bxlo    lr
        b       last                            // one trailing pixel, r2 == 0
|