/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
  16. #include "cache.h"
  17. #ifndef MEMSET
  18. # define MEMSET android_memset32
  19. #endif
  20. #ifndef L
  21. # define L(label) .L##label
  22. #endif
  23. #ifndef ALIGN
  24. # define ALIGN(n) .p2align n
  25. #endif
  26. #ifndef cfi_startproc
  27. # define cfi_startproc .cfi_startproc
  28. #endif
  29. #ifndef cfi_endproc
  30. # define cfi_endproc .cfi_endproc
  31. #endif
  32. #ifndef ENTRY
  33. # define ENTRY(name) \
  34. .type name, @function; \
  35. .globl name; \
  36. .p2align 4; \
  37. name: \
  38. cfi_startproc
  39. #endif
  40. #ifndef END
  41. # define END(name) \
  42. cfi_endproc; \
  43. .size name, .-name
  44. #endif
  45. #define JMPTBL(I, B) I - B
  46. /* Branch to an entry in a jump table. TABLE is a jump table with
  47. relative offsets. INDEX is a register contains the index into the
  48. jump table. SCALE is the scale of INDEX. */
  49. #define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
  50. lea TABLE(%rip), %r11; \
  51. movslq (%r11, INDEX, SCALE), INDEX; \
  52. lea (%r11, INDEX), INDEX; \
  53. jmp *INDEX
  54. .section .text.sse2,"ax",@progbits
  55. ALIGN (4)
  56. ENTRY (MEMSET) // Address in rdi
  57. shr $2, %rdx // Count in rdx
  58. movl %esi, %ecx // Pattern in ecx
  59. cmp $16, %rdx
  60. jae L(16dbwordsormore)
  61. L(write_less16dbwords):
  62. lea (%rdi, %rdx, 4), %rdi
  63. BRANCH_TO_JMPTBL_ENTRY (L(table_less16dbwords), %rdx, 4)
  64. .pushsection .rodata.sse2,"a",@progbits
  65. ALIGN (2)
  66. L(table_less16dbwords):
  67. .int JMPTBL (L(write_0dbwords), L(table_less16dbwords))
  68. .int JMPTBL (L(write_1dbwords), L(table_less16dbwords))
  69. .int JMPTBL (L(write_2dbwords), L(table_less16dbwords))
  70. .int JMPTBL (L(write_3dbwords), L(table_less16dbwords))
  71. .int JMPTBL (L(write_4dbwords), L(table_less16dbwords))
  72. .int JMPTBL (L(write_5dbwords), L(table_less16dbwords))
  73. .int JMPTBL (L(write_6dbwords), L(table_less16dbwords))
  74. .int JMPTBL (L(write_7dbwords), L(table_less16dbwords))
  75. .int JMPTBL (L(write_8dbwords), L(table_less16dbwords))
  76. .int JMPTBL (L(write_9dbwords), L(table_less16dbwords))
  77. .int JMPTBL (L(write_10dbwords), L(table_less16dbwords))
  78. .int JMPTBL (L(write_11dbwords), L(table_less16dbwords))
  79. .int JMPTBL (L(write_12dbwords), L(table_less16dbwords))
  80. .int JMPTBL (L(write_13dbwords), L(table_less16dbwords))
  81. .int JMPTBL (L(write_14dbwords), L(table_less16dbwords))
  82. .int JMPTBL (L(write_15dbwords), L(table_less16dbwords))
  83. .popsection
  84. ALIGN (4)
  85. L(write_15dbwords):
  86. movl %ecx, -60(%rdi)
  87. L(write_14dbwords):
  88. movl %ecx, -56(%rdi)
  89. L(write_13dbwords):
  90. movl %ecx, -52(%rdi)
  91. L(write_12dbwords):
  92. movl %ecx, -48(%rdi)
  93. L(write_11dbwords):
  94. movl %ecx, -44(%rdi)
  95. L(write_10dbwords):
  96. movl %ecx, -40(%rdi)
  97. L(write_9dbwords):
  98. movl %ecx, -36(%rdi)
  99. L(write_8dbwords):
  100. movl %ecx, -32(%rdi)
  101. L(write_7dbwords):
  102. movl %ecx, -28(%rdi)
  103. L(write_6dbwords):
  104. movl %ecx, -24(%rdi)
  105. L(write_5dbwords):
  106. movl %ecx, -20(%rdi)
  107. L(write_4dbwords):
  108. movl %ecx, -16(%rdi)
  109. L(write_3dbwords):
  110. movl %ecx, -12(%rdi)
  111. L(write_2dbwords):
  112. movl %ecx, -8(%rdi)
  113. L(write_1dbwords):
  114. movl %ecx, -4(%rdi)
  115. L(write_0dbwords):
  116. ret
  117. ALIGN (4)
  118. L(16dbwordsormore):
  119. test $3, %edi
  120. jz L(aligned4bytes)
  121. mov %ecx, (%rdi)
  122. mov %ecx, -4(%rdi, %rdx, 4)
  123. sub $1, %rdx
  124. rol $24, %ecx
  125. add $1, %rdi
  126. test $3, %edi
  127. jz L(aligned4bytes)
  128. ror $8, %ecx
  129. add $1, %rdi
  130. test $3, %edi
  131. jz L(aligned4bytes)
  132. ror $8, %ecx
  133. add $1, %rdi
  134. L(aligned4bytes):
  135. shl $2, %rdx
  136. /* Fill xmm0 with the pattern. */
  137. movd %ecx, %xmm0
  138. pshufd $0, %xmm0, %xmm0
  139. testl $0xf, %edi
  140. jz L(aligned_16)
  141. /* RDX > 32 and RDI is not 16 byte aligned. */
  142. movdqu %xmm0, (%rdi)
  143. mov %rdi, %rsi
  144. and $-16, %rdi
  145. add $16, %rdi
  146. sub %rdi, %rsi
  147. add %rsi, %rdx
  148. ALIGN (4)
  149. L(aligned_16):
  150. cmp $128, %rdx
  151. jge L(128bytesormore)
  152. L(aligned_16_less128bytes):
  153. add %rdx, %rdi
  154. shr $2, %rdx
  155. BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)
  156. ALIGN (4)
  157. L(128bytesormore):
  158. cmp $SHARED_CACHE_SIZE, %rdx
  159. jg L(128bytesormore_nt)
  160. L(128bytesormore_normal):
  161. sub $128, %rdx
  162. movdqa %xmm0, (%rdi)
  163. movdqa %xmm0, 0x10(%rdi)
  164. movdqa %xmm0, 0x20(%rdi)
  165. movdqa %xmm0, 0x30(%rdi)
  166. movdqa %xmm0, 0x40(%rdi)
  167. movdqa %xmm0, 0x50(%rdi)
  168. movdqa %xmm0, 0x60(%rdi)
  169. movdqa %xmm0, 0x70(%rdi)
  170. lea 128(%rdi), %rdi
  171. cmp $128, %rdx
  172. jl L(128bytesless_normal)
  173. sub $128, %rdx
  174. movdqa %xmm0, (%rdi)
  175. movdqa %xmm0, 0x10(%rdi)
  176. movdqa %xmm0, 0x20(%rdi)
  177. movdqa %xmm0, 0x30(%rdi)
  178. movdqa %xmm0, 0x40(%rdi)
  179. movdqa %xmm0, 0x50(%rdi)
  180. movdqa %xmm0, 0x60(%rdi)
  181. movdqa %xmm0, 0x70(%rdi)
  182. lea 128(%rdi), %rdi
  183. cmp $128, %rdx
  184. jl L(128bytesless_normal)
  185. sub $128, %rdx
  186. movdqa %xmm0, (%rdi)
  187. movdqa %xmm0, 0x10(%rdi)
  188. movdqa %xmm0, 0x20(%rdi)
  189. movdqa %xmm0, 0x30(%rdi)
  190. movdqa %xmm0, 0x40(%rdi)
  191. movdqa %xmm0, 0x50(%rdi)
  192. movdqa %xmm0, 0x60(%rdi)
  193. movdqa %xmm0, 0x70(%rdi)
  194. lea 128(%rdi), %rdi
  195. cmp $128, %rdx
  196. jl L(128bytesless_normal)
  197. sub $128, %rdx
  198. movdqa %xmm0, (%rdi)
  199. movdqa %xmm0, 0x10(%rdi)
  200. movdqa %xmm0, 0x20(%rdi)
  201. movdqa %xmm0, 0x30(%rdi)
  202. movdqa %xmm0, 0x40(%rdi)
  203. movdqa %xmm0, 0x50(%rdi)
  204. movdqa %xmm0, 0x60(%rdi)
  205. movdqa %xmm0, 0x70(%rdi)
  206. lea 128(%rdi), %rdi
  207. cmp $128, %rdx
  208. jge L(128bytesormore_normal)
  209. L(128bytesless_normal):
  210. add %rdx, %rdi
  211. shr $2, %rdx
  212. BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)
  213. ALIGN (4)
  214. L(128bytesormore_nt):
  215. sub $128, %rdx
  216. movntdq %xmm0, (%rdi)
  217. movntdq %xmm0, 0x10(%rdi)
  218. movntdq %xmm0, 0x20(%rdi)
  219. movntdq %xmm0, 0x30(%rdi)
  220. movntdq %xmm0, 0x40(%rdi)
  221. movntdq %xmm0, 0x50(%rdi)
  222. movntdq %xmm0, 0x60(%rdi)
  223. movntdq %xmm0, 0x70(%rdi)
  224. lea 128(%rdi), %rdi
  225. cmp $128, %rdx
  226. jge L(128bytesormore_nt)
  227. sfence
  228. add %rdx, %rdi
  229. shr $2, %rdx
  230. BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)
  231. .pushsection .rodata.sse2,"a",@progbits
  232. ALIGN (2)
  233. L(table_16_128bytes):
  234. .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
  235. .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
  236. .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
  237. .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
  238. .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
  239. .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
  240. .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
  241. .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
  242. .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
  243. .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
  244. .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
  245. .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
  246. .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
  247. .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
  248. .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
  249. .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
  250. .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
  251. .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
  252. .int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
  253. .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
  254. .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
  255. .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
  256. .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
  257. .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
  258. .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
  259. .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
  260. .int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
  261. .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
  262. .int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
  263. .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
  264. .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
  265. .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
  266. .popsection
  267. ALIGN (4)
  268. L(aligned_16_112bytes):
  269. movdqa %xmm0, -112(%rdi)
  270. L(aligned_16_96bytes):
  271. movdqa %xmm0, -96(%rdi)
  272. L(aligned_16_80bytes):
  273. movdqa %xmm0, -80(%rdi)
  274. L(aligned_16_64bytes):
  275. movdqa %xmm0, -64(%rdi)
  276. L(aligned_16_48bytes):
  277. movdqa %xmm0, -48(%rdi)
  278. L(aligned_16_32bytes):
  279. movdqa %xmm0, -32(%rdi)
  280. L(aligned_16_16bytes):
  281. movdqa %xmm0, -16(%rdi)
  282. L(aligned_16_0bytes):
  283. ret
  284. ALIGN (4)
  285. L(aligned_16_116bytes):
  286. movdqa %xmm0, -116(%rdi)
  287. L(aligned_16_100bytes):
  288. movdqa %xmm0, -100(%rdi)
  289. L(aligned_16_84bytes):
  290. movdqa %xmm0, -84(%rdi)
  291. L(aligned_16_68bytes):
  292. movdqa %xmm0, -68(%rdi)
  293. L(aligned_16_52bytes):
  294. movdqa %xmm0, -52(%rdi)
  295. L(aligned_16_36bytes):
  296. movdqa %xmm0, -36(%rdi)
  297. L(aligned_16_20bytes):
  298. movdqa %xmm0, -20(%rdi)
  299. L(aligned_16_4bytes):
  300. movl %ecx, -4(%rdi)
  301. ret
  302. ALIGN (4)
  303. L(aligned_16_120bytes):
  304. movdqa %xmm0, -120(%rdi)
  305. L(aligned_16_104bytes):
  306. movdqa %xmm0, -104(%rdi)
  307. L(aligned_16_88bytes):
  308. movdqa %xmm0, -88(%rdi)
  309. L(aligned_16_72bytes):
  310. movdqa %xmm0, -72(%rdi)
  311. L(aligned_16_56bytes):
  312. movdqa %xmm0, -56(%rdi)
  313. L(aligned_16_40bytes):
  314. movdqa %xmm0, -40(%rdi)
  315. L(aligned_16_24bytes):
  316. movdqa %xmm0, -24(%rdi)
  317. L(aligned_16_8bytes):
  318. movq %xmm0, -8(%rdi)
  319. ret
  320. ALIGN (4)
  321. L(aligned_16_124bytes):
  322. movdqa %xmm0, -124(%rdi)
  323. L(aligned_16_108bytes):
  324. movdqa %xmm0, -108(%rdi)
  325. L(aligned_16_92bytes):
  326. movdqa %xmm0, -92(%rdi)
  327. L(aligned_16_76bytes):
  328. movdqa %xmm0, -76(%rdi)
  329. L(aligned_16_60bytes):
  330. movdqa %xmm0, -60(%rdi)
  331. L(aligned_16_44bytes):
  332. movdqa %xmm0, -44(%rdi)
  333. L(aligned_16_28bytes):
  334. movdqa %xmm0, -28(%rdi)
  335. L(aligned_16_12bytes):
  336. movq %xmm0, -12(%rdi)
  337. movl %ecx, -4(%rdi)
  338. ret
  339. END (MEMSET)