U3memcpy.S 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509
  1. /* U3memcpy.S: UltraSparc-III optimized memcpy.
  2. *
  3. * Copyright (C) 1999, 2000, 2004 David S. Miller ([email protected])
  4. */
  /* Build configuration.
   * With __KERNEL__ defined the headers below are included and %g7 is
   * used as GLOBAL_SPARE; this branch does not define ASI_BLK_P,
   * VISEntryHalf or VISExitHalf itself, so they presumably come from
   * those headers (verify). Otherwise the constants and the VIS
   * entry/exit sequences are defined locally and %g5 is GLOBAL_SPARE.
   */
  5. #ifdef __KERNEL__
  6. #include <linux/linkage.h>
  7. #include <asm/visasm.h>
  8. #include <asm/asi.h>
  9. #define GLOBAL_SPARE %g7
  10. #else
  11. #define ASI_BLK_P 0xf0
  12. #define FPRS_FEF 0x04
  /* VISEntryHalf: save %fprs in %o5, then write FPRS_FEF to %fprs.
   * The MEMCPY_DEBUG variant additionally clears %g1, %g2 and %g3 and
   * executes subcc %g0, %g0, %g0, which rewrites the condition codes.
   * VISExitHalf: mask the saved %o5 with FPRS_FEF and write the
   * result back to %fprs.
   */
  13. #ifdef MEMCPY_DEBUG
  14. #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
  15. clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
  16. #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
  17. #else
  18. #define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
  19. #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
  20. #endif
  21. #define GLOBAL_SPARE %g5
  22. #endif
  /* Overridable wrappers around every load and store. Each one is
   * defined only if not already defined, so an including file can
   * supply its own version. The defaults below expand to the bare
   * instruction x and ignore the second argument y.
   */
  23. #ifndef EX_LD
  24. #define EX_LD(x,y) x
  25. #endif
  26. #ifndef EX_LD_FP
  27. #define EX_LD_FP(x,y) x
  28. #endif
  29. #ifndef EX_ST
  30. #define EX_ST(x,y) x
  31. #endif
  32. #ifndef EX_ST_FP
  33. #define EX_ST_FP(x,y) x
  34. #endif
  /* Default instruction forms: LOAD emits "type [addr], dest",
   * STORE emits "type src, [addr]", and STORE_BLK emits an stda
   * using ASI_BLK_P.
   */
  35. #ifndef LOAD
  36. #define LOAD(type,addr,dest) type [addr], dest
  37. #endif
  38. #ifndef STORE
  39. #define STORE(type,src,addr) type src, [addr]
  40. #endif
  41. #ifndef STORE_BLK
  42. #define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P
  43. #endif
  /* Name of the generated routine; defaults to U3memcpy. */
  44. #ifndef FUNC_NAME
  45. #define FUNC_NAME U3memcpy
  46. #endif
  /* PREAMBLE is expanded near the top of FUNC_NAME; empty by default. */
  47. #ifndef PREAMBLE
  48. #define PREAMBLE
  49. #endif
  /* Condition-code register named by the %XCC branches; defaults to xcc. */
  50. #ifndef XCC
  51. #define XCC xcc
  52. #endif
  /* Declare %g2 and %g3 to the assembler as scratch registers. */
  53. .register %g2,#scratch
  54. .register %g3,#scratch
  55. /* Special/non-trivial issues of this code:
  56. *
  57. * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
  58. * 2) Only low 32 FPU registers are used so that only the
  59. * lower half of the FPU register set is dirtied by this
  60. * code. This is especially important in the kernel.
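  61. * 3) NOTE(review): the prefetches are issued at fixed offsets
  62. * (up to %o1 + 0x1c0) with no check against the remaining
  * length, so they can target cachelines past the end of the
  * source buffer; confirm whether the original "never
  * prefetches past the end" guarantee is still intended.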
  63. */
  64. .text
  /* Return stubs, assembled only when EX_RETVAL has not been
   * predefined (the default EX_RETVAL(x) is just x).
   *
   * Each stub computes %o0 from the registers spelled out in its
   * label and returns with retl. The *_fp variants branch to
   * __restore_fp, which runs VISExitHalf before returning; their
   * final add/mov sits in the delay slot of that branch.
   *
   * NOTE(review): %o0 is presumably the number of bytes left
   * uncopied, and the stubs are presumably reached through the
   * second argument of overriding EX_LD/EX_ST definitions. The _FP
   * macro invocations in this file name the stubs without the _fp
   * suffix, so an override presumably appends it. Confirm in the
   * files that define those macros.
   */
  65. #ifndef EX_RETVAL
  66. #define EX_RETVAL(x) x
  /* Common exit for the *_fp stubs: VISExitHalf, then return. */
  67. __restore_fp:
  68. VISExitHalf
  69. retl
  70. nop
  /* %o0 = %o2 + %g2 + %g1 + 1 */
  71. ENTRY(U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
  72. add %g1, 1, %g1
  73. add %g2, %g1, %g2
  74. ba,pt %xcc, __restore_fp
  75. add %o2, %g2, %o0
  76. ENDPROC(U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
  /* %o0 = %o2 + %g2 */
  77. ENTRY(U3_retl_o2_plus_g2_fp)
  78. ba,pt %xcc, __restore_fp
  79. add %o2, %g2, %o0
  80. ENDPROC(U3_retl_o2_plus_g2_fp)
  /* %o0 = %o2 + %g2 + 8 */
  81. ENTRY(U3_retl_o2_plus_g2_plus_8_fp)
  82. add %g2, 8, %g2
  83. ba,pt %xcc, __restore_fp
  84. add %o2, %g2, %o0
  85. ENDPROC(U3_retl_o2_plus_g2_plus_8_fp)
  /* %o0 = %o2 */
  86. ENTRY(U3_retl_o2)
  87. retl
  88. mov %o2, %o0
  89. ENDPROC(U3_retl_o2)
  /* %o0 = %o2 + 1 */
  90. ENTRY(U3_retl_o2_plus_1)
  91. retl
  92. add %o2, 1, %o0
  93. ENDPROC(U3_retl_o2_plus_1)
  /* %o0 = %o2 + 4 */
  94. ENTRY(U3_retl_o2_plus_4)
  95. retl
  96. add %o2, 4, %o0
  97. ENDPROC(U3_retl_o2_plus_4)
  /* %o0 = %o2 + 8 */
  98. ENTRY(U3_retl_o2_plus_8)
  99. retl
  100. add %o2, 8, %o0
  101. ENDPROC(U3_retl_o2_plus_8)
  /* %o0 = %o2 + %g1 + 1 */
  102. ENTRY(U3_retl_o2_plus_g1_plus_1)
  103. add %g1, 1, %g1
  104. retl
  105. add %o2, %g1, %o0
  106. ENDPROC(U3_retl_o2_plus_g1_plus_1)
  /* %o0 = %o2, after VISExitHalf */
  107. ENTRY(U3_retl_o2_fp)
  108. ba,pt %xcc, __restore_fp
  109. mov %o2, %o0
  110. ENDPROC(U3_retl_o2_fp)
  /* %o0 = %o2 + (%o3 << 6) + 0x80 */
  111. ENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
  112. sll %o3, 6, %o3
  113. add %o3, 0x80, %o3
  114. ba,pt %xcc, __restore_fp
  115. add %o2, %o3, %o0
  116. ENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
  /* %o0 = %o2 + (%o3 << 6) + 0x40 */
  117. ENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
  118. sll %o3, 6, %o3
  119. add %o3, 0x40, %o3
  120. ba,pt %xcc, __restore_fp
  121. add %o2, %o3, %o0
  122. ENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
  /* %o0 = %o2 + GLOBAL_SPARE + 0x10 */
  123. ENTRY(U3_retl_o2_plus_GS_plus_0x10)
  124. add GLOBAL_SPARE, 0x10, GLOBAL_SPARE
  125. retl
  126. add %o2, GLOBAL_SPARE, %o0
  127. ENDPROC(U3_retl_o2_plus_GS_plus_0x10)
  /* %o0 = %o2 + GLOBAL_SPARE + 0x08 */
  128. ENTRY(U3_retl_o2_plus_GS_plus_0x08)
  129. add GLOBAL_SPARE, 0x08, GLOBAL_SPARE
  130. retl
  131. add %o2, GLOBAL_SPARE, %o0
  132. ENDPROC(U3_retl_o2_plus_GS_plus_0x08)
  /* %o0 = (%o2 & 7) + GLOBAL_SPARE */
  133. ENTRY(U3_retl_o2_and_7_plus_GS)
  134. and %o2, 7, %o2
  135. retl
  136. add %o2, GLOBAL_SPARE, %o0
  137. ENDPROC(U3_retl_o2_and_7_plus_GS)
  /* %o0 = (%o2 & 7) + GLOBAL_SPARE + 8 */
  138. ENTRY(U3_retl_o2_and_7_plus_GS_plus_8)
  139. add GLOBAL_SPARE, 8, GLOBAL_SPARE
  140. and %o2, 7, %o2
  141. retl
  142. add %o2, GLOBAL_SPARE, %o0
  143. ENDPROC(U3_retl_o2_and_7_plus_GS_plus_8)
  144. #endif
  145. .align 64
  146. /* The cheetah's flexible spine, oversized liver, enlarged heart,
  147. * slender muscular body, and claws make it the swiftest hunter
  148. * in Africa and the fastest animal on land. Can reach speeds
  149. * of up to 2.4GB per second.
  150. */
  /* FUNC_NAME: copy len bytes from src to dst.
   *
   * In:  %o0 = dst, %o1 = src, %o2 = len
   * Out: %o0 = EX_RETVAL(%o4), where %o4 holds the dst value that
   *      was passed in (normal return paths at label 85 and at the
   *      end of loop 90).
   * Scratch: %o3, %o4, %o5, %g1, %g2, %g3, GLOBAL_SPARE, the
   *      condition codes and, on the large-copy path, %f0-%f30.
   *
   * Dispatch on len:
   *   len == 0              -> 85 (return)
   *   len < 16              -> 80
   *   16 <= len < (3 * 64)  -> 70
   *   len >= (3 * 64)       -> VIS block-copy path
   *
   * Every branch here has a delay slot: the instruction following a
   * branch belongs to it. With the ",a" (annul) bit the delay slot
   * of a conditional branch runs only when the branch is taken.
   */
  151. .globl FUNC_NAME
  152. .type FUNC_NAME,#function
  153. FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
  /* Software trap 5 if len has any bit set at or above bit 31. */
  154. srlx %o2, 31, %g2
  155. cmp %g2, 0
  156. tne %xcc, 5
  157. PREAMBLE
  /* Keep the original dst in %o4 for the return value. */
  158. mov %o0, %o4
  /* len == 0: nothing to do. Delay slot: %o3 = dst | src. */
  159. cmp %o2, 0
  160. be,pn %XCC, 85f
  161. or %o0, %o1, %o3
  /* len < 16: go to 80 with %o3 = dst | src | len (annulled slot). */
  162. cmp %o2, 16
  163. blu,a,pn %XCC, 80f
  164. or %o3, %o2, %o3
  /* len < 192: go to 70. Delay slot tests (dst | src) & 7 for the
   * bne at label 70.
   */
  165. cmp %o2, (3 * 64)
  166. blu,pt %XCC, 70f
  167. andcc %o3, 0x7, %g0
  168. /* Clobbers o5/g1/g2/g3/g7/icc/xcc. We must preserve
  169. * o5 from here until we hit VISExitHalf.
  170. */
  171. VISEntryHalf
  172. /* Is 'dst' already aligned on a 64-byte boundary? */
  173. andcc %o0, 0x3f, %g2
  174. be,pt %XCC, 2f
  175. /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number
  176. * of bytes to copy to make 'dst' 64-byte aligned. We pre-
  177. * subtract this from 'len'.
  178. * The first sub below is the delay slot of the branch above and
  * always runs: GLOBAL_SPARE = dst - src.
  */
  179. sub %o0, %o1, GLOBAL_SPARE
  180. sub %g2, 0x40, %g2
  181. sub %g0, %g2, %g2
  182. sub %o2, %g2, %o2
  /* %g1 = alignment bytes to copy singly (count & 7); the delay
   * slot leaves %g2 = the part copied 8 bytes at a time (count & 0x38).
   */
  183. andcc %g2, 0x7, %g1
  184. be,pt %icc, 2f
  185. and %g2, 0x38, %g2
  /* Byte loop: %o1 + GLOBAL_SPARE addresses the matching dst byte. */
  186. 1: subcc %g1, 0x1, %g1
  187. EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3), U3_retl_o2_plus_g2_plus_g1_plus_1)
  188. EX_ST_FP(STORE(stb, %o3, %o1 + GLOBAL_SPARE), U3_retl_o2_plus_g2_plus_g1_plus_1)
  189. bgu,pt %XCC, 1b
  190. add %o1, 0x1, %o1
  /* Recompute dst from the advanced src. */
  191. add %o1, GLOBAL_SPARE, %o0
  /* %g1 = src & 7, kept for the tail code. alignaddr (delay slot)
   * rounds %o1 down to an 8-byte boundary and, per the VIS
   * instruction set, records the byte offset that faligndata uses.
   * If %g2 == 0 there are no 8-byte alignment chunks: skip to 3.
   */
  192. 2: cmp %g2, 0x0
  193. and %o1, 0x7, %g1
  194. be,pt %icc, 3f
  195. alignaddr %o1, %g0, %o1
  /* Copy %g2 bytes, 8 at a time, alternating %f4/%f6 as the
   * leading double fed to faligndata.
   */
  196. EX_LD_FP(LOAD(ldd, %o1, %f4), U3_retl_o2_plus_g2)
  197. 1: EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6), U3_retl_o2_plus_g2)
  198. add %o1, 0x8, %o1
  199. subcc %g2, 0x8, %g2
  200. faligndata %f4, %f6, %f0
  201. EX_ST_FP(STORE(std, %f0, %o0), U3_retl_o2_plus_g2_plus_8)
  202. be,pn %icc, 3f
  203. add %o0, 0x8, %o0
  204. EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4), U3_retl_o2_plus_g2)
  205. add %o1, 0x8, %o1
  206. subcc %g2, 0x8, %g2
  207. faligndata %f6, %f4, %f2
  208. EX_ST_FP(STORE(std, %f2, %o0), U3_retl_o2_plus_g2_plus_8)
  209. bne,pt %icc, 1b
  210. add %o0, 0x8, %o0
  /* Prime the pipeline: issue prefetches, set GLOBAL_SPARE = len
   * rounded down to a multiple of 64, load the first 64 source
   * bytes plus the first double of the next block, and start
   * aligning into %f16-%f26.
   */
  211. 3: LOAD(prefetch, %o1 + 0x000, #one_read)
  212. LOAD(prefetch, %o1 + 0x040, #one_read)
  213. andn %o2, (0x40 - 1), GLOBAL_SPARE
  214. LOAD(prefetch, %o1 + 0x080, #one_read)
  215. LOAD(prefetch, %o1 + 0x0c0, #one_read)
  216. LOAD(prefetch, %o1 + 0x100, #one_read)
  217. EX_LD_FP(LOAD(ldd, %o1 + 0x000, %f0), U3_retl_o2)
  218. LOAD(prefetch, %o1 + 0x140, #one_read)
  219. EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2)
  220. LOAD(prefetch, %o1 + 0x180, #one_read)
  221. EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2)
  222. LOAD(prefetch, %o1 + 0x1c0, #one_read)
  223. faligndata %f0, %f2, %f16
  224. EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2)
  225. faligndata %f2, %f4, %f18
  226. EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2)
  227. faligndata %f4, %f6, %f20
  228. EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2)
  229. faligndata %f6, %f8, %f22
  230. EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2)
  231. faligndata %f8, %f10, %f24
  232. EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2)
  233. faligndata %f10, %f12, %f26
  234. EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2)
  /* Hold back 0x80 bytes for the code at label 2 below. Delay
   * slot: %o3 = remaining byte count >> 6, the iteration count of
   * the main loop. If nothing remains, skip the loop.
   */
  235. subcc GLOBAL_SPARE, 0x80, GLOBAL_SPARE
  236. add %o1, 0x40, %o1
  237. bgu,pt %XCC, 1f
  238. srl GLOBAL_SPARE, 6, %o3
  239. ba,pt %xcc, 2f
  240. nop
  241. .align 64
  /* Main loop: each pass finishes aligning the current block into
   * %f16-%f30, block-stores it to dst, and loads and begins
   * aligning the next 64 source bytes.
   */
  242. 1:
  243. EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80)
  244. faligndata %f12, %f14, %f28
  245. EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80)
  246. faligndata %f14, %f0, %f30
  247. EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
  248. EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40)
  249. faligndata %f0, %f2, %f16
  250. add %o0, 0x40, %o0
  251. EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40)
  252. faligndata %f2, %f4, %f18
  253. EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40)
  254. faligndata %f4, %f6, %f20
  255. EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40)
  256. subcc %o3, 0x01, %o3
  257. faligndata %f6, %f8, %f22
  258. EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x80)
  259. faligndata %f8, %f10, %f24
  260. EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
  261. LOAD(prefetch, %o1 + 0x1c0, #one_read)
  262. faligndata %f10, %f12, %f26
  263. bg,pt %XCC, 1b
  264. add %o1, 0x40, %o1
  265. /* Finally we copy the last full 64-byte block. */
  /* NOTE(review): the code from here to the membar contains two
   * STORE_BLK operations, matching the 0x80 held back from
   * GLOBAL_SPARE before the loop.
   */
  266. 2:
  267. EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80)
  268. faligndata %f12, %f14, %f28
  269. EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80)
  270. faligndata %f14, %f0, %f30
  271. EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
  272. EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40)
  273. faligndata %f0, %f2, %f16
  274. EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40)
  275. faligndata %f2, %f4, %f18
  276. EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40)
  277. faligndata %f4, %f6, %f20
  278. EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40)
  279. faligndata %f6, %f8, %f22
  280. EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x40)
  281. faligndata %f8, %f10, %f24
  /* The double at %o1 + 0x40 is loaded only when src was not
   * 8-byte aligned (%g1 != 0); when %g1 == 0 the load is skipped.
   */
  282. cmp %g1, 0
  283. be,pt %XCC, 1f
  284. add %o0, 0x40, %o0
  285. EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x40)
  286. 1: faligndata %f10, %f12, %f26
  287. faligndata %f12, %f14, %f28
  288. faligndata %f14, %f0, %f30
  289. EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x40)
  290. add %o0, 0x40, %o0
  291. add %o1, 0x40, %o1
  292. membar #Sync
  293. /* Now we copy the (len modulo 64) bytes at the end.
  294. * Note how we borrow the %f0 loaded above.
  295. *
  296. * Also notice how this code is careful not to perform a
  297. * load past the end of the src buffer.
  298. */
  /* %g2 = (len & 0x38) - 8 is the byte count handled by the 8-byte
   * loop below; the loop is skipped when (len & 0x38) is 0 or 8.
   * %o2 keeps what is left for the code at label 2.
   */
  299. and %o2, 0x3f, %o2
  300. andcc %o2, 0x38, %g2
  301. be,pn %XCC, 2f
  302. subcc %g2, 0x8, %g2
  303. be,pn %XCC, 2f
  304. cmp %g1, 0
  305. sub %o2, %g2, %o2
  /* Annulled delay slot: when %g1 == 0 (the load of %o1 + 0x40 was
   * skipped above) load %f0 from %o1 now; otherwise reuse the %f0
   * loaded earlier.
   */
  306. be,a,pt %XCC, 1f
  307. EX_LD_FP(LOAD(ldd, %o1 + 0x00, %f0), U3_retl_o2_plus_g2)
  308. 1: EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f2), U3_retl_o2_plus_g2)
  309. add %o1, 0x8, %o1
  310. subcc %g2, 0x8, %g2
  311. faligndata %f0, %f2, %f8
  312. EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8)
  313. be,pn %XCC, 2f
  314. add %o0, 0x8, %o0
  315. EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f0), U3_retl_o2_plus_g2)
  316. add %o1, 0x8, %o1
  317. subcc %g2, 0x8, %g2
  318. faligndata %f2, %f0, %f8
  319. EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8)
  320. bne,pn %XCC, 1b
  321. add %o0, 0x8, %o0
  322. /* If anything is left, we copy it one byte at a time.
  323. * Note that %g1 is (src & 0x7) saved above before the
  324. * alignaddr was performed.
  325. */
  /* Add %g1 back so %o1 is the true (unaligned) src again, leave
   * VIS mode, and return if %o2 == 0. Delay slot: %o3 = dst - src,
   * so stores below address dst as %o1 + %o3.
   */
  326. 2:
  327. cmp %o2, 0
  328. add %o1, %g1, %o1
  329. VISExitHalf
  330. be,pn %XCC, 85f
  331. sub %o0, %o1, %o3
  /* src not 8-byte aligned: byte loop at 90. Otherwise copy the
   * remainder as 8, 4, 2 and 1 byte pieces, one per set bit of %o2.
   */
  332. andcc %g1, 0x7, %g0
  333. bne,pn %icc, 90f
  334. andcc %o2, 0x8, %g0
  335. be,pt %icc, 1f
  336. nop
  337. EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2)
  338. EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2)
  339. add %o1, 0x8, %o1
  340. sub %o2, 8, %o2
  341. 1: andcc %o2, 0x4, %g0
  342. be,pt %icc, 1f
  343. nop
  344. EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2)
  345. EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2)
  346. add %o1, 0x4, %o1
  347. sub %o2, 4, %o2
  348. 1: andcc %o2, 0x2, %g0
  349. be,pt %icc, 1f
  350. nop
  351. EX_LD(LOAD(lduh, %o1, %o5), U3_retl_o2)
  352. EX_ST(STORE(sth, %o5, %o1 + %o3), U3_retl_o2)
  353. add %o1, 0x2, %o1
  354. sub %o2, 2, %o2
  355. 1: andcc %o2, 0x1, %g0
  356. be,pt %icc, 85f
  357. nop
  358. EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2)
  359. ba,pt %xcc, 85f
  360. EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2)
  361. .align 64
  362. 70: /* 16 <= len < (3 * 64) */
  /* Condition codes come from the andcc of (dst | src) & 7 in the
   * delay slot of the branch that jumped here. Not both 8-byte
   * aligned: go to 75. Delay slot: %o3 = dst - src.
   */
  363. bne,pn %XCC, 75f
  364. sub %o0, %o1, %o3
  /* src and dst both 8-byte aligned: GLOBAL_SPARE = len rounded
   * down to a multiple of 16, %o2 = len & 0xf; copy 16 bytes per
   * iteration.
   */
  365. 72:
  366. andn %o2, 0xf, GLOBAL_SPARE
  367. and %o2, 0xf, %o2
  368. 1: subcc GLOBAL_SPARE, 0x10, GLOBAL_SPARE
  369. EX_LD(LOAD(ldx, %o1 + 0x00, %o5), U3_retl_o2_plus_GS_plus_0x10)
  370. EX_LD(LOAD(ldx, %o1 + 0x08, %g1), U3_retl_o2_plus_GS_plus_0x10)
  371. EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x10)
  372. add %o1, 0x8, %o1
  373. EX_ST(STORE(stx, %g1, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x08)
  374. bgu,pt %XCC, 1b
  375. add %o1, 0x8, %o1
  /* Remainder below 16: one 8-byte piece, one 4-byte piece, then
   * anything still left goes through the byte loop at 90.
   */
  376. 73: andcc %o2, 0x8, %g0
  377. be,pt %XCC, 1f
  378. nop
  379. sub %o2, 0x8, %o2
  380. EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2_plus_8)
  381. EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_8)
  382. add %o1, 0x8, %o1
  383. 1: andcc %o2, 0x4, %g0
  384. be,pt %XCC, 1f
  385. nop
  386. sub %o2, 0x4, %o2
  387. EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2_plus_4)
  388. EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2_plus_4)
  389. add %o1, 0x4, %o1
  390. 1: cmp %o2, 0
  391. be,pt %XCC, 85f
  392. nop
  393. ba,pt %xcc, 90f
  394. nop
  /* Unaligned medium copy. If dst is not 8-byte aligned, copy
   * %g1 = 8 - (dst & 7) bytes singly so that it becomes aligned,
   * subtracting them from %o2 first.
   */
  395. 75:
  396. andcc %o0, 0x7, %g1
  397. sub %g1, 0x8, %g1
  398. be,pn %icc, 2f
  399. sub %g0, %g1, %g1
  400. sub %o2, %g1, %o2
  401. 1: subcc %g1, 1, %g1
  402. EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2_plus_g1_plus_1)
  403. EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2_plus_g1_plus_1)
  404. bgu,pt %icc, 1b
  405. add %o1, 1, %o1
  /* Recompute dst. If src is not 8-byte aligned go to 8 with
   * %g1 = (src & 7) * 8, the shift count in bits (delay slot).
   * Otherwise reuse the aligned code: 72 when at least 16 bytes
   * remain, else 73.
   */
  406. 2: add %o1, %o3, %o0
  407. andcc %o1, 0x7, %g1
  408. bne,pt %icc, 8f
  409. sll %g1, 3, %g1
  410. cmp %o2, 16
  411. bgeu,pt %icc, 72b
  412. nop
  413. ba,a,pt %xcc, 73b
  /* dst aligned, src not: read aligned 8-byte words from src and
   * merge neighbouring words with shifts. %g1 = left shift,
   * %o3 = 64 - %g1 = right shift, %g2 = previous word already
   * shifted left, GLOBAL_SPARE = len rounded down to a multiple of 8.
   */
  414. 8: mov 64, %o3
  415. andn %o1, 0x7, %o1
  416. EX_LD(LOAD(ldx, %o1, %g2), U3_retl_o2)
  417. sub %o3, %g1, %o3
  418. andn %o2, 0x7, GLOBAL_SPARE
  419. sllx %g2, %g1, %g2
  420. 1: EX_LD(LOAD(ldx, %o1 + 0x8, %g3), U3_retl_o2_and_7_plus_GS)
  421. subcc GLOBAL_SPARE, 0x8, GLOBAL_SPARE
  422. add %o1, 0x8, %o1
  423. srlx %g3, %o3, %o5
  424. or %o5, %g2, %o5
  425. EX_ST(STORE(stx, %o5, %o0), U3_retl_o2_and_7_plus_GS_plus_8)
  426. add %o0, 0x8, %o0
  427. bgu,pt %icc, 1b
  428. sllx %g3, %g1, %g2
  /* Turn %g1 back into a byte offset, restore the unaligned src,
   * and copy the remaining len & 7 bytes (if any) via 90 with
   * %o3 = dst - src.
   */
  429. srl %g1, 3, %g1
  430. andcc %o2, 0x7, %o2
  431. be,pn %icc, 85f
  432. add %o1, %g1, %o1
  433. ba,pt %xcc, 90f
  434. sub %o0, %o1, %o3
  435. .align 64
  436. 80: /* 0 < len < 16 */
  /* %o3 = dst | src | len on entry. If any of the low two bits is
   * set, use the byte loop at 90; otherwise copy 4 bytes per
   * iteration. Delay slot: %o3 = dst - src.
   */
  437. andcc %o3, 0x3, %g0
  438. bne,pn %XCC, 90f
  439. sub %o0, %o1, %o3
  440. 1:
  441. subcc %o2, 4, %o2
  442. EX_LD(LOAD(lduw, %o1, %g1), U3_retl_o2_plus_4)
  443. EX_ST(STORE(stw, %g1, %o1 + %o3), U3_retl_o2_plus_4)
  444. bgu,pt %XCC, 1b
  445. add %o1, 4, %o1
  /* Normal return: %o0 = EX_RETVAL(saved dst). */
  446. 85: retl
  447. mov EX_RETVAL(%o4), %o0
  448. .align 32
  /* Byte-at-a-time loop: copies %o2 bytes from %o1 to %o1 + %o3. */
  449. 90:
  450. subcc %o2, 1, %o2
  451. EX_LD(LOAD(ldub, %o1, %g1), U3_retl_o2_plus_1)
  452. EX_ST(STORE(stb, %g1, %o1 + %o3), U3_retl_o2_plus_1)
  453. bgu,pt %XCC, 90b
  454. add %o1, 1, %o1
  455. retl
  456. mov EX_RETVAL(%o4), %o0
  457. .size FUNC_NAME, .-FUNC_NAME