NG4memcpy.S 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533
/* NG4memcpy.S: Niagara-4 optimized memcpy.
 *
 * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
 */
  /* Build-environment glue.
   *
   * With __KERNEL__ defined, the ASI/VIS helpers come from the included
   * headers and GLOBAL_SPARE is %g7.  Otherwise the handful of
   * definitions this file needs are provided right here and
   * GLOBAL_SPARE is %g5.
   */
#ifdef __KERNEL__

#include <linux/linkage.h>
#include <asm/visasm.h>
#include <asm/asi.h>

#define GLOBAL_SPARE	%g7

#else /* !__KERNEL__ */

#define ASI_BLK_INIT_QUAD_LDD_P	0xe2
#define FPRS_FEF		0x04

/* On T4 it is very expensive to access ASRs like %fprs and
 * %asi, avoiding a read or a write can save ~50 cycles.
 *
 * FPU_ENTER leaves the value read from %fprs in %o5.  The write that
 * sets FPRS_FEF sits in the delay slot of an annulling branch that is
 * taken only when the FEF bit was clear, so no write is issued when
 * the bit is already set.
 */
#define FPU_ENTER				\
	rd	%fprs, %o5;			\
	andcc	%o5, FPRS_FEF, %g0;		\
	be,a,pn	%icc, 999f;			\
	 wr	%g0, FPRS_FEF, %fprs;		\
	999:

/* The debug flavour of VISEntryHalf additionally clears %g1, %g2, %g3
 * and %g5 and sets the condition codes from 0 - 0.
 */
#ifdef MEMCPY_DEBUG
#define VISEntryHalf	FPU_ENTER;		\
	clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
#else
#define VISEntryHalf	FPU_ENTER
#endif

/* Writes %o5 masked down to its FEF bit back into %fprs; %o5 is the
 * register FPU_ENTER read %fprs into.
 */
#define VISExitHalf	and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs

#define GLOBAL_SPARE	%g5

#endif /* __KERNEL__ */
  /* Overridable building blocks.
   *
   * Every macro below is defined only if it has not been supplied
   * already, so the copy body can be specialised by defining any of
   * them beforehand.
   */

/* ASI used by STORE_INIT. */
#ifndef STORE_ASI
#ifdef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	0x80		/* ASI_P */
#else
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#endif
#endif

/* Neither EX_LD nor EX_ST supplied: flag the build as NON_USER_COPY.
 * This test has to run before the pass-through defaults below.
 */
#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

/* Default access wrappers: emit the access (x) and ignore the fixup (y). */
#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

/* With MEMCPY_DEBUG the default store becomes the alternate-space
 * form of the instruction (type##a) going through %asi.
 */
#ifndef STORE
#ifdef MEMCPY_DEBUG
#define STORE(type,src,addr)	type##a src, [addr] %asi
#else
#define STORE(type,src,addr)	type src, [addr]
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NG4memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
  /* Fault fixup stubs.
   *
   * Emitted only when EX_RETVAL has not been supplied from outside, in
   * which case EX_RETVAL(x) also defaults to plain x.
   *
   * Each NG4_retl_* entry places in %o0 the sum spelled out by its
   * name (NG4_retl_o2_plus_o5_plus_8 yields %o2 + %o5 + 8) and leaves
   * through __restore_asi, or through __restore_asi_fp for the _fp
   * flavours.  Stubs that add a constant first fold it into the
   * secondary register (%o5, %g1 or %o4), so that register is modified.
   *
   * The copy code hands these symbols to EX_LD/EX_ST as the second
   * argument.  NOTE(review): presumably %o0 is then the number of bytes
   * left uncopied at the faulting access, and the _fp names (never
   * spelled out in this file) are derived by the EX_LD_FP/EX_ST_FP
   * overrides -- confirm against the files that define those macros.
   */
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x

/* Exit used by the _fp stubs: VISExitHalf, then fall into __restore_asi. */
__restore_asi_fp:
	VISExitHalf
/* Return to the caller; the delay slot writes ASI_AIUS into %asi. */
__restore_asi:
	retl
	 wr	%g0, ASI_AIUS, %asi

/* ---- %o2 plus a constant ------------------------------------------- */
ENTRY(NG4_retl_o2)
	ba,pt	%xcc, __restore_asi
	 mov	%o2, %o0
ENDPROC(NG4_retl_o2)
ENTRY(NG4_retl_o2_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 1, %o0
ENDPROC(NG4_retl_o2_plus_1)
ENTRY(NG4_retl_o2_plus_4)
	ba,pt	%xcc, __restore_asi
	 add	%o2, 4, %o0
ENDPROC(NG4_retl_o2_plus_4)

/* ---- %o2 + %o5 (+ constant) ---------------------------------------- */
ENTRY(NG4_retl_o2_plus_o5)
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(NG4_retl_o2_plus_o5)
ENTRY(NG4_retl_o2_plus_o5_plus_4)
	add	%o5, 4, %o5
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(NG4_retl_o2_plus_o5_plus_4)
ENTRY(NG4_retl_o2_plus_o5_plus_8)
	add	%o5, 8, %o5
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(NG4_retl_o2_plus_o5_plus_8)
ENTRY(NG4_retl_o2_plus_o5_plus_16)
	add	%o5, 16, %o5
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(NG4_retl_o2_plus_o5_plus_16)
ENTRY(NG4_retl_o2_plus_o5_plus_24)
	add	%o5, 24, %o5
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(NG4_retl_o2_plus_o5_plus_24)
ENTRY(NG4_retl_o2_plus_o5_plus_32)
	add	%o5, 32, %o5
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o5, %o0
ENDPROC(NG4_retl_o2_plus_o5_plus_32)

/* ---- %o2 + %g1 (+ constant) ---------------------------------------- */
ENTRY(NG4_retl_o2_plus_g1)
	ba,pt	%xcc, __restore_asi
	 add	%o2, %g1, %o0
ENDPROC(NG4_retl_o2_plus_g1)
ENTRY(NG4_retl_o2_plus_g1_plus_1)
	add	%g1, 1, %g1
	ba,pt	%xcc, __restore_asi
	 add	%o2, %g1, %o0
ENDPROC(NG4_retl_o2_plus_g1_plus_1)
ENTRY(NG4_retl_o2_plus_g1_plus_8)
	add	%g1, 8, %g1
	ba,pt	%xcc, __restore_asi
	 add	%o2, %g1, %o0
ENDPROC(NG4_retl_o2_plus_g1_plus_8)

/* ---- %o2 + %o4 (+ constant) ---------------------------------------- */
ENTRY(NG4_retl_o2_plus_o4)
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4)
ENTRY(NG4_retl_o2_plus_o4_plus_8)
	add	%o4, 8, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_8)
ENTRY(NG4_retl_o2_plus_o4_plus_16)
	add	%o4, 16, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_16)
ENTRY(NG4_retl_o2_plus_o4_plus_24)
	add	%o4, 24, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_24)
ENTRY(NG4_retl_o2_plus_o4_plus_32)
	add	%o4, 32, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_32)
ENTRY(NG4_retl_o2_plus_o4_plus_40)
	add	%o4, 40, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_40)
ENTRY(NG4_retl_o2_plus_o4_plus_48)
	add	%o4, 48, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_48)
ENTRY(NG4_retl_o2_plus_o4_plus_56)
	add	%o4, 56, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_56)
ENTRY(NG4_retl_o2_plus_o4_plus_64)
	add	%o4, 64, %o4
	ba,pt	%xcc, __restore_asi
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_64)

/* ---- %o2 + %o4 (+ constant), leaving through __restore_asi_fp ------ */
ENTRY(NG4_retl_o2_plus_o4_fp)
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_fp)
ENTRY(NG4_retl_o2_plus_o4_plus_8_fp)
	add	%o4, 8, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_8_fp)
ENTRY(NG4_retl_o2_plus_o4_plus_16_fp)
	add	%o4, 16, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_16_fp)
ENTRY(NG4_retl_o2_plus_o4_plus_24_fp)
	add	%o4, 24, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_24_fp)
ENTRY(NG4_retl_o2_plus_o4_plus_32_fp)
	add	%o4, 32, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_32_fp)
ENTRY(NG4_retl_o2_plus_o4_plus_40_fp)
	add	%o4, 40, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_40_fp)
ENTRY(NG4_retl_o2_plus_o4_plus_48_fp)
	add	%o4, 48, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_48_fp)
ENTRY(NG4_retl_o2_plus_o4_plus_56_fp)
	add	%o4, 56, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_56_fp)
ENTRY(NG4_retl_o2_plus_o4_plus_64_fp)
	add	%o4, 64, %o4
	ba,pt	%xcc, __restore_asi_fp
	 add	%o2, %o4, %o0
ENDPROC(NG4_retl_o2_plus_o4_plus_64_fp)
#endif /* !EX_RETVAL */
  /* FUNC_NAME -- entry point and length dispatch.
   *
   * In:	%o0 = dst, %o1 = src, %o2 = len
   * Out:	through .Lexit, %o0 = EX_RETVAL(%o3), where %o3 is the
   *	dst pointer saved here.
   *
   * Dispatch on len:
   *	0		-> .Lexit
   *	1 .. 3		-> .Ltiny
   *	4 .. 19		-> .Lsmall	(%g2 = dst | src)
   *	20 .. 127	-> .Lmedium	(%g2 = dst | src)
   *	128 and up	-> falls through to the code that follows
   *
   * The length comparisons use %icc; the trap check below is what
   * keeps len within 31 bits on the way in.
   */
	.align		64
	.globl		FUNC_NAME
	.type		FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
#ifdef MEMCPY_DEBUG
	wr		%g0, 0x80, %asi
#endif
	/* Raise software trap 5 when any of bits 63:31 of len is set. */
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%XCC, 5
	PREAMBLE
	mov		%o0, %o3		/* remember dst for .Lexit */
	brz,pn		%o2, .Lexit
	 cmp		%o2, 3
	ble,pn		%icc, .Ltiny
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall
	 or		%o0, %o1, %g2		/* alignment bits of both pointers */
	cmp		%o2, 128
	bl,pn		%icc, .Lmedium
	 nop
  /* .Llarge -- copies of 0x80 bytes or more.
   *
   * 1. Byte-copy until dst is 8-byte aligned.
   * 2. Issue prefetches for src + 0x40 .. src + 0x200.
   * 3. If src is not 8-byte aligned as well, go to
   *    .Llarge_src_unaligned (with %g1 = -dst from the delay slot).
   * 4. Otherwise copy 8 bytes at a time until dst is 64-byte aligned
   *    and fall into the aligned block loop that follows.
   *
   * %g1 counts the alignment bytes still to copy and has already been
   * subtracted from %o2, which is why the fixups are of the
   * o2_plus_g1 family.
   */
.Llarge:/* len >= 0x80 */
	/* First get dest 8 byte aligned. */
	sub		%g0, %o0, %g1
	and		%g1, 0x7, %g1		/* %g1 = (-dst) & 7 */
	brz,pt		%g1, .Llarge_prefetch
	 sub		%o2, %g1, %o2		/* runs either way; no-op for %g1 == 0 */

.Llarge_dst8_loop:
	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), NG4_retl_o2_plus_g1)
	add		%o1, 1, %o1
	subcc		%g1, 1, %g1
	add		%o0, 1, %o0
	bne,pt		%icc, .Llarge_dst8_loop
	 EX_ST(STORE(stb, %g2, %o0 - 0x01), NG4_retl_o2_plus_g1_plus_1)

.Llarge_prefetch:
	LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

	/* Check if we can use the straight fully aligned
	 * loop, or we require the alignaddr/faligndata variant.
	 */
	andcc		%o1, 0x7, %o5
	bne,pn		%icc, .Llarge_src_unaligned
	 sub		%g0, %o0, %g1

	/* Legitimize the use of initializing stores by getting dest
	 * to be 64-byte aligned.
	 */
	and		%g1, 0x3f, %g1		/* %g1 = (-dst) & 63 */
	brz,pt		%g1, .Llarge_aligned
	 sub		%o2, %g1, %o2

.Llarge_dst64_loop:
	EX_LD(LOAD(ldx, %o1 + 0x00, %g2), NG4_retl_o2_plus_g1)
	add		%o1, 8, %o1
	subcc		%g1, 8, %g1
	add		%o0, 8, %o0
	bne,pt		%icc, .Llarge_dst64_loop
	 EX_ST(STORE(stx, %g2, %o0 - 0x08), NG4_retl_o2_plus_g1_plus_8)
  /* .Llarge_aligned -- 64 bytes per iteration through STORE_INIT.
   *
   * %o4 = len rounded down to a multiple of 64 (bytes this loop still
   *       has to copy), %o2 = the remainder below 64.
   *
   * %o4 is decremented right after the second load, so the first two
   * loads use the plain o2_plus_o4 fixup and everything after the
   * subcc adds back the part of the current 64-byte block that has
   * not been stored yet (64, 56, ... 8).
   *
   * The loop branch carries a prefetch of src + 0x200 in its delay
   * slot.  After the loop a membar is issued, and any tail goes to
   * .Lsmall_unaligned (<= 19 bytes) or .Lmedium_noprefetch.
   *
   * .Lexit, the function's common return, sits at the end of this
   * block: it returns EX_RETVAL(%o3) in %o0.
   */
.Llarge_aligned:
	/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
	andn		%o2, 0x3f, %o4
	sub		%o2, %o4, %o2

.Llarge_aligned_loop:
	EX_LD(LOAD(ldx, %o1 + 0x00, %g1), NG4_retl_o2_plus_o4)
	add		%o1, 0x40, %o1
	EX_LD(LOAD(ldx, %o1 - 0x38, %g2), NG4_retl_o2_plus_o4)
	subcc		%o4, 0x40, %o4
	EX_LD(LOAD(ldx, %o1 - 0x30, %g3), NG4_retl_o2_plus_o4_plus_64)
	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE), NG4_retl_o2_plus_o4_plus_64)
	EX_LD(LOAD(ldx, %o1 - 0x20, %o5), NG4_retl_o2_plus_o4_plus_64)
	EX_ST(STORE_INIT(%g1, %o0), NG4_retl_o2_plus_o4_plus_64)
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0), NG4_retl_o2_plus_o4_plus_56)
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x18, %g2), NG4_retl_o2_plus_o4_plus_48)
	EX_ST(STORE_INIT(%g3, %o0), NG4_retl_o2_plus_o4_plus_48)
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x10, %g3), NG4_retl_o2_plus_o4_plus_40)
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), NG4_retl_o2_plus_o4_plus_40)
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE), NG4_retl_o2_plus_o4_plus_32)
	EX_ST(STORE_INIT(%o5, %o0), NG4_retl_o2_plus_o4_plus_32)
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0), NG4_retl_o2_plus_o4_plus_24)
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g3, %o0), NG4_retl_o2_plus_o4_plus_16)
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), NG4_retl_o2_plus_o4_plus_8)
	add		%o0, 0x08, %o0
	bne,pt		%icc, .Llarge_aligned_loop
	 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

	membar		#StoreLoad | #StoreStore

	/* Fewer than 64 bytes remain in %o2. */
	brz,pn		%o2, .Lexit
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall_unaligned
	 nop
	ba,a,pt		%icc, .Lmedium_noprefetch

.Lexit:	retl
	 mov		EX_RETVAL(%o3), %o0
  /* .Llarge_src_unaligned -- 64 bytes per iteration using
   * alignaddr/faligndata, for a source that is not 8-byte aligned.
   *
   * %o4 = len rounded down to a multiple of 64, %o2 = remainder.
   * %g1 = the address produced by alignaddr from src, advanced by 0x40
   *       per iteration; %o1 is moved past the whole block up front.
   *
   * Each iteration loads the eight doublewords following the one
   * already held in %f0, merges neighbouring pairs with faligndata
   * into %f16..%f30 and stores those.  The load of [%g1 + 0x40]
   * refills %f0 for the next round.
   *
   * The FP unit is entered with VISEntryHalfFast in NON_USER_COPY
   * builds and with VISEntryHalf otherwise.
   * NOTE(review): VISEntryHalfFast/VISExitHalfFast are not defined in
   * this file; presumably the label argument is where control goes
   * when the FPU cannot be used -- confirm in the header providing them.
   */
.Llarge_src_unaligned:
#ifdef NON_USER_COPY
	VISEntryHalfFast(.Lmedium_vis_entry_fail)
#else
	VISEntryHalf
#endif
	andn		%o2, 0x3f, %o4
	sub		%o2, %o4, %o2
	alignaddr	%o1, %g0, %g1
	add		%o1, %o4, %o1
	EX_LD_FP(LOAD(ldd, %g1 + 0x00, %f0), NG4_retl_o2_plus_o4)

.Llarge_src_unaligned_loop:
	EX_LD_FP(LOAD(ldd, %g1 + 0x08, %f2), NG4_retl_o2_plus_o4)
	subcc		%o4, 0x40, %o4
	EX_LD_FP(LOAD(ldd, %g1 + 0x10, %f4), NG4_retl_o2_plus_o4_plus_64)
	EX_LD_FP(LOAD(ldd, %g1 + 0x18, %f6), NG4_retl_o2_plus_o4_plus_64)
	EX_LD_FP(LOAD(ldd, %g1 + 0x20, %f8), NG4_retl_o2_plus_o4_plus_64)
	EX_LD_FP(LOAD(ldd, %g1 + 0x28, %f10), NG4_retl_o2_plus_o4_plus_64)
	EX_LD_FP(LOAD(ldd, %g1 + 0x30, %f12), NG4_retl_o2_plus_o4_plus_64)
	EX_LD_FP(LOAD(ldd, %g1 + 0x38, %f14), NG4_retl_o2_plus_o4_plus_64)
	faligndata	%f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %g1 + 0x40, %f0), NG4_retl_o2_plus_o4_plus_64)
	faligndata	%f2, %f4, %f18
	add		%g1, 0x40, %g1
	faligndata	%f4, %f6, %f20
	faligndata	%f6, %f8, %f22
	faligndata	%f8, %f10, %f24
	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE(std, %f16, %o0 + 0x00), NG4_retl_o2_plus_o4_plus_64)
	EX_ST_FP(STORE(std, %f18, %o0 + 0x08), NG4_retl_o2_plus_o4_plus_56)
	EX_ST_FP(STORE(std, %f20, %o0 + 0x10), NG4_retl_o2_plus_o4_plus_48)
	EX_ST_FP(STORE(std, %f22, %o0 + 0x18), NG4_retl_o2_plus_o4_plus_40)
	EX_ST_FP(STORE(std, %f24, %o0 + 0x20), NG4_retl_o2_plus_o4_plus_32)
	EX_ST_FP(STORE(std, %f26, %o0 + 0x28), NG4_retl_o2_plus_o4_plus_24)
	EX_ST_FP(STORE(std, %f28, %o0 + 0x30), NG4_retl_o2_plus_o4_plus_16)
	EX_ST_FP(STORE(std, %f30, %o0 + 0x38), NG4_retl_o2_plus_o4_plus_8)
	add		%o0, 0x40, %o0
	bne,pt		%icc, .Llarge_src_unaligned_loop
	 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)

#ifdef NON_USER_COPY
	VISExitHalfFast
#else
	VISExitHalf
#endif
	/* Fewer than 64 bytes remain in %o2. */
	brz,pn		%o2, .Lexit
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall_unaligned
	 nop
	ba,a,pt		%icc, .Lmedium_unaligned
  /* .Lmedium -- integer-register copy for mutually 8-byte aligned
   * buffers.
   *
   * Entry points:
   *   .Lmedium_vis_entry_fail (NON_USER_COPY builds only): label handed
   *	to VISEntryHalfFast by the large unaligned path.  %g2 has been
   *	used as scratch by the large path's byte loop, so dst | src is
   *	recomputed before falling into .Lmedium.
   *   .Lmedium: expects %g2 = dst | src; prefetches src + 0x40 and
   *	leaves for .Lmedium_unaligned unless both are 8-byte aligned.
   *   .Lmedium_noprefetch: same copy without the prefetch/alignment test.
   *
   * Stages: 32 bytes per iteration, then 8 bytes per iteration for the
   * 0x18 bits of the remainder, then one optional 4-byte word, and
   * finally .Ltiny for the last 1..3 bytes.
   *
   * In both loops %o5 counts the bytes the loop still has to copy and
   * has already been subtracted from %o2.  %o5 is decremented before
   * the stores, so every store fixup adds back exactly the bytes of
   * the current block that have not been stored yet.
   */
#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail:
	 or		%o0, %o1, %g2
#endif
.Lmedium:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc		%g2, 0x7, %g0
	bne,pn		%icc, .Lmedium_unaligned
	 nop
.Lmedium_noprefetch:
	andncc		%o2, 0x20 - 1, %o5	/* %o5 = len & ~31 */
	be,pn		%icc, .Lmedium_tail8
	 sub		%o2, %o5, %o2

.Lmedium_loop32:
	EX_LD(LOAD(ldx, %o1 + 0x00, %g1), NG4_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2), NG4_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE), NG4_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4), NG4_retl_o2_plus_o5)
	add		%o1, 0x20, %o1
	subcc		%o5, 0x20, %o5
	/* Unstored bytes of this block per store: 32, 24, 16, 8. */
	EX_ST(STORE(stx, %g1, %o0 + 0x00), NG4_retl_o2_plus_o5_plus_32)
	EX_ST(STORE(stx, %g2, %o0 + 0x08), NG4_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10), NG4_retl_o2_plus_o5_plus_16)
	EX_ST(STORE(stx, %o4, %o0 + 0x18), NG4_retl_o2_plus_o5_plus_8)
	bne,pt		%icc, .Lmedium_loop32
	 add		%o0, 0x20, %o0

.Lmedium_tail8:
	andcc		%o2, 0x18, %o5		/* whole doublewords left */
	be,pt		%icc, .Lmedium_tail4
	 sub		%o2, %o5, %o2

.Lmedium_loop8:
	EX_LD(LOAD(ldx, %o1 + 0x00, %g1), NG4_retl_o2_plus_o5)
	add		%o1, 0x08, %o1
	add		%o0, 0x08, %o0
	subcc		%o5, 0x08, %o5
	bne,pt		%icc, .Lmedium_loop8
	 EX_ST(STORE(stx, %g1, %o0 - 0x08), NG4_retl_o2_plus_o5_plus_8)

.Lmedium_tail4:
	brz,pt		%o2, .Lexit
	 cmp		%o2, 0x04
	bl,pn		%icc, .Ltiny
	 nop
	/* 4..7 bytes left: move one word, then let .Ltiny finish. */
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1), NG4_retl_o2)
	add		%o1, 0x04, %o1
	add		%o0, 0x04, %o0
	subcc		%o2, 0x04, %o2
	bne,pn		%icc, .Ltiny
	 EX_ST(STORE(stw, %g1, %o0 - 0x04), NG4_retl_o2_plus_4)
	ba,a,pt		%icc, .Lexit
  /* .Lmedium_unaligned -- shift-and-merge copy for buffers that are not
   * mutually 8-byte aligned.
   *
   * 1. Byte-copy until dst is 8-byte aligned (%g1 = bytes to go,
   *    already subtracted from %o2).
   * 2. If src turned out 8-byte aligned too, use .Lmedium_noprefetch.
   * 3. Otherwise round src down to a doubleword and build every output
   *    doubleword from two neighbouring source doublewords:
   *	%g1 = (src & 7) * 8	left-shift count
   *	%g2 = 64 - %g1		right-shift count
   *	%o4 = left-shifted remainder of the previous source doubleword
   *	%o5 = bytes the loop still has to copy (multiple of 8)
   * 4. Restore the byte offset into src and finish any tail in
   *    .Lsmall_unaligned.
   */
.Lmedium_unaligned:
	/* First get dest 8 byte aligned. */
	sub		%g0, %o0, %g1
	and		%g1, 0x7, %g1
	brz,pt		%g1, .Lmedium_unaligned_src
	 sub		%o2, %g1, %o2

.Lmedium_unaligned_dst8_loop:
	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), NG4_retl_o2_plus_g1)
	add		%o1, 1, %o1
	subcc		%g1, 1, %g1
	add		%o0, 1, %o0
	bne,pt		%icc, .Lmedium_unaligned_dst8_loop
	 EX_ST(STORE(stb, %g2, %o0 - 0x01), NG4_retl_o2_plus_g1_plus_1)

.Lmedium_unaligned_src:
	and		%o1, 0x7, %g1
	brz,pn		%g1, .Lmedium_noprefetch
	 sll		%g1, 3, %g1		/* byte offset -> bit count */
	mov		64, %g2
	sub		%g2, %g1, %g2
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4), NG4_retl_o2)
	sllx		%o4, %g1, %o4
	andn		%o2, 0x08 - 1, %o5
	sub		%o2, %o5, %o2

.Lmedium_unaligned_shift_loop:
	EX_LD(LOAD(ldx, %o1 + 0x08, %g3), NG4_retl_o2_plus_o5)
	add		%o1, 0x08, %o1
	subcc		%o5, 0x08, %o5
	srlx		%g3, %g2, GLOBAL_SPARE
	or		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00), NG4_retl_o2_plus_o5_plus_8)
	add		%o0, 0x08, %o0
	bne,pt		%icc, .Lmedium_unaligned_shift_loop
	 sllx		%g3, %g1, %o4		/* carry for the next round */

	srl		%g1, 3, %g1		/* bit count -> byte offset */
	add		%o1, %g1, %o1
	brz,pn		%o2, .Lexit
	 nop
	/* Annulled: nothing belongs in this branch's delay slot, and a
	 * plain "ba" would execute whatever instruction follows in
	 * memory (the first load of the next routine) before jumping.
	 */
	ba,a,pt		%icc, .Lsmall_unaligned
  /* .Ltiny -- copy the last 1..3 bytes, fully unrolled.
   *
   * %o2 is decremented after each load; the store that follows sits in
   * the delay slot of the exit branch, so it executes whether or not
   * the branch to .Lexit is taken.  %o0 and %o1 are not advanced.
   */
.Ltiny:
	/* byte 0 */
	EX_LD(LOAD(ldub, %o1 + 0x00, %g1), NG4_retl_o2)
	subcc		%o2, 1, %o2
	be,pn		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x00), NG4_retl_o2_plus_1)

	/* byte 1 */
	EX_LD(LOAD(ldub, %o1 + 0x01, %g1), NG4_retl_o2)
	subcc		%o2, 1, %o2
	be,pn		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x01), NG4_retl_o2_plus_1)

	/* byte 2: %o2 is not decremented any further */
	EX_LD(LOAD(ldub, %o1 + 0x02, %g1), NG4_retl_o2)
	ba,pt		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x02), NG4_retl_o2)
  /* .Lsmall -- short copy, one 32-bit word per iteration.
   *
   * Expects %g2 = dst | src.  If either pointer is not 4-byte aligned
   * the copy is handed to .Lsmall_unaligned; the andn in that branch's
   * delay slot executes on both paths.
   *
   * %o5 = len & ~3 counts the bytes the word loop still has to copy
   * and is subtracted from %o2; the 0..3 leftover bytes go to .Ltiny.
   */
.Lsmall:
	andcc		%g2, 0x3, %g0
	bne,pn		%icc, .Lsmall_unaligned
	 andn		%o2, 0x4 - 1, %o5
	sub		%o2, %o5, %o2

.Lsmall_word_loop:
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1), NG4_retl_o2_plus_o5)
	add		%o1, 0x04, %o1
	subcc		%o5, 0x04, %o5
	add		%o0, 0x04, %o0
	bne,pt		%icc, .Lsmall_word_loop
	 EX_ST(STORE(stw, %g1, %o0 - 0x04), NG4_retl_o2_plus_o5_plus_4)

	brz,pt		%o2, .Lexit
	 nop
	ba,a,pt		%icc, .Ltiny
  /* .Lsmall_unaligned -- plain byte-at-a-time copy of %o2 bytes.
   *
   * The loop tests %o2 only after the first byte has been moved, so it
   * must be entered with %o2 != 0.  The store sits in the loop
   * branch's delay slot and addresses the byte just stepped over.
   */
.Lsmall_unaligned:
	EX_LD(LOAD(ldub, %o1 + 0x00, %g1), NG4_retl_o2)
	add		%o1, 1, %o1
	add		%o0, 1, %o0
	subcc		%o2, 1, %o2
	bne,pt		%icc, .Lsmall_unaligned
	 EX_ST(STORE(stb, %g1, %o0 - 0x01), NG4_retl_o2_plus_1)
	ba,a,pt		%icc, .Lexit

	.size		FUNC_NAME, .-FUNC_NAME