memcpy.S 16 KB


  1. /*
  2. * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
  3. *
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License version 2 and
  7. * only version 2 as published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write to the Free Software
  16. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17. * 02110-1301, USA.
  18. */
  19. /*
  20. * Description
  21. *
  22. * library function for memcpy where length bytes are copied from
  23. * ptr_in to ptr_out. ptr_out is returned unchanged.
  24. * Allows any combination of alignment on input and output pointers
  25. * and length from 0 to 2^32-1
  26. *
  27. * Restrictions
  28. * The arrays should not overlap, the program will produce undefined output
  29. * if they do.
  30. * When both pointers and the length are multiples of 8 bytes and the
  31. * length is below 96 bytes, a dword copy is performed. Other blocks of
  32. * fewer than 24 bytes are copied byte by byte.
  33. * History
  34. *
  35. * DJH 5/15/09 Initial version 1.0
  36. * DJH 6/ 1/09 Version 1.1 modified ABI to include R16-R19
  37. * DJH 7/12/09 Version 1.2 optimized codesize down to 760 was 840
  38. * DJH 10/14/09 Version 1.3 added special loop for aligned case, was
  39. * overreading bloated codesize back up to 892
  40. * DJH 4/20/10 Version 1.4 fixed Ldword_loop_epilog loop to prevent loads
  41. * occurring if only 1 left outstanding, fixes bug
  42. * # 3888, corrected for all alignments. Peeled off
  43. * 1 32byte chunk from kernel loop and extended 8byte
  44. * loop at end to solve all combinations and prevent
  45. * over read. Fixed Ldword_loop_prolog to prevent
  46. * overread for blocks less than 48bytes. Reduced
  47. * codesize to 752 bytes
  48. * DJH 4/21/10 version 1.5 1.4 fix broke code for input block ends not
  49. * aligned to dword boundaries,underwriting by 1
  50. * byte, added detection for this and fixed. A
  51. * little bloat.
  52. * DJH 4/23/10 version 1.6 corrected stack error, R20 was not being restored
  53. * always, fixed the error of R20 being modified
  54. * before it was being saved
  55. * Natural c model
  56. * ===============
  57. * void * memcpy(char * ptr_out, char * ptr_in, int length) {
  58. * int i;
  59. * if(length) for(i=0; i < length; i++) { ptr_out[i] = ptr_in[i]; }
  60. * return(ptr_out);
  61. * }
  62. *
  63. * Optimized memcpy function
  64. * =========================
  65. * void * memcpy(char * ptr_out, char * ptr_in, int len) {
  66. * int i, prolog, kernel, epilog, mask;
  67. * u8 offset;
  68. * s64 data0, dataF8, data70;
  69. *
  70. * s64 * ptr8_in;
  71. * s64 * ptr8_out;
  72. * s32 * ptr4;
  73. * s16 * ptr2;
  74. *
  75. * offset = ((int) ptr_in) & 7;
  76. * ptr8_in = (s64 *) &ptr_in[-offset]; //read in the aligned pointers
  77. *
  78. * data70 = *ptr8_in++;
  79. * dataF8 = *ptr8_in++;
  80. *
  81. * data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
  82. *
  83. * prolog = 32 - ((int) ptr_out);
  84. * mask = 0x7fffffff >> HEXAGON_R_cl0_R(len);
  85. * prolog = prolog & mask;
  86. * kernel = len - prolog;
  87. * epilog = kernel & 0x1F;
  88. * kernel = kernel>>5;
  89. *
  90. * if (prolog & 1) { ptr_out[0] = (u8) data0; data0 >>= 8; ptr_out += 1;}
  91. * ptr2 = (s16 *) &ptr_out[0];
  92. * if (prolog & 2) { ptr2[0] = (u16) data0; data0 >>= 16; ptr_out += 2;}
  93. * ptr4 = (s32 *) &ptr_out[0];
  94. * if (prolog & 4) { ptr4[0] = (u32) data0; data0 >>= 32; ptr_out += 4;}
  95. *
  96. * offset = offset + (prolog & 7);
  97. * if (offset >= 8) {
  98. * data70 = dataF8;
  99. * dataF8 = *ptr8_in++;
  100. * }
  101. * offset = offset & 0x7;
  102. *
  103. * prolog = prolog >> 3;
  104. * if (prolog) for (i=0; i < prolog; i++) {
  105. * data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
  106. * ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
  107. * data70 = dataF8;
  108. * dataF8 = *ptr8_in++;
  109. * }
  110. * if(kernel) { kernel -= 1; epilog += 32; }
  111. * if(kernel) for(i=0; i < kernel; i++) {
  112. * data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
  113. * ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
  114. * data70 = *ptr8_in++;
  115. *
  116. * data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
  117. * ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
  118. * dataF8 = *ptr8_in++;
  119. *
  120. * data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
  121. * ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
  122. * data70 = *ptr8_in++;
  123. *
  124. * data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
  125. * ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
  126. * dataF8 = *ptr8_in++;
  127. * }
  128. * epilogdws = epilog >> 3;
  129. * if (epilogdws) for (i=0; i < epilogdws; i++) {
  130. * data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
  131. * ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
  132. * data70 = dataF8;
  133. * dataF8 = *ptr8_in++;
  134. * }
  135. * data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
  136. *
  137. * ptr4 = (s32 *) &ptr_out[0];
  138. * if (epilog & 4) { ptr4[0] = (u32) data0; data0 >>= 32; ptr_out += 4;}
  139. * ptr2 = (s16 *) &ptr_out[0];
  140. * if (epilog & 2) { ptr2[0] = (u16) data0; data0 >>= 16; ptr_out += 2;}
  141. * if (epilog & 1) { *ptr_out++ = (u8) data0; }
  142. *
  143. * return(ptr_out - len);
  144. * }
  145. *
  146. * Codesize : 784 bytes
  147. */
  /*
   * Symbolic register names used by memcpy. Several names map to the same
   * physical register (or to halves of the same pair); they are used in
   * different phases of the routine.
   */
  148. #define ptr_out R0 /* destination pointer */
  149. #define ptr_in R1 /* source pointer */
  150. #define len R2 /* length of copy in bytes */
  151. #define data70 R13:12 /* lo 8 bytes of non-aligned transfer */
  152. #define dataF8 R11:10 /* hi 8 bytes of non-aligned transfer */
  153. #define ldata0 R7:6 /* even 8 bytes chunks */
  154. #define ldata1 R25:24 /* odd 8 bytes chunks */
  155. #define data1 R7 /* R7 is one 32-bit half of the ldata0 pair (R7:6), not of ldata1; the name is not referenced by the visible code */
  156. #define data0 R6 /* R6, the 32-bit half of ldata0 used as source of the byte/halfword/word stores */
  157. #define ifbyte p0 /* if transfer has bytes in epilog/prolog */
  158. #define ifhword p0 /* if transfer has shorts in epilog/prolog */
  159. #define ifword p0 /* if transfer has words in epilog/prolog */
  160. #define noprolog p0 /* no prolog, xfer starts at 32byte */
  161. #define nokernel p1 /* no 32byte multiple block in the transfer */
  162. #define noepilog p0 /* no epilog, xfer ends on 32byte boundary */
  163. #define align p2 /* alignment of input rel to 8byte boundary */
  164. #define kernel1 p0 /* set by cmp.gtu(kernel, #1): more than one kernel chunk remains */
  165. #define dalign R25 /* rel alignment of input to output data (shares R25 with ldata1) */
  166. #define star3 R16 /* prolog byte count modulo 8 (bytes not covered by whole dwords) */
  167. #define rest R8 /* length - prolog bytes */
  168. #define back R7 /* nr bytes > dword boundary in src block */
  169. #define epilog R3 /* bytes in epilog */
  170. #define inc R15:14 /* packed increment: -1 for kernel, +32 for the dcfetch pointer */
  171. #define kernel R4 /* number of 32byte chunks in kernel */
  172. #define ptr_in_p_128 R5 /* pointer for prefetch of input data */
  173. #define mask R8 /* mask used to determine prolog size */
  174. #define shift R8 /* used to work a shifter to extract bytes */
  175. #define shift2 R5 /* in epilog to work shifter to extract bytes */
  176. #define prolog R15 /* bytes in prolog */
  177. #define epilogdws R15 /* number dwords in epilog */
  178. #define shiftb R14 /* used to extract bytes */
  179. #define offset R9 /* same as align in reg */
  180. #define ptr_out_p_32 R17 /* pointer to output dczero */
  181. #define align888 R14 /* if simple dword loop can be used */
  182. #define len8 R9 /* number of dwords in length */
  183. #define over R20 /* nr of bytes > last inp buf dword boundary */
  184. #define ptr_in_p_128kernel R5:4 /* packed fetch pointer & kernel cnt */
  /*
   * memcpy entry: returns early for len == 0 or ptr_in == ptr_out, dispatches
   * to .Ldwordaligned or .Lbytes23orless, and otherwise saves registers and
   * computes the prolog / kernel / epilog split for the general copy.
   */
  185. .section .text
  186. .p2align 4
  187. .global memcpy
  188. .type memcpy, @function
  189. memcpy:
  190. {
  191. p2 = cmp.eq(len, #0); /* =0: nothing to copy */
  192. align888 = or(ptr_in, ptr_out); /* %8 < 97: start merging alignment bits */
  193. p0 = cmp.gtu(len, #23); /* %1, <24: false selects the byte-copy path */
  194. p1 = cmp.eq(ptr_in, ptr_out); /* attempt to overwrite self */
  195. }
  196. {
  197. p1 = or(p2, p1); /* early-return condition: len == 0 or src == dst */
  198. p3 = cmp.gtu(len, #95); /* %8 < 97: too long for the simple dword path */
  199. align888 = or(align888, len); /* %8 < 97: src | dst | len */
  200. len8 = lsr(len, #3); /* %8 < 97: len / 8 */
  201. }
  202. {
  203. dcfetch(ptr_in); /* zero/ptrin=ptrout causes fetch */
  204. p2 = bitsclr(align888, #7); /* %8 < 97: low 3 bits of src, dst and len all clear */
  205. if(p1) jumpr r31; /* =0: return with R0 (ptr_out) untouched */
  206. }
  207. {
  208. p2 = and(p2,!p3); /* %8 < 97: all multiples of 8 and len <= 95 */
  209. if (p2.new) len = add(len, #-8); /* %8 < 97: len reduced by 8 before entering the dword path */
  210. if (p2.new) jump:NT .Ldwordaligned; /* %8 < 97 */
  211. }
  212. {
  213. if(!p0) jump .Lbytes23orless; /* %1, <24 */
  214. mask.l = #LO(0x7fffffff); /* low half of the prolog mask constant */
  215. /* all bytes before line multiples of data */
  216. prolog = sub(#0, ptr_out); /* 0 - ptr_out; masked below to get the prolog length */
  217. }
  218. {
  219. /* save r31 on stack; #24 covers the three 8-byte save slots at sp+0, sp+8 and sp+16 */
  220. allocframe(#24);
  221. mask.h = #HI(0x7fffffff); /* high half of the prolog mask constant */
  222. ptr_in_p_128 = add(ptr_in, #32); /* start of the prefetch pointer */
  223. back = cl0(len); /* leading-zero count of len, used as mask shift */
  224. }
  225. {
  226. memd(sp+#0) = R17:16; /* save r16,r17 on stack */
  227. r31.l = #LO(.Lmemcpy_return); /* set up final return pointer */
  228. prolog &= lsr(mask, back); /* prolog = (-ptr_out) & (0x7fffffff >> cl0(len)) */
  229. offset = and(ptr_in, #7); /* source offset within its dword */
  230. }
  231. {
  232. memd(sp+#8) = R25:24; /* save r25,r24 on stack */
  233. dalign = sub(ptr_out, ptr_in); /* dst - src */
  234. r31.h = #HI(.Lmemcpy_return); /* set up final return pointer */
  235. }
  236. {
  237. /* see if the end of the input buffer is dword aligned */
  238. over = add(len, ptr_in); /* end address of the source; masked to its low 3 bits below */
  239. back = add(len, offset); /* len + source offset */
  240. memd(sp+#16) = R21:20; /* save r20,r21 on stack */
  241. }
  242. {
  243. noprolog = bitsclr(prolog, #7); /* true when prolog has no bytes below a dword */
  244. prolog = and(prolog, #31); /* prolog limited to 0..31 bytes */
  245. dcfetch(ptr_in_p_128);
  246. ptr_in_p_128 = add(ptr_in_p_128, #32);
  247. }
  248. {
  249. kernel = sub(len, prolog); /* bytes remaining after the prolog */
  250. shift = asl(prolog, #3); /* prolog * 8; bits 3..5 mirror prolog bits 0..2 */
  251. star3 = and(prolog, #7); /* prolog bytes not covered by whole dwords */
  252. ptr_in = and(ptr_in, #-8); /* round source pointer down to a dword boundary */
  253. }
  254. {
  255. prolog = lsr(prolog, #3); /* prolog length in dwords */
  256. epilog = and(kernel, #31); /* bytes left after whole 32-byte chunks */
  257. ptr_out_p_32 = add(ptr_out, prolog); /* NOTE(review): presumably reads prolog in bytes (pre-shift value, same packet) - confirm */
  258. over = and(over, #7); /* source end address modulo 8 */
  259. }
  260. {
  261. p3 = cmp.gtu(back, #8); /* source data extends past the first aligned dword */
  262. kernel = lsr(kernel, #5); /* number of 32-byte chunks */
  263. dcfetch(ptr_in_p_128);
  264. ptr_in_p_128 = add(ptr_in_p_128, #32);
  265. }
  266. {
  267. p1 = cmp.eq(prolog, #0);
  268. if(!p1.new) prolog = add(prolog, #1); /* non-zero count gets +1; presumably because the prolog loop skips its first store - confirm */
  269. dcfetch(ptr_in_p_128); /* reserve the line 64bytes on */
  270. ptr_in_p_128 = add(ptr_in_p_128, #32);
  271. }
  272. {
  273. nokernel = cmp.eq(kernel,#0);
  274. dcfetch(ptr_in_p_128); /* reserve the line 64bytes on */
  275. ptr_in_p_128 = add(ptr_in_p_128, #32);
  276. shiftb = and(shift, #8); /* 8 when prolog bit 0 is set, else 0 */
  277. }
  278. {
  279. dcfetch(ptr_in_p_128); /* reserve the line 64bytes on */
  280. ptr_in_p_128 = add(ptr_in_p_128, #32);
  281. if(nokernel) jump .Lskip64; /* no 32-byte chunks: skip the dczeroa setup */
  282. p2 = cmp.eq(kernel, #1); /* skip ovr if kernel == 0 */
  283. }
  284. {
  285. dczeroa(ptr_out_p_32);
  286. /* don't advance pointer */
  287. if(!p2) ptr_out_p_32 = add(ptr_out_p_32, #32);
  288. }
  289. {
  290. dalign = and(dalign, #31); /* (dst - src) modulo 32 */
  291. dczeroa(ptr_out_p_32);
  292. }
  293. .Lskip64: /* load the first source dwords, then store the prolog bytes that do not fill a whole dword */
  294. {
  295. data70 = memd(ptr_in++#16); /* first aligned source dword; ptr_in advances by 16 */
  296. if(p3) dataF8 = memd(ptr_in+#8); /* second dword, loaded only when p3 (back > 8) is set */
  297. if(noprolog) jump .Lnoprolog32; /* no sub-dword prolog bytes to store */
  298. align = offset; /* predicate copy of the source offset, used as the valignb byte count */
  299. }
  300. /* up to initial 7 bytes */
  301. {
  302. ldata0 = valignb(dataF8, data70, align); /* extract 8 source bytes starting at the source offset */
  303. ifbyte = tstbit(shift,#3); /* shift = prolog * 8, so this tests prolog bit 0 */
  304. offset = add(offset, star3); /* source offset after the sub-dword prolog bytes */
  305. }
  306. {
  307. if(ifbyte) memb(ptr_out++#1) = data0; /* store 1 byte */
  308. ldata0 = lsr(ldata0, shiftb); /* shiftb = shift & 8: drop the byte if one was stored */
  309. shiftb = and(shift, #16); /* 16 when prolog bit 1 is set, else 0 */
  310. ifhword = tstbit(shift,#4); /* tests prolog bit 1 */
  311. }
  312. {
  313. if(ifhword) memh(ptr_out++#2) = data0; /* store 2 bytes */
  314. ldata0 = lsr(ldata0, shiftb); /* drop the halfword if one was stored */
  315. ifword = tstbit(shift,#5); /* tests prolog bit 2 */
  316. p2 = cmp.gtu(offset, #7); /* source offset crossed into the next dword */
  317. }
  318. {
  319. if(ifword) memw(ptr_out++#4) = data0; /* store 4 bytes */
  320. if(p2) data70 = dataF8; /* slide the two-dword source window forward */
  321. if(p2) dataF8 = memd(ptr_in++#8); /* another 8 bytes */
  322. align = offset; /* new valignb byte count; presumably only the low 3 bits matter - confirm */
  323. }
  324. .Lnoprolog32: /* dword part of the prolog: realign and store 8 bytes per pass */
  325. {
  326. p3 = sp1loop0(.Ldword_loop_prolog, prolog) /* hardware loop; p3 gates the store (presumably false on the first pass - confirm) */
  327. rest = sub(len, star3); /* whats left after the loop */
  328. p0 = cmp.gt(over, #0); /* source end is not dword aligned */
  329. }
  330. if(p0) rest = add(rest, #16); /* raise rest so the rest > 16 test below permits more loads */
  331. .Ldword_loop_prolog:
  332. {
  333. if(p3) memd(ptr_out++#8) = ldata0; /* store the dword assembled on the previous pass */
  334. ldata0 = valignb(dataF8, data70, align); /* assemble the next 8 output bytes */
  335. p0 = cmp.gt(rest, #16); /* load another source dword only while rest > 16 */
  336. }
  337. {
  338. data70 = dataF8; /* slide the source window */
  339. if(p0) dataF8 = memd(ptr_in++#8); /* guarded load, see the overread notes in the file history */
  340. rest = add(rest, #-8);
  341. }:endloop0
  342. .Lkernel: /* 32-byte chunk loop; one chunk is moved from the kernel count into the epilog first */
  343. {
  344. /* kernel is at least 32bytes */
  345. p3 = cmp.gtu(kernel, #0);
  346. /* last itn. remove edge effects */
  347. if(p3.new) kernel = add(kernel, #-1);
  348. /* dealt with in last dword loop */
  349. if(p3.new) epilog = add(epilog, #32);
  350. }
  351. {
  352. nokernel = cmp.eq(kernel, #0); /* after adjustment, recheck */
  353. if(nokernel.new) jump:NT .Lepilog; /* likely not taken */
  354. inc = combine(#32, #-1); /* packed {+32, -1} added to {ptr_in_p_128, kernel} by vaddw */
  355. p3 = cmp.gtu(dalign, #24); /* selects which of the two chunk loops runs */
  356. }
  357. {
  358. if(p3) jump .Lodd_alignment; /* NOTE(review): dalign > 24 goes to .Loword_loop_00to24; label names look swapped - confirm */
  359. }
  360. {
  361. loop0(.Loword_loop_25to31, kernel);
  362. kernel1 = cmp.gtu(kernel, #1);
  363. rest = kernel; /* remember the initial count to detect the first iteration */
  364. }
  365. .falign
  366. .Loword_loop_25to31: /* uses ldata0 and ldata1 alternately; each pass loads 4 source dwords */
  367. {
  368. dcfetch(ptr_in_p_128); /* prefetch 4 lines ahead */
  369. if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32); /* advance the dczeroa pointer while kernel > 1 */
  370. }
  371. {
  372. dczeroa(ptr_out_p_32); /* reserve the next 32bytes in cache */
  373. p3 = cmp.eq(kernel, rest); /* true only while kernel still equals its initial value */
  374. }
  375. {
  376. /* kernel -= 1 */
  377. ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
  378. /* kill write on first iteration */
  379. if(!p3) memd(ptr_out++#8) = ldata1;
  380. ldata1 = valignb(dataF8, data70, align);
  381. data70 = memd(ptr_in++#8);
  382. }
  383. {
  384. memd(ptr_out++#8) = ldata0;
  385. ldata0 = valignb(data70, dataF8, align); /* operand order alternates as data70/dataF8 swap roles */
  386. dataF8 = memd(ptr_in++#8);
  387. }
  388. {
  389. memd(ptr_out++#8) = ldata1;
  390. ldata1 = valignb(dataF8, data70, align);
  391. data70 = memd(ptr_in++#8);
  392. }
  393. {
  394. memd(ptr_out++#8) = ldata0;
  395. ldata0 = valignb(data70, dataF8, align);
  396. dataF8 = memd(ptr_in++#8);
  397. kernel1 = cmp.gtu(kernel, #1); /* refresh for the next pass */
  398. }:endloop0
  399. {
  400. memd(ptr_out++#8) = ldata1; /* store the ldata1 dword still pending after the loop */
  401. jump .Lepilog;
  402. }
  403. .Lodd_alignment: /* chunk loop taken when dalign > 24; uses ldata0 only and falls through to .Lepilog */
  404. {
  405. loop0(.Loword_loop_00to24, kernel);
  406. kernel1 = cmp.gtu(kernel, #1);
  407. rest = add(kernel, #-1); /* NOTE(review): rest is not read inside this loop in the visible code */
  408. }
  409. .falign
  410. .Loword_loop_00to24: /* each pass stores 4 dwords and loads 4 source dwords */
  411. {
  412. dcfetch(ptr_in_p_128); /* prefetch 4 lines ahead */
  413. ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc); /* kernel -= 1, prefetch pointer += 32 */
  414. if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32); /* advance the dczeroa pointer while kernel > 1 */
  415. }
  416. {
  417. dczeroa(ptr_out_p_32); /* reserve the next 32bytes in cache */
  418. }
  419. {
  420. memd(ptr_out++#8) = ldata0; /* store the dword assembled previously */
  421. ldata0 = valignb(dataF8, data70, align);
  422. data70 = memd(ptr_in++#8);
  423. }
  424. {
  425. memd(ptr_out++#8) = ldata0;
  426. ldata0 = valignb(data70, dataF8, align); /* operand order alternates as data70/dataF8 swap roles */
  427. dataF8 = memd(ptr_in++#8);
  428. }
  429. {
  430. memd(ptr_out++#8) = ldata0;
  431. ldata0 = valignb(dataF8, data70, align);
  432. data70 = memd(ptr_in++#8);
  433. }
  434. {
  435. memd(ptr_out++#8) = ldata0;
  436. ldata0 = valignb(data70, dataF8, align);
  437. dataF8 = memd(ptr_in++#8);
  438. kernel1 = cmp.gtu(kernel, #1); /* refresh for the next pass */
  439. }:endloop0
  440. .Lepilog: /* copy the remaining epilog bytes: whole dwords first, then word / halfword / byte */
  441. {
  442. noepilog = cmp.eq(epilog,#0);
  443. epilogdws = lsr(epilog, #3); /* whole dwords in the epilog */
  444. kernel = and(epilog, #7); /* kernel register reused for epilog % 8 */
  445. }
  446. {
  447. if(noepilog) jumpr r31; /* nothing left: go to the address held in r31 */
  448. if(noepilog) ptr_out = sub(ptr_out, len); /* return dest pointer */
  449. p3 = cmp.eq(epilogdws, #0);
  450. shift2 = asl(epilog, #3); /* epilog * 8; bits 4..5 mirror epilog bits 1..2 */
  451. }
  452. {
  453. shiftb = and(shift2, #32); /* 32 when epilog bit 2 is set, else 0 */
  454. ifword = tstbit(epilog,#2); /* a 4-byte store is needed in the tail */
  455. if(p3) jump .Lepilog60; /* no whole dwords: go straight to the tail */
  456. if(!p3) epilog = add(epilog, #-16); /* adjust the counter compared against kernel in the loop */
  457. }
  458. {
  459. loop0(.Ldword_loop_epilog, epilogdws);
  460. /* stop criteria is lsbs unless = 0 then its 8 */
  461. p3 = cmp.eq(kernel, #0);
  462. if(p3.new) kernel= #8;
  463. p1 = cmp.gt(over, #0); /* source end is not dword aligned */
  464. }
  465. /* if not aligned to end of buffer execute 1 more iteration */
  466. if(p1) kernel= #0;
  467. .Ldword_loop_epilog:
  468. {
  469. memd(ptr_out++#8) = ldata0; /* store the dword assembled previously */
  470. ldata0 = valignb(dataF8, data70, align);
  471. p3 = cmp.gt(epilog, kernel); /* load another source dword only while epilog > kernel */
  472. }
  473. {
  474. data70 = dataF8; /* slide the source window */
  475. if(p3) dataF8 = memd(ptr_in++#8); /* guarded load, see the overread notes in the file history */
  476. epilog = add(epilog, #-8); /* changes by multiples of 8 keep the low 3 bits tested below */
  477. }:endloop0
  478. /* copy last 7 bytes */
  479. .Lepilog60:
  480. {
  481. if(ifword) memw(ptr_out++#4) = data0; /* store 4 bytes */
  482. ldata0 = lsr(ldata0, shiftb); /* drop the word if one was stored */
  483. ifhword = tstbit(epilog,#1);
  484. shiftb = and(shift2, #16); /* 16 when epilog bit 1 is set, else 0 */
  485. }
  486. {
  487. if(ifhword) memh(ptr_out++#2) = data0; /* store 2 bytes */
  488. ldata0 = lsr(ldata0, shiftb); /* drop the halfword if one was stored */
  489. ifbyte = tstbit(epilog,#0);
  490. if(ifbyte.new) len = add(len, #-1); /* the byte store below does not advance ptr_out, so subtract one less */
  491. }
  492. {
  493. if(ifbyte) memb(ptr_out) = data0; /* store the final byte */
  494. ptr_out = sub(ptr_out, len); /* return dest pointer */
  495. jumpr r31; /* r31 was pointed at .Lmemcpy_return during setup */
  496. }
  497. /* do byte copy for small n (reached when len <= 23, before any frame is allocated) */
  498. .Lbytes23orless:
  499. {
  500. p3 = sp1loop0(.Lbyte_copy, len); /* hardware loop over len; p3 gates the store (presumably false on the first pass - confirm) */
  501. len = add(len, #-1); /* ptr_out advances len - 1 times in total, see the final store */
  502. }
  503. .Lbyte_copy:
  504. {
  505. data0 = memb(ptr_in++#1); /* load the next source byte */
  506. if(p3) memb(ptr_out++#1) = data0; /* store the byte loaded on the previous pass */
  507. }:endloop0
  508. {
  509. memb(ptr_out) = data0; /* store the last byte without advancing ptr_out */
  510. ptr_out = sub(ptr_out, len); /* return dest pointer */
  511. jumpr r31; /* return to function caller */
  512. }
  513. /* do dword copies for aligned in, out and length (len was already reduced by 8 at the dispatch) */
  514. .Ldwordaligned:
  515. {
  516. p3 = sp1loop0(.Ldword_copy, len8); /* hardware loop over len8; p3 gates the store (presumably false on the first pass - confirm) */
  517. }
  518. .Ldword_copy:
  519. {
  520. if(p3) memd(ptr_out++#8) = ldata0; /* store the dword loaded on the previous pass */
  521. ldata0 = memd(ptr_in++#8); /* load the next source dword */
  522. }:endloop0
  523. {
  524. memd(ptr_out) = ldata0; /* store the last dword without advancing ptr_out */
  525. ptr_out = sub(ptr_out, len); /* return dest pointer */
  526. jumpr r31; /* return to function caller */
  527. }
  528. .Lmemcpy_return: /* exit for the general path: restore the saved register pairs and release the frame */
  529. r21:20 = memd(sp+#16); /* restore r20+r21 */
  530. {
  531. r25:24 = memd(sp+#8); /* restore r24+r25 */
  532. r17:16 = memd(sp+#0); /* restore r16+r17 */
  533. }
  534. deallocframe; /* restore r31 and release the stack frame */
  535. jumpr r31 /* return to function caller */