checksum_64.S 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
/* checksum.S: Sparc V9 optimized checksum code.
 *
 *  Copyright(C) 1995 Linus Torvalds
 *  Copyright(C) 1995 Miguel de Icaza
 *  Copyright(C) 1996, 2000 David S. Miller
 *  Copyright(C) 1997 Jakub Jelinek
 *
 * derived from:
 *	Linux/Alpha checksum c-code
 *	Linux/ix86 inline checksum assembly
 *	RFC1071 Computing the Internet Checksum (esp. Jacobson's m68k code)
 *	David Mosberger-Tang for optimized reference c-code
 *	BSD4.4 portable checksum routine
 */
  15. #include <asm/export.h>
  16. .text
  /* csum_partial_fix_alignment: out-of-line path taken by csum_partial
   * when buff (%o0) is not 4-byte aligned.
   *
   * On entry:
   *   %icc  holds the result of "andcc %o0, 0x1, %g7", which ran in the
   *         delay slot of the branch that jumped here
   *   %o4   running sum, already cleared by csum_partial
   *   %o1   remaining length; zero length was rejected before we got
   *         here, so there is at least one byte
   *
   * Consumes a leading byte (odd address) and/or a leading halfword
   * (bit 1 of the address set) so that %o0 ends up 4-byte aligned, then
   * rejoins the main path at csum_partial_post_align.  If a halfword is
   * needed but fewer than two bytes remain, control goes straight to
   * csum_partial_end_cruft instead.
   */
csum_partial_fix_alignment:
	be,pt		%icc, .Lcsum_fix_halfword	/* even address: no lone byte */
	 nop
	ldub		[%o0 + 0x00], %o4		/* the sum starts out as that byte */
	add		%o0, 1, %o0
	sub		%o1, 1, %o1

.Lcsum_fix_halfword:
	andcc		%o0, 0x2, %g0
	be,pn		%icc, csum_partial_post_align	/* now 4-byte aligned */
	 cmp		%o1, 2				/* delay slot: feeds the blu below */
	blu,pn		%icc, csum_partial_end_cruft	/* less than a halfword left */
	 nop
	lduh		[%o0 + 0x00], %o5
	add		%o0, 2, %o0
	sub		%o1, 2, %o1
	ba,pt		%xcc, csum_partial_post_align
	 add		%o5, %o4, %o4			/* delay slot: add the halfword */
  /* csum_partial: add the bytes of a buffer into a running 32-bit
   * checksum, using end-around carry.
   *
   * In:    %o0 = buff, %o1 = len (in bytes), %o2 = sum
   * Out:   %o0 = updated sum, low 32 bits (upper bits cleared)
   * Uses:  %o3, %o4, %o5, %g1, %g2, %g3, %g7 and the integer
   *        condition codes
   *
   * A zero len skips everything and returns the incoming sum.
   * This entry sequence falls through into csum_partial_post_align
   * when buff is already 4-byte aligned.
   *
   * The symbol is marked as a function so that the ELF symbol table
   * records its type for tools that look at exported symbols.
   */
	.align		32
	.globl		csum_partial
	.type		csum_partial,#function
	EXPORT_SYMBOL(csum_partial)
csum_partial:		/* %o0=buff, %o1=len, %o2=sum */
	prefetch	[%o0 + 0x000], #n_reads
	clr		%o4				/* %o4 = running sum */
	prefetch	[%o0 + 0x040], #n_reads
	brz,pn		%o1, csum_partial_finish	/* nothing to add */
	 andcc		%o0, 0x3, %g0			/* delay slot: test 4-byte alignment */

	/* We "remember" whether the lowest bit in the address
	 * was set in %g7.  Because if it is, we have to swap
	 * upper and lower 8 bit fields of the sum we calculate.
	 *
	 * The andcc below sits in the delay slot, so it runs on both
	 * paths; csum_partial_fix_alignment branches on the condition
	 * codes it leaves behind.
	 */
	bne,pn		%icc, csum_partial_fix_alignment
	 andcc		%o0, 0x1, %g7
  /* csum_partial_post_align: main summation path, reached once %o0 is
   * 4-byte aligned (directly from csum_partial, or after
   * csum_partial_fix_alignment has consumed the leading bytes).
   *
   *   %o0 = current buffer pointer
   *   %o1 = bytes remaining
   *   %o4 = running sum
   *
   * Three stages:
   *   1. whole 64-byte blocks, sixteen 32-bit loads per iteration
   *   2. remaining whole 32-bit words, one per iteration
   *   3. fold the 64-bit accumulator down to 16 bits
   * Falls through into csum_partial_end_cruft with %o1 holding the
   * bytes that did not fill a whole word.
   */
csum_partial_post_align:
	prefetch	[%o0 + 0x080], #n_reads
	andncc		%o1, 0x3f, %o3		/* %o3 = bytes in whole 64-byte blocks */

	prefetch	[%o0 + 0x0c0], #n_reads
	sub		%o1, %o3, %o1		/* %o1 = what is left after those blocks */
	brz,pn		%o3, .Lcsum_words	/* no full block at all */
	 prefetch	[%o0 + 0x100], #n_reads

	/* So that we don't need to use the non-pairing
	 * add-with-carry instructions we accumulate 32-bit
	 * values into a 64-bit register.  At the end of the
	 * loop we fold it down to 32-bits and so on.
	 *
	 * Loads and adds are interleaved: each add consumes a
	 * value loaded a few instructions earlier.
	 */
	prefetch	[%o0 + 0x140], #n_reads
.Lcsum_block_loop:
	lduw		[%o0 + 0x00], %o5
	lduw		[%o0 + 0x04], %g1
	lduw		[%o0 + 0x08], %g2
	add		%o4, %o5, %o4
	lduw		[%o0 + 0x0c], %g3
	add		%o4, %g1, %o4
	lduw		[%o0 + 0x10], %o5
	add		%o4, %g2, %o4
	lduw		[%o0 + 0x14], %g1
	add		%o4, %g3, %o4
	lduw		[%o0 + 0x18], %g2
	add		%o4, %o5, %o4
	lduw		[%o0 + 0x1c], %g3
	add		%o4, %g1, %o4
	lduw		[%o0 + 0x20], %o5
	add		%o4, %g2, %o4
	lduw		[%o0 + 0x24], %g1
	add		%o4, %g3, %o4
	lduw		[%o0 + 0x28], %g2
	add		%o4, %o5, %o4
	lduw		[%o0 + 0x2c], %g3
	add		%o4, %g1, %o4
	lduw		[%o0 + 0x30], %o5
	add		%o4, %g2, %o4
	lduw		[%o0 + 0x34], %g1
	add		%o4, %g3, %o4
	lduw		[%o0 + 0x38], %g2
	add		%o4, %o5, %o4
	lduw		[%o0 + 0x3c], %g3
	add		%o4, %g1, %o4
	prefetch	[%o0 + 0x180], #n_reads
	add		%o4, %g2, %o4
	subcc		%o3, 0x40, %o3		/* one block done */
	add		%o0, 0x40, %o0
	bne,pt		%icc, .Lcsum_block_loop
	 add		%o4, %g3, %o4		/* delay slot: last word of the block */

.Lcsum_words:
	and		%o1, 0x3c, %o3		/* %o3 = bytes in whole words left */
	brz,pn		%o3, .Lcsum_fold
	 sub		%o1, %o3, %o1		/* delay slot: %o1 = sub-word remainder */
.Lcsum_word_loop:
	lduw		[%o0 + 0x00], %o5
	subcc		%o3, 0x4, %o3
	add		%o0, 0x4, %o0
	bne,pt		%icc, .Lcsum_word_loop
	 add		%o4, %o5, %o4

.Lcsum_fold:
	/* fold 64-->32; the second pass absorbs any carry
	 * produced by the first.
	 */
	srlx		%o4, 32, %o5
	srl		%o4, 0, %o4
	add		%o4, %o5, %o4
	srlx		%o4, 32, %o5
	srl		%o4, 0, %o4
	add		%o4, %o5, %o4

	/* fold 32-->16, again in two passes; %g1 masks off
	 * the upper halfword.
	 */
	sethi		%hi(0xffff0000), %g1
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4
  /* csum_partial_end_cruft: handle the bytes that do not fill a whole
   * 32-bit word, then merge the result into the caller's sum.
   *
   *   %o0 = current buffer pointer
   *   %o1 = bytes remaining
   *   %o2 = caller's sum
   *   %o4 = the 16-bit sum we have calculated so far
   *   %g7 = non-zero if the original buffer address was odd
   *
   * Reached by falling out of csum_partial_post_align, or directly
   * from csum_partial_fix_alignment when too few bytes remain for a
   * halfword.  Falls through into csum_partial_finish.
   */
csum_partial_end_cruft:
	cmp		%o1, 2
	blu,pt		%icc, .Lcsum_last_byte	/* no halfword left */
	 nop
	lduh		[%o0 + 0x00], %o5
	sub		%o1, 2, %o1
	add		%o0, 2, %o0
	add		%o4, %o5, %o4

.Lcsum_last_byte:
	brz,pt		%o1, .Lcsum_refold	/* no byte left either */
	 nop
	ldub		[%o0 + 0x00], %o5
	sub		%o1, 1, %o1
	add		%o0, 1, %o0
	sllx		%o5, 8, %o5		/* byte goes in the upper half of a 16-bit word */
	add		%o4, %o5, %o4

.Lcsum_refold:
	/* fold 32-->16 once more, since the additions above
	 * may have carried past bit 15.
	 */
	sethi		%hi(0xffff0000), %g1
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4

	brz,pt		%g7, .Lcsum_merge	/* even start address: no swap */
	 nop

	/* We started with an odd byte, byte-swap the result.  */
	srl		%o4, 8, %o5
	and		%o4, 0xff, %g1
	sll		%g1, 8, %g1
	or		%o5, %g1, %o4

.Lcsum_merge:
	addcc		%o2, %o4, %o2		/* 32-bit add, carry goes to %icc */
	addc		%g0, %o2, %o2		/* wrap that carry back into the sum */
  /* csum_partial_finish: common exit of csum_partial.
   *
   * Reached by fall-through once the sum has been merged into %o2, or
   * directly from the entry code when len is zero (in which case %o2
   * is still the caller's sum).
   */
csum_partial_finish:
	retl
	 srl		%o2, 0, %o0	/* delay slot: return %o2 with the upper 32 bits cleared */