123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175 |
- /* checksum.S: Sparc V9 optimized checksum code.
- *
- * Copyright(C) 1995 Linus Torvalds
- * Copyright(C) 1995 Miguel de Icaza
- * Copyright(C) 1996, 2000 David S. Miller
- * Copyright(C) 1997 Jakub Jelinek
- *
- * derived from:
- * Linux/Alpha checksum c-code
- * Linux/ix86 inline checksum assembly
- * RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code)
- * David Mosberger-Tang for optimized reference c-code
- * BSD4.4 portable checksum routine
- */
- #include <asm/export.h>
- .text
- csum_partial_fix_alignment:
- /* We checked for zero length already, so there must be
- * at least one byte.
- */
- be,pt %icc, 1f
- nop
- ldub [%o0 + 0x00], %o4
- add %o0, 1, %o0
- sub %o1, 1, %o1
- 1: andcc %o0, 0x2, %g0
- be,pn %icc, csum_partial_post_align
- cmp %o1, 2
- blu,pn %icc, csum_partial_end_cruft
- nop
- lduh [%o0 + 0x00], %o5
- add %o0, 2, %o0
- sub %o1, 2, %o1
- ba,pt %xcc, csum_partial_post_align
- add %o5, %o4, %o4
- .align 32
- .globl csum_partial
- EXPORT_SYMBOL(csum_partial)
- csum_partial: /* %o0=buff, %o1=len, %o2=sum */
- prefetch [%o0 + 0x000], #n_reads
- clr %o4
- prefetch [%o0 + 0x040], #n_reads
- brz,pn %o1, csum_partial_finish
- andcc %o0, 0x3, %g0
- /* We "remember" whether the lowest bit in the address
- * was set in %g7. Because if it is, we have to swap
- * upper and lower 8 bit fields of the sum we calculate.
- */
- bne,pn %icc, csum_partial_fix_alignment
- andcc %o0, 0x1, %g7
- csum_partial_post_align:
- prefetch [%o0 + 0x080], #n_reads
- andncc %o1, 0x3f, %o3
- prefetch [%o0 + 0x0c0], #n_reads
- sub %o1, %o3, %o1
- brz,pn %o3, 2f
- prefetch [%o0 + 0x100], #n_reads
- /* So that we don't need to use the non-pairing
- * add-with-carry instructions we accumulate 32-bit
- * values into a 64-bit register. At the end of the
- * loop we fold it down to 32-bits and so on.
- */
- prefetch [%o0 + 0x140], #n_reads
- 1: lduw [%o0 + 0x00], %o5
- lduw [%o0 + 0x04], %g1
- lduw [%o0 + 0x08], %g2
- add %o4, %o5, %o4
- lduw [%o0 + 0x0c], %g3
- add %o4, %g1, %o4
- lduw [%o0 + 0x10], %o5
- add %o4, %g2, %o4
- lduw [%o0 + 0x14], %g1
- add %o4, %g3, %o4
- lduw [%o0 + 0x18], %g2
- add %o4, %o5, %o4
- lduw [%o0 + 0x1c], %g3
- add %o4, %g1, %o4
- lduw [%o0 + 0x20], %o5
- add %o4, %g2, %o4
- lduw [%o0 + 0x24], %g1
- add %o4, %g3, %o4
- lduw [%o0 + 0x28], %g2
- add %o4, %o5, %o4
- lduw [%o0 + 0x2c], %g3
- add %o4, %g1, %o4
- lduw [%o0 + 0x30], %o5
- add %o4, %g2, %o4
- lduw [%o0 + 0x34], %g1
- add %o4, %g3, %o4
- lduw [%o0 + 0x38], %g2
- add %o4, %o5, %o4
- lduw [%o0 + 0x3c], %g3
- add %o4, %g1, %o4
- prefetch [%o0 + 0x180], #n_reads
- add %o4, %g2, %o4
- subcc %o3, 0x40, %o3
- add %o0, 0x40, %o0
- bne,pt %icc, 1b
- add %o4, %g3, %o4
- 2: and %o1, 0x3c, %o3
- brz,pn %o3, 2f
- sub %o1, %o3, %o1
- 1: lduw [%o0 + 0x00], %o5
- subcc %o3, 0x4, %o3
- add %o0, 0x4, %o0
- bne,pt %icc, 1b
- add %o4, %o5, %o4
- 2:
- /* fold 64-->32 */
- srlx %o4, 32, %o5
- srl %o4, 0, %o4
- add %o4, %o5, %o4
- srlx %o4, 32, %o5
- srl %o4, 0, %o4
- add %o4, %o5, %o4
- /* fold 32-->16 */
- sethi %hi(0xffff0000), %g1
- srl %o4, 16, %o5
- andn %o4, %g1, %g2
- add %o5, %g2, %o4
- srl %o4, 16, %o5
- andn %o4, %g1, %g2
- add %o5, %g2, %o4
- csum_partial_end_cruft:
- /* %o4 has the 16-bit sum we have calculated so-far. */
- cmp %o1, 2
- blu,pt %icc, 1f
- nop
- lduh [%o0 + 0x00], %o5
- sub %o1, 2, %o1
- add %o0, 2, %o0
- add %o4, %o5, %o4
- 1: brz,pt %o1, 1f
- nop
- ldub [%o0 + 0x00], %o5
- sub %o1, 1, %o1
- add %o0, 1, %o0
- sllx %o5, 8, %o5
- add %o4, %o5, %o4
- 1:
- /* fold 32-->16 */
- sethi %hi(0xffff0000), %g1
- srl %o4, 16, %o5
- andn %o4, %g1, %g2
- add %o5, %g2, %o4
- srl %o4, 16, %o5
- andn %o4, %g1, %g2
- add %o5, %g2, %o4
- 1: brz,pt %g7, 1f
- nop
- /* We started with an odd byte, byte-swap the result. */
- srl %o4, 8, %o5
- and %o4, 0xff, %g1
- sll %g1, 8, %g1
- or %o5, %g1, %o4
- 1: addcc %o2, %o4, %o2
- addc %g0, %o2, %o2
- csum_partial_finish:
- retl
- srl %o2, 0, %o0
|