summaryrefslogtreecommitdiffstats
path: root/dependencies/pkg/mod/github.com/cespare/xxhash/v2@v2.1.2/xxhash_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'dependencies/pkg/mod/github.com/cespare/xxhash/v2@v2.1.2/xxhash_amd64.s')
-rw-r--r--dependencies/pkg/mod/github.com/cespare/xxhash/v2@v2.1.2/xxhash_amd64.s215
1 files changed, 215 insertions, 0 deletions
diff --git a/dependencies/pkg/mod/github.com/cespare/xxhash/v2@v2.1.2/xxhash_amd64.s b/dependencies/pkg/mod/github.com/cespare/xxhash/v2@v2.1.2/xxhash_amd64.s
new file mode 100644
index 0000000..be8db5b
--- /dev/null
+++ b/dependencies/pkg/mod/github.com/cespare/xxhash/v2@v2.1.2/xxhash_amd64.s
@@ -0,0 +1,215 @@
+// +build !appengine
+// +build gc
+// +build !purego
+
+#include "textflag.h"
+
+// Register allocation:
+// AX h
+// SI pointer to advance through b
+// DX n
+// BX loop end
+// R8 v1, k1
+// R9 v2
+// R10 v3
+// R11 v4
+// R12 tmp
+// R13 prime1v
+// R14 prime2v
+// DI prime4v
+
+// round reads from and advances the buffer pointer in SI.
+// It assumes that R13 has prime1v and R14 has prime2v.
+#define round(r) \
+ MOVQ (SI), R12 \
+ ADDQ $8, SI \
+ IMULQ R14, R12 \
+ ADDQ R12, r \
+ ROLQ $31, r \
+ IMULQ R13, r
+
+// mergeRound applies a merge round on the two registers acc and val.
+// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v.
+#define mergeRound(acc, val) \
+ IMULQ R14, val \
+ ROLQ $31, val \
+ IMULQ R13, val \
+ XORQ val, acc \
+ IMULQ R13, acc \
+ ADDQ DI, acc
+
+// func Sum64(b []byte) uint64
+TEXT ·Sum64(SB), NOSPLIT, $0-32
+ // Load fixed primes.
+ MOVQ ·prime1v(SB), R13
+ MOVQ ·prime2v(SB), R14
+ MOVQ ·prime4v(SB), DI
+
+ // Load slice.
+ MOVQ b_base+0(FP), SI
+ MOVQ b_len+8(FP), DX
+ LEAQ (SI)(DX*1), BX
+
+ // The first loop limit will be len(b)-32.
+ SUBQ $32, BX
+
+ // Check whether we have at least one block.
+ CMPQ DX, $32
+ JLT noBlocks
+
+ // Set up initial state (v1, v2, v3, v4).
+ MOVQ R13, R8
+ ADDQ R14, R8
+ MOVQ R14, R9
+ XORQ R10, R10
+ XORQ R11, R11
+ SUBQ R13, R11
+
+ // Loop until SI > BX.
+blockLoop:
+ round(R8)
+ round(R9)
+ round(R10)
+ round(R11)
+
+ CMPQ SI, BX
+ JLE blockLoop
+
+ MOVQ R8, AX
+ ROLQ $1, AX
+ MOVQ R9, R12
+ ROLQ $7, R12
+ ADDQ R12, AX
+ MOVQ R10, R12
+ ROLQ $12, R12
+ ADDQ R12, AX
+ MOVQ R11, R12
+ ROLQ $18, R12
+ ADDQ R12, AX
+
+ mergeRound(AX, R8)
+ mergeRound(AX, R9)
+ mergeRound(AX, R10)
+ mergeRound(AX, R11)
+
+ JMP afterBlocks
+
+noBlocks:
+ MOVQ ·prime5v(SB), AX
+
+afterBlocks:
+ ADDQ DX, AX
+
+ // Right now BX has len(b)-32, and we want to loop until SI > len(b)-8.
+ ADDQ $24, BX
+
+ CMPQ SI, BX
+ JG fourByte
+
+wordLoop:
+ // Calculate k1.
+ MOVQ (SI), R8
+ ADDQ $8, SI
+ IMULQ R14, R8
+ ROLQ $31, R8
+ IMULQ R13, R8
+
+ XORQ R8, AX
+ ROLQ $27, AX
+ IMULQ R13, AX
+ ADDQ DI, AX
+
+ CMPQ SI, BX
+ JLE wordLoop
+
+fourByte:
+ ADDQ $4, BX
+ CMPQ SI, BX
+ JG singles
+
+ MOVL (SI), R8
+ ADDQ $4, SI
+ IMULQ R13, R8
+ XORQ R8, AX
+
+ ROLQ $23, AX
+ IMULQ R14, AX
+ ADDQ ·prime3v(SB), AX
+
+singles:
+ ADDQ $4, BX
+ CMPQ SI, BX
+ JGE finalize
+
+singlesLoop:
+ MOVBQZX (SI), R12
+ ADDQ $1, SI
+ IMULQ ·prime5v(SB), R12
+ XORQ R12, AX
+
+ ROLQ $11, AX
+ IMULQ R13, AX
+
+ CMPQ SI, BX
+ JL singlesLoop
+
+finalize:
+ MOVQ AX, R12
+ SHRQ $33, R12
+ XORQ R12, AX
+ IMULQ R14, AX
+ MOVQ AX, R12
+ SHRQ $29, R12
+ XORQ R12, AX
+ IMULQ ·prime3v(SB), AX
+ MOVQ AX, R12
+ SHRQ $32, R12
+ XORQ R12, AX
+
+ MOVQ AX, ret+24(FP)
+ RET
+
+// writeBlocks uses the same registers as above except that it uses AX to store
+// the d pointer.
+
+// func writeBlocks(d *Digest, b []byte) int
+TEXT ·writeBlocks(SB), NOSPLIT, $0-40
+ // Load fixed primes needed for round.
+ MOVQ ·prime1v(SB), R13
+ MOVQ ·prime2v(SB), R14
+
+ // Load slice.
+ MOVQ b_base+8(FP), SI
+ MOVQ b_len+16(FP), DX
+ LEAQ (SI)(DX*1), BX
+ SUBQ $32, BX
+
+ // Load vN from d.
+ MOVQ d+0(FP), AX
+ MOVQ 0(AX), R8 // v1
+ MOVQ 8(AX), R9 // v2
+ MOVQ 16(AX), R10 // v3
+ MOVQ 24(AX), R11 // v4
+
+ // We don't need to check the loop condition here; this function is
+ // always called with at least one block of data to process.
+blockLoop:
+ round(R8)
+ round(R9)
+ round(R10)
+ round(R11)
+
+ CMPQ SI, BX
+ JLE blockLoop
+
+ // Copy vN back to d.
+ MOVQ R8, 0(AX)
+ MOVQ R9, 8(AX)
+ MOVQ R10, 16(AX)
+ MOVQ R11, 24(AX)
+
+ // The number of bytes written is SI minus the old base pointer.
+ SUBQ b_base+8(FP), SI
+ MOVQ SI, ret+32(FP)
+
+ RET