diff options
Diffstat (limited to 'src/internal/chacha8rand')
-rw-r--r-- | src/internal/chacha8rand/chacha8.go | 197 | ||||
-rw-r--r-- | src/internal/chacha8rand/chacha8_amd64.s | 174 | ||||
-rw-r--r-- | src/internal/chacha8rand/chacha8_arm64.s | 104 | ||||
-rw-r--r-- | src/internal/chacha8rand/chacha8_generic.go | 235 | ||||
-rw-r--r-- | src/internal/chacha8rand/chacha8_stub.s | 12 | ||||
-rw-r--r-- | src/internal/chacha8rand/export_test.go | 12 | ||||
-rw-r--r-- | src/internal/chacha8rand/rand_test.go | 202 |
7 files changed, 936 insertions, 0 deletions
diff --git a/src/internal/chacha8rand/chacha8.go b/src/internal/chacha8rand/chacha8.go new file mode 100644 index 0000000..ce55c07 --- /dev/null +++ b/src/internal/chacha8rand/chacha8.go @@ -0,0 +1,197 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package chacha8rand implements a pseudorandom generator +// based on ChaCha8. It is used by both runtime and math/rand/v2 +// and must have no dependencies. +package chacha8rand + +const ( + ctrInc = 4 // increment counter by 4 between block calls + ctrMax = 16 // reseed when counter reaches 16 + chunk = 32 // each chunk produced by block is 32 uint64s + reseed = 4 // reseed with 4 words +) + +// block is the chacha8rand block function. +func block(seed *[4]uint64, blocks *[32]uint64, counter uint32) + +// A State holds the state for a single random generator. +// It must be used from one goroutine at a time. +// If used by multiple goroutines at a time, the goroutines +// may see the same random values, but the code will not +// crash or cause out-of-bounds memory accesses. +type State struct { + buf [32]uint64 + seed [4]uint64 + i uint32 + n uint32 + c uint32 +} + +// Next returns the next random value, along with a boolean +// indicating whether one was available. +// If one is not available, the caller should call Refill +// and then repeat the call to Next. +// +// Next is //go:nosplit to allow its use in the runtime +// with per-m data without holding the per-m lock. +//go:nosplit +func (s *State) Next() (uint64, bool) { + i := s.i + if i >= s.n { + return 0, false + } + s.i = i + 1 + return s.buf[i&31], true // i&31 eliminates bounds check +} + +// Init seeds the State with the given seed value. +func (s *State) Init(seed [32]byte) { + s.Init64([4]uint64{ + leUint64(seed[0*8:]), + leUint64(seed[1*8:]), + leUint64(seed[2*8:]), + leUint64(seed[3*8:]), + }) +} + +// Init64 seeds the state with the given seed value. +func (s *State) Init64(seed [4]uint64) { + s.seed = seed + block(&s.seed, &s.buf, 0) + s.c = 0 + s.i = 0 + s.n = chunk +} + +// Refill refills the state with more random values. +// After a call to Refill, an immediate call to Next will succeed +// (unless multiple goroutines are incorrectly sharing a state). +func (s *State) Refill() { + s.c += ctrInc + if s.c == ctrMax { + // Reseed with generated uint64s for forward secrecy. + // Normally this is done immediately after computing a block, + // but we do it immediately before computing the next block, + // to allow a much smaller serialized state (just the seed plus offset). + // This gives a delayed benefit for the forward secrecy + // (you can reconstruct the recent past given a memory dump), + // which we deem acceptable in exchange for the reduced size. + s.seed[0] = s.buf[len(s.buf)-reseed+0] + s.seed[1] = s.buf[len(s.buf)-reseed+1] + s.seed[2] = s.buf[len(s.buf)-reseed+2] + s.seed[3] = s.buf[len(s.buf)-reseed+3] + s.c = 0 + } + block(&s.seed, &s.buf, s.c) + s.i = 0 + s.n = uint32(len(s.buf)) + if s.c == ctrMax-ctrInc { + s.n = uint32(len(s.buf)) - reseed + } +} + +// Reseed reseeds the state with new random values. +// After a call to Reseed, any previously returned random values +// have been erased from the memory of the state and cannot be +// recovered. +func (s *State) Reseed() { + var seed [4]uint64 + for i := range seed { + for { + x, ok := s.Next() + if ok { + seed[i] = x + break + } + s.Refill() + } + } + s.Init64(seed) +} + +// Marshal marshals the state into a byte slice. +// Marshal and Unmarshal are functions, not methods, +// so that they will not be linked into the runtime +// when it uses the State struct, since the runtime +// does not need these. +func Marshal(s *State) []byte { + data := make([]byte, 6*8) + copy(data, "chacha8:") + used := (s.c/ctrInc)*chunk + s.i + bePutUint64(data[1*8:], uint64(used)) + for i, seed := range s.seed { + lePutUint64(data[(2+i)*8:], seed) + } + return data +} + +type errUnmarshalChaCha8 struct{} + +func (*errUnmarshalChaCha8) Error() string { + return "invalid ChaCha8 encoding" +} + +// Unmarshal unmarshals the state from a byte slice. +func Unmarshal(s *State, data []byte) error { + if len(data) != 6*8 || string(data[:8]) != "chacha8:" { + return new(errUnmarshalChaCha8) + } + used := beUint64(data[1*8:]) + if used > (ctrMax/ctrInc)*chunk-reseed { + return new(errUnmarshalChaCha8) + } + for i := range s.seed { + s.seed[i] = leUint64(data[(2+i)*8:]) + } + s.c = ctrInc * (uint32(used) / chunk) + block(&s.seed, &s.buf, s.c) + s.i = uint32(used) % chunk + s.n = chunk + if s.c == ctrMax-ctrInc { + s.n = chunk - reseed + } + return nil +} + +// binary.bigEndian.Uint64, copied to avoid dependency +func beUint64(b []byte) uint64 { + _ = b[7] // bounds check hint to compiler; see golang.org/issue/14808 + return uint64(b[7]) | uint64(b[6])<<8 | uint64(b[5])<<16 | uint64(b[4])<<24 | + uint64(b[3])<<32 | uint64(b[2])<<40 | uint64(b[1])<<48 | uint64(b[0])<<56 +} + +// binary.bigEndian.PutUint64, copied to avoid dependency +func bePutUint64(b []byte, v uint64) { + _ = b[7] // early bounds check to guarantee safety of writes below + b[0] = byte(v >> 56) + b[1] = byte(v >> 48) + b[2] = byte(v >> 40) + b[3] = byte(v >> 32) + b[4] = byte(v >> 24) + b[5] = byte(v >> 16) + b[6] = byte(v >> 8) + b[7] = byte(v) +} + +// binary.littleEndian.Uint64, copied to avoid dependency +func leUint64(b []byte) uint64 { + _ = b[7] // bounds check hint to compiler; see golang.org/issue/14808 + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +} + +// binary.littleEndian.PutUint64, copied to avoid dependency +func lePutUint64(b []byte, v uint64) { + _ = b[7] // early bounds check to guarantee safety of writes below + b[0] = byte(v) + b[1] = byte(v >> 8) + b[2] = byte(v >> 16) + b[3] = byte(v >> 24) + b[4] = byte(v >> 32) + b[5] = byte(v >> 40) + b[6] = byte(v >> 48) + b[7] = byte(v >> 56) +} diff --git a/src/internal/chacha8rand/chacha8_amd64.s b/src/internal/chacha8rand/chacha8_amd64.s new file mode 100644 index 0000000..b56deb3 --- /dev/null +++ b/src/internal/chacha8rand/chacha8_amd64.s @@ -0,0 +1,174 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// ChaCha8 is ChaCha with 8 rounds. +// See https://cr.yp.to/chacha/chacha-20080128.pdf. +// See chacha8_generic.go for additional details. + +// ROL rotates the uint32s in register R left by N bits, using temporary T. +#define ROL(N, R, T) \ + MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R + +// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. +#ifdef GOAMD64_v2 +#define ROL16(R, T) PSHUFB ·rol16<>(SB), R +#else +#define ROL16(R, T) ROL(16, R, T) +#endif + +// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. +#ifdef GOAMD64_v2 +#define ROL8(R, T) PSHUFB ·rol8<>(SB), R +#else +#define ROL8(R, T) ROL(8, R, T) +#endif + +// QR is the ChaCha quarter-round on A, B, C, and D. T is an available temporary. +#define QR(A, B, C, D, T) \ + PADDD B, A; PXOR A, D; ROL16(D, T); \ + PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B; \ + PADDD B, A; PXOR A, D; ROL8(D, T); \ + PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B + +// REPLREG replicates the register R into 4 uint32s in XR. +#define REPLREG(R, XR) \ + MOVQ R, XR; \ + PSHUFD $0, XR, XR + +// REPL replicates the uint32 constant val into 4 uint32s in XR. It smashes DX. +#define REPL(val, XR) \ + MOVL $val, DX; \ + REPLREG(DX, XR) + +// SEED copies the off'th uint32 of the seed into the register XR, +// replicating it into all four stripes of the register. +#define SEED(off, reg, XR) \ + MOVL (4*off)(AX), reg; \ + REPLREG(reg, XR) \ + +// block runs 4 ChaCha8 block transformations in the four stripes of the X registers. + +// func block(seed *[8]uint32, blocks *[16][4]uint32, counter uint32) +TEXT ·block<ABIInternal>(SB), NOSPLIT, $16 + // seed in AX + // blocks in BX + // counter in CX + + // Load initial constants into top row. + REPL(0x61707865, X0) + REPL(0x3320646e, X1) + REPL(0x79622d32, X2) + REPL(0x6b206574, X3) + + // Load counter into bottom left cell. + // Each stripe gets a different counter: 0, 1, 2, 3. + // (PINSRD is not available in GOAMD64_v1, + // so just do it in memory on all systems. + // This is not on the critical path.) + MOVL CX, 0(SP) + INCL CX + MOVL CX, 4(SP) + INCL CX + MOVL CX, 8(SP) + INCL CX + MOVL CX, 12(SP) + MOVOU 0(SP), X12 + + // Load seed words into next two rows and into DI, SI, R8..R13 + SEED(0, DI, X4) + SEED(1, SI, X5) + SEED(2, R8, X6) + SEED(3, R9, X7) + SEED(4, R10, X8) + SEED(5, R11, X9) + SEED(6, R12, X10) + SEED(7, R13, X11) + + // Zeros for remaining two matrix entries. + // We have just enough XMM registers to hold the state, + // without one for the temporary, so we flush and restore + // some values to and from memory to provide a temporary. + // The initial temporary is X15, so zero its memory instead + // of X15 itself. + MOVL $0, DX + MOVQ DX, X13 + MOVQ DX, X14 + MOVOU X14, (15*16)(BX) + + // 4 iterations. Each iteration is 8 quarter-rounds. + MOVL $4, DX +loop: + QR(X0, X4, X8, X12, X15) + MOVOU X4, (4*16)(BX) // save X4 + QR(X1, X5, X9, X13, X15) + MOVOU (15*16)(BX), X15 // reload X15; temp now X4 + QR(X2, X6, X10, X14, X4) + QR(X3, X7, X11, X15, X4) + + QR(X0, X5, X10, X15, X4) + MOVOU X15, (15*16)(BX) // save X15 + QR(X1, X6, X11, X12, X4) + MOVOU (4*16)(BX), X4 // reload X4; temp now X15 + QR(X2, X7, X8, X13, X15) + QR(X3, X4, X9, X14, X15) + + DECL DX + JNZ loop + + // Store interlaced blocks back to output buffer, + // adding original seed along the way. + + // First the top and bottom rows. + MOVOU X0, (0*16)(BX) + MOVOU X1, (1*16)(BX) + MOVOU X2, (2*16)(BX) + MOVOU X3, (3*16)(BX) + MOVOU X12, (12*16)(BX) + MOVOU X13, (13*16)(BX) + MOVOU X14, (14*16)(BX) + // X15 has already been stored. + + // Now we have X0-X3, X12-X15 available for temporaries. + // Add seed rows back to output. We left seed in DI, SI, R8..R13 above. + REPLREG(DI, X0) + REPLREG(SI, X1) + REPLREG(R8, X2) + REPLREG(R9, X3) + REPLREG(R10, X12) + REPLREG(R11, X13) + REPLREG(R12, X14) + REPLREG(R13, X15) + PADDD X0, X4 + PADDD X1, X5 + PADDD X2, X6 + PADDD X3, X7 + PADDD X12, X8 + PADDD X13, X9 + PADDD X14, X10 + PADDD X15, X11 + MOVOU X4, (4*16)(BX) + MOVOU X5, (5*16)(BX) + MOVOU X6, (6*16)(BX) + MOVOU X7, (7*16)(BX) + MOVOU X8, (8*16)(BX) + MOVOU X9, (9*16)(BX) + MOVOU X10, (10*16)(BX) + MOVOU X11, (11*16)(BX) + + MOVL $0, AX + MOVQ AX, X15 // must be 0 on return + + RET + +// rotate left 16 indexes for PSHUFB +GLOBL ·rol16<>(SB), NOPTR|RODATA, $16 +DATA ·rol16<>+0(SB)/8, $0x0504070601000302 +DATA ·rol16<>+8(SB)/8, $0x0D0C0F0E09080B0A + +// rotate left 8 indexes for PSHUFB +GLOBL ·rol8<>(SB), NOPTR|RODATA, $16 +DATA ·rol8<>+0(SB)/8, $0x0605040702010003 +DATA ·rol8<>+8(SB)/8, $0x0E0D0C0F0A09080B diff --git a/src/internal/chacha8rand/chacha8_arm64.s b/src/internal/chacha8rand/chacha8_arm64.s new file mode 100644 index 0000000..18e34dd --- /dev/null +++ b/src/internal/chacha8rand/chacha8_arm64.s @@ -0,0 +1,104 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// QR is the ChaCha quarter-round on A, B, C, and D. +// V30 is used as a temporary, and V31 is assumed to +// hold the index table for rotate left 8. +#define QR(A, B, C, D) \ + VADD A.S4, B.S4, A.S4; VEOR D.B16, A.B16, D.B16; VREV32 D.H8, D.H8; \ + VADD C.S4, D.S4, C.S4; VEOR B.B16, C.B16, V30.B16; VSHL $12, V30.S4, B.S4; VSRI $20, V30.S4, B.S4 \ + VADD A.S4, B.S4, A.S4; VEOR D.B16, A.B16, D.B16; VTBL V31.B16, [D.B16], D.B16; \ + VADD C.S4, D.S4, C.S4; VEOR B.B16, C.B16, V30.B16; VSHL $7, V30.S4, B.S4; VSRI $25, V30.S4, B.S4 + +// block runs 4 ChaCha8 block transformations in the four stripes of the V registers. + +// func block(seed *[8]uint32, blocks *[4][16]uint32, counter uint32) +TEXT ·block<ABIInternal>(SB), NOSPLIT, $16 + // seed in R0 + // blocks in R1 + // counter in R2 + + // Load initial constants into top row. + MOVD $·chachaConst(SB), R10 + VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4] + + // Load increment and rotate 8 constants into V30, V31. + MOVD $·chachaIncRot(SB), R11 + VLD1 (R11), [V30.S4, V31.S4] + + VLD4R.P 16(R0), [V4.S4, V5.S4, V6.S4, V7.S4] + VLD4R.P 16(R0), [V8.S4, V9.S4, V10.S4, V11.S4] + + // store counter to memory to replicate its uint32 halfs back out + MOVW R2, 0(RSP) + VLD1R 0(RSP), [V12.S4] + + // Add 0, 1, 2, 3 to counter stripes. + VADD V30.S4, V12.S4, V12.S4 + + // Zeros for remaining two matrix entries. + VEOR V13.B16, V13.B16, V13.B16 + VEOR V14.B16, V14.B16, V14.B16 + VEOR V15.B16, V15.B16, V15.B16 + + // Save seed state for adding back later. + VMOV V4.B16, V20.B16 + VMOV V5.B16, V21.B16 + VMOV V6.B16, V22.B16 + VMOV V7.B16, V23.B16 + VMOV V8.B16, V24.B16 + VMOV V9.B16, V25.B16 + VMOV V10.B16, V26.B16 + VMOV V11.B16, V27.B16 + + // 4 iterations. Each iteration is 8 quarter-rounds. + MOVD $4, R0 +loop: + QR(V0, V4, V8, V12) + QR(V1, V5, V9, V13) + QR(V2, V6, V10, V14) + QR(V3, V7, V11, V15) + + QR(V0, V5, V10, V15) + QR(V1, V6, V11, V12) + QR(V2, V7, V8, V13) + QR(V3, V4, V9, V14) + + SUB $1, R0 + CBNZ R0, loop + + // Add seed back. + VADD V4.S4, V20.S4, V4.S4 + VADD V5.S4, V21.S4, V5.S4 + VADD V6.S4, V22.S4, V6.S4 + VADD V7.S4, V23.S4, V7.S4 + VADD V8.S4, V24.S4, V8.S4 + VADD V9.S4, V25.S4, V9.S4 + VADD V10.S4, V26.S4, V10.S4 + VADD V11.S4, V27.S4, V11.S4 + + // Store interlaced blocks back to output buffer. + VST1.P [ V0.B16, V1.B16, V2.B16, V3.B16], 64(R1) + VST1.P [ V4.B16, V5.B16, V6.B16, V7.B16], 64(R1) + VST1.P [ V8.B16, V9.B16, V10.B16, V11.B16], 64(R1) + VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R1) + RET + +GLOBL ·chachaConst(SB), NOPTR|RODATA, $32 +DATA ·chachaConst+0x00(SB)/4, $0x61707865 +DATA ·chachaConst+0x04(SB)/4, $0x3320646e +DATA ·chachaConst+0x08(SB)/4, $0x79622d32 +DATA ·chachaConst+0x0c(SB)/4, $0x6b206574 + +GLOBL ·chachaIncRot(SB), NOPTR|RODATA, $32 +DATA ·chachaIncRot+0x00(SB)/4, $0x00000000 +DATA ·chachaIncRot+0x04(SB)/4, $0x00000001 +DATA ·chachaIncRot+0x08(SB)/4, $0x00000002 +DATA ·chachaIncRot+0x0c(SB)/4, $0x00000003 +DATA ·chachaIncRot+0x10(SB)/4, $0x02010003 +DATA ·chachaIncRot+0x14(SB)/4, $0x06050407 +DATA ·chachaIncRot+0x18(SB)/4, $0x0A09080B +DATA ·chachaIncRot+0x1c(SB)/4, $0x0E0D0C0F diff --git a/src/internal/chacha8rand/chacha8_generic.go b/src/internal/chacha8rand/chacha8_generic.go new file mode 100644 index 0000000..2a0f5cd --- /dev/null +++ b/src/internal/chacha8rand/chacha8_generic.go @@ -0,0 +1,235 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// ChaCha8 is ChaCha with 8 rounds. +// See https://cr.yp.to/chacha/chacha-20080128.pdf. +// +// ChaCha8 operates on a 4x4 matrix of uint32 values, initially set to: +// +// const1 const2 const3 const4 +// seed seed seed seed +// seed seed seed seed +// counter64 0 0 +// +// We use the same constants as ChaCha20 does, a random seed, +// and a counter. Running ChaCha8 on this input produces +// a 4x4 matrix of pseudo-random values with as much entropy +// as the seed. +// +// Given SIMD registers that can hold N uint32s, it is possible +// to run N ChaCha8 block transformations in parallel by filling +// the first register with the N copies of const1, the second +// with N copies of const2, and so on, and then running the operations. +// +// Each iteration of ChaCha8Rand operates over 32 bytes of input and +// produces 992 bytes of RNG output, plus 32 bytes of input for the next +// iteration. +// +// The 32 bytes of input are used as a ChaCha8 key, with a zero nonce, to +// produce 1024 bytes of output (16 blocks, with counters 0 to 15). +// First, for each block, the values 0x61707865, 0x3320646e, 0x79622d32, +// 0x6b206574 are subtracted from the 32-bit little-endian words at +// position 0, 1, 2, and 3 respectively, and an increasing counter +// starting at zero is subtracted from each word at position 12. Then, +// this stream is permuted such that for each sequence of four blocks, +// first we output the first four bytes of each block, then the next four +// bytes of each block, and so on. Finally, the last 32 bytes of output +// are used as the input of the next iteration, and the remaining 992 +// bytes are the RNG output. +// +// See https://c2sp.org/chacha8rand for additional details. +// +// Normal ChaCha20 implementations for encryption use this same +// parallelism but then have to deinterlace the results so that +// it appears the blocks were generated separately. For the purposes +// of generating random numbers, the interlacing is fine. +// We are simply locked in to preserving the 4-way interlacing +// in any future optimizations. +package chacha8rand + +import ( + "internal/goarch" + "unsafe" +) + +// setup sets up 4 ChaCha8 blocks in b32 with the counter and seed. +// Note that b32 is [16][4]uint32 not [4][16]uint32: the blocks are interlaced +// the same way they would be in a 4-way SIMD implementations. +func setup(seed *[4]uint64, b32 *[16][4]uint32, counter uint32) { + // Convert to uint64 to do half as many stores to memory. + b := (*[16][2]uint64)(unsafe.Pointer(b32)) + + // Constants; same as in ChaCha20: "expand 32-byte k" + b[0][0] = 0x61707865_61707865 + b[0][1] = 0x61707865_61707865 + + b[1][0] = 0x3320646e_3320646e + b[1][1] = 0x3320646e_3320646e + + b[2][0] = 0x79622d32_79622d32 + b[2][1] = 0x79622d32_79622d32 + + b[3][0] = 0x6b206574_6b206574 + b[3][1] = 0x6b206574_6b206574 + + // Seed values. + var x64 uint64 + var x uint32 + + x = uint32(seed[0]) + x64 = uint64(x)<<32 | uint64(x) + b[4][0] = x64 + b[4][1] = x64 + + x = uint32(seed[0] >> 32) + x64 = uint64(x)<<32 | uint64(x) + b[5][0] = x64 + b[5][1] = x64 + + x = uint32(seed[1]) + x64 = uint64(x)<<32 | uint64(x) + b[6][0] = x64 + b[6][1] = x64 + + x = uint32(seed[1] >> 32) + x64 = uint64(x)<<32 | uint64(x) + b[7][0] = x64 + b[7][1] = x64 + + x = uint32(seed[2]) + x64 = uint64(x)<<32 | uint64(x) + b[8][0] = x64 + b[8][1] = x64 + + x = uint32(seed[2] >> 32) + x64 = uint64(x)<<32 | uint64(x) + b[9][0] = x64 + b[9][1] = x64 + + x = uint32(seed[3]) + x64 = uint64(x)<<32 | uint64(x) + b[10][0] = x64 + b[10][1] = x64 + + x = uint32(seed[3] >> 32) + x64 = uint64(x)<<32 | uint64(x) + b[11][0] = x64 + b[11][1] = x64 + + // Counters. + if goarch.BigEndian { + b[12][0] = uint64(counter+0)<<32 | uint64(counter+1) + b[12][1] = uint64(counter+2)<<32 | uint64(counter+3) + } else { + b[12][0] = uint64(counter+0) | uint64(counter+1)<<32 + b[12][1] = uint64(counter+2) | uint64(counter+3)<<32 + } + + // Zeros. + b[13][0] = 0 + b[13][1] = 0 + b[14][0] = 0 + b[14][1] = 0 + + b[15][0] = 0 + b[15][1] = 0 +} + +func _() { + // block and block_generic must have same type + x := block + x = block_generic + _ = x +} + +// block_generic is the non-assembly block implementation, +// for use on systems without special assembly. +// Even on such systems, it is quite fast: on GOOS=386, +// ChaCha8 using this code generates random values faster than PCG-DXSM. +func block_generic(seed *[4]uint64, buf *[32]uint64, counter uint32) { + b := (*[16][4]uint32)(unsafe.Pointer(buf)) + + setup(seed, b, counter) + + for i := range b[0] { + // Load block i from b[*][i] into local variables. + b0 := b[0][i] + b1 := b[1][i] + b2 := b[2][i] + b3 := b[3][i] + b4 := b[4][i] + b5 := b[5][i] + b6 := b[6][i] + b7 := b[7][i] + b8 := b[8][i] + b9 := b[9][i] + b10 := b[10][i] + b11 := b[11][i] + b12 := b[12][i] + b13 := b[13][i] + b14 := b[14][i] + b15 := b[15][i] + + // 4 iterations of eight quarter-rounds each is 8 rounds + for round := 0; round < 4; round++ { + b0, b4, b8, b12 = qr(b0, b4, b8, b12) + b1, b5, b9, b13 = qr(b1, b5, b9, b13) + b2, b6, b10, b14 = qr(b2, b6, b10, b14) + b3, b7, b11, b15 = qr(b3, b7, b11, b15) + + b0, b5, b10, b15 = qr(b0, b5, b10, b15) + b1, b6, b11, b12 = qr(b1, b6, b11, b12) + b2, b7, b8, b13 = qr(b2, b7, b8, b13) + b3, b4, b9, b14 = qr(b3, b4, b9, b14) + } + + // Store block i back into b[*][i]. + // Add b4..b11 back to the original key material, + // like in ChaCha20, to avoid trivial invertibility. + // There is no entropy in b0..b3 and b12..b15 + // so we can skip the additions and save some time. + b[0][i] = b0 + b[1][i] = b1 + b[2][i] = b2 + b[3][i] = b3 + b[4][i] += b4 + b[5][i] += b5 + b[6][i] += b6 + b[7][i] += b7 + b[8][i] += b8 + b[9][i] += b9 + b[10][i] += b10 + b[11][i] += b11 + b[12][i] = b12 + b[13][i] = b13 + b[14][i] = b14 + b[15][i] = b15 + } + + if goarch.BigEndian { + // On a big-endian system, reading the uint32 pairs as uint64s + // will word-swap them compared to little-endian, so we word-swap + // them here first to make the next swap get the right answer. + for i, x := range buf { + buf[i] = x>>32 | x<<32 + } + } +} + +// qr is the (inlinable) ChaCha8 quarter round. +func qr(a, b, c, d uint32) (_a, _b, _c, _d uint32) { + a += b + d ^= a + d = d<<16 | d>>16 + c += d + b ^= c + b = b<<12 | b>>20 + a += b + d ^= a + d = d<<8 | d>>24 + c += d + b ^= c + b = b<<7 | b>>25 + return a, b, c, d +} diff --git a/src/internal/chacha8rand/chacha8_stub.s b/src/internal/chacha8rand/chacha8_stub.s new file mode 100644 index 0000000..09be558 --- /dev/null +++ b/src/internal/chacha8rand/chacha8_stub.s @@ -0,0 +1,12 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 && !arm64 + +#include "textflag.h" + +// func block(counter uint64, seed *[8]uint32, blocks *[16][4]uint32) +TEXT ·block(SB), NOSPLIT, $0 + JMP ·block_generic(SB) + diff --git a/src/internal/chacha8rand/export_test.go b/src/internal/chacha8rand/export_test.go new file mode 100644 index 0000000..728aded --- /dev/null +++ b/src/internal/chacha8rand/export_test.go @@ -0,0 +1,12 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package chacha8rand + +var Block = block +var Block_generic = block_generic + +func Seed(s *State) [4]uint64 { + return s.seed +} diff --git a/src/internal/chacha8rand/rand_test.go b/src/internal/chacha8rand/rand_test.go new file mode 100644 index 0000000..2975013 --- /dev/null +++ b/src/internal/chacha8rand/rand_test.go @@ -0,0 +1,202 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package chacha8rand_test + +import ( + "bytes" + "encoding/binary" + "fmt" + . "internal/chacha8rand" + "slices" + "testing" +) + +func TestOutput(t *testing.T) { + var s State + s.Init(seed) + for i := range output { + for { + x, ok := s.Next() + if ok { + if x != output[i] { + t.Errorf("#%d: have %#x want %#x", i, x, output[i]) + } + break + } + s.Refill() + } + } +} + +func TestMarshal(t *testing.T) { + var s State + s.Init(seed) + for i := range output { + for { + b := Marshal(&s) + s = State{} + err := Unmarshal(&s, b) + if err != nil { + t.Fatalf("#%d: Unmarshal: %v", i, err) + } + x, ok := s.Next() + if ok { + if x != output[i] { + t.Fatalf("#%d: have %#x want %#x", i, x, output[i]) + } + break + } + s.Refill() + } + } +} + +func TestReseed(t *testing.T) { + var s State + s.Init(seed) + old := Seed(&s) + s.Reseed() + if Seed(&s) == old { + t.Errorf("Reseed did not change seed") + } +} + +func BenchmarkBlock(b *testing.B) { + var seed [4]uint64 + var blocks [32]uint64 + + for i := 0; i < b.N; i++ { + Block(&seed, &blocks, 0) + } + b.SetBytes(32 * 8) +} + +func TestBlockGeneric(t *testing.T) { + var b1, b2 [32]uint64 + s := seed // byte seed + seed := [4]uint64{ + binary.LittleEndian.Uint64(s[0*8:]), + binary.LittleEndian.Uint64(s[1*8:]), + binary.LittleEndian.Uint64(s[2*8:]), + binary.LittleEndian.Uint64(s[3*8:]), + } + + Block(&seed, &b1, 4) + Block_generic(&seed, &b2, 4) + if !slices.Equal(b1[:], b2[:]) { + var out bytes.Buffer + fmt.Fprintf(&out, "%-18s %-18s\n", "block", "block_generic") + for i := range b1 { + suffix := "" + if b1[i] != b2[i] { + suffix = " mismatch!" + } + fmt.Fprintf(&out, "%#016x %#016x%s\n", b1[i], b2[i], suffix) + } + t.Errorf("block and block_generic disagree:\n%s", out.String()) + } +} + +// Golden output test to make sure algorithm never changes, +// so that its use in math/rand/v2 stays stable. +// See https://c2sp.org/chacha8rand. + +var seed = [32]byte([]byte("ABCDEFGHIJKLMNOPQRSTUVWXYZ123456")) + +var output = []uint64{ + 0xb773b6063d4616a5, 0x1160af22a66abc3c, 0x8c2599d9418d287c, 0x7ee07e037edc5cd6, + 0xcfaa9ee02d1c16ad, 0x0e090eef8febea79, 0x3c82d271128b5b3e, 0x9c5addc11252a34f, + 0xdf79bb617d6ceea6, 0x36d553591f9d736a, 0xeef0d14e181ee01f, 0x089bfc760ae58436, + 0xd9e52b59cc2ad268, 0xeb2fb4444b1b8aba, 0x4f95c8a692c46661, 0xc3c6323217cae62c, + 0x91ebb4367f4e2e7e, 0x784cf2c6a0ec9bc6, 0x5c34ec5c34eabe20, 0x4f0a8f515570daa8, + 0xfc35dcb4113d6bf2, 0x5b0da44c645554bc, 0x6d963da3db21d9e1, 0xeeaefc3150e500f3, + 0x2d37923dda3750a5, 0x380d7a626d4bc8b0, 0xeeaf68ede3d7ee49, 0xf4356695883b717c, + 0x846a9021392495a4, 0x8e8510549630a61b, 0x18dc02545dbae493, 0x0f8f9ff0a65a3d43, + 0xccf065f7190ff080, 0xfd76d1aa39673330, 0x95d232936cba6433, 0x6c7456d1070cbd17, + 0x462acfdaff8c6562, 0x5bafab866d34fc6a, 0x0c862f78030a2988, 0xd39a83e407c3163d, + 0xc00a2b7b45f22ebf, 0x564307c62466b1a9, 0x257e0424b0c072d4, 0x6fb55e99496c28fe, + 0xae9873a88f5cd4e0, 0x4657362ac60d3773, 0x1c83f91ecdf23e8e, 0x6fdc0792c15387c0, + 0x36dad2a30dfd2b5c, 0xa4b593290595bdb7, 0x4de18934e4cc02c5, 0xcdc0d604f015e3a7, + 0xfba0dbf69ad80321, 0x60e8bea3d139de87, 0xd18a4d851ef48756, 0x6366447c2215f34a, + 0x05682e97d3d007ee, 0x4c0e8978c6d54ab2, 0xcf1e9f6a6712edc2, 0x061439414c80cfd3, + 0xd1a8b6e2745c0ead, 0x31a7918d45c410e8, 0xabcc61ad90216eec, 0x4040d92d2032a71a, + 0x3cd2f66ffb40cd68, 0xdcd051c07295857a, 0xeab55cbcd9ab527e, 0x18471dce781bdaac, + 0xf7f08cd144dc7252, 0x5804e0b13d7f40d1, 0x5cb1a446e4b2d35b, 0xe6d4a728d2138a06, + 0x05223e40ca60dad8, 0x2d61ec3206ac6a68, 0xab692356874c17b8, 0xc30954417676de1c, + 0x4f1ace3732225624, 0xfba9510813988338, 0x997f200f52752e11, 0x1116aaafe86221fa, + 0x07ce3b5cb2a13519, 0x2956bc72bc458314, 0x4188b7926140eb78, 0x56ca6dbfd4adea4d, + 0x7fe3c22349340ce5, 0x35c08f9c37675f8a, 0x11e1c7fbef5ed521, 0x98adc8464ec1bc75, + 0xd163b2c73d1203f8, 0x8c761ee043a2f3f3, 0x24b99d6accecd7b7, 0x793e31aa112f0370, + 0x8e87dc2a19285139, 0x4247ae04f7096e25, 0x514f3122926fe20f, 0xdc6fb3f045d2a7e9, + 0x15cb30cecdd18eba, 0xcbc7fdecf6900274, 0x3fb5c696dc8ba021, 0xd1664417c8d274e6, + 0x05f7e445ea457278, 0xf920bbca1b9db657, 0x0c1950b4da22cb99, 0xf875baf1af09e292, + 0xbed3d7b84250f838, 0xf198e8080fd74160, 0xc9eda51d9b7ea703, 0xf709ef55439bf8f6, + 0xd20c74feebf116fc, 0x305668eb146d7546, 0x829af3ec10d89787, 0x15b8f9697b551dbc, + 0xfc823c6c8e64b8c9, 0x345585e8183b40bc, 0x674b4171d6581368, 0x1234d81cd670e9f7, + 0x0e505210d8a55e19, 0xe8258d69eeeca0dc, 0x05d4c452e8baf67e, 0xe8dbe30116a45599, + 0x1cf08ce1b1176f00, 0xccf7d0a4b81ecb49, 0x303fea136b2c430e, 0x861d6c139c06c871, + 0x5f41df72e05e0487, 0x25bd7e1e1ae26b1d, 0xbe9f4004d662a41d, 0x65bf58d483188546, + 0xd1b27cff69db13cc, 0x01a6663372c1bb36, 0x578dd7577b727f4d, 0x19c78f066c083cf6, + 0xdbe014d4f9c391bb, 0x97fbb2dd1d13ffb3, 0x31c91e0af9ef8d4f, 0x094dfc98402a43ba, + 0x069bd61bea37b752, 0x5b72d762e8d986ca, 0x72ee31865904bc85, 0xd1f5fdc5cd36c33e, + 0xba9b4980a8947cad, 0xece8f05eac49ab43, 0x65fe1184abae38e7, 0x2d7cb9dea5d31452, + 0xcc71489476e467e3, 0x4c03a258a578c68c, 0x00efdf9ecb0fd8fc, 0x9924cad471e2666d, + 0x87f8668318f765e9, 0xcb4dc57c1b55f5d8, 0xd373835a86604859, 0xe526568b5540e482, + 0x1f39040f08586fec, 0xb764f3f00293f8e6, 0x049443a2f6bd50a8, 0x76fec88697d3941a, + 0x3efb70d039bae7a2, 0xe2f4611368eca8a8, 0x7c007a96e01d2425, 0xbbcce5768e69c5bf, + 0x784fb4985c42aac3, 0xf72b5091aa223874, 0x3630333fb1e62e07, 0x8e7319ebdebbb8de, + 0x2a3982bca959fa00, 0xb2b98b9f964ba9b3, 0xf7e31014adb71951, 0xebd0fca3703acc82, + 0xec654e2a2fe6419a, 0xb326132d55a52e2c, 0x2248c57f44502978, 0x32710c2f342daf16, + 0x0517b47b5acb2bec, 0x4c7a718fca270937, 0xd69142bed0bcc541, 0xe40ebcb8ff52ce88, + 0x3e44a2dbc9f828d4, 0xc74c2f4f8f873f58, 0x3dbf648eb799e45b, 0x33f22475ee0e86f8, + 0x1eb4f9ee16d47f65, 0x40f8d2b8712744e3, 0xb886b4da3cb14572, 0x2086326fbdd6f64d, + 0xcc3de5907dd882b9, 0xa2e8b49a5ee909df, 0xdbfb8e7823964c10, 0x70dd6089ef0df8d5, + 0x30141663cdd9c99f, 0x04b805325c240365, 0x7483d80314ac12d6, 0x2b271cb91aa7f5f9, + 0x97e2245362abddf0, 0x5a84f614232a9fab, 0xf71125fcda4b7fa2, 0x1ca5a61d74b27267, + 0x38cc6a9b3adbcb45, 0xdde1bb85dc653e39, 0xe9d0c8fa64f89fd4, 0x02c5fb1ecd2b4188, + 0xf2bd137bca5756e5, 0xadefe25d121be155, 0x56cd1c3c5d893a8e, 0x4c50d337beb65bb9, + 0x918c5151675cf567, 0xaba649ffcfb56a1e, 0x20c74ab26a2247cd, 0x71166bac853c08da, + 0xb07befe2e584fc5d, 0xda45ff2a588dbf32, 0xdb98b03c4d75095e, 0x60285ae1aaa65a4c, + 0xf93b686a263140b8, 0xde469752ee1c180e, 0xcec232dc04129aae, 0xeb916baa1835ea04, + 0xd49c21c8b64388ff, 0x72a82d9658864888, 0x003348ef7eac66a8, 0x7f6f67e655b209eb, + 0x532ffb0b7a941b25, 0xd940ade6128deede, 0xdf24f2a1af89fe23, 0x95aa3b4988195ae0, + 0x3da649404f94be4a, 0x692dad132c3f7e27, 0x40aee76ecaaa9eb8, 0x1294a01e09655024, + 0x6df797abdba4e4f5, 0xea2fb6024c1d7032, 0x5f4e0492295489fc, 0x57972914ea22e06a, + 0x9a8137d133aad473, 0xa2e6dd6ae7cdf2f3, 0x9f42644f18086647, 0x16d03301c170bd3e, + 0x908c416fa546656d, 0xe081503be22e123e, 0x077cf09116c4cc72, 0xcbd25cd264b7f229, + 0x3db2f468ec594031, 0x46c00e734c9badd5, 0xd0ec0ac72075d861, 0x3037cb3cf80b7630, + 0x574c3d7b3a2721c6, 0xae99906a0076824b, 0xb175a5418b532e70, 0xd8b3e251ee231ddd, + 0xb433eec25dca1966, 0x530f30dc5cff9a93, 0x9ff03d98b53cd335, 0xafc4225076558cdf, + 0xef81d3a28284402a, 0x110bdbf51c110a28, 0x9ae1b255d027e8f6, 0x7de3e0aa24688332, + 0xe483c3ecd2067ee2, 0xf829328b276137e6, 0xa413ccad57562cad, 0xe6118e8b496acb1f, + 0x8288dca6da5ec01f, 0xa53777dc88c17255, 0x8a00f1e0d5716eda, 0x618e6f47b7a720a8, + 0x9e3907b0c692a841, 0x978b42ca963f34f3, 0x75e4b0cd98a7d7ef, 0xde4dbd6e0b5f4752, + 0x0252e4153f34493f, 0x50f0e7d803734ef9, 0x237766a38ed167ee, 0x4124414001ee39a0, + 0xd08df643e535bb21, 0x34f575b5a9a80b74, 0x2c343af87297f755, 0xcd8b6d99d821f7cb, + 0xe376fd7256fc48ae, 0xe1b06e7334352885, 0xfa87b26f86c169eb, 0x36c1604665a971de, + 0xdba147c2239c8e80, 0x6b208e69fc7f0e24, 0x8795395b6f2b60c3, 0x05dabee9194907f4, + 0xb98175142f5ed902, 0x5e1701e2021ddc81, 0x0875aba2755eed08, 0x778d83289251de95, + 0x3bfbe46a039ecb31, 0xb24704fce4cbd7f9, 0x6985ffe9a7c91e3d, 0xc8efb13df249dabb, + 0xb1037e64b0f4c9f6, 0x55f69fd197d6b7c3, 0x672589d71d68a90c, 0xbebdb8224f50a77e, + 0x3f589f80007374a7, 0xd307f4635954182a, 0xcff5850c10d4fd90, 0xc6da02dfb6408e15, + 0x93daeef1e2b1a485, 0x65d833208aeea625, 0xe2b13fa13ed3b5fa, 0x67053538130fb68e, + 0xc1042f6598218fa9, 0xee5badca749b8a2e, 0x6d22a3f947dae37d, 0xb62c6d1657f4dbaf, + 0x6e007de69704c20b, 0x1af2b913fc3841d8, 0xdc0e47348e2e8e22, 0x9b1ddef1cf958b22, + 0x632ed6b0233066b8, 0xddd02d3311bed8f2, 0xf147cfe1834656e9, 0x399aaa49d511597a, + 0x6b14886979ec0309, 0x64fc4ac36b5afb97, 0xb82f78e07f7cf081, 0x10925c9a323d0e1b, + 0xf451c79ee13c63f6, 0x7c2fc180317876c7, 0x35a12bd9eecb7d22, 0x335654a539621f90, + 0xcc32a3f35db581f0, 0xc60748a80b2369cb, 0x7c4dd3b08591156b, 0xac1ced4b6de22291, + 0xa32cfa2df134def5, 0x627108918dea2a53, 0x0555b1608fcb4ff4, 0x143ee7ac43aaa33c, + 0xdae90ce7cf4fc218, 0x4d68fc2582bcf4b5, 0x37094e1849135d71, 0xf7857e09f3d49fd8, + 0x007538c503768be7, 0xedf648ba2f6be601, 0xaa347664dd72513e, 0xbe63893c6ef23b86, + 0x130b85710605af97, 0xdd765c6b1ef6ab56, 0xf3249a629a97dc6b, 0x2a114f9020fab8e5, + 0x5a69e027cfc6ad08, 0x3c4ccb36f1a5e050, 0x2e9e7d596834f0a5, 0x2430be6858fce789, + 0xe90b862f2466e597, 0x895e2884f159a9ec, 0x26ab8fa4902fcb57, 0xa6efff5c54e1fa50, + 0x333ac4e5811a8255, 0xa58d515f02498611, 0xfe5a09dcb25c6ef4, 0x03898988ab5f5818, + 0x289ff6242af6c617, 0x3d9dd59fd381ea23, 0x52d7d93d8a8aae51, 0xc76a123d511f786f, + 0xf68901edaf00c46c, 0x8c630871b590de80, 0x05209c308991e091, 0x1f809f99b4788177, + 0x11170c2eb6c19fd8, 0x44433c779062ba58, 0xc0acb51af1874c45, 0x9f2e134284809fa1, + 0xedb523bd15c619fa, 0x02d97fd53ecc23c0, 0xacaf05a34462374c, 0xddd9c6d34bffa11f, +} |