diff options
Diffstat (limited to 'src/internal/bytealg/count_s390x.s')
-rw-r--r-- | src/internal/bytealg/count_s390x.s | 169 |
1 files changed, 169 insertions, 0 deletions
diff --git a/src/internal/bytealg/count_s390x.s b/src/internal/bytealg/count_s390x.s new file mode 100644 index 0000000..2a3b5c0 --- /dev/null +++ b/src/internal/bytealg/count_s390x.s @@ -0,0 +1,169 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +// condition code masks +#define EQ 8 +#define NE 7 + +// register assignments +#define R_ZERO R0 +#define R_VAL R1 +#define R_TMP R2 +#define R_PTR R3 +#define R_LEN R4 +#define R_CHAR R5 +#define R_RET R6 +#define R_ITER R7 +#define R_CNT R8 +#define R_MPTR R9 + +// vector register assignments +#define V_ZERO V0 +#define V_CHAR V1 +#define V_MASK V2 +#define V_VAL V3 +#define V_CNT V4 + +// mask for trailing bytes in vector implementation +GLOBL countbytemask<>(SB), RODATA, $16 +DATA countbytemask<>+0(SB)/8, $0x0101010101010101 +DATA countbytemask<>+8(SB)/8, $0x0101010101010101 + +// func Count(b []byte, c byte) int +TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40 + LMG b+0(FP), R_PTR, R_LEN + MOVBZ c+24(FP), R_CHAR + MOVD $ret+32(FP), R_RET + BR countbytebody<>(SB) + +// func CountString(s string, c byte) int +TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32 + LMG s+0(FP), R_PTR, R_LEN + MOVBZ c+16(FP), R_CHAR + MOVD $ret+24(FP), R_RET + BR countbytebody<>(SB) + +// input: +// R_PTR = address of array of bytes +// R_LEN = number of bytes in array +// R_CHAR = byte value to count zero (extended to register width) +// R_RET = address of return value +TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0 + MOVD $internal∕cpu·S390X+const_offsetS390xHasVX(SB), R_TMP + MOVD $countbytemask<>(SB), R_MPTR + CGIJ $EQ, R_LEN, $0, ret0 // return if length is 0. + SRD $4, R_LEN, R_ITER // R_ITER is the number of 16-byte chunks + MOVBZ (R_TMP), R_TMP // load bool indicating support for vector facility + CGIJ $EQ, R_TMP, $0, novx // jump to scalar code if the vector facility is not available + + // Start of vector code (have vector facility). + // + // Set R_LEN to be the length mod 16 minus 1 to use as an index for + // vector 'load with length' (VLL). It will be in the range [-1,14]. + // Also replicate c across a 16-byte vector and initialize V_ZERO. + ANDW $0xf, R_LEN + VLVGB $0, R_CHAR, V_CHAR // V_CHAR = [16]byte{c, 0, ..., 0, 0} + VZERO V_ZERO // V_ZERO = [1]uint128{0} + ADDW $-1, R_LEN + VREPB $0, V_CHAR, V_CHAR // V_CHAR = [16]byte{c, c, ..., c, c} + + // Jump to loop if we have more than 15 bytes to process. + CGIJ $NE, R_ITER, $0, vxchunks + + // Load 1-15 bytes and corresponding mask. + // Note: only the low 32-bits of R_LEN are used for the index. + VLL R_LEN, (R_PTR), V_VAL + VLL R_LEN, (R_MPTR), V_MASK + + // Compare each byte in input chunk against byte to be counted. + // Each byte element will be set to either 0 (no match) or 1 (match). + VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00 + VN V_MASK, V_VAL, V_VAL // mask out most significant 7 bits + + // Accumulate matched byte count in 128-bit integer value. + VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15} + VSUMQF V_VAL, V_ZERO, V_CNT // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3} + + // Return rightmost (lowest) 64-bit part of accumulator. + VSTEG $1, V_CNT, (R_RET) + RET + +vxchunks: + // Load 0x01 into every byte element in the 16-byte mask vector. + VREPIB $1, V_MASK // V_MASK = [16]byte{1, 1, ..., 1, 1} + VZERO V_CNT // initial uint128 count of 0 + +vxloop: + // Load input bytes in 16-byte chunks. + VL (R_PTR), V_VAL + + // Compare each byte in input chunk against byte to be counted. + // Each byte element will be set to either 0 (no match) or 1 (match). + VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00 + VN V_MASK, V_VAL, V_VAL // mask out most significant 7 bits + + // Increment input string address. + MOVD $16(R_PTR), R_PTR + + // Accumulate matched byte count in 128-bit integer value. + VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15} + VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3} + VAQ V_VAL, V_CNT, V_CNT // accumulate + + // Repeat until all 16-byte chunks are done. + BRCTG R_ITER, vxloop + + // Skip to end if there are no trailing bytes. + CIJ $EQ, R_LEN, $-1, vxret + + // Load 1-15 bytes and corresponding mask. + // Note: only the low 32-bits of R_LEN are used for the index. + VLL R_LEN, (R_PTR), V_VAL + VLL R_LEN, (R_MPTR), V_MASK + + // Compare each byte in input chunk against byte to be counted. + // Each byte element will be set to either 0 (no match) or 1 (match). + VCEQB V_CHAR, V_VAL, V_VAL + VN V_MASK, V_VAL, V_VAL + + // Accumulate matched byte count in 128-bit integer value. + VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15} + VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3} + VAQ V_VAL, V_CNT, V_CNT // accumulate + +vxret: + // Return rightmost (lowest) 64-bit part of accumulator. + VSTEG $1, V_CNT, (R_RET) + RET + +novx: + // Start of non-vector code (the vector facility not available). + // + // Initialise counter and constant zero. + MOVD $0, R_CNT + MOVD $0, R_ZERO + +loop: + // Read 1-byte from input and compare. + // Note: avoid putting LOCGR in critical path. + MOVBZ (R_PTR), R_VAL + MOVD $1, R_TMP + MOVD $1(R_PTR), R_PTR + CMPW R_VAL, R_CHAR + LOCGR $NE, R_ZERO, R_TMP // select 0 if no match (1 if there is a match) + ADD R_TMP, R_CNT // accumulate 64-bit result + + // Repeat until all bytes have been checked. + BRCTG R_LEN, loop + +ret: + MOVD R_CNT, (R_RET) + RET + +ret0: + MOVD $0, (R_RET) + RET |