From 43a123c1ae6613b3efeed291fa552ecd909d3acf Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Tue, 16 Apr 2024 21:23:18 +0200
Subject: Adding upstream version 1.20.14.

Signed-off-by: Daniel Baumann
---
 src/internal/bytealg/indexbyte_ppc64x.s | 391 ++++++++++++++++++++++++++++++++
 1 file changed, 391 insertions(+)
 create mode 100644 src/internal/bytealg/indexbyte_ppc64x.s

diff --git a/src/internal/bytealg/indexbyte_ppc64x.s b/src/internal/bytealg/indexbyte_ppc64x.s
new file mode 100644
index 0000000..1a6e852
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_ppc64x.s
@@ -0,0 +1,391 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64 || ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+	// R3 = byte array pointer
+	// R4 = length
+	MOVD	R6, R5		// R5 = byte
+	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+	BR	indexbytebody<>(SB)
+
+TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
+	// R3 = string
+	// R4 = length
+	// R5 = byte
+	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+	BR	indexbytebody<>(SB)
+
+// R3 = addr of string
+// R4 = len of string
+// R5 = byte to find
+// R16 = 1 if running on a POWER9 system, 0 otherwise
+// On exit:
+// R3 = return value
+TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R3,R17		// Save base address for calculating the index later.
+	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
+	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
+	ADD	R4,R3,R7	// Last acceptable address in R7.
+
+	RLDIMI	$16,R5,$32,R5
+	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
+	MOVD	$-1,R9
+	RLWNM	$3,R3,$26,$28,R6	// shift amount for mask (r3&0x7)*8
+	RLDIMI	$32,R5,$0,R5
+	MOVD	R7,R10		// Save last acceptable address in R10 for later.
+	ADD	$-1,R7,R7
+#ifdef GOARCH_ppc64le
+	SLD	R6,R9,R9	// Prepare mask for Little Endian
+#else
+	SRD	R6,R9,R9	// Same for Big Endian
+#endif
+	BLT	small_string	// Jump to the small string case if it's <32 bytes.
+	CMP	R16,$1		// optimize for power8 v power9
+	BNE	power8
+	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
+	MTVRD	R5,V1
+	LVSL	(R0+R0),V11	// set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
+	VSLB	V11,V10,V10	// to extract the first bit of match result into GPR
+	VSPLTB	$7,V1,V1	// Replicate byte across V1
+	CMP	R4,$64
+	MOVD	$16,R11
+	MOVD	R3,R8
+	BLT	cmp32
+	MOVD	$32,R12
+	MOVD	$48,R6
+
+loop64:
+	LXVB16X	(R0)(R8),V2	// scan 64 bytes at a time
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0	// match found at R8, jump out
+
+	LXVB16X	(R8)(R11),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat1	// match found at R8+16 bytes, jump out
+
+	LXVB16X	(R8)(R12),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat2	// match found at R8+32 bytes, jump out
+
+	LXVB16X	(R8)(R6),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat3	// match found at R8+48 bytes, jump out
+	ADD	$64,R8
+	ADD	$-64,R4
+	CMP	R4,$64		// >=64 bytes left to scan?
+	BGE	loop64
+	CMP	R4,$32
+	BLT	rem		// jump to rem if there are < 32 bytes left
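+	// Between 32 and 63 bytes remain: cmp32 checks two more 16-byte
+	// vectors, then falls through to rem, which re-aligns R8 so the
+	// doubleword-at-a-time small_string code can finish the last <32
+	// bytes without needing a separate vector tail.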
+cmp32:
+	LXVB16X	(R0)(R8),V2	// 32-63 bytes left
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0	// match found at R8
+
+	LXVB16X	(R11)(R8),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat1	// match found at R8+16
+
+	ADD	$32,R8
+	ADD	$-32,R4
+rem:
+	RLDICR	$0,R8,$60,R8	// align address to reuse code for tail end processing
+	BR	small_string
+
+foundat3:
+	ADD	$16,R8
+foundat2:
+	ADD	$16,R8
+foundat1:
+	ADD	$16,R8
+foundat0:
+	// Compress the result into a single doubleword and
+	// move it to a GPR for the final calculation.
+	VBPERMQ	V6,V10,V6
+	MFVRD	V6,R3
+	// Count leading zeroes up to the match, which ends up in the low 16 bits
+	// in both endian modes; compute the index by subtracting 16 from the count.
+	CNTLZW	R3,R11
+	ADD	$-16,R11
+	ADD	R8,R11,R3	// Calculate byte address
+	SUB	R17,R3
+	RET
+power8:
+	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
+	// in V0, V1 and V10, then branch to the preloop.
+	ANDCC	$63,R3,R11
+	BEQ	CR0,qw_align
+	RLDICL	$0,R3,$61,R11
+
+	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
+	CMPB	R12,R5,R3	// Check for a match.
+	AND	R9,R3,R3	// Mask bytes below s_base
+	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
+	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
+	BNE	CR7,done
+	ADD	$8,R8,R8
+	ADD	$-8,R4,R4
+	ADD	R4,R11,R4
+
+	// Check for quadword alignment
+	ANDCC	$15,R8,R11
+	BEQ	CR0,qw_align
+
+	// Not aligned, so handle the next doubleword
+	MOVD	0(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR7
+	BNE	CR7,done
+	ADD	$8,R8,R8
+	ADD	$-8,R4,R4
+
+	// Either quadword aligned or 64-byte aligned at this point. We can use LVX.
+qw_align:
+
+	// Set up auxiliary data for the vectorized algorithm.
+	VSPLTISB	$0,V0	// Replicate 0 across V0
+	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
+	MTVRD	R5,V1
+	LVSL	(R0+R0),V11
+	VSLB	V11,V10,V10
+	VSPLTB	$7,V1,V1	// Replicate byte across V1
+	CMPU	R4, $64		// If len ≤ 64, don't use the vectorized loop
+	BLE	tail
+
+	// We will load 4 quadwords per iteration in the loop, so check for
+	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
+	ANDCC	$63,R8,R11
+	BEQ	CR0,preloop
+
+	// Not 64-byte aligned. Load one quadword at a time until aligned.
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	ADD	$-16,R4,R4
+
+	ANDCC	$63,R8,R11
+	BEQ	CR0,preloop
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	ADD	$-16,R4,R4
+
+	ANDCC	$63,R8,R11
+	BEQ	CR0,preloop
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
+	BNE	CR6,found_qw_align
+	ADD	$-16,R4,R4
+	ADD	$16,R8,R8
+
+	// 64-byte aligned. Prepare for the main loop.
+preloop:
+	CMPU	R4,$64
+	BLE	tail		// If len ≤ 64, don't use the vectorized loop
+
+	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
+	// per loop iteration. The last doubleword is in R10, so our loop counter
+	// starts at (R10-R8)/64.
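+	// For example, with R8 64-byte aligned and R10-R8 = 200, CTR is set
+	// to 200>>6 = 3 full 64-byte iterations; the low six bits (200&63 = 8
+	// bytes) are recovered into R4 after the loop and handled by tail.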
+	SUB	R8,R10,R6
+	SRD	$6,R6,R9	// Loop counter in R9
+	MOVD	R9,CTR
+
+	ADD	$-64,R8,R8	// Adjust index for loop entry
+	MOVD	$16,R11		// Load offsets for the vector loads
+	MOVD	$32,R9
+	MOVD	$48,R7
+
+	// Main loop: we will load 64 bytes per iteration
+loop:
+	ADD	$64,R8,R8	// Fuse addi+lvx for performance
+	LVX	(R8+R0),V2	// Load 4 16-byte vectors
+	LVX	(R8+R11),V3
+	VCMPEQUB	V1,V2,V6	// Look for byte in each vector
+	VCMPEQUB	V1,V3,V7
+
+	LVX	(R8+R9),V4
+	LVX	(R8+R7),V5
+	VCMPEQUB	V1,V4,V8
+	VCMPEQUB	V1,V5,V9
+
+	VOR	V6,V7,V11	// Compress the result in a single vector
+	VOR	V8,V9,V12
+	VOR	V11,V12,V13
+	VCMPEQUBCC	V0,V13,V14	// Check for byte
+	BGE	CR6,found
+	BC	16,0,loop	// bdnz loop
+
+	// Handle the trailing bytes, or R4 ≤ 64
+	RLDICL	$0,R6,$58,R4
+	ADD	$64,R8,R8
+tail:
+	CMPU	R4,$0
+	BEQ	notfound
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	CMPU	R4,$16,CR6
+	BLE	CR6,notfound
+	ADD	$-16,R4,R4
+
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	CMPU	R4,$16,CR6
+	BLE	CR6,notfound
+	ADD	$-16,R4,R4
+
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	CMPU	R4,$16,CR6
+	BLE	CR6,notfound
+	ADD	$-16,R4,R4
+
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
+
+notfound:
+	MOVD	$-1, R3
+	RET
+
+found:
+	// We will now compress the results into a single doubleword,
+	// so it can be moved to a GPR for the final index calculation.
+
+	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
+	// first bit of each byte into bits 48-63.
+	VBPERMQ	V6,V10,V6
+	VBPERMQ	V7,V10,V7
+	VBPERMQ	V8,V10,V8
+	VBPERMQ	V9,V10,V9
+
+	// Shift each 16-bit component into its correct position for
+	// merging into a single doubleword.
+#ifdef GOARCH_ppc64le
+	VSLDOI	$2,V7,V7,V7
+	VSLDOI	$4,V8,V8,V8
+	VSLDOI	$6,V9,V9,V9
+#else
+	VSLDOI	$6,V6,V6,V6
+	VSLDOI	$4,V7,V7,V7
+	VSLDOI	$2,V8,V8,V8
+#endif
+
+	// Merge V6-V9 into a single doubleword and move to a GPR.
+	VOR	V6,V7,V11
+	VOR	V8,V9,V4
+	VOR	V4,V11,V4
+	MFVRD	V4,R3
+
+#ifdef GOARCH_ppc64le
+	ADD	$-1,R3,R11
+	ANDN	R3,R11,R11
+	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
+#else
+	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
+#endif
+	ADD	R8,R11,R3	// Calculate byte address
+
+return:
+	SUB	R17, R3
+	RET
+
+found_qw_align:
+	// Use the same algorithm as above. Compress the result into
+	// a single doubleword and move it to a GPR for the final
+	// calculation.
+	VBPERMQ	V6,V10,V6
+
+#ifdef GOARCH_ppc64le
+	MFVRD	V6,R3
+	ADD	$-1,R3,R11
+	ANDN	R3,R11,R11
+	POPCNTD	R11,R11
+#else
+	VSLDOI	$6,V6,V6,V6
+	MFVRD	V6,R3
+	CNTLZD	R3,R11
+#endif
+	ADD	R8,R11,R3
+	CMPU	R11,R4
+	BLT	return
+	BR	notfound
+	PCALIGN	$16
+
+done:
+	ADD	$-1,R10,R6
+	// Offset of last index for the final
+	// doubleword comparison
+	RLDICL	$0,R6,$61,R6
+	// At this point, R3 has 0xFF in the same position as the byte we are
+	// looking for in the doubleword. Use that to calculate the exact index
+	// of the byte.
+#ifdef GOARCH_ppc64le
+	ADD	$-1,R3,R11
+	ANDN	R3,R11,R11
+	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
+#else
+	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
+#endif
+	CMPU	R8,R7		// Check if we are at the last doubleword.
+	SRD	$3,R11		// Convert the zero count to a byte offset.
+	ADD	R11,R8,R3
+	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
+	BNE	return
+	BLE	CR7,return
+	BR	notfound
+
+small_string:
+	// Process a string of length < 32 bytes.
+	// We unroll this loop for better performance.
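+	// CMPB compares R12 and R5 byte by byte, writing 0xFF into each byte
+	// of R3 where the two match and 0x00 elsewhere. Since R5 holds the
+	// target byte replicated eight times, a nonzero R3 means this
+	// doubleword contains the byte, and done converts the position of
+	// that 0xFF byte into the final index.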
+	CMPU	R4,$0		// Check for length=0
+	BEQ	notfound
+
+	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
+	CMPB	R12,R5,R3	// Check for a match.
+	AND	R9,R3,R3	// Mask bytes below s_base.
+	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
+	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
+	CMPU	R8,R7
+	BNE	CR7,done
+	BEQ	notfound	// Hit length.
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	CMPU	R8,R7
+	BNE	CR6,done
+	BEQ	notfound
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	CMPU	R8,R7
+	BNE	CR6,done
+	BEQ	notfound
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	CMPU	R8,R7
+	BNE	CR6,done
+	BEQ	notfound
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	BNE	CR6,done
+	BR	notfound
+
--
cgit v1.2.3
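
For reference, both entry points in the patch implement the same contract as
the minimal Go sketch below (the name indexByteRef is hypothetical; the real
generic fallback lives elsewhere in internal/bytealg): return the index of
the first occurrence of the byte, or -1 when it is absent.

	package main

	import "fmt"

	// indexByteRef is a scalar reference for the assembly above: it scans
	// the slice one byte at a time instead of 64 bytes per iteration.
	func indexByteRef(b []byte, c byte) int {
		for i, x := range b {
			if x == c {
				return i
			}
		}
		return -1
	}

	func main() {
		fmt.Println(indexByteRef([]byte("hello gopher"), 'g')) // 6
		fmt.Println(indexByteRef([]byte("hello gopher"), 'z')) // -1
	}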