Adding upstream version 16.2.11+ds.upstream/16.2.11+ds upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
commit: 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree: 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/isa-l/erasure_code/ppc64le
parent: Initial commit. (diff)
download: ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz
ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip
16 files changed, 1782 insertions, 0 deletions
diff --git a/src/isa-l/erasure_code/ppc64le/Makefile.am b/src/isa-l/erasure_code/ppc64le/Makefile.am
new file mode 100644
index 000000000..9d263ac22
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/Makefile.am
@@ -0,0 +1,15 @@
+lsrc_ppc64le      += erasure_code/ppc64le/ec_base_vsx.c \
+		erasure_code/ppc64le/gf_vect_mul_vsx.c \
+		erasure_code/ppc64le/gf_vect_dot_prod_vsx.c \
+		erasure_code/ppc64le/gf_vect_mad_vsx.c \
+		erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c \
+		erasure_code/ppc64le/gf_2vect_mad_vsx.c \
+		erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c \
+		erasure_code/ppc64le/gf_3vect_mad_vsx.c \
+		erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c \
+		erasure_code/ppc64le/gf_4vect_mad_vsx.c \
+		erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c \
+		erasure_code/ppc64le/gf_5vect_mad_vsx.c \
+		erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c \
+		erasure_code/ppc64le/gf_6vect_mad_vsx.c
+# ec_base_vsx.c holds the API wrappers; the gf_*_vsx.c files are expected to define the kernels it calls.
diff --git a/src/isa-l/erasure_code/ppc64le/ec_base_vsx.c b/src/isa-l/erasure_code/ppc64le/ec_base_vsx.c
new file mode 100644
index 000000000..05624f1b6
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/ec_base_vsx.c
@@ -0,0 +1,97 @@
+#include "erasure_code.h"
+#include "ec_base_vsx.h"
+/* gf_vect_dot_prod() for ppc64le: passes every argument unchanged to gf_vect_dot_prod_vsx(). */
+void gf_vect_dot_prod(int len, int vlen, unsigned char *v,
+		      unsigned char **src, unsigned char *dest)
+{
+	gf_vect_dot_prod_vsx(len, vlen, v, src, dest);
+}
+/* gf_vect_mad() for ppc64le: passes every argument unchanged to gf_vect_mad_vsx(). */
+void gf_vect_mad(int len, int vec, int vec_i, unsigned char *v,
+		 unsigned char *src, unsigned char *dest)
+{
+	gf_vect_mad_vsx(len, vec, vec_i, v, src, dest);
+
+}
+
+/* Encode `dests` outputs from `srcs` sources. Inputs shorter than 64 bytes use
+ * ec_encode_data_base(); otherwise outputs are produced six at a time and the
+ * remaining 0-5 by the kernel of matching width. */
+void ec_encode_data(int len, int srcs, int dests, unsigned char *v,
+		    unsigned char **src, unsigned char **dest)
+{
+	if (len < 64) {
+		ec_encode_data_base(len, srcs, dests, v, src, dest);
+		return;
+	}
+
+	while (dests >= 6) {
+		gf_6vect_dot_prod_vsx(len, srcs, v, src, dest);
+		v += 6 * srcs * 32;	/* step past the tables of the six outputs just written */
+		dest += 6;
+		dests -= 6;
+	}
+	switch (dests) {	/* dests < 6 here, so no six-output case is needed */
+	case 5:
+		gf_5vect_dot_prod_vsx(len, srcs, v, src, dest);
+		break;
+	case 4:
+		gf_4vect_dot_prod_vsx(len, srcs, v, src, dest);
+		break;
+	case 3:
+		gf_3vect_dot_prod_vsx(len, srcs, v, src, dest);
+		break;
+	case 2:
+		gf_2vect_dot_prod_vsx(len, srcs, v, src, dest);
+		break;
+	case 1:
+		gf_vect_dot_prod_vsx(len, srcs, v, src, *dest);
+		break;
+	default:
+		break;
+	}
+}
+
+/* Fold one source (`data`, index vec_i of k) into `rows` outputs. Inputs shorter
+ * than 64 bytes use ec_encode_data_update_base(); otherwise outputs are updated
+ * six at a time and the remaining 0-5 by the kernel of matching width. */
+void ec_encode_data_update(int len, int k, int rows, int vec_i, unsigned char *v,
+			   unsigned char *data, unsigned char **dest)
+{
+	if (len < 64) {
+		ec_encode_data_update_base(len, k, rows, vec_i, v, data, dest);
+		return;
+	}
+
+	while (rows >= 6) {
+		gf_6vect_mad_vsx(len, k, vec_i, v, data, dest);
+		v += 6 * k * 32;	/* step past the tables of the six outputs just updated */
+		dest += 6;
+		rows -= 6;
+	}
+	switch (rows) {	/* rows < 6 here, so no six-output case is needed */
+	case 5:
+		gf_5vect_mad_vsx(len, k, vec_i, v, data, dest);
+		break;
+	case 4:
+		gf_4vect_mad_vsx(len, k, vec_i, v, data, dest);
+		break;
+	case 3:
+		gf_3vect_mad_vsx(len, k, vec_i, v, data, dest);
+		break;
+	case 2:
+		gf_2vect_mad_vsx(len, k, vec_i, v, data, dest);
+		break;
+	case 1:
+		gf_vect_mad_vsx(len, k, vec_i, v, data, *dest);
+		break;
+	default:
+		break;
+	}
+}
+/* gf_vect_mul() for ppc64le: casts src/dest to byte pointers, calls gf_vect_mul_vsx(), always returns 0. */
+int gf_vect_mul(int len, unsigned char *a, void *src, void *dest)
+{
+	gf_vect_mul_vsx(len, a, (unsigned char *)src, (unsigned char *)dest);
+	return 0;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/ec_base_vsx.h b/src/isa-l/erasure_code/ppc64le/ec_base_vsx.h
new file mode 100644
index 000000000..c808629a9
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/ec_base_vsx.h
@@ -0,0 +1,338 @@
+#ifndef _ERASURE_CODE_PPC64LE_H_
+#define _ERASURE_CODE_PPC64LE_H_
+
+#include "erasure_code.h"
+#include <altivec.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__ibmxl__)	/* IBM XL: vec_xl_be load, __vpermxor builtin */
+#define EC_vec_xl(a, b) vec_xl_be(a, b)
+#define EC_vec_permxor(va, vb, vc) __vpermxor(va, vb, vc)
+#elif defined __GNUC__ && __GNUC__ >= 8	/* GCC 8 and later: vec_xl_be load, selector passed as-is */
+#define EC_vec_xl(a, b) vec_xl_be(a, b)
+#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vc)
+#elif defined __GNUC__ && __GNUC__ >= 7	/* GCC 7: selector is complemented (vec_nor) before permxor */
+#if defined _ARCH_PWR9
+#define EC_vec_xl(a, b) vec_vsx_ld(a, b)
+#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vec_nor(vc, vc))
+#else	/* no _ARCH_PWR9: hand-written load; static so no out-of-line definition is required at link time */
+static inline vector unsigned char EC_vec_xl(int off, unsigned char *ptr) {
+	vector unsigned char vc;	/* "b" keeps off out of r0, which lxvd2x would read as literal 0 in the RA slot */
+	__asm__ __volatile__("lxvd2x %x0, %1, %2; xxswapd %x0, %x0" : "=wa" (vc) : "b" (off), "r" (ptr));
+	return vc;
+}
+#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vec_nor(vc, vc))
+#endif
+#else	/* any other compiler: only accepted when _ARCH_PWR8 is defined */
+#if defined _ARCH_PWR8
+static inline vector unsigned char EC_vec_xl(int off, unsigned char *ptr) {
+	vector unsigned char vc;	/* "b" keeps off out of r0, which lxvd2x would read as literal 0 in the RA slot */
+	__asm__ __volatile__("lxvd2x %x0, %1, %2; xxswapd %x0, %x0" : "=wa" (vc) : "b" (off), "r" (ptr));
+	return vc;
+}
+#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vec_nor(vc, vc))
+#else
+#error "This code is only supported on ppc64le."
+#endif
+#endif
+
+/**
+ * @brief GF(2^8) vector multiply. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constant and save to destination array. Can be used for erasure coding encode
+ * and decode update when only one source is available at a time. Function
+ * requires pre-calculation of a 32 byte constant array based on the input
+ * coefficients.
+ * @requires VSX
+ *
+ * @param len    Length of each vector in bytes.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 32.
+ * @param src    Pointer to source input array.
+ * @param dest   Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_mul_vsx(int len, unsigned char *gftbls, unsigned char *src, unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector dot product. VSX version.
+ *
+ * Does a GF(2^8) dot product across each byte of the input array and a constant
+ * set of coefficients to produce each byte of the output. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 32*vlen byte constant array based on the input coefficients.
+ * @requires VSX
+ *
+ * @param len    Length of each vector in bytes.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
+ *               on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+			  unsigned char **src, unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector dot product with two outputs. VSX version.
+ *
+ * Vector dot product optimized to calculate two outputs at a time. Does two
+ * GF(2^8) dot products across each byte of the input array and two constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 2*32*vlen byte constant array based on the two sets of input coefficients.
+ * @requires VSX
+ *
+ * @param len    Length of each vector in bytes.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_2vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+			   unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with three outputs. VSX version.
+ *
+ * Vector dot product optimized to calculate three outputs at a time. Does three
+ * GF(2^8) dot products across each byte of the input array and three constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 3*32*vlen byte constant array based on the three sets of input coefficients.
+ * @requires VSX
+ *
+ * @param len    Length of each vector in bytes.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_3vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+			   unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with four outputs. VSX version.
+ *
+ * Vector dot product optimized to calculate four outputs at a time. Does four
+ * GF(2^8) dot products across each byte of the input array and four constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 4*32*vlen byte constant array based on the four sets of input coefficients.
+ * @requires VSX
+ *
+ * @param len    Length of each vector in bytes.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_4vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+			   unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with five outputs. VSX version.
+ *
+ * Vector dot product optimized to calculate five outputs at a time. Does five
+ * GF(2^8) dot products across each byte of the input array and five constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 5*32*vlen byte constant array based on the five sets of input coefficients.
+ * @requires VSX
+ *
+ * @param len    Length of each vector in bytes. Must be >= 16.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_5vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+			   unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with six outputs. VSX version.
+ *
+ * Vector dot product optimized to calculate six outputs at a time. Does six
+ * GF(2^8) dot products across each byte of the input array and six constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 6*32*vlen byte constant array based on the six sets of input coefficients.
+ * @requires VSX
+ *
+ * @param len    Length of each vector in bytes.
+ * @param vlen   Number of vector sources.
+ * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
+ *               based on the array of input coefficients.
+ * @param src    Array of pointers to source inputs.
+ * @param dest   Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_6vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+			   unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply accumulate. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constant and add to destination array. Can be used for erasure coding encode
+ * and decode update when only one source is available at a time. Function
+ * requires pre-calculation of a 32*vec byte constant array based on the input
+ * coefficients.
+ * @requires VSX
+ *
+ * @param len    Length of each vector in bytes.
+ * @param vec    The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src    Pointer to source input array.
+ * @param dest   Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		     unsigned char *dest);
+/**
+ * @brief GF(2^8) vector multiply with 2 accumulate. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 2*32*vec byte constant array based
+ * on the input coefficients.
+ * @requires VSX
+ *
+ * @param len    Length of each vector in bytes.
+ * @param vec    The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 2*32*vec.
+ * @param src    Pointer to source input array.
+ * @param dest   Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_2vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 3 accumulate. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 3*32*vec byte constant array based
+ * on the input coefficients.
+ * @requires VSX
+ *
+ * @param len    Length of each vector in bytes.
+ * @param vec    The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 3*32*vec.
+ * @param src    Pointer to source input array.
+ * @param dest   Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_3vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 4 accumulate. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 4*32*vec byte constant array based
+ * on the input coefficients.
+ * @requires VSX
+ *
+ * @param len    Length of each vector in bytes.
+ * @param vec    The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 4*32*vec.
+ * @param src    Pointer to source input array.
+ * @param dest   Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_4vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 5 accumulate. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires VSX
+ *
+ * @param len    Length of each vector in bytes.
+ * @param vec    The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src    Pointer to source input array.
+ * @param dest   Array of pointers to destination input/outputs.
+ * @returns none
+ */
+void gf_5vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 6 accumulate. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires VSX
+ *
+ * @param len    Length of each vector in bytes.
+ * @param vec    The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src    Pointer to source input array.
+ * @param dest   Array of pointers to destination input/outputs.
+ * @returns none
+ */
+void gf_6vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ERASURE_CODE_PPC64LE_H_
diff --git a/src/isa-l/erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c
new file mode 100644
index 000000000..3cb269cce
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c
@@ -0,0 +1,83 @@
+#include "ec_base_vsx.h"
+
+/* Two-output GF(2^8) dot product; the full parameter contract is documented in ec_base_vsx.h. */
+void gf_2vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+			   unsigned char **src, unsigned char **dest)
+{
+	unsigned char *s, *t0, *t1;
+	vector unsigned char vX1, vX2, vX3, vX4;
+	vector unsigned char vY1, vY2, vY3, vY4;
+	vector unsigned char vYD, vYE, vYF, vYG;
+	vector unsigned char vhi0, vlo0, vhi1, vlo1;
+	int i, j, head;
+	/* Under 128 sources: seed each output from src[0], then fold in src[1..] with the mad kernel. */
+	if (vlen < 128) {
+		gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
+		gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
+
+		for (j = 1; j < vlen; j++) {
+			gf_2vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
+		}
+		return;
+	}
+
+	t0 = (unsigned char *)dest[0];
+	t1 = (unsigned char *)dest[1];
+	/* The leading len % 64 bytes go through the scalar base routine; the loop below does 64-byte chunks. */
+	head = len % 64;
+	if (head != 0) {
+		gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
+		gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
+	}
+
+	for (i = head; i < len - 63; i += 64) {
+		vY1 = vec_splat_u8(0);	/* explicit zero; the old x ^ x idiom read an uninitialized vector */
+		vY2 = vec_splat_u8(0);
+		vY3 = vec_splat_u8(0);
+		vY4 = vec_splat_u8(0);
+
+		vYD = vec_splat_u8(0);
+		vYE = vec_splat_u8(0);
+		vYF = vec_splat_u8(0);
+		vYG = vec_splat_u8(0);
+
+		unsigned char *g0 = &gftbls[0 * 32 * vlen];
+		unsigned char *g1 = &gftbls[1 * 32 * vlen];
+		/* g0/g1 walk the per-source 32-byte tables of output 0 and output 1. */
+		for (j = 0; j < vlen; j++) {
+			s = (unsigned char *)src[j];
+			vX1 = vec_xl(0, s + i);
+			vX2 = vec_xl(16, s + i);
+			vX3 = vec_xl(32, s + i);
+			vX4 = vec_xl(48, s + i);
+
+			vlo0 = EC_vec_xl(0, g0);
+			vhi0 = EC_vec_xl(16, g0);
+			vlo1 = EC_vec_xl(0, g1);
+			vhi1 = EC_vec_xl(16, g1);
+
+			vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+			vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+			vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+			vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+			vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+			vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+			vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+			vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+			g0 += 32;
+			g1 += 32;
+		}
+
+		vec_xst(vY1, 0, t0 + i);
+		vec_xst(vY2, 16, t0 + i);
+		vec_xst(vY3, 0, t1 + i);
+		vec_xst(vY4, 16, t1 + i);
+
+		vec_xst(vYD, 32, t0 + i);
+		vec_xst(vYE, 48, t0 + i);
+		vec_xst(vYF, 32, t1 + i);
+		vec_xst(vYG, 48, t1 + i);
+	}
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_2vect_mad_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_2vect_mad_vsx.c
new file mode 100644
index 000000000..621684a5f
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_2vect_mad_vsx.c
@@ -0,0 +1,65 @@
+#include "ec_base_vsx.h"
+/* Multiply one source by two table rows and XOR the products into dest[0] and dest[1]; see ec_base_vsx.h. */
+void gf_2vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
+		      unsigned char *src, unsigned char **dest)
+{
+	unsigned char *s, *t0, *t1;
+	vector unsigned char vX1, vX2, vX3, vX4;
+	vector unsigned char vY1, vY2, vY3, vY4;
+	vector unsigned char vYD, vYE, vYF, vYG;
+	vector unsigned char vhi0, vlo0, vhi1, vlo1;
+	int i, head;
+
+	s = (unsigned char *)src;
+	t0 = (unsigned char *)dest[0];
+	t1 = (unsigned char *)dest[1];
+	/* The leading len % 64 bytes go through gf_vect_mad_base(); the loop below does 64-byte chunks. */
+	head = len % 64;
+	if (head != 0) {
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
+	}
+	/* The table for output r and source vec_i starts at byte offset (r * vec + vec_i) * 32. */
+	vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+	vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+	vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+	vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+
+	for (i = head; i < len - 63; i += 64) {
+		vX1 = vec_xl(0, s + i);
+		vX2 = vec_xl(16, s + i);
+		vX3 = vec_xl(32, s + i);
+		vX4 = vec_xl(48, s + i);
+
+		vY1 = vec_xl(0, t0 + i);
+		vY2 = vec_xl(16, t0 + i);
+		vYD = vec_xl(32, t0 + i);
+		vYE = vec_xl(48, t0 + i);
+
+		vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+		vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+		vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+		vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+		/* NOTE(review): t1 is read before t0 is written back; presumably the destinations never overlap. */
+		vY3 = vec_xl(0, t1 + i);
+		vY4 = vec_xl(16, t1 + i);
+		vYF = vec_xl(32, t1 + i);
+		vYG = vec_xl(48, t1 + i);
+
+		vec_xst(vY1, 0, t0 + i);
+		vec_xst(vY2, 16, t0 + i);
+		vec_xst(vYD, 32, t0 + i);
+		vec_xst(vYE, 48, t0 + i);
+
+		vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+		vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+		vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+		vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+		vec_xst(vY3, 0, t1 + i);
+		vec_xst(vY4, 16, t1 + i);
+		vec_xst(vYF, 32, t1 + i);
+		vec_xst(vYG, 48, t1 + i);
+	}
+	return;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c
new file mode 100644
index 000000000..23b72dc4b
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c
@@ -0,0 +1,104 @@
+#include "ec_base_vsx.h"
+
+/* Three-output GF(2^8) dot product; the full parameter contract is documented in ec_base_vsx.h. */
+void gf_3vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+			   unsigned char **src, unsigned char **dest)
+{
+	unsigned char *s, *t0, *t1, *t2;
+	vector unsigned char vX1, vX2, vX3, vX4;
+	vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6;
+	vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI;
+	vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2;
+	int i, j, head;
+	/* Under 128 sources: seed each output from src[0], then fold in src[1..] with the mad kernel. */
+	if (vlen < 128) {
+		gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
+		gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
+		gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
+
+		for (j = 1; j < vlen; j++) {
+			gf_3vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
+		}
+		return;
+	}
+
+	t0 = (unsigned char *)dest[0];
+	t1 = (unsigned char *)dest[1];
+	t2 = (unsigned char *)dest[2];
+	/* The leading len % 64 bytes go through the scalar base routine; the loop below does 64-byte chunks. */
+	head = len % 64;
+	if (head != 0) {
+		gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
+		gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
+		gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
+	}
+
+	for (i = head; i < len - 63; i += 64) {
+		vY1 = vec_splat_u8(0);	/* explicit zero; the old x ^ x idiom read an uninitialized vector */
+		vY2 = vec_splat_u8(0);
+		vY3 = vec_splat_u8(0);
+		vY4 = vec_splat_u8(0);
+		vY5 = vec_splat_u8(0);
+		vY6 = vec_splat_u8(0);
+
+		vYD = vec_splat_u8(0);
+		vYE = vec_splat_u8(0);
+		vYF = vec_splat_u8(0);
+		vYG = vec_splat_u8(0);
+		vYH = vec_splat_u8(0);
+		vYI = vec_splat_u8(0);
+
+		unsigned char *g0 = &gftbls[0 * 32 * vlen];
+		unsigned char *g1 = &gftbls[1 * 32 * vlen];
+		unsigned char *g2 = &gftbls[2 * 32 * vlen];
+		/* g0..g2 walk the per-source 32-byte tables of outputs 0..2. */
+		for (j = 0; j < vlen; j++) {
+			s = (unsigned char *)src[j];
+			vX1 = vec_xl(0, s + i);
+			vX2 = vec_xl(16, s + i);
+			vX3 = vec_xl(32, s + i);
+			vX4 = vec_xl(48, s + i);
+
+			vlo0 = EC_vec_xl(0, g0);
+			vhi0 = EC_vec_xl(16, g0);
+			vlo1 = EC_vec_xl(0, g1);
+			vhi1 = EC_vec_xl(16, g1);
+
+			vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+			vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+			vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+			vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+			/* Same EC_vec_xl loader as g0/g1 so all three tables reach permxor in one byte order. */
+			vlo2 = EC_vec_xl(0, g2);
+			vhi2 = EC_vec_xl(16, g2);
+
+			vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+			vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+			vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+			vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+			vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+			vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+			vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+			vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+			g0 += 32;
+			g1 += 32;
+			g2 += 32;
+		}
+
+		vec_xst(vY1, 0, t0 + i);
+		vec_xst(vY2, 16, t0 + i);
+		vec_xst(vY3, 0, t1 + i);
+		vec_xst(vY4, 16, t1 + i);
+		vec_xst(vY5, 0, t2 + i);
+		vec_xst(vY6, 16, t2 + i);
+
+		vec_xst(vYD, 32, t0 + i);
+		vec_xst(vYE, 48, t0 + i);
+		vec_xst(vYF, 32, t1 + i);
+		vec_xst(vYG, 48, t1 + i);
+		vec_xst(vYH, 32, t2 + i);
+		vec_xst(vYI, 48, t2 + i);
+	}
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_3vect_mad_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_3vect_mad_vsx.c
new file mode 100644
index 000000000..ba90c1fdb
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_3vect_mad_vsx.c
@@ -0,0 +1,84 @@
+#include "ec_base_vsx.h"
+/* Multiply one source by three table rows and XOR the products into dest[0..2]; see ec_base_vsx.h. */
+void gf_3vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
+		      unsigned char *src, unsigned char **dest)
+{
+	unsigned char *s, *t0, *t1, *t2;
+	vector unsigned char vX1, vX2, vX3, vX4;
+	vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6;
+	vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI;
+	vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2;
+	int i, head;
+
+	s = (unsigned char *)src;
+	t0 = (unsigned char *)dest[0];
+	t1 = (unsigned char *)dest[1];
+	t2 = (unsigned char *)dest[2];
+	/* The leading len % 64 bytes go through gf_vect_mad_base(); the loop below does 64-byte chunks. */
+	head = len % 64;
+	if (head != 0) {
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2);
+	}
+	/* The table for output r and source vec_i starts at byte offset (r * vec + vec_i) * 32. */
+	vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+	vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+	vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+	vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+	vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+	vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+
+	for (i = head; i < len - 63; i += 64) {
+		vX1 = vec_xl(0, s + i);
+		vX2 = vec_xl(16, s + i);
+		vX3 = vec_xl(32, s + i);
+		vX4 = vec_xl(48, s + i);
+
+		vY1 = vec_xl(0, t0 + i);
+		vY2 = vec_xl(16, t0 + i);
+		vYD = vec_xl(32, t0 + i);
+		vYE = vec_xl(48, t0 + i);
+
+		vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+		vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+		vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+		vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+		/* NOTE(review): the next destination is read before the previous one is stored; presumably they never overlap. */
+		vY3 = vec_xl(0, t1 + i);
+		vY4 = vec_xl(16, t1 + i);
+		vYF = vec_xl(32, t1 + i);
+		vYG = vec_xl(48, t1 + i);
+
+		vec_xst(vY1, 0, t0 + i);
+		vec_xst(vY2, 16, t0 + i);
+		vec_xst(vYD, 32, t0 + i);
+		vec_xst(vYE, 48, t0 + i);
+
+		vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+		vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+		vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+		vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+		vY5 = vec_xl(0, t2 + i);
+		vY6 = vec_xl(16, t2 + i);
+		vYH = vec_xl(32, t2 + i);
+		vYI = vec_xl(48, t2 + i);
+
+		vec_xst(vY3, 0, t1 + i);
+		vec_xst(vY4, 16, t1 + i);
+		vec_xst(vYF, 32, t1 + i);
+		vec_xst(vYG, 48, t1 + i);
+
+		vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+		vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+		vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+		vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+		vec_xst(vY5, 0, t2 + i);
+		vec_xst(vY6, 16, t2 + i);
+		vec_xst(vYH, 32, t2 + i);
+		vec_xst(vYI, 48, t2 + i);
+	}
+	return;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c
new file mode 100644
index 000000000..e65654453
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c
@@ -0,0 +1,124 @@
+#include "ec_base_vsx.h"
+
+/* Four-output GF(2^8) dot product; the full parameter contract is documented in ec_base_vsx.h. */
+void gf_4vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+			   unsigned char **src, unsigned char **dest)
+{
+	unsigned char *s, *t0, *t1, *t2, *t3;
+	vector unsigned char vX1, vX2, vX3, vX4;
+	vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8;
+	vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK;
+	vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3;
+	int i, j, head;
+	/* Under 128 sources: seed each output from src[0], then fold in src[1..] with the mad kernel. */
+	if (vlen < 128) {
+		gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
+		gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
+		gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
+		gf_vect_mul_vsx(len, &gftbls[3 * 32 * vlen], src[0], (unsigned char *)dest[3]);
+
+		for (j = 1; j < vlen; j++) {
+			gf_4vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
+		}
+		return;
+	}
+
+	t0 = (unsigned char *)dest[0];
+	t1 = (unsigned char *)dest[1];
+	t2 = (unsigned char *)dest[2];
+	t3 = (unsigned char *)dest[3];
+	/* The leading len % 64 bytes go through the scalar base routine; the loop below does 64-byte chunks. */
+	head = len % 64;
+	if (head != 0) {
+		gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
+		gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
+		gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
+		gf_vect_dot_prod_base(head, vlen, &gftbls[3 * 32 * vlen], src, t3);
+	}
+
+	for (i = head; i < len - 63; i += 64) {
+		vY1 = vec_splat_u8(0);	/* explicit zero; the old x ^ x idiom read an uninitialized vector */
+		vY2 = vec_splat_u8(0);
+		vY3 = vec_splat_u8(0);
+		vY4 = vec_splat_u8(0);
+		vY5 = vec_splat_u8(0);
+		vY6 = vec_splat_u8(0);
+		vY7 = vec_splat_u8(0);
+		vY8 = vec_splat_u8(0);
+
+		vYD = vec_splat_u8(0);
+		vYE = vec_splat_u8(0);
+		vYF = vec_splat_u8(0);
+		vYG = vec_splat_u8(0);
+		vYH = vec_splat_u8(0);
+		vYI = vec_splat_u8(0);
+		vYJ = vec_splat_u8(0);
+		vYK = vec_splat_u8(0);
+
+		unsigned char *g0 = &gftbls[0 * 32 * vlen];
+		unsigned char *g1 = &gftbls[1 * 32 * vlen];
+		unsigned char *g2 = &gftbls[2 * 32 * vlen];
+		unsigned char *g3 = &gftbls[3 * 32 * vlen];
+		/* g0..g3 walk the per-source 32-byte tables of outputs 0..3. */
+		for (j = 0; j < vlen; j++) {
+			s = (unsigned char *)src[j];
+			vX1 = vec_xl(0, s + i);
+			vX2 = vec_xl(16, s + i);
+			vX3 = vec_xl(32, s + i);
+			vX4 = vec_xl(48, s + i);
+
+			vlo0 = EC_vec_xl(0, g0);
+			vhi0 = EC_vec_xl(16, g0);
+			vlo1 = EC_vec_xl(0, g1);
+			vhi1 = EC_vec_xl(16, g1);
+
+			vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+			vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+			vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+			vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+			/* Same EC_vec_xl loader as g0/g1 so all four tables reach permxor in one byte order. */
+			vlo2 = EC_vec_xl(0, g2);
+			vhi2 = EC_vec_xl(16, g2);
+			vlo3 = EC_vec_xl(0, g3);
+			vhi3 = EC_vec_xl(16, g3);
+
+			vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+			vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+			vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+			vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+			vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+			vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+			vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+			vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+			vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
+			vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
+			vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
+			vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
+
+			g0 += 32;
+			g1 += 32;
+			g2 += 32;
+			g3 += 32;
+		}
+
+		vec_xst(vY1, 0, t0 + i);
+		vec_xst(vY2, 16, t0 + i);
+		vec_xst(vY3, 0, t1 + i);
+		vec_xst(vY4, 16, t1 + i);
+		vec_xst(vY5, 0, t2 + i);
+		vec_xst(vY6, 16, t2 + i);
+		vec_xst(vY7, 0, t3 + i);
+		vec_xst(vY8, 16, t3 + i);
+
+		vec_xst(vYD, 32, t0 + i);
+		vec_xst(vYE, 48, t0 + i);
+		vec_xst(vYF, 32, t1 + i);
+		vec_xst(vYG, 48, t1 + i);
+		vec_xst(vYH, 32, t2 + i);
+		vec_xst(vYI, 48, t2 + i);
+		vec_xst(vYJ, 32, t3 + i);
+		vec_xst(vYK, 48, t3 + i);
+	}
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_4vect_mad_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_4vect_mad_vsx.c
new file mode 100644
index 000000000..7b236b6f8
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_4vect_mad_vsx.c
@@ -0,0 +1,103 @@
+#include "ec_base_vsx.h"
+
+/*
+ * gf_4vect_mad_vsx - accumulate one source buffer into four destinations.
+ *
+ * For each output k in 0..3, the 32-byte table at
+ * gftbls + ((k * vec + vec_i) << 5) is combined with the source bytes by
+ * EC_vec_permxor() and the result is XOR-ed into dest[k] in place.
+ * EC_vec_xl() and EC_vec_permxor() come from ec_base_vsx.h; presumably they
+ * implement a table-driven GF(2^8) multiply - confirm against that header.
+ *
+ * len    - number of bytes to process in src and in each dest[k]
+ * vec    - number of 32-byte tables per output row of gftbls
+ * vec_i  - index of the table used within each row
+ * gftbls - table array, 32 bytes per (row, index) entry
+ * src    - input buffer
+ * dest   - array of four output buffer pointers
+ *
+ * gf_vect_mad_base() is called for the leading len % 64 bytes; the remainder
+ * is processed 64 bytes per iteration with vector loads and stores.
+ */
+void gf_4vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
+		      unsigned char *src, unsigned char **dest)
+{
+	unsigned char *s, *t0, *t1, *t2, *t3;
+	vector unsigned char vX1, vX2, vX3, vX4;
+	vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8;
+	vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK;
+	vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3;
+	int i, head;
+
+	s = (unsigned char *)src;
+	t0 = (unsigned char *)dest[0];
+	t1 = (unsigned char *)dest[1];
+	t2 = (unsigned char *)dest[2];
+	t3 = (unsigned char *)dest[3];
+
+	/* Bytes that do not fill a whole 64-byte iteration are handed to the base routine. */
+	head = len % 64;
+	if (head != 0) {
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2);
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[3 * 32 * vec], src, t3);
+	}
+
+	/*
+	 * Load both 16-byte halves (offset 0 and offset 16) of the table for
+	 * index vec_i in each of the four rows.  They do not change inside the loop.
+	 */
+	vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+	vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+	vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+	vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+	vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+	vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+	vlo3 = EC_vec_xl(0, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
+	vhi3 = EC_vec_xl(16, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
+
+	/* len - head is a multiple of 64, so every iteration handles a full 64-byte block. */
+	for (i = head; i < len - 63; i += 64) {
+		/* 64 bytes of source data, shared by all four outputs. */
+		vX1 = vec_xl(0, s + i);
+		vX2 = vec_xl(16, s + i);
+		vX3 = vec_xl(32, s + i);
+		vX4 = vec_xl(48, s + i);
+
+		/* dest[0]: load the current contents and accumulate. */
+		vY1 = vec_xl(0, t0 + i);
+		vY2 = vec_xl(16, t0 + i);
+		vYD = vec_xl(32, t0 + i);
+		vYE = vec_xl(48, t0 + i);
+
+		vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+		vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+		vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+		vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+		/*
+		 * dest[1] is loaded before dest[0] is written back; the same
+		 * load-ahead ordering repeats for the remaining outputs.
+		 */
+		vY3 = vec_xl(0, t1 + i);
+		vY4 = vec_xl(16, t1 + i);
+		vYF = vec_xl(32, t1 + i);
+		vYG = vec_xl(48, t1 + i);
+
+		vec_xst(vY1, 0, t0 + i);
+		vec_xst(vY2, 16, t0 + i);
+		vec_xst(vYD, 32, t0 + i);
+		vec_xst(vYE, 48, t0 + i);
+
+		/* dest[1]: accumulate. */
+		vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+		vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+		vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+		vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+		vY5 = vec_xl(0, t2 + i);
+		vY6 = vec_xl(16, t2 + i);
+		vYH = vec_xl(32, t2 + i);
+		vYI = vec_xl(48, t2 + i);
+
+		vec_xst(vY3, 0, t1 + i);
+		vec_xst(vY4, 16, t1 + i);
+		vec_xst(vYF, 32, t1 + i);
+		vec_xst(vYG, 48, t1 + i);
+
+		/* dest[2]: accumulate. */
+		vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+		vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+		vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+		vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+		vY7 = vec_xl(0, t3 + i);
+		vY8 = vec_xl(16, t3 + i);
+		vYJ = vec_xl(32, t3 + i);
+		vYK = vec_xl(48, t3 + i);
+
+		vec_xst(vY5, 0, t2 + i);
+		vec_xst(vY6, 16, t2 + i);
+		vec_xst(vYH, 32, t2 + i);
+		vec_xst(vYI, 48, t2 + i);
+
+		/* dest[3]: accumulate and write back. */
+		vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
+		vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
+		vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
+		vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
+
+		vec_xst(vY7, 0, t3 + i);
+		vec_xst(vY8, 16, t3 + i);
+		vec_xst(vYJ, 32, t3 + i);
+		vec_xst(vYK, 48, t3 + i);
+	}
+	return;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c
new file mode 100644
index 000000000..e9eef0e63
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c
@@ -0,0 +1,145 @@
+#include "ec_base_vsx.h"
+
+/*
+ * gf_5vect_dot_prod_vsx - dot product of vlen source buffers into five outputs.
+ *
+ * For output k (0..4) the table for source j is the 32 bytes at
+ * gftbls[(k * vlen + j) * 32].  Each 64-byte source block is combined with
+ * its table by EC_vec_permxor() and the per-source results are XOR-ed
+ * together before being stored to dest[k].  EC_vec_xl() and EC_vec_permxor()
+ * come from ec_base_vsx.h; presumably they implement a table-driven GF(2^8)
+ * multiply - confirm against that header.
+ *
+ * len    - number of bytes to produce in each dest[k]
+ * vlen   - number of source buffers (and of tables per output row)
+ * gftbls - table array, 32 bytes per (output, source) pair
+ * src    - array of vlen input buffer pointers
+ * dest   - array of five output buffer pointers
+ *
+ * When vlen < 128 the work is delegated to gf_vect_mul_vsx() for the first
+ * source and gf_5vect_mad_vsx() for the remaining ones.  Otherwise
+ * gf_vect_dot_prod_base() is called for the leading len % 64 bytes and the
+ * rest is processed 64 bytes per iteration.
+ */
+void gf_5vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+			   unsigned char **src, unsigned char **dest)
+{
+	unsigned char *s, *t0, *t1, *t2, *t3, *t4;
+	vector unsigned char vX1, vX2, vX3, vX4;
+	vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA;
+	vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM;
+	vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3, vhi4, vlo4;
+	int i, j, head;
+
+	if (vlen < 128) {
+		/* First source initialises every output, the others are accumulated. */
+		gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
+		gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
+		gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
+		gf_vect_mul_vsx(len, &gftbls[3 * 32 * vlen], src[0], (unsigned char *)dest[3]);
+		gf_vect_mul_vsx(len, &gftbls[4 * 32 * vlen], src[0], (unsigned char *)dest[4]);
+
+		for (j = 1; j < vlen; j++) {
+			gf_5vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
+		}
+		return;
+	}
+
+	t0 = (unsigned char *)dest[0];
+	t1 = (unsigned char *)dest[1];
+	t2 = (unsigned char *)dest[2];
+	t3 = (unsigned char *)dest[3];
+	t4 = (unsigned char *)dest[4];
+
+	/* Bytes that do not fill a whole 64-byte iteration are handed to the base routine. */
+	head = len % 64;
+	if (head != 0) {
+		gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
+		gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
+		gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
+		gf_vect_dot_prod_base(head, vlen, &gftbls[3 * 32 * vlen], src, t3);
+		gf_vect_dot_prod_base(head, vlen, &gftbls[4 * 32 * vlen], src, t4);
+	}
+
+	/*
+	 * Explicit zero used to reset the accumulators.  XOR-ing a variable
+	 * with itself before it has ever been written reads an indeterminate
+	 * value, so the accumulators are assigned from this constant instead.
+	 */
+	const vector unsigned char vzero = vec_splat_u8(0);
+
+	for (i = head; i < len - 63; i += 64) {
+		/* Accumulators for bytes 0..31 of each output. */
+		vY1 = vzero;
+		vY2 = vzero;
+		vY3 = vzero;
+		vY4 = vzero;
+		vY5 = vzero;
+		vY6 = vzero;
+		vY7 = vzero;
+		vY8 = vzero;
+		vY9 = vzero;
+		vYA = vzero;
+
+		/* Accumulators for bytes 32..63 of each output. */
+		vYD = vzero;
+		vYE = vzero;
+		vYF = vzero;
+		vYG = vzero;
+		vYH = vzero;
+		vYI = vzero;
+		vYJ = vzero;
+		vYK = vzero;
+		vYL = vzero;
+		vYM = vzero;
+
+		/* Table cursors, one per output row; advanced by 32 bytes per source. */
+		unsigned char *g0 = &gftbls[0 * 32 * vlen];
+		unsigned char *g1 = &gftbls[1 * 32 * vlen];
+		unsigned char *g2 = &gftbls[2 * 32 * vlen];
+		unsigned char *g3 = &gftbls[3 * 32 * vlen];
+		unsigned char *g4 = &gftbls[4 * 32 * vlen];
+
+		for (j = 0; j < vlen; j++) {
+			s = (unsigned char *)src[j];
+			vX1 = vec_xl(0, s + i);
+			vX2 = vec_xl(16, s + i);
+			vX3 = vec_xl(32, s + i);
+			vX4 = vec_xl(48, s + i);
+
+			/*
+			 * Every table half is loaded with EC_vec_xl, exactly
+			 * as gf_5vect_mad_vsx() does for the same tables, so
+			 * this path and the vlen < 128 path feed
+			 * EC_vec_permxor() identically laid-out operands.
+			 */
+			vlo0 = EC_vec_xl(0, g0);
+			vhi0 = EC_vec_xl(16, g0);
+			vlo1 = EC_vec_xl(0, g1);
+			vhi1 = EC_vec_xl(16, g1);
+
+			vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+			vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+			vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+			vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+			vlo2 = EC_vec_xl(0, g2);
+			vhi2 = EC_vec_xl(16, g2);
+			vlo3 = EC_vec_xl(0, g3);
+			vhi3 = EC_vec_xl(16, g3);
+
+			vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+			vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+			vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+			vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+			vlo4 = EC_vec_xl(0, g4);
+			vhi4 = EC_vec_xl(16, g4);
+
+			vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+			vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+			vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+			vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+			vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
+			vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
+			vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
+			vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
+
+			vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
+			vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
+			vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
+			vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
+
+			g0 += 32;
+			g1 += 32;
+			g2 += 32;
+			g3 += 32;
+			g4 += 32;
+		}
+
+		/* Bytes 0..31 of each output. */
+		vec_xst(vY1, 0, t0 + i);
+		vec_xst(vY2, 16, t0 + i);
+		vec_xst(vY3, 0, t1 + i);
+		vec_xst(vY4, 16, t1 + i);
+		vec_xst(vY5, 0, t2 + i);
+		vec_xst(vY6, 16, t2 + i);
+		vec_xst(vY7, 0, t3 + i);
+		vec_xst(vY8, 16, t3 + i);
+		vec_xst(vY9, 0, t4 + i);
+		vec_xst(vYA, 16, t4 + i);
+
+		/* Bytes 32..63 of each output. */
+		vec_xst(vYD, 32, t0 + i);
+		vec_xst(vYE, 48, t0 + i);
+		vec_xst(vYF, 32, t1 + i);
+		vec_xst(vYG, 48, t1 + i);
+		vec_xst(vYH, 32, t2 + i);
+		vec_xst(vYI, 48, t2 + i);
+		vec_xst(vYJ, 32, t3 + i);
+		vec_xst(vYK, 48, t3 + i);
+		vec_xst(vYL, 32, t4 + i);
+		vec_xst(vYM, 48, t4 + i);
+	}
+	return;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_5vect_mad_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_5vect_mad_vsx.c
new file mode 100644
index 000000000..7bb7bb211
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_5vect_mad_vsx.c
@@ -0,0 +1,122 @@
+#include "ec_base_vsx.h"
+
+/*
+ * gf_5vect_mad_vsx - accumulate one source buffer into five destinations.
+ *
+ * For each output k in 0..4, the 32-byte table at
+ * gftbls + ((k * vec + vec_i) << 5) is combined with the source bytes by
+ * EC_vec_permxor() and the result is XOR-ed into dest[k] in place.
+ * EC_vec_xl() and EC_vec_permxor() come from ec_base_vsx.h; presumably they
+ * implement a table-driven GF(2^8) multiply - confirm against that header.
+ *
+ * len    - number of bytes to process in src and in each dest[k]
+ * vec    - number of 32-byte tables per output row of gftbls
+ * vec_i  - index of the table used within each row
+ * gftbls - table array, 32 bytes per (row, index) entry
+ * src    - input buffer
+ * dest   - array of five output buffer pointers
+ *
+ * gf_vect_mad_base() is called for the leading len % 64 bytes; the remainder
+ * is processed 64 bytes per iteration with vector loads and stores.
+ */
+void gf_5vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
+		      unsigned char *src, unsigned char **dest)
+{
+	unsigned char *s, *t0, *t1, *t2, *t3, *t4;
+	vector unsigned char vX1, vX2, vX3, vX4;
+	vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA;
+	vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM;
+	vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3, vhi4, vlo4;
+	int i, head;
+
+	s = (unsigned char *)src;
+	t0 = (unsigned char *)dest[0];
+	t1 = (unsigned char *)dest[1];
+	t2 = (unsigned char *)dest[2];
+	t3 = (unsigned char *)dest[3];
+	t4 = (unsigned char *)dest[4];
+
+	/* Bytes that do not fill a whole 64-byte iteration are handed to the base routine. */
+	head = len % 64;
+	if (head != 0) {
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2);
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[3 * 32 * vec], src, t3);
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[4 * 32 * vec], src, t4);
+	}
+
+	/*
+	 * Load both 16-byte halves (offset 0 and offset 16) of the table for
+	 * index vec_i in each of the five rows.  They do not change inside the loop.
+	 */
+	vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+	vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+	vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+	vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+	vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+	vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+	vlo3 = EC_vec_xl(0, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
+	vhi3 = EC_vec_xl(16, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
+	vlo4 = EC_vec_xl(0, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
+	vhi4 = EC_vec_xl(16, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
+
+	/* len - head is a multiple of 64, so every iteration handles a full 64-byte block. */
+	for (i = head; i < len - 63; i += 64) {
+		/* 64 bytes of source data, shared by all five outputs. */
+		vX1 = vec_xl(0, s + i);
+		vX2 = vec_xl(16, s + i);
+		vX3 = vec_xl(32, s + i);
+		vX4 = vec_xl(48, s + i);
+
+		/* dest[0]: load the current contents and accumulate. */
+		vY1 = vec_xl(0, t0 + i);
+		vY2 = vec_xl(16, t0 + i);
+		vYD = vec_xl(32, t0 + i);
+		vYE = vec_xl(48, t0 + i);
+
+		vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+		vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+		vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+		vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+		/*
+		 * dest[1] is loaded before dest[0] is written back; the same
+		 * load-ahead ordering repeats for the remaining outputs.
+		 */
+		vY3 = vec_xl(0, t1 + i);
+		vY4 = vec_xl(16, t1 + i);
+		vYF = vec_xl(32, t1 + i);
+		vYG = vec_xl(48, t1 + i);
+
+		vec_xst(vY1, 0, t0 + i);
+		vec_xst(vY2, 16, t0 + i);
+		vec_xst(vYD, 32, t0 + i);
+		vec_xst(vYE, 48, t0 + i);
+
+		/* dest[1]: accumulate. */
+		vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+		vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+		vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+		vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+		vY5 = vec_xl(0, t2 + i);
+		vY6 = vec_xl(16, t2 + i);
+		vYH = vec_xl(32, t2 + i);
+		vYI = vec_xl(48, t2 + i);
+
+		vec_xst(vY3, 0, t1 + i);
+		vec_xst(vY4, 16, t1 + i);
+		vec_xst(vYF, 32, t1 + i);
+		vec_xst(vYG, 48, t1 + i);
+
+		/* dest[2]: accumulate. */
+		vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+		vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+		vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+		vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+		vY7 = vec_xl(0, t3 + i);
+		vY8 = vec_xl(16, t3 + i);
+		vYJ = vec_xl(32, t3 + i);
+		vYK = vec_xl(48, t3 + i);
+
+		vec_xst(vY5, 0, t2 + i);
+		vec_xst(vY6, 16, t2 + i);
+		vec_xst(vYH, 32, t2 + i);
+		vec_xst(vYI, 48, t2 + i);
+
+		/* dest[3]: accumulate. */
+		vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
+		vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
+		vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
+		vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
+
+		vY9 = vec_xl(0, t4 + i);
+		vYA = vec_xl(16, t4 + i);
+		vYL = vec_xl(32, t4 + i);
+		vYM = vec_xl(48, t4 + i);
+
+		vec_xst(vY7, 0, t3 + i);
+		vec_xst(vY8, 16, t3 + i);
+		vec_xst(vYJ, 32, t3 + i);
+		vec_xst(vYK, 48, t3 + i);
+
+		/* dest[4]: accumulate and write back. */
+		vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
+		vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
+		vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
+		vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
+
+		vec_xst(vY9, 0, t4 + i);
+		vec_xst(vYA, 16, t4 + i);
+		vec_xst(vYL, 32, t4 + i);
+		vec_xst(vYM, 48, t4 + i);
+	}
+	return;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c
new file mode 100644
index 000000000..ac918bd49
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c
@@ -0,0 +1,166 @@
+#include "ec_base_vsx.h"
+
+/*
+ * gf_6vect_dot_prod_vsx - dot product of vlen source buffers into six outputs.
+ *
+ * For output k (0..5) the table for source j is the 32 bytes at
+ * gftbls[(k * vlen + j) * 32].  Each 64-byte source block is combined with
+ * its table by EC_vec_permxor() and the per-source results are XOR-ed
+ * together before being stored to dest[k].  EC_vec_xl() and EC_vec_permxor()
+ * come from ec_base_vsx.h; presumably they implement a table-driven GF(2^8)
+ * multiply - confirm against that header.
+ *
+ * len    - number of bytes to produce in each dest[k]
+ * vlen   - number of source buffers (and of tables per output row)
+ * gftbls - table array, 32 bytes per (output, source) pair
+ * src    - array of vlen input buffer pointers
+ * dest   - array of six output buffer pointers
+ *
+ * When vlen < 128 the work is delegated to gf_vect_mul_vsx() for the first
+ * source and gf_6vect_mad_vsx() for the remaining ones.  Otherwise
+ * gf_vect_dot_prod_base() is called for the leading len % 64 bytes and the
+ * rest is processed 64 bytes per iteration.
+ */
+void gf_6vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+			   unsigned char **src, unsigned char **dest)
+{
+	unsigned char *s, *t0, *t1, *t2, *t3, *t4, *t5;
+	vector unsigned char vX1, vX2, vX3, vX4;
+	vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA, vYB, vYC;
+	vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM, vYN, vYO;
+	vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2;
+	vector unsigned char vhi3, vlo3, vhi4, vlo4, vhi5, vlo5;
+	int i, j, head;
+
+	if (vlen < 128) {
+		/* First source initialises every output, the others are accumulated. */
+		gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
+		gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
+		gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
+		gf_vect_mul_vsx(len, &gftbls[3 * 32 * vlen], src[0], (unsigned char *)dest[3]);
+		gf_vect_mul_vsx(len, &gftbls[4 * 32 * vlen], src[0], (unsigned char *)dest[4]);
+		gf_vect_mul_vsx(len, &gftbls[5 * 32 * vlen], src[0], (unsigned char *)dest[5]);
+
+		for (j = 1; j < vlen; j++) {
+			gf_6vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
+		}
+		return;
+	}
+
+	t0 = (unsigned char *)dest[0];
+	t1 = (unsigned char *)dest[1];
+	t2 = (unsigned char *)dest[2];
+	t3 = (unsigned char *)dest[3];
+	t4 = (unsigned char *)dest[4];
+	t5 = (unsigned char *)dest[5];
+
+	/* Bytes that do not fill a whole 64-byte iteration are handed to the base routine. */
+	head = len % 64;
+	if (head != 0) {
+		gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
+		gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
+		gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
+		gf_vect_dot_prod_base(head, vlen, &gftbls[3 * 32 * vlen], src, t3);
+		gf_vect_dot_prod_base(head, vlen, &gftbls[4 * 32 * vlen], src, t4);
+		gf_vect_dot_prod_base(head, vlen, &gftbls[5 * 32 * vlen], src, t5);
+	}
+
+	/*
+	 * Explicit zero used to reset the accumulators.  XOR-ing a variable
+	 * with itself before it has ever been written reads an indeterminate
+	 * value, so the accumulators are assigned from this constant instead.
+	 */
+	const vector unsigned char vzero = vec_splat_u8(0);
+
+	for (i = head; i < len - 63; i += 64) {
+		/* Accumulators for bytes 0..31 of each output. */
+		vY1 = vzero;
+		vY2 = vzero;
+		vY3 = vzero;
+		vY4 = vzero;
+		vY5 = vzero;
+		vY6 = vzero;
+		vY7 = vzero;
+		vY8 = vzero;
+		vY9 = vzero;
+		vYA = vzero;
+		vYB = vzero;
+		vYC = vzero;
+
+		/* Accumulators for bytes 32..63 of each output. */
+		vYD = vzero;
+		vYE = vzero;
+		vYF = vzero;
+		vYG = vzero;
+		vYH = vzero;
+		vYI = vzero;
+		vYJ = vzero;
+		vYK = vzero;
+		vYL = vzero;
+		vYM = vzero;
+		vYN = vzero;
+		vYO = vzero;
+
+		/* Table cursors, one per output row; advanced by 32 bytes per source. */
+		unsigned char *g0 = &gftbls[0 * 32 * vlen];
+		unsigned char *g1 = &gftbls[1 * 32 * vlen];
+		unsigned char *g2 = &gftbls[2 * 32 * vlen];
+		unsigned char *g3 = &gftbls[3 * 32 * vlen];
+		unsigned char *g4 = &gftbls[4 * 32 * vlen];
+		unsigned char *g5 = &gftbls[5 * 32 * vlen];
+
+		for (j = 0; j < vlen; j++) {
+			/* 64 bytes of source j at the current position. */
+			s = (unsigned char *)src[j];
+			vX1 = vec_xl(0, s + i);
+			vX2 = vec_xl(16, s + i);
+			vX3 = vec_xl(32, s + i);
+			vX4 = vec_xl(48, s + i);
+
+			/* Table loads are interleaved with the accumulate steps below. */
+			vlo0 = EC_vec_xl(0, g0);
+			vhi0 = EC_vec_xl(16, g0);
+			vlo1 = EC_vec_xl(0, g1);
+			vhi1 = EC_vec_xl(16, g1);
+
+			vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+			vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+			vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+			vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+			vlo2 = EC_vec_xl(0, g2);
+			vhi2 = EC_vec_xl(16, g2);
+			vlo3 = EC_vec_xl(0, g3);
+			vhi3 = EC_vec_xl(16, g3);
+
+			vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+			vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+			vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+			vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+			vlo4 = EC_vec_xl(0, g4);
+			vhi4 = EC_vec_xl(16, g4);
+			vlo5 = EC_vec_xl(0, g5);
+			vhi5 = EC_vec_xl(16, g5);
+
+			vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+			vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+			vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+			vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+			vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
+			vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
+			vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
+			vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
+
+			vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
+			vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
+			vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
+			vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
+
+			vYB = vYB ^ EC_vec_permxor(vhi5, vlo5, vX1);
+			vYC = vYC ^ EC_vec_permxor(vhi5, vlo5, vX2);
+			vYN = vYN ^ EC_vec_permxor(vhi5, vlo5, vX3);
+			vYO = vYO ^ EC_vec_permxor(vhi5, vlo5, vX4);
+
+			g0 += 32;
+			g1 += 32;
+			g2 += 32;
+			g3 += 32;
+			g4 += 32;
+			g5 += 32;
+		}
+
+		/* Bytes 0..31 of each output. */
+		vec_xst(vY1, 0, t0 + i);
+		vec_xst(vY2, 16, t0 + i);
+		vec_xst(vY3, 0, t1 + i);
+		vec_xst(vY4, 16, t1 + i);
+		vec_xst(vY5, 0, t2 + i);
+		vec_xst(vY6, 16, t2 + i);
+		vec_xst(vY7, 0, t3 + i);
+		vec_xst(vY8, 16, t3 + i);
+		vec_xst(vY9, 0, t4 + i);
+		vec_xst(vYA, 16, t4 + i);
+		vec_xst(vYB, 0, t5 + i);
+		vec_xst(vYC, 16, t5 + i);
+
+		/* Bytes 32..63 of each output. */
+		vec_xst(vYD, 32, t0 + i);
+		vec_xst(vYE, 48, t0 + i);
+		vec_xst(vYF, 32, t1 + i);
+		vec_xst(vYG, 48, t1 + i);
+		vec_xst(vYH, 32, t2 + i);
+		vec_xst(vYI, 48, t2 + i);
+		vec_xst(vYJ, 32, t3 + i);
+		vec_xst(vYK, 48, t3 + i);
+		vec_xst(vYL, 32, t4 + i);
+		vec_xst(vYM, 48, t4 + i);
+		vec_xst(vYN, 32, t5 + i);
+		vec_xst(vYO, 48, t5 + i);
+	}
+	return;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_6vect_mad_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_6vect_mad_vsx.c
new file mode 100644
index 000000000..43ea6c696
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_6vect_mad_vsx.c
@@ -0,0 +1,142 @@
+#include "ec_base_vsx.h"
+
+/*
+ * gf_6vect_mad_vsx - accumulate one source buffer into six destinations.
+ *
+ * For each output k in 0..5, the 32-byte table at
+ * gftbls + ((k * vec + vec_i) << 5) is combined with the source bytes by
+ * EC_vec_permxor() and the result is XOR-ed into dest[k] in place.
+ * EC_vec_xl() and EC_vec_permxor() come from ec_base_vsx.h; presumably they
+ * implement a table-driven GF(2^8) multiply - confirm against that header.
+ *
+ * len    - number of bytes to process in src and in each dest[k]
+ * vec    - number of 32-byte tables per output row of gftbls
+ * vec_i  - index of the table used within each row
+ * gftbls - table array, 32 bytes per (row, index) entry
+ * src    - input buffer
+ * dest   - array of six output buffer pointers
+ *
+ * gf_vect_mad_base() is called for the leading len % 64 bytes; the remainder
+ * is processed 64 bytes per iteration with vector loads and stores.
+ */
+void gf_6vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
+		      unsigned char *src, unsigned char **dest)
+{
+	unsigned char *s, *t0, *t1, *t2, *t3, *t4, *t5;
+	vector unsigned char vX1, vX2, vX3, vX4;
+	vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA, vYB, vYC;
+	vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM, vYN, vYO;
+	vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2;
+	vector unsigned char vhi3, vlo3, vhi4, vlo4, vhi5, vlo5;
+	int i, head;
+
+	s = (unsigned char *)src;
+	t0 = (unsigned char *)dest[0];
+	t1 = (unsigned char *)dest[1];
+	t2 = (unsigned char *)dest[2];
+	t3 = (unsigned char *)dest[3];
+	t4 = (unsigned char *)dest[4];
+	t5 = (unsigned char *)dest[5];
+
+	/* Bytes that do not fill a whole 64-byte iteration are handed to the base routine. */
+	head = len % 64;
+	if (head != 0) {
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2);
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[3 * 32 * vec], src, t3);
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[4 * 32 * vec], src, t4);
+		gf_vect_mad_base(head, vec, vec_i, &gftbls[5 * 32 * vec], src, t5);
+	}
+
+	/*
+	 * Load both 16-byte halves (offset 0 and offset 16) of the table for
+	 * index vec_i in each of the six rows.  They do not change inside the loop.
+	 */
+	vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+	vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+	vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+	vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+	vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+	vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+	vlo3 = EC_vec_xl(0, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
+	vhi3 = EC_vec_xl(16, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
+	vlo4 = EC_vec_xl(0, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
+	vhi4 = EC_vec_xl(16, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
+	vlo5 = EC_vec_xl(0, gftbls + (((5 * vec) << 5) + (vec_i << 5)));
+	vhi5 = EC_vec_xl(16, gftbls + (((5 * vec) << 5) + (vec_i << 5)));
+
+	/* len - head is a multiple of 64, so every iteration handles a full 64-byte block. */
+	for (i = head; i < len - 63; i += 64) {
+		/* 64 bytes of source data, shared by all six outputs. */
+		vX1 = vec_xl(0, s + i);
+		vX2 = vec_xl(16, s + i);
+		vX3 = vec_xl(32, s + i);
+		vX4 = vec_xl(48, s + i);
+
+		/* dest[0]: load, accumulate, write back. */
+		vY1 = vec_xl(0, t0 + i);
+		vY2 = vec_xl(16, t0 + i);
+		vYD = vec_xl(32, t0 + i);
+		vYE = vec_xl(48, t0 + i);
+
+		vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+		vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+		vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+		vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+		vec_xst(vY1, 0, t0 + i);
+		vec_xst(vY2, 16, t0 + i);
+		vec_xst(vYD, 32, t0 + i);
+		vec_xst(vYE, 48, t0 + i);
+
+		/* dest[1]: load, accumulate, write back. */
+		vY3 = vec_xl(0, t1 + i);
+		vY4 = vec_xl(16, t1 + i);
+		vYF = vec_xl(32, t1 + i);
+		vYG = vec_xl(48, t1 + i);
+
+		vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+		vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+		vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+		vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+		vec_xst(vY3, 0, t1 + i);
+		vec_xst(vY4, 16, t1 + i);
+		vec_xst(vYF, 32, t1 + i);
+		vec_xst(vYG, 48, t1 + i);
+
+		/* dest[2]: load and accumulate. */
+		vY5 = vec_xl(0, t2 + i);
+		vY6 = vec_xl(16, t2 + i);
+		vYH = vec_xl(32, t2 + i);
+		vYI = vec_xl(48, t2 + i);
+
+		vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+		vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+		vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+		vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+		/*
+		 * From here on the next destination is loaded before the
+		 * previous one is written back.
+		 */
+		vY7 = vec_xl(0, t3 + i);
+		vY8 = vec_xl(16, t3 + i);
+		vYJ = vec_xl(32, t3 + i);
+		vYK = vec_xl(48, t3 + i);
+
+		vec_xst(vY5, 0, t2 + i);
+		vec_xst(vY6, 16, t2 + i);
+		vec_xst(vYH, 32, t2 + i);
+		vec_xst(vYI, 48, t2 + i);
+
+		/* dest[3]: accumulate. */
+		vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
+		vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
+		vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
+		vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
+
+		vY9 = vec_xl(0, t4 + i);
+		vYA = vec_xl(16, t4 + i);
+		vYL = vec_xl(32, t4 + i);
+		vYM = vec_xl(48, t4 + i);
+
+		vec_xst(vY7, 0, t3 + i);
+		vec_xst(vY8, 16, t3 + i);
+		vec_xst(vYJ, 32, t3 + i);
+		vec_xst(vYK, 48, t3 + i);
+
+		/* dest[4]: accumulate. */
+		vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
+		vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
+		vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
+		vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
+
+		vYB = vec_xl(0, t5 + i);
+		vYC = vec_xl(16, t5 + i);
+		vYN = vec_xl(32, t5 + i);
+		vYO = vec_xl(48, t5 + i);
+
+		vec_xst(vY9, 0, t4 + i);
+		vec_xst(vYA, 16, t4 + i);
+		vec_xst(vYL, 32, t4 + i);
+		vec_xst(vYM, 48, t4 + i);
+
+		/* dest[5]: accumulate and write back. */
+		vYB = vYB ^ EC_vec_permxor(vhi5, vlo5, vX1);
+		vYC = vYC ^ EC_vec_permxor(vhi5, vlo5, vX2);
+		vYN = vYN ^ EC_vec_permxor(vhi5, vlo5, vX3);
+		vYO = vYO ^ EC_vec_permxor(vhi5, vlo5, vX4);
+
+		vec_xst(vYB, 0, t5 + i);
+		vec_xst(vYC, 16, t5 + i);
+		vec_xst(vYN, 32, t5 + i);
+		vec_xst(vYO, 48, t5 + i);
+	}
+	return;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_vect_dot_prod_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_vect_dot_prod_vsx.c
new file mode 100644
index 000000000..2f97e3421
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_vect_dot_prod_vsx.c
@@ -0,0 +1,85 @@
+#include "ec_base_vsx.h"
+
+/*
+ * gf_vect_dot_prod_vsx - dot product of vlen source buffers into one output.
+ *
+ * The table for source j is the 32 bytes at gftbls[j * 32].  Each 128-byte
+ * source block is combined with its table by EC_vec_permxor() and the
+ * per-source results are XOR-ed together before being stored to dest.
+ * EC_vec_xl() and EC_vec_permxor() come from ec_base_vsx.h; presumably they
+ * implement a table-driven GF(2^8) multiply - confirm against that header.
+ *
+ * len    - number of bytes to produce in dest
+ * vlen   - number of source buffers (and of 32-byte tables)
+ * gftbls - table array, 32 bytes per source
+ * src    - array of vlen input buffer pointers
+ * dest   - output buffer
+ *
+ * When vlen < 128 the work is delegated to gf_vect_mul_vsx() for the first
+ * source and gf_vect_mad_vsx() for the remaining ones.  Otherwise
+ * gf_vect_dot_prod_base() is called for the leading len % 128 bytes and the
+ * rest is processed 128 bytes per iteration.
+ */
+void gf_vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+			  unsigned char **src, unsigned char *dest)
+{
+	unsigned char *s, *t0;
+	vector unsigned char vX1, vY1;
+	vector unsigned char vX2, vY2;
+	vector unsigned char vX3, vY3;
+	vector unsigned char vX4, vY4;
+	vector unsigned char vX5, vY5;
+	vector unsigned char vX6, vY6;
+	vector unsigned char vX7, vY7;
+	vector unsigned char vX8, vY8;
+	vector unsigned char vhi0, vlo0;
+	int i, j, head;
+
+	if (vlen < 128) {
+		/* First source initialises the output, the others are accumulated. */
+		gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest);
+
+		for (j = 1; j < vlen; j++) {
+			gf_vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
+		}
+		return;
+	}
+
+	t0 = (unsigned char *)dest;
+
+	/* Bytes that do not fill a whole 128-byte iteration are handed to the base routine. */
+	head = len % 128;
+	if (head != 0) {
+		gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
+	}
+
+	/*
+	 * Explicit zero used to reset the accumulators.  XOR-ing a variable
+	 * with itself before it has ever been written reads an indeterminate
+	 * value, so the accumulators are assigned from this constant instead.
+	 */
+	const vector unsigned char vzero = vec_splat_u8(0);
+
+	for (i = head; i < len - 127; i += 128) {
+		vY1 = vzero;
+		vY2 = vzero;
+		vY3 = vzero;
+		vY4 = vzero;
+
+		vY5 = vzero;
+		vY6 = vzero;
+		vY7 = vzero;
+		vY8 = vzero;
+
+		/* Table cursor; advanced by 32 bytes per source. */
+		unsigned char *g0 = &gftbls[0 * 32 * vlen];
+
+		for (j = 0; j < vlen; j++) {
+			/* 128 bytes of source j; the table load sits between the two halves. */
+			s = (unsigned char *)src[j];
+			vX1 = vec_xl(0, s + i);
+			vX2 = vec_xl(16, s + i);
+			vX3 = vec_xl(32, s + i);
+			vX4 = vec_xl(48, s + i);
+
+			vlo0 = EC_vec_xl(0, g0);
+			vhi0 = EC_vec_xl(16, g0);
+
+			vX5 = vec_xl(64, s + i);
+			vX6 = vec_xl(80, s + i);
+			vX7 = vec_xl(96, s + i);
+			vX8 = vec_xl(112, s + i);
+
+			vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+			vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+			vY3 = vY3 ^ EC_vec_permxor(vhi0, vlo0, vX3);
+			vY4 = vY4 ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+			vY5 = vY5 ^ EC_vec_permxor(vhi0, vlo0, vX5);
+			vY6 = vY6 ^ EC_vec_permxor(vhi0, vlo0, vX6);
+			vY7 = vY7 ^ EC_vec_permxor(vhi0, vlo0, vX7);
+			vY8 = vY8 ^ EC_vec_permxor(vhi0, vlo0, vX8);
+
+			g0 += 32;
+		}
+		vec_xst(vY1, 0, t0 + i);
+		vec_xst(vY2, 16, t0 + i);
+		vec_xst(vY3, 32, t0 + i);
+		vec_xst(vY4, 48, t0 + i);
+
+		vec_xst(vY5, 64, t0 + i);
+		vec_xst(vY6, 80, t0 + i);
+		vec_xst(vY7, 96, t0 + i);
+		vec_xst(vY8, 112, t0 + i);
+	}
+	return;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_vect_mad_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_vect_mad_vsx.c
new file mode 100644
index 000000000..a4810b96d
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_vect_mad_vsx.c
@@ -0,0 +1,48 @@
+#include "ec_base_vsx.h"
+
+/*
+ * gf_vect_mad_vsx - accumulate one source buffer into one destination.
+ *
+ * The 32-byte table at gftbls + (vec_i << 5) is combined with the source
+ * bytes by EC_vec_permxor() and the result is XOR-ed into dest in place.
+ * EC_vec_xl() and EC_vec_permxor() come from ec_base_vsx.h; presumably they
+ * implement a table-driven GF(2^8) multiply - confirm against that header.
+ *
+ * len    - number of bytes to process in src and dest
+ * vec    - number of 32-byte tables in gftbls (forwarded to the base routine)
+ * vec_i  - index of the table to use
+ * gftbls - table array, 32 bytes per entry
+ * src    - input buffer
+ * dest   - output buffer, updated in place
+ *
+ * gf_vect_mad_base() is called for the leading len % 64 bytes; the remainder
+ * is processed 64 bytes per iteration.
+ */
+void gf_vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
+		     unsigned char *src, unsigned char *dest)
+{
+	vector unsigned char tbl_lo, tbl_hi;
+	unsigned char *tbl;
+	int lead, pos;
+
+	/* Leading bytes that do not fill a 64-byte iteration go to the base routine. */
+	lead = len % 64;
+	if (lead != 0) {
+		gf_vect_mad_base(lead, vec, vec_i, gftbls, src, dest);
+	}
+
+	/* Both halves of the selected table are loop-invariant. */
+	tbl = gftbls + (vec_i << 5);
+	tbl_lo = EC_vec_xl(0, tbl);
+	tbl_hi = EC_vec_xl(16, tbl);
+
+	for (pos = lead; pos < len - 63; pos += 64) {
+		unsigned char *in = src + pos;
+		unsigned char *out = dest + pos;
+
+		/* All loads are issued before any store, source first. */
+		vector unsigned char in0 = vec_xl(0, in);
+		vector unsigned char in1 = vec_xl(16, in);
+		vector unsigned char in2 = vec_xl(32, in);
+		vector unsigned char in3 = vec_xl(48, in);
+
+		vector unsigned char acc0 = vec_xl(0, out);
+		vector unsigned char acc1 = vec_xl(16, out);
+		vector unsigned char acc2 = vec_xl(32, out);
+		vector unsigned char acc3 = vec_xl(48, out);
+
+		acc0 = acc0 ^ EC_vec_permxor(tbl_hi, tbl_lo, in0);
+		acc1 = acc1 ^ EC_vec_permxor(tbl_hi, tbl_lo, in1);
+		acc2 = acc2 ^ EC_vec_permxor(tbl_hi, tbl_lo, in2);
+		acc3 = acc3 ^ EC_vec_permxor(tbl_hi, tbl_lo, in3);
+
+		vec_xst(acc0, 0, out);
+		vec_xst(acc1, 16, out);
+		vec_xst(acc2, 32, out);
+		vec_xst(acc3, 48, out);
+	}
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_vect_mul_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_vect_mul_vsx.c
new file mode 100644
index 000000000..3e610a104
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_vect_mul_vsx.c
@@ -0,0 +1,61 @@
+#include "ec_base_vsx.h"
+
+/*
+ * gf_vect_mul_vsx - apply one 32-byte table to a source buffer.
+ *
+ * Every 16-byte source vector is combined with the table by EC_vec_permxor()
+ * and the result overwrites the matching bytes of dest.  EC_vec_xl() and
+ * EC_vec_permxor() come from ec_base_vsx.h; presumably they implement a
+ * table-driven GF(2^8) multiply - confirm against that header.
+ *
+ * len   - number of bytes to process
+ * gftbl - 32-byte table (read at offsets 0 and 16)
+ * src   - input buffer
+ * dest  - output buffer
+ *
+ * gf_vect_mul_base() is called for the leading len % 128 bytes; the
+ * remainder is processed 128 bytes per iteration.
+ */
+void gf_vect_mul_vsx(int len, unsigned char *gftbl, unsigned char *src, unsigned char *dest)
+{
+	vector unsigned char tbl_lo, tbl_hi;
+	int lead, pos;
+
+	/* Leading bytes that do not fill a 128-byte iteration go to the base routine. */
+	lead = len % 128;
+	if (lead != 0) {
+		gf_vect_mul_base(lead, gftbl, src, dest);
+	}
+
+	/* Both table halves are loop-invariant. */
+	tbl_lo = EC_vec_xl(0, gftbl);
+	tbl_hi = EC_vec_xl(16, gftbl);
+
+	for (pos = lead; pos < len - 127; pos += 128) {
+		unsigned char *in = src + pos;
+		unsigned char *out = dest + pos;
+
+		/* Load the whole 128-byte block before anything is stored. */
+		vector unsigned char in0 = vec_xl(0, in);
+		vector unsigned char in1 = vec_xl(16, in);
+		vector unsigned char in2 = vec_xl(32, in);
+		vector unsigned char in3 = vec_xl(48, in);
+		vector unsigned char in4 = vec_xl(64, in);
+		vector unsigned char in5 = vec_xl(80, in);
+		vector unsigned char in6 = vec_xl(96, in);
+		vector unsigned char in7 = vec_xl(112, in);
+
+		vector unsigned char out0 = EC_vec_permxor(tbl_hi, tbl_lo, in0);
+		vector unsigned char out1 = EC_vec_permxor(tbl_hi, tbl_lo, in1);
+		vector unsigned char out2 = EC_vec_permxor(tbl_hi, tbl_lo, in2);
+		vector unsigned char out3 = EC_vec_permxor(tbl_hi, tbl_lo, in3);
+		vector unsigned char out4 = EC_vec_permxor(tbl_hi, tbl_lo, in4);
+		vector unsigned char out5 = EC_vec_permxor(tbl_hi, tbl_lo, in5);
+		vector unsigned char out6 = EC_vec_permxor(tbl_hi, tbl_lo, in6);
+		vector unsigned char out7 = EC_vec_permxor(tbl_hi, tbl_lo, in7);
+
+		vec_xst(out0, 0, out);
+		vec_xst(out1, 16, out);
+		vec_xst(out2, 32, out);
+		vec_xst(out3, 48, out);
+		vec_xst(out4, 64, out);
+		vec_xst(out5, 80, out);
+		vec_xst(out6, 96, out);
+		vec_xst(out7, 112, out);
+	}
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:45:59 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:45:59 +0000
commit	19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree	42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/isa-l/erasure_code/ppc64le
parent	Initial commit. (diff)
download	ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip