Diffstat (limited to 'src/isa-l/erasure_code/ppc64le')
-rw-r--r--  src/isa-l/erasure_code/ppc64le/Makefile.am              |  15
-rw-r--r--  src/isa-l/erasure_code/ppc64le/ec_base_vsx.c            |  97
-rw-r--r--  src/isa-l/erasure_code/ppc64le/ec_base_vsx.h            | 338
-rw-r--r--  src/isa-l/erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c  |  83
-rw-r--r--  src/isa-l/erasure_code/ppc64le/gf_2vect_mad_vsx.c       |  65
-rw-r--r--  src/isa-l/erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c  | 104
-rw-r--r--  src/isa-l/erasure_code/ppc64le/gf_3vect_mad_vsx.c       |  84
-rw-r--r--  src/isa-l/erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c  | 124
-rw-r--r--  src/isa-l/erasure_code/ppc64le/gf_4vect_mad_vsx.c       | 103
-rw-r--r--  src/isa-l/erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c  | 145
-rw-r--r--  src/isa-l/erasure_code/ppc64le/gf_5vect_mad_vsx.c       | 122
-rw-r--r--  src/isa-l/erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c  | 166
-rw-r--r--  src/isa-l/erasure_code/ppc64le/gf_6vect_mad_vsx.c       | 142
-rw-r--r--  src/isa-l/erasure_code/ppc64le/gf_vect_dot_prod_vsx.c   |  85
-rw-r--r--  src/isa-l/erasure_code/ppc64le/gf_vect_mad_vsx.c        |  48
-rw-r--r--  src/isa-l/erasure_code/ppc64le/gf_vect_mul_vsx.c        |  61
16 files changed, 1782 insertions, 0 deletions
diff --git a/src/isa-l/erasure_code/ppc64le/Makefile.am b/src/isa-l/erasure_code/ppc64le/Makefile.am
new file mode 100644
index 000000000..9d263ac22
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/Makefile.am
@@ -0,0 +1,15 @@
+lsrc_ppc64le += erasure_code/ppc64le/ec_base_vsx.c \
+ erasure_code/ppc64le/gf_vect_mul_vsx.c \
+ erasure_code/ppc64le/gf_vect_dot_prod_vsx.c \
+ erasure_code/ppc64le/gf_vect_mad_vsx.c \
+ erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c \
+ erasure_code/ppc64le/gf_2vect_mad_vsx.c \
+ erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c \
+ erasure_code/ppc64le/gf_3vect_mad_vsx.c \
+ erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c \
+ erasure_code/ppc64le/gf_4vect_mad_vsx.c \
+ erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c \
+ erasure_code/ppc64le/gf_5vect_mad_vsx.c \
+ erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c \
+ erasure_code/ppc64le/gf_6vect_mad_vsx.c
+
diff --git a/src/isa-l/erasure_code/ppc64le/ec_base_vsx.c b/src/isa-l/erasure_code/ppc64le/ec_base_vsx.c
new file mode 100644
index 000000000..05624f1b6
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/ec_base_vsx.c
@@ -0,0 +1,97 @@
+#include "erasure_code.h"
+#include "ec_base_vsx.h"
+
+void gf_vect_dot_prod(int len, int vlen, unsigned char *v,
+ unsigned char **src, unsigned char *dest)
+{
+ gf_vect_dot_prod_vsx(len, vlen, v, src, dest);
+}
+
+void gf_vect_mad(int len, int vec, int vec_i, unsigned char *v,
+ unsigned char *src, unsigned char *dest)
+{
+ gf_vect_mad_vsx(len, vec, vec_i, v, src, dest);
+
+}
+
+void ec_encode_data(int len, int srcs, int dests, unsigned char *v,
+ unsigned char **src, unsigned char **dest)
+{
+ if (len < 64) {
+ ec_encode_data_base(len, srcs, dests, v, src, dest);
+ return;
+ }
+
+ while (dests >= 6) {
+ gf_6vect_dot_prod_vsx(len, srcs, v, src, dest);
+ v += 6 * srcs * 32;
+ dest += 6;
+ dests -= 6;
+ }
+ switch (dests) {
+ case 6:
+ gf_6vect_dot_prod_vsx(len, srcs, v, src, dest);
+ break;
+ case 5:
+ gf_5vect_dot_prod_vsx(len, srcs, v, src, dest);
+ break;
+ case 4:
+ gf_4vect_dot_prod_vsx(len, srcs, v, src, dest);
+ break;
+ case 3:
+ gf_3vect_dot_prod_vsx(len, srcs, v, src, dest);
+ break;
+ case 2:
+ gf_2vect_dot_prod_vsx(len, srcs, v, src, dest);
+ break;
+ case 1:
+ gf_vect_dot_prod_vsx(len, srcs, v, src, *dest);
+ break;
+ case 0:
+ break;
+ }
+}
+
+void ec_encode_data_update(int len, int k, int rows, int vec_i, unsigned char *v,
+ unsigned char *data, unsigned char **dest)
+{
+ if (len < 64) {
+ ec_encode_data_update_base(len, k, rows, vec_i, v, data, dest);
+ return;
+ }
+
+ while (rows >= 6) {
+ gf_6vect_mad_vsx(len, k, vec_i, v, data, dest);
+ v += 6 * k * 32;
+ dest += 6;
+ rows -= 6;
+ }
+ switch (rows) {
+ case 6:
+ gf_6vect_mad_vsx(len, k, vec_i, v, data, dest);
+ break;
+ case 5:
+ gf_5vect_mad_vsx(len, k, vec_i, v, data, dest);
+ break;
+ case 4:
+ gf_4vect_mad_vsx(len, k, vec_i, v, data, dest);
+ break;
+ case 3:
+ gf_3vect_mad_vsx(len, k, vec_i, v, data, dest);
+ break;
+ case 2:
+ gf_2vect_mad_vsx(len, k, vec_i, v, data, dest);
+ break;
+ case 1:
+ gf_vect_mad_vsx(len, k, vec_i, v, data, *dest);
+ break;
+ case 0:
+ break;
+ }
+}
+
+int gf_vect_mul(int len, unsigned char *a, void *src, void *dest)
+{
+ gf_vect_mul_vsx(len, a, (unsigned char *)src, (unsigned char *)dest);
+ return 0;
+}
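
[Editor's note] The dispatchers above override the generic isa-l entry points for ppc64le: buffers shorter than 64 bytes fall back to the scalar base routines, and larger jobs are split into groups of up to six outputs. The sketch below shows the usual calling sequence; it is illustrative only and not part of the patch, and it assumes the public helpers gf_gen_rs_matrix(), ec_init_tables() and ec_encode_data() declared in erasure_code.h.

#include <stdlib.h>
#include "erasure_code.h"

/* Illustrative only: encode p parity buffers from k data buffers of len
 * bytes, following the usual isa-l flow. The ec_encode_data() call lands in
 * the VSX dispatcher added above (error handling omitted for brevity). */
static void encode_example(int k, int p, int len,
                           unsigned char **data, unsigned char **coding)
{
        int m = k + p;
        unsigned char *a = malloc(m * k);           /* encode matrix          */
        unsigned char *g_tbls = malloc(k * p * 32); /* expanded coefficients  */

        gf_gen_rs_matrix(a, m, k);                  /* systematic RS matrix   */
        ec_init_tables(k, p, &a[k * k], g_tbls);    /* expand the parity rows */
        ec_encode_data(len, k, p, g_tbls, data, coding);

        free(a);
        free(g_tbls);
}
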
diff --git a/src/isa-l/erasure_code/ppc64le/ec_base_vsx.h b/src/isa-l/erasure_code/ppc64le/ec_base_vsx.h
new file mode 100644
index 000000000..c808629a9
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/ec_base_vsx.h
@@ -0,0 +1,338 @@
+#ifndef _ERASURE_CODE_PPC64LE_H_
+#define _ERASURE_CODE_PPC64LE_H_
+
+#include "erasure_code.h"
+#include <altivec.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__ibmxl__)
+#define EC_vec_xl(a, b) vec_xl_be(a, b)
+#define EC_vec_permxor(va, vb, vc) __vpermxor(va, vb, vc)
+#elif defined __GNUC__ && __GNUC__ >= 8
+#define EC_vec_xl(a, b) vec_xl_be(a, b)
+#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vc)
+#elif defined __GNUC__ && __GNUC__ >= 7
+#if defined _ARCH_PWR9
+#define EC_vec_xl(a, b) vec_vsx_ld(a, b)
+#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vec_nor(vc, vc))
+#else
+inline vector unsigned char EC_vec_xl(int off, unsigned char *ptr) {
+ vector unsigned char vc;
+ __asm__ __volatile__("lxvd2x %x0, %1, %2; xxswapd %x0, %x0" : "=wa" (vc) : "r" (off), "r" (ptr));
+ return vc;
+}
+#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vec_nor(vc, vc))
+#endif
+#else
+#if defined _ARCH_PWR8
+inline vector unsigned char EC_vec_xl(int off, unsigned char *ptr) {
+ vector unsigned char vc;
+ __asm__ __volatile__("lxvd2x %x0, %1, %2; xxswapd %x0, %x0" : "=wa" (vc) : "r" (off), "r" (ptr));
+ return vc;
+}
+#define EC_vec_permxor(va, vb, vc) __builtin_crypto_vpermxor(va, vb, vec_nor(vc, vc))
+#else
+#error "This code is only supported on ppc64le."
+#endif
+#endif
+
+/**
+ * @brief GF(2^8) vector multiply. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constant and save to destination array. Can be used for erasure coding encode
+ * and decode update when only one source is available at a time. Function
+ * requires pre-calculation of a 32 byte constant array based on the input
+ * coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 32.
+ * @param src Pointer to source data array.
+ * @param dest Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_mul_vsx(int len, unsigned char *gftbls, unsigned char *src, unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector dot product. VSX version.
+ *
+ * Does a GF(2^8) dot product across each byte of the input array and a constant
+ * set of coefficients to produce each byte of the output. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 32*vlen byte constant array based on the input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
+ * on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector dot product with two outputs. VSX version.
+ *
+ * Vector dot product optimized to calculate two outputs at a time. Does two
+ * GF(2^8) dot products across each byte of the input array and two constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 2*32*vlen byte constant array based on the two sets of input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_2vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with three outputs. VSX version.
+ *
+ * Vector dot product optimized to calculate three outputs at a time. Does three
+ * GF(2^8) dot products across each byte of the input array and three constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 3*32*vlen byte constant array based on the three sets of input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_3vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with four outputs. VSX version.
+ *
+ * Vector dot product optimized to calculate four outputs at a time. Does four
+ * GF(2^8) dot products across each byte of the input array and four constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 4*32*vlen byte constant array based on the four sets of input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_4vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with five outputs. VSX version.
+ *
+ * Vector dot product optimized to calculate five outputs at a time. Does five
+ * GF(2^8) dot products across each byte of the input array and five constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 5*32*vlen byte constant array based on the five sets of input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes. Must be >= 16.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_5vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector dot product with six outputs. VSX version.
+ *
+ * Vector dot product optimized to calculate six outputs at a time. Does six
+ * GF(2^8) dot products across each byte of the input array and six constant
+ * sets of coefficients to produce each byte of the outputs. Can be used for
+ * erasure coding encode and decode. Function requires pre-calculation of a
+ * 6*32*vlen byte constant array based on the six sets of input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vlen Number of vector sources.
+ * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
+ * based on the array of input coefficients.
+ * @param src Array of pointers to source inputs.
+ * @param dest Array of pointers to destination data buffers.
+ * @returns none
+ */
+
+void gf_6vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply accumulate. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constant and add to destination array. Can be used for erasure coding encode
+ * and decode update when only one source is available at a time. Function
+ * requires pre-calculation of a 32*vec byte constant array based on the input
+ * coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src Array of pointers to source inputs.
+ * @param dest Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char *dest);
+/**
+ * @brief GF(2^8) vector multiply with 2 accumulate. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src Pointer to source input array.
+ * @param dest Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_2vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 3 accumulate. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src Pointer to source input array.
+ * @param dest Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_3vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 4 accumulate. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src Pointer to source input array.
+ * @param dest Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_4vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 5 accumulate. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src Pointer to source input array.
+ * @param dest Array of pointers to destination input/outputs.
+ * @returns none
+ */
+void gf_5vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 6 accumulate. VSX version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires VSX
+ *
+ * @param len Length of each vector in bytes.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src Pointer to source input array.
+ * @param dest Array of pointers to destination input/outputs.
+ * @returns none
+ */
+void gf_6vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ERASURE_CODE_PPC64LE_H_
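
[Editor's note] Every kernel below consumes the same 32-byte table per coefficient that ec_init_tables() expands: 16 products for the low nibble followed by 16 products for the shifted high nibble, so EC_vec_permxor() can perform 16 table lookups plus an XOR in one operation. The scalar sketch below spells out the per-byte identity this relies on; it is illustrative only and assumes the standard isa-l table layout.

/* Illustrative only: the per-byte operation behind EC_vec_permxor(vhi, vlo, vX),
 * assuming the usual isa-l layout for a coefficient c:
 *   tbl[j]      = gf_mul(c, j)       for j = 0..15   (low-nibble products)
 *   tbl[16 + j] = gf_mul(c, j << 4)  for j = 0..15   (high-nibble products)
 * Because GF(2^8) multiplication distributes over XOR,
 *   gf_mul(c, x) == tbl[x & 0xf] ^ tbl[16 + (x >> 4)]   for any byte x.
 */
static inline unsigned char gf_mul_by_split_table(const unsigned char tbl[32],
                                                  unsigned char x)
{
        return tbl[x & 0xf] ^ tbl[16 + (x >> 4)];
}
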
diff --git a/src/isa-l/erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c
new file mode 100644
index 000000000..3cb269cce
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_2vect_dot_prod_vsx.c
@@ -0,0 +1,83 @@
+#include "ec_base_vsx.h"
+
+void gf_2vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest)
+{
+ unsigned char *s, *t0, *t1;
+ vector unsigned char vX1, vX2, vX3, vX4;
+ vector unsigned char vY1, vY2, vY3, vY4;
+ vector unsigned char vYD, vYE, vYF, vYG;
+ vector unsigned char vhi0, vlo0, vhi1, vlo1;
+ int i, j, head;
+
+ if (vlen < 128) {
+ gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
+ gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
+
+ for (j = 1; j < vlen; j++) {
+ gf_2vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
+ }
+ return;
+ }
+
+ t0 = (unsigned char *)dest[0];
+ t1 = (unsigned char *)dest[1];
+
+ head = len % 64;
+ if (head != 0) {
+ gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
+ gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
+ }
+
+ for (i = head; i < len - 63; i += 64) {
+ vY1 = vY1 ^ vY1;
+ vY2 = vY2 ^ vY2;
+ vY3 = vY3 ^ vY3;
+ vY4 = vY4 ^ vY4;
+
+ vYD = vYD ^ vYD;
+ vYE = vYE ^ vYE;
+ vYF = vYF ^ vYF;
+ vYG = vYG ^ vYG;
+
+ unsigned char *g0 = &gftbls[0 * 32 * vlen];
+ unsigned char *g1 = &gftbls[1 * 32 * vlen];
+
+ for (j = 0; j < vlen; j++) {
+ s = (unsigned char *)src[j];
+ vX1 = vec_xl(0, s + i);
+ vX2 = vec_xl(16, s + i);
+ vX3 = vec_xl(32, s + i);
+ vX4 = vec_xl(48, s + i);
+
+ vlo0 = EC_vec_xl(0, g0);
+ vhi0 = EC_vec_xl(16, g0);
+ vlo1 = EC_vec_xl(0, g1);
+ vhi1 = EC_vec_xl(16, g1);
+
+ vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+ vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+ vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+ vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+ vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+ vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+ vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+ vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+ g0 += 32;
+ g1 += 32;
+ }
+
+ vec_xst(vY1, 0, t0 + i);
+ vec_xst(vY2, 16, t0 + i);
+ vec_xst(vY3, 0, t1 + i);
+ vec_xst(vY4, 16, t1 + i);
+
+ vec_xst(vYD, 32, t0 + i);
+ vec_xst(vYE, 48, t0 + i);
+ vec_xst(vYF, 32, t1 + i);
+ vec_xst(vYG, 48, t1 + i);
+ }
+ return;
+}
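
[Editor's note] gf_2vect_dot_prod_vsx() shows the structure every kernel in this patch repeats: a scalar base call covers the leading len % 64 bytes, and the vector loop then walks the remaining multiple of 64 bytes as four 16-byte vectors per output. The small check below, illustrative only, confirms those loop bounds leave no gap and no overlap.

#include <assert.h>

/* Illustrative only: head bytes plus the 64-byte iterations cover len exactly. */
static void check_tail_split(int len)
{
        int head = len % 64;            /* handled by gf_vect_dot_prod_base() */
        int covered = head;
        int i;

        for (i = head; i < len - 63; i += 64)
                covered += 64;          /* one iteration: four 16-byte loads/stores */

        assert(covered == len);         /* e.g. len = 200: head = 8, i = 8, 72, 136 */
}
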
diff --git a/src/isa-l/erasure_code/ppc64le/gf_2vect_mad_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_2vect_mad_vsx.c
new file mode 100644
index 000000000..621684a5f
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_2vect_mad_vsx.c
@@ -0,0 +1,65 @@
+#include "ec_base_vsx.h"
+
+void gf_2vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest)
+{
+ unsigned char *s, *t0, *t1;
+ vector unsigned char vX1, vX2, vX3, vX4;
+ vector unsigned char vY1, vY2, vY3, vY4;
+ vector unsigned char vYD, vYE, vYF, vYG;
+ vector unsigned char vhi0, vlo0, vhi1, vlo1;
+ int i, head;
+
+ s = (unsigned char *)src;
+ t0 = (unsigned char *)dest[0];
+ t1 = (unsigned char *)dest[1];
+
+ head = len % 64;
+ if (head != 0) {
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
+ }
+
+ vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+ vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+ vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+ vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+
+ for (i = head; i < len - 63; i += 64) {
+ vX1 = vec_xl(0, s + i);
+ vX2 = vec_xl(16, s + i);
+ vX3 = vec_xl(32, s + i);
+ vX4 = vec_xl(48, s + i);
+
+ vY1 = vec_xl(0, t0 + i);
+ vY2 = vec_xl(16, t0 + i);
+ vYD = vec_xl(32, t0 + i);
+ vYE = vec_xl(48, t0 + i);
+
+ vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+ vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+ vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+ vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+ vY3 = vec_xl(0, t1 + i);
+ vY4 = vec_xl(16, t1 + i);
+ vYF = vec_xl(32, t1 + i);
+ vYG = vec_xl(48, t1 + i);
+
+ vec_xst(vY1, 0, t0 + i);
+ vec_xst(vY2, 16, t0 + i);
+ vec_xst(vYD, 32, t0 + i);
+ vec_xst(vYE, 48, t0 + i);
+
+ vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+ vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+ vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+ vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+ vec_xst(vY3, 0, t1 + i);
+ vec_xst(vY4, 16, t1 + i);
+ vec_xst(vYF, 32, t1 + i);
+ vec_xst(vYG, 48, t1 + i);
+ }
+ return;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c
new file mode 100644
index 000000000..23b72dc4b
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_3vect_dot_prod_vsx.c
@@ -0,0 +1,104 @@
+#include "ec_base_vsx.h"
+
+void gf_3vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest)
+{
+ unsigned char *s, *t0, *t1, *t2;
+ vector unsigned char vX1, vX2, vX3, vX4;
+ vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6;
+ vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI;
+ vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2;
+ int i, j, head;
+
+ if (vlen < 128) {
+ gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
+ gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
+ gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
+
+ for (j = 1; j < vlen; j++) {
+ gf_3vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
+ }
+ return;
+ }
+
+ t0 = (unsigned char *)dest[0];
+ t1 = (unsigned char *)dest[1];
+ t2 = (unsigned char *)dest[2];
+
+ head = len % 64;
+ if (head != 0) {
+ gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
+ gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
+ gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
+ }
+
+ for (i = head; i < len - 63; i += 64) {
+ vY1 = vY1 ^ vY1;
+ vY2 = vY2 ^ vY2;
+ vY3 = vY3 ^ vY3;
+ vY4 = vY4 ^ vY4;
+ vY5 = vY5 ^ vY5;
+ vY6 = vY6 ^ vY6;
+
+ vYD = vYD ^ vYD;
+ vYE = vYE ^ vYE;
+ vYF = vYF ^ vYF;
+ vYG = vYG ^ vYG;
+ vYH = vYH ^ vYH;
+ vYI = vYI ^ vYI;
+
+ unsigned char *g0 = &gftbls[0 * 32 * vlen];
+ unsigned char *g1 = &gftbls[1 * 32 * vlen];
+ unsigned char *g2 = &gftbls[2 * 32 * vlen];
+
+ for (j = 0; j < vlen; j++) {
+ s = (unsigned char *)src[j];
+ vX1 = vec_xl(0, s + i);
+ vX2 = vec_xl(16, s + i);
+ vX3 = vec_xl(32, s + i);
+ vX4 = vec_xl(48, s + i);
+
+ vlo0 = EC_vec_xl(0, g0);
+ vhi0 = EC_vec_xl(16, g0);
+ vlo1 = EC_vec_xl(0, g1);
+ vhi1 = EC_vec_xl(16, g1);
+
+ vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+ vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+ vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+ vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+ vlo2 = vec_xl(0, g2);
+ vhi2 = vec_xl(16, g2);
+
+ vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+ vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+ vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+ vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+ vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+ vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+ vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+ vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+ g0 += 32;
+ g1 += 32;
+ g2 += 32;
+ }
+
+ vec_xst(vY1, 0, t0 + i);
+ vec_xst(vY2, 16, t0 + i);
+ vec_xst(vY3, 0, t1 + i);
+ vec_xst(vY4, 16, t1 + i);
+ vec_xst(vY5, 0, t2 + i);
+ vec_xst(vY6, 16, t2 + i);
+
+ vec_xst(vYD, 32, t0 + i);
+ vec_xst(vYE, 48, t0 + i);
+ vec_xst(vYF, 32, t1 + i);
+ vec_xst(vYG, 48, t1 + i);
+ vec_xst(vYH, 32, t2 + i);
+ vec_xst(vYI, 48, t2 + i);
+ }
+ return;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_3vect_mad_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_3vect_mad_vsx.c
new file mode 100644
index 000000000..ba90c1fdb
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_3vect_mad_vsx.c
@@ -0,0 +1,84 @@
+#include "ec_base_vsx.h"
+
+void gf_3vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest)
+{
+ unsigned char *s, *t0, *t1, *t2;
+ vector unsigned char vX1, vX2, vX3, vX4;
+ vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6;
+ vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI;
+ vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2;
+ int i, head;
+
+ s = (unsigned char *)src;
+ t0 = (unsigned char *)dest[0];
+ t1 = (unsigned char *)dest[1];
+ t2 = (unsigned char *)dest[2];
+
+ head = len % 64;
+ if (head != 0) {
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2);
+ }
+
+ vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+ vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+ vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+ vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+ vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+ vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+
+ for (i = head; i < len - 63; i += 64) {
+ vX1 = vec_xl(0, s + i);
+ vX2 = vec_xl(16, s + i);
+ vX3 = vec_xl(32, s + i);
+ vX4 = vec_xl(48, s + i);
+
+ vY1 = vec_xl(0, t0 + i);
+ vY2 = vec_xl(16, t0 + i);
+ vYD = vec_xl(32, t0 + i);
+ vYE = vec_xl(48, t0 + i);
+
+ vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+ vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+ vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+ vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+ vY3 = vec_xl(0, t1 + i);
+ vY4 = vec_xl(16, t1 + i);
+ vYF = vec_xl(32, t1 + i);
+ vYG = vec_xl(48, t1 + i);
+
+ vec_xst(vY1, 0, t0 + i);
+ vec_xst(vY2, 16, t0 + i);
+ vec_xst(vYD, 32, t0 + i);
+ vec_xst(vYE, 48, t0 + i);
+
+ vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+ vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+ vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+ vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+ vY5 = vec_xl(0, t2 + i);
+ vY6 = vec_xl(16, t2 + i);
+ vYH = vec_xl(32, t2 + i);
+ vYI = vec_xl(48, t2 + i);
+
+ vec_xst(vY3, 0, t1 + i);
+ vec_xst(vY4, 16, t1 + i);
+ vec_xst(vYF, 32, t1 + i);
+ vec_xst(vYG, 48, t1 + i);
+
+ vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+ vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+ vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+ vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+ vec_xst(vY5, 0, t2 + i);
+ vec_xst(vY6, 16, t2 + i);
+ vec_xst(vYH, 32, t2 + i);
+ vec_xst(vYI, 48, t2 + i);
+ }
+ return;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c
new file mode 100644
index 000000000..e65654453
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_4vect_dot_prod_vsx.c
@@ -0,0 +1,124 @@
+#include "ec_base_vsx.h"
+
+void gf_4vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest)
+{
+ unsigned char *s, *t0, *t1, *t2, *t3;
+ vector unsigned char vX1, vX2, vX3, vX4;
+ vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8;
+ vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK;
+ vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3;
+ int i, j, head;
+
+ if (vlen < 128) {
+ gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
+ gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
+ gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
+ gf_vect_mul_vsx(len, &gftbls[3 * 32 * vlen], src[0], (unsigned char *)dest[3]);
+
+ for (j = 1; j < vlen; j++) {
+ gf_4vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
+ }
+ return;
+ }
+
+ t0 = (unsigned char *)dest[0];
+ t1 = (unsigned char *)dest[1];
+ t2 = (unsigned char *)dest[2];
+ t3 = (unsigned char *)dest[3];
+
+ head = len % 64;
+ if (head != 0) {
+ gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
+ gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
+ gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
+ gf_vect_dot_prod_base(head, vlen, &gftbls[3 * 32 * vlen], src, t3);
+ }
+
+ for (i = head; i < len - 63; i += 64) {
+ vY1 = vY1 ^ vY1;
+ vY2 = vY2 ^ vY2;
+ vY3 = vY3 ^ vY3;
+ vY4 = vY4 ^ vY4;
+ vY5 = vY5 ^ vY5;
+ vY6 = vY6 ^ vY6;
+ vY7 = vY7 ^ vY7;
+ vY8 = vY8 ^ vY8;
+
+ vYD = vYD ^ vYD;
+ vYE = vYE ^ vYE;
+ vYF = vYF ^ vYF;
+ vYG = vYG ^ vYG;
+ vYH = vYH ^ vYH;
+ vYI = vYI ^ vYI;
+ vYJ = vYJ ^ vYJ;
+ vYK = vYK ^ vYK;
+
+ unsigned char *g0 = &gftbls[0 * 32 * vlen];
+ unsigned char *g1 = &gftbls[1 * 32 * vlen];
+ unsigned char *g2 = &gftbls[2 * 32 * vlen];
+ unsigned char *g3 = &gftbls[3 * 32 * vlen];
+
+ for (j = 0; j < vlen; j++) {
+ s = (unsigned char *)src[j];
+ vX1 = vec_xl(0, s + i);
+ vX2 = vec_xl(16, s + i);
+ vX3 = vec_xl(32, s + i);
+ vX4 = vec_xl(48, s + i);
+
+ vlo0 = EC_vec_xl(0, g0);
+ vhi0 = EC_vec_xl(16, g0);
+ vlo1 = EC_vec_xl(0, g1);
+ vhi1 = EC_vec_xl(16, g1);
+
+ vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+ vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+ vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+ vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+ vlo2 = vec_xl(0, g2);
+ vhi2 = vec_xl(16, g2);
+ vlo3 = vec_xl(0, g3);
+ vhi3 = vec_xl(16, g3);
+
+ vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+ vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+ vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+ vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+ vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+ vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+ vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+ vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+ vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
+ vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
+ vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
+ vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
+
+ g0 += 32;
+ g1 += 32;
+ g2 += 32;
+ g3 += 32;
+ }
+
+ vec_xst(vY1, 0, t0 + i);
+ vec_xst(vY2, 16, t0 + i);
+ vec_xst(vY3, 0, t1 + i);
+ vec_xst(vY4, 16, t1 + i);
+ vec_xst(vY5, 0, t2 + i);
+ vec_xst(vY6, 16, t2 + i);
+ vec_xst(vY7, 0, t3 + i);
+ vec_xst(vY8, 16, t3 + i);
+
+ vec_xst(vYD, 32, t0 + i);
+ vec_xst(vYE, 48, t0 + i);
+ vec_xst(vYF, 32, t1 + i);
+ vec_xst(vYG, 48, t1 + i);
+ vec_xst(vYH, 32, t2 + i);
+ vec_xst(vYI, 48, t2 + i);
+ vec_xst(vYJ, 32, t3 + i);
+ vec_xst(vYK, 48, t3 + i);
+ }
+ return;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_4vect_mad_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_4vect_mad_vsx.c
new file mode 100644
index 000000000..7b236b6f8
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_4vect_mad_vsx.c
@@ -0,0 +1,103 @@
+#include "ec_base_vsx.h"
+
+void gf_4vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest)
+{
+ unsigned char *s, *t0, *t1, *t2, *t3;
+ vector unsigned char vX1, vX2, vX3, vX4;
+ vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8;
+ vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK;
+ vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3;
+ int i, head;
+
+ s = (unsigned char *)src;
+ t0 = (unsigned char *)dest[0];
+ t1 = (unsigned char *)dest[1];
+ t2 = (unsigned char *)dest[2];
+ t3 = (unsigned char *)dest[3];
+
+ head = len % 64;
+ if (head != 0) {
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[3 * 32 * vec], src, t3);
+ }
+
+ vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+ vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+ vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+ vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+ vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+ vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+ vlo3 = EC_vec_xl(0, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
+ vhi3 = EC_vec_xl(16, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
+
+ for (i = head; i < len - 63; i += 64) {
+ vX1 = vec_xl(0, s + i);
+ vX2 = vec_xl(16, s + i);
+ vX3 = vec_xl(32, s + i);
+ vX4 = vec_xl(48, s + i);
+
+ vY1 = vec_xl(0, t0 + i);
+ vY2 = vec_xl(16, t0 + i);
+ vYD = vec_xl(32, t0 + i);
+ vYE = vec_xl(48, t0 + i);
+
+ vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+ vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+ vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+ vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+ vY3 = vec_xl(0, t1 + i);
+ vY4 = vec_xl(16, t1 + i);
+ vYF = vec_xl(32, t1 + i);
+ vYG = vec_xl(48, t1 + i);
+
+ vec_xst(vY1, 0, t0 + i);
+ vec_xst(vY2, 16, t0 + i);
+ vec_xst(vYD, 32, t0 + i);
+ vec_xst(vYE, 48, t0 + i);
+
+ vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+ vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+ vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+ vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+ vY5 = vec_xl(0, t2 + i);
+ vY6 = vec_xl(16, t2 + i);
+ vYH = vec_xl(32, t2 + i);
+ vYI = vec_xl(48, t2 + i);
+
+ vec_xst(vY3, 0, t1 + i);
+ vec_xst(vY4, 16, t1 + i);
+ vec_xst(vYF, 32, t1 + i);
+ vec_xst(vYG, 48, t1 + i);
+
+ vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+ vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+ vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+ vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+ vY7 = vec_xl(0, t3 + i);
+ vY8 = vec_xl(16, t3 + i);
+ vYJ = vec_xl(32, t3 + i);
+ vYK = vec_xl(48, t3 + i);
+
+ vec_xst(vY5, 0, t2 + i);
+ vec_xst(vY6, 16, t2 + i);
+ vec_xst(vYH, 32, t2 + i);
+ vec_xst(vYI, 48, t2 + i);
+
+ vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
+ vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
+ vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
+ vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
+
+ vec_xst(vY7, 0, t3 + i);
+ vec_xst(vY8, 16, t3 + i);
+ vec_xst(vYJ, 32, t3 + i);
+ vec_xst(vYK, 48, t3 + i);
+ }
+ return;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c
new file mode 100644
index 000000000..e9eef0e63
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_5vect_dot_prod_vsx.c
@@ -0,0 +1,145 @@
+#include "ec_base_vsx.h"
+
+void gf_5vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest)
+{
+ unsigned char *s, *t0, *t1, *t2, *t3, *t4;
+ vector unsigned char vX1, vX2, vX3, vX4;
+ vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA;
+ vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM;
+ vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3, vhi4, vlo4;
+ int i, j, head;
+
+ if (vlen < 128) {
+ gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
+ gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
+ gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
+ gf_vect_mul_vsx(len, &gftbls[3 * 32 * vlen], src[0], (unsigned char *)dest[3]);
+ gf_vect_mul_vsx(len, &gftbls[4 * 32 * vlen], src[0], (unsigned char *)dest[4]);
+
+ for (j = 1; j < vlen; j++) {
+ gf_5vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
+ }
+ return;
+ }
+
+ t0 = (unsigned char *)dest[0];
+ t1 = (unsigned char *)dest[1];
+ t2 = (unsigned char *)dest[2];
+ t3 = (unsigned char *)dest[3];
+ t4 = (unsigned char *)dest[4];
+
+ head = len % 64;
+ if (head != 0) {
+ gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
+ gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
+ gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
+ gf_vect_dot_prod_base(head, vlen, &gftbls[3 * 32 * vlen], src, t3);
+ gf_vect_dot_prod_base(head, vlen, &gftbls[4 * 32 * vlen], src, t4);
+ }
+
+ for (i = head; i < len - 63; i += 64) {
+ vY1 = vY1 ^ vY1;
+ vY2 = vY2 ^ vY2;
+ vY3 = vY3 ^ vY3;
+ vY4 = vY4 ^ vY4;
+ vY5 = vY5 ^ vY5;
+ vY6 = vY6 ^ vY6;
+ vY7 = vY7 ^ vY7;
+ vY8 = vY8 ^ vY8;
+ vY9 = vY9 ^ vY9;
+ vYA = vYA ^ vYA;
+
+ vYD = vYD ^ vYD;
+ vYE = vYE ^ vYE;
+ vYF = vYF ^ vYF;
+ vYG = vYG ^ vYG;
+ vYH = vYH ^ vYH;
+ vYI = vYI ^ vYI;
+ vYJ = vYJ ^ vYJ;
+ vYK = vYK ^ vYK;
+ vYL = vYL ^ vYL;
+ vYM = vYM ^ vYM;
+
+ unsigned char *g0 = &gftbls[0 * 32 * vlen];
+ unsigned char *g1 = &gftbls[1 * 32 * vlen];
+ unsigned char *g2 = &gftbls[2 * 32 * vlen];
+ unsigned char *g3 = &gftbls[3 * 32 * vlen];
+ unsigned char *g4 = &gftbls[4 * 32 * vlen];
+
+ for (j = 0; j < vlen; j++) {
+ s = (unsigned char *)src[j];
+ vX1 = vec_xl(0, s + i);
+ vX2 = vec_xl(16, s + i);
+ vX3 = vec_xl(32, s + i);
+ vX4 = vec_xl(48, s + i);
+
+ vlo0 = EC_vec_xl(0, g0);
+ vhi0 = EC_vec_xl(16, g0);
+ vlo1 = EC_vec_xl(0, g1);
+ vhi1 = EC_vec_xl(16, g1);
+
+ vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+ vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+ vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+ vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+ vlo2 = vec_xl(0, g2);
+ vhi2 = vec_xl(16, g2);
+ vlo3 = vec_xl(0, g3);
+ vhi3 = vec_xl(16, g3);
+
+ vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+ vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+ vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+ vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+ vlo4 = vec_xl(0, g4);
+ vhi4 = vec_xl(16, g4);
+
+ vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+ vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+ vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+ vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+ vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
+ vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
+ vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
+ vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
+
+ vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
+ vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
+ vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
+ vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
+
+ g0 += 32;
+ g1 += 32;
+ g2 += 32;
+ g3 += 32;
+ g4 += 32;
+ }
+
+ vec_xst(vY1, 0, t0 + i);
+ vec_xst(vY2, 16, t0 + i);
+ vec_xst(vY3, 0, t1 + i);
+ vec_xst(vY4, 16, t1 + i);
+ vec_xst(vY5, 0, t2 + i);
+ vec_xst(vY6, 16, t2 + i);
+ vec_xst(vY7, 0, t3 + i);
+ vec_xst(vY8, 16, t3 + i);
+ vec_xst(vY9, 0, t4 + i);
+ vec_xst(vYA, 16, t4 + i);
+
+ vec_xst(vYD, 32, t0 + i);
+ vec_xst(vYE, 48, t0 + i);
+ vec_xst(vYF, 32, t1 + i);
+ vec_xst(vYG, 48, t1 + i);
+ vec_xst(vYH, 32, t2 + i);
+ vec_xst(vYI, 48, t2 + i);
+ vec_xst(vYJ, 32, t3 + i);
+ vec_xst(vYK, 48, t3 + i);
+ vec_xst(vYL, 32, t4 + i);
+ vec_xst(vYM, 48, t4 + i);
+ }
+ return;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_5vect_mad_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_5vect_mad_vsx.c
new file mode 100644
index 000000000..7bb7bb211
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_5vect_mad_vsx.c
@@ -0,0 +1,122 @@
+#include "ec_base_vsx.h"
+
+void gf_5vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest)
+{
+ unsigned char *s, *t0, *t1, *t2, *t3, *t4;
+ vector unsigned char vX1, vX2, vX3, vX4;
+ vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA;
+ vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM;
+ vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2, vhi3, vlo3, vhi4, vlo4;
+ int i, head;
+
+ s = (unsigned char *)src;
+ t0 = (unsigned char *)dest[0];
+ t1 = (unsigned char *)dest[1];
+ t2 = (unsigned char *)dest[2];
+ t3 = (unsigned char *)dest[3];
+ t4 = (unsigned char *)dest[4];
+
+ head = len % 64;
+ if (head != 0) {
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[3 * 32 * vec], src, t3);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[4 * 32 * vec], src, t4);
+ }
+
+ vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+ vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+ vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+ vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+ vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+ vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+ vlo3 = EC_vec_xl(0, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
+ vhi3 = EC_vec_xl(16, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
+ vlo4 = EC_vec_xl(0, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
+ vhi4 = EC_vec_xl(16, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
+
+ for (i = head; i < len - 63; i += 64) {
+ vX1 = vec_xl(0, s + i);
+ vX2 = vec_xl(16, s + i);
+ vX3 = vec_xl(32, s + i);
+ vX4 = vec_xl(48, s + i);
+
+ vY1 = vec_xl(0, t0 + i);
+ vY2 = vec_xl(16, t0 + i);
+ vYD = vec_xl(32, t0 + i);
+ vYE = vec_xl(48, t0 + i);
+
+ vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+ vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+ vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+ vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+ vY3 = vec_xl(0, t1 + i);
+ vY4 = vec_xl(16, t1 + i);
+ vYF = vec_xl(32, t1 + i);
+ vYG = vec_xl(48, t1 + i);
+
+ vec_xst(vY1, 0, t0 + i);
+ vec_xst(vY2, 16, t0 + i);
+ vec_xst(vYD, 32, t0 + i);
+ vec_xst(vYE, 48, t0 + i);
+
+ vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+ vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+ vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+ vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+ vY5 = vec_xl(0, t2 + i);
+ vY6 = vec_xl(16, t2 + i);
+ vYH = vec_xl(32, t2 + i);
+ vYI = vec_xl(48, t2 + i);
+
+ vec_xst(vY3, 0, t1 + i);
+ vec_xst(vY4, 16, t1 + i);
+ vec_xst(vYF, 32, t1 + i);
+ vec_xst(vYG, 48, t1 + i);
+
+ vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+ vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+ vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+ vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+ vY7 = vec_xl(0, t3 + i);
+ vY8 = vec_xl(16, t3 + i);
+ vYJ = vec_xl(32, t3 + i);
+ vYK = vec_xl(48, t3 + i);
+
+ vec_xst(vY5, 0, t2 + i);
+ vec_xst(vY6, 16, t2 + i);
+ vec_xst(vYH, 32, t2 + i);
+ vec_xst(vYI, 48, t2 + i);
+
+ vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
+ vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
+ vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
+ vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
+
+ vY9 = vec_xl(0, t4 + i);
+ vYA = vec_xl(16, t4 + i);
+ vYL = vec_xl(32, t4 + i);
+ vYM = vec_xl(48, t4 + i);
+
+ vec_xst(vY7, 0, t3 + i);
+ vec_xst(vY8, 16, t3 + i);
+ vec_xst(vYJ, 32, t3 + i);
+ vec_xst(vYK, 48, t3 + i);
+
+ vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
+ vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
+ vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
+ vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
+
+ vec_xst(vY9, 0, t4 + i);
+ vec_xst(vYA, 16, t4 + i);
+ vec_xst(vYL, 32, t4 + i);
+ vec_xst(vYM, 48, t4 + i);
+ }
+ return;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c
new file mode 100644
index 000000000..ac918bd49
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_6vect_dot_prod_vsx.c
@@ -0,0 +1,166 @@
+#include "ec_base_vsx.h"
+
+void gf_6vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char **dest)
+{
+ unsigned char *s, *t0, *t1, *t2, *t3, *t4, *t5;
+ vector unsigned char vX1, vX2, vX3, vX4;
+ vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA, vYB, vYC;
+ vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM, vYN, vYO;
+ vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2;
+ vector unsigned char vhi3, vlo3, vhi4, vlo4, vhi5, vlo5;
+ int i, j, head;
+
+ if (vlen < 128) {
+ gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest[0]);
+ gf_vect_mul_vsx(len, &gftbls[1 * 32 * vlen], src[0], (unsigned char *)dest[1]);
+ gf_vect_mul_vsx(len, &gftbls[2 * 32 * vlen], src[0], (unsigned char *)dest[2]);
+ gf_vect_mul_vsx(len, &gftbls[3 * 32 * vlen], src[0], (unsigned char *)dest[3]);
+ gf_vect_mul_vsx(len, &gftbls[4 * 32 * vlen], src[0], (unsigned char *)dest[4]);
+ gf_vect_mul_vsx(len, &gftbls[5 * 32 * vlen], src[0], (unsigned char *)dest[5]);
+
+ for (j = 1; j < vlen; j++) {
+ gf_6vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
+ }
+ return;
+ }
+
+ t0 = (unsigned char *)dest[0];
+ t1 = (unsigned char *)dest[1];
+ t2 = (unsigned char *)dest[2];
+ t3 = (unsigned char *)dest[3];
+ t4 = (unsigned char *)dest[4];
+ t5 = (unsigned char *)dest[5];
+
+ head = len % 64;
+ if (head != 0) {
+ gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
+ gf_vect_dot_prod_base(head, vlen, &gftbls[1 * 32 * vlen], src, t1);
+ gf_vect_dot_prod_base(head, vlen, &gftbls[2 * 32 * vlen], src, t2);
+ gf_vect_dot_prod_base(head, vlen, &gftbls[3 * 32 * vlen], src, t3);
+ gf_vect_dot_prod_base(head, vlen, &gftbls[4 * 32 * vlen], src, t4);
+ gf_vect_dot_prod_base(head, vlen, &gftbls[5 * 32 * vlen], src, t5);
+ }
+
+ for (i = head; i < len - 63; i += 64) {
+ vY1 = vY1 ^ vY1;
+ vY2 = vY2 ^ vY2;
+ vY3 = vY3 ^ vY3;
+ vY4 = vY4 ^ vY4;
+ vY5 = vY5 ^ vY5;
+ vY6 = vY6 ^ vY6;
+ vY7 = vY7 ^ vY7;
+ vY8 = vY8 ^ vY8;
+ vY9 = vY9 ^ vY9;
+ vYA = vYA ^ vYA;
+ vYB = vYB ^ vYB;
+ vYC = vYC ^ vYC;
+
+ vYD = vYD ^ vYD;
+ vYE = vYE ^ vYE;
+ vYF = vYF ^ vYF;
+ vYG = vYG ^ vYG;
+ vYH = vYH ^ vYH;
+ vYI = vYI ^ vYI;
+ vYJ = vYJ ^ vYJ;
+ vYK = vYK ^ vYK;
+ vYL = vYL ^ vYL;
+ vYM = vYM ^ vYM;
+ vYN = vYN ^ vYN;
+ vYO = vYO ^ vYO;
+
+ unsigned char *g0 = &gftbls[0 * 32 * vlen];
+ unsigned char *g1 = &gftbls[1 * 32 * vlen];
+ unsigned char *g2 = &gftbls[2 * 32 * vlen];
+ unsigned char *g3 = &gftbls[3 * 32 * vlen];
+ unsigned char *g4 = &gftbls[4 * 32 * vlen];
+ unsigned char *g5 = &gftbls[5 * 32 * vlen];
+
+ for (j = 0; j < vlen; j++) {
+ s = (unsigned char *)src[j];
+ vX1 = vec_xl(0, s + i);
+ vX2 = vec_xl(16, s + i);
+ vX3 = vec_xl(32, s + i);
+ vX4 = vec_xl(48, s + i);
+
+ vlo0 = EC_vec_xl(0, g0);
+ vhi0 = EC_vec_xl(16, g0);
+ vlo1 = EC_vec_xl(0, g1);
+ vhi1 = EC_vec_xl(16, g1);
+
+ vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+ vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+ vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+ vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+ vlo2 = EC_vec_xl(0, g2);
+ vhi2 = EC_vec_xl(16, g2);
+ vlo3 = EC_vec_xl(0, g3);
+ vhi3 = EC_vec_xl(16, g3);
+
+ vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+ vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+ vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+ vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+ vlo4 = EC_vec_xl(0, g4);
+ vhi4 = EC_vec_xl(16, g4);
+ vlo5 = EC_vec_xl(0, g5);
+ vhi5 = EC_vec_xl(16, g5);
+
+ vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+ vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+ vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+ vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+ vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
+ vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
+ vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
+ vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
+
+ vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
+ vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
+ vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
+ vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
+
+ vYB = vYB ^ EC_vec_permxor(vhi5, vlo5, vX1);
+ vYC = vYC ^ EC_vec_permxor(vhi5, vlo5, vX2);
+ vYN = vYN ^ EC_vec_permxor(vhi5, vlo5, vX3);
+ vYO = vYO ^ EC_vec_permxor(vhi5, vlo5, vX4);
+
+ g0 += 32;
+ g1 += 32;
+ g2 += 32;
+ g3 += 32;
+ g4 += 32;
+ g5 += 32;
+ }
+
+ vec_xst(vY1, 0, t0 + i);
+ vec_xst(vY2, 16, t0 + i);
+ vec_xst(vY3, 0, t1 + i);
+ vec_xst(vY4, 16, t1 + i);
+ vec_xst(vY5, 0, t2 + i);
+ vec_xst(vY6, 16, t2 + i);
+ vec_xst(vY7, 0, t3 + i);
+ vec_xst(vY8, 16, t3 + i);
+ vec_xst(vY9, 0, t4 + i);
+ vec_xst(vYA, 16, t4 + i);
+ vec_xst(vYB, 0, t5 + i);
+ vec_xst(vYC, 16, t5 + i);
+
+ vec_xst(vYD, 32, t0 + i);
+ vec_xst(vYE, 48, t0 + i);
+ vec_xst(vYF, 32, t1 + i);
+ vec_xst(vYG, 48, t1 + i);
+ vec_xst(vYH, 32, t2 + i);
+ vec_xst(vYI, 48, t2 + i);
+ vec_xst(vYJ, 32, t3 + i);
+ vec_xst(vYK, 48, t3 + i);
+ vec_xst(vYL, 32, t4 + i);
+ vec_xst(vYM, 48, t4 + i);
+ vec_xst(vYN, 32, t5 + i);
+ vec_xst(vYO, 48, t5 + i);
+ }
+ return;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_6vect_mad_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_6vect_mad_vsx.c
new file mode 100644
index 000000000..43ea6c696
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_6vect_mad_vsx.c
@@ -0,0 +1,142 @@
+#include "ec_base_vsx.h"
+
+void gf_6vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char **dest)
+{
+ unsigned char *s, *t0, *t1, *t2, *t3, *t4, *t5;
+ vector unsigned char vX1, vX2, vX3, vX4;
+ vector unsigned char vY1, vY2, vY3, vY4, vY5, vY6, vY7, vY8, vY9, vYA, vYB, vYC;
+ vector unsigned char vYD, vYE, vYF, vYG, vYH, vYI, vYJ, vYK, vYL, vYM, vYN, vYO;
+ vector unsigned char vhi0, vlo0, vhi1, vlo1, vhi2, vlo2;
+ vector unsigned char vhi3, vlo3, vhi4, vlo4, vhi5, vlo5;
+ int i, head;
+
+ s = (unsigned char *)src;
+ t0 = (unsigned char *)dest[0];
+ t1 = (unsigned char *)dest[1];
+ t2 = (unsigned char *)dest[2];
+ t3 = (unsigned char *)dest[3];
+ t4 = (unsigned char *)dest[4];
+ t5 = (unsigned char *)dest[5];
+
+ head = len % 64;
+ if (head != 0) {
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, t0);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[1 * 32 * vec], src, t1);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[2 * 32 * vec], src, t2);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[3 * 32 * vec], src, t3);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[4 * 32 * vec], src, t4);
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[5 * 32 * vec], src, t5);
+ }
+
+ vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+ vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+ vlo1 = EC_vec_xl(0, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+ vhi1 = EC_vec_xl(16, gftbls + (((1 * vec) << 5) + (vec_i << 5)));
+ vlo2 = EC_vec_xl(0, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+ vhi2 = EC_vec_xl(16, gftbls + (((2 * vec) << 5) + (vec_i << 5)));
+ vlo3 = EC_vec_xl(0, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
+ vhi3 = EC_vec_xl(16, gftbls + (((3 * vec) << 5) + (vec_i << 5)));
+ vlo4 = EC_vec_xl(0, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
+ vhi4 = EC_vec_xl(16, gftbls + (((4 * vec) << 5) + (vec_i << 5)));
+ vlo5 = EC_vec_xl(0, gftbls + (((5 * vec) << 5) + (vec_i << 5)));
+ vhi5 = EC_vec_xl(16, gftbls + (((5 * vec) << 5) + (vec_i << 5)));
+
+ for (i = head; i < len - 63; i += 64) {
+ vX1 = vec_xl(0, s + i);
+ vX2 = vec_xl(16, s + i);
+ vX3 = vec_xl(32, s + i);
+ vX4 = vec_xl(48, s + i);
+
+ vY1 = vec_xl(0, t0 + i);
+ vY2 = vec_xl(16, t0 + i);
+ vYD = vec_xl(32, t0 + i);
+ vYE = vec_xl(48, t0 + i);
+
+ vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+ vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+ vYD = vYD ^ EC_vec_permxor(vhi0, vlo0, vX3);
+ vYE = vYE ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+ vec_xst(vY1, 0, t0 + i);
+ vec_xst(vY2, 16, t0 + i);
+ vec_xst(vYD, 32, t0 + i);
+ vec_xst(vYE, 48, t0 + i);
+
+ vY3 = vec_xl(0, t1 + i);
+ vY4 = vec_xl(16, t1 + i);
+ vYF = vec_xl(32, t1 + i);
+ vYG = vec_xl(48, t1 + i);
+
+ vY3 = vY3 ^ EC_vec_permxor(vhi1, vlo1, vX1);
+ vY4 = vY4 ^ EC_vec_permxor(vhi1, vlo1, vX2);
+ vYF = vYF ^ EC_vec_permxor(vhi1, vlo1, vX3);
+ vYG = vYG ^ EC_vec_permxor(vhi1, vlo1, vX4);
+
+ vec_xst(vY3, 0, t1 + i);
+ vec_xst(vY4, 16, t1 + i);
+ vec_xst(vYF, 32, t1 + i);
+ vec_xst(vYG, 48, t1 + i);
+
+ vY5 = vec_xl(0, t2 + i);
+ vY6 = vec_xl(16, t2 + i);
+ vYH = vec_xl(32, t2 + i);
+ vYI = vec_xl(48, t2 + i);
+
+ vY5 = vY5 ^ EC_vec_permxor(vhi2, vlo2, vX1);
+ vY6 = vY6 ^ EC_vec_permxor(vhi2, vlo2, vX2);
+ vYH = vYH ^ EC_vec_permxor(vhi2, vlo2, vX3);
+ vYI = vYI ^ EC_vec_permxor(vhi2, vlo2, vX4);
+
+ vY7 = vec_xl(0, t3 + i);
+ vY8 = vec_xl(16, t3 + i);
+ vYJ = vec_xl(32, t3 + i);
+ vYK = vec_xl(48, t3 + i);
+
+ vec_xst(vY5, 0, t2 + i);
+ vec_xst(vY6, 16, t2 + i);
+ vec_xst(vYH, 32, t2 + i);
+ vec_xst(vYI, 48, t2 + i);
+
+ vY7 = vY7 ^ EC_vec_permxor(vhi3, vlo3, vX1);
+ vY8 = vY8 ^ EC_vec_permxor(vhi3, vlo3, vX2);
+ vYJ = vYJ ^ EC_vec_permxor(vhi3, vlo3, vX3);
+ vYK = vYK ^ EC_vec_permxor(vhi3, vlo3, vX4);
+
+ vY9 = vec_xl(0, t4 + i);
+ vYA = vec_xl(16, t4 + i);
+ vYL = vec_xl(32, t4 + i);
+ vYM = vec_xl(48, t4 + i);
+
+ vec_xst(vY7, 0, t3 + i);
+ vec_xst(vY8, 16, t3 + i);
+ vec_xst(vYJ, 32, t3 + i);
+ vec_xst(vYK, 48, t3 + i);
+
+ vY9 = vY9 ^ EC_vec_permxor(vhi4, vlo4, vX1);
+ vYA = vYA ^ EC_vec_permxor(vhi4, vlo4, vX2);
+ vYL = vYL ^ EC_vec_permxor(vhi4, vlo4, vX3);
+ vYM = vYM ^ EC_vec_permxor(vhi4, vlo4, vX4);
+
+ vYB = vec_xl(0, t5 + i);
+ vYC = vec_xl(16, t5 + i);
+ vYN = vec_xl(32, t5 + i);
+ vYO = vec_xl(48, t5 + i);
+
+ vec_xst(vY9, 0, t4 + i);
+ vec_xst(vYA, 16, t4 + i);
+ vec_xst(vYL, 32, t4 + i);
+ vec_xst(vYM, 48, t4 + i);
+
+ vYB = vYB ^ EC_vec_permxor(vhi5, vlo5, vX1);
+ vYC = vYC ^ EC_vec_permxor(vhi5, vlo5, vX2);
+ vYN = vYN ^ EC_vec_permxor(vhi5, vlo5, vX3);
+ vYO = vYO ^ EC_vec_permxor(vhi5, vlo5, vX4);
+
+ vec_xst(vYB, 0, t5 + i);
+ vec_xst(vYC, 16, t5 + i);
+ vec_xst(vYN, 32, t5 + i);
+ vec_xst(vYO, 48, t5 + i);
+ }
+ return;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_vect_dot_prod_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_vect_dot_prod_vsx.c
new file mode 100644
index 000000000..2f97e3421
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_vect_dot_prod_vsx.c
@@ -0,0 +1,85 @@
+#include "ec_base_vsx.h"
+
+void gf_vect_dot_prod_vsx(int len, int vlen, unsigned char *gftbls,
+ unsigned char **src, unsigned char *dest)
+{
+ unsigned char *s, *t0;
+ vector unsigned char vX1, vY1;
+ vector unsigned char vX2, vY2;
+ vector unsigned char vX3, vY3;
+ vector unsigned char vX4, vY4;
+ vector unsigned char vX5, vY5;
+ vector unsigned char vX6, vY6;
+ vector unsigned char vX7, vY7;
+ vector unsigned char vX8, vY8;
+ vector unsigned char vhi0, vlo0;
+ int i, j, head;
+
+ if (vlen < 128) {
+ gf_vect_mul_vsx(len, &gftbls[0 * 32 * vlen], src[0], (unsigned char *)dest);
+
+ for (j = 1; j < vlen; j++) {
+ gf_vect_mad_vsx(len, vlen, j, gftbls, src[j], dest);
+ }
+ return;
+ }
+
+ t0 = (unsigned char *)dest;
+
+ head = len % 128;
+ if (head != 0) {
+ gf_vect_dot_prod_base(head, vlen, &gftbls[0 * 32 * vlen], src, t0);
+ }
+
+ for (i = head; i < len - 127; i += 128) {
+ vY1 = vY1 ^ vY1;
+ vY2 = vY2 ^ vY2;
+ vY3 = vY3 ^ vY3;
+ vY4 = vY4 ^ vY4;
+
+ vY5 = vY5 ^ vY5;
+ vY6 = vY6 ^ vY6;
+ vY7 = vY7 ^ vY7;
+ vY8 = vY8 ^ vY8;
+
+ unsigned char *g0 = &gftbls[0 * 32 * vlen];
+
+ for (j = 0; j < vlen; j++) {
+ s = (unsigned char *)src[j];
+ vX1 = vec_xl(0, s + i);
+ vX2 = vec_xl(16, s + i);
+ vX3 = vec_xl(32, s + i);
+ vX4 = vec_xl(48, s + i);
+
+ vlo0 = EC_vec_xl(0, g0);
+ vhi0 = EC_vec_xl(16, g0);
+
+ vX5 = vec_xl(64, s + i);
+ vX6 = vec_xl(80, s + i);
+ vX7 = vec_xl(96, s + i);
+ vX8 = vec_xl(112, s + i);
+
+ vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+ vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+ vY3 = vY3 ^ EC_vec_permxor(vhi0, vlo0, vX3);
+ vY4 = vY4 ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+ vY5 = vY5 ^ EC_vec_permxor(vhi0, vlo0, vX5);
+ vY6 = vY6 ^ EC_vec_permxor(vhi0, vlo0, vX6);
+ vY7 = vY7 ^ EC_vec_permxor(vhi0, vlo0, vX7);
+ vY8 = vY8 ^ EC_vec_permxor(vhi0, vlo0, vX8);
+
+ g0 += 32;
+ }
+ vec_xst(vY1, 0, t0 + i);
+ vec_xst(vY2, 16, t0 + i);
+ vec_xst(vY3, 32, t0 + i);
+ vec_xst(vY4, 48, t0 + i);
+
+ vec_xst(vY5, 64, t0 + i);
+ vec_xst(vY6, 80, t0 + i);
+ vec_xst(vY7, 96, t0 + i);
+ vec_xst(vY8, 112, t0 + i);
+ }
+ return;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_vect_mad_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_vect_mad_vsx.c
new file mode 100644
index 000000000..a4810b96d
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_vect_mad_vsx.c
@@ -0,0 +1,48 @@
+#include "ec_base_vsx.h"
+
+void gf_vect_mad_vsx(int len, int vec, int vec_i, unsigned char *gftbls,
+ unsigned char *src, unsigned char *dest)
+{
+ unsigned char *s, *t0;
+ vector unsigned char vX1, vY1;
+ vector unsigned char vX2, vY2;
+ vector unsigned char vX3, vY3;
+ vector unsigned char vX4, vY4;
+ vector unsigned char vhi0, vlo0;
+ int i, head;
+
+ s = (unsigned char *)src;
+ t0 = (unsigned char *)dest;
+
+ head = len % 64;
+ if (head != 0) {
+ gf_vect_mad_base(head, vec, vec_i, &gftbls[0 * 32 * vec], src, dest);
+ }
+
+ vlo0 = EC_vec_xl(0, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+ vhi0 = EC_vec_xl(16, gftbls + (((0 * vec) << 5) + (vec_i << 5)));
+
+ for (i = head; i < len - 63; i += 64) {
+ vX1 = vec_xl(0, s + i);
+ vX2 = vec_xl(16, s + i);
+ vX3 = vec_xl(32, s + i);
+ vX4 = vec_xl(48, s + i);
+
+ vY1 = vec_xl(0, t0 + i);
+ vY2 = vec_xl(16, t0 + i);
+ vY3 = vec_xl(32, t0 + i);
+ vY4 = vec_xl(48, t0 + i);
+
+ vY1 = vY1 ^ EC_vec_permxor(vhi0, vlo0, vX1);
+ vY2 = vY2 ^ EC_vec_permxor(vhi0, vlo0, vX2);
+ vY3 = vY3 ^ EC_vec_permxor(vhi0, vlo0, vX3);
+ vY4 = vY4 ^ EC_vec_permxor(vhi0, vlo0, vX4);
+
+ vec_xst(vY1, 0, t0 + i);
+ vec_xst(vY2, 16, t0 + i);
+ vec_xst(vY3, 32, t0 + i);
+ vec_xst(vY4, 48, t0 + i);
+ }
+
+ return;
+}
diff --git a/src/isa-l/erasure_code/ppc64le/gf_vect_mul_vsx.c b/src/isa-l/erasure_code/ppc64le/gf_vect_mul_vsx.c
new file mode 100644
index 000000000..3e610a104
--- /dev/null
+++ b/src/isa-l/erasure_code/ppc64le/gf_vect_mul_vsx.c
@@ -0,0 +1,61 @@
+#include "ec_base_vsx.h"
+
+void gf_vect_mul_vsx(int len, unsigned char *gftbl, unsigned char *src, unsigned char *dest)
+{
+ unsigned char *s, *t0;
+ vector unsigned char vX1, vY1;
+ vector unsigned char vX2, vY2;
+ vector unsigned char vX3, vY3;
+ vector unsigned char vX4, vY4;
+ vector unsigned char vX5, vY5;
+ vector unsigned char vX6, vY6;
+ vector unsigned char vX7, vY7;
+ vector unsigned char vX8, vY8;
+ vector unsigned char vhi0, vlo0;
+ int i, head;
+
+ s = (unsigned char *)src;
+ t0 = (unsigned char *)dest;
+
+ head = len % 128;
+ if (head != 0) {
+ gf_vect_mul_base(head, gftbl, src, dest);
+ }
+
+ vlo0 = EC_vec_xl(0, gftbl);
+ vhi0 = EC_vec_xl(16, gftbl);
+
+ for (i = head; i < len - 127; i += 128) {
+ vX1 = vec_xl(0, s + i);
+ vX2 = vec_xl(16, s + i);
+ vX3 = vec_xl(32, s + i);
+ vX4 = vec_xl(48, s + i);
+
+ vX5 = vec_xl(64, s + i);
+ vX6 = vec_xl(80, s + i);
+ vX7 = vec_xl(96, s + i);
+ vX8 = vec_xl(112, s + i);
+
+ vY1 = EC_vec_permxor(vhi0, vlo0, vX1);
+ vY2 = EC_vec_permxor(vhi0, vlo0, vX2);
+ vY3 = EC_vec_permxor(vhi0, vlo0, vX3);
+ vY4 = EC_vec_permxor(vhi0, vlo0, vX4);
+
+ vY5 = EC_vec_permxor(vhi0, vlo0, vX5);
+ vY6 = EC_vec_permxor(vhi0, vlo0, vX6);
+ vY7 = EC_vec_permxor(vhi0, vlo0, vX7);
+ vY8 = EC_vec_permxor(vhi0, vlo0, vX8);
+
+ vec_xst(vY1, 0, t0 + i);
+ vec_xst(vY2, 16, t0 + i);
+ vec_xst(vY3, 32, t0 + i);
+ vec_xst(vY4, 48, t0 + i);
+
+ vec_xst(vY5, 64, t0 + i);
+ vec_xst(vY6, 80, t0 + i);
+ vec_xst(vY7, 96, t0 + i);
+ vec_xst(vY8, 112, t0 + i);
+ }
+
+ return;
+}
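
[Editor's note] gf_vect_mul_vsx() is reached through the gf_vect_mul() wrapper defined in ec_base_vsx.c above. The sketch below shows a typical call; it is illustrative only and assumes the 32-byte table comes from gf_vect_mul_init() in erasure_code.h, with len taken here to be a multiple of 32 bytes.

#include "erasure_code.h"

/* Illustrative only: scale a buffer by the GF(2^8) constant c through the
 * public wrapper, which dispatches to gf_vect_mul_vsx() above. The 32-byte
 * table is produced by gf_vect_mul_init(); len is assumed to be a multiple
 * of 32 bytes. */
static void scale_buffer(unsigned char c, unsigned char *src,
                         unsigned char *dest, int len)
{
        unsigned char gftbl[32];

        gf_vect_mul_init(c, gftbl);
        gf_vect_mul(len, gftbl, src, dest);
}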