72 files changed, 26488 insertions, 0 deletions
diff --git a/src/erasure-code/jerasure/gf-complete/.gitignore b/src/erasure-code/jerasure/gf-complete/.gitignore
new file mode 100644
index 000000000..bfc1dfc10
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/.gitignore
@@ -0,0 +1,78 @@
+Makefile
+Makefile.in
+/autom4te.cache
+/aclocal.m4
+/compile
+/configure
+/depcomp
+/install-sh
+/missing
+include/config.h
+include/config.h.in
+include/config.h.in~
+include/stamp-h1
+
+# Object files
+*.o
+*.ko
+*.obj
+*.elf
+
+# Libraries
+*.lib
+*.la
+*.a
+
+# Shared objects (inc. Windows DLLs)
+*.dll
+*.lo
+*.so
+*.so.*
+*.dylib
+
+# Executables
+*.exe
+*.out
+*.app
+*.i*86
+*.x86_64
+*.hex
+
+# Other stuff
+.deps/
+.libs/
+/config.log
+/config.status
+/libtool
+INSTALL
+config.guess
+config.sub
+ltmain.sh
+m4/libtool.m4
+m4/ltversion.m4
+m4/ltoptions.m4
+m4/ltsugar.m4
+m4/lt~obsolete.m4
+test-driver
+src/.dirstamp
+test-driver
+
+examples/gf_example_1
+examples/gf_example_2
+examples/gf_example_3
+examples/gf_example_4
+examples/gf_example_5
+examples/gf_example_6
+examples/gf_example_7
+test/gf_unit
+tools/gf_add
+tools/gf_div
+tools/gf_inline_time
+tools/gf_methods
+tools/gf_mult
+tools/gf_poly
+tools/gf_time
+tools/gf_unit_w*
+tools/test-suite.log
+tools/.qemu/
+tools/test_simd*.results*
diff --git a/src/erasure-code/jerasure/gf-complete/AUTHORS b/src/erasure-code/jerasure/gf-complete/AUTHORS
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/AUTHORS
diff --git a/src/erasure-code/jerasure/gf-complete/COPYING b/src/erasure-code/jerasure/gf-complete/COPYING
new file mode 100644
index 000000000..df8d9ed33
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/COPYING
@@ -0,0 +1,32 @@
+Copyright (c) 2013, James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in
+   the documentation and/or other materials provided with the
+   distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/erasure-code/jerasure/gf-complete/ChangeLog b/src/erasure-code/jerasure/gf-complete/ChangeLog
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/ChangeLog
diff --git a/src/erasure-code/jerasure/gf-complete/License.txt b/src/erasure-code/jerasure/gf-complete/License.txt
new file mode 100644
index 000000000..df8d9ed33
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/License.txt
@@ -0,0 +1,32 @@
+Copyright (c) 2013, James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in
+   the documentation and/or other materials provided with the
+   distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/erasure-code/jerasure/gf-complete/Makefile.am b/src/erasure-code/jerasure/gf-complete/Makefile.am
new file mode 100644
index 000000000..cfb293a15
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/Makefile.am
@@ -0,0 +1,10 @@
+# Top-level GF-Complete AM file
+# Distributes headers
+
+SUBDIRS = src tools test examples
+ACLOCAL_AMFLAGS = -I m4
+
+include_HEADERS = include/gf_complete.h include/gf_method.h include/gf_rand.h include/gf_general.h
+
+# display the output of failed TESTS after a failed make check
+export VERBOSE = true
diff --git a/src/erasure-code/jerasure/gf-complete/NEWS b/src/erasure-code/jerasure/gf-complete/NEWS
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/NEWS
diff --git a/src/erasure-code/jerasure/gf-complete/README b/src/erasure-code/jerasure/gf-complete/README
new file mode 100644
index 000000000..7fd2f0494
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/README
@@ -0,0 +1,21 @@
+This is GF-Complete, Revision 1.03.   January 1, 2015.
+
+Authors: James S. Plank (University of Tennessee)
+         Ethan L. Miller (UC Santa Cruz)
+         Kevin M. Greenan (Box)
+         Benjamin A. Arnold (University of Tennessee)
+         John A. Burnum (University of Tennessee)
+         Adam W. Disney (University of Tennessee)
+         Allen C. McBride (University of Tennessee)
+
+The user's manual is in the file Manual.pdf.  
+
+The online home for GF-Complete is:
+
+  - https://jerasure.org/jerasure/gf-complete
+
+To compile, do:
+
+   ./configure
+   make
+   sudo make install
diff --git a/src/erasure-code/jerasure/gf-complete/README.txt b/src/erasure-code/jerasure/gf-complete/README.txt
new file mode 100644
index 000000000..cd2d66e1e
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/README.txt
@@ -0,0 +1,21 @@
+This is GF-Complete, Revision 1.03.   January 1, 2015.
+
+Authors: James S. Plank (University of Tennessee)
+         Ethan L. Miller (UC Santa Cruz)
+         Kevin M. Greenan (Box)
+         Benjamin A. Arnold (University of Tennessee)
+         John A. Burnum (University of Tennessee)
+         Adam W. Disney (University of Tennessee)
+         Allen C. McBride (University of Tennessee)
+
+The user's manual is in the file Manual.pdf.  
+
+The online home for GF-Complete is:
+
+  - http://jerasure.org/jerasure/gf-complete
+
+To compile, do:
+
+   ./configure
+   make
+   sudo make install
diff --git a/src/erasure-code/jerasure/gf-complete/autogen.sh b/src/erasure-code/jerasure/gf-complete/autogen.sh
new file mode 100755
index 000000000..b483139f9
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/autogen.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+autoreconf --force --install -I m4
diff --git a/src/erasure-code/jerasure/gf-complete/configure.ac b/src/erasure-code/jerasure/gf-complete/configure.ac
new file mode 100644
index 000000000..d696f6eb0
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/configure.ac
@@ -0,0 +1,87 @@
+# gf-complete autoconf template
+
+# FIXME - add project url as the last argument
+AC_INIT(gf-complete, 1.0)
+
+# Override default CFLAGS
+: ${CFLAGS="-Wall -Wpointer-arith -O3 -g"}
+
+AC_PREREQ([2.61])
+
+AM_INIT_AUTOMAKE([no-dependencies foreign parallel-tests])
+LT_INIT # libtool
+
+AC_CONFIG_HEADER(include/config.h)
+
+dnl Needed when reconfiguring with 'autoreconf -i -s'
+AC_CONFIG_MACRO_DIR([m4])
+
+# This prevents './configure; make' from trying to run autotools.
+AM_MAINTAINER_MODE([disable])
+
+dnl Compiling with per-target flags requires AM_PROG_CC_C_O.
+AC_PROG_CC
+
+# Check for functions to provide aligned memory
+#
+AC_CHECK_FUNCS([posix_memalign],
+ [found_memalign=yes; break])
+
+AS_IF([test "x$found_memalign" != "xyes"], [AC_MSG_WARN([No function for aligned memory allocation found])])
+
+AC_ARG_ENABLE([debug-func],
+              AS_HELP_STRING([--enable-debug-func], [Enable debugging of functions selected]))
+AS_IF([test "x$enable_debug_func" = "xyes"], [CPPFLAGS="$CPPFLAGS -DDEBUG_FUNCTIONS"]) # set by --enable-debug-func
+
+AC_ARG_ENABLE([debug-cpu],
+              AS_HELP_STRING([--enable-debug-cpu], [Enable debugging of SIMD detection]))
+AS_IF([test "x$enable_debug_cpu" = "xyes"], [CPPFLAGS="$CPPFLAGS -DDEBUG_CPU_DETECTION"])
+
+AX_EXT()
+
+AC_ARG_ENABLE([neon],
+              AS_HELP_STRING([--disable-neon], [Build without NEON optimizations]))
+
+AS_IF([test "x$enable_neon" != "xno"],
+      [noneon_CPPFLAGS=$CPPFLAGS
+       CPPFLAGS="$CPPFLAGS $SIMD_FLAGS"
+       AC_CHECK_HEADER([arm_neon.h],
+                       [have_neon=yes],
+                       [have_neon=no
+                        CPPFLAGS=$noneon_CPPFLAGS])],
+      [have_neon=no
+       AS_IF([test "x$ax_cv_have_neon_ext" = "xyes"],
+             [SIMD_FLAGS=""])
+      ])
+
+AS_IF([test "x$have_neon" = "xno"],
+      [AS_IF([test "x$enable_neon" = "xyes"],
+             [AC_MSG_ERROR([neon requested but arm_neon.h not found])])
+      ])
+AM_CONDITIONAL([HAVE_NEON], [test "x$have_neon" = "xyes"])
+
+AC_ARG_ENABLE([sse],
+              AS_HELP_STRING([--disable-sse], [Build without SSE optimizations]),
+              [if   test "x$enableval" = "xno" ; then
+                SIMD_FLAGS=""
+                echo "DISABLED SSE!!!"
+              fi]
+)
+
+AC_ARG_ENABLE([valgrind],
+            [AS_HELP_STRING([--enable-valgrind], [run tests with valgrind])],
+            [],
+            [enable_valgrind=no])
+AM_CONDITIONAL(ENABLE_VALGRIND, test "x$enable_valgrind" != xno)
+
+AC_ARG_ENABLE([avx], AS_HELP_STRING([--enable-avx], [Build with AVX optimizations]))
+AX_CHECK_COMPILE_FLAG(-mavx, [ax_cv_support_avx=yes], [ax_cv_support_avx=no]) # records yes or no for the test below
+
+AS_IF([test "x$enable_avx" = "xyes"],
+      [AS_IF([test "x$ax_cv_support_avx" = "xno"],
+             [AC_MSG_ERROR([AVX requested but compiler does not support -mavx])],
+             [SIMD_FLAGS="$SIMD_FLAGS -mavx"])
+      ])
+
+AC_CONFIG_FILES([Makefile src/Makefile tools/Makefile test/Makefile examples/Makefile])
+AC_OUTPUT
diff --git a/src/erasure-code/jerasure/gf-complete/examples/Makefile.am b/src/erasure-code/jerasure/gf-complete/examples/Makefile.am
new file mode 100644
index 000000000..a420bda84
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/examples/Makefile.am
@@ -0,0 +1,37 @@
+# GF-Complete 'examples' AM file
+
+AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
+AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
+
+bin_PROGRAMS = gf_example_1 gf_example_2 gf_example_3 gf_example_4 \
+               gf_example_5 gf_example_6 gf_example_7
+
+gf_example_1_SOURCES = gf_example_1.c
+#gf_example_1_LDFLAGS = -lgf_complete
+gf_example_1_LDADD = ../src/libgf_complete.la
+
+gf_example_2_SOURCES = gf_example_2.c
+#gf_example_2_LDFLAGS = -lgf_complete
+gf_example_2_LDADD = ../src/libgf_complete.la
+
+gf_example_3_SOURCES = gf_example_3.c
+#gf_example_3_LDFLAGS = -lgf_complete
+gf_example_3_LDADD = ../src/libgf_complete.la
+
+gf_example_4_SOURCES = gf_example_4.c
+#gf_example_4_LDFLAGS = -lgf_complete
+gf_example_4_LDADD = ../src/libgf_complete.la
+
+gf_example_5_SOURCES = gf_example_5.c
+#gf_example_5_LDFLAGS = -lgf_complete
+gf_example_5_LDADD = ../src/libgf_complete.la
+
+gf_example_6_SOURCES = gf_example_6.c
+#gf_example_6_LDFLAGS = -lgf_complete
+gf_example_6_LDADD = ../src/libgf_complete.la
+
+gf_example_7_SOURCES = gf_example_7.c
+#gf_example_7_LDFLAGS = -lgf_complete
+gf_example_7_LDADD = ../src/libgf_complete.la
+
+
diff --git a/src/erasure-code/jerasure/gf-complete/examples/gf_example_1.c b/src/erasure-code/jerasure/gf-complete/examples/gf_example_1.c
new file mode 100644
index 000000000..a7a415595
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/examples/gf_example_1.c
@@ -0,0 +1,58 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_example_1.c
+ *
+ * Demonstrates using the procedures for examples in GF(2^w) for w <= 32.
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "gf_complete.h"
+#include "gf_rand.h"
+
+void usage(char *s)
+{
+  fprintf(stderr, "usage: gf_example_1 w - w must be between 1 and 32\n");
+  exit(1);
+}
+
+int main(int argc, char **argv)
+{
+  uint32_t a, b, c;
+  int w;
+  gf_t gf;
+
+  if (argc != 2) usage(NULL);
+  w = atoi(argv[1]);
+  if (w <= 0 || w > 32) usage("Bad w");
+
+  /* Get two random numbers in a and b */
+
+  MOA_Seed(time(0));
+  a = MOA_Random_W(w, 0);
+  b = MOA_Random_W(w, 0);
+ 
+  /* Create the proper instance of the gf_t object using defaults: */
+
+  gf_init_easy(&gf, w);
+
+  /* And multiply a and b using the galois field: */
+
+  c = gf.multiply.w32(&gf, a, b);
+  printf("%u * %u = %u\n", a, b, c); 
+
+  /* Divide the product by a and b */
+
+  printf("%u / %u = %u\n", c, a, gf.divide.w32(&gf, c, a));
+  printf("%u / %u = %u\n", c, b, gf.divide.w32(&gf, c, b));
+  
+  exit(0);
+}
diff --git a/src/erasure-code/jerasure/gf-complete/examples/gf_example_2.c b/src/erasure-code/jerasure/gf-complete/examples/gf_example_2.c
new file mode 100644
index 000000000..576d9a534
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/examples/gf_example_2.c
@@ -0,0 +1,107 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_example_2.c
+ *
+ * Demonstrates using the procedures for examples in GF(2^w) for w <= 32.
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "gf_complete.h"
+#include "gf_rand.h"
+
+void usage(char *s)
+{
+  fprintf(stderr, "usage: gf_example_2 w - w must be between 1 and 32\n");
+  exit(1);
+}
+
+int main(int argc, char **argv)
+{
+  uint32_t a, b, c;
+  uint8_t *r1, *r2;
+  uint16_t *r16 = NULL;
+  uint32_t *r32 = NULL;
+  int w, i;
+  gf_t gf;
+
+  if (argc != 2) usage(NULL);
+  w = atoi(argv[1]);
+  if (w <= 0 || w > 32) usage("Bad w");
+
+  /* Get two random numbers in a and b */
+
+  MOA_Seed(time(0));
+  a = MOA_Random_W(w, 0);
+  b = MOA_Random_W(w, 0);
+
+  /* Create the proper instance of the gf_t object using defaults: */
+
+  gf_init_easy(&gf, w);
+
+  /* And multiply a and b using the galois field: */
+
+  c = gf.multiply.w32(&gf, a, b);
+  printf("%u * %u = %u\n", a, b, c);
+
+  /* Divide the product by a and b */
+
+  printf("%u / %u = %u\n", c, a, gf.divide.w32(&gf, c, a));
+  printf("%u / %u = %u\n", c, b, gf.divide.w32(&gf, c, b));
+
+  /* If w is 4, 8, 16 or 32, do a very small region operation */
+
+  if (w == 4 || w == 8 || w == 16 || w == 32) {
+    r1 = (uint8_t *) malloc(16);
+    r2 = (uint8_t *) malloc(16);
+
+    if (w == 4 || w == 8) {
+      r1[0] = b;
+      for (i = 1; i < 16; i++) r1[i] = MOA_Random_W(8, 1);
+    } else if (w == 16) {
+      r16 = (uint16_t *) r1;
+      r16[0] = b;
+      for (i = 1; i < 8; i++) r16[i] = MOA_Random_W(16, 1);
+    } else {
+      r32 = (uint32_t *) r1;
+      r32[0] = b;
+      for (i = 1; i < 4; i++) r32[i] = MOA_Random_W(32, 1);
+    }
+
+    gf.multiply_region.w32(&gf, r1, r2, a, 16, 0);
+  
+    printf("\nmultiply_region by 0x%x (%u)\n\n", a, a);
+    printf("R1 (the source):  ");
+    if (w == 4) {
+      for (i = 0; i < 16; i++) printf(" %x %x", r1[i] >> 4, r1[i] & 0xf);
+    } else if (w == 8) {
+      for (i = 0; i < 16; i++) printf(" %02x", r1[i]);
+    } else if (w == 16) {
+      for (i = 0; i < 8; i++) printf(" %04x", r16[i]);
+    } else if (w == 32) {
+      for (i = 0; i < 4; i++) printf(" %08x", r32[i]);
+    }
+    printf("\nR2 (the product): ");
+    if (w == 4) {
+      for (i = 0; i < 16; i++) printf(" %x %x", r2[i] >> 4, r2[i] & 0xf);
+    } else if (w == 8) {
+      for (i = 0; i < 16; i++) printf(" %02x", r2[i]);
+    } else if (w == 16) {
+      r16 = (uint16_t *) r2;
+      for (i = 0; i < 8; i++) printf(" %04x", r16[i]);
+    } else if (w == 32) {
+      r32 = (uint32_t *) r2;
+      for (i = 0; i < 4; i++) printf(" %08x", r32[i]);
+    }
+    printf("\n");
+  }
+  exit(0);
+}
diff --git a/src/erasure-code/jerasure/gf-complete/examples/gf_example_3.c b/src/erasure-code/jerasure/gf-complete/examples/gf_example_3.c
new file mode 100644
index 000000000..d6fef879e
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/examples/gf_example_3.c
@@ -0,0 +1,74 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_example_3.c
+ *
+ * Identical to example_2 except it works in GF(2^64)
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "gf_complete.h"
+#include "gf_rand.h"
+
+void usage(char *s)
+{
+  fprintf(stderr, "usage: gf_example_3\n");
+  exit(1);
+}
+
+int main(int argc, char **argv)
+{
+  uint64_t a, b, c;
+  uint64_t *r1, *r2;
+  int i;
+  gf_t gf;
+
+  if (argc != 1) usage(NULL);
+
+  /* Get two random numbers in a and b */
+
+  MOA_Seed(time(0));
+  a = MOA_Random_64();
+  b = MOA_Random_64();
+
+  /* Create the proper instance of the gf_t object using defaults: */
+
+  gf_init_easy(&gf, 64);
+
+  /* And multiply a and b using the galois field: */
+
+  c = gf.multiply.w64(&gf, a, b);
+  printf("%llx * %llx = %llx\n", (long long unsigned int) a, (long long unsigned int) b, (long long unsigned int) c);
+
+  /* Divide the product by a and b */
+
+  printf("%llx / %llx = %llx\n", (long long unsigned int) c, (long long unsigned int) a, (long long unsigned int) gf.divide.w64(&gf, c, a));
+  printf("%llx / %llx = %llx\n", (long long unsigned int) c, (long long unsigned int) b, (long long unsigned int) gf.divide.w64(&gf, c, b));
+
+  r1 = (uint64_t *) malloc(32);
+  r2 = (uint64_t *) malloc(32);
+
+  r1[0] = b;
+
+  for (i = 1; i < 4; i++) r1[i] = MOA_Random_64();
+
+  gf.multiply_region.w64(&gf, r1, r2, a, 32, 0);
+
+  printf("\nmultiply_region by %llx\n\n", (long long unsigned int) a);
+  printf("R1 (the source):  ");
+  for (i = 0; i < 4; i++) printf(" %016llx", (long long unsigned int) r1[i]);
+
+  printf("\nR2 (the product): ");
+  for (i = 0; i < 4; i++) printf(" %016llx", (long long unsigned int) r2[i]);
+  printf("\n");
+
+  exit(0);
+}
diff --git a/src/erasure-code/jerasure/gf-complete/examples/gf_example_4.c b/src/erasure-code/jerasure/gf-complete/examples/gf_example_4.c
new file mode 100644
index 000000000..17529b5b0
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/examples/gf_example_4.c
@@ -0,0 +1,69 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_example_4.c
+ *
+ * Identical to example_3 except it works in GF(2^128)
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "gf_complete.h"
+#include "gf_rand.h"
+
+#define LLUI (long long unsigned int) 
+
+void usage(char *s)  /* Print the usage line to stderr and exit(1); s is ignored. */
+{
+  fprintf(stderr, "usage: gf_example_4\n");
+  exit(1);
+}
+
+int main(int argc, char **argv)
+{
+  uint64_t a[2], b[2], c[2];
+  uint64_t *r1, *r2;
+  int i;
+  gf_t gf;
+
+  if (argc != 1) usage(NULL);
+
+  /* Get two random numbers in a and b */
+
+  MOA_Seed(time(0));
+  MOA_Random_128(a);
+  MOA_Random_128(b);
+
+  /* Create the proper instance of the gf_t object using defaults: */
+
+  gf_init_easy(&gf, 128);
+
+  /* And multiply a and b using the galois field: */
+
+  gf.multiply.w128(&gf, a, b, c);
+  printf("%016llx%016llx * %016llx%016llx =\n%016llx%016llx\n", 
+      LLUI a[0], LLUI a[1], LLUI b[0], LLUI b[1], LLUI c[0], LLUI c[1]);
+
+  r1 = (uint64_t *) malloc(32);
+  r2 = (uint64_t *) malloc(32);
+
+  for (i = 0; i < 4; i++) r1[i] = MOA_Random_64();
+
+  gf.multiply_region.w128(&gf, r1, r2, a, 32, 0);
+
+  printf("\nmultiply_region by %016llx%016llx\n\n", LLUI a[0], LLUI a[1]);
+  printf("R1 (the source):  ");
+  for (i = 0; i < 4; i += 2) printf(" %016llx%016llx", LLUI r1[i], LLUI r1[i+1]);
+
+  printf("\nR2 (the product): ");
+  for (i = 0; i < 4; i += 2) printf(" %016llx%016llx", LLUI r2[i], LLUI r2[i+1]);
+  printf("\n");
+  exit(0);
+}
diff --git a/src/erasure-code/jerasure/gf-complete/examples/gf_example_5.c b/src/erasure-code/jerasure/gf-complete/examples/gf_example_5.c
new file mode 100644
index 000000000..da6e9ca68
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/examples/gf_example_5.c
@@ -0,0 +1,78 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_example_5.c
+ *
+ * Demonstrating altmap and extract_word
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "gf_complete.h"
+#include "gf_rand.h"
+
+void usage(char *s)
+{
+  fprintf(stderr, "usage: gf_example_5\n");
+  exit(1);
+}
+
+int main(int argc, char **argv)
+{
+  uint16_t *a, *b;
+  int i, j;
+  gf_t gf;
+
+  if (gf_init_hard(&gf, 16, GF_MULT_SPLIT_TABLE, GF_REGION_ALTMAP, GF_DIVIDE_DEFAULT, 
+                   0, 16, 4, NULL, NULL) == 0) {
+    fprintf(stderr, "gf_init_hard failed\n");
+    exit(1);
+  }
+
+  a = (uint16_t *) malloc(200);
+  b = (uint16_t *) malloc(200);
+
+  a += 6;
+  b += 6;
+
+  MOA_Seed(0);
+
+  for (i = 0; i < 30; i++) a[i] = MOA_Random_W(16, 1);
+
+  gf.multiply_region.w32(&gf, a, b, 0x1234, 30*2, 0);
+
+  printf("a: 0x%lx    b: 0x%lx\n", (unsigned long) a, (unsigned long) b);
+
+  for (i = 0; i < 30; i += 10) {
+    printf("\n");
+    printf("  ");
+    for (j = 0; j < 10; j++) printf(" %4d", i+j);
+    printf("\n");
+
+    printf("a:");
+    for (j = 0; j < 10; j++) printf(" %04x", a[i+j]);
+    printf("\n");
+
+    printf("b:");
+    for (j = 0; j < 10; j++) printf(" %04x", b[i+j]);
+    printf("\n");
+    printf("\n");
+  }
+
+  for (i = 0; i < 15; i ++) {
+    printf("Word %2d: 0x%04x * 0x1234 = 0x%04x    ", i,
+           gf.extract_word.w32(&gf, a, 30*2, i),
+           gf.extract_word.w32(&gf, b, 30*2, i));
+    printf("Word %2d: 0x%04x * 0x1234 = 0x%04x\n", i+15,
+           gf.extract_word.w32(&gf, a, 30*2, i+15),
+           gf.extract_word.w32(&gf, b, 30*2, i+15));
+  }
+  return 0;
+}
diff --git a/src/erasure-code/jerasure/gf-complete/examples/gf_example_6.c b/src/erasure-code/jerasure/gf-complete/examples/gf_example_6.c
new file mode 100644
index 000000000..800a35ffb
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/examples/gf_example_6.c
@@ -0,0 +1,84 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_example_6.c
+ *
+ * Demonstrating altmap and extract_word on a composite field (w=32 built on w=16)
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "gf_complete.h"
+#include "gf_rand.h"
+
+void usage(char *s)
+{
+  fprintf(stderr, "usage: gf_example_6\n");
+  exit(1);
+}
+
+int main(int argc, char **argv)
+{
+  uint32_t *a, *b;
+  int i, j;
+  gf_t gf, gf_16;
+  /* Base field with w=16; it is passed as base_gf to the composite field below. */
+  if (gf_init_hard(&gf_16, 16, GF_MULT_LOG_TABLE, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
+                   0, 0, 0, NULL, NULL) == 0) {
+    fprintf(stderr, "gf_init_hard (16) failed\n");
+    exit(1);
+  }
+  /* w=32 field using GF_MULT_COMPOSITE on top of gf_16, with the ALTMAP region flag. */
+  if (gf_init_hard(&gf, 32, GF_MULT_COMPOSITE, GF_REGION_ALTMAP, GF_DIVIDE_DEFAULT, 
+                   0, 2, 0, &gf_16, NULL) == 0) {
+    fprintf(stderr, "gf_init_hard (32) failed\n");
+    exit(1);
+  }
+
+  a = (uint32_t *) malloc(200);
+  b = (uint32_t *) malloc(200);
+  /* Start both regions 3 words (12 bytes) past the addresses returned by malloc. */
+  a += 3;
+  b += 3;
+
+  MOA_Seed(0);
+
+  for (i = 0; i < 30; i++) a[i] = MOA_Random_W(32, 1);
+  /* Multiply the 30-word (120-byte) source region a by 0x12345678; b is the dest. */
+  gf.multiply_region.w32(&gf, a, b, 0x12345678, 30*4, 0);
+
+  printf("a: 0x%lx    b: 0x%lx\n", (unsigned long) a, (unsigned long) b);
+
+  for (i = 0; i < 30; i += 10) {
+    printf("\n");
+    printf("  ");
+    for (j = 0; j < 10; j++) printf(" %8d", i+j);
+    printf("\n");
+
+    printf("a:");
+    for (j = 0; j < 10; j++) printf(" %08x", a[i+j]);
+    printf("\n");
+
+    printf("b:");
+    for (j = 0; j < 10; j++) printf(" %08x", b[i+j]);
+    printf("\n");
+    printf("\n");
+  }
+  /* Print words 0..29 of both regions through extract_word, two words per line. */
+  for (i = 0; i < 15; i ++) {
+    printf("Word %2d: 0x%08x * 0x12345678 = 0x%08x    ", i,
+           gf.extract_word.w32(&gf, a, 30*4, i),
+           gf.extract_word.w32(&gf, b, 30*4, i));
+    printf("Word %2d: 0x%08x * 0x12345678 = 0x%08x\n", i+15,
+           gf.extract_word.w32(&gf, a, 30*4, i+15),
+           gf.extract_word.w32(&gf, b, 30*4, i+15));
+  }
+  return 0;
+}
diff --git a/src/erasure-code/jerasure/gf-complete/examples/gf_example_7.c b/src/erasure-code/jerasure/gf-complete/examples/gf_example_7.c
new file mode 100644
index 000000000..ee07d5353
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/examples/gf_example_7.c
@@ -0,0 +1,75 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_example_7.c
+ *
+ * Demonstrating extract_word and Cauchy
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "gf_complete.h"
+#include "gf_rand.h"
+
+void usage(char *s)
+{
+  fprintf(stderr, "usage: gf_example_7\n");
+  exit(1);
+}
+
+int main(int argc, char **argv)
+{
+  uint8_t *a, *b;
+  int i, j;
+  gf_t gf;
+
+  if (gf_init_hard(&gf, 3, GF_MULT_TABLE, GF_REGION_CAUCHY, GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL) == 0) {
+    fprintf(stderr, "gf_init_hard failed\n");
+    exit(1);
+  }
+
+  a = (uint8_t *) malloc(3);
+  b = (uint8_t *) malloc(3);
+
+  MOA_Seed(0);
+
+  for (i = 0; i < 3; i++) a[i] = MOA_Random_W(8, 1);
+
+  gf.multiply_region.w32(&gf, a, b, 5, 3, 0);
+
+  printf("a: 0x%lx    b: 0x%lx\n", (unsigned long) a, (unsigned long) b);
+
+  printf("\n");
+  printf("a: 0x%02x 0x%02x 0x%02x\n", a[0], a[1], a[2]);
+  printf("b: 0x%02x 0x%02x 0x%02x\n", b[0], b[1], b[2]);
+  printf("\n");
+
+  printf("a bits:");
+  for (i = 0; i < 3; i++) {
+    printf(" ");
+    for (j = 7; j >= 0; j--) printf("%c", (a[i] & (1 << j)) ? '1' : '0');
+  }
+  printf("\n");
+
+  printf("b bits:");
+  for (i = 0; i < 3; i++) {
+    printf(" ");
+    for (j = 7; j >= 0; j--) printf("%c", (b[i] & (1 << j)) ? '1' : '0');
+  }
+  printf("\n");
+
+  printf("\n");
+  for (i = 0; i < 8; i++) {
+    printf("Word %2d: %d * 5 = %d\n", i,
+           gf.extract_word.w32(&gf, a, 3, i),
+           gf.extract_word.w32(&gf, b, 3, i));
+  }
+  return 0;
+}
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_complete.h b/src/erasure-code/jerasure/gf-complete/include/gf_complete.h
new file mode 100644
index 000000000..c4783e80a
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_complete.h
@@ -0,0 +1,204 @@
+/* 
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_complete.h
+ *
+ * The main include file for gf_complete. 
+ */
+
+#ifndef _GF_COMPLETE_H_
+#define _GF_COMPLETE_H_
+#include <stdint.h>
+
+#ifdef INTEL_SSE4
+  #ifdef __SSE4_2__
+    #include <nmmintrin.h>
+  #endif
+  #ifdef __SSE4_1__
+    #include <smmintrin.h>
+  #endif
+#endif
+
+#ifdef INTEL_SSSE3
+  #include <tmmintrin.h>
+#endif
+
+#ifdef INTEL_SSE2
+  #include <emmintrin.h>
+#endif
+
+#ifdef INTEL_SSE4_PCLMUL
+  #include <wmmintrin.h>
+#endif
+
+#if defined(ARM_NEON)
+  #include <arm_neon.h>
+#endif
+
+
+/* These are the different ways to perform multiplication.
+   Not all are implemented for all values of w.
+   See the paper for an explanation of how they work. */
+
+typedef enum {GF_MULT_DEFAULT,
+              GF_MULT_SHIFT,
+              GF_MULT_CARRY_FREE,
+              GF_MULT_CARRY_FREE_GK,
+              GF_MULT_GROUP,
+              GF_MULT_BYTWO_p,
+              GF_MULT_BYTWO_b,
+              GF_MULT_TABLE,
+              GF_MULT_LOG_TABLE,
+              GF_MULT_LOG_ZERO,
+              GF_MULT_LOG_ZERO_EXT,
+              GF_MULT_SPLIT_TABLE,
+              GF_MULT_COMPOSITE } gf_mult_type_t;
+
+/* These are the different ways to optimize region 
+   operations.  They are bits because you can compose them.
+   Certain optimizations only apply to certain gf_mult_type_t's.  
+   Again, please see documentation for how to use these */
+   
+#define GF_REGION_DEFAULT      (0x0)
+#define GF_REGION_DOUBLE_TABLE (0x1)
+#define GF_REGION_QUAD_TABLE   (0x2)
+#define GF_REGION_LAZY         (0x4)
+#define GF_REGION_SIMD         (0x8)
+#define GF_REGION_SSE          (0x8)
+#define GF_REGION_NOSIMD       (0x10)
+#define GF_REGION_NOSSE        (0x10)
+#define GF_REGION_ALTMAP       (0x20)
+#define GF_REGION_CAUCHY       (0x40)
+
+typedef uint32_t gf_region_type_t;
+
+/* These are different ways to implement division.
+   Once again, it's best to use "DEFAULT".  However,
+   there are times when you may want to experiment
+   with the others. */
+
+typedef enum { GF_DIVIDE_DEFAULT,
+               GF_DIVIDE_MATRIX,
+               GF_DIVIDE_EUCLID } gf_division_type_t;
+
+/* We support w=4,8,16,32,64 and 128 with their own data types and
+   operations for multiplication, division, etc.  We also support
+   a "gen" type so that you can do general gf arithmetic for any 
+   value of w from 1 to 32.  You can perform a "region" operation
+   on these if you use "CAUCHY" as the mapping. 
+ */
+
+typedef uint32_t    gf_val_32_t;
+typedef uint64_t    gf_val_64_t;
+typedef uint64_t   *gf_val_128_t;
+
+extern int _gf_errno;
+extern void gf_error();
+
+typedef struct gf *GFP;
+
+typedef union gf_func_a_b {
+    gf_val_32_t  (*w32) (GFP gf, gf_val_32_t a,  gf_val_32_t b);
+    gf_val_64_t  (*w64) (GFP gf, gf_val_64_t a,  gf_val_64_t b);
+    void         (*w128)(GFP gf, gf_val_128_t a, gf_val_128_t b, gf_val_128_t c);
+} gf_func_a_b;
+  
+typedef union {
+  gf_val_32_t  (*w32) (GFP gf, gf_val_32_t a);
+  gf_val_64_t  (*w64) (GFP gf, gf_val_64_t a);
+  void         (*w128)(GFP gf, gf_val_128_t a, gf_val_128_t b);
+} gf_func_a;
+  
+typedef union {
+  void  (*w32) (GFP gf, void *src, void *dest, gf_val_32_t val,  int bytes, int add);
+  void  (*w64) (GFP gf, void *src, void *dest, gf_val_64_t val,  int bytes, int add);
+  void  (*w128)(GFP gf, void *src, void *dest, gf_val_128_t val, int bytes, int add);
+} gf_region;
+
+typedef union {
+  gf_val_32_t  (*w32) (GFP gf, void *start, int bytes, int index);
+  gf_val_64_t  (*w64) (GFP gf, void *start, int bytes, int index);
+  void         (*w128)(GFP gf, void *start, int bytes, int index, gf_val_128_t rv);
+} gf_extract;
+
+typedef struct gf { /* Handle for one Galois field: a set of function pointers plus scratch memory. */
+  gf_func_a_b    multiply;        /* two-operand op; pick the .w32/.w64/.w128 member for the field width */
+  gf_func_a_b    divide;          /* same signature as multiply */
+  gf_func_a      inverse;         /* one-operand op */
+  gf_region      multiply_region; /* (src, dest, val, bytes, add) operation over a byte region */
+  gf_extract     extract_word;    /* (start, bytes, index) accessor into a region */
+  void           *scratch;        /* implementation state; gf_int.h's SET_FUNCTION casts it to gf_internal_t * */
+} gf_t;
+    
+/* Initializes the GF to defaults.  Pass it a pointer to a gf_t.
+   Returns 0 on failure, 1 on success. */
+
+extern int gf_init_easy(GFP gf, int w);
+
+/* Initializes the GF changing the defaults.
+   Returns 0 on failure, 1 on success.
+   Pass it a pointer to a gf_t.
+   For mult_type and divide_type, use a gf_mult_type_t and a gf_division_type_t value, respectively.
+   For region_type, OR together the GF_REGION_xxx's defined above.  
+   Use 0 as prim_poly for defaults.  Otherwise, the leading 1 is optional.
+   Use NULL for scratch_memory to have init_hard allocate memory.  Otherwise,
+   use gf_scratch_size() to determine how big scratch_memory has to be.
+ */
+
+extern int gf_init_hard(GFP gf, 
+                        int w, 
+                        int mult_type, 
+                        int region_type, 
+                        int divide_type, 
+                        uint64_t prim_poly,
+                        int arg1, 
+                        int arg2,
+                        GFP base_gf,
+                        void *scratch_memory);
+
+/* Determines the size for scratch_memory.  
+   Returns 0 on failure and non-zero on success. */
+
+extern int gf_scratch_size(int w, 
+                           int mult_type, 
+                           int region_type, 
+                           int divide_type, 
+                           int arg1, 
+                           int arg2);
+
+/* This reports the gf_scratch_size of a gf_t that has already been created */
+
+extern int gf_size(GFP gf);
+
+/* Frees scratch memory if gf_init_easy/gf_init_hard called malloc.
+   If recursive = 1, then it calls itself recursively on base_gf. */
+
+extern int gf_free(GFP gf, int recursive);
+
+/* This is support for inline single multiplications and divisions.
+   I know it's yucky, but if you've got to be fast, you've got to be fast.
+   We support inlining for w=4, w=8 and w=16.  
+
+   To use inline multiplication and division with w=4 or 8, you should use the 
+   default gf_t, or one with a single table.  Otherwise, gf_w4/8_get_mult_table()
+   will return NULL. Similarly, with w=16, the gf_t must be LOG */
+
+uint8_t *gf_w4_get_mult_table(GFP gf);
+uint8_t *gf_w4_get_div_table(GFP gf);
+
+#define GF_W4_INLINE_MULTDIV(table, a, b) (table[((a)<<4)|(b)])
+
+uint8_t *gf_w8_get_mult_table(GFP gf);
+uint8_t *gf_w8_get_div_table(GFP gf);
+
+#define GF_W8_INLINE_MULTDIV(table, a, b) (table[(((uint32_t) (a))<<8)|(b)])
+
+uint16_t *gf_w16_get_log_table(GFP gf);
+uint16_t *gf_w16_get_mult_alog_table(GFP gf);
+uint16_t *gf_w16_get_div_alog_table(GFP gf);
+
+#define GF_W16_INLINE_MULT(log, alog, a, b) (((a) == 0 || (b) == 0) ? 0 : (alog[(uint32_t)log[a]+(uint32_t)log[b]])) /* 0 if either operand is 0; whole expansion parenthesized so it composes safely inside larger expressions */
+#define GF_W16_INLINE_DIV(log, alog, a, b) (((a) == 0 || (b) == 0) ? 0 : (alog[(int)log[a]-(int)log[b]])) /* 0 if either operand is 0; the alog index may be negative; whole expansion parenthesized */
+#endif
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_cpu.h b/src/erasure-code/jerasure/gf-complete/include/gf_cpu.h
new file mode 100644
index 000000000..71c722706
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_cpu.h
@@ -0,0 +1,20 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_cpu.h
+ *
+ * Identifies whether the CPU supports SIMD instructions at runtime.
+ */
+
+#pragma once
+
+extern int gf_cpu_supports_intel_pclmul;
+extern int gf_cpu_supports_intel_sse4;
+extern int gf_cpu_supports_intel_ssse3;
+extern int gf_cpu_supports_intel_sse3;
+extern int gf_cpu_supports_intel_sse2;
+extern int gf_cpu_supports_arm_neon;
+
+void gf_cpu_identify(void);
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_general.h b/src/erasure-code/jerasure/gf-complete/include/gf_general.h
new file mode 100644
index 000000000..9a5de529d
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_general.h
@@ -0,0 +1,61 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_general.h
+ *
+ * This file has helper routines for doing basic GF operations with any
+ * legal value of w.  The problem is that w <= 32, w=64 and w=128 all have
+ * different data types, which is a pain.  The procedures in this file try
+ * to alleviate that pain.  They are used in gf_unit and gf_time.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "gf_complete.h"
+
+typedef union {
+  uint32_t w32;
+  uint64_t w64;
+  uint64_t w128[2];
+} gf_general_t;
+
+void gf_general_set_zero(gf_general_t *v, int w);
+void gf_general_set_one(gf_general_t *v, int w);
+void gf_general_set_two(gf_general_t *v, int w);
+
+int gf_general_is_zero(gf_general_t *v, int w);
+int gf_general_is_one(gf_general_t *v, int w);
+int gf_general_are_equal(gf_general_t *v1, gf_general_t *v2, int w);
+
+void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex);
+int  gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex);
+
+void gf_general_set_random(gf_general_t *v, int w, int zero_ok);
+
+void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c);
+void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c);
+void gf_general_divide(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c);
+void gf_general_inverse(gf_t *gf, gf_general_t *a, gf_general_t *b);
+
+void gf_general_do_region_multiply(gf_t *gf, gf_general_t *a, 
+                                   void *ra, void *rb, 
+                                   int bytes, int xor);
+
+void gf_general_do_region_check(gf_t *gf, gf_general_t *a, 
+                                void *orig_a, void *orig_target, void *final_target, 
+                                int bytes, int xor);
+
+
+/* The "which" argument is M, D or I, selecting multiply, divide or inverse. */
+
+void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size);
+int  gf_general_do_single_timing_test(gf_t *gf, void *ra, void *rb, int size, char which);
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_int.h b/src/erasure-code/jerasure/gf-complete/include/gf_int.h
new file mode 100644
index 000000000..0356920fd
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_int.h
@@ -0,0 +1,216 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_int.h
+ *
+ * Internal code for Galois field routines.  This is not meant for 
+ * users to include, but for the internal GF files to use. 
+ */
+
+#pragma once
+
+#include "gf_complete.h"
+
+#include <string.h>
+
+extern void     timer_start (double *t);
+extern double   timer_split (const double *t);
+extern void     galois_fill_random (void *buf, int len, unsigned int seed);
+
+typedef struct { /* Internal per-field state; SET_FUNCTION reaches it by casting gf_t.scratch. */
+  int mult_type;      /* NOTE(review): mult_type..arg2 and base_gf presumably mirror the gf_init_hard() arguments of the same names -- confirm in gf.c */
+  int region_type;
+  int divide_type;
+  int w;
+  uint64_t prim_poly;
+  int free_me;        /* NOTE(review): looks like an ownership flag consulted by gf_free() -- confirm */
+  int arg1;
+  int arg2;
+  gf_t *base_gf;
+  void *private;      /* 'private' is a keyword in C++, so this header can only be compiled as C */
+#ifdef DEBUG_FUNCTIONS /* The strings below are filled in by the debug variant of SET_FUNCTION via #func. */
+  const char *multiply;
+  const char *divide;
+  const char *inverse;
+  const char *multiply_region;
+  const char *extract_word;
+#endif /* DEBUG_FUNCTIONS */
+} gf_internal_t;
+
+#ifdef DEBUG_FUNCTIONS /* Debug variant: install func AND record its stringified name in gf_internal_t. */
+#define SET_FUNCTION(gf,method,size,func) \
+  { (gf)->method.size = (func); \
+  ((gf_internal_t*)(gf)->scratch)->method = #func; }
+#else /* Plain variant: assignment only. Both variants expand to a complete statement (a block, or text ending in ';'). */
+#define SET_FUNCTION(gf,method,size,func) \
+  (gf)->method.size = (func);
+#endif /* DEBUG_FUNCTIONS */
+
+extern int gf_w4_init (gf_t *gf);
+extern int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2);
+
+extern int gf_w8_init (gf_t *gf);
+extern int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2);
+
+extern int gf_w16_init (gf_t *gf);
+extern int gf_w16_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2);
+
+extern int gf_w32_init (gf_t *gf);
+extern int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2);
+
+extern int gf_w64_init (gf_t *gf);
+extern int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2);
+
+extern int gf_w128_init (gf_t *gf);
+extern int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2);
+
+extern int gf_wgen_init (gf_t *gf);
+extern int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divide_type, int arg1, int arg2);
+
+void gf_wgen_cauchy_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor);
+gf_val_32_t gf_wgen_extract_word(gf_t *gf, void *start, int bytes, int index);
+
+extern void gf_alignment_error(char *s, int a);
+
+extern uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp);
+
+/* This returns the correct default for prim_poly when base is used as the base
+   field for COMPOSITE.  It returns 0 if we don't have a default prim_poly. */
+
+extern uint64_t gf_composite_get_default_poly(gf_t *base);
+
+/* This structure lets you define a region multiply.  It helps because you can handle
+   unaligned portions of the data with the procedures below, which really cleans
+   up the code. */
+
+typedef struct {
+  gf_t *gf;
+  void *src;
+  void *dest;
+  int bytes;
+  uint64_t val;
+  int xor;
+  int align;           /* The number of bytes to which to align. */
+  void *s_start;       /* The start and the top of the aligned region. */
+  void *d_start;
+  void *s_top;
+  void *d_top;
+} gf_region_data;
+
+/* This lets you set up one of these in one call. It also sets the start/top pointers. */
+
+void gf_set_region_data(gf_region_data *rd,
+                        gf_t *gf,
+                        void *src,
+                        void *dest,
+                        int bytes,
+                        uint64_t val,
+                        int xor,
+                        int align);
+
+/* This performs gf->multiply.w32() on all of the unaligned bytes at the beginning of the region */
+
+extern void gf_do_initial_region_alignment(gf_region_data *rd);
+
+/* This performs gf->multiply.w32() on all of the unaligned bytes at the end of the region */
+
+extern void gf_do_final_region_alignment(gf_region_data *rd);
+
+extern void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base);
+
+extern void gf_multby_zero(void *dest, int bytes, int xor);
+extern void gf_multby_one(void *src, void *dest, int bytes, int xor);
+
+typedef enum {GF_E_MDEFDIV, /* Div != Default && Mult == Default */
+              GF_E_MDEFREG, /* Reg != Default && Mult == Default */
+              GF_E_MDEFARG, /* Args != Default && Mult == Default */
+              GF_E_DIVCOMP, /* Mult == Composite && Div != Default */
+              GF_E_CAUCOMP, /* Mult == Composite && Reg == CAUCHY */
+              GF_E_DOUQUAD, /* Reg == DOUBLE && Reg == QUAD */
+              GF_E_SIMD_NO, /* Reg == SIMD && Reg == NOSIMD */
+              GF_E_CAUCHYB, /* Reg == CAUCHY && Other Reg */
+              GF_E_CAUGT32, /* Reg == CAUCHY && w > 32 */
+              GF_E_ARG1SET, /* Arg1 != 0 && Mult \notin COMPOSITE/SPLIT/GROUP */
+              GF_E_ARG2SET, /* Arg2 != 0 && Mult \notin SPLIT/GROUP */
+              GF_E_MATRIXW, /* Div == MATRIX && w > 32 */
+              GF_E_BAD___W, /* Illegal w */
+              GF_E_DOUBLET, /* Reg == DOUBLE && Mult != TABLE */
+              GF_E_DOUBLEW, /* Reg == DOUBLE && w \notin {4,8} */
+              GF_E_DOUBLEJ, /* Reg == DOUBLE && other Reg */
+              GF_E_DOUBLEL, /* Reg == DOUBLE & LAZY but w = 4 */
+              GF_E_QUAD__T, /* Reg == QUAD && Mult != TABLE */
+              GF_E_QUAD__W, /* Reg == QUAD && w != 4 */
+              GF_E_QUAD__J, /* Reg == QUAD && other Reg */
+              GF_E_LAZY__X, /* Reg == LAZY && not DOUBLE or QUAD */
+              GF_E_ALTSHIF, /* Mult == Shift && Reg == ALTMAP */
+              GF_E_SSESHIF, /* Mult == Shift && Reg == SIMD|NOSIMD */
+              GF_E_ALT_CFM, /* Mult == CARRY_FREE && Reg == ALTMAP */
+              GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SIMD|NOSIMD */
+              GF_E_PCLMULX, /* Mult == Carry_Free && No PCLMUL */
+              GF_E_ALT_BY2, /* Mult == Bytwo_x && Reg == ALTMAP */
+              GF_E_BY2_SSE, /* Mult == Bytwo_x && Reg == SSE && No SSE2 */
+              GF_E_LOGBADW, /* Mult == LOGx, w too big */
+              GF_E_LOG___J, /* Mult == LOGx, && Reg == SSE|ALTMAP|NOSSE */
+              GF_E_ZERBADW, /* Mult == LOG_ZERO, w \notin {8,16} */
+              GF_E_ZEXBADW, /* Mult == LOG_ZERO_EXT, w != 8 */
+              GF_E_LOGPOLY, /* Mult == LOG & poly not primitive */
+              GF_E_GR_ARGX, /* Mult == GROUP, Bad arg1/2 */
+              GF_E_GR_W_48, /* Mult == GROUP, w \in { 4, 8 } */
+              GF_E_GR_W_16, /* Mult == GROUP, w == 16, arg1 != 4 || arg2 != 4 */
+              GF_E_GR_128A, /* Mult == GROUP, w == 128, bad args */
+              GF_E_GR_A_27, /* Mult == GROUP, either arg > 27 */
+              GF_E_GR_AR_W, /* Mult == GROUP, either arg > w  */
+              GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */
+              GF_E_TABLE_W, /* Mult == TABLE, w too big */
+              GF_E_TAB_SSE, /* Mult == TABLE, SIMD|NOSIMD only apply to w == 4 */
+              GF_E_TABSSE3, /* Mult == TABLE, Need SSSE3 for SSE */
+              GF_E_TAB_ALT, /* Mult == TABLE, Reg == ALTMAP */
+              GF_E_SP128AR, /* Mult == SPLIT, w=128, Bad arg1/arg2 */
+              GF_E_SP128AL, /* Mult == SPLIT, w=128, SSE requires ALTMAP */
+              GF_E_SP128AS, /* Mult == SPLIT, w=128, ALTMAP requires SSE */
+              GF_E_SP128_A, /* Mult == SPLIT, w=128, ALTMAP only with 4/128 */
+              GF_E_SP128_S, /* Mult == SPLIT, w=128, SSE only with 4/128 */
+              GF_E_SPLIT_W, /* Mult == SPLIT, Bad w (8, 16, 32, 64, 128)  */
+              GF_E_SP_16AR, /* Mult == SPLIT, w=16, Bad arg1/arg2 */
+              GF_E_SP_16_A, /* Mult == SPLIT, w=16, ALTMAP only with 4/16 */
+              GF_E_SP_16_S, /* Mult == SPLIT, w=16, SSE only with 4/16 */
+              GF_E_SP_32AR, /* Mult == SPLIT, w=32, Bad arg1/arg2 */
+              GF_E_SP_32AS, /* Mult == SPLIT, w=32, ALTMAP requires SSE */
+              GF_E_SP_32_A, /* Mult == SPLIT, w=32, ALTMAP only with 4/32 */
+              GF_E_SP_32_S, /* Mult == SPLIT, w=32, SSE only with 4/32 */
+              GF_E_SP_64AR, /* Mult == SPLIT, w=64, Bad arg1/arg2 */
+              GF_E_SP_64AS, /* Mult == SPLIT, w=64, ALTMAP requires SSE */
+              GF_E_SP_64_A, /* Mult == SPLIT, w=64, ALTMAP only with 4/64 */
+              GF_E_SP_64_S, /* Mult == SPLIT, w=64, SSE only with 4/64 */
+              GF_E_SP_8_AR, /* Mult == SPLIT, w=8, Bad arg1/arg2 */
+              GF_E_SP_8__A, /* Mult == SPLIT, w=8, no ALTMAP */
+              GF_E_SP_SSE3, /* Mult == SPLIT, Need SSSE3 for SSE */
+              GF_E_COMP_A2, /* Mult == COMP, arg1 must be = 2 */
+              GF_E_COMP_SS, /* Mult == COMP, SIMD|NOSIMD */
+              GF_E_COMP__W, /* Mult == COMP, Bad w. */
+              GF_E_UNKFLAG, /* Unknown flag in create_from.... */
+              GF_E_UNKNOWN, /* Unknown mult_type. */
+              GF_E_UNK_REG, /* Unknown region_type. */
+              GF_E_UNK_DIV, /* Unknown divide_type. */
+              GF_E_CFM___W, /* Mult == CFM,  Bad w. */
+              GF_E_CFM4POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CFM8POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CF16POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CF32POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_CF64POL, /* Mult == CFM & Prim Poly has high bits set. */
+              GF_E_FEWARGS, /* Too few args in argc/argv. */
+              GF_E_BADPOLY, /* Bad primitive polynomial -- too many bits set. */
+              GF_E_COMP_PP, /* Bad primitive polynomial -- bigger than sub-field. */
+              GF_E_COMPXPP, /* Can't derive a default pp for composite field. */
+              GF_E_BASE__W, /* Composite -- Base field is the wrong size. */
+              GF_E_TWOMULT, /* In create_from... two -m's. */
+              GF_E_TWO_DIV, /* In create_from... two -d's. */
+              GF_E_POLYSPC, /* Bad number after -p. */
+              GF_E_SPLITAR, /* Ran out of arguments in SPLIT */
+              GF_E_SPLITNU, /* Arguments not integers in SPLIT. */
+              GF_E_GROUPAR, /* Ran out of arguments in GROUP */
+              GF_E_GROUPNU, /* Arguments not integers in GROUP. */
+              GF_E_DEFAULT } gf_error_type_t;
+
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_method.h b/src/erasure-code/jerasure/gf-complete/include/gf_method.h
new file mode 100644
index 000000000..880b34967
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_method.h
@@ -0,0 +1,20 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_method.h
+ *
+ * Parses argv to figure out the flags and arguments.  Creates the gf.
+ */
+
+#pragma once
+
+#include "gf_complete.h"
+
+/* Parses argv starting at "starting".  
+   
+   Returns 0 on failure.
+   On success, it returns one past the last argument it read in argv. */
+
+extern int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting);
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_rand.h b/src/erasure-code/jerasure/gf-complete/include/gf_rand.h
new file mode 100644
index 000000000..24294adc7
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_rand.h
@@ -0,0 +1,22 @@
+/* 
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_rand.h
+ *
+ * Random number generation, using the "Mother of All" random number generator.  */
+
+#pragma once
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* These are all pretty self-explanatory */
+uint32_t MOA_Random_32();
+uint64_t MOA_Random_64();
+void     MOA_Random_128(uint64_t *x);
+uint32_t MOA_Random_W(int w, int zero_ok);
+void MOA_Fill_Random_Region (void *reg, int size);   /* reg should be aligned to 4 bytes, but
+                                                        size can be anything. */
+void     MOA_Seed(uint32_t seed);
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_w16.h b/src/erasure-code/jerasure/gf-complete/include/gf_w16.h
new file mode 100644
index 000000000..fb4c0e98f
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_w16.h
@@ -0,0 +1,66 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w16.h
+ *
+ * Defines and data structures for 16-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W16_H
+#define GF_COMPLETE_GF_W16_H
+
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH (16) /* w for this header */
+#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH) /* 65536 */
+#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1 /* NOTE(review): expands without outer parentheses (gf_w4.h parenthesizes its version), so -GF_MULT_GROUP_SIZE means (-GF_FIELD_SIZE)-1; audit every use before parenthesizing */
+
+#define GF_BASE_FIELD_WIDTH (8) /* half of GF_FIELD_WIDTH */
+#define GF_BASE_FIELD_SIZE       (1 << GF_BASE_FIELD_WIDTH) /* 256 */
+
+struct gf_w16_logtable_data {
+    uint16_t      log_tbl[GF_FIELD_SIZE];
+    uint16_t      antilog_tbl[GF_FIELD_SIZE * 2];
+    uint16_t      inv_tbl[GF_FIELD_SIZE];
+    uint16_t      *d_antilog;
+};
+
+struct gf_w16_zero_logtable_data {
+    int           log_tbl[GF_FIELD_SIZE];
+    uint16_t      _antilog_tbl[GF_FIELD_SIZE * 4];
+    uint16_t      *antilog_tbl;
+    uint16_t      inv_tbl[GF_FIELD_SIZE];
+};
+
+struct gf_w16_lazytable_data {
+    uint16_t      log_tbl[GF_FIELD_SIZE];
+    uint16_t      antilog_tbl[GF_FIELD_SIZE * 2];
+    uint16_t      inv_tbl[GF_FIELD_SIZE];
+    uint16_t      *d_antilog;
+    uint16_t      lazytable[GF_FIELD_SIZE];
+};
+
+struct gf_w16_bytwo_data {
+    uint64_t prim_poly;
+    uint64_t mask1;
+    uint64_t mask2;
+};
+
+struct gf_w16_split_8_8_data {
+    uint16_t      tables[3][256][256];
+};
+
+struct gf_w16_group_4_4_data {
+    uint16_t reduce[16];
+    uint16_t shift[16];
+};
+
+struct gf_w16_composite_data {
+  uint8_t *mult_table;
+};
+
+void gf_w16_neon_split_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W16_H */
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_w32.h b/src/erasure-code/jerasure/gf-complete/include/gf_w32.h
new file mode 100644
index 000000000..7734f30ff
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_w32.h
@@ -0,0 +1,71 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w32.h
+ *
+ * Defines and data structures for 32-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W32_H
+#define GF_COMPLETE_GF_W32_H
+
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH (32) /* w for this header */
+#define GF_FIRST_BIT ((gf_val_32_t)1 << 31) /* most significant bit of a 32-bit element */
+
+#define GF_BASE_FIELD_WIDTH (16) /* half of GF_FIELD_WIDTH */
+#define GF_BASE_FIELD_SIZE       (1 << GF_BASE_FIELD_WIDTH) /* 65536 */
+#define GF_BASE_FIELD_GROUP_SIZE  GF_BASE_FIELD_SIZE-1 /* NOTE(review): expands without outer parentheses; audit every use before parenthesizing */
+#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1) /* requires a variable h with a prim_poly member in scope at the expansion site; evaluates p more than once */
+
+struct gf_split_2_32_lazy_data {
+    uint32_t      tables[16][4];
+    uint32_t      last_value;
+};
+
+struct gf_w32_split_8_8_data {
+    uint32_t      tables[7][256][256];
+    uint32_t      region_tables[4][256];
+    uint32_t      last_value;
+};
+
+struct gf_w32_group_data {
+    uint32_t *reduce;
+    uint32_t *shift;
+    int      tshift;
+    uint64_t rmask;
+    uint32_t *memory;
+};
+
+struct gf_split_16_32_lazy_data {
+    uint32_t      tables[2][(1<<16)];
+    uint32_t      last_value;
+};
+
+struct gf_split_8_32_lazy_data {
+    uint32_t      tables[4][256];
+    uint32_t      last_value;
+};
+
+struct gf_split_4_32_lazy_data {
+    uint32_t      tables[8][16];
+    uint32_t      last_value;
+};
+
+struct gf_w32_bytwo_data {
+    uint64_t prim_poly;
+    uint64_t mask1;
+    uint64_t mask2;
+};
+
+struct gf_w32_composite_data {
+  uint16_t *log;
+  uint16_t *alog;
+};
+
+void gf_w32_neon_split_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W32_H */
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_w4.h b/src/erasure-code/jerasure/gf-complete/include/gf_w4.h
new file mode 100644
index 000000000..8ee94a339
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_w4.h
@@ -0,0 +1,63 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w4.h
+ *
+ * Defines and data structures for 4-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W4_H
+#define GF_COMPLETE_GF_W4_H
+
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH      4
+#define GF_DOUBLE_WIDTH     (GF_FIELD_WIDTH*2)
+#define GF_FIELD_SIZE       (1 << GF_FIELD_WIDTH)
+#define GF_MULT_GROUP_SIZE       (GF_FIELD_SIZE-1)
+
+/* ------------------------------------------------------------
+   JSP: Each implementation has its own data, which is allocated
+   at one time as part of the handle. For that reason, it
+   shouldn't be hierarchical -- i.e. one should be able to
+   allocate it with one call to malloc. */
+
+struct gf_logtable_data {
+    uint8_t      log_tbl[GF_FIELD_SIZE];
+    uint8_t      antilog_tbl[GF_FIELD_SIZE * 2];
+    uint8_t      *antilog_tbl_div;
+};
+
+struct gf_single_table_data {
+    uint8_t      mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+};
+
+struct gf_double_table_data {
+    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint8_t      mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
+};
+struct gf_quad_table_data {
+    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint16_t     mult[GF_FIELD_SIZE][(1<<16)];
+};
+
+struct gf_quad_table_lazy_data {
+    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint8_t      smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint16_t     mult[(1 << 16)];
+};
+
+struct gf_bytwo_data {
+    uint64_t prim_poly;
+    uint64_t mask1;
+    uint64_t mask2;
+};
+
+// ARM NEON init functions
+int gf_w4_neon_cfm_init(gf_t *gf);
+void gf_w4_neon_single_table_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W4_H */
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_w64.h b/src/erasure-code/jerasure/gf-complete/include/gf_w64.h
new file mode 100644
index 000000000..9a74a8125
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_w64.h
@@ -0,0 +1,50 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w64.h
+ *
+ * Defines and data structures for 64-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W64_H
+#define GF_COMPLETE_GF_W64_H
+
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH (64) /* w for this header */
+#define GF_FIRST_BIT (1ULL << 63) /* most significant bit of a 64-bit element */
+
+#define GF_BASE_FIELD_WIDTH (32) /* half of GF_FIELD_WIDTH */
+#define GF_BASE_FIELD_SIZE       (1ULL << GF_BASE_FIELD_WIDTH) /* 2^32; needs the ULL literal to avoid overflowing int */
+#define GF_BASE_FIELD_GROUP_SIZE  GF_BASE_FIELD_SIZE-1 /* NOTE(review): expands without outer parentheses; audit every use before parenthesizing */
+
+struct gf_w64_group_data {
+    uint64_t *reduce;
+    uint64_t *shift;
+    uint64_t *memory;
+};
+
+struct gf_split_4_64_lazy_data {
+    uint64_t      tables[16][16];
+    uint64_t      last_value;
+};
+
+struct gf_split_8_64_lazy_data {
+    uint64_t      tables[8][(1<<8)];
+    uint64_t      last_value;
+};
+
+struct gf_split_16_64_lazy_data {
+    uint64_t      tables[4][(1<<16)];
+    uint64_t      last_value;
+};
+
+struct gf_split_8_8_data {
+    uint64_t      tables[15][256][256];
+};
+
+void gf_w64_neon_split_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W64_H */
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_w8.h b/src/erasure-code/jerasure/gf-complete/include/gf_w8.h
new file mode 100644
index 000000000..938fcfdf1
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_w8.h
@@ -0,0 +1,99 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w8.h
+ *
+ * Defines and data structures for 8-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W8_H
+#define GF_COMPLETE_GF_W8_H
+
+#include "gf_int.h"
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH (8) /* w for this header */
+#define GF_FIELD_SIZE       (1 << GF_FIELD_WIDTH) /* 256 */
+#define GF_HALF_SIZE       (1 << (GF_FIELD_WIDTH/2)) /* 16 */
+#define GF_MULT_GROUP_SIZE       GF_FIELD_SIZE-1 /* NOTE(review): expands without outer parentheses (gf_w4.h parenthesizes its version); audit every use before parenthesizing */
+
+#define GF_BASE_FIELD_WIDTH (4) /* half of GF_FIELD_WIDTH */
+#define GF_BASE_FIELD_SIZE       (1 << GF_BASE_FIELD_WIDTH) /* 16 */
+
+struct gf_w8_logtable_data {
+    uint8_t         log_tbl[GF_FIELD_SIZE];
+    uint8_t         antilog_tbl[GF_FIELD_SIZE * 2];
+    uint8_t         inv_tbl[GF_FIELD_SIZE];
+};
+
+struct gf_w8_logzero_table_data {
+    short           log_tbl[GF_FIELD_SIZE];  /* Make this signed, so that we can divide easily */
+    uint8_t         antilog_tbl[512+512+1];
+    uint8_t         *div_tbl;
+    uint8_t         *inv_tbl;
+};
+
+struct gf_w8_logzero_small_table_data {
+    short           log_tbl[GF_FIELD_SIZE];  /* Make this signed, so that we can divide easily */
+    uint8_t         antilog_tbl[255*3];
+    uint8_t         inv_tbl[GF_FIELD_SIZE];
+    uint8_t         *div_tbl;
+};
+
+struct gf_w8_composite_data {
+  uint8_t *mult_table;
+};
+
+/* Don't change the order of these relative to gf_w8_half_table_data */
+
+struct gf_w8_default_data {
+  uint8_t     high[GF_FIELD_SIZE][GF_HALF_SIZE];
+  uint8_t     low[GF_FIELD_SIZE][GF_HALF_SIZE];
+  uint8_t     divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+  uint8_t     multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+};
+
+struct gf_w8_half_table_data {
+  uint8_t     high[GF_FIELD_SIZE][GF_HALF_SIZE];
+  uint8_t     low[GF_FIELD_SIZE][GF_HALF_SIZE];
+};
+
+struct gf_w8_single_table_data {
+  uint8_t     divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+  uint8_t     multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+};
+
+struct gf_w8_double_table_data {
+    uint8_t         div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint16_t        mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
+};
+
+struct gf_w8_double_table_lazy_data {
+    uint8_t         div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint8_t         smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
+    uint16_t        mult[GF_FIELD_SIZE*GF_FIELD_SIZE];
+};
+
+struct gf_w4_logtable_data {
+    uint8_t         log_tbl[GF_BASE_FIELD_SIZE];
+    uint8_t         antilog_tbl[GF_BASE_FIELD_SIZE * 2];
+    uint8_t         *antilog_tbl_div;
+};
+
+struct gf_w4_single_table_data {
+    uint8_t         div[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
+    uint8_t         mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
+};
+
+struct gf_w8_bytwo_data {
+    uint64_t prim_poly;
+    uint64_t mask1;
+    uint64_t mask2;
+};
+
+int gf_w8_neon_cfm_init(gf_t *gf);
+void gf_w8_neon_split_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W8_H */
diff --git a/src/erasure-code/jerasure/gf-complete/m4/ax_check_compile_flag.m4 b/src/erasure-code/jerasure/gf-complete/m4/ax_check_compile_flag.m4
new file mode 100644
index 000000000..c3a8d695a
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/m4/ax_check_compile_flag.m4
@@ -0,0 +1,72 @@
+# ===========================================================================
+#   http://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS])
+#
+# DESCRIPTION
+#
+#   Check whether the given FLAG works with the current language's compiler
+#   or gives an error.  (Warnings, however, are ignored)
+#
+#   ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on
+#   success/failure.
+#
+#   If EXTRA-FLAGS is defined, it is added to the current language's default
+#   flags (e.g. CFLAGS) when the check is done.  The check is thus made with
+#   the flags: "CFLAGS EXTRA-FLAGS FLAG".  This can for example be used to
+#   force the compiler to issue an error when a bad flag is given.
+#
+#   NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this
+#   macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Guido U. Draheim <guidod@gmx.de>
+#   Copyright (c) 2011 Maarten Bosmans <mkbosmans@gmail.com>
+#
+#   This program is free software: you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation, either version 3 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 2
+
+AC_DEFUN([AX_CHECK_COMPILE_FLAG],
+[AC_PREREQ(2.59)dnl for _AC_LANG_PREFIX
+AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl cache variable name is built from the language abbreviation, EXTRA-FLAGS and FLAG
+AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [
+  ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
+  _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1"
+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],
+    [AS_VAR_SET(CACHEVAR,[yes])],
+    [AS_VAR_SET(CACHEVAR,[no])])
+  _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])
+AS_IF([test x"AS_VAR_GET(CACHEVAR)" = xyes],
+  [m4_default([$2], :)],
+  [m4_default([$3], :)])
+AS_VAR_POPDEF([CACHEVAR])dnl drop the CACHEVAR alias pushed at the top of this macro
+])dnl AX_CHECK_COMPILE_FLAG
diff --git a/src/erasure-code/jerasure/gf-complete/m4/ax_ext.m4 b/src/erasure-code/jerasure/gf-complete/m4/ax_ext.m4
new file mode 100644
index 000000000..95c4dbe23
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/m4/ax_ext.m4
@@ -0,0 +1,75 @@
+#
+# This macro is based on http://www.gnu.org/software/autoconf-archive/ax_ext.html
+# but simplified to do compile time SIMD checks only
+#
+
+AC_DEFUN([AX_EXT],
+[
+  AC_REQUIRE([AC_CANONICAL_HOST])
+
+  case $host_cpu in
+    aarch64*)  # 64-bit ARM: the arch define and -DARCH_AARCH64 are added unconditionally, NEON only if the flag compiles
+      AC_DEFINE(HAVE_ARCH_AARCH64,,[targeting AArch64])
+      SIMD_FLAGS="$SIMD_FLAGS -DARCH_AARCH64"
+
+      AC_CACHE_CHECK([whether NEON is enabled], [ax_cv_have_neon_ext], [ax_cv_have_neon_ext=yes])
+      if test "$ax_cv_have_neon_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd, [SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd -DARM_NEON"], [ax_cv_have_neon_ext=no])
+      fi
+      ;;
+
+    arm*)  # other ARM hosts: NEON is probed with -mfpu=neon
+      AC_CACHE_CHECK([whether NEON is enabled], [ax_cv_have_neon_ext], [ax_cv_have_neon_ext=yes])
+      if test "$ax_cv_have_neon_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-mfpu=neon, [SIMD_FLAGS="$SIMD_FLAGS -mfpu=neon -DARM_NEON"], [ax_cv_have_neon_ext=no])
+      fi
+      ;;
+
+    powerpc*)  # AltiVec: only the compiler flag is appended, no -D define
+      AC_CACHE_CHECK([whether altivec is enabled], [ax_cv_have_altivec_ext], [ax_cv_have_altivec_ext=yes])
+      if test "$ax_cv_have_altivec_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-faltivec, [SIMD_FLAGS="$SIMD_FLAGS -faltivec"], [ax_cv_have_altivec_ext=no])
+      fi
+      ;;
+
+    i[[3456]]86*|x86_64*|amd64*)  # x86: every extension below has its own cache variable and its own compile probe
+
+      AC_CACHE_CHECK([whether sse is enabled], [ax_cv_have_sse_ext], [ax_cv_have_sse_ext=yes])
+      if test "$ax_cv_have_sse_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-msse, [SIMD_FLAGS="$SIMD_FLAGS -msse -DINTEL_SSE"], [ax_cv_have_sse_ext=no])
+      fi
+
+      AC_CACHE_CHECK([whether sse2 is enabled], [ax_cv_have_sse2_ext], [ax_cv_have_sse2_ext=yes])
+      if test "$ax_cv_have_sse2_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-msse2, [SIMD_FLAGS="$SIMD_FLAGS -msse2 -DINTEL_SSE2"], [ax_cv_have_sse2_ext=no])
+      fi
+
+      AC_CACHE_CHECK([whether sse3 is enabled], [ax_cv_have_sse3_ext], [ax_cv_have_sse3_ext=yes])
+      if test "$ax_cv_have_sse3_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-msse3, [SIMD_FLAGS="$SIMD_FLAGS -msse3 -DINTEL_SSE3"], [ax_cv_have_sse3_ext=no])
+      fi
+
+      AC_CACHE_CHECK([whether ssse3 is enabled], [ax_cv_have_ssse3_ext], [ax_cv_have_ssse3_ext=yes])
+      if test "$ax_cv_have_ssse3_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-mssse3, [SIMD_FLAGS="$SIMD_FLAGS -mssse3 -DINTEL_SSSE3"], [ax_cv_have_ssse3_ext=no])
+      fi
+
+      AC_CACHE_CHECK([whether pclmuldq is enabled], [ax_cv_have_pclmuldq_ext], [ax_cv_have_pclmuldq_ext=yes])
+      if test "$ax_cv_have_pclmuldq_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-mpclmul, [SIMD_FLAGS="$SIMD_FLAGS -mpclmul -DINTEL_SSE4_PCLMUL"], [ax_cv_have_pclmuldq_ext=no])
+      fi
+
+      AC_CACHE_CHECK([whether sse4.1 is enabled], [ax_cv_have_sse41_ext], [ax_cv_have_sse41_ext=yes])
+      if test "$ax_cv_have_sse41_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-msse4.1, [SIMD_FLAGS="$SIMD_FLAGS -msse4.1 -DINTEL_SSE4"], [ax_cv_have_sse41_ext=no])
+      fi
+
+      AC_CACHE_CHECK([whether sse4.2 is enabled], [ax_cv_have_sse42_ext], [ax_cv_have_sse42_ext=yes])
+      if test "$ax_cv_have_sse42_ext" = yes; then  # note: adds the same -DINTEL_SSE4 define as the sse4.1 probe
+        AX_CHECK_COMPILE_FLAG(-msse4.2, [SIMD_FLAGS="$SIMD_FLAGS -msse4.2 -DINTEL_SSE4"], [ax_cv_have_sse42_ext=no])
+      fi
+      ;;
+  esac  # no default arm: any other host CPU leaves SIMD_FLAGS as it was
+
+  AC_SUBST(SIMD_FLAGS)
+])
diff --git a/src/erasure-code/jerasure/gf-complete/manual/gf-complete.html b/src/erasure-code/jerasure/gf-complete/manual/gf-complete.html
new file mode 100644
index 000000000..ed79e2576
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/manual/gf-complete.html
@@ -0,0 +1,3484 @@
+<html>
+
+<head>
+
+<link rel="stylesheet" type="text/css" href="style.css">
+
+</head>
+
+<body>
+
+<div id="box">
+
+<h1>
+GF-Complete: A Comprehensive Open Source Library for Galois <br>
+Field Arithmetic
+</h1>
+
+<h1> Version 1.02  </h1>
+
+<h4>James S. Plank* &nbsp&nbsp&nbsp&nbsp&nbsp&nbsp Ethan L. Miller 
+Kevin M. Greenan &nbsp&nbsp&nbsp&nbsp&nbsp&nbsp Benjamin A. Arnold<br>
+John A. Burnum &nbsp&nbsp&nbsp&nbsp&nbsp&nbsp Adam W. Disney  &nbsp&nbsp&nbsp&nbsp&nbsp&nbsp
+Allen C. McBride
+
+</h4> <br>
+
+
+
+<a href="https://bitbucket.org/jimplank/gf-complete">
+
+https://bitbucket.org/jimplank/gf-complete
+
+ </a><br><br>
+<a href="http://web.eecs.utk.edu/~plank/plank/papers/GF-Complete-Manual-1.02.pdf"> 
+http://web.eecs.utk.edu/~plank/plank/papers/GF-Complete-Manual-1.02.pdf
+
+
+ </a> <br> <br> 
+
+
+
+
+
+
+
+</div>
+
+
+<div id="pages_paragraphs_2">
+
+This is a user's manual for GF-Complete, version 1.02. This release supersedes version 0.1 and represents the first
+major release of GF-Complete. To our knowledge, this library implements every Galois Field multiplication technique
+applicable to erasure coding for storage, which is why we named it GF-Complete. The primary goal of this library is
+to allow storage system researchers and implementors to utilize very fast Galois Field arithmetic for Reed-Solomon
+coding and the like in their storage installations. The secondary goal is to allow those who want to explore different
+ways to perform Galois Field arithmetic to be able to do so effectively.
+
+
+<p>
+If you wish to cite GF-Complete, please cite technical report UT-CS-13-716: [PMG<sup>+</sup>13].
+
+</p>
+
+
+<h2>If You Use This Library or Document </h2>
+
+
+
+Please send me an email to let me know how it goes. Or send me an email just to let me know you are using the
+library. One of the ways in which we are evaluated both internally and externally is by the impact of our work, and if
+you have found this library and/or this document useful, we would like to be able to document it. Please send mail to
+<em>plank@cs.utk.edu.</em> Please send bug reports to that address as well.
+
+
+
+<p>
+The library itself is protected by the New BSD License. It is free to use and modify within the bounds of this
+license. To the authors' knowledge, none of the techniques implemented in this library have been patented, and the
+authors are not pursuing patents. </p> <br>
+
+ </div>
+<div id="footer"> 
+ 
+<span id="footer_bar">&nbsp&nbsp&nbsp&nbsp.*plank@cs.utk.edu (University of Tennessee),  </span> <em>elm@cs.ucsc.edu </em>(UC Santa Cruz), <em>kmgreen2@gmail.com </em> (Box). This material
+is based upon work supported by the National Science Foundation under grants CNS-0917396, IIP-0934401 and CSR-1016636, plus REU supplements
+CNS-1034216, CSR-1128847 and CSR-1246277. Thanks to Jens Gregor for helping us wade through compilation issues, and to Will
+Houston for his initial work on this library.
+
+</div>
+
+<b>Finding the Code </b>
+<br><br>
+This code is actively maintained on bitbucket: <a href="https://bitbucket.org/jimplank/gf-complete">https://bitbucket.org/jimplank/gf-complete</a>. There are
+previous versions on my UTK site as a technical report; however, that is too hard to maintain, so the main version is
+on bitbucket.<br><br>
+
+
+<b>Two Related Papers </b> <br><br>
+
+This software accompanies a large paper that describes these implementation techniques in detail [PGM13a]. We
+will refer to this as <em> "The Paper." </em> You do not have to read The Paper to use the software. However, if you want to
+start exploring the various implementations, then The Paper is where you'll want to go to learn about the techniques
+in detail.
+
+
+
+<p>This library implements the techniques described in the paper "Screaming Fast Galois Field Arithmetic Using Intel
+SIMD Instructions," [PGM13b]. The Paper describes all of those techniques as well.
+</p><br><br>
+
+<b>If You Would Like Help With the Software </b><br><br>
+
+Please contact the first author of this manual.<br><br>
+
+<b>Changes from Revision 1.01</b>
+<br><br>
+The major change is that we are using autoconf to aid with compilation, thus obviating the need for the old <b>flag_tester</b>
+code. Additionally, we have added a quick timing tool, and we have modified <b>gf_methods</b> so that it may be used to
+run the timing tool and the unit tester.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+<br/>
+CONTENT  <span class="aligning_page_number"> 3 </span> 
+<h2>Contents </h2>
+<div class="index">
+1 <span class="aligning_numbers">Introduction </span> <span class="aligning_page_number"> 5 </span>
+  <br><br> 
+2 <span class="aligning_numbers">Files in the Library </span>	<span class="aligning_page_number"> 6  </span>  <br> </div>
+
+<div class="sub_indices">
+2.1 Header files in the directory <b>"include"</b>  . .. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <span class="aligning_page_number"> 6 </span>  <br>
+2.2 Source files in the <b>"src"</b> directory . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . .<span class="aligning_page_number">   7  </span> <br>
+2.3 Library tools files in the <b>"tools"</b> directory  . . . . . . . . . . ..  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . .   <span class="aligning_page_number"> 7   </span> <br>
+2.4 The unit tester in the <b>"test"</b> directory. . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . <span class="aligning_page_number">  8  </span>  <br>
+2.5 Example programs in the <b>"examples"</b> directory . . . .  . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . .<span class="aligning_page_number"> 8  </span> 
+
+</div>
+<br>
+<div class="index">
+
+3 <span class="aligning_numbers">Compilation </span><span class="aligning_page_number">  8 </span>  <br> <br>
+4 <span class="aligning_numbers">Some Tools and Examples to Get You Started </span><span class="aligning_page_number">  8 </span> <br><br>  </div> 
+
+
+
+<div class="sub_indices">
+4.1 Three Simple Command Line Tools: gf_mult, gf_div and gf_add . . . . . . . . . .  . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <span class="aligning_page_number"> 8</span>  <br>
+4.2 Quick Starting Example #1: Simple multiplication and division . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . <span class="aligning_page_number">   9  </span> <br>
+4.3 Quick Starting Example #2: Multiplying a region by a constant    . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . .  <span class="aligning_page_number"> 10   </span> <br>
+4.4 Quick Starting Example #3: Using w = 64 . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . <span class="aligning_page_number">  11  </span>  <br>
+4.5 Quick Starting Example #4: Using w = 128. . . .  . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . . . . <span class="aligning_page_number"> 11  </span> 
+</div>
+<br>
+
+
+<div class="index">
+5 <span class="aligning_numbers"> Important Information on Alignment when Multiplying Regions </span><span class="aligning_page_number"> 12</span> <br><br>
+
+6 <span class="aligning_numbers"> The Defaults</span><span class="aligning_page_number"> 13 </span> <br>
+
+</div>
+
+<div class="sub_indices">
+6.1 Changing the Defaults . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . .<span class="aligning_page_number">   14  </span> <br>
+
+
+<ul style="list-style-type:none;">
+<li>6.1.1 Changing the Components of a Galois Field with <b> create_gf_from_argv() </b>   . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . .  <span class="aligning_page_number"> 15   </span> <br>
+</li>
+<li>
+6.1.2 Changing the Polynomial. . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . . . . . . . <span class="aligning_page_number">  16  </span>  <br>
+</li>
+<li>
+6.1.3 Changing the Multiplication Technique. . . .  . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . .<span class="aligning_page_number"> 17  </span> 
+</li>
+
+
+<li>
+6.1.4 Changing the Division Technique . . . . . .  . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  <span class="aligning_page_number"> 19  </span> 
+</li>
+
+
+<li>
+6.1.5 Changing the Region Technique. . . .  . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . . .  . . . ..<span class="aligning_page_number"> 19  </span> 
+</li>
+</ul>
+6.2 Determining Supported Techniques with <b>gf_methods</b> . . . . . . . . . .  . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <span class="aligning_page_number"> 20</span>  <br>
+
+6.3 Testing with <b>gf_unit, gf_time,</b> and <b>time_tool.sh </b>. . . . . . . . . .  . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . .  . . <span class="aligning_page_number"> 21</span>
+
+<ul style="list-style-type:none;">
+<li>
+6.3.1 <b>time_tool.sh</b> . . . . . .  . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .. . . . . . . .  <span class="aligning_page_number"> 22 </span> 
+</li>
+
+<li>
+6.3.2 An example of <b>gf_methods</b> and <b>time_tool.sh</b> . . . . . .  . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . .. . . . . . . .  . .. .  .<span class="aligning_page_number"> 23  </span> 
+</li>
+
+</ul>
+
+6.4 Calling <b>gf_init_hard()</b> . . . . . . . . . .  . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . .. . . . . . . . .. . . . . . . . .. . . . . . . .  .. . . . . . . .  . . .  <span class="aligning_page_number"> 24</span>  <br>
+
+6.5 <b>gf_size()</b> . . . . . . . . . .  . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . .. . . . . . . .  .. . . . . . . . .. . . . . . . .  .. . . . . . . . .. . . . . . . . . . ..  .  <span class="aligning_page_number"> 26</span>  <br><br>
+</div>
+
+
+<div class="index">
+7 <span class="aligning_numbers">  Further Information on Options and Algorithms </span><span class="aligning_page_number">   26 </span> </div> <br><br> </div>
+<div class="sub_indices">
+7.1 Inlining Single Multiplication and Division for Speed   . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . <span class="aligning_page_number"> 26 </span> <br>
+7.2 Using different techniques for single and region multiplication . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  .  <span class="aligning_page_number"> 27 </span> <br>
+7.3 General <em>w</em> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . .  . <span class="aligning_page_number"> 28  </span><br>
+
+7.4 Arguments to <b>"SPLIT"</b> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . <span class="aligning_page_number"> 28</span>  <br>
+7.5 Arguments to <b>"GROUP"</b> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  <span class="aligning_page_number">29 </span> <br>
+7.6 Considerations with <b>"COMPOSITE"</b> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . .  .  <span class="aligning_page_number">30 </span> <br>
+7.7 <b>"CARRY_FREE"</b> and the Primitive Polynomial  . . . . . . . . . . . . . . .  . . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . .   <span class="aligning_page_number">31 </span> <br>
+7.8 More on Primitive Polynomials . .  . . . . . . . . . . . . . . . . . . .  . . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . ..   . . . . . . . . .  <span class="aligning_page_number">31 </span> <br>
+
+
+<ul style="list-style-type:none;">
+<li>
+7.8.1 Primitive Polynomials that are not Primitive . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . .  . . .  <span class="aligning_page_number"> 31</span>  <br>
+
+</li>
+<li>7.8.2 Default Polynomials for Composite Fields . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . . . . . .  <span class="aligning_page_number"> 32</span>  <br>
+
+</li>
+</ul>
+
+</div>
+
+
+
+
+
+
+
+
+
+
+
+<br/>
+CONTENT  <span class="aligning_page_number"> 4 </span> 
+
+<div class="sub_indices">
+<ul style="list-style-type:none">
+<li> 7.8.3 The Program <b>gf_poly</b> for Verifying Irreducibility of Polynomials <span class="aligning_page_number">  33 </span> 
+</li>
+</ul>
+
+
+7.9<span class="aligning_numbers"><b>"ALTMAP"</b> considerations and <b>extract_word()</b> </span><span class="aligning_page_number">  34 </span>  
+<ul style="list-style-type:none">
+<li>
+
+7.9.1 Alternate mappings with <b>"SPLIT"</b> . . . . . . . . . .  . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .<span class="aligning_page_number"> 34</span>  <br>
+</li>
+<li>
+7.9.2 Alternate mappings with <b>"COMPOSITE"</b> . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . . . . . . . . <span class="aligning_page_number">   36  </span> <br>
+</li>
+<li>
+7.9.3 The mapping of <b>"CAUCHY"</b>    . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . . . . .. . . . . .. . . . . . . . . .  . ..  <span class="aligning_page_number"> 37   </span> <br>
+</li>
+</ul>
+</div>
+
+
+8 <span class="aligning_numbers"><b>Thread Safety </b></span><span class="aligning_page_number">  37 </span> <br><br>  </div> 
+
+9 <span class="aligning_numbers"><b>Listing of Procedures</b> </span><span class="aligning_page_number">  37 </span> <br><br>  </div> 
+
+10 <span class="aligning_numbers"><b>Troubleshooting</b> </span><span class="aligning_page_number">  38 </span> <br><br>  </div> 
+11 <span class="aligning_numbers"><b>Timings</b> </span><span class="aligning_page_number">  41 </span> <br><br>  </div> 
+
+<div class="sub_indices">
+11.1 Multiply() . . . . . . . . . .  . . . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . .. . . . . . . . .. . . . . . . . .. . . . . . . . .. . . . . . . . . . . . .  . . . .. . . . <span class="aligning_page_number"> 42</span>  <br>
+11.2 Divide() . .  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . .. . . . . . . . .. . . . . . . . .. . . . . . . . .. . . . . . . . . . . .. . . . . <span class="aligning_page_number">   42  </span> <br>
+11.3 Multiply Region()    . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  . . . . . . . . . . . . . . .. . . . . . . . .. . . . . . . . .. . . . . . . . . . . . .  . . . . .  <span class="aligning_page_number"> 43   </span> <br>
+</div>
+
+
+
+
+
+
+<br/>
+INTRODUCTION  <span class="aligning_page_number"> 5 </span> 
+
+
+<h3>1 Introduction </h3>
+
+Galois Field arithmetic forms the backbone of erasure-coded storage systems, most famously the Reed-Solomon
+erasure code. A Galois Field is defined over w-bit words and is termed <em>GF(2<sup>w</sup>).</em> As such, the elements of a Galois
+Field are the integers 0, 1, . . ., 2<sup>w</sup> - 1. Galois Field arithmetic defines addition and multiplication over these closed
+sets of integers in such a way that they work as you would hope they would work. Specifically, every number has a
+unique multiplicative inverse. Moreover, there is a value, typically the value 2, which has the property that you can
+enumerate all of the non-zero elements of the field by taking that value to successively higher powers.
+
+
+<p>Addition in a Galois Field is equal to the bitwise exclusive-or operation. That's nice and convenient. Multiplication
+is a little more complex, and there are many, many ways to implement it. The Paper describes them all, and the
+following references provide more supporting material: [Anv09, GMS08, LHy08, LD00, LBOX12, Pla97]. The intent
+of this library is to implement all of the techniques. That way, their performance may be compared, and their tradeoffs
+may be analyzed. </p>
+
+
+
+
+<ol>
+
+When used for erasure codes, there are typically five important operations:<br>
+<li> <b>Adding two numbers in </b> GF(2<sup>w</sup>). That's bitwise exclusive-or. </li>
+<li> <b>Multiplying two numbers in</b> GF(2<sup>w</sup>). Erasure codes are usually based on matrices in GF(2<sup>w</sup>), and constructing
+these matrices requires both addition and multiplication.</li>
+<li> <b>Dividing two numbers in </b>GF(2<sup>w</sup>). Sometimes you need to divide to construct matrices (for example, Cauchy
+Reed-Solomon codes [BKK<sup>+</sup>95, Rab89]). More often, though, you use division to invert matrices for decoding.
+Sometimes it is easier to find a number's inverse than it is to divide. In that case, you can divide by multiplying
+by an inverse. </li>
+
+<li><b>Adding two regions of numbers in</b> GF(2<sup>w</sup>), which will be explained along with... </li>
+<li> <b>Multiplying a region of numbers in </b>GF(2<sup>w</sup>) by a constant in GF(2<sup>w</sup>). Erasure coding typically boils down
+to performing dot products in GF(2<sup>w</sup>). For example, you may define a coding disk using the equation: </li><br>
+
+
+
+
+<center>c<em><sub>0</sub></em>= d<em><sub>0</sub></em> + 2d<em><sub>1</sub></em> + 4d<em><sub>2</sub></em> + 8d<em><sub>3</sub></em>. </center><br>
+
+That looks like three multiplications and three additions. However, the way it's implemented in a disk system
+looks as in Figure 1. Large regions of disks are partitioned into w-bit words in GF(2<sup>w</sup>). In the example, let us
+suppose that <em>w</em> = 8, and therefore that words are bytes. Then the regions pictured are 1 KB from each disk.
+The bytes on disk Di are labeled d<sub>i,0,</sub> d<sub>i,1, . . . ,</sub> d<sub>i,1023,</sub> and the equation above is replicated 1024 times. For
+0 &#8804; j &lt; 1024:
+<br><br>
+<center>c<em><sub>0,j</sub></em> = d<em><sub>0,j</sub></em> + 2d<em><sub>1,j</sub></em> + 4d<em><sub>2,j</sub></em> + 8d<em><sub>3,j</sub></em> . </center>
+<br>
+
+
+While it's possible to implement each of these 1024 equations independently, using the single multiplication
+and addition operations above, it is often much more efficient to aggregate. For example, most computer architectures
+support bitwise exclusive-or of 64 and 128 bit words. Thus, it makes much more sense to add regions
+of numbers in 64 or 128 bit chunks rather than as words in GF(2<sup>w</sup>). Multiplying a region by a constant can
+leverage similar optimizations. </ol>
+
+
+<p>GF-Complete supports multiplication and division of single values for all values of <em>w</em> &#8804 32, plus <em>w</em> = 64 and <em>w</em> =
+128. It also supports adding two regions of memory (for any value of <em>w</em>, since addition equals XOR), and multiplying
+a region by a constant in <em>GF(2<sup>4</sup>), GF(2<sup>8</sup>), GF(2<sup>16</sup>), GF(2<sup>32</sup>), GF(2<sup>64</sup>) and GF(2<sup>128</sup>).</em> These values are chosen
+because words in GF(2<sup>w</sup>) fit into machine words with these values of <em>w.</em> Other values of w don't lend themselves
+to efficient multiplication of regions by constants (although see the <b>"CAUCHY"</b> option in section 6.1.5 for a way to
+multiply regions for other values of <em>w</em>).</p>
+
+
+
+
+
+
+<br/>
+
+2 &nbsp &nbsp  <em>  FILES IN THE LIBRARY     </em>   <span id="index_number">6  </span> <br><br><br>
+
+
+
+<div class="image-cell_1"> </div>  <br><br><br>
+
+Figure 1: An example of adding two regions of numbers, and multiplying a region of numbers by a constant
+in <em>GF(2<sup>w</sup>) </em>. In this example, <em>w</em> = 8, and each disk is holding a 1KB region. The same coding equation -
+c<sub>0,j</sub> = d<sub>0,j</sub> + ad<sub>1,j</sub> + a<sup>2</sup>d<sub>2,j</sub> + a<sup>3</sup>d<sub>3,j</sub> is applied 1024 times. However, rather than executing this equation 1024
+times, it is more efficient to implement this with three region-constant multiplications and three region-region additions.
+
+<h3>2 &nbsp&nbsp&nbsp Files in the Library </h3>
+This section provides an overview of the files that compose GF-Complete. They are partitioned among multiple
+directories.
+
+<h4> <b>2.1 &nbsp&nbsp&nbsp Header files in the directory  "include"</b> </h4>
+
+The following header files are part of GF-Complete.
+<ul>
+<li><b>gf_complete.h:</b> This is the header file that applications should include. It defines the gf_t type, which holds
+all of the data that you need to perform the various operations in GF(2<sup>w</sup>). It also defines all of the arithmetic
+operations. For an application to use this library, you should include gf_complete.h and then compile with the
+library src/libgf_complete.la. </li><br>
+
+<li><b>gf_method.h:</b> If you are wanting to modify the implementation techniques from the defaults, this file provides
+a "helper" function so that you can do it from the Unix command line.
+</li><br>
+
+<li><b>gf_general.h:</b> This file has helper routines for doing basic Galois Field operations with any legal value of <em>w.</em>
+The problem is that <em>w </em> &#8804 32, <em>w </em> = 64 and <em> w </em> = 128 all have different data types, which is a pain. The procedures
+in this file try to alleviate that pain. They are used in <b>gf_mult, gf_unit</b> and <b>gf_time.</b> I'm guessing that most
+applications won't use them, as most applications use <em>w</em> &#8804 32. </li><br>
+
+<li><b>gf_rand.h:</b> I've learned that <b>srand48()</b> and its kin are not supported in all C installations. Therefore, this file
+defines some random number generators to help test the programs. The random number generator is the "Mother
+</li>
+
+</ul>
+
+
+
+
+
+
+
+<br/>
+
+2 &nbsp &nbsp  <em>  FILES IN THE LIBRARY     </em>   <span id="index_number">7  </span> <br><br><br>
+<ul>
+
+of All" random number generator [Mar94] which we've selected because it has no patent issues. <b>gf_unit</b> and
+<b>gf_time</b> use these random number generators.<br><br>
+<li><b>gf_int.h:</b> This is an internal header file that the various source files use. This is <em>not</em> intended for applications to
+include.</li><br>
+<li><b>config.xx</b> and <b>stamp-h1</b> are created by autoconf, and should be ignored by applications. </li>
+</ul>
+
+<h3>2.2 &nbsp &nbsp <b> Source files in the "src" directory </b> </h3>
+<ul>
+The following C files compose <b>gf_complete.a,</b> and they are in the directory src. You shouldn't have to mess with these
+files, but we include them in case you have to:<br><br>
+<li><b> gf.c:</b> This implements all of the procedures in both <b>gf_complete.h</b> and <b>gf_int.h.</b> </li><br>
+<li><b> gf_w4.c:</b> Procedures specific to <em>w </em> = 4. </li><br>
+<li> <b>gf_w8.c:</b> Procedures specific to <em>w </em> = 8</li><br>
+<li> <b>gf_w16.c:</b> Procedures specific to <em>w </em> = 16</li><br>
+<li> <b>gf_w32.c:</b> Procedures specific to <em>w </em> = 32</li><br>
+<li><b>gf_w64.c:</b> Procedures specific to <em>w </em> = 64</li><br>
+<li> <b>gf_w128.c:</b> Procedures specific to <em>w </em> = 128</li><br>
+<li> <b>gf_wgen.c:</b> Procedures specific to other values of <em>w </em> between 1 and 31</li><br>
+<li> <b>gf_general.c:</b> Procedures that let you manipulate general values, regardless of whether <em>w </em> &#8804 32, <em>w </em> = 64
+or <em>w </em> = 128. (I.e. the procedures defined in <b>gf_general.h</b>)</li><br>
+<li> <b>gf_method.c:</b> Procedures to help you switch between the various implementation techniques. (I.e. the procedures
+defined in <b>gf_method.h</b>)</li><br>
+<li> <b>gf_rand.c:</b> "The Mother of all" random number generator. (I.e. the procedures defined in <b>gf_rand.h</b>)</li><br> </ul>
+
+<h3>2.3 &nbsp &nbsp Library tools files in the "tools" directory </h3>
+
+<ul>
+The following are tools to help you with Galois Field arithmetic, and with the library. They are explained in greater
+detail elsewhere in this manual.<br><br>
+<li> <b>gf_mult.c, gf_div.c</b> and <b>gf_add.c:</b> Command line tools to do multiplication, division and addition by single numbers</li><br>
+<li> <b>gf_time.c:</b> A program that times the procedures for given values of <em>w </em> and implementation options</li><br>
+<li> <b>time_tool.sh:</b> A shell script that helps perform rough timings of the various multiplication, division and region
+operations in GF-Complete</li><br>
+<li> <b>gf_methods.c:</b> A program that enumerates most of the implementation methods supported by GF-Complete</li><br>
+<li> <b> gf_poly.c:</b> A program to identify irreducible polynomials in regular and composite Galois Fields</li><br>
+
+</ul>
+
+
+
+
+
+
+
+
+<br/>
+
+3 &nbsp &nbsp  <em>  COMPILATION     </em>   <span id="index_number">8  </span> <br><br><br>
+
+
+<h3>2.4 &nbsp &nbsp The unit tester in the "test" directory </h3>
+
+The test directory contains the program <b>gf_unit.c,</b> which performs a battery of unit tests on GF-Complete. This is
+explained in more detail in section 6.3.
+
+
+<h3>2.5&nbsp &nbsp Example programs in the "examples" directory </h3>
+
+There are seven example programs to help you understand various facets of GF-Complete. They are in the files
+<b>gf_example_x.c </b> in the <b>examples</b> directory. They are explained in sections 4.2 through 4.5, and section 7.9.<br><br>
+
+<h2>3 &nbsp &nbsp Compilation </h2>
+
+<em>From revision 1.02 forward, we are using autoconf. The old "flag tester" directory is now gone, as it is no longer in
+use. </em><br><br>
+To compile and install, you should do the standard operations that you do with most open source Unix code:<br><br>
+
+UNIX> ./configure <br>
+... <br>
+UNIX> make <br>
+... <br>
+UNIX> sudo make install <br><br>
+
+
+<p>If you perform the <b>install,</b> then the header, source, tool, and library files will be moved to system locations. In
+particular, you may then compile the library by linking with the flag <b>-lgf_complete,</b> and you may use the tools from a
+global executable directory (like <b>/usr/local/bin</b>). </p>
+
+<p>
+If you don't perform the install, then the header and tool files will be in their respective directories, and the library
+will be in <b>src/libgf_complete.la.</b> </p>
+<p>
+If your system supports the various Intel SIMD instructions, the compiler will find them, and GF-Complete will
+use them by default. </p>
+
+
+
+<h2>4 &nbsp &nbsp Some Tools and Examples to Get You Started </h2> 
+<h3>4.1 Three Simple Command Line Tools: gf_mult, gf_div and gf_add </h3>
+
+
+Before delving into the library, it may be helpful to explore Galois Field arithmetic with the command line tools:
+<b>gf_mult, gf_div </b> and <b>gf_add.</b> These perform multiplication, division and addition on elements in <em>GF(2<sup>w</sup>).</em> If these are
+not installed on your system, then you may find them in the tools directory. Their syntax is:
+<ul>
+<li><b>gf_mult a b</b> <em>w </em> - Multiplies a and b in <em> GF(2<sup>w</sup>)</em>. </li><br>
+<li> <b>gf_div a b </b><em>w </em> - Divides a by b in GF(2<em><sup>w </sup></em>). </li><br>
+<li><b>gf_add a b </b> <em>w </em> - Adds a and b in GF(2<em><sup>w </sup> </em>). </li><br>
+</ul>
+You may use any value of <em>w </em> from 1 to 32, plus 64 and 128. By default, the values are read and printed in decimal;
+however, if you append an 'h' to <em>w </em>, then <em>a, b </em> and the result will be printed in hexadecimal. For <em>w </em> = 128, the 'h' is
+mandatory, and all values will be printed in hexadecimal.
+
+
+
+
+
+
+
+<br/>
+
+4 &nbsp &nbsp  <em>   SOME TOOLS AND EXAMPLES TO GET YOU STARTED      </em>   <span id="index_number">9  </span> <br><br><br>
+
+
+<p>Try them out on some examples like the ones below. You of course don't need to know that, for example, 5 * 4 = 7
+in <em>GF(2<sup>4 </sup>) </em>; however, once you know that, you know that 7/
+5 = 4 and 7/4 = 5. You should be able to verify the <b>gf_add</b>
+statements below in your head. As for the other <b>gf_mult's</b>, you can simply verify that division and multiplication work
+with each other as you hope they would. </p>
+<br><br>
+<div id="number_spacing">
+
+UNIX> gf_mult 5 4 4  <br>
+7 <br>
+UNIX> gf_div 7 5 4 <br>
+4 <br>
+UNIX> gf_div 7 4 4 <br>
+5   <br>
+UNIX> gf_mult 8000 2 16h <br>
+100b  <br>
+UNIX> gf_add f0f0f0f0f0f0f0f0 1313131313131313 64h <br>
+e3e3e3e3e3e3e3e3 <br>
+UNIX> gf_mult f0f0f0f0f0f0f0f0 1313131313131313 64h <br>
+8da08da08da08da0 <br>
+UNIX> gf_div 8da08da08da08da0 1313131313131313 64h <br>
+f0f0f0f0f0f0f0f0  <br>
+UNIX> gf_add f0f0f0f0f0f0f0f01313131313131313 1313131313131313f0f0f0f0f0f0f0f0 128h <br>
+e3e3e3e3e3e3e3e3e3e3e3e3e3e3e3e3 <br>
+UNIX> gf_mult f0f0f0f0f0f0f0f01313131313131313 1313131313131313f0f0f0f0f0f0f0f0 128h <br>
+786278627862784982d782d782d7816e <br>
+UNIX> gf_div 786278627862784982d782d782d7816e f0f0f0f0f0f0f0f01313131313131313 128h <br>
+1313131313131313f0f0f0f0f0f0f0f0 <br>
+UNIX> <br><br>
+
+</div>
+
+
+Don't bother trying to read the source code of these programs yet. Start with some simpler examples  like the ones
+below. <br><br>
+
+<h3>4.2 Quick Starting Example #1: Simple multiplication and division </h3>
+
+The source files for these examples are in the examples directory.
+<p>These two examples are intended for those who just want to use the library without getting too complex. The
+first example is <b>gf_example_1,</b> and it takes one command line argument - w, which must be between 1 and 32. It
+generates two random non-zero numbers in <em>GF(2<sup>w </sup>) </em> and multiplies them. After doing that, it divides the product by
+each number. </p>
+<p>
+To perform multiplication and division in <em>GF(2<sup>w </sup>) </em>, you must declare an instance of the gf_t type, and then initialize
+it for <em>GF(2<sup>w </sup>) </em> by calling <b>gf_init_easy().</b> This is done in <b>gf_example_1.c</b> with the following lines: </p><br><br>
+
+gf_t gf; <br><br>
+... <br><br>
+if (!gf_init_easy(&gf, w)) { <br>
+fprintf(stderr, "Couldn't initialize GF structure.\n"); <br>
+exit(0); <br>
+}  <br>
+
+
+
+
+
+
+<br/>
+
+4 &nbsp &nbsp  <em>   SOME TOOLS AND EXAMPLES TO GET YOU STARTED      </em>   <span id="index_number">10  </span> <br><br><br>
+
+<p>Once <b>gf</b> is initialized, you may use it for multiplication and division with the function pointers <b>multiply.w32</b> and
+<b>divide.w32.</b> These work for any element of <em>GF(2<sup>w</sup>)</em> so long as w &#8804 32. </p> <br><br>
+
+<div id="number_spacing">
+<div style="padding-left:54px">
+c = gf.multiply.w32(&gf, a, b);<br>
+printf("%u * %u = %u\n", a, b, c);<br><br>
+printf("%u / %u = %u\n", c, a, gf.divide.w32(&gf, c, a));<br>
+printf("%u / %u = %u\n", c, b, gf.divide.w32(&gf, c, b));<br>
+
+
+</div> </div>
+<br><br>
+Go ahead and test this program out. You can use <b>gf_mult</b> and <b>gf_div</b> to verify the results:<br><br>
+
+<div id="number_spacing">
+UNIX> gf_example_1 4 <br>
+12 * 4 = 5  <br>
+5 / 12 = 4  <br>
+5 / 4 = 12  <br>
+UNIX> gf_mult 12 4 4 <br>
+5  <br>
+UNIX> gf_example_1 16 <br>
+14411 * 60911 = 44568 <br>
+44568 / 14411 = 60911 <br>
+44568 / 60911 = 14411  <br>
+UNIX> gf_mult 14411 60911 16 <br>
+44568 <br>
+UNIX>  <br><br>
+</div>
+
+<b>gf_init_easy()</b> (and later <b>gf_init_hard()</b>) do call <b>malloc()</b> to implement internal structures. To release memory, call
+<b>gf_free().</b> Please see section 6.4 to see how to call <b>gf_init_hard()</b> in such a way that it doesn't call <b>malloc().</b> <br><br>
+
+
+
+<h3>4.3 &nbsp &nbsp &nbspQuick Starting Example #2: Multiplying a region by a constant </h3>
+
+
+The program <b>gf_example_2</b> expands on <b>gf_example_1</b>. If <em>w</em> is equal to 4, 8, 16 or 32, it performs a region multiply
+operation. It allocates two sixteen byte regions, <b>r1</b> and <b>r2,</b> and then multiplies <b>r1</b> by a and puts the result in <b>r2</b> using
+the <b>multiply_region.w32</b> function pointer: <br><br>
+
+<div style="padding-left:52px">
+gf.multiply_region.w32 (&gf, r1, r2, a, 16, 0); <br><br>
+</div>
+
+That last argument specifies whether to simply place the product into r2 or to XOR it with the contents that are already
+in r2. Zero means to place the product there. When we run it, it prints the results of the <b>multiply_region.w32</b> in
+hexadecimal. Again, you can verify it using <b>gf_mult</b>:<br><br>
+<div id="number_spacing">
+UNIX> gf_example_2 4 <br>
+12 * 2 = 11 <br>
+11 / 12 = 2 <br>
+11 / 2 = 12 <br><br>
+multiply_region by 0xc (12) <br><br>
+R1 (the source): 0 2 d 9 d 6 8 a 8 d b 3 5 c 1 8 8 e b 0 6 1 5 a 2 c 4 b 3 9 3 6 <br>
+R2 (the product): 0 b 3 6 3 e a 1 a 3 d 7 9 f c a a 4 d 0 e c 9 1 b f 5 d 7 6 7 e <br>
+
+</div>
+
+
+
+
+
+
+
+
+
+
+<br/>
+
+4 &nbsp &nbsp  <em>   SOME TOOLS AND EXAMPLES TO GET YOU STARTED      </em>   <span id="index_number">11  </span> <br><br><br>
+
+<div id="number_spacing">
+<table cellpadding="6">
+<tr><td>UNIX></td> <td colspan="4"> gf_example_2 16 </td> </tr>
+
+<tr>
+
+<td>49598</td> <td> * </td> <td> 35999</td> <td> = </td> <td>19867 </td> </tr>
+
+<tr><td>19867 </td><td>/ </td> <td> 49598 </td> <td> =  </td> <td>35999 </td> </tr>
+<tr><td>19867</td><td> /</td> <td> 35999 </td> <td> = </td> <td> 49598 </td> </tr>  </table><br>
+
+
+&nbsp multiply_region by 0xc1be (49598) <br><br>
+
+
+<table cellpadding="6" >
+<tr>
+<td>R1 (the source):</td> <td> 8c9f </td> <td> b30e </td> <td> 5bf3 </td> <td> 7cbb </td> <td>16a9 </td> <td> 105d </td> <td> 9368 </td> <td> 4bbe </td> </tr>
+<tr><td>R2 (the product):</td> <td> 4d9b</td> <td> 992d </td> <td> 02f2 </td> <td> c95c </td> <td> 228e </td> <td> ec82 </td> <td> 324e </td> <td> 35e4 </td></tr>
+</table>
+</div>
+<div id="number_spacing">
+<div style="padding-left:9px">
+UNIX> gf_mult c1be 8c9f 16h<br>
+4d9b <br>
+UNIX> gf_mult c1be b30e 16h <br>
+992d <br>
+UNIX> <br><br>
+</div>
+</div>
+
+<h3>4.4 &nbsp &nbsp &nbsp Quick Starting Example #3: Using <em>w </em>= 64 </h3>
+The program in <b>gf_example_3.c </b> is identical to the previous program, except it uses <em> GF(2<sup>64 </sup>). </em> Now <em>a, b</em> and <em> c </em> are
+<b>uint64_t</b>'s, and you have to use the function pointers that have <b>w64</b> extensions so that the larger types may be employed.
+<br><br>
+<div id="number_spacing">
+
+UNIX> gf_example_3 
+<table cellpadding="6">
+<tr>
+
+<td>a9af3adef0d23242 </td> <td> * </td> <td> 61fd8433b25fe7cd</td> <td> = </td> <td>bf5acdde4c41ee0c </td> </tr>
+
+<tr><td>bf5acdde4c41ee0c </td> <td> / </td> <td> a9af3adef0d23242 </td> <td> = </td> <td>61fd8433b25fe7cd </td> </tr>
+<tr><td>bf5acdde4c41ee0c </td> <td> / </td> <td> 61fd8433b25fe7cd  </td> <td>= </td> <td>a9af3adef0d23242 </td> </tr>
+</table><br><br>
+
+&nbsp multiply_region by a9af3adef0d23242<br><br>
+<table cellpadding="6" >
+<tr>
+<td>R1 (the source): </td> <td> 61fd8433b25fe7cd </td> <td>272d5d4b19ca44b7 </td> <td> 3870bf7e63c3451a </td> <td> 08992149b3e2f8b7 </td> </tr>
+<tr><td>R2 (the product): </td> <td> bf5acdde4c41ee0c </td> <td> ad2d786c6e4d66b7 </td> <td> 43a7d857503fd261 </td> <td> d3d29c7be46b1f7c </td> </tr>
+</table>
+
+<div style="padding-left:9px">
+
+UNIX> gf_mult a9af3adef0d23242 61fd8433b25fe7cd 64h <br>
+bf5acdde4c41ee0c<br>
+UNIX><br><br>
+</div>
+</div>
+<h3>4.5 &nbsp &nbsp &nbsp Quick Starting Example #4: Using <em>w </em>= 128 </h3>
+Finally, the program in <b>gf_example_4.c</b> uses  <em>GF(2<sup>128</sup>).</em> Since there is not universal support for uint128_t, the library
+represents 128-bit numbers as arrays of two uint64_t's. The function pointers for multiplication, division and region
+multiplication now accept the return values as arguments:<br><br>
+
+gf.multiply.w128(&gf, a, b, c); <br><br>
+
+Again, we can use <b>gf_mult </b> and <b>gf_div </b>to verify the results:<br><br>
+<div id="number_spacing">
+<div style="padding-left:9px">
+UNIX> gf_example_4 </div>
+<table cellpadding="6" >
+<tr>
+
+<td>e252d9c145c0bf29b85b21a1ae2921fa </td> <td> * </td> <td> b23044e7f45daf4d70695fb7bf249432 </td> <td> = </td> </tr>
+<tr><td>7883669ef3001d7fabf83784d52eb414 </td> </tr>
+
+</table>
+
+</div>
+
+
+
+
+
+
+
+
+<br/>
+
+5 &nbsp &nbsp  <em>   IMPORTANT INFORMATION ON ALIGNMENT WHEN MULTIPLYING REGIONS      </em>   <span id="index_number">12  </span> <br><br><br>
+
+<div id="number_spacing">
+multiply_region by e252d9c145c0bf29b85b21a1ae2921fa <br>
+R1 (the source): f4f56f08fa92494c5faa57ddcd874149 b4c06a61adbbec2f4b0ffc68e43008cb <br>
+R2 (the product): b1e34d34b031660676965b868b892043 382f12719ffe3978385f5d97540a13a1 <br>
+UNIX> gf_mult e252d9c145c0bf29b85b21a1ae2921fa f4f56f08fa92494c5faa57ddcd874149 128h <br>
+b1e34d34b031660676965b868b892043 <br>
+UNIX> gf_div 382f12719ffe3978385f5d97540a13a1 b4c06a61adbbec2f4b0ffc68e43008cb 128h<br>
+e252d9c145c0bf29b85b21a1ae2921fa<br>
+UNIX><br><br>
+
+</div>
+
+
+<h2>5 &nbsp &nbsp &nbspImportant Information on Alignment when Multiplying Regions </h2>
+
+
+
+In order to make multiplication of regions fast, we often employ 64 and 128 bit instructions. This has ramifications
+for pointer alignment, because we want to avoid bus errors, and because on many machines, loading and manipulating
+aligned quantities is much faster than unaligned quantities.<br><br>
+
+
+When you perform multiply_region.wxx(<em>gf, source, dest, value, size, add </em>), there are three requirements:
+<ol>
+<li>
+ The pointers <em>source</em> and <em>dest </em> must be aligned for <em>w</em>-bit words. For <em>w </em> = 4 and <em>w </em> = 8, there is no restriction;
+however for <em>w </em> = 16, the pointers must be multiples of 2, for <em>w </em> = 32, they must be multiples of 4, and for
+<em>w </em> &#1013; {64, 128}, they must be multiples of 8. </li><br>
+
+<li> The <em>size</em> must be a multiple of
+&lceil;<em>w</em>/8&rceil;.
+With <em>w </em> = 4 and <em>w </em> = 8,
+&lceil;<em>w</em>/8&rceil; = 1 and there is no restriction.
+The other sizes must be multiples of
+<em>w</em>/8
+because you have to be multiplying whole elements of <em> GF(2<sup>w </sup>) </em>. </li><br>
+
+<li> The <b>source</b> and <b>dest</b> pointers must be aligned identically with respect to each other for the implementation
+chosen. This is subtle, and we explain it in detail in the next few paragraphs. However, if you'd rather not figure
+it out, the following recommendation will <em>always </em> work in GF-Complete: </li>
+
+</ol>
+
+
+
+<div style="padding-left:100px">
+<b>If you want to be safe, make sure that source and dest are both multiples of 16. That is not a
+strict requirement, but it will always work! </b> <br><br>
+</div>
+
+
+If you want to relax the above recommendation, please read further.
+<p>When performing <b>multiply_region.wxx() </b>, the implementation is typically optimized for a region of bytes whose
+size must be a multiple of a variable <em>s</em>, and which must be aligned to a multiple of another variable <em>t </em>. For example,
+when doing <b>multiply_region.w32() </b> in <em> GF(2<sup>16 </sup>) </em> with SSE enabled, the implementation is optimized for regions of
+32 bytes, which must be aligned on a 16-byte quantity. Thus, <em>s </em> = 32 and <em>t</em> = 16. However, we don't want
+<b>multiply_region.w32() </b> to be too restrictive, so instead of requiring <em>source</em> and <em> dest </em> to be aligned to 16-byte regions, we
+require that (<em>source </em> mod 16) equal (<em>dest</em> mod 16). Or, in general, that (<em>source</em> mod t) equal (<em>dest</em> mod <em>t</em>). </p>
+
+
+<p>
+Then, <b>multiply_region.wxx()</b> proceeds in three phases. In the first phase,<b> multiply.wxx()</b> is called on successive
+words until (<em>source</em> mod <em>t</em>) equals zero. The second phase then performs the optimized region multiplication on
+chunks of <em> s  </em>bytes, until the remaining part of the region is less than s bytes. At that point, the third phase calls
+<em>multiply.wxx() </em> on the last part of the region. </p>
+
+A detailed example helps to illustrate. Suppose we make the following call in <em>GF(2<sup>16</sup>) </em> with SSE enabled:<br><br>
+<center><b>multiply_region.w32(gf, 0x10006, 0x20006, a, 274, 0)</b> </center>
+
+
+
+
+
+
+
+<br/>
+
+5 &nbsp &nbsp  <em>  IMPORTANT INFORMATION ON ALIGNMENT WHEN MULTIPLYING REGIONS     </em>   <span id="index_number">13  </span> <br><br><br>
+
+
+
+<div class="image-cell_2"> </div>  <br><br><br>
+
+Figure 2: Example of multiplying a region of 274 bytes in GF(2<sup>16</sup>) when (source mod 16) = (dest mod 16) = 6. The
+alignment parameters are s = 32 and t = 16. The multiplication is in three phases, which correspond to the initial
+unaligned region (10 bytes), the aligned region of s-byte chunks (256 bytes), and the final leftover region (8 bytes).
+
+
+<p>First, note that <em>source</em> and <em>dest</em> are aligned on two-byte quantities, which they must be in <em>GF(2<sup>16</sup>).</em> Second, note
+that size is a multiple of &lceil;16/8&rceil; = 2.
+And last, note that (<em>source</em> mod 16) equals (<em>dest</em> mod 16). We illustrate the three
+phases of region multiplication in Figure 2. Because (<em>source</em> mod 16) = 6, there are 10 bytes of unaligned words that
+are multiplied with five calls to <b>multiply.w32()</b> in the first phase. The second phase multiplies 256 bytes (eight chunks
+of <em>s</em> = 32 bytes) using the SSE instructions. That leaves 8 bytes remaining for the third phase.
+</p>
+
+<p>
+When we describe the defaults and the various implementation options, we specify s and t as "alignment parameters."
+</p>
+<p>
+One of the advanced region options is using an alternate mapping of words to memory ("ALTMAP"). These interact
+in a more subtle manner with alignment. Please see Section 7.9 for details.
+</p>
+
+<h3> 6 &nbsp &nbspThe Defaults </h3>
+
+
+GF-Complete implements a wide variety of techniques for multiplication, division and region multiplication. We have
+set the defaults with three considerations in mind:
+<ol>
+<li>
+<b>Speed:</b> Obviously, we want the implementations to be fast. Therefore, we choose the fastest implementations
+that don’t violate the other considerations. The compilation environment is considered. For example, if SSE is
+enabled, region multiplication in <em> GF(2<sup>4 </sup>) </em> employs a single multiplication table. If SSE is not enabled, then a
+"double" table is employed that performs table lookup two bytes at a time. </li><br>
+<li>
+<b>Memory Consumption:</b> We try to keep the memory footprint of GF-Complete low. For example, the fastest
+way to perform <b>multiply.w32()</b> in <em>GF(2<sup>32</sup>) </em> is to employ 1.75 MB of multiplication tables (see Section 7.4
+below). We do not include this as a default, however, because we want to keep the default memory consumption
+of GF-Complete low.
+</li>
+
+</ol>
+
+
+
+
+
+
+<br/>
+
+6 &nbsp &nbsp  <em>  THE DEFAULTS     </em>   <span id="index_number">14  </span> <br><br><br>
+
+<ul>
+
+3. &nbsp <b>Compatibility with "standard" implementations:</b> While there is no <em>de facto</em> standard of Galois Field arithmetic,
+most libraries implement the same fields. For that reason, we have not selected composite fields, alternate
+polynomials or memory layouts for the defaults, even though these would be faster. Again, see section 7.7 for
+more information.
+
+</ul>
+
+<p>Table 1 shows the default methods used for each power-of-two word size, their alignment parameters <em>s</em> and <em> t,</em> their
+memory consumption and their rough performance. The performance tests are on an Intel Core i7-3770 running at
+3.40 GHz, and are included solely to give a flavor of performance on a standard microprocessor. Some processors
+will be faster with some techniques and others will be slower, so we only put numbers in so that you can ballpark it.
+For other values of <em>w</em> between 1 and 31, we use table lookup when w &#8804 8, discrete logarithms when w &#8804 16 and
+"Bytwo<sub>p</sub>" for w &#8804 32. </p>
+<br><br>
+<center> With SSE 
+<div id="data1">
+<table cellpadding="6" cellspacing="0">
+<tr>
+<th>w </th><th class="double_border" >Memory <br> Usage </br> </th><th>multiply() <br> Implementation</th><th>Performance <br>(Mega Ops / s) </th><th>multiply region() <br> Implementation </th>
+<th>s </th> <th>t </th> <th> Performance <br>(MB/s)</th>
+</tr>
+<tr>
+<td>4 </td><td class="double_border">&lt;1K </td><td>Table</td><td>501</td><td>Table</td>
+<td>16 </td><td>16 </td> <td>11,659</td> </tr>
+
+<tr>
+<td>8 </td><td class="double_border">136K </td><td>Table</td><td>501</td><td>Split Table (8,4)</td>
+<td>16 </td><td>16 </td> <td>11,824</td> </tr>
+
+<tr>
+<td>16 </td><td class="double_border">896K </td><td>Log</td><td>260</td><td>Split Table (16,4)</td>
+<td>32 </td><td>16 </td> <td>7,749</td> </tr>
+
+<tr>
+<td>32 </td><td class="double_border">&lt;1K </td><td>Carry-Free</td><td>48</td><td>Split Table (32,4)</td>
+<td>64 </td><td>16 </td> <td>5,011</td> </tr>
+
+<tr>
+<td>64 </td><td class="double_border">2K </td><td>Carry-Free</td><td>84</td><td>Split Table (64,4)</td>
+<td>128 </td><td>16 </td> <td>2,402</td> </tr>
+
+<tr>
+<td>128 </td><td class="double_border">64K </td><td>Carry-Free</td><td>48</td><td>Split Table (128,4)</td>
+<td>16 </td><td>16 </td> <td>833</td> </tr>
+</table></div>
+
+
+<div id="data1">
+<center>Without SSE </center>
+<table cellpadding="6" cellspacing="0">
+<tr>
+<th>w </th><th>Memory <br> Usage </br> </th><th>multiply() <br> Implementation</th><th>Performance <br>(Mega Ops / s) </th><th>multiply region() <br> Implementation </th>
+<th>s </th> <th>t </th> <th> Performance <br>(MB/s)</th>
+</tr>
+<tr>
+<td>4 </td><td>4K </td><td>Table</td><td>501</td><td>Double Table</td>
+<td>16 </td><td>16 </td> <td>11,659</td> </tr>
+
+<tr>
+<td>8 </td><td>128K </td><td>Table</td><td>501</td><td>Table</td>
+<td>1 </td><td>1 </td> <td>1,397</td> </tr>
+
+<tr>
+<td>16 </td><td>896K </td><td>Log</td><td>266</td><td>Split Table (16,8)</td>
+<td>32 </td><td>16 </td> <td>2,135</td> </tr>
+
+<tr>
+<td>32 </td><td>4K </td><td>Bytwo<sub>p</sub></td><td>19</td><td>Split Table (32,4)</td>
+<td>4 </td><td>4 </td> <td>1,149</td> </tr>
+
+<tr>
+<td>64 </td><td>16K </td><td>Bytwo<sub>p</sub></td><td>9</td><td>Split Table (64,4)</td>
+<td>8 </td><td>8 </td> <td>987</td> </tr>
+
+<tr>
+<td>128 </td><td>64K </td><td>Bytwo<sub>p</sub></td><td>1.4</td><td>Split Table (128,4)</td>
+<td>16 </td><td>8 </td> <td>833</td> </tr>
+</table>
+</div>
+</center>
+<br><br>
+Table 1: The default implementations, memory consumption and rough performance when w is a power of two. The
+variables s and t are alignment variables described in Section 5.
+<p>
+A few comments on Table 1 are in order. First, with SSE, the performance of <b>multiply()</b> is faster when <em> w </em> = 64
+than when<em> w </em> = 32. That is because the primitive polynomial for <em> w  </em>= 32, that has historically been used in Galois
+Field implementations, is sub-ideal for using carry-free multiplication (PCLMUL). You can change this polynomial
+(see section 7.7) so that the performance matches <em>w </em> = 64. </p>
+<p>
+The region operations for <em> w  </em>= 4 and <em>w </em>= 8 without SSE have been selected to have a low memory footprint. There
+are better options that consume more memory, or that only work on large memory regions (see section 6.1.5).
+</p>
+
+There are times that you may want to stray from the defaults. For example:
+<ul>
+<li>
+You may want better performance.
+</li>
+
+</ul>
+
+
+
+
+
+
+
+
+
+
+<br/>
+
+6 &nbsp &nbsp  <em>  THE DEFAULTS     </em>   <span id="index_number">15  </span> <br><br><br>
+
+<ul>
+<li>You may want a lower memory footprint.</li>
+<li>You may want to use a different Galois Field or even a ring.</li>
+<li>You only care about multiplying a region by the value two.</li>
+
+</ul>
+
+
+<p>
+Our command line tools allow you to deviate from the defaults, and we have two C functions, <b>gf_init_hard()</b>
+and <b>create_gf_from_argv()</b>, that can be called from application code to override the default methods. There are six
+command-line tools that can be used to explore the many techniques implemented in GF-Complete: </p>
+
+<ul><br>
+
+<li> <b>gf_methods</b> is a tool that enumerates most of the possible command-line arguments that can be sent to the other
+tools</li><br>
+<li> <b>gf_mult</b> and <b>gf_div</b> are explained above. You may change the multiplication and division technique in these
+tools if you desire</li><br>
+<li> <b>gf_unit</b> performs unit tests on a set of techniques to verify correctness</li><br>
+<li> <b>gf_time</b> measures the performance of a particular set of techniques</li><br>
+<li> <b>time_tool.sh </b> makes some quick calls to <b>gf_time</b> so that you may gauge rough performance.</li><br>
+<li> <b>gf_poly</b> tests the irreducibility of polynomials in a Galois Field</li><br>
+</ul>
+
+
+<p>To change the default behavior in application code, you need to call <b>gf_init_hard()</b> rather than <b>gf_init_easy().</b>
+Alternatively, you can use <b>create_gf_from_argv(),</b> included from <b>gf_method.h,</b> which uses an <b>argv</b>-style array of
+strings to specify the options that you want. The procedure in <b>gf_method.c</b> parses the array and makes the proper
+<b>gf_init_hard()</b> procedure call. This is the technique used to parse the command line in <b> gf_mult, gf_div, gf_unit </b><em>et al.</em> </p>
+
+
+<h2>6.1.1 Changing the Components of a Galois Field with <b>create_gf_from_argv()</b> </h2>
+There are five main components to every Galois Field instance:
+<ul>
+<li> <em>w </em> </li>
+<li> Multiplication technique </li>
+<li> Division technique  </li>
+<li> Region technique(s) </li>
+<li> Polynomial </li>
+</ul>
+
+<p>The procedures <b>gf_init_hard()</b> and <b> create_gf_from_argv()</b> allow you to specify these parameters when you create
+your Galois Field instance. We focus first on <b>create_gf_from_argv(),</b> because that is how the tools allow you to specify
+the components. The prototype of <b>create_gf_from_argv()</b> is as follows: </p><br>
+
+<div id="number_spacing">
+int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting);<br><br> </div>
+
+You pass it a pointer to a gf_t, which it will initialize. You specify the word size with the parameter <em><b>w,</b></em> and then you
+pass it an <b>argc/argv</b> pair as in any C or C++ program. You also specify a <b>starting</b> argument, which is where in <b>argv</b>
+the specifications begin. If it successfully parses <b>argc</b> and <b>argv,</b> then it creates the <b>gf_t</b> using <b>gf_init_hard()</b> (described
+below in section 6.4). It returns one past the last index of <b>argv</b> that it considered when creating the <b>gf_t.</b> If it fails, then
+it returns zero, and the <b>gf_t</b> is unmodified.
+
+
+
+<p>For example, <b>gf_mult.c</b> calls <b>create_gf_from_argv()</b> by simply passing <b>argc</b> and <b>argv</b> from its <b>main()</b> declaration,
+and setting starting to 4.</p>
+
+
+
+
+
+
+
+
+<br/>
+
+6 &nbsp &nbsp  <em>  THE DEFAULTS     </em>   <span id="index_number">16  </span> <br><br><br>
+
+<p>
+To choose defaults, <b>argv[starting]</b> should equal "-". Otherwise, you specify the component that you are changing
+with "-m" for multiplication technique, "-d" for division technique, "-r" for region technique, and "-p" for the
+polynomial. You may change multiple components. You end your specification with a single dash. For example, the
+following call multiplies 6 and 5 in <em>GF(2<sup>4</sup>)</em> with polynomial 0x19 using the "SHIFT" technique for multiplication
+(we'll explain these parameters later):
+</p><br><br>
+
+<div id="number_spacing">
+UNIX> ./gf_mult 6 5 4 -p 0x19 -m SHIFT -<br>
+7 <br>
+UNIX> <br><br>
+</div>
+
+<p>If <b>create_gf_from_argv()</b> fails, then you can call the procedure <b>gf_error(),</b> which prints out the reason why
+<b>create_gf_from_argv()</b> failed. </p>
+
+
+<h2>6.1.2 Changing the Polynomial </h2>
+
+Galois Fields are typically implemented by representing numbers as polynomials with binary coefficients, and then
+using the properties of polynomials to define addition and multiplication. You do not need to understand any of that to
+use this library. However, if you want to learn more about polynomial representations and how they construct fields,
+please refer to The Paper.
+
+<p>Multiplication is based on a special polynomial that we will refer to here as the "defining polynomial." This
+polynomial has binary coefficients and is of degree <em> w.</em> You may change the polynomial with "-p" and then a number
+in hexadecimal (the leading "0x" is optional). It is assumed that the <em>w</em>-th bit of the polynomial is set - you may include
+it or omit it. For example, if you wish to set the polynomial for GF(2<sup>16</sup>) to x<sup>16</sup> + x<sup>5</sup> + x<sup>3</sup> + x<sup>2</sup> + 1, rather than its
+default of x<sup>16</sup> + x<sup>12</sup> + x<sup>3</sup> + x + 1, you may say "-p 0x1002d," "-p 1002d," "-p 0x2d" or "-p 2d."
+We discuss changing the polynomial for three reasons in other sections: </p>
+<ul>
+<li>Leveraging carry-free multiplication (section 7.7). </li>
+<li>Defining composite fields (section 7.6). </li>
+<li>Implementing rings (section 7.8.1). </li>
+
+</ul>
+
+<p>
+Some words about nomenclature with respect to the polynomial. A Galois Field requires the polynomial to be
+<em>irreducible</em>. That means that it cannot be factored. For example, when the coefficients are binary, the polynomial x<sup>5</sup>+
+x<sup>4</sup>+x+1 may be factored as (x<sup>4</sup>+1)(x+1). Therefore it is not irreducible and cannot be used to define a Galois Field.
+It may, however, be used to define a ring. Please see section 7.8.1 for a discussion of ring support in GF-Complete. </p>
+<p>
+There is a subset of irreducible polynomials called primitive. These have an important property that one may enumerate
+all of the elements of the field by raising 2 to successive powers. All of the default polynomials in GF-Complete 
+are primitive. However, so long as a polynomial is irreducible, it defines a Galois Field. Please see section 7.7 for a
+further discussion of the polynomial. </p>
+
+<p>
+One thing that we want to stress here is that changing the polynomial changes the field, so fields with different
+polynomials may not be used interchangeably. So long as the polynomial is irreducible, it generates a Galois Field that
+is isomorphic to all other Galois Fields; however the multiplication and division of elements will differ. For example,
+the polynomials 0x13 (the default) and 0x19 in <em>GF(2<sup>4</sup>) </em> are both irreducible, so both generate valid Galois Fields.
+However, their multiplication differs: </p><br>
+
+<div id="number_spacing">
+UNIX> gf_mult 8 2 4 -p 0x13 - <br>
+3 <br>
+UNIX> gf_mult 8 2 4 -p 0x19 - <br>
+9 <br>
+</div>
+
+
+
+
+
+
+
+
+
+<br/>
+
+6 &nbsp &nbsp  <em>  THE DEFAULTS     </em>   <span id="index_number">17  </span> <br><br><br>
+
+<div id="number_spacing">
+UNIX> gf_div 3 8 4 -p 0x13 -<br>
+2 <br>
+UNIX> gf_div 9 8 4 -p 0x19 - <br>
+2 <br>
+UNIX> <br>
+
+</div>
+
+
+<h3>6.1.3 &nbsp &nbsp Changing the Multiplication Technique </h3>
+The following list describes the multiplication techniques that may be changed with "-m". We keep the description
+here brief. Please refer to The Paper for detailed descriptions of these techniques.<br><br>
+
+
+<li><b> "TABLE:" </b> Multiplication and division are implemented with tables. The tables consume quite a bit of memory
+(2<sup>w</sup> &#215; 2<sup>w</sup> &#215; <em>w</em>/8 bytes),
+so they are most useful when <em>w</em> is small. Please see <b>"SSE," "LAZY," "DOUBLE"</b> and
+
+<b>"QUAD"</b> under region techniques below for further modifications to <b>"TABLE"</b> to perform <b>multiply_region()</b></li><br>
+
+
+<li> <b>"LOG:"</b> This employs discrete (or "Zeph") logarithm <b>tables</b> to implement multiplication and division. The
+memory usage is roughly (3 &#215 2<sup>w</sup> &#215 w /
+8  bytes), so they are most useful when w is small, but they tolerate
+larger <em>w</em> than <b>"TABLE."</b> If the polynomial is not primitive (see section 6.1.2), then you cannot use <b>"LOG"</b> as
+an implementation. In that case,<b> gf_init_hard()</b> or <b>create_gf_from_argv()</b> will fail</li><br>
+
+
+<li><b> "LOG_ZERO:"</b> Discrete logarithm tables which include extra room for zero entries. This more than doubles
+the memory consumption to remove an <b>if</b> statement (please see [GMS08] or The Paper for more description). It
+doesn’t really make a huge deal of difference in performance</li><br>
+
+<li> <b>"LOG_ZERO_EXT:"</b> This expends even more memory to remove another <b>if</b> statement. Again, please see The
+Paper for an explanation. As with <b>"LOG_ZERO,"</b> the performance difference is negligible</li><br>
+
+<li> <b>"SHIFT:"</b> Implementation straight from the definition of Galois Field multiplication, by shifting and XOR-ing,
+then reducing the product using the polynomial. This is <em>slooooooooow,</em> so we don’t recommend you use it</li><br>
+
+
+<li> <b>"CARRY_FREE:"</b> This is identical to <b>"SHIFT,"</b> however it leverages the SSE instruction PCLMUL to perform
+carry-free multiplications in single instructions. As such, it is the fastest way to perform multiplication for large
+values of <em>w</em> when that instruction is available. Its performance depends on the polynomial used. See The Paper
+for details, and see section 7.7 below for the speedups available when <em>w </em>= 16 and <em>w</em> = 32 if you use a different
+polynomial than the default one</li><br>
+
+
+<li> <b>"BYTWO_p:"</b> This implements multiplication by successively multiplying the product by two and selectively
+XOR-ing the multiplicand. See The Paper for more detail. It can leverage Anvin’s optimization that multiplies
+64 and 128 bits of numbers in <em>GF(2<sup>w</sup>) </em> by two with just a few instructions. The SSE version requires SSE2</li><br>
+
+
+<li> <b>"BYTWO_b:"</b> This implements multiplication by successively multiplying the multiplicand by two and selectively
+XOR-ing it into the product. It can also leverage Anvin's optimization, and it has the feature that when
+you're multiplying a region by a very small constant (like 2), it can terminate the multiplication early. As such,
+if you are multiplying regions of bytes by two (as in the Linux RAID-6 Reed-Solomon code [Anv09]), this is
+the fastest of the techniques, regardless of the value of <em>w.</em> The SSE version requires SSE2</li><br>
+
+
+<li> <b>"SPLIT:"</b> Split multiplication tables (like the LR tables in [GMS08], or the SIMD tables for w &#8804 8 in [LHy08,
+Anv09, PGM13b]). This argument must be followed by two more arguments, w<sub>a</sub> and w<sub>b</sub>, which are the index
+sizes of the sub-tables. This implementation reduces the size of the table from <b>"TABLE,"</b> but requires multiple
+</li><br>
+
+
+
+
+
+
+<br/>
+
+6 &nbsp &nbsp  <em>  THE DEFAULTS     </em>   <span id="index_number">18 </span> <br><br><br>
+<ul>
+table lookups. For example, the following multiplies 100 and 200 in <em>GF(2<sup>8</sup>) </em> using two 4K tables, as opposed 
+to one 64K table when you use <b>"TABLE:"</b><br><br>
+<div id="number_spacing">
+UNIX> ./gf_mult 100 200 8 -m SPLIT 8 4 - <br>
+79<br>
+UNIX><br><br>
+</div>
+
+See section 7.4 for additional information on the arguments to <b>"SPLIT."</b> The SSE version typically requires
+SSSE3.<br><br>
+
+
+<li> <b>"GROUP:"</b> This implements the "left-to-right comb" technique [LBOX12]. I'm afraid we don't like that name,
+so we call it <b>"GROUP,"</b> because it performs table lookup on groups of bits for shifting (left) and reducing (right).
+It takes two additional arguments - g<sub>s,</sub> which is the number of bits you use while shifting (left) and g<sub>r</sub>, which
+is the number of bits you use while reducing (right). Increasing these arguments can give you higher computational
+speed, but requires more memory. SSE version exists only for <em> w </em> = 128 and it requires SSE4. For more
+description on the arguments g<sub>s</sub> and g<sub>r</sub>, see section 7.5. For a full description of <b>"GROUP"</b> algorithm, please
+see The Paper.
+</li><br>
+
+<li> <b>"COMPOSITE:"</b> This allows you to perform operations on a composite Galois Field, <em> GF((2<sup>l</sup>)<sup>k</sup>)</em> as described
+in [GMS08], [LBOX12] and The Paper. The field size <em>w </em> is equal to <em>lk.</em> It takes one argument, which is <em>k,</em> and
+then a specification of the base field. Currently, the only value of <em>k</em> that is supported is two. However, that may
+change in a future revision of the library. </li><br>
+
+
+In order to specify the base field, put appropriate flags after specifying <em>k.</em> The single dash ends the base field,
+and after that, you may continue making specifications for the composite field. This process can be continued
+for multiple layers of <b>"COMPOSITE."</b> As an example, the following multiplies 1000000 and 2000000
+in <em>GF((2<sup>16</sup>)<sup>2</sup>),</em> where the base field uses <b>BYTWO_p</b> for multiplication: <br><br>
+<center>./gf_mult 1000000 2000000 32 -m COMPOSITE 2 <span style="color:red">-m BYTWO_p - -</span> </center><br>
+
+In the above example, the red text applies to the base field, and the black text applies to the composite field.
+Composite fields have two defining polynomials - one for the composite field, and one for the base field. Thus, if
+you want to change polynomials, you should change both. The polynomial for the composite field must be of the
+form x<sup>2</sup>+sx+1, where s is an element of <em>GF(2<sup>k</sup>).</em> To change it, you specify s (in hexadecimal) with "-p." In the
+example below, we multiply 20000 and 30000 in <em>GF((2<sup>8</sup>)<sup>2</sup>) </em>, setting s to three, and using x<sup>8</sup>+x<sup>4</sup>+x<sup>3</sup>+x<sup>2</sup>+1
+as the polynomial for the base field: <br><br>
+
+<center>./gf_mult 20000 30000 16 -m COMPOSITE 2 <span style="color:red">-p 0x11d </span> - -p 0x3 - </center> <br><br>
+
+If you use composite fields, you should consider using <b>"ALTMAP"</b> as well. The reason is that the region
+operations will go much faster. Please see section 7.6.<br><br>
+As with changing the polynomial, when you use a composite field, <em> GF((2<sup>l</sup>)<sup>k</sup>)</em>, you are using a different field
+than the "standard" field for <em> GF((2<sup>l</sup>)<sup>k</sup>)</em>. All Galois Fields are isomorphic to each other, so they all have the
+desired properties; however, the fields themselves change when you use composite fields.<br><br>
+</ul>
+<p>
+With the exception of <b>"COMPOSITE"</b>, only one multiplication technique can be provided for a given Galois
+Field instance. Composite fields may use composite fields as their base fields, in which case the specification will be
+recursive. </p>
+
+
+
+
+
+
+
+
+<br/>
+
+6 &nbsp &nbsp  <em>  THE DEFAULTS     </em>   <span id="index_number">19 </span> <br><br><br>
+
+<h3>6.1.4 &nbsp &nbsp &nbsp Changing the Division Technique </h3>
+
+There are two techniques for division that may be set with "-d". If "-d" is not specified, then appropriate defaults
+are employed. For example, when the multiplication technique is <b>"TABLE,"</b> a table is created for division as well as
+multiplication. When <b>"LOG"</b> is specified, the logarithm tables are used for division. With <b>"COMPOSITE,"</b> a special
+variant of Euclid's algorithm is employed that performs division using multiplication and division in the base field.
+Otherwise, Euclid's algorithm is used. Please see The Paper for a description of Euclid's algorithm applied to Galois
+Fields.
+
+<p>If you use "-d", you must also specify the multiplication technique with "-m." </p>
+<p>To force Euclid's algorithm instead of the defaults, you may specify it with "-d EUCLID." If instead, you would
+rather convert elements of a Galois Field to a binary matrix and find an element's inverse by inverting the matrix,
+then specify "-d MATRIX." In all of our tests, <b>"MATRIX"</b> is slower than <b>"EUCLID." "MATRIX" </b> is also not defined
+for <em>w </em> > 32.
+</p>
+
+
+<h3>6.1.5  &nbsp&nbsp&nbsp Changing the Region Technique </h3>
+The following are the region multiplication options ("-r"):
+<ul>
+<li>
+<b>"SSE:"</b> Use SSE instructions. Initialization will fail if the instructions aren't supported. Table 2 details the
+multiplication techniques which can leverage SSE instructions and which versions of SSE are required. </li><br>
+
+<center>
+<div id="data1">
+<table cellpadding="6" cellspacing="0" style="text-align:center;font-size:19px">
+<tr>
+<th>Multiplication <br> Technique</th><th>multiply() </th><th>multiply_region() </th><th>SSE Version </th><th>Comments</th>
+
+</tr>
+<tr>
+<td><b>"TABLE"</b></td><td >- </td><td>Yes</td><td>SSSE3</td><td>Only for <em>GF(2<sup>4</sup>). </em></td>
+
+<tr>
+<td><b>"SPLIT"</b></td><td>-</td><td>Yes</td><td>SSSE3</td><td>Only when the second argument equals 4.</td>
+
+<tr>
+<td><b>"SPLIT"</b></td><td>- </td><td>Yes</td><td>SSE4</td><td>When <em>w </em> = 64 and not using <b>"ALTMAP".</b></td>
+
+<tr>
+<td><b>"BYTWO_p"</b></td><td>- </td><td>Yes</td><td>SSE2</td><td></td>
+
+<tr>
+<td><b>"BYTWO_b"</b></td><td>- </td><td>Yes</td><td>SSE2</td><td></td>
+
+</table></div> <br><br>
+Table 2: Multiplication techniques which can leverage SSE instructions when they are available.
+</center> <br><br>
+
+
+
+
+
+
+
+
+
+
+
+
+<li> <b>"NOSSE:"</b> Force non-SSE version </li><br>
+
+<li> <b> "DOUBLE:"</b> Use a table that is indexed on two words rather than one. This applies only to <em>w  </em> = 4, where
+the table is indexed on bytes rather than 4-bit quantities, and to <em>w </em> = 8, where the table is indexed on shorts
+rather than bytes. In each case, the table lookup performs two multiplications at a time, which makes region
+multiplication faster. It doubles the size of the lookup table. </li><br>
+
+<li> <b>"QUAD:"</b> Use a table that is indexed on four words rather than two or one. This only applies to <em>w </em> = 4, where
+the table is indexed on shorts. The "Quad" table may be lazily created or created ahead of time (the default). If
+the latter, then it consumes 2<sup>4</sup> &#215 2<sup>16</sup> &#215 2 = 2 MB of memory. </li><br>
+
+<li> <b> "LAZY:"</b> Typically it's clear whether tables are constructed upon initialization or lazily when a region operation
+is performed. There are two times where it is ambiguous: <b>"QUAD"</b> when <em>w </em> = 4 and <b>"DOUBLE"</b> when <em>w </em> = 8.
+If you don't specify anything, these tables are created upon initialization, consuming a lot of memory. If you
+specify <b>"LAZY,"</b> then the necessary row of the table is created lazily when you call <b>multiply_region().</b>
+</li>
+
+</ul>
+
+
+
+
+
+
+
+
+
+
+
+<br/>
+
+6 &nbsp &nbsp  <em>  THE DEFAULTS     </em>   <span id="index_number">20 </span> <br><br><br>
+<ul>
+
+<li> <b>"ALTMAP:"</b> Use an alternate mapping, where words are split across different subregions of memory. There
+are two places where this matters. The first is when implementing "<b>SPLIT</b> <em>w </em> 4" using SSE when <em>w </em> > 8. In
+these cases, each byte of the word is stored in a different 128-bit vector, which allows the implementation to
+better leverage 16-byte table lookups. See section 7.4 for examples, and The Paper or [PGM13b] for detailed
+explanations.<br><br> </li>
+
+The second place where it matters is when using <b>"COMPOSITE."</b> In this case, it is advantageous to split each
+memory region into two chunks, and to store half of each word in a different chunk. This allows us to call
+<b>multiply_region() </b> recursively on the base field, which is <em>much </em> faster than the alternative. See Section 7.6 for
+examples, and The Paper for an explanation.<br><br>
+
+It is important to note that with <b>"ALTMAP,"</b> the words are not "converted" from a standard mapping to an
+alternate mapping and back again. They are assumed to always be in the alternate mapping. This typically
+doesn't matter, so long as you always use the same <b>"ALTMAP"</b> calls. Please see section 7.9 for further details
+on <b>"ALTMAP,"</b> especially with respect to alignment.<br><br>
+
+<li> <b>"CAUCHY:"</b> Break memory into <em>w </em> subregions and perform only XOR's as in Cauchy Reed-Solomon coding
+[BKK<sup>+</sup>95] (also described in The Paper). This works for <em>any</em> value of <em>w </em> &#8804 32, even those that are not
+powers of two. If SSE2 is available, then XOR's work 128 bits at a time. For <b>"CAUCHY"</b> to work correctly,
+<em>size</em> must be a multiple of <em>w </em>.</li> </ul>
+
+
+
+<p>It is possible to combine region multiplication options. This is fully supported as long as <b>gf_methods</b> has the combination
+listed. If multiple region options are required, they should be specified independently (as flags for <b>gf_init_hard()</b>
+and independent options for command-line tools and <b>create_gf_from_argv()).</b> </p>
+
+
+<h3>6.2  &nbsp&nbsp&nbspDetermining Supported Techniques with gf_methods </h3>
+
+
+The program <b>gf_methods</b> prints a list of supported methods on standard output. It is called as follows:<br><br>
+<div id="number_spacing">
+<center>./gf_methods <em>w </em> -BADC -LUMDRB <br><br> </center> </div>
+
+The first argument is <em>w </em>, which may be any legal value of <em>w </em>. The second argument has the following flags: <br><br>
+<ul>
+
+<li> <b>"B:"</b> This only prints out "basic" methods that are useful for the given value of <em>w </em>. It omits <b>"SHIFT"</b> and other
+methods that are never really going to be useful.</li><br>
+
+<li> <b> "A:"</b> In contrast, this specifies to print "all" methods. </li><br>
+
+<li> <b>"D:"</b> This includes the <b>"EUCLID"</b> and <b>"MATRIX"</b> methods for division. By default, they are not included. </li><br>
+
+<li> <b>"C:"</b> This includes the <b>"CAUCHY"</b> methods for region multiplication. By default, it is not included.</li> <br>
+</ul>
+<p>
+You may specify multiple of these as the second argument. If you include both <b>"B"</b> and <b>"A,"</b> then it uses the last
+one specified. </p>
+<p>
+The last argument determines the output format of <b>gf_methods.</b> If it is <b>"L,"</b> then it simply lists methods. If it
+is <b>"U,"</b> then the output contains <b>gf_unit</b> commands for each of the methods. For the others, the output contains
+<b>time_tool.sh</b> commands for <b>M</b>ultiplication, <b>D</b>ivision, <b>R</b>egion multiplications with multiple buffer sizes, and the
+<b>B</b>est region multiplication. </p>
+<p>
+<b>gf_methods</b> enumerates combinations of flags, and calls <b>create_gf_from_argv()</b> to see if the combinations are
+supported. Although it enumerates a large number of combinations, it doesn't enumerate all possible parameters for
+<b>"SPLIT," "GROUP"</b> or <b>"COMPOSITE."</b> </p>
+
+<p>Some examples of calling <b>gf_methods</b> are shown below in section 6.3.2. </p>
+
+
+
+
+
+
+
+<br/>
+
+6 &nbsp &nbsp  <em>  THE DEFAULTS     </em>   <span id="index_number">21 </span> <br><br><br>
+
+
+<h3>6.3 Testing with <b>gf_unit </b>, <b>gf_time </b>, and time_tool.sh </h3>
+
+
+
+<b>gf_unit </b> and <b>gf_time </b> may be used to verify that a combination of arguments works correctly and efficiently on your
+platform. If you plan to stray from the defaults, it is probably best to run both tools to ensure there are no issues with
+your environment. <b>gf_unit </b> will run a set of unit tests based on the arguments provided to the tool, and <b>gf_time </b> will
+time Galois Field methods based on the provided arguments.<br>
+The usage of gf_unit is:<br><br>
+<div id="number_spacing">
+<b>gf_unit </b> w tests seed method<br><br> </div>
+The usage of gf_time is:<br><br>
+<div id="number_spacing">
+<b>gf_time </b> w tests seed buffer-size iterations method<br><br>
+</div>
+
+
+
+The seed is an integer; negative one uses the current time. The tests are specified by a listing of characters. The
+following tests are supported (All are supported by <b>gf_time.</b> Only 'A', 'S' and 'R' are supported by <b>gf_unit</b>):<br><br>
+
+<ul>
+<li> <b>'M':</b> Single multiplications</li><br>
+<li> <b> 'D':</b> Single divisions</li><br>
+<li> <b> 'I':</b> Single inverses</li><br>
+<li> <b>'G': </b> Region multiplication of a buffer by a random constant</li><br>
+<li> <b>'0': </b> Region multiplication of a buffer by zero (does nothing and <b>bzero()</b>)</li><br>
+<li> <b>'1': </b> Region multiplication of a buffer by one (does <b>memcpy()</b> and <b>XOR</b>)</li><br>
+<li> <b>'2': </b> Region multiplication of a buffer by two – sometimes this is faster than general multiplication</li><br>
+<li> <b>'S':</b> All three single tests</li><br>
+<li> <b>'R':</b> All four region tests</li><br>
+<li> <b>'A':</b> All seven tests</li><br>
+</ul>
+
+
+
+
+
+<p>Here are some examples of calling <b>gf_unit</b> and <b>gf_time</b> to verify that <b>"-m SPLIT 32 4 -r ALTMAP -"</b> works
+in <em>GF(2<sup>32</sup>),</em> and to get a feel for its performance. First, we go to the test directory and call <b>gf_unit:</b> </p><br><br>
+
+
+<div id="number_spacing">
+UNIX> cd test <br>
+UNIX> ./gf_unit 32 A -1 -m SPLIT 32 4 -r ALTMAP - <br>
+Args: 32 A -1 -m SPLIT 32 4 -r ALTMAP - / size (bytes): 684 <br>
+UNIX> <br><br>
+</div>
+
+<b>gf_unit</b> reports on the arguments and how many bytes the <b>gf_t</b> consumes. If it discovers any problems or inconsistencies
+with multiplication, division or region multiplication, it will report them. Here, there are no problems.
+Next, we move to the <b>tools</b> directory and run performance tests on a 10K buffer, with 10,000 iterations of each test:<br><br>
+
+
+UNIX> cd ../tools <br>
+UNIX> ./gf_time 32 A -1 10240 10000 -m SPLIT 32 4 -r ALTMAP -<br>
+Seed: 1388435794 <br>
+<div id="number_spacing">
+<table cellpadding="0" cellspacing="25" style="font-size:19px;font-family: 'Roboto Condensed', sans-serif;
+">
+
+<tr>
+
+<td>Multiply:</td> <td>4.090548 s</td> <td> Mops: </td> <td> 24.414 </td> <td>5.968 Mega-ops/s </td> </tr>
+<tr><td>Divide:</td> <td> 37.794962 s </td> <td>Mops: </td> <td> 24.414 </td> <td>0.646 Mega-ops/s </td> </tr>
+<tr><td>Inverse:</td> <td> 33.709875 s </td> <td> Mops: </td> <td> 24.414 </td> <td> 0.724 Mega-ops/s </td> </tr>
+<tr><td>Region-Random: XOR: 0 </td> <td> 0.035210 s </td> <td> MB:</td> <td> 97.656 </td> <td> 2773.527 MB/s </td></tr>
+<tr><td>Region-Random: XOR: 1 </td> <td> 0.036081 s</td> <td> MB:</td> <td> 97.656 </td> <td>2706.578 MB/s </td></tr>
+<tr><td>Region-By-Zero:XOR: 0 </td> <td> 0.003199 s </tD> <td> MB: </td> <td>97.656 </td> <td> 30523.884 MB/s </td> </tr>
+<tr><td>Region-By-Zero: XOR: 1 </td> <td> 0.000626 s  </td> <td>MB: </td> <td> 97.656 </td> <td> 156038.095 MB/s </td></tr>
+
+</table>
+</div>
+
+
+
+
+
+
+
+
+
+
+<br/>
+
+6 &nbsp &nbsp  <em>  THE DEFAULTS     </em>   <span id="index_number">22 </span> <br><br><br>
+
+<div id="number_spacing">
+<table cellpadding="0" cellspacing="10" style="font-family: 'Roboto Condensed', sans-serif;
+">
+
+<tr>
+<td>Region-By-One: XOR: 0</td> <td> 0.003810 s</td> <td> MB:</td> <td> 97.656 </td> <td> 25628.832 MB/s </td>
+<tr><td>Region-By-One: XOR: 1 </td> <td> 0.008363 s </td> <td> MB:</td> <td> 97.656 </tD> <td>11677.500 MB/s </td></tr>
+
+<tr><td>Region-By-Two: XOR: 0 </td> <td>0.032942 s  </td> <td>MB: </td> <td> 97.656 </td> <td> 2964.486 MB/s </td> </tr>
+<tr><td>Region-By-Two: XOR: 1 </td> <td> 0.033488 s </td> <td> MB: </td> <td> 97.656 </td> <td> 2916.153 MB/s </td> </tr> </table>
+</div>
+UNIX><br><br>
+
+<p>The first column of output displays the name of the test performed. Region tests will test with and without the XOR
+flag being set (see Section 4.3 for an example). The second column displays the total time the test took to complete
+measured in seconds (s). The third column displays the size of the test measured in millions of operations (Mops) for
+single tests and in Megabytes (MB) for the region tests. The final column displays the speed of the tests calculated
+from the second and third columns, and is where you should look to get an idea of a method's performance.</p>
+<p>
+If the output of <b>gf_unit</b> and <b>gf_time</b> are to your satisfaction, you can incorporate the method into application code
+using <b>create_gf_from_argv()</b> or <b>gf_init_hard().</b></p>
+<p>
+The performance of "Region-By-Zero" and "Region-By-One" will not change from test to test, as all methods make
+the same calls for these. "Region-By-Zero" with "XOR: 1" does nothing except set up the tests. Therefore, you may
+use it as a control.</p>
+
+<h3>6.3.1 &nbsp &nbsp &nbsp time_tool.sh </h3> 
+
+Finally, the shell script <b>time_tool.sh</b> makes a bunch of calls to <b>gf_time</b> to give a rough estimate of performance. It is
+called as follows:<br><br>
+usage sh time_tool.sh M|D|R|B w method<br><br>
+
+
+<p>The values for the first argument are <b>MDRB,</b> for <b>M</b>ultiplication, <b>D</b>ivision, <b>R</b>egion multiplications with multiple
+buffer sizes, and the <b>B</b>est region multiplication. For the example above, let's call <b>time_tool.sh</b> to get a rough idea of
+performance: </p><br><br>
+
+<div id="number_spacing">
+UNIX> sh time_tool.sh M 32 -m SPLIT 32 4 -r ALTMAP - <br>
+M speed (MB/s): 6.03 W-Method: 32 -m SPLIT 32 4 -r ALTMAP - <br>
+UNIX> sh time_tool.sh D 32 -m SPLIT 32 4 -r ALTMAP - <br>
+D speed (MB/s): 0.65 W-Method: 32 -m SPLIT 32 4 -r ALTMAP - <br>
+UNIX> sh time_tool.sh R 32 -m SPLIT 32 4 -r ALTMAP - <br>
+
+<table cellpadding="0" cellspacing="10" style="font-family: 'Roboto Condensed', sans-serif;
+">
+
+<tr>
+<td>Region Buffer-Size:</td> <td> 16K (MB/s):</td> <td>3082.91</td><td> W-Method: 32 </td> <td>-m SPLIT 32 4 </td> <td>-r ALTMAP -</td> </tr>
+<tr><td>Region Buffer-Size:</td> <td>32K (MB/s): </td> <td>3529.07 </td><td> W-Method: 32 </td> <td>-m SPLIT 32 4 </td> <td>-r ALTMAP -</td> </tr>
+<tr><td>Region Buffer-Size:</td> <td>64K (MB/s): </td> <td> 3749.94</td><td> W-Method: 32 </td> <td>-m SPLIT 32 4 </td> <td>-r ALTMAP -</td> </tr>
+<tr><td>Region Buffer-Size:</td> <td>128K (MB/s):</td> <td>3861.27 </td> <td>W-Method: 32 </td> <td>-m SPLIT 32 4 </td> <td>-r ALTMAP -</td> </tr>
+<tr><td>Region Buffer-Size:</td> <td>512K (MB/s):</td> <td>3820.82 </td><td> W-Method: 32 </td> <td>-m SPLIT 32 4 </td> <td>-r ALTMAP -</td> </tr>
+<tr><td>Region Buffer-Size:</td> <td>1M (MB/s):</td> <td>3737.41 </td><td> W-Method: 32 </td> <td>-m SPLIT 32 4 </td> <td>-r ALTMAP -</td>  </tr>
+<tr><td>Region Buffer-Size:</td> <td>2M (MB/s):</td> <td>3002.90 </td><td> W-Method: 32 </td> <td>-m SPLIT 32 4 </td> <td>-r ALTMAP -</td> </tr>
+<tr><td>Region Buffer-Size:</td> <td>4M (MB/s): </td><td>2760.77</td><td> W-Method: 32 </td> <td>-m SPLIT 32 4 </td> <td>-r ALTMAP -</td> </tr>
+<tr><td>Region Best (MB/s):</td><td> 3861.27</td><td> W-Method: 32 </td> <td>-m SPLIT 32 4 </td> <td>-r ALTMAP -</td> </tr>
+</table>
+
+UNIX> sh time_tool.sh B 32 -m SPLIT 32 4 -r ALTMAP - <br>
+Region Best (MB/s): 3929.09  W-Method: 32  -m SPLIT 32 4 -r ALTMAP -<br>
+UNIX><br><br>
+</div>
+<p>
+We say that <b>time_tool.sh </b>is "rough" because it tries to limit each test to 5 ms or less. Thus, the time granularity
+is fine, which means that the numbers may not be as precise as they could be were the time granularity to be coarse.
+When in doubt, you should make your own calls to <b>gf_time</b> with a lot of iterations, so that startup costs and roundoff
+error may be minimized. </p>
+
+
+
+
+
+
+
+
+<br/>
+
+6 &nbsp &nbsp  <em>  THE DEFAULTS     </em>   <span id="index_number">23 </span> <br><br><br>
+
+<h3>6.3.2 &nbsp &nbsp &nbsp An example of gf_methods and time_tool.sh </h3><br><br>
+Let's give an example of how some of these components fit together. Suppose we want to explore the basic techniques
+in <em>GF(2<sup>32</sup>).</em> First, let's take a look at what <b>gf_methods</b> suggests as "basic" methods: <br><br>
+<div id="number_spacing">
+UNIX> gf_methods 32 -B -L <br>
+w=32: - <br>
+w=32: -m GROUP 4 8 - <br>
+w=32: -m SPLIT 32 4 - <br>
+w=32: -m SPLIT 32 4 -r ALTMAP - <br>
+w=32: -m SPLIT 32 8 - <br>
+w=32: -m SPLIT 8 8 - <br>
+w=32: -m COMPOSITE 2 - - <br>
+w=32: -m COMPOSITE 2 - -r ALTMAP - <br>
+UNIX> <br><br>
+</div>
+
+
+<p>
+
+You'll note, this is on my old Macbook Pro, which doesn't support (PCLMUL), so <b>"CARRY_FREE"</b> is not included
+as an option. Now, let's run the unit tester on these to make sure they work, and to see their memory consumption: </p><br><br>
+
+<div id="number_spacing">
+UNIX> gf_methods 32 -B -U <br>
+../test/gf_unit 32 A -1 - <br>
+../test/gf_unit 32 A -1 -m GROUP 4 8 - <br>
+../test/gf_unit 32 A -1 -m SPLIT 32 4 - <br>
+../test/gf_unit 32 A -1 -m SPLIT 32 4 -r ALTMAP - <br>
+../test/gf_unit 32 A -1 -m SPLIT 32 8 - <br>
+../test/gf_unit 32 A -1 -m SPLIT 8 8 - <br>
+../test/gf_unit 32 A -1 -m COMPOSITE 2 - - <br>
+../test/gf_unit 32 A -1 -m COMPOSITE 2 - -r ALTMAP - <br>
+UNIX> gf_methods 32 -B -U | sh <br>
+Args: 32 A -1 - / size (bytes): 684 <br>
+Args: 32 A -1 -m GROUP 4 8 - / size (bytes): 1296 <br>
+Args: 32 A -1 -m SPLIT 32 4 - / size (bytes): 684 <br>
+Args: 32 A -1 -m SPLIT 32 4 -r ALTMAP - / size (bytes): 684 <br>
+Args: 32 A -1 -m SPLIT 32 8 - / size (bytes): 4268 <br>
+Args: 32 A -1 -m SPLIT 8 8 - / size (bytes): 1839276 <br>
+Args: 32 A -1 -m COMPOSITE 2 - - / size (bytes): 524648 <br>
+Args: 32 A -1 -m COMPOSITE 2 - -r ALTMAP - / size (bytes): 524648 <br>
+UNIX> <br> <br>
+</div>
+<p>
+As anticipated, <b>"SPLIT 8 8"</b> consumes quite a bit of memory! Now, let's see how well they perform with both
+single multiplications and region multiplications: </p> <br><br>
+<div id="number_spacing">
+UNIX> gf_methods 32 -B -M <br>
+sh time_tool.sh M 32 - <br>
+sh time_tool.sh M 32 -m GROUP 4 8  - <br>
+sh time_tool.sh M 32 -m SPLIT 32 4 - <br>
+sh time_tool.sh M 32 -m SPLIT 32 4 -r ALTMAP -<br>
+sh time_tool.sh M 32 -m SPLIT 32 8 - <br>
+sh time_tool.sh M 32 -m SPLIT 8 8 - <br>
+
+</div>
+
+
+
+
+
+
+
+
+<br/>
+
+6 &nbsp &nbsp  <em>  THE DEFAULTS     </em>   <span id="index_number">24 </span> <br><br><br>
+
+<div id="number_spacing">
+sh time_tool.sh M 32 -m COMPOSITE 2 - <br>
+sh time_tool.sh M 32 -m COMPOSITE 2 - -r ALTMAP <br>
+UNIX> gf_methods 32 -B -M | sh <br>
+M speed (MB/s): 5.90 W-Method: 32 <br>
+M speed (MB/s): 14.09 W-Method: 32 -m GROUP 4 8 <br>
+M speed (MB/s): 5.60 W-Method: 32 -m SPLIT 32 4 <br>
+M speed (MB/s): 5.19 W-Method: 32 -m SPLIT 32 4 -r ALTMAP <br>
+M speed (MB/s): 5.98 W-Method: 32 -m SPLIT 32 8 <br>
+M speed (MB/s): 22.10 W-Method: 32 -m SPLIT 8 8 <br>
+M speed (MB/s): 34.98 W-Method: 32 -m COMPOSITE 2 - <br>
+M speed (MB/s): 34.16 W-Method: 32 -m COMPOSITE 2 - -r ALTMAP <br>
+UNIX> gf_methods 32 -B -B | sh <br>
+Region Best (MB/s): 2746.76 W-Method: 32 <br>
+Region Best (MB/s): 177.06 W-Method: 32 -m GROUP 4 8 <br>
+Region Best (MB/s): 2818.75 W-Method: 32 -m SPLIT 32 4 <br>
+Region Best (MB/s): 3818.21 W-Method: 32 -m SPLIT 32 4 -r ALTMAP <br>
+Region Best (MB/s): 728.68 W-Method: 32 -m SPLIT 32 8 <br>
+Region Best (MB/s): 730.97 W-Method: 32 -m SPLIT 8 8 <br>
+Region Best (MB/s): 190.20 W-Method: 32 -m COMPOSITE 2 - <br>
+Region Best (MB/s): 1837.99 W-Method: 32 -m COMPOSITE 2 - -r ALTMAP <br>
+UNIX>
+</div>
+<p>
+The default is quite a bit slower than the best performing methods for both single and region multiplication. So
+why are the defaults the way that they are? As detailed at the beginning of this chapter, we strive for lower memory
+consumption, so we don't use <b>"SPLIT 8 8,"</b> which consumes 1.75MB. We don't implement alternate fields by default,
+which is why we don't use <b>"COMPOSITE."</b> Finally, we don't implement alternate mappings of memory by default,
+which is why we don't use "<b>-m SPLIT 32 4 -r ALTMAP -.</b>"</p>
+
+<p>Of course, you may change these defaults if you please.</p>
+<p>
+<b>Test question:</b> Given the numbers above, it would appear that <b>"COMPOSITE"</b> yields the fastest performance of
+single multiplication, while "SPLIT 32 4" yields the fastest performance of region multiplication. Should I use two
+gf_t's in my application – one for single multiplication that uses <b>"COMPOSITE,"</b> and one for region multiplication
+that uses <b>"SPLIT 32 4?"</b></p>
+<p>
+The answer to this is "no." Why? Because composite fields are different from the "standard" fields, and if you mix
+these two <b>gf_t</b>'s, then you are using different fields for single multiplication and region multiplication. Please read
+section 7.2 for a little more information on this.</p>
+
+<h3>6.4 &nbsp &nbsp &nbspCalling gf_init_hard()</h3>
+
+We recommend that you use <b>create_gf_from_argv()</b> instead of <b>gf_init_hard().</b> However, there are extra things that
+you can do with <b>gf_init_hard().</b> Here's the prototype:<br><br>
+<div id="number_spacing">
+int gf_init_hard(gf_t *gf,<br>
+<div style="padding-left:100px">
+int w,<br>
+int mult_type,<br>
+int region_type,<br>
+int divide_type,<br>
+uint64_t prim_poly,<br>
+int arg1,<br>
+int arg2,<br>
+</div>
+</div>
+
+
+
+
+
+
+
+
+<br/>
+
+6 &nbsp &nbsp  <em>  THE DEFAULTS     </em>   <span id="index_number">25 </span> <br><br><br>
+<div id="number_spacing">
+<div style="padding-left:100px">
+GFP base_gf, <br>
+void *scratch_memory); </div><br><br>
+
+
+The arguments mult_type, region_type and divide_type allow for the same specifications as above, except the
+types are integer constants defined in gf_complete.h: <br><br>
+typedef enum {GF_MULT_DEFAULT,<br>
+<div style="padding-left:124px">
+GF_MULT_SHIFT,<br>
+GF_MULT_CARRY_FREE,<br>
+GF_MULT_GROUP,<br>
+GF_MULT_BYTWO_p,<br>
+GF_MULT_BYTWO_b,<br>
+GF_MULT_TABLE,<br>
+GF_MULT_LOG_TABLE,<br>
+GF_MULT_LOG_ZERO,<br>
+GF_MULT_LOG_ZERO_EXT,<br>
+GF_MULT_SPLIT_TABLE,<br>
+GF_MULT_COMPOSITE } gf_mult_type_t;<br><br>
+
+</div>
+
+#define GF_REGION_DEFAULT (0x0)<br>
+#define GF_REGION_DOUBLE_TABLE (0x1) <br>
+#define GF_REGION_QUAD_TABLE (0x2) <br>
+#define GF_REGION_LAZY (0x4) <br>
+#define GF_REGION_SSE (0x8) <br>
+#define GF_REGION_NOSSE (0x10) <br>
+#define GF_REGION_ALTMAP (0x20) <br>
+#define GF_REGION_CAUCHY (0x40) <br><br>
+typedef enum { GF_DIVIDE_DEFAULT,<br>
+<div style="padding-left:130px">GF_DIVIDE_MATRIX,<br>
+GF_DIVIDE_EUCLID } gf_division_type_t;<br><br>
+</div>
+</div>
+<p>
+You can mix the region types with bitwise or. The arguments to <b>GF_MULT_GROUP,GF_MULT_SPLIT_TABLE</b>
+and <b>GF_MULT_COMPOSITE</b> are specified in arg1 and arg2. <b>GF_MULT_COMPOSITE</b> also takes a base field
+in <b>base_gf.</b> The base field is itself a <b>gf_t,</b> which should have been created previously with <b>create_gf_from_argv(),</b>
+<b>gf_init_easy()</b> or <b>gf_init_hard().</b> Note that this <b>base_gf</b> has its own <b>base_gf</b> member and can be a composite field
+itself.</p>
+<p>
+You can specify an alternate polynomial in <b>prim_poly.</b> For <em>w </em>&#8804 32, the leftmost one (the one in bit position <em>w</em>) is
+optional. If you omit it, it will be added for you. For <em>w </em> = 64, there's no room for that one, so you have to leave it off.
+For <em>w </em>= 128, your polynomial can only use the bottom-most 64 bits. Fortunately, the standard polynomial only uses
+those bits. If you set <b>prim_poly</b> to zero, the library selects the "standard" polynomial.
+</p>
+<p>
+Finally, <b>scratch_memory</b> is there in case you don't want <b>gf_init_hard()</b> to call <b>malloc()</b>. You may call <b>gf_scratch_size()</b>
+to find out how much extra memory each technique uses, and then you may pass it a pointer for it to use in <b>scratch_memory.</b>
+If you set <b>scratch_memory</b> to NULL, then the extra memory is allocated for you with <b>malloc().</b> If you use <b>gf_init_easy()</b>
+or <b>create_gf_from_argv(),</b> or you use <b>gf_init_hard()</b> and set <b>scratch_memory</b> to <b>NULL,</b> then you should call <b>gf_free()</b>
+to free memory. If you use <b>gf_init_hard()</b> and use your own <b>scratch_memory</b> you can still call <b>gf_free(),</b> and it will
+not do anything.</p>
+<p>
+Both <b>gf_init_hard()</b> and <b>gf_scratch_size()</b> return zero if the arguments don't specify a valid <b>gf_t.</b> When that happens,
+you can call <b>gf_error()</b> to print why the call failed.</p>
+
+
+
+
+
+
+
+
+<br/>
+
+
+6 &nbsp &nbsp  <em>  FURTHER INFORMATION ON OPTIONS AND ALGORITHMS     </em>   <span id="index_number">26  </span> <br><br><br>
+
+
+<p>We'll give you one example of calling <b>gf_init_hard().</b> Suppose you want to make a <b>gf_init_hard()</b> call to be
+equivalent to "-m SPLIT 16 4 -r SSE -r ALTMAP -" and you want to allocate the scratch space yourself. Then you'd
+do the following:</p><br><br>
+
+<div id="number_spacing">
+gf_t gf; <br>
+void *scratch; <br>
+int size; <br>
+size = gf_scratch_size(16, GF_MULT_SPLIT_TABLE,<br>
+GF_REGION_SSE | GF_REGION_ALTMAP,<br>
+GF_DIVIDE_DEFAULT,<br>
+16, 4); <br>
+if (size == 0) { gf_error(); exit(1); } /* It failed. That shouldn’t happen */<br>
+scratch = (void *) malloc(size); <br>
+if (scratch == NULL) { perror("malloc"); exit(1); } <br>
+if (!gf_init_hard(&gf, 16, GF_MULT_SPLIT_TABLE, <br>
+GF_REGION_SSE | GF_REGION_ALTMAP, <br>
+GF_DIVIDE_DEFAULT,<br>
+0, 16, 4, NULL, scratch)) { <br>
+gf_error(); <br>
+exit(1); <br>
+} <br>
+
+</div>
+
+
+<h3>6.5 &nbsp   &nbsp   gf_size() </h3>
+
+You can call <b>gf_size(gf_t *gf)</b> to learn the memory consumption of the <b>gf_t.</b> It returns all memory consumed by the
+<b>gf_t,</b> including the <b>gf_t</b> itself, any scratch memory required by the gf_t, and the memory consumed by the sub-field
+if the field is <b>"COMPOSITE."</b> If you provided your own memory to <b>gf_init_hard(),</b> it does not report the size of
+this memory, but what the size should be, as determined by <b>gf_scratch_size(). gf_unit() </b> prints out the return value of
+<b>gf_size()</b> on the given field.
+
+
+<h2>7 &nbsp Further Information on Options and Algorithms </h2>
+<h3>
+7.1 &nbsp Inlining Single Multiplication and Division for Speed </h3>
+
+Obviously, procedure calls are more expensive than single instructions, and the mechanics of multiplication in <b>"TABLE"</b>
+and <b>"LOG"</b> are pretty simple. For that reason, we support inlining for <b>"TABLE"</b> when <em>w </em> = 4 and <em>w </em> = 8, and
+for <b>"LOG"</b> when <em>w </em> = 16. We elaborate below.
+<p>
+When <em>w </em> = 4, you may inline multiplication and division as follows. The following procedures return pointers to
+the multiplication and division tables respectively: </p> <br><br>
+
+<div id="number_spacing">
+uint8_t *gf_w4_get_mult_table(gf_t * gf);<br>
+uint8_t *gf_w4_get_div_table(gf_t * gf);<br><br>
+</div>
+<p>The macro <b>GF_W4_INLINE_MULTDIV </b>(<em>table, a, b</em>) then multiplies or divides <em>a </em> by <em>b</em> using the given table. This
+of course only works if the multiplication technique is <b>"TABLE,"</b> which is the default for <em>w </em> = 4. If the multiplication
+technique is not <b>"TABLE,"</b> then <b>gf_w4_get_mult_table()</b> will return <b>NULL.</b></p>
+
+
+
+
+
+
+
+
+<br/>
+
+
+6 &nbsp &nbsp  <em>  FURTHER INFORMATION ON OPTIONS AND ALGORITHMS     </em>   <span id="index_number">27  </span> <br><br><br>
+
+
+
+
+<p>When <em>w </em> = 8, the procedures <b>gf_w8_get_mult_table()</b> and <b>gf_w8_get_div_table(),</b> and the macro </p>
+
+<b>GF_W8_INLINE_MULTDIV </b>(<em>table, a, b</em>) work identically to the <em>w </em> = 4 case.
+
+<p>When <em>w </em> = 16, the following procedures return pointers to the logarithm table, and the two inverse logarithm tables
+respectively: </p><br>
+
+<div id="number_spacing">
+uint16_t *gf_w16_get_log_table(gf_t * gf); <br>
+uint16_t *gf_w16_get_mult_alog_table(gf_t * gf);<br>
+uint16_t *gf_w16_get_div_alog_table(gf_t * gf);<br>
+
+</div>
+<br>
+<p>
+The first inverse logarithm table works for multiplication, and the second works for division. They actually point
+to the same table, but to different places in the table. You may then use the macro <b>GF_W16_INLINE_MULT</b>(<em>log,
+alog, a, b </em>) to multiply <em>a</em> and <em>b</em>, and the macro <b>GF_W16_INLINE_DIV </b>(<em>log, alog, a, b </em>) to divide a and b. Make
+sure you use the <em>alog</em> table returned by <b>gf_w16_get_mult_alog_table()</b> for multiplication and the one returned by
+<b>gf_w16_get_div_alog_table()</b> for division. Here are some timings: </p> <br><br>
+
+
+UNIX> gf_time 4 M 0 10240 10240 - <br>
+Seed: 0 <br>
+Multiply: 0.228860 s Mops: 100.000 436.949 Mega-ops/s <br>
+UNIX> gf_inline_time 4 0 10240 10240 <br>
+Seed: 0 <br>
+Inline mult: 0.096859 s Mops: 100.000 1032.424 Mega-ops/s <br>
+UNIX> gf_time 8 M 0 10240 10240 - <br>
+Seed: 0 <br>
+Multiply: 0.228931 s Mops: 100.000 436.812 Mega-ops/s <br>
+UNIX> gf_inline_time 8 0 10240 10240 <br>
+Seed: 0 <br>
+Inline mult: 0.114300 s Mops: 100.000 874.889 Mega-ops/s <br>
+UNIX> gf_time 16 M 0 10240 10240 - <br>
+Seed: 0 <br>
+Multiply: 0.193626 s Mops: 50.000 258.229 Mega-ops/s <br>
+UNIX> gf_inline_time 16 0 10240 10240 <br>
+Seed: 0 <br>
+Inline mult: 0.310229 s Mops: 100.000 322.342 Mega-ops/s <br>
+UNIX> <br> <br>
+
+<h3>
+7.2 &nbsp &nbsp Using different techniques for single and region multiplication </h3>
+
+
+You may want to "mix and match" the techniques. For example, suppose you'd like to use "-m SPLIT 8 8" for
+<b>multiply()</b> in <em>GF(2<sup>32</sup>),</em> because it's fast, and you don't mind consuming all of that space for tables. However, for
+<b>multiply_region(),</b> you'd like to use "-m SPLIT 32 4 -r ALTMAP," because that's the fastest way to implement
+<b>multiply_region().</b> Unfortunately, there is no way to create a <b>gf_t</b> that does this combination. In this case, you should
+simply create two <b>gf_t's,</b> and use one for <b>multiply()</b> and the other for <b>multiply_region().</b> All of the implementations
+may be used interchangeably with the following exceptions:
+
+<ul>
+<li>
+<b>"COMPOSITE"</b> implements a different Galois Field. </li><br>
+
+<li>If you change a field's polynomial, then the resulting Galois Field will be different. </li>
+
+</ul>
+
+
+
+
+
+
+
+
+<br/>
+
+
+6 &nbsp &nbsp  <em>  FURTHER INFORMATION ON OPTIONS AND ALGORITHMS     </em>   <span id="index_number">28  </span> <br><br><br>
+
+<ul>
+<li>
+
+If you are using <b>"ALTMAP"</b> to multiply regions, then the contents of the resulting regions of memory will
+depend on the multiplication technique, the size of the region and its alignment. Please see section 7.9 for a
+detailed explanation of this. </li>
+
+<li>If you are using <b>"CAUCHY"</b> to multiply regions, then like <b>"ALTMAP,"</b> the contents of the result regions of
+memory depend on the multiplication technique and the size of the region. You don't have to worry about alignment. </li>
+</ul>
+
+<h3>7.3 &nbsp &nbsp General <em>w </em>  </h3>
+The library supports Galois Field arithmetic with 2 &lt; <em>w </em> &#8804; 32. Values of <em>w </em> which are not whole number powers of
+2 are handled by the functions in <b>gf_wgen.c</b>. For these values of <em>w</em>, the available multiplication types are <b>"SHIFT,"
+"BYTWO_p," "BYTWO_b," "GROUP," "TABLE"</b> and <b>"LOG." "LOG" </b> is only valid for <em>w </em> &lt; 28 and <b>"TABLE"</b>
+
+is only valid for <em>w </em> &lt; 15. The defaults for these values of <em>w </em> are <b>"TABLE"</b> for <em>w </em> &lt; 8, <b>"LOG"</b> for <em>w </em> &lt; 16, and
+<b>"BYTWO_p"</b> for <em>w </em> &lt; 32.<br><br>
+
+<h3>7.4 Arguments to "SPLIT" </h3>
+
+The "SPLIT" technique is based on the distributive property of multiplication and addition: <br><br>
+<center>
+a * (b + c) = (a * b) + (a * c). </center>
+<br>
+This property allows us to, for example, split an eight bit word into two four-bit components and calculate the product
+by performing two table lookups in 16-element tables on each of the components, and adding the result. There is much
+more information on <b>"SPLIT"</b> in The Paper. Here we describe the version of <b>"SPLIT"</b> implemented in GF-Complete.
+
+<p>
+<b>"SPLIT"</b> takes two arguments, which are the number of bits in each component of a, which we call <em>w </em><sub>a</sub>, and the
+number of bits in each component of b, which we call <em>w </em><sub>b.</sub> If the two differ, it does not matter which is bigger - the
+library recognizes this and performs the correct implementation. The legal values of <em>w </em><sub>a</sub> and <em>w </em><sub>b</sub> fall into five categories:
+</p><br>
+
+
+<ol>
+<li>
+ <em>w </em><sub>a</sub> is equal to <em>w </em> and <em>w </em><sub>b</sub> is equal to four. In this case, b is broken up into <em>w</em>/4
+four-bit words which are used
+in 16-element lookup tables. The tables are created on demand in <b>multiply_region()</b> and the SSSE3 instruction
+
+<b>mm_shuffle_epi8()</b> is leveraged to perform 16 lookups in parallel. Thus, these are very fast implementations.
+When <em>w </em> &#8805; 16, you should combine this with <b>"ALTMAP"</b> to get the best performance (see The Paper
+or [PGM13b] for explanation). If you do this please see section 7.9 for information about <b>"ALTMAP"</b> and
+alignment.<br><br>
+
+
+If you don't use <b>"ALTMAP,"</b> the implementations for <em>w </em> &#8712; {16, 32, 64} convert the standard representation into
+<b>"ALTMAP,"</b> perform the multiplication with <b>"ALTMAP"</b> and then convert back to the standard representation.
+The performance difference using <b>"ALTMAP"</b> can be significant: <br><br><br>
+
+<div id="number_spacing">
+<center>
+<div id="table_page28">
+<table cellpadding="6" cellspacing="0" style="text-align:center;font-size:19px">
+<tr>
+<td> gf_time 16 G 0 1048576 100 -m SPLIT 16 4 -</td> <td>Speed = 8,389 MB/s </td> 
+</tr>
+<tr>
+<td>gf_time 16 G 0 1048576 100 -m SPLIT 16 4 -r ALTMAP - </td> <td>Speed = 8,389 MB/s </td> 
+</tr>
+
+<tr>
+<td>gf_time 32 G 0 1048576 100 -m SPLIT 32 4 -</td> <td> Speed = 5,304 MB/s</td> 
+</tr>
+<tr>
+<td>gf_time 32 G 0 1048576 100 -m SPLIT 32 4 -r ALTMAP -</td> <td> Speed = 7,146 MB/s</td> 
+</tr>
+
+
+<tr>
+<td>gf_time 64 G 0 1048576 100 -m SPLIT 64 4 - </td> <td>Speed = 2,595 MB/s </td> 
+</tr>
+
+<tr>
+<td>gf_time 64 G 0 1048576 100 -m SPLIT 64 4 -r ALTMAP - </td> <td>Speed = 3,436 MB/s </td> 
+</tr>
+</div>
+
+
+
+</table>
+</div>
+
+
+
+
+
+
+
+
+
+<br/>
+
+
+6 &nbsp &nbsp  <em>  FURTHER INFORMATION ON OPTIONS AND ALGORITHMS     </em>   <span id="index_number">29  </span> <br><br><br>
+
+<ol style="list-style-type:none">
+
+
+<li>2. &nbsp w<sub>a</sub> is equal to <em>w </em> and w<sub>b</sub> is equal to eight. Now, b is broken into bytes, each of these is used in its own 256-element
+lookup table. This is typically the best way to perform <b>multiply_region()</b> without SSE.</li> 
+Because this is a region optimization, when you specify these options, you get a default <b>multiply()</b> - see
+Table 1 for a listing of the defaults. See section 7.2 for using a different <b>multiply()</b> than the defaults.<br><br>
+
+
+<li>
+3. &nbsp w<sub>a</sub> is equal to <em>w </em> and <em>w </em><sub>b</sub> is equal to 16. This is only valid for <em>w </em> = 32 and <em>w </em> = 64. Now, b is broken into shorts,
+each of these is used in its own 64K-element lookup table. This is typically slower than when <em>w </em><sub>b</sub> equals 8, and
+requires more amortization (larger buffer sizes) to be effective. </li><br>
+
+
+<li>4. &nbsp <em>w </em><sub>a</sub> and <em>w </em><sub>b</sub> are both equal to eight. Now  both <em>a</em> and <em>b</em> are broken into bytes, 
+and the products of the various bytes
+are looked up in multiple 256 &#215; 256 tables. In <em>GF(2<sup>16</sup>),</em> there are three of these tables. In <em>GF(2<sup>32</sup>),</em> there are
+seven, and in <em>GF(2<sup>64</sup>)</em> there are fifteen. Thus, this implementation can be a space hog. However, for <em>w </em> = 32,
+this is the fastest way to perform <b>multiply()</b> on some machines.
+When this option is employed, <b>multiply_region()</b> is implemented in an identical fashion to when <em>w </em><sub>a</sub> = <em>w </em>
+and <em>w </em><sub>b</sub> = 8. </li><br>
+
+<li>5.&nbsp w<sub>a</sub> = 32 and w<sub>b</sub> = 2. (<em>w</em> = 32 only). I was playing with a different way to use <b>mm_shuffle_epi8().</b> It works,
+but it's slower than when w<sub>b</sub> = 4.
+</li>
+
+</ol>
+
+
+
+<h3>7.5 &nbsp;&nbsp; Arguments to "GROUP" </h3>
+
+The <b>"GROUP"</b> multiplication option takes two arguments, g<sub>s</sub> and g<sub>r</sub>. It implements multiplication in the same manner
+as <b>"SHIFT,"</b> except it uses a table of size 2<sup>g<sub>s</sub></sup> to perform g<sub>s</sub> shifts at a time, and a table of size 2<sup>g<sub>r</sub></sup> to perform g<sub>r</sub>
+reductions at a time. The program <b>gf_methods</b> only prints the options 4 4 and 4 8 as arguments for <b>"GROUP."</b>
+However, other values of g<sub>s</sub> and g<sub>r</sub> are legal and sometimes desirable: <br><br>
+
+<ol>
+<li>
+ For <em>w </em> &#8804; 32 and <em>w </em> = 64, any values of g<sub>s</sub> and g<sub>r</sub> may be used, so long as they are less than or equal to <em>w </em> and so
+long as the tables fit into memory. There are four exceptions to this, listed below. </li><br>
+<li> For <em>w </em> = 4, <b>"GROUP"</b> is not supported. </li><br>
+<li> For <em>w </em> = 8, <b>"GROUP"</b> is not supported. </li><br>
+<li> For <em>w </em> = 16, <b>"GROUP"</b> is only supported for g<sub>s</sub> = g<sub>r</sub> = 4. </li><br>
+<li> For <em>w </em> = 128 <b>"GROUP"</b> only supports <em>g<sub>s</sub></em> = 4 and <em> g<sub>r</sub> </em> &#8712; {4, 8, 16}.</li><br>
+</ol>
+<p>
+The way that g<sub>s</sub> and g<sub>r</sub> impact performance is as follows. The <b>"SHIFT"</b> implementation works by performing a
+carry-free multiplication in <em>w </em> steps, and then performing reduction in <em>w </em> steps. In "GROUP," the carry-free multiplication
+is reduced to <em>w</em>/g<sub>s</sub> steps, and the reduction is reduced to <em>w</em>/g<sub>r</sub>. Both require tables. The table for the carry-free
+multiplication must be created at the beginning of each <b>multiply()</b> or <b>multiply_region(),</b> while the table for reduction
+is created when the <b>gf_t</b> is initialized. For that reason, it makes sense for g<sub>r</sub> to be bigger than g<sub>s.</sub></p>
+
+<p>
+To give a flavor for the impact of these arguments, Figure 3 shows the performance of varying g<sub>s</sub> and g<sub>r</sub> for
+single multiplication and region multiplication respectively, in <em> GF(2<sup>32</sup>)</em> and <em>GF(2<sup>64</sup>).</em> As the graphs demonstrate,
+<b>multiply()</b> performs better with smaller values of g<sub>s</sub>, while <b>multiply_region()</b> amortizes the creation of the shifting
+table, and can tolerate larger values of g<sub>s.</sub> When g<sub>s</sub> equals g<sub>r,</sub> there are some optimizations that we hand-encode.
+These can be seen clearly in the <b>multiply_region()</b> graphs.
+</p>
+
+
+
+
+
+
+
+
+<br/>
+7 &nbsp &nbsp  <em>   FURTHER INFORMATION ON OPTIONS AND ALGORITHMS     </em>   <span id="index_number">30 </span> 
+
+
+<div id="box_1"> 
+ 
+<div class="image-cell_3"> </div>
+
+<div class="image-cell_4"> </div>
+</div>
+Figure 3: The performance of <b>multiply()</b> and <b>multiply_region()</b> using <b>"GROUP,"</b> and varying the arguments <br> g<sub>s</sub>
+and g<sub>r.</sub> All graphs are heat maps with black equaling zero. The region size is 100KB.
+
+<h3>7.6 &nbsp; Considerations with "COMPOSITE" </h3>
+
+
+As mentioned above, using <b>"ALTMAP"</b> with <b>"COMPOSITE"</b> allows <b>multiply_region()</b> to recursively call
+<b>multiply_region(),</b> rather than simply calling <b>multiply()</b> on every word in the region. The difference can be pronounced:<br><br>
+
+<div id="table_page28"><center>
+
+<table cellpadding="6" cellspacing="0" style="text-align:center;font-size:19px"><tr>
+<td>
+gf_time 32 G 0 10240 10240 -m COMPOSITE 2 - -
+Speed = 322 MB/s </td> </tr>
+<tr>
+<td>gf_time 32 G 0 10240 10240 -m COMPOSITE 2 - -r ALTMAP -
+Speed = 3,368 MB/s </td> </tr>
+
+<tr>
+<td>
+gf_time 32 G 0 10240 10240 -m COMPOSITE 2 -m SPLIT 16 4 -r ALTMAP - -r ALTMAP -
+Speed = 3,925 MB/s </td> </tr>
+</table>
+</center>
+</div>
+
+
+<br><br>
+<p>
+There is support for performing <b>multiply()</b> inline for the <b>"TABLE"</b> implementations for <em>w</em> &#8712; {4, 8} and for the
+"LOG" implementation for <em>w</em> = 16 (see section 7.1). These are leveraged by <b>multiply()</b> in <b>"COMPOSITE,"</b> and
+by <b>multiply_region()</b> if you are not using <b>"ALTMAP."</b> To demonstrate this, in the table below, you can see that the
+performance of <b>multiply()</b> with <b>"SPLIT 8 4"</b> is 88 percent as fast as the default in <em>w</em> = 8 (which is <b>"TABLE"</b>).
+When you use each as a base field for <b>"COMPOSITE"</b> with <em>w</em> = 16, the one with <b>"SPLIT 8 4"</b> is now just 37 percent
+as fast. The difference is the inlining of multiplication in the base field when <b>"TABLE"</b> is employed:</p><br><br>
+
+<div id="table_page28" border="0"><center>
+
+    <table cellpadding="6" cellspacing="0" style="text-align:center;font-size:19px">
+
+      <tr><td>gf_time 8 M 0 1048576 100 - Speed = 501 Mega-ops/s</td> </tr>
+      <tr><td>gf_time 8 M 0 1048576 100 -m SPLIT 8 4 - Speed = 439 Mega-ops/s </td> </tr>
+      <tr><td>gf_time 16 M 0 1048576 100 -m COMPOSITE 2 - - Speed = 207 Mega-ops/s </td> </tr>
+      <tr><td>gf_time 16 M 0 1048576 100 -m COMPOSITE 2 -m SPLIT 8 4 - - Speed = 77 Mega-ops/s </td> </tr>
+
+    </table> 
+    </center>
+<br><br>
+</div>
+
+You can keep making recursive definitions of composite fields if you want. For example, this one's not too slow for
+region operations (641 MB/s):
+
+
+
+
+
+
+
+
+<br/>
+<br/>
+
+
+6 &nbsp &nbsp  <em>  FURTHER INFORMATION ON OPTIONS AND ALGORITHMS     </em>   <span id="index_number">31  </span> <br><br><br>
+
+<div id="number_spacing">
+<center>
+gf_time 128 G 0 1048576 100 -m COMPOSITE 2 <span style="color:red">-m COMPOSITE 2 </span> <span style="color:blue">-m COMPOSITE 2 </span> <br>
+<span style="color:rgb(250, 149, 167)">-m SPLIT 16 4 -r ALTMAP -</span> <span style="color:blue">-r ALTMAP -</span> <span style="color:red"> -r ALTMAP -</span> -r ALTMAP -
+</center>
+</div><br>
+
+<p>Please see section 7.8.1 for a discussion of polynomials in composite fields.</p>
+
+<h2>7.7 &nbsp &nbsp &nbsp "CARRY_FREE" and the Primitive Polynomial </h2>
+
+
+If your machine supports the PCLMUL instruction, then we leverage that in <b>"CARRY_FREE."</b> This implementation
+first performs a carry free multiplication of two <em>w</em>-bit numbers, which yields a 2<em>w</em>-bit number. It does this with
+one PCLMUL instruction. To reduce the 2<em>w</em>-bit number back to a <em>w</em>-bit number requires some manipulation of the
+polynomial. As it turns out, if the polynomial has a lot of contiguous zeroes following its leftmost one, the number of
+reduction steps may be minimized. For example, with <em>w </em> = 32, we employ the polynomial 0x100400007, because that
+is what other libraries employ. This only has 9 contiguous zeros following the one, which means that the reduction
+takes four steps. If we instead use 0x1000000c5, which has 24 contiguous zeros, the reduction takes just two steps.
+You can see the difference in performance:
+<br><br>
+<center>
+<div id="table_page28">
+
+<table cellpadding="6" cellspacing="0" style="text-align:center;font-size:19px">
+<tr>
+
+<td>gf_time 32 M 0 1048576 100 -m CARRY_FREE - </td> <td> Speed = 48 Mega-ops/s</td> </tr>
+
+<tr><td>gf_time 32 M 0 1048576 100 -m CARRY_FREE -p 0xc5 -</td> <td> Speed = 81 Mega-ops/s </td> </tr>
+
+</table></center>
+</div>
+<br><br>
+
+<p>
+This is relevant for <em>w </em> = 16 and <em>w </em> = 32, where the "standard" polynomials are sub-optimal with respect to
+<b>"CARRY_FREE."</b> For <em>w </em> = 16, the polynomial 0x1002d has the desired property. It’s less important, of course,
+with <em>w </em> = 16, because <b>"LOG"</b> is so much faster than <b>CARRY_FREE.</b> </p>
+
+<h2>7.8 &nbsp;  More on Primitive Polynomials </h2>
+
+<h3>7.8.1 &nbsp; Primitive Polynomials that are not Primitive </h3>
+
+The library is willing to work with most polynomials, even if they are not primitive or irreducible. For example, the
+polynomial x<sup>4</sup> + x<sup>3</sup> +x<sup>2</sup> +x+1 is irreducible, and therefore generates a valid Galois Field for <em>GF(2<sup>4</sup>).</em> However, it
+is not primitive, because 2<sup>5</sup> = 1. For that reason, if you use this polynomial, you cannot use the <b>"LOG"</b> method. The
+other methods will work fine: <br><br>
+
+<div id="number_spacing">
+
+UNIX> gf_mult 2 2 4 -p 0xf -  <br>
+4 <br>
+UNIX> gf_mult 4 2 4 -p 0xf - <br>
+8 <br>
+UNIX> gf_mult 8 2 4 -p 0xf - <br>
+15 <br>
+UNIX> gf_mult 15  2 4 -p 0xf - <br>
+1 <br>
+UNIX> gf_div 1 15 4 -p 0xf - <br>
+2 <br>
+UNIX> gf_div 1 15 4 -p 0xf -m LOG - <br>
+usage: gf_div a b w [method] - does division of a and b in GF(2&#710;w) <br>
+Bad Method Specification: Cannot use Log tables because the polynomial is not primitive. <br>
+UNIX>  <br>
+</div>
+<p>
+If a polynomial is reducible, then it does not define a Galois Field, but instead a ring. GF-Complete attempts to
+work here where it can; however certain parts of the library will not work:
+</p>
+
+
+
+
+
+
+<br/>
+
+
+6 &nbsp &nbsp  <em>  FURTHER INFORMATION ON OPTIONS AND ALGORITHMS     </em>   <span id="index_number">32  </span> <br><br><br>
+<ol>
+<li>
+Division is a best effort service. The problem is that often quotients are not unique. If <b>divide()</b> returns a non-zero
+number, then that number will be a valid quotient, but it may be one of many. If the multiplication technique is
+<b>"TABLE,"</b> then if a quotient exists, one is returned. Otherwise, zero is returned. Here are some examples - the
+polynomial x<sup>4</sup> + 1 is reducible, and therefore produces a ring. Below, we see that with this polynomial, 1*6 = 6
+and 14*6 = 6. Therefore, 6/6 has two valid quotients: 1 and 14. GF-Complete returns 14 as the quotient:</li><br>
+
+<div id="number_spacing">
+UNIX> gf_mult 1 6 4 -p 0x1 -<br>
+6 <br>
+UNIX> gf_mult 14 6 4 -p 0x1 - <br>
+6 <br>
+UNIX> gf_div 6 6 4 -p 0x1 - <br>
+14 <br>
+UNIX> <br><br>
+</div>
+
+
+<li>When <b>"EUCLID"</b> is employed for division, it uses the extended Euclidean algorithm for GCD to find a number's
+inverse, and then it multiplies by the inverse. The problem is that not all numbers in a ring have inverses. For
+example, in the above ring, there is no number <em>a</em> such that 6a = 1. Thus, 6 has no inverse. This means that even
+though 6/6 has quotients in this ring, <b>"EUCLID"</b> will fail on it because it is unable to find the inverse of 6. It will
+return 0:
+</li><br>
+<div id="number_spacing">
+UNIX> gf_div 6 6 4 -p 0x1 -m TABLE -d EUCLID -<br>
+0<br>
+UNIX><br>
+</div><br>
+
+<li> Inverses only work if a number has an inverse. Inverses may not be unique. </li><br>
+
+<li> <b>"LOG"</b> will not work. In cases where the default would be <b>"LOG,"</b> <b>"SHIFT"</b> is used instead. </li>
+</ol>
+
+<p>
+Due to problems with division, <b>gf_unit</b> may fail on a reducible polynomial. If you are determined to use such a
+polynomial, don't let this error discourage you.
+</p>
+
+<h3>7.8.2 Default Polynomials for Composite Fields </h3>
+
+GF-Complete will successfully select a default polynomial in the following composite fields:
+<ul>
+<li> <em>w </em> = 8 and the default polynomial (0x13) is employed for <em>GF(2<sup>4</sup>)</em></li><br>
+<li> w = 16 and the default polynomial (0x11d) is employed for <em>GF(2<sup>8</sup>)</em></li><br>
+<li> <em>w </em> = 32 and the default polynomial (0x1100b) is employed for <em>GF(2<sup>16</sup>) </em></li><br>
+<li> <em>w </em> = 32 and 0x1002d is employed for <em>GF(2<sup>16</sup>) </em></li><br>
+<li> <em>w </em> = 32 and the base field for <em>GF(2<sup>16</sup>) </em> is a composite field that uses a default polynomial</li><br>
+<li> <em>w </em> = 64 and the default polynomial (0x100400007) is employed for <em>GF(2<sup>32</sup>)</em></li><br>
+<li> <em>w </em> = 64 and 0x1000000c5 is employed for <em>GF(2<sup>32</sup>) </em></li><br>
+<li> <em>w </em> = 64 and the base field for <em>GF(2<sup>32</sup>) </em> is a composite field that uses a default polynomial</li><br>
+<li> <em>w </em> = 128 and the default polynomial (0x1b) is employed for <em>GF(2<sup>64</sup>) </em></li><br>
+<li> <em>w </em> = 128 and the base field for <em> GF(2<sup>64</sup>) </em> is a composite field that uses a default polynomial</li><br>
+</ul>
+
+
+
+
+
+
+
+
+<br/>
+
+
+6 &nbsp &nbsp  <em>  FURTHER INFORMATION ON OPTIONS AND ALGORITHMS     </em>   <span id="index_number">33  </span> <br><br><br>
+
+
+<h3>7.8.3 The Program gf_poly for Verifying Irreducibility of Polynomials </h3>
+
+The program <b>gf_poly</b> uses the Ben-Or algorithm[GP97] to determine whether a polynomial with coefficients in <em> GF(2<sup>w </sup>) </em>
+is reducible. Its syntax is:<br><br>
+<div id="number_spacing">
+gf_poly w method power:coef power:coef ... 
+</div>
+
+<br>
+<p>You can use it to test for irreducible polynomials with binary coefficients by specifying w = 1. For example, from
+the discussion above, we know that x<sup>4</sup> +x+1 and x<sup>4</sup> +x<sup>3</sup> +x<sup>2</sup> +x+1 are both irreducible, but x<sup>4</sup> +1 is reducible.
+<b>gf_poly</b> confirms:</p><br>
+
+<div id="number_spacing">
+UNIX> gf_poly 1 - 4:1 1:1 0:1 <br>
+Poly: x&#710;4 + x + 1 <br>
+Irreducible. <br>
+UNIX> gf_poly 1 - 4:1 3:1 2:1 1:1 0:1 <br>
+Poly: x&#710;4 + x&#710;3 + x&#710;2 + x + 1 <br>
+Irreducible. <br>
+UNIX> gf_poly 1 - 4:1 0:1 <br>
+Poly: x&#710;4 + 1 <br>
+Reducible. <br>
+UNIX> <br>
+
+</div>
+
+
+<p>
+For composite fields <em>GF((2<sup>l</sup>)<sup>2</sup>),</em> we are looking for a value s such that x<sup>2</sup> + sx + 1 is irreducible. That value
+depends on the base field. For example, for the default field <em>GF(2<sup>32</sup>),</em> a value of <em>s</em> = 2 makes the polynomial
+irreducible. However, if the polynomial 0xc5 is used (so that PCLMUL is fast - see section 7.7), then <em>s</em> = 2 yields a
+reducible polynomial, but <em>s</em> = 3 yields an irreducible one. You can use <b>gf_poly</b> to help verify these things, and to help
+define s if you need to stray from the defaults:</p> <br>
+
+<div id="number_spacing">
+UNIX> gf_poly 32 - 2:1 1:2 0:1<br>
+Poly: x&#710;2 + (0x2)x + 1 <br>
+Irreducible. <br>
+UNIX> gf_poly 32 -p 0xc5 - 2:1 1:2 0:1 <br>
+Poly: x&#710;2 + (0x2)x + 1 <br>
+Reducible. <br>
+UNIX> gf_poly 32 -p 0xc5 - 2:1 1:3 0:1 <br>
+Poly: x&#710;2 + (0x3)x + 1 <br>
+Irreducible. <br>
+UNIX> <br>
+</div>
+
+<p>
+<b>gf_unit</b> does random sampling to test for problems. In particular, it chooses a random a and a random b, multiplies
+them, and then tests the result by dividing it by a and b. When w is large, this sampling does not come close to
+providing complete coverage to check for problems. In particular, if the polynomial is reducible, there is a good
+chance that <b>gf_unit</b> won't discover any problems. For example, the following <b>gf_unit</b> call does not flag any problems,
+even though the polynomial is reducible.</p>
+<br>
+<div id="number_spacing">
+UNIX> gf_unit 64 A 0 -m COMPOSITE 2 -p 0xc5 - -p 2 -<br>
+UNIX>
+</div>
+
+<p>
+How can we demonstrate that this particular field has a problem? Well, when the polynomial is 0xc5, we can factor
+x<sup>2</sup> + 2x + 1 as (x + 0x7f6f95f9)(x + 0x7f6f95fb). Thus, in the composite field, when we multiply 0x17f6f95f9 by
+0x17f6f95fb, we get zero. That's the problem:
+</p>
+
+
+
+
+
+
+
+
+<br/>
+
+
+6 &nbsp &nbsp  <em>  FURTHER INFORMATION ON OPTIONS AND ALGORITHMS     </em>   <span id="index_number">34  </span> <br><br><br>
+
+<div id="number_spacing">
+
+UNIX> gf_mult 7f6f95f9 7f6f95fb 32h -p 0xc5 - <br>
+1 <br>
+UNIX> gf_mult 17f6f95f9 17f6f95fb 64h -m COMPOSITE 2 -p 0xc5 - -p 2 - <br>
+0 <br>
+UNIX> <br>
+
+</div>
+
+<h2>7.9 "ALTMAP" considerations and extract_word() </h2>
+
+There are two times when you may employ alternate memory mappings:
+<ol>
+<li> When using <b>"SPLIT"</b> and w<sub>b</sub> = 4. </li>
+<li> When using <b>"COMPOSITE."</b> </li>
+</ol>
+
+Additionally, by default, the <b>"CAUCHY"</b> region option also employs an alternate memory mapping.
+
+<p>When you use alternate memory mappings, the exact mapping of words in <em> GF(2<sup>w </sup>) </em> to memory depends on the
+situation, the size of the region, and the alignment of the pointers. To help you figure things out, we have included the
+procedures <b>extract_word.wxx()</b> as part of the <b>gf_t</b> struct. This procedure takes four parameters: </p>
+<ul>
+<li>A pointer to the <b>gf_t.</b> </li>
+<li> The beginning of the memory region. </li>
+<li>The number of bytes in the memory region. </li>
+<li>The desired word number: <em>n.</em> </li>
+</ul>
+
+<p>
+It then returns the <em>n</em>-th word in memory. When the standard mapping is employed, this simply returns the <em>n</em>-
+th contiguous word in memory. With alternate mappings, each word may be split over several memory regions, so
+<b>extract_word()</b> grabs the relevant parts of each memory region to extract the word. Below, we go over each of the
+above situations in detail. Please refer to Figure 2 in Section 5 for reference. </p>
+
+
+<h3>7.9.1 Alternate mappings with "SPLIT" </h3>
+
+The alternate mapping with <b>"SPLIT"</b> is employed so that we can best leverage <b>mm_shuffle_epi8().</b> Please read [PGM13b]
+for details as to why. Consider an example when <em>w</em> = 16. In the main region of memory (the middle region in Figure
+2), multiplication proceeds in units of 32 bytes, which are each broken into two 16-byte regions. The first region
+holds the high bytes of each word in <em>GF(2<sup>16</sup>),</em> and the second region holds the low bytes.
+Let's look at a very detailed example, from <b>gf_example_5.c.</b> This program makes the following call, where <b>gf</b> has
+
+been initialized for <em>w</em> = 16, using <b>"SPLIT"</b> and <b>"ALTMAP:"</b><br><br>
+<div id="number_spacing">
+gf.multiply_region.w32(&gf, a, b, 0x1234, 30*2, 0);
+</div><br>
+
+
+<p>In other words, it is multiplying a region a of 60 bytes (30 words) by the constant 0x1234 in <em> GF(2<sup>16</sup>),</em> and placing
+the result into <em>b.</em> The pointers <em>a</em> and <em>b</em> have been set up so that they are not multiples of 16. The first line of output
+prints <em>a</em> and <em>b:</em></p><br>
+
+a: 0x10010008c b: 0x10010015c <br><br>
+
+As described in Section 5, the regions of memory are split into three parts:
+
+
+
+
+
+
+
+
+<br/>
+
+
+6 &nbsp &nbsp  <em>  FURTHER INFORMATION ON OPTIONS AND ALGORITHMS     </em>   <span id="index_number">35  </span> <br><br><br>
+
+
+<ol>
+<li> 4 bytes starting at 0x10010008c / 0x10010015c. </li>
+<li> 32 bytes starting at 0x100100090 / 0x100100160. </li>
+<li> 24 bytes starting at 0x1001000b0 / 0x100100180. </li>
+
+</ol>
+
+
+<p>In the first and third parts, the bytes are laid out according to the standard mapping. However, the second part is
+split into two 16-byte regions- one that holds the high bytes of each word and one that holds the low bytes. To help
+illustrate, the remainder of the output prints the 30 words of <em>a</em> and <em>b</em> as they appear in memory, and then the 30 return
+values of <b>extract_word.w32():</b> </p><br>
+
+<div id="number_spacing">
+<table cellspacing="6" style="text-align:right">
+
+<tr>
+<td></td> <td> 0</td> <td> 1</td> <td> 2 </td> <td> 3 </td> <td> 4</td> <td> 5 </td> <td> 6 </td> <td> 7</td> <td> 8 </td> <td> 9</td> </tr>
+<tr>
+<td>a:</td><td> 640b</td> <td> 07e5</td> <td> 2fba </td> <td> ce5d </td> <td> f1f9</td> <td> 3ab8</td> <td> c518 </td> <td> 1d97</td> <td> 45a7</td>
+ <td> 0160</td> </tr>
+ 
+<tr><td>b:</td> <td>1ba3</td><td> 644e</td> <td> 84f8</td> <td> be3c</td> <td> 4318</td> <td> 4905</td> <td> b2fb </td> <td> 46eb </td> <td> ef01 </td>
+ <td>a503</td> 
+</tr>
+</table> 
+ <br><br>
+<table cellspacing="6" style="text-align:right">
+
+<tr>
+<td></td> <td> 10</td> <td> 11 </td> <td> 12</td> <td> 13</td> <td> 14 </td> <td> 15 </td> <td> 16</td> <td> 17</td> <td>18</td> <td> 19 </td></tr>
+<tr>
+<td>a:</td><td> 3759</td> <td> b107</td> <td> 9660 </td> <td> 3fde </td> <td> b3ea</td> <td> 8a53</td> <td> 75ff </td> <td> 46dc</td> <td> c504</td>
+ <td> 72c2</td> </tr>
+ 
+<tr><td>b:</td> <td>da27</td><td> e166</td> <td> a0d2</td> <td> b3a2</td> <td> 1699</td> <td> 3a3e</td> <td> 47fb </td> <td> 39af </td> <td> 1314 </td>
+ <td>8e76</td> 
+</tr>
+</table> 
+
+<br><br>
+<table cellspacing="6" style="text-align:right">
+<tr>
+<td></td> <td> 20</td> <td> 21 </td> <td> 22</td> <td> 23</td> <td> 24 </td> <td> 25 </td> <td> 26</td> <td> 27</td> <td>28</td> <td> 29 </td></tr>
+<tr>
+<td>a:</td><td> b469</td> <td> 1b97</td> <td> e91d </td> <td> 1dbc </td> <td> 131e</td> <td> 47e0</td> <td> c11a </td> <td> 7f07</td> <td> 76e0</td>
+ <td> fe86</td> </tr>
+ 
+<tr><td>b:</td> <td>937c</td><td> a5db</td> <td> 01b7</td> <td> 7f5f</td> <td> 8974</td> <td> 05e1</td> <td> cff3 </td> <td> a09c </td> <td> de3c </td>
+ <td>4ac0</td> 
+</tr>
+</table> 
+<br><br>
+<table cellspacing="6">
+
+
+<tr><td>Word</td><td> 0:</td> <td>0x640b </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0x1ba3 Word 15:</td> <td>0x4575 </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0xef47</td></tr>     
+<tr><td>Word</td> <td> 1:</td> <td>0x07e5 </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0x644e Word 16:</td> <td>0x60dc </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0x03af</td></tr>
+<tr><td>Word</td> <td> 2:</td> <td>0xba59 </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0xf827 Word 17:</td> <td>0x0146 </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0xa539 </td> </tr>
+<tr><td>Word</td> <td>3:</td> <td>0x2f37 </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0x84da Word 18:</td> <td>0xc504 </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0x1314 </td> </tr>
+<tr><td>Word</td> <td>4:</td> <td>0x5d07 </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0x3c66 Word 19:</td> <td>0x72c2 </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0x8e76 </td> </tr>
+<tr><td>Word</td> <td>5:</td> <td>0xceb1 </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0xbee1 Word 20:</td> <td>0xb469 </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0x937c </td> </tr>
+<tr><td>Word</td> <td>6:</td> <td>0xf960 </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0x18d2 Word 21:</td> <td>0x1b97 </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0xa5db </td> </tr>
+<tr><td>Word</td> <td>7:</td> <td>0xf196 </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0x43a0 Word 22:</td> <td>0xe91d </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0x01b7 </td> </tr>
+<tr><td>Word</td> <td>8:</td> <td>0xb8de </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0x05a2 Word 23:</td> <td>0x1dbc </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0x7f5f </td> </tr>
+<tr><td>Word</td> <td>9:</td> <td>0x3a3f </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0x49b3 Word 24:</td> <td>0x131e </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0x8974 </td> </tr>
+<tr><td>Word</td> <td>10:</td> <td>0x18ea </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0xfb99 Word 25:</td> <td>0x47e0 </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0x05e1 </td> </tr>
+<tr><td>Word</td> <td>11:</td> <td>0xc5b3 </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0xb216 Word 26:</td> <td>0xc11a </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0xcff3  </td> </tr>
+<tr><td>Word</td> <td>12:</td> <td>0x9753 </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0xeb3e Word 27:</td> <td>0x7f07 </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0xa09c  </td> </tr>
+<tr><td>Word</td> <td>13:</td> <td>0x1d8a </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0x463a Word 28:</td> <td>0x76e0 </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0xde3c  </td> </tr>
+<tr><td>Word</td> <td>14:</td> <td>0xa7ff </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0x01fb Word 29:</td> <td>0xfe86 </td><td>*</td> <td>0x1234</td> <td>=</td> <td>0x4ac0 </td> </tr>
+
+</table>
+</div>
+<br>
+In the first region are words 0 and 1, which are identical to how they appear in memory: 0x640b and 0x07e5. In
+the second region are words 2 through 17. These words are split among the two sixteen-byte regions. For example,
+word 2, which <b>extract_word()</b> reports is 0xba59, is constructed from the low byte in word 2 (0xba) and the low byte
+in word 10 (0x59). Since 0xba59 * 0x1234 = 0xf827, we see that the low byte in word 2 of <em> b </em> is 0xf8, and the low byte
+in word 10 is 0x27.
+<p>When we reach word 18, we are in the third region of memory, and words are once again identical to how they
+appear in memory.</p>
+
+<p>While this is confusing, we stress that so long as you call <b>multiply_region()</b> with pointers of the same alignment
+and regions of the same size, your results with <b>ALTMAP</b> will be consistent. If you call it with pointers of </p>
+
+
+
+
+
+
+<br/>
+
+
+7 &nbsp &nbsp  <em>  FURTHER INFORMATION ON OPTIONS AND ALGORITHMS     </em>   <span id="index_number">36  </span> <br><br><br>
+
+different alignments, or with different region sizes, then the results will not be consistent. To reiterate, if you don't use
+<b>ALTMAP,</b> you don't have to worry about any of this - words will always be laid out contiguously in memory.
+<p>
+When <em>w</em> = 32, the middle region is a multiple of 64, and each word in the middle region is broken into bytes, each
+of which is in a different 16-byte region. When <em>w</em> = 64, the middle region is a multiple of 128, and each word is
+stored in eight 16-byte regions. And finally, when <em>w</em> = 128, the middle region is a multiple of 128, and each word is
+stored in 16 16-byte regions.</p><br>
+
+<h3>7.9.2 &nbsp Alternate mappings with "COMPOSITE" </h3>
+
+With <b>"COMPOSITE,"</b> the alternate mapping divides the middle region in half. The lower half of each word is stored
+in the first half of the middle region, and the higher half is stored in the second half. To illustrate, gf_example_6
+performs the same example as gf_example_5, except it is using <b>"COMPOSITE"</b> in GF((2<sup>16</sup>)<sup>2</sup>), and it is multiplying
+a region of 120 bytes rather than 60. As before, the pointers are not aligned on 16-byte boundaries, so the region is broken
+into three regions of 4 bytes, 96 bytes, and 20 bytes. In the first and third region, each consecutive four byte word is a
+word in <em>GF(2<sup>32</sup>).</em> For example, word 0 is 0x562c640b, and word 25 is 0x46bc47e0. In the middle region, the low two
+bytes of each word come from the first half, and the high two bytes come from the second half. For example, word 1
+as reported by <b>extract_word()</b> is composed of the lower two bytes of word 1 of memory (0x07e5), and the lower two
+bytes of word 13 (0x3fde). The product of 0x3fde07e5 and 0x12345678 is 0x211c880d, which is stored in the lower
+two bytes of words 1 and 13 of <em>b.</em><br><br>
+
+a: 0x10010011c b: 0x1001001ec
+
+<br><br>
+
+<div id="number_spacing">
+<table cellspacing="6" style="text-align:right">
+
+<tr>
+<td></td> <td> 0</td> <td> 1</td> <td> 2 </td> <td> 3 </td> <td> 4</td> <td> 5 </td> <td> 6 </td> <td> 7</td> <td> 8 </td> <td> 9</td> </tr>
+<tr>
+<td>a:</td><td> 562c640b</td> <td> 959407e5</td> <td> 56592fba </td> <td> cbadce5d </td> <td> 1d1cf1f9</td> <td> 35d73ab8</td> <td> 6493c518 </td> <td> b37c1d97</td> 
+<td> 8e4545a7</td>
+ <td> c0d80160</td> </tr>
+ 
+<tr><td>b:</td> <td>f589f36c</td><td> f146880d</td> <td> 74f7b349</td> <td> 7ea7c5c6</td> <td> 34827c1a</td> <td> 93cc3746</td> <td> bfd9288b </td>
+ <td> 763941d1 </td> 
+<td> bcd33a5d </td>
+ <td>da695e64</td> 
+</tr>
+</table> 
+
+
+<br><br>
+<table cellspacing="6" style="text-align:right">
+
+<tr>
+<td></td> <td> 10</td> <td> 11 </td> <td> 12</td> <td> 13</td> <td> 14 </td> <td> 15 </td> <td> 16</td> <td> 17</td> <td>18</td> <td> 19 </td></tr>
+<tr>
+<td>a:</td><td> 965b3759</td> <td> cb3eb107</td> <td> 1b129660 </td> <td> 95a33fde </td> <td> 95a7b3ea</td> <td> d16c8a53</td> <td> 153375ff </td> 
+<td> f74646dc</td> <td> 35aac504</td>
+ <td> 98f972c2</td> </tr>
+ 
+<tr><td>b:</td> <td>fd70f125</td><td> 3274fa8f</td> <td> d9dd34ee</td> <td> c01a211c</td> <td> d4402403</td> <td> 8b55c08b</td> <td> da45f0ad </td> 
+<td> 90992e18 </td> <td> b65e0902 </td>
+ <td>d91069b5</td> 
+</tr>
+</table> 
+
+
+<br><br>
+<table cellspacing="6" style="text-align:right">
+<tr>
+<td></td> <td> 20</td> <td> 21 </td> <td> 22</td> <td> 23</td> <td> 24 </td> <td> 25 </td> <td> 26</td> <td> 27</td> <td>28</td> <td> 29 </td></tr>
+<tr>
+<td>a:</td><td> 5509b469</td> <td> 7f8a1b97</td> <td> 3472e91d </td> <td> 9ee71dbc </td> <td> de4e131e</td> <td> 46bc47e0</td> <td> 5bc9c11a </td>
+ <td> 931d7f07</td> <td> d40676e0</td>
+ <td> c85cfe86</td> </tr>
+ 
+<tr><td>b:</td> <td>fc92b8f5</td><td> edd59668</td> <td> b4bc0d90</td> <td> a679e4ce</td> <td> 1a98f7d0</td> <td> 6038765f</td> <td> b2ff333f </td> <td> e7937e49 </td> 
+<td> fa5a5867 </td>
+ <td>79c00ea2</td> 
+</tr>
+</table> 
+<br><br>
+
+
+<table cellspacing="6" style="text-align:right">
+
+
+<tr><td>Word</td><td> 0:</td> <td>0x562c640b </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0xf589f36c Word 15:</td> <td>0xb46945a7 </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0xb8f53a5d</td></tr>     
+<tr><td>Word</td> <td> 1:</td> <td>0x3fde07e5 </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0x211c880d Word 16:</td> <td>0x55098e45 </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0xfc92bcd3</td></tr>
+<tr><td>Word</td> <td> 2:</td> <td>0x95a39594 </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0xc01af146 Word 17:</td> <td>0x1b970160 </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0x96685e64 </td> </tr>
+<tr><td>Word</td> <td>3:</td> <td>0xb3ea2fba </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0x2403b349 Word 18:</td> <td>0x7f8ac0d8 </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0xedd5da69 </td> </tr>
+<tr><td>Word</td> <td>4:</td> <td>0x95a75659 </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0xd44074f7 Word 19:</td> <td>0xe91d3759 </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0x0d90f125 </td> </tr>
+<tr><td>Word</td> <td>5:</td> <td>0x8a53ce5d </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0xc08bc5c6 Word 20:</td> <td>0x3472965b </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0xb4bcfd70 </td> </tr>
+<tr><td>Word</td> <td>6:</td> <td>0xd16ccbad </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0x8b557ea7 Word 21:</td> <td>0x1dbcb107 </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0xe4cefa8f </td> </tr>
+<tr><td>Word</td> <td>7:</td> <td>0x75fff1f9 </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0xf0ad7c1a Word 22:</td> <td>0x9ee7cb3e </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0xa6793274 </td> </tr>
+<tr><td>Word</td> <td>8:</td> <td>0x15331d1c </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0xda453482 Word 23:</td> <td>0x131e9660 </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0xf7d034ee </td> </tr>
+<tr><td>Word</td> <td>9:</td> <td>0x46dc3ab8 </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0x2e183746 Word 24:</td> <td>0xde4e1b12 </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0x1a98d9dd </td> </tr>
+<tr><td>Word</td> <td>10:</td> <td>0xf74635d7 </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0x909993cc Word 25:</td> <td>0x46bc47e0 </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0x6038765f </td> </tr>
+<tr><td>Word</td> <td>11:</td> <td>0xc504c518 </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0x0902288b Word 26:</td> <td>0x5bc9c11a </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0xb2ff333f  </td> </tr>
+<tr><td>Word</td> <td>12:</td> <td>0x35aa6493 </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0xb65ebfd9 Word 27:</td> <td>0x931d7f07 </td><td>*</td> <td>0x12345678</td> <td>=</td> <td>0xe7937e49  </td> </tr>
+
+</table>
+</div>
+
+
+
+
+
+
+
+
+<br/>
+
+
+8 &nbsp &nbsp  <em>  THREAD SAFETY     </em>   <span id="index_number">37  </span> <br><br><br>
+<div id="number_spacing">
+<table cellpadding="6" cellspacing="0">
+<tr>
+<td>Word 13:</td> <td> 0x72c21d97</td> <td> *</td> <td> 0x12345678</td> <td> =</td> <td> 0x69b541d1</td> <td> Word 28:</tD>
+
+<td> 0xd40676e0 </td> <td> * </td> <td> 0x12345678 </td> <td> = </td> <td> 0xfa5a5867 </td> </tr>
+
+<tr><td>Word 14:</td> <td> 0x98f9b37c</td> <td> * </td> <td> 0x12345678 </td> <td> = </td> <td> 0xd9107639</td> <td> Word 29:</td>
+<td> 0xc85cfe86</td> <td>*</td> <td> 0x12345678</td> <td> =</td> <td> 0x79c00ea2</td></tr>
+
+</table>
+</div><br>
+
+
+<p>
+As with <b>"SPLIT,"</b> using <b>multiply_region()</b> with <b>"COMPOSITE"</b> and <b>"ALTMAP"</b> will be consistent only if the
+alignment of pointers and region sizes are identical. </p>
+
+
+<h3>7.9.3 The mapping of "CAUCHY" </h3>
+
+With <b>"CAUCHY,"</b> the region is partitioned into <em>w</em> subregions, and each word in the region is broken into <em>w</em> bits,
+each of which is stored in a different subregion. To illustrate, <b>gf_example_7</b> multiplies a region of three bytes by 5
+in <em>GF(2<sup>3</sup>)</em> using <b>"CAUCHY:"</b><br><br>
+
+<div id="number_spacing">
+
+UNIX> gf_example_7 <br>
+a: 0x100100190 b: 0x1001001a0 <br><br>
+a: 0x0b 0xe5 0xba <br>
+b: 0xee 0xba 0x0b <br><br>
+a bits: 00001011 11100101 10111010 <br>
+b bits: 11101110 10111010 00001011<br><br>
+Word 0: 3 * 5 = 4 <br>
+Word 1: 5 * 5 = 7 <br>
+Word 2: 2 * 5 = 1 <br>
+Word 3: 5 * 5 = 7 <br>  
+Word 4: 4 * 5 = 2 <br>
+Word 5: 6 * 5 = 3 <br>
+Word 6: 2 * 5 = 1 <br>
+Word 7: 6 * 5 = 3 <br>
+UNIX><br><br> </div>
+<p>
+
+The program prints the three bytes of a and b in hexadecimal and in binary. To see how words are broken up,
+consider word 0, which is the lowest bit of each of the three bytes of a (and b). These are the bits 1, 1 and 0 in a, and
+0, 0, and 1 in b. Accordingly, the word is 3 in a, and 3*5 = 4 in b. Similarly, word 7 is the high bit in each byte: 0, 1, 1
+(6) in a, and 1, 1, 0 (3) in b.</p>
+<p>With <b>"CAUCHY," multiply_region()</b> may be implemented exclusively with XOR operations. Please see [BKK<sup>+</sup>95]
+for more information on the motivation behind <b>"CAUCHY."</b> </p>
+
+<h2>8 &nbsp Thread Safety </h2>
+
+Once you initialize a <b>gf_t,</b> you may use it wantonly in multiple threads for all operations except for the ones below.
+With the implementations listed below, the scratch space in the <b>gf_t</b> is used for temporary tables, and therefore you
+cannot call <b>region_multiply,</b> and in some cases <b>multiply</b> from multiple threads because they will overwrite each
+others' tables. In these cases, if you want to call the procedures from multiple threads, you should allocate a separate
+gf_t for each thread:
+<ul>
+<li>
+ All "GROUP" implementations are not thread safe for either <b>region_multiply()</b> or <b> multiply().</b> Other than
+<b>"GROUP," multiply() </b> is always thread-safe.
+
+</li>
+</ul>
+
+
+
+
+
+
+
+
+
+<br/>
+
+
+9 &nbsp &nbsp  <em>  LISTING OF PROCEDURES     </em>   <span id="index_number">38  </span> <br><br><br>
+<ul>
+<li>
+
+For <em>w </em> = 4, <b>region_multiply.w32()</b> is unsafe in "-m TABLE -r QUAD -r LAZY." </li><br>
+<li> For <em>w </em> = 8, <b> region_multiply.w32()</b> is unsafe in "-m TABLE -r DOUBLE -r LAZY."</li><br>
+<li> For <em>w </em> = 16, <b>region_multiply.w32() </b> is unsafe in "-m TABLE."</li><br>
+<li> For <em>w </em> &#8712 {32, 64, 128}, all <b>"SPLIT"</b> implementations are unsafe for <b>region_multiply().</b> This means that if the
+default uses <b>"SPLIT"</b> (see Table 1 for when that occurs), then <b>region_multiply()</b> is not thread safe.</li><br>
+<li> The <b>"COMPOSITE"</b> operations are only safe if the implementations of the underlying fields are safe.</li>
+</ul>
+
+<h2>9 &nbsp; Listing of Procedures </h2>
+
+The following is an alphabetical listing of the procedures, data types and global variables for users to employ in
+GF-complete.<br>
+
+<ul>
+<li> <b>GF_W16_INLINE_DIV()</b> in <b>gf_complete.h:</b> This is a macro for inline division when <em>w </em> = 16. See section 7.1.</li><br>
+<li> <b>GF_W16_INLINE_MULT()</b> in <b>gf_complete.h:</b> This is a macro for inline multiplication when <em>w </em> = 16. See
+section 7.1.</li><br>
+<li> <b>GF_W4_INLINE_MULTDIV()</b> in <b>gf_complete.h:</b> This is a macro for inline multiplication/division when <em>w </em> =
+4. See section 7.1.</li><br>
+
+<li> <b>GF_W8_INLINE_MULTDIV()</b> in <b>gf_complete.h:</b> This is a macro for inline multiplication/division when <em>w </em> =
+8. See section 7.1.</li><br>
+<li> <b>MOA_Fill_Random_Region()</b> in <b>gf_rand.h:</b> Fills a region with random numbers.</li><br>
+<li> <b>MOA_Random_128()</b> in <b>gf_rand.h:</b> Creates a random 128-bit number.</li><br>
+<li> <b>MOA_Random_32()</b> in <b>gf_rand.h:</b> Creates a random 32-bit number. </li><br>
+<li> <b>MOA_Random_64()</b> in <b>gf_rand.h:</b> Creates a random 64-bit number. </li><br>
+<li> <b>MOA_Random_W()</b> in <b>gf_rand.h:</b> Creates a random w-bit number, where <em>w </em> &#8804 32. </li><br>
+<li> <b>MOA_Seed()</b> in <b>gf_rand.h:</b> Sets the seed for the random number generator. </li><br>
+<li> <b>gf_errno</b> in <b>gf_complete.h:</b> This is to help figure out why an initialization call failed. See section 6.1.</li><br>
+<li> <b>gf_create_gf_from_argv()</b> in <b>gf_method.h:</b> Creates a gf_t using C style argc/argv. See section 6.1.1. </li><br>
+<li> <b>gf_division_type_t</b> in <b>gf_complete.h:</b> the different ways to specify division when using <b>gf_init_hard().</b> See 
+section 6.4. </li><br>
+<li> <b>gf_error()</b> in <b>gf_complete.h:</b> This prints out why an initialization call failed. See section 6.1. </li><br>
+
+<li> <b>gf_extract</b> in <b>gf_complete.h:</b> This is the data type of <b>extract_word()</b> in a gf_t. See section 7.9 for an example
+of how to use <b>extract_word()</b>.</li>
+</ul>
+
+
+
+
+
+<br/>
+
+
+9 &nbsp &nbsp  <em>  LISTING OF PROCEDURES     </em>   <span id="index_number">39  </span> <br><br><br>
+<ul>
+<li>
+<b>gf_free()</b> in <b>gf_complete.h:</b> If <b>gf_init_easy(), gf_init_hard()</b> or <b>create_gf_from_argv()</b> allocated memory, this
+frees it. See section 6.4. </li>
+
+<li> <b>gf_func_a_b</b> in <b>gf_complete.h:</b> This is the data type of <b>multiply()</b> and <b>divide()</b> in a gf_t. See section 4.2 for
+examples of how to use <b>multiply()</b> and <b>divide()</b></li><br>
+
+<li> <b>gf_func_a_b</b> in <b>gf_complete.h:</b> This is the data type of <b>multiply()</b> and <b>divide()</b> in a <b>gf_t.</b> See section 4.2 for
+examples of how to use <b>multiply()</b> and <b>divide()</b></li><br>
+
+<li> <b>gf_func_a</b> in <b>gf_complete.h:</b> This is the data type of <b>inverse()</b> in a <b>gf_t</b></li><br>
+
+<li> <b>gf_general_add()</b> in <b>gf_general.h:</b> This adds two <b>gf_general_t's </b></li><br>
+
+<li> <b>gf_general_divide()</b> in <b>gf_general.h:</b> This divides two <b>gf_general_t's </b></li><br>
+
+<li> <b>gf_general_do_region_check() </b> in <b>gf_general.h:</b> This checks a region multiply of <b>gf_general_t's </b></li><br>
+
+<li> <b>gf_general_do_region_multiply() </b> in <b>gf_general.h:</b> This does a region multiply of <b>gf_general_t's </b></li><br>
+
+<li> <b>gf_general_do_single_timing_test()</b> in <b>gf_general.h:</b> Used in <b>gf_time.c </b></li><br>
+
+<li> <b>gf_general_inverse() </b> in <b>gf_general.h:</b> This takes the inverse of a <b>gf_general_t </b></li><br>
+
+<li> <b>gf_general_is_one() </b> in <b>gf_general.h:</b> This tests whether a <b>gf_general_t </b> is one</li><br>
+
+<li> <b>gf_general_is_two() </b> in <b>gf_general.h:</b> This tests whether a <b>gf_general_t  </b>is two</li><br>
+
+<li> <b>gf_general_is_zero() </b> in <b>gf_general.h:</b> This tests whether a <b>gf_general_t </b> is zero</li><br>
+
+<li> <b>gf_general_multiply() </b> in <b>gf_general.h:</b> This multiplies two <b>gf_general_t's.</b> See the implementation of gf_mult.c
+
+for an example</li><br>
+<li> <b>gf_general_s_to_val() </b> in <b>gf_general.h:</b> This converts a string to a <b>gf_general_t.</b> See the implementation of
+gf_mult.c for an example</li><br>
+<li> <b>gf_general_set_one() </b> in <b>gf_general.h:</b> This sets a <b>gf_general_t</b> to one</li><br>
+<li> <b>gf_general_set_random()</b> in <b>gf_general.h:</b> This sets a <b>gf_general_t </b> to a random number</li><br>
+<li> <b>gf_general_set_two()</b> in <b>gf_general.h:</b> This sets a <b>gf_general_t </b> to two</li><br>
+<li> <b>gf_general_set_up_single_timing_test() </b> in <b>gf_general.h:</b> Used in <b>gf_time.c</b></li><br>
+<li> <b>gf_general_set_zero()</b> in <b>gf_general.h:</b> This sets a <b>gf_general_t</b> to zero</li><br>
+<li> <b>gf_general_t</b> in <b>gf_general.h:</b> This is a general data type for all values of w. See the implementation of gf_mult.c
+for examples of using these</li><br>
+<li> <b>gf_general_val_to_s()</b> in <b>gf_general.h:</b> This converts a <b>gf_general_t </b> to a string. See the implementation of
+<b>gf_mult.c</b> for an example</li><br>
+
+<li> <b>gf_init_easy()</b> in <b>gf_complete.h:</b> This is how you initialize a default <b>gf_t.</b> See 4.2 through 4.5 for examples of
+calling <b>gf_init_easy()</b></li><br>
+</ul>
+
+
+
+
+
+
+
+<br/>
+
+
+9 &nbsp &nbsp  <em>  LISTING OF PROCEDURES     </em>   <span id="index_number">40  </span> <br><br><br>
+
+<ul>
+
+<li><b>gf_init_hard()</b> in <b>gf_complete.h: </b> This allows you to initialize a <b>gf_t</b> without using the defaults. See 6.4. We
+recommend calling <b>create_gf_from_argv()</b> when you can, instead of <b>gf_init_hard()</b></li><br>
+
+<li> <b>gf_mult_type_t </b> in <b>gf_complete.h: </b> the different ways to specify multiplication when using <b>gf_init_hard()</b>. See
+section 6.4</li><br>
+
+<li> <b>gf_region_type_t</b> in <b>gf_complete.h: </b> the different ways to specify region multiplication when using <b>gf_init_hard()</b>.
+See section 6.4</li><br>
+
+<li> <b>gf_region</b> in <b>gf_complete.h: </b> This is the data type of <b>multiply_region()</b> in a <b>gf_t.</b> See section 4.3 for an example
+of how to use <b>multiply_region()</b></li><br>
+
+<li> <b>gf_scratch_size()</b> in <b>gf_complete.h: </b> This is how you calculate how much memory a <b>gf_t</b> needs. See section 6.4.</li><br>
+
+<li> <b>gf_size()</b> in <b>gf_complete.h: </b> Returns the memory consumption of a <b>gf_t.</b> See section 6.5.</li><br>
+
+<li> <b>gf_val_128_t</b> in <b>gf_complete.h: </b> This is how you store a value where <em>w </em> &#8804; 128. It is a pointer to two 64-bit
+unsigned integers. See section 4.4</li><br>
+
+
+<li> <b>gf_val_32_t</b> in <b>gf_complete.h: </b> This is how you store a value where <em>w </em> &#8804; 32. It is equivalent to a 32-bit unsigned
+integer. See section 4.2</li><br>
+
+<li> <b>gf_val_64_t</b> in <b>gf_complete.h: </b> This is how you store a value where <em>w </em> &#8804; 64. It is equivalent to a 64-bit unsigned
+integer. See section 4.5</li><br>
+
+<li> <b>gf_w16_get_div_alog_table()</b> in <b>gf_complete.h: </b> This returns a pointer to an inverse logarithm table that can be
+used for inlining division when <em>w </em> = 16. See section 7.1</li><br>
+
+
+<li> <b>gf_w16_get_log_table()</b> in <b>gf_complete.h: </b> This returns a pointer to a logarithm table that can be used for inlining
+when <em>w </em> = 16. See section 7.1</li><br>
+
+
+<li> <b>gf_w16_get_mult_alog_table()</b> in <b>gf_complete.h: </b> This returns a pointer to an inverse logarithm table that can be
+used for inlining multiplication when <em>w </em> = 16. See section 7.1</li><br>
+
+
+<li> <b>gf_w4_get_div_table()</b> in <b>gf_complete.h: </b> This returns a pointer to a division table that can be used for inlining
+when <em>w </em> = 4. See section 7.1</li><br>
+
+
+<li> <b>gf_w4_get_mult_table()</b> in <b>gf_complete.h: </b> This returns a pointer to a multiplication table that can be used for
+inlining when <em>w </em> = 4. See section 7.1</li><br>
+
+<li> <b>gf_w8_get_div_table()</b> in <b>gf_complete.h: </b> This returns a pointer to a division table that can be used for inlining
+when <em>w </em> = 8. See section 7.1</li><br>
+
+<li> <b>gf_w8_get_mult_table()</b> in <b>gf_complete.h: </b> This returns a pointer to a multiplication table that can be used for
+inlining when <em>w </em> = 8. See section 7.1</li><br>
+
+</ul>
+
+
+
+
+
+
+
+
+
+<br/>
+10 &nbsp &nbsp  <em>TROUBLESHOOTING </em>   <span id="index_number">41  </span> <br><br><br>
+
+<ul>
+<li><b> SSE support.</b> Leveraging SSE instructions requires processor support as well as compiler support. For example,
+the Mac OS 10.8.4 (and possibly earlier versions) default compile environment fails to properly compile
+PCLMUL instructions. This issue can be fixed by installing an alternative compiler; see Section 3 for details</li><br>
+
+<li> <b>Initialization segfaults.</b> You have to already have allocated your <b>gf_t</b> before you pass a pointer to it in
+<b>gf_init_easy()</b>, <b>create_gf_from_argv()</b>, or <b>gf_init_hard()</b></li><br>
+
+
+<li> <b>GF-Complete is slower than it should be.</b> Perhaps your machine has SSE, but you haven't specified the SSE
+compilation flags. See section 3 for how to compile using the proper flags</li><br>
+
+
+<li> <b>Bad alignment.</b> If you get alignment errors, see Section 5</li><br>
+
+<li> <b>Mutually exclusive region types.</b> Some combinations of region types are invalid. All valid and implemented
+combinations are printed by <b>gf_methods.c </b></li><br>
+
+<li><b>Incompatible division types.</b> Some choices of multiplication type constrain choice of divide type. For example,
+<b>"COMPOSITE"</b> methods only allow the default division type, which divides by finding inverses (i.e.,
+neither <b>"EUCLID"</b> nor <b>"MATRIX"</b> are allowed). For each multiplication method printed by <b>gf_methods.c,</b> the
+corresponding valid division types are also printed</li><br>
+
+
+<li><b> Arbitrary "GROUP" arguments.</b> The legal arguments to <b>"GROUP"</b> are specified in section 7.5</li><br>
+
+<li> <b> Arbitrary "SPLIT" arguments.</b> The legal arguments to <b>"SPLIT"</b> are specified in section 7.4</li><br>
+
+<li> <b>Threading problems.</b> For threading questions, see Section 8</li><br>
+
+<li> <b>No default polynomial.</b> If you change the polynomial in a base field using <b>"COMPOSITE,"</b> then unless it is
+a special case for which GF-Complete finds a default polynomial, you'll need to specify the polynomial of the
+composite field too. See 7.8.2 for the fields where GF-Complete will support default polynomials</li><br>
+<li> <b>Encoding/decoding with different fields.</b> Certain fields are not compatible. Please see section 7.2 for an
+explanation</li><br>
+
+
+<li> <b>"ALTMAP" is confusing.</b> We agree. Please see section 7.9 for more explanation.</li><br>
+
+<li> <b>I used "ALTMAP" and it doesn't appear to be functioning correctly.</b> With <b>"ALTMAP,"</b> the size of the region and
+its alignment both matter in terms of how <b>multiply_region()</b> behaves. Please see section 7.9 for a
+detailed explanation</li><br>
+
+<li><b>Where are the erasure codes?</b> This library only implements Galois Field arithmetic, which is an underlying
+component for erasure coding. Jerasure will eventually be ported to this library, so that you can have fast erasure
+coding</li><br>
+</ul>
+<h2>11 &nbsp &nbsp Timings </h2>
+
+We don't want to get too detailed with timing, because it is quite machine specific. However, here are the timings on
+an Intel Core i7-3770 CPU running at 3.40 GHz, with 4 &#215 256 KB L2 caches and an 8MB L3 cache. All timings are
+obtained with <b>gf_time</b> or <b>gf_inline_time,</b> in user mode with the machine dedicated solely to running these jobs.
+
+
+
+
+
+
+
+
+
+<br/>
+10 &nbsp &nbsp  <em>TROUBLESHOOTING </em>   <span id="index_number">42  </span> <br><br><br>
+
+<div class="image-cell_5"> </div>
+<center>Figure 4: Speed of doing single multiplications for w &#8712 {4, 8, 16}. </center>
+<h2>11.1 &nbsp Multiply() </h2>
+
+The performance of <b>multiply()</b> is displayed in Figures 4 for w &#8712 {4, 8, 16} and 5 for w &#8712 {32, 64, 128}. These
+numbers were obtained by calling <b>gf_time</b> with the size and iterations both set to 10240. We plot the speed in megaops
+per second.
+
+<p>As would be anticipated, the inlined operations (see section 7.1) outperform the others. Additionally, in all
+cases with the exception of <em>w</em> = 32, the defaults are the fastest performing implementations. With w = 32,
+"CARRY_FREE" is the fastest with an alternate polynomial (see section 7.7). Because we require the defaults to
+use a "standard" polynomial, we cannot use this implementation as the default. </p>
+
+<h2>11.2 &nbsp Divide() </h2>
+
+For the  <b>"TABLE"</b> and <b>"LOG"</b> implementations, the performance of division is the same as multiplication. This means
+that for w &#8712 {4, 8, 16}, it is very fast indeed. For the other implementations, division is implemented with Euclid's
+method, and is several factors slower than multiplication.
+In Figure 6, we plot the speed of a few implementations of the larger word sizes. Compared to the <b>"TABLE"</b> and
+<b>"LOG"</b> implementations for the smaller word sizes, where the speeds are in the hundreds of mega-ops per second,
+these are very slow. Of note is the <b>"COMPOSITE"</b> implementation for <em>w</em> = 32, which is much faster than the others
+
+
+
+
+
+
+
+
+<br/>
+10 &nbsp &nbsp  <em>TROUBLESHOOTING </em>   <span id="index_number">43  </span> <br><br><br>
+
+<div class="image-cell_6"> </div>
+
+<center>Figure 5: Speed of doing single multiplications for w &#8712 {32, 64, 128}. </center><br>
+
+because it uses a special application of Euclid's method, which relies on division in <em>GF(2<sup>16</sup>),</em> which is very fast.<br><br>
+
+<h2>11.3 &nbsp Multiply_Region() </h2>
+
+Tables 3 through 8 show the performance of the various region operations. It should be noted that for <em>GF(2<sup>16 </sup>) </em>
+through <em>GF(2<sup>128</sup>),</em> the default is not the fastest implementation of <b>multiply_region().</b> The reasons for this are outlined
+in section 6.
+<p>
+For these tables, we performed 1GB worth of <b>multiply_region()</b> calls for all regions of size 2<sup>i</sup> bytes for 10 &#8804; i &#8804;
+30. In the table, we plot the fastest speed obtained.</p>
+<p>We note that the performance of <b>"CAUCHY"</b> can be improved with techniques from [LSXP13] and [PSR12].</p>
+
+
+
+
+
+
+
+
+
+<br/>
+<em>REFERENCES </em>   <span id="index_number">44  </span> <br><br><br>
+
+<div class="image-cell_7"> </div>
+
+<center>Figure 6: Speed of doing single divisions for w &#8712 {32, 64, 128}. </center><br>
+
+<center>
+<div id="data2">
+<table cellpadding="2" cellspacing="0" style="text-align:center;font-size:19px">
+
+<tr><th>Method</th> <th>Speed (MB/s)</th> </tr>
+
+<tr><td>-m TABLE (Default) -</td> <td>11879.909</td> </tr>
+<tr><td>-m TABLE -r CAUCHY -</td> <td>9079.712</td> </tr>
+<tr><td>-m BYTWO_b -</td> <td>5242.400</td> </tr>
+<tr><td>-m BYTWO_p -</td> <td>4078.431</td> </tr>
+<tr><td>-m BYTWO_b -r NOSSE -</td> <td>3799.699</td> </tr>
+<tr><td>-m TABLE -r QUAD -</td> <td>3014.315</td> </tr>
+
+<tr><td>-m TABLE -r DOUBLE -</td> <td>2253.627</td> </tr>
+<tr><td>-m TABLE -r NOSSE -</td> <td>2021.237</td> </tr>
+<tr><td>-m TABLE -r NOSSE -</td> <td>1061.497</td> </tr>
+<tr><td>-m LOG -</td> <td>503.310</td> </tr>
+
+
+<tr><td>-m SHIFT -</td> <td>157.749</td> </tr>
+<tr><td>-m CARRY_FREE -</td> <td>86.202</td> </tr>
+</table> <br><br>
+
+</div> </center>
+<center>Table 3: Speed of various calls to <b>multiply_region()</b> for <em>w</em> = 4. </center>
+
+<h3>References </h3>
+
+[Anv09] H. P. Anvin. The mathematics of RAID-6.<a href="http://kernel.org/pub/linux/kernel/people/hpa/raid6.pdf"> http://kernel.org/pub/linux/kernel/people/hpa/
+raid6.pdf,</a> 2009.<br><br>
+
+[BKK<sup>+</sup>95] J. Blomer, M. Kalfane, M. Karpinski, R. Karp, M. Luby, and D. Zuckerman. An XOR-based erasure-resilient
+coding scheme. Technical Report TR-95-048, International Computer Science Institute, August
+1995. <br><br>
+
+[GMS08] K. Greenan, E. Miller, and T. J. Schwartz. Optimizing Galois Field arithmetic for diverse processor
+architectures and applications. In MASCOTS 2008: <em>16th IEEE Symposium on Modeling, Analysis and
+Simulation of Computer and Telecommunication Systems,</em> Baltimore, MD, September 2008.<br><br>
+
+
+[GP97] S. Gao and D. Panario. Tests and constructions of irreducible polynomials over finite fields. In <em> Foundations
+of Computational Mathematics,</em> pages 346–361. Springer Verlag, 1997.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+<br/>
+<em>REFERENCES </em>   <span id="index_number">45  </span> <br><br><br>
+
+
+<center>
+<div id="data2">
+<table cellpadding="2" cellspacing="0" style="text-align:center;font-size:19px">
+
+<tr><th>Method</th> <th>Speed (MB/s)</th> </tr>
+<tr><td>-m SPLIT 8 4 (Default)</td> <td>13279.146</td> </tr>
+<tr><td>-m COMPOSITE 2 - -r ALTMAP -</td> <td>5516.588</td> </tr>
+<tr><td>-m TABLE -r CAUCHY -</td> <td>4968.721</td> </tr>
+<tr><td>-m BYTWO_b -</td> <td>2656.463</td> </tr>
+<tr><td>-m TABLE -r DOUBLE -</td> <td>2561.225</td> </tr>
+<tr><td>-m TABLE -</td> <td>1408.577</td> </tr>
+
+<tr><td>-m BYTWO_b -r NOSSE -</td> <td>1382.409</td> </tr>
+<tr><td>-m BYTWO_p -</td> <td>1376.661</td> </tr>
+<tr><td>-m LOG_ZERO_EXT -</td> <td>1175.739</td> </tr>
+<tr><td>-m LOG_ZERO -</td> <td>1174.694</td> </tr>
+
+
+<tr><td>-m LOG -</td> <td>997.838</td> </tr>
+<tr><td>-m SPLIT 8 4 -r NOSSE -</td> <td>885.897</td> </tr>
+
+
+<tr><td>-m BYTWO_p -r NOSSE -</td> <td>589.520</td> </tr>
+<tr><td>-m COMPOSITE 2 - -</td> <td>327.039</td> </tr>
+
+
+<tr><td>-m SHIFT -</td> <td>106.115</td> </tr>
+
+<tr><td>-m CARRY_FREE -</td> <td>104.299</td> </tr>
+
+
+</div>
+</table> <br><br>
+</div> </center>
+<center>Table 4: Speed of various calls to <b>multiply_region()</b> for <em>w</em> = 8. </center><br><br>
+
+[LBOX12] J. Luo, K. D. Bowers, A. Oprea, and L. Xu. Efficient software implementations of large finite fields
+<em>GF(2<sup>n</sup>) </em> for secure storage applications.<em> ACM Transactions on Storage, 8(2),</em> February 2012.<br><br>
+
+[LD00] J. Lopez and R. Dahab. High-speed software multiplication in f<sub>2<sup>m</sup></sub>. In <em>Annual International Conference
+on Cryptology in India,</em> 2000.<br><br>
+
+[LHy08] H. Li and Q. Huan-yan. Parallelized network coding with SIMD instruction sets. In <em>International Symposium
+on Computer Science and Computational Technology,</em> pages 364-369. IEEE, December 2008.<br><br>
+
+[LSXP13] J. Luo, M. Shrestha, L. Xu, and J. S. Plank. Efficient encoding schedules for XOR-based erasure codes.
+<em>IEEE Transactions on Computing,</em> May 2013.<br><br>
+
+[Mar94] G. Marsaglia. The mother of all random generators.<a href="ftp://ftp.taygeta.com/pub/c/mother.c"> ftp://ftp.taygeta.com/pub/c/mother.
+c,</a> October 1994.<br><br>
+
+[PGM13a] J. S. Plank, K. M. Greenan, and E. L. Miller. A complete treatment of software implementations of
+finite field arithmetic for erasure coding applications. Technical Report UT-CS-13-717, University of
+Tennessee, September 2013.<br><br>
+
+[PGM13b] J. S. Plank, K. M. Greenan, and E. L. Miller. Screaming fast Galois Field arithmetic using Intel SIMD
+instructions. In FAST-2013: <em>11th Usenix Conference on File and Storage Technologies,</em> San Jose, February
+2013.<br><br>
+
+[Pla97] J. S. Plank. A tutorial on Reed-Solomon coding for fault-tolerance in RAID-like systems.<em> Software -
+Practice &amp; Experience,</em> 27(9):995-1012, September 1997.
+
+
+
+
+
+
+
+
+
+
+
+
+<br/>
+<em>REFERENCES </em>   <span id="index_number">46  </span> <br><br><br>
+
+
+<center>
+<div id="data2">
+<table cellpadding="2" cellspacing="0" style="text-align:center;font-size:19px">
+
+<tr><th>Method</th> <th>Speed (MB/s)</th> </tr>
+<tr><td>-m SPLIT 16 4 -r ALTMAP -</td> <td>10460.834</td> </tr>
+<tr><td>-m SPLIT 16 4 -r SSE (Default) - </td> <td>8473.793</td> </tr>
+<tr><td>-m COMPOSITE 2 - -r ALTMAP -</td> <td>5215.073</td> </tr>
+<tr><td>-m LOG -r CAUCHY -</td> <td>2428.824</td> </tr>
+<tr><td>-m TABLE -</td> <td>2319.129</td> </tr>
+<tr><td>-m SPLIT 16 8 -</td> <td>2164.111</td> </tr>
+
+<tr><td>-m SPLIT 8 8 -</td> <td>2163.993</td> </tr>
+<tr><td>-m SPLIT 16 4 -r NOSSE -</td> <td>1148.810</td> </tr>
+<tr><td>-m LOG -</td> <td>1019.896</td> </tr>
+<tr><td>-m LOG_ZERO -</td> <td>1016.814</td> </tr>
+<tr><td>-m BYTWO_b -</td> <td>738.879</td> </tr>
+<tr><td>-m COMPOSITE 2 - -</td> <td>596.819</td> </tr>
+<tr><td>-m BYTWO_p -</td> <td>560.972</td> </tr>
+<tr><td>-m GROUP 4 4 -</td> <td>450.815</td> </tr>
+<tr><td>-m BYTWO_b -r NOSSE -</td> <td>332.967</td> </tr>
+<tr><td>-m BYTWO_p -r NOSSE -</td> <td>249.849</td> </tr>
+<tr><td>-m CARRY_FREE -</td> <td>111.582</td> </tr>
+<tr><td>-m SHIFT -</td> <td>95.813</td> </tr>
+
+
+</div>
+</table> <br><br>
+</div> </center>
+<center>Table 5: Speed of various calls to <b>multiply_region()</b> for <em>w</em> = 16. </center><br><br>
+
+[PMG<sup>+</sup>13] J. S. Plank, E. L. Miller, K. M. Greenan, B. A. Arnold, J. A. Burnum, A. W. Disney, and A. C. McBride.
+GF-Complete: A comprehensive open source library for Galois Field arithmetic. version 1.0. Technical
+Report UT-CS-13-716, University of Tennessee, September 2013.<br><br>
+
+[PSR12] J. S. Plank, C. D. Schuman, and B. D. Robison. Heuristics for optimizing matrix-based erasure codes for
+fault-tolerant storage systems. In DSN-2012:<em> The International Conference on Dependable Systems and
+Networks,</em> Boston, MA, June 2012. IEEE.<br><br>
+
+[Rab89] M. O. Rabin. Efficient dispersal of information for security, load balancing, and fault tolerance. <em>Journal
+of the Association for Computing Machinery,</em> 36(2):335-348, April 1989.
+
+
+
+
+
+
+
+
+
+<br/>
+<em>REFERENCES </em>   <span id="index_number">47  </span> <br><br><br>
+<center>
+<div id="data2">
+<table cellpadding="2" cellspacing="0" style="text-align:center;font-size:19px">
+<tr><th>Method</th> <th>Speed (MB/s)</th> </tr>
+<tr>
+ 
+<td>
+
+-m SPLIT 32 4 -r SSE -r ALTMAP - <br>
+-m SPLIT 32 4 (Default)  <br>
+-m COMPOSITE 2 -m SPLIT 16 4 -r ALTMAP - -r ALTMAP - <br>
+-m COMPOSITE 2 - -r ALTMAP -  <br>
+-m SPLIT 8 8 - <br> 
+-m SPLIT 32 8 - <br> 
+-m SPLIT 32 16 - <br> 
+-m SPLIT 8 8 -r CAUCHY <br> 
+-m SPLIT 32 4 -r NOSSE <br> 
+-m CARRY_FREE -p 0xc5 <br> 
+-m COMPOSITE 2 - <br> 
+-m BYTWO_b - <br> 
+-m BYTWO_p - <br> 
+-m GROUP 4 8 - <br> 
+-m GROUP 4 4 - <br> 
+-m CARRY_FREE - <br> 
+-m BYTWO_b -r NOSSE - <br> 
+-m BYTWO_p -r NOSSE - <br>
+-m SHIFT - <br> 
+
+</td>
+
+<td>
+7185.440 <br>
+5063.966 <br>
+ 4176.440 <br>
+3360.860 <br>
+1345.678 <br>
+1340.656 <br>
+1262.676 <br>
+1143.263  <br>
+ 480.859 <br>
+393.185 <br>
+332.964 <br>
+309.971 <br>
+258.623 <br>
+242.076 <br>
+227.399 <br>
+226.785 <br>
+143.403 <br>
+111.956 <br>
+52.295 <br>
+</td>
+
+
+</tr>
+
+</div>
+</table> <br><br>
+</div> </center>
+<center>Table 6: Speed of various calls to <b>multiply_region()</b> for <em>w</em> = 32. </center><br><br>
+
+<center>
+<div id="data2">
+<table cellpadding="2" cellspacing="0" style="text-align:center;font-size:19px">
+<tr><th>Method</th> <th>Speed (MB/s)</th> </tr>
+<tr>
+ 
+<td>
+-m SPLIT 64 4 -r ALTMAP - <br>
+-m SPLIT 64 4 -r SSE (Default) - <br>
+-m COMPOSITE 2 -m SPLIT 32 4 -r ALTMAP - -r ALTMAP - <br>
+-m COMPOSITE 2 - -r ALTMAP -  <br>
+-m SPLIT 64 16 - <br>
+-m SPLIT 64 8 -  <br>
+-m CARRY_FREE -  <br>
+-m SPLIT 64 4 -r NOSSE - <br>
+-m GROUP 4 4 -  <br>
+-m GROUP 4 8 -  <br>
+-m BYTWO_b -  <br>
+-m BYTWO_p -  <br>
+-m SPLIT 8 8 - <br>
+-m BYTWO_p -r NOSSE - <br>
+-m COMPOSITE 2 - - <br>
+-m BYTWO_b -r NOSSE - <br>
+-m SHIFT - <br>
+
+</td>
+
+<td>3522.798 <br>
+ 2647.862 <br>
+2461.572 <br>
+1860.921 <br>
+1066.490 <br>
+998.461 <br>
+975.290 <br>
+545.479 <br>
+230.137 <br>
+153.947 <br>
+144.052 <br>
+124.538 <br>
+98.892 <br>
+77.912 <br>
+77.522 <br>
+36.391 <br>
+25.282 <br>
+</td>
+
+
+</tr>
+
+</div>
+</table> <br><br>
+</div> </center>
+<center>Table 7: Speed of various calls to <b>multiply_region()</b> for <em>w</em> = 64. </center><br><br>
+
+
+
+
+
+
+
+
+
+
+
+
+
+<br/>
+<em>REFERENCES </em>   <span id="index_number">48  </span> <br><br><br>
+
+<center>
+<div id="data2">
+<table cellpadding="2" cellspacing="0" style="text-align:center;font-size:19px">
+<tr><th>Method</th> <th>Speed (MB/s)</th> </tr>
+<tr>
+ 
+<td>
+
+-m SPLIT 128 4 -r ALTMAP - <br>
+-m COMPOSITE 2 -m SPLIT 64 4 -r ALTMAP - -r ALTMAP - <br> 
+-m COMPOSITE 2 - -r ALTMAP - <br> 
+-m SPLIT 128 8 (Default) - <br>
+-m CARRY_FREE -<br> 
+-m SPLIT 128 4 -<br> 
+-m COMPOSITE 2 - <br>
+-m GROUP 4 8 -<br> 
+-m GROUP 4 4 -<br> 
+-m BYTWO_p -<br> 
+-m BYTWO_b -<br> 
+-m SHIFT -<br> 
+</td>
+
+<td>
+1727.683 <br>
+1385.693 <br>
+1041.456 <br>
+872.619 <br>
+814.030 <br>
+500.133  <br>
+289.207 <br>
+133.583 <br>
+116.187 <br>
+25.162 <br>
+25.157 <br>
+14.183 <br>
+</td>
+
+
+</tr>
+
+</div>
+</table> <br><br>
+</div> </center>
+<center>Table 8: Speed of various calls to <b>multiply_region()</b> for <em>w</em> = 128. </center><br><br>
diff --git a/src/erasure-code/jerasure/gf-complete/manual/image1.png b/src/erasure-code/jerasure/gf-complete/manual/image1.png
new file mode 100644
index 000000000..c0f7d9511
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/manual/image1.png
diff --git a/src/erasure-code/jerasure/gf-complete/manual/image2.png b/src/erasure-code/jerasure/gf-complete/manual/image2.png
new file mode 100644
index 000000000..38ff273df
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/manual/image2.png
diff --git a/src/erasure-code/jerasure/gf-complete/manual/image3.png b/src/erasure-code/jerasure/gf-complete/manual/image3.png
new file mode 100644
index 000000000..0b4667c55
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/manual/image3.png
diff --git a/src/erasure-code/jerasure/gf-complete/manual/image4.png b/src/erasure-code/jerasure/gf-complete/manual/image4.png
new file mode 100644
index 000000000..ba6cf0780
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/manual/image4.png
diff --git a/src/erasure-code/jerasure/gf-complete/manual/image5.png b/src/erasure-code/jerasure/gf-complete/manual/image5.png
new file mode 100644
index 000000000..0e169dd71
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/manual/image5.png
diff --git a/src/erasure-code/jerasure/gf-complete/manual/image6.png b/src/erasure-code/jerasure/gf-complete/manual/image6.png
new file mode 100644
index 000000000..33d4cebb3
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/manual/image6.png
diff --git a/src/erasure-code/jerasure/gf-complete/manual/image7.png b/src/erasure-code/jerasure/gf-complete/manual/image7.png
new file mode 100644
index 000000000..7694e1c07
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/manual/image7.png
diff --git a/src/erasure-code/jerasure/gf-complete/manual/style.css b/src/erasure-code/jerasure/gf-complete/manual/style.css
new file mode 100644
index 000000000..39721946e
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/manual/style.css
@@ -0,0 +1,404 @@
+
+body {
+margin:147px 104px 147px 173px;
+/* Page-wide defaults: the margin above is top/right/bottom/left; text is 20px and justified. */
+
+font-size:20px;
+text-align:justify;
+
+
+
+}
+/* Floats the #index_number element to the right; manual.html puts the page number of each running header in it. */
+#index_number{
+
+float:right;
+
+}
+/* Links: no underline, near-black colour, letter-spaced condensed sans-serif at 19px. */
+a {
+
+text-decoration:none;
+font-size:19px;
+color:#19191F;
+letter-spacing:1.5px;
+
+font-family: 'Roboto Condensed', sans-serif;
+
+
+
+
+
+}
+/*This is page1 css */
+
+#box {
+/* Centred 19px text with a 166px top margin. */
+text-align:center;
+font-size:19px;
+margin-top:166px;
+
+
+}
+
+#body_text{
+/* 18px condensed sans-serif body copy. */
+font-family: 'Roboto Condensed', sans-serif;
+font-size:18px;
+
+}
+/* h1 and h4 inherit font-weight from their parent instead of using the browser's default heading weight. */
+h1{
+font-weight:inherit;
+
+}
+
+h4{
+font-size:22px;
+
+font-weight:inherit;
+
+}
+
+#footer{
+/* Justified 18px text, 1px top/bottom margin, 104px of padding underneath. */
+margin:1px 0px 1px 0px;
+font-size:18px;
+text-align:justify;
+padding-bottom:104px;
+
+
+
+}
+/* Paragraphs: no outer margin, 50px first-line indent, justified 19px text. */
+p {
+margin:0;
+text-indent: 50px;
+font-size:19px;
+text-align:justify;
+
+
+}
+
+/* A thin solid rule along the top edge only. */
+#footer_bar {
+border-top:solid;
+
+border-top-width:thin;
+
+}
+
+#pages_paragraphs {
+margin:1px 115px 1px 57px;
+/* i.e. 115px of right margin and 57px of left margin. */
+
+
+}
+
+#pages_paragraphs_2{
+margin:1px 0px 1px 0px;
+font-size:20px;
+text-align:justify;
+}
+
+.code{
+font-size:22px;
+
+}
+
+
+
+
+/* This is page3 css */
+
+.index{
+font-weight:bold;
+text-align:justify;
+
+}
+/* Same justification as .index, but indented 52px from the left and not bold. */
+.sub_indices {
+
+padding-left:52px;
+text-align:justify;
+}
+
+
+/* 27px of left padding; judging by the name, used to line up entry numbers (usage is not visible here). */
+.aligning_numbers{
+
+padding-left:27px;
+
+
+}
+/* Floats the element to the right edge; judging by the name, for page numbers (usage is not visible here). */
+.aligning_page_number{
+
+
+float:right;
+
+
+}
+
+/* This page 6 css  */
+.box {
+  /* Fixed-height container (223px). */
+  height:223px;
+}
+
+
+/* Figure cell: shows image1.png as a non-repeating background in a 716x300px box floated left. */
+.image-cell_1 {
+  background: url(image1.png) no-repeat;
+  width:716px;
+  height:300px;
+
+  float:left;
+  margin-left:180px;
+  margin-right:134px;
+  /* A duplicate margin-bottom:1px used to sit here; the 31px below always overrode it, so it is dropped. */
+  margin-bottom:31px;
+
+}
+
+
+/* This page 9 and 10 css */
+
+
+
+#number_spacing{
+/* 17px text with 1px of extra letter spacing. */
+letter-spacing:1px;
+font-size:17px;
+
+
+}
+
+/* Same letter spacing as #number_spacing, but 19px text and a 10px left margin. */
+#number_spacing_1{
+
+letter-spacing:1px;
+font-size:19px;
+margin-left:10px;
+
+
+}
+
+/* this page 13 css */
+
+
+.image-cell_2 {
+  background: url(image2.png) no-repeat;
+  width:939px;
+  height:419px;
+  /* Figure cell: image2.png as a non-repeating background in a 939x419px box floated left. */
+  float:left;
+  margin-left:68px;
+  margin-right:134px;
+  /* A duplicate margin-bottom:1px used to sit here; the 31px below always overrode it, so it is dropped. */
+  margin-bottom:31px;
+
+}
+
+/* This is page 14 */
+#data1 table{
+border-top-style:solid;
+border-left-style:solid;
+/* The table draws its top, left and bottom edges; the cells below draw the right-hand (and, for th, bottom) edges. */
+border-bottom-style:solid;
+font-family: 'Roboto Condensed', sans-serif;
+
+}
+
+#data1 th{
+border-bottom-style:solid;
+border-right-style:solid;
+/* "border-right-style:thin" was removed here: thin is a border-width keyword, so browsers ignored the declaration. */
+font-family: 'Roboto Condensed', sans-serif;
+
+
+}
+
+#data1 td {
+border-right-style:solid;
+
+font-family: 'Roboto Condensed', sans-serif;
+
+}
+
+
+/* This is page 28 */
+#table_page28 table{
+border-top-style:solid;
+border-left-style:solid;
+/* The table draws thin top/left/bottom edges; th and td below each draw thin right and bottom edges. */
+border-bottom-style:solid;
+border-top-width:thin;
+border-left-width:thin;
+border-bottom-width:thin;
+font-family: 'Roboto Condensed', sans-serif;
+
+}
+
+#table_page28 th{
+border-bottom-style:solid;
+border-right-style:solid;
+border-right-width:thin;
+border-bottom-width:thin;
+font-family: 'Roboto Condensed', sans-serif;
+
+
+}
+
+#table_page28 td {
+border-right-style:solid;
+border-bottom-style:solid;
+border-bottom-width:thin;
+border-right-width:thin;
+font-family: 'Roboto Condensed', sans-serif;
+
+}
+
+
+/* This is page 30 */
+#table_page30 table{
+border-top-style:solid;
+border-left-style:solid;
+/* Same edge layout as #table_page28, but no widths or font are set, so the browser's default border width applies. */
+border-bottom-style:solid;
+
+}
+
+#table_page30 th{
+border-bottom-style:solid;
+border-right-style:solid;
+
+
+}
+
+#table_page30 td {
+border-right-style:solid;
+border-bottom-style:solid;
+
+
+}
+#box_1 {
+/* 485px-tall container; the negative bottom margin pulls following content up by 61px. */
+height:485px;
+margin-top:44px;
+margin-bottom:-61px;
+
+}
+.image-cell_3 {
+  background: url(image3.png) no-repeat; 
+  width:583px;
+  height:393px;
+  /* image3.png floats left and image4.png (below) floats right -- presumably so the two figures sit side by side. */
+   float:left;
+   
+}
+
+.image-cell_4 {
+  background: url(image4.png) no-repeat; 
+  width:487px;
+  height:390px;
+
+   float:right;
+   
+
+   
+}
+
+/* This is page 42 Css */
+
+
+.image-cell_5 {
+  background: url(image5.png) no-repeat;
+  width:907px;
+  height:592px;
+  /* Figure cell: image5.png as a non-repeating background, floated left. */
+  float:left;
+  margin-right:134px;
+  /* A duplicate margin-bottom:1px used to sit here; the 31px below always overrode it, so it is dropped. */
+  margin-bottom:31px;
+
+}
+
+
+/* This is page 43 Css */
+
+
+.image-cell_6 {
+  background: url(image6.png) no-repeat;
+  width:851px;
+  height:532px;
+  /* Figure cell: image6.png as a non-repeating background; not floated. */
+  margin-right:134px;
+  /* A duplicate margin-bottom:1px used to sit here; the 31px below always overrode it, so it is dropped. */
+  margin-bottom:31px;
+
+}
+
+/* This is page 44 Css */
+
+
+.image-cell_7{
+  background: url(image7.png) no-repeat;
+  width:945px;
+  height:321px;
+  /* Figure cell: image7.png as a non-repeating background; not floated. */
+  margin-right:134px;
+  /* A duplicate margin-bottom:1px used to sit here; the 31px below always overrode it, so it is dropped. */
+  margin-bottom:31px;
+
+}
+
+/* This is page 45 */
+#data2 table{
+border-top-style:solid;
+border-left-style:solid;
+/* Styles the tables inside #data2 (the speed tables in manual.html): 2px black top/left/bottom edges on the table. */
+border-bottom-style:solid;
+border-top-width:2px;
+border-left-width:2px;
+border-bottom-width:2px;
+border-color:black;
+font-family: 'Roboto Condensed', sans-serif;
+
+}
+/* Header cells draw 2px bottom and right edges; data cells (below) draw only a 2px right edge, so rows have no separators. */
+#data2 th{
+border-bottom-style:solid;
+border-right-style:solid;
+border-bottom-width:2px;
+border-right-width:2px;
+font-family: 'Roboto Condensed', sans-serif;
+
+
+}
+ #data2 td {
+border-right-style:solid;
+border-right-width:2px;
+font-family: 'Roboto Condensed', sans-serif;
+
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/erasure-code/jerasure/gf-complete/src/Makefile.am b/src/erasure-code/jerasure/gf-complete/src/Makefile.am
new file mode 100644
index 000000000..cfc2a5062
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/Makefile.am
@@ -0,0 +1,32 @@
+# GF-Complete 'core' AM file
+# Creates the library
+# subdir-objects: build each object next to its source (relevant for the neon/ sources listed below).
+AUTOMAKE_OPTIONS = subdir-objects
+# Headers are looked up in include/ of both the build tree and the source tree.
+AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
+
+# avoid using SIMD_FLAGS for code that calls strcmp as new gcc
+# versions will use SIMD for the strcmp implementation. Instead
+# we create a static library just for gf_method that is not compiled
+# with SIMD_FLAGS, this static library will get linked into gf_complete.so
+noinst_LTLIBRARIES = libgf_util.la
+libgf_util_la_SOURCES = gf_method.c
+libgf_util_la_CFLAGS = -O3 -fPIC -Wsign-compare
+
+# we narrowly use SIMD_FLAGS for code that needs it
+lib_LTLIBRARIES = libgf_complete.la
+libgf_complete_la_SOURCES = gf.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
+          gf_w64.c gf_w128.c gf_rand.c gf_general.c gf_cpu.c
+libgf_complete_la_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC -Wsign-compare
+libgf_complete_la_LIBADD = libgf_util.la
+# The NEON sources join the library only under the HAVE_NEON conditional (presumably set by configure; not visible here).
+if HAVE_NEON
+libgf_complete_la_SOURCES += neon/gf_w4_neon.c  \
+                             neon/gf_w8_neon.c  \
+                             neon/gf_w16_neon.c \
+                             neon/gf_w32_neon.c \
+                             neon/gf_w64_neon.c
+endif
+# libtool's -version-info argument is current:revision:age.
+libgf_complete_la_LDFLAGS = -version-info 1:0:0
+
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf.c b/src/erasure-code/jerasure/gf-complete/src/gf.c
new file mode 100644
index 000000000..84d6996d9
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf.c
@@ -0,0 +1,1090 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf.c
+ *
+ * Generic routines for Galois fields
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "gf_cpu.h"
+
+int _gf_errno = GF_E_DEFAULT;
+
+void gf_error()
+{
+  const char *s;
+  /* Print the message for the current _gf_errno to stderr; s only ever points at string literals, hence const. */
+  switch(_gf_errno) {
+    case GF_E_DEFAULT: s = "No Error."; break;
+    case GF_E_TWOMULT: s = "Cannot specify two -m's."; break;
+    case GF_E_TWO_DIV: s = "Cannot specify two -d's."; break;
+    case GF_E_POLYSPC: s = "-p needs to be followed by a number in hex (0x optional)."; break;
+    case GF_E_GROUPAR: s = "Ran out of arguments in -m GROUP."; break;
+    case GF_E_GROUPNU: s = "In -m GROUP g_s g_r -- g_s and g_r need to be numbers."; break;
+    case GF_E_SPLITAR: s = "Ran out of arguments in -m SPLIT."; break;
+    case GF_E_SPLITNU: s = "In -m SPLIT w_a w_b -- w_a and w_b need to be numbers."; break;
+    case GF_E_FEWARGS: s = "Not enough arguments (Perhaps end with '-'?)"; break;
+    case GF_E_CFM___W: s = "-m CARRY_FREE, w must be 4, 8, 16, 32, 64 or 128."; break;
+    case GF_E_COMPXPP: s = "-m COMPOSITE, No poly specified, and we don't have a default for the given sub-field."; break;
+    case GF_E_BASE__W: s = "-m COMPOSITE and the base field is not for w/2."; break;
+    case GF_E_CFM4POL: s = "-m CARRY_FREE, w=4. (Prim-poly & 0xc) must equal 0."; break;
+    case GF_E_CFM8POL: s = "-m CARRY_FREE, w=8. (Prim-poly & 0x80) must equal 0."; break;
+    case GF_E_CF16POL: s = "-m CARRY_FREE, w=16. (Prim-poly & 0xe000) must equal 0."; break;
+    case GF_E_CF32POL: s = "-m CARRY_FREE, w=32. (Prim-poly & 0xfe000000) must equal 0."; break;
+    case GF_E_CF64POL: s = "-m CARRY_FREE, w=64. (Prim-poly & 0xfffe000000000000ULL) must equal 0."; break;
+    case GF_E_MDEFDIV: s = "If multiplication method == default, can't change division."; break;
+    case GF_E_MDEFREG: s = "If multiplication method == default, can't change region."; break;
+    case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break;
+    case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break;
+    case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break;
+    case GF_E_SIMD_NO: s = "Cannot specify -r SIMD and -r NOSIMD."; break;
+    case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break;
+    case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break;
+    case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break;
+    case GF_E_ARG1SET: s = "Only use arg1 with SPLIT, GROUP or COMPOSITE."; break;
+    case GF_E_ARG2SET: s = "Only use arg2 with SPLIT or GROUP."; break;
+    case GF_E_MATRIXW: s = "Cannot specify -d MATRIX with w > 32."; break;
+    case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break;
+    case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break;
+    case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break;
+    case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SIMD|NOSIMD."; break;
+    case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break;
+    case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break;
+    case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break;
+    case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SIMD|NOSIMD."; break;
+    case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break;
+    case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break;
+    case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break;
+    case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break;
+    case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SIMD|NOSIMD."; break;
+    case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break;
+    case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SIMD|NOSIMD."; break;
+    case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break;
+    case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break;
+    case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SIMD, but SSE2 is not supported."; break;
+    case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break;
+    case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SIMD|NOSIMD."; break;
+    case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break;
+    case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break;
+    case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break;
+    case GF_E_GR_ARGX: s = "With -m GROUP, arg1 and arg2 must be >= 0."; break;
+    case GF_E_GR_W_48: s = "With -m GROUP, w cannot be 4 or 8."; break;
+    case GF_E_GR_W_16: s = "With -m GROUP, w == 16, arg1 and arg2 must be 4."; break;
+    case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break;
+    case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break;
+    case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break;
+    case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SIMD|NOSIMD."; break;
+    case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break;
+    case GF_E_TAB_SSE: s = "With -m TABLE, SIMD|NOSIMD only applies to w=4."; break;
+    case GF_E_TABSSE3: s = "With -m TABLE, -r SIMD, you need SSSE3 supported."; break;
+    case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break;
+    case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break;
+    case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SIMD requires -r ALTMAP."; break;
+    case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break;
+    case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break;
+    case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SIMD|NOSIMD only with arg1/arg2 = 4/128."; break;
+    case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break;
+    case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break;
+    case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break;
+    case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SIMD|NOSIMD only with arg1/arg2 = 4/16."; break;
+    case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break;
+    case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break;
+    case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break;
+    case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SIMD|NOSIMD only with arg1/arg2 = 4/32."; break;
+    case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break;
+    case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break;
+    case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break;
+    case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SIMD|NOSIMD only with arg1/arg2 = 4/64."; break;
+    case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break;
+    case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break;
+    case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SIMD."; break;
+    case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break;
+    case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SIMD and -r NOSIMD do not apply."; break;
+    case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break;
+    case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break;
+    case GF_E_UNKNOWN: s = "Unknown multiplication type."; break;
+    case GF_E_UNK_REG: s = "Unknown region type."; break;
+    case GF_E_UNK_DIV: s = "Unknown division type."; break;
+    default: s = "Undefined error.";
+  }
+  /* One line on stderr; _gf_errno itself is left unchanged. */
+  fprintf(stderr, "%s\n", s);
+}
+
+uint64_t gf_composite_get_default_poly(gf_t *base) 
+{
+  gf_internal_t *h;
+  uint64_t rv;
+  /* Look up the default polynomial for a composite field built over `base`, keyed on the base field's w, mult_type and prim_poly (read from base->scratch); returns 0 when none is tabulated. */
+  h = (gf_internal_t *) base->scratch;
+  if (h->w == 4) {
+    if (h->mult_type == GF_MULT_COMPOSITE) return 0;   /* no default over a composite w=4 base */
+    if (h->prim_poly == 0x13) return 2;
+    return 0;
+  } 
+  if (h->w == 8) {
+    if (h->mult_type == GF_MULT_COMPOSITE) return 0;   /* no default over a composite w=8 base */
+    if (h->prim_poly == 0x11d) return 3;
+    return 0;
+  }
+  if (h->w == 16) {
+    if (h->mult_type == GF_MULT_COMPOSITE) {           /* base is itself composite: recurse one level down */
+      rv = gf_composite_get_default_poly(h->base_gf);
+      if (rv != h->prim_poly) return 0;                /* base must use the default poly for its own sub-field */
+      if (rv == 3) return 0x105;
+      return 0;
+    } else {
+      if (h->prim_poly == 0x1100b) return 2;
+      if (h->prim_poly == 0x1002d) return 7;
+      return 0;
+    }
+  }
+  if (h->w == 32) {
+    if (h->mult_type == GF_MULT_COMPOSITE) {           /* same recursion as for w == 16 */
+      rv = gf_composite_get_default_poly(h->base_gf);
+      if (rv != h->prim_poly) return 0;
+      if (rv == 2) return 0x10005;
+      if (rv == 7) return 0x10008;
+      if (rv == 0x105) return 0x10002;
+      return 0;
+    } else {
+      if (h->prim_poly == 0x400007) return 2;
+      if (h->prim_poly == 0xc5) return 3;
+      return 0;
+    }
+  }
+  if (h->w == 64) {
+    if (h->mult_type == GF_MULT_COMPOSITE) {           /* same recursion as for w == 16 */
+      rv = gf_composite_get_default_poly(h->base_gf);
+      if (rv != h->prim_poly) return 0;
+      if (rv == 3) return 0x100000009ULL;
+      if (rv == 2) return 0x100000004ULL;
+      if (rv == 0x10005) return 0x100000003ULL;
+      if (rv == 0x10002) return 0x100000005ULL;
+      if (rv == 0x10008) return 0x100000006ULL;  /* JSP: (0x100000003 works too, 
+                                                    but I want to differentiate cases). */
+      return 0;
+    } else {
+      if (h->prim_poly == 0x1bULL) return 2;
+      return 0;
+    }
+  }
+  return 0;   /* any other base width has no tabulated default */
+}
+
+int gf_error_check(int w, int mult_type, int region_type, int divide_type,
+                   int arg1, int arg2, uint64_t poly, gf_t *base)
+{
+  int sse3 = 0;
+  int sse2 = 0;
+  int pclmul = 0;
+  int rdouble, rquad, rlazy, rsimd, rnosimd, raltmap, rcauchy, tmp;
+  gf_internal_t *sub;
+
+  rdouble = (region_type & GF_REGION_DOUBLE_TABLE);
+  rquad   = (region_type & GF_REGION_QUAD_TABLE);
+  rlazy   = (region_type & GF_REGION_LAZY);
+  rsimd   = (region_type & GF_REGION_SIMD);
+  rnosimd = (region_type & GF_REGION_NOSIMD);
+  raltmap = (region_type & GF_REGION_ALTMAP);
+  rcauchy = (region_type & GF_REGION_CAUCHY);
+
+  if (divide_type != GF_DIVIDE_DEFAULT &&
+      divide_type != GF_DIVIDE_MATRIX && 
+      divide_type != GF_DIVIDE_EUCLID) {
+    _gf_errno = GF_E_UNK_DIV;
+    return 0;
+  }
+
+  tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY |
+          GF_REGION_SIMD | GF_REGION_NOSIMD | GF_REGION_ALTMAP |
+          GF_REGION_CAUCHY );
+  if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }
+
+#ifdef INTEL_SSE2
+  if (gf_cpu_supports_intel_sse2) {
+    sse2 = 1;
+  }
+#endif
+
+#ifdef INTEL_SSSE3
+  if (gf_cpu_supports_intel_ssse3) {
+    sse3 = 1;
+  }
+#endif
+
+#ifdef INTEL_SSE4_PCLMUL
+  if (gf_cpu_supports_intel_pclmul) {
+    pclmul = 1;
+  }
+#endif
+
+#ifdef ARM_NEON
+  if (gf_cpu_supports_arm_neon) {
+    pclmul = (w == 4 || w == 8);
+    sse3 = 1;
+  }
+#endif
+
+
+  if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; }
+    
+  if (mult_type != GF_MULT_COMPOSITE && w < 64) {
+    if ((poly >> (w+1)) != 0)                   { _gf_errno = GF_E_BADPOLY; return 0; }
+  }
+
+  if (mult_type == GF_MULT_DEFAULT) {
+    if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_MDEFDIV; return 0; }
+    if (region_type != GF_REGION_DEFAULT) { _gf_errno = GF_E_MDEFREG; return 0; }
+    if (arg1 != 0 || arg2 != 0)           { _gf_errno = GF_E_MDEFARG; return 0; }
+    return 1;
+  }
+  
+  if (rsimd && rnosimd)                              { _gf_errno = GF_E_SIMD_NO; return 0; }
+  if (rcauchy && w > 32)                             { _gf_errno = GF_E_CAUGT32; return 0; }
+  if (rcauchy && region_type != GF_REGION_CAUCHY)    { _gf_errno = GF_E_CAUCHYB; return 0; }
+  if (rcauchy && mult_type == GF_MULT_COMPOSITE)     { _gf_errno = GF_E_CAUCOMP; return 0; }
+
+  if (arg1 != 0 && mult_type != GF_MULT_COMPOSITE && 
+      mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
+    _gf_errno = GF_E_ARG1SET;
+    return 0;
+  }
+
+  if (arg2 != 0 && mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
+    _gf_errno = GF_E_ARG2SET;
+    return 0;
+  }
+
+  if (divide_type == GF_DIVIDE_MATRIX && w > 32) { _gf_errno = GF_E_MATRIXW; return 0; }
+
+  if (rdouble) {
+    if (rquad)                      { _gf_errno = GF_E_DOUQUAD; return 0; }
+    if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; }
+    if (w != 4 && w != 8)           { _gf_errno = GF_E_DOUBLEW; return 0; }
+    if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; }
+    if (rlazy && w == 4)            { _gf_errno = GF_E_DOUBLEL; return 0; }
+    return 1;
+  }
+
+  if (rquad) {
+    if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; }
+    if (w != 4)                     { _gf_errno = GF_E_QUAD__W; return 0; }
+    if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; }
+    return 1;
+  }
+
+  if (rlazy)                        { _gf_errno = GF_E_LAZY__X; return 0; }
+
+  if (mult_type == GF_MULT_SHIFT) {
+    if (raltmap)                    { _gf_errno = GF_E_ALTSHIF; return 0; }
+    if (rsimd || rnosimd)           { _gf_errno = GF_E_SSESHIF; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_CARRY_FREE) {
+    if (w != 4 && w != 8 && w != 16 &&
+        w != 32 && w != 64 && w != 128)            { _gf_errno = GF_E_CFM___W; return 0; }
+    if (w == 4 && (poly & 0xc))                    { _gf_errno = GF_E_CFM4POL; return 0; }
+    if (w == 8 && (poly & 0x80))                   { _gf_errno = GF_E_CFM8POL; return 0; }
+    if (w == 16 && (poly & 0xe000))                { _gf_errno = GF_E_CF16POL; return 0; }
+    if (w == 32 && (poly & 0xfe000000))            { _gf_errno = GF_E_CF32POL; return 0; }
+    if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; }
+    if (raltmap)                                   { _gf_errno = GF_E_ALT_CFM; return 0; }
+    if (rsimd || rnosimd)                          { _gf_errno = GF_E_SSE_CFM; return 0; }
+    if (!pclmul)                                   { _gf_errno = GF_E_PCLMULX; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_CARRY_FREE_GK) {
+    if (w != 4 && w != 8 && w != 16 &&
+        w != 32 && w != 64 && w != 128)            { _gf_errno = GF_E_CFM___W; return 0; }
+    if (raltmap)                                   { _gf_errno = GF_E_ALT_CFM; return 0; }
+    if (rsimd || rnosimd)                          { _gf_errno = GF_E_SSE_CFM; return 0; }
+    if (!pclmul)                                   { _gf_errno = GF_E_PCLMULX; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) {
+    if (raltmap)                    { _gf_errno = GF_E_ALT_BY2; return 0; }
+    if (rsimd && !sse2)              { _gf_errno = GF_E_BY2_SSE; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO
+                                     || mult_type == GF_MULT_LOG_ZERO_EXT ) {
+    if (w > 27)                     { _gf_errno = GF_E_LOGBADW; return 0; }
+    if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_LOG___J; return 0; }
+
+    if (mult_type == GF_MULT_LOG_TABLE) return 1;
+
+    if (w != 8 && w != 16)          { _gf_errno = GF_E_ZERBADW; return 0; }
+
+    if (mult_type == GF_MULT_LOG_ZERO) return 1;
+
+    if (w != 8)                     { _gf_errno = GF_E_ZEXBADW; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_GROUP) {
+    if (arg1 <= 0 || arg2 <= 0)                 { _gf_errno = GF_E_GR_ARGX; return 0; }
+    if (w == 4 || w == 8)                       { _gf_errno = GF_E_GR_W_48; return 0; }
+    if (w == 16 && (arg1 != 4 || arg2 != 4))     { _gf_errno = GF_E_GR_W_16; return 0; }
+    if (w == 128 && (arg1 != 4 || 
+       (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; }
+    if (arg1 > 27 || arg2 > 27)                 { _gf_errno = GF_E_GR_A_27; return 0; }
+    if (arg1 > w || arg2 > w)                   { _gf_errno = GF_E_GR_AR_W; return 0; }
+    if (raltmap || rsimd || rnosimd)            { _gf_errno = GF_E_GR____J; return 0; }
+    return 1;
+  }
+  
+  if (mult_type == GF_MULT_TABLE) {
+    if (w != 16 && w >= 15)                     { _gf_errno = GF_E_TABLE_W; return 0; }
+    if (w != 4 && (rsimd || rnosimd))           { _gf_errno = GF_E_TAB_SSE; return 0; }
+    if (rsimd && !sse3)                         { _gf_errno = GF_E_TABSSE3; return 0; }
+    if (raltmap)                                { _gf_errno = GF_E_TAB_ALT; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_SPLIT_TABLE) {
+    if (arg1 > arg2) {
+      tmp = arg1;
+      arg1 = arg2;
+      arg2 = tmp;
+    }
+    if (w == 8) {
+      if (arg1 != 4 || arg2 != 8)               { _gf_errno = GF_E_SP_8_AR; return 0; }
+      if (rsimd && !sse3)                       { _gf_errno = GF_E_SP_SSE3; return 0; }
+      if (raltmap)                              { _gf_errno = GF_E_SP_8__A; return 0; }
+    } else if (w == 16) {
+      if ((arg1 == 8 && arg2 == 8) ||
+          (arg1 == 8 && arg2 == 16)) {
+        if (rsimd || rnosimd)                   { _gf_errno = GF_E_SP_16_S; return 0; }
+        if (raltmap)                            { _gf_errno = GF_E_SP_16_A; return 0; }
+      } else if (arg1 == 4 && arg2 == 16) {
+        if (rsimd && !sse3)                     { _gf_errno = GF_E_SP_SSE3; return 0; }
+      } else                                    { _gf_errno = GF_E_SP_16AR; return 0; }
+    } else if (w == 32) {
+      if ((arg1 == 8 && arg2 == 8) ||
+          (arg1 == 8 && arg2 == 32) ||
+          (arg1 == 16 && arg2 == 32)) {
+        if (rsimd || rnosimd)                   { _gf_errno = GF_E_SP_32_S; return 0; }
+        if (raltmap)                            { _gf_errno = GF_E_SP_32_A; return 0; }
+      } else if (arg1 == 4 && arg2 == 32) {
+        if (rsimd && !sse3)                     { _gf_errno = GF_E_SP_SSE3; return 0; }
+        if (raltmap && !sse3)                   { _gf_errno = GF_E_SP_32AS; return 0; }
+        if (raltmap && rnosimd)                 { _gf_errno = GF_E_SP_32AS; return 0; }
+      } else                                    { _gf_errno = GF_E_SP_32AR; return 0; }
+    } else if (w == 64) {
+      if ((arg1 == 8 && arg2 == 8) ||
+          (arg1 == 8 && arg2 == 64) ||
+          (arg1 == 16 && arg2 == 64)) {
+        if (rsimd || rnosimd)                   { _gf_errno = GF_E_SP_64_S; return 0; }
+        if (raltmap)                            { _gf_errno = GF_E_SP_64_A; return 0; }
+      } else if (arg1 == 4 && arg2 == 64) {
+        if (rsimd && !sse3)                     { _gf_errno = GF_E_SP_SSE3; return 0; }
+        if (raltmap && !sse3)                   { _gf_errno = GF_E_SP_64AS; return 0; }
+        if (raltmap && rnosimd)                 { _gf_errno = GF_E_SP_64AS; return 0; }
+      } else                                    { _gf_errno = GF_E_SP_64AR; return 0; }
+    } else if (w == 128) {
+      if (arg1 == 8 && arg2 == 128) {
+        if (rsimd || rnosimd)                   { _gf_errno = GF_E_SP128_S; return 0; }
+        if (raltmap)                            { _gf_errno = GF_E_SP128_A; return 0; }
+      } else if (arg1 == 4 && arg2 == 128) {
+        if (rsimd && !sse3)                     { _gf_errno = GF_E_SP_SSE3; return 0; }
+        if (raltmap && !sse3)                   { _gf_errno = GF_E_SP128AS; return 0; }
+        if (raltmap && rnosimd)                 { _gf_errno = GF_E_SP128AS; return 0; }
+      } else                                    { _gf_errno = GF_E_SP128AR; return 0; }
+    } else                                      { _gf_errno = GF_E_SPLIT_W; return 0; }
+    return 1;
+  }
+
+  if (mult_type == GF_MULT_COMPOSITE) {
+    if (w != 8 && w != 16 && w != 32 
+               && w != 64 && w != 128)          { _gf_errno = GF_E_COMP__W; return 0; }
+    if (w < 128 && (poly >> (w/2)) != 0)                   { _gf_errno = GF_E_COMP_PP; return 0; }
+    if (divide_type != GF_DIVIDE_DEFAULT)       { _gf_errno = GF_E_DIVCOMP; return 0; }
+    if (arg1 != 2)                              { _gf_errno = GF_E_COMP_A2; return 0; }
+    if (rsimd || rnosimd)                       { _gf_errno = GF_E_COMP_SS; return 0; }
+    if (base != NULL) {
+      sub = (gf_internal_t *) base->scratch;
+      if (sub->w != w/2)                      { _gf_errno = GF_E_BASE__W; return 0; }
+      if (poly == 0) {
+        if (gf_composite_get_default_poly(base) == 0) { _gf_errno = GF_E_COMPXPP; return 0; }
+      }
+    }
+    return 1;
+  }
+
+  _gf_errno = GF_E_UNKNOWN; 
+  return 0;
+}
+
+/* Reports how many bytes of scratch space the given configuration needs.
+   The parameters are first validated with gf_error_check() (using a zero
+   polynomial and no base field); a rejected configuration yields 0.
+   Otherwise the answer comes from the routine that handles word size w. */
+int gf_scratch_size(int w, int mult_type, int region_type, int divide_type,
+                    int arg1, int arg2)
+{
+  int params_ok;
+
+  params_ok = gf_error_check(w, mult_type, region_type, divide_type, arg1, arg2, 0, NULL);
+  if (params_ok == 0) return 0;
+
+  /* The common word sizes have dedicated implementations. */
+  if (w == 4)   return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+  if (w == 8)   return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+  if (w == 16)  return gf_w16_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+  if (w == 32)  return gf_w32_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+  if (w == 64)  return gf_w64_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+  if (w == 128) return gf_w128_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
+
+  /* Every other w is handled by the general-w code. */
+  return gf_wgen_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
+}
+
+/* Total footprint of an initialized gf_t: the struct itself plus its
+   scratch space, plus (recursively) the base field when the field uses
+   GF_MULT_COMPOSITE. */
+extern int gf_size(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  int total = sizeof(gf_t);
+
+  total += gf_scratch_size(h->w, h->mult_type, h->region_type, h->divide_type, h->arg1, h->arg2);
+
+  if (h->mult_type == GF_MULT_COMPOSITE) {
+    total += gf_size(h->base_gf);
+  }
+
+  return total;
+}
+
+
+/* Convenience initializer: sets up gf for word size w using the default
+   multiplication, region and division types, a zero polynomial argument,
+   zero arg1/arg2, no base field and no caller-supplied scratch memory.
+   Returns whatever gf_init_hard() returns. */
+int gf_init_easy(gf_t *gf, int w)
+{
+  int rv;
+
+  rv = gf_init_hard(gf, w,
+                    GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
+                    0,          /* prim_poly */
+                    0, 0,       /* arg1, arg2 */
+                    NULL,       /* base_gf */
+                    NULL);      /* scratch_memory */
+  return rv;
+}
+
+/* Allen: What's going on here is this function is putting info into the
+       scratch mem of gf, and then calling the relevant REAL init
+       func for the word size.  Probably done this way to consolidate
+       those aspects of initialization that don't rely on word size,
+       and then take care of word-size-specific stuff.
+
+   Parameters:
+     gf             - field object to initialize; gf->scratch is set here.
+     w, mult_type, region_type, divide_type, prim_poly, arg1, arg2, base_gf
+                    - configuration, validated by gf_error_check() and then
+                      recorded in the gf_internal_t header.
+     scratch_memory - caller-provided scratch area, or NULL to have this
+                      function malloc() one (the header's free_me flag
+                      records which, so gf_free() knows whether to free it).
+
+   Returns 0 if the configuration is rejected, the scratch size is not
+   positive, or the scratch allocation fails; otherwise returns the result
+   of the word-size-specific init routine.  Note that an allocation
+   failure does not update _gf_errno. */
+
+int gf_init_hard(gf_t *gf, int w, int mult_type, 
+                        int region_type,
+                        int divide_type,
+                        uint64_t prim_poly,
+                        int arg1, int arg2,
+                        gf_t *base_gf,
+                        void *scratch_memory) 
+{
+  int sz;
+  gf_internal_t *h;
+ 
+  gf_cpu_identify();
+
+  if (gf_error_check(w, mult_type, region_type, divide_type, 
+                     arg1, arg2, prim_poly, base_gf) == 0) return 0;
+
+  sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
+  if (sz <= 0) return 0;  /* This shouldn't happen, as all errors should get caught
+                             in gf_error_check() */
+  
+  if (scratch_memory == NULL) {
+    h = (gf_internal_t *) malloc(sz);
+    /* Out of memory: fail like any other init error instead of
+       dereferencing a NULL header below. */
+    if (h == NULL) return 0;
+    h->free_me = 1;
+  } else {
+    h = scratch_memory;
+    h->free_me = 0;
+  }
+  gf->scratch = (void *) h;
+  h->mult_type = mult_type;
+  h->region_type = region_type;
+  h->divide_type = divide_type;
+  h->w = w;
+  h->prim_poly = prim_poly;
+  h->arg1 = arg1;
+  h->arg2 = arg2;
+  h->base_gf = base_gf;
+  /* The implementation-private data starts right after the header. */
+  h->private = (void *) gf->scratch;
+  h->private = (uint8_t *)h->private + (sizeof(gf_internal_t));
+  gf->extract_word.w32 = NULL;
+
+  switch(w) {
+    case 4: return gf_w4_init(gf);
+    case 8: return gf_w8_init(gf);
+    case 16: return gf_w16_init(gf);
+    case 32: return gf_w32_init(gf);
+    case 64: return gf_w64_init(gf);
+    case 128: return gf_w128_init(gf);
+    default: return gf_wgen_init(gf);
+  }
+}
+
+/* Releases the scratch area of gf if it was allocated by the library
+   (free_me set).  With recursive non-zero, a non-NULL base field is
+   first released the same way and then the base gf_t itself is freed.
+   Always returns 0. */
+int gf_free(gf_t *gf, int recursive)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base = h->base_gf;
+
+  if (recursive && base != NULL) {
+    gf_free(base, 1);
+    free(base);
+  }
+
+  if (h->free_me) {
+    free(h);
+  }
+
+  return 0; /* Making compiler happy */
+}
+
+/* Prints an alignment diagnostic to stderr naming the routine s and the
+   required alignment a (in bytes), then trips assert(0). */
+void gf_alignment_error(char *s, int a)
+{
+  /* One call, same three lines of output. */
+  fprintf(stderr,
+          "Alignment error in %s:\n"
+          "   The source and destination buffers must be aligned to each other,\n"
+          "   and they must be aligned to a %d-byte address.\n",
+          s, a);
+  assert(0);
+}
+
+/* Inverts a square bit-matrix over GF(2) by Gaussian elimination.
+
+   mat  - rows x rows matrix, one uint32_t per row, bit c of mat[r] being
+          the entry in row r, column c.  It is modified in place (left in
+          upper-triangular form).
+   inv  - output, same layout; receives the inverse.
+   rows - dimension of the matrix; at most 32 since a row is a uint32_t.
+
+   A singular matrix prints a message and trips assert(0). */
+static 
+void gf_invert_binary_matrix(uint32_t *mat, uint32_t *inv, int rows) {
+  int i, j;
+  uint32_t tmp, bit;
+
+  /* Start inv as the identity.  The shift is done on an unsigned value
+     so that column 31 does not overflow a signed int. */
+  for (i = 0; i < rows; i++) inv[i] = ((uint32_t) 1 << i);
+
+  /* First -- convert into upper triangular */
+
+  for (i = 0; i < rows; i++) {
+    bit = ((uint32_t) 1 << i);
+
+    /* Swap rows if we have a zero i,i element.  If we can't swap, then the
+       matrix was not invertible */
+
+    if ((mat[i] & bit) == 0) {
+      for (j = i+1; j < rows && (mat[j] & bit) == 0; j++) ;
+      if (j == rows) {
+        fprintf(stderr, "galois_invert_matrix: Matrix not invertible!!\n");
+        assert(0);
+        return;  /* with NDEBUG the assert is a no-op: don't index mat[rows] */
+      }
+      tmp = mat[i]; mat[i] = mat[j]; mat[j] = tmp;
+      tmp = inv[i]; inv[i] = inv[j]; inv[j] = tmp;
+    }
+
+    /* Now for each j>i, add A_ji*Ai to Aj */
+    for (j = i+1; j != rows; j++) {
+      if ((mat[j] & bit) != 0) {
+        mat[j] ^= mat[i];
+        inv[j] ^= inv[i];
+      }
+    }
+  }
+
+  /* Now the matrix is upper triangular.  Start at the top and multiply down.
+     Only inv is updated here; mat is left untouched in this phase. */
+
+  for (i = rows-1; i >= 0; i--) {
+    bit = ((uint32_t) 1 << i);
+    for (j = 0; j < i; j++) {
+      if (mat[j] & bit) {
+        inv[j] ^= inv[i];
+      }
+    }
+  }
+}
+
+/* Computes an inverse of y by bit-matrix inversion.
+
+   Row i of the matrix is y shifted left i times, each time reducing with
+   pp when the bit at position w-1 was set and masking to w bits.  The
+   matrix is inverted with gf_invert_binary_matrix() and row 0 of the
+   inverse is returned.
+
+   y  - value to invert.
+   w  - word size in bits; the local arrays hold at most 32 rows.
+   pp - polynomial XORed in on overflow. */
+uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp) 
+{
+  uint32_t mat[32], inv[32], mask, top_bit;
+  int i;
+
+  mask = (w == 32) ? 0xffffffff : ((uint32_t)1 << w) - 1;
+  /* Unsigned shift: for w == 32 a plain (1 << 31) overflows a signed int. */
+  top_bit = (uint32_t)1 << (w-1);
+
+  for (i = 0; i < w; i++) {
+    mat[i] = y;
+
+    if (y & top_bit) {
+      y = y << 1;
+      y = ((y ^ pp) & mask);
+    } else {
+      y = y << 1;
+    }
+  }
+
+  gf_invert_binary_matrix(mat, inv, w);
+  return inv[0];
+}
+
+/* Region multiply driven by a 16-bit lookup table.
+
+   Walks the 64-bit words from rd->s_start / rd->d_start until the
+   destination pointer reaches rd->d_top.  Each source word is split into
+   four 16-bit pieces, most significant first; every piece is replaced by
+   base[piece] and the results are reassembled in the same positions.
+   When rd->xor is set the result is XORed into the destination word,
+   otherwise it overwrites it. */
+void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
+{
+  uint64_t *in = rd->s_start;
+  uint64_t *out = rd->d_start;
+  uint64_t *out_end = rd->d_top;
+  int accumulate = rd->xor;
+  uint64_t word, result;
+  int piece;
+
+  for (; out != out_end; in++, out++) {
+    word = *in;
+    result = 0;
+    for (piece = 0; piece < 4; piece++) {
+      /* Look up the current top 16 bits, then bring the next 16 up. */
+      result = (result << 16) ^ base[word >> 48];
+      word <<= 16;
+    }
+    if (accumulate) result ^= *out;
+    *out = result;
+  }
+}
+
+/* Multiplies the words in [src, s_top) by rd->val one at a time using the
+   field's single-word multiply, writing to (or, when rd->xor is set,
+   XORing into) the matching position in dest.
+
+   The step is w/8 bytes, with a minimum of one byte.  For w == 4 each byte
+   holds two elements, which are multiplied separately and recombined.
+   Word sizes other than 4, 8, 16, 32 and 64 print an error and exit(1)
+   as soon as there is a word to process. */
+static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, void *s_top)
+{
+  gf_internal_t *h;
+  gf_t *gf;
+  uint8_t *sp, *dp;
+  uint32_t in, out;
+  uint64_t out64;
+  int step;
+
+  gf = rd->gf;
+  h = gf->scratch;
+  step = (h->w)/8;
+  if (step == 0) step = 1;
+
+  sp = (uint8_t *) src;
+  dp = (uint8_t *) dest;
+
+  for (; sp < (uint8_t *) s_top; sp += step, dp += step) {
+    switch (h->w) {
+    case 4:
+      /* Low nibble and high nibble are separate field elements. */
+      in = *sp;
+      out = gf->multiply.w32(gf, rd->val, in&0xf);
+      out |= (gf->multiply.w32(gf, rd->val, in >> 4) << 4);
+      if (rd->xor) out ^= *dp;
+      *dp = out;
+      break;
+    case 8:
+      out = gf->multiply.w32(gf, rd->val, *sp);
+      if (rd->xor) out ^= *dp;
+      *dp = out;
+      break;
+    case 16:
+      out = gf->multiply.w32(gf, rd->val, *(uint16_t *) sp);
+      if (rd->xor) out ^= *(uint16_t *) dp;
+      *(uint16_t *) dp = out;
+      break;
+    case 32:
+      out = gf->multiply.w32(gf, rd->val, *(uint32_t *) sp);
+      if (rd->xor) out ^= *(uint32_t *) dp;
+      *(uint32_t *) dp = out;
+      break;
+    case 64:
+      out64 = gf->multiply.w64(gf, rd->val, *(uint64_t *) sp);
+      if (rd->xor) out64 ^= *(uint64_t *) dp;
+      *(uint64_t *) dp = out64;
+      break;
+    default:
+      fprintf(stderr, "Error: gf_slow_multiply_region: w=%d not implemented.\n", h->w);
+      exit(1);
+    }
+  }
+}
+
+/* JSP - The purpose of this procedure is to error check alignment,
+   and to set up the region operation so that it can best leverage
+   large words.
+
+   It stores its information in rd.
+
+   Assuming you're not doing Cauchy coding, (see below for that),
+   then w will be 4, 8, 16, 32 or 64. It can't be 128 (probably
+   should change that).
+
+   src and dest must then be aligned on ceil(w/8)-byte boundaries.
+   Moreover, bytes must be a multiple of ceil(w/8).  If the variable
+   align is equal to ceil(w/8), then we will set s_start = src,
+   d_start = dest, s_top to (src+bytes) and d_top to (dest+bytes).
+   And we return -- the implementation will go ahead and do the
+   multiplication on individual words (e.g. using discrete logs).
+
+   If align is greater than ceil(w/8), then the implementation needs
+   to work on groups of "align" bytes.  For example, suppose you are
+   implementing BYTWO, without SSE. Then you will be doing the region
+   multiplication in units of 8 bytes, so align = 8. Or, suppose you
+   are doing a Quad table in GF(2^4). You will be doing the region
+   multiplication in units of 2 bytes, so align = 2. Or, suppose you
+   are doing split multiplication with SSE operations in GF(2^8).
+   Then align = 16. Worse yet, suppose you are doing split
+   multiplication with SSE operations in GF(2^16), with or without
+   ALTMAP. Then, you will be doing the multiplication on 256 bits at
+   a time.  So align = 32.
+
+   When align does not equal ceil(w/8), we split the region
+   multiplication into three parts.  We are going to make s_start be
+   the first address greater than or equal to src that is a multiple
+   of align.  s_top is going to be the largest address <= src+bytes
+   such that (s_top - s_start) is a multiple of align.  We do the
+   same with d_start and d_top.  When we say that "src and dest must
+   be aligned with respect to each other", we mean that s_start-src
+   must equal d_start-dest.
+
+   Now, the region multiplication is done in three parts -- the part
+   between src and s_start must be done using single words.
+   Similarly, the part between s_top and src+bytes must also be done
+   using single words.  The part between s_start and s_top will be
+   done in chunks of "align" bytes.
+
+   One final thing -- if align > 16, then s_start and d_start will be
+   aligned on a 16 byte boundary.  Perhaps we should have two
+   variables: align and chunksize.  Then we'd have s_start & d_start
+   aligned to "align", and have s_top-s_start be a multiple of
+   chunksize.  That may be less confusing, but it would be a big
+   change.
+
+   Finally, if align = -1, then we are doing Cauchy multiplication,
+   using only XOR's.  In this case, we're not going to care about
+   alignment because we are just doing XOR's.  Instead, the only
+   thing we care about is that bytes must be a multiple of w.
+
+   This is not to say that alignment doesn't matter in performance
+   with XOR's.  See that discussion in gf_multby_one().
+
+   After you call gf_set_region_data(), the procedure
+   gf_do_initial_region_alignment() calls gf->multiply.w32() on
+   everything between src and s_start.  The procedure
+   gf_do_final_region_alignment() calls gf->multiply.w32() on
+   everything between s_top and src+bytes.
+   */
+
+/* Fills in rd for a region operation on [src, src+bytes) -> [dest, dest+bytes).
+
+   rd    - receives gf, src, dest, bytes, val, xor, align and the computed
+           s_start / d_start / s_top / d_top pointers.
+   gf    - field being used, or NULL when only XORs are done (words are then
+           treated as single bytes).
+   align - chunk size in bytes for the middle part of the region, or -1 for
+           Cauchy mode, where no head/tail regions are set up and bytes only
+           has to be a multiple of w.
+
+   Outside Cauchy mode the pointers must agree modulo min(align, 16), src
+   must be aligned to the word size in bytes and bytes must be a multiple of
+   it; violations print a message and trip assert(0).
+
+   If the buffer ends before the first aligned address, the whole region
+   becomes the unaligned head: s_start == s_top == src+bytes (and likewise
+   for dest), so callers never step past the end of the buffer. */
+void gf_set_region_data(gf_region_data *rd,
+  gf_t *gf,
+  void *src,
+  void *dest,
+  int bytes,
+  uint64_t val,
+  int xor,
+  int align)
+{
+  gf_internal_t *h = NULL;
+  int wb;
+  uint32_t a;
+  unsigned long uls, uld;
+
+  if (gf == NULL) {  /* JSP - Can be NULL if you're just doing XOR's */
+    wb = 1;
+  } else {
+    h = gf->scratch;
+    wb = (h->w)/8;
+    if (wb == 0) wb = 1;
+  }
+  
+  rd->gf = gf;
+  rd->src = src;
+  rd->dest = dest;
+  rd->bytes = bytes;
+  rd->val = val;
+  rd->xor = xor;
+  rd->align = align;
+
+  if (align == -1) { /* JSP: This is cauchy.  Error check bytes, then set up the pointers
+                        so that there are no alignment regions. */
+    if (h != NULL && bytes % h->w != 0) {
+      fprintf(stderr, "Error in region multiply operation.\n");
+      fprintf(stderr, "The size must be a multiple of %d bytes.\n", h->w);
+      assert(0);
+    }
+  
+    rd->s_start = src;
+    rd->d_start = dest;
+    rd->s_top = (uint8_t *)src + bytes;
+    rd->d_top = (uint8_t *)dest + bytes;  /* end of the destination, not of the source */
+    return;
+  }
+
+  uls = (unsigned long) src;
+  uld = (unsigned long) dest;
+
+  /* Start addresses are aligned to at most 16 bytes even for larger chunks. */
+  a = (align <= 16) ? align : 16;
+
+  if (uls % a != uld % a) {
+    fprintf(stderr, "Error in region multiply operation.\n");
+    fprintf(stderr, "The source & destination pointers must be aligned with respect\n");
+    fprintf(stderr, "to each other along a %d byte boundary.\n", a);
+    fprintf(stderr, "Src = 0x%lx.  Dest = 0x%lx\n", (unsigned long) src,
+            (unsigned long) dest);
+    assert(0);
+  }
+
+  if (uls % wb != 0) {
+    fprintf(stderr, "Error in region multiply operation.\n");
+    fprintf(stderr, "The pointers must be aligned along a %d byte boundary.\n", wb);
+    fprintf(stderr, "Src = 0x%lx.  Dest = 0x%lx\n", (unsigned long) src,
+            (unsigned long) dest);
+    assert(0);
+  }
+
+  if (bytes % wb != 0) {
+    fprintf(stderr, "Error in region multiply operation.\n");
+    fprintf(stderr, "The size must be a multiple of %d bytes.\n", wb);
+    assert(0);
+  }
+
+  /* uls becomes the number of bytes from src up to the next aligned address. */
+  uls %= a;
+  if (uls != 0) uls = (a-uls);
+
+  /* A short buffer may end before that address; then everything is head. */
+  if (bytes >= 0 && uls > (unsigned long) bytes) uls = (unsigned long) bytes;
+
+  rd->s_start = (uint8_t *)rd->src + uls;
+  rd->d_start = (uint8_t *)rd->dest + uls;
+  bytes -= uls;
+  bytes -= (bytes % align);   /* middle part is a whole number of chunks */
+  rd->s_top = (uint8_t *)rd->s_start + bytes;
+  rd->d_top = (uint8_t *)rd->d_start + bytes;
+
+}
+
+/* Handles the unaligned head of a region: multiplies the words between
+   rd->src and rd->s_start one at a time, writing from rd->dest onward. */
+void gf_do_initial_region_alignment(gf_region_data *rd)
+{
+  void *head_src = rd->src;
+  void *head_dest = rd->dest;
+  void *head_end = rd->s_start;
+
+  gf_slow_multiply_region(rd, head_src, head_dest, head_end);
+}
+
+/* Handles the tail of a region: multiplies the words between rd->s_top
+   and the end of the source buffer (rd->src + rd->bytes) one at a time,
+   writing from rd->d_top onward. */
+void gf_do_final_region_alignment(gf_region_data *rd)
+{
+  void *tail_src = rd->s_top;
+  void *tail_dest = rd->d_top;
+  void *src_end = (uint8_t *)rd->src+rd->bytes;
+
+  gf_slow_multiply_region(rd, tail_src, tail_dest, src_end);
+}
+
+/* Region "multiply by zero".
+
+   dest  - destination buffer.
+   bytes - size of the buffer in bytes; nothing is done when it is <= 0.
+   xor   - when non-zero the product (all zeroes) would be XORed into dest,
+           which leaves dest unchanged, so nothing is done.
+
+   Otherwise dest is cleared.  memset() is used rather than the legacy
+   bzero(). */
+void gf_multby_zero(void *dest, int bytes, int xor) 
+{
+  if (xor) return;
+  if (bytes <= 0) return;
+  memset(dest, 0, (size_t) bytes);
+}
+
+/* JSP - gf_multby_one tries to do this in the most efficient way
+   possible.  If xor = 0, then simply call memcpy() since that
+   should be optimized by the system.  Otherwise, try to do the xor
+   in the following order:
+
+   If src and dest are aligned with respect to each other on 16-byte
+   boundaries and you have SSE instructions, then use aligned SSE
+   instructions.
+
+   If they aren't but you still have SSE instructions, use unaligned
+   SSE instructions.
+
+   If there are no SSE instructions, but they are aligned with
+   respect to each other on 8-byte boundaries, then do them with
+   uint64_t's.
+
+   Otherwise, call gf_unaligned_xor(), which does the following:
+   align a destination pointer along an 8-byte boundary, and then
+   memcpy 64 bytes (8*UNALIGNED_BUFSIZE) at a time from the src
+   pointer to an array of uint64_t's.  I'm not sure if that's the
+   best -- probably needs testing, but this seems like it could be a
+   black hole.
+ */
+
+static void gf_unaligned_xor(void *src, void *dest, int bytes);
+
+/* Region "multiply by one": copies src to dest, or XORs src into dest
+   when xor is non-zero.
+
+   src, dest - buffers of `bytes` bytes each.
+   xor       - 0: plain copy (skipped when dest == src); non-zero: dest ^= src.
+
+   The XOR path is chosen at run time: SSE2 if compiled in and detected,
+   else NEON if compiled in and detected, else 64-bit words when the two
+   pointers agree modulo 8, else gf_unaligned_xor(). */
+void gf_multby_one(void *src, void *dest, int bytes, int xor) 
+{
+  unsigned long uls, uld;
+  uint8_t *s8, *d8;
+  uint64_t *s64, *d64, *dtop64;
+  gf_region_data rd;
+
+  if (!xor) {
+    if (dest != src)
+      memcpy(dest, src, bytes);
+    return;
+  }
+  uls = (unsigned long) src;
+  uld = (unsigned long) dest;
+
+#ifdef   INTEL_SSE2
+  if (gf_cpu_supports_intel_sse2) {
+    __m128i ms, md;
+    int abytes;
+    s8 = (uint8_t *) src;
+    d8 = (uint8_t *) dest;
+    if (uls % 16 == uld % 16) {
+      /* Mutually aligned: byte-wise head, aligned 16-byte middle, byte-wise tail. */
+      gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
+      while (s8 != rd.s_start) {
+        *d8 ^= *s8;
+        d8++;
+        s8++;
+      }
+      while (s8 < (uint8_t *) rd.s_top) {
+        ms = _mm_load_si128 ((__m128i *)(s8));
+        md = _mm_load_si128 ((__m128i *)(d8));
+        md = _mm_xor_si128(md, ms);
+        _mm_store_si128((__m128i *)(d8), md);
+        s8 += 16;
+        d8 += 16;
+      }
+      while (s8 != (uint8_t *) src + bytes) {
+        *d8 ^= *s8;
+        d8++;
+        s8++;
+      }
+      return;
+    }
+
+    /* Not mutually aligned: unaligned loads/stores over the largest
+       multiple of 16 bytes, then a byte-wise tail. */
+    abytes = (bytes & 0xfffffff0);
+
+    while (d8 < (uint8_t *) dest + abytes) {
+      ms = _mm_loadu_si128 ((__m128i *)(s8));
+      md = _mm_loadu_si128 ((__m128i *)(d8));
+      md = _mm_xor_si128(md, ms);
+      _mm_storeu_si128((__m128i *)(d8), md);
+      s8 += 16;
+      d8 += 16;
+    }
+    while (d8 != (uint8_t *) dest+bytes) {
+      *d8 ^= *s8;
+      d8++;
+      s8++;
+    }
+    return;
+  }
+#endif
+#if defined(ARM_NEON)
+  if (gf_cpu_supports_arm_neon) {
+    s8 = (uint8_t *) src;
+    d8 = (uint8_t *) dest;
+
+    if (uls % 16 == uld % 16) {
+      /* Mutually aligned: byte-wise head, then 16 bytes per iteration. */
+      gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
+      while (s8 != rd.s_start) {
+        *d8 ^= *s8;
+        s8++;
+        d8++;
+      }
+      while (s8 < (uint8_t *) rd.s_top) {
+        uint8x16_t vs = vld1q_u8 (s8);
+        uint8x16_t vd = vld1q_u8 (d8);
+        uint8x16_t vr = veorq_u8 (vs, vd);
+        vst1q_u8 (d8, vr);
+        s8 += 16;
+        d8 += 16;
+      }
+    } else {
+      /* Not mutually aligned: 16 bytes per iteration while a full vector remains. */
+      while (s8 + 15 < (uint8_t *) src + bytes) {
+        uint8x16_t vs = vld1q_u8 (s8);
+        uint8x16_t vd = vld1q_u8 (d8);
+        uint8x16_t vr = veorq_u8 (vs, vd);
+        vst1q_u8 (d8, vr);
+        s8 += 16;
+        d8 += 16;
+      }
+    }
+    /* Shared byte-wise tail for both NEON branches. */
+    while (s8 < (uint8_t *) src + bytes) {
+      *d8 ^= *s8;
+      s8++;
+      d8++;
+    }
+    return;
+  }
+#endif
+  /* Scalar fallback.  Pointers that disagree modulo 8 cannot both be
+     walked as uint64_t's, so hand off to the staging-buffer routine. */
+  if (uls % 8 != uld % 8) {
+    gf_unaligned_xor(src, dest, bytes);
+    return;
+  }
+  
+  gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 8);
+  s8 = (uint8_t *) src;
+  d8 = (uint8_t *) dest;
+  /* Byte-wise head up to the first 8-byte boundary. */
+  while (d8 != rd.d_start) {
+    *d8 ^= *s8;
+    d8++;
+    s8++;
+  }
+  dtop64 = (uint64_t *) rd.d_top;
+
+  d64 = (uint64_t *) rd.d_start;
+  s64 = (uint64_t *) rd.s_start;
+
+  /* Middle part, one 64-bit word at a time. */
+  while (d64 < dtop64) {
+    *d64 ^= *s64;
+    d64++;
+    s64++;
+  }
+
+  s8 = (uint8_t *) rd.s_top;
+  d8 = (uint8_t *) rd.d_top;
+
+  /* Byte-wise tail. */
+  while (d8 != (uint8_t *) dest+bytes) {
+    *d8 ^= *s8;
+    d8++;
+    s8++;
+  }
+  return;
+}
+
+/* Number of uint64_t words staged per memcpy() in gf_unaligned_xor(). */
+#define UNALIGNED_BUFSIZE (8)
+
+/* XORs src into dest when the two pointers are not aligned with respect
+   to each other.  Only dest is aligned: the head and tail are done a byte
+   at a time, and the middle is done in chunks of 8*UNALIGNED_BUFSIZE
+   bytes by copying the source bytes into a local uint64_t array and
+   XORing word by word into dest. */
+static void gf_unaligned_xor(void *src, void *dest, int bytes)
+{
+  uint64_t staged[UNALIGNED_BUFSIZE];
+  uint64_t *dword;
+  uint8_t *sbyte = (uint8_t *) src;
+  uint8_t *dbyte = (uint8_t *) dest;
+  uint8_t *dend = (uint8_t *) dest + bytes;
+  gf_region_data rd;
+  int k;
+
+  /* JSP - dest is passed in both pointer slots on purpose: only dest needs
+     to be set up, and passing src would fail the mutual-alignment check.
+     Since gf_set_region_data() caps the start alignment at 16 bytes,
+     d_start ends up 16-byte aligned while the chunk size is
+     8*UNALIGNED_BUFSIZE. */
+  gf_set_region_data(&rd, NULL, dest, dest, bytes, 1, 1, 8*UNALIGNED_BUFSIZE);
+
+  /* Head: single bytes up to the aligned start of dest. */
+  for (; dbyte < (uint8_t *) rd.d_start; dbyte++, sbyte++) {
+    *dbyte ^= *sbyte;
+  }
+
+  /* Middle: stage one chunk of source bytes, then XOR it in as words. */
+  for (dword = (uint64_t *) dbyte; dword < (uint64_t *) rd.d_top; dword += UNALIGNED_BUFSIZE) {
+    memcpy(staged, sbyte, 8*UNALIGNED_BUFSIZE);
+    sbyte += 8*UNALIGNED_BUFSIZE;
+    for (k = 0; k < UNALIGNED_BUFSIZE; k++) {
+      dword[k] ^= staged[k];
+    }
+  }
+
+  /* Tail: whatever is left, byte by byte. */
+  for (dbyte = (uint8_t *) dword; dbyte < dend; dbyte++, sbyte++) {
+    *dbyte ^= *sbyte;
+  }
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_cpu.c b/src/erasure-code/jerasure/gf-complete/src/gf_cpu.c
new file mode 100644
index 000000000..f65131f58
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_cpu.c
@@ -0,0 +1,180 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_cpu.c
+ *
+ * Identifies whether the CPU supports SIMD instructions at runtime.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Set to 1 once gf_cpu_identify() has run, so later calls return early. */
+int gf_cpu_identified = 0;
+
+/* Run-time feature flags.  All start at 0; gf_cpu_identify() sets a flag
+   to 1 only when the matching support was compiled in, is reported by the
+   CPU, and has not been disabled through its GF_COMPLETE_DISABLE_*
+   environment variable. */
+int gf_cpu_supports_intel_pclmul = 0;
+int gf_cpu_supports_intel_sse4 = 0;
+int gf_cpu_supports_intel_ssse3 = 0;
+int gf_cpu_supports_intel_sse3 = 0;
+int gf_cpu_supports_intel_sse2 = 0;
+int gf_cpu_supports_arm_neon = 0;
+
+#if defined(__x86_64__)
+
+/* CPUID Feature Bits */
+/* Masks tested by gf_cpu_identify() against the registers that
+   cpuid(reg, 1) returns: reg[2] for the ECX group, reg[3] for EDX. */
+
+/* ECX */
+#define GF_CPU_SSE3     (1 << 0)
+#define GF_CPU_PCLMUL   (1 << 1)
+#define GF_CPU_SSSE3    (1 << 9)
+#define GF_CPU_SSE41    (1 << 19)
+#define GF_CPU_SSE42    (1 << 20)
+
+/* EDX */
+#define GF_CPU_SSE2     (1 << 26)
+
+/* cpuid(info, x): fill the four-int array info with the CPUID result for
+   leaf x, sub-leaf 0.  The implementation depends on the compiler. */
+#if defined(_MSC_VER)
+
+/* NOTE(review): __cpuidex is presumably declared by <intrin.h>, which is
+   not included here -- confirm for MSVC builds. */
+#define cpuid(info, x)    __cpuidex(info, x, 0)
+
+#elif defined(__GNUC__)
+
+#include <cpuid.h>
+void cpuid(int info[4], int InfoType){
+    __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]);
+}
+
+#else
+
+#error please add a way to detect CPU SIMD support at runtime 
+
+#endif
+
+/* x86-64 variant.  Runs CPUID leaf 1 once and raises each
+   gf_cpu_supports_intel_* flag whose support was compiled in, whose
+   feature bit is set, and whose GF_COMPLETE_DISABLE_* environment
+   variable is not set.  Later calls return immediately. */
+void gf_cpu_identify(void)
+{
+  int regs[4];
+
+  if (gf_cpu_identified) return;
+
+  cpuid(regs, 1);
+
+#if defined(INTEL_SSE4_PCLMUL)
+  if ((regs[2] & GF_CPU_PCLMUL) && getenv("GF_COMPLETE_DISABLE_SSE4_PCLMUL") == NULL) {
+    gf_cpu_supports_intel_pclmul = 1;
+#ifdef DEBUG_CPU_DETECTION
+    printf("#gf_cpu_supports_intel_pclmul\n");
+#endif
+  }
+#endif
+
+#if defined(INTEL_SSE4)
+  /* Either SSE4.1 or SSE4.2 is enough. */
+  if ((regs[2] & (GF_CPU_SSE42 | GF_CPU_SSE41)) && getenv("GF_COMPLETE_DISABLE_SSE4") == NULL) {
+    gf_cpu_supports_intel_sse4 = 1;
+#ifdef DEBUG_CPU_DETECTION
+    printf("#gf_cpu_supports_intel_sse4\n");
+#endif
+  }
+#endif
+
+#if defined(INTEL_SSSE3)
+  if ((regs[2] & GF_CPU_SSSE3) && getenv("GF_COMPLETE_DISABLE_SSSE3") == NULL) {
+    gf_cpu_supports_intel_ssse3 = 1;
+#ifdef DEBUG_CPU_DETECTION
+    printf("#gf_cpu_supports_intel_ssse3\n");
+#endif
+  }
+#endif
+
+#if defined(INTEL_SSE3)
+  if ((regs[2] & GF_CPU_SSE3) && getenv("GF_COMPLETE_DISABLE_SSE3") == NULL) {
+    gf_cpu_supports_intel_sse3 = 1;
+#ifdef DEBUG_CPU_DETECTION
+    printf("#gf_cpu_supports_intel_sse3\n");
+#endif
+  }
+#endif
+
+#if defined(INTEL_SSE2)
+  /* SSE2 is the only flag read from EDX. */
+  if ((regs[3] & GF_CPU_SSE2) && getenv("GF_COMPLETE_DISABLE_SSE2") == NULL) {
+    gf_cpu_supports_intel_sse2 = 1;
+#ifdef DEBUG_CPU_DETECTION
+    printf("#gf_cpu_supports_intel_sse2\n");
+#endif
+  }
+#endif
+
+  gf_cpu_identified = 1;
+}
+
+#elif defined(__arm__) || defined(__aarch64__)
+
+#ifdef __linux__
+
+#include <stdio.h>
+#include <unistd.h>
+#include <elf.h>
+#include <linux/auxvec.h>
+#include <asm/hwcap.h>
+#include <fcntl.h>
+
+/* Looks up one entry of the process's auxiliary vector.
+
+   type - a_type value to search for (the caller in this file passes AT_HWCAP).
+
+   Reads /proc/self/auxv record by record and returns the a_val of the
+   first record whose a_type matches.  Returns 0 when the file cannot be
+   opened, a read fails or comes up short, or no record matches.
+
+   NOTE(review): records are read as Elf32_auxv_t; in this file the result
+   is only consumed under __linux__ && __arm__ -- confirm before using it
+   on a 64-bit target. */
+unsigned long get_hwcap(unsigned long type) {
+    unsigned long hwcap = 0; 
+    int fd = open("/proc/self/auxv", O_RDONLY);
+    if (fd >= 0) {  /* 0 is a valid descriptor; only negative values mean failure */
+        Elf32_auxv_t auxv;
+        /* Stop on EOF (0), error (-1) or a truncated record alike. */
+        while (read(fd, &auxv, sizeof(auxv)) == (ssize_t) sizeof(auxv)) {
+            if (auxv.a_type == type) {
+                hwcap = auxv.a_un.a_val;
+                break;
+            }
+        }
+        close(fd);
+    }
+
+    return hwcap;
+}
+
+#endif // linux
+
+/* ARM / AArch64 variant.  When NEON support was compiled in and the
+   GF_COMPLETE_DISABLE_NEON environment variable is not set, decides once
+   whether gf_cpu_supports_arm_neon should be raised.  Later calls do
+   nothing. */
+void gf_cpu_identify(void)
+{
+  if (!gf_cpu_identified) {
+
+#if defined(ARM_NEON)
+    if (getenv("GF_COMPLETE_DISABLE_NEON") == NULL) {
+#if __linux__ && __arm__
+      /* 32-bit ARM Linux: ask the kernel through the hwcap bits. */
+      gf_cpu_supports_arm_neon = (get_hwcap(AT_HWCAP) & HWCAP_NEON) != 0;
+#elif __aarch64__
+      // ASIMD is supported on all aarch64 architectures
+      gf_cpu_supports_arm_neon = 1;
+#else
+      // we assume that NEON is supported if the compiler supports
+      // NEON and we dont have a reliable way to detect runtime support.
+      gf_cpu_supports_arm_neon = 1;
+#endif
+
+#ifdef DEBUG_CPU_DETECTION
+      if (gf_cpu_supports_arm_neon) {
+        printf("#gf_cpu_supports_arm_neon\n");
+      }
+#endif
+    }
+#endif // defined(ARM_NEON)
+
+    gf_cpu_identified = 1;
+  }
+}
+
+#else // defined(__arm__) || defined(__aarch64__)
+
+/* Fallback for targets that are neither x86-64 nor ARM: no feature flag
+   is raised; it only records that identification has run and returns 0.
+   NOTE(review): this variant returns int while the x86-64 and ARM
+   variants in this file return void -- check against the prototype in
+   the header and make the signatures agree. */
+int gf_cpu_identify(void)
+{
+    gf_cpu_identified = 1;
+    return 0;
+}
+
+#endif
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_general.c b/src/erasure-code/jerasure/gf-complete/src/gf_general.c
new file mode 100644
index 000000000..769f7a082
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_general.c
@@ -0,0 +1,539 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_general.c
+ *
+ * This file has helper routines for doing basic GF operations with any
+ * legal value of w.  The problem is that w <= 32, w=64 and w=128 all have
+ * different data types, which is a pain.  The procedures in this file try
+ * to alleviate that pain.  They are used in gf_unit and gf_time.
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+
+#include "gf_complete.h"
+#include "gf_int.h"
+#include "gf_method.h"
+#include "gf_rand.h"
+#include "gf_general.h"
+/* Store 0 in the member of *v selected by w (w32 for w <= 32, w64 for w <= 64, else w128). */
+void gf_general_set_zero(gf_general_t *v, int w)
+{
+  if (w > 64) {
+    v->w128[0] = 0;
+    v->w128[1] = 0;
+  } else if (w > 32) {
+    v->w64 = 0;
+  } else {
+    v->w32 = 0;
+  }
+}
+/*
+ * Store 1 in the member of *v selected by w (w32 for w <= 32, w64 for
+ * w <= 64, otherwise w128).  In the 128-bit form the 1 goes into w128[1]
+ * and w128[0] is cleared.
+ */
+void gf_general_set_one(gf_general_t *v, int w)
+{
+  if (w <= 32) { v->w32 = 1; return; }
+  if (w <= 64) { v->w64 = 1; return; }
+  v->w128[0] = 0;
+  v->w128[1] = 1;
+}
+/* Store 2 in the member of *v selected by w; for w > 64 that is w128[1] = 2 with w128[0] = 0. */
+void gf_general_set_two(gf_general_t *v, int w)
+{
+  if (w > 64) {
+    v->w128[1] = 2;
+    v->w128[0] = 0;
+  } else if (w > 32) {
+    v->w64 = 2;
+  } else {
+    v->w32 = 2;
+  }
+}
+/*
+ * Return non-zero when the value in *v is 0.  Only the member selected by w
+ * is examined: w32 for w <= 32, w64 for w <= 64, otherwise both words of
+ * w128.
+ */
+int gf_general_is_zero(gf_general_t *v, int w)
+{
+  if (w <= 32) return (v->w32 == 0);
+  if (w <= 64) return (v->w64 == 0);
+  return (v->w128[0] == 0 && v->w128[1] == 0);
+}
+/* Return non-zero when the member of *v selected by w holds 1 (w128: word [0] == 0, word [1] == 1). */
+int gf_general_is_one(gf_general_t *v, int w)
+{
+  if (w > 64) {
+    return (v->w128[0] == 0 && v->w128[1] == 1);
+  } else if (w > 32) {
+    return (v->w64 == 1);
+  } else {
+    return (v->w32 == 1);
+  }
+}
+/* Put a random value in *v (MOA_Random_W for w <= 32, else 64 or 128 random bits); redraws zeros unless zero_ok. */
+void gf_general_set_random(gf_general_t *v, int w, int zero_ok)
+{
+  if (w <= 32) {
+    v->w32 = MOA_Random_W(w, zero_ok);
+    return;
+  }
+  if (w <= 64) {
+    do {
+      v->w64 = MOA_Random_64();
+    } while (v->w64 == 0 && !zero_ok);
+    return;
+  }
+  do {
+    MOA_Random_128(v->w128);
+  } while (v->w128[0] == 0 && v->w128[1] == 0 && !zero_ok);
+}
+/*
+ * Format the value held in *v as a NUL-terminated string in s.
+ *
+ * w selects the member that is read (w32, w64 or w128).  `hex` chooses
+ * hexadecimal over decimal for w <= 64; 128-bit values are always written in
+ * hexadecimal, whatever `hex` says.
+ *
+ * s is not bounds-checked: the longest output is 32 hex digits plus the NUL.
+ */
+void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex)
+{
+  if (w <= 32) {
+    sprintf(s, hex ? "%x" : "%u", v->w32);
+  } else if (w <= 64) {
+    /* The argument is unsigned long long: "%llu", not "%lld", or values >= 2^63 print negative. */
+    sprintf(s, hex ? "%llx" : "%llu", (long long unsigned int) v->w64);
+  } else if (v->w128[0] == 0) {
+    sprintf(s, "%llx", (long long unsigned int) v->w128[1]);
+  } else {
+    /* Word [0] first; word [1] is zero-padded to its full 16 digits. */
+    sprintf(s, "%llx%016llx", (long long unsigned int) v->w128[0],
+                              (long long unsigned int) v->w128[1]);
+  }
+}
+/*
+ * Parse the text in s into *v.  Returns 1 on success, 0 on failure.
+ *
+ * w <= 32: hex or decimal, and the value must fit in w bits.  w <= 64: hex or
+ * decimal.  Otherwise only hex of at most 32 digits is accepted; s is split
+ * in place while the high digits are scanned and restored afterwards.
+ */
+int gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex)
+{
+  int l, ok;
+  int save;
+
+  if (w <= 32) {
+    /* "!= 1" rather than "== 0": sscanf returns EOF (-1) when s holds no digits at all. */
+    if (sscanf(s, hex ? "%x" : "%u", &(v->w32)) != 1) return 0;
+    if (w == 32) return 1;
+    if (w == 31) return (v->w32 & ((gf_val_32_t)1 << 31)) == 0;
+    return (v->w32 & ~((1 << w)-1)) == 0;
+  } else if (w <= 64) {
+    if (hex) return (sscanf(s, "%llx", (long long unsigned int *) (&(v->w64))) == 1);
+    return (sscanf(s, "%llu", (long long unsigned int *) (&(v->w64))) == 1);
+  } else {
+    if (!hex) return 0;
+    l = strlen(s);
+    if (l <= 16) {
+      v->w128[0] = 0;
+      return (sscanf(s, "%llx", (long long unsigned int *) (&(v->w128[1]))) == 1);
+    } else {
+      if (l > 32) return 0;
+      save = s[l-16];
+      s[l-16] = '\0';
+      ok = (sscanf(s, "%llx", (long long unsigned int *) (&(v->w128[0]))) == 1);
+      /* Put the digit back before scanning the low half: otherwise that scan
+         starts at the NUL, always fails, and the caller's string stays cut. */
+      s[l-16] = save;
+      if (!ok) return 0;
+      return (sscanf(s+(l-16), "%llx", (long long unsigned int *) (&(v->w128[1]))) == 1);
+    }
+  }
+}
+/*
+ * c = a + b, computed as the bitwise XOR of the two operands.  The word
+ * size w is read from the gf_internal_t that gf->scratch points at and
+ * selects which member (w32, w64 or w128) is combined.
+ */
+void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c)
+{
+  int w = ((gf_internal_t *) gf->scratch)->w;
+
+  if (w > 64) {
+    c->w128[0] = a->w128[0] ^ b->w128[0];
+    c->w128[1] = a->w128[1] ^ b->w128[1];
+  } else if (w > 32) {
+    c->w64 = a->w64 ^ b->w64;
+  } else {
+    c->w32 = a->w32 ^ b->w32;
+  }
+}
+/*
+ * c = a * b using gf's multiply entry points.  The word size w stored in
+ * gf's scratch area picks the w32, w64 or w128 routine; the 128-bit routine
+ * receives c->w128 as its output argument instead of returning a value.
+ */
+void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c)
+{
+  int w = ((gf_internal_t *) gf->scratch)->w;
+
+  if (w > 64) {
+    gf->multiply.w128(gf, a->w128, b->w128, c->w128);
+  } else if (w > 32) {
+    c->w64 = gf->multiply.w64(gf, a->w64, b->w64);
+  } else {
+    c->w32 = gf->multiply.w32(gf, a->w32, b->w32);
+  }
+}
+/*
+ * c = a / b using gf's divide entry points.  The word size w stored in
+ * gf's scratch area picks the w32, w64 or w128 routine; the 128-bit routine
+ * receives c->w128 as its output argument instead of returning a value.
+ */
+void gf_general_divide(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c)
+{
+  int w = ((gf_internal_t *) gf->scratch)->w;
+
+  if (w > 64) {
+    gf->divide.w128(gf, a->w128, b->w128, c->w128);
+  } else if (w > 32) {
+    c->w64 = gf->divide.w64(gf, a->w64, b->w64);
+  } else {
+    c->w32 = gf->divide.w32(gf, a->w32, b->w32);
+  }
+}
+/*
+ * b = inverse of a, using gf's inverse entry points.  The word size w
+ * stored in gf's scratch area picks the w32, w64 or w128 routine; the
+ * 128-bit routine receives b->w128 as its output argument.
+ */
+void gf_general_inverse(gf_t *gf, gf_general_t *a, gf_general_t *b)
+{
+  int w = ((gf_internal_t *) gf->scratch)->w;
+
+  if (w > 64) {
+    gf->inverse.w128(gf, a->w128, b->w128);
+  } else if (w > 32) {
+    b->w64 = gf->inverse.w64(gf, a->w64);
+  } else {
+    b->w32 = gf->inverse.w32(gf, a->w32);
+  }
+}
+/*
+ * Return non-zero when *v1 and *v2 hold the same value.  Only the member
+ * selected by w is compared: w32 for w <= 32, w64 for w <= 64, otherwise
+ * both words of w128.
+ */
+int gf_general_are_equal(gf_general_t *v1, gf_general_t *v2, int w)
+{
+  if (w <= 32) return (v1->w32 == v2->w32);
+  if (w <= 64) return (v1->w64 == v2->w64);
+  return (v1->w128[0] == v2->w128[0] &&
+          v1->w128[1] == v2->w128[1]);
+}
+/*
+ * Run gf's region multiply on `bytes` bytes with the constant a.  ra and rb
+ * are forwarded unchanged as the second and third arguments, and `xor` as
+ * the last; the word size w from gf's scratch area picks the routine.
+ */
+void gf_general_do_region_multiply(gf_t *gf, gf_general_t *a, void *ra, void *rb, int bytes, int xor)
+{
+  int w = ((gf_internal_t *) gf->scratch)->w;
+
+  if (w > 64) {
+    gf->multiply_region.w128(gf, ra, rb, a->w128, bytes, xor);
+  } else if (w > 32) {
+    gf->multiply_region.w64(gf, ra, rb, a->w64, bytes, xor);
+  } else {
+    gf->multiply_region.w32(gf, ra, rb, a->w32, bytes, xor);
+  }
+}
+/* Check a region multiply word by word: final_target[i] must equal a*orig_a[i], XORed with orig_target[i] when xor is set. */
+void gf_general_do_region_check(gf_t *gf, gf_general_t *a, void *orig_a, void *orig_target, void *final_target, int bytes, int xor)
+{
+  gf_internal_t *h;
+  int w, words, i;
+  gf_general_t oa, ot, ft, sb; /* word i of orig_a, orig_target, final_target, and the expected ("should be") value */
+  char sa[50], soa[50], sot[50], sft[50], ssb[50]; /* printable forms used in the error report */
+
+  h = (gf_internal_t *) gf->scratch;
+  w = h->w;
+
+  words = (bytes * 8) / w; /* number of whole w-bit words in the region */
+  for (i = 0; i < words; i++) {
+    if (w <= 32) {
+      oa.w32 = gf->extract_word.w32(gf, orig_a, bytes, i);
+      ot.w32 = gf->extract_word.w32(gf, orig_target, bytes, i);
+      ft.w32 = gf->extract_word.w32(gf, final_target, bytes, i);
+      sb.w32 = gf->multiply.w32(gf, a->w32, oa.w32);
+      if (xor) sb.w32 ^= ot.w32;
+    } else if (w <= 64) {
+      oa.w64 = gf->extract_word.w64(gf, orig_a, bytes, i);
+      ot.w64 = gf->extract_word.w64(gf, orig_target, bytes, i);
+      ft.w64 = gf->extract_word.w64(gf, final_target, bytes, i);
+      sb.w64 = gf->multiply.w64(gf, a->w64, oa.w64);
+      if (xor) sb.w64 ^= ot.w64;
+    } else {
+      gf->extract_word.w128(gf, orig_a, bytes, i, oa.w128);
+      gf->extract_word.w128(gf, orig_target, bytes, i, ot.w128);
+      gf->extract_word.w128(gf, final_target, bytes, i, ft.w128);
+      gf->multiply.w128(gf, a->w128, oa.w128, sb.w128);
+      if (xor) {
+        sb.w128[0] ^= ot.w128[0];
+        sb.w128[1] ^= ot.w128[1];
+      }
+    }
+
+    if (!gf_general_are_equal(&ft, &sb, w)) {
+      /* Mismatch: print every operand in hex on stderr, then stop at the assert below. */
+      fprintf(stderr,"Problem with region multiply (all values in hex):\n");
+      fprintf(stderr,"   Target address base: 0x%lx.  Word 0x%x of 0x%x.  Xor: %d\n", 
+                 (unsigned long) final_target, i, words, xor);
+      gf_general_val_to_s(a, w, sa, 1);
+      gf_general_val_to_s(&oa, w, soa, 1);
+      gf_general_val_to_s(&ot, w, sot, 1);
+      gf_general_val_to_s(&ft, w, sft, 1);
+      gf_general_val_to_s(&sb, w, ssb, 1);
+      fprintf(stderr,"   Value: %s\n", sa);
+      fprintf(stderr,"   Original source word: %s\n", soa);
+      if (xor) fprintf(stderr,"   XOR with target word: %s\n", sot);
+      fprintf(stderr,"   Product word: %s\n", sft);
+      fprintf(stderr,"   It should be: %s\n", ssb);
+      assert(0); /* has no effect when NDEBUG is defined; the loop then continues */
+    }
+  }
+}
+/* Fill ra and rb (each `size` bytes) with random operands for the single-operation timing test; rb never gets a zero word. */
+void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size)
+{
+  void *top;
+  gf_general_t g;
+  uint8_t *r8, *r8a;
+  uint16_t *r16;
+  uint32_t *r32;
+  uint64_t *r64;
+  int i;
+
+  top = (uint8_t *)rb+size; /* one past the end of rb */
+
+  /* If w is 8, 16, 32, 64 or 128, fill the regions with random bytes.
+     However, don't allow for zeros in rb, because that will screw up
+     division.
+     
+     When w is 4, you fill the regions with random 4-bit words in each byte.
+
+     Otherwise, treat every four bytes as an uint32_t
+     and fill it with a random value mod (1 << w).
+   */
+
+  if (w == 8 || w == 16 || w == 32 || w == 64 || w == 128) {
+    MOA_Fill_Random_Region (ra, size);
+    while (rb < top) { /* one non-zero random word of w bits per iteration */
+      gf_general_set_random(&g, w, 0);
+      switch (w) {
+        case 8: 
+          r8 = (uint8_t *) rb;
+          *r8 = g.w32;
+          break;
+        case 16: 
+          r16 = (uint16_t *) rb;
+          *r16 = g.w32;
+          break;
+        case 32: 
+          r32 = (uint32_t *) rb;
+          *r32 = g.w32;
+          break;
+        case 64:
+          r64 = (uint64_t *) rb;
+          *r64 = g.w64;
+          break;
+        case 128: 
+          r64 = (uint64_t *) rb;
+          r64[0] = g.w128[0];
+          r64[1] = g.w128[1];
+          break;
+      }
+      rb = (uint8_t *)rb + (w/8); /* advance by one word */
+    }
+  } else if (w == 4) {
+    r8a = (uint8_t *) ra;
+    r8 = (uint8_t *) rb;
+    while (r8 < (uint8_t *) top) {
+      gf_general_set_random(&g, w, 1); /* ra may contain zeros */
+      *r8a = g.w32;
+      gf_general_set_random(&g, w, 0); /* rb may not */
+      *r8 = g.w32;
+      r8a++;
+      r8++;
+    }
+  } else {
+    r32 = (uint32_t *) ra;
+    for (i = 0; i < size/4; i++) r32[i] = MOA_Random_W(w, 1);
+    r32 = (uint32_t *) rb;
+    for (i = 0; i < size/4; i++) r32[i] = MOA_Random_W(w, 0);
+  }
+}
+
+/* This sucks, but in order to time, you really need to avoid putting ifs in 
+   the inner loops.  So, I'm doing a separate timing test for each w: 
+   (4 & 8), 16, 32, 64, 128 and everything else.  Fortunately, the "everything else"
+   tests can be equivalent to w=32.
+
+   I'm also putting the results back into ra, because otherwise, the optimizer might
+   figure out that we're not really doing anything in the inner loops and it 
+   will chuck that. */
+/* test is 'M' (multiply), 'D' (divide) or 'I' (inverse); returns the number of words in the timed range, 0 for other w. */
+int gf_general_do_single_timing_test(gf_t *gf, void *ra, void *rb, int size, char test)
+{
+  gf_internal_t *h;
+  void *top;
+  uint8_t *r8a, *r8b, *top8;
+  uint16_t *r16a, *r16b, *top16;
+  uint32_t *r32a, *r32b, *top32;
+  uint64_t *r64a, *r64b, *top64, *r64c;
+  int w, rv;
+
+  h = (gf_internal_t *) gf->scratch;
+  w = h->w;
+  top = (uint8_t *)ra + size; /* one past the end of ra */
+
+  if (w == 8 || w == 4) { /* one operand per byte */
+    r8a = (uint8_t *) ra; 
+    r8b = (uint8_t *) rb; 
+    top8 = (uint8_t *) top;
+    if (test == 'M') {
+      while (r8a < top8) {
+        *r8a = gf->multiply.w32(gf, *r8a, *r8b);
+        r8a++;
+        r8b++;
+      }
+    } else if (test == 'D') {
+      while (r8a < top8) {
+        *r8a = gf->divide.w32(gf, *r8a, *r8b);
+        r8a++;
+        r8b++;
+      }
+    } else if (test == 'I') {
+      while (r8a < top8) {
+        *r8a = gf->inverse.w32(gf, *r8a);
+        r8a++;
+      }
+    }
+    return (top8 - (uint8_t *) ra);
+  }
+
+  if (w == 16) {
+    r16a = (uint16_t *) ra; 
+    r16b = (uint16_t *) rb; 
+    top16 = (uint16_t *) top;
+    if (test == 'M') {
+      while (r16a < top16) {
+        *r16a = gf->multiply.w32(gf, *r16a, *r16b);
+        r16a++;
+        r16b++;
+      }
+    } else if (test == 'D') {
+      while (r16a < top16) {
+        *r16a = gf->divide.w32(gf, *r16a, *r16b);
+        r16a++;
+        r16b++;
+      }
+    } else if (test == 'I') {
+      while (r16a < top16) {
+        *r16a = gf->inverse.w32(gf, *r16a);
+        r16a++;
+      }
+    }
+    return (top16 - (uint16_t *) ra);
+  }
+  if (w <= 32) { /* every remaining w up to 32: one operand per uint32_t */
+    r32a = (uint32_t *) ra; 
+    r32b = (uint32_t *) rb; 
+    top32 = (uint32_t *) ra + (size/4); /* This is for the "everything elses" */
+    
+    if (test == 'M') {
+      while (r32a < top32) {
+        *r32a = gf->multiply.w32(gf, *r32a, *r32b);
+        r32a++;
+        r32b++;
+      }
+    } else if (test == 'D') {
+      while (r32a < top32) {
+        *r32a = gf->divide.w32(gf, *r32a, *r32b);
+        r32a++;
+        r32b++;
+      }
+    } else if (test == 'I') {
+      while (r32a < top32) {
+        *r32a = gf->inverse.w32(gf, *r32a);
+        r32a++;
+      }
+    }
+    return (top32 - (uint32_t *) ra);
+  }
+  if (w == 64) {
+    r64a = (uint64_t *) ra; 
+    r64b = (uint64_t *) rb; 
+    top64 = (uint64_t *) top;
+    if (test == 'M') {
+      while (r64a < top64) {
+        *r64a = gf->multiply.w64(gf, *r64a, *r64b);
+        r64a++;
+        r64b++;
+      }
+    } else if (test == 'D') {
+      while (r64a < top64) {
+        *r64a = gf->divide.w64(gf, *r64a, *r64b);
+        r64a++;
+        r64b++;
+      }
+    } else if (test == 'I') {
+      while (r64a < top64) {
+        *r64a = gf->inverse.w64(gf, *r64a);
+        r64a++;
+      }
+    }
+    return (top64 - (uint64_t *) ra);
+  }
+  if (w == 128) {
+    r64a = (uint64_t *) ra; 
+    r64c = r64a; /* every result is written to the first 128-bit word of ra */
+    r64a += 2; /* operands therefore start at the second word */
+    r64b = (uint64_t *) rb; 
+    top64 = (uint64_t *) top;
+    rv = (top64 - r64a)/2; /* 128-bit words from the second one to the end */
+    if (test == 'M') {
+      while (r64a < top64) {
+        gf->multiply.w128(gf, r64a, r64b, r64c);
+        r64a += 2;
+        r64b += 2;
+      }
+    } else if (test == 'D') {
+      while (r64a < top64) {
+        gf->divide.w128(gf, r64a, r64b, r64c);
+        r64a += 2;
+        r64b += 2;
+      }
+    } else if (test == 'I') {
+      while (r64a < top64) {
+        gf->inverse.w128(gf, r64a, r64c);
+        r64a += 2;
+      }
+    }
+    return rv;
+  }
+  return 0; /* no timing loop for this w */
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_method.c b/src/erasure-code/jerasure/gf-complete/src/gf_method.c
new file mode 100644
index 000000000..2210305d8
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_method.c
@@ -0,0 +1,193 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_method.c
+ *
+ * Parses argv to figure out the mult_type and arguments.  Returns the gf.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "gf_complete.h"
+#include "gf_int.h"
+#include "gf_method.h"
+/* Build gf for word size w from argv[starting..]: any of -m <mult>, -r <region>, -p <hex poly>,
+ * -d <divide>, closed by a lone "-".  Returns the index just past that "-", or 0 on failure
+ * (_gf_errno is set for parse errors).  "-m COMPOSITE k" recursively parses a base field of size w/k. */
+int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting)
+{
+  int mult_type, divide_type, region_type;
+  int arg1, arg2;
+  uint64_t prim_poly;
+  gf_t *base;
+
+  mult_type = GF_MULT_DEFAULT;
+  region_type = GF_REGION_DEFAULT;
+  divide_type = GF_DIVIDE_DEFAULT;
+  prim_poly = 0;
+  base = NULL;
+  arg1 = arg2 = 0;
+  while (1) {
+    if (argc > starting) {
+      if (strcmp(argv[starting], "-m") == 0) {
+        if (++starting >= argc) continue; /* flag without a value: take the GF_E_FEWARGS path below */
+        if (mult_type != GF_MULT_DEFAULT) {
+          if (base != NULL) gf_free(base, 1);
+          _gf_errno = GF_E_TWOMULT;
+          return 0;
+        }
+        if (strcmp(argv[starting], "SHIFT") == 0) {
+          mult_type = GF_MULT_SHIFT;
+          starting++;
+        } else if (strcmp(argv[starting], "CARRY_FREE") == 0) {
+          mult_type = GF_MULT_CARRY_FREE;
+          starting++;
+        } else if (strcmp(argv[starting], "CARRY_FREE_GK") == 0) {
+          mult_type = GF_MULT_CARRY_FREE_GK;
+          starting++;
+        } else if (strcmp(argv[starting], "GROUP") == 0) {
+          mult_type = GF_MULT_GROUP;
+          if (argc < starting + 3) {
+            _gf_errno = GF_E_GROUPAR;
+            return 0;
+          }
+          if (sscanf(argv[starting+1], "%d", &arg1) != 1 || /* != 1 also rejects EOF on "" */
+              sscanf(argv[starting+2], "%d", &arg2) != 1) {
+            _gf_errno = GF_E_GROUPNU;
+            return 0;
+          }
+          starting += 3;
+        } else if (strcmp(argv[starting], "BYTWO_p") == 0) {
+          mult_type = GF_MULT_BYTWO_p;
+          starting++;
+        } else if (strcmp(argv[starting], "BYTWO_b") == 0) {
+          mult_type = GF_MULT_BYTWO_b;
+          starting++;
+        } else if (strcmp(argv[starting], "TABLE") == 0) {
+          mult_type = GF_MULT_TABLE;
+          starting++;
+        } else if (strcmp(argv[starting], "LOG") == 0) {
+          mult_type = GF_MULT_LOG_TABLE;
+          starting++;
+        } else if (strcmp(argv[starting], "LOG_ZERO") == 0) {
+          mult_type = GF_MULT_LOG_ZERO;
+          starting++;
+        } else if (strcmp(argv[starting], "LOG_ZERO_EXT") == 0) {
+          mult_type = GF_MULT_LOG_ZERO_EXT;
+          starting++;
+        } else if (strcmp(argv[starting], "SPLIT") == 0) {
+          mult_type = GF_MULT_SPLIT_TABLE;
+          if (argc < starting + 3) {
+            _gf_errno = GF_E_SPLITAR;
+            return 0;
+          }
+          if (sscanf(argv[starting+1], "%d", &arg1) != 1 ||
+              sscanf(argv[starting+2], "%d", &arg2) != 1) {
+            _gf_errno = GF_E_SPLITNU;
+            return 0;
+          }
+          starting += 3;
+        } else if (strcmp(argv[starting], "COMPOSITE") == 0) {
+          mult_type = GF_MULT_COMPOSITE;
+          if (argc < starting + 2) { _gf_errno = GF_E_FEWARGS; return 0; }
+          if (sscanf(argv[starting+1], "%d", &arg1) != 1 || arg1 <= 0) { /* arg1 divides w below */
+            _gf_errno = GF_E_COMP_A2;
+            return 0;
+          }
+          starting += 2;
+          base = (gf_t *) malloc(sizeof(gf_t));
+          starting = create_gf_from_argv(base, w/arg1, argc, argv, starting);
+          if (starting == 0) {
+            free(base);
+            return 0;
+          }
+        } else {
+          _gf_errno = GF_E_UNKNOWN;
+          return 0;
+        }
+      } else if (strcmp(argv[starting], "-r") == 0) {
+        if (++starting >= argc) continue; /* flag without a value: take the GF_E_FEWARGS path below */
+        if (strcmp(argv[starting], "DOUBLE") == 0) {
+          region_type |= GF_REGION_DOUBLE_TABLE;
+          starting++;
+        } else if (strcmp(argv[starting], "QUAD") == 0) {
+          region_type |= GF_REGION_QUAD_TABLE;
+          starting++;
+        } else if (strcmp(argv[starting], "LAZY") == 0) {
+          region_type |= GF_REGION_LAZY;
+          starting++;
+        } else if (strcmp(argv[starting], "SIMD") == 0) {
+          region_type |= GF_REGION_SIMD;
+          starting++;
+        } else if (strcmp(argv[starting], "NOSIMD") == 0) {
+          region_type |= GF_REGION_NOSIMD;
+          starting++;
+        } else if (strcmp(argv[starting], "SSE") == 0) {
+          region_type |= GF_REGION_SIMD;
+          starting++;
+        } else if (strcmp(argv[starting], "NOSSE") == 0) {
+          region_type |= GF_REGION_NOSIMD;
+          starting++;
+        } else if (strcmp(argv[starting], "CAUCHY") == 0) {
+          region_type |= GF_REGION_CAUCHY;
+          starting++;
+        } else if (strcmp(argv[starting], "ALTMAP") == 0) {
+          region_type |= GF_REGION_ALTMAP;
+          starting++;
+        } else {
+          if (base != NULL) gf_free(base, 1);
+          _gf_errno = GF_E_UNK_REG;
+          return 0;
+        }
+      } else if (strcmp(argv[starting], "-p") == 0) {
+        if (++starting >= argc) continue; /* flag without a value: take the GF_E_FEWARGS path below */
+        if (sscanf(argv[starting], "%llx", (long long unsigned int *)(&prim_poly)) != 1) {
+          if (base != NULL) gf_free(base, 1);
+          _gf_errno = GF_E_POLYSPC;
+          return 0;
+        }
+        starting++;
+      } else if (strcmp(argv[starting], "-d") == 0) {
+        if (++starting >= argc) continue; /* flag without a value: take the GF_E_FEWARGS path below */
+        if (divide_type != GF_DIVIDE_DEFAULT) {
+          if (base != NULL) gf_free(base, 1);
+          _gf_errno = GF_E_TWO_DIV;
+          return 0;
+        } else if (strcmp(argv[starting], "EUCLID") == 0) {
+          divide_type = GF_DIVIDE_EUCLID;
+          starting++;
+        } else if (strcmp(argv[starting], "MATRIX") == 0) {
+          divide_type = GF_DIVIDE_MATRIX;
+          starting++;
+        } else {
+          if (base != NULL) gf_free(base, 1); /* a COMPOSITE base may already exist here */
+          _gf_errno = GF_E_UNK_DIV;
+          return 0;
+        }
+      } else if (strcmp(argv[starting], "-") == 0) {
+        /* printf("Scratch size: %d\n", gf_scratch_size(w,
+                                     mult_type, region_type, divide_type, arg1, arg2)); */
+        if (gf_init_hard(gf, w, mult_type, region_type, divide_type, 
+                         prim_poly, arg1, arg2, base, NULL) == 0) {
+          if (base != NULL) gf_free(base, 1);
+          return 0;
+        } else
+          return starting + 1;
+      } else {
+        if (base != NULL) gf_free(base, 1);
+        _gf_errno = GF_E_UNKFLAG;
+        return 0;
+      }
+    } else {
+      if (base != NULL) gf_free(base, 1);
+      _gf_errno = GF_E_FEWARGS;
+      return 0;
+    }
+  }
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_rand.c b/src/erasure-code/jerasure/gf-complete/src/gf_rand.c
new file mode 100644
index 000000000..a9aa7ad36
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_rand.c
@@ -0,0 +1,80 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_rand.c -- Random number generator.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include "gf_rand.h"
+
+/* Lifted the "Mother of All" random number generator from http://www.agner.org/random/ */
+
+static uint32_t MOA_X[5];
+/* Next 32-bit output: weighted sum of MOA_X[0..3] plus MOA_X[4]; low half becomes MOA_X[0], high half MOA_X[4]. */
+uint32_t MOA_Random_32() {
+  uint64_t acc = (uint64_t)MOA_X[4] + (uint64_t)5115 * (uint64_t)MOA_X[0];
+  acc += (uint64_t)1776 * (uint64_t)MOA_X[1];
+  acc += (uint64_t)1492 * (uint64_t)MOA_X[2];
+  acc += (uint64_t)2111111111UL * (uint64_t)MOA_X[3];
+  MOA_X[3] = MOA_X[2];
+  MOA_X[2] = MOA_X[1];
+  MOA_X[1] = MOA_X[0];
+  MOA_X[0] = (uint32_t)acc;
+  MOA_X[4] = (uint32_t)(acc >> 32);
+  return MOA_X[0];
+}
+/*
+ * 64 random bits from two MOA_Random_32() draws: the first draw becomes the
+ * high 32 bits, the second the low 32 bits.
+ */
+uint64_t MOA_Random_64() {
+  uint64_t hi = MOA_Random_32();
+  uint64_t lo = MOA_Random_32();
+
+  return (hi << 32) | lo;
+}
+/* Fill x[0], then x[1], with 64 random bits each. */
+void MOA_Random_128(uint64_t *x) {
+  int i;
+
+  for (i = 0; i < 2; i++) x[i] = MOA_Random_64();
+}
+/* Random value reduced to w bits for w < 32 (all 32 bits otherwise); zeros are redrawn unless zero_ok. */
+uint32_t MOA_Random_W(int w, int zero_ok)
+{
+  uint32_t b;
+
+  for (;;) {
+    b = MOA_Random_32();
+    if (w < 31) b %= (1 << w);
+    else if (w == 31) b &= 0x7fffffff;
+    if (zero_ok || b != 0) return b;
+  }
+}
+/* Seed the generator: derive the five MOA_X words from seed, then discard the first 19 outputs. */
+void MOA_Seed(uint32_t seed) {
+  uint32_t state = seed;
+  int k;
+
+  for (k = 0; k < 5; k++) {
+    MOA_X[k] = state = state * 29943829 - 1;
+  }
+  for (k = 0; k < 19; k++) MOA_Random_32();
+}
+
+/* Fill `size` bytes at reg with random data: whole 32-bit words first, then
+   any remaining 1-3 bytes one at a time. */
+void MOA_Fill_Random_Region (void *reg, int size)
+{
+  uint32_t *words = (uint32_t *) reg;
+  uint8_t *tail = (uint8_t *) reg;
+  int nwords = size/4;
+  int i;
+
+  for (i = 0; i < nwords; i++) words[i] = MOA_Random_32();
+  for (i = nwords*4; i < size; i++) tail[i] = MOA_Random_W(8, 1);
+}
+
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w128.c b/src/erasure-code/jerasure/gf-complete/src/gf_w128.c
new file mode 100644
index 000000000..3bc2d651a
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_w128.c
@@ -0,0 +1,1776 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w128.c
+ *
+ * Routines for 128-bit Galois fields
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_cpu.h"
+
+#define GF_FIELD_WIDTH (128)
+/* Shift the 128-bit value a[0]:a[1] left by one bit, carrying bit 63 of a[1] into bit 0 of a[0]. */
+#define two_x(a) {\
+  a[0] <<= 1; \
+  if (a[1] & 1ULL << 63) a[0] ^= 1; \
+  a[1] <<= 1; }
+/* Copy the two consecutive words b[j], b[j+1] into a[i], a[i+1]. */
+#define a_get_b(a, i, b, j) {\
+  a[i] = b[j]; \
+  a[i + 1] = b[j + 1];}
+/* Clear the two consecutive words a[i] and a[i+1]. */
+#define set_zero(a, i) {\
+  a[i] = 0; \
+  a[i + 1] = 0;}
+/* NOTE(review): presumably the lookup state for the SPLIT 4,128 method -- confirm against the code that fills it. */
+struct gf_w128_split_4_128_data {
+  uint64_t last_value[2]; /* presumably the 128-bit multiplier the tables were last built for -- TODO confirm */
+  uint64_t tables[2][32][16]; /* 2 x 32 x 16 entries; looks like 32 four-bit slices x 16 values -- TODO confirm */
+};
+/* NOTE(review): presumably the lookup state for the SPLIT 8,128 method -- confirm against the code that fills it. */
+struct gf_w128_split_8_128_data {
+  uint64_t last_value[2]; /* presumably the 128-bit multiplier the tables were last built for -- TODO confirm */
+  uint64_t tables[2][16][256]; /* 2 x 16 x 256 entries; looks like 16 byte slices x 256 values -- TODO confirm */
+};
+/* Pair of 128-bit-word tables; NOTE(review): presumably used by the GROUP multiplication method -- confirm. */
+typedef struct gf_group_tables_s {
+  gf_val_128_t m_table; /* presumably the multiplication table -- TODO confirm */
+  gf_val_128_t r_table; /* presumably the reduction table -- TODO confirm */
+} gf_group_tables_t;
+/* Debug helper: print label s, then the 16 bytes of __m128i r in hex from byte 15 down to byte 0, in groups of four. */
+#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "   " : " ", blah[15-ii]); printf("\n"); }
+/* Region multiply built on gf->multiply.w128: dest[k] = (or ^=, when xor) src[k] * val for each 128-bit word. */
+static
+void
+gf_w128_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes,
+int xor)
+{
+    uint32_t i;
+    gf_val_128_t s128;
+    gf_val_128_t d128;
+    uint64_t c128[2]; /* scratch product, used only on the xor path */
+    gf_region_data rd;
+
+    /* We only do this to check on alignment. */
+    gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
+
+    if (val[0] == 0) { /* shortcuts for multiplying by 0 and by 1 */
+      if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+      if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+    }
+
+    set_zero(c128, 0);
+
+    s128 = (gf_val_128_t) src;
+    d128 = (gf_val_128_t) dest;
+
+    if (xor) {
+      for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) { /* i counts 64-bit words; each element uses two */
+        gf->multiply.w128(gf, &s128[i], val, c128);
+        d128[i] ^= c128[0];
+        d128[i+1] ^= c128[1];
+      }
+    } else {
+      for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
+        gf->multiply.w128(gf, &s128[i], val, &d128[i]); /* product written straight into dest */
+      }
+    }
+}
+/* PCLMUL region multiply: carry-less multiply each 128-bit word of src by val, reduce, then store or XOR into dest. */
+#if defined(INTEL_SSE4_PCLMUL)
+static
+void
+gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes,
+int xor)
+{
+    uint32_t i;
+    gf_val_128_t s128;
+    gf_val_128_t d128;
+    gf_region_data rd;
+    __m128i     a,b;
+    __m128i     result0,result1;
+    __m128i     prim_poly;
+    __m128i     c,d,e,f;
+    gf_internal_t * h = gf->scratch;
+    prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly); /* only the low 32 bits of prim_poly are used */
+    /* We only do this to check on alignment. */
+    gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
+
+    if (val[0] == 0) { /* shortcuts for multiplying by 0 and by 1 */
+      if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+      if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+    }
+
+    s128 = (gf_val_128_t) src;
+    d128 = (gf_val_128_t) dest;
+
+    if (xor) { /* this loop and the one in the else branch differ only in the final two stores */
+      for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
+        a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0);
+        b = _mm_insert_epi64 (a, val[1], 0);
+        a = _mm_insert_epi64 (a, s128[i], 1);
+        b = _mm_insert_epi64 (b, val[0], 1);
+    
+        c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
+        f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
+        e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
+        d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/
+
+        /* now reusing a and b as temporary variables*/
+        result0 = _mm_setzero_si128();
+        result1 = result0;
+
+        result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
+        a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
+        result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));
+
+        a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
+        result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
+        result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
+        /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce. */
+
+        a = _mm_srli_si128 (result0, 8);
+        b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
+        result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
+        result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));
+
+        a = _mm_insert_epi64 (result0, 0, 1);
+        b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
+        result1 = _mm_xor_si128 (result1, b); 
+        d128[i] ^= (uint64_t)_mm_extract_epi64(result1,1);
+        d128[i+1] ^= (uint64_t)_mm_extract_epi64(result1,0);
+      }
+    } else {
+      for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
+        a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0);
+        b = _mm_insert_epi64 (a, val[1], 0);
+        a = _mm_insert_epi64 (a, s128[i], 1);
+        b = _mm_insert_epi64 (b, val[0], 1);
+
+        c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
+        f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
+        e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/ 
+        d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/ 
+
+        /* now reusing a and b as temporary variables*/
+        result0 = _mm_setzero_si128();
+        result1 = result0;
+
+        result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
+        a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
+        result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));
+
+        a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
+        result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
+        result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
+        /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/
+
+        a = _mm_srli_si128 (result0, 8);
+        b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
+        result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
+        result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));
+
+        a = _mm_insert_epi64 (result0, 0, 1);
+        b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
+        result1 = _mm_xor_si128 (result1, b);
+        d128[i] = (uint64_t)_mm_extract_epi64(result1,1);
+        d128[i+1] = (uint64_t)_mm_extract_epi64(result1,0);
+      }
+    }
+}
+#endif
+
+/*
+ * Some w128 notes:
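+ * --Big Endian: element [0] holds the high 64 bits, element [1] the low 64 bits
+ * --return values are written into storage the caller allocated beforehand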
+ */
+
+#define GF_W128_IS_ZERO(val) ((val)[0] == 0 && (val)[1] == 0) /* argument parenthesized so expressions such as p+2 index correctly */
+/* Shift-and-add multiply: c128 = a128 * b128, building the 256-bit product in pl:pr and reducing it with h->prim_poly. */
+void
+gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+  /* ordered highest bit to lowest l[0] l[1] r[0] r[1] */
+  uint64_t pl[2], pr[2], ppl[2], ppr[2], i, a[2], bl[2], br[2], one, lbit;
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+
+  if (GF_W128_IS_ZERO(a128) || GF_W128_IS_ZERO(b128)) { /* a zero operand gives a zero product */
+    set_zero(c128, 0);
+    return;
+  }
+
+  a_get_b(a, 0, a128, 0);
+  a_get_b(br, 0, b128, 0); /* b starts in the right (low) half of the 256-bit bl:br */
+  set_zero(bl, 0);
+
+  one = 1;
+  lbit = (one << 63); /* mask for the top bit of a 64-bit word */
+
+  set_zero(pl, 0);
+  set_zero(pr, 0);
+
+  /* Allen: a*b for right half of a */
+  for (i = 0; i < GF_FIELD_WIDTH/2; i++) {
+    if (a[1] & (one << i)) {
+      pl[1] ^= bl[1];
+      pr[0] ^= br[0];
+      pr[1] ^= br[1];
+    }
+    bl[1] <<= 1; /* shift bl[1]:br[0]:br[1] left by one bit */
+    if (br[0] & lbit) bl[1] ^= 1;
+    br[0] <<= 1;
+    if (br[1] & lbit) br[0] ^= 1;
+    br[1] <<= 1;
+  }
+
+  /* Allen: a*b for left half of a */
+  for (i = 0; i < GF_FIELD_WIDTH/2; i++) {
+    if (a[0] & (one << i)) {
+      pl[0] ^= bl[0];
+      pl[1] ^= bl[1];
+      pr[0] ^= br[0];
+    }
+    bl[0] <<= 1; /* shift bl[0]:bl[1]:br[0] left by one bit */
+    if (bl[1] & lbit) bl[0] ^= 1;
+    bl[1] <<= 1;
+    if (br[0] & lbit) bl[1] ^= 1;
+    br[0] <<= 1;
+  }
+
+  /* Allen: do first half of reduction (based on left quarter of initial product) */
+  one = lbit >> 1;
+  ppl[0] = one; /* Allen: introduce leading one of primitive polynomial */
+  ppl[1] = h->prim_poly >> 2;
+  ppr[0] = h->prim_poly << (GF_FIELD_WIDTH/2-2);
+  ppr[1] = 0;
+  while (one != 0) {
+    if (pl[0] & one) {
+      pl[0] ^= ppl[0];
+      pl[1] ^= ppl[1];
+      pr[0] ^= ppr[0];
+      pr[1] ^= ppr[1];
+    }
+    one >>= 1;
+    ppr[1] >>= 1; /* shift the aligned polynomial ppl:ppr right by one bit */
+    if (ppr[0] & 1) ppr[1] ^= lbit;
+    ppr[0] >>= 1;
+    if (ppl[1] & 1) ppr[0] ^= lbit;
+    ppl[1] >>= 1;
+    if (ppl[0] & 1) ppl[1] ^= lbit;
+    ppl[0] >>= 1;
+  }
+
+  /* Allen: final half of reduction */
+  one = lbit;
+  while (one != 0) {
+    if (pl[1] & one) {
+      pl[1] ^= ppl[1];
+      pr[0] ^= ppr[0];
+      pr[1] ^= ppr[1];
+    }
+    one >>= 1;
+    ppr[1] >>= 1;
+    if (ppr[0] & 1) ppr[1] ^= lbit;
+    ppr[0] >>= 1;
+    if (ppl[1] & 1) ppr[0] ^= lbit;
+    ppl[1] >>= 1;
+  }
+
+  /* Allen: if we really want to optimize this we can just be using c128 instead of pr all along */
+  c128[0] = pr[0];
+  c128[1] = pr[1];
+
+  return;
+}
+
+#if defined(INTEL_SSE4_PCLMUL)
+
+/*
+ * Carry-less-multiply version of c128 = a128 * b128.
+ *
+ * Each operand is loaded with word [1] in the low 64-bit lane and word [0]
+ * in the high lane.  Four 64x64 carry-less products are combined into a
+ * 256-bit product (result0 = upper 128 bits, result1 = lower 128 bits).
+ * The upper half is then folded back in two steps by multiplying it with
+ * the low 32 bits of h->prim_poly.  The reduced value is written to c128
+ * with the high lane in c128[0] and the low lane in c128[1].
+ */
+void
+gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+    __m128i     a,b;
+    __m128i     result0,result1;
+    __m128i     prim_poly;
+    __m128i     c,d,e,f;
+    gf_internal_t * h = gf->scratch;
+    
+    /* b starts from a's low-lane image only to get a zeroed high lane; its
+     * low lane is overwritten immediately with b128[1]. */
+    a = _mm_insert_epi64 (_mm_setzero_si128(), a128[1], 0);
+    b = _mm_insert_epi64 (a, b128[1], 0);
+    a = _mm_insert_epi64 (a, a128[0], 1);
+    b = _mm_insert_epi64 (b, b128[0], 1);
+
+    /* Only the low 32 bits of prim_poly are kept (explicit uint32_t cast). */
+    prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
+
+    /* we need to test algorithm 2 later*/
+    c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
+    f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
+    e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
+    d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/
+    
+    /* now reusing a and b as temporary variables*/
+    result0 = _mm_setzero_si128();
+    result1 = result0;
+
+    /* result0 = d ^ (e >> 64) ^ (f >> 64): upper 128 bits of the product. */
+    result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
+    a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
+    result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));
+
+    /* result1 = c ^ (e << 64) ^ (f << 64): lower 128 bits of the product. */
+    a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
+    result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
+    result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
+    /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/
+    
+    /* Step 1: fold the top 64 carry bits; the product straddles the
+     * result0/result1 boundary, so it is split between both. */
+    a = _mm_srli_si128 (result0, 8);
+    b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
+    result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
+    result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));
+    
+    /* Step 2: fold the remaining low 64 carry bits directly into result1. */
+    a = _mm_insert_epi64 (result0, 0, 1);
+    b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
+    result1 = _mm_xor_si128 (result1, b);
+
+    c128[0] = (uint64_t)_mm_extract_epi64(result1,1);
+    c128[1] = (uint64_t)_mm_extract_epi64(result1,0);
+}
+#endif
+
+/*
+ * BYTWO_p multiply: c128 = a128 * b128.
+ *
+ * Walks the bits of a128 from the most significant bit of word [0] down to
+ * the least significant bit of word [1].  For every bit the accumulator is
+ * doubled (reducing with h->prim_poly when its top bit is shifted out) and
+ * b128 is XORed in when the bit of a128 is set.  The product is kept in
+ * locals and stored to c128 only at the end.
+ */
+void
+gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+  const uint64_t msb = 0x8000000000000000ULL;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  uint64_t poly = h->prim_poly;
+  uint64_t acc_hi = 0, acc_lo = 0;
+  uint64_t overflow, bit;
+  int word;
+
+  for (word = 0; word < 2; word++) {
+    for (bit = msb; bit != 0; bit >>= 1) {
+      /* acc *= x, remembering whether a bit fell off the top. */
+      overflow = acc_hi & msb;
+      acc_hi = (acc_hi << 1) | (acc_lo >> 63);
+      acc_lo <<= 1;
+      if (overflow) acc_lo ^= poly;
+
+      if (a128[word] & bit) {
+        acc_hi ^= b128[0];
+        acc_lo ^= b128[1];
+      }
+    }
+  }
+
+  c128[0] = acc_hi;
+  c128[1] = acc_lo;
+}
+
+#if defined(INTEL_SSE4)
+/*
+ * SSE4 variant of the BYTWO_p multiply: c128 = a128 * b128.
+ *
+ * The 128-bit values live in one __m128i with word [1] in the low 64-bit
+ * lane and word [0] in the high lane.  Because the 64-bit lane shifts do
+ * not carry between lanes, the bit crossing the middle is saved before each
+ * shift and re-inserted by hand.  The first loop consumes the 64 bits of
+ * a128[0], the second the 64 bits of a128[1].
+ */
+void
+gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+  int i;
+  __m128i a, b, pp, prod, amask, u_middle_one; 
+  /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/
+  uint32_t topbit, middlebit, pmask; /* this is used as a boolean value */
+  gf_internal_t *h;
+
+
+  h = (gf_internal_t *) gf->scratch;
+  /* Only the low 32 bits of prim_poly are used (explicit uint32_t cast). */
+  pp = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
+  prod = _mm_setzero_si128();
+  a = _mm_insert_epi64(prod, a128[1], 0x0);
+  a = _mm_insert_epi64(a, a128[0], 0x1);
+  b = _mm_insert_epi64(prod, b128[1], 0x0);
+  b = _mm_insert_epi64(b, b128[0], 0x1);
+  pmask = 0x80000000;
+  /* amask: bit 127 set; u_middle_one: bit 64 set (lowest bit of high lane). */
+  amask = _mm_insert_epi32(prod, 0x80000000, 0x3);
+  u_middle_one = _mm_insert_epi32(prod, 1, 0x2);
+  
+  for (i = 0; i < 64; i++) {
+    /* topbit = bit 127 of prod, middlebit = bit 63 of prod. */
+    topbit = (_mm_extract_epi32(prod, 0x3) & pmask);
+    middlebit = (_mm_extract_epi32(prod, 0x1) & pmask);
+    prod = _mm_slli_epi64(prod, 1); /* this instruction loses the middle bit */
+    if (middlebit) {
+      prod = _mm_xor_si128(prod, u_middle_one);
+    }
+    if (topbit) {
+      prod = _mm_xor_si128(prod, pp);
+    }
+    if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 1))) {
+      prod = _mm_xor_si128(prod, b);
+    }
+    amask = _mm_srli_epi64(amask, 1); /*so does this one, but we can just replace after loop*/
+  }
+  /* The mask bit was shifted out of the high lane; restart it at bit 63. */
+  amask = _mm_insert_epi32(amask, (gf_val_32_t)1 << 31, 0x1);
+  for (i = 64; i < 128; i++) {
+    topbit = (_mm_extract_epi32(prod, 0x3) & pmask);
+    middlebit = (_mm_extract_epi32(prod, 0x1) & pmask);
+    prod = _mm_slli_epi64(prod, 1);
+    if (middlebit) prod = _mm_xor_si128(prod, u_middle_one);
+    if (topbit) prod = _mm_xor_si128(prod, pp);
+    if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 0))) {
+      prod = _mm_xor_si128(prod, b);
+    }
+    amask = _mm_srli_epi64(amask, 1);
+  }
+  c128[0] = (uint64_t)_mm_extract_epi64(prod, 1);
+  c128[1] = (uint64_t)_mm_extract_epi64(prod, 0);
+  return;
+}
+#endif
+
+
+/* Ben: This slow function implements SSE instructions for bytwo_b because why not.
+ *
+ * Computes c128 = a128 * b128.  Note the role swap inside: register `b`
+ * holds a128 and is repeatedly doubled (reduced with h->prim_poly), while
+ * register `a` holds b128 and is consumed one bit at a time from its least
+ * significant end.  The loop returns as soon as no bits remain in `a`.
+ * Both registers keep word [1] in the low lane and word [0] in the high
+ * lane. */
+#if defined(INTEL_SSE4)
+void
+gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+  __m128i a, b, lmask, hmask, pp, c, middle_one;
+  gf_internal_t *h;
+  uint64_t topbit, middlebit;
+
+  h = (gf_internal_t *) gf->scratch;
+  
+  c = _mm_setzero_si128();
+  /* lmask: bit 63 (top of low lane); hmask: bit 127 (top of high lane). */
+  lmask = _mm_insert_epi64(c, 1ULL << 63, 0);
+  hmask = _mm_insert_epi64(c, 1ULL << 63, 1);
+  b = _mm_insert_epi64(c, a128[0], 1);
+  b = _mm_insert_epi64(b, a128[1], 0);
+  a = _mm_insert_epi64(c, b128[0], 1);
+  a = _mm_insert_epi64(a, b128[1], 0);
+  pp = _mm_insert_epi64(c, h->prim_poly, 0);
+  /* middle_one: bit 64, used to carry a bit from the low into the high lane. */
+  middle_one = _mm_insert_epi64(c, 1, 0x1);
+
+  while (1) {
+    if (_mm_extract_epi32(a, 0x0) & 1) {
+      c = _mm_xor_si128(c, b);
+    }
+    /* a >>= 1 across both lanes: bit 64 is saved and re-inserted at bit 63. */
+    middlebit = (_mm_extract_epi32(a, 0x2) & 1);
+    a = _mm_srli_epi64(a, 1);
+    if (middlebit) a = _mm_xor_si128(a, lmask);
+    if ((_mm_extract_epi64(a, 0x1) == 0ULL) && (_mm_extract_epi64(a, 0x0) == 0ULL)){
+      c128[0] = _mm_extract_epi64(c, 0x1);
+      c128[1] = _mm_extract_epi64(c, 0x0);
+      return;
+    }
+    /* b <<= 1 across both lanes, then reduce if bit 127 was shifted out. */
+    topbit = (_mm_extract_epi64(_mm_and_si128(b, hmask), 1));
+    middlebit = (_mm_extract_epi64(_mm_and_si128(b, lmask), 0));
+    b = _mm_slli_epi64(b, 1);
+    if (middlebit) b = _mm_xor_si128(b, middle_one);
+    if (topbit) b = _mm_xor_si128(b, pp);
+  }
+}
+#endif
+
+/*
+ * BYTWO_b multiply: c128 = a128 * b128.
+ *
+ * b128 is consumed from its least significant bit upward; a128 is the
+ * running multiplicand, doubled (and reduced with h->prim_poly) after every
+ * consumed bit.  The loop ends as soon as no bits of b128 remain, so the
+ * multiplicand is not doubled after the last bit.
+ */
+void
+gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+  const uint64_t msb = 1ULL << 63;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  uint64_t mult_hi = a128[0], mult_lo = a128[1];
+  uint64_t bits_hi = b128[0], bits_lo = b128[1];
+  uint64_t sum_hi = 0, sum_lo = 0;
+  uint64_t overflow;
+
+  for (;;) {
+    if (bits_lo & 1) {
+      sum_hi ^= mult_hi;
+      sum_lo ^= mult_lo;
+    }
+
+    /* Drop the bit just handled. */
+    bits_lo = (bits_lo >> 1) | (bits_hi << 63);
+    bits_hi >>= 1;
+    if (bits_lo == 0 && bits_hi == 0) break;
+
+    /* multiplicand *= x */
+    overflow = mult_hi & msb;
+    mult_hi = (mult_hi << 1) | (mult_lo >> 63);
+    mult_lo <<= 1;
+    if (overflow) mult_lo ^= h->prim_poly;
+  }
+
+  c128[0] = sum_hi;
+  c128[1] = sum_lo;
+}
+
+/*
+ * Region multiply using 4-bit split tables: every 128-bit word of src is
+ * multiplied by val and stored (or XORed, when xor is nonzero) into dest.
+ *
+ * tables[w][p][n] (w = 0 high word, w = 1 low word of the product) holds
+ * val times nibble value n placed at nibble position p, where p = 0 is the
+ * least significant nibble.  The tables live in h->private and are rebuilt
+ * only when val differs from the cached last_value.  val == 0 and val == 1
+ * are delegated to gf_multby_zero() / gf_multby_one().
+ */
+static
+void
+gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+  const uint64_t msb = 1ULL << 63;
+  gf_internal_t *h;
+  struct gf_w128_split_4_128_data *ld;
+  gf_region_data rd;
+  uint64_t *in, *out, *end;
+  uint64_t cur[2], acc[2], word, carry;
+  int pos, bit, low;
+
+  /* Called for its alignment checks and to obtain the region bounds. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
+
+  if (val[0] == 0 && val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val[0] == 0 && val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  ld = (struct gf_w128_split_4_128_data *) h->private;
+
+  in = (uint64_t *) rd.s_start;
+  out = (uint64_t *) rd.d_start;
+  end = (uint64_t *) rd.d_top;
+
+  if (ld->last_value[0] != val[0] || ld->last_value[1] != val[1]) {
+    cur[0] = val[0];
+    cur[1] = val[1];
+    for (pos = 0; pos < 32; pos++) {
+      ld->tables[0][pos][0] = 0;
+      ld->tables[1][pos][0] = 0;
+      for (bit = 1; bit < 16; bit <<= 1) {
+        /* cur is val shifted (with reduction) to this bit; fill the
+         * entries bit .. 2*bit-1 from the ones already known. */
+        for (low = 0; low < bit; low++) {
+          ld->tables[0][pos][bit | low] = ld->tables[0][pos][low] ^ cur[0];
+          ld->tables[1][pos][bit | low] = ld->tables[1][pos][low] ^ cur[1];
+        }
+        /* cur *= x, reducing when the top bit is shifted out. */
+        carry = cur[0] & msb;
+        cur[0] = (cur[0] << 1) | (cur[1] >> 63);
+        cur[1] <<= 1;
+        if (carry) cur[1] ^= h->prim_poly;
+      }
+    }
+  }
+  ld->last_value[0] = val[0];
+  ld->last_value[1] = val[1];
+
+  for (; out < end; in += 2, out += 2) {
+    acc[0] = xor ? out[0] : 0;
+    acc[1] = xor ? out[1] : 0;
+
+    /* Low source word (index 1) feeds nibble positions 0..15; the walk
+     * stops once no set bits remain. */
+    for (pos = 0, word = in[1]; word != 0; word >>= 4, pos++) {
+      acc[0] ^= ld->tables[0][pos][word & 0xf];
+      acc[1] ^= ld->tables[1][pos][word & 0xf];
+    }
+    /* High source word (index 0) feeds nibble positions 16..31. */
+    for (pos = 16, word = in[0]; word != 0; word >>= 4, pos++) {
+      acc[0] ^= ld->tables[0][pos][word & 0xf];
+      acc[1] ^= ld->tables[1][pos][word & 0xf];
+    }
+
+    out[0] = acc[0];
+    out[1] = acc[1];
+  }
+}
+
+#if defined(INTEL_SSSE3) && defined(INTEL_SSE4)
+/*
+ * SSE variant of the 4-bit split-table region multiply.
+ *
+ * The per-nibble tables are built in h->private exactly as in the scalar
+ * version (and cached by last_value), then copied into a local array of
+ * __m128i so each lookup is a single 128-bit XOR.  The 16-byte-aligned
+ * middle of the region is processed with aligned loads/stores; the
+ * unaligned head and tail are handled by
+ * gf_w128_multiply_region_from_single().
+ */
+static
+void
+gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  int i, j, k;
+  uint64_t pp, v[2], s, *s64, *d64, *top;
+  __m128i p, tables[32][16];
+  struct gf_w128_split_4_128_data *ld;
+  gf_region_data rd;
+
+  if (val[0] == 0) {
+    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+  }
+
+  h = (gf_internal_t *) gf->scratch;
+  
+  /* We only do this to check on alignment. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 16);
+
+  /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */
+
+  gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+ 
+  ld = (struct gf_w128_split_4_128_data *) h->private;
+
+  /* Rebuild the cached tables only when the multiplier changed. */
+  if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
+    v[0] = val[0];
+    v[1] = val[1];
+    for (i = 0; i < 32; i++) {
+      ld->tables[0][i][0] = 0;
+      ld->tables[1][i][0] = 0;
+      for (j = 1; j < 16; j <<= 1) {
+        for (k = 0; k < j; k++) {
+          ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
+          ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
+        }
+        /* v *= x, reducing with prim_poly when the top bit is shifted out. */
+        pp = (v[0] & (1ULL << 63));
+        v[0] <<= 1;
+        if (v[1] & (1ULL << 63)) v[0] ^= 1;
+        v[1] <<= 1;
+        if (pp) v[1] ^= h->prim_poly;
+      }
+    }
+  }
+
+  ld->last_value[0] = val[0];
+  ld->last_value[1] = val[1];
+
+  /* Pack each table entry as {tables[0], tables[1]} into one register. */
+  for (i = 0; i < 32; i++) {
+    for (j = 0; j < 16; j++) {
+      v[0] = ld->tables[0][i][j];
+      v[1] = ld->tables[1][i][j];
+      tables[i][j] = _mm_loadu_si128((__m128i *) v);
+
+/*
+      printf("%2d %2d: ", i, j);
+      MM_PRINT8("", tables[i][j]); */
+    }
+  }
+
+  while (d64 != top) {
+
+    if (xor) {
+      p = _mm_load_si128 ((__m128i *) d64);
+    } else {
+      p = _mm_setzero_si128();
+    }
+    /* First source word (index 0, the high word) uses tables 16..31. */
+    s = *s64;
+    s64++;
+    for (i = 0; i < 16; i++) {
+      j = (s&0xf);
+      s >>= 4;
+      p = _mm_xor_si128(p, tables[16+i][j]);
+    }
+    /* Second source word (index 1, the low word) uses tables 0..15. */
+    s = *s64;
+    s64++;
+    for (i = 0; i < 16; i++) {
+      j = (s&0xf);
+      s >>= 4;
+      p = _mm_xor_si128(p, tables[i][j]);
+    }
+    _mm_store_si128((__m128i *) d64, p);
+    d64 += 2;
+  }
+
+  /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */
+
+  gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
+}
+#endif
+
+#if defined(INTEL_SSSE3) && defined(INTEL_SSE4)
+/*
+ * ALTMAP + SSSE3 variant of the 4-bit split-table region multiply.
+ *
+ * The aligned middle of the region is processed in groups of 16
+ * __m128i registers (256 bytes).  For each of the 32 nibble positions the
+ * cached 128-bit table is split into 16 byte-wide shuffle tables
+ * (tables[position][byte]), so one _mm_shuffle_epi8 looks up one product
+ * byte for 16 source nibbles at once.  Head and tail outside the aligned
+ * middle are handled by gf_w128_multiply_region_from_single().
+ *
+ * The byte tables are extracted from ld->tables WITHOUT modifying them.
+ * The cache is keyed on ld->last_value; shifting the cached words in place
+ * would leave them zeroed while still marked valid, so a second call with
+ * the same val would multiply by all-zero tables.
+ */
+static
+void
+gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  int i, j, k;
+  uint64_t pp, v[2], *s64, *d64, *top;
+  __m128i si, tables[32][16], p[16], v0, mask1;
+  struct gf_w128_split_4_128_data *ld;
+  uint8_t btable[16];
+  gf_region_data rd;
+
+  if (val[0] == 0) {
+    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+  }
+
+  h = (gf_internal_t *) gf->scratch;
+
+  /* We only do this to check on alignment. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 256);
+
+  /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */
+
+  gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+
+  ld = (struct gf_w128_split_4_128_data *) h->private;
+
+  /* Rebuild the cached 64-bit tables only when the multiplier changed. */
+  if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
+    v[0] = val[0];
+    v[1] = val[1];
+    for (i = 0; i < 32; i++) {
+      ld->tables[0][i][0] = 0;
+      ld->tables[1][i][0] = 0;
+      for (j = 1; j < 16; j <<= 1) {
+        for (k = 0; k < j; k++) {
+          ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
+          ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
+        }
+        /* v *= x, reducing with prim_poly when the top bit is shifted out. */
+        pp = (v[0] & (1ULL << 63));
+        v[0] <<= 1;
+        if (v[1] & (1ULL << 63)) v[0] ^= 1;
+        v[1] <<= 1;
+        if (pp) v[1] ^= h->prim_poly;
+      }
+    }
+  }
+
+  ld->last_value[0] = val[0];
+  ld->last_value[1] = val[1];
+
+  /* Byte j of the 128-bit product: bytes 0..7 come from the low word
+   * (tables[1]), bytes 8..15 from the high word (tables[0]).  The cached
+   * words are only read here, never shifted in place. */
+  for (i = 0; i < 32; i++) {
+    for (j = 0; j < 16; j++) {
+      for (k = 0; k < 16; k++) {
+        btable[k] = (uint8_t) (ld->tables[1-(j/8)][i][k] >> (8 * (j % 8)));
+      }
+      tables[i][j] = _mm_loadu_si128((__m128i *) btable);
+    }
+  }
+
+  mask1 = _mm_set1_epi8(0xf);
+
+  while (d64 != top) {
+
+    if (xor) {
+      for (i = 0; i < 16; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2));
+    } else {
+      for (i = 0; i < 16; i++) p[i] = _mm_setzero_si128();
+    }
+    i = 0;
+    for (k = 0; k < 16; k++) {
+      v0 = _mm_load_si128((__m128i *) s64);
+      s64 += 2;
+
+      /* Low nibble of every source byte -> nibble position i. */
+      si = _mm_and_si128(v0, mask1);
+
+      for (j = 0; j < 16; j++) {
+        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
+      }
+      i++;
+      /* High nibble of every source byte -> nibble position i + 1. */
+      v0 = _mm_srli_epi32(v0, 4);
+      si = _mm_and_si128(v0, mask1);
+      for (j = 0; j < 16; j++) {
+        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
+      }
+      i++;
+    }
+    for (i = 0; i < 16; i++) {
+      _mm_store_si128((__m128i *) d64, p[i]);
+      d64 += 2;
+    }
+  }
+  /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */
+
+  gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
+}
+#endif
+
+/*
+ * Region multiply using 8-bit split tables: every 128-bit word of src is
+ * multiplied by val and stored (or XORed, when xor is nonzero) into dest.
+ *
+ * tables[w][p][n] (w = 0 high word, w = 1 low word of the product) holds
+ * val times byte value n placed at byte position p, where p = 0 is the
+ * least significant byte.  The tables live in h->private and are rebuilt
+ * only when val differs from the cached last_value.  val == 0 and val == 1
+ * are delegated to gf_multby_zero() / gf_multby_one().
+ */
+static
+void
+gf_w128_split_8_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+  const uint64_t msb = 1ULL << 63;
+  gf_internal_t *h;
+  struct gf_w128_split_8_128_data *ld;
+  gf_region_data rd;
+  uint64_t *in, *out, *end;
+  uint64_t cur[2], acc[2], word, carry;
+  int pos, bit, low;
+
+  /* Check on alignment. Ignore it otherwise. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
+
+  if (val[0] == 0 && val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val[0] == 0 && val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  ld = (struct gf_w128_split_8_128_data *) h->private;
+
+  in = (uint64_t *) rd.s_start;
+  out = (uint64_t *) rd.d_start;
+  end = (uint64_t *) rd.d_top;
+
+  if (ld->last_value[0] != val[0] || ld->last_value[1] != val[1]) {
+    cur[0] = val[0];
+    cur[1] = val[1];
+    for (pos = 0; pos < 16; pos++) {
+      ld->tables[0][pos][0] = 0;
+      ld->tables[1][pos][0] = 0;
+      for (bit = 1; bit < (1 << 8); bit <<= 1) {
+        /* Fill entries bit .. 2*bit-1 from the ones already known. */
+        for (low = 0; low < bit; low++) {
+          ld->tables[0][pos][bit | low] = ld->tables[0][pos][low] ^ cur[0];
+          ld->tables[1][pos][bit | low] = ld->tables[1][pos][low] ^ cur[1];
+        }
+        /* cur *= x, reducing when the top bit is shifted out. */
+        carry = cur[0] & msb;
+        cur[0] = (cur[0] << 1) | (cur[1] >> 63);
+        cur[1] <<= 1;
+        if (carry) cur[1] ^= h->prim_poly;
+      }
+    }
+  }
+  ld->last_value[0] = val[0];
+  ld->last_value[1] = val[1];
+
+  for (; out < end; in += 2, out += 2) {
+    acc[0] = xor ? out[0] : 0;
+    acc[1] = xor ? out[1] : 0;
+
+    /* Low source word (index 1) feeds byte positions 0..7; the walk stops
+     * once no set bits remain. */
+    for (pos = 0, word = in[1]; word != 0; word >>= 8, pos++) {
+      acc[0] ^= ld->tables[0][pos][word & 0xff];
+      acc[1] ^= ld->tables[1][pos][word & 0xff];
+    }
+    /* High source word (index 0) feeds byte positions 8..15. */
+    for (pos = 8, word = in[0]; word != 0; word >>= 8, pos++) {
+      acc[0] ^= ld->tables[0][pos][word & 0xff];
+      acc[1] ^= ld->tables[1][pos][word & 0xff];
+    }
+
+    out[0] = acc[0];
+    out[1] = acc[1];
+  }
+}
+
+/*
+ * BYTWO_b region multiply: every 128-bit word of src is multiplied by val
+ * and stored (or XORed, when xor is nonzero) into dest.
+ *
+ * Per word, val (copied to a[]) is consumed from its least significant bit
+ * while the source word (b[]) is doubled and reduced with h->prim_poly.
+ * The work is split in two phases: while the high word a[0] is nonzero the
+ * full 128-bit multiplier is shifted; afterwards only a[1] is shifted and
+ * the loop stops as soon as it reaches zero.  val == 0 and val == 1 are
+ * handled up front by gf_multby_zero() / gf_multby_one().
+ */
+void
+gf_w128_bytwo_b_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+  uint64_t bmask, pp;
+  gf_internal_t *h;
+  uint64_t a[2], c[2], b[2], *s64, *d64, *top;
+  gf_region_data rd;
+
+  /* We only do this to check on alignment. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
+
+  if (val[0] == 0) {
+    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+  }
+    
+  h = (gf_internal_t *) gf->scratch;
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+  bmask = (1ULL << 63);
+
+  while (d64 < top) {
+    set_zero(c, 0);
+    b[0] = s64[0];
+    b[1] = s64[1];
+    a[0] = val[0];
+    a[1] = val[1];
+
+    /* Phase 1: the multiplier still has bits in its high word. */
+    while (a[0] != 0) {
+      if (a[1] & 1) {
+        c[0] ^= b[0];
+        c[1] ^= b[1];
+      }
+      /* a >>= 1 across both words. */
+      a[1] >>= 1;
+      if (a[0] & 1) a[1] ^= bmask;
+      a[0] >>= 1;
+      /* b *= x, reducing when the top bit is shifted out. */
+      pp = (b[0] & bmask);
+      b[0] <<= 1;
+      if (b[1] & bmask) b[0] ^= 1;    
+      b[1] <<= 1;
+      if (pp) b[1] ^= h->prim_poly;
+    }
+    /* Phase 2: only a[1] is left; exit before doubling b a final time. */
+    while (1) {
+      if (a[1] & 1) {
+        c[0] ^= b[0];
+        c[1] ^= b[1];
+      }
+      a[1] >>= 1;
+      if (a[1] == 0) break;
+      pp = (b[0] & bmask);
+      b[0] <<= 1;
+      if (b[1] & bmask) b[0] ^= 1;    
+      b[1] <<= 1;
+      if (pp) b[1] ^= h->prim_poly;
+    }
+    if (xor) {
+      d64[0] ^= c[0];
+      d64[1] ^= c[1];
+    } else {
+      d64[0] = c[0];
+      d64[1] = c[1];
+    }
+    s64 += 2;
+    d64 += 2;
+  }
+}
+
+/*
+ * (Re)build the GROUP multiplication table gt->m_table for multiplier b128.
+ *
+ * Entry k occupies m_table[2k] (high word) and m_table[2k+1] (low word).
+ * Entry 0 is zero and entry 1 is b128.  Each power-of-two entry i is the
+ * previous power-of-two entry doubled and reduced with prim_poly, and every
+ * entry i+j (0 <= j < i) is entry i XOR entry j.  The table covers
+ * 2^g_m entries, where g_m is scratch->arg1.
+ */
+static
+void gf_w128_group_m_init(gf_t *gf, gf_val_128_t b128)
+{
+  int i, j;
+  int g_m;
+  uint64_t prim_poly, lbit;
+  gf_internal_t *scratch;
+  gf_group_tables_t *gt;
+  uint64_t a128[2];
+  scratch = (gf_internal_t *) gf->scratch;
+  gt = scratch->private;
+  g_m = scratch->arg1;
+  prim_poly = scratch->prim_poly;
+
+
+  /* Entry 0 = 0, entry 1 = b128. */
+  set_zero(gt->m_table, 0);
+  a_get_b(gt->m_table, 2, b128, 0);
+  lbit = 1;
+  lbit <<= 63;
+
+  for (i = 2; i < (1 << g_m); i <<= 1) {
+    /* Entry i = 2 * entry (i/2); two_x() is a macro defined outside this
+     * view and presumably shifts the two-word value left by one bit. */
+    a_get_b(a128, 0, gt->m_table, 2 * (i >> 1));
+    two_x(a128);
+    a_get_b(gt->m_table, 2 * i, a128, 0);
+    /* Reduce if the top bit of entry (i/2) was shifted out. */
+    if (gt->m_table[2 * (i >> 1)] & lbit) gt->m_table[(2 * i) + 1] ^= prim_poly;
+    /* Entries i+1 .. 2i-1 by linearity (j = 0 rewrites entry i unchanged). */
+    for (j = 0; j < i; j++) {
+      gt->m_table[(2 * i) + (2 * j)] = gt->m_table[(2 * i)] ^ gt->m_table[(2 * j)];
+      gt->m_table[(2 * i) + (2 * j) + 1] = gt->m_table[(2 * i) + 1] ^ gt->m_table[(2 * j) + 1];
+    }
+  }
+  return;
+}
+
+/*
+ * GROUP multiply: c128 = a128 * b128.
+ *
+ * a128 is consumed g_m bits at a time (g_m = scratch->arg1), most
+ * significant group first, word [0] before word [1].  For each group the
+ * running product is shifted left by g_m bits and the matching m_table
+ * entry (a multiple of b128) is XORed in.  Bits shifted out of the top are
+ * collected in i_r; once g_r bits (g_r = scratch->arg2) have been
+ * gathered, the r_table entry for them is XORed into the low word.
+ * m_table is rebuilt first if it was built for a different b128.
+ */
+void
+gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+  gf_internal_t *scratch = (gf_internal_t *) gf->scratch;
+  gf_group_tables_t *gt = scratch->private;
+  int g_m = scratch->arg1;
+  int g_r = scratch->arg2;
+  int mask_m = (1 << g_m) - 1;
+  int mask_r = (1 << g_r) - 1;
+  int i_m;
+  int i_r = 0;       /* index into r_table being accumulated */
+  int t_m = 0;       /* number of bits gathered in i_r so far */
+  int half, grp;
+  uint64_t p_i[2] = { 0, 0 };
+  uint64_t a[2];
+
+  if (b128[0] != gt->m_table[2] || b128[1] != gt->m_table[3]) {
+    gf_w128_group_m_init(gf, b128);
+  }
+
+  a[0] = a128[0];
+  a[1] = a128[1];
+
+  /* half 0 is the top 64 bits, half 1 the bottom 64 bits. */
+  for (half = 0; half < 2; half++) {
+    for (grp = ((GF_FIELD_WIDTH / 2) / g_m) - 1; grp >= 0; grp--) {
+      i_m = (a[half] >> (grp * g_m)) & mask_m;
+      i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
+
+      /* p_i <<= g_m across both words. */
+      p_i[0] <<= g_m;
+      p_i[0] ^= (p_i[1] >> (64-g_m));
+      p_i[1] <<= g_m;
+
+      p_i[0] ^= gt->m_table[2 * i_m];
+      p_i[1] ^= gt->m_table[(2 * i_m) + 1];
+
+      t_m += g_m;
+      if (t_m == g_r) {
+        p_i[1] ^= gt->r_table[i_r];
+        t_m = 0;
+        i_r = 0;
+      } else {
+        i_r <<= g_m;
+      }
+    }
+  }
+
+  c128[0] = p_i[0];
+  c128[1] = p_i[1];
+}
+
+/*
+ * GROUP region multiply: every 128-bit word of src is multiplied by val and
+ * stored (or XORed, when xor is nonzero) into dest.
+ *
+ * The per-word algorithm is the same as in gf_w128_group_multiply(), with
+ * val playing the role of the table multiplier.  m_table is rebuilt once,
+ * before the loop, if it was built for a different value.  Note that the
+ * loop starts at src/dest themselves and only uses rd.d_top as its bound.
+ * val == 0 and val == 1 are handled by gf_multby_zero() / gf_multby_one().
+ */
+static
+void
+gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+  int i;
+  int i_r, i_m, t_m;
+  int mask_m, mask_r;
+  int g_m, g_r;
+  uint64_t p_i[2], a[2];
+  gf_internal_t *scratch;
+  gf_group_tables_t *gt;
+  gf_region_data rd;
+  uint64_t *a128, *c128, *top;
+
+  /* We only do this to check on alignment. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
+      
+  if (val[0] == 0) {
+    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+  }
+    
+  scratch = (gf_internal_t *) gf->scratch;
+  gt = scratch->private;
+  g_m = scratch->arg1;
+  g_r = scratch->arg2;
+
+  mask_m = (1 << g_m) - 1;
+  mask_r = (1 << g_r) - 1;
+
+  /* m_table entry 1 (words 2 and 3) is the value the table was built for. */
+  if (val[0] != gt->m_table[2] || val[1] != gt->m_table[3]) {
+    gf_w128_group_m_init(gf, val);
+  }
+
+  a128 = (uint64_t *) src;
+  c128 = (uint64_t *) dest;
+  top = (uint64_t *) rd.d_top;
+
+  while (c128 < top) {
+    p_i[0] = 0;
+    p_i[1] = 0;
+    a[0] = a128[0];
+    a[1] = a128[1];
+  
+    /* t_m counts bits gathered in i_r, the pending r_table index. */
+    t_m = 0;
+    i_r = 0;
+  
+    /* Top 64 bits */
+    for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
+      i_m = (a[0] >> (i * g_m)) & mask_m;
+      i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
+      p_i[0] <<= g_m;
+      p_i[0] ^= (p_i[1] >> (64-g_m));
+      p_i[1] <<= g_m;
+      
+      p_i[0] ^= gt->m_table[2 * i_m];
+      p_i[1] ^= gt->m_table[(2 * i_m) + 1];
+      t_m += g_m;
+      if (t_m == g_r) {
+        p_i[1] ^= gt->r_table[i_r];
+        t_m = 0;
+        i_r = 0;
+      } else {
+        i_r <<= g_m;
+      }
+    }
+    /* Bottom 64 bits: same steps, reading groups from a[1]. */
+    for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
+      i_m = (a[1] >> (i * g_m)) & mask_m;
+      i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
+      p_i[0] <<= g_m;
+      p_i[0] ^= (p_i[1] >> (64-g_m));
+      p_i[1] <<= g_m;
+      p_i[0] ^= gt->m_table[2 * i_m];
+      p_i[1] ^= gt->m_table[(2 * i_m) + 1];
+      t_m += g_m;
+      if (t_m == g_r) {
+        p_i[1] ^= gt->r_table[i_r];
+        t_m = 0;
+        i_r = 0;
+      } else {
+        i_r <<= g_m;
+      }
+    }
+  
+    if (xor) {
+      c128[0] ^= p_i[0];
+      c128[1] ^= p_i[1];
+    } else {
+      c128[0] = p_i[0];
+      c128[1] = p_i[1];
+    }
+    a128 += 2;
+    c128 += 2;
+  }
+}
+
+/* a^-1 -> b
+ *
+ * Inverse by the extended Euclidean algorithm on polynomials over GF(2).
+ * e_im1/e_i/e_ip1 are successive remainders (starting from the primitive
+ * polynomial, whose leading x^128 term is implicit, and a128), d_* are
+ * their degrees, y_* the matching cofactors and c_i the current quotient.
+ * The loop stops when the remainder is 1; the cofactor y_i is then the
+ * inverse.
+ *
+ * NOTE: when a128 is zero the function returns without writing b128.
+ */
+void
+gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
+{
+  uint64_t e_i[2], e_im1[2], e_ip1[2];
+  uint64_t d_i, d_im1, d_ip1;
+  uint64_t y_i[2], y_im1[2], y_ip1[2];
+  uint64_t c_i[2];
+  uint64_t *b;
+  uint64_t one = 1;
+
+  /* This needs to return some sort of error (in b128?) */
+  if (a128[0] == 0 && a128[1] == 0) return;
+
+  b = (uint64_t *) b128;
+
+  e_im1[0] = 0;
+  e_im1[1] = ((gf_internal_t *) (gf->scratch))->prim_poly;
+  e_i[0] = a128[0];
+  e_i[1] = a128[1];
+  d_im1 = 128;
+
+  /* Allen: d_i starts at 63 here and scans the high word of a from its MSB
+   * downward, looking for the first nonzero bit.  It ends at the position
+   * (counted from the right) of the highest set bit of this word, or at 0
+   * if none was found above bit 0.  If d_i is 0 at the end we do not know
+   * yet whether the rightmost bit of this word is 1 or not. */
+
+  for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[0]) == 0 && d_i > 0; d_i--) ;
+
+  /* Allen: this re-tests just the first half of the stop condition above,
+   * so if it holds we know no nonzero bit was found in the high word. */
+
+  if (!((one << d_i) & e_i[0])) {
+
+    /* Allen: same search on the low word of a.  No test for d_i reaching
+     * zero is needed because a == 0 has already been excluded. */
+
+    for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[1]) == 0; d_i--) ;
+
+  } else {
+
+    /* Allen: a 1 was found in the more significant word, so make d_i the
+     * actual index of the highest nonzero bit in the whole of a. */
+
+    d_i += 64;
+  }
+  y_i[0] = 0;
+  y_i[1] = 1;
+  y_im1[0] = 0;
+  y_im1[1] = 0;
+
+  while (!(e_i[0] == 0 && e_i[1] == 1)) {
+
+    e_ip1[0] = e_im1[0];
+    e_ip1[1] = e_im1[1];
+    d_ip1 = d_im1;
+    c_i[0] = 0;
+    c_i[1] = 0;
+
+    /* Polynomial long division of e_im1 by e_i: quotient bits go to c_i,
+     * the remainder is left in e_ip1 with degree d_ip1. */
+    while (d_ip1 >= d_i) {
+      if ((d_ip1 - d_i) >= 64) {
+        c_i[0] ^= (one << ((d_ip1 - d_i) - 64));
+        e_ip1[0] ^= (e_i[1] << ((d_ip1 - d_i) - 64));
+      } else {
+        c_i[1] ^= (one << (d_ip1 - d_i));
+        e_ip1[0] ^= (e_i[0] << (d_ip1 - d_i));
+        /* Guarded because a right shift by 64 would be undefined. */
+        if (d_ip1 - d_i > 0) e_ip1[0] ^= (e_i[1] >> (64 - (d_ip1 - d_i)));
+        e_ip1[1] ^= (e_i[1] << (d_ip1 - d_i));
+      }
+      d_ip1--;
+      /* Zero remainder: give up and report zero. */
+      if (e_ip1[0] == 0 && e_ip1[1] == 0) { b[0] = 0; b[1] = 0; return; }
+      /* Lower d_ip1 to the degree of the new remainder. */
+      while (d_ip1 >= 64 && (e_ip1[0] & (one << (d_ip1 - 64))) == 0) d_ip1--;
+      while (d_ip1 <  64 && (e_ip1[1] & (one << d_ip1)) == 0) d_ip1--;
+    }
+    /* y_ip1 = c_i * y_i + y_im1 */
+    gf->multiply.w128(gf, c_i, y_i, y_ip1);
+    y_ip1[0] ^= y_im1[0];
+    y_ip1[1] ^= y_im1[1];
+
+    y_im1[0] = y_i[0];
+    y_im1[1] = y_i[1];
+
+    y_i[0] = y_ip1[0];
+    y_i[1] = y_ip1[1];
+
+    e_im1[0] = e_i[0];
+    e_im1[1] = e_i[1];
+    d_im1 = d_i;
+    e_i[0] = e_ip1[0];
+    e_i[1] = e_ip1[1];
+    d_i = d_ip1;
+  }
+
+  b[0] = y_i[0];
+  b[1] = y_i[1];
+  return;
+}
+
+/*
+ * c128 = a128 / b128, computed as a128 * inverse(b128) with the inverse and
+ * multiply routines installed in gf.
+ *
+ * The temporary is zero-initialized because an inverse routine may return
+ * without writing its output (gf_w128_euclid does so for a zero argument).
+ * Without the initializer a division by zero would feed indeterminate stack
+ * contents to the multiply; with it the result is the product with zero.
+ */
+void
+gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
+{
+  uint64_t d[2] = { 0, 0 };
+
+  gf->inverse.w128(gf, b128, d);
+  gf->multiply.w128(gf, a128, d, c128);
+  return;
+}
+
+/*
+ * b128 = 1 / a128, computed with the divide routine installed in gf.
+ * The field element one is {0, 1}: high word zero, low word one.
+ */
+void
+gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
+{
+  uint64_t unity[2] = { 0, 1 };
+
+  gf->divide.w128(gf, unity, a128, b128);
+}
+
+
+/*
+ * Inverse in the composite field: inv = a^-1.
+ *
+ * An element is a1*x + a0 over the 64-bit base field, stored with a1 in
+ * word [0] and a0 in word [1]; as in gf_w128_composite_multiply(), x^2 is
+ * replaced by s*x + 1 with s = h->prim_poly.  Three cases are handled:
+ *   a0 == 0:  inverse is a1^-1 * x + a1^-1 * s
+ *   a1 == 0:  inverse is a0^-1
+ *   otherwise the general formula below.
+ * The product a1 * a0^-1 is computed once and reused.
+ */
+static
+void
+gf_w128_composite_inverse(gf_t *gf, gf_val_128_t a, gf_val_128_t inv)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint64_t a0 = a[1];
+  uint64_t a1 = a[0];
+  uint64_t c0, c1, d, tmp;
+  uint64_t a0inv, a1inv;
+
+  if (a0 == 0) {
+    a1inv = base_gf->inverse.w64(base_gf, a1);
+    c0 = base_gf->multiply.w64(base_gf, a1inv, h->prim_poly);
+    c1 = a1inv;
+  } else if (a1 == 0) {
+    c0 = base_gf->inverse.w64(base_gf, a0);
+    c1 = 0;
+  } else {
+    a1inv = base_gf->inverse.w64(base_gf, a1);
+    a0inv = base_gf->inverse.w64(base_gf, a0);
+
+    /* d = a1/a0; tmp = (a1/a0 + a0/a1 + s)^-1 */
+    d = base_gf->multiply.w64(base_gf, a1, a0inv);
+
+    tmp = (d ^ base_gf->multiply.w64(base_gf, a0, a1inv) ^ h->prim_poly);
+    tmp = base_gf->inverse.w64(base_gf, tmp);
+
+    d = base_gf->multiply.w64(base_gf, d, tmp);
+
+    c0 = base_gf->multiply.w64(base_gf, (d^1), a0inv);
+    c1 = base_gf->multiply.w64(base_gf, d, a1inv);
+  }
+  inv[0] = c1;
+  inv[1] = c0;
+}
+
+/*
+ * Composite-field multiply: rv = a * b.
+ *
+ * Operands are a1*x + a0 and b1*x + b0 over the 64-bit base field (word
+ * [0] holds the x coefficient, word [1] the constant).  With s =
+ * h->prim_poly the product is
+ *   constant term:  a0*b0 + a1*b1
+ *   x coefficient:  a1*b0 + a0*b1 + a1*b1*s
+ * All four input words are read before rv is written.
+ */
+static
+void
+gf_w128_composite_multiply(gf_t *gf, gf_val_128_t a, gf_val_128_t b, gf_val_128_t rv)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base = h->base_gf;
+  uint64_t a_hi = a[0], a_lo = a[1];
+  uint64_t b_hi = b[0], b_lo = b[1];
+  uint64_t hi_hi;
+
+  hi_hi = base->multiply.w64(base, a_hi, b_hi);
+
+  rv[1] = base->multiply.w64(base, a_lo, b_lo) ^ hi_hi;
+  rv[0] = base->multiply.w64(base, a_hi, b_lo) ^
+          base->multiply.w64(base, a_lo, b_hi) ^
+          base->multiply.w64(base, hi_hi, h->prim_poly);
+}
+
+/*
+ * Composite-field region multiply: every 128-bit word of src is multiplied
+ * by val with the same formula as gf_w128_composite_multiply() and stored
+ * (or XORed, when xor is nonzero) into dest.  A zero val is delegated to
+ * gf_multby_zero().
+ */
+static
+void
+gf_w128_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint64_t b0 = val[1];
+  uint64_t b1 = val[0];
+  uint64_t *s64, *d64, *top;
+  uint64_t a0, a1, a1b1;
+  uint64_t lo, hi;
+  gf_region_data rd;
+
+  if (val[0] == 0 && val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
+
+  s64 = rd.s_start;
+  d64 = rd.d_start;
+  top = rd.d_top;
+
+  for (; d64 < top; s64 += 2, d64 += 2) {
+    /* Read the source word fully before touching the destination. */
+    a1 = s64[0];
+    a0 = s64[1];
+    a1b1 = base_gf->multiply.w64(base_gf, a1, b1);
+
+    lo = base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1;
+    hi = base_gf->multiply.w64(base_gf, a1, b0) ^
+         base_gf->multiply.w64(base_gf, a0, b1) ^
+         base_gf->multiply.w64(base_gf, a1b1, h->prim_poly);
+
+    if (xor) {
+      d64[1] ^= lo;
+      d64[0] ^= hi;
+    } else {
+      d64[1] = lo;
+      d64[0] = hi;
+    }
+  }
+}
+
+/*
+ * ALTMAP composite-field region multiply.
+ *
+ * The aligned middle of the region is treated as two equal sub-regions,
+ * "low" followed by "high", and the product is assembled from five
+ * base-field region multiplies:
+ *   dlow   = slow*val0  (+)  shigh*val1
+ *   dhigh  = slow*val1  (+)  shigh*val0  (+)  shigh*(prim_poly*val1)
+ * Only the first multiply into each destination half honours the caller's
+ * xor flag; the following ones always accumulate.  Head and tail outside
+ * the aligned middle go through gf_w128_multiply_region_from_single().
+ */
+static
+void
+gf_w128_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  gf_val_64_t val0 = val[1];
+  gf_val_64_t val1 = val[0];
+  uint8_t *src_lo, *src_hi;
+  uint8_t *dst_lo, *dst_hi, *dst_end;
+  int half_bytes;
+  gf_region_data rd;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 64);
+  gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);
+
+  src_lo = (uint8_t *) rd.s_start;
+  dst_lo = (uint8_t *) rd.d_start;
+  dst_end = (uint8_t*) rd.d_top;
+  half_bytes = (dst_end - dst_lo)/2;
+  src_hi = src_lo + half_bytes;
+  dst_hi = dst_lo + half_bytes;
+
+  /* Low destination half. */
+  base_gf->multiply_region.w64(base_gf, src_lo, dst_lo, val0, half_bytes, xor);
+  base_gf->multiply_region.w64(base_gf, src_hi, dst_lo, val1, half_bytes, 1);
+
+  /* High destination half. */
+  base_gf->multiply_region.w64(base_gf, src_lo, dst_hi, val1, half_bytes, xor);
+  base_gf->multiply_region.w64(base_gf, src_hi, dst_hi, val0, half_bytes, 1);
+  base_gf->multiply_region.w64(base_gf, src_hi, dst_hi,
+                               base_gf->multiply.w64(base_gf, h->prim_poly, val1),
+                               half_bytes, 1);
+
+  gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
+}
+
+
+/*
+ * Install the composite-field routines in gf.  The region multiply is the
+ * ALTMAP variant when GF_REGION_ALTMAP is set in h->region_type and the
+ * standard one otherwise; divide is derived from the inverse.
+ * Always returns 1.
+ */
+  static
+int gf_w128_composite_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  if (h->region_type & GF_REGION_ALTMAP) {
+    SET_FUNCTION(gf,multiply_region,w128,gf_w128_composite_multiply_region_alt)   
+  } else {
+    SET_FUNCTION(gf,multiply_region,w128,gf_w128_composite_multiply_region)
+  }
+
+  SET_FUNCTION(gf,multiply,w128,gf_w128_composite_multiply)
+  SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse)
+  SET_FUNCTION(gf,inverse,w128,gf_w128_composite_inverse)
+
+  return 1;
+}
+
+/*
+ * Install the carry-free (PCLMUL) implementations for w=128.
+ * Returns 1 when they were installed, 0 when PCLMUL support is either not
+ * compiled in or not reported by the CPU.
+ */
+static
+int gf_w128_cfm_init(gf_t *gf)
+{
+  int installed = 0;
+
+#if defined(INTEL_SSE4_PCLMUL)
+  if (gf_cpu_supports_intel_pclmul) {
+    SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
+    SET_FUNCTION(gf,multiply_region,w128,gf_w128_clm_multiply_region_from_single)
+    SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
+    installed = 1;
+  }
+#endif
+
+  return installed;
+}
+
+/*
+ * Install the shift-based implementations for w=128: shift multiply,
+ * Euclid inverse, and a region multiply built on the single multiply.
+ * Always returns 1.
+ */
+static
+int gf_w128_shift_init(gf_t *gf)
+{
+  SET_FUNCTION(gf,multiply_region,w128,gf_w128_multiply_region_from_single)
+  SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
+  SET_FUNCTION(gf,multiply,w128,gf_w128_shift_multiply)
+
+  return 1;
+}
+
+/*
+ * Install the BYTWO implementations for w=128.  The single multiply is the
+ * "p" or "b" variant according to mult_type; the inverse is Euclid and the
+ * region multiply is always the bytwo_b region routine.  Always returns 1.
+ */
+static
+int gf_w128_bytwo_init(gf_t *gf)
+{
+  gf_internal_t *scratch = (gf_internal_t *) gf->scratch;
+
+  SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
+  SET_FUNCTION(gf,multiply_region,w128,gf_w128_bytwo_b_multiply_region)
+
+  if (scratch->mult_type != GF_MULT_BYTWO_p) {
+    /* Ben: gf_w128_sse_bytwo_b_multiply exists but is slower, so it is
+       not installed. */
+    SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_b_multiply)
+  } else {
+    /* John: gf_w128_sse_bytwo_p_multiply exists but is slower, so it is
+       not installed. */
+    SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_p_multiply)
+  }
+
+  return 1;
+}
+
+/*
+ * Build the reduction table used by the GROUP method.  Entry i, for every
+ * i below 2^g_r (g_r is arg2), is the XOR of (prim_poly << j) over each
+ * bit j that is set in i.
+ *
+ * Because the prim poly is only 8 bits and g_r is limited to 16, the high
+ * 64 bits are never needed, so each entry is a single 64-bit word.
+ */
+static
+void gf_w128_group_r_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_group_tables_t *tables = h->private;
+  int bits = h->arg2;
+  uint64_t poly = h->prim_poly;
+  uint64_t entry;
+  int idx, bit;
+
+  for (idx = 0; idx < (1 << bits); idx++) {
+    entry = 0;
+    for (bit = 0; bit < bits; bit++) {
+      if (idx & (1 << bit)) entry ^= (poly << bit);
+    }
+    tables->r_table[idx] = entry;
+  }
+}
+
+/*
+ * SSE variant of gf_w128_group_r_init().  It is compiled out by the
+ * "#if 0" below (the original INTEL_SSE4 guard is left in the trailing
+ * comment).  It fills the same table, but treats r_table as an array of
+ * __m128i entries with the shifted polynomial placed in 64-bit lane 0.
+ */
+#if 0 // defined(INTEL_SSE4)
+  static
+void gf_w128_group_r_sse_init(gf_t *gf)
+{
+  int i, j;
+  int g_r;
+  uint64_t pp;
+  gf_internal_t *scratch;
+  gf_group_tables_t *gt;
+  scratch = (gf_internal_t *) gf->scratch;
+  gt = scratch->private;
+  __m128i zero = _mm_setzero_si128();
+  __m128i *table = (__m128i *)(gt->r_table);
+  g_r = scratch->arg2;
+  pp = scratch->prim_poly;
+  table[0] = zero;
+  for (i = 1; i < (1 << g_r); i++) {
+    table[i] = zero;
+    /* XOR in (pp << j) for every bit j set in i. */
+    for (j = 0; j < g_r; j++) {
+      if (i & (1 << j)) {
+        table[i] = _mm_xor_si128(table[i], _mm_insert_epi64(zero, pp << j, 0));
+      }
+    }
+  }
+  return;
+}
+#endif
+
+/*
+ * Install the SPLIT_TABLE (and DEFAULT) implementations for w=128.
+ *
+ * The single multiply is bytwo_p, replaced by the carry-free multiply when
+ * PCLMUL is compiled in, reported by the CPU, and GF_REGION_NOSIMD is not
+ * set.  The inverse is Euclid.  The region multiply is the 8/128 split
+ * routine or one of the 4/128 split routines, chosen below.
+ *
+ * Returns 1 on success.  Returns 0 when a 4/128 split with
+ * GF_REGION_ALTMAP is requested but the SSE4 altmap routine cannot be
+ * used.
+ */
+  static 
+int gf_w128_split_init(gf_t *gf)
+{
+  struct gf_w128_split_4_128_data *sd4;
+  struct gf_w128_split_8_128_data *sd8;
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+
+  SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_p_multiply)
+#if defined(INTEL_SSE4_PCLMUL)
+  if (gf_cpu_supports_intel_pclmul && !(h->region_type & GF_REGION_NOSIMD)){
+    SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
+  }
+#endif
+
+  SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
+
+  /* 8/128 tables are used when neither argument is 4, or for the default
+     multiplication type; otherwise the 4/128 tables are used. */
+  if ((h->arg1 != 4 && h->arg2 != 4) || h->mult_type == GF_MULT_DEFAULT) {
+    sd8 = (struct gf_w128_split_8_128_data *) h->private;
+    sd8->last_value[0] = 0;
+    sd8->last_value[1] = 0;
+    SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_8_128_multiply_region)
+  } else {
+    sd4 = (struct gf_w128_split_4_128_data *) h->private;
+    sd4->last_value[0] = 0;
+    sd4->last_value[1] = 0;
+    if((h->region_type & GF_REGION_ALTMAP))
+    {
+      /* Note: the "else" inside the #ifdef binds to the "return 0" that
+         follows the #endif.  Without INTEL_SSE4 the return is
+         unconditional. */
+      #ifdef INTEL_SSE4
+        if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
+          SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_altmap_multiply_region)
+        else
+      #endif
+          return 0;
+    }
+    else {
+      /* Same construction: the "else" binds to the non-SSE SET_FUNCTION
+         after the #endif, which is the fallback. */
+      #ifdef INTEL_SSE4
+        if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
+          SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_multiply_region)
+        else
+      #endif
+        SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region)
+    }
+  }
+  return 1;
+}
+
+
+/*
+ * Install the GROUP implementations for w=128 and lay out its tables.
+ *
+ * The r_table starts two pointer-widths into the private area and holds
+ * 2^arg2 64-bit words; the m_table follows it directly.  Words 2 and 3 of
+ * the m_table are cleared here, and the r_table is filled by
+ * gf_w128_group_r_init().  Always returns 1.
+ */
+static
+int gf_w128_group_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_group_tables_t *tables = h->private;
+  int r_entries = (1 << h->arg2);
+
+  tables->r_table = (gf_val_128_t)((uint8_t *)h->private + (2 * sizeof(uint64_t *)));
+  tables->m_table = tables->r_table + r_entries;
+  tables->m_table[2] = 0;
+  tables->m_table[3] = 0;
+
+  SET_FUNCTION(gf,multiply_region,w128,gf_w128_group_multiply_region)
+  SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
+  SET_FUNCTION(gf,multiply,w128,gf_w128_group_multiply)
+
+  gf_w128_group_r_init(gf);
+
+  return 1;
+}
+
+/*
+ * Copy the index-th 128-bit word of a standard-layout region into rv.
+ * Each word occupies two consecutive 64-bit values (16 bytes).  The gf and
+ * bytes arguments are not used.
+ */
+void gf_w128_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)
+{
+  gf_val_128_t word = ((gf_val_128_t) start) + (index * 2);
+
+  memcpy(rv, word, 16);
+}
+
+/*
+ * Extract the index-th 128-bit word from a region stored in the
+ * split-table alternate mapping.
+ *
+ * Words outside the 256-byte-aligned middle of the region are stored
+ * normally and are copied out directly.  Inside it, every 256-byte block
+ * holds 16 words with their bytes interleaved 16 apart: the first eight
+ * strided bytes form rv[1] and the next eight form rv[0], least
+ * significant byte first.
+ */
+static void gf_w128_split_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)
+{
+  gf_region_data rd;
+  uint64_t *words = (uint64_t *) start;
+  uint64_t *wanted;
+  uint64_t *block;
+  uint64_t byte;
+  uint8_t *cursor;
+  int k;
+
+  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 256);
+
+  wanted = words + index*2;
+  if (wanted < (uint64_t *) rd.d_start || wanted >= (uint64_t *) rd.d_top) {
+    memcpy(rv, wanted, 16);
+    return;
+  }
+
+  /* Re-base the index on the aligned region, then find block and slot. */
+  index -= (((uint64_t *) rd.d_start) - words)/2;
+  block = ((uint64_t *) rd.d_start) + ((index/16)*32);
+  index %= 16;
+  cursor = ((uint8_t *) block) + index;
+
+  rv[0] = 0;
+  rv[1] = 0;
+
+  for (k = 0; k < 8; k++, cursor += 16) {
+    byte = *cursor;
+    rv[1] |= (byte << (k*8));
+  }
+
+  for (k = 0; k < 8; k++, cursor += 16) {
+    byte = *cursor;
+    rv[0] |= (byte << (k*8));
+  }
+}
+
+/*
+ * Extract the index-th 128-bit word from a composite-field region stored
+ * in the alternate mapping.
+ *
+ * Words outside the 64-byte-aligned middle of the region are copied out
+ * directly.  Inside it, rv[1] is taken from the first half of the aligned
+ * region and rv[0] from the second half, both through the base field's
+ * extract_word.
+ */
+static
+void gf_w128_composite_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)
+{
+  gf_internal_t *scratch = (gf_internal_t *) gf->scratch;
+  gf_t *base = scratch->base_gf;
+  uint64_t *words = (uint64_t *) start;
+  uint64_t *wanted;
+  uint8_t *region;
+  int half;
+  gf_region_data rd;
+
+  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64);
+
+  wanted = words + index*2;
+  if (wanted < (uint64_t *) rd.d_start || wanted >= (uint64_t *) rd.d_top) {
+    memcpy(rv, wanted, 16);
+    return;
+  }
+
+  index -= (((uint64_t *) rd.d_start) - words)/2;
+  region = (uint8_t *) rd.d_start;
+  half = ((uint8_t *) rd.d_top - region)/2;
+
+  rv[1] = base->extract_word.w64(base, region, half, index);
+  rv[0] = base->extract_word.w64(base, region+half, half, index);
+}
+
+/*
+ * Return the number of scratch bytes a w=128 field needs for the given
+ * configuration, or 0 when the configuration is not supported here
+ * (including GF_DIVIDE_MATRIX).  region_type is not consulted.
+ */
+int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
+{
+  int size_m, size_r;
+  int is_split_4, is_split_8;
+
+  if (divide_type == GF_DIVIDE_MATRIX) return 0;
+
+  switch (mult_type) {
+    /* These methods keep no tables. */
+    case GF_MULT_CARRY_FREE:
+    case GF_MULT_SHIFT:
+    case GF_MULT_BYTWO_p:
+    case GF_MULT_BYTWO_b:
+      return sizeof(gf_internal_t);
+
+    case GF_MULT_DEFAULT:
+    case GF_MULT_SPLIT_TABLE:
+      is_split_4 = (arg1 == 4 && arg2 == 128) || (arg1 == 128 && arg2 == 4);
+      is_split_8 = (arg1 == 8 && arg2 == 128) || (arg1 == 128 && arg2 == 8);
+      if (is_split_4) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_4_128_data) + 64;
+      }
+      if (is_split_8 || mult_type == GF_MULT_DEFAULT) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_8_128_data) + 64;
+      }
+      return 0;
+
+    case GF_MULT_GROUP:
+      /* JSP: the arguments have already been error checked. */
+      size_m = (1 << arg1) * 2 * sizeof(uint64_t);
+      size_r = (1 << arg2) * 2 * sizeof(uint64_t);
+      /*
+       * The table data is preceded by the pointers of the tables
+       * structure, because the tables themselves are of dynamic size.
+       */
+      return sizeof(gf_internal_t) + size_m + size_r + 4 * sizeof(uint64_t *);
+
+    case GF_MULT_COMPOSITE:
+      if (arg1 != 2) return 0;
+      return sizeof(gf_internal_t) + 4;
+
+    default:
+      return 0;
+   }
+}
+
+/*
+ * Initialize a w=128 field: choose a default polynomial if none was given,
+ * dispatch to the per-method init routine, pick the extract_word routine
+ * that matches the region layout, and fill in whichever of divide/inverse
+ * the method did not provide.
+ *
+ * Returns 1 on success and 0 when the multiplication type is unknown, the
+ * per-method init fails, or no default polynomial is available for the
+ * composite field.
+ */
+int gf_w128_init(gf_t *gf)
+{
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  
+  /* Allen: set default primitive polynomial / irreducible polynomial if needed */
+
+  if (h->prim_poly == 0) {
+    if (h->mult_type == GF_MULT_COMPOSITE) {
+      h->prim_poly = gf_composite_get_default_poly(h->base_gf);
+      if (h->prim_poly == 0) return 0; /* This shouldn't happen */
+    } else {
+      h->prim_poly = 0x87; /* Omitting the leftmost 1 as in w=32 */
+    }
+  }
+
+  /* Start from a clean slate so the fix-ups at the end can tell which
+     operations the selected method installed. */
+  SET_FUNCTION(gf,multiply,w128,NULL)
+  SET_FUNCTION(gf,divide,w128,NULL)
+  SET_FUNCTION(gf,inverse,w128,NULL)
+  SET_FUNCTION(gf,multiply_region,w128,NULL)
+  switch(h->mult_type) {
+    case GF_MULT_BYTWO_p:
+    case GF_MULT_BYTWO_b:      if (gf_w128_bytwo_init(gf) == 0) return 0; break;
+    case GF_MULT_CARRY_FREE:   if (gf_w128_cfm_init(gf) == 0) return 0; break;
+    case GF_MULT_SHIFT:        if (gf_w128_shift_init(gf) == 0) return 0; break;
+    case GF_MULT_GROUP:        if (gf_w128_group_init(gf) == 0) return 0; break;
+    case GF_MULT_DEFAULT: 
+    case GF_MULT_SPLIT_TABLE:  if (gf_w128_split_init(gf) == 0) return 0; break;
+    case GF_MULT_COMPOSITE:    if (gf_w128_composite_init(gf) == 0) return 0; break;
+    default: return 0;
+  }
+
+  /* Ben: Used to be h->region_type == GF_REGION_ALTMAP, but failed since there
+     are multiple flags in h->region_type.  Both ALTMAP tests below are
+     therefore bit tests, matching the test gf_w128_composite_init() uses
+     to select the altmap region multiply. */
+  if (h->mult_type == GF_MULT_SPLIT_TABLE && (h->region_type & GF_REGION_ALTMAP)) {
+    SET_FUNCTION(gf,extract_word,w128,gf_w128_split_extract_word)
+  } else if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) {
+    SET_FUNCTION(gf,extract_word,w128,gf_w128_composite_extract_word)
+  } else {
+    SET_FUNCTION(gf,extract_word,w128,gf_w128_extract_word)
+  }
+
+  if (h->divide_type == GF_DIVIDE_EUCLID) {
+    SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse)
+  } 
+
+  /* Derive the missing one of divide/inverse from the other. */
+  if (gf->inverse.w128 != NULL && gf->divide.w128 == NULL) {
+    SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse)
+  }
+  if (gf->inverse.w128 == NULL && gf->divide.w128 != NULL) {
+    SET_FUNCTION(gf,inverse,w128,gf_w128_inverse_from_divide)
+  }
+  return 1;
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w16.c b/src/erasure-code/jerasure/gf-complete/src/gf_w16.c
new file mode 100644
index 000000000..831689267
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_w16.c
@@ -0,0 +1,2449 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w16.c
+ *
+ * Routines for 16-bit Galois fields
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w16.h"
+#include "gf_cpu.h"
+
+/* AB2: scalar step that updates b in place using two temporaries.  t1 is
+   (b << 1) masked with am1; t2 is b masked with am2 and then expanded by
+   ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); the result is t1 ^ (t2 & ip).
+   NOTE(review): presumably a packed multiply-by-two, with am2 selecting
+   each element's top bit and ip holding the replicated polynomial --
+   confirm at the call sites. */
+#define AB2(ip, am1 ,am2, b, t1, t2) {\
+  t1 = (b << 1) & am1;\
+  t2 = b & am2; \
+  t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \
+  b = (t1 ^ (t2 & ip));}
+
+/* SSE_AB2: the same sequence as AB2, written with 128-bit SSE intrinsics
+   on 64-bit lanes; va is updated in place. */
+#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\
+          t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
+          t2 = _mm_and_si128(va, m2); \
+          t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
+          va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
+
+/* MM_PRINT: debugging aid.  Prints the label s followed by the 16 bytes of
+   the __m128i r in hex, two bytes per group, highest byte first. */
+#define MM_PRINT(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 2) printf("  %02x %02x", blah[15-ii], blah[14-ii]); printf("\n"); }
+
+/* Bit 15, the top bit of a 16-bit element. */
+#define GF_FIRST_BIT (1 << 15)
+/* Shift p left by one and XOR in h->prim_poly when p's top bit was set.
+   Requires a variable named h to be in scope at the point of use. */
+#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)
+
+/* Inverse of a, computed as 1 divided by a with the field's divide. */
+static
+inline
+gf_val_32_t gf_w16_inverse_from_divide (gf_t *gf, gf_val_32_t a)
+{
+  gf_val_32_t inv;
+
+  inv = gf->divide.w32(gf, 1, a);
+  return inv;
+}
+
+/* a divided by b, computed as a times the inverse of b. */
+static
+inline
+gf_val_32_t gf_w16_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_val_32_t b_inv = gf->inverse.w32(gf, b);
+
+  return gf->multiply.w32(gf, a, b_inv);
+}
+
+/*
+ * Generic region multiply: multiplies every 16-bit word of src by val with
+ * the field's single multiply and stores the product in dest, or XORs it
+ * into dest when xor is non-zero.  val 0 and val 1 are delegated to
+ * gf_multby_zero() / gf_multby_one().
+ */
+static
+void
+gf_w16_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_region_data rd;
+  uint16_t *in, *out, *end;
+
+  switch (val) {
+    case 0: gf_multby_zero(dest, bytes, xor); return;
+    case 1: gf_multby_one(src, dest, bytes, xor); return;
+    default: break;
+  }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+  gf_do_initial_region_alignment(&rd);
+
+  in = (uint16_t *) rd.s_start;
+  out = (uint16_t *) rd.d_start;
+  end = (uint16_t *) rd.d_top;
+
+  if (xor) {
+    for (; out < end; out++, in++) {
+      *out ^= gf->multiply.w32(gf, val, *in);
+    }
+  } else {
+    for (; out < end; out++, in++) {
+      *out = gf->multiply.w32(gf, val, *in);
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+#if defined(INTEL_SSE4_PCLMUL)
+/*
+ * Carry-free region multiply using two reduction passes per word.
+ * Each 16-bit source word is multiplied by val with PCLMUL and reduced as
+ * explained in gf_w16_clm_multiply_2(); the product is stored in dest, or
+ * XORed into it when xor is non-zero.
+ */
+static
+void
+gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_internal_t *scratch = gf->scratch;
+  gf_region_data rd;
+  uint16_t *in, *out, *end;
+  __m128i a, b, prod, fold, poly;
+
+  poly = _mm_set_epi32(0, 0, 0, (uint32_t)(scratch->prim_poly & 0x1ffffULL));
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+  gf_do_initial_region_alignment(&rd);
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+
+  in = (uint16_t *) rd.s_start;
+  out = (uint16_t *) rd.d_start;
+  end = (uint16_t *) rd.d_top;
+
+  if (xor) {
+    for (; out < end; out++, in++) {
+      b = _mm_insert_epi32 (a, (gf_val_32_t)(*in), 0);
+      prod = _mm_clmulepi64_si128 (a, b, 0);
+
+      /* two reduction passes */
+      fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+      prod = _mm_xor_si128 (prod, fold);
+      fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+      prod = _mm_xor_si128 (prod, fold);
+
+      *out ^= ((gf_val_32_t)_mm_extract_epi32(prod, 0));
+    }
+  } else {
+    for (; out < end; out++, in++) {
+      b = _mm_insert_epi32 (a, (gf_val_32_t)(*in), 0);
+      prod = _mm_clmulepi64_si128 (a, b, 0);
+
+      /* two reduction passes */
+      fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+      prod = _mm_xor_si128 (prod, fold);
+      fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+      prod = _mm_xor_si128 (prod, fold);
+
+      *out = ((gf_val_32_t)_mm_extract_epi32(prod, 0));
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
+/*
+ * Carry-free region multiply using three reduction passes per word.
+ * Each 16-bit source word is multiplied by val with PCLMUL and reduced as
+ * explained in gf_w16_clm_multiply_2(), with one extra pass; the product
+ * is stored in dest, or XORed into it when xor is non-zero.
+ */
+static
+void
+gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_internal_t *scratch = gf->scratch;
+  gf_region_data rd;
+  uint16_t *in, *out, *end;
+  __m128i a, b, prod, fold, poly;
+
+  poly = _mm_set_epi32(0, 0, 0, (uint32_t)(scratch->prim_poly & 0x1ffffULL));
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+  gf_do_initial_region_alignment(&rd);
+
+  in = (uint16_t *) rd.s_start;
+  out = (uint16_t *) rd.d_start;
+  end = (uint16_t *) rd.d_top;
+
+  if (xor) {
+    for (; out < end; out++, in++) {
+      b = _mm_insert_epi32 (a, (gf_val_32_t)(*in), 0);
+      prod = _mm_clmulepi64_si128 (a, b, 0);
+
+      /* three reduction passes */
+      fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+      prod = _mm_xor_si128 (prod, fold);
+      fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+      prod = _mm_xor_si128 (prod, fold);
+      fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+      prod = _mm_xor_si128 (prod, fold);
+
+      *out ^= ((gf_val_32_t)_mm_extract_epi32(prod, 0));
+    }
+  } else {
+    for (; out < end; out++, in++) {
+      b = _mm_insert_epi32 (a, (gf_val_32_t)(*in), 0);
+      prod = _mm_clmulepi64_si128 (a, b, 0);
+
+      /* three reduction passes */
+      fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+      prod = _mm_xor_si128 (prod, fold);
+      fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+      prod = _mm_xor_si128 (prod, fold);
+      fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+      prod = _mm_xor_si128 (prod, fold);
+
+      *out = ((gf_val_32_t)_mm_extract_epi32(prod, 0));
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
+/*
+ * Carry-free region multiply using four reduction passes per word.
+ * Each 16-bit source word is multiplied by val with PCLMUL and reduced as
+ * explained in gf_w16_clm_multiply_2(), with two extra passes; the product
+ * is stored in dest, or XORed into it when xor is non-zero.
+ */
+static
+void
+gf_w16_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_internal_t *scratch = gf->scratch;
+  gf_region_data rd;
+  uint16_t *in, *out, *end;
+  __m128i a, b, prod, fold, poly;
+
+  poly = _mm_set_epi32(0, 0, 0, (uint32_t)(scratch->prim_poly & 0x1ffffULL));
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+  gf_do_initial_region_alignment(&rd);
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+
+  in = (uint16_t *) rd.s_start;
+  out = (uint16_t *) rd.d_start;
+  end = (uint16_t *) rd.d_top;
+
+  if (xor) {
+    for (; out < end; out++, in++) {
+      b = _mm_insert_epi32 (a, (gf_val_32_t)(*in), 0);
+      prod = _mm_clmulepi64_si128 (a, b, 0);
+
+      /* four reduction passes */
+      fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+      prod = _mm_xor_si128 (prod, fold);
+      fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+      prod = _mm_xor_si128 (prod, fold);
+      fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+      prod = _mm_xor_si128 (prod, fold);
+      fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+      prod = _mm_xor_si128 (prod, fold);
+
+      *out ^= ((gf_val_32_t)_mm_extract_epi32(prod, 0));
+    }
+  } else {
+    for (; out < end; out++, in++) {
+      b = _mm_insert_epi32 (a, (gf_val_32_t)(*in), 0);
+      prod = _mm_clmulepi64_si128 (a, b, 0);
+
+      /* four reduction passes */
+      fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+      prod = _mm_xor_si128 (prod, fold);
+      fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+      prod = _mm_xor_si128 (prod, fold);
+      fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+      prod = _mm_xor_si128 (prod, fold);
+      fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+      prod = _mm_xor_si128 (prod, fold);
+
+      *out = ((gf_val_32_t)_mm_extract_epi32(prod, 0));
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+/*
+ * Multiplicative inverse of b by the extended Euclidean algorithm on
+ * polynomials over GF(2), starting from the field's prim_poly and b.
+ *
+ * e_* hold the remainders, d_* the bit index of each remainder's leading
+ * one, y_* the running coefficient that becomes the inverse, and c_i the
+ * quotient of the current division step.
+ *
+ * Returns -1 (converted to gf_val_32_t) when b is 0, and 0 when a
+ * remainder becomes zero before reaching 1.
+ */
+static
+inline
+gf_val_32_t gf_w16_euclid (gf_t *gf, gf_val_32_t b)
+{
+  gf_val_32_t e_i, e_im1, e_ip1;
+  gf_val_32_t d_i, d_im1, d_ip1;
+  gf_val_32_t y_i, y_im1, y_ip1;
+  gf_val_32_t c_i;
+
+  if (b == 0) return -1;
+  e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
+  e_i = b;
+  d_im1 = 16;
+  /* Find the highest set bit of b; b != 0 so the scan terminates. */
+  for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ;
+  y_i = 1;
+  y_im1 = 0;
+
+  while (e_i != 1) {
+
+    e_ip1 = e_im1;
+    d_ip1 = d_im1;
+    c_i = 0;
+
+    /* Polynomial long division of e_im1 by e_i: c_i collects the quotient
+       bits and e_ip1 ends up as the remainder. */
+    while (d_ip1 >= d_i) {
+      c_i ^= (1 << (d_ip1 - d_i));
+      e_ip1 ^= (e_i << (d_ip1 - d_i));
+      if (e_ip1 == 0) return 0;
+      while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
+    }
+
+    /* Update the coefficient with a field multiply, then shift the
+       (i-1, i, i+1) window forward by one step. */
+    y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);
+    y_im1 = y_i;
+    y_i = y_ip1;
+
+    e_im1 = e_i;
+    d_im1 = d_i;
+    e_i = e_ip1;
+    d_i = d_ip1;
+  }
+
+  return y_i;
+}
+
+/*
+ * Return the index-th 16-bit word of a standard-layout region.  The gf and
+ * bytes arguments are not used.
+ */
+static
+gf_val_32_t gf_w16_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  const uint16_t *words = (uint16_t *) start;
+
+  return words[index];
+}
+
+/*
+ * Return the index-th 16-bit word of a composite-field region stored in
+ * the alternate mapping.
+ *
+ * Words outside the 32-byte-aligned middle of the region are read
+ * directly.  Inside it, the word is a | (b << 8), where a comes from the
+ * first half of the aligned region and b from the second half, both
+ * through the base field's extract_word.
+ */
+static
+gf_val_32_t gf_w16_composite_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  gf_internal_t *scratch = (gf_internal_t *) gf->scratch;
+  gf_t *base = scratch->base_gf;
+  uint16_t *words = (uint16_t *) start;
+  uint16_t a, b;
+  uint8_t *region;
+  int half;
+  gf_region_data rd;
+
+  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
+
+  if (words + index < (uint16_t *) rd.d_start ||
+      words + index >= (uint16_t *) rd.d_top) {
+    return words[index];
+  }
+
+  index -= (((uint16_t *) rd.d_start) - words);
+  region = (uint8_t *) rd.d_start;
+  half = ((uint8_t *) rd.d_top - region)/2;
+
+  a = base->extract_word.w32(base, region, half, index);
+  b = base->extract_word.w32(base, region+half, half, index);
+  return (a | (b << 8));
+}
+
+/*
+ * Return the index-th 16-bit word of a region stored in the split-table
+ * alternate mapping.
+ *
+ * Words outside the 32-byte-aligned middle of the region are read
+ * directly.  Inside it, each 32-byte block holds 16 words: the byte at the
+ * word's slot supplies bits 8..15 and the byte 16 positions later supplies
+ * bits 0..7.
+ */
+static
+gf_val_32_t gf_w16_split_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  uint16_t *words = (uint16_t *) start;
+  uint16_t word;
+  uint8_t *cursor;
+  gf_region_data rd;
+
+  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
+
+  if (words + index < (uint16_t *) rd.d_start ||
+      words + index >= (uint16_t *) rd.d_top) {
+    return words[index];
+  }
+
+  index -= (((uint16_t *) rd.d_start) - words);
+
+  /* Step to the 32-byte block, then to the slot inside it. */
+  cursor = (uint8_t *) rd.d_start;
+  cursor += ((index & 0xfffffff0)*2);
+  cursor += (index & 0xf);
+
+  word = (cursor[0] << 8);
+  word |= cursor[16];
+  return word;
+}
+
+/* Inverse of b obtained from gf_bitmatrix_inverse() for a 16-bit field
+   with this field's prim_poly. */
+static
+inline
+gf_val_32_t gf_w16_matrix (gf_t *gf, gf_val_32_t b)
+{
+  gf_internal_t *scratch = (gf_internal_t *) (gf->scratch);
+
+  return gf_bitmatrix_inverse(b, 16, scratch->prim_poly);
+}
+
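+/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm.  I only
+   include it for completeness.  It does have the feature that it requires no
+   extra memory.  (The shift implementation itself, gf_w16_shift_multiply(),
+   appears below, after the carry-free multiply variants.)
+ */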
+
+#if defined(INTEL_SSE4_PCLMUL)
+/*
+ * Carry-free multiply of two 16-bit field elements with two reduction
+ * passes.
+ *
+ * Ben: the prim_poly reduction is done twice.  At most two passes are
+ * needed because (w-2)/z == 2, where z is the number of zeros after the
+ * leading 1 of the polynomial.
+ *
+ * _mm_clmulepi64_si128 is the carry-less multiply.  _mm_srli_si128 shifts
+ * the product right by 2 bytes so that prim_poly is multiplied by the
+ * product's leading bits; that result is then XORed back into the
+ * product.
+ */
+static
+inline
+gf_val_32_t
+gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
+{
+  gf_internal_t *scratch = gf->scratch;
+  __m128i a, b, prod, fold, poly;
+
+  poly = _mm_set_epi32(0, 0, 0, (uint32_t)(scratch->prim_poly & 0x1ffffULL));
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0);
+  b = _mm_insert_epi32 (a, b16, 0);
+
+  /* Initial multiply. */
+  prod = _mm_clmulepi64_si128 (a, b, 0);
+
+  /* Two reduction passes. */
+  fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+  prod = _mm_xor_si128 (prod, fold);
+  fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+  prod = _mm_xor_si128 (prod, fold);
+
+  /* The answer is the low 32-bit lane. */
+  return ((gf_val_32_t)_mm_extract_epi32(prod, 0));
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
+/*
+ * Carry-free multiply of two 16-bit field elements with three reduction
+ * passes.  The method is the one explained in gf_w16_clm_multiply_2().
+ */
+static
+inline
+gf_val_32_t
+gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
+{
+  gf_internal_t *scratch = gf->scratch;
+  __m128i a, b, prod, fold, poly;
+
+  poly = _mm_set_epi32(0, 0, 0, (uint32_t)(scratch->prim_poly & 0x1ffffULL));
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0);
+  b = _mm_insert_epi32 (a, b16, 0);
+
+  /* Initial multiply. */
+  prod = _mm_clmulepi64_si128 (a, b, 0);
+
+  /* Three reduction passes. */
+  fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+  prod = _mm_xor_si128 (prod, fold);
+  fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+  prod = _mm_xor_si128 (prod, fold);
+  fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+  prod = _mm_xor_si128 (prod, fold);
+
+  /* The answer is the low 32-bit lane. */
+  return ((gf_val_32_t)_mm_extract_epi32(prod, 0));
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
+/*
+ * Carry-free multiply of two 16-bit field elements with four reduction
+ * passes.  The method is the one explained in gf_w16_clm_multiply_2().
+ */
+static
+inline
+gf_val_32_t
+gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
+{
+  gf_internal_t *scratch = gf->scratch;
+  __m128i a, b, prod, fold, poly;
+
+  poly = _mm_set_epi32(0, 0, 0, (uint32_t)(scratch->prim_poly & 0x1ffffULL));
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0);
+  b = _mm_insert_epi32 (a, b16, 0);
+
+  /* Initial multiply. */
+  prod = _mm_clmulepi64_si128 (a, b, 0);
+
+  /* Four reduction passes. */
+  fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+  prod = _mm_xor_si128 (prod, fold);
+  fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+  prod = _mm_xor_si128 (prod, fold);
+  fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+  prod = _mm_xor_si128 (prod, fold);
+  fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 2), 0);
+  prod = _mm_xor_si128 (prod, fold);
+
+  /* The answer is the low 32-bit lane. */
+  return ((gf_val_32_t)_mm_extract_epi32(prod, 0));
+}
+#endif
+
+
+/*
+ * Shift-and-add multiply.  First forms the carry-less product of a16 and
+ * b16, then clears bits 2*GF_FIELD_WIDTH-2 down to GF_FIELD_WIDTH, from
+ * the top, by XORing in prim_poly shifted under each set bit.
+ */
+static
+inline
+ gf_val_32_t
+gf_w16_shift_multiply (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
+{
+  gf_internal_t *scratch = (gf_internal_t *) gf->scratch;
+  gf_val_32_t poly = scratch->prim_poly;
+  gf_val_32_t acc = 0;
+  gf_val_32_t bit;
+
+  /* Carry-less product. */
+  for (bit = 0; bit < GF_FIELD_WIDTH; bit++) {
+    if (a16 & (1 << bit)) acc ^= (b16 << bit);
+  }
+
+  /* Reduction. */
+  for (bit = (GF_FIELD_WIDTH*2-2); bit >= GF_FIELD_WIDTH; bit--) {
+    if (acc & (1 << bit)) acc ^= (poly << (bit-GF_FIELD_WIDTH));
+  }
+
+  return acc;
+}
+
+/*
+ * Install the shift multiply as the field's single multiply.  Nothing else
+ * is set here.  Always returns 1.
+ */
+static
+int gf_w16_shift_init(gf_t *gf)
+{
+  SET_FUNCTION(gf,multiply,w32,gf_w16_shift_multiply)
+
+  return 1;
+}
+
+/*
+ * Install the carry-free (PCLMUL) multiply and region multiply for w=16.
+ *
+ * Ben: the number of reduction passes depends on which high bits of the
+ * polynomial are clear: mask 0xfe00 selects two passes, 0xf000 three and
+ * 0xe000 four.
+ *
+ * Returns 1 when a variant was installed.  Returns 0 when PCLMUL is not
+ * compiled in, the CPU does not report it, or none of the masks match.
+ */
+static 
+int gf_w16_cfm_init(gf_t *gf)
+{
+#if defined(INTEL_SSE4_PCLMUL)
+  gf_internal_t *scratch;
+
+  if (!gf_cpu_supports_intel_pclmul) return 0;
+
+  scratch = (gf_internal_t *) gf->scratch;
+
+  if ((0xfe00 & scratch->prim_poly) == 0) {
+    SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_2)
+    SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_2)
+    return 1;
+  }
+  if ((0xf000 & scratch->prim_poly) == 0) {
+    SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_3)
+    SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_3)
+    return 1;
+  }
+  if ((0xe000 & scratch->prim_poly) == 0) {
+    SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_4)
+    SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_4)
+    return 1;
+  }
+  return 0;
+#else
+  return 0;
+#endif
+}
+
+/* KMG: GF_MULT_LOGTABLE: */
+
+/*
+ * Log-table region multiply.  The log of val is looked up once; each
+ * non-zero source word is then multiplied by adding logs and reading the
+ * antilog table, and a zero source word yields zero.  The product is
+ * stored in dest, or XORed into it when xor is non-zero.
+ */
+static
+void
+gf_w16_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  struct gf_w16_logtable_data *tables;
+  gf_region_data rd;
+  uint16_t *in, *out, *end;
+  int log_val;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+  gf_do_initial_region_alignment(&rd);
+
+  tables = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+  log_val = tables->log_tbl[val];
+
+  in = (uint16_t *) rd.s_start;
+  out = (uint16_t *) rd.d_start;
+  end = (uint16_t *) rd.d_top;
+
+  if (xor) {
+    for (; out < end; out++, in++) {
+      /* XORing in a zero product would leave dest unchanged. */
+      if (*in != 0) *out ^= tables->antilog_tbl[log_val + tables->log_tbl[*in]];
+    }
+  } else {
+    for (; out < end; out++, in++) {
+      *out = (*in != 0) ? tables->antilog_tbl[log_val + tables->log_tbl[*in]] : 0;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+/* Log-table multiply: 0 if either operand is 0, otherwise the antilog of
+   the sum of the two logs. */
+static
+inline
+gf_val_32_t
+gf_w16_log_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_w16_logtable_data *tables;
+
+  tables = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+
+  if (a == 0 || b == 0) return 0;
+  return tables->antilog_tbl[(int) tables->log_tbl[a] + (int) tables->log_tbl[b]];
+}
+
+/* Log-table divide: 0 if either operand is 0 (including a zero divisor),
+   otherwise d_antilog indexed by log(a) - log(b), which may be negative. */
+static
+inline
+gf_val_32_t
+gf_w16_log_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_w16_logtable_data *tables;
+  int log_diff;
+
+  if (a != 0 && b != 0) {
+    tables = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+    log_diff = (int) tables->log_tbl[a] - (int) tables->log_tbl[b];
+    return (tables->d_antilog[log_diff]);
+  }
+
+  return 0;
+}
+
+/* Inverse of a, read from the precomputed inverse table. */
+static
+gf_val_32_t
+gf_w16_log_inverse(gf_t *gf, gf_val_32_t a)
+{
+  gf_internal_t *scratch = (gf_internal_t *) gf->scratch;
+  struct gf_w16_logtable_data *tables = (struct gf_w16_logtable_data *) scratch->private;
+
+  return (tables->inv_tbl[a]);
+}
+
+/*
+ * Build the log, antilog and inverse tables for w=16 and install the
+ * log-table multiply, divide, inverse and region multiply.
+ *
+ * If the tables cannot be built with this polynomial (a value is reached
+ * twice while generating powers of two):
+ *   - for GF_MULT_LOG_TABLE, _gf_errno is set to GF_E_LOGPOLY and 0 is
+ *     returned;
+ *   - for any other multiplication type that reuses this code, the
+ *     carry-free multiply is tried first and the shift multiply is the
+ *     fallback whenever carry-free is unavailable.
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+static
+int gf_w16_log_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_w16_logtable_data *ltd;
+  int i, b;
+  int check = 0;
+
+  h = (gf_internal_t *) gf->scratch;
+  ltd = h->private;
+  
+  for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++)
+    ltd->log_tbl[i] = 0;
+  /* d_antilog points into the middle of the doubled antilog table so that
+     it can be indexed with a negative log difference. */
+  ltd->d_antilog = ltd->antilog_tbl + GF_MULT_GROUP_SIZE;
+
+  /* Walk the powers of two: b is 2^i reduced by the polynomial. */
+  b = 1;
+  for (i = 0; i < GF_MULT_GROUP_SIZE; i++) {
+      if (ltd->log_tbl[b] != 0) check = 1;
+      ltd->log_tbl[b] = i;
+      ltd->antilog_tbl[i] = b;
+      ltd->antilog_tbl[i+GF_MULT_GROUP_SIZE] = b;
+      b <<= 1;
+      if (b & GF_FIELD_SIZE) {
+          b = b ^ h->prim_poly;
+      }
+  }
+
+  /* If you can't construct the log table, there's a problem.  This code is used for
+     some other implementations (e.g. in SPLIT), so if the log table doesn't work in 
+     that instance, use CARRY_FREE / SHIFT instead. */
+
+  if (check) {
+    if (h->mult_type != GF_MULT_LOG_TABLE) {
+      /* gf_w16_cfm_init() checks PCLMUL availability itself and installs
+         nothing when it returns 0, so SHIFT remains a clean fallback when
+         carry-free is not compiled in, not supported by the CPU, or
+         cannot handle this polynomial. */
+      if (gf_w16_cfm_init(gf)) {
+        return 1;
+      }
+      return gf_w16_shift_init(gf);
+    } else {
+      _gf_errno = GF_E_LOGPOLY;
+      return 0;
+    }
+  }
+
+  ltd->inv_tbl[0] = 0;  /* Not really, but we need to fill it with something  */
+  ltd->inv_tbl[1] = 1;
+  for (i = 2; i < GF_FIELD_SIZE; i++) {
+    ltd->inv_tbl[i] = ltd->antilog_tbl[GF_MULT_GROUP_SIZE-ltd->log_tbl[i]];
+  }
+
+  SET_FUNCTION(gf,inverse,w32,gf_w16_log_inverse)
+  SET_FUNCTION(gf,divide,w32,gf_w16_log_divide)
+  SET_FUNCTION(gf,multiply,w32,gf_w16_log_multiply)
+  SET_FUNCTION(gf,multiply_region,w32,gf_w16_log_multiply_region)
+
+  return 1;
+}
+
+/* JSP: GF_MULT_SPLIT_TABLE: Using 8 multiplication tables to leverage SSE instructions.
+*/
+
+
+/* Ben: Does alternate mapping multiplication using a split table in the
+ lazy method without sse instructions*/
+
+/*
+ * Region multiply for the alternate mapping, using lazily built 4-bit
+ * split tables and no SSE instructions.
+ *
+ * The aligned region is processed in 32-byte blocks of 16 words.  Within a
+ * block, byte k holds the high byte of word k and byte k+16 holds its low
+ * byte.  For each word the four nibble table entries are XORed together,
+ * XORed with the current destination word when xor is non-zero, and
+ * written back in the same layout.
+ */
+static 
+void
+gf_w16_split_4_16_lazy_nosse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t nib, pos, k, prod;
+  uint8_t *in, *out, *end;
+  uint16_t table[4][16];
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+
+  /* Ben: build the lazy table; table[pos][nib] is val times the nibble
+     value nib placed at nibble position pos. */
+  for (nib = 0; nib < 16; nib++) {
+    for (pos = 0; pos < 4; pos++) {
+      table[pos][nib] = gf->multiply.w32(gf, (nib << (pos*4)), val);
+    }
+  }
+
+  in = (uint8_t *) rd.s_start;
+  out = (uint8_t *) rd.d_start;
+  end = (uint8_t *) rd.d_top;
+
+  for (; out < end; in += 32, out += 32) {
+    for (k = 0; k < 16; k++) {
+
+      /* Ben: with xor set, start from the word already in dest. */
+      prod = (xor) ? ((uint16_t)(out[k])<<8) ^ out[k+16] : 0;
+
+      /* Ben: XOR in the four table lookups, low byte nibbles first. */
+      prod ^= ((table[0][in[k+16]&0xf]) ^
+          (table[1][(in[k+16]&0xf0)>>4]) ^
+          (table[2][in[k]&0xf]) ^
+          (table[3][(in[k]&0xf0)>>4]));
+
+      /* Ben: store the high and low bytes back in altmap layout. */
+      out[k] = (uint8_t)(prod >> 8);
+      out[k+16] = (uint8_t)(prod & 0x00ff);
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+/*
+ * Region multiply for the 4,16 split table on standard-layout data, in plain C.
+ *
+ * Builds four 16-entry tables (one per nibble of a 16-bit word) holding
+ * val * (digit << 4*nibble), then forms each product as the XOR of four
+ * lookups.  Products are stored in dest, or XORed into dest when xor != 0.
+ * val == 0 and val == 1 are handed to gf_multby_zero() / gf_multby_one().
+ *
+ * NOTE(review): unlike its siblings this routine never calls
+ * gf_do_final_region_alignment(); presumably nothing is left over when the
+ * requested alignment is 2 bytes -- confirm against gf_set_region_data().
+ */
+static
+void
+gf_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t nib, digit, word, acc;
+  uint16_t *sp, *dp, *dend;
+  uint16_t tbl[4][16];
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+  gf_do_initial_region_alignment(&rd);
+
+  for (digit = 0; digit < 16; digit++) {
+    for (nib = 0; nib < 4; nib++) {
+      tbl[nib][digit] = gf->multiply.w32(gf, digit << (nib * 4), val);
+    }
+  }
+
+  dend = (uint16_t *) rd.d_top;
+  for (sp = (uint16_t *) rd.s_start, dp = (uint16_t *) rd.d_start; dp < dend; sp++, dp++) {
+    word = *sp;
+    acc = (xor) ? *dp : 0;
+    /* Consume the word one nibble at a time, lowest nibble first. */
+    for (nib = 0; nib < 4; nib++) {
+      acc ^= tbl[nib][(word >> (nib * 4)) & 0xf];
+    }
+    *dp = acc;
+  }
+}
+
+/*
+ * Region multiply for the 8,16 split table, processing 64 bits (four 16-bit
+ * words) per step.
+ *
+ * Two 256-entry tables are built on the fly: lo_tbl[x] is val times the byte
+ * x, and hi_tbl[x] continues the same doubling sequence for the next eight
+ * bit positions.  Each 16-bit product is hi_tbl[high byte] ^ lo_tbl[low byte].
+ * Products are stored in dest, or XORed into dest when xor != 0.
+ * val == 0 and val == 1 are handed to gf_multby_zero() / gf_multby_one().
+ */
+static
+void
+gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t bit, idx, lane, mult, word, acc;
+  uint64_t *sp, *dp, *dend;
+  gf_internal_t *h;
+  uint64_t hi_tbl[256], lo_tbl[256];
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+
+  /* NOTE(review): h is not referenced by name below; GF_MULTBY_TWO presumably
+     expands to code that reads it, so the variable must keep this name --
+     confirm against the macro definition. */
+  h = (gf_internal_t *) gf->scratch;
+
+  /* Fill each table by doubling: entry (bit | idx) = entry idx ^ current
+     multiple of val.  mult carries over from the low table into the high. */
+  mult = val;
+  lo_tbl[0] = 0;
+  for (bit = 1; bit < 256; bit <<= 1) {
+    for (idx = 0; idx < bit; idx++) lo_tbl[bit | idx] = (lo_tbl[idx] ^ mult);
+    mult = GF_MULTBY_TWO(mult);
+  }
+  hi_tbl[0] = 0;
+  for (bit = 1; bit < 256; bit <<= 1) {
+    for (idx = 0; idx < bit; idx++) hi_tbl[bit | idx] = (hi_tbl[idx] ^ mult);
+    mult = GF_MULTBY_TWO(mult);
+  }
+
+  sp = (uint64_t *) rd.s_start;
+  dp = (uint64_t *) rd.d_start;
+  dend = (uint64_t *) rd.d_top;
+
+  /* The original author also tried a hand-unrolled version of this loop and
+     noted that unrolling does not seem to matter. */
+  for (; dp != dend; sp++, dp++) {
+    word = *sp;
+    acc = 0;
+    /* Take the four 16-bit words from the most significant end down. */
+    for (lane = 0; lane < 4; lane++) {
+      acc = (acc << 16) ^ hi_tbl[word >> 56] ^ lo_tbl[(word >> 48) & 0xff];
+      word <<= 16;
+    }
+
+    /* JSP: the xor test could be hoisted out of the loop, but that needs to
+       be measured before deciding which form is better. */
+    if (xor) acc ^= *dp;
+    *dp = acc;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+/*
+ * Region multiply using a full lazily-built product table.
+ *
+ * Fills ltd->lazytable[x] with x * val for every x below GF_FIELD_SIZE
+ * (computed with gf_w16_shift_multiply) and then lets
+ * gf_two_byte_region_table_multiply() apply the table to the region.
+ * val == 0 and val == 1 are handed to gf_multby_zero() / gf_multby_one().
+ *
+ * The original source also carried a disabled variant that filled the table
+ * by stepping through successive doublings instead of calling the multiplier.
+ */
+static void
+gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  struct gf_w16_lazytable_data *ltd;
+  gf_region_data rd;
+  uint64_t x;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+
+  ltd = (struct gf_w16_lazytable_data *) ((gf_internal_t *) gf->scratch)->private;
+
+  ltd->lazytable[0] = 0;
+  x = 1;
+  while (x < GF_FIELD_SIZE) {
+    ltd->lazytable[x] = gf_w16_shift_multiply(gf, x, val);
+    x++;
+  }
+
+  gf_two_byte_region_table_multiply(&rd, ltd->lazytable);
+  gf_do_final_region_alignment(&rd);
+}
+
+#ifdef INTEL_SSSE3
+/*
+ * SSSE3 region multiply for the 4,16 split table on standard-layout data.
+ *
+ * For each of the four nibble positions, two 16-byte shuffle tables hold the
+ * low and high product bytes of val * (digit << 4*nibble).  Each step loads
+ * 32 source bytes, separates the low and high bytes of the sixteen words with
+ * pack operations, does eight byte shuffles, and re-interleaves the result
+ * with unpack operations before storing (or XORing into) dest.
+ * val == 0 and val == 1 are handed to gf_multby_zero() / gf_multby_one().
+ */
+static
+void
+gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t nib, digit, elem, product;
+  uint64_t *sp, *dp, *dend;
+  uint8_t lo_bytes[4][16];
+  uint8_t hi_bytes[4][16];
+  gf_region_data rd;
+  __m128i nib_mask, byte_mask, va, vb, ha, hb, idx, plo, phi;
+  __m128i lo_tbl[4], hi_tbl[4];
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+
+  /* Split every table product into its low and high byte. */
+  for (digit = 0; digit < 16; digit++) {
+    for (nib = 0; nib < 4; nib++) {
+      elem = (digit << (nib*4));
+      product = gf->multiply.w32(gf, elem, val);
+      lo_bytes[nib][digit] = (product & 0xff);
+      hi_bytes[nib][digit] = (product >> 8);
+    }
+  }
+
+  for (nib = 0; nib < 4; nib++) {
+    lo_tbl[nib] = _mm_loadu_si128((__m128i *)lo_bytes[nib]);
+    hi_tbl[nib] = _mm_loadu_si128((__m128i *)hi_bytes[nib]);
+  }
+
+  sp = (uint64_t *) rd.s_start;
+  dp = (uint64_t *) rd.d_start;
+  dend = (uint64_t *) rd.d_top;
+
+  nib_mask = _mm_set1_epi8 (0x0f);
+  byte_mask = _mm_set1_epi16 (0xff);
+
+  if (xor) {
+    for (; dp != dend; dp += 4, sp += 4) {
+
+      va = _mm_load_si128((__m128i *) sp);
+      vb = _mm_load_si128((__m128i *) (sp+2));
+
+      /* Gather the low bytes of all 16 words into vb, the high bytes into va. */
+      ha = _mm_srli_epi16(va, 8);
+      hb = _mm_srli_epi16(vb, 8);
+      plo = _mm_and_si128(vb, byte_mask);
+      phi = _mm_and_si128(va, byte_mask);
+
+      vb = _mm_packus_epi16(plo, phi);
+      va = _mm_packus_epi16(hb, ha);
+
+      /* Nibble 0: low nibble of the low bytes. */
+      idx = _mm_and_si128 (nib_mask, vb);
+      phi = _mm_shuffle_epi8 (hi_tbl[0], idx);
+      plo = _mm_shuffle_epi8 (lo_tbl[0], idx);
+
+      /* Nibble 1: high nibble of the low bytes. */
+      vb = _mm_srli_epi16(vb, 4);
+      idx = _mm_and_si128 (nib_mask, vb);
+      plo = _mm_xor_si128(_mm_shuffle_epi8 (lo_tbl[1], idx), plo);
+      phi = _mm_xor_si128(_mm_shuffle_epi8 (hi_tbl[1], idx), phi);
+
+      /* Nibble 2: low nibble of the high bytes. */
+      idx = _mm_and_si128 (nib_mask, va);
+      plo = _mm_xor_si128(_mm_shuffle_epi8 (lo_tbl[2], idx), plo);
+      phi = _mm_xor_si128(_mm_shuffle_epi8 (hi_tbl[2], idx), phi);
+
+      /* Nibble 3: high nibble of the high bytes. */
+      va = _mm_srli_epi16(va, 4);
+      idx = _mm_and_si128 (nib_mask, va);
+      plo = _mm_xor_si128(_mm_shuffle_epi8 (lo_tbl[3], idx), plo);
+      phi = _mm_xor_si128(_mm_shuffle_epi8 (hi_tbl[3], idx), phi);
+
+      /* Interleave product bytes back into 16-bit words. */
+      va = _mm_unpackhi_epi8(plo, phi);
+      vb = _mm_unpacklo_epi8(plo, phi);
+
+      /* Accumulate into the existing destination contents. */
+      ha = _mm_load_si128((__m128i *) dp);
+      va = _mm_xor_si128(va, ha);
+      hb = _mm_load_si128((__m128i *) (dp+2));
+      vb = _mm_xor_si128(vb, hb);
+      _mm_store_si128 ((__m128i *)dp, va);
+      _mm_store_si128 ((__m128i *)(dp+2), vb);
+    }
+  } else {
+    for (; dp != dend; dp += 4, sp += 4) {
+
+      va = _mm_load_si128((__m128i *) sp);
+      vb = _mm_load_si128((__m128i *) (sp+2));
+
+      /* Gather the low bytes of all 16 words into vb, the high bytes into va. */
+      ha = _mm_srli_epi16(va, 8);
+      hb = _mm_srli_epi16(vb, 8);
+      plo = _mm_and_si128(vb, byte_mask);
+      phi = _mm_and_si128(va, byte_mask);
+
+      vb = _mm_packus_epi16(plo, phi);
+      va = _mm_packus_epi16(hb, ha);
+
+      /* Nibble 0: low nibble of the low bytes. */
+      idx = _mm_and_si128 (nib_mask, vb);
+      phi = _mm_shuffle_epi8 (hi_tbl[0], idx);
+      plo = _mm_shuffle_epi8 (lo_tbl[0], idx);
+
+      /* Nibble 1: high nibble of the low bytes. */
+      vb = _mm_srli_epi16(vb, 4);
+      idx = _mm_and_si128 (nib_mask, vb);
+      plo = _mm_xor_si128(_mm_shuffle_epi8 (lo_tbl[1], idx), plo);
+      phi = _mm_xor_si128(_mm_shuffle_epi8 (hi_tbl[1], idx), phi);
+
+      /* Nibble 2: low nibble of the high bytes. */
+      idx = _mm_and_si128 (nib_mask, va);
+      plo = _mm_xor_si128(_mm_shuffle_epi8 (lo_tbl[2], idx), plo);
+      phi = _mm_xor_si128(_mm_shuffle_epi8 (hi_tbl[2], idx), phi);
+
+      /* Nibble 3: high nibble of the high bytes. */
+      va = _mm_srli_epi16(va, 4);
+      idx = _mm_and_si128 (nib_mask, va);
+      plo = _mm_xor_si128(_mm_shuffle_epi8 (lo_tbl[3], idx), plo);
+      phi = _mm_xor_si128(_mm_shuffle_epi8 (hi_tbl[3], idx), phi);
+
+      /* Interleave product bytes back into 16-bit words. */
+      va = _mm_unpackhi_epi8(plo, phi);
+      vb = _mm_unpacklo_epi8(plo, phi);
+
+      _mm_store_si128 ((__m128i *)dp, va);
+      _mm_store_si128 ((__m128i *)(dp+2), vb);
+    }
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+#ifdef INTEL_SSSE3
+/*
+ * SSSE3 region multiply for the 4,16 split table on ALTMAP-formatted data.
+ *
+ * Each 32-byte group is read as two vectors: the first is indexed against
+ * nibble tables 2 and 3, the second against nibble tables 0 and 1, so the
+ * first holds the high bytes and the second the low bytes of sixteen words.
+ * Because the bytes are already separated no pack/unpack step is needed; the
+ * high-byte products are written to the first 16 bytes of the group and the
+ * low-byte products to the second.
+ * val == 0 and val == 1 are handed to gf_multby_zero() / gf_multby_one().
+ */
+static
+void
+gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t nib, digit, elem, product;
+  uint64_t *sp, *dp, *dend;
+  uint8_t lo_bytes[4][16];
+  uint8_t hi_bytes[4][16];
+  gf_region_data rd;
+  __m128i nib_mask, va, vb, idx, plo, phi, lo_tbl[4], hi_tbl[4];
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+
+  /* Split every table product into its low and high byte. */
+  for (digit = 0; digit < 16; digit++) {
+    for (nib = 0; nib < 4; nib++) {
+      elem = (digit << (nib*4));
+      product = gf->multiply.w32(gf, elem, val);
+      lo_bytes[nib][digit] = (product & 0xff);
+      hi_bytes[nib][digit] = (product >> 8);
+    }
+  }
+
+  for (nib = 0; nib < 4; nib++) {
+    lo_tbl[nib] = _mm_loadu_si128((__m128i *)lo_bytes[nib]);
+    hi_tbl[nib] = _mm_loadu_si128((__m128i *)hi_bytes[nib]);
+  }
+
+  sp = (uint64_t *) rd.s_start;
+  dp = (uint64_t *) rd.d_start;
+  dend = (uint64_t *) rd.d_top;
+
+  nib_mask = _mm_set1_epi8 (0x0f);
+
+  if (xor) {
+    for (; dp != dend; dp += 4, sp += 4) {
+
+      va = _mm_load_si128((__m128i *) sp);
+      vb = _mm_load_si128((__m128i *) (sp+2));
+
+      /* Nibble 0: low nibble of the low bytes. */
+      idx = _mm_and_si128 (nib_mask, vb);
+      phi = _mm_shuffle_epi8 (hi_tbl[0], idx);
+      plo = _mm_shuffle_epi8 (lo_tbl[0], idx);
+
+      /* Nibble 1: high nibble of the low bytes. */
+      vb = _mm_srli_epi16(vb, 4);
+      idx = _mm_and_si128 (nib_mask, vb);
+      plo = _mm_xor_si128(_mm_shuffle_epi8 (lo_tbl[1], idx), plo);
+      phi = _mm_xor_si128(_mm_shuffle_epi8 (hi_tbl[1], idx), phi);
+
+      /* Nibble 2: low nibble of the high bytes. */
+      idx = _mm_and_si128 (nib_mask, va);
+      plo = _mm_xor_si128(_mm_shuffle_epi8 (lo_tbl[2], idx), plo);
+      phi = _mm_xor_si128(_mm_shuffle_epi8 (hi_tbl[2], idx), phi);
+
+      /* Nibble 3: high nibble of the high bytes. */
+      va = _mm_srli_epi16(va, 4);
+      idx = _mm_and_si128 (nib_mask, va);
+      plo = _mm_xor_si128(_mm_shuffle_epi8 (lo_tbl[3], idx), plo);
+      phi = _mm_xor_si128(_mm_shuffle_epi8 (hi_tbl[3], idx), phi);
+
+      /* Accumulate into the existing destination contents. */
+      va = _mm_load_si128((__m128i *) dp);
+      phi = _mm_xor_si128(phi, va);
+      _mm_store_si128 ((__m128i *)dp, phi);
+      vb = _mm_load_si128((__m128i *) (dp+2));
+      plo = _mm_xor_si128(plo, vb);
+      _mm_store_si128 ((__m128i *)(dp+2), plo);
+    }
+  } else {
+    for (; dp != dend; dp += 4, sp += 4) {
+
+      va = _mm_load_si128((__m128i *) sp);
+      vb = _mm_load_si128((__m128i *) (sp+2));
+
+      /* Nibble 0: low nibble of the low bytes. */
+      idx = _mm_and_si128 (nib_mask, vb);
+      phi = _mm_shuffle_epi8 (hi_tbl[0], idx);
+      plo = _mm_shuffle_epi8 (lo_tbl[0], idx);
+
+      /* Nibble 1: high nibble of the low bytes. */
+      vb = _mm_srli_epi16(vb, 4);
+      idx = _mm_and_si128 (nib_mask, vb);
+      plo = _mm_xor_si128(_mm_shuffle_epi8 (lo_tbl[1], idx), plo);
+      phi = _mm_xor_si128(_mm_shuffle_epi8 (hi_tbl[1], idx), phi);
+
+      /* Nibble 2: low nibble of the high bytes. */
+      idx = _mm_and_si128 (nib_mask, va);
+      plo = _mm_xor_si128(_mm_shuffle_epi8 (lo_tbl[2], idx), plo);
+      phi = _mm_xor_si128(_mm_shuffle_epi8 (hi_tbl[2], idx), phi);
+
+      /* Nibble 3: high nibble of the high bytes. */
+      va = _mm_srli_epi16(va, 4);
+      idx = _mm_and_si128 (nib_mask, va);
+      plo = _mm_xor_si128(_mm_shuffle_epi8 (lo_tbl[3], idx), plo);
+      phi = _mm_xor_si128(_mm_shuffle_epi8 (hi_tbl[3], idx), phi);
+
+      _mm_store_si128 ((__m128i *)dp, phi);
+      _mm_store_si128 ((__m128i *)(dp+2), plo);
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+
+}
+#endif
+
+/*
+ * Single multiplication with the 8,8 split tables.
+ *
+ * Splits a and b into low and high bytes and XORs the four partial products:
+ * tables[0] covers low*low, tables[1] covers both mixed terms, and tables[2]
+ * covers high*high.
+ */
+uint32_t 
+gf_w16_split_8_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_w16_split_8_8_data *d8;
+  uint32_t a_lo, a_hi, b_lo, b_hi;
+  uint32_t result;
+
+  d8 = (struct gf_w16_split_8_8_data *) ((gf_internal_t *) gf->scratch)->private;
+
+  a_lo = a & 0xff;
+  a_hi = a >> 8;
+  b_lo = b & 0xff;
+  b_hi = b >> 8;
+
+  result  = d8->tables[0][a_lo][b_lo];
+  result ^= d8->tables[1][a_lo][b_hi];
+  result ^= d8->tables[1][a_hi][b_lo];
+  result ^= d8->tables[2][a_hi][b_hi];
+  return result;
+}
+
+/*
+ * Initializes the SPLIT_TABLE implementation for w=16.
+ *
+ * arg1 == 8 && arg2 == 8: builds the three 256x256 tables in the private
+ * data area and installs gf_w16_split_8_8_multiply together with the 8,16
+ * lazy region routine.
+ *
+ * Otherwise single multiplication comes from gf_w16_log_init() (which itself
+ * falls back to another method when the log table cannot be built), and the
+ * region routine is picked from arg1/arg2, the region_type flags and the
+ * SIMD support that is compiled in and detected at run time.
+ *
+ * Returns 1 on success.  Returns 0 when gf_w16_log_init() fails, or when
+ * GF_REGION_SIMD is requested for the 4,16 split but no SIMD path is usable.
+ */
+static 
+int gf_w16_split_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_w16_split_8_8_data *d8;
+  int i, j, exp;
+  uint32_t p, basep, tmp;
+
+  h = (gf_internal_t *) gf->scratch;
+
+  if (h->arg1 == 8 && h->arg2 == 8) {
+    d8 = (struct gf_w16_split_8_8_data *) h->private;
+    basep = 1;
+    for (exp = 0; exp < 3; exp++) {
+      /* Row 0 and column 0 are products with zero. */
+      for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0;
+      for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0;
+      d8->tables[exp][1][1] = basep;
+      /* Column 1: i * basep, derived from i^1 (add basep) or i>>1 (double). */
+      for (i = 2; i < 256; i++) {
+        if (i&1) {
+          p = d8->tables[exp][i^1][1];
+          d8->tables[exp][i][1] = p ^ basep;
+        } else {
+          p = d8->tables[exp][i>>1][1];
+          d8->tables[exp][i][1] = GF_MULTBY_TWO(p);
+        }
+      }
+      /* Remaining columns: same recurrence applied along j. */
+      for (i = 1; i < 256; i++) {
+        p = d8->tables[exp][i][1];
+        for (j = 1; j < 256; j++) {
+          if (j&1) {
+            d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p;
+          } else {
+            tmp = d8->tables[exp][i][j>>1];
+            d8->tables[exp][i][j] = GF_MULTBY_TWO(tmp);
+          }
+        }
+      }
+      /* Advance basep by eight doublings for the next table. */
+      for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep);
+    }
+    SET_FUNCTION(gf,multiply,w32,gf_w16_split_8_8_multiply)
+    SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region)
+    return 1;
+
+  }
+
+  /* We'll be using LOG for multiplication, unless the pp isn't primitive.
+     In that case, we'll be using SHIFT. */
+
+  /* Propagate failure: the region routines below call gf->multiply.w32, so
+     continuing after a failed init would leave it unset. */
+  if (!gf_w16_log_init(gf)) return 0;
+
+  /* Defaults */
+
+#ifdef INTEL_SSSE3
+  if (gf_cpu_supports_intel_ssse3) {
+    SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_sse_multiply_region)
+  } else {
+#elif defined(ARM_NEON)
+  /* Tested with defined() so it agrees with the closing-brace guard below. */
+  if (gf_cpu_supports_arm_neon) {
+    gf_w16_neon_split_init(gf);
+  } else {
+#endif
+    SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region)
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
+  }
+#endif
+
+  if ((h->arg1 == 8 && h->arg2 == 16) || (h->arg2 == 8 && h->arg1 == 16)) {
+    SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region)
+
+  } else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) {
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
+    if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
+      /* SIMD is available: override the default only when the caller asked
+         for NOSIMD and/or ALTMAP. */
+      if ((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSIMD)) {
+        SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_nosse_altmap_multiply_region)
+      } else if (h->region_type & GF_REGION_NOSIMD) {
+        SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region)
+      }
+#if defined(INTEL_SSSE3)
+      else if ((h->region_type & GF_REGION_ALTMAP) && gf_cpu_supports_intel_ssse3) {
+        SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_sse_altmap_multiply_region)
+      }
+#endif        
+    } else {
+#endif
+      /* No usable SIMD: an explicit SIMD request cannot be honoured. */
+      if (h->region_type & GF_REGION_SIMD) {
+        return 0;
+      } else if (h->region_type & GF_REGION_ALTMAP) {
+        SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_nosse_altmap_multiply_region)
+      } else {
+        SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region)
+      }
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
+    }
+#endif
+  }
+
+  return 1;
+}
+
+/*
+ * Initializes the TABLE implementation for w=16.
+ *
+ * Single-element operations are set up by gf_w16_log_init(); region
+ * multiplication uses the lazily built full product table.
+ *
+ * Returns 1 on success, or 0 if gf_w16_log_init() fails, so the caller is
+ * not handed a gf_t whose function pointers were never installed.
+ */
+static 
+int gf_w16_table_init(gf_t *gf)
+{
+  if (!gf_w16_log_init(gf)) return 0;
+
+  SET_FUNCTION(gf,multiply_region,w32,gf_w16_table_lazy_multiply_region) 
+  return 1;
+}
+
+/*
+ * Region multiply using the LOG_ZERO tables.
+ *
+ * Every product is antilog_tbl[log(val) + log(src word)], with no test for a
+ * zero source word inside the loop.  Products are stored in dest, or XORed
+ * into dest when xor != 0.
+ * val == 0 and val == 1 are handed to gf_multby_zero() / gf_multby_one().
+ */
+static
+void
+gf_w16_log_zero_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint16_t log_val;
+  uint16_t *sp, *dp, *dend;
+  struct gf_w16_zero_logtable_data *ltd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+  gf_do_initial_region_alignment(&rd);
+
+  ltd = (struct gf_w16_zero_logtable_data*) ((gf_internal_t *) gf->scratch)->private;
+  log_val = ltd->log_tbl[val];
+
+  sp = (uint16_t *) rd.s_start;
+  dp = (uint16_t *) rd.d_start;
+  dend = (uint16_t *) rd.d_top;
+
+  if (xor) {
+    for (; dp < dend; sp++, dp++) {
+      *dp ^= (ltd->antilog_tbl[log_val + ltd->log_tbl[*sp]]);
+    }
+  } else {
+    for (; dp < dend; sp++, dp++) {
+      *dp = (ltd->antilog_tbl[log_val + ltd->log_tbl[*sp]]);
+    }
+  }
+
+  /* The original author notes that this final call isn't necessary. */
+
+  gf_do_final_region_alignment(&rd);
+}
+
+/* Here -- double-check Kevin */
+
+/*
+ * LOG_ZERO single multiplication: looks up antilog_tbl[log(a) + log(b)]
+ * without testing either operand for zero.
+ */
+static
+inline
+gf_val_32_t
+gf_w16_log_zero_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w16_zero_logtable_data *ltd = (struct gf_w16_zero_logtable_data *) h->private;
+
+  return ltd->antilog_tbl[ltd->log_tbl[a] + ltd->log_tbl[b]];
+}
+
+/*
+ * LOG_ZERO division: returns a / b as
+ * antilog_tbl[log(a) - log(b) + GF_MULT_GROUP_SIZE].
+ * Returns 0 when either operand is 0 (including division by zero).
+ */
+static
+inline
+gf_val_32_t
+gf_w16_log_zero_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_w16_zero_logtable_data *ltd;
+  int log_diff;
+
+  if (!a || !b) return 0;
+
+  ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+
+  /* Offset by the group size so the index cannot go negative. */
+  log_diff = ltd->log_tbl[a] - ltd->log_tbl[b] + (GF_MULT_GROUP_SIZE);
+  return (ltd->antilog_tbl[log_diff]);
+}
+
+/*
+ * LOG_ZERO inverse: a direct lookup in the precomputed inv_tbl.
+ */
+static
+gf_val_32_t
+gf_w16_log_zero_inverse (gf_t *gf, gf_val_32_t a)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w16_zero_logtable_data *ltd = (struct gf_w16_zero_logtable_data *) h->private;
+
+  return (ltd->inv_tbl[a]);
+}
+
+/*
+ * BYTWO_p single multiplication.
+ *
+ * Scans the 16 bits of a from the most significant down.  At each step the
+ * running product is doubled (reduced with prim_poly when bit 15 was set) and
+ * b is added in when the current bit of a is set.
+ */
+static
+inline
+gf_val_32_t
+gf_w16_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  uint32_t poly = h->prim_poly;
+  uint32_t acc = 0;
+  uint32_t bit;
+
+  for (bit = 0x8000; bit != 0; bit >>= 1) {
+    acc = (acc & 0x8000) ? ((acc << 1) ^ poly) : (acc << 1);
+    if (a & bit) acc ^= b;
+  }
+  return acc;
+}
+
+/*
+ * BYTWO_b single multiplication.
+ *
+ * Scans a from its least significant bit upward, adding the current multiple
+ * of b whenever the bit is set, and doubling b (reduced with prim_poly when
+ * bit 15 was set) between steps.  Stops as soon as no bits of a remain.
+ */
+static
+inline
+gf_val_32_t
+gf_w16_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  uint32_t poly = h->prim_poly;
+  uint32_t acc = 0;
+
+  for (;;) {
+    if (a & 1) acc ^= b;
+    a >>= 1;
+    if (a == 0) break;
+    b = (b & 0x8000) ? ((b << 1) ^ poly) : (b << 1);
+  }
+  return acc;
+}
+
+/*
+ * BYTWO_p region multiply without SIMD, handling 64 bits per step.
+ *
+ * For each 64-bit source value the bits of val are scanned from bit 15 down:
+ * the running product is doubled with the AB2 macro and the source value is
+ * added in when the current bit of val is set.  Products are stored in dest,
+ * or XORed into dest when xor != 0.
+ * val == 0 and val == 1 are handed to gf_multby_zero() / gf_multby_one().
+ */
+static
+void 
+gf_w16_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t *sp, *dp, *send;
+  uint64_t t1, t2, word, acc, bit;
+  gf_region_data rd;
+  struct gf_w16_bytwo_data *btd;
+    
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+
+  sp = (uint64_t *) rd.s_start;
+  dp = (uint64_t *) rd.d_start;
+  send = (uint64_t *) rd.s_top;
+
+  /* This loop is bounded by the source pointer, not the destination. */
+  for (; sp < send; sp++, dp++) {
+    acc = 0;
+    word = *sp;
+    for (bit = 0x8000; bit != 0; bit >>= 1) {
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, acc, t1, t2);
+      if (val & bit) acc ^= word;
+    }
+    if (xor) {
+      *dp ^= acc;
+    } else {
+      *dp = acc;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+#define BYTWO_P_ONESTEP {\
+      SSE_AB2(pp, m1 ,m2, prod, t1, t2); \
+      t1 = _mm_and_si128(v, one); \
+      t1 = _mm_sub_epi16(t1, one); \
+      t1 = _mm_and_si128(t1, ta); \
+      prod = _mm_xor_si128(prod, t1); \
+      v = _mm_srli_epi64(v, 1); }
+
+#ifdef INTEL_SSE2
+/*
+ * BYTWO_p region multiply with SSE2, handling 16 bytes per step.
+ *
+ * The low 16 bits of val are bit-reversed and complemented into inv_rev, so
+ * that BYTWO_P_ONESTEP, which looks at the low bit of v and then shifts v
+ * right, visits the bits of val from bit 15 down and adds the source vector
+ * whenever the corresponding bit of val is set.
+ * Products are stored in dest, or XORed into dest when xor != 0.
+ * val == 0 and val == 1 are handed to gf_multby_zero() / gf_multby_one().
+ *
+ * The locals pp, m1, m2, prod, t1, t2, v, one and ta are referenced by name
+ * inside BYTWO_P_ONESTEP and must not be renamed.
+ */
+static
+void 
+gf_w16_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  int bit, step;
+  uint8_t *sp, *dp, *dend;
+  uint32_t inv_rev;
+  __m128i pp, m1, m2, ta, prod, t1, t2, prev, one, v;
+  struct gf_w16_bytwo_data *btd;
+  gf_region_data rd;
+    
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  /* Bit i of val ends up, inverted, at bit (15 - i) of inv_rev. */
+  inv_rev = 0;
+  for (bit = 0; bit < 16; bit++) {
+    inv_rev = (inv_rev << 1) | ((val & (1 << bit)) ? 0 : 1);
+  }
+
+  sp = (uint8_t *) rd.s_start;
+  dp = (uint8_t *) rd.d_start;
+  dend = (uint8_t *) rd.d_top;
+
+  pp = _mm_set1_epi16(btd->prim_poly&0xffff);
+  m1 = _mm_set1_epi16((btd->mask1)&0xffff);
+  m2 = _mm_set1_epi16((btd->mask2)&0xffff);
+  one = _mm_set1_epi16(1);
+
+  for (; dp < dend; dp += 16, sp += 16) {
+    prod = _mm_setzero_si128();
+    v = _mm_set1_epi16(inv_rev);
+    ta = _mm_load_si128((__m128i *) sp);
+    prev = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) dp);
+    /* One step per bit of the 16-bit multiplier. */
+    for (step = 0; step < 16; step++) {
+      BYTWO_P_ONESTEP;
+    }
+    _mm_store_si128((__m128i *) dp, _mm_xor_si128(prod, prev));
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+#ifdef INTEL_SSE2
+/*
+ * SSE2 helper for multiplying a region by 2, overwriting dest.
+ *
+ * Walks the region from rd->s_start / rd->d_start up to rd->d_top in 16-byte
+ * steps, doubling each source vector with SSE_AB2 and storing it.  The caller
+ * has already set up rd and is responsible for the region ends.
+ */
+static
+void
+gf_w16_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w16_bytwo_data *btd)
+{
+  uint8_t *sp = (uint8_t *) rd->s_start;
+  uint8_t *dp = (uint8_t *) rd->d_start;
+  uint8_t *dend = (uint8_t *) rd->d_top;
+  __m128i poly, lo_clear, hi_bit, t1, t2, vsrc;
+
+  poly = _mm_set1_epi16(btd->prim_poly&0xffff);
+  lo_clear = _mm_set1_epi16((btd->mask1)&0xffff);
+  hi_bit = _mm_set1_epi16((btd->mask2)&0xffff);
+
+  for (; dp < dend; dp += 16, sp += 16) {
+    vsrc = _mm_load_si128 ((__m128i *)(sp));
+    SSE_AB2(poly, lo_clear, hi_bit, vsrc, t1, t2);
+    _mm_store_si128((__m128i *)dp, vsrc);
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+/*
+ * SSE2 helper for multiplying a region by 2 and XORing the result into dest.
+ *
+ * Walks the region from rd->s_start / rd->d_start up to rd->d_top in 16-byte
+ * steps, doubling each source vector with SSE_AB2 and XORing it into the
+ * matching destination vector.  The caller has already set up rd and is
+ * responsible for the region ends.
+ */
+static
+void
+gf_w16_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w16_bytwo_data *btd)
+{
+  uint8_t *sp = (uint8_t *) rd->s_start;
+  uint8_t *dp = (uint8_t *) rd->d_start;
+  uint8_t *dend = (uint8_t *) rd->d_top;
+  __m128i poly, lo_clear, hi_bit, t1, t2, vsrc, vdst;
+
+  poly = _mm_set1_epi16(btd->prim_poly&0xffff);
+  lo_clear = _mm_set1_epi16((btd->mask1)&0xffff);
+  hi_bit = _mm_set1_epi16((btd->mask2)&0xffff);
+
+  for (; dp < dend; dp += 16, sp += 16) {
+    vsrc = _mm_load_si128 ((__m128i *)(sp));
+    SSE_AB2(poly, lo_clear, hi_bit, vsrc, t1, t2);
+    vdst = _mm_load_si128 ((__m128i *)(dp));
+    vdst = _mm_xor_si128(vdst, vsrc);
+    _mm_store_si128((__m128i *)dp, vdst);
+  }
+}
+#endif
+
+
+#ifdef INTEL_SSE2
+/*
+ * BYTWO_b region multiply with SSE2, handling 16 bytes per step.
+ *
+ * val == 2 is dispatched to the dedicated multiply-by-two helpers.  For any
+ * other val, the bits of val are scanned from the least significant upward:
+ * the current multiple of the source vector is added to the accumulator when
+ * the bit is set, and the source vector is doubled with SSE_AB2 between bits.
+ * The accumulator starts from dest when xor != 0 and from zero otherwise.
+ * val == 0 and val == 1 are handed to gf_multby_zero() / gf_multby_one().
+ */
+static
+void 
+gf_w16_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  int mult;
+  uint8_t *dp, *sp, *dend;
+  __m128i poly, lo_clear, hi_bit, t1, t2, vsrc, vacc;
+  struct gf_w16_bytwo_data *btd;
+  gf_region_data rd;
+    
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  if (val == 2) {
+    if (!xor) {
+      gf_w16_bytwo_b_sse_region_2_noxor(&rd, btd);
+    } else {
+      gf_w16_bytwo_b_sse_region_2_xor(&rd, btd);
+    }
+    gf_do_final_region_alignment(&rd);
+    return;
+  }
+
+  sp = (uint8_t *) rd.s_start;
+  dp = (uint8_t *) rd.d_start;
+  dend = (uint8_t *) rd.d_top;
+
+  poly = _mm_set1_epi16(btd->prim_poly&0xffff);
+  lo_clear = _mm_set1_epi16((btd->mask1)&0xffff);
+  hi_bit = _mm_set1_epi16((btd->mask2)&0xffff);
+
+  for (; dp < dend; dp += 16, sp += 16) {
+    vsrc = _mm_load_si128 ((__m128i *)(sp));
+    vacc = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(dp));
+    mult = val;
+    for (;;) {
+      if (mult & 1) vacc = _mm_xor_si128(vacc, vsrc);
+      mult >>= 1;
+      if (mult == 0) break;
+      SSE_AB2(poly, lo_clear, hi_bit, vsrc, t1, t2);
+    }
+    _mm_store_si128((__m128i *)dp, vacc);
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+/*
+ * BYTWO_b region multiply without SIMD, handling 64 bits per step.
+ *
+ * val == 2, 3, 4 and 5 have dedicated loops with a fixed number of AB2
+ * doublings (one for 2 and 3, two for 4 and 5; 3 and 5 also add the original
+ * source value).  Any other val uses the general loop that scans the bits of
+ * val from the least significant upward.
+ * Products are stored in dest, or XORed into dest when xor != 0.
+ * val == 0 and val == 1 are handed to gf_multby_zero() / gf_multby_one().
+ */
+static
+void 
+gf_w16_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t *sp, *dp, *dend;
+  uint64_t t1, t2, x, orig, mult, acc;
+  struct gf_w16_bytwo_data *btd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+  sp = (uint64_t *) rd.s_start;
+  dp = (uint64_t *) rd.d_start;
+  dend = (uint64_t *) rd.d_top;
+
+  switch (val) {
+  case 2:
+    /* 2 * x */
+    for (; dp < dend; dp++, sp++) {
+      x = *sp;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, x, t1, t2);
+      if (xor) { *dp ^= x; } else { *dp = x; }
+    }
+    break; 
+  case 3:
+    /* 2 * x + x */
+    for (; dp < dend; dp++, sp++) {
+      orig = *sp;
+      x = orig;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, x, t1, t2);
+      x ^= orig;
+      if (xor) { *dp ^= x; } else { *dp = x; }
+    }
+    break; 
+  case 4:
+    /* 2 * (2 * x) */
+    for (; dp < dend; dp++, sp++) {
+      x = *sp;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, x, t1, t2);
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, x, t1, t2);
+      if (xor) { *dp ^= x; } else { *dp = x; }
+    }
+    break; 
+  case 5:
+    /* 2 * (2 * x) + x */
+    for (; dp < dend; dp++, sp++) {
+      orig = *sp;
+      x = orig;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, x, t1, t2);
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, x, t1, t2);
+      x ^= orig;
+      if (xor) { *dp ^= x; } else { *dp = x; }
+    }
+    break;
+  default:
+    /* General case: add the current multiple of x for every set bit. */
+    for (; dp < dend; dp++, sp++) {
+      acc = (xor) ? *dp : 0;
+      x = *sp;
+      mult = val;
+      for (;;) {
+        if (mult & 1) acc ^= x;
+        mult >>= 1;
+        if (mult == 0) break;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, x, t1, t2);
+      }
+      *dp = acc;
+    }
+    break;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+/*
+ * Initializes the BYTWO_p / BYTWO_b implementations for w=16.
+ *
+ * Fills the private gf_w16_bytwo_data with the low 16 bits of prim_poly and
+ * the masks 0xfffe and 0x8000, each repeated at every GF_FIELD_WIDTH-bit
+ * offset of a 64-bit value, then installs the multiply and multiply_region
+ * functions that match h->mult_type.
+ *
+ * The SSE2 region routine is chosen when it is compiled in, the CPU supports
+ * it and GF_REGION_NOSIMD is not set.  Otherwise the plain C routine is
+ * installed, and 0 is returned if the caller demanded GF_REGION_SIMD.
+ * Returns 1 on success.
+ */
+static
+int gf_w16_bytwo_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w16_bytwo_data *btd = (struct gf_w16_bytwo_data *) (h->private);
+  uint64_t poly, lo_clear, hi_bit;
+
+  btd->prim_poly = 0;
+  btd->mask1 = 0;
+  btd->mask2 = 0;
+
+  /* Replicate the 16-bit patterns until they shift out of the 64-bit word. */
+  for (poly = h->prim_poly & 0xffff, lo_clear = 0xfffe, hi_bit = 0x8000;
+       poly != 0;
+       poly <<= GF_FIELD_WIDTH, lo_clear <<= GF_FIELD_WIDTH, hi_bit <<= GF_FIELD_WIDTH) {
+    btd->prim_poly |= poly;
+    btd->mask1 |= lo_clear;
+    btd->mask2 |= hi_bit;
+  }
+
+  if (h->mult_type != GF_MULT_BYTWO_p) {
+    SET_FUNCTION(gf,multiply,w32,gf_w16_bytwo_b_multiply)
+#ifdef INTEL_SSE2
+    if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+      SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_sse_multiply_region)
+    } else {
+#endif
+      SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_nosse_multiply_region)
+      if (h->region_type & GF_REGION_SIMD) {
+        return 0;
+      }
+#ifdef INTEL_SSE2
+    }
+#endif
+  } else {
+    SET_FUNCTION(gf,multiply,w32,gf_w16_bytwo_p_multiply)
+#ifdef INTEL_SSE2
+    if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+      SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_sse_multiply_region)
+    } else {
+#endif
+      SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_nosse_multiply_region)
+      if (h->region_type & GF_REGION_SIMD) {
+        return 0;
+      }
+#ifdef INTEL_SSE2
+    }
+#endif
+  }
+
+  return 1;
+}
+
+/*
+ * Initializes the LOG_ZERO implementation for w=16.
+ *
+ * Zeroes the whole backing array _antilog_tbl and points antilog_tbl
+ * 2 * GF_FIELD_SIZE entries into it, then fills log_tbl and two consecutive
+ * copies of the antilog sequence by repeated doubling from 1.  log_tbl[0] is
+ * set to (-GF_MULT_GROUP_SIZE) + 1.
+ * NOTE(review): presumably this makes any log sum involving a zero operand
+ * index into the zero-filled part of the array, which is why the multiply
+ * routines skip the zero test -- confirm against the struct definition.
+ *
+ * Also fills inv_tbl and installs the inverse, divide, multiply and
+ * multiply_region functions.  Always returns 1.
+ */
+static
+int gf_w16_log_zero_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w16_zero_logtable_data *ltd = h->private;
+  int power, elem;
+
+  ltd->log_tbl[0] = (-GF_MULT_GROUP_SIZE) + 1;
+
+  bzero(&(ltd->_antilog_tbl[0]), sizeof(ltd->_antilog_tbl));
+
+  ltd->antilog_tbl = &(ltd->_antilog_tbl[GF_FIELD_SIZE * 2]);
+
+  /* elem runs through successive doublings of 1, reduced with prim_poly. */
+  for (power = 0, elem = 1; power < GF_MULT_GROUP_SIZE; power++) {
+    ltd->log_tbl[elem] = (uint16_t)power;
+    ltd->antilog_tbl[power] = (uint16_t)elem;
+    ltd->antilog_tbl[power+GF_MULT_GROUP_SIZE] = (uint16_t)elem;
+    elem <<= 1;
+    if (elem & GF_FIELD_SIZE) elem ^= h->prim_poly;
+  }
+
+  /* Zero has no inverse; the entry only needs some defined value. */
+  ltd->inv_tbl[0] = 0;
+  ltd->inv_tbl[1] = 1;
+  for (elem = 2; elem < GF_FIELD_SIZE; elem++) {
+    ltd->inv_tbl[elem] = ltd->antilog_tbl[GF_MULT_GROUP_SIZE-ltd->log_tbl[elem]];
+  }
+
+  SET_FUNCTION(gf,inverse,w32,gf_w16_log_zero_inverse)
+  SET_FUNCTION(gf,divide,w32,gf_w16_log_zero_divide)
+  SET_FUNCTION(gf,multiply,w32,gf_w16_log_zero_multiply)
+  SET_FUNCTION(gf,multiply_region,w32,gf_w16_log_zero_multiply_region)
+  return 1;
+}
+
+/*
+ * Composite-field multiplication that calls the base field's multiply
+ * function for every partial product.
+ *
+ * With a = a1:a0 and b = b1:b0 split into bytes, the result is
+ *   low byte  = a0*b0 ^ a1*b1
+ *   high byte = a1*b0 ^ a0*b1 ^ (a1*b1)*prim_poly
+ * where every product is taken in the base field.
+ */
+static
+gf_val_32_t
+gf_w16_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint8_t a_lo = a & 0x00ff;
+  uint8_t a_hi = (a & 0xff00) >> 8;
+  uint8_t b_lo = b & 0x00ff;
+  uint8_t b_hi = (b & 0xff00) >> 8;
+  uint8_t hi_prod;
+  uint32_t lo_part, hi_part;
+  uint16_t rv;
+
+  hi_prod = base_gf->multiply.w32(base_gf, a_hi, b_hi);
+
+  lo_part = base_gf->multiply.w32(base_gf, a_lo, b_lo) ^ hi_prod;
+
+  hi_part  = base_gf->multiply.w32(base_gf, a_hi, b_lo);
+  hi_part ^= base_gf->multiply.w32(base_gf, a_lo, b_hi);
+  hi_part ^= base_gf->multiply.w32(base_gf, hi_prod, h->prim_poly);
+
+  rv = (lo_part | (hi_part << 8));
+  return rv;
+}
+
+/* Multiply a and b in GF((2^8)^2) using the base field's multiplication
+ * table (cd->mult_table) through GF_W8_INLINE_MULTDIV instead of calls. */
+static gf_val_32_t
+gf_w16_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w16_composite_data *cd = (struct gf_w16_composite_data *) h->private;
+  uint8_t *mt = cd->mult_table;
+  uint8_t a0 = a & 0x00ff, a1 = (a & 0xff00) >> 8;
+  uint8_t b0 = b & 0x00ff, b1 = (b & 0xff00) >> 8;
+  uint8_t a1b1;
+  uint16_t lo, hi;
+
+  a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1);
+
+  /* Low byte: a0*b0 ^ a1*b1.  High byte: a1*b0 ^ a0*b1 ^ a1b1*prim_poly. */
+  lo = GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1;
+  hi = GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ GF_W8_INLINE_MULTDIV(mt, a0, b1) ^
+       GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly);
+  return (uint16_t) (lo | (hi << 8));
+}
+
+/*
+ * Composite field division trick (explained in 2007 tech report)
+ *
+ * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1
+ *
+ * let c = b^-1
+ *
+ * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0)
+ *
+ * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1
+ *
+ * let d = b1c1 and d+1 = b0c0
+ *
+ * solve s*b1c1+b1c0+b0c1 = 0
+ *
+ * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1
+ *
+ * c0 = (d+1)b0^-1
+ * c1 = d*b1^-1
+ *
+ * a / b = a * c
+ */
+
+/* Invert a in GF((2^8)^2) with irreducible polynomial x^2 + s*x + 1, where
+ * s = h->prim_poly, using the closed form from the comment above:
+ *   d = (a1/a0) * (a1/a0 + a0/a1 + s)^-1,  c0 = (d+1)/a0,  c1 = d/a1.
+ * Returns c0 | (c1 << 8).  a == 0 is not special-cased: it is passed to the
+ * base field's inverse unchanged. */
+static gf_val_32_t
+gf_w16_composite_inverse(gf_t *gf, gf_val_32_t a)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint8_t a0 = a & 0x00ff;
+  uint8_t a1 = (a & 0xff00) >> 8;
+  uint8_t c0, c1, d, ratio, tmp, a0inv, a1inv;
+
+  if (a0 == 0) {
+    /* a = a1*x: the inverse is a1^-1 * (x + s). */
+    a1inv = base_gf->inverse.w32(base_gf, a1);
+    c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
+    c1 = a1inv;
+  } else if (a1 == 0) {
+    /* a lies in the base field, so invert it there. */
+    c0 = base_gf->inverse.w32(base_gf, a0);
+    c1 = 0;
+  } else {
+    a1inv = base_gf->inverse.w32(base_gf, a1);
+    a0inv = base_gf->inverse.w32(base_gf, a0);
+
+    /* a1/a0 appears twice in the formula; compute it once and reuse it. */
+    ratio = base_gf->multiply.w32(base_gf, a1, a0inv);
+    tmp = (ratio ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly);
+    tmp = base_gf->inverse.w32(base_gf, tmp);
+    d = base_gf->multiply.w32(base_gf, ratio, tmp);
+
+    c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv);
+    c1 = base_gf->multiply.w32(base_gf, d, a1inv);
+  }
+  return (uint16_t) (c0 | (c1 << 8));
+}
+
+static
+void
+gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint8_t b0 = val & 0x00ff;            /* low byte of the multiplier */
+  uint8_t b1 = (val & 0xff00) >> 8;     /* high byte of the multiplier */
+  uint16_t *s16, *d16, *top;
+  uint8_t a0, a1, a1b1, *mt;
+  gf_region_data rd;
+  struct gf_w16_composite_data *cd;
+  /* dest = src*val, word by word, in the composite field (dest ^= src*val when xor is nonzero). */
+  cd = (struct gf_w16_composite_data *) h->private;
+  mt = cd->mult_table;
+  /* mt == NULL selects the loops that call base_gf->multiply; otherwise the table macro is used. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+
+  s16 = rd.s_start;
+  d16 = rd.d_start;
+  top = rd.d_top;
+
+  if (mt == NULL) {
+    if (xor) {
+      while (d16 < top) {
+        a0 = (*s16) & 0x00ff;
+        a1 = ((*s16) & 0xff00) >> 8;
+        a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+        /* low byte: a0*b0 ^ a1*b1; high byte: a1*b0 ^ a0*b1 ^ a1b1*prim_poly */
+        (*d16) ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
+                  ((base_gf->multiply.w32(base_gf, a1, b0) ^ 
+                    base_gf->multiply.w32(base_gf, a0, b1) ^ 
+                    base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8));
+        s16++;
+        d16++;
+      }
+    } else {
+      while (d16 < top) {
+        a0 = (*s16) & 0x00ff;
+        a1 = ((*s16) & 0xff00) >> 8;
+        a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+        /* same formula, stored instead of XOR-ed into dest */
+        (*d16) = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
+                  ((base_gf->multiply.w32(base_gf, a1, b0) ^ 
+                    base_gf->multiply.w32(base_gf, a0, b1) ^ 
+                    base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8));
+        s16++;
+        d16++;
+      }
+    }
+  } else {
+    if (xor) {
+      while (d16 < top) {
+        a0 = (*s16) & 0x00ff;
+        a1 = ((*s16) & 0xff00) >> 8;
+        a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1);
+        /* table-driven form of the XOR loop */
+        (*d16) ^= ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
+                  ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ 
+                    GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ 
+                    GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8));
+        s16++;
+        d16++;
+      }
+    } else {
+      while (d16 < top) {
+        a0 = (*s16) & 0x00ff;
+        a1 = ((*s16) & 0xff00) >> 8;
+        a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1);
+        /* table-driven form of the store loop */
+        (*d16) = ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
+                  ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ 
+                    GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ 
+                    GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8));
+        s16++;
+        d16++;
+      }
+    }
+  }
+}
+
+/* Composite region multiply for the split-halves layout: after alignment the
+ * first half of the region is combined with the second half through five
+ * base-field region multiplies, mirroring the scalar composite formula. */
+static void
+gf_w16_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint8_t val0 = val & 0x00ff, val1 = (val & 0xff00) >> 8;
+  uint8_t *src_lo, *src_hi, *dst_lo, *dst_hi, *dst_end;
+  int half;
+  gf_region_data rd;
+
+  /* The two halves must be aligned with respect to each other on 16-byte
+     boundaries, so the working area is kept to a multiple of 32 bytes.  That
+     changes the word mapping, which is why extract_word exists. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+
+  src_lo = (uint8_t *) rd.s_start;
+  dst_lo = (uint8_t *) rd.d_start;
+  dst_end = (uint8_t *) rd.d_top;
+  half = (dst_end - dst_lo)/2;
+  src_hi = src_lo + half;
+  dst_hi = dst_lo + half;
+
+  /* dst_lo = src_lo*val0 ^ src_hi*val1 (the first call honours the caller's xor). */
+  base_gf->multiply_region.w32(base_gf, src_lo, dst_lo, val0, half, xor);
+  base_gf->multiply_region.w32(base_gf, src_hi, dst_lo, val1, half, 1);
+  /* dst_hi = src_lo*val1 ^ src_hi*val0 ^ src_hi*(prim_poly*val1). */
+  base_gf->multiply_region.w32(base_gf, src_lo, dst_hi, val1, half, xor);
+  base_gf->multiply_region.w32(base_gf, src_hi, dst_hi, val0, half, 1);
+  base_gf->multiply_region.w32(base_gf, src_hi, dst_hi, base_gf->multiply.w32(base_gf, h->prim_poly, val1), half, 1);
+
+  gf_do_final_region_alignment(&rd);
+}
+
+/* Set up the composite implementation: cache the base field's multiplication
+ * table (which may be NULL) and install the matching kernels.
+ * Returns 0 when no base field is configured, 1 otherwise. */
+static int gf_w16_composite_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w16_composite_data *cd = (struct gf_w16_composite_data *) h->private;
+
+  if (h->base_gf == NULL) return 0;
+  cd->mult_table = gf_w8_get_mult_table(h->base_gf);
+  if (cd->mult_table != NULL) {
+    SET_FUNCTION(gf,multiply,w32,gf_w16_composite_multiply_inline)
+  } else {
+    SET_FUNCTION(gf,multiply,w32,gf_w16_composite_multiply_recursive)
+  }
+
+  if (!(h->region_type & GF_REGION_ALTMAP)) {
+    SET_FUNCTION(gf,multiply_region,w32,gf_w16_composite_multiply_region)
+  } else {
+    SET_FUNCTION(gf,multiply_region,w32,gf_w16_composite_multiply_region_alt)
+  }
+
+  /* Division is left unset here; only the inverse is provided. */
+  SET_FUNCTION(gf,divide,w32,NULL)
+  SET_FUNCTION(gf,inverse,w32,gf_w16_composite_inverse)
+  return 1;
+}
+
+/* Fill the 16-entry table shift[] for multiplier val: each even entry is
+ * twice entry i/2 (reduced by h->prim_poly), each odd entry is that ^ val. */
+static void gf_w16_group_4_set_shift_tables(uint16_t *shift, uint16_t val, gf_internal_t *h)
+{
+  int idx, doubled;
+
+  shift[0] = 0;
+  for (idx = 0; idx < 16; idx += 2) {
+    doubled = (shift[idx>>1] << 1);
+    if (doubled & (1 << 16)) doubled ^= h->prim_poly;
+    shift[idx] = doubled;
+    shift[idx+1] = doubled^val;
+  }
+}
+
+/* Multiply a by b four bits at a time.  The shift table is rebuilt for b on
+ * every call; a's nibbles are then consumed from most to least significant,
+ * with the bits shifted out of the running product folded back via reduce[]. */
+static
+inline
+gf_val_32_t
+gf_w16_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w16_group_4_4_data *d44;
+  uint16_t prod;   /* running product */
+  uint16_t rest;   /* nibbles of a not yet consumed, left-justified */
+  uint16_t nib;    /* nibble currently being applied */
+  int step;
+
+  d44 = (struct gf_w16_group_4_4_data *) h->private;
+  gf_w16_group_4_set_shift_tables(d44->shift, b, h);
+
+  /* The top nibble seeds the product. */
+  rest = a;
+  nib = rest >> 12;
+  rest <<= 4;
+  prod = d44->shift[nib];
+
+  /* Each of the three lower nibbles: shift the product left by 4, reduce
+     the four bits that fell off the top, and add shift[nib]. */
+  for (step = 0; step < 3; step++) {
+    nib = rest >> 12;
+    rest <<= 4;
+    prod = (d44->shift[nib] ^ d44->reduce[prod >> 12] ^ ((prod & 0xfff) << 4));
+  }
+  return prod;
+}
+
+/* Region version of the 4-bit group multiply over 16-bit words:
+ * dest[i] = src[i]*val, or dest[i] ^= src[i]*val when xor is nonzero.
+ * val == 0 and val == 1 are delegated to gf_multby_zero/gf_multby_one. */
+static
+void gf_w16_group_4_4_region_multiply(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w16_group_4_4_data *d44;
+  uint16_t *in, *out, *end;
+  uint16_t prod, rest, nib, prev;
+  gf_region_data rd;
+  int step;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  d44 = (struct gf_w16_group_4_4_data *) h->private;
+  /* Rebuild shift[] for this multiplier before touching any word. */
+  gf_w16_group_4_set_shift_tables(d44->shift, val, h);
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
+  gf_do_initial_region_alignment(&rd);
+
+  in = (uint16_t *) rd.s_start;
+  out = (uint16_t *) rd.d_start;
+  end = (uint16_t *) rd.d_top;
+
+  for (; out < end; out++, in++) {
+    rest = *in;
+    prev = (xor) ? *out : 0;
+
+    /* The top nibble seeds the product. */
+    nib = rest >> 12;
+    rest <<= 4;
+    prod = d44->shift[nib];
+
+    /* Fold in the three lower nibbles, reducing the overflow each time. */
+    for (step = 0; step < 3; step++) {
+      nib = rest >> 12;
+      rest <<= 4;
+      prod = (d44->shift[nib] ^ d44->reduce[prod >> 12] ^ ((prod & 0xfff) << 4));
+    }
+
+    prod ^= prev;
+    *out = prod;
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+
+/* Build reduce[] and install the 4-bit group kernels.  For each 4-bit mask,
+ * the XOR of (prim_poly << j) over its set bits j is stored (low 16 bits) at
+ * the index given by its bits 16 and up.  divide/inverse stay NULL.  Returns 1. */
+static int gf_w16_group_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w16_group_4_4_data *d44 = (struct gf_w16_group_4_4_data *) h->private;
+  int mask, bit, acc;
+
+  d44->reduce[0] = 0;
+  for (mask = 0; mask < 16; mask++) {
+    acc = 0;
+    for (bit = 0; bit < 4; bit++) {
+      if ((mask >> bit) & 1) acc ^= (h->prim_poly << bit);
+    }
+    d44->reduce[acc>>16] = (acc&0xffff);
+  }
+
+  SET_FUNCTION(gf,multiply,w32,gf_w16_group_4_4_multiply)
+  SET_FUNCTION(gf,multiply_region,w32,gf_w16_group_4_4_region_multiply)
+  SET_FUNCTION(gf,divide,w32,NULL)
+  SET_FUNCTION(gf,inverse,w32,NULL)
+  return 1;
+}
+
+/* Return the scratch size in bytes needed for the given multiplication
+ * type, or 0 when the type (or its arg1/arg2 split) is not handled here.
+ * region_type and divide_type are accepted but never consulted. */
+int gf_w16_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
+{
+  int split_8_16, split_4_16;
+
+  switch (mult_type) {
+    case GF_MULT_TABLE:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w16_lazytable_data) + 64;
+
+    case GF_MULT_BYTWO_p:
+    case GF_MULT_BYTWO_b:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w16_bytwo_data);
+
+    case GF_MULT_LOG_ZERO:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w16_zero_logtable_data) + 64;
+    case GF_MULT_LOG_TABLE:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64;
+    case GF_MULT_GROUP:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w16_group_4_4_data) + 64;
+    case GF_MULT_COMPOSITE:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w16_composite_data) + 64;
+
+    case GF_MULT_CARRY_FREE:
+    case GF_MULT_SHIFT:
+      return sizeof(gf_internal_t);
+
+    case GF_MULT_DEFAULT:
+    case GF_MULT_SPLIT_TABLE:
+      /* 8,8 uses the split_8_8 layout; 8/16, 4/16 and the default type use
+         the log-table layout; any other explicit split is rejected. */
+      split_8_16 = (arg1 == 8 && arg2 == 16) || (arg2 == 8 && arg1 == 16);
+      split_4_16 = (arg1 == 4 && arg2 == 16) || (arg2 == 4 && arg1 == 16);
+      if (arg1 == 8 && arg2 == 8) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_w16_split_8_8_data) + 64;
+      }
+      if (split_8_16 || mult_type == GF_MULT_DEFAULT || split_4_16) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64;
+      }
+      return 0;
+
+    default:
+      return 0;
+  }
+  return 0;
+}
+
+int gf_w16_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  /* Set up gf for w=16; returns 1 on success, 0 if the chosen method cannot be initialised. */
+  h = (gf_internal_t *) gf->scratch;
+
+  /* Allen: set default primitive polynomial / irreducible polynomial if needed */
+
+  if (h->prim_poly == 0) {
+    if (h->mult_type == GF_MULT_COMPOSITE) {
+      h->prim_poly = gf_composite_get_default_poly(h->base_gf);
+      if (h->prim_poly == 0) return 0;
+    } else { 
+
+     /* Allen: 0x1002d is an alternative primitive polynomial that makes
+               carryless multiply work more efficiently for GF(2^16):
+
+        h->prim_poly = 0x1002d;
+
+        The following is the traditional primitive polynomial for GF(2^16) */
+
+      h->prim_poly = 0x1100b;
+    } 
+  }
+  /* Non-composite fields keep bit 16 of the polynomial set explicitly. */
+  if (h->mult_type != GF_MULT_COMPOSITE) h->prim_poly |= (1 << 16);
+  /* Start with every kernel unset so the fallbacks below can detect what is missing. */
+  SET_FUNCTION(gf,multiply,w32,NULL)
+  SET_FUNCTION(gf,divide,w32,NULL)
+  SET_FUNCTION(gf,inverse,w32,NULL)
+  SET_FUNCTION(gf,multiply_region,w32,NULL)
+
+  switch(h->mult_type) {
+    case GF_MULT_LOG_ZERO:    if (gf_w16_log_zero_init(gf) == 0) return 0; break;
+    case GF_MULT_LOG_TABLE:   if (gf_w16_log_init(gf) == 0) return 0; break;
+    case GF_MULT_DEFAULT: 
+    case GF_MULT_SPLIT_TABLE: if (gf_w16_split_init(gf) == 0) return 0; break;
+    case GF_MULT_TABLE:       if (gf_w16_table_init(gf) == 0) return 0; break;
+    case GF_MULT_CARRY_FREE:  if (gf_w16_cfm_init(gf) == 0) return 0; break;
+    case GF_MULT_SHIFT:       if (gf_w16_shift_init(gf) == 0) return 0; break;
+    case GF_MULT_COMPOSITE:   if (gf_w16_composite_init(gf) == 0) return 0; break;
+    case GF_MULT_BYTWO_p: 
+    case GF_MULT_BYTWO_b:     if (gf_w16_bytwo_init(gf) == 0) return 0; break;
+    case GF_MULT_GROUP:       if (gf_w16_group_init(gf) == 0) return 0; break;
+    default: return 0;
+  }
+  if (h->divide_type == GF_DIVIDE_EUCLID) {   /* an explicit divide_type replaces the method's own divide/inverse */
+    SET_FUNCTION(gf,divide,w32,gf_w16_divide_from_inverse)
+    SET_FUNCTION(gf,inverse,w32,gf_w16_euclid)
+  } else if (h->divide_type == GF_DIVIDE_MATRIX) {
+    SET_FUNCTION(gf,divide,w32,gf_w16_divide_from_inverse)
+    SET_FUNCTION(gf,inverse,w32,gf_w16_matrix)
+  }
+  /* Fill the remaining gaps: divide from inverse, inverse from Euclid or from divide. */
+  if (gf->divide.w32 == NULL) {
+    SET_FUNCTION(gf,divide,w32,gf_w16_divide_from_inverse)
+    if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w16_euclid)
+  }
+
+  if (gf->inverse.w32 == NULL)  SET_FUNCTION(gf,inverse,w32,gf_w16_inverse_from_divide)
+  /* The region layout decides which extract_word is installed. */
+  if (h->region_type & GF_REGION_ALTMAP) {
+    if (h->mult_type == GF_MULT_COMPOSITE) {
+      SET_FUNCTION(gf,extract_word,w32,gf_w16_composite_extract_word)
+    } else {
+      SET_FUNCTION(gf,extract_word,w32,gf_w16_split_extract_word)
+    }
+  } else if (h->region_type == GF_REGION_CAUCHY) {   /* NOTE(review): tested with ==, unlike the & used for ALTMAP -- confirm intended */
+    SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
+    SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
+  } else {
+    SET_FUNCTION(gf,extract_word,w32,gf_w16_extract_word)
+  }
+  if (gf->multiply_region.w32 == NULL) {
+    SET_FUNCTION(gf,multiply_region,w32,gf_w16_multiply_region_from_single)
+  }
+  return 1;
+}
+
+/* Inline setup functions */
+
+/* Inline-setup accessor: return the log table, or NULL unless gf's multiply
+ * kernel is gf_w16_log_multiply. */
+uint16_t *gf_w16_get_log_table(gf_t *gf)
+{
+  gf_internal_t *h;
+
+  if (gf->multiply.w32 != gf_w16_log_multiply) return NULL;
+  h = (gf_internal_t *) gf->scratch;
+  return (uint16_t *) ((struct gf_w16_logtable_data *) h->private)->log_tbl;
+}
+
+/* Inline-setup accessor: return the antilog_tbl pointer, or NULL unless gf's
+ * multiply kernel is gf_w16_log_multiply. */
+uint16_t *gf_w16_get_mult_alog_table(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w16_logtable_data *tables;
+
+  if (gf->multiply.w32 != gf_w16_log_multiply) return NULL;
+
+  tables = (struct gf_w16_logtable_data *) h->private;
+  return (uint16_t *) tables->antilog_tbl;
+}
+
+/* Inline-setup accessor: return the d_antilog pointer, or NULL unless gf's
+ * multiply kernel is gf_w16_log_multiply. */
+uint16_t *gf_w16_get_div_alog_table(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w16_logtable_data *tables;
+
+  if (gf->multiply.w32 != gf_w16_log_multiply) return NULL;
+
+  tables = (struct gf_w16_logtable_data *) h->private;
+  return (uint16_t *) tables->d_antilog;
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w32.c b/src/erasure-code/jerasure/gf-complete/src/gf_w32.c
new file mode 100644
index 000000000..976b68b2e
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_w32.c
@@ -0,0 +1,2810 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w32.c
+ *
+ * Routines for 32-bit Galois fields
+ */
+
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w32.h"
+#include "gf_cpu.h"
+/* Debug helper: print label s, then the 16 bytes of r as four 32-bit hex words, highest bytes first. */
+#define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); }
+/* Debug helper: like MM_PRINT32, but every byte is printed separately (highest first, in groups of four). */
+#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "   " : " ", blah[15-ii]); printf("\n"); }
+/* AB2: t1 = (b << 1) & am1; t2 is turned into a mask from b & am2; b = t1 ^ (t2 & ip).  t1 and t2 are scratch. */
+#define AB2(ip, am1 ,am2, b, t1, t2) {\
+  t1 = (b << 1) & am1;\
+  t2 = b & am2; \
+  t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \
+  b = (t1 ^ (t2 & ip));}
+/* SSE_AB2: the same steps as AB2 on a 128-bit register, using 64-bit lane shifts and subtraction. */
+#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\
+          t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
+          t2 = _mm_and_si128(va, m2); \
+          t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
+          va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
+
+/* Inverse of a obtained from the installed divide kernel, as 1 / a. */
+static inline uint32_t gf_w32_inverse_from_divide (gf_t *gf, uint32_t a)
+{
+  const uint32_t one = 1;
+  return gf->divide.w32(gf, one, a);
+}
+
+/* a / b computed as a * b^-1 with the installed inverse and multiply. */
+static inline
+uint32_t gf_w32_divide_from_inverse (gf_t *gf, uint32_t a, uint32_t b)
+{
+  uint32_t b_inv = gf->inverse.w32(gf, b);
+  return gf->multiply.w32(gf, a, b_inv);
+}
+
+/* Generic region multiply: apply gf->multiply.w32(val, .) to each 32-bit
+ * word of src, storing into dest or XOR-ing into it when xor is nonzero.
+ * Only bytes/sizeof(uint32_t) whole words are processed. */
+static
+void
+gf_w32_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  uint32_t *in = (uint32_t *) src;
+  uint32_t *out = (uint32_t *) dest;
+  uint32_t k;
+
+  if (!xor) {
+    for (k = 0; k < bytes/sizeof(uint32_t); k++) {
+      out[k] = gf->multiply.w32(gf, val, in[k]);
+    }
+    return;
+  }
+
+  for (k = 0; k < bytes/sizeof(uint32_t); k++) {
+    out[k] ^= gf->multiply.w32(gf, val, in[k]);
+  }
+}
+
+#if defined(INTEL_SSE4_PCLMUL)
+/* Carry-less region multiply: dest[i] = val*src[i] (XOR-ed in when xor != 0), two reduction passes per word. */
+static 
+void
+gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  /* val == 0 and val == 1 are handed to gf_multby_zero / gf_multby_one below. */
+  uint32_t i;
+  uint32_t *s32;
+  uint32_t *d32;
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+  /* Low 64 bits of prim_poly: bit 32 set, plus the low 32 bits of h->prim_poly. */
+  prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+  s32 = (uint32_t *) src;
+  d32 = (uint32_t *) dest; 
+  /* Per word: 64-bit carry-less product, then the bits above bit 31 are folded back in twice. */
+  if (xor) {
+    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+      b = _mm_insert_epi32 (a, s32[i], 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+    } 
+  } else {
+    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+      b = _mm_insert_epi32 (a, s32[i], 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+    } 
+  }
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL) 
+/* Carry-less region multiply: dest[i] = val*src[i] (XOR-ed in when xor != 0), three reduction passes per word. */
+static 
+void
+gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  /* val == 0 and val == 1 are handed to gf_multby_zero / gf_multby_one below. */
+  uint32_t i;
+  uint32_t *s32;
+  uint32_t *d32;
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+  /* Low 64 bits of prim_poly: bit 32 set, plus the low 32 bits of h->prim_poly. */
+  prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+
+  s32 = (uint32_t *) src;
+  d32 = (uint32_t *) dest; 
+  /* Per word: 64-bit carry-less product, then the bits above bit 31 are folded back in three times. */
+  if (xor) {
+    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+      b = _mm_insert_epi32 (a, s32[i], 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+    } 
+  } else {
+    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+      b = _mm_insert_epi32 (a, s32[i], 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+    } 
+  }
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
+static 
+void
+gf_w32_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  uint32_t i;
+  uint32_t *s32;
+  uint32_t *d32;
+  /* Carry-less region multiply: dest[i] = val*src[i] (XOR-ed in when xor != 0), four reduction passes per word. */
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+  /* Low 64 bits of prim_poly: bit 32 set, plus the low 32 bits of h->prim_poly. */
+  prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+
+  s32 = (uint32_t *) src;
+  d32 = (uint32_t *) dest; 
+  /* Per word: 64-bit carry-less product, then the bits above bit 31 are folded back in four times. */
+  if (xor) {
+    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+      b = _mm_insert_epi32 (a, s32[i], 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+    } 
+  } else {
+    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+      b = _mm_insert_epi32 (a, s32[i], 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+    } 
+  }
+}
+#endif
+
+/* Inverse of b in GF(2^32) by the extended Euclidean algorithm on
+ * polynomials over GF(2).  Returns (uint32_t) -1 for b == 0.  All single-bit
+ * masks are built from an unsigned 1 so that shifting by 31 is well defined. */
+static inline
+uint32_t gf_w32_euclid (gf_t *gf, uint32_t b)
+{
+  uint32_t e_i, e_im1, e_ip1;
+  uint32_t d_i, d_im1, d_ip1;
+  uint32_t y_i, y_im1, y_ip1;
+  uint32_t c_i;
+
+  if (b == 0) return -1;
+  e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;  /* only the low 32 bits fit; d_im1 = 32 supplies the degree */
+  e_i = b;
+  d_im1 = 32;
+  for (d_i = d_im1-1; (((uint32_t) 1 << d_i) & e_i) == 0; d_i--) ;
+  y_i = 1;
+  y_im1 = 0;
+
+  while (e_i != 1) {
+    e_ip1 = e_im1;
+    d_ip1 = d_im1;
+    c_i = 0;
+    /* Long division of e_im1 by e_i: c_i collects the quotient, e_ip1 the remainder. */
+    while (d_ip1 >= d_i) {
+      c_i ^= ((uint32_t) 1 << (d_ip1 - d_i));
+      e_ip1 ^= (e_i << (d_ip1 - d_i));
+      d_ip1--;
+      if (e_ip1 == 0) return 0;
+      while ((e_ip1 & ((uint32_t) 1 << d_ip1)) == 0) d_ip1--;
+    }
+
+    y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);
+    y_im1 = y_i;
+    y_i = y_ip1;
+
+    e_im1 = e_i;
+    d_im1 = d_i;
+    e_i = e_ip1;
+    d_i = d_ip1;
+  }
+  return y_i;
+}
+
+/* Return the 32-bit word at position index of the region at start; gf and
+ * bytes are not used. */
+static
+gf_val_32_t gf_w32_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  const uint32_t *words = (uint32_t *) start;
+
+  return words[index];
+}
+
+/* Extract word `index` from a composite region.  Words outside the section
+ * delimited by gf_set_region_data (alignment 32) are read directly; inside
+ * it, the base field supplies the low 16 bits from the first half and the
+ * high 16 bits from the second half. */
+static
+gf_val_32_t gf_w32_composite_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  uint32_t *words = (uint32_t *) start;
+  uint32_t lo, hi;
+  uint8_t *half0;
+  int half_bytes;
+  gf_region_data rd;
+
+  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
+  if (words + index < (uint32_t *) rd.d_start || words + index >= (uint32_t *) rd.d_top) return words[index];
+  index -= (((uint32_t *) rd.d_start) - words);
+  half0 = (uint8_t *) rd.d_start;
+  half_bytes = (((uint8_t *) rd.d_top) - half0)/2;
+  lo = h->base_gf->extract_word.w32(h->base_gf, half0, half_bytes, index);
+  hi = h->base_gf->extract_word.w32(h->base_gf, half0+half_bytes, half_bytes, index);
+  return (lo | (hi << 16));
+}
+
+/* Extract word `index` from a region laid out in 64-byte groups of sixteen
+ * words, each group storing the words' bytes in four 16-byte planes.
+ * Words outside the 64-byte-aligned middle section are read directly. */
+static
+gf_val_32_t gf_w32_split_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  uint32_t *words = (uint32_t *) start;
+  uint32_t value = 0;
+  uint8_t *cursor;
+  gf_region_data rd;
+  int plane;
+
+  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64);
+  if (words + index < (uint32_t *) rd.d_start) return words[index];
+  if (words + index >= (uint32_t *) rd.d_top) return words[index];
+
+  index -= (((uint32_t *) rd.d_start) - words);
+  /* Start in the group's last plane (offset 48), which holds the top byte. */
+  cursor = (uint8_t *) rd.d_start + ((index & 0xfffffff0)*4) + (index & 0xf) + 48;
+  for (plane = 0; plane < 4; plane++) {
+    value = (value << 8) | *cursor;
+    cursor -= 16;
+  }
+  return value;
+}
+
+
+/* Invert b via gf_bitmatrix_inverse with w = 32 and the field's prim_poly. */
+static inline uint32_t gf_w32_matrix (gf_t *gf, uint32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) (gf->scratch);
+  return gf_bitmatrix_inverse(b, 32, h->prim_poly);
+}
+
+/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm.  I only
+   include it for completeness.  It does have the feature that it requires no
+   extra memory.  (This describes gf_w32_shift_multiply, defined further down.)
+*/
+
+#if defined(INTEL_SSE4_PCLMUL)
+/* Carry-less multiply of a32 and b32, reduced with two precomputed 64-bit constants read from h->private. */
+static
+inline
+gf_val_32_t
+gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
+{
+  gf_val_32_t rv = 0;
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         w;
+  __m128i         g, q;
+  gf_internal_t * h = gf->scratch;
+  uint64_t        g_star, q_plus;
+  /* h->private is read as two consecutive uint64_t values: q_plus first, then g_star. */
+  q_plus = *(uint64_t *) h->private;
+  g_star = *((uint64_t *) h->private + 1);
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
+  b = _mm_insert_epi32 (a, b32, 0);
+  g = _mm_insert_epi64 (a, g_star, 0);
+  q = _mm_insert_epi64 (a, q_plus, 0);
+  /* Product first; then w = clmul(g, clmul(q, product >> 32) >> 32) is XOR-ed back in. */
+  result = _mm_clmulepi64_si128 (a, b, 0);
+  w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0);
+  w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0);
+  result = _mm_xor_si128 (result, w);
+
+  /* Extracts 32 bit value from result. */
+  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+  return rv;
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
+/* Region form of gf_w32_cfmgk_multiply: dest[i] = val*src[i], XOR-ed into dest when xor is nonzero. */
+static 
+void
+gf_w32_cfmgk_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+
+  uint32_t i;
+  uint32_t *s32;
+  uint32_t *d32;
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         w;
+  __m128i         g, q;
+  gf_internal_t * h = gf->scratch;
+  uint64_t        g_star, q_plus;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+  /* h->private is read as two consecutive uint64_t values: q_plus first, then g_star. */
+  q_plus = *(uint64_t *) h->private;
+  g_star = *((uint64_t *) h->private + 1);
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+  g = _mm_insert_epi64 (a, g_star, 0);
+  q = _mm_insert_epi64 (a, q_plus, 0);
+  s32 = (uint32_t *) src;
+  d32 = (uint32_t *) dest; 
+  /* Per word: product, then w = clmul(g, clmul(q, product >> 32) >> 32) is XOR-ed back in. */
+  if (xor) {
+    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+      b = _mm_insert_epi32 (a, s32[i], 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0);
+      w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
+    } 
+  } else {
+    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
+      b = _mm_insert_epi32 (a, s32[i], 0);
+      result = _mm_clmulepi64_si128 (a, b, 0);
+      w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0);
+      w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0);
+      result = _mm_xor_si128 (result, w);
+      d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+    } 
+  }
+}
+#endif
+
+
+#if defined(INTEL_SSE4_PCLMUL)
+/* Multiply a32 by b32 with one carry-less multiply followed by two prim_poly reduction passes. */
+static
+inline
+gf_val_32_t
+gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
+{
+  gf_val_32_t rv = 0;
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+
+  /* a holds a32 in lane 0; b is the same register with lane 0 replaced by b32. */
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
+  b = _mm_insert_epi32 (a, b32, 0);
+  /* Low 64 bits of prim_poly: bit 32 set, plus the low 32 bits of h->prim_poly. */
+  prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+  /* Do the initial multiply */
+
+  result = _mm_clmulepi64_si128 (a, b, 0);
+
+  /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only
+     have to do the reduction at most twice, because (w-2)/z == 2. Where
+     z is equal to the number of zeros after the leading 1 
+
+   _mm_clmulepi64_si128 is the carryless multiply operation. Here
+   _mm_srli_si128 shifts the result to the right by 4 bytes. This allows
+   us to multiply the prim_poly by the leading bits of the result. We
+   then xor the result of that operation back with the result.*/
+
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+
+  /* Extracts 32 bit value from result. */
+  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+  return rv;
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
+/* Multiply a32 by b32 with one carry-less multiply followed by three prim_poly reduction passes. */
+static
+inline
+gf_val_32_t
+gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
+{
+  gf_val_32_t rv = 0;
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+
+  /* a holds a32 in lane 0; b is the same register with lane 0 replaced by b32. */
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
+  b = _mm_insert_epi32 (a, b32, 0);
+  /* Low 64 bits of prim_poly: bit 32 set, plus the low 32 bits of h->prim_poly. */
+  prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+  /* Do the initial multiply */
+
+  result = _mm_clmulepi64_si128 (a, b, 0);
+  /* Three passes: multiply prim_poly by the bits above bit 31 and XOR that back in. */
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+
+  /* Extracts 32 bit value from result. */
+
+  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+  return rv;
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
+/* Multiply a32 by b32 with one carry-less multiply followed by four prim_poly reduction passes. */
+static
+inline
+gf_val_32_t
+gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
+{
+  gf_val_32_t rv = 0;
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+
+  /* a holds a32 in lane 0; b is the same register with lane 0 replaced by b32. */
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
+  b = _mm_insert_epi32 (a, b32, 0);
+  /* Low 64 bits of prim_poly: bit 32 set, plus the low 32 bits of h->prim_poly. */
+  prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+  /* Do the initial multiply */
+
+  result = _mm_clmulepi64_si128 (a, b, 0);
+  /* Four passes: multiply prim_poly by the bits above bit 31 and XOR that back in. */
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+
+  /* Extracts 32 bit value from result. */
+
+  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+  return rv;
+}
+#endif
+
+
+/* Bit-serial multiply: build the carry-less product of a32 and b32 in a
+   64-bit accumulator, then cancel every bit at position GF_FIELD_WIDTH or
+   above by xoring in a shifted copy of the polynomial (h->prim_poly with
+   bit 32 forced on).  The accumulator is truncated to 32 bits on return. */
+static
+inline
+uint32_t
+gf_w32_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32)
+{
+  const uint64_t bit0 = 1;
+  gf_internal_t *field = (gf_internal_t *) gf->scratch;
+  uint64_t lhs = a32;
+  uint64_t rhs = b32;
+  uint64_t modulus = field->prim_poly | (bit0 << 32);
+  uint64_t acc = 0;
+  uint64_t pos;
+
+  /* Shift-and-add: one shifted copy of rhs per set bit of lhs. */
+  for (pos = 0; pos < GF_FIELD_WIDTH; pos++) {
+    if ((lhs >> pos) & bit0) acc ^= (rhs << pos);
+  }
+
+  /* Reduce from the highest possible product bit downwards. */
+  pos = (GF_FIELD_WIDTH*2-2);
+  while (pos >= GF_FIELD_WIDTH) {
+    if ((acc >> pos) & bit0) acc ^= (modulus << (pos-GF_FIELD_WIDTH));
+    pos--;
+  }
+  return acc;
+}
+
+/* Install the function pointers for the "cfmgk" carry-less multiply.
+   The euclid inverse and the generic multiply_region are always set.
+   When built with INTEL_SSE4_PCLMUL and the CPU reports pclmul support,
+   the cfmgk multiply/multiply_region are installed and two 64-bit
+   constants are written to h->private; the function then returns 1.
+   In every other case it returns 0. */
+  static 
+int gf_w32_cfmgk_init(gf_t *gf)
+{
+  SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
+  SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single)
+  
+#if defined(INTEL_SSE4_PCLMUL)
+  if (gf_cpu_supports_intel_pclmul) {
+    gf_internal_t *field = (gf_internal_t *) gf->scratch;
+    /* Slot 0 and slot 1 of the private area (q_plus and g_star in the
+       original naming; presumably consumed by gf_w32_cfmgk_multiply). */
+    uint64_t *quotient = (uint64_t *) field->private;
+    uint64_t *low_poly = quotient + 1;
+    uint64_t rem = field->prim_poly << 32;
+    int bit;
+
+    SET_FUNCTION(gf,multiply,w32,gf_w32_cfmgk_multiply)
+    SET_FUNCTION(gf,multiply_region,w32,gf_w32_cfmgk_multiply_region_from_single)
+
+    /* Bitwise long division over bits 63..32 of the remainder: every set
+       bit contributes a quotient bit and cancels a shifted polynomial. */
+    *quotient = 1ULL << 32;
+    for (bit = 63; bit >= 32; bit--) {
+      if (((rem >> bit) & 1ULL) == 0) continue;
+      *quotient |= 1ULL << (bit-32);
+      rem ^= field->prim_poly << (bit-32);
+    }
+
+    /* Low 32 bits of the polynomial. */
+    *low_poly = field->prim_poly & 0xffffffffULL;
+
+    return 1;
+  }
+#endif
+
+  return 0;
+}
+
+/* Install the function pointers for the carry-less-multiply method.
+   The euclid inverse and the generic multiply_region are always set.
+   With INTEL_SSE4_PCLMUL compiled in and pclmul reported by the CPU, the
+   polynomial's high bits decide how many reduction steps are needed and
+   therefore which of the _2/_3/_4 variants is installed (return 1).
+   Returns 0 when no variant fits the polynomial or pclmul is unavailable. */
+  static 
+int gf_w32_cfm_init(gf_t *gf)
+{
+  SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
+  SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single)
+  
+#if defined(INTEL_SSE4_PCLMUL)
+  if (gf_cpu_supports_intel_pclmul) {
+    gf_internal_t *field = (gf_internal_t *) gf->scratch;
+
+    /* No bits in 0xfffe0000: two reduction steps suffice. */
+    if ((field->prim_poly & 0xfffe0000) == 0) {
+      SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
+      SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_2)
+      return 1;
+    }
+    /* No bits in 0xffc00000: three reduction steps. */
+    if ((field->prim_poly & 0xffc00000) == 0) {
+      SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3)
+      SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_3)
+      return 1;
+    }
+    /* No bits in 0xfe000000: four reduction steps. */
+    if ((field->prim_poly & 0xfe000000) == 0) {
+      SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
+      SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_4)
+      return 1;
+    }
+    /* Polynomial not usable with any of the pclmul variants. */
+    return 0;
+  }
+#endif
+
+  return 0;
+}
+
+/* Install the bit-serial shift multiply together with the euclid inverse
+   and the generic multiply_region.  Always returns 1. */
+  static 
+int gf_w32_shift_init(gf_t *gf)
+{
+  SET_FUNCTION(gf,multiply,w32,gf_w32_shift_multiply)
+  SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single)
+  SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
+  return 1;
+}
+
+/* Fill shift[0 .. (1 << h->arg1) - 1].
+   shift[0] is 0; for every power of two `bit` below the table size,
+   shift[bit|k] = shift[k] ^ val for all k < bit, after which val is
+   doubled: shifted left by one and xored with h->prim_poly when
+   GF_FIRST_BIT was set before the shift. */
+static
+  void
+gf_w32_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h)
+{
+  const uint32_t entries = ((uint32_t)1 << h->arg1);
+  uint32_t bit;
+  uint32_t k;
+  uint32_t overflow;
+
+  shift[0] = 0;
+
+  for (bit = 1; bit < entries; bit <<= 1) {
+    for (k = 0; k < bit; k++) {
+      shift[bit|k] = shift[k]^val;
+    }
+    /* Remember the top bit before it is shifted out. */
+    overflow = val & GF_FIRST_BIT;
+    val <<= 1;
+    if (overflow) val ^= h->prim_poly;
+  }
+}
+
+/* Region multiply for the GROUP method, using one group size (h->arg1)
+   for both the shift and the reduce tables.
+   val == 0 and val == 1 are delegated to gf_multby_zero / gf_multby_one.
+   Otherwise the shift table is rebuilt for val, the region is aligned to
+   4 bytes, and each 32-bit source word is consumed from its most
+   significant group downwards.  With xor set the product is xored into
+   dest, otherwise it overwrites dest. */
+  static
+void gf_w32_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w32_group_data *gd;
+  gf_region_data rd;
+  uint32_t *in, *out, *end;
+  uint32_t acc, carry, idx, word;
+  int group, first_bits, first_shift, group_shift, remaining;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gd = (struct gf_w32_group_data *) h->private;
+  group = h->arg1;
+  gf_w32_group_set_shift_tables(gd->shift, val, h);
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  gf_do_initial_region_alignment(&rd);
+
+  in = (uint32_t *) rd.s_start;
+  out = (uint32_t *) rd.d_start;
+  end = (uint32_t *) rd.d_top;
+
+  /* The first (top) group may be narrower when 32 is not a multiple of
+     the group size. */
+  first_bits = 32 % group;
+  if (first_bits == 0) first_bits = group;
+  first_shift = 32 - first_bits;
+  group_shift = 32 - group;
+
+  for (; out < end; out++, in++) {
+    word = *in;
+    idx = word >> first_shift;
+    word <<= first_bits;
+    acc = gd->shift[idx];
+
+    /* Per remaining group: shift acc up by one group, fold the bits that
+       fall off through the reduce table, add the next partial product. */
+    for (remaining = first_shift; remaining > 0; remaining -= group) {
+      idx = word >> group_shift;
+      word <<= group;
+      carry = acc >> group_shift;
+      acc = (gd->shift[idx] ^ gd->reduce[carry] ^ (acc << group));
+    }
+    if (xor) acc ^= *out;
+    *out = acc;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+/* Region multiply for the GROUP method with separate shift (h->arg1) and
+   reduce (h->arg2) group sizes.
+   val == 0 and val == 1 are delegated to gf_multby_zero / gf_multby_one.
+   Otherwise the shift table is built once for val, the region is aligned
+   to 4 bytes, and for each 32-bit source word the unreduced product is
+   accumulated in a 64-bit value and then reduced g_r bits at a time.
+   With xor set the product is xored into dest, otherwise it overwrites
+   dest. */
+  static
+void gf_w32_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint32_t *s32, *d32, *top;
+  int i;
+  int leftover;
+  uint64_t p, l, r;
+  uint32_t a32, ind;
+  int g_s, g_r;
+  struct gf_w32_group_data *gd;
+  gf_region_data rd;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  g_s = h->arg1;
+  g_r = h->arg2;
+
+  /* Build the shift table exactly once per call (the table only depends
+     on val and h, so rebuilding it a second time was wasted work). */
+  gd = (struct gf_w32_group_data *) h->private;
+  gf_w32_group_set_shift_tables(gd->shift, val, h);
+
+  /* The first (top) group may be narrower when the field width is not a
+     multiple of the shift group size. */
+  leftover = GF_FIELD_WIDTH % g_s;
+  if (leftover == 0) leftover = g_s;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  gf_do_initial_region_alignment(&rd);
+
+  s32 = (uint32_t *) rd.s_start;
+  d32 = (uint32_t *) rd.d_start;
+  top = (uint32_t *) rd.d_top;
+
+  while (d32 < top) {
+    /* Seed the accumulator with the partial product of the top group. */
+    a32 = *s32;
+    ind = a32 >> (GF_FIELD_WIDTH - leftover);
+    p = gd->shift[ind];
+    p <<= g_s;
+    a32 <<= leftover;
+  
+    /* Middle groups: add the partial product, then make room for the next. */
+    i = (GF_FIELD_WIDTH - leftover);
+    while (i > g_s) {
+      ind = a32 >> (GF_FIELD_WIDTH-g_s);
+      p ^= gd->shift[ind];
+      a32 <<= g_s;
+      p <<= g_s;
+      i -= g_s;
+    }
+  
+    /* Last group: no further shift of the accumulator. */
+    ind = a32 >> (GF_FIELD_WIDTH-g_s);
+    p ^= gd->shift[ind];
+  
+    /* Reduce the bits above position 32, one rmask-wide window at a time
+       starting at gd->tshift and stepping down by g_r. */
+    for (i = gd->tshift ; i >= 0; i -= g_r) {
+      l = p & (gd->rmask << i);
+      r = gd->reduce[l >> (i+32)];
+      r <<= (i);
+      p ^= r;
+    }
+
+    if (xor) p ^= *d32;
+    *d32 = p;
+    d32++;
+    s32++;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+/* Single-value multiply for the GROUP method, using one group size
+   (h->arg1) for both tables.  The shift table is rebuilt for b on every
+   call; a is then consumed from its most significant group downwards. */
+static
+inline
+gf_val_32_t
+gf_w32_group_s_equals_r_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w32_group_data *gd;
+  uint32_t acc, carry, idx, word;
+  int group, first_bits, group_shift, remaining;
+
+  group = h->arg1;
+
+  gd = (struct gf_w32_group_data *) h->private;
+  gf_w32_group_set_shift_tables(gd->shift, b, h);
+
+  /* The first (top) group may be narrower when 32 is not a multiple of
+     the group size. */
+  first_bits = 32 % group;
+  if (first_bits == 0) first_bits = group;
+
+  remaining = 32 - first_bits;
+  word = a;
+  idx = word >> remaining;
+  word <<= first_bits;
+  acc = gd->shift[idx];
+
+  group_shift = 32 - group;
+
+  /* Per remaining group: shift acc up by one group, fold the bits that
+     fall off through the reduce table, add the next partial product. */
+  for (; remaining > 0; remaining -= group) {
+    idx = word >> group_shift;
+    word <<= group;
+    carry = acc >> group_shift;
+    acc = (gd->shift[idx] ^ gd->reduce[carry] ^ (acc << group));
+  }
+  return acc;
+}
+
+/* Single-value GROUP multiply specialised for 4-bit shift and reduce
+   groups.  The shift table is rebuilt for b, then the eight nibbles of a
+   are processed from the most significant one down in a fully unrolled
+   sequence. */
+static
+inline
+gf_val_32_t
+gf_w32_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w32_group_data *gd = (struct gf_w32_group_data *) h->private;
+  uint32_t word = a;
+  uint32_t acc, carry, nib;
+
+  gf_w32_group_set_shift_tables(gd->shift, b, h);
+
+  /* Nibble 7 seeds the accumulator. */
+  nib = word >> 28;
+  word <<= 4;
+  acc = gd->shift[nib];
+
+  /* Nibbles 6..1: shift acc by four, fold the nibble that falls off
+     through the reduce table, add the next partial product. */
+  nib = word >> 28; word <<= 4; carry = acc >> 28;
+  acc = (gd->shift[nib] ^ gd->reduce[carry] ^ (acc << 4));
+  nib = word >> 28; word <<= 4; carry = acc >> 28;
+  acc = (gd->shift[nib] ^ gd->reduce[carry] ^ (acc << 4));
+  nib = word >> 28; word <<= 4; carry = acc >> 28;
+  acc = (gd->shift[nib] ^ gd->reduce[carry] ^ (acc << 4));
+  nib = word >> 28; word <<= 4; carry = acc >> 28;
+  acc = (gd->shift[nib] ^ gd->reduce[carry] ^ (acc << 4));
+  nib = word >> 28; word <<= 4; carry = acc >> 28;
+  acc = (gd->shift[nib] ^ gd->reduce[carry] ^ (acc << 4));
+  nib = word >> 28; word <<= 4; carry = acc >> 28;
+  acc = (gd->shift[nib] ^ gd->reduce[carry] ^ (acc << 4));
+
+  /* Nibble 0: the source word needs no further shift. */
+  nib = word >> 28;
+  carry = acc >> 28;
+  acc = (gd->shift[nib] ^ gd->reduce[carry] ^ (acc << 4));
+  return acc;
+}
+
+/* Single-value GROUP multiply with separate shift (h->arg1) and reduce
+   (h->arg2) group sizes.  The shift table is rebuilt for b, the unreduced
+   product of a and b is accumulated in 64 bits, and the bits above
+   position 32 are then cancelled through the reduce table. */
+static
+inline
+gf_val_32_t
+gf_w32_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w32_group_data *gd = (struct gf_w32_group_data *) h->private;
+  const int shift_bits = h->arg1;
+  const int reduce_bits = h->arg2;
+  uint64_t acc, high, fix;
+  uint32_t word, idx;
+  int first_bits, remaining, pos;
+
+  gf_w32_group_set_shift_tables(gd->shift, b, h);
+
+  /* The first (top) group may be narrower when the field width is not a
+     multiple of the shift group size. */
+  first_bits = GF_FIELD_WIDTH % shift_bits;
+  if (first_bits == 0) first_bits = shift_bits;
+
+  /* Seed the accumulator with the partial product of the top group. */
+  word = a;
+  idx = word >> (GF_FIELD_WIDTH - first_bits);
+  acc = gd->shift[idx];
+  acc <<= shift_bits;
+  word <<= first_bits;
+
+  /* Middle groups: add the partial product, then make room for the next. */
+  for (remaining = (GF_FIELD_WIDTH - first_bits); remaining > shift_bits; remaining -= shift_bits) {
+    idx = word >> (GF_FIELD_WIDTH-shift_bits);
+    acc ^= gd->shift[idx];
+    word <<= shift_bits;
+    acc <<= shift_bits;
+  }
+
+  /* Last group: no further shift of the accumulator. */
+  idx = word >> (GF_FIELD_WIDTH-shift_bits);
+  acc ^= gd->shift[idx];
+
+  /* Reduce one rmask-wide window at a time, from gd->tshift downwards. */
+  for (pos = gd->tshift ; pos >= 0; pos -= reduce_bits) {
+    high = acc & (gd->rmask << pos);
+    fix = gd->reduce[high >> (pos+32)];
+    fix <<= (pos);
+    acc ^= fix;
+  }
+  return acc;
+}
+
+/* BYTWO_b multiply: walk a from its least significant bit upwards,
+   xoring the current b into the product for each set bit and doubling b
+   (shift left, xor h->prim_poly when bit 31 was set) between bits.
+   Stops as soon as no set bits of a remain. */
+static
+inline
+gf_val_32_t
+gf_w32_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  const uint32_t poly = h->prim_poly;
+  uint32_t acc = 0;
+
+  for (;;) {
+    if (a & 1) acc ^= b;
+    a >>= 1;
+    if (a == 0) return acc;
+    /* Double b in the field. */
+    b = (b & 0x80000000) ? ((b << 1) ^ poly) : (b << 1);
+  }
+}
+
+/* BYTWO_p multiply: walk a from bit 31 down to bit 0; at each step the
+   product is doubled (shift left, xor h->prim_poly when bit 31 was set)
+   and b is xored in when the current bit of a is set. */
+static
+inline
+gf_val_32_t
+gf_w32_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  const uint32_t poly = h->prim_poly;
+  uint32_t acc = 0;
+  uint32_t probe;
+
+  for (probe = 0x80000000; probe != 0; probe >>= 1) {
+    /* Double the running product in the field. */
+    acc = (acc & 0x80000000) ? ((acc << 1) ^ poly) : (acc << 1);
+    if (a & probe) acc ^= b;
+  }
+  return acc;
+}
+
+/* BYTWO_p region multiply without SIMD.  The region is aligned to 8 bytes
+   and handled one 64-bit word at a time.  For every word, val is walked
+   from bit 31 down to bit 0: AB2 is applied to the running product and the
+   source word is xored in whenever the current bit of val is set.
+   val == 0 and val == 1 are delegated to gf_multby_zero / gf_multby_one.
+   With xor set the product is xored into dest, otherwise it overwrites
+   dest. */
+static
+void
+gf_w32_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t *in, *out, *in_end, t1, t2, word, acc, probe;
+  gf_region_data rd;
+  struct gf_w32_bytwo_data *btd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+
+  in = (uint64_t *) rd.s_start;
+  out = (uint64_t *) rd.d_start;
+  in_end = (uint64_t *) rd.s_top;
+
+  if (xor) {
+    for (; in < in_end; in++, out++) {
+      acc = 0;
+      word = *in;
+      for (probe = 0x80000000; probe != 0; probe >>= 1) {
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, acc, t1, t2);
+        if (val & probe) acc ^= word;
+      }
+      *out ^= acc;
+    }
+  } else {
+    for (; in < in_end; in++, out++) {
+      acc = 0;
+      word = *in;
+      for (probe = 0x80000000; probe != 0; probe >>= 1) {
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, acc, t1, t2);
+        if (val & probe) acc ^= word;
+      }
+      *out = acc;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+/* One step of the SSE BYTWO_p region multiply.  The macro relies on the
+   caller's locals pp, m1, m2, prod, t1, t2, v, one and ta.
+   It first applies SSE_AB2 to prod (presumably doubling prod in the
+   field -- confirm against the SSE_AB2 definition).  It then builds a
+   per-lane mask from bit 0 of v: (v & one) - one is all-ones in a 32-bit
+   lane whose bit 0 is clear and zero otherwise, so ta is xored into prod
+   only for lanes where bit 0 of v is 0.  Finally v is shifted right by
+   one to expose the next bit. */
+#define BYTWO_P_ONESTEP {\
+      SSE_AB2(pp, m1 ,m2, prod, t1, t2); \
+      t1 = _mm_and_si128(v, one); \
+      t1 = _mm_sub_epi32(t1, one); \
+      t1 = _mm_and_si128(t1, ta); \
+      prod = _mm_xor_si128(prod, t1); \
+      v = _mm_srli_epi64(v, 1); }
+
+#ifdef INTEL_SSE2
+/* BYTWO_p region multiply using SSE2, 16 bytes per iteration.
+   val == 0 and val == 1 are delegated to gf_multby_zero / gf_multby_one.
+   The region is aligned to 16 bytes; for every 128-bit block the macro
+   BYTWO_P_ONESTEP is expanded 32 times, once per bit of val.
+   With xor set the previous contents of dest are xored into the result,
+   otherwise the result overwrites dest. */
+static
+void
+gf_w32_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  int i;
+  uint8_t *s8, *d8;
+  uint32_t vrev;
+  /* pp, m1, m2, ta, prod, t1, t2, one and v are referenced by name inside
+     BYTWO_P_ONESTEP and must keep these exact names. */
+  __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v;
+  struct gf_w32_bytwo_data *btd;
+  gf_region_data rd;
+   
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  /* vrev holds the bits of val inverted and in reverse order: after the
+     loop, bit k of vrev is set exactly when bit (31-k) of val is clear.
+     BYTWO_P_ONESTEP inspects bit 0 and shifts right, so val is consumed
+     from its most significant bit downwards. */
+  vrev = 0;
+  for (i = 0; i < 32; i++) {
+    vrev <<= 1;
+    if (!(val & ((gf_val_32_t)1 << i))) vrev |= 1;
+  }
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  /* Broadcast the low 32 bits of the precomputed constants to all lanes. */
+  pp = _mm_set1_epi32(btd->prim_poly&0xffffffff);
+  m1 = _mm_set1_epi32((btd->mask1)&0xffffffff);
+  m2 = _mm_set1_epi32((btd->mask2)&0xffffffff);
+  one = _mm_set1_epi32(1);
+
+  while (d8 < (uint8_t *) rd.d_top) {
+    prod = _mm_setzero_si128();
+    v = _mm_set1_epi32(vrev);
+    ta = _mm_load_si128((__m128i *) s8);
+    /* tp is what gets xored into the final product: zero, or old dest. */
+    tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
+    /* 32 steps, one per bit of val. */
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
+    d8 += 16;
+    s8 += 16;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+/* BYTWO_b region multiply without SIMD, one 64-bit word at a time.
+   val == 0 and val == 1 are delegated to gf_multby_zero / gf_multby_one.
+   The region is aligned to 32 bytes.  val 2..5 have dedicated loops built
+   from one or two AB2 applications (plus the untouched source word for
+   3 and 5); every other val walks its bits from least significant upwards,
+   xoring the current multiple of the source word into the product and
+   applying AB2 between bits.
+   With xor set the product is xored into dest, otherwise it overwrites
+   dest. */
+static
+void
+gf_w32_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t *in, *out, *end, t1, t2, cur, bits, acc;
+  struct gf_w32_bytwo_data *btd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+
+  btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+  in = (uint64_t *) rd.s_start;
+  out = (uint64_t *) rd.d_start;
+  end = (uint64_t *) rd.d_top;
+
+  switch (val) {
+  case 2:
+    /* One AB2 application. */
+    if (xor) {
+      for (; out < end; out++, in++) {
+        cur = *in;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, cur, t1, t2);
+        *out ^= cur;
+      }
+    } else {
+      for (; out < end; out++, in++) {
+        cur = *in;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, cur, t1, t2);
+        *out = cur;
+      }
+    }
+    break;
+  case 3:
+    /* One AB2 application xored with the original word. */
+    if (xor) {
+      for (; out < end; out++, in++) {
+        cur = *in;
+        acc = cur;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, cur, t1, t2);
+        *out ^= (cur ^ acc);
+      }
+    } else {
+      for (; out < end; out++, in++) {
+        cur = *in;
+        acc = cur;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, cur, t1, t2);
+        *out = (cur ^ acc);
+      }
+    }
+    break;
+  case 4:
+    /* Two AB2 applications. */
+    if (xor) {
+      for (; out < end; out++, in++) {
+        cur = *in;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, cur, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, cur, t1, t2);
+        *out ^= cur;
+      }
+    } else {
+      for (; out < end; out++, in++) {
+        cur = *in;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, cur, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, cur, t1, t2);
+        *out = cur;
+      }
+    }
+    break;
+  case 5:
+    /* Two AB2 applications xored with the original word. */
+    if (xor) {
+      for (; out < end; out++, in++) {
+        cur = *in;
+        acc = cur;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, cur, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, cur, t1, t2);
+        *out ^= (cur ^ acc);
+      }
+    } else {
+      for (; out < end; out++, in++) {
+        cur = *in;
+        acc = cur;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, cur, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, cur, t1, t2);
+        *out = cur ^ acc;
+      }
+    }
+    break;
+  default:
+    /* General case: bit-serial over val, least significant bit first. */
+    if (xor) {
+      for (; out < end; out++, in++) {
+        acc = *out ;
+        cur = *in;
+        bits = val;
+        for (;;) {
+          if (bits & 1) acc ^= cur;
+          bits >>= 1;
+          if (bits == 0) break;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, cur, t1, t2);
+        }
+        *out = acc;
+      }
+    } else {
+      for (; out < end; out++, in++) {
+        acc = 0 ;
+        cur = *in;
+        bits = val;
+        for (;;) {
+          if (bits & 1) acc ^= cur;
+          bits >>= 1;
+          if (bits == 0) break;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, cur, t1, t2);
+        }
+        *out = acc;
+      }
+    }
+    break;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+#ifdef INTEL_SSE2
+/* Helper for val == 2 without xor: applies SSE_AB2 once to every 16-byte
+   block between rd->s_start/rd->d_start and rd->d_top and stores the
+   result over dest. */
+static
+void
+gf_w32_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w32_bytwo_data *btd)
+{
+  uint8_t *in = (uint8_t *) rd->s_start;
+  uint8_t *out = (uint8_t *) rd->d_start;
+  uint8_t *end = (uint8_t *) rd->d_top;
+  __m128i poly, mask_lo, mask_hi, t1, t2, block;
+
+  /* Broadcast the low 32 bits of the precomputed constants to all lanes. */
+  poly = _mm_set1_epi32(btd->prim_poly&0xffffffff);
+  mask_lo = _mm_set1_epi32((btd->mask1)&0xffffffff);
+  mask_hi = _mm_set1_epi32((btd->mask2)&0xffffffff);
+
+  for (; out < end; out += 16, in += 16) {
+    block = _mm_load_si128 ((__m128i *)(in));
+    SSE_AB2(poly, mask_lo, mask_hi, block, t1, t2);
+    _mm_store_si128((__m128i *)out, block);
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+/* Helper for val == 2 with xor: applies SSE_AB2 once to every 16-byte
+   source block between rd->s_start/rd->d_start and rd->d_top and xors the
+   result into the matching dest block. */
+static
+void
+gf_w32_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w32_bytwo_data *btd)
+{
+  uint8_t *in = (uint8_t *) rd->s_start;
+  uint8_t *out = (uint8_t *) rd->d_start;
+  uint8_t *end = (uint8_t *) rd->d_top;
+  __m128i poly, mask_lo, mask_hi, t1, t2, block, prev;
+
+  /* Broadcast the low 32 bits of the precomputed constants to all lanes. */
+  poly = _mm_set1_epi32(btd->prim_poly&0xffffffff);
+  mask_lo = _mm_set1_epi32((btd->mask1)&0xffffffff);
+  mask_hi = _mm_set1_epi32((btd->mask2)&0xffffffff);
+
+  for (; out < end; out += 16, in += 16) {
+    block = _mm_load_si128 ((__m128i *)(in));
+    SSE_AB2(poly, mask_lo, mask_hi, block, t1, t2);
+    prev = _mm_load_si128 ((__m128i *)(out));
+    _mm_store_si128((__m128i *)out, _mm_xor_si128(prev, block));
+  }
+}
+#endif
+
+
+#ifdef INTEL_SSE2
+/* BYTWO_b region multiply using SSE2, 16 bytes per iteration.
+   val == 0 and val == 1 are delegated to gf_multby_zero / gf_multby_one;
+   val == 2 is delegated to the dedicated region_2 helpers.
+   For any other val each 128-bit block is processed bit-serially over
+   val, least significant bit first, applying SSE_AB2 between bits.
+   With xor set the accumulator starts from the old dest block, otherwise
+   from zero. */
+static
+void 
+gf_w32_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint32_t bits;
+  uint8_t *in, *out, *end;
+  __m128i poly, mask_lo, mask_hi, t1, t2, cur, acc;
+  struct gf_w32_bytwo_data *btd;
+  gf_region_data rd;
+    
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  if (val == 2) {
+    if (xor) {
+      gf_w32_bytwo_b_sse_region_2_xor(&rd, btd);
+    } else {
+      gf_w32_bytwo_b_sse_region_2_noxor(&rd, btd);
+    }
+    gf_do_final_region_alignment(&rd);
+    return;
+  }
+
+  in = (uint8_t *) rd.s_start;
+  out = (uint8_t *) rd.d_start;
+  end = (uint8_t *) rd.d_top;
+
+  /* Broadcast the low 32 bits of the precomputed constants to all lanes. */
+  poly = _mm_set1_epi32(btd->prim_poly&0xffffffff);
+  mask_lo = _mm_set1_epi32((btd->mask1)&0xffffffff);
+  mask_hi = _mm_set1_epi32((btd->mask2)&0xffffffff);
+
+  for (; out < end; out += 16, in += 16) {
+    cur = _mm_load_si128 ((__m128i *)(in));
+    acc = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(out));
+    bits = val;
+    for (;;) {
+      if (bits & 1) acc = _mm_xor_si128(acc, cur);
+      bits >>= 1;
+      if (bits == 0) break;
+      SSE_AB2(poly, mask_lo, mask_hi, cur, t1, t2);
+    }
+    _mm_store_si128((__m128i *)out, acc);
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+/* Initialise the BYTWO_p / BYTWO_b methods.
+   Fills the gf_w32_bytwo_data in h->private with the polynomial and two
+   masks, then installs multiply, multiply_region and inverse.
+   Returns 1 on success.  Returns 0 when the non-SSE region function was
+   selected although h->region_type requests GF_REGION_SIMD. */
+static
+int gf_w32_bytwo_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  uint64_t ip, m1, m2;
+  struct gf_w32_bytwo_data *btd;
+
+  h = (gf_internal_t *) gf->scratch;
+  btd = (struct gf_w32_bytwo_data *) (h->private);
+  ip = h->prim_poly & 0xffffffff;
+  m1 = 0xfffffffe;
+  m2 = 0x80000000;
+  btd->prim_poly = 0;
+  btd->mask1 = 0;
+  btd->mask2 = 0;
+
+  /* Or the 32-bit patterns into the btd fields, shifting them up by
+     GF_FIELD_WIDTH each round, until ip has been shifted out of its
+     64-bit variable (i.e. the patterns are replicated per field-width
+     slot; GF_FIELD_WIDTH is presumably 32 here -- confirm). */
+  while (ip != 0) {
+    btd->prim_poly |= ip;
+    btd->mask1 |= m1;
+    btd->mask2 |= m2;
+    ip <<= GF_FIELD_WIDTH;
+    m1 <<= GF_FIELD_WIDTH;
+    m2 <<= GF_FIELD_WIDTH;
+  }
+
+  /* NOTE: the braces below are split across #ifdef INTEL_SSE2 sections.
+     With SSE2 compiled in, the nosse assignment and the SIMD check form
+     the else-branch; without it they run unconditionally. */
+  if (h->mult_type == GF_MULT_BYTWO_p) {
+    SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply)
+    #ifdef INTEL_SSE2
+      if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+        SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_sse_multiply_region) 
+      } else {
+    #endif 
+        SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_nosse_multiply_region) 
+        /* SIMD was explicitly requested but is not being used: fail. */
+        if(h->region_type & GF_REGION_SIMD)
+          return 0;
+    #ifdef INTEL_SSE2
+      }
+    #endif
+  } else {
+    SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_b_multiply) 
+    #ifdef INTEL_SSE2
+      if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+        SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_sse_multiply_region) 
+      } else {
+    #endif 
+      SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_nosse_multiply_region) 
+      /* SIMD was explicitly requested but is not being used: fail. */
+      if(h->region_type & GF_REGION_SIMD)
+        return 0;
+    #ifdef INTEL_SSE2
+      }
+    #endif
+  }
+
+  SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
+  return 1;
+}
+
+/* SPLIT 8,8 multiply: both operands are cut into four bytes and the
+   sixteen byte-by-byte partial products are looked up and xored together.
+   Byte i of a32 and byte j of b32 use tables[i+j], so seven tables
+   (indices 0..6) are referenced. */
+static
+inline
+uint32_t
+gf_w32_split_8_8_multiply (gf_t *gf, uint32_t a32, uint32_t b32)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w32_split_8_8_data *d8 = (struct gf_w32_split_8_8_data *) h->private;
+  uint32_t acc = 0;
+  uint32_t ai, bj, rest;
+
+  for (ai = 0; ai < 4; ai++, a32 >>= 8) {
+    for (bj = 0, rest = b32; bj < 4; bj++, rest >>= 8) {
+      acc ^= d8->tables[ai+bj][a32&0xff][rest&0xff];
+    }
+  }
+  return acc;
+}
+
+/* SPLIT 8,32 lazy region multiply.
+   Four 256-entry tables are kept between calls and rebuilt only when val
+   differs from the cached last_value.  Which private structure holds the
+   tables depends on h->arg1 / h->arg2 / h->mult_type.  Each 32-bit source
+   word is then multiplied by xoring one table entry per byte.
+   val == 0 and val == 1 are delegated to gf_multby_zero / gf_multby_one.
+   With xor set the product is xored into dest, otherwise it overwrites
+   dest. */
+static
+inline
+void
+gf_w32_split_8_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  uint32_t *in, *out, *end, acc, word, mult;
+  struct gf_split_8_32_lazy_data *lazy;
+  struct gf_w32_split_8_8_data *split;
+  uint32_t *tab[4];
+  int t, bit, k, stale;
+  uint32_t poly;
+  gf_region_data rd;
+  
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  if (h->arg1 == 32 || h->arg2 == 32 || h->mult_type == GF_MULT_DEFAULT) {
+    lazy = (struct gf_split_8_32_lazy_data *) h->private;
+    for (t = 0; t < 4; t++) tab[t] = lazy->tables[t];
+    stale = (val != lazy->last_value);
+    if (stale) lazy->last_value = val;
+  } else {
+    split = (struct gf_w32_split_8_8_data *) h->private;
+    for (t = 0; t < 4; t++) tab[t] = split->region_tables[t];
+    stale = (val != split->last_value);
+    if (stale) split->last_value = val;
+  }
+  poly = h->prim_poly;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  gf_do_initial_region_alignment(&rd);
+
+  in = (uint32_t *) rd.s_start;
+  out = (uint32_t *) rd.d_start;
+  end = (uint32_t *) rd.d_top;
+  
+  if (stale) {
+    /* mult runs through val*x^0 .. val*x^31; table t covers byte t. */
+    mult = val;
+    for (t = 0; t < 4; t++) {
+      tab[t][0] = 0;
+      for (bit = 1; bit < 256; bit <<= 1) {
+        for (k = 0; k < bit; k++) {
+          tab[t][k|bit] = (mult ^ tab[t][k]);
+        }
+        mult = (mult & GF_FIRST_BIT) ? ((mult << 1) ^ poly) : (mult << 1);
+      }
+    }
+  } 
+
+  for (; out < end; out++, in++) {
+    acc = (xor) ? *out : 0;
+    /* Stop early once the remaining high bytes are all zero. */
+    for (word = *in, t = 0; word != 0; word >>= 8, t++) {
+      acc ^= tab[t][word & 0xff];
+    }
+    *out = acc;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+/* SPLIT 16,32 lazy region multiply.
+   Two 65536-entry tables in h->private are kept between calls and rebuilt
+   only when val differs from the cached last_value.  Each 32-bit source
+   word is multiplied by xoring one table entry per 16-bit half.
+   val == 0 and val == 1 are delegated to gf_multby_zero / gf_multby_one.
+   With xor set the product is xored into dest, otherwise it overwrites
+   dest. */
+static
+inline
+void
+gf_w32_split_16_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  uint32_t *in, *out, *end, acc, word, mult;
+  struct gf_split_16_32_lazy_data *lazy;
+  uint32_t *tab[2];
+  int t, bit, k, stale;
+  uint32_t poly;
+  gf_region_data rd;
+  
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  lazy = (struct gf_split_16_32_lazy_data *) h->private;
+  for (t = 0; t < 2; t++) tab[t] = lazy->tables[t];
+  stale = (val != lazy->last_value);
+  if (stale) lazy->last_value = val;
+
+  poly = h->prim_poly;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  gf_do_initial_region_alignment(&rd);
+
+  in = (uint32_t *) rd.s_start;
+  out = (uint32_t *) rd.d_start;
+  end = (uint32_t *) rd.d_top;
+  
+  if (stale) {
+    /* mult runs through val*x^0 .. val*x^31; table t covers half-word t. */
+    mult = val;
+    for (t = 0; t < 2; t++) {
+      tab[t][0] = 0;
+      for (bit = 1; bit < (1 << 16); bit <<= 1) {
+        for (k = 0; k < bit; k++) {
+          tab[t][k|bit] = (mult ^ tab[t][k]);
+        }
+        mult = (mult & GF_FIRST_BIT) ? ((mult << 1) ^ poly) : (mult << 1);
+      }
+    }
+  } 
+
+  for (; out < end; out++, in++) {
+    acc = (xor) ? *out : 0;
+    /* Stop early once the remaining high half is zero. */
+    for (word = *in, t = 0; word != 0 && t < 2; word >>= 16, t++) {
+      acc ^= tab[t][word & 0xffff];
+    }
+    *out = acc;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+/* SPLIT 2,32 lazy region multiply.
+   Sixteen 4-entry tables in h->private (one per 2-bit slice of a source
+   word) are rebuilt only when val differs from the cached last_value.
+   Each 32-bit source word is multiplied by xoring one entry per 2-bit
+   slice, stopping once the remaining bits are zero.
+   val == 0 and val == 1 are delegated to gf_multby_zero / gf_multby_one.
+   With xor set the product is xored into dest, otherwise it overwrites
+   dest. */
+static
+void
+gf_w32_split_2_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  struct gf_split_2_32_lazy_data *lazy;
+  int t;
+  uint32_t poly, once, twice, word, acc, *in, *out, *end;
+  gf_region_data rd;
+ 
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  gf_do_initial_region_alignment(&rd);
+
+  h = (gf_internal_t *) gf->scratch;
+  poly = h->prim_poly;
+
+  lazy = (struct gf_split_2_32_lazy_data *) h->private;
+  
+  if (lazy->last_value != val) {
+    /* Table t holds {0, m, 2m, 3m} for m = val * x^(2t). */
+    once = val;
+    for (t = 0; t < 16; t++) {
+      twice = (once & GF_FIRST_BIT) ? ((once << 1) ^ poly) : (once << 1);
+      lazy->tables[t][0] = 0;
+      lazy->tables[t][1] = once;
+      lazy->tables[t][2] = twice;
+      lazy->tables[t][3] = (twice ^ once);
+      once = (twice & GF_FIRST_BIT) ? ((twice << 1) ^ poly) : (twice << 1);
+    }
+  }
+  lazy->last_value = val;
+
+  in = (uint32_t *) rd.s_start;
+  out = (uint32_t *) rd.d_start;
+  end = (uint32_t *) rd.d_top;
+
+  for (; out != end; out++, in++) {
+    acc = (xor) ? *out : 0;
+    for (word = *in, t = 0; word != 0; word >>= 2, t++) {
+      acc ^= lazy->tables[t][word&3];
+    }
+    *out = acc;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+#ifdef INTEL_SSSE3
+/* SPLIT 2,32 region multiply using SSSE3 byte shuffles, 16 bytes (four
+   32-bit words) per iteration.
+   val == 0 and val == 1 are delegated to gf_multby_zero / gf_multby_one.
+   The sixteen lookup tables are rebuilt on every call into a local array
+   (no last_value caching in this variant).  The region is aligned to
+   32 bytes.  With xor set the accumulator starts from the old dest block,
+   otherwise from zero. */
+static
+void
+gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  int i, tindex;
+  uint32_t pp, v, v2, *s32, *d32, *top;
+  __m128i vi, si, pi, shuffler, tables[16], adder, xi, mask1, mask2;
+  gf_region_data rd;
+ 
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+  
+  s32 = (uint32_t *) rd.s_start;
+  d32 = (uint32_t *) rd.d_start;
+  top = (uint32_t *) rd.d_top;
+  
+  /* tables[i] packs the four products {0, v, v2, v2^v} into the four
+     32-bit lanes of one vector; v is doubled twice per table (xoring pp
+     whenever GF_FIRST_BIT was set before the shift). */
+  v = val;
+  for (i = 0; i < 16; i++) {
+    v2 = (v << 1);
+    if (v & GF_FIRST_BIT) v2 ^= pp;
+    tables[i] = _mm_set_epi32(v2 ^ v, v2, v, 0);
+    v = (v2 << 1);
+    if (v2 & GF_FIRST_BIT) v ^= pp;
+  }
+
+  /* shuffler replicates byte 0 of each 32-bit lane across that lane;
+     adder supplies the byte offsets 0..3 within a 4-byte table entry. */
+  shuffler = _mm_set_epi8(0xc, 0xc, 0xc, 0xc, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
+  adder = _mm_set_epi8(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0);
+  mask1 = _mm_set1_epi8(0x3);
+  mask2 = _mm_set1_epi8(0xc);
+
+  while (d32 != top) {
+    pi = (xor) ? _mm_load_si128 ((__m128i *) d32) : _mm_setzero_si128();
+    vi = _mm_load_si128((__m128i *) s32);
+ 
+    /* Four rounds, one per source byte; each round consumes four tables
+       (one per 2-bit slice of that byte), so tindex runs 0..15. */
+    tindex = 0;
+    for (i = 0; i < 4; i++) {
+      si = _mm_shuffle_epi8(vi, shuffler);
+
+      /* Slice 0: bits 0-1, scaled by 4 to form a byte index into the table. */
+      xi = _mm_and_si128(si, mask1);
+      xi = _mm_slli_epi16(xi, 2);
+      xi = _mm_xor_si128(xi, adder);
+      pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
+      tindex++;
+
+      /* Slice 1: bits 2-3 are already a multiple of 4 under mask2. */
+      xi = _mm_and_si128(si, mask2);
+      xi = _mm_xor_si128(xi, adder);
+      pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
+      si = _mm_srli_epi16(si, 2);
+      tindex++;
+
+      /* Slice 2: after shifting si right by 2, mask2 selects the next pair. */
+      xi = _mm_and_si128(si, mask2);
+      xi = _mm_xor_si128(xi, adder);
+      pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
+      si = _mm_srli_epi16(si, 2);
+      tindex++;
+
+      /* Slice 3: the top pair of the byte. */
+      xi = _mm_and_si128(si, mask2);
+      xi = _mm_xor_si128(xi, adder);
+      pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
+      tindex++;
+      
+      /* Move the next source byte of every lane into byte 0. */
+      vi = _mm_srli_epi32(vi, 8);
+    }
+    _mm_store_si128((__m128i *) d32, pi);
+    d32 += 4;
+    s32 += 4;
+  }
+
+  gf_do_final_region_alignment(&rd);
+
+}
+#endif
+
+/* SPLIT 4,32 lazy region multiply.
+   Eight 16-entry tables in h->private (one per nibble of a source word)
+   are rebuilt only when val differs from the cached last_value.  Each
+   32-bit source word is multiplied by xoring one entry per nibble,
+   stopping once the remaining bits are zero.
+   val == 0 and val == 1 are delegated to gf_multby_zero / gf_multby_one.
+   With xor set the product is xored into dest, otherwise it overwrites
+   dest. */
+static
+void
+gf_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  struct gf_split_4_32_lazy_data *lazy;
+  int t, bit, k;
+  uint32_t poly, mult, word, acc, *in, *out, *end;
+  gf_region_data rd;
+ 
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  poly = h->prim_poly;
+
+  lazy = (struct gf_split_4_32_lazy_data *) h->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  gf_do_initial_region_alignment(&rd);
+  
+  if (lazy->last_value != val) {
+    /* mult runs through val*x^0 .. val*x^31; table t covers nibble t. */
+    mult = val;
+    for (t = 0; t < 8; t++) {
+      lazy->tables[t][0] = 0;
+      for (bit = 1; bit < 16; bit <<= 1) {
+        for (k = 0; k < bit; k++) {
+          lazy->tables[t][k|bit] = (mult ^ lazy->tables[t][k]);
+        }
+        mult = (mult & GF_FIRST_BIT) ? ((mult << 1) ^ poly) : (mult << 1);
+      }
+    }
+  }
+  lazy->last_value = val;
+
+  in = (uint32_t *) rd.s_start;
+  out = (uint32_t *) rd.d_start;
+  end = (uint32_t *) rd.d_top;
+
+  for (; out != end; out++, in++) {
+    acc = (xor) ? *out : 0;
+    for (word = *in, t = 0; word != 0; word >>= 4, t++) {
+      acc ^= lazy->tables[t][word&0xf];
+    }
+    *out = acc;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+/* SSSE3 split-4/32 region multiply for ALTMAP data: dest = (xor ? dest : 0) ^ val*src, 64 bytes per loop pass. */
+#ifdef INTEL_SSSE3
+static
+void
+gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  int i, j, k;
+  uint32_t pp, v, *s32, *d32, *top;
+  __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3;
+  struct gf_split_4_32_lazy_data *ld;
+  uint8_t btable[16];
+  gf_region_data rd;
+  /* Multiplying by 0 or 1 needs no tables; those cases are handed to the generic helpers. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+  /* Request 64-byte granularity: each pass of the main loops below consumes four 16-byte vectors. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64);
+  gf_do_initial_region_alignment(&rd);
+
+  s32 = (uint32_t *) rd.s_start;
+  d32 = (uint32_t *) rd.d_start;
+  top = (uint32_t *) rd.d_top;
+  
+  ld = (struct gf_split_4_32_lazy_data *) h->private;
+  /* Tables are rebuilt on every call (ld->last_value is not consulted): tables[i][j] = byte j of val*(n << 4i), n = 0..15. */
+  v = val;
+  for (i = 0; i < 8; i++) {
+    ld->tables[i][0] = 0;
+    for (j = 1; j < 16; j <<= 1) {
+      for (k = 0; k < j; k++) {
+        ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
+      }
+      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+    }
+    for (j = 0; j < 4; j++) {   /* peel off one byte per pass; ld->tables[i] is shifted right 8 bits each time */
+      for (k = 0; k < 16; k++) {
+        btable[k] = (uint8_t) ld->tables[i][k];
+        ld->tables[i][k] >>= 8;
+      }
+      tables[i][j] = _mm_loadu_si128((__m128i *) btable);
+    }
+  }
+  /* 0x0f in every byte: isolates the 4-bit index that _mm_shuffle_epi8 uses to pick a table byte. */
+  mask1 = _mm_set1_epi8(0xf);
+
+  if (xor) {
+    while (d32 != top) {
+      p0 = _mm_load_si128 ((__m128i *) d32);
+      p1 = _mm_load_si128 ((__m128i *) (d32+4));
+      p2 = _mm_load_si128 ((__m128i *) (d32+8));
+      p3 = _mm_load_si128 ((__m128i *) (d32+12));
+      /* p0..p3 start from the current destination so the products are XORed into it. */
+      v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      /* v0 supplies tables 0/1, v1 tables 2/3, v2 tables 4/5, v3 tables 6/7; pj accumulates product byte j. */
+      si = _mm_and_si128(v0, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si));
+      /* Shift so each byte's high nibble lands in the low-nibble position, then mask and look up again. */
+      v0 = _mm_srli_epi32(v0, 4);
+      si = _mm_and_si128(v0, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
+  
+      si = _mm_and_si128(v1, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
+      
+      v1 = _mm_srli_epi32(v1, 4);
+      si = _mm_and_si128(v1, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
+  
+      si = _mm_and_si128(v2, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
+      
+      v2 = _mm_srli_epi32(v2, 4);
+      si = _mm_and_si128(v2, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
+  
+      si = _mm_and_si128(v3, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si));
+      
+      v3 = _mm_srli_epi32(v3, 4);
+      si = _mm_and_si128(v3, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
+      /* Store product byte-plane j at byte offset 16*j within the 64-byte destination block. */
+      _mm_store_si128((__m128i *) d32, p0);
+      _mm_store_si128((__m128i *) (d32+4), p1);
+      _mm_store_si128((__m128i *) (d32+8), p2);
+      _mm_store_si128((__m128i *) (d32+12), p3);
+      d32 += 16;
+    } 
+  } else {
+    while (d32 != top) {
+      /* Overwrite mode: same lookups, but the first shuffle initializes p0..p3 instead of XORing into dest. */
+      v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
+
+      si = _mm_and_si128(v0, mask1);
+      p0 = _mm_shuffle_epi8(tables[0][0], si);
+      p1 = _mm_shuffle_epi8(tables[0][1], si);
+      p2 = _mm_shuffle_epi8(tables[0][2], si);
+      p3 = _mm_shuffle_epi8(tables[0][3], si);
+      
+      v0 = _mm_srli_epi32(v0, 4);
+      si = _mm_and_si128(v0, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
+  
+      si = _mm_and_si128(v1, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
+      
+      v1 = _mm_srli_epi32(v1, 4);
+      si = _mm_and_si128(v1, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
+  
+      si = _mm_and_si128(v2, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
+      
+      v2 = _mm_srli_epi32(v2, 4);
+      si = _mm_and_si128(v2, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
+  
+      si = _mm_and_si128(v3, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si));
+      
+      v3 = _mm_srli_epi32(v3, 4);
+      si = _mm_and_si128(v3, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
+  
+      _mm_store_si128((__m128i *) d32, p0);
+      _mm_store_si128((__m128i *) (d32+4), p1);
+      _mm_store_si128((__m128i *) (d32+8), p2);
+      _mm_store_si128((__m128i *) (d32+12), p3);
+      d32 += 16;
+    } 
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+/* SSSE3 split-4/32 region multiply for contiguous 32-bit words: each 64-byte block (16 words) is transposed into */
+/* four byte-planes, multiplied by val through 4-bit table lookups, and transposed back before being stored.     */
+#ifdef INTEL_SSSE3
+static
+void
+gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  int i, j, k;
+  uint32_t pp, v, *s32, *d32, *top, tmp_table[16];
+  __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8;
+  __m128i tv1, tv2, tv3, tv0;
+  uint8_t btable[16];
+  gf_region_data rd;
+  /* Multiplying by 0 or 1 needs no tables; those cases are handed to the generic helpers. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+  /* Request 64-byte granularity: one loop pass handles four 16-byte vectors (16 words). */
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64);
+  gf_do_initial_region_alignment(&rd);
+
+  s32 = (uint32_t *) rd.s_start;
+  d32 = (uint32_t *) rd.d_start;
+  top = (uint32_t *) rd.d_top;
+  /* Build, in local storage only, tables[i][j] = byte j of val*(n << 4i) for every nibble value n = 0..15. */
+  v = val;
+  for (i = 0; i < 8; i++) {
+    tmp_table[0] = 0;
+    for (j = 1; j < 16; j <<= 1) {
+      for (k = 0; k < j; k++) {
+        tmp_table[k^j] = (v ^ tmp_table[k]);
+      }
+      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+    }
+    for (j = 0; j < 4; j++) {
+      for (k = 0; k < 16; k++) {
+        btable[k] = (uint8_t) tmp_table[k];
+        tmp_table[k] >>= 8;
+      }
+      tables[i][j] = _mm_loadu_si128((__m128i *) btable);
+    }
+  }
+  /* mask1: low nibble of every byte (shuffle index); mask8: low byte of every 16-bit lane (used by the transpose). */
+  mask1 = _mm_set1_epi8(0xf);
+  mask8 = _mm_set1_epi16(0xff);
+
+  if (xor) {
+    while (d32 != top) {
+      v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      /* Transpose, round 1: split each 16-bit lane into its high byte (p*) and low byte (tv*), then repack. */
+      p0 = _mm_srli_epi16(v0, 8);
+      p1 = _mm_srli_epi16(v1, 8);
+      p2 = _mm_srli_epi16(v2, 8);
+      p3 = _mm_srli_epi16(v3, 8);
+
+      tv0 = _mm_and_si128(v0, mask8);
+      tv1 = _mm_and_si128(v1, mask8);
+      tv2 = _mm_and_si128(v2, mask8);
+      tv3 = _mm_and_si128(v3, mask8);
+
+      v0 = _mm_packus_epi16(p1, p0);
+      v1 = _mm_packus_epi16(tv1, tv0);
+      v2 = _mm_packus_epi16(p3, p2);
+      v3 = _mm_packus_epi16(tv3, tv2);
+      /* Round 2 of the same split/pack: afterwards v0 = byte 3 of all 16 words, v1 = byte 2, v2 = byte 1, v3 = byte 0. */
+      p0 = _mm_srli_epi16(v0, 8);
+      p1 = _mm_srli_epi16(v1, 8);
+      p2 = _mm_srli_epi16(v2, 8);
+      p3 = _mm_srli_epi16(v3, 8);
+
+      tv0 = _mm_and_si128(v0, mask8);
+      tv1 = _mm_and_si128(v1, mask8);
+      tv2 = _mm_and_si128(v2, mask8);
+      tv3 = _mm_and_si128(v3, mask8);
+
+      v0 = _mm_packus_epi16(p2, p0);
+      v1 = _mm_packus_epi16(p3, p1);
+      v2 = _mm_packus_epi16(tv2, tv0);
+      v3 = _mm_packus_epi16(tv3, tv1);
+      /* Nibble lookups: v0 (byte 3) uses tables 6/7 down to v3 (byte 0) using tables 0/1; pj collects product byte j. */
+      si = _mm_and_si128(v0, mask1);
+      p0 = _mm_shuffle_epi8(tables[6][0], si);
+      p1 = _mm_shuffle_epi8(tables[6][1], si);
+      p2 = _mm_shuffle_epi8(tables[6][2], si);
+      p3 = _mm_shuffle_epi8(tables[6][3], si);
+      
+      v0 = _mm_srli_epi32(v0, 4);
+      si = _mm_and_si128(v0, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
+  
+      si = _mm_and_si128(v1, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
+      
+      v1 = _mm_srli_epi32(v1, 4);
+      si = _mm_and_si128(v1, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
+  
+      si = _mm_and_si128(v2, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
+      
+      v2 = _mm_srli_epi32(v2, 4);
+      si = _mm_and_si128(v2, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
+  
+      si = _mm_and_si128(v3, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si));
+      
+      v3 = _mm_srli_epi32(v3, 4);
+      si = _mm_and_si128(v3, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
+      /* Interleave the four product byte-planes back into 32-bit words (two rounds of byte unpacks). */
+      tv0 = _mm_unpackhi_epi8(p1, p3);
+      tv1 = _mm_unpackhi_epi8(p0, p2);
+      tv2 = _mm_unpacklo_epi8(p1, p3);
+      tv3 = _mm_unpacklo_epi8(p0, p2);
+
+      p0 = _mm_unpackhi_epi8(tv1, tv0);
+      p1 = _mm_unpacklo_epi8(tv1, tv0);
+      p2 = _mm_unpackhi_epi8(tv3, tv2);
+      p3 = _mm_unpacklo_epi8(tv3, tv2);
+      /* XOR mode: fold the existing destination contents into the products before storing. */
+      v0 = _mm_load_si128 ((__m128i *) d32);
+      v1 = _mm_load_si128 ((__m128i *) (d32+4));
+      v2 = _mm_load_si128 ((__m128i *) (d32+8));
+      v3 = _mm_load_si128 ((__m128i *) (d32+12));
+  
+      p0 = _mm_xor_si128(p0, v0);
+      p1 = _mm_xor_si128(p1, v1);
+      p2 = _mm_xor_si128(p2, v2);
+      p3 = _mm_xor_si128(p3, v3);
+
+      _mm_store_si128((__m128i *) d32, p0);
+      _mm_store_si128((__m128i *) (d32+4), p1);
+      _mm_store_si128((__m128i *) (d32+8), p2);
+      _mm_store_si128((__m128i *) (d32+12), p3);
+      d32 += 16;
+    } 
+  } else {
+    while (d32 != top) {
+      v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
+      /* Same transpose and lookups as the XOR loop above; only the final merge with dest is omitted. */
+      p0 = _mm_srli_epi16(v0, 8);
+      p1 = _mm_srli_epi16(v1, 8);
+      p2 = _mm_srli_epi16(v2, 8);
+      p3 = _mm_srli_epi16(v3, 8);
+      
+      tv0 = _mm_and_si128(v0, mask8);
+      tv1 = _mm_and_si128(v1, mask8);
+      tv2 = _mm_and_si128(v2, mask8);
+      tv3 = _mm_and_si128(v3, mask8);
+      
+      v0 = _mm_packus_epi16(p1, p0);
+      v1 = _mm_packus_epi16(tv1, tv0);
+      v2 = _mm_packus_epi16(p3, p2);
+      v3 = _mm_packus_epi16(tv3, tv2);
+      
+      p0 = _mm_srli_epi16(v0, 8);
+      p1 = _mm_srli_epi16(v1, 8);
+      p2 = _mm_srli_epi16(v2, 8);
+      p3 = _mm_srli_epi16(v3, 8);
+     
+      tv0 = _mm_and_si128(v0, mask8);
+      tv1 = _mm_and_si128(v1, mask8);
+      tv2 = _mm_and_si128(v2, mask8);
+      tv3 = _mm_and_si128(v3, mask8);
+      
+      v0 = _mm_packus_epi16(p2, p0);
+      v1 = _mm_packus_epi16(p3, p1);
+      v2 = _mm_packus_epi16(tv2, tv0);
+      v3 = _mm_packus_epi16(tv3, tv1);
+      
+      si = _mm_and_si128(v0, mask1);
+      p0 = _mm_shuffle_epi8(tables[6][0], si);
+      p1 = _mm_shuffle_epi8(tables[6][1], si);
+      p2 = _mm_shuffle_epi8(tables[6][2], si);
+      p3 = _mm_shuffle_epi8(tables[6][3], si);
+      
+      v0 = _mm_srli_epi32(v0, 4);
+      si = _mm_and_si128(v0, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
+  
+      si = _mm_and_si128(v1, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
+      
+      v1 = _mm_srli_epi32(v1, 4);
+      si = _mm_and_si128(v1, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
+  
+      si = _mm_and_si128(v2, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
+      
+      v2 = _mm_srli_epi32(v2, 4);
+      si = _mm_and_si128(v2, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
+  
+      si = _mm_and_si128(v3, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si));
+      
+      v3 = _mm_srli_epi32(v3, 4);
+      si = _mm_and_si128(v3, mask1);
+      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
+      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
+      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
+      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); 
+  
+      tv0 = _mm_unpackhi_epi8(p1, p3);
+      tv1 = _mm_unpackhi_epi8(p0, p2);
+      tv2 = _mm_unpacklo_epi8(p1, p3);
+      tv3 = _mm_unpacklo_epi8(p0, p2);
+      
+      p0 = _mm_unpackhi_epi8(tv1, tv0);
+      p1 = _mm_unpacklo_epi8(tv1, tv0);
+      p2 = _mm_unpackhi_epi8(tv3, tv2);
+      p3 = _mm_unpacklo_epi8(tv3, tv2);
+      
+      _mm_store_si128((__m128i *) d32, p0);
+      _mm_store_si128((__m128i *) (d32+4), p1);
+      _mm_store_si128((__m128i *) (d32+8), p2);
+      _mm_store_si128((__m128i *) (d32+12), p3);
+      d32 += 16;
+    } 
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+/* Configure split-table multiplication for w=32 from h->arg1/h->arg2 and CPU features; returns 1 on success, 0 if unsupported. */
+static 
+int gf_w32_split_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_split_2_32_lazy_data *ld2;
+  struct gf_split_4_32_lazy_data *ld4;
+  struct gf_w32_split_8_8_data *d8;
+  struct gf_split_8_32_lazy_data *d32;
+  struct gf_split_16_32_lazy_data *d16;
+  uint32_t p, basep;
+  int i, j, exp;
+
+  h = (gf_internal_t *) gf->scratch;
+
+  /* Default inverse routine; nothing below overrides it. */
+  SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
+
+  /* JSP: First handle single multiplication:
+     If args == 8, then we're doing split 8 8.
+     Otherwise, if PCLMUL, we use the carry-less variant that fits the polynomial (bytwo_p if none does).
+     Otherwise, we use bytwo_p.
+   */
+  if (h->arg1 == 8 && h->arg2 == 8) {
+    SET_FUNCTION(gf,multiply,w32,gf_w32_split_8_8_multiply)
+#if defined(INTEL_SSE4_PCLMUL)
+  } else if (gf_cpu_supports_intel_pclmul) {
+    if ((0xfffe0000 & h->prim_poly) == 0){
+      SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
+    } else if ((0xffc00000 & h->prim_poly) == 0){
+      SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3)
+    } else if ((0xfe000000 & h->prim_poly) == 0){
+     SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
+    } else {  /* no CLM variant fits this polynomial: fall back instead of leaving multiply unset */
+      SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply)
+    }
+#endif
+  } else {
+    SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply)
+  }
+
+  /* Easy cases: 16/32 and 2/32 */
+
+  if ((h->arg1 == 16 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 16)) {
+    d16 = (struct gf_split_16_32_lazy_data *) h->private;
+    d16->last_value = 0;
+    SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_16_32_lazy_multiply_region)
+    return 1;
+  }
+  /* 2/32: SSSE3 routine when usable; otherwise the scalar one, which cannot honor an explicit GF_REGION_SIMD request. */
+  if ((h->arg1 == 2 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 2)) {
+    ld2 = (struct gf_split_2_32_lazy_data *) h->private;
+    ld2->last_value = 0;
+    #ifdef INTEL_SSSE3
+      if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) {
+        SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_sse_multiply_region)
+      } else {
+    #endif
+        SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_multiply_region)
+        if(h->region_type & GF_REGION_SIMD) return 0;
+    #ifdef INTEL_SSSE3
+      }
+    #endif
+    return 1;
+  } 
+
+  /* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */
+  /* The DEFAULT test is on h->mult_type, so it is spelled GF_MULT_DEFAULT (as in the 8/32 test below and in
+     gf_w32_scratch_size) rather than the region-type constant GF_REGION_DEFAULT. */
+  if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) ||
+      ((gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) && h->mult_type == GF_MULT_DEFAULT)) {
+    ld4 = (struct gf_split_4_32_lazy_data *) h->private;
+    ld4->last_value = 0;
+    if ((h->region_type & GF_REGION_NOSIMD) || !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) {
+      SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_multiply_region)
+    } else if (gf_cpu_supports_arm_neon) {
+#ifdef ARM_NEON
+      gf_w32_neon_split_init(gf);
+#endif
+    } else if (h->region_type & GF_REGION_ALTMAP) {
+#ifdef INTEL_SSSE3
+      SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_altmap_multiply_region)
+#endif
+    } else {
+#ifdef INTEL_SSSE3
+      SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_multiply_region)
+#endif
+    }
+    return 1;
+  } 
+
+  /* 8/32 or Default + no SSE */
+
+  if ((h->arg1 == 8 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 8) || 
+       h->mult_type == GF_MULT_DEFAULT) {
+    d32 = (struct gf_split_8_32_lazy_data *) h->private;
+    d32->last_value = 0;
+    SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_8_32_lazy_multiply_region)
+    return 1;
+  }
+
+  /* Finally, if args == 8, then we have to set up the tables here. */
+  /* Assuming GF_MULTBY_TWO doubles in the field: tables[exp][i][j] = i * j * x^(8*exp), built by XOR/doubling. */
+  if (h->arg1 == 8 && h->arg2 == 8) {
+    d8 = (struct gf_w32_split_8_8_data *) h->private;
+    d8->last_value = 0;
+    SET_FUNCTION(gf,multiply,w32,gf_w32_split_8_8_multiply)
+    SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_8_32_lazy_multiply_region)
+    basep = 1;
+    for (exp = 0; exp < 7; exp++) {
+      for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0;
+      for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0;
+      d8->tables[exp][1][1] = basep;
+      for (i = 2; i < 256; i++) {
+        if (i&1) {
+          p = d8->tables[exp][i^1][1];
+          d8->tables[exp][i][1] = p ^ basep;
+        } else {
+          p = d8->tables[exp][i>>1][1];
+          d8->tables[exp][i][1] = GF_MULTBY_TWO(p);
+        }
+      }
+      for (i = 1; i < 256; i++) {
+        p = d8->tables[exp][i][1];
+        for (j = 1; j < 256; j++) {
+          if (j&1) {
+            d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p;
+          } else {
+            d8->tables[exp][i][j] = GF_MULTBY_TWO(d8->tables[exp][i][j>>1]);
+          }
+        }
+      }
+      for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep);
+    }
+    return 1;
+  }
+
+  /* If we get here, then the arguments were bad. */
+
+  return 0;
+}
+/* Set up the GROUP multiplication method for w=32 (arg1 = g_s, arg2 = g_r); always returns 1. */
+static
+int gf_w32_group_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w32_group_data *gd = (struct gf_w32_group_data *) h->private;
+  uint32_t g_s = h->arg1;
+  uint32_t g_r = h->arg2;
+  uint32_t word, bit, red, slot;
+
+  /* The shift table starts at gd->memory; the reduce table follows its 1 << g_s entries. */
+  gd->shift = (uint32_t *) (&(gd->memory));
+  gd->reduce = gd->shift + (1 << g_s);
+
+  /* rmask: (1 << g_r) - 1 moved up by 32 bits (rmask is presumably a 64-bit field -- confirm). */
+  gd->rmask = (1 << g_r) - 1;
+  gd->rmask <<= 32;
+
+  /* tshift = 32 - (32 % g_s, or g_s when that is 0), then (tshift-1) rounded down to a multiple of g_r. */
+  gd->tshift = 32 % g_s;
+  if (gd->tshift == 0) gd->tshift = g_s;
+  gd->tshift = (32 - gd->tshift);
+  gd->tshift = ((gd->tshift-1)/g_r) * g_r;
+
+  /* For every g_r-bit pattern, XOR together the shifted polynomials it selects; the result is filed
+     under the index formed from the selected bits and the polynomial bits shifted out past bit 31. */
+  gd->reduce[0] = 0;
+  for (word = 0; word < ((uint32_t)1 << g_r); word++) {
+    red = 0;
+    slot = 0;
+    for (bit = 0; bit < g_r; bit++) {
+      if (!(word & (1 << bit))) continue;
+      red ^= (h->prim_poly << bit);
+      slot ^= (1 << bit);
+      slot ^= (h->prim_poly >> (32-bit));
+    }
+    gd->reduce[slot] = red;
+  }
+
+  SET_FUNCTION(gf,divide,w32,NULL)
+  SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
+  if (g_s != g_r) {
+    SET_FUNCTION(gf,multiply,w32,gf_w32_group_multiply)
+    SET_FUNCTION(gf,multiply_region,w32,gf_w32_group_multiply_region)
+  } else {
+    SET_FUNCTION(gf,multiply,w32,gf_w32_group_s_equals_r_multiply)
+    SET_FUNCTION(gf,multiply_region,w32,gf_w32_group_s_equals_r_multiply_region) 
+  }
+  return 1;
+}
+/* Multiply a*b in the composite field by calling the base field's multiply routine; each operand is
+   split as hi*x + lo and the product is reduced with x^2 = s*x + 1, where s = h->prim_poly. */
+static
+uint32_t
+gf_w32_composite_multiply_recursive(gf_t *gf, uint32_t a, uint32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint32_t a_lo = a & 0x0000ffff, a_hi = a >> 16;
+  uint32_t b_lo = b & 0x0000ffff, b_hi = b >> 16;
+  uint32_t hh, upper, lower;
+
+  hh = base_gf->multiply.w32(base_gf, a_hi, b_hi);
+  upper = base_gf->multiply.w32(base_gf, a_hi, b_lo);
+  upper ^= base_gf->multiply.w32(base_gf, a_lo, b_hi);
+  upper ^= base_gf->multiply.w32(base_gf, hh, h->prim_poly);
+  lower = base_gf->multiply.w32(base_gf, a_lo, b_lo) ^ hh;
+  return (upper << 16) | lower;
+}
+
+/* Composite-field multiply a*b through the base field's log/antilog tables (GF_W16_INLINE_MULT).
+   JSP: This could be made faster. Someday, when I'm bored. */
+static
+uint32_t
+gf_w32_composite_multiply_inline(gf_t *gf, uint32_t a, uint32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w32_composite_data *cd = (struct gf_w32_composite_data *) h->private;
+  uint16_t *log = cd->log;
+  uint16_t *alog = cd->alog;
+  uint32_t a_lo = a & 0x0000ffff, a_hi = a >> 16;
+  uint32_t b_lo = b & 0x0000ffff, b_hi = b >> 16;
+  uint32_t hh, upper, lower;
+
+  /* Each macro use is kept as the whole right-hand side of its assignment, so the result does
+     not depend on how GF_W16_INLINE_MULT is parenthesized. */
+  hh = GF_W16_INLINE_MULT(log, alog, a_hi, b_hi);
+
+  /* x coefficient: a_hi*b_lo ^ a_lo*b_hi ^ (a_hi*b_hi)*s, with s = h->prim_poly */
+  upper = GF_W16_INLINE_MULT(log, alog, a_hi, b_lo);
+  upper ^= GF_W16_INLINE_MULT(log, alog, a_lo, b_hi);
+  upper ^= GF_W16_INLINE_MULT(log, alog, hh, h->prim_poly);
+
+  /* constant coefficient: a_lo*b_lo ^ a_hi*b_hi */
+  lower = GF_W16_INLINE_MULT(log, alog, a_lo, b_lo);
+  lower ^= hh;
+  return (upper << 16) ^ lower;
+}
+
+/*
+ * Composite field division trick (explained in 2007 tech report)
+ *
+ * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1
+ *
+ * let c = b^-1
+ *
+ * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0)
+ *
+ * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1
+ *
+ * let d = b1c1 and d+1 = b0c0
+ *
+ * solve s*b1c1+b1c0+b0c1 = 0
+ *
+ * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1
+ *
+ * c0 = (d+1)b0^-1
+ * c1 = d*b1^-1
+ * (special cases: b0 == 0 gives c1 = b1^-1, c0 = s*b1^-1; b1 == 0 gives c0 = b0^-1, c1 = 0)
+ * a / b = a * c
+ */
+/* Inverse of a (called b above; halves a1:a0) with s = h->prim_poly, using the base field's inverse and multiply. */
+static
+uint32_t
+gf_w32_composite_inverse(gf_t *gf, uint32_t a)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint16_t a0 = a & 0x0000ffff;
+  uint16_t a1 = (a & 0xffff0000) >> 16;
+  uint16_t c0, c1, d, tmp;
+  uint32_t c;
+  uint16_t a0inv, a1inv;
+
+  if (a0 == 0) {
+    a1inv = base_gf->inverse.w32(base_gf, a1);
+    c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
+    c1 = a1inv;
+  } else if (a1 == 0) {
+    c0 = base_gf->inverse.w32(base_gf, a0);
+    c1 = 0;
+  } else {
+    a1inv = base_gf->inverse.w32(base_gf, a1);
+    a0inv = base_gf->inverse.w32(base_gf, a0);
+    /* d starts as a1*a0^-1; the same product is reused as the first term of tmp. */
+    d = base_gf->multiply.w32(base_gf, a1, a0inv);
+
+    tmp = (d ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly);
+    tmp = base_gf->inverse.w32(base_gf, tmp);
+
+    d = base_gf->multiply.w32(base_gf, d, tmp);
+
+    c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv);
+    c1 = base_gf->multiply.w32(base_gf, d, a1inv);
+  }
+  /* Widen c1 before shifting: a uint16_t promotes to int, and int << 16 overflows once bit 15 is set. */
+  c = c0 | ((uint32_t) c1 << 16);
+
+  return c;
+}
+/* Region multiply for the composite field on the standard layout: every 32-bit word of src is multiplied by val. */
+static
+void
+gf_w32_composite_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  struct gf_w32_composite_data *cd = (struct gf_w32_composite_data *) h->private;
+  uint16_t *log = cd->log;
+  uint16_t *alog = cd->alog;
+  uint32_t b0 = val & 0x0000ffff;        /* low half of the multiplier */
+  uint32_t b1 = val >> 16;               /* high half of the multiplier */
+  uint32_t *s32, *d32, *top;
+  uint16_t a0, a1, a1b1;
+  uint32_t prod;
+  gf_region_data rd;
+
+  /*
+   * Parameters:
+   *   gf        - composite field; gf->scratch holds the gf_internal_t
+   *   src, dest - source and destination regions of `bytes` bytes
+   *   val       - composite-field multiplier, split as b1:b0
+   *   xor       - nonzero: XOR each product into dest; zero: overwrite dest
+   *
+   * Each source word a1:a0 yields
+   *   high half = a1*b0 ^ a0*b1 ^ (a1*b1)*s      (s = h->prim_poly)
+   *   low half  = a0*b0 ^ a1*b1
+   * with all half-word products taken in the base field.
+   *
+   * val == 0 is delegated to gf_multby_zero.
+   *
+   * NOTE(review): unlike the SIMD region routines, no gf_do_initial/final_region_alignment
+   * call is made; the loop covers exactly [rd.d_start, rd.d_top) as computed by
+   * gf_set_region_data with an alignment of 4 -- confirm that always spans the whole
+   * region.
+   */
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+
+  s32 = rd.s_start;
+  d32 = rd.d_start;
+  top = rd.d_top;
+
+  if (log == NULL) {
+    /* No log table available: every half-word product is a call into the base field. */
+    for (; d32 < top; s32++, d32++) {
+      a0 = *s32 & 0x0000ffff;
+      a1 = (*s32 & 0xffff0000) >> 16;
+      a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
+
+      prod = base_gf->multiply.w32(base_gf, a1, b0);
+      prod ^= base_gf->multiply.w32(base_gf, a0, b1);
+      prod ^= base_gf->multiply.w32(base_gf, a1b1, h->prim_poly);
+      prod <<= 16;
+      prod |= (base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1);
+
+      if (xor) {
+        *d32 ^= prod;
+      } else {
+        *d32 = prod;
+      }
+    }
+    return;
+  }
+
+  /* Log/antilog tables available: same formula through GF_W16_INLINE_MULT.  Each macro use is
+     kept as the whole right-hand side of its assignment. */
+  for (; d32 < top; s32++, d32++) {
+    a0 = *s32 & 0x0000ffff;
+    a1 = (*s32 & 0xffff0000) >> 16;
+    a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1);
+
+    prod = GF_W16_INLINE_MULT(log, alog, a1, b0);
+    prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1);
+    prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly);
+    prod <<= 16;
+    prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0);
+    prod ^= a1b1;
+
+    if (xor) {
+      *d32 ^= prod;
+    } else {
+      *d32 = prod;
+    }
+  }
+}
+/* ALTMAP region multiply for the composite field, expressed as five base-field region multiplies. */
+static
+void
+gf_w32_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint16_t b_lo = val & 0x0000ffff;
+  uint16_t b_hi = val >> 16;
+  uint8_t *src_lo, *src_hi, *dst_lo, *dst_hi, *dst_end;
+  int half;
+  gf_region_data rd;
+
+  /* JSP: I want the two pointers aligned wrt each other on 16 byte
+     boundaries.  So I'm going to make sure that the area on
+     which the two operate is a multiple of 32. Of course, that
+     junks up the mapping, but so be it -- that's why we have extract_word.... */
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+
+  /* The aligned area is cut in two: the first half is used as the low 16-bit parts, the second as the high parts. */
+  src_lo = (uint8_t *) rd.s_start;
+  dst_lo = (uint8_t *) rd.d_start;
+  dst_end = (uint8_t *) rd.d_top;
+  half = (dst_end - dst_lo)/2;
+  src_hi = src_lo + half;
+  dst_hi = dst_lo + half;
+
+  /* dst_lo = lo*b_lo ^ hi*b_hi;  dst_hi = lo*b_hi ^ hi*b_lo ^ hi*(s*b_hi), s = h->prim_poly.
+     NOTE(review): dst_lo is written before src_lo is read again for dst_hi -- confirm src and dest never alias. */
+  base_gf->multiply_region.w32(base_gf, src_lo, dst_lo, b_lo, half, xor);
+  base_gf->multiply_region.w32(base_gf, src_hi, dst_lo, b_hi, half, 1);
+  base_gf->multiply_region.w32(base_gf, src_lo, dst_hi, b_hi, half, xor);
+  base_gf->multiply_region.w32(base_gf, src_hi, dst_hi, b_lo, half, 1);
+  base_gf->multiply_region.w32(base_gf, src_hi, dst_hi, base_gf->multiply.w32(base_gf, h->prim_poly, b_hi), half, 1);
+  gf_do_final_region_alignment(&rd);
+}
+/* Install the COMPOSITE method for w=32 on top of h->base_gf; returns 0 if no base field was supplied, else 1. */
+static
+int gf_w32_composite_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_w32_composite_data *cd = (struct gf_w32_composite_data *) h->private;
+
+  if (h->base_gf == NULL) return 0;
+
+  /* Fetch whatever log/antilog tables the base field exposes; a NULL log table selects the function-call multiply. */
+  cd->log = gf_w16_get_log_table(h->base_gf);
+  cd->alog = gf_w16_get_mult_alog_table(h->base_gf);
+
+  if (cd->log != NULL) {
+    SET_FUNCTION(gf,multiply,w32,gf_w32_composite_multiply_inline) 
+  } else {
+    SET_FUNCTION(gf,multiply,w32,gf_w32_composite_multiply_recursive)
+  }
+
+  if ((h->region_type & GF_REGION_ALTMAP) == 0) {
+    SET_FUNCTION(gf,multiply_region,w32,gf_w32_composite_multiply_region)
+  } else {
+    SET_FUNCTION(gf,multiply_region,w32,gf_w32_composite_multiply_region_alt)
+  }
+
+  SET_FUNCTION(gf,divide,w32,NULL)
+  SET_FUNCTION(gf,inverse,w32,gf_w32_composite_inverse)
+  return 1;
+}
+/* Bytes of scratch space a w=32 field needs for the given multiplication type: sizeof(gf_internal_t)
+   plus the method's private data.  Returns 0 for an unrecognized mult_type or unsupported split
+   arguments.  region_type and divide_type are accepted but not consulted. */
+int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
+{
+  if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) {
+    return sizeof(gf_internal_t) + sizeof(struct gf_w32_bytwo_data) + 64;
+  }
+
+  if (mult_type == GF_MULT_GROUP) {
+    /* Room for two uint32_t tables with 1 << arg1 and 1 << arg2 entries after the struct. */
+    return sizeof(gf_internal_t) + sizeof(struct gf_w32_group_data) +
+             sizeof(uint32_t) * (1 << arg1) +
+             sizeof(uint32_t) * (1 << arg2) + 64;
+  }
+
+  if (mult_type == GF_MULT_CARRY_FREE || mult_type == GF_MULT_SHIFT) {
+    return sizeof(gf_internal_t);
+  }
+
+  if (mult_type == GF_MULT_CARRY_FREE_GK) {
+    return sizeof(gf_internal_t) + sizeof(uint64_t)*2;
+  }
+
+  if (mult_type == GF_MULT_COMPOSITE) {
+    return sizeof(gf_internal_t) + sizeof(struct gf_w32_composite_data) + 64;
+  }
+
+  if (mult_type != GF_MULT_DEFAULT && mult_type != GF_MULT_SPLIT_TABLE) {
+    return 0;
+  }
+
+  /* GF_MULT_DEFAULT and GF_MULT_SPLIT_TABLE: the split arguments (in either order) pick the table struct. */
+  if (arg1 == 8 && arg2 == 8) {
+    return sizeof(gf_internal_t) + sizeof(struct gf_w32_split_8_8_data) + 64;
+  }
+  if ((arg1 == 16 && arg2 == 32) || (arg2 == 16 && arg1 == 32)) {
+    return sizeof(gf_internal_t) + sizeof(struct gf_split_16_32_lazy_data) + 64;
+  }
+  if ((arg1 == 2 && arg2 == 32) || (arg2 == 2 && arg1 == 32)) {
+    return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64;
+  }
+
+  /* 8/32, or the default method when neither SSSE3 nor NEON is available. */
+  if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) ||
+      (mult_type == GF_MULT_DEFAULT && !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon))) {
+    return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64;
+  }
+  /* 4/32, or the default method when SIMD support is present. */
+  if ((arg1 == 4 && arg2 == 32) || (arg2 == 4 && arg1 == 32) || mult_type == GF_MULT_DEFAULT) {
+    return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64;
+  }
+
+  /* SPLIT_TABLE with any other argument pair is not supported. */
+  return 0;
+}
+/* Initialize gf for w=32: choose a default polynomial when none is set, run the method-specific init, then fill in divide/inverse/extract_word hooks. Returns 1 on success, 0 on failure. */
+int gf_w32_init(gf_t *gf)
+{
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  
+  /* Allen: set default primitive polynomial / irreducible polynomial if needed */
+
+  if (h->prim_poly == 0) {
+    if (h->mult_type == GF_MULT_COMPOSITE) { 
+      h->prim_poly = gf_composite_get_default_poly(h->base_gf);
+      if (h->prim_poly == 0) return 0; /* This shouldn't happen */
+    } else { 
+
+      /* Allen: use the following primitive polynomial to make carryless multiply work more efficiently for GF(2^32).*/
+
+      /* h->prim_poly = 0xc5; */
+
+      /* Allen: The following is the traditional primitive polynomial for GF(2^32) */
+
+      h->prim_poly = 0x400007;
+    } 
+  }
+
+  /* No leading one */
+
+  if(h->mult_type != GF_MULT_COMPOSITE) h->prim_poly &= 0xffffffff;
+    
+  SET_FUNCTION(gf,multiply,w32,NULL)
+  SET_FUNCTION(gf,divide,w32,NULL)
+  SET_FUNCTION(gf,inverse,w32,NULL)
+  SET_FUNCTION(gf,multiply_region,w32,NULL)
+
+  switch(h->mult_type) {
+    case GF_MULT_CARRY_FREE:    if (gf_w32_cfm_init(gf) == 0) return 0; break;
+    case GF_MULT_CARRY_FREE_GK: if (gf_w32_cfmgk_init(gf) == 0) return 0; break;
+    case GF_MULT_SHIFT:         if (gf_w32_shift_init(gf) == 0) return 0; break;
+    case GF_MULT_COMPOSITE:     if (gf_w32_composite_init(gf) == 0) return 0; break;
+    case GF_MULT_DEFAULT: 
+    case GF_MULT_SPLIT_TABLE:   if (gf_w32_split_init(gf) == 0) return 0; break;
+    case GF_MULT_GROUP:         if (gf_w32_group_init(gf) == 0) return 0; break;
+    case GF_MULT_BYTWO_p:   
+    case GF_MULT_BYTWO_b:       if (gf_w32_bytwo_init(gf) == 0) return 0; break;
+    default: return 0;
+  }
+  if (h->divide_type == GF_DIVIDE_EUCLID) {
+    SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse)
+    SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
+  } else if (h->divide_type == GF_DIVIDE_MATRIX) {
+    SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse)
+    SET_FUNCTION(gf,inverse,w32,gf_w32_matrix)
+  }
+  /* Whichever of divide/inverse is still missing is derived from the other one. */
+  if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) {
+    SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse)
+  }
+  if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) {
+    SET_FUNCTION(gf,inverse,w32,gf_w32_inverse_from_divide)
+  }
+  if (h->region_type == GF_REGION_CAUCHY) {
+    SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
+    SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
+  } else if (h->region_type & GF_REGION_ALTMAP) {
+    if (h->mult_type == GF_MULT_COMPOSITE) {
+      SET_FUNCTION(gf,extract_word,w32,gf_w32_composite_extract_word)
+    } else {
+      SET_FUNCTION(gf,extract_word,w32,gf_w32_split_extract_word)
+    }
+  } else {
+    SET_FUNCTION(gf,extract_word,w32,gf_w32_extract_word)
+  }
+  return 1;
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w4.c b/src/erasure-code/jerasure/gf-complete/src/gf_w4.c
new file mode 100644
index 000000000..3a7b95316
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_w4.c
@@ -0,0 +1,2047 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w4.c
+ *
+ * Routines for 4-bit Galois fields
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w4.h"
+#include "gf_cpu.h"
+/* AB2: shift the packed value b left by one bit (masked with am1) and XOR ip into the elements whose am2 bit was set; t1 and t2 are scratch. */
+#define AB2(ip, am1 ,am2, b, t1, t2) {\
+  t1 = (b << 1) & am1;\
+  t2 = b & am2; \
+  t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \
+  b = (t1 ^ (t2 & ip));}
+/* SSE_AB2: 128-bit counterpart of AB2 operating on va; the constant 0x88 stands where AB2 takes its am2 mask. */
+// ToDo(KMG/JSP): Why is 0x88 hard-coded? (NOTE(review): presumably the w=4 value of mask2 -- confirm)
+#define SSE_AB2(pp, m1, va, t1, t2) {\
+          t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
+          t2 = _mm_and_si128(va, _mm_set1_epi8(0x88)); \
+          t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
+          va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
+
+/* ------------------------------------------------------------
+   JSP: These are basic and work from multiple implementations.
+ */
+/* Multiplicative inverse of a, obtained by dividing 1 by a with the field's divide hook. */
+static inline
+gf_val_32_t gf_w4_inverse_from_divide (gf_t *gf, gf_val_32_t a)
+{
+  const gf_val_32_t one = 1;
+  return gf->divide.w32(gf, one, a);
+}
+/* a / b computed as a * inverse(b) using the field's inverse and multiply hooks. */
+static inline
+gf_val_32_t gf_w4_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  /* Invert the divisor first, then multiply. */
+  gf_val_32_t b_inv = gf->inverse.w32(gf, b);
+  return gf->multiply.w32(gf, a, b_inv);
+}
+/* Inverse of b computed with an extended-Euclid style loop against the field's prim_poly. Returns (gf_val_32_t)-1 for b == 0. */
+static
+inline
+gf_val_32_t gf_w4_euclid (gf_t *gf, gf_val_32_t b)
+{
+  gf_val_32_t e_i, e_im1, e_ip1;
+  gf_val_32_t d_i, d_im1, d_ip1;
+  gf_val_32_t y_i, y_im1, y_ip1;
+  gf_val_32_t c_i;
+
+  if (b == 0) return -1;
+  e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
+  e_i = b;
+  d_im1 = 4;
+  for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ;
+  y_i = 1;
+  y_im1 = 0;
+
+  while (e_i != 1) {
+    e_ip1 = e_im1;
+    d_ip1 = d_im1;
+    c_i = 0;
+    /* Long division of e_im1 by e_i: c_i collects the quotient bits, e_ip1 ends up as the remainder. */
+    while (d_ip1 >= d_i) {
+      c_i ^= (1 << (d_ip1 - d_i));
+      e_ip1 ^= (e_i << (d_ip1 - d_i));
+      if (e_ip1 == 0) return 0;
+      while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
+    }
+    /* Next multiplier: y_ip1 = y_im1 XOR (c_i * y_i) in the field. */
+    y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);
+    y_im1 = y_i;
+    y_i = y_ip1;
+
+    e_im1 = e_i;
+    d_im1 = d_i;
+    e_i = e_ip1;
+    d_i = d_ip1;
+  }
+
+  return y_i;
+}
+/* Return the index-th 4-bit word of the region at start. gf and bytes are not used. */
+static 
+gf_val_32_t gf_w4_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  const uint8_t *region = (uint8_t *) start;
+  uint8_t packed;
+
+  /* Two words are packed into every byte. */
+  packed = region[index/2];
+
+  /* Even index -> low nibble, otherwise -> high nibble. */
+  if (index%2 == 0) return packed&0xf;
+  return packed >> 4;
+}
+
+/* Inverse of b via gf_bitmatrix_inverse() with width 4 and this field's polynomial. */
+static inline
+gf_val_32_t gf_w4_matrix (gf_t *gf, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) (gf->scratch);
+  return gf_bitmatrix_inverse(b, 4, h->prim_poly);
+}
+
+/* Shift-and-add product of a and b: carry-less multiply into an 8-bit accumulator, then reduce by prim_poly. */
+static
+inline
+gf_val_32_t
+gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  uint8_t poly = h->prim_poly;
+  uint8_t acc = 0;
+  int bit;
+
+  /* Carry-less multiply: XOR in a shifted copy of b for every set bit of a. */
+  for (bit = 0; bit < GF_FIELD_WIDTH; bit++) {
+    if ((a >> bit) & 1) acc ^= (b << bit);
+  }
+
+  /* Reduce from the top bit down by XORing in the shifted polynomial. */
+  for (bit = GF_FIELD_WIDTH*2-2; bit >= GF_FIELD_WIDTH; bit--) {
+    if ((acc >> bit) & 1) acc ^= (poly << (bit-GF_FIELD_WIDTH));
+  }
+  return acc;
+}
+
+/* Ben: This function works, but it is 33% slower than the normal shift mult */
+/* Product of a4 and b4 via the PCLMUL carry-less multiply, followed by one reduction step with the low five bits of prim_poly. */
+#if defined(INTEL_SSE4_PCLMUL)
+static
+inline
+gf_val_32_t
+gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
+{
+  gf_val_32_t rv = 0;
+
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         w;
+  gf_internal_t * h = gf->scratch;
+
+  a = _mm_insert_epi32 (_mm_setzero_si128(), a4, 0);
+  b = _mm_insert_epi32 (a, b4, 0);
+
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1fULL));
+
+  /* Do the initial multiply */
+
+  result = _mm_clmulepi64_si128 (a, b, 0);
+
+  /* Ben/JSP: Do prim_poly reduction once. We are guaranteed that we will only
+     have to do the reduction only once, because (w-2)/z == 1. Where
+     z is equal to the number of zeros after the leading 1.
+
+     _mm_clmulepi64_si128 is the carryless multiply operation. Here
+     _mm_srli_epi64 shifts the result to the right by 4 bits. This allows
+     us to multiply the prim_poly by the leading bits of the result. We
+     then xor the result of that operation back with the result. */
+
+  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_epi64 (result, 4), 0);
+  result = _mm_xor_si128 (result, w);
+
+  /* Extracts 32 bit value from result. */
+
+  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
+  return rv;
+}
+#endif
+/* Region multiply by val built on the field's single-element multiply; XORs into dest when xor is set. */
+static
+void
+gf_w4_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int 
+    xor)
+{
+  gf_region_data rd;
+  uint8_t *in, *out, *out_end;
+
+  /* Multiplying by 0 or 1 is delegated to the shared helpers. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  /* The region is set up with an alignment argument of 1. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
+  gf_do_initial_region_alignment(&rd);
+
+  in = (uint8_t *) rd.s_start;
+  out = (uint8_t *) rd.d_start;
+  out_end = (uint8_t *) rd.d_top;
+
+  /* Each byte holds two 4-bit words; both are multiplied and re-packed. */
+  if (xor) {
+    for (; out < out_end; out++, in++) {
+      *out ^= (gf->multiply.w32(gf, val, (*in & 0xf)) | 
+             ((gf->multiply.w32(gf, val, (*in >> 4))) << 4));
+    }
+  } else {
+    for (; out < out_end; out++, in++) {
+      *out = (gf->multiply.w32(gf, val, (*in & 0xf)) | 
+             ((gf->multiply.w32(gf, val, (*in >> 4))) << 4));
+    }
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+
+/* ------------------------------------------------------------
+  IMPLEMENTATION: LOG_TABLE: 
+
+  JSP: This is a basic log-antilog implementation.  
+       I'm not going to spend any time optimizing it because the
+       other techniques are faster for both single and region
+       operations. 
+ */
+/* Product of a and b via the log/antilog tables; 0 if either operand is 0. */
+static
+inline
+gf_val_32_t
+gf_w4_log_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_logtable_data *tables;
+  tables = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private;
+  if (a == 0 || b == 0) return 0;
+  return tables->antilog_tbl[(unsigned)(tables->log_tbl[a] + tables->log_tbl[b])];
+}
+/* a / b via the log tables: antilog of (log a - log b). Returns 0 if either operand is 0. */
+static
+inline
+gf_val_32_t
+gf_w4_log_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  struct gf_logtable_data *tables;
+  int log_diff;
+
+  if (a == 0 || b == 0) return 0;
+  tables = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private;
+  /* The difference may be negative; antilog_tbl_div is indexed with the signed value. */
+  log_diff = tables->log_tbl[a] - tables->log_tbl[b];
+  return (tables->antilog_tbl_div[log_diff]);
+}
+/* Region multiply by val with log/antilog lookups, two 4-bit words per byte; XORs into dest when xor is set. */
+static
+void 
+gf_w4_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  struct gf_logtable_data *tables;
+  uint8_t *in = (uint8_t *) src;
+  uint8_t *out = (uint8_t *) dest;
+  uint8_t log_val, hi, lo, acc;
+  int pos;
+
+  /* Multiplying by 0 or 1 is delegated to the shared helpers. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  tables = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private;
+  /* log(val) does not change across the loop. */
+  log_val = tables->log_tbl[val];
+
+  for (pos = 0; pos < bytes; pos++) {
+    acc = (xor) ? out[pos] : 0;
+    hi = (in[pos] >> GF_FIELD_WIDTH);
+    lo = (in[pos] & 0xf);
+    /* A zero nibble is skipped and contributes nothing. */
+    if (hi != 0) acc ^= (tables->antilog_tbl[log_val + tables->log_tbl[hi]] << GF_FIELD_WIDTH);
+    if (lo != 0) acc ^= tables->antilog_tbl[log_val + tables->log_tbl[lo]];
+    out[pos] = acc;
+  }
+}
+/* Build the log/antilog tables and install the log-based operations. Returns 1 on success; returns 0 with _gf_errno = GF_E_LOGPOLY when prim_poly is not primitive. */
+static 
+int gf_w4_log_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_logtable_data *ltd;
+  int i, b;
+
+  h = (gf_internal_t *) gf->scratch;
+  ltd = h->private;
+
+  for (i = 0; i < GF_FIELD_SIZE; i++)
+    ltd->log_tbl[i]=0;
+  /* Offset view of the antilog table, used by division where the log difference can be negative. */
+  ltd->antilog_tbl_div = ltd->antilog_tbl + (GF_FIELD_SIZE-1);
+  b = 1;
+  i = 0;
+  do {
+    if (ltd->log_tbl[b] != 0 && i != 0) {
+      fprintf(stderr, "Cannot construct log table: Polynomial is not primitive.\n\n");
+      _gf_errno = GF_E_LOGPOLY;  /* report the same error code as the check after the loop */
+      return 0;
+    }
+    ltd->log_tbl[b] = i;
+    ltd->antilog_tbl[i] = b;
+    ltd->antilog_tbl[i+GF_FIELD_SIZE-1] = b;
+    b <<= 1;
+    i++;
+    if (b & GF_FIELD_SIZE) b = b ^ h->prim_poly;
+  } while (b != 1);
+
+  if (i != GF_FIELD_SIZE - 1) {
+    _gf_errno = GF_E_LOGPOLY;
+    return 0;
+  }
+  SET_FUNCTION(gf,inverse,w32,gf_w4_inverse_from_divide)
+  SET_FUNCTION(gf,divide,w32,gf_w4_log_divide)
+  SET_FUNCTION(gf,multiply,w32,gf_w4_log_multiply)
+  SET_FUNCTION(gf,multiply_region,w32,gf_w4_log_multiply_region)
+  return 1;
+}
+
+/* ------------------------------------------------------------
+  IMPLEMENTATION: SINGLE TABLE: JSP. 
+ */
+/* Product of a and b read straight from the precomputed multiplication table. */
+static
+inline
+gf_val_32_t
+gf_w4_single_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) (gf->scratch);
+  struct gf_single_table_data *tbl = (struct gf_single_table_data *) h->private;
+
+  return tbl->mult[a][b];
+}
+/* Quotient a / b read straight from the precomputed division table. */
+static
+inline
+gf_val_32_t
+gf_w4_single_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) (gf->scratch);
+  struct gf_single_table_data *tbl = (struct gf_single_table_data *) h->private;
+
+  return tbl->div[a][b];
+}
+/* Region multiply by val using the multiplication table, two 4-bit words per byte; XORs into dest when xor is set. */
+static
+void 
+gf_w4_single_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  struct gf_single_table_data *tbl;
+  uint8_t *in, *out;
+  uint8_t hi, lo, acc;
+  int pos;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  tbl = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+  in = (uint8_t *) src;
+  out = (uint8_t *) dest;
+
+  for (pos = 0; pos < bytes; pos++) {
+    /* Start from the existing destination byte only when accumulating. */
+    acc = (xor) ? out[pos] : 0;
+    hi = (in[pos] >> GF_FIELD_WIDTH);
+    lo = (in[pos] & 0xf);
+    acc ^= (tbl->mult[val][hi] << GF_FIELD_WIDTH);
+    acc ^= (tbl->mult[val][lo]);
+    out[pos] = acc;
+  }
+}
+/* Debug helper: prints label s followed by the 16 bytes of SSE register r in hex. Relies on a variable i declared in the enclosing scope. */
+#define MM_PRINT(s, r) { uint8_t blah[16]; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (i = 0; i < 16; i++) printf(" %02x", blah[i]); printf("\n"); }
+/* SSSE3 region multiply by val: the table row for val is used as a byte-shuffle lookup for low and high nibbles, 16 bytes per iteration. */
+#ifdef INTEL_SSSE3
+static
+void 
+gf_w4_single_table_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_region_data rd;
+  uint8_t *base, *sptr, *dptr, *top;
+  __m128i  tl, loset, r, va, th;
+  
+  struct gf_single_table_data *std;
+    
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+
+  std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+  base = (uint8_t *) std->mult;
+  base += (val << GF_FIELD_WIDTH);
+  /* base now addresses the table row for val (rows are 1 << GF_FIELD_WIDTH bytes apart). */
+  gf_do_initial_region_alignment(&rd);
+  /* tl: the row as loaded; th: the same row shifted left by 4 bits for the high nibble. */
+  tl = _mm_loadu_si128((__m128i *)base);
+  th = _mm_slli_epi64(tl, 4);
+  loset = _mm_set1_epi8 (0x0f);
+
+  sptr = rd.s_start;
+  dptr = rd.d_start;
+  top = rd.s_top;
+
+  while (sptr < (uint8_t *) top) {
+    va = _mm_load_si128 ((__m128i *)(sptr));
+    r = _mm_and_si128 (loset, va);
+    r = _mm_shuffle_epi8 (tl, r);
+    va = _mm_srli_epi64 (va, 4);
+    va = _mm_and_si128 (loset, va);
+    va = _mm_shuffle_epi8 (th, va);
+    r = _mm_xor_si128 (r, va);
+    va = (xor) ? _mm_load_si128 ((__m128i *)(dptr)) : _mm_setzero_si128(); 
+    r = _mm_xor_si128 (r, va);
+    _mm_store_si128 ((__m128i *)(dptr), r);
+    dptr += 16;
+    sptr += 16;
+  }
+  gf_do_final_region_alignment(&rd);
+
+}
+#endif
+/* Fill the multiplication and division tables and install the single-table operations. Returns 0 if GF_REGION_SIMD was requested but the scalar region routine had to be used, 1 otherwise. */
+static 
+int gf_w4_single_table_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_single_table_data *std;
+  int a, b, prod;
+
+
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_single_table_data *)h->private;
+
+  bzero(std->mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+  bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+  /* Fill both tables from the shift multiplier; entries with a zero operand keep the 0 written by bzero. */
+  for (a = 1; a < GF_FIELD_SIZE; a++) {
+    for (b = 1; b < GF_FIELD_SIZE; b++) {
+      prod = gf_w4_shift_multiply(gf, a, b);
+      std->mult[a][b] = prod;
+      std->div[prod][b] = a;
+    }
+  }
+  /* Region multiply: a SIMD routine when compiled in, supported by the CPU and not disabled; otherwise the scalar table loop. */
+  SET_FUNCTION(gf,inverse,w32,NULL)
+  SET_FUNCTION(gf,divide,w32,gf_w4_single_table_divide)
+  SET_FUNCTION(gf,multiply,w32,gf_w4_single_table_multiply)
+  #if defined(INTEL_SSSE3)
+    if (gf_cpu_supports_intel_ssse3 && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) {
+      SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_sse_multiply_region)
+    } else {
+  #elif defined(ARM_NEON)
+    if (gf_cpu_supports_arm_neon && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) {
+      gf_w4_neon_single_table_init(gf);
+    } else {
+  #endif
+      SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region)
+      if (h->region_type & GF_REGION_SIMD) return 0;
+  #if defined(INTEL_SSSE3) || defined(ARM_NEON)
+    }
+  #endif
+
+  return 1;
+}
+
+/* ------------------------------------------------------------
+  IMPLEMENTATION: DOUBLE TABLE: JSP. 
+ */
+/* Product of a and b looked up in the double table's mult array. */
+static
+inline
+gf_val_32_t
+gf_w4_double_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) (gf->scratch);
+  struct gf_double_table_data *tbl = (struct gf_double_table_data *) h->private;
+
+  return tbl->mult[a][b];
+}
+/* Quotient a / b looked up in the double table's div array. */
+static
+inline
+gf_val_32_t
+gf_w4_double_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) (gf->scratch);
+  struct gf_double_table_data *tbl = (struct gf_double_table_data *) h->private;
+
+  return tbl->div[a][b];
+}
+/* Region multiply by val: every source byte is translated through the byte-indexed table row for val. */
+static
+void 
+gf_w4_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  struct gf_double_table_data *tbl;
+  gf_region_data rd;
+  uint8_t *in, *out, *row;
+  int pos;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  /* rd is filled in here but not read again in this routine. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+
+  tbl = (struct gf_double_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+  in = (uint8_t *) src;
+  out = (uint8_t *) dest;
+  row = (uint8_t *) tbl->mult;
+  row += (val << GF_DOUBLE_WIDTH);
+
+  for (pos = 0; pos < bytes; pos++) {
+    if (xor) out[pos] ^= row[in[pos]];
+    else out[pos] = row[in[pos]];
+  }
+}
+/* Build the division table and the byte-indexed multiplication table (both nibbles of a byte multiplied at once), then install the double-table operations. Returns 1. */
+static 
+int gf_w4_double_table_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_double_table_data *std;
+  int a, b, c, prod, ab;
+  uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
+
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_double_table_data *)h->private;
+
+  bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+  bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+  /* Single-word products go into the local table; div[prod][b] = a records the matching quotient. */
+  for (a = 1; a < GF_FIELD_SIZE; a++) {
+    for (b = 1; b < GF_FIELD_SIZE; b++) {
+      prod = gf_w4_shift_multiply(gf, a, b);
+      mult[a][b] = prod;
+      std->div[prod][b] = a;
+    }
+  }
+  bzero(std->mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE * GF_FIELD_SIZE);
+  for (a = 0; a < GF_FIELD_SIZE; a++) {
+    for (b = 0; b < GF_FIELD_SIZE; b++) {
+      ab = mult[a][b];
+      for (c = 0; c < GF_FIELD_SIZE; c++) {
+        std->mult[a][(b << 4) | c] = ((ab << 4) | mult[a][c]);
+      }
+    }
+  }
+
+  SET_FUNCTION(gf,inverse,w32,NULL)
+  SET_FUNCTION(gf,divide,w32,gf_w4_double_table_divide)
+  SET_FUNCTION(gf,multiply,w32,gf_w4_double_table_multiply)
+  SET_FUNCTION(gf,multiply_region,w32,gf_w4_double_table_multiply_region)
+  return 1;
+}
+
+/* Quotient a / b looked up in the lazy quad table's div array. */
+static
+inline
+gf_val_32_t
+gf_w4_quad_table_lazy_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) (gf->scratch);
+  struct gf_quad_table_lazy_data *tbl = (struct gf_quad_table_lazy_data *) h->private;
+
+  return tbl->div[a][b];
+}
+/* Product of a and b looked up in the lazy quad table's smult array. */
+static
+inline
+gf_val_32_t
+gf_w4_quad_table_lazy_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) (gf->scratch);
+  struct gf_quad_table_lazy_data *tbl = (struct gf_quad_table_lazy_data *) h->private;
+
+  return tbl->smult[a][b];
+}
+/* Quotient a / b looked up in the quad table's div array. */
+static
+inline
+gf_val_32_t
+gf_w4_quad_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) (gf->scratch);
+  struct gf_quad_table_data *tbl = (struct gf_quad_table_data *) h->private;
+
+  return tbl->div[a][b];
+}
+/* Product of a and b looked up in the quad table's mult array; the entry is read as a 16-bit value. */
+static
+inline
+gf_val_32_t
+gf_w4_quad_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) (gf->scratch);
+  struct gf_quad_table_data *tbl = (struct gf_quad_table_data *) h->private;
+  uint16_t entry;
+
+  entry = tbl->mult[a][b];
+  return entry;
+}
+/* Region multiply by val through a 16-bit-indexed table; serves both the precomputed and the lazy quad-table variants. */
+static
+void 
+gf_w4_quad_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint16_t *base;
+  gf_region_data rd;
+  struct gf_quad_table_data *std;
+  struct gf_quad_table_lazy_data *ltd;
+  gf_internal_t *h;
+  int a, b, c, d, va, vb, vc, vd;
+    
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+  /* Lazy mode rebuilds the 16*16*16*16-entry table for this val on every call from smult; otherwise the stored row for val is used. */
+  h = (gf_internal_t *) (gf->scratch);
+  if (h->region_type & GF_REGION_LAZY) {
+    ltd = (struct gf_quad_table_lazy_data *) ((gf_internal_t *) (gf->scratch))->private;
+    base = ltd->mult;
+    for (a = 0; a < 16; a++) {
+      va = (ltd->smult[val][a] << 12);
+      for (b = 0; b < 16; b++) {
+        vb = (ltd->smult[val][b] << 8);
+        for (c = 0; c < 16; c++) {
+          vc = (ltd->smult[val][c] << 4);
+          for (d = 0; d < 16; d++) {
+            vd = ltd->smult[val][d];
+            base[(a << 12) | (b << 8) | (c << 4) | d ] = (va | vb | vc | vd);
+          }
+        }
+      }
+    }
+  } else {
+    std = (struct gf_quad_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+    base = &(std->mult[val][0]);
+  }
+  /* The region itself is handed to gf_two_byte_region_table_multiply together with that table. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+  gf_two_byte_region_table_multiply(&rd, base);
+  gf_do_final_region_alignment(&rd);
+}
+/* Precompute the division table and, for every val, the 16-bit-indexed multiplication table; then install the quad-table operations. Returns 1. */
+static 
+int gf_w4_quad_table_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_quad_table_data *std;
+  int prod, val, a, b, c, d, va, vb, vc, vd;
+  uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
+
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_quad_table_data *)h->private;
+
+  bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+  bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+
+  for (a = 1; a < GF_FIELD_SIZE; a++) {
+    for (b = 1; b < GF_FIELD_SIZE; b++) {
+      prod = gf_w4_shift_multiply(gf, a, b);
+      mult[a][b] = prod;
+      std->div[prod][b] = a;
+    }
+  }
+  /* Entry (a,b,c,d) for val packs the four products val*a, val*b, val*c, val*d into one 16-bit word. */
+  for (val = 0; val < 16; val++) {
+    for (a = 0; a < 16; a++) {
+      va = (mult[val][a] << 12);
+      for (b = 0; b < 16; b++) {
+        vb = (mult[val][b] << 8);
+        for (c = 0; c < 16; c++) {
+          vc = (mult[val][c] << 4);
+          for (d = 0; d < 16; d++) {
+            vd = mult[val][d];
+            std->mult[val][(a << 12) | (b << 8) | (c << 4) | d ] = (va | vb | vc | vd);
+          }
+        }
+      }
+    }
+  }
+
+  SET_FUNCTION(gf,inverse,w32,NULL)
+  SET_FUNCTION(gf,divide,w32,gf_w4_quad_table_divide)
+  SET_FUNCTION(gf,multiply,w32,gf_w4_quad_table_multiply)
+  SET_FUNCTION(gf,multiply_region,w32,gf_w4_quad_table_multiply_region)
+  return 1;
+}
+/* Build smult/div for the lazy quad table from a log/antilog walk of prim_poly. Returns 1 on success; 0 with _gf_errno = GF_E_LOGPOLY if the walk does not visit every nonzero element exactly once. */
+static 
+int gf_w4_quad_table_lazy_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_quad_table_lazy_data *std;
+  int a, b, prod, loga, logb;
+  uint32_t seen = 0;
+  uint8_t log_tbl[GF_FIELD_SIZE];
+  uint8_t antilog_tbl[GF_FIELD_SIZE*2];
+
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_quad_table_lazy_data *)h->private;
+  b = 1;
+  for (a = 0; a < GF_MULT_GROUP_SIZE; a++) {
+      /* A zero, out-of-range or repeated power would leave log_tbl entries uninitialized (or write outside it). */
+      if (b <= 0 || b >= GF_FIELD_SIZE || (seen & (1u << b))) { _gf_errno = GF_E_LOGPOLY; return 0; }
+      seen |= (1u << b);
+      log_tbl[b] = a;
+      antilog_tbl[a] = b;
+      antilog_tbl[a+GF_MULT_GROUP_SIZE] = b;
+      b <<= 1;
+      if (b & GF_FIELD_SIZE) b = b ^ h->prim_poly;
+  }
+
+  bzero(std->smult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+  bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+  for (a = 1; a < GF_FIELD_SIZE; a++) {
+    loga = log_tbl[a];
+    for (b = 1; b < GF_FIELD_SIZE; b++) {
+      logb = log_tbl[b];
+      prod = antilog_tbl[loga+logb];
+      std->smult[a][b] = prod;
+      std->div[prod][b] = a;
+    }
+  }
+  SET_FUNCTION(gf,inverse,w32,NULL)
+  SET_FUNCTION(gf,divide,w32,gf_w4_quad_table_lazy_divide)
+  SET_FUNCTION(gf,multiply,w32,gf_w4_quad_table_lazy_multiply)
+  SET_FUNCTION(gf,multiply_region,w32,gf_w4_quad_table_multiply_region)
+  return 1;
+}
+/* Dispatch table initialisation to the double, quad (lazy or precomputed) or single table variant and return its result. */
+static 
+int gf_w4_table_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  int region = h->region_type;
+
+  /* The default method without SSSE3/NEON support is steered to the double table. */
+  if (h->mult_type == GF_MULT_DEFAULT) {
+    if (!gf_cpu_supports_intel_ssse3 && !gf_cpu_supports_arm_neon) {
+      region |= GF_REGION_DOUBLE_TABLE;
+    }
+  }
+
+  if (region & GF_REGION_DOUBLE_TABLE) {
+    return gf_w4_double_table_init(gf);
+  }
+  if (!(region & GF_REGION_QUAD_TABLE)) {
+    return gf_w4_single_table_init(gf);
+  }
+
+  /* Quad table requested: lazy or fully precomputed. */
+  if (region & GF_REGION_LAZY) {
+    return gf_w4_quad_table_lazy_init(gf);
+  }
+  return gf_w4_quad_table_init(gf);
+}
+
+/* ------------------------------------------------------------
+   JSP: GF_MULT_BYTWO_p and _b: See the paper.
+*/
+/* Multiply a by b with the BYTWO_p scheme: scan a from mask 0x8 down to 0x1,
+   doubling the running product (XORing in prim_poly when its 0x8 bit is set)
+   at every step and adding b whenever the current bit of a is set. */
+static
+inline
+gf_val_32_t
+gf_w4_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  uint32_t poly = h->prim_poly;
+  uint32_t acc = 0;
+  uint32_t bit;
+
+  /* bit walks the multiplier mask 0x8, 0x4, 0x2, 0x1. */
+  for (bit = 0x8; bit != 0; bit >>= 1) {
+    /* Double the accumulator first. */
+    if (acc & 0x8) {
+      acc = ((acc << 1) ^ poly);
+    } else {
+      acc <<= 1;
+    }
+
+    /* Then add b if this bit of a is set. */
+    if (a & bit) acc ^= b;
+  }
+
+  return acc;
+}
+/* Multiply a by b with the BYTWO_b scheme: consume a from its low bit upward,
+   adding the current multiple of b for every set bit and doubling b
+   (XORing in prim_poly when its 0x8 bit is set) between steps.
+   Stops as soon as no bits of a remain. */
+static
+inline
+gf_val_32_t
+gf_w4_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  uint32_t poly = h->prim_poly;
+  uint32_t acc = 0;
+
+  /* a and b are local copies and are consumed/updated in place. */
+  for (;;) {
+    /* Add the current multiple of b when the low bit of a is set. */
+    if (a & 1) acc ^= b;
+    a >>= 1;
+    if (a == 0) break;
+
+    /* Double b for the next bit of a. */
+    b = (b & 0x8) ? ((b << 1) ^ poly) : (b << 1);
+  }
+
+  return acc;
+}
+/* Region multiply by val with the BYTWO_p scheme on 64-bit words (no SIMD); XORs into dest when xor is set. */
+static
+void 
+gf_w4_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t *s64, *d64, t1, t2, ta, prod, amask;
+  gf_region_data rd;
+  struct gf_bytwo_data *btd;
+    
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+  /* Between the alignment helpers the region is processed one 64-bit word at a time. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  /* Per word: for each bit of val from mask 0x8 down, AB2 updates prod and the source word is XORed in when that bit is set. */
+  if (xor) {
+    while (s64 < (uint64_t *) rd.s_top) {
+      prod = 0;
+      amask = 0x8;
+      ta = *s64;
+      while (amask != 0) {
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
+        if (val & amask) prod ^= ta;
+        amask >>= 1;
+      }
+      *d64 ^= prod;
+      d64++;
+      s64++;
+    }
+  } else { 
+    while (s64 < (uint64_t *) rd.s_top) {
+      prod = 0;
+      amask = 0x8;
+      ta = *s64;
+      while (amask != 0) {
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
+        if (val & amask) prod ^= ta;
+        amask >>= 1;
+      }
+      *d64 = prod;
+      d64++;
+      s64++;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+/* One step of the SSE BYTWO_p loop: apply SSE_AB2 to prod, XOR ta into prod in the bytes where the low bit of v is 0, then shift v right by one. Uses pp, m1, one, ta, v, prod, t1, t2 from the caller. */
+#define BYTWO_P_ONESTEP {\
+      SSE_AB2(pp, m1, prod, t1, t2); \
+      t1 = _mm_and_si128(v, one); \
+      t1 = _mm_sub_epi8(t1, one); \
+      t1 = _mm_and_si128(t1, ta); \
+      prod = _mm_xor_si128(prod, t1); \
+      v = _mm_srli_epi64(v, 1); }
+/* SSE2 region multiply by val with the BYTWO_p scheme, 16 bytes per iteration; XORs into dest when xor is set. */
+#ifdef INTEL_SSE2
+static
+void
+gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  int i;
+  uint8_t *s8, *d8;
+  uint8_t vrev;
+  __m128i pp, m1, ta, prod, t1, t2, tp, one, v;
+  struct gf_bytwo_data *btd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+  /* vrev holds the four bits of val in reverse order and inverted; BYTWO_P_ONESTEP consumes it from the low bit. */
+  vrev = 0;
+  for (i = 0; i < 4; i++) {
+    vrev <<= 1;
+    if (!(val & (1 << i))) vrev |= 1;
+  }
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  one = _mm_set1_epi8(1);
+  /* Four ONESTEPs per 16-byte block, one for each bit of val. */
+  while (d8 < (uint8_t *) rd.d_top) {
+    prod = _mm_setzero_si128();
+    v = _mm_set1_epi8(vrev);
+    ta = _mm_load_si128((__m128i *) s8);
+    tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
+    d8 += 16;
+    s8 += 16;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+/*
+#ifdef INTEL_SSE2
+static
+void 
+gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint8_t *d8, *s8, tb;
+  __m128i pp, m1, m2, t1, t2, va, vb;
+  struct gf_bytwo_data *btd;
+  gf_region_data rd;
+    
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  m2 = _mm_set1_epi8((btd->mask2)&0xff);
+
+  if (xor) {
+    while (d8 < (uint8_t *) rd.d_top) {
+      va = _mm_load_si128 ((__m128i *)(s8));
+      vb = _mm_load_si128 ((__m128i *)(d8));
+      tb = val;
+      while (1) {
+        if (tb & 1) vb = _mm_xor_si128(vb, va);
+        tb >>= 1;
+        if (tb == 0) break;
+        SSE_AB2(pp, m1, m2, va, t1, t2);
+      }
+      _mm_store_si128((__m128i *)d8, vb);
+      d8 += 16;
+      s8 += 16;
+    }
+  } else {
+    while (d8 < (uint8_t *) rd.d_top) {
+      va = _mm_load_si128 ((__m128i *)(s8));
+      vb = _mm_setzero_si128 ();
+      tb = val;
+      while (1) {
+        if (tb & 1) vb = _mm_xor_si128(vb, va);
+        tb >>= 1;
+        if (tb == 0) break;
+        t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1);
+        t2 = _mm_and_si128(va, m2);
+        t2 = _mm_sub_epi64 (
+          _mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1)));
+        va = _mm_xor_si128(t1, _mm_and_si128(t2, pp));
+      }
+      _mm_store_si128((__m128i *)d8, vb);
+      d8 += 16;
+      s8 += 16;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+*/
+/* For each 16-byte block from rd->d_start to rd->d_top: load the source, apply SSE_AB2 once, and store the result to dest (overwrite). */
+#ifdef INTEL_SSE2
+static 
+void
+gf_w4_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, t1, t2, va;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    SSE_AB2(pp, m1, va, t1, t2);
+    _mm_store_si128((__m128i *)d8, va);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+/* For each 16-byte block from rd->d_start to rd->d_top: load the source, apply SSE_AB2 once, and XOR the result into dest. */
+#ifdef INTEL_SSE2
+static 
+void
+gf_w4_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  uint8_t *d8, *s8;
+  __m128i pp, m1, t1, t2, va, vb;
+
+  s8 = (uint8_t *) rd->s_start;
+  d8 = (uint8_t *) rd->d_start;
+
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+
+  while (d8 < (uint8_t *) rd->d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    SSE_AB2(pp, m1, va, t1, t2);
+    vb = _mm_load_si128 ((__m128i *)(d8));
+    vb = _mm_xor_si128(vb, va);
+    _mm_store_si128((__m128i *)d8, vb);
+    d8 += 16;
+    s8 += 16;
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+/* Region kernel used for val == 4 without accumulation: two SSE_AB2 steps
+   (macro defined elsewhere; presumably each one multiplies by two) on every
+   16-byte source block, written over the matching destination block. */
+static void
+gf_w4_bytwo_b_sse_region_4_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  __m128i pp, m1, t1, t2, va;
+  uint8_t *in, *out;
+
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+
+  in = (uint8_t *) rd->s_start;
+  out = (uint8_t *) rd->d_start;
+
+  for (; out < (uint8_t *) rd->d_top; out += 16, in += 16) {
+    va = _mm_load_si128 ((__m128i *)(in));
+    SSE_AB2(pp, m1, va, t1, t2);
+    SSE_AB2(pp, m1, va, t1, t2);
+    _mm_store_si128((__m128i *)out, va);
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+/* Region kernel used for val == 4 with accumulation: two SSE_AB2 steps
+   (macro defined elsewhere; presumably each one multiplies by two) on every
+   16-byte source block, XORed into the block already stored at the
+   destination.  Walks from rd->d_start up to rd->d_top. */
+static void
+gf_w4_bytwo_b_sse_region_4_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  __m128i pp, m1, t1, t2, va, acc;
+  uint8_t *in, *out;
+
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+
+  in = (uint8_t *) rd->s_start;
+  out = (uint8_t *) rd->d_start;
+
+  for (; out < (uint8_t *) rd->d_top; out += 16, in += 16) {
+    va = _mm_load_si128 ((__m128i *)(in));
+    SSE_AB2(pp, m1, va, t1, t2);
+    SSE_AB2(pp, m1, va, t1, t2);
+    acc = _mm_xor_si128(_mm_load_si128 ((__m128i *)(out)), va);
+    _mm_store_si128((__m128i *)out, acc);
+  }
+}
+#endif
+
+
+#ifdef INTEL_SSE2
+/* Region kernel used for val == 3 without accumulation: each 16-byte source
+   block is XORed with its own SSE_AB2 result (macro defined elsewhere;
+   presumably the multiply-by-two step), i.e. src ^ 2*src, and the sum is
+   written over the matching destination block. */
+static void
+gf_w4_bytwo_b_sse_region_3_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  __m128i pp, m1, t1, t2, va, keep;
+  uint8_t *in, *out;
+
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+
+  in = (uint8_t *) rd->s_start;
+  out = (uint8_t *) rd->d_start;
+
+  for (; out < (uint8_t *) rd->d_top; out += 16, in += 16) {
+    va = _mm_load_si128 ((__m128i *)(in));
+    keep = va;
+    SSE_AB2(pp, m1, va, t1, t2);
+    _mm_store_si128((__m128i *)out, _mm_xor_si128(va, keep));
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+/* Region kernel used for val == 3 with accumulation: dest ^= src ^ AB2(src)
+   per 16-byte block, where AB2 is one SSE_AB2 step (macro defined elsewhere;
+   presumably the multiply-by-two step). */
+static void
+gf_w4_bytwo_b_sse_region_3_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  __m128i pp, m1, t1, t2, va, acc;
+  uint8_t *in, *out;
+
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+
+  in = (uint8_t *) rd->s_start;
+  out = (uint8_t *) rd->d_start;
+
+  for (; out < (uint8_t *) rd->d_top; out += 16, in += 16) {
+    va = _mm_load_si128 ((__m128i *)(in));
+    acc = _mm_load_si128 ((__m128i *)(out));
+    acc = _mm_xor_si128(acc, va);
+    SSE_AB2(pp, m1, va, t1, t2);
+    _mm_store_si128((__m128i *)out, _mm_xor_si128(acc, va));
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+/* Region kernel used for val == 5 without accumulation: each 16-byte source
+   block is XORed with the result of two SSE_AB2 steps on it (macro defined
+   elsewhere; presumably each step multiplies by two), i.e. src ^ 4*src, and
+   the sum is written over the matching destination block. */
+static void
+gf_w4_bytwo_b_sse_region_5_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  __m128i pp, m1, t1, t2, va, keep;
+  uint8_t *in, *out;
+
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+
+  in = (uint8_t *) rd->s_start;
+  out = (uint8_t *) rd->d_start;
+
+  for (; out < (uint8_t *) rd->d_top; out += 16, in += 16) {
+    va = _mm_load_si128 ((__m128i *)(in));
+    keep = va;
+    SSE_AB2(pp, m1, va, t1, t2);
+    SSE_AB2(pp, m1, va, t1, t2);
+    _mm_store_si128((__m128i *)out, _mm_xor_si128(va, keep));
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+/* Region kernel used for val == 5 with accumulation: per 16-byte block,
+   dest ^= src ^ AB2(AB2(src)), where AB2 is one SSE_AB2 step (macro defined
+   elsewhere; presumably the multiply-by-two step). */
+static void
+gf_w4_bytwo_b_sse_region_5_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  __m128i pp, m1, t1, t2, va, acc;
+  uint8_t *in, *out;
+
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+
+  in = (uint8_t *) rd->s_start;
+  out = (uint8_t *) rd->d_start;
+
+  for (; out < (uint8_t *) rd->d_top; out += 16, in += 16) {
+    va = _mm_load_si128 ((__m128i *)(in));
+    acc = _mm_load_si128 ((__m128i *)(out));
+    acc = _mm_xor_si128(acc, va);
+    SSE_AB2(pp, m1, va, t1, t2);
+    SSE_AB2(pp, m1, va, t1, t2);
+    _mm_store_si128((__m128i *)out, _mm_xor_si128(acc, va));
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+/* Region kernel used for val == 7 without accumulation: per 16-byte block
+   the source, its SSE_AB2 result and the result of a second SSE_AB2 step are
+   XORed together (macro defined elsewhere; presumably each step multiplies
+   by two), i.e. src ^ 2*src ^ 4*src, and written over the destination. */
+static void
+gf_w4_bytwo_b_sse_region_7_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  __m128i pp, m1, t1, t2, va, acc;
+  uint8_t *in, *out;
+
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+
+  in = (uint8_t *) rd->s_start;
+  out = (uint8_t *) rd->d_start;
+
+  for (; out < (uint8_t *) rd->d_top; out += 16, in += 16) {
+    va = _mm_load_si128 ((__m128i *)(in));
+    acc = va;
+    SSE_AB2(pp, m1, va, t1, t2);
+    acc = _mm_xor_si128(acc, va);
+    SSE_AB2(pp, m1, va, t1, t2);
+    _mm_store_si128((__m128i *)out, _mm_xor_si128(va, acc));
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+/* Region kernel used for val == 7 with accumulation: per 16-byte block,
+   dest ^= src ^ AB2(src) ^ AB2(AB2(src)), where AB2 is one SSE_AB2 step
+   (macro defined elsewhere; presumably the multiply-by-two step). */
+static void
+gf_w4_bytwo_b_sse_region_7_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  __m128i pp, m1, t1, t2, va, acc;
+  uint8_t *in, *out;
+
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+
+  in = (uint8_t *) rd->s_start;
+  out = (uint8_t *) rd->d_start;
+
+  for (; out < (uint8_t *) rd->d_top; out += 16, in += 16) {
+    va = _mm_load_si128 ((__m128i *)(in));
+    acc = _mm_load_si128 ((__m128i *)(out));
+    acc = _mm_xor_si128(acc, va);
+    SSE_AB2(pp, m1, va, t1, t2);
+    acc = _mm_xor_si128(acc, va);
+    SSE_AB2(pp, m1, va, t1, t2);
+    _mm_store_si128((__m128i *)out, _mm_xor_si128(acc, va));
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+/* Region kernel used for val == 6 without accumulation: per 16-byte block
+   the result of one SSE_AB2 step is XORed with the result of a second step
+   (macro defined elsewhere; presumably each step multiplies by two), i.e.
+   2*src ^ 4*src, and written over the matching destination block. */
+static void
+gf_w4_bytwo_b_sse_region_6_noxor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  __m128i pp, m1, t1, t2, va, keep;
+  uint8_t *in, *out;
+
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+
+  in = (uint8_t *) rd->s_start;
+  out = (uint8_t *) rd->d_start;
+
+  for (; out < (uint8_t *) rd->d_top; out += 16, in += 16) {
+    va = _mm_load_si128 ((__m128i *)(in));
+    SSE_AB2(pp, m1, va, t1, t2);
+    keep = va;
+    SSE_AB2(pp, m1, va, t1, t2);
+    _mm_store_si128((__m128i *)out, _mm_xor_si128(va, keep));
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+/* Region kernel used for val == 6 with accumulation: per 16-byte block,
+   dest ^= AB2(src) ^ AB2(AB2(src)), where AB2 is one SSE_AB2 step (macro
+   defined elsewhere; presumably the multiply-by-two step). */
+static void
+gf_w4_bytwo_b_sse_region_6_xor(gf_region_data *rd, struct gf_bytwo_data *btd)
+{
+  __m128i pp, m1, t1, t2, va, acc;
+  uint8_t *in, *out;
+
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+
+  in = (uint8_t *) rd->s_start;
+  out = (uint8_t *) rd->d_start;
+
+  for (; out < (uint8_t *) rd->d_top; out += 16, in += 16) {
+    va = _mm_load_si128 ((__m128i *)(in));
+    SSE_AB2(pp, m1, va, t1, t2);
+    acc = _mm_load_si128 ((__m128i *)(out));
+    acc = _mm_xor_si128(acc, va);
+    SSE_AB2(pp, m1, va, t1, t2);
+    _mm_store_si128((__m128i *)out, _mm_xor_si128(acc, va));
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+static /* SSE2 region multiply for w=4 BYTWO_b: dest = val*src, or dest ^= val*src when xor is set. */
+void 
+gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint8_t *d8, *s8, tb;
+  __m128i pp, m1, m2, t1, t2, va, vb;
+  struct gf_bytwo_data *btd;
+  gf_region_data rd;
+  /* val 0 and 1 never reach the SIMD code; the shared helpers handle them. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);  /* 16: alignment argument, matching the 16-byte SSE blocks below */
+  gf_do_initial_region_alignment(&rd);  /* presumably processes the unaligned head -- helper not visible here */
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+
+  btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  switch (val) {  /* val 2..7 have dedicated unrolled kernels; each arm finishes the region and returns */
+    case 2:
+      if (!xor) {
+        gf_w4_bytwo_b_sse_region_2_noxor(&rd, btd);
+      } else {
+        gf_w4_bytwo_b_sse_region_2_xor(&rd, btd);
+      }
+      gf_do_final_region_alignment(&rd);
+      return;
+    case 3:
+      if (!xor) {
+        gf_w4_bytwo_b_sse_region_3_noxor(&rd, btd);
+      } else {
+        gf_w4_bytwo_b_sse_region_3_xor(&rd, btd);
+      }
+      gf_do_final_region_alignment(&rd);
+      return;
+    case 4:
+      if (!xor) {
+        gf_w4_bytwo_b_sse_region_4_noxor(&rd, btd);
+      } else {
+        gf_w4_bytwo_b_sse_region_4_xor(&rd, btd);
+      }
+      gf_do_final_region_alignment(&rd);
+      return;
+    case 5:
+      if (!xor) {
+        gf_w4_bytwo_b_sse_region_5_noxor(&rd, btd);
+      } else {
+        gf_w4_bytwo_b_sse_region_5_xor(&rd, btd);
+      }
+      gf_do_final_region_alignment(&rd);
+      return;
+    case 6:
+      if (!xor) {
+        gf_w4_bytwo_b_sse_region_6_noxor(&rd, btd);
+      } else {
+        gf_w4_bytwo_b_sse_region_6_xor(&rd, btd);
+      }
+      gf_do_final_region_alignment(&rd);
+      return;
+    case 7:
+      if (!xor) {
+        gf_w4_bytwo_b_sse_region_7_noxor(&rd, btd);
+      } else {
+        gf_w4_bytwo_b_sse_region_7_xor(&rd, btd);
+      }
+      gf_do_final_region_alignment(&rd);
+      return;
+  }
+  /* Any other val: generic bit-by-bit loop.  Constants are the low byte of each bytwo field, replicated. */
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  m2 = _mm_set1_epi8((btd->mask2)&0xff);
+
+  if (xor) {
+    while (d8 < (uint8_t *) rd.d_top) {
+      va = _mm_load_si128 ((__m128i *)(s8));
+      vb = _mm_load_si128 ((__m128i *)(d8));  /* accumulator starts from the current destination */
+      tb = val;  /* tb is uint8_t, so it holds the low 8 bits of val */
+      while (1) {
+        if (tb & 1) vb = _mm_xor_si128(vb, va);  /* add the current multiple when this bit of val is set */
+        tb >>= 1;
+        if (tb == 0) break;  /* no set bits left: skip the final step */
+        SSE_AB2(pp, m1, va, t1, t2);  /* macro defined elsewhere; presumably the same step that is written out in the else branch */
+      }
+      _mm_store_si128((__m128i *)d8, vb);
+      d8 += 16;
+      s8 += 16;
+    }
+  } else {
+    while (d8 < (uint8_t *) rd.d_top) {
+      va = _mm_load_si128 ((__m128i *)(s8));
+      vb = _mm_setzero_si128 ();  /* accumulator starts from zero: destination is overwritten */
+      tb = val;
+      while (1) {
+        if (tb & 1) vb = _mm_xor_si128(vb, va);
+        tb >>= 1;
+        if (tb == 0) break;
+        t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1);  /* shift left by one, keep the bits selected by m1 */
+        t2 = _mm_and_si128(va, m2);  /* isolate the bits selected by m2 */
+        t2 = _mm_sub_epi64 (  /* (t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1)) spreads each isolated bit into a run of ones */
+          _mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1)));
+        va = _mm_xor_si128(t1, _mm_and_si128(t2, pp));  /* XOR in pp wherever that run is set */
+      }
+      _mm_store_si128((__m128i *)d8, vb);
+      d8 += 16;
+      s8 += 16;
+    }
+  }
+  gf_do_final_region_alignment(&rd);  /* presumably processes the unaligned tail -- helper not visible here */
+}
+#endif
+
+/*
+ * Region multiply for w=4 BYTWO_b without SIMD: dest = val * src, or
+ * dest ^= val * src when xor is set.  The span [rd.d_start, rd.d_top) is
+ * processed one uint64_t (sixteen 4-bit symbols) at a time; the
+ * gf_do_*_region_alignment helpers bracket that loop (presumably handling
+ * the unaligned head and tail -- they are not visible here).
+ *
+ * val == 0 and val == 1 are dispatched to gf_multby_zero / gf_multby_one
+ * before the switch, so the switch needs no arm for them.  Cases 2..15 are
+ * the bit-by-bit loop of the default arm unrolled for a fixed val: ta is
+ * advanced with AB2 (macro defined elsewhere; presumably multiply-by-two)
+ * and XORed into the result at every set bit of val.
+ */
+static
+void
+gf_w4_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t *s64, *d64, t1, t2, ta, tb, prod;
+  struct gf_bytwo_data *btd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+
+  /* val >= 2 from here on: 0 and 1 returned above. */
+  switch (val) {
+  case 2:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= ta;
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = ta;
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 3:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 4:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= ta;
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = ta;
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 5:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = ta ^ prod;
+        d64++;
+        s64++;
+      }
+    }
+    break;
+  case 6:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = ta ^ prod;
+        d64++;
+        s64++;
+      }
+    }
+    break;
+  case 7:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = ta ^ prod;
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 8:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= ta;
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = ta;
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 9:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 10:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 11:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 12:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 13:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 14:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  case 15:
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 ^= (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        ta = *s64;
+        prod = ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        prod ^= ta;
+        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        *d64 = (ta ^ prod);
+        d64++;
+        s64++;
+      }
+    }
+    break; 
+  default:
+    /* val outside 2..15: generic loop over the bits of val, lowest first. */
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        prod = *d64 ;
+        ta = *s64;
+        tb = val;
+        while (1) {
+          if (tb & 1) prod ^= ta;
+          tb >>= 1;
+          if (tb == 0) break;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        }
+        *d64 = prod;
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        prod = 0 ;
+        ta = *s64;
+        tb = val;
+        while (1) {
+          if (tb & 1) prod ^= ta;
+          tb >>= 1;
+          if (tb == 0) break;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        }
+        *d64 = prod;
+        d64++;
+        s64++;
+      }
+    }
+    break;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+static /* Setup for GF_MULT_BYTWO_p / GF_MULT_BYTWO_b: fills the gf_bytwo_data constants and installs multiply functions. */
+int gf_w4_bytwo_init(gf_t *gf) /* returns 1, or 0 when GF_REGION_SIMD was requested but the non-SSE region routine was installed */
+{
+  gf_internal_t *h;
+  uint64_t ip, m1, m2;
+  struct gf_bytwo_data *btd;
+
+  h = (gf_internal_t *) gf->scratch;
+  btd = (struct gf_bytwo_data *) (h->private);
+  ip = h->prim_poly & 0xf;  /* low four bits of the polynomial */
+  m1 = 0xe;                 /* bits 1..3 of a nibble */
+  m2 = 0x8;                 /* top bit of a nibble */
+  btd->prim_poly = 0;
+  btd->mask1 = 0;
+  btd->mask2 = 0;
+
+  while (ip != 0) {  /* OR each pattern in at successive GF_FIELD_WIDTH-bit offsets until ip has been shifted out of the 64-bit word */
+    btd->prim_poly |= ip;
+    btd->mask1 |= m1;
+    btd->mask2 |= m2;
+    ip <<= GF_FIELD_WIDTH;
+    m1 <<= GF_FIELD_WIDTH;
+    m2 <<= GF_FIELD_WIDTH;
+  }
+
+  if (h->mult_type == GF_MULT_BYTWO_p) {
+    SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_p_multiply)
+    #ifdef INTEL_SSE2
+      if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {  /* SSE2 available and not disabled by the caller */
+        SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_sse_multiply_region)
+      } else {
+    #endif
+        SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_nosse_multiply_region)  /* without INTEL_SSE2 this is the only path */
+        if (h->region_type & GF_REGION_SIMD)
+          return 0;  /* caller asked for SIMD but it is not being used */
+    #ifdef INTEL_SSE2
+      }
+    #endif
+  } else {  /* the other mult_type this init is called for (GF_MULT_BYTWO_b) */
+    SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_b_multiply)
+    #ifdef INTEL_SSE2
+      if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+        SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_sse_multiply_region)
+      } else {
+    #endif
+        SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_nosse_multiply_region)
+        if (h->region_type & GF_REGION_SIMD)
+          return 0;  /* same SIMD-requested-but-unused failure as above */
+    #ifdef INTEL_SSE2
+      }
+    #endif
+  }
+  return 1;
+}
+
+
+static /* Setup for GF_MULT_CARRY_FREE; which body is compiled depends on INTEL_SSE4_PCLMUL / ARM_NEON. */
+int gf_w4_cfm_init(gf_t *gf)
+{
+#if defined(INTEL_SSE4_PCLMUL)
+  if (gf_cpu_supports_intel_pclmul) {  /* runtime flag checked in addition to the compile-time macro */
+    SET_FUNCTION(gf,multiply,w32,gf_w4_clm_multiply)
+    return 1;
+  }
+#elif defined(ARM_NEON)
+  if (gf_cpu_supports_arm_neon) {
+    return gf_w4_neon_cfm_init(gf);  /* result is whatever the NEON init returns */
+  }
+#endif
+  return 0;  /* neither branch compiled in, or the runtime CPU flag was false */
+}
+
+static /* Setup for GF_MULT_SHIFT: installs only the single-value multiply. */
+int gf_w4_shift_init(gf_t *gf)
+{
+  SET_FUNCTION(gf,multiply,w32,gf_w4_shift_multiply)
+  return 1;  /* always succeeds */
+}
+
+/* JSP: I'm putting all error-checking into gf_error_check(), so you don't 
+   have to do error checking in scratch_size or in init */
+
+/* Size in bytes of the scratch area a w=4 field needs: gf_internal_t plus
+   the private data of the selected method.  Returns 0 for an unrecognized
+   mult_type.  divide_type, arg1 and arg2 are not consulted. */
+int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
+{
+  if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) {
+    return sizeof(gf_internal_t) + sizeof(struct gf_bytwo_data);
+  }
+
+  if (mult_type == GF_MULT_CARRY_FREE || mult_type == GF_MULT_SHIFT) {
+    return sizeof(gf_internal_t);   /* no private data */
+  }
+
+  if (mult_type == GF_MULT_LOG_TABLE) {
+    return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64;
+  }
+
+  if (mult_type != GF_MULT_DEFAULT && mult_type != GF_MULT_TABLE) {
+    return 0;
+  }
+
+  /* GF_MULT_DEFAULT and GF_MULT_TABLE: the table layout follows region_type. */
+  if (region_type == GF_REGION_CAUCHY) {
+    return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
+  }
+
+  /* Default method with neither NEON nor SSSE3 reported: size for double tables. */
+  if (mult_type == GF_MULT_DEFAULT &&
+      !gf_cpu_supports_arm_neon && !gf_cpu_supports_intel_ssse3) {
+    region_type = GF_REGION_DOUBLE_TABLE;
+  }
+
+  if (region_type & GF_REGION_DOUBLE_TABLE) {
+    return sizeof(gf_internal_t) + sizeof(struct gf_double_table_data) + 64;
+  }
+
+  if ((region_type & GF_REGION_QUAD_TABLE) == 0) {
+    return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
+  }
+
+  if (region_type & GF_REGION_LAZY) {
+    return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_lazy_data) + 64;
+  }
+  return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_data) + 64;
+}
+
+/* Top-level initializer for w=4: installs the function pointers for the
+   requested method.  Returns 1, or 0 if mult_type is unknown or its init fails. */
+int
+gf_w4_init (gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  int ok;
+
+  if (h->prim_poly == 0) { h->prim_poly = 0x13; }  /* default polynomial */
+  h->prim_poly |= 0x10;                            /* bit 4 is always set */
+
+  SET_FUNCTION(gf,multiply,w32,NULL)
+  SET_FUNCTION(gf,divide,w32,NULL)
+  SET_FUNCTION(gf,inverse,w32,NULL)
+  SET_FUNCTION(gf,multiply_region,w32,NULL)
+  SET_FUNCTION(gf,extract_word,w32,gf_w4_extract_word)
+
+  switch(h->mult_type) {
+    case GF_MULT_CARRY_FREE: ok = gf_w4_cfm_init(gf);   break;
+    case GF_MULT_SHIFT:      ok = gf_w4_shift_init(gf); break;
+    case GF_MULT_BYTWO_p:
+    case GF_MULT_BYTWO_b:    ok = gf_w4_bytwo_init(gf); break;
+    case GF_MULT_LOG_TABLE:  ok = gf_w4_log_init(gf);   break;
+    case GF_MULT_DEFAULT:
+    case GF_MULT_TABLE:      ok = gf_w4_table_init(gf); break;
+    default:                 ok = 0;                    break;
+  }
+  if (ok == 0) return 0;
+
+  if (h->divide_type == GF_DIVIDE_EUCLID || h->divide_type == GF_DIVIDE_MATRIX) {
+    SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse)
+    if (h->divide_type == GF_DIVIDE_EUCLID) {
+      SET_FUNCTION(gf,inverse,w32,gf_w4_euclid)
+    } else {
+      SET_FUNCTION(gf,inverse,w32,gf_w4_matrix)
+    }
+  }
+  if (gf->divide.w32 == NULL) {  /* method init left divide unset */
+    SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse)
+    if (gf->inverse.w32 == NULL) { SET_FUNCTION(gf,inverse,w32,gf_w4_euclid) }
+  }
+  if (gf->inverse.w32 == NULL) { SET_FUNCTION(gf,inverse,w32,gf_w4_inverse_from_divide) }
+
+  if (h->region_type == GF_REGION_CAUCHY) {
+    SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
+    SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
+  }
+  if (gf->multiply_region.w32 == NULL) {  /* fall back to word-at-a-time */
+    SET_FUNCTION(gf,multiply_region,w32,gf_w4_multiply_region_from_single)
+  }
+  return 1;
+}
+
+/* Inline setup functions */
+
+/* Returns the single-table multiplication table as raw bytes, or NULL when
+   the field's multiply routine is not gf_w4_single_table_multiply. */
+uint8_t *gf_w4_get_mult_table(gf_t *gf)
+{
+  struct gf_single_table_data *tables;
+
+  if (gf->multiply.w32 != gf_w4_single_table_multiply) {
+    return NULL;
+  }
+  tables = (struct gf_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (uint8_t *) tables->mult;
+}
+    
+/* Returns the single-table division table as raw bytes, or NULL when the
+   field's multiply routine is not gf_w4_single_table_multiply. */
+uint8_t *gf_w4_get_div_table(gf_t *gf)
+{
+  struct gf_single_table_data *tables;
+
+  if (gf->multiply.w32 != gf_w4_single_table_multiply) {
+    return NULL;
+  }
+  tables = (struct gf_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
+  return (uint8_t *) tables->div;
+}
+
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w64.c b/src/erasure-code/jerasure/gf-complete/src/gf_w64.c
new file mode 100644
index 000000000..69e55dbd2
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_w64.c
@@ -0,0 +1,2235 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w64.c
+ *
+ * Routines for 64-bit Galois fields
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w64.h"
+#include "gf_cpu.h"
+
+static
+inline
+gf_val_64_t gf_w64_inverse_from_divide (gf_t *gf, gf_val_64_t a)
+{ /* Inverse of a, obtained as 1 / a through whatever divide routine gf has installed. */
+  return gf->divide.w64(gf, 1, a);
+}
+
+#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "   " : " ", blah[15-ii]); printf("\n"); } /* Debug helper: prints label s, then the 16 bytes of register r as hex from byte 15 down to byte 0, with a wider gap before every group of four. */
+
+static
+inline
+gf_val_64_t gf_w64_divide_from_inverse (gf_t *gf, gf_val_64_t a, gf_val_64_t b)
+{
+  /* a / b == a * (1/b): fetch b's inverse, then multiply it by a. */
+  return gf->multiply.w64(gf, a, gf->inverse.w64(gf, b));
+}
+
+static
+void
+gf_w64_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int
+xor)
+{
+  gf_val_64_t *in = (gf_val_64_t *) src;
+  gf_val_64_t *out = (gf_val_64_t *) dest;
+  gf_val_64_t product;
+  uint32_t word;
+
+  /* Multiplying by 0 or 1 needs no field arithmetic; hand off to the helpers. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  /* Otherwise multiply word by word with the field's single-word multiplier,
+     either accumulating into dest (xor set) or overwriting it. */
+  for (word = 0; word < bytes/sizeof(gf_val_64_t); word++) {
+    product = gf->multiply.w64(gf, val, in[word]);
+    if (xor) {
+      out[word] ^= product;
+    } else {
+      out[word] = product;
+    }
+  }
+}
+
+#if defined(INTEL_SSE4_PCLMUL) 
+static
+void
+gf_w64_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int
+xor)
+{
+  gf_val_64_t *s64, *d64, *top;
+  gf_region_data rd;
+  /* Region multiply by val using carry-less multiplies, two 64-bit words per 16-byte step. */
+  __m128i         a, b;
+  __m128i         result, r1;
+  __m128i         prim_poly;
+  __m128i         w;
+  __m128i         m1, m3, m4;
+  gf_internal_t * h = gf->scratch;
+  
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+  /* 16 is the alignment handed to the region helper; the loops below use aligned 128-bit loads and stores. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+  /* Only the low 32 bits of prim_poly are loaded.  m1 covers bits 0..31, m3 bits 64..95, m4 bits 96..127. */
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
+  b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0);
+  m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff);
+  m3 = _mm_slli_si128(m1, 8);
+  m4 = _mm_slli_si128(m3, 4);
+
+  s64 = (gf_val_64_t *) rd.s_start;
+  d64 = (gf_val_64_t *) rd.d_start;
+  top = (gf_val_64_t *) rd.d_top;
+
+  if (xor) {
+    while (d64 != top) {
+      a = _mm_load_si128((__m128i *) s64);  
+      result = _mm_clmulepi64_si128 (a, b, 1);   /* upper source word times val */
+      /* Fold the upper 64 product bits back in two 32-bit pieces, top piece (m4) first. */
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+      r1 = _mm_xor_si128 (result, w);
+      /* Same multiply-and-fold for the lower source word. */
+      result = _mm_clmulepi64_si128 (a, b, 0);
+
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+      /* Keep the low 64 bits of each reduced product: lower word first, upper word second. */
+      result = _mm_unpacklo_epi64(result, r1);
+      /* xor mode: accumulate into what dest already holds. */
+      r1 = _mm_load_si128((__m128i *) d64);
+      result = _mm_xor_si128(r1, result);
+      _mm_store_si128((__m128i *) d64, result);
+      d64 += 2;
+      s64 += 2;
+    }
+  } else {
+    while (d64 != top) {
+      /* Same computation as the xor branch, but dest is overwritten. */
+      a = _mm_load_si128((__m128i *) s64);  
+      result = _mm_clmulepi64_si128 (a, b, 1);
+
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+      r1 = _mm_xor_si128 (result, w);
+
+      result = _mm_clmulepi64_si128 (a, b, 0);
+
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+      
+      result = _mm_unpacklo_epi64(result, r1);
+
+      _mm_store_si128((__m128i *) d64, result);
+      d64 += 2;
+      s64 += 2;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
+static
+void
+gf_w64_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int
+xor)
+{
+  gf_val_64_t *s64, *d64, *top;
+  gf_region_data rd;
+  /* Region multiply by val using carry-less multiplies, two 64-bit words per 16-byte step. */
+  __m128i         a, b;
+  __m128i         result, r1;
+  __m128i         prim_poly;
+  __m128i         w;
+  __m128i         m1, m3, m4;
+  gf_internal_t * h = gf->scratch;
+  /* NOTE(review): this body matches gf_w64_clm_multiply_region_from_single_2 statement for statement (two folds per word) -- confirm that is intended for the "_4" case. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+  /* 16 is the alignment handed to the region helper; the loops below use aligned 128-bit loads and stores. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+  /* Only the low 32 bits of prim_poly are loaded.  m1 covers bits 0..31, m3 bits 64..95, m4 bits 96..127. */
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
+  b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0);
+  m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff);
+  m3 = _mm_slli_si128(m1, 8);
+  m4 = _mm_slli_si128(m3, 4);
+
+  s64 = (gf_val_64_t *) rd.s_start;
+  d64 = (gf_val_64_t *) rd.d_start;
+  top = (gf_val_64_t *) rd.d_top;
+
+  if (xor) {
+    while (d64 != top) {
+      a = _mm_load_si128((__m128i *) s64);
+      result = _mm_clmulepi64_si128 (a, b, 1);   /* upper source word times val */
+      /* Fold the upper 64 product bits back in two 32-bit pieces, top piece (m4) first. */
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+      r1 = _mm_xor_si128 (result, w);
+      /* Same multiply-and-fold for the lower source word. */
+      result = _mm_clmulepi64_si128 (a, b, 0);
+
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+      /* Keep the low 64 bits of each reduced product: lower word first, upper word second. */
+      result = _mm_unpacklo_epi64(result, r1);
+      /* xor mode: accumulate into what dest already holds. */
+      r1 = _mm_load_si128((__m128i *) d64);
+      result = _mm_xor_si128(r1, result);
+      _mm_store_si128((__m128i *) d64, result);
+      d64 += 2;
+      s64 += 2;
+    }
+  } else {
+    while (d64 != top) {   /* same computation as the xor branch, but dest is overwritten */
+      a = _mm_load_si128((__m128i *) s64);
+      result = _mm_clmulepi64_si128 (a, b, 1);
+
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+      r1 = _mm_xor_si128 (result, w);
+
+      result = _mm_clmulepi64_si128 (a, b, 0);
+
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
+      result = _mm_xor_si128 (result, w);
+
+      result = _mm_unpacklo_epi64(result, r1);
+
+      _mm_store_si128((__m128i *) d64, result);
+      d64 += 2;
+      s64 += 2; 
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+static
+  inline
+gf_val_64_t gf_w64_euclid (gf_t *gf, gf_val_64_t b)
+{
+  gf_val_64_t e_i, e_im1, e_ip1;   /* remainders: current, previous, next */
+  gf_val_64_t d_i, d_im1, d_ip1;   /* bit position of the leading one of each remainder */
+  gf_val_64_t y_i, y_im1, y_ip1;   /* running coefficients; y_i is returned once e_i reaches 1 */
+  gf_val_64_t c_i;                 /* quotient of the current division step */
+  gf_val_64_t one = 1;
+  /* Inverse of b by Euclid's algorithm against the primitive polynomial; zero has no inverse and yields -1. */
+  if (b == 0) return -1;
+  e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
+  e_i = b;
+  d_im1 = 64;   /* the polynomial is treated as degree 64; that leading term does not fit in e_im1 */
+  for (d_i = d_im1-1; ((one << d_i) & e_i) == 0; d_i--) ;   /* locate the highest set bit of b */
+  y_i = 1;
+  y_im1 = 0;
+  /* Iterate until the remainder is 1. */
+  while (e_i != 1) {
+    /* Divide the previous remainder by the current one: c_i collects the quotient, e_ip1 the remainder. */
+    e_ip1 = e_im1;
+    d_ip1 = d_im1;
+    c_i = 0;
+
+    while (d_ip1 >= d_i) {
+      c_i ^= (one << (d_ip1 - d_i));
+      e_ip1 ^= (e_i << (d_ip1 - d_i));
+      d_ip1--;
+      if (e_ip1 == 0) return 0;   /* remainder vanished before reaching 1: give up with 0 */
+      while ((e_ip1 & (one << d_ip1)) == 0) d_ip1--;
+    }
+    /* Advance the coefficient recurrence: y_next = y_prev ^ (c_i * y_cur) in the field. */
+    y_ip1 = y_im1 ^ gf->multiply.w64(gf, c_i, y_i);
+    y_im1 = y_i;
+    y_i = y_ip1;
+    /* Slide the remainder window forward for the next round. */
+    e_im1 = e_i;
+    d_im1 = d_i;
+    e_i = e_ip1;
+    d_i = d_ip1;
+  }
+
+  return y_i;
+}
+
+/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm.  I only
+   include it for completeness.  It does have the feature that it requires no
+   extra memory.  
+*/
+
+static
+inline
+gf_val_64_t
+gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
+{
+  uint64_t pl, pr, ppl, ppr, i, a, bl, br, one, lbit;
+  gf_internal_t *h;
+  /* Product of a64 and b64: a 128-bit carry-less product (pl:pr) is built first, then reduced by the primitive polynomial. */
+  h = (gf_internal_t *) gf->scratch;
+  
+  /* Allen: set leading one of primitive polynomial */
+  
+  a = a64;
+  bl = 0;     /* bl:br is b widened to 128 bits; it is shifted left once per bit of a */
+  br = b64;
+  one = 1;
+  lbit = (one << 63);
+
+  pl = 0; /* Allen: left side of product */
+  pr = 0; /* Allen: right side of product */
+
+  /* Allen: unlike the corresponding functions for smaller word sizes,
+   * this loop carries out the initial carryless multiply by
+   * shifting b itself rather than simply looking at successively
+   * higher shifts of b */
+  
+  for (i = 0; i < GF_FIELD_WIDTH; i++) {
+    if (a & (one << i)) {
+      pl ^= bl;
+      pr ^= br;
+    }
+    /* Shift the 128-bit bl:br left by one, carrying the top bit of br into bl. */
+    bl <<= 1;
+    if (br & lbit) bl ^= 1;
+    br <<= 1;
+  }
+
+  /* Allen: the name of the variable "one" is no longer descriptive at this point */
+  /* From here "one" is a single-bit probe walking down pl from bit 62, and ppl:ppr is the polynomial aligned under it. */
+  one = lbit >> 1;
+  ppl = (h->prim_poly >> 2) | one;
+  ppr = (h->prim_poly << (GF_FIELD_WIDTH-2));
+  while (one != 0) {
+    if (pl & one) {
+      pl ^= ppl;
+      pr ^= ppr;
+    }
+    one >>= 1;
+    ppr >>= 1;                   /* shift the 128-bit ppl:ppr right by one */
+    if (ppl & 1) ppr ^= lbit;
+    ppl >>= 1;
+  }
+  return pr;
+}
+
+/*
+ * ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply.
+ */
+
+#if defined(INTEL_SSE4_PCLMUL) 
+
+static
+inline
+gf_val_64_t
+gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
+{
+       gf_val_64_t rv = 0;
+        /* Single-word product of a64 and b64: one carry-less multiply followed by two folds of the upper half. */
+        __m128i         a, b;
+        __m128i         result;
+        __m128i         prim_poly;
+        __m128i         v, w;
+        gf_internal_t * h = gf->scratch;
+
+        a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0);
+        b = _mm_insert_epi64 (a, b64, 0); 
+        prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
+        /* Do the initial multiply */
+   
+        result = _mm_clmulepi64_si128 (a, b, 0);
+        
+        /* Mask off the high order 32 bits using subtraction of the polynomial.
+         * NOTE: this part requires that the polynomial have at least 32 leading 0 bits.
+         */
+
+        /* Adam: We can't include the leading one in the 64 bit pclmul,
+         so we need to split up the high 8 bytes of the result into two 
+         parts before we multiply them with the prim_poly.*/
+        /* First fold: upper half of the product with its low 32-bit lane cleared. */
+        v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+        w = _mm_clmulepi64_si128 (prim_poly, v, 0);
+        result = _mm_xor_si128 (result, w);
+        v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);   /* second fold: the lane above it cleared instead */
+        w = _mm_clmulepi64_si128 (prim_poly, v, 0);
+        result = _mm_xor_si128 (result, w);
+        /* The reduced product is the low 64 bits. */
+        rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
+        return rv;
+}
+#endif
+ 
+#if defined(INTEL_SSE4_PCLMUL) 
+
+static
+inline
+gf_val_64_t
+gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
+{
+  gf_val_64_t rv = 0;
+  /* Single-word product of a64 and b64: one carry-less multiply followed by four folds of the upper half. */
+  __m128i         a, b;
+  __m128i         result;
+  __m128i         prim_poly;
+  __m128i         v, w;
+  gf_internal_t * h = gf->scratch;
+
+  a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0);
+  b = _mm_insert_epi64 (a, b64, 0);
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
+ 
+  /* Do the initial multiply */
+  
+  result = _mm_clmulepi64_si128 (a, b, 0);
+  /* First pair of folds: upper half with its low lane cleared, then with the lane above cleared. */
+  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
+  result = _mm_xor_si128 (result, w);
+  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
+  result = _mm_xor_si128 (result, w);
+  /* Second pair of folds: the same two steps repeated on the updated upper half. */
+  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
+  result = _mm_xor_si128 (result, w);
+  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
+  result = _mm_xor_si128 (result, w);
+  /* The reduced product is the low 64 bits. */
+  rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
+  return rv;
+}
+#endif
+
+
+#if defined(INTEL_SSE4_PCLMUL) 
+  void
+gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  uint8_t *s8, *d8, *dtop;
+  gf_region_data rd;
+  __m128i  v, b, m, prim_poly, c, fr, w, result;
+  /* Region multiply by val using carry-less multiplies, 16 bytes (two 64-bit words) per step. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  /* 16 is the alignment handed to the region helper; the loops below use aligned 128-bit loads and stores. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+  dtop = (uint8_t *) rd.d_top;
+  /* v: val in the low 64 bits.  m: mask of the low 64 bits.  prim_poly: only the low 32 bits are loaded. */
+  v = _mm_insert_epi64(_mm_setzero_si128(), val, 0);
+  m = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+  if (xor) {
+    while (d8 != dtop) {
+      b = _mm_load_si128((__m128i *) s8);
+      result = _mm_clmulepi64_si128 (b, v, 0);                   /* lower source word times val */
+      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);   /* two folds of the upper product half */
+      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+      result = _mm_xor_si128 (result, w);
+      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+      fr = _mm_xor_si128 (result, w);
+      fr = _mm_and_si128 (fr, m);                                /* keep only the reduced low 64 bits */
+      /* Upper source word: same multiply and folds, then moved into the upper 64 bits and merged with fr. */
+      result = _mm_clmulepi64_si128 (b, v, 1);
+      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+      result = _mm_xor_si128 (result, w);
+      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+      result = _mm_xor_si128 (result, w);
+      result = _mm_slli_si128 (result, 8);
+      fr = _mm_xor_si128 (result, fr);
+      result = _mm_load_si128((__m128i *) d8);                   /* xor mode: accumulate into dest */
+      fr = _mm_xor_si128 (result, fr);
+
+      _mm_store_si128((__m128i *) d8, fr);
+      d8 += 16;
+      s8 += 16;
+    }
+  } else {
+    while (d8 < dtop) {   /* same computation as the xor branch, but dest is overwritten */
+      b = _mm_load_si128((__m128i *) s8);
+      result = _mm_clmulepi64_si128 (b, v, 0);
+      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+      result = _mm_xor_si128 (result, w);
+      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+      fr = _mm_xor_si128 (result, w);
+      fr = _mm_and_si128 (fr, m);
+  
+      result = _mm_clmulepi64_si128 (b, v, 1);
+      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+      result = _mm_xor_si128 (result, w);
+      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
+      result = _mm_xor_si128 (result, w);
+      result = _mm_slli_si128 (result, 8);
+      fr = _mm_xor_si128 (result, fr);
+  
+      _mm_store_si128((__m128i *) d8, fr);
+      d8 += 16;
+      s8 += 16;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+void
+gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  struct gf_split_4_64_lazy_data *ld;
+  int i, j, k;
+  uint64_t pp, v, s, *s64, *d64, *top;
+  gf_region_data rd;
+  /* Region multiply by val through sixteen 16-entry tables, one per 4-bit digit of a source word. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  ld = (struct gf_split_4_64_lazy_data *) h->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+  /* The tables are rebuilt only when val differs from the value they were last built for. */
+  if (ld->last_value != val) {
+    v = val;
+    for (i = 0; i < 16; i++) {
+      ld->tables[i][0] = 0;
+      for (j = 1; j < 16; j <<= 1) {
+        for (k = 0; k < j; k++) {
+          ld->tables[i][k^j] = (v ^ ld->tables[i][k]);   /* entry with bit j set = entry without it, plus v */
+        }
+        v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);   /* double v; GF_FIRST_BIT is presumably the top bit -- TODO confirm */
+      }
+    }
+  }
+  ld->last_value = val;
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+  /* Per word: start from dest (xor) or 0, then add one table entry per 4-bit digit until no set bits remain. */
+  while (d64 != top) {
+    v = (xor) ? *d64 : 0;
+    s = *s64;
+    i = 0;
+    while (s != 0) {
+      v ^= ld->tables[i][s&0xf];
+      s >>= 4;
+      i++;
+    }
+    *d64 = v;
+    d64++;
+    s64++;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+inline
+uint64_t
+gf_w64_split_8_8_multiply (gf_t *gf, uint64_t a64, uint64_t b64)
+{
+  uint64_t product, i, j, mask, tb;
+  gf_internal_t *h;
+  struct gf_split_8_8_data *d8;
+  /* Product of a64 and b64 as the XOR of table entries, one per (byte of a, byte of b) pair. */
+  h = (gf_internal_t *) gf->scratch;
+  d8 = (struct gf_split_8_8_data *) h->private;
+  product = 0;
+  mask = 0xff;
+  /* i and j count byte positions in a and b; the table is picked by i+j.  Each loop stops once no set bits remain. */
+  for (i = 0; a64 != 0; i++) {
+    tb = b64;
+    for (j = 0; tb != 0; j++) {
+      product ^= d8->tables[i+j][a64&mask][tb&mask];
+      tb >>= 8;
+    }
+    a64 >>= 8;
+  }
+  return product;
+}
+
+void
+gf_w64_split_8_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  struct gf_split_8_64_lazy_data *ld;
+  int i, j, k;
+  uint64_t pp, v, s, *s64, *d64, *top;
+  gf_region_data rd;
+  /* Region multiply by val through eight 256-entry tables, one per byte of a source word. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  ld = (struct gf_split_8_64_lazy_data *) h->private;
+  /* NOTE(review): alignment 4 is requested although the loop below moves 8-byte words -- confirm against gf_set_region_data. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  gf_do_initial_region_alignment(&rd);
+  /* The tables are rebuilt only when val differs from the value they were last built for. */
+  if (ld->last_value != val) {
+    v = val;
+    for (i = 0; i < 8; i++) {
+      ld->tables[i][0] = 0;
+      for (j = 1; j < 256; j <<= 1) {
+        for (k = 0; k < j; k++) {
+          ld->tables[i][k^j] = (v ^ ld->tables[i][k]);   /* entry with bit j set = entry without it, plus v */
+        }
+        v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);   /* double v; GF_FIRST_BIT is presumably the top bit -- TODO confirm */
+      }
+    }
+  }
+  ld->last_value = val;
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+  /* Per word: start from dest (xor) or 0, then add one table entry per byte until no set bits remain. */
+  while (d64 != top) {
+    v = (xor) ? *d64 : 0;
+    s = *s64;
+    i = 0;
+    while (s != 0) {
+      v ^= ld->tables[i][s&0xff];
+      s >>= 8;
+      i++;
+    }
+    *d64 = v;
+    d64++;
+    s64++;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+void
+gf_w64_split_16_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  struct gf_split_16_64_lazy_data *ld;
+  int i, j, k;
+  uint64_t pp, v, s, *s64, *d64, *top;
+  gf_region_data rd;
+  /* Region multiply by val through four 65536-entry tables, one per 16-bit digit of a source word. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  ld = (struct gf_split_16_64_lazy_data *) h->private;
+  /* NOTE(review): alignment 4 is requested although the loop below moves 8-byte words -- confirm against gf_set_region_data. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  gf_do_initial_region_alignment(&rd);
+  /* The tables are rebuilt only when val differs from the value they were last built for. */
+  if (ld->last_value != val) {
+    v = val;
+    for (i = 0; i < 4; i++) {
+      ld->tables[i][0] = 0;
+      for (j = 1; j < (1<<16); j <<= 1) {
+        for (k = 0; k < j; k++) {
+          ld->tables[i][k^j] = (v ^ ld->tables[i][k]);   /* entry with bit j set = entry without it, plus v */
+        }
+        v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);   /* double v; GF_FIRST_BIT is presumably the top bit -- TODO confirm */
+      }
+    }
+  }
+  ld->last_value = val;
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+  /* Per word: start from dest (xor) or 0, then add one table entry per 16-bit digit until no set bits remain. */
+  while (d64 != top) {
+    v = (xor) ? *d64 : 0;
+    s = *s64;
+    i = 0;
+    while (s != 0) {
+      v ^= ld->tables[i][s&0xffff];
+      s >>= 16;
+      i++;
+    }
+    *d64 = v;
+    d64++;
+    s64++;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+static 
+int gf_w64_shift_init(gf_t *gf)
+{ /* Registers the shift multiplier, the Euclid inverse and the word-at-a-time region routine on gf; always returns 1. */
+  SET_FUNCTION(gf,multiply,w64,gf_w64_shift_multiply)
+  SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
+  SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single)
+  return 1;
+}
+
+static 
+int gf_w64_cfm_init(gf_t *gf)
+{
+  SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
+  SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single)
+  /* The carry-less routines need both compile-time support and the runtime CPU flag; without them 0 is returned. */
+#if defined(INTEL_SSE4_PCLMUL)
+  if (gf_cpu_supports_intel_pclmul) {
+    gf_internal_t *h;
+    /* NOTE(review): these checks admit polynomial bits up to 32 and 48, but the clm routines load only the low 32 bits of prim_poly -- confirm. */
+    h = (gf_internal_t *) gf->scratch;
+
+    if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ 
+      SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
+      SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2) 
+    }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
+      SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
+      SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
+    } else {
+      return 0;   /* polynomial has bits set at position 49 or above: not handled here */
+    }
+    return 1;
+  }
+#endif
+
+  return 0;
+}
+
+static
+void
+gf_w64_group_set_shift_tables(uint64_t *shift, uint64_t val, gf_internal_t *h)
+{
+  /* Fills shift[0 .. 2^arg1 - 1]: shift[n] is the XOR of the successive
+     field doublings of val picked out by the set bits of n. */
+  const uint64_t top_bit = (uint64_t)1 << 63;
+  int g_s = h->arg1;
+  uint64_t bit, low, entries, carry;
+
+  entries = (uint64_t)1 << g_s;
+  shift[0] = 0;
+  for (bit = 1; bit < entries; bit <<= 1) {
+    /* Each index with 'bit' set is an already-filled entry XOR val. */
+    for (low = 0; low < bit; low++) shift[bit|low] = shift[low]^val;
+
+    /* Double val, folding in the primitive polynomial on overflow. */
+    carry = val & top_bit;
+    val <<= 1;
+    if (carry) val ^= h->prim_poly;
+  }
+}
+
+static
+inline
+gf_val_64_t
+gf_w64_group_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
+{
+  uint64_t top, bot, mask, tp;
+  int g_s, g_r, lshift, rshift;
+  struct gf_w64_group_data *gd;
+  /* Product of a and b: rebuild the shift table for b, consume a in g_s-bit groups into a 128-bit top:bot product, then fold top back into bot g_r bits at a time. */
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  g_s = h->arg1;
+  g_r = h->arg2;
+  gd = (struct gf_w64_group_data *) h->private;
+  gf_w64_group_set_shift_tables(gd->shift, b, h);
+
+  mask = (((uint64_t)1 << g_s) - 1);
+  top = 0;
+  bot = gd->shift[a&mask];
+  a >>= g_s;
+
+  if (a == 0) return bot;   /* a fit in one group: nothing spilled past bit 63 */
+  lshift = 0;
+  rshift = 64;
+
+  do {              /* Shifting out is straightforward */
+    lshift += g_s;
+    rshift -= g_s;
+    tp = gd->shift[a&mask];
+    top ^= (tp >> rshift);
+    bot ^= (tp << lshift);
+    a >>= g_s;
+  } while (a != 0);
+
+  /* Reducing is a bit gross, because I don't zero out the index bits of top.
+     The reason is that we throw top away.  On the last pass rshift is 64, and
+     a 64-bit shift by 64 is undefined in C, so that update of top is skipped. */
+
+  lshift = ((lshift-1) / g_r) * g_r;
+  rshift = 64 - lshift;
+  mask = ((uint64_t)1 << g_r) - 1;
+  while (lshift >= 0) {
+    tp = gd->reduce[(top >> lshift) & mask];
+    if (rshift < 64) top ^= (tp >> rshift);
+    bot ^= (tp << lshift);
+    lshift -= g_r;
+    rshift += g_r;
+  }
+
+  return bot;
+}
+
+static
+void gf_w64_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
+{
+  int i, fzb;
+  uint64_t a64, smask, rmask, top, bot, tp;
+  int lshift, rshift, g_s, g_r;
+  gf_region_data rd;
+  uint64_t *s64, *d64, *dtop;
+  struct gf_w64_group_data *gd;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  /* Region multiply by val: the shift table is built once for val, then each source word is multiplied group by group. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gd = (struct gf_w64_group_data *) h->private;
+  g_s = h->arg1;
+  g_r = h->arg2;
+  gf_w64_group_set_shift_tables(gd->shift, val, h);
+  /* val is non-zero here, so this scan for its highest set bit terminates. */
+  for (i = 63; !(val & (1ULL << i)); i--) ;
+  i += g_s;
+
+  /* i is the bit position of the first zero bit in any element of
+                           gd->shift[] */
+
+  if (i > 64) i = 64;
+
+  fzb = i;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+
+  gf_do_initial_region_alignment(&rd);
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  dtop = (uint64_t *) rd.d_top;
+
+  smask = ((uint64_t)1 << g_s) - 1;
+  rmask = ((uint64_t)1 << g_r) - 1;
+
+  while (d64 < dtop) {
+    a64 = *s64;
+    /* top:bot is the 128-bit unreduced product of this word and val. */
+    top = 0;
+    bot = gd->shift[a64&smask];
+    a64 >>= g_s;
+    i = fzb;
+
+    if (a64 != 0) {
+      lshift = 0;
+      rshift = 64;
+
+      do {
+        lshift += g_s;
+        rshift -= g_s;
+        tp = gd->shift[a64&smask];
+        top ^= (tp >> rshift);
+        bot ^= (tp << lshift);
+        a64 >>= g_s;
+      } while (a64 != 0);
+      i += lshift;
+      /* Fold top back into bot g_r bits at a time, starting from the highest group that can hold set bits. */
+      lshift = ((i-64-1) / g_r) * g_r;
+      rshift = 64 - lshift;
+      while (lshift >= 0) {
+        tp = gd->reduce[(top >> lshift) & rmask];
+        if (rshift < 64) top ^= (tp >> rshift);   /* rshift is 64 on the last pass; shifting by 64 is undefined in C and top is not read again */
+        bot ^= (tp << lshift);
+        lshift -= g_r;
+        rshift += g_r;
+      }
+    }
+
+    if (xor) bot ^= *d64;
+    *d64 = bot;
+    d64++;
+    s64++;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+inline
+gf_val_64_t
+gf_w64_group_s_equals_r_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
+{
+  int leftover, rs;
+  uint64_t p, l, ind, a64;
+  int bits_left;
+  int g_s;
+  /* Product of a and b for the case where one group size (arg1) serves both the shift and reduce tables; a is consumed from its top bits down. */
+  struct gf_w64_group_data *gd;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  g_s = h->arg1;
+
+  gd = (struct gf_w64_group_data *) h->private;
+  gf_w64_group_set_shift_tables(gd->shift, b, h);
+  /* The topmost group is 64 % g_s bits wide, or a full g_s bits when g_s divides 64. */
+  leftover = 64 % g_s;
+  if (leftover == 0) leftover = g_s;
+
+  rs = 64 - leftover;
+  a64 = a;
+  ind = a64 >> rs;
+  a64 <<= leftover;
+  p = gd->shift[ind];
+
+  bits_left = rs;
+  rs = 64 - g_s;
+  /* Each round: shift p up by g_s, reduce the g_s bits that fall off (l), and add the table entry for the next group of a. */
+  while (bits_left > 0) {
+    bits_left -= g_s;
+    ind = a64 >> rs;
+    a64 <<= g_s;
+    l = p >> rs;
+    p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s));
+  }
+  return p;
+}
+
+static
+void gf_w64_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
+{
+  int leftover, rs;
+  uint64_t p, l, ind, a64;
+  int bits_left;
+  int g_s;
+  gf_region_data rd;
+  uint64_t *s64, *d64, *top;
+  struct gf_w64_group_data *gd;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  /* Region multiply by val for the single-group-size case; the shift table is built once for val. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gd = (struct gf_w64_group_data *) h->private;
+  g_s = h->arg1;
+  gf_w64_group_set_shift_tables(gd->shift, val, h);
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
+  gf_do_initial_region_alignment(&rd);
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+  /* The topmost group is 64 % g_s bits wide, or a full g_s bits when g_s divides 64. */
+  leftover = 64 % g_s;
+  if (leftover == 0) leftover = g_s;
+
+  while (d64 < top) {
+    rs = 64 - leftover;
+    a64 = *s64;
+    ind = a64 >> rs;
+    a64 <<= leftover;
+    p = gd->shift[ind];
+
+    bits_left = rs;
+    rs = 64 - g_s;
+    /* Each round: shift p up by g_s, reduce the g_s bits that fall off (l), and add the entry for the next group. */
+    while (bits_left > 0) {
+      bits_left -= g_s;
+      ind = a64 >> rs;
+      a64 <<= g_s;
+      l = p >> rs;
+      p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s));
+    }
+    if (xor) p ^= *d64;
+    *d64 = p;
+    d64++;
+    s64++;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+
+static
+int gf_w64_group_init(gf_t *gf)
+{
+  uint64_t i, j, p, index;
+  struct gf_w64_group_data *gd;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  uint64_t g_r, g_s;
+  /* Sets up the group method: lays out the shift and reduce tables, fills the reduce table, installs the routines.  Always returns 1. */
+  g_s = h->arg1;
+  g_r = h->arg2;
+  /* The shift table (2^g_s entries) sits at the start of the private memory; the reduce table follows it. */
+  gd = (struct gf_w64_group_data *) h->private;
+  gd->shift = (uint64_t *) (&(gd->memory));
+  gd->reduce = gd->shift + ((uint64_t)1 << g_s);
+  /* For each combination i of polynomial shifts: p is the low 64 bits of their XOR, index is what that XOR carries above bit 63. */
+  gd->reduce[0] = 0;
+  for (i = 0; i < ((uint64_t)1 << g_r); i++) {
+    p = 0;
+    index = 0;
+    for (j = 0; j < g_r; j++) {
+      if (i & ((uint64_t)1 << j)) {   /* 64-bit one: an int shift would overflow for j >= 31 */
+        p ^= (h->prim_poly << j);
+        index ^= ((uint64_t)1 << j);
+        if (j > 0) index ^= (h->prim_poly >> (64-j));
+      }
+    }
+    gd->reduce[index] = p;
+  }
+
+  if (g_s == g_r) {
+    SET_FUNCTION(gf,multiply,w64,gf_w64_group_s_equals_r_multiply)
+    SET_FUNCTION(gf,multiply_region,w64,gf_w64_group_s_equals_r_multiply_region)
+  } else {
+    SET_FUNCTION(gf,multiply,w64,gf_w64_group_multiply)
+    SET_FUNCTION(gf,multiply_region,w64,gf_w64_group_multiply_region)
+  }
+  SET_FUNCTION(gf,divide,w64,NULL)
+  SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
+
+  return 1;
+}
+
+static
+gf_val_64_t gf_w64_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  /* Words are stored in place, so the index-th word is read
+     straight out of the region.  gf and bytes are not consulted. */
+  const uint64_t *words = (const uint64_t *) start;
+
+  return words[index];
+}
+
+static
+gf_val_64_t gf_w64_composite_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  int sub_size;
+  gf_internal_t *h;
+  uint8_t *r8, *top;
+  uint64_t a, b, *r64;
+  gf_region_data rd;
+  /* Returns word number index of a region.  Outside [d_start, d_top) the word is read in place; inside, it is assembled from two halves. */
+  h = (gf_internal_t *) gf->scratch;
+  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
+  r64 = (uint64_t *) start;
+  if (r64 + index < (uint64_t *) rd.d_start) return r64[index];
+  if (r64 + index >= (uint64_t *) rd.d_top) return r64[index];
+  index -= (((uint64_t *) rd.d_start) - r64);   /* make index relative to d_start */
+  r8 = (uint8_t *) rd.d_start;
+  top = (uint8_t *) rd.d_top;
+  sub_size = (top-r8)/2;   /* the region between d_start and d_top is treated as two equal halves */
+
+  a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index);            /* low 32 bits come from the first half */
+  b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index);   /* high 32 bits come from the second half */
+  return (a | ((uint64_t)b << 32));
+}
+
+static
+gf_val_64_t gf_w64_split_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  int i;
+  uint64_t *r64, rv;
+  uint8_t *r8;
+  gf_region_data rd;
+  /* Returns word number index of a region.  Outside [d_start, d_top) the word is read in place; inside, its 8 bytes are gathered 16 bytes apart. */
+  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 128);
+  r64 = (uint64_t *) start;
+  if (r64 + index < (uint64_t *) rd.d_start) return r64[index];
+  if (r64 + index >= (uint64_t *) rd.d_top) return r64[index];
+  index -= (((uint64_t *) rd.d_start) - r64);   /* make index relative to d_start */
+  r8 = (uint8_t *) rd.d_start;
+  r8 += ((index & 0xfffffff0)*8);   /* start of the 128-byte group holding 16 words */
+  r8 += (index & 0xf);              /* this word's byte column inside the group */
+  r8 += 112;                        /* begin at the last 16-byte row, which supplies the most significant byte */
+  rv =0;
+  for (i = 0; i < 8; i++) {
+    rv <<= 8;
+    rv |= *r8;
+    r8 -= 16;
+  }
+  return rv;
+}
+
+static
+inline
+gf_val_64_t
+gf_w64_bytwo_b_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b)
+{
+  /* Shift-and-add product of a and b: walk a from its low bit upward,
+     adding the current multiple of b for each set bit and doubling b
+     (with reduction by the primitive polynomial) between bits. */
+  const uint64_t high_bit = 0x8000000000000000ULL;
+  const uint64_t poly = ((gf_internal_t *) gf->scratch)->prim_poly;
+  uint64_t acc = 0;
+  uint64_t overflow;
+
+  for (;;) {
+    if (a & 1) acc ^= b;
+    a >>= 1;
+    /* Stop as soon as no set bits remain in a. */
+    if (a == 0) break;
+
+    overflow = b & high_bit;
+    b <<= 1;
+    if (overflow) b ^= poly;
+  }
+  return acc;
+}
+
+static
+inline
+gf_val_64_t
+gf_w64_bytwo_p_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b)
+{
+  /* Shift-and-add product of a and b that walks a from its most
+     significant bit down: the running product is doubled (and reduced
+     by the primitive polynomial when its top bit falls off) before b
+     is added for each set bit of a. */
+  const uint64_t msb = 0x8000000000000000ULL;
+  const gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  const uint64_t poly = info->prim_poly;
+  uint64_t acc = 0;
+  uint64_t probe;
+  uint64_t carry;
+
+  for (probe = msb; probe != 0; probe >>= 1) {
+    /* Double the accumulator, folding in the polynomial on overflow. */
+    carry = acc & msb;
+    acc <<= 1;
+    if (carry) acc ^= poly;
+
+    /* Add b when the bit of a under the probe is set. */
+    if (a & probe) {
+      acc ^= b;
+    }
+  }
+  return acc;
+}
+
+static
+void
+gf_w64_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
+{
+  uint64_t *s64, *d64, ta, prod, amask, pmask, pp;
+  gf_region_data rd;
+  gf_internal_t *h;
+  /* Region multiply by val without SIMD: each word is multiplied by walking val from its top bit down. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+
+  h = (gf_internal_t *) gf->scratch;
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  pmask = 0x80000000;   /* together with the shift on the next line: bit 63 */
+  pmask <<= 32;
+  pp = h->prim_poly;
+  /* The two branches differ only in the final store: XOR into dest versus overwrite. */
+  if (xor) {
+    while (s64 < (uint64_t *) rd.s_top) {
+      prod = 0;
+      amask = pmask;
+      ta = *s64;
+      while (amask != 0) {
+        prod = (prod & pmask) ? ((prod << 1) ^ pp) : (prod << 1);   /* double the running product */
+        if (val & amask) prod ^= ta;                                /* add the source word for a set bit of val */
+        amask >>= 1;
+      }
+      *d64 ^= prod;
+      d64++;
+      s64++;
+    }
+  } else {
+    while (s64 < (uint64_t *) rd.s_top) {
+      prod = 0;
+      amask = pmask;
+      ta = *s64;
+      while (amask != 0) {
+        prod = (prod & pmask) ? ((prod << 1) ^ pp) : (prod << 1);
+        if (val & amask) prod ^= ta;
+        amask >>= 1;
+      }
+      *d64 = prod;
+      d64++;
+      s64++;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+static
+void
+gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
+{
+  uint64_t *s64, *d64, ta, tb, prod, bmask, pp;
+  gf_region_data rd;
+  gf_internal_t *h;
+  /* Region multiply by val without SIMD: each word is multiplied by walking val from its low bit up while doubling the source word. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+
+  h = (gf_internal_t *) gf->scratch;
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  bmask = 0x80000000;   /* together with the shift on the next line: bit 63 */
+  bmask <<= 32;
+  pp = h->prim_poly;
+  /* The two branches differ only in the final store: XOR into dest versus overwrite. */
+  if (xor) {
+    while (s64 < (uint64_t *) rd.s_top) {
+      prod = 0;
+      tb = val;
+      ta = *s64;
+      while (1) {
+        if (tb & 1) prod ^= ta;
+        tb >>= 1;
+        if (tb == 0) break;                                 /* no set bits left in val */
+        ta = (ta & bmask) ? ((ta << 1) ^ pp) : (ta << 1);   /* double the source word */
+      }
+      *d64 ^= prod;
+      d64++;
+      s64++;
+    }
+  } else {
+    while (s64 < (uint64_t *) rd.s_top) {
+      prod = 0;
+      tb = val;
+      ta = *s64;
+      while (1) {
+        if (tb & 1) prod ^= ta;
+        tb >>= 1;
+        if (tb == 0) break;
+        ta = (ta & bmask) ? ((ta << 1) ^ pp) : (ta << 1);
+      }
+      *d64 = prod;
+      d64++;
+      s64++;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+/* SSE_AB2: per 64-bit lane, va = ((va << 1) & m1) ^ (mask & pp), where mask = ((va & m2) << 1) - ((va & m2) >> (GF_FIELD_WIDTH-1)). With m2 set to the lane's top bit, as the callers in this file do, mask is all ones exactly when that bit was set. t1 and t2 are scratch. */
+#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\
+          t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
+          t2 = _mm_and_si128(va, m2); \
+          t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
+          va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
+/* BYTWO_P_ONESTEP: applies SSE_AB2 to prod, XORs ta into prod in lanes where the low bit of v is 0 ((v & one) - one is then all ones), and shifts v right by one. Uses pp, m1, m2, prod, ta, v, one, t1 and t2 from the expansion site. */
+#define BYTWO_P_ONESTEP {\
+      SSE_AB2(pp, m1 ,m2, prod, t1, t2); \
+      t1 = _mm_and_si128(v, one); \
+      t1 = _mm_sub_epi64(t1, one); \
+      t1 = _mm_and_si128(t1, ta); \
+      prod = _mm_xor_si128(prod, t1); \
+      v = _mm_srli_epi64(v, 1); }
+
+/* BYTWO_p region multiply for w=64 using SSE2: dest = (or ^=) src * val, one 16-byte vector (two 64-bit lanes) per loop iteration between rd.d_start and rd.d_top. */
+#ifdef INTEL_SSE2
+void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
+{
+  int i;
+  uint8_t *s8, *d8;
+  uint64_t vrev, one64;
+  uint64_t amask;
+  __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v;
+  gf_region_data rd;
+  gf_internal_t *h;
+  /* Multiplying by 0 or 1 is handed to the generic helpers. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+  /* vrev is val bit-reversed and complemented: bit (63-i) of vrev is 1 exactly when bit i of val is 0, so BYTWO_P_ONESTEP sees val's bits from the top down with 0 meaning "add ta". */
+  h = (gf_internal_t *) gf->scratch;
+  one64 = 1;
+  vrev = 0;
+  for (i = 0; i < 64; i++) {
+    vrev <<= 1;
+    if (!(val & (one64 << i))) vrev |= 1;
+  }
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+  /* m1 = every bit except bit 0, m2 = bit 63 of each lane: the masks SSE_AB2 uses in its doubling step. */
+  amask = -1;
+  amask ^= 1;
+  pp = _mm_set1_epi64x(h->prim_poly);
+  m1 = _mm_set1_epi64x(amask);
+  m2 = _mm_set1_epi64x(one64 << 63);
+  one = _mm_set1_epi64x(1);
+  /* Per vector: prod accumulates the product over 64 unrolled steps; tp is the old dest contents when xor is set, else zero. */
+  while (d8 < (uint8_t *) rd.d_top) {
+    prod = _mm_setzero_si128();
+    v = _mm_set1_epi64x(vrev);
+    ta = _mm_load_si128((__m128i *) s8);
+    tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
+    _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
+    d8 += 16;
+    s8 += 16;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+#ifdef INTEL_SSE2
+/*
+ * dest ^= 2 * src over [rd->d_start, rd->d_top), one 16-byte SSE2 vector
+ * (two 64-bit words) per iteration, using aligned loads and stores.
+ */
+static
+void
+gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd)
+{
+  gf_internal_t *h = (gf_internal_t *) rd->gf->scratch;
+  uint8_t *sp = (uint8_t *) rd->s_start;
+  uint8_t *dp = (uint8_t *) rd->d_start;
+  uint8_t *d_end = (uint8_t *) rd->d_top;
+  uint64_t not_bit0 = ~((uint64_t) 1);
+  uint64_t bit63 = ((uint64_t) 1) << 63;
+  __m128i pp, m1, m2, t1, t2, va, vb;
+
+  /* Constants consumed by SSE_AB2. */
+  pp = _mm_set1_epi64x(h->prim_poly);
+  m1 = _mm_set1_epi64x(not_bit0);
+  m2 = _mm_set1_epi64x(bit63);
+
+  for (; dp < d_end; dp += 16, sp += 16) {
+    va = _mm_load_si128((__m128i *) sp);
+    /* va = 2 * va in each 64-bit lane */
+    SSE_AB2(pp, m1, m2, va, t1, t2);
+    vb = _mm_load_si128((__m128i *) dp);
+    vb = _mm_xor_si128(vb, va);
+    _mm_store_si128((__m128i *) dp, vb);
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+/*
+ * dest = 2 * src over [rd->d_start, rd->d_top), one 16-byte SSE2 vector
+ * (two 64-bit words) per iteration, using aligned loads and stores.
+ */
+static
+void
+gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd)
+{
+  gf_internal_t *h = (gf_internal_t *) rd->gf->scratch;
+  uint8_t *sp = (uint8_t *) rd->s_start;
+  uint8_t *dp = (uint8_t *) rd->d_start;
+  uint8_t *d_end = (uint8_t *) rd->d_top;
+  uint64_t not_bit0 = ~((uint64_t) 1);
+  uint64_t bit63 = ((uint64_t) 1) << 63;
+  __m128i pp, m1, m2, t1, t2, va;
+
+  /* Constants consumed by SSE_AB2. */
+  pp = _mm_set1_epi64x(h->prim_poly);
+  m1 = _mm_set1_epi64x(not_bit0);
+  m2 = _mm_set1_epi64x(bit63);
+
+  for (; dp < d_end; dp += 16, sp += 16) {
+    va = _mm_load_si128((__m128i *) sp);
+    /* va = 2 * va in each 64-bit lane */
+    SSE_AB2(pp, m1, m2, va, t1, t2);
+    _mm_store_si128((__m128i *) dp, va);
+  }
+}
+#endif
+/* BYTWO_b region multiply for w=64 using SSE2: dest = (or ^=) src * val, one 16-byte vector (two 64-bit lanes) per loop iteration between rd.d_start and rd.d_top. */
+#ifdef INTEL_SSE2
+static
+void
+gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
+{
+  uint64_t itb, amask, one64;
+  uint8_t *d8, *s8;
+  __m128i pp, m1, m2, t1, t2, va, vb;
+  gf_region_data rd;
+  gf_internal_t *h;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+  /* val == 2 needs a single doubling per vector and has dedicated loops. */
+  if (val == 2) {
+    if (xor) {
+      gf_w64_bytwo_b_sse_region_2_xor(&rd);
+    } else {
+      gf_w64_bytwo_b_sse_region_2_noxor(&rd);
+    }
+    gf_do_final_region_alignment(&rd);
+    return;
+  }
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+  h = (gf_internal_t *) gf->scratch;
+
+  one64 = 1;
+  amask = -1;
+  amask ^= 1;
+  pp = _mm_set1_epi64x(h->prim_poly);
+  m1 = _mm_set1_epi64x(amask);
+  m2 = _mm_set1_epi64x(one64 << 63);
+  /* Per vector: vb starts as zero or the old dest; val's bits are consumed low to high, XORing va in for each set bit and doubling va (SSE_AB2) between bits. */
+  while (d8 < (uint8_t *) rd.d_top) {
+    va = _mm_load_si128 ((__m128i *)(s8));
+    vb = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8));
+    itb = val;
+    while (1) {
+      if (itb & 1) vb = _mm_xor_si128(vb, va);
+      itb >>= 1;
+      if (itb == 0) break;
+      SSE_AB2(pp, m1, m2, va, t1, t2);
+    }
+    _mm_store_si128((__m128i *)d8, vb);
+    d8 += 16;
+    s8 += 16;
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+/* Installs multiply, multiply_region and inverse for the BYTWO_p / BYTWO_b multiplication types. Returns 0 when GF_REGION_SIMD is set in h->region_type but the non-SSE region routine ends up selected; otherwise returns 1. */
+static
+int gf_w64_bytwo_init(gf_t *gf)
+{
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  /* Any mult_type other than GF_MULT_BYTWO_p takes the BYTWO_b branch below. */
+  if (h->mult_type == GF_MULT_BYTWO_p) {
+    SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply)
+    #ifdef INTEL_SSE2 
+      if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+        SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_sse_multiply_region) 
+      } else {
+    #endif
+        SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region) 
+        if(h->region_type & GF_REGION_SIMD)
+          return 0; /* SIMD was demanded but the non-SSE routine was chosen */
+    #ifdef INTEL_SSE2
+      } 
+    #endif
+  } else {
+    SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_b_multiply)
+    #ifdef INTEL_SSE2 
+      if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+        SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_sse_multiply_region) 
+      } else {
+    #endif
+      SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_nosse_multiply_region) 
+      if(h->region_type & GF_REGION_SIMD)
+        return 0; /* SIMD was demanded but the non-SSE routine was chosen */
+    #ifdef INTEL_SSE2
+      } 
+    #endif
+  }
+  SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
+  return 1;
+}
+
+
+/* (a1*x + a0)(b1*x + b0) over the 32-bit base field, reduced with x^2 = s*x + 1 where s = h->prim_poly. */
+static
+gf_val_64_t
+gf_w64_composite_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint32_t a_lo = (uint32_t) a, a_hi = (uint32_t) (a >> 32);
+  uint32_t b_lo = (uint32_t) b, b_hi = (uint32_t) (b >> 32);
+  uint32_t hi_hi, lo, hi;
+
+  hi_hi = base_gf->multiply.w32(base_gf, a_hi, b_hi);
+  lo = base_gf->multiply.w32(base_gf, a_lo, b_lo) ^ hi_hi;
+  hi = base_gf->multiply.w32(base_gf, a_hi, b_lo) ^ base_gf->multiply.w32(base_gf, a_lo, b_hi);
+  hi ^= base_gf->multiply.w32(base_gf, hi_hi, h->prim_poly);
+  return ((uint64_t) lo) | (((uint64_t) hi) << 32);
+}
+
+/*
+ * Composite field division trick (explained in 2007 tech report)
+ *
+ * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1
+ *
+ * let c = b^-1
+ *
+ * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0)
+ *
+ * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1
+ *
+ * let d = b1c1 and d+1 = b0c0
+ *
+ * solve s*b1c1+b1c0+b0c1 = 0
+ *
+ * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1
+ *
+ * c0 = (d+1)b0^-1
+ * c1 = d*b1^-1
+ *
+ * a / b = a * c
+ */
+
+/*
+ * Inverse of a in the composite field, following the closed form derived in
+ * the comment above; arithmetic on the 32-bit halves a1:a0 uses base_gf.
+ */
+static
+gf_val_64_t
+gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint32_t a0 = a & 0x00000000ffffffff;
+  uint32_t a1 = (a & 0xffffffff00000000) >> 32;
+  uint32_t c0, c1, d, tmp;
+  uint32_t a0inv, a1inv;
+
+  if (a0 == 0) {
+    /* a = a1*x, whose inverse is a1^-1 * (x + s). */
+    a1inv = base_gf->inverse.w32(base_gf, a1);
+    c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
+    c1 = a1inv;
+  } else if (a1 == 0) {
+    c0 = base_gf->inverse.w32(base_gf, a0);
+    c1 = 0;
+  } else {
+    a1inv = base_gf->inverse.w32(base_gf, a1);
+    a0inv = base_gf->inverse.w32(base_gf, a0);
+
+    /* d = (a1/a0) * (a1/a0 + a0/a1 + s)^-1; a1/a0 is computed once and reused. */
+    d = base_gf->multiply.w32(base_gf, a1, a0inv);
+    tmp = d ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly;
+    tmp = base_gf->inverse.w32(base_gf, tmp);
+    d = base_gf->multiply.w32(base_gf, d, tmp);
+    c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv);
+    c1 = base_gf->multiply.w32(base_gf, d, a1inv);
+  }
+
+  return c0 | ((uint64_t)c1 << 32);
+}
+
+/*
+ * Region form of gf_w64_composite_multiply(): each 64-bit word from
+ * rd.s_start is multiplied by val and written (xor == 0) or XORed
+ * (xor != 0) into the matching word of dest, up to rd.d_top.  The region
+ * alignment helpers are not called here; only the span described by rd is
+ * processed.
+ */
+static
+void
+gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint32_t val_lo = val & 0x00000000ffffffff;
+  uint32_t val_hi = (val & 0xffffffff00000000) >> 32;
+  uint64_t *sp, *dp, *d_end;
+  uint64_t w_lo, w_hi, hi_hi, lo, hi;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+
+  sp = rd.s_start;
+  dp = rd.d_start;
+  d_end = rd.d_top;
+
+  for (; dp < d_end; sp++, dp++) {
+    w_lo = *sp & 0x00000000ffffffff;
+    w_hi = (*sp & 0xffffffff00000000) >> 32;
+
+    /* (w_hi*x + w_lo)(val_hi*x + val_lo) reduced with x^2 = s*x + 1. */
+    hi_hi = base_gf->multiply.w32(base_gf, w_hi, val_hi);
+    lo = base_gf->multiply.w32(base_gf, w_lo, val_lo) ^ hi_hi;
+    hi = base_gf->multiply.w32(base_gf, w_hi, val_lo)
+       ^ base_gf->multiply.w32(base_gf, w_lo, val_hi)
+       ^ base_gf->multiply.w32(base_gf, hi_hi, h->prim_poly);
+
+    if (xor) {
+      *dp ^= (lo | (hi << 32));
+    } else {
+      *dp = (lo | (hi << 32));
+    }
+  }
+}
+/* Composite region multiply installed when GF_REGION_ALTMAP is set: the span [rd.d_start, rd.d_top) is treated as two halves of sub_reg_size bytes, and the product is built from five calls to the base field's 32-bit multiply_region. */
+static
+void
+gf_w64_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  gf_val_32_t val0 = val & 0x00000000ffffffff;
+  gf_val_32_t val1 = (val & 0xffffffff00000000) >> 32;
+  uint8_t *slow, *shigh;
+  uint8_t *dlow, *dhigh, *top;
+  int sub_reg_size;
+  gf_region_data rd;
+  /* Overwrite mode: zero all of dest up front. */
+  if (!xor) {
+    memset(dest, 0, bytes);
+  }
+  
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+  /* slow/dlow point at the first half of the span, shigh/dhigh at the second half. */
+  slow = (uint8_t *) rd.s_start;
+  dlow = (uint8_t *) rd.d_start;
+  top = (uint8_t*) rd.d_top;
+  sub_reg_size = (top - dlow)/2;
+  shigh = slow + sub_reg_size;
+  dhigh = dlow + sub_reg_size;
+  /* dlow = val0*slow ^ val1*shigh; dhigh = val1*slow ^ val0*shigh ^ (prim_poly*val1)*shigh. The first write to each half honours xor, the rest accumulate. */
+  base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor);
+  base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1);
+  base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor);
+  base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
+  base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);
+
+  gf_do_final_region_alignment(&rd);
+}
+
+
+
+/* Installs the composite-field multiply, inverse and region routines (divide is left NULL); always returns 1. */
+static
+int gf_w64_composite_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  SET_FUNCTION(gf,multiply,w64,gf_w64_composite_multiply)
+  SET_FUNCTION(gf,divide,w64,NULL)
+  SET_FUNCTION(gf,inverse,w64,gf_w64_composite_inverse)
+
+  if (!(h->region_type & GF_REGION_ALTMAP)) {
+    SET_FUNCTION(gf,multiply_region,w64,gf_w64_composite_multiply_region)
+  } else {
+    SET_FUNCTION(gf,multiply_region,w64,gf_w64_composite_multiply_region_alt)
+  }
+  return 1;
+}
+/* SSSE3 region multiply for the 4/64 split tables on ALTMAP-layout data: dest = (or ^=) src * val, 128 bytes (eight 16-byte vectors) per loop iteration between rd.d_start and rd.d_top. */
+#ifdef INTEL_SSSE3
+static
+  void
+gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  int i, j, k;
+  uint64_t pp, v, *s64, *d64, *top;
+  __m128i si, tables[16][8], p[8], v0, mask1;
+  struct gf_split_4_64_lazy_data *ld;
+  uint8_t btable[16];
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
+  gf_do_initial_region_alignment(&rd);
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+ 
+  ld = (struct gf_split_4_64_lazy_data *) h->private;
+  /* For each of the 16 nibble positions i, ld->tables[i][n] = val * n * 2^(4i) in the field; each table is then sliced into eight 16-entry byte tables, tables[i][j] holding byte j of every entry. The slicing shifts ld->tables down to zero. */
+  v = val;
+  for (i = 0; i < 16; i++) {
+    ld->tables[i][0] = 0;
+    for (j = 1; j < 16; j <<= 1) {
+      for (k = 0; k < j; k++) {
+        ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
+      }
+      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+    }
+    for (j = 0; j < 8; j++) {
+      for (k = 0; k < 16; k++) {
+        btable[k] = (uint8_t) ld->tables[i][k];
+        ld->tables[i][k] >>= 8;
+      }
+      tables[i][j] = _mm_loadu_si128((__m128i *) btable);
+    }
+  }
+
+  mask1 = _mm_set1_epi8(0xf);
+
+  while (d64 != top) {
+    /* p[j] accumulates byte j of the products: start from the old dest when xor is set, else from zero. */
+    if (xor) {
+      for (i = 0; i < 8; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2));
+    } else {
+      for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128();
+    }
+    i = 0;
+    for (k = 0; k < 8; k++) {
+      v0 = _mm_load_si128((__m128i *) s64); 
+      /* MM_PRINT8("v", v0); */
+      s64 += 2;
+      
+      si = _mm_and_si128(v0, mask1);
+  
+      for (j = 0; j < 8; j++) {
+        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
+      }
+      i++;
+      v0 = _mm_srli_epi32(v0, 4);
+      si = _mm_and_si128(v0, mask1);
+      for (j = 0; j < 8; j++) {
+        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
+      }
+      i++;
+    }
+    for (i = 0; i < 8; i++) {
+      /* MM_PRINT8("v", p[i]); */
+      _mm_store_si128((__m128i *) d64, p[i]);
+      d64 += 2;
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+/* SSE4 region multiply for the 4/64 split tables, installed for non-ALTMAP regions: dest = (or ^=) src * val, 128 bytes (sixteen 64-bit words) per loop iteration between rd.d_start and rd.d_top. */
+#ifdef INTEL_SSE4
+static
+  void
+gf_w64_split_4_64_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  int i, j, k;
+  uint64_t pp, v, *s64, *d64, *top;
+  __m128i si, tables[16][8], p[8], st[8], mask1, mask8, mask16, t1;
+  struct gf_split_4_64_lazy_data *ld;
+  uint8_t btable[16];
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
+  gf_do_initial_region_alignment(&rd);
+
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+  top = (uint64_t *) rd.d_top;
+ 
+  ld = (struct gf_split_4_64_lazy_data *) h->private;
+  /* For each of the 16 nibble positions i, ld->tables[i][n] = val * n * 2^(4i) in the field; each table is then sliced into eight 16-entry byte tables, tables[i][j] holding byte j of every entry. The slicing shifts ld->tables down to zero. */
+  v = val;
+  for (i = 0; i < 16; i++) {
+    ld->tables[i][0] = 0;
+    for (j = 1; j < 16; j <<= 1) {
+      for (k = 0; k < j; k++) {
+        ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
+      }
+      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
+    }
+    for (j = 0; j < 8; j++) {
+      for (k = 0; k < 16; k++) {
+        btable[k] = (uint8_t) ld->tables[i][k];
+        ld->tables[i][k] >>= 8;
+      }
+      tables[i][j] = _mm_loadu_si128((__m128i *) btable);
+    }
+  }
+
+  mask1 = _mm_set1_epi8(0xf);
+  mask8 = _mm_set1_epi16(0xff);
+  mask16 = _mm_set1_epi32(0xffff);
+
+  while (d64 != top) {
+    /* p[0..7] accumulate byte j of the products; st[0..7] receive the 128 source bytes. */
+    for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128();
+
+    for (k = 0; k < 8; k++) {
+      st[k]  = _mm_load_si128((__m128i *) s64); 
+      s64 += 2;
+    }
+    /* Three pack passes (32-, 16-, then 8-bit) regroup the source bytes -- presumably so that each st[k] ends up holding one byte position of all sixteen words, as the table lookups below expect; TODO confirm. */
+    for (k = 0; k < 4; k ++) {
+      st[k] = _mm_shuffle_epi32(st[k], _MM_SHUFFLE(3,1,2,0));
+      st[k+4] = _mm_shuffle_epi32(st[k+4], _MM_SHUFFLE(2,0,3,1));
+      t1 = _mm_blend_epi16(st[k], st[k+4], 0xf0);
+      st[k] = _mm_srli_si128(st[k], 8);
+      st[k+4] = _mm_slli_si128(st[k+4], 8);
+      st[k+4] = _mm_blend_epi16(st[k], st[k+4], 0xf0);
+      st[k] = t1;
+    }
+
+/*
+    printf("After pack pass 1\n");
+    for (k = 0; k < 8; k++) {
+      MM_PRINT8("v", st[k]);
+    }
+    printf("\n");
+ */
+    
+    t1 = _mm_packus_epi32(_mm_and_si128(st[0], mask16), _mm_and_si128(st[2], mask16));
+    st[2] = _mm_packus_epi32(_mm_srli_epi32(st[0], 16), _mm_srli_epi32(st[2], 16));
+    st[0] = t1;
+    t1 = _mm_packus_epi32(_mm_and_si128(st[1], mask16), _mm_and_si128(st[3], mask16));
+    st[3] = _mm_packus_epi32(_mm_srli_epi32(st[1], 16), _mm_srli_epi32(st[3], 16));
+    st[1] = t1;
+    t1 = _mm_packus_epi32(_mm_and_si128(st[4], mask16), _mm_and_si128(st[6], mask16));
+    st[6] = _mm_packus_epi32(_mm_srli_epi32(st[4], 16), _mm_srli_epi32(st[6], 16));
+    st[4] = t1;
+    t1 = _mm_packus_epi32(_mm_and_si128(st[5], mask16), _mm_and_si128(st[7], mask16));
+    st[7] = _mm_packus_epi32(_mm_srli_epi32(st[5], 16), _mm_srli_epi32(st[7], 16));
+    st[5] = t1;
+
+/*
+    printf("After pack pass 2\n");
+    for (k = 0; k < 8; k++) {
+      MM_PRINT8("v", st[k]);
+    }
+    printf("\n");
+ */
+    t1 = _mm_packus_epi16(_mm_and_si128(st[0], mask8), _mm_and_si128(st[1], mask8));
+    st[1] = _mm_packus_epi16(_mm_srli_epi16(st[0], 8), _mm_srli_epi16(st[1], 8));
+    st[0] = t1;
+    t1 = _mm_packus_epi16(_mm_and_si128(st[2], mask8), _mm_and_si128(st[3], mask8));
+    st[3] = _mm_packus_epi16(_mm_srli_epi16(st[2], 8), _mm_srli_epi16(st[3], 8));
+    st[2] = t1;
+    t1 = _mm_packus_epi16(_mm_and_si128(st[4], mask8), _mm_and_si128(st[5], mask8));
+    st[5] = _mm_packus_epi16(_mm_srli_epi16(st[4], 8), _mm_srli_epi16(st[5], 8));
+    st[4] = t1;
+    t1 = _mm_packus_epi16(_mm_and_si128(st[6], mask8), _mm_and_si128(st[7], mask8));
+    st[7] = _mm_packus_epi16(_mm_srli_epi16(st[6], 8), _mm_srli_epi16(st[7], 8));
+    st[6] = t1;
+
+/*
+    printf("After final pack pass 2\n");
+    for (k = 0; k < 8; k++) {
+      MM_PRINT8("v", st[k]);
+    }
+ */
+    i = 0;
+    for (k = 0; k < 8; k++) {
+      si = _mm_and_si128(st[k], mask1);
+  
+      for (j = 0; j < 8; j++) {
+        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
+      }
+      i++;
+      st[k] = _mm_srli_epi32(st[k], 4);
+      si = _mm_and_si128(st[k], mask1);
+      for (j = 0; j < 8; j++) {
+        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
+      }
+      i++;
+    }
+    /* Three unpack passes (8-, 16-, then 32-bit) interleave the byte-sliced products in p[] back together before the store. */
+    t1 = _mm_unpacklo_epi8(p[0], p[1]);
+    p[1] = _mm_unpackhi_epi8(p[0], p[1]);
+    p[0] = t1;
+    t1 = _mm_unpacklo_epi8(p[2], p[3]);
+    p[3] = _mm_unpackhi_epi8(p[2], p[3]);
+    p[2] = t1;
+    t1 = _mm_unpacklo_epi8(p[4], p[5]);
+    p[5] = _mm_unpackhi_epi8(p[4], p[5]);
+    p[4] = t1;
+    t1 = _mm_unpacklo_epi8(p[6], p[7]);
+    p[7] = _mm_unpackhi_epi8(p[6], p[7]);
+    p[6] = t1;
+
+/*
+    printf("After unpack pass 1:\n");
+    for (i = 0; i < 8; i++) {
+      MM_PRINT8("v", p[i]);
+    }
+ */
+
+    t1 = _mm_unpacklo_epi16(p[0], p[2]);
+    p[2] = _mm_unpackhi_epi16(p[0], p[2]);
+    p[0] = t1;
+    t1 = _mm_unpacklo_epi16(p[1], p[3]);
+    p[3] = _mm_unpackhi_epi16(p[1], p[3]);
+    p[1] = t1;
+    t1 = _mm_unpacklo_epi16(p[4], p[6]);
+    p[6] = _mm_unpackhi_epi16(p[4], p[6]);
+    p[4] = t1;
+    t1 = _mm_unpacklo_epi16(p[5], p[7]);
+    p[7] = _mm_unpackhi_epi16(p[5], p[7]);
+    p[5] = t1;
+
+/*
+    printf("After unpack pass 2:\n");
+    for (i = 0; i < 8; i++) {
+      MM_PRINT8("v", p[i]);
+    }
+ */
+
+    t1 = _mm_unpacklo_epi32(p[0], p[4]);
+    p[4] = _mm_unpackhi_epi32(p[0], p[4]);
+    p[0] = t1;
+    t1 = _mm_unpacklo_epi32(p[1], p[5]);
+    p[5] = _mm_unpackhi_epi32(p[1], p[5]);
+    p[1] = t1;
+    t1 = _mm_unpacklo_epi32(p[2], p[6]);
+    p[6] = _mm_unpackhi_epi32(p[2], p[6]);
+    p[2] = t1;
+    t1 = _mm_unpacklo_epi32(p[3], p[7]);
+    p[7] = _mm_unpackhi_epi32(p[3], p[7]);
+    p[3] = t1;
+    /* Store the eight result vectors, XORing each with the old dest contents when xor is set. */
+    if (xor) {
+      for (i = 0; i < 8; i++) {
+        t1 = _mm_load_si128((__m128i *) d64);
+        _mm_store_si128((__m128i *) d64, _mm_xor_si128(p[i], t1));
+        d64 += 2;
+      }
+    } else {
+      for (i = 0; i < 8; i++) {
+        _mm_store_si128((__m128i *) d64, p[i]);
+        d64 += 2;
+      }
+    }
+
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+/* Doubles p in the field: shift left and, if GF_FIRST_BIT was set, XOR in h->prim_poly (a gf_internal_t *h must be in scope). Expands to a fully parenthesized expression with no trailing ';', so it is safe inside larger expressions. */
+#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : ((p) << 1))
+/* Installs the routines for the split-table and default multiplication types, choosing region routines from h->arg1/h->arg2, h->region_type and the CPU feature flags. Returns 0 when the requested region options cannot be met (or, in the PCLMUL path, when h->prim_poly passes neither mask test); otherwise returns 1. */
+static
+int gf_w64_split_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_split_4_64_lazy_data *d4;
+  struct gf_split_8_64_lazy_data *d8;
+  struct gf_split_8_8_data *d88;
+  struct gf_split_16_64_lazy_data *d16;
+  uint64_t p, basep;
+  int exp, i, j;
+
+  h = (gf_internal_t *) gf->scratch;
+
+  /* Defaults */
+
+  SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single)
+
+  SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply) 
+  /* With PCLMUL available, carry-free multiply replaces the defaults when SIMD is allowed and one split argument is 64, or when the mult type is the default. */
+#if defined(INTEL_SSE4_PCLMUL) 
+  if (gf_cpu_supports_intel_pclmul) {
+    if ((!(h->region_type & GF_REGION_NOSIMD) &&
+        (h->arg1 == 64 || h->arg2 == 64)) ||
+        h->mult_type == GF_MULT_DEFAULT){
+    /* The variant is picked by which high bits of prim_poly are clear. */
+      if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ 
+        SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
+        SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2) 
+      }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
+        SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
+        SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4) 
+      }else{
+        return 0;
+      }
+    }
+  }
+#endif
+
+  SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
+
+  /* Allen: set region pointers for default mult type. Single pointers are
+   * taken care of above (explicitly for sse, implicitly for no sse). */
+
+  if (h->mult_type == GF_MULT_DEFAULT) {
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+    if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
+      d4 = (struct gf_split_4_64_lazy_data *) h->private;
+      d4->last_value = 0;
+#if defined(INTEL_SSE4)
+      if (gf_cpu_supports_intel_sse4)
+        SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
+#elif defined(ARCH_AARCH64)
+      if (gf_cpu_supports_arm_neon)
+        gf_w64_neon_split_init(gf);
+#endif
+    } else {
+#endif
+      d8 = (struct gf_split_8_64_lazy_data *) h->private;
+      d8->last_value = 0;
+      SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region)
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+    }
+#endif
+  }
+  /* Explicit 4/64 split. ALTMAP needs a SIMD routine: init fails if NOSIMD is also set or no SSSE3/NEON routine is available. */
+  if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) {
+    d4 = (struct gf_split_4_64_lazy_data *) h->private;
+    d4->last_value = 0;
+
+    if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSIMD)) return 0;
+    if(h->region_type & GF_REGION_ALTMAP)
+    {
+      #ifdef INTEL_SSSE3
+        if (gf_cpu_supports_intel_ssse3) {
+          SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_altmap_multiply_region)
+        } else
+      #elif defined(ARCH_AARCH64)
+        if (gf_cpu_supports_arm_neon) {
+          gf_w64_neon_split_init(gf);
+        } else
+      #endif
+        return 0;
+    }
+    else //no altmap
+    {
+      #if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+        if(gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
+          if (h->region_type & GF_REGION_NOSIMD) {
+            SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region)
+          } else
+          #if defined(INTEL_SSE4)
+            SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
+          #elif defined(ARCH_AARCH64)
+            gf_w64_neon_split_init(gf);
+          #endif
+        } else {
+      #endif
+        SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region)
+        if(h->region_type & GF_REGION_SIMD)
+          return 0;
+      #if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+        }
+      #endif
+    }
+  }
+  if ((h->arg1 == 8 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 8)) {
+    d8 = (struct gf_split_8_64_lazy_data *) h->private;
+    d8->last_value = 0;
+    SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region)
+  }
+  if ((h->arg1 == 16 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 16)) {
+    d16 = (struct gf_split_16_64_lazy_data *) h->private;
+    d16->last_value = 0;
+    SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_16_64_lazy_multiply_region)
+  }
+  if ((h->arg1 == 8 && h->arg2 == 8)) {
+    d88 = (struct gf_split_8_8_data *) h->private;
+    SET_FUNCTION(gf,multiply,w64,gf_w64_split_8_8_multiply)
+
+    /* The performance of this guy sucks, so don't bother with a region op */
+    /* Build 15 tables with tables[exp][i][j] = i * j * basep in the field; basep starts at 1 and is doubled eight times after each table. */
+    basep = 1;
+    for (exp = 0; exp < 15; exp++) {
+      for (j = 0; j < 256; j++) d88->tables[exp][0][j] = 0;
+      for (i = 0; i < 256; i++) d88->tables[exp][i][0] = 0;
+      d88->tables[exp][1][1] = basep;
+      for (i = 2; i < 256; i++) {
+        if (i&1) {
+          p = d88->tables[exp][i^1][1];
+          d88->tables[exp][i][1] = p ^ basep;
+        } else {
+          p = d88->tables[exp][i>>1][1];
+          d88->tables[exp][i][1] = GF_MULTBY_TWO(p);
+        }
+      }
+      for (i = 1; i < 256; i++) {
+        p = d88->tables[exp][i][1];
+        for (j = 1; j < 256; j++) {
+          if (j&1) {
+            d88->tables[exp][i][j] = d88->tables[exp][i][j^1] ^ p;
+          } else {
+            d88->tables[exp][i][j] = GF_MULTBY_TWO(d88->tables[exp][i][j>>1]);
+          }
+        }
+      }
+      for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep);
+    }
+  }
+  return 1;
+}
+
+/*
+ * Returns the scratch size in bytes for w=64 with the given multiplication
+ * type and arguments, or 0 for combinations it does not recognize.
+ * region_type and divide_type are not consulted.
+ */
+int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
+{
+  switch(mult_type)
+  {
+    case GF_MULT_SHIFT:
+    case GF_MULT_CARRY_FREE:
+    case GF_MULT_BYTWO_p:
+    case GF_MULT_BYTWO_b:
+      return sizeof(gf_internal_t);
+
+    case GF_MULT_COMPOSITE:
+      if (arg1 != 2) return 0;
+      return sizeof(gf_internal_t) + 64;
+
+    case GF_MULT_GROUP:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w64_group_data) +
+               sizeof(uint64_t) * (1 << arg1) +
+               sizeof(uint64_t) * (1 << arg2) + 64;
+
+    case GF_MULT_DEFAULT:
+      /* Set the *local* arg1 and arg2, just for sizing purposes, then fall
+       * through to the split table code below. */
+
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+      if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
+        arg1 = 64;
+        arg2 = 4;
+      } else {
+#endif
+        arg1 = 64;
+        arg2 = 8;
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+      }
+#endif
+      /* fall through */
+
+    case GF_MULT_SPLIT_TABLE:
+      if (arg1 == 8 && arg2 == 8) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_split_8_8_data) + 64;
+      }
+      if ((arg1 == 16 && arg2 == 64) || (arg2 == 16 && arg1 == 64)) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_split_16_64_lazy_data) + 64;
+      }
+      if ((arg1 == 8 && arg2 == 64) || (arg2 == 8 && arg1 == 64)) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_split_8_64_lazy_data) + 64;
+      }
+      if ((arg1 == 64 && arg2 == 4) || (arg1 == 4 && arg2 == 64)) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_split_4_64_lazy_data) + 64;
+      }
+      return 0;
+
+    default:
+      return 0;
+  }
+}
+
+/*
+ * Initializes gf for w=64: default polynomial, per-mult_type init, then
+ * divide/inverse completion and extract_word.  Returns 1 on success, else 0.
+ */
+int gf_w64_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  int ok;
+
+  /* Default polynomial; the leftmost 1 is omitted, as in w=32. */
+  if (h->prim_poly == 0) {
+    if (h->mult_type != GF_MULT_COMPOSITE) {
+      h->prim_poly = 0x1b;
+    } else {
+      h->prim_poly = gf_composite_get_default_poly(h->base_gf);
+      if (h->prim_poly == 0) return 0; /* This shouldn't happen */
+    }
+  }
+
+  SET_FUNCTION(gf,multiply,w64,NULL)
+  SET_FUNCTION(gf,divide,w64,NULL)
+  SET_FUNCTION(gf,inverse,w64,NULL)
+  SET_FUNCTION(gf,multiply_region,w64,NULL)
+
+  switch(h->mult_type) {
+    case GF_MULT_CARRY_FREE:  ok = gf_w64_cfm_init(gf);       break;
+    case GF_MULT_SHIFT:       ok = gf_w64_shift_init(gf);     break;
+    case GF_MULT_COMPOSITE:   ok = gf_w64_composite_init(gf); break;
+    case GF_MULT_DEFAULT:
+    case GF_MULT_SPLIT_TABLE: ok = gf_w64_split_init(gf);     break;
+    case GF_MULT_GROUP:       ok = gf_w64_group_init(gf);     break;
+    case GF_MULT_BYTWO_p:
+    case GF_MULT_BYTWO_b:     ok = gf_w64_bytwo_init(gf);     break;
+    default:                  ok = 0;                         break;
+  }
+  if (ok == 0) return 0;
+
+  if (h->divide_type == GF_DIVIDE_EUCLID) {
+    SET_FUNCTION(gf,divide,w64,gf_w64_divide_from_inverse)
+    SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
+  }
+
+  /* Derive whichever of divide/inverse is still missing from the other. */
+  if (gf->inverse.w64 != NULL && gf->divide.w64 == NULL) {
+    SET_FUNCTION(gf,divide,w64,gf_w64_divide_from_inverse)
+  }
+  if (gf->inverse.w64 == NULL && gf->divide.w64 != NULL) {
+    SET_FUNCTION(gf,inverse,w64,gf_w64_inverse_from_divide)
+  }
+
+  if (h->region_type == GF_REGION_CAUCHY) return 0;
+
+  if (!(h->region_type & GF_REGION_ALTMAP)) {
+    SET_FUNCTION(gf,extract_word,w64,gf_w64_extract_word)
+  } else if (h->mult_type == GF_MULT_COMPOSITE) {
+    SET_FUNCTION(gf,extract_word,w64,gf_w64_composite_extract_word)
+  } else if (h->mult_type == GF_MULT_SPLIT_TABLE) {
+    SET_FUNCTION(gf,extract_word,w64,gf_w64_split_extract_word)
+  }
+  return 1;
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w8.c b/src/erasure-code/jerasure/gf-complete/src/gf_w8.c
new file mode 100644
index 000000000..f647a31bf
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_w8.c
@@ -0,0 +1,2398 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w8.c
+ *
+ * Routines for 8-bit Galois fields
+ */
+
+#include "gf_int.h"
+#include "gf_w8.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "gf_cpu.h"
+
+#define AB2(ip, am1 ,am2, b, t1, t2) { /* updates b in place; t1 and t2 are caller-supplied scratch */ \
+  t1 = (b << 1) & am1; /* b shifted left one bit, filtered through mask am1 */ \
+  t2 = b & am2; /* bits of b picked out by mask am2 */ \
+  t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); /* each isolated bit at index p becomes GF_FIELD_WIDTH consecutive ones ending at p */ \
+  b = (t1 ^ (t2 & ip));} /* xor ip in wherever t2 has ones; NOTE(review): reads as a packed multiply-by-two with ip a replicated polynomial -- confirm at the call sites */
+
+#define SSE_AB2(pp, m1 ,m2, va, t1, t2) { /* same four steps as AB2, on a __m128i; va is updated in place, t1 and t2 are scratch */ \
+          t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); /* va shifted left one bit within each 64-bit lane, filtered through m1 */ \
+          t2 = _mm_and_si128(va, m2); /* bits of va picked out by m2 */ \
+          t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); /* per 64-bit lane: (t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1)) */ \
+          va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); } /* xor pp in wherever t2 has ones */
+
+#define MM_PRINT(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 2) printf("  %02x %02x", blah[15-ii], blah[14-ii]); printf("\n"); } /* debug aid: prints label s left-justified in 12 columns, then the 16 bytes of r in hex, byte 15 first */
+
+/* Inverse of a, obtained as 1 / a through the divide routine installed on gf. */
+static inline uint32_t gf_w8_inverse_from_divide (gf_t *gf, uint32_t a)
+{
+  const uint32_t one = 1;
+  return gf->divide.w32(gf, one, a);
+}
+
+/* a / b, obtained as a * inverse(b) through the routines installed on gf. */
+static inline uint32_t
+gf_w8_divide_from_inverse (gf_t *gf, uint32_t a, uint32_t b)
+{
+  uint32_t b_inv = gf->inverse.w32(gf, b);
+  return gf->multiply.w32(gf, a, b_inv);
+}
+
+/* Inverse of b by Euclidean division of prim_poly by b, with polynomial
+   (shift/xor) arithmetic.  Returns (uint32_t) -1 for b == 0. */
+static inline uint32_t
+gf_w8_euclid (gf_t *gf, uint32_t b)
+{
+  uint32_t rem_prev, rem_cur, rem_next;   /* successive remainders */
+  uint32_t deg_prev, deg_cur, deg_next;   /* index of each remainder's top bit */
+  uint32_t acc_prev, acc_cur, acc_next;   /* acc_cur is returned when rem_cur hits 1 */
+  uint32_t quot;
+
+  if (b == 0) return -1;
+
+  rem_prev = ((gf_internal_t *) (gf->scratch))->prim_poly;
+  deg_prev = 8;
+  rem_cur = b;
+  deg_cur = deg_prev;
+  while (((1 << deg_cur) & rem_cur) == 0) deg_cur--;
+  acc_prev = 0;
+  acc_cur = 1;
+
+  while (rem_cur != 1) {
+    /* rem_next = rem_prev mod rem_cur; quot collects the quotient bits. */
+    quot = 0;
+    rem_next = rem_prev;
+    deg_next = deg_prev;
+    while (deg_next >= deg_cur) {
+      quot ^= (1 << (deg_next - deg_cur));
+      rem_next ^= (rem_cur << (deg_next - deg_cur));
+      if (rem_next == 0) return 0;   /* divides evenly: give up with 0 */
+      while ((rem_next & (1 << deg_next)) == 0) deg_next--;
+    }
+
+    acc_next = acc_prev ^ gf->multiply.w32(gf, quot, acc_cur);
+
+    /* Shift the three-term window forward by one step. */
+    acc_prev = acc_cur;  acc_cur = acc_next;
+    rem_prev = rem_cur;  rem_cur = rem_next;
+    deg_prev = deg_cur;  deg_cur = deg_next;
+  }
+
+  return acc_cur;
+}
+
+/* Return the byte at position index of the region; gf and bytes are not used. */
+static gf_val_32_t
+gf_w8_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  const uint8_t *region = (const uint8_t *) start;
+
+  return region[index];
+}
+
+/* Word `index` of a composite-field region.  Bytes outside [d_start, d_top) as set
+   by gf_set_region_data come back as stored; inside, one base-field word from
+   each half of that window is combined as low | (high << 4). */
+static gf_val_32_t
+gf_w8_composite_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  uint8_t *region = (uint8_t *) start;
+  uint8_t *win_lo, *win_hi;
+  uint8_t lo_word, hi_word;
+  int half;
+  gf_region_data rd;
+
+  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
+  win_lo = (uint8_t *) rd.d_start;
+  win_hi = (uint8_t *) rd.d_top;
+  if (region + index < win_lo || region + index >= win_hi) return region[index];
+  index -= (win_lo - region);
+  half = (win_hi - win_lo) / 2;
+  lo_word = info->base_gf->extract_word.w32(info->base_gf, win_lo, half, index);
+  hi_word = info->base_gf->extract_word.w32(info->base_gf, win_lo + half, half, index);
+  return (lo_word | (hi_word << 4));
+}
+
+/* Returns what gf_bitmatrix_inverse yields for b, width 8 and this field's prim_poly. */
+static inline uint32_t gf_w8_matrix (gf_t *gf, uint32_t b)
+{
+  gf_internal_t *info = (gf_internal_t *) (gf->scratch);
+  return gf_bitmatrix_inverse(b, 8, info->prim_poly);
+}
+
+
+#if defined(INTEL_SSE4_PCLMUL)
+/* a8 * b8 using the carry-less multiply instruction, followed by two reduction
+   passes against prim_poly.  gf_w8_cfm_init installs this variant when
+   (prim_poly & 0xe0) == 0. */
+static inline gf_val_32_t
+gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
+{
+  gf_internal_t *info = gf->scratch;
+  gf_val_32_t out = 0;
+  __m128i lhs, rhs;
+  __m128i poly;
+  __m128i prod;
+  __m128i fold;
+  int pass;
+
+  /* Both operands sit in the low 32 bits of an otherwise zero register. */
+  lhs = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0);
+  rhs = _mm_insert_epi32 (lhs, b8, 0);
+
+  poly = _mm_set_epi32(0, 0, 0, (uint32_t)(info->prim_poly & 0x1ffULL));
+
+  /* Unreduced carry-less product. */
+
+  prod = _mm_clmulepi64_si128 (lhs, rhs, 0);
+
+  /* Ben: two reduction passes are always sufficient, because (w-2)/z == 2,
+     where z is the number of zeros after the leading 1.
+
+     In each pass _mm_srli_si128 shifts the product right by one byte, leaving
+     its leading bits; those are carry-less multiplied by prim_poly and the
+     outcome is xored back into the product. */
+
+  for (pass = 0; pass < 2; pass++) {
+    fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 1), 0);
+    prod = _mm_xor_si128 (prod, fold);
+  }
+
+  /* The result is the low 32-bit lane. */
+
+  out = ((gf_val_32_t)_mm_extract_epi32(prod, 0));
+
+  return out;
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
+/* a8 * b8 using the carry-less multiply instruction, followed by three
+   reduction passes against prim_poly.  gf_w8_cfm_init installs this variant
+   when (prim_poly & 0xe0) != 0 but (prim_poly & 0xc0) == 0. */
+static inline gf_val_32_t
+gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
+{
+  gf_internal_t *info = gf->scratch;
+  gf_val_32_t out = 0;
+  __m128i lhs, rhs;
+  __m128i poly;
+  __m128i prod;
+  __m128i fold;
+  int pass;
+
+  lhs = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0);
+  rhs = _mm_insert_epi32 (lhs, b8, 0);
+
+  poly = _mm_set_epi32(0, 0, 0, (uint32_t)(info->prim_poly & 0x1ffULL));
+
+  /* Unreduced carry-less product. */
+  prod = _mm_clmulepi64_si128 (lhs, rhs, 0);
+
+  /* Each pass carry-less multiplies the product, shifted right by one byte,
+     by prim_poly and xors the outcome back in. */
+  for (pass = 0; pass < 3; pass++) {
+    fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 1), 0);
+    prod = _mm_xor_si128 (prod, fold);
+  }
+
+  /* The result is the low 32-bit lane. */
+
+  out = ((gf_val_32_t)_mm_extract_epi32(prod, 0));
+
+  return out;
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
+/* a8 * b8 using the carry-less multiply instruction, followed by four
+   reduction passes against prim_poly.  gf_w8_cfm_init installs this variant
+   when (prim_poly & 0xc0) != 0 but (prim_poly & 0x80) == 0. */
+static inline gf_val_32_t
+gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
+{
+  gf_internal_t *info = gf->scratch;
+  gf_val_32_t out = 0;
+  __m128i lhs, rhs;
+  __m128i poly;
+  __m128i prod;
+  __m128i fold;
+  int pass;
+
+  lhs = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0);
+  rhs = _mm_insert_epi32 (lhs, b8, 0);
+
+  poly = _mm_set_epi32(0, 0, 0, (uint32_t)(info->prim_poly & 0x1ffULL));
+
+  /* Unreduced carry-less product. */
+
+  prod = _mm_clmulepi64_si128 (lhs, rhs, 0);
+
+  /* Each pass carry-less multiplies the product, shifted right by one byte,
+     by prim_poly and xors the outcome back in. */
+  for (pass = 0; pass < 4; pass++) {
+    fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 1), 0);
+    prod = _mm_xor_si128 (prod, fold);
+  }
+
+  /* The result is the low 32-bit lane. */
+
+  out = ((gf_val_32_t)_mm_extract_epi32(prod, 0));
+
+  return out;
+}
+#endif
+
+
+/* Region multiply built from the single-element multiply: each source byte is
+   passed through gf->multiply.w32 with val and stored into (or, when xor is
+   set, xored into) the matching destination byte. */
+static void
+gf_w8_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val,
+                                  int bytes, int xor)
+{
+  gf_region_data rd;
+  uint8_t *in;
+  uint8_t *out, *out_end;
+
+  /* val of 0 or 1 is handed to the generic helpers. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
+  gf_do_initial_region_alignment(&rd);
+
+  in = (uint8_t *) rd.s_start;
+  out = (uint8_t *) rd.d_start;
+  out_end = (uint8_t *) rd.d_top;
+
+  if (xor) {
+    for (; out < out_end; out++, in++) {
+      *out ^= gf->multiply.w32(gf, val, *in);
+    }
+  } else {
+    for (; out < out_end; out++, in++) {
+      *out = gf->multiply.w32(gf, val, *in);
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+
+#if defined(INTEL_SSE4_PCLMUL)
+/* Region version of the two-pass carry-less multiply: every byte between
+   rd.d_start and rd.d_top is multiplied by val with PCLMUL, reduced against
+   prim_poly in two passes, and stored into (or xored into) the destination. */
+static void
+gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val,
+                                        int bytes, int xor)
+{
+  gf_region_data rd;
+  gf_internal_t *info = gf->scratch;
+  uint8_t *in;
+  uint8_t *out, *out_end;
+  __m128i factor, word;
+  __m128i poly;
+  __m128i prod;
+  __m128i fold;
+  int pass;
+
+  poly = _mm_set_epi32(0, 0, 0, (uint32_t)(info->prim_poly & 0x1ffULL));
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  /* val goes in the low 32 bits of an otherwise zero register. */
+  factor = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
+  gf_do_initial_region_alignment(&rd);
+
+  in = (uint8_t *) rd.s_start;
+  out = (uint8_t *) rd.d_start;
+  out_end = (uint8_t *) rd.d_top;
+
+  if (xor) {
+    for (; out < out_end; out++, in++) {
+      word = _mm_insert_epi32 (factor, (gf_val_32_t)(*in), 0);
+      prod = _mm_clmulepi64_si128 (factor, word, 0);
+      for (pass = 0; pass < 2; pass++) {
+        fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 1), 0);
+        prod = _mm_xor_si128 (prod, fold);
+      }
+      *out ^= ((gf_val_32_t)_mm_extract_epi32(prod, 0));
+    }
+  } else {
+    for (; out < out_end; out++, in++) {
+      word = _mm_insert_epi32 (factor, (gf_val_32_t)(*in), 0);
+      prod = _mm_clmulepi64_si128 (factor, word, 0);
+      for (pass = 0; pass < 2; pass++) {
+        fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 1), 0);
+        prod = _mm_xor_si128 (prod, fold);
+      }
+      *out = ((gf_val_32_t)_mm_extract_epi32(prod, 0));
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
+/* Region version of the three-pass carry-less multiply: every byte between
+   rd.d_start and rd.d_top is multiplied by val with PCLMUL, reduced against
+   prim_poly in three passes, and stored into (or xored into) the destination.
+   gf_w8_cfm_init installs it together with gf_w8_clm_multiply_3. */
+static void
+gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val,
+                                        int bytes, int xor)
+{
+  gf_region_data rd;
+  gf_internal_t *info = gf->scratch;
+  uint8_t *in;
+  uint8_t *out, *out_end;
+  __m128i factor, word;
+  __m128i poly;
+  __m128i prod;
+  __m128i fold;
+  int pass;
+
+  poly = _mm_set_epi32(0, 0, 0, (uint32_t)(info->prim_poly & 0x1ffULL));
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  /* val goes in the low 32 bits of an otherwise zero register. */
+  factor = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
+  gf_do_initial_region_alignment(&rd);
+
+  in = (uint8_t *) rd.s_start;
+  out = (uint8_t *) rd.d_start;
+  out_end = (uint8_t *) rd.d_top;
+
+  if (xor) {
+    for (; out < out_end; out++, in++) {
+      word = _mm_insert_epi32 (factor, (gf_val_32_t)(*in), 0);
+      prod = _mm_clmulepi64_si128 (factor, word, 0);
+      /* three reduction passes */
+      for (pass = 0; pass < 3; pass++) {
+        fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 1), 0);
+        prod = _mm_xor_si128 (prod, fold);
+      }
+      *out ^= ((gf_val_32_t)_mm_extract_epi32(prod, 0));
+    }
+  } else {
+    for (; out < out_end; out++, in++) {
+      word = _mm_insert_epi32 (factor, (gf_val_32_t)(*in), 0);
+      prod = _mm_clmulepi64_si128 (factor, word, 0);
+      /* three reduction passes */
+      for (pass = 0; pass < 3; pass++) {
+        fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 1), 0);
+        prod = _mm_xor_si128 (prod, fold);
+      }
+      *out = ((gf_val_32_t)_mm_extract_epi32(prod, 0));
+    }
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
+/* Region version of the four-pass carry-less multiply: every byte between
+   rd.d_start and rd.d_top is multiplied by val with PCLMUL, reduced against
+   prim_poly in four passes, and stored into (or xored into) the destination.
+   gf_w8_cfm_init installs it together with gf_w8_clm_multiply_4. */
+static void
+gf_w8_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val,
+                                        int bytes, int xor)
+{
+  gf_region_data rd;
+  gf_internal_t *info = gf->scratch;
+  uint8_t *in;
+  uint8_t *out;
+  uint8_t *out_end;
+  __m128i factor;
+  __m128i word;
+  __m128i poly;
+  __m128i prod;
+  __m128i fold;
+  int pass;
+
+  poly = _mm_set_epi32(0, 0, 0, (uint32_t)(info->prim_poly & 0x1ffULL));
+
+  /* val of 0 or 1 is handed to the generic helpers. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  /* val goes in the low 32 bits of an otherwise zero register. */
+  factor = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
+  gf_do_initial_region_alignment(&rd);
+
+  /* Walk the window left over after the initial alignment step. */
+  in = (uint8_t *) rd.s_start;
+  out = (uint8_t *) rd.d_start;
+  out_end = (uint8_t *) rd.d_top;
+
+  if (xor) {
+    for (; out < out_end; out++, in++) {
+      word = _mm_insert_epi32 (factor, (gf_val_32_t)(*in), 0);
+      prod = _mm_clmulepi64_si128 (factor, word, 0);
+      /* four reduction passes */
+      for (pass = 0; pass < 4; pass++) {
+        fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 1), 0);
+        prod = _mm_xor_si128 (prod, fold);
+      }
+      *out ^= ((gf_val_32_t)_mm_extract_epi32(prod, 0));
+    }
+  } else {
+    for (; out < out_end; out++, in++) {
+      word = _mm_insert_epi32 (factor, (gf_val_32_t)(*in), 0);
+      prod = _mm_clmulepi64_si128 (factor, word, 0);
+      /* four reduction passes */
+      for (pass = 0; pass < 4; pass++) {
+        fold = _mm_clmulepi64_si128 (poly, _mm_srli_si128 (prod, 1), 0);
+        prod = _mm_xor_si128 (prod, fold);
+      }
+      *out = ((gf_val_32_t)_mm_extract_epi32(prod, 0));
+    }
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+/* ------------------------------------------------------------
+IMPLEMENTATION: SHIFT:
+
+JSP: The world's dumbest multiplication algorithm.  I only
+include it for completeness.  It does have the feature that it requires no
+extra memory.  
+ */
+
+/* Shift-and-xor multiply needing no tables.  First the carry-less product of a8
+   and b8 is accumulated; then every set product bit at index i >= GF_FIELD_WIDTH,
+   highest first, is met with prim_poly shifted left by i - GF_FIELD_WIDTH. */
+static inline uint32_t
+gf_w8_shift_multiply (gf_t *gf, uint32_t a8, uint32_t b8)
+{
+  gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  uint16_t lhs = a8;
+  uint16_t rhs = b8;
+  uint16_t poly = info->prim_poly;
+  uint16_t acc = 0;
+  uint16_t bit;
+
+  /* Carry-less product, one bit of lhs at a time. */
+  for (bit = 0; bit < GF_FIELD_WIDTH; bit++) {
+    if (lhs & (1 << bit)) acc ^= (rhs << bit);
+  }
+  /* Reduction, from the highest possible product bit downwards. */
+  for (bit = (GF_FIELD_WIDTH*2-2); bit >= GF_FIELD_WIDTH; bit--) {
+    if (acc & (1 << bit)) acc ^= (poly << (bit-GF_FIELD_WIDTH));
+  }
+  return acc;
+}
+
+/* Install the PCLMUL multiply routines whose pass count suits prim_poly and return 1;
+   otherwise return 0 (on NEON builds, whatever gf_w8_neon_cfm_init returns). */
+static int gf_w8_cfm_init(gf_t *gf)
+{
+#if defined(INTEL_SSE4_PCLMUL)
+  if (gf_cpu_supports_intel_pclmul) {
+    gf_internal_t *info = (gf_internal_t *) gf->scratch;
+    if ((info->prim_poly & 0xe0) == 0) {
+      SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_2)
+      SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_2)
+      return 1;
+    }
+    if ((info->prim_poly & 0xc0) == 0) {
+      SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_3)
+      SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_3)
+      return 1;
+    }
+    if ((info->prim_poly & 0x80) == 0) {
+      SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_4)
+      SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_4)
+      return 1;
+    }
+    return 0;
+  }
+#elif defined(ARM_NEON)
+  if (gf_cpu_supports_arm_neon) {
+    return gf_w8_neon_cfm_init(gf);
+  }
+#endif
+
+  return 0;
+}
+
+/* Install the shift multiply; the other routines will be set automatically. */
+static int gf_w8_shift_init(gf_t *gf)
+{
+  SET_FUNCTION(gf,multiply,w32,gf_w8_shift_multiply)
+  return 1;
+}
+
+/* ------------------------------------------------------------
+IMPLEMENTATION: LOG_TABLE:
+
+JSP: Kevin wrote this, and I'm converting it to my structure.
+*/
+
+/* a * b as antilog[log a + log b].  No zero test is made: gf_w8_log_init gives
+   log[0] a value (512) whose sums land on antilog entries it has zeroed. */
+static inline uint32_t
+gf_w8_logzero_multiply (gf_t *gf, uint32_t a, uint32_t b)
+{
+  gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  struct gf_w8_logzero_table_data *tbl = (struct gf_w8_logzero_table_data *) info->private;
+
+  return tbl->antilog_tbl[tbl->log_tbl[a] + tbl->log_tbl[b]];
+}
+
+/* a / b as div_tbl[log a - log b].  gf_w8_log_init points div_tbl 255 entries
+   into the antilog table so that a negative difference can serve as an index. */
+static inline uint32_t
+gf_w8_logzero_divide (gf_t *gf, uint32_t a, uint32_t b)
+{
+  gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  struct gf_w8_logzero_table_data *tbl = (struct gf_w8_logzero_table_data *) info->private;
+
+  return tbl->div_tbl[tbl->log_tbl[a] - tbl->log_tbl[b]];
+}
+
+/* a * b from the compact zero-aware log tables.  b == 0 is answered directly;
+   a == 0 is left to the tables (gf_w8_log_init sets log[0] to 510 and zeroes
+   antilog entries 510..764). */
+static inline uint32_t
+gf_w8_logzero_small_multiply (gf_t *gf, uint32_t a, uint32_t b)
+{
+  gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  struct gf_w8_logzero_small_table_data *tbl = (struct gf_w8_logzero_small_table_data *) info->private;
+
+  return (b == 0) ? 0 : tbl->antilog_tbl[tbl->log_tbl[a] + tbl->log_tbl[b]];
+}
+
+/* a / b as div_tbl[log a - log b]; gf_w8_log_init points div_tbl 255 entries into
+   antilog_tbl.  NOTE(review): 0 / 1 indexes antilog_tbl[255 + 510], one past the
+   range 510..764 zeroed by gf_w8_log_init -- confirm that byte is 0. */
+static inline uint32_t
+gf_w8_logzero_small_divide (gf_t *gf, uint32_t a, uint32_t b)
+{
+  gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  struct gf_w8_logzero_small_table_data *tbl = (struct gf_w8_logzero_small_table_data *) info->private;
+  return tbl->div_tbl[tbl->log_tbl[a] - tbl->log_tbl[b]];
+}
+
+/* a * b as antilog[log a + log b]; a zero operand short-circuits to 0. */
+static inline uint32_t
+gf_w8_log_multiply (gf_t *gf, uint32_t a, uint32_t b)
+{
+  gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  struct gf_w8_logtable_data *tbl = (struct gf_w8_logtable_data *) info->private;
+
+  if (a == 0 || b == 0) return 0;
+  return tbl->antilog_tbl[(unsigned)(tbl->log_tbl[a] + tbl->log_tbl[b])];
+}
+
+/* a / b as antilog[log a - log b + GF_MULT_GROUP_SIZE]; the added constant
+   offsets a difference that may be negative.  Returns 0 when either operand
+   is 0, so a division by zero is reported as 0 rather than as an error. */
+static inline uint32_t
+gf_w8_log_divide (gf_t *gf, uint32_t a, uint32_t b)
+{
+  struct gf_w8_logtable_data *tbl;
+  int idx;
+
+  if (a == 0 || b == 0) return 0;
+  tbl = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+  idx = tbl->log_tbl[a] - tbl->log_tbl[b] + (GF_MULT_GROUP_SIZE);
+  return (tbl->antilog_tbl[idx]);
+}
+
+/* Inverse of a, read from inv_tbl (gf_w8_log_init stores 0 in entry 0). */
+static uint32_t
+gf_w8_log_inverse (gf_t *gf, uint32_t a)
+{
+  gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  struct gf_w8_logtable_data *tbl = (struct gf_w8_logtable_data *) info->private;
+
+  return (tbl->inv_tbl[a]);
+}
+
+/* Inverse of a, read from the inv_tbl of the zero-aware log tables. */
+static uint32_t
+gf_w8_logzero_inverse (gf_t *gf, uint32_t a)
+{
+  gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  struct gf_w8_logzero_table_data *tbl = (struct gf_w8_logzero_table_data *) info->private;
+
+  return (tbl->inv_tbl[a]);
+}
+
+/* Inverse of a, read from the inv_tbl of the compact zero-aware log tables. */
+static uint32_t
+gf_w8_logzero_small_inverse (gf_t *gf, uint32_t a)
+{
+  gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  struct gf_w8_logzero_small_table_data *tbl = (struct gf_w8_logzero_small_table_data *) info->private;
+
+  return (tbl->inv_tbl[a]);
+}
+
+/* Multiply `bytes` source bytes by val with the log tables, storing the products
+   in dest or, when xor is set, xoring them into dest.  log(val) is fetched
+   once; a zero source byte contributes 0 without touching the tables. */
+static void
+gf_w8_log_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  struct gf_w8_logtable_data *tbl;
+  uint8_t *in = (uint8_t *) src;
+  uint8_t *out = (uint8_t *) dest;
+  uint8_t log_val;
+  int k;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  tbl = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
+  log_val = tbl->log_tbl[val];
+
+  if (xor) {
+    for (k = bytes; k > 0; k--, in++, out++) {
+      *out ^= (*in == 0 ? 0 : tbl->antilog_tbl[log_val + tbl->log_tbl[*in]]);
+    }
+  } else {
+    for (k = bytes; k > 0; k--, in++, out++) {
+      *out = (*in == 0 ? 0 : tbl->antilog_tbl[log_val + tbl->log_tbl[*in]]);
+    }
+  }
+}
+
+/* Region multiply for the zero-aware log tables; zero source bytes go through
+   log[0] like any other value.  arg1 == 1 selects the small layout here, while
+   gf_w8_log_init keys on mult_type -- NOTE(review): confirm the two agree. */
+static void
+gf_w8_logzero_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
+{
+  gf_internal_t *info;
+  struct gf_w8_logzero_small_table_data *compact;
+  struct gf_w8_logzero_table_data *wide;
+  short *log_tbl;
+  uint8_t *antilog;
+  uint8_t *in = (uint8_t *) src;
+  uint8_t *out = (uint8_t *) dest;
+  uint8_t log_val;
+  int k;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  info = (gf_internal_t *) gf->scratch;
+  if (info->arg1 == 1) {
+    compact = (struct gf_w8_logzero_small_table_data *) info->private;
+    log_tbl = compact->log_tbl;
+    antilog = compact->antilog_tbl;
+  } else {
+    wide = (struct gf_w8_logzero_table_data *) info->private;
+    log_tbl = wide->log_tbl;
+    antilog = wide->antilog_tbl;
+  }
+
+  log_val = log_tbl[val];
+
+  if (xor) {
+    for (k = bytes; k > 0; k--, in++, out++) {
+      *out ^= (antilog[log_val + log_tbl[*in]]);
+    }
+  } else {
+    for (k = bytes; k > 0; k--, in++, out++) {
+      *out = (antilog[log_val + log_tbl[*in]]);
+    }
+  }
+}
+
+/* Build the log, antilog and inverse tables for the log-based multiply types
+   and install the matching routines.  Returns 1 on success; returns 0 with
+   _gf_errno = GF_E_LOGPOLY when the powers of 2 under prim_poly repeat before
+   the whole table is filled. */
+static int gf_w8_log_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_w8_logtable_data *ltd = NULL;
+  struct gf_w8_logzero_table_data *ztd = NULL;
+  struct gf_w8_logzero_small_table_data *std = NULL;
+  uint8_t *alt;
+  uint8_t *inv;
+  int i, b;
+  int check = 0;
+
+  h = (gf_internal_t *) gf->scratch;
+
+  /* Select the table layout, clear its log table and store the log[0]
+     sentinel.  div_tbl starts 255 entries into the antilog table so that it
+     can be indexed by a negative (log a - log b). */
+  if (h->mult_type == GF_MULT_LOG_TABLE) {
+    ltd = h->private;
+    alt = ltd->antilog_tbl;
+    inv = ltd->inv_tbl;
+    for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++) ltd->log_tbl[i] = 0;
+  } else if (h->mult_type == GF_MULT_LOG_ZERO) {
+    std = h->private;
+    alt = std->antilog_tbl;
+    std->div_tbl = (alt + 255);
+    inv = std->inv_tbl;
+    for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++) std->log_tbl[i] = 0;
+    std->log_tbl[0] = 510;
+  } else {
+    ztd = h->private;
+    alt = ztd->antilog_tbl;
+    ztd->inv_tbl = (alt + 512 + 256);
+    ztd->div_tbl = (alt + 255);
+    inv = ztd->inv_tbl;
+    for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++) ztd->log_tbl[i] = 0;
+    ztd->log_tbl[0] = 512;
+  }
+
+  /* Walk b = 2^i.  Meeting a log entry that is already non-zero means the
+     walk looped early, so prim_poly cannot be used for log tables. */
+  b = 1;
+  for (i = 0; i < GF_MULT_GROUP_SIZE; i++) {
+    if (h->mult_type == GF_MULT_LOG_TABLE) {
+      if (ltd->log_tbl[b] != 0) check = 1;
+      ltd->log_tbl[b] = i;
+    } else if (h->mult_type == GF_MULT_LOG_ZERO) {
+      if (std->log_tbl[b] != 0) check = 1;
+      std->log_tbl[b] = i;
+    } else {
+      if (ztd->log_tbl[b] != 0) check = 1;
+      ztd->log_tbl[b] = i;
+    }
+    /* Two back-to-back copies, so a sum of two logs indexes without a modulo. */
+    alt[i] = b;
+    alt[i+GF_MULT_GROUP_SIZE] = b;
+    b <<= 1;
+    if (b & GF_FIELD_SIZE) b = b ^ h->prim_poly;
+  }
+  if (check) {
+    _gf_errno = GF_E_LOGPOLY;
+    return 0;
+  }
+
+  /* Zero the antilog entries that are reached when an operand is 0. */
+  if (h->mult_type == GF_MULT_LOG_ZERO) bzero(alt+510, 255);
+  if (h->mult_type == GF_MULT_LOG_ZERO_EXT) {
+    /* 256 bytes rather than 255: gf_w8_logzero_divide(0, 1) reads
+       div_tbl[512 - 0], i.e. alt[767], which has to be 0 too.  inv_tbl begins
+       at alt[768] and is filled only afterwards, so nothing is clobbered. */
+    bzero(alt+512, 256);
+    alt[512+512] = 0;
+  }
+
+  /* inv[2^k] = 2^(GF_MULT_GROUP_SIZE - k), read from the antilog table. */
+  inv[0] = 0;  /* Not really, but we need to fill it with something  */
+  i = 1;
+  b = GF_MULT_GROUP_SIZE;
+  do {
+    inv[i] = alt[b];
+    i <<= 1;
+    if (i & (1 << 8)) i ^= h->prim_poly;
+    b--;
+  } while (i != 1);
+
+  if (h->mult_type == GF_MULT_LOG_TABLE) {
+    SET_FUNCTION(gf,inverse,w32,gf_w8_log_inverse)
+    SET_FUNCTION(gf,divide,w32,gf_w8_log_divide)
+    SET_FUNCTION(gf,multiply,w32,gf_w8_log_multiply)
+    SET_FUNCTION(gf,multiply_region,w32,gf_w8_log_multiply_region)
+  } else if (h->mult_type == GF_MULT_LOG_ZERO) {
+    SET_FUNCTION(gf,inverse,w32,gf_w8_logzero_small_inverse)
+    SET_FUNCTION(gf,divide,w32,gf_w8_logzero_small_divide)
+    SET_FUNCTION(gf,multiply,w32,gf_w8_logzero_small_multiply)
+    SET_FUNCTION(gf,multiply_region,w32,gf_w8_logzero_multiply_region)
+  } else {
+    SET_FUNCTION(gf,inverse,w32,gf_w8_logzero_inverse)
+    SET_FUNCTION(gf,divide,w32,gf_w8_logzero_divide)
+    SET_FUNCTION(gf,multiply,w32,gf_w8_logzero_multiply)
+    SET_FUNCTION(gf,multiply_region,w32,gf_w8_logzero_multiply_region)
+  }
+  return 1;
+}
+
+/* ------------------------------------------------------------
+IMPLEMENTATION: FULL_TABLE:
+
+JSP: Kevin wrote this, and I'm converting it to my structure.
+ */
+
+/* a * b, read from entry [a][b] of the full multiplication table. */
+static gf_val_32_t
+gf_w8_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  struct gf_w8_single_table_data *tbl = (struct gf_w8_single_table_data *) info->private;
+
+  return (tbl->multtable[a][b]);
+}
+
+/* a / b, read from entry [a][b] of the full division table. */
+static gf_val_32_t
+gf_w8_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  struct gf_w8_single_table_data *tbl = (struct gf_w8_single_table_data *) info->private;
+
+  return (tbl->divtable[a][b]);
+}
+
+/* a * b, read from entry [a][b] of the multtable kept in gf_w8_default_data. */
+static gf_val_32_t
+gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  struct gf_w8_default_data *tbl = (struct gf_w8_default_data *) info->private;
+
+  return (tbl->multtable[a][b]);
+}
+
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
+/* a / b, read from entry [a][b] of the divtable kept in gf_w8_default_data. */
+static gf_val_32_t
+gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  struct gf_w8_default_data *tbl = (struct gf_w8_default_data *) info->private;
+
+  return (tbl->divtable[a][b]);
+}
+#endif
+
+/* Returns entry [a][b] of the double table's mult array. */
+static gf_val_32_t
+gf_w8_double_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  struct gf_w8_double_table_data *tbl = (struct gf_w8_double_table_data *) info->private;
+
+  return (tbl->mult[a][b]);
+}
+
+/* a / b, read from entry [a][b] of the double table's div array. */
+static gf_val_32_t
+gf_w8_double_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  struct gf_w8_double_table_data *tbl = (struct gf_w8_double_table_data *) info->private;
+
+  return (tbl->div[a][b]);
+}
+
+/* Region multiply through gf_two_byte_region_table_multiply.  With GF_REGION_LAZY the
+   table for val is generated here from smult[val]; otherwise row mult[val] is used. */
+static void
+gf_w8_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_internal_t *info;
+  struct gf_w8_double_table_data *full;
+  struct gf_w8_double_table_lazy_data *lazy;
+  uint16_t *pair_tbl;
+  uint32_t hi, lo, hi_part;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  info = (gf_internal_t *) (gf->scratch);
+  if (info->region_type & GF_REGION_LAZY) {
+    lazy = (struct gf_w8_double_table_lazy_data *) info->private;
+    pair_tbl = lazy->mult;
+    /* Entry (hi << 8) | lo = smult[val][hi] in the top byte, smult[val][lo] below. */
+    for (hi = 0; hi < GF_FIELD_SIZE; hi++) {
+      hi_part = (lazy->smult[val][hi] << 8);
+      for (lo = 0; lo < GF_FIELD_SIZE; lo++) {
+        pair_tbl[(hi << 8)| lo] = (hi_part | lazy->smult[val][lo]);
+      }
+    }
+  } else {
+    full = (struct gf_w8_double_table_data *) info->private;
+    pair_tbl = &(full->mult[val][0]);
+  }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+  gf_two_byte_region_table_multiply(&rd, pair_tbl);
+  gf_do_final_region_alignment(&rd);
+}
+
+/* a * b, read from entry [a][b] of the lazy double table's smult array. */
+static gf_val_32_t
+gf_w8_double_table_lazy_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  struct gf_w8_double_table_lazy_data *tbl = (struct gf_w8_double_table_lazy_data *) info->private;
+
+  return (tbl->smult[a][b]);
+}
+
+/* a / b, read from entry [a][b] of the lazy double table's div array. */
+static gf_val_32_t
+gf_w8_double_table_lazy_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  struct gf_w8_double_table_lazy_data *tbl = (struct gf_w8_double_table_lazy_data *) info->private;
+
+  return (tbl->div[a][b]);
+}
+
+/* Multiply `bytes` source bytes by val through the full table, entry
+   multtable[source byte][val], storing into dest or xoring into it. */
+static void
+gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  struct gf_w8_single_table_data *tbl;
+  uint8_t *in = (uint8_t *) src;
+  uint8_t *out = (uint8_t *) dest;
+  int k;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  tbl = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
+
+  if (xor) {
+    for (k = bytes; k > 0; k--, in++, out++) {
+      *out ^= tbl->multtable[*in][val];
+    }
+  } else {
+    for (k = bytes; k > 0; k--, in++, out++) {
+      *out = tbl->multtable[*in][val];
+    }
+  }
+}
+
+#ifdef INTEL_SSSE3
+/* SSSE3 region multiply by val, 16 bytes per step.  The 16 bytes at high[val]
+   and at low[val] are loaded into registers and used as byte-shuffle tables,
+   indexed by the high and the low nibble of every source byte; the two
+   lookups are xored together. */
+static void
+gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  struct gf_w8_half_table_data *half;
+  uint8_t *hi_row, *lo_row;
+  uint8_t *in, *out;
+  __m128i nib_mask, hi_tbl, lo_tbl;
+  __m128i data, nib, prod;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  half = (struct gf_w8_half_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  /* Row val of each table starts val * 16 bytes in. */
+  hi_row = ((uint8_t *) half->high) + (val << 4);
+  lo_row = ((uint8_t *) half->low) + (val << 4);
+  hi_tbl = _mm_loadu_si128 ((__m128i *)(hi_row));
+  lo_tbl = _mm_loadu_si128 ((__m128i *)(lo_row));
+  nib_mask = _mm_set1_epi8 (0x0f);
+
+  /* _mm_load_si128 / _mm_store_si128 below require 16-byte aligned pointers. */
+  in = rd.s_start;
+  out = rd.d_start;
+
+  if (xor) {
+    for (; in < (uint8_t *) rd.s_top; in += 16, out += 16) {
+      /* low nibbles index lo_tbl, high nibbles index hi_tbl */
+      data = _mm_load_si128 ((__m128i *)(in));
+      nib = _mm_and_si128 (nib_mask, data);
+      prod = _mm_shuffle_epi8 (lo_tbl, nib);
+      nib = _mm_and_si128 (nib_mask, _mm_srli_epi64 (data, 4));
+      prod = _mm_xor_si128 (prod, _mm_shuffle_epi8 (hi_tbl, nib));
+      /* fold in what dest already holds */
+      prod = _mm_xor_si128 (prod, _mm_load_si128 ((__m128i *)(out)));
+      _mm_store_si128 ((__m128i *)(out), prod);
+    }
+  } else {
+    for (; in < (uint8_t *) rd.s_top; in += 16, out += 16) {
+      /* low nibbles index lo_tbl, high nibbles index hi_tbl */
+      data = _mm_load_si128 ((__m128i *)(in));
+      nib = _mm_and_si128 (nib_mask, data);
+      prod = _mm_shuffle_epi8 (lo_tbl, nib);
+      nib = _mm_and_si128 (nib_mask, _mm_srli_epi64 (data, 4));
+      prod = _mm_xor_si128 (prod, _mm_shuffle_epi8 (hi_tbl, nib));
+      _mm_store_si128 ((__m128i *)(out), prod);
+    }
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+
+/* ------------------------------------------------------------
+IMPLEMENTATION: SPLIT_TABLE (half tables indexed by the high and low nibble):
+ */
+
+/* a * b from the half tables: high[b][top nibble of a] xor low[b][bottom nibble of a]. */
+static gf_val_32_t
+gf_w8_split_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *info = (gf_internal_t *) gf->scratch;
+  struct gf_w8_half_table_data *half = (struct gf_w8_half_table_data *) info->private;
+
+  return half->high[b][a>>4] ^ half->low[b][a&0xf];
+}
+
+/* Multiply `bytes` source bytes by val with the half tables (one lookup per
+   nibble, xored together), storing into dest or xoring into it. */
+static void
+gf_w8_split_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  struct gf_w8_half_table_data *half;
+  uint8_t *in = (uint8_t *) src;
+  uint8_t *out = (uint8_t *) dest;
+  int k;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  half = (struct gf_w8_half_table_data *) ((gf_internal_t *) gf->scratch)->private;
+
+  if (xor) {
+    for (k = bytes; k > 0; k--, in++, out++) {
+      *out ^= (half->high[val][(*in)>>4] ^ half->low[val][(*in)&0xf]);
+    }
+  } else {
+    for (k = bytes; k > 0; k--, in++, out++) {
+      *out = (half->high[val][(*in)>>4] ^ half->low[val][(*in)&0xf]);
+    }
+  }
+}
+
+
+  static
+int gf_w8_split_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_w8_half_table_data *htd;
+  int a, b;
+  /* Fills the two half tables and installs the split multiply routines.  Returns 1, or 0 when GF_REGION_SIMD was requested but the scalar region routine got installed. */
+  h = (gf_internal_t *) gf->scratch;
+  htd = (struct gf_w8_half_table_data *)h->private;
+  /* Both tables are cleared first and the loops below start at 1, so every entry with a == 0 or b == 0 stays 0. */
+  bzero(htd->high, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE);
+  bzero(htd->low, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE);
+  /* low[a][b] = a * b and high[a][b] = a * (b << 4), for b below GF_HALF_SIZE. */
+  for (a = 1; a < GF_FIELD_SIZE; a++) {
+    for (b = 1; b < GF_HALF_SIZE; b++) {
+      htd->low[a][b] = gf_w8_shift_multiply(gf,a,b);
+      htd->high[a][b] = gf_w8_shift_multiply(gf,a,b<<4);
+    }
+  }
+
+  SET_FUNCTION(gf,multiply,w32,gf_w8_split_multiply)
+  /* Region routine: the SIMD one when compiled in, supported by the CPU and not disabled by GF_REGION_NOSIMD.  The braces of the if/else below are split across the #if blocks. */
+  #if defined(INTEL_SSSE3)
+    if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) {
+      SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse)
+    } else {
+  #elif defined(ARM_NEON)
+    if (gf_cpu_supports_arm_neon && !(h->region_type & GF_REGION_NOSIMD)) {
+      gf_w8_neon_split_init(gf);
+    } else {
+  #endif
+    SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region) /* scalar fallback; unconditional when neither SIMD macro is defined */
+    if(h->region_type & GF_REGION_SIMD)
+      return 0; /* SIMD was explicitly requested but is not what was installed */
+  #if defined(INTEL_SSSE3) || defined(ARM_NEON)
+    }
+  #endif
+
+  return 1;
+}
+
+/* JSP: This is disgusting, but it is what it is.  If there is no SSE (or
+   NEON), then the default is equivalent to single table.  If there is, we
+   use the "gf_w8_default_data", which is a hybrid of SPLIT & TABLE.
+   Returns 1 on success and 0 for an unhandled mult_type/region_type. */
+static int gf_w8_table_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_w8_single_table_data *ftd = NULL;
+  struct gf_w8_double_table_data *dtd = NULL;
+  struct gf_w8_double_table_lazy_data *ltd = NULL;
+  struct gf_w8_default_data *dd = NULL;
+  int a, b, c, prod, scase;
+
+  h = (gf_internal_t *) gf->scratch;
+  /* Select the table layout (scase) and zero it; the fill loops start at 1, so every entry involving 0 stays 0. */
+  if (h->mult_type == GF_MULT_DEFAULT &&
+      (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) {
+    dd = (struct gf_w8_default_data *)h->private;
+    scase = 3;
+    bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
+    bzero(dd->low, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
+    bzero(dd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+    bzero(dd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+  } else if (h->mult_type == GF_MULT_DEFAULT ||
+             h->region_type == 0 || (h->region_type & GF_REGION_CAUCHY)) {
+    ftd = (struct gf_w8_single_table_data *)h->private;
+    bzero(ftd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+    bzero(ftd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+    scase = 0;
+  } else if (h->region_type == GF_REGION_DOUBLE_TABLE) {
+    dtd = (struct gf_w8_double_table_data *)h->private;
+    bzero(dtd->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+    bzero(dtd->mult, sizeof(uint16_t) * GF_FIELD_SIZE * GF_FIELD_SIZE * GF_FIELD_SIZE);
+    scase = 1;
+  } else if (h->region_type == (GF_REGION_DOUBLE_TABLE | GF_REGION_LAZY)) {
+    ltd = (struct gf_w8_double_table_lazy_data *)h->private;
+    bzero(ltd->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+    bzero(ltd->smult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
+    scase = 2;
+  } else {
+    fprintf(stderr, "Internal error in gf_w8_table_init\n");
+    assert(0);
+    return 0;  /* assert() vanishes under NDEBUG: never go on with scase unset and all table pointers NULL */
+  }
+  /* Fill the selected tables from the shift multiply: prod = a*b, and the division tables record a = prod/b. */
+  for (a = 1; a < GF_FIELD_SIZE; a++) {
+    for (b = 1; b < GF_FIELD_SIZE; b++) {
+      prod = gf_w8_shift_multiply(gf,a,b);
+      switch (scase) {
+        case 0:   /* single table */
+          ftd->multtable[a][b] = prod;
+          ftd->divtable[prod][b] = a;
+          break;
+        case 1:   /* double table: each 16-bit entry holds the products for a pair of bytes */
+          dtd->div[prod][b] = a;
+          for (c = 0; c < GF_FIELD_SIZE; c++) {
+            dtd->mult[a][(c<<8)|b] |= prod;        /* b in the low byte of the index */
+            dtd->mult[a][(b<<8)|c] |= (prod<<8);   /* b in the high byte of the index */
+          }
+          break;
+        case 2:   /* lazy double table: only the single-byte table is filled here */
+          ltd->div[prod][b] = a;
+          ltd->smult[a][b] = prod;
+          break;
+        case 3:   /* default hybrid: full tables plus the two half tables */
+          dd->multtable[a][b] = prod;
+          dd->divtable[prod][b] = a;
+          if ((b & 0xf) == b) { dd->low[a][b] = prod; }       /* b fits in the low nibble */
+          if ((b & 0xf0) == b) { dd->high[a][b>>4] = prod; }  /* b has only high-nibble bits */
+          break;
+      }
+    }
+  }
+
+  SET_FUNCTION(gf,inverse,w32,NULL) /* Will set from divide */
+  switch (scase) {
+    case 0:
+      SET_FUNCTION(gf,divide,w32,gf_w8_table_divide)
+      SET_FUNCTION(gf,multiply,w32,gf_w8_table_multiply)
+      SET_FUNCTION(gf,multiply_region,w32,gf_w8_table_multiply_region)
+      break;
+    case 1:
+      SET_FUNCTION(gf,divide,w32,gf_w8_double_table_divide)
+      SET_FUNCTION(gf,multiply,w32,gf_w8_double_table_multiply)
+      SET_FUNCTION(gf,multiply_region,w32,gf_w8_double_table_multiply_region)
+      break;
+    case 2:
+      SET_FUNCTION(gf,divide,w32,gf_w8_double_table_lazy_divide)
+      SET_FUNCTION(gf,multiply,w32,gf_w8_double_table_lazy_multiply)
+      SET_FUNCTION(gf,multiply_region,w32,gf_w8_double_table_multiply_region)
+      break;
+    case 3:
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
+      if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
+        SET_FUNCTION(gf,divide,w32,gf_w8_default_divide)
+        SET_FUNCTION(gf,multiply,w32,gf_w8_default_multiply)
+#if defined(INTEL_SSSE3)
+        if (gf_cpu_supports_intel_ssse3) {
+          SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse)
+        }
+#elif defined(ARM_NEON)
+        if (gf_cpu_supports_arm_neon) {
+          gf_w8_neon_split_init(gf);
+        }
+#endif
+      }
+#endif
+      break;
+  }
+  return 1;
+}
+
+/* GF_REGION_ALTMAP region multiply (see gf_w8_composite_init): splits the aligned
+   region into two equal halves and combines five base-field region multiplies. */
+static void
+gf_w8_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_internal_t *scratch = (gf_internal_t *) gf->scratch;
+  gf_t *base = scratch->base_gf;
+  uint8_t lo = val & 0x0f, hi = (val & 0xf0) >> 4;   /* the two nibbles of val */
+  uint8_t *s_lo, *s_hi, *d_lo, *d_hi;
+  gf_region_data rd;
+  int half;
+
+  if (val == 0) {
+    if (!xor) bzero(dest, bytes);   /* XORing a zero product leaves dest as it is */
+    return;
+  }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+
+  half = ((uint8_t *) rd.d_top - (uint8_t *) rd.d_start) / 2;
+  s_lo = (uint8_t *) rd.s_start;  s_hi = s_lo + half;
+  d_lo = (uint8_t *) rd.d_start;  d_hi = d_lo + half;
+  base->multiply_region.w32(base, s_lo, d_lo, lo, half, xor);
+  base->multiply_region.w32(base, s_hi, d_lo, hi, half, 1);
+  base->multiply_region.w32(base, s_lo, d_hi, hi, half, xor);
+  base->multiply_region.w32(base, s_hi, d_hi, lo, half, 1);
+  base->multiply_region.w32(base, s_hi, d_hi, base->multiply.w32(base, scratch->prim_poly, hi), half, 1);
+  gf_do_final_region_alignment(&rd);
+}
+
+/* Composite multiply without a cached table: a and b are split into nibbles
+   (a1:a0, b1:b0) and every nibble product is a call to base_gf->multiply.
+   Low nibble = a0b0 ^ a1b1; high nibble = a1b0 ^ a0b1 ^ a1b1*h->prim_poly. */
+static gf_val_32_t
+gf_w8_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *scratch = (gf_internal_t *) gf->scratch;
+  gf_t *base = scratch->base_gf;
+  uint8_t a_lo = a & 0x0f, a_hi = (a & 0xf0) >> 4;
+  uint8_t b_lo = b & 0x0f, b_hi = (b & 0xf0) >> 4;
+  uint8_t hh = base->multiply.w32(base, a_hi, b_hi);   /* a1*b1, needed in both nibbles */
+  gf_val_32_t lo, hi;
+
+  lo = base->multiply.w32(base, a_lo, b_lo) ^ hh;
+  hi = base->multiply.w32(base, a_hi, b_lo) ^
+       base->multiply.w32(base, a_lo, b_hi) ^
+       base->multiply.w32(base, hh, scratch->prim_poly);
+  return lo | (hi << 4);
+}
+
+/* Composite multiply using the multiplication table stored in the private
+   data (cd->mult_table): the same nibble formula as the recursive variant,
+   with GF_W4_INLINE_MULTDIV lookups in place of base-field function calls. */
+static gf_val_32_t
+gf_w8_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *scratch = (gf_internal_t *) gf->scratch;
+  struct gf_w8_composite_data *cdata = (struct gf_w8_composite_data *) scratch->private;
+  uint8_t *table = cdata->mult_table;
+  uint8_t a_lo = a & 0x0f, a_hi = (a & 0xf0) >> 4;
+  uint8_t b_lo = b & 0x0f, b_hi = (b & 0xf0) >> 4;
+  uint8_t hh;
+  gf_val_32_t lo, hi;
+
+  hh = GF_W4_INLINE_MULTDIV(table, a_hi, b_hi);
+  lo = GF_W4_INLINE_MULTDIV(table, a_lo, b_lo) ^ hh;
+  hi = GF_W4_INLINE_MULTDIV(table, a_hi, b_lo) ^
+       GF_W4_INLINE_MULTDIV(table, a_lo, b_hi) ^
+       GF_W4_INLINE_MULTDIV(table, hh, scratch->prim_poly);
+
+  return lo | (hi << 4);
+}
+
+/*
+ * Composite field division trick (explained in 2007 tech report) 
+ *
+ * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1 
+ * 
+ * let c = b^-1
+ *
+ * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0)
+ * 
+ * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1 
+ *
+ * let d = b1c1 and d+1 = b0c0
+ *
+ * solve s*b1c1+b1c0+b0c1 = 0
+ *
+ * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1
+ *
+ * c0 = (d+1)b0^-1
+ * c1 = d*b1^-1
+ * 
+ * a / b = a * c
+ */
+
+/* Composite-field inverse, following the derivation in the comment above with
+   a = a1*x + a0 and s = h->prim_poly.  Returns c = (c1 << 4) | c0; every
+   base-field result is masked to 4 bits so nothing can leak into c1's nibble. */
+static gf_val_32_t
+gf_w8_composite_inverse(gf_t *gf, gf_val_32_t a)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  gf_t *base_gf = h->base_gf;
+  uint8_t a0 = a & 0x0f;
+  uint8_t a1 = (a & 0xf0) >> 4;
+  uint8_t c0, c1, d, ratio, tmp;
+  uint8_t a0inv, a1inv;
+
+  if (a0 == 0) {
+    /* a = a1*x: c1 = a1^-1 and c0 = s*a1^-1. */
+    a1inv = base_gf->inverse.w32(base_gf, a1) & 0xf;
+    c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly) & 0xf;
+    c1 = a1inv;
+  } else if (a1 == 0) {
+    /* a has no x term: invert a0 in the base field. */
+    c0 = base_gf->inverse.w32(base_gf, a0) & 0xf;
+    c1 = 0;
+  } else {
+    a1inv = base_gf->inverse.w32(base_gf, a1) & 0xf;
+    a0inv = base_gf->inverse.w32(base_gf, a0) & 0xf;
+
+    /* ratio = a1*a0^-1 appears twice in the formula for d; compute it once. */
+    ratio = base_gf->multiply.w32(base_gf, a1, a0inv) & 0xf;
+    tmp = (ratio ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly) & 0xf;
+    tmp = base_gf->inverse.w32(base_gf, tmp) & 0xf;
+    d = base_gf->multiply.w32(base_gf, ratio, tmp) & 0xf;
+
+    c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv) & 0xf;
+    c1 = base_gf->multiply.w32(base_gf, d, a1inv) & 0xf;
+  }
+  return (uint8_t) (c0 | (c1 << 4));
+}
+
+/* Byte-at-a-time region multiply for the composite field (the routine that
+   gf_w8_composite_init installs when GF_REGION_ALTMAP is not requested).
+     src, dest - source and destination regions, `bytes` bytes long
+     val       - multiplier, viewed as the nibble pair b1:b0
+     xor       - nonzero: dest ^= src * val;  zero: dest = src * val
+   Every source byte is split into nibbles a1:a0 and multiplied with the same
+   formula as the scalar composite multiply.  If cd->mult_table is non-NULL the
+   nibble products are looked up with GF_W4_INLINE_MULTDIV; otherwise each one
+   is a call to base_gf->multiply.  A zero multiplier is handled up front:
+   dest is cleared, or left untouched when xor is set. */
+static void
+gf_w8_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_internal_t *scratch = (gf_internal_t *) gf->scratch;
+  struct gf_w8_composite_data *cdata = (struct gf_w8_composite_data *) scratch->private;
+  gf_t *base = scratch->base_gf;
+  uint8_t v_lo = val & 0x0f;
+  uint8_t v_hi = (val & 0xf0) >> 4;
+  uint8_t lo, hi, hh;
+  uint8_t *in, *out, *end;
+  uint8_t *table;
+  gf_region_data rd;
+
+  if (val == 0) {
+    if (!xor) bzero(dest, bytes);
+    return;
+  }
+
+  /* Region set up with an alignment argument of 1: the loops below work on single bytes. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
+  gf_do_initial_region_alignment(&rd);
+
+  /* Walk source and destination together until the destination reaches rd.d_top. */
+  in = (uint8_t *) rd.s_start;
+  out = (uint8_t *) rd.d_start;
+  end = (uint8_t *) rd.d_top;
+  table = cdata->mult_table;
+
+  if (table == NULL && xor) {
+    /* No table: base-field calls, product XORed into dest. */
+    for (; out < end; in++, out++) {
+      lo = *in & 0x0f;
+      hi = (*in & 0xf0) >> 4;
+      hh = base->multiply.w32(base, hi, v_hi);
+
+      *out ^= ((base->multiply.w32(base, lo, v_lo) ^ hh) |
+              ((base->multiply.w32(base, hi, v_lo) ^
+                base->multiply.w32(base, lo, v_hi) ^
+                base->multiply.w32(base, hh, scratch->prim_poly)) << 4));
+    }
+  } else if (table == NULL) {
+    /* No table: base-field calls, product stored to dest. */
+    for (; out < end; in++, out++) {
+      lo = *in & 0x0f;
+      hi = (*in & 0xf0) >> 4;
+      hh = base->multiply.w32(base, hi, v_hi);
+
+      *out = ((base->multiply.w32(base, lo, v_lo) ^ hh) |
+             ((base->multiply.w32(base, hi, v_lo) ^
+               base->multiply.w32(base, lo, v_hi) ^
+               base->multiply.w32(base, hh, scratch->prim_poly)) << 4));
+    }
+  } else if (xor) {
+    /* Table lookups, product XORed into dest. */
+    for (; out < end; in++, out++) {
+      lo = *in & 0x0f;
+      hi = (*in & 0xf0) >> 4;
+      hh = GF_W4_INLINE_MULTDIV(table, hi, v_hi);
+
+      *out ^= ((GF_W4_INLINE_MULTDIV(table, lo, v_lo) ^ hh) |
+              ((GF_W4_INLINE_MULTDIV(table, hi, v_lo) ^
+                GF_W4_INLINE_MULTDIV(table, lo, v_hi) ^
+                GF_W4_INLINE_MULTDIV(table, hh, scratch->prim_poly)) << 4));
+    }
+  } else {
+    /* Table lookups, product stored to dest. */
+    for (; out < end; in++, out++) {
+      lo = *in & 0x0f;
+      hi = (*in & 0xf0) >> 4;
+      hh = GF_W4_INLINE_MULTDIV(table, hi, v_hi);
+
+      *out = ((GF_W4_INLINE_MULTDIV(table, lo, v_lo) ^ hh) |
+             ((GF_W4_INLINE_MULTDIV(table, hi, v_lo) ^
+               GF_W4_INLINE_MULTDIV(table, lo, v_hi) ^
+               GF_W4_INLINE_MULTDIV(table, hh, scratch->prim_poly)) << 4));
+    }
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+
+/* Set up the composite implementation of GF(2^8) on top of h->base_gf.
+   Returns 0 when no base field was supplied, 1 otherwise. */
+static int gf_w8_composite_init(gf_t *gf)
+{
+  gf_internal_t *scratch = (gf_internal_t *) gf->scratch;
+  struct gf_w8_composite_data *cdata;
+
+  if (scratch->base_gf == NULL) return 0;
+
+  /* The table pointer may be NULL; the multiply variant below is chosen accordingly. */
+  cdata = (struct gf_w8_composite_data *) scratch->private;
+  cdata->mult_table = gf_w4_get_mult_table(scratch->base_gf);
+
+  SET_FUNCTION(gf,divide,w32,NULL)
+  SET_FUNCTION(gf,inverse,w32,gf_w8_composite_inverse)
+  if (cdata->mult_table != NULL) {
+    SET_FUNCTION(gf,multiply,w32,gf_w8_composite_multiply_inline)
+  } else {
+    SET_FUNCTION(gf,multiply,w32,gf_w8_composite_multiply_recursive)
+  }
+  if (!(scratch->region_type & GF_REGION_ALTMAP)) {
+    SET_FUNCTION(gf,multiply_region,w32,gf_w8_composite_multiply_region)
+  } else {
+    SET_FUNCTION(gf,multiply_region,w32,gf_w8_composite_multiply_region_alt)
+  }
+  return 1;
+}
+
+/* Scalar BYTWO_p multiply.
+     gf   - field handle; the polynomial is read from its scratch area
+     a, b - operands
+   The bits of a are visited from 0x80 down to 0x01.  At each step the running
+   product is shifted left by one, XORed with the polynomial if its 0x80 bit
+   was set before the shift, and then XORed with b if the current bit of a is
+   set.  Returns the final running product. */
+static inline gf_val_32_t
+gf_w8_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *scratch = (gf_internal_t *) gf->scratch;
+  uint32_t poly = scratch->prim_poly;
+  uint32_t acc = 0;
+  uint32_t bit;
+
+  for (bit = 0x80; bit != 0; bit >>= 1) {
+    if (acc & 0x80) {
+      acc = (acc << 1) ^ poly;
+    } else {
+      acc = acc << 1;
+    }
+    if (a & bit) {
+      acc ^= b;
+    }
+  }
+  return acc;
+}
+
+/* Scalar BYTWO_b multiply.
+     gf   - field handle; the polynomial is read from its scratch area
+     a, b - operands
+   a is consumed from its lowest bit upward: whenever that bit is set the
+   current b is XORed into the product, then a is shifted right.  As soon as
+   a becomes 0 the product is returned; otherwise b is shifted left by one,
+   XORed with the polynomial if its 0x80 bit was set before the shift. */
+static inline gf_val_32_t
+gf_w8_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *scratch = (gf_internal_t *) gf->scratch;
+  uint32_t poly = scratch->prim_poly;
+  uint32_t acc = 0;
+
+  for (;;) {
+    if (a & 1) {
+      acc ^= b;
+    }
+    a >>= 1;
+    if (a == 0) break;
+    b = (b & 0x80) ? ((b << 1) ^ poly) : (b << 1);
+  }
+
+  return acc;
+}
+
+/* BYTWO_p region multiply without SIMD.
+     src, dest - source and destination regions, `bytes` bytes long
+     val       - multiplier
+     xor       - nonzero: dest ^= src * val;  zero: dest = src * val
+   val == 0 and val == 1 are delegated to gf_multby_zero()/gf_multby_one().
+   Otherwise the region between rd.s_start and rd.s_top is processed one
+   64-bit word at a time.  For each word the product starts at 0 and the bits
+   of val are visited from 0x80 down: AB2() is applied to prod using the
+   polynomial and masks stored in the gf_w8_bytwo_data, then the source word is
+   XORed in if the current bit of val is set.
+   NOTE(review): AB2 is defined elsewhere; it presumably doubles every byte
+   of prod in the field -- confirm against its definition. */
+static void
+gf_w8_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t *in, *out, *in_end;
+  uint64_t word, bit, prod, t1, t2;
+  struct gf_w8_bytwo_data *btd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  /* Alignment argument of 8: the loop below moves 64-bit words. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
+  gf_do_initial_region_alignment(&rd);
+
+  in = (uint64_t *) rd.s_start;
+  in_end = (uint64_t *) rd.s_top;
+  out = (uint64_t *) rd.d_start;
+
+  for (; in < in_end; in++, out++) {
+    word = *in;
+    prod = 0;
+    for (bit = 0x80; bit != 0; bit >>= 1) {
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
+      if (val & bit) prod ^= word;
+    }
+    /* Accumulate into or overwrite the destination word, depending on xor. */
+    if (xor) {
+      *out ^= prod;
+    } else {
+      *out = prod;
+    }
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+
+#define BYTWO_P_ONESTEP { /* one bit-step of the SSE BYTWO_p loop; uses the caller's locals pp, m1, m2, prod, t1, t2, v, one, ta by name */ \
+  SSE_AB2(pp, m1 ,m2, prod, t1, t2); \
+  t1 = _mm_and_si128(v, one); /* isolate bit 0 of every byte of v */ \
+  t1 = _mm_sub_epi8(t1, one); /* 1 -> 0x00, 0 -> 0xff: all-ones where that bit was clear */ \
+  t1 = _mm_and_si128(t1, ta); /* keep the bytes of ta only under that mask */ \
+  prod = _mm_xor_si128(prod, t1); \
+  v = _mm_srli_epi64(v, 1); } /* move on to the next bit of v */
+/* BYTWO_p region multiply using SSE2: 16 bytes per iteration, eight unrolled BYTWO_P_ONESTEP steps (one per bit of val). */
+#ifdef INTEL_SSE2
+static
+  void 
+gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  int i;
+  uint8_t *s8, *d8;
+  uint8_t vrev;
+  __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v;  /* these names are referenced directly by BYTWO_P_ONESTEP */
+  struct gf_w8_bytwo_data *btd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+  /* vrev = bits of val reversed and inverted: the macro tests bit 0 and shifts right, so it sees val's bit 7 first, with 0 meaning "set". */
+  vrev = 0;
+  for (i = 0; i < 8; i++) {
+    vrev <<= 1;
+    if (!(val & (1 << i))) vrev |= 1;
+  }
+
+  s8 = (uint8_t *) rd.s_start;
+  d8 = (uint8_t *) rd.d_start;
+  /* Broadcast the low byte of the polynomial, the two masks and the constant 1 to all 16 byte lanes. */
+  pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  m2 = _mm_set1_epi8((btd->mask2)&0xff);
+  one = _mm_set1_epi8(1);
+
+  while (d8 < (uint8_t *) rd.d_top) {
+    prod = _mm_setzero_si128();
+    v = _mm_set1_epi8(vrev);
+    ta = _mm_load_si128((__m128i *) s8);
+    tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);  /* old dest contents when xor-ing, zero otherwise */
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    BYTWO_P_ONESTEP;
+    _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));  /* dest = product ^ tp */
+    d8 += 16;
+    s8 += 16;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+
+#ifdef INTEL_SSE2
+/* Helper for gf_w8_bytwo_b_sse_multiply_region() when val == 2 and xor is 0.
+   Walks rd->s_start / rd->d_start in 16-byte steps until rd->d_top: each
+   block is loaded, passed through one SSE_AB2 step with the broadcast
+   polynomial and masks from btd, and stored to the destination.
+   Aligned loads and stores are used for both regions. */
+static void
+gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *btd)
+{
+  uint8_t *in = (uint8_t *) rd->s_start;
+  uint8_t *out = (uint8_t *) rd->d_start;
+  uint8_t *end = (uint8_t *) rd->d_top;
+  __m128i pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  __m128i m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  __m128i m2 = _mm_set1_epi8((btd->mask2)&0xff);
+  __m128i t1, t2, va;
+
+  for (; out < end; in += 16, out += 16) {
+    va = _mm_load_si128((__m128i *) in);
+    SSE_AB2(pp, m1, m2, va, t1, t2);
+    _mm_store_si128((__m128i *) out, va);
+  }
+}
+#endif
+
+#ifdef INTEL_SSE2
+/* Helper for gf_w8_bytwo_b_sse_multiply_region() when val == 2 and xor is set.
+   Walks rd->s_start / rd->d_start in 16-byte steps until rd->d_top: each
+   source block goes through one SSE_AB2 step with the broadcast polynomial
+   and masks from btd and is then XORed into the destination block.
+   Aligned loads and stores are used for both regions. */
+static void
+gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd)
+{
+  uint8_t *in = (uint8_t *) rd->s_start;
+  uint8_t *out = (uint8_t *) rd->d_start;
+  uint8_t *end = (uint8_t *) rd->d_top;
+  __m128i pp = _mm_set1_epi8(btd->prim_poly&0xff);
+  __m128i m1 = _mm_set1_epi8((btd->mask1)&0xff);
+  __m128i m2 = _mm_set1_epi8((btd->mask2)&0xff);
+  __m128i t1, t2, va, vb;
+
+  for (; out < end; in += 16, out += 16) {
+    va = _mm_load_si128((__m128i *) in);
+    SSE_AB2(pp, m1, m2, va, t1, t2);
+    vb = _mm_load_si128((__m128i *) out);
+    vb = _mm_xor_si128(vb, va);
+    _mm_store_si128((__m128i *) out, vb);
+  }
+}
+#endif
+
+
+#ifdef INTEL_SSE2
+/* BYTWO_b region multiply using SSE2, 16 bytes per step.
+     src, dest - source and destination regions, `bytes` bytes long
+     val       - multiplier
+     xor       - nonzero: dest ^= src * val;  zero: dest = src * val
+   val == 0 and val == 1 go to gf_multby_zero()/gf_multby_one(); val == 2 goes
+   to the dedicated region_2 helpers.  For any other val the bits of val are
+   consumed from the lowest upward: the current va is XORed into vb when the
+   bit is set, and SSE_AB2 is applied to va between bits. */
+static void
+gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint8_t *in, *out, *end;
+  __m128i pp, m1, m2, t1, t2, va, vb;
+  struct gf_w8_bytwo_data *btd;
+  gf_region_data rd;
+  int bits;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+  btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  if (val == 2) {
+    if (xor) {
+      gf_w8_bytwo_b_sse_region_2_xor(&rd, btd);
+    } else {
+      gf_w8_bytwo_b_sse_region_2_noxor(&rd, btd);
+    }
+  } else {
+    in = (uint8_t *) rd.s_start;
+    out = (uint8_t *) rd.d_start;
+    end = (uint8_t *) rd.d_top;
+    pp = _mm_set1_epi8(btd->prim_poly&0xff);
+    m1 = _mm_set1_epi8((btd->mask1)&0xff);
+    m2 = _mm_set1_epi8((btd->mask2)&0xff);
+    for (; out < end; in += 16, out += 16) {
+      va = _mm_load_si128((__m128i *) in);
+      vb = xor ? _mm_load_si128((__m128i *) out) : _mm_setzero_si128();
+      bits = val;
+      for (;;) {
+        if (bits & 1) vb = _mm_xor_si128(vb, va);
+        bits >>= 1;
+        if (bits == 0) break;
+        SSE_AB2(pp, m1, m2, va, t1, t2);
+      }
+      _mm_store_si128((__m128i *) out, vb);
+    }
+  }
+  gf_do_final_region_alignment(&rd);
+}
+#endif
+/* BYTWO_b region multiply without SIMD: 64-bit words, hand-unrolled loops for val 2,3,4,5,6,8 and a generic bit loop otherwise. */
+static
+  void 
+gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  uint64_t *s64, *d64, t1, t2, ta, tb, prod;
+  struct gf_w8_bytwo_data *btd;
+  gf_region_data rd;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+  /* NOTE(review): alignment argument is 16 although the loops move 64-bit words (the BYTWO_p counterpart passes 8) -- confirm intent. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
+  s64 = (uint64_t *) rd.s_start;
+  d64 = (uint64_t *) rd.d_start;
+
+  switch (val) {  /* each case has an xor (accumulate) loop and a plain store loop */
+    case 2:  /* binary 10: one AB2 step */
+      if (xor) {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 ^= ta;
+          d64++;
+          s64++;
+        }
+      } else {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 = ta;
+          d64++;
+          s64++;
+        }
+      }
+      break; 
+    case 3:  /* binary 11: one AB2 step, XORed with the original word kept in prod */
+      if (xor) {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          prod = ta;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 ^= (ta ^ prod);
+          d64++;
+          s64++;
+        }
+      } else {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          prod = ta;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 = (ta ^ prod);
+          d64++;
+          s64++;
+        }
+      }
+      break; 
+    case 4:  /* binary 100: two AB2 steps */
+      if (xor) {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 ^= ta;
+          d64++;
+          s64++;
+        }
+      } else {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 = ta;
+          d64++;
+          s64++;
+        }
+      }
+      break; 
+    case 5:  /* binary 101: two AB2 steps, XORed with the original word kept in prod */
+      if (xor) {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          prod = ta;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 ^= (ta ^ prod);
+          d64++;
+          s64++;
+        }
+      } else {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          prod = ta;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 = ta ^ prod;
+          d64++;
+          s64++;
+        }
+      }
+      break;
+    case 6:  /* binary 110: prod keeps the word after one AB2 step, XORed with the word after two */
+      if (xor) {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          prod = ta;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 ^= (ta ^ prod);
+          d64++;
+          s64++;
+        }
+      } else {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          prod = ta;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 = ta ^ prod;
+          d64++;
+          s64++;
+        }
+      }
+      break;
+      /*
+         case 7:
+         if (xor) {
+         while (d64 < (uint64_t *) rd.d_top) {
+         ta = *s64;
+         prod = ta;
+         AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+         prod ^= ta;
+         AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       *d64 ^= (ta ^ prod);
+       d64++;
+       s64++;
+       }
+       } else {
+       while (d64 < (uint64_t *) rd.d_top) {
+       ta = *s64;
+       prod = ta;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       prod ^= ta;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       *d64 = ta ^ prod;
+       d64++;
+       s64++;
+       }
+       }
+       break; 
+       */
+    case 8:  /* binary 1000: three AB2 steps */
+      if (xor) {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 ^= ta;
+          d64++;
+          s64++;
+        }
+      } else {
+        while (d64 < (uint64_t *) rd.d_top) {
+          ta = *s64;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+          *d64 = ta;
+          d64++;
+          s64++;
+        }
+      }
+      break; 
+      /*
+         case 9:
+         if (xor) {
+         while (d64 < (uint64_t *) rd.d_top) {
+         ta = *s64;
+         prod = ta;
+         AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+         AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+         AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       *d64 ^= (ta ^ prod);
+       d64++;
+       s64++;
+       }
+       } else {
+       while (d64 < (uint64_t *) rd.d_top) {
+       ta = *s64;
+       prod = ta;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       *d64 = (ta ^ prod);
+       d64++;
+       s64++;
+       }
+       }
+       break; 
+       case 10:
+       if (xor) {
+       while (d64 < (uint64_t *) rd.d_top) {
+       ta = *s64;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       prod = ta;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       *d64 ^= (ta ^ prod);
+       d64++;
+       s64++;
+       }
+       } else {
+       while (d64 < (uint64_t *) rd.d_top) {
+       ta = *s64;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       prod = ta;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       *d64 = (ta ^ prod);
+       d64++;
+       s64++;
+       }
+       }
+       break; 
+       case 11:
+       if (xor) {
+       while (d64 < (uint64_t *) rd.d_top) {
+       ta = *s64;
+       prod = ta;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       prod ^= ta;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       *d64 ^= (ta ^ prod);
+       d64++;
+       s64++;
+       }
+       } else {
+       while (d64 < (uint64_t *) rd.d_top) {
+       ta = *s64;
+       prod = ta;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+       prod ^= ta;
+       AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      *d64 = (ta ^ prod);
+      d64++;
+      s64++;
+      }
+  }
+  break; 
+    case 12:
+  if (xor) {
+    while (d64 < (uint64_t *) rd.d_top) {
+      ta = *s64;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod = ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      *d64 ^= (ta ^ prod);
+      d64++;
+      s64++;
+    }
+  } else {
+    while (d64 < (uint64_t *) rd.d_top) {
+      ta = *s64;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod = ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      *d64 = (ta ^ prod);
+      d64++;
+      s64++;
+    }
+  }
+  break; 
+    case 13:
+  if (xor) {
+    while (d64 < (uint64_t *) rd.d_top) {
+      ta = *s64;
+      prod = ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod ^= ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      *d64 ^= (ta ^ prod);
+      d64++;
+      s64++;
+    }
+  } else {
+    while (d64 < (uint64_t *) rd.d_top) {
+      ta = *s64;
+      prod = ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod ^= ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      *d64 = (ta ^ prod);
+      d64++;
+      s64++;
+    }
+  }
+  break; 
+    case 14:
+  if (xor) {
+    while (d64 < (uint64_t *) rd.d_top) {
+      ta = *s64;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod = ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod ^= ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      *d64 ^= (ta ^ prod);
+      d64++;
+      s64++;
+    }
+  } else {
+    while (d64 < (uint64_t *) rd.d_top) {
+      ta = *s64;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod = ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod ^= ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      *d64 = (ta ^ prod);
+      d64++;
+      s64++;
+    }
+  }
+  break; 
+    case 15:
+  if (xor) {
+    while (d64 < (uint64_t *) rd.d_top) {
+      ta = *s64;
+      prod = ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod ^= ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod ^= ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      *d64 ^= (ta ^ prod);
+      d64++;
+      s64++;
+    }
+  } else {
+    while (d64 < (uint64_t *) rd.d_top) {
+      ta = *s64;
+      prod = ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod ^= ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      prod ^= ta;
+      AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+      *d64 = (ta ^ prod);
+      d64++;
+      s64++;
+    }
+  }
+  break; 
+  */
+    default:  /* generic loop: consume val from its lowest bit, applying AB2 to ta between bits */
+    if (xor) {
+      while (d64 < (uint64_t *) rd.d_top) {
+        prod = *d64 ;  /* start from the existing dest word so the product is XORed into it */
+        ta = *s64;
+        tb = val;
+        while (1) {
+          if (tb & 1) prod ^= ta;
+          tb >>= 1;
+          if (tb == 0) break;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        }
+        *d64 = prod;
+        d64++;
+        s64++;
+      }
+    } else {
+      while (d64 < (uint64_t *) rd.d_top) {
+        prod = 0 ;
+        ta = *s64;
+        tb = val;
+        while (1) {
+          if (tb & 1) prod ^= ta;
+          tb >>= 1;
+          if (tb == 0) break;
+          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
+        }
+        *d64 = prod;
+        d64++;
+        s64++;
+      }
+    }
+    break;
+  }
+  gf_do_final_region_alignment(&rd);
+}
+/* Set up BYTWO_p / BYTWO_b: build the 64-bit polynomial and mask words and install the multiply functions; returns 1 on success, 0 on failure. */
+  static
+int gf_w8_bytwo_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  uint64_t ip, m1, m2;
+  struct gf_w8_bytwo_data *btd;
+
+  h = (gf_internal_t *) gf->scratch;
+  btd = (struct gf_w8_bytwo_data *) (h->private);
+  ip = h->prim_poly & 0xff;
+  m1 = 0xfe;
+  m2 = 0x80;
+  btd->prim_poly = 0;
+  btd->mask1 = 0;
+  btd->mask2 = 0;
+  /* OR shifted copies of ip, m1, m2 into the 64-bit words until ip shifts out.  NOTE(review): if ip is 0 the loop never runs and both masks stay 0. */
+  while (ip != 0) {
+    btd->prim_poly |= ip;
+    btd->mask1 |= m1;
+    btd->mask2 |= m2;
+    ip <<= GF_FIELD_WIDTH;
+    m1 <<= GF_FIELD_WIDTH;
+    m2 <<= GF_FIELD_WIDTH;
+  }
+  /* Install scalar and region multiplies.  In both branches the "else {" opened under #ifdef INTEL_SSE2 is closed by the "}" in the later #ifdef. */
+  if (h->mult_type == GF_MULT_BYTWO_p) {
+    SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_p_multiply)
+#ifdef INTEL_SSE2
+    if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+      SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_sse_multiply_region)
+    } else {
+#endif
+      SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_nosse_multiply_region)
+      if(h->region_type & GF_REGION_SIMD)
+        return 0;  /* GF_REGION_SIMD was requested but the non-SSE routine had to be selected */
+#ifdef INTEL_SSE2
+    }
+#endif
+  } else {
+    SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_b_multiply)
+#ifdef INTEL_SSE2
+    if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+      SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_sse_multiply_region)
+    } else {
+#endif
+    SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_nosse_multiply_region)
+    if(h->region_type & GF_REGION_SIMD)
+      return 0;  /* same failure as above, for BYTWO_b */
+#ifdef INTEL_SSE2
+    }
+#endif
+  }
+  return 1;
+}
+
+
+/* ------------------------------------------------------------
+   General procedures.
+   You don't need to error check here or in init, because it's done
+   for you in gf_error_check().
+ */
+
+/* Scratch-space size in bytes for a w=8 configuration: sizeof(gf_internal_t)
+   plus the private data of the chosen multiplication method, plus 64 extra
+   bytes for every method with private data except BYTWO_p/BYTWO_b.
+   Returns 0 for unsupported combinations.  divide_type is not consulted. */
+int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
+{
+  switch (mult_type) {
+    case GF_MULT_CARRY_FREE:
+    case GF_MULT_SHIFT:
+      /* These two methods keep no private data. */
+      return sizeof(gf_internal_t);
+
+    case GF_MULT_BYTWO_p:
+    case GF_MULT_BYTWO_b:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w8_bytwo_data);
+
+    case GF_MULT_LOG_TABLE:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w8_logtable_data) + 64;
+
+    case GF_MULT_LOG_ZERO:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_small_table_data) + 64;
+
+    case GF_MULT_LOG_ZERO_EXT:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_table_data) + 64;
+
+    case GF_MULT_COMPOSITE:
+      return sizeof(gf_internal_t) + sizeof(struct gf_w8_composite_data) + 64;
+
+    case GF_MULT_SPLIT_TABLE:
+      /* Only the arguments 4 and 8, in either order, are accepted. */
+      if ((arg1 == 4 && arg2 == 8) || (arg1 == 8 && arg2 == 4)) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_w8_half_table_data) + 64;
+      }
+      break;   /* falls out to the final return 0 */
+
+    case GF_MULT_DEFAULT:
+      /* Hybrid default data when SSSE3 or NEON is reported, single table otherwise. */
+      if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64;
+      }
+      return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
+
+    case GF_MULT_TABLE:
+      if (region_type == GF_REGION_CAUCHY || region_type == GF_REGION_DEFAULT) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
+      }
+      if (region_type == GF_REGION_DOUBLE_TABLE) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_w8_double_table_data) + 64;
+      }
+      if (region_type == (GF_REGION_DOUBLE_TABLE | GF_REGION_LAZY)) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_w8_double_table_lazy_data) + 64;
+      }
+      return 0;
+
+    default:
+      return 0;
+  }
+  return 0;
+}
+
+/* Finish setting up a w=8 field whose gf_internal_t already sits in
+   gf->scratch: settle the polynomial, run the initializer of the chosen
+   multiplication type, then fill in any of divide, inverse and
+   multiply_region still unset.  Returns 1 on success, 0 on failure. */
+int gf_w8_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  int ok;
+
+  /* Allen: set default primitive polynomial / irreducible polynomial if needed */
+  if (h->mult_type == GF_MULT_COMPOSITE) {
+    if (h->prim_poly == 0) {
+      h->prim_poly = gf_composite_get_default_poly(h->base_gf);
+      if (h->prim_poly == 0) return 0;   /* JSP: This shouldn't happen, but just in case. */
+    }
+  } else {
+    if (h->prim_poly == 0) h->prim_poly = 0x11d;
+    h->prim_poly |= 0x100;               /* always carry the x^8 term */
+  }
+
+  SET_FUNCTION(gf,multiply,w32,NULL)
+  SET_FUNCTION(gf,divide,w32,NULL)
+  SET_FUNCTION(gf,inverse,w32,NULL)
+  SET_FUNCTION(gf,multiply_region,w32,NULL)
+  SET_FUNCTION(gf,extract_word,w32,gf_w8_extract_word)
+
+  /* Method-specific setup; every initializer reports failure as 0. */
+  switch(h->mult_type) {
+    case GF_MULT_DEFAULT:
+    case GF_MULT_TABLE:        ok = gf_w8_table_init(gf);     break;
+    case GF_MULT_BYTWO_p:
+    case GF_MULT_BYTWO_b:      ok = gf_w8_bytwo_init(gf);     break;
+    case GF_MULT_LOG_ZERO:
+    case GF_MULT_LOG_ZERO_EXT:
+    case GF_MULT_LOG_TABLE:    ok = gf_w8_log_init(gf);       break;
+    case GF_MULT_CARRY_FREE:   ok = gf_w8_cfm_init(gf);       break;
+    case GF_MULT_SHIFT:        ok = gf_w8_shift_init(gf);     break;
+    case GF_MULT_SPLIT_TABLE:  ok = gf_w8_split_init(gf);     break;
+    case GF_MULT_COMPOSITE:    ok = gf_w8_composite_init(gf); break;
+    default:                   ok = 0;                        break;
+  }
+  if (ok == 0) return 0;
+
+  /* An explicitly requested division method overrides the initializer. */
+  if (h->divide_type == GF_DIVIDE_EUCLID) {
+    SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse)
+    SET_FUNCTION(gf,inverse,w32,gf_w8_euclid)
+  } else if (h->divide_type == GF_DIVIDE_MATRIX) {
+    SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse)
+    SET_FUNCTION(gf,inverse,w32,gf_w8_matrix)
+  }
+
+  /* Derive whichever of divide / inverse is still unset from the other. */
+  if (gf->divide.w32 == NULL) {
+    SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse)
+    if (gf->inverse.w32 == NULL) { SET_FUNCTION(gf,inverse,w32,gf_w8_euclid) }
+  }
+  if (gf->inverse.w32 == NULL) { SET_FUNCTION(gf,inverse,w32,gf_w8_inverse_from_divide) }
+
+  if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) {
+    SET_FUNCTION(gf,extract_word,w32,gf_w8_composite_extract_word)
+  }
+  if (h->region_type == GF_REGION_CAUCHY) {
+    SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
+    SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
+  }
+  if (gf->multiply_region.w32 == NULL) {
+    SET_FUNCTION(gf,multiply_region,w32,gf_w8_multiply_region_from_single)
+  }
+  return 1;
+}
+
+
+/* Inline setup functions */
+
+/* Expose the raw multiplication table when the field multiplies through
+   gf_w8_default_multiply or gf_w8_table_multiply; NULL for other methods. */
+uint8_t *gf_w8_get_mult_table(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  if (gf->multiply.w32 == gf_w8_default_multiply) {
+    return (uint8_t *) ((struct gf_w8_default_data *) h->private)->multtable;
+  }
+  if (gf->multiply.w32 == gf_w8_table_multiply) {
+    return (uint8_t *) ((struct gf_w8_single_table_data *) h->private)->multtable;
+  }
+
+  /* No table is handed out for any other multiply routine. */
+  return NULL;
+}
+
+/* Expose the raw division table when the field multiplies through
+   gf_w8_default_multiply or gf_w8_table_multiply; NULL for other methods. */
+uint8_t *gf_w8_get_div_table(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  if (gf->multiply.w32 == gf_w8_default_multiply) {
+    return (uint8_t *) ((struct gf_w8_default_data *) h->private)->divtable;
+  }
+  if (gf->multiply.w32 == gf_w8_table_multiply) {
+    return (uint8_t *) ((struct gf_w8_single_table_data *) h->private)->divtable;
+  }
+  return NULL;
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c b/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c
new file mode 100644
index 000000000..1e3d2e0ce
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c
@@ -0,0 +1,1019 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_wgen.c
+ *
+ * Routines for Galois fields for general w < 32.  For specific w, 
+   like 4, 8, 16, 32, 64 and 128, see the other files.
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+struct gf_wgen_table_w8_data {   /* private data of the w <= 8 table method */
+  uint8_t *mult;   /* product table, indexed (a << w) + b; set to &base by init */
+  uint8_t *div;    /* quotient table, same indexing; placed 2^w * 2^w entries after mult */
+  uint8_t base;    /* first byte of the table storage that continues past the struct */
+};
+
+struct gf_wgen_table_w16_data {  /* private data of the table method for 8 < w <= 14 */
+  uint16_t *mult;  /* product table, indexed (a << w) + b; set to &base by init */
+  uint16_t *div;   /* quotient table, same indexing; placed 2^w * 2^w entries after mult */
+  uint16_t base;   /* first entry of the table storage that continues past the struct */
+};
+
+struct gf_wgen_log_w8_data {     /* private data of the w <= 8 log-table method */
+  uint8_t *log;    /* log[a] for each element a; set to &base by init */
+  uint8_t *anti;   /* antilog table, indexed by log a + log b; starts 2^w entries after log */
+  uint8_t *danti;  /* anti + 2^w - 1, so it can be indexed by log a - log b (may be negative) */
+  uint8_t base;    /* first byte of the table storage that continues past the struct */
+};
+
+struct gf_wgen_log_w16_data {    /* private data of the log-table method for 8 < w <= 16 */
+  uint16_t *log;   /* log[a] for each element a; set to &base by init */
+  uint16_t *anti;  /* antilog table, indexed by log a + log b; starts 2^w entries after log */
+  uint16_t *danti; /* anti + 2^w - 1, so it can be indexed by log a - log b (may be negative) */
+  uint16_t base;   /* first entry of the table storage that continues past the struct */
+};
+
+struct gf_wgen_log_w32_data {    /* private data of the log-table method for w > 16 */
+  uint32_t *log;   /* log[a] for each element a; set to &base by init */
+  uint32_t *anti;  /* antilog table, indexed by log a + log b; starts 2^w entries after log */
+  uint32_t *danti; /* anti + 2^w - 1, so it can be indexed by log a - log b (may be negative) */
+  uint32_t base;   /* first entry of the table storage that continues past the struct */
+};
+
+struct gf_wgen_group_data {      /* private data of the GROUP multiplication method */
+    uint32_t *reduce;  /* reduction table; placed 2^g_s entries after shift */
+    uint32_t *shift;   /* per-multiplicand shift table; set to &memory by init */
+    uint32_t mask;     /* mask of the low w bits */
+    uint64_t rmask;    /* (2^g_r - 1) << w: the g_r bits just above a w-bit value */
+    int tshift;        /* offset at which the reduction loop of the group multiply starts */
+    uint32_t memory;   /* first word of the table storage that continues past the struct */
+};
+
+/* Inverse expressed through the field's divide routine: 1 / a. */
+static inline gf_val_32_t gf_wgen_inverse_from_divide (gf_t *gf, gf_val_32_t a)
+{
+  gf_val_32_t one = 1;
+  return gf->divide.w32(gf, one, a);
+}
+
+/* Divide expressed through the field's inverse routine: a times 1 / b. */
+static inline gf_val_32_t gf_wgen_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_val_32_t b_inv = gf->inverse.w32(gf, b);
+
+  return gf->multiply.w32(gf, a, b_inv);
+}
+
+/* Multiplicative inverse of b by the extended Euclidean algorithm over
+   GF(2)[x]: e_* are remainders, d_* their degrees, y_* the cofactors.
+   Returns -1 (converted to gf_val_32_t) when b == 0.  Bit masks are built
+   from an unsigned 1 so probing bit 31 never left-shifts a signed int. */
+static inline gf_val_32_t gf_wgen_euclid (gf_t *gf, gf_val_32_t b)
+{
+  gf_val_32_t e_i, e_im1, e_ip1;
+  gf_val_32_t d_i, d_im1, d_ip1;
+  gf_val_32_t y_i, y_im1, y_ip1;
+  gf_val_32_t c_i;
+
+  if (b == 0) return -1;
+  e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
+  e_i = b;
+  d_im1 = ((gf_internal_t *) (gf->scratch))->w;
+  for (d_i = d_im1; (((uint32_t)1 << d_i) & e_i) == 0; d_i--) ;   /* d_i = degree of b */
+  y_i = 1;
+  y_im1 = 0;
+
+  while (e_i != 1) {
+    /* Divide e_im1 by e_i: quotient collects in c_i, remainder in e_ip1. */
+    e_ip1 = e_im1;
+    d_ip1 = d_im1;
+    c_i = 0;
+
+    while (d_ip1 >= d_i) {
+      c_i ^= ((uint32_t)1 << (d_ip1 - d_i));
+      e_ip1 ^= (e_i << (d_ip1 - d_i));
+      if (e_ip1 == 0) return 0;
+      while ((e_ip1 & ((uint32_t)1 << d_ip1)) == 0) d_ip1--;
+    }
+
+    y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);
+    y_im1 = y_i;
+    y_i = y_ip1;
+
+    e_im1 = e_i;
+    d_im1 = d_i;
+    e_i = e_ip1;
+    d_i = d_ip1;
+  }
+  return y_i;
+}
+
+/* Reassemble word number `index` from a region laid out as w consecutive
+   slices of bytes/w bytes: bit `index` of each slice supplies one bit of
+   the word, the last slice giving the most significant bit. */
+gf_val_32_t gf_wgen_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  int slice_len = bytes / h->w;
+  int bit = index%8;
+  uint8_t *cursor;
+  uint32_t word;
+  int k;
+
+  /* Begin in the last slice, at the byte that contains bit `index`. */
+  cursor = (uint8_t *) start;
+  cursor += bytes - slice_len + index/8;
+
+  word = 0;
+  k = 0;
+  while (k < h->w) {
+    word <<= 1;
+    if ((*cursor) & (1 << bit)) word |= 1;
+    cursor -= slice_len;
+    k++;
+  }
+
+  return word;
+}
+
+/* Inverse of b obtained from gf_bitmatrix_inverse() using w and prim_poly. */
+static inline gf_val_32_t gf_wgen_matrix (gf_t *gf, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) (gf->scratch);
+
+  return gf_bitmatrix_inverse(b, h->w, h->prim_poly);
+}
+
+/* Shift-and-add multiply: carry-less product of a32 and b32 followed by
+   bit-by-bit reduction modulo the field polynomial. */
+static inline uint32_t
+gf_wgen_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  const uint64_t one = 1;
+  const uint64_t w = (uint64_t) h->w;
+  uint64_t poly = h->prim_poly | (one << h->w);   /* force the x^w term */
+  uint64_t x = a32, y = b32;
+  uint64_t acc = 0;
+  uint64_t bit;
+
+  /* Polynomial (XOR) product of the two operands. */
+  for (bit = 0; bit < w; bit++) {
+    if (x & (one << bit)) acc ^= (y << bit);
+  }
+
+  /* Cancel bits 2w-1 down to w with shifted copies of the polynomial. */
+  for (bit = h->w*2-1; bit >= w; bit--) {
+    if (acc & (one << bit)) acc ^= (poly << (bit - w));
+  }
+  return acc;
+}
+
+/* Install the shift multiply together with the Euclid inverse. */
+static int gf_wgen_shift_init(gf_t *gf)
+{
+  SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid)
+  SET_FUNCTION(gf,multiply,w32,gf_wgen_shift_multiply)
+  return 1;
+}
+
+/* BYTWO_b multiply: scan a from its low bit upward, adding b to the
+   product for each set bit and doubling b in the field after each step. */
+static gf_val_32_t
+gf_wgen_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  uint32_t poly = h->prim_poly;
+  uint32_t top = (1 << (h->w-1));      /* highest bit of a field element */
+  uint32_t acc = 0;
+
+  for (;;) {
+    if (a & 1) acc ^= b;
+    a >>= 1;
+    if (a == 0) break;                 /* no bits left: stop before doubling */
+
+    /* b *= x; fold the polynomial back in when the top bit is set. */
+    if (b & top) {
+      b = ((b << 1) ^ poly);
+    } else {
+      b <<= 1;
+    }
+  }
+  return acc;
+}
+
+/* Install the BYTWO_b multiply together with the Euclid inverse. */
+static int gf_wgen_bytwo_b_init(gf_t *gf)
+{
+  SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid)
+  SET_FUNCTION(gf,multiply,w32,gf_wgen_bytwo_b_multiply)
+  return 1;
+}
+
+/* BYTWO_p multiply: Horner-style pass over the bits of a from most to
+   least significant, doubling the product in the field and then adding
+   b whenever the current bit of a is set. */
+static inline gf_val_32_t
+gf_wgen_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  uint32_t poly = h->prim_poly;
+  uint32_t top = (1 << ((h->w)-1));    /* highest bit of a field element */
+  uint32_t acc = 0;
+  uint32_t probe;
+
+  for (probe = top; probe != 0; probe >>= 1) {
+    /* acc *= x, reducing by the polynomial when the top bit is set. */
+    if (acc & top) {
+      acc = ((acc << 1) ^ poly);
+    } else {
+      acc <<= 1;
+    }
+
+    /* Add b for the bit of a currently under the probe. */
+    if (a & probe) acc ^= b;
+  }
+
+  return acc;
+}
+
+
+/* Install the BYTWO_p multiply together with the Euclid inverse. */
+static int gf_wgen_bytwo_p_init(gf_t *gf)
+{
+  SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid)
+  SET_FUNCTION(gf,multiply,w32,gf_wgen_bytwo_p_multiply)
+  return 1;
+}
+
+/* Fill shift[] so that entry k is the field product of val and k for
+   every g_s-bit k.  g_s is 2 under GF_MULT_DEFAULT and h->arg1 otherwise.
+   Entries are built one power of two at a time: shift[hi|lo] combines
+   the already known shift[lo] with val times hi. */
+static void
+gf_wgen_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h)
+{
+  int g_s = (h->mult_type == GF_MULT_DEFAULT) ? 2 : h->arg1;
+  uint32_t limit = ((uint32_t)1 << g_s);
+  uint32_t hi, lo;
+
+  shift[0] = 0;
+
+  for (hi = 1; hi < limit; hi <<= 1) {
+    /* val currently holds the original val times hi. */
+    for (lo = 0; lo < hi; lo++) shift[hi|lo] = shift[lo]^val;
+
+    /* Double val in the field for the next power of two. */
+    if (val & (1 << (h->w-1))) {
+      val <<= 1;
+      val ^= h->prim_poly;
+    } else {
+      val <<= 1;
+    }
+  }
+}
+
+static
+inline
+gf_val_32_t
+gf_wgen_group_s_equals_r_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  int leftover, rs;
+  uint32_t p, l, ind, a32;
+  int bits_left;
+  int g_s;
+  int w;
+  /* GROUP multiply for the g_s == g_r case (installed by gf_wgen_group_init). */
+  struct gf_wgen_group_data *gd;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  g_s = h->arg1;
+  w = h->w;
+  /* Rebuild the shift table for this b: shift[k] is b times k. */
+  gd = (struct gf_wgen_group_data *) h->private;
+  gf_wgen_group_set_shift_tables(gd->shift, b, h);
+  /* The first chunk of a holds w % g_s bits, or g_s bits when that is 0. */
+  leftover = w % g_s;
+  if (leftover == 0) leftover = g_s;
+  /* Seed the product with the table entry for that first (top) chunk. */
+  rs = w - leftover;
+  a32 = a;
+  ind = a32 >> rs;
+  a32 <<= leftover;
+  a32 &= gd->mask;
+  p = gd->shift[ind];
+  /* From here on every chunk of a is exactly g_s bits wide. */
+  bits_left = rs;
+  rs = w - g_s;
+
+  while (bits_left > 0) {
+    bits_left -= g_s;
+    ind = a32 >> rs;                 /* next g_s bits of a */
+    a32 <<= g_s;
+    a32 &= gd->mask;
+    l = p >> rs;                     /* top g_s bits of p, shifted out below */
+    p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s)) & gd->mask;
+  }
+  return p;
+}
+
+/* Debug helper: render the low 28 bits of v as a freshly malloc()ed string
+   of '0'/'1' characters, most significant bit first.  The caller owns the
+   buffer and must free() it; NULL is returned if malloc() fails. */
+char *bits(uint32_t v)
+{
+  char *rv;
+  int i;
+
+  rv = malloc(30);
+  if (rv == NULL) return NULL;
+  for (i = 27; i >= 0; i--) rv[27 - i] = (v & ((uint32_t)1 << i)) ? '1' : '0';
+  rv[28] = '\0';
+  return rv;
+}
+/* Debug helper: render the low 56 bits of v as a freshly malloc()ed string
+   of '0'/'1' characters, most significant bit first.  The caller owns the
+   buffer and must free() it; NULL is returned if malloc() fails. */
+char *bits_56(uint64_t v)
+{
+  const uint64_t one = 1;
+  char *rv;
+  int i;
+
+  rv = malloc(60);
+  if (rv == NULL) return NULL;
+
+  /* Character k of the string shows bit 55 - k of v. */
+  for (i = 55; i >= 0; i--) rv[55 - i] = (v & (one << i)) ? '1' : '0';
+  rv[56] = '\0';
+  return rv;
+}
+
+static
+inline
+gf_val_32_t
+gf_wgen_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  int i;
+  int leftover;
+  uint64_t p, l, r;
+  uint32_t a32, ind;
+  int g_s, g_r;
+  struct gf_wgen_group_data *gd;
+  int w;
+  /* GROUP multiply, general case: build the unreduced product g_s bits of a at a time, then reduce it g_r bits at a time. */
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  if (h->mult_type == GF_MULT_DEFAULT) {
+    g_s = 2;
+    g_r = 8;
+  } else {
+    g_s = h->arg1;
+    g_r = h->arg2;
+  }
+  w = h->w;
+  gd = (struct gf_wgen_group_data *) h->private;
+  gf_wgen_group_set_shift_tables(gd->shift, b, h);   /* shift[k] = b times k */
+  /* The first chunk of a holds w % g_s bits, or g_s bits when that is 0. */
+  leftover = w % g_s;
+  if (leftover == 0) leftover = g_s;
+  /* Start the 64-bit product p with the table entry of the top chunk. */
+  a32 = a;
+  ind = a32 >> (w - leftover);
+  p = gd->shift[ind];
+  p <<= g_s;
+  a32 <<= leftover;
+  a32 &= gd->mask;
+  /* Middle chunks: add the table entry, then shift p up by g_s. */
+  i = (w - leftover);
+  while (i > g_s) {
+    ind = a32 >> (w-g_s);
+    p ^= gd->shift[ind];
+    a32 <<= g_s;
+    a32 &= gd->mask;
+    p <<= g_s;
+    i -= g_s;
+  }
+  /* Last chunk: added without a further shift. */
+  ind = a32 >> (h->w-g_s);
+  p ^= gd->shift[ind];
+  /* Reduce from offset tshift down to 0, g_r bits above position w+i per step. */
+  for (i = gd->tshift ; i >= 0; i -= g_r) {
+    l = p & (gd->rmask << i);
+    r = gd->reduce[l >> (i+w)];
+    r <<= (i);
+    p ^= r;
+  }
+  return p & gd->mask;
+}
+
+/* Set up the GROUP multiplier: place the shift and reduce tables in the
+   memory starting at gd->memory, precompute masks and fill reduce[]. */
+static int gf_wgen_group_init(gf_t *gf)
+{
+  uint32_t i, j, p, index;
+  struct gf_wgen_group_data *gd;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  uint32_t g_s, g_r;
+
+  if (h->mult_type == GF_MULT_DEFAULT) {
+    g_s = 2;
+    g_r = 8;
+  } else {
+    g_s = h->arg1;
+    g_r = h->arg2;
+  }
+  gd = (struct gf_wgen_group_data *) h->private;
+  gd->shift = &(gd->memory);
+  gd->reduce = gd->shift + (1 << g_s);
+  gd->mask = (h->w != 31) ? ((1 << h->w)-1) : 0x7fffffff;
+  /* rmask selects the g_r bits immediately above a w-bit value. */
+  gd->rmask = (1 << g_r) - 1;
+  gd->rmask <<= h->w;
+  /* tshift is the offset at which gf_wgen_group_multiply starts reducing. */
+  gd->tshift = h->w % g_s;
+  if (gd->tshift == 0) gd->tshift = g_s;
+  gd->tshift = (h->w - gd->tshift);
+  gd->tshift = ((gd->tshift-1)/g_r) * g_r;
+  /* reduce[k]: low w bits of the polynomial multiple whose bits above w equal k. */
+  gd->reduce[0] = 0;
+  for (i = 0; i < ((uint32_t)1 << g_r); i++) {
+    p = 0; index = 0;
+    for (j = 0; j < g_r; j++) {
+      if (i & (1 << j)) {
+        p ^= (h->prim_poly << j);
+        index ^= (h->prim_poly >> (h->w-j));
+      }
+    }
+    gd->reduce[index] = (p & gd->mask);
+  }
+
+  if (g_s == g_r) {
+    SET_FUNCTION(gf,multiply,w32,gf_wgen_group_s_equals_r_multiply)
+  } else {
+    SET_FUNCTION(gf,multiply,w32,gf_wgen_group_multiply) 
+  }
+  SET_FUNCTION(gf,divide,w32,NULL)    /* divide and inverse are both left */
+  SET_FUNCTION(gf,inverse,w32,NULL)   /* unset for the caller to fill in  */
+  return 1;
+}
+
+
+/* Table multiply for w <= 8: one lookup in the flat product table at
+   index (a << w) + b. */
+static gf_val_32_t
+gf_wgen_table_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_wgen_table_w8_data *tab;
+  gf_val_32_t slot = (a<<h->w)+b;
+
+  tab = (struct gf_wgen_table_w8_data *) h->private;
+  return (tab->mult[slot]);
+}
+
+/* Table divide for w <= 8: one lookup in the flat quotient table at
+   index (a << w) + b. */
+static gf_val_32_t
+gf_wgen_table_8_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_wgen_table_w8_data *tab;
+  gf_val_32_t slot = (a<<h->w)+b;
+
+  tab = (struct gf_wgen_table_w8_data *) h->private;
+  return (tab->div[slot]);
+}
+
+/* Build the w <= 8 product and quotient tables with the shift multiplier
+   and install the table lookup routines.  Always returns 1. */
+static int gf_wgen_table_8_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_wgen_table_w8_data *tab = (struct gf_wgen_table_w8_data *) h->private;
+  int w = h->w;
+  uint32_t order = ((uint32_t)1 << w);   /* number of field elements */
+  uint32_t x, y, prod;
+
+  /* Both tables live in the storage that begins at tab->base. */
+  tab->mult = &(tab->base);
+  tab->div = tab->mult + ((1<<h->w)*(1<<h->w));
+
+  /* Row 0 and column 0 of both tables are zero. */
+  for (x = 0; x < order; x++) {
+    tab->mult[x] = 0;
+    tab->mult[x<<w] = 0;
+    tab->div[x] = 0;
+    tab->div[x<<w] = 0;
+  }
+
+  /* Each non-zero product x*y = prod also records prod / x = y. */
+  for (x = 1; x < order; x++) {
+    for (y = 1; y < order; y++) {
+      prod = gf_wgen_shift_multiply(gf, x, y);
+      tab->mult[(x<<w)|y] = prod;
+      tab->div[(prod<<w)|x] = y;
+    }
+  }
+  SET_FUNCTION(gf,multiply,w32,gf_wgen_table_8_multiply)
+  SET_FUNCTION(gf,divide,w32,gf_wgen_table_8_divide)
+  return 1;
+}
+
+/* Table multiply with 16-bit entries: one lookup in the flat product
+   table at index (a << w) + b. */
+static gf_val_32_t
+gf_wgen_table_16_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_wgen_table_w16_data *tab;
+  gf_val_32_t slot = (a<<h->w)+b;
+
+  tab = (struct gf_wgen_table_w16_data *) h->private;
+  return (tab->mult[slot]);
+}
+
+/* Table divide with 16-bit entries: one lookup in the flat quotient
+   table at index (a << w) + b. */
+static gf_val_32_t
+gf_wgen_table_16_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_wgen_table_w16_data *tab;
+  gf_val_32_t slot = (a<<h->w)+b;
+
+  tab = (struct gf_wgen_table_w16_data *) h->private;
+  return (tab->div[slot]);
+}
+
+/* Build the 16-bit-entry product and quotient tables with the shift
+   multiplier and install the table lookup routines.  Always returns 1. */
+static int gf_wgen_table_16_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_wgen_table_w16_data *tab = (struct gf_wgen_table_w16_data *) h->private;
+  int w = h->w;
+  uint32_t order = ((uint32_t)1 << w);   /* number of field elements */
+  uint32_t x, y, prod;
+
+  /* Both tables live in the storage that begins at tab->base. */
+  tab->mult = &(tab->base);
+  tab->div = tab->mult + ((1<<h->w)*(1<<h->w));
+
+  /* Row 0 and column 0 of both tables are zero. */
+  for (x = 0; x < order; x++) {
+    tab->mult[x] = 0;
+    tab->mult[x<<w] = 0;
+    tab->div[x] = 0;
+    tab->div[x<<w] = 0;
+  }
+
+  /* Each non-zero product x*y = prod also records prod / x = y. */
+  for (x = 1; x < order; x++) {
+    for (y = 1; y < order; y++) {
+      prod = gf_wgen_shift_multiply(gf, x, y);
+      tab->mult[(x<<w)|y] = prod;
+      tab->div[(prod<<w)|x] = y;
+    }
+  }
+  SET_FUNCTION(gf,multiply,w32,gf_wgen_table_16_multiply)
+  SET_FUNCTION(gf,divide,w32,gf_wgen_table_16_divide)
+  return 1;
+}
+
+/* Pick the table implementation by word size: 8-bit entries for w <= 8,
+   16-bit entries for w <= 14.  The final return of 0 only satisfies the
+   compiler; larger w is expected to be rejected when scratch is sized. */
+static int gf_wgen_table_init(gf_t *gf)
+{
+  int w = ((gf_internal_t *) gf->scratch)->w;
+
+  if (w <= 8) {
+    return gf_wgen_table_8_init(gf);
+  } else if (w <= 14) {
+    return gf_wgen_table_16_init(gf);
+  }
+  return 0;
+}
+
+/* Log-table multiply for w <= 8: anti[log a + log b], with a zero
+   operand short-circuited to a zero product. */
+static gf_val_32_t
+gf_wgen_log_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_wgen_log_w8_data *lt = (struct gf_wgen_log_w8_data *) h->private;
+
+  if (a != 0 && b != 0) {
+    return (lt->anti[lt->log[a]+lt->log[b]]);
+  }
+  return 0;
+}
+
+/* Log-table divide for w <= 8: danti[log a - log b].  danti points into
+   the middle of the antilog storage, so a negative difference is still
+   a valid index.  Returns 0 when either operand is 0. */
+static gf_val_32_t
+gf_wgen_log_8_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_wgen_log_w8_data *lt = (struct gf_wgen_log_w8_data *) h->private;
+  int diff;
+
+  if (a == 0 || b == 0) return 0;
+
+  diff = lt->log[a];
+  diff -= lt->log[b];
+
+  return (lt->danti[diff]);
+}
+
+/* Build the log/antilog tables for w <= 8 by stepping through successive
+   powers of x (the element 2) and install the log-table multiply and
+   divide.  Returns 0 with _gf_errno = GF_E_LOGPOLY when an element is
+   reached a second time before the tables are complete. */
+static int gf_wgen_log_8_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_wgen_log_w8_data *lt = (struct gf_wgen_log_w8_data *) h->private;
+  int w = h->w;
+  uint32_t order = ((uint32_t)1 << w);
+  uint32_t elem, e;
+  int repeated = 0;
+
+  /* log, anti and danti all point into the storage starting at lt->base. */
+  lt->log = &(lt->base);
+  lt->anti = lt->log + (1<<h->w);
+  lt->danti = lt->anti + (1<<h->w)-1;
+
+  for (e = 0; e < order; e++) lt->log[e] = 0;
+
+  elem = 1;
+  for (e = 0; e < order-1; e++) {
+    /* A non-zero log means elem was already produced by a smaller power. */
+    if (lt->log[elem] != 0) repeated = 1;
+    lt->log[elem] = e;
+    lt->anti[e] = elem;
+    lt->danti[e] = elem;
+
+    /* elem *= x, reduced by the polynomial on overflow into bit w. */
+    elem <<= 1;
+    if (elem & (1<<w)) elem ^= h->prim_poly;
+  }
+
+  if (repeated != 0) {
+    _gf_errno = GF_E_LOGPOLY;
+    return 0;
+  }
+
+  SET_FUNCTION(gf,multiply,w32,gf_wgen_log_8_multiply)
+  SET_FUNCTION(gf,divide,w32,gf_wgen_log_8_divide)
+  return 1;
+}
+
+/* Log-table multiply with 16-bit entries: anti[log a + log b], with a
+   zero operand short-circuited to a zero product. */
+static gf_val_32_t
+gf_wgen_log_16_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_wgen_log_w16_data *lt = (struct gf_wgen_log_w16_data *) h->private;
+
+  if (a != 0 && b != 0) {
+    return (lt->anti[lt->log[a]+lt->log[b]]);
+  }
+  return 0;
+}
+
+/* Log-table divide with 16-bit entries: danti[log a - log b].  danti
+   points into the middle of the antilog storage, so a negative
+   difference is still a valid index.  Returns 0 if either operand is 0. */
+static gf_val_32_t
+gf_wgen_log_16_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_wgen_log_w16_data *lt = (struct gf_wgen_log_w16_data *) h->private;
+  int diff;
+
+  if (a == 0 || b == 0) return 0;
+
+  diff = lt->log[a];
+  diff -= lt->log[b];
+
+  return (lt->danti[diff]);
+}
+
+/* Build the 16-bit log/antilog tables by stepping through successive
+   powers of x (the element 2) and install the log-table multiply and
+   divide.  If an element is reached twice, an explicit GF_MULT_LOG_TABLE
+   request fails with GF_E_LOGPOLY; otherwise shift init is used instead. */
+static int gf_wgen_log_16_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_wgen_log_w16_data *lt = (struct gf_wgen_log_w16_data *) h->private;
+  int w = h->w;
+  uint32_t order = ((uint32_t)1 << w);
+  uint32_t elem, e;
+  int repeated = 0;
+
+  /* log, anti and danti all point into the storage starting at lt->base. */
+  lt->log = &(lt->base);
+  lt->anti = lt->log + (1<<h->w);
+  lt->danti = lt->anti + (1<<h->w)-1;
+
+  for (e = 0; e < order; e++) lt->log[e] = 0;
+
+  elem = 1;
+  for (e = 0; e < order-1; e++) {
+    /* A non-zero log means elem was already produced by a smaller power. */
+    if (lt->log[elem] != 0) repeated = 1;
+    lt->log[elem] = e;
+    lt->anti[e] = elem;
+    lt->danti[e] = elem;
+
+    /* elem *= x, reduced by the polynomial on overflow into bit w. */
+    elem <<= 1;
+    if (elem & (1<<w)) elem ^= h->prim_poly;
+  }
+
+  if (repeated) {
+    if (h->mult_type != GF_MULT_LOG_TABLE) return gf_wgen_shift_init(gf);
+    _gf_errno = GF_E_LOGPOLY;
+    return 0;
+  }
+
+  SET_FUNCTION(gf,multiply,w32,gf_wgen_log_16_multiply)
+  SET_FUNCTION(gf,divide,w32,gf_wgen_log_16_divide)
+  return 1;
+}
+
+/* Log-table multiply with 32-bit entries: anti[log a + log b], with a
+   zero operand short-circuited to a zero product. */
+static gf_val_32_t
+gf_wgen_log_32_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_wgen_log_w32_data *lt = (struct gf_wgen_log_w32_data *) h->private;
+
+  if (a != 0 && b != 0) {
+    return (lt->anti[lt->log[a]+lt->log[b]]);
+  }
+  return 0;
+}
+
+/* Log-table divide with 32-bit entries: danti[log a - log b].  danti
+   points into the middle of the antilog storage, so a negative
+   difference is still a valid index.  Returns 0 if either operand is 0. */
+static gf_val_32_t
+gf_wgen_log_32_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_wgen_log_w32_data *lt = (struct gf_wgen_log_w32_data *) h->private;
+  int diff;
+
+  if (a == 0 || b == 0) return 0;
+
+  diff = lt->log[a];
+  diff -= lt->log[b];
+
+  return (lt->danti[diff]);
+}
+
+/* Build the 32-bit log/antilog tables by stepping through successive
+   powers of x (the element 2) and install the log-table multiply and
+   divide.  Returns 0 with _gf_errno = GF_E_LOGPOLY when an element is
+   reached a second time before the tables are complete. */
+static int gf_wgen_log_32_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_wgen_log_w32_data *lt = (struct gf_wgen_log_w32_data *) h->private;
+  int w = h->w;
+  uint32_t order = ((uint32_t)1 << w);
+  uint32_t elem, e;
+  int repeated = 0;
+
+  /* log, anti and danti all point into the storage starting at lt->base. */
+  lt->log = &(lt->base);
+  lt->anti = lt->log + (1<<h->w);
+  lt->danti = lt->anti + (1<<h->w)-1;
+
+  for (e = 0; e < order; e++) lt->log[e] = 0;
+
+  elem = 1;
+  for (e = 0; e < order-1; e++) {
+    /* A non-zero log means elem was already produced by a smaller power. */
+    if (lt->log[elem] != 0) repeated = 1;
+    lt->log[elem] = e;
+    lt->anti[e] = elem;
+    lt->danti[e] = elem;
+
+    /* elem *= x, reduced by the polynomial on overflow into bit w. */
+    elem <<= 1;
+    if (elem & (1<<w)) elem ^= h->prim_poly;
+  }
+
+  if (repeated != 0) {
+    _gf_errno = GF_E_LOGPOLY;
+    return 0;
+  }
+
+  SET_FUNCTION(gf,multiply,w32,gf_wgen_log_32_multiply)
+  SET_FUNCTION(gf,divide,w32,gf_wgen_log_32_divide)
+  return 1;
+}
+
+/* Dispatch to the log-table initializer whose entry width fits w: 8-bit
+   for w <= 8, 16-bit for w <= 16, 32-bit for w <= 32; otherwise 0. */
+static int gf_wgen_log_init(gf_t *gf)
+{
+  int w = ((gf_internal_t *) gf->scratch)->w;
+
+  if (w <= 8) {
+    return gf_wgen_log_8_init(gf);
+  } else if (w <= 16) {
+    return gf_wgen_log_16_init(gf);
+  } else if (w <= 32) {
+    return gf_wgen_log_32_init(gf);
+  }
+  return 0;
+}
+
+/* Scratch-space size in bytes for a general-w field: sizeof(gf_internal_t)
+   plus the private structure and tables of the chosen multiplication
+   method.  Returns 0 for an unhandled mult_type and for TABLE / LOG_TABLE
+   with a w beyond their limits.  region_type and divide_type are unused. */
+int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divide_type, int arg1, int arg2)
+{
+  if (mult_type == GF_MULT_DEFAULT) {
+    /* Sized by width: 8-bit table data, 16-bit log data, then group data. */
+    if (w <= 8) {
+      return sizeof(gf_internal_t) + sizeof(struct gf_wgen_table_w8_data) +
+             sizeof(uint8_t)*(1 << w)*(1<<w)*2 + 64;
+    } else if (w <= 16) {
+      return sizeof(gf_internal_t) + sizeof(struct gf_wgen_log_w16_data) +
+             sizeof(uint16_t)*(1 << w)*3;
+    } else {
+      return sizeof(gf_internal_t) + sizeof(struct gf_wgen_group_data) +
+             sizeof(uint32_t) * (1 << 2) +
+             sizeof(uint32_t) * (1 << 8) + 64;
+    }
+  } else if (mult_type == GF_MULT_SHIFT || mult_type == GF_MULT_BYTWO_b ||
+             mult_type == GF_MULT_BYTWO_p) {
+    return sizeof(gf_internal_t);
+  } else if (mult_type == GF_MULT_GROUP) {
+    /* One table of 2^arg1 words and one of 2^arg2 words. */
+    return sizeof(gf_internal_t) + sizeof(struct gf_wgen_group_data) +
+           sizeof(uint32_t) * (1 << arg1) +
+           sizeof(uint32_t) * (1 << arg2) + 64;
+  } else if (mult_type == GF_MULT_TABLE) {
+    /* Two tables of 2^w * 2^w entries; only sized for w < 15. */
+    if (w <= 8) {
+      return sizeof(gf_internal_t) + sizeof(struct gf_wgen_table_w8_data) +
+             sizeof(uint8_t)*(1 << w)*(1<<w)*2 + 64;
+    } else if (w < 15) {
+      return sizeof(gf_internal_t) + sizeof(struct gf_wgen_table_w16_data) +
+             sizeof(uint16_t)*(1 << w)*(1<<w)*2 + 64;
+    }
+  } else if (mult_type == GF_MULT_LOG_TABLE) {
+    /* 3 * 2^w entries of the width that fits w; only sized for w <= 27. */
+    if (w <= 8) {
+      return sizeof(gf_internal_t) + sizeof(struct gf_wgen_log_w8_data) +
+             sizeof(uint8_t)*(1 << w)*3;
+    } else if (w <= 16) {
+      return sizeof(gf_internal_t) + sizeof(struct gf_wgen_log_w16_data) +
+             sizeof(uint16_t)*(1 << w)*3;
+    } else if (w <= 27) {
+      return sizeof(gf_internal_t) + sizeof(struct gf_wgen_log_w32_data) +
+             sizeof(uint32_t)*(1 << w)*3;
+    }
+  }
+
+  /* Unhandled mult_type, or w outside the range of TABLE / LOG_TABLE. */
+  return 0;
+}
+
+/* Region multiply for the bit-sliced layout (w slices of bytes/w bytes).
+   For source slice i, val has been doubled i times; each set bit j of it
+   sends that slice to destination slice j through gf_multby_one(). */
+void gf_wgen_cauchy_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  gf_internal_t *h;
+  gf_region_data rd;
+  int seen;        /* bit j set once destination slice j has been written */
+  int slice_len, i, j;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, -1);
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  h = (gf_internal_t *) gf->scratch;
+  slice_len = bytes / (h->w);
+  seen = (xor) ? 0xffffffff : 0;   /* with xor, every slice counts as written */
+  for (i = 0; i < h->w; i++) {
+    for (j = 0; j < h->w; j++) {
+      if ((val & (1 << j)) == 0) continue;
+      gf_multby_one(src, ((uint8_t *)dest) + j*slice_len, slice_len, (seen & (1 << j)));
+      seen |= (1 << j);
+    }
+    src = (uint8_t *)src + slice_len;
+    val = gf->multiply.w32(gf, val, 2);
+  }
+}
+
+/* Initialise a general-w field.  Picks a default polynomial when none was
+   given (otherwise forces the x^w term and rejects higher bits), runs the
+   initializer for h->mult_type and fills in missing divide / inverse. */
+int gf_wgen_init(gf_t *gf)
+{
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+
+  if (h->prim_poly == 0) {
+    switch (h->w) {
+      case 1: h->prim_poly = 1; break;
+      case 2: h->prim_poly = 7; break;
+      case 3: h->prim_poly = 013; break;
+      case 4: h->prim_poly = 023; break;
+      case 5: h->prim_poly = 045; break;
+      case 6: h->prim_poly = 0103; break;
+      case 7: h->prim_poly = 0211; break;
+      case 8: h->prim_poly = 0435; break;
+      case 9: h->prim_poly = 01021; break;
+      case 10: h->prim_poly = 02011; break;
+      case 11: h->prim_poly = 04005; break;
+      case 12: h->prim_poly = 010123; break;
+      case 13: h->prim_poly = 020033; break;
+      case 14: h->prim_poly = 042103; break;
+      case 15: h->prim_poly = 0100003; break;
+      case 16: h->prim_poly = 0210013; break;
+      case 17: h->prim_poly = 0400011; break;
+      case 18: h->prim_poly = 01000201; break;
+      case 19: h->prim_poly = 02000047; break;
+      case 20: h->prim_poly = 04000011; break;
+      case 21: h->prim_poly = 010000005; break;
+      case 22: h->prim_poly = 020000003; break;
+      case 23: h->prim_poly = 040000041; break;
+      case 24: h->prim_poly = 0100000207; break;
+      case 25: h->prim_poly = 0200000011; break;
+      case 26: h->prim_poly = 0400000107; break;
+      case 27: h->prim_poly = 01000000047; break;
+      case 28: h->prim_poly = 02000000011; break;
+      case 29: h->prim_poly = 04000000005; break;
+      case 30: h->prim_poly = 010040000007; break;
+      case 31: h->prim_poly = 020000000011; break;
+      case 32: h->prim_poly = 00020000007; break;
+      default: fprintf(stderr, "gf_wgen_init: w not defined yet\n"); exit(1);
+    }
+  } else if (h->w == 32) {
+    h->prim_poly &= 0xffffffff;
+  } else {
+    /* 1ULL: an int 1 << 31 is negative and would set every higher bit when widened for the check below. */
+    h->prim_poly |= (1ULL << h->w);
+    if (h->prim_poly & ~((1ULL<<(h->w+1))-1)) return 0;
+  }
+
+  SET_FUNCTION(gf,multiply,w32,NULL)
+  SET_FUNCTION(gf,divide,w32,NULL)
+  SET_FUNCTION(gf,inverse,w32,NULL)
+  SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
+  SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
+
+  switch(h->mult_type) {
+    case GF_MULT_DEFAULT:
+      if (h->w <= 8) {
+        if (gf_wgen_table_init(gf) == 0) return 0; 
+      } else if (h->w <= 16) {
+        if (gf_wgen_log_init(gf) == 0) return 0; 
+      } else {
+        if (gf_wgen_bytwo_p_init(gf) == 0) return 0; 
+      }
+      break;
+    case GF_MULT_SHIFT:     if (gf_wgen_shift_init(gf) == 0) return 0; break;
+    case GF_MULT_BYTWO_b:     if (gf_wgen_bytwo_b_init(gf) == 0) return 0; break;
+    case GF_MULT_BYTWO_p:     if (gf_wgen_bytwo_p_init(gf) == 0) return 0; break;
+    case GF_MULT_GROUP:     if (gf_wgen_group_init(gf) == 0) return 0; break;
+    case GF_MULT_TABLE:     if (gf_wgen_table_init(gf) == 0) return 0; break;
+    case GF_MULT_LOG_TABLE: if (gf_wgen_log_init(gf) == 0) return 0; break;
+    default: return 0;
+  }
+  if (h->divide_type == GF_DIVIDE_EUCLID) {
+    SET_FUNCTION(gf,divide,w32,gf_wgen_divide_from_inverse)
+    SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid)
+  } else if (h->divide_type == GF_DIVIDE_MATRIX) {
+    SET_FUNCTION(gf,divide,w32,gf_wgen_divide_from_inverse)
+    SET_FUNCTION(gf,inverse,w32,gf_wgen_matrix)
+  }
+
+  if (gf->inverse.w32== NULL && gf->divide.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid)
+  if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) {
+    SET_FUNCTION(gf,divide,w32,gf_wgen_divide_from_inverse)
+  }
+  if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) {
+    SET_FUNCTION(gf,inverse,w32,gf_wgen_inverse_from_divide)
+  }
+  return 1;
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/neon/gf_w16_neon.c b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w16_neon.c
new file mode 100644
index 000000000..477ee6359
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w16_neon.c
@@ -0,0 +1,276 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j@jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * gf_w16_neon.c
+ *
+ * Neon routines for 16-bit Galois fields
+ *
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w16.h"
+/*
+ * Builds without ARCH_AARCH64 get vqtbl1q_u8() as a macro: the 16 index
+ * bytes in `v` are split into their low and high 8-byte halves, each half is
+ * looked up with vtbl2_u8(), and the two results are recombined.  In this
+ * configuration `tbl` is a uint8x8x2_t holding the two halves of one
+ * 16-entry table.  Both arguments are expanded twice, so pass expressions
+ * without side effects.
+ */
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
+                                       vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+/*
+ * Multiply a region of 16-bit words (standard layout) by a constant using
+ * eight 16-entry nibble tables.
+ *
+ * tbl points at 128 bytes: tbl[i*16 + j] and tbl[64 + i*16 + j] are looked
+ * up with nibble i of each source word and give the low-byte and high-byte
+ * contribution respectively.  The caller in this file fills them with the
+ * low and high byte of the product of (j << (4 * i)) and val.
+ *
+ * Every loop iteration reads 32 bytes (16 words) from src and writes 32
+ * bytes to dst, stopping once dst is no longer below d_end; the distance
+ * from dst to d_end is therefore expected to be a multiple of 32 bytes.
+ * When xor is non-zero the products are XORed into the existing contents of
+ * dst, otherwise dst is overwritten.  gf and val are not referenced here.
+ *
+ * The comments below call the even-addressed bytes the "low" bytes of the
+ * words, which holds on a little-endian target.
+ */
+static
+inline
+void
+neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
+                                 uint16_t *d_end, uint8_t *tbl,
+                                 gf_val_32_t val, int xor)
+{
+  unsigned i;
+  uint8_t *high = tbl + 4 * 16;   /* high-byte tables follow the four low-byte tables */
+  uint8x16_t loset, rl, rh;
+  uint8x16x2_t va;
+  /* Load all eight 16-byte tables into NEON registers. */
+#ifdef ARCH_AARCH64
+  uint8x16_t tbl_h[4], tbl_l[4];
+  for (i = 0; i < 4; i++) {
+      tbl_l[i] = vld1q_u8(tbl + i*16);
+      tbl_h[i] = vld1q_u8(high + i*16);
+  }
+#else
+  uint8x8x2_t tbl_h[4], tbl_l[4];
+  for (i = 0; i < 4; i++) {
+      tbl_l[i].val[0] = vld1_u8(tbl + i*16);
+      tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
+      tbl_h[i].val[0] = vld1_u8(high + i*16);
+      tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+  }
+#endif
+  /* 0x0f in every lane: keeps the low nibble of each byte. */
+  loset = vdupq_n_u8(0xf);
+
+  if (xor) {
+    uint8x16x2_t vb;
+    while (dst < d_end) {
+      va = vld2q_u8((uint8_t*)src);   /* val[0] = even bytes, val[1] = odd bytes */
+      vb = vld2q_u8((uint8_t*)dst);   /* current destination, split the same way */
+      /* Low nibbles: even bytes index tables 0, odd bytes index tables 2. */
+      rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
+      rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset)));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset)));
+      /* Bring the high nibbles down to bits 0-3 so they can be used as indices. */
+      va.val[0] = vshrq_n_u8(va.val[0], 4);
+      va.val[1] = vshrq_n_u8(va.val[1], 4);
+      /* High nibbles: even bytes index tables 1, odd bytes index tables 3. */
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
+      va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
+      va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
+      /* Accumulate into the destination and store re-interleaved. */
+      va.val[0] = veorq_u8(va.val[0], vb.val[0]);
+      va.val[1] = veorq_u8(va.val[1], vb.val[1]);
+      vst2q_u8((uint8_t*)dst, va);
+      /* uint16_t pointers: 16 elements are the 32 bytes just processed. */
+      src += 16;
+      dst += 16;
+    }
+  } else {   /* overwrite variant: same lookups, dst is never read */
+    while (dst < d_end) {
+      va = vld2q_u8((uint8_t*)src);
+
+      rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
+      rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset)));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset)));
+
+      va.val[0] = vshrq_n_u8(va.val[0], 4);
+      va.val[1] = vshrq_n_u8(va.val[1], 4);
+
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
+      va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
+      va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
+
+      vst2q_u8((uint8_t*)dst, va);
+
+      src += 16;
+      dst += 16;
+    }
+  }
+}
+
+/*
+ * ALTMAP variant of the split-4 region multiply for w=16.
+ *
+ * In this layout each 32-byte chunk is read as two 16-byte vectors: the
+ * first (src) carries the bytes whose nibbles index tables 2 and 3, the
+ * second (src + 16) the bytes whose nibbles index tables 0 and 1.  The
+ * result is written in the same arrangement: high-byte products at dst,
+ * low-byte products at dst + 16.
+ *
+ * tbl points at 128 bytes: four 16-entry low-byte tables followed by four
+ * 16-entry high-byte tables.  The loop advances 32 bytes per iteration
+ * while dst is below d_end.  A non-zero xor XORs the products into dst
+ * instead of overwriting it.  gf and val are not referenced here.
+ */
+static inline void
+neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
+                                        uint8_t *dst, uint8_t *d_end,
+                                        uint8_t *tbl, gf_val_32_t val,
+                                        int xor)
+{
+  const uint8_t *lo_bytes = tbl;
+  const uint8_t *hi_bytes = tbl + 4 * 16;
+  unsigned t;
+  uint8x16_t nibble_mask;
+#ifdef ARCH_AARCH64
+  uint8x16_t lut_lo[4], lut_hi[4];
+
+  for (t = 0; t < 4; t++) {
+    lut_lo[t] = vld1q_u8(lo_bytes + t*16);
+    lut_hi[t] = vld1q_u8(hi_bytes + t*16);
+  }
+#else
+  uint8x8x2_t lut_lo[4], lut_hi[4];
+
+  for (t = 0; t < 4; t++) {
+    lut_lo[t].val[0] = vld1_u8(lo_bytes + t*16);
+    lut_lo[t].val[1] = vld1_u8(lo_bytes + t*16 + 8);
+    lut_hi[t].val[0] = vld1_u8(hi_bytes + t*16);
+    lut_hi[t].val[1] = vld1_u8(hi_bytes + t*16 + 8);
+  }
+#endif
+
+  nibble_mask = vdupq_n_u8(0xf);
+
+  for (; dst < d_end; src += 32, dst += 32) {
+    uint8x16_t in_hi  = vld1q_u8(src);
+    uint8x16_t in_lo  = vld1q_u8(src + 16);
+    /* Four nibble planes, named after the table index they select. */
+    uint8x16_t nib0   = vandq_u8(in_lo, nibble_mask);
+    uint8x16_t nib1   = vshrq_n_u8(in_lo, 4);
+    uint8x16_t nib2   = vandq_u8(in_hi, nibble_mask);
+    uint8x16_t nib3   = vshrq_n_u8(in_hi, 4);
+    uint8x16_t out_lo = vqtbl1q_u8(lut_lo[0], nib0);
+    uint8x16_t out_hi = vqtbl1q_u8(lut_hi[0], nib0);
+
+    out_lo = veorq_u8(out_lo, vqtbl1q_u8(lut_lo[2], nib2));
+    out_hi = veorq_u8(out_hi, vqtbl1q_u8(lut_hi[2], nib2));
+    out_lo = veorq_u8(out_lo, vqtbl1q_u8(lut_lo[1], nib1));
+    out_hi = veorq_u8(out_hi, vqtbl1q_u8(lut_hi[1], nib1));
+    out_lo = veorq_u8(out_lo, vqtbl1q_u8(lut_lo[3], nib3));
+    out_hi = veorq_u8(out_hi, vqtbl1q_u8(lut_hi[3], nib3));
+
+    if (xor) {
+      /* Accumulate on top of what is already stored in dst. */
+      out_hi = veorq_u8(out_hi, vld1q_u8(dst));
+      out_lo = veorq_u8(out_lo, vld1q_u8(dst + 16));
+    }
+
+    vst1q_u8(dst, out_hi);
+    vst1q_u8(dst + 16, out_lo);
+  }
+}
+
+
+
+/*
+ * Shared driver for the NEON split-4/16 "lazy" region multiply (w=16).
+ *
+ * val == 0 and val == 1 are delegated to gf_multby_zero / gf_multby_one.
+ * Otherwise a 128-byte table is built on the stack for this call: for each
+ * nibble position i (0..3) and nibble value j (0..15) the product of
+ * (j << (4 * i)) and val is computed with gf->multiply.w32; its low byte
+ * goes to tbl[i*16 + j] and the next byte to tbl[64 + i*16 + j].
+ *
+ * The region is then described with gf_set_region_data (last argument 32,
+ * matching the 32 bytes the kernels consume per iteration), and the kernel
+ * runs between gf_do_initial_region_alignment and
+ * gf_do_final_region_alignment (defined elsewhere; presumably they process
+ * the unaligned head and tail of the region).
+ *
+ * altmap selects the ALTMAP kernel; xor selects accumulate vs. overwrite.
+ */
+static inline void
+neon_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest,
+                                         gf_val_32_t val, int bytes, int xor,
+                                         int altmap)
+{
+  gf_region_data rd;
+  uint8_t tbl[2 * 4 * 16];
+  uint64_t operand, product;
+  unsigned idx;
+
+  if (val == 0) {
+    gf_multby_zero(dest, bytes, xor);
+    return;
+  }
+  if (val == 1) {
+    gf_multby_one(src, dest, bytes, xor);
+    return;
+  }
+
+  /* idx walks the 4 x 16 grid row by row: idx >> 4 is the nibble position,
+   * idx & 0xf the nibble value. */
+  for (idx = 0; idx < 4 * 16; idx++) {
+    operand = ((idx & 0xf) << ((idx >> 4) * 4));
+    product = gf->multiply.w32(gf, operand, val);
+    tbl[idx]          = product & 0xff;
+    tbl[4 * 16 + idx] = product >> 8;
+  }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
+  gf_do_initial_region_alignment(&rd);
+
+  /* The kernels are called with a literal 0 or 1 for xor rather than the
+   * variable itself (presumably so the inlined copies can be specialised). */
+  if (altmap) {
+    uint8_t *in8      = rd.s_start;
+    uint8_t *out8     = rd.d_start;
+    uint8_t *out8_end = rd.d_top;
+
+    if (xor) {
+      neon_w16_split_4_altmap_multiply_region(gf, in8, out8, out8_end, tbl, val, 1);
+    } else {
+      neon_w16_split_4_altmap_multiply_region(gf, in8, out8, out8_end, tbl, val, 0);
+    }
+  } else {
+    uint16_t *in16      = rd.s_start;
+    uint16_t *out16     = rd.d_start;
+    uint16_t *out16_end = rd.d_top;
+
+    if (xor) {
+      neon_w16_split_4_multiply_region(gf, in16, out16, out16_end, tbl, val, 1);
+    } else {
+      neon_w16_split_4_multiply_region(gf, in16, out16, out16_end, tbl, val, 0);
+    }
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+
+/*
+ * multiply_region entry point for the standard (non-ALTMAP) layout:
+ * forwards every argument to the shared driver with altmap = 0.
+ */
+static void
+gf_w16_split_4_16_lazy_multiply_region_neon(gf_t *gf,
+                                            void *src,
+                                            void *dest,
+                                            gf_val_32_t val,
+                                            int bytes,
+                                            int xor)
+{
+  neon_w16_split_4_16_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
+}
+
+/*
+ * multiply_region entry point for the ALTMAP layout: forwards every
+ * argument to the shared driver with altmap = 1.
+ */
+static void
+gf_w16_split_4_16_lazy_altmap_multiply_region_neon(gf_t *gf,
+                                                   void *src,
+                                                   void *dest,
+                                                   gf_val_32_t val,
+                                                   int bytes,
+                                                   int xor)
+{
+  neon_w16_split_4_16_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
+}
+
+
+/*
+ * Install the NEON split-4/16 region multiply on gf.  The ALTMAP entry
+ * point is chosen when GF_REGION_ALTMAP is set in the region_type stored in
+ * gf->scratch, the standard-layout one otherwise.
+ */
+void gf_w16_neon_split_init(gf_t *gf)
+{
+  gf_internal_t *scratch = (gf_internal_t *) gf->scratch;
+  const int use_altmap = (scratch->region_type & GF_REGION_ALTMAP) != 0;
+
+  if (use_altmap) {
+    SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_altmap_multiply_region_neon)
+  } else {
+    SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region_neon)
+  }
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/neon/gf_w32_neon.c b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w32_neon.c
new file mode 100644
index 000000000..7fd13290e
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w32_neon.c
@@ -0,0 +1,269 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j@jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * gf_w32_neon.c
+ *
+ * Neon routines for 32-bit Galois fields
+ *
+ */
+
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w32.h"
+/*
+ * Builds without ARCH_AARCH64 get vqtbl1q_u8() as a macro: the 16 index
+ * bytes in `v` are split into their low and high 8-byte halves, each half is
+ * looked up with vtbl2_u8(), and the two results are recombined.  In this
+ * configuration `tbl` is a uint8x8x2_t holding the two halves of one
+ * 16-entry table.  Both arguments are expanded twice, so pass expressions
+ * without side effects.
+ */
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
+                                       vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+/*
+ * Multiply a region of 32-bit words by a constant using 4-bit split tables.
+ *
+ * btable[i][j] is a 16-entry byte table: indexed with nibble i of a source
+ * word it yields byte j of that nibble's contribution to the product (the
+ * caller in this file fills it from the 32-bit products for nibble
+ * position i).  The four result bytes of a word are the XOR of the eight
+ * per-nibble contributions.
+ *
+ * Each iteration consumes 64 bytes (16 words) from src and produces 64
+ * bytes at dst, looping while dst is below d_end.  With altmap == 0 the
+ * words are first transposed so that each vector holds one byte position
+ * of all 16 words, and transposed back before the store; with altmap != 0
+ * the four loaded vectors are used as those byte planes directly.  A
+ * non-zero xor XORs the result into dst.  gf and val are not referenced.
+ *
+ * "Byte n" below means the n-th byte of a word in memory order, which
+ * assumes a little-endian target.
+ */
+static
+void
+neon_w32_split_4_32_multiply_region(gf_t *gf, uint32_t *src, uint32_t *dst,
+                                    uint32_t *d_end, uint8_t btable[8][4][16],
+                                    uint32_t val, int xor, int altmap)
+{
+  int i, j;
+#ifdef ARCH_AARCH64
+  uint8x16_t tables[8][4];
+#else
+  uint8x8x2_t tables[8][4];
+#endif
+  uint32x4_t v0, v1, v2, v3, s0, s1, s2, s3;
+  uint8x16_t p0, p1, p2, p3, si, mask1;
+  uint16x8x2_t r0, r1;
+  uint8x16x2_t q0, q1;
+  /* Copy the 8 x 4 byte tables into NEON registers. */
+  for (i = 0; i < 8; i++) {
+    for (j = 0; j < 4; j++) {
+#ifdef ARCH_AARCH64
+      tables[i][j] = vld1q_u8(btable[i][j]);
+#else
+      tables[i][j].val[0] = vld1_u8(btable[i][j]);
+      tables[i][j].val[1] = vld1_u8(btable[i][j] + 8);
+#endif
+    }
+  }
+
+  mask1 = vdupq_n_u8(0xf);   /* keeps the low nibble of each byte */
+
+  while (dst < d_end) {
+      /* Fetch 16 words (64 bytes) of input. */
+      v0 = vld1q_u32(src); src += 4;
+      v1 = vld1q_u32(src); src += 4;
+      v2 = vld1q_u32(src); src += 4;
+      v3 = vld1q_u32(src); src += 4;
+      /* Form byte planes: q0.val[0], q0.val[1], q1.val[0], q1.val[1] = bytes 0..3. */
+      if (altmap) {
+          q0.val[0] = vreinterpretq_u8_u32(v0);
+          q0.val[1] = vreinterpretq_u8_u32(v1);
+          q1.val[0] = vreinterpretq_u8_u32(v2);
+          q1.val[1] = vreinterpretq_u8_u32(v3);
+      } else {
+          r0 = vtrnq_u16(vreinterpretq_u16_u32(v0), vreinterpretq_u16_u32(v2));
+          r1 = vtrnq_u16(vreinterpretq_u16_u32(v1), vreinterpretq_u16_u32(v3));
+
+          q0 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[0]),
+                        vreinterpretq_u8_u16(r1.val[0]));
+          q1 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[1]),
+                        vreinterpretq_u8_u16(r1.val[1]));
+      }
+      /* Byte 0: low nibble -> tables[0], high nibble -> tables[1]. */
+      si = vandq_u8(q0.val[0], mask1);
+      p0 = vqtbl1q_u8(tables[0][0], si);
+      p1 = vqtbl1q_u8(tables[0][1], si);
+      p2 = vqtbl1q_u8(tables[0][2], si);
+      p3 = vqtbl1q_u8(tables[0][3], si);
+
+      si = vshrq_n_u8(q0.val[0], 4);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[1][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[1][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[1][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[1][3], si));
+      /* Byte 1: low nibble -> tables[2], high nibble -> tables[3]. */
+      si = vandq_u8(q0.val[1], mask1);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[2][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[2][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[2][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[2][3], si));
+
+      si = vshrq_n_u8(q0.val[1], 4);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[3][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[3][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[3][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[3][3], si));
+      /* Byte 2: low nibble -> tables[4], high nibble -> tables[5]. */
+      si = vandq_u8(q1.val[0], mask1);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[4][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[4][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[4][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[4][3], si));
+
+      si = vshrq_n_u8(q1.val[0], 4);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[5][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[5][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[5][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[5][3], si));
+      /* Byte 3: low nibble -> tables[6], high nibble -> tables[7]. */
+      si = vandq_u8(q1.val[1], mask1);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[6][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[6][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[6][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[6][3], si));
+
+      si = vshrq_n_u8(q1.val[1], 4);
+      p0 = veorq_u8(p0, vqtbl1q_u8(tables[7][0], si));
+      p1 = veorq_u8(p1, vqtbl1q_u8(tables[7][1], si));
+      p2 = veorq_u8(p2, vqtbl1q_u8(tables[7][2], si));
+      p3 = veorq_u8(p3, vqtbl1q_u8(tables[7][3], si));
+      /* p0..p3 now hold result bytes 0..3; undo the transpose unless altmap. */
+      if (altmap) {
+          s0 = vreinterpretq_u32_u8(p0);
+          s1 = vreinterpretq_u32_u8(p1);
+          s2 = vreinterpretq_u32_u8(p2);
+          s3 = vreinterpretq_u32_u8(p3);
+      } else {
+          q0 = vtrnq_u8(p0, p1);
+          q1 = vtrnq_u8(p2, p3);
+
+          r0 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[0]),
+                         vreinterpretq_u16_u8(q1.val[0]));
+          r1 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[1]),
+                         vreinterpretq_u16_u8(q1.val[1]));
+
+          s0 = vreinterpretq_u32_u16(r0.val[0]);
+          s1 = vreinterpretq_u32_u16(r1.val[0]);
+          s2 = vreinterpretq_u32_u16(r0.val[1]);
+          s3 = vreinterpretq_u32_u16(r1.val[1]);
+      }
+      /* Fold in the current destination contents when accumulating. */
+      if (xor) {
+          v0 = vld1q_u32(dst);
+          v1 = vld1q_u32(dst + 4);
+          v2 = vld1q_u32(dst + 8);
+          v3 = vld1q_u32(dst + 12);
+          s0 = veorq_u32(s0, v0);
+          s1 = veorq_u32(s1, v1);
+          s2 = veorq_u32(s2, v2);
+          s3 = veorq_u32(s3, v3);
+      }
+
+      vst1q_u32(dst,      s0);
+      vst1q_u32(dst + 4,  s1);
+      vst1q_u32(dst + 8,  s2);
+      vst1q_u32(dst + 12, s3);
+
+      dst += 16;   /* 16 words = the 64 bytes just written */
+  }
+}
+
+/*
+ * Shared driver for the NEON split-4/32 "lazy" region multiply (w=32).
+ *
+ * val == 0 and val == 1 are delegated to gf_multby_zero / gf_multby_one.
+ * Otherwise the region is described with gf_set_region_data (last argument
+ * 64, matching the 64 bytes the kernel consumes per iteration) and the
+ * per-call lookup tables are built:
+ *
+ *   - for each of the 8 nibble positions, products[k] is the XOR of
+ *     (val * x^(4*position + b)) over the set bits b of k, where the
+ *     running value is multiplied by x (shift left, XOR with the primitive
+ *     polynomial when GF_FIRST_BIT was set) after each bit;
+ *   - btable[position][byte][k] is byte `byte` of products[k].
+ *
+ * The kernel then runs between gf_do_initial_region_alignment and
+ * gf_do_final_region_alignment (defined elsewhere; presumably they process
+ * the unaligned head and tail of the region).
+ */
+static inline void
+neon_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor, int altmap)
+{
+  gf_region_data rd;
+  gf_internal_t *scratch;
+  uint32_t poly, running, products[16];
+  uint32_t *in32, *out32, *out32_end;
+  uint8_t btable[8][4][16];
+  int position, bit, idx, byte;
+
+  if (val == 0) {
+    gf_multby_zero(dest, bytes, xor);
+    return;
+  }
+  if (val == 1) {
+    gf_multby_one(src, dest, bytes, xor);
+    return;
+  }
+
+  scratch = (gf_internal_t *) gf->scratch;
+  poly = scratch->prim_poly;
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64);
+  gf_do_initial_region_alignment(&rd);
+
+  in32      = (uint32_t *) rd.s_start;
+  out32     = (uint32_t *) rd.d_start;
+  out32_end = (uint32_t *) rd.d_top;
+
+  running = val;
+  for (position = 0; position < 8; position++) {
+    products[0] = 0;
+    for (bit = 1; bit < 16; bit <<= 1) {
+      /* Extend the table to all indices that have `bit` as their top bit. */
+      for (idx = 0; idx < bit; idx++) {
+        products[idx ^ bit] = (running ^ products[idx]);
+      }
+      /* Advance to the next power of x, reducing on overflow. */
+      if (running & GF_FIRST_BIT) {
+        running = (running << 1) ^ poly;
+      } else {
+        running = running << 1;
+      }
+    }
+    /* Slice the sixteen 32-bit products into four byte tables. */
+    for (byte = 0; byte < 4; byte++) {
+      for (idx = 0; idx < 16; idx++) {
+        btable[position][byte][idx] = (uint8_t) (products[idx] >> (8 * byte));
+      }
+    }
+  }
+
+  /* The kernel receives a literal 0 or 1 for xor rather than the variable. */
+  if (xor) {
+    neon_w32_split_4_32_multiply_region(gf, in32, out32, out32_end, btable, val, 1, altmap);
+  } else {
+    neon_w32_split_4_32_multiply_region(gf, in32, out32, out32_end, btable, val, 0, altmap);
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+
+/*
+ * multiply_region entry point for the standard (non-ALTMAP) layout:
+ * forwards every argument to the shared driver with altmap = 0.
+ */
+static void
+gf_w32_split_4_32_lazy_multiply_region_neon(gf_t *gf,
+                                            void *src,
+                                            void *dest,
+                                            gf_val_32_t val,
+                                            int bytes,
+                                            int xor)
+{
+  neon_w32_split_4_32_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
+}
+
+/*
+ * multiply_region entry point for the ALTMAP layout: forwards every
+ * argument to the shared driver with altmap = 1.
+ */
+static void
+gf_w32_split_4_32_lazy_altmap_multiply_region_neon(gf_t *gf,
+                                                   void *src,
+                                                   void *dest,
+                                                   gf_val_32_t val,
+                                                   int bytes,
+                                                   int xor)
+{
+  neon_w32_split_4_32_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
+}
+
+/*
+ * Install the NEON split-4/32 region multiply on gf.  The ALTMAP entry
+ * point is chosen when GF_REGION_ALTMAP is set in the region_type stored in
+ * gf->scratch, the standard-layout one otherwise.
+ */
+void gf_w32_neon_split_init(gf_t *gf)
+{
+  gf_internal_t *scratch = (gf_internal_t *) gf->scratch;
+  const int use_altmap = (scratch->region_type & GF_REGION_ALTMAP) != 0;
+
+  if (use_altmap) {
+    SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_altmap_multiply_region_neon)
+  } else {
+    SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_multiply_region_neon)
+  }
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c
new file mode 100644
index 000000000..5f35c8634
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c
@@ -0,0 +1,247 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j@jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * gf_w4_neon.c
+ *
+ * Neon routines for 4-bit Galois fields
+ *
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w4.h"
+
+/*
+ * Multiply two GF(2^4) elements with the NEON polynomial multiply.
+ *
+ * a4 and b4 are broadcast to all lanes and multiplied carry-lessly; the
+ * bits of the product above bit 3 are then folded back once by multiplying
+ * them with the low five bits of the primitive polynomial and XORing the
+ * result in.  Lane 0 of the folded vector is returned.
+ *
+ * NOTE(review): only one folding step is performed and the result is not
+ * masked, so this presumably relies on the primitive polynomial having no
+ * terms between x^4 and x^1 (e.g. x^4 + x + 1) -- confirm against the code
+ * that selects this implementation.
+ */
+static gf_val_32_t
+gf_w4_neon_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
+{
+  gf_internal_t *scratch = gf->scratch;
+  poly8x8_t poly     = vdup_n_p8 ((uint8_t)(scratch->prim_poly & 0x1fULL));
+  poly8x8_t product  = vmul_p8 (vdup_n_p8 (a4), vdup_n_p8 (b4));
+  /* Bits 4 and up of the raw product are what has to be reduced away. */
+  uint8x8_t overflow = vshr_n_u8 (vreinterpret_u8_p8 (product), 4);
+  poly8x8_t fold     = vmul_p8 (poly, vreinterpret_p8_u8 (overflow));
+  uint8x8_t reduced  = veor_u8 (vreinterpret_u8_p8 (product),
+                                vreinterpret_u8_p8 (fold));
+
+  /* All lanes are identical; lane 0 is widened to the return type. */
+  return (gf_val_32_t) vget_lane_u8 (reduced, 0);
+}
+
+/*
+ * One folding step for eight carry-less products: the bits above bit 3 of
+ * each lane are multiplied by prim_poly and XORed back into the lane.
+ */
+static inline poly8x8_t
+neon_w4_clm_fold (poly8x8_t product, poly8x8_t prim_poly)
+{
+  uint8x8_t overflow   = vshr_n_u8 (vreinterpret_u8_p8 (product), 4);
+  poly8x8_t correction = vmul_p8 (prim_poly, vreinterpret_p8_u8 (overflow));
+
+  return vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8 (product),
+                                      vreinterpret_u8_p8 (correction)));
+}
+
+/*
+ * Multiply a region of packed 4-bit elements by val with the NEON
+ * polynomial multiply, 8 bytes (16 elements) per iteration.
+ *
+ * Each source byte is split into its low and high nibble, both are
+ * multiplied by val and folded once with the low five bits of the
+ * primitive polynomial, and the two results are packed back into one byte.
+ * The loop runs while d8 is below d_end.  With a non-zero xor the result
+ * is XORed into the bytes already at d8; otherwise d8 is overwritten.
+ */
+static inline void
+neon_clm_multiply_region_from_single (gf_t *gf, uint8_t *s8, uint8_t *d8,
+                                      gf_val_32_t val, uint8_t *d_end, int xor)
+{
+  gf_internal_t *scratch = gf->scratch;
+  const poly8x8_t factor     = vdup_n_p8 (val);
+  const uint8x8_t low_nibble = vdup_n_u8 (0xf);
+  const poly8x8_t poly       = vdup_n_p8 ((uint8_t)(scratch->prim_poly & 0x1fULL));
+
+  for (; d8 < d_end; d8 += 8, s8 += 8) {
+    uint8x8_t packed = vld1_u8 (s8);
+    poly8x8_t lo     = vreinterpret_p8_u8 (vand_u8 (packed, low_nibble));
+    poly8x8_t hi     = vreinterpret_p8_u8 (vshr_n_u8 (packed, 4));
+    uint8x8_t out;
+
+    lo = neon_w4_clm_fold (vmul_p8 (factor, lo), poly);
+    hi = neon_w4_clm_fold (vmul_p8 (factor, hi), poly);
+
+    /* Re-pack: low-nibble product in bits 0-3, high-nibble product in bits 4-7. */
+    out = veor_u8 (vreinterpret_u8_p8 (lo),
+                   vshl_n_u8 (vreinterpret_u8_p8 (hi), 4));
+
+    if (xor) {
+      out = veor_u8 (vld1_u8 (d8), out);
+    }
+
+    vst1_u8 (d8, out);
+  }
+}
+
+
+/*
+ * multiply_region implementation for w=4 based on the NEON polynomial
+ * multiply.
+ *
+ * val == 0 and val == 1 are delegated to gf_multby_zero / gf_multby_one.
+ * Otherwise the region is described with gf_set_region_data (last argument
+ * 16) and the vector kernel runs between gf_do_initial_region_alignment and
+ * gf_do_final_region_alignment (defined elsewhere; presumably they process
+ * the unaligned head and tail of the region).
+ */
+static void
+gf_w4_neon_clm_multiply_region_from_single (gf_t *gf, void *src, void *dest,
+                                            gf_val_32_t val, int bytes, int xor)
+{
+  gf_region_data rd;
+  uint8_t *in, *out, *out_end;
+
+  if (val == 0) {
+    gf_multby_zero(dest, bytes, xor);
+    return;
+  }
+  if (val == 1) {
+    gf_multby_one(src, dest, bytes, xor);
+    return;
+  }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  in      = (uint8_t *) rd.s_start;
+  out     = (uint8_t *) rd.d_start;
+  out_end = (uint8_t *) rd.d_top;
+
+  /* The kernel receives a literal 0 or 1 for xor rather than the variable. */
+  if (xor) {
+    neon_clm_multiply_region_from_single (gf, in, out, val, out_end, 1);
+  } else {
+    neon_clm_multiply_region_from_single (gf, in, out, val, out_end, 0);
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+/*
+ * Builds without ARCH_AARCH64 get vqtbl1q_u8() as a macro: the 16 index
+ * bytes in `v` are split into their low and high 8-byte halves, each half is
+ * looked up with vtbl2_u8(), and the two results are recombined.  In this
+ * configuration `tbl` is a uint8x8x2_t holding the two halves of one
+ * 16-entry table.  Both arguments are expanded twice, so pass expressions
+ * without side effects.
+ */
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
+                                       vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
+/*
+ * Multiply a region of packed 4-bit elements by val using one 16-entry
+ * lookup table, 16 bytes (32 elements) per iteration.
+ *
+ * The table row is read from the mult array of the gf's private
+ * gf_single_table_data at byte offset (val << GF_FIELD_WIDTH).  That row is
+ * used as-is for the low nibble of every byte, and a copy shifted left by
+ * four bits is used for the high nibble, so the two lookups can simply be
+ * XORed together to form the output byte.
+ *
+ * The loop runs while dst is below d_end.  With a non-zero xor the result
+ * is XORed into the bytes already at dst; otherwise dst is overwritten.
+ */
+static inline void
+w4_single_table_multiply_region_neon(gf_t *gf, uint8_t *src, uint8_t *dst,
+                                     uint8_t * d_end, gf_val_32_t val, int xor)
+{
+  gf_internal_t *scratch = (gf_internal_t *) (gf->scratch);
+  struct gf_single_table_data *single =
+      (struct gf_single_table_data *) scratch->private;
+  uint8_t *row = (uint8_t *) single->mult + (val << GF_FIELD_WIDTH);
+  uint8x16_t nibble_mask = vdupq_n_u8(0xf);
+#ifdef ARCH_AARCH64
+  uint8x16_t lut_lo = vld1q_u8 (row);
+  uint8x16_t lut_hi = vshlq_n_u8 (lut_lo, 4);
+#else
+  uint8x8x2_t lut_lo, lut_hi;
+
+  lut_lo.val[0] = vld1_u8 (row);
+  lut_lo.val[1] = vld1_u8 (row + 8);
+  lut_hi.val[0] = vshl_n_u8 (lut_lo.val[0], 4);
+  lut_hi.val[1] = vshl_n_u8 (lut_lo.val[1], 4);
+#endif
+
+  for (; dst < d_end; dst += 16, src += 16) {
+    uint8x16_t packed = vld1q_u8 (src);
+    uint8x16_t hi_idx = vshrq_n_u8 (packed, 4);
+    uint8x16_t lo_idx = vandq_u8 (packed, nibble_mask);
+    uint8x16_t out    = veorq_u8 (vqtbl1q_u8 (lut_hi, hi_idx),
+                                  vqtbl1q_u8 (lut_lo, lo_idx));
+
+    if (xor) {
+      out = veorq_u8 (vld1q_u8 (dst), out);
+    }
+
+    vst1q_u8 (dst, out);
+  }
+}
+
+/*
+ * multiply_region implementation for w=4 based on the single lookup table.
+ *
+ * val == 0 and val == 1 are delegated to gf_multby_zero / gf_multby_one.
+ * Otherwise the region is described with gf_set_region_data (last argument
+ * 16, matching the 16 bytes the kernel consumes per iteration) and the
+ * vector kernel runs between gf_do_initial_region_alignment and
+ * gf_do_final_region_alignment (defined elsewhere; presumably they process
+ * the unaligned head and tail of the region).
+ */
+static void
+gf_w4_single_table_multiply_region_neon(gf_t *gf, void *src, void *dest,
+                                        gf_val_32_t val, int bytes, int xor)
+{
+  gf_region_data rd;
+  uint8_t *in, *out, *out_end;
+
+  if (val == 0) {
+    gf_multby_zero(dest, bytes, xor);
+    return;
+  }
+  if (val == 1) {
+    gf_multby_one(src, dest, bytes, xor);
+    return;
+  }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  in      = rd.s_start;
+  out     = rd.d_start;
+  out_end = rd.d_top;
+
+  /* The kernel receives a literal 0 or 1 for xor rather than the variable. */
+  if (xor) {
+    w4_single_table_multiply_region_neon(gf, in, out, out_end, val, 1);
+  } else {
+    w4_single_table_multiply_region_neon(gf, in, out, out_end, val, 0);
+  }
+
+  gf_do_final_region_alignment(&rd);
+}
+
+
+/*
+ * Install the NEON polynomial-multiply implementations for w=4: both the
+ * single-element multiply and the region multiply.  Always returns 1.
+ */
+int gf_w4_neon_cfm_init(gf_t *gf)
+{
+  /* Original author's note: the single-element clm multiply is probably
+   * pointless; it is installed alongside the region version regardless. */
+  SET_FUNCTION(gf,multiply,w32,gf_w4_neon_clm_multiply)
+  SET_FUNCTION(gf,multiply_region,w32,gf_w4_neon_clm_multiply_region_from_single)
+
+  return 1;
+}
+
+/*
+ * Install the NEON single-table region multiply for w=4.  Only
+ * multiply_region is set here; the other function pointers of gf are left
+ * as they are.
+ */
+void
+gf_w4_neon_single_table_init(gf_t *gf)
+{
+  SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region_neon)
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/neon/gf_w64_neon.c b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w64_neon.c
new file mode 100644
index 000000000..24098232e
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w64_neon.c
@@ -0,0 +1,333 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j@jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * gf_w64_neon.c
+ *
+ * Neon routines for 64-bit Galois fields
+ *
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "gf_w64.h"
+
+/*
+ * Builds without ARCH_AARCH64 get vqtbl1q_u8() as a macro: the 16 index
+ * bytes in `v` are split into their low and high 8-byte halves, each half is
+ * looked up with vtbl2_u8(), and the two results are recombined.  In this
+ * configuration `tbl` is a uint8x8x2_t holding the two halves of one
+ * 16-entry table.  Both arguments are expanded twice, so pass expressions
+ * without side effects.
+ */
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
+                                       vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+/*
+ * ALTMAP region multiply for w=64 using 4-bit split tables.
+ *
+ * The 16 tables (one per nibble position, 16 entries each) are read from
+ * the gf's private gf_split_4_64_lazy_data and sliced into byte planes:
+ * lane k of tables[i][j] is byte j of ld->tables[i][k].  The slicing shifts
+ * ld->tables in place, so the entries are consumed by this call
+ * (NOTE(review): presumably the caller rebuilds them before every call --
+ * confirm).
+ *
+ * Each outer iteration handles 128 bytes while dst is below d_end: the
+ * k-th 16-byte vector of src is split into low and high nibbles, which
+ * index tables 2k and 2k+1, and p[j] accumulates the j-th 16-byte vector
+ * of the result.  With a non-zero xor the accumulators start from the
+ * current contents of dst, otherwise from zero.  val is not referenced in
+ * this function.
+ */
+static
+inline
+void
+neon_w64_split_4_lazy_altmap_multiply_region(gf_t *gf, uint64_t *src,
+                                             uint64_t *dst, uint64_t *d_end,
+                                             uint64_t val, int xor)
+{
+  unsigned i, j, k;
+  uint8_t btable[16];   /* staging buffer for one 16-entry byte table */
+#ifdef ARCH_AARCH64
+  uint8x16_t tables[16][8];
+#else
+  uint8x8x2_t tables[16][8];
+#endif
+  uint8x16_t p[8], mask1, si;
+
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private;
+  /* Peel off one byte of every table entry per j and keep it as a vector. */
+  for (i = 0; i < 16; i++) {
+    for (j = 0; j < 8; j++) {
+      for (k = 0; k < 16; k++) {
+        btable[k] = (uint8_t) ld->tables[i][k];
+        ld->tables[i][k] >>= 8;   /* expose the next byte for the next j */
+      }
+#ifdef ARCH_AARCH64
+      tables[i][j] = vld1q_u8(btable);
+#else
+      tables[i][j].val[0] = vld1_u8(btable);
+      tables[i][j].val[1] = vld1_u8(btable + 8);
+#endif
+    }
+  }
+  /* 0x0f in every lane: keeps the low nibble of each byte. */
+  mask1 = vdupq_n_u8(0xf);
+
+  while (dst < d_end) {
+    /* Start the eight accumulators from dst (xor) or from zero. */
+    if (xor) {
+      for (i = 0; i < 8; i++)
+        p[i] = vld1q_u8((uint8_t *) (dst + i * 2));
+    } else {
+      for (i = 0; i < 8; i++)
+        p[i] = vdupq_n_u8(0);
+    }
+    /* i counts nibble positions: two per input vector (low, then high). */
+    i = 0;
+    for (k = 0; k < 8; k++) {
+      uint8x16_t v0 = vld1q_u8((uint8_t *) src);
+      src += 2;   /* two uint64_t = the 16 bytes just loaded */
+
+      si = vandq_u8(v0, mask1);
+      for (j = 0; j < 8; j++) {
+        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
+      }
+      i++;
+      si = vshrq_n_u8(v0, 4);
+      for (j = 0; j < 8; j++) {
+        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
+      }
+      i++;
+
+    }
+    for (i = 0; i < 8; i++) {
+      vst1q_u8((uint8_t *) dst, p[i]);
+      dst += 2;   /* advance 16 bytes per stored vector */
+    }
+  }
+}
+
+static  /* NEON kernel for the w=64 SPLIT 4,64 lazy region multiply; called for the non-altmap case. */
+inline  /* Handles 16 words (128 bytes) per pass until dst reaches d_end; val is not read in this body. */
+void
+neon_w64_split_4_lazy_multiply_region(gf_t *gf, uint64_t *src, uint64_t *dst,
+                                      uint64_t *d_end, uint64_t val, int xor)
+{
+  unsigned i, j, k;
+  uint8_t btable[16];  /* staging buffer: one byte from each of a table's 16 entries */
+#ifdef ARCH_AARCH64
+  uint8x16_t tables[16][8];  /* [nibble table i][product byte j] -> 16-byte lookup vector */
+#else
+  uint8x8x2_t tables[16][8];  /* same tables, each held as two 8-byte halves */
+#endif
+  uint8x16_t p[8], mask1, si;  /* p[j] accumulates byte j of the 16 products */
+  uint64x2_t st[8];  /* the 16 source words on load, the 16 result words before the store */
+  uint32x4x2_t s32[4];
+  uint16x8x2_t s16[4];
+  uint8x16x2_t s8[4];
+  /* s32/s16/s8 are scratch for the de-interleave (vuzp) and re-interleave (vzip) cascades below. */
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private;
+  /* Slice the 64-bit table entries into bytes: tables[i][j] = byte j of ld->tables[i][0..15]. */
+  for (i = 0; i < 16; i++) {
+    for (j = 0; j < 8; j++) {
+      for (k = 0; k < 16; k++) {
+        btable[k] = (uint8_t) ld->tables[i][k];
+        ld->tables[i][k] >>= 8;  /* consumes the entry: after the 8 rounds of j it is 0 */
+      }
+#ifdef ARCH_AARCH64
+      tables[i][j] = vld1q_u8(btable);
+#else
+      tables[i][j].val[0] = vld1_u8(btable);
+      tables[i][j].val[1] = vld1_u8(btable + 8);
+#endif
+    }
+  }
+  /* mask1 keeps the low nibble of every byte. */
+  mask1 = vdupq_n_u8(0xf);
+
+  while (dst < d_end) {
+    /* Load 8 vectors of two 64-bit words each and clear the accumulators. */
+    for (k = 0; k < 8; k++) {
+      st[k]  = vld1q_u64(src);
+      src += 2;
+      p[k] = vdupq_n_u8(0);
+    }
+    /* De-interleave 32-bit halves: val[0] = low halves, val[1] = high halves of four words. */
+    s32[0] = vuzpq_u32(vreinterpretq_u32_u64(st[0]),
+                       vreinterpretq_u32_u64(st[1]));
+    s32[1] = vuzpq_u32(vreinterpretq_u32_u64(st[2]),
+                       vreinterpretq_u32_u64(st[3]));
+    s32[2] = vuzpq_u32(vreinterpretq_u32_u64(st[4]),
+                       vreinterpretq_u32_u64(st[5]));
+    s32[3] = vuzpq_u32(vreinterpretq_u32_u64(st[6]),
+                       vreinterpretq_u32_u64(st[7]));
+    /* De-interleave again at 16-bit granularity. */
+    s16[0] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[0]),
+                       vreinterpretq_u16_u32(s32[1].val[0]));
+    s16[1] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[0]),
+                       vreinterpretq_u16_u32(s32[3].val[0]));
+    s16[2] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[1]),
+                       vreinterpretq_u16_u32(s32[1].val[1]));
+    s16[3] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[1]),
+                       vreinterpretq_u16_u32(s32[3].val[1]));
+    /* Final 8-bit step: s8[k >> 1].val[k & 1] now holds byte k of all 16 source words. */
+    s8[0]  = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[0]),
+                      vreinterpretq_u8_u16(s16[1].val[0]));
+    s8[1]  = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[1]),
+                      vreinterpretq_u8_u16(s16[1].val[1]));
+    s8[2]  = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[0]),
+                      vreinterpretq_u8_u16(s16[3].val[0]));
+    s8[3]  = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[1]),
+                      vreinterpretq_u8_u16(s16[3].val[1]));
+    /* Source byte k: its low nibble indexes tables[2k], its high nibble tables[2k+1]. */
+    i = 0;
+    for (k = 0; k < 8; k++) {
+      si = vandq_u8(s8[k >> 1].val[k & 1], mask1);  /* low nibbles */
+      for (j = 0; j < 8; j++) {
+        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
+      }
+      i++;
+      si = vshrq_n_u8(s8[k >> 1].val[k & 1], 4);  /* high nibbles */
+      for (j = 0; j < 8; j++) {
+        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
+      }
+      i++;
+    }
+    /* Re-interleave the byte planes p[0..7] back into 64-bit words (inverse of the vuzp cascade). */
+    s8[0]  = vzipq_u8(p[0], p[1]);
+    s8[1]  = vzipq_u8(p[2], p[3]);
+    s8[2]  = vzipq_u8(p[4], p[5]);
+    s8[3]  = vzipq_u8(p[6], p[7]);
+
+    s16[0] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[0]),
+                       vreinterpretq_u16_u8(s8[1].val[0]));
+    s16[1] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[0]),
+                       vreinterpretq_u16_u8(s8[3].val[0]));
+    s16[2] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[1]),
+                       vreinterpretq_u16_u8(s8[1].val[1]));
+    s16[3] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[1]),
+                       vreinterpretq_u16_u8(s8[3].val[1]));
+
+    s32[0] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[0]),
+                       vreinterpretq_u32_u16(s16[1].val[0]));
+    s32[1] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[1]),
+                       vreinterpretq_u32_u16(s16[1].val[1]));
+    s32[2] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[0]),
+                       vreinterpretq_u32_u16(s16[3].val[0]));
+    s32[3] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[1]),
+                       vreinterpretq_u32_u16(s16[3].val[1]));
+    /* st[k] receives result words 2k and 2k+1, i.e. the original word order. */
+    for (k = 0; k < 8; k ++) {
+        st[k] = vreinterpretq_u64_u32(s32[k >> 1].val[k & 1]);
+    }
+    /* Store 128 bytes: XOR into the existing destination when xor is set, else overwrite. */
+    if (xor) {
+      for (i = 0; i < 8; i++) {
+        uint64x2_t t1 = vld1q_u64(dst);
+        vst1q_u64(dst, veorq_u64(st[i], t1));
+        dst += 2;
+      }
+    } else {
+      for (i = 0; i < 8; i++) {
+        vst1q_u64(dst, st[i]);
+        dst += 2;
+      }
+    }
+
+  }
+}
+
+/* NEON region multiply for w=64 SPLIT 4,64 with lazily built tables: the
+ * region helpers handle the unaligned head/tail (128-byte alignment is
+ * requested) and the aligned middle goes to the altmap or standard kernel. */
+static void
+gf_w64_neon_split_4_lazy_multiply_region(gf_t *gf, void *src, void *dest,
+                                         uint64_t val, int bytes, int xor,
+                                         int altmap)
+{
+  gf_internal_t *internal;
+  struct gf_split_4_64_lazy_data *lazy;
+  uint64_t poly, product, *in, *out, *out_end;
+  gf_region_data rd;
+  int nib, bit, idx;
+
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
+  gf_do_initial_region_alignment(&rd);
+  in      = (uint64_t *) rd.s_start;
+  out     = (uint64_t *) rd.d_start;
+  out_end = (uint64_t *) rd.d_top;
+
+  internal = (gf_internal_t *) gf->scratch;
+  lazy     = (struct gf_split_4_64_lazy_data *) internal->private;
+  poly     = internal->prim_poly;
+
+  /* Build the 16 x 16 tables; product is shifted (and reduced by poly) once per table bit. */
+  product = val;
+  for (nib = 0; nib < 16; nib++) {
+    lazy->tables[nib][0] = 0;
+    for (bit = 1; bit < 16; bit <<= 1) {
+      for (idx = 0; idx < bit; idx++)
+        lazy->tables[nib][idx ^ bit] = product ^ lazy->tables[nib][idx];
+      if (product & GF_FIRST_BIT) product = (product << 1) ^ poly;
+      else                        product <<= 1;
+    }
+  }
+
+  /* Each call site passes xor as a literal constant to the inline kernel. */
+  if (altmap && xor)
+    neon_w64_split_4_lazy_altmap_multiply_region(gf, in, out, out_end, val, 1);
+  else if (altmap)
+    neon_w64_split_4_lazy_altmap_multiply_region(gf, in, out, out_end, val, 0);
+  else if (xor)
+    neon_w64_split_4_lazy_multiply_region(gf, in, out, out_end, val, 1);
+  else
+    neon_w64_split_4_lazy_multiply_region(gf, in, out, out_end, val, 0);
+
+  gf_do_final_region_alignment(&rd);
+}
+
+/* multiply_region entry point for the standard layout: forwards with altmap = 0. */
+static void
+gf_w64_split_4_64_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
+                                            uint64_t val, int bytes, int xor)
+{
+  gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
+}
+
+/* multiply_region entry point for the ALTMAP layout: forwards with altmap = 1. */
+static void
+gf_w64_split_4_64_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
+                                                   void *dest, uint64_t val,
+                                                   int bytes, int xor)
+{
+  gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
+}
+
+/* Install the NEON SPLIT 4,64 region multiplier that matches the field's region_type. */
+void gf_w64_neon_split_init(gf_t *gf)
+{
+  gf_internal_t *internal = (gf_internal_t *) gf->scratch;
+  if ((internal->region_type & GF_REGION_ALTMAP) == 0) {
+    SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region_neon)
+  } else {
+    SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_altmap_multiply_region_neon)
+  }
+}
diff --git a/src/erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c
new file mode 100644
index 000000000..0cce5ba7e
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c
@@ -0,0 +1,302 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j@jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * gf_w8_neon.c
+ *
+ * Neon optimized routines for 8-bit Galois fields
+ *
+ */
+
+#include "gf_int.h"
+#include "gf_w8.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+/* ARM NEON reduction step for the carry-free (polynomial) multiplication.
+ *   vshrn_n_u16(result, 8) takes the high byte of each 16-bit lane of result;
+ *   it is stored in v when initial is non-zero and XORed into v otherwise.
+ *   v is then multiplied by prim_poly with vmull_p8 (carry-less multiply)
+ *   into w, and w is XORed back into result. */
+#define NEON_CFM_REDUCE(v, w, result, prim_poly, initial)               \
+  do {								        \
+    if (initial)                                                        \
+      v = vshrn_n_u16 (vreinterpretq_u16_p16(result), 8);               \
+    else                                                                \
+      v = veor_u8 (v, vshrn_n_u16 (vreinterpretq_u16_p16(result), 8));  \
+    w = vmull_p8 (prim_poly, vreinterpret_p8_u8(v));                    \
+    result = vreinterpretq_p16_u16 (veorq_u16 (vreinterpretq_u16_p16(result), vreinterpretq_u16_p16(w))); \
+  } while (0)
+
+/* Multiply a8 by b8 in GF(2^8) with the NEON polynomial multiply.
+ * x selects the number of reduction passes (2, 3 or 4); the caller picks it
+ * from the primitive polynomial. Returns lane 0 of the reduced product. */
+static inline gf_val_32_t
+gf_w8_neon_clm_multiply_x (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8, int x)
+{
+  gf_internal_t  *h = gf->scratch;
+  poly8x8_t       a, b, prim_poly;
+  poly16x8_t      result, w;
+  uint8x8_t       v;
+
+  /* vdup_n_p8 takes an 8-bit value, so narrow explicitly instead of relying
+     on implicit truncation of the 32-bit operands. */
+  a = vdup_n_p8 ((uint8_t) a8);
+  b = vdup_n_p8 ((uint8_t) b8);
+
+  /* Only 8 bits of the polynomial fit in a lane, so mask to 0xff exactly as
+     neon_clm_multiply_region_from_single_x does; any wider mask is truncated. */
+  prim_poly = vdup_n_p8 ((uint8_t)(h->prim_poly & 0xffULL));
+
+  /* Initial carry-less multiply: every lane holds the same 16-bit product. */
+  result = vmull_p8 (a, b);
+
+  /* Fold the high byte of each product back down: two passes always, a
+     third and a fourth only when the caller asks for them through x. */
+  NEON_CFM_REDUCE (v, w, result, prim_poly, 1);
+  NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+  if (x >= 3) {
+    NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+  }
+  if (x >= 4) {
+    NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
+  }
+
+  /* Narrow the lanes to bytes and return lane 0 (an 8-bit value, widened). */
+  return (gf_val_32_t) vget_lane_u8 (vmovn_u16 (vreinterpretq_u16_p16 (result)), 0);
+}
+
+#define CLM_MULTIPLY(x) /* defines gf_w8_neon_clm_multiply_<x>, a wrapper passing x to the generic routine */ \
+static gf_val_32_t gf_w8_neon_clm_multiply_ ## x (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) \
+{\
+    return gf_w8_neon_clm_multiply_x (gf, a8, b8, x);\
+}
+/* One single-value multiplier per supported reduction-pass count. */
+CLM_MULTIPLY(2)
+CLM_MULTIPLY(3)
+CLM_MULTIPLY(4)
+
+/* Multiply the bytes at s8 by val in GF(2^8), eight at a time, writing to d8
+ * until d8 reaches d_end. If xor is non-zero the products are XORed into the
+ * existing destination bytes. x is the number of reduction passes (2 to 4). */
+static inline void
+neon_clm_multiply_region_from_single_x(gf_t *gf, uint8_t *s8, uint8_t *d8,
+                                       gf_val_32_t val, uint8_t *d_end,
+                                       int xor, int x)
+{
+  gf_internal_t  *internal = gf->scratch;
+  poly8x8_t       multiplier, poly, chunk;
+  poly16x8_t      product, fold;
+  uint8x8_t       previous, out;
+
+  multiplier = vdup_n_p8 (val);
+  poly       = vdup_n_p8 ((uint8_t)(internal->prim_poly & 0xffULL));
+
+  for (; d8 < d_end; d8 += 8, s8 += 8) {
+    chunk = vld1_p8 ((poly8_t *) s8);
+
+    if (xor)
+      previous = vld1_u8 (d8);
+
+    product = vmull_p8 (multiplier, chunk);
+
+    NEON_CFM_REDUCE (out, fold, product, poly, 1);
+    NEON_CFM_REDUCE (out, fold, product, poly, 0);
+    if (x >= 3) {
+      NEON_CFM_REDUCE (out, fold, product, poly, 0);
+    }
+    if (x >= 4) {
+      NEON_CFM_REDUCE (out, fold, product, poly, 0);
+    }
+
+    /* Keep the low byte of every 16-bit lane. */
+    out = vmovn_u16 (vreinterpretq_u16_p16 (product));
+    if (xor)
+      out = veor_u8 (previous, out);
+
+    vst1_u8 (d8, out);
+  }
+}
+
+#define CLM_MULT_REGION(x) /* defines the region multiplier using x passes */ \
+static void                                                             \
+gf_w8_neon_clm_multiply_region_from_single_ ## x (gf_t *gf, void *src,  \
+                                                  void *dest,           \
+                                                  gf_val_32_t val, int bytes, \
+                                                  int xor)              \
+{                                                                       \
+  gf_region_data rd;                                                    \
+  uint8_t *s8;                                                          \
+  uint8_t *d8;                                                          \
+  /* val 0 and 1 go straight to the generic helpers */                  \
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }           \
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }       \
+  /* 16 is passed as the last argument of gf_set_region_data */         \
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);          \
+  gf_do_initial_region_alignment(&rd);                                  \
+  s8 = (uint8_t *) rd.s_start;                                          \
+  d8 = (uint8_t *) rd.d_start;                                          \
+  /* xor is passed as a literal at each call of the inline kernel */    \
+  if (xor)                                                              \
+    neon_clm_multiply_region_from_single_x (gf, s8, d8, val, rd.d_top, 1, x); \
+  else                                                                  \
+    neon_clm_multiply_region_from_single_x (gf, s8, d8, val, rd.d_top, 0, x);\
+  gf_do_final_region_alignment(&rd);                                    \
+}
+/* One region multiplier per supported reduction-pass count. */
+CLM_MULT_REGION(2)
+CLM_MULT_REGION(3)
+CLM_MULT_REGION(4)
+
+
+/* Install the NEON carry-less multiply routines; returns 0 (nothing set) if bit 7 of prim_poly is set. */
+int gf_w8_neon_cfm_init(gf_t *gf)
+{
+  gf_internal_t *internal = (gf_internal_t *) gf->scratch;
+
+  if (internal->prim_poly & 0x80) {
+    return 0;
+  }
+  if (internal->prim_poly & 0x40) {
+    SET_FUNCTION(gf,multiply,w32,gf_w8_neon_clm_multiply_4)
+    SET_FUNCTION(gf,multiply_region,w32,gf_w8_neon_clm_multiply_region_from_single_4)
+  } else if (internal->prim_poly & 0x20) {
+    SET_FUNCTION(gf,multiply,w32,gf_w8_neon_clm_multiply_3)
+    SET_FUNCTION(gf,multiply_region,w32,gf_w8_neon_clm_multiply_region_from_single_3)
+  } else {
+    SET_FUNCTION(gf,multiply,w32,gf_w8_neon_clm_multiply_2)
+    SET_FUNCTION(gf,multiply_region,w32,gf_w8_neon_clm_multiply_region_from_single_2)
+  }
+  return 1;
+}
+
+#ifndef ARCH_AARCH64 /* fallback 16-byte table lookup from two vtbl2_u8 calls; tbl is a uint8x8x2_t and v is evaluated twice */
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
+                                       vtbl2_u8(tbl, vget_high_u8(v)))
+#endif /* !ARCH_AARCH64 */
+
+/* Region multiply for w=8 using the half (split) tables and NEON lookups.
+ * Every source byte is split into its high and low nibble; each nibble
+ * selects one byte from a 16-byte table row belonging to val, and the two
+ * selected bytes are XORed to give the product. Sixteen bytes are handled
+ * per iteration. With xor set the products are XORed into dest, otherwise
+ * they overwrite it. val 0 and 1 are delegated to the generic helpers, and
+ * the region helpers are called before and after the vector loop. */
+static
+void
+gf_w8_split_multiply_region_neon(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  struct gf_w8_half_table_data *half;
+  gf_region_data rd;
+  uint8_t *row_hi, *row_lo;
+  uint8_t *in, *in_end, *out;
+  uint8x16_t data, prev, hi, lo, nibble_mask;
+#ifdef ARCH_AARCH64
+  uint8x16_t tbl_hi, tbl_lo;
+#else
+  uint8x8x2_t tbl_hi, tbl_lo;
+#endif
+
+  /* Trivial multipliers need no table lookups. */
+  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
+  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
+
+  half = (struct gf_w8_half_table_data *) ((gf_internal_t *) (gf->scratch))->private;
+
+  /* Describe the region (16 is the last argument) and run the initial helper. */
+  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
+  gf_do_initial_region_alignment(&rd);
+
+  /* Row val of each table starts 16 * val bytes into it. */
+  row_hi = (uint8_t *) half->high + (val << 4);
+  row_lo = (uint8_t *) half->low + (val << 4);
+
+  in     = rd.s_start;
+  out    = rd.d_start;
+  in_end = (uint8_t *) rd.s_top;
+
+  /* Load both rows once; non-AArch64 builds keep each as two 8-byte halves. */
+#ifdef ARCH_AARCH64
+  tbl_hi = vld1q_u8 (row_hi);
+  tbl_lo = vld1q_u8 (row_lo);
+#else
+  tbl_hi.val[0] = vld1_u8 (row_hi);
+  tbl_lo.val[0] = vld1_u8 (row_lo);
+  tbl_hi.val[1] = vld1_u8 (row_hi + 8);
+  tbl_lo.val[1] = vld1_u8 (row_lo + 8);
+#endif
+
+  /* Mask selecting the low nibble of every byte. */
+  nibble_mask = vdupq_n_u8(0xf);
+
+  if (xor) {
+    for (; in < in_end; in += 16, out += 16) {
+      data = vld1q_u8 (in);
+
+      hi = vshrq_n_u8 (data, 4);
+      lo = vandq_u8 (data, nibble_mask);
+      prev = vld1q_u8 (out);
+
+      hi = vqtbl1q_u8 (tbl_hi, hi);
+      lo = vqtbl1q_u8 (tbl_lo, lo);
+
+      /* dest ^= product */
+      vst1q_u8 (out, veorq_u8 (prev, veorq_u8 (hi, lo)));
+    }
+  } else {
+    for (; in < in_end; in += 16, out += 16) {
+      data = vld1q_u8 (in);
+
+      hi = vshrq_n_u8 (data, 4);
+      lo = vandq_u8 (data, nibble_mask);
+
+      /* On non-AArch64 builds vqtbl1q_u8 is this file's vtbl2_u8-based macro. */
+      hi = vqtbl1q_u8 (tbl_hi, hi);
+      lo = vqtbl1q_u8 (tbl_lo, lo);
+
+      /* dest = product */
+      vst1q_u8 (out, veorq_u8 (hi, lo));
+    }
+  }
+
+  /* Run the final region helper for whatever follows the vector loop. */
+  gf_do_final_region_alignment(&rd);
+}
+
+
+void gf_w8_neon_split_init(gf_t *gf)  /* selects the NEON split-table routine for region multiplication */
+{
+  SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_neon) /* sets gf's multiply_region (w32 slot) */
+}
diff --git a/src/erasure-code/jerasure/gf-complete/test/Makefile.am b/src/erasure-code/jerasure/gf-complete/test/Makefile.am
new file mode 100644
index 000000000..f590ecca0
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/test/Makefile.am
@@ -0,0 +1,11 @@
+# GF-Complete 'test' AM file
+# Headers are searched in include/ under both the build tree and the source tree.
+AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
+AM_CFLAGS = -O3 -fPIC
+
+bin_PROGRAMS = gf_unit 
+# gf_unit is built from gf_unit.c and linked against the in-tree libgf_complete.la.
+gf_unit_SOURCES = gf_unit.c
+#gf_unit_LDFLAGS = -lgf_complete
+gf_unit_LDADD = ../src/libgf_complete.la
+
diff --git a/src/erasure-code/jerasure/gf-complete/test/gf_unit.c b/src/erasure-code/jerasure/gf-complete/test/gf_unit.c
new file mode 100644
index 000000000..db26849db
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/test/gf_unit.c
@@ -0,0 +1,458 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_unit.c
+ *
+ * Performs unit testing for gf arithmetic
+ */
+
+#include "config.h"
+
+#ifdef HAVE_POSIX_MEMALIGN
+#ifndef _XOPEN_SOURCE
+#define _XOPEN_SOURCE 600
+#endif
+#endif
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+#include <signal.h>
+
+#include "gf_complete.h"
+#include "gf_int.h"
+#include "gf_method.h"
+#include "gf_rand.h"
+#include "gf_general.h"
+
+#define REGION_SIZE (16384)  /* size in bytes of each test buffer (ra, rb, rc, rd) */
+#define RMASK (0x00000000ffffffffLL)  /* keeps the low 32 bits of a 64-bit value */
+#define LMASK (0xffffffff00000000LL)  /* keeps the high 32 bits of a 64-bit value */
+
+void problem(char *s)  /* report a failed unit test with message s on stderr, then exit(1) */
+{
+  fputs("Unit test failed.\n", stderr);
+  fprintf(stderr, "%s\n", s);
+  exit(1);
+}
+
+char *BM = "Bad Method: ";
+/* Print the help text and an optional reason on stderr, then exit(1); s == BM (by address) defers to gf_error(). */
+void usage(char *s)
+{
+  fputs("usage: gf_unit w tests seed [method] - does unit testing in GF(2^w)\n"
+        "\n"
+        "Legal w are: 1 - 32, 64 and 128\n"
+        "           128 is hex only (i.e. '128' will be an error - do '128h')\n"
+        "\n"
+        "Tests may be any combination of:\n"
+        "       A: All\n"
+        "       S: Single operations (multiplication/division)\n"
+        "       R: Region operations\n"
+        "       V: Verbose Output\n"
+        "\n"
+        "Use -1 for time(0) as a seed.\n"
+        "\n", stderr);
+  if (s != NULL && s != BM) {
+    fprintf(stderr, "%s\n", s);
+  } else if (s == BM) {
+    fprintf(stderr, "%s", BM);
+    gf_error();
+  }
+  exit(1);
+}
+
+void SigHandler(int v)  /* installed for SIGSEGV by main(); v (the signal number) is unused */
+{
+  fputs("Problem: SegFault!\n", stderr);
+  fflush(stdout);
+  exit(2);
+}
+
+/* gf_unit w tests seed [method]
+ *
+ * Builds the requested field (gf) and the default field for the same w
+ * (gf_def), then runs the single-operation and/or region tests selected by
+ * the tests argument. Returns 0; failures detected here exit with status 1. */
+int main(int argc, char **argv)
+{
+  signal(SIGSEGV, SigHandler);
+
+  int w, i, verbose, single, region, top;
+  int s_start, d_start, bytes, xor, alignment_test;
+  gf_t   gf, gf_def;
+  time_t t0;
+  long seed;   /* seed as parsed from argv[3]; time_t need not be a long */
+  gf_internal_t *h;
+  gf_general_t *a, *b, *c, *d;
+  uint8_t a8, b8, c8, *mult4 = NULL, *mult8 = NULL;
+  uint16_t a16, b16, c16, *log16 = NULL, *alog16 = NULL;
+  char as[50], bs[50], cs[50], ds[50];
+  uint32_t mask = 0;
+  char *ra, *rb, *rc, *rd, *target;
+  int align;
+#ifndef HAVE_POSIX_MEMALIGN
+  char *malloc_ra, *malloc_rb, *malloc_rc, *malloc_rd;
+#endif
+
+  if (argc < 4) usage(NULL);
+
+  if (sscanf(argv[1], "%d", &w) != 1) usage("Bad w\n");
+
+  if (sscanf(argv[3], "%ld", &seed) != 1) usage("Bad seed\n");
+  t0 = (seed == -1) ? time(0) : (time_t) seed;
+  MOA_Seed(t0);
+
+  if (w < 1 || (w > 32 && w != 64 && w != 128)) usage("Bad w");
+
+  if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage(BM);
+
+  printf("Args: ");
+  for (i = 1; i < argc; i++) {
+    printf ("%s ", argv[i]);
+  }
+  printf("/ size (bytes): %d\n", gf_size(&gf));
+
+  for (i = 0; argv[2][i] != '\0'; i++) {
+    if (strchr("ASRV", argv[2][i]) == NULL) usage("Bad test\n");
+  }
+
+  h = (gf_internal_t *) gf.scratch;
+  a = (gf_general_t *) malloc(sizeof(gf_general_t));
+  b = (gf_general_t *) malloc(sizeof(gf_general_t));
+  c = (gf_general_t *) malloc(sizeof(gf_general_t));
+  d = (gf_general_t *) malloc(sizeof(gf_general_t));
+
+#ifdef HAVE_POSIX_MEMALIGN
+  if (posix_memalign((void **) &ra, 16, sizeof(char)*REGION_SIZE))
+    ra = NULL;
+  if (posix_memalign((void **) &rb, 16, sizeof(char)*REGION_SIZE))
+    rb = NULL;
+  if (posix_memalign((void **) &rc, 16, sizeof(char)*REGION_SIZE))
+    rc = NULL;
+  if (posix_memalign((void **) &rd, 16, sizeof(char)*REGION_SIZE))
+    rd = NULL;
+#else
+  //15 bytes extra to make sure it's 16byte aligned
+  malloc_ra = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+  malloc_rb = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+  malloc_rc = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+  malloc_rd = (char *) malloc(sizeof(char)*REGION_SIZE+15);
+  ra = (char *) (((uintptr_t) malloc_ra + 15) & ~((uintptr_t) 0xf));
+  rb = (char *) (((uintptr_t) malloc_rb + 15) & ~((uintptr_t) 0xf));
+  rc = (char *) (((uintptr_t) malloc_rc + 15) & ~((uintptr_t) 0xf));
+  rd = (char *) (((uintptr_t) malloc_rd + 15) & ~((uintptr_t) 0xf));
+#endif
+  if (a == NULL || b == NULL || c == NULL || d == NULL ||
+      ra == NULL || rb == NULL || rc == NULL || rd == NULL)
+    problem("Out of memory");
+
+  if (w <= 32) {
+    mask = 0;
+    for (i = 0; i < w; i++) mask |= ((uint32_t) 1 << i);
+  }
+
+  verbose = (strchr(argv[2], 'V') != NULL);
+  single = (strchr(argv[2], 'S') != NULL || strchr(argv[2], 'A') != NULL);
+  region = (strchr(argv[2], 'R') != NULL || strchr(argv[2], 'A') != NULL);
+
+  if (!gf_init_hard(&gf_def, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
+      (h->mult_type != GF_MULT_COMPOSITE) ? h->prim_poly : 0, 0, 0, NULL, NULL))
+    problem("No default for this value of w");
+
+  if (w == 4) {
+    mult4 = gf_w4_get_mult_table(&gf);
+  } else if (w == 8) {
+    mult8 = gf_w8_get_mult_table(&gf);
+  } else if (w == 16) {
+    log16 = gf_w16_get_log_table(&gf);
+    alog16 = gf_w16_get_mult_alog_table(&gf);
+  }
+
+  if (verbose) printf("Seed: %ld\n", (long) t0);
+
+  if (single) {
+    if (gf.multiply.w32 == NULL) problem("No multiplication operation defined.");
+    if (verbose) { printf("Testing single multiplications/divisions.\n"); fflush(stdout); }
+    if (w <= 10) {
+      top = (1 << w)*(1 << w);
+    } else {
+      top = 1024*1024;
+    }
+    for (i = 0; i < top; i++) {
+      if (w <= 10) {
+        a->w32 = i % (1 << w);
+        b->w32 = (i >> w);
+
+      //Allen: the following conditions were being run 10 times each. That didn't seem like nearly enough to
+      //me for these special cases, so I converted to doing this mod stuff to easily make the number of times
+      //run both larger and proportional to the total size of the run.
+      } else {
+        switch (i % 32)
+        {
+          case 0: 
+            gf_general_set_zero(a, w);
+            gf_general_set_random(b, w, 1);
+            break;
+          case 1:
+            gf_general_set_random(a, w, 1);
+            gf_general_set_zero(b, w);
+            break;
+          case 2:
+            gf_general_set_one(a, w);
+            gf_general_set_random(b, w, 1);
+            break;
+          case 3:
+            gf_general_set_random(a, w, 1);
+            gf_general_set_one(b, w);
+            break;
+          default:
+            gf_general_set_random(a, w, 1);
+            gf_general_set_random(b, w, 1);
+        }
+      }
+
+      //Allen: the following special cases for w=64 are based on the code below for w=128.
+      //These w=64 cases are based on Dr. Plank's suggestion because some of the methods for w=64
+      //involve splitting it in two. I think they're less likely to give errors than the 128-bit case
+      //though, because the 128 bit case is always split in two.
+      //As with w=128, I'm arbitrarily deciding to do this sort of thing with a quarter of the cases
+      if (w == 64) {
+        switch (i % 32)
+        {
+          case 0: if (!gf_general_is_one(a, w)) a->w64 &= RMASK; break;
+          case 1: if (!gf_general_is_one(a, w)) a->w64 &= LMASK; break;
+          case 2: if (!gf_general_is_one(a, w)) a->w64 &= RMASK; if (!gf_general_is_one(b, w)) b->w64 &= RMASK; break;
+          case 3: if (!gf_general_is_one(a, w)) a->w64 &= RMASK; if (!gf_general_is_one(b, w)) b->w64 &= LMASK; break;
+          case 4: if (!gf_general_is_one(a, w)) a->w64 &= LMASK; if (!gf_general_is_one(b, w)) b->w64 &= RMASK; break;
+          case 5: if (!gf_general_is_one(a, w)) a->w64 &= LMASK; if (!gf_general_is_one(b, w)) b->w64 &= LMASK; break;
+          case 6: if (!gf_general_is_one(b, w)) b->w64 &= RMASK; break;
+          case 7: if (!gf_general_is_one(b, w)) b->w64 &= LMASK; break;
+        }
+      }
+
+      //Allen: for w=128, we have important special cases where one half or the other of the number is all
+      //zeros. The probability of hitting such a number randomly is 2^-64, so if we don't force these cases
+      //we'll probably never hit them. This could be implemented more efficiently by changing the set-random
+      //function for w=128, but I think this is easier to follow.
+      //I'm arbitrarily deciding to do this sort of thing with a quarter of the cases
+      if (w == 128) {
+        switch (i % 32)
+        {
+          case 0: if (!gf_general_is_one(a, w)) a->w128[0] = 0; break;
+          case 1: if (!gf_general_is_one(a, w)) a->w128[1] = 0; break;
+          case 2: if (!gf_general_is_one(a, w)) a->w128[0] = 0; if (!gf_general_is_one(b, w)) b->w128[0] = 0; break;
+          case 3: if (!gf_general_is_one(a, w)) a->w128[0] = 0; if (!gf_general_is_one(b, w)) b->w128[1] = 0; break;
+          case 4: if (!gf_general_is_one(a, w)) a->w128[1] = 0; if (!gf_general_is_one(b, w)) b->w128[0] = 0; break;
+          case 5: if (!gf_general_is_one(a, w)) a->w128[1] = 0; if (!gf_general_is_one(b, w)) b->w128[1] = 0; break;
+          case 6: if (!gf_general_is_one(b, w)) b->w128[0] = 0; break;
+          case 7: if (!gf_general_is_one(b, w)) b->w128[1] = 0; break;
+        }
+      }
+
+      gf_general_multiply(&gf, a, b, c);
+      /* If w is 4, 8 or 16, then there are inline multiplication/division methods.  
+         Test them here. */
+
+      if (w == 4 && mult4 != NULL) {
+        a8 = a->w32;
+        b8 = b->w32;
+        c8 = GF_W4_INLINE_MULTDIV(mult4, a8, b8);
+        if (c8 != c->w32) {
+          printf("Error in inline multiplication. %d * %d.  Inline = %d.  Default = %d.\n",
+             a8, b8, c8, c->w32);
+          exit(1);
+        }
+      }
+
+      if (w == 8 && mult8 != NULL) {
+        a8 = a->w32;
+        b8 = b->w32;
+        c8 = GF_W8_INLINE_MULTDIV(mult8, a8, b8);
+        if (c8 != c->w32) {
+          printf("Error in inline multiplication. %d * %d.  Inline = %d.  Default = %d.\n",
+             a8, b8, c8, c->w32);
+          exit(1);
+        }
+      }
+
+      if (w == 16 && log16 != NULL) {
+        a16 = a->w32;
+        b16 = b->w32;
+        c16 = GF_W16_INLINE_MULT(log16, alog16, a16, b16);
+        if (c16 != c->w32) {
+          printf("Error in inline multiplication. %d * %d.  Inline = %d.  Default = %d.\n",
+             a16, b16, c16, c->w32);
+          printf("%d %d\n", log16[a16], log16[b16]);
+          top = log16[a16] + log16[b16];
+          printf("%d %d\n", top, alog16[top]);
+          exit(1);
+        }
+      }
+
+      /* If this is not composite, then first test against the default: */
+
+      if (h->mult_type != GF_MULT_COMPOSITE) {
+        gf_general_multiply(&gf_def, a, b, d);
+
+        if (!gf_general_are_equal(c, d, w)) {
+          gf_general_val_to_s(a, w, as, 1);
+          gf_general_val_to_s(b, w, bs, 1);
+          gf_general_val_to_s(c, w, cs, 1);
+          gf_general_val_to_s(d, w, ds, 1);
+          printf("Error in single multiplication (all numbers in hex):\n\n");
+          printf("  gf.multiply(gf, %s, %s) = %s\n", as, bs, cs);
+          printf("  The default gf multiplier returned %s\n", ds);
+          exit(1);
+        }
+      }
+
+      /* Now, we also need to double-check by other means, in case the default is wanky, 
+         and when we're performing composite operations. Start with 0 and 1, where we know
+         what the result should be. */
+
+      if (gf_general_is_zero(a, w) || gf_general_is_zero(b, w) || 
+          gf_general_is_one(a, w)  || gf_general_is_one(b, w)) {
+        if (((gf_general_is_zero(a, w) || gf_general_is_zero(b, w)) && !gf_general_is_zero(c, w)) ||
+            (gf_general_is_one(a, w) && !gf_general_are_equal(b, c, w)) ||
+            (gf_general_is_one(b, w) && !gf_general_are_equal(a, c, w))) {
+          gf_general_val_to_s(a, w, as, 1);
+          gf_general_val_to_s(b, w, bs, 1);
+          gf_general_val_to_s(c, w, cs, 1);
+          printf("Error in single multiplication (all numbers in hex):\n\n");
+          printf("  gf.multiply(gf, %s, %s) = %s, which is clearly wrong.\n", as, bs, cs);
+          exit(1);
+        }
+      }
+
+      /* Dumb check to make sure that it's not returning numbers that are too big: */
+
+      if (w < 32 && (c->w32 & mask) != c->w32) {
+        gf_general_val_to_s(a, w, as, 1);
+        gf_general_val_to_s(b, w, bs, 1);
+        gf_general_val_to_s(c, w, cs, 1);
+        printf("Error in single multiplication (all numbers in hex):\n\n");
+        printf("  gf.multiply.w32(gf, %s, %s) = %s, which is too big.\n", as, bs, cs);
+        exit(1);
+      }
+
+      /* Finally, let's check to see that multiplication and division work together */
+
+      if (!gf_general_is_zero(a, w)) {
+        gf_general_divide(&gf, c, a, d);
+        if (!gf_general_are_equal(b, d, w)) {
+          gf_general_val_to_s(a, w, as, 1);
+          gf_general_val_to_s(b, w, bs, 1);
+          gf_general_val_to_s(c, w, cs, 1);
+          gf_general_val_to_s(d, w, ds, 1);
+          printf("Error in single multiplication/division (all numbers in hex):\n\n");
+          printf("  gf.multiply(gf, %s, %s) = %s, but gf.divide(gf, %s, %s) = %s\n", as, bs, cs, cs, as, ds);
+          exit(1);
+        }
+      }
+
+    }
+  }
+
+  if (region) {
+    if (verbose) { printf("Testing region multiplications\n"); fflush(stdout); }
+    for (i = 0; i < 1024; i++) {
+      //Allen: changing to a switch thing as with the single ops to make things proportional
+      switch (i % 32)
+      {
+        case 0:
+          gf_general_set_zero(a, w);
+          break;
+        case 1:
+          gf_general_set_one(a, w);
+          break;
+        case 2:
+          gf_general_set_two(a, w);
+          break;
+        default:
+          gf_general_set_random(a, w, 1);
+      }
+      MOA_Fill_Random_Region(ra, REGION_SIZE);
+      MOA_Fill_Random_Region(rb, REGION_SIZE);
+      xor = (i/32)%2;
+      align = w/8;
+      if (align == 0) align = 1;
+      if (align > 16) align = 16;
+
+      /* JSP - Cauchy test.  When w < 32 & it doesn't equal 4, 8 or 16, the default is
+         equal to GF_REGION_CAUCHY, even if GF_REGION_CAUCHY is not set. We are testing
+         three alignments here:
+
+         1. Anything goes -- no alignment guaranteed.
+         2. Perfect alignment.  Here src and dest must be aligned wrt each other,
+            and bytes must be a multiple of 16*w.  
+         3. Imperfect alignment.  Here we'll have src and dest be aligned wrt each 
+            other, but bytes is simply a multiple of w.  That means some XOR's will
+            be aligned, and some won't.
+       */
+
+      if ((h->region_type & GF_REGION_CAUCHY) || (w < 32 && w != 4 && w != 8 && w != 16)) {
+        alignment_test = (i%3);
+        s_start = MOA_Random_W(5, 1);
+        if (alignment_test == 0) {
+          d_start = MOA_Random_W(5, 1);
+        } else {
+          d_start = s_start;
+        }
+
+        bytes = (d_start > s_start) ? REGION_SIZE - d_start : REGION_SIZE - s_start;
+        bytes -= MOA_Random_W(5, 1);
+        if (alignment_test == 1) {
+          bytes -= (bytes % (w*16));
+        } else {
+          bytes -= (bytes % w);
+        }
+
+        target = rb;
+ 
+      /* JSP - Otherwise, we're testing a non-cauchy test, and alignment
+        must be more strict.  We have to make sure that the regions are
+        aligned wrt each other on 16-byte pointers.  */
+
+      } else {
+        s_start = MOA_Random_W(5, 1) * align;
+        d_start = s_start;
+        bytes = REGION_SIZE - s_start - MOA_Random_W(5, 1);
+        bytes -= (bytes % align);
+
+        if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) {
+          target = rb ;
+        } else {
+          target = (i/64)%2 ? rb : ra;
+        }
+      }
+
+      memcpy(rc, ra, REGION_SIZE);
+      memcpy(rd, target, REGION_SIZE);
+      gf_general_do_region_multiply(&gf, a, ra+s_start, target+d_start, bytes, xor);
+      gf_general_do_region_check(&gf, a, rc+s_start, rd+d_start, target+d_start, bytes, xor);
+    }
+  }
+
+  free(a);
+  free(b);
+  free(c);
+  free(d);
+#ifdef HAVE_POSIX_MEMALIGN
+  free(ra);
+  free(rb);
+  free(rc);
+  free(rd);
+#else
+  free(malloc_ra);
+  free(malloc_rb);
+  free(malloc_rc);
+  free(malloc_rd);
+#endif
+  return 0;
+}
diff --git a/src/erasure-code/jerasure/gf-complete/tools/Makefile.am b/src/erasure-code/jerasure/gf-complete/tools/Makefile.am
new file mode 100644
index 000000000..4ca9131aa
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/tools/Makefile.am
@@ -0,0 +1,56 @@
+# GF-Complete 'tools' AM file
+
+AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
+AM_CFLAGS = -O3 -fPIC
+
+bin_PROGRAMS = gf_mult gf_div gf_add gf_time gf_methods gf_poly gf_inline_time
+
+gf_mult_SOURCES = gf_mult.c
+#gf_mult_LDFLAGS = -lgf_complete
+gf_mult_LDADD = ../src/libgf_complete.la
+
+gf_div_SOURCES = gf_div.c
+#gf_div_LDFLAGS = -lgf_complete
+gf_div_LDADD = ../src/libgf_complete.la
+
+gf_add_SOURCES = gf_add.c
+#gf_add_LDFLAGS = -lgf_complete
+gf_add_LDADD = ../src/libgf_complete.la
+
+gf_time_SOURCES = gf_time.c
+#gf_time_LDFLAGS = -lgf_complete
+gf_time_LDADD = ../src/libgf_complete.la
+
+gf_methods_SOURCES = gf_methods.c
+#gf_methods_LDFLAGS = -lgf_complete
+gf_methods_LDADD = ../src/libgf_complete.la
+
+gf_poly_SOURCES = gf_poly.c
+#gf_poly_LDFLAGS = -lgf_complete
+gf_poly_LDADD = ../src/libgf_complete.la
+
+gf_inline_time_SOURCES = gf_inline_time.c
+#gf_inline_time_LDFLAGS = -lgf_complete
+gf_inline_time_LDADD = ../src/libgf_complete.la
+
+# gf_unit 8 A -1 -m LOG_ZERO_EXT is excluded until http://lab.jerasure.org/jerasure/gf-complete/issues/13 is resolved
+if ENABLE_VALGRIND
+VALGRIND = | perl -p -e 's|^|../libtool --mode=execute valgrind --quiet --error-exitcode=1 --tool=memcheck | if(!/gf_unit 8 A -1 -m LOG_ZERO_EXT/)'
+endif
+
+# gf_unit tests as generated by gf_methods
+gf_unit_w%.sh: gf_methods
+	./$^ $(@:gf_unit_w%.sh=%) -A -U ${VALGRIND} > $@ || rm $@
+
+TESTS = gf_unit_w128.sh \
+        gf_unit_w64.sh  \
+        gf_unit_w32.sh  \
+        gf_unit_w16.sh  \
+        gf_unit_w8.sh   \
+        gf_unit_w4.sh
+
+TEST_EXTENSIONS = .sh
+SH_LOG_COMPILER = $(SHELL)
+AM_SH_LOG_FLAGS = -e
+
+CLEANFILES = $(TESTS)
diff --git a/src/erasure-code/jerasure/gf-complete/tools/gf_add.c b/src/erasure-code/jerasure/gf-complete/tools/gf_add.c
new file mode 100644
index 000000000..28cc12c1c
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/tools/gf_add.c
@@ -0,0 +1,114 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_add.c
+ *
+ * Adds two numbers in gf_2^w
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+void usage(char *s)
+{
+  /* Emit the fixed help text on stderr, then the caller's message (if any), then exit(1). */
+  fputs("usage: gf_add a b w - does addition of a and b in GF(2^w)\n", stderr);
+  fputs("       If w has an h on the end, treat a, b and the sum as hexadecimal (no 0x required)\n", stderr);
+  fputs("\n", stderr);
+  fputs("       legal w are: 1-32, 64 and 128\n", stderr);
+  fputs("       128 is hex only (i.e. '128' will be an error - do '128h')\n", stderr);
+  if (s) fputs(s, stderr);
+  exit(1);
+}
+
+/* Parse a hex string of 1 to 32 digits into v: v[0] gets the high 64 bits and
+ * v[1] the low 64 bits.  Returns 1 on success and 0 on any parse failure. */
+int read_128(char *s, uint64_t *v)
+{
+  unsigned long long hi = 0, lo = 0;
+  size_t l = strlen(s);
+  char save;
+  int ok;
+
+  if (l == 0 || l > 32) return 0;
+  /* The low word is the last 16 digits (or the whole string when shorter). */
+  if (sscanf((l > 16) ? s + (l-16) : s, "%llx", &lo) != 1) return 0;
+  if (l > 16) {
+    save = s[l-16];              /* cut s so only the leading digits are scanned */
+    s[l-16] = '\0';
+    ok = (sscanf(s, "%llx", &hi) == 1);
+    s[l-16] = save;              /* restore the caller's string */
+    if (!ok) return 0;
+  }
+  v[0] = hi; v[1] = lo; return 1;
+}
+
+void print_128(uint64_t *v)
+{
+  unsigned long long hi = v[0], lo = v[1];        /* v[0] = high word, v[1] = low word */
+  if (hi == 0) {
+    printf("%llx", lo);                           /* no high word: low word alone, unpadded */
+  } else {
+    printf("%llx", hi); printf("%016llx", lo);    /* low word zero-padded to 16 hex digits */
+  }
+  printf("\n");
+}
+
+
+/* gf_add a b w: print a XOR b, which is the sum of a and b in GF(2^w).
+ * A trailing 'h' on w selects hexadecimal input/output (required for w = 128). */
+int main(int argc, char **argv)
+{
+  int hex, w;
+  uint32_t a, b, c, top;
+  unsigned long long a64, b64, c64;      /* matches the %llx / %llu conversions below */
+  uint64_t a128[2], b128[2], c128[2];
+  char *format;
+
+  if (argc != 4) usage(NULL);
+  /* sscanf returns EOF (not 0) on empty input, so require exactly one conversion. */
+  if (sscanf(argv[3], "%d", &w) != 1) usage("Bad w\n");
+  if (w <= 0 || (w > 32 && w != 64 && w != 128)) usage("Bad w\n");
+
+  hex = (strchr(argv[3], 'h') != NULL);
+  if (!hex && w == 128) usage(NULL);
+
+  if (w <= 32) {
+    format = (hex) ? "%x" : "%u";
+    if (sscanf(argv[1], format, &a) != 1) usage("Bad a\n");
+    if (sscanf(argv[2], format, &b) != 1) usage("Bad b\n");
+
+    if (w < 32) {
+      /* Both operands must fit in w bits. */
+      top = (w == 31) ? 0x80000000 : (1 << w);
+      if (a >= top) usage("a is too large\n");
+      if (b >= top) usage("b is too large\n");
+    }
+
+    c = a ^ b;
+    printf(format, c);
+    printf("\n");
+
+  } else if (w == 64) {
+    format = (hex) ? "%llx" : "%llu";
+    if (sscanf(argv[1], format, &a64) != 1) usage("Bad a\n");
+    if (sscanf(argv[2], format, &b64) != 1) usage("Bad b\n");
+    c64 = a64 ^ b64;
+    printf(format, c64);
+    printf("\n");
+
+  } else if (w == 128) {
+    /* 128-bit operands are hex only and handled as two 64-bit words. */
+    if (read_128(argv[1], a128) != 1) usage("Bad a\n");
+    if (read_128(argv[2], b128) != 1) usage("Bad b\n");
+    c128[0] = a128[0] ^ b128[0];
+    c128[1] = a128[1] ^ b128[1];
+    print_128(c128);
+  }
+  exit(0);
+}
diff --git a/src/erasure-code/jerasure/gf-complete/tools/gf_div.c b/src/erasure-code/jerasure/gf-complete/tools/gf_div.c
new file mode 100644
index 000000000..9797f07da
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/tools/gf_div.c
@@ -0,0 +1,68 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_div.c
+ *
+ * Divides two numbers in gf_2^w
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "gf_complete.h"
+#include "gf_method.h"
+#include "gf_general.h"
+
+void usage(int why)
+{
+  fputs("usage: gf_div a b w [method] - does division of a and b in GF(2^w)\n", stderr);
+  switch (why) {                 /* 'why' selects the extra diagnostic, if any */
+  case 'W':
+    fputs("Bad w.\n", stderr);
+    fputs("Legal w are: 1 - 32, 64 and 128.\n", stderr);
+    fputs("Append 'h' to w to treat a, b and the quotient as hexadecimal.\n", stderr);
+    fputs("w=128 is hex only (i.e. '128' will be an error - do '128h')\n", stderr);
+    break;
+  case 'A': fputs("Bad a\n", stderr); break;
+  case 'B': fputs("Bad b\n", stderr); break;
+  case 'M': fputs("Bad Method Specification: ", stderr); gf_error(); break;
+  default: break;
+  }
+  exit(1);
+}
+
+/* gf_div a b w [method]: print a / b in GF(2^w); a trailing 'h' on w selects hex I/O. */
+int main(int argc, char **argv)
+{
+  int hex, w;
+  gf_t gf;
+  gf_general_t a, b, c;
+  char output[50];
+
+  if (argc < 4) usage(' ');
+  /* sscanf returns EOF (not 0) on empty input, which would leave w uninitialized;
+     require exactly one successful conversion. */
+  if (sscanf(argv[3], "%d", &w) != 1) usage('W');
+  if (w <= 0 || (w > 32 && w != 64 && w != 128)) usage('W');
+  hex = (strchr(argv[3], 'h') != NULL);
+  if (!hex && w == 128) usage('W');
+
+  /* Default field when no method is given, otherwise parse the method from argv[4..]. */
+  if (argc == 4) {
+    if (gf_init_easy(&gf, w) == 0) usage('M');
+  } else {
+    if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage('M');
+  }
+  if (!gf_general_s_to_val(&a, w, argv[1], hex)) usage('A');
+  if (!gf_general_s_to_val(&b, w, argv[2], hex)) usage('B');
+
+  gf_general_divide(&gf, &a, &b, &c);
+  gf_general_val_to_s(&c, w, output, hex);
+  printf("%s\n", output);
+  exit(0);
+}
diff --git a/src/erasure-code/jerasure/gf-complete/tools/gf_inline_time.c b/src/erasure-code/jerasure/gf-complete/tools/gf_inline_time.c
new file mode 100644
index 000000000..f8119da65
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/tools/gf_inline_time.c
@@ -0,0 +1,170 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_inline_time.c
+ *
+ * Times inline single multiplication when w = 4, 8 or 16
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+
+#include "gf_complete.h"
+#include "gf_rand.h"
+
+void
+timer_start (double *t)
+{
+    struct timeval  now;
+
+    gettimeofday (&now, NULL);
+    *t = (double)now.tv_sec + (double)now.tv_usec * 1e-6;   /* seconds, microseconds as the fraction */
+}
+
+double
+timer_split (const double *t)
+{
+    struct timeval  now;
+    double  now_s;
+
+    gettimeofday (&now, NULL);
+    now_s = (double)now.tv_sec + (double)now.tv_usec * 1e-6;   /* current time in seconds */
+    return (now_s - *t);                                       /* elapsed relative to *t */
+}
+
+void problem(char *s)
+{
+  fputs("Timing test failed.\n", stderr);   /* fixed banner, then the caller's detail line */
+  fprintf(stderr, "%s\n", s);
+  exit(1);
+}
+
+void usage(char *s)
+{
+  fputs("usage: gf_inline_time w seed #elts iterations - does timing of single multiplies\n", stderr);
+  fputs("\n", stderr);
+  fputs("Legal w are: 4, 8 or 16\n", stderr);
+  fputs("\n", stderr);
+  fputs("Use -1 for time(0) as a seed.\n", stderr);
+  fputs("\n", stderr);
+  if (s) fprintf(stderr, "%s\n", s);        /* optional caller message */
+  exit(1);
+}
+
+/* Time inline single multiplications in GF(2^w), w = 4, 8 or 16: argv is w, seed, #elts, iterations. */
+int main(int argc, char **argv)
+{
+  int w, j, i, size, iterations;
+  gf_t      gf;
+  double timer, elapsed, dnum, num;
+  uint8_t *ra = NULL, *rb = NULL, *mult4, *mult8;
+  uint16_t *ra16 = NULL, *rb16 = NULL, *log16, *alog16;
+  time_t t0;
+  if (argc != 5) usage(NULL);
+  if (sscanf(argv[1], "%d", &w) == 0) usage("Bad w\n");
+  if (w != 4 && w != 8 && w != 16) usage("Bad w\n");
+  if (sscanf(argv[2], "%ld", &t0) == 0) usage("Bad seed\n");   /* NOTE(review): %ld assumes time_t is long - confirm */
+  if (sscanf(argv[3], "%d", &size) == 0) usage("Bad #elts\n");
+  if (sscanf(argv[4], "%d", &iterations) == 0) usage("Bad iterations\n");
+  if (t0 == -1) t0 = time(0);          /* a seed of -1 selects the current time */
+  MOA_Seed(t0);
+
+  num = size;                          /* element count as a double, added to dnum per iteration */
+
+  gf_init_easy(&gf, w);
+  /* Echo the seed actually used. */
+  printf("Seed: %ld\n", t0);
+  /* w = 4 and w = 8 use byte arrays; w = 16 uses arrays of size 16-bit elements. */
+  if (w == 4 || w == 8) {
+    ra = (uint8_t *) malloc(size);
+    rb = (uint8_t *) malloc(size);
+
+    if (ra == NULL || rb == NULL) { perror("malloc"); exit(1); }
+  } else if (w == 16) {
+    ra16 = (uint16_t *) malloc(size*2);
+    rb16 = (uint16_t *) malloc(size*2);
+
+    if (ra16 == NULL || rb16 == NULL) { perror("malloc"); exit(1); }
+  }
+  /* Each branch refills both arrays with random values every iteration and times only the multiply loop. */
+  if (w == 4) {
+    mult4 = gf_w4_get_mult_table(&gf);
+    if (mult4 == NULL) {
+      printf("Couldn't get inline multiplication table.\n");
+      exit(1);
+    }
+    elapsed = 0;
+    dnum = 0;
+    for (i = 0; i < iterations; i++) {
+      for (j = 0; j < size; j++) {
+        ra[j] = MOA_Random_W(w, 1);
+        rb[j] = MOA_Random_W(w, 1);
+      }
+      timer_start(&timer);
+      for (j = 0; j < size; j++) {
+        ra[j] = GF_W4_INLINE_MULTDIV(mult4, ra[j], rb[j]);
+      }
+      dnum += num;
+      elapsed += timer_split(&timer);
+    }
+    printf("Inline mult:   %10.6lf s   Mops: %10.3lf    %10.3lf Mega-ops/s\n",
+           elapsed, dnum/1024.0/1024.0, dnum/1024.0/1024.0/elapsed);
+
+  } else if (w == 8) {
+    mult8 = gf_w8_get_mult_table(&gf);
+    if (mult8 == NULL) {
+      printf("Couldn't get inline multiplication table.\n");
+      exit(1);
+    }
+    elapsed = 0;
+    dnum = 0;
+    for (i = 0; i < iterations; i++) {
+      for (j = 0; j < size; j++) {
+        ra[j] = MOA_Random_W(w, 1);
+        rb[j] = MOA_Random_W(w, 1);
+      }
+      timer_start(&timer);
+      for (j = 0; j < size; j++) {
+        ra[j] = GF_W8_INLINE_MULTDIV(mult8, ra[j], rb[j]);
+      }
+      dnum += num;
+      elapsed += timer_split(&timer);
+    }
+    printf("Inline mult:   %10.6lf s   Mops: %10.3lf    %10.3lf Mega-ops/s\n",
+           elapsed, dnum/1024.0/1024.0, dnum/1024.0/1024.0/elapsed);
+  } else if (w == 16) {
+    log16 = gf_w16_get_log_table(&gf);
+    alog16 = gf_w16_get_mult_alog_table(&gf);
+    if (log16 == NULL) {               /* only log16 is checked; alog16 is used unchecked below */
+      printf("Couldn't get inline multiplication table.\n");
+      exit(1);
+    }
+    elapsed = 0;
+    dnum = 0;
+    for (i = 0; i < iterations; i++) {
+      for (j = 0; j < size; j++) {
+        ra16[j] = MOA_Random_W(w, 1);
+        rb16[j] = MOA_Random_W(w, 1);
+      }
+      timer_start(&timer);
+      for (j = 0; j < size; j++) {
+        ra16[j] = GF_W16_INLINE_MULT(log16, alog16, ra16[j], rb16[j]);
+      }
+      dnum += num;
+      elapsed += timer_split(&timer);
+    }
+    printf("Inline mult:   %10.6lf s   Mops: %10.3lf    %10.3lf Mega-ops/s\n",
+           elapsed, dnum/1024.0/1024.0, dnum/1024.0/1024.0/elapsed);
+  }
+  free (ra);                           /* the pair not allocated above is still NULL */
+  free (rb);
+  free (ra16);
+  free (rb16);
+  return 0;
+}
diff --git a/src/erasure-code/jerasure/gf-complete/tools/gf_methods.c b/src/erasure-code/jerasure/gf-complete/tools/gf_methods.c
new file mode 100644
index 000000000..b016c33c9
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/tools/gf_methods.c
@@ -0,0 +1,246 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_methods.c
+ *
+ * Lists supported methods (incomplete w.r.t. GROUP and COMPOSITE)
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "gf_complete.h"
+#include "gf_method.h"
+#include "gf_int.h"
+
+#define BNMULTS (8)
+static char *BMULTS[BNMULTS] = { "CARRY_FREE", "GROUP48", 
+                               "TABLE", "LOG", "SPLIT4", "SPLIT8", "SPLIT88", "COMPOSITE" };
+#define NMULTS (17)
+static char *MULTS[NMULTS] = { "SHIFT", "CARRY_FREE", "CARRY_FREE_GK", "GROUP44", "GROUP48", "BYTWO_p", "BYTWO_b",
+                               "TABLE", "LOG", "LOG_ZERO", "LOG_ZERO_EXT", "SPLIT2",
+                               "SPLIT4", "SPLIT8", "SPLIT16", "SPLIT88", "COMPOSITE" };
+
+/* Make sure CAUCHY is last */
+
+#define NREGIONS (7) 
+static char *REGIONS[NREGIONS] = { "DOUBLE", "QUAD", "LAZY", "SIMD", "NOSIMD",
+                                   "ALTMAP", "CAUCHY" };
+
+#define BNREGIONS (4) 
+static char *BREGIONS[BNREGIONS] = { "DOUBLE", "QUAD", "ALTMAP", "CAUCHY" };
+
+#define NDIVS (2)
+static char *divides[NDIVS] = { "MATRIX", "EUCLID" }; 
+
+void usage(char *s)
+{
+   static const char *help[] = {
+     "usage: gf_methods w -BADC -LXUMDRB\n", "\n",
+     "       w can be 1-32, 64, 128\n", "\n",
+     "       -B lists basic methods that are useful\n",
+     "       -A does a nearly exhaustive listing\n",
+     "       -D adds EUCLID and MATRIX division\n",
+     "       -C adds CAUCHY when possible\n",
+     "       Combinations are fine.\n", "\n",
+     "       -L Simply lists methods\n",
+     "       -X List methods and functions selected (compile with DEBUG_FUNCTIONS)\n",
+     "       -U Produces calls to gf_unit\n",
+     "       -M Produces calls to time_tool.sh for single multiplications\n",
+     "       -D Produces calls to time_tool.sh for single divisions\n",
+     "       -R Produces calls to time_tool.sh for region multiplications\n",
+     "       -B Produces calls to time_tool.sh for the fastest region multiplications\n",
+     "       Cannot combine L, U, T.\n"
+   };
+   size_t i;
+
+   /* Print the help text piece by piece, then the optional message after a blank line. */
+   for (i = 0; i < sizeof(help) / sizeof(help[0]); i++) fputs(help[i], stderr);
+   if (s != NULL) { fputs("\n", stderr); fprintf(stderr, "%s\n", s); }
+   exit(1);
+}
+
+void print_methods(gf_t *gf)
+{
+#ifdef DEBUG_FUNCTIONS
+    /* Only compiled with DEBUG_FUNCTIONS: print the five strings kept in the scratch struct. */
+    gf_internal_t *info = (gf_internal_t*) gf->scratch;
+    printf("multiply = %s\n", info->multiply);
+    printf("divide = %s\n", info->divide);
+    printf("inverse = %s\n", info->inverse);
+    printf("multiply_region = %s\n", info->multiply_region);
+    printf("extract_word = %s\n", info->extract_word);
+#endif
+}
+
+/* gf_methods w -BADC -LXUMDRB: print one line per method spec that create_gf_from_argv accepts for w. */
+int main(int argc, char *argv[])
+{
+  int m, r, d, w, i, sa, j, k, reset, ok;
+  int nregions;
+  int nmults;
+  char **regions;
+  char **mults;
+  int exhaustive = 0;
+  int divide = 0;
+  int cauchy = 0;
+  int listing;
+  char *gf_argv[50], *x;
+  gf_t gf;
+  char ls[10];
+  char * w_str;
+  char tool_fmt[] = "sh time_tool.sh X %d";   /* writable copy: 'X' is replaced by the listing letter */
+
+  if (argc != 4) usage(NULL);
+  w = atoi(argv[1]);
+  ok = (w >= 1 && w <= 32);
+  if (w == 64) ok = 1;
+  if (w == 128) ok = 1;
+  if (!ok) usage("Bad w");
+  /* argv[2] is a dash plus one or more of BADC; argv[3] is a dash plus exactly one letter. */
+  if (argv[2][0] != '-' || argv[3][0] != '-' || strlen(argv[2]) == 1 || strlen(argv[3]) != 2) {
+    usage(NULL);
+  }
+  for (i = 1; argv[2][i] != '\0'; i++) {
+    switch(argv[2][i]) {
+      case 'B': exhaustive = 0; break;
+      case 'A': exhaustive = 1; break;
+      case 'D': divide = 1; break;
+      case 'C': cauchy = 1; break;
+      default: usage("Bad -BADC");
+    }
+  }
+
+  if (strchr("LXUMDRB", argv[3][1]) == NULL) { usage("Bad -LXUMDRB"); }
+  listing = argv[3][1];
+  /* w_str is the printf format (taking w) that prefixes every emitted line. */
+  if (listing == 'U') {
+    w_str = "../test/gf_unit %d A -1";
+  } else if (listing == 'L' || listing == 'X') {
+    w_str = "w=%d:";
+  } else {
+    x = strchr(tool_fmt, 'X');
+    *x = listing;
+    w_str = tool_fmt;
+  }
+  /* First try the default method, spelled "-". */
+  gf_argv[0] = "-";
+  if (create_gf_from_argv(&gf, w, 1, gf_argv, 0) > 0) {
+    printf(w_str, w);
+    printf(" - \n");
+    gf_free(&gf, 1);
+  } else if (_gf_errno == GF_E_DEFAULT) {
+    fprintf(stderr, "Unlabeled failed method: w=%d: -\n", w);
+    exit(1);
+  }
+
+  nregions = (exhaustive) ? NREGIONS : BNREGIONS;
+  if (!cauchy) nregions--;      /* CAUCHY is the last entry of both tables, so this drops it */
+  regions = (exhaustive) ? REGIONS : BREGIONS;
+  mults = (exhaustive) ? MULTS : BMULTS;
+  nmults = (exhaustive) ? NMULTS : BNMULTS;
+
+  for (m = 0; m < nmults; m++) {
+    sa = 0;
+    gf_argv[sa++] = "-m";
+    if (strcmp(mults[m], "GROUP44") == 0) {
+      gf_argv[sa++] = "GROUP";
+      gf_argv[sa++] = "4";
+      gf_argv[sa++] = "4";
+    } else if (strcmp(mults[m], "GROUP48") == 0) {
+      gf_argv[sa++] = "GROUP";
+      gf_argv[sa++] = "4";
+      gf_argv[sa++] = "8";
+    } else if (strcmp(mults[m], "SPLIT2") == 0) {
+      gf_argv[sa++] = "SPLIT";
+      sprintf(ls, "%d", w);
+      gf_argv[sa++] = ls;
+      gf_argv[sa++] = "2";
+    } else if (strcmp(mults[m], "SPLIT4") == 0) {
+      gf_argv[sa++] = "SPLIT";
+      sprintf(ls, "%d", w);
+      gf_argv[sa++] = ls;
+      gf_argv[sa++] = "4";
+    } else if (strcmp(mults[m], "SPLIT8") == 0) {
+      gf_argv[sa++] = "SPLIT";
+      sprintf(ls, "%d", w);
+      gf_argv[sa++] = ls;
+      gf_argv[sa++] = "8";
+    } else if (strcmp(mults[m], "SPLIT16") == 0) {
+      gf_argv[sa++] = "SPLIT";
+      sprintf(ls, "%d", w);
+      gf_argv[sa++] = ls;
+      gf_argv[sa++] = "16";
+    } else if (strcmp(mults[m], "SPLIT88") == 0) {
+      gf_argv[sa++] = "SPLIT";
+      gf_argv[sa++] = "8";
+      gf_argv[sa++] = "8";
+    } else if (strcmp(mults[m], "COMPOSITE") == 0) {
+      gf_argv[sa++] = "COMPOSITE";
+      gf_argv[sa++] = "2";
+      gf_argv[sa++] = "-";
+    } else {
+      gf_argv[sa++] = mults[m];
+    }
+    reset = sa;
+    /* Each bit of r selects one region option, so every subset of the options is tried. */
+    for (r = 0; r < (1 << nregions); r++) {
+      sa = reset;
+      for (k = 0; k < nregions; k++) {
+        if (r & (1 << k)) {
+          gf_argv[sa++] = "-r";
+          gf_argv[sa++] = regions[k];
+        }
+      }
+      gf_argv[sa++] = "-";
+
+      /* printf("Hmmmm. %s", gf_argv[0]);
+      for (j = 0; j < sa; j++) printf(" %s", gf_argv[j]);
+      printf("\n");  */
+      /* Emit the spec only if the library accepts it. */
+      if (create_gf_from_argv(&gf, w, sa, gf_argv, 0) > 0) {
+        printf(w_str, w);
+        for (j = 0; j < sa; j++) printf(" %s", gf_argv[j]);
+        printf("\n");
+        if (listing == 'X')
+          print_methods(&gf);
+        gf_free(&gf, 1);
+      } else if (_gf_errno == GF_E_DEFAULT) {
+        fprintf(stderr, "Unlabeled failed method: w=%d:", w);
+        for (j = 0; j < sa; j++) fprintf(stderr, " %s", gf_argv[j]);
+        fprintf(stderr, "\n");
+        exit(1);
+      }
+      sa--;                     /* drop the trailing "-" so -d options can be appended */
+      if (divide) {
+        for (d = 0; d < NDIVS; d++) {
+          gf_argv[sa++] = "-d";
+          gf_argv[sa++] = divides[d];
+          /*          printf("w=%d:", w);
+                      for (j = 0; j < sa; j++) printf(" %s", gf_argv[j]);
+                      printf("\n"); */
+          gf_argv[sa++] = "-";
+          if (create_gf_from_argv(&gf, w, sa, gf_argv, 0) > 0) {
+            printf(w_str, w);
+            for (j = 0; j < sa; j++) printf(" %s", gf_argv[j]);
+            printf("\n");
+            if (listing == 'X')
+              print_methods(&gf);
+            gf_free(&gf, 1);
+          } else if (_gf_errno == GF_E_DEFAULT) {
+            fprintf(stderr, "Unlabeled failed method: w=%d:", w);
+            for (j = 0; j < sa; j++) fprintf(stderr, " %s", gf_argv[j]);
+            fprintf(stderr, "\n");
+            exit(1);
+          }
+          sa-=3;                /* remove "-d <divider> -" before trying the next divider */
+        }
+      }
+    }
+  }
+  return 0;
+}
diff --git a/src/erasure-code/jerasure/gf-complete/tools/gf_mult.c b/src/erasure-code/jerasure/gf-complete/tools/gf_mult.c
new file mode 100644
index 000000000..815bd8b26
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/tools/gf_mult.c
@@ -0,0 +1,68 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_mult.c
+ *
+ * Multiplies two numbers in gf_2^w
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "gf_complete.h"
+#include "gf_method.h"
+#include "gf_general.h"
+
+void usage(int why)
+{
+  fputs("usage: gf_mult a b w [method] - does multiplication of a and b in GF(2^w)\n", stderr);
+  switch (why) {                 /* 'why' selects the extra diagnostic, if any */
+  case 'W':
+    fputs("Bad w.\n", stderr);
+    fputs("Legal w are: 1 - 32, 64 and 128.\n", stderr);
+    fputs("Append 'h' to w to treat a, b and the product as hexadecimal.\n", stderr);
+    fputs("w=128 is hex only (i.e. '128' will be an error - do '128h')\n", stderr);
+    break;
+  case 'A': fputs("Bad a\n", stderr); break;
+  case 'B': fputs("Bad b\n", stderr); break;
+  case 'M': fputs("Bad Method Specification: ", stderr); gf_error(); break;
+  default: break;
+  }
+  exit(1);
+}
+
+/* gf_mult a b w [method]: print a * b in GF(2^w); a trailing 'h' on w selects hex I/O. */
+int main(int argc, char **argv)
+{
+  int hex, w;
+  gf_t gf;
+  gf_general_t a, b, c;
+  char output[50];
+
+  if (argc < 4) usage(' ');
+  /* sscanf returns EOF (not 0) on empty input, which would leave w uninitialized;
+     require exactly one successful conversion. */
+  if (sscanf(argv[3], "%d", &w) != 1) usage('W');
+  if (w <= 0 || (w > 32 && w != 64 && w != 128)) usage('W');
+  hex = (strchr(argv[3], 'h') != NULL);
+  if (!hex && w == 128) usage('W');
+
+  /* Default field when no method is given, otherwise parse the method from argv[4..]. */
+  if (argc == 4) {
+    if (gf_init_easy(&gf, w) == 0) usage('M');
+  } else {
+    if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage('M');
+  }
+  if (!gf_general_s_to_val(&a, w, argv[1], hex)) usage('A');
+  if (!gf_general_s_to_val(&b, w, argv[2], hex)) usage('B');
+
+  gf_general_multiply(&gf, &a, &b, &c);
+  gf_general_val_to_s(&c, w, output, hex);
+  printf("%s\n", output);
+  exit(0);
+}
diff --git a/src/erasure-code/jerasure/gf-complete/tools/gf_poly.c b/src/erasure-code/jerasure/gf-complete/tools/gf_poly.c
new file mode 100644
index 000000000..b3faf254d
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/tools/gf_poly.c
@@ -0,0 +1,275 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_poly.c - program to help find irreducible polynomials in composite fields,
+ * using the Ben-Or algorithm.  
+ * 
+ * (This one was written by Jim) 
+ * 
+ * Please see the following paper for a description of the Ben-Or algorithm:
+ * 
+ * author    S. Gao and D. Panario
+ * title     Tests and Constructions of Irreducible Polynomials over Finite Fields
+ * booktitle Foundations of Computational Mathematics
+ * year      1997
+ * publisher Springer Verlag
+ * pages     346-361
+ * 
+ * The basic technique is this.  You have a polynomial f(x) whose coefficients are
+ * in a base field GF(2^w).  The polynomial is of degree n.  You need to do the 
+ * following for all i from 1 to n/2:
+ * 
+ * Construct x^(2^w)^i modulo f.  That will be a polynomial of maximum degree n-1
+ * with coefficients in GF(2^w).  You construct that polynomial by starting with x
+ * and doubling it w times, each time taking the result modulo f.  Then you 
+ * multiply that by itself i times, again each time taking the result modulo f.
+ * 
+ * When you're done, you need to "subtract" x -- since addition = subtraction = 
+ * XOR, that means XOR x.  
+ * 
+ * Now, find the GCD of that last polynomial and f, using Euclid's algorithm.  If
+ * the GCD is not one, then f is reducible.  If it is not reducible for each of
+ * those i, then it is irreducible.
+ * 
+ * In this code, I am using a gf_general_t to represent elements of GF(2^w).  This
+ * is so that I can use base fields that are GF(2^64) or GF(2^128). 
+ * 
+ * I have two main procedures.  The first is x_to_q_to_i_minus_x, which calculates
+ * x^(2^w)^i - x, putting the result into a gf_general_t * called retval.
+ * 
+ * The second is gcd_one, which takes a polynomial of degree n and a second one
+ * of degree n-1, and uses Euclid's algorithm to decide if their GCD == 1.
+ * 
+ * These can be made faster (e.g. calculate x^(2^w) once and store it).
+ */
+
+#include "gf_complete.h"
+#include "gf_method.h"
+#include "gf_general.h"
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+char *BM = "Bad Method: ";
+
+void usage(char *s)
+{
+  fputs("usage: gf_poly w(base-field) method power:coef [ power:coef .. ]\n", stderr);
+  fputs("\n", stderr);
+  fputs("       use - for the default method.\n", stderr);
+  fputs("       use 0x in front of the coefficient if it's in hex\n", stderr);
+  fputs("       \n", stderr);
+  fputs("       For example, to test whether x^2 + 2x + 1 is irreducible\n", stderr);
+  fputs("       in GF(2^16), the call is:\n", stderr);
+  fputs("       \n", stderr);
+  fputs("       gf_poly 16 - 2:1 1:2 0:1\n", stderr);
+  fputs("       \n", stderr);
+  fputs("       See the user's manual for more information.\n", stderr);
+  if (s == NULL) exit(1);          /* no extra message to print */
+  fputs("\n", stderr);
+  if (s == BM) {
+    /* BM is compared by pointer: print it without a newline, then call gf_error(). */
+    fputs(s, stderr);
+    gf_error();
+  } else {
+    fprintf(stderr, "%s\n", s);
+  }
+  exit(1);
+}
+
+/* Euclid on poly (n+1 coefficients) and prod (n); returns 1 if a nonzero constant remainder is reached, else 0. */
+int gcd_one(gf_t *gf, int w, int n, gf_general_t *poly, gf_general_t *prod)
+{
+  gf_general_t *a, *b, zero, factor, p;
+  int i, j, da, db, result;
+  gf_general_set_zero(&zero, w);
+  /* Work on copies.  Note the parentheses: a holds n+1 elements, not n elements plus one byte. */
+  a = (gf_general_t *) malloc(sizeof(gf_general_t) * (n+1));
+  b = (gf_general_t *) malloc(sizeof(gf_general_t) * n);
+  for (i = 0; i <= n; i++) gf_general_add(gf, &zero, poly+i, a+i);
+  for (i = 0; i < n; i++) gf_general_add(gf, &zero, prod+i, b+i);
+
+  da = n;
+  while (1) {
+    for (db = n-1; db >= 0 && gf_general_is_zero(b+db, w); db--) ;   /* db = degree of b, -1 if b == 0 */
+    if (db <= 0) { result = (db == 0); break; }   /* b == 0 gives 0; b a nonzero constant gives 1 */
+    for (j = da; j >= db; j--) {                  /* reduce a modulo b */
+      if (!gf_general_is_zero(a+j, w)) {
+        gf_general_divide(gf, a+j, b+db, &factor);
+        for (i = 0; i <= db; i++) {
+          gf_general_multiply(gf, b+i, &factor, &p); 
+          gf_general_add(gf, &p, a+(i+j-db), a+(i+j-db));
+        }
+      }
+    }
+    for (i = 0; i < n; i++) {                     /* swap the low n coefficients of a and b */
+      gf_general_add(gf, a+i, &zero, &p);
+      gf_general_add(gf, b+i, &zero, a+i);
+      gf_general_add(gf, &p, &zero, b+i);
+    }
+  }
+  free(a); free(b);                               /* previously leaked on every return */
+  return result;
+}
+
+/* Fill retval[0..n-1] with (x^(2^logq))^i plus x, reduced modulo poly (n+1 coefficients, indexed by power). */
+void x_to_q_to_i_minus_x(gf_t *gf, int w, int n, gf_general_t *poly, int logq, int i, gf_general_t *retval)
+{
+  gf_general_t x;
+  gf_general_t *x_to_q;
+  gf_general_t *product;
+  gf_general_t p, zero, factor;
+  int j, k, lq;
+  gf_general_set_zero(&zero, w);
+  product = (gf_general_t *) malloc(sizeof(gf_general_t) * n*2);
+  x_to_q = (gf_general_t *) malloc(sizeof(gf_general_t) * n);
+  for (j = 0; j < n; j++) gf_general_set_zero(x_to_q+j, w);
+  gf_general_set_one(x_to_q+1, w);     /* x_to_q starts as the polynomial x; writes index 1, so n must be >= 2 */
+  /* Square x_to_q logq times, reducing modulo poly after each squaring. */
+  for (lq = 0; lq < logq; lq++) {
+    for (j = 0; j < n*2; j++) gf_general_set_zero(product+j, w);
+    for (j = 0; j < n; j++) {
+      for (k = 0; k < n; k++) {
+        gf_general_multiply(gf, x_to_q+j, x_to_q+k, &p);
+        gf_general_add(gf, product+(j+k), &p, product+(j+k));
+      }
+    }
+    for (j = n*2-1; j >= n; j--) {     /* cancel terms of degree >= n; product[j] only clears if poly[n] is one */
+      if (!gf_general_is_zero(product+j, w)) {
+        gf_general_add(gf, product+j, &zero, &factor);
+        for (k = 0; k <= n; k++) {
+          gf_general_multiply(gf, poly+k, &factor, &p);
+          gf_general_add(gf, product+(j-n+k), &p, product+(j-n+k));
+        }
+      }
+    }
+    for (j = 0; j < n; j++) gf_general_add(gf, product+j, &zero, x_to_q+j);
+  }
+  for (j = 0; j < n; j++) gf_general_set_zero(retval+j, w);
+  gf_general_set_one(retval, w);       /* retval starts as the constant polynomial 1 */
+  /* Multiply retval by x_to_q i times.  NOTE(review): this yields (x^(2^logq))^i - confirm that is the intended power. */
+  while (i > 0) {
+    for (j = 0; j < n*2; j++) gf_general_set_zero(product+j, w);
+    for (j = 0; j < n; j++) {
+      for (k = 0; k < n; k++) {
+        gf_general_multiply(gf, x_to_q+j, retval+k, &p);
+        gf_general_add(gf, product+(j+k), &p, product+(j+k));
+      }
+    }
+    for (j = n*2-1; j >= n; j--) {     /* same reduction modulo poly as above */
+      if (!gf_general_is_zero(product+j, w)) {
+        gf_general_add(gf, product+j, &zero, &factor);
+        for (k = 0; k <= n; k++) {
+          gf_general_multiply(gf, poly+k, &factor, &p);
+          gf_general_add(gf, product+(j-n+k), &p, product+(j-n+k));
+        }
+      }
+    }
+    for (j = 0; j < n; j++) gf_general_add(gf, product+j, &zero, retval+j);
+    i--;
+  }
+  /* Add one to the coefficient of x, i.e. add the polynomial x to the result. */
+  gf_general_set_one(&x, w);
+  gf_general_add(gf, &x, retval+1, retval+1);
+
+  free(product);
+  free(x_to_q);
+}
+
+/* gf_poly w method power:coef ...: print the polynomial, then "Reducible." or "Irreducible." */
+int main(int argc, char **argv)
+{
+  int w, i, power, n, ap, success;
+  gf_t gf;
+  gf_general_t *poly, *prod;
+  char *string, *ptr;
+  char buf[100];
+
+  if (argc < 4) usage(NULL);
+  if (sscanf(argv[1], "%d", &w) != 1 || w <= 0) usage("Bad w.");
+  ap = create_gf_from_argv(&gf, w, argc, argv, 2);
+
+  if (ap == 0) usage(BM);
+
+  if (ap == argc) usage("No powers/coefficients given.");
+  /* First pass: validate each power:coef argument and find the degree n (the largest power). */
+  n = -1;
+  for (i = ap; i < argc; i++) {
+    if (strchr(argv[i], ':') == NULL || sscanf(argv[i], "%d:", &power) != 1) {
+      string = (char *) malloc(sizeof(char)*(strlen(argv[i])+100));
+      sprintf(string, "Argument '%s' not in proper format of power:coefficient\n", argv[i]);
+      usage(string);
+    }
+    if (power < 0) {
+      usage("Can't have negative powers\n");
+    } else if (power > n) {
+      n = power;   /* keep the maximum: poly[] is indexed by power, so it needs n+1 slots */
+    }
+  }
+  // in case the for-loop header fails
+  assert (n >= 0);
+
+  poly = (gf_general_t *) malloc(sizeof(gf_general_t)*(n+1));
+  for (i = 0; i <= n; i++) gf_general_set_zero(poly+i, w);
+  prod = (gf_general_t *) malloc(sizeof(gf_general_t)*n);
+  /* Second pass: store each coefficient at its power ("0x" prefix means hex). */
+  for (i = ap; i < argc; i++) {
+    sscanf(argv[i], "%d:", &power);
+    ptr = strchr(argv[i], ':');
+    ptr++;
+    if (strncmp(ptr, "0x", 2) == 0) {
+      success = gf_general_s_to_val(poly+power, w, ptr+2, 1);
+    } else {
+      success = gf_general_s_to_val(poly+power, w, ptr, 0);
+    }
+    if (success == 0) {
+      string = (char *) malloc(sizeof(char)*(strlen(argv[i])+100));
+      sprintf(string, "Argument '%s' not in proper format of power:coefficient\n", argv[i]);
+      usage(string);
+    }
+  }
+  /* Echo the polynomial, highest power first, skipping zero coefficients. */
+  printf("Poly:");
+  for (power = n; power >= 0; power--) {
+    if (!gf_general_is_zero(poly+power, w)) {
+      printf("%s", (power == n) ? " " : " + ");
+      if (!gf_general_is_one(poly+power, w)) {
+        gf_general_val_to_s(poly+power, w, buf, 1);
+        if (n > 0) {
+          printf("(0x%s)", buf);
+        } else {
+          printf("0x%s", buf);
+        }
+      }
+      if (power == 0) {
+        if (gf_general_is_one(poly+power, w)) printf("1");
+      } else if (power == 1) {
+        printf("x");
+      } else {
+        printf("x^%d", power);
+      }
+    }
+  }
+  printf("\n");
+  /* The test below is only run on monic polynomials. */
+  if (!gf_general_is_one(poly+n, w)) {
+    printf("\n");
+    printf("Can't do Ben-Or, because the polynomial is not monic.\n");
+    exit(0);
+  }
+  /* Report "Reducible." as soon as gcd_one returns 0 for some i in 1..n/2. */
+  for (i = 1; i <= n/2; i++) {
+    x_to_q_to_i_minus_x(&gf, w, n, poly, w, i, prod); 
+    if (!gcd_one(&gf, w, n, poly, prod)) {
+      printf("Reducible.\n");
+      exit(0);
+    }
+  }
+  
+  printf("Irreducible.\n");
+  exit(0);
+}
diff --git a/src/erasure-code/jerasure/gf-complete/tools/gf_time.c b/src/erasure-code/jerasure/gf-complete/tools/gf_time.c
new file mode 100644
index 000000000..7402ab5c2
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/tools/gf_time.c
@@ -0,0 +1,232 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_time.c
+ *
+ * Performs timing for gf arithmetic
+ */
+
+#include "config.h"
+
+#ifdef HAVE_POSIX_MEMALIGN
+#ifndef _XOPEN_SOURCE
+#define _XOPEN_SOURCE 600
+#endif
+#endif
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <sys/time.h>
+
+#include "gf_complete.h"
+#include "gf_method.h"
+#include "gf_rand.h"
+#include "gf_general.h"
+
+/* Store the current wall-clock time, expressed in seconds, into *t. */
+void timer_start (double *t)
+{
+    struct timeval  now;
+
+    gettimeofday (&now, NULL);
+    *t = (double)now.tv_sec + (double)now.tv_usec * 1e-6;
+}
+
+/* Return the seconds elapsed since the start time stored in *t (which is not modified). */
+double timer_split (const double *t)
+{
+    struct timeval  now;
+    double  now_s;
+
+    gettimeofday (&now, NULL);
+    now_s = (double)now.tv_sec + (double)now.tv_usec * 1e-6;
+    return now_s - *t;
+}
+/* Print the failure banner followed by the message s on stderr, then exit with status 1. */
+void problem(char *s)
+{
+  fputs("Timing test failed.\n", stderr);
+  fprintf(stderr, "%s\n", s);
+  exit(1);
+}
+
+char *BM = "Bad Method: ";
+/* Print the usage text, then s (gf_error() output when s is the BM sentinel pointer), and exit(1). */
+void usage(char *s)
+{
+  fputs("usage: gf_time w tests seed size(bytes) iterations [method [params]] - does timing\n", stderr);
+  fputs("\n", stderr);
+  fputs("does unit testing in GF(2^w)\n", stderr);
+  fputs("\n", stderr);
+  fputs("Legal w are: 1 - 32, 64 and 128\n", stderr);
+  fputs("\n", stderr);
+  fputs("Tests may be any combination of:\n", stderr);
+  fputs("       A: All\n", stderr);
+  fputs("       S: All Single Operations\n", stderr);
+  fputs("       R: All Region Operations\n", stderr);
+  fputs("       M: Single: Multiplications\n", stderr);
+  fputs("       D: Single: Divisions\n", stderr);
+  fputs("       I: Single: Inverses\n", stderr);
+  fputs("       G: Region: Buffer-Constant Multiplication\n", stderr);
+  fputs("       0: Region: Doing nothing, and bzero()\n", stderr);
+  fputs("       1: Region: Memcpy() and XOR\n", stderr);
+  fputs("       2: Region: Multiplying by two\n", stderr);
+  fputs("\n", stderr);
+  fputs("Use -1 for time(0) as a seed.\n", stderr);
+  fputs("\n", stderr);
+  if (s == BM) {                 /* pointer comparison: BM is used as a sentinel */
+    fputs(BM, stderr);
+    gf_error();
+  } else if (s != NULL) {
+    fprintf(stderr, "%s\n", s);
+  }
+  exit(1);
+}
+/* Entry point: parse the command line, build the field, then time every requested test. */
+int main(int argc, char **argv)
+{
+  int w, it, i, size, iterations, xor;
+  char tests[100];
+  char test;
+  char *single_tests = "MDI";
+  char *region_tests = "G012";
+  char *tstrings[256];
+  void *tmethods[256];
+  gf_t      gf;
+  double timer, elapsed, ds, di, dnum;
+  int num;
+  long t0;          /* seed; a long so that it matches the "%ld" conversions below */
+  uint8_t *ra, *rb;
+  gf_general_t a;
+#ifndef HAVE_POSIX_MEMALIGN
+  uint8_t *malloc_ra, *malloc_rb;
+#endif
+
+  /* argv: w tests seed size iterations [method [params]] */
+  if (argc < 6) usage(NULL);
+
+  if (sscanf(argv[1], "%d", &w) != 1){
+    usage("Bad w[-pp]\n");
+  }
+
+  /* sscanf() yields EOF rather than 0 on empty input, so insist on exactly one conversion. */
+  if (sscanf(argv[3], "%ld", &t0) != 1) usage("Bad seed\n");
+  if (sscanf(argv[4], "%d", &size) != 1) usage("Bad size\n");
+  if (sscanf(argv[5], "%d", &iterations) != 1) usage("Bad iterations\n");
+  if (t0 == -1) t0 = (long) time(0);
+  MOA_Seed(t0);
+
+  ds = size;
+  di = iterations;
+
+  if ((w > 32 && w != 64 && w != 128) || w <= 0) usage("Bad w");  /* w == 0 would divide by zero below */
+  if ((size * 8) % w != 0) usage ("Bad size -- must be a multiple of w*8\n");
+  if (!create_gf_from_argv(&gf, w, argc, argv, 6)) usage(BM);
+
+  strcpy(tests, "");
+  for (i = 0; argv[2][i] != '\0'; i++) {
+    /* One letter appends at most "MDI" + "G012"; refuse specs that would overflow tests[]. */
+    if (strlen(tests) + strlen(single_tests) + strlen(region_tests) >= sizeof(tests)) usage("Bad tests");
+    switch(argv[2][i]) {
+      case 'A': strcat(tests, single_tests);
+                strcat(tests, region_tests); break;
+      case 'S': strcat(tests, single_tests); break;
+      case 'R': strcat(tests, region_tests); break;
+      case 'G': strcat(tests, "G"); break;
+      case '0': strcat(tests, "0"); break;
+      case '1': strcat(tests, "1"); break;
+      case '2': strcat(tests, "2"); break;
+      case 'M': strcat(tests, "M"); break;
+      case 'D': strcat(tests, "D"); break;
+      case 'I': strcat(tests, "I"); break;
+      default: usage("Bad tests");
+    }
+  }
+
+  tstrings['M'] = "Multiply";
+  tstrings['D'] = "Divide";
+  tstrings['I'] = "Inverse";
+  tstrings['G'] = "Region-Random";
+  tstrings['0'] = "Region-By-Zero";
+  tstrings['1'] = "Region-By-One";
+  tstrings['2'] = "Region-By-Two";
+
+  tmethods['M'] = (void *) gf.multiply.w32;
+  tmethods['D'] = (void *) gf.divide.w32;
+  tmethods['I'] = (void *) gf.inverse.w32;
+  tmethods['G'] = (void *) gf.multiply_region.w32;
+  tmethods['0'] = (void *) gf.multiply_region.w32;
+  tmethods['1'] = (void *) gf.multiply_region.w32;
+  tmethods['2'] = (void *) gf.multiply_region.w32;
+
+  printf("Seed: %ld\n", t0);
+
+#ifdef HAVE_POSIX_MEMALIGN
+  if (posix_memalign((void **) &ra, 16, size))
+    ra = NULL;
+  if (posix_memalign((void **) &rb, 16, size))
+    rb = NULL;
+#else
+  malloc_ra = (uint8_t *) malloc(size + 15);
+  malloc_rb = (uint8_t *) malloc(size + 15);
+  ra = (uint8_t *) (((uintptr_t) malloc_ra + 15) & ~((uintptr_t) 0xf));  /* round up to 16 bytes */
+  rb = (uint8_t *) (((uintptr_t) malloc_rb + 15) & ~((uintptr_t) 0xf));
+#endif
+
+  if (ra == NULL || rb == NULL) { perror("malloc"); exit(1); }
+
+  for (i = 0; i < 3; i++) {     /* the three single-operation tests in "MDI" */
+    test = single_tests[i];
+    if (strchr(tests, test) != NULL) {
+      if (tmethods[(int)test] == NULL) {
+        printf("No %s method.\n", tstrings[(int)test]);
+      } else {
+        elapsed = 0;
+        dnum = 0;
+        for (it = 0; it < iterations; it++) {
+          gf_general_set_up_single_timing_test(w, ra, rb, size);
+          timer_start(&timer);
+          num = gf_general_do_single_timing_test(&gf, ra, rb, size, test);
+          dnum += num;
+          elapsed += timer_split(&timer);
+        }
+        printf("%14s:           %10.6lf s   Mops: %10.3lf    %10.3lf Mega-ops/s\n", 
+               tstrings[(int)test], elapsed, 
+               dnum/1024.0/1024.0, dnum/1024.0/1024.0/elapsed);
+      }
+    }
+  }
+
+  for (i = 0; i < 4; i++) {     /* the four region tests in "G012" */
+    test = region_tests[i];
+    if (strchr(tests, test) != NULL) {
+      if (tmethods[(int)test] == NULL) {
+        printf("No %s method.\n", tstrings[(int)test]);
+      } else {
+        if (test == '0') gf_general_set_zero(&a, w);
+        if (test == '1') gf_general_set_one(&a, w);
+        if (test == '2') gf_general_set_two(&a, w);
+
+        for (xor = 0; xor < 2; xor++) {
+          elapsed = 0;
+          for (it = 0; it < iterations; it++) {
+            if (test == 'G') gf_general_set_random(&a, w, 1);
+            gf_general_set_up_single_timing_test(8, ra, rb, size);  /* NOTE(review): w fixed to 8 here - confirm intent */
+            timer_start(&timer);
+            gf_general_do_region_multiply(&gf, &a, ra, rb, size, xor);
+            elapsed += timer_split(&timer);
+          }
+          printf("%14s: XOR: %d    %10.6lf s     MB: %10.3lf    %10.3lf MB/s\n", 
+               tstrings[(int)test], xor, elapsed, 
+               ds*di/1024.0/1024.0, ds*di/1024.0/1024.0/elapsed);
+        }
+      }
+    }
+  }
+  return 0;
+}
diff --git a/src/erasure-code/jerasure/gf-complete/tools/test_simd.sh b/src/erasure-code/jerasure/gf-complete/tools/test_simd.sh
new file mode 100755
index 000000000..e514e4f6f
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/tools/test_simd.sh
@@ -0,0 +1,367 @@
+#!/bin/bash -e
+
+# This script contains a number of tests for SIMD. It can be invoked
+# on the host or on a QEMU machine.
+
+script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+host_cpu=$(uname -m)   # machine name; "uname -p" is not in POSIX and prints "unknown" on many Linux systems
+results=${script_dir}/test_simd.results
+nprocs=$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 1)   # parallel make jobs; falls back to 1 instead of aborting under -e
+
+# Builds, runs "make check" and appends tools/test-suite.log to the results; returns 1 if either step fails.
+test_unit(){
+    { ./configure && make clean && make; } || { echo "Compile FAILED" >> ${results}; return 1; }
+    make -j$nprocs check || { echo "make check FAILED" >> ${results}; cat tools/test-suite.log >> ${results} || true; return 1; }
+    cat tools/test-suite.log >> ${results} || true
+}
+
+# Configure with --enable-debug-func, rebuild, then run gf_methods once for each of
+# 128, 64, 32, 16, 8 and 4, appending its output to the results file.
+test_functions() {
+    failed=0
+
+    ./configure --enable-debug-func && make clean && make || { echo "Compile FAILED" >> ${results}; return 1; }
+    for i in 128 64 32 16 8 4; do
+        ${script_dir}/gf_methods $i -ACD -X >> ${results} || { echo "gf_methods $i FAILED" >> ${results}; failed=$((failed + 1)); }
+    done
+
+    return ${failed}
+}
+
+# build with DEBUG_CPU_FUNCTIONS and save the '#' lines that "gf_methods 32" prints
+test_detection() {
+    failed=0
+    # NOTE: no pipefail is set, so the status tested below is grep's (i.e. "no '#' line was printed").
+    { ./configure --enable-debug-cpu && make clean && make; } || { echo "Compile FAILED" >> ${results}; return 1; }
+    { ${script_dir}/gf_methods 32 -ACD -L | grep '#' >> ${results}; } || { echo "gf_methods 32 FAILED" >> ${results}; ((++failed)); }
+
+    return ${failed}
+}
+
+compile_arm() {
+    failed=0
+
+    echo -n "Compiling with NO SIMD support..." >> ${results}
+    { ./configure --disable-neon && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); }
+
+    echo -n "Compiling with FULL SIMD support..." >> ${results}
+    { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); }
+
+    return ${failed}
+}
+
+compile_intel() {
+    failed=0
+    # The ax_cv_have_*_ext variables are exported before each ./configure run, presumably to pre-seed its SIMD checks.
+    echo -n "Compiling with NO SIMD support..." >> ${results}
+    { ./configure --disable-sse && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); }
+
+    echo -n "Compiling with SSE2 only..." >> ${results}
+    export ax_cv_have_sse_ext=no
+    export ax_cv_have_sse2_ext=yes
+    export ax_cv_have_sse3_ext=no
+    export ax_cv_have_ssse3_ext=no
+    export ax_cv_have_sse41_ext=no
+    export ax_cv_have_sse42_ext=no
+    export ax_cv_have_pclmuldq_ext=no
+    { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); }
+
+    echo -n "Compiling with SSE2,SSE3 only..." >> ${results}
+    export ax_cv_have_sse_ext=no
+    export ax_cv_have_sse2_ext=yes
+    export ax_cv_have_sse3_ext=yes
+    export ax_cv_have_ssse3_ext=no
+    export ax_cv_have_sse41_ext=no
+    export ax_cv_have_sse42_ext=no
+    export ax_cv_have_pclmuldq_ext=no
+    { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); }
+
+    echo -n "Compiling with SSE2,SSE3,SSSE3 only..." >> ${results}
+    export ax_cv_have_sse_ext=no
+    export ax_cv_have_sse2_ext=yes
+    export ax_cv_have_sse3_ext=yes
+    export ax_cv_have_ssse3_ext=yes
+    export ax_cv_have_sse41_ext=no
+    export ax_cv_have_sse42_ext=no
+    export ax_cv_have_pclmuldq_ext=no
+    { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); }
+
+    echo -n "Compiling with SSE2,SSE3,SSSE3,SSE4_1 only..." >> ${results}
+    export ax_cv_have_sse_ext=no
+    export ax_cv_have_sse2_ext=yes
+    export ax_cv_have_sse3_ext=yes
+    export ax_cv_have_ssse3_ext=yes
+    export ax_cv_have_sse41_ext=yes
+    export ax_cv_have_sse42_ext=no
+    export ax_cv_have_pclmuldq_ext=no
+    { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); }
+
+    echo -n "Compiling with SSE2,SSE3,SSSE3,SSE4_2 only..." >> ${results}
+    export ax_cv_have_sse_ext=no
+    export ax_cv_have_sse2_ext=yes
+    export ax_cv_have_sse3_ext=yes
+    export ax_cv_have_ssse3_ext=yes
+    export ax_cv_have_sse41_ext=no
+    export ax_cv_have_sse42_ext=yes
+    export ax_cv_have_pclmuldq_ext=no
+    { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); }
+
+    echo -n "Compiling with FULL SIMD support..." >> ${results}
+    export ax_cv_have_sse_ext=no
+    export ax_cv_have_sse2_ext=yes
+    export ax_cv_have_sse3_ext=yes
+    export ax_cv_have_ssse3_ext=yes
+    export ax_cv_have_sse41_ext=yes
+    export ax_cv_have_sse42_ext=yes
+    export ax_cv_have_pclmuldq_ext=yes
+    { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); }
+
+    return ${failed}
+}
+
+# Test that the source compiles with different SIMD options. We assume that
+# we are running on a processor with full SIMD support.
+test_compile() {
+    case $host_cpu in
+        aarch64*|arm*) compile_arm ;;
+        i[3456]86*|x86_64*|amd64*) compile_intel ;;   # single brackets: "[[3456]]" is m4 quoting and never matches i386..i686 here
+        *) echo "Unsupported CPU: ${host_cpu}" >> ${results}; return 1 ;;   # do not report success when nothing was built
+    esac
+}
+
+# SIMD selected at build time: log gf_methods output for a --disable-neon build, then for a default build, to the file $1.
+runtime_arm_flags() {
+    failed=0
+
+    echo "====NO SIMD support..." >> ${1}
+    ./configure --disable-neon --enable-debug-func && make clean && make || { echo "Compile FAILED" >> ${1}; return 1; }
+    for i in 128 64 32 16 8 4; do
+        ${script_dir}/gf_methods $i -ACD -X >> ${1} || { echo "gf_methods $i FAILED" >> ${1}; failed=$((failed + 1)); }
+    done
+
+    echo "====FULL SIMD support..." >> ${1}
+    ./configure --enable-debug-func && make clean && make || { echo "Compile FAILED" >> ${1}; return 1; }
+    for i in 128 64 32 16 8 4; do
+        ${script_dir}/gf_methods $i -ACD -X >> ${1} || { echo "gf_methods $i FAILED" >> ${1}; failed=$((failed + 1)); }
+    done
+
+    return ${failed}
+}
+
+# One default build; NEON is then switched off and on again through GF_COMPLETE_DISABLE_NEON. Output goes to the file $1.
+runtime_arm_env() {
+    failed=0
+
+    ./configure --enable-debug-func && make clean && make || { echo "Compile FAILED" >> ${1}; return 1; }
+
+    echo "====NO SIMD support..." >> ${1}
+    export GF_COMPLETE_DISABLE_NEON=1
+    for i in 128 64 32 16 8 4; do
+        ${script_dir}/gf_methods $i -ACD -X >> ${1} || { echo "gf_methods $i FAILED" >> ${1}; failed=$((failed + 1)); }
+    done
+
+    echo "====FULL SIMD support..." >> ${1}
+    unset GF_COMPLETE_DISABLE_NEON
+    for i in 128 64 32 16 8 4; do
+        ${script_dir}/gf_methods $i -ACD -X >> ${1} || { echo "gf_methods $i FAILED" >> ${1}; failed=$((failed + 1)); }
+    done
+
+    return ${failed}
+}
+# SIMD selected at build time: rebuild for each SSE level and log the gf_methods output to the file $1.
+runtime_intel_flags() {
+    failed=0
+    echo "====NO SIMD support..." >> ${1}
+    { ./configure --disable-sse --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); }
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    echo "====SSE2 support..." >> ${1}
+    export ax_cv_have_sse_ext=no
+    export ax_cv_have_sse2_ext=yes
+    export ax_cv_have_sse3_ext=no
+    export ax_cv_have_ssse3_ext=no
+    export ax_cv_have_sse41_ext=no
+    export ax_cv_have_sse42_ext=no
+    export ax_cv_have_pclmuldq_ext=no
+    { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); }
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    echo "====SSE2,SSE3 support..." >> ${1}
+    export ax_cv_have_sse_ext=no
+    export ax_cv_have_sse2_ext=yes
+    export ax_cv_have_sse3_ext=yes
+    export ax_cv_have_ssse3_ext=no
+    export ax_cv_have_sse41_ext=no
+    export ax_cv_have_sse42_ext=no
+    export ax_cv_have_pclmuldq_ext=no
+    { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); }
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    echo "====SSE2,SSE3,SSSE3 support..." >> ${1}
+    export ax_cv_have_sse_ext=no
+    export ax_cv_have_sse2_ext=yes
+    export ax_cv_have_sse3_ext=yes
+    export ax_cv_have_ssse3_ext=yes
+    export ax_cv_have_sse41_ext=no
+    export ax_cv_have_sse42_ext=no
+    export ax_cv_have_pclmuldq_ext=no
+    { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); }
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    echo "====SSE2,SSE3,SSSE3,SSE4_1 support..." >> ${1}
+    export ax_cv_have_sse_ext=no
+    export ax_cv_have_sse2_ext=yes
+    export ax_cv_have_sse3_ext=yes
+    export ax_cv_have_ssse3_ext=yes
+    export ax_cv_have_sse41_ext=yes
+    export ax_cv_have_sse42_ext=no
+    export ax_cv_have_pclmuldq_ext=no
+    { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); }
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    echo "====SSE2,SSE3,SSSE3,SSE4_2 support..." >> ${1}
+    export ax_cv_have_sse_ext=no
+    export ax_cv_have_sse2_ext=yes
+    export ax_cv_have_sse3_ext=yes
+    export ax_cv_have_ssse3_ext=yes
+    export ax_cv_have_sse41_ext=no
+    export ax_cv_have_sse42_ext=yes
+    export ax_cv_have_pclmuldq_ext=no
+    { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); }
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    echo "====FULL SIMD support..." >> ${1}
+    unset ax_cv_have_sse_ext ax_cv_have_sse2_ext ax_cv_have_sse3_ext ax_cv_have_ssse3_ext ax_cv_have_sse41_ext ax_cv_have_sse42_ext ax_cv_have_pclmuldq_ext   # drop the overrides so this (and any later) configure detects everything again
+    { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); }
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    return ${failed}
+}
+# SIMD selected at run time: one build, then gf_methods is run under different GF_COMPLETE_DISABLE_* settings; output goes to the file $1.
+runtime_intel_env() {
+    failed=0
+
+    # compile a build with full SIMD support
+    { ./configure --enable-debug-func && make clean && make; } || { echo "Compile FAILED" >> ${1}; return 1; }
+
+    echo "====NO SIMD support..." >> ${1}
+    export GF_COMPLETE_DISABLE_SSE2=1
+    export GF_COMPLETE_DISABLE_SSE3=1
+    export GF_COMPLETE_DISABLE_SSSE3=1
+    export GF_COMPLETE_DISABLE_SSE4=1
+    export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    echo "====SSE2 support..." >> ${1}
+    unset GF_COMPLETE_DISABLE_SSE2
+    export GF_COMPLETE_DISABLE_SSE3=1
+    export GF_COMPLETE_DISABLE_SSSE3=1
+    export GF_COMPLETE_DISABLE_SSE4=1
+    export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    echo "====SSE2,SSE3 support..." >> ${1}
+    unset GF_COMPLETE_DISABLE_SSE2
+    unset GF_COMPLETE_DISABLE_SSE3
+    export GF_COMPLETE_DISABLE_SSSE3=1
+    export GF_COMPLETE_DISABLE_SSE4=1
+    export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    echo "====SSE2,SSE3,SSSE3 support..." >> ${1}
+    unset GF_COMPLETE_DISABLE_SSE2
+    unset GF_COMPLETE_DISABLE_SSE3
+    unset GF_COMPLETE_DISABLE_SSSE3
+    export GF_COMPLETE_DISABLE_SSE4=1
+    export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    echo "====SSE2,SSE3,SSSE3,SSE4_1 support..." >> ${1}
+    unset GF_COMPLETE_DISABLE_SSE2
+    unset GF_COMPLETE_DISABLE_SSE3
+    unset GF_COMPLETE_DISABLE_SSSE3
+    unset GF_COMPLETE_DISABLE_SSE4
+    export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    echo "====SSE2,SSE3,SSSE3,SSE4_2 support..." >> ${1}   # same environment as the SSE4_1 section: only one SSE4 switch is used here
+    unset GF_COMPLETE_DISABLE_SSE2
+    unset GF_COMPLETE_DISABLE_SSE3
+    unset GF_COMPLETE_DISABLE_SSSE3
+    unset GF_COMPLETE_DISABLE_SSE4
+    export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    echo "====FULL SIMD support..." >> ${1}
+    unset GF_COMPLETE_DISABLE_SSE2
+    unset GF_COMPLETE_DISABLE_SSE3
+    unset GF_COMPLETE_DISABLE_SSSE3
+    unset GF_COMPLETE_DISABLE_SSE4
+    unset GF_COMPLETE_DISABLE_SSE4_PCLMUL
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    return ${failed}   # number of gf_methods runs that failed
+}
+
+test_runtime() {
+    rm -f ${results}.left
+    rm -f ${results}.right
+    # left = SIMD chosen at build time (configure flags), right = SIMD disabled at run time (environment); both logs must match
+    case $host_cpu in
+        aarch64*|arm*)
+            runtime_arm_flags ${results}.left
+            runtime_arm_env ${results}.right
+            ;;
+        i[3456]86*|x86_64*|amd64*)   # single brackets: "[[3456]]" is m4 quoting and never matches i386..i686 here
+            runtime_intel_flags ${results}.left
+            runtime_intel_env ${results}.right
+            ;;
+    esac
+
+    echo "======LEFT======" > ${results}
+    cat ${results}.left >> ${results}
+    echo "======RIGHT======" >> ${results}
+    cat ${results}.right >> ${results}
+    echo "======RESULT======" >> ${results}
+    if diff "${results}.left" "${results}.right"; then
+        echo SUCCESS >> ${results}
+        return 0
+    else
+        echo FAILED >> ${results}
+        return 1
+    fi
+}
+
+cd ${script_dir}/..   # work from the parent of the directory that holds this script
+rm -f ${results}   # start with a fresh results file
+
+test_$1   # dispatch on the first argument, e.g. "unit" runs test_unit
+exit $?
diff --git a/src/erasure-code/jerasure/gf-complete/tools/test_simd_qemu.sh b/src/erasure-code/jerasure/gf-complete/tools/test_simd_qemu.sh
new file mode 100755
index 000000000..5771874f7
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/tools/test_simd_qemu.sh
@@ -0,0 +1,258 @@
+#!/bin/bash -e
+
+# This script will use QEMU to test gf-complete especially SIMD support
+# on different architectures and cpus. It will boot a qemu machine 
+# and run an Ubuntu cloud image. All testing will happen inside the 
+# QEMU machine. 
+
+# The following packages are required:
+#   qemu-system-aarch64
+#   qemu-system-arm
+#   qemu-system-x86_64
+#   genisoimage
+
+
+script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+qemu_dir="${script_dir}/.qemu"   # work directory: downloaded images, ssh key pair, config disk, console log
+ssh_port=2222   # host port that is forwarded to port 22 of the guest
+ssh_pubkey_file="${qemu_dir}/qemu.pub"
+ssh_key_file="${qemu_dir}/qemu"
+
+mkdir -p "${qemu_dir}"
+
+cleanup() {
+    if [[ -n "$(jobs -p)" ]]; then
+        echo killing qemu processes "$(jobs -p)"
+        kill $(jobs -p)
+    fi
+}
+
+trap cleanup EXIT
+
+start_qemu() {
+    # usage: start_qemu <arch> <cpu> -- fetch the cloud image, boot QEMU as a background job, wait until SSH answers
+    arch=$1
+    cpu=$2
+    image_version="xenial"
+    image_url_base="http://cloud-images.ubuntu.com/${image_version}/current"
+
+    case $arch in
+        i[3456]86*|x86_64*|amd64*)
+            image_kernel="${image_version}-server-cloudimg-amd64-vmlinuz-generic"
+            image_initrd="${image_version}-server-cloudimg-amd64-initrd-generic"
+            image_disk="${image_version}-server-cloudimg-amd64-disk1.img"
+            ;;
+        aarch64*)
+            image_kernel="${image_version}-server-cloudimg-arm64-vmlinuz-generic"
+            image_initrd="${image_version}-server-cloudimg-arm64-initrd-generic"
+            image_disk="${image_version}-server-cloudimg-arm64-disk1.img"
+            ;;
+        arm*)
+            image_kernel="${image_version}-server-cloudimg-armhf-vmlinuz-lpae"
+            image_initrd="${image_version}-server-cloudimg-armhf-initrd-generic-lpae"
+            image_disk="${image_version}-server-cloudimg-armhf-disk1.img"
+            ;;
+        *) echo "Unsupported arch: ${arch}" >&2; exit 1 ;;   # no "die" helper exists in this script
+    esac
+
+    [[ -f ${qemu_dir}/${image_kernel} ]] || wget -O ${qemu_dir}/${image_kernel} ${image_url_base}/unpacked/${image_kernel}
+    [[ -f ${qemu_dir}/${image_initrd} ]] || wget -O ${qemu_dir}/${image_initrd} ${image_url_base}/unpacked/${image_initrd}
+    [[ -f ${qemu_dir}/${image_disk} ]] || wget -O ${qemu_dir}/${image_disk} ${image_url_base}/${image_disk}
+
+    #create a delta disk to keep the original image clean
+    delta_disk="${qemu_dir}/disk.img"
+    rm -f ${delta_disk}
+    qemu-img create -q -f qcow2 -b "${qemu_dir}/${image_disk}" ${delta_disk}
+
+    # generate an ssh key pair, unless one already exists
+    [[ -f ${ssh_pubkey_file} ]] || ssh-keygen -q -N "" -f ${ssh_key_file}
+
+    # create a config disk to set the SSH keys
+    cat > "${qemu_dir}/meta-data" <<EOF
+instance-id: qemu
+local-hostname: qemu
+EOF
+    cat > "${qemu_dir}/user-data" <<EOF
+#cloud-config
+hostname: qemu
+manage_etc_hosts: true
+users:
+  - name: qemu
+    ssh-authorized-keys:
+      - $(cat "${ssh_pubkey_file}")
+    sudo: ['ALL=(ALL) NOPASSWD:ALL']
+    groups: sudo
+    shell: /bin/bash
+EOF
+    genisoimage -quiet -output "${qemu_dir}/cloud.iso" -volid cidata -joliet -rock "${qemu_dir}/user-data" "${qemu_dir}/meta-data"
+
+    common_args=( \
+        -name "qemu" \
+        -m 1024 \
+        -nodefaults \
+        -nographic \
+        -kernel ${qemu_dir}/${image_kernel} \
+        -initrd ${qemu_dir}/${image_initrd} \
+        -cdrom ${qemu_dir}/cloud.iso \
+        -serial file:${qemu_dir}/console.log
+    )
+
+    case $arch in
+        i[3456]86*|x86_64*|amd64*)
+            qemu-system-x86_64 \
+                "${common_args[@]}" \
+                -machine accel=kvm -cpu $cpu \
+                -append "console=ttyS0 root=/dev/sda1" \
+                -hda "${delta_disk}" \
+                -net nic,vlan=0,model=virtio \
+                -net user,vlan=0,hostfwd=tcp::"${ssh_port}"-:22,hostname="${vm_name:-qemu}" \
+            &
+        ;;
+        aarch64*|arm*)
+            qemu-system-$arch \
+                "${common_args[@]}" \
+                -machine virt -cpu $cpu -machine type=virt -smp 1 \
+                -drive if=none,file="${delta_disk}",id=hd0 \
+                -device virtio-blk-device,drive=hd0 \
+                -append "console=ttyAMA0 root=/dev/vda1" \
+                -netdev user,id=eth0,hostfwd=tcp::"${ssh_port}"-:22,hostname="${vm_name:-qemu}" \
+                -device virtio-net-device,netdev=eth0 \
+                &
+        ;;
+        *) echo "Unsupported arch: ${arch}" >&2; exit 1 ;;
+    esac
+
+    wait_for_ssh
+}
+
+stop_qemu() {   # ask the guest to shut down, then wait for the background job(s) to finish
+    run_ssh "sudo shutdown now" || true   # a failure is ignored on purpose (presumably the connection drops during shutdown)
+    wait $(jobs -p)
+}
+
+shared_args=(   # options common to ssh and scp
+    -i "${ssh_key_file}"
+    -F /dev/null
+    -o BatchMode=yes
+    -o UserKnownHostsFile=/dev/null
+    -o StrictHostKeyChecking=no
+    -o IdentitiesOnly=yes
+)
+
+ssh_args=(   # ssh takes the port as -p; run_scp passes -P itself
+    "${shared_args[@]}"
+    -p "${ssh_port}"
+)
+
+wait_for_ssh() {
+    retries=0
+    retry_count=50
+
+    echo "waiting for machine to come up."
+    echo "tail -F ${qemu_dir}/console.log for progress."
+
+    while true; do
+        set +e
+        ssh -q ${ssh_args[*]} -o ConnectTimeout=1 qemu@localhost "echo done"
+        error=$?
+        set -e
+        if [[ $error == 0 ]]; then 
+            return 0
+        fi
+
+        if [[ ${retries} == ${retry_count} ]]; then
+            echo "timeout"
+            return 1
+        fi
+
+        echo -n "."
+        ((++retries))
+        sleep 10
+    done
+}
+
+run_ssh() {   # run the given command on the guest as user qemu
+    ssh -q "${ssh_args[@]}" qemu@localhost "$@"
+}
+
+run_scp() {   # scp with the shared options; the arguments are the source(s) and the destination
+    scp -q "${shared_args[@]}" -P "${ssh_port}" "$@"
+}
+
+rsync_args=(   # extra rsync options: leave out the .qemu work directory and the .git directory
+    --exclude '.qemu'
+    --exclude '.git'
+)
+
+run_rsync() {   # rsync over ssh (same options as run_ssh); the arguments are the source(s) and the destination
+    rsync -avz -e "ssh ${ssh_args[*]}" "${rsync_args[@]}" "$@"
+}
+
+init_machine() {   # install the build tools on the guest; refresh the package index first so a fresh image can resolve them
+    run_ssh "sudo apt-get -y update && sudo apt-get -y install --no-install-recommends make gcc autoconf libtool automake"
+}
+
+init_machine_and_copy_source() {   # prepare the guest, copy the source tree to ~/gf-complete and run autogen.sh there
+    init_machine
+    run_ssh "rm -fr ~/gf-complete; mkdir -p ~/gf-complete"   # start from an empty target directory
+    run_rsync ${script_dir}/.. qemu@localhost:gf-complete
+    run_ssh "cd ~/gf-complete && ./autogen.sh"
+}
+
+run_test() {
+    arch=$1 cpu=$2 test=$3
+    # Callers use run_test inside "&&"/"||" lists, where errexit is ignored: keep the test's status explicitly.
+    local rc=0
+    run_ssh "~/gf-complete/tools/test_simd.sh ${test}" || rc=$?
+    run_scp qemu@localhost:gf-complete/tools/test_simd.results ${script_dir}/test_simd_${test}_${arch}_${cpu}.results || rc=$?
+    return ${rc}
+}
+
+# this test run the unit tests on the machine using "make check"
+run_test_simd_basic() {
+    arch=$1; shift
+    cpu=$1; shift
+
+    failed=0
+
+    echo "=====starting qemu machine $arch $cpu"
+    start_qemu $arch $cpu
+    init_machine_and_copy_source
+    echo "=====running compile test"
+    { run_test $arch $cpu "compile" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); }
+    echo "=====running unit test"
+    { run_test $arch $cpu "unit" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); }
+    echo "=====running functions test"
+    { run_test $arch $cpu "functions" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); }
+    echo "=====running detection test"
+    { run_test $arch $cpu "detection" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); }
+    echo "=====running runtime test"
+    { run_test $arch $cpu "runtime" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); }
+    stop_qemu
+
+    return ${failed}
+}
+
+run_all_tests() {   # run the basic SIMD test set once per architecture
+    failed=0
+    # every failing architecture bumps the counter, which becomes the return status
+    echo ============================
+    echo =====running x86_64 tests
+    # NOTE: Broadwell has all the supported SIMD instructions
+    { run_test_simd_basic "x86_64" "Broadwell" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); }
+    
+    echo ============================
+    echo =====running aarch64 tests
+    # NOTE: cortex-a57 has ASIMD support
+    { run_test_simd_basic "aarch64" "cortex-a57" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); }
+    
+    echo ============================
+    echo =====running arm tests
+    # NOTE: cortex-a15 has NEON support
+    { run_test_simd_basic "arm" "cortex-a15" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); }
+
+    return ${failed}
+}
+
+run_all_tests   # with the -e from the shebang line a non-zero return already ends the script here
+exit $?
diff --git a/src/erasure-code/jerasure/gf-complete/tools/time_tool.sh b/src/erasure-code/jerasure/gf-complete/tools/time_tool.sh
new file mode 100644
index 000000000..7b165e178
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/tools/time_tool.sh
@@ -0,0 +1,98 @@
+#!/bin/sh
+# time_tool.sh - Shell script to test various timings.
+# This is a rough tester -- its job is to work quickly rather than precisely.
+# (Jim Plank)
+# usage: sh time_tool.sh M|D|R|B w method
+# M and D time single multiplications and divisions; R and B time regions over
+# doubling buffer sizes (R prints every size, both print the best). w and the
+# method words are handed to ./gf_time, which is run from the current directory.
+
+usage() { echo 'usage sh time_tool.sh M|D|R|B w method' >&2; }
+
+# measure OP TIME_FIELD SCALE SPEED_FIELD
+# Runs ./gf_time for OP, doubling the iteration count until TIME_FIELD * SCALE
+# truncates to at least 1, then stores SPEED_FIELD of that final run in $speed.
+measure() {
+  iter=1
+  while :; do
+    c1=$(./gf_time "$w" "$1" -1 "$bsize" "$iter" $method)
+    # No output means every timing reads as 0 and this loop would never end.
+    if [ -z "$c1" ]; then
+      echo 'time_tool.sh: ./gf_time produced no output' >&2
+      exit 1
+    fi
+    # $c1 is left unquoted so multi-line output collapses into one awk record.
+    t=$(echo $c1 | awk -v f="$2" -v k="$3" '{ printf "%d\n", $f*k }')
+    [ "$t" -lt 1 ] || break
+    iter=$((iter * 2))
+  done
+  speed=$(echo $c1 | awk -v f="$4" '{ print $f }')
+}
+
+if [ $# -lt 3 ]; then
+  usage
+  exit 1
+fi
+
+op=$1
+w=$2
+shift 2
+method="$*"   # expanded unquoted below so it splits back into words
+
+# 'case' rather than '[ $op != M -a ... ]': with an empty op that test is a
+# syntax error, which 'if' would silently treat as "op is valid".
+case "$op" in
+  M|D|R|B) ;;
+  *)
+    usage
+    echo 'You have to specify a test: ' >&2
+    echo '  M=Multiplication' >&2
+    echo '  D=Division' >&2
+    echo '  R=Regions' >&2
+    echo '  B=Best-Region' >&2
+    exit 1
+    ;;
+esac
+
+# fac is w with every factor of two divided out; it scales the 16K base buffer.
+fac=$(echo "$w" | awk '{ n = $1; while (n != 0 && n%2==0) n /= 2; print n }')
+if [ "$fac" -eq 0 ]; then
+  usage
+  echo 'Bad w' >&2
+  exit 1
+fi
+bsize=$(echo 16384 "$fac" | awk '{ print $1 * $2 }')
+
+# More than two lines from a trial run is taken as gf_time rejecting the method.
+if [ "$(./gf_time "$w" M -1 "$bsize" 1 $method 2>&1 | wc | awk '{ print $1 }')" -gt 2 ]; then
+  usage
+  echo 'Bad method' >&2
+  exit 1
+fi
+
+# First, use a 16K buffer to test the performance of single multiplies.
+if [ "$op" = M ] || [ "$op" = D ]; then
+  measure "$op" 4 100 8
+  echo "$op" "$speed" | awk '{ printf "%s speed (MB/s): %8.2lf   W-Method: ", $1, $2 }'
+  echo $w $method
+  exit 0
+fi
+
+# Region tests: keep doubling the buffer up to 4M and remember the best speed.
+best=0
+while [ "$bsize" -le 4194304 ]; do
+  measure G 6 500 10
+  if [ "$bsize" -lt 1048576 ]; then
+    str=$(echo "$bsize" | awk '{ printf "%3dK\n", $1/1024 }')
+  else
+    str=$(echo "$bsize" | awk '{ printf "%3dM\n", $1/1024/1024 }')
+  fi
+  if [ "$op" = R ]; then
+    echo $str $speed | awk '{ printf "Region Buffer-Size: %4s (MB/s): %8.2lf   W-Method: ", $1, $2 }'
+    echo $w $method
+  fi
+  best=$(echo "$best" "$speed" | awk '{ print ($1 > $2) ? $1 : $2 }')
+  bsize=$(echo "$bsize" | awk '{ print $1 * 2 }')
+done
+echo "$best" | awk '{ printf "Region Best (MB/s): %8.2lf   W-Method: ", $1 }'
+echo $w $method