From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/erasure-code/jerasure/gf-complete/.gitignore | 78 + src/erasure-code/jerasure/gf-complete/AUTHORS | 0 src/erasure-code/jerasure/gf-complete/COPYING | 32 + src/erasure-code/jerasure/gf-complete/ChangeLog | 0 src/erasure-code/jerasure/gf-complete/License.txt | 32 + src/erasure-code/jerasure/gf-complete/Makefile.am | 10 + src/erasure-code/jerasure/gf-complete/NEWS | 0 src/erasure-code/jerasure/gf-complete/README | 21 + src/erasure-code/jerasure/gf-complete/README.txt | 21 + src/erasure-code/jerasure/gf-complete/autogen.sh | 2 + src/erasure-code/jerasure/gf-complete/configure.ac | 87 + .../jerasure/gf-complete/examples/Makefile.am | 37 + .../jerasure/gf-complete/examples/gf_example_1.c | 58 + .../jerasure/gf-complete/examples/gf_example_2.c | 107 + .../jerasure/gf-complete/examples/gf_example_3.c | 74 + .../jerasure/gf-complete/examples/gf_example_4.c | 69 + .../jerasure/gf-complete/examples/gf_example_5.c | 78 + .../jerasure/gf-complete/examples/gf_example_6.c | 84 + .../jerasure/gf-complete/examples/gf_example_7.c | 75 + .../jerasure/gf-complete/include/gf_complete.h | 204 ++ .../jerasure/gf-complete/include/gf_cpu.h | 20 + .../jerasure/gf-complete/include/gf_general.h | 61 + .../jerasure/gf-complete/include/gf_int.h | 216 ++ .../jerasure/gf-complete/include/gf_method.h | 20 + .../jerasure/gf-complete/include/gf_rand.h | 22 + .../jerasure/gf-complete/include/gf_w16.h | 66 + .../jerasure/gf-complete/include/gf_w32.h | 71 + .../jerasure/gf-complete/include/gf_w4.h | 63 + .../jerasure/gf-complete/include/gf_w64.h | 50 + .../jerasure/gf-complete/include/gf_w8.h | 99 + .../gf-complete/m4/ax_check_compile_flag.m4 | 72 + src/erasure-code/jerasure/gf-complete/m4/ax_ext.m4 | 75 + .../jerasure/gf-complete/manual/gf-complete.html | 3484 ++++++++++++++++++++ .../jerasure/gf-complete/manual/image1.png | Bin 0 -> 32752 bytes .../jerasure/gf-complete/manual/image2.png | Bin 0 -> 28317 bytes .../jerasure/gf-complete/manual/image3.png | Bin 0 -> 42222 bytes .../jerasure/gf-complete/manual/image4.png | Bin 0 -> 26886 bytes .../jerasure/gf-complete/manual/image5.png | Bin 0 -> 51457 bytes .../jerasure/gf-complete/manual/image6.png | Bin 0 -> 47306 bytes .../jerasure/gf-complete/manual/image7.png | Bin 0 -> 21312 bytes .../jerasure/gf-complete/manual/style.css | 404 +++ .../jerasure/gf-complete/src/Makefile.am | 32 + src/erasure-code/jerasure/gf-complete/src/gf.c | 1090 ++++++ src/erasure-code/jerasure/gf-complete/src/gf_cpu.c | 180 + .../jerasure/gf-complete/src/gf_general.c | 539 +++ .../jerasure/gf-complete/src/gf_method.c | 193 ++ .../jerasure/gf-complete/src/gf_rand.c | 80 + .../jerasure/gf-complete/src/gf_w128.c | 1776 ++++++++++ src/erasure-code/jerasure/gf-complete/src/gf_w16.c | 2449 ++++++++++++++ src/erasure-code/jerasure/gf-complete/src/gf_w32.c | 2810 ++++++++++++++++ src/erasure-code/jerasure/gf-complete/src/gf_w4.c | 2047 ++++++++++++ src/erasure-code/jerasure/gf-complete/src/gf_w64.c | 2235 +++++++++++++ src/erasure-code/jerasure/gf-complete/src/gf_w8.c | 2398 ++++++++++++++ .../jerasure/gf-complete/src/gf_wgen.c | 1019 ++++++ .../jerasure/gf-complete/src/neon/gf_w16_neon.c | 276 ++ .../jerasure/gf-complete/src/neon/gf_w32_neon.c | 269 ++ .../jerasure/gf-complete/src/neon/gf_w4_neon.c | 247 ++ .../jerasure/gf-complete/src/neon/gf_w64_neon.c | 333 ++ 
.../jerasure/gf-complete/src/neon/gf_w8_neon.c | 302 ++ .../jerasure/gf-complete/test/Makefile.am | 11 + .../jerasure/gf-complete/test/gf_unit.c | 458 +++ .../jerasure/gf-complete/tools/Makefile.am | 56 + .../jerasure/gf-complete/tools/gf_add.c | 114 + .../jerasure/gf-complete/tools/gf_div.c | 68 + .../jerasure/gf-complete/tools/gf_inline_time.c | 170 + .../jerasure/gf-complete/tools/gf_methods.c | 246 ++ .../jerasure/gf-complete/tools/gf_mult.c | 68 + .../jerasure/gf-complete/tools/gf_poly.c | 275 ++ .../jerasure/gf-complete/tools/gf_time.c | 232 ++ .../jerasure/gf-complete/tools/test_simd.sh | 367 +++ .../jerasure/gf-complete/tools/test_simd_qemu.sh | 258 ++ .../jerasure/gf-complete/tools/time_tool.sh | 98 + 72 files changed, 26488 insertions(+) create mode 100644 src/erasure-code/jerasure/gf-complete/.gitignore create mode 100644 src/erasure-code/jerasure/gf-complete/AUTHORS create mode 100644 src/erasure-code/jerasure/gf-complete/COPYING create mode 100644 src/erasure-code/jerasure/gf-complete/ChangeLog create mode 100644 src/erasure-code/jerasure/gf-complete/License.txt create mode 100644 src/erasure-code/jerasure/gf-complete/Makefile.am create mode 100644 src/erasure-code/jerasure/gf-complete/NEWS create mode 100644 src/erasure-code/jerasure/gf-complete/README create mode 100644 src/erasure-code/jerasure/gf-complete/README.txt create mode 100755 src/erasure-code/jerasure/gf-complete/autogen.sh create mode 100644 src/erasure-code/jerasure/gf-complete/configure.ac create mode 100644 src/erasure-code/jerasure/gf-complete/examples/Makefile.am create mode 100644 src/erasure-code/jerasure/gf-complete/examples/gf_example_1.c create mode 100644 src/erasure-code/jerasure/gf-complete/examples/gf_example_2.c create mode 100644 src/erasure-code/jerasure/gf-complete/examples/gf_example_3.c create mode 100644 src/erasure-code/jerasure/gf-complete/examples/gf_example_4.c create mode 100644 src/erasure-code/jerasure/gf-complete/examples/gf_example_5.c create mode 100644 src/erasure-code/jerasure/gf-complete/examples/gf_example_6.c create mode 100644 src/erasure-code/jerasure/gf-complete/examples/gf_example_7.c create mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_complete.h create mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_cpu.h create mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_general.h create mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_int.h create mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_method.h create mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_rand.h create mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_w16.h create mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_w32.h create mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_w4.h create mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_w64.h create mode 100644 src/erasure-code/jerasure/gf-complete/include/gf_w8.h create mode 100644 src/erasure-code/jerasure/gf-complete/m4/ax_check_compile_flag.m4 create mode 100644 src/erasure-code/jerasure/gf-complete/m4/ax_ext.m4 create mode 100644 src/erasure-code/jerasure/gf-complete/manual/gf-complete.html create mode 100644 src/erasure-code/jerasure/gf-complete/manual/image1.png create mode 100644 src/erasure-code/jerasure/gf-complete/manual/image2.png create mode 100644 src/erasure-code/jerasure/gf-complete/manual/image3.png create mode 100644 src/erasure-code/jerasure/gf-complete/manual/image4.png create mode 100644 
src/erasure-code/jerasure/gf-complete/manual/image5.png create mode 100644 src/erasure-code/jerasure/gf-complete/manual/image6.png create mode 100644 src/erasure-code/jerasure/gf-complete/manual/image7.png create mode 100644 src/erasure-code/jerasure/gf-complete/manual/style.css create mode 100644 src/erasure-code/jerasure/gf-complete/src/Makefile.am create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_cpu.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_general.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_method.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_rand.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_w128.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_w16.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_w32.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_w4.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_w64.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_w8.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/gf_wgen.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/neon/gf_w16_neon.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/neon/gf_w32_neon.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/neon/gf_w64_neon.c create mode 100644 src/erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c create mode 100644 src/erasure-code/jerasure/gf-complete/test/Makefile.am create mode 100644 src/erasure-code/jerasure/gf-complete/test/gf_unit.c create mode 100644 src/erasure-code/jerasure/gf-complete/tools/Makefile.am create mode 100644 src/erasure-code/jerasure/gf-complete/tools/gf_add.c create mode 100644 src/erasure-code/jerasure/gf-complete/tools/gf_div.c create mode 100644 src/erasure-code/jerasure/gf-complete/tools/gf_inline_time.c create mode 100644 src/erasure-code/jerasure/gf-complete/tools/gf_methods.c create mode 100644 src/erasure-code/jerasure/gf-complete/tools/gf_mult.c create mode 100644 src/erasure-code/jerasure/gf-complete/tools/gf_poly.c create mode 100644 src/erasure-code/jerasure/gf-complete/tools/gf_time.c create mode 100755 src/erasure-code/jerasure/gf-complete/tools/test_simd.sh create mode 100755 src/erasure-code/jerasure/gf-complete/tools/test_simd_qemu.sh create mode 100644 src/erasure-code/jerasure/gf-complete/tools/time_tool.sh (limited to 'src/erasure-code/jerasure/gf-complete') diff --git a/src/erasure-code/jerasure/gf-complete/.gitignore b/src/erasure-code/jerasure/gf-complete/.gitignore new file mode 100644 index 000000000..bfc1dfc10 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/.gitignore @@ -0,0 +1,78 @@ +Makefile +Makefile.in +/autom4te.cache +/aclocal.m4 +/compile +/configure +/depcomp +/install-sh +/missing +include/config.h +include/config.h.in +include/config.h.in~ +include/stamp-h1 + +# Object files +*.o +*.ko +*.obj +*.elf + +# Libraries +*.lib +*.la +*.a + +# Shared objects (inc. 
Windows DLLs) +*.dll +*.lo +*.so +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + +# Other stuff +.deps/ +.libs/ +/config.log +/config.status +/libtool +INSTALL +config.guess +config.sub +ltmain.sh +m4/libtool.m4 +m4/ltversion.m4 +m4/ltoptions.m4 +m4/ltsugar.m4 +m4/lt~obsolete.m4 +test-driver +src/.dirstamp +test-driver + +examples/gf_example_1 +examples/gf_example_2 +examples/gf_example_3 +examples/gf_example_4 +examples/gf_example_5 +examples/gf_example_6 +examples/gf_example_7 +test/gf_unit +tools/gf_add +tools/gf_div +tools/gf_inline_time +tools/gf_methods +tools/gf_mult +tools/gf_poly +tools/gf_time +tools/gf_unit_w* +tools/test-suite.log +tools/.qemu/ +tools/test_simd*.results* diff --git a/src/erasure-code/jerasure/gf-complete/AUTHORS b/src/erasure-code/jerasure/gf-complete/AUTHORS new file mode 100644 index 000000000..e69de29bb diff --git a/src/erasure-code/jerasure/gf-complete/COPYING b/src/erasure-code/jerasure/gf-complete/COPYING new file mode 100644 index 000000000..df8d9ed33 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/COPYING @@ -0,0 +1,32 @@ +Copyright (c) 2013, James S. Plank, Ethan L. Miller, Kevin M. Greenan, +Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + - Neither the name of the University of Tennessee nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. diff --git a/src/erasure-code/jerasure/gf-complete/ChangeLog b/src/erasure-code/jerasure/gf-complete/ChangeLog new file mode 100644 index 000000000..e69de29bb diff --git a/src/erasure-code/jerasure/gf-complete/License.txt b/src/erasure-code/jerasure/gf-complete/License.txt new file mode 100644 index 000000000..df8d9ed33 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/License.txt @@ -0,0 +1,32 @@ +Copyright (c) 2013, James S. Plank, Ethan L. Miller, Kevin M. Greenan, +Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + - Neither the name of the University of Tennessee nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. diff --git a/src/erasure-code/jerasure/gf-complete/Makefile.am b/src/erasure-code/jerasure/gf-complete/Makefile.am new file mode 100644 index 000000000..cfb293a15 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/Makefile.am @@ -0,0 +1,10 @@ +# Top-level GF-Complete AM file +# Distributes headers + +SUBDIRS = src tools test examples +ACLOCAL_AMFLAGS = -I m4 + +include_HEADERS = include/gf_complete.h include/gf_method.h include/gf_rand.h include/gf_general.h + +# display the output of failed TESTS after a failed make check +export VERBOSE = true diff --git a/src/erasure-code/jerasure/gf-complete/NEWS b/src/erasure-code/jerasure/gf-complete/NEWS new file mode 100644 index 000000000..e69de29bb diff --git a/src/erasure-code/jerasure/gf-complete/README b/src/erasure-code/jerasure/gf-complete/README new file mode 100644 index 000000000..7fd2f0494 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/README @@ -0,0 +1,21 @@ +This is GF-Complete, Revision 1.03. January 1, 2015. + +Authors: James S. Plank (University of Tennessee) + Ethan L. Miller (UC Santa Cruz) + Kevin M. Greenan (Box) + Benjamin A. Arnold (University of Tennessee) + John A. Burnum (University of Tennessee) + Adam W. Disney (University of Tennessee, + Allen C. McBride (University of Tennessee) + +The user's manual is in the file Manual.pdf. + +The online home for GF-Complete is: + + - https://jerasure.org/jerasure/gf-complete + +To compile, do: + + ./configure + make + sudo make install diff --git a/src/erasure-code/jerasure/gf-complete/README.txt b/src/erasure-code/jerasure/gf-complete/README.txt new file mode 100644 index 000000000..cd2d66e1e --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/README.txt @@ -0,0 +1,21 @@ +This is GF-Complete, Revision 1.03. January 1, 2015. + +Authors: James S. Plank (University of Tennessee) + Ethan L. Miller (UC Santa Cruz) + Kevin M. Greenan (Box) + Benjamin A. Arnold (University of Tennessee) + John A. Burnum (University of Tennessee) + Adam W. Disney (University of Tennessee, + Allen C. 
McBride (University of Tennessee) + +The user's manual is in the file Manual.pdf. + +The online home for GF-Complete is: + + - http://jerasure.org/jerasure/gf-complete + +To compile, do: + + ./configure + make + sudo make install diff --git a/src/erasure-code/jerasure/gf-complete/autogen.sh b/src/erasure-code/jerasure/gf-complete/autogen.sh new file mode 100755 index 000000000..b483139f9 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/autogen.sh @@ -0,0 +1,2 @@ +#!/bin/sh +autoreconf --force --install -I m4 diff --git a/src/erasure-code/jerasure/gf-complete/configure.ac b/src/erasure-code/jerasure/gf-complete/configure.ac new file mode 100644 index 000000000..d696f6eb0 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/configure.ac @@ -0,0 +1,87 @@ +# gf-complete autoconf template + +# FIXME - add project url as the last argument +AC_INIT(gf-complete, 1.0) + +# Override default CFLAGS +: ${CFLAGS="-Wall -Wpointer-arith -O3 -g"} + +AC_PREREQ([2.61]) + +AM_INIT_AUTOMAKE([no-dependencies foreign parallel-tests]) +LT_INIT # libtool + +AC_CONFIG_HEADER(include/config.h) + +dnl Needed when reconfiguring with 'autoreconf -i -s' +AC_CONFIG_MACRO_DIR([m4]) + +# This prevents './configure; make' from trying to run autotools. +AM_MAINTAINER_MODE([disable]) + +dnl Compiling with per-target flags requires AM_PROG_CC_C_O. +AC_PROG_CC + +# Check for functions to provide aligned memory +# +AC_CHECK_FUNCS([posix_memalign], + [found_memalign=yes; break]) + +AS_IF([test "x$found_memalign" != "xyes"], [AC_MSG_WARN([No function for aligned memory allocation found])]) + +AC_ARG_ENABLE([debug-functions], + AS_HELP_STRING([--enable-debug-func], [Enable debugging of functions selected])) +AS_IF([test "x$enable_debug_func" = "xyes"], [CPPFLAGS="$CPPFLAGS -DDEBUG_FUNCTIONS"]) + +AC_ARG_ENABLE([debug-cpu], + AS_HELP_STRING([--enable-debug-cpu], [Enable debugging of SIMD detection])) +AS_IF([test "x$enable_debug_cpu" = "xyes"], [CPPFLAGS="$CPPFLAGS -DDEBUG_CPU_DETECTION"]) + +AX_EXT() + +AC_ARG_ENABLE([neon], + AS_HELP_STRING([--disable-neon], [Build without NEON optimizations])) + +AS_IF([test "x$enable_neon" != "xno"], + [noneon_CPPFLAGS=$CPPFLAGS + CPPFLAGS="$CPPFLAGS $SIMD_FLAGS" + AC_CHECK_HEADER([arm_neon.h], + [have_neon=yes], + [have_neon=no + CPPFLAGS=$noneon_CPPFLAGS])], + [have_neon=no + AS_IF([test "x$ax_cv_have_neon_ext" = "xyes"], + [SIMD_FLAGS=""]) + ]) + +AS_IF([test "x$have_neon" = "xno"], + [AS_IF([test "x$enable_neon" = "xyes"], + [AC_MSG_ERROR([neon requested but arm_neon.h not found])]) + ]) +AM_CONDITIONAL([HAVE_NEON], [test "x$have_neon" = "xyes"]) + +AC_ARG_ENABLE([sse], + AS_HELP_STRING([--disable-sse], [Build without SSE optimizations]), + [if test "x$enableval" = "xno" ; then + SIMD_FLAGS="" + echo "DISABLED SSE!!!" 
+ fi] +) + +AC_ARG_ENABLE([valgrind], + [AS_HELP_STRING([--enable-valgrind], [run tests with valgrind])], + [], + [enable_valgrind=no]) +AM_CONDITIONAL(ENABLE_VALGRIND, test "x$enable_valgrind" != xno) + +AC_ARG_ENABLE([avx], AS_HELP_STRING([--enable-avx], [Build with AVX optimizations])) +AX_CHECK_COMPILE_FLAG(-mavx, [ax_cv_support_avx=yes], []) + +AS_IF([test "x$enable_avx" = "xyes"], + [AS_IF([test "x$ax_cv_support_avx" = "xno"], + [AC_MSG_ERROR([AVX requested but compiler does not support -mavx])], + [SIMD_FLAGS="$SIMD_FLAGS -mavx"]) + ]) + +AC_CONFIG_FILES([Makefile src/Makefile tools/Makefile test/Makefile examples/Makefile]) +AC_OUTPUT diff --git a/src/erasure-code/jerasure/gf-complete/examples/Makefile.am b/src/erasure-code/jerasure/gf-complete/examples/Makefile.am new file mode 100644 index 000000000..a420bda84 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/examples/Makefile.am @@ -0,0 +1,37 @@ +# GF-Complete 'examples' AM file + +AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include +AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC + +bin_PROGRAMS = gf_example_1 gf_example_2 gf_example_3 gf_example_4 \ + gf_example_5 gf_example_6 gf_example_7 + +gf_example_1_SOURCES = gf_example_1.c +#gf_example_1_LDFLAGS = -lgf_complete +gf_example_1_LDADD = ../src/libgf_complete.la + +gf_example_2_SOURCES = gf_example_2.c +#gf_example_2_LDFLAGS = -lgf_complete +gf_example_2_LDADD = ../src/libgf_complete.la + +gf_example_3_SOURCES = gf_example_3.c +#gf_example_3_LDFLAGS = -lgf_complete +gf_example_3_LDADD = ../src/libgf_complete.la + +gf_example_4_SOURCES = gf_example_4.c +#gf_example_4_LDFLAGS = -lgf_complete +gf_example_4_LDADD = ../src/libgf_complete.la + +gf_example_5_SOURCES = gf_example_5.c +#gf_example_5_LDFLAGS = -lgf_complete +gf_example_5_LDADD = ../src/libgf_complete.la + +gf_example_6_SOURCES = gf_example_6.c +#gf_example_6_LDFLAGS = -lgf_complete +gf_example_6_LDADD = ../src/libgf_complete.la + +gf_example_7_SOURCES = gf_example_7.c +#gf_example_7_LDFLAGS = -lgf_complete +gf_example_7_LDADD = ../src/libgf_complete.la + + diff --git a/src/erasure-code/jerasure/gf-complete/examples/gf_example_1.c b/src/erasure-code/jerasure/gf-complete/examples/gf_example_1.c new file mode 100644 index 000000000..a7a415595 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/examples/gf_example_1.c @@ -0,0 +1,58 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_example_1.c + * + * Demonstrates using the procedures for examples in GF(2^w) for w <= 32. 
+ */ + +#include <stdio.h> +#include <getopt.h> +#include <stdint.h> +#include <string.h> +#include <stdlib.h> +#include <time.h> + +#include "gf_complete.h" +#include "gf_rand.h" + +void usage(char *s) +{ + fprintf(stderr, "usage: gf_example_1 w - w must be between 1 and 32\n"); + exit(1); +} + +int main(int argc, char **argv) +{ + uint32_t a, b, c; + int w; + gf_t gf; + + if (argc != 2) usage(NULL); + w = atoi(argv[1]); + if (w <= 0 || w > 32) usage("Bad w"); + + /* Get two random numbers in a and b */ + + MOA_Seed(time(0)); + a = MOA_Random_W(w, 0); + b = MOA_Random_W(w, 0); + + /* Create the proper instance of the gf_t object using defaults: */ + + gf_init_easy(&gf, w); + + /* And multiply a and b using the galois field: */ + + c = gf.multiply.w32(&gf, a, b); + printf("%u * %u = %u\n", a, b, c); + + /* Divide the product by a and b */ + + printf("%u / %u = %u\n", c, a, gf.divide.w32(&gf, c, a)); + printf("%u / %u = %u\n", c, b, gf.divide.w32(&gf, c, b)); + + exit(0); +} diff --git a/src/erasure-code/jerasure/gf-complete/examples/gf_example_2.c b/src/erasure-code/jerasure/gf-complete/examples/gf_example_2.c new file mode 100644 index 000000000..576d9a534 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/examples/gf_example_2.c @@ -0,0 +1,107 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_example_2.c + * + * Demonstrates using the procedures for examples in GF(2^w) for w <= 32. + */ + +#include <stdio.h> +#include <getopt.h> +#include <stdint.h> +#include <string.h> +#include <stdlib.h> +#include <time.h> + +#include "gf_complete.h" +#include "gf_rand.h" + +void usage(char *s) +{ + fprintf(stderr, "usage: gf_example_2 w - w must be between 1 and 32\n"); + exit(1); +} + +int main(int argc, char **argv) +{ + uint32_t a, b, c; + uint8_t *r1, *r2; + uint16_t *r16 = NULL; + uint32_t *r32 = NULL; + int w, i; + gf_t gf; + + if (argc != 2) usage(NULL); + w = atoi(argv[1]); + if (w <= 0 || w > 32) usage("Bad w"); + + /* Get two random numbers in a and b */ + + MOA_Seed(time(0)); + a = MOA_Random_W(w, 0); + b = MOA_Random_W(w, 0); + + /* Create the proper instance of the gf_t object using defaults: */ + + gf_init_easy(&gf, w); + + /* And multiply a and b using the galois field: */ + + c = gf.multiply.w32(&gf, a, b); + printf("%u * %u = %u\n", a, b, c); + + /* Divide the product by a and b */ + + printf("%u / %u = %u\n", c, a, gf.divide.w32(&gf, c, a)); + printf("%u / %u = %u\n", c, b, gf.divide.w32(&gf, c, b)); + + /* If w is 4, 8, 16 or 32, do a very small region operation */ + + if (w == 4 || w == 8 || w == 16 || w == 32) { + r1 = (uint8_t *) malloc(16); + r2 = (uint8_t *) malloc(16); + + if (w == 4 || w == 8) { + r1[0] = b; + for (i = 1; i < 16; i++) r1[i] = MOA_Random_W(8, 1); + } else if (w == 16) { + r16 = (uint16_t *) r1; + r16[0] = b; + for (i = 1; i < 8; i++) r16[i] = MOA_Random_W(16, 1); + } else { + r32 = (uint32_t *) r1; + r32[0] = b; + for (i = 1; i < 4; i++) r32[i] = MOA_Random_W(32, 1); + } + + gf.multiply_region.w32(&gf, r1, r2, a, 16, 0); + + printf("\nmultiply_region by 0x%x (%u)\n\n", a, a); + printf("R1 (the source): "); + if (w == 4) { + for (i = 0; i < 16; i++) printf(" %x %x", r1[i] >> 4, r1[i] & 0xf); + } else if (w == 8) { + for (i = 0; i < 16; i++) printf(" %02x", r1[i]); + } else if (w == 16) { + for (i = 0; i < 8; i++) printf(" %04x", r16[i]); + } else if (w == 32) { + for (i = 0; i < 4; i++) printf(" %08x", r32[i]); + } + printf("\nR2 (the product): "); + if (w == 4) { + for (i = 0; i < 16; i++) printf(" %x %x", r2[i] >> 4, r2[i] & 0xf); + } else if (w == 8) { + for (i = 0; i < 16; i++) printf(" %02x", r2[i]); + } else if (w == 16) { + r16 = (uint16_t *) r2; + for (i = 0; i < 8; i++) printf(" %04x", r16[i]); + } else if (w == 32) { + r32 = (uint32_t *) r2; + for (i = 0; i < 4; i++) printf(" %08x", r32[i]); + } + printf("\n"); + } + exit(0); +}
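Editor's note (not part of the upstream patch): gf_example_2.c always passes 0 as the final argument of multiply_region. When that "xor" argument is 1, the products are XORed into the destination instead of overwriting it, which is how erasure codes accumulate parity. A minimal sketch under that reading of the API declared in gf_complete.h; the buffer size and the multipliers 2 and 7 are arbitrary, and it assumes the headers and libgf_complete are installed:

    /* Accumulate parity = 2*d1 ^ 7*d2 over GF(2^8). */
    #include <stdio.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include "gf_complete.h"

    int main(void)
    {
      gf_t gf;
      uint8_t *d1, *d2, *parity;
      int i, bytes = 16;

      if (!gf_init_easy(&gf, 8)) return 1;
      d1 = malloc(bytes); d2 = malloc(bytes); parity = malloc(bytes);
      if (!d1 || !d2 || !parity) return 1;
      for (i = 0; i < bytes; i++) { d1[i] = i + 1; d2[i] = 3*i + 7; }
      memset(parity, 0, bytes);

      /* Last argument 1 = XOR the products into the destination. */
      gf.multiply_region.w32(&gf, d1, parity, 2, bytes, 1);  /* parity ^= 2*d1 */
      gf.multiply_region.w32(&gf, d2, parity, 7, bytes, 1);  /* parity ^= 7*d2 */

      for (i = 0; i < bytes; i++) printf(" %02x", parity[i]);
      printf("\n");
      gf_free(&gf, 0);
      return 0;
    }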
%x", r2[i] >> 4, r2[i] & 0xf); + } else if (w == 8) { + for (i = 0; i < 16; i++) printf(" %02x", r2[i]); + } else if (w == 16) { + r16 = (uint16_t *) r2; + for (i = 0; i < 8; i++) printf(" %04x", r16[i]); + } else if (w == 32) { + r32 = (uint32_t *) r2; + for (i = 0; i < 4; i++) printf(" %08x", r32[i]); + } + printf("\n"); + } + exit(0); +} diff --git a/src/erasure-code/jerasure/gf-complete/examples/gf_example_3.c b/src/erasure-code/jerasure/gf-complete/examples/gf_example_3.c new file mode 100644 index 000000000..d6fef879e --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/examples/gf_example_3.c @@ -0,0 +1,74 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_example_3.c + * + * Identical to example_2 except it works in GF(2^64) + */ + +#include +#include +#include +#include +#include +#include + +#include "gf_complete.h" +#include "gf_rand.h" + +void usage(char *s) +{ + fprintf(stderr, "usage: gf_example_3\n"); + exit(1); +} + +int main(int argc, char **argv) +{ + uint64_t a, b, c; + uint64_t *r1, *r2; + int i; + gf_t gf; + + if (argc != 1) usage(NULL); + + /* Get two random numbers in a and b */ + + MOA_Seed(time(0)); + a = MOA_Random_64(); + b = MOA_Random_64(); + + /* Create the proper instance of the gf_t object using defaults: */ + + gf_init_easy(&gf, 64); + + /* And multiply a and b using the galois field: */ + + c = gf.multiply.w64(&gf, a, b); + printf("%llx * %llx = %llx\n", (long long unsigned int) a, (long long unsigned int) b, (long long unsigned int) c); + + /* Divide the product by a and b */ + + printf("%llx / %llx = %llx\n", (long long unsigned int) c, (long long unsigned int) a, (long long unsigned int) gf.divide.w64(&gf, c, a)); + printf("%llx / %llx = %llx\n", (long long unsigned int) c, (long long unsigned int) b, (long long unsigned int) gf.divide.w64(&gf, c, b)); + + r1 = (uint64_t *) malloc(32); + r2 = (uint64_t *) malloc(32); + + r1[0] = b; + + for (i = 1; i < 4; i++) r1[i] = MOA_Random_64(); + + gf.multiply_region.w64(&gf, r1, r2, a, 32, 0); + + printf("\nmultiply_region by %llx\n\n", (long long unsigned int) a); + printf("R1 (the source): "); + for (i = 0; i < 4; i++) printf(" %016llx", (long long unsigned int) r1[i]); + + printf("\nR2 (the product): "); + for (i = 0; i < 4; i++) printf(" %016llx", (long long unsigned int) r2[i]); + printf("\n"); + + exit(0); +} diff --git a/src/erasure-code/jerasure/gf-complete/examples/gf_example_4.c b/src/erasure-code/jerasure/gf-complete/examples/gf_example_4.c new file mode 100644 index 000000000..17529b5b0 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/examples/gf_example_4.c @@ -0,0 +1,69 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. 
diff --git a/src/erasure-code/jerasure/gf-complete/examples/gf_example_4.c b/src/erasure-code/jerasure/gf-complete/examples/gf_example_4.c new file mode 100644 index 000000000..17529b5b0 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/examples/gf_example_4.c @@ -0,0 +1,69 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_example_4.c + * + * Identical to example_3 except it works in GF(2^128) + */ + +#include <stdio.h> +#include <getopt.h> +#include <stdint.h> +#include <string.h> +#include <stdlib.h> +#include <time.h> + +#include "gf_complete.h" +#include "gf_rand.h" + +#define LLUI (long long unsigned int) + +void usage(char *s) +{ + fprintf(stderr, "usage: gf_example_4\n"); + exit(1); +} + +int main(int argc, char **argv) +{ + uint64_t a[2], b[2], c[2]; + uint64_t *r1, *r2; + int i; + gf_t gf; + + if (argc != 1) usage(NULL); + + /* Get two random numbers in a and b */ + + MOA_Seed(time(0)); + MOA_Random_128(a); + MOA_Random_128(b); + + /* Create the proper instance of the gf_t object using defaults: */ + + gf_init_easy(&gf, 128); + + /* And multiply a and b using the galois field: */ + + gf.multiply.w128(&gf, a, b, c); + printf("%016llx%016llx * %016llx%016llx =\n%016llx%016llx\n", + LLUI a[0], LLUI a[1], LLUI b[0], LLUI b[1], LLUI c[0], LLUI c[1]); + + r1 = (uint64_t *) malloc(32); + r2 = (uint64_t *) malloc(32); + + for (i = 0; i < 4; i++) r1[i] = MOA_Random_64(); + + gf.multiply_region.w128(&gf, r1, r2, a, 32, 0); + + printf("\nmultiply_region by %016llx%016llx\n\n", LLUI a[0], LLUI a[1]); + printf("R1 (the source): "); + for (i = 0; i < 4; i += 2) printf(" %016llx%016llx", LLUI r1[i], LLUI r1[i+1]); + + printf("\nR2 (the product): "); + for (i = 0; i < 4; i += 2) printf(" %016llx%016llx", LLUI r2[i], LLUI r2[i+1]); + printf("\n"); + exit(0); +} diff --git a/src/erasure-code/jerasure/gf-complete/examples/gf_example_5.c b/src/erasure-code/jerasure/gf-complete/examples/gf_example_5.c new file mode 100644 index 000000000..da6e9ca68 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/examples/gf_example_5.c @@ -0,0 +1,78 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ * + * gf_example_5.c + * + * Demonstrating altmap and extract_word + */ + +#include <stdio.h> +#include <getopt.h> +#include <stdint.h> +#include <string.h> +#include <stdlib.h> +#include <time.h> + +#include "gf_complete.h" +#include "gf_rand.h" + +void usage(char *s) +{ + fprintf(stderr, "usage: gf_example_5\n"); + exit(1); +} + +int main(int argc, char **argv) +{ + uint16_t *a, *b; + int i, j; + gf_t gf; + + if (gf_init_hard(&gf, 16, GF_MULT_SPLIT_TABLE, GF_REGION_ALTMAP, GF_DIVIDE_DEFAULT, + 0, 16, 4, NULL, NULL) == 0) { + fprintf(stderr, "gf_init_hard failed\n"); + exit(1); + } + + a = (uint16_t *) malloc(200); + b = (uint16_t *) malloc(200); + + a += 6; + b += 6; + + MOA_Seed(0); + + for (i = 0; i < 30; i++) a[i] = MOA_Random_W(16, 1); + + gf.multiply_region.w32(&gf, a, b, 0x1234, 30*2, 0); + + printf("a: 0x%lx b: 0x%lx\n", (unsigned long) a, (unsigned long) b); + + for (i = 0; i < 30; i += 10) { + printf("\n"); + printf(" "); + for (j = 0; j < 10; j++) printf(" %4d", i+j); + printf("\n"); + + printf("a:"); + for (j = 0; j < 10; j++) printf(" %04x", a[i+j]); + printf("\n"); + + printf("b:"); + for (j = 0; j < 10; j++) printf(" %04x", b[i+j]); + printf("\n"); + printf("\n"); + } + + for (i = 0; i < 15; i ++) { + printf("Word %2d: 0x%04x * 0x1234 = 0x%04x ", i, + gf.extract_word.w32(&gf, a, 30*2, i), + gf.extract_word.w32(&gf, b, 30*2, i)); + printf("Word %2d: 0x%04x * 0x1234 = 0x%04x\n", i+15, + gf.extract_word.w32(&gf, a, 30*2, i+15), + gf.extract_word.w32(&gf, b, 30*2, i+15)); + } + return 0; +} diff --git a/src/erasure-code/jerasure/gf-complete/examples/gf_example_6.c b/src/erasure-code/jerasure/gf-complete/examples/gf_example_6.c new file mode 100644 index 000000000..800a35ffb --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/examples/gf_example_6.c @@ -0,0 +1,84 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ * + * gf_example_6.c + * + * Demonstrating altmap and extract_word + */ + +#include <stdio.h> +#include <getopt.h> +#include <stdint.h> +#include <string.h> +#include <stdlib.h> +#include <time.h> + +#include "gf_complete.h" +#include "gf_rand.h" + +void usage(char *s) +{ + fprintf(stderr, "usage: gf_example_6\n"); + exit(1); +} + +int main(int argc, char **argv) +{ + uint32_t *a, *b; + int i, j; + gf_t gf, gf_16; + + if (gf_init_hard(&gf_16, 16, GF_MULT_LOG_TABLE, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, + 0, 0, 0, NULL, NULL) == 0) { + fprintf(stderr, "gf_init_hard (6) failed\n"); + exit(1); + } + + if (gf_init_hard(&gf, 32, GF_MULT_COMPOSITE, GF_REGION_ALTMAP, GF_DIVIDE_DEFAULT, + 0, 2, 0, &gf_16, NULL) == 0) { + fprintf(stderr, "gf_init_hard (32) failed\n"); + exit(1); + } + + a = (uint32_t *) malloc(200); + b = (uint32_t *) malloc(200); + + a += 3; + b += 3; + + MOA_Seed(0); + + for (i = 0; i < 30; i++) a[i] = MOA_Random_W(32, 1); + + gf.multiply_region.w32(&gf, a, b, 0x12345678, 30*4, 0); + + printf("a: 0x%lx b: 0x%lx\n", (unsigned long) a, (unsigned long) b); + + for (i = 0; i < 30; i += 10) { + printf("\n"); + printf(" "); + for (j = 0; j < 10; j++) printf(" %8d", i+j); + printf("\n"); + + printf("a:"); + for (j = 0; j < 10; j++) printf(" %08x", a[i+j]); + printf("\n"); + + printf("b:"); + for (j = 0; j < 10; j++) printf(" %08x", b[i+j]); + printf("\n"); + printf("\n"); + } + + for (i = 0; i < 15; i ++) { + printf("Word %2d: 0x%08x * 0x12345678 = 0x%08x ", i, + gf.extract_word.w32(&gf, a, 30*4, i), + gf.extract_word.w32(&gf, b, 30*4, i)); + printf("Word %2d: 0x%08x * 0x12345678 = 0x%08x\n", i+15, + gf.extract_word.w32(&gf, a, 30*4, i+15), + gf.extract_word.w32(&gf, b, 30*4, i+15)); + } + return 0; +} diff --git a/src/erasure-code/jerasure/gf-complete/examples/gf_example_7.c b/src/erasure-code/jerasure/gf-complete/examples/gf_example_7.c new file mode 100644 index 000000000..ee07d5353 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/examples/gf_example_7.c @@ -0,0 +1,75 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_example_7.c + * + * Demonstrating extract_word and Cauchy + */ + +#include <stdio.h> +#include <getopt.h> +#include <stdint.h> +#include <string.h> +#include <stdlib.h> +#include <time.h> + +#include "gf_complete.h" +#include "gf_rand.h" + +void usage(char *s) +{ + fprintf(stderr, "usage: gf_example_7\n"); + exit(1); +} + +int main(int argc, char **argv) +{ + uint8_t *a, *b; + int i, j; + gf_t gf; + + if (gf_init_hard(&gf, 3, GF_MULT_TABLE, GF_REGION_CAUCHY, GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL) == 0) { + fprintf(stderr, "gf_init_hard failed\n"); + exit(1); + } + + a = (uint8_t *) malloc(3); + b = (uint8_t *) malloc(3); + + MOA_Seed(0); + + for (i = 0; i < 3; i++) a[i] = MOA_Random_W(8, 1); + + gf.multiply_region.w32(&gf, a, b, 5, 3, 0); + + printf("a: 0x%lx b: 0x%lx\n", (unsigned long) a, (unsigned long) b); + + printf("\n"); + printf("a: 0x%02x 0x%02x 0x%02x\n", a[0], a[1], a[2]); + printf("b: 0x%02x 0x%02x 0x%02x\n", b[0], b[1], b[2]); + printf("\n"); + + printf("a bits:"); + for (i = 0; i < 3; i++) { + printf(" "); + for (j = 7; j >= 0; j--) printf("%c", (a[i] & (1 << j)) ? '1' : '0'); + } + printf("\n"); + + printf("b bits:"); + for (i = 0; i < 3; i++) { + printf(" "); + for (j = 7; j >= 0; j--) printf("%c", (b[i] & (1 << j)) ? '1' : '0'); + } + printf("\n"); + + printf("\n"); + for (i = 0; i < 8; i++) { + printf("Word %2d: %d * 5 = %d\n", i, + gf.extract_word.w32(&gf, a, 3, i), + gf.extract_word.w32(&gf, b, 3, i)); + } + return 0; +}
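Editor's note (not part of the upstream patch): all of the examples above let gf_init_easy/gf_init_hard allocate their own scratch space. The comments in gf_complete.h (next file) also allow caller-owned scratch memory sized with gf_scratch_size(); a minimal sketch of that path, written best-effort against the API as declared in this patch:

    #include <stdio.h>
    #include <stdlib.h>
    #include "gf_complete.h"

    int main(void)
    {
      gf_t gf;
      void *scratch;
      int size;

      /* Ask how much scratch a TABLE-based GF(2^8) needs... */
      size = gf_scratch_size(8, GF_MULT_TABLE, GF_REGION_DEFAULT,
                             GF_DIVIDE_DEFAULT, 0, 0);
      if (size == 0 || (scratch = malloc(size)) == NULL) return 1;

      /* ...and hand it in so gf_init_hard performs no allocation itself. */
      if (gf_init_hard(&gf, 8, GF_MULT_TABLE, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
                       0, 0, 0, NULL, scratch) == 0) { free(scratch); return 1; }

      printf("3 * 7 = %u in GF(2^8)\n", gf.multiply.w32(&gf, 3, 7));
      gf_free(&gf, 0);   /* frees nothing here; we own the scratch buffer */
      free(scratch);
      return 0;
    }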
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_complete.h b/src/erasure-code/jerasure/gf-complete/include/gf_complete.h new file mode 100644 index 000000000..c4783e80a --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/include/gf_complete.h @@ -0,0 +1,204 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_complete.h + * + * The main include file for gf_complete. + */ + +#ifndef _GF_COMPLETE_H_ +#define _GF_COMPLETE_H_ +#include <stdint.h> + +#ifdef INTEL_SSE4 + #ifdef __SSE4_2__ + #include <nmmintrin.h> + #endif + #ifdef __SSE4_1__ + #include <smmintrin.h> + #endif +#endif + +#ifdef INTEL_SSSE3 + #include <tmmintrin.h> +#endif + +#ifdef INTEL_SSE2 + #include <emmintrin.h> +#endif + +#ifdef INTEL_SSE4_PCLMUL + #include <wmmintrin.h> +#endif + +#if defined(ARM_NEON) + #include <arm_neon.h> +#endif + + +/* These are the different ways to perform multiplication. + Not all are implemented for all values of w. + See the paper for an explanation of how they work. */ + +typedef enum {GF_MULT_DEFAULT, + GF_MULT_SHIFT, + GF_MULT_CARRY_FREE, + GF_MULT_CARRY_FREE_GK, + GF_MULT_GROUP, + GF_MULT_BYTWO_p, + GF_MULT_BYTWO_b, + GF_MULT_TABLE, + GF_MULT_LOG_TABLE, + GF_MULT_LOG_ZERO, + GF_MULT_LOG_ZERO_EXT, + GF_MULT_SPLIT_TABLE, + GF_MULT_COMPOSITE } gf_mult_type_t; + +/* These are the different ways to optimize region + operations. They are bits because you can compose them. + Certain optimizations only apply to certain gf_mult_type_t's. + Again, please see documentation for how to use these */ + +#define GF_REGION_DEFAULT (0x0) +#define GF_REGION_DOUBLE_TABLE (0x1) +#define GF_REGION_QUAD_TABLE (0x2) +#define GF_REGION_LAZY (0x4) +#define GF_REGION_SIMD (0x8) +#define GF_REGION_SSE (0x8) +#define GF_REGION_NOSIMD (0x10) +#define GF_REGION_NOSSE (0x10) +#define GF_REGION_ALTMAP (0x20) +#define GF_REGION_CAUCHY (0x40) + +typedef uint32_t gf_region_type_t; + +/* These are different ways to implement division. + Once again, it's best to use "DEFAULT". However, + there are times when you may want to experiment + with the others. */ + +typedef enum { GF_DIVIDE_DEFAULT, + GF_DIVIDE_MATRIX, + GF_DIVIDE_EUCLID } gf_division_type_t; + +/* We support w=4,8,16,32,64 and 128 with their own data types and + operations for multiplication, division, etc. We also support + a "gen" type so that you can do general gf arithmetic for any + value of w from 1 to 32. You can perform a "region" operation + on these if you use "CAUCHY" as the mapping.
+ */ + +typedef uint32_t gf_val_32_t; +typedef uint64_t gf_val_64_t; +typedef uint64_t *gf_val_128_t; + +extern int _gf_errno; +extern void gf_error(); + +typedef struct gf *GFP; + +typedef union gf_func_a_b { + gf_val_32_t (*w32) (GFP gf, gf_val_32_t a, gf_val_32_t b); + gf_val_64_t (*w64) (GFP gf, gf_val_64_t a, gf_val_64_t b); + void (*w128)(GFP gf, gf_val_128_t a, gf_val_128_t b, gf_val_128_t c); +} gf_func_a_b; + +typedef union { + gf_val_32_t (*w32) (GFP gf, gf_val_32_t a); + gf_val_64_t (*w64) (GFP gf, gf_val_64_t a); + void (*w128)(GFP gf, gf_val_128_t a, gf_val_128_t b); +} gf_func_a; + +typedef union { + void (*w32) (GFP gf, void *src, void *dest, gf_val_32_t val, int bytes, int add); + void (*w64) (GFP gf, void *src, void *dest, gf_val_64_t val, int bytes, int add); + void (*w128)(GFP gf, void *src, void *dest, gf_val_128_t val, int bytes, int add); +} gf_region; + +typedef union { + gf_val_32_t (*w32) (GFP gf, void *start, int bytes, int index); + gf_val_64_t (*w64) (GFP gf, void *start, int bytes, int index); + void (*w128)(GFP gf, void *start, int bytes, int index, gf_val_128_t rv); +} gf_extract; + +typedef struct gf { + gf_func_a_b multiply; + gf_func_a_b divide; + gf_func_a inverse; + gf_region multiply_region; + gf_extract extract_word; + void *scratch; +} gf_t; + +/* Initializes the GF to defaults. Pass it a pointer to a gf_t. + Returns 0 on failure, 1 on success. */ + +extern int gf_init_easy(GFP gf, int w); + +/* Initializes the GF changing the defaults. + Returns 0 on failure, 1 on success. + Pass it a pointer to a gf_t. + For mult_type and divide_type, use one of gf_mult_type_t gf_divide_type_t . + For region_type, OR together the GF_REGION_xxx's defined above. + Use 0 as prim_poly for defaults. Otherwise, the leading 1 is optional. + Use NULL for scratch_memory to have init_hard allocate memory. Otherwise, + use gf_scratch_size() to determine how big scratch_memory has to be. + */ + +extern int gf_init_hard(GFP gf, + int w, + int mult_type, + int region_type, + int divide_type, + uint64_t prim_poly, + int arg1, + int arg2, + GFP base_gf, + void *scratch_memory); + +/* Determines the size for scratch_memory. + Returns 0 on failure and non-zero on success. */ + +extern int gf_scratch_size(int w, + int mult_type, + int region_type, + int divide_type, + int arg1, + int arg2); + +/* This reports the gf_scratch_size of a gf_t that has already been created */ + +extern int gf_size(GFP gf); + +/* Frees scratch memory if gf_init_easy/gf_init_hard called malloc. + If recursive = 1, then it calls itself recursively on base_gf. */ + +extern int gf_free(GFP gf, int recursive); + +/* This is support for inline single multiplications and divisions. + I know it's yucky, but if you've got to be fast, you've got to be fast. + We support inlining for w=4, w=8 and w=16. + + To use inline multiplication and division with w=4 or 8, you should use the + default gf_t, or one with a single table. Otherwise, gf_w4/8_get_mult_table() + will return NULL. 
Similarly, with w=16, the gf_t must be LOG */ + +uint8_t *gf_w4_get_mult_table(GFP gf); +uint8_t *gf_w4_get_div_table(GFP gf); + +#define GF_W4_INLINE_MULTDIV(table, a, b) (table[((a)<<4)|(b)]) + +uint8_t *gf_w8_get_mult_table(GFP gf); +uint8_t *gf_w8_get_div_table(GFP gf); + +#define GF_W8_INLINE_MULTDIV(table, a, b) (table[(((uint32_t) (a))<<8)|(b)]) + +uint16_t *gf_w16_get_log_table(GFP gf); +uint16_t *gf_w16_get_mult_alog_table(GFP gf); +uint16_t *gf_w16_get_div_alog_table(GFP gf); + +#define GF_W16_INLINE_MULT(log, alog, a, b) ((a) == 0 || (b) == 0) ? 0 : (alog[(uint32_t)log[a]+(uint32_t)log[b]]) +#define GF_W16_INLINE_DIV(log, alog, a, b) ((a) == 0 || (b) == 0) ? 0 : (alog[(int)log[a]-(int)log[b]]) +#endif diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_cpu.h b/src/erasure-code/jerasure/gf-complete/include/gf_cpu.h new file mode 100644 index 000000000..71c722706 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/include/gf_cpu.h @@ -0,0 +1,20 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_cpu.h + * + * Identifies whether the CPU supports SIMD instructions at runtime. + */ + +#pragma once + +extern int gf_cpu_supports_intel_pclmul; +extern int gf_cpu_supports_intel_sse4; +extern int gf_cpu_supports_intel_ssse3; +extern int gf_cpu_supports_intel_sse3; +extern int gf_cpu_supports_intel_sse2; +extern int gf_cpu_supports_arm_neon; + +void gf_cpu_identify(void); diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_general.h b/src/erasure-code/jerasure/gf-complete/include/gf_general.h new file mode 100644 index 000000000..9a5de529d --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/include/gf_general.h @@ -0,0 +1,61 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_general.h + * + * This file has helper routines for doing basic GF operations with any + * legal value of w. The problem is that w <= 32, w=64 and w=128 all have + * different data types, which is a pain. The procedures in this file try + * to alleviate that pain. They are used in gf_unit and gf_time. 
+ */ + +#pragma once + +#include <stdio.h> +#include <getopt.h> +#include <stdint.h> +#include <string.h> +#include <stdlib.h> +#include <time.h> + +#include "gf_complete.h" + +typedef union { + uint32_t w32; + uint64_t w64; + uint64_t w128[2]; +} gf_general_t; + +void gf_general_set_zero(gf_general_t *v, int w); +void gf_general_set_one(gf_general_t *v, int w); +void gf_general_set_two(gf_general_t *v, int w); + +int gf_general_is_zero(gf_general_t *v, int w); +int gf_general_is_one(gf_general_t *v, int w); +int gf_general_are_equal(gf_general_t *v1, gf_general_t *v2, int w); + +void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex); +int gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex); + +void gf_general_set_random(gf_general_t *v, int w, int zero_ok); + +void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); +void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); +void gf_general_divide(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c); +void gf_general_inverse(gf_t *gf, gf_general_t *a, gf_general_t *b); + +void gf_general_do_region_multiply(gf_t *gf, gf_general_t *a, + void *ra, void *rb, + int bytes, int xor); + +void gf_general_do_region_check(gf_t *gf, gf_general_t *a, + void *orig_a, void *orig_target, void *final_target, + int bytes, int xor); + + +/* Which is M, D or I for multiply, divide or inverse. */ + +void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size); +int gf_general_do_single_timing_test(gf_t *gf, void *ra, void *rb, int size, char which);
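Editor's note (not part of the upstream patch): a minimal sketch of the gf_general helpers above, which hide the w32/w64/w128 union behind one interface and so work for odd widths such as w = 13. It assumes the installed headers and libgf_complete, and uses the signatures exactly as declared in gf_general.h:

    #include <stdio.h>
    #include <time.h>
    #include "gf_complete.h"
    #include "gf_general.h"
    #include "gf_rand.h"

    int main(void)
    {
      gf_t gf;
      gf_general_t a, b, c;
      char as[50], bs[50], cs[50];
      int w = 13;              /* any legal w goes through the same calls */

      if (!gf_init_easy(&gf, w)) return 1;
      MOA_Seed(time(0));
      gf_general_set_random(&a, w, 1);
      gf_general_set_random(&b, w, 1);
      gf_general_multiply(&gf, &a, &b, &c);

      gf_general_val_to_s(&a, w, as, 1);
      gf_general_val_to_s(&b, w, bs, 1);
      gf_general_val_to_s(&c, w, cs, 1);
      printf("w=%d: %s * %s = %s\n", w, as, bs, cs);
      return 0;
    }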
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_int.h b/src/erasure-code/jerasure/gf-complete/include/gf_int.h new file mode 100644 index 000000000..0356920fd --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/include/gf_int.h @@ -0,0 +1,216 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_int.h + * + * Internal code for Galois field routines. This is not meant for + * users to include, but for the internal GF files to use. + */ + +#pragma once + +#include "gf_complete.h" + +#include <string.h> + +extern void timer_start (double *t); +extern double timer_split (const double *t); +extern void galois_fill_random (void *buf, int len, unsigned int seed); + +typedef struct { + int mult_type; + int region_type; + int divide_type; + int w; + uint64_t prim_poly; + int free_me; + int arg1; + int arg2; + gf_t *base_gf; + void *private; +#ifdef DEBUG_FUNCTIONS + const char *multiply; + const char *divide; + const char *inverse; + const char *multiply_region; + const char *extract_word; +#endif +} gf_internal_t; + +#ifdef DEBUG_FUNCTIONS +#define SET_FUNCTION(gf,method,size,func) \ + { (gf)->method.size = (func); \ + ((gf_internal_t*)(gf)->scratch)->method = #func; } +#else +#define SET_FUNCTION(gf,method,size,func) \ + (gf)->method.size = (func); +#endif + +extern int gf_w4_init (gf_t *gf); +extern int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); + +extern int gf_w8_init (gf_t *gf); +extern int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); + +extern int gf_w16_init (gf_t *gf); +extern int gf_w16_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); + +extern int gf_w32_init (gf_t *gf); +extern int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); + +extern int gf_w64_init (gf_t *gf); +extern int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); + +extern int gf_w128_init (gf_t *gf); +extern int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2); + +extern int gf_wgen_init (gf_t *gf); +extern int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divide_type, int arg1, int arg2); + +void gf_wgen_cauchy_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor); +gf_val_32_t gf_wgen_extract_word(gf_t *gf, void *start, int bytes, int index); + +extern void gf_alignment_error(char *s, int a); + +extern uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp); + +/* This returns the correct default for prim_poly when base is used as the base + field for COMPOSITE. It returns 0 if we don't have a default prim_poly. */ + +extern uint64_t gf_composite_get_default_poly(gf_t *base); + +/* This structure lets you define a region multiply. It helps because you can handle + unaligned portions of the data with the procedures below, which really cleans + up the code. */ + +typedef struct { + gf_t *gf; + void *src; + void *dest; + int bytes; + uint64_t val; + int xor; + int align; /* The number of bytes to which to align. */ + void *s_start; /* The start and the top of the aligned region. */ + void *d_start; + void *s_top; + void *d_top; +} gf_region_data; + +/* This lets you set up one of these in one call. It also sets the start/top pointers.
*/ + +void gf_set_region_data(gf_region_data *rd, + gf_t *gf, + void *src, + void *dest, + int bytes, + uint64_t val, + int xor, + int align); + +/* This performs gf->multiply.32() on all of the unaligned bytes in the beginning of the region */ + +extern void gf_do_initial_region_alignment(gf_region_data *rd); + +/* This performs gf->multiply.32() on all of the unaligned bytes in the end of the region */ + +extern void gf_do_final_region_alignment(gf_region_data *rd); + +extern void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base); + +extern void gf_multby_zero(void *dest, int bytes, int xor); +extern void gf_multby_one(void *src, void *dest, int bytes, int xor); + +typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */ + GF_E_MDEFREG, /* Reg != Default && Mult == Default */ + GF_E_MDEFARG, /* Args != Default && Mult == Default */ + GF_E_DIVCOMP, /* Mult == Composite && Div != Default */ + GF_E_CAUCOMP, /* Mult == Composite && Reg == CAUCHY */ + GF_E_DOUQUAD, /* Reg == DOUBLE && Reg == QUAD */ + GF_E_SIMD_NO, /* Reg == SIMD && Reg == NOSIMD */ + GF_E_CAUCHYB, /* Reg == CAUCHY && Other Reg */ + GF_E_CAUGT32, /* Reg == CAUCHY && w > 32*/ + GF_E_ARG1SET, /* Arg1 != 0 && Mult \notin COMPOSITE/SPLIT/GROUP */ + GF_E_ARG2SET, /* Arg2 != 0 && Mult \notin SPLIT/GROUP */ + GF_E_MATRIXW, /* Div == MATRIX && w > 32 */ + GF_E_BAD___W, /* Illegal w */ + GF_E_DOUBLET, /* Reg == DOUBLE && Mult != TABLE */ + GF_E_DOUBLEW, /* Reg == DOUBLE && w \notin {4,8} */ + GF_E_DOUBLEJ, /* Reg == DOUBLE && other Reg */ + GF_E_DOUBLEL, /* Reg == DOUBLE & LAZY but w = 4 */ + GF_E_QUAD__T, /* Reg == QUAD && Mult != TABLE */ + GF_E_QUAD__W, /* Reg == QUAD && w != 4 */ + GF_E_QUAD__J, /* Reg == QUAD && other Reg */ + GF_E_LAZY__X, /* Reg == LAZY && not DOUBLE or QUAD*/ + GF_E_ALTSHIF, /* Mult == Shift && Reg == ALTMAP */ + GF_E_SSESHIF, /* Mult == Shift && Reg == SIMD|NOSIMD */ + GF_E_ALT_CFM, /* Mult == CARRY_FREE && Reg == ALTMAP */ + GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SIMD|NOSIMD */ + GF_E_PCLMULX, /* Mult == Carry_Free && No PCLMUL */ + GF_E_ALT_BY2, /* Mult == Bytwo_x && Reg == ALTMAP */ + GF_E_BY2_SSE, /* Mult == Bytwo_x && Reg == SSE && No SSE2 */ + GF_E_LOGBADW, /* Mult == LOGx, w too big*/ + GF_E_LOG___J, /* Mult == LOGx, && Reg == SSE|ALTMAP|NOSSE */ + GF_E_ZERBADW, /* Mult == LOG_ZERO, w \notin {8,16} */ + GF_E_ZEXBADW, /* Mult == LOG_ZERO_EXT, w != 8 */ + GF_E_LOGPOLY, /* Mult == LOG & poly not primitive */ + GF_E_GR_ARGX, /* Mult == GROUP, Bad arg1/2 */ + GF_E_GR_W_48, /* Mult == GROUP, w \in { 4, 8 } */ + GF_E_GR_W_16, /* Mult == GROUP, w == 16, arg1 != 4 || arg2 != 4 */ + GF_E_GR_128A, /* Mult == GROUP, w == 128, bad args */ + GF_E_GR_A_27, /* Mult == GROUP, either arg > 27 */ + GF_E_GR_AR_W, /* Mult == GROUP, either arg > w */ + GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */ + GF_E_TABLE_W, /* Mult == TABLE, w too big */ + GF_E_TAB_SSE, /* Mult == TABLE, SIMD|NOSIMD only apply to w == 4 */ + GF_E_TABSSE3, /* Mult == TABLE, Need SSSE3 for SSE */ + GF_E_TAB_ALT, /* Mult == TABLE, Reg == ALTMAP */ + GF_E_SP128AR, /* Mult == SPLIT, w=128, Bad arg1/arg2 */ + GF_E_SP128AL, /* Mult == SPLIT, w=128, SSE requires ALTMAP */ + GF_E_SP128AS, /* Mult == SPLIT, w=128, ALTMAP requires SSE */ + GF_E_SP128_A, /* Mult == SPLIT, w=128, ALTMAP only with 4/128 */ + GF_E_SP128_S, /* Mult == SPLIT, w=128, SSE only with 4/128 */ + GF_E_SPLIT_W, /* Mult == SPLIT, Bad w (8, 16, 32, 64, 128) */ + GF_E_SP_16AR, /* Mult == SPLIT, w=16, Bad arg1/arg2 */ + GF_E_SP_16_A, /* Mult == 
SPLIT, w=16, ALTMAP only with 4/16 */ + GF_E_SP_16_S, /* Mult == SPLIT, w=16, SSE only with 4/16 */ + GF_E_SP_32AR, /* Mult == SPLIT, w=32, Bad arg1/arg2 */ + GF_E_SP_32AS, /* Mult == SPLIT, w=32, ALTMAP requires SSE */ + GF_E_SP_32_A, /* Mult == SPLIT, w=32, ALTMAP only with 4/32 */ + GF_E_SP_32_S, /* Mult == SPLIT, w=32, SSE only with 4/32 */ + GF_E_SP_64AR, /* Mult == SPLIT, w=64, Bad arg1/arg2 */ + GF_E_SP_64AS, /* Mult == SPLIT, w=64, ALTMAP requires SSE */ + GF_E_SP_64_A, /* Mult == SPLIT, w=64, ALTMAP only with 4/64 */ + GF_E_SP_64_S, /* Mult == SPLIT, w=64, SSE only with 4/64 */ + GF_E_SP_8_AR, /* Mult == SPLIT, w=8, Bad arg1/arg2 */ + GF_E_SP_8__A, /* Mult == SPLIT, w=8, no ALTMAP */ + GF_E_SP_SSE3, /* Mult == SPLIT, Need SSSE3 for SSE */ + GF_E_COMP_A2, /* Mult == COMP, arg1 must be = 2 */ + GF_E_COMP_SS, /* Mult == COMP, SIMD|NOSIMD */ + GF_E_COMP__W, /* Mult == COMP, Bad w. */ + GF_E_UNKFLAG, /* Unknown flag in create_from.... */ + GF_E_UNKNOWN, /* Unknown mult_type. */ + GF_E_UNK_REG, /* Unknown region_type. */ + GF_E_UNK_DIV, /* Unknown divide_type. */ + GF_E_CFM___W, /* Mult == CFM, Bad w. */ + GF_E_CFM4POL, /* Mult == CFM & Prim Poly has high bits set. */ + GF_E_CFM8POL, /* Mult == CFM & Prim Poly has high bits set. */ + GF_E_CF16POL, /* Mult == CFM & Prim Poly has high bits set. */ + GF_E_CF32POL, /* Mult == CFM & Prim Poly has high bits set. */ + GF_E_CF64POL, /* Mult == CFM & Prim Poly has high bits set. */ + GF_E_FEWARGS, /* Too few args in argc/argv. */ + GF_E_BADPOLY, /* Bad primitive polynomial -- too many bits set. */ + GF_E_COMP_PP, /* Bad primitive polynomial -- bigger than sub-field. */ + GF_E_COMPXPP, /* Can't derive a default pp for composite field. */ + GF_E_BASE__W, /* Composite -- Base field is the wrong size. */ + GF_E_TWOMULT, /* In create_from... two -m's. */ + GF_E_TWO_DIV, /* In create_from... two -d's. */ + GF_E_POLYSPC, /* Bad numbera after -p. */ + GF_E_SPLITAR, /* Ran out of arguments in SPLIT */ + GF_E_SPLITNU, /* Arguments not integers in SPLIT. */ + GF_E_GROUPAR, /* Ran out of arguments in GROUP */ + GF_E_GROUPNU, /* Arguments not integers in GROUP. */ + GF_E_DEFAULT } gf_error_type_t; + diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_method.h b/src/erasure-code/jerasure/gf-complete/include/gf_method.h new file mode 100644 index 000000000..880b34967 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/include/gf_method.h @@ -0,0 +1,20 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_method.h + * + * Parses argv to figure out the flags and arguments. Creates the gf. + */ + +#pragma once + +#include "gf_complete.h" + +/* Parses argv starting at "starting". + + Returns 0 on failure. + On success, it returns one past the last argument it read in argv. */ + +extern int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting); diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_rand.h b/src/erasure-code/jerasure/gf-complete/include/gf_rand.h new file mode 100644 index 000000000..24294adc7 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/include/gf_rand.h @@ -0,0 +1,22 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. 
+ * + * gf_rand.h + * + * Random number generation, using the "Mother of All" random number generator. */ + +#pragma once +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> + +/* These are all pretty self-explanatory */ +uint32_t MOA_Random_32(); +uint64_t MOA_Random_64(); +void MOA_Random_128(uint64_t *x); +uint32_t MOA_Random_W(int w, int zero_ok); +void MOA_Fill_Random_Region (void *reg, int size); /* reg should be aligned to 4 bytes, but + size can be anything. */ +void MOA_Seed(uint32_t seed); diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_w16.h b/src/erasure-code/jerasure/gf-complete/include/gf_w16.h new file mode 100644 index 000000000..fb4c0e98f --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/include/gf_w16.h @@ -0,0 +1,66 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_w16.h + * + * Defines and data structures for 16-bit Galois fields + */ + +#ifndef GF_COMPLETE_GF_W16_H +#define GF_COMPLETE_GF_W16_H + +#include <stdint.h> + +#define GF_FIELD_WIDTH (16) +#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH) +#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1 + +#define GF_BASE_FIELD_WIDTH (8) +#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) + +struct gf_w16_logtable_data { + uint16_t log_tbl[GF_FIELD_SIZE]; + uint16_t antilog_tbl[GF_FIELD_SIZE * 2]; + uint16_t inv_tbl[GF_FIELD_SIZE]; + uint16_t *d_antilog; +}; + +struct gf_w16_zero_logtable_data { + int log_tbl[GF_FIELD_SIZE]; + uint16_t _antilog_tbl[GF_FIELD_SIZE * 4]; + uint16_t *antilog_tbl; + uint16_t inv_tbl[GF_FIELD_SIZE]; +}; + +struct gf_w16_lazytable_data { + uint16_t log_tbl[GF_FIELD_SIZE]; + uint16_t antilog_tbl[GF_FIELD_SIZE * 2]; + uint16_t inv_tbl[GF_FIELD_SIZE]; + uint16_t *d_antilog; + uint16_t lazytable[GF_FIELD_SIZE]; +}; + +struct gf_w16_bytwo_data { + uint64_t prim_poly; + uint64_t mask1; + uint64_t mask2; +}; + +struct gf_w16_split_8_8_data { + uint16_t tables[3][256][256]; +}; + +struct gf_w16_group_4_4_data { + uint16_t reduce[16]; + uint16_t shift[16]; +}; + +struct gf_w16_composite_data { + uint8_t *mult_table; +}; + +void gf_w16_neon_split_init(gf_t *gf); + +#endif /* GF_COMPLETE_GF_W16_H */
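Editor's note (not part of the upstream patch): gf_complete.h earlier in this patch says the w = 16 inline macros require a LOG-table gf_t, with the log/antilog tables fetched through the accessors declared there. A minimal sketch tying that together, best-effort against the declared API:

    #include <stdio.h>
    #include <stdint.h>
    #include "gf_complete.h"

    int main(void)
    {
      gf_t gf;
      uint16_t *log, *alog, p;

      /* The inline w=16 macros only work with a LOG-table gf_t. */
      if (gf_init_hard(&gf, 16, GF_MULT_LOG_TABLE, GF_REGION_DEFAULT,
                       GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL) == 0) return 1;
      log  = gf_w16_get_log_table(&gf);
      alog = gf_w16_get_mult_alog_table(&gf);
      if (log == NULL || alog == NULL) return 1;   /* not a LOG gf_t */

      p = GF_W16_INLINE_MULT(log, alog, 0x1234, 0x5678);
      printf("0x1234 * 0x5678 = 0x%04x\n", p);
      return 0;
    }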
+
+struct gf_split_2_32_lazy_data {
+  uint32_t tables[16][4];
+  uint32_t last_value;
+};
+
+struct gf_w32_split_8_8_data {
+  uint32_t tables[7][256][256];
+  uint32_t region_tables[4][256];
+  uint32_t last_value;
+};
+
+struct gf_w32_group_data {
+  uint32_t *reduce;
+  uint32_t *shift;
+  int tshift;
+  uint64_t rmask;
+  uint32_t *memory;
+};
+
+struct gf_split_16_32_lazy_data {
+  uint32_t tables[2][(1<<16)];
+  uint32_t last_value;
+};
+
+struct gf_split_8_32_lazy_data {
+  uint32_t tables[4][256];
+  uint32_t last_value;
+};
+
+struct gf_split_4_32_lazy_data {
+  uint32_t tables[8][16];
+  uint32_t last_value;
+};
+
+struct gf_w32_bytwo_data {
+  uint64_t prim_poly;
+  uint64_t mask1;
+  uint64_t mask2;
+};
+
+struct gf_w32_composite_data {
+  uint16_t *log;
+  uint16_t *alog;
+};
+
+void gf_w32_neon_split_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W32_H */
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_w4.h b/src/erasure-code/jerasure/gf-complete/include/gf_w4.h
new file mode 100644
index 000000000..8ee94a339
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_w4.h
@@ -0,0 +1,63 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w4.h
+ *
+ * Defines and data structures for 4-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W4_H
+#define GF_COMPLETE_GF_W4_H
+
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH 4
+#define GF_DOUBLE_WIDTH (GF_FIELD_WIDTH*2)
+#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
+#define GF_MULT_GROUP_SIZE (GF_FIELD_SIZE-1)
+
+/* ------------------------------------------------------------
+   JSP: Each implementation has its own data, which is allocated
+   at one time as part of the handle. For that reason, it
+   shouldn't be hierarchical -- i.e. one should be able to
+   allocate it with one call to malloc. */
+
+struct gf_logtable_data {
+  uint8_t log_tbl[GF_FIELD_SIZE];
+  uint8_t antilog_tbl[GF_FIELD_SIZE * 2];
+  uint8_t *antilog_tbl_div;
+};
+
+struct gf_single_table_data {
+  uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
+  uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+};
+
+struct gf_double_table_data {
+  uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+  uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
+};
+struct gf_quad_table_data {
+  uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+  uint16_t mult[GF_FIELD_SIZE][(1<<16)];
+};
+
+struct gf_quad_table_lazy_data {
+  uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+  uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
+  uint16_t mult[(1 << 16)];
+};
+
+struct gf_bytwo_data {
+  uint64_t prim_poly;
+  uint64_t mask1;
+  uint64_t mask2;
+};
+
+// ARM NEON init functions
+int gf_w4_neon_cfm_init(gf_t *gf);
+void gf_w4_neon_single_table_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W4_H */
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_w64.h b/src/erasure-code/jerasure/gf-complete/include/gf_w64.h
new file mode 100644
index 000000000..9a74a8125
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_w64.h
@@ -0,0 +1,50 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w64.h
+ *
+ * Defines and data structures for 64-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W64_H
+#define GF_COMPLETE_GF_W64_H
+
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH (64)
+#define GF_FIRST_BIT (1ULL << 63)
+
+#define GF_BASE_FIELD_WIDTH (32)
+#define GF_BASE_FIELD_SIZE (1ULL << GF_BASE_FIELD_WIDTH)
+#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1
+
+struct gf_w64_group_data {
+  uint64_t *reduce;
+  uint64_t *shift;
+  uint64_t *memory;
+};
+
+struct gf_split_4_64_lazy_data {
+  uint64_t tables[16][16];
+  uint64_t last_value;
+};
+
+struct gf_split_8_64_lazy_data {
+  uint64_t tables[8][(1<<8)];
+  uint64_t last_value;
+};
+
+struct gf_split_16_64_lazy_data {
+  uint64_t tables[4][(1<<16)];
+  uint64_t last_value;
+};
+
+struct gf_split_8_8_data {
+  uint64_t tables[15][256][256];
+};
+
+void gf_w64_neon_split_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W64_H */
diff --git a/src/erasure-code/jerasure/gf-complete/include/gf_w8.h b/src/erasure-code/jerasure/gf-complete/include/gf_w8.h
new file mode 100644
index 000000000..938fcfdf1
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/include/gf_w8.h
@@ -0,0 +1,99 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_w8.h
+ *
+ * Defines and data structures for 8-bit Galois fields
+ */
+
+#ifndef GF_COMPLETE_GF_W8_H
+#define GF_COMPLETE_GF_W8_H
+
+#include "gf_int.h"
+#include <stdint.h>
+
+#define GF_FIELD_WIDTH (8)
+#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
+#define GF_HALF_SIZE (1 << (GF_FIELD_WIDTH/2))
+#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1
+
+#define GF_BASE_FIELD_WIDTH (4)
+#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
+
+struct gf_w8_logtable_data {
+  uint8_t log_tbl[GF_FIELD_SIZE];
+  uint8_t antilog_tbl[GF_FIELD_SIZE * 2];
+  uint8_t inv_tbl[GF_FIELD_SIZE];
+};
+
+struct gf_w8_logzero_table_data {
+  short log_tbl[GF_FIELD_SIZE];   /* Make this signed, so that we can divide easily */
+  uint8_t antilog_tbl[512+512+1];
+  uint8_t *div_tbl;
+  uint8_t *inv_tbl;
+};
+
+struct gf_w8_logzero_small_table_data {
+  short log_tbl[GF_FIELD_SIZE];   /* Make this signed, so that we can divide easily */
+  uint8_t antilog_tbl[255*3];
+  uint8_t inv_tbl[GF_FIELD_SIZE];
+  uint8_t *div_tbl;
+};
+
+struct gf_w8_composite_data {
+  uint8_t *mult_table;
+};
+
+/* Don't change the order of these relative to gf_w8_half_table_data */
+
+struct gf_w8_default_data {
+  uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE];
+  uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE];
+  uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+  uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+};
+
+struct gf_w8_half_table_data {
+  uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE];
+  uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE];
+};
+
+struct gf_w8_single_table_data {
+  uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+  uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
+};
+
+struct gf_w8_double_table_data {
+  uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+  uint16_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
+};
+
+struct gf_w8_double_table_lazy_data {
+  uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
+  uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
+  uint16_t mult[GF_FIELD_SIZE*GF_FIELD_SIZE];
+};
+
+struct gf_w4_logtable_data {
+  uint8_t log_tbl[GF_BASE_FIELD_SIZE];
+  uint8_t antilog_tbl[GF_BASE_FIELD_SIZE * 2];
+  uint8_t *antilog_tbl_div;
+};
+
+struct gf_w4_single_table_data {
+  uint8_t div[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
+  uint8_t mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
+};
+
+struct gf_w8_bytwo_data {
+  uint64_t prim_poly;
+  uint64_t mask1;
+  uint64_t mask2;
+};
+
+int gf_w8_neon_cfm_init(gf_t *gf);
+void gf_w8_neon_split_init(gf_t *gf);
+
+#endif /* GF_COMPLETE_GF_W8_H */
diff --git a/src/erasure-code/jerasure/gf-complete/m4/ax_check_compile_flag.m4 b/src/erasure-code/jerasure/gf-complete/m4/ax_check_compile_flag.m4
new file mode 100644
index 000000000..c3a8d695a
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/m4/ax_check_compile_flag.m4
@@ -0,0 +1,72 @@
+# ===========================================================================
+#  http://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS])
+#
+# DESCRIPTION
+#
+#   Check whether the given FLAG works with the current language's compiler
+#   or gives an error. (Warnings, however, are ignored)
+#
+#   ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on
+#   success/failure.
+#
+#   If EXTRA-FLAGS is defined, it is added to the current language's default
+#   flags (e.g. CFLAGS) when the check is done. The check is thus made with
+#   the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to
+#   force the compiler to issue an error when a bad flag is given.
+#
+#   NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this
+#   macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Guido U. Draheim
+#   Copyright (c) 2011 Maarten Bosmans
+#
+#   This program is free software: you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation, either version 3 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 2
+
+AC_DEFUN([AX_CHECK_COMPILE_FLAG],
+[AC_PREREQ(2.59)dnl for _AC_LANG_PREFIX
+AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl
+AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [
+  ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
+  _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1"
+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],
+    [AS_VAR_SET(CACHEVAR,[yes])],
+    [AS_VAR_SET(CACHEVAR,[no])])
+  _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])
+AS_IF([test x"AS_VAR_GET(CACHEVAR)" = xyes],
+  [m4_default([$2], :)],
+  [m4_default([$3], :)])
+AS_VAR_POPDEF([CACHEVAR])dnl
+])dnl AX_CHECK_COMPILE_FLAGS
diff --git a/src/erasure-code/jerasure/gf-complete/m4/ax_ext.m4 b/src/erasure-code/jerasure/gf-complete/m4/ax_ext.m4
new file mode 100644
index 000000000..95c4dbe23
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/m4/ax_ext.m4
@@ -0,0 +1,75 @@
+#
+# This macro is based on http://www.gnu.org/software/autoconf-archive/ax_ext.html
+# but simplified to do compile time SIMD checks only
+#
+
+AC_DEFUN([AX_EXT],
+[
+  AC_REQUIRE([AC_CANONICAL_HOST])
+
+  case $host_cpu in
+    aarch64*)
+      AC_DEFINE(HAVE_ARCH_AARCH64,,[targeting AArch64])
+      SIMD_FLAGS="$SIMD_FLAGS -DARCH_AARCH64"
+
+      AC_CACHE_CHECK([whether NEON is enabled], [ax_cv_have_neon_ext], [ax_cv_have_neon_ext=yes])
+      if test "$ax_cv_have_neon_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd, [SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd -DARM_NEON"], [ax_cv_have_neon_ext=no])
+      fi
+      ;;
+
+    arm*)
+      AC_CACHE_CHECK([whether NEON is enabled], [ax_cv_have_neon_ext], [ax_cv_have_neon_ext=yes])
+      if test "$ax_cv_have_neon_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-mfpu=neon, [SIMD_FLAGS="$SIMD_FLAGS -mfpu=neon -DARM_NEON"], [ax_cv_have_neon_ext=no])
+      fi
+      ;;
+
+    powerpc*)
+      AC_CACHE_CHECK([whether altivec is enabled], [ax_cv_have_altivec_ext], [ax_cv_have_altivec_ext=yes])
+      if test "$ax_cv_have_altivec_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-faltivec, [SIMD_FLAGS="$SIMD_FLAGS -faltivec"], [ax_cv_have_altivec_ext=no])
+      fi
+      ;;
+
+    i[[3456]]86*|x86_64*|amd64*)
+
+      AC_CACHE_CHECK([whether sse is enabled], [ax_cv_have_sse_ext], [ax_cv_have_sse_ext=yes])
+      if test "$ax_cv_have_sse_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-msse, [SIMD_FLAGS="$SIMD_FLAGS -msse -DINTEL_SSE"], [ax_cv_have_sse_ext=no])
+      fi
+
+      AC_CACHE_CHECK([whether sse2 is enabled], [ax_cv_have_sse2_ext], [ax_cv_have_sse2_ext=yes])
+      if test "$ax_cv_have_sse2_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-msse2, [SIMD_FLAGS="$SIMD_FLAGS -msse2 -DINTEL_SSE2"], [ax_cv_have_sse2_ext=no])
+      fi
+
+      AC_CACHE_CHECK([whether sse3 is enabled], [ax_cv_have_sse3_ext], [ax_cv_have_sse3_ext=yes])
+      if test "$ax_cv_have_sse3_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-msse3, [SIMD_FLAGS="$SIMD_FLAGS -msse3 -DINTEL_SSE3"], [ax_cv_have_sse3_ext=no])
+      fi
+
+      AC_CACHE_CHECK([whether ssse3 is enabled], [ax_cv_have_ssse3_ext], [ax_cv_have_ssse3_ext=yes])
+      if test "$ax_cv_have_ssse3_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-mssse3, [SIMD_FLAGS="$SIMD_FLAGS -mssse3 -DINTEL_SSSE3"], [ax_cv_have_ssse3_ext=no])
+      fi
+
+      AC_CACHE_CHECK([whether pclmuldq is enabled], [ax_cv_have_pclmuldq_ext], [ax_cv_have_pclmuldq_ext=yes])
+      if test "$ax_cv_have_pclmuldq_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-mpclmul, [SIMD_FLAGS="$SIMD_FLAGS -mpclmul -DINTEL_SSE4_PCLMUL"], [ax_cv_have_pclmuldq_ext=no])
+      fi
+
+      AC_CACHE_CHECK([whether sse4.1 is enabled], [ax_cv_have_sse41_ext], [ax_cv_have_sse41_ext=yes])
+      if test "$ax_cv_have_sse41_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-msse4.1, [SIMD_FLAGS="$SIMD_FLAGS -msse4.1 -DINTEL_SSE4"], [ax_cv_have_sse41_ext=no])
+      fi
+
+      AC_CACHE_CHECK([whether sse4.2 is enabled], [ax_cv_have_sse42_ext], [ax_cv_have_sse42_ext=yes])
+      if test "$ax_cv_have_sse42_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-msse4.2, [SIMD_FLAGS="$SIMD_FLAGS -msse4.2 -DINTEL_SSE4"], [ax_cv_have_sse42_ext=no])
+      fi
+      ;;
+  esac
+
+  AC_SUBST(SIMD_FLAGS)
+])
diff --git a/src/erasure-code/jerasure/gf-complete/manual/gf-complete.html b/src/erasure-code/jerasure/gf-complete/manual/gf-complete.html
new file mode 100644
index 000000000..ed79e2576
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/manual/gf-complete.html
@@ -0,0 +1,3484 @@

GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic

Version 1.02

James S. Plank*    Ethan L. Miller    Kevin M. Greenan    Benjamin A. Arnold
John A. Burnum    Adam W. Disney    Allen C. McBride

https://bitbucket.org/jimplank/gf-complete
http://web.eecs.utk.edu/~plank/plank/papers/GF-Complete-Manual-1.02.pdf

This is a user's manual for GF-Complete, version 1.02. This release supersedes version 0.1 and represents the first
major release of GF-Complete. To our knowledge, this library implements every Galois Field multiplication technique
applicable to erasure coding for storage, which is why we named it GF-Complete. The primary goal of this library is
to allow storage system researchers and implementors to utilize very fast Galois Field arithmetic for Reed-Solomon
coding and the like in their storage installations. The secondary goal is to allow those who want to explore different
ways to perform Galois Field arithmetic to be able to do so effectively.

If you wish to cite GF-Complete, please cite technical report UT-CS-13-716: [PMG+13].

If You Use This Library or Document

Please send me an email to let me know how it goes. Or send me an email just to let me know you are using the
library. One of the ways in which we are evaluated both internally and externally is by the impact of our work, and if
you have found this library and/or this document useful, we would like to be able to document it. Please send mail to
plank@cs.utk.edu. Please send bug reports to that address as well.

The library itself is protected by the New BSD License. It is free to use and modify within the bounds of this
license. To the authors' knowledge, none of the techniques implemented in this library have been patented, and the
authors are not pursuing patents.

Finding the Code

This code is actively maintained on bitbucket: https://bitbucket.org/jimplank/gf-complete. There are
previous versions on my UTK site as a technical report; however, that is too hard to maintain, so the main version is
on bitbucket.

Two Related Papers

This software accompanies a large paper that describes these implementation techniques in detail [PGM13a]. We
will refer to this as "The Paper." You do not have to read The Paper to use the software. However, if you want to
start exploring the various implementations, then The Paper is where you'll want to go to learn about the techniques
in detail.

This library implements the techniques described in the paper "Screaming Fast Galois Field Arithmetic Using Intel
SIMD Instructions" [PGM13b]. The Paper describes all of those techniques as well.

If You Would Like Help With the Software

Please contact the first author of this manual.

Changes from Revision 1.01

The major change is that we are using autoconf to aid with compilation, thus obviating the need for the old flag_tester
code. Additionally, we have added a quick timing tool, and we have modified gf_methods so that it may be used to
run the timing tool and the unit tester.

Contents

1   Introduction
2   Files in the Library
    2.1   Header files in the directory "include"
    2.2   Source files in the "src" directory
    2.3   Library tools files in the "tools" directory
    2.4   The unit tester in the "test" directory
    2.5   Example programs in the "examples" directory
3   Compilation
4   Some Tools and Examples to Get You Started
    4.1   Three Simple Command Line Tools: gf_mult, gf_div and gf_add
    4.2   Quick Starting Example #1: Simple multiplication and division
    4.3   Quick Starting Example #2: Multiplying a region by a constant
    4.4   Quick Starting Example #3: Using w = 64
    4.5   Quick Starting Example #4: Using w = 128
5   Important Information on Alignment when Multiplying Regions
6   The Defaults
    6.1   Changing the Defaults
          6.1.1   Changing the Components of a Galois Field with create_gf_from_argv()
          6.1.2   Changing the Polynomial
          6.1.3   Changing the Multiplication Technique
          6.1.4   Changing the Division Technique
          6.1.5   Changing the Region Technique
    6.2   Determining Supported Techniques with gf_methods
    6.3   Testing with gf_unit, gf_time, and time_tool.sh
          6.3.1   time_tool.sh
          6.3.2   An example of gf_methods and time_tool.sh
    6.4   Calling gf_init_hard()
    6.5   gf_size()
7   Further Information on Options and Algorithms
    7.1   Inlining Single Multiplication and Division for Speed
    7.2   Using different techniques for single and region multiplication
    7.3   General w
    7.4   Arguments to "SPLIT"
    7.5   Arguments to "GROUP"
    7.6   Considerations with "COMPOSITE"
    7.7   "CARRY_FREE" and the Primitive Polynomial
    7.8   More on Primitive Polynomials
          7.8.1   Primitive Polynomials that are not Primitive
          7.8.2   Default Polynomials for Composite Fields
          7.8.3   The Program gf_poly for Verifying Irreducibility of Polynomials
    7.9   "ALTMAP" considerations and extract_word()
          7.9.1   Alternate mappings with "SPLIT"
          7.9.2   Alternate mappings with "COMPOSITE"
          7.9.3   The mapping of "CAUCHY"
8   Thread Safety
9   Listing of Procedures
10  Troubleshooting
11  Timings
    11.1  Multiply()
    11.2  Divide()
    11.3  Multiply_Region()

1  Introduction

Galois Field arithmetic forms the backbone of erasure-coded storage systems, most famously the Reed-Solomon
erasure code. A Galois Field is defined over w-bit words and is termed GF(2^w). As such, the elements of a Galois
Field are the integers 0, 1, ..., 2^w - 1. Galois Field arithmetic defines addition and multiplication over these closed
sets of integers in such a way that they work as you would hope they would work. Specifically, every number has a
unique multiplicative inverse. Moreover, there is a value, typically the value 2, which has the property that you can
enumerate all of the non-zero elements of the field by taking that value to successively higher powers.

Addition in a Galois Field is equal to the bitwise exclusive-or operation. That's nice and convenient. Multiplication
is a little more complex, and there are many, many ways to implement it. The Paper describes them all, and the
following references provide more supporting material: [Anv09, GMS08, LHy08, LD00, LBOX12, Pla97]. The intent
of this library is to implement all of the techniques. That way, their performance may be compared, and their tradeoffs
may be analyzed.

When used for erasure codes, there are typically five important operations:

1. Adding two numbers in GF(2^w). That's bitwise exclusive-or.

2. Multiplying two numbers in GF(2^w). Erasure codes are usually based on matrices in GF(2^w), and constructing
   these matrices requires both addition and multiplication.

3. Dividing two numbers in GF(2^w). Sometimes you need to divide to construct matrices (for example, Cauchy
   Reed-Solomon codes [BKK+95, Rab89]). More often, though, you use division to invert matrices for decoding.
   Sometimes it is easier to find a number's inverse than it is to divide. In that case, you can divide by multiplying
   by an inverse.

4. Adding two regions of numbers in GF(2^w), which will be explained along with...

5. Multiplying a region of numbers in GF(2^w) by a constant in GF(2^w). Erasure coding typically boils down
   to performing dot products in GF(2^w). For example, you may define a coding disk using the equation:

   c_0 = d_0 + 2 d_1 + 4 d_2 + 8 d_3.

   That looks like three multiplications and three additions (a short sketch of this computation in C appears right
   after this list). However, the way it's implemented in a disk system looks as in Figure 1. Large regions of disks
   are partitioned into w-bit words in GF(2^w). In the example, let us suppose that w = 8, and therefore that words
   are bytes. Then the regions pictured are 1 KB from each disk. The bytes on disk D_i are labeled d_{i,0},
   d_{i,1}, ..., d_{i,1023}, and the equation above is replicated 1024 times. For 0 ≤ j < 1024:

   c_{0,j} = d_{0,j} + 2 d_{1,j} + 4 d_{2,j} + 8 d_{3,j}.

   While it's possible to implement each of these 1024 equations independently, using the single multiplication
   and addition operations above, it is often much more efficient to aggregate. For example, most computer
   architectures support bitwise exclusive-or of 64 and 128 bit words. Thus, it makes much more sense to add
   regions of numbers in 64 or 128 bit chunks rather than as words in GF(2^w). Multiplying a region by a constant
   can leverage similar optimizations.
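
To make the single-word form of that dot product concrete, here is a minimal C sketch of our own (it is not one
of the manual's example programs); it assumes a gf_t that has already been initialized, e.g. with gf_init_easy()
as described in section 4.2:

    #include "gf_complete.h"

    /* Compute c0 = d0 + 2 d1 + 4 d2 + 8 d3 in GF(2^w), w <= 32.
       Addition is XOR; multiplication uses the library's function pointer. */
    uint32_t code_word(gf_t *gf, uint32_t d0, uint32_t d1, uint32_t d2, uint32_t d3)
    {
      return d0 ^ gf->multiply.w32(gf, 2, d1)
                ^ gf->multiply.w32(gf, 4, d2)
                ^ gf->multiply.w32(gf, 8, d3);
    }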

GF-Complete supports multiplication and division of single values for all values of w ≤ 32, plus w = 64 and
w = 128. It also supports adding two regions of memory (for any value of w, since addition equals XOR), and
multiplying a region by a constant in GF(2^4), GF(2^8), GF(2^16), GF(2^32), GF(2^64) and GF(2^128). These
values are chosen because words in GF(2^w) fit into machine words with these values of w. Other values of w don't
lend themselves to efficient multiplication of regions by constants (although see the "CAUCHY" option in section
6.1.5 for a way to multiply regions for other values of w).

Figure 1: An example of adding two regions of numbers, and multiplying a region of numbers by a constant in
GF(2^w). In this example, w = 8, and each disk is holding a 1KB region. The same coding equation,
c_{0,j} = d_{0,j} + a d_{1,j} + a^2 d_{2,j} + a^3 d_{3,j}, is applied 1024 times. However, rather than executing
this equation 1024 times, it is more efficient to implement this with three region-constant multiplications and three
region-region additions.

2  Files in the Library

This section provides an overview of the files that compose GF-Complete. They are partitioned among multiple
directories.

2.1  Header files in the directory "include"

The following header files are part of GF-Complete.

• gf_complete.h: This is the header file that applications should include. It defines the gf_t type, which holds
  all of the data that you need to perform the various operations in GF(2^w). It also defines all of the arithmetic
  operations. For an application to use this library, you should include gf_complete.h and then compile with the
  library src/libgf_complete.la.

• gf_method.h: If you want to modify the implementation techniques from the defaults, this file provides a
  "helper" function so that you can do it from the Unix command line.

• gf_general.h: This file has helper routines for doing basic Galois Field operations with any legal value of w.
  The problem is that w ≤ 32, w = 64 and w = 128 all have different data types, which is a pain. The procedures
  in this file try to alleviate that pain. They are used in gf_mult, gf_unit and gf_time. I'm guessing that most
  applications won't use them, as most applications use w ≤ 32.

• gf_rand.h: I've learned that srand48() and its kin are not supported in all C installations. Therefore, this file
  defines some random number generators to help test the programs. The random number generator is the
  "Mother of All" random number generator [Mar94], which we've selected because it has no patent issues.
  gf_unit and gf_time use these random number generators. (A short usage sketch follows this list.)

• gf_int.h: This is an internal header file that the various source files use. This is not intended for applications to
  include.

• config.xx and stamp-h1 are created by autoconf, and should be ignored by applications.
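
As a rough illustration of the gf_rand.h interface (our sketch, not one of the manual's examples), a test program
might seed the generator and draw a random w-bit value like this:

    #include <stdio.h>
    #include "gf_rand.h"

    int main()
    {
      uint32_t r;

      MOA_Seed(0);                 /* Seed the "Mother of All" generator. */
      r = MOA_Random_W(16, 1);     /* A random 16-bit value; zero is allowed. */
      printf("%u\n", (unsigned) r);
      return 0;
    }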

2.2  Source files in the "src" directory

The following C files compose gf_complete.a, and they are in the directory src. You shouldn't have to mess with
these files, but we include them in case you have to:

• gf.c: This implements all of the procedures in both gf_complete.h and gf_int.h.

• gf_w4.c: Procedures specific to w = 4.

• gf_w8.c: Procedures specific to w = 8.

• gf_w16.c: Procedures specific to w = 16.

• gf_w32.c: Procedures specific to w = 32.

• gf_w64.c: Procedures specific to w = 64.

• gf_w128.c: Procedures specific to w = 128.

• gf_wgen.c: Procedures specific to other values of w between 1 and 31.

• gf_general.c: Procedures that let you manipulate general values, regardless of whether w ≤ 32, w = 64 or
  w = 128. (I.e., the procedures defined in gf_general.h.)

• gf_method.c: Procedures to help you switch between the various implementation techniques. (I.e., the
  procedures defined in gf_method.h.)

• gf_rand.c: The "Mother of All" random number generator. (I.e., the procedures defined in gf_rand.h.)

2.3  Library tools files in the "tools" directory

The following are tools to help you with Galois Field arithmetic, and with the library. They are explained in greater
detail elsewhere in this manual.

• gf_mult.c, gf_div.c and gf_add.c: Command line tools to do multiplication, division and addition by single
  numbers.

• gf_time.c: A program that times the procedures for given values of w and implementation options.

• time_tool.sh: A shell script that helps perform rough timings of the various multiplication, division and region
  operations in GF-Complete.

• gf_methods.c: A program that enumerates most of the implementation methods supported by GF-Complete.

• gf_poly.c: A program to identify irreducible polynomials in regular and composite Galois Fields.

2.4  The unit tester in the "test" directory

The test directory contains the program gf_unit.c, which performs a battery of unit tests on GF-Complete. This is
explained in more detail in section 6.3.

2.5  Example programs in the "examples" directory

There are seven example programs to help you understand various facets of GF-Complete. They are in the files
gf_example_x.c in the examples directory. They are explained in sections 4.2 through 4.5, and section 7.9.

3  Compilation

From revision 1.02 forward, we are using autoconf. The old "flag tester" directory is now gone, as it is no longer in
use.

To compile and install, you should do the standard operations that you do with most open source Unix code:

UNIX> ./configure
...
UNIX> make
...
UNIX> sudo make install

If you perform the install, then the header, source, tool, and library files will be moved to system locations. In
particular, you may then compile the library by linking with the flag -lgf_complete, and you may use the tools from a
global executable directory (like /usr/local/bin).
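
For instance, once the install has completed, compiling one of the example programs against the installed library
looks roughly like this (the exact include and library paths may differ on your system):

UNIX> gcc -o gf_example_1 gf_example_1.c -lgf_complete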

If you don't perform the install, then the header and tool files will be in their respective directories, and the library
will be in src/libgf_complete.la.

If your system supports the various Intel SIMD instructions, the compiler will find them, and GF-Complete will
use them by default.

4  Some Tools and Examples to Get You Started

4.1  Three Simple Command Line Tools: gf_mult, gf_div and gf_add

Before delving into the library, it may be helpful to explore Galois Field arithmetic with the command line tools:
gf_mult, gf_div and gf_add. These perform multiplication, division and addition on elements in GF(2^w). If these
are not installed on your system, then you may find them in the tools directory. Their syntax is:

• gf_mult a b w - Multiplies a and b in GF(2^w).

• gf_div a b w - Divides a by b in GF(2^w).

• gf_add a b w - Adds a and b in GF(2^w).

You may use any value of w from 1 to 32, plus 64 and 128. By default, the values are read and printed in decimal;
however, if you append an 'h' to w, then a, b and the result will be printed in hexadecimal. For w = 128, the 'h' is
mandatory, and all values will be printed in hexadecimal.

Try them out on some examples like the ones below. You of course don't need to know that, for example, 5 * 4 = 7
in GF(2^4); however, once you know that, you know that 7/5 = 4 and 7/4 = 5. You should be able to verify the
gf_add statements below in your head. As for the other gf_mult's, you can simply verify that division and
multiplication work with each other as you hope they would.

UNIX> gf_mult 5 4 4
7
UNIX> gf_div 7 5 4
4
UNIX> gf_div 7 4 4
5
UNIX> gf_mult 8000 2 16h
100b
UNIX> gf_add f0f0f0f0f0f0f0f0 1313131313131313 64h
e3e3e3e3e3e3e3e3
UNIX> gf_mult f0f0f0f0f0f0f0f0 1313131313131313 64h
8da08da08da08da0
UNIX> gf_div 8da08da08da08da0 1313131313131313 64h
f0f0f0f0f0f0f0f0
UNIX> gf_add f0f0f0f0f0f0f0f01313131313131313 1313131313131313f0f0f0f0f0f0f0f0 128h
e3e3e3e3e3e3e3e3e3e3e3e3e3e3e3e3
UNIX> gf_mult f0f0f0f0f0f0f0f01313131313131313 1313131313131313f0f0f0f0f0f0f0f0 128h
786278627862784982d782d782d7816e
UNIX> gf_div 786278627862784982d782d782d7816e f0f0f0f0f0f0f0f01313131313131313 128h
1313131313131313f0f0f0f0f0f0f0f0
UNIX>

Don't bother trying to read the source code of these programs yet. Start with some simpler examples like the ones
below.

4.2  Quick Starting Example #1: Simple multiplication and division

The source files for these examples are in the examples directory.

These two examples are intended for those who just want to use the library without getting too complex. The
first example is gf_example_1, and it takes one command line argument - w, which must be between 1 and 32. It
generates two random non-zero numbers in GF(2^w) and multiplies them. After doing that, it divides the product by
each number.

To perform multiplication and division in GF(2^w), you must declare an instance of the gf_t type, and then
initialize it for GF(2^w) by calling gf_init_easy(). This is done in gf_example_1.c with the following lines:

gf_t gf;
...
if (!gf_init_easy(&gf, w)) {
  fprintf(stderr, "Couldn't initialize GF structure.\n");
  exit(0);
}

Once gf is initialized, you may use it for multiplication and division with the function pointers multiply.w32 and
divide.w32. These work for any element of GF(2^w) so long as w ≤ 32.

c = gf.multiply.w32(&gf, a, b);
printf("%u * %u = %u\n", a, b, c);
printf("%u / %u = %u\n", c, a, gf.divide.w32(&gf, c, a));
printf("%u / %u = %u\n", c, b, gf.divide.w32(&gf, c, b));

Go ahead and test this program out. You can use gf_mult and gf_div to verify the results:

UNIX> gf_example_1 4
12 * 4 = 5
5 / 12 = 4
5 / 4 = 12
UNIX> gf_mult 12 4 4
5
UNIX> gf_example_1 16
14411 * 60911 = 44568
44568 / 14411 = 60911
44568 / 60911 = 14411
UNIX> gf_mult 14411 60911 16
44568
UNIX>

gf_init_easy() (and later, gf_init_hard()) call malloc() to implement internal structures. To release memory, call
gf_free(). Please see section 6.4 to see how to call gf_init_hard() in such a way that it doesn't call malloc().
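
Putting these pieces together, a complete program that initializes a field, multiplies two numbers and releases the
memory might look like the following minimal sketch (ours, not one of the manual's examples; the second argument
to gf_free() is described in section 6.4):

    #include <stdio.h>
    #include "gf_complete.h"

    int main()
    {
      gf_t gf;
      uint32_t c;

      if (!gf_init_easy(&gf, 8)) return 1;    /* Initialize for GF(2^8). */
      c = gf.multiply.w32(&gf, 5, 4);         /* c = 5 * 4 in GF(2^8). */
      printf("5 * 4 = %u\n", c);
      gf_free(&gf, 0);                        /* Release what gf_init_easy() allocated. */
      return 0;
    }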

4.3  Quick Starting Example #2: Multiplying a region by a constant

The program gf_example_2 expands on gf_example_1. If w is equal to 4, 8, 16 or 32, it performs a region multiply
operation. It allocates two sixteen-byte regions, r1 and r2, and then multiplies r1 by a and puts the result in r2 using
the multiply_region.w32 function pointer:

gf.multiply_region.w32(&gf, r1, r2, a, 16, 0);

That last argument specifies whether to simply place the product into r2 or to XOR it with the contents that are
already in r2. Zero means to place the product there. When we run it, it prints the results of the multiply_region.w32
in hexadecimal. Again, you can verify it using gf_mult:

UNIX> gf_example_2 4
12 * 2 = 11
11 / 12 = 2
11 / 2 = 12
multiply_region by 0xc (12)
R1 (the source):  0 2 d 9 d 6 8 a 8 d b 3 5 c 1 8 8 e b 0 6 1 5 a 2 c 4 b 3 9 3 6
R2 (the product): 0 b 3 6 3 e a 1 a 3 d 7 9 f c a a 4 d 0 e c 9 1 b f 5 d 7 6 7 e
UNIX> gf_example_2 16
49598 * 35999 = 19867
19867 / 49598 = 35999
19867 / 35999 = 49598
multiply_region by 0xc1be (49598)
R1 (the source):  8c9f b30e 5bf3 7cbb 16a9 105d 9368 4bbe
R2 (the product): 4d9b 992d 02f2 c95c 228e ec82 324e 35e4
UNIX> gf_mult c1be 8c9f 16h
4d9b
UNIX> gf_mult c1be b30e 16h
992d
UNIX>
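
Driving the region operations yourself is equally simple. The following is a minimal sketch of our own (not one of
the manual's examples) that multiplies a 16-byte region by one constant and then XORs in a second product, using
the two modes of the last argument described above:

    #include <string.h>
    #include "gf_complete.h"

    int main()
    {
      gf_t gf;
      uint8_t r1[16], r2[16];

      if (!gf_init_easy(&gf, 8)) return 1;
      memset(r1, 0x0f, sizeof(r1));                    /* Fill the source with a test pattern. */
      gf.multiply_region.w32(&gf, r1, r2, 5, 16, 0);   /* r2 = 5 * r1 (overwrite). */
      gf.multiply_region.w32(&gf, r1, r2, 7, 16, 1);   /* r2 = r2 XOR (7 * r1). */
      gf_free(&gf, 0);
      return 0;
    }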

4.4  Quick Starting Example #3: Using w = 64

The program in gf_example_3.c is identical to the previous program, except it uses GF(2^64). Now a, b and c are
uint64_t's, and you have to use the function pointers that have w64 extensions so that the larger types may be
employed.

UNIX> gf_example_3
a9af3adef0d23242 * 61fd8433b25fe7cd = bf5acdde4c41ee0c
bf5acdde4c41ee0c / a9af3adef0d23242 = 61fd8433b25fe7cd
bf5acdde4c41ee0c / 61fd8433b25fe7cd = a9af3adef0d23242
multiply_region by a9af3adef0d23242
R1 (the source):  61fd8433b25fe7cd 272d5d4b19ca44b7 3870bf7e63c3451a 08992149b3e2f8b7
R2 (the product): bf5acdde4c41ee0c ad2d786c6e4d66b7 43a7d857503fd261 d3d29c7be46b1f7c
UNIX> gf_mult a9af3adef0d23242 61fd8433b25fe7cd 64h
bf5acdde4c41ee0c
UNIX>

4.5  Quick Starting Example #4: Using w = 128

Finally, the program in gf_example_4.c uses GF(2^128). Since there is not universal support for uint128_t, the
library represents 128-bit numbers as arrays of two uint64_t's. The function pointers for multiplication, division and
region multiplication now accept the return values as arguments:

gf.multiply.w128(&gf, a, b, c);

Again, we can use gf_mult and gf_div to verify the results:

UNIX> gf_example_4
e252d9c145c0bf29b85b21a1ae2921fa * b23044e7f45daf4d70695fb7bf249432 =
7883669ef3001d7fabf83784d52eb414
multiply_region by e252d9c145c0bf29b85b21a1ae2921fa
R1 (the source):  f4f56f08fa92494c5faa57ddcd874149 b4c06a61adbbec2f4b0ffc68e43008cb
R2 (the product): b1e34d34b031660676965b868b892043 382f12719ffe3978385f5d97540a13a1
UNIX> gf_mult e252d9c145c0bf29b85b21a1ae2921fa f4f56f08fa92494c5faa57ddcd874149 128h
b1e34d34b031660676965b868b892043
UNIX> gf_div 382f12719ffe3978385f5d97540a13a1 b4c06a61adbbec2f4b0ffc68e43008cb 128h
e252d9c145c0bf29b85b21a1ae2921fa
UNIX>
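
Because 128-bit values are represented as two uint64_t's, application code passes pointers for the operands and
the result. A minimal sketch of our own (not gf_example_4.c itself), using MOA_Random_128() from gf_rand.h to
generate operands:

    #include <stdio.h>
    #include "gf_complete.h"
    #include "gf_rand.h"

    int main()
    {
      gf_t gf;
      uint64_t a[2], b[2], c[2];

      if (!gf_init_easy(&gf, 128)) return 1;
      MOA_Seed(0);
      MOA_Random_128(a);
      MOA_Random_128(b);
      gf.multiply.w128(&gf, a, b, c);   /* c = a * b in GF(2^128). */
      printf("%016llx%016llx\n", (unsigned long long) c[0], (unsigned long long) c[1]);
      gf_free(&gf, 0);
      return 0;
    }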

5  Important Information on Alignment when Multiplying Regions

In order to make multiplication of regions fast, we often employ 64 and 128 bit instructions. This has ramifications
for pointer alignment, because we want to avoid bus errors, and because on many machines, loading and
manipulating aligned quantities is much faster than unaligned quantities.

When you perform multiply_region.wxx(gf, source, dest, value, size, add), there are three requirements:

1. The pointers source and dest must be aligned for w-bit words. For w = 4 and w = 8, there is no restriction;
   however for w = 16, the pointers must be multiples of 2, for w = 32, they must be multiples of 4, and for
   w ∈ {64, 128}, they must be multiples of 8.

2. The size must be a multiple of ⌈w/8⌉. With w = 4 and w = 8, ⌈w/8⌉ = 1 and there is no restriction. The other
   sizes must be multiples of ⌈w/8⌉ because you have to be multiplying whole elements of GF(2^w).

3. The source and dest pointers must be aligned identically with respect to each other for the implementation
   chosen. This is subtle, and we explain it in detail in the next few paragraphs. However, if you'd rather not figure
   it out, the following recommendation will always work in GF-Complete:

   If you want to be safe, make sure that source and dest are both multiples of 16. That is not a strict
   requirement, but it will always work!
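
One easy way to honor that recommendation on POSIX systems is to allocate regions with posix_memalign()
instead of malloc(). This is our suggestion, not a requirement of the library:

    #include <stdlib.h>

    /* Return a size-byte region aligned on a 16-byte boundary, or NULL on failure. */
    void *alloc_aligned_region(size_t size)
    {
      void *p;

      if (posix_memalign(&p, 16, size) != 0) return NULL;
      return p;
    }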

If you want to relax the above recommendation, please read further.

When performing multiply_region.wxx(), the implementation is typically optimized for a region of bytes whose
size must be a multiple of a variable s, and which must be aligned to a multiple of another variable t. For example,
when doing multiply_region.w32() in GF(2^16) with SSE enabled, the implementation is optimized for regions of
32 bytes, which must be aligned on a 16-byte quantity. Thus, s = 32 and t = 16. However, we don't want
multiply_region.w32() to be too restrictive, so instead of requiring source and dest to be aligned to 16-byte regions,
we require that (source mod 16) equal (dest mod 16). Or, in general, that (source mod t) equal (dest mod t).

Then, multiply_region.wxx() proceeds in three phases. In the first phase, multiply.wxx() is called on successive
words until (source mod t) equals zero. The second phase then performs the optimized region multiplication on
chunks of s bytes, until the remaining part of the region is less than s bytes. At that point, the third phase calls
multiply.wxx() on the last part of the region.

A detailed example helps to illustrate. Suppose we make the following call in GF(2^16) with SSE enabled:

multiply_region.w32(gf, 0x10006, 0x20006, a, 274, 0)

Figure 2: Example of multiplying a region of 274 bytes in GF(2^16) when (source mod 16) = (dest mod 16) = 6.
The alignment parameters are s = 32 and t = 16. The multiplication is in three phases, which correspond to the
initial unaligned region (10 bytes), the aligned region of s-byte chunks (256 bytes), and the final leftover region
(8 bytes).

First, note that source and dest are aligned on two-byte quantities, which they must be in GF(2^16). Second, note
that size is a multiple of 16/8 = 2. And last, note that (source mod 16) equals (dest mod 16). We illustrate the three
phases of region multiplication in Figure 2. Because (source mod 16) = 6, there are 10 bytes of unaligned words that
are multiplied with five calls to multiply.w32() in the first phase. The second phase multiplies 256 bytes (eight
chunks of s = 32 bytes) using the SSE instructions. That leaves 8 bytes remaining for the third phase.
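
The sizes of the three phases are easy to compute from s, t, the pointer and the region size. A small sketch of our
own that reproduces the arithmetic of this example:

    #include <stdio.h>

    int main()
    {
      unsigned long source = 0x10006, size = 274;
      unsigned long t = 16, s = 32;

      unsigned long phase1 = (t - (source % t)) % t;      /* Unaligned head: 10 bytes. */
      unsigned long phase2 = ((size - phase1) / s) * s;   /* Aligned s-byte chunks: 256 bytes. */
      unsigned long phase3 = size - phase1 - phase2;      /* Leftover tail: 8 bytes. */

      printf("%lu %lu %lu\n", phase1, phase2, phase3);    /* Prints: 10 256 8. */
      return 0;
    }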

When we describe the defaults and the various implementation options, we specify s and t as "alignment
parameters."

One of the advanced region options is using an alternate mapping of words to memory ("ALTMAP"). These interact
in a more subtle manner with alignment. Please see Section 7.9 for details.

6  The Defaults

GF-Complete implements a wide variety of techniques for multiplication, division and region multiplication. We
have set the defaults with three considerations in mind:

1. Speed: Obviously, we want the implementations to be fast. Therefore, we choose the fastest implementations
   that don't violate the other considerations. The compilation environment is considered. For example, if SSE is
   enabled, region multiplication in GF(2^4) employs a single multiplication table. If SSE is not enabled, then a
   "double" table is employed that performs table lookup two bytes at a time.

2. Memory Consumption: We try to keep the memory footprint of GF-Complete low. For example, the fastest
   way to perform multiply.w32() in GF(2^32) is to employ 1.75 MB of multiplication tables (see Section 7.4
   below). We do not include this as a default, however, because we want to keep the default memory consumption
   of GF-Complete low.

3. Compatibility with "standard" implementations: While there is no de facto standard of Galois Field
   arithmetic, most libraries implement the same fields. For that reason, we have not selected composite fields,
   alternate polynomials or memory layouts for the defaults, even though these would be faster. Again, see section
   7.7 for more information.

Table 1 shows the default methods used for each power-of-two word size, their alignment parameters s and t, their
memory consumption and their rough performance. The performance tests are on an Intel Core i7-3770 running at
3.40 GHz, and are included solely to give a flavor of performance on a standard microprocessor. Some processors
will be faster with some techniques and others will be slower, so we only put numbers in so that you can ballpark it.
For other values of w between 1 and 31, we use table lookup when w ≤ 8, discrete logarithms when w ≤ 16 and
"Bytwop" for w ≤ 32.

With SSE:

    w    Memory   multiply()       Performance    multiply_region()      s    t   Performance
         Usage    Implementation   (Mega Ops/s)   Implementation                  (MB/s)
    ---------------------------------------------------------------------------------------
    4    < 1K     Table            501            Table                  16   16  11,659
    8    136K     Table            501            Split Table (8,4)      16   16  11,824
    16   896K     Log              260            Split Table (16,4)     32   16   7,749
    32   < 1K     Carry-Free        48            Split Table (32,4)     64   16   5,011
    64   2K       Carry-Free        84            Split Table (64,4)    128   16   2,402
    128  64K      Carry-Free        48            Split Table (128,4)    16   16     833

Without SSE:

    w    Memory   multiply()       Performance    multiply_region()      s    t   Performance
         Usage    Implementation   (Mega Ops/s)   Implementation                  (MB/s)
    ---------------------------------------------------------------------------------------
    4    4K       Table            501            Double Table           16   16  11,659
    8    128K     Table            501            Table                   1    1   1,397
    16   896K     Log              266            Split Table (16,8)     32   16   2,135
    32   4K       Bytwop            19            Split Table (32,4)      4    4   1,149
    64   16K      Bytwop             9            Split Table (64,4)      8    8     987
    128  64K      Bytwop           1.4            Split Table (128,4)    16    8     833

Table 1: The default implementations, memory consumption and rough performance when w is a power of two. The
variables s and t are alignment variables described in Section 5.

A few comments on Table 1 are in order. First, with SSE, the performance of multiply() is faster when w = 64 than
when w = 32. That is because the primitive polynomial for w = 32, which has historically been used in Galois Field
implementations, is sub-ideal for using carry-free multiplication (PCLMUL). You can change this polynomial (see
section 7.7) so that the performance matches w = 64.

The region operations for w = 4 and w = 8 without SSE have been selected to have a low memory footprint. There
are better options that consume more memory, or that only work on large memory regions (see section 6.1.5).

6.1  Changing the Defaults

There are times that you may want to stray from the defaults. For example:

• You may want better performance.
• You may want a lower memory footprint.

• You may want to use a different Galois Field or even a ring.

• You only care about multiplying a region by the value two.

Our command line tools allow you to deviate from the defaults, and we have two C functions, gf_init_hard() and
create_gf_from_argv(), that can be called from application code to override the default methods. There are six
command-line tools that can be used to explore the many techniques implemented in GF-Complete:

• gf_methods is a tool that enumerates most of the possible command-line arguments that can be sent to the other
  tools.

• gf_mult and gf_div are explained above. You may change the multiplication and division technique in these
  tools if you desire.

• gf_unit performs unit tests on a set of techniques to verify correctness.

• gf_time measures the performance of a particular set of techniques.

• time_tool.sh makes some quick calls to gf_time so that you may gauge rough performance.

• gf_poly tests the irreducibility of polynomials in a Galois Field.

To change the default behavior in application code, you need to call gf_init_hard() rather than gf_init_easy().
Alternatively, you can use create_gf_from_argv(), included from gf_method.h, which uses an argv-style array of
strings to specify the options that you want. The procedure in gf_method.c parses the array and makes the proper
gf_init_hard() procedure call. This is the technique used to parse the command line in gf_mult, gf_div, gf_unit et al.

6.1.1  Changing the Components of a Galois Field with create_gf_from_argv()

There are five main components to every Galois Field instance:

• w

• Multiplication technique

• Division technique

• Region technique(s)

• Polynomial

The procedures gf_init_hard() and create_gf_from_argv() allow you to specify these parameters when you create
your Galois Field instance. We focus first on create_gf_from_argv(), because that is how the tools allow you to
specify the components. The prototype of create_gf_from_argv() is as follows:

int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting);

You pass it a pointer to a gf_t, which it will initialize. You specify the word size with the parameter w, and then you
pass it an argc/argv pair as in any C or C++ program. You also specify a starting argument, which is where in argv
the specifications begin. If it successfully parses argc and argv, then it creates the gf_t using gf_init_hard()
(described below in section 6.4). It returns one past the last index of argv that it considered when creating the gf_t.
If it fails, then it returns zero, and the gf_t is unmodified.

For example, gf_mult.c calls create_gf_from_argv() by simply passing argc and argv from its main() declaration,
and setting starting to 4.
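
Application code can build the argv-style array itself. The following minimal sketch is ours (not from gf_mult.c); it
requests the "SHIFT" multiplication technique in GF(2^4), with starting set to 0 and gf_error() (described below)
reporting the reason if parsing fails:

    #include <stdio.h>
    #include "gf_complete.h"
    #include "gf_method.h"

    int main()
    {
      gf_t gf;
      char *args[] = { "-m", "SHIFT", "-" };

      if (create_gf_from_argv(&gf, 4, 3, args, 0) == 0) {
        gf_error();    /* Print why create_gf_from_argv() failed. */
        return 1;
      }
      printf("5 * 4 = %u\n", gf.multiply.w32(&gf, 5, 4));
      gf_free(&gf, 0);
      return 0;
    }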

To choose defaults, argv[starting] should equal "-". Otherwise, you specify the component that you are changing
with "-m" for multiplication technique, "-d" for division technique, "-r" for region technique, and "-p" for the
polynomial. You may change multiple components. You end your specification with a single dash. For example, the
following call multiplies 6 and 5 in GF(2^4) with polynomial 0x19 using the "SHIFT" technique for multiplication
(we'll explain these parameters later):

UNIX> ./gf_mult 6 5 4 -p 0x19 -m SHIFT -
7
UNIX>

If create_gf_from_argv() fails, then you can call the procedure gf_error(), which prints out the reason why
create_gf_from_argv() failed.
6.1.2 Changing the Polynomial

+ +Galois Fields are typically implemented by representing numbers as polynomials with binary coefficients, and then +using the properties of polynomials to define addition and multiplication. You do not need to understand any of that to +use this library. However, if you want to learn more about polynomial representations and how they construct fields, +please refer to The Paper. + +

Multiplication is based on a special polynomial that we will refer to here as the "defining polynomial." This +polynomial has binary coefficients and is of degree w. You may change the polynomial with "-p" and then a number +in hexadecimal (the leading "0x" is optional). It is assumed that the w-th bit of the polynomial is set - you may include +it or omit it. For example, if you wish to set the polynomial for GF(216) to x16 + x5 + x3 + x2 + 1, rather than its +default of x16 + x12 + x3 + x + 1, you may say "-p 0x1002d," "-p 1002d," "-p 0x2d" or "-p 2d." +We discuss changing the polynomial for three reasons in other sections:

+
    +
  • Leveraging carry-free multiplication (section 7.7).
  • +
  • Defining composite fields (section 7.6).
  • +
  • Implementing rings (section 7.8.1).
  • + +
+ +

+Some words about nomenclature with respect to the polynomial. A Galois Field requires the polynomial to be +irreducible .. That means that it cannot be factored. For example, when the coefficients are binary, the polynomial x5+ +x4+x+1 may be factored as (x4+1)(x+1). Therefore it is not irreducible and cannot be used to define a Galois Field. +It may, however, be used to define a ring. Please see section 7.8.1 for a discussion of ring support in GF-Complete.

+

+There is a subset of irreducible polynomials called primitive. These have an important property that one may enumerate +all of the elements of the field by raising 2 to successive posers. All of the default polynomials in GF-Complete +are primitive. However, so long as a polynomial is irreducible, it defines a Galois Field. Please see section 7.7 for a +further discussion of the polynomial.

+ +

+One thing that we want to stress here is that changing the polynomial changes the field, so fields with different +polynomialsmay not be used interchangeably. So long as the polynomial is irreducible, it generates a Galois Field that +is isomorphic to all other Galois Fields; however the multiplication and division of elements will differ. For example, +the polynomials 0x13 (the default) and 0x19 in GF(24) are both irreducible, so both generate valid Galois Fields. +However, their multiplication differs:


UNIX> gf_mult 8 2 4 -p 0x13 -
3
UNIX> gf_mult 8 2 4 -p 0x19 -
9
UNIX> gf_div 3 8 4 -p 0x13 -
2
UNIX> gf_div 9 8 4 -p 0x19 -
2
UNIX>

6.1.3 Changing the Multiplication Technique

The following list describes the multiplication techniques that may be changed with "-m". We keep the descriptions here brief. Please refer to The Paper for detailed descriptions of these techniques.

  • "TABLE:" Multiplication and division are implemented with tables. The tables consume quite a bit of memory +(2w × 2 w × w/ +8 bytes), so they are most useful when w is small. Please see "SSE," "LAZY," "DOUBLE" and + +"QUAD" under region techniques below for further modifications to "TABLE" to perform multiply_region()

  • + + +
  • "LOG:" This employs discrete (or "Zeph") logarithm tables to implement multiplication and division. The +memory usage is roughly (3 × 2w × w / +8 bytes), so they are most useful when w is small, but they tolerate +larger w than "TABLE." If the polynomial is not primitive (see section 6.1.2), then you cannot use "LOG" as +an implementation. In that case, gf_init_hard() or create_gf_from_argv() will fail

  • + + +
  • "LOG_ZERO:" Discrete logarithm tables which include extra room for zero entries. This more than doubles +the memory consumption to remove an if statement (please see [GMS08] or The Paper for more description). It +doesn’t really make a huge deal of difference in performance

  • + +
  • "LOG_ZERO_EXT:" This expends even more memory to remove another if statement. Again, please see The +Paper for an explanation. As with "LOG_ZERO," the performance difference is negligible

  • + +
  • "SHIFT:" Implementation straight from the definition of Galois Field multiplication, by shifting and XOR-ing, +then reducing the product using the polynomial. This is slooooooooow, so we don’t recommend you use it

  • + + +
  • "CARRY_FREE:" This is identical to "SHIFT," however it leverages the SSE instruction PCLMUL to perform +carry-freemultiplications in single instructions. As such, it is the fastest way to perform multiplication for large +values of w when that instruction is available. Its performance depends on the polynomial used. See The Paper +for details, and see section 7.7 below for the speedups available when w = 16 and w = 32 if you use a different +polynomial than the default one

  • + + +
  • "BYTWO_p:" This implements multiplication by successively multiplying the product by two and selectively +XOR-ing the multiplicand. See The Paper for more detail. It can leverage Anvin’s optimization that multiplies +64 and 128 bits of numbers in GF(2w) by two with just a few instructions. The SSE version requires SSE2

  • + + +
  • "BYTWO_b:" This implements multiplication by successively multiplying the multiplicand by two and selectively +XOR-ing it into the product. It can also leverage Anvin's optimization, and it has the feature that when +you're multiplying a region by a very small constant (like 2), it can terminate the multiplication early. As such, +if you are multiplying regions of bytes by two (as in the Linux RAID-6 Reed-Solomon code [Anv09]), this is +the fastest of the techniques, regardless of the value of w. The SSE version requires SSE2

  • + + +
  • "SPLIT:" Split multiplication tables (like the LR tables in [GMS08], or the SIMD tables for w ≤ 8 in [LHy08, +Anv09, PGM13b]). This argument must be followed by two more arguments, wa and wb, which are the index +sizes of the sub-tables. This implementation reduces the size of the table from "TABLE," but requires multiple +

  • + + + + + + +
    + +6     THE DEFAULTS 18


    +
      +table lookups. For example, the following multiplies 100 and 200 in GF(28) using two 4K tables, as opposed +to one 64K table when you use "TABLE:"

UNIX> ./gf_mult 100 200 8 -m SPLIT 8 4 -
79
UNIX>
See section 7.4 for additional information on the arguments to "SPLIT." The SSE version typically requires SSSE3.

    • "GROUP:" This implements the "left-to-right comb" technique [LBOX12]. I'm afraid we don't like that name, +so we call it "GROUP," because it performs table lookup on groups of bits for shifting (left) and reducing (right). +It takes two additional arguments - gs, which is the number of bits you use while shifting (left) and gr, which +is the number of bits you use while reducing (right). Increasing these arguments can you higher computational +speed, but requires more memory. SSE version exists only for w = 128 and it requires SSE4. For more +description on the arguments gs and gr, see section 7.5. For a full description of "GROUP" algorithm, please +see The Paper. +

    • "COMPOSITE:" This allows you to perform operations on a composite Galois Field, GF((2l)k) as described +in [GMS08], [LBOX12] and The Paper. The field size w is equal to lk. It takes one argument, which is k, and +then a specification of the base field. Currently, the only value of k that is supported is two. However, that may +change in a future revision of the library.

In order to specify the base field, put the appropriate flags after specifying k. The single dash ends the base field, and after that, you may continue making specifications for the composite field. This process can be continued for multiple layers of "COMPOSITE." As an example, the following multiplies 1000000 and 2000000 in GF((2^16)^2), where the base field uses BYTWO_p for multiplication:

./gf_mult 1000000 2000000 32 -m COMPOSITE 2 -m BYTWO_p - -

In the above example, "-m BYTWO_p -" applies to the base field, and the remaining text applies to the composite field. Composite fields have two defining polynomials - one for the composite field, and one for the base field. Thus, if you want to change polynomials, you should change both. The polynomial for the composite field must be of the form x^2 + sx + 1, where s is an element of the base field. To change it, you specify s (in hexadecimal) with "-p." In the example below, we multiply 20000 and 30000 in GF((2^8)^2), setting s to three, and using x^8 + x^4 + x^3 + x^2 + 1 as the polynomial for the base field:

./gf_mult 20000 30000 16 -m COMPOSITE 2 -p 0x11d - -p 0x3 -


If you use composite fields, you should consider using "ALTMAP" as well. The reason is that the region operations will go much faster. Please see section 7.6.

As with changing the polynomial, when you use a composite field GF((2^l)^k), you are using a different field than the "standard" field for GF(2^lk). All Galois Fields are isomorphic to each other, so they all have the desired properties; however, the fields themselves change when you use composite fields.


With the exception of "COMPOSITE," only one multiplication technique may be provided for a given Galois Field instance. Composite fields may use composite fields as their base fields, in which case the specification will be recursive.


6.1.4 Changing the Division Technique

There are two techniques for division that may be set with "-d". If "-d" is not specified, then appropriate defaults are employed. For example, when the multiplication technique is "TABLE," a table is created for division as well as multiplication. When "LOG" is specified, the logarithm tables are used for division. With "COMPOSITE," a special variant of Euclid's algorithm is employed that performs division using multiplication and division in the base field. Otherwise, Euclid's algorithm is used. Please see The Paper for a description of Euclid's algorithm applied to Galois Fields.

    If you use "-d", you must also specify the multiplication technique with "-m."


To force Euclid's algorithm instead of the defaults, you may specify it with "-d EUCLID." If, instead, you would rather convert elements of a Galois Field to a binary matrix and find an element's inverse by inverting the matrix, then specify "-d MATRIX." In all of our tests, "MATRIX" is slower than "EUCLID." "MATRIX" is also not defined for w > 32.
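In application code, the equivalent of "-m TABLE -d EUCLID -" can be requested through gf_init_hard() (described in section 6.4); here is a sketch using the constants from gf_complete.h:

gf_t gf;

/* w = 8 with TABLE multiplication, but forcing Euclid's algorithm
   for division instead of the division table. */
if (!gf_init_hard(&gf, 8, GF_MULT_TABLE, GF_REGION_DEFAULT,
                  GF_DIVIDE_EUCLID, 0, 0, 0, NULL, NULL)) {
  gf_error();
}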


6.1.5 Changing the Region Technique

The following are the region multiplication options ("-r"):

• "SSE:" Use SSE instructions. Initialization will fail if the instructions aren't supported. Table 2 details the multiplication techniques which can leverage SSE instructions and which versions of SSE are required.
Multiplication Technique   multiply()   multiply_region()   SSE Version   Comments
"TABLE"                    -            Yes                 SSSE3         Only for GF(2^4).
"SPLIT"                    -            Yes                 SSSE3         Only when the second argument equals 4.
"SPLIT"                    -            Yes                 SSE4          When w = 64 and not using "ALTMAP".
"BYTWO_p"                  -            Yes                 SSE2
"BYTWO_b"                  -            Yes                 SSE2

Table 2: Multiplication techniques which can leverage SSE instructions when they are available.


    • "NOSSE:" Force non-SSE version

    • "DOUBLE:" Use a table that is indexed on two words rather than one. This applies only to w = 4, where +the table is indexed on bytes rather than 4-bit quantities, and to w = 8, where the table is indexed on shorts +rather than bytes. In each case, the table lookup performs two multiplications at a time, which makes region +multiplication faster. It doubles the size of the lookup table.

    • "QUAD:" Use a table that is indexed on four words rather than two or one. This only applies to w = 4, where +the table is indexed on shorts. The "Quad" table may be lazily created or created ahead of time (the default). If +the latter, then it consumes 24 × 216 × 2 = 2 MB of memory.

    • "LAZY:" Typically it's clear whether tables are constructed upon initialization or lazily when a region operation +is performed. There are two times where it is ambiguous: "QUAD" when w = 4 and "DOUBLE" when w = 8. +If you don't specify anything, these tables are created upon initialization, consuming a lot of memory. If you +specify "LAZY," then the necessary row of the table is created lazily when you call "multiply_region(). +
    • + +
    • "ALTMAP:" Use an alternate mapping, where words are split across different subregions of memory. There +are two places where this matters. The first is when implementing "SPLIT w 4" using SSE when w > 8. In +these cases, each byte of the word is stored in a different 128-bit vector, which allows the implementation to +better leverage 16-byte table lookups. See section 7.4 for examples, and The Paper or [PGM13b] for detailed +explanations.

The second place where it matters is when using "COMPOSITE." In this case, it is advantageous to split each memory region into two chunks, and to store half of each word in a different chunk. This allows us to call region_multiply() recursively on the base field, which is much faster than the alternative. See section 7.6 for examples, and The Paper for an explanation.

It is important to note that with "ALTMAP," the words are not "converted" from a standard mapping to an alternate mapping and back again. They are assumed to always be in the alternate mapping. This typically doesn't matter, so long as you always use the same "ALTMAP" calls. Please see section 7.9 for further details on "ALTMAP," especially with respect to alignment.

    • "CAUCHY:" Break memory into w subregions and perform only XOR's as in Cauchy Reed-Solomon coding +[BKK+95] (also described in The Paper). This works for any value of w ≤ 32, even those that are not +powers of two. If SSE2 is available, then XOR's work 128 bits at a time. For "CAUCHY" to work correctly, +size must be a multiple of w .
    + + + +

It is possible to combine region multiplication options. This is fully supported as long as gf_methods has the combination listed. If multiple region options are required, they should be specified independently (as flags for gf_init_hard(), and independent options for command-line tools and create_gf_from_argv()).


6.2 Determining Supported Techniques with gf_methods

The program gf_methods prints a list of supported methods on standard output. It is called as follows:

./gf_methods w -BADC -LUMDRB

The first argument is w, which may be any legal value of w. The second argument has the following flags:

    • "B:" This only prints out "basic" methods that are useful for the given value of w . It omits "SHIFT" and other +methods that are never really going to be useful.

    • + +
    • "A:" In constrast, this specifies to print "all" methods.

    • + +
    • "D:" This includes the "EUCLID" and "MATRIX" methods for division. By default, they are not included.

    • + +
    • "C:" This includes the "CAUCHY" methods for region multiplication. By default, it is not included.

    • +
    +

You may specify more than one of these flags as the second argument. If you include both "B" and "A," then the last one specified is used.


The last argument determines the output format of gf_methods. If it is "L," then it simply lists methods. If it is "U," then the output contains gf_unit commands for each of the methods. For the others ("M," "D," "R" and "B"), the output contains time_tool.sh commands for Multiplication, Division, Region multiplications with multiple buffer sizes, and the Best region multiplication, respectively.


gf_methods enumerates combinations of flags and calls create_gf_from_argv() to see if the combinations are supported. Although it enumerates a large number of combinations, it doesn't enumerate all possible parameters for "SPLIT," "GROUP" or "COMPOSITE."


    Some examples of calling gf_methods are shown below in section 6.3.2.


6.3 Testing with gf_unit, gf_time, and time_tool.sh

gf_unit and gf_time may be used to verify that a combination of arguments works correctly and efficiently on your platform. If you plan to stray from the defaults, it is probably best to run both tools to ensure there are no issues with your environment. gf_unit runs a set of unit tests based on the arguments provided to the tool, and gf_time times Galois Field methods based on the provided arguments.

The usage of gf_unit is:

gf_unit w tests seed method

The usage of gf_time is:

gf_time w tests seed buffer-size iterations method

The seed is an integer; negative one uses the current time. The tests are specified by a listing of characters. The following tests are supported (all are supported by gf_time; only 'A', 'S' and 'R' are supported by gf_unit):

• 'M': Single multiplications

• 'D': Single divisions

• 'I': Single inverses

• 'G': Region multiplication of a buffer by a random constant

• '0': Region multiplication of a buffer by zero (does nothing and bzero())

• '1': Region multiplication of a buffer by one (does memcpy() and XOR)

• '2': Region multiplication of a buffer by two - sometimes this is faster than general multiplication

• 'S': All three single tests

• 'R': All four region tests

• 'A': All seven tests

Here are some examples of calling gf_unit and gf_time to verify that "-m SPLIT 32 4 -r ALTMAP -" works in GF(2^32), and to get a feel for its performance. First, we go to the test directory and call gf_unit:



UNIX> cd test
UNIX> ./gf_unit 32 A -1 -m SPLIT 32 4 -r ALTMAP -
Args: 32 A -1 -m SPLIT 32 4 -r ALTMAP - / size (bytes): 684
UNIX>

gf_unit reports on the arguments and how many bytes the gf_t consumes. If it discovers any problems or inconsistencies with multiplication, division or region multiplication, it will report them. Here, there are no problems. Next, we move to the tools directory and run performance tests on a 10K buffer, with 10,000 iterations of each test:

UNIX> cd ../tools
UNIX> ./gf_time 32 A -1 10240 10000 -m SPLIT 32 4 -r ALTMAP -
Seed: 1388435794
Multiply:                 4.090548 s   Mops: 24.414       5.968 Mega-ops/s
Divide:                  37.794962 s   Mops: 24.414       0.646 Mega-ops/s
Inverse:                 33.709875 s   Mops: 24.414       0.724 Mega-ops/s
Region-Random: XOR: 0     0.035210 s   MB:   97.656    2773.527 MB/s
Region-Random: XOR: 1     0.036081 s   MB:   97.656    2706.578 MB/s
Region-By-Zero: XOR: 0    0.003199 s   MB:   97.656   30523.884 MB/s
Region-By-Zero: XOR: 1    0.000626 s   MB:   97.656  156038.095 MB/s
Region-By-One: XOR: 0     0.003810 s   MB:   97.656   25628.832 MB/s
Region-By-One: XOR: 1     0.008363 s   MB:   97.656   11677.500 MB/s
Region-By-Two: XOR: 0     0.032942 s   MB:   97.656    2964.486 MB/s
Region-By-Two: XOR: 1     0.033488 s   MB:   97.656    2916.153 MB/s
UNIX>

The first column of output displays the name of the test performed. Region tests will test with and without the XOR flag being set (see Section 4.3 for an example). The second column displays the total time the test took to complete, measured in seconds (s). The third column displays the size of the test, measured in millions of operations (Mops) for single tests and in Megabytes (MB) for the region tests. The final column displays the speed of the tests, calculated from the second and third columns, and is where you should look to get an idea of a method's performance.


If the output of gf_unit and gf_time is to your satisfaction, you can incorporate the method into application code using create_gf_from_argv() or gf_init_hard().


    +The performance of "Region-By-Zero" and "Region-By-One" will not change from test to test, as all methods make +the same calls for these. "Region-By-Zero" with "XOR: 1" does nothing except set up the tests. Therefore, you may +use it as a control.


6.3.1 time_tool.sh

Finally, the shell script time_tool.sh makes a bunch of calls to gf_time to give a rough estimate of performance. It is called as follows:

usage: sh time_tool.sh M|D|R|B w method


The values for the first argument are M, D, R and B, for Multiplication, Division, Region multiplications with multiple buffer sizes, and the Best region multiplication. For the example above, let's call time_tool.sh to get a rough idea of performance:



UNIX> sh time_tool.sh M 32 -m SPLIT 32 4 -r ALTMAP -
M speed (MB/s):     6.03   W-Method: 32 -m SPLIT 32 4 -r ALTMAP -
UNIX> sh time_tool.sh D 32 -m SPLIT 32 4 -r ALTMAP -
D speed (MB/s):     0.65   W-Method: 32 -m SPLIT 32 4 -r ALTMAP -
UNIX> sh time_tool.sh R 32 -m SPLIT 32 4 -r ALTMAP -
Region Buffer-Size:  16K (MB/s):  3082.91   W-Method: 32 -m SPLIT 32 4 -r ALTMAP -
Region Buffer-Size:  32K (MB/s):  3529.07   W-Method: 32 -m SPLIT 32 4 -r ALTMAP -
Region Buffer-Size:  64K (MB/s):  3749.94   W-Method: 32 -m SPLIT 32 4 -r ALTMAP -
Region Buffer-Size: 128K (MB/s):  3861.27   W-Method: 32 -m SPLIT 32 4 -r ALTMAP -
Region Buffer-Size: 512K (MB/s):  3820.82   W-Method: 32 -m SPLIT 32 4 -r ALTMAP -
Region Buffer-Size:   1M (MB/s):  3737.41   W-Method: 32 -m SPLIT 32 4 -r ALTMAP -
Region Buffer-Size:   2M (MB/s):  3002.90   W-Method: 32 -m SPLIT 32 4 -r ALTMAP -
Region Buffer-Size:   4M (MB/s):  2760.77   W-Method: 32 -m SPLIT 32 4 -r ALTMAP -
Region Best (MB/s):  3861.27   W-Method: 32 -m SPLIT 32 4 -r ALTMAP -
UNIX> sh time_tool.sh B 32 -m SPLIT 32 4 -r ALTMAP -
Region Best (MB/s):  3929.09   W-Method: 32 -m SPLIT 32 4 -r ALTMAP -
UNIX>

We say that time_tool.sh is "rough" because it tries to limit each test to 5 ms or less. Thus, the time granularity is fine, which means that the numbers may not be as precise as they would be were the time granularity coarse. When in doubt, you should make your own calls to gf_time with a lot of iterations, so that startup costs and roundoff error may be minimized.


6.3.2 An example of gf_methods and time_tool.sh



Let's give an example of how some of these components fit together. Suppose we want to explore the basic techniques in GF(2^32). First, let's take a look at what gf_methods suggests as "basic" methods:

UNIX> gf_methods 32 -B -L
w=32: -
w=32: -m GROUP 4 8 -
w=32: -m SPLIT 32 4 -
w=32: -m SPLIT 32 4 -r ALTMAP -
w=32: -m SPLIT 32 8 -
w=32: -m SPLIT 8 8 -
w=32: -m COMPOSITE 2 - -
w=32: -m COMPOSITE 2 - -r ALTMAP -
UNIX>


You'll note that this is on my old MacBook Pro, which doesn't support PCLMUL, so "CARRY_FREE" is not included as an option. Now, let's run the unit tester on these to make sure they work, and to see their memory consumption:



UNIX> gf_methods 32 -B -U
../test/gf_unit 32 A -1 -
../test/gf_unit 32 A -1 -m GROUP 4 8 -
../test/gf_unit 32 A -1 -m SPLIT 32 4 -
../test/gf_unit 32 A -1 -m SPLIT 32 4 -r ALTMAP -
../test/gf_unit 32 A -1 -m SPLIT 32 8 -
../test/gf_unit 32 A -1 -m SPLIT 8 8 -
../test/gf_unit 32 A -1 -m COMPOSITE 2 - -
../test/gf_unit 32 A -1 -m COMPOSITE 2 - -r ALTMAP -
UNIX> gf_methods 32 -B -U | sh
Args: 32 A -1 - / size (bytes): 684
Args: 32 A -1 -m GROUP 4 8 - / size (bytes): 1296
Args: 32 A -1 -m SPLIT 32 4 - / size (bytes): 684
Args: 32 A -1 -m SPLIT 32 4 -r ALTMAP - / size (bytes): 684
Args: 32 A -1 -m SPLIT 32 8 - / size (bytes): 4268
Args: 32 A -1 -m SPLIT 8 8 - / size (bytes): 1839276
Args: 32 A -1 -m COMPOSITE 2 - - / size (bytes): 524648
Args: 32 A -1 -m COMPOSITE 2 - -r ALTMAP - / size (bytes): 524648
UNIX>

As anticipated, "SPLIT 8 8" consumes quite a bit of memory! Now, let's see how well they perform with both single multiplications and region multiplications:



UNIX> gf_methods 32 -B -M
sh time_tool.sh M 32 -
sh time_tool.sh M 32 -m GROUP 4 8 -
sh time_tool.sh M 32 -m SPLIT 32 4 -
sh time_tool.sh M 32 -m SPLIT 32 4 -r ALTMAP -
sh time_tool.sh M 32 -m SPLIT 32 8 -
sh time_tool.sh M 32 -m SPLIT 8 8 -
sh time_tool.sh M 32 -m COMPOSITE 2 - -
sh time_tool.sh M 32 -m COMPOSITE 2 - -r ALTMAP -
UNIX> gf_methods 32 -B -M | sh
M speed (MB/s):     5.90   W-Method: 32 -
M speed (MB/s):    14.09   W-Method: 32 -m GROUP 4 8 -
M speed (MB/s):     5.60   W-Method: 32 -m SPLIT 32 4 -
M speed (MB/s):     5.19   W-Method: 32 -m SPLIT 32 4 -r ALTMAP -
M speed (MB/s):     5.98   W-Method: 32 -m SPLIT 32 8 -
M speed (MB/s):    22.10   W-Method: 32 -m SPLIT 8 8 -
M speed (MB/s):    34.98   W-Method: 32 -m COMPOSITE 2 - -
M speed (MB/s):    34.16   W-Method: 32 -m COMPOSITE 2 - -r ALTMAP -
UNIX> gf_methods 32 -B -B | sh
Region Best (MB/s):  2746.76   W-Method: 32 -
Region Best (MB/s):   177.06   W-Method: 32 -m GROUP 4 8 -
Region Best (MB/s):  2818.75   W-Method: 32 -m SPLIT 32 4 -
Region Best (MB/s):  3818.21   W-Method: 32 -m SPLIT 32 4 -r ALTMAP -
Region Best (MB/s):   728.68   W-Method: 32 -m SPLIT 32 8 -
Region Best (MB/s):   730.97   W-Method: 32 -m SPLIT 8 8 -
Region Best (MB/s):   190.20   W-Method: 32 -m COMPOSITE 2 - -
Region Best (MB/s):  1837.99   W-Method: 32 -m COMPOSITE 2 - -r ALTMAP -
UNIX>

The default is quite a bit slower than the best performing methods for both single and region multiplication. So why are the defaults the way that they are? As detailed at the beginning of this chapter, we strive for lower memory consumption, so we don't use "SPLIT 8 8," which consumes 1.75 MB. We don't implement alternate fields by default, which is why we don't use "COMPOSITE." Finally, we don't implement alternate mappings of memory by default, which is why we don't use "-m SPLIT 32 4 -r ALTMAP -."


    Of course, you may change these defaults if you please.


Test question: Given the numbers above, it would appear that "COMPOSITE" yields the fastest performance of single multiplication, while "SPLIT 32 4" yields the fastest performance of region multiplication. Should I use two gf_t's in my application - one for single multiplication that uses "COMPOSITE," and one for region multiplication that uses "SPLIT 32 4?"


The answer to this is "no." Why? Because composite fields are different from the "standard" fields, and if you mix these two gf_t's, then you are using different fields for single multiplication and region multiplication. Please read section 7.2 for a little more information on this.


6.4 Calling gf_init_hard()

We recommend that you use create_gf_from_argv() instead of gf_init_hard(). However, there are extra things that you can do with gf_init_hard(). Here's the prototype:

int gf_init_hard(gf_t *gf,
                 int w,
                 int mult_type,
                 int region_type,
                 int divide_type,
                 uint64_t prim_poly,
                 int arg1,
                 int arg2,
                 GFP base_gf,
                 void *scratch_memory);


The arguments mult_type, region_type and divide_type allow for the same specifications as above, except the types are integer constants defined in gf_complete.h:

typedef enum {GF_MULT_DEFAULT,
              GF_MULT_SHIFT,
              GF_MULT_CARRY_FREE,
              GF_MULT_GROUP,
              GF_MULT_BYTWO_p,
              GF_MULT_BYTWO_b,
              GF_MULT_TABLE,
              GF_MULT_LOG_TABLE,
              GF_MULT_LOG_ZERO,
              GF_MULT_LOG_ZERO_EXT,
              GF_MULT_SPLIT_TABLE,
              GF_MULT_COMPOSITE } gf_mult_type_t;

#define GF_REGION_DEFAULT      (0x0)
#define GF_REGION_DOUBLE_TABLE (0x1)
#define GF_REGION_QUAD_TABLE   (0x2)
#define GF_REGION_LAZY         (0x4)
#define GF_REGION_SSE          (0x8)
#define GF_REGION_NOSSE        (0x10)
#define GF_REGION_ALTMAP       (0x20)
#define GF_REGION_CAUCHY       (0x40)

typedef enum { GF_DIVIDE_DEFAULT,
               GF_DIVIDE_MATRIX,
               GF_DIVIDE_EUCLID } gf_division_type_t;

You can mix the region types with bitwise or. The arguments to GF_MULT_GROUP, GF_MULT_SPLIT_TABLE and GF_MULT_COMPOSITE are specified in arg1 and arg2. GF_MULT_COMPOSITE also takes a base field in base_gf. The base field is itself a gf_t, which should have been created previously with create_gf_from_argv(), gf_init_easy() or gf_init_hard(). Note that this base_gf has its own base_gf member and can be a composite field itself.
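For example, here is a sketch of building GF((2^16)^2) on top of a default GF(2^16) base field, roughly the equivalent of "-m COMPOSITE 2 - -" with w = 32:

gf_t base, composite;

gf_init_easy(&base, 16);     /* the default GF(2^16) */
if (!gf_init_hard(&composite, 32, GF_MULT_COMPOSITE, GF_REGION_DEFAULT,
                  GF_DIVIDE_DEFAULT, 0, 2, 0, &base, NULL)) {
  gf_error();
}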


You can specify an alternate polynomial in prim_poly. For w ≤ 32, the leftmost one (the one in bit position w) is optional. If you omit it, it will be added for you. For w = 64, there's no room for that bit, so you have to leave it off. For w = 128, your polynomial can only use the bottom-most 64 bits. Fortunately, the standard polynomial only uses those bits. If you set prim_poly to zero, the library selects the "standard" polynomial.


Finally, scratch_memory is there in case you don't want gf_init_hard() to call malloc(). You may call gf_scratch_size() to find out how much extra memory each technique uses, and then you may pass gf_init_hard() a pointer for it to use in scratch_memory. If you set scratch_memory to NULL, then the extra memory is allocated for you with malloc(). If you use gf_init_easy() or create_gf_from_argv(), or you use gf_init_hard() and set scratch_memory to NULL, then you should call gf_free() to free memory. If you use gf_init_hard() with your own scratch_memory, you can still call gf_free(), and it will not do anything.


Both gf_init_hard() and gf_scratch_size() return zero if the arguments don't specify a valid gf_t. When that happens, you can call gf_error() to print why the call failed.


We'll give you one example of calling gf_init_hard(). Suppose you want to make a gf_init_hard() call that is equivalent to "-m SPLIT 16 4 -r SSE -r ALTMAP -", and you want to allocate the scratch space yourself. Then you'd do the following:



gf_t gf;
void *scratch;
int size;

size = gf_scratch_size(16, GF_MULT_SPLIT_TABLE,
                       GF_REGION_SSE | GF_REGION_ALTMAP,
                       GF_DIVIDE_DEFAULT,
                       16, 4);
if (size == 0) { gf_error(); exit(1); }   /* It failed. That shouldn't happen. */
scratch = (void *) malloc(size);
if (scratch == NULL) { perror("malloc"); exit(1); }
if (!gf_init_hard(&gf, 16, GF_MULT_SPLIT_TABLE,
                  GF_REGION_SSE | GF_REGION_ALTMAP,
                  GF_DIVIDE_DEFAULT,
                  0, 16, 4, NULL, scratch)) {
  gf_error();
  exit(1);
}

6.5 gf_size()

You can call gf_size(gf_t *gf) to learn the memory consumption of the gf_t. It returns all memory consumed by the gf_t, including the gf_t itself, any scratch memory required by the gf_t, and the memory consumed by the sub-field if the field is "COMPOSITE." If you provided your own memory to gf_init_hard(), it does not report the size of that memory, but what the size should be, as determined by gf_scratch_size(). gf_unit prints out the return value of gf_size() on the given field.
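For example, this fragment (a sketch) prints the footprint of a default GF(2^32):

gf_t gf;

gf_init_easy(&gf, 32);
printf("%llu bytes\n", (unsigned long long) gf_size(&gf));
gf_free(&gf, 0);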

7 Further Information on Options and Algorithms


7.1 Inlining Single Multiplication and Division for Speed

Obviously, procedure calls are more expensive than single instructions, and the mechanics of multiplication in "TABLE" and "LOG" are pretty simple. For that reason, we support inlining for "TABLE" when w = 4 and w = 8, and for "LOG" when w = 16. We elaborate below.

When w = 4, you may inline multiplication and division as follows. The following procedures return pointers to the multiplication and division tables respectively:



uint8_t *gf_w4_get_mult_table(gf_t *gf);
uint8_t *gf_w4_get_div_table(gf_t *gf);

The macro GF_W4_INLINE_MULTDIV(table, a, b) then multiplies or divides a by b using the given table. This of course only works if the multiplication technique is "TABLE," which is the default for w = 4. If the multiplication technique is not "TABLE," then gf_w4_get_mult_table() will return NULL.
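Here is a sketch of the w = 4 case; with the default polynomial 0x13, it prints 13, the product of 6 and 5:

gf_t gf;
uint8_t *mt;

gf_init_easy(&gf, 4);               /* the default for w = 4 is "TABLE" */
mt = gf_w4_get_mult_table(&gf);
if (mt != NULL) {
  /* Multiply 6 by 5 with a table lookup instead of a procedure call. */
  printf("%u\n", GF_W4_INLINE_MULTDIV(mt, 6, 5));
}
gf_free(&gf, 0);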


When w = 8, the procedures gf_w8_get_mult_table() and gf_w8_get_div_table(), and the macro GF_W8_INLINE_MULTDIV(table, a, b), work identically to the w = 4 case.

When w = 16, the following procedures return pointers to the logarithm table and the two inverse logarithm tables, respectively:


uint16_t *gf_w16_get_log_table(gf_t *gf);
uint16_t *gf_w16_get_mult_alog_table(gf_t *gf);
uint16_t *gf_w16_get_div_alog_table(gf_t *gf);

The first inverse logarithm table works for multiplication, and the second works for division. They actually point to the same table, but to different places in the table. You may then use the macro GF_W16_INLINE_MULT(log, alog, a, b) to multiply a and b, and the macro GF_W16_INLINE_DIV(log, alog, a, b) to divide a and b. Make sure you use the alog table returned by gf_w16_get_mult_alog_table() for multiplication and the one returned by gf_w16_get_div_alog_table() for division. Here are some timings:



UNIX> gf_time 4 M 0 10240 10240 -
Seed: 0
Multiply:    0.228860 s   Mops: 100.000    436.949 Mega-ops/s
UNIX> gf_inline_time 4 0 10240 10240
Seed: 0
Inline mult: 0.096859 s   Mops: 100.000   1032.424 Mega-ops/s
UNIX> gf_time 8 M 0 10240 10240 -
Seed: 0
Multiply:    0.228931 s   Mops: 100.000    436.812 Mega-ops/s
UNIX> gf_inline_time 8 0 10240 10240
Seed: 0
Inline mult: 0.114300 s   Mops: 100.000    874.889 Mega-ops/s
UNIX> gf_time 16 M 0 10240 10240 -
Seed: 0
Multiply:    0.193626 s   Mops: 50.000     258.229 Mega-ops/s
UNIX> gf_inline_time 16 0 10240 10240
Seed: 0
Inline mult: 0.310229 s   Mops: 100.000    322.342 Mega-ops/s
UNIX>
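As a concrete sketch of the w = 16 macros described above (the round trip below must recover the original operand):

gf_t gf;
uint16_t *log, *alog_m, *alog_d, p, q;

gf_init_easy(&gf, 16);              /* the default for w = 16 is "LOG" */
log    = gf_w16_get_log_table(&gf);
alog_m = gf_w16_get_mult_alog_table(&gf);
alog_d = gf_w16_get_div_alog_table(&gf);
if (log != NULL) {
  p = GF_W16_INLINE_MULT(log, alog_m, 0x640b, 0x1234);
  q = GF_W16_INLINE_DIV(log, alog_d, p, 0x1234);   /* q == 0x640b again */
}
gf_free(&gf, 0);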


7.2 Using different techniques for single and region multiplication

You may want to "mix and match" the techniques. For example, suppose you'd like to use "-m SPLIT 8 8" for multiply() in GF(2^32), because it's fast, and you don't mind consuming all of that space for tables. However, for multiply_region(), you'd like to use "-m SPLIT 32 4 -r ALTMAP," because that's the fastest way to implement multiply_region(). Unfortunately, there is no way to create a gf_t that does this combination. In this case, you should simply create two gf_t's, and use one for multiply() and the other for multiply_region() (a sketch follows the list below). All of the implementations may be used interchangeably, with the following exceptions:
• "COMPOSITE" implements a different Galois Field.

• If you change a field's polynomial, then the resulting Galois Field will be different.
• If you are using "ALTMAP" to multiply regions, then the contents of the resulting regions of memory will depend on the multiplication technique, the size of the region and its alignment. Please see section 7.9 for a detailed explanation of this.
    • If you are using "CAUCHY" to multiply regions, then like "ALTMAP," the contents of the result regions of +memory the multiplication technique and the size of the region. You don't have to worry about alignment.
    • + +

7.3 General w

The library supports Galois Field arithmetic with 2 < w ≤ 32. Values of w which are not whole-number powers of two are handled by the functions in gf_wgen.c. For these values of w, the available multiplication types are "SHIFT," "BYTWO_p," "BYTWO_b," "GROUP," "TABLE" and "LOG." "LOG" is only valid for w < 28, and "TABLE" is only valid for w < 15. The defaults for these values of w are "TABLE" for w < 8, "LOG" for w < 16, and "BYTWO_p" for w < 32.


      7.4 Arguments to "SPLIT"

      + +The "SPLIT" technique is based on the distributive property of multiplication and addition:

a * (b + c) = (a * b) + (a * c).

This property allows us to, for example, split an eight-bit word into two four-bit components and calculate the product by performing two table lookups in 16-element tables on each of the components, and adding the results. There is much more information on "SPLIT" in The Paper. Here we describe the version of "SPLIT" implemented in GF-Complete.

      +"SPLIT" takes tw o arguments, w hich are the number of bits in each component of a, w hich w e call w a, and the +number of bits in each component of b, w hich w e call w b. If the tw o differ, it does not matter w hich is bigger - the +library recognizes this and performs the correct implementation. The legal values of w a and w b fall into five categories: +


1. wa is equal to w and wb is equal to four. In this case, b is broken up into w/4 four-bit words which are used in 16-element lookup tables. The tables are created on demand in multiply_region(), and the SSSE3 instruction mm_shuffle_epi8() is leveraged to perform 16 lookups in parallel. Thus, these are very fast implementations. When w ≥ 16, you should combine this with "ALTMAP" to get the best performance (see The Paper or [PGM13b] for an explanation). If you do this, please see section 7.9 for information about "ALTMAP" and alignment.

If you don't use "ALTMAP," the implementations for w ∈ {16, 32, 64} convert the standard representation into "ALTMAP," perform the multiplication with "ALTMAP", and then convert back to the standard representation. The performance difference using "ALTMAP" can be significant:


gf_time 16 G 0 1048576 100 -m SPLIT 16 4 -             Speed = 8,389 MB/s
gf_time 16 G 0 1048576 100 -m SPLIT 16 4 -r ALTMAP -   Speed = 8,389 MB/s
gf_time 32 G 0 1048576 100 -m SPLIT 32 4 -             Speed = 5,304 MB/s
gf_time 32 G 0 1048576 100 -m SPLIT 32 4 -r ALTMAP -   Speed = 7,146 MB/s
gf_time 64 G 0 1048576 100 -m SPLIT 64 4 -             Speed = 2,595 MB/s
gf_time 64 G 0 1048576 100 -m SPLIT 64 4 -r ALTMAP -   Speed = 3,436 MB/s
2. wa is equal to w and wb is equal to eight. Now, b is broken into bytes, and each of these is used in its own 256-element lookup table. This is typically the best way to perform multiply_region() without SSE. Because this is a region optimization, when you specify these options, you get a default multiply(); see Table 1 for a listing of the defaults. See section 7.2 for using a different multiply() than the defaults.
3. wa is equal to w and wb is equal to 16. This is only valid for w = 32 and w = 64. Now, b is broken into shorts, and each of these is used in its own 64K-element lookup table. This is typically slower than when wb equals 8, and requires more amortization (larger buffer sizes) to be effective.
4. wa and wb are both equal to eight. Now both a and b are broken into bytes, and the products of the various bytes are looked up in multiple 256 × 256 tables. In GF(2^16), there are three of these tables. In GF(2^32), there are seven, and in GF(2^64) there are fifteen. Thus, this implementation can be a space hog. However, for w = 32, this is the fastest way to perform multiply() on some machines. When this option is employed, multiply_region() is implemented in an identical fashion to when wa = w and wb = 8.

5. wa = 32 and wb = 2 (w = 32 only). I was playing with a different way to use mm_shuffle_epi8(). It works, but it's slower than when wb = 4.

7.5 Arguments to "GROUP"

    + +The "GROUP" multiplication option takes tw o arguments, gs and gr. It implements multiplication in the same manner +as "SHIFT," except it uses a table of size 2gs to perform gs shifts at a time, and a table of size 2gr to perform gr +reductions at at time. The program gf_methods only prints the options 4 4 and 4 8 as arguments for "GROUP." +However, other values of gs and gr are legal and sometimes desirable:

1. For w ≤ 32 and w = 64, any values of gs and gr may be used, so long as they are less than or equal to w and so long as the tables fit into memory. There are four exceptions to this, listed below.

2. For w = 4, "GROUP" is not supported.

3. For w = 8, "GROUP" is not supported.

4. For w = 16, "GROUP" is only supported for gs = gr = 4.

5. For w = 128, "GROUP" only supports gs = 4 and gr ∈ {4, 8, 16}.

The way that gs and gr impact performance is as follows. The "SHIFT" implementation works by performing a carry-free multiplication in w steps, and then performing the reduction in w steps. In "GROUP," the carry-free multiplication is reduced to w/gs steps, and the reduction to w/gr steps. Both require tables. The table for the carry-free multiplication must be created at the beginning of each multiply() or multiply_region(), while the table for reduction is created when the gf_t is initialized. For that reason, it makes sense for gr to be bigger than gs.


To give a flavor for the impact of these arguments, Figure 3 shows the performance of varying gs and gr for single multiplication and region multiplication respectively, in GF(2^32) and GF(2^64). As the graphs demonstrate, multiply() performs better with smaller values of gs, while multiply_region() amortizes the creation of the shifting table, and can tolerate larger values of gs. When gs equals gr, there are some optimizations that we hand-encode. These can be seen clearly in the multiply_region() graphs.

Figure 3: The performance of multiply() and multiply_region() using "GROUP," varying the arguments gs and gr. All graphs are heat maps with black equaling zero. The region size is 100 KB.

    7.6  Considerations with "COMPOSITE"

As mentioned above, using "ALTMAP" with "COMPOSITE" allows multiply_region() to recursively call multiply_region(), rather than simply calling multiply() on every word in the region. The difference can be pronounced:

gf_time 32 G 0 10240 10240 -m COMPOSITE 2 - -                                     Speed =   322 MB/s
gf_time 32 G 0 10240 10240 -m COMPOSITE 2 - -r ALTMAP -                           Speed = 3,368 MB/s
gf_time 32 G 0 10240 10240 -m COMPOSITE 2 -m SPLIT 16 4 -r ALTMAP - -r ALTMAP -   Speed = 3,925 MB/s


There is support for performing multiply() inline for the "TABLE" implementations for w ∈ {4, 8} and for the "LOG" implementation for w = 16 (see section 7.1). These are leveraged by multiply() in "COMPOSITE," and by multiply_region() if you are not using "ALTMAP." To demonstrate this, in the table below, you can see that the performance of multiply() with "SPLIT 8 4" is 88 percent as fast as the default for w = 8 (which is "TABLE"). When you use each as a base field for "COMPOSITE" with w = 16, the one with "SPLIT 8 4" is just 37 percent as fast. The difference is the inlining of multiplication in the base field when "TABLE" is employed:



gf_time 8 M 0 1048576 100 -                                  Speed = 501 Mega-ops/s
gf_time 8 M 0 1048576 100 -m SPLIT 8 4 -                     Speed = 439 Mega-ops/s
gf_time 16 M 0 1048576 100 -m COMPOSITE 2 - -                Speed = 207 Mega-ops/s
gf_time 16 M 0 1048576 100 -m COMPOSITE 2 -m SPLIT 8 4 - -   Speed =  77 Mega-ops/s

You can keep making recursive definitions of composite fields if you want. For example, this one's not too slow for region operations (641 MB/s):
gf_time 128 G 0 1048576 100 -m COMPOSITE 2 -m COMPOSITE 2 -m COMPOSITE 2
        -m SPLIT 16 4 -r ALTMAP - -r ALTMAP - -r ALTMAP - -r ALTMAP -


    Please see section 7.8.1 for a discussion of polynomials in composite fields.


7.7 "CARRY_FREE" and the Primitive Polynomial

If your machine supports the PCLMUL instruction, then we leverage it in "CARRY_FREE." This implementation first performs a carry-free multiplication of two w-bit numbers, which yields a 2w-bit number. It does this with one PCLMUL instruction. Reducing the 2w-bit number back to a w-bit number requires some manipulation of the polynomial. As it turns out, if the polynomial has a lot of contiguous zeroes following its leftmost one, the number of reduction steps may be minimized. For example, with w = 32, we employ the polynomial 0x100400007, because that is what other libraries employ. This only has 9 contiguous zeros following the one, which means that the reduction takes four steps. If we instead use 0x1000000c5, which has 24 contiguous zeros, the reduction takes just two steps. You can see the difference in performance:

gf_time 32 M 0 1048576 100 -m CARRY_FREE -           Speed = 48 Mega-ops/s
gf_time 32 M 0 1048576 100 -m CARRY_FREE -p 0xc5 -   Speed = 81 Mega-ops/s


This is relevant for w = 16 and w = 32, where the "standard" polynomials are sub-optimal with respect to "CARRY_FREE." For w = 16, the polynomial 0x1002d has the desired property. It's less important, of course, with w = 16, because "LOG" is so much faster than "CARRY_FREE."


7.8 More on Primitive Polynomials


7.8.1 Irreducible Polynomials that are not Primitive

The library is willing to work with most polynomials, even if they are not primitive or irreducible. For example, the polynomial x^4 + x^3 + x^2 + x + 1 is irreducible, and it therefore generates a valid Galois Field for GF(2^4). However, it is not primitive, because 2^5 = 1 in the resulting field. For that reason, if you use this polynomial, you cannot use the "LOG" method. The other methods will work fine:

UNIX> gf_mult 2 2 4 -p 0xf -
4
UNIX> gf_mult 4 2 4 -p 0xf -
8
UNIX> gf_mult 8 2 4 -p 0xf -
15
UNIX> gf_mult 15 2 4 -p 0xf -
1
UNIX> gf_div 1 15 4 -p 0xf -
2
UNIX> gf_div 1 15 4 -p 0xf -m LOG -
usage: gf_div a b w [method] - does division of a and b in GF(2^w)
Bad Method Specification: Cannot use Log tables because the polynomial is not primitive.
UNIX>

If a polynomial is reducible, then it does not define a Galois Field, but instead a ring. GF-Complete attempts to work here where it can; however, certain parts of the library will not work:

      +
1. Division is a best-effort service. The problem is that quotients are often not unique. If divide() returns a non-zero number, then that number will be a valid quotient, but it may be one of many. If the multiplication technique is "TABLE," then if a quotient exists, one is returned. Otherwise, zero is returned. Here are some examples - the polynomial x^4 + 1 is reducible, and therefore produces a ring. Below, we see that with this polynomial, 1*6 = 6 and 14*6 = 6. Therefore, 6/6 has two valid quotients: 1 and 14. GF-Complete returns 14 as the quotient:
UNIX> gf_mult 1 6 4 -p 0x1 -
6
UNIX> gf_mult 14 6 4 -p 0x1 -
6
UNIX> gf_div 6 6 4 -p 0x1 -
14
UNIX>
2. When "EUCLID" is employed for division, it uses the extended Euclidean algorithm for GCD to find a number's inverse, and then it multiplies by the inverse. The problem is that not all numbers in a ring have inverses. For example, in the above ring, there is no number a such that 6a = 1, so 6 has no inverse. This means that even though 6/6 has quotients in this ring, "EUCLID" will fail on it because it is unable to find the inverse of 6. It will return 0:
UNIX> gf_div 6 6 4 -p 0x1 -m TABLE -d EUCLID -
0
UNIX>

3. Inverses only work if a number has an inverse. Inverses may not be unique.

4. "LOG" will not work. In cases where the default would be "LOG," "SHIFT" is used instead.

Due to these problems with division, gf_unit may fail on a reducible polynomial. If you are determined to use such a polynomial, don't let this error discourage you.


    7.8.2 Default Polynomials for Composite Fields

GF-Complete will successfully select a default polynomial in the following composite fields:
• w = 8 and the default polynomial (0x13) is employed for GF(2^4)

• w = 16 and the default polynomial (0x11d) is employed for GF(2^8)

• w = 32 and the default polynomial (0x1100b) is employed for GF(2^16)

• w = 32 and 0x1002d is employed for GF(2^16)

• w = 32 and the base field for GF(2^16) is a composite field that uses a default polynomial

• w = 64 and the default polynomial (0x100400007) is employed for GF(2^32)

• w = 64 and 0x1000000c5 is employed for GF(2^32)

• w = 64 and the base field for GF(2^32) is a composite field that uses a default polynomial

• w = 128 and the default polynomial (0x1b) is employed for GF(2^64)

• w = 128 and the base field for GF(2^64) is a composite field that uses a default polynomial

    7.8.3 The Program gf_poly for Verifying Irreducibility of Polynomials

The program gf_poly uses the Ben-Or algorithm [GP97] to determine whether a polynomial with coefficients in GF(2^w) is reducible. Its syntax is:

gf_poly w method power:coef power:coef ...

You can use it to test for irreducible polynomials with binary coefficients by specifying w = 1. For example, from the discussion above, we know that x^4 + x + 1 and x^4 + x^3 + x^2 + x + 1 are both irreducible, but x^4 + 1 is reducible. gf_poly confirms:


UNIX> gf_poly 1 - 4:1 1:1 0:1
Poly: x^4 + x + 1
Irreducible.
UNIX> gf_poly 1 - 4:1 3:1 2:1 1:1 0:1
Poly: x^4 + x^3 + x^2 + x + 1
Irreducible.
UNIX> gf_poly 1 - 4:1 0:1
Poly: x^4 + 1
Reducible.
UNIX>

For composite fields GF((2^l)^2), we are looking for a value s such that x^2 + sx + 1 is irreducible. That value depends on the base field. For example, for the default field GF(2^32), a value of s = 2 makes the polynomial irreducible. However, if the polynomial 0xc5 is used (so that PCLMUL is fast - see section 7.7), then s = 2 yields a reducible polynomial, but s = 3 yields an irreducible one. You can use gf_poly to help verify these things, and to help define s if you need to stray from the defaults:


UNIX> gf_poly 32 - 2:1 1:2 0:1
Poly: x^2 + (0x2)x + 1
Irreducible.
UNIX> gf_poly 32 -p 0xc5 - 2:1 1:2 0:1
Poly: x^2 + (0x2)x + 1
Reducible.
UNIX> gf_poly 32 -p 0xc5 - 2:1 1:3 0:1
Poly: x^2 + (0x3)x + 1
Irreducible.
UNIX>

gf_unit does random sampling to test for problems. In particular, it chooses a random a and a random b, multiplies them, and then tests the result by dividing it by a and b. When w is large, this sampling does not come close to providing complete coverage to check for problems. In particular, if the polynomial is reducible, there is a good chance that gf_unit won't discover any problems. For example, the following gf_unit call does not flag any problems, even though the polynomial is reducible:

UNIX> gf_unit 64 A 0 -m COMPOSITE 2 -p 0xc5 - -p 2 -
UNIX>

How can we demonstrate that this particular field has a problem? Well, when the polynomial is 0xc5, we can factor x^2 + 2x + 1 as (x + 0x7f6f95f9)(x + 0x7f6f95fb). Thus, in the composite field, when we multiply 0x17f6f95f9 by 0x17f6f95fb, we get zero. That's the problem:

UNIX> gf_mult 7f6f95f9 7f6f95fb 32h -p 0xc5 -
1
UNIX> gf_mult 17f6f95f9 17f6f95fb 64h -m COMPOSITE 2 -p 0xc5 - -p 2 -
0
UNIX>

    7.9 "ALTMAP" considerations and extract_word()

There are two times when you may employ alternate memory mappings:

1. When using "SPLIT" and wb = 4.

2. When using "COMPOSITE."

Additionally, by default, the "CAUCHY" region option also employs an alternate memory mapping.

When you use alternate memory mappings, the exact mapping of words in GF(2^w) to memory depends on the situation, the size of the region, and the alignment of the pointers. To help you figure things out, we have included the procedures extract_word.wxx() as part of the gf_t struct. This procedure takes four parameters:

• A pointer to the gf_t.

• The beginning of the memory region.

• The number of bytes in the memory region.

• The desired word number: n.

It then returns the n-th word in memory. When the standard mapping is employed, this simply returns the n-th contiguous word in memory. With alternate mappings, each word may be split over several memory regions, so extract_word() grabs the relevant parts of each memory region to extract the word. Below, we go over each of the above situations in detail. Please refer to Figure 2 in Section 5 for reference.
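As a sketch, here is how you might dump every word of a buffer with extract_word.w32() for a w = 16 gf_t built with "SPLIT 16 4" and "ALTMAP" (the values returned depend on the buffer's size and alignment, as explained below):

gf_t gf;
uint16_t buf[30];
int i;

gf_init_hard(&gf, 16, GF_MULT_SPLIT_TABLE, GF_REGION_ALTMAP,
             GF_DIVIDE_DEFAULT, 0, 16, 4, NULL, NULL);
/* ... fill buf, perhaps with gf.multiply_region.w32() ... */
for (i = 0; i < 30; i++) {
  printf("Word %2d: 0x%04x\n", i,
         (unsigned int) gf.extract_word.w32(&gf, buf, sizeof(buf), i));
}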


    7.9.1 Alternate mappings with "SPLIT"

The alternate mapping with "SPLIT" is employed so that we can best leverage mm_shuffle_epi8(). Please read [PGM13b] for details as to why. Consider an example when w = 16. In the main region of memory (the middle region in Figure 2), multiplication proceeds in units of 32 bytes, which are each broken into two 16-byte regions. The first region holds the high bytes of each word in GF(2^16), and the second region holds the low bytes. Let's look at a very detailed example, from gf_example_5.c. This program makes the following call, where gf has been initialized for w = 16, using "SPLIT" and "ALTMAP:"

gf.multiply_region.w32(&gf, a, b, 0x1234, 30*2, 0);


In other words, it is multiplying a region a of 60 bytes (30 words) by the constant 0x1234 in GF(2^16), and placing the result into b. The pointers a and b have been set up so that they are not multiples of 16. The first line of output prints a and b:


a: 0x10010008c     b: 0x10010015c

As described in Section 5, the regions of memory are split into three parts:
1. 4 bytes starting at 0x10010008c / 0x10010015c.

2. 32 bytes starting at 0x100100090 / 0x100100160.

3. 24 bytes starting at 0x1001000b0 / 0x100100180.

In the first and third parts, the bytes are laid out according to the standard mapping. However, the second part is split into two 16-byte regions - one that holds the high bytes of each word and one that holds the low bytes. To help illustrate, the remainder of the output prints the 30 words of a and b as they appear in memory, and then the 30 return values of extract_word.w32():


        0    1    2    3    4    5    6    7    8    9
a:   640b 07e5 2fba ce5d f1f9 3ab8 c518 1d97 45a7 0160
b:   1ba3 644e 84f8 be3c 4318 4905 b2fb 46eb ef01 a503

       10   11   12   13   14   15   16   17   18   19
a:   3759 b107 9660 3fde b3ea 8a53 75ff 46dc c504 72c2
b:   da27 e166 a0d2 b3a2 1699 3a3e 47fb 39af 1314 8e76

       20   21   22   23   24   25   26   27   28   29
a:   b469 1b97 e91d 1dbc 131e 47e0 c11a 7f07 76e0 fe86
b:   937c a5db 01b7 7f5f 8974 05e1 cff3 a09c de3c 4ac0

Word  0: 0x640b * 0x1234 = 0x1ba3     Word 15: 0x4575 * 0x1234 = 0xef47
Word  1: 0x07e5 * 0x1234 = 0x644e     Word 16: 0x60dc * 0x1234 = 0x03af
Word  2: 0xba59 * 0x1234 = 0xf827     Word 17: 0x0146 * 0x1234 = 0xa539
Word  3: 0x2f37 * 0x1234 = 0x84da     Word 18: 0xc504 * 0x1234 = 0x1314
Word  4: 0x5d07 * 0x1234 = 0x3c66     Word 19: 0x72c2 * 0x1234 = 0x8e76
Word  5: 0xceb1 * 0x1234 = 0xbee1     Word 20: 0xb469 * 0x1234 = 0x937c
Word  6: 0xf960 * 0x1234 = 0x18d2     Word 21: 0x1b97 * 0x1234 = 0xa5db
Word  7: 0xf196 * 0x1234 = 0x43a0     Word 22: 0xe91d * 0x1234 = 0x01b7
Word  8: 0xb8de * 0x1234 = 0x05a2     Word 23: 0x1dbc * 0x1234 = 0x7f5f
Word  9: 0x3a3f * 0x1234 = 0x49b3     Word 24: 0x131e * 0x1234 = 0x8974
Word 10: 0x18ea * 0x1234 = 0xfb99     Word 25: 0x47e0 * 0x1234 = 0x05e1
Word 11: 0xc5b3 * 0x1234 = 0xb216     Word 26: 0xc11a * 0x1234 = 0xcff3
Word 12: 0x9753 * 0x1234 = 0xeb3e     Word 27: 0x7f07 * 0x1234 = 0xa09c
Word 13: 0x1d8a * 0x1234 = 0x463a     Word 28: 0x76e0 * 0x1234 = 0xde3c
Word 14: 0xa7ff * 0x1234 = 0x01fb     Word 29: 0xfe86 * 0x1234 = 0x4ac0
In the first region are words 0 and 1, which are identical to how they appear in memory: 0x640b and 0x07e5. In the second region are words 2 through 17. These words are split among the two sixteen-byte regions. For example, word 2, which extract_word() reports is 0xba59, is constructed from the low byte in word 2 (0xba) and the low byte in word 10 (0x59). Since 0xba59 * 0x1234 = 0xf827, we see that the low byte in word 2 of b is 0xf8, and the low byte in word 10 is 0x27.

    When we reach word 22, we are in the third region of memory, and words are once again identical to how they +appear in memory.


While this is confusing, we stress that so long as you call multiply_region() with pointers of the same alignment and regions of the same size, your results with ALTMAP will be consistent. If you call it with pointers of different alignments, or with different region sizes, then the results will not be consistent. To reiterate, if you don't use ALTMAP, you don't have to worry about any of this - words will always be laid out contiguously in memory.

When w = 32, the middle region is a multiple of 64, and each word in the middle region is broken into bytes, each of which is stored in a different 16-byte region. When w = 64, the middle region is a multiple of 128, and each word is stored in eight 16-byte regions. And finally, when w = 128, the middle region is a multiple of 256, and each word is stored in 16 16-byte regions.
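
The head/middle/tail arithmetic itself is simple. Below is a minimal sketch of it, assuming the chunk sizes just described; altmap_split() is a hypothetical helper, not part of the library.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper: compute the three-part split that an ALTMAP region
   multiply of `bytes` bytes starting at `src` uses, where `chunk` is the
   middle-region granularity (32 for w=16, 64 for w=32, 128 for w=64,
   256 for w=128). */
void altmap_split(uintptr_t src, size_t bytes, size_t chunk)
{
  uintptr_t start = (src + 15) & ~(uintptr_t)15;  /* middle starts 16-byte aligned */
  size_t head = (size_t)(start - src);
  size_t middle = ((bytes - head) / chunk) * chunk;
  size_t tail = bytes - head - middle;
  printf("%zu + %zu + %zu bytes\n", head, middle, tail);
}

/* altmap_split(0x10010008cULL, 60, 32) prints "4 + 32 + 24 bytes",
   matching the gf_example_5 run above (64-bit pointers assumed). */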



    7.9.2   Alternate mappings with "COMPOSITE"

    + +With "COMPOSITE," the alternate mapping divides the middle region in half. The lower half of each word is stored +in the first half of the middle region, and the higher half is stored in the second half. To illustrate, gf_example_6 +performs the same example as gf_example_5, except it is using "COMPOSITE" in GF((216)2), and it is multiplying +a region of 120 bytes rather than 60. As before, the pointers are not aligned on 16-bit quantities, so the region is broken +into three regions of 4 bytes, 96 bytes, and 20 bytes. In the first and third region, each consecutive four byte word is a +word in GF(232). For example, word 0 is 0x562c640b, and word 25 is 0x46bc47e0. In the middle region, the low two +bytes of each word come from the first half, and the high two bytes come from the second half. For example, word 1 +as reported by extract_word() is composed of the lower two bytes of word 1 of memory (0x07e5), and the lower two +bytes of word 13 (0x3fde). The product of 0x3fde07e5 and 0x12345678 is 0x211c880d, which is stored in the lower +two bytes of words 1 and 13 of b.

a: 0x10010011c    b: 0x1001001ec

          0        1        2        3        4        5        6        7        8        9
a: 562c640b 959407e5 56592fba cbadce5d 1d1cf1f9 35d73ab8 6493c518 b37c1d97 8e4545a7 c0d80160
b: f589f36c f146880d 74f7b349 7ea7c5c6 34827c1a 93cc3746 bfd9288b 763941d1 bcd33a5d da695e64

         10       11       12       13       14       15       16       17       18       19
a: 965b3759 cb3eb107 1b129660 95a33fde 95a7b3ea d16c8a53 153375ff f74646dc 35aac504 98f972c2
b: fd70f125 3274fa8f d9dd34ee c01a211c d4402403 8b55c08b da45f0ad 90992e18 b65e0902 d91069b5

         20       21       22       23       24       25       26       27       28       29
a: 5509b469 7f8a1b97 3472e91d 9ee71dbc de4e131e 46bc47e0 5bc9c11a 931d7f07 d40676e0 c85cfe86
b: fc92b8f5 edd59668 b4bc0d90 a679e4ce 1a98f7d0 6038765f b2ff333f e7937e49 fa5a5867 79c00ea2

Word  0: 0x562c640b * 0x12345678 = 0xf589f36c    Word 15: 0xb46945a7 * 0x12345678 = 0xb8f53a5d
Word  1: 0x3fde07e5 * 0x12345678 = 0x211c880d    Word 16: 0x55098e45 * 0x12345678 = 0xfc92bcd3
Word  2: 0x95a39594 * 0x12345678 = 0xc01af146    Word 17: 0x1b970160 * 0x12345678 = 0x96685e64
Word  3: 0xb3ea2fba * 0x12345678 = 0x2403b349    Word 18: 0x7f8ac0d8 * 0x12345678 = 0xedd5da69
Word  4: 0x95a75659 * 0x12345678 = 0xd44074f7    Word 19: 0xe91d3759 * 0x12345678 = 0x0d90f125
Word  5: 0x8a53ce5d * 0x12345678 = 0xc08bc5c6    Word 20: 0x3472965b * 0x12345678 = 0xb4bcfd70
Word  6: 0xd16ccbad * 0x12345678 = 0x8b557ea7    Word 21: 0x1dbcb107 * 0x12345678 = 0xe4cefa8f
Word  7: 0x75fff1f9 * 0x12345678 = 0xf0ad7c1a    Word 22: 0x9ee7cb3e * 0x12345678 = 0xa6793274
Word  8: 0x15331d1c * 0x12345678 = 0xda453482    Word 23: 0x131e9660 * 0x12345678 = 0xf7d034ee
Word  9: 0x46dc3ab8 * 0x12345678 = 0x2e183746    Word 24: 0xde4e1b12 * 0x12345678 = 0x1a98d9dd
Word 10: 0xf74635d7 * 0x12345678 = 0x909993cc    Word 25: 0x46bc47e0 * 0x12345678 = 0x6038765f
Word 11: 0xc504c518 * 0x12345678 = 0x0902288b    Word 26: 0x5bc9c11a * 0x12345678 = 0xb2ff333f
Word 12: 0x35aa6493 * 0x12345678 = 0xb65ebfd9    Word 27: 0x931d7f07 * 0x12345678 = 0xe7937e49
Word 13: 0x72c21d97 * 0x12345678 = 0x69b541d1    Word 28: 0xd40676e0 * 0x12345678 = 0xfa5a5867
Word 14: 0x98f9b37c * 0x12345678 = 0xd9107639    Word 29: 0xc85cfe86 * 0x12345678 = 0x79c00ea2


As with "SPLIT," using multiply_region() with "COMPOSITE" and "ALTMAP" will be consistent only if the alignment of pointers and the region sizes are identical.
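
For completeness, here is the analogous sketch for the composite mapping above. composite_altmap_extract_w32() is a hypothetical helper that assumes the 4 + 96 + 20 split of gf_example_6, with the low halves stored in the first 48 bytes of the middle region.

#include <stdint.h>

/* Hypothetical helper: recover word k (1 <= k <= 24) of the 96-byte middle
   region in the w=32 COMPOSITE ALTMAP example above. */
uint32_t composite_altmap_extract_w32(const uint8_t *r, int k)
{
  const uint8_t *lo = r + 4;        /* first half of the middle region  */
  const uint8_t *hi = r + 4 + 48;   /* second half of the middle region */
  int j = 2 * (k - 1);
  uint32_t low  = (uint32_t)lo[j] | ((uint32_t)lo[j + 1] << 8);
  uint32_t high = (uint32_t)hi[j] | ((uint32_t)hi[j + 1] << 8);
  return (high << 16) | low;
}

/* composite_altmap_extract_w32(a, 1) yields 0x3fde07e5, as reported above. */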


7.9.3   The mapping of "CAUCHY"

    + +With "CAUCHY," the region is partitioned into w subregions, and each word in the region is broken into w bits, +each of which is stored in a different subregion. To illustrate, gf_example_7 multiplies a region of three bytes by 5 +in GF(23) using "CAUCHY:"

UNIX> gf_example_7
a: 0x100100190    b: 0x1001001a0

a: 0x0b 0xe5 0xba
b: 0xee 0xba 0x0b

a bits: 00001011 11100101 10111010
b bits: 11101110 10111010 00001011

Word 0: 3 * 5 = 4
Word 1: 5 * 5 = 7
Word 2: 2 * 5 = 1
Word 3: 5 * 5 = 7
Word 4: 4 * 5 = 2
Word 5: 6 * 5 = 3
Word 6: 2 * 5 = 1
Word 7: 6 * 5 = 3
UNIX>

The program prints the three bytes of a and b in hexadecimal and in binary. To see how words are broken up, consider word 0, which is the lowest bit of each of the three bytes of a (and b). These are the bits 1, 1 and 0 in a, and 0, 0 and 1 in b. Accordingly, the word is 3 in a, and 3 * 5 = 4 in b. Similarly, word 7 is the high bit in each byte: 0, 1, 1 (6) in a, and 1, 1, 0 (3) in b.


    With "CAUCHY," multiply_region()may be implemented exclusively with XOR operations. Please see [BKK+95] +for more information on the motivation behind "CAUCHY."


    8   Thread Safety

Once you initialize a gf_t, you may use it wantonly in multiple threads for all operations except the ones below. With the implementations listed below, the scratch space in the gf_t is used for temporary tables, and therefore you cannot call region_multiply(), and in some cases multiply(), from multiple threads, because they will overwrite each other's tables. In these cases, if you want to call the procedures from multiple threads, you should allocate a separate gf_t for each thread (a minimal sketch follows the list):
• All "GROUP" implementations are not thread safe for either region_multiply() or multiply(). Other than "GROUP," multiply() is always thread safe.

• For w = 4, region_multiply.w32() is unsafe in "-m TABLE -r QUAD -r LAZY."

• For w = 8, region_multiply.w32() is unsafe in "-m TABLE -r DOUBLE -r LAZY."

• For w = 16, region_multiply.w32() is unsafe in "-m TABLE."

• For w ∈ {32, 64, 128}, all "SPLIT" implementations are unsafe for region_multiply(). This means that if the default uses "SPLIT" (see Table 1 for when that occurs), then region_multiply() is not thread safe.

• The "COMPOSITE" operations are only safe if the implementations of the underlying fields are safe.

9   Listing of Procedures

The following is an alphabetical listing of the procedures, data types and global variables for users to employ in GF-Complete. (A short usage sketch follows the list.)
• GF_W16_INLINE_DIV() in gf_complete.h: This is a macro for inline division when w = 16. See section 7.1.

• GF_W16_INLINE_MULT() in gf_complete.h: This is a macro for inline multiplication when w = 16. See section 7.1.

• GF_W4_INLINE_MULTDIV() in gf_complete.h: This is a macro for inline multiplication/division when w = 4. See section 7.1.

• GF_W8_INLINE_MULTDIV() in gf_complete.h: This is a macro for inline multiplication/division when w = 8. See section 7.1.

• MOA_Fill_Random_Region() in gf_rand.h: Fills a region with random numbers.

• MOA_Random_128() in gf_rand.h: Creates a random 128-bit number.

• MOA_Random_32() in gf_rand.h: Creates a random 32-bit number.

• MOA_Random_64() in gf_rand.h: Creates a random 64-bit number.

• MOA_Random_W() in gf_rand.h: Creates a random w-bit number, where w ≤ 32.

• MOA_Seed() in gf_rand.h: Sets the seed for the random number generator.

• gf_errno in gf_complete.h: This is to help figure out why an initialization call failed. See section 6.1.

• gf_create_gf_from_argv() in gf_method.h: Creates a gf_t using C-style argc/argv. See section 6.1.1.

• gf_division_type_t in gf_complete.h: The different ways to specify division when using gf_init_hard(). See section 6.4.

• gf_error() in gf_complete.h: This prints out why an initialization call failed. See section 6.1.

• gf_extract in gf_complete.h: This is the data type of extract_word() in a gf_t. See section 7.9 for an example of how to use extract_word().
    + + + + + +
    + + +9     LISTING OF PROCEDURES 39


    +
      +
• gf_free() in gf_complete.h: If gf_init_easy(), gf_init_hard() or gf_create_gf_from_argv() allocated memory, this frees it. See section 6.4.

• gf_func_a_b in gf_complete.h: This is the data type of multiply() and divide() in a gf_t. See section 4.2 for examples of how to use multiply() and divide().

• gf_func_a in gf_complete.h: This is the data type of inverse() in a gf_t.

• gf_general_add() in gf_general.h: This adds two gf_general_t's.

• gf_general_divide() in gf_general.h: This divides two gf_general_t's.

• gf_general_do_region_check() in gf_general.h: This checks a region multiply of gf_general_t's.

• gf_general_do_region_multiply() in gf_general.h: This does a region multiply of gf_general_t's.

• gf_general_do_single_timing_test() in gf_general.h: Used in gf_time.c.

• gf_general_inverse() in gf_general.h: This takes the inverse of a gf_general_t.

• gf_general_is_one() in gf_general.h: This tests whether a gf_general_t is one.

• gf_general_is_two() in gf_general.h: This tests whether a gf_general_t is two.

• gf_general_is_zero() in gf_general.h: This tests whether a gf_general_t is zero.

• gf_general_multiply() in gf_general.h: This multiplies two gf_general_t's. See the implementation of gf_mult.c for an example.

• gf_general_s_to_val() in gf_general.h: This converts a string to a gf_general_t. See the implementation of gf_mult.c for an example.

• gf_general_set_one() in gf_general.h: This sets a gf_general_t to one.

• gf_general_set_random() in gf_general.h: This sets a gf_general_t to a random number.

• gf_general_set_two() in gf_general.h: This sets a gf_general_t to two.

• gf_general_set_up_single_timing_test() in gf_general.h: Used in gf_time.c.

• gf_general_set_zero() in gf_general.h: This sets a gf_general_t to zero.

• gf_general_t in gf_general.h: This is a general data type for all values of w. See the implementation of gf_mult.c for examples of using these.

• gf_general_val_to_s() in gf_general.h: This converts a gf_general_t to a string. See the implementation of gf_mult.c for an example.

• gf_init_easy() in gf_complete.h: This is how you initialize a default gf_t. See sections 4.2 through 4.5 for examples of calling gf_init_easy().

    • +
    + + + + + + + +
    + + +9     LISTING OF PROCEDURES 40


    + +
• gf_init_hard() in gf_complete.h: This allows you to initialize a gf_t without using the defaults. See section 6.4. We recommend calling gf_create_gf_from_argv() when you can, instead of gf_init_hard().

• gf_mult_type_t in gf_complete.h: The different ways to specify multiplication when using gf_init_hard(). See section 6.4.

• gf_region_type_t in gf_complete.h: The different ways to specify region multiplication when using gf_init_hard(). See section 6.4.

• gf_region in gf_complete.h: This is the data type of multiply_region() in a gf_t. See section 4.3 for an example of how to use multiply_region().

• gf_scratch_size() in gf_complete.h: This is how you calculate how much memory a gf_t needs. See section 6.4.

• gf_size() in gf_complete.h: Returns the memory consumption of a gf_t. See section 6.5.

• gf_val_128_t in gf_complete.h: This is how you store a value where w ≤ 128. It is a pointer to two 64-bit unsigned integers. See section 4.4.

• gf_val_32_t in gf_complete.h: This is how you store a value where w ≤ 32. It is equivalent to a 32-bit unsigned integer. See section 4.2.

• gf_val_64_t in gf_complete.h: This is how you store a value where w ≤ 64. It is equivalent to a 64-bit unsigned integer. See section 4.5.

• gf_w16_get_div_alog_table() in gf_complete.h: This returns a pointer to an inverse logarithm table that can be used for inlining division when w = 16. See section 7.1.

• gf_w16_get_log_table() in gf_complete.h: This returns a pointer to a logarithm table that can be used for inlining when w = 16. See section 7.1.

• gf_w16_get_mult_alog_table() in gf_complete.h: This returns a pointer to an inverse logarithm table that can be used for inlining multiplication when w = 16. See section 7.1.

• gf_w4_get_div_table() in gf_complete.h: This returns a pointer to a division table that can be used for inlining when w = 4. See section 7.1.

• gf_w4_get_mult_table() in gf_complete.h: This returns a pointer to a multiplication table that can be used for inlining when w = 4. See section 7.1.

• gf_w8_get_div_table() in gf_complete.h: This returns a pointer to a division table that can be used for inlining when w = 8. See section 7.1.

• gf_w8_get_mult_table() in gf_complete.h: This returns a pointer to a multiplication table that can be used for inlining when w = 8. See section 7.1.
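
Here is the short sketch mentioned at the top of this list, tying a few of the listed procedures together. It uses only calls listed above; the operand values are arbitrary.

#include <stdio.h>
#include "gf_complete.h"

int main()
{
  gf_t gf;

  if (!gf_init_easy(&gf, 8)) { gf_error(); return 1; }
  gf_val_32_t p = gf.multiply.w32(&gf, 0x57, 0x83);
  printf("0x57 * 0x83 = 0x%x\n", p);
  printf("This gf_t consumes %d bytes.\n", gf_size(&gf));
  gf_free(&gf, 0);
  return 0;
}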

10   Troubleshooting

• SSE support. Leveraging SSE instructions requires processor support as well as compiler support. For example, the Mac OS 10.8.4 (and possibly earlier versions) default compile environment fails to properly compile PCLMUL instructions. This issue can be fixed by installing an alternative compiler; see Section 3 for details.

• Initialization segfaults. You have to have already allocated your gf_t before you pass a pointer to it in gf_init_easy(), gf_create_gf_from_argv(), or gf_init_hard().

• GF-Complete is slower than it should be. Perhaps your machine has SSE, but you haven't specified the SSE compilation flags. See section 3 for how to compile using the proper flags.

• Bad alignment. If you get alignment errors, see Section 5.

• Mutually exclusive region types. Some combinations of region types are invalid. All valid and implemented combinations are printed by gf_methods.c.

• Incompatible division types. Some choices of multiplication type constrain the choice of division type. For example, "COMPOSITE" methods only allow the default division type, which divides by finding inverses (i.e., neither "EUCLID" nor "MATRIX" is allowed). For each multiplication method printed by gf_methods.c, the corresponding valid division types are also printed.

• Arbitrary "GROUP" arguments. The legal arguments to "GROUP" are specified in section 7.5.

• Arbitrary "SPLIT" arguments. The legal arguments to "SPLIT" are specified in section 7.4.

• Threading problems. For threading questions, see Section 8.

• No default polynomial. If you change the polynomial in a base field using "COMPOSITE," then unless it is a special case for which GF-Complete finds a default polynomial, you'll need to specify the polynomial of the composite field too. See 7.8.2 for the fields where GF-Complete will support default polynomials.

• Encoding/decoding with different fields. Certain fields are not compatible. Please see section 7.2 for an explanation.

• "ALTMAP" is confusing. We agree. Please see section 7.9 for more explanation.

• I used "ALTMAP" and it doesn't appear to be functioning correctly. With 7.9, the size of the region and its alignment both matter in terms of how "ALTMAP" performs multiply_region(). Please see section 7.9 for a detailed explanation.

• Where are the erasure codes? This library only implements Galois Field arithmetic, which is an underlying component for erasure coding. Jerasure will eventually be ported to this library, so that you can have fast erasure coding.

11   Timings

We don't want to get too detailed with timing, because it is quite machine specific. However, here are the timings on an Intel Core i7-3770 CPU running at 3.40 GHz, with 4 × 256 KB L2 caches and an 8 MB L3 cache. All timings are obtained with gf_time or gf_inline_time, in user mode with the machine dedicated solely to running these jobs.


Figure 4: Speed of doing single multiplications for w ∈ {4, 8, 16}.

    11.1   Multiply()

The performance of multiply() is displayed in Figure 4 for w ∈ {4, 8, 16} and Figure 5 for w ∈ {32, 64, 128}. These numbers were obtained by calling gf_time with the size and iterations both set to 10240. We plot the speed in mega-ops per second.

    As would be anticipated, the inlined operations (see section 7.1) outperform the others. Additionally, in all +cases with the exception of w = 32, the defaults are the fastest performing implementations. With w = 32, +"CARRY_FREE" is the fastest with an alternate polynomial (see section 7.7). Because we require the defaults to +use a "standard" polynomial, we cannot use this implementation as the default.
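
For readers who want to reproduce a rough number without the tools, the sketch below shows the shape of such a measurement, assuming only the documented multiply.w32() interface. It is not gf_time itself, and the iteration count is illustrative.

#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include "gf_complete.h"

int main()
{
  gf_t gf;
  long i, n = 10000000;                 /* ten million products */
  uint32_t x = 1;

  gf_init_easy(&gf, 16);
  clock_t t0 = clock();
  for (i = 0; i < n; i++) x = gf.multiply.w32(&gf, x, 0x1234);
  double s = (double)(clock() - t0) / CLOCKS_PER_SEC;
  printf("%.1f mega-ops/sec (x = %u)\n", n / s / 1e6, x);  /* printing x keeps the loop live */
  gf_free(&gf, 0);
  return 0;
}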


    11.2   Divide()

    + +For the "TABLE" and "LOG" implementations, the performance of division is the same as multiplication. This means +that for w ∈ {4, 8, 16}, it is very fast indeed. For the other implementations, division is implemented with Euclid's +method, and is several factors slower than multiplication. +In Figure 6, we plot the speed of a few implementations of the larger word sizes. Compared to the "TABLE" and +"LOG" implemenations for the smaller word sizes, where the speeds are in the hundreds of mega-ops per second, +these are very slow. Of note is the "COMPOSITE" implementation for w = 32, which is much faster than the others + + + + + + + + +
    +10     TROUBLESHOOTING 43


    + +
    + +
    Figure 5: Speed of doing single multiplications for w ∈ {32, 64, 128}.

    + +because it uses a special application of Euclid's method, which relies on division in GF(216), which is very fast.
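
The division interface has the same shape as multiplication, so a quick consistency check is easy to write. This is a minimal sketch using only the documented divide.w32() and multiply.w32() entries; check_divide() is a hypothetical helper and b must be nonzero.

#include <assert.h>
#include <stdint.h>
#include "gf_complete.h"

/* Check that division really inverts multiplication. */
void check_divide(gf_t *gf, uint32_t a, uint32_t b)
{
  uint32_t q = gf->divide.w32(gf, a, b);   /* log table or Euclid, per method */
  assert(gf->multiply.w32(gf, q, b) == a); /* q * b must give a back */
}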


    11.3   Multiply_Region()

Tables 3 through 8 show the performance of the various region operations. It should be noted that for GF(2^16) through GF(2^128), the default is not the fastest implementation of multiply_region(). The reasons for this are outlined in section 6.

For these tables, we performed 1 GB worth of multiply_region() calls for all regions of size 2^i bytes for 10 ≤ i ≤ 30. In the tables, we report the fastest speed obtained.
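
A sketch of one such measurement pass is below; it is illustrative rather than the exact gf_time setup, and region_pass() is a hypothetical helper. The multiplier 0x1234 and the fill byte are arbitrary.

#include <stdlib.h>
#include <string.h>
#include "gf_complete.h"

/* One pass: ~1 GB of multiply_region() work at a given region size.
   malloc() returns sufficiently aligned buffers on typical 64-bit platforms. */
void region_pass(gf_t *gf, size_t bytes)
{
  void *src = malloc(bytes), *dest = malloc(bytes);
  long i, iters = (1L << 30) / (long)bytes;
  memset(src, 0x5a, bytes);                  /* contents don't matter for timing */
  for (i = 0; i < iters; i++)
    gf->multiply_region.w32(gf, src, dest, 0x1234, (int)bytes, 0);
  free(src); free(dest);
}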


    We note that the performance of "CAUCHY" can be improved with techniques from [LSXP13] and [PSR12].

    Figure 6: Speed of doing single divisions for w ∈ {32, 64, 128}.

Method                        Speed (MB/s)
-m TABLE (Default) -             11879.909
-m TABLE -r CAUCHY -              9079.712
-m BYTWO_b -                      5242.400
-m BYTWO_p -                      4078.431
-m BYTWO_b -r NOSSE -             3799.699
-m TABLE -r QUAD -                3014.315
-m TABLE -r DOUBLE -              2253.627
-m TABLE -r NOSSE -               2021.237
-m BYTWO_p -r NOSSE -             1061.497
-m LOG -                           503.310
-m SHIFT -                         157.749
-m CARRY_FREE -                     86.202

Table 3: Speed of various calls to multiply_region() for w = 4.

    References

[Anv09] H. P. Anvin. The mathematics of RAID-6. http://kernel.org/pub/linux/kernel/people/hpa/raid6.pdf, 2009.

[BKK+95] J. Blomer, M. Kalfane, M. Karpinski, R. Karp, M. Luby, and D. Zuckerman. An XOR-based erasure-resilient coding scheme. Technical Report TR-95-048, International Computer Science Institute, August 1995.

[GMS08] K. Greenan, E. Miller, and T. J. Schwartz. Optimizing Galois Field arithmetic for diverse processor architectures and applications. In MASCOTS 2008: 16th IEEE Symposium on Modeling, Analysis and Simulation of Computer and Telecommunication Systems, Baltimore, MD, September 2008.

[GP97] S. Gao and D. Panario. Tests and constructions of irreducible polynomials over finite fields. In Foundations of Computational Mathematics, pages 346-361. Springer Verlag, 1997.
Method                        Speed (MB/s)
-m SPLIT 8 4 (Default) -         13279.146
-m COMPOSITE 2 - -r ALTMAP -      5516.588
-m TABLE -r CAUCHY -              4968.721
-m BYTWO_b -                      2656.463
-m TABLE -r DOUBLE -              2561.225
-m TABLE -                        1408.577
-m BYTWO_b -r NOSSE -             1382.409
-m BYTWO_p -                      1376.661
-m LOG_ZERO_EXT -                 1175.739
-m LOG_ZERO -                     1174.694
-m LOG -                           997.838
-m SPLIT 8 4 -r NOSSE -            885.897
-m BYTWO_p -r NOSSE -              589.520
-m COMPOSITE 2 - -                 327.039
-m SHIFT -                         106.115
-m CARRY_FREE -                    104.299

Table 4: Speed of various calls to multiply_region() for w = 8.


[LBOX12] J. Luo, K. D. Bowers, A. Oprea, and L. Xu. Efficient software implementations of large finite fields GF(2^n) for secure storage applications. ACM Transactions on Storage, 8(2), February 2012.

[LD00] J. Lopez and R. Dahab. High-speed software multiplication in F_{2^m}. In Annual International Conference on Cryptology in India, 2000.

[LHy08] H. Li and Q. Huan-yan. Parallelized network coding with SIMD instruction sets. In International Symposium on Computer Science and Computational Technology, pages 364-369. IEEE, December 2008.

[LSXP13] J. Luo, M. Shrestha, L. Xu, and J. S. Plank. Efficient encoding schedules for XOR-based erasure codes. IEEE Transactions on Computers, May 2013.

[Mar94] G. Marsaglia. The mother of all random generators. ftp://ftp.taygeta.com/pub/c/mother.c, October 1994.

[PGM13a] J. S. Plank, K. M. Greenan, and E. L. Miller. A complete treatment of software implementations of finite field arithmetic for erasure coding applications. Technical Report UT-CS-13-717, University of Tennessee, September 2013.

[PGM13b] J. S. Plank, K. M. Greenan, and E. L. Miller. Screaming fast Galois Field arithmetic using Intel SIMD instructions. In FAST-2013: 11th Usenix Conference on File and Storage Technologies, San Jose, February 2013.

[Pla97] J. S. Plank. A tutorial on Reed-Solomon coding for fault-tolerance in RAID-like systems. Software - Practice & Experience, 27(9):995-1012, September 1997.
Method                             Speed (MB/s)
-m SPLIT 16 4 -r ALTMAP -             10460.834
-m SPLIT 16 4 -r SSE (Default) -       8473.793
-m COMPOSITE 2 - -r ALTMAP -           5215.073
-m LOG -r CAUCHY -                     2428.824
-m TABLE -                             2319.129
-m SPLIT 16 8 -                        2164.111
-m SPLIT 8 8 -                         2163.993
-m SPLIT 16 4 -r NOSSE -               1148.810
-m LOG -                               1019.896
-m LOG_ZERO -                          1016.814
-m BYTWO_b -                            738.879
-m COMPOSITE 2 - -                      596.819
-m BYTWO_p -                            560.972
-m GROUP 4 4 -                          450.815
-m BYTWO_b -r NOSSE -                   332.967
-m BYTWO_p -r NOSSE -                   249.849
-m CARRY_FREE -                         111.582
-m SHIFT -                               95.813

Table 5: Speed of various calls to multiply_region() for w = 16.


[PMG+13] J. S. Plank, E. L. Miller, K. M. Greenan, B. A. Arnold, J. A. Burnum, A. W. Disney, and A. C. McBride. GF-Complete: A comprehensive open source library for Galois Field arithmetic. Version 1.0. Technical Report UT-CS-13-716, University of Tennessee, September 2013.

[PSR12] J. S. Plank, C. D. Schuman, and B. D. Robison. Heuristics for optimizing matrix-based erasure codes for fault-tolerant storage systems. In DSN-2012: The International Conference on Dependable Systems and Networks, Boston, MA, June 2012. IEEE.

[Rab89] M. O. Rabin. Efficient dispersal of information for security, load balancing, and fault tolerance. Journal of the Association for Computing Machinery, 36(2):335-348, April 1989.
Method                                                  Speed (MB/s)
-m SPLIT 32 4 -r SSE -r ALTMAP -                            7185.440
-m SPLIT 32 4 (Default)                                     5063.966
-m COMPOSITE 2 -m SPLIT 16 4 -r ALTMAP - -r ALTMAP -        4176.440
-m COMPOSITE 2 - -r ALTMAP -                                3360.860
-m SPLIT 8 8 -                                              1345.678
-m SPLIT 32 8 -                                             1340.656
-m SPLIT 32 16 -                                            1262.676
-m SPLIT 8 8 -r CAUCHY                                      1143.263
-m SPLIT 32 4 -r NOSSE                                       480.859
-m CARRY_FREE -p 0xc5                                        393.185
-m COMPOSITE 2 -                                             332.964
-m BYTWO_b -                                                 309.971
-m BYTWO_p -                                                 258.623
-m GROUP 4 8 -                                               242.076
-m GROUP 4 4 -                                               227.399
-m CARRY_FREE -                                              226.785
-m BYTWO_b -r NOSSE -                                        143.403
-m BYTWO_p -r NOSSE -                                        111.956
-m SHIFT -                                                    52.295

Table 6: Speed of various calls to multiply_region() for w = 32.


Method                                                  Speed (MB/s)
-m SPLIT 64 4 -r ALTMAP -                                   3522.798
-m SPLIT 64 4 -r SSE (Default) -                            2647.862
-m COMPOSITE 2 -m SPLIT 32 4 -r ALTMAP - -r ALTMAP -        2461.572
-m COMPOSITE 2 - -r ALTMAP -                                1860.921
-m SPLIT 64 16 -                                            1066.490
-m SPLIT 64 8 -                                              998.461
-m CARRY_FREE -                                              975.290
-m SPLIT 64 4 -r NOSSE -                                     545.479
-m GROUP 4 4 -                                               230.137
-m GROUP 4 8 -                                               153.947
-m BYTWO_b -                                                 144.052
-m BYTWO_p -                                                 124.538
-m SPLIT 8 8 -                                                98.892
-m BYTWO_p -r NOSSE -                                         77.912
-m COMPOSITE 2 - -                                            77.522
-m BYTWO_b -r NOSSE -                                         36.391
-m SHIFT -                                                    25.282

Table 7: Speed of various calls to multiply_region() for w = 64.


Method                                                  Speed (MB/s)
-m SPLIT 128 4 -r ALTMAP -                                  1727.683
-m COMPOSITE 2 -m SPLIT 64 4 -r ALTMAP - -r ALTMAP -        1385.693
-m COMPOSITE 2 - -r ALTMAP -                                1041.456
-m SPLIT 128 8 (Default) -                                   872.619
-m CARRY_FREE -                                              814.030
-m SPLIT 128 4 -                                             500.133
-m COMPOSITE 2 -                                             289.207
-m GROUP 4 8 -                                               133.583
-m GROUP 4 4 -                                               116.187
-m BYTWO_p -                                                  25.162
-m BYTWO_b -                                                  25.157
-m SHIFT -                                                    14.183

Table 8: Speed of various calls to multiply_region() for w = 128.


    diff --git a/src/erasure-code/jerasure/gf-complete/manual/image1.png b/src/erasure-code/jerasure/gf-complete/manual/image1.png new file mode 100644 index 000000000..c0f7d9511 Binary files /dev/null and b/src/erasure-code/jerasure/gf-complete/manual/image1.png differ diff --git a/src/erasure-code/jerasure/gf-complete/manual/image2.png b/src/erasure-code/jerasure/gf-complete/manual/image2.png new file mode 100644 index 000000000..38ff273df Binary files /dev/null and b/src/erasure-code/jerasure/gf-complete/manual/image2.png differ diff --git a/src/erasure-code/jerasure/gf-complete/manual/image3.png b/src/erasure-code/jerasure/gf-complete/manual/image3.png new file mode 100644 index 000000000..0b4667c55 Binary files /dev/null and b/src/erasure-code/jerasure/gf-complete/manual/image3.png differ diff --git a/src/erasure-code/jerasure/gf-complete/manual/image4.png b/src/erasure-code/jerasure/gf-complete/manual/image4.png new file mode 100644 index 000000000..ba6cf0780 Binary files /dev/null and b/src/erasure-code/jerasure/gf-complete/manual/image4.png differ diff --git a/src/erasure-code/jerasure/gf-complete/manual/image5.png b/src/erasure-code/jerasure/gf-complete/manual/image5.png new file mode 100644 index 000000000..0e169dd71 Binary files /dev/null and b/src/erasure-code/jerasure/gf-complete/manual/image5.png differ diff --git a/src/erasure-code/jerasure/gf-complete/manual/image6.png b/src/erasure-code/jerasure/gf-complete/manual/image6.png new file mode 100644 index 000000000..33d4cebb3 Binary files /dev/null and b/src/erasure-code/jerasure/gf-complete/manual/image6.png differ diff --git a/src/erasure-code/jerasure/gf-complete/manual/image7.png b/src/erasure-code/jerasure/gf-complete/manual/image7.png new file mode 100644 index 000000000..7694e1c07 Binary files /dev/null and b/src/erasure-code/jerasure/gf-complete/manual/image7.png differ diff --git a/src/erasure-code/jerasure/gf-complete/manual/style.css b/src/erasure-code/jerasure/gf-complete/manual/style.css new file mode 100644 index 000000000..39721946e --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/manual/style.css @@ -0,0 +1,404 @@ + +body { +margin:147px 104px 147px 173px; + + +font-size:20px; +text-align:justify; + + + +} + +#index_number{ + +float:right; + +} + +a { + +text-decoration:none; +font-size:19px; +color:#19191F; +letter-spacing:1.5px; + +font-family: 'Roboto Condensed', sans-serif; + + + + + +} +/*This is page1 css */ + +#box { + +text-align:center; +font-size:19px; +margin-top:166px; + + +} + +#body_text{ + +font-family: 'Roboto Condensed', sans-serif; +font-size:18px; + +} + +h1{ +font-weight:inherit; + +} + +h4{ +font-size:22px; + +font-weight:inherit; + +} + +#footer{ + +margin:1px 0px 1px 0px; +font-size:18px; +text-align:justify; +padding-bottom:104px; + + + +} + +p { +margin:0; +text-indent: 50px; +font-size:19px; +text-align:justify; + + +} + + +#footer_bar { +border-top:solid; + +border-top-width:thin; + +} + +#pages_paragraphs { +margin:1px 115px 1px 57px; + + + +} + +#pages_paragraphs_2{ +margin:1px 0px 1px 0px; +font-size:20px; +text-align:justify; +} + +.code{ +font-size:22px; + +} + + + + +/* This is page3 css */ + +.index{ +font-weight:bold; +text-align:justify; + +} + +.sub_indices { + +padding-left:52px; +text-align:justify; +} + + + +.aligning_numbers{ + +padding-left:27px; + + +} + +.aligning_page_number{ + + +float:right; + + +} + +/* This page 6 css */ +.box { + +height:223px; +} + + + +.image-cell_1 { + background: url(image1.png) no-repeat; + width:716px; + height:300px; + + 
float:left; + margin-left:180px; + margin-right:134px; + margin-bottom:1px; + margin-bottom:31px; + +} + + +/* This page 9 and 10 css */ + + + +#number_spacing{ + +letter-spacing:1px; +font-size:17px; + + +} + + +#number_spacing_1{ + +letter-spacing:1px; +font-size:19px; +margin-left:10px; + + +} + +/* this page 13 css */ + + +.image-cell_2 { + background: url(image2.png) no-repeat; + width:939px; + height:419px; + + float:left; + margin-left:68px; + margin-right:134px; + margin-bottom:1px; + margin-bottom:31px; + +} + +/* This is page 14 */ +#data1 table{ +border-top-style:solid; +border-left-style:solid; + +border-bottom-style:solid; +font-family: 'Roboto Condensed', sans-serif; + +} + +#data1 th{ +border-bottom-style:solid; +border-right-style:solid; +border-right-style:thin; +font-family: 'Roboto Condensed', sans-serif; + + +} + +#data1 td { +border-right-style:solid; + +font-family: 'Roboto Condensed', sans-serif; + +} + + +/* This is page 28 */ +#table_page28 table{ +border-top-style:solid; +border-left-style:solid; + +border-bottom-style:solid; +border-top-width:thin; +border-left-width:thin; +border-bottom-width:thin; +font-family: 'Roboto Condensed', sans-serif; + +} + +#table_page28 th{ +border-bottom-style:solid; +border-right-style:solid; +border-right-width:thin; +border-bottom-width:thin; +font-family: 'Roboto Condensed', sans-serif; + + +} + +#table_page28 td { +border-right-style:solid; +border-bottom-style:solid; +border-bottom-width:thin; +border-right-width:thin; +font-family: 'Roboto Condensed', sans-serif; + +} + + +/* This is page 30 */ +#table_page30 table{ +border-top-style:solid; +border-left-style:solid; + +border-bottom-style:solid; + +} + +#table_page30 th{ +border-bottom-style:solid; +border-right-style:solid; + + +} + +#table_page30 td { +border-right-style:solid; +border-bottom-style:solid; + + +} +#box_1 { + +height:485px; +margin-top:44px; +margin-bottom:-61px; + +} +.image-cell_3 { + background: url(image3.png) no-repeat; + width:583px; + height:393px; + + float:left; + +} + +.image-cell_4 { + background: url(image4.png) no-repeat; + width:487px; + height:390px; + + float:right; + + + +} + +/* This is page 42 Css */ + + +.image-cell_5 { + background: url(image5.png) no-repeat; + width:907px; + height:592px; + + float:left; + margin-right:134px; + margin-bottom:1px; + margin-bottom:31px; + +} + + +/* This is page 43 Css */ + + +.image-cell_6 { + background: url(image6.png) no-repeat; + width:851px; + height:532px; + + margin-right:134px; + margin-bottom:1px; + margin-bottom:31px; + +} + +/* This is page 44 Css */ + + +.image-cell_7{ + background: url(image7.png) no-repeat; + width:945px; + height:321px; + + margin-right:134px; + margin-bottom:1px; + margin-bottom:31px; + +} + +/* This is page 45 */ +#data2 table{ +border-top-style:solid; +border-left-style:solid; + +border-bottom-style:solid; +border-top-width:2px; +border-left-width:2px; +border-bottom-width:2px; +border-color:black; +font-family: 'Roboto Condensed', sans-serif; + +} + +#data2 th{ +border-bottom-style:solid; +border-right-style:solid; +border-bottom-width:2px; +border-right-width:2px; +font-family: 'Roboto Condensed', sans-serif; + + +} + #data2 td { +border-right-style:solid; +border-right-width:2px; +font-family: 'Roboto Condensed', sans-serif; + +} + + + + + + + + + + + + + + + + + + + + diff --git a/src/erasure-code/jerasure/gf-complete/src/Makefile.am b/src/erasure-code/jerasure/gf-complete/src/Makefile.am new file mode 100644 index 000000000..cfc2a5062 --- /dev/null +++ 
b/src/erasure-code/jerasure/gf-complete/src/Makefile.am @@ -0,0 +1,32 @@ +# GF-Complete 'core' AM file +# Creates the library + +AUTOMAKE_OPTIONS = subdir-objects + +AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include + +# avoid using SIMD_FLAGS for code that calls strcmp as new gcc +# versions will use SIMD for the strcmp implementation. Instead +# we create a static library just for gf_method that is not compiled +# with SIMD_FLAGS, this static library will get linked into gf_complete.so +noinst_LTLIBRARIES = libgf_util.la +libgf_util_la_SOURCES = gf_method.c +libgf_util_la_CFLAGS = -O3 -fPIC -Wsign-compare + +# we narrowly use SIMD_FLAGS for code that needs it +lib_LTLIBRARIES = libgf_complete.la +libgf_complete_la_SOURCES = gf.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \ + gf_w64.c gf_w128.c gf_rand.c gf_general.c gf_cpu.c +libgf_complete_la_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC -Wsign-compare +libgf_complete_la_LIBADD = libgf_util.la + +if HAVE_NEON +libgf_complete_la_SOURCES += neon/gf_w4_neon.c \ + neon/gf_w8_neon.c \ + neon/gf_w16_neon.c \ + neon/gf_w32_neon.c \ + neon/gf_w64_neon.c +endif + +libgf_complete_la_LDFLAGS = -version-info 1:0:0 + diff --git a/src/erasure-code/jerasure/gf-complete/src/gf.c b/src/erasure-code/jerasure/gf-complete/src/gf.c new file mode 100644 index 000000000..84d6996d9 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf.c @@ -0,0 +1,1090 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf.c + * + * Generic routines for Galois fields + */ + +#include "gf_int.h" +#include +#include +#include +#include "gf_cpu.h" + +int _gf_errno = GF_E_DEFAULT; + +void gf_error() +{ + char *s; + + switch(_gf_errno) { + case GF_E_DEFAULT: s = "No Error."; break; + case GF_E_TWOMULT: s = "Cannot specify two -m's."; break; + case GF_E_TWO_DIV: s = "Cannot specify two -d's."; break; + case GF_E_POLYSPC: s = "-p needs to be followed by a number in hex (0x optional)."; break; + case GF_E_GROUPAR: s = "Ran out of arguments in -m GROUP."; break; + case GF_E_GROUPNU: s = "In -m GROUP g_s g_r -- g_s and g_r need to be numbers."; break; + case GF_E_SPLITAR: s = "Ran out of arguments in -m SPLIT."; break; + case GF_E_SPLITNU: s = "In -m SPLIT w_a w_b -- w_a and w_b need to be numbers."; break; + case GF_E_FEWARGS: s = "Not enough arguments (Perhaps end with '-'?)"; break; + case GF_E_CFM___W: s = "-m CARRY_FREE, w must be 4, 8, 16, 32, 64 or 128."; break; + case GF_E_COMPXPP: s = "-m COMPOSITE, No poly specified, and we don't have a default for the given sub-field."; break; + case GF_E_BASE__W: s = "-m COMPOSITE and the base field is not for w/2."; break; + case GF_E_CFM4POL: s = "-m CARRY_FREE, w=4. (Prim-poly & 0xc) must equal 0."; break; + case GF_E_CFM8POL: s = "-m CARRY_FREE, w=8. (Prim-poly & 0x80) must equal 0."; break; + case GF_E_CF16POL: s = "-m CARRY_FREE, w=16. (Prim-poly & 0xe000) must equal 0."; break; + case GF_E_CF32POL: s = "-m CARRY_FREE, w=32. (Prim-poly & 0xfe000000) must equal 0."; break; + case GF_E_CF64POL: s = "-m CARRY_FREE, w=64. 
(Prim-poly & 0xfffe000000000000ULL) must equal 0."; break; + case GF_E_MDEFDIV: s = "If multiplication method == default, can't change division."; break; + case GF_E_MDEFREG: s = "If multiplication method == default, can't change region."; break; + case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break; + case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break; + case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break; + case GF_E_SIMD_NO: s = "Cannot specify -r SIMD and -r NOSIMD."; break; + case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break; + case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break; + case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break; + case GF_E_ARG1SET: s = "Only use arg1 with SPLIT, GROUP or COMPOSITE."; break; + case GF_E_ARG2SET: s = "Only use arg2 with SPLIT or GROUP."; break; + case GF_E_MATRIXW: s = "Cannot specify -d MATRIX with w > 32."; break; + case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break; + case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break; + case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break; + case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SIMD|NOSIMD."; break; + case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break; + case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break; + case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break; + case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SIMD|NOSIMD."; break; + case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break; + case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break; + case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break; + case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break; + case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SIMD|NOSIMD."; break; + case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break; + case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SIMD|NOSIMD."; break; + case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break; + case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break; + case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SIMD, but SSE2 is not supported."; break; + case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break; + case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SIMD|NOSIMD."; break; + case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break; + case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break; + case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break; + case GF_E_GR_ARGX: s = "With -m GROUP, arg1 and arg2 must be >= 0."; break; + case GF_E_GR_W_48: s = "With -m GROUP, w cannot be 4 or 8."; break; + case GF_E_GR_W_16: s = "With -m GROUP, w == 16, arg1 and arg2 must be 4."; break; + case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break; + case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break; + case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break; + case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SIMD|NOSIMD."; break; + case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break; + case GF_E_TAB_SSE: s = "With -m TABLE, SIMD|NOSIMD 
only applies to w=4."; break; + case GF_E_TABSSE3: s = "With -m TABLE, -r SIMD, you need SSSE3 supported."; break; + case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break; + case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break; + case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SIMD requires -r ALTMAP."; break; + case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break; + case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break; + case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SIMD|NOSIMD only with arg1/arg2 = 4/128."; break; + case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break; + case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break; + case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break; + case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SIMD|NOSIMD only with arg1/arg2 = 4/16."; break; + case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break; + case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break; + case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break; + case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SIMD|NOSIMD only with arg1/arg2 = 4/32."; break; + case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break; + case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break; + case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break; + case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SIMD|NOSIMD only with arg1/arg2 = 4/64."; break; + case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break; + case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break; + case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SIMD."; break; + case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break; + case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SIMD and -r NOSIMD do not apply."; break; + case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break; + case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break; + case GF_E_UNKNOWN: s = "Unknown multiplication type."; break; + case GF_E_UNK_REG: s = "Unknown region type."; break; + case GF_E_UNK_DIV: s = "Unknown division type."; break; + default: s = "Undefined error."; + } + + fprintf(stderr, "%s\n", s); +} + +uint64_t gf_composite_get_default_poly(gf_t *base) +{ + gf_internal_t *h; + uint64_t rv; + + h = (gf_internal_t *) base->scratch; + if (h->w == 4) { + if (h->mult_type == GF_MULT_COMPOSITE) return 0; + if (h->prim_poly == 0x13) return 2; + return 0; + } + if (h->w == 8) { + if (h->mult_type == GF_MULT_COMPOSITE) return 0; + if (h->prim_poly == 0x11d) return 3; + return 0; + } + if (h->w == 16) { + if (h->mult_type == GF_MULT_COMPOSITE) { + rv = gf_composite_get_default_poly(h->base_gf); + if (rv != h->prim_poly) return 0; + if (rv == 3) return 0x105; + return 0; + } else { + if (h->prim_poly == 0x1100b) return 2; + if (h->prim_poly == 0x1002d) return 7; + return 0; + } + } + if (h->w == 32) { + if (h->mult_type == GF_MULT_COMPOSITE) { + rv = gf_composite_get_default_poly(h->base_gf); + if (rv != h->prim_poly) return 0; + if (rv == 2) return 0x10005; + if (rv == 7) return 0x10008; + if (rv == 0x105) return 0x10002; + return 0; + } else { + if (h->prim_poly == 0x400007) return 2; + if (h->prim_poly == 0xc5) return 3; + return 0; + } + } + if 
(h->w == 64) { + if (h->mult_type == GF_MULT_COMPOSITE) { + rv = gf_composite_get_default_poly(h->base_gf); + if (rv != h->prim_poly) return 0; + if (rv == 3) return 0x100000009ULL; + if (rv == 2) return 0x100000004ULL; + if (rv == 0x10005) return 0x100000003ULL; + if (rv == 0x10002) return 0x100000005ULL; + if (rv == 0x10008) return 0x100000006ULL; /* JSP: (0x0x100000003 works too, + but I want to differentiate cases). */ + return 0; + } else { + if (h->prim_poly == 0x1bULL) return 2; + return 0; + } + } + return 0; +} + +int gf_error_check(int w, int mult_type, int region_type, int divide_type, + int arg1, int arg2, uint64_t poly, gf_t *base) +{ + int sse3 = 0; + int sse2 = 0; + int pclmul = 0; + int rdouble, rquad, rlazy, rsimd, rnosimd, raltmap, rcauchy, tmp; + gf_internal_t *sub; + + rdouble = (region_type & GF_REGION_DOUBLE_TABLE); + rquad = (region_type & GF_REGION_QUAD_TABLE); + rlazy = (region_type & GF_REGION_LAZY); + rsimd = (region_type & GF_REGION_SIMD); + rnosimd = (region_type & GF_REGION_NOSIMD); + raltmap = (region_type & GF_REGION_ALTMAP); + rcauchy = (region_type & GF_REGION_CAUCHY); + + if (divide_type != GF_DIVIDE_DEFAULT && + divide_type != GF_DIVIDE_MATRIX && + divide_type != GF_DIVIDE_EUCLID) { + _gf_errno = GF_E_UNK_DIV; + return 0; + } + + tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY | + GF_REGION_SIMD | GF_REGION_NOSIMD | GF_REGION_ALTMAP | + GF_REGION_CAUCHY ); + if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; } + +#ifdef INTEL_SSE2 + if (gf_cpu_supports_intel_sse2) { + sse2 = 1; + } +#endif + +#ifdef INTEL_SSSE3 + if (gf_cpu_supports_intel_ssse3) { + sse3 = 1; + } +#endif + +#ifdef INTEL_SSE4_PCLMUL + if (gf_cpu_supports_intel_pclmul) { + pclmul = 1; + } +#endif + +#ifdef ARM_NEON + if (gf_cpu_supports_arm_neon) { + pclmul = (w == 4 || w == 8); + sse3 = 1; + } +#endif + + + if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; } + + if (mult_type != GF_MULT_COMPOSITE && w < 64) { + if ((poly >> (w+1)) != 0) { _gf_errno = GF_E_BADPOLY; return 0; } + } + + if (mult_type == GF_MULT_DEFAULT) { + if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_MDEFDIV; return 0; } + if (region_type != GF_REGION_DEFAULT) { _gf_errno = GF_E_MDEFREG; return 0; } + if (arg1 != 0 || arg2 != 0) { _gf_errno = GF_E_MDEFARG; return 0; } + return 1; + } + + if (rsimd && rnosimd) { _gf_errno = GF_E_SIMD_NO; return 0; } + if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; } + if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; } + if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; } + + if (arg1 != 0 && mult_type != GF_MULT_COMPOSITE && + mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) { + _gf_errno = GF_E_ARG1SET; + return 0; + } + + if (arg2 != 0 && mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) { + _gf_errno = GF_E_ARG2SET; + return 0; + } + + if (divide_type == GF_DIVIDE_MATRIX && w > 32) { _gf_errno = GF_E_MATRIXW; return 0; } + + if (rdouble) { + if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; } + if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; } + if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; } + if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; } + if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; } + return 1; + } + + if (rquad) { + if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; } + if (w != 4) { _gf_errno = 
GF_E_QUAD__W; return 0; } + if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; } + return 1; + } + + if (rlazy) { _gf_errno = GF_E_LAZY__X; return 0; } + + if (mult_type == GF_MULT_SHIFT) { + if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; } + if (rsimd || rnosimd) { _gf_errno = GF_E_SSESHIF; return 0; } + return 1; + } + + if (mult_type == GF_MULT_CARRY_FREE) { + if (w != 4 && w != 8 && w != 16 && + w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; } + if (w == 4 && (poly & 0xc)) { _gf_errno = GF_E_CFM4POL; return 0; } + if (w == 8 && (poly & 0x80)) { _gf_errno = GF_E_CFM8POL; return 0; } + if (w == 16 && (poly & 0xe000)) { _gf_errno = GF_E_CF16POL; return 0; } + if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; } + if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; } + if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; } + if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; } + if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; } + return 1; + } + + if (mult_type == GF_MULT_CARRY_FREE_GK) { + if (w != 4 && w != 8 && w != 16 && + w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; } + if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; } + if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; } + if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; } + return 1; + } + + if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) { + if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; } + if (rsimd && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; } + return 1; + } + + if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO + || mult_type == GF_MULT_LOG_ZERO_EXT ) { + if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; } + if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_LOG___J; return 0; } + + if (mult_type == GF_MULT_LOG_TABLE) return 1; + + if (w != 8 && w != 16) { _gf_errno = GF_E_ZERBADW; return 0; } + + if (mult_type == GF_MULT_LOG_ZERO) return 1; + + if (w != 8) { _gf_errno = GF_E_ZEXBADW; return 0; } + return 1; + } + + if (mult_type == GF_MULT_GROUP) { + if (arg1 <= 0 || arg2 <= 0) { _gf_errno = GF_E_GR_ARGX; return 0; } + if (w == 4 || w == 8) { _gf_errno = GF_E_GR_W_48; return 0; } + if (w == 16 && (arg1 != 4 || arg2 != 4)) { _gf_errno = GF_E_GR_W_16; return 0; } + if (w == 128 && (arg1 != 4 || + (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; } + if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; } + if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; } + if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_GR____J; return 0; } + return 1; + } + + if (mult_type == GF_MULT_TABLE) { + if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; } + if (w != 4 && (rsimd || rnosimd)) { _gf_errno = GF_E_TAB_SSE; return 0; } + if (rsimd && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; } + if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; } + return 1; + } + + if (mult_type == GF_MULT_SPLIT_TABLE) { + if (arg1 > arg2) { + tmp = arg1; + arg1 = arg2; + arg2 = tmp; + } + if (w == 8) { + if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; } + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; } + } else if (w == 16) { + if ((arg1 == 8 && arg2 == 8) || + (arg1 == 8 && arg2 == 16)) { + if (rsimd || rnosimd) { _gf_errno = GF_E_SP_16_S; return 0; } + if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; } + } else if 
(arg1 == 4 && arg2 == 16) { + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + } else { _gf_errno = GF_E_SP_16AR; return 0; } + } else if (w == 32) { + if ((arg1 == 8 && arg2 == 8) || + (arg1 == 8 && arg2 == 32) || + (arg1 == 16 && arg2 == 32)) { + if (rsimd || rnosimd) { _gf_errno = GF_E_SP_32_S; return 0; } + if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; } + } else if (arg1 == 4 && arg2 == 32) { + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; } + if (raltmap && rnosimd) { _gf_errno = GF_E_SP_32AS; return 0; } + } else { _gf_errno = GF_E_SP_32AR; return 0; } + } else if (w == 64) { + if ((arg1 == 8 && arg2 == 8) || + (arg1 == 8 && arg2 == 64) || + (arg1 == 16 && arg2 == 64)) { + if (rsimd || rnosimd) { _gf_errno = GF_E_SP_64_S; return 0; } + if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; } + } else if (arg1 == 4 && arg2 == 64) { + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; } + if (raltmap && rnosimd) { _gf_errno = GF_E_SP_64AS; return 0; } + } else { _gf_errno = GF_E_SP_64AR; return 0; } + } else if (w == 128) { + if (arg1 == 8 && arg2 == 128) { + if (rsimd || rnosimd) { _gf_errno = GF_E_SP128_S; return 0; } + if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; } + } else if (arg1 == 4 && arg2 == 128) { + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } + if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; } + if (raltmap && rnosimd) { _gf_errno = GF_E_SP128AS; return 0; } + } else { _gf_errno = GF_E_SP128AR; return 0; } + } else { _gf_errno = GF_E_SPLIT_W; return 0; } + return 1; + } + + if (mult_type == GF_MULT_COMPOSITE) { + if (w != 8 && w != 16 && w != 32 + && w != 64 && w != 128) { _gf_errno = GF_E_COMP__W; return 0; } + if (w < 128 && (poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; } + if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; } + if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; } + if (rsimd || rnosimd) { _gf_errno = GF_E_COMP_SS; return 0; } + if (base != NULL) { + sub = (gf_internal_t *) base->scratch; + if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; } + if (poly == 0) { + if (gf_composite_get_default_poly(base) == 0) { _gf_errno = GF_E_COMPXPP; return 0; } + } + } + return 1; + } + + _gf_errno = GF_E_UNKNOWN; + return 0; +} + +int gf_scratch_size(int w, + int mult_type, + int region_type, + int divide_type, + int arg1, + int arg2) +{ + if (gf_error_check(w, mult_type, region_type, divide_type, arg1, arg2, 0, NULL) == 0) return 0; + + switch(w) { + case 4: return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2); + case 8: return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2); + case 16: return gf_w16_scratch_size(mult_type, region_type, divide_type, arg1, arg2); + case 32: return gf_w32_scratch_size(mult_type, region_type, divide_type, arg1, arg2); + case 64: return gf_w64_scratch_size(mult_type, region_type, divide_type, arg1, arg2); + case 128: return gf_w128_scratch_size(mult_type, region_type, divide_type, arg1, arg2); + default: return gf_wgen_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2); + } +} + +extern int gf_size(gf_t *gf) +{ + gf_internal_t *h; + int s; + + s = sizeof(gf_t); + h = (gf_internal_t *) gf->scratch; + s += gf_scratch_size(h->w, h->mult_type, h->region_type, h->divide_type, h->arg1, h->arg2); + if (h->mult_type == GF_MULT_COMPOSITE) s += 
gf_size(h->base_gf); + return s; +} + + +int gf_init_easy(gf_t *gf, int w) +{ + return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, + 0, 0, 0, NULL, NULL); +} + +/* Allen: What's going on here is this function is putting info into the + scratch mem of gf, and then calling the relevant REAL init + func for the word size. Probably done this way to consolidate + those aspects of initialization that don't rely on word size, + and then take care of word-size-specific stuff. */ + +int gf_init_hard(gf_t *gf, int w, int mult_type, + int region_type, + int divide_type, + uint64_t prim_poly, + int arg1, int arg2, + gf_t *base_gf, + void *scratch_memory) +{ + int sz; + gf_internal_t *h; + + gf_cpu_identify(); + + if (gf_error_check(w, mult_type, region_type, divide_type, + arg1, arg2, prim_poly, base_gf) == 0) return 0; + + sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2); + if (sz <= 0) return 0; /* This shouldn't happen, as all errors should get caught + in gf_error_check() */ + + if (scratch_memory == NULL) { + h = (gf_internal_t *) malloc(sz); + h->free_me = 1; + } else { + h = scratch_memory; + h->free_me = 0; + } + gf->scratch = (void *) h; + h->mult_type = mult_type; + h->region_type = region_type; + h->divide_type = divide_type; + h->w = w; + h->prim_poly = prim_poly; + h->arg1 = arg1; + h->arg2 = arg2; + h->base_gf = base_gf; + h->private = (void *) gf->scratch; + h->private = (uint8_t *)h->private + (sizeof(gf_internal_t)); + gf->extract_word.w32 = NULL; + + switch(w) { + case 4: return gf_w4_init(gf); + case 8: return gf_w8_init(gf); + case 16: return gf_w16_init(gf); + case 32: return gf_w32_init(gf); + case 64: return gf_w64_init(gf); + case 128: return gf_w128_init(gf); + default: return gf_wgen_init(gf); + } +} + +int gf_free(gf_t *gf, int recursive) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + if (recursive && h->base_gf != NULL) { + gf_free(h->base_gf, 1); + free(h->base_gf); + } + if (h->free_me) free(h); + return 0; /* Making compiler happy */ +} + +void gf_alignment_error(char *s, int a) +{ + fprintf(stderr, "Alignment error in %s:\n", s); + fprintf(stderr, " The source and destination buffers must be aligned to each other,\n"); + fprintf(stderr, " and they must be aligned to a %d-byte address.\n", a); + assert(0); +} + +static +void gf_invert_binary_matrix(uint32_t *mat, uint32_t *inv, int rows) { + int cols, i, j; + uint32_t tmp; + + cols = rows; + + for (i = 0; i < rows; i++) inv[i] = (1 << i); + + /* First -- convert into upper triangular */ + + for (i = 0; i < cols; i++) { + + /* Swap rows if we ave a zero i,i element. If we can't swap, then the + matrix was not invertible */ + + if ((mat[i] & (1 << i)) == 0) { + for (j = i+1; j < rows && (mat[j] & (1 << i)) == 0; j++) ; + if (j == rows) { + fprintf(stderr, "galois_invert_matrix: Matrix not invertible!!\n"); + assert(0); + } + tmp = mat[i]; mat[i] = mat[j]; mat[j] = tmp; + tmp = inv[i]; inv[i] = inv[j]; inv[j] = tmp; + } + + /* Now for each j>i, add A_ji*Ai to Aj */ + for (j = i+1; j != rows; j++) { + if ((mat[j] & (1 << i)) != 0) { + mat[j] ^= mat[i]; + inv[j] ^= inv[i]; + } + } + } + + /* Now the matrix is upper triangular. Start at the top and multiply down */ + + for (i = rows-1; i >= 0; i--) { + for (j = 0; j < i; j++) { + if (mat[j] & (1 << i)) { + /* mat[j] ^= mat[i]; */ + inv[j] ^= inv[i]; + } + } + } +} + +uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp) +{ + uint32_t mat[32], inv[32], mask; + int i; + + mask = (w == 32) ? 
0xffffffff : ((uint32_t)1 << w) - 1; + for (i = 0; i < w; i++) { + mat[i] = y; + + if (y & (1 << (w-1))) { + y = y << 1; + y = ((y ^ pp) & mask); + } else { + y = y << 1; + } + } + + gf_invert_binary_matrix(mat, inv, w); + return inv[0]; +} + +void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base) +{ + uint64_t a, prod; + int xor; + uint64_t *s64, *d64, *top; + + s64 = rd->s_start; + d64 = rd->d_start; + top = rd->d_top; + xor = rd->xor; + + if (xor) { + while (d64 != top) { + a = *s64; + prod = base[a >> 48]; + a <<= 16; + prod <<= 16; + prod ^= base[a >> 48]; + a <<= 16; + prod <<= 16; + prod ^= base[a >> 48]; + a <<= 16; + prod <<= 16; + prod ^= base[a >> 48]; + prod ^= *d64; + *d64 = prod; + s64++; + d64++; + } + } else { + while (d64 != top) { + a = *s64; + prod = base[a >> 48]; + a <<= 16; + prod <<= 16; + prod ^= base[a >> 48]; + a <<= 16; + prod <<= 16; + prod ^= base[a >> 48]; + a <<= 16; + prod <<= 16; + prod ^= base[a >> 48]; + *d64 = prod; + s64++; + d64++; + } + } +} + +static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, void *s_top) +{ + uint8_t *s8, *d8; + uint16_t *s16, *d16; + uint32_t *s32, *d32; + uint64_t *s64, *d64; + gf_internal_t *h; + int wb; + uint32_t p, a; + + h = rd->gf->scratch; + wb = (h->w)/8; + if (wb == 0) wb = 1; + + while (src < s_top) { + switch (h->w) { + case 8: + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + *d8 = (rd->xor) ? (*d8 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s8)) : + rd->gf->multiply.w32(rd->gf, rd->val, *s8); + break; + case 4: + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + a = *s8; + p = rd->gf->multiply.w32(rd->gf, rd->val, a&0xf); + p |= (rd->gf->multiply.w32(rd->gf, rd->val, a >> 4) << 4); + if (rd->xor) p ^= *d8; + *d8 = p; + break; + case 16: + s16 = (uint16_t *) src; + d16 = (uint16_t *) dest; + *d16 = (rd->xor) ? (*d16 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s16)) : + rd->gf->multiply.w32(rd->gf, rd->val, *s16); + break; + case 32: + s32 = (uint32_t *) src; + d32 = (uint32_t *) dest; + *d32 = (rd->xor) ? (*d32 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s32)) : + rd->gf->multiply.w32(rd->gf, rd->val, *s32); + break; + case 64: + s64 = (uint64_t *) src; + d64 = (uint64_t *) dest; + *d64 = (rd->xor) ? (*d64 ^ rd->gf->multiply.w64(rd->gf, rd->val, *s64)) : + rd->gf->multiply.w64(rd->gf, rd->val, *s64); + break; + default: + fprintf(stderr, "Error: gf_slow_multiply_region: w=%d not implemented.\n", h->w); + exit(1); + } + src = (uint8_t *)src + wb; + dest = (uint8_t *)dest + wb; + } +} + +/* JSP - The purpose of this procedure is to error check alignment, + and to set up the region operation so that it can best leverage + large words. + + It stores its information in rd. + + Assuming you're not doing Cauchy coding, (see below for that), + then w will be 4, 8, 16, 32 or 64. It can't be 128 (probably + should change that). + + src and dest must then be aligned on ceil(w/8)-byte boundaries. + Moreover, bytes must be a multiple of ceil(w/8). If the variable + align is equal to ceil(w/8), then we will set s_start = src, + d_start = dest, s_top to (src+bytes) and d_top to (dest+bytes). + And we return -- the implementation will go ahead and do the + multiplication on individual words (e.g. using discrete logs). + + If align is greater than ceil(w/8), then the implementation needs + to work on groups of "align" bytes. For example, suppose you are + implementing BYTWO, without SSE. Then you will be doing the region + multiplication in units of 8 bytes, so align = 8. 
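+   (A worked illustration of the pointer arithmetic described below,
+   with made-up addresses: take w = 8, so that ceil(w/8) = 1, with
+   src = 0x1001, dest = 0x2001, bytes = 0x40 and align = 8.  Then
+   s_start = 0x1008, d_start = 0x2008, s_top = 0x1040 and
+   d_top = 0x2040; the 7 leading bytes and the 1 trailing byte get
+   multiplied one word at a time.)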
Or, suppose you + are doing a Quad table in GF(2^4). You will be doing the region + multiplication in units of 2 bytes, so align = 2. Or, suppose you + are doing split multiplication with SSE operations in GF(2^8). + Then align = 16. Worse yet, suppose you are doing split + multiplication with SSE operations in GF(2^16), with or without + ALTMAP. Then, you will be doing the multiplication on 256 bits at + a time. So align = 32. + + When align does not equal ceil(w/8), we split the region + multiplication into three parts. We are going to make s_start be + the first address greater than or equal to src that is a multiple + of align. s_top is going to be the largest address >= src+bytes + such that (s_top - s_start) is a multiple of align. We do the + same with d_start and d_top. When we say that "src and dest must + be aligned with respect to each other," we mean that s_start-src + must equal d_start-dest. + + Now, the region multiplication is done in three parts -- the part + between src and s_start must be done using single words. + Similarly, the part between s_top and src+bytes must also be done + using single words. The part between s_start and s_top will be + done in chunks of "align" bytes. + + One final thing -- if align > 16, then s_start and d_start will be + aligned on a 16 byte boundary. Perhaps we should have two + variables: align and chunksize. Then we'd have s_start & d_start + aligned to "align", and have s_top-s_start be a multiple of + chunksize. That may be less confusing, but it would be a big + change. + + Finally, if align = -1, then we are doing Cauchy multiplication, + using only XOR's. In this case, we're not going to care about + alignment because we are just doing XOR's. Instead, the only + thing we care about is that bytes must be a multiple of w. + + This is not to say that alignment doesn't matter in performance + with XOR's. See that discussion in gf_multby_one(). + + After you call gf_set_region_data(), the procedure + gf_do_initial_region_alignment() calls gf->multiply.w32() on + everything between src and s_start. The procedure + gf_do_final_region_alignment() calls gf->multiply.w32() on + everything between s_top and src+bytes. + */ + +void gf_set_region_data(gf_region_data *rd, + gf_t *gf, + void *src, + void *dest, + int bytes, + uint64_t val, + int xor, + int align) +{ + gf_internal_t *h = NULL; + int wb; + uint32_t a; + unsigned long uls, uld; + + if (gf == NULL) { /* JSP - Can be NULL if you're just doing XOR's */ + wb = 1; + } else { + h = gf->scratch; + wb = (h->w)/8; + if (wb == 0) wb = 1; + } + + rd->gf = gf; + rd->src = src; + rd->dest = dest; + rd->bytes = bytes; + rd->val = val; + rd->xor = xor; + rd->align = align; + + uls = (unsigned long) src; + uld = (unsigned long) dest; + + a = (align <= 16) ? align : 16; + + if (align == -1) { /* JSP: This is cauchy. Error check bytes, then set up the pointers + so that there are no alignment regions. */ + if (h != NULL && bytes % h->w != 0) { + fprintf(stderr, "Error in region multiply operation.\n"); + fprintf(stderr, "The size must be a multiple of %d bytes.\n", h->w); + assert(0); + } + + rd->s_start = src; + rd->d_start = dest; + rd->s_top = (uint8_t *)src + bytes; + rd->d_top = (uint8_t *)dest + bytes; + return; + } + + if (uls % a != uld % a) { + fprintf(stderr, "Error in region multiply operation.\n"); + fprintf(stderr, "The source & destination pointers must be aligned with respect\n"); + fprintf(stderr, "to each other along a %d byte boundary.\n", a); + fprintf(stderr, "Src = 0x%lx. 
Dest = 0x%lx\n", (unsigned long) src, + (unsigned long) dest); + assert(0); + } + + if (uls % wb != 0) { + fprintf(stderr, "Error in region multiply operation.\n"); + fprintf(stderr, "The pointers must be aligned along a %d byte boundary.\n", wb); + fprintf(stderr, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src, + (unsigned long) dest); + assert(0); + } + + if (bytes % wb != 0) { + fprintf(stderr, "Error in region multiply operation.\n"); + fprintf(stderr, "The size must be a multiple of %d bytes.\n", wb); + assert(0); + } + + uls %= a; + if (uls != 0) uls = (a-uls); + rd->s_start = (uint8_t *)rd->src + uls; + rd->d_start = (uint8_t *)rd->dest + uls; + bytes -= uls; + bytes -= (bytes % align); + rd->s_top = (uint8_t *)rd->s_start + bytes; + rd->d_top = (uint8_t *)rd->d_start + bytes; + +} + +void gf_do_initial_region_alignment(gf_region_data *rd) +{ + gf_slow_multiply_region(rd, rd->src, rd->dest, rd->s_start); +} + +void gf_do_final_region_alignment(gf_region_data *rd) +{ + gf_slow_multiply_region(rd, rd->s_top, rd->d_top, (uint8_t *)rd->src+rd->bytes); +} + +void gf_multby_zero(void *dest, int bytes, int xor) +{ + if (xor) return; + bzero(dest, bytes); + return; +} + +/* JSP - gf_multby_one tries to do this in the most efficient way + possible. If xor = 0, then simply call memcpy() since that + should be optimized by the system. Otherwise, try to do the xor + in the following order: + + If src and dest are aligned with respect to each other on 16-byte + boundaries and you have SSE instructions, then use aligned SSE + instructions. + + If they aren't but you still have SSE instructions, use unaligned + SSE instructions. + + If there are no SSE instructions, but they are aligned with + respect to each other on 8-byte boundaries, then do them with + uint64_t's. + + Otherwise, call gf_unaligned_xor(), which does the following: + align a destination pointer along an 8-byte boundary, and then + memcpy 32 bytes at a time from the src pointer to an array of + doubles. I'm not sure if that's the best -- probably needs + testing, but this seems like it could be a black hole. 
+ */ + +static void gf_unaligned_xor(void *src, void *dest, int bytes); + +void gf_multby_one(void *src, void *dest, int bytes, int xor) +{ + unsigned long uls, uld; + uint8_t *s8, *d8; + uint64_t *s64, *d64, *dtop64; + gf_region_data rd; + + if (!xor) { + if (dest != src) + memcpy(dest, src, bytes); + return; + } + uls = (unsigned long) src; + uld = (unsigned long) dest; + +#ifdef INTEL_SSE2 + if (gf_cpu_supports_intel_sse2) { + __m128i ms, md; + int abytes; + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + if (uls % 16 == uld % 16) { + gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16); + while (s8 != rd.s_start) { + *d8 ^= *s8; + d8++; + s8++; + } + while (s8 < (uint8_t *) rd.s_top) { + ms = _mm_load_si128 ((__m128i *)(s8)); + md = _mm_load_si128 ((__m128i *)(d8)); + md = _mm_xor_si128(md, ms); + _mm_store_si128((__m128i *)(d8), md); + s8 += 16; + d8 += 16; + } + while (s8 != (uint8_t *) src + bytes) { + *d8 ^= *s8; + d8++; + s8++; + } + return; + } + + abytes = (bytes & 0xfffffff0); + + while (d8 < (uint8_t *) dest + abytes) { + ms = _mm_loadu_si128 ((__m128i *)(s8)); + md = _mm_loadu_si128 ((__m128i *)(d8)); + md = _mm_xor_si128(md, ms); + _mm_storeu_si128((__m128i *)(d8), md); + s8 += 16; + d8 += 16; + } + while (d8 != (uint8_t *) dest+bytes) { + *d8 ^= *s8; + d8++; + s8++; + } + return; + } +#endif +#if defined(ARM_NEON) + if (gf_cpu_supports_arm_neon) { + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + + if (uls % 16 == uld % 16) { + gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16); + while (s8 != rd.s_start) { + *d8 ^= *s8; + s8++; + d8++; + } + while (s8 < (uint8_t *) rd.s_top) { + uint8x16_t vs = vld1q_u8 (s8); + uint8x16_t vd = vld1q_u8 (d8); + uint8x16_t vr = veorq_u8 (vs, vd); + vst1q_u8 (d8, vr); + s8 += 16; + d8 += 16; + } + } else { + while (s8 + 15 < (uint8_t *) src + bytes) { + uint8x16_t vs = vld1q_u8 (s8); + uint8x16_t vd = vld1q_u8 (d8); + uint8x16_t vr = veorq_u8 (vs, vd); + vst1q_u8 (d8, vr); + s8 += 16; + d8 += 16; + } + } + while (s8 < (uint8_t *) src + bytes) { + *d8 ^= *s8; + s8++; + d8++; + } + return; + } +#endif + if (uls % 8 != uld % 8) { + gf_unaligned_xor(src, dest, bytes); + return; + } + + gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 8); + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + while (d8 != rd.d_start) { + *d8 ^= *s8; + d8++; + s8++; + } + dtop64 = (uint64_t *) rd.d_top; + + d64 = (uint64_t *) rd.d_start; + s64 = (uint64_t *) rd.s_start; + + while (d64 < dtop64) { + *d64 ^= *s64; + d64++; + s64++; + } + + s8 = (uint8_t *) rd.s_top; + d8 = (uint8_t *) rd.d_top; + + while (d8 != (uint8_t *) dest+bytes) { + *d8 ^= *s8; + d8++; + s8++; + } + return; +} + +#define UNALIGNED_BUFSIZE (8) + +static void gf_unaligned_xor(void *src, void *dest, int bytes) +{ + uint64_t scopy[UNALIGNED_BUFSIZE], *d64; + int i; + gf_region_data rd; + uint8_t *s8, *d8; + + /* JSP - call gf_set_region_data(), but use dest in both places. This is + because I only want to set up dest. If I used src, gf_set_region_data() + would fail because src and dest are not aligned to each other wrt + 8-byte pointers. I know this will actually align d_start to 16 bytes. + If I change gf_set_region_data() to split alignment & chunksize, then + I could do this correctly. 
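+     (Put differently: in the main loop, memcpy() is the only code that
+     touches the unaligned source; the XOR itself runs on aligned
+     8-byte words out of the local scopy[] buffer, and the stray head
+     and tail bytes are handled one at a time.)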
*/ + + gf_set_region_data(&rd, NULL, dest, dest, bytes, 1, 1, 8*UNALIGNED_BUFSIZE); + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + + while (d8 < (uint8_t *) rd.d_start) { + *d8 ^= *s8; + d8++; + s8++; + } + + d64 = (uint64_t *) d8; + while (d64 < (uint64_t *) rd.d_top) { + memcpy(scopy, s8, 8*UNALIGNED_BUFSIZE); + s8 += 8*UNALIGNED_BUFSIZE; + for (i = 0; i < UNALIGNED_BUFSIZE; i++) { + *d64 ^= scopy[i]; + d64++; + } + } + + d8 = (uint8_t *) d64; + while (d8 < (uint8_t *) ((uint8_t *)dest+bytes)) { + *d8 ^= *s8; + d8++; + s8++; + } +} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_cpu.c b/src/erasure-code/jerasure/gf-complete/src/gf_cpu.c new file mode 100644 index 000000000..f65131f58 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_cpu.c @@ -0,0 +1,180 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_cpu.c + * + * Identifies whether the CPU supports SIMD instructions at runtime. + */ + +#include <stdio.h> +#include <stdlib.h> + +int gf_cpu_identified = 0; + +int gf_cpu_supports_intel_pclmul = 0; +int gf_cpu_supports_intel_sse4 = 0; +int gf_cpu_supports_intel_ssse3 = 0; +int gf_cpu_supports_intel_sse3 = 0; +int gf_cpu_supports_intel_sse2 = 0; +int gf_cpu_supports_arm_neon = 0; + +#if defined(__x86_64__) + +/* CPUID Feature Bits */ + +/* ECX */ +#define GF_CPU_SSE3 (1 << 0) +#define GF_CPU_PCLMUL (1 << 1) +#define GF_CPU_SSSE3 (1 << 9) +#define GF_CPU_SSE41 (1 << 19) +#define GF_CPU_SSE42 (1 << 20) + +/* EDX */ +#define GF_CPU_SSE2 (1 << 26) + +#if defined(_MSC_VER) + +#define cpuid(info, x) __cpuidex(info, x, 0) + +#elif defined(__GNUC__) + +#include <cpuid.h> +void cpuid(int info[4], int InfoType){ + __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]); +} + +#else + +#error please add a way to detect CPU SIMD support at runtime + +#endif + +void gf_cpu_identify(void) +{ + if (gf_cpu_identified) { + return; + } + + int reg[4]; + + cpuid(reg, 1); + +#if defined(INTEL_SSE4_PCLMUL) + if ((reg[2] & GF_CPU_PCLMUL) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE4_PCLMUL")) { + gf_cpu_supports_intel_pclmul = 1; +#ifdef DEBUG_CPU_DETECTION + printf("#gf_cpu_supports_intel_pclmul\n"); +#endif + } +#endif + +#if defined(INTEL_SSE4) + if (((reg[2] & GF_CPU_SSE42) != 0 || (reg[2] & GF_CPU_SSE41) != 0) && !getenv("GF_COMPLETE_DISABLE_SSE4")) { + gf_cpu_supports_intel_sse4 = 1; +#ifdef DEBUG_CPU_DETECTION + printf("#gf_cpu_supports_intel_sse4\n"); +#endif + } +#endif + +#if defined(INTEL_SSSE3) + if ((reg[2] & GF_CPU_SSSE3) != 0 && !getenv("GF_COMPLETE_DISABLE_SSSE3")) { + gf_cpu_supports_intel_ssse3 = 1; +#ifdef DEBUG_CPU_DETECTION + printf("#gf_cpu_supports_intel_ssse3\n"); +#endif + } +#endif + +#if defined(INTEL_SSE3) + if ((reg[2] & GF_CPU_SSE3) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE3")) { + gf_cpu_supports_intel_sse3 = 1; +#ifdef DEBUG_CPU_DETECTION + printf("#gf_cpu_supports_intel_sse3\n"); +#endif + } +#endif + +#if defined(INTEL_SSE2) + if ((reg[3] & GF_CPU_SSE2) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE2")) { + gf_cpu_supports_intel_sse2 = 1; +#ifdef DEBUG_CPU_DETECTION + printf("#gf_cpu_supports_intel_sse2\n"); +#endif + } +#endif + + gf_cpu_identified = 1; +} + +#elif defined(__arm__) || defined(__aarch64__) + +#ifdef __linux__ + +#include <stdio.h> +#include <unistd.h> +#include <elf.h> +#include <fcntl.h> +#include <linux/auxvec.h> +#include <asm/hwcap.h> + +unsigned long get_hwcap(unsigned long type) { + unsigned long hwcap = 0; + int fd = open("/proc/self/auxv", O_RDONLY); + 
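+  /* Note that open() returns -1 on error, so the fd > 0 test below
+     treats an error (and the unlikely case of descriptor 0) as "no
+     auxv available", leaving hwcap at 0. */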
if (fd > 0) { + Elf32_auxv_t auxv; + while (read(fd, &auxv, sizeof(Elf32_auxv_t))) { + if (auxv.a_type == type) { + hwcap = auxv.a_un.a_val; + break; + } + } + close(fd); + } + + return hwcap; +} + +#endif // linux + +void gf_cpu_identify(void) +{ + if (gf_cpu_identified) { + return; + } + +#if defined(ARM_NEON) + if (!getenv("GF_COMPLETE_DISABLE_NEON")) { +#if __linux__ && __arm__ + gf_cpu_supports_arm_neon = (get_hwcap(AT_HWCAP) & HWCAP_NEON) > 0; +#elif __aarch64__ + // ASIMD is supported on all aarch64 architectures + gf_cpu_supports_arm_neon = 1; +#else + // we assume that NEON is supported if the compiler supports + // NEON and we don't have a reliable way to detect runtime support. + gf_cpu_supports_arm_neon = 1; +#endif + +#ifdef DEBUG_CPU_DETECTION + if (gf_cpu_supports_arm_neon) { + printf("#gf_cpu_supports_arm_neon\n"); + } +#endif + } +#endif // defined(ARM_NEON) + + gf_cpu_identified = 1; +} + +#else // defined(__arm__) || defined(__aarch64__) + +int gf_cpu_identify(void) +{ + gf_cpu_identified = 1; + return 0; +} + +#endif diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_general.c b/src/erasure-code/jerasure/gf-complete/src/gf_general.c new file mode 100644 index 000000000..769f7a082 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_general.c @@ -0,0 +1,539 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_general.c + * + * This file has helper routines for doing basic GF operations with any + * legal value of w. The problem is that w <= 32, w=64 and w=128 all have + * different data types, which is a pain. The procedures in this file try + * to alleviate that pain. They are used in gf_unit and gf_time. 
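+ *
+ * A minimal sketch of the intended pattern (w stands for any legal
+ * word size; every call is defined in this file or elsewhere in the
+ * library):
+ *
+ *   gf_t gf;
+ *   gf_general_t a, b, c;
+ *   MOA_Seed(0);
+ *   gf_init_easy(&gf, w);
+ *   gf_general_set_random(&a, w, 1);
+ *   gf_general_set_random(&b, w, 1);
+ *   gf_general_multiply(&gf, &a, &b, &c);  -- dispatches on w32/w64/w128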
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gf_complete.h" +#include "gf_int.h" +#include "gf_method.h" +#include "gf_rand.h" +#include "gf_general.h" + +void gf_general_set_zero(gf_general_t *v, int w) +{ + if (w <= 32) { + v->w32 = 0; + } else if (w <= 64) { + v->w64 = 0; + } else { + v->w128[0] = 0; + v->w128[1] = 0; + } +} + +void gf_general_set_one(gf_general_t *v, int w) +{ + if (w <= 32) { + v->w32 = 1; + } else if (w <= 64) { + v->w64 = 1; + } else { + v->w128[0] = 0; + v->w128[1] = 1; + } +} + +void gf_general_set_two(gf_general_t *v, int w) +{ + if (w <= 32) { + v->w32 = 2; + } else if (w <= 64) { + v->w64 = 2; + } else { + v->w128[0] = 0; + v->w128[1] = 2; + } +} + +int gf_general_is_zero(gf_general_t *v, int w) +{ + if (w <= 32) { + return (v->w32 == 0); + } else if (w <= 64) { + return (v->w64 == 0); + } else { + return (v->w128[0] == 0 && v->w128[1] == 0); + } +} + +int gf_general_is_one(gf_general_t *v, int w) +{ + if (w <= 32) { + return (v->w32 == 1); + } else if (w <= 64) { + return (v->w64 == 1); + } else { + return (v->w128[0] == 0 && v->w128[1] == 1); + } +} + +void gf_general_set_random(gf_general_t *v, int w, int zero_ok) +{ + if (w <= 32) { + v->w32 = MOA_Random_W(w, zero_ok); + } else if (w <= 64) { + while (1) { + v->w64 = MOA_Random_64(); + if (v->w64 != 0 || zero_ok) return; + } + } else { + while (1) { + MOA_Random_128(v->w128); + if (v->w128[0] != 0 || v->w128[1] != 0 || zero_ok) return; + } + } +} + +void gf_general_val_to_s(gf_general_t *v, int w, char *s, int hex) +{ + if (w <= 32) { + if (hex) { + sprintf(s, "%x", v->w32); + } else { + sprintf(s, "%u", v->w32); + } + } else if (w <= 64) { + if (hex) { + sprintf(s, "%llx", (long long unsigned int) v->w64); + } else { + sprintf(s, "%lld", (long long unsigned int) v->w64); + } + } else { + if (v->w128[0] == 0) { + sprintf(s, "%llx", (long long unsigned int) v->w128[1]); + } else { + sprintf(s, "%llx%016llx", (long long unsigned int) v->w128[0], + (long long unsigned int) v->w128[1]); + } + } +} + +int gf_general_s_to_val(gf_general_t *v, int w, char *s, int hex) +{ + int l; + int save; + + if (w <= 32) { + if (hex) { + if (sscanf(s, "%x", &(v->w32)) == 0) return 0; + } else { + if (sscanf(s, "%u", &(v->w32)) == 0) return 0; + } + if (w == 32) return 1; + if (w == 31) { + if (v->w32 & ((gf_val_32_t)1 << 31)) return 0; + return 1; + } + if (v->w32 & ~((1 << w)-1)) return 0; + return 1; + } else if (w <= 64) { + if (hex) return (sscanf(s, "%llx", (long long unsigned int *) (&(v->w64))) == 1); + return (sscanf(s, "%lld", (long long int *) (&(v->w64))) == 1); + } else { + if (!hex) return 0; + l = strlen(s); + if (l <= 16) { + v->w128[0] = 0; + return (sscanf(s, "%llx", (long long unsigned int *) (&(v->w128[1]))) == 1); + } else { + if (l > 32) return 0; + save = s[l-16]; + s[l-16] = '\0'; + if (sscanf(s, "%llx", (long long unsigned int *) (&(v->w128[0]))) == 0) { + s[l-16] = save; + return 0; + } + return (sscanf(s+(l-16), "%llx", (long long unsigned int *) (&(v->w128[1]))) == 1); + } + } +} + +void gf_general_add(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c) +{ + gf_internal_t *h; + int w; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + + if (w <= 32) { + c->w32 = a->w32 ^ b->w32; + } else if (w <= 64) { + c->w64 = a->w64 ^ b->w64; + } else { + c->w128[0] = a->w128[0] ^ b->w128[0]; + c->w128[1] = a->w128[1] ^ b->w128[1]; + } +} + +void gf_general_multiply(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c) +{ + gf_internal_t *h; + int w; 
+ + h = (gf_internal_t *) gf->scratch; + w = h->w; + + if (w <= 32) { + c->w32 = gf->multiply.w32(gf, a->w32, b->w32); + } else if (w <= 64) { + c->w64 = gf->multiply.w64(gf, a->w64, b->w64); + } else { + gf->multiply.w128(gf, a->w128, b->w128, c->w128); + } +} + +void gf_general_divide(gf_t *gf, gf_general_t *a, gf_general_t *b, gf_general_t *c) +{ + gf_internal_t *h; + int w; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + + if (w <= 32) { + c->w32 = gf->divide.w32(gf, a->w32, b->w32); + } else if (w <= 64) { + c->w64 = gf->divide.w64(gf, a->w64, b->w64); + } else { + gf->divide.w128(gf, a->w128, b->w128, c->w128); + } +} + +void gf_general_inverse(gf_t *gf, gf_general_t *a, gf_general_t *b) +{ + gf_internal_t *h; + int w; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + + if (w <= 32) { + b->w32 = gf->inverse.w32(gf, a->w32); + } else if (w <= 64) { + b->w64 = gf->inverse.w64(gf, a->w64); + } else { + gf->inverse.w128(gf, a->w128, b->w128); + } +} + +int gf_general_are_equal(gf_general_t *v1, gf_general_t *v2, int w) +{ + if (w <= 32) { + return (v1->w32 == v2->w32); + } else if (w <= 64) { + return (v1->w64 == v2->w64); + } else { + return (v1->w128[0] == v2->w128[0] && + v1->w128[1] == v2->w128[1]); + } +} + +void gf_general_do_region_multiply(gf_t *gf, gf_general_t *a, void *ra, void *rb, int bytes, int xor) +{ + gf_internal_t *h; + int w; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + + if (w <= 32) { + gf->multiply_region.w32(gf, ra, rb, a->w32, bytes, xor); + } else if (w <= 64) { + gf->multiply_region.w64(gf, ra, rb, a->w64, bytes, xor); + } else { + gf->multiply_region.w128(gf, ra, rb, a->w128, bytes, xor); + } +} + +void gf_general_do_region_check(gf_t *gf, gf_general_t *a, void *orig_a, void *orig_target, void *final_target, int bytes, int xor) +{ + gf_internal_t *h; + int w, words, i; + gf_general_t oa, ot, ft, sb; + char sa[50], soa[50], sot[50], sft[50], ssb[50]; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + + words = (bytes * 8) / w; + for (i = 0; i < words; i++) { + if (w <= 32) { + oa.w32 = gf->extract_word.w32(gf, orig_a, bytes, i); + ot.w32 = gf->extract_word.w32(gf, orig_target, bytes, i); + ft.w32 = gf->extract_word.w32(gf, final_target, bytes, i); + sb.w32 = gf->multiply.w32(gf, a->w32, oa.w32); + if (xor) sb.w32 ^= ot.w32; + } else if (w <= 64) { + oa.w64 = gf->extract_word.w64(gf, orig_a, bytes, i); + ot.w64 = gf->extract_word.w64(gf, orig_target, bytes, i); + ft.w64 = gf->extract_word.w64(gf, final_target, bytes, i); + sb.w64 = gf->multiply.w64(gf, a->w64, oa.w64); + if (xor) sb.w64 ^= ot.w64; + } else { + gf->extract_word.w128(gf, orig_a, bytes, i, oa.w128); + gf->extract_word.w128(gf, orig_target, bytes, i, ot.w128); + gf->extract_word.w128(gf, final_target, bytes, i, ft.w128); + gf->multiply.w128(gf, a->w128, oa.w128, sb.w128); + if (xor) { + sb.w128[0] ^= ot.w128[0]; + sb.w128[1] ^= ot.w128[1]; + } + } + + if (!gf_general_are_equal(&ft, &sb, w)) { + + fprintf(stderr,"Problem with region multiply (all values in hex):\n"); + fprintf(stderr," Target address base: 0x%lx. Word 0x%x of 0x%x. 
Xor: %d\n", + (unsigned long) final_target, i, words, xor); + gf_general_val_to_s(a, w, sa, 1); + gf_general_val_to_s(&oa, w, soa, 1); + gf_general_val_to_s(&ot, w, sot, 1); + gf_general_val_to_s(&ft, w, sft, 1); + gf_general_val_to_s(&sb, w, ssb, 1); + fprintf(stderr," Value: %s\n", sa); + fprintf(stderr," Original source word: %s\n", soa); + if (xor) fprintf(stderr," XOR with target word: %s\n", sot); + fprintf(stderr," Product word: %s\n", sft); + fprintf(stderr," It should be: %s\n", ssb); + assert(0); + } + } +} + +void gf_general_set_up_single_timing_test(int w, void *ra, void *rb, int size) +{ + void *top; + gf_general_t g; + uint8_t *r8, *r8a; + uint16_t *r16; + uint32_t *r32; + uint64_t *r64; + int i; + + top = (uint8_t *)rb+size; + + /* If w is 8, 16, 32, 64 or 128, fill the regions with random bytes. + However, don't allow for zeros in rb, because that will screw up + division. + + When w is 4, you fill the regions with random 4-bit words in each byte. + + Otherwise, treat every four bytes as an uint32_t + and fill it with a random value mod (1 << w). + */ + + if (w == 8 || w == 16 || w == 32 || w == 64 || w == 128) { + MOA_Fill_Random_Region (ra, size); + while (rb < top) { + gf_general_set_random(&g, w, 0); + switch (w) { + case 8: + r8 = (uint8_t *) rb; + *r8 = g.w32; + break; + case 16: + r16 = (uint16_t *) rb; + *r16 = g.w32; + break; + case 32: + r32 = (uint32_t *) rb; + *r32 = g.w32; + break; + case 64: + r64 = (uint64_t *) rb; + *r64 = g.w64; + break; + case 128: + r64 = (uint64_t *) rb; + r64[0] = g.w128[0]; + r64[1] = g.w128[1]; + break; + } + rb = (uint8_t *)rb + (w/8); + } + } else if (w == 4) { + r8a = (uint8_t *) ra; + r8 = (uint8_t *) rb; + while (r8 < (uint8_t *) top) { + gf_general_set_random(&g, w, 1); + *r8a = g.w32; + gf_general_set_random(&g, w, 0); + *r8 = g.w32; + r8a++; + r8++; + } + } else { + r32 = (uint32_t *) ra; + for (i = 0; i < size/4; i++) r32[i] = MOA_Random_W(w, 1); + r32 = (uint32_t *) rb; + for (i = 0; i < size/4; i++) r32[i] = MOA_Random_W(w, 0); + } +} + +/* This sucks, but in order to time, you really need to avoid putting ifs in + the inner loops. So, I'm doing a separate timing test for each w: + (4 & 8), 16, 32, 64, 128 and everything else. Fortunately, the "everything else" + tests can be equivalent to w=32. + + I'm also putting the results back into ra, because otherwise, the optimizer might + figure out that we're not really doing anything in the inner loops and it + will chuck that. 
*/ + +int gf_general_do_single_timing_test(gf_t *gf, void *ra, void *rb, int size, char test) +{ + gf_internal_t *h; + void *top; + uint8_t *r8a, *r8b, *top8; + uint16_t *r16a, *r16b, *top16; + uint32_t *r32a, *r32b, *top32; + uint64_t *r64a, *r64b, *top64, *r64c; + int w, rv; + + h = (gf_internal_t *) gf->scratch; + w = h->w; + top = (uint8_t *)ra + size; + + if (w == 8 || w == 4) { + r8a = (uint8_t *) ra; + r8b = (uint8_t *) rb; + top8 = (uint8_t *) top; + if (test == 'M') { + while (r8a < top8) { + *r8a = gf->multiply.w32(gf, *r8a, *r8b); + r8a++; + r8b++; + } + } else if (test == 'D') { + while (r8a < top8) { + *r8a = gf->divide.w32(gf, *r8a, *r8b); + r8a++; + r8b++; + } + } else if (test == 'I') { + while (r8a < top8) { + *r8a = gf->inverse.w32(gf, *r8a); + r8a++; + } + } + return (top8 - (uint8_t *) ra); + } + + if (w == 16) { + r16a = (uint16_t *) ra; + r16b = (uint16_t *) rb; + top16 = (uint16_t *) top; + if (test == 'M') { + while (r16a < top16) { + *r16a = gf->multiply.w32(gf, *r16a, *r16b); + r16a++; + r16b++; + } + } else if (test == 'D') { + while (r16a < top16) { + *r16a = gf->divide.w32(gf, *r16a, *r16b); + r16a++; + r16b++; + } + } else if (test == 'I') { + while (r16a < top16) { + *r16a = gf->inverse.w32(gf, *r16a); + r16a++; + } + } + return (top16 - (uint16_t *) ra); + } + if (w <= 32) { + r32a = (uint32_t *) ra; + r32b = (uint32_t *) rb; + top32 = (uint32_t *) ra + (size/4); /* This is for the "everything elses" */ + + if (test == 'M') { + while (r32a < top32) { + *r32a = gf->multiply.w32(gf, *r32a, *r32b); + r32a++; + r32b++; + } + } else if (test == 'D') { + while (r32a < top32) { + *r32a = gf->divide.w32(gf, *r32a, *r32b); + r32a++; + r32b++; + } + } else if (test == 'I') { + while (r32a < top32) { + *r32a = gf->inverse.w32(gf, *r32a); + r32a++; + } + } + return (top32 - (uint32_t *) ra); + } + if (w == 64) { + r64a = (uint64_t *) ra; + r64b = (uint64_t *) rb; + top64 = (uint64_t *) top; + if (test == 'M') { + while (r64a < top64) { + *r64a = gf->multiply.w64(gf, *r64a, *r64b); + r64a++; + r64b++; + } + } else if (test == 'D') { + while (r64a < top64) { + *r64a = gf->divide.w64(gf, *r64a, *r64b); + r64a++; + r64b++; + } + } else if (test == 'I') { + while (r64a < top64) { + *r64a = gf->inverse.w64(gf, *r64a); + r64a++; + } + } + return (top64 - (uint64_t *) ra); + } + if (w == 128) { + r64a = (uint64_t *) ra; + r64c = r64a; + r64a += 2; + r64b = (uint64_t *) rb; + top64 = (uint64_t *) top; + rv = (top64 - r64a)/2; + if (test == 'M') { + while (r64a < top64) { + gf->multiply.w128(gf, r64a, r64b, r64c); + r64a += 2; + r64b += 2; + } + } else if (test == 'D') { + while (r64a < top64) { + gf->divide.w128(gf, r64a, r64b, r64c); + r64a += 2; + r64b += 2; + } + } else if (test == 'I') { + while (r64a < top64) { + gf->inverse.w128(gf, r64a, r64c); + r64a += 2; + } + } + return rv; + } + return 0; +} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_method.c b/src/erasure-code/jerasure/gf-complete/src/gf_method.c new file mode 100644 index 000000000..2210305d8 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_method.c @@ -0,0 +1,193 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_method.c + * + * Parses argv to figure out the mult_type and arguments. Returns the gf. 
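+ *
+ * For example, a hypothetical invocation (the token list must be
+ * terminated by "-", per the parser below):
+ *
+ *   char *argv[] = { "-m", "SPLIT", "16", "4", "-r", "ALTMAP", "-" };
+ *   gf_t gf;
+ *   if (create_gf_from_argv(&gf, 16, 7, argv, 0) == 0) {
+ *     fprintf(stderr, "bad method: gf_errno = %d\n", _gf_errno);
+ *   }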
+ */ + +#include +#include +#include +#include +#include + +#include "gf_complete.h" +#include "gf_int.h" +#include "gf_method.h" + +int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting) +{ + int mult_type, divide_type, region_type; + int arg1, arg2; + uint64_t prim_poly; + gf_t *base; + + mult_type = GF_MULT_DEFAULT; + region_type = GF_REGION_DEFAULT; + divide_type = GF_DIVIDE_DEFAULT; + prim_poly = 0; + base = NULL; + arg1 = 0; + arg2 = 0; + while (1) { + if (argc > starting) { + if (strcmp(argv[starting], "-m") == 0) { + starting++; + if (mult_type != GF_MULT_DEFAULT) { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_TWOMULT; + return 0; + } + if (strcmp(argv[starting], "SHIFT") == 0) { + mult_type = GF_MULT_SHIFT; + starting++; + } else if (strcmp(argv[starting], "CARRY_FREE") == 0) { + mult_type = GF_MULT_CARRY_FREE; + starting++; + } else if (strcmp(argv[starting], "CARRY_FREE_GK") == 0) { + mult_type = GF_MULT_CARRY_FREE_GK; + starting++; + } else if (strcmp(argv[starting], "GROUP") == 0) { + mult_type = GF_MULT_GROUP; + if (argc < starting + 3) { + _gf_errno = GF_E_GROUPAR; + return 0; + } + if (sscanf(argv[starting+1], "%d", &arg1) == 0 || + sscanf(argv[starting+2], "%d", &arg2) == 0) { + _gf_errno = GF_E_GROUPNU; + return 0; + } + starting += 3; + } else if (strcmp(argv[starting], "BYTWO_p") == 0) { + mult_type = GF_MULT_BYTWO_p; + starting++; + } else if (strcmp(argv[starting], "BYTWO_b") == 0) { + mult_type = GF_MULT_BYTWO_b; + starting++; + } else if (strcmp(argv[starting], "TABLE") == 0) { + mult_type = GF_MULT_TABLE; + starting++; + } else if (strcmp(argv[starting], "LOG") == 0) { + mult_type = GF_MULT_LOG_TABLE; + starting++; + } else if (strcmp(argv[starting], "LOG_ZERO") == 0) { + mult_type = GF_MULT_LOG_ZERO; + starting++; + } else if (strcmp(argv[starting], "LOG_ZERO_EXT") == 0) { + mult_type = GF_MULT_LOG_ZERO_EXT; + starting++; + } else if (strcmp(argv[starting], "SPLIT") == 0) { + mult_type = GF_MULT_SPLIT_TABLE; + if (argc < starting + 3) { + _gf_errno = GF_E_SPLITAR; + return 0; + } + if (sscanf(argv[starting+1], "%d", &arg1) == 0 || + sscanf(argv[starting+2], "%d", &arg2) == 0) { + _gf_errno = GF_E_SPLITNU; + return 0; + } + starting += 3; + } else if (strcmp(argv[starting], "COMPOSITE") == 0) { + mult_type = GF_MULT_COMPOSITE; + if (argc < starting + 2) { _gf_errno = GF_E_FEWARGS; return 0; } + if (sscanf(argv[starting+1], "%d", &arg1) == 0) { + _gf_errno = GF_E_COMP_A2; + return 0; + } + starting += 2; + base = (gf_t *) malloc(sizeof(gf_t)); + starting = create_gf_from_argv(base, w/arg1, argc, argv, starting); + if (starting == 0) { + free(base); + return 0; + } + } else { + _gf_errno = GF_E_UNKNOWN; + return 0; + } + } else if (strcmp(argv[starting], "-r") == 0) { + starting++; + if (strcmp(argv[starting], "DOUBLE") == 0) { + region_type |= GF_REGION_DOUBLE_TABLE; + starting++; + } else if (strcmp(argv[starting], "QUAD") == 0) { + region_type |= GF_REGION_QUAD_TABLE; + starting++; + } else if (strcmp(argv[starting], "LAZY") == 0) { + region_type |= GF_REGION_LAZY; + starting++; + } else if (strcmp(argv[starting], "SIMD") == 0) { + region_type |= GF_REGION_SIMD; + starting++; + } else if (strcmp(argv[starting], "NOSIMD") == 0) { + region_type |= GF_REGION_NOSIMD; + starting++; + } else if (strcmp(argv[starting], "SSE") == 0) { + region_type |= GF_REGION_SIMD; + starting++; + } else if (strcmp(argv[starting], "NOSSE") == 0) { + region_type |= GF_REGION_NOSIMD; + starting++; + } else if (strcmp(argv[starting], "CAUCHY") == 0) { + 
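+          /* XOR-only region ops; see the align == -1 path in
+             gf_set_region_data() in gf.c. */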
region_type |= GF_REGION_CAUCHY; + starting++; + } else if (strcmp(argv[starting], "ALTMAP") == 0) { + region_type |= GF_REGION_ALTMAP; + starting++; + } else { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_UNK_REG; + return 0; + } + } else if (strcmp(argv[starting], "-p") == 0) { + starting++; + if (sscanf(argv[starting], "%llx", (long long unsigned int *)(&prim_poly)) == 0) { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_POLYSPC; + return 0; + } + starting++; + } else if (strcmp(argv[starting], "-d") == 0) { + starting++; + if (divide_type != GF_DIVIDE_DEFAULT) { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_TWO_DIV; + return 0; + } else if (strcmp(argv[starting], "EUCLID") == 0) { + divide_type = GF_DIVIDE_EUCLID; + starting++; + } else if (strcmp(argv[starting], "MATRIX") == 0) { + divide_type = GF_DIVIDE_MATRIX; + starting++; + } else { + _gf_errno = GF_E_UNK_DIV; + return 0; + } + } else if (strcmp(argv[starting], "-") == 0) { + /* + printf("Scratch size: %d\n", gf_scratch_size(w, + mult_type, region_type, divide_type, arg1, arg2)); + */ + if (gf_init_hard(gf, w, mult_type, region_type, divide_type, + prim_poly, arg1, arg2, base, NULL) == 0) { + if (base != NULL) gf_free(base, 1); + return 0; + } else + return starting + 1; + } else { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_UNKFLAG; + return 0; + } + } else { + if (base != NULL) gf_free(base, 1); + _gf_errno = GF_E_FEWARGS; + return 0; + } + } +} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_rand.c b/src/erasure-code/jerasure/gf-complete/src/gf_rand.c new file mode 100644 index 000000000..a9aa7ad36 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_rand.c @@ -0,0 +1,80 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_rand.c -- Random number generator. 
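+ *
+ * Typical use, as a sketch (both calls are defined below):
+ *
+ *   MOA_Seed(0);                        -- fix the sequence
+ *   uint32_t r = MOA_Random_W(w, 1);    -- random w-bit value, zero ok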
+ */ + +#include +#include +#include +#include "gf_rand.h" + +/* Lifted the "Mother of All" random number generator from http://www.agner.org/random/ */ + +static uint32_t MOA_X[5]; + +uint32_t MOA_Random_32() { + uint64_t sum; + sum = (uint64_t)2111111111UL * (uint64_t)MOA_X[3] + + (uint64_t)1492 * (uint64_t)(MOA_X[2]) + + (uint64_t)1776 * (uint64_t)(MOA_X[1]) + + (uint64_t)5115 * (uint64_t)(MOA_X[0]) + + (uint64_t)MOA_X[4]; + MOA_X[3] = MOA_X[2]; MOA_X[2] = MOA_X[1]; MOA_X[1] = MOA_X[0]; + MOA_X[4] = (uint32_t)(sum >> 32); + MOA_X[0] = (uint32_t)sum; + return MOA_X[0]; +} + +uint64_t MOA_Random_64() { + uint64_t sum; + + sum = MOA_Random_32(); + sum <<= 32; + sum |= MOA_Random_32(); + return sum; +} + +void MOA_Random_128(uint64_t *x) { + x[0] = MOA_Random_64(); + x[1] = MOA_Random_64(); + return; +} + +uint32_t MOA_Random_W(int w, int zero_ok) +{ + uint32_t b; + + do { + b = MOA_Random_32(); + if (w == 31) b &= 0x7fffffff; + if (w < 31) b %= (1 << w); + } while (!zero_ok && b == 0); + return b; +} + +void MOA_Seed(uint32_t seed) { + int i; + uint32_t s = seed; + for (i = 0; i < 5; i++) { + s = s * 29943829 - 1; + MOA_X[i] = s; + } + for (i=0; i<19; i++) MOA_Random_32(); +} + + +void MOA_Fill_Random_Region (void *reg, int size) +{ + uint32_t *r32; + uint8_t *r8; + int i; + + r32 = (uint32_t *) reg; + r8 = (uint8_t *) reg; + for (i = 0; i < size/4; i++) r32[i] = MOA_Random_32(); + for (i *= 4; i < size; i++) r8[i] = MOA_Random_W(8, 1); +} + diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w128.c b/src/erasure-code/jerasure/gf-complete/src/gf_w128.c new file mode 100644 index 000000000..3bc2d651a --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_w128.c @@ -0,0 +1,1776 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_w128.c + * + * Routines for 128-bit Galois fields + */ + +#include "gf_int.h" +#include +#include +#include "gf_cpu.h" + +#define GF_FIELD_WIDTH (128) + +#define two_x(a) {\ + a[0] <<= 1; \ + if (a[1] & 1ULL << 63) a[0] ^= 1; \ + a[1] <<= 1; } + +#define a_get_b(a, i, b, j) {\ + a[i] = b[j]; \ + a[i + 1] = b[j + 1];} + +#define set_zero(a, i) {\ + a[i] = 0; \ + a[i + 1] = 0;} + +struct gf_w128_split_4_128_data { + uint64_t last_value[2]; + uint64_t tables[2][32][16]; +}; + +struct gf_w128_split_8_128_data { + uint64_t last_value[2]; + uint64_t tables[2][16][256]; +}; + +typedef struct gf_group_tables_s { + gf_val_128_t m_table; + gf_val_128_t r_table; +} gf_group_tables_t; + +#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? " " : " ", blah[15-ii]); printf("\n"); } + +static +void +gf_w128_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, +int xor) +{ + uint32_t i; + gf_val_128_t s128; + gf_val_128_t d128; + uint64_t c128[2]; + gf_region_data rd; + + /* We only do this to check on alignment. 
*/ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + set_zero(c128, 0); + + s128 = (gf_val_128_t) src; + d128 = (gf_val_128_t) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) { + gf->multiply.w128(gf, &s128[i], val, c128); + d128[i] ^= c128[0]; + d128[i+1] ^= c128[1]; + } + } else { + for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) { + gf->multiply.w128(gf, &s128[i], val, &d128[i]); + } + } +} + +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, +int xor) +{ + uint32_t i; + gf_val_128_t s128; + gf_val_128_t d128; + gf_region_data rd; + __m128i a,b; + __m128i result0,result1; + __m128i prim_poly; + __m128i c,d,e,f; + gf_internal_t * h = gf->scratch; + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly); + /* We only do this to check on alignment. */ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + s128 = (gf_val_128_t) src; + d128 = (gf_val_128_t) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) { + a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0); + b = _mm_insert_epi64 (a, val[1], 0); + a = _mm_insert_epi64 (a, s128[i], 1); + b = _mm_insert_epi64 (b, val[0], 1); + + c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/ + f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/ + e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/ + d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/ + + /* now reusing a and b as temporary variables*/ + result0 = _mm_setzero_si128(); + result1 = result0; + + result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0)); + a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1)); + result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a)); + + a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0)); + result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a)); + result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1)); + /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce. 
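+      The fold below happens in two steps: each _mm_clmulepi64_si128
+      against prim_poly multiplies one 64-bit chunk of the overflow by
+      the reduction polynomial (minus its implicit x^128 term), and two
+      steps suffice because that polynomial fits in a single 64-bit
+      lane.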
*/ + + a = _mm_srli_si128 (result0, 8); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8)); + result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8)); + + a = _mm_insert_epi64 (result0, 0, 1); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result1 = _mm_xor_si128 (result1, b); + d128[i] ^= (uint64_t)_mm_extract_epi64(result1,1); + d128[i+1] ^= (uint64_t)_mm_extract_epi64(result1,0); + } + } else { + for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) { + a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0); + b = _mm_insert_epi64 (a, val[1], 0); + a = _mm_insert_epi64 (a, s128[i], 1); + b = _mm_insert_epi64 (b, val[0], 1); + + c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/ + f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/ + e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/ + d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/ + + /* now reusing a and b as temporary variables*/ + result0 = _mm_setzero_si128(); + result1 = result0; + + result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0)); + a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1)); + result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a)); + + a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0)); + result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a)); + result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1)); + /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/ + + a = _mm_srli_si128 (result0, 8); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8)); + result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8)); + + a = _mm_insert_epi64 (result0, 0, 1); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result1 = _mm_xor_si128 (result1, b); + d128[i] = (uint64_t)_mm_extract_epi64(result1,1); + d128[i+1] = (uint64_t)_mm_extract_epi64(result1,0); + } + } +} +#endif + +/* + * Some w128 notes: + * --Big Endian + * --return values allocated beforehand + */ + +#define GF_W128_IS_ZERO(val) (val[0] == 0 && val[1] == 0) + +void +gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ + /* ordered highest bit to lowest l[0] l[1] r[0] r[1] */ + uint64_t pl[2], pr[2], ppl[2], ppr[2], i, a[2], bl[2], br[2], one, lbit; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + if (GF_W128_IS_ZERO(a128) || GF_W128_IS_ZERO(b128)) { + set_zero(c128, 0); + return; + } + + a_get_b(a, 0, a128, 0); + a_get_b(br, 0, b128, 0); + set_zero(bl, 0); + + one = 1; + lbit = (one << 63); + + set_zero(pl, 0); + set_zero(pr, 0); + + /* Allen: a*b for right half of a */ + for (i = 0; i < GF_FIELD_WIDTH/2; i++) { + if (a[1] & (one << i)) { + pl[1] ^= bl[1]; + pr[0] ^= br[0]; + pr[1] ^= br[1]; + } + bl[1] <<= 1; + if (br[0] & lbit) bl[1] ^= 1; + br[0] <<= 1; + if (br[1] & lbit) br[0] ^= 1; + br[1] <<= 1; + } + + /* Allen: a*b for left half of a */ + for (i = 0; i < GF_FIELD_WIDTH/2; i++) { + if (a[0] & (one << i)) { + pl[0] ^= bl[0]; + pl[1] ^= bl[1]; + pr[0] ^= br[0]; + } + bl[0] <<= 1; + if (bl[1] & lbit) bl[0] ^= 1; + bl[1] <<= 1; + if (br[0] & lbit) bl[1] ^= 1; + br[0] <<= 1; + } + + /* Allen: do first half of reduction (based on left quarter of initial product) */ + one = lbit >> 1; + ppl[0] = one; /* Allen: introduce leading one of primitive polynomial */ + ppl[1] = h->prim_poly >> 2; + ppr[0] = h->prim_poly << (GF_FIELD_WIDTH/2-2); + ppr[1] = 0; + while 
(one != 0) { + if (pl[0] & one) { + pl[0] ^= ppl[0]; + pl[1] ^= ppl[1]; + pr[0] ^= ppr[0]; + pr[1] ^= ppr[1]; + } + one >>= 1; + ppr[1] >>= 1; + if (ppr[0] & 1) ppr[1] ^= lbit; + ppr[0] >>= 1; + if (ppl[1] & 1) ppr[0] ^= lbit; + ppl[1] >>= 1; + if (ppl[0] & 1) ppl[1] ^= lbit; + ppl[0] >>= 1; + } + + /* Allen: final half of reduction */ + one = lbit; + while (one != 0) { + if (pl[1] & one) { + pl[1] ^= ppl[1]; + pr[0] ^= ppr[0]; + pr[1] ^= ppr[1]; + } + one >>= 1; + ppr[1] >>= 1; + if (ppr[0] & 1) ppr[1] ^= lbit; + ppr[0] >>= 1; + if (ppl[1] & 1) ppr[0] ^= lbit; + ppl[1] >>= 1; + } + + /* Allen: if we really want to optimize this we can just be using c128 instead of pr all along */ + c128[0] = pr[0]; + c128[1] = pr[1]; + + return; +} + +#if defined(INTEL_SSE4_PCLMUL) + +void +gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ + __m128i a,b; + __m128i result0,result1; + __m128i prim_poly; + __m128i c,d,e,f; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi64 (_mm_setzero_si128(), a128[1], 0); + b = _mm_insert_epi64 (a, b128[1], 0); + a = _mm_insert_epi64 (a, a128[0], 1); + b = _mm_insert_epi64 (b, b128[0], 1); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly); + + /* we need to test algorithm 2 later*/ + c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/ + f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/ + e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/ + d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/ + + /* now reusing a and b as temporary variables*/ + result0 = _mm_setzero_si128(); + result1 = result0; + + result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0)); + a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1)); + result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a)); + + a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0)); + result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a)); + result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1)); + /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/ + + a = _mm_srli_si128 (result0, 8); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8)); + result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8)); + + a = _mm_insert_epi64 (result0, 0, 1); + b = _mm_clmulepi64_si128 (a, prim_poly, 0x00); + result1 = _mm_xor_si128 (result1, b); + + c128[0] = (uint64_t)_mm_extract_epi64(result1,1); + c128[1] = (uint64_t)_mm_extract_epi64(result1,0); +} +#endif + +void +gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ + uint64_t amask[2], pmask, pp, prod[2]; /*John: pmask is always the highest bit set, and the rest zeros. 
amask changes, it's a countdown.*/ + uint64_t topbit; /* this is used as a boolean value */ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + prod[0] = 0; + prod[1] = 0; + pmask = 0x8000000000000000ULL; + amask[0] = 0x8000000000000000ULL; + amask[1] = 0; + + while (amask[1] != 0 || amask[0] != 0) { + topbit = (prod[0] & pmask); + prod[0] <<= 1; + if (prod[1] & pmask) prod[0] ^= 1; + prod[1] <<= 1; + if (topbit) prod[1] ^= pp; + if ((a128[0] & amask[0]) || (a128[1] & amask[1])) { + prod[0] ^= b128[0]; + prod[1] ^= b128[1]; + } + amask[1] >>= 1; + if (amask[0] & 1) amask[1] ^= pmask; + amask[0] >>= 1; + } + c128[0] = prod [0]; + c128[1] = prod [1]; + return; +} + +#if defined(INTEL_SSE4) +void +gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ + int i; + __m128i a, b, pp, prod, amask, u_middle_one; + /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/ + uint32_t topbit, middlebit, pmask; /* this is used as a boolean value */ + gf_internal_t *h; + + + h = (gf_internal_t *) gf->scratch; + pp = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly); + prod = _mm_setzero_si128(); + a = _mm_insert_epi64(prod, a128[1], 0x0); + a = _mm_insert_epi64(a, a128[0], 0x1); + b = _mm_insert_epi64(prod, b128[1], 0x0); + b = _mm_insert_epi64(b, b128[0], 0x1); + pmask = 0x80000000; + amask = _mm_insert_epi32(prod, 0x80000000, 0x3); + u_middle_one = _mm_insert_epi32(prod, 1, 0x2); + + for (i = 0; i < 64; i++) { + topbit = (_mm_extract_epi32(prod, 0x3) & pmask); + middlebit = (_mm_extract_epi32(prod, 0x1) & pmask); + prod = _mm_slli_epi64(prod, 1); /* this instruction loses the middle bit */ + if (middlebit) { + prod = _mm_xor_si128(prod, u_middle_one); + } + if (topbit) { + prod = _mm_xor_si128(prod, pp); + } + if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 1))) { + prod = _mm_xor_si128(prod, b); + } + amask = _mm_srli_epi64(amask, 1); /*so does this one, but we can just replace after loop*/ + } + amask = _mm_insert_epi32(amask, (gf_val_32_t)1 << 31, 0x1); + for (i = 64; i < 128; i++) { + topbit = (_mm_extract_epi32(prod, 0x3) & pmask); + middlebit = (_mm_extract_epi32(prod, 0x1) & pmask); + prod = _mm_slli_epi64(prod, 1); + if (middlebit) prod = _mm_xor_si128(prod, u_middle_one); + if (topbit) prod = _mm_xor_si128(prod, pp); + if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 0))) { + prod = _mm_xor_si128(prod, b); + } + amask = _mm_srli_epi64(amask, 1); + } + c128[0] = (uint64_t)_mm_extract_epi64(prod, 1); + c128[1] = (uint64_t)_mm_extract_epi64(prod, 0); + return; +} +#endif + + +/* Ben: This slow function implements sse instructions for bytwo_b because why not */ +#if defined(INTEL_SSE4) +void +gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ + __m128i a, b, lmask, hmask, pp, c, middle_one; + gf_internal_t *h; + uint64_t topbit, middlebit; + + h = (gf_internal_t *) gf->scratch; + + c = _mm_setzero_si128(); + lmask = _mm_insert_epi64(c, 1ULL << 63, 0); + hmask = _mm_insert_epi64(c, 1ULL << 63, 1); + b = _mm_insert_epi64(c, a128[0], 1); + b = _mm_insert_epi64(b, a128[1], 0); + a = _mm_insert_epi64(c, b128[0], 1); + a = _mm_insert_epi64(a, b128[1], 0); + pp = _mm_insert_epi64(c, h->prim_poly, 0); + middle_one = _mm_insert_epi64(c, 1, 0x1); + + while (1) { + if (_mm_extract_epi32(a, 0x0) & 1) { + c = _mm_xor_si128(c, b); + } + middlebit = (_mm_extract_epi32(a, 0x2) & 1); + a = _mm_srli_epi64(a, 1); + if (middlebit) a = 
_mm_xor_si128(a, lmask); + if ((_mm_extract_epi64(a, 0x1) == 0ULL) && (_mm_extract_epi64(a, 0x0) == 0ULL)){ + c128[0] = _mm_extract_epi64(c, 0x1); + c128[1] = _mm_extract_epi64(c, 0x0); + return; + } + topbit = (_mm_extract_epi64(_mm_and_si128(b, hmask), 1)); + middlebit = (_mm_extract_epi64(_mm_and_si128(b, lmask), 0)); + b = _mm_slli_epi64(b, 1); + if (middlebit) b = _mm_xor_si128(b, middle_one); + if (topbit) b = _mm_xor_si128(b, pp); + } +} +#endif + +void +gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ + uint64_t bmask, pp; + gf_internal_t *h; + uint64_t a[2], b[2], c[2]; + + h = (gf_internal_t *) gf->scratch; + + bmask = (1ULL << 63); + set_zero(c, 0); + b[0] = a128[0]; + b[1] = a128[1]; + a[0] = b128[0]; + a[1] = b128[1]; + + while (1) { + if (a[1] & 1) { + c[0] ^= b[0]; + c[1] ^= b[1]; + } + a[1] >>= 1; + if (a[0] & 1) a[1] ^= bmask; + a[0] >>= 1; + if (a[1] == 0 && a[0] == 0) { + c128[0] = c[0]; + c128[1] = c[1]; + return; + } + pp = (b[0] & bmask); + b[0] <<= 1; + if (b[1] & bmask) b[0] ^= 1; + b[1] <<= 1; + if (pp) b[1] ^= h->prim_poly; + } +} + +static +void +gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ + int i, j, k; + uint64_t pp; + gf_internal_t *h; + uint64_t *s64, *d64, *top; + gf_region_data rd; + uint64_t v[2], s; + struct gf_w128_split_4_128_data *ld; + + /* We only do this to check on alignment. */ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + h = (gf_internal_t *) gf->scratch; + ld = (struct gf_w128_split_4_128_data *) h->private; + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) { + v[0] = val[0]; + v[1] = val[1]; + for (i = 0; i < 32; i++) { + ld->tables[0][i][0] = 0; + ld->tables[1][i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]); + ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]); + } + pp = (v[0] & (1ULL << 63)); + v[0] <<= 1; + if (v[1] & (1ULL << 63)) v[0] ^= 1; + v[1] <<= 1; + if (pp) v[1] ^= h->prim_poly; + } + } + } + ld->last_value[0] = val[0]; + ld->last_value[1] = val[1]; + +/* + for (i = 0; i < 32; i++) { + for (j = 0; j < 16; j++) { + printf("%2d %2d %016llx %016llx\n", i, j, ld->tables[0][i][j], ld->tables[1][i][j]); + } + printf("\n"); + } + */ + while (d64 < top) { + v[0] = (xor) ? d64[0] : 0; + v[1] = (xor) ? 
d64[1] : 0; + s = s64[1]; + i = 0; + while (s != 0) { + v[0] ^= ld->tables[0][i][s&0xf]; + v[1] ^= ld->tables[1][i][s&0xf]; + s >>= 4; + i++; + } + s = s64[0]; + i = 16; + while (s != 0) { + v[0] ^= ld->tables[0][i][s&0xf]; + v[1] ^= ld->tables[1][i][s&0xf]; + s >>= 4; + i++; + } + d64[0] = v[0]; + d64[1] = v[1]; + s64 += 2; + d64 += 2; + } +} + +#if defined(INTEL_SSSE3) && defined(INTEL_SSE4) +static +void +gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ + gf_internal_t *h; + int i, j, k; + uint64_t pp, v[2], s, *s64, *d64, *top; + __m128i p, tables[32][16]; + struct gf_w128_split_4_128_data *ld; + gf_region_data rd; + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + h = (gf_internal_t *) gf->scratch; + + /* We only do this to check on alignment. */ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 16); + + /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */ + + gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + ld = (struct gf_w128_split_4_128_data *) h->private; + + if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) { + v[0] = val[0]; + v[1] = val[1]; + for (i = 0; i < 32; i++) { + ld->tables[0][i][0] = 0; + ld->tables[1][i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]); + ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]); + } + pp = (v[0] & (1ULL << 63)); + v[0] <<= 1; + if (v[1] & (1ULL << 63)) v[0] ^= 1; + v[1] <<= 1; + if (pp) v[1] ^= h->prim_poly; + } + } + } + + ld->last_value[0] = val[0]; + ld->last_value[1] = val[1]; + + for (i = 0; i < 32; i++) { + for (j = 0; j < 16; j++) { + v[0] = ld->tables[0][i][j]; + v[1] = ld->tables[1][i][j]; + tables[i][j] = _mm_loadu_si128((__m128i *) v); + +/* + printf("%2d %2d: ", i, j); + MM_PRINT8("", tables[i][j]); */ + } + } + + while (d64 != top) { + + if (xor) { + p = _mm_load_si128 ((__m128i *) d64); + } else { + p = _mm_setzero_si128(); + } + s = *s64; + s64++; + for (i = 0; i < 16; i++) { + j = (s&0xf); + s >>= 4; + p = _mm_xor_si128(p, tables[16+i][j]); + } + s = *s64; + s64++; + for (i = 0; i < 16; i++) { + j = (s&0xf); + s >>= 4; + p = _mm_xor_si128(p, tables[i][j]); + } + _mm_store_si128((__m128i *) d64, p); + d64 += 2; + } + + /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */ + + gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor); +} +#endif + +#if defined(INTEL_SSSE3) && defined(INTEL_SSE4) +static +void +gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ + gf_internal_t *h; + int i, j, k; + uint64_t pp, v[2], *s64, *d64, *top; + __m128i si, tables[32][16], p[16], v0, mask1; + struct gf_w128_split_4_128_data *ld; + uint8_t btable[16]; + gf_region_data rd; + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + h = (gf_internal_t *) gf->scratch; + + /* We only do this to check on alignment. 
*/ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 256); + + /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */ + + gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + ld = (struct gf_w128_split_4_128_data *) h->private; + + if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) { + v[0] = val[0]; + v[1] = val[1]; + for (i = 0; i < 32; i++) { + ld->tables[0][i][0] = 0; + ld->tables[1][i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]); + ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]); + } + pp = (v[0] & (1ULL << 63)); + v[0] <<= 1; + if (v[1] & (1ULL << 63)) v[0] ^= 1; + v[1] <<= 1; + if (pp) v[1] ^= h->prim_poly; + } + } + } + + ld->last_value[0] = val[0]; + ld->last_value[1] = val[1]; + + for (i = 0; i < 32; i++) { + for (j = 0; j < 16; j++) { + for (k = 0; k < 16; k++) { + btable[k] = (uint8_t) ld->tables[1-(j/8)][i][k]; + ld->tables[1-(j/8)][i][k] >>= 8; + } + tables[i][j] = _mm_loadu_si128((__m128i *) btable); +/* + printf("%2d %2d: ", i, j); + MM_PRINT8("", tables[i][j]); + */ + } + } + + + mask1 = _mm_set1_epi8(0xf); + + while (d64 != top) { + + if (xor) { + for (i = 0; i < 16; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2)); + } else { + for (i = 0; i < 16; i++) p[i] = _mm_setzero_si128(); + } + i = 0; + for (k = 0; k < 16; k++) { + v0 = _mm_load_si128((__m128i *) s64); + s64 += 2; + + si = _mm_and_si128(v0, mask1); + + for (j = 0; j < 16; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + v0 = _mm_srli_epi32(v0, 4); + si = _mm_and_si128(v0, mask1); + for (j = 0; j < 16; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + } + for (i = 0; i < 16; i++) { + _mm_store_si128((__m128i *) d64, p[i]); + d64 += 2; + } + } + /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */ + + gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor); +} +#endif + +static +void +gf_w128_split_8_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ + int i, j, k; + uint64_t pp; + gf_internal_t *h; + uint64_t *s64, *d64, *top; + gf_region_data rd; + uint64_t v[2], s; + struct gf_w128_split_8_128_data *ld; + + /* Check on alignment. Ignore it otherwise. 
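     A sketch of the SPLIT 8,128 scheme below: sixteen 256-entry tables are
     kept per 64-bit half of the product, where table i maps byte i of the
     128-bit source word to val times that byte shifted up 8*i bits,
     pre-reduced, so each product is just the XOR of sixteen lookups:
     p = T[0][s & 0xff] ^ T[1][(s >> 8) & 0xff] ^ ... ^ T[15][top byte].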
*/ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + h = (gf_internal_t *) gf->scratch; + ld = (struct gf_w128_split_8_128_data *) h->private; + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) { + v[0] = val[0]; + v[1] = val[1]; + for (i = 0; i < 16; i++) { + ld->tables[0][i][0] = 0; + ld->tables[1][i][0] = 0; + for (j = 1; j < (1 << 8); j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]); + ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]); + } + pp = (v[0] & (1ULL << 63)); + v[0] <<= 1; + if (v[1] & (1ULL << 63)) v[0] ^= 1; + v[1] <<= 1; + if (pp) v[1] ^= h->prim_poly; + } + } + } + ld->last_value[0] = val[0]; + ld->last_value[1] = val[1]; + + while (d64 < top) { + v[0] = (xor) ? d64[0] : 0; + v[1] = (xor) ? d64[1] : 0; + s = s64[1]; + i = 0; + while (s != 0) { + v[0] ^= ld->tables[0][i][s&0xff]; + v[1] ^= ld->tables[1][i][s&0xff]; + s >>= 8; + i++; + } + s = s64[0]; + i = 8; + while (s != 0) { + v[0] ^= ld->tables[0][i][s&0xff]; + v[1] ^= ld->tables[1][i][s&0xff]; + s >>= 8; + i++; + } + d64[0] = v[0]; + d64[1] = v[1]; + s64 += 2; + d64 += 2; + } +} + +void +gf_w128_bytwo_b_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ + uint64_t bmask, pp; + gf_internal_t *h; + uint64_t a[2], c[2], b[2], *s64, *d64, *top; + gf_region_data rd; + + /* We only do this to check on alignment. */ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + h = (gf_internal_t *) gf->scratch; + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + bmask = (1ULL << 63); + + while (d64 < top) { + set_zero(c, 0); + b[0] = s64[0]; + b[1] = s64[1]; + a[0] = val[0]; + a[1] = val[1]; + + while (a[0] != 0) { + if (a[1] & 1) { + c[0] ^= b[0]; + c[1] ^= b[1]; + } + a[1] >>= 1; + if (a[0] & 1) a[1] ^= bmask; + a[0] >>= 1; + pp = (b[0] & bmask); + b[0] <<= 1; + if (b[1] & bmask) b[0] ^= 1; + b[1] <<= 1; + if (pp) b[1] ^= h->prim_poly; + } + while (1) { + if (a[1] & 1) { + c[0] ^= b[0]; + c[1] ^= b[1]; + } + a[1] >>= 1; + if (a[1] == 0) break; + pp = (b[0] & bmask); + b[0] <<= 1; + if (b[1] & bmask) b[0] ^= 1; + b[1] <<= 1; + if (pp) b[1] ^= h->prim_poly; + } + if (xor) { + d64[0] ^= c[0]; + d64[1] ^= c[1]; + } else { + d64[0] = c[0]; + d64[1] = c[1]; + } + s64 += 2; + d64 += 2; + } +} + +static +void gf_w128_group_m_init(gf_t *gf, gf_val_128_t b128) +{ + int i, j; + int g_m; + uint64_t prim_poly, lbit; + gf_internal_t *scratch; + gf_group_tables_t *gt; + uint64_t a128[2]; + scratch = (gf_internal_t *) gf->scratch; + gt = scratch->private; + g_m = scratch->arg1; + prim_poly = scratch->prim_poly; + + + set_zero(gt->m_table, 0); + a_get_b(gt->m_table, 2, b128, 0); + lbit = 1; + lbit <<= 63; + + for (i = 2; i < (1 << g_m); i <<= 1) { + a_get_b(a128, 0, gt->m_table, 2 * (i >> 1)); + two_x(a128); + a_get_b(gt->m_table, 2 * i, a128, 0); + if (gt->m_table[2 * (i >> 1)] & lbit) gt->m_table[(2 * i) + 1] ^= prim_poly; + for (j = 0; j < i; j++) { + gt->m_table[(2 * i) + (2 * j)] = gt->m_table[(2 * i)] ^ gt->m_table[(2 * j)]; + gt->m_table[(2 * i) + (2 * j) 
+ 1] = gt->m_table[(2 * i) + 1] ^ gt->m_table[(2 * j) + 1]; + } + } + return; +} + +void +gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ + int i; + /* index_r, index_m, total_m (if g_r > g_m) */ + int i_r, i_m, t_m; + int mask_m, mask_r; + int g_m, g_r; + uint64_t p_i[2], a[2]; + gf_internal_t *scratch; + gf_group_tables_t *gt; + + scratch = (gf_internal_t *) gf->scratch; + gt = scratch->private; + g_m = scratch->arg1; + g_r = scratch->arg2; + + mask_m = (1 << g_m) - 1; + mask_r = (1 << g_r) - 1; + + if (b128[0] != gt->m_table[2] || b128[1] != gt->m_table[3]) { + gf_w128_group_m_init(gf, b128); + } + + p_i[0] = 0; + p_i[1] = 0; + a[0] = a128[0]; + a[1] = a128[1]; + + t_m = 0; + i_r = 0; + + /* Top 64 bits */ + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[0] >> (i * g_m)) & mask_m; + i_r ^= (p_i[0] >> (64 - g_m)) & mask_r; + p_i[0] <<= g_m; + p_i[0] ^= (p_i[1] >> (64-g_m)); + p_i[1] <<= g_m; + p_i[0] ^= gt->m_table[2 * i_m]; + p_i[1] ^= gt->m_table[(2 * i_m) + 1]; + t_m += g_m; + if (t_m == g_r) { + p_i[1] ^= gt->r_table[i_r]; + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[1] >> (i * g_m)) & mask_m; + i_r ^= (p_i[0] >> (64 - g_m)) & mask_r; + p_i[0] <<= g_m; + p_i[0] ^= (p_i[1] >> (64-g_m)); + p_i[1] <<= g_m; + p_i[0] ^= gt->m_table[2 * i_m]; + p_i[1] ^= gt->m_table[(2 * i_m) + 1]; + t_m += g_m; + if (t_m == g_r) { + p_i[1] ^= gt->r_table[i_r]; + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + c128[0] = p_i[0]; + c128[1] = p_i[1]; +} + +static +void +gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ + int i; + int i_r, i_m, t_m; + int mask_m, mask_r; + int g_m, g_r; + uint64_t p_i[2], a[2]; + gf_internal_t *scratch; + gf_group_tables_t *gt; + gf_region_data rd; + uint64_t *a128, *c128, *top; + + /* We only do this to check on alignment. 
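     For the GROUP method used below, a short sketch: each step consumes g_m
     bits of the source word, shifting the 128-bit partial product left by
     g_m and XORing in the m_table entry for those bits; once g_r overflow
     bits have accumulated in i_r, they are folded back in through r_table.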
*/ + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8); + + if (val[0] == 0) { + if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; } + } + + scratch = (gf_internal_t *) gf->scratch; + gt = scratch->private; + g_m = scratch->arg1; + g_r = scratch->arg2; + + mask_m = (1 << g_m) - 1; + mask_r = (1 << g_r) - 1; + + if (val[0] != gt->m_table[2] || val[1] != gt->m_table[3]) { + gf_w128_group_m_init(gf, val); + } + + a128 = (uint64_t *) src; + c128 = (uint64_t *) dest; + top = (uint64_t *) rd.d_top; + + while (c128 < top) { + p_i[0] = 0; + p_i[1] = 0; + a[0] = a128[0]; + a[1] = a128[1]; + + t_m = 0; + i_r = 0; + + /* Top 64 bits */ + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[0] >> (i * g_m)) & mask_m; + i_r ^= (p_i[0] >> (64 - g_m)) & mask_r; + p_i[0] <<= g_m; + p_i[0] ^= (p_i[1] >> (64-g_m)); + p_i[1] <<= g_m; + + p_i[0] ^= gt->m_table[2 * i_m]; + p_i[1] ^= gt->m_table[(2 * i_m) + 1]; + t_m += g_m; + if (t_m == g_r) { + p_i[1] ^= gt->r_table[i_r]; + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) { + i_m = (a[1] >> (i * g_m)) & mask_m; + i_r ^= (p_i[0] >> (64 - g_m)) & mask_r; + p_i[0] <<= g_m; + p_i[0] ^= (p_i[1] >> (64-g_m)); + p_i[1] <<= g_m; + p_i[0] ^= gt->m_table[2 * i_m]; + p_i[1] ^= gt->m_table[(2 * i_m) + 1]; + t_m += g_m; + if (t_m == g_r) { + p_i[1] ^= gt->r_table[i_r]; + t_m = 0; + i_r = 0; + } else { + i_r <<= g_m; + } + } + + if (xor) { + c128[0] ^= p_i[0]; + c128[1] ^= p_i[1]; + } else { + c128[0] = p_i[0]; + c128[1] = p_i[1]; + } + a128 += 2; + c128 += 2; + } +} + +/* a^-1 -> b */ +void +gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128) +{ + uint64_t e_i[2], e_im1[2], e_ip1[2]; + uint64_t d_i, d_im1, d_ip1; + uint64_t y_i[2], y_im1[2], y_ip1[2]; + uint64_t c_i[2]; + uint64_t *b; + uint64_t one = 1; + + /* This needs to return some sort of error (in b128?) */ + if (a128[0] == 0 && a128[1] == 0) return; + + b = (uint64_t *) b128; + + e_im1[0] = 0; + e_im1[1] = ((gf_internal_t *) (gf->scratch))->prim_poly; + e_i[0] = a128[0]; + e_i[1] = a128[1]; + d_im1 = 128; + + //Allen: I think d_i starts at 63 here, and checks each bit of a, starting at MSB, looking for the first nonzero bit + //so d_i should be 0 if this half of a is all 0s, otherwise it should be the position from right of the first-from-left zero bit of this half of a. + //BUT if d_i is 0 at end we won't know yet if the rightmost bit of this half is 1 or not + + for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[0]) == 0 && d_i > 0; d_i--) ; + + //Allen: this is testing just the first half of the stop condition above, so if it holds we know we did not find a nonzero bit yet + + if (!((one << d_i) & e_i[0])) { + + //Allen: this is doing the same thing on the other half of a. In other words, we're still searching for a nonzero bit of a. + // but not bothering to test if d_i hits zero, which is fine because we've already tested for a=0. + + for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[1]) == 0; d_i--) ; + + } else { + + //Allen: if a 1 was found in more-significant half of a, make d_i the ACTUAL index of the first nonzero bit in the entire a. 
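+  //Worked example (a sketch): if e_i[0] = 0x5 (binary 101), the first loop
+  //above stops at d_i = 2, and the adjustment below promotes it to d_i = 66,
+  //the degree of a within the full 128 bits.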
+ + d_i += 64; + } + y_i[0] = 0; + y_i[1] = 1; + y_im1[0] = 0; + y_im1[1] = 0; + + while (!(e_i[0] == 0 && e_i[1] == 1)) { + + e_ip1[0] = e_im1[0]; + e_ip1[1] = e_im1[1]; + d_ip1 = d_im1; + c_i[0] = 0; + c_i[1] = 0; + + while (d_ip1 >= d_i) { + if ((d_ip1 - d_i) >= 64) { + c_i[0] ^= (one << ((d_ip1 - d_i) - 64)); + e_ip1[0] ^= (e_i[1] << ((d_ip1 - d_i) - 64)); + } else { + c_i[1] ^= (one << (d_ip1 - d_i)); + e_ip1[0] ^= (e_i[0] << (d_ip1 - d_i)); + if (d_ip1 - d_i > 0) e_ip1[0] ^= (e_i[1] >> (64 - (d_ip1 - d_i))); + e_ip1[1] ^= (e_i[1] << (d_ip1 - d_i)); + } + d_ip1--; + if (e_ip1[0] == 0 && e_ip1[1] == 0) { b[0] = 0; b[1] = 0; return; } + while (d_ip1 >= 64 && (e_ip1[0] & (one << (d_ip1 - 64))) == 0) d_ip1--; + while (d_ip1 < 64 && (e_ip1[1] & (one << d_ip1)) == 0) d_ip1--; + } + gf->multiply.w128(gf, c_i, y_i, y_ip1); + y_ip1[0] ^= y_im1[0]; + y_ip1[1] ^= y_im1[1]; + + y_im1[0] = y_i[0]; + y_im1[1] = y_i[1]; + + y_i[0] = y_ip1[0]; + y_i[1] = y_ip1[1]; + + e_im1[0] = e_i[0]; + e_im1[1] = e_i[1]; + d_im1 = d_i; + e_i[0] = e_ip1[0]; + e_i[1] = e_ip1[1]; + d_i = d_ip1; + } + + b[0] = y_i[0]; + b[1] = y_i[1]; + return; +} + +void +gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128) +{ + uint64_t d[2]; + gf->inverse.w128(gf, b128, d); + gf->multiply.w128(gf, a128, d, c128); + return; +} + +void +gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128) +{ + uint64_t one128[2]; + one128[0] = 0; + one128[1] = 1; + gf->divide.w128(gf, one128, a128, b128); + return; +} + + +static +void +gf_w128_composite_inverse(gf_t *gf, gf_val_128_t a, gf_val_128_t inv) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint64_t a0 = a[1]; + uint64_t a1 = a[0]; + uint64_t c0, c1, d, tmp; + uint64_t a0inv, a1inv; + + if (a0 == 0) { + a1inv = base_gf->inverse.w64(base_gf, a1); + c0 = base_gf->multiply.w64(base_gf, a1inv, h->prim_poly); + c1 = a1inv; + } else if (a1 == 0) { + c0 = base_gf->inverse.w64(base_gf, a0); + c1 = 0; + } else { + a1inv = base_gf->inverse.w64(base_gf, a1); + a0inv = base_gf->inverse.w64(base_gf, a0); + + d = base_gf->multiply.w64(base_gf, a1, a0inv); + + tmp = (base_gf->multiply.w64(base_gf, a1, a0inv) ^ base_gf->multiply.w64(base_gf, a0, a1inv) ^ h->prim_poly); + tmp = base_gf->inverse.w64(base_gf, tmp); + + d = base_gf->multiply.w64(base_gf, d, tmp); + + c0 = base_gf->multiply.w64(base_gf, (d^1), a0inv); + c1 = base_gf->multiply.w64(base_gf, d, a1inv); + } + inv[0] = c1; + inv[1] = c0; +} + +static + void +gf_w128_composite_multiply(gf_t *gf, gf_val_128_t a, gf_val_128_t b, gf_val_128_t rv) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint64_t b0 = b[1]; + uint64_t b1 = b[0]; + uint64_t a0 = a[1]; + uint64_t a1 = a[0]; + uint64_t a1b1; + + a1b1 = base_gf->multiply.w64(base_gf, a1, b1); + + rv[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1); + rv[0] = base_gf->multiply.w64(base_gf, a1, b0) ^ + base_gf->multiply.w64(base_gf, a0, b1) ^ + base_gf->multiply.w64(base_gf, a1b1, h->prim_poly); +} + +static + void +gf_w128_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint64_t b0 = val[1]; + uint64_t b1 = val[0]; + uint64_t *s64, *d64; + uint64_t *top; + uint64_t a0, a1, a1b1; + gf_region_data rd; + + if (val[0] == 0 && val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, 
bytes, 0, xor, 8); + + s64 = rd.s_start; + d64 = rd.d_start; + top = rd.d_top; + + if (xor) { + while (d64 < top) { + a1 = s64[0]; + a0 = s64[1]; + a1b1 = base_gf->multiply.w64(base_gf, a1, b1); + + d64[1] ^= (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1); + d64[0] ^= (base_gf->multiply.w64(base_gf, a1, b0) ^ + base_gf->multiply.w64(base_gf, a0, b1) ^ + base_gf->multiply.w64(base_gf, a1b1, h->prim_poly)); + s64 += 2; + d64 += 2; + } + } else { + while (d64 < top) { + a1 = s64[0]; + a0 = s64[1]; + a1b1 = base_gf->multiply.w64(base_gf, a1, b1); + + d64[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1); + d64[0] = (base_gf->multiply.w64(base_gf, a1, b0) ^ + base_gf->multiply.w64(base_gf, a0, b1) ^ + base_gf->multiply.w64(base_gf, a1b1, h->prim_poly)); + s64 += 2; + d64 += 2; + } + } +} + +static +void +gf_w128_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int + xor) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; + gf_val_64_t val0 = val[1]; + gf_val_64_t val1 = val[0]; + uint8_t *slow, *shigh; + uint8_t *dlow, *dhigh, *top; + int sub_reg_size; + gf_region_data rd; + + gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 64); + gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor); + + slow = (uint8_t *) rd.s_start; + dlow = (uint8_t *) rd.d_start; + top = (uint8_t*) rd.d_top; + sub_reg_size = (top - dlow)/2; + shigh = slow + sub_reg_size; + dhigh = dlow + sub_reg_size; + + base_gf->multiply_region.w64(base_gf, slow, dlow, val0, sub_reg_size, xor); + base_gf->multiply_region.w64(base_gf, shigh, dlow, val1, sub_reg_size, 1); + base_gf->multiply_region.w64(base_gf, slow, dhigh, val1, sub_reg_size, xor); + base_gf->multiply_region.w64(base_gf, shigh, dhigh, val0, sub_reg_size, 1); + base_gf->multiply_region.w64(base_gf, shigh, dhigh, base_gf->multiply.w64(base_gf, h->prim_poly, val1 + ), sub_reg_size, 1); + + gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor); +} + + + static +int gf_w128_composite_init(gf_t *gf) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + if (h->region_type & GF_REGION_ALTMAP) { + SET_FUNCTION(gf,multiply_region,w128,gf_w128_composite_multiply_region_alt) + } else { + SET_FUNCTION(gf,multiply_region,w128,gf_w128_composite_multiply_region) + } + + SET_FUNCTION(gf,multiply,w128,gf_w128_composite_multiply) + SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse) + SET_FUNCTION(gf,inverse,w128,gf_w128_composite_inverse) + + return 1; +} + +static +int gf_w128_cfm_init(gf_t *gf) +{ +#if defined(INTEL_SSE4_PCLMUL) + if (gf_cpu_supports_intel_pclmul) { + SET_FUNCTION(gf,inverse,w128,gf_w128_euclid) + SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply) + SET_FUNCTION(gf,multiply_region,w128,gf_w128_clm_multiply_region_from_single) + return 1; + } +#endif + + return 0; +} + +static +int gf_w128_shift_init(gf_t *gf) +{ + SET_FUNCTION(gf,multiply,w128,gf_w128_shift_multiply) + SET_FUNCTION(gf,inverse,w128,gf_w128_euclid) + SET_FUNCTION(gf,multiply_region,w128,gf_w128_multiply_region_from_single) + return 1; +} + + static +int gf_w128_bytwo_init(gf_t *gf) +{ + gf_internal_t *h; + h = (gf_internal_t *) gf->scratch; + + if (h->mult_type == GF_MULT_BYTWO_p) { + SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_p_multiply) + /*SET_FUNCTION(gf,multiply,w128,gf_w128_sse_bytwo_p_multiply)*/ + /* John: the sse function is slower.*/ + } else { + 
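/* BYTWO_b (a brief sketch): walk the multiplier from its low bit upward,
+       XORing b into the product when the bit is set, then double b in the
+       field -- shift the two-word value left one bit and, if bit 127 fell
+       off, XOR the primitive polynomial into the low word -- as in
+       gf_w128_bytwo_b_multiply() above. */ +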
SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_b_multiply) + /*SET_FUNCTION(gf,multiply,w128,gf_w128_sse_bytwo_b_multiply) +Ben: This sse function is also slower. */ + } + SET_FUNCTION(gf,inverse,w128,gf_w128_euclid) + SET_FUNCTION(gf,multiply_region,w128,gf_w128_bytwo_b_multiply_region) + return 1; +} + +/* + * Because the prim poly is only 8 bits and we are limiting g_r to 16, I do not need the high 64 + * bits in all of these numbers. + */ + static +void gf_w128_group_r_init(gf_t *gf) +{ + int i, j; + int g_r; + uint64_t pp; + gf_internal_t *scratch; + gf_group_tables_t *gt; + scratch = (gf_internal_t *) gf->scratch; + gt = scratch->private; + g_r = scratch->arg2; + pp = scratch->prim_poly; + + gt->r_table[0] = 0; + for (i = 1; i < (1 << g_r); i++) { + gt->r_table[i] = 0; + for (j = 0; j < g_r; j++) { + if (i & (1 << j)) { + gt->r_table[i] ^= (pp << j); + } + } + } + return; +} + +#if 0 // defined(INTEL_SSE4) + static +void gf_w128_group_r_sse_init(gf_t *gf) +{ + int i, j; + int g_r; + uint64_t pp; + gf_internal_t *scratch; + gf_group_tables_t *gt; + scratch = (gf_internal_t *) gf->scratch; + gt = scratch->private; + __m128i zero = _mm_setzero_si128(); + __m128i *table = (__m128i *)(gt->r_table); + g_r = scratch->arg2; + pp = scratch->prim_poly; + table[0] = zero; + for (i = 1; i < (1 << g_r); i++) { + table[i] = zero; + for (j = 0; j < g_r; j++) { + if (i & (1 << j)) { + table[i] = _mm_xor_si128(table[i], _mm_insert_epi64(zero, pp << j, 0)); + } + } + } + return; +} +#endif + + static +int gf_w128_split_init(gf_t *gf) +{ + struct gf_w128_split_4_128_data *sd4; + struct gf_w128_split_8_128_data *sd8; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_p_multiply) +#if defined(INTEL_SSE4_PCLMUL) + if (gf_cpu_supports_intel_pclmul && !(h->region_type & GF_REGION_NOSIMD)){ + SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply) + } +#endif + + SET_FUNCTION(gf,inverse,w128,gf_w128_euclid) + + if ((h->arg1 != 4 && h->arg2 != 4) || h->mult_type == GF_MULT_DEFAULT) { + sd8 = (struct gf_w128_split_8_128_data *) h->private; + sd8->last_value[0] = 0; + sd8->last_value[1] = 0; + SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_8_128_multiply_region) + } else { + sd4 = (struct gf_w128_split_4_128_data *) h->private; + sd4->last_value[0] = 0; + sd4->last_value[1] = 0; + if((h->region_type & GF_REGION_ALTMAP)) + { + #ifdef INTEL_SSE4 + if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD)) + SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_altmap_multiply_region) + else + #endif + return 0; + } + else { + #ifdef INTEL_SSE4 + if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD)) + SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_multiply_region) + else + #endif + SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region) + } + } + return 1; +} + + +static +int gf_w128_group_init(gf_t *gf) +{ + gf_internal_t *scratch; + gf_group_tables_t *gt; + int g_r, size_r; + + scratch = (gf_internal_t *) gf->scratch; + gt = scratch->private; + g_r = scratch->arg2; + size_r = (1 << g_r); + + gt->r_table = (gf_val_128_t)((uint8_t *)scratch->private + (2 * sizeof(uint64_t *))); + gt->m_table = gt->r_table + size_r; + gt->m_table[2] = 0; + gt->m_table[3] = 0; + + SET_FUNCTION(gf,multiply,w128,gf_w128_group_multiply) + SET_FUNCTION(gf,inverse,w128,gf_w128_euclid) + SET_FUNCTION(gf,multiply_region,w128,gf_w128_group_multiply_region) + + gf_w128_group_r_init(gf); + + return 1; +} + +void 
gf_w128_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv) +{ + gf_val_128_t s; + + s = (gf_val_128_t) start; + s += (index * 2); + memcpy(rv, s, 16); +} + +static void gf_w128_split_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv) +{ + int i, blocks; + uint64_t *r64, tmp; + uint8_t *r8; + gf_region_data rd; + + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 256); + r64 = (uint64_t *) start; + if ((r64 + index*2 < (uint64_t *) rd.d_start) || + (r64 + index*2 >= (uint64_t *) rd.d_top)) { + memcpy(rv, r64+(index*2), 16); + return; + } + + index -= (((uint64_t *) rd.d_start) - r64)/2; + r64 = (uint64_t *) rd.d_start; + + blocks = index/16; + r64 += (blocks*32); + index %= 16; + r8 = (uint8_t *) r64; + r8 += index; + rv[0] = 0; + rv[1] = 0; + + for (i = 0; i < 8; i++) { + tmp = *r8; + rv[1] |= (tmp << (i*8)); + r8 += 16; + } + + for (i = 0; i < 8; i++) { + tmp = *r8; + rv[0] |= (tmp << (i*8)); + r8 += 16; + } + return; +} + + static +void gf_w128_composite_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv) +{ + int sub_size; + gf_internal_t *h; + uint8_t *r8, *top; + uint64_t *r64; + gf_region_data rd; + + h = (gf_internal_t *) gf->scratch; + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64); + r64 = (uint64_t *) start; + if ((r64 + index*2 < (uint64_t *) rd.d_start) || + (r64 + index*2 >= (uint64_t *) rd.d_top)) { + memcpy(rv, r64+(index*2), 16); + return; + } + index -= (((uint64_t *) rd.d_start) - r64)/2; + r8 = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + sub_size = (top-r8)/2; + + rv[1] = h->base_gf->extract_word.w64(h->base_gf, r8, sub_size, index); + rv[0] = h->base_gf->extract_word.w64(h->base_gf, r8+sub_size, sub_size, index); + + return; +} + +int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) +{ + int size_m, size_r; + if (divide_type==GF_DIVIDE_MATRIX) return 0; + + switch(mult_type) + { + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; + case GF_MULT_SHIFT: + return sizeof(gf_internal_t); + break; + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: + return sizeof(gf_internal_t); + break; + case GF_MULT_DEFAULT: + case GF_MULT_SPLIT_TABLE: + if ((arg1 == 4 && arg2 == 128) || (arg1 == 128 && arg2 == 4)) { + return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_4_128_data) + 64; + } else if ((arg1 == 8 && arg2 == 128) || (arg1 == 128 && arg2 == 8) || mult_type == GF_MULT_DEFAULT) { + return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_8_128_data) + 64; + } + return 0; + break; + case GF_MULT_GROUP: + /* JSP We've already error checked the arguments. 
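       As a worked example of the arithmetic below: arg1 = 4 and arg2 = 8
       give size_m = 16 * 2 * 8 = 256 bytes and size_r = 256 * 2 * 8 = 4096
       bytes.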
*/ + size_m = (1 << arg1) * 2 * sizeof(uint64_t); + size_r = (1 << arg2) * 2 * sizeof(uint64_t); + /* + * two pointers prepend the table data for structure + * because the tables are of dynamic size + */ + return sizeof(gf_internal_t) + size_m + size_r + 4 * sizeof(uint64_t *); + break; + case GF_MULT_COMPOSITE: + if (arg1 == 2) { + return sizeof(gf_internal_t) + 4; + } else { + return 0; + } + break; + + default: + return 0; + } +} + +int gf_w128_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; /* This shouldn't happen */ + } else { + h->prim_poly = 0x87; /* Omitting the leftmost 1 as in w=32 */ + } + } + + SET_FUNCTION(gf,multiply,w128,NULL) + SET_FUNCTION(gf,divide,w128,NULL) + SET_FUNCTION(gf,inverse,w128,NULL) + SET_FUNCTION(gf,multiply_region,w128,NULL) + switch(h->mult_type) { + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: if (gf_w128_bytwo_init(gf) == 0) return 0; break; + case GF_MULT_CARRY_FREE: if (gf_w128_cfm_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w128_shift_init(gf) == 0) return 0; break; + case GF_MULT_GROUP: if (gf_w128_group_init(gf) == 0) return 0; break; + case GF_MULT_DEFAULT: + case GF_MULT_SPLIT_TABLE: if (gf_w128_split_init(gf) == 0) return 0; break; + case GF_MULT_COMPOSITE: if (gf_w128_composite_init(gf) == 0) return 0; break; + default: return 0; + } + + /* Ben: Used to be h->region_type == GF_REGION_ALTMAP, but failed since there + are multiple flags in h->region_type */ + if (h->mult_type == GF_MULT_SPLIT_TABLE && (h->region_type & GF_REGION_ALTMAP)) { + SET_FUNCTION(gf,extract_word,w128,gf_w128_split_extract_word) + } else if (h->mult_type == GF_MULT_COMPOSITE && h->region_type == GF_REGION_ALTMAP) { + SET_FUNCTION(gf,extract_word,w128,gf_w128_composite_extract_word) + } else { + SET_FUNCTION(gf,extract_word,w128,gf_w128_extract_word) + } + + if (h->divide_type == GF_DIVIDE_EUCLID) { + SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse) + } + + if (gf->inverse.w128 != NULL && gf->divide.w128 == NULL) { + SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse) + } + if (gf->inverse.w128 == NULL && gf->divide.w128 != NULL) { + SET_FUNCTION(gf,inverse,w128,gf_w128_inverse_from_divide) + } + return 1; +} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w16.c b/src/erasure-code/jerasure/gf-complete/src/gf_w16.c new file mode 100644 index 000000000..831689267 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_w16.c @@ -0,0 +1,2449 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. 
+ * + * gf_w16.c + * + * Routines for 16-bit Galois fields + */ + +#include "gf_int.h" +#include <stdio.h> +#include <stdlib.h> +#include "gf_w16.h" +#include "gf_cpu.h" + +#define AB2(ip, am1 ,am2, b, t1, t2) {\ + t1 = (b << 1) & am1;\ + t2 = b & am2; \ + t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \ + b = (t1 ^ (t2 & ip));} + +#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\ + t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \ + t2 = _mm_and_si128(va, m2); \ + t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \ + va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); } + +#define MM_PRINT(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 2) printf(" %02x %02x", blah[15-ii], blah[14-ii]); printf("\n"); } + +#define GF_FIRST_BIT (1 << 15) +#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1) + +static +inline +gf_val_32_t gf_w16_inverse_from_divide (gf_t *gf, gf_val_32_t a) +{ + return gf->divide.w32(gf, 1, a); +} + +static +inline +gf_val_32_t gf_w16_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + b = gf->inverse.w32(gf, b); + return gf->multiply.w32(gf, a, b); +} + +static +void +gf_w16_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint16_t *s16; + uint16_t *d16; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + + if (xor) { + while (d16 < ((uint16_t *) rd.d_top)) { + *d16 ^= gf->multiply.w32(gf, val, *s16); + d16++; + s16++; + } + } else { + while (d16 < ((uint16_t *) rd.d_top)) { + *d16 = gf->multiply.w32(gf, val, *s16); + d16++; + s16++; + } + } + gf_do_final_region_alignment(&rd); +} + +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint16_t *s16; + uint16_t *d16; + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + + if (xor) { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } else { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128
(result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint16_t *s16; + uint16_t *d16; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + + if (xor) { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } else { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w16_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint16_t *s16; + uint16_t *d16; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + + if (xor) { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 
(result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } else { + while (d16 < ((uint16_t *) rd.d_top)) { + + /* see gf_w16_clm_multiply() to see explanation of method */ + + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d16++; + s16++; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +static +inline +gf_val_32_t gf_w16_euclid (gf_t *gf, gf_val_32_t b) +{ + gf_val_32_t e_i, e_im1, e_ip1; + gf_val_32_t d_i, d_im1, d_ip1; + gf_val_32_t y_i, y_im1, y_ip1; + gf_val_32_t c_i; + + if (b == 0) return -1; + e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; + e_i = b; + d_im1 = 16; + for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ; + y_i = 1; + y_im1 = 0; + + while (e_i != 1) { + + e_ip1 = e_im1; + d_ip1 = d_im1; + c_i = 0; + + while (d_ip1 >= d_i) { + c_i ^= (1 << (d_ip1 - d_i)); + e_ip1 ^= (e_i << (d_ip1 - d_i)); + if (e_ip1 == 0) return 0; + while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; + } + + y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i); + y_im1 = y_i; + y_i = y_ip1; + + e_im1 = e_i; + d_im1 = d_i; + e_i = e_ip1; + d_i = d_ip1; + } + + return y_i; +} + +static +gf_val_32_t gf_w16_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + uint16_t *r16, rv; + + r16 = (uint16_t *) start; + rv = r16[index]; + return rv; +} + +static +gf_val_32_t gf_w16_composite_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + int sub_size; + gf_internal_t *h; + uint8_t *r8, *top; + uint16_t a, b, *r16; + gf_region_data rd; + + h = (gf_internal_t *) gf->scratch; + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); + r16 = (uint16_t *) start; + if (r16 + index < (uint16_t *) rd.d_start) return r16[index]; + if (r16 + index >= (uint16_t *) rd.d_top) return r16[index]; + index -= (((uint16_t *) rd.d_start) - r16); + r8 = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + sub_size = (top-r8)/2; + + a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index); + b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index); + return (a | (b << 8)); +} + +static +gf_val_32_t gf_w16_split_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + uint16_t *r16, rv; + uint8_t *r8; + gf_region_data rd; + + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); + r16 = (uint16_t *) start; + if (r16 + index < (uint16_t *) rd.d_start) return r16[index]; + if (r16 + index >= (uint16_t *) rd.d_top) return r16[index]; + index -= (((uint16_t *) rd.d_start) - r16); + r8 = (uint8_t *) rd.d_start; + r8 += ((index & 0xfffffff0)*2); + r8 += (index & 0xf); + rv = (*r8 << 8); + r8 += 16; + rv |= *r8; + return rv; +} + +static +inline +gf_val_32_t gf_w16_matrix (gf_t *gf, gf_val_32_t b) +{ + return gf_bitmatrix_inverse(b, 16, ((gf_internal_t *) (gf->scratch))->prim_poly); +} + +/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. 
I only + include it for completeness. It does have the feature that it requires no + extra memory. + */ + +#if defined(INTEL_SSE4_PCLMUL) +static +inline +gf_val_32_t +gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) +{ + gf_val_32_t rv = 0; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); + b = _mm_insert_epi32 (a, b16, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only + have to do the reduction at most twice, because (w-2)/z == 2. Where + z is equal to the number of zeros after the leading 1 + + _mm_clmulepi64_si128 is the carryless multiply operation. Here + _mm_srli_si128 shifts the result to the right by 2 bytes. This allows + us to multiply the prim_poly by the leading bits of the result. We + then xor the result of that operation back with the result.*/ + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + + return rv; +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) +static +inline +gf_val_32_t +gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) +{ + gf_val_32_t rv = 0; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); + b = _mm_insert_epi32 (a, b16, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + + return rv; +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) +static +inline +gf_val_32_t +gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) +{ + gf_val_32_t rv = 0; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0); + b = _mm_insert_epi32 (a, b16, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. 
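     As with the variants above, the number of reduction rounds is what
     distinguishes this function: gf_w16_cfm_init() below selects this
     4-round version for primitive polynomials with fewer zeros after the
     leading one (see the bound sketched in gf_w16_clm_multiply_2).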
*/ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + + return rv; +} +#endif + + +static +inline + gf_val_32_t +gf_w16_shift_multiply (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16) +{ + gf_val_32_t product, i, pp, a, b; + gf_internal_t *h; + + a = a16; + b = b16; + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + product = 0; + + for (i = 0; i < GF_FIELD_WIDTH; i++) { + if (a & (1 << i)) product ^= (b << i); + } + for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { + if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); + } + return product; +} + +static +int gf_w16_shift_init(gf_t *gf) +{ + SET_FUNCTION(gf,multiply,w32,gf_w16_shift_multiply) + return 1; +} + +static +int gf_w16_cfm_init(gf_t *gf) +{ +#if defined(INTEL_SSE4_PCLMUL) + if (gf_cpu_supports_intel_pclmul) { + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + /*Ben: Determining how many reductions to do */ + + if ((0xfe00 & h->prim_poly) == 0) { + SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_2) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_2) + } else if((0xf000 & h->prim_poly) == 0) { + SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_3) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_3) + } else if ((0xe000 & h->prim_poly) == 0) { + SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_4) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_4) + } else { + return 0; + } + return 1; + } +#endif + + return 0; +} + +/* KMG: GF_MULT_LOGTABLE: */ + +static +void +gf_w16_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint16_t *s16, *d16; + int lv; + struct gf_w16_logtable_data *ltd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + + lv = ltd->log_tbl[val]; + + if (xor) { + while (d16 < (uint16_t *) rd.d_top) { + *d16 ^= (*s16 == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[*s16]]); + d16++; + s16++; + } + } else { + while (d16 < (uint16_t *) rd.d_top) { + *d16 = (*s16 == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[*s16]]); + d16++; + s16++; + } + } + gf_do_final_region_alignment(&rd); +} + +static +inline +gf_val_32_t +gf_w16_log_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w16_logtable_data *ltd; + + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + return (a == 0 || b == 0) ? 
0 : ltd->antilog_tbl[(int) ltd->log_tbl[a] + (int) ltd->log_tbl[b]]; +} + +static +inline +gf_val_32_t +gf_w16_log_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + int log_sum = 0; + struct gf_w16_logtable_data *ltd; + + if (a == 0 || b == 0) return 0; + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + + log_sum = (int) ltd->log_tbl[a] - (int) ltd->log_tbl[b]; + return (ltd->d_antilog[log_sum]); +} + +static +gf_val_32_t +gf_w16_log_inverse(gf_t *gf, gf_val_32_t a) +{ + struct gf_w16_logtable_data *ltd; + + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + return (ltd->inv_tbl[a]); +} + +static +int gf_w16_log_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_w16_logtable_data *ltd; + int i, b; + int check = 0; + + h = (gf_internal_t *) gf->scratch; + ltd = h->private; + + for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++) + ltd->log_tbl[i] = 0; + ltd->d_antilog = ltd->antilog_tbl + GF_MULT_GROUP_SIZE; + + b = 1; + for (i = 0; i < GF_MULT_GROUP_SIZE; i++) { + if (ltd->log_tbl[b] != 0) check = 1; + ltd->log_tbl[b] = i; + ltd->antilog_tbl[i] = b; + ltd->antilog_tbl[i+GF_MULT_GROUP_SIZE] = b; + b <<= 1; + if (b & GF_FIELD_SIZE) { + b = b ^ h->prim_poly; + } + } + + /* If you can't construct the log table, there's a problem. This code is used for + some other implementations (e.g. in SPLIT), so if the log table doesn't work in + that instance, use CARRY_FREE / SHIFT instead. */ + + if (check) { + if (h->mult_type != GF_MULT_LOG_TABLE) { + if (gf_cpu_supports_intel_pclmul) { + return gf_w16_cfm_init(gf); + } + return gf_w16_shift_init(gf); + } else { + _gf_errno = GF_E_LOGPOLY; + return 0; + } + } + + ltd->inv_tbl[0] = 0; /* Not really, but we need to fill it with something */ + ltd->inv_tbl[1] = 1; + for (i = 2; i < GF_FIELD_SIZE; i++) { + ltd->inv_tbl[i] = ltd->antilog_tbl[GF_MULT_GROUP_SIZE-ltd->log_tbl[i]]; + } + + SET_FUNCTION(gf,inverse,w32,gf_w16_log_inverse) + SET_FUNCTION(gf,divide,w32,gf_w16_log_divide) + SET_FUNCTION(gf,multiply,w32,gf_w16_log_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_log_multiply_region) + + return 1; +} + +/* JSP: GF_MULT_SPLIT_TABLE: Using 8 multiplication tables to leverage SSE instructions. +*/ + + +/* Ben: Does alternate mapping multiplication using a split table in the + lazy method without sse instructions*/ + +static +void +gf_w16_split_4_16_lazy_nosse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t i, j, c, prod; + uint8_t *s8, *d8, *top; + uint16_t table[4][16]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + /*Ben: Constructs lazy multiplication table*/ + + for (j = 0; j < 16; j++) { + for (i = 0; i < 4; i++) { + c = (j << (i*4)); + table[i][j] = gf->multiply.w32(gf, c, val); + } + } + + /*Ben: s8 is the start of source, d8 is the start of dest, top is end of dest region. */ + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + + + while (d8 < top) { + + /*Ben: Multiplies across 16 two byte quantities using alternate mapping + high bits are on the left, low bits are on the right. */ + + for (j=0;j<16;j++) { + + /*Ben: If the xor flag is set, the product should include what is in dest */ + prod = (xor) ? 
((uint16_t)(*d8)<<8) ^ *(d8+16) : 0; + + /*Ben: xors all 4 table lookups into the product variable*/ + + prod ^= ((table[0][*(s8+16)&0xf]) ^ + (table[1][(*(s8+16)&0xf0)>>4]) ^ + (table[2][*(s8)&0xf]) ^ + (table[3][(*(s8)&0xf0)>>4])); + + /*Ben: Stores product in the destination and moves on*/ + + *d8 = (uint8_t)(prod >> 8); + *(d8+16) = (uint8_t)(prod & 0x00ff); + s8++; + d8++; + } + s8+=16; + d8+=16; + } + gf_do_final_region_alignment(&rd); +} + +static + void +gf_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t i, j, a, c, prod; + uint16_t *s16, *d16, *top; + uint16_t table[4][16]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + for (j = 0; j < 16; j++) { + for (i = 0; i < 4; i++) { + c = (j << (i*4)); + table[i][j] = gf->multiply.w32(gf, c, val); + } + } + + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + top = (uint16_t *) rd.d_top; + + while (d16 < top) { + a = *s16; + prod = (xor) ? *d16 : 0; + for (i = 0; i < 4; i++) { + prod ^= table[i][a&0xf]; + a >>= 4; + } + *d16 = prod; + s16++; + d16++; + } +} + +static +void +gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t j, k, v, a, prod, *s64, *d64, *top64; + gf_internal_t *h; + uint64_t htable[256], ltable[256]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + + h = (gf_internal_t *) gf->scratch; + + v = val; + ltable[0] = 0; + for (j = 1; j < 256; j <<= 1) { + for (k = 0; k < j; k++) ltable[k^j] = (v ^ ltable[k]); + v = GF_MULTBY_TWO(v); + } + htable[0] = 0; + for (j = 1; j < 256; j <<= 1) { + for (k = 0; k < j; k++) htable[k^j] = (v ^ htable[k]); + v = GF_MULTBY_TWO(v); + } + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top64 = (uint64_t *) rd.d_top; + +/* Does Unrolling Matter? -- Doesn't seem to. + while (d64 != top64) { + a = *s64; + + prod = htable[a >> 56]; + a <<= 8; + prod ^= ltable[a >> 56]; + a <<= 8; + prod <<= 16; + + prod ^= htable[a >> 56]; + a <<= 8; + prod ^= ltable[a >> 56]; + a <<= 8; + prod <<= 16; + + prod ^= htable[a >> 56]; + a <<= 8; + prod ^= ltable[a >> 56]; + a <<= 8; + prod <<= 16; + + prod ^= htable[a >> 56]; + a <<= 8; + prod ^= ltable[a >> 56]; + prod ^= ((xor) ? *d64 : 0); + *d64 = prod; + s64++; + d64++; + } +*/ + + while (d64 != top64) { + a = *s64; + + prod = 0; + for (j = 0; j < 4; j++) { + prod <<= 16; + prod ^= htable[a >> 56]; + a <<= 8; + prod ^= ltable[a >> 56]; + a <<= 8; + } + + //JSP: We can move the conditional outside the while loop, but we need to fully test it to understand which is better. + + prod ^= ((xor) ? 
*d64 : 0); + *d64 = prod; + s64++; + d64++; + } + gf_do_final_region_alignment(&rd); +} + +static void +gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t c; + gf_internal_t *h; + struct gf_w16_lazytable_data *ltd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + + h = (gf_internal_t *) gf->scratch; + ltd = (struct gf_w16_lazytable_data *) h->private; + + ltd->lazytable[0] = 0; + + /* + a = val; + c = 1; + pp = h->prim_poly; + + do { + ltd->lazytable[c] = a; + c <<= 1; + if (c & (1 << GF_FIELD_WIDTH)) c ^= pp; + a <<= 1; + if (a & (1 << GF_FIELD_WIDTH)) a ^= pp; + } while (c != 1); + */ + + for (c = 1; c < GF_FIELD_SIZE; c++) { + ltd->lazytable[c] = gf_w16_shift_multiply(gf, c, val); + } + + gf_two_byte_region_table_multiply(&rd, ltd->lazytable); + gf_do_final_region_alignment(&rd); +} + +#ifdef INTEL_SSSE3 +static +void +gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t i, j, *s64, *d64, *top64;; + uint64_t c, prod; + uint8_t low[4][16]; + uint8_t high[4][16]; + gf_region_data rd; + + __m128i mask, ta, tb, ti, tpl, tph, tlow[4], thigh[4], tta, ttb, lmask; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + for (j = 0; j < 16; j++) { + for (i = 0; i < 4; i++) { + c = (j << (i*4)); + prod = gf->multiply.w32(gf, c, val); + low[i][j] = (prod & 0xff); + high[i][j] = (prod >> 8); + } + } + + for (i = 0; i < 4; i++) { + tlow[i] = _mm_loadu_si128((__m128i *)low[i]); + thigh[i] = _mm_loadu_si128((__m128i *)high[i]); + } + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top64 = (uint64_t *) rd.d_top; + + mask = _mm_set1_epi8 (0x0f); + lmask = _mm_set1_epi16 (0xff); + + if (xor) { + while (d64 != top64) { + + ta = _mm_load_si128((__m128i *) s64); + tb = _mm_load_si128((__m128i *) (s64+2)); + + tta = _mm_srli_epi16(ta, 8); + ttb = _mm_srli_epi16(tb, 8); + tpl = _mm_and_si128(tb, lmask); + tph = _mm_and_si128(ta, lmask); + + tb = _mm_packus_epi16(tpl, tph); + ta = _mm_packus_epi16(ttb, tta); + + ti = _mm_and_si128 (mask, tb); + tph = _mm_shuffle_epi8 (thigh[0], ti); + tpl = _mm_shuffle_epi8 (tlow[0], ti); + + tb = _mm_srli_epi16(tb, 4); + ti = _mm_and_si128 (mask, tb); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph); + + ti = _mm_and_si128 (mask, ta); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph); + + ta = _mm_srli_epi16(ta, 4); + ti = _mm_and_si128 (mask, ta); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph); + + ta = _mm_unpackhi_epi8(tpl, tph); + tb = _mm_unpacklo_epi8(tpl, tph); + + tta = _mm_load_si128((__m128i *) d64); + ta = _mm_xor_si128(ta, tta); + ttb = _mm_load_si128((__m128i *) (d64+2)); + tb = _mm_xor_si128(tb, ttb); + _mm_store_si128 ((__m128i *)d64, ta); + _mm_store_si128 ((__m128i *)(d64+2), tb); + + d64 += 4; + s64 += 4; + + } + } else { + while (d64 != top64) { + + ta = _mm_load_si128((__m128i *) s64); + tb = 
_mm_load_si128((__m128i *) (s64+2)); + + tta = _mm_srli_epi16(ta, 8); + ttb = _mm_srli_epi16(tb, 8); + tpl = _mm_and_si128(tb, lmask); + tph = _mm_and_si128(ta, lmask); + + tb = _mm_packus_epi16(tpl, tph); + ta = _mm_packus_epi16(ttb, tta); + + ti = _mm_and_si128 (mask, tb); + tph = _mm_shuffle_epi8 (thigh[0], ti); + tpl = _mm_shuffle_epi8 (tlow[0], ti); + + tb = _mm_srli_epi16(tb, 4); + ti = _mm_and_si128 (mask, tb); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph); + + ti = _mm_and_si128 (mask, ta); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph); + + ta = _mm_srli_epi16(ta, 4); + ti = _mm_and_si128 (mask, ta); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph); + + ta = _mm_unpackhi_epi8(tpl, tph); + tb = _mm_unpacklo_epi8(tpl, tph); + + _mm_store_si128 ((__m128i *)d64, ta); + _mm_store_si128 ((__m128i *)(d64+2), tb); + + d64 += 4; + s64 += 4; + } + } + + gf_do_final_region_alignment(&rd); +} +#endif + +#ifdef INTEL_SSSE3 +static +void +gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t i, j, *s64, *d64, *top64;; + uint64_t c, prod; + uint8_t low[4][16]; + uint8_t high[4][16]; + gf_region_data rd; + __m128i mask, ta, tb, ti, tpl, tph, tlow[4], thigh[4]; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + for (j = 0; j < 16; j++) { + for (i = 0; i < 4; i++) { + c = (j << (i*4)); + prod = gf->multiply.w32(gf, c, val); + low[i][j] = (prod & 0xff); + high[i][j] = (prod >> 8); + } + } + + for (i = 0; i < 4; i++) { + tlow[i] = _mm_loadu_si128((__m128i *)low[i]); + thigh[i] = _mm_loadu_si128((__m128i *)high[i]); + } + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top64 = (uint64_t *) rd.d_top; + + mask = _mm_set1_epi8 (0x0f); + + if (xor) { + while (d64 != top64) { + + ta = _mm_load_si128((__m128i *) s64); + tb = _mm_load_si128((__m128i *) (s64+2)); + + ti = _mm_and_si128 (mask, tb); + tph = _mm_shuffle_epi8 (thigh[0], ti); + tpl = _mm_shuffle_epi8 (tlow[0], ti); + + tb = _mm_srli_epi16(tb, 4); + ti = _mm_and_si128 (mask, tb); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph); + + ti = _mm_and_si128 (mask, ta); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph); + + ta = _mm_srli_epi16(ta, 4); + ti = _mm_and_si128 (mask, ta); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph); + + ta = _mm_load_si128((__m128i *) d64); + tph = _mm_xor_si128(tph, ta); + _mm_store_si128 ((__m128i *)d64, tph); + tb = _mm_load_si128((__m128i *) (d64+2)); + tpl = _mm_xor_si128(tpl, tb); + _mm_store_si128 ((__m128i *)(d64+2), tpl); + + d64 += 4; + s64 += 4; + } + } else { + while (d64 != top64) { + + ta = _mm_load_si128((__m128i *) s64); + tb = _mm_load_si128((__m128i *) (s64+2)); + + ti = _mm_and_si128 (mask, tb); + tph = _mm_shuffle_epi8 (thigh[0], ti); + tpl = _mm_shuffle_epi8 (tlow[0], ti); + + tb = _mm_srli_epi16(tb, 4); + ti = _mm_and_si128 (mask, tb); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl); + 
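/* and the matching high-byte half of this nibble's lookup: */ +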
tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph); + + ti = _mm_and_si128 (mask, ta); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph); + + ta = _mm_srli_epi16(ta, 4); + ti = _mm_and_si128 (mask, ta); + tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl); + tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph); + + _mm_store_si128 ((__m128i *)d64, tph); + _mm_store_si128 ((__m128i *)(d64+2), tpl); + + d64 += 4; + s64 += 4; + + } + } + gf_do_final_region_alignment(&rd); + +} +#endif + +uint32_t +gf_w16_split_8_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t alow, blow; + struct gf_w16_split_8_8_data *d8; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + d8 = (struct gf_w16_split_8_8_data *) h->private; + + alow = a & 0xff; + blow = b & 0xff; + a >>= 8; + b >>= 8; + + return d8->tables[0][alow][blow] ^ + d8->tables[1][alow][b] ^ + d8->tables[1][a][blow] ^ + d8->tables[2][a][b]; +} + +static +int gf_w16_split_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_w16_split_8_8_data *d8; + int i, j, exp; + uint32_t p, basep, tmp; + + h = (gf_internal_t *) gf->scratch; + + if (h->arg1 == 8 && h->arg2 == 8) { + d8 = (struct gf_w16_split_8_8_data *) h->private; + basep = 1; + for (exp = 0; exp < 3; exp++) { + for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0; + for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0; + d8->tables[exp][1][1] = basep; + for (i = 2; i < 256; i++) { + if (i&1) { + p = d8->tables[exp][i^1][1]; + d8->tables[exp][i][1] = p ^ basep; + } else { + p = d8->tables[exp][i>>1][1]; + d8->tables[exp][i][1] = GF_MULTBY_TWO(p); + } + } + for (i = 1; i < 256; i++) { + p = d8->tables[exp][i][1]; + for (j = 1; j < 256; j++) { + if (j&1) { + d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p; + } else { + tmp = d8->tables[exp][i][j>>1]; + d8->tables[exp][i][j] = GF_MULTBY_TWO(tmp); + } + } + } + for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep); + } + SET_FUNCTION(gf,multiply,w32,gf_w16_split_8_8_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region) + return 1; + + } + + /* We'll be using LOG for multiplication, unless the pp isn't primitive. + In that case, we'll be using SHIFT. 
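(That fallback happens inside gf_w16_log_init, which checks the polynomial as it builds the log tables and installs the shift-based multiply when it is not primitive.)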
*/ + + gf_w16_log_init(gf); + + /* Defaults */ + +#ifdef INTEL_SSSE3 + if (gf_cpu_supports_intel_ssse3) { + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_sse_multiply_region) + } else { +#elif ARM_NEON + if (gf_cpu_supports_arm_neon) { + gf_w16_neon_split_init(gf); + } else { +#endif + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region) +#if defined(INTEL_SSSE3) || defined(ARM_NEON) + } +#endif + + if ((h->arg1 == 8 && h->arg2 == 16) || (h->arg2 == 8 && h->arg1 == 16)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region) + + } else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) { +#if defined(INTEL_SSSE3) || defined(ARM_NEON) + if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) { + if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSIMD) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_nosse_altmap_multiply_region) + else if(h->region_type & GF_REGION_NOSIMD) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region) +#if defined(INTEL_SSSE3) + else if(h->region_type & GF_REGION_ALTMAP && gf_cpu_supports_intel_ssse3) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_sse_altmap_multiply_region) +#endif + } else { +#endif + if(h->region_type & GF_REGION_SIMD) + return 0; + else if(h->region_type & GF_REGION_ALTMAP) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_nosse_altmap_multiply_region) + else + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region) +#if defined(INTEL_SSSE3) || defined(ARM_NEON) + } +#endif + } + + return 1; +} + +static +int gf_w16_table_init(gf_t *gf) +{ + gf_w16_log_init(gf); + + SET_FUNCTION(gf,multiply_region,w32,gf_w16_table_lazy_multiply_region) + return 1; +} + +static +void +gf_w16_log_zero_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint16_t lv; + int i; + uint16_t *s16, *d16, *top16; + struct gf_w16_zero_logtable_data *ltd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + ltd = (struct gf_w16_zero_logtable_data*) ((gf_internal_t *) gf->scratch)->private; + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + top16 = (uint16_t *) rd.d_top; + bytes = top16 - d16; + + lv = ltd->log_tbl[val]; + + if (xor) { + for (i = 0; i < bytes; i++) { + d16[i] ^= (ltd->antilog_tbl[lv + ltd->log_tbl[s16[i]]]); + } + } else { + for (i = 0; i < bytes; i++) { + d16[i] = (ltd->antilog_tbl[lv + ltd->log_tbl[s16[i]]]); + } + } + + /* This isn't necessary. 
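(Presumably because the region was set up with 2-byte alignment, so the 16-bit loop above already covers everything and the final-alignment pass has no tail left to process.)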
*/ + + gf_do_final_region_alignment(&rd); +} + +/* Here -- double-check Kevin */ + +static +inline +gf_val_32_t +gf_w16_log_zero_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w16_zero_logtable_data *ltd; + + ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + return ltd->antilog_tbl[ltd->log_tbl[a] + ltd->log_tbl[b]]; +} + +static +inline +gf_val_32_t +gf_w16_log_zero_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + int log_sum = 0; + struct gf_w16_zero_logtable_data *ltd; + + if (a == 0 || b == 0) return 0; + ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + + log_sum = ltd->log_tbl[a] - ltd->log_tbl[b] + (GF_MULT_GROUP_SIZE); + return (ltd->antilog_tbl[log_sum]); +} + +static +gf_val_32_t +gf_w16_log_zero_inverse (gf_t *gf, gf_val_32_t a) +{ + struct gf_w16_zero_logtable_data *ltd; + + ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + return (ltd->inv_tbl[a]); +} + +static +inline +gf_val_32_t +gf_w16_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t prod, pp, pmask, amask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + + prod = 0; + pmask = 0x8000; + amask = 0x8000; + + while (amask != 0) { + if (prod & pmask) { + prod = ((prod << 1) ^ pp); + } else { + prod <<= 1; + } + if (a & amask) prod ^= b; + amask >>= 1; + } + return prod; +} + +static +inline +gf_val_32_t +gf_w16_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t prod, pp, bmask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + prod = 0; + bmask = 0x8000; + + while (1) { + if (a & 1) prod ^= b; + a >>= 1; + if (a == 0) return prod; + if (b & bmask) { + b = ((b << 1) ^ pp); + } else { + b <<= 1; + } + } +} + +static +void +gf_w16_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, t1, t2, ta, prod, amask; + gf_region_data rd; + struct gf_w16_bytwo_data *btd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + + if (xor) { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = 0x8000; + ta = *s64; + while (amask != 0) { + AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 ^= prod; + d64++; + s64++; + } + } else { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = 0x8000; + ta = *s64; + while (amask != 0) { + AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 = prod; + d64++; + s64++; + } + } + gf_do_final_region_alignment(&rd); +} + +#define BYTWO_P_ONESTEP {\ + SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ + t1 = _mm_and_si128(v, one); \ + t1 = _mm_sub_epi16(t1, one); \ + t1 = _mm_and_si128(t1, ta); \ + prod = _mm_xor_si128(prod, t1); \ + v = _mm_srli_epi64(v, 1); } + +#ifdef INTEL_SSE2 +static +void +gf_w16_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int i; + uint8_t *s8, *d8; + uint32_t vrev; + __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; + struct gf_w16_bytwo_data *btd; + 
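+  /* Note: the vrev loop below stores val's bits complemented and reversed, so
+     each BYTWO_P_ONESTEP peels the next most-significant bit of val out of
+     lane bit 0; the subtract turns that bit into an all-ones or all-zeros
+     mask that selects whether ta is XORed into the doubled prod. */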
gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + vrev = 0; + for (i = 0; i < 16; i++) { + vrev <<= 1; + if (!(val & (1 << i))) vrev |= 1; + } + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + pp = _mm_set1_epi16(btd->prim_poly&0xffff); + m1 = _mm_set1_epi16((btd->mask1)&0xffff); + m2 = _mm_set1_epi16((btd->mask2)&0xffff); + one = _mm_set1_epi16(1); + + while (d8 < (uint8_t *) rd.d_top) { + prod = _mm_setzero_si128(); + v = _mm_set1_epi16(vrev); + ta = _mm_load_si128((__m128i *) s8); + tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8); + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp)); + d8 += 16; + s8 += 16; + } + gf_do_final_region_alignment(&rd); +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w16_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w16_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi16(btd->prim_poly&0xffff); + m1 = _mm_set1_epi16((btd->mask1)&0xffff); + m2 = _mm_set1_epi16((btd->mask2)&0xffff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, m2, va, t1, t2); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w16_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w16_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi16(btd->prim_poly&0xffff); + m1 = _mm_set1_epi16((btd->mask1)&0xffff); + m2 = _mm_set1_epi16((btd->mask2)&0xffff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, m2, va, t1, t2); + vb = _mm_load_si128 ((__m128i *)(d8)); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + + +#ifdef INTEL_SSE2 +static +void +gf_w16_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int itb; + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va, vb; + struct gf_w16_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + if (val == 2) { + if (xor) { + gf_w16_bytwo_b_sse_region_2_xor(&rd, btd); + } else { + gf_w16_bytwo_b_sse_region_2_noxor(&rd, btd); + } + gf_do_final_region_alignment(&rd); + return; + } + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + pp = _mm_set1_epi16(btd->prim_poly&0xffff); + m1 = _mm_set1_epi16((btd->mask1)&0xffff); + m2 = _mm_set1_epi16((btd->mask2)&0xffff); + + while (d8 < (uint8_t *) rd.d_top) { + va = 
_mm_load_si128 ((__m128i *)(s8)); + vb = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8)); + itb = val; + while (1) { + if (itb & 1) vb = _mm_xor_si128(vb, va); + itb >>= 1; + if (itb == 0) break; + SSE_AB2(pp, m1, m2, va, t1, t2); + } + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } + + gf_do_final_region_alignment(&rd); +} +#endif + +static +void +gf_w16_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, t1, t2, ta, tb, prod; + struct gf_w16_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + + switch (val) { + case 2: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 3: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 4: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 5: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + break; + default: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + prod = *d64 ; + ta = *s64; + tb = val; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + } + *d64 = prod; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + prod = 0 ; + ta = *s64; + tb = val; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + } + *d64 = prod; + d64++; + s64++; + } + } + break; + } + gf_do_final_region_alignment(&rd); +} + +static +int gf_w16_bytwo_init(gf_t *gf) +{ + gf_internal_t *h; + uint64_t ip, m1, m2; + struct gf_w16_bytwo_data *btd; + + h = (gf_internal_t *) gf->scratch; + btd = (struct gf_w16_bytwo_data *) (h->private); + ip = h->prim_poly & 0xffff; + m1 = 0xfffe; + m2 = 
0x8000; + btd->prim_poly = 0; + btd->mask1 = 0; + btd->mask2 = 0; + + while (ip != 0) { + btd->prim_poly |= ip; + btd->mask1 |= m1; + btd->mask2 |= m2; + ip <<= GF_FIELD_WIDTH; + m1 <<= GF_FIELD_WIDTH; + m2 <<= GF_FIELD_WIDTH; + } + + if (h->mult_type == GF_MULT_BYTWO_p) { + SET_FUNCTION(gf,multiply,w32,gf_w16_bytwo_p_multiply) + #ifdef INTEL_SSE2 + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_sse_multiply_region) + } else { + #endif + SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_nosse_multiply_region) + if(h->region_type & GF_REGION_SIMD) + return 0; + #ifdef INTEL_SSE2 + } + #endif + } else { + SET_FUNCTION(gf,multiply,w32,gf_w16_bytwo_b_multiply) + #ifdef INTEL_SSE2 + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_sse_multiply_region) + } else { + #endif + SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_nosse_multiply_region) + if(h->region_type & GF_REGION_SIMD) + return 0; + #ifdef INTEL_SSE2 + } + #endif + } + + return 1; +} + +static +int gf_w16_log_zero_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_w16_zero_logtable_data *ltd; + int i, b; + + h = (gf_internal_t *) gf->scratch; + ltd = h->private; + + ltd->log_tbl[0] = (-GF_MULT_GROUP_SIZE) + 1; + + bzero(&(ltd->_antilog_tbl[0]), sizeof(ltd->_antilog_tbl)); + + ltd->antilog_tbl = &(ltd->_antilog_tbl[GF_FIELD_SIZE * 2]); + + b = 1; + for (i = 0; i < GF_MULT_GROUP_SIZE; i++) { + ltd->log_tbl[b] = (uint16_t)i; + ltd->antilog_tbl[i] = (uint16_t)b; + ltd->antilog_tbl[i+GF_MULT_GROUP_SIZE] = (uint16_t)b; + b <<= 1; + if (b & GF_FIELD_SIZE) { + b = b ^ h->prim_poly; + } + } + ltd->inv_tbl[0] = 0; /* Not really, but we need to fill it with something */ + ltd->inv_tbl[1] = 1; + for (i = 2; i < GF_FIELD_SIZE; i++) { + ltd->inv_tbl[i] = ltd->antilog_tbl[GF_MULT_GROUP_SIZE-ltd->log_tbl[i]]; + } + + SET_FUNCTION(gf,inverse,w32,gf_w16_log_zero_inverse) + SET_FUNCTION(gf,divide,w32,gf_w16_log_zero_divide) + SET_FUNCTION(gf,multiply,w32,gf_w16_log_zero_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_log_zero_multiply_region) + return 1; +} + +static +gf_val_32_t +gf_w16_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint8_t b0 = b & 0x00ff; + uint8_t b1 = (b & 0xff00) >> 8; + uint8_t a0 = a & 0x00ff; + uint8_t a1 = (a & 0xff00) >> 8; + uint8_t a1b1; + uint16_t rv; + + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + rv = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8)); + return rv; +} + +static +gf_val_32_t +gf_w16_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + uint8_t b0 = b & 0x00ff; + uint8_t b1 = (b & 0xff00) >> 8; + uint8_t a0 = a & 0x00ff; + uint8_t a1 = (a & 0xff00) >> 8; + uint8_t a1b1, *mt; + uint16_t rv; + struct gf_w16_composite_data *cd; + + cd = (struct gf_w16_composite_data *) h->private; + mt = cd->mult_table; + + a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1); + + rv = ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8)); + return rv; +} + +/* + * Composite field division trick (explained in 2007 tech report) + * + * 
Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1 + * + * let c = b^-1 + * + * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0) + * + * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1 + * + * let d = b1c1 and d+1 = b0c0 + * + * solve s*b1c1+b1c0+b0c1 = 0 + * + * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1 + * + * c0 = (d+1)b0^-1 + * c1 = d*b1^-1 + * + * a / b = a * c + */ + +static +gf_val_32_t +gf_w16_composite_inverse(gf_t *gf, gf_val_32_t a) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint8_t a0 = a & 0x00ff; + uint8_t a1 = (a & 0xff00) >> 8; + uint8_t c0, c1, d, tmp; + uint16_t c; + uint8_t a0inv, a1inv; + + if (a0 == 0) { + a1inv = base_gf->inverse.w32(base_gf, a1); + c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); + c1 = a1inv; + } else if (a1 == 0) { + c0 = base_gf->inverse.w32(base_gf, a0); + c1 = 0; + } else { + a1inv = base_gf->inverse.w32(base_gf, a1); + a0inv = base_gf->inverse.w32(base_gf, a0); + + d = base_gf->multiply.w32(base_gf, a1, a0inv); + + tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly); + tmp = base_gf->inverse.w32(base_gf, tmp); + + d = base_gf->multiply.w32(base_gf, d, tmp); + + c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv); + c1 = base_gf->multiply.w32(base_gf, d, a1inv); + } + + c = c0 | (c1 << 8); + + return c; +} + +static +void +gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint8_t b0 = val & 0x00ff; + uint8_t b1 = (val & 0xff00) >> 8; + uint16_t *s16, *d16, *top; + uint8_t a0, a1, a1b1, *mt; + gf_region_data rd; + struct gf_w16_composite_data *cd; + + cd = (struct gf_w16_composite_data *) h->private; + mt = cd->mult_table; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + + s16 = rd.s_start; + d16 = rd.d_start; + top = rd.d_top; + + if (mt == NULL) { + if (xor) { + while (d16 < top) { + a0 = (*s16) & 0x00ff; + a1 = ((*s16) & 0xff00) >> 8; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + (*d16) ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8)); + s16++; + d16++; + } + } else { + while (d16 < top) { + a0 = (*s16) & 0x00ff; + a1 = ((*s16) & 0xff00) >> 8; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + (*d16) = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8)); + s16++; + d16++; + } + } + } else { + if (xor) { + while (d16 < top) { + a0 = (*s16) & 0x00ff; + a1 = ((*s16) & 0xff00) >> 8; + a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1); + + (*d16) ^= ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8)); + s16++; + d16++; + } + } else { + while (d16 < top) { + a0 = (*s16) & 0x00ff; + a1 = ((*s16) & 0xff00) >> 8; + a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1); + + (*d16) = ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W8_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8)); + s16++; + d16++; + } + } + } +} + +static +void 
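+/* ALTMAP keeps the low bytes of the 16-bit words in the first half of the
+   region and the high bytes in the second half, so the composite product of
+   gf_w16_composite_multiply_recursive() turns into five base-field region
+   multiplies: dlow = a0*b0 ^ a1*b1 and dhigh = a1*b0 ^ a0*b1 ^ a1*(s*b1),
+   with s = h->prim_poly. */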
+gf_w16_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint8_t val0 = val & 0x00ff; + uint8_t val1 = (val & 0xff00) >> 8; + gf_region_data rd; + int sub_reg_size; + uint8_t *slow, *shigh; + uint8_t *dlow, *dhigh, *top;; + + /* JSP: I want the two pointers aligned wrt each other on 16 byte + boundaries. So I'm going to make sure that the area on + which the two operate is a multiple of 32. Of course, that + junks up the mapping, but so be it -- that's why we have extract_word.... */ + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + slow = (uint8_t *) rd.s_start; + dlow = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + sub_reg_size = (top - dlow)/2; + shigh = slow + sub_reg_size; + dhigh = dlow + sub_reg_size; + + base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); + + gf_do_final_region_alignment(&rd); +} + +static +int gf_w16_composite_init(gf_t *gf) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + struct gf_w16_composite_data *cd; + + if (h->base_gf == NULL) return 0; + + cd = (struct gf_w16_composite_data *) h->private; + cd->mult_table = gf_w8_get_mult_table(h->base_gf); + + if (h->region_type & GF_REGION_ALTMAP) { + SET_FUNCTION(gf,multiply_region,w32,gf_w16_composite_multiply_region_alt) + } else { + SET_FUNCTION(gf,multiply_region,w32,gf_w16_composite_multiply_region) + } + + if (cd->mult_table == NULL) { + SET_FUNCTION(gf,multiply,w32,gf_w16_composite_multiply_recursive) + } else { + SET_FUNCTION(gf,multiply,w32,gf_w16_composite_multiply_inline) + } + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,inverse,w32,gf_w16_composite_inverse) + + return 1; +} + +static +void +gf_w16_group_4_set_shift_tables(uint16_t *shift, uint16_t val, gf_internal_t *h) +{ + int i, j; + + shift[0] = 0; + for (i = 0; i < 16; i += 2) { + j = (shift[i>>1] << 1); + if (j & (1 << 16)) j ^= h->prim_poly; + shift[i] = j; + shift[i^1] = j^val; + } +} + +static +inline +gf_val_32_t +gf_w16_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint16_t p, l, ind, r, a16; + + struct gf_w16_group_4_4_data *d44; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + d44 = (struct gf_w16_group_4_4_data *) h->private; + gf_w16_group_4_set_shift_tables(d44->shift, b, h); + + a16 = a; + ind = a16 >> 12; + a16 <<= 4; + p = d44->shift[ind]; + r = p & 0xfff; + l = p >> 12; + ind = a16 >> 12; + a16 <<= 4; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); + r = p & 0xfff; + l = p >> 12; + ind = a16 >> 12; + a16 <<= 4; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); + r = p & 0xfff; + l = p >> 12; + ind = a16 >> 12; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); + return p; +} + +static +void gf_w16_group_4_4_region_multiply(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint16_t p, l, ind, r, a16, p16; + struct gf_w16_group_4_4_data *d44; + gf_region_data rd; + uint16_t *s16, *d16, *top; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, 
dest, bytes, xor); return; } + + gf_internal_t *h = (gf_internal_t *) gf->scratch; + d44 = (struct gf_w16_group_4_4_data *) h->private; + gf_w16_group_4_set_shift_tables(d44->shift, val, h); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2); + gf_do_initial_region_alignment(&rd); + + s16 = (uint16_t *) rd.s_start; + d16 = (uint16_t *) rd.d_start; + top = (uint16_t *) rd.d_top; + + while (d16 < top) { + a16 = *s16; + p16 = (xor) ? *d16 : 0; + ind = a16 >> 12; + a16 <<= 4; + p = d44->shift[ind]; + r = p & 0xfff; + l = p >> 12; + ind = a16 >> 12; + a16 <<= 4; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); + r = p & 0xfff; + l = p >> 12; + ind = a16 >> 12; + a16 <<= 4; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); + r = p & 0xfff; + l = p >> 12; + ind = a16 >> 12; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4)); + p ^= p16; + *d16 = p; + d16++; + s16++; + } + gf_do_final_region_alignment(&rd); +} + +static +int gf_w16_group_init(gf_t *gf) +{ + int i, j, p; + struct gf_w16_group_4_4_data *d44; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + d44 = (struct gf_w16_group_4_4_data *) h->private; + d44->reduce[0] = 0; + for (i = 0; i < 16; i++) { + p = 0; + for (j = 0; j < 4; j++) { + if (i & (1 << j)) p ^= (h->prim_poly << j); + } + d44->reduce[p>>16] = (p&0xffff); + } + + SET_FUNCTION(gf,multiply,w32,gf_w16_group_4_4_multiply) + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,inverse,w32,NULL) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_group_4_4_region_multiply) + + return 1; +} + +int gf_w16_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) +{ + switch(mult_type) + { + case GF_MULT_TABLE: + return sizeof(gf_internal_t) + sizeof(struct gf_w16_lazytable_data) + 64; + break; + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: + return sizeof(gf_internal_t) + sizeof(struct gf_w16_bytwo_data); + break; + case GF_MULT_LOG_ZERO: + return sizeof(gf_internal_t) + sizeof(struct gf_w16_zero_logtable_data) + 64; + break; + case GF_MULT_LOG_TABLE: + return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; + break; + case GF_MULT_DEFAULT: + case GF_MULT_SPLIT_TABLE: + if (arg1 == 8 && arg2 == 8) { + return sizeof(gf_internal_t) + sizeof(struct gf_w16_split_8_8_data) + 64; + } else if ((arg1 == 8 && arg2 == 16) || (arg2 == 8 && arg1 == 16)) { + return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; + } else if (mult_type == GF_MULT_DEFAULT || + (arg1 == 4 && arg2 == 16) || (arg2 == 4 && arg1 == 16)) { + return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64; + } + return 0; + break; + case GF_MULT_GROUP: + return sizeof(gf_internal_t) + sizeof(struct gf_w16_group_4_4_data) + 64; + break; + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; + case GF_MULT_SHIFT: + return sizeof(gf_internal_t); + break; + case GF_MULT_COMPOSITE: + return sizeof(gf_internal_t) + sizeof(struct gf_w16_composite_data) + 64; + break; + + default: + return 0; + } + return 0; +} + +int gf_w16_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; + } else { + + /* Allen: use the following primitive polynomial to make + carryless multiply work more efficiently for GF(2^16). 
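(Its nonzero terms below the leading 1 are confined to the low-order bits, leaving a longer run of zeros and therefore fewer carryless-multiply reduction passes.)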
+ + h->prim_poly = 0x1002d; + + The following is the traditional primitive polynomial for GF(2^16) */ + + h->prim_poly = 0x1100b; + } + } + + if (h->mult_type != GF_MULT_COMPOSITE) h->prim_poly |= (1 << 16); + + SET_FUNCTION(gf,multiply,w32,NULL) + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,inverse,w32,NULL) + SET_FUNCTION(gf,multiply_region,w32,NULL) + + switch(h->mult_type) { + case GF_MULT_LOG_ZERO: if (gf_w16_log_zero_init(gf) == 0) return 0; break; + case GF_MULT_LOG_TABLE: if (gf_w16_log_init(gf) == 0) return 0; break; + case GF_MULT_DEFAULT: + case GF_MULT_SPLIT_TABLE: if (gf_w16_split_init(gf) == 0) return 0; break; + case GF_MULT_TABLE: if (gf_w16_table_init(gf) == 0) return 0; break; + case GF_MULT_CARRY_FREE: if (gf_w16_cfm_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w16_shift_init(gf) == 0) return 0; break; + case GF_MULT_COMPOSITE: if (gf_w16_composite_init(gf) == 0) return 0; break; + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: if (gf_w16_bytwo_init(gf) == 0) return 0; break; + case GF_MULT_GROUP: if (gf_w16_group_init(gf) == 0) return 0; break; + default: return 0; + } + if (h->divide_type == GF_DIVIDE_EUCLID) { + SET_FUNCTION(gf,divide,w32,gf_w16_divide_from_inverse) + SET_FUNCTION(gf,inverse,w32,gf_w16_euclid) + } else if (h->divide_type == GF_DIVIDE_MATRIX) { + SET_FUNCTION(gf,divide,w32,gf_w16_divide_from_inverse) + SET_FUNCTION(gf,inverse,w32,gf_w16_matrix) + } + + if (gf->divide.w32 == NULL) { + SET_FUNCTION(gf,divide,w32,gf_w16_divide_from_inverse) + if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w16_euclid) + } + + if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w16_inverse_from_divide) + + if (h->region_type & GF_REGION_ALTMAP) { + if (h->mult_type == GF_MULT_COMPOSITE) { + SET_FUNCTION(gf,extract_word,w32,gf_w16_composite_extract_word) + } else { + SET_FUNCTION(gf,extract_word,w32,gf_w16_split_extract_word) + } + } else if (h->region_type == GF_REGION_CAUCHY) { + SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region) + SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word) + } else { + SET_FUNCTION(gf,extract_word,w32,gf_w16_extract_word) + } + if (gf->multiply_region.w32 == NULL) { + SET_FUNCTION(gf,multiply_region,w32,gf_w16_multiply_region_from_single) + } + return 1; +} + +/* Inline setup functions */ + +uint16_t *gf_w16_get_log_table(gf_t *gf) +{ + struct gf_w16_logtable_data *ltd; + + if (gf->multiply.w32 == gf_w16_log_multiply) { + ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + return (uint16_t *) ltd->log_tbl; + } + return NULL; +} + +uint16_t *gf_w16_get_mult_alog_table(gf_t *gf) +{ + gf_internal_t *h; + struct gf_w16_logtable_data *ltd; + + h = (gf_internal_t *) gf->scratch; + if (gf->multiply.w32 == gf_w16_log_multiply) { + ltd = (struct gf_w16_logtable_data *) h->private; + return (uint16_t *) ltd->antilog_tbl; + } + return NULL; +} + +uint16_t *gf_w16_get_div_alog_table(gf_t *gf) +{ + gf_internal_t *h; + struct gf_w16_logtable_data *ltd; + + h = (gf_internal_t *) gf->scratch; + if (gf->multiply.w32 == gf_w16_log_multiply) { + ltd = (struct gf_w16_logtable_data *) h->private; + return (uint16_t *) ltd->d_antilog; + } + return NULL; +} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w32.c b/src/erasure-code/jerasure/gf-complete/src/gf_w32.c new file mode 100644 index 000000000..976b68b2e --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_w32.c @@ -0,0 +1,2810 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field 
Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_w32.c + * + * Routines for 32-bit Galois fields + */ + + +#include "gf_int.h" +#include <stdio.h> +#include <stdlib.h> +#include "gf_w32.h" +#include "gf_cpu.h" + +#define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); } + +#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? " " : " ", blah[15-ii]); printf("\n"); } + +#define AB2(ip, am1 ,am2, b, t1, t2) {\ + t1 = (b << 1) & am1;\ + t2 = b & am2; \ + t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \ + b = (t1 ^ (t2 & ip));} + +#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\ + t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \ + t2 = _mm_and_si128(va, m2); \ + t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \ + va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); } + +static +inline +uint32_t gf_w32_inverse_from_divide (gf_t *gf, uint32_t a) +{ + return gf->divide.w32(gf, 1, a); +} + +static +inline +uint32_t gf_w32_divide_from_inverse (gf_t *gf, uint32_t a, uint32_t b) +{ + b = gf->inverse.w32(gf, b); + return gf->multiply.w32(gf, a, b); +} + +static +void +gf_w32_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int +xor) +{ + uint32_t i; + uint32_t *s32; + uint32_t *d32; + + s32 = (uint32_t *) src; + d32 = (uint32_t *) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + d32[i] ^= gf->multiply.w32(gf, val, s32[i]); + } + } else { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + d32[i] = gf->multiply.w32(gf, val, s32[i]); + } + } +} + +#if defined(INTEL_SSE4_PCLMUL) + +static +void +gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + + uint32_t i; + uint32_t *s32; + uint32_t *d32; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + s32 = (uint32_t *) src; + d32 = (uint32_t *) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } else { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) + +static +void +gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, uint32_t val, int
bytes, int xor) +{ + + uint32_t i; + uint32_t *s32; + uint32_t *d32; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + s32 = (uint32_t *) src; + d32 = (uint32_t *) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } else { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w32_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + uint32_t i; + uint32_t *s32; + uint32_t *d32; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + s32 = (uint32_t *) src; + d32 = (uint32_t *) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } else { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } +} +#endif + +static +inline +uint32_t gf_w32_euclid (gf_t 
*gf, uint32_t b) +{ + uint32_t e_i, e_im1, e_ip1; + uint32_t d_i, d_im1, d_ip1; + uint32_t y_i, y_im1, y_ip1; + uint32_t c_i; + + if (b == 0) return -1; + e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; + e_i = b; + d_im1 = 32; + for (d_i = d_im1-1; ((1 << d_i) & e_i) == 0; d_i--) ; + y_i = 1; + y_im1 = 0; + + while (e_i != 1) { + + e_ip1 = e_im1; + d_ip1 = d_im1; + c_i = 0; + + while (d_ip1 >= d_i) { + c_i ^= (1 << (d_ip1 - d_i)); + e_ip1 ^= (e_i << (d_ip1 - d_i)); + d_ip1--; + if (e_ip1 == 0) return 0; + while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; + } + + y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i); + y_im1 = y_i; + y_i = y_ip1; + + e_im1 = e_i; + d_im1 = d_i; + e_i = e_ip1; + d_i = d_ip1; + } + + return y_i; +} + +static +gf_val_32_t gf_w32_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + uint32_t *r32, rv; + + r32 = (uint32_t *) start; + rv = r32[index]; + return rv; +} + +static +gf_val_32_t gf_w32_composite_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + int sub_size; + gf_internal_t *h; + uint8_t *r8, *top; + uint32_t a, b, *r32; + gf_region_data rd; + + h = (gf_internal_t *) gf->scratch; + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); + r32 = (uint32_t *) start; + if (r32 + index < (uint32_t *) rd.d_start) return r32[index]; + if (r32 + index >= (uint32_t *) rd.d_top) return r32[index]; + index -= (((uint32_t *) rd.d_start) - r32); + r8 = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + sub_size = (top-r8)/2; + + a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index); + b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index); + return (a | (b << 16)); +} + +static +gf_val_32_t gf_w32_split_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + int i; + uint32_t *r32, rv; + uint8_t *r8; + gf_region_data rd; + + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64); + r32 = (uint32_t *) start; + if (r32 + index < (uint32_t *) rd.d_start) return r32[index]; + if (r32 + index >= (uint32_t *) rd.d_top) return r32[index]; + index -= (((uint32_t *) rd.d_start) - r32); + r8 = (uint8_t *) rd.d_start; + r8 += ((index & 0xfffffff0)*4); + r8 += (index & 0xf); + r8 += 48; + rv =0; + for (i = 0; i < 4; i++) { + rv <<= 8; + rv |= *r8; + r8 -= 16; + } + return rv; +} + + +static +inline +uint32_t gf_w32_matrix (gf_t *gf, uint32_t b) +{ + return gf_bitmatrix_inverse(b, 32, ((gf_internal_t *) (gf->scratch))->prim_poly); +} + +/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only + include it for completeness. It does have the feature that it requires no + extra memory. +*/ + +#if defined(INTEL_SSE4_PCLMUL) + +static +inline +gf_val_32_t +gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) +{ + gf_val_32_t rv = 0; + + __m128i a, b; + __m128i result; + __m128i w; + __m128i g, q; + gf_internal_t * h = gf->scratch; + uint64_t g_star, q_plus; + + q_plus = *(uint64_t *) h->private; + g_star = *((uint64_t *) h->private + 1); + + a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0); + b = _mm_insert_epi32 (a, b32, 0); + g = _mm_insert_epi64 (a, g_star, 0); + q = _mm_insert_epi64 (a, q_plus, 0); + + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0); + w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. 
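The two CLMULs above are a Barrett-style reduction: q_plus holds the quotient of x^64 divided by the field polynomial and g_star its low 32 bits, both precomputed in gf_w32_cfmgk_init().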
*/ + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + return rv; +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) + +static +void +gf_w32_cfmgk_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + + uint32_t i; + uint32_t *s32; + uint32_t *d32; + + __m128i a, b; + __m128i result; + __m128i w; + __m128i g, q; + gf_internal_t * h = gf->scratch; + uint64_t g_star, q_plus; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + q_plus = *(uint64_t *) h->private; + g_star = *((uint64_t *) h->private + 1); + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + g = _mm_insert_epi64 (a, g_star, 0); + q = _mm_insert_epi64 (a, q_plus, 0); + s32 = (uint32_t *) src; + d32 = (uint32_t *) dest; + + if (xor) { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0); + w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } else { + for (i = 0; i < bytes/sizeof(uint32_t); i++) { + b = _mm_insert_epi32 (a, s32[i], 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0); + w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0); + result = _mm_xor_si128 (result, w); + d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + } + } +} +#endif + + +#if defined(INTEL_SSE4_PCLMUL) + +static +inline +gf_val_32_t +gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) +{ + gf_val_32_t rv = 0; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + + a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0); + b = _mm_insert_epi32 (a, b32, 0); + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only + have to do the reduction at most twice, because (w-2)/z == 2. Where + z is equal to the number of zeros after the leading 1 + + _mm_clmulepi64_si128 is the carryless multiply operation. Here + _mm_srli_si128 shifts the result to the right by 4 bytes. This allows + us to multiply the prim_poly by the leading bits of the result. We + then xor the result of that operation back with the result.*/ + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. 
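After the two reduction passes every bit above position 31 is zero, so the reduced product sits entirely in the low 32 bits.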
*/ + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + return rv; +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) + +static +inline +gf_val_32_t +gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) +{ + gf_val_32_t rv = 0; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + + a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0); + b = _mm_insert_epi32 (a, b32, 0); + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + return rv; +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) + +static +inline +gf_val_32_t +gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32) +{ + gf_val_32_t rv = 0; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + + a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0); + b = _mm_insert_epi32 (a, b32, 0); + + prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. 
*/ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + return rv; +} +#endif + + +static +inline +uint32_t +gf_w32_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32) +{ + uint64_t product, i, pp, a, b, one; + gf_internal_t *h; + + a = a32; + b = b32; + h = (gf_internal_t *) gf->scratch; + one = 1; + pp = h->prim_poly | (one << 32); + + product = 0; + + for (i = 0; i < GF_FIELD_WIDTH; i++) { + if (a & (one << i)) product ^= (b << i); + } + for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { + if (product & (one << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); + } + return product; +} + + static +int gf_w32_cfmgk_init(gf_t *gf) +{ + SET_FUNCTION(gf,inverse,w32,gf_w32_euclid) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single) + +#if defined(INTEL_SSE4_PCLMUL) + if (gf_cpu_supports_intel_pclmul) { + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + SET_FUNCTION(gf,multiply,w32,gf_w32_cfmgk_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_cfmgk_multiply_region_from_single) + + uint64_t *q_plus = (uint64_t *) h->private; + uint64_t *g_star = (uint64_t *) h->private + 1; + + uint64_t tmp = h->prim_poly << 32; + *q_plus = 1ULL << 32; + + int i; + for(i = 63; i >= 32; i--) + if((1ULL << i) & tmp) + { + *q_plus |= 1ULL << (i-32); + tmp ^= h->prim_poly << (i-32); + } + + *g_star = h->prim_poly & ((1ULL << 32) - 1); + + return 1; + } +#endif + + return 0; +} + + static +int gf_w32_cfm_init(gf_t *gf) +{ + SET_FUNCTION(gf,inverse,w32,gf_w32_euclid) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single) + + /*Ben: We also check to see if the prim poly will work for pclmul */ + /*Ben: Check to see how many reduction steps it will take*/ + +#if defined(INTEL_SSE4_PCLMUL) + if (gf_cpu_supports_intel_pclmul) { + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + if ((0xfffe0000 & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_2) + }else if ((0xffc00000 & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_3) + }else if ((0xfe000000 & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_4) + } else { + return 0; + } + return 1; + } + #endif + + return 0; +} + + static +int gf_w32_shift_init(gf_t *gf) +{ + SET_FUNCTION(gf,inverse,w32,gf_w32_euclid) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single) + SET_FUNCTION(gf,multiply,w32,gf_w32_shift_multiply) + return 1; +} + +static + void +gf_w32_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h) +{ + uint32_t i; + uint32_t j; + + shift[0] = 0; + + for (i = 1; i < ((uint32_t)1 << h->arg1); i <<= 1) { + for (j = 0; j < i; j++) shift[i|j] = shift[j]^val; + if (val & GF_FIRST_BIT) { + val <<= 1; + val ^= h->prim_poly; + } else { + val <<= 1; + } + } +} + + static +void gf_w32_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int leftover, rs; + uint32_t p, l, ind, a32; + int bits_left; + int g_s; + gf_region_data rd; + uint32_t *s32, *d32, *top; + struct gf_w32_group_data *gd; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gd = (struct gf_w32_group_data *) 
h->private; + g_s = h->arg1; + gf_w32_group_set_shift_tables(gd->shift, val, h); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + gf_do_initial_region_alignment(&rd); + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + leftover = 32 % g_s; + if (leftover == 0) leftover = g_s; + + while (d32 < top) { + rs = 32 - leftover; + a32 = *s32; + ind = a32 >> rs; + a32 <<= leftover; + p = gd->shift[ind]; + + bits_left = rs; + rs = 32 - g_s; + + while (bits_left > 0) { + bits_left -= g_s; + ind = a32 >> rs; + a32 <<= g_s; + l = p >> rs; + p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s)); + } + if (xor) p ^= *d32; + *d32 = p; + d32++; + s32++; + } + gf_do_final_region_alignment(&rd); +} + + static +void gf_w32_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint32_t *s32, *d32, *top; + int i; + int leftover; + uint64_t p, l, r; + uint32_t a32, ind; + int g_s, g_r; + struct gf_w32_group_data *gd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_internal_t *h = (gf_internal_t *) gf->scratch; + g_s = h->arg1; + g_r = h->arg2; + gd = (struct gf_w32_group_data *) h->private; + gf_w32_group_set_shift_tables(gd->shift, val, h); + + leftover = GF_FIELD_WIDTH % g_s; + if (leftover == 0) leftover = g_s; + + gd = (struct gf_w32_group_data *) h->private; + gf_w32_group_set_shift_tables(gd->shift, val, h); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + gf_do_initial_region_alignment(&rd); + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + while (d32 < top) { + a32 = *s32; + ind = a32 >> (GF_FIELD_WIDTH - leftover); + p = gd->shift[ind]; + p <<= g_s; + a32 <<= leftover; + + i = (GF_FIELD_WIDTH - leftover); + while (i > g_s) { + ind = a32 >> (GF_FIELD_WIDTH-g_s); + p ^= gd->shift[ind]; + a32 <<= g_s; + p <<= g_s; + i -= g_s; + } + + ind = a32 >> (GF_FIELD_WIDTH-g_s); + p ^= gd->shift[ind]; + + for (i = gd->tshift ; i >= 0; i -= g_r) { + l = p & (gd->rmask << i); + r = gd->reduce[l >> (i+32)]; + r <<= (i); + p ^= r; + } + + if (xor) p ^= *d32; + *d32 = p; + d32++; + s32++; + } + gf_do_final_region_alignment(&rd); +} + +static +inline +gf_val_32_t +gf_w32_group_s_equals_r_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + int leftover, rs; + uint32_t p, l, ind, a32; + int bits_left; + int g_s; + + struct gf_w32_group_data *gd; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + g_s = h->arg1; + + gd = (struct gf_w32_group_data *) h->private; + gf_w32_group_set_shift_tables(gd->shift, b, h); + + leftover = 32 % g_s; + if (leftover == 0) leftover = g_s; + + rs = 32 - leftover; + a32 = a; + ind = a32 >> rs; + a32 <<= leftover; + p = gd->shift[ind]; + + bits_left = rs; + rs = 32 - g_s; + + while (bits_left > 0) { + bits_left -= g_s; + ind = a32 >> rs; + a32 <<= g_s; + l = p >> rs; + p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s)); + } + return p; +} + +static +inline +gf_val_32_t +gf_w32_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t p, l, ind, a32; + + struct gf_w32_group_data *d44; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + d44 = (struct gf_w32_group_data *) h->private; + gf_w32_group_set_shift_tables(d44->shift, b, h); + + a32 = a; + ind = a32 >> 28; + a32 <<= 4; + p = d44->shift[ind]; + ind = a32 >> 28; + a32 <<= 4; + l = p >> 28; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 
4)); + ind = a32 >> 28; + a32 <<= 4; + l = p >> 28; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); + ind = a32 >> 28; + a32 <<= 4; + l = p >> 28; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); + ind = a32 >> 28; + a32 <<= 4; + l = p >> 28; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); + ind = a32 >> 28; + a32 <<= 4; + l = p >> 28; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); + ind = a32 >> 28; + a32 <<= 4; + l = p >> 28; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); + ind = a32 >> 28; + l = p >> 28; + p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4)); + return p; +} + +static +inline +gf_val_32_t +gf_w32_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + int i; + int leftover; + uint64_t p, l, r; + uint32_t a32, ind; + int g_s, g_r; + struct gf_w32_group_data *gd; + + gf_internal_t *h = (gf_internal_t *) gf->scratch; + g_s = h->arg1; + g_r = h->arg2; + gd = (struct gf_w32_group_data *) h->private; + gf_w32_group_set_shift_tables(gd->shift, b, h); + + leftover = GF_FIELD_WIDTH % g_s; + if (leftover == 0) leftover = g_s; + + a32 = a; + ind = a32 >> (GF_FIELD_WIDTH - leftover); + p = gd->shift[ind]; + p <<= g_s; + a32 <<= leftover; + + i = (GF_FIELD_WIDTH - leftover); + while (i > g_s) { + ind = a32 >> (GF_FIELD_WIDTH-g_s); + p ^= gd->shift[ind]; + a32 <<= g_s; + p <<= g_s; + i -= g_s; + } + + ind = a32 >> (GF_FIELD_WIDTH-g_s); + p ^= gd->shift[ind]; + + for (i = gd->tshift ; i >= 0; i -= g_r) { + l = p & (gd->rmask << i); + r = gd->reduce[l >> (i+32)]; + r <<= (i); + p ^= r; + } + return p; +} + +static +inline +gf_val_32_t +gf_w32_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t prod, pp, bmask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + prod = 0; + bmask = 0x80000000; + + while (1) { + if (a & 1) prod ^= b; + a >>= 1; + if (a == 0) return prod; + if (b & bmask) { + b = ((b << 1) ^ pp); + } else { + b <<= 1; + } + } +} + +static +inline +gf_val_32_t +gf_w32_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t prod, pp, pmask, amask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + + prod = 0; + pmask = 0x80000000; + amask = 0x80000000; + + while (amask != 0) { + if (prod & pmask) { + prod = ((prod << 1) ^ pp); + } else { + prod <<= 1; + } + if (a & amask) prod ^= b; + amask >>= 1; + } + return prod; +} + +static +void +gf_w32_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, t1, t2, ta, prod, amask; + gf_region_data rd; + struct gf_w32_bytwo_data *btd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + + if (xor) { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = 0x80000000; + ta = *s64; + while (amask != 0) { + AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 ^= prod; + d64++; + s64++; + } + } else { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = 0x80000000; + ta = *s64; + while (amask != 0) { + AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 = prod; + d64++; + s64++; 
+ } + } + gf_do_final_region_alignment(&rd); +} + +#define BYTWO_P_ONESTEP {\ + SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ + t1 = _mm_and_si128(v, one); \ + t1 = _mm_sub_epi32(t1, one); \ + t1 = _mm_and_si128(t1, ta); \ + prod = _mm_xor_si128(prod, t1); \ + v = _mm_srli_epi64(v, 1); } + +#ifdef INTEL_SSE2 +static +void +gf_w32_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int i; + uint8_t *s8, *d8; + uint32_t vrev; + __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; + struct gf_w32_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + vrev = 0; + for (i = 0; i < 32; i++) { + vrev <<= 1; + if (!(val & ((gf_val_32_t)1 << i))) vrev |= 1; + } + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + pp = _mm_set1_epi32(btd->prim_poly&0xffffffff); + m1 = _mm_set1_epi32((btd->mask1)&0xffffffff); + m2 = _mm_set1_epi32((btd->mask2)&0xffffffff); + one = _mm_set1_epi32(1); + + while (d8 < (uint8_t *) rd.d_top) { + prod = _mm_setzero_si128(); + v = _mm_set1_epi32(vrev); + ta = _mm_load_si128((__m128i *) s8); + tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8); + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp)); + d8 += 16; + s8 += 16; + } + gf_do_final_region_alignment(&rd); +} +#endif + +static +void +gf_w32_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, t1, t2, ta, tb, prod; + struct gf_w32_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + + switch (val) { + case 2: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 3: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 4: + if (xor) { + while (d64 < (uint64_t *) 
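/* (Cases 2 through 5 here unroll the generic bit-scan: val=2 is one AB2 doubling, val=3 is ta ^ 2*ta, val=4 is two doublings, val=5 is ta ^ 4*ta; anything else falls through to the general loop in the default case.) */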
rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 5: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + break; + default: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + prod = *d64 ; + ta = *s64; + tb = val; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + } + *d64 = prod; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + prod = 0 ; + ta = *s64; + tb = val; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + } + *d64 = prod; + d64++; + s64++; + } + } + break; + } + gf_do_final_region_alignment(&rd); +} + +#ifdef INTEL_SSE2 +static +void +gf_w32_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w32_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi32(btd->prim_poly&0xffffffff); + m1 = _mm_set1_epi32((btd->mask1)&0xffffffff); + m2 = _mm_set1_epi32((btd->mask2)&0xffffffff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, m2, va, t1, t2); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w32_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w32_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi32(btd->prim_poly&0xffffffff); + m1 = _mm_set1_epi32((btd->mask1)&0xffffffff); + m2 = _mm_set1_epi32((btd->mask2)&0xffffffff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, m2, va, t1, t2); + vb = _mm_load_si128 ((__m128i *)(d8)); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + + +#ifdef INTEL_SSE2 +static +void +gf_w32_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint32_t itb; + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va, vb; + struct gf_w32_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + if (val == 2) { + if (xor) { + gf_w32_bytwo_b_sse_region_2_xor(&rd, btd); + } else { + gf_w32_bytwo_b_sse_region_2_noxor(&rd, btd); + } + gf_do_final_region_alignment(&rd); + return; + } + + s8 = (uint8_t 
*) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + pp = _mm_set1_epi32(btd->prim_poly&0xffffffff); + m1 = _mm_set1_epi32((btd->mask1)&0xffffffff); + m2 = _mm_set1_epi32((btd->mask2)&0xffffffff); + + while (d8 < (uint8_t *) rd.d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8)); + itb = val; + while (1) { + if (itb & 1) vb = _mm_xor_si128(vb, va); + itb >>= 1; + if (itb == 0) break; + SSE_AB2(pp, m1, m2, va, t1, t2); + } + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } + + gf_do_final_region_alignment(&rd); +} +#endif + +static +int gf_w32_bytwo_init(gf_t *gf) +{ + gf_internal_t *h; + uint64_t ip, m1, m2; + struct gf_w32_bytwo_data *btd; + + h = (gf_internal_t *) gf->scratch; + btd = (struct gf_w32_bytwo_data *) (h->private); + ip = h->prim_poly & 0xffffffff; + m1 = 0xfffffffe; + m2 = 0x80000000; + btd->prim_poly = 0; + btd->mask1 = 0; + btd->mask2 = 0; + + while (ip != 0) { + btd->prim_poly |= ip; + btd->mask1 |= m1; + btd->mask2 |= m2; + ip <<= GF_FIELD_WIDTH; + m1 <<= GF_FIELD_WIDTH; + m2 <<= GF_FIELD_WIDTH; + } + + if (h->mult_type == GF_MULT_BYTWO_p) { + SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply) + #ifdef INTEL_SSE2 + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_sse_multiply_region) + } else { + #endif + SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_nosse_multiply_region) + if(h->region_type & GF_REGION_SIMD) + return 0; + #ifdef INTEL_SSE2 + } + #endif + } else { + SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_b_multiply) + #ifdef INTEL_SSE2 + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_sse_multiply_region) + } else { + #endif + SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_nosse_multiply_region) + if(h->region_type & GF_REGION_SIMD) + return 0; + #ifdef INTEL_SSE2 + } + #endif + } + + SET_FUNCTION(gf,inverse,w32,gf_w32_euclid) + return 1; +} + +static +inline +uint32_t +gf_w32_split_8_8_multiply (gf_t *gf, uint32_t a32, uint32_t b32) +{ + uint32_t product, i, j, mask, tb; + gf_internal_t *h; + struct gf_w32_split_8_8_data *d8; + + h = (gf_internal_t *) gf->scratch; + d8 = (struct gf_w32_split_8_8_data *) h->private; + product = 0; + mask = 0xff; + + for (i = 0; i < 4; i++) { + tb = b32; + for (j = 0; j < 4; j++) { + product ^= d8->tables[i+j][a32&mask][tb&mask]; + tb >>= 8; + } + a32 >>= 8; + } + return product; +} + +static +inline +void +gf_w32_split_8_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + gf_internal_t *h; + uint32_t *s32, *d32, *top, p, a, v; + struct gf_split_8_32_lazy_data *d8; + struct gf_w32_split_8_8_data *d88; + uint32_t *t[4]; + int i, j, k, change; + uint32_t pp; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + if (h->arg1 == 32 || h->arg2 == 32 || h->mult_type == GF_MULT_DEFAULT) { + d8 = (struct gf_split_8_32_lazy_data *) h->private; + for (i = 0; i < 4; i++) t[i] = d8->tables[i]; + change = (val != d8->last_value); + if (change) d8->last_value = val; + } else { + d88 = (struct gf_w32_split_8_8_data *) h->private; + for (i = 0; i < 4; i++) t[i] = d88->region_tables[i]; + change = (val != d88->last_value); + if (change) d88->last_value = val; + } + pp = h->prim_poly; + + gf_set_region_data(&rd, gf, src, dest, 
bytes, val, xor, 4); + gf_do_initial_region_alignment(&rd); + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + if (change) { + v = val; + for (i = 0; i < 4; i++) { + t[i][0] = 0; + for (j = 1; j < 256; j <<= 1) { + for (k = 0; k < j; k++) { + t[i][k^j] = (v ^ t[i][k]); + } + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); + } + } + } + + while (d32 < top) { + p = (xor) ? *d32 : 0; + a = *s32; + i = 0; + while (a != 0) { + v = (a & 0xff); + p ^= t[i][v]; + a >>= 8; + i++; + } + *d32 = p; + d32++; + s32++; + } + gf_do_final_region_alignment(&rd); +} + +static +inline +void +gf_w32_split_16_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + gf_internal_t *h; + uint32_t *s32, *d32, *top, p, a, v; + struct gf_split_16_32_lazy_data *d16; + uint32_t *t[2]; + int i, j, k, change; + uint32_t pp; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + d16 = (struct gf_split_16_32_lazy_data *) h->private; + for (i = 0; i < 2; i++) t[i] = d16->tables[i]; + change = (val != d16->last_value); + if (change) d16->last_value = val; + + pp = h->prim_poly; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + gf_do_initial_region_alignment(&rd); + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + if (change) { + v = val; + for (i = 0; i < 2; i++) { + t[i][0] = 0; + for (j = 1; j < (1 << 16); j <<= 1) { + for (k = 0; k < j; k++) { + t[i][k^j] = (v ^ t[i][k]); + } + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); + } + } + } + + while (d32 < top) { + p = (xor) ? *d32 : 0; + a = *s32; + i = 0; + while (a != 0 && i < 2) { + v = (a & 0xffff); + p ^= t[i][v]; + a >>= 16; + i++; + } + *d32 = p; + d32++; + s32++; + } + gf_do_final_region_alignment(&rd); +} + +static +void +gf_w32_split_2_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + gf_internal_t *h; + struct gf_split_2_32_lazy_data *ld; + int i; + uint32_t pp, v, v2, s, *s32, *d32, *top; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + gf_do_initial_region_alignment(&rd); + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + ld = (struct gf_split_2_32_lazy_data *) h->private; + + if (ld->last_value != val) { + v = val; + for (i = 0; i < 16; i++) { + v2 = (v << 1); + if (v & GF_FIRST_BIT) v2 ^= pp; + ld->tables[i][0] = 0; + ld->tables[i][1] = v; + ld->tables[i][2] = v2; + ld->tables[i][3] = (v2 ^ v); + v = (v2 << 1); + if (v2 & GF_FIRST_BIT) v ^= pp; + } + } + ld->last_value = val; + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + while (d32 != top) { + v = (xor) ? 
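/* (After the rebuild above, ld->tables[i][c] == c * val * x^(2i) for c in 0..3, giving sixteen four-entry tables, one per 2-bit chunk of a source word, so each pass of this loop costs one lookup and xor per nonzero chunk.) */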
*d32 : 0; + s = *s32; + i = 0; + while (s != 0) { + v ^= ld->tables[i][s&3]; + s >>= 2; + i++; + } + *d32 = v; + d32++; + s32++; + } + gf_do_final_region_alignment(&rd); +} + +#ifdef INTEL_SSSE3 +static +void +gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + gf_internal_t *h; + int i, tindex; + uint32_t pp, v, v2, *s32, *d32, *top; + __m128i vi, si, pi, shuffler, tables[16], adder, xi, mask1, mask2; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + v = val; + for (i = 0; i < 16; i++) { + v2 = (v << 1); + if (v & GF_FIRST_BIT) v2 ^= pp; + tables[i] = _mm_set_epi32(v2 ^ v, v2, v, 0); + v = (v2 << 1); + if (v2 & GF_FIRST_BIT) v ^= pp; + } + + shuffler = _mm_set_epi8(0xc, 0xc, 0xc, 0xc, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); + adder = _mm_set_epi8(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); + mask1 = _mm_set1_epi8(0x3); + mask2 = _mm_set1_epi8(0xc); + + while (d32 != top) { + pi = (xor) ? _mm_load_si128 ((__m128i *) d32) : _mm_setzero_si128(); + vi = _mm_load_si128((__m128i *) s32); + + tindex = 0; + for (i = 0; i < 4; i++) { + si = _mm_shuffle_epi8(vi, shuffler); + + xi = _mm_and_si128(si, mask1); + xi = _mm_slli_epi16(xi, 2); + xi = _mm_xor_si128(xi, adder); + pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); + tindex++; + + xi = _mm_and_si128(si, mask2); + xi = _mm_xor_si128(xi, adder); + pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); + si = _mm_srli_epi16(si, 2); + tindex++; + + xi = _mm_and_si128(si, mask2); + xi = _mm_xor_si128(xi, adder); + pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); + si = _mm_srli_epi16(si, 2); + tindex++; + + xi = _mm_and_si128(si, mask2); + xi = _mm_xor_si128(xi, adder); + pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi)); + tindex++; + + vi = _mm_srli_epi32(vi, 8); + } + _mm_store_si128((__m128i *) d32, pi); + d32 += 4; + s32 += 4; + } + + gf_do_final_region_alignment(&rd); + +} +#endif + +static +void +gf_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + gf_internal_t *h; + struct gf_split_4_32_lazy_data *ld; + int i, j, k; + uint32_t pp, v, s, *s32, *d32, *top; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + ld = (struct gf_split_4_32_lazy_data *) h->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + gf_do_initial_region_alignment(&rd); + + if (ld->last_value != val) { + v = val; + for (i = 0; i < 8; i++) { + ld->tables[i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[i][k^j] = (v ^ ld->tables[i][k]); + } + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); + } + } + } + ld->last_value = val; + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + while (d32 != top) { + v = (xor) ? 
*d32 : 0; + s = *s32; + i = 0; + while (s != 0) { + v ^= ld->tables[i][s&0xf]; + s >>= 4; + i++; + } + *d32 = v; + d32++; + s32++; + } + gf_do_final_region_alignment(&rd); +} + +#ifdef INTEL_SSSE3 +static +void +gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + gf_internal_t *h; + int i, j, k; + uint32_t pp, v, *s32, *d32, *top; + __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3; + struct gf_split_4_32_lazy_data *ld; + uint8_t btable[16]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64); + gf_do_initial_region_alignment(&rd); + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + ld = (struct gf_split_4_32_lazy_data *) h->private; + + v = val; + for (i = 0; i < 8; i++) { + ld->tables[i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[i][k^j] = (v ^ ld->tables[i][k]); + } + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); + } + for (j = 0; j < 4; j++) { + for (k = 0; k < 16; k++) { + btable[k] = (uint8_t) ld->tables[i][k]; + ld->tables[i][k] >>= 8; + } + tables[i][j] = _mm_loadu_si128((__m128i *) btable); + } + } + + mask1 = _mm_set1_epi8(0xf); + + if (xor) { + while (d32 != top) { + p0 = _mm_load_si128 ((__m128i *) d32); + p1 = _mm_load_si128 ((__m128i *) (d32+4)); + p2 = _mm_load_si128 ((__m128i *) (d32+8)); + p3 = _mm_load_si128 ((__m128i *) (d32+12)); + + v0 = _mm_load_si128((__m128i *) s32); s32 += 4; + v1 = _mm_load_si128((__m128i *) s32); s32 += 4; + v2 = _mm_load_si128((__m128i *) s32); s32 += 4; + v3 = _mm_load_si128((__m128i *) s32); s32 += 4; + + si = _mm_and_si128(v0, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si)); + + v0 = _mm_srli_epi32(v0, 4); + si = _mm_and_si128(v0, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); + + si = _mm_and_si128(v1, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); + + v1 = _mm_srli_epi32(v1, 4); + si = _mm_and_si128(v1, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); + + si = _mm_and_si128(v2, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); + + v2 = _mm_srli_epi32(v2, 4); + si = _mm_and_si128(v2, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si)); + p2 = 
_mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si)); + + si = _mm_and_si128(v3, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si)); + + v3 = _mm_srli_epi32(v3, 4); + si = _mm_and_si128(v3, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si)); + + _mm_store_si128((__m128i *) d32, p0); + _mm_store_si128((__m128i *) (d32+4), p1); + _mm_store_si128((__m128i *) (d32+8), p2); + _mm_store_si128((__m128i *) (d32+12), p3); + d32 += 16; + } + } else { + while (d32 != top) { + + v0 = _mm_load_si128((__m128i *) s32); s32 += 4; + v1 = _mm_load_si128((__m128i *) s32); s32 += 4; + v2 = _mm_load_si128((__m128i *) s32); s32 += 4; + v3 = _mm_load_si128((__m128i *) s32); s32 += 4; + + si = _mm_and_si128(v0, mask1); + p0 = _mm_shuffle_epi8(tables[0][0], si); + p1 = _mm_shuffle_epi8(tables[0][1], si); + p2 = _mm_shuffle_epi8(tables[0][2], si); + p3 = _mm_shuffle_epi8(tables[0][3], si); + + v0 = _mm_srli_epi32(v0, 4); + si = _mm_and_si128(v0, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); + + si = _mm_and_si128(v1, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); + + v1 = _mm_srli_epi32(v1, 4); + si = _mm_and_si128(v1, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); + + si = _mm_and_si128(v2, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); + + v2 = _mm_srli_epi32(v2, 4); + si = _mm_and_si128(v2, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si)); + + si = _mm_and_si128(v3, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si)); + + v3 = _mm_srli_epi32(v3, 4); + si = _mm_and_si128(v3, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si)); + + _mm_store_si128((__m128i *) d32, p0); + _mm_store_si128((__m128i *) (d32+4), p1); + 
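/* (In the ALTMAP layout assumed here, each 64-byte block stores the sixteen low bytes of its sixteen 32-bit words, then the sixteen next bytes, and so on; p0..p3 are those four byte planes, which is what lets a single 16-way _mm_shuffle_epi8 per nibble feed a whole plane.) */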
_mm_store_si128((__m128i *) (d32+8), p2); + _mm_store_si128((__m128i *) (d32+12), p3); + d32 += 16; + } + } + + gf_do_final_region_alignment(&rd); +} +#endif + + +#ifdef INTEL_SSSE3 +static +void +gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + gf_internal_t *h; + int i, j, k; + uint32_t pp, v, *s32, *d32, *top, tmp_table[16]; + __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8; + __m128i tv1, tv2, tv3, tv0; + uint8_t btable[16]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64); + gf_do_initial_region_alignment(&rd); + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + v = val; + for (i = 0; i < 8; i++) { + tmp_table[0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + tmp_table[k^j] = (v ^ tmp_table[k]); + } + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); + } + for (j = 0; j < 4; j++) { + for (k = 0; k < 16; k++) { + btable[k] = (uint8_t) tmp_table[k]; + tmp_table[k] >>= 8; + } + tables[i][j] = _mm_loadu_si128((__m128i *) btable); + } + } + + mask1 = _mm_set1_epi8(0xf); + mask8 = _mm_set1_epi16(0xff); + + if (xor) { + while (d32 != top) { + v0 = _mm_load_si128((__m128i *) s32); s32 += 4; + v1 = _mm_load_si128((__m128i *) s32); s32 += 4; + v2 = _mm_load_si128((__m128i *) s32); s32 += 4; + v3 = _mm_load_si128((__m128i *) s32); s32 += 4; + + p0 = _mm_srli_epi16(v0, 8); + p1 = _mm_srli_epi16(v1, 8); + p2 = _mm_srli_epi16(v2, 8); + p3 = _mm_srli_epi16(v3, 8); + + tv0 = _mm_and_si128(v0, mask8); + tv1 = _mm_and_si128(v1, mask8); + tv2 = _mm_and_si128(v2, mask8); + tv3 = _mm_and_si128(v3, mask8); + + v0 = _mm_packus_epi16(p1, p0); + v1 = _mm_packus_epi16(tv1, tv0); + v2 = _mm_packus_epi16(p3, p2); + v3 = _mm_packus_epi16(tv3, tv2); + + p0 = _mm_srli_epi16(v0, 8); + p1 = _mm_srli_epi16(v1, 8); + p2 = _mm_srli_epi16(v2, 8); + p3 = _mm_srli_epi16(v3, 8); + + tv0 = _mm_and_si128(v0, mask8); + tv1 = _mm_and_si128(v1, mask8); + tv2 = _mm_and_si128(v2, mask8); + tv3 = _mm_and_si128(v3, mask8); + + v0 = _mm_packus_epi16(p2, p0); + v1 = _mm_packus_epi16(p3, p1); + v2 = _mm_packus_epi16(tv2, tv0); + v3 = _mm_packus_epi16(tv3, tv1); + + si = _mm_and_si128(v0, mask1); + p0 = _mm_shuffle_epi8(tables[6][0], si); + p1 = _mm_shuffle_epi8(tables[6][1], si); + p2 = _mm_shuffle_epi8(tables[6][2], si); + p3 = _mm_shuffle_epi8(tables[6][3], si); + + v0 = _mm_srli_epi32(v0, 4); + si = _mm_and_si128(v0, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si)); + + si = _mm_and_si128(v1, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); + + v1 = _mm_srli_epi32(v1, 4); + si = _mm_and_si128(v1, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], 
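/* (Unlike the ALTMAP variant above, this path keeps words in standard order in memory: the packus sequence at the top of the loop byte-slices each 64-byte block on the fly, with the permuted table order, 6,7,4,5,..., compensating for where each byte plane lands, and the unpack sequence at the bottom restores standard word order before the store.) */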
si)); + + si = _mm_and_si128(v2, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); + + v2 = _mm_srli_epi32(v2, 4); + si = _mm_and_si128(v2, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); + + si = _mm_and_si128(v3, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si)); + + v3 = _mm_srli_epi32(v3, 4); + si = _mm_and_si128(v3, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); + + tv0 = _mm_unpackhi_epi8(p1, p3); + tv1 = _mm_unpackhi_epi8(p0, p2); + tv2 = _mm_unpacklo_epi8(p1, p3); + tv3 = _mm_unpacklo_epi8(p0, p2); + + p0 = _mm_unpackhi_epi8(tv1, tv0); + p1 = _mm_unpacklo_epi8(tv1, tv0); + p2 = _mm_unpackhi_epi8(tv3, tv2); + p3 = _mm_unpacklo_epi8(tv3, tv2); + + v0 = _mm_load_si128 ((__m128i *) d32); + v1 = _mm_load_si128 ((__m128i *) (d32+4)); + v2 = _mm_load_si128 ((__m128i *) (d32+8)); + v3 = _mm_load_si128 ((__m128i *) (d32+12)); + + p0 = _mm_xor_si128(p0, v0); + p1 = _mm_xor_si128(p1, v1); + p2 = _mm_xor_si128(p2, v2); + p3 = _mm_xor_si128(p3, v3); + + _mm_store_si128((__m128i *) d32, p0); + _mm_store_si128((__m128i *) (d32+4), p1); + _mm_store_si128((__m128i *) (d32+8), p2); + _mm_store_si128((__m128i *) (d32+12), p3); + d32 += 16; + } + } else { + while (d32 != top) { + v0 = _mm_load_si128((__m128i *) s32); s32 += 4; + v1 = _mm_load_si128((__m128i *) s32); s32 += 4; + v2 = _mm_load_si128((__m128i *) s32); s32 += 4; + v3 = _mm_load_si128((__m128i *) s32); s32 += 4; + + p0 = _mm_srli_epi16(v0, 8); + p1 = _mm_srli_epi16(v1, 8); + p2 = _mm_srli_epi16(v2, 8); + p3 = _mm_srli_epi16(v3, 8); + + tv0 = _mm_and_si128(v0, mask8); + tv1 = _mm_and_si128(v1, mask8); + tv2 = _mm_and_si128(v2, mask8); + tv3 = _mm_and_si128(v3, mask8); + + v0 = _mm_packus_epi16(p1, p0); + v1 = _mm_packus_epi16(tv1, tv0); + v2 = _mm_packus_epi16(p3, p2); + v3 = _mm_packus_epi16(tv3, tv2); + + p0 = _mm_srli_epi16(v0, 8); + p1 = _mm_srli_epi16(v1, 8); + p2 = _mm_srli_epi16(v2, 8); + p3 = _mm_srli_epi16(v3, 8); + + tv0 = _mm_and_si128(v0, mask8); + tv1 = _mm_and_si128(v1, mask8); + tv2 = _mm_and_si128(v2, mask8); + tv3 = _mm_and_si128(v3, mask8); + + v0 = _mm_packus_epi16(p2, p0); + v1 = _mm_packus_epi16(p3, p1); + v2 = _mm_packus_epi16(tv2, tv0); + v3 = _mm_packus_epi16(tv3, tv1); + + si = _mm_and_si128(v0, mask1); + p0 = _mm_shuffle_epi8(tables[6][0], si); + p1 = _mm_shuffle_epi8(tables[6][1], si); + p2 = _mm_shuffle_epi8(tables[6][2], si); + p3 = _mm_shuffle_epi8(tables[6][3], si); + + v0 = _mm_srli_epi32(v0, 4); + si = _mm_and_si128(v0, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si)); + + si = 
_mm_and_si128(v1, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si)); + + v1 = _mm_srli_epi32(v1, 4); + si = _mm_and_si128(v1, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si)); + + si = _mm_and_si128(v2, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si)); + + v2 = _mm_srli_epi32(v2, 4); + si = _mm_and_si128(v2, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si)); + + si = _mm_and_si128(v3, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si)); + + v3 = _mm_srli_epi32(v3, 4); + si = _mm_and_si128(v3, mask1); + p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si)); + p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si)); + p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si)); + p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si)); + + tv0 = _mm_unpackhi_epi8(p1, p3); + tv1 = _mm_unpackhi_epi8(p0, p2); + tv2 = _mm_unpacklo_epi8(p1, p3); + tv3 = _mm_unpacklo_epi8(p0, p2); + + p0 = _mm_unpackhi_epi8(tv1, tv0); + p1 = _mm_unpacklo_epi8(tv1, tv0); + p2 = _mm_unpackhi_epi8(tv3, tv2); + p3 = _mm_unpacklo_epi8(tv3, tv2); + + _mm_store_si128((__m128i *) d32, p0); + _mm_store_si128((__m128i *) (d32+4), p1); + _mm_store_si128((__m128i *) (d32+8), p2); + _mm_store_si128((__m128i *) (d32+12), p3); + d32 += 16; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +static +int gf_w32_split_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_split_2_32_lazy_data *ld2; + struct gf_split_4_32_lazy_data *ld4; + struct gf_w32_split_8_8_data *d8; + struct gf_split_8_32_lazy_data *d32; + struct gf_split_16_32_lazy_data *d16; + uint32_t p, basep; + int i, j, exp; + + h = (gf_internal_t *) gf->scratch; + + /* Defaults */ + + SET_FUNCTION(gf,inverse,w32,gf_w32_euclid) + + /* JSP: First handle single multiplication: + If args == 8, then we're doing split 8 8. + Otherwise, if PCLMUL, we use that. + Otherwise, we use bytwo_p. 
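       The three CLM variants are not interchangeable: the masks checked
       below verify that enough zeros follow the leading one of the
       polynomial for the carry-free reduction to finish in 2, 3, or 4
       folds, which is what the _2/_3/_4 suffixes denote.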
+ */ + + if (h->arg1 == 8 && h->arg2 == 8) { + SET_FUNCTION(gf,multiply,w32,gf_w32_split_8_8_multiply) +#if defined(INTEL_SSE4_PCLMUL) + } else if (gf_cpu_supports_intel_pclmul) { + if ((0xfffe0000 & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2) + } else if ((0xffc00000 & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3) + } else if ((0xfe000000 & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4) + } +#endif + } else { + SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply) + } + + /* Easy cases: 16/32 and 2/32 */ + + if ((h->arg1 == 16 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 16)) { + d16 = (struct gf_split_16_32_lazy_data *) h->private; + d16->last_value = 0; + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_16_32_lazy_multiply_region) + return 1; + } + + if ((h->arg1 == 2 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 2)) { + ld2 = (struct gf_split_2_32_lazy_data *) h->private; + ld2->last_value = 0; + #ifdef INTEL_SSSE3 + if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_sse_multiply_region) + } else { + #endif + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_multiply_region) + if(h->region_type & GF_REGION_SIMD) return 0; + #ifdef INTEL_SSSE3 + } + #endif + return 1; + } + + /* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */ + + + if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) || + ((gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) && h->mult_type == GF_REGION_DEFAULT)) { + ld4 = (struct gf_split_4_32_lazy_data *) h->private; + ld4->last_value = 0; + if ((h->region_type & GF_REGION_NOSIMD) || !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_multiply_region) + } else if (gf_cpu_supports_arm_neon) { +#ifdef ARM_NEON + gf_w32_neon_split_init(gf); +#endif + } else if (h->region_type & GF_REGION_ALTMAP) { +#ifdef INTEL_SSSE3 + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_altmap_multiply_region) +#endif + } else { +#ifdef INTEL_SSSE3 + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_multiply_region) +#endif + } + return 1; + } + + /* 8/32 or Default + no SSE */ + + if ((h->arg1 == 8 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 8) || + h->mult_type == GF_MULT_DEFAULT) { + d32 = (struct gf_split_8_32_lazy_data *) h->private; + d32->last_value = 0; + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_8_32_lazy_multiply_region) + return 1; + } + + /* Finally, if args == 8, then we have to set up the tables here. 
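     The layout built below is d8->tables[exp][a][b] == (a*b) * x^(8*exp) in
     GF(2^32): basep steps through x^0, x^8, ..., x^48, each [i][1] column is
     filled by doubling (even i) or a single xor (odd i), and the rows follow
     the same pattern. A full 32-bit multiply is then the xor of the sixteen
     byte-pair lookups done in gf_w32_split_8_8_multiply.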
*/ + + if (h->arg1 == 8 && h->arg2 == 8) { + d8 = (struct gf_w32_split_8_8_data *) h->private; + d8->last_value = 0; + SET_FUNCTION(gf,multiply,w32,gf_w32_split_8_8_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_8_32_lazy_multiply_region) + basep = 1; + for (exp = 0; exp < 7; exp++) { + for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0; + for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0; + d8->tables[exp][1][1] = basep; + for (i = 2; i < 256; i++) { + if (i&1) { + p = d8->tables[exp][i^1][1]; + d8->tables[exp][i][1] = p ^ basep; + } else { + p = d8->tables[exp][i>>1][1]; + d8->tables[exp][i][1] = GF_MULTBY_TWO(p); + } + } + for (i = 1; i < 256; i++) { + p = d8->tables[exp][i][1]; + for (j = 1; j < 256; j++) { + if (j&1) { + d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p; + } else { + d8->tables[exp][i][j] = GF_MULTBY_TWO(d8->tables[exp][i][j>>1]); + } + } + } + for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep); + } + return 1; + } + + /* If we get here, then the arguments were bad. */ + + return 0; +} + +static +int gf_w32_group_init(gf_t *gf) +{ + uint32_t i, j, p, index; + struct gf_w32_group_data *gd; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + uint32_t g_r, g_s; + + g_s = h->arg1; + g_r = h->arg2; + + gd = (struct gf_w32_group_data *) h->private; + gd->shift = (uint32_t *) (&(gd->memory)); + gd->reduce = gd->shift + (1 << g_s); + + gd->rmask = (1 << g_r) - 1; + gd->rmask <<= 32; + + gd->tshift = 32 % g_s; + if (gd->tshift == 0) gd->tshift = g_s; + gd->tshift = (32 - gd->tshift); + gd->tshift = ((gd->tshift-1)/g_r) * g_r; + + gd->reduce[0] = 0; + for (i = 0; i < ((uint32_t)1 << g_r); i++) { + p = 0; + index = 0; + for (j = 0; j < g_r; j++) { + if (i & (1 << j)) { + p ^= (h->prim_poly << j); + index ^= (1 << j); + index ^= (h->prim_poly >> (32-j)); + } + } + gd->reduce[index] = p; + } + + if (g_s == g_r) { + SET_FUNCTION(gf,multiply,w32,gf_w32_group_s_equals_r_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_group_s_equals_r_multiply_region) + } else { + SET_FUNCTION(gf,multiply,w32,gf_w32_group_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_group_multiply_region) + } + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,inverse,w32,gf_w32_euclid) + + return 1; +} + + +static +uint32_t +gf_w32_composite_multiply_recursive(gf_t *gf, uint32_t a, uint32_t b) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint32_t b0 = b & 0x0000ffff; + uint32_t b1 = (b & 0xffff0000) >> 16; + uint32_t a0 = a & 0x0000ffff; + uint32_t a1 = (a & 0xffff0000) >> 16; + uint32_t a1b1; + uint32_t rv; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + rv = ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16) | (base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1); + return rv; +} + +/* JSP: This could be made faster. Someday, when I'm bored. 
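   For reference, the algebra: with a = a1 x + a0 and b = b1 x + b0 over
   GF(2^16), and modulus p(x) = x^2 + s x + 1 (s being h->prim_poly, as in
   the division note below), a*b = a1 b1 x^2 + (a1 b0 + a0 b1) x + a0 b0
   = (a1 b0 + a0 b1 + s a1 b1) x + (a0 b0 + a1 b1), since x^2 = s x + 1 in
   characteristic 2. Those are exactly the high and low halves computed above.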
*/ + +static +uint32_t +gf_w32_composite_multiply_inline(gf_t *gf, uint32_t a, uint32_t b) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + uint32_t b0 = b & 0x0000ffff; + uint32_t b1 = b >> 16; + uint32_t a0 = a & 0x0000ffff; + uint32_t a1 = a >> 16; + uint32_t a1b1, prod; + uint16_t *log, *alog; + struct gf_w32_composite_data *cd; + + cd = (struct gf_w32_composite_data *) h->private; + log = cd->log; + alog = cd->alog; + + a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1); + prod = GF_W16_INLINE_MULT(log, alog, a1, b0); + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1); + prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly); + prod <<= 16; + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0); + prod ^= a1b1; + return prod; +} + +/* + * Composite field division trick (explained in 2007 tech report) + * + * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1 + * + * let c = b^-1 + * + * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0) + * + * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1 + * + * let d = b1c1 and d+1 = b0c0 + * + * solve s*b1c1+b1c0+b0c1 = 0 + * + * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1 + * + * c0 = (d+1)b0^-1 + * c1 = d*b1^-1 + * + * a / b = a * c + */ + +static +uint32_t +gf_w32_composite_inverse(gf_t *gf, uint32_t a) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint16_t a0 = a & 0x0000ffff; + uint16_t a1 = (a & 0xffff0000) >> 16; + uint16_t c0, c1, d, tmp; + uint32_t c; + uint16_t a0inv, a1inv; + + if (a0 == 0) { + a1inv = base_gf->inverse.w32(base_gf, a1); + c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); + c1 = a1inv; + } else if (a1 == 0) { + c0 = base_gf->inverse.w32(base_gf, a0); + c1 = 0; + } else { + a1inv = base_gf->inverse.w32(base_gf, a1); + a0inv = base_gf->inverse.w32(base_gf, a0); + + d = base_gf->multiply.w32(base_gf, a1, a0inv); + + tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly); + tmp = base_gf->inverse.w32(base_gf, tmp); + + d = base_gf->multiply.w32(base_gf, d, tmp); + + c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv); + c1 = base_gf->multiply.w32(base_gf, d, a1inv); + } + + c = c0 | (c1 << 16); + + return c; +} + +static +void +gf_w32_composite_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint32_t b0 = val & 0x0000ffff; + uint32_t b1 = (val & 0xffff0000) >> 16; + uint32_t *s32, *d32, *top; + uint16_t a0, a1, a1b1, *log, *alog; + uint32_t prod; + gf_region_data rd; + struct gf_w32_composite_data *cd; + + cd = (struct gf_w32_composite_data *) h->private; + log = cd->log; + alog = cd->alog; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + + s32 = rd.s_start; + d32 = rd.d_start; + top = rd.d_top; + + if (log == NULL) { + if (xor) { + while (d32 < top) { + a0 = *s32 & 0x0000ffff; + a1 = (*s32 & 0xffff0000) >> 16; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + *d32 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16)); + s32++; + d32++; + } + } else { + while (d32 < top) { + a0 = *s32 & 0x0000ffff; + a1 = (*s32 & 0xffff0000) >> 16; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + *d32 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ 
base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16)); + s32++; + d32++; + } + } + } else { + if (xor) { + while (d32 < top) { + a0 = *s32 & 0x0000ffff; + a1 = (*s32 & 0xffff0000) >> 16; + a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1); + + prod = GF_W16_INLINE_MULT(log, alog, a1, b0); + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1); + prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly); + prod <<= 16; + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0); + prod ^= a1b1; + *d32 ^= prod; + s32++; + d32++; + } + } else { + while (d32 < top) { + a0 = *s32 & 0x0000ffff; + a1 = (*s32 & 0xffff0000) >> 16; + a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1); + + prod = GF_W16_INLINE_MULT(log, alog, a1, b0); + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1); + prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly); + prod <<= 16; + prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0); + prod ^= a1b1; + + *d32 = prod; + s32++; + d32++; + } + } + } +} + +static +void +gf_w32_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint16_t val0 = val & 0x0000ffff; + uint16_t val1 = (val & 0xffff0000) >> 16; + gf_region_data rd; + int sub_reg_size; + uint8_t *slow, *shigh; + uint8_t *dlow, *dhigh, *top; + + /* JSP: I want the two pointers aligned wrt each other on 16 byte + boundaries. So I'm going to make sure that the area on + which the two operate is a multiple of 32. Of course, that + junks up the mapping, but so be it -- that's why we have extract_word.... */ + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + slow = (uint8_t *) rd.s_start; + dlow = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + sub_reg_size = (top - dlow)/2; + shigh = slow + sub_reg_size; + dhigh = dlow + sub_reg_size; + + base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); + + gf_do_final_region_alignment(&rd); +} + +static +int gf_w32_composite_init(gf_t *gf) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + struct gf_w32_composite_data *cd; + + if (h->base_gf == NULL) return 0; + + cd = (struct gf_w32_composite_data *) h->private; + cd->log = gf_w16_get_log_table(h->base_gf); + cd->alog = gf_w16_get_mult_alog_table(h->base_gf); + + if (h->region_type & GF_REGION_ALTMAP) { + SET_FUNCTION(gf,multiply_region,w32,gf_w32_composite_multiply_region_alt) + } else { + SET_FUNCTION(gf,multiply_region,w32,gf_w32_composite_multiply_region) + } + + if (cd->log == NULL) { + SET_FUNCTION(gf,multiply,w32,gf_w32_composite_multiply_recursive) + } else { + SET_FUNCTION(gf,multiply,w32,gf_w32_composite_multiply_inline) + } + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,inverse,w32,gf_w32_composite_inverse) + + return 1; +} + + + +int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) +{ + switch(mult_type) + { + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: + return sizeof(gf_internal_t) + sizeof(struct gf_w32_bytwo_data) + 64; + break; + case GF_MULT_GROUP: + return sizeof(gf_internal_t) + 
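/* group scratch: gd->memory holds the shift table (1 << arg1 entries) and the reduce table (1 << arg2 entries) back to back, as laid out in gf_w32_group_init, plus 64 bytes of alignment slack: */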
sizeof(struct gf_w32_group_data) + + sizeof(uint32_t) * (1 << arg1) + + sizeof(uint32_t) * (1 << arg2) + 64; + break; + case GF_MULT_DEFAULT: + + case GF_MULT_SPLIT_TABLE: + if (arg1 == 8 && arg2 == 8){ + return sizeof(gf_internal_t) + sizeof(struct gf_w32_split_8_8_data) + 64; + } + if ((arg1 == 16 && arg2 == 32) || (arg2 == 16 && arg1 == 32)) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_16_32_lazy_data) + 64; + } + if ((arg1 == 2 && arg2 == 32) || (arg2 == 2 && arg1 == 32)) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64; + } + if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) || + (mult_type == GF_MULT_DEFAULT && !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon))) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64; + } + if ((arg1 == 4 && arg2 == 32) || + (arg2 == 4 && arg1 == 32) || + mult_type == GF_MULT_DEFAULT) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64; + } + return 0; + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; + case GF_MULT_CARRY_FREE_GK: + return sizeof(gf_internal_t) + sizeof(uint64_t)*2; + break; + case GF_MULT_SHIFT: + return sizeof(gf_internal_t); + break; + case GF_MULT_COMPOSITE: + return sizeof(gf_internal_t) + sizeof(struct gf_w32_composite_data) + 64; + break; + + default: + return 0; + } + return 0; +} + +int gf_w32_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; /* This shouldn't happen */ + } else { + + /* Allen: use the following primitive polynomial to make carryless multiply work more efficiently for GF(2^32).*/ + + /* h->prim_poly = 0xc5; */ + + /* Allen: The following is the traditional primitive polynomial for GF(2^32) */ + + h->prim_poly = 0x400007; + } + } + + /* No leading one */ + + if(h->mult_type != GF_MULT_COMPOSITE) h->prim_poly &= 0xffffffff; + + SET_FUNCTION(gf,multiply,w32,NULL) + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,inverse,w32,NULL) + SET_FUNCTION(gf,multiply_region,w32,NULL) + + switch(h->mult_type) { + case GF_MULT_CARRY_FREE: if (gf_w32_cfm_init(gf) == 0) return 0; break; + case GF_MULT_CARRY_FREE_GK: if (gf_w32_cfmgk_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w32_shift_init(gf) == 0) return 0; break; + case GF_MULT_COMPOSITE: if (gf_w32_composite_init(gf) == 0) return 0; break; + case GF_MULT_DEFAULT: + case GF_MULT_SPLIT_TABLE: if (gf_w32_split_init(gf) == 0) return 0; break; + case GF_MULT_GROUP: if (gf_w32_group_init(gf) == 0) return 0; break; + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: if (gf_w32_bytwo_init(gf) == 0) return 0; break; + default: return 0; + } + if (h->divide_type == GF_DIVIDE_EUCLID) { + SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse) + SET_FUNCTION(gf,inverse,w32,gf_w32_euclid) + } else if (h->divide_type == GF_DIVIDE_MATRIX) { + SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse) + SET_FUNCTION(gf,inverse,w32,gf_w32_matrix) + } + + if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) { + SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse) + } + if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) { + SET_FUNCTION(gf,inverse,w32,gf_w32_inverse_from_divide) + } + if (h->region_type == GF_REGION_CAUCHY) { + 
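/* extract_word has to match the region layout chosen above: Cauchy regions are bit-sliced and ALTMAP regions permute words, so each gets an extractor that undoes its own mapping (this is the extract_word caveat JSP notes in the composite ALTMAP code). */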
SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word) + SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region) + } else if (h->region_type & GF_REGION_ALTMAP) { + if (h->mult_type == GF_MULT_COMPOSITE) { + SET_FUNCTION(gf,extract_word,w32,gf_w32_composite_extract_word) + } else { + SET_FUNCTION(gf,extract_word,w32,gf_w32_split_extract_word) + } + } else { + SET_FUNCTION(gf,extract_word,w32,gf_w32_extract_word) + } + return 1; +} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w4.c b/src/erasure-code/jerasure/gf-complete/src/gf_w4.c new file mode 100644 index 000000000..3a7b95316 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_w4.c @@ -0,0 +1,2047 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_w4.c + * + * Routines for 4-bit Galois fields + */ + +#include "gf_int.h" +#include <stdio.h> +#include <stdlib.h> +#include "gf_w4.h" +#include "gf_cpu.h" + +#define AB2(ip, am1 ,am2, b, t1, t2) {\ + t1 = (b << 1) & am1;\ + t2 = b & am2; \ + t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \ + b = (t1 ^ (t2 & ip));} + +// ToDo(KMG/JSP): Why is 0x88 hard-coded? +#define SSE_AB2(pp, m1, va, t1, t2) {\ + t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \ + t2 = _mm_and_si128(va, _mm_set1_epi8(0x88)); \ + t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \ + va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); } + +/* ------------------------------------------------------------ + JSP: These are basic and work from multiple implementations. + */ + +static +inline +gf_val_32_t gf_w4_inverse_from_divide (gf_t *gf, gf_val_32_t a) +{ + return gf->divide.w32(gf, 1, a); +} + +static +inline +gf_val_32_t gf_w4_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + b = gf->inverse.w32(gf, b); + return gf->multiply.w32(gf, a, b); +} + +static +inline +gf_val_32_t gf_w4_euclid (gf_t *gf, gf_val_32_t b) +{ + gf_val_32_t e_i, e_im1, e_ip1; + gf_val_32_t d_i, d_im1, d_ip1; + gf_val_32_t y_i, y_im1, y_ip1; + gf_val_32_t c_i; + + if (b == 0) return -1; + e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; + e_i = b; + d_im1 = 4; + for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ; + y_i = 1; + y_im1 = 0; + + while (e_i != 1) { + e_ip1 = e_im1; + d_ip1 = d_im1; + c_i = 0; + + while (d_ip1 >= d_i) { + c_i ^= (1 << (d_ip1 - d_i)); + e_ip1 ^= (e_i << (d_ip1 - d_i)); + if (e_ip1 == 0) return 0; + while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; + } + + y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i); + y_im1 = y_i; + y_i = y_ip1; + + e_im1 = e_i; + d_im1 = d_i; + e_i = e_ip1; + d_i = d_ip1; + } + + return y_i; +} + +static +gf_val_32_t gf_w4_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + uint8_t *r8, v; + + r8 = (uint8_t *) start; + v = r8[index/2]; + if (index%2) { + return v >> 4; + } else { + return v&0xf; + } +} + + +static +inline +gf_val_32_t gf_w4_matrix (gf_t *gf, gf_val_32_t b) +{ + return gf_bitmatrix_inverse(b, 4, ((gf_internal_t *) (gf->scratch))->prim_poly); +} + + +static +inline +gf_val_32_t +gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint8_t product, i, pp; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + product = 0; + + for (i = 0; i < GF_FIELD_WIDTH; i++) { + if (a & (1 << i)) product ^= (b << i); + } + for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { + if (product & (1 << i)) product ^= (pp <<
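/* A worked example of this reduction, for illustration: with pp = 0x13 (x^4+x+1), a = 6, b = 7, the first loop leaves product = (7<<1) ^ (7<<2) = 0xe ^ 0x1c = 0x12; bit 4 of 0x12 is set, so product ^= pp << 0, giving 0x12 ^ 0x13 = 0x01. So 6*7 == 1 under this polynomial, i.e. 6 and 7 are inverses. */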
(i-GF_FIELD_WIDTH)); + } + return product; +} + +/* Ben: This function works, but it is 33% slower than the normal shift mult */ + +#if defined(INTEL_SSE4_PCLMUL) +static +inline +gf_val_32_t +gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4) +{ + gf_val_32_t rv = 0; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a4, 0); + b = _mm_insert_epi32 (a, b4, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1fULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + /* Ben/JSP: Do prim_poly reduction once. We are guaranteed that we will have to do the reduction only once, because (w-2)/z == 1, where z is equal to the number of zeros after the leading 1. + + _mm_clmulepi64_si128 is the carryless multiply operation. Here + _mm_srli_epi64 shifts the result to the right by 4 bits. This allows + us to multiply the prim_poly by the leading bits of the result. We + then xor the result of that operation back with the result. */ + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_epi64 (result, 4), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + return rv; +} +#endif + +static +void +gf_w4_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + *d8 ^= (gf->multiply.w32(gf, val, (*s8 & 0xf)) | + ((gf->multiply.w32(gf, val, (*s8 >> 4))) << 4)); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + *d8 = (gf->multiply.w32(gf, val, (*s8 & 0xf)) | + ((gf->multiply.w32(gf, val, (*s8 >> 4))) << 4)); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +} + +/* ------------------------------------------------------------ + IMPLEMENTATION: LOG_TABLE: + + JSP: This is a basic log-antilog implementation. + I'm not going to spend any time optimizing it because the + other techniques are faster for both single and region + operations. + */ + +static +inline +gf_val_32_t +gf_w4_log_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_logtable_data *ltd; + + ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private; + return (a == 0 || b == 0) ?
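/* the lookup works with no mod (GF_FIELD_SIZE-1) step: the init below writes each antilog entry twice, at i and at i + GF_FIELD_SIZE - 1, so any log sum up to 28 indexes the table directly */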
0 : ltd->antilog_tbl[(unsigned)(ltd->log_tbl[a] + ltd->log_tbl[b])]; +} + +static +inline +gf_val_32_t +gf_w4_log_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + int log_sum = 0; + struct gf_logtable_data *ltd; + + if (a == 0 || b == 0) return 0; + ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private; + + log_sum = ltd->log_tbl[a] - ltd->log_tbl[b]; + return (ltd->antilog_tbl_div[log_sum]); +} + +static +void +gf_w4_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int i; + uint8_t lv, b, c; + uint8_t *s8, *d8; + + struct gf_logtable_data *ltd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + ltd = (struct gf_logtable_data *) ((gf_internal_t *) (gf->scratch))->private; + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + + lv = ltd->log_tbl[val]; + + for (i = 0; i < bytes; i++) { + c = (xor) ? d8[i] : 0; + b = (s8[i] >> GF_FIELD_WIDTH); + c ^= (b == 0) ? 0 : (ltd->antilog_tbl[lv + ltd->log_tbl[b]] << GF_FIELD_WIDTH); + b = (s8[i] & 0xf); + c ^= (b == 0) ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[b]]; + d8[i] = c; + } +} + +static +int gf_w4_log_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_logtable_data *ltd; + int i, b; + + h = (gf_internal_t *) gf->scratch; + ltd = h->private; + + for (i = 0; i < GF_FIELD_SIZE; i++) + ltd->log_tbl[i]=0; + + ltd->antilog_tbl_div = ltd->antilog_tbl + (GF_FIELD_SIZE-1); + b = 1; + i = 0; + do { + if (ltd->log_tbl[b] != 0 && i != 0) { + fprintf(stderr, "Cannot construct log table: Polynomial is not primitive.\n\n"); + return 0; + } + ltd->log_tbl[b] = i; + ltd->antilog_tbl[i] = b; + ltd->antilog_tbl[i+GF_FIELD_SIZE-1] = b; + b <<= 1; + i++; + if (b & GF_FIELD_SIZE) b = b ^ h->prim_poly; + } while (b != 1); + + if (i != GF_FIELD_SIZE - 1) { + _gf_errno = GF_E_LOGPOLY; + return 0; + } + + SET_FUNCTION(gf,inverse,w32,gf_w4_inverse_from_divide) + SET_FUNCTION(gf,divide,w32,gf_w4_log_divide) + SET_FUNCTION(gf,multiply,w32,gf_w4_log_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w4_log_multiply_region) + return 1; +} + +/* ------------------------------------------------------------ + IMPLEMENTATION: SINGLE TABLE: JSP. + */ + +static +inline +gf_val_32_t +gf_w4_single_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_single_table_data *std; + + std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private; + return std->mult[a][b]; +} + +static +inline +gf_val_32_t +gf_w4_single_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_single_table_data *std; + + std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private; + return std->div[a][b]; +} + +static +void +gf_w4_single_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int i; + uint8_t b, c; + uint8_t *s8, *d8; + + struct gf_single_table_data *std; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private; + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + + for (i = 0; i < bytes; i++) { + c = (xor) ? 
d8[i] : 0; + b = (s8[i] >> GF_FIELD_WIDTH); + c ^= (std->mult[val][b] << GF_FIELD_WIDTH); + b = (s8[i] & 0xf); + c ^= (std->mult[val][b]); + d8[i] = c; + } +} + +#define MM_PRINT(s, r) { uint8_t blah[16]; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (i = 0; i < 16; i++) printf(" %02x", blah[i]); printf("\n"); } + +#ifdef INTEL_SSSE3 +static +void +gf_w4_single_table_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint8_t *base, *sptr, *dptr, *top; + __m128i tl, loset, r, va, th; + + struct gf_single_table_data *std; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + + std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private; + base = (uint8_t *) std->mult; + base += (val << GF_FIELD_WIDTH); + + gf_do_initial_region_alignment(&rd); + + tl = _mm_loadu_si128((__m128i *)base); + th = _mm_slli_epi64(tl, 4); + loset = _mm_set1_epi8 (0x0f); + + sptr = rd.s_start; + dptr = rd.d_start; + top = rd.s_top; + + while (sptr < (uint8_t *) top) { + va = _mm_load_si128 ((__m128i *)(sptr)); + r = _mm_and_si128 (loset, va); + r = _mm_shuffle_epi8 (tl, r); + va = _mm_srli_epi64 (va, 4); + va = _mm_and_si128 (loset, va); + va = _mm_shuffle_epi8 (th, va); + r = _mm_xor_si128 (r, va); + va = (xor) ? _mm_load_si128 ((__m128i *)(dptr)) : _mm_setzero_si128(); + r = _mm_xor_si128 (r, va); + _mm_store_si128 ((__m128i *)(dptr), r); + dptr += 16; + sptr += 16; + } + gf_do_final_region_alignment(&rd); + +} +#endif + +static +int gf_w4_single_table_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_single_table_data *std; + int a, b, prod; + + + h = (gf_internal_t *) gf->scratch; + std = (struct gf_single_table_data *)h->private; + + bzero(std->mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + + for (a = 1; a < GF_FIELD_SIZE; a++) { + for (b = 1; b < GF_FIELD_SIZE; b++) { + prod = gf_w4_shift_multiply(gf, a, b); + std->mult[a][b] = prod; + std->div[prod][b] = a; + } + } + + SET_FUNCTION(gf,inverse,w32,NULL) + SET_FUNCTION(gf,divide,w32,gf_w4_single_table_divide) + SET_FUNCTION(gf,multiply,w32,gf_w4_single_table_multiply) + #if defined(INTEL_SSSE3) + if (gf_cpu_supports_intel_ssse3 && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) { + SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_sse_multiply_region) + } else { + #elif defined(ARM_NEON) + if (gf_cpu_supports_arm_neon && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) { + gf_w4_neon_single_table_init(gf); + } else { + #endif + SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region) + if (h->region_type & GF_REGION_SIMD) return 0; + #if defined(INTEL_SSSE3) || defined(ARM_NEON) + } + #endif + + return 1; +} + +/* ------------------------------------------------------------ + IMPLEMENTATION: DOUBLE TABLE: JSP. 
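+
+   Editor's note (annotation, not part of the upstream source): the
+   double table trades memory for fewer lookups. std->mult[val] is a
+   256-entry row indexed by a whole source byte, i.e. two packed 4-bit
+   symbols, so one lookup multiplies both nibbles at once. For example,
+   with prim_poly 0x13 and val = 3, source byte 0x21 maps to
+   (3*2 << 4) | (3*1) = 0x63, since 3*2 = 6 and 3*1 = 3 in GF(16).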
+ */ + +static +inline +gf_val_32_t +gf_w4_double_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_double_table_data *std; + + std = (struct gf_double_table_data *) ((gf_internal_t *) (gf->scratch))->private; + return std->mult[a][b]; +} + +static +inline +gf_val_32_t +gf_w4_double_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_double_table_data *std; + + std = (struct gf_double_table_data *) ((gf_internal_t *) (gf->scratch))->private; + return std->div[a][b]; +} + +static +void +gf_w4_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int i; + uint8_t *s8, *d8, *base; + gf_region_data rd; + struct gf_double_table_data *std; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + + std = (struct gf_double_table_data *) ((gf_internal_t *) (gf->scratch))->private; + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + base = (uint8_t *) std->mult; + base += (val << GF_DOUBLE_WIDTH); + + if (xor) { + for (i = 0; i < bytes; i++) d8[i] ^= base[s8[i]]; + } else { + for (i = 0; i < bytes; i++) d8[i] = base[s8[i]]; + } +} + +static +int gf_w4_double_table_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_double_table_data *std; + int a, b, c, prod, ab; + uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; + + h = (gf_internal_t *) gf->scratch; + std = (struct gf_double_table_data *)h->private; + + bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + + for (a = 1; a < GF_FIELD_SIZE; a++) { + for (b = 1; b < GF_FIELD_SIZE; b++) { + prod = gf_w4_shift_multiply(gf, a, b); + mult[a][b] = prod; + std->div[prod][b] = a; + } + } + bzero(std->mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE * GF_FIELD_SIZE); + for (a = 0; a < GF_FIELD_SIZE; a++) { + for (b = 0; b < GF_FIELD_SIZE; b++) { + ab = mult[a][b]; + for (c = 0; c < GF_FIELD_SIZE; c++) { + std->mult[a][(b << 4) | c] = ((ab << 4) | mult[a][c]); + } + } + } + + SET_FUNCTION(gf,inverse,w32,NULL) + SET_FUNCTION(gf,divide,w32,gf_w4_double_table_divide) + SET_FUNCTION(gf,multiply,w32,gf_w4_double_table_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w4_double_table_multiply_region) + return 1; +} + + +static +inline +gf_val_32_t +gf_w4_quad_table_lazy_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_quad_table_lazy_data *std; + + std = (struct gf_quad_table_lazy_data *) ((gf_internal_t *) (gf->scratch))->private; + return std->div[a][b]; +} + +static +inline +gf_val_32_t +gf_w4_quad_table_lazy_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_quad_table_lazy_data *std; + + std = (struct gf_quad_table_lazy_data *) ((gf_internal_t *) (gf->scratch))->private; + return std->smult[a][b]; +} + +static +inline +gf_val_32_t +gf_w4_quad_table_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_quad_table_data *std; + + std = (struct gf_quad_table_data *) ((gf_internal_t *) (gf->scratch))->private; + return std->div[a][b]; +} + +static +inline +gf_val_32_t +gf_w4_quad_table_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_quad_table_data *std; + uint16_t v; + + std = (struct gf_quad_table_data *) ((gf_internal_t *) (gf->scratch))->private; + v = std->mult[a][b]; + return v; +} + +static +void +gf_w4_quad_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + 
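+  /* Editor's note (annotation, not part of the upstream source): the
+     quad table extends the same idea to 16-bit lookups: base points at
+     a 65536-entry uint16_t table, so one lookup multiplies four packed
+     4-bit symbols. The non-lazy variant precomputes all 16 such tables
+     (16 * 65536 * 2 bytes, about 2 MB); the lazy variant keeps only a
+     16x16 scalar table and expands the 65536-entry table for the
+     requested val in the loop nest below, on every call. */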
uint16_t *base; + gf_region_data rd; + struct gf_quad_table_data *std; + struct gf_quad_table_lazy_data *ltd; + gf_internal_t *h; + int a, b, c, d, va, vb, vc, vd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) (gf->scratch); + if (h->region_type & GF_REGION_LAZY) { + ltd = (struct gf_quad_table_lazy_data *) ((gf_internal_t *) (gf->scratch))->private; + base = ltd->mult; + for (a = 0; a < 16; a++) { + va = (ltd->smult[val][a] << 12); + for (b = 0; b < 16; b++) { + vb = (ltd->smult[val][b] << 8); + for (c = 0; c < 16; c++) { + vc = (ltd->smult[val][c] << 4); + for (d = 0; d < 16; d++) { + vd = ltd->smult[val][d]; + base[(a << 12) | (b << 8) | (c << 4) | d ] = (va | vb | vc | vd); + } + } + } + } + } else { + std = (struct gf_quad_table_data *) ((gf_internal_t *) (gf->scratch))->private; + base = &(std->mult[val][0]); + } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + gf_two_byte_region_table_multiply(&rd, base); + gf_do_final_region_alignment(&rd); +} + +static +int gf_w4_quad_table_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_quad_table_data *std; + int prod, val, a, b, c, d, va, vb, vc, vd; + uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; + + h = (gf_internal_t *) gf->scratch; + std = (struct gf_quad_table_data *)h->private; + + bzero(mult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + + for (a = 1; a < GF_FIELD_SIZE; a++) { + for (b = 1; b < GF_FIELD_SIZE; b++) { + prod = gf_w4_shift_multiply(gf, a, b); + mult[a][b] = prod; + std->div[prod][b] = a; + } + } + + for (val = 0; val < 16; val++) { + for (a = 0; a < 16; a++) { + va = (mult[val][a] << 12); + for (b = 0; b < 16; b++) { + vb = (mult[val][b] << 8); + for (c = 0; c < 16; c++) { + vc = (mult[val][c] << 4); + for (d = 0; d < 16; d++) { + vd = mult[val][d]; + std->mult[val][(a << 12) | (b << 8) | (c << 4) | d ] = (va | vb | vc | vd); + } + } + } + } + } + + SET_FUNCTION(gf,inverse,w32,NULL) + SET_FUNCTION(gf,divide,w32,gf_w4_quad_table_divide) + SET_FUNCTION(gf,multiply,w32,gf_w4_quad_table_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w4_quad_table_multiply_region) + return 1; +} +static +int gf_w4_quad_table_lazy_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_quad_table_lazy_data *std; + int a, b, prod, loga, logb; + uint8_t log_tbl[GF_FIELD_SIZE]; + uint8_t antilog_tbl[GF_FIELD_SIZE*2]; + + h = (gf_internal_t *) gf->scratch; + std = (struct gf_quad_table_lazy_data *)h->private; + + b = 1; + for (a = 0; a < GF_MULT_GROUP_SIZE; a++) { + log_tbl[b] = a; + antilog_tbl[a] = b; + antilog_tbl[a+GF_MULT_GROUP_SIZE] = b; + b <<= 1; + if (b & GF_FIELD_SIZE) { + b = b ^ h->prim_poly; + } + } + + bzero(std->smult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + bzero(std->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + + for (a = 1; a < GF_FIELD_SIZE; a++) { + loga = log_tbl[a]; + for (b = 1; b < GF_FIELD_SIZE; b++) { + logb = log_tbl[b]; + prod = antilog_tbl[loga+logb]; + std->smult[a][b] = prod; + std->div[prod][b] = a; + } + } + + SET_FUNCTION(gf,inverse,w32,NULL) + SET_FUNCTION(gf,divide,w32,gf_w4_quad_table_lazy_divide) + SET_FUNCTION(gf,multiply,w32,gf_w4_quad_table_lazy_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w4_quad_table_multiply_region) + return 1; +} + +static +int gf_w4_table_init(gf_t *gf) +{ + int rt; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; 
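+  /* Editor's note (annotation, not part of the upstream source):
+     GF_MULT_DEFAULT falls back to the double table only when no SIMD
+     single-table routine is available; otherwise the region_type bits
+     select double, quad, or lazy-quad tables, and the 16x16 single
+     table (with its SSSE3/NEON region variants) is the default. */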
+ rt = (h->region_type); + + if (h->mult_type == GF_MULT_DEFAULT && + !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) + rt |= GF_REGION_DOUBLE_TABLE; + + if (rt & GF_REGION_DOUBLE_TABLE) { + return gf_w4_double_table_init(gf); + } else if (rt & GF_REGION_QUAD_TABLE) { + if (rt & GF_REGION_LAZY) { + return gf_w4_quad_table_lazy_init(gf); + } else { + return gf_w4_quad_table_init(gf); + } + } else { + return gf_w4_single_table_init(gf); + } + return 0; +} + +/* ------------------------------------------------------------ + JSP: GF_MULT_BYTWO_p and _b: See the paper. +*/ + +static +inline +gf_val_32_t +gf_w4_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t prod, pp, pmask, amask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + + prod = 0; + pmask = 0x8; + amask = 0x8; + + while (amask != 0) { + if (prod & pmask) { + prod = ((prod << 1) ^ pp); + } else { + prod <<= 1; + } + if (a & amask) prod ^= b; + amask >>= 1; + } + return prod; +} + +static +inline +gf_val_32_t +gf_w4_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t prod, pp, bmask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + prod = 0; + bmask = 0x8; + + while (1) { + if (a & 1) prod ^= b; + a >>= 1; + if (a == 0) return prod; + if (b & bmask) { + b = ((b << 1) ^ pp); + } else { + b <<= 1; + } + } +} + +static +void +gf_w4_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, t1, t2, ta, prod, amask; + gf_region_data rd; + struct gf_bytwo_data *btd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + + if (xor) { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = 0x8; + ta = *s64; + while (amask != 0) { + AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 ^= prod; + d64++; + s64++; + } + } else { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = 0x8; + ta = *s64; + while (amask != 0) { + AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 = prod; + d64++; + s64++; + } + } + gf_do_final_region_alignment(&rd); +} + +#define BYTWO_P_ONESTEP {\ + SSE_AB2(pp, m1, prod, t1, t2); \ + t1 = _mm_and_si128(v, one); \ + t1 = _mm_sub_epi8(t1, one); \ + t1 = _mm_and_si128(t1, ta); \ + prod = _mm_xor_si128(prod, t1); \ + v = _mm_srli_epi64(v, 1); } + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int i; + uint8_t *s8, *d8; + uint8_t vrev; + __m128i pp, m1, ta, prod, t1, t2, tp, one, v; + struct gf_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + vrev = 0; + for (i = 0; i < 4; i++) { + vrev <<= 1; + if (!(val & (1 << i))) vrev |= 1; + } + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t 
*) rd.d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + one = _mm_set1_epi8(1); + + while (d8 < (uint8_t *) rd.d_top) { + prod = _mm_setzero_si128(); + v = _mm_set1_epi8(vrev); + ta = _mm_load_si128((__m128i *) s8); + tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8); + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp)); + d8 += 16; + s8 += 16; + } + gf_do_final_region_alignment(&rd); +} +#endif + +/* +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint8_t *d8, *s8, tb; + __m128i pp, m1, m2, t1, t2, va, vb; + struct gf_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + m2 = _mm_set1_epi8((btd->mask2)&0xff); + + if (xor) { + while (d8 < (uint8_t *) rd.d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = _mm_load_si128 ((__m128i *)(d8)); + tb = val; + while (1) { + if (tb & 1) vb = _mm_xor_si128(vb, va); + tb >>= 1; + if (tb == 0) break; + SSE_AB2(pp, m1, m2, va, t1, t2); + } + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } + } else { + while (d8 < (uint8_t *) rd.d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = _mm_setzero_si128 (); + tb = val; + while (1) { + if (tb & 1) vb = _mm_xor_si128(vb, va); + tb >>= 1; + if (tb == 0) break; + t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); + t2 = _mm_and_si128(va, m2); + t2 = _mm_sub_epi64 ( + _mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); + va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); + } + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } + } + gf_do_final_region_alignment(&rd); +} +#endif +*/ + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, va, t1, t2); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, va, t1, t2); + vb = _mm_load_si128 ((__m128i *)(d8)); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_4_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = 
_mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_4_xor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); + vb = _mm_load_si128 ((__m128i *)(d8)); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_3_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = va; + SSE_AB2(pp, m1, va, t1, t2); + va = _mm_xor_si128(va, vb); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_3_xor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); + SSE_AB2(pp, m1, va, t1, t2); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_5_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = va; + SSE_AB2(pp, m1, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); + va = _mm_xor_si128(va, vb); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_5_xor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); + SSE_AB2(pp, m1, va, t1, t2); + SSE_AB2(pp, m1, va, t1, t2); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_7_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = 
_mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = va; + SSE_AB2(pp, m1, va, t1, t2); + vb = _mm_xor_si128(va, vb); + SSE_AB2(pp, m1, va, t1, t2); + va = _mm_xor_si128(va, vb); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_7_xor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); + SSE_AB2(pp, m1, va, t1, t2); + vb = _mm_xor_si128(vb, va); + SSE_AB2(pp, m1, va, t1, t2); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_6_noxor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, va, t1, t2); + vb = va; + SSE_AB2(pp, m1, va, t1, t2); + va = _mm_xor_si128(va, vb); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_region_6_xor(gf_region_data *rd, struct gf_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, va, t1, t2); + vb = _mm_xor_si128(_mm_load_si128 ((__m128i *)(d8)), va); + SSE_AB2(pp, m1, va, t1, t2); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint8_t *d8, *s8, tb; + __m128i pp, m1, m2, t1, t2, va, vb; + struct gf_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + switch (val) { + case 2: + if (!xor) { + gf_w4_bytwo_b_sse_region_2_noxor(&rd, btd); + } else { + gf_w4_bytwo_b_sse_region_2_xor(&rd, btd); + } + gf_do_final_region_alignment(&rd); + return; + case 3: + if (!xor) { + gf_w4_bytwo_b_sse_region_3_noxor(&rd, btd); + } else { + gf_w4_bytwo_b_sse_region_3_xor(&rd, btd); + } + gf_do_final_region_alignment(&rd); + return; + case 4: + if (!xor) { + gf_w4_bytwo_b_sse_region_4_noxor(&rd, btd); + } else { + gf_w4_bytwo_b_sse_region_4_xor(&rd, btd); + } + gf_do_final_region_alignment(&rd); + return; + case 5: + if (!xor) { + gf_w4_bytwo_b_sse_region_5_noxor(&rd, btd); + } else { + gf_w4_bytwo_b_sse_region_5_xor(&rd, 
btd); + } + gf_do_final_region_alignment(&rd); + return; + case 6: + if (!xor) { + gf_w4_bytwo_b_sse_region_6_noxor(&rd, btd); + } else { + gf_w4_bytwo_b_sse_region_6_xor(&rd, btd); + } + gf_do_final_region_alignment(&rd); + return; + case 7: + if (!xor) { + gf_w4_bytwo_b_sse_region_7_noxor(&rd, btd); + } else { + gf_w4_bytwo_b_sse_region_7_xor(&rd, btd); + } + gf_do_final_region_alignment(&rd); + return; + } + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + m2 = _mm_set1_epi8((btd->mask2)&0xff); + + if (xor) { + while (d8 < (uint8_t *) rd.d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = _mm_load_si128 ((__m128i *)(d8)); + tb = val; + while (1) { + if (tb & 1) vb = _mm_xor_si128(vb, va); + tb >>= 1; + if (tb == 0) break; + SSE_AB2(pp, m1, va, t1, t2); + } + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } + } else { + while (d8 < (uint8_t *) rd.d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = _mm_setzero_si128 (); + tb = val; + while (1) { + if (tb & 1) vb = _mm_xor_si128(vb, va); + tb >>= 1; + if (tb == 0) break; + t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); + t2 = _mm_and_si128(va, m2); + t2 = _mm_sub_epi64 ( + _mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); + va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); + } + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +static +void +gf_w4_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, t1, t2, ta, tb, prod; + struct gf_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + btd = (struct gf_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + + switch (val) { + case 1: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + *d64 ^= *s64; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + *d64 = *s64; + d64++; + s64++; + } + } + break; + case 2: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 3: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 4: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 5: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + 
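+          /* Editor's note (annotation, not part of the upstream
+             source): val == 5 == 0b101, so the product is a*4 ^ a*1:
+             prod holds the original word (the *1 term), and the two
+             AB2 calls below double every nibble twice to form the *4
+             term. */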
AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + break; + case 6: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + break; + case 7: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + break; + case 8: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 9: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 10: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 11: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 
^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 12: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 13: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 14: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 15: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + default: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + prod = *d64 ; + ta = *s64; + tb = val; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + } + *d64 = prod; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + prod = 0 ; + ta = *s64; + tb = val; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + } + *d64 = prod; + d64++; + s64++; + } + } + break; + } + 
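+  /* Editor's note (annotation, not part of the upstream source): each
+     case above is an unrolled addition chain. AB2 doubles all sixteen
+     4-bit symbols of a 64-bit word at once: mask1 (0xe in every nibble)
+     keeps the left shift from spilling across nibble boundaries, mask2
+     (0x8 in every nibble) flags nibbles whose high bit is set, and the
+     subtract trick inside AB2 widens each flag into an all-ones nibble
+     that selects where the replicated prim_poly is xored. The set bits
+     of val then determine which doublings of the source word are xored
+     into the product. */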
gf_do_final_region_alignment(&rd); +} + +static +int gf_w4_bytwo_init(gf_t *gf) +{ + gf_internal_t *h; + uint64_t ip, m1, m2; + struct gf_bytwo_data *btd; + + h = (gf_internal_t *) gf->scratch; + btd = (struct gf_bytwo_data *) (h->private); + ip = h->prim_poly & 0xf; + m1 = 0xe; + m2 = 0x8; + btd->prim_poly = 0; + btd->mask1 = 0; + btd->mask2 = 0; + + while (ip != 0) { + btd->prim_poly |= ip; + btd->mask1 |= m1; + btd->mask2 |= m2; + ip <<= GF_FIELD_WIDTH; + m1 <<= GF_FIELD_WIDTH; + m2 <<= GF_FIELD_WIDTH; + } + + if (h->mult_type == GF_MULT_BYTWO_p) { + SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_p_multiply) + #ifdef INTEL_SSE2 + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_sse_multiply_region) + } else { + #endif + SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_nosse_multiply_region) + if (h->region_type & GF_REGION_SIMD) + return 0; + #ifdef INTEL_SSE2 + } + #endif + } else { + SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_b_multiply) + #ifdef INTEL_SSE2 + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_sse_multiply_region) + } else { + #endif + SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_nosse_multiply_region) + if (h->region_type & GF_REGION_SIMD) + return 0; + #ifdef INTEL_SSE2 + } + #endif + } + return 1; +} + + +static +int gf_w4_cfm_init(gf_t *gf) +{ +#if defined(INTEL_SSE4_PCLMUL) + if (gf_cpu_supports_intel_pclmul) { + SET_FUNCTION(gf,multiply,w32,gf_w4_clm_multiply) + return 1; + } +#elif defined(ARM_NEON) + if (gf_cpu_supports_arm_neon) { + return gf_w4_neon_cfm_init(gf); + } +#endif + return 0; +} + +static +int gf_w4_shift_init(gf_t *gf) +{ + SET_FUNCTION(gf,multiply,w32,gf_w4_shift_multiply) + return 1; +} + +/* JSP: I'm putting all error-checking into gf_error_check(), so you don't + have to do error checking in scratch_size or in init */ + +int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) +{ + switch(mult_type) + { + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: + return sizeof(gf_internal_t) + sizeof(struct gf_bytwo_data); + break; + case GF_MULT_DEFAULT: + case GF_MULT_TABLE: + if (region_type == GF_REGION_CAUCHY) { + return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64; + } + + if (mult_type == GF_MULT_DEFAULT && + !(gf_cpu_supports_arm_neon || gf_cpu_supports_intel_ssse3)) + region_type = GF_REGION_DOUBLE_TABLE; + + if (region_type & GF_REGION_DOUBLE_TABLE) { + return sizeof(gf_internal_t) + sizeof(struct gf_double_table_data) + 64; + } else if (region_type & GF_REGION_QUAD_TABLE) { + if ((region_type & GF_REGION_LAZY) == 0) { + return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_data) + 64; + } else { + return sizeof(gf_internal_t) + sizeof(struct gf_quad_table_lazy_data) + 64; + } + } else { + return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64; + } + break; + + case GF_MULT_LOG_TABLE: + return sizeof(gf_internal_t) + sizeof(struct gf_logtable_data) + 64; + break; + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; + case GF_MULT_SHIFT: + return sizeof(gf_internal_t); + break; + default: + return 0; + } + return 0; +} + +int +gf_w4_init (gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + if (h->prim_poly == 0) h->prim_poly = 0x13; + h->prim_poly |= 0x10; + SET_FUNCTION(gf,multiply,w32,NULL) + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,inverse,w32,NULL) + 
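+  /* Editor's note (annotation, not part of the upstream source): all
+     handlers are cleared first; the mult_type switch below installs the
+     selected multiply, and the fallback blocks after it derive divide,
+     inverse, and multiply_region for anything the chosen init left
+     NULL. */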
SET_FUNCTION(gf,multiply_region,w32,NULL) + SET_FUNCTION(gf,extract_word,w32,gf_w4_extract_word) + + switch(h->mult_type) { + case GF_MULT_CARRY_FREE: if (gf_w4_cfm_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w4_shift_init(gf) == 0) return 0; break; + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: if (gf_w4_bytwo_init(gf) == 0) return 0; break; + case GF_MULT_LOG_TABLE: if (gf_w4_log_init(gf) == 0) return 0; break; + case GF_MULT_DEFAULT: + case GF_MULT_TABLE: if (gf_w4_table_init(gf) == 0) return 0; break; + default: return 0; + } + + if (h->divide_type == GF_DIVIDE_EUCLID) { + SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse) + SET_FUNCTION(gf,inverse,w32,gf_w4_euclid) + } else if (h->divide_type == GF_DIVIDE_MATRIX) { + SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse) + SET_FUNCTION(gf,inverse,w32,gf_w4_matrix) + } + + if (gf->divide.w32 == NULL) { + SET_FUNCTION(gf,divide,w32,gf_w4_divide_from_inverse) + if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w4_euclid) + } + + if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w4_inverse_from_divide) + + if (h->region_type == GF_REGION_CAUCHY) { + SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region) + SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word) + } + + if (gf->multiply_region.w32 == NULL) { + SET_FUNCTION(gf,multiply_region,w32,gf_w4_multiply_region_from_single) + } + + return 1; +} + +/* Inline setup functions */ + +uint8_t *gf_w4_get_mult_table(gf_t *gf) +{ + gf_internal_t *h; + struct gf_single_table_data *std; + + h = (gf_internal_t *) gf->scratch; + if (gf->multiply.w32 == gf_w4_single_table_multiply) { + std = (struct gf_single_table_data *) h->private; + return (uint8_t *) std->mult; + } + return NULL; +} + +uint8_t *gf_w4_get_div_table(gf_t *gf) +{ + gf_internal_t *h; + struct gf_single_table_data *std; + + h = (gf_internal_t *) gf->scratch; + if (gf->multiply.w32 == gf_w4_single_table_multiply) { + std = (struct gf_single_table_data *) h->private; + return (uint8_t *) std->div; + } + return NULL; +} + diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w64.c b/src/erasure-code/jerasure/gf-complete/src/gf_w64.c new file mode 100644 index 000000000..69e55dbd2 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_w64.c @@ -0,0 +1,2235 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_w64.c + * + * Routines for 64-bit Galois fields + */ + +#include "gf_int.h" +#include <stdio.h> +#include <stdlib.h> +#include "gf_w64.h" +#include "gf_cpu.h" + +static +inline +gf_val_64_t gf_w64_inverse_from_divide (gf_t *gf, gf_val_64_t a) +{ + return gf->divide.w64(gf, 1, a); +} + +#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ?
" " : " ", blah[15-ii]); printf("\n"); } + +static +inline +gf_val_64_t gf_w64_divide_from_inverse (gf_t *gf, gf_val_64_t a, gf_val_64_t b) +{ + b = gf->inverse.w64(gf, b); + return gf->multiply.w64(gf, a, b); +} + +static +void +gf_w64_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int +xor) +{ + uint32_t i; + gf_val_64_t *s64; + gf_val_64_t *d64; + + s64 = (gf_val_64_t *) src; + d64 = (gf_val_64_t *) dest; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + if (xor) { + for (i = 0; i < bytes/sizeof(gf_val_64_t); i++) { + d64[i] ^= gf->multiply.w64(gf, val, s64[i]); + } + } else { + for (i = 0; i < bytes/sizeof(gf_val_64_t); i++) { + d64[i] = gf->multiply.w64(gf, val, s64[i]); + } + } +} + +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w64_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int +xor) +{ + gf_val_64_t *s64, *d64, *top; + gf_region_data rd; + + __m128i a, b; + __m128i result, r1; + __m128i prim_poly; + __m128i w; + __m128i m1, m3, m4; + gf_internal_t * h = gf->scratch; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); + b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0); + m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff); + m3 = _mm_slli_si128(m1, 8); + m4 = _mm_slli_si128(m3, 4); + + s64 = (gf_val_64_t *) rd.s_start; + d64 = (gf_val_64_t *) rd.d_start; + top = (gf_val_64_t *) rd.d_top; + + if (xor) { + while (d64 != top) { + a = _mm_load_si128((__m128i *) s64); + result = _mm_clmulepi64_si128 (a, b, 1); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + r1 = _mm_xor_si128 (result, w); + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + result = _mm_unpacklo_epi64(result, r1); + + r1 = _mm_load_si128((__m128i *) d64); + result = _mm_xor_si128(r1, result); + _mm_store_si128((__m128i *) d64, result); + d64 += 2; + s64 += 2; + } + } else { + while (d64 != top) { + + a = _mm_load_si128((__m128i *) s64); + result = _mm_clmulepi64_si128 (a, b, 1); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + r1 = _mm_xor_si128 (result, w); + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + result = _mm_unpacklo_epi64(result, r1); + + _mm_store_si128((__m128i *) d64, result); + d64 += 2; + s64 += 2; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w64_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int +xor) +{ + gf_val_64_t *s64, *d64, *top; + gf_region_data rd; + + __m128i a, 
b; + __m128i result, r1; + __m128i prim_poly; + __m128i w; + __m128i m1, m3, m4; + gf_internal_t * h = gf->scratch; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); + b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0); + m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff); + m3 = _mm_slli_si128(m1, 8); + m4 = _mm_slli_si128(m3, 4); + + s64 = (gf_val_64_t *) rd.s_start; + d64 = (gf_val_64_t *) rd.d_start; + top = (gf_val_64_t *) rd.d_top; + + if (xor) { + while (d64 != top) { + a = _mm_load_si128((__m128i *) s64); + result = _mm_clmulepi64_si128 (a, b, 1); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + r1 = _mm_xor_si128 (result, w); + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + result = _mm_unpacklo_epi64(result, r1); + + r1 = _mm_load_si128((__m128i *) d64); + result = _mm_xor_si128(r1, result); + _mm_store_si128((__m128i *) d64, result); + d64 += 2; + s64 += 2; + } + } else { + while (d64 != top) { + a = _mm_load_si128((__m128i *) s64); + result = _mm_clmulepi64_si128 (a, b, 1); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + r1 = _mm_xor_si128 (result, w); + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); + result = _mm_xor_si128 (result, w); + + result = _mm_unpacklo_epi64(result, r1); + + _mm_store_si128((__m128i *) d64, result); + d64 += 2; + s64 += 2; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +static + inline +gf_val_64_t gf_w64_euclid (gf_t *gf, gf_val_64_t b) +{ + gf_val_64_t e_i, e_im1, e_ip1; + gf_val_64_t d_i, d_im1, d_ip1; + gf_val_64_t y_i, y_im1, y_ip1; + gf_val_64_t c_i; + gf_val_64_t one = 1; + + if (b == 0) return -1; + e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; + e_i = b; + d_im1 = 64; + for (d_i = d_im1-1; ((one << d_i) & e_i) == 0; d_i--) ; + y_i = 1; + y_im1 = 0; + + while (e_i != 1) { + + e_ip1 = e_im1; + d_ip1 = d_im1; + c_i = 0; + + while (d_ip1 >= d_i) { + c_i ^= (one << (d_ip1 - d_i)); + e_ip1 ^= (e_i << (d_ip1 - d_i)); + d_ip1--; + if (e_ip1 == 0) return 0; + while ((e_ip1 & (one << d_ip1)) == 0) d_ip1--; + } + + y_ip1 = y_im1 ^ gf->multiply.w64(gf, c_i, y_i); + y_im1 = y_i; + y_i = y_ip1; + + e_im1 = e_i; + d_im1 = d_i; + e_i = e_ip1; + d_i = d_ip1; + } + + return y_i; +} + +/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only + include it for completeness. It does have the feature that it requires no + extra memory. 
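+
+   Editor's note (annotation, not part of the upstream source): a
+   64x64 carryless product needs 128 bits, held here as the pair
+   (pl, pr). The reduction loop that follows scans pl from the top
+   bit down and, wherever a bit is set, xors in the primitive
+   polynomial aligned at that position (the polynomial is likewise
+   kept split across ppl and ppr).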
+*/ + +static +inline +gf_val_64_t +gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) +{ + uint64_t pl, pr, ppl, ppr, i, a, bl, br, one, lbit; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + /* Allen: set leading one of primitive polynomial */ + + a = a64; + bl = 0; + br = b64; + one = 1; + lbit = (one << 63); + + pl = 0; /* Allen: left side of product */ + pr = 0; /* Allen: right side of product */ + + /* Allen: unlike the corresponding functions for smaller word sizes, + * this loop carries out the initial carryless multiply by + * shifting b itself rather than simply looking at successively + * higher shifts of b */ + + for (i = 0; i < GF_FIELD_WIDTH; i++) { + if (a & (one << i)) { + pl ^= bl; + pr ^= br; + } + + bl <<= 1; + if (br & lbit) bl ^= 1; + br <<= 1; + } + + /* Allen: the name of the variable "one" is no longer descriptive at this point */ + + one = lbit >> 1; + ppl = (h->prim_poly >> 2) | one; + ppr = (h->prim_poly << (GF_FIELD_WIDTH-2)); + while (one != 0) { + if (pl & one) { + pl ^= ppl; + pr ^= ppr; + } + one >>= 1; + ppr >>= 1; + if (ppl & 1) ppr ^= lbit; + ppl >>= 1; + } + return pr; +} + +/* + * ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply. + */ + +#if defined(INTEL_SSE4_PCLMUL) + +static +inline +gf_val_64_t +gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) +{ + gf_val_64_t rv = 0; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0); + b = _mm_insert_epi64 (a, b64, 0); + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + /* Mask off the high order 32 bits using subtraction of the polynomial. + * NOTE: this part requires that the polynomial have at least 32 leading 0 bits. 
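+   *
+   * Editor's note (annotation, not part of the upstream source): with
+   * the polynomial confined to the low 32 bits (plus the implicit
+   * x^64 term), a clmul of the polynomial with a 32-bit slice of the
+   * overflow stays within 64 bits, so folding the top 32-bit slice and
+   * then the next one cancels everything above bit 63. This is also
+   * why gf_w64_cfm_init only selects this two-fold variant when the
+   * upper bits of prim_poly are clear.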
+ */ + + /* Adam: We can't include the leading one in the 64 bit pclmul, + so we need to split up the high 8 bytes of the result into two + parts before we multiply them with the prim_poly.*/ + + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = _mm_xor_si128 (result, w); + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = _mm_xor_si128 (result, w); + + rv = ((gf_val_64_t)_mm_extract_epi64(result, 0)); + return rv; +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) + +static +inline +gf_val_64_t +gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) +{ + gf_val_64_t rv = 0; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i v, w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0); + b = _mm_insert_epi64 (a, b64, 0); + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = _mm_xor_si128 (result, w); + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = _mm_xor_si128 (result, w); + + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = _mm_xor_si128 (result, w); + v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, v, 0); + result = _mm_xor_si128 (result, w); + + rv = ((gf_val_64_t)_mm_extract_epi64(result, 0)); + return rv; +} +#endif + + +#if defined(INTEL_SSE4_PCLMUL) + void +gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) +{ + gf_internal_t *h; + uint8_t *s8, *d8, *dtop; + gf_region_data rd; + __m128i v, b, m, prim_poly, c, fr, w, result; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + dtop = (uint8_t *) rd.d_top; + + v = _mm_insert_epi64(_mm_setzero_si128(), val, 0); + m = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); + + if (xor) { + while (d8 != dtop) { + b = _mm_load_si128((__m128i *) s8); + result = _mm_clmulepi64_si128 (b, v, 0); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + fr = _mm_xor_si128 (result, w); + fr = _mm_and_si128 (fr, m); + + result = _mm_clmulepi64_si128 (b, v, 1); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + result = _mm_slli_si128 (result, 8); + fr = _mm_xor_si128 (result, fr); + result = _mm_load_si128((__m128i *) d8); + fr = _mm_xor_si128 (result, fr); + + _mm_store_si128((__m128i *) d8, fr); + d8 += 16; + s8 += 16; + } + } else { + while (d8 < dtop) { 
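+      /* Editor's note (annotation, not part of the upstream source):
+         each 128-bit load holds two 64-bit words; the clmul immediate
+         picks the lane, so (b, v, 0) multiplies the low word by val
+         and (b, v, 1) the high word. Each 128-bit product is folded
+         twice against the 32-bit polynomial, and the two reduced
+         halves are recombined with the shift/xor below, mirroring the
+         xor branch above. */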
+ b = _mm_load_si128((__m128i *) s8); + result = _mm_clmulepi64_si128 (b, v, 0); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + fr = _mm_xor_si128 (result, w); + fr = _mm_and_si128 (fr, m); + + result = _mm_clmulepi64_si128 (b, v, 1); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); + w = _mm_clmulepi64_si128 (prim_poly, c, 0); + result = _mm_xor_si128 (result, w); + result = _mm_slli_si128 (result, 8); + fr = _mm_xor_si128 (result, fr); + + _mm_store_si128((__m128i *) d8, fr); + d8 += 16; + s8 += 16; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +void +gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) +{ + gf_internal_t *h; + struct gf_split_4_64_lazy_data *ld; + int i, j, k; + uint64_t pp, v, s, *s64, *d64, *top; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + ld = (struct gf_split_4_64_lazy_data *) h->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + + if (ld->last_value != val) { + v = val; + for (i = 0; i < 16; i++) { + ld->tables[i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[i][k^j] = (v ^ ld->tables[i][k]); + } + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); + } + } + } + ld->last_value = val; + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + while (d64 != top) { + v = (xor) ? *d64 : 0; + s = *s64; + i = 0; + while (s != 0) { + v ^= ld->tables[i][s&0xf]; + s >>= 4; + i++; + } + *d64 = v; + d64++; + s64++; + } + gf_do_final_region_alignment(&rd); +} + +static +inline +uint64_t +gf_w64_split_8_8_multiply (gf_t *gf, uint64_t a64, uint64_t b64) +{ + uint64_t product, i, j, mask, tb; + gf_internal_t *h; + struct gf_split_8_8_data *d8; + + h = (gf_internal_t *) gf->scratch; + d8 = (struct gf_split_8_8_data *) h->private; + product = 0; + mask = 0xff; + + for (i = 0; a64 != 0; i++) { + tb = b64; + for (j = 0; tb != 0; j++) { + product ^= d8->tables[i+j][a64&mask][tb&mask]; + tb >>= 8; + } + a64 >>= 8; + } + return product; +} + +void +gf_w64_split_8_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) +{ + gf_internal_t *h; + struct gf_split_8_64_lazy_data *ld; + int i, j, k; + uint64_t pp, v, s, *s64, *d64, *top; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + ld = (struct gf_split_8_64_lazy_data *) h->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + gf_do_initial_region_alignment(&rd); + + if (ld->last_value != val) { + v = val; + for (i = 0; i < 8; i++) { + ld->tables[i][0] = 0; + for (j = 1; j < 256; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[i][k^j] = (v ^ ld->tables[i][k]); + } + v = (v & GF_FIRST_BIT) ? 
((v << 1) ^ pp) : (v << 1); + } + } + } + ld->last_value = val; + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + while (d64 != top) { + v = (xor) ? *d64 : 0; + s = *s64; + i = 0; + while (s != 0) { + v ^= ld->tables[i][s&0xff]; + s >>= 8; + i++; + } + *d64 = v; + d64++; + s64++; + } + gf_do_final_region_alignment(&rd); +} + +void +gf_w64_split_16_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) +{ + gf_internal_t *h; + struct gf_split_16_64_lazy_data *ld; + int i, j, k; + uint64_t pp, v, s, *s64, *d64, *top; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + ld = (struct gf_split_16_64_lazy_data *) h->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + gf_do_initial_region_alignment(&rd); + + if (ld->last_value != val) { + v = val; + for (i = 0; i < 4; i++) { + ld->tables[i][0] = 0; + for (j = 1; j < (1<<16); j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[i][k^j] = (v ^ ld->tables[i][k]); + } + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); + } + } + } + ld->last_value = val; + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + while (d64 != top) { + v = (xor) ? *d64 : 0; + s = *s64; + i = 0; + while (s != 0) { + v ^= ld->tables[i][s&0xffff]; + s >>= 16; + i++; + } + *d64 = v; + d64++; + s64++; + } + gf_do_final_region_alignment(&rd); +} + +static +int gf_w64_shift_init(gf_t *gf) +{ + SET_FUNCTION(gf,multiply,w64,gf_w64_shift_multiply) + SET_FUNCTION(gf,inverse,w64,gf_w64_euclid) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single) + return 1; +} + +static +int gf_w64_cfm_init(gf_t *gf) +{ + SET_FUNCTION(gf,inverse,w64,gf_w64_euclid) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single) + +#if defined(INTEL_SSE4_PCLMUL) + if (gf_cpu_supports_intel_pclmul) { + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2) + }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4) + } else { + return 0; + } + return 1; + } +#endif + + return 0; +} + +static +void +gf_w64_group_set_shift_tables(uint64_t *shift, uint64_t val, gf_internal_t *h) +{ + uint64_t i; + uint64_t j; + uint64_t one = 1; + int g_s; + + g_s = h->arg1; + shift[0] = 0; + + for (i = 1; i < ((uint64_t)1 << g_s); i <<= 1) { + for (j = 0; j < i; j++) shift[i|j] = shift[j]^val; + if (val & (one << 63)) { + val <<= 1; + val ^= h->prim_poly; + } else { + val <<= 1; + } + } +} + +static +inline +gf_val_64_t +gf_w64_group_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) +{ + uint64_t top, bot, mask, tp; + int g_s, g_r, lshift, rshift; + struct gf_w64_group_data *gd; + + gf_internal_t *h = (gf_internal_t *) gf->scratch; + g_s = h->arg1; + g_r = h->arg2; + gd = (struct gf_w64_group_data *) h->private; + gf_w64_group_set_shift_tables(gd->shift, b, h); + + mask = (((uint64_t)1 << g_s) - 1); + top = 0; + bot = gd->shift[a&mask]; + a >>= g_s; + + if (a == 0) return bot; + lshift = 0; + rshift = 64; + + do { /* Shifting out is straightforward */ + lshift
+= g_s; + rshift -= g_s; + tp = gd->shift[a&mask]; + top ^= (tp >> rshift); + bot ^= (tp << lshift); + a >>= g_s; + } while (a != 0); + + /* Reducing is a bit gross, because I don't zero out the index bits of top. + The reason is that we throw top away. Even better, that last (tp >> rshift) + is going to be ignored, so it doesn't matter how (tp >> 64) is implemented. */ + + lshift = ((lshift-1) / g_r) * g_r; + rshift = 64 - lshift; + mask = ((uint64_t)1 << g_r) - 1; + while (lshift >= 0) { + tp = gd->reduce[(top >> lshift) & mask]; + top ^= (tp >> rshift); + bot ^= (tp << lshift); + lshift -= g_r; + rshift += g_r; + } + + return bot; +} + +static +void gf_w64_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) +{ + int i, fzb; + uint64_t a64, smask, rmask, top, bot, tp; + int lshift, rshift, g_s, g_r; + gf_region_data rd; + uint64_t *s64, *d64, *dtop; + struct gf_w64_group_data *gd; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gd = (struct gf_w64_group_data *) h->private; + g_s = h->arg1; + g_r = h->arg2; + gf_w64_group_set_shift_tables(gd->shift, val, h); + + for (i = 63; !(val & (1ULL << i)); i--) ; + i += g_s; + + /* i is the bit position of the first zero bit in any element of + gd->shift[] */ + + if (i > 64) i = 64; + + fzb = i; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + + gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + dtop = (uint64_t *) rd.d_top; + + smask = ((uint64_t)1 << g_s) - 1; + rmask = ((uint64_t)1 << g_r) - 1; + + while (d64 < dtop) { + a64 = *s64; + + top = 0; + bot = gd->shift[a64&smask]; + a64 >>= g_s; + i = fzb; + + if (a64 != 0) { + lshift = 0; + rshift = 64; + + do { + lshift += g_s; + rshift -= g_s; + tp = gd->shift[a64&smask]; + top ^= (tp >> rshift); + bot ^= (tp << lshift); + a64 >>= g_s; + } while (a64 != 0); + i += lshift; + + lshift = ((i-64-1) / g_r) * g_r; + rshift = 64 - lshift; + while (lshift >= 0) { + tp = gd->reduce[(top >> lshift) & rmask]; + top ^= (tp >> rshift); + bot ^= (tp << lshift); + lshift -= g_r; + rshift += g_r; + } + } + + if (xor) bot ^= *d64; + *d64 = bot; + d64++; + s64++; + } + gf_do_final_region_alignment(&rd); +} + +static +inline +gf_val_64_t +gf_w64_group_s_equals_r_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) +{ + int leftover, rs; + uint64_t p, l, ind, a64; + int bits_left; + int g_s; + + struct gf_w64_group_data *gd; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + g_s = h->arg1; + + gd = (struct gf_w64_group_data *) h->private; + gf_w64_group_set_shift_tables(gd->shift, b, h); + + leftover = 64 % g_s; + if (leftover == 0) leftover = g_s; + + rs = 64 - leftover; + a64 = a; + ind = a64 >> rs; + a64 <<= leftover; + p = gd->shift[ind]; + + bits_left = rs; + rs = 64 - g_s; + + while (bits_left > 0) { + bits_left -= g_s; + ind = a64 >> rs; + a64 <<= g_s; + l = p >> rs; + p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s)); + } + return p; +} + +static +void gf_w64_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) +{ + int leftover, rs; + uint64_t p, l, ind, a64; + int bits_left; + int g_s; + gf_region_data rd; + uint64_t *s64, *d64, *top; + struct gf_w64_group_data *gd; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { 
gf_multby_one(src, dest, bytes, xor); return; } + + gd = (struct gf_w64_group_data *) h->private; + g_s = h->arg1; + gf_w64_group_set_shift_tables(gd->shift, val, h); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); + gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + leftover = 64 % g_s; + if (leftover == 0) leftover = g_s; + + while (d64 < top) { + rs = 64 - leftover; + a64 = *s64; + ind = a64 >> rs; + a64 <<= leftover; + p = gd->shift[ind]; + + bits_left = rs; + rs = 64 - g_s; + + while (bits_left > 0) { + bits_left -= g_s; + ind = a64 >> rs; + a64 <<= g_s; + l = p >> rs; + p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s)); + } + if (xor) p ^= *d64; + *d64 = p; + d64++; + s64++; + } + gf_do_final_region_alignment(&rd); +} + + +static +int gf_w64_group_init(gf_t *gf) +{ + uint64_t i, j, p, index; + struct gf_w64_group_data *gd; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + uint64_t g_r, g_s; + + g_s = h->arg1; + g_r = h->arg2; + + gd = (struct gf_w64_group_data *) h->private; + gd->shift = (uint64_t *) (&(gd->memory)); + gd->reduce = gd->shift + (1 << g_s); + + gd->reduce[0] = 0; + for (i = 0; i < ((uint64_t)1 << g_r); i++) { + p = 0; + index = 0; + for (j = 0; j < g_r; j++) { + if (i & (1 << j)) { + p ^= (h->prim_poly << j); + index ^= (1 << j); + if (j > 0) index ^= (h->prim_poly >> (64-j)); + } + } + gd->reduce[index] = p; + } + + if (g_s == g_r) { + SET_FUNCTION(gf,multiply,w64,gf_w64_group_s_equals_r_multiply) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_group_s_equals_r_multiply_region) + } else { + SET_FUNCTION(gf,multiply,w64,gf_w64_group_multiply) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_group_multiply_region) + } + SET_FUNCTION(gf,divide,w64,NULL) + SET_FUNCTION(gf,inverse,w64,gf_w64_euclid) + + return 1; +} + +static +gf_val_64_t gf_w64_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + uint64_t *r64, rv; + + r64 = (uint64_t *) start; + rv = r64[index]; + return rv; +} + +static +gf_val_64_t gf_w64_composite_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + int sub_size; + gf_internal_t *h; + uint8_t *r8, *top; + uint64_t a, b, *r64; + gf_region_data rd; + + h = (gf_internal_t *) gf->scratch; + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); + r64 = (uint64_t *) start; + if (r64 + index < (uint64_t *) rd.d_start) return r64[index]; + if (r64 + index >= (uint64_t *) rd.d_top) return r64[index]; + index -= (((uint64_t *) rd.d_start) - r64); + r8 = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + sub_size = (top-r8)/2; + + a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index); + b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index); + return (a | ((uint64_t)b << 32)); +} + +static +gf_val_64_t gf_w64_split_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + int i; + uint64_t *r64, rv; + uint8_t *r8; + gf_region_data rd; + + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 128); + r64 = (uint64_t *) start; + if (r64 + index < (uint64_t *) rd.d_start) return r64[index]; + if (r64 + index >= (uint64_t *) rd.d_top) return r64[index]; + index -= (((uint64_t *) rd.d_start) - r64); + r8 = (uint8_t *) rd.d_start; + r8 += ((index & 0xfffffff0)*8); + r8 += (index & 0xf); + r8 += 112; + rv =0; + for (i = 0; i < 8; i++) { + rv <<= 8; + rv |= *r8; + r8 -= 16; + } + return rv; +} + +static +inline +gf_val_64_t +gf_w64_bytwo_b_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b) +{ + 
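+ /* Bit-serial multiply: scan a from its least significant bit upward, and whenever the current bit is set, XOR the running product with b. After each bit, b is multiplied by x (shifted left by one), folding the primitive polynomial back in whenever the high bit of b would be shifted out. */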
uint64_t prod, pp, bmask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + prod = 0; + bmask = 0x8000000000000000ULL; + + while (1) { + if (a & 1) prod ^= b; + a >>= 1; + if (a == 0) return prod; + if (b & bmask) { + b = ((b << 1) ^ pp); + } else { + b <<= 1; + } + } +} + +static +inline +gf_val_64_t +gf_w64_bytwo_p_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b) +{ + uint64_t prod, pp, pmask, amask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + prod = 0; + + /* changed from declare then shift to just declare.*/ + + pmask = 0x8000000000000000ULL; + amask = 0x8000000000000000ULL; + + while (amask != 0) { + if (prod & pmask) { + prod = ((prod << 1) ^ pp); + } else { + prod <<= 1; + } + if (a & amask) prod ^= b; + amask >>= 1; + } + return prod; +} + +static +void +gf_w64_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, ta, prod, amask, pmask, pp; + gf_region_data rd; + gf_internal_t *h; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + + h = (gf_internal_t *) gf->scratch; + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + pmask = 0x80000000; + pmask <<= 32; + pp = h->prim_poly; + + if (xor) { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = pmask; + ta = *s64; + while (amask != 0) { + prod = (prod & pmask) ? ((prod << 1) ^ pp) : (prod << 1); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 ^= prod; + d64++; + s64++; + } + } else { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = pmask; + ta = *s64; + while (amask != 0) { + prod = (prod & pmask) ? ((prod << 1) ^ pp) : (prod << 1); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 = prod; + d64++; + s64++; + } + } + gf_do_final_region_alignment(&rd); +} + +static +void +gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, ta, tb, prod, bmask, pp; + gf_region_data rd; + gf_internal_t *h; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + + h = (gf_internal_t *) gf->scratch; + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + bmask = 0x80000000; + bmask <<= 32; + pp = h->prim_poly; + + if (xor) { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + tb = val; + ta = *s64; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + ta = (ta & bmask) ? ((ta << 1) ^ pp) : (ta << 1); + } + *d64 ^= prod; + d64++; + s64++; + } + } else { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + tb = val; + ta = *s64; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + ta = (ta & bmask) ? 
((ta << 1) ^ pp) : (ta << 1); + } + *d64 = prod; + d64++; + s64++; + } + } + gf_do_final_region_alignment(&rd); +} + +#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\ + t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \ + t2 = _mm_and_si128(va, m2); \ + t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \ + va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); } + +#define BYTWO_P_ONESTEP {\ + SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ + t1 = _mm_and_si128(v, one); \ + t1 = _mm_sub_epi64(t1, one); \ + t1 = _mm_and_si128(t1, ta); \ + prod = _mm_xor_si128(prod, t1); \ + v = _mm_srli_epi64(v, 1); } + + +#ifdef INTEL_SSE2 +void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) +{ + int i; + uint8_t *s8, *d8; + uint64_t vrev, one64; + uint64_t amask; + __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; + gf_region_data rd; + gf_internal_t *h; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + h = (gf_internal_t *) gf->scratch; + one64 = 1; + vrev = 0; + for (i = 0; i < 64; i++) { + vrev <<= 1; + if (!(val & (one64 << i))) vrev |= 1; + } + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + amask = -1; + amask ^= 1; + pp = _mm_set1_epi64x(h->prim_poly); + m1 = _mm_set1_epi64x(amask); + m2 = _mm_set1_epi64x(one64 << 63); + one = _mm_set1_epi64x(1); + + while (d8 < (uint8_t *) rd.d_top) { + prod = _mm_setzero_si128(); + v = _mm_set1_epi64x(vrev); + ta = _mm_load_si128((__m128i *) s8); + tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8); + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; + _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp)); + d8 += 16; + s8 += 16; + } + gf_do_final_region_alignment(&rd); +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd) +{ + uint64_t one64, amask; + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va, vb; + gf_internal_t *h; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + h = (gf_internal_t *) rd->gf->scratch; + one64 = 1; + amask = -1; + amask ^= 1; + pp = _mm_set1_epi64x(h->prim_poly); + m1 = _mm_set1_epi64x(amask); + m2 = _mm_set1_epi64x(one64 << 63); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, m2, va, t1, 
t2); + vb = _mm_load_si128 ((__m128i *)(d8)); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd) +{ + uint64_t one64, amask; + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va; + gf_internal_t *h; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + h = (gf_internal_t *) rd->gf->scratch; + one64 = 1; + amask = -1; + amask ^= 1; + pp = _mm_set1_epi64x(h->prim_poly); + m1 = _mm_set1_epi64x(amask); + m2 = _mm_set1_epi64x(one64 << 63); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, m2, va, t1, t2); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static +void +gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) +{ + uint64_t itb, amask, one64; + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va, vb; + gf_region_data rd; + gf_internal_t *h; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + if (val == 2) { + if (xor) { + gf_w64_bytwo_b_sse_region_2_xor(&rd); + } else { + gf_w64_bytwo_b_sse_region_2_noxor(&rd); + } + gf_do_final_region_alignment(&rd); + return; + } + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + h = (gf_internal_t *) gf->scratch; + + one64 = 1; + amask = -1; + amask ^= 1; + pp = _mm_set1_epi64x(h->prim_poly); + m1 = _mm_set1_epi64x(amask); + m2 = _mm_set1_epi64x(one64 << 63); + + while (d8 < (uint8_t *) rd.d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = (!xor) ? 
_mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8)); + itb = val; + while (1) { + if (itb & 1) vb = _mm_xor_si128(vb, va); + itb >>= 1; + if (itb == 0) break; + SSE_AB2(pp, m1, m2, va, t1, t2); + } + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } + + gf_do_final_region_alignment(&rd); +} +#endif + + +static +int gf_w64_bytwo_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + if (h->mult_type == GF_MULT_BYTWO_p) { + SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply) + #ifdef INTEL_SSE2 + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_sse_multiply_region) + } else { + #endif + SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region) + if(h->region_type & GF_REGION_SIMD) + return 0; + #ifdef INTEL_SSE2 + } + #endif + } else { + SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_b_multiply) + #ifdef INTEL_SSE2 + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_sse_multiply_region) + } else { + #endif + SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_nosse_multiply_region) + if(h->region_type & GF_REGION_SIMD) + return 0; + #ifdef INTEL_SSE2 + } + #endif + } + SET_FUNCTION(gf,inverse,w64,gf_w64_euclid) + return 1; +} + + +static +gf_val_64_t +gf_w64_composite_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint32_t b0 = b & 0x00000000ffffffff; + uint32_t b1 = (b & 0xffffffff00000000) >> 32; + uint32_t a0 = a & 0x00000000ffffffff; + uint32_t a1 = (a & 0xffffffff00000000) >> 32; + uint32_t a1b1; + + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + return ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32)); +} + +/* + * Composite field division trick (explained in 2007 tech report) + * + * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1 + * + * let c = b^-1 + * + * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0) + * + * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1 + * + * let d = b1c1 and d+1 = b0c0 + * + * solve s*b1c1+b1c0+b0c1 = 0 + * + * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1 + * + * c0 = (d+1)b0^-1 + * c1 = d*b1^-1 + * + * a / b = a * c + */ + +static +gf_val_64_t +gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint32_t a0 = a & 0x00000000ffffffff; + uint32_t a1 = (a & 0xffffffff00000000) >> 32; + uint32_t c0, c1, d, tmp; + uint64_t c; + uint32_t a0inv, a1inv; + + if (a0 == 0) { + a1inv = base_gf->inverse.w32(base_gf, a1); + c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); + c1 = a1inv; + } else if (a1 == 0) { + c0 = base_gf->inverse.w32(base_gf, a0); + c1 = 0; + } else { + a1inv = base_gf->inverse.w32(base_gf, a1); + a0inv = base_gf->inverse.w32(base_gf, a0); + + d = base_gf->multiply.w32(base_gf, a1, a0inv); + + tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly); + tmp = base_gf->inverse.w32(base_gf, tmp); + + d = base_gf->multiply.w32(base_gf, d, tmp); + + c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv); + c1 = base_gf->multiply.w32(base_gf, d, a1inv); + } + + c = c0 | ((uint64_t)c1 << 32); + + return c; +} + +static +void +gf_w64_composite_multiply_region(gf_t 
*gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint32_t b0 = val & 0x00000000ffffffff; + uint32_t b1 = (val & 0xffffffff00000000) >> 32; + uint64_t *s64, *d64; + uint64_t *top; + uint64_t a0, a1, a1b1; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + + s64 = rd.s_start; + d64 = rd.d_start; + top = rd.d_top; + + if (xor) { + while (d64 < top) { + a0 = *s64 & 0x00000000ffffffff; + a1 = (*s64 & 0xffffffff00000000) >> 32; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + *d64 ^= ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32)); + s64++; + d64++; + } + } else { + while (d64 < top) { + a0 = *s64 & 0x00000000ffffffff; + a1 = (*s64 & 0xffffffff00000000) >> 32; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + *d64 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32)); + s64++; + d64++; + } + } +} + +static +void +gf_w64_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + gf_val_32_t val0 = val & 0x00000000ffffffff; + gf_val_32_t val1 = (val & 0xffffffff00000000) >> 32; + uint8_t *slow, *shigh; + uint8_t *dlow, *dhigh, *top; + int sub_reg_size; + gf_region_data rd; + + if (!xor) { + memset(dest, 0, bytes); + } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + slow = (uint8_t *) rd.s_start; + dlow = (uint8_t *) rd.d_start; + top = (uint8_t*) rd.d_top; + sub_reg_size = (top - dlow)/2; + shigh = slow + sub_reg_size; + dhigh = dlow + sub_reg_size; + + base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); + + gf_do_final_region_alignment(&rd); +} + + + +static +int gf_w64_composite_init(gf_t *gf) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + if (h->region_type & GF_REGION_ALTMAP) { + SET_FUNCTION(gf,multiply_region,w64,gf_w64_composite_multiply_region_alt) + } else { + SET_FUNCTION(gf,multiply_region,w64,gf_w64_composite_multiply_region) + } + + SET_FUNCTION(gf,multiply,w64,gf_w64_composite_multiply) + SET_FUNCTION(gf,divide,w64,NULL) + SET_FUNCTION(gf,inverse,w64,gf_w64_composite_inverse) + + return 1; +} + +#ifdef INTEL_SSSE3 +static + void +gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) +{ + gf_internal_t *h; + int i, j, k; + uint64_t pp, v, *s64, *d64, *top; + __m128i si, tables[16][8], p[8], v0, mask1; + struct gf_split_4_64_lazy_data *ld; + uint8_t btable[16]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t 
*) gf->scratch; + pp = h->prim_poly; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128); + gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + ld = (struct gf_split_4_64_lazy_data *) h->private; + + v = val; + for (i = 0; i < 16; i++) { + ld->tables[i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[i][k^j] = (v ^ ld->tables[i][k]); + } + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); + } + for (j = 0; j < 8; j++) { + for (k = 0; k < 16; k++) { + btable[k] = (uint8_t) ld->tables[i][k]; + ld->tables[i][k] >>= 8; + } + tables[i][j] = _mm_loadu_si128((__m128i *) btable); + } + } + + mask1 = _mm_set1_epi8(0xf); + + while (d64 != top) { + + if (xor) { + for (i = 0; i < 8; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2)); + } else { + for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128(); + } + i = 0; + for (k = 0; k < 8; k++) { + v0 = _mm_load_si128((__m128i *) s64); + /* MM_PRINT8("v", v0); */ + s64 += 2; + + si = _mm_and_si128(v0, mask1); + + for (j = 0; j < 8; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + v0 = _mm_srli_epi32(v0, 4); + si = _mm_and_si128(v0, mask1); + for (j = 0; j < 8; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + } + for (i = 0; i < 8; i++) { + /* MM_PRINT8("v", p[i]); */ + _mm_store_si128((__m128i *) d64, p[i]); + d64 += 2; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +#ifdef INTEL_SSE4 +static + void +gf_w64_split_4_64_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) +{ + gf_internal_t *h; + int i, j, k; + uint64_t pp, v, *s64, *d64, *top; + __m128i si, tables[16][8], p[8], st[8], mask1, mask8, mask16, t1; + struct gf_split_4_64_lazy_data *ld; + uint8_t btable[16]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128); + gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + ld = (struct gf_split_4_64_lazy_data *) h->private; + + v = val; + for (i = 0; i < 16; i++) { + ld->tables[i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[i][k^j] = (v ^ ld->tables[i][k]); + } + v = (v & GF_FIRST_BIT) ? 
((v << 1) ^ pp) : (v << 1); + } + for (j = 0; j < 8; j++) { + for (k = 0; k < 16; k++) { + btable[k] = (uint8_t) ld->tables[i][k]; + ld->tables[i][k] >>= 8; + } + tables[i][j] = _mm_loadu_si128((__m128i *) btable); + } + } + + mask1 = _mm_set1_epi8(0xf); + mask8 = _mm_set1_epi16(0xff); + mask16 = _mm_set1_epi32(0xffff); + + while (d64 != top) { + + for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128(); + + for (k = 0; k < 8; k++) { + st[k] = _mm_load_si128((__m128i *) s64); + s64 += 2; + } + + for (k = 0; k < 4; k ++) { + st[k] = _mm_shuffle_epi32(st[k], _MM_SHUFFLE(3,1,2,0)); + st[k+4] = _mm_shuffle_epi32(st[k+4], _MM_SHUFFLE(2,0,3,1)); + t1 = _mm_blend_epi16(st[k], st[k+4], 0xf0); + st[k] = _mm_srli_si128(st[k], 8); + st[k+4] = _mm_slli_si128(st[k+4], 8); + st[k+4] = _mm_blend_epi16(st[k], st[k+4], 0xf0); + st[k] = t1; + } + +/* + printf("After pack pass 1\n"); + for (k = 0; k < 8; k++) { + MM_PRINT8("v", st[k]); + } + printf("\n"); + */ + + t1 = _mm_packus_epi32(_mm_and_si128(st[0], mask16), _mm_and_si128(st[2], mask16)); + st[2] = _mm_packus_epi32(_mm_srli_epi32(st[0], 16), _mm_srli_epi32(st[2], 16)); + st[0] = t1; + t1 = _mm_packus_epi32(_mm_and_si128(st[1], mask16), _mm_and_si128(st[3], mask16)); + st[3] = _mm_packus_epi32(_mm_srli_epi32(st[1], 16), _mm_srli_epi32(st[3], 16)); + st[1] = t1; + t1 = _mm_packus_epi32(_mm_and_si128(st[4], mask16), _mm_and_si128(st[6], mask16)); + st[6] = _mm_packus_epi32(_mm_srli_epi32(st[4], 16), _mm_srli_epi32(st[6], 16)); + st[4] = t1; + t1 = _mm_packus_epi32(_mm_and_si128(st[5], mask16), _mm_and_si128(st[7], mask16)); + st[7] = _mm_packus_epi32(_mm_srli_epi32(st[5], 16), _mm_srli_epi32(st[7], 16)); + st[5] = t1; + +/* + printf("After pack pass 2\n"); + for (k = 0; k < 8; k++) { + MM_PRINT8("v", st[k]); + } + printf("\n"); + */ + t1 = _mm_packus_epi16(_mm_and_si128(st[0], mask8), _mm_and_si128(st[1], mask8)); + st[1] = _mm_packus_epi16(_mm_srli_epi16(st[0], 8), _mm_srli_epi16(st[1], 8)); + st[0] = t1; + t1 = _mm_packus_epi16(_mm_and_si128(st[2], mask8), _mm_and_si128(st[3], mask8)); + st[3] = _mm_packus_epi16(_mm_srli_epi16(st[2], 8), _mm_srli_epi16(st[3], 8)); + st[2] = t1; + t1 = _mm_packus_epi16(_mm_and_si128(st[4], mask8), _mm_and_si128(st[5], mask8)); + st[5] = _mm_packus_epi16(_mm_srli_epi16(st[4], 8), _mm_srli_epi16(st[5], 8)); + st[4] = t1; + t1 = _mm_packus_epi16(_mm_and_si128(st[6], mask8), _mm_and_si128(st[7], mask8)); + st[7] = _mm_packus_epi16(_mm_srli_epi16(st[6], 8), _mm_srli_epi16(st[7], 8)); + st[6] = t1; + +/* + printf("After final pack pass 2\n"); + for (k = 0; k < 8; k++) { + MM_PRINT8("v", st[k]); + } + */ + i = 0; + for (k = 0; k < 8; k++) { + si = _mm_and_si128(st[k], mask1); + + for (j = 0; j < 8; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + st[k] = _mm_srli_epi32(st[k], 4); + si = _mm_and_si128(st[k], mask1); + for (j = 0; j < 8; j++) { + p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); + } + i++; + } + + t1 = _mm_unpacklo_epi8(p[0], p[1]); + p[1] = _mm_unpackhi_epi8(p[0], p[1]); + p[0] = t1; + t1 = _mm_unpacklo_epi8(p[2], p[3]); + p[3] = _mm_unpackhi_epi8(p[2], p[3]); + p[2] = t1; + t1 = _mm_unpacklo_epi8(p[4], p[5]); + p[5] = _mm_unpackhi_epi8(p[4], p[5]); + p[4] = t1; + t1 = _mm_unpacklo_epi8(p[6], p[7]); + p[7] = _mm_unpackhi_epi8(p[6], p[7]); + p[6] = t1; + +/* + printf("After unpack pass 1:\n"); + for (i = 0; i < 8; i++) { + MM_PRINT8("v", p[i]); + } + */ + + t1 = _mm_unpacklo_epi16(p[0], p[2]); + p[2] = _mm_unpackhi_epi16(p[0], p[2]); + p[0] = t1; + t1 = 
_mm_unpacklo_epi16(p[1], p[3]); + p[3] = _mm_unpackhi_epi16(p[1], p[3]); + p[1] = t1; + t1 = _mm_unpacklo_epi16(p[4], p[6]); + p[6] = _mm_unpackhi_epi16(p[4], p[6]); + p[4] = t1; + t1 = _mm_unpacklo_epi16(p[5], p[7]); + p[7] = _mm_unpackhi_epi16(p[5], p[7]); + p[5] = t1; + +/* + printf("After unpack pass 2:\n"); + for (i = 0; i < 8; i++) { + MM_PRINT8("v", p[i]); + } + */ + + t1 = _mm_unpacklo_epi32(p[0], p[4]); + p[4] = _mm_unpackhi_epi32(p[0], p[4]); + p[0] = t1; + t1 = _mm_unpacklo_epi32(p[1], p[5]); + p[5] = _mm_unpackhi_epi32(p[1], p[5]); + p[1] = t1; + t1 = _mm_unpacklo_epi32(p[2], p[6]); + p[6] = _mm_unpackhi_epi32(p[2], p[6]); + p[2] = t1; + t1 = _mm_unpacklo_epi32(p[3], p[7]); + p[7] = _mm_unpackhi_epi32(p[3], p[7]); + p[3] = t1; + + if (xor) { + for (i = 0; i < 8; i++) { + t1 = _mm_load_si128((__m128i *) d64); + _mm_store_si128((__m128i *) d64, _mm_xor_si128(p[i], t1)); + d64 += 2; + } + } else { + for (i = 0; i < 8; i++) { + _mm_store_si128((__m128i *) d64, p[i]); + d64 += 2; + } + } + + } + + gf_do_final_region_alignment(&rd); +} +#endif + +#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1); + +static +int gf_w64_split_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_split_4_64_lazy_data *d4; + struct gf_split_8_64_lazy_data *d8; + struct gf_split_8_8_data *d88; + struct gf_split_16_64_lazy_data *d16; + uint64_t p, basep; + int exp, i, j; + + h = (gf_internal_t *) gf->scratch; + + /* Defaults */ + + SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single) + + SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply) + +#if defined(INTEL_SSE4_PCLMUL) + if (gf_cpu_supports_intel_pclmul) { + if ((!(h->region_type & GF_REGION_NOSIMD) && + (h->arg1 == 64 || h->arg2 == 64)) || + h->mult_type == GF_MULT_DEFAULT){ + + if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2) + }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4) + }else{ + return 0; + } + } + } +#endif + + SET_FUNCTION(gf,inverse,w64,gf_w64_euclid) + + /* Allen: set region pointers for default mult type. Single pointers are + * taken care of above (explicitly for sse, implicitly for no sse). 
*/ + + if (h->mult_type == GF_MULT_DEFAULT) { +#if defined(INTEL_SSE4) || defined(ARCH_AARCH64) + if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) { + d4 = (struct gf_split_4_64_lazy_data *) h->private; + d4->last_value = 0; +#if defined(INTEL_SSE4) + if (gf_cpu_supports_intel_sse4) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region) +#elif defined(ARCH_AARCH64) + if (gf_cpu_supports_arm_neon) + gf_w64_neon_split_init(gf); +#endif + } else { +#endif + d8 = (struct gf_split_8_64_lazy_data *) h->private; + d8->last_value = 0; + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region) +#if defined(INTEL_SSE4) || defined(ARCH_AARCH64) + } +#endif + } + + if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) { + d4 = (struct gf_split_4_64_lazy_data *) h->private; + d4->last_value = 0; + + if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSIMD)) return 0; + if(h->region_type & GF_REGION_ALTMAP) + { + #ifdef INTEL_SSSE3 + if (gf_cpu_supports_intel_ssse3) { + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_altmap_multiply_region) + } else + #elif defined(ARCH_AARCH64) + if (gf_cpu_supports_arm_neon) { + gf_w64_neon_split_init(gf); + } else + #endif + return 0; + } + else //no altmap + { + #if defined(INTEL_SSE4) || defined(ARCH_AARCH64) + if(gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) { + if (h->region_type & GF_REGION_NOSIMD) { + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region) + } else + #if defined(INTEL_SSE4) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region) + #elif defined(ARCH_AARCH64) + gf_w64_neon_split_init(gf); + #endif + } else { + #endif + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region) + if(h->region_type & GF_REGION_SIMD) + return 0; + #if defined(INTEL_SSE4) || defined(ARCH_AARCH64) + } + #endif + } + } + if ((h->arg1 == 8 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 8)) { + d8 = (struct gf_split_8_64_lazy_data *) h->private; + d8->last_value = 0; + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region) + } + if ((h->arg1 == 16 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 16)) { + d16 = (struct gf_split_16_64_lazy_data *) h->private; + d16->last_value = 0; + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_16_64_lazy_multiply_region) + } + if ((h->arg1 == 8 && h->arg2 == 8)) { + d88 = (struct gf_split_8_8_data *) h->private; + SET_FUNCTION(gf,multiply,w64,gf_w64_split_8_8_multiply) + + /* The performance of this guy sucks, so don't bother with a region op */ + + basep = 1; + for (exp = 0; exp < 15; exp++) { + for (j = 0; j < 256; j++) d88->tables[exp][0][j] = 0; + for (i = 0; i < 256; i++) d88->tables[exp][i][0] = 0; + d88->tables[exp][1][1] = basep; + for (i = 2; i < 256; i++) { + if (i&1) { + p = d88->tables[exp][i^1][1]; + d88->tables[exp][i][1] = p ^ basep; + } else { + p = d88->tables[exp][i>>1][1]; + d88->tables[exp][i][1] = GF_MULTBY_TWO(p); + } + } + for (i = 1; i < 256; i++) { + p = d88->tables[exp][i][1]; + for (j = 1; j < 256; j++) { + if (j&1) { + d88->tables[exp][i][j] = d88->tables[exp][i][j^1] ^ p; + } else { + d88->tables[exp][i][j] = GF_MULTBY_TWO(d88->tables[exp][i][j>>1]); + } + } + } + for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep); + } + } + return 1; +} + +int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) +{ + switch(mult_type) + { + case GF_MULT_SHIFT: + return 
sizeof(gf_internal_t); + break; + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: + return sizeof(gf_internal_t); + break; + + case GF_MULT_DEFAULT: + + /* Allen: set the *local* arg1 and arg2, just for scratch size purposes, + * then fall through to split table scratch size code. */ + +#if defined(INTEL_SSE4) || defined(ARCH_AARCH64) + if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) { + arg1 = 64; + arg2 = 4; + } else { +#endif + arg1 = 64; + arg2 = 8; +#if defined(INTEL_SSE4) || defined(ARCH_AARCH64) + } +#endif + + case GF_MULT_SPLIT_TABLE: + if (arg1 == 8 && arg2 == 8) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_8_8_data) + 64; + } + if ((arg1 == 16 && arg2 == 64) || (arg2 == 16 && arg1 == 64)) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_16_64_lazy_data) + 64; + } + if ((arg1 == 8 && arg2 == 64) || (arg2 == 8 && arg1 == 64)) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_8_64_lazy_data) + 64; + } + + if ((arg1 == 64 && arg2 == 4) || (arg1 == 4 && arg2 == 64)) { + return sizeof(gf_internal_t) + sizeof(struct gf_split_4_64_lazy_data) + 64; + } + return 0; + case GF_MULT_GROUP: + return sizeof(gf_internal_t) + sizeof(struct gf_w64_group_data) + + sizeof(uint64_t) * (1 << arg1) + + sizeof(uint64_t) * (1 << arg2) + 64; + break; + case GF_MULT_COMPOSITE: + if (arg1 == 2) return sizeof(gf_internal_t) + 64; + return 0; + break; + default: + return 0; + } +} + +int gf_w64_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + + /* Omitting the leftmost 1 as in w=32 */ + + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; /* This shouldn't happen */ + } else { + h->prim_poly = 0x1b; + } + } + + SET_FUNCTION(gf,multiply,w64,NULL) + SET_FUNCTION(gf,divide,w64,NULL) + SET_FUNCTION(gf,inverse,w64,NULL) + SET_FUNCTION(gf,multiply_region,w64,NULL) + + switch(h->mult_type) { + case GF_MULT_CARRY_FREE: if (gf_w64_cfm_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w64_shift_init(gf) == 0) return 0; break; + case GF_MULT_COMPOSITE: if (gf_w64_composite_init(gf) == 0) return 0; break; + case GF_MULT_DEFAULT: + case GF_MULT_SPLIT_TABLE: if (gf_w64_split_init(gf) == 0) return 0; break; + case GF_MULT_GROUP: if (gf_w64_group_init(gf) == 0) return 0; break; + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: if (gf_w64_bytwo_init(gf) == 0) return 0; break; + default: return 0; + } + if (h->divide_type == GF_DIVIDE_EUCLID) { + SET_FUNCTION(gf,divide,w64,gf_w64_divide_from_inverse) + SET_FUNCTION(gf,inverse,w64,gf_w64_euclid) + } + + if (gf->inverse.w64 != NULL && gf->divide.w64 == NULL) { + SET_FUNCTION(gf,divide,w64,gf_w64_divide_from_inverse) + } + if (gf->inverse.w64 == NULL && gf->divide.w64 != NULL) { + SET_FUNCTION(gf,inverse,w64,gf_w64_inverse_from_divide) + } + + if (h->region_type == GF_REGION_CAUCHY) return 0; + + if (h->region_type & GF_REGION_ALTMAP) { + if (h->mult_type == GF_MULT_COMPOSITE) { + SET_FUNCTION(gf,extract_word,w64,gf_w64_composite_extract_word) + } else if (h->mult_type == GF_MULT_SPLIT_TABLE) { + SET_FUNCTION(gf,extract_word,w64,gf_w64_split_extract_word) + } + } else { + SET_FUNCTION(gf,extract_word,w64,gf_w64_extract_word) + } + + return 1; +} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_w8.c 
b/src/erasure-code/jerasure/gf-complete/src/gf_w8.c new file mode 100644 index 000000000..f647a31bf --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_w8.c @@ -0,0 +1,2398 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_w8.c + * + * Routines for 8-bit Galois fields + */ + +#include "gf_int.h" +#include "gf_w8.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "gf_cpu.h" + +#define AB2(ip, am1 ,am2, b, t1, t2) {\ + t1 = (b << 1) & am1;\ + t2 = b & am2; \ + t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \ + b = (t1 ^ (t2 & ip));} + +#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\ + t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \ + t2 = _mm_and_si128(va, m2); \ + t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \ + va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); } + +#define MM_PRINT(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 2) printf(" %02x %02x", blah[15-ii], blah[14-ii]); printf("\n"); } + +static +inline +uint32_t gf_w8_inverse_from_divide (gf_t *gf, uint32_t a) +{ + return gf->divide.w32(gf, 1, a); +} + +static +inline +uint32_t gf_w8_divide_from_inverse (gf_t *gf, uint32_t a, uint32_t b) +{ + b = gf->inverse.w32(gf, b); + return gf->multiply.w32(gf, a, b); +} + +static +inline +uint32_t gf_w8_euclid (gf_t *gf, uint32_t b) +{ + uint32_t e_i, e_im1, e_ip1; + uint32_t d_i, d_im1, d_ip1; + uint32_t y_i, y_im1, y_ip1; + uint32_t c_i; + + if (b == 0) return -1; + e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; + e_i = b; + d_im1 = 8; + for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ; + y_i = 1; + y_im1 = 0; + + while (e_i != 1) { + + e_ip1 = e_im1; + d_ip1 = d_im1; + c_i = 0; + + while (d_ip1 >= d_i) { + c_i ^= (1 << (d_ip1 - d_i)); + e_ip1 ^= (e_i << (d_ip1 - d_i)); + if (e_ip1 == 0) return 0; + while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--; + } + + y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i); + y_im1 = y_i; + y_i = y_ip1; + + e_im1 = e_i; + d_im1 = d_i; + e_i = e_ip1; + d_i = d_ip1; + } + + return y_i; +} + +static +gf_val_32_t gf_w8_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + uint8_t *r8; + + r8 = (uint8_t *) start; + return r8[index]; +} + +static +gf_val_32_t gf_w8_composite_extract_word(gf_t *gf, void *start, int bytes, int index) +{ + int sub_size; + gf_internal_t *h; + uint8_t *r8, *top; + uint8_t a, b; + gf_region_data rd; + + h = (gf_internal_t *) gf->scratch; + gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); + r8 = (uint8_t *) start; + if (r8 + index < (uint8_t *) rd.d_start) return r8[index]; + if (r8 + index >= (uint8_t *) rd.d_top) return r8[index]; + index -= (((uint8_t *) rd.d_start) - r8); + r8 = (uint8_t *) rd.d_start; + top = (uint8_t *) rd.d_top; + sub_size = (top-r8)/2; + + a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index); + b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index); + return (a | (b << 4)); +} + +static +inline +uint32_t gf_w8_matrix (gf_t *gf, uint32_t b) +{ + return gf_bitmatrix_inverse(b, 8, ((gf_internal_t *) (gf->scratch))->prim_poly); +} + + +#if defined(INTEL_SSE4_PCLMUL) +static +inline +gf_val_32_t +gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) +{ + gf_val_32_t rv = 0; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h =
gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); + b = _mm_insert_epi32 (a, b8, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only + have to do the reduction at most twice, because (w-2)/z == 2. Where + z is equal to the number of zeros after the leading 1 + + _mm_clmulepi64_si128 is the carryless multiply operation. Here + _mm_srli_si128 shifts the result to the right by 1 byte. This allows + us to multiply the prim_poly by the leading bits of the result. We + then xor the result of that operation back with the result.*/ + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + + return rv; +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) +static +inline +gf_val_32_t +gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) +{ + gf_val_32_t rv = 0; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); + b = _mm_insert_epi32 (a, b8, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. */ + + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + + return rv; +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) +static +inline +gf_val_32_t +gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) +{ + gf_val_32_t rv = 0; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0); + b = _mm_insert_epi32 (a, b8, 0); + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + /* Do the initial multiply */ + + result = _mm_clmulepi64_si128 (a, b, 0); + + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + + /* Extracts 32 bit value from result. 
*/ + rv = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + + return rv; +} +#endif + + +static +void +gf_w8_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + *d8 ^= gf->multiply.w32(gf, val, *s8); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + *d8 = gf->multiply.w32(gf, val, *s8); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +} + +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, 
_mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +#if defined(INTEL_SSE4_PCLMUL) +static +void +gf_w8_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int + xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + + __m128i a, b; + __m128i result; + __m128i prim_poly; + __m128i w; + gf_internal_t * h = gf->scratch; + + prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL)); + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0); + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } else { + while (d8 < ((uint8_t *) rd.d_top)) { + b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0); + result = _mm_clmulepi64_si128 (a, b, 0); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0); + result = _mm_xor_si128 (result, w); + *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0)); + d8++; + s8++; + } + } + gf_do_final_region_alignment(&rd); +} +#endif + +/* ------------------------------------------------------------ +IMPLEMENTATION: SHIFT: + +JSP: The world's dumbest multiplication algorithm. I only +include it for completeness. It does have the feature that it requires no +extra memory. 
+ */ + +static +inline + uint32_t +gf_w8_shift_multiply (gf_t *gf, uint32_t a8, uint32_t b8) +{ + uint16_t product, i, pp, a, b; + gf_internal_t *h; + + a = a8; + b = b8; + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + product = 0; + + for (i = 0; i < GF_FIELD_WIDTH; i++) { + if (a & (1 << i)) product ^= (b << i); + } + for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) { + if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH)); + } + return product; +} + +static +int gf_w8_cfm_init(gf_t *gf) +{ +#if defined(INTEL_SSE4_PCLMUL) + if (gf_cpu_supports_intel_pclmul) { + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + if ((0xe0 & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_2) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_2) + }else if ((0xc0 & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_3) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_3) + }else if ((0x80 & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_4) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_4) + }else{ + return 0; + } + return 1; + } +#elif defined(ARM_NEON) + if (gf_cpu_supports_arm_neon) { + return gf_w8_neon_cfm_init(gf); + } +#endif + + return 0; + +} + +static +int gf_w8_shift_init(gf_t *gf) +{ + SET_FUNCTION(gf,multiply,w32,gf_w8_shift_multiply) /* The others will be set automatically */ + return 1; +} + +/* ------------------------------------------------------------ +IMPLEMENTATION: LOG_TABLE: + +JSP: Kevin wrote this, and I'm converting it to my structure. +*/ + +static +inline + uint32_t +gf_w8_logzero_multiply (gf_t *gf, uint32_t a, uint32_t b) +{ + struct gf_w8_logzero_table_data *ltd; + + ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private; + return ltd->antilog_tbl[ltd->log_tbl[a] + ltd->log_tbl[b]]; +} + +static +inline + uint32_t +gf_w8_logzero_divide (gf_t *gf, uint32_t a, uint32_t b) +{ + struct gf_w8_logzero_table_data *ltd; + + ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private; + return ltd->div_tbl[ltd->log_tbl[a] - ltd->log_tbl[b]]; +} + +static +inline + uint32_t +gf_w8_logzero_small_multiply (gf_t *gf, uint32_t a, uint32_t b) +{ + struct gf_w8_logzero_small_table_data *std; + + std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private; + if (b == 0) return 0; + return std->antilog_tbl[std->log_tbl[a] + std->log_tbl[b]]; +} + +static +inline + uint32_t +gf_w8_logzero_small_divide (gf_t *gf, uint32_t a, uint32_t b) +{ + struct gf_w8_logzero_small_table_data *std; + + std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private; + return std->div_tbl[std->log_tbl[a] - std->log_tbl[b]]; +} + +static +inline + uint32_t +gf_w8_log_multiply (gf_t *gf, uint32_t a, uint32_t b) +{ + struct gf_w8_logtable_data *ltd; + + ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + return (a == 0 || b == 0) ? 
0 : ltd->antilog_tbl[(unsigned)(ltd->log_tbl[a] + ltd->log_tbl[b])]; +} + +static +inline + uint32_t +gf_w8_log_divide (gf_t *gf, uint32_t a, uint32_t b) +{ + int log_sum = 0; + struct gf_w8_logtable_data *ltd; + + if (a == 0 || b == 0) return 0; + ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + + log_sum = ltd->log_tbl[a] - ltd->log_tbl[b] + (GF_MULT_GROUP_SIZE); + return (ltd->antilog_tbl[log_sum]); +} + +static + uint32_t +gf_w8_log_inverse (gf_t *gf, uint32_t a) +{ + struct gf_w8_logtable_data *ltd; + + ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + return (ltd->inv_tbl[a]); +} + +static + uint32_t +gf_w8_logzero_inverse (gf_t *gf, uint32_t a) +{ + struct gf_w8_logzero_table_data *ltd; + + ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private; + return (ltd->inv_tbl[a]); +} + +static + uint32_t +gf_w8_logzero_small_inverse (gf_t *gf, uint32_t a) +{ + struct gf_w8_logzero_small_table_data *std; + + std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private; + return (std->inv_tbl[a]); +} + +static + void +gf_w8_log_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + int i; + uint8_t lv; + uint8_t *s8, *d8; + struct gf_w8_logtable_data *ltd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private; + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + + lv = ltd->log_tbl[val]; + + if (xor) { + for (i = 0; i < bytes; i++) { + d8[i] ^= (s8[i] == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[s8[i]]]); + } + } else { + for (i = 0; i < bytes; i++) { + d8[i] = (s8[i] == 0 ? 
0 : ltd->antilog_tbl[lv + ltd->log_tbl[s8[i]]]); + } + } +} + +static + void +gf_w8_logzero_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor) +{ + int i; + uint8_t lv; + uint8_t *s8, *d8; + struct gf_w8_logzero_table_data *ltd; + struct gf_w8_logzero_small_table_data *std; + short *log; + uint8_t *alt; + gf_internal_t *h; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + + if (h->arg1 == 1) { + std = (struct gf_w8_logzero_small_table_data *) h->private; + log = std->log_tbl; + alt = std->antilog_tbl; + } else { + ltd = (struct gf_w8_logzero_table_data *) h->private; + log = ltd->log_tbl; + alt = ltd->antilog_tbl; + } + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + + lv = log[val]; + + if (xor) { + for (i = 0; i < bytes; i++) { + d8[i] ^= (alt[lv + log[s8[i]]]); + } + } else { + for (i = 0; i < bytes; i++) { + d8[i] = (alt[lv + log[s8[i]]]); + } + } +} + + static +int gf_w8_log_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_w8_logtable_data *ltd = NULL; + struct gf_w8_logzero_table_data *ztd = NULL; + struct gf_w8_logzero_small_table_data *std = NULL; + uint8_t *alt; + uint8_t *inv; + int i, b; + int check = 0; + + h = (gf_internal_t *) gf->scratch; + if (h->mult_type == GF_MULT_LOG_TABLE) { + ltd = h->private; + alt = ltd->antilog_tbl; + inv = ltd->inv_tbl; + } else if (h->mult_type == GF_MULT_LOG_ZERO) { + std = h->private; + alt = std->antilog_tbl; + std->div_tbl = (alt + 255); + inv = std->inv_tbl; + } else { + ztd = h->private; + alt = ztd->antilog_tbl; + ztd->inv_tbl = (alt + 512 + 256); + ztd->div_tbl = (alt + 255); + inv = ztd->inv_tbl; + } + + for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++) { + if (h->mult_type == GF_MULT_LOG_TABLE) + ltd->log_tbl[i] = 0; + else if (h->mult_type == GF_MULT_LOG_ZERO) + std->log_tbl[i] = 0; + else + ztd->log_tbl[i] = 0; + } + + if (h->mult_type == GF_MULT_LOG_TABLE) { + ltd->log_tbl[0] = 0; + } else if (h->mult_type == GF_MULT_LOG_ZERO) { + std->log_tbl[0] = 510; + } else { + ztd->log_tbl[0] = 512; + } + + b = 1; + for (i = 0; i < GF_MULT_GROUP_SIZE; i++) { + if (h->mult_type == GF_MULT_LOG_TABLE) { + if (ltd->log_tbl[b] != 0) check = 1; + ltd->log_tbl[b] = i; + } else if (h->mult_type == GF_MULT_LOG_ZERO) { + if (std->log_tbl[b] != 0) check = 1; + std->log_tbl[b] = i; + } else { + if (ztd->log_tbl[b] != 0) check = 1; + ztd->log_tbl[b] = i; + } + alt[i] = b; + alt[i+GF_MULT_GROUP_SIZE] = b; + b <<= 1; + if (b & GF_FIELD_SIZE) { + b = b ^ h->prim_poly; + } + } + if (check) { + _gf_errno = GF_E_LOGPOLY; + return 0; + } + + if (h->mult_type == GF_MULT_LOG_ZERO) bzero(alt+510, 255); + + if (h->mult_type == GF_MULT_LOG_ZERO_EXT) { + bzero(alt+512, 255); + alt[512+512] = 0; + } + + inv[0] = 0; /* Not really, but we need to fill it with something */ + i = 1; + b = GF_MULT_GROUP_SIZE; + do { + inv[i] = alt[b]; + i <<= 1; + if (i & (1 << 8)) i ^= h->prim_poly; + b--; + } while (i != 1); + + if (h->mult_type == GF_MULT_LOG_TABLE) { + SET_FUNCTION(gf,inverse,w32,gf_w8_log_inverse) + SET_FUNCTION(gf,divide,w32,gf_w8_log_divide) + SET_FUNCTION(gf,multiply,w32,gf_w8_log_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_log_multiply_region) + } else if (h->mult_type == GF_MULT_LOG_ZERO) { + SET_FUNCTION(gf,inverse,w32,gf_w8_logzero_small_inverse) + SET_FUNCTION(gf,divide,w32,gf_w8_logzero_small_divide) + SET_FUNCTION(gf,multiply,w32,gf_w8_logzero_small_multiply) + 
SET_FUNCTION(gf,multiply_region,w32,gf_w8_logzero_multiply_region) + } else { + SET_FUNCTION(gf,inverse,w32,gf_w8_logzero_inverse) + SET_FUNCTION(gf,divide,w32,gf_w8_logzero_divide) + SET_FUNCTION(gf,multiply,w32,gf_w8_logzero_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_logzero_multiply_region) + } + return 1; +} + +/* ------------------------------------------------------------ +IMPLEMENTATION: FULL_TABLE: + +JSP: Kevin wrote this, and I'm converting it to my structure. + */ + +static + gf_val_32_t +gf_w8_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w8_single_table_data *ftd; + + ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private; + return (ftd->multtable[a][b]); +} + +static + gf_val_32_t +gf_w8_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w8_single_table_data *ftd; + + ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private; + return (ftd->divtable[a][b]); +} + +static + gf_val_32_t +gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w8_default_data *ftd; + + ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private; + return (ftd->multtable[a][b]); +} + +#if defined(INTEL_SSSE3) || defined(ARM_NEON) +static + gf_val_32_t +gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w8_default_data *ftd; + + ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private; + return (ftd->divtable[a][b]); +} +#endif + +static + gf_val_32_t +gf_w8_double_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w8_double_table_data *ftd; + + ftd = (struct gf_w8_double_table_data *) ((gf_internal_t *) gf->scratch)->private; + return (ftd->mult[a][b]); +} + +static + gf_val_32_t +gf_w8_double_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w8_double_table_data *ftd; + + ftd = (struct gf_w8_double_table_data *) ((gf_internal_t *) gf->scratch)->private; + return (ftd->div[a][b]); +} + +static + void +gf_w8_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint16_t *base; + uint32_t b, c, vc, vb; + gf_internal_t *h; + struct gf_w8_double_table_data *dtd; + struct gf_w8_double_table_lazy_data *ltd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) (gf->scratch); + if (h->region_type & GF_REGION_LAZY) { + ltd = (struct gf_w8_double_table_lazy_data *) h->private; + base = ltd->mult; + for (b = 0; b < GF_FIELD_SIZE; b++) { + vb = (ltd->smult[val][b] << 8); + for (c = 0; c < GF_FIELD_SIZE; c++) { + vc = ltd->smult[val][c]; + base[(b << 8)| c] = (vb | vc); + } + } + + } else { + dtd = (struct gf_w8_double_table_data *) h->private; + base = &(dtd->mult[val][0]); + } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + gf_two_byte_region_table_multiply(&rd, base); + gf_do_final_region_alignment(&rd); +} + +static + gf_val_32_t +gf_w8_double_table_lazy_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w8_double_table_lazy_data *ftd; + + ftd = (struct gf_w8_double_table_lazy_data *) ((gf_internal_t *) gf->scratch)->private; + return (ftd->smult[a][b]); +} + +static + gf_val_32_t +gf_w8_double_table_lazy_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w8_double_table_lazy_data *ftd; + + ftd = (struct gf_w8_double_table_lazy_data *) 
((gf_internal_t *) gf->scratch)->private; + return (ftd->div[a][b]); +} + +static + void +gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int i; + uint8_t *s8, *d8; + struct gf_w8_single_table_data *ftd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private; + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + + if (xor) { + for (i = 0; i < bytes; i++) { + d8[i] ^= ftd->multtable[s8[i]][val]; + } + } else { + for (i = 0; i < bytes; i++) { + d8[i] = ftd->multtable[s8[i]][val]; + } + } +} + +#ifdef INTEL_SSSE3 +static + void +gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint8_t *bh, *bl, *sptr, *dptr; + __m128i loset, t1, r, va, mth, mtl; + struct gf_w8_half_table_data *htd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + bh = (uint8_t *) htd->high; + bh += (val << 4); + bl = (uint8_t *) htd->low; + bl += (val << 4); + + sptr = rd.s_start; + dptr = rd.d_start; + + mth = _mm_loadu_si128 ((__m128i *)(bh)); + mtl = _mm_loadu_si128 ((__m128i *)(bl)); + loset = _mm_set1_epi8 (0x0f); + + if (xor) { + while (sptr < (uint8_t *) rd.s_top) { + va = _mm_load_si128 ((__m128i *)(sptr)); + t1 = _mm_and_si128 (loset, va); + r = _mm_shuffle_epi8 (mtl, t1); + va = _mm_srli_epi64 (va, 4); + t1 = _mm_and_si128 (loset, va); + r = _mm_xor_si128 (r, _mm_shuffle_epi8 (mth, t1)); + va = _mm_load_si128 ((__m128i *)(dptr)); + r = _mm_xor_si128 (r, va); + _mm_store_si128 ((__m128i *)(dptr), r); + dptr += 16; + sptr += 16; + } + } else { + while (sptr < (uint8_t *) rd.s_top) { + va = _mm_load_si128 ((__m128i *)(sptr)); + t1 = _mm_and_si128 (loset, va); + r = _mm_shuffle_epi8 (mtl, t1); + va = _mm_srli_epi64 (va, 4); + t1 = _mm_and_si128 (loset, va); + r = _mm_xor_si128 (r, _mm_shuffle_epi8 (mth, t1)); + _mm_store_si128 ((__m128i *)(dptr), r); + dptr += 16; + sptr += 16; + } + } + + gf_do_final_region_alignment(&rd); +} +#endif + + +/* ------------------------------------------------------------ +IMPLEMENTATION: FULL_TABLE: + */ + +static + gf_val_32_t +gf_w8_split_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + struct gf_w8_half_table_data *htd; + htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) gf->scratch)->private; + + return htd->high[b][a>>4] ^ htd->low[b][a&0xf]; +} + +static + void +gf_w8_split_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int i; + uint8_t *s8, *d8; + struct gf_w8_half_table_data *htd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) gf->scratch)->private; + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + + if (xor) { + for (i = 0; i < bytes; i++) { + d8[i] ^= (htd->high[val][s8[i]>>4] ^ htd->low[val][s8[i]&0xf]); + } + } else { + for (i = 0; i < bytes; i++) { + d8[i] = (htd->high[val][s8[i]>>4] ^ htd->low[val][s8[i]&0xf]); + } + } +} + + + static +int gf_w8_split_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_w8_half_table_data 
*htd; + int a, b; + + h = (gf_internal_t *) gf->scratch; + htd = (struct gf_w8_half_table_data *)h->private; + + bzero(htd->high, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE); + bzero(htd->low, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE); + + for (a = 1; a < GF_FIELD_SIZE; a++) { + for (b = 1; b < GF_HALF_SIZE; b++) { + htd->low[a][b] = gf_w8_shift_multiply(gf,a,b); + htd->high[a][b] = gf_w8_shift_multiply(gf,a,b<<4); + } + } + + SET_FUNCTION(gf,multiply,w32,gf_w8_split_multiply) + + #if defined(INTEL_SSSE3) + if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse) + } else { + #elif defined(ARM_NEON) + if (gf_cpu_supports_arm_neon && !(h->region_type & GF_REGION_NOSIMD)) { + gf_w8_neon_split_init(gf); + } else { + #endif + SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region) + if(h->region_type & GF_REGION_SIMD) + return 0; + #if defined(INTEL_SSSE3) || defined(ARM_NEON) + } + #endif + + return 1; +} + +/* JSP: This is disgusting, but it is what it is. If there is no SSE, + then the default is equivalent to single table. If there is SSE, then + we use the "gf_w8_default_data" which is a hybrid of SPLIT & TABLE. */ + +static +int gf_w8_table_init(gf_t *gf) +{ + gf_internal_t *h; + struct gf_w8_single_table_data *ftd = NULL; + struct gf_w8_double_table_data *dtd = NULL; + struct gf_w8_double_table_lazy_data *ltd = NULL; + struct gf_w8_default_data *dd = NULL; + int a, b, c, prod, scase; + + h = (gf_internal_t *) gf->scratch; + + if (h->mult_type == GF_MULT_DEFAULT && + (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) { + dd = (struct gf_w8_default_data *)h->private; + scase = 3; + bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE); + bzero(dd->low, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE); + bzero(dd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + bzero(dd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + } else if (h->mult_type == GF_MULT_DEFAULT || + h->region_type == 0 || (h->region_type & GF_REGION_CAUCHY)) { + ftd = (struct gf_w8_single_table_data *)h->private; + bzero(ftd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + bzero(ftd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + scase = 0; + } else if (h->region_type == GF_REGION_DOUBLE_TABLE) { + dtd = (struct gf_w8_double_table_data *)h->private; + bzero(dtd->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + bzero(dtd->mult, sizeof(uint16_t) * GF_FIELD_SIZE * GF_FIELD_SIZE * GF_FIELD_SIZE); + scase = 1; + } else if (h->region_type == (GF_REGION_DOUBLE_TABLE | GF_REGION_LAZY)) { + ltd = (struct gf_w8_double_table_lazy_data *)h->private; + bzero(ltd->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + bzero(ltd->smult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE); + scase = 2; + } else { + fprintf(stderr, "Internal error in gf_w8_table_init\n"); + assert(0); + } + + for (a = 1; a < GF_FIELD_SIZE; a++) { + for (b = 1; b < GF_FIELD_SIZE; b++) { + prod = gf_w8_shift_multiply(gf,a,b); + switch (scase) { + case 0: + ftd->multtable[a][b] = prod; + ftd->divtable[prod][b] = a; + break; + case 1: + dtd->div[prod][b] = a; + for (c = 0; c < GF_FIELD_SIZE; c++) { + dtd->mult[a][(c<<8)|b] |= prod; + dtd->mult[a][(b<<8)|c] |= (prod<<8); + } + break; + case 2: + ltd->div[prod][b] = a; + ltd->smult[a][b] = prod; + break; + case 3: + dd->multtable[a][b] = prod; + dd->divtable[prod][b] = a; + if ((b & 0xf) == b) { dd->low[a][b] = prod; } + if ((b & 
0xf0) == b) { dd->high[a][b>>4] = prod; } + break; + } + } + } + + SET_FUNCTION(gf,inverse,w32,NULL) /* Will set from divide */ + switch (scase) { + case 0: + SET_FUNCTION(gf,divide,w32,gf_w8_table_divide) + SET_FUNCTION(gf,multiply,w32,gf_w8_table_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_table_multiply_region) + break; + case 1: + SET_FUNCTION(gf,divide,w32,gf_w8_double_table_divide) + SET_FUNCTION(gf,multiply,w32,gf_w8_double_table_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_double_table_multiply_region) + break; + case 2: + SET_FUNCTION(gf,divide,w32,gf_w8_double_table_lazy_divide) + SET_FUNCTION(gf,multiply,w32,gf_w8_double_table_lazy_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_double_table_multiply_region) + break; + case 3: +#if defined(INTEL_SSSE3) || defined(ARM_NEON) + if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) { + SET_FUNCTION(gf,divide,w32,gf_w8_default_divide) + SET_FUNCTION(gf,multiply,w32,gf_w8_default_multiply) +#if defined(INTEL_SSSE3) + if (gf_cpu_supports_intel_ssse3) { + SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse) + } +#elif defined(ARM_NEON) + if (gf_cpu_supports_arm_neon) { + gf_w8_neon_split_init(gf); + } +#endif + } +#endif + break; + } + return 1; +} + +static + void +gf_w8_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint8_t val0 = val & 0x0f; + uint8_t val1 = (val & 0xf0) >> 4; + gf_region_data rd; + int sub_reg_size; + + if (val == 0) { + if (xor) return; + bzero(dest, bytes); + return; + } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + sub_reg_size = ((uint8_t *)rd.d_top - (uint8_t *)rd.d_start) / 2; + + base_gf->multiply_region.w32(base_gf, rd.s_start, rd.d_start, val0, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, (uint8_t *)rd.s_start+sub_reg_size, rd.d_start, val1, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, rd.s_start, (uint8_t *)rd.d_start+sub_reg_size, val1, sub_reg_size, xor); + base_gf->multiply_region.w32(base_gf, (uint8_t *)rd.s_start+sub_reg_size, (uint8_t *)rd.d_start+sub_reg_size, val0, sub_reg_size, 1); + base_gf->multiply_region.w32(base_gf, (uint8_t *)rd.s_start+sub_reg_size, (uint8_t *)rd.d_start+sub_reg_size, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); + + gf_do_final_region_alignment(&rd); +} + +static +gf_val_32_t +gf_w8_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint8_t b0 = b & 0x0f; + uint8_t b1 = (b & 0xf0) >> 4; + uint8_t a0 = a & 0x0f; + uint8_t a1 = (a & 0xf0) >> 4; + uint8_t a1b1; + + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + return ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4)); +} + +static +gf_val_32_t +gf_w8_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + uint8_t b0 = b & 0x0f; + uint8_t b1 = (b & 0xf0) >> 4; + uint8_t a0 = a & 0x0f; + uint8_t a1 = (a & 0xf0) >> 4; + uint8_t a1b1, *mt; + struct gf_w8_composite_data *cd; + + cd = (struct gf_w8_composite_data *) h->private; + mt = cd->mult_table; + + a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1); + + return 
((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W4_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4)); +} + +/* + * Composite field division trick (explained in 2007 tech report) + * + * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1 + * + * let c = b^-1 + * + * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0) + * + * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1 + * + * let d = b1c1 and d+1 = b0c0 + * + * solve s*b1c1+b1c0+b0c1 = 0 + * + * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1 + * + * c0 = (d+1)b0^-1 + * c1 = d*b1^-1 + * + * a / b = a * c + */ + +static +gf_val_32_t +gf_w8_composite_inverse(gf_t *gf, gf_val_32_t a) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint8_t a0 = a & 0x0f; + uint8_t a1 = (a & 0xf0) >> 4; + uint8_t c0, c1, c, d, tmp; + uint8_t a0inv, a1inv; + + if (a0 == 0) { + a1inv = base_gf->inverse.w32(base_gf, a1) & 0xf; + c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); + c1 = a1inv; + } else if (a1 == 0) { + c0 = base_gf->inverse.w32(base_gf, a0); + c1 = 0; + } else { + a1inv = base_gf->inverse.w32(base_gf, a1) & 0xf; + a0inv = base_gf->inverse.w32(base_gf, a0) & 0xf; + + d = base_gf->multiply.w32(base_gf, a1, a0inv) & 0xf; + + tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly) & 0xf; + tmp = base_gf->inverse.w32(base_gf, tmp) & 0xf; + + d = base_gf->multiply.w32(base_gf, d, tmp) & 0xf; + + c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv) & 0xf; + c1 = base_gf->multiply.w32(base_gf, d, a1inv) & 0xf; + } + + c = c0 | (c1 << 4); + + return c; +} + +static +void +gf_w8_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + gf_t *base_gf = h->base_gf; + uint8_t b0 = val & 0x0f; + uint8_t b1 = (val & 0xf0) >> 4; + uint8_t *s8; + uint8_t *d8; + uint8_t *mt; + uint8_t a0, a1, a1b1; + struct gf_w8_composite_data *cd; + + cd = (struct gf_w8_composite_data *) h->private; + + if (val == 0) { + if (xor) return; + bzero(dest, bytes); + return; + } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1); + gf_do_initial_region_alignment(&rd); + + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + mt = cd->mult_table; + if (mt == NULL) { + if (xor) { + while (d8 < (uint8_t *) rd.d_top) { + a0 = *s8 & 0x0f; + a1 = (*s8 & 0xf0) >> 4; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + *d8 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4)); + s8++; + d8++; + } + } else { + while (d8 < (uint8_t *) rd.d_top) { + a0 = *s8 & 0x0f; + a1 = (*s8 & 0xf0) >> 4; + a1b1 = base_gf->multiply.w32(base_gf, a1, b1); + + *d8 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | + ((base_gf->multiply.w32(base_gf, a1, b0) ^ + base_gf->multiply.w32(base_gf, a0, b1) ^ + base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4)); + s8++; + d8++; + } + } + } else { + if (xor) { + while (d8 < (uint8_t *) rd.d_top) { + a0 = *s8 & 0x0f; + a1 = (*s8 & 0xf0) >> 4; + a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1); + + *d8 ^= ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W4_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4)); + s8++; + d8++; + } + } else { + while (d8 < (uint8_t *) 
rd.d_top) { + a0 = *s8 & 0x0f; + a1 = (*s8 & 0xf0) >> 4; + a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1); + + *d8 = ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) | + ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^ + GF_W4_INLINE_MULTDIV(mt, a0, b1) ^ + GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4)); + s8++; + d8++; + } + } + } + gf_do_final_region_alignment(&rd); + return; +} + +static +int gf_w8_composite_init(gf_t *gf) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + struct gf_w8_composite_data *cd; + + if (h->base_gf == NULL) return 0; + + cd = (struct gf_w8_composite_data *) h->private; + cd->mult_table = gf_w4_get_mult_table(h->base_gf); + + if (h->region_type & GF_REGION_ALTMAP) { + SET_FUNCTION(gf,multiply_region,w32,gf_w8_composite_multiply_region_alt) + } else { + SET_FUNCTION(gf,multiply_region,w32,gf_w8_composite_multiply_region) + } + + if (cd->mult_table == NULL) { + SET_FUNCTION(gf,multiply,w32,gf_w8_composite_multiply_recursive) + } else { + SET_FUNCTION(gf,multiply,w32,gf_w8_composite_multiply_inline) + } + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,inverse,w32,gf_w8_composite_inverse) + + return 1; +} + +static +inline + gf_val_32_t +gf_w8_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t prod, pp, pmask, amask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + + prod = 0; + pmask = 0x80; + amask = 0x80; + + while (amask != 0) { + if (prod & pmask) { + prod = ((prod << 1) ^ pp); + } else { + prod <<= 1; + } + if (a & amask) prod ^= b; + amask >>= 1; + } + return prod; +} + +static +inline + gf_val_32_t +gf_w8_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t prod, pp, bmask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + prod = 0; + bmask = 0x80; + + while (1) { + if (a & 1) prod ^= b; + a >>= 1; + if (a == 0) return prod; + if (b & bmask) { + b = ((b << 1) ^ pp); + } else { + b <<= 1; + } + } +} + +static + void +gf_w8_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, t1, t2, ta, prod, amask; + gf_region_data rd; + struct gf_w8_bytwo_data *btd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); + gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + + if (xor) { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = 0x80; + ta = *s64; + while (amask != 0) { + AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 ^= prod; + d64++; + s64++; + } + } else { + while (s64 < (uint64_t *) rd.s_top) { + prod = 0; + amask = 0x80; + ta = *s64; + while (amask != 0) { + AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2); + if (val & amask) prod ^= ta; + amask >>= 1; + } + *d64 = prod; + d64++; + s64++; + } + } + gf_do_final_region_alignment(&rd); +} + +#define BYTWO_P_ONESTEP {\ + SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ + t1 = _mm_and_si128(v, one); \ + t1 = _mm_sub_epi8(t1, one); \ + t1 = _mm_and_si128(t1, ta); \ + prod = _mm_xor_si128(prod, t1); \ + v = _mm_srli_epi64(v, 1); } + +#ifdef INTEL_SSE2 +static + void +gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int i; + uint8_t *s8, *d8; + 
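+  /* vrev below is val bit-reversed and complemented: BYTWO_P_ONESTEP
+     consumes one multiplier bit per step from the low end of v, and
+     its subtract trick turns a 0 bit into an all-ones lane, so the
+     complement makes prod accumulate ta exactly when the original
+     bit of val was 1. */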
uint8_t vrev; + __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; + struct gf_w8_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + vrev = 0; + for (i = 0; i < 8; i++) { + vrev <<= 1; + if (!(val & (1 << i))) vrev |= 1; + } + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + m2 = _mm_set1_epi8((btd->mask2)&0xff); + one = _mm_set1_epi8(1); + + while (d8 < (uint8_t *) rd.d_top) { + prod = _mm_setzero_si128(); + v = _mm_set1_epi8(vrev); + ta = _mm_load_si128((__m128i *) s8); + tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8); + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + BYTWO_P_ONESTEP; + _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp)); + d8 += 16; + s8 += 16; + } + gf_do_final_region_alignment(&rd); +} +#endif + +#ifdef INTEL_SSE2 +static + void +gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + m2 = _mm_set1_epi8((btd->mask2)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, m2, va, t1, t2); + _mm_store_si128((__m128i *)d8, va); + d8 += 16; + s8 += 16; + } +} +#endif + +#ifdef INTEL_SSE2 +static + void +gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd) +{ + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va, vb; + + s8 = (uint8_t *) rd->s_start; + d8 = (uint8_t *) rd->d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + m2 = _mm_set1_epi8((btd->mask2)&0xff); + + while (d8 < (uint8_t *) rd->d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + SSE_AB2(pp, m1, m2, va, t1, t2); + vb = _mm_load_si128 ((__m128i *)(d8)); + vb = _mm_xor_si128(vb, va); + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } +} +#endif + + +#ifdef INTEL_SSE2 +static + void +gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + int itb; + uint8_t *d8, *s8; + __m128i pp, m1, m2, t1, t2, va, vb; + struct gf_w8_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + + if (val == 2) { + if (xor) { + gf_w8_bytwo_b_sse_region_2_xor(&rd, btd); + } else { + gf_w8_bytwo_b_sse_region_2_noxor(&rd, btd); + } + gf_do_final_region_alignment(&rd); + return; + } + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + pp = _mm_set1_epi8(btd->prim_poly&0xff); + m1 = _mm_set1_epi8((btd->mask1)&0xff); + m2 = _mm_set1_epi8((btd->mask2)&0xff); + + while (d8 < (uint8_t *) rd.d_top) { + va = _mm_load_si128 ((__m128i *)(s8)); + vb = (!xor) ? 
_mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8)); + itb = val; + while (1) { + if (itb & 1) vb = _mm_xor_si128(vb, va); + itb >>= 1; + if (itb == 0) break; + SSE_AB2(pp, m1, m2, va, t1, t2); + } + _mm_store_si128((__m128i *)d8, vb); + d8 += 16; + s8 += 16; + } + + gf_do_final_region_alignment(&rd); +} +#endif + +static + void +gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint64_t *s64, *d64, t1, t2, ta, tb, prod; + struct gf_w8_bytwo_data *btd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private; + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + + switch (val) { + case 2: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 3: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 4: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + case 5: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + break; + case 6: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + break; + /* + case 7: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= 
ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta ^ prod; + d64++; + s64++; + } + } + break; + */ + case 8: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= ta; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = ta; + d64++; + s64++; + } + } + break; + /* + case 9: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 10: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 11: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 12: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 13: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } 
else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 14: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + case 15: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 ^= (ta ^ prod); + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + ta = *s64; + prod = ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + prod ^= ta; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + *d64 = (ta ^ prod); + d64++; + s64++; + } + } + break; + */ + default: + if (xor) { + while (d64 < (uint64_t *) rd.d_top) { + prod = *d64 ; + ta = *s64; + tb = val; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + } + *d64 = prod; + d64++; + s64++; + } + } else { + while (d64 < (uint64_t *) rd.d_top) { + prod = 0 ; + ta = *s64; + tb = val; + while (1) { + if (tb & 1) prod ^= ta; + tb >>= 1; + if (tb == 0) break; + AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2); + } + *d64 = prod; + d64++; + s64++; + } + } + break; + } + gf_do_final_region_alignment(&rd); +} + + static +int gf_w8_bytwo_init(gf_t *gf) +{ + gf_internal_t *h; + uint64_t ip, m1, m2; + struct gf_w8_bytwo_data *btd; + + h = (gf_internal_t *) gf->scratch; + btd = (struct gf_w8_bytwo_data *) (h->private); + ip = h->prim_poly & 0xff; + m1 = 0xfe; + m2 = 0x80; + btd->prim_poly = 0; + btd->mask1 = 0; + btd->mask2 = 0; + + while (ip != 0) { + btd->prim_poly |= ip; + btd->mask1 |= m1; + btd->mask2 |= m2; + ip <<= GF_FIELD_WIDTH; + m1 <<= GF_FIELD_WIDTH; + m2 <<= GF_FIELD_WIDTH; + } + + if (h->mult_type == GF_MULT_BYTWO_p) { + SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_p_multiply) +#ifdef INTEL_SSE2 + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_sse_multiply_region) + } else { +#endif + SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_nosse_multiply_region) + if(h->region_type & GF_REGION_SIMD) + return 0; +#ifdef INTEL_SSE2 + } +#endif + } else { + SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_b_multiply) +#ifdef INTEL_SSE2 + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { + SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_sse_multiply_region) + } else { +#endif + SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_nosse_multiply_region) + if(h->region_type & GF_REGION_SIMD) + 
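+      /* A SIMD region was explicitly requested, but the SSE2 path was
+         not taken, so initialization must fail. */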
return 0; +#ifdef INTEL_SSE2 + } +#endif + } + return 1; +} + + +/* ------------------------------------------------------------ + General procedures. + You don't need to error check here on in init, because it's done + for you in gf_error_check(). + */ + +int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) +{ + switch(mult_type) + { + case GF_MULT_DEFAULT: + if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) { + return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64; + } + return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64; + case GF_MULT_TABLE: + if (region_type == GF_REGION_CAUCHY) { + return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64; + } + + if (region_type == GF_REGION_DEFAULT) { + return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64; + } + if (region_type & GF_REGION_DOUBLE_TABLE) { + if (region_type == GF_REGION_DOUBLE_TABLE) { + return sizeof(gf_internal_t) + sizeof(struct gf_w8_double_table_data) + 64; + } else if (region_type == (GF_REGION_DOUBLE_TABLE | GF_REGION_LAZY)) { + return sizeof(gf_internal_t) + sizeof(struct gf_w8_double_table_lazy_data) + 64; + } else { + return 0; + } + } + return 0; + break; + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: + return sizeof(gf_internal_t) + sizeof(struct gf_w8_bytwo_data); + break; + case GF_MULT_SPLIT_TABLE: + if ((arg1 == 4 && arg2 == 8) || (arg1 == 8 && arg2 == 4)) { + return sizeof(gf_internal_t) + sizeof(struct gf_w8_half_table_data) + 64; + } + break; + case GF_MULT_LOG_TABLE: + return sizeof(gf_internal_t) + sizeof(struct gf_w8_logtable_data) + 64; + break; + case GF_MULT_LOG_ZERO: + return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_small_table_data) + 64; + break; + case GF_MULT_LOG_ZERO_EXT: + return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_table_data) + 64; + break; + case GF_MULT_CARRY_FREE: + return sizeof(gf_internal_t); + break; + case GF_MULT_SHIFT: + return sizeof(gf_internal_t); + break; + case GF_MULT_COMPOSITE: + return sizeof(gf_internal_t) + sizeof(struct gf_w8_composite_data) + 64; + default: + return 0; + } + return 0; +} + +int gf_w8_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + /* Allen: set default primitive polynomial / irreducible polynomial if needed */ + + if (h->prim_poly == 0) { + if (h->mult_type == GF_MULT_COMPOSITE) { + h->prim_poly = gf_composite_get_default_poly(h->base_gf); + if (h->prim_poly == 0) return 0; /* JSP: This shouldn't happen, but just in case. 
*/ + } else { + h->prim_poly = 0x11d; + } + } + if (h->mult_type != GF_MULT_COMPOSITE) { + h->prim_poly |= 0x100; + } + + SET_FUNCTION(gf,multiply,w32,NULL) + SET_FUNCTION(gf,divide,w32,NULL) + SET_FUNCTION(gf,inverse,w32,NULL) + SET_FUNCTION(gf,multiply_region,w32,NULL) + SET_FUNCTION(gf,extract_word,w32,gf_w8_extract_word) + + switch(h->mult_type) { + case GF_MULT_DEFAULT: + case GF_MULT_TABLE: if (gf_w8_table_init(gf) == 0) return 0; break; + case GF_MULT_BYTWO_p: + case GF_MULT_BYTWO_b: if (gf_w8_bytwo_init(gf) == 0) return 0; break; + case GF_MULT_LOG_ZERO: + case GF_MULT_LOG_ZERO_EXT: + case GF_MULT_LOG_TABLE: if (gf_w8_log_init(gf) == 0) return 0; break; + case GF_MULT_CARRY_FREE: if (gf_w8_cfm_init(gf) == 0) return 0; break; + case GF_MULT_SHIFT: if (gf_w8_shift_init(gf) == 0) return 0; break; + case GF_MULT_SPLIT_TABLE: if (gf_w8_split_init(gf) == 0) return 0; break; + case GF_MULT_COMPOSITE: if (gf_w8_composite_init(gf) == 0) return 0; break; + default: return 0; + } + + if (h->divide_type == GF_DIVIDE_EUCLID) { + SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse) + SET_FUNCTION(gf,inverse,w32,gf_w8_euclid) + } else if (h->divide_type == GF_DIVIDE_MATRIX) { + SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse) + SET_FUNCTION(gf,inverse,w32,gf_w8_matrix) + } + + if (gf->divide.w32 == NULL) { + SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse) + if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w8_euclid) + } + + if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w8_inverse_from_divide) + + if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) { + SET_FUNCTION(gf,extract_word,w32,gf_w8_composite_extract_word) + } + + if (h->region_type == GF_REGION_CAUCHY) { + SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region) + SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word) + } + + if (gf->multiply_region.w32 == NULL) { + SET_FUNCTION(gf,multiply_region,w32,gf_w8_multiply_region_from_single) + } + + return 1; +} + + +/* Inline setup functions */ + +uint8_t *gf_w8_get_mult_table(gf_t *gf) +{ + gf_internal_t *h; + struct gf_w8_default_data *ftd; + struct gf_w8_single_table_data *std; + + h = (gf_internal_t *) gf->scratch; + if (gf->multiply.w32 == gf_w8_default_multiply) { + ftd = (struct gf_w8_default_data *) h->private; + return (uint8_t *) ftd->multtable; + } else if (gf->multiply.w32 == gf_w8_table_multiply) { + std = (struct gf_w8_single_table_data *) h->private; + return (uint8_t *) std->multtable; + } + return NULL; +} + +uint8_t *gf_w8_get_div_table(gf_t *gf) +{ + struct gf_w8_default_data *ftd; + struct gf_w8_single_table_data *std; + + if (gf->multiply.w32 == gf_w8_default_multiply) { + ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private; + return (uint8_t *) ftd->divtable; + } else if (gf->multiply.w32 == gf_w8_table_multiply) { + std = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private; + return (uint8_t *) std->divtable; + } + return NULL; +} diff --git a/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c b/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c new file mode 100644 index 000000000..1e3d2e0ce --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/gf_wgen.c @@ -0,0 +1,1019 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. 
 + *
+ * gf_wgen.c
+ *
+ * Routines for Galois fields for general w < 32.  For specific w,
+   like 4, 8, 16, 32, 64 and 128, see the other files.
+ */
+
+#include "gf_int.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+struct gf_wgen_table_w8_data {
+  uint8_t *mult;
+  uint8_t *div;
+  uint8_t base;
+};
+
+struct gf_wgen_table_w16_data {
+  uint16_t *mult;
+  uint16_t *div;
+  uint16_t base;
+};
+
+struct gf_wgen_log_w8_data {
+  uint8_t *log;
+  uint8_t *anti;
+  uint8_t *danti;
+  uint8_t base;
+};
+
+struct gf_wgen_log_w16_data {
+  uint16_t *log;
+  uint16_t *anti;
+  uint16_t *danti;
+  uint16_t base;
+};
+
+struct gf_wgen_log_w32_data {
+  uint32_t *log;
+  uint32_t *anti;
+  uint32_t *danti;
+  uint32_t base;
+};
+
+struct gf_wgen_group_data {
+  uint32_t *reduce;
+  uint32_t *shift;
+  uint32_t mask;
+  uint64_t rmask;
+  int tshift;
+  uint32_t memory;
+};
+
+static
+inline
+gf_val_32_t gf_wgen_inverse_from_divide (gf_t *gf, gf_val_32_t a)
+{
+  return gf->divide.w32(gf, 1, a);
+}
+
+static
+inline
+gf_val_32_t gf_wgen_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  b = gf->inverse.w32(gf, b);
+  return gf->multiply.w32(gf, a, b);
+}
+
+static
+inline
+gf_val_32_t gf_wgen_euclid (gf_t *gf, gf_val_32_t b)
+{
+
+  gf_val_32_t e_i, e_im1, e_ip1;
+  gf_val_32_t d_i, d_im1, d_ip1;
+  gf_val_32_t y_i, y_im1, y_ip1;
+  gf_val_32_t c_i;
+
+  if (b == 0) return -1;
+  e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
+  e_i = b;
+  d_im1 = ((gf_internal_t *) (gf->scratch))->w;
+  for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ;
+  y_i = 1;
+  y_im1 = 0;
+
+  while (e_i != 1) {
+
+    e_ip1 = e_im1;
+    d_ip1 = d_im1;
+    c_i = 0;
+
+    while (d_ip1 >= d_i) {
+      c_i ^= (1 << (d_ip1 - d_i));
+      e_ip1 ^= (e_i << (d_ip1 - d_i));
+      if (e_ip1 == 0) return 0;
+      while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
+    }
+
+    y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);
+    y_im1 = y_i;
+    y_i = y_ip1;
+
+    e_im1 = e_i;
+    d_im1 = d_i;
+    e_i = e_ip1;
+    d_i = d_ip1;
+  }
+
+  return y_i;
+}
+
+gf_val_32_t gf_wgen_extract_word(gf_t *gf, void *start, int bytes, int index)
+{
+  uint8_t *ptr;
+  uint32_t rv;
+  int rs;
+  int byte, bit, i;
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  rs = bytes / h->w;
+  byte = index/8;
+  bit = index%8;
+
+  ptr = (uint8_t *) start;
+  ptr += bytes;
+  ptr -= rs;
+  ptr += byte;
+
+  rv = 0;
+  for (i = 0; i < h->w; i++) {
+    rv <<= 1;
+    if ((*ptr) & (1 << bit)) rv |= 1;
+    ptr -= rs;
+  }
+
+  return rv;
+}
+
+static
+inline
+gf_val_32_t gf_wgen_matrix (gf_t *gf, gf_val_32_t b)
+{
+  return gf_bitmatrix_inverse(b, ((gf_internal_t *) (gf->scratch))->w,
+                              ((gf_internal_t *) (gf->scratch))->prim_poly);
+}
+
+static
+inline
+uint32_t
+gf_wgen_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32)
+{
+  uint64_t product, i, pp, a, b, one;
+  gf_internal_t *h;
+
+  a = a32;
+  b = b32;
+  h = (gf_internal_t *) gf->scratch;
+  one = 1;
+  pp = h->prim_poly | (one << h->w);
+
+  product = 0;
+
+  for (i = 0; i < (uint64_t)h->w; i++) {
+    if (a & (one << i)) product ^= (b << i);
+  }
+  for (i = h->w*2-1; i >= (uint64_t)h->w; i--) {
+    if (product & (one << i)) product ^= (pp << (i-h->w));
+  }
+  return product;
+}
+
+static
+int gf_wgen_shift_init(gf_t *gf)
+{
+  SET_FUNCTION(gf,multiply,w32,gf_wgen_shift_multiply)
+  SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid)
+  return 1;
+}
+
+static
+gf_val_32_t
+gf_wgen_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  uint32_t prod, pp, bmask;
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  pp = h->prim_poly;
+
+  prod = 0;
+  bmask = (1 << (h->w-1));
+
+  while (1) {
+    if (a & 1) prod ^= b;
+ a >>= 1; + if (a == 0) return prod; + if (b & bmask) { + b = ((b << 1) ^ pp); + } else { + b <<= 1; + } + } +} + +static +int gf_wgen_bytwo_b_init(gf_t *gf) +{ + SET_FUNCTION(gf,multiply,w32,gf_wgen_bytwo_b_multiply) + SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid) + return 1; +} + +static +inline +gf_val_32_t +gf_wgen_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + uint32_t prod, pp, pmask, amask; + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + prod = 0; + pmask = (1 << ((h->w)-1)); /*Ben: Had an operator precedence warning here*/ + amask = pmask; + + while (amask != 0) { + if (prod & pmask) { + prod = ((prod << 1) ^ pp); + } else { + prod <<= 1; + } + if (a & amask) prod ^= b; + amask >>= 1; + } + return prod; +} + + +static +int gf_wgen_bytwo_p_init(gf_t *gf) +{ + SET_FUNCTION(gf,multiply,w32,gf_wgen_bytwo_p_multiply) + SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid) + return 1; +} + +static +void +gf_wgen_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h) +{ + uint32_t i; + uint32_t j; + int g_s; + + if (h->mult_type == GF_MULT_DEFAULT) { + g_s = 2; + } else { + g_s = h->arg1; + } + + shift[0] = 0; + + for (i = 1; i < ((uint32_t)1 << g_s); i <<= 1) { + for (j = 0; j < i; j++) shift[i|j] = shift[j]^val; + if (val & (1 << (h->w-1))) { + val <<= 1; + val ^= h->prim_poly; + } else { + val <<= 1; + } + } +} + +static +inline +gf_val_32_t +gf_wgen_group_s_equals_r_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b) +{ + int leftover, rs; + uint32_t p, l, ind, a32; + int bits_left; + int g_s; + int w; + + struct gf_wgen_group_data *gd; + gf_internal_t *h = (gf_internal_t *) gf->scratch; + g_s = h->arg1; + w = h->w; + + gd = (struct gf_wgen_group_data *) h->private; + gf_wgen_group_set_shift_tables(gd->shift, b, h); + + leftover = w % g_s; + if (leftover == 0) leftover = g_s; + + rs = w - leftover; + a32 = a; + ind = a32 >> rs; + a32 <<= leftover; + a32 &= gd->mask; + p = gd->shift[ind]; + + bits_left = rs; + rs = w - g_s; + + while (bits_left > 0) { + bits_left -= g_s; + ind = a32 >> rs; + a32 <<= g_s; + a32 &= gd->mask; + l = p >> rs; + p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s)) & gd->mask; + } + return p; +} + +char *bits(uint32_t v) +{ + char *rv; + int i, j; + + rv = malloc(30); + j = 0; + for (i = 27; i >= 0; i--) { + rv[j] = '0' + ((v & (1 << i)) ? 1 : 0); + j++; + } + rv[j] = '\0'; + return rv; +} +char *bits_56(uint64_t v) +{ + char *rv; + int i, j; + uint64_t one; + + one = 1; + + rv = malloc(60); + j = 0; + for (i = 55; i >= 0; i--) { + rv[j] = '0' + ((v & (one << i)) ? 
1 : 0);
+    j++;
+  }
+  rv[j] = '\0';
+  return rv;
+}
+
+static
+inline
+gf_val_32_t
+gf_wgen_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  int i;
+  int leftover;
+  uint64_t p, l, r;
+  uint32_t a32, ind;
+  int g_s, g_r;
+  struct gf_wgen_group_data *gd;
+  int w;
+
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  if (h->mult_type == GF_MULT_DEFAULT) {
+    g_s = 2;
+    g_r = 8;
+  } else {
+    g_s = h->arg1;
+    g_r = h->arg2;
+  }
+  w = h->w;
+  gd = (struct gf_wgen_group_data *) h->private;
+  gf_wgen_group_set_shift_tables(gd->shift, b, h);
+
+  leftover = w % g_s;
+  if (leftover == 0) leftover = g_s;
+
+  a32 = a;
+  ind = a32 >> (w - leftover);
+  p = gd->shift[ind];
+  p <<= g_s;
+  a32 <<= leftover;
+  a32 &= gd->mask;
+
+  i = (w - leftover);
+  while (i > g_s) {
+    ind = a32 >> (w-g_s);
+    p ^= gd->shift[ind];
+    a32 <<= g_s;
+    a32 &= gd->mask;
+    p <<= g_s;
+    i -= g_s;
+  }
+
+  ind = a32 >> (h->w-g_s);
+  p ^= gd->shift[ind];
+
+  for (i = gd->tshift ; i >= 0; i -= g_r) {
+    l = p & (gd->rmask << i);
+    r = gd->reduce[l >> (i+w)];
+    r <<= (i);
+    p ^= r;
+  }
+  return p & gd->mask;
+}
+
+static
+int gf_wgen_group_init(gf_t *gf)
+{
+  uint32_t i, j, p, index;
+  struct gf_wgen_group_data *gd;
+  gf_internal_t *h = (gf_internal_t *) gf->scratch;
+  uint32_t g_s, g_r;
+
+  if (h->mult_type == GF_MULT_DEFAULT) {
+    g_s = 2;
+    g_r = 8;
+  } else {
+    g_s = h->arg1;
+    g_r = h->arg2;
+  }
+  gd = (struct gf_wgen_group_data *) h->private;
+  gd->shift = &(gd->memory);
+  gd->reduce = gd->shift + (1 << g_s);
+  gd->mask = (h->w != 31) ? ((1 << h->w)-1) : 0x7fffffff;
+
+  gd->rmask = (1 << g_r) - 1;
+  gd->rmask <<= h->w;
+
+  gd->tshift = h->w % g_s;
+  if (gd->tshift == 0) gd->tshift = g_s;
+  gd->tshift = (h->w - gd->tshift);
+  gd->tshift = ((gd->tshift-1)/g_r) * g_r;
+
+  gd->reduce[0] = 0;
+  for (i = 0; i < ((uint32_t)1 << g_r); i++) {
+    p = 0;
+    index = 0;
+    for (j = 0; j < g_r; j++) {
+      if (i & (1 << j)) {
+        p ^= (h->prim_poly << j);
+        index ^= (h->prim_poly >> (h->w-j));
+      }
+    }
+    gd->reduce[index] = (p & gd->mask);
+  }
+
+  if (g_s == g_r) {
+    SET_FUNCTION(gf,multiply,w32,gf_wgen_group_s_equals_r_multiply)
+  } else {
+    SET_FUNCTION(gf,multiply,w32,gf_wgen_group_multiply)
+  }
+  SET_FUNCTION(gf,divide,w32,NULL)
+  SET_FUNCTION(gf,divide,w32,NULL)
+  return 1;
+}
+
+
+static
+gf_val_32_t
+gf_wgen_table_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_table_w8_data *std;
+
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_table_w8_data *) h->private;
+
+  return (std->mult[(a<<h->w)+b]);
+}
+
+static
+gf_val_32_t
+gf_wgen_table_8_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_table_w8_data *std;
+
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_table_w8_data *) h->private;
+
+  return (std->div[(a<<h->w)+b]);
+}
+
+static
+int gf_wgen_table_8_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  int w;
+  struct gf_wgen_table_w8_data *std;
+  uint32_t a, b, p;
+
+  h = (gf_internal_t *) gf->scratch;
+  w = h->w;
+  std = (struct gf_wgen_table_w8_data *) h->private;
+
+  std->mult = &(std->base);
+  std->div = std->mult + ((1<<w)*(1<<w));
+
+  for (a = 0; a < ((uint32_t)1 << w); a++) {
+    std->mult[a] = 0;
+    std->mult[a<<w] = 0;
+    std->div[a] = 0;
+    std->div[a<<w] = 0;
+  }
+
+  for (a = 1; a < ((uint32_t)1 << w); a++) {
+    for (b = 1; b < ((uint32_t)1 << w); b++) {
+      p = gf_wgen_shift_multiply(gf, a, b);
+      std->mult[(a<<w)|b] = p;
+      std->div[(p<<w)|a] = b;
+    }
+  }
+
+  SET_FUNCTION(gf,multiply,w32,gf_wgen_table_8_multiply)
+  SET_FUNCTION(gf,divide,w32,gf_wgen_table_8_divide)
+  return 1;
+}
+
+static
+gf_val_32_t
+gf_wgen_table_16_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_table_w16_data *std;
+
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_table_w16_data *) h->private;
+
+  return (std->mult[(a<<h->w)+b]);
+}
+
+static
+gf_val_32_t
+gf_wgen_table_16_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_table_w16_data *std;
+
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_table_w16_data *) h->private;
+
+  return (std->div[(a<<h->w)+b]);
+}
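+
+/* Layout sketch for these table methods: mult and div are flat
+   (1<<w) x (1<<w) arrays carved out of one allocation, indexed as
+   t[(a << w) | b].  With p = a*b, init records div[(p << w) | a] = b,
+   so p/a reads straight back out of div. */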
+
+static
+int gf_wgen_table_16_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  int w;
+  struct gf_wgen_table_w16_data *std;
+  uint32_t a, b, p;
+
+  h = (gf_internal_t *) gf->scratch;
+  w = h->w;
+  std = (struct gf_wgen_table_w16_data *) h->private;
+
+  std->mult = &(std->base);
+  std->div = std->mult + ((1<<w)*(1<<w));
+
+  for (a = 0; a < ((uint32_t)1 << w); a++) {
+    std->mult[a] = 0;
+    std->mult[a<<w] = 0;
+    std->div[a] = 0;
+    std->div[a<<w] = 0;
+  }
+
+  for (a = 1; a < ((uint32_t)1 << w); a++) {
+    for (b = 1; b < ((uint32_t)1 << w); b++) {
+      p = gf_wgen_shift_multiply(gf, a, b);
+      std->mult[(a<<w)|b] = p;
+      std->div[(p<<w)|a] = b;
+    }
+  }
+
+  SET_FUNCTION(gf,multiply,w32,gf_wgen_table_16_multiply)
+  SET_FUNCTION(gf,divide,w32,gf_wgen_table_16_divide)
+  return 1;
+}
+
+static
+int gf_wgen_table_init(gf_t *gf)
+{
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  if (h->w <= 8) return gf_wgen_table_8_init(gf);
+  if (h->w <= 14) return gf_wgen_table_16_init(gf);
+
+  /* Returning zero to make the compiler happy, but this won't get
+     executed, because it is tested in _scratch_space. */
+
+  return 0;
+}
+
+static
+gf_val_32_t
+gf_wgen_log_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_log_w8_data *std;
+
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_log_w8_data *) h->private;
+
+  if (a == 0 || b == 0) return 0;
+  return (std->anti[std->log[a]+std->log[b]]);
+}
+
+static
+gf_val_32_t
+gf_wgen_log_8_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_log_w8_data *std;
+  int index;
+
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_log_w8_data *) h->private;
+
+  if (a == 0 || b == 0) return 0;
+  index = std->log[a];
+  index -= std->log[b];
+
+  return (std->danti[index]);
+}
+
+static
+int gf_wgen_log_8_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_wgen_log_w8_data *std;
+  int w;
+  uint32_t a, i;
+  int check = 0;
+
+  h = (gf_internal_t *) gf->scratch;
+  w = h->w;
+  std = (struct gf_wgen_log_w8_data *) h->private;
+
+  std->log = &(std->base);
+  std->anti = std->log + (1<<w);
+  std->danti = std->anti + (1<<w)-1;
+
+  for (i = 0; i < ((uint32_t)1 << w); i++)
+    std->log[i] = 0;
+
+  a = 1;
+  for(i=0; i < ((uint32_t)1<<w)-1; i++)
+  {
+    if (std->log[a] != 0) check = 1;
+    std->log[a] = i;
+    std->anti[i] = a;
+    std->danti[i] = a;
+    a <<= 1;
+    if(a & (1<<w))
+      a ^= h->prim_poly;
+    //a &= ((1 << w)-1);
+  }
+
+  if (check != 0) {
+    _gf_errno = GF_E_LOGPOLY;
+    return 0;
+  }
+
+  SET_FUNCTION(gf,multiply,w32,gf_wgen_log_8_multiply)
+  SET_FUNCTION(gf,divide,w32,gf_wgen_log_8_divide)
+  return 1;
+}
+
+static
+gf_val_32_t
+gf_wgen_log_16_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_log_w16_data *std;
+
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_log_w16_data *) h->private;
+
+  if (a == 0 || b == 0) return 0;
+  return (std->anti[std->log[a]+std->log[b]]);
+}
+
+static
+gf_val_32_t
+gf_wgen_log_16_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_log_w16_data *std;
+  int index;
+
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_log_w16_data *) h->private;
+
+  if (a == 0 || b == 0) return 0;
+  index = std->log[a];
+  index -= std->log[b];
+
+  return (std->danti[index]);
+}
+
+static
+int gf_wgen_log_16_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_wgen_log_w16_data *std;
+  int w;
+  uint32_t a, i;
+  int check = 0;
+
+  h = (gf_internal_t *) gf->scratch;
+  w = h->w;
+  std = (struct gf_wgen_log_w16_data *) h->private;
+
+  std->log = &(std->base);
+  std->anti = std->log + (1<<w);
+  std->danti = std->anti + (1<<w)-1;
+
+  for (i = 0; i < ((uint32_t)1 << w); i++)
+    std->log[i] = 0;
+
+  a = 1;
+  for(i=0; i < ((uint32_t)1<<w)-1; i++)
+  {
+    if (std->log[a] != 0) check = 1;
+    std->log[a] = i;
+    std->anti[i] = a;
+    std->danti[i] = a;
+    a <<= 1;
+    if(a & (1<<w))
+      a ^= h->prim_poly;
+    //a &= ((1 << w)-1);
+  }
+
+  if (check) {
+    if (h->mult_type != GF_MULT_LOG_TABLE) return gf_wgen_shift_init(gf);
+    _gf_errno = GF_E_LOGPOLY;
+    return 0;
+  }
+
+  SET_FUNCTION(gf,multiply,w32,gf_wgen_log_16_multiply)
+  SET_FUNCTION(gf,divide,w32,gf_wgen_log_16_divide)
+  return 1;
+}
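+
+/* All three log variants share one layout: anti holds the (1<<w)-1
+   powers of the generator, and danti is anti offset by (1<<w)-1, so
+   the possibly negative difference log[a] - log[b] in divide indexes
+   danti directly with no explicit modular correction. */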
+
+static
+gf_val_32_t
+gf_wgen_log_32_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_log_w32_data *std;
+
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_log_w32_data *) h->private;
+
+  if (a == 0 || b == 0) return 0;
+  return (std->anti[std->log[a]+std->log[b]]);
+}
+
+static
+gf_val_32_t
+gf_wgen_log_32_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
+{
+  gf_internal_t *h;
+  struct gf_wgen_log_w32_data *std;
+  int index;
+
+  h = (gf_internal_t *) gf->scratch;
+  std = (struct gf_wgen_log_w32_data *) h->private;
+
+  if (a == 0 || b == 0) return 0;
+  index = std->log[a];
+  index -= std->log[b];
+
+  return (std->danti[index]);
+}
+
+static
+int gf_wgen_log_32_init(gf_t *gf)
+{
+  gf_internal_t *h;
+  struct gf_wgen_log_w32_data *std;
+  int w;
+  uint32_t a, i;
+  int check = 0;
+
+  h = (gf_internal_t *) gf->scratch;
+  w = h->w;
+  std = (struct gf_wgen_log_w32_data *) h->private;
+
+  std->log = &(std->base);
+  std->anti = std->log + (1<<h->w);
+  std->danti = std->anti + (1<<h->w)-1;
+
+  for (i = 0; i < ((uint32_t)1 << w); i++)
+    std->log[i] = 0;
+
+  a = 1;
+  for (i = 0; i < ((uint32_t)1<<w)-1; i++) {
+    if (std->log[a] != 0) check = 1;
+    std->log[a] = i;
+    std->anti[i] = a;
+    std->danti[i] = a;
+    a <<= 1;
+    if (a & (1<<w)) a ^= h->prim_poly;
+    //a &= ((1 << w)-1);
+  }
+
+  if (check != 0) {
+    _gf_errno = GF_E_LOGPOLY;
+    return 0;
+  }
+
+  SET_FUNCTION(gf,multiply,w32,gf_wgen_log_32_multiply)
+  SET_FUNCTION(gf,divide,w32,gf_wgen_log_32_divide)
+  return 1;
+}
+
+static
+int gf_wgen_log_init(gf_t *gf)
+{
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  if (h->w <= 8) return gf_wgen_log_8_init(gf);
+  if (h->w <= 16) return gf_wgen_log_16_init(gf);
+  if (h->w <= 32) return gf_wgen_log_32_init(gf);
+
+  /* Returning zero to make the compiler happy, but this won't get
+     executed, because it is tested in _scratch_space. */
+
+  return 0;
+}
+
+int gf_wgen_scratch_size(int w, int mult_type, int region_type, int divide_type, int arg1, int arg2)
+{
+
+  switch(mult_type)
+  {
+    case GF_MULT_DEFAULT:
+      if (w <= 8) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_table_w8_data) +
+               sizeof(uint8_t)*(1 << w)*(1<<w)*2 + 64;
+      } else if (w <= 16) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_log_w16_data) +
+               sizeof(uint16_t)*(1 << w)*3 + 64;
+      } else {
+        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_bytwo_data);
+      }
+    case GF_MULT_SHIFT:
+      return sizeof(gf_internal_t);
+    case GF_MULT_BYTWO_b:
+    case GF_MULT_BYTWO_p:
+      return sizeof(gf_internal_t) + sizeof(struct gf_wgen_bytwo_data);
+    case GF_MULT_GROUP:
+      return sizeof(gf_internal_t) + sizeof(struct gf_wgen_group_data) +
+             sizeof(uint32_t) * (1 << arg1) +
+             sizeof(uint32_t) * (1 << arg2) + 64;
+    case GF_MULT_TABLE:
+      if (w <= 8) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_table_w8_data) +
+               sizeof(uint8_t)*(1 << w)*(1<<w)*2 + 64;
+      } else if (w <= 14) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_table_w16_data) +
+               sizeof(uint16_t)*(1 << w)*(1<<w)*2 + 64;
+      }
+      return 0;
+    case GF_MULT_LOG_TABLE:
+      if (w <= 8) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_log_w8_data) +
+               sizeof(uint8_t)*(1 << w)*3 + 64;
+      } else if (w <= 16) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_log_w16_data) +
+               sizeof(uint16_t)*(1 << w)*3 + 64;
+      } else if (w <= 27) {
+        return sizeof(gf_internal_t) + sizeof(struct gf_wgen_log_w32_data) +
+               sizeof(uint32_t)*(1 << w)*3 + 64;
+      }
+      return 0;
+    default:
+      return 0;
+  }
+}
+
+static
+void
+gf_wgen_cauchy_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
+{
+  int i, j;
+  int rs;
+  uint32_t written;
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  rs = bytes / (h->w);
+
+  written = (xor) ? 0xffffffff : 0;
+  for (i = 0; i < h->w; i++) {
+    for (j = 0; j < h->w; j++) {
+      if (val & (1 << j)) {
+        gf_multby_one(src, ((uint8_t *)dest) + j*rs, rs, (written & (1 << j)));
+        written |= (1 << j);
+      }
+    }
+    src = (uint8_t *)src + rs;
+    val = gf->multiply.w32(gf, val, 2);
+  }
+}
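gf_wgen_cauchy_region treats val as a w x w bit-matrix over GF(2): source row i is XORed into destination row j whenever bit j of val*2^i is set, which is why the loop keeps doubling val. A simplified sketch of that idea for w=4, ignoring the alignment and accumulated-xor bookkeeping of the real code (cauchy_region and xtimes are illustrative names, not library API):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define W    4
#define PRIM 0x13               /* w=4: octal 023 */

/* GF(2^4) multiply-by-two, the role of gf->multiply.w32(gf, val, 2) above. */
static uint32_t xtimes(uint32_t v)
{
  v <<= 1;
  if (v & (1 << W)) v ^= PRIM;
  return v;
}

/* Treat src/dest as W rows of rs bytes each; dest row j accumulates src
 * row i whenever bit j of val*2^i is set. */
static void cauchy_region(const uint8_t *src, uint8_t *dest, uint32_t val, int bytes)
{
  int rs = bytes / W, i, j, k;
  memset(dest, 0, bytes);
  for (i = 0; i < W; i++) {
    for (j = 0; j < W; j++) {
      if (val & (1 << j))
        for (k = 0; k < rs; k++) dest[j*rs + k] ^= src[i*rs + k];
    }
    src += rs;
    val = xtimes(val);
  }
}

int main(void)
{
  uint8_t in[16] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
  uint8_t out[16];
  cauchy_region(in, out, 3, sizeof in);   /* multiply the whole region by 3 */
  printf("%02x %02x\n", out[0], out[4]);
  return 0;
}

The real implementation replaces the inner byte loop with gf_multby_one (a wide memxor/memcpy) and uses the written mask so the first contribution to a row is a copy rather than an XOR into uninitialized memory.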
+
+int gf_wgen_init(gf_t *gf)
+{
+  gf_internal_t *h;
+
+  h = (gf_internal_t *) gf->scratch;
+  if (h->prim_poly == 0) {
+    switch (h->w) {
+      case 1: h->prim_poly = 1; break;
+      case 2: h->prim_poly = 7; break;
+      case 3: h->prim_poly = 013; break;
+      case 4: h->prim_poly = 023; break;
+      case 5: h->prim_poly = 045; break;
+      case 6: h->prim_poly = 0103; break;
+      case 7: h->prim_poly = 0211; break;
+      case 8: h->prim_poly = 0435; break;
+      case 9: h->prim_poly = 01021; break;
+      case 10: h->prim_poly = 02011; break;
+      case 11: h->prim_poly = 04005; break;
+      case 12: h->prim_poly = 010123; break;
+      case 13: h->prim_poly = 020033; break;
+      case 14: h->prim_poly = 042103; break;
+      case 15: h->prim_poly = 0100003; break;
+      case 16: h->prim_poly = 0210013; break;
+      case 17: h->prim_poly = 0400011; break;
+      case 18: h->prim_poly = 01000201; break;
+      case 19: h->prim_poly = 02000047; break;
+      case 20: h->prim_poly = 04000011; break;
+      case 21: h->prim_poly = 010000005; break;
+      case 22: h->prim_poly = 020000003; break;
+      case 23: h->prim_poly = 040000041; break;
+      case 24: h->prim_poly = 0100000207; break;
+      case 25: h->prim_poly = 0200000011; break;
+      case 26: h->prim_poly = 0400000107; break;
+      case 27: h->prim_poly = 01000000047; break;
+      case 28: h->prim_poly = 02000000011; break;
+      case 29: h->prim_poly = 04000000005; break;
+      case 30: h->prim_poly = 010040000007; break;
+      case 31: h->prim_poly = 020000000011; break;
+      case 32: h->prim_poly = 00020000007; break;
+      default: fprintf(stderr, "gf_wgen_init: w not defined yet\n"); exit(1);
+    }
+  } else {
+    if (h->w == 32) {
+      h->prim_poly &= 0xffffffff;
+    } else {
+      h->prim_poly |= (1 << h->w);
+      if (h->prim_poly & ~((1ULL<<(h->w+1))-1)) return 0;
+    }
+  }
+
+  SET_FUNCTION(gf,multiply,w32,NULL)
+  SET_FUNCTION(gf,divide,w32,NULL)
+  SET_FUNCTION(gf,inverse,w32,NULL)
+  SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
+  SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
+
+  switch(h->mult_type) {
+    case GF_MULT_DEFAULT:
+      if (h->w <= 8) {
+        if (gf_wgen_table_init(gf) == 0) return 0;
+      } else if (h->w <= 16) {
+        if (gf_wgen_log_init(gf) == 0) return 0;
+      } else {
+        if (gf_wgen_bytwo_p_init(gf) == 0) return 0;
+      }
+      break;
+    case GF_MULT_SHIFT:     if (gf_wgen_shift_init(gf) == 0) return 0; break;
+    case GF_MULT_BYTWO_b:   if (gf_wgen_bytwo_b_init(gf) == 0) return 0; break;
+    case GF_MULT_BYTWO_p:   if (gf_wgen_bytwo_p_init(gf) == 0) return 0; break;
+    case GF_MULT_GROUP:     if (gf_wgen_group_init(gf) == 0) return 0; break;
+    case GF_MULT_TABLE:     if (gf_wgen_table_init(gf) == 0) return 0; break;
+    case GF_MULT_LOG_TABLE: if (gf_wgen_log_init(gf) == 0) return 0; break;
+    default: return 0;
+  }
+  if (h->divide_type == GF_DIVIDE_EUCLID) {
+    SET_FUNCTION(gf,divide,w32,gf_wgen_divide_from_inverse)
+    SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid)
+  } else if (h->divide_type == GF_DIVIDE_MATRIX) {
+    SET_FUNCTION(gf,divide,w32,gf_wgen_divide_from_inverse)
+    SET_FUNCTION(gf,inverse,w32,gf_wgen_matrix)
+  }
+
+  if (gf->inverse.w32 == NULL && gf->divide.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_wgen_euclid)
+
+  if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) {
+    SET_FUNCTION(gf,divide,w32,gf_wgen_divide_from_inverse)
+  }
+  if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) {
+    SET_FUNCTION(gf,inverse,w32,gf_wgen_inverse_from_divide)
+  }
+  return 1;
+}
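Since gf_wgen backs every word size that has no specialized gf_w*.c implementation, callers normally reach it through the public API rather than calling gf_wgen_init directly. A minimal usage sketch (w=13 is an arbitrary width with no dedicated implementation, so gf_init_easy routes it through the code above; the operand values are arbitrary):

#include <stdint.h>
#include <stdio.h>
#include "gf_complete.h"

int main(void)
{
  gf_t gf;

  /* Picks GF_MULT_DEFAULT for w=13, i.e. the wgen log tables above. */
  if (!gf_init_easy(&gf, 13)) {
    fprintf(stderr, "gf_init_easy failed\n");
    return 1;
  }

  uint32_t c = gf.multiply.w32(&gf, 0x1234 & 0x1fff, 0x0abc);
  printf("product = 0x%x\n", c);

  gf_free(&gf, 0);
  return 0;
}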
diff --git a/src/erasure-code/jerasure/gf-complete/src/neon/gf_w16_neon.c b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w16_neon.c
new file mode 100644
index 000000000..477ee6359
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w16_neon.c
@@ -0,0 +1,276 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * Copyright (c) 2014: Janne Grunau <j@jannau.net>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  - Neither the name of the University of Tennessee nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * gf_w16_neon.c
+ *
+ * Neon routines for 16-bit Galois fields
+ *
+ */
+
+#include "gf_int.h"
+#include <arm_neon.h>
+#include "gf_w16.h"
+
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
+                                       vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
+static
+inline
+void
+neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
+                                 uint16_t *d_end, uint8_t *tbl,
+                                 gf_val_32_t val, int xor)
+{
+  unsigned i;
+  uint8_t *high = tbl + 4 * 16;
+  uint8x16_t loset, rl, rh;
+  uint8x16x2_t va;
+
+#ifdef ARCH_AARCH64
+  uint8x16_t tbl_h[4], tbl_l[4];
+  for (i = 0; i < 4; i++) {
+    tbl_l[i] = vld1q_u8(tbl + i*16);
+    tbl_h[i] = vld1q_u8(high + i*16);
+  }
+#else
+  uint8x8x2_t tbl_h[4], tbl_l[4];
+  for (i = 0; i < 4; i++) {
+    tbl_l[i].val[0] = vld1_u8(tbl + i*16);
+    tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
+    tbl_h[i].val[0] = vld1_u8(high + i*16);
+    tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+  }
+#endif
+
+  loset = vdupq_n_u8(0xf);
+
+  if (xor) {
+    uint8x16x2_t vb;
+    while (dst < d_end) {
+      va = vld2q_u8((uint8_t*)src);
+      vb = vld2q_u8((uint8_t*)dst);
+
+      rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
+      rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset)));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset)));
+
+      va.val[0] = vshrq_n_u8(va.val[0], 4);
+      va.val[1] = vshrq_n_u8(va.val[1], 4);
+
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
+      va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
+      va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
+
+      va.val[0] = veorq_u8(va.val[0], vb.val[0]);
+      va.val[1] = veorq_u8(va.val[1], vb.val[1]);
+      vst2q_u8((uint8_t*)dst, va);
+
+      src += 16;
+      dst += 16;
+    }
+  } else {
+    while (dst < d_end) {
+      va = vld2q_u8((uint8_t*)src);
+
+      rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
+      rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset)));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset)));
+
+      va.val[0] = vshrq_n_u8(va.val[0], 4);
+      va.val[1] = vshrq_n_u8(va.val[1], 4);
+
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
+      va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
+      va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
+
+      vst2q_u8((uint8_t*)dst, va);
+
+      src += 16;
+      dst += 16;
+    }
+  }
+}
+
+static
+inline
+void
+neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
+                                        uint8_t *dst, uint8_t *d_end,
+                                        uint8_t *tbl, gf_val_32_t val,
+                                        int xor)
+{
+  unsigned i;
+  uint8_t *high = tbl + 4 * 16;
+  uint8x16_t vh, vl, rh, rl;
+  uint8x16_t loset;
+
+#ifdef ARCH_AARCH64
+  uint8x16_t tbl_h[4], tbl_l[4];
+#else
+  uint8x8x2_t tbl_h[4], tbl_l[4];
+#endif
+  for (i = 0; i < 4; i++) {
+#ifdef ARCH_AARCH64
+    tbl_l[i] = vld1q_u8(tbl + i*16);
+    tbl_h[i] = vld1q_u8(high + i*16);
+#else
+    tbl_l[i].val[0] = vld1_u8(tbl + i*16);
+    tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
+    tbl_h[i].val[0] = vld1_u8(high + i*16);
+    tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+#endif
+  }
+
+  loset = vdupq_n_u8(0xf);
+
+  while (dst < d_end) {
+    vh = vld1q_u8(src);
+    vl = vld1q_u8(src + 16);
+
+    rl = vqtbl1q_u8(tbl_l[0], vandq_u8(vl, loset));
+    rh = vqtbl1q_u8(tbl_h[0], vandq_u8(vl, loset));
+    rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(vh, loset)));
+    rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(vh,
loset))); + + vl = vshrq_n_u8(vl, 4); + vh = vshrq_n_u8(vh, 4); + + rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], vl)); + rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], vl)); + rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], vh)); + rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], vh)); + + if (xor) { + vh = vld1q_u8(dst); + vl = vld1q_u8(dst + 16); + rh = veorq_u8(rh, vh); + rl = veorq_u8(rl, vl); + } + vst1q_u8(dst, rh); + vst1q_u8(dst + 16, rl); + + src += 32; + dst += 32; + } +} + + + +static +inline +void +neon_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, + gf_val_32_t val, int bytes, int xor, + int altmap) +{ + gf_region_data rd; + unsigned i, j; + uint64_t c, prod; + uint8_t tbl[2 * 4 * 16]; + uint8_t *high = tbl + 4 * 16; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + for (i = 0; i < 4; i++) { + for (j = 0; j < 16; j++) { + c = (j << (i*4)); + prod = gf->multiply.w32(gf, c, val); + tbl[i*16 + j] = prod & 0xff; + high[i*16 + j] = prod >> 8; + } + } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); + gf_do_initial_region_alignment(&rd); + + if (altmap) { + uint8_t *s8 = rd.s_start; + uint8_t *d8 = rd.d_start; + uint8_t *end8 = rd.d_top; + if (xor) + neon_w16_split_4_altmap_multiply_region(gf, s8, d8, end8, tbl, val, 1); + else + neon_w16_split_4_altmap_multiply_region(gf, s8, d8, end8, tbl, val, 0); + } else { + uint16_t *s16 = rd.s_start; + uint16_t *d16 = rd.d_start; + uint16_t *end16 = rd.d_top; + if (xor) + neon_w16_split_4_multiply_region(gf, s16, d16, end16, tbl, val, 1); + else + neon_w16_split_4_multiply_region(gf, s16, d16, end16, tbl, val, 0); + } + + gf_do_final_region_alignment(&rd); +} + +static +void +gf_w16_split_4_16_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest, + gf_val_32_t val, int bytes, int xor) +{ + neon_w16_split_4_16_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0); +} + +static +void +gf_w16_split_4_16_lazy_altmap_multiply_region_neon(gf_t *gf, void *src, + void *dest, + gf_val_32_t val, int bytes, + int xor) +{ + neon_w16_split_4_16_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1); +} + + +void gf_w16_neon_split_init(gf_t *gf) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + if (h->region_type & GF_REGION_ALTMAP) + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_altmap_multiply_region_neon) + else + SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region_neon) +} diff --git a/src/erasure-code/jerasure/gf-complete/src/neon/gf_w32_neon.c b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w32_neon.c new file mode 100644 index 000000000..7fd13290e --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w32_neon.c @@ -0,0 +1,269 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * Copyright (c) 2014: Janne Grunau + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * gf_w32_neon.c + * + * Neon routines for 32-bit Galois fields + * + */ + + +#include "gf_int.h" +#include +#include +#include "gf_w32.h" + +#ifndef ARCH_AARCH64 +#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \ + vtbl2_u8(tbl, vget_high_u8(v))) +#endif + +static +void +neon_w32_split_4_32_multiply_region(gf_t *gf, uint32_t *src, uint32_t *dst, + uint32_t *d_end, uint8_t btable[8][4][16], + uint32_t val, int xor, int altmap) +{ + int i, j; +#ifdef ARCH_AARCH64 + uint8x16_t tables[8][4]; +#else + uint8x8x2_t tables[8][4]; +#endif + uint32x4_t v0, v1, v2, v3, s0, s1, s2, s3; + uint8x16_t p0, p1, p2, p3, si, mask1; + uint16x8x2_t r0, r1; + uint8x16x2_t q0, q1; + + for (i = 0; i < 8; i++) { + for (j = 0; j < 4; j++) { +#ifdef ARCH_AARCH64 + tables[i][j] = vld1q_u8(btable[i][j]); +#else + tables[i][j].val[0] = vld1_u8(btable[i][j]); + tables[i][j].val[1] = vld1_u8(btable[i][j] + 8); +#endif + } + } + + mask1 = vdupq_n_u8(0xf); + + while (dst < d_end) { + + v0 = vld1q_u32(src); src += 4; + v1 = vld1q_u32(src); src += 4; + v2 = vld1q_u32(src); src += 4; + v3 = vld1q_u32(src); src += 4; + + if (altmap) { + q0.val[0] = vreinterpretq_u8_u32(v0); + q0.val[1] = vreinterpretq_u8_u32(v1); + q1.val[0] = vreinterpretq_u8_u32(v2); + q1.val[1] = vreinterpretq_u8_u32(v3); + } else { + r0 = vtrnq_u16(vreinterpretq_u16_u32(v0), vreinterpretq_u16_u32(v2)); + r1 = vtrnq_u16(vreinterpretq_u16_u32(v1), vreinterpretq_u16_u32(v3)); + + q0 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[0]), + vreinterpretq_u8_u16(r1.val[0])); + q1 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[1]), + vreinterpretq_u8_u16(r1.val[1])); + } + + si = vandq_u8(q0.val[0], mask1); + p0 = vqtbl1q_u8(tables[0][0], si); + p1 = vqtbl1q_u8(tables[0][1], si); + p2 = vqtbl1q_u8(tables[0][2], si); + p3 = vqtbl1q_u8(tables[0][3], si); + + si = vshrq_n_u8(q0.val[0], 4); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[1][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[1][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[1][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[1][3], si)); + + si = vandq_u8(q0.val[1], mask1); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[2][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[2][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[2][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[2][3], si)); + + si = 
vshrq_n_u8(q0.val[1], 4); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[3][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[3][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[3][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[3][3], si)); + + si = vandq_u8(q1.val[0], mask1); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[4][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[4][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[4][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[4][3], si)); + + si = vshrq_n_u8(q1.val[0], 4); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[5][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[5][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[5][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[5][3], si)); + + si = vandq_u8(q1.val[1], mask1); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[6][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[6][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[6][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[6][3], si)); + + si = vshrq_n_u8(q1.val[1], 4); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[7][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[7][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[7][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[7][3], si)); + + if (altmap) { + s0 = vreinterpretq_u32_u8(p0); + s1 = vreinterpretq_u32_u8(p1); + s2 = vreinterpretq_u32_u8(p2); + s3 = vreinterpretq_u32_u8(p3); + } else { + q0 = vtrnq_u8(p0, p1); + q1 = vtrnq_u8(p2, p3); + + r0 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[0]), + vreinterpretq_u16_u8(q1.val[0])); + r1 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[1]), + vreinterpretq_u16_u8(q1.val[1])); + + s0 = vreinterpretq_u32_u16(r0.val[0]); + s1 = vreinterpretq_u32_u16(r1.val[0]); + s2 = vreinterpretq_u32_u16(r0.val[1]); + s3 = vreinterpretq_u32_u16(r1.val[1]); + } + + if (xor) { + v0 = vld1q_u32(dst); + v1 = vld1q_u32(dst + 4); + v2 = vld1q_u32(dst + 8); + v3 = vld1q_u32(dst + 12); + s0 = veorq_u32(s0, v0); + s1 = veorq_u32(s1, v1); + s2 = veorq_u32(s2, v2); + s3 = veorq_u32(s3, v3); + } + + vst1q_u32(dst, s0); + vst1q_u32(dst + 4, s1); + vst1q_u32(dst + 8, s2); + vst1q_u32(dst + 12, s3); + + dst += 16; + } +} + +static +inline +void +neon_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor, int altmap) +{ + gf_internal_t *h; + int i, j, k; + uint32_t pp, v, *s32, *d32, *top, tmp_table[16]; + uint8_t btable[8][4][16]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64); + gf_do_initial_region_alignment(&rd); + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + v = val; + for (i = 0; i < 8; i++) { + tmp_table[0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + tmp_table[k^j] = (v ^ tmp_table[k]); + } + v = (v & GF_FIRST_BIT) ? 
((v << 1) ^ pp) : (v << 1); + } + for (j = 0; j < 4; j++) { + for (k = 0; k < 16; k++) { + btable[i][j][k] = (uint8_t) tmp_table[k]; + tmp_table[k] >>= 8; + } + } + } + + if (xor) + neon_w32_split_4_32_multiply_region(gf, s32, d32, top, btable, val, 1, altmap); + else + neon_w32_split_4_32_multiply_region(gf, s32, d32, top, btable, val, 0, altmap); + + gf_do_final_region_alignment(&rd); +} + +static +void +gf_w32_split_4_32_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest, + gf_val_32_t val, int bytes, int xor) +{ + neon_w32_split_4_32_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0); +} + +static +void +gf_w32_split_4_32_lazy_altmap_multiply_region_neon(gf_t *gf, void *src, + void *dest, gf_val_32_t val, + int bytes, int xor) +{ + neon_w32_split_4_32_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1); +} + +void gf_w32_neon_split_init(gf_t *gf) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + if (h->region_type & GF_REGION_ALTMAP) + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_altmap_multiply_region_neon) + else + SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_multiply_region_neon) + +} diff --git a/src/erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c new file mode 100644 index 000000000..5f35c8634 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c @@ -0,0 +1,247 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * Copyright (c) 2014: Janne Grunau + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * gf_w4_neon.c + * + * Neon routines for 4-bit Galois fields + * + */ + +#include "gf_int.h" +#include +#include +#include "gf_w4.h" + +static +gf_val_32_t +gf_w4_neon_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4) +{ + gf_val_32_t rv = 0; + poly8x8_t result, prim_poly; + poly8x8_t a, b, w; + uint8x8_t v; + gf_internal_t * h = gf->scratch; + + a = vdup_n_p8 (a4); + b = vdup_n_p8 (b4); + + prim_poly = vdup_n_p8 ((uint32_t)(h->prim_poly & 0x1fULL)); + + /* Do the initial multiply */ + result = vmul_p8 (a, b); + v = vshr_n_u8 (vreinterpret_u8_p8(result), 4); + w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v)); + result = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(result), vreinterpret_u8_p8(w))); + + /* Extracts 32 bit value from result. */ + rv = (gf_val_32_t)vget_lane_u8 (vreinterpret_u8_p8 (result), 0); + + return rv; +} + +static inline void +neon_clm_multiply_region_from_single (gf_t *gf, uint8_t *s8, uint8_t *d8, + gf_val_32_t val, uint8_t *d_end, int xor) +{ + gf_internal_t * h = gf->scratch; + poly8x8_t prim_poly; + poly8x8_t a, w, even, odd; + uint8x8_t b, c, v, mask; + + a = vdup_n_p8 (val); + mask = vdup_n_u8 (0xf); + prim_poly = vdup_n_p8 ((uint8_t)(h->prim_poly & 0x1fULL)); + + while (d8 < d_end) { + b = vld1_u8 (s8); + + even = vreinterpret_p8_u8 (vand_u8 (b, mask)); + odd = vreinterpret_p8_u8 (vshr_n_u8 (b, 4)); + + if (xor) + c = vld1_u8 (d8); + + even = vmul_p8 (a, even); + odd = vmul_p8 (a, odd); + + v = vshr_n_u8 (vreinterpret_u8_p8(even), 4); + w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v)); + even = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(even), vreinterpret_u8_p8(w))); + + v = vshr_n_u8 (vreinterpret_u8_p8(odd), 4); + w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v)); + odd = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(odd), vreinterpret_u8_p8(w))); + + v = veor_u8 (vreinterpret_u8_p8 (even), vshl_n_u8 (vreinterpret_u8_p8 (odd), 4)); + + if (xor) + v = veor_u8 (c, v); + + vst1_u8 (d8, v); + + d8 += 8; + s8 += 8; + } +} + + +static void +gf_w4_neon_clm_multiply_region_from_single (gf_t *gf, void *src, void *dest, + gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) + neon_clm_multiply_region_from_single (gf, s8, d8, val, rd.d_top, 1); + else + neon_clm_multiply_region_from_single (gf, s8, d8, val, rd.d_top, 0); + + gf_do_final_region_alignment(&rd); +} + +#ifndef ARCH_AARCH64 +#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \ + vtbl2_u8(tbl, vget_high_u8(v))) +#endif + +static +inline +void +w4_single_table_multiply_region_neon(gf_t *gf, uint8_t *src, uint8_t *dst, + uint8_t * d_end, gf_val_32_t val, int xor) +{ + struct gf_single_table_data *std; + uint8_t *base; + uint8x16_t r, va, vh, vl, loset; + +#ifdef ARCH_AARCH64 + uint8x16_t th, tl; +#else + uint8x8x2_t th, tl; +#endif + + std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private; + base = (uint8_t *) std->mult; + base += (val << GF_FIELD_WIDTH); + +#ifdef ARCH_AARCH64 + tl = vld1q_u8 (base); + th = vshlq_n_u8 (tl, 4); +#else + tl.val[0] = vld1_u8 (base); + tl.val[1] = vld1_u8 (base + 8); + th.val[0] = vshl_n_u8 (tl.val[0], 4); + th.val[1] = vshl_n_u8 (tl.val[1], 4); +#endif + + loset = 
vdupq_n_u8(0xf); + + while (dst < d_end) { + va = vld1q_u8 (src); + + vh = vshrq_n_u8 (va, 4); + vl = vandq_u8 (va, loset); + + if (xor) + va = vld1q_u8 (dst); + + vh = vqtbl1q_u8 (th, vh); + vl = vqtbl1q_u8 (tl, vl); + + r = veorq_u8 (vh, vl); + + if (xor) + r = veorq_u8 (va, r); + + vst1q_u8 (dst, r); + + dst += 16; + src += 16; + } +} + +static +void +gf_w4_single_table_multiply_region_neon(gf_t *gf, void *src, void *dest, + gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint8_t *sptr, *dptr, *top; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + sptr = rd.s_start; + dptr = rd.d_start; + top = rd.d_top; + + if (xor) + w4_single_table_multiply_region_neon(gf, sptr, dptr, top, val, 1); + else + w4_single_table_multiply_region_neon(gf, sptr, dptr, top, val, 0); + + gf_do_final_region_alignment(&rd); + +} + + +int gf_w4_neon_cfm_init(gf_t *gf) +{ + // single clm multiplication probably pointless + SET_FUNCTION(gf,multiply,w32,gf_w4_neon_clm_multiply) + SET_FUNCTION(gf,multiply_region,w32,gf_w4_neon_clm_multiply_region_from_single) + + return 1; +} + +void gf_w4_neon_single_table_init(gf_t *gf) +{ + SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region_neon) +} diff --git a/src/erasure-code/jerasure/gf-complete/src/neon/gf_w64_neon.c b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w64_neon.c new file mode 100644 index 000000000..24098232e --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w64_neon.c @@ -0,0 +1,333 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * Copyright (c) 2014: Janne Grunau + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * gf_w64_neon.c + * + * Neon routines for 64-bit Galois fields + * + */ + +#include "gf_int.h" +#include +#include +#include "gf_w64.h" + + +#ifndef ARCH_AARCH64 +#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \ + vtbl2_u8(tbl, vget_high_u8(v))) +#endif + +static +inline +void +neon_w64_split_4_lazy_altmap_multiply_region(gf_t *gf, uint64_t *src, + uint64_t *dst, uint64_t *d_end, + uint64_t val, int xor) +{ + unsigned i, j, k; + uint8_t btable[16]; +#ifdef ARCH_AARCH64 + uint8x16_t tables[16][8]; +#else + uint8x8x2_t tables[16][8]; +#endif + uint8x16_t p[8], mask1, si; + + gf_internal_t *h = (gf_internal_t *) gf->scratch; + struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private; + + for (i = 0; i < 16; i++) { + for (j = 0; j < 8; j++) { + for (k = 0; k < 16; k++) { + btable[k] = (uint8_t) ld->tables[i][k]; + ld->tables[i][k] >>= 8; + } +#ifdef ARCH_AARCH64 + tables[i][j] = vld1q_u8(btable); +#else + tables[i][j].val[0] = vld1_u8(btable); + tables[i][j].val[1] = vld1_u8(btable + 8); +#endif + } + } + + mask1 = vdupq_n_u8(0xf); + + while (dst < d_end) { + + if (xor) { + for (i = 0; i < 8; i++) + p[i] = vld1q_u8((uint8_t *) (dst + i * 2)); + } else { + for (i = 0; i < 8; i++) + p[i] = vdupq_n_u8(0); + } + + i = 0; + for (k = 0; k < 8; k++) { + uint8x16_t v0 = vld1q_u8((uint8_t *) src); + src += 2; + + si = vandq_u8(v0, mask1); + for (j = 0; j < 8; j++) { + p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si)); + } + i++; + si = vshrq_n_u8(v0, 4); + for (j = 0; j < 8; j++) { + p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si)); + } + i++; + + } + for (i = 0; i < 8; i++) { + vst1q_u8((uint8_t *) dst, p[i]); + dst += 2; + } + } +} + +static +inline +void +neon_w64_split_4_lazy_multiply_region(gf_t *gf, uint64_t *src, uint64_t *dst, + uint64_t *d_end, uint64_t val, int xor) +{ + unsigned i, j, k; + uint8_t btable[16]; +#ifdef ARCH_AARCH64 + uint8x16_t tables[16][8]; +#else + uint8x8x2_t tables[16][8]; +#endif + uint8x16_t p[8], mask1, si; + uint64x2_t st[8]; + uint32x4x2_t s32[4]; + uint16x8x2_t s16[4]; + uint8x16x2_t s8[4]; + + gf_internal_t *h = (gf_internal_t *) gf->scratch; + struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private; + + for (i = 0; i < 16; i++) { + for (j = 0; j < 8; j++) { + for (k = 0; k < 16; k++) { + btable[k] = (uint8_t) ld->tables[i][k]; + ld->tables[i][k] >>= 8; + } +#ifdef ARCH_AARCH64 + tables[i][j] = vld1q_u8(btable); +#else + tables[i][j].val[0] = vld1_u8(btable); + tables[i][j].val[1] = vld1_u8(btable + 8); +#endif + } + } + + mask1 = vdupq_n_u8(0xf); + + while (dst < d_end) { + + for (k = 0; k < 8; k++) { + st[k] = vld1q_u64(src); + src += 2; + p[k] = vdupq_n_u8(0); + } + + s32[0] = vuzpq_u32(vreinterpretq_u32_u64(st[0]), + vreinterpretq_u32_u64(st[1])); + s32[1] = vuzpq_u32(vreinterpretq_u32_u64(st[2]), + vreinterpretq_u32_u64(st[3])); + s32[2] = vuzpq_u32(vreinterpretq_u32_u64(st[4]), + vreinterpretq_u32_u64(st[5])); + s32[3] = vuzpq_u32(vreinterpretq_u32_u64(st[6]), + vreinterpretq_u32_u64(st[7])); + + s16[0] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[0]), + vreinterpretq_u16_u32(s32[1].val[0])); + s16[1] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[0]), + vreinterpretq_u16_u32(s32[3].val[0])); + s16[2] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[1]), + vreinterpretq_u16_u32(s32[1].val[1])); + s16[3] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[1]), + vreinterpretq_u16_u32(s32[3].val[1])); + + s8[0] = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[0]), + 
vreinterpretq_u8_u16(s16[1].val[0])); + s8[1] = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[1]), + vreinterpretq_u8_u16(s16[1].val[1])); + s8[2] = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[0]), + vreinterpretq_u8_u16(s16[3].val[0])); + s8[3] = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[1]), + vreinterpretq_u8_u16(s16[3].val[1])); + + i = 0; + for (k = 0; k < 8; k++) { + si = vandq_u8(s8[k >> 1].val[k & 1], mask1); + for (j = 0; j < 8; j++) { + p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si)); + } + i++; + si = vshrq_n_u8(s8[k >> 1].val[k & 1], 4); + for (j = 0; j < 8; j++) { + p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si)); + } + i++; + } + + s8[0] = vzipq_u8(p[0], p[1]); + s8[1] = vzipq_u8(p[2], p[3]); + s8[2] = vzipq_u8(p[4], p[5]); + s8[3] = vzipq_u8(p[6], p[7]); + + s16[0] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[0]), + vreinterpretq_u16_u8(s8[1].val[0])); + s16[1] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[0]), + vreinterpretq_u16_u8(s8[3].val[0])); + s16[2] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[1]), + vreinterpretq_u16_u8(s8[1].val[1])); + s16[3] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[1]), + vreinterpretq_u16_u8(s8[3].val[1])); + + s32[0] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[0]), + vreinterpretq_u32_u16(s16[1].val[0])); + s32[1] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[1]), + vreinterpretq_u32_u16(s16[1].val[1])); + s32[2] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[0]), + vreinterpretq_u32_u16(s16[3].val[0])); + s32[3] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[1]), + vreinterpretq_u32_u16(s16[3].val[1])); + + for (k = 0; k < 8; k ++) { + st[k] = vreinterpretq_u64_u32(s32[k >> 1].val[k & 1]); + } + + if (xor) { + for (i = 0; i < 8; i++) { + uint64x2_t t1 = vld1q_u64(dst); + vst1q_u64(dst, veorq_u64(st[i], t1)); + dst += 2; + } + } else { + for (i = 0; i < 8; i++) { + vst1q_u64(dst, st[i]); + dst += 2; + } + } + + } +} + +static +void +gf_w64_neon_split_4_lazy_multiply_region(gf_t *gf, void *src, void *dest, + uint64_t val, int bytes, int xor, + int altmap) +{ + gf_internal_t *h; + int i, j, k; + uint64_t pp, v, *s64, *d64, *top; + struct gf_split_4_64_lazy_data *ld; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128); + gf_do_initial_region_alignment(&rd); + + s64 = (uint64_t *) rd.s_start; + d64 = (uint64_t *) rd.d_start; + top = (uint64_t *) rd.d_top; + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + ld = (struct gf_split_4_64_lazy_data *) h->private; + + v = val; + for (i = 0; i < 16; i++) { + ld->tables[i][0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + ld->tables[i][k^j] = (v ^ ld->tables[i][k]); + } + v = (v & GF_FIRST_BIT) ? 
((v << 1) ^ pp) : (v << 1); + } + } + + if (altmap) { + if (xor) + neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 1); + else + neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 0); + } else { + if (xor) + neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 1); + else + neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 0); + } + + gf_do_final_region_alignment(&rd); +} + +static +void +gf_w64_split_4_64_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest, + uint64_t val, int bytes, int xor) +{ + gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0); +} + +static +void +gf_w64_split_4_64_lazy_altmap_multiply_region_neon(gf_t *gf, void *src, + void *dest, uint64_t val, + int bytes, int xor) +{ + gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1); +} + +void gf_w64_neon_split_init(gf_t *gf) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + if (h->region_type & GF_REGION_ALTMAP) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_altmap_multiply_region_neon) + else + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region_neon) + +} diff --git a/src/erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c new file mode 100644 index 000000000..0cce5ba7e --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c @@ -0,0 +1,302 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * Copyright (c) 2014: Janne Grunau + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * gf_w8_neon.c + * + * Neon optimized routines for 8-bit Galois fields + * + */ + +#include "gf_int.h" +#include "gf_w8.h" +#include +#include + +/* ARM NEON reducing macro for the carry free multiplication + * vmull_p8 is the carryless multiply operation. 
Here vshrn_n_u16 shifts + * the result to the right by 1 byte. This allows us to multiply + * the prim_poly by the leading bits of the result. We then xor the result + * of that operation back with the result. */ +#define NEON_CFM_REDUCE(v, w, result, prim_poly, initial) \ + do { \ + if (initial) \ + v = vshrn_n_u16 (vreinterpretq_u16_p16(result), 8); \ + else \ + v = veor_u8 (v, vshrn_n_u16 (vreinterpretq_u16_p16(result), 8)); \ + w = vmull_p8 (prim_poly, vreinterpret_p8_u8(v)); \ + result = vreinterpretq_p16_u16 (veorq_u16 (vreinterpretq_u16_p16(result), vreinterpretq_u16_p16(w))); \ + } while (0) + +static +inline +gf_val_32_t +gf_w8_neon_clm_multiply_x (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8, int x) +{ + gf_val_32_t rv = 0; + poly8x8_t a, b; + uint8x8_t v; + poly16x8_t result; + poly8x8_t prim_poly; + poly16x8_t w; + gf_internal_t * h = gf->scratch; + + a = vdup_n_p8 (a8); + b = vdup_n_p8 (b8); + + prim_poly = vdup_n_p8 ((uint32_t)(h->prim_poly & 0x1ffULL)); + + /* Do the initial multiply */ + result = vmull_p8 (a, b); + + /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only + have to do the reduction at most twice, because (w-2)/z == 2. Where + z is equal to the number of zeros after the leading 1 */ + NEON_CFM_REDUCE (v, w, result, prim_poly, 1); + NEON_CFM_REDUCE (v, w, result, prim_poly, 0); + if (x >= 3) { + NEON_CFM_REDUCE (v, w, result, prim_poly, 0); + } + if (x >= 4) { + NEON_CFM_REDUCE (v, w, result, prim_poly, 0); + } + /* Extracts 32 bit value from result. */ + rv = (gf_val_32_t)vget_lane_u8 (vmovn_u16 (vreinterpretq_u16_p16 (result)), 0); + + return rv; +} + +#define CLM_MULTIPLY(x) \ +static gf_val_32_t gf_w8_neon_clm_multiply_ ## x (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) \ +{\ + return gf_w8_neon_clm_multiply_x (gf, a8, b8, x);\ +} + +CLM_MULTIPLY(2) +CLM_MULTIPLY(3) +CLM_MULTIPLY(4) + +static inline void +neon_clm_multiply_region_from_single_x(gf_t *gf, uint8_t *s8, uint8_t *d8, + gf_val_32_t val, uint8_t *d_end, + int xor, int x) +{ + gf_internal_t * h = gf->scratch; + poly8x8_t a, b; + uint8x8_t c, v; + poly16x8_t result; + poly8x8_t prim_poly; + poly16x8_t w; + + a = vdup_n_p8 (val); + prim_poly = vdup_n_p8 ((uint8_t)(h->prim_poly & 0xffULL)); + + while (d8 < d_end) { + b = vld1_p8 ((poly8_t *) s8); + + if (xor) + c = vld1_u8 (d8); + + result = vmull_p8 (a, b); + + NEON_CFM_REDUCE(v, w, result, prim_poly, 1); + NEON_CFM_REDUCE (v, w, result, prim_poly, 0); + if (x >= 3) { + NEON_CFM_REDUCE (v, w, result, prim_poly, 0); + } + if (x >= 4) { + NEON_CFM_REDUCE (v, w, result, prim_poly, 0); + } + v = vmovn_u16 (vreinterpretq_u16_p16 (result)); + if (xor) + v = veor_u8 (c, v); + + vst1_u8 (d8, v); + + d8 += 8; + s8 += 8; + } +} + +#define CLM_MULT_REGION(x) \ +static void \ +gf_w8_neon_clm_multiply_region_from_single_ ## x (gf_t *gf, void *src, \ + void *dest, \ + gf_val_32_t val, int bytes, \ + int xor) \ +{ \ + gf_region_data rd; \ + uint8_t *s8; \ + uint8_t *d8; \ + \ + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } \ + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } \ + \ + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); \ + gf_do_initial_region_alignment(&rd); \ + s8 = (uint8_t *) rd.s_start; \ + d8 = (uint8_t *) rd.d_start; \ + \ + if (xor) \ + neon_clm_multiply_region_from_single_x (gf, s8, d8, val, rd.d_top, 1, x); \ + else \ + neon_clm_multiply_region_from_single_x (gf, s8, d8, val, rd.d_top, 0, x);\ + gf_do_final_region_alignment(&rd); \ +} + +CLM_MULT_REGION(2) +CLM_MULT_REGION(3) 
+CLM_MULT_REGION(4) + + +int gf_w8_neon_cfm_init(gf_t *gf) +{ + gf_internal_t *h; + + h = (gf_internal_t *) gf->scratch; + + if ((0xe0 & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w32,gf_w8_neon_clm_multiply_2) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_neon_clm_multiply_region_from_single_2) + }else if ((0xc0 & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w32,gf_w8_neon_clm_multiply_3) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_neon_clm_multiply_region_from_single_3) + }else if ((0x80 & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w32,gf_w8_neon_clm_multiply_4) + SET_FUNCTION(gf,multiply_region,w32,gf_w8_neon_clm_multiply_region_from_single_4) + }else{ + return 0; + } + return 1; +} + +#ifndef ARCH_AARCH64 +#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \ + vtbl2_u8(tbl, vget_high_u8(v))) +#endif + +static +void +gf_w8_split_multiply_region_neon(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor) +{ + uint8_t *bh, *bl, *sptr, *dptr; + uint8x16_t r, va, vh, vl, loset; +#ifdef ARCH_AARCH64 + uint8x16_t mth, mtl; +#else + uint8x8x2_t mth, mtl; +#endif + struct gf_w8_half_table_data *htd; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) (gf->scratch))->private; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + bh = (uint8_t *) htd->high; + bh += (val << 4); + bl = (uint8_t *) htd->low; + bl += (val << 4); + + sptr = rd.s_start; + dptr = rd.d_start; + +#ifdef ARCH_AARCH64 + mth = vld1q_u8 (bh); + mtl = vld1q_u8 (bl); +#else + mth.val[0] = vld1_u8 (bh); + mtl.val[0] = vld1_u8 (bl); + mth.val[1] = vld1_u8 (bh + 8); + mtl.val[1] = vld1_u8 (bl + 8); +#endif + + loset = vdupq_n_u8(0xf); + + if (xor) { + while (sptr < (uint8_t *) rd.s_top) { + va = vld1q_u8 (sptr); + + vh = vshrq_n_u8 (va, 4); + vl = vandq_u8 (va, loset); + va = vld1q_u8 (dptr); + + vh = vqtbl1q_u8 (mth, vh); + vl = vqtbl1q_u8 (mtl, vl); + + r = veorq_u8 (vh, vl); + + vst1q_u8 (dptr, veorq_u8 (va, r)); + + dptr += 16; + sptr += 16; + } + } else { + while (sptr < (uint8_t *) rd.s_top) { + va = vld1q_u8 (sptr); + + vh = vshrq_n_u8 (va, 4); + vl = vandq_u8 (va, loset); +#ifdef ARCH_AARCH64 + vh = vqtbl1q_u8 (mth, vh); + vl = vqtbl1q_u8 (mtl, vl); +#else + vh = vcombine_u8 (vtbl2_u8 (mth, vget_low_u8 (vh)), + vtbl2_u8 (mth, vget_high_u8 (vh))); + vl = vcombine_u8 (vtbl2_u8 (mtl, vget_low_u8 (vl)), + vtbl2_u8 (mtl, vget_high_u8 (vl))); +#endif + + r = veorq_u8 (vh, vl); + + vst1q_u8(dptr, r); + + dptr += 16; + sptr += 16; + } + } + + gf_do_final_region_alignment(&rd); +} + + +void gf_w8_neon_split_init(gf_t *gf) +{ + SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_neon) +} diff --git a/src/erasure-code/jerasure/gf-complete/test/Makefile.am b/src/erasure-code/jerasure/gf-complete/test/Makefile.am new file mode 100644 index 000000000..f590ecca0 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/test/Makefile.am @@ -0,0 +1,11 @@ +# GF-Complete 'test' AM file + +AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include +AM_CFLAGS = -O3 -fPIC + +bin_PROGRAMS = gf_unit + +gf_unit_SOURCES = gf_unit.c +#gf_unit_LDFLAGS = -lgf_complete +gf_unit_LDADD = ../src/libgf_complete.la + diff --git a/src/erasure-code/jerasure/gf-complete/test/gf_unit.c b/src/erasure-code/jerasure/gf-complete/test/gf_unit.c new file mode 100644 index 000000000..db26849db 
--- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/test/gf_unit.c @@ -0,0 +1,458 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_unit.c + * + * Performs unit testing for gf arithmetic + */ + +#include "config.h" + +#ifdef HAVE_POSIX_MEMALIGN +#ifndef _XOPEN_SOURCE +#define _XOPEN_SOURCE 600 +#endif +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include "gf_complete.h" +#include "gf_int.h" +#include "gf_method.h" +#include "gf_rand.h" +#include "gf_general.h" + +#define REGION_SIZE (16384) +#define RMASK (0x00000000ffffffffLL) +#define LMASK (0xffffffff00000000LL) + +void problem(char *s) +{ + fprintf(stderr, "Unit test failed.\n"); + fprintf(stderr, "%s\n", s); + exit(1); +} + +char *BM = "Bad Method: "; + +void usage(char *s) +{ + fprintf(stderr, "usage: gf_unit w tests seed [method] - does unit testing in GF(2^w)\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Legal w are: 1 - 32, 64 and 128\n"); + fprintf(stderr, " 128 is hex only (i.e. '128' will be an error - do '128h')\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Tests may be any combination of:\n"); + fprintf(stderr, " A: All\n"); + fprintf(stderr, " S: Single operations (multiplication/division)\n"); + fprintf(stderr, " R: Region operations\n"); + fprintf(stderr, " V: Verbose Output\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Use -1 for time(0) as a seed.\n"); + fprintf(stderr, "\n"); + if (s == BM) { + fprintf(stderr, "%s", BM); + gf_error(); + } else if (s != NULL) { + fprintf(stderr, "%s\n", s); + } + exit(1); +} + +void SigHandler(int v) +{ + fprintf(stderr, "Problem: SegFault!\n"); + fflush(stdout); + exit(2); +} + +int main(int argc, char **argv) +{ + signal(SIGSEGV, SigHandler); + + int w, i, verbose, single, region, top; + int s_start, d_start, bytes, xor, alignment_test; + gf_t gf, gf_def; + time_t t0; + gf_internal_t *h; + gf_general_t *a, *b, *c, *d; + uint8_t a8, b8, c8, *mult4 = NULL, *mult8 = NULL; + uint16_t a16, b16, c16, *log16 = NULL, *alog16 = NULL; + char as[50], bs[50], cs[50], ds[50]; + uint32_t mask = 0; + char *ra, *rb, *rc, *rd, *target; + int align; +#ifndef HAVE_POSIX_MEMALIGN + char *malloc_ra, *malloc_rb, *malloc_rc, *malloc_rd; +#endif + + + if (argc < 4) usage(NULL); + + if (sscanf(argv[1], "%d", &w) == 0){ + usage("Bad w\n"); + } + + if (sscanf(argv[3], "%ld", &t0) == 0) usage("Bad seed\n"); + if (t0 == -1) t0 = time(0); + MOA_Seed(t0); + + if (w > 32 && w != 64 && w != 128) usage("Bad w"); + + if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) { + usage(BM); + } + + printf("Args: "); + for (i = 1; i < argc; i++) { + printf ("%s ", argv[i]); + } + printf("/ size (bytes): %d\n", gf_size(&gf)); + + for (i = 0; i < strlen(argv[2]); i++) { + if (strchr("ASRV", argv[2][i]) == NULL) usage("Bad test\n"); + } + + h = (gf_internal_t *) gf.scratch; + a = (gf_general_t *) malloc(sizeof(gf_general_t)); + b = (gf_general_t *) malloc(sizeof(gf_general_t)); + c = (gf_general_t *) malloc(sizeof(gf_general_t)); + d = (gf_general_t *) malloc(sizeof(gf_general_t)); + +#if HAVE_POSIX_MEMALIGN + if (posix_memalign((void **) &ra, 16, sizeof(char)*REGION_SIZE)) + ra = NULL; + if (posix_memalign((void **) &rb, 16, sizeof(char)*REGION_SIZE)) + rb = NULL; + if (posix_memalign((void **) &rc, 16, sizeof(char)*REGION_SIZE)) + rc = NULL; + if (posix_memalign((void **) &rd, 16, 
sizeof(char)*REGION_SIZE)) + rd = NULL; +#else + //15 bytes extra to make sure it's 16byte aligned + malloc_ra = (char *) malloc(sizeof(char)*REGION_SIZE+15); + malloc_rb = (char *) malloc(sizeof(char)*REGION_SIZE+15); + malloc_rc = (char *) malloc(sizeof(char)*REGION_SIZE+15); + malloc_rd = (char *) malloc(sizeof(char)*REGION_SIZE+15); + ra = (uint8_t *) (((uintptr_t) malloc_ra + 15) & ~((uintptr_t) 0xf)); + rb = (uint8_t *) (((uintptr_t) malloc_rb + 15) & ~((uintptr_t) 0xf)); + rc = (uint8_t *) (((uintptr_t) malloc_rc + 15) & ~((uintptr_t) 0xf)); + rd = (uint8_t *) (((uintptr_t) malloc_rd + 15) & ~((uintptr_t) 0xf)); +#endif + + if (w <= 32) { + mask = 0; + for (i = 0; i < w; i++) mask |= (1 << i); + } + + verbose = (strchr(argv[2], 'V') != NULL); + single = (strchr(argv[2], 'S') != NULL || strchr(argv[2], 'A') != NULL); + region = (strchr(argv[2], 'R') != NULL || strchr(argv[2], 'A') != NULL); + + if (!gf_init_hard(&gf_def, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, + (h->mult_type != GF_MULT_COMPOSITE) ? h->prim_poly : 0, 0, 0, NULL, NULL)) + problem("No default for this value of w"); + + if (w == 4) { + mult4 = gf_w4_get_mult_table(&gf); + } else if (w == 8) { + mult8 = gf_w8_get_mult_table(&gf); + } else if (w == 16) { + log16 = gf_w16_get_log_table(&gf); + alog16 = gf_w16_get_mult_alog_table(&gf); + } + + if (verbose) printf("Seed: %ld\n", t0); + + if (single) { + + if (gf.multiply.w32 == NULL) problem("No multiplication operation defined."); + if (verbose) { printf("Testing single multiplications/divisions.\n"); fflush(stdout); } + if (w <= 10) { + top = (1 << w)*(1 << w); + } else { + top = 1024*1024; + } + for (i = 0; i < top; i++) { + if (w <= 10) { + a->w32 = i % (1 << w); + b->w32 = (i >> w); + + //Allen: the following conditions were being run 10 times each. That didn't seem like nearly enough to + //me for these special cases, so I converted to doing this mod stuff to easily make the number of times + //run both larger and proportional to the total size of the run. + } else { + switch (i % 32) + { + case 0: + gf_general_set_zero(a, w); + gf_general_set_random(b, w, 1); + break; + case 1: + gf_general_set_random(a, w, 1); + gf_general_set_zero(b, w); + break; + case 2: + gf_general_set_one(a, w); + gf_general_set_random(b, w, 1); + break; + case 3: + gf_general_set_random(a, w, 1); + gf_general_set_one(b, w); + break; + default: + gf_general_set_random(a, w, 1); + gf_general_set_random(b, w, 1); + } + } + + //Allen: the following special cases for w=64 are based on the code below for w=128. + //These w=64 cases are based on Dr. Plank's suggestion because some of the methods for w=64 + //involve splitting it in two. I think they're less likely to give errors than the 128-bit case + //though, because the 128 bit case is always split in two. 
+ //As with w=128, I'm arbitrarily deciding to do this sort of thing with a quarter of the cases + if (w == 64) { + switch (i % 32) + { + case 0: if (!gf_general_is_one(a, w)) a->w64 &= RMASK; break; + case 1: if (!gf_general_is_one(a, w)) a->w64 &= LMASK; break; + case 2: if (!gf_general_is_one(a, w)) a->w64 &= RMASK; if (!gf_general_is_one(b, w)) b->w64 &= RMASK; break; + case 3: if (!gf_general_is_one(a, w)) a->w64 &= RMASK; if (!gf_general_is_one(b, w)) b->w64 &= LMASK; break; + case 4: if (!gf_general_is_one(a, w)) a->w64 &= LMASK; if (!gf_general_is_one(b, w)) b->w64 &= RMASK; break; + case 5: if (!gf_general_is_one(a, w)) a->w64 &= LMASK; if (!gf_general_is_one(b, w)) b->w64 &= LMASK; break; + case 6: if (!gf_general_is_one(b, w)) b->w64 &= RMASK; break; + case 7: if (!gf_general_is_one(b, w)) b->w64 &= LMASK; break; + } + } + + //Allen: for w=128, we have important special cases where one half or the other of the number is all + //zeros. The probability of hitting such a number randomly is 1^-64, so if we don't force these cases + //we'll probably never hit them. This could be implemented more efficiently by changing the set-random + //function for w=128, but I think this is easier to follow. + //I'm arbitrarily deciding to do this sort of thing with a quarter of the cases + if (w == 128) { + switch (i % 32) + { + case 0: if (!gf_general_is_one(a, w)) a->w128[0] = 0; break; + case 1: if (!gf_general_is_one(a, w)) a->w128[1] = 0; break; + case 2: if (!gf_general_is_one(a, w)) a->w128[0] = 0; if (!gf_general_is_one(b, w)) b->w128[0] = 0; break; + case 3: if (!gf_general_is_one(a, w)) a->w128[0] = 0; if (!gf_general_is_one(b, w)) b->w128[1] = 0; break; + case 4: if (!gf_general_is_one(a, w)) a->w128[1] = 0; if (!gf_general_is_one(b, w)) b->w128[0] = 0; break; + case 5: if (!gf_general_is_one(a, w)) a->w128[1] = 0; if (!gf_general_is_one(b, w)) b->w128[1] = 0; break; + case 6: if (!gf_general_is_one(b, w)) b->w128[0] = 0; break; + case 7: if (!gf_general_is_one(b, w)) b->w128[1] = 0; break; + } + } + + gf_general_multiply(&gf, a, b, c); + + /* If w is 4, 8 or 16, then there are inline multiplication/division methods. + Test them here. */ + + if (w == 4 && mult4 != NULL) { + a8 = a->w32; + b8 = b->w32; + c8 = GF_W4_INLINE_MULTDIV(mult4, a8, b8); + if (c8 != c->w32) { + printf("Error in inline multiplication. %d * %d. Inline = %d. Default = %d.\n", + a8, b8, c8, c->w32); + exit(1); + } + } + + if (w == 8 && mult8 != NULL) { + a8 = a->w32; + b8 = b->w32; + c8 = GF_W8_INLINE_MULTDIV(mult8, a8, b8); + if (c8 != c->w32) { + printf("Error in inline multiplication. %d * %d. Inline = %d. Default = %d.\n", + a8, b8, c8, c->w32); + exit(1); + } + } + + if (w == 16 && log16 != NULL) { + a16 = a->w32; + b16 = b->w32; + c16 = GF_W16_INLINE_MULT(log16, alog16, a16, b16); + if (c16 != c->w32) { + printf("Error in inline multiplication. %d * %d. Inline = %d. 
Default = %d.\n", + a16, b16, c16, c->w32); + printf("%d %d\n", log16[a16], log16[b16]); + top = log16[a16] + log16[b16]; + printf("%d %d\n", top, alog16[top]); + exit(1); + } + } + + /* If this is not composite, then first test against the default: */ + + if (h->mult_type != GF_MULT_COMPOSITE) { + gf_general_multiply(&gf_def, a, b, d); + + if (!gf_general_are_equal(c, d, w)) { + gf_general_val_to_s(a, w, as, 1); + gf_general_val_to_s(b, w, bs, 1); + gf_general_val_to_s(c, w, cs, 1); + gf_general_val_to_s(d, w, ds, 1); + printf("Error in single multiplication (all numbers in hex):\n\n"); + printf(" gf.multiply(gf, %s, %s) = %s\n", as, bs, cs); + printf(" The default gf multiplier returned %s\n", ds); + exit(1); + } + } + + /* Now, we also need to double-check by other means, in case the default is wanky, + and when we're performing composite operations. Start with 0 and 1, where we know + what the result should be. */ + + if (gf_general_is_zero(a, w) || gf_general_is_zero(b, w) || + gf_general_is_one(a, w) || gf_general_is_one(b, w)) { + if (((gf_general_is_zero(a, w) || gf_general_is_zero(b, w)) && !gf_general_is_zero(c, w)) || + (gf_general_is_one(a, w) && !gf_general_are_equal(b, c, w)) || + (gf_general_is_one(b, w) && !gf_general_are_equal(a, c, w))) { + gf_general_val_to_s(a, w, as, 1); + gf_general_val_to_s(b, w, bs, 1); + gf_general_val_to_s(c, w, cs, 1); + printf("Error in single multiplication (all numbers in hex):\n\n"); + printf(" gf.multiply(gf, %s, %s) = %s, which is clearly wrong.\n", as, bs, cs); + exit(1); + } + } + + /* Dumb check to make sure that it's not returning numbers that are too big: */ + + if (w < 32 && (c->w32 & mask) != c->w32) { + gf_general_val_to_s(a, w, as, 1); + gf_general_val_to_s(b, w, bs, 1); + gf_general_val_to_s(c, w, cs, 1); + printf("Error in single multiplication (all numbers in hex):\n\n"); + printf(" gf.multiply.w32(gf, %s, %s) = %s, which is too big.\n", as, bs, cs); + exit(1); + } + + /* Finally, let's check to see that multiplication and division work together */ + + if (!gf_general_is_zero(a, w)) { + gf_general_divide(&gf, c, a, d); + if (!gf_general_are_equal(b, d, w)) { + gf_general_val_to_s(a, w, as, 1); + gf_general_val_to_s(b, w, bs, 1); + gf_general_val_to_s(c, w, cs, 1); + gf_general_val_to_s(d, w, ds, 1); + printf("Error in single multiplication/division (all numbers in hex):\n\n"); + printf(" gf.multiply(gf, %s, %s) = %s, but gf.divide(gf, %s, %s) = %s\n", as, bs, cs, cs, as, ds); + exit(1); + } + } + + } + } + + if (region) { + if (verbose) { printf("Testing region multiplications\n"); fflush(stdout); } + for (i = 0; i < 1024; i++) { + //Allen: changing to a switch thing as with the single ops to make things proportional + switch (i % 32) + { + case 0: + gf_general_set_zero(a, w); + break; + case 1: + gf_general_set_one(a, w); + break; + case 2: + gf_general_set_two(a, w); + break; + default: + gf_general_set_random(a, w, 1); + } + MOA_Fill_Random_Region(ra, REGION_SIZE); + MOA_Fill_Random_Region(rb, REGION_SIZE); + xor = (i/32)%2; + align = w/8; + if (align == 0) align = 1; + if (align > 16) align = 16; + + /* JSP - Cauchy test. When w < 32 & it doesn't equal 4, 8 or 16, the default is + equal to GF_REGION_CAUCHY, even if GF_REGION_CAUCHY is not set. We are testing + three alignments here: + + 1. Anything goes -- no alignment guaranteed. + 2. Perfect alignment. Here src and dest must be aligned wrt each other, + and bytes must be a multiple of 16*w. + 3. Imperfect alignment. 
Here we'll have src and dest be aligned wrt each + other, but bytes is simply a multiple of w. That means some XOR's will + be aligned, and some won't. + */ + + if ((h->region_type & GF_REGION_CAUCHY) || (w < 32 && w != 4 && w != 8 && w != 16)) { + alignment_test = (i%3); + + s_start = MOA_Random_W(5, 1); + if (alignment_test == 0) { + d_start = MOA_Random_W(5, 1); + } else { + d_start = s_start; + } + + bytes = (d_start > s_start) ? REGION_SIZE - d_start : REGION_SIZE - s_start; + bytes -= MOA_Random_W(5, 1); + if (alignment_test == 1) { + bytes -= (bytes % (w*16)); + } else { + bytes -= (bytes % w); + } + + target = rb; + + /* JSP - Otherwise, we're testing a non-cauchy test, and alignment + must be more strict. We have to make sure that the regions are + aligned wrt each other on 16-byte pointers. */ + + } else { + s_start = MOA_Random_W(5, 1) * align; + d_start = s_start; + bytes = REGION_SIZE - s_start - MOA_Random_W(5, 1); + bytes -= (bytes % align); + + if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) { + target = rb ; + } else { + target = (i/64)%2 ? rb : ra; + } + } + + memcpy(rc, ra, REGION_SIZE); + memcpy(rd, target, REGION_SIZE); + gf_general_do_region_multiply(&gf, a, ra+s_start, target+d_start, bytes, xor); + gf_general_do_region_check(&gf, a, rc+s_start, rd+d_start, target+d_start, bytes, xor); + } + } + + free(a); + free(b); + free(c); + free(d); +#ifdef HAVE_POSIX_MEMALIGN + free(ra); + free(rb); + free(rc); + free(rd); +#else + free(malloc_ra); + free(malloc_rb); + free(malloc_rc); + free(malloc_rd); +#endif + + return 0; +} diff --git a/src/erasure-code/jerasure/gf-complete/tools/Makefile.am b/src/erasure-code/jerasure/gf-complete/tools/Makefile.am new file mode 100644 index 000000000..4ca9131aa --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/tools/Makefile.am @@ -0,0 +1,56 @@ +# GF-Complete 'tools' AM file + +AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include +AM_CFLAGS = -O3 -fPIC + +bin_PROGRAMS = gf_mult gf_div gf_add gf_time gf_methods gf_poly gf_inline_time + +gf_mult_SOURCES = gf_mult.c +#gf_mult_LDFLAGS = -lgf_complete +gf_mult_LDADD = ../src/libgf_complete.la + +gf_div_SOURCES = gf_div.c +#gf_div_LDFLAGS = -lgf_complete +gf_div_LDADD = ../src/libgf_complete.la + +gf_add_SOURCES = gf_add.c +#gf_add_LDFLAGS = -lgf_complete +gf_add_LDADD = ../src/libgf_complete.la + +gf_time_SOURCES = gf_time.c +#gf_time_LDFLAGS = -lgf_complete +gf_time_LDADD = ../src/libgf_complete.la + +gf_methods_SOURCES = gf_methods.c +#gf_methods_LDFLAGS = -lgf_complete +gf_methods_LDADD = ../src/libgf_complete.la + +gf_poly_SOURCES = gf_poly.c +#gf_poly_LDFLAGS = -lgf_complete +gf_poly_LDADD = ../src/libgf_complete.la + +gf_inline_time_SOURCES = gf_inline_time.c +#gf_inline_time_LDFLAGS = -lgf_complete +gf_inline_time_LDADD = ../src/libgf_complete.la + +# gf_unit 8 A -1 -m LOG_ZERO_EXT is excluded until http://lab.jerasure.org/jerasure/gf-complete/issues/13 is resolved +if ENABLE_VALGRIND +VALGRIND = | perl -p -e 's|^|../libtool --mode=execute valgrind --quiet --error-exitcode=1 --tool=memcheck | if(!/gf_unit 8 A -1 -m LOG_ZERO_EXT/)' +endif + +# gf_unit tests as generated by gf_methods +gf_unit_w%.sh: gf_methods + ./$^ $(@:gf_unit_w%.sh=%) -A -U ${VALGRIND} > $@ || rm $@ + +TESTS = gf_unit_w128.sh \ + gf_unit_w64.sh \ + gf_unit_w32.sh \ + gf_unit_w16.sh \ + gf_unit_w8.sh \ + gf_unit_w4.sh + +TEST_EXTENSIONS = .sh +SH_LOG_COMPILER = $(SHELL) +AM_SH_LOG_FLAGS = -e + +CLEANFILES = $(TESTS) diff --git 
a/src/erasure-code/jerasure/gf-complete/tools/gf_add.c b/src/erasure-code/jerasure/gf-complete/tools/gf_add.c new file mode 100644 index 000000000..28cc12c1c --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/tools/gf_add.c @@ -0,0 +1,114 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_add.c + * + * Adds two numbers in gf_2^w + */ + +#include +#include +#include +#include +#include + +void usage(char *s) +{ + fprintf(stderr, "usage: gf_add a b w - does addition of a and b in GF(2^w)\n"); + fprintf(stderr, " If w has an h on the end, treat a, b and the sum as hexadecimal (no 0x required)\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " legal w are: 1-32, 64 and 128\n"); + fprintf(stderr, " 128 is hex only (i.e. '128' will be an error - do '128h')\n"); + + if (s != NULL) fprintf(stderr, "%s", s); + exit(1); +} + +int read_128(char *s, uint64_t *v) +{ + int l, t; + char save; + + l = strlen(s); + if (l > 32) return 0; + + if (l > 16) { + if (sscanf(s + (l-16), "%llx", (long long unsigned int *) &(v[1])) == 0) return 0; + save = s[l-16]; + s[l-16] = '\0'; + t = sscanf(s, "%llx", (long long unsigned int *) &(v[0])); + s[l-16] = save; + return t; + } else { + v[0] = 0; + return sscanf(s, "%llx", (long long unsigned int *)&(v[1])); + } + return 1; +} + +void print_128(uint64_t *v) +{ + if (v[0] > 0) { + printf("%llx", (long long unsigned int) v[0]); + printf("%016llx", (long long unsigned int) v[1]); + } else { + printf("%llx", (long long unsigned int) v[1]); + } + printf("\n"); +} + + +int main(int argc, char **argv) +{ + int hex, w; + uint32_t a, b, c, top; + uint64_t a64, b64, c64; + uint64_t a128[2], b128[2], c128[2]; + char *format; + + if (argc != 4) usage(NULL); + if (sscanf(argv[3], "%d", &w) == 0) usage("Bad w\n"); + + if (w <= 0 || (w > 32 && w != 64 && w != 128)) usage("Bad w"); + + hex = (strchr(argv[3], 'h') != NULL); + + if (!hex && w == 128) usage(NULL); + + if (w <= 32) { + format = (hex) ? "%x" : "%u"; + if (sscanf(argv[1], format, &a) == 0) usage("Bad a\n"); + if (sscanf(argv[2], format, &b) == 0) usage("Bad b\n"); + + if (w < 32) { + top = (w == 31) ? 0x80000000 : (1 << w); + if (w != 32 && a >= top) usage("a is too large\n"); + if (w != 32 && b >= top) usage("b is too large\n"); + } + + c = a ^ b; + printf(format, c); + printf("\n"); + + } else if (w == 64) { + format = (hex) ? "%llx" : "%llu"; + if (sscanf(argv[1], format, &a64) == 0) usage("Bad a\n"); + if (sscanf(argv[2], format, &b64) == 0) usage("Bad b\n"); + c64 = a64 ^ b64; + + printf(format, c64); + printf("\n"); + + } else if (w == 128) { + + if (read_128(argv[1], a128) == 0) usage("Bad a\n"); + if (read_128(argv[2], b128) == 0) usage("Bad b\n"); + c128[0] = a128[0] ^ b128[0]; + c128[1] = a128[1] ^ b128[1]; + + print_128(c128); + } + exit(0); +} diff --git a/src/erasure-code/jerasure/gf-complete/tools/gf_div.c b/src/erasure-code/jerasure/gf-complete/tools/gf_div.c new file mode 100644 index 000000000..9797f07da --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/tools/gf_div.c @@ -0,0 +1,68 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. 
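
Since addition in GF(2^w) is bitwise XOR for every w, gf_add.c above never needs to build a gf_t object. A worked example in GF(2^4): 7 + 12 = 0111 XOR 1100 = 1011, so ./gf_add 7 12 4 prints 11. In the w=128 path the operand travels as two uint64_t halves, and each half is XORed independently.
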
+ *
+ * gf_div.c
+ *
+ * Divides two numbers in gf_2^w
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "gf_complete.h"
+#include "gf_method.h"
+#include "gf_general.h"
+
+void usage(int why)
+{
+  fprintf(stderr, "usage: gf_div a b w [method] - does division of a and b in GF(2^w)\n");
+  if (why == 'W') {
+    fprintf(stderr, "Bad w.\n");
+    fprintf(stderr, "Legal w are: 1 - 32, 64 and 128.\n");
+    fprintf(stderr, "Append 'h' to w to treat a, b and the quotient as hexadecimal.\n");
+    fprintf(stderr, "w=128 is hex only (i.e. '128' will be an error - do '128h')\n");
+  }
+  if (why == 'A') fprintf(stderr, "Bad a\n");
+  if (why == 'B') fprintf(stderr, "Bad b\n");
+  if (why == 'M') {
+    fprintf(stderr, "Bad Method Specification: ");
+    gf_error();
+  }
+  exit(1);
+}
+
+int main(int argc, char **argv)
+{
+  int hex, w;
+  gf_t gf;
+  gf_general_t a, b, c;
+  char output[50];
+
+  if (argc < 4) usage(' ');
+
+  if (sscanf(argv[3], "%d", &w) == 0) usage('W');
+  if (w <= 0 || (w > 32 && w != 64 && w != 128)) usage('W');
+
+  hex = (strchr(argv[3], 'h') != NULL);
+  if (!hex && w == 128) usage('W');
+
+  if (argc == 4) {
+    if (gf_init_easy(&gf, w) == 0) usage('M');
+  } else {
+    if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage('M');
+  }
+
+  if (!gf_general_s_to_val(&a, w, argv[1], hex)) usage('A');
+  if (!gf_general_s_to_val(&b, w, argv[2], hex)) usage('B');
+
+  gf_general_divide(&gf, &a, &b, &c);
+  gf_general_val_to_s(&c, w, output, hex);
+
+  printf("%s\n", output);
+  exit(0);
+}
diff --git a/src/erasure-code/jerasure/gf-complete/tools/gf_inline_time.c b/src/erasure-code/jerasure/gf-complete/tools/gf_inline_time.c
new file mode 100644
index 000000000..f8119da65
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/tools/gf_inline_time.c
@@ -0,0 +1,170 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ * + * gf_inline_time.c + * + * Times inline single multiplication when w = 4, 8 or 16 + */ + +#include +#include +#include +#include +#include +#include + +#include "gf_complete.h" +#include "gf_rand.h" + +void +timer_start (double *t) +{ + struct timeval tv; + + gettimeofday (&tv, NULL); + *t = (double)tv.tv_sec + (double)tv.tv_usec * 1e-6; +} + +double +timer_split (const double *t) +{ + struct timeval tv; + double cur_t; + + gettimeofday (&tv, NULL); + cur_t = (double)tv.tv_sec + (double)tv.tv_usec * 1e-6; + return (cur_t - *t); +} + +void problem(char *s) +{ + fprintf(stderr, "Timing test failed.\n"); + fprintf(stderr, "%s\n", s); + exit(1); +} + +void usage(char *s) +{ + fprintf(stderr, "usage: gf_inline_time w seed #elts iterations - does timing of single multiplies\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Legal w are: 4, 8 or 16\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Use -1 for time(0) as a seed.\n"); + fprintf(stderr, "\n"); + if (s != NULL) fprintf(stderr, "%s\n", s); + exit(1); +} + +int main(int argc, char **argv) +{ + int w, j, i, size, iterations; + gf_t gf; + double timer, elapsed, dnum, num; + uint8_t *ra = NULL, *rb = NULL, *mult4, *mult8; + uint16_t *ra16 = NULL, *rb16 = NULL, *log16, *alog16; + time_t t0; + + if (argc != 5) usage(NULL); + if (sscanf(argv[1], "%d", &w) == 0) usage("Bad w\n"); + if (w != 4 && w != 8 && w != 16) usage("Bad w\n"); + if (sscanf(argv[2], "%ld", &t0) == 0) usage("Bad seed\n"); + if (sscanf(argv[3], "%d", &size) == 0) usage("Bad #elts\n"); + if (sscanf(argv[4], "%d", &iterations) == 0) usage("Bad iterations\n"); + if (t0 == -1) t0 = time(0); + MOA_Seed(t0); + + num = size; + + gf_init_easy(&gf, w); + + printf("Seed: %ld\n", t0); + + if (w == 4 || w == 8) { + ra = (uint8_t *) malloc(size); + rb = (uint8_t *) malloc(size); + + if (ra == NULL || rb == NULL) { perror("malloc"); exit(1); } + } else if (w == 16) { + ra16 = (uint16_t *) malloc(size*2); + rb16 = (uint16_t *) malloc(size*2); + + if (ra16 == NULL || rb16 == NULL) { perror("malloc"); exit(1); } + } + + if (w == 4) { + mult4 = gf_w4_get_mult_table(&gf); + if (mult4 == NULL) { + printf("Couldn't get inline multiplication table.\n"); + exit(1); + } + elapsed = 0; + dnum = 0; + for (i = 0; i < iterations; i++) { + for (j = 0; j < size; j++) { + ra[j] = MOA_Random_W(w, 1); + rb[j] = MOA_Random_W(w, 1); + } + timer_start(&timer); + for (j = 0; j < size; j++) { + ra[j] = GF_W4_INLINE_MULTDIV(mult4, ra[j], rb[j]); + } + dnum += num; + elapsed += timer_split(&timer); + } + printf("Inline mult: %10.6lf s Mops: %10.3lf %10.3lf Mega-ops/s\n", + elapsed, dnum/1024.0/1024.0, dnum/1024.0/1024.0/elapsed); + + } else if (w == 8) { + mult8 = gf_w8_get_mult_table(&gf); + if (mult8 == NULL) { + printf("Couldn't get inline multiplication table.\n"); + exit(1); + } + elapsed = 0; + dnum = 0; + for (i = 0; i < iterations; i++) { + for (j = 0; j < size; j++) { + ra[j] = MOA_Random_W(w, 1); + rb[j] = MOA_Random_W(w, 1); + } + timer_start(&timer); + for (j = 0; j < size; j++) { + ra[j] = GF_W8_INLINE_MULTDIV(mult8, ra[j], rb[j]); + } + dnum += num; + elapsed += timer_split(&timer); + } + printf("Inline mult: %10.6lf s Mops: %10.3lf %10.3lf Mega-ops/s\n", + elapsed, dnum/1024.0/1024.0, dnum/1024.0/1024.0/elapsed); + } else if (w == 16) { + log16 = gf_w16_get_log_table(&gf); + alog16 = gf_w16_get_mult_alog_table(&gf); + if (log16 == NULL) { + printf("Couldn't get inline multiplication table.\n"); + exit(1); + } + elapsed = 0; + dnum = 0; + for (i = 0; i < iterations; i++) { + for (j = 0; j < size; 
j++) { + ra16[j] = MOA_Random_W(w, 1); + rb16[j] = MOA_Random_W(w, 1); + } + timer_start(&timer); + for (j = 0; j < size; j++) { + ra16[j] = GF_W16_INLINE_MULT(log16, alog16, ra16[j], rb16[j]); + } + dnum += num; + elapsed += timer_split(&timer); + } + printf("Inline mult: %10.6lf s Mops: %10.3lf %10.3lf Mega-ops/s\n", + elapsed, dnum/1024.0/1024.0, dnum/1024.0/1024.0/elapsed); + } + free (ra); + free (rb); + free (ra16); + free (rb16); + return 0; +} diff --git a/src/erasure-code/jerasure/gf-complete/tools/gf_methods.c b/src/erasure-code/jerasure/gf-complete/tools/gf_methods.c new file mode 100644 index 000000000..b016c33c9 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/tools/gf_methods.c @@ -0,0 +1,246 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_methods.c + * + * Lists supported methods (incomplete w.r.t. GROUP and COMPOSITE) + */ + +#include +#include +#include +#include + +#include "gf_complete.h" +#include "gf_method.h" +#include "gf_int.h" + +#define BNMULTS (8) +static char *BMULTS[BNMULTS] = { "CARRY_FREE", "GROUP48", + "TABLE", "LOG", "SPLIT4", "SPLIT8", "SPLIT88", "COMPOSITE" }; +#define NMULTS (17) +static char *MULTS[NMULTS] = { "SHIFT", "CARRY_FREE", "CARRY_FREE_GK", "GROUP44", "GROUP48", "BYTWO_p", "BYTWO_b", + "TABLE", "LOG", "LOG_ZERO", "LOG_ZERO_EXT", "SPLIT2", + "SPLIT4", "SPLIT8", "SPLIT16", "SPLIT88", "COMPOSITE" }; + +/* Make sure CAUCHY is last */ + +#define NREGIONS (7) +static char *REGIONS[NREGIONS] = { "DOUBLE", "QUAD", "LAZY", "SIMD", "NOSIMD", + "ALTMAP", "CAUCHY" }; + +#define BNREGIONS (4) +static char *BREGIONS[BNREGIONS] = { "DOUBLE", "QUAD", "ALTMAP", "CAUCHY" }; + +#define NDIVS (2) +static char *divides[NDIVS] = { "MATRIX", "EUCLID" }; + +void usage(char *s) +{ + fprintf(stderr, "usage: gf_methods w -BADC -LXUMDRB\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " w can be 1-32, 64, 128\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -B lists basic methods that are useful\n"); + fprintf(stderr, " -A does a nearly exhaustive listing\n"); + fprintf(stderr, " -D adds EUCLID and MATRIX division\n"); + fprintf(stderr, " -C adds CAUCHY when possible\n"); + fprintf(stderr, " Combinations are fine.\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -L Simply lists methods\n"); + fprintf(stderr, " -X List methods and functions selected (compile with DEBUG_FUNCTIONS)\n"); + fprintf(stderr, " -U Produces calls to gf_unit\n"); + fprintf(stderr, " -M Produces calls to time_tool.sh for single multiplications\n"); + fprintf(stderr, " -D Produces calls to time_tool.sh for single divisions\n"); + fprintf(stderr, " -R Produces calls to time_tool.sh for region multiplications\n"); + fprintf(stderr, " -B Produces calls to time_tool.sh for the fastest region multiplications\n"); + fprintf(stderr, " Cannot combine L, U, T.\n"); + if (s != NULL) { + fprintf(stderr, "\n"); + fprintf(stderr, "%s\n", s); + } + exit(1); +} + +void print_methods(gf_t *gf) +{ +#ifdef DEBUG_FUNCTIONS + gf_internal_t *h = (gf_internal_t*) gf->scratch; + + printf("multiply = %s\n", h->multiply); + printf("divide = %s\n", h->divide); + printf("inverse = %s\n", h->inverse); + printf("multiply_region = %s\n", h->multiply_region); + printf("extract_word = %s\n", h->extract_word); +#endif +} + +int main(int argc, char *argv[]) +{ + int m, r, d, w, i, sa, j, k, reset, ok; + int nregions; + int nmults; + char 
**regions; + char **mults; + int exhaustive = 0; + int divide = 0; + int cauchy = 0; + int listing; + char *gf_argv[50], *x; + gf_t gf; + char ls[10]; + char * w_str; + + if (argc != 4) usage(NULL); + w = atoi(argv[1]); + ok = (w >= 1 && w <= 32); + if (w == 64) ok = 1; + if (w == 128) ok = 1; + if (!ok) usage("Bad w"); + + if (argv[2][0] != '-' || argv[3][0] != '-' || strlen(argv[2]) == 1 || strlen(argv[3]) != 2) { + usage(NULL); + } + for (i = 1; argv[2][i] != '\0'; i++) { + switch(argv[2][i]) { + case 'B': exhaustive = 0; break; + case 'A': exhaustive = 1; break; + case 'D': divide = 1; break; + case 'C': cauchy = 1; break; + default: usage("Bad -BADC"); + } + } + + if (strchr("LXUMDRB", argv[3][1]) == NULL) { usage("Bad -LXUMDRB"); } + listing = argv[3][1]; + + if (listing == 'U') { + w_str = "../test/gf_unit %d A -1"; + } else if (listing == 'L' || listing == 'X') { + w_str = "w=%d:"; + } else { + w_str = strdup("sh time_tool.sh X %d"); + x = strchr(w_str, 'X'); + *x = listing; + } + + gf_argv[0] = "-"; + if (create_gf_from_argv(&gf, w, 1, gf_argv, 0) > 0) { + printf(w_str, w); + printf(" - \n"); + gf_free(&gf, 1); + } else if (_gf_errno == GF_E_DEFAULT) { + fprintf(stderr, "Unlabeled failed method: w=%d: -\n", 2); + exit(1); + } + + nregions = (exhaustive) ? NREGIONS : BNREGIONS; + if (!cauchy) nregions--; + regions = (exhaustive) ? REGIONS : BREGIONS; + mults = (exhaustive) ? MULTS : BMULTS; + nmults = (exhaustive) ? NMULTS : BNMULTS; + + + for (m = 0; m < nmults; m++) { + sa = 0; + gf_argv[sa++] = "-m"; + if (strcmp(mults[m], "GROUP44") == 0) { + gf_argv[sa++] = "GROUP"; + gf_argv[sa++] = "4"; + gf_argv[sa++] = "4"; + } else if (strcmp(mults[m], "GROUP48") == 0) { + gf_argv[sa++] = "GROUP"; + gf_argv[sa++] = "4"; + gf_argv[sa++] = "8"; + } else if (strcmp(mults[m], "SPLIT2") == 0) { + gf_argv[sa++] = "SPLIT"; + sprintf(ls, "%d", w); + gf_argv[sa++] = ls; + gf_argv[sa++] = "2"; + } else if (strcmp(mults[m], "SPLIT4") == 0) { + gf_argv[sa++] = "SPLIT"; + sprintf(ls, "%d", w); + gf_argv[sa++] = ls; + gf_argv[sa++] = "4"; + } else if (strcmp(mults[m], "SPLIT8") == 0) { + gf_argv[sa++] = "SPLIT"; + sprintf(ls, "%d", w); + gf_argv[sa++] = ls; + gf_argv[sa++] = "8"; + } else if (strcmp(mults[m], "SPLIT16") == 0) { + gf_argv[sa++] = "SPLIT"; + sprintf(ls, "%d", w); + gf_argv[sa++] = ls; + gf_argv[sa++] = "16"; + } else if (strcmp(mults[m], "SPLIT88") == 0) { + gf_argv[sa++] = "SPLIT"; + gf_argv[sa++] = "8"; + gf_argv[sa++] = "8"; + } else if (strcmp(mults[m], "COMPOSITE") == 0) { + gf_argv[sa++] = "COMPOSITE"; + gf_argv[sa++] = "2"; + gf_argv[sa++] = "-"; + } else { + gf_argv[sa++] = mults[m]; + } + reset = sa; + + + for (r = 0; r < (1 << nregions); r++) { + sa = reset; + for (k = 0; k < nregions; k++) { + if (r & (1 << k)) { + gf_argv[sa++] = "-r"; + gf_argv[sa++] = regions[k]; + } + } + gf_argv[sa++] = "-"; + + /* printf("Hmmmm. 
%s", gf_argv[0]); + for (j = 0; j < sa; j++) printf(" %s", gf_argv[j]); + printf("\n"); */ + + if (create_gf_from_argv(&gf, w, sa, gf_argv, 0) > 0) { + printf(w_str, w); + for (j = 0; j < sa; j++) printf(" %s", gf_argv[j]); + printf("\n"); + if (listing == 'X') + print_methods(&gf); + gf_free(&gf, 1); + } else if (_gf_errno == GF_E_DEFAULT) { + fprintf(stderr, "Unlabeled failed method: w=%d:", w); + for (j = 0; j < sa; j++) fprintf(stderr, " %s", gf_argv[j]); + fprintf(stderr, "\n"); + exit(1); + } + sa--; + if (divide) { + for (d = 0; d < NDIVS; d++) { + gf_argv[sa++] = "-d"; + gf_argv[sa++] = divides[d]; + /* printf("w=%d:", w); + for (j = 0; j < sa; j++) printf(" %s", gf_argv[j]); + printf("\n"); */ + gf_argv[sa++] = "-"; + if (create_gf_from_argv(&gf, w, sa, gf_argv, 0) > 0) { + printf(w_str, w); + for (j = 0; j < sa; j++) printf(" %s", gf_argv[j]); + printf("\n"); + if (listing == 'X') + print_methods(&gf); + gf_free(&gf, 1); + } else if (_gf_errno == GF_E_DEFAULT) { + fprintf(stderr, "Unlabeled failed method: w=%d:", w); + for (j = 0; j < sa; j++) fprintf(stderr, " %s", gf_argv[j]); + fprintf(stderr, "\n"); + exit(1); + } + sa-=3; + } + } + } + } + return 0; +} diff --git a/src/erasure-code/jerasure/gf-complete/tools/gf_mult.c b/src/erasure-code/jerasure/gf-complete/tools/gf_mult.c new file mode 100644 index 000000000..815bd8b26 --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/tools/gf_mult.c @@ -0,0 +1,68 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_mult.c + * + * Multiplies two numbers in gf_2^w + */ + +#include +#include +#include +#include +#include + +#include "gf_complete.h" +#include "gf_method.h" +#include "gf_general.h" + +void usage(int why) +{ + fprintf(stderr, "usage: gf_mult a b w [method] - does multiplication of a and b in GF(2^w)\n"); + if (why == 'W') { + fprintf(stderr, "Bad w.\n"); + fprintf(stderr, "Legal w are: 1 - 32, 64 and 128.\n"); + fprintf(stderr, "Append 'h' to w to treat a, b and the product as hexadecimal.\n"); + fprintf(stderr, "w=128 is hex only (i.e. '128' will be an error - do '128h')\n"); + } + if (why == 'A') fprintf(stderr, "Bad a\n"); + if (why == 'B') fprintf(stderr, "Bad b\n"); + if (why == 'M') { + fprintf(stderr, "Bad Method Specification: "); + gf_error(); + } + exit(1); +} + +int main(int argc, char **argv) +{ + int hex, w; + gf_t gf; + gf_general_t a, b, c; + char output[50]; + + if (argc < 4) usage(' '); + + if (sscanf(argv[3], "%d", &w) == 0) usage('W'); + if (w <= 0 || (w > 32 && w != 64 && w != 128)) usage('W'); + + hex = (strchr(argv[3], 'h') != NULL); + if (!hex && w == 128) usage('W'); + + if (argc == 4) { + if (gf_init_easy(&gf, w) == 0) usage('M'); + } else { + if (create_gf_from_argv(&gf, w, argc, argv, 4) == 0) usage('M'); + } + + if (!gf_general_s_to_val(&a, w, argv[1], hex)) usage('A'); + if (!gf_general_s_to_val(&b, w, argv[2], hex)) usage('B'); + + gf_general_multiply(&gf, &a, &b, &c); + gf_general_val_to_s(&c, w, output, hex); + + printf("%s\n", output); + exit(0); +} diff --git a/src/erasure-code/jerasure/gf-complete/tools/gf_poly.c b/src/erasure-code/jerasure/gf-complete/tools/gf_poly.c new file mode 100644 index 000000000..b3faf254d --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/tools/gf_poly.c @@ -0,0 +1,275 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. 
Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_poly.c - program to help find irreducible polynomials in composite fields, + * using the Ben-Or algorithm. + * + * (This one was written by Jim) + * + * Please see the following paper for a description of the Ben-Or algorithm: + * + * author S. Gao and D. Panario + * title Tests and Constructions of Irreducible Polynomials over Finite Fields + * booktitle Foundations of Computational Mathematics + * year 1997 + * publisher Springer Verlag + * pages 346-361 + * + * The basic technique is this. You have a polynomial f(x) whose coefficients are + * in a base field GF(2^w). The polynomial is of degree n. You need to do the + * following for all i from 1 to n/2: + * + * Construct x^(2^w)^i modulo f. That will be a polynomial of maximum degree n-1 + * with coefficients in GF(2^w). You construct that polynomial by starting with x + * and doubling it w times, each time taking the result modulo f. Then you + * multiply that by itself i times, again each time taking the result modulo f. + * + * When you're done, you need to "subtract" x -- since addition = subtraction = + * XOR, that means XOR x. + * + * Now, find the GCD of that last polynomial and f, using Euclid's algorithm. If + * the GCD is not one, then f is reducible. If it is not reducible for each of + * those i, then it is irreducible. + * + * In this code, I am using a gf_general_t to represent elements of GF(2^w). This + * is so that I can use base fields that are GF(2^64) or GF(2^128). + * + * I have two main procedures. The first is x_to_q_to_i_minus_x, which calculates + * x^(2^w)^i - x, putting the result into a gf_general_t * called retval. + * + * The second is gcd_one, which takes a polynomial of degree n and a second one + * of degree n-1, and uses Euclid's algorithm to decide if their GCD == 1. + * + * These can be made faster (e.g. calculate x^(2^w) once and store it). + */ + +#include "gf_complete.h" +#include "gf_method.h" +#include "gf_general.h" +#include "gf_int.h" +#include +#include +#include +#include + +char *BM = "Bad Method: "; + +void usage(char *s) +{ + fprintf(stderr, "usage: gf_poly w(base-field) method power:coef [ power:coef .. 
]\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " use - for the default method.\n"); + fprintf(stderr, " use 0x in front of the coefficient if it's in hex\n"); + fprintf(stderr, " \n"); + fprintf(stderr, " For example, to test whether x^2 + 2x + 1 is irreducible\n"); + fprintf(stderr, " in GF(2^16), the call is:\n"); + fprintf(stderr, " \n"); + fprintf(stderr, " gf_poly 16 - 2:1 1:2 0:1\n"); + fprintf(stderr, " \n"); + fprintf(stderr, " See the user's manual for more information.\n"); + if (s != NULL) { + fprintf(stderr, "\n"); + if (s == BM) { + fprintf(stderr, "%s", s); + gf_error(); + } else { + fprintf(stderr, "%s\n", s); + } + } + exit(1); +} + +int gcd_one(gf_t *gf, int w, int n, gf_general_t *poly, gf_general_t *prod) +{ + gf_general_t *a, *b, zero, factor, p; + int i, j, da, db; + + gf_general_set_zero(&zero, w); + + a = (gf_general_t *) malloc(sizeof(gf_general_t) * n+1); + b = (gf_general_t *) malloc(sizeof(gf_general_t) * n); + for (i = 0; i <= n; i++) gf_general_add(gf, &zero, poly+i, a+i); + for (i = 0; i < n; i++) gf_general_add(gf, &zero, prod+i, b+i); + + da = n; + while (1) { + for (db = n-1; db >= 0 && gf_general_is_zero(b+db, w); db--) ; + if (db < 0) return 0; + if (db == 0) return 1; + for (j = da; j >= db; j--) { + if (!gf_general_is_zero(a+j, w)) { + gf_general_divide(gf, a+j, b+db, &factor); + for (i = 0; i <= db; i++) { + gf_general_multiply(gf, b+i, &factor, &p); + gf_general_add(gf, &p, a+(i+j-db), a+(i+j-db)); + } + } + } + for (i = 0; i < n; i++) { + gf_general_add(gf, a+i, &zero, &p); + gf_general_add(gf, b+i, &zero, a+i); + gf_general_add(gf, &p, &zero, b+i); + } + } + +} + +void x_to_q_to_i_minus_x(gf_t *gf, int w, int n, gf_general_t *poly, int logq, int i, gf_general_t *retval) +{ + gf_general_t x; + gf_general_t *x_to_q; + gf_general_t *product; + gf_general_t p, zero, factor; + int j, k, lq; + + gf_general_set_zero(&zero, w); + product = (gf_general_t *) malloc(sizeof(gf_general_t) * n*2); + x_to_q = (gf_general_t *) malloc(sizeof(gf_general_t) * n); + for (j = 0; j < n; j++) gf_general_set_zero(x_to_q+j, w); + gf_general_set_one(x_to_q+1, w); + + for (lq = 0; lq < logq; lq++) { + for (j = 0; j < n*2; j++) gf_general_set_zero(product+j, w); + for (j = 0; j < n; j++) { + for (k = 0; k < n; k++) { + gf_general_multiply(gf, x_to_q+j, x_to_q+k, &p); + gf_general_add(gf, product+(j+k), &p, product+(j+k)); + } + } + for (j = n*2-1; j >= n; j--) { + if (!gf_general_is_zero(product+j, w)) { + gf_general_add(gf, product+j, &zero, &factor); + for (k = 0; k <= n; k++) { + gf_general_multiply(gf, poly+k, &factor, &p); + gf_general_add(gf, product+(j-n+k), &p, product+(j-n+k)); + } + } + } + for (j = 0; j < n; j++) gf_general_add(gf, product+j, &zero, x_to_q+j); + } + for (j = 0; j < n; j++) gf_general_set_zero(retval+j, w); + gf_general_set_one(retval, w); + + while (i > 0) { + for (j = 0; j < n*2; j++) gf_general_set_zero(product+j, w); + for (j = 0; j < n; j++) { + for (k = 0; k < n; k++) { + gf_general_multiply(gf, x_to_q+j, retval+k, &p); + gf_general_add(gf, product+(j+k), &p, product+(j+k)); + } + } + for (j = n*2-1; j >= n; j--) { + if (!gf_general_is_zero(product+j, w)) { + gf_general_add(gf, product+j, &zero, &factor); + for (k = 0; k <= n; k++) { + gf_general_multiply(gf, poly+k, &factor, &p); + gf_general_add(gf, product+(j-n+k), &p, product+(j-n+k)); + } + } + } + for (j = 0; j < n; j++) gf_general_add(gf, product+j, &zero, retval+j); + i--; + } + + gf_general_set_one(&x, w); + gf_general_add(gf, &x, retval+1, retval+1); + + free(product); + 
+  free(x_to_q);
+}
+
+int main(int argc, char **argv)
+{
+  int w, i, power, n, ap, success;
+  gf_t gf;
+  gf_general_t *poly, *prod;
+  char *string, *ptr;
+  char buf[100];
+
+  if (argc < 4) usage(NULL);
+
+  if (sscanf(argv[1], "%d", &w) != 1 || w <= 0) usage("Bad w.");
+  ap = create_gf_from_argv(&gf, w, argc, argv, 2);
+
+  if (ap == 0) usage(BM);
+
+  if (ap == argc) usage("No powers/coefficients given.");
+
+  n = -1;
+  for (i = ap; i < argc; i++) {
+    if (strchr(argv[i], ':') == NULL || sscanf(argv[i], "%d:", &power) != 1) {
+      string = (char *) malloc(sizeof(char)*(strlen(argv[i])+100));
+      sprintf(string, "Argument '%s' not in proper format of power:coefficient\n", argv[i]);
+      usage(string);
+    }
+    if (power < 0) {
+      usage("Can't have negative powers\n");
+    } else if (power > n) {
+      n = power;
+    }
+  }
+  // n is the largest power seen; the loop above must have set it at least once
+  assert (n >= 0);
+
+  poly = (gf_general_t *) malloc(sizeof(gf_general_t)*(n+1));
+  for (i = 0; i <= n; i++) gf_general_set_zero(poly+i, w);
+  prod = (gf_general_t *) malloc(sizeof(gf_general_t)*n);
+
+  for (i = ap; i < argc; i++) {
+    sscanf(argv[i], "%d:", &power);
+    ptr = strchr(argv[i], ':');
+    ptr++;
+    if (strncmp(ptr, "0x", 2) == 0) {
+      success = gf_general_s_to_val(poly+power, w, ptr+2, 1);
+    } else {
+      success = gf_general_s_to_val(poly+power, w, ptr, 0);
+    }
+    if (success == 0) {
+      string = (char *) malloc(sizeof(char)*(strlen(argv[i])+100));
+      sprintf(string, "Argument '%s' not in proper format of power:coefficient\n", argv[i]);
+      usage(string);
+    }
+  }
+
+  printf("Poly:");
+  for (power = n; power >= 0; power--) {
+    if (!gf_general_is_zero(poly+power, w)) {
+      printf("%s", (power == n) ? " " : " + ");
+      if (!gf_general_is_one(poly+power, w)) {
+        gf_general_val_to_s(poly+power, w, buf, 1);
+        if (n > 0) {
+          printf("(0x%s)", buf);
+        } else {
+          printf("0x%s", buf);
+        }
+      }
+      if (power == 0) {
+        if (gf_general_is_one(poly+power, w)) printf("1");
+      } else if (power == 1) {
+        printf("x");
+      } else {
+        printf("x^%d", power);
+      }
+    }
+  }
+  printf("\n");
+
+  if (!gf_general_is_one(poly+n, w)) {
+    printf("\n");
+    printf("Can't do Ben-Or, because the polynomial is not monic.\n");
+    exit(0);
+  }
+
+  for (i = 1; i <= n/2; i++) {
+    x_to_q_to_i_minus_x(&gf, w, n, poly, w, i, prod);
+    if (!gcd_one(&gf, w, n, poly, prod)) {
+      printf("Reducible.\n");
+      exit(0);
+    }
+  }
+
+  printf("Irreducible.\n");
+  exit(0);
+}
diff --git a/src/erasure-code/jerasure/gf-complete/tools/gf_time.c b/src/erasure-code/jerasure/gf-complete/tools/gf_time.c
new file mode 100644
index 000000000..7402ab5c2
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/tools/gf_time.c
@@ -0,0 +1,232 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
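
Restating the Ben-Or criterion that gf_poly's header comment describes, with q = 2^w for the base field: a monic polynomial f(x) of degree n over GF(q) is irreducible if and only if gcd(f(x), x^(q^i) - x) = 1 for every i = 1, ..., floor(n/2). The tool therefore prints "Reducible." as soon as one of those gcds differs from 1, and "Irreducible." only after all floor(n/2) of them pass. (This is the test from the Gao and Panario paper cited in the file header.)
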
+ * + * gf_time.c + * + * Performs timing for gf arithmetic + */ + +#include "config.h" + +#ifdef HAVE_POSIX_MEMALIGN +#ifndef _XOPEN_SOURCE +#define _XOPEN_SOURCE 600 +#endif +#endif + +#include +#include +#include +#include +#include +#include + +#include "gf_complete.h" +#include "gf_method.h" +#include "gf_rand.h" +#include "gf_general.h" + +void +timer_start (double *t) +{ + struct timeval tv; + + gettimeofday (&tv, NULL); + *t = (double)tv.tv_sec + (double)tv.tv_usec * 1e-6; +} + +double +timer_split (const double *t) +{ + struct timeval tv; + double cur_t; + + gettimeofday (&tv, NULL); + cur_t = (double)tv.tv_sec + (double)tv.tv_usec * 1e-6; + return (cur_t - *t); +} + +void problem(char *s) +{ + fprintf(stderr, "Timing test failed.\n"); + fprintf(stderr, "%s\n", s); + exit(1); +} + +char *BM = "Bad Method: "; + +void usage(char *s) +{ + fprintf(stderr, "usage: gf_time w tests seed size(bytes) iterations [method [params]] - does timing\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "does unit testing in GF(2^w)\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Legal w are: 1 - 32, 64 and 128\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Tests may be any combination of:\n"); + fprintf(stderr, " A: All\n"); + fprintf(stderr, " S: All Single Operations\n"); + fprintf(stderr, " R: All Region Operations\n"); + fprintf(stderr, " M: Single: Multiplications\n"); + fprintf(stderr, " D: Single: Divisions\n"); + fprintf(stderr, " I: Single: Inverses\n"); + fprintf(stderr, " G: Region: Buffer-Constant Multiplication\n"); + fprintf(stderr, " 0: Region: Doing nothing, and bzero()\n"); + fprintf(stderr, " 1: Region: Memcpy() and XOR\n"); + fprintf(stderr, " 2: Region: Multiplying by two\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Use -1 for time(0) as a seed.\n"); + fprintf(stderr, "\n"); + if (s == BM) { + fprintf(stderr, "%s", BM); + gf_error(); + } else if (s != NULL) { + fprintf(stderr, "%s\n", s); + } + exit(1); +} + +int main(int argc, char **argv) +{ + int w, it, i, size, iterations, xor; + char tests[100]; + char test; + char *single_tests = "MDI"; + char *region_tests = "G012"; + char *tstrings[256]; + void *tmethods[256]; + gf_t gf; + double timer, elapsed, ds, di, dnum; + int num; + time_t t0; + uint8_t *ra, *rb; + gf_general_t a; +#ifndef HAVE_POSIX_MEMALIGN + uint8_t *malloc_ra, *malloc_rb; +#endif + + + if (argc < 6) usage(NULL); + + if (sscanf(argv[1], "%d", &w) == 0){ + usage("Bad w[-pp]\n"); + } + + + if (sscanf(argv[3], "%ld", &t0) == 0) usage("Bad seed\n"); + if (sscanf(argv[4], "%d", &size) == 0) usage("Bad size\n"); + if (sscanf(argv[5], "%d", &iterations) == 0) usage("Bad iterations\n"); + if (t0 == -1) t0 = time(0); + MOA_Seed(t0); + + ds = size; + di = iterations; + + if ((w > 32 && w != 64 && w != 128) || w < 0) usage("Bad w"); + if ((size * 8) % w != 0) usage ("Bad size -- must be a multiple of w*8\n"); + + if (!create_gf_from_argv(&gf, w, argc, argv, 6)) usage(BM); + + strcpy(tests, ""); + for (i = 0; argv[2][i] != '\0'; i++) { + switch(argv[2][i]) { + case 'A': strcat(tests, single_tests); + strcat(tests, region_tests); + break; + case 'S': strcat(tests, single_tests); break; + case 'R': strcat(tests, region_tests); break; + case 'G': strcat(tests, "G"); break; + case '0': strcat(tests, "0"); break; + case '1': strcat(tests, "1"); break; + case '2': strcat(tests, "2"); break; + case 'M': strcat(tests, "M"); break; + case 'D': strcat(tests, "D"); break; + case 'I': strcat(tests, "I"); break; + default: usage("Bad tests"); + } + } + + tstrings['M'] = "Multiply"; + 
tstrings['D'] = "Divide"; + tstrings['I'] = "Inverse"; + tstrings['G'] = "Region-Random"; + tstrings['0'] = "Region-By-Zero"; + tstrings['1'] = "Region-By-One"; + tstrings['2'] = "Region-By-Two"; + + tmethods['M'] = (void *) gf.multiply.w32; + tmethods['D'] = (void *) gf.divide.w32; + tmethods['I'] = (void *) gf.inverse.w32; + tmethods['G'] = (void *) gf.multiply_region.w32; + tmethods['0'] = (void *) gf.multiply_region.w32; + tmethods['1'] = (void *) gf.multiply_region.w32; + tmethods['2'] = (void *) gf.multiply_region.w32; + + printf("Seed: %ld\n", t0); + +#ifdef HAVE_POSIX_MEMALIGN + if (posix_memalign((void **) &ra, 16, size)) + ra = NULL; + if (posix_memalign((void **) &rb, 16, size)) + rb = NULL; +#else + malloc_ra = (uint8_t *) malloc(size + 15); + malloc_rb = (uint8_t *) malloc(size + 15); + ra = (uint8_t *) (((uintptr_t) malloc_ra + 15) & ~((uintptr_t) 0xf)); + rb = (uint8_t *) (((uintptr_t) malloc_rb + 15) & ~((uintptr_t) 0xf)); +#endif + + if (ra == NULL || rb == NULL) { perror("malloc"); exit(1); } + + for (i = 0; i < 3; i++) { + test = single_tests[i]; + if (strchr(tests, test) != NULL) { + if (tmethods[(int)test] == NULL) { + printf("No %s method.\n", tstrings[(int)test]); + } else { + elapsed = 0; + dnum = 0; + for (it = 0; it < iterations; it++) { + gf_general_set_up_single_timing_test(w, ra, rb, size); + timer_start(&timer); + num = gf_general_do_single_timing_test(&gf, ra, rb, size, test); + dnum += num; + elapsed += timer_split(&timer); + } + printf("%14s: %10.6lf s Mops: %10.3lf %10.3lf Mega-ops/s\n", + tstrings[(int)test], elapsed, + dnum/1024.0/1024.0, dnum/1024.0/1024.0/elapsed); + } + } + } + + for (i = 0; i < 4; i++) { + test = region_tests[i]; + if (strchr(tests, test) != NULL) { + if (tmethods[(int)test] == NULL) { + printf("No %s method.\n", tstrings[(int)test]); + } else { + if (test == '0') gf_general_set_zero(&a, w); + if (test == '1') gf_general_set_one(&a, w); + if (test == '2') gf_general_set_two(&a, w); + + for (xor = 0; xor < 2; xor++) { + elapsed = 0; + for (it = 0; it < iterations; it++) { + if (test == 'G') gf_general_set_random(&a, w, 1); + gf_general_set_up_single_timing_test(8, ra, rb, size); + timer_start(&timer); + gf_general_do_region_multiply(&gf, &a, ra, rb, size, xor); + elapsed += timer_split(&timer); + } + printf("%14s: XOR: %d %10.6lf s MB: %10.3lf %10.3lf MB/s\n", + tstrings[(int)test], xor, elapsed, + ds*di/1024.0/1024.0, ds*di/1024.0/1024.0/elapsed); + } + } + } + } + return 0; +} diff --git a/src/erasure-code/jerasure/gf-complete/tools/test_simd.sh b/src/erasure-code/jerasure/gf-complete/tools/test_simd.sh new file mode 100755 index 000000000..e514e4f6f --- /dev/null +++ b/src/erasure-code/jerasure/gf-complete/tools/test_simd.sh @@ -0,0 +1,367 @@ +#!/bin/bash -e + +# this scripts has a number of tests for SIMD. It can be invoked +# on the host or on a QEMU machine. 
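
The test_$1 dispatch at the bottom of this script means its first argument names one of the functions defined here, so the supported invocations are ./test_simd.sh unit, ./test_simd.sh functions, ./test_simd.sh detection, ./test_simd.sh compile, and ./test_simd.sh runtime, run from a checkout where ./configure and make work; each mode rebuilds the library and writes its report to tools/test_simd.results.
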
+ +script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +host_cpu=`uname -p` +results=${script_dir}/test_simd.results +nprocs=$(grep -c ^processor /proc/cpuinfo) + +# runs unit tests and save the results +test_unit(){ + { ./configure && make clean && make; } || { echo "Compile FAILED" >> ${results}; return 1; } + make -j$nprocs check || { echo "gf_methods $i FAILED" >> ${results}; ((++failed)); } + cat tools/test-suite.log >> ${results} || true +} + +# build with DEBUG_FUNCTIONS and save all methods selected +# to a results file +test_functions() { + failed=0 + + { ./configure --enable-debug-func && make clean && make; } || { echo "Compile FAILED" >> ${results}; return 1; } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${results}; } || { echo "gf_methods $i FAILED" >> ${results}; ((++failed)); } + done + + return ${failed} +} + +# build with DEBUG_CPU_FUNCTIONS and print out CPU detection +test_detection() { + failed=0 + + { ./configure --enable-debug-cpu && make clean && make; } || { echo "Compile FAILED" >> ${results}; return 1; } + { ${script_dir}/gf_methods 32 -ACD -L | grep '#' >> ${results}; } || { echo "gf_methods $i FAILED" >> ${results}; ((++failed)); } + + return ${failed} +} + +compile_arm() { + failed=0 + + echo -n "Compiling with NO SIMD support..." >> ${results} + { ./configure --disable-neon && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); } + + echo -n "Compiling with FULL SIMD support..." >> ${results} + { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); } + + return ${failed} +} + +compile_intel() { + failed=0 + + echo -n "Compiling with NO SIMD support..." >> ${results} + { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); } + + echo -n "Compiling with SSE2 only..." >> ${results} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=no + export ax_cv_have_ssse3_ext=no + export ax_cv_have_sse41_ext=no + export ax_cv_have_sse42_ext=no + export ax_cv_have_pclmuldq_ext=no + { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); } + + echo -n "Compiling with SSE2,SSE3 only..." >> ${results} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=yes + export ax_cv_have_ssse3_ext=no + export ax_cv_have_sse41_ext=no + export ax_cv_have_sse42_ext=no + export ax_cv_have_pclmuldq_ext=no + { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); } + + echo -n "Compiling with SSE2,SSE3,SSSE3 only..." >> ${results} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=yes + export ax_cv_have_ssse3_ext=yes + export ax_cv_have_sse41_ext=no + export ax_cv_have_sse42_ext=no + export ax_cv_have_pclmuldq_ext=no + { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); } + + echo -n "Compiling with SSE2,SSE3,SSSE3,SSE4_1 only..." 
>> ${results} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=yes + export ax_cv_have_ssse3_ext=yes + export ax_cv_have_sse41_ext=yes + export ax_cv_have_sse42_ext=no + export ax_cv_have_pclmuldq_ext=no + { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); } + + echo -n "Compiling with SSE2,SSE3,SSSE3,SSE4_2 only..." >> ${results} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=yes + export ax_cv_have_ssse3_ext=yes + export ax_cv_have_sse41_ext=no + export ax_cv_have_sse42_ext=yes + export ax_cv_have_pclmuldq_ext=no + { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); } + + echo -n "Compiling with FULL SIMD support..." >> ${results} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=yes + export ax_cv_have_ssse3_ext=yes + export ax_cv_have_sse41_ext=yes + export ax_cv_have_sse42_ext=yes + export ax_cv_have_pclmuldq_ext=yes + { ./configure && make clean && make && echo "SUCCESS" >> ${results}; } || { echo "FAIL" >> ${results}; ((++failed)); } + + return ${failed} +} + +# test that we can compile the source code with different +# SIMD options. We assume that we are running on processor +# full SIMD support +test_compile() { + case $host_cpu in + aarch64*|arm*) compile_arm ;; + i[[3456]]86*|x86_64*|amd64*) compile_intel ;; + esac +} + +# disable through build flags +runtime_arm_flags() { + failed=0 + + echo "====NO SIMD support..." >> ${1} + { ./configure --disable-neon --enable-debug-func && make clean && make; } || { echo "Compile FAILED" >> ${1}; return 1; } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====FULL SIMD support..." >> ${1} + { ./configure --enable-debug-func && make clean && make; } || { echo "Compile FAILED" >> ${1}; return 1; } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + return ${failed} +} + +# build once with FULL SIMD and disable at runtime through environment +runtime_arm_env() { + failed=0 + + { ./configure --enable-debug-func && make clean && make; } || { echo "Compile FAILED" >> ${1}; return 1; } + + echo "====NO SIMD support..." >> ${1} + export GF_COMPLETE_DISABLE_NEON=1 + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====FULL SIMD support..." >> ${1} + unset GF_COMPLETE_DISABLE_NEON + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + return ${failed} +} + +runtime_intel_flags() { + failed=0 + + echo "====NO SIMD support..." >> ${1} + { ./configure --disable-sse --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====SSE2 support..." 
>> ${1} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=no + export ax_cv_have_ssse3_ext=no + export ax_cv_have_sse41_ext=no + export ax_cv_have_sse42_ext=no + export ax_cv_have_pclmuldq_ext=no + { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====SSE2,SSE3 support..." >> ${1} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=yes + export ax_cv_have_ssse3_ext=no + export ax_cv_have_sse41_ext=no + export ax_cv_have_sse42_ext=no + export ax_cv_have_pclmuldq_ext=no + { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====SSE2,SSE3,SSSE3 support..." >> ${1} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=yes + export ax_cv_have_ssse3_ext=yes + export ax_cv_have_sse41_ext=no + export ax_cv_have_sse42_ext=no + export ax_cv_have_pclmuldq_ext=no + { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====SSE2,SSE3,SSSE3,SSE4_1 support..." >> ${1} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=yes + export ax_cv_have_ssse3_ext=yes + export ax_cv_have_sse41_ext=yes + export ax_cv_have_sse42_ext=no + export ax_cv_have_pclmuldq_ext=no + { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====SSE2,SSE3,SSSE3,SSE4_2 support..." >> ${1} + export ax_cv_have_sse_ext=no + export ax_cv_have_sse2_ext=yes + export ax_cv_have_sse3_ext=yes + export ax_cv_have_ssse3_ext=yes + export ax_cv_have_sse41_ext=no + export ax_cv_have_sse42_ext=yes + export ax_cv_have_pclmuldq_ext=no + { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====FULL SIMD support..." >> ${1} + { ./configure --enable-debug-func && make clean && make; } || { echo "FAIL" >> ${1}; ((++failed)); } + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + return ${failed} +} + +runtime_intel_env() { + failed=0 + + # compile a build with full SIMD support + { ./configure --enable-debug-func && make clean && make; } || { echo "Compile FAILED" >> ${1}; return 1; } + + echo "====NO SIMD support..." >> ${1} + export GF_COMPLETE_DISABLE_SSE2=1 + export GF_COMPLETE_DISABLE_SSE3=1 + export GF_COMPLETE_DISABLE_SSSE3=1 + export GF_COMPLETE_DISABLE_SSE4=1 + export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1 + for i in 128 64 32 16 8 4; do + { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); } + done + + echo "====SSE2 support..." 
>> ${1}
+    unset GF_COMPLETE_DISABLE_SSE2
+    export GF_COMPLETE_DISABLE_SSE3=1
+    export GF_COMPLETE_DISABLE_SSSE3=1
+    export GF_COMPLETE_DISABLE_SSE4=1
+    export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    echo "====SSE2,SSE3 support..." >> ${1}
+    unset GF_COMPLETE_DISABLE_SSE2
+    unset GF_COMPLETE_DISABLE_SSE3
+    export GF_COMPLETE_DISABLE_SSSE3=1
+    export GF_COMPLETE_DISABLE_SSE4=1
+    export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    echo "====SSE2,SSE3,SSSE3 support..." >> ${1}
+    unset GF_COMPLETE_DISABLE_SSE2
+    unset GF_COMPLETE_DISABLE_SSE3
+    unset GF_COMPLETE_DISABLE_SSSE3
+    export GF_COMPLETE_DISABLE_SSE4=1
+    export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    echo "====SSE2,SSE3,SSSE3,SSE4_1 support..." >> ${1}
+    unset GF_COMPLETE_DISABLE_SSE2
+    unset GF_COMPLETE_DISABLE_SSE3
+    unset GF_COMPLETE_DISABLE_SSSE3
+    unset GF_COMPLETE_DISABLE_SSE4
+    export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    echo "====SSE2,SSE3,SSSE3,SSE4_2 support..." >> ${1}
+    unset GF_COMPLETE_DISABLE_SSE2
+    unset GF_COMPLETE_DISABLE_SSE3
+    unset GF_COMPLETE_DISABLE_SSSE3
+    unset GF_COMPLETE_DISABLE_SSE4
+    export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    echo "====FULL SIMD support..." >> ${1}
+    unset GF_COMPLETE_DISABLE_SSE2
+    unset GF_COMPLETE_DISABLE_SSE3
+    unset GF_COMPLETE_DISABLE_SSSE3
+    unset GF_COMPLETE_DISABLE_SSE4
+    unset GF_COMPLETE_DISABLE_SSE4_PCLMUL
+    for i in 128 64 32 16 8 4; do
+        { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+    done
+
+    return ${failed}
+}
+
+test_runtime() {
+    rm -f ${results}.left
+    rm -f ${results}.right
+
+    case $host_cpu in
+        aarch64*|arm*)
+            runtime_arm_flags ${results}.left
+            runtime_arm_env ${results}.right
+            ;;
+        i[[3456]]86*|x86_64*|amd64*)
+            runtime_intel_flags ${results}.left
+            runtime_intel_env ${results}.right
+            ;;
+    esac
+
+    echo "======LEFT======" > ${results}
+    cat ${results}.left >> ${results}
+    echo "======RIGHT======" >> ${results}
+    cat ${results}.right >> ${results}
+    echo "======RESULT======" >> ${results}
+    if diff "${results}.left" "${results}.right"; then
+        echo SUCCESS >> ${results}
+        return 0
+    else
+        echo FAILURE >> ${results}
+        return 1
+    fi
+}
+
+cd ${script_dir}/..
+rm -f ${results}
+
+test_$1
+exit $?
diff --git a/src/erasure-code/jerasure/gf-complete/tools/test_simd_qemu.sh b/src/erasure-code/jerasure/gf-complete/tools/test_simd_qemu.sh
new file mode 100755
index 000000000..5771874f7
--- /dev/null
+++ b/src/erasure-code/jerasure/gf-complete/tools/test_simd_qemu.sh
@@ -0,0 +1,258 @@
+#!/bin/bash -e
+
+# This script uses QEMU to test gf-complete, especially its SIMD support,
+# on different architectures and CPUs. It boots an Ubuntu cloud image in a
+# QEMU machine and runs all of the testing inside that
+# QEMU machine.
+ +# The following packages are required: +# qemu-system-aarch64 +# qemu-system-arm +# qemu-system-x86_64 +# genisoimage + + +script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +qemu_dir="${script_dir}/.qemu" +ssh_port=2222 +ssh_pubkey_file="${qemu_dir}/qemu.pub" +ssh_key_file="${qemu_dir}/qemu" + +mkdir -p "${qemu_dir}" + +cleanup() { + if [[ -n "$(jobs -p)" ]]; then + echo killing qemu processes "$(jobs -p)" + kill $(jobs -p) + fi +} + +trap cleanup EXIT + +start_qemu() { + arch=$1 + cpu=$2 + + image_version="xenial" + image_url_base="http://cloud-images.ubuntu.com/${image_version}/current" + + case $arch in + i[[3456]]86*|x86_64*|amd64*) + image_kernel="${image_version}-server-cloudimg-amd64-vmlinuz-generic" + image_initrd="${image_version}-server-cloudimg-amd64-initrd-generic" + image_disk="${image_version}-server-cloudimg-amd64-disk1.img" + ;; + aarch64*) + image_kernel="${image_version}-server-cloudimg-arm64-vmlinuz-generic" + image_initrd="${image_version}-server-cloudimg-arm64-initrd-generic" + image_disk="${image_version}-server-cloudimg-arm64-disk1.img" + ;; + arm*) + image_kernel="${image_version}-server-cloudimg-armhf-vmlinuz-lpae" + image_initrd="${image_version}-server-cloudimg-armhf-initrd-generic-lpae" + image_disk="${image_version}-server-cloudimg-armhf-disk1.img" + ;; + *) die "Unsupported arch" ;; + esac + + [[ -f ${qemu_dir}/${image_kernel} ]] || wget -O ${qemu_dir}/${image_kernel} ${image_url_base}/unpacked/${image_kernel} + [[ -f ${qemu_dir}/${image_initrd} ]] || wget -O ${qemu_dir}/${image_initrd} ${image_url_base}/unpacked/${image_initrd} + [[ -f ${qemu_dir}/${image_disk} ]] || wget -O ${qemu_dir}/${image_disk} ${image_url_base}/${image_disk} + + #create a delta disk to keep the original image clean + delta_disk="${qemu_dir}/disk.img" + rm -f ${delta_disk} + qemu-img create -q -f qcow2 -b "${qemu_dir}/${image_disk}" ${delta_disk} + + # generate an ssh keys + [[ -f ${ssh_pubkey_file} ]] || ssh-keygen -q -N "" -f ${ssh_key_file} + + # create a config disk to set the SSH keys + cat > "${qemu_dir}/meta-data" < "${qemu_dir}/user-data" <&2 + exit 1 +fi + +op=$1 +w=$2 + +shift ; shift + +method="$*" + +if [ $op != M -a $op != D -a $op != R -a $op != B ]; then + echo 'usage sh time_tool.sh M|D|R|B w method' >&2 + echo 'You have to specify a test: ' >&2 + echo ' M=Multiplication' >&2 + echo ' D=Division' >&2 + echo ' R=Regions' >&2 + echo ' B=Best-Region' >&2 + exit 1 +fi + +# First, use a 16K buffer to test the performance of single multiplies. 
+ +fac=`echo $w | awk '{ n = $1; while (n != 0 && n%2==0) n /= 2; print n }'` +if [ $fac -eq 0 ]; then + echo 'usage sh time_tool.sh M|D|R|B w method' >&2 + echo 'Bad w' >&2 + exit 1 +fi + +bsize=16384 +bsize=`echo $bsize $fac | awk '{ print $1 * $2 }'` + +if [ `./gf_time $w M -1 $bsize 1 $method 2>&1 | wc | awk '{ print $1 }'` -gt 2 ]; then + echo 'usage sh time_tool.sh w method' >&2 + echo "Bad method" + exit 1 +fi + +if [ $op = M -o $op = D ]; then + iter=1 + c1=`./gf_time $w $op -1 $bsize $iter $method` + t=`echo $c1 | awk '{ printf "%d\n", $4*100 }'` + s=`echo $c1 | awk '{ print $8 }'` + bs=$s + + while [ $t -lt 1 ]; do + bs=$s + iter=`echo $iter | awk '{ print $1*2 }'` + c1=`./gf_time $w $op -1 $bsize $iter $method` + t=`echo $c1 | awk '{ printf "%d\n", $4*100 }'` + s=`echo $c1 | awk '{ print $8 }'` + done + + echo $op $bs | awk '{ printf "%s speed (MB/s): %8.2lf W-Method: ", $1, $2 }' + echo $w $method + exit 0 +fi + +bsize=16384 +bsize=`echo $bsize $fac | awk '{ print $1 * $2 }'` + +best=0 +while [ $bsize -le 4194304 ]; do + iter=1 + c1=`./gf_time $w G -1 $bsize $iter $method` + t=`echo $c1 | awk '{ printf "%d\n", $6*500 }'` + s=`echo $c1 | awk '{ print $10 }'` + bs=$s + + while [ $t -lt 1 ]; do + bs=$s + iter=`echo $iter | awk '{ print $1*2 }'` + c1=`./gf_time $w G -1 $bsize $iter $method` + t=`echo $c1 | awk '{ printf "%d\n", $6*500 }'` + s=`echo $c1 | awk '{ print $10 }'` + done + if [ $bsize -lt 1048576 ]; then + str=`echo $bsize | awk '{ printf "%3dK\n", $1/1024 }'` + else + str=`echo $bsize | awk '{ printf "%3dM\n", $1/1024/1024 }'` + fi + if [ $op = R ]; then + echo $str $bs | awk '{ printf "Region Buffer-Size: %4s (MB/s): %8.2lf W-Method: ", $1, $2 }' + echo $w $method + fi + best=`echo $best $bs | awk '{ print ($1 > $2) ? $1 : $2 }'` + bsize=`echo $bsize | awk '{ print $1 * 2 }'` +done +echo $best | awk '{ printf "Region Best (MB/s): %8.2lf W-Method: ", $1 }' +echo $w $method -- cgit v1.2.3
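
A closing note on the buffer sizing in time_tool.sh above: the awk one-liner strips factors of two from w, leaving its odd part in fac, and the 16K base buffer is scaled by that odd part, which keeps gf_time's requirement that size*8 be an exact multiple of w. For example, w=12 gives fac=3 and bsize=49152, and 49152*8 = 393216 = 12 * 32768; for any power-of-two w, fac=1 and the buffer stays at 16384 bytes.
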