summaryrefslogtreecommitdiffstats
path: root/src/spdk/intel-ipsec-mb/include/kasumi_internal.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/spdk/intel-ipsec-mb/include/kasumi_internal.h')
-rwxr-xr-xsrc/spdk/intel-ipsec-mb/include/kasumi_internal.h1853
1 files changed, 1853 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/include/kasumi_internal.h b/src/spdk/intel-ipsec-mb/include/kasumi_internal.h
new file mode 100755
index 000000000..87b114d88
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/kasumi_internal.h
@@ -0,0 +1,1853 @@
+/*******************************************************************************
+ Copyright (c) 2009-2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+
+/*---------------------------------------------------------
+* Kasumi_internal.h
+*---------------------------------------------------------*/
+
+#ifndef _KASUMI_INTERNAL_H_
+#define _KASUMI_INTERNAL_H_
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "intel-ipsec-mb.h"
+#include "wireless_common.h"
+#include "include/clear_regs_mem.h"
+#include "include/constant_lookup.h"
+
+/*---------------------------------------------------------------------
+* Kasumi Inner S-Boxes
+*---------------------------------------------------------------------*/
+
+/* Table version based on a small table, no cache trash */
+/*
+ * Pre-expanded S7 substitution table, consumed by the FIp1() macro below
+ * via LOOKUP16_SSE(). The 128-entry table is stored twice (256 entries in
+ * total) so it can be indexed directly with an 8-bit value without masking
+ * off the top bit.
+ * NOTE(review): each 16-bit entry packs the S7 output together with extra
+ * bits in a layout matched to FIp1()'s XOR folding - confirm against the
+ * KASUMI specification (3GPP TS 35.202) before modifying any entry.
+ */
+static const uint16_t sso_kasumi_S7e[] = {
+ 0x6c00, 0x6601, 0x7802, 0x7603, 0x2404, 0x4e05, 0xb006, 0xce07,
+ 0x5c08, 0x1e09, 0x6a0a, 0xac0b, 0x1c0c, 0x3e0d, 0xea0e, 0x5c0f,
+ 0x4e10, 0xc011, 0x6a12, 0xc213, 0x0214, 0xac15, 0xae16, 0x3617,
+ 0x6e18, 0xa019, 0x681a, 0x001b, 0x0a1c, 0xe41d, 0xc41e, 0x9c1f,
+ 0x2a20, 0x5021, 0xb622, 0xd823, 0x2024, 0x3225, 0x3826, 0x2e27,
+ 0x9a28, 0xac29, 0x042a, 0xa62b, 0x882c, 0xd62d, 0xd22e, 0x082f,
+ 0x4830, 0x9631, 0xf432, 0x1c33, 0x4634, 0xb035, 0x7636, 0xa637,
+ 0xea38, 0x7039, 0x543a, 0x783b, 0xdc3c, 0x6e3d, 0xae3e, 0xba3f,
+ 0x6a40, 0x6a41, 0x1c42, 0x9043, 0x3a44, 0x5e45, 0x8c46, 0x7447,
+ 0x7c48, 0x5449, 0x384a, 0x1c4b, 0xa44c, 0xe84d, 0x604e, 0x304f,
+ 0x4050, 0xc451, 0x8652, 0xac53, 0x1654, 0xb655, 0x1856, 0x0657,
+ 0x0658, 0xa259, 0xf25a, 0x785b, 0xf85c, 0x785d, 0x845e, 0x3a5f,
+ 0x0c60, 0xfc61, 0xf062, 0x9c63, 0x5e64, 0xc265, 0x6666, 0x7667,
+ 0x9a68, 0x4669, 0x746a, 0xb46b, 0x506c, 0xe06d, 0x3a6e, 0x866f,
+ 0x6070, 0x3471, 0x3c72, 0xd673, 0x3474, 0x4c75, 0xa476, 0x7277,
+ 0xa478, 0xd479, 0xea7a, 0xa47b, 0x487c, 0x147d, 0x8a7e, 0xf87f,
+ /* second copy of the same 128 entries (see note above) */
+ 0x6c00, 0x6601, 0x7802, 0x7603, 0x2404, 0x4e05, 0xb006, 0xce07,
+ 0x5c08, 0x1e09, 0x6a0a, 0xac0b, 0x1c0c, 0x3e0d, 0xea0e, 0x5c0f,
+ 0x4e10, 0xc011, 0x6a12, 0xc213, 0x0214, 0xac15, 0xae16, 0x3617,
+ 0x6e18, 0xa019, 0x681a, 0x001b, 0x0a1c, 0xe41d, 0xc41e, 0x9c1f,
+ 0x2a20, 0x5021, 0xb622, 0xd823, 0x2024, 0x3225, 0x3826, 0x2e27,
+ 0x9a28, 0xac29, 0x042a, 0xa62b, 0x882c, 0xd62d, 0xd22e, 0x082f,
+ 0x4830, 0x9631, 0xf432, 0x1c33, 0x4634, 0xb035, 0x7636, 0xa637,
+ 0xea38, 0x7039, 0x543a, 0x783b, 0xdc3c, 0x6e3d, 0xae3e, 0xba3f,
+ 0x6a40, 0x6a41, 0x1c42, 0x9043, 0x3a44, 0x5e45, 0x8c46, 0x7447,
+ 0x7c48, 0x5449, 0x384a, 0x1c4b, 0xa44c, 0xe84d, 0x604e, 0x304f,
+ 0x4050, 0xc451, 0x8652, 0xac53, 0x1654, 0xb655, 0x1856, 0x0657,
+ 0x0658, 0xa259, 0xf25a, 0x785b, 0xf85c, 0x785d, 0x845e, 0x3a5f,
+ 0x0c60, 0xfc61, 0xf062, 0x9c63, 0x5e64, 0xc265, 0x6666, 0x7667,
+ 0x9a68, 0x4669, 0x746a, 0xb46b, 0x506c, 0xe06d, 0x3a6e, 0x866f,
+ 0x6070, 0x3471, 0x3c72, 0xd673, 0x3474, 0x4c75, 0xa476, 0x7277,
+ 0xa478, 0xd479, 0xea7a, 0xa47b, 0x487c, 0x147d, 0x8a7e, 0xf87f
+};
+
+/*
+ * Pre-expanded S9 substitution table (512 entries), consumed by the FIp1()
+ * macro below via LOOKUP16_SSE(); indexed with a 9-bit value
+ * ((data) >> 7 or (data) & 0x1FF).
+ * NOTE(review): entry encoding is matched to FIp1()'s XOR folding with the
+ * S7 table - confirm against the KASUMI specification before modifying.
+ */
+static const uint16_t sso_kasumi_S9e[] = {
+ 0x4ea7, 0xdeef, 0x42a1, 0xf77b, 0x0f87, 0x9d4e, 0x1209, 0xa552,
+ 0x4c26, 0xc4e2, 0x6030, 0xcd66, 0x89c4, 0x0381, 0xb45a, 0x1b8d,
+ 0x6eb7, 0xfafd, 0x2693, 0x974b, 0x3f9f, 0xa954, 0x6633, 0xd56a,
+ 0x6532, 0xe9f4, 0x0d06, 0xa452, 0xb0d8, 0x3e9f, 0xc964, 0x62b1,
+ 0x5eaf, 0xe2f1, 0xd3e9, 0x4a25, 0x9cce, 0x2211, 0x0000, 0x9b4d,
+ 0x582c, 0xfcfe, 0xf57a, 0x743a, 0x1e8f, 0xb8dc, 0xa251, 0x2190,
+ 0xbe5f, 0x0603, 0x773b, 0xeaf5, 0x6c36, 0xd6eb, 0xb4da, 0x2b95,
+ 0xb1d8, 0x1108, 0x58ac, 0xddee, 0xe773, 0x4522, 0x1f8f, 0x984c,
+ 0x4aa5, 0x8ac5, 0x178b, 0xf279, 0x0301, 0xc1e0, 0x4fa7, 0xa8d4,
+ 0xe0f0, 0x381c, 0x9dce, 0x60b0, 0x2d96, 0xf7fb, 0x4120, 0xbedf,
+ 0xebf5, 0x2f97, 0xf2f9, 0x1309, 0xb259, 0x74ba, 0xbadd, 0x59ac,
+ 0x48a4, 0x944a, 0x71b8, 0x88c4, 0x95ca, 0x4ba5, 0xbd5e, 0x46a3,
+ 0xd0e8, 0x3c9e, 0x0c86, 0xc562, 0x1a0d, 0xf4fa, 0xd7eb, 0x1c8e,
+ 0x7ebf, 0x8a45, 0x82c1, 0x53a9, 0x3098, 0xc6e3, 0xdd6e, 0x0e87,
+ 0xb158, 0x592c, 0x2914, 0xe4f2, 0x6bb5, 0x8140, 0xe271, 0x2d16,
+ 0x160b, 0xe6f3, 0xae57, 0x7b3d, 0x4824, 0xba5d, 0xe1f0, 0x361b,
+ 0xcfe7, 0x7dbe, 0xc5e2, 0x5229, 0x8844, 0x389c, 0x93c9, 0x0683,
+ 0x8d46, 0x2793, 0xa753, 0x2814, 0x4e27, 0xe673, 0x75ba, 0xf87c,
+ 0xb7db, 0x0180, 0xf9fc, 0x6a35, 0xe070, 0x54aa, 0xbfdf, 0x2e97,
+ 0xfc7e, 0x52a9, 0x9249, 0x190c, 0x2f17, 0x8341, 0x50a8, 0xd96c,
+ 0xd76b, 0x4924, 0x5c2e, 0xe7f3, 0x1389, 0x8f47, 0x8944, 0x3018,
+ 0x91c8, 0x170b, 0x3a9d, 0x99cc, 0xd1e8, 0x55aa, 0x6b35, 0xcae5,
+ 0x6fb7, 0xf5fa, 0xa0d0, 0x1f0f, 0xbb5d, 0x2391, 0x65b2, 0xd8ec,
+ 0x2010, 0xa2d1, 0xcf67, 0x6834, 0x7038, 0xf078, 0x8ec7, 0x2b15,
+ 0xa3d1, 0x41a0, 0xf8fc, 0x3f1f, 0xecf6, 0x0c06, 0xa653, 0x6331,
+ 0x49a4, 0xb359, 0x3299, 0xedf6, 0x8241, 0x7a3d, 0xe8f4, 0x351a,
+ 0x5aad, 0xbcde, 0x45a2, 0x8643, 0x0582, 0xe170, 0x0b05, 0xca65,
+ 0xb9dc, 0x4723, 0x86c3, 0x5dae, 0x6231, 0x9e4f, 0x4ca6, 0x954a,
+ 0x3118, 0xff7f, 0xeb75, 0x0080, 0xfd7e, 0x3198, 0x369b, 0xdfef,
+ 0xdf6f, 0x0984, 0x2512, 0xd66b, 0x97cb, 0x43a1, 0x7c3e, 0x8dc6,
+ 0x0884, 0xc2e1, 0x96cb, 0x793c, 0xd4ea, 0x1c0e, 0x5b2d, 0xb65b,
+ 0xeff7, 0x3d1e, 0x51a8, 0xa6d3, 0xb75b, 0x6733, 0x188c, 0xed76,
+ 0x4623, 0xce67, 0xfa7d, 0x57ab, 0x2613, 0xacd6, 0x8bc5, 0x2492,
+ 0xe5f2, 0x753a, 0x79bc, 0xcce6, 0x0100, 0x9349, 0x8cc6, 0x3b1d,
+ 0x6432, 0xe874, 0x9c4e, 0x359a, 0x140a, 0x9acd, 0xfdfe, 0x56ab,
+ 0xcee7, 0x5a2d, 0x168b, 0xa7d3, 0x3a1d, 0xac56, 0xf3f9, 0x4020,
+ 0x9048, 0x341a, 0xad56, 0x2c96, 0x7339, 0xd5ea, 0x5faf, 0xdcee,
+ 0x379b, 0x8b45, 0x2a95, 0xb3d9, 0x5028, 0xee77, 0x5cae, 0xc763,
+ 0x72b9, 0xd2e9, 0x0b85, 0x8e47, 0x81c0, 0x2311, 0xe974, 0x6e37,
+ 0xdc6e, 0x64b2, 0x8542, 0x180c, 0xabd5, 0x1188, 0xe371, 0x7cbe,
+ 0x0201, 0xda6d, 0xef77, 0x1289, 0x6ab5, 0xb058, 0x964b, 0x6934,
+ 0x0904, 0xc9e4, 0xc462, 0x2110, 0xe572, 0x2713, 0x399c, 0xde6f,
+ 0xa150, 0x7d3e, 0x0804, 0xf1f8, 0xd9ec, 0x0703, 0x6130, 0x9a4d,
+ 0xa351, 0x67b3, 0x2a15, 0xcb65, 0x5f2f, 0x994c, 0xc7e3, 0x2412,
+ 0x5e2f, 0xaa55, 0x3219, 0xe3f1, 0xb5da, 0x4321, 0xc864, 0x1b0d,
+ 0x5128, 0xbdde, 0x1d0e, 0xd46a, 0x3e1f, 0xd068, 0x63b1, 0xa854,
+ 0x3d9e, 0xcde6, 0x158a, 0xc060, 0xc663, 0x349a, 0xffff, 0x2894,
+ 0x3b9d, 0xd369, 0x3399, 0xfeff, 0x44a2, 0xaed7, 0x5d2e, 0x92c9,
+ 0x150a, 0xbf5f, 0xaf57, 0x2090, 0x73b9, 0xdb6d, 0xd86c, 0x552a,
+ 0xf6fb, 0x4422, 0x6cb6, 0xfbfd, 0x148a, 0xa4d2, 0x9f4f, 0x0a85,
+ 0x6f37, 0xc160, 0x9148, 0x1a8d, 0x198c, 0xb55a, 0xf67b, 0x7f3f,
+ 0x85c2, 0x3319, 0x5bad, 0xc8e4, 0x77bb, 0xc3e1, 0xb85c, 0x2994,
+ 0xcbe5, 0x4da6, 0xf0f8, 0x5329, 0x2e17, 0xaad5, 0x0482, 0xa5d2,
+ 0x2c16, 0xb2d9, 0x371b, 0x8c46, 0x4d26, 0xd168, 0x47a3, 0xfe7f,
+ 0x7138, 0xf379, 0x0e07, 0xa9d4, 0x84c2, 0x0402, 0xea75, 0x4f27,
+ 0x9fcf, 0x0502, 0xc0e0, 0x7fbf, 0xeef7, 0x76bb, 0xa050, 0x1d8e,
+ 0x391c, 0xc361, 0xd269, 0x0d86, 0x572b, 0xafd7, 0xadd6, 0x70b8,
+ 0x7239, 0x90c8, 0xb95c, 0x7e3f, 0x98cc, 0x78bc, 0x4221, 0x87c3,
+ 0xc261, 0x3c1e, 0x6d36, 0xb6db, 0xbc5e, 0x40a0, 0x0281, 0xdbed,
+ 0x8040, 0x66b3, 0x0f07, 0xcc66, 0x7abd, 0x9ecf, 0xe472, 0x2592,
+ 0x6db6, 0xbbdd, 0x0783, 0xf47a, 0x80c0, 0x542a, 0xfb7d, 0x0a05,
+ 0x2291, 0xec76, 0x68b4, 0x83c1, 0x4b25, 0x8743, 0x1088, 0xf97c,
+ 0x562b, 0x8442, 0x783c, 0x8fc7, 0xab55, 0x7bbd, 0x94ca, 0x61b0,
+ 0x1008, 0xdaed, 0x1e0f, 0xf178, 0x69b4, 0xa1d0, 0x763b, 0x9bcd
+};
+
+/* Range of input data for KASUMI is from 1 to 20000 bits */
+#define KASUMI_MIN_LEN 1
+#define KASUMI_MAX_LEN 20000
+
+/* KASUMI cipher definitions */
+#define NUM_KASUMI_ROUNDS (8) /* 8 rounds in the kasumi spec */
+#define QWORDSIZEINBITS (64)
+#define QWORDSIZEINBYTES (8)
+#define LAST_PADDING_BIT (1)
+
+#define BYTESIZE (8)
+/* Width of an object in bits */
+#define BITSIZE(x) ((int)(sizeof(x)*BYTESIZE))
+
+/*--------- 16 bit rotate left ------------------------------------------*/
+#define ROL16(a,b) (uint16_t)((a<<b)|(a>>(16-b)))
+
+/*----- a 64-bit structure to help with kasumi endian issues -----*/
+/* One 64-bit KASUMI block, addressable as 8/16/32/64-bit lanes */
+typedef union _ku64 {
+ uint64_t b64[1];
+ uint32_t b32[2];
+ uint16_t b16[4];
+ uint8_t b8[8];
+} kasumi_union_t;
+
+/* Scratch buffer used to read/write partial blocks without touching
+ * bytes beyond the end of the caller's buffer */
+typedef union SafeBuffer {
+ uint64_t b64;
+ uint32_t b32[2];
+ uint8_t b8[KASUMI_BLOCK_SIZE];
+} SafeBuf;
+
+/*---------------------------------------------------------------------
+* Inline 16-bit left rotation: ROL16 is defined once, earlier in this
+* header. A second, byte-identical definition used to appear here; it
+* was redundant (though legal) and has been removed.
+*---------------------------------------------------------------------*/
+
+/*
+ * KASUMI FI function on one 16-bit value ("data", modified in place):
+ * two S7/S9 substitution layers (tables above, accessed via
+ * LOOKUP16_SSE) keyed with key1/key2 and finally XOR'd with key3.
+ */
+#define FIp1(data, key1, key2, key3) \
+ do { \
+ uint16_t datal, datah; \
+ \
+ (data) ^= (key1); \
+ datal = LOOKUP16_SSE(sso_kasumi_S7e, (uint8_t)(data), 256); \
+ datah = LOOKUP16_SSE(sso_kasumi_S9e, (data) >> 7, 512); \
+ (data) = datal ^ datah; \
+ (data) ^= (key2); \
+ datal = LOOKUP16_SSE(sso_kasumi_S7e, (data) >> 9, 256); \
+ datah = LOOKUP16_SSE(sso_kasumi_S9e, (data) & 0x1FF, 512); \
+ (data) = datal ^ datah; \
+ (data) ^= (key3); \
+ } while (0)
+
+/* FI applied to two independent 16-bit values sharing key1/key2
+ * (key3/key4 differ per value) - enables instruction interleaving */
+#define FIp2(data1, data2, key1, key2, key3, key4) \
+ do { \
+ FIp1(data1, key1, key2, key3); \
+ FIp1(data2, key1, key2, key4); \
+ } while (0)
+
+/*
+ * KASUMI FL inner step: mixes the two 16-bit halves (res_h/res_l,
+ * modified in place) using AND/OR with the two key halves and
+ * one-bit left rotations.
+ */
+#define FLpi(key1, key2, res_h, res_l) \
+ do { \
+ uint16_t l, r; \
+ r = (res_l) & (key1); \
+ r = (res_h) ^ ROL16(r, 1); \
+ l = r | (key2); \
+ (res_h) = (res_l) ^ ROL16(l, 1); \
+ (res_l) = r; \
+ } while (0)
+
+/* FL function on one block; "index" points at the round's 8-subkey
+ * group (KL key halves at offsets 0 and 1) */
+#define FLp1(index, h, l) \
+ do { \
+ uint16_t ka = *(index + 0); \
+ uint16_t kb = *(index + 1); \
+ FLpi(ka, kb, h, l); \
+ } while (0)
+
+/* FL on two blocks with the same round keys (interleaved) */
+#define FLp2(index, h1, l1, h2, l2) \
+ do { \
+ uint16_t ka = *(index + 0); \
+ uint16_t kb = *(index + 1); \
+ FLpi(ka, kb, h1, l1); \
+ FLpi(ka, kb, h2, l2); \
+ } while (0)
+
+/* FL on three blocks with the same round keys (interleaved) */
+#define FLp3(index, h1, l1, h2, l2, h3, l3) \
+ do { \
+ uint16_t ka = *(index + 0); \
+ uint16_t kb = *(index + 1); \
+ FLpi(ka, kb, h1, l1); \
+ FLpi(ka, kb, h2, l2); \
+ FLpi(ka, kb, h3, l3); \
+ } while (0)
+
+/* FL on four blocks, composed from two FLp2 invocations */
+#define FLp4(index, h1, l1, h2, l2, h3, l3, h4, l4) \
+ do { \
+ FLp2(index, h1, l1, h2, l2); \
+ FLp2(index, h3, l3, h4, l4); \
+ } while (0)
+
+/* FO function on one block: three FI applications keyed from
+ * offsets 2..7 of the round's subkey group */
+#define FOp1(index, h, l) \
+ do { \
+ FIp1(h, *(index + 2), *(index + 3), l); \
+ FIp1(l, *(index + 4), *(index + 5), h); \
+ FIp1(h, *(index + 6), *(index + 7), l); \
+ } while (0)
+
+/* FO on two blocks with the same round keys (interleaved) */
+#define FOp2(index, h1, l1, h2, l2) \
+ do { \
+ uint16_t ka = *(index + 2); \
+ uint16_t kb = *(index + 3); \
+ FIp2(h1, h2, ka, kb, l1, l2); \
+ ka = *(index + 4); \
+ kb = *(index + 5); \
+ FIp2(l1, l2, ka, kb, h1, h2); \
+ ka = *(index + 6); \
+ kb = *(index + 7); \
+ FIp2(h1, h2, ka, kb, l1, l2); \
+ } while (0)
+
+/* FO on three blocks with the same round keys (interleaved) */
+#define FOp3(index, h1, l1, h2, l2, h3, l3) \
+ do { \
+ uint16_t ka = *(index + 2); \
+ uint16_t kb = *(index + 3); \
+ FIp2(h1, h2, ka, kb, l1, l2); \
+ FIp1(h3, ka, kb, l3); \
+ ka = *(index + 4); \
+ kb = *(index + 5); \
+ FIp2(l1, l2, ka, kb, h1, h2); \
+ FIp1(l3, ka, kb, h3); \
+ ka = *(index + 6); \
+ kb = *(index + 7); \
+ FIp2(h1, h2, ka, kb, l1, l2); \
+ FIp1(h3, ka, kb, l3); \
+ } while (0)
+
+/* FO on four blocks with the same round keys (interleaved) */
+#define FOp4(index, h1, l1, h2, l2, h3, l3, h4, l4) \
+ do { \
+ uint16_t ka = *(index + 2); \
+ uint16_t kb = *(index + 3); \
+ FIp2(h1, h2, ka, kb, l1, l2); \
+ FIp2(h3, h4, ka, kb, l3, l4); \
+ ka = *(index + 4); \
+ kb = *(index + 5); \
+ FIp2(l1, l2, ka, kb, h1, h2); \
+ FIp2(l3, l4, ka, kb, h3, h4); \
+ ka = *(index + 6); \
+ kb = *(index + 7); \
+ FIp2(h1, h2, ka, kb, l1, l2); \
+ FIp2(h3, h4, ka, kb, l3, l4); \
+ } while (0)
+
+/**
+ *******************************************************************************
+ * @description
+ * Runs the full 8-round KASUMI cipher on a single 64-bit block, in place,
+ * using the key material already scheduled in the context.
+ *
+ * @param[in] context Flat array of scheduled round subkeys
+ * @param[in/out] data Block to be enc/dec (four 16-bit words)
+ *
+ ******************************************************************************/
+static void kasumi_1_block(const uint16_t *context, uint16_t *data)
+{
+        const uint16_t *key = context;
+        const uint16_t *const key_end = context + KASUMI_KEY_SCHEDULE_SIZE;
+        uint16_t lo, hi;
+
+        /* 8 rounds, processed as 4 pairs of (even, odd) rounds */
+        while (key < key_end) {
+                /* even round: FL then FO, result XOR'd into the left half */
+                lo = data[3];
+                hi = data[2];
+                FLp1(key, hi, lo);
+                FOp1(key, hi, lo);
+                key += 8;
+                data[1] ^= lo;
+                data[0] ^= hi;
+
+                /* odd round: FO then FL, result XOR'd into the right half */
+                hi = data[1];
+                lo = data[0];
+                FOp1(key, hi, lo);
+                FLp1(key, hi, lo);
+                key += 8;
+                data[3] ^= hi;
+                data[2] ^= lo;
+        }
+}
+
+/**
+ ******************************************************************************
+ * @description
+ * Runs the full KASUMI cipher on two 64-bit blocks at once, in place, using
+ * the key material already scheduled in the context. The two data paths are
+ * interleaved at macro level to give the compiler scheduling freedom.
+ *
+ * @param[in] context Flat array of scheduled round subkeys
+ * @param[in/out] data1 First block to be enc/dec
+ * @param[in/out] data2 Second block to be enc/dec
+ *
+ ******************************************************************************/
+static void
+kasumi_2_blocks(const uint16_t *context, uint16_t *data1, uint16_t *data2)
+{
+        const uint16_t *key = context;
+        const uint16_t *const key_end = context + KASUMI_KEY_SCHEDULE_SIZE;
+        uint16_t a_l, a_h;
+        uint16_t b_l, b_h;
+
+        /* 4 iterations of an (even, odd) round pair on both blocks */
+        while (key < key_end) {
+                /* even round */
+                a_l = data1[3];
+                a_h = data1[2];
+                b_l = data2[3];
+                b_h = data2[2];
+                FLp2(key, a_h, a_l, b_h, b_l);
+                FOp2(key, a_h, a_l, b_h, b_l);
+                key += 8;
+                data1[1] ^= a_l;
+                data1[0] ^= a_h;
+                data2[1] ^= b_l;
+                data2[0] ^= b_h;
+
+                /* odd round */
+                a_h = data1[1];
+                a_l = data1[0];
+                b_h = data2[1];
+                b_l = data2[0];
+                FOp2(key, a_h, a_l, b_h, b_l);
+                FLp2(key, a_h, a_l, b_h, b_l);
+                key += 8;
+                data1[3] ^= a_h;
+                data1[2] ^= a_l;
+                data2[3] ^= b_h;
+                data2[2] ^= b_l;
+        }
+}
+
+
+/**
+ *******************************************************************************
+ * @description
+ * This function performs the Kasumi operation on the given blocks using the key
+ * that is already scheduled in the context
+ *
+ * @param[in] pContext Context where the scheduled keys are stored
+ * @param[in/out] pData1 First block to be enc/dec
+ * @param[in/out] pData2 Second block to be enc/dec
+ * @param[in/out] pData3 Third block to be enc/dec
+ *
+ ******************************************************************************/
+static void
+kasumi_3_blocks(const uint16_t *context, uint16_t *data1,
+ uint16_t *data2, uint16_t *data3)
+{
+ /* Case when the compiler is able to interleave efficiently */
+ const uint16_t *end = context + KASUMI_KEY_SCHEDULE_SIZE;
+ uint16_t temp1_l, temp1_h;
+ uint16_t temp2_l, temp2_h;
+ uint16_t temp3_l, temp3_h;
+
+ /* 4 iterations odd/even , with fine grain interleave */
+ do {
+ /* even round on all three blocks */
+ temp1_l = data1[3];
+ temp1_h = data1[2];
+ temp2_l = data2[3];
+ temp2_h = data2[2];
+ temp3_l = data3[3];
+ temp3_h = data3[2];
+ FLp3(context, temp1_h, temp1_l, temp2_h, temp2_l, temp3_h,
+ temp3_l);
+ FOp3(context, temp1_h, temp1_l, temp2_h, temp2_l, temp3_h,
+ temp3_l);
+ context += 8;
+ data1[1] ^= temp1_l;
+ data1[0] ^= temp1_h;
+ data2[1] ^= temp2_l;
+ data2[0] ^= temp2_h;
+ data3[1] ^= temp3_l;
+ data3[0] ^= temp3_h;
+
+ /* odd round on all three blocks */
+ temp1_h = data1[1];
+ temp1_l = data1[0];
+ temp2_h = data2[1];
+ temp2_l = data2[0];
+ temp3_h = data3[1];
+ temp3_l = data3[0];
+ FOp3(context, temp1_h, temp1_l, temp2_h, temp2_l, temp3_h,
+ temp3_l);
+ FLp3(context, temp1_h, temp1_l, temp2_h, temp2_l, temp3_h,
+ temp3_l);
+ context += 8;
+ data1[3] ^= temp1_h;
+ data1[2] ^= temp1_l;
+ data2[3] ^= temp2_h;
+ data2[2] ^= temp2_l;
+ data3[3] ^= temp3_h;
+ data3[2] ^= temp3_l;
+ } while (context < end);
+}
+
+/**
+ *******************************************************************************
+ * @description
+ * Runs the full KASUMI cipher on four 64-bit blocks, in place, using the key
+ * material already scheduled in the context.
+ *
+ * @param[in] context Context where the scheduled keys are stored
+ * @param[in] ppData Array of pointers to the four blocks
+ *
+ ******************************************************************************/
+static void
+kasumi_4_blocks(const uint16_t *context, uint16_t **ppData)
+{
+        /* Case when the compiler is unable to interleave four blocks
+         * efficiently: process them pairwise instead. */
+        kasumi_2_blocks(context, ppData[0], ppData[1]);
+        kasumi_2_blocks(context, ppData[2], ppData[3]);
+}
+
+/**
+ ******************************************************************************
+ * @description
+ * Runs the full KASUMI cipher on eight 64-bit blocks, in place, using the key
+ * material already scheduled in the context.
+ *
+ * @param[in] context Context where the scheduled keys are stored
+ * @param[in] ppData Array of pointers to the eight blocks
+ *
+ ******************************************************************************/
+static void
+kasumi_8_blocks(const uint16_t *context, uint16_t **ppData)
+{
+        /* Split into two groups of four blocks */
+        kasumi_4_blocks(context, &ppData[0]);
+        kasumi_4_blocks(context, &ppData[4]);
+}
+
+/******************************************************************************
+* @description
+* Multiple wrappers for the Kasumi rounds on up to 16 blocks of 64 bits at a
+* time.
+*
+* Depending on the variable packet lengths, different wrappers get called.
+* It has been measured that processing 1 packet is faster than 2, 2 packets
+* is faster than 3, 3 packets is faster than 4, and so on ...
+* It has also been measured that 6 = 4+2 packets is faster than 8, and that
+* 7 packets are processed faster than 8 packets.
+*
+* If these assumptions do not hold on a given platform, it is easy to
+* implement the right function and reference it in kasumiWrapperArray.
+*
+*******************************************************************************/
+/* 1 packet: single-block cipher */
+static void
+kasumi_f8_1_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_1_block(context, data[0]);
+}
+
+/* 2 packets: interleaved 2-block cipher */
+static void
+kasumi_f8_2_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_2_blocks(context, data[0], data[1]);
+}
+
+/* 3 packets: interleaved 3-block cipher */
+static void
+kasumi_f8_3_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_3_blocks(context, data[0], data[1], data[2]);
+}
+
+/* 5 packets: 4 + 1 */
+static void
+kasumi_f8_5_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_4_blocks(context, &data[0]);
+ kasumi_1_block(context, data[4]);
+}
+
+/* 6 packets: 4 + 2 */
+static void
+kasumi_f8_6_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ /* It is also assumed 6 = 4+2 packets is faster than 8 */
+ kasumi_4_blocks(context, &data[0]);
+ kasumi_2_blocks(context, data[4], data[5]);
+}
+
+/* 7 packets: 4 + 3 */
+static void
+kasumi_f8_7_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_4_blocks(context, &data[0]);
+ kasumi_3_blocks(context, data[4], data[5], data[6]);
+}
+
+/* 9 packets: 8 + 1 */
+static void
+kasumi_f8_9_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+
+ kasumi_8_blocks(context, &data[0]);
+ kasumi_1_block(context, data[8]);
+}
+
+/* 10 packets: 8 + 2 */
+static void
+kasumi_f8_10_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_8_blocks(context, &data[0]);
+ kasumi_2_blocks(context, data[8], data[9]);
+}
+
+/* 11 packets: 8 + 3 */
+static void
+kasumi_f8_11_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_8_blocks(context, &data[0]);
+ kasumi_3_blocks(context, data[8], data[9], data[10]);
+}
+
+/* 12 packets: 8 + 4 */
+static void
+kasumi_f8_12_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_8_blocks(context, &data[0]);
+ kasumi_4_blocks(context, &data[8]);
+}
+
+/* 13 packets: 8 + 4 + 1 */
+static void
+kasumi_f8_13_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+
+ kasumi_8_blocks(context, &data[0]);
+ kasumi_4_blocks(context, &data[8]);
+ kasumi_1_block(context, data[12]);
+}
+
+/* 14 packets: 8 + 4 + 2 */
+static void
+kasumi_f8_14_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_8_blocks(context, &data[0]);
+ kasumi_4_blocks(context, &data[8]);
+ kasumi_2_blocks(context, data[12], data[13]);
+}
+
+/* 15 packets: 8 + 4 + 3 */
+static void
+kasumi_f8_15_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_8_blocks(context, &data[0]);
+ kasumi_4_blocks(context, &data[8]);
+ kasumi_3_blocks(context, data[12], data[13], data[14]);
+}
+
+/* 16 packets: 8 + 8 */
+static void
+kasumi_f8_16_buffer_wrapper(const uint16_t *context, uint16_t **data)
+{
+ kasumi_8_blocks(context, &data[0]);
+ kasumi_8_blocks(context, &data[8]);
+}
+
+/* Signature shared by all the wrappers above (and by kasumi_4_blocks /
+ * kasumi_8_blocks, which are used directly in the table below) */
+typedef void (*kasumi_wrapper_t)(const uint16_t *, uint16_t **);
+
+/* Dispatch table indexed by the number of packets (1..16); index 0 unused */
+static kasumi_wrapper_t kasumiWrapperArray[] = {
+ NULL,
+ kasumi_f8_1_buffer_wrapper,
+ kasumi_f8_2_buffer_wrapper,
+ kasumi_f8_3_buffer_wrapper,
+ kasumi_4_blocks,
+ kasumi_f8_5_buffer_wrapper,
+ kasumi_f8_6_buffer_wrapper,
+ kasumi_f8_7_buffer_wrapper,
+ kasumi_8_blocks,
+ kasumi_f8_9_buffer_wrapper,
+ kasumi_f8_10_buffer_wrapper,
+ kasumi_f8_11_buffer_wrapper,
+ kasumi_f8_12_buffer_wrapper,
+ kasumi_f8_13_buffer_wrapper,
+ kasumi_f8_14_buffer_wrapper,
+ kasumi_f8_15_buffer_wrapper,
+ kasumi_f8_16_buffer_wrapper};
+
+/*---------------------------------------------------------------------
+* kasumi_key_schedule_sk()
+* Build the key schedule. Most "key" operations use 16-bit words.
+*
+* Context is a flat array of 64 uint16. The subkeys are laid out in the
+* exact order the round macros (FLp*/FOp*) consume them.
+*---------------------------------------------------------------------*/
+static inline void
+kasumi_key_schedule_sk(uint16_t *context, const void *pKey)
+{
+
+        /* Kasumi constants (C0..C7 from the spec) */
+        static const uint16_t C[] = {0x0123, 0x4567, 0x89AB, 0xCDEF,
+                                     0xFEDC, 0xBA98, 0x7654, 0x3210};
+
+        uint16_t k[8], kprime[8], i;
+        const uint8_t *pk = (const uint8_t *) pKey;
+
+        /* Load the big-endian 16-bit key words K[] and derive K'[] */
+        for (i = 0; i < 8; i++, pk += 2) {
+                k[i] = (pk[0] << 8) + pk[1];
+                kprime[i] = k[i] ^ C[i];
+        }
+
+        /*
+         * Construct the per-round sub keys (KL, KO, KI) in the order
+         * they are consumed at run-time
+         */
+        for (i = 0; i < 8; i++, context += 8) {
+                context[0] = ROL16(k[i], 1);
+                context[1] = kprime[(i + 2) & 0x7];
+                context[2] = ROL16(k[(i + 1) & 0x7], 5);
+                context[3] = kprime[(i + 4) & 0x7];
+                context[4] = ROL16(k[(i + 5) & 0x7], 8);
+                context[5] = kprime[(i + 3) & 0x7];
+                context[6] = ROL16(k[(i + 6) & 0x7], 13);
+                context[7] = kprime[(i + 7) & 0x7];
+        }
+#ifdef SAFE_DATA
+        /* Wipe raw key material from the stack */
+        clear_mem(k, sizeof(k));
+        clear_mem(kprime, sizeof(kprime));
+#endif
+}
+
+/*---------------------------------------------------------------------
+* kasumi_compute_sched()
+* Generic kasumi key schedule init: schedules both the plain key and
+* the modifier-XORed key into the context.
+*
+* Returns 0 on success, -1 on NULL input (SAFE_PARAM builds only).
+*---------------------------------------------------------------------*/
+static inline int
+kasumi_compute_sched(const uint8_t modifier,
+                     const void *const pKey, void *pCtx)
+{
+#ifdef SAFE_PARAM
+        /* Check for NULL pointers */
+        if (pKey == NULL || pCtx == NULL)
+                return -1;
+#endif
+        const uint8_t *const key = (const uint8_t * const)pKey;
+        kasumi_key_sched_t *sched = (kasumi_key_sched_t *)pCtx;
+        uint8_t modKey[KASUMI_KEY_SIZE] = {0}; /* modifier-XORed key */
+        uint32_t n;
+
+        /* Derive the modified key */
+        for (n = 0; n < KASUMI_KEY_SIZE; n++)
+                modKey[n] = (uint8_t)key[n] ^ modifier;
+
+        /* Schedule the plain key, then the modified one */
+        kasumi_key_schedule_sk(sched->sk16, pKey);
+        kasumi_key_schedule_sk(sched->msk16, modKey);
+
+#ifdef SAFE_DATA
+        clear_mem(modKey, sizeof(modKey));
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+        return 0;
+}
+
+/*---------------------------------------------------------------------
+* kasumi_key_sched_size()
+* Get the size of a kasumi key sched context.
+*
+* Returns the number of bytes the caller must allocate for a
+* kasumi_key_sched_t.
+*---------------------------------------------------------------------*/
+static inline size_t
+kasumi_key_sched_size(void)
+{
+ /*
+ * There are two keys that need to be scheduled: the original one and
+ * the modified one (xored with the relevant modifier)
+ */
+ return sizeof(kasumi_key_sched_t);
+}
+
+/*---------------------------------------------------------------------
+* kasumi_init_f8_key_sched()
+* Compute the kasumi f8 (confidentiality) key schedule.
+* The modified key uses the 0x55 modifier.
+*
+* Returns 0 on success, -1 on NULL input (SAFE_PARAM builds only).
+*---------------------------------------------------------------------*/
+
+static inline int
+kasumi_init_f8_key_sched(const void *const pKey,
+ kasumi_key_sched_t *pCtx)
+{
+ return kasumi_compute_sched(0x55, pKey, pCtx);
+}
+
+/*---------------------------------------------------------------------
+* kasumi_init_f9_key_sched()
+* Compute the kasumi f9 (integrity) key schedule.
+* The modified key uses the 0xAA modifier.
+*
+* Returns 0 on success, -1 on NULL input (SAFE_PARAM builds only).
+*---------------------------------------------------------------------*/
+
+static inline int
+kasumi_init_f9_key_sched(const void *const pKey,
+ kasumi_key_sched_t *pCtx)
+{
+ return kasumi_compute_sched(0xAA, pKey, pCtx);
+}
+
+/*
+ * Arch-specific (SSE/AVX) entry points for key-sched size/init.
+ * NOTE(review): only the prototypes live in this header; the
+ * implementations are presumably provided in per-arch translation
+ * units elsewhere in the library.
+ */
+size_t
+kasumi_key_sched_size_sse(void);
+
+int
+kasumi_init_f8_key_sched_sse(const void *pKey, kasumi_key_sched_t *pCtx);
+
+int
+kasumi_init_f9_key_sched_sse(const void *pKey, kasumi_key_sched_t *pCtx);
+
+size_t
+kasumi_key_sched_size_avx(void);
+
+int
+kasumi_init_f8_key_sched_avx(const void *pKey, kasumi_key_sched_t *pCtx);
+
+int
+kasumi_init_f9_key_sched_avx(const void *pKey, kasumi_key_sched_t *pCtx);
+
+
+/*---------------------------------------------------------------------
+* kasumi_f8_1_buffer()
+* KASUMI F8 (confidentiality) on one buffer with a whole-byte length:
+* builds the per-packet modifier from the IV, then generates keystream
+* block by block and XORs it over the input. The final partial block is
+* handled via a safe scratch buffer so no out-of-bounds bytes are read
+* or written.
+*---------------------------------------------------------------------*/
+static inline void
+kasumi_f8_1_buffer(const kasumi_key_sched_t *pCtx, const uint64_t IV,
+ const void *pIn, void *pOut,
+ const uint32_t length)
+{
+ uint32_t blkcnt;
+ kasumi_union_t a, b; /* the modifier */
+ SafeBuf safeInBuf;
+ const uint8_t *pBufferIn = (const uint8_t *) pIn;
+ uint8_t *pBufferOut = (uint8_t *) pOut;
+ uint32_t lengthInBytes = length;
+
+ /* IV Endianity */
+ a.b64[0] = BSWAP64(IV);
+
+ /* First encryption (with the modified key) to create modifier */
+ kasumi_1_block(pCtx->msk16, a.b16 );
+
+ /* Final initialisation steps */
+ blkcnt = 0;
+ b.b64[0] = a.b64[0];
+
+ /* Now run the block cipher */
+ while (lengthInBytes) {
+ /* KASUMI it to produce the next block of keystream */
+ kasumi_1_block(pCtx->sk16, b.b16 );
+
+ if (lengthInBytes > KASUMI_BLOCK_SIZE) {
+ pBufferIn = xor_keystrm_rev(pBufferOut, pBufferIn,
+ b.b64[0]);
+ pBufferOut += KASUMI_BLOCK_SIZE;
+ /* loop variant */
+ /* done another 64 bits */
+ lengthInBytes -= KASUMI_BLOCK_SIZE;
+
+ /* apply the modifier and update the block count */
+ b.b64[0] ^= a.b64[0];
+ b.b16[0] ^= (uint16_t)++blkcnt;
+ } else if (lengthInBytes < KASUMI_BLOCK_SIZE) {
+ /* end of the loop, handle the last (partial) block
+ * through the scratch buffer */
+ memcpy_keystrm(safeInBuf.b8, pBufferIn,
+ lengthInBytes);
+ xor_keystrm_rev(b.b8, safeInBuf.b8, b.b64[0]);
+ memcpy_keystrm(pBufferOut, b.b8, lengthInBytes);
+ lengthInBytes = 0;
+ } else {
+ /* lengthInBytes == KASUMI_BLOCK_SIZE: exactly one
+ * full block left */
+ xor_keystrm_rev(pBufferOut, pBufferIn, b.b64[0]);
+ lengthInBytes = 0;
+ }
+ }
+#ifdef SAFE_DATA
+ /* Clear sensitive data in stack */
+ clear_mem(&a, sizeof(a));
+ clear_mem(&b, sizeof(b));
+ clear_mem(&safeInBuf, sizeof(safeInBuf));
+#endif
+}
+
+/*---------------------------------------------------------------------
+* preserve_bits()
+* Helper for a partial last byte in the bit-length cipher path:
+* clears the low-order (64 - bit_len) bits of the keystream word and,
+* for out-of-place operation, also clears them in the copied input and
+* merges the corresponding bits of the existing output into the
+* keystream, so the final XOR leaves those output bits untouched.
+*---------------------------------------------------------------------*/
+static inline void
+preserve_bits(kasumi_union_t *c,
+ const uint8_t *pcBufferOut, const uint8_t *pcBufferIn,
+ SafeBuf *safeOutBuf, SafeBuf *safeInBuf,
+ const uint8_t bit_len, const uint8_t byte_len)
+{
+ /* mask keeps the top bit_len bits of the 64-bit block */
+ const uint64_t mask = UINT64_MAX << (KASUMI_BLOCK_SIZE * 8 - bit_len);
+
+ /* Clear the last bits of the keystream and the input
+ * (input only in out-of-place case) */
+ c->b64[0] &= mask;
+ if (pcBufferIn != pcBufferOut) {
+ /* byte-swapped mask for the byte-order of the buffers */
+ const uint64_t swapMask = BSWAP64(mask);
+
+ safeInBuf->b64 &= swapMask;
+
+ /*
+ * Merge the last bits from the output, to be preserved,
+ * in the keystream, to be XOR'd with the input
+ * (which last bits are 0, maintaining the output bits)
+ */
+ memcpy_keystrm(safeOutBuf->b8, pcBufferOut, byte_len);
+ c->b64[0] |= BSWAP64(safeOutBuf->b64 & ~swapMask);
+ }
+}
+
+/*---------------------------------------------------------------------
+* kasumi_f8_1_buffer_bit()
+* KASUMI F8 on one buffer with a bit-granular length and a bit offset
+* into the first byte. Bits of the output buffer outside the cipher
+* region (the leading offset bits and any trailing partial-byte bits)
+* are preserved in the out-of-place case via preserve_bits().
+*---------------------------------------------------------------------*/
+static inline void
+kasumi_f8_1_buffer_bit(const kasumi_key_sched_t *pCtx, const uint64_t IV,
+ const void *pIn, void *pOut,
+ const uint32_t lengthInBits,
+ const uint32_t offsetInBits)
+{
+ const uint8_t *pBufferIn = (const uint8_t *) pIn;
+ uint8_t *pBufferOut = (uint8_t *) pOut;
+ uint32_t cipherLengthInBits = lengthInBits;
+ uint32_t blkcnt;
+ uint64_t shiftrem = 0;
+ kasumi_union_t a, b, c; /* the modifier */
+ const uint8_t *pcBufferIn = pBufferIn + (offsetInBits / 8);
+ uint8_t *pcBufferOut = pBufferOut + (offsetInBits / 8);
+ /* Offset into the first byte (0 - 7 bits) */
+ uint32_t remainOffset = offsetInBits % 8;
+ uint32_t byteLength = (cipherLengthInBits + 7) / 8;
+ SafeBuf safeOutBuf;
+ SafeBuf safeInBuf;
+
+ /* IV Endianity */
+ a.b64[0] = BSWAP64(IV);
+
+ /* First encryption (with the modified key) to create modifier */
+ kasumi_1_block(pCtx->msk16, a.b16);
+
+ /* Final initialisation steps */
+ blkcnt = 0;
+ b.b64[0] = a.b64[0];
+ /* Now run the block cipher */
+
+ /* Start with potential partial block (due to offset and length) */
+ kasumi_1_block(pCtx->sk16, b.b16);
+ /* align the keystream with the bit offset */
+ c.b64[0] = b.b64[0] >> remainOffset;
+ /* Only one block to encrypt */
+ if (cipherLengthInBits < (64 - remainOffset)) {
+ byteLength = (cipherLengthInBits + 7) / 8;
+ memcpy_keystrm(safeInBuf.b8, pcBufferIn, byteLength);
+ /*
+ * If operation is Out-of-place and there is offset
+ * to be applied, "remainOffset" bits from the output buffer
+ * need to be preserved (only applicable to first byte,
+ * since remainOffset is up to 7 bits)
+ */
+ if ((pIn != pOut) && remainOffset) {
+ const uint8_t mask8 =
+ (const uint8_t)(1 << (8 - remainOffset)) - 1;
+
+ safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) |
+ (pcBufferOut[0] & ~mask8);
+ }
+
+ /* If last byte is a partial byte, the last bits of the output
+ * need to be preserved */
+ const uint8_t bitlen_with_off = remainOffset +
+ cipherLengthInBits;
+
+ if ((bitlen_with_off & 0x7) != 0) {
+ preserve_bits(&c, pcBufferOut, pcBufferIn, &safeOutBuf,
+ &safeInBuf, bitlen_with_off, byteLength);
+ }
+ xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, c.b64[0]);
+ memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength);
+ return;
+ }
+
+ /*
+ * If operation is Out-of-place and there is offset
+ * to be applied, "remainOffset" bits from the output buffer
+ * need to be preserved (only applicable to first byte,
+ * since remainOffset is up to 7 bits)
+ */
+ if ((pIn != pOut) && remainOffset) {
+ const uint8_t mask8 =
+ (const uint8_t)(1 << (8 - remainOffset)) - 1;
+
+ memcpy_keystrm(safeInBuf.b8, pcBufferIn, 8);
+ safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) |
+ (pcBufferOut[0] & ~mask8);
+ xor_keystrm_rev(pcBufferOut, safeInBuf.b8, c.b64[0]);
+ pcBufferIn += KASUMI_BLOCK_SIZE;
+ } else {
+ /* At least 64 bits to produce (including offset) */
+ pcBufferIn = xor_keystrm_rev(pcBufferOut, pcBufferIn, c.b64[0]);
+ }
+
+ /* carry the keystream bits displaced by the offset to next block */
+ if (remainOffset != 0)
+ shiftrem = b.b64[0] << (64 - remainOffset);
+ cipherLengthInBits -= KASUMI_BLOCK_SIZE * 8 - remainOffset;
+ pcBufferOut += KASUMI_BLOCK_SIZE;
+ /* apply the modifier and update the block count */
+ b.b64[0] ^= a.b64[0];
+ b.b16[0] ^= (uint16_t)++blkcnt;
+
+ while (cipherLengthInBits) {
+ /* KASUMI it to produce the next block of keystream */
+ kasumi_1_block(pCtx->sk16, b.b16);
+ c.b64[0] = (b.b64[0] >> remainOffset) | shiftrem;
+ if (remainOffset != 0)
+ shiftrem = b.b64[0] << (64 - remainOffset);
+ if (cipherLengthInBits >= KASUMI_BLOCK_SIZE * 8) {
+ pcBufferIn = xor_keystrm_rev(pcBufferOut,
+ pcBufferIn, c.b64[0]);
+ cipherLengthInBits -= KASUMI_BLOCK_SIZE * 8;
+ pcBufferOut += KASUMI_BLOCK_SIZE;
+ /* loop variant */
+
+ /* apply the modifier and update the block count */
+ b.b64[0] ^= a.b64[0];
+ b.b16[0] ^= (uint16_t)++blkcnt;
+ } else {
+ /* end of the loop, handle the last bytes */
+ byteLength = (cipherLengthInBits + 7) / 8;
+ memcpy_keystrm(safeInBuf.b8, pcBufferIn,
+ byteLength);
+
+ /* If last byte is a partial byte, the last bits
+ * of the output need to be preserved */
+ if ((cipherLengthInBits & 0x7) != 0)
+ preserve_bits(&c, pcBufferOut, pcBufferIn,
+ &safeOutBuf, &safeInBuf,
+ cipherLengthInBits, byteLength);
+ xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, c.b64[0]);
+ memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength);
+ cipherLengthInBits = 0;
+ }
+ }
+#ifdef SAFE_DATA
+ /* Clear sensitive data in stack */
+ clear_mem(&a, sizeof(a));
+ clear_mem(&b, sizeof(b));
+ clear_mem(&c, sizeof(c));
+ clear_mem(&safeInBuf, sizeof(safeInBuf));
+ clear_mem(&safeOutBuf, sizeof(safeOutBuf));
+#endif
+}
+
/*
 * KASUMI F8 confidentiality, two packets with one key schedule.
 * Each packet has its own (big-endian) IV and its own byte length;
 * the common prefix of both packets is processed two blocks at a time.
 */
static inline void
kasumi_f8_2_buffer(const kasumi_key_sched_t *pCtx,
                   const uint64_t IV1, const uint64_t IV2,
                   const void *pIn1, void *pOut1,
                   const uint32_t length1,
                   const void *pIn2, void *pOut2,
                   const uint32_t length2)
{
        const uint8_t *pBufferIn1 = (const uint8_t *) pIn1;
        uint8_t *pBufferOut1 = (uint8_t *) pOut1;
        uint32_t lengthInBytes1 = length1;
        const uint8_t *pBufferIn2 = (const uint8_t *) pIn2;
        uint8_t *pBufferOut2 = (uint8_t *) pOut2;
        uint32_t lengthInBytes2 = length2;
        uint32_t blkcnt, length;
        kasumi_union_t a1, b1; /* the modifier */
        kasumi_union_t a2, b2; /* the modifier */
        SafeBuf safeInBuf;

        kasumi_union_t temp;

        /* IV Endianity: IVs arrive big-endian, convert to CPU layout */
        a1.b64[0] = BSWAP64(IV1);
        a2.b64[0] = BSWAP64(IV2);

        /* A = E(IV, modified key), computed for both packets at once */
        kasumi_2_blocks(pCtx->msk16, a1.b16, a2.b16);

        /* Final initialisation steps */
        blkcnt = 0;
        b1.b64[0] = a1.b64[0];
        b2.b64[0] = a2.b64[0];

        /* check which packet is longer and save "common" shortest length */
        if (lengthInBytes1 > lengthInBytes2)
                length = lengthInBytes2;
        else
                length = lengthInBytes1;

        /* Round down to a whole number of 8-byte qwords */
        length &= ~7;
        lengthInBytes1 -= length;
        lengthInBytes2 -= length;

        /* Now run the block cipher for common packet length, a whole number of
         * blocks */
        while (length) {
                /* KASUMI it to produce the next block of keystream for both
                 * packets */
                kasumi_2_blocks(pCtx->sk16, b1.b16, b2.b16);

                /* xor and write keystream */
                pBufferIn1 =
                        xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]);
                pBufferOut1 += KASUMI_BLOCK_SIZE;
                pBufferIn2 =
                        xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]);
                pBufferOut2 += KASUMI_BLOCK_SIZE;
                /* loop variant */
                length -= KASUMI_BLOCK_SIZE; /* done another 64 bits */

                /* apply the modifier and update the block count
                 * (next keystream input = keystream ^ A ^ BLKCNT) */
                b1.b64[0] ^= a1.b64[0];
                b1.b16[0] ^= (uint16_t)++blkcnt;
                b2.b64[0] ^= a2.b64[0];
                b2.b16[0] ^= (uint16_t)blkcnt;
        }

        /*
         * Process common part at end of first packet and second packet.
         * One of the packets has a length less than 8 bytes.
         */
        if (lengthInBytes1 > 0 && lengthInBytes2 > 0) {
                /* final round for 1 of the packets */
                kasumi_2_blocks(pCtx->sk16, b1.b16, b2.b16);
                if (lengthInBytes1 > KASUMI_BLOCK_SIZE) {
                        pBufferIn1 = xor_keystrm_rev(pBufferOut1,
                                                     pBufferIn1, b1.b64[0]);
                        pBufferOut1 += KASUMI_BLOCK_SIZE;
                        b1.b64[0] ^= a1.b64[0];
                        b1.b16[0] ^= (uint16_t)++blkcnt;
                        lengthInBytes1 -= KASUMI_BLOCK_SIZE;
                } else if (lengthInBytes1 < KASUMI_BLOCK_SIZE) {
                        /* partial block: stage through local buffers so no
                         * read/write goes past the end of the caller buffer */
                        memcpy_keystrm(safeInBuf.b8, pBufferIn1,
                                       lengthInBytes1);
                        xor_keystrm_rev(temp.b8, safeInBuf.b8, b1.b64[0]);
                        memcpy_keystrm(pBufferOut1, temp.b8,
                                       lengthInBytes1);
                        lengthInBytes1 = 0;
                        /* lengthInBytes1 == KASUMI_BLOCK_SIZE */
                } else {
                        xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]);
                        lengthInBytes1 = 0;
                }
                if (lengthInBytes2 > KASUMI_BLOCK_SIZE) {
                        pBufferIn2 = xor_keystrm_rev(pBufferOut2,
                                                     pBufferIn2, b2.b64[0]);
                        pBufferOut2 += KASUMI_BLOCK_SIZE;
                        b2.b64[0] ^= a2.b64[0];
                        b2.b16[0] ^= (uint16_t)++blkcnt;
                        lengthInBytes2 -= KASUMI_BLOCK_SIZE;
                } else if (lengthInBytes2 < KASUMI_BLOCK_SIZE) {
                        /* partial block handled via the staging buffers */
                        memcpy_keystrm(safeInBuf.b8, pBufferIn2,
                                       lengthInBytes2);
                        xor_keystrm_rev(temp.b8, safeInBuf.b8, b2.b64[0]);
                        memcpy_keystrm(pBufferOut2, temp.b8,
                                       lengthInBytes2);
                        lengthInBytes2 = 0;
                        /* lengthInBytes2 == KASUMI_BLOCK_SIZE */
                } else {
                        xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]);
                        lengthInBytes2 = 0;
                }
        }

        if (lengthInBytes1 < lengthInBytes2) {
                /* packet 2 is not completed since lengthInBytes2 > 0
                 * packet 1 has less than 8 bytes.
                 */
                if (lengthInBytes1) {
                        kasumi_1_block(pCtx->sk16, b1.b16);
                        xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]);
                }
                /* move packet 2's state/pointers into the packet 1 variables
                 * so the tail loop below only has to deal with one stream */
                lengthInBytes1 = lengthInBytes2;
                b1.b64[0] = b2.b64[0];
                a1.b64[0] = a2.b64[0];
                pBufferIn1 = pBufferIn2;
                pBufferOut1 = pBufferOut2;
        } else { /* lengthInBytes1 >= lengthInBytes2 */
                if (!lengthInBytes1)
                        /* both packets are completed */
                        return;
                /* process the remaining of packet 2 */
                if (lengthInBytes2) {
                        kasumi_1_block(pCtx->sk16, b2.b16);
                        xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]);
                }
                /* packet 1 is not completed */
        }

        /* process the length difference between pkt1 and pkt2 */
        while (lengthInBytes1) {
                /* KASUMI it to produce the next block of keystream */
                kasumi_1_block(pCtx->sk16, b1.b16);

                if (lengthInBytes1 > KASUMI_BLOCK_SIZE) {
                        pBufferIn1 = xor_keystrm_rev(pBufferOut1,
                                                     pBufferIn1, b1.b64[0]);
                        pBufferOut1 += KASUMI_BLOCK_SIZE;
                        /* loop variant */
                        lengthInBytes1 -= KASUMI_BLOCK_SIZE;

                        /* apply the modifier and update the block count */
                        b1.b64[0] ^= a1.b64[0];
                        b1.b16[0] ^= (uint16_t)++blkcnt;
                } else if (lengthInBytes1 < KASUMI_BLOCK_SIZE) {
                        /* end of the loop, handle the last bytes safely */
                        memcpy_keystrm(safeInBuf.b8, pBufferIn1,
                                       lengthInBytes1);
                        xor_keystrm_rev(temp.b8, safeInBuf.b8, b1.b64[0]);
                        memcpy_keystrm(pBufferOut1, temp.b8,
                                       lengthInBytes1);
                        lengthInBytes1 = 0;
                        /* lengthInBytes1 == KASUMI_BLOCK_SIZE */
                } else {
                        xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]);
                        lengthInBytes1 = 0;
                }
        }
#ifdef SAFE_DATA
        /* Clear sensitive data in stack */
        clear_mem(&a1, sizeof(a1));
        clear_mem(&b1, sizeof(b1));
        clear_mem(&a2, sizeof(a2));
        clear_mem(&b2, sizeof(b2));
        clear_mem(&temp, sizeof(temp));
        clear_mem(&safeInBuf, sizeof(safeInBuf));
#endif
}
+
+static inline void
+kasumi_f8_3_buffer(const kasumi_key_sched_t *pCtx,
+ const uint64_t IV1, const uint64_t IV2, const uint64_t IV3,
+ const void *pIn1, void *pOut1,
+ const void *pIn2, void *pOut2,
+ const void *pIn3, void *pOut3,
+ const uint32_t length)
+{
+ const uint8_t *pBufferIn1 = (const uint8_t *) pIn1;
+ uint8_t *pBufferOut1 = (uint8_t *) pOut1;
+ const uint8_t *pBufferIn2 = (const uint8_t *) pIn2;
+ uint8_t *pBufferOut2 = (uint8_t *) pOut2;
+ const uint8_t *pBufferIn3 = (const uint8_t *) pIn3;
+ uint8_t *pBufferOut3 = (uint8_t *) pOut3;
+ uint32_t lengthInBytes = length;
+ uint32_t blkcnt;
+ kasumi_union_t a1, b1; /* the modifier */
+ kasumi_union_t a2, b2; /* the modifier */
+ kasumi_union_t a3, b3; /* the modifier */
+ SafeBuf safeInBuf1, safeInBuf2, safeInBuf3;
+
+ /* IV Endianity */
+ a1.b64[0] = BSWAP64(IV1);
+ a2.b64[0] = BSWAP64(IV2);
+ a3.b64[0] = BSWAP64(IV3);
+
+ kasumi_3_blocks(pCtx->msk16, a1.b16, a2.b16, a3.b16);
+
+ /* Final initialisation steps */
+ blkcnt = 0;
+ b1.b64[0] = a1.b64[0];
+ b2.b64[0] = a2.b64[0];
+ b3.b64[0] = a3.b64[0];
+
+ /* Now run the block cipher for common packet lengthInBytes, a whole
+ * number of blocks */
+ while (lengthInBytes) {
+ /* KASUMI it to produce the next block of keystream for all the
+ * packets */
+ kasumi_3_blocks(pCtx->sk16, b1.b16, b2.b16, b3.b16);
+
+ if (lengthInBytes > KASUMI_BLOCK_SIZE) {
+ /* xor and write keystream */
+ pBufferIn1 = xor_keystrm_rev(pBufferOut1,
+ pBufferIn1, b1.b64[0]);
+ pBufferOut1 += KASUMI_BLOCK_SIZE;
+ pBufferIn2 = xor_keystrm_rev(pBufferOut2,
+ pBufferIn2, b2.b64[0]);
+ pBufferOut2 += KASUMI_BLOCK_SIZE;
+ pBufferIn3 = xor_keystrm_rev(pBufferOut3,
+ pBufferIn3, b3.b64[0]);
+ pBufferOut3 += KASUMI_BLOCK_SIZE;
+ /* loop variant */
+ lengthInBytes -= KASUMI_BLOCK_SIZE;
+
+ /* apply the modifier and update the block count */
+ b1.b64[0] ^= a1.b64[0];
+ b1.b16[0] ^= (uint16_t)++blkcnt;
+ b2.b64[0] ^= a2.b64[0];
+ b2.b16[0] ^= (uint16_t)blkcnt;
+ b3.b64[0] ^= a3.b64[0];
+ b3.b16[0] ^= (uint16_t)blkcnt;
+ } else if (lengthInBytes < KASUMI_BLOCK_SIZE) {
+ /* end of the loop, handle the last bytes */
+ memcpy_keystrm(safeInBuf1.b8, pBufferIn1,
+ lengthInBytes);
+ xor_keystrm_rev(b1.b8, safeInBuf1.b8, b1.b64[0]);
+ memcpy_keystrm(pBufferOut1, b1.b8, lengthInBytes);
+
+ memcpy_keystrm(safeInBuf2.b8, pBufferIn2,
+ lengthInBytes);
+ xor_keystrm_rev(b2.b8, safeInBuf2.b8, b2.b64[0]);
+ memcpy_keystrm(pBufferOut2, b2.b8, lengthInBytes);
+
+ memcpy_keystrm(safeInBuf3.b8, pBufferIn3,
+ lengthInBytes);
+ xor_keystrm_rev(b3.b8, safeInBuf3.b8, b3.b64[0]);
+ memcpy_keystrm(pBufferOut3, b3.b8, lengthInBytes);
+ lengthInBytes = 0;
+ /* lengthInBytes == KASUMI_BLOCK_SIZE */
+ } else {
+ xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]);
+ xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]);
+ xor_keystrm_rev(pBufferOut3, pBufferIn3, b3.b64[0]);
+ lengthInBytes = 0;
+ }
+ }
+#ifdef SAFE_DATA
+ /* Clear sensitive data in stack */
+ clear_mem(&a1, sizeof(a1));
+ clear_mem(&b1, sizeof(b1));
+ clear_mem(&a2, sizeof(a2));
+ clear_mem(&b2, sizeof(b2));
+ clear_mem(&a3, sizeof(a3));
+ clear_mem(&b3, sizeof(b3));
+ clear_mem(&safeInBuf1, sizeof(safeInBuf1));
+ clear_mem(&safeInBuf2, sizeof(safeInBuf2));
+ clear_mem(&safeInBuf3, sizeof(safeInBuf3));
+#endif
+}
+
/*---------------------------------------------------------
* @description
* Kasumi F8 4 packet:
* Four packets enc/dec with the same key schedule.
* The 4 IVs are independent and are passed as separate arguments.
* The packets are separate, the data length is common
*---------------------------------------------------------*/
+
static inline void
kasumi_f8_4_buffer(const kasumi_key_sched_t *pCtx, const uint64_t IV1,
                   const uint64_t IV2, const uint64_t IV3, const uint64_t IV4,
                   const void *pIn1, void *pOut1,
                   const void *pIn2, void *pOut2,
                   const void *pIn3, void *pOut3,
                   const void *pIn4, void *pOut4,
                   const uint32_t length)
{
        const uint8_t *pBufferIn1 = (const uint8_t *) pIn1;
        uint8_t *pBufferOut1 = (uint8_t *) pOut1;
        const uint8_t *pBufferIn2 = (const uint8_t *) pIn2;
        uint8_t *pBufferOut2 = (uint8_t *) pOut2;
        const uint8_t *pBufferIn3 = (const uint8_t *) pIn3;
        uint8_t *pBufferOut3 = (uint8_t *) pOut3;
        const uint8_t *pBufferIn4 = (const uint8_t *) pIn4;
        uint8_t *pBufferOut4 = (uint8_t *) pOut4;
        uint32_t lengthInBytes = length;
        uint32_t blkcnt;
        kasumi_union_t a1, b1; /* the modifier */
        kasumi_union_t a2, b2; /* the modifier */
        kasumi_union_t a3, b3; /* the modifier */
        kasumi_union_t a4, b4; /* the modifier */
        /* pTemp aliases b1..b4: kasumi_4_blocks() updates the b unions in
         * place through these pointers */
        uint16_t *pTemp[4] = {b1.b16, b2.b16, b3.b16, b4.b16};
        SafeBuf safeInBuf1, safeInBuf2, safeInBuf3, safeInBuf4;

        /* IV Endianity: IVs arrive big-endian, convert to CPU layout */
        b1.b64[0] = BSWAP64(IV1);
        b2.b64[0] = BSWAP64(IV2);
        b3.b64[0] = BSWAP64(IV3);
        b4.b64[0] = BSWAP64(IV4);

        /* A = E(IV, modified key); result lands in b1..b4 via pTemp */
        kasumi_4_blocks(pCtx->msk16, pTemp);

        /* Final initialisation steps: keep a pristine copy of A */
        blkcnt = 0;
        a1.b64[0] = b1.b64[0];
        a2.b64[0] = b2.b64[0];
        a3.b64[0] = b3.b64[0];
        a4.b64[0] = b4.b64[0];

        /* Now run the block cipher for common packet lengthInBytes, a whole
         * number of blocks */
        while (lengthInBytes) {
                /* KASUMI it to produce the next block of keystream for all the
                 * packets */
                kasumi_4_blocks(pCtx->sk16, pTemp);

                if (lengthInBytes > KASUMI_BLOCK_SIZE) {
                        /* xor and write keystream */
                        pBufferIn1 = xor_keystrm_rev(pBufferOut1,
                                                     pBufferIn1, b1.b64[0]);
                        pBufferOut1 += KASUMI_BLOCK_SIZE;
                        pBufferIn2 = xor_keystrm_rev(pBufferOut2,
                                                     pBufferIn2, b2.b64[0]);
                        pBufferOut2 += KASUMI_BLOCK_SIZE;
                        pBufferIn3 = xor_keystrm_rev(pBufferOut3,
                                                     pBufferIn3, b3.b64[0]);
                        pBufferOut3 += KASUMI_BLOCK_SIZE;
                        pBufferIn4 = xor_keystrm_rev(pBufferOut4,
                                                     pBufferIn4, b4.b64[0]);
                        pBufferOut4 += KASUMI_BLOCK_SIZE;
                        /* loop variant */
                        lengthInBytes -= KASUMI_BLOCK_SIZE;

                        /* apply the modifier and update the block count
                         * (next keystream input = keystream ^ A ^ BLKCNT) */
                        b1.b64[0] ^= a1.b64[0];
                        b1.b16[0] ^= (uint16_t)++blkcnt;
                        b2.b64[0] ^= a2.b64[0];
                        b2.b16[0] ^= (uint16_t)blkcnt;
                        b3.b64[0] ^= a3.b64[0];
                        b3.b16[0] ^= (uint16_t)blkcnt;
                        b4.b64[0] ^= a4.b64[0];
                        b4.b16[0] ^= (uint16_t)blkcnt;
                } else if (lengthInBytes < KASUMI_BLOCK_SIZE) {
                        /* end of the loop, handle the last bytes through
                         * staging buffers so no access goes past buffer end */
                        memcpy_keystrm(safeInBuf1.b8, pBufferIn1,
                                       lengthInBytes);
                        xor_keystrm_rev(b1.b8, safeInBuf1.b8, b1.b64[0]);
                        memcpy_keystrm(pBufferOut1, b1.b8, lengthInBytes);

                        memcpy_keystrm(safeInBuf2.b8, pBufferIn2,
                                       lengthInBytes);
                        xor_keystrm_rev(b2.b8, safeInBuf2.b8, b2.b64[0]);
                        memcpy_keystrm(pBufferOut2, b2.b8, lengthInBytes);

                        memcpy_keystrm(safeInBuf3.b8, pBufferIn3,
                                       lengthInBytes);
                        xor_keystrm_rev(b3.b8, safeInBuf3.b8, b3.b64[0]);
                        memcpy_keystrm(pBufferOut3, b3.b8, lengthInBytes);

                        memcpy_keystrm(safeInBuf4.b8, pBufferIn4,
                                       lengthInBytes);
                        xor_keystrm_rev(b4.b8, safeInBuf4.b8, b4.b64[0]);
                        memcpy_keystrm(pBufferOut4, b4.b8, lengthInBytes);
                        lengthInBytes = 0;
                        /* lengthInBytes == KASUMI_BLOCK_SIZE */
                } else {
                        xor_keystrm_rev(pBufferOut1, pBufferIn1, b1.b64[0]);
                        xor_keystrm_rev(pBufferOut2, pBufferIn2, b2.b64[0]);
                        xor_keystrm_rev(pBufferOut3, pBufferIn3, b3.b64[0]);
                        xor_keystrm_rev(pBufferOut4, pBufferIn4, b4.b64[0]);
                        lengthInBytes = 0;
                }
        }
#ifdef SAFE_DATA
        /* Clear sensitive data in stack */
        clear_mem(&a1, sizeof(a1));
        clear_mem(&b1, sizeof(b1));
        clear_mem(&a2, sizeof(a2));
        clear_mem(&b2, sizeof(b2));
        clear_mem(&a3, sizeof(a3));
        clear_mem(&b3, sizeof(b3));
        clear_mem(&a4, sizeof(a4));
        clear_mem(&b4, sizeof(b4));
        clear_mem(&safeInBuf1, sizeof(safeInBuf1));
        clear_mem(&safeInBuf2, sizeof(safeInBuf2));
        clear_mem(&safeInBuf3, sizeof(safeInBuf3));
        clear_mem(&safeInBuf4, sizeof(safeInBuf4));
#endif
}
+
/*---------------------------------------------------------
* @description
* Kasumi F8 2 packet (see kasumi_f8_2_buffer() above):
* Two packets enc/dec with the same key schedule.
* The 2 IVs are independent and are passed as separate arguments.
* The packets are separate and each carries its own data length.
*---------------------------------------------------------*/
+/******************************************************************************
+* @description
+* Kasumi F8 n packet:
+* Performs F8 enc/dec on [n] packets. The operation is performed in-place.
+* The input IV's are passed in Big Endian format.
+* The KeySchedule is in Little Endian format.
+*******************************************************************************/
+
+static inline void
+kasumi_f8_n_buffer(const kasumi_key_sched_t *pKeySchedule, const uint64_t IV[],
+ const void * const pIn[], void *pOut[],
+ const uint32_t lengths[], const uint32_t bufCount)
+{
+ if (bufCount > 16) {
+ pOut[0] = NULL;
+ printf("dataCount too high (%d)\n", bufCount);
+ return;
+ }
+
+ uint32_t dataCount = bufCount;
+ kasumi_union_t A[NUM_PACKETS_16], temp[NUM_PACKETS_16], tempSort;
+ uint16_t *data[NUM_PACKETS_16];
+ uint32_t dataLen[NUM_PACKETS_16];
+ uint8_t *pDataOut[NUM_PACKETS_16] = {NULL};
+ const uint8_t *pDataIn[NUM_PACKETS_16] = {NULL};
+ const uint8_t *srctempbuff;
+ uint8_t *dsttempbuff;
+ uint32_t blkcnt = 0;
+ uint32_t len = 0;
+ uint32_t packet_idx, inner_idx, same_size_blocks;
+ int sortNeeded = 0, tempLen = 0;
+ SafeBuf safeInBuf;
+
+ memcpy((void *)dataLen, lengths, dataCount * sizeof(uint32_t));
+ memcpy((void *)pDataIn, pIn, dataCount * sizeof(void *));
+ memcpy((void *)pDataOut, pOut, dataCount * sizeof(void *));
+
+ /* save the IV to A for each packet */
+ packet_idx = dataCount;
+ while (packet_idx--) {
+ /*copy IV in reverse endian order as input IV is BE */
+ temp[packet_idx].b64[0] = BSWAP64(IV[packet_idx]);
+
+ /* set LE IV pointers */
+ data[packet_idx] = temp[packet_idx].b16;
+
+ /* check if all packets are sorted by decreasing length */
+ if (packet_idx > 0 &&
+ dataLen[packet_idx - 1] < dataLen[packet_idx])
+ /* this packet array is not correctly sorted */
+ sortNeeded = 1;
+ }
+
+ /* do 1st kasumi block on A with modified key, this overwrites A */
+ kasumiWrapperArray[dataCount](pKeySchedule->msk16, data);
+
+ if (sortNeeded) {
+ /* sort packets in decreasing buffer size from [0] to [n]th
+ packet,
+ ** where buffer[0] will contain longest buffer and
+ buffer[n] will
+ contain the shortest buffer.
+ 4 arrays are swapped :
+ - pointers to input buffers
+ - pointers to output buffers
+ - pointers to input IV's
+ - input buffer lengths
+ */
+ packet_idx = dataCount;
+ while (packet_idx--) {
+ inner_idx = packet_idx;
+ while (inner_idx--) {
+ if (dataLen[packet_idx] > dataLen[inner_idx]) {
+
+ /* swap buffers to arrange in descending
+ * order from [0]. */
+ srctempbuff = pDataIn[packet_idx];
+ dsttempbuff = pDataOut[packet_idx];
+ tempSort = temp[packet_idx];
+ tempLen = dataLen[packet_idx];
+
+ pDataIn[packet_idx] =
+ pDataIn[inner_idx];
+ pDataOut[packet_idx] =
+ pDataOut[inner_idx];
+ temp[packet_idx] = temp[inner_idx];
+ dataLen[packet_idx] =
+ dataLen[inner_idx];
+
+ pDataIn[inner_idx] = srctempbuff;
+ pDataOut[inner_idx] = dsttempbuff;
+ temp[inner_idx] = tempSort;
+ dataLen[inner_idx] = tempLen;
+ }
+ } /* for inner packet idx (inner bubble-sort) */
+ } /* for outer packet idx (outer bubble-sort) */
+ } /* if sortNeeded */
+
+ packet_idx = dataCount;
+ while (packet_idx--)
+ /* copy the schedule */
+ A[packet_idx].b64[0] = temp[packet_idx].b64[0];
+
+ while (dataCount > 0) {
+ /* max num of blocks left depends on roundUp(smallest packet),
+ * The shortest stream to process is always stored at location
+ * [dataCount - 1]
+ */
+ same_size_blocks =
+ ((dataLen[dataCount - 1] + KASUMI_BLOCK_SIZE - 1) /
+ KASUMI_BLOCK_SIZE) -
+ blkcnt;
+
+ /* process streams of complete blocks */
+ while (same_size_blocks-- > 1) {
+ /* do kasumi block encryption */
+ kasumiWrapperArray[dataCount](pKeySchedule->sk16,
+ data);
+
+ packet_idx = dataCount;
+ while (packet_idx--)
+ xor_keystrm_rev(pDataOut[packet_idx] + len,
+ pDataIn[packet_idx] + len,
+ temp[packet_idx].b64[0]);
+
+ /* length already done since the start of the packets */
+ len += KASUMI_BLOCK_SIZE;
+
+ /* block idx is incremented and rewritten in the
+ * keystream */
+ blkcnt += 1;
+ packet_idx = dataCount;
+ while (packet_idx--) {
+ temp[packet_idx].b64[0] ^= A[packet_idx].b64[0];
+ temp[packet_idx].b16[0] ^= (uint16_t)blkcnt;
+ } /* for packet_idx */
+
+ } /* while same_size_blocks (iteration on multiple blocks) */
+
+ /* keystream for last block of all packets */
+ kasumiWrapperArray[dataCount](pKeySchedule->sk16, data);
+
+ /* process incomplete blocks without overwriting past the buffer
+ * end */
+ while ((dataCount > 0) &&
+ (dataLen[dataCount - 1] < (len + KASUMI_BLOCK_SIZE))) {
+
+ dataCount--;
+ /* incomplete block is copied into a temp buffer */
+ memcpy_keystrm(safeInBuf.b8, pDataIn[dataCount] + len,
+ dataLen[dataCount] - len);
+ xor_keystrm_rev(temp[dataCount].b8,
+ safeInBuf.b8,
+ temp[dataCount].b64[0]);
+
+ memcpy_keystrm(pDataOut[dataCount] + len,
+ temp[dataCount].b8,
+ dataLen[dataCount] - len);
+ } /* while dataCount */
+
+ /* process last blocks: it can be the last complete block of the
+ packets or, if
+ KASUMI_SAFE_BUFFER is defined, the last block (complete or not)
+ of the packets*/
+ while ((dataCount > 0) &&
+ (dataLen[dataCount - 1] <= (len + KASUMI_BLOCK_SIZE))) {
+
+ dataCount--;
+ xor_keystrm_rev(pDataOut[dataCount] + len,
+ pDataIn[dataCount] + len,
+ temp[dataCount].b64[0]);
+ } /* while dataCount */
+ /* block idx is incremented and rewritten in the keystream */
+ blkcnt += 1;
+
+ /* for the following packets, this block is not the last one:
+ dataCount is not decremented */
+ packet_idx = dataCount;
+ while (packet_idx--) {
+
+ xor_keystrm_rev(pDataOut[packet_idx] + len,
+ pDataIn[packet_idx] + len,
+ temp[packet_idx].b64[0]);
+ temp[packet_idx].b64[0] ^= A[packet_idx].b64[0];
+ temp[packet_idx].b16[0] ^= (uint16_t)blkcnt;
+ } /* while packet_idx */
+
+ /* length already done since the start of the packets */
+ len += KASUMI_BLOCK_SIZE;
+
+ /* the remaining packets, if any, have now at least one valid
+ block, which might be complete or not */
+
+ } /* while (dataCount) */
+#ifdef SAFE_DATA
+ uint32_t i;
+
+ /* Clear sensitive data in stack */
+ for (i = 0; i < dataCount; i++) {
+ clear_mem(&A[i], sizeof(A[i]));
+ clear_mem(&temp[i], sizeof(temp[i]));
+ }
+ clear_mem(&tempSort, sizeof(tempSort));
+ clear_mem(&safeInBuf, sizeof(safeInBuf));
+#endif
+}
+
+static inline void
+kasumi_f9_1_buffer(const kasumi_key_sched_t *pCtx, const void *dataIn,
+ const uint32_t length, void *pDigest)
+{
+ kasumi_union_t a, b, mask;
+ const uint64_t *pIn = (const uint64_t *)dataIn;
+ uint32_t lengthInBytes = length;
+ SafeBuf safeBuf;
+
+ /* Init */
+ a.b64[0] = 0;
+ b.b64[0] = 0;
+ mask.b64[0] = -1;
+
+ /* Now run kasumi for all 8 byte blocks */
+ while (lengthInBytes >= 8) {
+
+ a.b64[0] ^= BSWAP64(*(pIn++));
+
+ /* KASUMI it */
+ kasumi_1_block(pCtx->sk16, a.b16);
+
+ /* loop variant */
+ lengthInBytes -= 8; /* done another 64 bits */
+
+ /* update */
+ b.b64[0] ^= a.b64[0];
+ }
+
+ if (lengthInBytes) {
+ /* Not a whole 8 byte block remaining */
+ mask.b64[0] = ~(mask.b64[0] >> (BYTESIZE * lengthInBytes));
+ memcpy(&safeBuf.b64, pIn, lengthInBytes);
+ mask.b64[0] &= BSWAP64(safeBuf.b64);
+ a.b64[0] ^= mask.b64[0];
+
+ /* KASUMI it */
+ kasumi_1_block(pCtx->sk16, a.b16);
+
+ /* update */
+ b.b64[0] ^= a.b64[0];
+ }
+
+ /* Kasumi b */
+ kasumi_1_block(pCtx->msk16, b.b16);
+
+ /* swap result */
+ *(uint32_t *)pDigest = bswap4(b.b32[1]);
+#ifdef SAFE_DATA
+ /* Clear sensitive data in stack */
+ clear_mem(&a, sizeof(a));
+ clear_mem(&b, sizeof(b));
+ clear_mem(&mask, sizeof(mask));
+ clear_mem(&safeBuf, sizeof(safeBuf));
+#endif
+}
+
+/*---------------------------------------------------------
+* @description
+* Kasumi F9 1 packet with user config:
+* Single packet digest with user defined IV, and precomputed key schedule.
+*
+* IV = swap32(count) << 32 | swap32(fresh)
+*
+*---------------------------------------------------------*/
+
static inline void
kasumi_f9_1_buffer_user(const kasumi_key_sched_t *pCtx, const uint64_t IV,
                        const void *pDataIn, const uint32_t length,
                        void *pDigest, const uint32_t direction)
{
        kasumi_union_t a, b, mask, message, temp;
        uint32_t lengthInBits = length;
        const uint64_t *pIn = (const uint64_t *)pDataIn;
        kasumi_union_t safebuff;

        a.b64[0] = 0;
        b.b64[0] = 0;

        /* Use the count and fresh (packed into IV) for the first round */
        a.b64[0] = BSWAP64(IV);
        /* KASUMI it */
        kasumi_1_block(pCtx->sk16, a.b16);
        /* update */
        b.b64[0] = a.b64[0];

        /* Now run kasumi for all complete 8 byte blocks of the message */
        while (lengthInBits >= QWORDSIZEINBITS) {
                a.b64[0] ^= BSWAP64(*(pIn++));
                /* KASUMI it */
                kasumi_1_block(pCtx->sk16, a.b16);
                /* loop variant */
                lengthInBits -= 64; /* done another 64 bits */
                /* update */
                b.b64[0] ^= a.b64[0];
        }

        /* Is there any non 8 byte blocks remaining ? */
        if (lengthInBits == 0) {
                /* last block is : direction + 1 + 62 0's
                 * (direction + direction + LAST_PADDING_BIT builds the 2-bit
                 * tail "d 1" which is then shifted to the top of the qword) */
                a.b64[0] ^= ((uint64_t)direction + direction + LAST_PADDING_BIT)
                            << (QWORDSIZEINBITS - 2);
                kasumi_1_block(pCtx->sk16, a.b16);
                /* update */
                b.b64[0] ^= a.b64[0];
        } else if (lengthInBits <= (QWORDSIZEINBITS - 2)) {
                /* last block is : message + direction + LAST_PADDING_BITS(1) +
                 * less than 62 0's */
                mask.b64[0] = -1;
                temp.b64[0] = 0;
                message.b64[0] = 0;
                /* keep only the top lengthInBits bits of the message tail */
                mask.b64[0] = ~(mask.b64[0] >> lengthInBits);
                /*round up and copy last lengthInBits */
                memcpy(&safebuff.b64[0], pIn, (lengthInBits + 7) / 8);
                message.b64[0] = BSWAP64(safebuff.b64[0]);
                temp.b64[0] = mask.b64[0] & message.b64[0];
                /* append direction bit and padding bit right after the data */
                temp.b64[0] |=
                        ((uint64_t)direction + direction + LAST_PADDING_BIT)
                        << ((QWORDSIZEINBITS - 2) - lengthInBits);
                a.b64[0] ^= temp.b64[0];
                /* KASUMI it */
                kasumi_1_block(pCtx->sk16, a.b16);

                /* update */
                b.b64[0] ^= a.b64[0];
        } else if (lengthInBits == (QWORDSIZEINBITS - 1)) {
                /* next block is : message + direction bit in the cleared LSB */
                /* last block is : 1 + 63 0's */
                a.b64[0] ^= direction | (~1 & BSWAP64(*(pIn++)));
                /* KASUMI it */
                kasumi_1_block(pCtx->sk16, a.b16);
                /* update */
                b.b64[0] ^= a.b64[0];
                /* NOTE(review): sets the top bit of byte 7; assuming a
                 * little-endian b8/b64 union layout this is bit 63 of the
                 * qword, i.e. the "1 + 63 0's" pad block — confirm */
                a.b8[QWORDSIZEINBYTES - 1] ^= (LAST_PADDING_BIT)
                                              << (QWORDSIZEINBYTES - 1);
                /* KASUMI it */
                kasumi_1_block(pCtx->sk16, a.b16);
                /* update */
                b.b64[0] ^= a.b64[0];
        }
        /* Final pass over the accumulator with the modified key */
        kasumi_1_block(pCtx->msk16, b.b16);

        /* swap result */
        *(uint32_t *)pDigest = bswap4(b.b32[1]);
#ifdef SAFE_DATA
        /* Clear sensitive data in stack */
        clear_mem(&a, sizeof(a));
        clear_mem(&b, sizeof(b));
        clear_mem(&mask, sizeof(mask));
        clear_mem(&message, sizeof(message));
        clear_mem(&temp, sizeof(temp));
        clear_mem(&safebuff, sizeof(safebuff));
#endif
}
+
/*
 * Architecture-specific external entry points, implemented in separate
 * translation units of the library. Each takes a precomputed KASUMI key
 * schedule (kasumi_key_sched_t) and mirrors one of the inline helpers above.
 */

/* ---- SSE implementations ---- */
void kasumi_f8_1_buffer_sse(const kasumi_key_sched_t *pCtx, const uint64_t IV,
                            const void *pBufferIn, void *pBufferOut,
                            const uint32_t cipherLengthInBytes);

void kasumi_f8_1_buffer_bit_sse(const kasumi_key_sched_t *pCtx,
                                const uint64_t IV,
                                const void *pBufferIn, void *pBufferOut,
                                const uint32_t cipherLengthInBits,
                                const uint32_t offsetInBits);

void kasumi_f8_2_buffer_sse(const kasumi_key_sched_t *pCtx,
                            const uint64_t IV1, const uint64_t IV2,
                            const void *pBufferIn1, void *pBufferOut1,
                            const uint32_t lengthInBytes1,
                            const void *pBufferIn2, void *pBufferOut2,
                            const uint32_t lengthInBytes2);

void kasumi_f8_3_buffer_sse(const kasumi_key_sched_t *pCtx, const uint64_t IV1,
                            const uint64_t IV2, const uint64_t IV3,
                            const void *pBufferIn1, void *pBufferOut1,
                            const void *pBufferIn2, void *pBufferOut2,
                            const void *pBufferIn3, void *pBufferOut3,
                            const uint32_t lengthInBytes);

void kasumi_f8_4_buffer_sse(const kasumi_key_sched_t *pCtx,
                            const uint64_t IV1, const uint64_t IV2,
                            const uint64_t IV3, const uint64_t IV4,
                            const void *pBufferIn1, void *pBufferOut1,
                            const void *pBufferIn2, void *pBufferOut2,
                            const void *pBufferIn3, void *pBufferOut3,
                            const void *pBufferIn4, void *pBufferOut4,
                            const uint32_t lengthInBytes);

void kasumi_f8_n_buffer_sse(const kasumi_key_sched_t *pKeySchedule,
                            const uint64_t IV[],
                            const void * const pDataIn[], void *pDataOut[],
                            const uint32_t dataLen[], const uint32_t dataCount);

void kasumi_f9_1_buffer_sse(const kasumi_key_sched_t *pCtx,
                            const void *pBufferIn,
                            const uint32_t lengthInBytes, void *pDigest);

void kasumi_f9_1_buffer_user_sse(const kasumi_key_sched_t *pCtx,
                                 const uint64_t IV, const void *pBufferIn,
                                 const uint32_t lengthInBits,
                                 void *pDigest, const uint32_t direction);


/* ---- AVX implementations ---- */
void kasumi_f8_1_buffer_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV,
                            const void *pBufferIn, void *pBufferOut,
                            const uint32_t cipherLengthInBytes);
void kasumi_f8_1_buffer_bit_avx(const kasumi_key_sched_t *pCtx,
                                const uint64_t IV,
                                const void *pBufferIn, void *pBufferOut,
                                const uint32_t cipherLengthInBits,
                                const uint32_t offsetInBits);
void kasumi_f8_2_buffer_avx(const kasumi_key_sched_t *pCtx,
                            const uint64_t IV1, const uint64_t IV2,
                            const void *pBufferIn1, void *pBufferOut1,
                            const uint32_t lengthInBytes1,
                            const void *pBufferIn2, void *pBufferOut2,
                            const uint32_t lengthInBytes2);
void kasumi_f8_3_buffer_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV1,
                            const uint64_t IV2, const uint64_t IV3,
                            const void *pBufferIn1, void *pBufferOut1,
                            const void *pBufferIn2, void *pBufferOut2,
                            const void *pBufferIn3, void *pBufferOut3,
                            const uint32_t lengthInBytes);
void kasumi_f8_4_buffer_avx(const kasumi_key_sched_t *pCtx,
                            const uint64_t IV1, const uint64_t IV2,
                            const uint64_t IV3, const uint64_t IV4,
                            const void *pBufferIn1, void *pBufferOut1,
                            const void *pBufferIn2, void *pBufferOut2,
                            const void *pBufferIn3, void *pBufferOut3,
                            const void *pBufferIn4, void *pBufferOut4,
                            const uint32_t lengthInBytes);
void kasumi_f8_n_buffer_avx(const kasumi_key_sched_t *pKeySchedule,
                            const uint64_t IV[],
                            const void * const pDataIn[], void *pDataOut[],
                            const uint32_t dataLen[], const uint32_t dataCount);

void kasumi_f9_1_buffer_avx(const kasumi_key_sched_t *pCtx,
                            const void *pBufferIn,
                            const uint32_t lengthInBytes, void *pDigest);

void kasumi_f9_1_buffer_user_avx(const kasumi_key_sched_t *pCtx,
                                 const uint64_t IV, const void *pBufferIn,
                                 const uint32_t lengthInBits,
                                 void *pDigest, const uint32_t direction);
#endif /*_KASUMI_INTERNAL_H_*/
+