diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-09-27 08:42:40 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-09-27 08:42:40 +0000 |
commit | 54880067830dddd9611bc8c6145a396f208b76ca (patch) | |
tree | 1d40eb981a691b14df7881fca6408ccf42fa1b9f /sntrup761.c | |
parent | Releasing progress-linux version 1:9.8p1-3~progress7.99u1. (diff) | |
download | openssh-54880067830dddd9611bc8c6145a396f208b76ca.tar.xz openssh-54880067830dddd9611bc8c6145a396f208b76ca.zip |
Merging upstream version 1:9.9p1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'sntrup761.c')
-rw-r--r-- | sntrup761.c | 2884 |
1 files changed, 1885 insertions, 999 deletions
diff --git a/sntrup761.c b/sntrup761.c index 57368bd..123d013 100644 --- a/sntrup761.c +++ b/sntrup761.c @@ -1,4 +1,5 @@ -/* $OpenBSD: sntrup761.c,v 1.6 2023/01/11 02:13:52 djm Exp $ */ + +/* $OpenBSD: sntrup761.c,v 1.8 2024/09/16 05:37:05 djm Exp $ */ /* * Public Domain, Authors: @@ -15,6 +16,8 @@ #include <string.h> #include "crypto_api.h" +#define crypto_declassify(x, y) do {} while (0) + #define int8 crypto_int8 #define uint8 crypto_uint8 #define int16 crypto_int16 @@ -23,1251 +26,2134 @@ #define uint32 crypto_uint32 #define int64 crypto_int64 #define uint64 crypto_uint64 +extern volatile crypto_int16 crypto_int16_optblocker; +extern volatile crypto_int32 crypto_int32_optblocker; +extern volatile crypto_int64 crypto_int64_optblocker; -/* from supercop-20201130/crypto_sort/int32/portable4/int32_minmax.inc */ -#define int32_MINMAX(a,b) \ -do { \ - int64_t ab = (int64_t)b ^ (int64_t)a; \ - int64_t c = (int64_t)b - (int64_t)a; \ - c ^= ab & (c ^ b); \ - c >>= 31; \ - c &= ab; \ - a ^= c; \ - b ^= c; \ -} while(0) +/* from supercop-20240808/cryptoint/crypto_int16.h */ +/* auto-generated: cd cryptoint; ./autogen */ +/* cryptoint 20240806 */ -/* from supercop-20201130/crypto_sort/int32/portable4/sort.c */ +#ifndef crypto_int16_h +#define crypto_int16_h +#define crypto_int16 int16_t +#define crypto_int16_unsigned uint16_t -static void crypto_sort_int32(void *array,long long n) -{ - long long top,p,q,r,i,j; - int32 *x = array; - if (n < 2) return; - top = 1; - while (top < n - top) top += top; - - for (p = top;p >= 1;p >>= 1) { - i = 0; - while (i + 2 * p <= n) { - for (j = i;j < i + p;++j) - int32_MINMAX(x[j],x[j+p]); - i += 2 * p; - } - for (j = i;j < n - p;++j) - int32_MINMAX(x[j],x[j+p]); - i = 0; - j = 0; - for (q = top;q > p;q >>= 1) { - if (j != i) for (;;) { - if (j == n - q) goto done; - int32 a = x[j + p]; - for (r = q;r > p;r >>= 1) - int32_MINMAX(a,x[j + r]); - x[j + p] = a; - ++j; - if (j == i + p) { - i += 2 * p; - break; - } - } - while (i + p <= n - q) { - for (j = i;j < i + p;++j) { - int32 a = x[j + p]; - for (r = q;r > p;r >>= 1) - int32_MINMAX(a,x[j+r]); - x[j + p] = a; - } - i += 2 * p; - } - /* now i + p > n - q */ - j = i; - while (j < n - q) { - int32 a = x[j + p]; - for (r = q;r > p;r >>= 1) - int32_MINMAX(a,x[j+r]); - x[j + p] = a; - ++j; - } - - done: ; - } - } +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_load(const unsigned char *crypto_int16_s) { + crypto_int16 crypto_int16_z = 0; + crypto_int16_z |= ((crypto_int16) (*crypto_int16_s++)) << 0; + crypto_int16_z |= ((crypto_int16) (*crypto_int16_s++)) << 8; + return crypto_int16_z; } -/* from supercop-20201130/crypto_sort/uint32/useint32/sort.c */ - -/* can save time by vectorizing xor loops */ -/* can save time by integrating xor loops with int32_sort */ - -static void crypto_sort_uint32(void *array,long long n) -{ - crypto_uint32 *x = array; - long long j; - for (j = 0;j < n;++j) x[j] ^= 0x80000000; - crypto_sort_int32(array,n); - for (j = 0;j < n;++j) x[j] ^= 0x80000000; +__attribute__((unused)) +static inline +void crypto_int16_store(unsigned char *crypto_int16_s,crypto_int16 crypto_int16_x) { + *crypto_int16_s++ = crypto_int16_x >> 0; + *crypto_int16_s++ = crypto_int16_x >> 8; } -/* from supercop-20201130/crypto_kem/sntrup761/ref/uint32.c */ - -/* -CPU division instruction typically takes time depending on x. -This software is designed to take time independent of x. -Time still varies depending on m; user must ensure that m is constant. -Time also varies on CPUs where multiplication is variable-time. -There could be more CPU issues. -There could also be compiler issues. -*/ - -static void uint32_divmod_uint14(uint32 *q,uint16 *r,uint32 x,uint16 m) -{ - uint32 v = 0x80000000; - uint32 qpart; - uint32 mask; - - v /= m; - - /* caller guarantees m > 0 */ - /* caller guarantees m < 16384 */ - /* vm <= 2^31 <= vm+m-1 */ - /* xvm <= 2^31 x <= xvm+x(m-1) */ - - *q = 0; - - qpart = (x*(uint64)v)>>31; - /* 2^31 qpart <= xv <= 2^31 qpart + 2^31-1 */ - /* 2^31 qpart m <= xvm <= 2^31 qpart m + (2^31-1)m */ - /* 2^31 qpart m <= 2^31 x <= 2^31 qpart m + (2^31-1)m + x(m-1) */ - /* 0 <= 2^31 newx <= (2^31-1)m + x(m-1) */ - /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ - /* 0 <= newx <= (1-1/2^31)(2^14-1) + (2^32-1)((2^14-1)-1)/2^31 */ - - x -= qpart*m; *q += qpart; - /* x <= 49146 */ - - qpart = (x*(uint64)v)>>31; - /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ - /* 0 <= newx <= m + 49146(2^14-1)/2^31 */ - /* 0 <= newx <= m + 0.4 */ - /* 0 <= newx <= m */ - - x -= qpart*m; *q += qpart; - /* x <= m */ - - x -= m; *q += 1; - mask = -(x>>31); - x += mask&(uint32)m; *q += mask; - /* x < m */ - - *r = x; +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_negative_mask(crypto_int16 crypto_int16_x) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("sarw $15,%0" : "+r"(crypto_int16_x) : : "cc"); + return crypto_int16_x; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int16 crypto_int16_y; + __asm__ ("sbfx %w0,%w1,15,1" : "=r"(crypto_int16_y) : "r"(crypto_int16_x) : ); + return crypto_int16_y; +#else + crypto_int16_x >>= 16-6; + crypto_int16_x ^= crypto_int16_optblocker; + crypto_int16_x >>= 5; + return crypto_int16_x; +#endif } - -static uint16 uint32_mod_uint14(uint32 x,uint16 m) -{ - uint32 q; - uint16 r; - uint32_divmod_uint14(&q,&r,x,m); - return r; +__attribute__((unused)) +static inline +crypto_int16_unsigned crypto_int16_unsigned_topbit_01(crypto_int16_unsigned crypto_int16_x) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("shrw $15,%0" : "+r"(crypto_int16_x) : : "cc"); + return crypto_int16_x; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int16 crypto_int16_y; + __asm__ ("ubfx %w0,%w1,15,1" : "=r"(crypto_int16_y) : "r"(crypto_int16_x) : ); + return crypto_int16_y; +#else + crypto_int16_x >>= 16-6; + crypto_int16_x ^= crypto_int16_optblocker; + crypto_int16_x >>= 5; + return crypto_int16_x; +#endif } -/* from supercop-20201130/crypto_kem/sntrup761/ref/int32.c */ - -static void int32_divmod_uint14(int32 *q,uint16 *r,int32 x,uint16 m) -{ - uint32 uq,uq2; - uint16 ur,ur2; - uint32 mask; - - uint32_divmod_uint14(&uq,&ur,0x80000000+(uint32)x,m); - uint32_divmod_uint14(&uq2,&ur2,0x80000000,m); - ur -= ur2; uq -= uq2; - mask = -(uint32)(ur>>15); - ur += mask&m; uq += mask; - *r = ur; *q = uq; +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_negative_01(crypto_int16 crypto_int16_x) { + return crypto_int16_unsigned_topbit_01(crypto_int16_x); } - -static uint16 int32_mod_uint14(int32 x,uint16 m) -{ - int32 q; - uint16 r; - int32_divmod_uint14(&q,&r,x,m); - return r; +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_topbit_mask(crypto_int16 crypto_int16_x) { + return crypto_int16_negative_mask(crypto_int16_x); } -/* from supercop-20201130/crypto_kem/sntrup761/ref/paramsmenu.h */ -/* pick one of these three: */ -#define SIZE761 -#undef SIZE653 -#undef SIZE857 - -/* pick one of these two: */ -#define SNTRUP /* Streamlined NTRU Prime */ -#undef LPR /* NTRU LPRime */ - -/* from supercop-20201130/crypto_kem/sntrup761/ref/params.h */ -#ifndef params_H -#define params_H - -/* menu of parameter choices: */ - - -/* what the menu means: */ +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_topbit_01(crypto_int16 crypto_int16_x) { + return crypto_int16_unsigned_topbit_01(crypto_int16_x); +} -#if defined(SIZE761) -#define p 761 -#define q 4591 -#define Rounded_bytes 1007 -#ifndef LPR -#define Rq_bytes 1158 -#define w 286 +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_bottombit_mask(crypto_int16 crypto_int16_x) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("andw $1,%0" : "+r"(crypto_int16_x) : : "cc"); + return -crypto_int16_x; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int16 crypto_int16_y; + __asm__ ("sbfx %w0,%w1,0,1" : "=r"(crypto_int16_y) : "r"(crypto_int16_x) : ); + return crypto_int16_y; #else -#define w 250 -#define tau0 2156 -#define tau1 114 -#define tau2 2007 -#define tau3 287 + crypto_int16_x &= 1 ^ crypto_int16_optblocker; + return -crypto_int16_x; #endif +} -#elif defined(SIZE653) -#define p 653 -#define q 4621 -#define Rounded_bytes 865 -#ifndef LPR -#define Rq_bytes 994 -#define w 288 +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_bottombit_01(crypto_int16 crypto_int16_x) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("andw $1,%0" : "+r"(crypto_int16_x) : : "cc"); + return crypto_int16_x; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int16 crypto_int16_y; + __asm__ ("ubfx %w0,%w1,0,1" : "=r"(crypto_int16_y) : "r"(crypto_int16_x) : ); + return crypto_int16_y; #else -#define w 252 -#define tau0 2175 -#define tau1 113 -#define tau2 2031 -#define tau3 290 + crypto_int16_x &= 1 ^ crypto_int16_optblocker; + return crypto_int16_x; #endif +} -#elif defined(SIZE857) -#define p 857 -#define q 5167 -#define Rounded_bytes 1152 -#ifndef LPR -#define Rq_bytes 1322 -#define w 322 +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_bitinrangepublicpos_mask(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_s) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("sarw %%cl,%0" : "+r"(crypto_int16_x) : "c"(crypto_int16_s) : "cc"); +#elif defined(__GNUC__) && defined(__aarch64__) + __asm__ ("sxth %w0,%w0\n asr %w0,%w0,%w1" : "+&r"(crypto_int16_x) : "r"(crypto_int16_s) : ); #else -#define w 281 -#define tau0 2433 -#define tau1 101 -#define tau2 2265 -#define tau3 324 + crypto_int16_x >>= crypto_int16_s ^ crypto_int16_optblocker; #endif + return crypto_int16_bottombit_mask(crypto_int16_x); +} +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_bitinrangepublicpos_01(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_s) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("sarw %%cl,%0" : "+r"(crypto_int16_x) : "c"(crypto_int16_s) : "cc"); +#elif defined(__GNUC__) && defined(__aarch64__) + __asm__ ("sxth %w0,%w0\n asr %w0,%w0,%w1" : "+&r"(crypto_int16_x) : "r"(crypto_int16_s) : ); #else -#error "no parameter set defined" + crypto_int16_x >>= crypto_int16_s ^ crypto_int16_optblocker; #endif + return crypto_int16_bottombit_01(crypto_int16_x); +} -#ifdef LPR -#define I 256 +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_shlmod(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_s) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int16_s &= 15; + __asm__ ("shlw %%cl,%0" : "+r"(crypto_int16_x) : "c"(crypto_int16_s) : "cc"); +#elif defined(__GNUC__) && defined(__aarch64__) + __asm__ ("and %w0,%w0,15\n and %w1,%w1,65535\n lsl %w1,%w1,%w0" : "+&r"(crypto_int16_s), "+r"(crypto_int16_x) : : ); +#else + int crypto_int16_k, crypto_int16_l; + for (crypto_int16_l = 0,crypto_int16_k = 1;crypto_int16_k < 16;++crypto_int16_l,crypto_int16_k *= 2) + crypto_int16_x ^= (crypto_int16_x ^ (crypto_int16_x << crypto_int16_k)) & crypto_int16_bitinrangepublicpos_mask(crypto_int16_s,crypto_int16_l); #endif + return crypto_int16_x; +} +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_shrmod(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_s) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int16_s &= 15; + __asm__ ("sarw %%cl,%0" : "+r"(crypto_int16_x) : "c"(crypto_int16_s) : "cc"); +#elif defined(__GNUC__) && defined(__aarch64__) + __asm__ ("and %w0,%w0,15\n sxth %w1,%w1\n asr %w1,%w1,%w0" : "+&r"(crypto_int16_s), "+r"(crypto_int16_x) : : ); +#else + int crypto_int16_k, crypto_int16_l; + for (crypto_int16_l = 0,crypto_int16_k = 1;crypto_int16_k < 16;++crypto_int16_l,crypto_int16_k *= 2) + crypto_int16_x ^= (crypto_int16_x ^ (crypto_int16_x >> crypto_int16_k)) & crypto_int16_bitinrangepublicpos_mask(crypto_int16_s,crypto_int16_l); #endif - -/* from supercop-20201130/crypto_kem/sntrup761/ref/Decode.h */ -#ifndef Decode_H -#define Decode_H - - -/* Decode(R,s,M,len) */ -/* assumes 0 < M[i] < 16384 */ -/* produces 0 <= R[i] < M[i] */ - + return crypto_int16_x; +} + +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_bitmod_mask(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_s) { + crypto_int16_x = crypto_int16_shrmod(crypto_int16_x,crypto_int16_s); + return crypto_int16_bottombit_mask(crypto_int16_x); +} + +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_bitmod_01(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_s) { + crypto_int16_x = crypto_int16_shrmod(crypto_int16_x,crypto_int16_s); + return crypto_int16_bottombit_01(crypto_int16_x); +} + +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_nonzero_mask(crypto_int16 crypto_int16_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int16 crypto_int16_q,crypto_int16_z; + __asm__ ("xorw %0,%0\n movw $-1,%1\n testw %2,%2\n cmovnew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x) : "cc"); + return crypto_int16_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int16 crypto_int16_z; + __asm__ ("tst %w1,65535\n csetm %w0,ne" : "=r"(crypto_int16_z) : "r"(crypto_int16_x) : "cc"); + return crypto_int16_z; +#else + crypto_int16_x |= -crypto_int16_x; + return crypto_int16_negative_mask(crypto_int16_x); #endif - -/* from supercop-20201130/crypto_kem/sntrup761/ref/Decode.c */ - -static void Decode(uint16 *out,const unsigned char *S,const uint16 *M,long long len) -{ - if (len == 1) { - if (M[0] == 1) - *out = 0; - else if (M[0] <= 256) - *out = uint32_mod_uint14(S[0],M[0]); - else - *out = uint32_mod_uint14(S[0]+(((uint16)S[1])<<8),M[0]); - } - if (len > 1) { - uint16 R2[(len+1)/2]; - uint16 M2[(len+1)/2]; - uint16 bottomr[len/2]; - uint32 bottomt[len/2]; - long long i; - for (i = 0;i < len-1;i += 2) { - uint32 m = M[i]*(uint32) M[i+1]; - if (m > 256*16383) { - bottomt[i/2] = 256*256; - bottomr[i/2] = S[0]+256*S[1]; - S += 2; - M2[i/2] = (((m+255)>>8)+255)>>8; - } else if (m >= 16384) { - bottomt[i/2] = 256; - bottomr[i/2] = S[0]; - S += 1; - M2[i/2] = (m+255)>>8; - } else { - bottomt[i/2] = 1; - bottomr[i/2] = 0; - M2[i/2] = m; - } - } - if (i < len) - M2[i/2] = M[i]; - Decode(R2,S,M2,(len+1)/2); - for (i = 0;i < len-1;i += 2) { - uint32 r = bottomr[i/2]; - uint32 r1; - uint16 r0; - r += bottomt[i/2]*R2[i/2]; - uint32_divmod_uint14(&r1,&r0,r,M[i]); - r1 = uint32_mod_uint14(r1,M[i+1]); /* only needed for invalid inputs */ - *out++ = r0; - *out++ = r1; - } - if (i < len) - *out++ = R2[i/2]; - } } -/* from supercop-20201130/crypto_kem/sntrup761/ref/Encode.h */ -#ifndef Encode_H -#define Encode_H - - -/* Encode(s,R,M,len) */ -/* assumes 0 <= R[i] < M[i] < 16384 */ - +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_nonzero_01(crypto_int16 crypto_int16_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int16 crypto_int16_q,crypto_int16_z; + __asm__ ("xorw %0,%0\n movw $1,%1\n testw %2,%2\n cmovnew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x) : "cc"); + return crypto_int16_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int16 crypto_int16_z; + __asm__ ("tst %w1,65535\n cset %w0,ne" : "=r"(crypto_int16_z) : "r"(crypto_int16_x) : "cc"); + return crypto_int16_z; +#else + crypto_int16_x |= -crypto_int16_x; + return crypto_int16_unsigned_topbit_01(crypto_int16_x); #endif - -/* from supercop-20201130/crypto_kem/sntrup761/ref/Encode.c */ - -/* 0 <= R[i] < M[i] < 16384 */ -static void Encode(unsigned char *out,const uint16 *R,const uint16 *M,long long len) -{ - if (len == 1) { - uint16 r = R[0]; - uint16 m = M[0]; - while (m > 1) { - *out++ = r; - r >>= 8; - m = (m+255)>>8; - } - } - if (len > 1) { - uint16 R2[(len+1)/2]; - uint16 M2[(len+1)/2]; - long long i; - for (i = 0;i < len-1;i += 2) { - uint32 m0 = M[i]; - uint32 r = R[i]+R[i+1]*m0; - uint32 m = M[i+1]*m0; - while (m >= 16384) { - *out++ = r; - r >>= 8; - m = (m+255)>>8; - } - R2[i/2] = r; - M2[i/2] = m; - } - if (i < len) { - R2[i/2] = R[i]; - M2[i/2] = M[i]; - } - Encode(out,R2,M2,(len+1)/2); - } } -/* from supercop-20201130/crypto_kem/sntrup761/ref/kem.c */ - -#ifdef LPR +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_positive_mask(crypto_int16 crypto_int16_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int16 crypto_int16_q,crypto_int16_z; + __asm__ ("xorw %0,%0\n movw $-1,%1\n testw %2,%2\n cmovgw %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x) : "cc"); + return crypto_int16_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int16 crypto_int16_z; + __asm__ ("sxth %w0,%w1\n cmp %w0,0\n csetm %w0,gt" : "=r"(crypto_int16_z) : "r"(crypto_int16_x) : "cc"); + return crypto_int16_z; +#else + crypto_int16 crypto_int16_z = -crypto_int16_x; + crypto_int16_z ^= crypto_int16_x & crypto_int16_z; + return crypto_int16_negative_mask(crypto_int16_z); #endif - - -/* ----- masks */ - -#ifndef LPR - -/* return -1 if x!=0; else return 0 */ -static int int16_nonzero_mask(int16 x) -{ - uint16 u = x; /* 0, else 1...65535 */ - uint32 v = u; /* 0, else 1...65535 */ - v = -v; /* 0, else 2^32-65535...2^32-1 */ - v >>= 31; /* 0, else 1 */ - return -v; /* 0, else -1 */ } +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_positive_01(crypto_int16 crypto_int16_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int16 crypto_int16_q,crypto_int16_z; + __asm__ ("xorw %0,%0\n movw $1,%1\n testw %2,%2\n cmovgw %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x) : "cc"); + return crypto_int16_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int16 crypto_int16_z; + __asm__ ("sxth %w0,%w1\n cmp %w0,0\n cset %w0,gt" : "=r"(crypto_int16_z) : "r"(crypto_int16_x) : "cc"); + return crypto_int16_z; +#else + crypto_int16 crypto_int16_z = -crypto_int16_x; + crypto_int16_z ^= crypto_int16_x & crypto_int16_z; + return crypto_int16_unsigned_topbit_01(crypto_int16_z); #endif - -/* return -1 if x<0; otherwise return 0 */ -static int int16_negative_mask(int16 x) -{ - uint16 u = x; - u >>= 15; - return -(int) u; - /* alternative with gcc -fwrapv: */ - /* x>>15 compiles to CPU's arithmetic right shift */ } -/* ----- arithmetic mod 3 */ - -typedef int8 small; - -/* F3 is always represented as -1,0,1 */ -/* so ZZ_fromF3 is a no-op */ - -/* x must not be close to top int16 */ -static small F3_freeze(int16 x) -{ - return int32_mod_uint14(x+1,3)-1; +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_zero_mask(crypto_int16 crypto_int16_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int16 crypto_int16_q,crypto_int16_z; + __asm__ ("xorw %0,%0\n movw $-1,%1\n testw %2,%2\n cmovew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x) : "cc"); + return crypto_int16_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int16 crypto_int16_z; + __asm__ ("tst %w1,65535\n csetm %w0,eq" : "=r"(crypto_int16_z) : "r"(crypto_int16_x) : "cc"); + return crypto_int16_z; +#else + return ~crypto_int16_nonzero_mask(crypto_int16_x); +#endif } -/* ----- arithmetic mod q */ - -#define q12 ((q-1)/2) -typedef int16 Fq; -/* always represented as -q12...q12 */ -/* so ZZ_fromFq is a no-op */ - -/* x must not be close to top int32 */ -static Fq Fq_freeze(int32 x) -{ - return int32_mod_uint14(x+q12,q)-q12; +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_zero_01(crypto_int16 crypto_int16_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int16 crypto_int16_q,crypto_int16_z; + __asm__ ("xorw %0,%0\n movw $1,%1\n testw %2,%2\n cmovew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x) : "cc"); + return crypto_int16_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int16 crypto_int16_z; + __asm__ ("tst %w1,65535\n cset %w0,eq" : "=r"(crypto_int16_z) : "r"(crypto_int16_x) : "cc"); + return crypto_int16_z; +#else + return 1-crypto_int16_nonzero_01(crypto_int16_x); +#endif } -#ifndef LPR - -static Fq Fq_recip(Fq a1) -{ - int i = 1; - Fq ai = a1; - - while (i < q-2) { - ai = Fq_freeze(a1*(int32)ai); - i += 1; - } - return ai; +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_unequal_mask(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int16 crypto_int16_q,crypto_int16_z; + __asm__ ("xorw %0,%0\n movw $-1,%1\n cmpw %3,%2\n cmovnew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); + return crypto_int16_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int16 crypto_int16_z; + __asm__ ("and %w0,%w1,65535\n cmp %w0,%w2,uxth\n csetm %w0,ne" : "=&r"(crypto_int16_z) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); + return crypto_int16_z; +#else + return crypto_int16_nonzero_mask(crypto_int16_x ^ crypto_int16_y); +#endif } +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_unequal_01(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int16 crypto_int16_q,crypto_int16_z; + __asm__ ("xorw %0,%0\n movw $1,%1\n cmpw %3,%2\n cmovnew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); + return crypto_int16_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int16 crypto_int16_z; + __asm__ ("and %w0,%w1,65535\n cmp %w0,%w2,uxth\n cset %w0,ne" : "=&r"(crypto_int16_z) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); + return crypto_int16_z; +#else + return crypto_int16_nonzero_01(crypto_int16_x ^ crypto_int16_y); #endif - -/* ----- Top and Right */ - -#ifdef LPR -#define tau 16 - -static int8 Top(Fq C) -{ - return (tau1*(int32)(C+tau0)+16384)>>15; } -static Fq Right(int8 T) -{ - return Fq_freeze(tau3*(int32)T-tau2); -} +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_equal_mask(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int16 crypto_int16_q,crypto_int16_z; + __asm__ ("xorw %0,%0\n movw $-1,%1\n cmpw %3,%2\n cmovew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); + return crypto_int16_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int16 crypto_int16_z; + __asm__ ("and %w0,%w1,65535\n cmp %w0,%w2,uxth\n csetm %w0,eq" : "=&r"(crypto_int16_z) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); + return crypto_int16_z; +#else + return ~crypto_int16_unequal_mask(crypto_int16_x,crypto_int16_y); #endif +} -/* ----- small polynomials */ +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_equal_01(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int16 crypto_int16_q,crypto_int16_z; + __asm__ ("xorw %0,%0\n movw $1,%1\n cmpw %3,%2\n cmovew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); + return crypto_int16_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int16 crypto_int16_z; + __asm__ ("and %w0,%w1,65535\n cmp %w0,%w2,uxth\n cset %w0,eq" : "=&r"(crypto_int16_z) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); + return crypto_int16_z; +#else + return 1-crypto_int16_unequal_01(crypto_int16_x,crypto_int16_y); +#endif +} -#ifndef LPR +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_min(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("cmpw %1,%0\n cmovgw %1,%0" : "+r"(crypto_int16_x) : "r"(crypto_int16_y) : "cc"); + return crypto_int16_x; +#elif defined(__GNUC__) && defined(__aarch64__) + __asm__ ("sxth %w0,%w0\n cmp %w0,%w1,sxth\n csel %w0,%w0,%w1,lt" : "+&r"(crypto_int16_x) : "r"(crypto_int16_y) : "cc"); + return crypto_int16_x; +#else + crypto_int16 crypto_int16_r = crypto_int16_y ^ crypto_int16_x; + crypto_int16 crypto_int16_z = crypto_int16_y - crypto_int16_x; + crypto_int16_z ^= crypto_int16_r & (crypto_int16_z ^ crypto_int16_y); + crypto_int16_z = crypto_int16_negative_mask(crypto_int16_z); + crypto_int16_z &= crypto_int16_r; + return crypto_int16_x ^ crypto_int16_z; +#endif +} -/* 0 if Weightw_is(r), else -1 */ -static int Weightw_mask(small *r) -{ - int weight = 0; - int i; +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_max(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("cmpw %1,%0\n cmovlw %1,%0" : "+r"(crypto_int16_x) : "r"(crypto_int16_y) : "cc"); + return crypto_int16_x; +#elif defined(__GNUC__) && defined(__aarch64__) + __asm__ ("sxth %w0,%w0\n cmp %w0,%w1,sxth\n csel %w0,%w1,%w0,lt" : "+&r"(crypto_int16_x) : "r"(crypto_int16_y) : "cc"); + return crypto_int16_x; +#else + crypto_int16 crypto_int16_r = crypto_int16_y ^ crypto_int16_x; + crypto_int16 crypto_int16_z = crypto_int16_y - crypto_int16_x; + crypto_int16_z ^= crypto_int16_r & (crypto_int16_z ^ crypto_int16_y); + crypto_int16_z = crypto_int16_negative_mask(crypto_int16_z); + crypto_int16_z &= crypto_int16_r; + return crypto_int16_y ^ crypto_int16_z; +#endif +} - for (i = 0;i < p;++i) weight += r[i]&1; - return int16_nonzero_mask(weight-w); +__attribute__((unused)) +static inline +void crypto_int16_minmax(crypto_int16 *crypto_int16_p,crypto_int16 *crypto_int16_q) { + crypto_int16 crypto_int16_x = *crypto_int16_p; + crypto_int16 crypto_int16_y = *crypto_int16_q; +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int16 crypto_int16_z; + __asm__ ("cmpw %2,%1\n movw %1,%0\n cmovgw %2,%1\n cmovgw %0,%2" : "=&r"(crypto_int16_z), "+&r"(crypto_int16_x), "+r"(crypto_int16_y) : : "cc"); + *crypto_int16_p = crypto_int16_x; + *crypto_int16_q = crypto_int16_y; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int16 crypto_int16_r, crypto_int16_s; + __asm__ ("sxth %w0,%w0\n cmp %w0,%w3,sxth\n csel %w1,%w0,%w3,lt\n csel %w2,%w3,%w0,lt" : "+&r"(crypto_int16_x), "=&r"(crypto_int16_r), "=r"(crypto_int16_s) : "r"(crypto_int16_y) : "cc"); + *crypto_int16_p = crypto_int16_r; + *crypto_int16_q = crypto_int16_s; +#else + crypto_int16 crypto_int16_r = crypto_int16_y ^ crypto_int16_x; + crypto_int16 crypto_int16_z = crypto_int16_y - crypto_int16_x; + crypto_int16_z ^= crypto_int16_r & (crypto_int16_z ^ crypto_int16_y); + crypto_int16_z = crypto_int16_negative_mask(crypto_int16_z); + crypto_int16_z &= crypto_int16_r; + crypto_int16_x ^= crypto_int16_z; + crypto_int16_y ^= crypto_int16_z; + *crypto_int16_p = crypto_int16_x; + *crypto_int16_q = crypto_int16_y; +#endif } -/* R3_fromR(R_fromRq(r)) */ -static void R3_fromRq(small *out,const Fq *r) -{ - int i; - for (i = 0;i < p;++i) out[i] = F3_freeze(r[i]); +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_smaller_mask(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int16 crypto_int16_q,crypto_int16_z; + __asm__ ("xorw %0,%0\n movw $-1,%1\n cmpw %3,%2\n cmovlw %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); + return crypto_int16_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int16 crypto_int16_z; + __asm__ ("sxth %w0,%w1\n cmp %w0,%w2,sxth\n csetm %w0,lt" : "=&r"(crypto_int16_z) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); + return crypto_int16_z; +#else + crypto_int16 crypto_int16_r = crypto_int16_x ^ crypto_int16_y; + crypto_int16 crypto_int16_z = crypto_int16_x - crypto_int16_y; + crypto_int16_z ^= crypto_int16_r & (crypto_int16_z ^ crypto_int16_x); + return crypto_int16_negative_mask(crypto_int16_z); +#endif } -/* h = f*g in the ring R3 */ -static void R3_mult(small *h,const small *f,const small *g) -{ - small fg[p+p-1]; - small result; - int i,j; - - for (i = 0;i < p;++i) { - result = 0; - for (j = 0;j <= i;++j) result = F3_freeze(result+f[j]*g[i-j]); - fg[i] = result; - } - for (i = p;i < p+p-1;++i) { - result = 0; - for (j = i-p+1;j < p;++j) result = F3_freeze(result+f[j]*g[i-j]); - fg[i] = result; - } +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_smaller_01(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int16 crypto_int16_q,crypto_int16_z; + __asm__ ("xorw %0,%0\n movw $1,%1\n cmpw %3,%2\n cmovlw %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); + return crypto_int16_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int16 crypto_int16_z; + __asm__ ("sxth %w0,%w1\n cmp %w0,%w2,sxth\n cset %w0,lt" : "=&r"(crypto_int16_z) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); + return crypto_int16_z; +#else + crypto_int16 crypto_int16_r = crypto_int16_x ^ crypto_int16_y; + crypto_int16 crypto_int16_z = crypto_int16_x - crypto_int16_y; + crypto_int16_z ^= crypto_int16_r & (crypto_int16_z ^ crypto_int16_x); + return crypto_int16_unsigned_topbit_01(crypto_int16_z); +#endif +} - for (i = p+p-2;i >= p;--i) { - fg[i-p] = F3_freeze(fg[i-p]+fg[i]); - fg[i-p+1] = F3_freeze(fg[i-p+1]+fg[i]); - } +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_leq_mask(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int16 crypto_int16_q,crypto_int16_z; + __asm__ ("xorw %0,%0\n movw $-1,%1\n cmpw %3,%2\n cmovlew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); + return crypto_int16_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int16 crypto_int16_z; + __asm__ ("sxth %w0,%w1\n cmp %w0,%w2,sxth\n csetm %w0,le" : "=&r"(crypto_int16_z) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); + return crypto_int16_z; +#else + return ~crypto_int16_smaller_mask(crypto_int16_y,crypto_int16_x); +#endif +} - for (i = 0;i < p;++i) h[i] = fg[i]; +__attribute__((unused)) +static inline +crypto_int16 crypto_int16_leq_01(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int16 crypto_int16_q,crypto_int16_z; + __asm__ ("xorw %0,%0\n movw $1,%1\n cmpw %3,%2\n cmovlew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); + return crypto_int16_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int16 crypto_int16_z; + __asm__ ("sxth %w0,%w1\n cmp %w0,%w2,sxth\n cset %w0,le" : "=&r"(crypto_int16_z) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); + return crypto_int16_z; +#else + return 1-crypto_int16_smaller_01(crypto_int16_y,crypto_int16_x); +#endif } -/* returns 0 if recip succeeded; else -1 */ -static int R3_recip(small *out,const small *in) -{ - small f[p+1],g[p+1],v[p+1],r[p+1]; - int i,loop,delta; - int sign,swap,t; +__attribute__((unused)) +static inline +int crypto_int16_ones_num(crypto_int16 crypto_int16_x) { + crypto_int16_unsigned crypto_int16_y = crypto_int16_x; + const crypto_int16 C0 = 0x5555; + const crypto_int16 C1 = 0x3333; + const crypto_int16 C2 = 0x0f0f; + crypto_int16_y -= ((crypto_int16_y >> 1) & C0); + crypto_int16_y = (crypto_int16_y & C1) + ((crypto_int16_y >> 2) & C1); + crypto_int16_y = (crypto_int16_y + (crypto_int16_y >> 4)) & C2; + crypto_int16_y = (crypto_int16_y + (crypto_int16_y >> 8)) & 0xff; + return crypto_int16_y; +} + +__attribute__((unused)) +static inline +int crypto_int16_bottomzeros_num(crypto_int16 crypto_int16_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int16 fallback = 16; + __asm__ ("bsfw %0,%0\n cmovew %1,%0" : "+&r"(crypto_int16_x) : "r"(fallback) : "cc"); + return crypto_int16_x; +#elif defined(__GNUC__) && defined(__aarch64__) + int64_t crypto_int16_y; + __asm__ ("orr %w0,%w1,-65536\n rbit %w0,%w0\n clz %w0,%w0" : "=r"(crypto_int16_y) : "r"(crypto_int16_x) : ); + return crypto_int16_y; +#else + crypto_int16 crypto_int16_y = crypto_int16_x ^ (crypto_int16_x-1); + crypto_int16_y = ((crypto_int16) crypto_int16_y) >> 1; + crypto_int16_y &= ~(crypto_int16_x & (((crypto_int16) 1) << (16-1))); + return crypto_int16_ones_num(crypto_int16_y); +#endif +} - for (i = 0;i < p+1;++i) v[i] = 0; - for (i = 0;i < p+1;++i) r[i] = 0; - r[0] = 1; - for (i = 0;i < p;++i) f[i] = 0; - f[0] = 1; f[p-1] = f[p] = -1; - for (i = 0;i < p;++i) g[p-1-i] = in[i]; - g[p] = 0; +#endif - delta = 1; +/* from supercop-20240808/cryptoint/crypto_int32.h */ +/* auto-generated: cd cryptoint; ./autogen */ +/* cryptoint 20240806 */ - for (loop = 0;loop < 2*p-1;++loop) { - for (i = p;i > 0;--i) v[i] = v[i-1]; - v[0] = 0; +#ifndef crypto_int32_h +#define crypto_int32_h - sign = -g[0]*f[0]; - swap = int16_negative_mask(-delta) & int16_nonzero_mask(g[0]); - delta ^= swap&(delta^-delta); - delta += 1; +#define crypto_int32 int32_t +#define crypto_int32_unsigned uint32_t - for (i = 0;i < p+1;++i) { - t = swap&(f[i]^g[i]); f[i] ^= t; g[i] ^= t; - t = swap&(v[i]^r[i]); v[i] ^= t; r[i] ^= t; - } - for (i = 0;i < p+1;++i) g[i] = F3_freeze(g[i]+sign*f[i]); - for (i = 0;i < p+1;++i) r[i] = F3_freeze(r[i]+sign*v[i]); - for (i = 0;i < p;++i) g[i] = g[i+1]; - g[p] = 0; - } +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_load(const unsigned char *crypto_int32_s) { + crypto_int32 crypto_int32_z = 0; + crypto_int32_z |= ((crypto_int32) (*crypto_int32_s++)) << 0; + crypto_int32_z |= ((crypto_int32) (*crypto_int32_s++)) << 8; + crypto_int32_z |= ((crypto_int32) (*crypto_int32_s++)) << 16; + crypto_int32_z |= ((crypto_int32) (*crypto_int32_s++)) << 24; + return crypto_int32_z; +} - sign = f[0]; - for (i = 0;i < p;++i) out[i] = sign*v[p-1-i]; +__attribute__((unused)) +static inline +void crypto_int32_store(unsigned char *crypto_int32_s,crypto_int32 crypto_int32_x) { + *crypto_int32_s++ = crypto_int32_x >> 0; + *crypto_int32_s++ = crypto_int32_x >> 8; + *crypto_int32_s++ = crypto_int32_x >> 16; + *crypto_int32_s++ = crypto_int32_x >> 24; +} - return int16_nonzero_mask(delta); +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_negative_mask(crypto_int32 crypto_int32_x) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("sarl $31,%0" : "+r"(crypto_int32_x) : : "cc"); + return crypto_int32_x; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int32 crypto_int32_y; + __asm__ ("asr %w0,%w1,31" : "=r"(crypto_int32_y) : "r"(crypto_int32_x) : ); + return crypto_int32_y; +#else + crypto_int32_x >>= 32-6; + crypto_int32_x ^= crypto_int32_optblocker; + crypto_int32_x >>= 5; + return crypto_int32_x; +#endif } +__attribute__((unused)) +static inline +crypto_int32_unsigned crypto_int32_unsigned_topbit_01(crypto_int32_unsigned crypto_int32_x) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("shrl $31,%0" : "+r"(crypto_int32_x) : : "cc"); + return crypto_int32_x; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int32 crypto_int32_y; + __asm__ ("lsr %w0,%w1,31" : "=r"(crypto_int32_y) : "r"(crypto_int32_x) : ); + return crypto_int32_y; +#else + crypto_int32_x >>= 32-6; + crypto_int32_x ^= crypto_int32_optblocker; + crypto_int32_x >>= 5; + return crypto_int32_x; #endif +} -/* ----- polynomials mod q */ +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_negative_01(crypto_int32 crypto_int32_x) { + return crypto_int32_unsigned_topbit_01(crypto_int32_x); +} -/* h = f*g in the ring Rq */ -static void Rq_mult_small(Fq *h,const Fq *f,const small *g) -{ - Fq fg[p+p-1]; - Fq result; - int i,j; - - for (i = 0;i < p;++i) { - result = 0; - for (j = 0;j <= i;++j) result = Fq_freeze(result+f[j]*(int32)g[i-j]); - fg[i] = result; - } - for (i = p;i < p+p-1;++i) { - result = 0; - for (j = i-p+1;j < p;++j) result = Fq_freeze(result+f[j]*(int32)g[i-j]); - fg[i] = result; - } +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_topbit_mask(crypto_int32 crypto_int32_x) { + return crypto_int32_negative_mask(crypto_int32_x); +} - for (i = p+p-2;i >= p;--i) { - fg[i-p] = Fq_freeze(fg[i-p]+fg[i]); - fg[i-p+1] = Fq_freeze(fg[i-p+1]+fg[i]); - } +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_topbit_01(crypto_int32 crypto_int32_x) { + return crypto_int32_unsigned_topbit_01(crypto_int32_x); +} - for (i = 0;i < p;++i) h[i] = fg[i]; +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_bottombit_mask(crypto_int32 crypto_int32_x) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("andl $1,%0" : "+r"(crypto_int32_x) : : "cc"); + return -crypto_int32_x; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int32 crypto_int32_y; + __asm__ ("sbfx %w0,%w1,0,1" : "=r"(crypto_int32_y) : "r"(crypto_int32_x) : ); + return crypto_int32_y; +#else + crypto_int32_x &= 1 ^ crypto_int32_optblocker; + return -crypto_int32_x; +#endif } -#ifndef LPR +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_bottombit_01(crypto_int32 crypto_int32_x) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("andl $1,%0" : "+r"(crypto_int32_x) : : "cc"); + return crypto_int32_x; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int32 crypto_int32_y; + __asm__ ("ubfx %w0,%w1,0,1" : "=r"(crypto_int32_y) : "r"(crypto_int32_x) : ); + return crypto_int32_y; +#else + crypto_int32_x &= 1 ^ crypto_int32_optblocker; + return crypto_int32_x; +#endif +} -/* h = 3f in Rq */ -static void Rq_mult3(Fq *h,const Fq *f) -{ - int i; +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_bitinrangepublicpos_mask(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_s) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("sarl %%cl,%0" : "+r"(crypto_int32_x) : "c"(crypto_int32_s) : "cc"); +#elif defined(__GNUC__) && defined(__aarch64__) + __asm__ ("asr %w0,%w0,%w1" : "+r"(crypto_int32_x) : "r"(crypto_int32_s) : ); +#else + crypto_int32_x >>= crypto_int32_s ^ crypto_int32_optblocker; +#endif + return crypto_int32_bottombit_mask(crypto_int32_x); +} - for (i = 0;i < p;++i) h[i] = Fq_freeze(3*f[i]); +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_bitinrangepublicpos_01(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_s) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("sarl %%cl,%0" : "+r"(crypto_int32_x) : "c"(crypto_int32_s) : "cc"); +#elif defined(__GNUC__) && defined(__aarch64__) + __asm__ ("asr %w0,%w0,%w1" : "+r"(crypto_int32_x) : "r"(crypto_int32_s) : ); +#else + crypto_int32_x >>= crypto_int32_s ^ crypto_int32_optblocker; +#endif + return crypto_int32_bottombit_01(crypto_int32_x); } -/* out = 1/(3*in) in Rq */ -/* returns 0 if recip succeeded; else -1 */ -static int Rq_recip3(Fq *out,const small *in) -{ - Fq f[p+1],g[p+1],v[p+1],r[p+1]; - int i,loop,delta; - int swap,t; - int32 f0,g0; - Fq scale; - - for (i = 0;i < p+1;++i) v[i] = 0; - for (i = 0;i < p+1;++i) r[i] = 0; - r[0] = Fq_recip(3); - for (i = 0;i < p;++i) f[i] = 0; - f[0] = 1; f[p-1] = f[p] = -1; - for (i = 0;i < p;++i) g[p-1-i] = in[i]; - g[p] = 0; +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_shlmod(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_s) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("shll %%cl,%0" : "+r"(crypto_int32_x) : "c"(crypto_int32_s) : "cc"); +#elif defined(__GNUC__) && defined(__aarch64__) + __asm__ ("lsl %w0,%w0,%w1" : "+r"(crypto_int32_x) : "r"(crypto_int32_s) : ); +#else + int crypto_int32_k, crypto_int32_l; + for (crypto_int32_l = 0,crypto_int32_k = 1;crypto_int32_k < 32;++crypto_int32_l,crypto_int32_k *= 2) + crypto_int32_x ^= (crypto_int32_x ^ (crypto_int32_x << crypto_int32_k)) & crypto_int32_bitinrangepublicpos_mask(crypto_int32_s,crypto_int32_l); +#endif + return crypto_int32_x; +} - delta = 1; +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_shrmod(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_s) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("sarl %%cl,%0" : "+r"(crypto_int32_x) : "c"(crypto_int32_s) : "cc"); +#elif defined(__GNUC__) && defined(__aarch64__) + __asm__ ("asr %w0,%w0,%w1" : "+r"(crypto_int32_x) : "r"(crypto_int32_s) : ); +#else + int crypto_int32_k, crypto_int32_l; + for (crypto_int32_l = 0,crypto_int32_k = 1;crypto_int32_k < 32;++crypto_int32_l,crypto_int32_k *= 2) + crypto_int32_x ^= (crypto_int32_x ^ (crypto_int32_x >> crypto_int32_k)) & crypto_int32_bitinrangepublicpos_mask(crypto_int32_s,crypto_int32_l); +#endif + return crypto_int32_x; +} + +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_bitmod_mask(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_s) { + crypto_int32_x = crypto_int32_shrmod(crypto_int32_x,crypto_int32_s); + return crypto_int32_bottombit_mask(crypto_int32_x); +} + +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_bitmod_01(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_s) { + crypto_int32_x = crypto_int32_shrmod(crypto_int32_x,crypto_int32_s); + return crypto_int32_bottombit_01(crypto_int32_x); +} + +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_nonzero_mask(crypto_int32 crypto_int32_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int32 crypto_int32_q,crypto_int32_z; + __asm__ ("xorl %0,%0\n movl $-1,%1\n testl %2,%2\n cmovnel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x) : "cc"); + return crypto_int32_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int32 crypto_int32_z; + __asm__ ("cmp %w1,0\n csetm %w0,ne" : "=r"(crypto_int32_z) : "r"(crypto_int32_x) : "cc"); + return crypto_int32_z; +#else + crypto_int32_x |= -crypto_int32_x; + return crypto_int32_negative_mask(crypto_int32_x); +#endif +} - for (loop = 0;loop < 2*p-1;++loop) { - for (i = p;i > 0;--i) v[i] = v[i-1]; - v[0] = 0; +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_nonzero_01(crypto_int32 crypto_int32_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int32 crypto_int32_q,crypto_int32_z; + __asm__ ("xorl %0,%0\n movl $1,%1\n testl %2,%2\n cmovnel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x) : "cc"); + return crypto_int32_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int32 crypto_int32_z; + __asm__ ("cmp %w1,0\n cset %w0,ne" : "=r"(crypto_int32_z) : "r"(crypto_int32_x) : "cc"); + return crypto_int32_z; +#else + crypto_int32_x |= -crypto_int32_x; + return crypto_int32_unsigned_topbit_01(crypto_int32_x); +#endif +} - swap = int16_negative_mask(-delta) & int16_nonzero_mask(g[0]); - delta ^= swap&(delta^-delta); - delta += 1; +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_positive_mask(crypto_int32 crypto_int32_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int32 crypto_int32_q,crypto_int32_z; + __asm__ ("xorl %0,%0\n movl $-1,%1\n testl %2,%2\n cmovgl %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x) : "cc"); + return crypto_int32_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int32 crypto_int32_z; + __asm__ ("cmp %w1,0\n csetm %w0,gt" : "=r"(crypto_int32_z) : "r"(crypto_int32_x) : "cc"); + return crypto_int32_z; +#else + crypto_int32 crypto_int32_z = -crypto_int32_x; + crypto_int32_z ^= crypto_int32_x & crypto_int32_z; + return crypto_int32_negative_mask(crypto_int32_z); +#endif +} - for (i = 0;i < p+1;++i) { - t = swap&(f[i]^g[i]); f[i] ^= t; g[i] ^= t; - t = swap&(v[i]^r[i]); v[i] ^= t; r[i] ^= t; - } +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_positive_01(crypto_int32 crypto_int32_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int32 crypto_int32_q,crypto_int32_z; + __asm__ ("xorl %0,%0\n movl $1,%1\n testl %2,%2\n cmovgl %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x) : "cc"); + return crypto_int32_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int32 crypto_int32_z; + __asm__ ("cmp %w1,0\n cset %w0,gt" : "=r"(crypto_int32_z) : "r"(crypto_int32_x) : "cc"); + return crypto_int32_z; +#else + crypto_int32 crypto_int32_z = -crypto_int32_x; + crypto_int32_z ^= crypto_int32_x & crypto_int32_z; + return crypto_int32_unsigned_topbit_01(crypto_int32_z); +#endif +} - f0 = f[0]; - g0 = g[0]; - for (i = 0;i < p+1;++i) g[i] = Fq_freeze(f0*g[i]-g0*f[i]); - for (i = 0;i < p+1;++i) r[i] = Fq_freeze(f0*r[i]-g0*v[i]); +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_zero_mask(crypto_int32 crypto_int32_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int32 crypto_int32_q,crypto_int32_z; + __asm__ ("xorl %0,%0\n movl $-1,%1\n testl %2,%2\n cmovel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x) : "cc"); + return crypto_int32_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int32 crypto_int32_z; + __asm__ ("cmp %w1,0\n csetm %w0,eq" : "=r"(crypto_int32_z) : "r"(crypto_int32_x) : "cc"); + return crypto_int32_z; +#else + return ~crypto_int32_nonzero_mask(crypto_int32_x); +#endif +} - for (i = 0;i < p;++i) g[i] = g[i+1]; - g[p] = 0; - } +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_zero_01(crypto_int32 crypto_int32_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int32 crypto_int32_q,crypto_int32_z; + __asm__ ("xorl %0,%0\n movl $1,%1\n testl %2,%2\n cmovel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x) : "cc"); + return crypto_int32_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int32 crypto_int32_z; + __asm__ ("cmp %w1,0\n cset %w0,eq" : "=r"(crypto_int32_z) : "r"(crypto_int32_x) : "cc"); + return crypto_int32_z; +#else + return 1-crypto_int32_nonzero_01(crypto_int32_x); +#endif +} - scale = Fq_recip(f[0]); - for (i = 0;i < p;++i) out[i] = Fq_freeze(scale*(int32)v[p-1-i]); +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_unequal_mask(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int32 crypto_int32_q,crypto_int32_z; + __asm__ ("xorl %0,%0\n movl $-1,%1\n cmpl %3,%2\n cmovnel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc"); + return crypto_int32_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int32 crypto_int32_z; + __asm__ ("cmp %w1,%w2\n csetm %w0,ne" : "=r"(crypto_int32_z) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc"); + return crypto_int32_z; +#else + return crypto_int32_nonzero_mask(crypto_int32_x ^ crypto_int32_y); +#endif +} - return int16_nonzero_mask(delta); +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_unequal_01(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int32 crypto_int32_q,crypto_int32_z; + __asm__ ("xorl %0,%0\n movl $1,%1\n cmpl %3,%2\n cmovnel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc"); + return crypto_int32_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int32 crypto_int32_z; + __asm__ ("cmp %w1,%w2\n cset %w0,ne" : "=r"(crypto_int32_z) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc"); + return crypto_int32_z; +#else + return crypto_int32_nonzero_01(crypto_int32_x ^ crypto_int32_y); +#endif } +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_equal_mask(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int32 crypto_int32_q,crypto_int32_z; + __asm__ ("xorl %0,%0\n movl $-1,%1\n cmpl %3,%2\n cmovel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc"); + return crypto_int32_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int32 crypto_int32_z; + __asm__ ("cmp %w1,%w2\n csetm %w0,eq" : "=r"(crypto_int32_z) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc"); + return crypto_int32_z; +#else + return ~crypto_int32_unequal_mask(crypto_int32_x,crypto_int32_y); #endif +} -/* ----- rounded polynomials mod q */ +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_equal_01(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int32 crypto_int32_q,crypto_int32_z; + __asm__ ("xorl %0,%0\n movl $1,%1\n cmpl %3,%2\n cmovel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc"); + return crypto_int32_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int32 crypto_int32_z; + __asm__ ("cmp %w1,%w2\n cset %w0,eq" : "=r"(crypto_int32_z) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc"); + return crypto_int32_z; +#else + return 1-crypto_int32_unequal_01(crypto_int32_x,crypto_int32_y); +#endif +} -static void Round(Fq *out,const Fq *a) -{ - int i; - for (i = 0;i < p;++i) out[i] = a[i]-F3_freeze(a[i]); +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_min(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("cmpl %1,%0\n cmovgl %1,%0" : "+r"(crypto_int32_x) : "r"(crypto_int32_y) : "cc"); + return crypto_int32_x; +#elif defined(__GNUC__) && defined(__aarch64__) + __asm__ ("cmp %w0,%w1\n csel %w0,%w0,%w1,lt" : "+r"(crypto_int32_x) : "r"(crypto_int32_y) : "cc"); + return crypto_int32_x; +#else + crypto_int64 crypto_int32_r = (crypto_int64)crypto_int32_y ^ (crypto_int64)crypto_int32_x; + crypto_int64 crypto_int32_z = (crypto_int64)crypto_int32_y - (crypto_int64)crypto_int32_x; + crypto_int32_z ^= crypto_int32_r & (crypto_int32_z ^ crypto_int32_y); + crypto_int32_z = crypto_int32_negative_mask(crypto_int32_z); + crypto_int32_z &= crypto_int32_r; + return crypto_int32_x ^ crypto_int32_z; +#endif } -/* ----- sorting to generate short polynomial */ +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_max(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("cmpl %1,%0\n cmovll %1,%0" : "+r"(crypto_int32_x) : "r"(crypto_int32_y) : "cc"); + return crypto_int32_x; +#elif defined(__GNUC__) && defined(__aarch64__) + __asm__ ("cmp %w0,%w1\n csel %w0,%w1,%w0,lt" : "+r"(crypto_int32_x) : "r"(crypto_int32_y) : "cc"); + return crypto_int32_x; +#else + crypto_int64 crypto_int32_r = (crypto_int64)crypto_int32_y ^ (crypto_int64)crypto_int32_x; + crypto_int64 crypto_int32_z = (crypto_int64)crypto_int32_y - (crypto_int64)crypto_int32_x; + crypto_int32_z ^= crypto_int32_r & (crypto_int32_z ^ crypto_int32_y); + crypto_int32_z = crypto_int32_negative_mask(crypto_int32_z); + crypto_int32_z &= crypto_int32_r; + return crypto_int32_y ^ crypto_int32_z; +#endif +} -static void Short_fromlist(small *out,const uint32 *in) -{ - uint32 L[p]; - int i; +__attribute__((unused)) +static inline +void crypto_int32_minmax(crypto_int32 *crypto_int32_p,crypto_int32 *crypto_int32_q) { + crypto_int32 crypto_int32_x = *crypto_int32_p; + crypto_int32 crypto_int32_y = *crypto_int32_q; +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int32 crypto_int32_z; + __asm__ ("cmpl %2,%1\n movl %1,%0\n cmovgl %2,%1\n cmovgl %0,%2" : "=&r"(crypto_int32_z), "+&r"(crypto_int32_x), "+r"(crypto_int32_y) : : "cc"); + *crypto_int32_p = crypto_int32_x; + *crypto_int32_q = crypto_int32_y; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int32 crypto_int32_r, crypto_int32_s; + __asm__ ("cmp %w2,%w3\n csel %w0,%w2,%w3,lt\n csel %w1,%w3,%w2,lt" : "=&r"(crypto_int32_r), "=r"(crypto_int32_s) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc"); + *crypto_int32_p = crypto_int32_r; + *crypto_int32_q = crypto_int32_s; +#else + crypto_int64 crypto_int32_r = (crypto_int64)crypto_int32_y ^ (crypto_int64)crypto_int32_x; + crypto_int64 crypto_int32_z = (crypto_int64)crypto_int32_y - (crypto_int64)crypto_int32_x; + crypto_int32_z ^= crypto_int32_r & (crypto_int32_z ^ crypto_int32_y); + crypto_int32_z = crypto_int32_negative_mask(crypto_int32_z); + crypto_int32_z &= crypto_int32_r; + crypto_int32_x ^= crypto_int32_z; + crypto_int32_y ^= crypto_int32_z; + *crypto_int32_p = crypto_int32_x; + *crypto_int32_q = crypto_int32_y; +#endif +} - for (i = 0;i < w;++i) L[i] = in[i]&(uint32)-2; - for (i = w;i < p;++i) L[i] = (in[i]&(uint32)-3)|1; - crypto_sort_uint32(L,p); - for (i = 0;i < p;++i) out[i] = (L[i]&3)-1; +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_smaller_mask(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int32 crypto_int32_q,crypto_int32_z; + __asm__ ("xorl %0,%0\n movl $-1,%1\n cmpl %3,%2\n cmovll %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc"); + return crypto_int32_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int32 crypto_int32_z; + __asm__ ("cmp %w1,%w2\n csetm %w0,lt" : "=r"(crypto_int32_z) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc"); + return crypto_int32_z; +#else + crypto_int32 crypto_int32_r = crypto_int32_x ^ crypto_int32_y; + crypto_int32 crypto_int32_z = crypto_int32_x - crypto_int32_y; + crypto_int32_z ^= crypto_int32_r & (crypto_int32_z ^ crypto_int32_x); + return crypto_int32_negative_mask(crypto_int32_z); +#endif } -/* ----- underlying hash function */ +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_smaller_01(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int32 crypto_int32_q,crypto_int32_z; + __asm__ ("xorl %0,%0\n movl $1,%1\n cmpl %3,%2\n cmovll %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc"); + return crypto_int32_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int32 crypto_int32_z; + __asm__ ("cmp %w1,%w2\n cset %w0,lt" : "=r"(crypto_int32_z) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc"); + return crypto_int32_z; +#else + crypto_int32 crypto_int32_r = crypto_int32_x ^ crypto_int32_y; + crypto_int32 crypto_int32_z = crypto_int32_x - crypto_int32_y; + crypto_int32_z ^= crypto_int32_r & (crypto_int32_z ^ crypto_int32_x); + return crypto_int32_unsigned_topbit_01(crypto_int32_z); +#endif +} -#define Hash_bytes 32 +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_leq_mask(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int32 crypto_int32_q,crypto_int32_z; + __asm__ ("xorl %0,%0\n movl $-1,%1\n cmpl %3,%2\n cmovlel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc"); + return crypto_int32_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int32 crypto_int32_z; + __asm__ ("cmp %w1,%w2\n csetm %w0,le" : "=r"(crypto_int32_z) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc"); + return crypto_int32_z; +#else + return ~crypto_int32_smaller_mask(crypto_int32_y,crypto_int32_x); +#endif +} -/* e.g., b = 0 means out = Hash0(in) */ -static void Hash_prefix(unsigned char *out,int b,const unsigned char *in,int inlen) -{ - unsigned char x[inlen+1]; - unsigned char h[64]; - int i; +__attribute__((unused)) +static inline +crypto_int32 crypto_int32_leq_01(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int32 crypto_int32_q,crypto_int32_z; + __asm__ ("xorl %0,%0\n movl $1,%1\n cmpl %3,%2\n cmovlel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc"); + return crypto_int32_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int32 crypto_int32_z; + __asm__ ("cmp %w1,%w2\n cset %w0,le" : "=r"(crypto_int32_z) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc"); + return crypto_int32_z; +#else + return 1-crypto_int32_smaller_01(crypto_int32_y,crypto_int32_x); +#endif +} - x[0] = b; - for (i = 0;i < inlen;++i) x[i+1] = in[i]; - crypto_hash_sha512(h,x,inlen+1); - for (i = 0;i < 32;++i) out[i] = h[i]; +__attribute__((unused)) +static inline +int crypto_int32_ones_num(crypto_int32 crypto_int32_x) { + crypto_int32_unsigned crypto_int32_y = crypto_int32_x; + const crypto_int32 C0 = 0x55555555; + const crypto_int32 C1 = 0x33333333; + const crypto_int32 C2 = 0x0f0f0f0f; + crypto_int32_y -= ((crypto_int32_y >> 1) & C0); + crypto_int32_y = (crypto_int32_y & C1) + ((crypto_int32_y >> 2) & C1); + crypto_int32_y = (crypto_int32_y + (crypto_int32_y >> 4)) & C2; + crypto_int32_y += crypto_int32_y >> 8; + crypto_int32_y = (crypto_int32_y + (crypto_int32_y >> 16)) & 0xff; + return crypto_int32_y; +} + +__attribute__((unused)) +static inline +int crypto_int32_bottomzeros_num(crypto_int32 crypto_int32_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int32 fallback = 32; + __asm__ ("bsfl %0,%0\n cmovel %1,%0" : "+&r"(crypto_int32_x) : "r"(fallback) : "cc"); + return crypto_int32_x; +#elif defined(__GNUC__) && defined(__aarch64__) + int64_t crypto_int32_y; + __asm__ ("rbit %w0,%w1\n clz %w0,%w0" : "=r"(crypto_int32_y) : "r"(crypto_int32_x) : ); + return crypto_int32_y; +#else + crypto_int32 crypto_int32_y = crypto_int32_x ^ (crypto_int32_x-1); + crypto_int32_y = ((crypto_int32) crypto_int32_y) >> 1; + crypto_int32_y &= ~(crypto_int32_x & (((crypto_int32) 1) << (32-1))); + return crypto_int32_ones_num(crypto_int32_y); +#endif } -/* ----- higher-level randomness */ +#endif -static uint32 urandom32(void) -{ - unsigned char c[4]; - uint32 out[4]; +/* from supercop-20240808/cryptoint/crypto_int64.h */ +/* auto-generated: cd cryptoint; ./autogen */ +/* cryptoint 20240806 */ + +#ifndef crypto_int64_h +#define crypto_int64_h + +#define crypto_int64 int64_t +#define crypto_int64_unsigned uint64_t + + + +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_load(const unsigned char *crypto_int64_s) { + crypto_int64 crypto_int64_z = 0; + crypto_int64_z |= ((crypto_int64) (*crypto_int64_s++)) << 0; + crypto_int64_z |= ((crypto_int64) (*crypto_int64_s++)) << 8; + crypto_int64_z |= ((crypto_int64) (*crypto_int64_s++)) << 16; + crypto_int64_z |= ((crypto_int64) (*crypto_int64_s++)) << 24; + crypto_int64_z |= ((crypto_int64) (*crypto_int64_s++)) << 32; + crypto_int64_z |= ((crypto_int64) (*crypto_int64_s++)) << 40; + crypto_int64_z |= ((crypto_int64) (*crypto_int64_s++)) << 48; + crypto_int64_z |= ((crypto_int64) (*crypto_int64_s++)) << 56; + return crypto_int64_z; +} + +__attribute__((unused)) +static inline +void crypto_int64_store(unsigned char *crypto_int64_s,crypto_int64 crypto_int64_x) { + *crypto_int64_s++ = crypto_int64_x >> 0; + *crypto_int64_s++ = crypto_int64_x >> 8; + *crypto_int64_s++ = crypto_int64_x >> 16; + *crypto_int64_s++ = crypto_int64_x >> 24; + *crypto_int64_s++ = crypto_int64_x >> 32; + *crypto_int64_s++ = crypto_int64_x >> 40; + *crypto_int64_s++ = crypto_int64_x >> 48; + *crypto_int64_s++ = crypto_int64_x >> 56; +} + +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_negative_mask(crypto_int64 crypto_int64_x) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("sarq $63,%0" : "+r"(crypto_int64_x) : : "cc"); + return crypto_int64_x; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int64 crypto_int64_y; + __asm__ ("asr %0,%1,63" : "=r"(crypto_int64_y) : "r"(crypto_int64_x) : ); + return crypto_int64_y; +#else + crypto_int64_x >>= 64-6; + crypto_int64_x ^= crypto_int64_optblocker; + crypto_int64_x >>= 5; + return crypto_int64_x; +#endif +} - randombytes(c,4); - out[0] = (uint32)c[0]; - out[1] = ((uint32)c[1])<<8; - out[2] = ((uint32)c[2])<<16; - out[3] = ((uint32)c[3])<<24; - return out[0]+out[1]+out[2]+out[3]; +__attribute__((unused)) +static inline +crypto_int64_unsigned crypto_int64_unsigned_topbit_01(crypto_int64_unsigned crypto_int64_x) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("shrq $63,%0" : "+r"(crypto_int64_x) : : "cc"); + return crypto_int64_x; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int64 crypto_int64_y; + __asm__ ("lsr %0,%1,63" : "=r"(crypto_int64_y) : "r"(crypto_int64_x) : ); + return crypto_int64_y; +#else + crypto_int64_x >>= 64-6; + crypto_int64_x ^= crypto_int64_optblocker; + crypto_int64_x >>= 5; + return crypto_int64_x; +#endif } -static void Short_random(small *out) -{ - uint32 L[p]; - int i; +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_negative_01(crypto_int64 crypto_int64_x) { + return crypto_int64_unsigned_topbit_01(crypto_int64_x); +} - for (i = 0;i < p;++i) L[i] = urandom32(); - Short_fromlist(out,L); +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_topbit_mask(crypto_int64 crypto_int64_x) { + return crypto_int64_negative_mask(crypto_int64_x); } -#ifndef LPR +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_topbit_01(crypto_int64 crypto_int64_x) { + return crypto_int64_unsigned_topbit_01(crypto_int64_x); +} -static void Small_random(small *out) -{ - int i; +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_bottombit_mask(crypto_int64 crypto_int64_x) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("andq $1,%0" : "+r"(crypto_int64_x) : : "cc"); + return -crypto_int64_x; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int64 crypto_int64_y; + __asm__ ("sbfx %0,%1,0,1" : "=r"(crypto_int64_y) : "r"(crypto_int64_x) : ); + return crypto_int64_y; +#else + crypto_int64_x &= 1 ^ crypto_int64_optblocker; + return -crypto_int64_x; +#endif +} - for (i = 0;i < p;++i) out[i] = (((urandom32()&0x3fffffff)*3)>>30)-1; +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_bottombit_01(crypto_int64 crypto_int64_x) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("andq $1,%0" : "+r"(crypto_int64_x) : : "cc"); + return crypto_int64_x; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int64 crypto_int64_y; + __asm__ ("ubfx %0,%1,0,1" : "=r"(crypto_int64_y) : "r"(crypto_int64_x) : ); + return crypto_int64_y; +#else + crypto_int64_x &= 1 ^ crypto_int64_optblocker; + return crypto_int64_x; +#endif } +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_bitinrangepublicpos_mask(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_s) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("sarq %%cl,%0" : "+r"(crypto_int64_x) : "c"(crypto_int64_s) : "cc"); +#elif defined(__GNUC__) && defined(__aarch64__) + __asm__ ("asr %0,%0,%1" : "+r"(crypto_int64_x) : "r"(crypto_int64_s) : ); +#else + crypto_int64_x >>= crypto_int64_s ^ crypto_int64_optblocker; #endif + return crypto_int64_bottombit_mask(crypto_int64_x); +} -/* ----- Streamlined NTRU Prime Core */ +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_bitinrangepublicpos_01(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_s) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("sarq %%cl,%0" : "+r"(crypto_int64_x) : "c"(crypto_int64_s) : "cc"); +#elif defined(__GNUC__) && defined(__aarch64__) + __asm__ ("asr %0,%0,%1" : "+r"(crypto_int64_x) : "r"(crypto_int64_s) : ); +#else + crypto_int64_x >>= crypto_int64_s ^ crypto_int64_optblocker; +#endif + return crypto_int64_bottombit_01(crypto_int64_x); +} -#ifndef LPR +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_shlmod(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_s) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("shlq %%cl,%0" : "+r"(crypto_int64_x) : "c"(crypto_int64_s) : "cc"); +#elif defined(__GNUC__) && defined(__aarch64__) + __asm__ ("lsl %0,%0,%1" : "+r"(crypto_int64_x) : "r"(crypto_int64_s) : ); +#else + int crypto_int64_k, crypto_int64_l; + for (crypto_int64_l = 0,crypto_int64_k = 1;crypto_int64_k < 64;++crypto_int64_l,crypto_int64_k *= 2) + crypto_int64_x ^= (crypto_int64_x ^ (crypto_int64_x << crypto_int64_k)) & crypto_int64_bitinrangepublicpos_mask(crypto_int64_s,crypto_int64_l); +#endif + return crypto_int64_x; +} -/* h,(f,ginv) = KeyGen() */ -static void KeyGen(Fq *h,small *f,small *ginv) -{ - small g[p]; - Fq finv[p]; +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_shrmod(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_s) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("sarq %%cl,%0" : "+r"(crypto_int64_x) : "c"(crypto_int64_s) : "cc"); +#elif defined(__GNUC__) && defined(__aarch64__) + __asm__ ("asr %0,%0,%1" : "+r"(crypto_int64_x) : "r"(crypto_int64_s) : ); +#else + int crypto_int64_k, crypto_int64_l; + for (crypto_int64_l = 0,crypto_int64_k = 1;crypto_int64_k < 64;++crypto_int64_l,crypto_int64_k *= 2) + crypto_int64_x ^= (crypto_int64_x ^ (crypto_int64_x >> crypto_int64_k)) & crypto_int64_bitinrangepublicpos_mask(crypto_int64_s,crypto_int64_l); +#endif + return crypto_int64_x; +} + +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_bitmod_mask(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_s) { + crypto_int64_x = crypto_int64_shrmod(crypto_int64_x,crypto_int64_s); + return crypto_int64_bottombit_mask(crypto_int64_x); +} + +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_bitmod_01(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_s) { + crypto_int64_x = crypto_int64_shrmod(crypto_int64_x,crypto_int64_s); + return crypto_int64_bottombit_01(crypto_int64_x); +} + +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_nonzero_mask(crypto_int64 crypto_int64_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int64 crypto_int64_q,crypto_int64_z; + __asm__ ("xorq %0,%0\n movq $-1,%1\n testq %2,%2\n cmovneq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x) : "cc"); + return crypto_int64_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int64 crypto_int64_z; + __asm__ ("cmp %1,0\n csetm %0,ne" : "=r"(crypto_int64_z) : "r"(crypto_int64_x) : "cc"); + return crypto_int64_z; +#else + crypto_int64_x |= -crypto_int64_x; + return crypto_int64_negative_mask(crypto_int64_x); +#endif +} - for (;;) { - Small_random(g); - if (R3_recip(ginv,g) == 0) break; - } - Short_random(f); - Rq_recip3(finv,f); /* always works */ - Rq_mult_small(h,finv,g); +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_nonzero_01(crypto_int64 crypto_int64_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int64 crypto_int64_q,crypto_int64_z; + __asm__ ("xorq %0,%0\n movq $1,%1\n testq %2,%2\n cmovneq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x) : "cc"); + return crypto_int64_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int64 crypto_int64_z; + __asm__ ("cmp %1,0\n cset %0,ne" : "=r"(crypto_int64_z) : "r"(crypto_int64_x) : "cc"); + return crypto_int64_z; +#else + crypto_int64_x |= -crypto_int64_x; + return crypto_int64_unsigned_topbit_01(crypto_int64_x); +#endif } -/* c = Encrypt(r,h) */ -static void Encrypt(Fq *c,const small *r,const Fq *h) -{ - Fq hr[p]; +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_positive_mask(crypto_int64 crypto_int64_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int64 crypto_int64_q,crypto_int64_z; + __asm__ ("xorq %0,%0\n movq $-1,%1\n testq %2,%2\n cmovgq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x) : "cc"); + return crypto_int64_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int64 crypto_int64_z; + __asm__ ("cmp %1,0\n csetm %0,gt" : "=r"(crypto_int64_z) : "r"(crypto_int64_x) : "cc"); + return crypto_int64_z; +#else + crypto_int64 crypto_int64_z = -crypto_int64_x; + crypto_int64_z ^= crypto_int64_x & crypto_int64_z; + return crypto_int64_negative_mask(crypto_int64_z); +#endif +} - Rq_mult_small(hr,h,r); - Round(c,hr); +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_positive_01(crypto_int64 crypto_int64_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int64 crypto_int64_q,crypto_int64_z; + __asm__ ("xorq %0,%0\n movq $1,%1\n testq %2,%2\n cmovgq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x) : "cc"); + return crypto_int64_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int64 crypto_int64_z; + __asm__ ("cmp %1,0\n cset %0,gt" : "=r"(crypto_int64_z) : "r"(crypto_int64_x) : "cc"); + return crypto_int64_z; +#else + crypto_int64 crypto_int64_z = -crypto_int64_x; + crypto_int64_z ^= crypto_int64_x & crypto_int64_z; + return crypto_int64_unsigned_topbit_01(crypto_int64_z); +#endif } -/* r = Decrypt(c,(f,ginv)) */ -static void Decrypt(small *r,const Fq *c,const small *f,const small *ginv) -{ - Fq cf[p]; - Fq cf3[p]; - small e[p]; - small ev[p]; - int mask; - int i; +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_zero_mask(crypto_int64 crypto_int64_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int64 crypto_int64_q,crypto_int64_z; + __asm__ ("xorq %0,%0\n movq $-1,%1\n testq %2,%2\n cmoveq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x) : "cc"); + return crypto_int64_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int64 crypto_int64_z; + __asm__ ("cmp %1,0\n csetm %0,eq" : "=r"(crypto_int64_z) : "r"(crypto_int64_x) : "cc"); + return crypto_int64_z; +#else + return ~crypto_int64_nonzero_mask(crypto_int64_x); +#endif +} - Rq_mult_small(cf,c,f); - Rq_mult3(cf3,cf); - R3_fromRq(e,cf3); - R3_mult(ev,e,ginv); +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_zero_01(crypto_int64 crypto_int64_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int64 crypto_int64_q,crypto_int64_z; + __asm__ ("xorq %0,%0\n movq $1,%1\n testq %2,%2\n cmoveq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x) : "cc"); + return crypto_int64_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int64 crypto_int64_z; + __asm__ ("cmp %1,0\n cset %0,eq" : "=r"(crypto_int64_z) : "r"(crypto_int64_x) : "cc"); + return crypto_int64_z; +#else + return 1-crypto_int64_nonzero_01(crypto_int64_x); +#endif +} - mask = Weightw_mask(ev); /* 0 if weight w, else -1 */ - for (i = 0;i < w;++i) r[i] = ((ev[i]^1)&~mask)^1; - for (i = w;i < p;++i) r[i] = ev[i]&~mask; +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_unequal_mask(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int64 crypto_int64_q,crypto_int64_z; + __asm__ ("xorq %0,%0\n movq $-1,%1\n cmpq %3,%2\n cmovneq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc"); + return crypto_int64_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int64 crypto_int64_z; + __asm__ ("cmp %1,%2\n csetm %0,ne" : "=r"(crypto_int64_z) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc"); + return crypto_int64_z; +#else + return crypto_int64_nonzero_mask(crypto_int64_x ^ crypto_int64_y); +#endif } +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_unequal_01(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int64 crypto_int64_q,crypto_int64_z; + __asm__ ("xorq %0,%0\n movq $1,%1\n cmpq %3,%2\n cmovneq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc"); + return crypto_int64_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int64 crypto_int64_z; + __asm__ ("cmp %1,%2\n cset %0,ne" : "=r"(crypto_int64_z) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc"); + return crypto_int64_z; +#else + return crypto_int64_nonzero_01(crypto_int64_x ^ crypto_int64_y); #endif +} -/* ----- NTRU LPRime Core */ +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_equal_mask(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int64 crypto_int64_q,crypto_int64_z; + __asm__ ("xorq %0,%0\n movq $-1,%1\n cmpq %3,%2\n cmoveq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc"); + return crypto_int64_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int64 crypto_int64_z; + __asm__ ("cmp %1,%2\n csetm %0,eq" : "=r"(crypto_int64_z) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc"); + return crypto_int64_z; +#else + return ~crypto_int64_unequal_mask(crypto_int64_x,crypto_int64_y); +#endif +} -#ifdef LPR +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_equal_01(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int64 crypto_int64_q,crypto_int64_z; + __asm__ ("xorq %0,%0\n movq $1,%1\n cmpq %3,%2\n cmoveq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc"); + return crypto_int64_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int64 crypto_int64_z; + __asm__ ("cmp %1,%2\n cset %0,eq" : "=r"(crypto_int64_z) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc"); + return crypto_int64_z; +#else + return 1-crypto_int64_unequal_01(crypto_int64_x,crypto_int64_y); +#endif +} -/* (G,A),a = KeyGen(G); leaves G unchanged */ -static void KeyGen(Fq *A,small *a,const Fq *G) -{ - Fq aG[p]; +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_min(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("cmpq %1,%0\n cmovgq %1,%0" : "+r"(crypto_int64_x) : "r"(crypto_int64_y) : "cc"); + return crypto_int64_x; +#elif defined(__GNUC__) && defined(__aarch64__) + __asm__ ("cmp %0,%1\n csel %0,%0,%1,lt" : "+r"(crypto_int64_x) : "r"(crypto_int64_y) : "cc"); + return crypto_int64_x; +#else + crypto_int64 crypto_int64_r = crypto_int64_y ^ crypto_int64_x; + crypto_int64 crypto_int64_z = crypto_int64_y - crypto_int64_x; + crypto_int64_z ^= crypto_int64_r & (crypto_int64_z ^ crypto_int64_y); + crypto_int64_z = crypto_int64_negative_mask(crypto_int64_z); + crypto_int64_z &= crypto_int64_r; + return crypto_int64_x ^ crypto_int64_z; +#endif +} - Short_random(a); - Rq_mult_small(aG,G,a); - Round(A,aG); +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_max(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) { +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ("cmpq %1,%0\n cmovlq %1,%0" : "+r"(crypto_int64_x) : "r"(crypto_int64_y) : "cc"); + return crypto_int64_x; +#elif defined(__GNUC__) && defined(__aarch64__) + __asm__ ("cmp %0,%1\n csel %0,%1,%0,lt" : "+r"(crypto_int64_x) : "r"(crypto_int64_y) : "cc"); + return crypto_int64_x; +#else + crypto_int64 crypto_int64_r = crypto_int64_y ^ crypto_int64_x; + crypto_int64 crypto_int64_z = crypto_int64_y - crypto_int64_x; + crypto_int64_z ^= crypto_int64_r & (crypto_int64_z ^ crypto_int64_y); + crypto_int64_z = crypto_int64_negative_mask(crypto_int64_z); + crypto_int64_z &= crypto_int64_r; + return crypto_int64_y ^ crypto_int64_z; +#endif } -/* B,T = Encrypt(r,(G,A),b) */ -static void Encrypt(Fq *B,int8 *T,const int8 *r,const Fq *G,const Fq *A,const small *b) -{ - Fq bG[p]; - Fq bA[p]; - int i; +__attribute__((unused)) +static inline +void crypto_int64_minmax(crypto_int64 *crypto_int64_p,crypto_int64 *crypto_int64_q) { + crypto_int64 crypto_int64_x = *crypto_int64_p; + crypto_int64 crypto_int64_y = *crypto_int64_q; +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int64 crypto_int64_z; + __asm__ ("cmpq %2,%1\n movq %1,%0\n cmovgq %2,%1\n cmovgq %0,%2" : "=&r"(crypto_int64_z), "+&r"(crypto_int64_x), "+r"(crypto_int64_y) : : "cc"); + *crypto_int64_p = crypto_int64_x; + *crypto_int64_q = crypto_int64_y; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int64 crypto_int64_r, crypto_int64_s; + __asm__ ("cmp %2,%3\n csel %0,%2,%3,lt\n csel %1,%3,%2,lt" : "=&r"(crypto_int64_r), "=r"(crypto_int64_s) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc"); + *crypto_int64_p = crypto_int64_r; + *crypto_int64_q = crypto_int64_s; +#else + crypto_int64 crypto_int64_r = crypto_int64_y ^ crypto_int64_x; + crypto_int64 crypto_int64_z = crypto_int64_y - crypto_int64_x; + crypto_int64_z ^= crypto_int64_r & (crypto_int64_z ^ crypto_int64_y); + crypto_int64_z = crypto_int64_negative_mask(crypto_int64_z); + crypto_int64_z &= crypto_int64_r; + crypto_int64_x ^= crypto_int64_z; + crypto_int64_y ^= crypto_int64_z; + *crypto_int64_p = crypto_int64_x; + *crypto_int64_q = crypto_int64_y; +#endif +} - Rq_mult_small(bG,G,b); - Round(B,bG); - Rq_mult_small(bA,A,b); - for (i = 0;i < I;++i) T[i] = Top(Fq_freeze(bA[i]+r[i]*q12)); +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_smaller_mask(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int64 crypto_int64_q,crypto_int64_z; + __asm__ ("xorq %0,%0\n movq $-1,%1\n cmpq %3,%2\n cmovlq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc"); + return crypto_int64_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int64 crypto_int64_z; + __asm__ ("cmp %1,%2\n csetm %0,lt" : "=r"(crypto_int64_z) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc"); + return crypto_int64_z; +#else + crypto_int64 crypto_int64_r = crypto_int64_x ^ crypto_int64_y; + crypto_int64 crypto_int64_z = crypto_int64_x - crypto_int64_y; + crypto_int64_z ^= crypto_int64_r & (crypto_int64_z ^ crypto_int64_x); + return crypto_int64_negative_mask(crypto_int64_z); +#endif } -/* r = Decrypt((B,T),a) */ -static void Decrypt(int8 *r,const Fq *B,const int8 *T,const small *a) -{ - Fq aB[p]; - int i; +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_smaller_01(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int64 crypto_int64_q,crypto_int64_z; + __asm__ ("xorq %0,%0\n movq $1,%1\n cmpq %3,%2\n cmovlq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc"); + return crypto_int64_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int64 crypto_int64_z; + __asm__ ("cmp %1,%2\n cset %0,lt" : "=r"(crypto_int64_z) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc"); + return crypto_int64_z; +#else + crypto_int64 crypto_int64_r = crypto_int64_x ^ crypto_int64_y; + crypto_int64 crypto_int64_z = crypto_int64_x - crypto_int64_y; + crypto_int64_z ^= crypto_int64_r & (crypto_int64_z ^ crypto_int64_x); + return crypto_int64_unsigned_topbit_01(crypto_int64_z); +#endif +} - Rq_mult_small(aB,B,a); - for (i = 0;i < I;++i) - r[i] = -int16_negative_mask(Fq_freeze(Right(T[i])-aB[i]+4*w+1)); +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_leq_mask(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int64 crypto_int64_q,crypto_int64_z; + __asm__ ("xorq %0,%0\n movq $-1,%1\n cmpq %3,%2\n cmovleq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc"); + return crypto_int64_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int64 crypto_int64_z; + __asm__ ("cmp %1,%2\n csetm %0,le" : "=r"(crypto_int64_z) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc"); + return crypto_int64_z; +#else + return ~crypto_int64_smaller_mask(crypto_int64_y,crypto_int64_x); +#endif } +__attribute__((unused)) +static inline +crypto_int64 crypto_int64_leq_01(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int64 crypto_int64_q,crypto_int64_z; + __asm__ ("xorq %0,%0\n movq $1,%1\n cmpq %3,%2\n cmovleq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc"); + return crypto_int64_z; +#elif defined(__GNUC__) && defined(__aarch64__) + crypto_int64 crypto_int64_z; + __asm__ ("cmp %1,%2\n cset %0,le" : "=r"(crypto_int64_z) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc"); + return crypto_int64_z; +#else + return 1-crypto_int64_smaller_01(crypto_int64_y,crypto_int64_x); #endif +} -/* ----- encoding I-bit inputs */ +__attribute__((unused)) +static inline +int crypto_int64_ones_num(crypto_int64 crypto_int64_x) { + crypto_int64_unsigned crypto_int64_y = crypto_int64_x; + const crypto_int64 C0 = 0x5555555555555555; + const crypto_int64 C1 = 0x3333333333333333; + const crypto_int64 C2 = 0x0f0f0f0f0f0f0f0f; + crypto_int64_y -= ((crypto_int64_y >> 1) & C0); + crypto_int64_y = (crypto_int64_y & C1) + ((crypto_int64_y >> 2) & C1); + crypto_int64_y = (crypto_int64_y + (crypto_int64_y >> 4)) & C2; + crypto_int64_y += crypto_int64_y >> 8; + crypto_int64_y += crypto_int64_y >> 16; + crypto_int64_y = (crypto_int64_y + (crypto_int64_y >> 32)) & 0xff; + return crypto_int64_y; +} + +__attribute__((unused)) +static inline +int crypto_int64_bottomzeros_num(crypto_int64 crypto_int64_x) { +#if defined(__GNUC__) && defined(__x86_64__) + crypto_int64 fallback = 64; + __asm__ ("bsfq %0,%0\n cmoveq %1,%0" : "+&r"(crypto_int64_x) : "r"(fallback) : "cc"); + return crypto_int64_x; +#elif defined(__GNUC__) && defined(__aarch64__) + int64_t crypto_int64_y; + __asm__ ("rbit %0,%1\n clz %0,%0" : "=r"(crypto_int64_y) : "r"(crypto_int64_x) : ); + return crypto_int64_y; +#else + crypto_int64 crypto_int64_y = crypto_int64_x ^ (crypto_int64_x-1); + crypto_int64_y = ((crypto_int64) crypto_int64_y) >> 1; + crypto_int64_y &= ~(crypto_int64_x & (((crypto_int64) 1) << (64-1))); + return crypto_int64_ones_num(crypto_int64_y); +#endif +} -#ifdef LPR +#endif -#define Inputs_bytes (I/8) -typedef int8 Inputs[I]; /* passed by reference */ +/* from supercop-20240808/crypto_sort/int32/portable4/sort.c */ +#define int32_MINMAX(a,b) crypto_int32_minmax(&a,&b) -static void Inputs_encode(unsigned char *s,const Inputs r) +static void crypto_sort_int32(void *array,long long n) { - int i; - for (i = 0;i < Inputs_bytes;++i) s[i] = 0; - for (i = 0;i < I;++i) s[i>>3] |= r[i]<<(i&7); -} - -#endif + long long top,p,q,r,i,j; + int32 *x = array; -/* ----- Expand */ + if (n < 2) return; + top = 1; + while (top < n - top) top += top; -#ifdef LPR + for (p = top;p >= 1;p >>= 1) { + i = 0; + while (i + 2 * p <= n) { + for (j = i;j < i + p;++j) + int32_MINMAX(x[j],x[j+p]); + i += 2 * p; + } + for (j = i;j < n - p;++j) + int32_MINMAX(x[j],x[j+p]); -static const unsigned char aes_nonce[16] = {0}; + i = 0; + j = 0; + for (q = top;q > p;q >>= 1) { + if (j != i) for (;;) { + if (j == n - q) goto done; + int32 a = x[j + p]; + for (r = q;r > p;r >>= 1) + int32_MINMAX(a,x[j + r]); + x[j + p] = a; + ++j; + if (j == i + p) { + i += 2 * p; + break; + } + } + while (i + p <= n - q) { + for (j = i;j < i + p;++j) { + int32 a = x[j + p]; + for (r = q;r > p;r >>= 1) + int32_MINMAX(a,x[j+r]); + x[j + p] = a; + } + i += 2 * p; + } + /* now i + p > n - q */ + j = i; + while (j < n - q) { + int32 a = x[j + p]; + for (r = q;r > p;r >>= 1) + int32_MINMAX(a,x[j+r]); + x[j + p] = a; + ++j; + } -static void Expand(uint32 *L,const unsigned char *k) -{ - int i; - crypto_stream_aes256ctr((unsigned char *) L,4*p,aes_nonce,k); - for (i = 0;i < p;++i) { - uint32 L0 = ((unsigned char *) L)[4*i]; - uint32 L1 = ((unsigned char *) L)[4*i+1]; - uint32 L2 = ((unsigned char *) L)[4*i+2]; - uint32 L3 = ((unsigned char *) L)[4*i+3]; - L[i] = L0+(L1<<8)+(L2<<16)+(L3<<24); + done: ; + } } } -#endif - -/* ----- Seeds */ - -#ifdef LPR +/* from supercop-20240808/crypto_sort/uint32/useint32/sort.c */ -#define Seeds_bytes 32 +/* can save time by vectorizing xor loops */ +/* can save time by integrating xor loops with int32_sort */ -static void Seeds_random(unsigned char *s) +static void crypto_sort_uint32(void *array,long long n) { - randombytes(s,Seeds_bytes); + crypto_uint32 *x = array; + long long j; + for (j = 0;j < n;++j) x[j] ^= 0x80000000; + crypto_sort_int32(array,n); + for (j = 0;j < n;++j) x[j] ^= 0x80000000; } -#endif - -/* ----- Generator, HashShort */ - -#ifdef LPR +/* from supercop-20240808/crypto_kem/sntrup761/compact/kem.c */ +// 20240806 djb: some automated conversion to cryptoint -/* G = Generator(k) */ -static void Generator(Fq *G,const unsigned char *k) -{ - uint32 L[p]; - int i; +#define p 761 +#define q 4591 +#define w 286 +#define q12 ((q - 1) / 2) +typedef int8_t small; +typedef int16_t Fq; +#define Hash_bytes 32 +#define Small_bytes ((p + 3) / 4) +typedef small Inputs[p]; +#define SecretKeys_bytes (2 * Small_bytes) +#define Confirm_bytes 32 - Expand(L,k); - for (i = 0;i < p;++i) G[i] = uint32_mod_uint14(L[i],q)-q12; +static small F3_freeze(int16_t x) { return x - 3 * ((10923 * x + 16384) >> 15); } + +static Fq Fq_freeze(int32_t x) { + const int32_t q16 = (0x10000 + q / 2) / q; + const int32_t q20 = (0x100000 + q / 2) / q; + const int32_t q28 = (0x10000000 + q / 2) / q; + x -= q * ((q16 * x) >> 16); + x -= q * ((q20 * x) >> 20); + return x - q * ((q28 * x + 0x8000000) >> 28); +} + +static int Weightw_mask(small *r) { + int i, weight = 0; + for (i = 0; i < p; ++i) weight += crypto_int64_bottombit_01(r[i]); + return crypto_int16_nonzero_mask(weight - w); +} + +static void uint32_divmod_uint14(uint32_t *Q, uint16_t *r, uint32_t x, uint16_t m) { + uint32_t qpart, mask, v = 0x80000000 / m; + qpart = (x * (uint64_t)v) >> 31; + x -= qpart * m; + *Q = qpart; + qpart = (x * (uint64_t)v) >> 31; + x -= qpart * m; + *Q += qpart; + x -= m; + *Q += 1; + mask = crypto_int32_negative_mask(x); + x += mask & (uint32_t)m; + *Q += mask; + *r = x; } -/* out = HashShort(r) */ -static void HashShort(small *out,const Inputs r) -{ - unsigned char s[Inputs_bytes]; - unsigned char h[Hash_bytes]; - uint32 L[p]; - - Inputs_encode(s,r); - Hash_prefix(h,5,s,sizeof s); - Expand(L,h); - Short_fromlist(out,L); +static uint16_t uint32_mod_uint14(uint32_t x, uint16_t m) { + uint32_t Q; + uint16_t r; + uint32_divmod_uint14(&Q, &r, x, m); + return r; } -#endif - -/* ----- NTRU LPRime Expand */ - -#ifdef LPR - -/* (S,A),a = XKeyGen() */ -static void XKeyGen(unsigned char *S,Fq *A,small *a) -{ - Fq G[p]; - - Seeds_random(S); - Generator(G,S); - KeyGen(A,a,G); +static void Encode(unsigned char *out, const uint16_t *R, const uint16_t *M, long long len) { + if (len == 1) { + uint16_t r = R[0], m = M[0]; + while (m > 1) { + *out++ = r; + r >>= 8; + m = (m + 255) >> 8; + } + } + if (len > 1) { + uint16_t R2[(len + 1) / 2], M2[(len + 1) / 2]; + long long i; + for (i = 0; i < len - 1; i += 2) { + uint32_t m0 = M[i]; + uint32_t r = R[i] + R[i + 1] * m0; + uint32_t m = M[i + 1] * m0; + while (m >= 16384) { + *out++ = r; + r >>= 8; + m = (m + 255) >> 8; + } + R2[i / 2] = r; + M2[i / 2] = m; + } + if (i < len) { + R2[i / 2] = R[i]; + M2[i / 2] = M[i]; + } + Encode(out, R2, M2, (len + 1) / 2); + } } -/* B,T = XEncrypt(r,(S,A)) */ -static void XEncrypt(Fq *B,int8 *T,const int8 *r,const unsigned char *S,const Fq *A) -{ - Fq G[p]; - small b[p]; - - Generator(G,S); - HashShort(b,r); - Encrypt(B,T,r,G,A,b); +static void Decode(uint16_t *out, const unsigned char *S, const uint16_t *M, long long len) { + if (len == 1) { + if (M[0] == 1) + *out = 0; + else if (M[0] <= 256) + *out = uint32_mod_uint14(S[0], M[0]); + else + *out = uint32_mod_uint14(S[0] + (((uint16_t)S[1]) << 8), M[0]); + } + if (len > 1) { + uint16_t R2[(len + 1) / 2], M2[(len + 1) / 2], bottomr[len / 2]; + uint32_t bottomt[len / 2]; + long long i; + for (i = 0; i < len - 1; i += 2) { + uint32_t m = M[i] * (uint32_t)M[i + 1]; + if (m > 256 * 16383) { + bottomt[i / 2] = 256 * 256; + bottomr[i / 2] = S[0] + 256 * S[1]; + S += 2; + M2[i / 2] = (((m + 255) >> 8) + 255) >> 8; + } else if (m >= 16384) { + bottomt[i / 2] = 256; + bottomr[i / 2] = S[0]; + S += 1; + M2[i / 2] = (m + 255) >> 8; + } else { + bottomt[i / 2] = 1; + bottomr[i / 2] = 0; + M2[i / 2] = m; + } + } + if (i < len) M2[i / 2] = M[i]; + Decode(R2, S, M2, (len + 1) / 2); + for (i = 0; i < len - 1; i += 2) { + uint32_t r1, r = bottomr[i / 2]; + uint16_t r0; + r += bottomt[i / 2] * R2[i / 2]; + uint32_divmod_uint14(&r1, &r0, r, M[i]); + r1 = uint32_mod_uint14(r1, M[i + 1]); + *out++ = r0; + *out++ = r1; + } + if (i < len) *out++ = R2[i / 2]; + } } -#define XDecrypt Decrypt - -#endif +static void R3_fromRq(small *out, const Fq *r) { + int i; + for (i = 0; i < p; ++i) out[i] = F3_freeze(r[i]); +} -/* ----- encoding small polynomials (including short polynomials) */ +static void R3_mult(small *h, const small *f, const small *g) { + int16_t fg[p + p - 1]; + int i, j; + for (i = 0; i < p + p - 1; ++i) fg[i] = 0; + for (i = 0; i < p; ++i) + for (j = 0; j < p; ++j) fg[i + j] += f[i] * (int16_t)g[j]; + for (i = p; i < p + p - 1; ++i) fg[i - p] += fg[i]; + for (i = p; i < p + p - 1; ++i) fg[i - p + 1] += fg[i]; + for (i = 0; i < p; ++i) h[i] = F3_freeze(fg[i]); +} -#define Small_bytes ((p+3)/4) +static int R3_recip(small *out, const small *in) { + small f[p + 1], g[p + 1], v[p + 1], r[p + 1]; + int sign, swap, t, i, loop, delta = 1; + for (i = 0; i < p + 1; ++i) v[i] = 0; + for (i = 0; i < p + 1; ++i) r[i] = 0; + r[0] = 1; + for (i = 0; i < p; ++i) f[i] = 0; + f[0] = 1; + f[p - 1] = f[p] = -1; + for (i = 0; i < p; ++i) g[p - 1 - i] = in[i]; + g[p] = 0; + for (loop = 0; loop < 2 * p - 1; ++loop) { + for (i = p; i > 0; --i) v[i] = v[i - 1]; + v[0] = 0; + sign = -g[0] * f[0]; + swap = crypto_int16_negative_mask(-delta) & crypto_int16_nonzero_mask(g[0]); + delta ^= swap & (delta ^ -delta); + delta += 1; + for (i = 0; i < p + 1; ++i) { + t = swap & (f[i] ^ g[i]); + f[i] ^= t; + g[i] ^= t; + t = swap & (v[i] ^ r[i]); + v[i] ^= t; + r[i] ^= t; + } + for (i = 0; i < p + 1; ++i) g[i] = F3_freeze(g[i] + sign * f[i]); + for (i = 0; i < p + 1; ++i) r[i] = F3_freeze(r[i] + sign * v[i]); + for (i = 0; i < p; ++i) g[i] = g[i + 1]; + g[p] = 0; + } + sign = f[0]; + for (i = 0; i < p; ++i) out[i] = sign * v[p - 1 - i]; + return crypto_int16_nonzero_mask(delta); +} -/* these are the only functions that rely on p mod 4 = 1 */ +static void Rq_mult_small(Fq *h, const Fq *f, const small *g) { + int32_t fg[p + p - 1]; + int i, j; + for (i = 0; i < p + p - 1; ++i) fg[i] = 0; + for (i = 0; i < p; ++i) + for (j = 0; j < p; ++j) fg[i + j] += f[i] * (int32_t)g[j]; + for (i = p; i < p + p - 1; ++i) fg[i - p] += fg[i]; + for (i = p; i < p + p - 1; ++i) fg[i - p + 1] += fg[i]; + for (i = 0; i < p; ++i) h[i] = Fq_freeze(fg[i]); +} -static void Small_encode(unsigned char *s,const small *f) -{ - small x; +static void Rq_mult3(Fq *h, const Fq *f) { int i; + for (i = 0; i < p; ++i) h[i] = Fq_freeze(3 * f[i]); +} - for (i = 0;i < p/4;++i) { - x = *f++ + 1; - x += (*f++ + 1)<<2; - x += (*f++ + 1)<<4; - x += (*f++ + 1)<<6; - *s++ = x; +static Fq Fq_recip(Fq a1) { + int i = 1; + Fq ai = a1; + while (i < q - 2) { + ai = Fq_freeze(a1 * (int32_t)ai); + i += 1; } - x = *f++ + 1; - *s++ = x; + return ai; } -static void Small_decode(small *f,const unsigned char *s) -{ - unsigned char x; - int i; - - for (i = 0;i < p/4;++i) { - x = *s++; - *f++ = ((small)(x&3))-1; x >>= 2; - *f++ = ((small)(x&3))-1; x >>= 2; - *f++ = ((small)(x&3))-1; x >>= 2; - *f++ = ((small)(x&3))-1; +static int Rq_recip3(Fq *out, const small *in) { + Fq f[p + 1], g[p + 1], v[p + 1], r[p + 1], scale; + int swap, t, i, loop, delta = 1; + int32_t f0, g0; + for (i = 0; i < p + 1; ++i) v[i] = 0; + for (i = 0; i < p + 1; ++i) r[i] = 0; + r[0] = Fq_recip(3); + for (i = 0; i < p; ++i) f[i] = 0; + f[0] = 1; + f[p - 1] = f[p] = -1; + for (i = 0; i < p; ++i) g[p - 1 - i] = in[i]; + g[p] = 0; + for (loop = 0; loop < 2 * p - 1; ++loop) { + for (i = p; i > 0; --i) v[i] = v[i - 1]; + v[0] = 0; + swap = crypto_int16_negative_mask(-delta) & crypto_int16_nonzero_mask(g[0]); + delta ^= swap & (delta ^ -delta); + delta += 1; + for (i = 0; i < p + 1; ++i) { + t = swap & (f[i] ^ g[i]); + f[i] ^= t; + g[i] ^= t; + t = swap & (v[i] ^ r[i]); + v[i] ^= t; + r[i] ^= t; + } + f0 = f[0]; + g0 = g[0]; + for (i = 0; i < p + 1; ++i) g[i] = Fq_freeze(f0 * g[i] - g0 * f[i]); + for (i = 0; i < p + 1; ++i) r[i] = Fq_freeze(f0 * r[i] - g0 * v[i]); + for (i = 0; i < p; ++i) g[i] = g[i + 1]; + g[p] = 0; } - x = *s++; - *f++ = ((small)(x&3))-1; + scale = Fq_recip(f[0]); + for (i = 0; i < p; ++i) out[i] = Fq_freeze(scale * (int32_t)v[p - 1 - i]); + return crypto_int16_nonzero_mask(delta); } -/* ----- encoding general polynomials */ - -#ifndef LPR - -static void Rq_encode(unsigned char *s,const Fq *r) -{ - uint16 R[p],M[p]; +static void Round(Fq *out, const Fq *a) { int i; - - for (i = 0;i < p;++i) R[i] = r[i]+q12; - for (i = 0;i < p;++i) M[i] = q; - Encode(s,R,M,p); + for (i = 0; i < p; ++i) out[i] = a[i] - F3_freeze(a[i]); } -static void Rq_decode(Fq *r,const unsigned char *s) -{ - uint16 R[p],M[p]; +static void Short_fromlist(small *out, const uint32_t *in) { + uint32_t L[p]; int i; - - for (i = 0;i < p;++i) M[i] = q; - Decode(R,s,M,p); - for (i = 0;i < p;++i) r[i] = ((Fq)R[i])-q12; + for (i = 0; i < w; ++i) L[i] = in[i] & (uint32_t)-2; + for (i = w; i < p; ++i) L[i] = (in[i] & (uint32_t)-3) | 1; + crypto_sort_uint32(L, p); + for (i = 0; i < p; ++i) out[i] = (L[i] & 3) - 1; } -#endif - -/* ----- encoding rounded polynomials */ - -static void Rounded_encode(unsigned char *s,const Fq *r) -{ - uint16 R[p],M[p]; +static void Hash_prefix(unsigned char *out, int b, const unsigned char *in, int inlen) { + unsigned char x[inlen + 1], h[64]; int i; - - for (i = 0;i < p;++i) R[i] = ((r[i]+q12)*10923)>>15; - for (i = 0;i < p;++i) M[i] = (q+2)/3; - Encode(s,R,M,p); + x[0] = b; + for (i = 0; i < inlen; ++i) x[i + 1] = in[i]; + crypto_hash_sha512(h, x, inlen + 1); + for (i = 0; i < 32; ++i) out[i] = h[i]; } -static void Rounded_decode(Fq *r,const unsigned char *s) -{ - uint16 R[p],M[p]; +static uint32_t urandom32(void) { + unsigned char c[4]; + uint32_t result = 0; int i; - - for (i = 0;i < p;++i) M[i] = (q+2)/3; - Decode(R,s,M,p); - for (i = 0;i < p;++i) r[i] = R[i]*3-q12; + randombytes(c, 4); + for (i = 0; i < 4; ++i) result += ((uint32_t)c[i]) << (8 * i); + return result; } -/* ----- encoding top polynomials */ - -#ifdef LPR - -#define Top_bytes (I/2) - -static void Top_encode(unsigned char *s,const int8 *T) -{ +static void Short_random(small *out) { + uint32_t L[p]; int i; - for (i = 0;i < Top_bytes;++i) - s[i] = T[2*i]+(T[2*i+1]<<4); + for (i = 0; i < p; ++i) L[i] = urandom32(); + Short_fromlist(out, L); } -static void Top_decode(int8 *T,const unsigned char *s) -{ +static void Small_random(small *out) { int i; - for (i = 0;i < Top_bytes;++i) { - T[2*i] = s[i]&15; - T[2*i+1] = s[i]>>4; - } + for (i = 0; i < p; ++i) out[i] = (((urandom32() & 0x3fffffff) * 3) >> 30) - 1; } -#endif - -/* ----- Streamlined NTRU Prime Core plus encoding */ - -#ifndef LPR - -typedef small Inputs[p]; /* passed by reference */ -#define Inputs_random Short_random -#define Inputs_encode Small_encode -#define Inputs_bytes Small_bytes - -#define Ciphertexts_bytes Rounded_bytes -#define SecretKeys_bytes (2*Small_bytes) -#define PublicKeys_bytes Rq_bytes - -/* pk,sk = ZKeyGen() */ -static void ZKeyGen(unsigned char *pk,unsigned char *sk) -{ - Fq h[p]; - small f[p],v[p]; - - KeyGen(h,f,v); - Rq_encode(pk,h); - Small_encode(sk,f); sk += Small_bytes; - Small_encode(sk,v); +static void KeyGen(Fq *h, small *f, small *ginv) { + small g[p]; + Fq finv[p]; + for (;;) { + int result; + Small_random(g); + result = R3_recip(ginv, g); + crypto_declassify(&result, sizeof result); + if (result == 0) break; + } + Short_random(f); + Rq_recip3(finv, f); + Rq_mult_small(h, finv, g); } -/* C = ZEncrypt(r,pk) */ -static void ZEncrypt(unsigned char *C,const Inputs r,const unsigned char *pk) -{ - Fq h[p]; - Fq c[p]; - Rq_decode(h,pk); - Encrypt(c,r,h); - Rounded_encode(C,c); +static void Encrypt(Fq *c, const small *r, const Fq *h) { + Fq hr[p]; + Rq_mult_small(hr, h, r); + Round(c, hr); +} + +static void Decrypt(small *r, const Fq *c, const small *f, const small *ginv) { + Fq cf[p], cf3[p]; + small e[p], ev[p]; + int mask, i; + Rq_mult_small(cf, c, f); + Rq_mult3(cf3, cf); + R3_fromRq(e, cf3); + R3_mult(ev, e, ginv); + mask = Weightw_mask(ev); + for (i = 0; i < w; ++i) r[i] = ((ev[i] ^ 1) & ~mask) ^ 1; + for (i = w; i < p; ++i) r[i] = ev[i] & ~mask; +} + +static void Small_encode(unsigned char *s, const small *f) { + int i, j; + for (i = 0; i < p / 4; ++i) { + small x = 0; + for (j = 0;j < 4;++j) x += (*f++ + 1) << (2 * j); + *s++ = x; + } + *s = *f++ + 1; } -/* r = ZDecrypt(C,sk) */ -static void ZDecrypt(Inputs r,const unsigned char *C,const unsigned char *sk) -{ - small f[p],v[p]; - Fq c[p]; - - Small_decode(f,sk); sk += Small_bytes; - Small_decode(v,sk); - Rounded_decode(c,C); - Decrypt(r,c,f,v); +static void Small_decode(small *f, const unsigned char *s) { + int i, j; + for (i = 0; i < p / 4; ++i) { + unsigned char x = *s++; + for (j = 0;j < 4;++j) *f++ = ((small)((x >> (2 * j)) & 3)) - 1; + } + *f++ = ((small)(*s & 3)) - 1; } -#endif - -/* ----- NTRU LPRime Expand plus encoding */ - -#ifdef LPR - -#define Ciphertexts_bytes (Rounded_bytes+Top_bytes) -#define SecretKeys_bytes Small_bytes -#define PublicKeys_bytes (Seeds_bytes+Rounded_bytes) - -static void Inputs_random(Inputs r) -{ - unsigned char s[Inputs_bytes]; +static void Rq_encode(unsigned char *s, const Fq *r) { + uint16_t R[p], M[p]; int i; - - randombytes(s,sizeof s); - for (i = 0;i < I;++i) r[i] = 1&(s[i>>3]>>(i&7)); + for (i = 0; i < p; ++i) R[i] = r[i] + q12; + for (i = 0; i < p; ++i) M[i] = q; + Encode(s, R, M, p); } -/* pk,sk = ZKeyGen() */ -static void ZKeyGen(unsigned char *pk,unsigned char *sk) -{ - Fq A[p]; - small a[p]; - - XKeyGen(pk,A,a); pk += Seeds_bytes; - Rounded_encode(pk,A); - Small_encode(sk,a); -} - -/* c = ZEncrypt(r,pk) */ -static void ZEncrypt(unsigned char *c,const Inputs r,const unsigned char *pk) -{ - Fq A[p]; - Fq B[p]; - int8 T[I]; - - Rounded_decode(A,pk+Seeds_bytes); - XEncrypt(B,T,r,pk,A); - Rounded_encode(c,B); c += Rounded_bytes; - Top_encode(c,T); +static void Rq_decode(Fq *r, const unsigned char *s) { + uint16_t R[p], M[p]; + int i; + for (i = 0; i < p; ++i) M[i] = q; + Decode(R, s, M, p); + for (i = 0; i < p; ++i) r[i] = ((Fq)R[i]) - q12; } -/* r = ZDecrypt(C,sk) */ -static void ZDecrypt(Inputs r,const unsigned char *c,const unsigned char *sk) -{ - small a[p]; - Fq B[p]; - int8 T[I]; - - Small_decode(a,sk); - Rounded_decode(B,c); - Top_decode(T,c+Rounded_bytes); - XDecrypt(r,B,T,a); +static void Rounded_encode(unsigned char *s, const Fq *r) { + uint16_t R[p], M[p]; + int i; + for (i = 0; i < p; ++i) R[i] = ((r[i] + q12) * 10923) >> 15; + for (i = 0; i < p; ++i) M[i] = (q + 2) / 3; + Encode(s, R, M, p); } -#endif - -/* ----- confirmation hash */ - -#define Confirm_bytes 32 - -/* h = HashConfirm(r,pk,cache); cache is Hash4(pk) */ -static void HashConfirm(unsigned char *h,const unsigned char *r,const unsigned char *pk,const unsigned char *cache) -{ -#ifndef LPR - unsigned char x[Hash_bytes*2]; +static void Rounded_decode(Fq *r, const unsigned char *s) { + uint16_t R[p], M[p]; int i; + for (i = 0; i < p; ++i) M[i] = (q + 2) / 3; + Decode(R, s, M, p); + for (i = 0; i < p; ++i) r[i] = R[i] * 3 - q12; +} - Hash_prefix(x,3,r,Inputs_bytes); - for (i = 0;i < Hash_bytes;++i) x[Hash_bytes+i] = cache[i]; -#else - unsigned char x[Inputs_bytes+Hash_bytes]; - int i; +static void ZKeyGen(unsigned char *pk, unsigned char *sk) { + Fq h[p]; + small f[p], v[p]; + KeyGen(h, f, v); + Rq_encode(pk, h); + Small_encode(sk, f); + Small_encode(sk + Small_bytes, v); +} - for (i = 0;i < Inputs_bytes;++i) x[i] = r[i]; - for (i = 0;i < Hash_bytes;++i) x[Inputs_bytes+i] = cache[i]; -#endif - Hash_prefix(h,2,x,sizeof x); +static void ZEncrypt(unsigned char *C, const Inputs r, const unsigned char *pk) { + Fq h[p], c[p]; + Rq_decode(h, pk); + Encrypt(c, r, h); + Rounded_encode(C, c); } -/* ----- session-key hash */ +static void ZDecrypt(Inputs r, const unsigned char *C, const unsigned char *sk) { + small f[p], v[p]; + Fq c[p]; + Small_decode(f, sk); + Small_decode(v, sk + Small_bytes); + Rounded_decode(c, C); + Decrypt(r, c, f, v); +} -/* k = HashSession(b,y,z) */ -static void HashSession(unsigned char *k,int b,const unsigned char *y,const unsigned char *z) -{ -#ifndef LPR - unsigned char x[Hash_bytes+Ciphertexts_bytes+Confirm_bytes]; +static void HashConfirm(unsigned char *h, const unsigned char *r, const unsigned char *cache) { + unsigned char x[Hash_bytes * 2]; int i; + Hash_prefix(x, 3, r, Small_bytes); + for (i = 0; i < Hash_bytes; ++i) x[Hash_bytes + i] = cache[i]; + Hash_prefix(h, 2, x, sizeof x); +} - Hash_prefix(x,3,y,Inputs_bytes); - for (i = 0;i < Ciphertexts_bytes+Confirm_bytes;++i) x[Hash_bytes+i] = z[i]; -#else - unsigned char x[Inputs_bytes+Ciphertexts_bytes+Confirm_bytes]; +static void HashSession(unsigned char *k, int b, const unsigned char *y, const unsigned char *z) { + unsigned char x[Hash_bytes + crypto_kem_sntrup761_CIPHERTEXTBYTES]; int i; - - for (i = 0;i < Inputs_bytes;++i) x[i] = y[i]; - for (i = 0;i < Ciphertexts_bytes+Confirm_bytes;++i) x[Inputs_bytes+i] = z[i]; -#endif - Hash_prefix(k,b,x,sizeof x); + Hash_prefix(x, 3, y, Small_bytes); + for (i = 0; i < crypto_kem_sntrup761_CIPHERTEXTBYTES; ++i) x[Hash_bytes + i] = z[i]; + Hash_prefix(k, b, x, sizeof x); } -/* ----- Streamlined NTRU Prime and NTRU LPRime */ - -/* pk,sk = KEM_KeyGen() */ -static void KEM_KeyGen(unsigned char *pk,unsigned char *sk) -{ +int crypto_kem_sntrup761_keypair(unsigned char *pk, unsigned char *sk) { int i; - - ZKeyGen(pk,sk); sk += SecretKeys_bytes; - for (i = 0;i < PublicKeys_bytes;++i) *sk++ = pk[i]; - randombytes(sk,Inputs_bytes); sk += Inputs_bytes; - Hash_prefix(sk,4,pk,PublicKeys_bytes); + ZKeyGen(pk, sk); + sk += SecretKeys_bytes; + for (i = 0; i < crypto_kem_sntrup761_PUBLICKEYBYTES; ++i) *sk++ = pk[i]; + randombytes(sk, Small_bytes); + Hash_prefix(sk + Small_bytes, 4, pk, crypto_kem_sntrup761_PUBLICKEYBYTES); + return 0; } -/* c,r_enc = Hide(r,pk,cache); cache is Hash4(pk) */ -static void Hide(unsigned char *c,unsigned char *r_enc,const Inputs r,const unsigned char *pk,const unsigned char *cache) -{ - Inputs_encode(r_enc,r); - ZEncrypt(c,r,pk); c += Ciphertexts_bytes; - HashConfirm(c,r_enc,pk,cache); +static void Hide(unsigned char *c, unsigned char *r_enc, const Inputs r, const unsigned char *pk, const unsigned char *cache) { + Small_encode(r_enc, r); + ZEncrypt(c, r, pk); + HashConfirm(c + crypto_kem_sntrup761_CIPHERTEXTBYTES - Confirm_bytes, r_enc, cache); } -/* c,k = Encap(pk) */ -static void Encap(unsigned char *c,unsigned char *k,const unsigned char *pk) -{ +int crypto_kem_sntrup761_enc(unsigned char *c, unsigned char *k, const unsigned char *pk) { Inputs r; - unsigned char r_enc[Inputs_bytes]; - unsigned char cache[Hash_bytes]; - - Hash_prefix(cache,4,pk,PublicKeys_bytes); - Inputs_random(r); - Hide(c,r_enc,r,pk,cache); - HashSession(k,1,r_enc,c); + unsigned char r_enc[Small_bytes], cache[Hash_bytes]; + Hash_prefix(cache, 4, pk, crypto_kem_sntrup761_PUBLICKEYBYTES); + Short_random(r); + Hide(c, r_enc, r, pk, cache); + HashSession(k, 1, r_enc, c); + return 0; } -/* 0 if matching ciphertext+confirm, else -1 */ -static int Ciphertexts_diff_mask(const unsigned char *c,const unsigned char *c2) -{ - uint16 differentbits = 0; - int len = Ciphertexts_bytes+Confirm_bytes; - - while (len-- > 0) differentbits |= (*c++)^(*c2++); - return (1&((differentbits-1)>>8))-1; +static int Ciphertexts_diff_mask(const unsigned char *c, const unsigned char *c2) { + uint16_t differentbits = 0; + int len = crypto_kem_sntrup761_CIPHERTEXTBYTES; + while (len-- > 0) differentbits |= (*c++) ^ (*c2++); + return (crypto_int64_bitmod_01((differentbits - 1),8)) - 1; } -/* k = Decap(c,sk) */ -static void Decap(unsigned char *k,const unsigned char *c,const unsigned char *sk) -{ +int crypto_kem_sntrup761_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk) { const unsigned char *pk = sk + SecretKeys_bytes; - const unsigned char *rho = pk + PublicKeys_bytes; - const unsigned char *cache = rho + Inputs_bytes; + const unsigned char *rho = pk + crypto_kem_sntrup761_PUBLICKEYBYTES; + const unsigned char *cache = rho + Small_bytes; Inputs r; - unsigned char r_enc[Inputs_bytes]; - unsigned char cnew[Ciphertexts_bytes+Confirm_bytes]; - int mask; - int i; - - ZDecrypt(r,c,sk); - Hide(cnew,r_enc,r,pk,cache); - mask = Ciphertexts_diff_mask(c,cnew); - for (i = 0;i < Inputs_bytes;++i) r_enc[i] ^= mask&(r_enc[i]^rho[i]); - HashSession(k,1+mask,r_enc,c); -} - -/* ----- crypto_kem API */ - - -int crypto_kem_sntrup761_keypair(unsigned char *pk,unsigned char *sk) -{ - KEM_KeyGen(pk,sk); - return 0; -} - -int crypto_kem_sntrup761_enc(unsigned char *c,unsigned char *k,const unsigned char *pk) -{ - Encap(c,k,pk); + unsigned char r_enc[Small_bytes], cnew[crypto_kem_sntrup761_CIPHERTEXTBYTES]; + int mask, i; + ZDecrypt(r, c, sk); + Hide(cnew, r_enc, r, pk, cache); + mask = Ciphertexts_diff_mask(c, cnew); + for (i = 0; i < Small_bytes; ++i) r_enc[i] ^= mask & (r_enc[i] ^ rho[i]); + HashSession(k, 1 + mask, r_enc, c); return 0; } -int crypto_kem_sntrup761_dec(unsigned char *k,const unsigned char *c,const unsigned char *sk) -{ - Decap(k,c,sk); - return 0; -} #endif /* USE_SNTRUP761X25519 */ |