Diffstat (limited to 'comm/third_party/libgcrypt/cipher')
-rw-r--r--  comm/third_party/libgcrypt/cipher/ChangeLog-2011  4279
-rw-r--r--  comm/third_party/libgcrypt/cipher/Makefile.am  258
-rw-r--r--  comm/third_party/libgcrypt/cipher/Makefile.in  1445
-rw-r--r--  comm/third_party/libgcrypt/cipher/arcfour-amd64.S  108
-rw-r--r--  comm/third_party/libgcrypt/cipher/arcfour.c  216
-rw-r--r--  comm/third_party/libgcrypt/cipher/asm-common-aarch64.h  104
-rw-r--r--  comm/third_party/libgcrypt/cipher/asm-common-amd64.h  189
-rw-r--r--  comm/third_party/libgcrypt/cipher/asm-common-s390x.h  90
-rw-r--r--  comm/third_party/libgcrypt/cipher/asm-inline-s390x.h  157
-rw-r--r--  comm/third_party/libgcrypt/cipher/asm-poly1305-aarch64.h  245
-rw-r--r--  comm/third_party/libgcrypt/cipher/asm-poly1305-amd64.h  171
-rw-r--r--  comm/third_party/libgcrypt/cipher/asm-poly1305-s390x.h  140
-rw-r--r--  comm/third_party/libgcrypt/cipher/bithelp.h  123
-rw-r--r--  comm/third_party/libgcrypt/cipher/blake2.c  996
-rw-r--r--  comm/third_party/libgcrypt/cipher/blake2b-amd64-avx2.S  300
-rw-r--r--  comm/third_party/libgcrypt/cipher/blake2s-amd64-avx.S  278
-rw-r--r--  comm/third_party/libgcrypt/cipher/blowfish-amd64.S  601
-rw-r--r--  comm/third_party/libgcrypt/cipher/blowfish-arm.S  743
-rw-r--r--  comm/third_party/libgcrypt/cipher/blowfish.c  1142
-rw-r--r--  comm/third_party/libgcrypt/cipher/bufhelp.h  385
-rw-r--r--  comm/third_party/libgcrypt/cipher/camellia-aarch64.S  586
-rw-r--r--  comm/third_party/libgcrypt/cipher/camellia-aesni-avx-amd64.S  2618
-rw-r--r--  comm/third_party/libgcrypt/cipher/camellia-aesni-avx2-amd64.S  1782
-rw-r--r--  comm/third_party/libgcrypt/cipher/camellia-arm.S  626
-rw-r--r--  comm/third_party/libgcrypt/cipher/camellia-glue.c  1097
-rw-r--r--  comm/third_party/libgcrypt/cipher/camellia.c  1413
-rw-r--r--  comm/third_party/libgcrypt/cipher/camellia.h  95
-rw-r--r--  comm/third_party/libgcrypt/cipher/cast5-amd64.S  663
-rw-r--r--  comm/third_party/libgcrypt/cipher/cast5-arm.S  728
-rw-r--r--  comm/third_party/libgcrypt/cipher/cast5.c  1238
-rw-r--r--  comm/third_party/libgcrypt/cipher/chacha20-aarch64.S  648
-rw-r--r--  comm/third_party/libgcrypt/cipher/chacha20-amd64-avx2.S  601
-rw-r--r--  comm/third_party/libgcrypt/cipher/chacha20-amd64-ssse3.S  1012
-rw-r--r--  comm/third_party/libgcrypt/cipher/chacha20-armv7-neon.S  393
-rw-r--r--  comm/third_party/libgcrypt/cipher/chacha20-ppc.c  646
-rw-r--r--  comm/third_party/libgcrypt/cipher/chacha20-s390x.S  1561
-rw-r--r--  comm/third_party/libgcrypt/cipher/chacha20.c  1306
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher-aeswrap.c  209
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher-cbc.c  292
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher-ccm.c  415
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher-cfb.c  317
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher-cmac.c  292
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher-ctr.c  120
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher-eax.c  289
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher-gcm-armv7-neon.S  341
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher-gcm-armv8-aarch32-ce.S  433
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher-gcm-armv8-aarch64-ce.S  424
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher-gcm-intel-pclmul.c  712
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher-gcm.c  1207
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher-internal.h  809
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher-ocb.c  761
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher-ofb.c  108
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher-poly1305.c  375
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher-selftest.c  512
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher-selftest.h  69
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher-xts.c  189
-rw-r--r--  comm/third_party/libgcrypt/cipher/cipher.c  1767
-rw-r--r--  comm/third_party/libgcrypt/cipher/crc-armv8-aarch64-ce.S  497
-rw-r--r--  comm/third_party/libgcrypt/cipher/crc-armv8-ce.c  229
-rw-r--r--  comm/third_party/libgcrypt/cipher/crc-intel-pclmul.c  939
-rw-r--r--  comm/third_party/libgcrypt/cipher/crc-ppc.c  656
-rw-r--r--  comm/third_party/libgcrypt/cipher/crc.c  955
-rw-r--r--  comm/third_party/libgcrypt/cipher/des-amd64.S  1111
-rw-r--r--  comm/third_party/libgcrypt/cipher/des.c  1507
-rw-r--r--  comm/third_party/libgcrypt/cipher/dsa-common.c  418
-rw-r--r--  comm/third_party/libgcrypt/cipher/dsa.c  1394
-rw-r--r--  comm/third_party/libgcrypt/cipher/ecc-common.h  140
-rw-r--r--  comm/third_party/libgcrypt/cipher/ecc-curves.c  1603
-rw-r--r--  comm/third_party/libgcrypt/cipher/ecc-ecdh.c  127
-rw-r--r--  comm/third_party/libgcrypt/cipher/ecc-ecdsa.c  248
-rw-r--r--  comm/third_party/libgcrypt/cipher/ecc-eddsa.c  1182
-rw-r--r--  comm/third_party/libgcrypt/cipher/ecc-gost.c  218
-rw-r--r--  comm/third_party/libgcrypt/cipher/ecc-misc.c  438
-rw-r--r--  comm/third_party/libgcrypt/cipher/ecc-sm2.c  569
-rw-r--r--  comm/third_party/libgcrypt/cipher/ecc.c  1779
-rw-r--r--  comm/third_party/libgcrypt/cipher/elgamal.c  1149
-rw-r--r--  comm/third_party/libgcrypt/cipher/gost-s-box.c  266
-rw-r--r--  comm/third_party/libgcrypt/cipher/gost.h  34
-rw-r--r--  comm/third_party/libgcrypt/cipher/gost28147.c  553
-rw-r--r--  comm/third_party/libgcrypt/cipher/gostr3411-94.c  383
-rw-r--r--  comm/third_party/libgcrypt/cipher/hash-common.c  193
-rw-r--r--  comm/third_party/libgcrypt/cipher/hash-common.h  62
-rw-r--r--  comm/third_party/libgcrypt/cipher/idea.c  382
-rw-r--r--  comm/third_party/libgcrypt/cipher/kdf-internal.h  40
-rw-r--r--  comm/third_party/libgcrypt/cipher/kdf.c  503
-rw-r--r--  comm/third_party/libgcrypt/cipher/keccak-armv7-neon.S  945
-rw-r--r--  comm/third_party/libgcrypt/cipher/keccak.c  1577
-rw-r--r--  comm/third_party/libgcrypt/cipher/keccak_permute_32.h  536
-rw-r--r--  comm/third_party/libgcrypt/cipher/keccak_permute_64.h  385
-rw-r--r--  comm/third_party/libgcrypt/cipher/mac-cmac.c  524
-rw-r--r--  comm/third_party/libgcrypt/cipher/mac-gmac.c  187
-rw-r--r--  comm/third_party/libgcrypt/cipher/mac-hmac.c  1495
-rw-r--r--  comm/third_party/libgcrypt/cipher/mac-internal.h  275
-rw-r--r--  comm/third_party/libgcrypt/cipher/mac-poly1305.c  364
-rw-r--r--  comm/third_party/libgcrypt/cipher/mac.c  808
-rw-r--r--  comm/third_party/libgcrypt/cipher/md.c  1639
-rw-r--r--  comm/third_party/libgcrypt/cipher/md4.c  296
-rw-r--r--  comm/third_party/libgcrypt/cipher/md5.c  322
-rw-r--r--  comm/third_party/libgcrypt/cipher/poly1305-internal.h  64
-rw-r--r--  comm/third_party/libgcrypt/cipher/poly1305-s390x.S  87
-rw-r--r--  comm/third_party/libgcrypt/cipher/poly1305.c  740
-rw-r--r--  comm/third_party/libgcrypt/cipher/primegen.c  1878
-rw-r--r--  comm/third_party/libgcrypt/cipher/pubkey-internal.h  105
-rw-r--r--  comm/third_party/libgcrypt/cipher/pubkey-util.c  1160
-rw-r--r--  comm/third_party/libgcrypt/cipher/pubkey.c  970
-rw-r--r--  comm/third_party/libgcrypt/cipher/rfc2268.c  378
-rw-r--r--  comm/third_party/libgcrypt/cipher/rijndael-aarch64.S  514
-rw-r--r--  comm/third_party/libgcrypt/cipher/rijndael-aesni.c  3965
-rw-r--r--  comm/third_party/libgcrypt/cipher/rijndael-amd64.S  477
-rw-r--r--  comm/third_party/libgcrypt/cipher/rijndael-arm.S  581
-rw-r--r--  comm/third_party/libgcrypt/cipher/rijndael-armv8-aarch32-ce.S  1867
-rw-r--r--  comm/third_party/libgcrypt/cipher/rijndael-armv8-aarch64-ce.S  1613
-rw-r--r--  comm/third_party/libgcrypt/cipher/rijndael-armv8-ce.c  414
-rw-r--r--  comm/third_party/libgcrypt/cipher/rijndael-internal.h  194
-rw-r--r--  comm/third_party/libgcrypt/cipher/rijndael-padlock.c  110
-rw-r--r--  comm/third_party/libgcrypt/cipher/rijndael-ppc-common.h  342
-rw-r--r--  comm/third_party/libgcrypt/cipher/rijndael-ppc-functions.h  2020
-rw-r--r--  comm/third_party/libgcrypt/cipher/rijndael-ppc.c  259
-rw-r--r--  comm/third_party/libgcrypt/cipher/rijndael-ppc9le.c  102
-rw-r--r--  comm/third_party/libgcrypt/cipher/rijndael-s390x.c  1155
-rw-r--r--  comm/third_party/libgcrypt/cipher/rijndael-ssse3-amd64-asm.S  874
-rw-r--r--  comm/third_party/libgcrypt/cipher/rijndael-ssse3-amd64.c  743
-rw-r--r--  comm/third_party/libgcrypt/cipher/rijndael-tables.h  227
-rw-r--r--  comm/third_party/libgcrypt/cipher/rijndael.c  2032
-rw-r--r--  comm/third_party/libgcrypt/cipher/rmd160.c  529
-rw-r--r--  comm/third_party/libgcrypt/cipher/rsa-common.c  1038
-rw-r--r--  comm/third_party/libgcrypt/cipher/rsa.c  2035
-rw-r--r--  comm/third_party/libgcrypt/cipher/salsa20-amd64.S  940
-rw-r--r--  comm/third_party/libgcrypt/cipher/salsa20-armv7-neon.S  899
-rw-r--r--  comm/third_party/libgcrypt/cipher/salsa20.c  600
-rw-r--r--  comm/third_party/libgcrypt/cipher/scrypt.c  322
-rw-r--r--  comm/third_party/libgcrypt/cipher/seed.c  478
-rw-r--r--  comm/third_party/libgcrypt/cipher/serpent-armv7-neon.S  1124
-rw-r--r--  comm/third_party/libgcrypt/cipher/serpent-avx2-amd64.S  1160
-rw-r--r--  comm/third_party/libgcrypt/cipher/serpent-sse2-amd64.S  1211
-rw-r--r--  comm/third_party/libgcrypt/cipher/serpent.c  1807
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha1-armv7-neon.S  526
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha1-armv8-aarch32-ce.S  220
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha1-armv8-aarch64-ce.S  201
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha1-avx-amd64.S  429
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha1-avx-bmi2-amd64.S  441
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha1-avx2-bmi2-amd64.S  573
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha1-intel-shaext.c  292
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha1-ssse3-amd64.S  437
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha1.c  765
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha1.h  47
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha256-armv8-aarch32-ce.S  231
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha256-armv8-aarch64-ce.S  215
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha256-avx-amd64.S  506
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha256-avx2-bmi2-amd64.S  527
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha256-intel-shaext.c  363
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha256-ppc.c  795
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha256-ssse3-amd64.S  528
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha256.c  857
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha512-arm.S  464
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha512-armv7-neon.S  450
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha512-avx-amd64.S  461
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha512-avx2-bmi2-amd64.S  502
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha512-ppc.c  969
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha512-ssse3-amd64.S  467
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha512-ssse3-i386.c  404
-rw-r--r--  comm/third_party/libgcrypt/cipher/sha512.c  1316
-rw-r--r--  comm/third_party/libgcrypt/cipher/sm3.c  473
-rw-r--r--  comm/third_party/libgcrypt/cipher/sm4-aesni-avx-amd64.S  987
-rw-r--r--  comm/third_party/libgcrypt/cipher/sm4-aesni-avx2-amd64.S  851
-rw-r--r--  comm/third_party/libgcrypt/cipher/sm4.c  1251
-rw-r--r--  comm/third_party/libgcrypt/cipher/stribog.c  1362
-rw-r--r--  comm/third_party/libgcrypt/cipher/tiger.c  860
-rw-r--r--  comm/third_party/libgcrypt/cipher/twofish-aarch64.S  321
-rw-r--r--  comm/third_party/libgcrypt/cipher/twofish-amd64.S  1184
-rw-r--r--  comm/third_party/libgcrypt/cipher/twofish-arm.S  363
-rw-r--r--  comm/third_party/libgcrypt/cipher/twofish-avx2-amd64.S  1048
-rw-r--r--  comm/third_party/libgcrypt/cipher/twofish.c  1793
-rw-r--r--  comm/third_party/libgcrypt/cipher/whirlpool-sse2-amd64.S  348
-rw-r--r--  comm/third_party/libgcrypt/cipher/whirlpool.c  1535
175 files changed, 125928 insertions, 0 deletions
diff --git a/comm/third_party/libgcrypt/cipher/ChangeLog-2011 b/comm/third_party/libgcrypt/cipher/ChangeLog-2011
new file mode 100644
index 0000000000..1ce6bd1e68
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/ChangeLog-2011
@@ -0,0 +1,4279 @@
+2011-12-01 Werner Koch <wk@g10code.com>
+
+ NB: ChangeLog files are no longer manually maintained. Starting
+ on December 1st, 2011 we put change information only in the GIT
+ commit log, and generate a top-level ChangeLog file from logs at
+ "make dist". See doc/HACKING for details.
+
+2011-09-16 Werner Koch <wk@g10code.com>
+
+ * primegen.c (_gcry_primegen_init): New.
+
+2011-09-15 Werner Koch <wk@g10code.com>
+
+ * cipher-cbc.c, cipher-cfb.c, cipher-ofb.c, cipher-ctr.c: New.
+ * cipher-aeswrap.c: New.
+ * cipher-internal.h: New.
+ * cipher.c (cipher_context_alignment_t, struct gcry_cipher_handle)
+ (CTX_MAGIC_NORMAL, CTX_MAGIC_SECURE, NEED_16BYTE_ALIGNED_CONTEXT)
+ (MAX_BLOCKSIZE): Move to cipher-internal.h.
+ (do_aeswrap_encrypt, do_aeswrap_decrypt)
+ (do_cbc_encrypt, do_cbc_decrypt, do_ctr_encrypt, do_ctr_decrypt)
+ (do_ofb_encrypt, do_ofb_decrypt, do_ctr_encrypt): Move to the
+ respective new cipher-foo.c files.
+ (do_ctr_decrypt): Remove.
+
+2011-09-15 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (gcry_pk_list): Remove.
+ (gcry_pk_unregister): Remove.
+ * md.c (gcry_md_list): Remove.
+ (gcry_md_unregister): Remove.
+ * cipher.c (gcry_cipher_list): Remove.
+ (gcry_cipher_unregister): Remove.
+ * ac.c: Remove.
+
+2011-06-29 Werner Koch <wk@g10code.com>
+
+ * cipher.c (cipher_get_keylen): Return zero for an invalid algorithm.
+ (cipher_get_blocksize): Ditto.
+
+2011-06-13 Werner Koch <wk@g10code.com>
+
+ * dsa.c (selftest_sign_1024): Use the raw and not the pkcs1 flag.
+
+ * pubkey.c (gcry_pk_sign): Special case output generation for PKCS1.
+ (sexp_data_to_mpi): Parse "random-override" for pkcs1 encryption.
+ (pkcs1_encode_for_encryption): Add args RANDOM_OVERRIDE and
+ RANDOM_OVERRIDE_LEN.
+ (gcry_pk_encrypt): Special case output generation for PKCS1.
+ (sexp_data_to_mpi): Use GCRYMPI_FMT_USG for raw encoding.
+
+2011-06-10 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (gcry_pk_sign): Use format specifier '%M' to avoid
+ leading zeroes. Special case output generation for PSS.
+ (gcry_pk_encrypt): Special case output generation for OAEP.
+ (sexp_data_to_mpi): Use GCRYMPI_FMT_USG for PSS verify.
+
+2011-06-09 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (oaep_decode): Make use of octet_string_from_mpi.
+ (sexp_to_enc): Skip "random-override".
+
+ * pubkey.c (oaep_encode, pss_encode): Add args RANDOM_OVERRIDE and
+ RANDOM_OVERRIDE_LEN.
+ (sexp_data_to_mpi): Extract new random-override parameter.
+
+ * pubkey.c (pss_encode, pss_verify): Use VALUE verbatim for MHASH.
+ (octet_string_from_mpi): Add arg SPACE.
+
+2011-06-08 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (pss_encode, pss_verify): Restructure and comment code
+ to match rfc-3447. Replace secure allocs by plain allocs and
+ wipememory. Use gcry_md_hash_buffer.
+ (octet_string_from_mpi): New.
+
+2011-06-03 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (oaep_decode): Add more comments and restructure to
+ match the description in RFC-3447.
+ (oaep_encode): Check for mgf1 error. s/dlen/hlen/.
+
+2011-05-31 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (mgf1): Optimize by using gcry_md_reset. Re-implement
+ for easier readability.
+ (oaep_encode): Add more comments and restructure to match the
+ description in RFC-3447.
+
+ * pubkey.c (pkcs1_encode_for_signature, oaep_decode): Change
+ return value from one MPI to a buffer.
+ (gcry_pk_decrypt): Adjust for this change.
+
+2011-05-30 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (pkcs1_decode_for_encryption): Change handling of
+ leading zero byte.
+
+2011-05-27 Daiki Ueno <ueno@unixuser.org>
+
+ * pubkey.c (gcry_pk_decrypt): Fix double-free when un-padding
+ invalid data. Thanks to Tom Ritter.
+
+2011-05-24 Daiki Ueno <ueno@unixuser.org>
+
+ * rsa.c (rsa_verify): Use CMP if given, to check the decrypted
+ sig.
+
+ * pubkey.c (sexp_to_enc, sexp_data_to_mpi): Factor out
+ CTX initialization to ...
+ (init_encoding_ctx): .. new.
+ (gcry_pk_verify): Pass verify func and the arg to pubkey_verify.
+ (pss_encode, pss_verify, pss_verify_cmp): New.
+
+2011-05-23 Daiki Ueno <ueno@unixuser.org>
+
+ * pubkey.c (pkcs1_decode_for_encryption, oaep_decode): Fix memleak
+ when gcry_mpi_print fails.
+
+2011-05-18 Daiki Ueno <ueno@unixuser.org>
+
+ * pubkey.c (sexp_data_to_mpi): Factor some code out to ...
+ (pkcs1_encode_for_encryption): .. new,
+ (pkcs1_encode_for_signature): .. new.
+ (pkcs1_decode_for_encryption): New.
+ (gcry_pk_decrypt): Do un-padding for PKCS#1 as well as OAEP.
+ (sexp_to_enc): Abolish "unpad" flag, which is not necessary since
+ we can do un-padding implicitly when "pkcs1" or "oaep" is given.
+
+2011-05-11 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (sexp_to_enc, sexp_data_to_mpi): Set LABEL to NULL
+ after free.
+ (sexp_to_enc, sexp_data_to_mpi): Do not allow multiple encoding
+ flags.
+ (oaep_encode, oaep_decode, sexp_to_key, sexp_to_sig)
+ (sexp_to_enc, sexp_data_to_mpi, gcry_pk_encrypt, gcry_pk_sign)
+ (gcry_pk_genkey, _gcry_pk_get_elements): Replace access to ERRNO
+ by gpg_err_code_from_syserror.
+
+2011-05-11 Daiki Ueno <ueno@unixuser.org>
+
+ * pubkey.c (sexp_data_to_mpi): Factor some code out to ...
+ (get_hash_algo): .. new.
+ (mgf1, oaep_encode, oaep_decode): New.
+ (sexp_to_enc): Add arg CTX. Remove arg RET_WANT_PKCS1. Support
+ OAEP.
+ (sexp_data_to_mpi): Add arg CTX. Support OAEP.
+ (gcry_pk_encrypt): Pass a CTX to sexp_data_to_mpi.
+ (gcry_pk_decrypt): Pass a CTX to sexp_to_enc and replace
+ WANT_PKCS1. Implement unpadding for OAEP.
+ (gcry_pk_sign): Pass NULL for CTX arg of sexp_data_to_mpi.
+ (gcry_pk_verify): Ditto.
+
+2011-04-19 Werner Koch <wk@g10code.com>
+
+ * cipher.c (gcry_cipher_open): Replace gpg_err_code_from_errno by
+ gpg_err_code_from_syserror.
+
+2011-04-11 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (gcry_pk_get_keygrip): Avoid double free of L2.
+
+ * cipher.c (_gcry_cipher_setctr): Clear unused lastiv info.
+ (gcry_cipher_ctl) <GCRYCTL_SET_CTR>: Implement by calling
+ _gcry_cipher_setctr.
+ (do_ctr_encrypt): Save last counter and reuse it.
+
+ * cipher.c (do_ctr_encrypt): Allow arbitrary length inputs to
+ match the 1.4 behaviour.
+
+2011-04-04 Werner Koch <wk@g10code.com>
+
+ * ecc.c (compute_keygrip): Release L1 while parsing "curve".
+
+ * pubkey.c (gcry_pk_get_keygrip): Always release NAME and L2.
+ Reported by Ben Kibbey.
+
+2011-03-28 Werner Koch <wk@g10code.com>
+
+ * primegen.c (_gcry_generate_elg_prime): Make sure that PRIME is
+ NULL if the called func ever returns an error.
+
+ * pubkey.c (gcry_pk_decrypt): Remove unused var PUBKEY.
+
+2011-03-09 Werner Koch <wk@g10code.com>
+
+ * kdf.c: New.
+
+2011-02-22 Werner Koch <wk@g10code.com>
+
+ * rijndael.c (aesni_cleanup_2_4): New.
+ (aesenc_xmm1_xmm0, do_aesni_ctr_4): New.
+ (_gcry_aes_ctr_enc): New.
+ * cipher.c (struct gcry_cipher_handle): Add CTR_ENC. Move field
+ CTR into an u_ctr union and adjust all users.
+ (gcry_cipher_open): Use _gcry_aes_ctr_enc.
+ (do_ctr_encrypt): Use bulk mode.
+
+2011-02-18 Werner Koch <wk@g10code.com>
+
+ * rijndael.c (u32_a_t): New.
+ (do_encrypt_aligned, do_decrypt_aligned): Use the new type to
+ avoid problems with strict aliasing rules.
+
+2011-02-16 Werner Koch <wk@g10code.com>
+
+ * rijndael.c (do_aesni_cfb) [USE_AESNI]: New.
+ (_gcry_aes_cfb_enc, _gcry_aes_cfb_dec) [USE_AESNI]: Use new function.
+
+2011-02-15 Werner Koch <wk@g10code.com>
+
+ * rijndael.c (do_aesni_enc_aligned, do_aesni_dec_aligned): Use
+ movdqa for the key but keep using movdqu for the data.
+ (do_aesni): Remove alignment detection. Don't burn the stack.
+ (aesni_prepare, aesni_cleanup): New macros.
+ (rijndael_encrypt, _gcry_aes_cfb_enc, _gcry_aes_cbc_enc)
+ (rijndael_decrypt, _gcry_aes_cfb_dec, _gcry_aes_cbc_dec): Use
+ these macros. Don't burn the stack in the USE_AESNI case.
+ (do_setkey): Add disabled code to use aeskeygenassist.
+
+2011-02-14 Werner Koch <wk@g10code.com>
+
+ * rijndael.c (ATTR_ALIGNED_16): New
+ (do_aesni): Do not copy if already aligned.
+ (do_encrypt, do_decrypt): Ditto.
+ (rijndael_decrypt, rijndael_encrypt): Increase stack burning amount.
+
+ * rijndael.c (RIJNDAEL_context): Reorder fields. Change fieldname
+ ROUNDS to rounds. Move padlock_key into u1.
+ (keySched, keySched2): Rename macros to keyschenc and keyschdec
+ and change all users.
+ (padlockkey): New macro. Change all users of padlock_key.
+ * cipher.c (NEED_16BYTE_ALIGNED_CONTEXT): Always define if using gcc.
+ (struct gcry_cipher_handle): Align U_IV to at least 16 byte.
+
+2011-02-13 Werner Koch <wk@g10code.com>
+
+ * rijndael.c (USE_AESNI): New. Define for ia32 and gcc >= 4.
+ (m128i_t) [USE_AESNI]: New.
+ (RIJNDAEL_context) [USE_AESNI]: Add field use_aesni.
+ (do_setkey): Set USE_AESNI for all key lengths.
+ (prepare_decryption) [USE_AESNI]: Use the aesimc instruction if requested.
+ (do_aesni_enc_aligned, do_aesni_dec_aligned)
+ (do_aesni) [USE_AESNI]: New.
+ (rijndael_encrypt, _gcry_aes_cfb_enc, _gcry_aes_cbc_enc)
+ (rijndael_decrypt, _gcry_aes_cfb_dec)
+ (_gcry_aes_cbc_dec) [USE_AESNI]: Use do_aesni.
+
+2011-02-01 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (gcry_pk_get_curve): New.
+ (sexp_to_key): Add arg OVERRIDE_ELEMS.
+ (sexp_elements_extract_ecc): Allow for params only.
+ (gcry_pk_get_param): New.
+ * ecc.c (ecc_get_curve): New.
+ (ecc_get_param_sexp): New.
+
+2011-01-28 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (gcry_pk_genkey): Hack to insert the used curve name.
+
+2011-01-27 Werner Koch <wk@g10code.com>
+
+ * ecc.c (fill_in_curve): Remove.
+ (generate_curve): Rename to ..
+ (fill_in_curve): this. Remove setting of NAME_OID.
+ (ecc_encrypt_raw): Change name of arg DATA to K for better
+ readability. Use ECC_public_key instead of ECC_secret_key.
+ Require a caller to pass a complete pkey array.
+ (ecc_decrypt_raw): Require a caller to pass a complete skey array.
+ (elliptic_curve_t): Add field NAME.
+ (fill_in_curve): Set field.
+ (generate_key): Add arg R_USED_CURVE.
+ (ecc_generate_ext): Return used curve name.
+
+2011-01-13 Andrey Jivsov <openpgp@brainhub.org> (wk)
+
+ * ecc.c (ec2os): Do not free passed parameters X and Y. Adjust
+ callers.
+ (ecc_encrypt_raw, ecc_decrypt_raw): New.
+ (ecdh_names, _gcry_pubkey_spec_ecdh): New.
+ * pubkey.c (pubkey_table): Support ECDH.
+
+2010-08-19 Werner Koch <wk@g10code.com>
+
+ * cipher.c (gcry_cipher_open): Remove double release of the module.
+ Fixes bug#1263.
+
+2010-06-10 Jeff Johnson <n3npq@mac.com> (wk)
+
+ * ecc.c (ecc_generate_ext): Parse transient-key flag.
+ (generate_key): Add arg TRANSIENT_KEY and use it to set the random
+ level.
+
+2010-04-12 Brad Hards <bradh@frogmouth.net> (wk)
+
+ Spelling fixes.
+
+2010-03-26 Werner Koch <wk@g10code.com>
+
+ * tiger.c (asn): Unfetter the old TIGER from an OID.
+ (TIGER_CONTEXT): Add field VARIANT.
+ (tiger_init): Factor code out to ...
+ (do_init): New.
+ (tiger1_init, tiger2_init): New.
+ (_gcry_digest_spec_tiger1, _gcry_digest_spec_tiger2): New.
+ * md.c (digest_table): Add TIGER1 and TIGER2 variants.
+
+2009-12-11 Werner Koch <wk@g10code.com>
+
+ * sha256.c (Cho, Maj, Sum0, Sum1): Turn macros into inline
+ functions.
+ (transform): Partly unroll to interweave the chain variables
+
+ * sha512.c (ROTR, Ch, Maj, Sum0, Sum1): Turn macros into inline
+ functions.
+ (transform): Partly unroll to interweave the chain variables.
+ Suggested by Christian Grothoff.
+
+2009-12-10 Werner Koch <wk@g10code.com>
+
+ * Makefile.am (o_flag_munging): New.
+ (tiger.o, tiger.lo): Use it.
+
+ * cipher.c (do_ctr_encrypt): Add arg OUTBUFLEN. Check for
+ suitable value. Add check for valid inputlen. Wipe temporary
+ memory.
+ (do_ctr_decrypt): Likewise.
+ (do_cbc_encrypt, do_cbc_decrypt): Add arg OUTBUFLEN. Check for
+ suitable value. Move check for valid inputlen to here; change
+ returned error from INV_ARG to INV_LENGTH.
+ (do_ecb_encrypt, do_ecb_decrypt): Ditto.
+ (do_cfb_encrypt, do_cfb_decrypt): Ditto.
+ (do_ofb_encrypt, do_ofb_decrypt): Ditto.
+ (cipher_encrypt, cipher_decrypt): Adjust for above changes.
+ (gcry_cipher_encrypt, gcry_cipher_decrypt): Simplify.
+
+2009-12-09 Werner Koch <wk@g10code.com>
+
+ * cipher.c (gcry_cipher_open): Allow for GCRY_CIPHER_MODE_AESWRAP.
+ (cipher_encrypt, cipher_decrypt): Ditto.
+ (do_aeswrap_encrypt, do_aeswrap_decrypt): New.
+ (struct gcry_cipher_handle): Add field marks.
+ (cipher_setkey, cipher_setiv): Update marks flags.
+ (cipher_reset): Reset marks.
+ (cipher_encrypt, cipher_decrypt): Add new arg OUTBUFLEN.
+ (gcry_cipher_encrypt, gcry_cipher_decrypt): Pass outbuflen to
+ cipher_encrypt. Replace GPG_ERR_TOO_SHORT by
+ GPG_ERR_BUFFER_TOO_SHORT.
+
+2009-08-21 Werner Koch <wk@g10code.com>
+
+ * dsa.c (dsa_generate_ext): Release retfactors array before
+ setting it to NULL. Reported by Daiki Ueno.
+
+2009-07-02 Werner Koch <wk@g10code.com>
+
+ * md.c (md_read): Fix incomplete check for NULL.
+ Reported by Fabian Kail.
+
+2009-03-31 Werner Koch <wk@g10code.com>
+
+ * rsa.c (rsa_check_secret_key): Return GPG_ERR_BAD_SECKEY and not
+ GPG_ERR_PUBKEY_ALGO.
+
+2009-02-16 Werner Koch <wk@g10code.com>
+
+ * rsa.c (generate_x931): Do not initialize TBL with automatic
+ variables.
+ * whirlpool.c, tiger.c, sha256.c, sha1.c, rmd160.c, md5.c
+ * md4.c, crc.c: Remove memory.h. This is garbage from gnupg.
+ Reported by Dan Fandrich.
+
+2009-01-22 Werner Koch <wk@g10code.com>
+
+ * ecc.c (compute_keygrip): Remove superfluous const.
+
+2009-01-06 Werner Koch <wk@g10code.com>
+
+ * rmd160.c (oid_spec_rmd160): Add TeleTrust identifier.
+
+2008-12-10 Werner Koch <wk@g10code.com>
+
+ * dsa.c (generate): Add arg DOMAIN and use it if specified.
+ (generate_fips186): Ditto.
+ (dsa_generate_ext): Parse and check the optional "domain"
+ parameter and pass them to the generate functions.
+
+ * rijndael.c (rijndael_names): Add "AES128" and "AES-128".
+ (rijndael192_names): Add "AES-192".
+ (rijndael256_names): Add "AES-256".
+
+2008-12-05 Werner Koch <wk@g10code.com>
+
+ * dsa.c (generate): Add arg TRANSIENT_KEY and use it to determine
+ the RNG quality needed.
+ (dsa_generate_ext): Parse the transient-key flag and pass it to
+ generate.
+
+2008-11-28 Werner Koch <wk@g10code.com>
+
+ * dsa.c (generate_fips186): Add arg DERIVEPARMS and use the seed
+ value if available.
+
+ * primegen.c (_gcry_generate_fips186_2_prime): Fix inner p loop.
+
+2008-11-26 Werner Koch <wk@g10code.com>
+
+ * primegen.c (_gcry_generate_fips186_3_prime): New.
+ * dsa.c (generate_fips186): Add arg USE_FIPS186_2.
+ (dsa_generate_ext): Parse new flag use-fips186-2.
+
+2008-11-25 Werner Koch <wk@g10code.com>
+
+ * dsa.c (generate_fips186): New.
+ (dsa_generate_ext): Use new function if derive-parms are given or
+ if in FIPS mode.
+ * primegen.c (_gcry_generate_fips186_2_prime): New.
+
+2008-11-24 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (gcry_pk_genkey): Insert code to output extrainfo.
+ (pubkey_generate): Add arg R_EXTRAINFO and pass it to the extended
+ key generation function.
+ * rsa.c (gen_x931_parm_xp, gen_x931_parm_xi): New.
+ (generate_x931): Generate params if not given.
+ (rsa_generate_ext): Parse use-x931 flag. Return p-q-swapped
+ indicator.
+ * dsa.c (dsa_generate_ext): Put RETFACTORS into R_EXTRAINFO if
+ possible.
+
+ * pubkey.c (gcry_pk_genkey): Remove parsing of almost all
+ parameters and pass the parameter S-expression to pubkey_generate.
+ (pubkey_generate): Simplify by requiring modules to parse the
+ parameters. Remove the special cases for Elgamal and ECC.
+ (sexp_elements_extract_ecc): Add arg EXTRASPEC and use it. Fix
+ small memory leak.
+ (sexp_to_key): Pass EXTRASPEC to sexp_elements_extract_ecc.
+ (pubkey_table) [USE_ELGAMAL]: Add real extraspec.
+ * rsa.c (rsa_generate_ext): Adjust for new calling convention.
+ * dsa.c (dsa_generate_ext): Ditto.
+ * elgamal.c (_gcry_elg_generate): Ditto. Rename to elg_generate_ext.
+ (elg_generate): New.
+ (_gcry_elg_generate_using_x): Remove after merging code with
+ elg_generate_ext.
+ (_gcry_pubkey_extraspec_elg): New.
+ (_gcry_elg_check_secret_key, _gcry_elg_encrypt, _gcry_elg_sign)
+ (_gcry_elg_verify, _gcry_elg_get_nbits): Make static and remove
+ _gcry_ prefix.
+ * ecc.c (_gcry_ecc_generate): Rename to ecc_generate_ext and
+ adjust for new calling convention.
+ (_gcry_ecc_get_param): Rename to ecc_get_param and make static.
+ (_gcry_pubkey_extraspec_ecdsa): Add ecc_generate_ext and
+ ecc_get_param.
+
+2008-11-20 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (pubkey_generate): Add arg DERIVEPARMS.
+ (gcry_pk_genkey): Parse derive-parms and pass it to above.
+ * rsa.c (generate_x931): New.
+ (rsa_generate_ext): Add arg DERIVEPARMS and call new function in
+ fips mode or if DERIVEPARMS is given.
+ * primegen.c (_gcry_derive_x931_prime, find_x931_prime): New.
+
+2008-11-19 Werner Koch <wk@g10code.com>
+
+ * rsa.c (rsa_decrypt): Use gcry_create_nonce for blinding.
+ (generate): Rename to generate_std.
+
+2008-11-05 Werner Koch <wk@g10code.com>
+
+ * md.c (md_open): Use a switch to set the Bsize.
+ (prepare_macpads): Fix long key case for SHA384 and SHA512.
+
+ * cipher.c (gcry_cipher_handle): Add field EXTRASPEC.
+ (gcry_cipher_open): Set it.
+ (gcry_cipher_ctl): Add private control code to disable weak key
+ detection and to return the current input block.
+ * des.c (_tripledes_ctx): Add field FLAGS.
+ (do_tripledes_set_extra_info): New.
+ (_gcry_cipher_extraspec_tripledes): Add new function.
+ (do_tripledes_setkey): Disable weak key detection.
+
+2008-10-24 Werner Koch <wk@g10code.com>
+
+ * md.c (digest_table): Allow MD5 in fips mode.
+ (md_register_default): Take special action for MD5.
+ (md_enable, gcry_md_hash_buffer): Ditto.
+
+2008-09-30 Werner Koch <wk@g10code.com>
+
+ * rijndael.c (do_setkey): Properly align "t" and "tk".
+ (prepare_decryption): Properly align "w". Fixes bug #936.
+
+2008-09-18 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (gcry_pk_genkey): Parse domain parameter.
+ (pubkey_generate): Add new arg DOMAIN and remove special case for
+ DSA with qbits.
+ * rsa.c (rsa_generate): Add dummy args QBITS, NAME and DOMAIN and
+ rename to rsa_generate_ext. Change caller.
+ (_gcry_rsa_generate, _gcry_rsa_check_secret_key)
+ (_gcry_rsa_encrypt, _gcry_rsa_decrypt, _gcry_rsa_sign)
+ (_gcry_rsa_verify, _gcry_rsa_get_nbits): Make static and remove
+ _gcry_ prefix.
+ (_gcry_pubkey_spec_rsa, _gcry_pubkey_extraspec_rsa): Adjust names.
+ * dsa.c (dsa_generate_ext): New.
+ (_gcry_dsa_generate): Replace code by a call to dsa_generate.
+ (_gcry_dsa_check_secret_key, _gcry_dsa_sign, _gcry_dsa_verify)
+ (_gcry_dsa_get_nbits): Make static and remove _gcry prefix.
+ (_gcry_dsa_generate2): Remove.
+ (_gcry_pubkey_spec_dsa): Adjust to name changes.
+ (_gcry_pubkey_extraspec_rsa): Add dsa_generate_ext.
+
+2008-09-16 Werner Koch <wk@g10code.com>
+
+ * ecc.c (run_selftests): Add arg EXTENDED.
+
+2008-09-12 Werner Koch <wk@g10code.com>
+
+ * rsa.c (test_keys): Do a bad case signature check.
+ * dsa.c (test_keys): Do a bad case check.
+
+ * cipher.c (_gcry_cipher_selftest): Add arg EXTENDED and pass it
+ to the called tests.
+ * md.c (_gcry_md_selftest): Ditto.
+ * pubkey.c (_gcry_pk_selftest): Ditto.
+ * rijndael.c (run_selftests): Add arg EXTENDED and pass it to the
+ called tests.
+ (selftest_fips_128): Add arg EXTENDED and run only one test
+ in non-extended mode.
+ (selftest_fips_192): Add dummy arg EXTENDED.
+ (selftest_fips_256): Ditto.
+ * hmac-tests.c (_gcry_hmac_selftest): Ditto.
+ (run_selftests): Ditto.
+ (selftests_sha1): Add arg EXTENDED and run only one test
+ in non-extended mode.
+ (selftests_sha224, selftests_sha256): Ditto.
+ (selftests_sha384, selftests_sha512): Ditto.
+ * sha1.c (run_selftests): Add arg EXTENDED and pass it to the
+ called test.
+ (selftests_sha1): Add arg EXTENDED and run only one test
+ in non-extended mode.
+ * sha256.c (run_selftests): Add arg EXTENDED and pass it to the
+ called tests.
+ (selftests_sha224): Add arg EXTENDED and run only one test
+ in non-extended mode.
+ (selftests_sha256): Ditto.
+ * sha512.c (run_selftests): Add arg EXTENDED and pass it to the
+ called tests.
+ (selftests_sha384): Add arg EXTENDED and run only one test
+ in non-extended mode.
+ (selftests_sha512): Ditto.
+ * des.c (run_selftests): Add arg EXTENDED and pass it to the
+ called test.
+ (selftest_fips): Add dummy arg EXTENDED.
+ * rsa.c (run_selftests): Add dummy arg EXTENDED.
+
+ * dsa.c (run_selftests): Add dummy arg EXTENDED.
+
+ * rsa.c (extract_a_from_sexp): New.
+ (selftest_encr_1024): Check that the ciphertext does not match the
+ plaintext.
+ (test_keys): Improve tests and return an error status.
+ (generate): Return an error if test_keys fails.
+ * dsa.c (test_keys): Add comments and return an error status.
+ (generate): Return an error if test_keys failed.
+
+2008-09-11 Werner Koch <wk@g10code.com>
+
+ * rsa.c (_gcry_rsa_decrypt): Return an error instead of calling
+ BUG in case of a practically impossible condition.
+ (sample_secret_key, sample_public_key): New.
+ (selftest_sign_1024, selftest_encr_1024): New.
+ (selftests_rsa): Implement tests.
+ * dsa.c (sample_secret_key, sample_public_key): New.
+ (selftest_sign_1024): New.
+ (selftests_dsa): Implement tests.
+
+2008-09-09 Werner Koch <wk@g10code.com>
+
+ * hmac-tests.c (selftests_sha1): Add tests.
+ (selftests_sha224, selftests_sha384, selftests_sha512): Make up tests.
+
+ * hash-common.c, hash-common.h: New.
+ * sha1.c (selftests_sha1): Add 3 tests.
+ * sha256.c (selftests_sha256, selftests_sha224): Ditto.
+ * sha512.c (selftests_sha512, selftests_sha384): Ditto.
+
+2008-08-29 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (gcry_pk_get_keygrip): Remove the special case for RSA
+ and check whether a custom computation function has been setup.
+ * rsa.c (compute_keygrip): New.
+ (_gcry_pubkey_extraspec_rsa): Setup this function.
+ * ecc.c (compute_keygrip): New.
+ (_gcry_pubkey_extraspec_ecdsa): Setup this function.
+
+2008-08-28 Werner Koch <wk@g10code.com>
+
+ * cipher.c (cipher_decrypt, cipher_encrypt): Return an error if
+ mode NONE is used.
+ (gcry_cipher_open): Allow mode NONE only with a debug flag set and
+ if not in FIPS mode.
+
+2008-08-26 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (pubkey_generate): Add arg KEYGEN_FLAGS.
+ (gcry_pk_genkey): Implement new parameter "transient-key" and
+ pass it as flags to pubkey_generate.
+ (pubkey_generate): Make use of an ext_generate function.
+ * rsa.c (generate): Add new arg transient_key and pass appropriate
+ args to the prime generator.
+ (_gcry_rsa_generate): Factor all code out to ...
+ (rsa_generate): .. new func with extra arg KEYGEN_FLAGS.
+ (_gcry_pubkey_extraspec_rsa): Setup rsa_generate.
+ * primegen.c (_gcry_generate_secret_prime)
+ (_gcry_generate_public_prime): Add new arg RANDOM_LEVEL.
+
+2008-08-21 Werner Koch <wk@g10code.com>
+
+ * primegen.c (_gcry_generate_secret_prime)
+ (_gcry_generate_public_prime): Use a constant macro for the random
+ level.
+
+2008-08-19 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (sexp_elements_extract_ecc) [!USE_ECC]: Do not allow
+ the "curve" parameter.
+
+2008-08-15 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (_gcry_pk_selftest): New.
+ * dsa.c (selftests_dsa, run_selftests): New.
+ * rsa.c (selftests_rsa, run_selftests): New.
+ * ecc.c (selftests_ecdsa, run_selftests): New.
+
+ * md.c (_gcry_md_selftest): New.
+ * sha1.c (run_selftests, selftests_sha1): New.
+ * sha256.c (selftests_sha224, selftests_sha256, run_selftests): New.
+ * sha512.c (selftests_sha384, selftests_sha512, run_selftests): New.
+
+ * des.c (selftest): Remove static variable from selftest.
+ (des_setkey): No on-the-fly self test in fips mode.
+ (tripledes_set3keys): Ditto.
+
+ * cipher.c (_gcry_cipher_setkey, _gcry_cipher_setiv):
+
+ * dsa.c (generate): Bail out in fips mode if NBITS is less than 1024.
+ * rsa.c (generate): Return an error code if the requested size
+ is less than 1024 and we are in fips mode.
+ (_gcry_rsa_generate): Take care of that error code.
+
+ * ecc.c (generate_curve): In fips mode enable only NIST curves.
+
+ * cipher.c (_gcry_cipher_selftest): New.
+
+ * sha512.c (_gcry_digest_extraspec_sha384)
+ (_gcry_digest_extraspec_sha512): New.
+ * sha256.c (_gcry_digest_extraspec_sha224)
+ (_gcry_digest_extraspec_sha256): New.
+ * sha1.c (_gcry_digest_extraspec_sha1): New.
+ * ecc.c (_gcry_pubkey_extraspec_ecdsa): New.
+ * dsa.c (_gcry_pubkey_extraspec_dsa): New.
+ * rsa.c (_gcry_pubkey_extraspec_rsa): New.
+ * rijndael.c (_gcry_cipher_extraspec_aes)
+ (_gcry_cipher_extraspec_aes192, _gcry_cipher_extraspec_aes256): New.
+ * des.c (_gcry_cipher_extraspec_tripledes): New.
+
+ * cipher.c (gcry_cipher_register): Rename to _gcry_cipher_register.
+ Add arg EXTRASPEC.
+ (dummy_extra_spec): New.
+ (cipher_table_entry): Add extraspec field.
+ * md.c (gcry_md_register): Rename to _gcry_md_register. Add
+ arg EXTRASPEC.
+ (dummy_extra_spec): New.
+ (digest_table_entry): Add extraspec field.
+ * pubkey.c (gcry_pk_register): Rename to _gcry_pk_register. Add
+ arg EXTRASPEC.
+ (dummy_extra_spec): New.
+ (pubkey_table_entry): Add extraspec field.
+
+ * ac.c: Let most public functions return GPG_ERR_UNSUPPORTED in
+ fips mode.
+
+ * pubkey.c (pubkey_table_entry): Add field FIPS_ALLOWED and mark
+ appropriate algorithms.
+ (dummy_generate, dummy_check_secret_key, dummy_encrypt)
+ (dummy_decrypt, dummy_sign, dummy_verify, dummy_get_nbits): Signal
+ a fips error when used.
+ (gcry_pk_register): In fips mode do not allow to register new
+ algorithms.
+
+ * md.c (digest_table): Add field FIPS_ALLOWED and mark appropriate
+ algorithms.
+ (md_register_default): In fips mode register only fips algorithms.
+ (gcry_md_register): In fips mode do not allow to register new
+ algorithms.
+ (gcry_md_get): Signal a fips error if called.
+ (gcry_md_hash_buffer): Do not allow rmd160 when not in fips mode.
+ (md_start_debug): Disable in fips_mode.
+
+ * md.c (gcry_md_register_default): Rename to ..
+ (md_register_default): .. this.
+ (md_digest): Remove this commented function.
+ * pubkey.c (gcry_pk_register_default): Rename to ..
+ (pk_register_default): .. this.
+
+ * cipher.c (cipher_table_entry): Add field FIPS_ALLOWED.
+ (gcry_cipher_register_default): Register only fips approved
+ algorithms.
+ (gcry_cipher_register): Do not allow to register new ciphers.
+ (cipher_setiv): Signal fips error.
+
+ * cipher.c (gcry_cipher_register_default): Rename to ..
+ (cipher_register_default): .. this.
+ (REGISTER_DEFAULT_CIPHERS): Adjust for that change.
+
+2008-07-05 Werner Koch <wk@g10code.com>
+
+ * random-daemon.c, random.h, rndhw.c, rndunix.c, rand-internal.h
+ * random.c, rndegd.c, rndlinux.c, rndw32.c: Move to ../cipher.
+ * Makefile.am: Remove random stuff.
+
+2008-06-24 Szakats Istvan <szaki.ms@gmail.com> (wk)
+
+ * ac.c (_gcry_ac_key_destroy, _gcry_ac_key_pair_generate): Release
+ some more memory.
+
+2008-04-22 Werner Koch <wk@g10code.com>
+
+ * rijndael.c (_gcry_aes_cfb_enc, _gcry_aes_cbc_enc)
+ (_gcry_aes_cfb_dec, _gcry_aes_cbc_dec): Use Padlock if possible.
+
+2008-04-18 Werner Koch <wk@g10code.com>
+
+ * sha1.c (transform_aligned): Remove. That will obviously not
+ work because we need a scratch working area and our internal API
+ does not allow modifying the buffers.
+
+ * rijndael.c: Factor tables out to ..
+ * rijndael-tables.h: .. new.
+
+ * ac.c (ac_data_extract): Make static.
+
+ * camellia.h [HAVE_CONFIG_H]: Include config.h.
+
+ * rndw32.c (registry_poll): Only print the performance data
+ problem warning once. Suggested by Simon Josefsson.
+
+2008-03-19 Werner Koch <wk@g10code.com>
+
+ * cipher.c (gcry_cipher_open) [USE_AES]: Init bulk encryption only
+ if requested. Suggested by Dirk Stoecker.
+
+2008-03-18 Werner Koch <wk@g10code.com>
+
+ * sha1.c: Include stdint.h.
+ (transform): Add arg NBLOCKS so that we can work on more than one
+ block and avoid updates of the chaining variables. Changed all
+ callers to use 1.
+ (sha1_write): Replace loop around transform.
+ (transform_aligned) [WORDS_BIGENDIAN]: New.
+ (TRANSFORM): New macro to replace all direct calls of transform.
+
+2008-03-17 Werner Koch <wk@g10code.com>
+
+ * rijndael.c (_gcry_aes_cfb_dec): New.
+ (do_encrypt): Factor code out to ..
+ (do_encrypt_aligned): .. New.
+ (_gcry_aes_cfb_enc, _gcry_aes_cfb_dec): Use new function.
+ (do_decrypt): Factor code out to ..
+ (do_decrypt_aligned): .. new.
+ (_gcry_aes_cbc_enc, _gcry_aes_cbc_dec): New.
+ * cipher.c (struct gcry_cipher_handle): Put field IV into new
+ union U_IV to enforce proper alignment. Change all users.
+ (do_cfb_decrypt): Optimize.
+ (do_cbc_encrypt, do_cbc_decrypt): Optimize.
+
+2008-03-15 Werner Koch <wk@g10code.com>
+
+ * rijndael.c (_gcry_aes_cfb_enc): New.
+ * cipher.c (struct gcry_cipher_handle): Add field ALGO and BULK.
+ (gcry_cipher_open): Set ALGO and BULK.
+ (do_cfb_encrypt): Optimize.
+
+2008-02-18 Werner Koch <wk@g10code.com>
+
+ * rsa.c (_gcry_rsa_verify) [IS_DEVELOPMENT_VERSION]: Print
+ intermediate results.
+
+2008-01-08 Werner Koch <wk@g10code.com>
+
+ * random.c (add_randomness): Do not just increment
+ POOL_FILLED_COUNTER but update it by the actual amount of data.
+
+2007-12-13 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (sexp_data_to_mpi): Support SHA-224.
+
+2007-12-05 Werner Koch <wk@g10code.com>
+
+ * rijndael.c (USE_PADLOCK): Depend on ENABLE_PADLOCK_SUPPORT.
+ * rndhw.c (USE_PADLOCK): Ditto
+
+ * rsa.c (secret): Fixed condition test for using CRT. Reported by
+ Dean Scarff. Fixes bug#864.
+ (_gcry_rsa_check_secret_key): Return an error if the optional
+ parameters are missing.
+ * pubkey.c (sexp_elements_extract): Add arg ALGO_NAME. Changed all
+ callers to pass NULL. Add hack to allow for optional RSA
+ parameters.
+ (sexp_to_key): Pass algo name to sexp_elements_extract.
+
+2007-12-03 Werner Koch <wk@g10code.com>
+
+ * random.c (gcry_random_add_bytes): Implement it.
+ * rand-internal.h (RANDOM_ORIGIN_EXTERNAL): New.
+
+2007-11-30 Werner Koch <wk@g10code.com>
+
+ * rndhw.c: New.
+ * rndlinux.c (_gcry_rndlinux_gather_random): Try to read 50%
+ directly from the hwrng.
+ * random.c (do_fast_random_poll): Also run the hw rng fast poll.
+ (_gcry_random_dump_stats): Tell whether the hw rng failed.
+
+2007-11-29 Werner Koch <wk@g10code.com>
+
+ * rijndael.c (USE_PADLOCK): Define new macro used for ia32.
+ (RIJNDAEL_context) [USE_PADLOCK]: Add fields USE_PADLOCK and
+ PADLOCK_KEY.
+ (do_setkey) [USE_PADLOCK]: Enable padlock if available for 128 bit
+ AES.
+ (do_padlock) [USE_PADLOCK]: New.
+ (rijndael_encrypt, rijndael_decrypt) [USE_PADLOCK]: Divert to
+ do_padlock.
+ * cipher.c (cipher_context_alignment_t): New. Use it in this
+ module in place of PROPERLY_ALIGNED_TYPE.
+ (NEED_16BYTE_ALIGNED_CONTEXT): Define macro for ia32.
+ (struct gcry_cipher_handle): Add field HANDLE_OFFSET.
+ (gcry_cipher_open): Take care of increased alignment requirements.
+ (gcry_cipher_close): Ditto.
+
+2007-11-28 Werner Koch <wk@g10code.com>
+
+ * sha256.c (asn224): Fixed wrong template. It happened due to a
+ bug in RFC4880. SHA-224 is not in the stable version of libgcrypt
+ so the consequences are limited to users of this devel version.
+
+2007-10-31 Werner Koch <wk@g10code.com>
+
+ * ac.c (gcry_ac_data_new): Remove due to the visibility wrapper.
+ (gcry_ac_data_destroy, gcry_ac_data_copy, gcry_ac_data_length)
+ (gcry_ac_data_set, gcry_ac_data_get_name, gcry_ac_data_get_index)
+ (gcry_ac_data_to_sexp, gcry_ac_data_from_sexp)
+ (gcry_ac_data_clear, gcry_ac_io_init, gcry_ac_open)
+ (gcry_ac_close, gcry_ac_key_init, gcry_ac_key_pair_generate)
+ (gcry_ac_key_pair_extract, gcry_ac_key_destroy)
+ (gcry_ac_key_pair_destroy, gcry_ac_key_data_get)
+ (gcry_ac_key_test, gcry_ac_key_get_nbits, gcry_ac_key_get_grip)
+ (gcry_ac_data_encrypt, gcry_ac_data_decrypt, gcry_ac_data_sign)
+ (gcry_ac_data_verify, gcry_ac_data_encode, gcry_ac_data_decode)
+ (gcry_ac_mpi_to_os, gcry_ac_mpi_to_os_alloc, gcry_ac_os_to_mpi)
+ (gcry_ac_data_encrypt_scheme, gcry_ac_data_decrypt_scheme)
+ (gcry_ac_data_sign_scheme, gcry_ac_data_verify_scheme)
+ (gcry_ac_io_init_va): Ditto.
+ (gcry_ac_id_to_name, gcry_ac_name_to_id): Remove as these
+ deprecated functions are now implemented by visibility.c.
+
+2007-10-26 Werner Koch <wk@g10code.com>
+
+ * rndw32.c: Disable debug flag.
+
+2007-10-25 Werner Koch <wk@g10code.com>
+
+ * rndw32.c: Updated from current cryptlib snapshot and modified
+ for our use. Removed support for pre-NT systems.
+ (slow_gatherer_windows95): Remove.
+ (_gcry_rndw32_gather_random): Require an NT platform.
+ (init_system_rng, read_system_rng, read_mbm_data): New.
+ (slow_gatherer_windowsNT): Rename to ...
+ (slow_gatherer): .. this. Read system RNG and MBM.
+ (registry_poll): New with code factored out from slow_gatherer.
+
+2007-08-23 Werner Koch <wk@g10code.com>
+
+ * random.c (pool_filled_counter): New.
+ (add_randomness): Use it.
+
+2007-08-22 Werner Koch <wk@g10code.com>
+
+ * rndw32.c, rndunix.c: Switched to LGPL.
+
+2007-05-30 Werner Koch <wk@g10code.com>
+
+ * camellia.h, camellia.c: Replace by new LGPL version and adjusted
+ camellia.h.
+
+2007-05-09 Marcus Brinkmann <marcus@g10code.de>
+
+ * ac.c (_gcry_ac_io_init_va, _gcry_ac_io_write, _gcry_ac_io_read):
+ Adjust users of gcry_ac_io_t because union is not anonymous
+ anymore.
+
+2007-05-02 Werner Koch <wk@g10code.com>
+
+ * camellia-glue.c (camellia_setkey, camellia_encrypt)
+ (camellia_decrypt): Recalculated used stack size in called
+ functions.
+ * camellia.h: Redefine external symbols.
+
+2007-05-02 David Shaw <dshaw@jabberwocky.com>
+
+ * Makefile.am, cipher.c: Add Camellia.
+
+ * camellia-glue.c: New. The necessary glue to interface libgcrypt
+ to the stock NTT Camellia distribution.
+
+ * camellia.h, camellia.c: The stock NTT Camellia distribution
+ (GPL).
+
+2007-04-30 David Shaw <dshaw@jabberwocky.com>
+
+ * cipher.c: Use #if instead of #ifdef as configure defines the
+ USE_cipher defines as 0 for disabled.
+
+2007-04-30 Werner Koch <wk@g10code.com>
+
+ * rndegd.c (_gcry_rndegd_set_socket_name): New.
+
+2007-04-30 Marcus Brinkmann <marcus@g10code.de>
+
+ * ecc.c (ec2os): Fix relocation of short numbers.
+
+ * ecc.c (generate_key): Do not allocate D, which will be allocated
+ by GEN_K. Remove G. Fix test if g_x, g_y resp. q_x, q_y are
+ requested.
+ (_gcry_ecc_generate): Release unneeded members of SK.
+ * pubkey.c (sexp_to_key): Release NAME.
+
+2007-04-28 Marcus Brinkmann <marcus@g10code.de>
+
+ * ac.c (gcry_ac_mpi): Remove member NAME_PROVIDED.
+ (ac_data_mpi_copy, _gcry_ac_data_set, _gcry_ac_data_get_name)
+ (_gcry_ac_data_get_index, ac_data_construct): Adjust handling of
+ NAME accordingly.
+
+2007-04-20 Werner Koch <wk@g10code.com>
+
+ * ecc.c (domain_parms): Add standard brainpool curves.
+
+2007-04-18 Werner Koch <wk@g10code.com>
+
+ * ecc.c (generate_curve): Implement alias mechanism.
+
+ * pubkey.c (sexp_elements_extract_ecc): New.
+ (sexp_to_key): Add special case for ecc.
+ (sexp_to_key, sexp_to_sig, sexp_to_enc, gcry_pk_genkey): Replace
+ name_terminated stuff by a call to _gcry_sexp_nth_string.
+ (gcry_pk_get_keygrip): Ditto.
+
+2007-04-16 Werner Koch <wk@g10code.com>
+
+ * ecc.c (_gcry_ecc_generate): Renamed DUMMY to CURVE and use it.
+
+2007-04-13 Marcus Brinkmann <marcus@g10code.de>
+
+ * ac.c (ac_data_construct): Cast const away to suppress compiler
+ warning.
+
+ * ecc.c (ecc_generate): Avoid compiler warning for unused argument
+ DUMMY.
+ (ecc_verify): Avoid compiler warning for unused arguments CMP and
+ OPAQUEV.
+
+2007-04-06 Werner Koch <wk@g10code.com>
+
+ * sha1.c (oid_spec_sha1): Add another oid from X9.62.
+
+2007-03-28 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (gcry_pk_genkey): Do not issue misc-key-info if it is
+ empty.
+ (gcry_pk_genkey): New parameter "curve".
+
+ * ecc.c: Entirely rewritten with only a few traces of the old
+ code left.
+ (_gcry_ecc_generate): New.
+ (generate_key) New arg NAME.
+ (generate_curve): Ditto. Return actual number of NBITS.
+
+2007-03-26 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (gcry_pk_genkey): Increase size of SKEY array and add a
+ runtime bounds check.
+
+2007-03-23 Werner Koch <wk@g10code.com>
+
+ * ecc.c (ecc_ctx_init, ecc_ctx_free, ecc_mod, ecc_mulm): New.
+ (duplicate_point, sum_points, escalar_mult): Don't use a
+ copy of base->p. Replaced all mpi_mulm by ecc_mulm so that we can
+ experiment with different algorithms.
+ (generate_key, check_secret_key, sign, verify): Initialize a
+ computation context for use by ecc_mulm.
+
+2007-03-22 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (pubkey_table): Initialize ECC.
+ * Makefile.am (EXTRA_libcipher_la_SOURCES): Add ecc.c.
+ * ecc.c: New. Heavily reformatted and changed for use in libgcrypt.
+ (point_init): New.
+ (escalar_mult): Make arg R the first arg to be similar to the mpi
+ functions.
+ (duplicate_point): Ditto
+ (sum_points): Ditto
+ (sign, verify): Remove unneeded copy operations.
+ (sum_points): Removed memory leaks and optimized some compares.
+ (verify): Simplified input check.
+
+2007-03-14 Werner Koch <wk@g10code.com>
+
+ * random.c (MASK_LEVEL): Removed macro as it was used only at one
+ place. Open coded it there.
+ (gcry_randomize, _gcry_update_random_seed_file)
+ (_gcry_fast_random_poll): Factor lock code out to ..
+ (lock_pool, unlock_pool): .. new.
+ (initialize): Lock the pool while allocating.
+ (read_random_source, do_fast_random_poll): Moved initialization to ...
+ (initialize): .. here.
+ (_gcry_enable_quick_random_gen): No more need for initialization.
+ (is_initialized): Moved this global flag to ..
+ (initialize): .. here and changed all users to unconditionally call
+ initialize.
+ (add_randomness): Remove initialization here. It simply can't
+ happen.
+
+ * random.c (enum random_origins): Moved to ..
+ * rand-internal.h: .. here.
+ * rndunix.c (_gcry_rndunix_gather_random): Use enum in prototype
+ for ORIGIN and renamed REQUESTOR to ORIGIN.
+ * rndegd.c (_gcry_rndegd_gather_random): Ditto.
+ * rndlinux.c (_gcry_rndlinux_gather_random): Ditto.
+ * rndw32.c (_gcry_rndw32_gather_random): Ditto.
+ (_gcry_rndw32_gather_random_fast): Ditto.
+
+2007-03-13 Werner Koch <wk@g10code.com>
+
+ * random.c (enum random_origins): New.
+ (add_randomness): Renamed arg SOURCE to ORIGIN.
+ (read_random_source): Renamed arg REQUESTOR to ORIGIN.
+ (getfnc_gather_random): Removed static variable because this
+ function is only called once and thus we don't need this
+ optimization.
+ (_gcry_quick_random_gen): Removed and replaced by..
+ (_gcry_enable_quick_random_gen): .. this. It is only used to
+ enable it and it does not make sense to disable it later. Changed
+ the only caller too.
+ (get_random_bytes): Removed.
+ (gcry_random_bytes, gcry_random_bytes_secure): Implement in terms
+ of gcry_randomize.
+ * random-daemon.c (_gcry_daemon_get_random_bytes): Removed.
+
+2007-02-23 Werner Koch <wk@g10code.com>
+
+ * elgamal.c (generate): Removed unused variable TEMP.
+ (test_keys): New arg NODIE.
+ (generate_using_x, _gcry_elg_generate_using_x): New.
+ * pubkey.c (pubkey_generate): New arg XVALUE and direct call to
+ the new elgamal generate function.
+ (gcry_pk_genkey): Parse the new "xvalue" tag.
+
+2007-02-22 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (sexp_data_to_mpi): Handle dynamically allocated
+ algorithms. Suggested by Neil Dunbar. Fixes bug#596.
+
+ * rndw32.c (_gcry_rndw32_gather_random_fast): Make it return void.
+
+ * cipher.c (gcry_cipher_algo_name): Simplified.
+
+ * random.c: Use the daemon only if compiled with USE_RANDOM_DAEMON.
+
+ * Makefile.am (libcipher_la_SOURCES): Build random-daemon support
+ only if requested.
+
+2007-02-21 Werner Koch <wk@g10code.com>
+
+ * random.c (rndpool, keypool): Make unsigned.
+ (mix_pool): Change char* variables to unsigned char*.
+ (gcry_randomize): Make arg BUFFER a void*.
+ (gcry_create_nonce): Ditto.
+
+ * rmd160.c (gcry_rmd160_mixblock): Make BUFFER a void*.
+ (_gcry_rmd160_hash_buffer): Make OUTBUF and BUFFER void*.
+ * sha1.c (_gcry_sha1_hash_buffer): Ditto.
+
+ * cipher.c (gcry_cipher_encrypt, gcry_cipher_decrypt): Change
+ buffer args to void*.
+ (gcry_cipher_register): Make ALGORITHM_ID a int *.
+
+ * md.c (md_start_debug): Make SUFFIX a const char*. Use snprintf.
+ (gcry_md_debug): New.
+ (gcry_md_ctl): Changed arg BUFFER from unsigned char*.
+
+ * md.c (md_write): Make INBUF a const void*.
+ (gcry_md_write): Remove needless cast.
+ * crc.c (crc32_write): Make INBUF a const void*
+ (update_crc32, crc24rfc2440_write): Ditto.
+ * sha512.c (sha512_write, transform): Ditto.
+ * sha256.c (sha256_write, transform): Ditto.
+ * rmd160.c (rmd160_write, transform): Ditto.
+ * md5.c (md5_write, transform): Ditto.
+ * md4.c (md4_write, transform): Ditto.
+ * sha1.c (sha1_write, transform): Ditto.
+
+ * tiger.c (tiger_write, transform): Ditto.
+ * whirlpool.c (whirlpool_write, whirlpool_add, transform): Ditto.
+
+ * elgamal.c (elg_names): Change to a const*.
+ * dsa.c (dsa_names): Ditto.
+ * rsa.c (rsa_names): Ditto.
+ * pubkey.c (gcry_pk_lookup_func_name): Make ALIASES a const.
+
+2007-02-20 Werner Koch <wk@g10code.com>
+
+ * rndlinux.c (open_device): Remove unused arg MINOR.
+
+2007-01-30 Werner Koch <wk@g10code.com>
+
+ * sha256.c (oid_spec_sha256): Add alias from pkcs#1.
+ * sha512.c (oid_spec_sha512): Ditto.
+ (oid_spec_sha384): Ditto.
+
+2006-12-18 Werner Koch <wk@g10code.com>
+
+ * rndlinux.c (set_cloexec_flag): New.
+ (open_device): Set close-on-exec flags. Suggested by Max
+ Kellermann. Fixes Debian#403613.
+
+ * Makefile.am (AM_CPPFLAGS, AM_CFLAGS): Split and merged
+ Moritz' changes.
+ (INCLUDES): Removed.
+
+2006-11-30 Werner Koch <wk@g10code.com>
+
+ * serpent.c (byte_swap_32): Remove trailing semicolon.
+
+2006-11-15 Werner Koch <wk@g10code.com>
+
+ * Makefile.am (INCLUDES): Include ../src/
+
+2006-11-03 Werner Koch <wk@g10code.com>
+
+ * random.c [HAVE_GETTIMEOFDAY]: Included sys/time.h and not
+ sys/times.h. Reported by Rafaël Carré.
+
+2006-11-05 Moritz Schulte <moritz@g10code.com>
+
+ * Makefile.am (AM_CFLAGS): Added -I$(top_builddir)/src so that the
+ new gcrypt.h is used, not the one installed in the system.
+
+2006-10-25 Werner Koch <wk@g10code.com>
+
+ * primegen.c (prime_generate_internal): Tweaked use of secure
+ memory and entropy use. Save unused primes from the pool. Allocate
+ at least a pool of 30.
+ (save_pool_prime, get_pool_prime): New.
+
+2006-10-23 Werner Koch <wk@g10code.com>
+
+ * ac.c (_gcry_ac_data_from_sexp): Reset sexp_tmp for failsafe
+ means. Release sexp_cur if needed. Reported by Dirk Stoecker.
+
+ * pubkey.c (pubkeys_registered_lock): Initialized it. It is not
+ really needed because this is a mere initialization to 0 anyway.
+ Noted by Victor Stinner.
+
+2006-10-17 Werner Koch <wk@g10code.com>
+
+ * dsa.c (_gcry_dsa_generate2): New.
+ (generate): New arg QBITS. Add sanity checks for reasonable qbits
+ and nbits.
+ * pubkey.c (gcry_pk_genkey): Parse a qbits element.
+ (pubkey_generate): New arg QBITS. Pass it to the DSA generation.
+
+2006-10-05 Werner Koch <wk@g10code.com>
+
+ * md.c (gcry_md_algo_info) <get_asnoid>: Check that the algo is
+ available.
+
+2006-10-04 David Shaw <dshaw@jabberwocky.com> (wk)
+
+ * tiger.c (round): Rename to tiger_round as gcc 4 has a built-in
+ round function that this conflicts with.
+
+2006-09-11 Werner Koch <wk@g10code.com>
+
+ * rndw32.c (slow_gatherer_windowsNT): While adding data use the
+ size of the diskPerformance and not its address. Has been fixed in
+ GnuPG more than a year ago. Noted by Lee Fisher.
+
+2006-08-30 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (sexp_data_to_mpi): Need to allow "ripemd160" here as
+ this is the canonical name.
+
+2006-08-29 Hye-Shik Chang <perky@FreeBSD.org> (wk)
+
+ * seed.c: New.
+
+2006-08-03 Werner Koch <wk@g10code.com>
+
+ * random-daemon.c (_gcry_daemon_initialize_basics): Don't
+ initialize the socket. Remove arg SOCKETNAME.
+ (connect_to_socket): Make sure that daemon is set to -1 on error.
+ (call_daemon): Initialize the socket on the first call.
+ (_gcry_daemon_randomize, _gcry_daemon_get_random_bytes)
+ (_gcry_daemon_create_nonce): New arg SOCKETNAME.
+ * random.c (initialize): Call new daemon initializator.
+ (get_random_bytes, gcry_randomize, gcry_create_nonce): Pass socket
+ name to daemon call and reset allow_daemon on failure.
+
+2006-07-26 Werner Koch <wk@g10code.com>
+
+ * rmd160.c (_gcry_rmd160_mixblock): Add cast to transform call.
+
+ * blowfish.c (selftest): Cast string to usnigned char*.
+
+ * primegen.c (prime_generate_internal): Cast unsigned/char*
+ mismatch in calling m_out_of_n.
+ (is_prime): Changed COUNT to unsigned int *.
+
+ * ac.c (_gcry_ac_data_copy): Initialize DATA_MPIS.
+
+ * random.c (gcry_create_nonce): Update the pid after a fork.
+ Reported by Uoti Urpala.
+
+2006-07-04 Marcus Brinkmann <marcus@g10code.de>
+
+ * sha512.c: Fix typo in copyright notice.
+
+2006-06-21 Werner Koch <wk@g10code.com>
+
+ * rsa.c (_gcry_rsa_generate): Replace xcalloc by calloc.
+ * pubkey.c (gcry_pk_encrypt, gcry_pk_sign): Ditto.
+ (sexp_to_key, sexp_to_sig, sexp_to_enc, gcry_pk_encrypt)
+ (gcry_pk_sign, gcry_pk_genkey, gcry_pk_get_keygrip): Ditto.
+ * md.c (md_copy): Ditto.
+
+2006-04-22 Moritz Schulte <moritz@g10code.com>
+
+ * random-daemon.c (_gcry_daemon_initialize_basics): New argument:
+ SOCKETNAME. Passing on to connect_to_socket() if non-NULL.
+ (connect_to_socket, writen, readn, call_daemon): New functions.
+ (_gcry_daemon_randomize, _gcry_daemon_get_random_bytes)
+ (_gcry_daemon_create_nonce): Call call_daemon().
+ (RANDOM_DAEMON_SOCKET): New symbol.
+ (daemon_socket): New static variable.
+
+ * random.h (_gcry_daemon_initialize_basics): New parameter:
+ SOCKETNAME.
+ (_gcry_set_random_daemon_socket): New declaration.
+
+ * random.c (initialize_basics): Pass DAEMON_SOCKET_NAME to
+ _gcry_daemon_initialize_basics.
+ (_gcry_set_random_daemon_socket): New function, setting
+ DAEMON_SOCKET_NAME.
+
+2006-04-01 Moritz Schulte <moritz@g10code.com>
+
+ * ac.c (eme_pkcs_v1_5_encode): Use KEY_SIZE directly, no need to
+ call gcry_ac_key_get_nbits.
+ (eme_pkcs_v1_5_decode): Likewise.
+ (ac_es_dencode_prepare_pkcs_v1_5): Fill options_em structure with
+ key_size.
+ (_gcry_ac_data_dump, gcry_ac_data_dump): New functions.
+ (_gcry_ac_data_to_sexp, _gcry_ac_data_from_sexp): More or less
+ rewritten; changed S-Expression format so that it matches the one
+ used in pubkey.c.
+
+2006-03-15 Werner Koch <wk@g10code.com>
+
+ * random-daemon.c: New.
+ * random.c (_gcry_use_random_daemon): New.
+ (get_random_bytes, gcry_randomize, gcry_create_nonce): Try
+ diverting to the daemon functions.
+
+2006-03-14 Werner Koch <wk@g10code.com>
+
+ * random.c (lock_seed_file): New.
+ (read_seed_file, _gcry_update_random_seed_file): Use it.
+
+ * random.c (gcry_create_nonce): Detect a fork and re-seed.
+ (read_pool): Fixed the fork detection; it used to work only for
+ multi-threaded processes.
+
+2006-03-12 Brad Hards <bradh@frogmouth.net> (wk)
+
+ * md.c (md_open): Use new variable macpads_Bsize instead of
+ hardwiring the block size. Changed at all places.
+
+2006-03-10 Brad Hards <bradh@frogmouth.net> (wk, patch 2005-04-22)
+
+ * md.c, sha256.c: Add support for SHA-224.
+ (sha224_init): New.
+
+2006-01-18 Brad Hards <bradh@frogmouth.net> (wk 2006-03-07)
+
+ * cipher.c (cipher_encrypt, cipher_decrypt, do_ofb_encrypt)
+ (do_ofb_decrypt, gcry_cipher_open): Implement Output Feedback Mode.
+
+2005-11-02 Moritz Schulte <moritz@g10code.com>
+
+ * pubkey.c (gcry_pk_algo_name): Return "?" instead of NULL for
+ unknown algorithm IDs.
+ * cipher.c (cipher_algo_to_string): Likewise.
+
+2005-11-01 Moritz Schulte <moritz@g10code.com>
+
+ * pubkey.c (gcry_pk_algo_info): Don't forget to break after switch
+ case.
+
+2005-09-19 Werner Koch <wk@g10code.com>
+
+ * dsa.c (generate): Add preliminary support for 2 and 4 keys.
+ Return an error code if the key size is not supported.
+ (_gcry_dsa_generate): Return an error.
+
+2005-08-22 Werner Koch <wk@g10code.com>
+
+ * primegen.c (check_prime): New arg RM_ROUNDS.
+ (prime_generate_internal): Call it here with 5 rounds as used
+ before.
+ (gcry_prime_check): But here with 64 rounds.
+ (is_prime): Make sure never to use less than 5 rounds.
+
+2005-04-16 Moritz Schulte <moritz@g10code.com>
+
+ * ac.c (_gcry_ac_init): New function.
+
+2005-04-12 Moritz Schulte <moritz@g10code.com>
+
+ * ac.c (_gcry_ac_io_write, _gcry_ac_io_read): Initialize err to
+ make the compiler happy.
+ Always use errno, now that gcry_malloc() is guaranteed to set
+ errno on failure.
+ (_gcry_ac_data_to_sexp): Don't forget to goto out after error in
+ loop.
+ (_gcry_ac_data_to_sexp): Remove unused variable: mpi_list;
+ (_gcry_ac_data_to_sexp): Always deallocate sexp_buffer.
+ (_gcry_ac_data_from_sexp): Don't forget to initialize data_set_new.
+ (_gcry_ac_data_from_sexp): Handle special case, which is
+ necessary, since gcry_sexp_nth() does not distinguish between
+ "element does not exist" and "element is the empty list".
+ (_gcry_ac_io_init_va): Use assert to make sure that mode and type
+ are correct.
+ Use gcry_error_t types where gcry_err_code_t types have been used
+ before.
+
+2005-04-11 Moritz Schulte <moritz@g10code.com>
+
+ * ac.c (_gcry_ac_data_sign_scheme): Don't forget to initialize
+ buffer.
+
+ * whirlpool.c: New file.
+ * md.c (digest_table): Add whirlpool.
+ * Makefile.am (EXTRA_libcipher_la_SOURCES): Added: whirlpool.c.
+
+2005-03-30 Moritz Schulte <moritz@g10code.com>
+
+ * ac.c (_gcry_ac_data_from_sexp): Use length of SEXP_CUR, not
+ length of SEXP; do not forget to set SEXP_TMP to NULL after it has
+ been released.
+
+ (struct gcry_ac_mpi): New member: name_provided.
+ (_gcry_ac_data_set): Rename variable `name_final' to `name_cp';
+ remove const qualifier; change code to not cast away const
+ qualifiers; use name_provided member as well.
+ (_gcry_ac_data_set, _gcry_ac_data_get_name): Use name_provided
+ member of named mpi structure.
+
+ (gcry_ac_name_to_id): Do not forget to initialize err.
+ (_gcry_ac_data_get_index): Do not forget to initialize mpi_return;
+ use gcry_free() instead of free(); remove unnecessary cast; rename
+ mpi_return and name_return to mpi_cp and name_cp; adjust code.
+ (ac_data_mpi_copy): Do not cast away const qualifier.
+ (ac_data_values_destroy): Likewise.
+ (ac_data_construct): Likewise.
+
+ (ac_data_mpi_copy): Initialize flags to GCRY_AC_FLAG_DEALLOC.
+ (ac_data_extract): Use GCRY_AC_FLAG_DEALLOC instead of
+ GCRY_AC_FLAG_COPY.
+
+ (_gcry_ac_io_init_va, _gcry_ac_io_init, gcry_ac_io_init)
+ (gcry_ac_io_init_va, _gcry_ac_io_write, _gcry_ac_io_read)
+ (_gcry_ac_io_read_all, _gcry_ac_io_process): New functions.
+ (gcry_ac_em_dencode_t): Use gcry_ac_io_t in prototype instead of
+ memory strings directly; adjust encode/decode functions to use io
+ objects.
+ (emsa_pkcs_v1_5_encode_data_cb): New function ...
+ (emsa_pkcs_v1_5_encode): ... use it here.
+ (ac_data_dencode): Use io objects.
+ (_gcry_ac_data_encode, _gcry_ac_data_decode, gcry_ac_data_encode)
+ (gcry_ac_data_decode): Likewise.
+ (_gcry_ac_data_encrypt_scheme, gcry_ac_data_encrypt_scheme)
+ (_gcry_ac_data_decrypt_scheme, gcry_ac_data_decrypt_scheme)
+ (_gcry_ac_data_sign_scheme, gcry_ac_data_sign_scheme)
+ (_gcry_ac_data_verify_scheme, gcry_ac_data_verify_scheme):
+ Likewise.
+
+2005-03-23 Werner Koch <wk@g10code.com>
+
+ * rndw32.c (_gcry_rndw32_gather_random_fast): While adding data
+ use the size of the object and not that of its address. Bug
+ reported by Sascha Kiefer.
+
+2005-03-19 Moritz Schulte <moritz@g10code.com>
+
+ * cipher.c (do_cbc_encrypt): Be careful not to overwrite data
+ which is to be used later on. This happened in case CTS is
+ enabled and OUTBUF is equal to INBUF.
+
+2005-02-25 Werner Koch <wk@g10code.com>
+
+ * pubkey.c (gcry_pk_get_keygrip): Allow for shadowed-private-key.
+
+2005-02-13 Moritz Schulte <moritz@g10code.com>
+
+ * serpent.c: Updated from 1.2 branch:
+
+ s/u32_t/u32/ and s/byte_t/byte/. To match what we have always
+ used and are using in all other files too.
+ (serpent_test): Moved prototype out of a function.
+
+2005-02-07 Moritz Schulte <moritz@g10code.com>
+
+ * ac.c: Major parts rewritten.
+ * pubkey.c (_gcry_pk_get_elements): New function.
+
+2004-12-09 Werner Koch <wk@g10code.com>
+
+ * serpent.c (serpent_setkey): Moved prototype of serpent_test to
+ outer scope.
+
+2004-09-11 Moritz Schulte <moritz@g10code.com>
+
+ * pubkey.c (pubkey_table): Added an alias entry for GCRY_PK_ELG_E.
+
+2004-08-23 Moritz Schulte <moritz@g10code.com>
+
+ * ac.c: Do not include <assert.h>.
+ * rndegd.c: Likewise.
+ * sha1.c: Likewise.
+ * rndunix.c: Likewise.
+ * rndlinux.c: Likewise.
+ * rmd160.c: Likewise.
+ * md5.c: Likewise.
+ * md4.c: Likewise.
+ * cipher.c: Likewise.
+ * crc.c: Likewise.
+ * blowfish.c: Likewise.
+
+ * pubkey.c (dummy_generate, dummy_check_secret_key)
+ (dummy_encrypt, dummy_decrypt, dummy_sign, dummy_verify): Return
+ err code GPG_ERR_NOT_IMPLEMENTED instead of aborting through
+ log_bug().
+ (dummy_get_nbits): Return 0 instead of aborting through log_bug().
+
+2004-08-19 Werner Koch <wk@g10code.de>
+
+ * pubkey.c (sexp_data_to_mpi): Changed the zero random byte
+ substituting code to actually do clever things. Thanks to
+ Matthias Urlichs for noting the implementation problem.
+
+2004-08-09 Moritz Schulte <moritz@g10code.com>
+
+ * pubkey.c (gcry_pk_sign): Fixed memory leak; fix provided by
+ Modestas Vainius.
+
+2004-07-16 Werner Koch <wk@gnupg.org>
+
+ * rijndael.c (do_encrypt): Fix alignment problem. Bugs found by
+ Matthias Urlichs.
+ (do_decrypt): Ditto.
+ (keySched, keySched2): Use 2 macros along with unions in the key
+ schedule context.
+
+2004-07-14 Moritz Schulte <moritz@g10code.com>
+
+ * rsa.c (_gcry_rsa_decrypt): Don't forget to free "a". Thanks to
+ Nikos Mavroyanopoulos.
+
+2004-05-09 Werner Koch <wk@gnupg.org>
+
+ * random.c (read_pool): Mix the PID in to better protect after a
+ fork.
+
+2004-07-04 Moritz Schulte <moritz@g10code.com>
+
+ * serpent.c: Use "u32_t" instead of "unsigned long", do not
+ declare S-Box variables as "register". Fixes failure on
+ OpenBSD/sparc64, reported by Nikolay Sturm.
+
+2004-05-07 Werner Koch <wk@gnupg.org>
+
+ * random.c (initialize): Factored out some code to ..
+ (initialize_basics): .. new function.
+ (_gcry_random_initialize): Just call initialize_basics unless the
+ new arg FULL is set to TRUE.
+ (_gcry_fast_random_poll): Don't do anything unless the random
+ system has been really initialized.
+
+2004-05-07 Moritz Schulte <moritz@g10code.de>
+
+ * ac.c (gcry_ac_open): Do not dereference NULL pointer. Reported
+ by Umberto Salsi.
+
+2004-02-20 Werner Koch <wk@gnupg.org>
+
+ * primegen.c (check_prime): New args CB_FUNC and CB_ARG; call them
+ at different stages. Pass these arguments through all callers.
+
+2004-02-06 Werner Koch <wk@gnupg.org>
+
+ * des.c: Add a new OID as used by pkcs#12.
+
+ * rfc2268.c: New. Taken from libgcrypt.
+ * cipher.c: Setup the rfc2268 algorithm.
+
+2004-01-25 Moritz Schulte <mo@g10code.com>
+
+ * primegen.c (prime_generate_internal): Do not forget to free
+ `q_factor'; fixed by Brieuc Jeunhomme.
+ (prime_generate_internal): Do not forget to free `prime'.
+
+2004-01-14 Moritz Schulte <mo@g10code.com>
+
+ * ac.c (gcry_ac_data_set): New argument: flags; slightly
+ rewritten.
+ (gcry_ac_data_get_name, gcry_ac_data_get_index): Likewise.
+ (gcry_ac_key_pair_generate): New argument: misc_data; modified
+ order of arguments.
+ (gcry_ac_key_test): New argument: handle.
+ (gcry_ac_key_get_nbits, gcry_ac_key_get_grip): Likewise.
+ Use GCRY_AC_FLAG_NO_BLINDING instead of
+ GCRY_AC_DATA_FLAG_NO_BLINDING.
+ (gcry_ac_mpi): New member: flags.
+ (gcry_ac_data_search, gcry_ac_data_add): Removed functions.
+
+2003-12-22 Werner Koch <wk@gnupg.org>
+
+ * primegen.c (is_prime): Release A2.
+
+2003-12-19 Werner Koch <wk@gnupg.org>
+
+ * md.c: Moved a couple of functions down below the data structure
+ definitions.
+ (struct gcry_md_context): New field ACTUAL_HANDLE_SIZE.
+ (md_open): Set it here.
+ (struct gcry_md_list): New field ACTUAL_STRUCT_SIZE.
+ (md_enable): Set it here.
+ (md_close): Wipe the context memory.
+ * cipher.c (struct gcry_cipher_handle): New field ACTUAL_HANDLE_SIZE.
+ (gcry_cipher_open): Set it here.
+ (gcry_cipher_close): Use it to always wipe out the handle data.
+
+ * ac.c (gcry_ac_open): Make sure HANDLE gets initialized even when
+ the function is not successful.
+ (gcry_ac_close): Allow a NULL handle.
+ (gcry_ac_key_destroy, gcry_ac_key_pair_destroy): Ditto.
+ (gcry_ac_key_get_grip): Return INV_OBJ on error.
+
+ * primegen.c (prime_generate_internal): Fixed error code for
+ failed malloc. Replaced the !err if chain by gotos.
+ (gcry_prime_group_generator): Remove the extra sanity check.
+
+ * md.c: Minor code and comment cleanups.
+
+2003-12-16 Werner Koch <wk@gnupg.org>
+
+ * primegen.c (gen_prime): Doc fix. Thanks to Newton Hammet.
+
+2003-12-11 Werner Koch <wk@gnupg.org>
+
+ * rndunix.c (slow_poll): Don't use #warning but #error.
+
+ * rndegd.c: Changed indentation.
+ (my_make_filename): Removed the var_arg cruft because we
+ don't need it here. Changed caller.
+
+ * rndlinux.c: Changed indentation.
+ (open_device): Remove the superfluous stat call and clarify
+ comment.
+
+ * rsa.c: Changed indentation.
+ (secret): Use the standard algorithm if p, q and u are not
+ available.
+ (rsa_blind, rsa_unblind): Renamed from _gcry_rsa_blind,
+ _gcry_rsa_unblind and moved more to the top.
+
+ * md4.c: Changed indentation. Removed unnecessary casts.
+ * md5.c, rmd160.c, sha1.c, tiger.c: Ditto.
+ * rijndael.c, twofish.c: Ditto.
+ * serpent.c: Removed unnecessary casts.
+ * sha256.c, sha512.c: Ditto.
+
+2003-12-09 Werner Koch <wk@gnupg.org>
+
+ * dsa.c: Unified indentation style.
+ * elgamal.c: Ditto.
+ * des.c (des_key_schedule): Code beautifications.
+ * blowfish.c: Changed indentation style.
+ * cast5.c (do_cast_setkey): Ditto.
+
+ * pubkey.c (gcry_pk_encrypt): Replaced the chain of if(!err) tests
+ by straightforward gotos. Other cleanups.
+ (gcry_pk_decrypt): Ditto.
+ (gcry_pk_sign): Ditto.
+ (gcry_pk_verify): Ditto.
+ (gcry_pk_genkey): Ditto. Use strtoul instead of strtol.
+ (gcry_pk_ctl): Use GPG_ERR_INV_ARG to indicate bad arguments.
+
+2003-12-07 Werner Koch <wk@gnupg.org>
+
+ * pubkey.c (gcry_pk_register_default): Undef the helper macro.
+ (gcry_pk_map_name): Allow NULL for string.
+ (sexp_to_key): Use memcpy and not strncpy. Use gcry_free and not
+ free.
+ (sexp_to_sig): Ditto.
+ (sexp_to_enc): Ditto. Replaced the chain of if(!err) tests by
+ straightforward gotos.
+
+2003-12-05 Werner Koch <wk@gnupg.org>
+
+ * cipher.c: Documentation cleanups.
+ (gcry_cipher_mode_from_oid): Allow NULL for STRING.
+
+2003-12-03 Werner Koch <wk@gnupg.org>
+
+ * elgamal.c (sign, do_encrypt, gen_k): Make sure that a small K is
+ only used for encryption.
+
+2003-11-18 Werner Koch <wk@gnupg.org>
+
+ * random.h (rndw32_set_dll_name): Removed unused prototype.
+
+ * Makefile.am (EXTRA_DIST): Added Manifest.
+
+2003-11-11 Werner Koch <wk@gnupg.org>
+
+ * Manifest: New.
+
+2003-11-04 Werner Koch <wk@gnupg.org>
+
+ * md.c (gcry_md_hash_buffer): Use shortcut for SHA1.
+ * sha1.c (_gcry_sha1_hash_buffer): New.
+
+ * random.c: Reformatted most functions.
+ (mix_pool): Moved the failsafe_digest from global
+ scope to here.
+ (do_fast_random_poll): Use the generic functions even if a fast
+ gathering function has been used.
+ (read_pool): Detect a fork and retry.
+ (gcry_randomize, get_random_bytes): Don't distinguish anymore
+ between weak and strong random.
+ (gcry_create_nonce): New.
+
+2003-10-31 Werner Koch <wk@gnupg.org>
+
+ * rndw32.c (slow_gatherer_windowsNT): Use a plain buffer for the
+ disk performance values and not the W32 API structure.
+
+ * dsa.c (verify): s/exp/ex/ due to shadowing of a builtin.
+ * elgamal.c (verify): Ditto.
+
+ * ac.c (gcry_ac_data_get_index): s/index/idx/
+ (gcry_ac_data_copy_internal): Remove the cast in _gcry_malloc.
+ (gcry_ac_data_add): Must use gcry_realloc instead of realloc.
+ * pubkey.c (sexp_elements_extract): s/index/idx/ as tribute to the
+ forehackers.
+ (gcry_pk_encrypt): Removed shadowed definition of I. Reordered
+ arguments to malloc for clarity.
+ (gcry_pk_sign, gcry_pk_genkey): Ditto.
+ * primegen.c (prime_generate_internal): s/random/randomlevel/.
+
+2003-10-27 Moritz Schulte <mo@g10code.com>
+
+ * pubkey.c (gcry_pk_encrypt): Don't forget to deallocate pkey.
+
+2003-10-27 Werner Koch <wk@gnupg.org>
+
+ * random.c (gcry_random_add_bytes): Return if buflen is zero to
+ avoid a gcc warning about an unused parameter.
+ (MASK_LEVEL): Simplified; now works for signed and unsigned
+ w/o warnings.
+
+ * md.c (md_start_debug): Removed the const from SUFFIX, because
+ this function is called from the control function which does not
+ require const.
+
+ Prefixed all {pubkey,digest,cipher}_spec_* global variables with
+ _gcry_.
+
+ * ac.c (ac_key_identifiers): Made static.
+
+ * random.c (getfnc_gather_random,getfnc_fast_random_poll): Move
+ prototypes to ..
+ * rand-internal.h: .. here
+ * random.c (getfnc_gather_random): Include rndw32 gatherer.
+ * rndunix.c, rndw32.c, rndegd.c: Include them here.
+ * rndlinux.c (_gcry_rndlinux_gather_random): Prepend the _gcry_
+ prefix. Changed all callers.
+ * rndegd.c (_gcry_rndegd_gather_random): Likewise.
+ (_gcry_rndegd_connect_socket): Likewise.
+ * rndunix.c (_gcry_rndunix_gather_random): Likewise.
+ (waitpid): Made static.
+ * rndw32.c: Removed the old and unused winseed.dll cruft.
+ (_gcry_rndw32_gather_random_fast): Renamed from
+ gather_random_fast.
+ (_gcry_rndw32_gather_random): Renamed from gather_random. Note,
+ that the changes 2003-04-08 somehow got lost.
+
+ * sha512.c (sha512_init, sha384_init): Made static.
+
+ * cipher.c (do_ctr_decrypt): Removed "return" from this void
+ function.
+
+2003-10-24 Moritz Schulte <mo@g10code.com>
+
+ * serpent.c: Fix an issue on big-endian systems.
+
+ * rndw32.c: Removed IS_MODULE cruft.
+ * rndlinux.c (rndlinux_gather_random): Likewise.
+
+2003-10-10 Werner Koch <wk@gnupg.org>
+
+ * primegen.c (gen_prime): Bail out if NBITS is less than 16.
+ (prime_generate_internal): Initialize prime variable to suppress
+ compiler warning. Check pbits, initialize qbits when passed as
+ zero.
+
+ * primegen.c (prime_generate_internal): New arg
+ ALL_FACTORS. Changed all callers.
+ (gcry_prime_generate): Make the factors arg optional. Request
+ all_factors. Make sure PRIME is set to NULL even on error.
+ (gcry_prime_group_generator): New.
+ (gcry_prime_release_factors): New.
+
+2003-10-06 Werner Koch <wk@gnupg.org>
+
+ * primegen.c (gen_prime): Assert that NBITS is never zero; it
+ would cause a segv.
+
+2003-09-28 Moritz Schulte <mo@g10code.com>
+
+ * ac.c: Include "cipher.h".
+
+2003-09-27 Moritz Schulte <mo@g10code.com>
+
+ * rndegd.c (do_read): Return nread instead of nbytes; thanks to
+ Michael Caerwyn.
+
+2003-09-04 Werner Koch <wk@gnupg.org>
+
+ * pubkey.c (_gcry_pk_aliased_algo_name): New.
+ * ac.c (gcry_ac_open): Use it here.
+
+ * Makefile.am (EXTRA_libcipher_la_SOURCES): Add serpent.c
+
+2003-09-02 Moritz Schulte <mo@g10code.com>
+
+ * primegen.c (gcry_prime_check, gcry_prime_generate): New
+ functions.
+ (prime_generate_internal): New function, based on
+ _gcry_generate_elg_prime.
+ (_gcry_generate_elg_prime): Rewritten as a wrapper for
+ prime_generate_internal.
+
+2003-08-28 Werner Koch <wk@gnupg.org>
+
+ * pubkey.c (gcry_pk_encrypt): Don't include the flags list in the
+ return value. This does not make sense and breaks any programs
+ parsing the output strictly (e.g. current gpgsm).
+ (gcry_pk_encrypt): If aliases for the algorithm name exists, take
+ the first one instead of the regular name to adhere to SPKI
+ conventions.
+ (gcry_pk_genkey): Ditto.
+ (gcry_pk_sign): Ditto. Removed unused KEY_ALGO_NAME.
+
+2003-08-19 Moritz Schulte <mo@g10code.com>
+
+ * cipher.c: Add support for Serpent
+ * serpent.c: New file.
+
+2003-08-10 Moritz Schulte <moritz@g10code.com>
+
+ * rsa.c (_gcry_rsa_blind, _gcry_rsa_unblind): Declare static.
+
+2003-08-09 Timo Schulz <twoaday@freakmail.de>
+
+ * random.c (getfnc_gather_random): Don't check NAME_OF_DEV_RANDOM
+ twice; also check the NAME_OF_DEV_URANDOM device.
+
+2003-08-08 Moritz Schulte <moritz@g10code.com>
+
+ * pubkey.c (sexp_to_enc): Fixed extraction of S-Expression: do not
+ fail if no `flags' sub S-Expression is found.
+
+2003-07-27 Werner Koch <wk@gnupg.org>
+
+ * md.c (gcry_md_lookup_func_oid): Allow for empty OID lists.
+
+2003-07-23 Moritz Schulte <moritz@g10code.com>
+
+ * ac.c (gcry_ac_data_construct): New argument: include_flags; only
+ include the `flags' S-expression if include_flags is true. Adjust
+ callers. Thanks to Ralf Schneider for triggering a bug caused by a
+ `flags' sub-S-expression where it is not expected.
+
+2003-07-21 Moritz Schulte <moritz@g10code.com>
+
+ * pubkey.c (gcry_pk_lookup_func_name): Use new member name
+ `aliases' instead of `sexp_names'.
+
+ * ac.c (gcry_ac_key_data_get): New function.
+
+ * cipher.c (gcry_cipher_lookup_func_name): Fix return value.
+
+2003-07-20 Moritz Schulte <moritz@g10code.com>
+
+ * blowfish.c: Adjusted for new gcry_cipher_spec_t structure.
+ * cast5.c: Likewise.
+ * twofish.c: Likewise.
+ * arcfour.c: Likewise.
+ * rijndael.c (rijndael_oids, rijndael192_oids, rijndael256_oids):
+ New variables, adjust for new gcry_cipher_spec_t structure.
+ * des.c (oids_tripledes): New variable, adjust for new
+ gcry_cipher_spec_t structure.
+
+ * md.c (oid_table): Removed.
+
+ * tiger.c (oid_spec_tiger): New variable.
+ (digest_spec_tiger): Adjusted for new gcry_md_spec_t structure.
+
+ * sha512.c (oid_spec_sha512): New variable.
+ (digest_spec_sha512): Adjusted for new gcry_md_spec_t structure.
+
+ * sha512.c (oid_spec_sha384): New variable.
+ (digest_spec_sha384): Adjusted for new gcry_md_spec_t structure.
+
+ * sha256.c (oid_spec_sha256): New variable.
+ (digest_spec_sha256): Adjusted for new gcry_md_spec_t structure.
+
+ * sha1.c (oid_spec_sha1): New variable.
+ (digest_spec_sha1): Adjusted for new gcry_md_spec_t structure.
+
+ * rmd160.c (oid_spec_rmd160): New variable.
+ (digest_spec_rmd160): Adjusted for new gcry_md_spec_t structure.
+
+ * md5.c (oid_spec_md5): New variable.
+ (digest_spec_md5): Adjusted for new gcry_md_spec_t structure.
+
+ * md4.c (oid_spec_md4): New variable.
+ (digest_spec_md4): Adjusted for new gcry_md_spec_t structure.
+
+ * crc.c (digest_spec_crc32, digest_spec_crc32_rfc1510,
+ digest_spec_crc32_rfc2440): Adjusted for new gcry_md_spec_t
+ structure.
+
+2003-07-19 Moritz Schulte <moritz@g10code.com>
+
+ * md.c (gcry_md_lookup_func_oid): New function.
+ (search_oid): New function, copied from cipher.c.
+ (gcry_md_map_name): Adjust for new search_oid interface.
+
+ * cipher.c (oid_table): Removed table.
+ (gcry_cipher_lookup_func_oid): New function.
+ (search_oid): Rewritten to use the module functions.
+ (gcry_cipher_map_name): Adjust for new search_oid interface.
+ (gcry_cipher_mode_from_oid): Likewise.
+
+2003-07-18 Werner Koch <wk@gnupg.org>
+
+ * md.c (gcry_md_hash_buffer): Convert ERR to gpg_error_t in
+ gpg_strerror.
+
+2003-07-14 Moritz Schulte <moritz@g10code.com>
+
+ * cipher.c (gcry_cipher_lookup_func_name): Also check the cipher
+ name aliases, not just the primary name.
+ (gcry_cipher_map_name): Remove kludge for aliasing Rijndael to
+ AES.
+
+ * arcfour.c, blowfish.c, cast5.c, des.c, twofish.c: Adjust cipher
+ specification structures.
+
+ * rijndael.c (rijndael_names, rijndael192_names,
+ rijndael256_names): New variables, use them in the cipher
+ specifications.
+
+ * rmd160test.c: Removed file.
+
+ * ac.c, arcfour.c, blowfish.c, cast5.c, cipher.c, des.c, dsa.c,
+ elgamal.c, md.c, pubkey.c, random.c, rijndael.c, rsa.c, twofish.c:
+ Used gcry_err* wrappers for libgpg symbols.
+
+ * primegen.c (gen_prime): Correct the order of arguments to
+ extra_check.
+
+2003-07-12 Moritz Schulte <moritz@g10code.com>
+
+ * ac.c: Replaced all public occurrences of gpg_error_t with
+ gcry_error_t.
+ * cipher.c: Likewise.
+ * md.c: Likewise.
+ * pubkey.c: Likewise.
+ * random.c: Likewise.
+
+ * cipher.c: Added support for TWOFISH128.
+
+2003-07-08 Moritz Schulte <moritz@g10code.com>
+
+ * ac.c (gcry_ac_data_copy_internal): New function, based on
+ gcry_ac_data_copy.
+ (gcry_ac_data_copy): Made public, use gcry_ac_data_copy_internal.
+ (gcry_ac_key_init): Use gcry_ac_data_copy_internal.
+
+2003-07-07 Moritz Schulte <moritz@g10code.com>
+
+ * ac.c (gcry_ac_data_set): Only release old MPI value if it is
+ different from the new value. Bug reported by Simon Josefsson
+ <jas@extundo.com>.
+
+ * pubkey.c (gcry_pk_list): New function.
+ * md.c (gcry_md_list): New function.
+
+ * ac.c (gcry_ac_key_pair_generate): Fix calculation of format
+ string size.
+
+2003-07-05 Moritz Schulte <moritz@g10code.com>
+
+ * md.c: Named struct of digest_table `digest_table_entry'.
+ (digest_table_entry): New member: algorithm; filled in.
+ (digest_table_entry): Removed unused member: flags.
+ (gcry_md_register): New argument: algorithm_id, filled in.
+ (gcry_md_register_default): Used algorithm ID from module
+ structure.
+ (gcry_md_map_name): Likewise.
+ (md_enable): Likewise.
+ (md_read): Likewise.
+ (gcry_md_info): Likewise.
+
+ * pubkey.c: Named struct for pubkey_table `pubkey_table_entry'.
+ (pubkey_table_entry): New member: algorithm; filled in.
+ (gcry_pk_register_default): Used algorithm ID from pubkey_table.
+ (gcry_pk_register): New argument: algorithm_id, filled in.
+ (gcry_pk_map_name): Used algorithm ID from module structure.
+ (gcry_pk_decrypt): Likewise.
+ (gcry_pk_encrypt): Likewise.
+ (gcry_pk_verify): Likewise.
+ (gcry_pk_sign): Likewise.
+ (gcry_pk_testkey): Likewise.
+ (gcry_pk_genkey): Likewise.
+ (gcry_pk_get_nbits): Likewise.
+ (sexp_to_key): Removed unused variable: algo.
+ (sexp_to_sig): Likewise.
+
+ * cipher.c: Named struct for cipher_table `cipher_table_entry'.
+ (cipher_table_entry): New member: algorithm; filled in.
+ (gcry_cipher_register_default): Used algorithm ID from
+ cipher_table.
+ (gcry_cipher_register): New argument: algorithm_id, filled in.
+ (gcry_cipher_map_name): Used algorithm ID from module structure.
+
+ * arcfour.c (cipher_spec_arcfour): Removed algorithm ID.
+ * blowfish.c (cipher_spec_blowfish): Likewise.
+ * cast5.c (cipher_spec_cast5): Likewise.
+ * crc.c (digest_spec_crc32): Likewise.
+ * crc.c (digest_spec_crc32_rfc1510): Likewise.
+ * crc.c (digest_spec_crc32_rfc2440): Likewise.
+ * des.c (cipher_spec_des): Likewise.
+ * des.c (cipher_spec_tripledes): Likewise.
+ * dsa.c (pubkey_spec_dsa): Likewise.
+ * elgamal.c (pubkey_spec_elg): Likewise.
+ * md4.c (digest_spec_md4): Likewise.
+ * md5.c (digest_spec_md5): Likewise.
+ * aes.c (cipher_spec_aes): Likewise.
+ * aes.c (cipher_spec_aes192): Likewise.
+ * aes.c (cipher_spec_aes256): Likewise.
+ * rsa.c (pubkey_spec_rsa): Likewise.
+ * sha1.c (digest_spec_sha1): Likewise.
+ * sha256.c (digest_spec_sha256): Likewise.
+ * sha512.c (digest_spec_sha512): Likewise.
+ * tiger.c (digest_spec_tiger): Likewise.
+ * twofish.c (cipher_spec_twofish): Likewise.
+ * twofish.c (cipher_spec_twofish128): Likewise.
+
+ * Makefile.am (EXTRA_libcipher_la_SOURCES): Fix list of source
+ files; reported by Simon Josefsson <jas@extundo.com>.
+
+ * pubkey.c: Replaced all occurrences of `id' with `algorithm',
+ since `id' is a keyword in obj-c.
+ * md.c: Likewise.
+ * cipher.c: Likewise.
+
+ * crc.c, md4.c, md5.c, rmd160.c, sha1.c, sha256.c, tiger.c:
+ Replaced all occurrences of gcry_digest_spec_t with gcry_md_spec_t.
+
+ * dsa.c, rsa.c, elgamal.c: Replaced all occurrences of
+ gcry_pubkey_spec_t with gcry_pk_spec_t.
+
+ * md.c: Replaced all occurrences of gcry_digest_spec_t with
+ gcry_md_spec_t.
+ (gcry_digest_register_default): Renamed to ...
+ (gcry_md_register_default): ... this; adjusted callers.
+ (gcry_digest_lookup_func_name): Renamed to ...
+ (gcry_md_lookup_func_name): ... this; adjusted callers.
+ (gcry_digest_lookup_name): Renamed to ...
+ (gcry_md_lookup_name): ... this; adjusted callers.
+ (gcry_digest_register): Renamed to ...
+ (gcry_md_register): ... this.
+ (gcry_digest_unregister): Renamed to ...
+ (gcry_md_unregister): ... this.
+
+ * pubkey.c (gcry_pubkey_register): Renamed to ...
+ (gcry_pk_register): ... this.
+ (gcry_pubkey_unregister): Renamed to ...
+ (gcry_pk_unregister): ... this.
+ Replaced all occurrences of gcry_pubkey_spec_t with gcry_pk_spec_t.
+ (gcry_pubkey_register_default): Renamed to ...
+ (gcry_pk_register_default): ... this; adjusted callers.
+ (gcry_pubkey_lookup_func_name): Renamed to ...
+ (gcry_pk_lookup_func_name): ... this; adjusted callers.
+ (gcry_pubkey_lookup_name): Renamed to ...
+ (gcry_pk_lookup_name): ... this; adjusted callers.
+
+ * md.c (gcry_md_hash_buffer): Fix error checking. Thanks to Simon
+ Josefsson <jas@extunde.com>.
+
+2003-07-04 Moritz Schulte <moritz@g10code.com>
+
+ * cipher.c (gcry_cipher_list): New function.
+
+2003-07-01 Moritz Schulte <moritz@g10code.com>
+
+ * pubkey.c (sexp_to_sig): Accept a `flags' S-expression to be more
+ consistent with sexp_to_enc.
+
+2003-06-30 Moritz Schulte <moritz@g10code.com>
+
+ * Makefile.am (libcipher_la_SOURCES): Added: ac.c.
+
+ * pubkey.c (_gcry_pk_module_lookup): New function.
+ (_gcry_pk_module_release): New function.
+
+2003-06-29 Moritz Schulte <moritz@g10code.com>
+
+ * ac.c: New file.
+
+2003-06-26 Werner Koch <wk@gnupg.org>
+
+ * md.c (gcry_md_hash_buffer): Trigger BUG correctly with new API.
+
+2003-06-19 Werner Koch <wk@gnupg.org>
+
+ * md.c (gcry_md_is_enabled): Fixed.
+
+2003-06-18 Werner Koch <wk@gnupg.org>
+
+ * cipher.c (gcry_cipher_get_algo_keylen): New.
+ (gcry_cipher_get_algo_blklen): New.
+
+2003-06-18 Moritz Schulte <moritz@g10code.com>
+
+ * arcfour.c, cipher.c, blowfish.c, md.c, cast5.c, pubkey.c, crc.c,
+ des.c, dsa.c, elgamal.c, md4.c, md5.c, random.c, rijndael.c,
+ rmd160.c, rsa.c, sha1.c, sha256.c, sha512.c, tiger.c, twofish.c:
+ Replaced older types GcryDigestSpec, GcryCipherSpec and
+ GcryPubkeySpec with newer types: gcry_digest_spec_t,
+ gcry_cipher_spec_t and gcry_pubkey_spec_t.
+
+ * md.c (gcry_digest_id_new): Removed function.
+ (gcry_digest_register): Removed code for generating a new module
+ ID.
+
+ * pubkey.c (gcry_pubkey_id_new): Removed function.
+ (gcry_pubkey_register): Removed code for generating a new module
+ ID.
+
+ * cipher.c, md.c, pubkey.c: Replace old type GcryModule with newer
+ one: gcry_module_t.
+ (gcry_cipher_id_new): Removed function.
+ (gcry_cipher_register): Removed code for generating a new module
+ ID.
+
+ * cipher.c (gcry_cipher_register): Adjust call to
+ _gcry_module_add.
+ (gcry_cipher_register_default): Likewise.
+ * pubkey.c (gcry_pubkey_register_default): Likewise.
+ (gcry_pubkey_register): Likewise.
+ * md.c (gcry_digest_register_default): Likewise.
+ (gcry_digest_register): Likewise.
+
+ * md.c (gcry_digest_lookup_func_id): Removed function.
+ (gcry_digest_lookup_id): Likewise.
+ (gcry_digest_id_new): Use _gcry_module_lookup_id instead of
+ gcry_digest_lookup_id.
+ (digest_algo_to_string): Likewise.
+ (check_digest_algo): Likewise.
+ (md_enable): Likewise.
+ (md_digest_length): Likewise.
+ (md_asn_oid): Likewise.
+
+ * pubkey.c (gcry_pubkey_lookup_id): Removed function.
+ (gcry_pubkey_lookup_func_id): Likewise.
+ (gcry_pubkey_id_new): Use _gcry_module_lookup_id instead of
+ gcry_pubkey_lookup_id.
+ (gcry_pk_algo_name): Likewise.
+ (disable_pubkey_algo): Likewise.
+ (check_pubkey_algo): Likewise.
+ (pubkey_get_npkey): Likewise.
+ (pubkey_get_nskey): Likewise.
+ (pubkey_get_nsig): Likewise.
+ (pubkey_get_nenc): Likewise.
+ (pubkey_generate): Likewise.
+ (pubkey_check_secret_key): Likewise.
+ (pubkey_encrypt): Likewise.
+ (pubkey_decrypt): Likewise.
+ (pubkey_sign): Likewise.
+ (pubkey_verify): Likewise.
+ (gcry_pk_algo_info): Likewise.
+
+ * cipher.c (gcry_cipher_lookup_func_id): Removed function.
+ (gcry_cipher_lookup_id): Likewise.
+ (cipher_algo_to_string): Use _gcry_module_lookup_id instead of
+ gcry_cipher_lookup_id.
+ (disable_cipher_algo): Likewise.
+ (check_cipher_algo): Likewise.
+ (cipher_get_blocksize): Likewise.
+ (gcry_cipher_open): Likewise.
+ (gcry_cipher_id_new): Likewise.
+
+2003-06-17 Moritz Schulte <moritz@g10code.com>
+
+ * Makefile.am (GCRYPT_MODULES): Set to @GCRYPT_CIPHERS@,
+ @GCRYPT_PUBKEY_CIPHERS@, @GCRYPT_DIGESTS@ and @GCRYPT_RANDOM@.
+ (libcipher_la_DEPENDENCIES): Set to $(GCRYPT_MODULES).
+ (libcipher_la_LIBADD): Likewise.
+ (AM_CFLAGS): Added: @GPG_ERROR_CFLAGS@.
+ (EXTRA_libcipher_la_SOURCES): Added all conditional sources.
+
+ * md.c (md_open): Use _gcry_fast_random_poll instead of
+ fast_random_poll.
+ * cipher.c (gcry_cipher_open): Likewise.
+
+ * random.h (fast_random_poll): Removed macro.
+
+ * blowfish.c, md4.c, md5.c, rmd160.c, sha1.c, sha256.c, sha512.c,
+ tiger.c: Use Autoconf's WORDS_BIGENDIAN instead of our own
+ BIG_ENDIAN_HOST.
+
+2003-06-16 Moritz Schulte <moritz@g10code.com>
+
+ * random.c (getfnc_gather_random): Do not special-case
+ USE_ALL_RANDOM_MODULES; make it the default.
+
+ * dsa.c: Replace last occurrences of old type names with newer
+ names (i.e. replace MPI with gcry_mpi_t).
+ * elgamal.c: Likewise.
+ * primegen.c: Likewise.
+ * pubkey.c: Likewise.
+ * rsa.c: Likewise.
+
+2003-06-14 Moritz Schulte <moritz@g10code.com>
+
+ * des.c (des_setkey): Add selftest check.
+ (tripledes_set3keys): Likewise.
+ (do_tripledes_setkey): Remove selftest check.
+ (do_des_setkey): Likewise.
+
+2003-06-11 Moritz Schulte <moritz@g10code.com>
+
+ * md.c (_gcry_md_init): New function.
+ * cipher.c (_gcry_cipher_init): New function.
+ * pubkey.c (_gcry_pk_init): New function.
+
+2003-06-13 Werner Koch <wk@gnupg.org>
+
+ * md.c (gcry_md_get_algo): Reverted to old API. This is a
+ convenience function anyway and error checking is not appropriate.
+ (gcry_md_is_secure): New.
+ (gcry_md_is_enabled): New.
+
+2003-06-12 Werner Koch <wk@gnupg.org>
+
+ * cipher.c (gcry_cipher_open): Make sure HANDLE is set to NULL on
+ error.
+
+2003-06-11 Werner Koch <wk@gnupg.org>
+
+ * md.c (gcry_md_open): Make sure H receives either NULL or a
+ valid handle.
+ (gcry_md_copy): Swapped arguments so that it is more in line with
+ md_open and most other API functions like memcpy (destination
+ comes first). Make sure HANDLE is set to NULL on error.
+
+ * rijndael.c (do_encrypt): Hack to force correct alignment. It
+ seems not to be sufficient, though. We should rework these
+ functions and remove all these ugly casts. Let the compiler
+ optimize or have an assembler implementation.
+
+2003-06-09 Moritz Schulte <moritz@g10code.com>
+
+ * Makefile.am: Removed rules for serpent, since that is not committed
+ yet.
+
+2003-06-08 Moritz Schulte <moritz@g10code.com>
+
+ * pubkey.c (gcry_pk_encrypt): Improve calculation for size of the
+ format string.
+
+2003-06-07 Moritz Schulte <moritz@g10code.com>
+
+ * arcfour.c, bithelp.h, blowfish.c, cast5.c, cipher.c, crc.c,
+ des.c, dsa.c, elgamal.c, md4.c, md5.c, md.c, primegen.c, pubkey.c,
+ rand-internal.h, random.c, random.h, rijndael.c, rmd160.c,
+ rmd160test.c, rmd.h, rndegd.c, rndlinux.c, rndunix.c, rndw32.c,
+ rsa.c, sha1.c, sha256.c, sha512.c, tiger.c, twofish.c: Edited all
+ preprocessor instructions to remove whitespace before the '#'.
+ This is not required by C89, but there are some compilers out
+ there that don't like it. Replaced any occurrence of the now
+ deprecated type names with the new ones.
+
+2003-06-04 Moritz Schulte <moritz@g10code.com>
+
+ * pubkey.c (gcry_pk_encrypt): Construct an arg_list and use
+ gcry_sexp_build_array instead of gcry_sexp_build.
+ (gcry_pk_sign): Likewise.
+ (gcry_pk_genkey): Likewise.
+
+2003-06-01 Moritz Schulte <moritz@g10code.com>
+
+ * dsa.c (_gcry_dsa_generate): Do not check whether the algorithm ID
+ does indeed belong to DSA.
+ (_gcry_dsa_sign): Likewise.
+ (_gcry_dsa_verify): Likewise.
+ (_gcry_dsa_get_nbits): Likewise.
+
+ * elgamal.c (_gcry_elg_check_secret_key): Do not check whether the
+ algorithm ID does indeed belong to ElGamal.
+ (_gcry_elg_encrypt): Likewise.
+ (_gcry_elg_decrypt): Likewise.
+ (_gcry_elg_sign): Likewise.
+ (_gcry_elg_verify): Likewise.
+ (_gcry_elg_get_nbits): Likewise.
+ (_gcry_elg_generate): Likewise.
+
+ * rsa.c (_gcry_rsa_generate): Do not check whether the algorithm ID
+ does indeed belong to RSA.
+ (_gcry_rsa_encrypt): Likewise.
+ (_gcry_rsa_decrypt): Likewise.
+ (_gcry_rsa_sign): Likewise.
+ (_gcry_rsa_verify): Likewise.
+ (_gcry_rsa_get_nbits): Likewise.
+
+2003-05-30 Moritz Schulte <moritz@g10code.com>
+
+ * md.c (md_get_algo): Return zero in case no algorithm is enabled.
+
+ * md.c (gcry_md_info): Adjusted for new no-errno-API.
+ (md_final): Likewise.
+ (gcry_md_get_algo): Likewise.
+ * pubkey.c (gcry_pk_get_keygrip): Likewise.
+ (gcry_pk_ctl): Likewise.
+ (gcry_pk_algo_info): Likewise.
+ * des.c (selftest): Likewise.
+
+2003-05-29 Moritz Schulte <moritz@g10code.com>
+
+ * md.c (md_enable): Do not forget to release module on error.
+ (gcry_md_open): Adjusted for new no-errno-API.
+ (md_open): Likewise.
+ (md_copy): Likewise.
+ (gcry_md_copy): Likewise.
+ (gcry_md_setkey): Likewise.
+ (gcry_md_algo_info): Likewise.
+
+ * cipher.c (gcry_cipher_open): Adjusted for new no-errno-API and
+ also fixed a locking bug.
+ (gcry_cipher_encrypt): Adjusted for new no-errno-API.
+ (gcry_cipher_decrypt): Likewise.
+ (gcry_cipher_ctl): Likewise.
+ (gcry_cipher_info): Likewise.
+ (gcry_cipher_algo_info): Likewise.
+
+2003-05-28 Moritz Schulte <moritz@g10code.com>
+
+ * md.c (md_enable): Adjusted for libgpg-error.
+ (gcry_md_enable): Likewise.
+ (gcry_digest_register_default): Likewise.
+ (gcry_digest_register): Likewise.
+ (check_digest_algo): Likewise.
+ (prepare_macpads): Likewise.
+ (gcry_md_setkey): Likewise.
+ (gcry_md_ctl): Likewise.
+ (gcry_md_get): Likewise.
+ (gcry_md_algo_info): Likewise.
+ (gcry_md_info): Likewise.
+ * dsa.c (_gcry_dsa_generate): Likewise.
+ (_gcry_dsa_check_secret_key): Likewise.
+ (_gcry_dsa_sign): Likewise.
+ (_gcry_dsa_verify): Likewise.
+ * twofish.c (do_twofish_setkey): Likewise.
+ (twofish_setkey): Likewise.
+ * cipher.c (gcry_cipher_register): Likewise.
+
+2003-05-25 Moritz Schulte <moritz@g10code.com>
+
+ * rijndael.c (do_setkey): Adjusted for libgpg-error.
+ (rijndael_setkey): Likewise.
+ * random.c (gcry_random_add_bytes): Likewise.
+ * elgamal.c (_gcry_elg_generate): Likewise.
+ (_gcry_elg_check_secret_key): Likewise.
+ (_gcry_elg_encrypt): Likewise.
+ (_gcry_elg_decrypt): Likewise.
+ (_gcry_elg_sign): Likewise.
+ (_gcry_elg_verify): Likewise.
+ * rsa.c (_gcry_rsa_generate): Likewise.
+ (_gcry_rsa_check_secret_key): Likewise.
+ (_gcry_rsa_encrypt): Likewise.
+ (_gcry_rsa_decrypt): Likewise.
+ (_gcry_rsa_sign): Likewise.
+ (_gcry_rsa_verify): Likewise.
+ * pubkey.c (dummy_generate, dummy_check_secret_key, dummy_encrypt,
+ dummy_decrypt, dummy_sign, dummy_verify): Likewise.
+ (gcry_pubkey_register): Likewise.
+ (check_pubkey_algo): Likewise.
+ (pubkey_generate): Likewise.
+ (pubkey_check_secret_key): Likewise.
+ (pubkey_encrypt): Likewise.
+ (pubkey_decrypt): Likewise.
+ (pubkey_sign): Likewise.
+ (pubkey_verify): Likewise.
+ (sexp_elements_extract): Likewise.
+ (sexp_to_key): Likewise.
+ (sexp_to_sig): Likewise.
+ (sexp_to_enc): Likewise.
+ (sexp_data_to_mpi): Likewise.
+ (gcry_pk_encrypt): Likewise.
+ (gcry_pk_decrypt): Likewise.
+ (gcry_pk_sign): Likewise.
+ (gcry_pk_verify): Likewise.
+ (gcry_pk_testkey): Likewise.
+ (gcry_pk_genkey): Likewise.
+ (gcry_pk_ctl): Likewise.
+ * cipher.c (dummy_setkey): Likewise.
+ (check_cipher_algo): Likewise.
+ (gcry_cipher_open): Likewise.
+ (cipher_setkey): Likewise.
+ (gcry_cipher_ctl): Likewise.
+ (cipher_encrypt): Likewise.
+ (gcry_cipher_encrypt): Likewise.
+ (cipher_decrypt): Likewise.
+ (gcry_cipher_decrypt): Likewise.
+ (gcry_cipher_info): Likewise.
+ (gcry_cipher_algo_info): Likewise.
+ * cast5.c (cast_setkey): Likewise.
+ (do_cast_setkey): Likewise.
+ * arcfour.c (arcfour_setkey): Likewise.
+ (do_arcfour_setkey): Likewise.
+ * blowfish.c (do_bf_setkey): Likewise.
+ (bf_setkey): Likewise.
+ * des.c (do_des_setkey): Likewise.
+ (do_tripledes_setkey): Likewise.
+
+2003-05-22 Moritz Schulte <moritz@g10code.com>
+
+ * tiger.c: Merged code using the U64_C macro from GnuPG.
+
+ * sha512.c: Likewise.
+
+2003-05-17 Moritz Schulte <moritz@g10code.com>
+
+ * pubkey.c (gcry_pk_genkey): Fix typo: acquire a lock, instead of
+ releasing it.
+
+2003-05-11 Moritz Schulte <moritz@g10code.com>
+
+ * pubkey.c (gcry_pk_testkey): Call REGISTER_DEFAULT_CIPHERS.
+ (gcry_pk_ctl): Likewise.
+
+2003-04-27 Moritz Schulte <moritz@g10code.com>
+
+ * pubkey.c (gcry_pk_genkey): Release sexp after extracted data has
+ been used.
+
+ * md.c (gcry_md_get_algo_dlen): Simplified, simply call
+ md_digest_length to do the job.
+
+ * des.c (do_des_setkey): Check for selftest failure not only
+ during initialization.
+ (do_tripledes_setkey): Include check for selftest failure.
+
+ * pubkey.c (gcry_pubkey_register_default): New macro
+ `pubkey_use_dummy', use it.
+
+ * elgamal.c (elg_names): New variable.
+ (pubkey_spec_elg): Include elg_names.
+
+ * dsa.c (dsa_names): New variable.
+ (pubkey_spec_dsa): Include dsa_names.
+
+ * rsa.c (rsa_names): New variable.
+ (pubkey_spec_rsa): Include rsa_names.
+
+ * pubkey.c (gcry_pubkey_lookup_func_name): Compare name also with
+ the names listed in `sexp_names'.
+
+2003-04-24 Moritz Schulte <moritz@g10code.com>
+
+ * pubkey.c (sexp_to_key): New variables: module, pubkey. Adjusted
+ to new module interface.
+ (sexp_to_key): Changed type of argument `retalgo' from `int *' to
+ `GcryModule **'. Adjusted all callers. Removed argument:
+ r_algotblidx.
+ (sexp_to_sig): Changed type of argument `retalgo' from `int *' to
+ `GcryModule **'. Adjusted all callers.
+ (sexp_to_enc): Likewise.
+
+ (pubkey_get_npkey, pubkey_get_nskey, pubkey_get_nsig,
+ pubkey_get_nenc): Use strlen to find out the number.
+
+ * rsa.c: Adjust pubkey_spec_rsa to new internal interface.
+ * dsa.c: Likewise.
+ * elgamal.c: Likewise.
+
+2003-04-17 Moritz Schulte <moritz@g10code.com>
+
+ * pubkey.c (sexp_elements_extract): New function.
+ * pubkey.c (sexp_to_key): Removed variable `idx', added `err', use
+ sexp_elements_extract.
+ (sexp_to_sig): Likewise.
+ (sexp_to_enc): Likewise.
+
+ * pubkey.c: Terminate list correctly.
+ * md.c: Include sha512/sha384 in digest_table.
+
+2003-04-16 Moritz Schulte <moritz@g10code.com>
+
+ * Makefile.am: Include support for sha512.c.
+
+ * sha512.c: New file, merged from GnuPG, with few modifications
+ for libgcrypt.
+
+ * rand-internal.h: Removed declarations for constructor functions.
+
+ * md.c (md_copy): Call _gcry_module_use for incrementing the usage
+ counter of the digest modules.
+
+ * rsa.c: Do not include "rsa.h".
+ * dsa.c: Do not include "dsa.h".
+ * elgamal.c: Do not include "elgamal.h".
+ * des.c: Do not include "des.h".
+ * cast5.c: Do not include "cast5.h".
+ * blowfish.c: Do not include "blowfish.h".
+ * arcfour.c: Do not include "arcfour.h".
+
+ * Makefile.am (libcipher_la_DEPENDENCIES): Removed.
+ (libcipher_la_LIBADD): Removed.
+ Use Automake conditionals for conditional compilation.
+
+2003-04-13 Moritz Schulte <moritz@g10code.com>
+
+ * cipher.c (gcry_cipher_open): Call REGISTER_DEFAULT_CIPHERS.
+
+ * md.c (gcry_md_list): New member: module.
+ (md_enable): New variable: module, changed use of module and
+ digest.
+ (md_enable): Initialize member: module.
+ (md_close): Call _gcry_module_release.
+
+ * cipher.c (gcry_cipher_open): New variable: module, changed use of
+ module and cipher.
+ (struct gcry_cipher_handle): New member: module.
+ (gcry_cipher_open): Initialize member: module.
+ (gcry_cipher_close): Call _gcry_module_release.
+
+2003-04-09 Moritz Schulte <moritz@g10code.com>
+
+ * cipher.c: Include "ath.h".
+ * md.c: Likewise.
+ * pubkey.c: Likewise.
+
+ * cipher.c (ciphers_registered_lock): New variable.
+ * md.c (digests_registered_lock): New variable.
+ * pubkey.c (pubkeys_registered_lock): New variable.
+
+ * rndlinux.c (gnupgext_version, func_table): Removed definitions.
+ (gnupgext_enum_func): Removed function.
+ (_gcry_rndlinux_constructor): Removed function.
+
+ * rndegd.c (gnupgext_version, func_table): Removed definitions.
+ (gnupgext_enum_func): Removed function.
+ (_gcry_rndegd_constructor): Removed function.
+
+ * rndunix.c (gnupgext_version, func_table): Removed definitions.
+ (gnupgext_enum_func): Removed function.
+ (_gcry_rndunix_constructor): Removed function.
+
+ * rndw32.c (gnupgext_version, func_table): Removed definitions.
+ (gnupgext_enum_func): Removed function.
+ (_gcry_rndw32_constructor): Removed function.
+
+ * rndegd.c (rndegd_connect_socket): Simplify code for creating the
+ egd socket address.
+ (rndegd_connect_socket): Use log_fatal instead of
+ g10_log_fatal.
+ (egd_gather_random): Renamed to ...
+ (rndegd_gather_random): ... here.
+
+2003-04-08 Moritz Schulte <moritz@g10code.com>
+
+ * rndlinux.c: Do not include "dynload.h".
+ * rndunix.c: Likewise.
+ * rndw32.c: Likewise.
+
+ * rndegd.c (rndegd_connect_socket): Factored out from ...
+ (egd_gather_random): here; call it.
+ (egd_socket): New variable.
+ (egd_gather_random): Initialize fd with egd_socket, do not declare
+ fd static.
+ (do_read): Merged a few changes from GnuPG. FIXME - not finished?
+ Do not include "dynload.h".
+
+ * rndw32.c (gather_random): Renamed to rndw32_gather_random, do
+ not declare static.
+ (gather_random_fast): Renamed to rndw32_gather_random_fast, do not
+ declare static.
+
+ * rndunix.c (gather_random): Renamed to rndunix_gather_random, do
+ not declare static.
+ * rndegd.c (gather_random): Renamed to rndegd_gather_random, do
+ not declare static.
+ * rndlinux.c (gather_random): Renamed to rndlinux_gather_random,
+ do not declare static.
+
+2003-04-07 Moritz Schulte <moritz@g10code.com>
+
+ * Makefile.am (libcipher_la_SOURCES): Removed construct.c.
+ (libcipher_la_SOURCES): Added sha1.c, sha256.c, rmd160.c, md4.c,
+ md5.c, tiger.c and crc.c
+ (EXTRA_PROGRAMS): Removed sha1, sha256, rmd160, md4, md5, tiger
+ and crc. Removed definitions: EXTRA_md4_SOURCES,
+ EXTRA_md5_SOURCES, EXTRA_rmd160_SOURCES, EXTRA_sha1_SOURCES,
+ EXTRA_sha256_SOURCES, EXTRA_tiger_SOURCES and EXTRA_crc_SOURCES,
+ BUILT_SOURCES, DISTCLEANFILES.
+
+ * pubkey.c: Do not include "elgamal.h", "dsa.h" and "rsa.h".
+
+ * Makefile.am (libcipher_la_SOURCES): Removed rsa.h, elgamal.h,
+ dsa.h, des.h, cast5.h, arcfour.h and blowfish.h.
+
+ * rsa.h: Removed file.
+ * elgamal.h: Removed file.
+ * dsa.h: Removed file.
+ * des.h: Removed file.
+ * cast5.h: Removed file.
+ * arcfour.h: Removed file.
+ * blowfish.h: Removed file.
+
+ * Makefile.am (libcipher_la_SOURCES): Removed dynload.c and
+ dynload.h.
+
+ * rsa.c (pubkey_spec_rsa): New variable.
+ * dsa.c (pubkey_spec_dsa): New variable.
+ * elgamal.c (pubkey_spec_elg): New variable.
+
+ * rsa.c (_gcry_rsa_get_info): Removed function.
+ * elgamal.c (_gcry_elg_get_info): Removed function.
+ * dsa.c (_gcry_dsa_get_info): Removed function.
+
+ * tiger.c (tiger_get_info): Removed function.
+ (gnupgext_version, func_table): Removed definitions.
+ (gnupgext_enum_func): Removed function.
+ (_gcry_tiger_constructor): Removed function.
+
+ * sha1.c (sha1_get_info): Removed function.
+ (gnupgext_version, func_table): Removed definitions.
+ (gnupgext_enum_func): Removed function.
+ (_gcry_sha1_constructor): Removed function.
+
+ * sha256.c (sha256_get_info): Removed function.
+ (gnupgext_version, func_table): Removed definitions.
+ (gnupgext_enum_func): Removed function.
+ (_gcry_sha256_constructor): Removed function.
+
+ * rmd160.c (rmd160_get_info): Removed function.
+ (gnupgext_version, func_table): Removed definitions.
+ (gnupgext_enum_func): Removed function.
+ (_gcry_rmd160_constructor): Removed function.
+
+ * md5.c (md5_get_info): Removed function.
+ (gnupgext_version, func_table): Removed definitions.
+ (gnupgext_enum_func): Removed function.
+ (_gcry_md5_constructor): Removed function.
+
+ * md4.c (md4_get_info): Removed function.
+ (gnupgext_version, func_table): Removed definitions.
+ (gnupgext_enum_func): Removed function.
+ (_gcry_md4_constructor): Removed function.
+
+ * crc.c (crc_get_info): Removed function.
+
+ * arcfour.c (do_arcfour_setkey): Changed type of context argument
+ to `void *', added local variable for cast, adjusted callers.
+ (arcfour_setkey): Likewise.
+ (encrypt_stream): Likewise.
+ * cast5.c (cast_setkey): Likewise.
+ (encrypt_block): Likewise.
+ * rijndael.c (rijndael_setkey): Likewise.
+ (rijndael_encrypt): Likewise.
+ (rijndael_decrypt): Likewise.
+ * twofish.c (twofish_setkey): Likewise.
+ (twofish_encrypt): Likewise.
+ (twofish_decrypt): Likewise.
+ * des.c (do_des_setkey): Likewise.
+ (do_des_encrypt): Likewise.
+ (do_des_decrypt): Likewise.
+ (do_tripledes_encrypt): Likewise.
+ (do_tripledes_decrypt): Likewise.
+ * blowfish.c (bf_setkey): Likewise.
+ (encrypt_block): Likewise.
+ (decrypt_block): Likewise.
+
+ * arcfour.c (encrypt_stream): Likewise.
+
+ * rijndael.c (gnupgext_version, func_table): Removed definitions.
+ (gnupgext_enum_func): Removed function.
+
+ * twofish.c (gnupgext_version, func_table): Removed definitions.
+ (gnupgext_enum_func): Removed function.
+
+ * cast5.c (CIPHER_ALGO_CAST5): Removed.
+
+ * blowfish.c (FNCCAST_SETKEY, FNCCAST_CRYPT): Removed macros.
+ (CIPHER_ALGO_BLOWFISH): Removed symbol.
+ * cast5.c (FNCCAST_SETKEY, FNCCAST_CRYPT): Likewise.
+ * des.c (selftest_failed): Removed.
+ (initialized): New variable.
+ (do_des_setkey): Run selftest, if not yet done.
+ (FNCCAST_SETKEY, FNCCAST_CRYPT): Removed macros.
+
+ * arcfour.c (_gcry_arcfour_get_info): Removed function.
+ * blowfish.c (_gcry_blowfish_get_info): Removed function.
+ * cast5.c (_gcry_cast5_get_info): Removed function.
+ * des.c (_gcry_des_get_info): Removed function.
+ * rijndael.c (_gcry_rijndael_get_info): Removed function.
+ * twofish.c (_gcry_twofish_get_info): Removed function.
+
+ * arcfour.c (cipher_spec_arcfour): New variable.
+ * twofish.c (cipher_spec_twofish, cipher_spec_twofish128): New
+ variables.
+ * rijndael.c (cipher_spec_aes, cipher_spec_aes192,
+ cipher_spec_aes256): New variables.
+ * des.c (cipher_spec_des, cipher_spec_tripledes): New variables.
+ * cast5.c (cipher_spec_cast5): New variable.
+ * blowfish.c (cipher_spec_blowfish): Likewise.
+
+ * twofish.c: Do not include "dynload.h".
+ * rijndael.c: Likewise.
+ * des.c: Likewise.
+ * cast5.c: Likewise.
+ * blowfish.c: Likewise.
+ * cipher.c: Likewise.
+ * crc.c: Likewise.
+ * md4.c: Likewise.
+ * md5.c: Likewise.
+ * md.c: Likewise.
+ * pubkey.c: Likewise.
+ * rijndael.c: Likewise.
+ * sha1.c: Likewise.
+ * sha256.c: Likewise.
+
+ * arcfour.c: Include "cipher.h".
+ * twofish.c: Likewise.
+ * rijndael.c: Likewise.
+ * des.c: Likewise.
+ * cast5.c: Likewise.
+ * blowfish.c: Likewise.
+
+ * twofish.c (twofish_setkey): Declared argument `key' const.
+ (twofish_encrypt): Declared argument `inbuf' const.
+ (twofish_decrypt): Likewise.
+
+ * rijndael.c (rijndael_setkey): Declared argument `key' const.
+ (rijndael_encrypt): Declared argument `inbuf' const.
+ (rijndael_decrypt): Likewise.
+
+ * des.c (do_des_setkey): Declared argument `key' const.
+ (do_tripledes_setkey): Likewise.
+ (do_des_encrypt): Declared argument `inbuf' const.
+ (do_des_decrypt): Likewise.
+ (do_tripledes_encrypt): Likewise.
+ (do_tripledes_decrypt): Likewise.
+
+ * cast5.c (encrypt_block): Declared argument `inbuf' const.
+ (decrypt_block): Likewise.
+ (cast_setkey): Declared argument `key' const.
+
+ * blowfish.c (do_bf_setkey): Declared argument `key' const.
+ (encrypt_block): Declared argument `inbuf' const.
+ (decrypt_block): Likewise.
+
+ * cipher.c: Remove CIPHER_ALGO_DUMMY related code.
+ Removed struct cipher_table_s.
+ Changed definition of cipher_table.
+ Removed definition of disabled_algos.
+ (ciphers_registered, default_ciphers_registered): New variables.
+ (REGISTER_DEFAULT_CIPHERS): New macro.
+ (dummy_setkey): Declared argument `key' const.
+ (dummy_encrypt_block): Declared argument `inbuf' const.
+ (dummy_decrypt_block): Likewise.
+ (dummy_encrypt_stream): Likewise.
+ (dummy_decrypt_stream): Likewise.
+ (dummy_setkey): Use `unsigned char' instead of `byte'.
+ (dummy_encrypt_block): Likewise.
+ (dummy_decrypt_block): Likewise.
+ (dummy_encrypt_stream): Likewise.
+ (dummy_decrypt_stream): Likewise.
+ (gcry_cipher_register_default): New function.
+ (gcry_cipher_lookup_func_id): New function.
+ (gcry_cipher_lookup_func_name): New function.
+ (gcry_cipher_lookup_id): New function.
+ (gcry_cipher_lookup_name): New function.
+ (gcry_cipher_id_new): New function.
+ (gcry_cipher_register): New function.
+ (gcry_cipher_unregister): New function.
+ (setup_cipher_table): Removed function.
+ (load_cipher_modules): Removed function.
+ (gcry_cipher_map_name): Adjusted to use new module management.
+ (cipher_algo_to_string): Likewise.
+ (disable_cipher_algo): Likewise.
+ (check_cipher_algo): Likewise.
+ (cipher_get_keylen): Likewise.
+ (cipher_get_blocksize): Likewise.
+ (gcry_cipher_open): Likewise.
+ (struct gcry_cipher_handle): Replaced members algo, algo_index,
+ blocksize, setkey, encrypt, decrypt, stencrypt, stdecrypt with one
+ member: cipher.
+ (gcry_cipher_open): Adjusted code for new handle structure.
+ (cipher_setkey): Likewise.
+ (cipher_setiv): Likewise.
+ (cipher_reset): Likewise.
+ (do_ecb_encrypt): Likewise.
+ (do_ecb_decrypt): Likewise.
+ (do_cbc_encrypt): Likewise.
+ (do_cbc_decrypt): Likewise.
+ (do_cfb_encrypt): Likewise.
+ (do_cfb_decrypt): Likewise.
+ (do_ctr_encrypt): Likewise.
+ (cipher_encrypt): Likewise.
+ (gcry_cipher_encrypt): Likewise.
+ (cipher_decrypt): Likewise.
+ (gcry_cipher_decrypt): Likewise.
+ (cipher_sync): Likewise.
+ (gcry_cipher_ctl): Likewise.
+
+ * pubkey.c: Removed struct pubkey_table_s.
+ Changed definition of pubkey_table.
+ Removed definition of disabled_algos.
+ (pubkeys_registered, default_pubkeys_registered): New variables.
+ (REGISTER_DEFAULT_PUBKEYS): New macro.
+ (setup_pubkey_table): Removed function.
+ (load_pubkey_modules): Removed function.
+ (gcry_pubkey_register_default): New function.
+ (gcry_pubkey_lookup_func_id): New function.
+ (gcry_pubkey_lookup_func_name): New function.
+ (gcry_pubkey_lookup_id): New function.
+ (gcry_pubkey_lookup_name): New function.
+ (gcry_pubkey_id_new): New function.
+ (gcry_pubkey_register): New function.
+ (gcry_pubkey_unregister): New function.
+ (gcry_pk_map_name): Adjusted to use new module management.
+ (gcry_pk_algo_name): Likewise.
+ (disable_pubkey_algo): Likewise.
+ (check_pubkey_algo): Likewise.
+ (pubkey_get_npkey): Likewise.
+ (pubkey_get_nskey): Likewise.
+ (pubkey_get_nsig): Likewise.
+ (pubkey_get_nenc): Likewise.
+ (pubkey_generate): Likewise.
+ (pubkey_check_secret_key): Likewise.
+ (pubkey_encrypt): Likewise.
+ (pubkey_decrypt): Likewise.
+ (pubkey_sign): Likewise.
+ (pubkey_verify): Likewise.
+ (gcry_pk_get_nbits): Likewise.
+ (gcry_pk_algo_info): Likewise.
+
+ * md.c: Removed struct md_digest_list_s.
+ (digest_list): Changed definition.
+ (digests_registered, default_digests_registered): New variables.
+ (REGISTER_DEFAULT_DIGESTS): New macro.
+ (new_list_item): Removed function.
+ (setup_md_table): Removed function.
+ (load_digest_module): Removed function.
+ (gcry_digest_register_default): New function.
+ (gcry_digest_lookup_func_id): New function.
+ (gcry_digest_lookup_func_name): New function.
+ (gcry_digest_lookup_id): New function.
+ (gcry_digest_lookup_name): New function.
+ (gcry_digest_id_new): New function.
+ (gcry_digest_register): New function.
+ (gcry_digest_unregister): New function.
+ (GcryDigestEntry): New type.
+ (struct gcry_md_context): Adjusted type of `list'.
+ (gcry_md_map_name): Adjusted to use new module management.
+ (digest_algo_to_string): Likewise.
+ (check_digest_algo): Likewise.
+ (md_enable): Likewise.
+ (md_digest_length): Likewise.
+ (md_asn_oid): Likewise.
+
+2003-04-07 Moritz Schulte <moritz@g10code.com>
+
+ * pubkey.c: Replaced PUBKEY_ALGO_DSA with GCRY_PK_DSA,
+ PUBKEY_ALGO_RSA with GCRY_PK_RSA and PUBKEY_ALGO_ELGAMAL with
+ GCRY_PK_ELG.
+
+ * dsa.c: Replaced PUBKEY_ALGO_DSA with GCRY_PK_DSA.
+
+2003-04-01 Moritz Schulte <moritz@g10code.com>
+
+ * des.c: Removed checks for GCRY_CIPHER_3DES and GCRY_CIPHER_DES.
+
+2003-03-31 Moritz Schulte <moritz@g10code.com>
+
+ * tiger.c (tiger_get_info): Do not declare static.
+ * sha256.c (sha256_get_info): Likewise.
+ * sha1.c (sha1_get_info): Likewise.
+ * rmd160.c (rmd160_get_info): Likewise.
+ * md5.c (md5_get_info): Likewise.
+ * md4.c (md4_get_info): Likewise.
+ * crc.c (crc_get_info): Likewise.
+
+ * md.c (load_digest_module): Call setup_md_table during
+ initialization.
+ (new_list_item): Link new element into digest_list.
+
+ * cipher.c (do_ctr_decrypt): Made do_ctr_decrypt act as a wrapper
+ for do_ctr_encrypt, since these functions are identical.
+
+2003-03-30 Simon Josefsson <jas@extundo.com>
+
+ * cipher.c (struct gcry_cipher_handle): Add counter field.
+ (gcry_cipher_open): Add CTR.
+ (cipher_reset): Clear counter field.
+ (do_ctr_encrypt, do_ctr_decrypt): New functions.
+ (cipher_encrypt, cipher_decrypt): Call CTR functions.
+ (gcry_cipher_ctl): Add SET_CTR to set counter.
+
+2003-03-30 Moritz Schulte <moritz@g10code.com>
+
+ * rsa.c (_gcry_rsa_blind): New function.
+ (_gcry_rsa_unblind): New function.
+ (_gcry_rsa_decrypt): Use _gcry_rsa_blind and _gcry_rsa_unblind.
+
+2003-03-26 Moritz Schulte <moritz@g10code.com>
+
+ * dynload.c (_gcry_enum_gnupgext_pubkeys): Adjust `encrypt' and
+ `decrypt' function arguments.
+ (_gcry_enum_gnupgext_pubkeys): Likewise.
+ * dynload.h: Likewise.
+
+ * pubkey.c (dummy_decrypt): Add argument: int flags.
+ (dummy_encrypt): Likewise.
+
+ * elgamal.c (_gcry_elg_encrypt): Add argument: int flags.
+ (_gcry_elg_decrypt): Likewise.
+
+ * rsa.c (_gcry_rsa_encrypt): Add argument: int flags.
+ (_gcry_rsa_decrypt): Likewise.
+
+ * pubkey.c: Add `flags' argument to members `encrypt' and
+ `decrypt' of struct `pubkey_table_s'.
+
+ * rsa.h: Add `flags' argument to function declarations.
+ * elgamal.h: Likewise.
+
+ * pubkey.c (sexp_data_to_mpi): New variable: int parsed_flags.
+ (sexp_data_to_mpi): Set `parsed_flags'.
+ (sexp_data_to_mpi): New argument: int *flags.
+ (gcry_pk_encrypt): New variable: int flags.
+ (gcry_pk_encrypt): Pass `flags' to pubkey_encrypt.
+ (pubkey_encrypt): New variable: int flags.
+ (pubkey_encrypt): Pass `flags' to pubkey encrypt function.
+ (pubkey_decrypt): Likewise.
+ (pubkey_decrypt): Pass `flags' to pubkey encrypt function.
+ (gcry_pk_encrypt): Include `flags' s-exp in return list.
+ (sexp_to_enc): New argument: int *flags.
+ (gcry_pk_decrypt): New variable: int flags.
+ (gcry_pk_decrypt): Pass `flags' to pubkey_decrypt.
+ (sexp_to_enc): New variable: int parsed_flags.
+ (sexp_to_enc): Set `parsed_flags'.
+
+2003-03-22 Simon Josefsson <jas@extundo.com>
+
+ * cipher.c (gcry_cipher_open, do_cbc_encrypt)
+ (gcry_cipher_encrypt): Support GCRY_CIPHER_CBC_MAC.
+ (gcry_cipher_ctl): Support GCRYCTL_SET_CBC_MAC.
+
+2003-03-19 Werner Koch <wk@gnupg.org>
+
+ * primegen.c (gen_prime): New args EXTRA_CHECK and EXTRA_CHECK_ARG
+ to allow for a user callback. Changed all callers.
+ (_gcry_generate_secret_prime)
+ (_gcry_generate_public_prime): Ditto, pass them to gen_prime.
+ * rsa.c (check_exponent): New.
+ (generate): Use a callback to ensure that a given exponent is
+ actually generated.
+
+2003-03-12 Moritz Schulte <moritz@g10code.com>
+
+ * primegen.c: Initialize `no_of_small_prime_numbers' statically.
+ (gen_prime): Remove calculation of `no_of_small_prime_numbers'.
+
+2003-03-03 Moritz Schulte <moritz@g10code.com>
+
+ * md.c (gcry_md_ctl): Rewritten to use the same style as the other
+ function dispatchers.
+
+2003-03-02 Moritz Schulte <moritz@g10code.com>
+
+ * cipher.c (struct gcry_cipher_handle): New member: algo_index.
+ (gcry_cipher_open): Allocate memory for two cipher contexts.
+ Initialize algo_index.
+ (cipher_setkey): Duplicate context into reserved memory.
+ (cipher_reset): New function, which resets the context and clears
+ the IV.
+ (gcry_cipher_ctl): Call cipher_reset.
+
+2003-02-23 Moritz Schulte <moritz@g10code.com>
+
+ * cipher.c: Remove (bogus) `digitp' macro definition.
+ * md.c: Likewise.
+
+ * blowfish.c (burn_stack): Removed.
+ * arcfour.c (burn_stack): Likewise.
+ * cast5.c (burn_stack): Likewise.
+ * des.c (burn_stack): Likewise.
+ * md4.c (burn_stack): Likewise.
+ * md5.c (burn_stack): Likewise.
+ * random.c (burn_stack): Likewise.
+ * rijndael.c (burn_stack): Likewise.
+ * rmd160.c (burn_stack): Likewise.
+ * sha1.c (burn_stack): Likewise.
+ * sha256.c (burn_stack): Likewise.
+ * tiger.c (burn_stack): Likewise.
+ * twofish.c (burn_stack): Likewise.
+
+ * blowfish.c: Changed all occurrences of burn_stack to
+ _gcry_burn_stack.
+ * arcfour.c: Likewise.
+ * cast5.c: Likewise.
+ * des.c: Likewise.
+ * md4.c: Likewise.
+ * md5.c: Likewise.
+ * random.c: Likewise.
+ * rijndael.c: Likewise.
+ * rmd160.c: Likewise.
+ * sha1.c: Likewise.
+ * sha256.c: Likewise.
+ * tiger.c: Likewise.
+ * twofish.c: Likewise.
+
+ * arcfour.c (_gcry_arcfour_get_info): Use GCRY_CIPHER_ARCFOUR
+ instead of hard-coded value `301'.
+
+2003-01-24 Werner Koch <wk@gnupg.org>
+
+ * random.c (_gcry_register_random_progress): New.
+ (_gcry_random_progress): New.
+
+ * rndlinux.c (gather_random): Call the random progress function.
+
+2003-01-23 Werner Koch <wk@gnupg.org>
+
+ * rsa.c (generate): New arg USE_E to request a specific public
+ exponent.
+ (_gcry_rsa_generate): Ditto.
+ * elgamal.c (_gcry_elg_generate): Must add a dummy argument
+ instead of USE_E.
+ * dsa.c (_gcry_dsa_generate): Ditto.
+ * pubkey.c (dummy_generate): Ditto.
+ (pubkey_generate): Add USE_E arg and pass it down.
+ (gcry_pk_genkey): Detect "rsa-use-e" parameter and pass it to generate.
+
+ * pubkey.c (sexp_to_enc): New arg RET_MODERN.
+ (gcry_pk_decrypt): Make use of it to return a real S-expression.
+ Return better error codes.
+ (gcry_pk_verify): Return better error codes.
+
+2003-01-21 Werner Koch <wk@gnupg.org>
+
+ * random.c (gcry_random_add_bytes): Add QUALITY argument, let
+ function return an error code and disable its core for now.
+
+2003-01-21 Timo Schulz <twoaday@freakmail.de>
+
+ * random.c (gcry_random_add_bytes): New. Function to add external
+ random to the pool.
+
+2003-01-20 Simon Josefsson <jas@extundo.com>
+
+ * crc.c: New.
+ * Makefile.am (EXTRA_PROGRAMS, EXTRA_crc_SOURCES): Add crc.c.
+ * md.c (gcry_md_get_algo_dlen): Add values for CRC.
+
+2003-01-20 Werner Koch <wk@gnupg.org>
+
+ * sha256.c: New.
+ * bithelp.h (ror): New.
+ * Makefile.am: Add sha256.c.
+ * md.c (oid_table): Add values for SHA256 et al.
+ (gcry_md_get_algo_dlen): Likewise.
+
+2003-01-20 Werner Koch <wk@gnupg.org>
+
+ * pubkey.c (gcry_pk_get_keygrip): Implemented keygrips for DSA
+ and ElGamal.
+
+2003-01-17 Werner Koch <wk@gnupg.org>
+
+ * cipher.c (gcry_cipher_encrypt): Reworked so that the output will
+ never contain the plaintext even if the caller did not check the
+ return value.
+
+ * md.c (gcry_md_get_algo): Changed error code to GCRYERR_GENERAL
+ because the problem is not an invalid md algo but that no
+ algorithm is enabled.
+
+ * pubkey.c (gcry_pk_genkey): Changed error code for bounds check
+ of table parameters to GCRYERR_INTERNAL.
+
+ * md.c (gcry_md_open): Partly reverted Timo's change from
+ 2002-10-10 by removing the check for the algorithm. An algorithm
+ of 0 is allowed and anyway we should not double check it or check
+ it using a different function. Also fixed the flags check.
+
+ * pubkey.c (gcry_pk_encrypt): Make sure that R_CIPH points to NULL
+ on error.
+ (gcry_pk_decrypt): Ditto for R_PLAIN.
+ (gcry_pk_sign): Ditto for R_SIG.
+ (gcry_pk_genkey): Ditto for R_KEY.
+
+2003-01-16 Werner Koch <wk@gnupg.org>
+
+ * md.c (gcry_md_write): Changed 2nd argument type to void*.
+ (gcry_md_hash_buffer): Changed type of both buffers to void*.
+ (gcry_md_setkey): Changed 2nd argument type to void*.
+
+2003-01-15 Werner Koch <wk@gnupg.org>
+
+ * pubkey.c (sexp_data_to_mpi): New. This handles pkcs1 padding.
+ (gcry_pk_sign, gcry_pk_verify): Use it here.
+ (gcry_pk_encrypt): And here.
+ (pubkey_verify): Add debug code.
+ (sexp_to_enc): Handle flags in the input and return the pkcs1 flag
+ in a new parameter.
+ (gcry_pk_decrypt): Prepare for future pkcs1 handling.
+
+2002-12-19 Werner Koch <wk@gnupg.org>
+
+ * random.c (_gcry_random_initialize): New.
+
+2002-12-16 Werner Koch <wk@gnupg.org>
+
+ * cipher.c: Added a Teletrust specific OID for 3DES.
+
+2002-12-12 Werner Koch <wk@gnupg.org>
+
+ * md.c: Added another oddball OIW OID (sha-1WithRSAEncryption).
+
+2002-11-23 Werner Koch <wk@gnupg.org>
+
+ * md.c (load_digest_module): Enlarged checked_algos bitmap.
+ * md4.c (func_table): Fixed entry for md4.
+ Both by Simon Josefsson.
+ (transform): Copy data to get the alignment straight. Tested only
+ on i386.
+
+2002-11-10 Simon Josefsson <jas@extundo.com>
+
+ * cipher.c (gcry_cipher_open): Don't reject CTS flag.
+ (do_cbc_encrypt, do_cbc_decrypt, cipher_encrypt)
+ (gcry_cipher_encrypt, cipher_decrypt)
+ (gcry_cipher_decrypt): Support CTS flag.
+ (gcry_cipher_ctl): Toggle CTS flag.
+
+2002-11-10 Werner Koch <wk@gnupg.org>
+
+ * md4.c: New. By Simon Josefsson.
+ * Makefile.am (EXTRA_PROGRAMS): Add md4.c.
+ * md.c (oid_table,gcry_md_get_algo_dlen): MD4 support.
+
+2002-10-14 Werner Koch <wk@gnupg.org>
+
+ * arcfour.c (do_encrypt_stream): Don't use increment op when
+ assigning to the same variable.
+
+2002-10-10 Timo Schulz <ts@winpt.org>
+
+ * pubkey.c (gcry_pk_genkey): Check boundaries.
+
+ * md.c (gcry_md_open): Check that algo is available and only
+ valid flag values are used.
+ (gcry_md_get_algo): Add error handling.
+
+2002-09-26 Werner Koch <wk@gnupg.org>
+
+ * md.c: Include an OID for TIGER.
+ * tiger.c (tiger_get_info): Use a regular OID.
+
+2002-09-17 Werner Koch <wk@gnupg.org>
+
+ * random.c: Replaced mutex.h by the new ath.h. Changed all calls.
+
+2002-09-16 Werner Koch <wk@gnupg.org>
+
+ * arcfour.c (do_encrypt_stream): Use register modifier and modulo.
+ According to Nikos Mavroyanopoulos this increases performance on
+ i386 systems noticeably. And I always thought gcc was clever enough.
+ * md5.c (transform): Use register modifier.
+ * rmd160.c (transform): Ditto.
+ * sha1.c (transform): Ditto. We hope that there are 6 free registers.
+ * random.c (gcry_randomize): Rewrote to avoid malloc calls.
+
+ * rndlinux.c (gather_random): Replaced remaining fprintfs by log_*.
+ * arcfour.c (do_arcfour_setkey): Ditto.
+ * twofish.c (do_twofish_setkey): Ditto.
+ * rndegd.c (gather_random): Ditto.
+ * rijndael.c (do_setkey): Ditto.
+ * random.c (_gcry_random_dump_stats): Ditto.
+ * primegen.c (_gcry_generate_elg_prime): Ditto.
+ * des.c (_gcry_des_get_info): Ditto.
+ * cast5.c (do_cast_setkey): Ditto.
+ * blowfish.c (do_bf_setkey): Ditto.
+
+2002-08-26 Werner Koch <wk@gnupg.org>
+
+ * des.c (weak_keys): Fixed one entry in the table and compared
+ all entries against the literature.
+ (selftest): Checksum the weak key table.
+
+2002-08-21 Werner Koch <wk@gnupg.org>
+
+ * pubkey.c: Enable keygrip calculation for "openpgp-rsa".
+
+2002-08-17 Werner Koch <wk@gnupg.org>
+
+ * cipher.c (setup_cipher_table): Don't overwrite the DES entry
+ with the entry for DUMMY.
+
+2002-08-14 Werner Koch <wk@gnupg.org>
+
+ * des.c (do_des_setkey,do_des_encrypt, do_des_decrypt): New.
+ (_gcry_des_get_info): Support plain old DES.
+ * cipher.c (setup_cipher_table): Put DES into the table.
+
+2002-07-25 Werner Koch <wk@gnupg.org>
+
+ * rndunix.c (_gcry_rndunix_constructor): Prefixed with _gcry_.
+ Noted by Stephan Austermuehle.
+
+2002-07-08 Timo Schulz <ts@winpt.org>
+
+ * rndw32.c: Replaced the m_ memory functions with the real
+ gcry_ functions. Renamed all g10_ prefixed functions to log_.
+
+2002-06-12 Werner Koch <wk@gnupg.org>
+
+ * rsa.c (generate): Use e = 65537 for now.
+
+2002-06-11 Werner Koch <wk@gnupg.org>
+
+ * pubkey.c (gcry_pk_get_keygrip): Allow a "protected-private-key".
+
+2002-06-05 Timo Schulz <ts@winpt.org>
+
+ * cipher.c (gcry_cipher_encrypt, gcry_cipher_decrypt):
+ Check that the input size is a multiple of the blocksize.
+
+2002-05-23 Werner Koch <wk@gnupg.org>
+
+ * md.c (oid_table): Add an rsadsi OID for MD5.
+
+2002-05-21 Werner Koch <wk@gnupg.org>
+
+ * primegen.c, elgamal.c, dsa.c (progress): Do not print anything
+ by default. Pass an extra identifying string to the callback and
+ reserved 2 arguments for current and total counters. Changed the
+ register function prototype.
+
+2002-05-17 Werner Koch <wk@gnupg.org>
+
+ * rndegd.c (rndegd_constructor): Fixed name of register function
+ and prefixed the function name with _gcry_.
+ * rndw32.c (rndw32_constructor): Ditto.
+ * tiger.c (tiger_constructor): Ditto.
+
+ * Makefile.am: Removed all dynamic loading stuff.
+ * dynload.c: Ditto. Now only used for the constructor system.
+
+2002-05-15 Werner Koch <wk@gnupg.org>
+
+ * random.c (gcry_random_bytes,gcry_random_bytes_secure)
+ (gcry_randomize): Make sure we are initialized.
+
+2002-05-14 Werner Koch <wk@gnupg.org>
+
+ Changed license of most files to the LGPL.
+
+2002-05-02 Werner Koch <wk@gnupg.org>
+
+ * random.c (_gcry_fast_random_poll): Initialize the module so the
+ mutex can be used.
+
+ * primegen.c (small_prime_numbers): Moved table from smallprime.c
+ * smallprime.c: File removed.
+
+ * des.c (leftkey_swap, rightkey_swap, working_memcmp): Made static.
+
+ * cipher.c (gcry_cipher_map_name): Map "RIJNDAEL" to "AES".
+ * rijndael.c (rijndael_get_info): We only support a 128 bit
+ blocksize, so it makes sense to change the algorithm strings to
+ AES.
+
+ * tiger.c (tiger_final): Removed superfluous token pasting operators.
+ * md5.c (md5_final): Ditto.
+
+2002-04-30 Werner Koch <wk@gnupg.org>
+
+ * cipher.c: Fixed list of copyright years.
+
+2002-03-18 Werner Koch <wk@gnupg.org>
+
+ * random.c (initialize): Initialize the new pool lock mutex.
+ (_gcry_fast_random_poll): Add locking and moved main
+ code out to...
+ (do_fast_random_poll): new function.
+ (read_pool): Use the new function here.
+ (get_random_bytes): Add locking.
+ (_gcry_update_random_seed_file): Ditto.
+
+2002-03-11 Werner Koch <wk@gnupg.org>
+
+ * md.c: Add rsaSignatureWithripemd160 to OID table.
+
+2002-02-20 Werner Koch <wk@gnupg.org>
+
+ * sha1.c: Removed a left over comment note. The code has been
+ rewritten from scratch in 1998. Thanks to Niels Möller for
+ reporting this misleading comment.
+
+2002-02-18 Werner Koch <wk@gnupg.org>
+
+ * rndunix.c (rndunix_constructor): Use the new prefixed
+ function name. Reported by Jordi Mallach.
+
+2002-02-10 Werner Koch <wk@gnupg.org>
+
+ * random.c (mix_pool): Carry an extra failsafe_digest buffer
+ around to make the function more robust.
+
+2002-02-08 Werner Koch <wk@gnupg.org>
+
+ * random.c (add_randomness): XOR new data into the pool and not
+ just copy it. This avoids any chosen-input attacks, which are not
+ serious in our setting because an outsider won't be able to mix
+ data in and even then we keep going with a PRNG. Thanks to Stefan
+ Keller for pointing this out.
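+
+ Illustration: folding incoming bytes into the pool with XOR (instead
+ of overwriting) means externally supplied data can never erase
+ entropy that is already there.  A minimal sketch of the idea, not the
+ libgcrypt pool code; POOLSIZE and the pool layout are simplified:
+
+   #include <stddef.h>
+
+   #define POOLSIZE 600                  /* illustrative size only */
+
+   static unsigned char pool[POOLSIZE];
+   static size_t pool_pos;
+
+   /* Fold LEN externally supplied bytes into the pool.  */
+   static void
+   add_external_randomness (const void *buffer, size_t len)
+   {
+     const unsigned char *p = buffer;
+
+     while (len--)
+       {
+         pool[pool_pos++] ^= *p++;       /* XOR, never a plain copy */
+         if (pool_pos >= POOLSIZE)
+           pool_pos = 0;
+       }
+   }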
+
+2002-01-04 Werner Koch <wk@gnupg.org>
+
+ * pubkey.c (gcry_pk_genkey): Do not release skey - it is static.
+
+ * primegen.c (gen_prime): Of course we should use set_bit
+ and not set_highbit to set the second high bit.
+
+2001-12-18 Werner Koch <wk@gnupg.org>
+
+ * rsa.c (generate): Loop until we find the exact modulus size.
+ Changed the exponent to 41.
+ (rsa_get_info): s/usage/r_usage/ to avoid shadow warnings.
+ * primegen.c (gen_prime): Set 2 high order bits for secret primes.
+
+ * Makefile.am (DISTCLEANFILES): Include construct.c.
+
+2001-12-17 Werner Koch <wk@gnupg.org>
+
+ * pubkey.c (gcry_pk_get_keygrip): New - experimental.
+
+2001-12-11 Werner Koch <wk@gnupg.org>
+
+ * cipher.c: Added OIDs for AES.
+ (gcry_cipher_mode_from_oid): New.
+ (gcry_cipher_map_name): Moved OID search code to ..
+ (search_oid): .. new function.
+
+2001-12-10 Werner Koch <wk@gnupg.org>
+
+ * pubkey.c (gcry_pk_encrypt): Find the signature algorithm by name
+ and not by number.
+
+ * pubkey.c (gcry_pk_encrypt,gcry_pk_decrypt,gcry_pk_sign)
+ (gcry_pk_verify,gcry_pk_testkey, gcry_pk_genkey)
+ (gcry_pk_get_nbits): Release the arrays. Noted by Nikos
+ Mavroyanopoulos.
+
+2001-12-06 Werner Koch <wk@gnupg.org>
+
+ * cipher.c (gcry_cipher_map_name): Look also for OIDs prefixed
+ with "oid." or "OID.".
+
+2001-12-05 Werner Koch <wk@gnupg.org>
+
+ * pubkey.c (algo_info_table): Fixed entry for openpgp-rsa.
+
+2001-11-24 Werner Koch <wk@gnupg.org>
+
+ * pubkey.c: Added the rsaEncryption OID to the tables.
+ (sexp_to_key): Add an arg to return the index of the algorithm,
+ changed all callers.
+ (gcry_pk_sign): Find the signature algorithm by name and not by
+ number.
+ (gcry_pk_get_nbits): Fixed so that we can now really pass a secret
+ key to get the result.
+
+ * md.c (gcry_md_map_name): Look also for OIDs prefixed with "oid."
+ or "OID." so that an OID string can be used as an S-Exp token.
+
+2001-11-20 Werner Koch <wk@gnupg.org>
+
+ * md.c (gcry_md_map_name): Lookup by OID if the name begins
+ with a digit.
+ (oid_table): New.
+
+2001-11-16 Werner Koch <wk@gnupg.org>
+
+ * md.c (gcry_md_info): New operator GCRYCTL_IS_ALGO_ENABLED.
+
+2001-11-07 Werner Koch <wk@gnupg.org>
+
+ * md.c (gcry_md_hash_buffer): Close the handle which was left open
+ for algorithms other than rmd160.
+
+2001-08-08 Werner Koch <wk@gnupg.org>
+
+ * rndw32.c (gather_random): Use toolhelp in addition to the NT
+ gatherer for Windows2000. Suggested by Sami Tolvanen.
+
+ * random.c (read_pool): Fixed length check, this used to be one
+ byte too strict. Made an assert out of it because the caller has
+ already made sure that only poolsize bytes are requested.
+ Reported by Marcus Brinkmann.
+
+2001-08-03 Werner Koch <wk@gnupg.org>
+
+ * cipher.c (cipher_encrypt, cipher_decrypt): Prepare to return
+ errors. We have to change the interface to all ciphers to make
+ this really work but we should do so to prepare for hardware
+ encryption modules.
+ (gcry_cipher_encrypt, gcry_cipher_decrypt): Return the error and
+ set lasterr.
+ (gcry_cipher_ctl): Make sure that errors from setkey are returned.
+
+2001-08-02 Werner Koch <wk@gnupg.org>
+
+ * rndlinux.c (gather_random): Cast a size_t arg to int so that
+ the format string is correct. Casting is okay here and avoids
+ translation changes.
+
+ * random.c (fast_random_poll): Do not check the return code of
+ getrusage.
+
+ * rndunix.c: Add a signal.h header to avoid warnings on Solaris 7
+ and 8.
+
+ * tiger.c (print_abc,print_data): Removed.
+
+ * rijndael.c, des.c, blowfish.c, twofish.c, cast5.c, arcfour.c
+ (burn_stack): New. Add wrappers for most functions to be able to
+ call burn_stack after the function invocation (see the sketch
+ after this entry). This method seems to be the most portable way
+ to zeroise the stack used. It only works on stack-frame based
+ machines but it is highly portable and has no side effects. Just
+ setting the automatic variables at the end of a function to zero
+ does not work well because the compiler will optimize them away -
+ marking them as volatile would be bad for performance.
+ * md5.c, sha1.c, rmd160.c, tiger.c (burn_stack): Likewise.
+ * random.c (burn_stack): New.
+ (mix_pool): Use it here to burn the stack of the mixblock function.
+
+ * primegen.c (_gcry_generate_elg_prime): Freed q at 3 places.
+ Thanks to Tommi Komulainen.
+
+ * arcfour.c (arcfour_setkey): Check the minimum key length against
+ bytes and not bits.
+ (selftest): Must reset the key before decryption.
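+
+ Sketch of the burn_stack idea mentioned above: after a sensitive
+ routine returns, a helper reuses roughly the same stack region with a
+ local buffer and overwrites it.  This is only an illustration of the
+ technique, not the actual _gcry_burn_stack implementation:
+
+   /* Overwrite about BYTES bytes of stack by recursing with a small
+      local buffer; volatile keeps the compiler from removing the
+      stores.  */
+   static void
+   burn_stack (int bytes)
+   {
+     volatile char buf[64];
+     unsigned int i;
+
+     for (i = 0; i < sizeof buf; i++)
+       buf[i] = 0;
+     bytes -= (int) sizeof buf;
+     if (bytes > 0)
+       burn_stack (bytes);
+   }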
+
+2001-05-31 Werner Koch <wk@gnupg.org>
+
+ * sha1.c (sha1_init): Made static.
+
+ Changed all g10_ prefixed function names as well as some mpi_
+ function names to cope with the introduced naming changes.
+
+ * md.c (prepare_macpads): Made key const.
+
+2001-05-28 Werner Koch <wk@gnupg.org>
+
+ * rndegd.c (gather_random): Removed the use of tty_printf.
+
+2001-03-29 Werner Koch <wk@gnupg.org>
+
+ * md5.c (md5_final): Fixed calculation of hashed length. Thanks
+ to disastry@saiknes.lv for pointing out that it was horribly wrong
+ for more than 512MB of input.
+ * sha1.c (sha1_final): Ditto.
+ * rmd160.c (rmd160_final): Ditto.
+ * tiger.c (tiger_final): Ditto.
+
+ * blowfish.c (encrypt,do_encrypt): Changed name to do_encrypt to
+ avoid name clashes with an encrypt function in stdlib.h of
+ Dynix/PIX. Thanks to Gene Carter.
+ * elgamal.c (encrypt,do_encrypt): Ditto.
+
+ * twofish.c (gnupgext_enum_func): Use only when compiled as a
+ module.
+ * rijndael.c (gnupgext_enum_func): Ditto.
+
+ * tiger.c (tiger_get_info): Return "TIGER192" and not just
+ "TIGER". By Edwin Woudt.
+
+ * random.c: Always include time.h - standard requirement. Thanks
+ to James Troup.
+
+ * rndw32.c: Fixes to the macros.
+
+2001-01-11 Werner Koch <wk@gnupg.org>
+
+ * cipher.c (cipher_encrypt,gcry_cipher_encrypt): Use blocksize and
+ not 8.
+
+2000-12-19 Werner Koch <wk@gnupg.org>
+
+ Major change:
+ Removed all GnuPG stuff and renamed this piece of software
+ to gcrypt.
+
+2000-11-14 Werner Koch <wk@gnupg.org>
+
+ * dsa.c (test_keys): Replaced mpi_alloc by gcry_mpi_new and
+ mpi_free by gcry_mpi_release.
+ * elgamal.c (test_keys,generate): Ditto, also for mpi_alloc_secure.
+ * rsa.c (test_keys,generate,rsa_verify): Ditto.
+ * primegen.c (generate_elg_prime): Ditto.
+ (gen_prime): Ditto and removed nlimbs.
+
+ * rsa.c (generate): Allocate 2 more vars in secure memory.
+
+ * Makefile.am (OMIT_DEPENDENCIES): Hack to work around dependency
+ problems.
+
+2000-10-09 Werner Koch <wk@gnupg.org>
+
+ * arcfour.c, arcfour.h: New.
+ * cipher.c (cipher_encrypt, cipher_decrypt): Add stream mode.
+ (setup_cipher_table): Add Arcfour.
+ (gcry_cipher_open): Kludge to allow stream mode.
+
+Wed Oct 4 13:16:18 CEST 2000 Werner Koch <wk@openit.de>
+
+ * sha1.c (transform): Use rol() macro. Actually this is not needed
+ for a newer gcc but there are still other compilers.
+
+ * rsa.c (test_keys): Use new random function.
+
+ * md.c (gcry_md_setkey): New function to overcome problems with
+ const conflicts.
+ (gcry_md_ctl): Pass set key to the new functions.
+
+ * rijndael.c: New.
+ * cipher.c: Add Rijndael support.
+
+Mon Sep 18 16:35:45 CEST 2000 Werner Koch <wk@openit.de>
+
+ * rndlinux.c (open_device): Loosen random device checking.
+ By Nils Ellmenreich.
+
+ * random.c (fast_random_poll): Check ENOSYS for getrusage.
+ * rndunix.c: Add 2 sources for QNX. By Sam Roberts.
+
+ * pubkey.c (gcry_pk_algo_info): Add GCRYCTL_GET_ALGO_USAGE.
+
+ * rsa.c: Changed the comment about the patent.
+ (secret): Speed up by using the CRT. For a 2k key this is
+ about 3 times faster (see the sketch after this entry).
+ (stronger_key_check): New but unused code to check the secret key.
+ * Makefile.am: Included rsa.[ch].
+ * pubkey.c: Enabled RSA support.
+ (pubkey_get_npkey): Removed RSA workaround.
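+
+ Sketch of the CRT speed-up mentioned above: the private operation
+ c^d mod n can be split into two half-size exponentiations mod p and
+ mod q and recombined, which is where the factor of about 3 comes
+ from for large keys.  A toy demonstration with small illustrative
+ parameters, not the libgcrypt MPI code:
+
+   #include <stdio.h>
+   #include <stdint.h>
+
+   static uint64_t
+   modpow (uint64_t b, uint64_t e, uint64_t n)
+   {
+     uint64_t r = 1;
+     b %= n;
+     while (e)
+       {
+         if (e & 1)
+           r = r * b % n;
+         b = b * b % n;
+         e >>= 1;
+       }
+     return r;
+   }
+
+   int
+   main (void)
+   {
+     /* Toy parameters (p=61, q=53); real keys use multi-precision
+        numbers.  */
+     uint64_t p = 61, q = 53, n = p * q, e = 17, d = 2753;
+     uint64_t c = modpow (42, e, n);            /* some ciphertext */
+
+     /* Straightforward private operation: one exponentiation mod n.  */
+     uint64_t m_plain = modpow (c, d, n);
+
+     /* CRT: two half-size exponentiations, then Garner recombination.
+        q_inv is q^-1 mod p: 38*53 = 2014 = 33*61 + 1.  */
+     uint64_t dp = d % (p - 1), dq = d % (q - 1), q_inv = 38;
+     uint64_t mp = modpow (c % p, dp, p);
+     uint64_t mq = modpow (c % q, dq, q);
+     uint64_t h = q_inv * ((mp + p - mq % p) % p) % p;
+     uint64_t m_crt = mq + h * q;
+
+     printf ("plain=%llu  crt=%llu\n",
+             (unsigned long long) m_plain, (unsigned long long) m_crt);
+     return 0;
+   }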
+
+Mon Jul 31 10:04:47 CEST 2000 Werner Koch <wk@openit.de>
+
+ * pubkey.c: Replaced all gcry_sexp_{car,cdr}_{data,mpi} by the new
+ gcry_sexp_nth_{data,mpi} functions.
+
+Tue Jul 25 17:44:15 CEST 2000 Werner Koch <wk@openit.de>
+
+ * pubkey.c (sexp_to_key,sexp_to_sig,sexp_to_enc,gcry_pk_encrypt,
+ gcry_pk_decrypt,gcry_pk_sign,gcry_pk_genkey): Changed to work with
+ the new S-Exp interface.
+
+Mon Jul 17 16:35:47 CEST 2000 Werner Koch <wk@>
+
+ * random.c (gather_faked): Replaced make_timestamp by time(2) again.
+
+Fri Jul 14 19:38:23 CEST 2000 Werner Koch <wk@>
+
+ * md.c (gcry_md_ctl): Support GCRYCTL_{START,STOP}_DUMP.
+
+ * Makefile.am: Never compile mingw32 as module.
+
+ * Makefile.am: Tweaked module build and removed libtool
+
+ * Makefile.am: Replaced -O1 by -O. Suggested by Alec Habig.
+
+ * elgamal.c (sign): Removed inactive code.
+
+ * rsa.c, rsa.h: New based on the old module version (only in CVS for now).
+ * pubkey.c (setup_pubkey_table): Added commented support for RSA.
+
+ * rndunix.c (waitpid): New. For UTS 2.1. All by Dave Dykstra.
+ (my_popen): Do the FD_CLOEXEC only if it is available
+ (start_gatherer): Cope with missing _SC_OPEN_MAX
+
+ * rndunix.c: Add some more headers for QNX. By Sam Roberts.
+
+ * rndegd.c (gather_random): Shortcut level 0.
+ * rndunix.c (gather_random): Ditto.
+ * rndw32.c (gather_random): Ditto.
+
+ * rndw32.c: Replaced with code from Cryptlib and commented the old stuff.
+ * rndw32.c: Add some debugging code enabled by an environment variable.
+
+ * random.c (read_seed_file): Binary open for DOSish system
+ (update_random_seed_file): Ditto.
+ * random.c [MINGW32]: Include process.h for getpid.
+ * random.c (fast_random_poll): Add clock_gettime() as fallback for
+ systems which support this POSIX.4 function. By Sam Roberts.
+
+ * random.c (read_seed_file): Removed the S_ISLNK test because it
+ is already covered by !S_ISREG and is not defined in Unixware.
+ Reported by Dave Dykstra.
+ (update_random_seed_file): Silently ignore update request when pool
+ is not filled.
+
+ * random.c (read_seed_file): New.
+ (set_random_seed_file): New.
+ (read_pool): Try to read the seeding file.
+ (update_random_seed_file): New.
+
+ (read_pool): Do an initial extra seeding when level 2 quality random
+ is requested the first time. This requests at least POOLSIZE/2 bytes
+ of entropy. Combined with the seeding file this should make normal
+ random bytes cheaper and increase the quality of the random bytes
+ used for key generation.
+
+ * random.c (read_pool): Print a more friendly error message in
+ cases when too much random is requested in one call.
+
+ * random.c (fast_random_poll): Check whether RUSAGE_SELF is defined;
+ this is not the case for some ESIX and Unixware, although they have
+ getrusage().
+
+ * primegen.c (generate_elg_prime): All primes are now generated with
+ the lowest random quality level. Because they are public anyway we
+ don't need stronger random and by this we do not drain the system's
+ entropy so much.
+
+ * primegen.c (register_primegen_progress): New.
+ * dsa.c (register_pk_dsa_progress): New.
+ * elgamal.c (register_pk_elg_progress): New.
+
+ * elgamal.c (wiener_map): New.
+ (gen_k): Use a much smaller k.
+ (generate): Calculate the qbits using the wiener map and
+ choose an x at a size comparable to the one chosen in gen_k.
+
+ * rmd160.c (rmd160_get_info): Moved casting to the left side due to a
+ problem with UTS4.3. Suggested by Dave Dykstra.
+ * sha1.c (sha1_get_info): Ditto.
+ * tiger.c (tiger_get_info): Ditto.
+ * md5.c (md5_get_info): Ditto
+ * des.c (des_get_info): Ditto.
+ * blowfish.c (blowfish_get_info): Ditto.
+ * cast5.c (cast5_get_info): Ditto.
+ * twofish.c (twofish_get_info): Ditto.
+
+Fri Mar 24 11:25:45 CET 2000 Werner Koch <wk@openit.de>
+
+ * md.c (md_open): Add hmac arg and allocate space for the pads.
+ (md_finalize): Add HMAC support.
+ (md_copy): Ditto.
+ (md_close): Ditto.
+ (gcry_md_reset): Ditto.
+ (gcry_md_ctl): Ditto.
+ (prepare_macpads): New.
+
+Mon Mar 13 19:22:46 CET 2000 Werner Koch <wk@openit.de>
+
+ * md.c (gcry_md_hash_buffer): Add support for the other algorithms.
+
+Mon Jan 31 16:37:34 CET 2000 Werner Koch <wk@gnupg.de>
+
+ * primegen.c (generate_elg_prime): Fixed returned factors which never
+ worked for non-DSA keys.
+
+Thu Jan 27 18:00:44 CET 2000 Werner Koch <wk@gnupg.de>
+
+ * pubkey.c (sexp_to_key): Fixed mem leaks in case of errors.
+
+Mon Jan 24 22:24:38 CET 2000 Werner Koch <wk@gnupg.de>
+
+ * pubkey.c (gcry_pk_decrypt): Implemented.
+ (gcry_pk_encrypt): Implemented.
+ (gcry_pk_testkey): New.
+ (gcry_pk_genkey): New.
+ (pubkey_decrypt): Made static.
+ (pubkey_encrypt): Ditto.
+ (pubkey_check_secret_key): Ditto.
+ (pubkey_generate): Ditto.
+
+Mon Jan 24 13:04:28 CET 2000 Werner Koch <wk@gnupg.de>
+
+ * pubkey.c (pubkey_nbits): Removed and replaced by ...
+ (gcry_pk_get_nbits): this new one.
+
+Wed Dec 8 21:58:32 CET 1999 Werner Koch <wk@gnupg.de>
+
+ * dsa.c: s/mpi_powm/gcry_mpi_powm/g
+ * elgamal.c: Ditto.
+ * primegen.c: Ditto.
+
+ * : Replaced g10_opt_verbose by g10_log_verbosity().
+
+ * Makefile.am (INCLUDES): removed intl, add ../gcrypt
+
+Fri Nov 19 17:15:20 CET 1999 Werner Koch <wk@gnupg.de>
+
+ * dynload.c (cmp_filenames): New to replace compare_filename() in
+ module.
+ (register_cipher_extension): Removed the tilde expansion stuff.
+ * rndegd.c (my_make_filename): New.
+
+ * : Replaced header util.h by g10lib.h
+
+ * random.c (gather_faked): Replaced make_timestamp by time(2).
+ Disabled warning printed with tty_printf.
+ * rndlinux.c (gather_random): Always use fprintf instead of tty_xxx;
+ this should be replaced by a callback function.
+
+ * primegen.c (gen_prime): Use gcry_mpi_randomize.
+ (is_prime): Ditto.
+ * elgamal.c (test_keys): Ditto.
+ * dsa.c (test_keys): Ditto.
+
+ * cipher.c (gcry_cipher_close): Die on invalid handle.
+
+Mon Nov 15 21:36:02 CET 1999 Werner Koch <wk@gnupg.de>
+
+ * elgamal.c (gen_k): Use the new random API.
+ (generate): Ditto.
+ * dsa.c (gen_k): Ditto.
+ (generate): Ditto.
+
+Sat Nov 13 17:44:23 CET 1999 Werner Koch <wk@gnupg.de>
+
+ * pubkey.c (disable_pubkey_algo): Made static.
+ (gcry_pk_ctl): New.
+
+ * random.c (get_random_bits): Renamed to ...
+ (get_random_bytes): ... this and made static.
+ (gcry_random_bytes): New.
+ (gcry_random_bytes_secure): New.
+ (randomize_buffer): Renamed to ...
+ (gcry_randomize): ...this.
+
+ * md.c (gcry_md_hash_buffer): New.
+
+ * pubkey.c (gcry_pk_algo_info): 4 new commands.
+ (pubkey_get_npkey): Made static.
+ (pubkey_get_nskey): Made static.
+ (pubkey_get_nsig): Made static.
+ (pubkey_get_nenc): Made static.
+
+ * pubkey.c: Removed all G10ERR_xxx.
+ * cipher.c: Changed all GCRYERR_INV_ALGO to GCRYERR_INV_CIPHER_ALGO.
+ * md.c: Changed all GCRYERR_INV_ALGO to GCRYERR_INV_MD_ALGO.
+ * cast5.c (cast_setkey): Changed error codes to GCRYERR_xxx.
+ * blowfish.c: Ditto.
+ * des.c: Ditto.
+ * twofish.c: Ditto.
+ * dsa.c: Ditto.
+ * elgamal.c: Ditto.
+
+ * g10c.c: Removed
+
+ * cipher.c (gcry_cipher_open): Replaced alloc functions and return NULL
+ if we are out of core.
+ * dynload.c: Replaced all memory allocation functions.
+ * md.c: Ditto.
+ * primegen.c: Ditto.
+ * pubkey.c: Ditto.
+ * random.c: Ditto.
+ * rndw32.c: Ditto.
+ * elgamal.c: Ditto.
+ * dsa.c: Ditto.
+
+Tue Oct 26 14:10:21 CEST 1999 Werner Koch <wk@gnupg.de>
+
+ * elgamal.c (sign): Hugh found strange code here. Replaced by BUG().
+
+ * cipher.c: Merged with gcrypt/symapi.c.
+
+ * pubkey.c (string_to_pubkey_algo): Renamed function to ...
+ (gcry_pk_map_name): ... this.
+ (pubkey_algo_to_string): Renamed function to ...
+ (gcry_pk_algo_name): ... this.
+ (gcry_pk_algo_info): New.
+ * pubkey.c: Merged with gcrypt/pkapi.c.
+
+ * md.c (md_reset): Clear finalized; thanks to Ulf Moeller for
+ fixing this bug.
+
+ * md.c: Merged with gcrypt/mdapi.c
+
+Wed Sep 15 14:39:59 CEST 1999 Michael Roth <mroth@nessie.de>
+
+ * des.c: Various speed improvements: one-bit pre-rotation
+ trick after initial permutation (Richard Outerbridge).
+ Finished test of SSLeay Triple-DES patterns.
+
+Wed Sep 15 16:22:17 CEST 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * rndw32.c: New.
+
+Mon Sep 13 10:51:29 CEST 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * bithelp.h: New.
+ * rmd160.h, sha1.h, md5.h: Use the rol macro from bithelp.h
+
+Tue Sep 7 16:23:36 CEST 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * Makefile.am: Fixed seds for latest egcc. By Ollivier Robert.
+
+Mon Sep 6 19:59:08 CEST 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * des.c (selftest): Add some testpattern
+
+Mon Aug 30 20:38:33 CEST 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * cipher.c (do_cbc_encrypt): Fixed serious bug occurring when not
+ using in-place encryption. Pointed out by Frank Stajano.
+
+Mon Jul 26 09:34:46 CEST 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * md5.c (md5_final): Fix for a SCO cpp bug.
+
+Thu Jul 15 10:15:35 CEST 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * elgamal.c (elg_check_secret_key,elg_encrypt,
+ elg_decrypt,elg_sign,elg_verify): Sanity check on the args.
+ * dsa.c (dsa_check_secret_key,dsa_sign,dsa_verify): Ditto.
+
+ * pubkey.c (disable_pubkey_algo): New.
+ (check_pubkey_algo2): Look at disabled algo table.
+ * cipher.c (disable_cipher_algo): New.
+ (check_cipher_algo): Look at disabled algo table.
+
+Wed Jul 7 13:08:40 CEST 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * Makefile.am: Support for libtool.
+
+Fri Jul 2 11:45:54 CEST 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * dsa.c (gen_k): Changed algorithm to consume less random bytes
+ * elgamal.c (gen_k): Ditto.
+
+ * random.c (random_dump_stats): New.
+
+Thu Jul 1 12:47:31 CEST 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * primegen.c, elgamal.c, dsa.c (progress): New and replaced all
+ fputc with a call to this function.
+
+Sat Jun 26 12:15:59 CEST 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * rndegd.c (do_write): s/ssize_t/int/ due to SunOS 4.1 probs.
+
+ * cipher.c (do_cbc_encrypt, do_cbc_decrypt): New.
+
+ * dynload.c (HAVE_DL_SHL_LOAD): Map hpux API to dlopen (Dave Dykstra).
+ * Makefile.am (install-exec-hook): Removed.
+
+Sun May 23 14:20:22 CEST 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * cipher.c (setup_cipher_table): Enable Twofish
+
+ * random.c (fast_random_poll): Disable use of times() for mingw32.
+
+Mon May 17 21:54:43 CEST 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * dynload.c (register_internal_cipher_extension): Minor init fix.
+
+Tue May 4 15:47:53 CEST 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * primegen.c (gen_prime): Readded the Fermat test. Fixed the bug
+ that we didn't correct for step when passing the prime to the
+ Rabin-Miller test which led to bad performance (Stefan Keller).
+ (check_prime): Add a first Fermat test.
+
+Sun Apr 18 10:11:28 CEST 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * cipher.c (cipher_setiv): Add ivlen arg, changed all callers.
+
+ * random.c (randomize_buffer): always use secure memory because
+ we can't use m_is_secure() on a statically allocated buffer.
+
+ * twofish.c: Replaced some macros by a loop to reduce text size.
+ * Makefile.am (twofish): No more need for sed editing.
+
+Fri Apr 9 12:26:25 CEST 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * cipher.c (cipher_open): Reversed the changes for AUTO_CFB.
+
+ * blowfish.c: Dropped the Blowfish 160 mode.
+ * cipher.c (cipher_open): Ditto.
+ (setup_cipher_table): Ditto. And removed support of twofish128
+
+Wed Apr 7 20:51:39 CEST 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * random.c (get_random_bits): Can now handle requests > POOLSIZE
+
+ * cipher.c (cipher_open): Now uses standard CFB for automode if
+ the blocksize is gt 8 (according to rfc2440).
+
+ * twofish.c: Applied Matthew Skala's patches for 256 bit key.
+
+Tue Apr 6 19:58:12 CEST 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * random.c (get_random_bits): Can now handle requests > POOLSIZE
+
+ * cipher.c (cipher_open): Now uses standard CFB for automode if
+ the blocksize is gt 8 (according to rfc2440).
+
+Sat Mar 20 11:44:21 CET 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * rndlinux.c (tty_printf) [IS_MODULE]: Removed.
+
+ * rndegd.c (gather_random): Some fixes.
+
+Wed Mar 17 13:09:03 CET 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * rndegd.c (do_read): New.
+ (gather_random): Changed the implementation.
+
+Mon Mar 8 20:47:17 CET 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * dynload.c (DLSYM_NEEDS_UNDERSCORE): Renamed.
+
+Fri Feb 26 17:55:41 CET 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * md.c: Nearly a total rewrite.
+
+Wed Feb 24 11:07:27 CET 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * cipher.c (context): Fixed alignment
+ * md.c: Ditto.
+
+ * rndegd.c: New
+
+Mon Feb 22 20:04:00 CET 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * rndegd.c: New.
+
+Wed Feb 10 17:15:39 CET 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * Makefile.am: Modules are now figured out by configure
+ * construct.c: New. Generated by configure. Changed all modules
+ to work with that.
+ * sha1.h: Removed.
+ * md5.h: Removed.
+
+ * twofish.c: Changed interface to allow Twofish/256
+
+ * rndunix.c (start_gatherer): Die on SIGPIPE.
+
+Wed Jan 20 18:59:49 CET 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * rndunix.c (gather_random): Fix to avoid infinite loop.
+
+Sun Jan 17 11:04:33 CET 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * des.c (is_weak_key): Replace system memcmp due to bugs
+ in SunOS's memcmp.
+ (des_get_info): Return error on failed selftest.
+ * twofish.c (twofish_setkey): Return error on failed selftest or
+ invalid keylength.
+ * cast5.c (cast_setkey): Ditto.
+ * blowfish.c (bf_setkey): Return error on failed selftest.
+
+Tue Jan 12 11:17:18 CET 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * random.c (random_is_faked): New.
+
+ * tiger.c: Only compile if we have the u64 type
+
+Sat Jan 9 16:02:23 CET 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * rndunix.c (gather_random): check for setuid.
+
+ * Makefile.am: Add a way to statically link random modules
+
+Thu Jan 7 18:00:58 CET 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * md.c (md_stop_debug): Do a flush first.
+ (md_open): size of buffer now depends on the secure parameter
+
+Sun Jan 3 15:28:44 CET 1999 Werner Koch <wk@isil.d.shuttle.de>
+
+ * rndunix.c (start_gatherer): Fixed stupid ==/= bug
+
+1998-12-31 Geoff Keating <geoffk@ozemail.com.au>
+
+ * des.c (is_weak_key): Rewrite loop end condition.
+
+Tue Dec 29 14:41:47 CET 1998 Werner Koch <wk@isil.d.shuttle.de>
+
+ * random.c: add unistd.h for getpid().
+ (RAND_MAX): Fallback value for Sun.
+
+Wed Dec 23 17:12:24 CET 1998 Werner Koch <wk@isil.d.shuttle.de>
+
+ * md.c (md_copy): Reset debug.
+
+Mon Dec 14 21:18:49 CET 1998 Werner Koch <wk@isil.d.shuttle.de>
+
+ * random.c (read_random_source): Changed the interface to the
+ random gathering function.
+ (gather_faked): Use new interface.
+ * dynload.c (dynload_getfnc_fast_random_poll): Ditto.
+ (dynload_getfnc_gather_random): Ditto.
+ * rndlinux.c (gather_random): Ditto.
+ * rndunix.c (gather_random): Ditto.
+
+Sat Dec 12 18:40:32 CET 1998 Werner Koch <wk@isil.d.shuttle.de>
+
+ * dynload.c (SYMBOL_VERSION): New to cope with system which needs
+ underscores.
+
+ * rndunix.c: Rewrote large parts
+
+Thu Dec 10 20:15:36 CET 1998 Werner Koch <wk@isil.d.shuttle.de>
+
+ * dynload.c (load_extension): increased needed verbosity level.
+
+ * random.c (fast_random_poll): Fallback to a default fast random
+ poll function.
+ (read_random_source): Always use the faked entropy gatherer if no
+ gather module is available.
+ * rndlinux.c (fast_poll): Removed.
+ * rndunix.c (fast_poll): Removed.
+
+
+Wed Nov 25 12:33:41 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * rand-*.c: Removed.
+ * rndlinux.c : New.
+ * rndunix.c : New.
+ * random.c : Restructured the interface to the gather modules.
+ (initialize): Call constructor functions.
+ (read_random_source): Moved to here.
+ * dynload.c (dynload_getfnc_gather_random): New.
+ (dynload_getfnc_fast_random_poll): New.
+ (register_internal_cipher_extension): New.
+ (register_cipher_extension): Support of internal modules.
+
+Sun Nov 8 17:44:36 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * rand-unix.c (read_random_source): Removed the assert.
+
+Mon Oct 19 18:34:30 1998 me,,, (wk@tobold)
+
+ * pubkey.c: Hack to allow us to give some info about RSA keys back.
+
+Thu Oct 15 11:47:57 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * dynload.c: Support for DLD
+
+Wed Oct 14 12:13:07 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * rand-unix.c: Now uses names from configure for /dev/random.
+
+1998-10-10 SL Baur <steve@altair.xemacs.org>
+
+ * Makefile.am: fix sed -O substitutions to catch -O6, etc.
+
+Tue Oct 6 10:06:32 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * rand-unix.c (HAVE_GETTIMEOFDAY): Fixed (was ..GETTIMEOFTIME :-)
+ * rand-dummy.c (HAVE_GETTIMEOFDAY): Ditto.
+
+Mon Sep 28 13:23:09 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * md.c (md_digest): New.
+ (md_reset): New.
+
+Wed Sep 23 12:27:02 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * tiger.c (TIGER_CONTEXT): moved "buf", so that it is 64 bit aligned.
+
+Mon Sep 21 06:22:53 1998 Werner Koch (wk@(none))
+
+ * des.c: Some patches from Michael.
+
+Thu Sep 17 19:00:06 1998 Werner Koch (wk@(none))
+
+ * des.c : New file from Michael Roth <mroth@nessie.de>
+
+Mon Sep 14 11:10:55 1998 Werner Koch (wk@(none))
+
+ * blowfish.c (bf_setkey): Niklas Hernaeus patch to detect weak keys.
+
+Mon Sep 14 09:19:25 1998 Werner Koch (wk@(none))
+
+ * dynload.c (RTLD_NOW): Now defined to 1 if it is undefined.
+
+Mon Sep 7 17:04:33 1998 Werner Koch (wk@(none))
+
+ * Makefile.am: Fixes to allow a different build directory
+
+Thu Aug 6 17:25:38 1998 Werner Koch,mobil,,, (wk@tobold)
+
+ * random.c (get_random_byte): Removed and changed all callers
+ to use get_random_bits()
+
+Mon Jul 27 10:30:22 1998 Werner Koch (wk@(none))
+
+ * cipher.c : Support for other blocksizes
+ (cipher_get_blocksize): New.
+ * twofish.c: New.
+ * Makefile.am: Add twofish module.
+
+Mon Jul 13 21:30:52 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * random.c (read_pool): Simple alloc if secure_alloc is not set.
+ (get_random_bits): Ditto.
+
+Thu Jul 9 13:01:14 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * dynload.c (load_extension): Function now bails out if
+ the program is run setuid.
+
+Wed Jul 8 18:58:23 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * rmd160.c (rmd160_hash_buffer): New.
+
+Thu Jul 2 10:50:30 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * cipher.c (cipher_open): algos >=100 use standard CFB
+
+Thu Jun 25 11:18:25 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * Makefile.am: Support for extensions
+
+Thu Jun 18 12:09:38 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * random.c (mix_pool): simpler handling for level 0
+
+Mon Jun 15 14:40:48 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * tiger.c: Removed from dist, will reappear as dynload module
+
+Sat Jun 13 14:16:57 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * pubkey.c: Major changes to allow extensions. Changed the interface
+ of all public key ciphers and added the ability to load extensions
+ on demand.
+
+ * misc.c: Removed.
+
+Wed Jun 10 07:52:08 1998 Werner Koch,mobil,,, (wk@tobold)
+
+ * dynload.c: New.
+ * cipher.c: Major changes to allow extensions.
+
+Mon Jun 8 22:43:00 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * cipher.c: Major internal changes to support extensions.
+ * blowfish.c (blowfish_get_info): New and made all internal
+ functions static, changed header.
+ * cast5.c (cast5_get_info): Likewise.
+
+Mon Jun 8 12:27:52 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * tiger.c (transform): Fix for big endian
+
+ * cipher.c (do_cfb_decrypt): Big endian fix.
+
+Fri May 22 07:30:39 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * md.c (md_get_oid): Add a new one for TIGER.
+
+Thu May 21 13:24:52 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * cipher.c: Add support for a dummy cipher
+
+Thu May 14 15:40:36 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * rmd160.c (transform): fixed sigbus - I should better
+ add Christian von Roques's new implementation of rmd160_write.
+
+Fri May 8 18:07:44 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * rand-internal.h, rand-unix.c, rand-w32.c, rand_dummy.c: New
+ * random.c: Moved system specific functions to rand-****.c
+
+Fri May 8 14:01:17 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * random.c (fast_random_poll): add call to gethrtime.
+
+Tue May 5 21:28:55 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * elgamal.c (elg_generate): choosing x was not correct, could
+ yield 6 bytes which are not from the random pool, tsss, tsss..
+
+Tue May 5 14:09:06 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * primegen.c (generate_elg_prime): Add arg mode, changed all
+ callers and implemented mode 1.
+
+Mon Apr 27 14:41:58 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * cipher.c (cipher_get_keylen): New.
+
+Sun Apr 26 14:44:52 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * tiger.c, tiger.h: New.
+
+Wed Apr 8 14:57:11 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * misc.c (check_pubkey_algo2): New.
+
+Tue Apr 7 18:46:49 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * cipher.c: New
+ * misc.c (check_cipher_algo): Moved to cipher.c
+ * cast5.c: Moved many functions to cipher.c
+ * blowfish.c: Likewise.
+
+Sat Apr 4 19:52:08 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * cast5.c: Implemented and tested.
+
+Wed Apr 1 16:38:27 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * elgamal.c (elg_generate): Faster generation of x in some cases.
+
+Thu Mar 19 13:54:48 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * blowfish.c (blowfish_decode_cfb): changed XOR operation
+ (blowfish_encode_cfb): Ditto.
+
+Thu Mar 12 14:04:05 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * sha1.c (transform): Rewrote
+
+ * blowfish.c (encrypt): Unrolled for rounds == 16
+ (decrypt): Ditto.
+
+Tue Mar 10 16:32:08 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * rmd160.c (transform): Unrolled the loop.
+
+Tue Mar 10 13:05:14 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * random.c (read_pool): Add pool_balance stuff.
+ (get_random_bits): New.
+
+ * elgamal.c (elg_generate): Now uses get_random_bits to generate x.
+
+
+Tue Mar 10 11:33:51 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * md.c (md_digest_length): New.
+
+Tue Mar 10 11:27:41 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * dsa.c (dsa_verify): Works.
+
+Mon Mar 9 12:59:08 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * dsa.c, dsa.h: Removed some unused code.
+
+Wed Mar 4 10:39:22 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * md.c (md_open): Add call to fast_random_poll.
+ blowfish.c (blowfish_setkey): Ditto.
+
+Tue Mar 3 13:32:54 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * rmd160.c (rmd160_mixblock): New.
+ * random.c: Restructured to start with a new RNG implementation.
+ * random.h: New.
+
+Mon Mar 2 19:21:46 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * gost.c, gost.h: Removed because they did only contain trash.
+
+Sun Mar 1 16:42:29 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * random.c (fill_buffer): removed error message if n == -1.
+
+Fri Feb 27 16:39:34 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * md.c (md_enable): No init if called twice.
+
+Thu Feb 26 07:57:02 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * primegen.c (generate_elg_prime): Changed the progress printing.
+ (gen_prime): Ditto.
+
+Tue Feb 24 12:28:42 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * md5.c, md.5 : Replaced by a modified version of md5.c from
+ GNU textutils 1.22.
+
+Wed Feb 18 14:08:30 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * md.c, md.h : New debugging support
+
+Mon Feb 16 10:08:47 1998 Werner Koch (wk@isil.d.shuttle.de)
+
+ * misc.c (cipher_algo_to_string): New
+ (pubkey_algo_to_string): New.
+ (digest_algo_to_string): New.
+
+
+ Copyright 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006
+ 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+ This file is free software; as a special exception the author gives
+ unlimited permission to copy and/or distribute it, with or without
+ modifications, as long as this notice is preserved.
+
+ This file is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY, to the extent permitted by law; without even the
+ implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+Local Variables:
+buffer-read-only: t
+End:
diff --git a/comm/third_party/libgcrypt/cipher/Makefile.am b/comm/third_party/libgcrypt/cipher/Makefile.am
new file mode 100644
index 0000000000..d644005634
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/Makefile.am
@@ -0,0 +1,258 @@
+# Makefile for cipher modules
+# Copyright (C) 1998, 1999, 2000, 2001, 2002,
+# 2003, 2009 Free Software Foundation, Inc.
+#
+# This file is part of Libgcrypt.
+#
+# Libgcrypt is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation; either version 2.1 of
+# the License, or (at your option) any later version.
+#
+# Libgcrypt is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+# Process this file with automake to produce Makefile.in
+
+# Need to include ../src in addition to top_srcdir because gcrypt.h is
+# a built header.
+AM_CPPFLAGS = -I../src -I$(top_srcdir)/src -I../mpi -I$(top_srcdir)/mpi
+AM_CFLAGS = $(GPG_ERROR_CFLAGS)
+
+AM_CCASFLAGS = $(NOEXECSTACK_FLAGS)
+
+EXTRA_DIST = gost-s-box.c
+
+CLEANFILES = gost-s-box
+DISTCLEANFILES = gost-sb.h
+
+noinst_LTLIBRARIES = libcipher.la
+
+GCRYPT_MODULES = @GCRYPT_CIPHERS@ @GCRYPT_PUBKEY_CIPHERS@ \
+ @GCRYPT_DIGESTS@ @GCRYPT_KDFS@
+
+libcipher_la_DEPENDENCIES = $(GCRYPT_MODULES)
+libcipher_la_LIBADD = $(GCRYPT_MODULES)
+
+libcipher_la_SOURCES = \
+ cipher.c cipher-internal.h \
+ cipher-cbc.c \
+ cipher-cfb.c \
+ cipher-ofb.c \
+ cipher-ctr.c \
+ cipher-aeswrap.c \
+ cipher-ccm.c \
+ cipher-cmac.c \
+ cipher-gcm.c cipher-gcm-intel-pclmul.c cipher-gcm-armv7-neon.S \
+ cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \
+ cipher-poly1305.c \
+ cipher-ocb.c \
+ cipher-xts.c \
+ cipher-eax.c \
+ cipher-selftest.c cipher-selftest.h \
+ pubkey.c pubkey-internal.h pubkey-util.c \
+ md.c \
+ mac.c mac-internal.h \
+ mac-hmac.c mac-cmac.c mac-gmac.c mac-poly1305.c \
+ poly1305.c poly1305-internal.h \
+ poly1305-s390x.S \
+ kdf.c kdf-internal.h \
+ bithelp.h \
+ bufhelp.h \
+ primegen.c \
+ hash-common.c hash-common.h \
+ dsa-common.c rsa-common.c \
+ sha1.h
+
+EXTRA_libcipher_la_SOURCES = \
+ asm-common-aarch64.h \
+ asm-common-amd64.h \
+ asm-common-s390x.h \
+ asm-inline-s390x.h \
+ asm-poly1305-aarch64.h \
+ asm-poly1305-amd64.h \
+ asm-poly1305-s390x.h \
+ arcfour.c arcfour-amd64.S \
+ blowfish.c blowfish-amd64.S blowfish-arm.S \
+ cast5.c cast5-amd64.S cast5-arm.S \
+ chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \
+ chacha20-armv7-neon.S chacha20-aarch64.S \
+ chacha20-ppc.c chacha20-s390x.S \
+ crc.c crc-intel-pclmul.c crc-armv8-ce.c \
+ crc-armv8-aarch64-ce.S \
+ crc-ppc.c \
+ des.c des-amd64.S \
+ dsa.c \
+ elgamal.c \
+ ecc.c ecc-curves.c ecc-misc.c ecc-common.h \
+ ecc-ecdh.c ecc-ecdsa.c ecc-eddsa.c ecc-gost.c ecc-sm2.c \
+ idea.c \
+ gost28147.c gost.h \
+ gostr3411-94.c \
+ md4.c \
+ md5.c \
+ rijndael.c rijndael-internal.h rijndael-tables.h \
+ rijndael-aesni.c rijndael-padlock.c \
+ rijndael-amd64.S rijndael-arm.S \
+ rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S \
+ rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S \
+ rijndael-armv8-aarch64-ce.S rijndael-aarch64.S \
+ rijndael-ppc.c rijndael-ppc9le.c \
+ rijndael-ppc-common.h rijndael-ppc-functions.h \
+ rijndael-s390x.c \
+ rmd160.c \
+ rsa.c \
+ salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
+ scrypt.c \
+ seed.c \
+ serpent.c serpent-sse2-amd64.S \
+ sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S \
+ serpent-avx2-amd64.S serpent-armv7-neon.S \
+ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
+ sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \
+ sha1-armv8-aarch64-ce.S sha1-intel-shaext.c \
+ sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S \
+ sha256-avx2-bmi2-amd64.S \
+ sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \
+ sha256-intel-shaext.c sha256-ppc.c \
+ sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S \
+ sha512-avx2-bmi2-amd64.S \
+ sha512-armv7-neon.S sha512-arm.S \
+ sha512-ppc.c sha512-ssse3-i386.c \
+ sm3.c \
+ keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
+ stribog.c \
+ tiger.c \
+ whirlpool.c whirlpool-sse2-amd64.S \
+ twofish.c twofish-amd64.S twofish-arm.S twofish-aarch64.S \
+ twofish-avx2-amd64.S \
+ rfc2268.c \
+ camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
+ camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S \
+ blake2.c \
+ blake2b-amd64-avx2.S blake2s-amd64-avx.S
+
+gost28147.lo: gost-sb.h
+gost-sb.h: gost-s-box
+ ./gost-s-box $@
+
+gost-s-box: gost-s-box.c
+ $(CC_FOR_BUILD) $(CFLAGS_FOR_BUILD) $(LDFLAGS_FOR_BUILD) \
+ $(CPPFLAGS_FOR_BUILD) -o $@ $(srcdir)/gost-s-box.c
+
+
+if ENABLE_O_FLAG_MUNGING
+o_flag_munging = sed -e 's/-O\([2-9sg][2-9sg]*\)/-O1/' -e 's/-Ofast/-O1/g'
+else
+o_flag_munging = cat
+endif
+
+
+# We need to lower the optimization for this module.
+tiger.o: $(srcdir)/tiger.c Makefile
+ `echo $(COMPILE) -c $< | $(o_flag_munging) `
+
+tiger.lo: $(srcdir)/tiger.c Makefile
+ `echo $(LTCOMPILE) -c $< | $(o_flag_munging) `
+
+
+# We need to disable instrumentation for these modules as they use cc as
+# thin assembly front-end and do not tolerate in-between function calls
+# inserted by compiler as those functions may clobber the XMM registers.
+if ENABLE_INSTRUMENTATION_MUNGING
+instrumentation_munging = sed \
+ -e 's/-fsanitize[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g' \
+ -e 's/-fprofile[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g' \
+ -e 's/-fcoverage[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g'
+else
+instrumentation_munging = cat
+endif
+
+rijndael-aesni.o: $(srcdir)/rijndael-aesni.c Makefile
+ `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+rijndael-aesni.lo: $(srcdir)/rijndael-aesni.c Makefile
+ `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+rijndael-ssse3-amd64.o: $(srcdir)/rijndael-ssse3-amd64.c Makefile
+ `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+rijndael-ssse3-amd64.lo: $(srcdir)/rijndael-ssse3-amd64.c Makefile
+ `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+cipher-gcm-intel-pclmul.o: $(srcdir)/cipher-gcm-intel-pclmul.c Makefile
+ `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+cipher-gcm-intel-pclmul.lo: $(srcdir)/cipher-gcm-intel-pclmul.c Makefile
+ `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+sha1-intel-shaext.o: $(srcdir)/sha1-intel-shaext.c Makefile
+ `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+sha1-intel-shaext.lo: $(srcdir)/sha1-intel-shaext.c Makefile
+ `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+sha256-intel-shaext.o: $(srcdir)/sha256-intel-shaext.c Makefile
+ `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+sha256-intel-shaext.lo: $(srcdir)/sha256-intel-shaext.c Makefile
+ `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+sha256-ssse3-i386.o: $(srcdir)/sha256-ssse3-i386.c Makefile
+ `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+sha256-ssse3-i386.lo: $(srcdir)/sha256-ssse3-i386.c Makefile
+ `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+crc-intel-pclmul.o: $(srcdir)/crc-intel-pclmul.c Makefile
+ `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+crc-intel-pclmul.lo: $(srcdir)/crc-intel-pclmul.c Makefile
+ `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+if ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS
+ppc_vcrypto_cflags = -maltivec -mvsx -mcrypto
+else
+ppc_vcrypto_cflags =
+endif
+
+rijndael-ppc.o: $(srcdir)/rijndael-ppc.c Makefile
+ `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+rijndael-ppc.lo: $(srcdir)/rijndael-ppc.c Makefile
+ `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+rijndael-ppc9le.o: $(srcdir)/rijndael-ppc9le.c Makefile
+ `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+rijndael-ppc9le.lo: $(srcdir)/rijndael-ppc9le.c Makefile
+ `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+sha256-ppc.o: $(srcdir)/sha256-ppc.c Makefile
+ `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+sha256-ppc.lo: $(srcdir)/sha256-ppc.c Makefile
+ `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+sha512-ppc.o: $(srcdir)/sha512-ppc.c Makefile
+ `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+sha512-ppc.lo: $(srcdir)/sha512-ppc.c Makefile
+ `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+chacha20-ppc.o: $(srcdir)/chacha20-ppc.c Makefile
+ `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+chacha20-ppc.lo: $(srcdir)/chacha20-ppc.c Makefile
+ `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+crc-ppc.o: $(srcdir)/crc-ppc.c Makefile
+ `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+crc-ppc.lo: $(srcdir)/crc-ppc.c Makefile
+ `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
diff --git a/comm/third_party/libgcrypt/cipher/Makefile.in b/comm/third_party/libgcrypt/cipher/Makefile.in
new file mode 100644
index 0000000000..ceba51b45a
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/Makefile.in
@@ -0,0 +1,1445 @@
+# Makefile.in generated by automake 1.16.1 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2018 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# Makefile for cipher modules
+# Copyright (C) 1998, 1999, 2000, 2001, 2002,
+# 2003, 2009 Free Software Foundation, Inc.
+#
+# This file is part of Libgcrypt.
+#
+# Libgcrypt is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation; either version 2.1 of
+# the License, or (at your option) any later version.
+#
+# Libgcrypt is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+# Process this file with automake to produce Makefile.in
+
+VPATH = @srcdir@
+am__is_gnu_make = { \
+ if test -z '$(MAKELEVEL)'; then \
+ false; \
+ elif test -n '$(MAKE_HOST)'; then \
+ true; \
+ elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
+ true; \
+ else \
+ false; \
+ fi; \
+}
+am__make_running_with_option = \
+ case $${target_option-} in \
+ ?) ;; \
+ *) echo "am__make_running_with_option: internal error: invalid" \
+ "target option '$${target_option-}' specified" >&2; \
+ exit 1;; \
+ esac; \
+ has_opt=no; \
+ sane_makeflags=$$MAKEFLAGS; \
+ if $(am__is_gnu_make); then \
+ sane_makeflags=$$MFLAGS; \
+ else \
+ case $$MAKEFLAGS in \
+ *\\[\ \ ]*) \
+ bs=\\; \
+ sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+ | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
+ esac; \
+ fi; \
+ skip_next=no; \
+ strip_trailopt () \
+ { \
+ flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+ }; \
+ for flg in $$sane_makeflags; do \
+ test $$skip_next = yes && { skip_next=no; continue; }; \
+ case $$flg in \
+ *=*|--*) continue;; \
+ -*I) strip_trailopt 'I'; skip_next=yes;; \
+ -*I?*) strip_trailopt 'I';; \
+ -*O) strip_trailopt 'O'; skip_next=yes;; \
+ -*O?*) strip_trailopt 'O';; \
+ -*l) strip_trailopt 'l'; skip_next=yes;; \
+ -*l?*) strip_trailopt 'l';; \
+ -[dEDm]) skip_next=yes;; \
+ -[JT]) skip_next=yes;; \
+ esac; \
+ case $$flg in \
+ *$$target_option*) has_opt=yes; break;; \
+ esac; \
+ done; \
+ test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = cipher
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_cc_for_build.m4 \
+ $(top_srcdir)/m4/gpg-error.m4 $(top_srcdir)/m4/libtool.m4 \
+ $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
+ $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
+ $(top_srcdir)/m4/noexecstack.m4 $(top_srcdir)/m4/socklen.m4 \
+ $(top_srcdir)/acinclude.m4 $(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+ $(ACLOCAL_M4)
+DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+am__DEPENDENCIES_1 =
+am_libcipher_la_OBJECTS = cipher.lo cipher-cbc.lo cipher-cfb.lo \
+ cipher-ofb.lo cipher-ctr.lo cipher-aeswrap.lo cipher-ccm.lo \
+ cipher-cmac.lo cipher-gcm.lo cipher-gcm-intel-pclmul.lo \
+ cipher-gcm-armv7-neon.lo cipher-gcm-armv8-aarch32-ce.lo \
+ cipher-gcm-armv8-aarch64-ce.lo cipher-poly1305.lo \
+ cipher-ocb.lo cipher-xts.lo cipher-eax.lo cipher-selftest.lo \
+ pubkey.lo pubkey-util.lo md.lo mac.lo mac-hmac.lo mac-cmac.lo \
+ mac-gmac.lo mac-poly1305.lo poly1305.lo poly1305-s390x.lo \
+ kdf.lo primegen.lo hash-common.lo dsa-common.lo rsa-common.lo
+libcipher_la_OBJECTS = $(am_libcipher_la_OBJECTS)
+AM_V_lt = $(am__v_lt_@AM_V@)
+am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
+am__v_lt_0 = --silent
+am__v_lt_1 =
+AM_V_P = $(am__v_P_@AM_V@)
+am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_@AM_V@)
+am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
+am__v_GEN_0 = @echo " GEN " $@;
+am__v_GEN_1 =
+AM_V_at = $(am__v_at_@AM_V@)
+am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 =
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/build-aux/depcomp
+am__maybe_remake_depfiles = depfiles
+am__depfiles_remade = ./$(DEPDIR)/arcfour-amd64.Plo \
+ ./$(DEPDIR)/arcfour.Plo ./$(DEPDIR)/blake2.Plo \
+ ./$(DEPDIR)/blake2b-amd64-avx2.Plo \
+ ./$(DEPDIR)/blake2s-amd64-avx.Plo \
+ ./$(DEPDIR)/blowfish-amd64.Plo ./$(DEPDIR)/blowfish-arm.Plo \
+ ./$(DEPDIR)/blowfish.Plo ./$(DEPDIR)/camellia-aarch64.Plo \
+ ./$(DEPDIR)/camellia-aesni-avx-amd64.Plo \
+ ./$(DEPDIR)/camellia-aesni-avx2-amd64.Plo \
+ ./$(DEPDIR)/camellia-arm.Plo ./$(DEPDIR)/camellia-glue.Plo \
+ ./$(DEPDIR)/camellia.Plo ./$(DEPDIR)/cast5-amd64.Plo \
+ ./$(DEPDIR)/cast5-arm.Plo ./$(DEPDIR)/cast5.Plo \
+ ./$(DEPDIR)/chacha20-aarch64.Plo \
+ ./$(DEPDIR)/chacha20-amd64-avx2.Plo \
+ ./$(DEPDIR)/chacha20-amd64-ssse3.Plo \
+ ./$(DEPDIR)/chacha20-armv7-neon.Plo \
+ ./$(DEPDIR)/chacha20-ppc.Plo ./$(DEPDIR)/chacha20-s390x.Plo \
+ ./$(DEPDIR)/chacha20.Plo ./$(DEPDIR)/cipher-aeswrap.Plo \
+ ./$(DEPDIR)/cipher-cbc.Plo ./$(DEPDIR)/cipher-ccm.Plo \
+ ./$(DEPDIR)/cipher-cfb.Plo ./$(DEPDIR)/cipher-cmac.Plo \
+ ./$(DEPDIR)/cipher-ctr.Plo ./$(DEPDIR)/cipher-eax.Plo \
+ ./$(DEPDIR)/cipher-gcm-armv7-neon.Plo \
+ ./$(DEPDIR)/cipher-gcm-armv8-aarch32-ce.Plo \
+ ./$(DEPDIR)/cipher-gcm-armv8-aarch64-ce.Plo \
+ ./$(DEPDIR)/cipher-gcm-intel-pclmul.Plo \
+ ./$(DEPDIR)/cipher-gcm.Plo ./$(DEPDIR)/cipher-ocb.Plo \
+ ./$(DEPDIR)/cipher-ofb.Plo ./$(DEPDIR)/cipher-poly1305.Plo \
+ ./$(DEPDIR)/cipher-selftest.Plo ./$(DEPDIR)/cipher-xts.Plo \
+ ./$(DEPDIR)/cipher.Plo ./$(DEPDIR)/crc-armv8-aarch64-ce.Plo \
+ ./$(DEPDIR)/crc-armv8-ce.Plo ./$(DEPDIR)/crc-intel-pclmul.Plo \
+ ./$(DEPDIR)/crc-ppc.Plo ./$(DEPDIR)/crc.Plo \
+ ./$(DEPDIR)/des-amd64.Plo ./$(DEPDIR)/des.Plo \
+ ./$(DEPDIR)/dsa-common.Plo ./$(DEPDIR)/dsa.Plo \
+ ./$(DEPDIR)/ecc-curves.Plo ./$(DEPDIR)/ecc-ecdh.Plo \
+ ./$(DEPDIR)/ecc-ecdsa.Plo ./$(DEPDIR)/ecc-eddsa.Plo \
+ ./$(DEPDIR)/ecc-gost.Plo ./$(DEPDIR)/ecc-misc.Plo \
+ ./$(DEPDIR)/ecc-sm2.Plo ./$(DEPDIR)/ecc.Plo \
+ ./$(DEPDIR)/elgamal.Plo ./$(DEPDIR)/gost28147.Plo \
+ ./$(DEPDIR)/gostr3411-94.Plo ./$(DEPDIR)/hash-common.Plo \
+ ./$(DEPDIR)/idea.Plo ./$(DEPDIR)/kdf.Plo \
+ ./$(DEPDIR)/keccak-armv7-neon.Plo ./$(DEPDIR)/keccak.Plo \
+ ./$(DEPDIR)/mac-cmac.Plo ./$(DEPDIR)/mac-gmac.Plo \
+ ./$(DEPDIR)/mac-hmac.Plo ./$(DEPDIR)/mac-poly1305.Plo \
+ ./$(DEPDIR)/mac.Plo ./$(DEPDIR)/md.Plo ./$(DEPDIR)/md4.Plo \
+ ./$(DEPDIR)/md5.Plo ./$(DEPDIR)/poly1305-s390x.Plo \
+ ./$(DEPDIR)/poly1305.Plo ./$(DEPDIR)/primegen.Plo \
+ ./$(DEPDIR)/pubkey-util.Plo ./$(DEPDIR)/pubkey.Plo \
+ ./$(DEPDIR)/rfc2268.Plo ./$(DEPDIR)/rijndael-aarch64.Plo \
+ ./$(DEPDIR)/rijndael-aesni.Plo ./$(DEPDIR)/rijndael-amd64.Plo \
+ ./$(DEPDIR)/rijndael-arm.Plo \
+ ./$(DEPDIR)/rijndael-armv8-aarch32-ce.Plo \
+ ./$(DEPDIR)/rijndael-armv8-aarch64-ce.Plo \
+ ./$(DEPDIR)/rijndael-armv8-ce.Plo \
+ ./$(DEPDIR)/rijndael-padlock.Plo ./$(DEPDIR)/rijndael-ppc.Plo \
+ ./$(DEPDIR)/rijndael-ppc9le.Plo ./$(DEPDIR)/rijndael-s390x.Plo \
+ ./$(DEPDIR)/rijndael-ssse3-amd64-asm.Plo \
+ ./$(DEPDIR)/rijndael-ssse3-amd64.Plo ./$(DEPDIR)/rijndael.Plo \
+ ./$(DEPDIR)/rmd160.Plo ./$(DEPDIR)/rsa-common.Plo \
+ ./$(DEPDIR)/rsa.Plo ./$(DEPDIR)/salsa20-amd64.Plo \
+ ./$(DEPDIR)/salsa20-armv7-neon.Plo ./$(DEPDIR)/salsa20.Plo \
+ ./$(DEPDIR)/scrypt.Plo ./$(DEPDIR)/seed.Plo \
+ ./$(DEPDIR)/serpent-armv7-neon.Plo \
+ ./$(DEPDIR)/serpent-avx2-amd64.Plo \
+ ./$(DEPDIR)/serpent-sse2-amd64.Plo ./$(DEPDIR)/serpent.Plo \
+ ./$(DEPDIR)/sha1-armv7-neon.Plo \
+ ./$(DEPDIR)/sha1-armv8-aarch32-ce.Plo \
+ ./$(DEPDIR)/sha1-armv8-aarch64-ce.Plo \
+ ./$(DEPDIR)/sha1-avx-amd64.Plo \
+ ./$(DEPDIR)/sha1-avx-bmi2-amd64.Plo \
+ ./$(DEPDIR)/sha1-avx2-bmi2-amd64.Plo \
+ ./$(DEPDIR)/sha1-intel-shaext.Plo \
+ ./$(DEPDIR)/sha1-ssse3-amd64.Plo ./$(DEPDIR)/sha1.Plo \
+ ./$(DEPDIR)/sha256-armv8-aarch32-ce.Plo \
+ ./$(DEPDIR)/sha256-armv8-aarch64-ce.Plo \
+ ./$(DEPDIR)/sha256-avx-amd64.Plo \
+ ./$(DEPDIR)/sha256-avx2-bmi2-amd64.Plo \
+ ./$(DEPDIR)/sha256-intel-shaext.Plo ./$(DEPDIR)/sha256-ppc.Plo \
+ ./$(DEPDIR)/sha256-ssse3-amd64.Plo ./$(DEPDIR)/sha256.Plo \
+ ./$(DEPDIR)/sha512-arm.Plo ./$(DEPDIR)/sha512-armv7-neon.Plo \
+ ./$(DEPDIR)/sha512-avx-amd64.Plo \
+ ./$(DEPDIR)/sha512-avx2-bmi2-amd64.Plo \
+ ./$(DEPDIR)/sha512-ppc.Plo ./$(DEPDIR)/sha512-ssse3-amd64.Plo \
+ ./$(DEPDIR)/sha512-ssse3-i386.Plo ./$(DEPDIR)/sha512.Plo \
+ ./$(DEPDIR)/sm3.Plo ./$(DEPDIR)/sm4-aesni-avx-amd64.Plo \
+ ./$(DEPDIR)/sm4-aesni-avx2-amd64.Plo ./$(DEPDIR)/sm4.Plo \
+ ./$(DEPDIR)/stribog.Plo ./$(DEPDIR)/tiger.Plo \
+ ./$(DEPDIR)/twofish-aarch64.Plo ./$(DEPDIR)/twofish-amd64.Plo \
+ ./$(DEPDIR)/twofish-arm.Plo ./$(DEPDIR)/twofish-avx2-amd64.Plo \
+ ./$(DEPDIR)/twofish.Plo ./$(DEPDIR)/whirlpool-sse2-amd64.Plo \
+ ./$(DEPDIR)/whirlpool.Plo
+am__mv = mv -f
+CPPASCOMPILE = $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CCASFLAGS) $(CCASFLAGS)
+LTCPPASCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+ $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) \
+ $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+ $(AM_CCASFLAGS) $(CCASFLAGS)
+AM_V_CPPAS = $(am__v_CPPAS_@AM_V@)
+am__v_CPPAS_ = $(am__v_CPPAS_@AM_DEFAULT_V@)
+am__v_CPPAS_0 = @echo " CPPAS " $@;
+am__v_CPPAS_1 =
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+ $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
+ $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+ $(AM_CFLAGS) $(CFLAGS)
+AM_V_CC = $(am__v_CC_@AM_V@)
+am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
+am__v_CC_0 = @echo " CC " $@;
+am__v_CC_1 =
+CCLD = $(CC)
+LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+ $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+ $(AM_LDFLAGS) $(LDFLAGS) -o $@
+AM_V_CCLD = $(am__v_CCLD_@AM_V@)
+am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
+am__v_CCLD_0 = @echo " CCLD " $@;
+am__v_CCLD_1 =
+SOURCES = $(libcipher_la_SOURCES) $(EXTRA_libcipher_la_SOURCES)
+DIST_SOURCES = $(libcipher_la_SOURCES) $(EXTRA_libcipher_la_SOURCES)
+am__can_run_installinfo = \
+ case $$AM_UPDATE_INFO_DIR in \
+ n|no|NO) false;; \
+ *) (install-info --version) >/dev/null 2>&1;; \
+ esac
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+# Read a list of newline-separated strings from the standard input,
+# and print each of them once, without duplicates. Input order is
+# *not* preserved.
+am__uniquify_input = $(AWK) '\
+ BEGIN { nonempty = 0; } \
+ { items[$$0] = 1; nonempty = 1; } \
+ END { if (nonempty) { for (i in items) print i; }; } \
+'
+# Make sure the list of sources is unique. This is necessary because,
+# e.g., the same source file might be shared among _SOURCES variables
+# for different programs/libraries.
+am__define_uniq_tagged_files = \
+ list='$(am__tagged_files)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | $(am__uniquify_input)`
+ETAGS = etags
+CTAGS = ctags
+am__DIST_COMMON = $(srcdir)/Makefile.in \
+ $(top_srcdir)/build-aux/depcomp
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+AMTAR = @AMTAR@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+AR = @AR@
+AS = @AS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+BUILD_FILEVERSION = @BUILD_FILEVERSION@
+BUILD_REVISION = @BUILD_REVISION@
+BUILD_TIMESTAMP = @BUILD_TIMESTAMP@
+BUILD_VERSION = @BUILD_VERSION@
+CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
+CCDEPMODE = @CCDEPMODE@
+CC_FOR_BUILD = @CC_FOR_BUILD@
+CFLAGS = @CFLAGS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DL_LIBS = @DL_LIBS@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+EXEEXT_FOR_BUILD = @EXEEXT_FOR_BUILD@
+FALLBACK_SOCKLEN_T = @FALLBACK_SOCKLEN_T@
+FGREP = @FGREP@
+GCRYPT_CIPHERS = @GCRYPT_CIPHERS@
+GCRYPT_DIGESTS = @GCRYPT_DIGESTS@
+GCRYPT_HWF_MODULES = @GCRYPT_HWF_MODULES@
+GCRYPT_KDFS = @GCRYPT_KDFS@
+GCRYPT_PUBKEY_CIPHERS = @GCRYPT_PUBKEY_CIPHERS@
+GCRYPT_RANDOM = @GCRYPT_RANDOM@
+GPGRT_CONFIG = @GPGRT_CONFIG@
+GPG_ERROR_CFLAGS = @GPG_ERROR_CFLAGS@
+GPG_ERROR_CONFIG = @GPG_ERROR_CONFIG@
+GPG_ERROR_LIBS = @GPG_ERROR_LIBS@
+GPG_ERROR_MT_CFLAGS = @GPG_ERROR_MT_CFLAGS@
+GPG_ERROR_MT_LIBS = @GPG_ERROR_MT_LIBS@
+GREP = @GREP@
+INSERT_SYS_SELECT_H = @INSERT_SYS_SELECT_H@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDADD_FOR_TESTS_KLUDGE = @LDADD_FOR_TESTS_KLUDGE@
+LDFLAGS = @LDFLAGS@
+LIBGCRYPT_CIPHERS = @LIBGCRYPT_CIPHERS@
+LIBGCRYPT_CONFIG_API_VERSION = @LIBGCRYPT_CONFIG_API_VERSION@
+LIBGCRYPT_CONFIG_CFLAGS = @LIBGCRYPT_CONFIG_CFLAGS@
+LIBGCRYPT_CONFIG_HOST = @LIBGCRYPT_CONFIG_HOST@
+LIBGCRYPT_CONFIG_LIBS = @LIBGCRYPT_CONFIG_LIBS@
+LIBGCRYPT_DIGESTS = @LIBGCRYPT_DIGESTS@
+LIBGCRYPT_LT_AGE = @LIBGCRYPT_LT_AGE@
+LIBGCRYPT_LT_CURRENT = @LIBGCRYPT_LT_CURRENT@
+LIBGCRYPT_LT_REVISION = @LIBGCRYPT_LT_REVISION@
+LIBGCRYPT_PUBKEY_CIPHERS = @LIBGCRYPT_PUBKEY_CIPHERS@
+LIBGCRYPT_THREAD_MODULES = @LIBGCRYPT_THREAD_MODULES@
+LIBOBJS = @LIBOBJS@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+MPI_SFLAGS = @MPI_SFLAGS@
+NM = @NM@
+NMEDIT = @NMEDIT@
+NOEXECSTACK_FLAGS = @NOEXECSTACK_FLAGS@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+PTH_CFLAGS = @PTH_CFLAGS@
+PTH_CONFIG = @PTH_CONFIG@
+PTH_LIBS = @PTH_LIBS@
+RANLIB = @RANLIB@
+RC = @RC@
+RUN_LARGE_DATA_TESTS = @RUN_LARGE_DATA_TESTS@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+STRIP = @STRIP@
+SYSROOT = @SYSROOT@
+VERSION = @VERSION@
+VERSION_NUMBER = @VERSION_NUMBER@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+emacs_local_vars_begin = @emacs_local_vars_begin@
+emacs_local_vars_end = @emacs_local_vars_end@
+emacs_local_vars_read_only = @emacs_local_vars_read_only@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+runstatedir = @runstatedir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+
+# Need to include ../src in addition to top_srcdir because gcrypt.h is
+# a built header.
+AM_CPPFLAGS = -I../src -I$(top_srcdir)/src -I../mpi -I$(top_srcdir)/mpi
+AM_CFLAGS = $(GPG_ERROR_CFLAGS)
+AM_CCASFLAGS = $(NOEXECSTACK_FLAGS)
+EXTRA_DIST = gost-s-box.c
+CLEANFILES = gost-s-box
+DISTCLEANFILES = gost-sb.h
+noinst_LTLIBRARIES = libcipher.la
+GCRYPT_MODULES = @GCRYPT_CIPHERS@ @GCRYPT_PUBKEY_CIPHERS@ \
+ @GCRYPT_DIGESTS@ @GCRYPT_KDFS@
+
+libcipher_la_DEPENDENCIES = $(GCRYPT_MODULES)
+libcipher_la_LIBADD = $(GCRYPT_MODULES)
+libcipher_la_SOURCES = \
+ cipher.c cipher-internal.h \
+ cipher-cbc.c \
+ cipher-cfb.c \
+ cipher-ofb.c \
+ cipher-ctr.c \
+ cipher-aeswrap.c \
+ cipher-ccm.c \
+ cipher-cmac.c \
+ cipher-gcm.c cipher-gcm-intel-pclmul.c cipher-gcm-armv7-neon.S \
+ cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \
+ cipher-poly1305.c \
+ cipher-ocb.c \
+ cipher-xts.c \
+ cipher-eax.c \
+ cipher-selftest.c cipher-selftest.h \
+ pubkey.c pubkey-internal.h pubkey-util.c \
+ md.c \
+ mac.c mac-internal.h \
+ mac-hmac.c mac-cmac.c mac-gmac.c mac-poly1305.c \
+ poly1305.c poly1305-internal.h \
+ poly1305-s390x.S \
+ kdf.c kdf-internal.h \
+ bithelp.h \
+ bufhelp.h \
+ primegen.c \
+ hash-common.c hash-common.h \
+ dsa-common.c rsa-common.c \
+ sha1.h
+
+EXTRA_libcipher_la_SOURCES = \
+ asm-common-aarch64.h \
+ asm-common-amd64.h \
+ asm-common-s390x.h \
+ asm-inline-s390x.h \
+ asm-poly1305-aarch64.h \
+ asm-poly1305-amd64.h \
+ asm-poly1305-s390x.h \
+ arcfour.c arcfour-amd64.S \
+ blowfish.c blowfish-amd64.S blowfish-arm.S \
+ cast5.c cast5-amd64.S cast5-arm.S \
+ chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \
+ chacha20-armv7-neon.S chacha20-aarch64.S \
+ chacha20-ppc.c chacha20-s390x.S \
+ crc.c crc-intel-pclmul.c crc-armv8-ce.c \
+ crc-armv8-aarch64-ce.S \
+ crc-ppc.c \
+ des.c des-amd64.S \
+ dsa.c \
+ elgamal.c \
+ ecc.c ecc-curves.c ecc-misc.c ecc-common.h \
+ ecc-ecdh.c ecc-ecdsa.c ecc-eddsa.c ecc-gost.c ecc-sm2.c \
+ idea.c \
+ gost28147.c gost.h \
+ gostr3411-94.c \
+ md4.c \
+ md5.c \
+ rijndael.c rijndael-internal.h rijndael-tables.h \
+ rijndael-aesni.c rijndael-padlock.c \
+ rijndael-amd64.S rijndael-arm.S \
+ rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S \
+ rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S \
+ rijndael-armv8-aarch64-ce.S rijndael-aarch64.S \
+ rijndael-ppc.c rijndael-ppc9le.c \
+ rijndael-ppc-common.h rijndael-ppc-functions.h \
+ rijndael-s390x.c \
+ rmd160.c \
+ rsa.c \
+ salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
+ scrypt.c \
+ seed.c \
+ serpent.c serpent-sse2-amd64.S \
+ sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S \
+ serpent-avx2-amd64.S serpent-armv7-neon.S \
+ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
+ sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \
+ sha1-armv8-aarch64-ce.S sha1-intel-shaext.c \
+ sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S \
+ sha256-avx2-bmi2-amd64.S \
+ sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \
+ sha256-intel-shaext.c sha256-ppc.c \
+ sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S \
+ sha512-avx2-bmi2-amd64.S \
+ sha512-armv7-neon.S sha512-arm.S \
+ sha512-ppc.c sha512-ssse3-i386.c \
+ sm3.c \
+ keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
+ stribog.c \
+ tiger.c \
+ whirlpool.c whirlpool-sse2-amd64.S \
+ twofish.c twofish-amd64.S twofish-arm.S twofish-aarch64.S \
+ twofish-avx2-amd64.S \
+ rfc2268.c \
+ camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
+ camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S \
+ blake2.c \
+ blake2b-amd64-avx2.S blake2s-amd64-avx.S
+
+@ENABLE_O_FLAG_MUNGING_FALSE@o_flag_munging = cat
+@ENABLE_O_FLAG_MUNGING_TRUE@o_flag_munging = sed -e 's/-O\([2-9sg][2-9sg]*\)/-O1/' -e 's/-Ofast/-O1/g'
+@ENABLE_INSTRUMENTATION_MUNGING_FALSE@instrumentation_munging = cat
+
+# We need to disable instrumentation for these modules as they use cc as a
+# thin assembly front-end and do not tolerate in-between function calls
+# inserted by the compiler, as those functions may clobber the XMM registers.
+@ENABLE_INSTRUMENTATION_MUNGING_TRUE@instrumentation_munging = sed \
+@ENABLE_INSTRUMENTATION_MUNGING_TRUE@ -e 's/-fsanitize[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g' \
+@ENABLE_INSTRUMENTATION_MUNGING_TRUE@ -e 's/-fprofile[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g' \
+@ENABLE_INSTRUMENTATION_MUNGING_TRUE@ -e 's/-fcoverage[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g'
+
+@ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS_FALSE@ppc_vcrypto_cflags =
+@ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS_TRUE@ppc_vcrypto_cflags = -maltivec -mvsx -mcrypto
+all: all-am
+
+.SUFFIXES:
+.SUFFIXES: .S .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
+ @for dep in $?; do \
+ case '$(am__configure_deps)' in \
+ *$$dep*) \
+ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+ && { if test -f $@; then exit 0; else break; fi; }; \
+ exit 1;; \
+ esac; \
+ done; \
+ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu cipher/Makefile'; \
+ $(am__cd) $(top_srcdir) && \
+ $(AUTOMAKE) --gnu cipher/Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+ @case '$?' in \
+ *config.status*) \
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+ *) \
+ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
+ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
+ esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+ -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+ @list='$(noinst_LTLIBRARIES)'; \
+ locs=`for p in $$list; do echo $$p; done | \
+ sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
+ sort -u`; \
+ test -z "$$locs" || { \
+ echo rm -f $${locs}; \
+ rm -f $${locs}; \
+ }
+
+libcipher.la: $(libcipher_la_OBJECTS) $(libcipher_la_DEPENDENCIES) $(EXTRA_libcipher_la_DEPENDENCIES)
+ $(AM_V_CCLD)$(LINK) $(libcipher_la_OBJECTS) $(libcipher_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+ -rm -f *.$(OBJEXT)
+
+distclean-compile:
+ -rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/arcfour-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/arcfour.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blake2.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blake2b-amd64-avx2.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blake2s-amd64-avx.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blowfish-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blowfish-arm.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blowfish.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/camellia-aarch64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/camellia-aesni-avx-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/camellia-aesni-avx2-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/camellia-arm.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/camellia-glue.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/camellia.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cast5-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cast5-arm.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cast5.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chacha20-aarch64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chacha20-amd64-avx2.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chacha20-amd64-ssse3.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chacha20-armv7-neon.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chacha20-ppc.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chacha20-s390x.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chacha20.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-aeswrap.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-cbc.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-ccm.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-cfb.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-cmac.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-ctr.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-eax.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-gcm-armv7-neon.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-gcm-armv8-aarch32-ce.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-gcm-armv8-aarch64-ce.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-gcm-intel-pclmul.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-gcm.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-ocb.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-ofb.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-poly1305.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-selftest.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher-xts.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cipher.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/crc-armv8-aarch64-ce.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/crc-armv8-ce.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/crc-intel-pclmul.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/crc-ppc.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/crc.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/des-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/des.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dsa-common.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dsa.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ecc-curves.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ecc-ecdh.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ecc-ecdsa.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ecc-eddsa.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ecc-gost.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ecc-misc.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ecc-sm2.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ecc.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/elgamal.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gost28147.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gostr3411-94.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hash-common.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/idea.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kdf.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/keccak-armv7-neon.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/keccak.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mac-cmac.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mac-gmac.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mac-hmac.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mac-poly1305.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mac.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/md.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/md4.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/md5.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/poly1305-s390x.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/poly1305.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/primegen.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pubkey-util.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pubkey.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rfc2268.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael-aarch64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael-aesni.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael-arm.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael-armv8-aarch32-ce.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael-armv8-aarch64-ce.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael-armv8-ce.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael-padlock.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael-ppc.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael-ppc9le.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael-s390x.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael-ssse3-amd64-asm.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael-ssse3-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rijndael.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rmd160.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rsa-common.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rsa.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/salsa20-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/salsa20-armv7-neon.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/salsa20.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/scrypt.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/seed.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/serpent-armv7-neon.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/serpent-avx2-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/serpent-sse2-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/serpent.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha1-armv7-neon.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha1-armv8-aarch32-ce.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha1-armv8-aarch64-ce.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha1-avx-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha1-avx-bmi2-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha1-avx2-bmi2-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha1-intel-shaext.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha1-ssse3-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha1.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha256-armv8-aarch32-ce.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha256-armv8-aarch64-ce.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha256-avx-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha256-avx2-bmi2-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha256-intel-shaext.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha256-ppc.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha256-ssse3-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha256.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha512-arm.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha512-armv7-neon.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha512-avx-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha512-avx2-bmi2-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha512-ppc.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha512-ssse3-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha512-ssse3-i386.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sha512.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sm3.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sm4-aesni-avx-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sm4-aesni-avx2-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sm4.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/stribog.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tiger.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/twofish-aarch64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/twofish-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/twofish-arm.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/twofish-avx2-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/twofish.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/whirlpool-sse2-amd64.Plo@am__quote@ # am--include-marker
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/whirlpool.Plo@am__quote@ # am--include-marker
+
+$(am__depfiles_remade):
+ @$(MKDIR_P) $(@D)
+ @echo '# dummy' >$@-t && $(am__mv) $@-t $@
+
+am--depfiles: $(am__depfiles_remade)
+
+.S.o:
+@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(CPPASCOMPILE) -c -o $@ $<
+
+.S.obj:
+@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(CPPASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+
+.S.lo:
+@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(LTCPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(LTCPPASCOMPILE) -c -o $@ $<
+
+.c.o:
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
+
+.c.obj:
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+ -rm -f *.lo
+
+clean-libtool:
+ -rm -rf .libs _libs
+
+ID: $(am__tagged_files)
+ $(am__define_uniq_tagged_files); mkid -fID $$unique
+tags: tags-am
+TAGS: tags
+
+tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+ set x; \
+ here=`pwd`; \
+ $(am__define_uniq_tagged_files); \
+ shift; \
+ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+ test -n "$$unique" || unique=$$empty_fix; \
+ if test $$# -gt 0; then \
+ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+ "$$@" $$unique; \
+ else \
+ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+ $$unique; \
+ fi; \
+ fi
+ctags: ctags-am
+
+CTAGS: ctags
+ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+ $(am__define_uniq_tagged_files); \
+ test -z "$(CTAGS_ARGS)$$unique" \
+ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+ $$unique
+
+GTAGS:
+ here=`$(am__cd) $(top_builddir) && pwd` \
+ && $(am__cd) $(top_srcdir) \
+ && gtags -i $(GTAGS_ARGS) "$$here"
+cscopelist: cscopelist-am
+
+cscopelist-am: $(am__tagged_files)
+ list='$(am__tagged_files)'; \
+ case "$(srcdir)" in \
+ [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
+ *) sdir=$(subdir)/$(srcdir) ;; \
+ esac; \
+ for i in $$list; do \
+ if test -f "$$i"; then \
+ echo "$(subdir)/$$i"; \
+ else \
+ echo "$$sdir/$$i"; \
+ fi; \
+ done >> $(top_builddir)/cscope.files
+
+distclean-tags:
+ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(BUILT_SOURCES)
+ $(MAKE) $(AM_MAKEFLAGS) distdir-am
+
+distdir-am: $(DISTFILES)
+ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+ list='$(DISTFILES)'; \
+ dist_files=`for file in $$list; do echo $$file; done | \
+ sed -e "s|^$$srcdirstrip/||;t" \
+ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+ case $$dist_files in \
+ */*) $(MKDIR_P) `echo "$$dist_files" | \
+ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+ sort -u` ;; \
+ esac; \
+ for file in $$dist_files; do \
+ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+ if test -d $$d/$$file; then \
+ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+ if test -d "$(distdir)/$$file"; then \
+ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+ fi; \
+ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+ fi; \
+ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+ else \
+ test -f "$(distdir)/$$file" \
+ || cp -p $$d/$$file "$(distdir)/$$file" \
+ || exit 1; \
+ fi; \
+ done
+check-am: all-am
+check: check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+ if test -z '$(STRIP)'; then \
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+ install; \
+ else \
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+ fi
+mostlyclean-generic:
+
+clean-generic:
+ -test -z "$(CLEANFILES)" || rm -f $(CLEANFILES)
+
+distclean-generic:
+ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+ -test -z "$(DISTCLEANFILES)" || rm -f $(DISTCLEANFILES)
+
+maintainer-clean-generic:
+ @echo "This command is intended for maintainers to use"
+ @echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+ mostlyclean-am
+
+distclean: distclean-am
+ -rm -f ./$(DEPDIR)/arcfour-amd64.Plo
+ -rm -f ./$(DEPDIR)/arcfour.Plo
+ -rm -f ./$(DEPDIR)/blake2.Plo
+ -rm -f ./$(DEPDIR)/blake2b-amd64-avx2.Plo
+ -rm -f ./$(DEPDIR)/blake2s-amd64-avx.Plo
+ -rm -f ./$(DEPDIR)/blowfish-amd64.Plo
+ -rm -f ./$(DEPDIR)/blowfish-arm.Plo
+ -rm -f ./$(DEPDIR)/blowfish.Plo
+ -rm -f ./$(DEPDIR)/camellia-aarch64.Plo
+ -rm -f ./$(DEPDIR)/camellia-aesni-avx-amd64.Plo
+ -rm -f ./$(DEPDIR)/camellia-aesni-avx2-amd64.Plo
+ -rm -f ./$(DEPDIR)/camellia-arm.Plo
+ -rm -f ./$(DEPDIR)/camellia-glue.Plo
+ -rm -f ./$(DEPDIR)/camellia.Plo
+ -rm -f ./$(DEPDIR)/cast5-amd64.Plo
+ -rm -f ./$(DEPDIR)/cast5-arm.Plo
+ -rm -f ./$(DEPDIR)/cast5.Plo
+ -rm -f ./$(DEPDIR)/chacha20-aarch64.Plo
+ -rm -f ./$(DEPDIR)/chacha20-amd64-avx2.Plo
+ -rm -f ./$(DEPDIR)/chacha20-amd64-ssse3.Plo
+ -rm -f ./$(DEPDIR)/chacha20-armv7-neon.Plo
+ -rm -f ./$(DEPDIR)/chacha20-ppc.Plo
+ -rm -f ./$(DEPDIR)/chacha20-s390x.Plo
+ -rm -f ./$(DEPDIR)/chacha20.Plo
+ -rm -f ./$(DEPDIR)/cipher-aeswrap.Plo
+ -rm -f ./$(DEPDIR)/cipher-cbc.Plo
+ -rm -f ./$(DEPDIR)/cipher-ccm.Plo
+ -rm -f ./$(DEPDIR)/cipher-cfb.Plo
+ -rm -f ./$(DEPDIR)/cipher-cmac.Plo
+ -rm -f ./$(DEPDIR)/cipher-ctr.Plo
+ -rm -f ./$(DEPDIR)/cipher-eax.Plo
+ -rm -f ./$(DEPDIR)/cipher-gcm-armv7-neon.Plo
+ -rm -f ./$(DEPDIR)/cipher-gcm-armv8-aarch32-ce.Plo
+ -rm -f ./$(DEPDIR)/cipher-gcm-armv8-aarch64-ce.Plo
+ -rm -f ./$(DEPDIR)/cipher-gcm-intel-pclmul.Plo
+ -rm -f ./$(DEPDIR)/cipher-gcm.Plo
+ -rm -f ./$(DEPDIR)/cipher-ocb.Plo
+ -rm -f ./$(DEPDIR)/cipher-ofb.Plo
+ -rm -f ./$(DEPDIR)/cipher-poly1305.Plo
+ -rm -f ./$(DEPDIR)/cipher-selftest.Plo
+ -rm -f ./$(DEPDIR)/cipher-xts.Plo
+ -rm -f ./$(DEPDIR)/cipher.Plo
+ -rm -f ./$(DEPDIR)/crc-armv8-aarch64-ce.Plo
+ -rm -f ./$(DEPDIR)/crc-armv8-ce.Plo
+ -rm -f ./$(DEPDIR)/crc-intel-pclmul.Plo
+ -rm -f ./$(DEPDIR)/crc-ppc.Plo
+ -rm -f ./$(DEPDIR)/crc.Plo
+ -rm -f ./$(DEPDIR)/des-amd64.Plo
+ -rm -f ./$(DEPDIR)/des.Plo
+ -rm -f ./$(DEPDIR)/dsa-common.Plo
+ -rm -f ./$(DEPDIR)/dsa.Plo
+ -rm -f ./$(DEPDIR)/ecc-curves.Plo
+ -rm -f ./$(DEPDIR)/ecc-ecdh.Plo
+ -rm -f ./$(DEPDIR)/ecc-ecdsa.Plo
+ -rm -f ./$(DEPDIR)/ecc-eddsa.Plo
+ -rm -f ./$(DEPDIR)/ecc-gost.Plo
+ -rm -f ./$(DEPDIR)/ecc-misc.Plo
+ -rm -f ./$(DEPDIR)/ecc-sm2.Plo
+ -rm -f ./$(DEPDIR)/ecc.Plo
+ -rm -f ./$(DEPDIR)/elgamal.Plo
+ -rm -f ./$(DEPDIR)/gost28147.Plo
+ -rm -f ./$(DEPDIR)/gostr3411-94.Plo
+ -rm -f ./$(DEPDIR)/hash-common.Plo
+ -rm -f ./$(DEPDIR)/idea.Plo
+ -rm -f ./$(DEPDIR)/kdf.Plo
+ -rm -f ./$(DEPDIR)/keccak-armv7-neon.Plo
+ -rm -f ./$(DEPDIR)/keccak.Plo
+ -rm -f ./$(DEPDIR)/mac-cmac.Plo
+ -rm -f ./$(DEPDIR)/mac-gmac.Plo
+ -rm -f ./$(DEPDIR)/mac-hmac.Plo
+ -rm -f ./$(DEPDIR)/mac-poly1305.Plo
+ -rm -f ./$(DEPDIR)/mac.Plo
+ -rm -f ./$(DEPDIR)/md.Plo
+ -rm -f ./$(DEPDIR)/md4.Plo
+ -rm -f ./$(DEPDIR)/md5.Plo
+ -rm -f ./$(DEPDIR)/poly1305-s390x.Plo
+ -rm -f ./$(DEPDIR)/poly1305.Plo
+ -rm -f ./$(DEPDIR)/primegen.Plo
+ -rm -f ./$(DEPDIR)/pubkey-util.Plo
+ -rm -f ./$(DEPDIR)/pubkey.Plo
+ -rm -f ./$(DEPDIR)/rfc2268.Plo
+ -rm -f ./$(DEPDIR)/rijndael-aarch64.Plo
+ -rm -f ./$(DEPDIR)/rijndael-aesni.Plo
+ -rm -f ./$(DEPDIR)/rijndael-amd64.Plo
+ -rm -f ./$(DEPDIR)/rijndael-arm.Plo
+ -rm -f ./$(DEPDIR)/rijndael-armv8-aarch32-ce.Plo
+ -rm -f ./$(DEPDIR)/rijndael-armv8-aarch64-ce.Plo
+ -rm -f ./$(DEPDIR)/rijndael-armv8-ce.Plo
+ -rm -f ./$(DEPDIR)/rijndael-padlock.Plo
+ -rm -f ./$(DEPDIR)/rijndael-ppc.Plo
+ -rm -f ./$(DEPDIR)/rijndael-ppc9le.Plo
+ -rm -f ./$(DEPDIR)/rijndael-s390x.Plo
+ -rm -f ./$(DEPDIR)/rijndael-ssse3-amd64-asm.Plo
+ -rm -f ./$(DEPDIR)/rijndael-ssse3-amd64.Plo
+ -rm -f ./$(DEPDIR)/rijndael.Plo
+ -rm -f ./$(DEPDIR)/rmd160.Plo
+ -rm -f ./$(DEPDIR)/rsa-common.Plo
+ -rm -f ./$(DEPDIR)/rsa.Plo
+ -rm -f ./$(DEPDIR)/salsa20-amd64.Plo
+ -rm -f ./$(DEPDIR)/salsa20-armv7-neon.Plo
+ -rm -f ./$(DEPDIR)/salsa20.Plo
+ -rm -f ./$(DEPDIR)/scrypt.Plo
+ -rm -f ./$(DEPDIR)/seed.Plo
+ -rm -f ./$(DEPDIR)/serpent-armv7-neon.Plo
+ -rm -f ./$(DEPDIR)/serpent-avx2-amd64.Plo
+ -rm -f ./$(DEPDIR)/serpent-sse2-amd64.Plo
+ -rm -f ./$(DEPDIR)/serpent.Plo
+ -rm -f ./$(DEPDIR)/sha1-armv7-neon.Plo
+ -rm -f ./$(DEPDIR)/sha1-armv8-aarch32-ce.Plo
+ -rm -f ./$(DEPDIR)/sha1-armv8-aarch64-ce.Plo
+ -rm -f ./$(DEPDIR)/sha1-avx-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha1-avx-bmi2-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha1-avx2-bmi2-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha1-intel-shaext.Plo
+ -rm -f ./$(DEPDIR)/sha1-ssse3-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha1.Plo
+ -rm -f ./$(DEPDIR)/sha256-armv8-aarch32-ce.Plo
+ -rm -f ./$(DEPDIR)/sha256-armv8-aarch64-ce.Plo
+ -rm -f ./$(DEPDIR)/sha256-avx-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha256-avx2-bmi2-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha256-intel-shaext.Plo
+ -rm -f ./$(DEPDIR)/sha256-ppc.Plo
+ -rm -f ./$(DEPDIR)/sha256-ssse3-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha256.Plo
+ -rm -f ./$(DEPDIR)/sha512-arm.Plo
+ -rm -f ./$(DEPDIR)/sha512-armv7-neon.Plo
+ -rm -f ./$(DEPDIR)/sha512-avx-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha512-avx2-bmi2-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha512-ppc.Plo
+ -rm -f ./$(DEPDIR)/sha512-ssse3-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha512-ssse3-i386.Plo
+ -rm -f ./$(DEPDIR)/sha512.Plo
+ -rm -f ./$(DEPDIR)/sm3.Plo
+ -rm -f ./$(DEPDIR)/sm4-aesni-avx-amd64.Plo
+ -rm -f ./$(DEPDIR)/sm4-aesni-avx2-amd64.Plo
+ -rm -f ./$(DEPDIR)/sm4.Plo
+ -rm -f ./$(DEPDIR)/stribog.Plo
+ -rm -f ./$(DEPDIR)/tiger.Plo
+ -rm -f ./$(DEPDIR)/twofish-aarch64.Plo
+ -rm -f ./$(DEPDIR)/twofish-amd64.Plo
+ -rm -f ./$(DEPDIR)/twofish-arm.Plo
+ -rm -f ./$(DEPDIR)/twofish-avx2-amd64.Plo
+ -rm -f ./$(DEPDIR)/twofish.Plo
+ -rm -f ./$(DEPDIR)/whirlpool-sse2-amd64.Plo
+ -rm -f ./$(DEPDIR)/whirlpool.Plo
+ -rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+ distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+ -rm -f ./$(DEPDIR)/arcfour-amd64.Plo
+ -rm -f ./$(DEPDIR)/arcfour.Plo
+ -rm -f ./$(DEPDIR)/blake2.Plo
+ -rm -f ./$(DEPDIR)/blake2b-amd64-avx2.Plo
+ -rm -f ./$(DEPDIR)/blake2s-amd64-avx.Plo
+ -rm -f ./$(DEPDIR)/blowfish-amd64.Plo
+ -rm -f ./$(DEPDIR)/blowfish-arm.Plo
+ -rm -f ./$(DEPDIR)/blowfish.Plo
+ -rm -f ./$(DEPDIR)/camellia-aarch64.Plo
+ -rm -f ./$(DEPDIR)/camellia-aesni-avx-amd64.Plo
+ -rm -f ./$(DEPDIR)/camellia-aesni-avx2-amd64.Plo
+ -rm -f ./$(DEPDIR)/camellia-arm.Plo
+ -rm -f ./$(DEPDIR)/camellia-glue.Plo
+ -rm -f ./$(DEPDIR)/camellia.Plo
+ -rm -f ./$(DEPDIR)/cast5-amd64.Plo
+ -rm -f ./$(DEPDIR)/cast5-arm.Plo
+ -rm -f ./$(DEPDIR)/cast5.Plo
+ -rm -f ./$(DEPDIR)/chacha20-aarch64.Plo
+ -rm -f ./$(DEPDIR)/chacha20-amd64-avx2.Plo
+ -rm -f ./$(DEPDIR)/chacha20-amd64-ssse3.Plo
+ -rm -f ./$(DEPDIR)/chacha20-armv7-neon.Plo
+ -rm -f ./$(DEPDIR)/chacha20-ppc.Plo
+ -rm -f ./$(DEPDIR)/chacha20-s390x.Plo
+ -rm -f ./$(DEPDIR)/chacha20.Plo
+ -rm -f ./$(DEPDIR)/cipher-aeswrap.Plo
+ -rm -f ./$(DEPDIR)/cipher-cbc.Plo
+ -rm -f ./$(DEPDIR)/cipher-ccm.Plo
+ -rm -f ./$(DEPDIR)/cipher-cfb.Plo
+ -rm -f ./$(DEPDIR)/cipher-cmac.Plo
+ -rm -f ./$(DEPDIR)/cipher-ctr.Plo
+ -rm -f ./$(DEPDIR)/cipher-eax.Plo
+ -rm -f ./$(DEPDIR)/cipher-gcm-armv7-neon.Plo
+ -rm -f ./$(DEPDIR)/cipher-gcm-armv8-aarch32-ce.Plo
+ -rm -f ./$(DEPDIR)/cipher-gcm-armv8-aarch64-ce.Plo
+ -rm -f ./$(DEPDIR)/cipher-gcm-intel-pclmul.Plo
+ -rm -f ./$(DEPDIR)/cipher-gcm.Plo
+ -rm -f ./$(DEPDIR)/cipher-ocb.Plo
+ -rm -f ./$(DEPDIR)/cipher-ofb.Plo
+ -rm -f ./$(DEPDIR)/cipher-poly1305.Plo
+ -rm -f ./$(DEPDIR)/cipher-selftest.Plo
+ -rm -f ./$(DEPDIR)/cipher-xts.Plo
+ -rm -f ./$(DEPDIR)/cipher.Plo
+ -rm -f ./$(DEPDIR)/crc-armv8-aarch64-ce.Plo
+ -rm -f ./$(DEPDIR)/crc-armv8-ce.Plo
+ -rm -f ./$(DEPDIR)/crc-intel-pclmul.Plo
+ -rm -f ./$(DEPDIR)/crc-ppc.Plo
+ -rm -f ./$(DEPDIR)/crc.Plo
+ -rm -f ./$(DEPDIR)/des-amd64.Plo
+ -rm -f ./$(DEPDIR)/des.Plo
+ -rm -f ./$(DEPDIR)/dsa-common.Plo
+ -rm -f ./$(DEPDIR)/dsa.Plo
+ -rm -f ./$(DEPDIR)/ecc-curves.Plo
+ -rm -f ./$(DEPDIR)/ecc-ecdh.Plo
+ -rm -f ./$(DEPDIR)/ecc-ecdsa.Plo
+ -rm -f ./$(DEPDIR)/ecc-eddsa.Plo
+ -rm -f ./$(DEPDIR)/ecc-gost.Plo
+ -rm -f ./$(DEPDIR)/ecc-misc.Plo
+ -rm -f ./$(DEPDIR)/ecc-sm2.Plo
+ -rm -f ./$(DEPDIR)/ecc.Plo
+ -rm -f ./$(DEPDIR)/elgamal.Plo
+ -rm -f ./$(DEPDIR)/gost28147.Plo
+ -rm -f ./$(DEPDIR)/gostr3411-94.Plo
+ -rm -f ./$(DEPDIR)/hash-common.Plo
+ -rm -f ./$(DEPDIR)/idea.Plo
+ -rm -f ./$(DEPDIR)/kdf.Plo
+ -rm -f ./$(DEPDIR)/keccak-armv7-neon.Plo
+ -rm -f ./$(DEPDIR)/keccak.Plo
+ -rm -f ./$(DEPDIR)/mac-cmac.Plo
+ -rm -f ./$(DEPDIR)/mac-gmac.Plo
+ -rm -f ./$(DEPDIR)/mac-hmac.Plo
+ -rm -f ./$(DEPDIR)/mac-poly1305.Plo
+ -rm -f ./$(DEPDIR)/mac.Plo
+ -rm -f ./$(DEPDIR)/md.Plo
+ -rm -f ./$(DEPDIR)/md4.Plo
+ -rm -f ./$(DEPDIR)/md5.Plo
+ -rm -f ./$(DEPDIR)/poly1305-s390x.Plo
+ -rm -f ./$(DEPDIR)/poly1305.Plo
+ -rm -f ./$(DEPDIR)/primegen.Plo
+ -rm -f ./$(DEPDIR)/pubkey-util.Plo
+ -rm -f ./$(DEPDIR)/pubkey.Plo
+ -rm -f ./$(DEPDIR)/rfc2268.Plo
+ -rm -f ./$(DEPDIR)/rijndael-aarch64.Plo
+ -rm -f ./$(DEPDIR)/rijndael-aesni.Plo
+ -rm -f ./$(DEPDIR)/rijndael-amd64.Plo
+ -rm -f ./$(DEPDIR)/rijndael-arm.Plo
+ -rm -f ./$(DEPDIR)/rijndael-armv8-aarch32-ce.Plo
+ -rm -f ./$(DEPDIR)/rijndael-armv8-aarch64-ce.Plo
+ -rm -f ./$(DEPDIR)/rijndael-armv8-ce.Plo
+ -rm -f ./$(DEPDIR)/rijndael-padlock.Plo
+ -rm -f ./$(DEPDIR)/rijndael-ppc.Plo
+ -rm -f ./$(DEPDIR)/rijndael-ppc9le.Plo
+ -rm -f ./$(DEPDIR)/rijndael-s390x.Plo
+ -rm -f ./$(DEPDIR)/rijndael-ssse3-amd64-asm.Plo
+ -rm -f ./$(DEPDIR)/rijndael-ssse3-amd64.Plo
+ -rm -f ./$(DEPDIR)/rijndael.Plo
+ -rm -f ./$(DEPDIR)/rmd160.Plo
+ -rm -f ./$(DEPDIR)/rsa-common.Plo
+ -rm -f ./$(DEPDIR)/rsa.Plo
+ -rm -f ./$(DEPDIR)/salsa20-amd64.Plo
+ -rm -f ./$(DEPDIR)/salsa20-armv7-neon.Plo
+ -rm -f ./$(DEPDIR)/salsa20.Plo
+ -rm -f ./$(DEPDIR)/scrypt.Plo
+ -rm -f ./$(DEPDIR)/seed.Plo
+ -rm -f ./$(DEPDIR)/serpent-armv7-neon.Plo
+ -rm -f ./$(DEPDIR)/serpent-avx2-amd64.Plo
+ -rm -f ./$(DEPDIR)/serpent-sse2-amd64.Plo
+ -rm -f ./$(DEPDIR)/serpent.Plo
+ -rm -f ./$(DEPDIR)/sha1-armv7-neon.Plo
+ -rm -f ./$(DEPDIR)/sha1-armv8-aarch32-ce.Plo
+ -rm -f ./$(DEPDIR)/sha1-armv8-aarch64-ce.Plo
+ -rm -f ./$(DEPDIR)/sha1-avx-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha1-avx-bmi2-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha1-avx2-bmi2-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha1-intel-shaext.Plo
+ -rm -f ./$(DEPDIR)/sha1-ssse3-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha1.Plo
+ -rm -f ./$(DEPDIR)/sha256-armv8-aarch32-ce.Plo
+ -rm -f ./$(DEPDIR)/sha256-armv8-aarch64-ce.Plo
+ -rm -f ./$(DEPDIR)/sha256-avx-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha256-avx2-bmi2-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha256-intel-shaext.Plo
+ -rm -f ./$(DEPDIR)/sha256-ppc.Plo
+ -rm -f ./$(DEPDIR)/sha256-ssse3-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha256.Plo
+ -rm -f ./$(DEPDIR)/sha512-arm.Plo
+ -rm -f ./$(DEPDIR)/sha512-armv7-neon.Plo
+ -rm -f ./$(DEPDIR)/sha512-avx-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha512-avx2-bmi2-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha512-ppc.Plo
+ -rm -f ./$(DEPDIR)/sha512-ssse3-amd64.Plo
+ -rm -f ./$(DEPDIR)/sha512-ssse3-i386.Plo
+ -rm -f ./$(DEPDIR)/sha512.Plo
+ -rm -f ./$(DEPDIR)/sm3.Plo
+ -rm -f ./$(DEPDIR)/sm4-aesni-avx-amd64.Plo
+ -rm -f ./$(DEPDIR)/sm4-aesni-avx2-amd64.Plo
+ -rm -f ./$(DEPDIR)/sm4.Plo
+ -rm -f ./$(DEPDIR)/stribog.Plo
+ -rm -f ./$(DEPDIR)/tiger.Plo
+ -rm -f ./$(DEPDIR)/twofish-aarch64.Plo
+ -rm -f ./$(DEPDIR)/twofish-amd64.Plo
+ -rm -f ./$(DEPDIR)/twofish-arm.Plo
+ -rm -f ./$(DEPDIR)/twofish-avx2-amd64.Plo
+ -rm -f ./$(DEPDIR)/twofish.Plo
+ -rm -f ./$(DEPDIR)/whirlpool-sse2-amd64.Plo
+ -rm -f ./$(DEPDIR)/whirlpool.Plo
+ -rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+ mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: install-am install-strip
+
+.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \
+ clean-generic clean-libtool clean-noinstLTLIBRARIES \
+ cscopelist-am ctags ctags-am distclean distclean-compile \
+ distclean-generic distclean-libtool distclean-tags distdir dvi \
+ dvi-am html html-am info info-am install install-am \
+ install-data install-data-am install-dvi install-dvi-am \
+ install-exec install-exec-am install-html install-html-am \
+ install-info install-info-am install-man install-pdf \
+ install-pdf-am install-ps install-ps-am install-strip \
+ installcheck installcheck-am installdirs maintainer-clean \
+ maintainer-clean-generic mostlyclean mostlyclean-compile \
+ mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+ tags tags-am uninstall uninstall-am
+
+.PRECIOUS: Makefile
+
+
+gost28147.lo: gost-sb.h
+gost-sb.h: gost-s-box
+ ./gost-s-box $@
+
+gost-s-box: gost-s-box.c
+ $(CC_FOR_BUILD) $(CFLAGS_FOR_BUILD) $(LDFLAGS_FOR_BUILD) \
+ $(CPPFLAGS_FOR_BUILD) -o $@ $(srcdir)/gost-s-box.c
+
+# We need to lower the optimization for this module.
+tiger.o: $(srcdir)/tiger.c Makefile
+ `echo $(COMPILE) -c $< | $(o_flag_munging) `
+
+tiger.lo: $(srcdir)/tiger.c Makefile
+ `echo $(LTCOMPILE) -c $< | $(o_flag_munging) `
+
+rijndael-aesni.o: $(srcdir)/rijndael-aesni.c Makefile
+ `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+rijndael-aesni.lo: $(srcdir)/rijndael-aesni.c Makefile
+ `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+rijndael-ssse3-amd64.o: $(srcdir)/rijndael-ssse3-amd64.c Makefile
+ `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+rijndael-ssse3-amd64.lo: $(srcdir)/rijndael-ssse3-amd64.c Makefile
+ `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+cipher-gcm-intel-pclmul.o: $(srcdir)/cipher-gcm-intel-pclmul.c Makefile
+ `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+cipher-gcm-intel-pclmul.lo: $(srcdir)/cipher-gcm-intel-pclmul.c Makefile
+ `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+sha1-intel-shaext.o: $(srcdir)/sha1-intel-shaext.c Makefile
+ `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+sha1-intel-shaext.lo: $(srcdir)/sha1-intel-shaext.c Makefile
+ `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+sha256-intel-shaext.o: $(srcdir)/sha256-intel-shaext.c Makefile
+ `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+sha256-intel-shaext.lo: $(srcdir)/sha256-intel-shaext.c Makefile
+ `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+sha256-ssse3-i386.o: $(srcdir)/sha256-ssse3-i386.c Makefile
+ `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+sha256-ssse3-i386.lo: $(srcdir)/sha256-ssse3-i386.c Makefile
+ `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+crc-intel-pclmul.o: $(srcdir)/crc-intel-pclmul.c Makefile
+ `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+crc-intel-pclmul.lo: $(srcdir)/crc-intel-pclmul.c Makefile
+ `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+rijndael-ppc.o: $(srcdir)/rijndael-ppc.c Makefile
+ `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+rijndael-ppc.lo: $(srcdir)/rijndael-ppc.c Makefile
+ `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+rijndael-ppc9le.o: $(srcdir)/rijndael-ppc9le.c Makefile
+ `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+rijndael-ppc9le.lo: $(srcdir)/rijndael-ppc9le.c Makefile
+ `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+sha256-ppc.o: $(srcdir)/sha256-ppc.c Makefile
+ `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+sha256-ppc.lo: $(srcdir)/sha256-ppc.c Makefile
+ `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+sha512-ppc.o: $(srcdir)/sha512-ppc.c Makefile
+ `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+sha512-ppc.lo: $(srcdir)/sha512-ppc.c Makefile
+ `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+chacha20-ppc.o: $(srcdir)/chacha20-ppc.c Makefile
+ `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+chacha20-ppc.lo: $(srcdir)/chacha20-ppc.c Makefile
+ `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+crc-ppc.o: $(srcdir)/crc-ppc.c Makefile
+ `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+crc-ppc.lo: $(srcdir)/crc-ppc.c Makefile
+ `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/comm/third_party/libgcrypt/cipher/arcfour-amd64.S b/comm/third_party/libgcrypt/cipher/arcfour-amd64.S
new file mode 100644
index 0000000000..221dfeff77
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/arcfour-amd64.S
@@ -0,0 +1,108 @@
+/*
+** RC4 implementation optimized for AMD64.
+**
+** Author: Marc Bevand <bevand_m (at) epita.fr>
+** Licence: I hereby disclaim the copyright on this code and place it
+** in the public domain.
+**
+** The throughput achieved by this code is about 320 MBytes/sec, on
+** a 1.8 GHz AMD Opteron (rev C0) processor.
+**
+** 2013/12/20 <jussi.kivilinna@iki.fi>:
+** - Integrated to libgcrypt
+** - 4.18 cycles/byte on Intel i5-4570
+*/
+
+#ifdef __x86_64__
+#include <config.h>
+#if defined(USE_ARCFOUR) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+.text
+.align 16
+.globl _gcry_arcfour_amd64
+ELF(.type _gcry_arcfour_amd64,@function)
+_gcry_arcfour_amd64:
+ CFI_STARTPROC()
+ ENTER_SYSV_FUNC_PARAMS_0_4
+ push %rbp
+ CFI_PUSH(%rbp)
+ push %rbx
+ CFI_PUSH(%rbx)
+ mov %rdi, %rbp # key = ARG(key)
+ mov %rsi, %rbx # rbx = ARG(len)
+ mov %rdx, %rsi # in = ARG(in)
+ mov %rcx, %rdi # out = ARG(out)
+ mov (4*256)(%rbp), %ecx # x = key->x
+ mov (4*256+4)(%rbp),%edx # y = key->y
+ inc %rcx # x++
+ and $255, %rcx # x &= 0xff
+ lea -8(%rbx,%rsi), %rbx # rbx = in+len-8
+ mov %rbx, %r9 # tmp = in+len-8
+ mov (%rbp,%rcx,4), %eax # tx = d[x]
+ cmp %rsi, %rbx # cmp in with in+len-8
+ jl .Lend # jump if (in+len-8 < in)
+
+.Lstart:
+ add $8, %rsi # increment in
+ add $8, %rdi # increment out
+
+ # generate the next 8 bytes of the rc4 stream into %r8
+ mov $8, %r11 # byte counter
+1: add %al, %dl # y += tx
+ mov (%rbp,%rdx,4), %ebx # ty = d[y]
+ mov %ebx, (%rbp,%rcx,4) # d[x] = ty
+ add %al, %bl # val = ty + tx
+ mov %eax, (%rbp,%rdx,4) # d[y] = tx
+ inc %cl # x++ (NEXT ROUND)
+ mov (%rbp,%rcx,4), %eax # tx = d[x] (NEXT ROUND)
+ shl $8, %r8
+ movb (%rbp,%rbx,4), %r8b # val = d[val]
+ dec %r11b
+ jnz 1b
+
+ # xor 8 bytes
+ bswap %r8
+ xor -8(%rsi), %r8
+ cmp %r9, %rsi # cmp in+len-8 with in
+ mov %r8, -8(%rdi)
+ jle .Lstart # jump if (in <= in+len-8)
+
+.Lend:
+ add $8, %r9 # tmp = in+len
+
+ # handle the last bytes, one by one
+1: cmp %rsi, %r9 # cmp in with in+len
+ jle .Lfinished # jump if (in+len <= in)
+ add %al, %dl # y += tx
+ mov (%rbp,%rdx,4), %ebx # ty = d[y]
+ mov %ebx, (%rbp,%rcx,4) # d[x] = ty
+ add %al, %bl # val = ty + tx
+ mov %eax, (%rbp,%rdx,4) # d[y] = tx
+ inc %cl # x++ (NEXT ROUND)
+ mov (%rbp,%rcx,4), %eax # tx = d[x] (NEXT ROUND)
+ movb (%rbp,%rbx,4), %r8b # val = d[val]
+ xor (%rsi), %r8b # xor 1 byte
+ movb %r8b, (%rdi)
+ inc %rsi # in++
+ inc %rdi # out++
+ jmp 1b
+
+.Lfinished:
+ dec %rcx # x--
+ movb %cl, (4*256)(%rbp) # key->x = x
+ movb %dl, (4*256+4)(%rbp) # key->y = y
+ pop %rbx
+ CFI_POP(%rbx)
+ pop %rbp
+ CFI_POP(%rbp)
+ EXIT_SYSV_FUNC
+ ret
+ CFI_ENDPROC()
+.L__gcry_arcfour_amd64_end:
+ELF(.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64)
+
+#endif
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/arcfour.c b/comm/third_party/libgcrypt/cipher/arcfour.c
new file mode 100644
index 0000000000..353de00bd7
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/arcfour.c
@@ -0,0 +1,216 @@
+/* arcfour.c - The arcfour stream cipher
+ * Copyright (C) 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ * For a description of the algorithm, see:
+ * Bruce Schneier: Applied Cryptography. John Wiley & Sons, 1996.
+ * ISBN 0-471-11709-9. Pages 397 ff.
+ */
+
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "types.h"
+#include "g10lib.h"
+#include "cipher.h"
+#include "cipher-internal.h"
+
+/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
+#undef USE_AMD64_ASM
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AMD64_ASM 1
+#endif
+
+static const char *selftest(void);
+
+#ifdef USE_AMD64_ASM
+
+typedef struct {
+ u32 sbox[256];
+ u32 idx_i, idx_j;
+} ARCFOUR_context;
+
+void _gcry_arcfour_amd64(void *key, size_t len, const byte *indata,
+ byte *outdata);
+
+static void
+encrypt_stream (void *context,
+ byte *outbuf, const byte *inbuf, size_t length)
+{
+ _gcry_arcfour_amd64 (context, length, inbuf, outbuf );
+}
+
+#else /*!USE_AMD64_ASM*/
+
+typedef struct {
+ byte sbox[256];
+ int idx_i, idx_j;
+} ARCFOUR_context;
+
+static void
+do_encrypt_stream( ARCFOUR_context *ctx,
+ byte *outbuf, const byte *inbuf, size_t length )
+{
+#ifndef __i386__
+ register unsigned int i = ctx->idx_i;
+ register byte j = ctx->idx_j;
+ register byte *sbox = ctx->sbox;
+ register byte t, u;
+
+ while ( length-- )
+ {
+ i++;
+ t = sbox[(byte)i];
+ j += t;
+ u = sbox[j];
+ sbox[(byte)i] = u;
+ u += t;
+ sbox[j] = t;
+ *outbuf++ = sbox[u] ^ *inbuf++;
+ }
+
+ ctx->idx_i = (byte)i;
+ ctx->idx_j = (byte)j;
+#else /*__i386__*/
+ /* The old arcfour implementation is faster on i386 than the version above,
+ * because the version above increases register pressure, which on i386
+ * would push some of the variables to memory/stack. Therefore keep this
+ * version for i386 to avoid regressing performance. */
+ register int i = ctx->idx_i;
+ register int j = ctx->idx_j;
+ register byte *sbox = ctx->sbox;
+ register int t;
+
+ while ( length-- )
+ {
+ i++;
+ i = i & 255; /* The and-op seems to be faster than the mod-op. */
+ j += sbox[i];
+ j &= 255;
+ t = sbox[i]; sbox[i] = sbox[j]; sbox[j] = t;
+ *outbuf++ = *inbuf++ ^ sbox[(sbox[i] + sbox[j]) & 255];
+ }
+
+ ctx->idx_i = i;
+ ctx->idx_j = j;
+#endif
+}
+
+static void
+encrypt_stream (void *context,
+ byte *outbuf, const byte *inbuf, size_t length)
+{
+ ARCFOUR_context *ctx = (ARCFOUR_context *) context;
+ do_encrypt_stream (ctx, outbuf, inbuf, length );
+ _gcry_burn_stack (64);
+}
+
+#endif /*!USE_AMD64_ASM*/
+
+
+static gcry_err_code_t
+do_arcfour_setkey (void *context, const byte *key, unsigned int keylen)
+{
+ static int initialized;
+ static const char* selftest_failed;
+ int i, j;
+ byte karr[256];
+ ARCFOUR_context *ctx = (ARCFOUR_context *) context;
+
+ if (!initialized )
+ {
+ initialized = 1;
+ selftest_failed = selftest();
+ if( selftest_failed )
+ log_error ("ARCFOUR selftest failed (%s)\n", selftest_failed );
+ }
+ if( selftest_failed )
+ return GPG_ERR_SELFTEST_FAILED;
+
+ if( keylen < 40/8 ) /* we want at least 40 bits */
+ return GPG_ERR_INV_KEYLEN;
+
+ ctx->idx_i = ctx->idx_j = 0;
+ for (i=0; i < 256; i++ )
+ ctx->sbox[i] = i;
+ for (i=j=0; i < 256; i++,j++ )
+ {
+ if (j >= keylen)
+ j = 0;
+ karr[i] = key[j];
+ }
+ for (i=j=0; i < 256; i++ )
+ {
+ int t;
+ j = (j + ctx->sbox[i] + karr[i]) & 255;
+ t = ctx->sbox[i];
+ ctx->sbox[i] = ctx->sbox[j];
+ ctx->sbox[j] = t;
+ }
+ wipememory( karr, sizeof(karr) );
+
+ return GPG_ERR_NO_ERROR;
+}
+
+static gcry_err_code_t
+arcfour_setkey ( void *context, const byte *key, unsigned int keylen,
+ cipher_bulk_ops_t *bulk_ops )
+{
+ ARCFOUR_context *ctx = (ARCFOUR_context *) context;
+ gcry_err_code_t rc = do_arcfour_setkey (ctx, key, keylen );
+ (void)bulk_ops;
+ return rc;
+}
+
+
+static const char*
+selftest(void)
+{
+ ARCFOUR_context ctx;
+ byte scratch[16];
+
+ /* Test vector from Cryptlib labeled there: "from the
+ State/Commerce Department". */
+ static const byte key_1[] =
+ { 0x61, 0x8A, 0x63, 0xD2, 0xFB };
+ static const byte plaintext_1[] =
+ { 0xDC, 0xEE, 0x4C, 0xF9, 0x2C };
+ static const byte ciphertext_1[] =
+ { 0xF1, 0x38, 0x29, 0xC9, 0xDE };
+
+ arcfour_setkey( &ctx, key_1, sizeof(key_1), NULL);
+ encrypt_stream( &ctx, scratch, plaintext_1, sizeof(plaintext_1));
+ if ( memcmp (scratch, ciphertext_1, sizeof (ciphertext_1)))
+ return "Arcfour encryption test 1 failed.";
+ arcfour_setkey( &ctx, key_1, sizeof(key_1), NULL);
+ encrypt_stream(&ctx, scratch, scratch, sizeof(plaintext_1)); /* decrypt */
+ if ( memcmp (scratch, plaintext_1, sizeof (plaintext_1)))
+ return "Arcfour decryption test 1 failed.";
+ return NULL;
+}
+
+
+gcry_cipher_spec_t _gcry_cipher_spec_arcfour =
+ {
+ GCRY_CIPHER_ARCFOUR, {0, 0},
+ "ARCFOUR", NULL, NULL, 1, 128, sizeof (ARCFOUR_context),
+ arcfour_setkey, NULL, NULL, encrypt_stream, encrypt_stream,
+ };
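The _gcry_cipher_spec_arcfour structure above is what libgcrypt's generic cipher layer dispatches to. As a usage illustration only (not part of this patch), callers normally reach this code through the public API; a minimal sketch, assuming a standard libgcrypt installation and eliding most error handling:

#include <stdio.h>
#include <gcrypt.h>

/* Minimal sketch: drive ARCFOUR through the public libgcrypt API. */
int
main (void)
{
  gcry_cipher_hd_t hd;
  gcry_error_t err;
  const char key[] = "0123456789abcdef";              /* 128-bit example key */
  unsigned char buf[5] = { 0xDC, 0xEE, 0x4C, 0xF9, 0x2C };

  gcry_check_version (NULL);                           /* initialize the library */

  err = gcry_cipher_open (&hd, GCRY_CIPHER_ARCFOUR, GCRY_CIPHER_MODE_STREAM, 0);
  if (err)
    {
      fprintf (stderr, "open: %s\n", gcry_strerror (err));
      return 1;
    }
  err = gcry_cipher_setkey (hd, key, sizeof key - 1);
  if (!err)
    err = gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0);  /* in-place */
  if (err)
    fprintf (stderr, "arcfour: %s\n", gcry_strerror (err));
  gcry_cipher_close (hd);
  return err ? 1 : 0;
}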
diff --git a/comm/third_party/libgcrypt/cipher/asm-common-aarch64.h b/comm/third_party/libgcrypt/cipher/asm-common-aarch64.h
new file mode 100644
index 0000000000..cf0afe1f87
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/asm-common-aarch64.h
@@ -0,0 +1,104 @@
+/* asm-common-aarch64.h - Common macros for AArch64 assembly
+ *
+ * Copyright (C) 2018 Martin Storsjö <martin@martin.st>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_COMMON_AARCH64_H
+#define GCRY_ASM_COMMON_AARCH64_H
+
+#include <config.h>
+
+#ifdef HAVE_GCC_ASM_ELF_DIRECTIVES
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
+#ifdef __APPLE__
+#define GET_DATA_POINTER(reg, name) \
+ adrp reg, name@GOTPAGE ; \
+ add reg, reg, name@GOTPAGEOFF ;
+#elif defined(_WIN32)
+#define GET_DATA_POINTER(reg, name) \
+ adrp reg, name ; \
+ add reg, reg, #:lo12:name ;
+#else
+#define GET_DATA_POINTER(reg, name) \
+ adrp reg, :got:name ; \
+ ldr reg, [reg, #:got_lo12:name] ;
+#endif
+
+#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES
+/* CFI directives to emit DWARF stack unwinding information. */
+# define CFI_STARTPROC() .cfi_startproc
+# define CFI_ENDPROC() .cfi_endproc
+# define CFI_REMEMBER_STATE() .cfi_remember_state
+# define CFI_RESTORE_STATE() .cfi_restore_state
+# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off
+# define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off
+# define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg
+# define CFI_REGISTER(ro,rn) .cfi_register ro, rn
+# define CFI_RESTORE(reg) .cfi_restore reg
+
+/* CFA expressions are used for pointing CFA and registers to
+ * SP relative offsets. */
+# define DW_REGNO_SP 31
+
+/* Fixed length encoding used for integers for now. */
+# define DW_SLEB128_7BIT(value) \
+ 0x00|((value) & 0x7f)
+# define DW_SLEB128_28BIT(value) \
+ 0x80|((value)&0x7f), \
+ 0x80|(((value)>>7)&0x7f), \
+ 0x80|(((value)>>14)&0x7f), \
+ 0x00|(((value)>>21)&0x7f)
+
+# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) \
+ .cfi_escape \
+ 0x0f, /* DW_CFA_def_cfa_expression */ \
+ DW_SLEB128_7BIT(11), /* length */ \
+ 0x8f, /* DW_OP_breg31, rsp + constant */ \
+ DW_SLEB128_28BIT(rsp_offs), \
+ 0x06, /* DW_OP_deref */ \
+ 0x23, /* DW_OP_plus_constu */ \
+ DW_SLEB128_28BIT((cfa_depth)+8)
+
+# define CFI_REG_ON_STACK(regno,rsp_offs) \
+ .cfi_escape \
+ 0x10, /* DW_CFA_expression */ \
+ DW_SLEB128_7BIT(regno), \
+ DW_SLEB128_7BIT(5), /* length */ \
+ 0x8f, /* DW_OP_breg31, rsp + constant */ \
+ DW_SLEB128_28BIT(rsp_offs)
+
+#else
+# define CFI_STARTPROC()
+# define CFI_ENDPROC()
+# define CFI_REMEMBER_STATE()
+# define CFI_RESTORE_STATE()
+# define CFI_ADJUST_CFA_OFFSET(off)
+# define CFI_REL_OFFSET(reg,off)
+# define CFI_DEF_CFA_REGISTER(reg)
+# define CFI_REGISTER(ro,rn)
+# define CFI_RESTORE(reg)
+
+# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth)
+# define CFI_REG_ON_STACK(reg,rsp_offs)
+#endif
+
+#endif /* GCRY_ASM_COMMON_AARCH64_H */
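The CFI_CFA_ON_STACK and CFI_REG_ON_STACK escapes above hand the assembler a raw DWARF expression, and DW_SLEB128_28BIT emits each offset as a fixed four-byte little-endian base-128 sequence (continuation bit set on every byte except the last), so the expression length stated up front stays constant. A small sketch, not part of the patch, that round-trips this encoding for the small positive offsets the macros are used with:

#include <assert.h>
#include <stdio.h>

/* Mirror of the four bytes produced by DW_SLEB128_28BIT(value). */
static void
encode_28bit (unsigned long value, unsigned char out[4])
{
  out[0] = 0x80 | (value & 0x7f);
  out[1] = 0x80 | ((value >> 7) & 0x7f);
  out[2] = 0x80 | ((value >> 14) & 0x7f);
  out[3] = 0x00 | ((value >> 21) & 0x7f);
}

/* Generic LEB128 decode, sufficient for small positive stack offsets. */
static unsigned long
decode_leb128 (const unsigned char *p)
{
  unsigned long value = 0;
  int shift = 0;
  for (;;)
    {
      value |= (unsigned long)(*p & 0x7f) << shift;
      if (!(*p & 0x80))
        break;
      p++;
      shift += 7;
    }
  return value;
}

int
main (void)
{
  unsigned char buf[4];
  unsigned long offsets[] = { 0, 8, 160, 4096, (1ul << 27) - 1 };
  size_t i;

  for (i = 0; i < sizeof offsets / sizeof offsets[0]; i++)
    {
      encode_28bit (offsets[i], buf);
      assert (decode_leb128 (buf) == offsets[i]);
    }
  printf ("fixed-length LEB128 encoding round-trips\n");
  return 0;
}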
diff --git a/comm/third_party/libgcrypt/cipher/asm-common-amd64.h b/comm/third_party/libgcrypt/cipher/asm-common-amd64.h
new file mode 100644
index 0000000000..9d4a028a04
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/asm-common-amd64.h
@@ -0,0 +1,189 @@
+/* asm-common-amd64.h - Common macros for AMD64 assembly
+ *
+ * Copyright (C) 2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_COMMON_AMD64_H
+#define GCRY_ASM_COMMON_AMD64_H
+
+#include <config.h>
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
+#ifdef __PIC__
+# define rRIP (%rip)
+#else
+# define rRIP
+#endif
+
+#ifdef __PIC__
+# define RIP %rip
+#else
+# define RIP
+#endif
+
+#ifdef __PIC__
+# define ADD_RIP +rip
+#else
+# define ADD_RIP
+#endif
+
+#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) || !defined(__PIC__)
+# define GET_EXTERN_POINTER(name, reg) movabsq $name, reg
+#else
+# ifdef __code_model_large__
+# define GET_EXTERN_POINTER(name, reg) \
+ pushq %r15; \
+ pushq %r14; \
+ 1: leaq 1b(%rip), reg; \
+ movabsq $_GLOBAL_OFFSET_TABLE_-1b, %r14; \
+ movabsq $name@GOT, %r15; \
+ addq %r14, reg; \
+ popq %r14; \
+ movq (reg, %r15), reg; \
+ popq %r15;
+# else
+# define GET_EXTERN_POINTER(name, reg) movq name@GOTPCREL(%rip), reg
+# endif
+#endif
+
+#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES
+/* CFI directives to emit DWARF stack unwinding information. */
+# define CFI_STARTPROC() .cfi_startproc
+# define CFI_ENDPROC() .cfi_endproc
+# define CFI_REMEMBER_STATE() .cfi_remember_state
+# define CFI_RESTORE_STATE() .cfi_restore_state
+# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off
+# define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off
+# define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg
+# define CFI_REGISTER(ro,rn) .cfi_register ro, rn
+# define CFI_RESTORE(reg) .cfi_restore reg
+
+# define CFI_PUSH(reg) \
+ CFI_ADJUST_CFA_OFFSET(8); CFI_REL_OFFSET(reg, 0)
+# define CFI_POP(reg) \
+ CFI_ADJUST_CFA_OFFSET(-8); CFI_RESTORE(reg)
+# define CFI_POP_TMP_REG() \
+ CFI_ADJUST_CFA_OFFSET(-8);
+# define CFI_LEAVE() \
+ CFI_ADJUST_CFA_OFFSET(-8); CFI_DEF_CFA_REGISTER(%rsp)
+
+/* CFA expressions are used for pointing CFA and registers to
+ * %rsp relative offsets. */
+# define DW_REGNO_rax 0
+# define DW_REGNO_rdx 1
+# define DW_REGNO_rcx 2
+# define DW_REGNO_rbx 3
+# define DW_REGNO_rsi 4
+# define DW_REGNO_rdi 5
+# define DW_REGNO_rbp 6
+# define DW_REGNO_rsp 7
+# define DW_REGNO_r8 8
+# define DW_REGNO_r9 9
+# define DW_REGNO_r10 10
+# define DW_REGNO_r11 11
+# define DW_REGNO_r12 12
+# define DW_REGNO_r13 13
+# define DW_REGNO_r14 14
+# define DW_REGNO_r15 15
+
+# define DW_REGNO(reg) DW_REGNO_ ## reg
+
+/* Fixed length encoding used for integers for now. */
+# define DW_SLEB128_7BIT(value) \
+ 0x00|((value) & 0x7f)
+# define DW_SLEB128_28BIT(value) \
+ 0x80|((value)&0x7f), \
+ 0x80|(((value)>>7)&0x7f), \
+ 0x80|(((value)>>14)&0x7f), \
+ 0x00|(((value)>>21)&0x7f)
+
+# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) \
+ .cfi_escape \
+ 0x0f, /* DW_CFA_def_cfa_expression */ \
+ DW_SLEB128_7BIT(11), /* length */ \
+ 0x77, /* DW_OP_breg7, rsp + constant */ \
+ DW_SLEB128_28BIT(rsp_offs), \
+ 0x06, /* DW_OP_deref */ \
+ 0x23, /* DW_OP_plus_constu */ \
+ DW_SLEB128_28BIT((cfa_depth)+8)
+
+# define CFI_REG_ON_STACK(reg,rsp_offs) \
+ .cfi_escape \
+ 0x10, /* DW_CFA_expression */ \
+ DW_SLEB128_7BIT(DW_REGNO(reg)), \
+ DW_SLEB128_7BIT(5), /* length */ \
+ 0x77, /* DW_OP_breg7, rsp + constant */ \
+ DW_SLEB128_28BIT(rsp_offs)
+
+#else
+# define CFI_STARTPROC()
+# define CFI_ENDPROC()
+# define CFI_REMEMBER_STATE()
+# define CFI_RESTORE_STATE()
+# define CFI_ADJUST_CFA_OFFSET(off)
+# define CFI_REL_OFFSET(reg,off)
+# define CFI_DEF_CFA_REGISTER(reg)
+# define CFI_REGISTER(ro,rn)
+# define CFI_RESTORE(reg)
+
+# define CFI_PUSH(reg)
+# define CFI_POP(reg)
+# define CFI_POP_TMP_REG()
+# define CFI_LEAVE()
+
+# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth)
+# define CFI_REG_ON_STACK(reg,rsp_offs)
+#endif
+
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ENTER_SYSV_FUNC_PARAMS_0_4 \
+ pushq %rdi; \
+ CFI_PUSH(%rdi); \
+ pushq %rsi; \
+ CFI_PUSH(%rsi); \
+ movq %rcx, %rdi; \
+ movq %rdx, %rsi; \
+ movq %r8, %rdx; \
+ movq %r9, %rcx; \
+
+# define ENTER_SYSV_FUNC_PARAMS_5 \
+ ENTER_SYSV_FUNC_PARAMS_0_4; \
+ movq 0x38(%rsp), %r8;
+
+# define ENTER_SYSV_FUNC_PARAMS_6 \
+ ENTER_SYSV_FUNC_PARAMS_5; \
+ movq 0x40(%rsp), %r9;
+
+# define EXIT_SYSV_FUNC \
+ popq %rsi; \
+ CFI_POP(%rsi); \
+ popq %rdi; \
+ CFI_POP(%rdi);
+#else
+# define ENTER_SYSV_FUNC_PARAMS_0_4
+# define ENTER_SYSV_FUNC_PARAMS_5
+# define ENTER_SYSV_FUNC_PARAMS_6
+# define EXIT_SYSV_FUNC
+#endif
+
+#endif /* GCRY_ASM_COMMON_AMD64_H */
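The ENTER_SYSV_FUNC_PARAMS_* / EXIT_SYSV_FUNC pairs let an assembly routine written against the SysV argument registers keep a plain C prototype even on Win64: on entry they save %rdi/%rsi and move the Win64 argument registers (%rcx, %rdx, %r8, %r9, plus stack arguments 5 and 6) into their SysV counterparts. The other convention used in this import is to leave the assembly SysV-only and do the conversion at the call site via a sysv_abi attribute on the prototype. A sketch contrasting the two styles; the prototype names come from the files in this patch (parameter types simplified), and the #if condition is a simplified stand-in for the configure-driven checks the real code uses:

#include <stddef.h>

/* 1. Conversion inside the assembly: arcfour-amd64.S starts with
 *    ENTER_SYSV_FUNC_PARAMS_0_4, so the C prototype stays plain. */
void _gcry_arcfour_amd64 (void *key, size_t len,
                          const unsigned char *indata, unsigned char *outdata);

/* 2. Conversion at the call site: the assembly is SysV-only and the prototype
 *    carries the sysv_abi attribute (the ASM_FUNC_ABI pattern in blake2.c). */
#if defined(_WIN64) && defined(__GNUC__)
# define ASM_FUNC_ABI __attribute__((sysv_abi))
#else
# define ASM_FUNC_ABI
#endif

unsigned int _gcry_blake2b_transform_amd64_avx2 (void *state,
                                                 const void *inblks,
                                                 size_t nblks) ASM_FUNC_ABI;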
diff --git a/comm/third_party/libgcrypt/cipher/asm-common-s390x.h b/comm/third_party/libgcrypt/cipher/asm-common-s390x.h
new file mode 100644
index 0000000000..b3a996cd6e
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/asm-common-s390x.h
@@ -0,0 +1,90 @@
+/* asm-common-s390x.h - Common macros for zSeries assembly
+ *
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_COMMON_S390X_H
+#define GCRY_ASM_COMMON_S390X_H
+
+#include <config.h>
+
+#ifdef HAVE_GCC_ASM_ELF_DIRECTIVES
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
+#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES
+/* CFI directives to emit DWARF stack unwinding information. */
+# define CFI_STARTPROC() .cfi_startproc
+# define CFI_ENDPROC() .cfi_endproc
+# define CFI_REMEMBER_STATE() .cfi_remember_state
+# define CFI_RESTORE_STATE() .cfi_restore_state
+# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off
+# define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off
+# define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg
+# define CFI_REGISTER(ro,rn) .cfi_register ro, rn
+# define CFI_RESTORE(reg) .cfi_restore reg
+
+/* CFA expressions are used for pointing CFA and registers to
+ * SP relative offsets. */
+# define DW_REGNO_SP 15
+
+/* Fixed length encoding used for integers for now. */
+# define DW_SLEB128_7BIT(value) \
+ 0x00|((value) & 0x7f)
+# define DW_SLEB128_28BIT(value) \
+ 0x80|((value)&0x7f), \
+ 0x80|(((value)>>7)&0x7f), \
+ 0x80|(((value)>>14)&0x7f), \
+ 0x00|(((value)>>21)&0x7f)
+
+# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) \
+ .cfi_escape \
+ 0x0f, /* DW_CFA_def_cfa_expression */ \
+ DW_SLEB128_7BIT(11), /* length */ \
+ 0x7f, /* DW_OP_breg15, rsp + constant */ \
+ DW_SLEB128_28BIT(rsp_offs), \
+ 0x06, /* DW_OP_deref */ \
+ 0x23, /* DW_OP_plus_constu */ \
+ DW_SLEB128_28BIT((cfa_depth)+160)
+
+# define CFI_REG_ON_STACK(regno,rsp_offs) \
+ .cfi_escape \
+ 0x10, /* DW_CFA_expression */ \
+ DW_SLEB128_7BIT(regno), \
+ DW_SLEB128_7BIT(5), /* length */ \
+ 0x7f, /* DW_OP_breg15, rsp + constant */ \
+ DW_SLEB128_28BIT(rsp_offs)
+
+#else
+# define CFI_STARTPROC()
+# define CFI_ENDPROC()
+# define CFI_REMEMBER_STATE()
+# define CFI_RESTORE_STATE()
+# define CFI_ADJUST_CFA_OFFSET(off)
+# define CFI_REL_OFFSET(reg,off)
+# define CFI_DEF_CFA_REGISTER(reg)
+# define CFI_REGISTER(ro,rn)
+# define CFI_RESTORE(reg)
+
+# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth)
+# define CFI_REG_ON_STACK(reg,rsp_offs)
+#endif
+
+#endif /* GCRY_ASM_COMMON_S390X_H */
diff --git a/comm/third_party/libgcrypt/cipher/asm-inline-s390x.h b/comm/third_party/libgcrypt/cipher/asm-inline-s390x.h
new file mode 100644
index 0000000000..bacb45fe2e
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/asm-inline-s390x.h
@@ -0,0 +1,157 @@
+/* asm-inline-s390x.h - Common macros for zSeries inline assembly
+ *
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_INLINE_S390X_H
+#define GCRY_ASM_INLINE_S390X_H
+
+#include <config.h>
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+
+typedef unsigned int u128_t __attribute__ ((mode (TI)));
+
+enum kmxx_functions_e
+{
+ KM_FUNCTION_AES_128 = 18,
+ KM_FUNCTION_AES_192 = 19,
+ KM_FUNCTION_AES_256 = 20,
+ KM_FUNCTION_XTS_AES_128 = 50,
+ KM_FUNCTION_XTS_AES_256 = 52,
+
+ KMID_FUNCTION_SHA1 = 1,
+ KMID_FUNCTION_SHA256 = 2,
+ KMID_FUNCTION_SHA512 = 3,
+ KMID_FUNCTION_SHA3_224 = 32,
+ KMID_FUNCTION_SHA3_256 = 33,
+ KMID_FUNCTION_SHA3_384 = 34,
+ KMID_FUNCTION_SHA3_512 = 35,
+ KMID_FUNCTION_SHAKE128 = 36,
+ KMID_FUNCTION_SHAKE256 = 37,
+ KMID_FUNCTION_GHASH = 65,
+};
+
+enum kmxx_function_flags_e
+{
+ KM_ENCRYPT = 0 << 7,
+ KM_DECRYPT = 1 << 7,
+
+ KMF_LCFB_16 = 16 << 24,
+
+ KMA_LPC = 1 << 8,
+ KMA_LAAD = 1 << 9,
+ KMA_HS = 1 << 10,
+
+ KLMD_PADDING_STATE = 1 << 8,
+};
+
+static ALWAYS_INLINE u128_t km_function_to_mask(enum kmxx_functions_e func)
+{
+ return (u128_t)1 << (127 - func);
+}
+
+static inline u128_t kimd_query(void)
+{
+ static u128_t function_codes = 0;
+ static int initialized = 0;
+ register unsigned long reg0 asm("0") = 0;
+ register void *reg1 asm("1") = &function_codes;
+ u128_t r1;
+
+ if (initialized)
+ return function_codes;
+
+ asm volatile ("0: .insn rre,0xb93e << 16, 0, %[r1]\n\t"
+ " brc 1,0b\n\t"
+ : [r1] "=a" (r1)
+ : [reg0] "r" (reg0), [reg1] "r" (reg1)
+ : "cc", "memory");
+
+ initialized = 1;
+ return function_codes;
+}
+
+static inline u128_t klmd_query(void)
+{
+ static u128_t function_codes = 0;
+ static int initialized = 0;
+ register unsigned long reg0 asm("0") = 0;
+ register void *reg1 asm("1") = &function_codes;
+ u128_t r1;
+
+ if (initialized)
+ return function_codes;
+
+ asm volatile ("0: .insn rre,0xb93f << 16, 0, %[r1]\n\t"
+ " brc 1,0b\n\t"
+ : [r1] "=a" (r1)
+ : [reg0] "r" (reg0), [reg1] "r" (reg1)
+ : "cc", "memory");
+
+ initialized = 1;
+ return function_codes;
+}
+
+static ALWAYS_INLINE void
+kimd_execute(unsigned int func, void *param_block, const void *src,
+ size_t src_len)
+{
+ register unsigned long reg0 asm("0") = func;
+ register byte *reg1 asm("1") = param_block;
+ u128_t r1 = ((u128_t)(uintptr_t)src << 64) | (u64)src_len;
+
+ asm volatile ("0: .insn rre,0xb93e << 16, 0, %[r1]\n\t"
+ " brc 1,0b\n\t"
+ : [r1] "+a" (r1)
+ : [func] "r" (reg0), [param_ptr] "r" (reg1)
+ : "cc", "memory");
+}
+
+static ALWAYS_INLINE void
+klmd_execute(unsigned int func, void *param_block, const void *src,
+ size_t src_len)
+{
+ register unsigned long reg0 asm("0") = func;
+ register byte *reg1 asm("1") = param_block;
+ u128_t r1 = ((u128_t)(uintptr_t)src << 64) | (u64)src_len;
+
+ asm volatile ("0: .insn rre,0xb93f << 16, 0, %[r1]\n\t"
+ " brc 1,0b\n\t"
+ : [func] "+r" (reg0), [r1] "+a" (r1)
+ : [param_ptr] "r" (reg1)
+ : "cc", "memory");
+}
+
+static ALWAYS_INLINE void
+klmd_shake_execute(unsigned int func, void *param_block, void *dst,
+ size_t dst_len, const void *src, size_t src_len)
+{
+ register unsigned long reg0 asm("0") = func;
+ register byte *reg1 asm("1") = param_block;
+ u128_t r1 = ((u128_t)(uintptr_t)dst << 64) | (u64)dst_len;
+ u128_t r2 = ((u128_t)(uintptr_t)src << 64) | (u64)src_len;
+
+ asm volatile ("0: .insn rre,0xb93f << 16, %[r1], %[r2]\n\t"
+ " brc 1,0b\n\t"
+ : [func] "+r" (reg0), [r1] "+a" (r1), [r2] "+a" (r2)
+ : [param_ptr] "r" (reg1)
+ : "cc", "memory");
+}
+
+#endif /* GCRY_ASM_INLINE_S390X_H */
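kimd_query() and klmd_query() above cache the 128-bit CPACF function-code bitmaps returned by the query calls, and km_function_to_mask() maps a function code to its bit (bit 127 - code). As a sketch of how a caller gates an accelerated path, assuming this header is included; this mirrors how the hardware paths are selected elsewhere in libgcrypt:

#include <config.h>
#include "asm-inline-s390x.h"

/* Sketch: test whether CPACF implements SHA-256 via the KIMD instruction. */
static int
have_cpacf_sha256 (void)
{
  /* The query bitmap has bit (127 - code) set for each implemented function
     code, which is exactly the bit km_function_to_mask() builds. */
  return (kimd_query () & km_function_to_mask (KMID_FUNCTION_SHA256)) != 0;
}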
diff --git a/comm/third_party/libgcrypt/cipher/asm-poly1305-aarch64.h b/comm/third_party/libgcrypt/cipher/asm-poly1305-aarch64.h
new file mode 100644
index 0000000000..9009270956
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/asm-poly1305-aarch64.h
@@ -0,0 +1,245 @@
+/* asm-poly1305-aarch64.h - Poly1305 macros for ARMv8/AArch64 assembly
+ *
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_POLY1305_AARCH64_H
+#define GCRY_ASM_POLY1305_AARCH64_H
+
+#include "asm-common-aarch64.h"
+
+#ifdef __AARCH64EL__
+ #define le_to_host(reg) /*_*/
+#else
+ #define le_to_host(reg) rev reg, reg;
+#endif
+
+/**********************************************************************
+ poly1305 for stitched chacha20-poly1305 Aarch64 implementations
+ **********************************************************************/
+
+#define POLY_RSTATE x8
+#define POLY_RSRC x9
+
+#define POLY_R_H0 x10
+#define POLY_R_H1 x11
+#define POLY_R_H2 x12
+#define POLY_R_H2d w12
+#define POLY_R_R0 x13
+#define POLY_R_R1 x14
+#define POLY_R_R1_MUL5 x15
+#define POLY_R_X0_HI x16
+#define POLY_R_X0_LO x17
+#define POLY_R_X1_HI x19
+#define POLY_R_X1_LO x20
+#define POLY_R_ONE x21
+#define POLY_R_ONEd w21
+
+#define POLY_TMP0 x22
+#define POLY_TMP1 x23
+#define POLY_TMP2 x24
+#define POLY_TMP3 x25
+
+#define POLY_CHACHA_ROUND x26
+
+#define POLY_S_R0 (4 * 4 + 0 * 8)
+#define POLY_S_R1 (4 * 4 + 1 * 8)
+#define POLY_S_H0 (4 * 4 + 2 * 8 + 0 * 8)
+#define POLY_S_H1 (4 * 4 + 2 * 8 + 1 * 8)
+#define POLY_S_H2d (4 * 4 + 2 * 8 + 2 * 8)
+
+#define POLY1305_PUSH_REGS() \
+ stp x19, x20, [sp, #-16]!; \
+ CFI_ADJUST_CFA_OFFSET(16); \
+ CFI_REG_ON_STACK(19, 0); \
+ CFI_REG_ON_STACK(20, 8); \
+ stp x21, x22, [sp, #-16]!; \
+ CFI_ADJUST_CFA_OFFSET(16); \
+ CFI_REG_ON_STACK(21, 0); \
+ CFI_REG_ON_STACK(22, 8); \
+ stp x23, x24, [sp, #-16]!; \
+ CFI_ADJUST_CFA_OFFSET(16); \
+ CFI_REG_ON_STACK(23, 0); \
+ CFI_REG_ON_STACK(24, 8); \
+ stp x25, x26, [sp, #-16]!; \
+ CFI_ADJUST_CFA_OFFSET(16); \
+ CFI_REG_ON_STACK(25, 0); \
+ CFI_REG_ON_STACK(26, 8);
+
+#define POLY1305_POP_REGS() \
+ ldp x25, x26, [sp], #16; \
+ CFI_ADJUST_CFA_OFFSET(-16); \
+ CFI_RESTORE(x25); \
+ CFI_RESTORE(x26); \
+ ldp x23, x24, [sp], #16; \
+ CFI_ADJUST_CFA_OFFSET(-16); \
+ CFI_RESTORE(x23); \
+ CFI_RESTORE(x24); \
+ ldp x21, x22, [sp], #16; \
+ CFI_ADJUST_CFA_OFFSET(-16); \
+ CFI_RESTORE(x21); \
+ CFI_RESTORE(x22); \
+ ldp x19, x20, [sp], #16; \
+ CFI_ADJUST_CFA_OFFSET(-16); \
+ CFI_RESTORE(x19); \
+ CFI_RESTORE(x20);
+
+#define POLY1305_LOAD_STATE() \
+ ldr POLY_R_R1, [POLY_RSTATE, #(POLY_S_R1)]; \
+ ldr POLY_R_H0, [POLY_RSTATE, #(POLY_S_H0)]; \
+ ldr POLY_R_H1, [POLY_RSTATE, #(POLY_S_H1)]; \
+ ldr POLY_R_H2d, [POLY_RSTATE, #(POLY_S_H2d)]; \
+ ldr POLY_R_R0, [POLY_RSTATE, #(POLY_S_R0)]; \
+ add POLY_R_R1_MUL5, POLY_R_R1, POLY_R_R1, lsr #2; \
+ mov POLY_R_ONE, #1;
+
+#define POLY1305_STORE_STATE() \
+ str POLY_R_H0, [POLY_RSTATE, #(POLY_S_H0)]; \
+ str POLY_R_H1, [POLY_RSTATE, #(POLY_S_H1)]; \
+ str POLY_R_H2d, [POLY_RSTATE, #(POLY_S_H2d)];
+
+#define POLY1305_BLOCK_PART1(src_offset) \
+ /* a = h + m */ \
+ ldr POLY_TMP0, [POLY_RSRC, #((src_offset) + 0 * 8)];
+#define POLY1305_BLOCK_PART2(src_offset) \
+ ldr POLY_TMP1, [POLY_RSRC, #((src_offset) + 1 * 8)];
+#define POLY1305_BLOCK_PART3() \
+ le_to_host(POLY_TMP0);
+#define POLY1305_BLOCK_PART4() \
+ le_to_host(POLY_TMP1);
+#define POLY1305_BLOCK_PART5() \
+ adds POLY_R_H0, POLY_R_H0, POLY_TMP0;
+#define POLY1305_BLOCK_PART6() \
+ adcs POLY_R_H1, POLY_R_H1, POLY_TMP1;
+#define POLY1305_BLOCK_PART7() \
+ adc POLY_R_H2d, POLY_R_H2d, POLY_R_ONEd;
+
+#define POLY1305_BLOCK_PART8() \
+ /* h = a * r (partial mod 2^130-5): */ \
+ mul POLY_R_X1_LO, POLY_R_H0, POLY_R_R1; /* lo: h0 * r1 */
+#define POLY1305_BLOCK_PART9() \
+ mul POLY_TMP0, POLY_R_H1, POLY_R_R0; /* lo: h1 * r0 */
+#define POLY1305_BLOCK_PART10() \
+ mul POLY_R_X0_LO, POLY_R_H0, POLY_R_R0; /* lo: h0 * r0 */
+#define POLY1305_BLOCK_PART11() \
+ umulh POLY_R_X1_HI, POLY_R_H0, POLY_R_R1; /* hi: h0 * r1 */
+#define POLY1305_BLOCK_PART12() \
+ adds POLY_R_X1_LO, POLY_R_X1_LO, POLY_TMP0;
+#define POLY1305_BLOCK_PART13() \
+ umulh POLY_TMP1, POLY_R_H1, POLY_R_R0; /* hi: h1 * r0 */
+#define POLY1305_BLOCK_PART14() \
+ mul POLY_TMP2, POLY_R_H1, POLY_R_R1_MUL5; /* lo: h1 * r1 mod 2^130-5 */
+#define POLY1305_BLOCK_PART15() \
+ umulh POLY_R_X0_HI, POLY_R_H0, POLY_R_R0; /* hi: h0 * r0 */
+#define POLY1305_BLOCK_PART16() \
+ adc POLY_R_X1_HI, POLY_R_X1_HI, POLY_TMP1;
+#define POLY1305_BLOCK_PART17() \
+ umulh POLY_TMP3, POLY_R_H1, POLY_R_R1_MUL5; /* hi: h1 * r1 mod 2^130-5 */
+#define POLY1305_BLOCK_PART18() \
+ adds POLY_R_X0_LO, POLY_R_X0_LO, POLY_TMP2;
+#define POLY1305_BLOCK_PART19() \
+ mul POLY_R_H1, POLY_R_H2, POLY_R_R1_MUL5; /* h2 * r1 mod 2^130-5 */
+#define POLY1305_BLOCK_PART20() \
+ adc POLY_R_X0_HI, POLY_R_X0_HI, POLY_TMP3;
+#define POLY1305_BLOCK_PART21() \
+ mul POLY_R_H2, POLY_R_H2, POLY_R_R0; /* h2 * r0 */
+#define POLY1305_BLOCK_PART22() \
+ adds POLY_R_H1, POLY_R_H1, POLY_R_X1_LO;
+#define POLY1305_BLOCK_PART23() \
+ adc POLY_R_H0, POLY_R_H2, POLY_R_X1_HI;
+
+#define POLY1305_BLOCK_PART24() \
+ /* carry propagation */ \
+ and POLY_R_H2, POLY_R_H0, #3;
+#define POLY1305_BLOCK_PART25() \
+ lsr POLY_R_H0, POLY_R_H0, #2;
+#define POLY1305_BLOCK_PART26() \
+ add POLY_R_H0, POLY_R_H0, POLY_R_H0, lsl #2;
+#define POLY1305_BLOCK_PART27() \
+ adds POLY_R_H0, POLY_R_H0, POLY_R_X0_LO;
+#define POLY1305_BLOCK_PART28() \
+ adcs POLY_R_H1, POLY_R_H1, POLY_R_X0_HI;
+#define POLY1305_BLOCK_PART29() \
+ adc POLY_R_H2d, POLY_R_H2d, wzr;
+
+//#define TESTING_POLY1305_ASM
+#ifdef TESTING_POLY1305_ASM
+/* for testing only. */
+.align 3
+.globl _gcry_poly1305_aarch64_blocks1
+ELF(.type _gcry_poly1305_aarch64_blocks1,%function;)
+_gcry_poly1305_aarch64_blocks1:
+ /* input:
+ * x0: poly1305-state
+ * x1: src
+ * x2: nblks
+ */
+ CFI_STARTPROC()
+ POLY1305_PUSH_REGS();
+
+ mov POLY_RSTATE, x0;
+ mov POLY_RSRC, x1;
+
+ POLY1305_LOAD_STATE();
+
+.L_gcry_poly1305_aarch64_loop1:
+ POLY1305_BLOCK_PART1(0 * 16);
+ POLY1305_BLOCK_PART2(0 * 16);
+ add POLY_RSRC, POLY_RSRC, #16;
+ POLY1305_BLOCK_PART3();
+ POLY1305_BLOCK_PART4();
+ POLY1305_BLOCK_PART5();
+ POLY1305_BLOCK_PART6();
+ POLY1305_BLOCK_PART7();
+ POLY1305_BLOCK_PART8();
+ POLY1305_BLOCK_PART9();
+ POLY1305_BLOCK_PART10();
+ POLY1305_BLOCK_PART11();
+ POLY1305_BLOCK_PART12();
+ POLY1305_BLOCK_PART13();
+ POLY1305_BLOCK_PART14();
+ POLY1305_BLOCK_PART15();
+ POLY1305_BLOCK_PART16();
+ POLY1305_BLOCK_PART17();
+ POLY1305_BLOCK_PART18();
+ POLY1305_BLOCK_PART19();
+ POLY1305_BLOCK_PART20();
+ POLY1305_BLOCK_PART21();
+ POLY1305_BLOCK_PART22();
+ POLY1305_BLOCK_PART23();
+ POLY1305_BLOCK_PART24();
+ POLY1305_BLOCK_PART25();
+ POLY1305_BLOCK_PART26();
+ POLY1305_BLOCK_PART27();
+ POLY1305_BLOCK_PART28();
+ POLY1305_BLOCK_PART29();
+
+ subs x2, x2, #1;
+ b.ne .L_gcry_poly1305_aarch64_loop1;
+
+ POLY1305_STORE_STATE();
+
+ mov x0, #0;
+
+ POLY1305_POP_REGS();
+ ret;
+ CFI_ENDPROC()
+ELF(.size _gcry_poly1305_aarch64_blocks1, .-_gcry_poly1305_aarch64_blocks1;)
+#endif
+
+#endif /* GCRY_ASM_POLY1305_AARCH64_H */
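The POLY1305_BLOCK_PART macros above compute one Poly1305 update, h = (h + m + 2^128) * r mod 2^130 - 5, split into scheduler-friendly pieces for interleaving with ChaCha20. The multiplication relies on the usual clamping of r: r1 is a multiple of 4, so the term h1*r1*2^128 folds down to h1*(r1 + (r1 >> 2)) because 2^130 is congruent to 5 modulo the prime; that is the POLY_R_R1_MUL5 value loaded in POLY1305_LOAD_STATE. A compact C sketch of the same schedule, not part of the patch, assuming GCC-style unsigned __int128, m[] holding the two 64-bit words of the 16-byte block already byte-swapped to host order, and r0/r1 clamped as usual for Poly1305:

typedef unsigned long long u64;
typedef unsigned __int128 u128;

/* One Poly1305 block on 64-bit limbs: h2:h1:h0 <- (h + m + 2^128) * r mod 2^130-5.
 * Follows POLY1305_BLOCK_PART1..29 above. */
static void
poly1305_block (u64 h[3], const u64 m[2], u64 r0, u64 r1)
{
  u64 r1_mul5 = r1 + (r1 >> 2);  /* 5*(r1/4); valid since clamping gives r1 % 4 == 0 */
  u64 h0 = h[0], h1 = h[1], h2 = h[2];
  u128 a0, a1, x0, x1, t1, f0, f1;
  u64 t2, top;

  /* a = h + m, plus the 2^128 padding bit (PART1..7) */
  a0 = (u128) h0 + m[0];
  a1 = (u128) h1 + m[1] + (u64) (a0 >> 64);
  h0 = (u64) a0;
  h1 = (u64) a1;
  h2 += (u64) (a1 >> 64) + 1;

  /* h * r with the 2^128-and-above terms folded down (PART8..23) */
  x1 = (u128) h0 * r1 + (u128) h1 * r0;        /* coefficient of 2^64  */
  x0 = (u128) h0 * r0 + (u128) h1 * r1_mul5;   /* coefficient of 2^0   */
  t1 = (u128) (h2 * r1_mul5) + (u64) x1;       /* 2^64 limb, pre-carry */
  t2 = h2 * r0 + (u64) (x1 >> 64) + (u64) (t1 >> 64);  /* 2^128 limb   */

  /* carry propagation: fold bits at or above 2^130 back in via *5 (PART24..29) */
  h2 = t2 & 3;
  top = t2 >> 2;
  f0 = (u128) top * 5 + (u64) x0;
  f1 = (u128) (u64) t1 + (u64) (x0 >> 64) + (u64) (f0 >> 64);
  h[0] = (u64) f0;
  h[1] = (u64) f1;
  h[2] = h2 + (u64) (f1 >> 64);
}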
diff --git a/comm/third_party/libgcrypt/cipher/asm-poly1305-amd64.h b/comm/third_party/libgcrypt/cipher/asm-poly1305-amd64.h
new file mode 100644
index 0000000000..3f99ea3e16
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/asm-poly1305-amd64.h
@@ -0,0 +1,171 @@
+/* asm-poly1305-amd64.h - Poly1305 macros for AMD64 assembly
+ *
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_POLY1305_AMD64_H
+#define GCRY_ASM_POLY1305_AMD64_H
+
+#include "asm-common-amd64.h"
+
+/**********************************************************************
+ poly1305 for stitched chacha20-poly1305 AMD64 implementations
+ **********************************************************************/
+
+#define POLY_RSTATE %r8
+#define POLY_RSRC %r9
+
+#define POLY_R_H0 %rbx
+#define POLY_R_H1 %rcx
+#define POLY_R_H2 %r10
+#define POLY_R_H2d %r10d
+#define POLY_R_R0 %r11
+#define POLY_R_R1_MUL5 %r12
+#define POLY_R_X0_HI %r13
+#define POLY_R_X0_LO %r14
+#define POLY_R_X1_HI %r15
+#define POLY_R_X1_LO %rsi
+
+#define POLY_S_R0 (4 * 4 + 0 * 8)(POLY_RSTATE)
+#define POLY_S_R1 (4 * 4 + 1 * 8)(POLY_RSTATE)
+#define POLY_S_H0 (4 * 4 + 2 * 8 + 0 * 8)(POLY_RSTATE)
+#define POLY_S_H1 (4 * 4 + 2 * 8 + 1 * 8)(POLY_RSTATE)
+#define POLY_S_H2d (4 * 4 + 2 * 8 + 2 * 8)(POLY_RSTATE)
+
+#define POLY1305_LOAD_STATE() \
+ movq POLY_S_H0, POLY_R_H0; \
+ movq POLY_S_H1, POLY_R_H1; \
+ movl POLY_S_H2d, POLY_R_H2d; \
+ movq POLY_S_R0, POLY_R_R0; \
+ movq POLY_S_R1, POLY_R_R1_MUL5; \
+ shrq $2, POLY_R_R1_MUL5; \
+ addq POLY_S_R1, POLY_R_R1_MUL5;
+
+#define POLY1305_STORE_STATE() \
+ movq POLY_R_H0, POLY_S_H0; \
+ movq POLY_R_H1, POLY_S_H1; \
+ movl POLY_R_H2d, POLY_S_H2d;
+
+/* a = h + m */
+#define POLY1305_BLOCK_PART1(src_offset) \
+ addq ((src_offset) + 0 * 8)(POLY_RSRC), POLY_R_H0; \
+ adcq ((src_offset) + 1 * 8)(POLY_RSRC), POLY_R_H1; \
+ adcl $1, POLY_R_H2d; \
+ \
+ /* h = a * r (partial mod 2^130-5): */ \
+ \
+ /* h0 * r1 */ \
+ movq POLY_R_H0, %rax; \
+ mulq POLY_S_R1; \
+ movq %rax, POLY_R_X1_LO; \
+ movq %rdx, POLY_R_X1_HI;
+
+#define POLY1305_BLOCK_PART2() \
+ \
+ /* h0 * r0 */ \
+ movq POLY_R_H0, %rax; \
+ mulq POLY_R_R0; \
+ movq %rax, POLY_R_X0_LO; \
+ movq %rdx, POLY_R_X0_HI;
+
+#define POLY1305_BLOCK_PART3() \
+ \
+ /* h1 * r0 */ \
+ movq POLY_R_H1, %rax; \
+ mulq POLY_R_R0; \
+ addq %rax, POLY_R_X1_LO; \
+ adcq %rdx, POLY_R_X1_HI; \
+ \
+ /* h1 * r1 mod 2^130-5 */ \
+ movq POLY_R_R1_MUL5, %rax; \
+ mulq POLY_R_H1;
+
+#define POLY1305_BLOCK_PART4() \
+ movq POLY_R_H2, POLY_R_H1; \
+ imulq POLY_R_R1_MUL5, POLY_R_H1; /* h2 * r1 mod 2^130-5 */ \
+ addq %rax, POLY_R_X0_LO; \
+ adcq %rdx, POLY_R_X0_HI; \
+ imulq POLY_R_R0, POLY_R_H2; /* h2 * r0 */ \
+ addq POLY_R_X1_LO, POLY_R_H1; \
+ adcq POLY_R_X1_HI, POLY_R_H2;
+
+#define POLY1305_BLOCK_PART5() \
+ \
+ /* carry propagation */ \
+ movq POLY_R_H2, POLY_R_H0; \
+ andl $3, POLY_R_H2d; \
+ shrq $2, POLY_R_H0; \
+ leaq (POLY_R_H0, POLY_R_H0, 4), POLY_R_H0; \
+ addq POLY_R_X0_LO, POLY_R_H0; \
+ adcq POLY_R_X0_HI, POLY_R_H1; \
+ adcl $0, POLY_R_H2d;
+
+#ifdef TESTING_POLY1305_ASM
+/* for testing only, mixed C/asm poly1305.c is marginally faster (~2%). */
+.align 8
+.globl _gcry_poly1305_amd64_ssse3_blocks1
+ELF(.type _gcry_poly1305_amd64_ssse3_blocks1,@function;)
+
+_gcry_poly1305_amd64_ssse3_blocks1:
+ /* input:
+ * %rdi: poly1305-state
+ * %rsi: src
+ * %rdx: nblks
+ */
+ pushq %rbp;
+ movq %rsp, %rbp;
+
+ subq $(10 * 8), %rsp;
+ movq %rbx, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ movq %r14, (4 * 8)(%rsp);
+ movq %r15, (5 * 8)(%rsp);
+
+ movq %rdx, (8 * 8)(%rsp); # NBLKS
+
+ movq %rdi, POLY_RSTATE;
+ movq %rsi, POLY_RSRC;
+
+ POLY1305_LOAD_STATE();
+
+.L_poly1:
+ POLY1305_BLOCK_PART1(0 * 16);
+ POLY1305_BLOCK_PART2();
+ POLY1305_BLOCK_PART3();
+ POLY1305_BLOCK_PART4();
+ POLY1305_BLOCK_PART5();
+
+ subq $1, (8 * 8)(%rsp); # NBLKS
+ leaq (16)(POLY_RSRC), POLY_RSRC;
+ jnz .L_poly1;
+
+ POLY1305_STORE_STATE();
+
+ movq (1 * 8)(%rsp), %rbx;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ movq (4 * 8)(%rsp), %r14;
+ movq (5 * 8)(%rsp), %r15;
+
+ xorl %eax, %eax;
+ leave
+ ret;
+#endif
+
+#endif /* GCRY_ASM_POLY1305_AMD64_H */
diff --git a/comm/third_party/libgcrypt/cipher/asm-poly1305-s390x.h b/comm/third_party/libgcrypt/cipher/asm-poly1305-s390x.h
new file mode 100644
index 0000000000..113ab94913
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/asm-poly1305-s390x.h
@@ -0,0 +1,140 @@
+/* asm-poly1305-s390x.h - Poly1305 macros for zSeries assembly
+ *
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_POLY1305_S390X_H
+#define GCRY_ASM_POLY1305_S390X_H
+
+#include "asm-common-s390x.h"
+
+/**********************************************************************
+ poly1305 for stitched chacha20-poly1305
+ **********************************************************************/
+
+#define POLY_RSTATE %r1
+#define POLY_RSRC %r14
+
+#define POLY_R_H0_TMP_HI %r6 // even-
+#define POLY_R_H0 %r7 // odd pair
+#define POLY_R_H1_TMP_HI %r8 // even-
+#define POLY_R_H1 %r9 // odd pair
+#define POLY_R_H2 %r10
+#define POLY_R_R0 %r11
+#define POLY_R_R1 %r12
+#define POLY_R_R1_MUL5 %r13
+#define POLY_R_X0_HI %r2 // even-
+#define POLY_R_X0_LO %r3 // odd pair
+#define POLY_R_X1_HI %r4 // even-
+#define POLY_R_X1_LO %r5 // odd pair
+
+#define POLY_S_R0 (4 * 4 + 0 * 8)(POLY_RSTATE)
+#define POLY_S_R1 (4 * 4 + 1 * 8)(POLY_RSTATE)
+#define POLY_S_H0 (4 * 4 + 2 * 8 + 0 * 8)(POLY_RSTATE)
+#define POLY_S_H1 (4 * 4 + 2 * 8 + 1 * 8)(POLY_RSTATE)
+#define POLY_S_H2d (4 * 4 + 2 * 8 + 2 * 8)(POLY_RSTATE)
+
+#define INC_POLY1305_SRC(a) \
+ aghi POLY_RSRC, (a);
+
+#define POLY1305_LOAD_STATE() \
+ lg POLY_R_H0, POLY_S_H0; \
+ lg POLY_R_H1, POLY_S_H1; \
+ llgf POLY_R_H2, POLY_S_H2d; \
+ rllg POLY_R_H0, POLY_R_H0, 32; \
+ rllg POLY_R_H1, POLY_R_H1, 32; \
+ lg POLY_R_R0, POLY_S_R0; \
+ lg POLY_R_R1, POLY_S_R1; \
+ rllg POLY_R_R0, POLY_R_R0, 32; \
+ rllg POLY_R_R1, POLY_R_R1, 32; \
+ srlg POLY_R_R1_MUL5, POLY_R_R1, 2; \
+ algr POLY_R_R1_MUL5, POLY_R_R1;
+
+#define POLY1305_STORE_STATE() \
+ rllg POLY_R_H0, POLY_R_H0, 32; \
+ rllg POLY_R_H1, POLY_R_H1, 32; \
+ stg POLY_R_H0, POLY_S_H0; \
+ stg POLY_R_H1, POLY_S_H1; \
+ st POLY_R_H2, POLY_S_H2d;
+
+/* a = h + m */
+#define POLY1305_BLOCK_PART1_HB(src_offset, high_pad) \
+ lrvg POLY_R_X0_HI, ((src_offset) + 1 * 8)(POLY_RSRC); \
+ lrvg POLY_R_X0_LO, ((src_offset) + 0 * 8)(POLY_RSRC); \
+ lghi POLY_R_H1_TMP_HI, (high_pad);
+
+#define POLY1305_BLOCK_PART1(src_offset) \
+ POLY1305_BLOCK_PART1_HB(src_offset, 1);
+
+#define POLY1305_BLOCK_PART2() \
+ algr POLY_R_H0, POLY_R_X0_LO; \
+ alcgr POLY_R_H1, POLY_R_X0_HI; \
+ alcgr POLY_R_H2, POLY_R_H1_TMP_HI; \
+ lgr POLY_R_X1_LO, POLY_R_H0; \
+ lgr POLY_R_X0_LO, POLY_R_H0;
+
+#define POLY1305_BLOCK_PART3() \
+ /* h = a * r (partial mod 2^130-5): */ \
+ \
+ /* h0 * r1 */ \
+ mlgr POLY_R_X1_HI, POLY_R_R1; \
+ \
+ /* h1 * r0 */ \
+ lgr POLY_R_H0, POLY_R_H1; \
+ mlgr POLY_R_H0_TMP_HI, POLY_R_R0; \
+ \
+ /* h1 * r1 mod 2^130-5 */ \
+ mlgr POLY_R_H1_TMP_HI, POLY_R_R1_MUL5;
+
+#define POLY1305_BLOCK_PART4() \
+ \
+ /* h0 * r0 */ \
+ mlgr POLY_R_X0_HI, POLY_R_R0; \
+ \
+ algr POLY_R_X1_LO, POLY_R_H0; \
+ alcgr POLY_R_X1_HI, POLY_R_H0_TMP_HI; \
+ \
+ lgr POLY_R_H0_TMP_HI, POLY_R_H2; \
+ msgr POLY_R_H0_TMP_HI, POLY_R_R1_MUL5; /* h2 * r1 mod 2^130-5 */ \
+ msgr POLY_R_H2, POLY_R_R0; /* h2 * r0 */
+
+#define POLY1305_BLOCK_PART5() \
+ \
+ algr POLY_R_X0_LO, POLY_R_H1; \
+ alcgr POLY_R_X0_HI, POLY_R_H1_TMP_HI;
+
+#define POLY1305_BLOCK_PART6() \
+ \
+ algrk POLY_R_H1, POLY_R_H0_TMP_HI, POLY_R_X1_LO; \
+ alcgr POLY_R_H2, POLY_R_X1_HI;
+
+#define POLY1305_BLOCK_PART7() \
+ \
+ /* carry propagation */ \
+ srlg POLY_R_H0, POLY_R_H2, 2; \
+ risbgn POLY_R_X1_LO, POLY_R_H2, 0, 0x80 | 61, 0; \
+ lghi POLY_R_H1_TMP_HI, 0; \
+ agr POLY_R_H0, POLY_R_X1_LO; \
+ risbgn POLY_R_H2, POLY_R_H2, 62, 0x80 | 63, 0;
+
+#define POLY1305_BLOCK_PART8() \
+ algr POLY_R_H0, POLY_R_X0_LO; \
+ alcgr POLY_R_H1, POLY_R_X0_HI; \
+ alcgr POLY_R_H2, POLY_R_H1_TMP_HI;
+
+#endif /* GCRY_ASM_POLY1305_S390X_H */
diff --git a/comm/third_party/libgcrypt/cipher/bithelp.h b/comm/third_party/libgcrypt/cipher/bithelp.h
new file mode 100644
index 0000000000..7793ce7ca3
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/bithelp.h
@@ -0,0 +1,123 @@
+/* bithelp.h - Some bit manipulation helpers
+ * Copyright (C) 1999, 2002 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef GCRYPT_BITHELP_H
+#define GCRYPT_BITHELP_H
+
+#include "types.h"
+
+
+/****************
+ * Rotate the 32 bit unsigned integer X by N bits left/right
+ */
+static inline u32 rol(u32 x, int n)
+{
+ return ( (x << (n&(32-1))) | (x >> ((32-n)&(32-1))) );
+}
+
+static inline u32 ror(u32 x, int n)
+{
+ return ( (x >> (n&(32-1))) | (x << ((32-n)&(32-1))) );
+}
+
+static inline u64 rol64(u64 x, int n)
+{
+ return ( (x << (n&(64-1))) | (x >> ((64-n)&(64-1))) );
+}
+
+/* Byte swap for 32-bit and 64-bit integers. If available, use compiler
+ provided helpers. */
+#ifdef HAVE_BUILTIN_BSWAP32
+# define _gcry_bswap32 __builtin_bswap32
+#else
+static inline u32
+_gcry_bswap32(u32 x)
+{
+ return ((rol(x, 8) & 0x00ff00ffL) | (ror(x, 8) & 0xff00ff00L));
+}
+#endif
+
+#ifdef HAVE_BUILTIN_BSWAP64
+# define _gcry_bswap64 __builtin_bswap64
+#else
+static inline u64
+_gcry_bswap64(u64 x)
+{
+ return ((u64)_gcry_bswap32(x) << 32) | (_gcry_bswap32(x >> 32));
+}
+#endif
+
+/* Endian dependent byte swap operations. */
+#ifdef WORDS_BIGENDIAN
+# define le_bswap32(x) _gcry_bswap32(x)
+# define be_bswap32(x) ((u32)(x))
+# define le_bswap64(x) _gcry_bswap64(x)
+# define be_bswap64(x) ((u64)(x))
+#else
+# define le_bswap32(x) ((u32)(x))
+# define be_bswap32(x) _gcry_bswap32(x)
+# define le_bswap64(x) ((u64)(x))
+# define be_bswap64(x) _gcry_bswap64(x)
+#endif
+
+
+/* Count trailing zero bits in an unsigned int. We return an int
+ because that is what gcc's builtin does. Returns the number of
+ bits in X if X is 0. */
+static inline int
+_gcry_ctz (unsigned int x)
+{
+#if defined (HAVE_BUILTIN_CTZ)
+ return x ? __builtin_ctz (x) : 8 * sizeof (x);
+#else
+ /* See
+ * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightModLookup
+ */
+ static const unsigned char mod37[] =
+ {
+ sizeof (unsigned int)*8,
+ 0, 1, 26, 2, 23, 27, 0, 3, 16, 24, 30, 28, 11, 0, 13,
+ 4, 7, 17, 0, 25, 22, 31, 15, 29, 10, 12, 6, 0, 21, 14, 9,
+ 5, 20, 8, 19, 18
+ };
+ return (int)mod37[(-x & x) % 37];
+#endif
+}
+
+
+/* Count trailing zero bits in an u64. We return an int because that
+ is what gcc's builtin does. Returns the number of bits in X if X
+ is 0. */
+static inline int
+_gcry_ctz64(u64 x)
+{
+#if defined (HAVE_BUILTIN_CTZL) && SIZEOF_UNSIGNED_LONG >= 8
+ return x ? __builtin_ctzl (x) : 8 * sizeof (x);
+#elif defined (HAVE_BUILTIN_CTZ) && SIZEOF_UNSIGNED_INT >= 8
+#warning hello
+ return x ? __builtin_ctz (x) : 8 * sizeof (x);
+#else
+ if ((x & 0xffffffff))
+ return _gcry_ctz (x);
+ else
+ return 32 + _gcry_ctz (x >> 32);
+#endif
+}
+
+
+#endif /*GCRYPT_BITHELP_H*/
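The table-based _gcry_ctz fallback above relies on the powers 2^0 through 2^31 all being distinct modulo 37, so isolating the lowest set bit with (-x & x) and reducing mod 37 gives a unique index into a 37-entry table. A small self-contained check, not part of the patch; the first table entry is written as 32, i.e. sizeof(unsigned int)*8 on the platforms this targets:

#include <assert.h>
#include <stdio.h>

/* Trailing-zero count via the mod-37 lookup used in bithelp.h. */
static int
ctz_mod37 (unsigned int x)
{
  static const unsigned char mod37[] =
    {
      32, 0, 1, 26, 2, 23, 27, 0, 3, 16, 24, 30, 28, 11, 0, 13,
      4, 7, 17, 0, 25, 22, 31, 15, 29, 10, 12, 6, 0, 21, 14, 9,
      5, 20, 8, 19, 18
    };
  return mod37[(-x & x) % 37];
}

int
main (void)
{
  unsigned int i;

  for (i = 0; i < 32; i++)
    assert (ctz_mod37 (1u << i) == (int) i);  /* single-bit inputs          */
  assert (ctz_mod37 (0) == 32);               /* zero maps to the bit width */
  assert (ctz_mod37 (0xb0u) == 4);            /* lowest set bit of 0xb0     */
  printf ("mod-37 ctz table matches the expected counts\n");
  return 0;
}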
diff --git a/comm/third_party/libgcrypt/cipher/blake2.c b/comm/third_party/libgcrypt/cipher/blake2.c
new file mode 100644
index 0000000000..f2bf49e522
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/blake2.c
@@ -0,0 +1,996 @@
+/* blake2.c - BLAKE2b and BLAKE2s hash functions (RFC 7693)
+ * Copyright (C) 2017 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* The code is based on public-domain/CC0 BLAKE2 reference implementation
+ * by Samuel Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/ref
+ * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>
+ */
+
+#include <config.h>
+#include <string.h>
+#include "g10lib.h"
+#include "bithelp.h"
+#include "bufhelp.h"
+#include "cipher.h"
+#include "hash-common.h"
+
+/* USE_AVX indicates whether to compile with Intel AVX code. */
+#undef USE_AVX
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX 1
+#endif
+
+/* USE_AVX2 indicates whether to compile with Intel AVX2 code. */
+#undef USE_AVX2
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX2 1
+#endif
+
+/* AMD64 assembly implementations use the SystemV ABI; on Win64 an ABI
+ * conversion and additional stack space to store XMM6-XMM15 are needed. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if defined(USE_AVX2) && defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# define ASM_EXTRA_STACK (10 * 16)
+#else
+# define ASM_FUNC_ABI
+# define ASM_EXTRA_STACK 0
+#endif
+
+#define BLAKE2B_BLOCKBYTES 128
+#define BLAKE2B_OUTBYTES 64
+#define BLAKE2B_KEYBYTES 64
+
+#define BLAKE2S_BLOCKBYTES 64
+#define BLAKE2S_OUTBYTES 32
+#define BLAKE2S_KEYBYTES 32
+
+typedef struct
+{
+ u64 h[8];
+ u64 t[2];
+ u64 f[2];
+} BLAKE2B_STATE;
+
+struct blake2b_param_s
+{
+ byte digest_length;
+ byte key_length;
+ byte fanout;
+ byte depth;
+ byte leaf_length[4];
+ byte node_offset[4];
+ byte xof_length[4];
+ byte node_depth;
+ byte inner_length;
+ byte reserved[14];
+ byte salt[16];
+ byte personal[16];
+};
+
+typedef struct BLAKE2B_CONTEXT_S
+{
+ BLAKE2B_STATE state;
+ byte buf[BLAKE2B_BLOCKBYTES];
+ size_t buflen;
+ size_t outlen;
+#ifdef USE_AVX2
+ unsigned int use_avx2:1;
+#endif
+} BLAKE2B_CONTEXT;
+
+typedef struct
+{
+ u32 h[8];
+ u32 t[2];
+ u32 f[2];
+} BLAKE2S_STATE;
+
+struct blake2s_param_s
+{
+ byte digest_length;
+ byte key_length;
+ byte fanout;
+ byte depth;
+ byte leaf_length[4];
+ byte node_offset[4];
+ byte xof_length[2];
+ byte node_depth;
+ byte inner_length;
+ /* byte reserved[0]; */
+ byte salt[8];
+ byte personal[8];
+};
+
+typedef struct BLAKE2S_CONTEXT_S
+{
+ BLAKE2S_STATE state;
+ byte buf[BLAKE2S_BLOCKBYTES];
+ size_t buflen;
+ size_t outlen;
+#ifdef USE_AVX
+ unsigned int use_avx:1;
+#endif
+} BLAKE2S_CONTEXT;
+
+typedef unsigned int (*blake2_transform_t)(void *S, const void *inblk,
+ size_t nblks);
+
+
+static const u64 blake2b_IV[8] =
+{
+ U64_C(0x6a09e667f3bcc908), U64_C(0xbb67ae8584caa73b),
+ U64_C(0x3c6ef372fe94f82b), U64_C(0xa54ff53a5f1d36f1),
+ U64_C(0x510e527fade682d1), U64_C(0x9b05688c2b3e6c1f),
+ U64_C(0x1f83d9abfb41bd6b), U64_C(0x5be0cd19137e2179)
+};
+
+static const u32 blake2s_IV[8] =
+{
+ 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+ 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
+};
+
+static byte zero_block[BLAKE2B_BLOCKBYTES] = { 0, };
+
+
+static void blake2_write(void *S, const void *inbuf, size_t inlen,
+ byte *tmpbuf, size_t *tmpbuflen, size_t blkbytes,
+ blake2_transform_t transform_fn)
+{
+ const byte* in = inbuf;
+ unsigned int burn = 0;
+
+ if (inlen > 0)
+ {
+ size_t left = *tmpbuflen;
+ size_t fill = blkbytes - left;
+ size_t nblks;
+
+ if (inlen > fill)
+ {
+ if (fill > 0)
+ buf_cpy (tmpbuf + left, in, fill); /* Fill buffer */
+ left = 0;
+
+ burn = transform_fn (S, tmpbuf, 1); /* Increment counter + Compress */
+
+ in += fill;
+ inlen -= fill;
+
+ nblks = inlen / blkbytes - !(inlen % blkbytes);
+ if (nblks)
+ {
+ burn = transform_fn(S, in, nblks);
+ in += blkbytes * nblks;
+ inlen -= blkbytes * nblks;
+ }
+ }
+
+ gcry_assert (inlen > 0);
+
+ buf_cpy (tmpbuf + left, in, inlen);
+ *tmpbuflen = left + inlen;
+ }
+
+ if (burn)
+ _gcry_burn_stack (burn);
+
+ return;
+}
+
+
+static inline void blake2b_set_lastblock(BLAKE2B_STATE *S)
+{
+ S->f[0] = U64_C(0xffffffffffffffff);
+}
+
+static inline int blake2b_is_lastblock(const BLAKE2B_STATE *S)
+{
+ return S->f[0] != 0;
+}
+
+static inline void blake2b_increment_counter(BLAKE2B_STATE *S, const int inc)
+{
+ S->t[0] += (u64)inc;
+ S->t[1] += (S->t[0] < (u64)inc) - (inc < 0);
+}
+
+static inline u64 rotr64(u64 x, u64 n)
+{
+ return ((x >> (n & 63)) | (x << ((64 - n) & 63)));
+}
+
+static unsigned int blake2b_transform_generic(BLAKE2B_STATE *S,
+ const void *inblks,
+ size_t nblks)
+{
+ static const byte blake2b_sigma[12][16] =
+ {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
+ };
+ const byte* in = inblks;
+ u64 m[16];
+ u64 v[16];
+
+ while (nblks--)
+ {
+ /* Increment counter */
+ blake2b_increment_counter (S, BLAKE2B_BLOCKBYTES);
+
+ /* Compress */
+ m[0] = buf_get_le64 (in + 0 * sizeof(m[0]));
+ m[1] = buf_get_le64 (in + 1 * sizeof(m[0]));
+ m[2] = buf_get_le64 (in + 2 * sizeof(m[0]));
+ m[3] = buf_get_le64 (in + 3 * sizeof(m[0]));
+ m[4] = buf_get_le64 (in + 4 * sizeof(m[0]));
+ m[5] = buf_get_le64 (in + 5 * sizeof(m[0]));
+ m[6] = buf_get_le64 (in + 6 * sizeof(m[0]));
+ m[7] = buf_get_le64 (in + 7 * sizeof(m[0]));
+ m[8] = buf_get_le64 (in + 8 * sizeof(m[0]));
+ m[9] = buf_get_le64 (in + 9 * sizeof(m[0]));
+ m[10] = buf_get_le64 (in + 10 * sizeof(m[0]));
+ m[11] = buf_get_le64 (in + 11 * sizeof(m[0]));
+ m[12] = buf_get_le64 (in + 12 * sizeof(m[0]));
+ m[13] = buf_get_le64 (in + 13 * sizeof(m[0]));
+ m[14] = buf_get_le64 (in + 14 * sizeof(m[0]));
+ m[15] = buf_get_le64 (in + 15 * sizeof(m[0]));
+
+ v[ 0] = S->h[0];
+ v[ 1] = S->h[1];
+ v[ 2] = S->h[2];
+ v[ 3] = S->h[3];
+ v[ 4] = S->h[4];
+ v[ 5] = S->h[5];
+ v[ 6] = S->h[6];
+ v[ 7] = S->h[7];
+ v[ 8] = blake2b_IV[0];
+ v[ 9] = blake2b_IV[1];
+ v[10] = blake2b_IV[2];
+ v[11] = blake2b_IV[3];
+ v[12] = blake2b_IV[4] ^ S->t[0];
+ v[13] = blake2b_IV[5] ^ S->t[1];
+ v[14] = blake2b_IV[6] ^ S->f[0];
+ v[15] = blake2b_IV[7] ^ S->f[1];
+
+#define G(r,i,a,b,c,d) \
+ do { \
+ a = a + b + m[blake2b_sigma[r][2*i+0]]; \
+ d = rotr64(d ^ a, 32); \
+ c = c + d; \
+ b = rotr64(b ^ c, 24); \
+ a = a + b + m[blake2b_sigma[r][2*i+1]]; \
+ d = rotr64(d ^ a, 16); \
+ c = c + d; \
+ b = rotr64(b ^ c, 63); \
+ } while(0)
+
+#define ROUND(r) \
+ do { \
+ G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+ G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+ G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+ G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+ G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+ G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+ G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+ G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
+ } while(0)
+
+ ROUND(0);
+ ROUND(1);
+ ROUND(2);
+ ROUND(3);
+ ROUND(4);
+ ROUND(5);
+ ROUND(6);
+ ROUND(7);
+ ROUND(8);
+ ROUND(9);
+ ROUND(10);
+ ROUND(11);
+
+#undef G
+#undef ROUND
+
+ S->h[0] = S->h[0] ^ v[0] ^ v[0 + 8];
+ S->h[1] = S->h[1] ^ v[1] ^ v[1 + 8];
+ S->h[2] = S->h[2] ^ v[2] ^ v[2 + 8];
+ S->h[3] = S->h[3] ^ v[3] ^ v[3 + 8];
+ S->h[4] = S->h[4] ^ v[4] ^ v[4 + 8];
+ S->h[5] = S->h[5] ^ v[5] ^ v[5 + 8];
+ S->h[6] = S->h[6] ^ v[6] ^ v[6 + 8];
+ S->h[7] = S->h[7] ^ v[7] ^ v[7 + 8];
+
+ in += BLAKE2B_BLOCKBYTES;
+ }
+
+ return sizeof(void *) * 4 + sizeof(u64) * 16 * 2;
+}
+
+#ifdef USE_AVX2
+unsigned int _gcry_blake2b_transform_amd64_avx2(BLAKE2B_STATE *S,
+ const void *inblks,
+ size_t nblks) ASM_FUNC_ABI;
+#endif
+
+static unsigned int blake2b_transform(void *ctx, const void *inblks,
+ size_t nblks)
+{
+ BLAKE2B_CONTEXT *c = ctx;
+ unsigned int nburn;
+
+ if (0)
+ {}
+#ifdef USE_AVX2
+ else if (c->use_avx2)
+ nburn = _gcry_blake2b_transform_amd64_avx2(&c->state, inblks, nblks);
+#endif
+ else
+ nburn = blake2b_transform_generic(&c->state, inblks, nblks);
+
+ if (nburn)
+ nburn += ASM_EXTRA_STACK;
+
+ return nburn;
+}
+
+static void blake2b_final(void *ctx)
+{
+ BLAKE2B_CONTEXT *c = ctx;
+ BLAKE2B_STATE *S = &c->state;
+ unsigned int burn;
+ size_t i;
+
+ gcry_assert (sizeof(c->buf) >= c->outlen);
+ if (blake2b_is_lastblock(S))
+ return;
+
+ if (c->buflen < BLAKE2B_BLOCKBYTES)
+ memset (c->buf + c->buflen, 0, BLAKE2B_BLOCKBYTES - c->buflen); /* Padding */
+ blake2b_set_lastblock (S);
+ blake2b_increment_counter (S, (int)c->buflen - BLAKE2B_BLOCKBYTES);
+ burn = blake2b_transform (ctx, c->buf, 1);
+
+ /* Output full hash to buffer */
+ for (i = 0; i < 8; ++i)
+ buf_put_le64 (c->buf + sizeof(S->h[i]) * i, S->h[i]);
+
+ /* Zero out extra buffer bytes. */
+ if (c->outlen < sizeof(c->buf))
+ memset (c->buf + c->outlen, 0, sizeof(c->buf) - c->outlen);
+
+ if (burn)
+ _gcry_burn_stack (burn);
+}
+
+static byte *blake2b_read(void *ctx)
+{
+ BLAKE2B_CONTEXT *c = ctx;
+ return c->buf;
+}
+
+static void blake2b_write(void *ctx, const void *inbuf, size_t inlen)
+{
+ BLAKE2B_CONTEXT *c = ctx;
+ BLAKE2B_STATE *S = &c->state;
+ blake2_write(S, inbuf, inlen, c->buf, &c->buflen, BLAKE2B_BLOCKBYTES,
+ blake2b_transform);
+}
+
+static inline void blake2b_init_param(BLAKE2B_STATE *S,
+ const struct blake2b_param_s *P)
+{
+ const byte *p = (const byte *)P;
+ size_t i;
+
+ /* init xors IV with input parameter block */
+
+ /* IV XOR ParamBlock */
+ for (i = 0; i < 8; ++i)
+ S->h[i] = blake2b_IV[i] ^ buf_get_le64(p + sizeof(S->h[i]) * i);
+}
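
Because blake2b_init below leaves everything past the first four bytes of the parameter block at zero, only the first little-endian word differs from the raw IV. A small worked example under the usual unkeyed BLAKE2b-512 parameters (digest_length 64, fanout 1, depth 1):

  #include <stdint.h>
  #include <stdio.h>

  int main (void)
  {
    const uint64_t iv0 = 0x6a09e667f3bcc908ULL;       /* blake2b_IV[0] */
    unsigned digest_length = 64, key_length = 0, fanout = 1, depth = 1;
    uint64_t p0 = (uint64_t)depth << 24 | (uint64_t)fanout << 16
                  | (uint64_t)key_length << 8 | digest_length;

    /* Should print 6a09e667f2bdc948, the familiar unkeyed BLAKE2b-512 h[0]. */
    printf ("h[0] = %016llx\n", (unsigned long long)(iv0 ^ p0));
    return 0;
  }
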
+
+static inline gcry_err_code_t blake2b_init(BLAKE2B_CONTEXT *ctx,
+ const byte *key, size_t keylen)
+{
+ struct blake2b_param_s P[1] = { { 0, } };
+ BLAKE2B_STATE *S = &ctx->state;
+
+ if (!ctx->outlen || ctx->outlen > BLAKE2B_OUTBYTES)
+ return GPG_ERR_INV_ARG;
+ if (sizeof(P[0]) != sizeof(u64) * 8)
+ return GPG_ERR_INTERNAL;
+ if (keylen && (!key || keylen > BLAKE2B_KEYBYTES))
+ return GPG_ERR_INV_KEYLEN;
+
+ P->digest_length = ctx->outlen;
+ P->key_length = keylen;
+ P->fanout = 1;
+ P->depth = 1;
+
+ blake2b_init_param (S, P);
+ wipememory (P, sizeof(P));
+
+ if (key)
+ {
+ blake2b_write (ctx, key, keylen);
+ blake2b_write (ctx, zero_block, BLAKE2B_BLOCKBYTES - keylen);
+ }
+
+ return 0;
+}
+
+static gcry_err_code_t blake2b_init_ctx(void *ctx, unsigned int flags,
+ const byte *key, size_t keylen,
+ unsigned int dbits)
+{
+ BLAKE2B_CONTEXT *c = ctx;
+ unsigned int features = _gcry_get_hw_features ();
+
+ (void)features;
+ (void)flags;
+
+ memset (c, 0, sizeof (*c));
+
+#ifdef USE_AVX2
+ c->use_avx2 = !!(features & HWF_INTEL_AVX2);
+#endif
+
+ c->outlen = dbits / 8;
+ c->buflen = 0;
+ return blake2b_init(c, key, keylen);
+}
+
+static inline void blake2s_set_lastblock(BLAKE2S_STATE *S)
+{
+ S->f[0] = 0xFFFFFFFFUL;
+}
+
+static inline int blake2s_is_lastblock(BLAKE2S_STATE *S)
+{
+ return S->f[0] != 0;
+}
+
+static inline void blake2s_increment_counter(BLAKE2S_STATE *S, const int inc)
+{
+ S->t[0] += (u32)inc;
+ S->t[1] += (S->t[0] < (u32)inc) - (inc < 0);
+}
+
+static unsigned int blake2s_transform_generic(BLAKE2S_STATE *S,
+ const void *inblks,
+ size_t nblks)
+{
+ static const byte blake2s_sigma[10][16] =
+ {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+ };
+ unsigned int burn = 0;
+ const byte* in = inblks;
+ u32 m[16];
+ u32 v[16];
+
+ while (nblks--)
+ {
+ /* Increment counter */
+ blake2s_increment_counter (S, BLAKE2S_BLOCKBYTES);
+
+ /* Compress */
+ m[0] = buf_get_le32 (in + 0 * sizeof(m[0]));
+ m[1] = buf_get_le32 (in + 1 * sizeof(m[0]));
+ m[2] = buf_get_le32 (in + 2 * sizeof(m[0]));
+ m[3] = buf_get_le32 (in + 3 * sizeof(m[0]));
+ m[4] = buf_get_le32 (in + 4 * sizeof(m[0]));
+ m[5] = buf_get_le32 (in + 5 * sizeof(m[0]));
+ m[6] = buf_get_le32 (in + 6 * sizeof(m[0]));
+ m[7] = buf_get_le32 (in + 7 * sizeof(m[0]));
+ m[8] = buf_get_le32 (in + 8 * sizeof(m[0]));
+ m[9] = buf_get_le32 (in + 9 * sizeof(m[0]));
+ m[10] = buf_get_le32 (in + 10 * sizeof(m[0]));
+ m[11] = buf_get_le32 (in + 11 * sizeof(m[0]));
+ m[12] = buf_get_le32 (in + 12 * sizeof(m[0]));
+ m[13] = buf_get_le32 (in + 13 * sizeof(m[0]));
+ m[14] = buf_get_le32 (in + 14 * sizeof(m[0]));
+ m[15] = buf_get_le32 (in + 15 * sizeof(m[0]));
+
+ v[ 0] = S->h[0];
+ v[ 1] = S->h[1];
+ v[ 2] = S->h[2];
+ v[ 3] = S->h[3];
+ v[ 4] = S->h[4];
+ v[ 5] = S->h[5];
+ v[ 6] = S->h[6];
+ v[ 7] = S->h[7];
+ v[ 8] = blake2s_IV[0];
+ v[ 9] = blake2s_IV[1];
+ v[10] = blake2s_IV[2];
+ v[11] = blake2s_IV[3];
+ v[12] = S->t[0] ^ blake2s_IV[4];
+ v[13] = S->t[1] ^ blake2s_IV[5];
+ v[14] = S->f[0] ^ blake2s_IV[6];
+ v[15] = S->f[1] ^ blake2s_IV[7];
+
+#define G(r,i,a,b,c,d) \
+ do { \
+ a = a + b + m[blake2s_sigma[r][2*i+0]]; \
+ d = ror(d ^ a, 16); \
+ c = c + d; \
+ b = ror(b ^ c, 12); \
+ a = a + b + m[blake2s_sigma[r][2*i+1]]; \
+ d = ror(d ^ a, 8); \
+ c = c + d; \
+ b = ror(b ^ c, 7); \
+ } while(0)
+
+#define ROUND(r) \
+ do { \
+ G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+ G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+ G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+ G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+ G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+ G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+ G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+ G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
+ } while(0)
+
+ ROUND(0);
+ ROUND(1);
+ ROUND(2);
+ ROUND(3);
+ ROUND(4);
+ ROUND(5);
+ ROUND(6);
+ ROUND(7);
+ ROUND(8);
+ ROUND(9);
+
+#undef G
+#undef ROUND
+
+ S->h[0] = S->h[0] ^ v[0] ^ v[0 + 8];
+ S->h[1] = S->h[1] ^ v[1] ^ v[1 + 8];
+ S->h[2] = S->h[2] ^ v[2] ^ v[2 + 8];
+ S->h[3] = S->h[3] ^ v[3] ^ v[3 + 8];
+ S->h[4] = S->h[4] ^ v[4] ^ v[4 + 8];
+ S->h[5] = S->h[5] ^ v[5] ^ v[5 + 8];
+ S->h[6] = S->h[6] ^ v[6] ^ v[6 + 8];
+ S->h[7] = S->h[7] ^ v[7] ^ v[7 + 8];
+
+ in += BLAKE2S_BLOCKBYTES;
+ }
+
+ return burn;
+}
+
+#ifdef USE_AVX
+unsigned int _gcry_blake2s_transform_amd64_avx(BLAKE2S_STATE *S,
+ const void *inblks,
+ size_t nblks) ASM_FUNC_ABI;
+#endif
+
+static unsigned int blake2s_transform(void *ctx, const void *inblks,
+ size_t nblks)
+{
+ BLAKE2S_CONTEXT *c = ctx;
+ unsigned int nburn;
+
+ if (0)
+ {}
+#ifdef USE_AVX
+ else if (c->use_avx)
+ nburn = _gcry_blake2s_transform_amd64_avx(&c->state, inblks, nblks);
+#endif
+ else
+ nburn = blake2s_transform_generic(&c->state, inblks, nblks);
+
+ if (nburn)
+ nburn += ASM_EXTRA_STACK;
+
+ return nburn;
+}
+
+static void blake2s_final(void *ctx)
+{
+ BLAKE2S_CONTEXT *c = ctx;
+ BLAKE2S_STATE *S = &c->state;
+ unsigned int burn;
+ size_t i;
+
+ gcry_assert (sizeof(c->buf) >= c->outlen);
+ if (blake2s_is_lastblock(S))
+ return;
+
+ if (c->buflen < BLAKE2S_BLOCKBYTES)
+ memset (c->buf + c->buflen, 0, BLAKE2S_BLOCKBYTES - c->buflen); /* Padding */
+ blake2s_set_lastblock (S);
+ blake2s_increment_counter (S, (int)c->buflen - BLAKE2S_BLOCKBYTES);
+ burn = blake2s_transform (ctx, c->buf, 1);
+
+ /* Output full hash to buffer */
+ for (i = 0; i < 8; ++i)
+ buf_put_le32 (c->buf + sizeof(S->h[i]) * i, S->h[i]);
+
+ /* Zero out extra buffer bytes. */
+ if (c->outlen < sizeof(c->buf))
+ memset (c->buf + c->outlen, 0, sizeof(c->buf) - c->outlen);
+
+ if (burn)
+ _gcry_burn_stack (burn);
+}
+
+static byte *blake2s_read(void *ctx)
+{
+ BLAKE2S_CONTEXT *c = ctx;
+ return c->buf;
+}
+
+static void blake2s_write(void *ctx, const void *inbuf, size_t inlen)
+{
+ BLAKE2S_CONTEXT *c = ctx;
+ BLAKE2S_STATE *S = &c->state;
+ blake2_write(S, inbuf, inlen, c->buf, &c->buflen, BLAKE2S_BLOCKBYTES,
+ blake2s_transform);
+}
+
+static inline void blake2s_init_param(BLAKE2S_STATE *S,
+ const struct blake2s_param_s *P)
+{
+ const byte *p = (const byte *)P;
+ size_t i;
+
+ /* init2 xors IV with input parameter block */
+
+ /* IV XOR ParamBlock */
+ for (i = 0; i < 8; ++i)
+ S->h[i] ^= blake2s_IV[i] ^ buf_get_le32(&p[i * 4]);
+}
+
+static inline gcry_err_code_t blake2s_init(BLAKE2S_CONTEXT *ctx,
+ const byte *key, size_t keylen)
+{
+ struct blake2s_param_s P[1] = { { 0, } };
+ BLAKE2S_STATE *S = &ctx->state;
+
+ if (!ctx->outlen || ctx->outlen > BLAKE2S_OUTBYTES)
+ return GPG_ERR_INV_ARG;
+ if (sizeof(P[0]) != sizeof(u32) * 8)
+ return GPG_ERR_INTERNAL;
+ if (keylen && (!key || keylen > BLAKE2S_KEYBYTES))
+ return GPG_ERR_INV_KEYLEN;
+
+ P->digest_length = ctx->outlen;
+ P->key_length = keylen;
+ P->fanout = 1;
+ P->depth = 1;
+
+ blake2s_init_param (S, P);
+ wipememory (P, sizeof(P));
+
+ if (key)
+ {
+ blake2s_write (ctx, key, keylen);
+ blake2s_write (ctx, zero_block, BLAKE2S_BLOCKBYTES - keylen);
+ }
+
+ return 0;
+}
+
+static gcry_err_code_t blake2s_init_ctx(void *ctx, unsigned int flags,
+ const byte *key, size_t keylen,
+ unsigned int dbits)
+{
+ BLAKE2S_CONTEXT *c = ctx;
+ unsigned int features = _gcry_get_hw_features ();
+
+ (void)features;
+ (void)flags;
+
+ memset (c, 0, sizeof (*c));
+
+#ifdef USE_AVX
+ c->use_avx = !!(features & HWF_INTEL_AVX);
+#endif
+
+ c->outlen = dbits / 8;
+ c->buflen = 0;
+ return blake2s_init(c, key, keylen);
+}
+
+/* Selftests from "RFC 7693, Appendix E. BLAKE2b and BLAKE2s Self-Test
+ * Module C Source". */
+static void selftest_seq(byte *out, size_t len, u32 seed)
+{
+ size_t i;
+ u32 t, a, b;
+
+ a = 0xDEAD4BAD * seed;
+ b = 1;
+
+ for (i = 0; i < len; i++)
+ {
+ t = a + b;
+ a = b;
+ b = t;
+ out[i] = (t >> 24) & 0xFF;
+ }
+}
+
+static gpg_err_code_t
+selftests_blake2b (int algo, int extended, selftest_report_func_t report)
+{
+ static const byte blake2b_res[32] =
+ {
+ 0xC2, 0x3A, 0x78, 0x00, 0xD9, 0x81, 0x23, 0xBD,
+ 0x10, 0xF5, 0x06, 0xC6, 0x1E, 0x29, 0xDA, 0x56,
+ 0x03, 0xD7, 0x63, 0xB8, 0xBB, 0xAD, 0x2E, 0x73,
+ 0x7F, 0x5E, 0x76, 0x5A, 0x7B, 0xCC, 0xD4, 0x75
+ };
+ static const size_t b2b_md_len[4] = { 20, 32, 48, 64 };
+ static const size_t b2b_in_len[6] = { 0, 3, 128, 129, 255, 1024 };
+ size_t i, j, outlen, inlen;
+ byte in[1024], key[64];
+ BLAKE2B_CONTEXT ctx;
+ BLAKE2B_CONTEXT ctx2;
+ const char *what;
+ const char *errtxt;
+
+ (void)extended;
+
+ what = "rfc7693 BLAKE2b selftest";
+
+ /* 256-bit hash for testing */
+ if (blake2b_init_ctx(&ctx, 0, NULL, 0, 32 * 8))
+ {
+ errtxt = "init failed";
+ goto failed;
+ }
+
+ for (i = 0; i < 4; i++)
+ {
+ outlen = b2b_md_len[i];
+ for (j = 0; j < 6; j++)
+ {
+ inlen = b2b_in_len[j];
+
+ selftest_seq(in, inlen, inlen); /* unkeyed hash */
+ blake2b_init_ctx(&ctx2, 0, NULL, 0, outlen * 8);
+ blake2b_write(&ctx2, in, inlen);
+ blake2b_final(&ctx2);
+ blake2b_write(&ctx, ctx2.buf, outlen); /* hash the hash */
+
+ selftest_seq(key, outlen, outlen); /* keyed hash */
+ blake2b_init_ctx(&ctx2, 0, key, outlen, outlen * 8);
+ blake2b_write(&ctx2, in, inlen);
+ blake2b_final(&ctx2);
+ blake2b_write(&ctx, ctx2.buf, outlen); /* hash the hash */
+ }
+ }
+
+ /* compute and compare the hash of hashes */
+ blake2b_final(&ctx);
+ for (i = 0; i < 32; i++)
+ {
+ if (ctx.buf[i] != blake2b_res[i])
+ {
+ errtxt = "digest mismatch";
+ goto failed;
+ }
+ }
+
+ return 0;
+
+failed:
+ if (report)
+ report ("digest", algo, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+static gpg_err_code_t
+selftests_blake2s (int algo, int extended, selftest_report_func_t report)
+{
+ static const byte blake2s_res[32] =
+ {
+ 0x6A, 0x41, 0x1F, 0x08, 0xCE, 0x25, 0xAD, 0xCD,
+ 0xFB, 0x02, 0xAB, 0xA6, 0x41, 0x45, 0x1C, 0xEC,
+ 0x53, 0xC5, 0x98, 0xB2, 0x4F, 0x4F, 0xC7, 0x87,
+ 0xFB, 0xDC, 0x88, 0x79, 0x7F, 0x4C, 0x1D, 0xFE
+ };
+ static const size_t b2s_md_len[4] = { 16, 20, 28, 32 };
+ static const size_t b2s_in_len[6] = { 0, 3, 64, 65, 255, 1024 };
+ size_t i, j, outlen, inlen;
+ byte in[1024], key[32];
+ BLAKE2S_CONTEXT ctx;
+ BLAKE2S_CONTEXT ctx2;
+ const char *what;
+ const char *errtxt;
+
+ (void)extended;
+
+ what = "rfc7693 BLAKE2s selftest";
+
+ /* 256-bit hash for testing */
+ if (blake2s_init_ctx(&ctx, 0, NULL, 0, 32 * 8))
+ {
+ errtxt = "init failed";
+ goto failed;
+ }
+
+ for (i = 0; i < 4; i++)
+ {
+ outlen = b2s_md_len[i];
+ for (j = 0; j < 6; j++)
+ {
+ inlen = b2s_in_len[j];
+
+ selftest_seq(in, inlen, inlen); /* unkeyed hash */
+ blake2s_init_ctx(&ctx2, 0, NULL, 0, outlen * 8);
+ blake2s_write(&ctx2, in, inlen);
+ blake2s_final(&ctx2);
+ blake2s_write(&ctx, ctx2.buf, outlen); /* hash the hash */
+
+ selftest_seq(key, outlen, outlen); /* keyed hash */
+ blake2s_init_ctx(&ctx2, 0, key, outlen, outlen * 8);
+ blake2s_write(&ctx2, in, inlen);
+ blake2s_final(&ctx2);
+ blake2s_write(&ctx, ctx2.buf, outlen); /* hash the hash */
+ }
+ }
+
+ /* compute and compare the hash of hashes */
+ blake2s_final(&ctx);
+ for (i = 0; i < 32; i++)
+ {
+ if (ctx.buf[i] != blake2s_res[i])
+ {
+ errtxt = "digest mismatch";
+ goto failed;
+ }
+ }
+
+ return 0;
+
+failed:
+ if (report)
+ report ("digest", algo, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+gcry_err_code_t _gcry_blake2_init_with_key(void *ctx, unsigned int flags,
+ const unsigned char *key,
+ size_t keylen, int algo)
+{
+ gcry_err_code_t rc;
+ switch (algo)
+ {
+ case GCRY_MD_BLAKE2B_512:
+ rc = blake2b_init_ctx (ctx, flags, key, keylen, 512);
+ break;
+ case GCRY_MD_BLAKE2B_384:
+ rc = blake2b_init_ctx (ctx, flags, key, keylen, 384);
+ break;
+ case GCRY_MD_BLAKE2B_256:
+ rc = blake2b_init_ctx (ctx, flags, key, keylen, 256);
+ break;
+ case GCRY_MD_BLAKE2B_160:
+ rc = blake2b_init_ctx (ctx, flags, key, keylen, 160);
+ break;
+ case GCRY_MD_BLAKE2S_256:
+ rc = blake2s_init_ctx (ctx, flags, key, keylen, 256);
+ break;
+ case GCRY_MD_BLAKE2S_224:
+ rc = blake2s_init_ctx (ctx, flags, key, keylen, 224);
+ break;
+ case GCRY_MD_BLAKE2S_160:
+ rc = blake2s_init_ctx (ctx, flags, key, keylen, 160);
+ break;
+ case GCRY_MD_BLAKE2S_128:
+ rc = blake2s_init_ctx (ctx, flags, key, keylen, 128);
+ break;
+ default:
+ rc = GPG_ERR_DIGEST_ALGO;
+ break;
+ }
+
+ return rc;
+}
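
This is the entry point that makes keyed BLAKE2 reachable from the generic md layer. A sketch of keyed BLAKE2b-512 through the public libgcrypt API, with error handling trimmed, placeholder key/message data, and assuming gcry_md_setkey routes to the native keyed init for BLAKE2 algorithms:

  #include <gcrypt.h>
  #include <stdio.h>
  #include <string.h>

  int main (void)
  {
    gcry_md_hd_t hd;
    static const char key[] = "an arbitrary key";     /* <= 64 bytes for BLAKE2b */
    static const char msg[] = "hello";
    unsigned char *digest;
    unsigned int dlen = gcry_md_get_algo_dlen (GCRY_MD_BLAKE2B_512);

    gcry_check_version (NULL);
    if (gcry_md_open (&hd, GCRY_MD_BLAKE2B_512, 0))
      return 1;
    if (gcry_md_setkey (hd, key, strlen (key)))       /* keyed BLAKE2, no HMAC flag */
      return 1;
    gcry_md_write (hd, msg, strlen (msg));
    digest = gcry_md_read (hd, GCRY_MD_BLAKE2B_512);
    for (unsigned int i = 0; i < dlen; i++)
      printf ("%02x", digest[i]);
    putchar ('\n');
    gcry_md_close (hd);
    return 0;
  }
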
+
+
+#define DEFINE_BLAKE2_VARIANT(bs, BS, dbits, oid_branch) \
+ static void blake2##bs##_##dbits##_init(void *ctx, unsigned int flags) \
+ { \
+ int err = blake2##bs##_init_ctx (ctx, flags, NULL, 0, dbits); \
+ gcry_assert (err == 0); \
+ } \
+ static void \
+ _gcry_blake2##bs##_##dbits##_hash_buffer(void *outbuf, \
+ const void *buffer, size_t length) \
+ { \
+ BLAKE2##BS##_CONTEXT hd; \
+ blake2##bs##_##dbits##_init (&hd, 0); \
+ blake2##bs##_write (&hd, buffer, length); \
+ blake2##bs##_final (&hd); \
+ memcpy (outbuf, blake2##bs##_read (&hd), dbits / 8); \
+ } \
+ static void \
+ _gcry_blake2##bs##_##dbits##_hash_buffers(void *outbuf, \
+ const gcry_buffer_t *iov, int iovcnt) \
+ { \
+ BLAKE2##BS##_CONTEXT hd; \
+ blake2##bs##_##dbits##_init (&hd, 0); \
+ for (;iovcnt > 0; iov++, iovcnt--) \
+ blake2##bs##_write (&hd, (const char*)iov[0].data + iov[0].off, \
+ iov[0].len); \
+ blake2##bs##_final (&hd); \
+ memcpy (outbuf, blake2##bs##_read (&hd), dbits / 8); \
+ } \
+ static byte blake2##bs##_##dbits##_asn[] = { 0x30 }; \
+ static gcry_md_oid_spec_t oid_spec_blake2##bs##_##dbits[] = \
+ { \
+ { " 1.3.6.1.4.1.1722.12.2." oid_branch }, \
+ { NULL } \
+ }; \
+ gcry_md_spec_t _gcry_digest_spec_blake2##bs##_##dbits = \
+ { \
+ GCRY_MD_BLAKE2##BS##_##dbits, {0, 0}, \
+ "BLAKE2" #BS "_" #dbits, blake2##bs##_##dbits##_asn, \
+ DIM (blake2##bs##_##dbits##_asn), oid_spec_blake2##bs##_##dbits, \
+ dbits / 8, blake2##bs##_##dbits##_init, blake2##bs##_write, \
+ blake2##bs##_final, blake2##bs##_read, NULL, \
+ _gcry_blake2##bs##_##dbits##_hash_buffer, \
+ _gcry_blake2##bs##_##dbits##_hash_buffers, \
+ sizeof (BLAKE2##BS##_CONTEXT), selftests_blake2##bs \
+ };
+
+DEFINE_BLAKE2_VARIANT(b, B, 512, "1.16")
+DEFINE_BLAKE2_VARIANT(b, B, 384, "1.12")
+DEFINE_BLAKE2_VARIANT(b, B, 256, "1.8")
+DEFINE_BLAKE2_VARIANT(b, B, 160, "1.5")
+
+DEFINE_BLAKE2_VARIANT(s, S, 256, "2.8")
+DEFINE_BLAKE2_VARIANT(s, S, 224, "2.7")
+DEFINE_BLAKE2_VARIANT(s, S, 160, "2.5")
+DEFINE_BLAKE2_VARIANT(s, S, 128, "2.4")
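
Each expansion registers one digest spec (init, write, final, read and the two one-shot helpers) under its own OID branch; callers normally never see that machinery. A one-shot sketch through the public front end, assuming a standard libgcrypt build (the call should end up in the generated _gcry_blake2b_512_hash_buffer):

  #include <gcrypt.h>
  #include <stdio.h>

  int main (void)
  {
    unsigned char out[64];                            /* BLAKE2b-512 digest length */

    gcry_check_version (NULL);
    gcry_md_hash_buffer (GCRY_MD_BLAKE2B_512, out, "abc", 3);
    for (int i = 0; i < 64; i++)
      printf ("%02x", out[i]);
    putchar ('\n');
    return 0;
  }
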
diff --git a/comm/third_party/libgcrypt/cipher/blake2b-amd64-avx2.S b/comm/third_party/libgcrypt/cipher/blake2b-amd64-avx2.S
new file mode 100644
index 0000000000..357e8a5167
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/blake2b-amd64-avx2.S
@@ -0,0 +1,300 @@
+/* blake2b-amd64-avx2.S - AVX2 implementation of BLAKE2b
+ *
+ * Copyright (C) 2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* The code is based on public-domain/CC0 BLAKE2 reference implementation
+ * by Samuel Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/sse
+ * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX2) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+.text
+
+/* register macros */
+#define RSTATE %rdi
+#define RINBLKS %rsi
+#define RNBLKS %rdx
+#define RIV %rcx
+
+/* state structure */
+#define STATE_H 0
+#define STATE_T (STATE_H + 8 * 8)
+#define STATE_F (STATE_T + 2 * 8)
+
+/* vector registers */
+#define ROW1 %ymm0
+#define ROW2 %ymm1
+#define ROW3 %ymm2
+#define ROW4 %ymm3
+#define TMP1 %ymm4
+#define TMP1x %xmm4
+#define R16 %ymm5
+#define R24 %ymm6
+
+#define MA1 %ymm8
+#define MA2 %ymm9
+#define MA3 %ymm10
+#define MA4 %ymm11
+#define MA1x %xmm8
+#define MA2x %xmm9
+#define MA3x %xmm10
+#define MA4x %xmm11
+
+#define MB1 %ymm12
+#define MB2 %ymm13
+#define MB3 %ymm14
+#define MB4 %ymm15
+#define MB1x %xmm12
+#define MB2x %xmm13
+#define MB3x %xmm14
+#define MB4x %xmm15
+
+/**********************************************************************
+ blake2b/AVX2
+ **********************************************************************/
+
+#define GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+ s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+ s9, s10, s11, s12, s13, s14, s15) \
+ vmovq (s0)*8(RINBLKS), m1x; \
+ vmovq (s4)*8(RINBLKS), TMP1x; \
+ vpinsrq $1, (s2)*8(RINBLKS), m1x, m1x; \
+ vpinsrq $1, (s6)*8(RINBLKS), TMP1x, TMP1x; \
+ vinserti128 $1, TMP1x, m1, m1; \
+ vmovq (s1)*8(RINBLKS), m2x; \
+ vmovq (s5)*8(RINBLKS), TMP1x; \
+ vpinsrq $1, (s3)*8(RINBLKS), m2x, m2x; \
+ vpinsrq $1, (s7)*8(RINBLKS), TMP1x, TMP1x; \
+ vinserti128 $1, TMP1x, m2, m2; \
+ vmovq (s8)*8(RINBLKS), m3x; \
+ vmovq (s12)*8(RINBLKS), TMP1x; \
+ vpinsrq $1, (s10)*8(RINBLKS), m3x, m3x; \
+ vpinsrq $1, (s14)*8(RINBLKS), TMP1x, TMP1x; \
+ vinserti128 $1, TMP1x, m3, m3; \
+ vmovq (s9)*8(RINBLKS), m4x; \
+ vmovq (s13)*8(RINBLKS), TMP1x; \
+ vpinsrq $1, (s11)*8(RINBLKS), m4x, m4x; \
+ vpinsrq $1, (s15)*8(RINBLKS), TMP1x, TMP1x; \
+ vinserti128 $1, TMP1x, m4, m4;
+
+#define LOAD_MSG_0(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+#define LOAD_MSG_1(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3)
+#define LOAD_MSG_2(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4)
+#define LOAD_MSG_3(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8)
+#define LOAD_MSG_4(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13)
+#define LOAD_MSG_5(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9)
+#define LOAD_MSG_6(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11)
+#define LOAD_MSG_7(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10)
+#define LOAD_MSG_8(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5)
+#define LOAD_MSG_9(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+ GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0)
+#define LOAD_MSG_10(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+ LOAD_MSG_0(m1, m2, m3, m4, m1x, m2x, m3x, m4x)
+#define LOAD_MSG_11(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+ LOAD_MSG_1(m1, m2, m3, m4, m1x, m2x, m3x, m4x)
+
+#define LOAD_MSG(r, m1, m2, m3, m4) \
+ LOAD_MSG_##r(m1, m2, m3, m4, m1##x, m2##x, m3##x, m4##x)
+
+#define ROR_32(in, out) vpshufd $0xb1, in, out;
+
+#define ROR_24(in, out) vpshufb R24, in, out;
+
+#define ROR_16(in, out) vpshufb R16, in, out;
+
+#define ROR_63(in, out) \
+ vpsrlq $63, in, TMP1; \
+ vpaddq in, in, out; \
+ vpxor TMP1, out, out;
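
ROR_63 relies on the identity rotr64(x, 63) == ((x + x) ^ (x >> 63)): vpaddq doubles each lane, which is a left shift by one modulo 2^64, and the xor re-inserts the bit that fell off the top. A throwaway scalar check of the identity:

  #include <assert.h>
  #include <stdint.h>

  static uint64_t rotr63 (uint64_t x) { return (x >> 63) | (x << 1); }

  int main (void)
  {
    uint64_t xs[] = { 0, 1, 0x8000000000000000ULL, 0xdeadbeefcafebabeULL, UINT64_MAX };

    for (unsigned i = 0; i < sizeof xs / sizeof *xs; i++)
      assert (rotr63 (xs[i]) == ((xs[i] + xs[i]) ^ (xs[i] >> 63)));
    return 0;
  }
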
+
+#define G(r1, r2, r3, r4, m, ROR_A, ROR_B) \
+ vpaddq m, r1, r1; \
+ vpaddq r2, r1, r1; \
+ vpxor r1, r4, r4; \
+ ROR_A(r4, r4); \
+ vpaddq r4, r3, r3; \
+ vpxor r3, r2, r2; \
+ ROR_B(r2, r2);
+
+#define G1(r1, r2, r3, r4, m) \
+ G(r1, r2, r3, r4, m, ROR_32, ROR_24);
+
+#define G2(r1, r2, r3, r4, m) \
+ G(r1, r2, r3, r4, m, ROR_16, ROR_63);
+
+#define MM_SHUFFLE(z,y,x,w) \
+ (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
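
MM_SHUFFLE packs four 2-bit lane selectors into the immediate byte vpermq expects; MM_SHUFFLE(0,3,2,1) is 0x39 and picks source lanes 1, 2, 3, 0, i.e. a one-lane rotation of the row. A quick check of the packing:

  #include <assert.h>

  #define MM_SHUFFLE(z,y,x,w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

  int main (void)
  {
    assert (MM_SHUFFLE (0, 3, 2, 1) == 0x39);   /* picks lanes 1, 2, 3, 0 */
    assert (MM_SHUFFLE (1, 0, 3, 2) == 0x4e);   /* picks lanes 2, 3, 0, 1 */
    assert (MM_SHUFFLE (2, 1, 0, 3) == 0x93);   /* picks lanes 3, 0, 1, 2 */
    return 0;
  }
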
+
+#define DIAGONALIZE(r1, r2, r3, r4) \
+ vpermq $MM_SHUFFLE(0,3,2,1), r2, r2; \
+ vpermq $MM_SHUFFLE(1,0,3,2), r3, r3; \
+ vpermq $MM_SHUFFLE(2,1,0,3), r4, r4;
+
+#define UNDIAGONALIZE(r1, r2, r3, r4) \
+ vpermq $MM_SHUFFLE(2,1,0,3), r2, r2; \
+ vpermq $MM_SHUFFLE(1,0,3,2), r3, r3; \
+ vpermq $MM_SHUFFLE(0,3,2,1), r4, r4;
+
+#define ROUND(r, m1, m2, m3, m4) \
+ G1(ROW1, ROW2, ROW3, ROW4, m1); \
+ G2(ROW1, ROW2, ROW3, ROW4, m2); \
+ DIAGONALIZE(ROW1, ROW2, ROW3, ROW4); \
+ G1(ROW1, ROW2, ROW3, ROW4, m3); \
+ G2(ROW1, ROW2, ROW3, ROW4, m4); \
+ UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4);
+
+blake2b_data:
+.align 32
+.Liv:
+ .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
+ .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
+ .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f
+ .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
+.Lshuf_ror16:
+ .byte 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9
+.Lshuf_ror24:
+ .byte 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10
+
+.align 64
+.globl _gcry_blake2b_transform_amd64_avx2
+ELF(.type _gcry_blake2b_transform_amd64_avx2,@function;)
+
+_gcry_blake2b_transform_amd64_avx2:
+ /* input:
+ * %rdi: state
+ * %rsi: blks
+ * %rdx: num_blks
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ addq $128, (STATE_T + 0)(RSTATE);
+ adcq $0, (STATE_T + 8)(RSTATE);
+
+ vbroadcasti128 .Lshuf_ror16 rRIP, R16;
+ vbroadcasti128 .Lshuf_ror24 rRIP, R24;
+
+ vmovdqa .Liv+(0 * 8) rRIP, ROW3;
+ vmovdqa .Liv+(4 * 8) rRIP, ROW4;
+
+ vmovdqu (STATE_H + 0 * 8)(RSTATE), ROW1;
+ vmovdqu (STATE_H + 4 * 8)(RSTATE), ROW2;
+
+ vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+ LOAD_MSG(0, MA1, MA2, MA3, MA4);
+ LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+.Loop:
+ ROUND(0, MA1, MA2, MA3, MA4);
+ LOAD_MSG(2, MA1, MA2, MA3, MA4);
+ ROUND(1, MB1, MB2, MB3, MB4);
+ LOAD_MSG(3, MB1, MB2, MB3, MB4);
+ ROUND(2, MA1, MA2, MA3, MA4);
+ LOAD_MSG(4, MA1, MA2, MA3, MA4);
+ ROUND(3, MB1, MB2, MB3, MB4);
+ LOAD_MSG(5, MB1, MB2, MB3, MB4);
+ ROUND(4, MA1, MA2, MA3, MA4);
+ LOAD_MSG(6, MA1, MA2, MA3, MA4);
+ ROUND(5, MB1, MB2, MB3, MB4);
+ LOAD_MSG(7, MB1, MB2, MB3, MB4);
+ ROUND(6, MA1, MA2, MA3, MA4);
+ LOAD_MSG(8, MA1, MA2, MA3, MA4);
+ ROUND(7, MB1, MB2, MB3, MB4);
+ LOAD_MSG(9, MB1, MB2, MB3, MB4);
+ ROUND(8, MA1, MA2, MA3, MA4);
+ LOAD_MSG(10, MA1, MA2, MA3, MA4);
+ ROUND(9, MB1, MB2, MB3, MB4);
+ LOAD_MSG(11, MB1, MB2, MB3, MB4);
+ sub $1, RNBLKS;
+ jz .Loop_end;
+
+ lea 128(RINBLKS), RINBLKS;
+ addq $128, (STATE_T + 0)(RSTATE);
+ adcq $0, (STATE_T + 8)(RSTATE);
+
+ ROUND(10, MA1, MA2, MA3, MA4);
+ LOAD_MSG(0, MA1, MA2, MA3, MA4);
+ ROUND(11, MB1, MB2, MB3, MB4);
+ LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+ vpxor ROW3, ROW1, ROW1;
+ vpxor ROW4, ROW2, ROW2;
+
+ vmovdqa .Liv+(0 * 8) rRIP, ROW3;
+ vmovdqa .Liv+(4 * 8) rRIP, ROW4;
+
+ vpxor (STATE_H + 0 * 8)(RSTATE), ROW1, ROW1;
+ vpxor (STATE_H + 4 * 8)(RSTATE), ROW2, ROW2;
+
+ vmovdqu ROW1, (STATE_H + 0 * 8)(RSTATE);
+ vmovdqu ROW2, (STATE_H + 4 * 8)(RSTATE);
+
+ vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+ jmp .Loop;
+
+.Loop_end:
+ ROUND(10, MA1, MA2, MA3, MA4);
+ ROUND(11, MB1, MB2, MB3, MB4);
+
+ vpxor ROW3, ROW1, ROW1;
+ vpxor ROW4, ROW2, ROW2;
+ vpxor (STATE_H + 0 * 8)(RSTATE), ROW1, ROW1;
+ vpxor (STATE_H + 4 * 8)(RSTATE), ROW2, ROW2;
+
+ vmovdqu ROW1, (STATE_H + 0 * 8)(RSTATE);
+ vmovdqu ROW2, (STATE_H + 4 * 8)(RSTATE);
+
+ xor %eax, %eax;
+ vzeroall;
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_blake2b_transform_amd64_avx2,
+ .-_gcry_blake2b_transform_amd64_avx2;)
+
+#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
+#endif /*__x86_64*/
diff --git a/comm/third_party/libgcrypt/cipher/blake2s-amd64-avx.S b/comm/third_party/libgcrypt/cipher/blake2s-amd64-avx.S
new file mode 100644
index 0000000000..5b93675871
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/blake2s-amd64-avx.S
@@ -0,0 +1,278 @@
+/* blake2s-amd64-avx.S - AVX implementation of BLAKE2s
+ *
+ * Copyright (C) 2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* The code is based on public-domain/CC0 BLAKE2 reference implementation
+ * by Samuel Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/sse
+ * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+.text
+
+/* register macros */
+#define RSTATE %rdi
+#define RINBLKS %rsi
+#define RNBLKS %rdx
+#define RIV %rcx
+
+/* state structure */
+#define STATE_H 0
+#define STATE_T (STATE_H + 8 * 4)
+#define STATE_F (STATE_T + 2 * 4)
+
+/* vector registers */
+#define ROW1 %xmm0
+#define ROW2 %xmm1
+#define ROW3 %xmm2
+#define ROW4 %xmm3
+#define TMP1 %xmm4
+#define TMP1x %xmm4
+#define R16 %xmm5
+#define R8 %xmm6
+
+#define MA1 %xmm8
+#define MA2 %xmm9
+#define MA3 %xmm10
+#define MA4 %xmm11
+
+#define MB1 %xmm12
+#define MB2 %xmm13
+#define MB3 %xmm14
+#define MB4 %xmm15
+
+/**********************************************************************
+ blake2s/AVX
+ **********************************************************************/
+
+#define GATHER_MSG(m1, m2, m3, m4, \
+ s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+ s9, s10, s11, s12, s13, s14, s15) \
+ vmovd (s0)*4(RINBLKS), m1; \
+ vmovd (s1)*4(RINBLKS), m2; \
+ vmovd (s8)*4(RINBLKS), m3; \
+ vmovd (s9)*4(RINBLKS), m4; \
+ vpinsrd $1, (s2)*4(RINBLKS), m1, m1; \
+ vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \
+ vpinsrd $1, (s10)*4(RINBLKS), m3, m3; \
+ vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \
+ vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \
+ vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \
+ vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \
+ vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \
+ vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \
+ vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \
+ vpinsrd $3, (s14)*4(RINBLKS), m3, m3; \
+ vpinsrd $3, (s15)*4(RINBLKS), m4, m4;
+
+#define LOAD_MSG_0(m1, m2, m3, m4) \
+ GATHER_MSG(m1, m2, m3, m4, \
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+#define LOAD_MSG_1(m1, m2, m3, m4) \
+ GATHER_MSG(m1, m2, m3, m4, \
+ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3)
+#define LOAD_MSG_2(m1, m2, m3, m4) \
+ GATHER_MSG(m1, m2, m3, m4, \
+ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4)
+#define LOAD_MSG_3(m1, m2, m3, m4) \
+ GATHER_MSG(m1, m2, m3, m4, \
+ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8)
+#define LOAD_MSG_4(m1, m2, m3, m4) \
+ GATHER_MSG(m1, m2, m3, m4, \
+ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13)
+#define LOAD_MSG_5(m1, m2, m3, m4) \
+ GATHER_MSG(m1, m2, m3, m4, \
+ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9)
+#define LOAD_MSG_6(m1, m2, m3, m4) \
+ GATHER_MSG(m1, m2, m3, m4, \
+ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11)
+#define LOAD_MSG_7(m1, m2, m3, m4) \
+ GATHER_MSG(m1, m2, m3, m4, \
+ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10)
+#define LOAD_MSG_8(m1, m2, m3, m4) \
+ GATHER_MSG(m1, m2, m3, m4, \
+ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5)
+#define LOAD_MSG_9(m1, m2, m3, m4) \
+ GATHER_MSG(m1, m2, m3, m4, \
+ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0)
+
+#define LOAD_MSG(r, m1, m2, m3, m4) LOAD_MSG_##r(m1, m2, m3, m4)
+
+#define ROR_16(in, out) vpshufb R16, in, out;
+
+#define ROR_8(in, out) vpshufb R8, in, out;
+
+#define ROR_12(in, out) \
+ vpsrld $12, in, TMP1; \
+ vpslld $(32 - 12), in, out; \
+ vpxor TMP1, out, out;
+
+#define ROR_7(in, out) \
+ vpsrld $7, in, TMP1; \
+ vpslld $(32 - 7), in, out; \
+ vpxor TMP1, out, out;
+
+#define G(r1, r2, r3, r4, m, ROR_A, ROR_B) \
+ vpaddd m, r1, r1; \
+ vpaddd r2, r1, r1; \
+ vpxor r1, r4, r4; \
+ ROR_A(r4, r4); \
+ vpaddd r4, r3, r3; \
+ vpxor r3, r2, r2; \
+ ROR_B(r2, r2);
+
+#define G1(r1, r2, r3, r4, m) \
+ G(r1, r2, r3, r4, m, ROR_16, ROR_12);
+
+#define G2(r1, r2, r3, r4, m) \
+ G(r1, r2, r3, r4, m, ROR_8, ROR_7);
+
+#define MM_SHUFFLE(z,y,x,w) \
+ (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+#define DIAGONALIZE(r1, r2, r3, r4) \
+ vpshufd $MM_SHUFFLE(0,3,2,1), r2, r2; \
+ vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \
+ vpshufd $MM_SHUFFLE(2,1,0,3), r4, r4;
+
+#define UNDIAGONALIZE(r1, r2, r3, r4) \
+ vpshufd $MM_SHUFFLE(2,1,0,3), r2, r2; \
+ vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \
+ vpshufd $MM_SHUFFLE(0,3,2,1), r4, r4;
+
+#define ROUND(r, m1, m2, m3, m4) \
+ G1(ROW1, ROW2, ROW3, ROW4, m1); \
+ G2(ROW1, ROW2, ROW3, ROW4, m2); \
+ DIAGONALIZE(ROW1, ROW2, ROW3, ROW4); \
+ G1(ROW1, ROW2, ROW3, ROW4, m3); \
+ G2(ROW1, ROW2, ROW3, ROW4, m4); \
+ UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4);
+
+blake2s_data:
+.align 16
+.Liv:
+ .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+ .long 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+.Lshuf_ror16:
+ .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+.Lshuf_ror8:
+ .byte 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12
+
+.align 64
+.globl _gcry_blake2s_transform_amd64_avx
+ELF(.type _gcry_blake2s_transform_amd64_avx,@function;)
+
+_gcry_blake2s_transform_amd64_avx:
+ /* input:
+ * %rdi: state
+ * %rsi: blks
+ * %rdx: num_blks
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ addq $64, (STATE_T + 0)(RSTATE);
+
+ vmovdqa .Lshuf_ror16 rRIP, R16;
+ vmovdqa .Lshuf_ror8 rRIP, R8;
+
+ vmovdqa .Liv+(0 * 4) rRIP, ROW3;
+ vmovdqa .Liv+(4 * 4) rRIP, ROW4;
+
+ vmovdqu (STATE_H + 0 * 4)(RSTATE), ROW1;
+ vmovdqu (STATE_H + 4 * 4)(RSTATE), ROW2;
+
+ vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+ LOAD_MSG(0, MA1, MA2, MA3, MA4);
+ LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+.Loop:
+ ROUND(0, MA1, MA2, MA3, MA4);
+ LOAD_MSG(2, MA1, MA2, MA3, MA4);
+ ROUND(1, MB1, MB2, MB3, MB4);
+ LOAD_MSG(3, MB1, MB2, MB3, MB4);
+ ROUND(2, MA1, MA2, MA3, MA4);
+ LOAD_MSG(4, MA1, MA2, MA3, MA4);
+ ROUND(3, MB1, MB2, MB3, MB4);
+ LOAD_MSG(5, MB1, MB2, MB3, MB4);
+ ROUND(4, MA1, MA2, MA3, MA4);
+ LOAD_MSG(6, MA1, MA2, MA3, MA4);
+ ROUND(5, MB1, MB2, MB3, MB4);
+ LOAD_MSG(7, MB1, MB2, MB3, MB4);
+ ROUND(6, MA1, MA2, MA3, MA4);
+ LOAD_MSG(8, MA1, MA2, MA3, MA4);
+ ROUND(7, MB1, MB2, MB3, MB4);
+ LOAD_MSG(9, MB1, MB2, MB3, MB4);
+ sub $1, RNBLKS;
+ jz .Loop_end;
+
+ lea 64(RINBLKS), RINBLKS;
+ addq $64, (STATE_T + 0)(RSTATE);
+
+ ROUND(8, MA1, MA2, MA3, MA4);
+ LOAD_MSG(0, MA1, MA2, MA3, MA4);
+ ROUND(9, MB1, MB2, MB3, MB4);
+ LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+ vpxor ROW3, ROW1, ROW1;
+ vpxor ROW4, ROW2, ROW2;
+
+ vmovdqa .Liv+(0 * 4) rRIP, ROW3;
+ vmovdqa .Liv+(4 * 4) rRIP, ROW4;
+
+ vpxor (STATE_H + 0 * 4)(RSTATE), ROW1, ROW1;
+ vpxor (STATE_H + 4 * 4)(RSTATE), ROW2, ROW2;
+
+ vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE);
+ vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE);
+
+ vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+ jmp .Loop;
+
+.Loop_end:
+ ROUND(8, MA1, MA2, MA3, MA4);
+ ROUND(9, MB1, MB2, MB3, MB4);
+
+ vpxor ROW3, ROW1, ROW1;
+ vpxor ROW4, ROW2, ROW2;
+ vpxor (STATE_H + 0 * 4)(RSTATE), ROW1, ROW1;
+ vpxor (STATE_H + 4 * 4)(RSTATE), ROW2, ROW2;
+
+ vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE);
+ vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE);
+
+ xor %eax, %eax;
+ vzeroall;
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_blake2s_transform_amd64_avx,
+ .-_gcry_blake2s_transform_amd64_avx;)
+
+#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
+#endif /*__x86_64*/
diff --git a/comm/third_party/libgcrypt/cipher/blowfish-amd64.S b/comm/third_party/libgcrypt/cipher/blowfish-amd64.S
new file mode 100644
index 0000000000..bdb361d7eb
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/blowfish-amd64.S
@@ -0,0 +1,601 @@
+/* blowfish-amd64.S - AMD64 assembly implementation of Blowfish cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(USE_BLOWFISH) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+.text
+
+/* structure of BLOWFISH_context: */
+#define s0 0
+#define s1 ((s0) + 256 * 4)
+#define s2 ((s1) + 256 * 4)
+#define s3 ((s2) + 256 * 4)
+#define p ((s3) + 256 * 4)
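
These offsets assume BLOWFISH_context is laid out as four 256-entry u32 S-boxes followed by the 18-word P-array, matching the declaration in blowfish.c. A compile-time mirror of that assumption (the struct name here is invented for illustration):

  #include <assert.h>
  #include <stddef.h>
  #include <stdint.h>

  struct blowfish_ctx_sketch          /* hypothetical mirror of BLOWFISH_context */
  {
    uint32_t s0[256], s1[256], s2[256], s3[256];
    uint32_t p[18];
  };

  int main (void)
  {
    assert (offsetof (struct blowfish_ctx_sketch, s1) == 1 * 256 * 4);
    assert (offsetof (struct blowfish_ctx_sketch, s3) == 3 * 256 * 4);
    assert (offsetof (struct blowfish_ctx_sketch, p)  == 4 * 256 * 4);
    return 0;
  }
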
+
+/* register macros */
+#define CTX %rdi
+#define RIO %rsi
+
+#define RX0 %rax
+#define RX1 %rbx
+#define RX2 %rcx
+#define RX3 %rdx
+
+#define RX0d %eax
+#define RX1d %ebx
+#define RX2d %ecx
+#define RX3d %edx
+
+#define RX0bl %al
+#define RX1bl %bl
+#define RX2bl %cl
+#define RX3bl %dl
+
+#define RX0bh %ah
+#define RX1bh %bh
+#define RX2bh %ch
+#define RX3bh %dh
+
+#define RT0 %rbp
+#define RT1 %rsi
+#define RT2 %r8
+#define RT3 %r9
+
+#define RT0d %ebp
+#define RT1d %esi
+#define RT2d %r8d
+#define RT3d %r9d
+
+#define RKEY %r10
+
+/***********************************************************************
+ * 1-way blowfish
+ ***********************************************************************/
+#define F() \
+ movzbl RX0bh, RT1d; \
+ movzbl RX0bl, RT3d; \
+ rorq $16, RX0; \
+ movzbl RX0bh, RT0d; \
+ movzbl RX0bl, RT2d; \
+ rorq $16, RX0; \
+ movl s0(CTX,RT0,4), RT0d; \
+ addl s1(CTX,RT2,4), RT0d; \
+ xorl s2(CTX,RT1,4), RT0d; \
+ addl s3(CTX,RT3,4), RT0d; \
+ xorq RT0, RX0;
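
F() evaluates the Blowfish round function on the 32-bit half held in the low word of RX0; because the two rorq $16 add up to a half swap, the result is xored into the other half, so two consecutive F() invocations form one Feistel round. The lookup itself, as a scalar C sketch (the s[4][256] parameter mirrors the S-box offsets above):

  #include <stdint.h>

  /* F(x) = ((s0[a] + s1[b]) ^ s2[c]) + s3[d], with a..d the bytes of x from
   * most to least significant and 32-bit wrap-around addition. */
  uint32_t blowfish_F (const uint32_t s[4][256], uint32_t x)
  {
    uint32_t a = x >> 24, b = (x >> 16) & 0xff, c = (x >> 8) & 0xff, d = x & 0xff;
    return ((s[0][a] + s[1][b]) ^ s[2][c]) + s[3][d];
  }
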
+
+#define load_roundkey_enc(n) \
+ movq p+4*(n)(CTX), RX3;
+
+#define add_roundkey_enc() \
+ xorq RX3, RX0;
+
+#define round_enc(n) \
+ add_roundkey_enc(); \
+ load_roundkey_enc(n); \
+ \
+ F(); \
+ F();
+
+#define load_roundkey_dec(n) \
+ movq p+4*(n-1)(CTX), RX3; \
+ rorq $32, RX3;
+
+#define add_roundkey_dec() \
+ xorq RX3, RX0;
+
+#define round_dec(n) \
+ add_roundkey_dec(); \
+ load_roundkey_dec(n); \
+ \
+ F(); \
+ F();
+
+#define read_block() \
+ movq (RIO), RX0; \
+ rorq $32, RX0; \
+ bswapq RX0;
+
+#define write_block() \
+ bswapq RX0; \
+ movq RX0, (RIO);
+
+.align 8
+ELF(.type __blowfish_enc_blk1,@function;)
+
+__blowfish_enc_blk1:
+ /* input:
+ * %rdi: ctx, CTX
+ * RX0: input plaintext block
+ * output:
+ * RX0: output ciphertext block
+ */
+ CFI_STARTPROC();
+ movq %rbp, %r11;
+ CFI_REGISTER(%rbp, %r11);
+
+ load_roundkey_enc(0);
+ round_enc(2);
+ round_enc(4);
+ round_enc(6);
+ round_enc(8);
+ round_enc(10);
+ round_enc(12);
+ round_enc(14);
+ round_enc(16);
+ add_roundkey_enc();
+
+ movq %r11, %rbp;
+ CFI_RESTORE(%rbp)
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;)
+
+.align 8
+.globl _gcry_blowfish_amd64_do_encrypt
+ELF(.type _gcry_blowfish_amd64_do_encrypt,@function;)
+
+_gcry_blowfish_amd64_do_encrypt:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: u32 *ret_xl
+ * %rdx: u32 *ret_xr
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ movl (%rdx), RX0d;
+ shlq $32, RX0;
+ movl (%rsi), RT3d;
+ movq %rdx, %r10;
+ orq RT3, RX0;
+ movq %rsi, RX2;
+
+ call __blowfish_enc_blk1;
+
+ movl RX0d, (%r10);
+ shrq $32, RX0;
+ movl RX0d, (RX2);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;)
+
+.align 8
+.globl _gcry_blowfish_amd64_encrypt_block
+ELF(.type _gcry_blowfish_amd64_encrypt_block,@function;)
+
+_gcry_blowfish_amd64_encrypt_block:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ movq %rsi, %r10;
+
+ movq %rdx, RIO;
+ read_block();
+
+ call __blowfish_enc_blk1;
+
+ movq %r10, RIO;
+ write_block();
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;)
+
+.align 8
+.globl _gcry_blowfish_amd64_decrypt_block
+ELF(.type _gcry_blowfish_amd64_decrypt_block,@function;)
+
+_gcry_blowfish_amd64_decrypt_block:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ movq %rbp, %r11;
+ CFI_REGISTER(%rbp, %r11);
+
+ movq %rsi, %r10;
+ movq %rdx, RIO;
+
+ read_block();
+
+ load_roundkey_dec(17);
+ round_dec(15);
+ round_dec(13);
+ round_dec(11);
+ round_dec(9);
+ round_dec(7);
+ round_dec(5);
+ round_dec(3);
+ round_dec(1);
+ add_roundkey_dec();
+
+ movq %r10, RIO;
+ write_block();
+
+ movq %r11, %rbp;
+ CFI_RESTORE(%rbp);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block;)
+
+/**********************************************************************
+ 4-way blowfish, four blocks parallel
+ **********************************************************************/
+#define F4(x) \
+ movzbl x ## bh, RT1d; \
+ movzbl x ## bl, RT3d; \
+ rorq $16, x; \
+ movzbl x ## bh, RT0d; \
+ movzbl x ## bl, RT2d; \
+ rorq $16, x; \
+ movl s0(CTX,RT0,4), RT0d; \
+ addl s1(CTX,RT2,4), RT0d; \
+ xorl s2(CTX,RT1,4), RT0d; \
+ addl s3(CTX,RT3,4), RT0d; \
+ xorq RT0, x;
+
+#define add_preloaded_roundkey4() \
+ xorq RKEY, RX0; \
+ xorq RKEY, RX1; \
+ xorq RKEY, RX2; \
+ xorq RKEY, RX3;
+
+#define preload_roundkey_enc(n) \
+ movq p+4*(n)(CTX), RKEY;
+
+#define add_roundkey_enc4(n) \
+ add_preloaded_roundkey4(); \
+ preload_roundkey_enc(n + 2);
+
+#define round_enc4(n) \
+ add_roundkey_enc4(n); \
+ \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3); \
+ \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3);
+
+#define preload_roundkey_dec(n) \
+ movq p+4*((n)-1)(CTX), RKEY; \
+ rorq $32, RKEY;
+
+#define add_roundkey_dec4(n) \
+ add_preloaded_roundkey4(); \
+ preload_roundkey_dec(n - 2);
+
+#define round_dec4(n) \
+ add_roundkey_dec4(n); \
+ \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3); \
+ \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3);
+
+#define inbswap_block4() \
+ rorq $32, RX0; \
+ bswapq RX0; \
+ rorq $32, RX1; \
+ bswapq RX1; \
+ rorq $32, RX2; \
+ bswapq RX2; \
+ rorq $32, RX3; \
+ bswapq RX3;
+
+#define inctrswap_block4() \
+ rorq $32, RX0; \
+ rorq $32, RX1; \
+ rorq $32, RX2; \
+ rorq $32, RX3;
+
+#define outbswap_block4() \
+ bswapq RX0; \
+ bswapq RX1; \
+ bswapq RX2; \
+ bswapq RX3;
+
+.align 8
+ELF(.type __blowfish_enc_blk4,@function;)
+
+__blowfish_enc_blk4:
+ /* input:
+ * %rdi: ctx, CTX
+ * RX0,RX1,RX2,RX3: four input inbswapped plaintext blocks
+ * output:
+ * RX0,RX1,RX2,RX3: four output ciphertext blocks
+ */
+ CFI_STARTPROC();
+ preload_roundkey_enc(0);
+
+ round_enc4(0);
+ round_enc4(2);
+ round_enc4(4);
+ round_enc4(6);
+ round_enc4(8);
+ round_enc4(10);
+ round_enc4(12);
+ round_enc4(14);
+ add_preloaded_roundkey4();
+
+ outbswap_block4();
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size __blowfish_enc_blk4,.-__blowfish_enc_blk4;)
+
+.align 8
+ELF(.type __blowfish_dec_blk4,@function;)
+
+__blowfish_dec_blk4:
+ /* input:
+ * %rdi: ctx, CTX
+ * RX0,RX1,RX2,RX3: four input ciphertext blocks
+ * output:
+ * RX0,RX1,RX2,RX3: four output plaintext blocks
+ */
+ CFI_STARTPROC();
+ preload_roundkey_dec(17);
+
+ inbswap_block4();
+
+ round_dec4(17);
+ round_dec4(15);
+ round_dec4(13);
+ round_dec4(11);
+ round_dec4(9);
+ round_dec4(7);
+ round_dec4(5);
+ round_dec4(3);
+ add_preloaded_roundkey4();
+
+ outbswap_block4();
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size __blowfish_dec_blk4,.-__blowfish_dec_blk4;)
+
+.align 8
+.globl _gcry_blowfish_amd64_ctr_enc
+ELF(.type _gcry_blowfish_amd64_ctr_enc,@function;)
+_gcry_blowfish_amd64_ctr_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (4 blocks)
+ * %rdx: src (4 blocks)
+ * %rcx: iv (big endian, 64bit)
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ pushq %rbx;
+ CFI_PUSH(%rbx);
+ pushq %r12;
+ CFI_PUSH(%r12);
+ pushq %r13;
+ CFI_PUSH(%r13);
+
+ /* %r11-%r13 are not used by __blowfish_enc_blk4 */
+ movq %rcx, %r13; /*iv*/
+ movq %rdx, %r12; /*src*/
+ movq %rsi, %r11; /*dst*/
+
+ /* load IV and byteswap */
+ movq (%r13), RT0;
+ bswapq RT0;
+ movq RT0, RX0;
+
+ /* construct IVs */
+ leaq 1(RT0), RX1;
+ leaq 2(RT0), RX2;
+ leaq 3(RT0), RX3;
+ leaq 4(RT0), RT0;
+ bswapq RT0;
+
+ inctrswap_block4();
+
+ /* store new IV */
+ movq RT0, (%r13);
+
+ call __blowfish_enc_blk4;
+
+ /* XOR key-stream with plaintext */
+ xorq 0 * 8(%r12), RX0;
+ xorq 1 * 8(%r12), RX1;
+ xorq 2 * 8(%r12), RX2;
+ xorq 3 * 8(%r12), RX3;
+ movq RX0, 0 * 8(%r11);
+ movq RX1, 1 * 8(%r11);
+ movq RX2, 2 * 8(%r11);
+ movq RX3, 3 * 8(%r11);
+
+ popq %r13;
+ CFI_POP(%r13);
+ popq %r12;
+ CFI_POP(%r12);
+ popq %rbx;
+ CFI_POP(%rbx);
+ popq %rbp;
+ CFI_POP(%rbp);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;)
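
The routine derives four consecutive big-endian counter values from the IV, encrypts them with __blowfish_enc_blk4 to obtain the keystream, xors that into the source blocks and stores the counter advanced by four back into the IV. The same flow as a plain C sketch; the enc_blk_fn callback and the ctr_enc_4blocks name are illustrative only:

  #include <stdint.h>

  /* Hypothetical single-block encrypt callback standing in for the cipher core. */
  typedef void (*enc_blk_fn) (void *ctx, uint8_t out[8], const uint8_t in[8]);

  void ctr_enc_4blocks (void *ctx, enc_blk_fn encrypt_block,
                        uint8_t dst[32], const uint8_t src[32], uint8_t iv[8])
  {
    uint64_t ctr = 0;

    for (int i = 0; i < 8; i++)                 /* IV is a big-endian counter */
      ctr = (ctr << 8) | iv[i];

    for (int blk = 0; blk < 4; blk++)
      {
        uint8_t cb[8], ks[8];
        uint64_t c = ctr + (uint64_t)blk;

        for (int i = 0; i < 8; i++)             /* counter block, big endian */
          cb[i] = (uint8_t)(c >> (56 - 8 * i));
        encrypt_block (ctx, ks, cb);            /* keystream = E_K(counter) */
        for (int i = 0; i < 8; i++)
          dst[blk * 8 + i] = src[blk * 8 + i] ^ ks[i];
      }

    ctr += 4;                                   /* write the advanced counter back */
    for (int i = 0; i < 8; i++)
      iv[i] = (uint8_t)(ctr >> (56 - 8 * i));
  }
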
+
+.align 8
+.globl _gcry_blowfish_amd64_cbc_dec
+ELF(.type _gcry_blowfish_amd64_cbc_dec,@function;)
+_gcry_blowfish_amd64_cbc_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (4 blocks)
+ * %rdx: src (4 blocks)
+ * %rcx: iv (64bit)
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ pushq %rbx;
+ CFI_PUSH(%rbx);
+ pushq %r12;
+ CFI_PUSH(%r12);
+ pushq %r13;
+ CFI_PUSH(%r13);
+
+ /* %r11-%r13 are not used by __blowfish_dec_blk4 */
+ movq %rsi, %r11; /*dst*/
+ movq %rdx, %r12; /*src*/
+ movq %rcx, %r13; /*iv*/
+
+ /* load input */
+ movq 0 * 8(%r12), RX0;
+ movq 1 * 8(%r12), RX1;
+ movq 2 * 8(%r12), RX2;
+ movq 3 * 8(%r12), RX3;
+
+ call __blowfish_dec_blk4;
+
+ movq 3 * 8(%r12), RT0;
+ xorq (%r13), RX0;
+ xorq 0 * 8(%r12), RX1;
+ xorq 1 * 8(%r12), RX2;
+ xorq 2 * 8(%r12), RX3;
+ movq RT0, (%r13); /* store new IV */
+
+ movq RX0, 0 * 8(%r11);
+ movq RX1, 1 * 8(%r11);
+ movq RX2, 2 * 8(%r11);
+ movq RX3, 3 * 8(%r11);
+
+ popq %r13;
+ CFI_POP(%r13);
+ popq %r12;
+ CFI_POP(%r12);
+ popq %rbx;
+ CFI_POP(%rbx);
+ popq %rbp;
+ CFI_POP(%rbp);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;)
+
+.align 8
+.globl _gcry_blowfish_amd64_cfb_dec
+ELF(.type _gcry_blowfish_amd64_cfb_dec,@function;)
+_gcry_blowfish_amd64_cfb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (4 blocks)
+ * %rdx: src (4 blocks)
+ * %rcx: iv (64bit)
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ pushq %rbx;
+ CFI_PUSH(%rbx);
+ pushq %r12;
+ CFI_PUSH(%r12);
+ pushq %r13;
+ CFI_PUSH(%r13);
+
+ /* %r11-%r13 are not used by __blowfish_enc_blk4 */
+ movq %rcx, %r13; /*iv*/
+ movq %rdx, %r12; /*src*/
+ movq %rsi, %r11; /*dst*/
+
+ /* Load input */
+ movq (%r13), RX0;
+ movq 0 * 8(%r12), RX1;
+ movq 1 * 8(%r12), RX2;
+ movq 2 * 8(%r12), RX3;
+
+ inbswap_block4();
+
+ /* Update IV */
+ movq 3 * 8(%r12), RT0;
+ movq RT0, (%r13);
+
+ call __blowfish_enc_blk4;
+
+ xorq 0 * 8(%r12), RX0;
+ xorq 1 * 8(%r12), RX1;
+ xorq 2 * 8(%r12), RX2;
+ xorq 3 * 8(%r12), RX3;
+ movq RX0, 0 * 8(%r11);
+ movq RX1, 1 * 8(%r11);
+ movq RX2, 2 * 8(%r11);
+ movq RX3, 3 * 8(%r11);
+
+ popq %r13;
+ CFI_POP(%r13);
+ popq %r12;
+ CFI_POP(%r12);
+ popq %rbx;
+ CFI_POP(%rbx);
+ popq %rbp;
+ CFI_POP(%rbp);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec;)
+
+#endif /*defined(USE_BLOWFISH)*/
+#endif /*__x86_64*/
diff --git a/comm/third_party/libgcrypt/cipher/blowfish-arm.S b/comm/third_party/libgcrypt/cipher/blowfish-arm.S
new file mode 100644
index 0000000000..b30aa31f1d
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/blowfish-arm.S
@@ -0,0 +1,743 @@
+/* blowfish-arm.S - ARM assembly implementation of Blowfish cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+/* structure of crypto context */
+#define s0 0
+#define s1 (s0 + (1 * 256) * 4)
+#define s2 (s0 + (2 * 256) * 4)
+#define s3 (s0 + (3 * 256) * 4)
+#define p (s3 + (1 * 256) * 4)
+
+/* register macros */
+#define CTXs0 %r0
+#define CTXs1 %r9
+#define CTXs2 %r8
+#define CTXs3 %r10
+#define RMASK %lr
+#define RKEYL %r2
+#define RKEYR %ip
+
+#define RL0 %r3
+#define RR0 %r4
+
+#define RL1 %r9
+#define RR1 %r10
+
+#define RT0 %r11
+#define RT1 %r7
+#define RT2 %r5
+#define RT3 %r6
+
+/* helper macros */
+#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
+ ldrb rout, [rsrc, #((offs) + 0)]; \
+ ldrb rtmp, [rsrc, #((offs) + 1)]; \
+ orr rout, rout, rtmp, lsl #8; \
+ ldrb rtmp, [rsrc, #((offs) + 2)]; \
+ orr rout, rout, rtmp, lsl #16; \
+ ldrb rtmp, [rsrc, #((offs) + 3)]; \
+ orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
+ mov rtmp0, rin, lsr #8; \
+ strb rin, [rdst, #((offs) + 0)]; \
+ mov rtmp1, rin, lsr #16; \
+ strb rtmp0, [rdst, #((offs) + 1)]; \
+ mov rtmp0, rin, lsr #24; \
+ strb rtmp1, [rdst, #((offs) + 2)]; \
+ strb rtmp0, [rdst, #((offs) + 3)];
+
+#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
+ ldrb rout, [rsrc, #((offs) + 3)]; \
+ ldrb rtmp, [rsrc, #((offs) + 2)]; \
+ orr rout, rout, rtmp, lsl #8; \
+ ldrb rtmp, [rsrc, #((offs) + 1)]; \
+ orr rout, rout, rtmp, lsl #16; \
+ ldrb rtmp, [rsrc, #((offs) + 0)]; \
+ orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \
+ mov rtmp0, rin, lsr #8; \
+ strb rin, [rdst, #((offs) + 3)]; \
+ mov rtmp1, rin, lsr #16; \
+ strb rtmp0, [rdst, #((offs) + 2)]; \
+ mov rtmp0, rin, lsr #24; \
+ strb rtmp1, [rdst, #((offs) + 1)]; \
+ strb rtmp0, [rdst, #((offs) + 0)];
+
+#ifdef __ARMEL__
+ #define ldr_unaligned_host ldr_unaligned_le
+ #define str_unaligned_host str_unaligned_le
+
+ /* bswap on little-endian */
+#ifdef HAVE_ARM_ARCH_V6
+ #define host_to_be(reg, rtmp) \
+ rev reg, reg;
+ #define be_to_host(reg, rtmp) \
+ rev reg, reg;
+#else
+ #define host_to_be(reg, rtmp) \
+ eor rtmp, reg, reg, ror #16; \
+ mov rtmp, rtmp, lsr #8; \
+ bic rtmp, rtmp, #65280; \
+ eor reg, rtmp, reg, ror #8;
+ #define be_to_host(reg, rtmp) \
+ eor rtmp, reg, reg, ror #16; \
+ mov rtmp, rtmp, lsr #8; \
+ bic rtmp, rtmp, #65280; \
+ eor reg, rtmp, reg, ror #8;
+#endif
+#else
+ #define ldr_unaligned_host ldr_unaligned_be
+ #define str_unaligned_host str_unaligned_be
+
+ /* nop on big-endian */
+ #define host_to_be(reg, rtmp) /*_*/
+ #define be_to_host(reg, rtmp) /*_*/
+#endif
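
Without the ARMv6 rev instruction, host_to_be/be_to_host fall back to the classic eor/lsr/bic/eor byte swap: x ^ ror(x,16), shifted right by 8 with byte 1 masked off, then xored with ror(x,8), lands every byte of a 32-bit word in its mirrored position. A quick scalar check (illustration only):

  #include <assert.h>
  #include <stdint.h>

  static uint32_t ror32 (uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

  static uint32_t swap_like_above (uint32_t x)   /* mirrors the eor/lsr/bic/eor sequence */
  {
    uint32_t t = x ^ ror32 (x, 16);
    t >>= 8;
    t &= ~(uint32_t)0xff00;                      /* the bic #65280 */
    return t ^ ror32 (x, 8);
  }

  int main (void)
  {
    uint32_t xs[] = { 0x01020304u, 0, 0xffffffffu, 0xdeadbeefu };

    for (unsigned i = 0; i < sizeof xs / sizeof *xs; i++)
      {
        uint32_t x = xs[i];
        uint32_t want = (x >> 24) | ((x >> 8) & 0xff00u)
                        | ((x << 8) & 0xff0000u) | (x << 24);
        assert (swap_like_above (x) == want);
      }
    return 0;
  }
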
+
+#define host_to_host(x, y) /*_*/
+
+/***********************************************************************
+ * 1-way blowfish
+ ***********************************************************************/
+#define F(l, r) \
+ and RT0, RMASK, l, lsr#(24 - 2); \
+ and RT1, RMASK, l, lsr#(16 - 2); \
+ ldr RT0, [CTXs0, RT0]; \
+ and RT2, RMASK, l, lsr#(8 - 2); \
+ ldr RT1, [CTXs1, RT1]; \
+ and RT3, RMASK, l, lsl#2; \
+ ldr RT2, [CTXs2, RT2]; \
+ add RT0, RT1; \
+ ldr RT3, [CTXs3, RT3]; \
+ eor RT0, RT2; \
+ add RT0, RT3; \
+ eor r, RT0;
+
+#define load_roundkey_enc(n) \
+ ldr RKEYL, [CTXs2, #((p - s2) + (4 * (n) + 0))]; \
+ ldr RKEYR, [CTXs2, #((p - s2) + (4 * (n) + 4))];
+
+#define add_roundkey_enc() \
+ eor RL0, RKEYL; \
+ eor RR0, RKEYR;
+
+#define round_enc(n) \
+ add_roundkey_enc(); \
+ load_roundkey_enc(n); \
+ \
+ F(RL0, RR0); \
+ F(RR0, RL0);
+
+#define load_roundkey_dec(n) \
+ ldr RKEYL, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 4))]; \
+ ldr RKEYR, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 0))];
+
+#define add_roundkey_dec() \
+ eor RL0, RKEYL; \
+ eor RR0, RKEYR;
+
+#define round_dec(n) \
+ add_roundkey_dec(); \
+ load_roundkey_dec(n); \
+ \
+ F(RL0, RR0); \
+ F(RR0, RL0);
+
+#define read_block_aligned(rin, offs, l0, r0, convert, rtmp) \
+ ldr l0, [rin, #((offs) + 0)]; \
+ ldr r0, [rin, #((offs) + 4)]; \
+ convert(l0, rtmp); \
+ convert(r0, rtmp);
+
+#define write_block_aligned(rout, offs, l0, r0, convert, rtmp) \
+ convert(l0, rtmp); \
+ convert(r0, rtmp); \
+ str l0, [rout, #((offs) + 0)]; \
+ str r0, [rout, #((offs) + 4)];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+ /* unaligned word reads allowed */
+ #define read_block(rin, offs, l0, r0, rtmp0) \
+ read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0)
+
+ #define write_block(rout, offs, r0, l0, rtmp0, rtmp1) \
+ write_block_aligned(rout, offs, r0, l0, be_to_host, rtmp0)
+
+ #define read_block_host(rin, offs, l0, r0, rtmp0) \
+ read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0)
+
+ #define write_block_host(rout, offs, r0, l0, rtmp0, rtmp1) \
+ write_block_aligned(rout, offs, r0, l0, host_to_host, rtmp0)
+#else
+ /* need to handle unaligned reads by byte reads */
+ #define read_block(rin, offs, l0, r0, rtmp0) \
+ tst rin, #3; \
+ beq 1f; \
+ ldr_unaligned_be(l0, rin, (offs) + 0, rtmp0); \
+ ldr_unaligned_be(r0, rin, (offs) + 4, rtmp0); \
+ b 2f; \
+ 1:;\
+ read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0); \
+ 2:;
+
+ #define write_block(rout, offs, l0, r0, rtmp0, rtmp1) \
+ tst rout, #3; \
+ beq 1f; \
+ str_unaligned_be(l0, rout, (offs) + 0, rtmp0, rtmp1); \
+ str_unaligned_be(r0, rout, (offs) + 4, rtmp0, rtmp1); \
+ b 2f; \
+ 1:;\
+ write_block_aligned(rout, offs, l0, r0, be_to_host, rtmp0); \
+ 2:;
+
+ #define read_block_host(rin, offs, l0, r0, rtmp0) \
+ tst rin, #3; \
+ beq 1f; \
+ ldr_unaligned_host(l0, rin, (offs) + 0, rtmp0); \
+ ldr_unaligned_host(r0, rin, (offs) + 4, rtmp0); \
+ b 2f; \
+ 1:;\
+ read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0); \
+ 2:;
+
+ #define write_block_host(rout, offs, l0, r0, rtmp0, rtmp1) \
+ tst rout, #3; \
+ beq 1f; \
+ str_unaligned_host(l0, rout, (offs) + 0, rtmp0, rtmp1); \
+ str_unaligned_host(r0, rout, (offs) + 4, rtmp0, rtmp1); \
+ b 2f; \
+ 1:;\
+ write_block_aligned(rout, offs, l0, r0, host_to_host, rtmp0); \
+ 2:;
+#endif
+
+.align 3
+.type __blowfish_enc_blk1,%function;
+
+__blowfish_enc_blk1:
+ /* input:
+ * preloaded: CTX
+ * [RL0, RR0]: src
+ * output:
+ * [RR0, RL0]: dst
+ */
+ push {%lr};
+
+ add CTXs1, CTXs0, #(s1 - s0);
+ add CTXs2, CTXs0, #(s2 - s0);
+ mov RMASK, #(0xff << 2); /* byte mask */
+ add CTXs3, CTXs1, #(s3 - s1);
+
+ load_roundkey_enc(0);
+ round_enc(2);
+ round_enc(4);
+ round_enc(6);
+ round_enc(8);
+ round_enc(10);
+ round_enc(12);
+ round_enc(14);
+ round_enc(16);
+ add_roundkey_enc();
+
+ pop {%pc};
+.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;
+
+.align 8
+.globl _gcry_blowfish_arm_do_encrypt
+.type _gcry_blowfish_arm_do_encrypt,%function;
+
+_gcry_blowfish_arm_do_encrypt:
+ /* input:
+ * %r0: ctx, CTX
+ * %r1: u32 *ret_xl
+ * %r2: u32 *ret_xr
+ */
+ push {%r2, %r4-%r11, %ip, %lr};
+
+ ldr RL0, [%r1];
+ ldr RR0, [%r2];
+
+ bl __blowfish_enc_blk1;
+
+ pop {%r2};
+ str RR0, [%r1];
+ str RL0, [%r2];
+
+ pop {%r4-%r11, %ip, %pc};
+.size _gcry_blowfish_arm_do_encrypt,.-_gcry_blowfish_arm_do_encrypt;
+
+.align 3
+.globl _gcry_blowfish_arm_encrypt_block
+.type _gcry_blowfish_arm_encrypt_block,%function;
+
+_gcry_blowfish_arm_encrypt_block:
+ /* input:
+ * %r0: ctx, CTX
+ * %r1: dst
+ * %r2: src
+ */
+ push {%r4-%r11, %ip, %lr};
+
+ read_block(%r2, 0, RL0, RR0, RT0);
+
+ bl __blowfish_enc_blk1;
+
+ write_block(%r1, 0, RR0, RL0, RT0, RT1);
+
+ pop {%r4-%r11, %ip, %pc};
+.size _gcry_blowfish_arm_encrypt_block,.-_gcry_blowfish_arm_encrypt_block;
+
+.align 3
+.globl _gcry_blowfish_arm_decrypt_block
+.type _gcry_blowfish_arm_decrypt_block,%function;
+
+_gcry_blowfish_arm_decrypt_block:
+ /* input:
+ * %r0: ctx, CTX
+ * %r1: dst
+ * %r2: src
+ */
+ push {%r4-%r11, %ip, %lr};
+
+ add CTXs1, CTXs0, #(s1 - s0);
+ add CTXs2, CTXs0, #(s2 - s0);
+ mov RMASK, #(0xff << 2); /* byte mask */
+ add CTXs3, CTXs1, #(s3 - s1);
+
+ read_block(%r2, 0, RL0, RR0, RT0);
+
+ load_roundkey_dec(17);
+ round_dec(15);
+ round_dec(13);
+ round_dec(11);
+ round_dec(9);
+ round_dec(7);
+ round_dec(5);
+ round_dec(3);
+ round_dec(1);
+ add_roundkey_dec();
+
+ write_block(%r1, 0, RR0, RL0, RT0, RT1);
+
+ pop {%r4-%r11, %ip, %pc};
+.size _gcry_blowfish_arm_decrypt_block,.-_gcry_blowfish_arm_decrypt_block;
+
+/***********************************************************************
+ * 2-way blowfish
+ ***********************************************************************/
+#define F2(n, l0, r0, l1, r1, set_nextk, dec) \
+ \
+ and RT0, RMASK, l0, lsr#(24 - 2); \
+ and RT1, RMASK, l0, lsr#(16 - 2); \
+ and RT2, RMASK, l0, lsr#(8 - 2); \
+ add RT1, #(s1 - s0); \
+ \
+ ldr RT0, [CTXs0, RT0]; \
+ and RT3, RMASK, l0, lsl#2; \
+ ldr RT1, [CTXs0, RT1]; \
+ add RT3, #(s3 - s2); \
+ ldr RT2, [CTXs2, RT2]; \
+ add RT0, RT1; \
+ ldr RT3, [CTXs2, RT3]; \
+ \
+ and RT1, RMASK, l1, lsr#(24 - 2); \
+ eor RT0, RT2; \
+ and RT2, RMASK, l1, lsr#(16 - 2); \
+ add RT0, RT3; \
+ add RT2, #(s1 - s0); \
+ and RT3, RMASK, l1, lsr#(8 - 2); \
+ eor r0, RT0; \
+ \
+ ldr RT1, [CTXs0, RT1]; \
+ and RT0, RMASK, l1, lsl#2; \
+ ldr RT2, [CTXs0, RT2]; \
+ add RT0, #(s3 - s2); \
+ ldr RT3, [CTXs2, RT3]; \
+ add RT1, RT2; \
+ ldr RT0, [CTXs2, RT0]; \
+ \
+ and RT2, RMASK, r0, lsr#(24 - 2); \
+ eor RT1, RT3; \
+ and RT3, RMASK, r0, lsr#(16 - 2); \
+ add RT1, RT0; \
+ add RT3, #(s1 - s0); \
+ and RT0, RMASK, r0, lsr#(8 - 2); \
+ eor r1, RT1; \
+ \
+ ldr RT2, [CTXs0, RT2]; \
+ and RT1, RMASK, r0, lsl#2; \
+ ldr RT3, [CTXs0, RT3]; \
+ add RT1, #(s3 - s2); \
+ ldr RT0, [CTXs2, RT0]; \
+ add RT2, RT3; \
+ ldr RT1, [CTXs2, RT1]; \
+ \
+ and RT3, RMASK, r1, lsr#(24 - 2); \
+ eor RT2, RT0; \
+ and RT0, RMASK, r1, lsr#(16 - 2); \
+ add RT2, RT1; \
+ add RT0, #(s1 - s0); \
+ and RT1, RMASK, r1, lsr#(8 - 2); \
+ eor l0, RT2; \
+ \
+ ldr RT3, [CTXs0, RT3]; \
+ and RT2, RMASK, r1, lsl#2; \
+ ldr RT0, [CTXs0, RT0]; \
+ add RT2, #(s3 - s2); \
+ ldr RT1, [CTXs2, RT1]; \
+ eor l1, RKEYL; \
+ ldr RT2, [CTXs2, RT2]; \
+ \
+ eor r0, RKEYR; \
+ add RT3, RT0; \
+ eor r1, RKEYR; \
+ eor RT3, RT1; \
+ eor l0, RKEYL; \
+ add RT3, RT2; \
+ set_nextk(RKEYL, (p - s2) + (4 * (n) + ((dec) * 4))); \
+ eor l1, RT3; \
+ set_nextk(RKEYR, (p - s2) + (4 * (n) + (!(dec) * 4)));
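+
+/* F2 runs one Feistel round on two independent blocks at once: the S-box
+ * loads and the add/xor combining steps of the two blocks are interleaved
+ * so their memory accesses and ALU operations can overlap.  set_nextk
+ * fetches the round keys for the following round at the end, and the
+ * 'dec' argument swaps the two key words for the decryption direction. */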
+
+#define load_n_add_roundkey_enc2(n) \
+ load_roundkey_enc(n); \
+ eor RL0, RKEYL; \
+ eor RR0, RKEYR; \
+ eor RL1, RKEYL; \
+ eor RR1, RKEYR; \
+ load_roundkey_enc((n) + 2);
+
+#define next_key(reg, offs) \
+ ldr reg, [CTXs2, #(offs)];
+
+#define dummy(x, y) /* do nothing */
+
+#define round_enc2(n, load_next_key) \
+ F2((n) + 2, RL0, RR0, RL1, RR1, load_next_key, 0);
+
+#define load_n_add_roundkey_dec2(n) \
+ load_roundkey_dec(n); \
+ eor RL0, RKEYL; \
+ eor RR0, RKEYR; \
+ eor RL1, RKEYL; \
+ eor RR1, RKEYR; \
+ load_roundkey_dec((n) - 2);
+
+#define round_dec2(n, load_next_key) \
+ F2((n) - 3, RL0, RR0, RL1, RR1, load_next_key, 1);
+
+#define read_block2_aligned(rin, l0, r0, l1, r1, convert, rtmp) \
+ ldr l0, [rin, #(0)]; \
+ ldr r0, [rin, #(4)]; \
+ convert(l0, rtmp); \
+ ldr l1, [rin, #(8)]; \
+ convert(r0, rtmp); \
+ ldr r1, [rin, #(12)]; \
+ convert(l1, rtmp); \
+ convert(r1, rtmp);
+
+#define write_block2_aligned(rout, l0, r0, l1, r1, convert, rtmp) \
+ convert(l0, rtmp); \
+ convert(r0, rtmp); \
+ convert(l1, rtmp); \
+ str l0, [rout, #(0)]; \
+ convert(r1, rtmp); \
+ str r0, [rout, #(4)]; \
+ str l1, [rout, #(8)]; \
+ str r1, [rout, #(12)];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+ /* unaligned word reads allowed */
+ #define read_block2(rin, l0, r0, l1, r1, rtmp0) \
+ read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0)
+
+ #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+ write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0)
+
+ #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
+ read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0)
+
+ #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+ write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0)
+#else
+ /* need to handle unaligned reads by byte reads */
+ #define read_block2(rin, l0, r0, l1, r1, rtmp0) \
+ tst rin, #3; \
+ beq 1f; \
+ ldr_unaligned_be(l0, rin, 0, rtmp0); \
+ ldr_unaligned_be(r0, rin, 4, rtmp0); \
+ ldr_unaligned_be(l1, rin, 8, rtmp0); \
+ ldr_unaligned_be(r1, rin, 12, rtmp0); \
+ b 2f; \
+ 1:;\
+ read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0); \
+ 2:;
+
+ #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+ tst rout, #3; \
+ beq 1f; \
+ str_unaligned_be(l0, rout, 0, rtmp0, rtmp1); \
+ str_unaligned_be(r0, rout, 4, rtmp0, rtmp1); \
+ str_unaligned_be(l1, rout, 8, rtmp0, rtmp1); \
+ str_unaligned_be(r1, rout, 12, rtmp0, rtmp1); \
+ b 2f; \
+ 1:;\
+ write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0); \
+ 2:;
+
+ #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
+ tst rin, #3; \
+ beq 1f; \
+ ldr_unaligned_host(l0, rin, 0, rtmp0); \
+ ldr_unaligned_host(r0, rin, 4, rtmp0); \
+ ldr_unaligned_host(l1, rin, 8, rtmp0); \
+ ldr_unaligned_host(r1, rin, 12, rtmp0); \
+ b 2f; \
+ 1:;\
+ read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0); \
+ 2:;
+
+ #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+ tst rout, #3; \
+ beq 1f; \
+ str_unaligned_host(l0, rout, 0, rtmp0, rtmp1); \
+ str_unaligned_host(r0, rout, 4, rtmp0, rtmp1); \
+ str_unaligned_host(l1, rout, 8, rtmp0, rtmp1); \
+ str_unaligned_host(r1, rout, 12, rtmp0, rtmp1); \
+ b 2f; \
+ 1:;\
+ write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0); \
+ 2:;
+#endif
+
+.align 3
+.type _gcry_blowfish_arm_enc_blk2,%function;
+
+_gcry_blowfish_arm_enc_blk2:
+ /* input:
+ * preloaded: CTX
+ * [RL0, RR0], [RL1, RR1]: src
+ * output:
+ * [RR0, RL0], [RR1, RL1]: dst
+ */
+ push {RT0,%lr};
+
+ add CTXs2, CTXs0, #(s2 - s0);
+ mov RMASK, #(0xff << 2); /* byte mask */
+
+ load_n_add_roundkey_enc2(0);
+ round_enc2(2, next_key);
+ round_enc2(4, next_key);
+ round_enc2(6, next_key);
+ round_enc2(8, next_key);
+ round_enc2(10, next_key);
+ round_enc2(12, next_key);
+ round_enc2(14, next_key);
+ round_enc2(16, dummy);
+
+ host_to_be(RR0, RT0);
+ host_to_be(RL0, RT0);
+ host_to_be(RR1, RT0);
+ host_to_be(RL1, RT0);
+
+ pop {RT0,%pc};
+.size _gcry_blowfish_arm_enc_blk2,.-_gcry_blowfish_arm_enc_blk2;
+
+.align 3
+.globl _gcry_blowfish_arm_cfb_dec;
+.type _gcry_blowfish_arm_cfb_dec,%function;
+
+_gcry_blowfish_arm_cfb_dec:
+ /* input:
+ * %r0: CTX
+ * %r1: dst (2 blocks)
+ * %r2: src (2 blocks)
+ * %r3: iv (64bit)
+ */
+ push {%r2, %r4-%r11, %ip, %lr};
+
+ mov %lr, %r3;
+
+ /* Load input (iv/%r3 is aligned, src/%r2 might not be) */
+ ldm %r3, {RL0, RR0};
+ host_to_be(RL0, RT0);
+ host_to_be(RR0, RT0);
+ read_block(%r2, 0, RL1, RR1, RT0);
+
+ /* Update IV, load src[1] and save to iv[0] */
+ read_block_host(%r2, 8, %r5, %r6, RT0);
+ stm %lr, {%r5, %r6};
+
+ bl _gcry_blowfish_arm_enc_blk2;
+ /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+	/* %r1: dst, %r0: src */
+ pop {%r0};
+
+ /* dst = src ^ result */
+ read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr);
+ eor %r5, %r4;
+ eor %r6, %r3;
+ eor %r7, %r10;
+ eor %r8, %r9;
+ write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10);
+
+ pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_blowfish_arm_cfb_dec,.-_gcry_blowfish_arm_cfb_dec;
+
+.align 3
+.globl _gcry_blowfish_arm_ctr_enc;
+.type _gcry_blowfish_arm_ctr_enc,%function;
+
+_gcry_blowfish_arm_ctr_enc:
+ /* input:
+ * %r0: CTX
+ * %r1: dst (2 blocks)
+ * %r2: src (2 blocks)
+ * %r3: iv (64bit, big-endian)
+ */
+ push {%r2, %r4-%r11, %ip, %lr};
+
+ mov %lr, %r3;
+
+ /* Load IV (big => host endian) */
+ read_block_aligned(%lr, 0, RL0, RR0, be_to_host, RT0);
+
+ /* Construct IVs */
+ adds RR1, RR0, #1; /* +1 */
+ adc RL1, RL0, #0;
+ adds %r6, RR1, #1; /* +2 */
+ adc %r5, RL1, #0;
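+	/* adds/adc propagate the carry from the low word into the high word:
+	 * block 0 is encrypted with the loaded counter, block 1 with
+	 * counter+1, and counter+2 is written back below as the new IV. */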
+
+ /* Store new IV (host => big-endian) */
+ write_block_aligned(%lr, 0, %r5, %r6, host_to_be, RT0);
+
+ bl _gcry_blowfish_arm_enc_blk2;
+ /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+	/* %r1: dst, %r0: src */
+ pop {%r0};
+
+ /* XOR key-stream with plaintext */
+ read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr);
+ eor %r5, %r4;
+ eor %r6, %r3;
+ eor %r7, %r10;
+ eor %r8, %r9;
+ write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10);
+
+ pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_blowfish_arm_ctr_enc,.-_gcry_blowfish_arm_ctr_enc;
+
+.align 3
+.type _gcry_blowfish_arm_dec_blk2,%function;
+
+_gcry_blowfish_arm_dec_blk2:
+ /* input:
+ * preloaded: CTX
+ * [RL0, RR0], [RL1, RR1]: src
+ * output:
+ * [RR0, RL0], [RR1, RL1]: dst
+ */
+ add CTXs2, CTXs0, #(s2 - s0);
+ mov RMASK, #(0xff << 2); /* byte mask */
+
+ load_n_add_roundkey_dec2(17);
+ round_dec2(15, next_key);
+ round_dec2(13, next_key);
+ round_dec2(11, next_key);
+ round_dec2(9, next_key);
+ round_dec2(7, next_key);
+ round_dec2(5, next_key);
+ round_dec2(3, next_key);
+ round_dec2(1, dummy);
+
+ host_to_be(RR0, RT0);
+ host_to_be(RL0, RT0);
+ host_to_be(RR1, RT0);
+ host_to_be(RL1, RT0);
+
+ b .Ldec_cbc_tail;
+.ltorg
+.size _gcry_blowfish_arm_dec_blk2,.-_gcry_blowfish_arm_dec_blk2;
+
+.align 3
+.globl _gcry_blowfish_arm_cbc_dec;
+.type _gcry_blowfish_arm_cbc_dec,%function;
+
+_gcry_blowfish_arm_cbc_dec:
+ /* input:
+ * %r0: CTX
+ * %r1: dst (2 blocks)
+ * %r2: src (2 blocks)
+ * %r3: iv (64bit)
+ */
+ push {%r2-%r11, %ip, %lr};
+
+ read_block2(%r2, RL0, RR0, RL1, RR1, RT0);
+
+ /* dec_blk2 is only used by cbc_dec, jump directly in/out instead
+ * of function call. */
+ b _gcry_blowfish_arm_dec_blk2;
+.Ldec_cbc_tail:
+ /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+	/* %r0: src, %r1: dst, %r2: iv */
+ pop {%r0, %r2};
+
+ /* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */
+ read_block_host(%r0, 0, %r7, %r8, %r5);
+ /* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */
+ ldm %r2, {%r5, %r6};
+
+ /* out[1] ^= IV+1 */
+ eor %r10, %r7;
+ eor %r9, %r8;
+ /* out[0] ^= IV */
+ eor %r4, %r5;
+ eor %r3, %r6;
+
+ /* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */
+ read_block_host(%r0, 8, %r7, %r8, %r5);
+ /* store IV+2 to iv[0] (aligned). */
+ stm %r2, {%r7, %r8};
+
+ /* store result to dst[0-3]. Might be unaligned. */
+ write_block2_host(%r1, %r4, %r3, %r10, %r9, %r5, %r6);
+
+ pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_blowfish_arm_cbc_dec,.-_gcry_blowfish_arm_cbc_dec;
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*__ARM_ARCH >= 6*/
diff --git a/comm/third_party/libgcrypt/cipher/blowfish.c b/comm/third_party/libgcrypt/cipher/blowfish.c
new file mode 100644
index 0000000000..7b001306c7
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/blowfish.c
@@ -0,0 +1,1142 @@
+/* blowfish.c - Blowfish encryption
+ * Copyright (C) 1998, 2001, 2002, 2003 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ * For a description of the algorithm, see:
+ * Bruce Schneier: Applied Cryptography. John Wiley & Sons, 1996.
+ * ISBN 0-471-11709-9. Pages 336 ff.
+ */
+
+/* Test values:
+ * key "abcdefghijklmnopqrstuvwxyz";
+ * plain "BLOWFISH"
+ * cipher 32 4E D0 FE F4 13 A2 03
+ *
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "types.h"
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-internal.h"
+#include "cipher-selftest.h"
+
+#define BLOWFISH_BLOCKSIZE 8
+#define BLOWFISH_KEY_MIN_BITS 8
+#define BLOWFISH_KEY_MAX_BITS 576
+
+
+/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
+#undef USE_AMD64_ASM
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AMD64_ASM 1
+#endif
+
+/* USE_ARM_ASM indicates whether to use ARM assembly code. */
+#undef USE_ARM_ASM
+#if defined(__ARMEL__)
+# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS)
+# define USE_ARM_ASM 1
+# endif
+#endif
+
+typedef struct {
+ u32 s0[256];
+ u32 s1[256];
+ u32 s2[256];
+ u32 s3[256];
+ u32 p[16+2];
+} BLOWFISH_context;
+
+static gcry_err_code_t bf_setkey (void *c, const byte *key, unsigned keylen,
+ cipher_bulk_ops_t *bulk_ops);
+static unsigned int encrypt_block (void *bc, byte *outbuf, const byte *inbuf);
+static unsigned int decrypt_block (void *bc, byte *outbuf, const byte *inbuf);
+
+
+/* precomputed S boxes */
+static const u32 ks0[256] = {
+ 0xD1310BA6,0x98DFB5AC,0x2FFD72DB,0xD01ADFB7,0xB8E1AFED,0x6A267E96,
+ 0xBA7C9045,0xF12C7F99,0x24A19947,0xB3916CF7,0x0801F2E2,0x858EFC16,
+ 0x636920D8,0x71574E69,0xA458FEA3,0xF4933D7E,0x0D95748F,0x728EB658,
+ 0x718BCD58,0x82154AEE,0x7B54A41D,0xC25A59B5,0x9C30D539,0x2AF26013,
+ 0xC5D1B023,0x286085F0,0xCA417918,0xB8DB38EF,0x8E79DCB0,0x603A180E,
+ 0x6C9E0E8B,0xB01E8A3E,0xD71577C1,0xBD314B27,0x78AF2FDA,0x55605C60,
+ 0xE65525F3,0xAA55AB94,0x57489862,0x63E81440,0x55CA396A,0x2AAB10B6,
+ 0xB4CC5C34,0x1141E8CE,0xA15486AF,0x7C72E993,0xB3EE1411,0x636FBC2A,
+ 0x2BA9C55D,0x741831F6,0xCE5C3E16,0x9B87931E,0xAFD6BA33,0x6C24CF5C,
+ 0x7A325381,0x28958677,0x3B8F4898,0x6B4BB9AF,0xC4BFE81B,0x66282193,
+ 0x61D809CC,0xFB21A991,0x487CAC60,0x5DEC8032,0xEF845D5D,0xE98575B1,
+ 0xDC262302,0xEB651B88,0x23893E81,0xD396ACC5,0x0F6D6FF3,0x83F44239,
+ 0x2E0B4482,0xA4842004,0x69C8F04A,0x9E1F9B5E,0x21C66842,0xF6E96C9A,
+ 0x670C9C61,0xABD388F0,0x6A51A0D2,0xD8542F68,0x960FA728,0xAB5133A3,
+ 0x6EEF0B6C,0x137A3BE4,0xBA3BF050,0x7EFB2A98,0xA1F1651D,0x39AF0176,
+ 0x66CA593E,0x82430E88,0x8CEE8619,0x456F9FB4,0x7D84A5C3,0x3B8B5EBE,
+ 0xE06F75D8,0x85C12073,0x401A449F,0x56C16AA6,0x4ED3AA62,0x363F7706,
+ 0x1BFEDF72,0x429B023D,0x37D0D724,0xD00A1248,0xDB0FEAD3,0x49F1C09B,
+ 0x075372C9,0x80991B7B,0x25D479D8,0xF6E8DEF7,0xE3FE501A,0xB6794C3B,
+ 0x976CE0BD,0x04C006BA,0xC1A94FB6,0x409F60C4,0x5E5C9EC2,0x196A2463,
+ 0x68FB6FAF,0x3E6C53B5,0x1339B2EB,0x3B52EC6F,0x6DFC511F,0x9B30952C,
+ 0xCC814544,0xAF5EBD09,0xBEE3D004,0xDE334AFD,0x660F2807,0x192E4BB3,
+ 0xC0CBA857,0x45C8740F,0xD20B5F39,0xB9D3FBDB,0x5579C0BD,0x1A60320A,
+ 0xD6A100C6,0x402C7279,0x679F25FE,0xFB1FA3CC,0x8EA5E9F8,0xDB3222F8,
+ 0x3C7516DF,0xFD616B15,0x2F501EC8,0xAD0552AB,0x323DB5FA,0xFD238760,
+ 0x53317B48,0x3E00DF82,0x9E5C57BB,0xCA6F8CA0,0x1A87562E,0xDF1769DB,
+ 0xD542A8F6,0x287EFFC3,0xAC6732C6,0x8C4F5573,0x695B27B0,0xBBCA58C8,
+ 0xE1FFA35D,0xB8F011A0,0x10FA3D98,0xFD2183B8,0x4AFCB56C,0x2DD1D35B,
+ 0x9A53E479,0xB6F84565,0xD28E49BC,0x4BFB9790,0xE1DDF2DA,0xA4CB7E33,
+ 0x62FB1341,0xCEE4C6E8,0xEF20CADA,0x36774C01,0xD07E9EFE,0x2BF11FB4,
+ 0x95DBDA4D,0xAE909198,0xEAAD8E71,0x6B93D5A0,0xD08ED1D0,0xAFC725E0,
+ 0x8E3C5B2F,0x8E7594B7,0x8FF6E2FB,0xF2122B64,0x8888B812,0x900DF01C,
+ 0x4FAD5EA0,0x688FC31C,0xD1CFF191,0xB3A8C1AD,0x2F2F2218,0xBE0E1777,
+ 0xEA752DFE,0x8B021FA1,0xE5A0CC0F,0xB56F74E8,0x18ACF3D6,0xCE89E299,
+ 0xB4A84FE0,0xFD13E0B7,0x7CC43B81,0xD2ADA8D9,0x165FA266,0x80957705,
+ 0x93CC7314,0x211A1477,0xE6AD2065,0x77B5FA86,0xC75442F5,0xFB9D35CF,
+ 0xEBCDAF0C,0x7B3E89A0,0xD6411BD3,0xAE1E7E49,0x00250E2D,0x2071B35E,
+ 0x226800BB,0x57B8E0AF,0x2464369B,0xF009B91E,0x5563911D,0x59DFA6AA,
+ 0x78C14389,0xD95A537F,0x207D5BA2,0x02E5B9C5,0x83260376,0x6295CFA9,
+ 0x11C81968,0x4E734A41,0xB3472DCA,0x7B14A94A,0x1B510052,0x9A532915,
+ 0xD60F573F,0xBC9BC6E4,0x2B60A476,0x81E67400,0x08BA6FB5,0x571BE91F,
+ 0xF296EC6B,0x2A0DD915,0xB6636521,0xE7B9F9B6,0xFF34052E,0xC5855664,
+ 0x53B02D5D,0xA99F8FA1,0x08BA4799,0x6E85076A };
+
+static const u32 ks1[256] = {
+ 0x4B7A70E9,0xB5B32944,0xDB75092E,0xC4192623,0xAD6EA6B0,0x49A7DF7D,
+ 0x9CEE60B8,0x8FEDB266,0xECAA8C71,0x699A17FF,0x5664526C,0xC2B19EE1,
+ 0x193602A5,0x75094C29,0xA0591340,0xE4183A3E,0x3F54989A,0x5B429D65,
+ 0x6B8FE4D6,0x99F73FD6,0xA1D29C07,0xEFE830F5,0x4D2D38E6,0xF0255DC1,
+ 0x4CDD2086,0x8470EB26,0x6382E9C6,0x021ECC5E,0x09686B3F,0x3EBAEFC9,
+ 0x3C971814,0x6B6A70A1,0x687F3584,0x52A0E286,0xB79C5305,0xAA500737,
+ 0x3E07841C,0x7FDEAE5C,0x8E7D44EC,0x5716F2B8,0xB03ADA37,0xF0500C0D,
+ 0xF01C1F04,0x0200B3FF,0xAE0CF51A,0x3CB574B2,0x25837A58,0xDC0921BD,
+ 0xD19113F9,0x7CA92FF6,0x94324773,0x22F54701,0x3AE5E581,0x37C2DADC,
+ 0xC8B57634,0x9AF3DDA7,0xA9446146,0x0FD0030E,0xECC8C73E,0xA4751E41,
+ 0xE238CD99,0x3BEA0E2F,0x3280BBA1,0x183EB331,0x4E548B38,0x4F6DB908,
+ 0x6F420D03,0xF60A04BF,0x2CB81290,0x24977C79,0x5679B072,0xBCAF89AF,
+ 0xDE9A771F,0xD9930810,0xB38BAE12,0xDCCF3F2E,0x5512721F,0x2E6B7124,
+ 0x501ADDE6,0x9F84CD87,0x7A584718,0x7408DA17,0xBC9F9ABC,0xE94B7D8C,
+ 0xEC7AEC3A,0xDB851DFA,0x63094366,0xC464C3D2,0xEF1C1847,0x3215D908,
+ 0xDD433B37,0x24C2BA16,0x12A14D43,0x2A65C451,0x50940002,0x133AE4DD,
+ 0x71DFF89E,0x10314E55,0x81AC77D6,0x5F11199B,0x043556F1,0xD7A3C76B,
+ 0x3C11183B,0x5924A509,0xF28FE6ED,0x97F1FBFA,0x9EBABF2C,0x1E153C6E,
+ 0x86E34570,0xEAE96FB1,0x860E5E0A,0x5A3E2AB3,0x771FE71C,0x4E3D06FA,
+ 0x2965DCB9,0x99E71D0F,0x803E89D6,0x5266C825,0x2E4CC978,0x9C10B36A,
+ 0xC6150EBA,0x94E2EA78,0xA5FC3C53,0x1E0A2DF4,0xF2F74EA7,0x361D2B3D,
+ 0x1939260F,0x19C27960,0x5223A708,0xF71312B6,0xEBADFE6E,0xEAC31F66,
+ 0xE3BC4595,0xA67BC883,0xB17F37D1,0x018CFF28,0xC332DDEF,0xBE6C5AA5,
+ 0x65582185,0x68AB9802,0xEECEA50F,0xDB2F953B,0x2AEF7DAD,0x5B6E2F84,
+ 0x1521B628,0x29076170,0xECDD4775,0x619F1510,0x13CCA830,0xEB61BD96,
+ 0x0334FE1E,0xAA0363CF,0xB5735C90,0x4C70A239,0xD59E9E0B,0xCBAADE14,
+ 0xEECC86BC,0x60622CA7,0x9CAB5CAB,0xB2F3846E,0x648B1EAF,0x19BDF0CA,
+ 0xA02369B9,0x655ABB50,0x40685A32,0x3C2AB4B3,0x319EE9D5,0xC021B8F7,
+ 0x9B540B19,0x875FA099,0x95F7997E,0x623D7DA8,0xF837889A,0x97E32D77,
+ 0x11ED935F,0x16681281,0x0E358829,0xC7E61FD6,0x96DEDFA1,0x7858BA99,
+ 0x57F584A5,0x1B227263,0x9B83C3FF,0x1AC24696,0xCDB30AEB,0x532E3054,
+ 0x8FD948E4,0x6DBC3128,0x58EBF2EF,0x34C6FFEA,0xFE28ED61,0xEE7C3C73,
+ 0x5D4A14D9,0xE864B7E3,0x42105D14,0x203E13E0,0x45EEE2B6,0xA3AAABEA,
+ 0xDB6C4F15,0xFACB4FD0,0xC742F442,0xEF6ABBB5,0x654F3B1D,0x41CD2105,
+ 0xD81E799E,0x86854DC7,0xE44B476A,0x3D816250,0xCF62A1F2,0x5B8D2646,
+ 0xFC8883A0,0xC1C7B6A3,0x7F1524C3,0x69CB7492,0x47848A0B,0x5692B285,
+ 0x095BBF00,0xAD19489D,0x1462B174,0x23820E00,0x58428D2A,0x0C55F5EA,
+ 0x1DADF43E,0x233F7061,0x3372F092,0x8D937E41,0xD65FECF1,0x6C223BDB,
+ 0x7CDE3759,0xCBEE7460,0x4085F2A7,0xCE77326E,0xA6078084,0x19F8509E,
+ 0xE8EFD855,0x61D99735,0xA969A7AA,0xC50C06C2,0x5A04ABFC,0x800BCADC,
+ 0x9E447A2E,0xC3453484,0xFDD56705,0x0E1E9EC9,0xDB73DBD3,0x105588CD,
+ 0x675FDA79,0xE3674340,0xC5C43465,0x713E38D8,0x3D28F89E,0xF16DFF20,
+ 0x153E21E7,0x8FB03D4A,0xE6E39F2B,0xDB83ADF7 };
+
+static const u32 ks2[256] = {
+ 0xE93D5A68,0x948140F7,0xF64C261C,0x94692934,0x411520F7,0x7602D4F7,
+ 0xBCF46B2E,0xD4A20068,0xD4082471,0x3320F46A,0x43B7D4B7,0x500061AF,
+ 0x1E39F62E,0x97244546,0x14214F74,0xBF8B8840,0x4D95FC1D,0x96B591AF,
+ 0x70F4DDD3,0x66A02F45,0xBFBC09EC,0x03BD9785,0x7FAC6DD0,0x31CB8504,
+ 0x96EB27B3,0x55FD3941,0xDA2547E6,0xABCA0A9A,0x28507825,0x530429F4,
+ 0x0A2C86DA,0xE9B66DFB,0x68DC1462,0xD7486900,0x680EC0A4,0x27A18DEE,
+ 0x4F3FFEA2,0xE887AD8C,0xB58CE006,0x7AF4D6B6,0xAACE1E7C,0xD3375FEC,
+ 0xCE78A399,0x406B2A42,0x20FE9E35,0xD9F385B9,0xEE39D7AB,0x3B124E8B,
+ 0x1DC9FAF7,0x4B6D1856,0x26A36631,0xEAE397B2,0x3A6EFA74,0xDD5B4332,
+ 0x6841E7F7,0xCA7820FB,0xFB0AF54E,0xD8FEB397,0x454056AC,0xBA489527,
+ 0x55533A3A,0x20838D87,0xFE6BA9B7,0xD096954B,0x55A867BC,0xA1159A58,
+ 0xCCA92963,0x99E1DB33,0xA62A4A56,0x3F3125F9,0x5EF47E1C,0x9029317C,
+ 0xFDF8E802,0x04272F70,0x80BB155C,0x05282CE3,0x95C11548,0xE4C66D22,
+ 0x48C1133F,0xC70F86DC,0x07F9C9EE,0x41041F0F,0x404779A4,0x5D886E17,
+ 0x325F51EB,0xD59BC0D1,0xF2BCC18F,0x41113564,0x257B7834,0x602A9C60,
+ 0xDFF8E8A3,0x1F636C1B,0x0E12B4C2,0x02E1329E,0xAF664FD1,0xCAD18115,
+ 0x6B2395E0,0x333E92E1,0x3B240B62,0xEEBEB922,0x85B2A20E,0xE6BA0D99,
+ 0xDE720C8C,0x2DA2F728,0xD0127845,0x95B794FD,0x647D0862,0xE7CCF5F0,
+ 0x5449A36F,0x877D48FA,0xC39DFD27,0xF33E8D1E,0x0A476341,0x992EFF74,
+ 0x3A6F6EAB,0xF4F8FD37,0xA812DC60,0xA1EBDDF8,0x991BE14C,0xDB6E6B0D,
+ 0xC67B5510,0x6D672C37,0x2765D43B,0xDCD0E804,0xF1290DC7,0xCC00FFA3,
+ 0xB5390F92,0x690FED0B,0x667B9FFB,0xCEDB7D9C,0xA091CF0B,0xD9155EA3,
+ 0xBB132F88,0x515BAD24,0x7B9479BF,0x763BD6EB,0x37392EB3,0xCC115979,
+ 0x8026E297,0xF42E312D,0x6842ADA7,0xC66A2B3B,0x12754CCC,0x782EF11C,
+ 0x6A124237,0xB79251E7,0x06A1BBE6,0x4BFB6350,0x1A6B1018,0x11CAEDFA,
+ 0x3D25BDD8,0xE2E1C3C9,0x44421659,0x0A121386,0xD90CEC6E,0xD5ABEA2A,
+ 0x64AF674E,0xDA86A85F,0xBEBFE988,0x64E4C3FE,0x9DBC8057,0xF0F7C086,
+ 0x60787BF8,0x6003604D,0xD1FD8346,0xF6381FB0,0x7745AE04,0xD736FCCC,
+ 0x83426B33,0xF01EAB71,0xB0804187,0x3C005E5F,0x77A057BE,0xBDE8AE24,
+ 0x55464299,0xBF582E61,0x4E58F48F,0xF2DDFDA2,0xF474EF38,0x8789BDC2,
+ 0x5366F9C3,0xC8B38E74,0xB475F255,0x46FCD9B9,0x7AEB2661,0x8B1DDF84,
+ 0x846A0E79,0x915F95E2,0x466E598E,0x20B45770,0x8CD55591,0xC902DE4C,
+ 0xB90BACE1,0xBB8205D0,0x11A86248,0x7574A99E,0xB77F19B6,0xE0A9DC09,
+ 0x662D09A1,0xC4324633,0xE85A1F02,0x09F0BE8C,0x4A99A025,0x1D6EFE10,
+ 0x1AB93D1D,0x0BA5A4DF,0xA186F20F,0x2868F169,0xDCB7DA83,0x573906FE,
+ 0xA1E2CE9B,0x4FCD7F52,0x50115E01,0xA70683FA,0xA002B5C4,0x0DE6D027,
+ 0x9AF88C27,0x773F8641,0xC3604C06,0x61A806B5,0xF0177A28,0xC0F586E0,
+ 0x006058AA,0x30DC7D62,0x11E69ED7,0x2338EA63,0x53C2DD94,0xC2C21634,
+ 0xBBCBEE56,0x90BCB6DE,0xEBFC7DA1,0xCE591D76,0x6F05E409,0x4B7C0188,
+ 0x39720A3D,0x7C927C24,0x86E3725F,0x724D9DB9,0x1AC15BB4,0xD39EB8FC,
+ 0xED545578,0x08FCA5B5,0xD83D7CD3,0x4DAD0FC4,0x1E50EF5E,0xB161E6F8,
+ 0xA28514D9,0x6C51133C,0x6FD5C7E7,0x56E14EC4,0x362ABFCE,0xDDC6C837,
+ 0xD79A3234,0x92638212,0x670EFA8E,0x406000E0 };
+
+static const u32 ks3[256] = {
+ 0x3A39CE37,0xD3FAF5CF,0xABC27737,0x5AC52D1B,0x5CB0679E,0x4FA33742,
+ 0xD3822740,0x99BC9BBE,0xD5118E9D,0xBF0F7315,0xD62D1C7E,0xC700C47B,
+ 0xB78C1B6B,0x21A19045,0xB26EB1BE,0x6A366EB4,0x5748AB2F,0xBC946E79,
+ 0xC6A376D2,0x6549C2C8,0x530FF8EE,0x468DDE7D,0xD5730A1D,0x4CD04DC6,
+ 0x2939BBDB,0xA9BA4650,0xAC9526E8,0xBE5EE304,0xA1FAD5F0,0x6A2D519A,
+ 0x63EF8CE2,0x9A86EE22,0xC089C2B8,0x43242EF6,0xA51E03AA,0x9CF2D0A4,
+ 0x83C061BA,0x9BE96A4D,0x8FE51550,0xBA645BD6,0x2826A2F9,0xA73A3AE1,
+ 0x4BA99586,0xEF5562E9,0xC72FEFD3,0xF752F7DA,0x3F046F69,0x77FA0A59,
+ 0x80E4A915,0x87B08601,0x9B09E6AD,0x3B3EE593,0xE990FD5A,0x9E34D797,
+ 0x2CF0B7D9,0x022B8B51,0x96D5AC3A,0x017DA67D,0xD1CF3ED6,0x7C7D2D28,
+ 0x1F9F25CF,0xADF2B89B,0x5AD6B472,0x5A88F54C,0xE029AC71,0xE019A5E6,
+ 0x47B0ACFD,0xED93FA9B,0xE8D3C48D,0x283B57CC,0xF8D56629,0x79132E28,
+ 0x785F0191,0xED756055,0xF7960E44,0xE3D35E8C,0x15056DD4,0x88F46DBA,
+ 0x03A16125,0x0564F0BD,0xC3EB9E15,0x3C9057A2,0x97271AEC,0xA93A072A,
+ 0x1B3F6D9B,0x1E6321F5,0xF59C66FB,0x26DCF319,0x7533D928,0xB155FDF5,
+ 0x03563482,0x8ABA3CBB,0x28517711,0xC20AD9F8,0xABCC5167,0xCCAD925F,
+ 0x4DE81751,0x3830DC8E,0x379D5862,0x9320F991,0xEA7A90C2,0xFB3E7BCE,
+ 0x5121CE64,0x774FBE32,0xA8B6E37E,0xC3293D46,0x48DE5369,0x6413E680,
+ 0xA2AE0810,0xDD6DB224,0x69852DFD,0x09072166,0xB39A460A,0x6445C0DD,
+ 0x586CDECF,0x1C20C8AE,0x5BBEF7DD,0x1B588D40,0xCCD2017F,0x6BB4E3BB,
+ 0xDDA26A7E,0x3A59FF45,0x3E350A44,0xBCB4CDD5,0x72EACEA8,0xFA6484BB,
+ 0x8D6612AE,0xBF3C6F47,0xD29BE463,0x542F5D9E,0xAEC2771B,0xF64E6370,
+ 0x740E0D8D,0xE75B1357,0xF8721671,0xAF537D5D,0x4040CB08,0x4EB4E2CC,
+ 0x34D2466A,0x0115AF84,0xE1B00428,0x95983A1D,0x06B89FB4,0xCE6EA048,
+ 0x6F3F3B82,0x3520AB82,0x011A1D4B,0x277227F8,0x611560B1,0xE7933FDC,
+ 0xBB3A792B,0x344525BD,0xA08839E1,0x51CE794B,0x2F32C9B7,0xA01FBAC9,
+ 0xE01CC87E,0xBCC7D1F6,0xCF0111C3,0xA1E8AAC7,0x1A908749,0xD44FBD9A,
+ 0xD0DADECB,0xD50ADA38,0x0339C32A,0xC6913667,0x8DF9317C,0xE0B12B4F,
+ 0xF79E59B7,0x43F5BB3A,0xF2D519FF,0x27D9459C,0xBF97222C,0x15E6FC2A,
+ 0x0F91FC71,0x9B941525,0xFAE59361,0xCEB69CEB,0xC2A86459,0x12BAA8D1,
+ 0xB6C1075E,0xE3056A0C,0x10D25065,0xCB03A442,0xE0EC6E0E,0x1698DB3B,
+ 0x4C98A0BE,0x3278E964,0x9F1F9532,0xE0D392DF,0xD3A0342B,0x8971F21E,
+ 0x1B0A7441,0x4BA3348C,0xC5BE7120,0xC37632D8,0xDF359F8D,0x9B992F2E,
+ 0xE60B6F47,0x0FE3F11D,0xE54CDA54,0x1EDAD891,0xCE6279CF,0xCD3E7E6F,
+ 0x1618B166,0xFD2C1D05,0x848FD2C5,0xF6FB2299,0xF523F357,0xA6327623,
+ 0x93A83531,0x56CCCD02,0xACF08162,0x5A75EBB5,0x6E163697,0x88D273CC,
+ 0xDE966292,0x81B949D0,0x4C50901B,0x71C65614,0xE6C6C7BD,0x327A140A,
+ 0x45E1D006,0xC3F27B9A,0xC9AA53FD,0x62A80F00,0xBB25BFE2,0x35BDD2F6,
+ 0x71126905,0xB2040222,0xB6CBCF7C,0xCD769C2B,0x53113EC0,0x1640E3D3,
+ 0x38ABBD60,0x2547ADF0,0xBA38209C,0xF746CE76,0x77AFA1C5,0x20756060,
+ 0x85CBFE4E,0x8AE88DD8,0x7AAAF9B0,0x4CF9AA7E,0x1948C25C,0x02FB8A8C,
+ 0x01C36AE4,0xD6EBE1F9,0x90D4F869,0xA65CDEA0,0x3F09252D,0xC208E69F,
+ 0xB74E6132,0xCE77E25B,0x578FDFE3,0x3AC372E6 };
+
+static const u32 ps[16+2] = {
+ 0x243F6A88,0x85A308D3,0x13198A2E,0x03707344,0xA4093822,0x299F31D0,
+ 0x082EFA98,0xEC4E6C89,0x452821E6,0x38D01377,0xBE5466CF,0x34E90C6C,
+ 0xC0AC29B7,0xC97C50DD,0x3F84D5B5,0xB5470917,0x9216D5D9,0x8979FB1B };
+
+
+#ifdef USE_AMD64_ASM
+
+/* Assembly implementations of Blowfish. */
+extern void _gcry_blowfish_amd64_do_encrypt(BLOWFISH_context *c, u32 *ret_xl,
+ u32 *ret_xr);
+
+extern void _gcry_blowfish_amd64_encrypt_block(BLOWFISH_context *c, byte *out,
+ const byte *in);
+
+extern void _gcry_blowfish_amd64_decrypt_block(BLOWFISH_context *c, byte *out,
+ const byte *in);
+
+/* These assembly implementations process four blocks in parallel. */
+extern void _gcry_blowfish_amd64_ctr_enc(BLOWFISH_context *ctx, byte *out,
+ const byte *in, byte *ctr);
+
+extern void _gcry_blowfish_amd64_cbc_dec(BLOWFISH_context *ctx, byte *out,
+ const byte *in, byte *iv);
+
+extern void _gcry_blowfish_amd64_cfb_dec(BLOWFISH_context *ctx, byte *out,
+ const byte *in, byte *iv);
+
+static void
+do_encrypt ( BLOWFISH_context *bc, u32 *ret_xl, u32 *ret_xr )
+{
+ _gcry_blowfish_amd64_do_encrypt (bc, ret_xl, ret_xr);
+}
+
+static void
+do_encrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf)
+{
+ _gcry_blowfish_amd64_encrypt_block (context, outbuf, inbuf);
+}
+
+static void
+do_decrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf)
+{
+ _gcry_blowfish_amd64_decrypt_block (context, outbuf, inbuf);
+}
+
+static inline void
+blowfish_amd64_ctr_enc(BLOWFISH_context *ctx, byte *out, const byte *in,
+ byte *ctr)
+{
+ _gcry_blowfish_amd64_ctr_enc(ctx, out, in, ctr);
+}
+
+static inline void
+blowfish_amd64_cbc_dec(BLOWFISH_context *ctx, byte *out, const byte *in,
+ byte *iv)
+{
+ _gcry_blowfish_amd64_cbc_dec(ctx, out, in, iv);
+}
+
+static inline void
+blowfish_amd64_cfb_dec(BLOWFISH_context *ctx, byte *out, const byte *in,
+ byte *iv)
+{
+ _gcry_blowfish_amd64_cfb_dec(ctx, out, in, iv);
+}
+
+static unsigned int
+encrypt_block (void *context , byte *outbuf, const byte *inbuf)
+{
+ BLOWFISH_context *c = (BLOWFISH_context *) context;
+ do_encrypt_block (c, outbuf, inbuf);
+ return /*burn_stack*/ (2*8);
+}
+
+static unsigned int
+decrypt_block (void *context, byte *outbuf, const byte *inbuf)
+{
+ BLOWFISH_context *c = (BLOWFISH_context *) context;
+ do_decrypt_block (c, outbuf, inbuf);
+ return /*burn_stack*/ (2*8);
+}
+
+#elif defined(USE_ARM_ASM)
+
+/* Assembly implementations of Blowfish. */
+extern void _gcry_blowfish_arm_do_encrypt(BLOWFISH_context *c, u32 *ret_xl,
+ u32 *ret_xr);
+
+extern void _gcry_blowfish_arm_encrypt_block(BLOWFISH_context *c, byte *out,
+ const byte *in);
+
+extern void _gcry_blowfish_arm_decrypt_block(BLOWFISH_context *c, byte *out,
+ const byte *in);
+
+/* These assembly implementations process two blocks in parallel. */
+extern void _gcry_blowfish_arm_ctr_enc(BLOWFISH_context *ctx, byte *out,
+ const byte *in, byte *ctr);
+
+extern void _gcry_blowfish_arm_cbc_dec(BLOWFISH_context *ctx, byte *out,
+ const byte *in, byte *iv);
+
+extern void _gcry_blowfish_arm_cfb_dec(BLOWFISH_context *ctx, byte *out,
+ const byte *in, byte *iv);
+
+static void
+do_encrypt ( BLOWFISH_context *bc, u32 *ret_xl, u32 *ret_xr )
+{
+ _gcry_blowfish_arm_do_encrypt (bc, ret_xl, ret_xr);
+}
+
+static void
+do_encrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf)
+{
+ _gcry_blowfish_arm_encrypt_block (context, outbuf, inbuf);
+}
+
+static void
+do_decrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf)
+{
+ _gcry_blowfish_arm_decrypt_block (context, outbuf, inbuf);
+}
+
+static unsigned int
+encrypt_block (void *context , byte *outbuf, const byte *inbuf)
+{
+ BLOWFISH_context *c = (BLOWFISH_context *) context;
+ do_encrypt_block (c, outbuf, inbuf);
+ return /*burn_stack*/ (10*4);
+}
+
+static unsigned int
+decrypt_block (void *context, byte *outbuf, const byte *inbuf)
+{
+ BLOWFISH_context *c = (BLOWFISH_context *) context;
+ do_decrypt_block (c, outbuf, inbuf);
+ return /*burn_stack*/ (10*4);
+}
+
+#else /*USE_ARM_ASM*/
+
+
+#define F(x) ((( s0[(x)>>24] + s1[((x)>>16)&0xff]) \
+ ^ s2[((x)>>8)&0xff]) + s3[(x)&0xff] )
+#define R(l,r,i) do { l ^= p[i]; r ^= F(l); } while(0)
+#define R3(l,r,i) do { R(l##0,r##0,i);R(l##1,r##1,i);R(l##2,r##2,i);} while(0)
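+
+/* F() is the Blowfish round function: the four bytes of the 32-bit input
+   each select one S-box entry, combined as ((s0 + s1) ^ s2) + s3 with
+   32-bit wrap-around.  R() applies one Feistel round: XOR the round key
+   p[i] into one half, then XOR F() of that half into the other half.
+   R3() runs the same round on three independent blocks for the bulk
+   code paths below. */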
+
+
+static void
+do_encrypt ( BLOWFISH_context *bc, u32 *ret_xl, u32 *ret_xr )
+{
+ u32 xl, xr, *s0, *s1, *s2, *s3, *p;
+
+ xl = *ret_xl;
+ xr = *ret_xr;
+ p = bc->p;
+ s0 = bc->s0;
+ s1 = bc->s1;
+ s2 = bc->s2;
+ s3 = bc->s3;
+
+ R( xl, xr, 0);
+ R( xr, xl, 1);
+ R( xl, xr, 2);
+ R( xr, xl, 3);
+ R( xl, xr, 4);
+ R( xr, xl, 5);
+ R( xl, xr, 6);
+ R( xr, xl, 7);
+ R( xl, xr, 8);
+ R( xr, xl, 9);
+ R( xl, xr, 10);
+ R( xr, xl, 11);
+ R( xl, xr, 12);
+ R( xr, xl, 13);
+ R( xl, xr, 14);
+ R( xr, xl, 15);
+
+ xl ^= p[16];
+ xr ^= p[16+1];
+
+ *ret_xl = xr;
+ *ret_xr = xl;
+}
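+
+/* After the 16 rounds the two halves are whitened with the last two
+   subkeys p[16] and p[17] and returned swapped, as in the Blowfish
+   specification. */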
+
+
+static void
+do_encrypt_3 ( BLOWFISH_context *bc, byte *dst, const byte *src )
+{
+ u32 xl0, xr0, xl1, xr1, xl2, xr2, *s0, *s1, *s2, *s3, *p;
+
+ xl0 = buf_get_be32(src + 0);
+ xr0 = buf_get_be32(src + 4);
+ xl1 = buf_get_be32(src + 8);
+ xr1 = buf_get_be32(src + 12);
+ xl2 = buf_get_be32(src + 16);
+ xr2 = buf_get_be32(src + 20);
+ p = bc->p;
+ s0 = bc->s0;
+ s1 = bc->s1;
+ s2 = bc->s2;
+ s3 = bc->s3;
+
+ R3( xl, xr, 0);
+ R3( xr, xl, 1);
+ R3( xl, xr, 2);
+ R3( xr, xl, 3);
+ R3( xl, xr, 4);
+ R3( xr, xl, 5);
+ R3( xl, xr, 6);
+ R3( xr, xl, 7);
+ R3( xl, xr, 8);
+ R3( xr, xl, 9);
+ R3( xl, xr, 10);
+ R3( xr, xl, 11);
+ R3( xl, xr, 12);
+ R3( xr, xl, 13);
+ R3( xl, xr, 14);
+ R3( xr, xl, 15);
+
+ xl0 ^= p[16];
+ xr0 ^= p[16+1];
+ xl1 ^= p[16];
+ xr1 ^= p[16+1];
+ xl2 ^= p[16];
+ xr2 ^= p[16+1];
+
+ buf_put_be32(dst + 0, xr0);
+ buf_put_be32(dst + 4, xl0);
+ buf_put_be32(dst + 8, xr1);
+ buf_put_be32(dst + 12, xl1);
+ buf_put_be32(dst + 16, xr2);
+ buf_put_be32(dst + 20, xl2);
+}
+
+
+static void
+decrypt ( BLOWFISH_context *bc, u32 *ret_xl, u32 *ret_xr )
+{
+ u32 xl, xr, *s0, *s1, *s2, *s3, *p;
+
+ xl = *ret_xl;
+ xr = *ret_xr;
+ p = bc->p;
+ s0 = bc->s0;
+ s1 = bc->s1;
+ s2 = bc->s2;
+ s3 = bc->s3;
+
+ R( xl, xr, 17);
+ R( xr, xl, 16);
+ R( xl, xr, 15);
+ R( xr, xl, 14);
+ R( xl, xr, 13);
+ R( xr, xl, 12);
+ R( xl, xr, 11);
+ R( xr, xl, 10);
+ R( xl, xr, 9);
+ R( xr, xl, 8);
+ R( xl, xr, 7);
+ R( xr, xl, 6);
+ R( xl, xr, 5);
+ R( xr, xl, 4);
+ R( xl, xr, 3);
+ R( xr, xl, 2);
+
+ xl ^= p[1];
+ xr ^= p[0];
+
+ *ret_xl = xr;
+ *ret_xr = xl;
+}
+
+
+static void
+do_decrypt_3 ( BLOWFISH_context *bc, byte *dst, const byte *src )
+{
+ u32 xl0, xr0, xl1, xr1, xl2, xr2, *s0, *s1, *s2, *s3, *p;
+
+ xl0 = buf_get_be32(src + 0);
+ xr0 = buf_get_be32(src + 4);
+ xl1 = buf_get_be32(src + 8);
+ xr1 = buf_get_be32(src + 12);
+ xl2 = buf_get_be32(src + 16);
+ xr2 = buf_get_be32(src + 20);
+ p = bc->p;
+ s0 = bc->s0;
+ s1 = bc->s1;
+ s2 = bc->s2;
+ s3 = bc->s3;
+
+ R3( xl, xr, 17);
+ R3( xr, xl, 16);
+ R3( xl, xr, 15);
+ R3( xr, xl, 14);
+ R3( xl, xr, 13);
+ R3( xr, xl, 12);
+ R3( xl, xr, 11);
+ R3( xr, xl, 10);
+ R3( xl, xr, 9);
+ R3( xr, xl, 8);
+ R3( xl, xr, 7);
+ R3( xr, xl, 6);
+ R3( xl, xr, 5);
+ R3( xr, xl, 4);
+ R3( xl, xr, 3);
+ R3( xr, xl, 2);
+
+ xl0 ^= p[1];
+ xr0 ^= p[0];
+ xl1 ^= p[1];
+ xr1 ^= p[0];
+ xl2 ^= p[1];
+ xr2 ^= p[0];
+
+ buf_put_be32(dst + 0, xr0);
+ buf_put_be32(dst + 4, xl0);
+ buf_put_be32(dst + 8, xr1);
+ buf_put_be32(dst + 12, xl1);
+ buf_put_be32(dst + 16, xr2);
+ buf_put_be32(dst + 20, xl2);
+}
+
+#undef F
+#undef R
+#undef R3
+
+static void
+do_encrypt_block ( BLOWFISH_context *bc, byte *outbuf, const byte *inbuf )
+{
+ u32 d1, d2;
+
+ d1 = buf_get_be32(inbuf);
+ d2 = buf_get_be32(inbuf + 4);
+ do_encrypt( bc, &d1, &d2 );
+ buf_put_be32(outbuf, d1);
+ buf_put_be32(outbuf + 4, d2);
+}
+
+static unsigned int
+encrypt_block (void *context, byte *outbuf, const byte *inbuf)
+{
+ BLOWFISH_context *bc = (BLOWFISH_context *) context;
+ do_encrypt_block (bc, outbuf, inbuf);
+ return /*burn_stack*/ (64);
+}
+
+
+static void
+do_decrypt_block (BLOWFISH_context *bc, byte *outbuf, const byte *inbuf)
+{
+ u32 d1, d2;
+
+ d1 = buf_get_be32(inbuf);
+ d2 = buf_get_be32(inbuf + 4);
+ decrypt( bc, &d1, &d2 );
+ buf_put_be32(outbuf, d1);
+ buf_put_be32(outbuf + 4, d2);
+}
+
+static unsigned int
+decrypt_block (void *context, byte *outbuf, const byte *inbuf)
+{
+ BLOWFISH_context *bc = (BLOWFISH_context *) context;
+ do_decrypt_block (bc, outbuf, inbuf);
+ return /*burn_stack*/ (64);
+}
+
+#endif /*!USE_AMD64_ASM&&!USE_ARM_ASM*/
+
+
+/* Bulk encryption of complete blocks in CTR mode. This function is only
+ intended for the bulk encryption feature of cipher.c. CTR is expected to be
+ of size BLOWFISH_BLOCKSIZE. */
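+/* In CTR mode each counter block is encrypted and XORed with the input:
+     out[i] = in[i] ^ E_k(ctr + i)
+   so encryption and decryption are the same operation.  The counter is a
+   BLOWFISH_BLOCKSIZE-byte big-endian integer and is advanced by one for
+   each block processed. */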
+static void
+_gcry_blowfish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ BLOWFISH_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned char tmpbuf[BLOWFISH_BLOCKSIZE * 3];
+ int burn_stack_depth = (64) + 4 * BLOWFISH_BLOCKSIZE;
+
+#ifdef USE_AMD64_ASM
+ {
+ if (nblocks >= 4)
+ burn_stack_depth += 5 * sizeof(void*);
+
+ /* Process data in 4 block chunks. */
+ while (nblocks >= 4)
+ {
+ blowfish_amd64_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+ nblocks -= 4;
+ outbuf += 4 * BLOWFISH_BLOCKSIZE;
+ inbuf += 4 * BLOWFISH_BLOCKSIZE;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#elif defined(USE_ARM_ASM)
+ {
+ /* Process data in 2 block chunks. */
+ while (nblocks >= 2)
+ {
+ _gcry_blowfish_arm_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+ nblocks -= 2;
+ outbuf += 2 * BLOWFISH_BLOCKSIZE;
+ inbuf += 2 * BLOWFISH_BLOCKSIZE;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
+ for ( ;nblocks >= 3; nblocks -= 3)
+ {
+ /* Prepare the counter blocks. */
+ cipher_block_cpy (tmpbuf + 0, ctr, BLOWFISH_BLOCKSIZE);
+ cipher_block_cpy (tmpbuf + 8, ctr, BLOWFISH_BLOCKSIZE);
+ cipher_block_cpy (tmpbuf + 16, ctr, BLOWFISH_BLOCKSIZE);
+ cipher_block_add (tmpbuf + 8, 1, BLOWFISH_BLOCKSIZE);
+ cipher_block_add (tmpbuf + 16, 2, BLOWFISH_BLOCKSIZE);
+ cipher_block_add (ctr, 3, BLOWFISH_BLOCKSIZE);
+ /* Encrypt the counter. */
+ do_encrypt_3(ctx, tmpbuf, tmpbuf);
+ /* XOR the input with the encrypted counter and store in output. */
+ buf_xor(outbuf, tmpbuf, inbuf, BLOWFISH_BLOCKSIZE * 3);
+ outbuf += BLOWFISH_BLOCKSIZE * 3;
+ inbuf += BLOWFISH_BLOCKSIZE * 3;
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ /* Encrypt the counter. */
+ do_encrypt_block(ctx, tmpbuf, ctr);
+ /* XOR the input with the encrypted counter and store in output. */
+ cipher_block_xor(outbuf, tmpbuf, inbuf, BLOWFISH_BLOCKSIZE);
+ outbuf += BLOWFISH_BLOCKSIZE;
+ inbuf += BLOWFISH_BLOCKSIZE;
+ /* Increment the counter. */
+ cipher_block_add (ctr, 1, BLOWFISH_BLOCKSIZE);
+ }
+
+ wipememory(tmpbuf, sizeof(tmpbuf));
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Bulk decryption of complete blocks in CBC mode. This function is only
+ intended for the bulk encryption feature of cipher.c. */
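+/* CBC decryption computes out[i] = D_k(in[i]) ^ in[i-1], with the IV in
+   place of in[-1].  Because INBUF and OUTBUF may alias, each decrypted
+   block is first written to SAVEBUF before the XOR-and-copy step. */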
+static void
+_gcry_blowfish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ BLOWFISH_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned char savebuf[BLOWFISH_BLOCKSIZE * 3];
+ int burn_stack_depth = (64) + 4 * BLOWFISH_BLOCKSIZE;
+
+#ifdef USE_AMD64_ASM
+ {
+ if (nblocks >= 4)
+ burn_stack_depth += 5 * sizeof(void*);
+
+ /* Process data in 4 block chunks. */
+ while (nblocks >= 4)
+ {
+ blowfish_amd64_cbc_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 4;
+ outbuf += 4 * BLOWFISH_BLOCKSIZE;
+ inbuf += 4 * BLOWFISH_BLOCKSIZE;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#elif defined(USE_ARM_ASM)
+ {
+ /* Process data in 2 block chunks. */
+ while (nblocks >= 2)
+ {
+ _gcry_blowfish_arm_cbc_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 2;
+ outbuf += 2 * BLOWFISH_BLOCKSIZE;
+ inbuf += 2 * BLOWFISH_BLOCKSIZE;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
+ for ( ;nblocks >= 3; nblocks -= 3)
+ {
+ /* INBUF is needed later and it may be identical to OUTBUF, so store
+ the intermediate result to SAVEBUF. */
+ do_decrypt_3 (ctx, savebuf, inbuf);
+
+ cipher_block_xor_1 (savebuf + 0, iv, BLOWFISH_BLOCKSIZE);
+ cipher_block_xor_1 (savebuf + 8, inbuf, BLOWFISH_BLOCKSIZE * 2);
+ cipher_block_cpy (iv, inbuf + 16, BLOWFISH_BLOCKSIZE);
+ buf_cpy (outbuf, savebuf, BLOWFISH_BLOCKSIZE * 3);
+ inbuf += BLOWFISH_BLOCKSIZE * 3;
+ outbuf += BLOWFISH_BLOCKSIZE * 3;
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ /* INBUF is needed later and it may be identical to OUTBUF, so store
+ the intermediate result to SAVEBUF. */
+ do_decrypt_block (ctx, savebuf, inbuf);
+
+ cipher_block_xor_n_copy_2(outbuf, savebuf, iv, inbuf, BLOWFISH_BLOCKSIZE);
+ inbuf += BLOWFISH_BLOCKSIZE;
+ outbuf += BLOWFISH_BLOCKSIZE;
+ }
+
+ wipememory(savebuf, sizeof(savebuf));
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Bulk decryption of complete blocks in CFB mode. This function is only
+ intended for the bulk encryption feature of cipher.c. */
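+/* CFB decryption re-encrypts the previous ciphertext block (initially the
+   IV) and XORs it with the current ciphertext:
+     out[i] = in[i] ^ E_k(c[i-1])
+   so only the block cipher's forward direction is needed. */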
+static void
+_gcry_blowfish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ BLOWFISH_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned char tmpbuf[BLOWFISH_BLOCKSIZE * 3];
+ int burn_stack_depth = (64) + 4 * BLOWFISH_BLOCKSIZE;
+
+#ifdef USE_AMD64_ASM
+ {
+ if (nblocks >= 4)
+ burn_stack_depth += 5 * sizeof(void*);
+
+ /* Process data in 4 block chunks. */
+ while (nblocks >= 4)
+ {
+ blowfish_amd64_cfb_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 4;
+ outbuf += 4 * BLOWFISH_BLOCKSIZE;
+ inbuf += 4 * BLOWFISH_BLOCKSIZE;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#elif defined(USE_ARM_ASM)
+ {
+ /* Process data in 2 block chunks. */
+ while (nblocks >= 2)
+ {
+ _gcry_blowfish_arm_cfb_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 2;
+ outbuf += 2 * BLOWFISH_BLOCKSIZE;
+ inbuf += 2 * BLOWFISH_BLOCKSIZE;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
+ for ( ;nblocks >= 3; nblocks -= 3 )
+ {
+ cipher_block_cpy (tmpbuf + 0, iv, BLOWFISH_BLOCKSIZE);
+ cipher_block_cpy (tmpbuf + 8, inbuf + 0, BLOWFISH_BLOCKSIZE * 2);
+ cipher_block_cpy (iv, inbuf + 16, BLOWFISH_BLOCKSIZE);
+ do_encrypt_3 (ctx, tmpbuf, tmpbuf);
+ buf_xor (outbuf, inbuf, tmpbuf, BLOWFISH_BLOCKSIZE * 3);
+ outbuf += BLOWFISH_BLOCKSIZE * 3;
+ inbuf += BLOWFISH_BLOCKSIZE * 3;
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ do_encrypt_block(ctx, iv, iv);
+ cipher_block_xor_n_copy(outbuf, iv, inbuf, BLOWFISH_BLOCKSIZE);
+ outbuf += BLOWFISH_BLOCKSIZE;
+ inbuf += BLOWFISH_BLOCKSIZE;
+ }
+
+ wipememory(tmpbuf, sizeof(tmpbuf));
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Run the self-tests for BLOWFISH-CTR, tests IV increment of bulk CTR
+ encryption. Returns NULL on success. */
+static const char *
+selftest_ctr (void)
+{
+ const int nblocks = 4+1;
+ const int blocksize = BLOWFISH_BLOCKSIZE;
+ const int context_size = sizeof(BLOWFISH_context);
+
+ return _gcry_selftest_helper_ctr("BLOWFISH", &bf_setkey,
+ &encrypt_block, nblocks, blocksize, context_size);
+}
+
+
+/* Run the self-tests for BLOWFISH-CBC, tests bulk CBC decryption.
+ Returns NULL on success. */
+static const char *
+selftest_cbc (void)
+{
+ const int nblocks = 4+2;
+ const int blocksize = BLOWFISH_BLOCKSIZE;
+ const int context_size = sizeof(BLOWFISH_context);
+
+ return _gcry_selftest_helper_cbc("BLOWFISH", &bf_setkey,
+ &encrypt_block, nblocks, blocksize, context_size);
+}
+
+
+/* Run the self-tests for BLOWFISH-CFB, tests bulk CFB decryption.
+ Returns NULL on success. */
+static const char *
+selftest_cfb (void)
+{
+ const int nblocks = 4+2;
+ const int blocksize = BLOWFISH_BLOCKSIZE;
+ const int context_size = sizeof(BLOWFISH_context);
+
+ return _gcry_selftest_helper_cfb("BLOWFISH", &bf_setkey,
+ &encrypt_block, nblocks, blocksize, context_size);
+}
+
+
+static const char*
+selftest(void)
+{
+ BLOWFISH_context c;
+ cipher_bulk_ops_t bulk_ops;
+ byte plain[] = "BLOWFISH";
+ byte buffer[8];
+ static const byte plain3[] =
+ { 0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10 };
+ static const byte key3[] =
+ { 0x41, 0x79, 0x6E, 0xA0, 0x52, 0x61, 0x6E, 0xE4 };
+ static const byte cipher3[] =
+ { 0xE1, 0x13, 0xF4, 0x10, 0x2C, 0xFC, 0xCE, 0x43 };
+ const char *r;
+
+ bf_setkey( (void *) &c,
+ (const unsigned char*)"abcdefghijklmnopqrstuvwxyz", 26,
+ &bulk_ops );
+ encrypt_block( (void *) &c, buffer, plain );
+ if( memcmp( buffer, "\x32\x4E\xD0\xFE\xF4\x13\xA2\x03", 8 ) )
+ return "Blowfish selftest failed (1).";
+ decrypt_block( (void *) &c, buffer, buffer );
+ if( memcmp( buffer, plain, 8 ) )
+ return "Blowfish selftest failed (2).";
+
+ bf_setkey( (void *) &c, key3, 8, &bulk_ops );
+ encrypt_block( (void *) &c, buffer, plain3 );
+ if( memcmp( buffer, cipher3, 8 ) )
+ return "Blowfish selftest failed (3).";
+ decrypt_block( (void *) &c, buffer, buffer );
+ if( memcmp( buffer, plain3, 8 ) )
+ return "Blowfish selftest failed (4).";
+
+ if ( (r = selftest_cbc ()) )
+ return r;
+
+ if ( (r = selftest_cfb ()) )
+ return r;
+
+ if ( (r = selftest_ctr ()) )
+ return r;
+
+ return NULL;
+}
+
+
+struct hashset_elem {
+ u32 val;
+ short nidx;
+ char used;
+};
+
+static inline byte
+val_to_hidx(u32 val)
+{
+ /* bf sboxes are quite random already. */
+ return (val >> 24) ^ (val >> 16) ^ (val >> 8) ^ val;
+}
+
+static inline int
+add_val(struct hashset_elem hset[256], u32 val, int *midx,
+ struct hashset_elem *mpool)
+{
+ struct hashset_elem *elem;
+ byte hidx;
+
+ hidx = val_to_hidx(val);
+ elem = &hset[hidx];
+
+ /* Check if first is in use. */
+ if (elem->used == 0)
+ {
+ elem->val = val;
+ elem->nidx = -1;
+ elem->used = 1;
+ return 0;
+ }
+
+ /* Check if first matches. */
+ if (elem->val == val)
+ return 1;
+
+ for (; elem->nidx >= 0; elem = &mpool[elem->nidx])
+ {
+ /* Check if elem matches. */
+ if (elem->val == val)
+ return 1;
+ }
+
+ elem->nidx = (*midx)++;
+ elem = &mpool[elem->nidx];
+
+ elem->val = val;
+ elem->nidx = -1;
+ elem->used = 1;
+
+ return 0;
+}
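+
+/* The weak-key scan uses a small chained hash set per S-box:
+   val_to_hidx() folds a 32-bit S-box entry to an 8-bit bucket index by
+   XORing its four bytes, and add_val() walks that bucket's chain,
+   returning 1 if the value was already present.  A duplicate entry within
+   one S-box marks the key as weak. */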
+
+static gcry_err_code_t
+do_bf_setkey (BLOWFISH_context *c, const byte *key, unsigned keylen)
+{
+ struct hashset_elem mempool[4 * 255]; /* Enough entries for the worst case. */
+ struct hashset_elem hset[4][256];
+ int memidx = 0;
+ int weak = 0;
+ int i, j, ret;
+ u32 data, datal, datar;
+ static int initialized;
+ static const char *selftest_failed;
+
+ if( !initialized )
+ {
+ initialized = 1;
+ selftest_failed = selftest();
+ if( selftest_failed )
+ log_error ("%s\n", selftest_failed );
+ }
+ if( selftest_failed )
+ return GPG_ERR_SELFTEST_FAILED;
+
+ if (keylen < BLOWFISH_KEY_MIN_BITS / 8 ||
+ keylen > BLOWFISH_KEY_MAX_BITS / 8)
+ return GPG_ERR_INV_KEYLEN;
+
+ memset(hset, 0, sizeof(hset));
+
+ for(i=0; i < 16+2; i++ )
+ c->p[i] = ps[i];
+ for(i=0; i < 256; i++ )
+ {
+ c->s0[i] = ks0[i];
+ c->s1[i] = ks1[i];
+ c->s2[i] = ks2[i];
+ c->s3[i] = ks3[i];
+ }
+
+ for(i=j=0; i < 16+2; i++ )
+ {
+ data = ((u32)key[j] << 24) |
+ ((u32)key[(j+1)%keylen] << 16) |
+ ((u32)key[(j+2)%keylen] << 8) |
+ ((u32)key[(j+3)%keylen]);
+ c->p[i] ^= data;
+ j = (j+4) % keylen;
+ }
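+
+  /* The key bytes (repeated cyclically) have now been XORed into P.  Next,
+     an all-zero block is encrypted repeatedly, each output feeding back in
+     as the next input, and the outputs successively replace P and then all
+     four S-boxes, while the hash sets watch for duplicate entries. */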
+
+ datal = datar = 0;
+ for(i=0; i < 16+2; i += 2 )
+ {
+ do_encrypt( c, &datal, &datar );
+ c->p[i] = datal;
+ c->p[i+1] = datar;
+ }
+ for(i=0; i < 256; i += 2 )
+ {
+ do_encrypt( c, &datal, &datar );
+ c->s0[i] = datal;
+ c->s0[i+1] = datar;
+
+ /* Add values to hashset, detect duplicates (weak keys). */
+ ret = add_val (hset[0], datal, &memidx, mempool);
+ weak = ret ? 1 : weak;
+ ret = add_val (hset[0], datar, &memidx, mempool);
+ weak = ret ? 1 : weak;
+ }
+ for(i=0; i < 256; i += 2 )
+ {
+ do_encrypt( c, &datal, &datar );
+ c->s1[i] = datal;
+ c->s1[i+1] = datar;
+
+ /* Add values to hashset, detect duplicates (weak keys). */
+ ret = add_val (hset[1], datal, &memidx, mempool);
+ weak = ret ? 1 : weak;
+ ret = add_val (hset[1], datar, &memidx, mempool);
+ weak = ret ? 1 : weak;
+ }
+ for(i=0; i < 256; i += 2 )
+ {
+ do_encrypt( c, &datal, &datar );
+ c->s2[i] = datal;
+ c->s2[i+1] = datar;
+
+ /* Add values to hashset, detect duplicates (weak keys). */
+ ret = add_val (hset[2], datal, &memidx, mempool);
+ weak = ret ? 1 : weak;
+ ret = add_val (hset[2], datar, &memidx, mempool);
+ weak = ret ? 1 : weak;
+ }
+ for(i=0; i < 256; i += 2 )
+ {
+ do_encrypt( c, &datal, &datar );
+ c->s3[i] = datal;
+ c->s3[i+1] = datar;
+
+ /* Add values to hashset, detect duplicates (weak keys). */
+ ret = add_val (hset[3], datal, &memidx, mempool);
+ weak = ret ? 1 : weak;
+ ret = add_val (hset[3], datar, &memidx, mempool);
+ weak = ret ? 1 : weak;
+ }
+
+ /* Clear stack. */
+ wipememory(hset, sizeof(hset));
+ wipememory(mempool, sizeof(mempool[0]) * memidx);
+
+ _gcry_burn_stack (64);
+
+  /* Check for weak key.  A weak key is one for which the key schedule
+     produces a duplicate value within one of the S-boxes; such duplicates
+     were detected above with the per-table hash sets.  */
+ if (weak)
+ return GPG_ERR_WEAK_KEY;
+
+ return GPG_ERR_NO_ERROR;
+}
+
+
+static gcry_err_code_t
+bf_setkey (void *context, const byte *key, unsigned keylen,
+ cipher_bulk_ops_t *bulk_ops)
+{
+ BLOWFISH_context *c = (BLOWFISH_context *) context;
+ gcry_err_code_t rc = do_bf_setkey (c, key, keylen);
+
+ /* Setup bulk encryption routines. */
+ memset (bulk_ops, 0, sizeof(*bulk_ops));
+ bulk_ops->cfb_dec = _gcry_blowfish_cfb_dec;
+ bulk_ops->cbc_dec = _gcry_blowfish_cbc_dec;
+ bulk_ops->ctr_enc = _gcry_blowfish_ctr_enc;
+
+ return rc;
+}
+
+
+gcry_cipher_spec_t _gcry_cipher_spec_blowfish =
+ {
+ GCRY_CIPHER_BLOWFISH, {0, 0},
+ "BLOWFISH", NULL, NULL, BLOWFISH_BLOCKSIZE, 128,
+ sizeof (BLOWFISH_context),
+ bf_setkey, encrypt_block, decrypt_block
+ };
diff --git a/comm/third_party/libgcrypt/cipher/bufhelp.h b/comm/third_party/libgcrypt/cipher/bufhelp.h
new file mode 100644
index 0000000000..fa5b2e8ece
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/bufhelp.h
@@ -0,0 +1,385 @@
+/* bufhelp.h - Some buffer manipulation helpers
+ * Copyright (C) 2012-2017 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef GCRYPT_BUFHELP_H
+#define GCRYPT_BUFHELP_H
+
+
+#include "g10lib.h"
+#include "bithelp.h"
+
+
+#undef BUFHELP_UNALIGNED_ACCESS
+#if defined(HAVE_GCC_ATTRIBUTE_PACKED) && \
+ defined(HAVE_GCC_ATTRIBUTE_ALIGNED) && \
+ defined(HAVE_GCC_ATTRIBUTE_MAY_ALIAS)
+/* Compiler supports the attributes needed for automatically issuing unaligned
+ memory access instructions.
+ */
+# define BUFHELP_UNALIGNED_ACCESS 1
+#endif
+
+
+#ifndef BUFHELP_UNALIGNED_ACCESS
+
+/* Functions for loading and storing unaligned u32 values of different
+ endianness. */
+static inline u32 buf_get_be32(const void *_buf)
+{
+ const byte *in = _buf;
+ return ((u32)in[0] << 24) | ((u32)in[1] << 16) | \
+ ((u32)in[2] << 8) | (u32)in[3];
+}
+
+static inline u32 buf_get_le32(const void *_buf)
+{
+ const byte *in = _buf;
+ return ((u32)in[3] << 24) | ((u32)in[2] << 16) | \
+ ((u32)in[1] << 8) | (u32)in[0];
+}
+
+static inline void buf_put_be32(void *_buf, u32 val)
+{
+ byte *out = _buf;
+ out[0] = val >> 24;
+ out[1] = val >> 16;
+ out[2] = val >> 8;
+ out[3] = val;
+}
+
+static inline void buf_put_le32(void *_buf, u32 val)
+{
+ byte *out = _buf;
+ out[3] = val >> 24;
+ out[2] = val >> 16;
+ out[1] = val >> 8;
+ out[0] = val;
+}
+
+
+/* Functions for loading and storing unaligned u64 values of different
+ endianness. */
+static inline u64 buf_get_be64(const void *_buf)
+{
+ const byte *in = _buf;
+ return ((u64)in[0] << 56) | ((u64)in[1] << 48) | \
+ ((u64)in[2] << 40) | ((u64)in[3] << 32) | \
+ ((u64)in[4] << 24) | ((u64)in[5] << 16) | \
+ ((u64)in[6] << 8) | (u64)in[7];
+}
+
+static inline u64 buf_get_le64(const void *_buf)
+{
+ const byte *in = _buf;
+ return ((u64)in[7] << 56) | ((u64)in[6] << 48) | \
+ ((u64)in[5] << 40) | ((u64)in[4] << 32) | \
+ ((u64)in[3] << 24) | ((u64)in[2] << 16) | \
+ ((u64)in[1] << 8) | (u64)in[0];
+}
+
+static inline void buf_put_be64(void *_buf, u64 val)
+{
+ byte *out = _buf;
+ out[0] = val >> 56;
+ out[1] = val >> 48;
+ out[2] = val >> 40;
+ out[3] = val >> 32;
+ out[4] = val >> 24;
+ out[5] = val >> 16;
+ out[6] = val >> 8;
+ out[7] = val;
+}
+
+static inline void buf_put_le64(void *_buf, u64 val)
+{
+ byte *out = _buf;
+ out[7] = val >> 56;
+ out[6] = val >> 48;
+ out[5] = val >> 40;
+ out[4] = val >> 32;
+ out[3] = val >> 24;
+ out[2] = val >> 16;
+ out[1] = val >> 8;
+ out[0] = val;
+}
+
+#else /*BUFHELP_UNALIGNED_ACCESS*/
+
+typedef struct bufhelp_u32_s
+{
+ u32 a;
+} __attribute__((packed, aligned(1), may_alias)) bufhelp_u32_t;
+
+/* Functions for loading and storing unaligned u32 values of different
+ endianness. */
+static inline u32 buf_get_be32(const void *_buf)
+{
+ return be_bswap32(((const bufhelp_u32_t *)_buf)->a);
+}
+
+static inline u32 buf_get_le32(const void *_buf)
+{
+ return le_bswap32(((const bufhelp_u32_t *)_buf)->a);
+}
+
+static inline void buf_put_be32(void *_buf, u32 val)
+{
+ bufhelp_u32_t *out = _buf;
+ out->a = be_bswap32(val);
+}
+
+static inline void buf_put_le32(void *_buf, u32 val)
+{
+ bufhelp_u32_t *out = _buf;
+ out->a = le_bswap32(val);
+}
+
+
+typedef struct bufhelp_u64_s
+{
+ u64 a;
+} __attribute__((packed, aligned(1), may_alias)) bufhelp_u64_t;
+
+/* Functions for loading and storing unaligned u64 values of different
+ endianness. */
+static inline u64 buf_get_be64(const void *_buf)
+{
+ return be_bswap64(((const bufhelp_u64_t *)_buf)->a);
+}
+
+static inline u64 buf_get_le64(const void *_buf)
+{
+ return le_bswap64(((const bufhelp_u64_t *)_buf)->a);
+}
+
+static inline void buf_put_be64(void *_buf, u64 val)
+{
+ bufhelp_u64_t *out = _buf;
+ out->a = be_bswap64(val);
+}
+
+static inline void buf_put_le64(void *_buf, u64 val)
+{
+ bufhelp_u64_t *out = _buf;
+ out->a = le_bswap64(val);
+}
+
+#endif /*BUFHELP_UNALIGNED_ACCESS*/
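+
+/* Both variants above give the same byte-order guarantees regardless of
+   host endianness or pointer alignment: for example, buf_get_be32() on the
+   bytes {0x01,0x02,0x03,0x04} yields 0x01020304, and buf_put_le64() stores
+   a u64 least-significant byte first.  In the second variant the
+   packed/aligned(1)/may_alias structs let the compiler emit unaligned
+   loads and stores directly on targets where that is safe. */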
+
+
+/* Host-endian get/put macros */
+#ifdef WORDS_BIGENDIAN
+# define buf_get_he32 buf_get_be32
+# define buf_put_he32 buf_put_be32
+# define buf_get_he64 buf_get_be64
+# define buf_put_he64 buf_put_be64
+#else
+# define buf_get_he32 buf_get_le32
+# define buf_put_he32 buf_put_le32
+# define buf_get_he64 buf_get_le64
+# define buf_put_he64 buf_put_le64
+#endif
+
+
+
+/* Optimized function for small buffer copying */
+static inline void
+buf_cpy(void *_dst, const void *_src, size_t len)
+{
+ byte *dst = _dst;
+ const byte *src = _src;
+
+#if __GNUC__ >= 4
+ if (!__builtin_constant_p (len))
+ {
+ if (UNLIKELY(len == 0))
+ return;
+ memcpy(_dst, _src, len);
+ return;
+ }
+#endif
+
+ while (len >= sizeof(u64))
+ {
+ buf_put_he64(dst, buf_get_he64(src));
+ dst += sizeof(u64);
+ src += sizeof(u64);
+ len -= sizeof(u64);
+ }
+
+ if (len >= sizeof(u32))
+ {
+ buf_put_he32(dst, buf_get_he32(src));
+ dst += sizeof(u32);
+ src += sizeof(u32);
+ len -= sizeof(u32);
+ }
+
+ /* Handle tail. */
+ for (; len; len--)
+ *dst++ = *src++;
+}
+
+
+/* Optimized function for buffer xoring */
+static inline void
+buf_xor(void *_dst, const void *_src1, const void *_src2, size_t len)
+{
+ byte *dst = _dst;
+ const byte *src1 = _src1;
+ const byte *src2 = _src2;
+
+ while (len >= sizeof(u64))
+ {
+ buf_put_he64(dst, buf_get_he64(src1) ^ buf_get_he64(src2));
+ dst += sizeof(u64);
+ src1 += sizeof(u64);
+ src2 += sizeof(u64);
+ len -= sizeof(u64);
+ }
+
+ if (len > sizeof(u32))
+ {
+ buf_put_he32(dst, buf_get_he32(src1) ^ buf_get_he32(src2));
+ dst += sizeof(u32);
+ src1 += sizeof(u32);
+ src2 += sizeof(u32);
+ len -= sizeof(u32);
+ }
+
+ /* Handle tail. */
+ for (; len; len--)
+ *dst++ = *src1++ ^ *src2++;
+}
+
+
+/* Optimized function for buffer xoring with two destination buffers. Used
+ mainly by CFB mode encryption. */
+static inline void
+buf_xor_2dst(void *_dst1, void *_dst2, const void *_src, size_t len)
+{
+ byte *dst1 = _dst1;
+ byte *dst2 = _dst2;
+ const byte *src = _src;
+
+ while (len >= sizeof(u64))
+ {
+ u64 temp = buf_get_he64(dst2) ^ buf_get_he64(src);
+ buf_put_he64(dst2, temp);
+ buf_put_he64(dst1, temp);
+ dst2 += sizeof(u64);
+ dst1 += sizeof(u64);
+ src += sizeof(u64);
+ len -= sizeof(u64);
+ }
+
+ if (len >= sizeof(u32))
+ {
+ u32 temp = buf_get_he32(dst2) ^ buf_get_he32(src);
+ buf_put_he32(dst2, temp);
+ buf_put_he32(dst1, temp);
+ dst2 += sizeof(u32);
+ dst1 += sizeof(u32);
+ src += sizeof(u32);
+ len -= sizeof(u32);
+ }
+
+ /* Handle tail. */
+ for (; len; len--)
+ *dst1++ = (*dst2++ ^= *src++);
+}
+
+
+/* Optimized function for combined buffer xoring and copying.  Used mainly by
+ CBC mode decryption. */
+static inline void
+buf_xor_n_copy_2(void *_dst_xor, const void *_src_xor, void *_srcdst_cpy,
+ const void *_src_cpy, size_t len)
+{
+ byte *dst_xor = _dst_xor;
+ byte *srcdst_cpy = _srcdst_cpy;
+ const byte *src_xor = _src_xor;
+ const byte *src_cpy = _src_cpy;
+
+ while (len >= sizeof(u64))
+ {
+ u64 temp = buf_get_he64(src_cpy);
+ buf_put_he64(dst_xor, buf_get_he64(srcdst_cpy) ^ buf_get_he64(src_xor));
+ buf_put_he64(srcdst_cpy, temp);
+ dst_xor += sizeof(u64);
+ srcdst_cpy += sizeof(u64);
+ src_xor += sizeof(u64);
+ src_cpy += sizeof(u64);
+ len -= sizeof(u64);
+ }
+
+ if (len >= sizeof(u32))
+ {
+ u32 temp = buf_get_he32(src_cpy);
+ buf_put_he32(dst_xor, buf_get_he32(srcdst_cpy) ^ buf_get_he32(src_xor));
+ buf_put_he32(srcdst_cpy, temp);
+ dst_xor += sizeof(u32);
+ srcdst_cpy += sizeof(u32);
+ src_xor += sizeof(u32);
+ src_cpy += sizeof(u32);
+ len -= sizeof(u32);
+ }
+
+ /* Handle tail. */
+ for (; len; len--)
+ {
+ byte temp = *src_cpy++;
+ *dst_xor++ = *srcdst_cpy ^ *src_xor++;
+ *srcdst_cpy++ = temp;
+ }
+}
+
+
+/* Optimized function for combined buffer xoring and copying.  Used mainly by
+ CFB mode decryption. */
+static inline void
+buf_xor_n_copy(void *_dst_xor, void *_srcdst_cpy, const void *_src, size_t len)
+{
+ buf_xor_n_copy_2(_dst_xor, _src, _srcdst_cpy, _src, len);
+}
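+
+/* buf_xor_n_copy() is buf_xor_n_copy_2() with the same buffer used as both
+   the XOR source and the copy source: dst_xor = srcdst_cpy ^ src, while
+   srcdst_cpy is overwritten with a saved copy of src.  This is exactly the
+   per-block step of CFB decryption. */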
+
+
+/* Constant-time compare of two buffers. Returns 1 if buffers are equal,
+ and 0 if buffers differ. */
+static inline int
+buf_eq_const(const void *_a, const void *_b, size_t len)
+{
+ const byte *a = _a;
+ const byte *b = _b;
+ int ab, ba;
+ size_t i;
+
+ /* Constant-time compare. */
+ for (i = 0, ab = 0, ba = 0; i < len; i++)
+ {
+ /* If a[i] != b[i], either ab or ba will be negative. */
+ ab |= a[i] - b[i];
+ ba |= b[i] - a[i];
+ }
+
+ /* 'ab | ba' is negative when buffers are not equal. */
+ return (ab | ba) >= 0;
+}
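+
+/* The comparison above runs in time that depends only on LEN: every byte
+   pair is visited, and only the sign bit of the accumulated differences
+   decides the result, so the position of a mismatch does not leak through
+   timing. */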
+
+
+#endif /*GCRYPT_BUFHELP_H*/
diff --git a/comm/third_party/libgcrypt/cipher/camellia-aarch64.S b/comm/third_party/libgcrypt/cipher/camellia-aarch64.S
new file mode 100644
index 0000000000..f498086212
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/camellia-aarch64.S
@@ -0,0 +1,586 @@
+/* camellia-aarch64.S - ARMv8/AArch64 assembly implementation of Camellia
+ * cipher
+ *
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__)
+#ifdef HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS
+
+.text
+
+/* struct camellia_ctx: */
+#define key_table 0
+
+/* register macros */
+#define CTX x0
+#define RDST x1
+#define RSRC x2
+#define RKEYBITS w3
+
+#define RTAB1 x4
+#define RTAB2 x5
+#define RTAB3 x6
+#define RTAB4 x7
+#define RMASK w8
+
+#define IL w9
+#define IR w10
+
+#define xIL x9
+#define xIR x10
+
+#define XL w11
+#define XR w12
+#define YL w13
+#define YR w14
+
+#define RT0 w15
+#define RT1 w16
+#define RT2 w17
+#define RT3 w19
+
+#define xRT0 x15
+#define xRT1 x16
+#define xRT2 x17
+#define xRT3 x19
+
+#ifdef __AARCH64EL__
+ #define host_to_be(reg, rtmp) \
+ rev reg, reg;
+ #define be_to_host(reg, rtmp) \
+ rev reg, reg;
+#else
+ /* nop on big-endian */
+ #define host_to_be(reg, rtmp) /*_*/
+ #define be_to_host(reg, rtmp) /*_*/
+#endif
+
+#define ldr_input_aligned_be(rin, a, b, c, d, rtmp) \
+ ldr a, [rin, #0]; \
+ ldr b, [rin, #4]; \
+ be_to_host(a, rtmp); \
+ ldr c, [rin, #8]; \
+ be_to_host(b, rtmp); \
+ ldr d, [rin, #12]; \
+ be_to_host(c, rtmp); \
+ be_to_host(d, rtmp);
+
+#define str_output_aligned_be(rout, a, b, c, d, rtmp) \
+ be_to_host(a, rtmp); \
+ be_to_host(b, rtmp); \
+ str a, [rout, #0]; \
+ be_to_host(c, rtmp); \
+ str b, [rout, #4]; \
+ be_to_host(d, rtmp); \
+ str c, [rout, #8]; \
+ str d, [rout, #12];
+
+/* unaligned word reads/writes allowed */
+#define ldr_input_be(rin, ra, rb, rc, rd, rtmp) \
+ ldr_input_aligned_be(rin, ra, rb, rc, rd, rtmp)
+
+#define str_output_be(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
+ str_output_aligned_be(rout, ra, rb, rc, rd, rtmp0)
+
+/**********************************************************************
+ 1-way camellia
+ **********************************************************************/
+#define roundsm(xl, xr, kl, kr, yl, yr) \
+ ldr RT2, [CTX, #(key_table + ((kl) * 4))]; \
+ and IR, RMASK, xr, lsl#(4); /*sp1110*/ \
+ ldr RT3, [CTX, #(key_table + ((kr) * 4))]; \
+ and IL, RMASK, xl, lsr#(24 - 4); /*sp1110*/ \
+ and RT0, RMASK, xr, lsr#(16 - 4); /*sp3033*/ \
+ ldr IR, [RTAB1, xIR]; \
+ and RT1, RMASK, xl, lsr#(8 - 4); /*sp3033*/ \
+ eor yl, yl, RT2; \
+ ldr IL, [RTAB1, xIL]; \
+ eor yr, yr, RT3; \
+ \
+ ldr RT0, [RTAB3, xRT0]; \
+ ldr RT1, [RTAB3, xRT1]; \
+ \
+ and RT2, RMASK, xr, lsr#(24 - 4); /*sp0222*/ \
+ and RT3, RMASK, xl, lsr#(16 - 4); /*sp0222*/ \
+ \
+ eor IR, IR, RT0; \
+ eor IL, IL, RT1; \
+ \
+ ldr RT2, [RTAB2, xRT2]; \
+ and RT0, RMASK, xr, lsr#(8 - 4); /*sp4404*/ \
+ ldr RT3, [RTAB2, xRT3]; \
+ and RT1, RMASK, xl, lsl#(4); /*sp4404*/ \
+ \
+ ldr RT0, [RTAB4, xRT0]; \
+ ldr RT1, [RTAB4, xRT1]; \
+ \
+ eor IR, IR, RT2; \
+ eor IL, IL, RT3; \
+ eor IR, IR, RT0; \
+ eor IL, IL, RT1; \
+ \
+ eor IR, IR, IL; \
+ eor yr, yr, IL, ror#8; \
+ eor yl, yl, IR; \
+ eor yr, yr, IR;
+
+#define enc_rounds(n) \
+ roundsm(XL, XR, ((n) + 2) * 2 + 0, ((n) + 2) * 2 + 1, YL, YR); \
+ roundsm(YL, YR, ((n) + 3) * 2 + 0, ((n) + 3) * 2 + 1, XL, XR); \
+ roundsm(XL, XR, ((n) + 4) * 2 + 0, ((n) + 4) * 2 + 1, YL, YR); \
+ roundsm(YL, YR, ((n) + 5) * 2 + 0, ((n) + 5) * 2 + 1, XL, XR); \
+ roundsm(XL, XR, ((n) + 6) * 2 + 0, ((n) + 6) * 2 + 1, YL, YR); \
+ roundsm(YL, YR, ((n) + 7) * 2 + 0, ((n) + 7) * 2 + 1, XL, XR);
+
+#define dec_rounds(n) \
+ roundsm(XL, XR, ((n) + 7) * 2 + 0, ((n) + 7) * 2 + 1, YL, YR); \
+ roundsm(YL, YR, ((n) + 6) * 2 + 0, ((n) + 6) * 2 + 1, XL, XR); \
+ roundsm(XL, XR, ((n) + 5) * 2 + 0, ((n) + 5) * 2 + 1, YL, YR); \
+ roundsm(YL, YR, ((n) + 4) * 2 + 0, ((n) + 4) * 2 + 1, XL, XR); \
+ roundsm(XL, XR, ((n) + 3) * 2 + 0, ((n) + 3) * 2 + 1, YL, YR); \
+ roundsm(YL, YR, ((n) + 2) * 2 + 0, ((n) + 2) * 2 + 1, XL, XR);
+
+/* perform FL and FL⁻¹ */
+#define fls(ll, lr, rl, rr, kll, klr, krl, krr) \
+ ldr RT0, [CTX, #(key_table + ((kll) * 4))]; \
+ ldr RT2, [CTX, #(key_table + ((krr) * 4))]; \
+ and RT0, RT0, ll; \
+ ldr RT3, [CTX, #(key_table + ((krl) * 4))]; \
+ orr RT2, RT2, rr; \
+ ldr RT1, [CTX, #(key_table + ((klr) * 4))]; \
+ eor rl, rl, RT2; \
+ eor lr, lr, RT0, ror#31; \
+ and RT3, RT3, rl; \
+ orr RT1, RT1, lr; \
+ eor ll, ll, RT1; \
+ eor rr, rr, RT3, ror#31;
+
+#define enc_fls(n) \
+ fls(XL, XR, YL, YR, \
+ (n) * 2 + 0, (n) * 2 + 1, \
+ (n) * 2 + 2, (n) * 2 + 3);
+
+#define dec_fls(n) \
+ fls(XL, XR, YL, YR, \
+ (n) * 2 + 2, (n) * 2 + 3, \
+ (n) * 2 + 0, (n) * 2 + 1);
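+
+/* For reference: the fls macro above computes the Camellia FL and FL⁻¹
+ * functions (RFC 3713) on 32-bit halves, with subkey halves (kl, kr):
+ *
+ *   FL:    xr ^= rol32(xl & kl, 1);  xl ^= (xr | kr);
+ *   FL⁻¹:  yl ^= (yr | kr);          yr ^= rol32(yl & kl, 1);
+ *
+ * The 'ror#31' in the macro is equivalent to a rotate left by one bit. */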
+
+#define inpack(n) \
+ ldr_input_be(RSRC, XL, XR, YL, YR, RT0); \
+ ldr RT0, [CTX, #(key_table + ((n) * 8) + 0)]; \
+ ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \
+ eor XL, XL, RT0; \
+ eor XR, XR, RT1;
+
+#define outunpack(n) \
+ ldr RT0, [CTX, #(key_table + ((n) * 8) + 0)]; \
+ ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \
+ eor YL, YL, RT0; \
+ eor YR, YR, RT1; \
+ str_output_be(RDST, YL, YR, XL, XR, RT0, RT1);
+
+.globl _gcry_camellia_arm_encrypt_block
+ELF(.type _gcry_camellia_arm_encrypt_block,@function;)
+
+_gcry_camellia_arm_encrypt_block:
+ CFI_STARTPROC()
+ stp x19, x30, [sp, #-16]!
+ CFI_ADJUST_CFA_OFFSET(16)
+ CFI_REG_ON_STACK(19, 0)
+ CFI_REG_ON_STACK(30, 8)
+
+ /* input:
+ * x0: keytable
+ * x1: dst
+ * x2: src
+ * w3: keybitlen
+ */
+
+ adr RTAB1, _gcry_camellia_arm_tables;
+ mov RMASK, #(0xff<<4); /* byte mask */
+ add RTAB2, RTAB1, #(1 * 4);
+ add RTAB3, RTAB1, #(2 * 4);
+ add RTAB4, RTAB1, #(3 * 4);
+
+ inpack(0);
+
+ enc_rounds(0);
+ enc_fls(8);
+ enc_rounds(8);
+ enc_fls(16);
+ enc_rounds(16);
+
+ cmp RKEYBITS, #(16 * 8);
+ bne .Lenc_256;
+
+ outunpack(24);
+
+ CFI_REMEMBER_STATE()
+ ldp x19, x30, [sp], #16
+ CFI_ADJUST_CFA_OFFSET(-16)
+ CFI_RESTORE(x19)
+ CFI_RESTORE(x30)
+ ret;
+ CFI_RESTORE_STATE()
+.ltorg
+
+.Lenc_256:
+ enc_fls(24);
+ enc_rounds(24);
+
+ outunpack(32);
+
+ ldp x19, x30, [sp], #16
+ CFI_ADJUST_CFA_OFFSET(-16)
+ CFI_RESTORE(x19)
+ CFI_RESTORE(x30)
+ ret;
+ CFI_ENDPROC()
+.ltorg
+ELF(.size _gcry_camellia_arm_encrypt_block,.-_gcry_camellia_arm_encrypt_block;)
+
+.globl _gcry_camellia_arm_decrypt_block
+ELF(.type _gcry_camellia_arm_decrypt_block,@function;)
+
+_gcry_camellia_arm_decrypt_block:
+ CFI_STARTPROC()
+ stp x19, x30, [sp, #-16]!
+ CFI_ADJUST_CFA_OFFSET(16)
+ CFI_REG_ON_STACK(19, 0)
+ CFI_REG_ON_STACK(30, 8)
+
+ /* input:
+ * x0: keytable
+ * x1: dst
+ * x2: src
+ * w3: keybitlen
+ */
+
+ adr RTAB1, _gcry_camellia_arm_tables;
+ mov RMASK, #(0xff<<4); /* byte mask */
+ add RTAB2, RTAB1, #(1 * 4);
+ add RTAB3, RTAB1, #(2 * 4);
+ add RTAB4, RTAB1, #(3 * 4);
+
+ cmp RKEYBITS, #(16 * 8);
+ bne .Ldec_256;
+
+ inpack(24);
+
+.Ldec_128:
+ dec_rounds(16);
+ dec_fls(16);
+ dec_rounds(8);
+ dec_fls(8);
+ dec_rounds(0);
+
+ outunpack(0);
+
+ CFI_REMEMBER_STATE()
+ ldp x19, x30, [sp], #16
+ CFI_ADJUST_CFA_OFFSET(-16)
+ CFI_RESTORE(x19)
+ CFI_RESTORE(x30)
+ ret;
+ CFI_RESTORE_STATE()
+.ltorg
+
+.Ldec_256:
+ inpack(32);
+ dec_rounds(24);
+ dec_fls(24);
+
+ b .Ldec_128;
+ CFI_ENDPROC()
+.ltorg
+ELF(.size _gcry_camellia_arm_decrypt_block,.-_gcry_camellia_arm_decrypt_block;)
+
+/* Encryption/Decryption tables */
+ELF(.type _gcry_camellia_arm_tables,@object;)
+.balign 32
+_gcry_camellia_arm_tables:
+.Lcamellia_sp1110:
+.long 0x70707000
+.Lcamellia_sp0222:
+ .long 0x00e0e0e0
+.Lcamellia_sp3033:
+ .long 0x38003838
+.Lcamellia_sp4404:
+ .long 0x70700070
+.long 0x82828200, 0x00050505, 0x41004141, 0x2c2c002c
+.long 0x2c2c2c00, 0x00585858, 0x16001616, 0xb3b300b3
+.long 0xececec00, 0x00d9d9d9, 0x76007676, 0xc0c000c0
+.long 0xb3b3b300, 0x00676767, 0xd900d9d9, 0xe4e400e4
+.long 0x27272700, 0x004e4e4e, 0x93009393, 0x57570057
+.long 0xc0c0c000, 0x00818181, 0x60006060, 0xeaea00ea
+.long 0xe5e5e500, 0x00cbcbcb, 0xf200f2f2, 0xaeae00ae
+.long 0xe4e4e400, 0x00c9c9c9, 0x72007272, 0x23230023
+.long 0x85858500, 0x000b0b0b, 0xc200c2c2, 0x6b6b006b
+.long 0x57575700, 0x00aeaeae, 0xab00abab, 0x45450045
+.long 0x35353500, 0x006a6a6a, 0x9a009a9a, 0xa5a500a5
+.long 0xeaeaea00, 0x00d5d5d5, 0x75007575, 0xeded00ed
+.long 0x0c0c0c00, 0x00181818, 0x06000606, 0x4f4f004f
+.long 0xaeaeae00, 0x005d5d5d, 0x57005757, 0x1d1d001d
+.long 0x41414100, 0x00828282, 0xa000a0a0, 0x92920092
+.long 0x23232300, 0x00464646, 0x91009191, 0x86860086
+.long 0xefefef00, 0x00dfdfdf, 0xf700f7f7, 0xafaf00af
+.long 0x6b6b6b00, 0x00d6d6d6, 0xb500b5b5, 0x7c7c007c
+.long 0x93939300, 0x00272727, 0xc900c9c9, 0x1f1f001f
+.long 0x45454500, 0x008a8a8a, 0xa200a2a2, 0x3e3e003e
+.long 0x19191900, 0x00323232, 0x8c008c8c, 0xdcdc00dc
+.long 0xa5a5a500, 0x004b4b4b, 0xd200d2d2, 0x5e5e005e
+.long 0x21212100, 0x00424242, 0x90009090, 0x0b0b000b
+.long 0xededed00, 0x00dbdbdb, 0xf600f6f6, 0xa6a600a6
+.long 0x0e0e0e00, 0x001c1c1c, 0x07000707, 0x39390039
+.long 0x4f4f4f00, 0x009e9e9e, 0xa700a7a7, 0xd5d500d5
+.long 0x4e4e4e00, 0x009c9c9c, 0x27002727, 0x5d5d005d
+.long 0x1d1d1d00, 0x003a3a3a, 0x8e008e8e, 0xd9d900d9
+.long 0x65656500, 0x00cacaca, 0xb200b2b2, 0x5a5a005a
+.long 0x92929200, 0x00252525, 0x49004949, 0x51510051
+.long 0xbdbdbd00, 0x007b7b7b, 0xde00dede, 0x6c6c006c
+.long 0x86868600, 0x000d0d0d, 0x43004343, 0x8b8b008b
+.long 0xb8b8b800, 0x00717171, 0x5c005c5c, 0x9a9a009a
+.long 0xafafaf00, 0x005f5f5f, 0xd700d7d7, 0xfbfb00fb
+.long 0x8f8f8f00, 0x001f1f1f, 0xc700c7c7, 0xb0b000b0
+.long 0x7c7c7c00, 0x00f8f8f8, 0x3e003e3e, 0x74740074
+.long 0xebebeb00, 0x00d7d7d7, 0xf500f5f5, 0x2b2b002b
+.long 0x1f1f1f00, 0x003e3e3e, 0x8f008f8f, 0xf0f000f0
+.long 0xcecece00, 0x009d9d9d, 0x67006767, 0x84840084
+.long 0x3e3e3e00, 0x007c7c7c, 0x1f001f1f, 0xdfdf00df
+.long 0x30303000, 0x00606060, 0x18001818, 0xcbcb00cb
+.long 0xdcdcdc00, 0x00b9b9b9, 0x6e006e6e, 0x34340034
+.long 0x5f5f5f00, 0x00bebebe, 0xaf00afaf, 0x76760076
+.long 0x5e5e5e00, 0x00bcbcbc, 0x2f002f2f, 0x6d6d006d
+.long 0xc5c5c500, 0x008b8b8b, 0xe200e2e2, 0xa9a900a9
+.long 0x0b0b0b00, 0x00161616, 0x85008585, 0xd1d100d1
+.long 0x1a1a1a00, 0x00343434, 0x0d000d0d, 0x04040004
+.long 0xa6a6a600, 0x004d4d4d, 0x53005353, 0x14140014
+.long 0xe1e1e100, 0x00c3c3c3, 0xf000f0f0, 0x3a3a003a
+.long 0x39393900, 0x00727272, 0x9c009c9c, 0xdede00de
+.long 0xcacaca00, 0x00959595, 0x65006565, 0x11110011
+.long 0xd5d5d500, 0x00ababab, 0xea00eaea, 0x32320032
+.long 0x47474700, 0x008e8e8e, 0xa300a3a3, 0x9c9c009c
+.long 0x5d5d5d00, 0x00bababa, 0xae00aeae, 0x53530053
+.long 0x3d3d3d00, 0x007a7a7a, 0x9e009e9e, 0xf2f200f2
+.long 0xd9d9d900, 0x00b3b3b3, 0xec00ecec, 0xfefe00fe
+.long 0x01010100, 0x00020202, 0x80008080, 0xcfcf00cf
+.long 0x5a5a5a00, 0x00b4b4b4, 0x2d002d2d, 0xc3c300c3
+.long 0xd6d6d600, 0x00adadad, 0x6b006b6b, 0x7a7a007a
+.long 0x51515100, 0x00a2a2a2, 0xa800a8a8, 0x24240024
+.long 0x56565600, 0x00acacac, 0x2b002b2b, 0xe8e800e8
+.long 0x6c6c6c00, 0x00d8d8d8, 0x36003636, 0x60600060
+.long 0x4d4d4d00, 0x009a9a9a, 0xa600a6a6, 0x69690069
+.long 0x8b8b8b00, 0x00171717, 0xc500c5c5, 0xaaaa00aa
+.long 0x0d0d0d00, 0x001a1a1a, 0x86008686, 0xa0a000a0
+.long 0x9a9a9a00, 0x00353535, 0x4d004d4d, 0xa1a100a1
+.long 0x66666600, 0x00cccccc, 0x33003333, 0x62620062
+.long 0xfbfbfb00, 0x00f7f7f7, 0xfd00fdfd, 0x54540054
+.long 0xcccccc00, 0x00999999, 0x66006666, 0x1e1e001e
+.long 0xb0b0b000, 0x00616161, 0x58005858, 0xe0e000e0
+.long 0x2d2d2d00, 0x005a5a5a, 0x96009696, 0x64640064
+.long 0x74747400, 0x00e8e8e8, 0x3a003a3a, 0x10100010
+.long 0x12121200, 0x00242424, 0x09000909, 0x00000000
+.long 0x2b2b2b00, 0x00565656, 0x95009595, 0xa3a300a3
+.long 0x20202000, 0x00404040, 0x10001010, 0x75750075
+.long 0xf0f0f000, 0x00e1e1e1, 0x78007878, 0x8a8a008a
+.long 0xb1b1b100, 0x00636363, 0xd800d8d8, 0xe6e600e6
+.long 0x84848400, 0x00090909, 0x42004242, 0x09090009
+.long 0x99999900, 0x00333333, 0xcc00cccc, 0xdddd00dd
+.long 0xdfdfdf00, 0x00bfbfbf, 0xef00efef, 0x87870087
+.long 0x4c4c4c00, 0x00989898, 0x26002626, 0x83830083
+.long 0xcbcbcb00, 0x00979797, 0xe500e5e5, 0xcdcd00cd
+.long 0xc2c2c200, 0x00858585, 0x61006161, 0x90900090
+.long 0x34343400, 0x00686868, 0x1a001a1a, 0x73730073
+.long 0x7e7e7e00, 0x00fcfcfc, 0x3f003f3f, 0xf6f600f6
+.long 0x76767600, 0x00ececec, 0x3b003b3b, 0x9d9d009d
+.long 0x05050500, 0x000a0a0a, 0x82008282, 0xbfbf00bf
+.long 0x6d6d6d00, 0x00dadada, 0xb600b6b6, 0x52520052
+.long 0xb7b7b700, 0x006f6f6f, 0xdb00dbdb, 0xd8d800d8
+.long 0xa9a9a900, 0x00535353, 0xd400d4d4, 0xc8c800c8
+.long 0x31313100, 0x00626262, 0x98009898, 0xc6c600c6
+.long 0xd1d1d100, 0x00a3a3a3, 0xe800e8e8, 0x81810081
+.long 0x17171700, 0x002e2e2e, 0x8b008b8b, 0x6f6f006f
+.long 0x04040400, 0x00080808, 0x02000202, 0x13130013
+.long 0xd7d7d700, 0x00afafaf, 0xeb00ebeb, 0x63630063
+.long 0x14141400, 0x00282828, 0x0a000a0a, 0xe9e900e9
+.long 0x58585800, 0x00b0b0b0, 0x2c002c2c, 0xa7a700a7
+.long 0x3a3a3a00, 0x00747474, 0x1d001d1d, 0x9f9f009f
+.long 0x61616100, 0x00c2c2c2, 0xb000b0b0, 0xbcbc00bc
+.long 0xdedede00, 0x00bdbdbd, 0x6f006f6f, 0x29290029
+.long 0x1b1b1b00, 0x00363636, 0x8d008d8d, 0xf9f900f9
+.long 0x11111100, 0x00222222, 0x88008888, 0x2f2f002f
+.long 0x1c1c1c00, 0x00383838, 0x0e000e0e, 0xb4b400b4
+.long 0x32323200, 0x00646464, 0x19001919, 0x78780078
+.long 0x0f0f0f00, 0x001e1e1e, 0x87008787, 0x06060006
+.long 0x9c9c9c00, 0x00393939, 0x4e004e4e, 0xe7e700e7
+.long 0x16161600, 0x002c2c2c, 0x0b000b0b, 0x71710071
+.long 0x53535300, 0x00a6a6a6, 0xa900a9a9, 0xd4d400d4
+.long 0x18181800, 0x00303030, 0x0c000c0c, 0xabab00ab
+.long 0xf2f2f200, 0x00e5e5e5, 0x79007979, 0x88880088
+.long 0x22222200, 0x00444444, 0x11001111, 0x8d8d008d
+.long 0xfefefe00, 0x00fdfdfd, 0x7f007f7f, 0x72720072
+.long 0x44444400, 0x00888888, 0x22002222, 0xb9b900b9
+.long 0xcfcfcf00, 0x009f9f9f, 0xe700e7e7, 0xf8f800f8
+.long 0xb2b2b200, 0x00656565, 0x59005959, 0xacac00ac
+.long 0xc3c3c300, 0x00878787, 0xe100e1e1, 0x36360036
+.long 0xb5b5b500, 0x006b6b6b, 0xda00dada, 0x2a2a002a
+.long 0x7a7a7a00, 0x00f4f4f4, 0x3d003d3d, 0x3c3c003c
+.long 0x91919100, 0x00232323, 0xc800c8c8, 0xf1f100f1
+.long 0x24242400, 0x00484848, 0x12001212, 0x40400040
+.long 0x08080800, 0x00101010, 0x04000404, 0xd3d300d3
+.long 0xe8e8e800, 0x00d1d1d1, 0x74007474, 0xbbbb00bb
+.long 0xa8a8a800, 0x00515151, 0x54005454, 0x43430043
+.long 0x60606000, 0x00c0c0c0, 0x30003030, 0x15150015
+.long 0xfcfcfc00, 0x00f9f9f9, 0x7e007e7e, 0xadad00ad
+.long 0x69696900, 0x00d2d2d2, 0xb400b4b4, 0x77770077
+.long 0x50505000, 0x00a0a0a0, 0x28002828, 0x80800080
+.long 0xaaaaaa00, 0x00555555, 0x55005555, 0x82820082
+.long 0xd0d0d000, 0x00a1a1a1, 0x68006868, 0xecec00ec
+.long 0xa0a0a000, 0x00414141, 0x50005050, 0x27270027
+.long 0x7d7d7d00, 0x00fafafa, 0xbe00bebe, 0xe5e500e5
+.long 0xa1a1a100, 0x00434343, 0xd000d0d0, 0x85850085
+.long 0x89898900, 0x00131313, 0xc400c4c4, 0x35350035
+.long 0x62626200, 0x00c4c4c4, 0x31003131, 0x0c0c000c
+.long 0x97979700, 0x002f2f2f, 0xcb00cbcb, 0x41410041
+.long 0x54545400, 0x00a8a8a8, 0x2a002a2a, 0xefef00ef
+.long 0x5b5b5b00, 0x00b6b6b6, 0xad00adad, 0x93930093
+.long 0x1e1e1e00, 0x003c3c3c, 0x0f000f0f, 0x19190019
+.long 0x95959500, 0x002b2b2b, 0xca00caca, 0x21210021
+.long 0xe0e0e000, 0x00c1c1c1, 0x70007070, 0x0e0e000e
+.long 0xffffff00, 0x00ffffff, 0xff00ffff, 0x4e4e004e
+.long 0x64646400, 0x00c8c8c8, 0x32003232, 0x65650065
+.long 0xd2d2d200, 0x00a5a5a5, 0x69006969, 0xbdbd00bd
+.long 0x10101000, 0x00202020, 0x08000808, 0xb8b800b8
+.long 0xc4c4c400, 0x00898989, 0x62006262, 0x8f8f008f
+.long 0x00000000, 0x00000000, 0x00000000, 0xebeb00eb
+.long 0x48484800, 0x00909090, 0x24002424, 0xcece00ce
+.long 0xa3a3a300, 0x00474747, 0xd100d1d1, 0x30300030
+.long 0xf7f7f700, 0x00efefef, 0xfb00fbfb, 0x5f5f005f
+.long 0x75757500, 0x00eaeaea, 0xba00baba, 0xc5c500c5
+.long 0xdbdbdb00, 0x00b7b7b7, 0xed00eded, 0x1a1a001a
+.long 0x8a8a8a00, 0x00151515, 0x45004545, 0xe1e100e1
+.long 0x03030300, 0x00060606, 0x81008181, 0xcaca00ca
+.long 0xe6e6e600, 0x00cdcdcd, 0x73007373, 0x47470047
+.long 0xdadada00, 0x00b5b5b5, 0x6d006d6d, 0x3d3d003d
+.long 0x09090900, 0x00121212, 0x84008484, 0x01010001
+.long 0x3f3f3f00, 0x007e7e7e, 0x9f009f9f, 0xd6d600d6
+.long 0xdddddd00, 0x00bbbbbb, 0xee00eeee, 0x56560056
+.long 0x94949400, 0x00292929, 0x4a004a4a, 0x4d4d004d
+.long 0x87878700, 0x000f0f0f, 0xc300c3c3, 0x0d0d000d
+.long 0x5c5c5c00, 0x00b8b8b8, 0x2e002e2e, 0x66660066
+.long 0x83838300, 0x00070707, 0xc100c1c1, 0xcccc00cc
+.long 0x02020200, 0x00040404, 0x01000101, 0x2d2d002d
+.long 0xcdcdcd00, 0x009b9b9b, 0xe600e6e6, 0x12120012
+.long 0x4a4a4a00, 0x00949494, 0x25002525, 0x20200020
+.long 0x90909000, 0x00212121, 0x48004848, 0xb1b100b1
+.long 0x33333300, 0x00666666, 0x99009999, 0x99990099
+.long 0x73737300, 0x00e6e6e6, 0xb900b9b9, 0x4c4c004c
+.long 0x67676700, 0x00cecece, 0xb300b3b3, 0xc2c200c2
+.long 0xf6f6f600, 0x00ededed, 0x7b007b7b, 0x7e7e007e
+.long 0xf3f3f300, 0x00e7e7e7, 0xf900f9f9, 0x05050005
+.long 0x9d9d9d00, 0x003b3b3b, 0xce00cece, 0xb7b700b7
+.long 0x7f7f7f00, 0x00fefefe, 0xbf00bfbf, 0x31310031
+.long 0xbfbfbf00, 0x007f7f7f, 0xdf00dfdf, 0x17170017
+.long 0xe2e2e200, 0x00c5c5c5, 0x71007171, 0xd7d700d7
+.long 0x52525200, 0x00a4a4a4, 0x29002929, 0x58580058
+.long 0x9b9b9b00, 0x00373737, 0xcd00cdcd, 0x61610061
+.long 0xd8d8d800, 0x00b1b1b1, 0x6c006c6c, 0x1b1b001b
+.long 0x26262600, 0x004c4c4c, 0x13001313, 0x1c1c001c
+.long 0xc8c8c800, 0x00919191, 0x64006464, 0x0f0f000f
+.long 0x37373700, 0x006e6e6e, 0x9b009b9b, 0x16160016
+.long 0xc6c6c600, 0x008d8d8d, 0x63006363, 0x18180018
+.long 0x3b3b3b00, 0x00767676, 0x9d009d9d, 0x22220022
+.long 0x81818100, 0x00030303, 0xc000c0c0, 0x44440044
+.long 0x96969600, 0x002d2d2d, 0x4b004b4b, 0xb2b200b2
+.long 0x6f6f6f00, 0x00dedede, 0xb700b7b7, 0xb5b500b5
+.long 0x4b4b4b00, 0x00969696, 0xa500a5a5, 0x91910091
+.long 0x13131300, 0x00262626, 0x89008989, 0x08080008
+.long 0xbebebe00, 0x007d7d7d, 0x5f005f5f, 0xa8a800a8
+.long 0x63636300, 0x00c6c6c6, 0xb100b1b1, 0xfcfc00fc
+.long 0x2e2e2e00, 0x005c5c5c, 0x17001717, 0x50500050
+.long 0xe9e9e900, 0x00d3d3d3, 0xf400f4f4, 0xd0d000d0
+.long 0x79797900, 0x00f2f2f2, 0xbc00bcbc, 0x7d7d007d
+.long 0xa7a7a700, 0x004f4f4f, 0xd300d3d3, 0x89890089
+.long 0x8c8c8c00, 0x00191919, 0x46004646, 0x97970097
+.long 0x9f9f9f00, 0x003f3f3f, 0xcf00cfcf, 0x5b5b005b
+.long 0x6e6e6e00, 0x00dcdcdc, 0x37003737, 0x95950095
+.long 0xbcbcbc00, 0x00797979, 0x5e005e5e, 0xffff00ff
+.long 0x8e8e8e00, 0x001d1d1d, 0x47004747, 0xd2d200d2
+.long 0x29292900, 0x00525252, 0x94009494, 0xc4c400c4
+.long 0xf5f5f500, 0x00ebebeb, 0xfa00fafa, 0x48480048
+.long 0xf9f9f900, 0x00f3f3f3, 0xfc00fcfc, 0xf7f700f7
+.long 0xb6b6b600, 0x006d6d6d, 0x5b005b5b, 0xdbdb00db
+.long 0x2f2f2f00, 0x005e5e5e, 0x97009797, 0x03030003
+.long 0xfdfdfd00, 0x00fbfbfb, 0xfe00fefe, 0xdada00da
+.long 0xb4b4b400, 0x00696969, 0x5a005a5a, 0x3f3f003f
+.long 0x59595900, 0x00b2b2b2, 0xac00acac, 0x94940094
+.long 0x78787800, 0x00f0f0f0, 0x3c003c3c, 0x5c5c005c
+.long 0x98989800, 0x00313131, 0x4c004c4c, 0x02020002
+.long 0x06060600, 0x000c0c0c, 0x03000303, 0x4a4a004a
+.long 0x6a6a6a00, 0x00d4d4d4, 0x35003535, 0x33330033
+.long 0xe7e7e700, 0x00cfcfcf, 0xf300f3f3, 0x67670067
+.long 0x46464600, 0x008c8c8c, 0x23002323, 0xf3f300f3
+.long 0x71717100, 0x00e2e2e2, 0xb800b8b8, 0x7f7f007f
+.long 0xbababa00, 0x00757575, 0x5d005d5d, 0xe2e200e2
+.long 0xd4d4d400, 0x00a9a9a9, 0x6a006a6a, 0x9b9b009b
+.long 0x25252500, 0x004a4a4a, 0x92009292, 0x26260026
+.long 0xababab00, 0x00575757, 0xd500d5d5, 0x37370037
+.long 0x42424200, 0x00848484, 0x21002121, 0x3b3b003b
+.long 0x88888800, 0x00111111, 0x44004444, 0x96960096
+.long 0xa2a2a200, 0x00454545, 0x51005151, 0x4b4b004b
+.long 0x8d8d8d00, 0x001b1b1b, 0xc600c6c6, 0xbebe00be
+.long 0xfafafa00, 0x00f5f5f5, 0x7d007d7d, 0x2e2e002e
+.long 0x72727200, 0x00e4e4e4, 0x39003939, 0x79790079
+.long 0x07070700, 0x000e0e0e, 0x83008383, 0x8c8c008c
+.long 0xb9b9b900, 0x00737373, 0xdc00dcdc, 0x6e6e006e
+.long 0x55555500, 0x00aaaaaa, 0xaa00aaaa, 0x8e8e008e
+.long 0xf8f8f800, 0x00f1f1f1, 0x7c007c7c, 0xf5f500f5
+.long 0xeeeeee00, 0x00dddddd, 0x77007777, 0xb6b600b6
+.long 0xacacac00, 0x00595959, 0x56005656, 0xfdfd00fd
+.long 0x0a0a0a00, 0x00141414, 0x05000505, 0x59590059
+.long 0x36363600, 0x006c6c6c, 0x1b001b1b, 0x98980098
+.long 0x49494900, 0x00929292, 0xa400a4a4, 0x6a6a006a
+.long 0x2a2a2a00, 0x00545454, 0x15001515, 0x46460046
+.long 0x68686800, 0x00d0d0d0, 0x34003434, 0xbaba00ba
+.long 0x3c3c3c00, 0x00787878, 0x1e001e1e, 0x25250025
+.long 0x38383800, 0x00707070, 0x1c001c1c, 0x42420042
+.long 0xf1f1f100, 0x00e3e3e3, 0xf800f8f8, 0xa2a200a2
+.long 0xa4a4a400, 0x00494949, 0x52005252, 0xfafa00fa
+.long 0x40404000, 0x00808080, 0x20002020, 0x07070007
+.long 0x28282800, 0x00505050, 0x14001414, 0x55550055
+.long 0xd3d3d300, 0x00a7a7a7, 0xe900e9e9, 0xeeee00ee
+.long 0x7b7b7b00, 0x00f6f6f6, 0xbd00bdbd, 0x0a0a000a
+.long 0xbbbbbb00, 0x00777777, 0xdd00dddd, 0x49490049
+.long 0xc9c9c900, 0x00939393, 0xe400e4e4, 0x68680068
+.long 0x43434300, 0x00868686, 0xa100a1a1, 0x38380038
+.long 0xc1c1c100, 0x00838383, 0xe000e0e0, 0xa4a400a4
+.long 0x15151500, 0x002a2a2a, 0x8a008a8a, 0x28280028
+.long 0xe3e3e300, 0x00c7c7c7, 0xf100f1f1, 0x7b7b007b
+.long 0xadadad00, 0x005b5b5b, 0xd600d6d6, 0xc9c900c9
+.long 0xf4f4f400, 0x00e9e9e9, 0x7a007a7a, 0xc1c100c1
+.long 0x77777700, 0x00eeeeee, 0xbb00bbbb, 0xe3e300e3
+.long 0xc7c7c700, 0x008f8f8f, 0xe300e3e3, 0xf4f400f4
+.long 0x80808000, 0x00010101, 0x40004040, 0xc7c700c7
+.long 0x9e9e9e00, 0x003d3d3d, 0x4f004f4f, 0x9e9e009e
+ELF(.size _gcry_camellia_arm_tables,.-_gcry_camellia_arm_tables;)
+
+#endif /*HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS*/
+#endif /*__AARCH64EL__*/
diff --git a/comm/third_party/libgcrypt/cipher/camellia-aesni-avx-amd64.S b/comm/third_party/libgcrypt/cipher/camellia-aesni-avx-amd64.S
new file mode 100644
index 0000000000..64cabaa51b
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/camellia-aesni-avx-amd64.S
@@ -0,0 +1,2618 @@
+/* camellia-avx-aesni-amd64.S - AES-NI/AVX implementation of Camellia cipher
+ *
+ * Copyright (C) 2013-2015,2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct CAMELLIA_context: */
+#define key_table 0
+#define key_bitlength CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
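+
+/* For reference: filter_8bit() computes, for every byte x of the input
+ * vector, lo_t[x & 0x0f] ^ hi_t[x >> 4], i.e. an 8-bit table lookup split
+ * into two 16-entry nibble lookups so that it can be done with vpshufb. */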
+
+/**********************************************************************
+ 16-way camellia
+ **********************************************************************/
+
+/*
+ * IN:
+ * x0..x7: byte-sliced AB state
+ * mem_cd: register pointer storing CD state
+ * key: index for key material
+ * OUT:
+ * x0..x7: new byte-sliced CD state
+ */
+#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
+ t7, mem_cd, key) \
+ /* \
+ * S-function with AES subbytes \
+ */ \
+ vmovdqa .Linv_shift_row rRIP, t4; \
+ vbroadcastss .L0f0f0f0f rRIP, t7; \
+ vmovdqa .Lpre_tf_lo_s1 rRIP, t0; \
+ vmovdqa .Lpre_tf_hi_s1 rRIP, t1; \
+ \
+ /* AES inverse shift rows */ \
+ vpshufb t4, x0, x0; \
+ vpshufb t4, x7, x7; \
+ vpshufb t4, x1, x1; \
+ vpshufb t4, x4, x4; \
+ vpshufb t4, x2, x2; \
+ vpshufb t4, x5, x5; \
+ vpshufb t4, x3, x3; \
+ vpshufb t4, x6, x6; \
+ \
+ /* prefilter sboxes 1, 2 and 3 */ \
+ vmovdqa .Lpre_tf_lo_s4 rRIP, t2; \
+ vmovdqa .Lpre_tf_hi_s4 rRIP, t3; \
+ filter_8bit(x0, t0, t1, t7, t6); \
+ filter_8bit(x7, t0, t1, t7, t6); \
+ filter_8bit(x1, t0, t1, t7, t6); \
+ filter_8bit(x4, t0, t1, t7, t6); \
+ filter_8bit(x2, t0, t1, t7, t6); \
+ filter_8bit(x5, t0, t1, t7, t6); \
+ \
+ /* prefilter sbox 4 */ \
+ vpxor t4, t4, t4; \
+ filter_8bit(x3, t2, t3, t7, t6); \
+ filter_8bit(x6, t2, t3, t7, t6); \
+ \
+ /* AES subbytes + AES shift rows */ \
+ vmovdqa .Lpost_tf_lo_s1 rRIP, t0; \
+ vmovdqa .Lpost_tf_hi_s1 rRIP, t1; \
+ vaesenclast t4, x0, x0; \
+ vaesenclast t4, x7, x7; \
+ vaesenclast t4, x1, x1; \
+ vaesenclast t4, x4, x4; \
+ vaesenclast t4, x2, x2; \
+ vaesenclast t4, x5, x5; \
+ vaesenclast t4, x3, x3; \
+ vaesenclast t4, x6, x6; \
+ \
+ /* postfilter sboxes 1 and 4 */ \
+ vmovdqa .Lpost_tf_lo_s3 rRIP, t2; \
+ vmovdqa .Lpost_tf_hi_s3 rRIP, t3; \
+ filter_8bit(x0, t0, t1, t7, t6); \
+ filter_8bit(x7, t0, t1, t7, t6); \
+ filter_8bit(x3, t0, t1, t7, t6); \
+ filter_8bit(x6, t0, t1, t7, t6); \
+ \
+ /* postfilter sbox 3 */ \
+ vmovdqa .Lpost_tf_lo_s2 rRIP, t4; \
+ vmovdqa .Lpost_tf_hi_s2 rRIP, t5; \
+ filter_8bit(x2, t2, t3, t7, t6); \
+ filter_8bit(x5, t2, t3, t7, t6); \
+ \
+ vpxor t6, t6, t6; \
+ vmovq key, t0; \
+ \
+ /* postfilter sbox 2 */ \
+ filter_8bit(x1, t4, t5, t7, t2); \
+ filter_8bit(x4, t4, t5, t7, t2); \
+ \
+ vpsrldq $5, t0, t5; \
+ vpsrldq $1, t0, t1; \
+ vpsrldq $2, t0, t2; \
+ vpsrldq $3, t0, t3; \
+ vpsrldq $4, t0, t4; \
+ vpshufb t6, t0, t0; \
+ vpshufb t6, t1, t1; \
+ vpshufb t6, t2, t2; \
+ vpshufb t6, t3, t3; \
+ vpshufb t6, t4, t4; \
+ vpsrldq $2, t5, t7; \
+ vpshufb t6, t7, t7; \
+ \
+ /* P-function */ \
+ vpxor x5, x0, x0; \
+ vpxor x6, x1, x1; \
+ vpxor x7, x2, x2; \
+ vpxor x4, x3, x3; \
+ \
+ vpxor x2, x4, x4; \
+ vpxor x3, x5, x5; \
+ vpxor x0, x6, x6; \
+ vpxor x1, x7, x7; \
+ \
+ vpxor x7, x0, x0; \
+ vpxor x4, x1, x1; \
+ vpxor x5, x2, x2; \
+ vpxor x6, x3, x3; \
+ \
+ vpxor x3, x4, x4; \
+ vpxor x0, x5, x5; \
+ vpxor x1, x6, x6; \
+ vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+ \
+ /* Add key material and result to CD (x becomes new CD) */ \
+ \
+ vpxor t3, x4, x4; \
+ vpxor 0 * 16(mem_cd), x4, x4; \
+ \
+ vpxor t2, x5, x5; \
+ vpxor 1 * 16(mem_cd), x5, x5; \
+ \
+ vpsrldq $1, t5, t3; \
+ vpshufb t6, t5, t5; \
+ vpshufb t6, t3, t6; \
+ \
+ vpxor t1, x6, x6; \
+ vpxor 2 * 16(mem_cd), x6, x6; \
+ \
+ vpxor t0, x7, x7; \
+ vpxor 3 * 16(mem_cd), x7, x7; \
+ \
+ vpxor t7, x0, x0; \
+ vpxor 4 * 16(mem_cd), x0, x0; \
+ \
+ vpxor t6, x1, x1; \
+ vpxor 5 * 16(mem_cd), x1, x1; \
+ \
+ vpxor t5, x2, x2; \
+ vpxor 6 * 16(mem_cd), x2, x2; \
+ \
+ vpxor t4, x3, x3; \
+ vpxor 7 * 16(mem_cd), x3, x3;
+
+/*
+ * IN/OUT:
+ * x0..x7: byte-sliced AB state preloaded
+ * mem_ab: byte-sliced AB state in memory
+ * mem_cd: byte-sliced CD state in memory
+ */
+#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
+ roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \
+ \
+ vmovdqu x4, 0 * 16(mem_cd); \
+ vmovdqu x5, 1 * 16(mem_cd); \
+ vmovdqu x6, 2 * 16(mem_cd); \
+ vmovdqu x7, 3 * 16(mem_cd); \
+ vmovdqu x0, 4 * 16(mem_cd); \
+ vmovdqu x1, 5 * 16(mem_cd); \
+ vmovdqu x2, 6 * 16(mem_cd); \
+ vmovdqu x3, 7 * 16(mem_cd); \
+ \
+ roundsm16(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \
+ \
+ store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
+
+#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
+
+#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
+ /* Store new AB state */ \
+ vmovdqu x0, 0 * 16(mem_ab); \
+ vmovdqu x1, 1 * 16(mem_ab); \
+ vmovdqu x2, 2 * 16(mem_ab); \
+ vmovdqu x3, 3 * 16(mem_ab); \
+ vmovdqu x4, 4 * 16(mem_ab); \
+ vmovdqu x5, 5 * 16(mem_ab); \
+ vmovdqu x6, 6 * 16(mem_ab); \
+ vmovdqu x7, 7 * 16(mem_ab);
+
+#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i) \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
+
+#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i) \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
+
+/*
+ * IN:
+ * v0..3: byte-sliced 32-bit integers
+ * OUT:
+ * v0..3: (IN <<< 1)
+ */
+#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
+ vpcmpgtb v0, zero, t0; \
+ vpaddb v0, v0, v0; \
+ vpabsb t0, t0; \
+ \
+ vpcmpgtb v1, zero, t1; \
+ vpaddb v1, v1, v1; \
+ vpabsb t1, t1; \
+ \
+ vpcmpgtb v2, zero, t2; \
+ vpaddb v2, v2, v2; \
+ vpabsb t2, t2; \
+ \
+ vpor t0, v1, v1; \
+ \
+ vpcmpgtb v3, zero, t0; \
+ vpaddb v3, v3, v3; \
+ vpabsb t0, t0; \
+ \
+ vpor t1, v2, v2; \
+ vpor t2, v3, v3; \
+ vpor t0, v0, v0;
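+
+/* For reference: the scalar equivalent of rol32_1_16 on each byte-sliced
+ * 32-bit word is (in << 1) | (in >> 31). Per byte slice, vpcmpgtb + vpabsb
+ * extract the old top bit as a carry, vpaddb shifts the slice left by one,
+ * and the carry is OR-ed into the next slice (wrapping from the last slice
+ * back to the first). */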
+
+/*
+ * IN:
+ * l: byte-sliced AB state in memory
+ * r: byte-sliced CD state in memory
+ * OUT:
+ * l0..l7: new byte-sliced AB state
+ */
+#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
+ tt1, tt2, tt3, kll, klr, krl, krr) \
+ /* \
+ * t0 = kll; \
+ * t0 &= ll; \
+ * lr ^= rol32(t0, 1); \
+ */ \
+ vpxor tt0, tt0, tt0; \
+ vmovd kll, t0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpand l0, t0, t0; \
+ vpand l1, t1, t1; \
+ vpand l2, t2, t2; \
+ vpand l3, t3, t3; \
+ \
+ rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+ \
+ vpxor l4, t0, l4; \
+ vmovdqu l4, 4 * 16(l); \
+ vpxor l5, t1, l5; \
+ vmovdqu l5, 5 * 16(l); \
+ vpxor l6, t2, l6; \
+ vmovdqu l6, 6 * 16(l); \
+ vpxor l7, t3, l7; \
+ vmovdqu l7, 7 * 16(l); \
+ \
+ /* \
+ * t2 = krr; \
+ * t2 |= rr; \
+ * rl ^= t2; \
+ */ \
+ \
+ vmovd krr, t0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpor 4 * 16(r), t0, t0; \
+ vpor 5 * 16(r), t1, t1; \
+ vpor 6 * 16(r), t2, t2; \
+ vpor 7 * 16(r), t3, t3; \
+ \
+ vpxor 0 * 16(r), t0, t0; \
+ vpxor 1 * 16(r), t1, t1; \
+ vpxor 2 * 16(r), t2, t2; \
+ vpxor 3 * 16(r), t3, t3; \
+ vmovdqu t0, 0 * 16(r); \
+ vmovdqu t1, 1 * 16(r); \
+ vmovdqu t2, 2 * 16(r); \
+ vmovdqu t3, 3 * 16(r); \
+ \
+ /* \
+ * t2 = krl; \
+ * t2 &= rl; \
+ * rr ^= rol32(t2, 1); \
+ */ \
+ vmovd krl, t0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpand 0 * 16(r), t0, t0; \
+ vpand 1 * 16(r), t1, t1; \
+ vpand 2 * 16(r), t2, t2; \
+ vpand 3 * 16(r), t3, t3; \
+ \
+ rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+ \
+ vpxor 4 * 16(r), t0, t0; \
+ vpxor 5 * 16(r), t1, t1; \
+ vpxor 6 * 16(r), t2, t2; \
+ vpxor 7 * 16(r), t3, t3; \
+ vmovdqu t0, 4 * 16(r); \
+ vmovdqu t1, 5 * 16(r); \
+ vmovdqu t2, 6 * 16(r); \
+ vmovdqu t3, 7 * 16(r); \
+ \
+ /* \
+ * t0 = klr; \
+ * t0 |= lr; \
+ * ll ^= t0; \
+ */ \
+ \
+ vmovd klr, t0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpor l4, t0, t0; \
+ vpor l5, t1, t1; \
+ vpor l6, t2, t2; \
+ vpor l7, t3, t3; \
+ \
+ vpxor l0, t0, l0; \
+ vmovdqu l0, 0 * 16(l); \
+ vpxor l1, t1, l1; \
+ vmovdqu l1, 1 * 16(l); \
+ vpxor l2, t2, l2; \
+ vmovdqu l2, 2 * 16(l); \
+ vpxor l3, t3, l3; \
+ vmovdqu l3, 3 * 16(l);
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
+ a3, b3, c3, d3, st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vmovdqu .Lshufb_16x16b rRIP, a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(a0, b0, c0, d0, d2, d3); \
+ transpose_4x4(a1, b1, c1, d1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(a2, b2, c2, d2, b0, b1); \
+ transpose_4x4(a3, b3, c3, d3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+#define transpose_8x8b(a, b, c, d, e, f, g, h, t0, t1, t2, t3, t4) \
+ vpunpcklbw a, b, t0; \
+ vpunpckhbw a, b, b; \
+ \
+ vpunpcklbw c, d, t1; \
+ vpunpckhbw c, d, d; \
+ \
+ vpunpcklbw e, f, t2; \
+ vpunpckhbw e, f, f; \
+ \
+ vpunpcklbw g, h, t3; \
+ vpunpckhbw g, h, h; \
+ \
+ vpunpcklwd t0, t1, g; \
+ vpunpckhwd t0, t1, t0; \
+ \
+ vpunpcklwd b, d, t1; \
+ vpunpckhwd b, d, e; \
+ \
+ vpunpcklwd t2, t3, c; \
+ vpunpckhwd t2, t3, t2; \
+ \
+ vpunpcklwd f, h, t3; \
+ vpunpckhwd f, h, b; \
+ \
+ vpunpcklwd e, b, t4; \
+ vpunpckhwd e, b, b; \
+ \
+ vpunpcklwd t1, t3, e; \
+ vpunpckhwd t1, t3, f; \
+ \
+ vmovdqa .Ltranspose_8x8_shuf rRIP, t3; \
+ \
+ vpunpcklwd g, c, d; \
+ vpunpckhwd g, c, c; \
+ \
+ vpunpcklwd t0, t2, t1; \
+ vpunpckhwd t0, t2, h; \
+ \
+ vpunpckhqdq b, h, a; \
+ vpshufb t3, a, a; \
+ vpunpcklqdq b, h, b; \
+ vpshufb t3, b, b; \
+ \
+ vpunpckhqdq e, d, g; \
+ vpshufb t3, g, g; \
+ vpunpcklqdq e, d, h; \
+ vpshufb t3, h, h; \
+ \
+ vpunpckhqdq f, c, e; \
+ vpshufb t3, e, e; \
+ vpunpcklqdq f, c, f; \
+ vpshufb t3, f, f; \
+ \
+ vpunpckhqdq t4, t1, c; \
+ vpshufb t3, c, c; \
+ vpunpcklqdq t4, t1, d; \
+ vpshufb t3, d, d;
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, rio, key) \
+ vmovq key, x0; \
+ vpshufb .Lpack_bswap rRIP, x0, x0; \
+ \
+ vpxor 0 * 16(rio), x0, y7; \
+ vpxor 1 * 16(rio), x0, y6; \
+ vpxor 2 * 16(rio), x0, y5; \
+ vpxor 3 * 16(rio), x0, y4; \
+ vpxor 4 * 16(rio), x0, y3; \
+ vpxor 5 * 16(rio), x0, y2; \
+ vpxor 6 * 16(rio), x0, y1; \
+ vpxor 7 * 16(rio), x0, y0; \
+ vpxor 8 * 16(rio), x0, x7; \
+ vpxor 9 * 16(rio), x0, x6; \
+ vpxor 10 * 16(rio), x0, x5; \
+ vpxor 11 * 16(rio), x0, x4; \
+ vpxor 12 * 16(rio), x0, x3; \
+ vpxor 13 * 16(rio), x0, x2; \
+ vpxor 14 * 16(rio), x0, x1; \
+ vpxor 15 * 16(rio), x0, x0;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd) \
+ byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
+ y4, y5, y6, y7, (mem_ab), (mem_cd)); \
+ \
+ vmovdqu x0, 0 * 16(mem_ab); \
+ vmovdqu x1, 1 * 16(mem_ab); \
+ vmovdqu x2, 2 * 16(mem_ab); \
+ vmovdqu x3, 3 * 16(mem_ab); \
+ vmovdqu x4, 4 * 16(mem_ab); \
+ vmovdqu x5, 5 * 16(mem_ab); \
+ vmovdqu x6, 6 * 16(mem_ab); \
+ vmovdqu x7, 7 * 16(mem_ab); \
+ vmovdqu y0, 0 * 16(mem_cd); \
+ vmovdqu y1, 1 * 16(mem_cd); \
+ vmovdqu y2, 2 * 16(mem_cd); \
+ vmovdqu y3, 3 * 16(mem_cd); \
+ vmovdqu y4, 4 * 16(mem_cd); \
+ vmovdqu y5, 5 * 16(mem_cd); \
+ vmovdqu y6, 6 * 16(mem_cd); \
+ vmovdqu y7, 7 * 16(mem_cd);
+
+/* de-byteslice, apply post-whitening and store blocks */
+#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+ y5, y6, y7, key, stack_tmp0, stack_tmp1) \
+ byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
+ y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
+ \
+ vmovdqu x0, stack_tmp0; \
+ \
+ vmovq key, x0; \
+ vpshufb .Lpack_bswap rRIP, x0, x0; \
+ \
+ vpxor x0, y7, y7; \
+ vpxor x0, y6, y6; \
+ vpxor x0, y5, y5; \
+ vpxor x0, y4, y4; \
+ vpxor x0, y3, y3; \
+ vpxor x0, y2, y2; \
+ vpxor x0, y1, y1; \
+ vpxor x0, y0, y0; \
+ vpxor x0, x7, x7; \
+ vpxor x0, x6, x6; \
+ vpxor x0, x5, x5; \
+ vpxor x0, x4, x4; \
+ vpxor x0, x3, x3; \
+ vpxor x0, x2, x2; \
+ vpxor x0, x1, x1; \
+ vpxor stack_tmp0, x0, x0;
+
+#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, rio) \
+ vmovdqu x0, 0 * 16(rio); \
+ vmovdqu x1, 1 * 16(rio); \
+ vmovdqu x2, 2 * 16(rio); \
+ vmovdqu x3, 3 * 16(rio); \
+ vmovdqu x4, 4 * 16(rio); \
+ vmovdqu x5, 5 * 16(rio); \
+ vmovdqu x6, 6 * 16(rio); \
+ vmovdqu x7, 7 * 16(rio); \
+ vmovdqu y0, 8 * 16(rio); \
+ vmovdqu y1, 9 * 16(rio); \
+ vmovdqu y2, 10 * 16(rio); \
+ vmovdqu y3, 11 * 16(rio); \
+ vmovdqu y4, 12 * 16(rio); \
+ vmovdqu y5, 13 * 16(rio); \
+ vmovdqu y6, 14 * 16(rio); \
+ vmovdqu y7, 15 * 16(rio);
+
+.text
+.align 16
+
+#define SHUFB_BYTES(idx) \
+ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+.Lshufb_16x16b:
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
+
+.Lpack_bswap:
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x80808080
+ .long 0x80808080
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox1, sbox2, sbox3:
+ * swap_bitendianness(
+ * isom_map_camellia_to_aes(
+ * camellia_f(
+ * swap_bitendianness(in)
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s1:
+ .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
+ .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
+.Lpre_tf_hi_s1:
+ .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
+ .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox4:
+ * swap_bitendianness(
+ * isom_map_camellia_to_aes(
+ * camellia_f(
+ * swap_bitendianness(in <<< 1)
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s4:
+ .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
+ .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
+.Lpre_tf_hi_s4:
+ .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
+ .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox1, sbox4:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s1:
+ .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
+ .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
+.Lpost_tf_hi_s1:
+ .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
+ .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox2:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * ) <<< 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s2:
+ .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
+ .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
+.Lpost_tf_hi_s2:
+ .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
+ .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox3:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * ) >>> 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s3:
+ .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
+ .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
+.Lpost_tf_hi_s3:
+ .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
+ .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
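+
+/* For reference: vaesenclast performs ShiftRows, SubBytes and AddRoundKey.
+ * Shuffling the input with this inverse ShiftRows mask beforehand and using
+ * an all-zero round key leaves only the SubBytes step, which is how the
+ * round macros above reuse the AES S-box for Camellia's S-functions. */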
+
+/* shuffle mask for 8x8 byte transpose */
+.Ltranspose_8x8_shuf:
+ .byte 0, 1, 4, 5, 2, 3, 6, 7, 8+0, 8+1, 8+4, 8+5, 8+2, 8+3, 8+6, 8+7
+
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+
+.align 8
+ELF(.type __camellia_enc_blk16,@function;)
+
+__camellia_enc_blk16:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rax: temporary storage, 256 bytes
+ * %r8d: 24 for 16 byte key, 32 for larger
+ * %xmm0..%xmm15: 16 plaintext blocks
+ * output:
+ * %xmm0..%xmm15: 16 encrypted blocks, order swapped:
+ * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ */
+ CFI_STARTPROC();
+
+ leaq 8 * 16(%rax), %rcx;
+
+ leaq (-8 * 8)(CTX, %r8, 8), %r8;
+
+ inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx);
+
+.align 8
+.Lenc_loop:
+ enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 0);
+
+ cmpq %r8, CTX;
+ je .Lenc_done;
+ leaq (8 * 8)(CTX), CTX;
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table) + 0)(CTX),
+ ((key_table) + 4)(CTX),
+ ((key_table) + 8)(CTX),
+ ((key_table) + 12)(CTX));
+ jmp .Lenc_loop;
+
+.align 8
+.Lenc_done:
+ /* load CD for output */
+ vmovdqu 0 * 16(%rcx), %xmm8;
+ vmovdqu 1 * 16(%rcx), %xmm9;
+ vmovdqu 2 * 16(%rcx), %xmm10;
+ vmovdqu 3 * 16(%rcx), %xmm11;
+ vmovdqu 4 * 16(%rcx), %xmm12;
+ vmovdqu 5 * 16(%rcx), %xmm13;
+ vmovdqu 6 * 16(%rcx), %xmm14;
+ vmovdqu 7 * 16(%rcx), %xmm15;
+
+ outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, ((key_table) + 8 * 8)(%r8), (%rax), 1 * 16(%rax));
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size __camellia_enc_blk16,.-__camellia_enc_blk16;)
+
+.align 8
+ELF(.type __camellia_dec_blk16,@function;)
+
+__camellia_dec_blk16:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rax: temporary storage, 256 bytes
+ * %r8d: 24 for 16 byte key, 32 for larger
+ * %xmm0..%xmm15: 16 encrypted blocks
+ * output:
+ * %xmm0..%xmm15: 16 plaintext blocks, order swapped:
+ * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ */
+ CFI_STARTPROC();
+
+ movq %r8, %rcx;
+ movq CTX, %r8
+ leaq (-8 * 8)(CTX, %rcx, 8), CTX;
+
+ leaq 8 * 16(%rax), %rcx;
+
+ inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx);
+
+.align 8
+.Ldec_loop:
+ dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 0);
+
+ cmpq %r8, CTX;
+ je .Ldec_done;
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table) + 8)(CTX),
+ ((key_table) + 12)(CTX),
+ ((key_table) + 0)(CTX),
+ ((key_table) + 4)(CTX));
+
+ leaq (-8 * 8)(CTX), CTX;
+ jmp .Ldec_loop;
+
+.align 8
+.Ldec_done:
+ /* load CD for output */
+ vmovdqu 0 * 16(%rcx), %xmm8;
+ vmovdqu 1 * 16(%rcx), %xmm9;
+ vmovdqu 2 * 16(%rcx), %xmm10;
+ vmovdqu 3 * 16(%rcx), %xmm11;
+ vmovdqu 4 * 16(%rcx), %xmm12;
+ vmovdqu 5 * 16(%rcx), %xmm13;
+ vmovdqu 6 * 16(%rcx), %xmm14;
+ vmovdqu 7 * 16(%rcx), %xmm15;
+
+ outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size __camellia_dec_blk16,.-__camellia_dec_blk16;)
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
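+
+/* For reference: inc_le128 adds one to a 128-bit little-endian counter held
+ * in an xmm register; the scalar form is
+ *   lo += 1;  if (lo == 0) hi += 1;
+ * 'minus_one' is expected to hold -1 in the low quadword and 0 in the high
+ * quadword; the vpcmpeqq/vpslldq/vpsubq sequence propagates the carry into
+ * the high quadword without a branch. */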
+
+.align 8
+.globl _gcry_camellia_aesni_avx_ctr_enc
+ELF(.type _gcry_camellia_aesni_avx_ctr_enc,@function;)
+
+_gcry_camellia_aesni_avx_ctr_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (big endian, 128bit)
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ vzeroupper;
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ subq $(16 * 16), %rsp;
+ andq $~31, %rsp;
+ movq %rsp, %rax;
+
+ vmovdqa .Lbswap128_mask rRIP, %xmm14;
+
+ /* load IV and byteswap */
+ vmovdqu (%rcx), %xmm15;
+ vmovdqu %xmm15, 15 * 16(%rax);
+ vpshufb %xmm14, %xmm15, %xmm0; /* be => le */
+
+ vpcmpeqd %xmm15, %xmm15, %xmm15;
+ vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */
+
+ /* construct IVs */
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm13;
+ vmovdqu %xmm13, 14 * 16(%rax);
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm13;
+ vmovdqu %xmm13, 13 * 16(%rax);
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm12;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm11;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm10;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm9;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm8;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm7;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm6;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm5;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm4;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm3;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm2;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm1;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vmovdqa %xmm0, %xmm13;
+ vpshufb %xmm14, %xmm0, %xmm0;
+ inc_le128(%xmm13, %xmm15, %xmm14);
+ vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; /* le => be */
+ vmovdqu %xmm13, (%rcx);
+
+ /* inpack16_pre: */
+ vmovq (key_table)(CTX), %xmm15;
+ vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15;
+ vpxor %xmm0, %xmm15, %xmm0;
+ vpxor %xmm1, %xmm15, %xmm1;
+ vpxor %xmm2, %xmm15, %xmm2;
+ vpxor %xmm3, %xmm15, %xmm3;
+ vpxor %xmm4, %xmm15, %xmm4;
+ vpxor %xmm5, %xmm15, %xmm5;
+ vpxor %xmm6, %xmm15, %xmm6;
+ vpxor %xmm7, %xmm15, %xmm7;
+ vpxor %xmm8, %xmm15, %xmm8;
+ vpxor %xmm9, %xmm15, %xmm9;
+ vpxor %xmm10, %xmm15, %xmm10;
+ vpxor %xmm11, %xmm15, %xmm11;
+ vpxor %xmm12, %xmm15, %xmm12;
+ vpxor 13 * 16(%rax), %xmm15, %xmm13;
+ vpxor 14 * 16(%rax), %xmm15, %xmm14;
+ vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+ call __camellia_enc_blk16;
+
+ vpxor 0 * 16(%rdx), %xmm7, %xmm7;
+ vpxor 1 * 16(%rdx), %xmm6, %xmm6;
+ vpxor 2 * 16(%rdx), %xmm5, %xmm5;
+ vpxor 3 * 16(%rdx), %xmm4, %xmm4;
+ vpxor 4 * 16(%rdx), %xmm3, %xmm3;
+ vpxor 5 * 16(%rdx), %xmm2, %xmm2;
+ vpxor 6 * 16(%rdx), %xmm1, %xmm1;
+ vpxor 7 * 16(%rdx), %xmm0, %xmm0;
+ vpxor 8 * 16(%rdx), %xmm15, %xmm15;
+ vpxor 9 * 16(%rdx), %xmm14, %xmm14;
+ vpxor 10 * 16(%rdx), %xmm13, %xmm13;
+ vpxor 11 * 16(%rdx), %xmm12, %xmm12;
+ vpxor 12 * 16(%rdx), %xmm11, %xmm11;
+ vpxor 13 * 16(%rdx), %xmm10, %xmm10;
+ vpxor 14 * 16(%rdx), %xmm9, %xmm9;
+ vpxor 15 * 16(%rdx), %xmm8, %xmm8;
+
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ vzeroall;
+
+ leave;
+ CFI_LEAVE();
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;)
+
+.align 8
+.globl _gcry_camellia_aesni_avx_cbc_dec
+ELF(.type _gcry_camellia_aesni_avx_cbc_dec,@function;)
+
+_gcry_camellia_aesni_avx_cbc_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ vzeroupper;
+
+ movq %rcx, %r9;
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rdx, (key_table)(CTX, %r8, 8));
+
+ subq $(16 * 16), %rsp;
+ andq $~31, %rsp;
+ movq %rsp, %rax;
+
+ call __camellia_dec_blk16;
+
+ /* XOR output with IV */
+ vpxor (%r9), %xmm7, %xmm7;
+ vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
+ vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
+ vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
+ vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
+ vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
+ vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
+ vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
+ vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
+ vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
+ vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
+ vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
+ vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
+ vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
+ vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
+ vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
+ movq (15 * 16 + 0)(%rdx), %r10;
+ movq (15 * 16 + 8)(%rdx), %r11;
+
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ /* store new IV */
+ movq %r10, (0)(%r9);
+ movq %r11, (8)(%r9);
+
+ vzeroall;
+
+ leave;
+ CFI_LEAVE();
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;)
+
+.align 8
+.globl _gcry_camellia_aesni_avx_cfb_dec
+ELF(.type _gcry_camellia_aesni_avx_cfb_dec,@function;)
+
+_gcry_camellia_aesni_avx_cfb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ vzeroupper;
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ subq $(16 * 16), %rsp;
+ andq $~31, %rsp;
+ movq %rsp, %rax;
+
+ /* inpack16_pre: */
+ vmovq (key_table)(CTX), %xmm0;
+ vpshufb .Lpack_bswap rRIP, %xmm0, %xmm0;
+ vpxor (%rcx), %xmm0, %xmm15;
+ vmovdqu 15 * 16(%rdx), %xmm1;
+ vmovdqu %xmm1, (%rcx); /* store new IV */
+ vpxor 0 * 16(%rdx), %xmm0, %xmm14;
+ vpxor 1 * 16(%rdx), %xmm0, %xmm13;
+ vpxor 2 * 16(%rdx), %xmm0, %xmm12;
+ vpxor 3 * 16(%rdx), %xmm0, %xmm11;
+ vpxor 4 * 16(%rdx), %xmm0, %xmm10;
+ vpxor 5 * 16(%rdx), %xmm0, %xmm9;
+ vpxor 6 * 16(%rdx), %xmm0, %xmm8;
+ vpxor 7 * 16(%rdx), %xmm0, %xmm7;
+ vpxor 8 * 16(%rdx), %xmm0, %xmm6;
+ vpxor 9 * 16(%rdx), %xmm0, %xmm5;
+ vpxor 10 * 16(%rdx), %xmm0, %xmm4;
+ vpxor 11 * 16(%rdx), %xmm0, %xmm3;
+ vpxor 12 * 16(%rdx), %xmm0, %xmm2;
+ vpxor 13 * 16(%rdx), %xmm0, %xmm1;
+ vpxor 14 * 16(%rdx), %xmm0, %xmm0;
+
+ call __camellia_enc_blk16;
+
+ vpxor 0 * 16(%rdx), %xmm7, %xmm7;
+ vpxor 1 * 16(%rdx), %xmm6, %xmm6;
+ vpxor 2 * 16(%rdx), %xmm5, %xmm5;
+ vpxor 3 * 16(%rdx), %xmm4, %xmm4;
+ vpxor 4 * 16(%rdx), %xmm3, %xmm3;
+ vpxor 5 * 16(%rdx), %xmm2, %xmm2;
+ vpxor 6 * 16(%rdx), %xmm1, %xmm1;
+ vpxor 7 * 16(%rdx), %xmm0, %xmm0;
+ vpxor 8 * 16(%rdx), %xmm15, %xmm15;
+ vpxor 9 * 16(%rdx), %xmm14, %xmm14;
+ vpxor 10 * 16(%rdx), %xmm13, %xmm13;
+ vpxor 11 * 16(%rdx), %xmm12, %xmm12;
+ vpxor 12 * 16(%rdx), %xmm11, %xmm11;
+ vpxor 13 * 16(%rdx), %xmm10, %xmm10;
+ vpxor 14 * 16(%rdx), %xmm9, %xmm9;
+ vpxor 15 * 16(%rdx), %xmm8, %xmm8;
+
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ vzeroall;
+
+ leave;
+ CFI_LEAVE();
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;)
+
+.align 8
+.globl _gcry_camellia_aesni_avx_ocb_enc
+ELF(.type _gcry_camellia_aesni_avx_ocb_enc,@function;)
+
+_gcry_camellia_aesni_avx_ocb_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[16])
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ vzeroupper;
+
+ subq $(16 * 16 + 4 * 8), %rsp;
+ andq $~31, %rsp;
+ movq %rsp, %rax;
+
+ movq %r10, (16 * 16 + 0 * 8)(%rsp);
+ movq %r11, (16 * 16 + 1 * 8)(%rsp);
+ movq %r12, (16 * 16 + 2 * 8)(%rsp);
+ movq %r13, (16 * 16 + 3 * 8)(%rsp);
+ CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8);
+ CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8);
+ CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8);
+ CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8);
+
+ vmovdqu (%rcx), %xmm14;
+ vmovdqu (%r8), %xmm15;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
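+
+ /* For reference, the per-block form of the OCB_INPUT macro below
+  * (names illustrative):
+  *   offset   ^= L[ntz(i)];      // xmm14
+  *   checksum ^= P[i];           // xmm15
+  *   x[i]      = P[i] ^ offset;  // block fed to the cipher
+  *   dst[i]    = offset;         // xored with E_K(x[i]) after encryption
+  */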
+
+#define OCB_INPUT(n, lreg, xreg) \
+ vmovdqu (n * 16)(%rdx), xreg; \
+ vpxor (lreg), %xmm14, %xmm14; \
+ vpxor xreg, %xmm15, %xmm15; \
+ vpxor xreg, %xmm14, xreg; \
+ vmovdqu %xmm14, (n * 16)(%rsi);
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %xmm0);
+ vmovdqu %xmm0, (15 * 16)(%rax);
+ OCB_INPUT(1, %r11, %xmm0);
+ vmovdqu %xmm0, (14 * 16)(%rax);
+ OCB_INPUT(2, %r12, %xmm13);
+ OCB_INPUT(3, %r13, %xmm12);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %xmm11);
+ OCB_INPUT(5, %r11, %xmm10);
+ OCB_INPUT(6, %r12, %xmm9);
+ OCB_INPUT(7, %r13, %xmm8);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(8, %r10, %xmm7);
+ OCB_INPUT(9, %r11, %xmm6);
+ OCB_INPUT(10, %r12, %xmm5);
+ OCB_INPUT(11, %r13, %xmm4);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(12, %r10, %xmm3);
+ OCB_INPUT(13, %r11, %xmm2);
+ OCB_INPUT(14, %r12, %xmm1);
+ OCB_INPUT(15, %r13, %xmm0);
+#undef OCB_INPUT
+
+ vmovdqu %xmm14, (%rcx);
+ vmovdqu %xmm15, (%r8);
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %r10d;
+ cmovel %r10d, %r8d; /* max */
+
+ /* inpack16_pre: */
+ vmovq (key_table)(CTX), %xmm15;
+ vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15;
+ vpxor %xmm0, %xmm15, %xmm0;
+ vpxor %xmm1, %xmm15, %xmm1;
+ vpxor %xmm2, %xmm15, %xmm2;
+ vpxor %xmm3, %xmm15, %xmm3;
+ vpxor %xmm4, %xmm15, %xmm4;
+ vpxor %xmm5, %xmm15, %xmm5;
+ vpxor %xmm6, %xmm15, %xmm6;
+ vpxor %xmm7, %xmm15, %xmm7;
+ vpxor %xmm8, %xmm15, %xmm8;
+ vpxor %xmm9, %xmm15, %xmm9;
+ vpxor %xmm10, %xmm15, %xmm10;
+ vpxor %xmm11, %xmm15, %xmm11;
+ vpxor %xmm12, %xmm15, %xmm12;
+ vpxor %xmm13, %xmm15, %xmm13;
+ vpxor 14 * 16(%rax), %xmm15, %xmm14;
+ vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+ call __camellia_enc_blk16;
+
+ vpxor 0 * 16(%rsi), %xmm7, %xmm7;
+ vpxor 1 * 16(%rsi), %xmm6, %xmm6;
+ vpxor 2 * 16(%rsi), %xmm5, %xmm5;
+ vpxor 3 * 16(%rsi), %xmm4, %xmm4;
+ vpxor 4 * 16(%rsi), %xmm3, %xmm3;
+ vpxor 5 * 16(%rsi), %xmm2, %xmm2;
+ vpxor 6 * 16(%rsi), %xmm1, %xmm1;
+ vpxor 7 * 16(%rsi), %xmm0, %xmm0;
+ vpxor 8 * 16(%rsi), %xmm15, %xmm15;
+ vpxor 9 * 16(%rsi), %xmm14, %xmm14;
+ vpxor 10 * 16(%rsi), %xmm13, %xmm13;
+ vpxor 11 * 16(%rsi), %xmm12, %xmm12;
+ vpxor 12 * 16(%rsi), %xmm11, %xmm11;
+ vpxor 13 * 16(%rsi), %xmm10, %xmm10;
+ vpxor 14 * 16(%rsi), %xmm9, %xmm9;
+ vpxor 15 * 16(%rsi), %xmm8, %xmm8;
+
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ vzeroall;
+
+ movq (16 * 16 + 0 * 8)(%rsp), %r10;
+ movq (16 * 16 + 1 * 8)(%rsp), %r11;
+ movq (16 * 16 + 2 * 8)(%rsp), %r12;
+ movq (16 * 16 + 3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ leave;
+ CFI_LEAVE();
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx_ocb_enc,.-_gcry_camellia_aesni_avx_ocb_enc;)
+
+.align 8
+.globl _gcry_camellia_aesni_avx_ocb_dec
+ELF(.type _gcry_camellia_aesni_avx_ocb_dec,@function;)
+
+_gcry_camellia_aesni_avx_ocb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[16])
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ vzeroupper;
+
+ subq $(16 * 16 + 4 * 8), %rsp;
+ andq $~31, %rsp;
+ movq %rsp, %rax;
+
+ movq %r10, (16 * 16 + 0 * 8)(%rsp);
+ movq %r11, (16 * 16 + 1 * 8)(%rsp);
+ movq %r12, (16 * 16 + 2 * 8)(%rsp);
+ movq %r13, (16 * 16 + 3 * 8)(%rsp);
+ CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8);
+ CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8);
+ CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8);
+ CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8);
+
+ vmovdqu (%rcx), %xmm15;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+
+#define OCB_INPUT(n, lreg, xreg) \
+ vmovdqu (n * 16)(%rdx), xreg; \
+ vpxor (lreg), %xmm15, %xmm15; \
+ vpxor xreg, %xmm15, xreg; \
+ vmovdqu %xmm15, (n * 16)(%rsi);
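+ /* OCB_INPUT(n, lreg, xreg): load ciphertext block n, advance the
+ * running offset in %xmm15 by the L value at lreg, mask the block
+ * with the new offset, and park Offset_n at dst[n] so it can be
+ * xored back in after __camellia_dec_blk16 to finish
+ * P_n = Offset_n xor DECIPHER(K, C_n xor Offset_n). */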
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %xmm0);
+ vmovdqu %xmm0, (15 * 16)(%rax);
+ OCB_INPUT(1, %r11, %xmm14);
+ OCB_INPUT(2, %r12, %xmm13);
+ OCB_INPUT(3, %r13, %xmm12);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %xmm11);
+ OCB_INPUT(5, %r11, %xmm10);
+ OCB_INPUT(6, %r12, %xmm9);
+ OCB_INPUT(7, %r13, %xmm8);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(8, %r10, %xmm7);
+ OCB_INPUT(9, %r11, %xmm6);
+ OCB_INPUT(10, %r12, %xmm5);
+ OCB_INPUT(11, %r13, %xmm4);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(12, %r10, %xmm3);
+ OCB_INPUT(13, %r11, %xmm2);
+ OCB_INPUT(14, %r12, %xmm1);
+ OCB_INPUT(15, %r13, %xmm0);
+#undef OCB_INPUT
+
+ vmovdqu %xmm15, (%rcx);
+
+ movq %r8, %r10;
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %r9d;
+ cmovel %r9d, %r8d; /* max */
+
+ /* inpack16_pre: */
+ vmovq (key_table)(CTX, %r8, 8), %xmm15;
+ vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15;
+ vpxor %xmm0, %xmm15, %xmm0;
+ vpxor %xmm1, %xmm15, %xmm1;
+ vpxor %xmm2, %xmm15, %xmm2;
+ vpxor %xmm3, %xmm15, %xmm3;
+ vpxor %xmm4, %xmm15, %xmm4;
+ vpxor %xmm5, %xmm15, %xmm5;
+ vpxor %xmm6, %xmm15, %xmm6;
+ vpxor %xmm7, %xmm15, %xmm7;
+ vpxor %xmm8, %xmm15, %xmm8;
+ vpxor %xmm9, %xmm15, %xmm9;
+ vpxor %xmm10, %xmm15, %xmm10;
+ vpxor %xmm11, %xmm15, %xmm11;
+ vpxor %xmm12, %xmm15, %xmm12;
+ vpxor %xmm13, %xmm15, %xmm13;
+ vpxor %xmm14, %xmm15, %xmm14;
+ vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+ call __camellia_dec_blk16;
+
+ vpxor 0 * 16(%rsi), %xmm7, %xmm7;
+ vpxor 1 * 16(%rsi), %xmm6, %xmm6;
+ vpxor 2 * 16(%rsi), %xmm5, %xmm5;
+ vpxor 3 * 16(%rsi), %xmm4, %xmm4;
+ vpxor 4 * 16(%rsi), %xmm3, %xmm3;
+ vpxor 5 * 16(%rsi), %xmm2, %xmm2;
+ vpxor 6 * 16(%rsi), %xmm1, %xmm1;
+ vpxor 7 * 16(%rsi), %xmm0, %xmm0;
+ vmovdqu %xmm7, (7 * 16)(%rax);
+ vpxor 8 * 16(%rsi), %xmm15, %xmm15;
+ vpxor 9 * 16(%rsi), %xmm14, %xmm14;
+ vpxor 10 * 16(%rsi), %xmm13, %xmm13;
+ vpxor 11 * 16(%rsi), %xmm12, %xmm12;
+ vpxor 12 * 16(%rsi), %xmm11, %xmm11;
+ vpxor 13 * 16(%rsi), %xmm10, %xmm10;
+ vpxor 14 * 16(%rsi), %xmm9, %xmm9;
+ vpxor 15 * 16(%rsi), %xmm8, %xmm8;
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+
+ vpxor (%r10), %xmm7, %xmm7;
+ vpxor %xmm6, %xmm7, %xmm7;
+ vpxor %xmm5, %xmm7, %xmm7;
+ vpxor %xmm4, %xmm7, %xmm7;
+ vpxor %xmm3, %xmm7, %xmm7;
+ vpxor %xmm2, %xmm7, %xmm7;
+ vpxor %xmm1, %xmm7, %xmm7;
+ vpxor %xmm0, %xmm7, %xmm7;
+ vpxor %xmm15, %xmm7, %xmm7;
+ vpxor %xmm14, %xmm7, %xmm7;
+ vpxor %xmm13, %xmm7, %xmm7;
+ vpxor %xmm12, %xmm7, %xmm7;
+ vpxor %xmm11, %xmm7, %xmm7;
+ vpxor %xmm10, %xmm7, %xmm7;
+ vpxor %xmm9, %xmm7, %xmm7;
+ vpxor %xmm8, %xmm7, %xmm7;
+ vmovdqu %xmm7, (%r10);
+ vmovdqu (7 * 16)(%rax), %xmm7;
+
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ vzeroall;
+
+ movq (16 * 16 + 0 * 8)(%rsp), %r10;
+ movq (16 * 16 + 1 * 8)(%rsp), %r11;
+ movq (16 * 16 + 2 * 8)(%rsp), %r12;
+ movq (16 * 16 + 3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ leave;
+ CFI_LEAVE();
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx_ocb_dec,.-_gcry_camellia_aesni_avx_ocb_dec;)
+
+.align 8
+.globl _gcry_camellia_aesni_avx_ocb_auth
+ELF(.type _gcry_camellia_aesni_avx_ocb_auth,@function;)
+
+_gcry_camellia_aesni_avx_ocb_auth:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: abuf (16 blocks)
+ * %rdx: offset
+ * %rcx: checksum
+ * %r8 : L pointers (void *L[16])
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ vzeroupper;
+
+ subq $(16 * 16 + 4 * 8), %rsp;
+ andq $~31, %rsp;
+ movq %rsp, %rax;
+
+ movq %r10, (16 * 16 + 0 * 8)(%rsp);
+ movq %r11, (16 * 16 + 1 * 8)(%rsp);
+ movq %r12, (16 * 16 + 2 * 8)(%rsp);
+ movq %r13, (16 * 16 + 3 * 8)(%rsp);
+ CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8);
+ CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8);
+ CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8);
+ CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8);
+
+ vmovdqu (%rdx), %xmm15;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+
+#define OCB_INPUT(n, lreg, xreg) \
+ vmovdqu (n * 16)(%rsi), xreg; \
+ vpxor (lreg), %xmm15, %xmm15; \
+ vpxor xreg, %xmm15, xreg;
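+ /* Unlike the encrypt/decrypt paths, OCB_INPUT keeps nothing per block
+ * here: only ENCIPHER(K, A_i xor Offset_i) is needed for the tag sum. */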
+
+ movq (0 * 8)(%r8), %r10;
+ movq (1 * 8)(%r8), %r11;
+ movq (2 * 8)(%r8), %r12;
+ movq (3 * 8)(%r8), %r13;
+ OCB_INPUT(0, %r10, %xmm0);
+ vmovdqu %xmm0, (15 * 16)(%rax);
+ OCB_INPUT(1, %r11, %xmm14);
+ OCB_INPUT(2, %r12, %xmm13);
+ OCB_INPUT(3, %r13, %xmm12);
+ movq (4 * 8)(%r8), %r10;
+ movq (5 * 8)(%r8), %r11;
+ movq (6 * 8)(%r8), %r12;
+ movq (7 * 8)(%r8), %r13;
+ OCB_INPUT(4, %r10, %xmm11);
+ OCB_INPUT(5, %r11, %xmm10);
+ OCB_INPUT(6, %r12, %xmm9);
+ OCB_INPUT(7, %r13, %xmm8);
+ movq (8 * 8)(%r8), %r10;
+ movq (9 * 8)(%r8), %r11;
+ movq (10 * 8)(%r8), %r12;
+ movq (11 * 8)(%r8), %r13;
+ OCB_INPUT(8, %r10, %xmm7);
+ OCB_INPUT(9, %r11, %xmm6);
+ OCB_INPUT(10, %r12, %xmm5);
+ OCB_INPUT(11, %r13, %xmm4);
+ movq (12 * 8)(%r8), %r10;
+ movq (13 * 8)(%r8), %r11;
+ movq (14 * 8)(%r8), %r12;
+ movq (15 * 8)(%r8), %r13;
+ OCB_INPUT(12, %r10, %xmm3);
+ OCB_INPUT(13, %r11, %xmm2);
+ OCB_INPUT(14, %r12, %xmm1);
+ OCB_INPUT(15, %r13, %xmm0);
+#undef OCB_INPUT
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %r10d;
+ cmovel %r10d, %r8d; /* max */
+
+ vmovdqu %xmm15, (%rdx);
+
+ movq %rcx, %r10;
+
+ /* inpack16_pre: */
+ vmovq (key_table)(CTX), %xmm15;
+ vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15;
+ vpxor %xmm0, %xmm15, %xmm0;
+ vpxor %xmm1, %xmm15, %xmm1;
+ vpxor %xmm2, %xmm15, %xmm2;
+ vpxor %xmm3, %xmm15, %xmm3;
+ vpxor %xmm4, %xmm15, %xmm4;
+ vpxor %xmm5, %xmm15, %xmm5;
+ vpxor %xmm6, %xmm15, %xmm6;
+ vpxor %xmm7, %xmm15, %xmm7;
+ vpxor %xmm8, %xmm15, %xmm8;
+ vpxor %xmm9, %xmm15, %xmm9;
+ vpxor %xmm10, %xmm15, %xmm10;
+ vpxor %xmm11, %xmm15, %xmm11;
+ vpxor %xmm12, %xmm15, %xmm12;
+ vpxor %xmm13, %xmm15, %xmm13;
+ vpxor %xmm14, %xmm15, %xmm14;
+ vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+ call __camellia_enc_blk16;
+
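+ /* Fold the 16 encrypted blocks together with a log2-depth xor tree,
+ * then xor the result into the running checksum (Sum) at (%r10). */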
+ vpxor %xmm7, %xmm6, %xmm6;
+ vpxor %xmm5, %xmm4, %xmm4;
+ vpxor %xmm3, %xmm2, %xmm2;
+ vpxor %xmm1, %xmm0, %xmm0;
+ vpxor %xmm15, %xmm14, %xmm14;
+ vpxor %xmm13, %xmm12, %xmm12;
+ vpxor %xmm11, %xmm10, %xmm10;
+ vpxor %xmm9, %xmm8, %xmm8;
+
+ vpxor %xmm6, %xmm4, %xmm4;
+ vpxor %xmm2, %xmm0, %xmm0;
+ vpxor %xmm14, %xmm12, %xmm12;
+ vpxor %xmm10, %xmm8, %xmm8;
+
+ vpxor %xmm4, %xmm0, %xmm0;
+ vpxor %xmm12, %xmm8, %xmm8;
+
+ vpxor %xmm0, %xmm8, %xmm0;
+ vpxor (%r10), %xmm0, %xmm0;
+ vmovdqu %xmm0, (%r10);
+
+ vzeroall;
+
+ movq (16 * 16 + 0 * 8)(%rsp), %r10;
+ movq (16 * 16 + 1 * 8)(%rsp), %r11;
+ movq (16 * 16 + 2 * 8)(%rsp), %r12;
+ movq (16 * 16 + 3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ leave;
+ CFI_LEAVE();
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;)
+
+/*
+ * IN:
+ * ab: 64-bit AB state
+ * cd: 64-bit CD state
+ */
+#define camellia_f(ab, x, t0, t1, t2, t3, t4, inv_shift_row, sbox4mask, \
+ _0f0f0f0fmask, pre_s1lo_mask, pre_s1hi_mask, key) \
+ vmovq key, t0; \
+ vpxor x, x, t3; \
+ \
+ vpxor ab, t0, x; \
+ \
+ /* \
+ * S-function with AES subbytes \
+ */ \
+ \
+ /* input rotation for sbox4 (<<< 1) */ \
+ vpand x, sbox4mask, t0; \
+ vpandn x, sbox4mask, x; \
+ vpaddw t0, t0, t1; \
+ vpsrlw $7, t0, t0; \
+ vpor t0, t1, t0; \
+ vpand sbox4mask, t0, t0; \
+ vpor t0, x, x; \
+ \
+ vmovdqa .Lpost_tf_lo_s1 rRIP, t0; \
+ vmovdqa .Lpost_tf_hi_s1 rRIP, t1; \
+ \
+ /* prefilter sboxes */ \
+ filter_8bit(x, pre_s1lo_mask, pre_s1hi_mask, _0f0f0f0fmask, t2); \
+ \
+ /* AES subbytes + AES shift rows + AES inv shift rows */ \
+ vaesenclast t3, x, x; \
+ \
+ /* postfilter sboxes */ \
+ filter_8bit(x, t0, t1, _0f0f0f0fmask, t2); \
+ \
+ /* output rotation for sbox2 (<<< 1) */ \
+ /* output rotation for sbox3 (>>> 1) */ \
+ vpshufb inv_shift_row, x, t1; \
+ vpshufb .Lsp0044440444044404mask rRIP, x, t4; \
+ vpshufb .Lsp1110111010011110mask rRIP, x, x; \
+ vpaddb t1, t1, t2; \
+ vpsrlw $7, t1, t0; \
+ vpsllw $7, t1, t3; \
+ vpor t0, t2, t0; \
+ vpsrlw $1, t1, t1; \
+ vpshufb .Lsp0222022222000222mask rRIP, t0, t0; \
+ vpor t1, t3, t1; \
+ \
+ vpxor x, t4, t4; \
+ vpshufb .Lsp3033303303303033mask rRIP, t1, t1; \
+ vpxor t4, t0, t0; \
+ vpxor t1, t0, t0; \
+ vpsrldq $8, t0, x; \
+ vpxor t0, x, x;
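+/*
+ * camellia_f evaluates one 64-bit Camellia F-function in XMM registers:
+ * the subkey is xored in, sbox4's input rotation is applied, the
+ * prefilter maps each byte into the AES S-box domain so a single
+ * AESENCLAST against a zeroed round key performs SubBytes (its ShiftRows
+ * is undone by the .Linv_shift_row_and_unpcklbw shuffle), the postfilter
+ * maps back, and the remaining shuffles/shifts apply the sbox2/sbox3
+ * output rotations and the P-function byte spread before folding the
+ * result down to 64 bits.
+ */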
+
+#define vec_rol128(in, out, nrol, t0) \
+ vpshufd $0x4e, in, out; \
+ vpsllq $(nrol), in, t0; \
+ vpsrlq $(64-(nrol)), out, out; \
+ vpaddd t0, out, out;
+
+#define vec_ror128(in, out, nror, t0) \
+ vpshufd $0x4e, in, out; \
+ vpsrlq $(nror), in, t0; \
+ vpsllq $(64-(nror)), out, out; \
+ vpaddd t0, out, out;
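+/*
+ * For 0 < n < 64 these build a full 128-bit rotate from 64-bit lane ops:
+ * the qword swap (vpshufd $0x4e) plus per-qword shifts produce two
+ * non-overlapping halves, so the closing vpaddd acts as an OR, e.g.
+ *   rol128(in, n): hi' = (hi << n) | (lo >> (64-n)),
+ *                  lo' = (lo << n) | (hi >> (64-n)).
+ */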
+
+
+.align 16
+.Linv_shift_row_and_unpcklbw:
+ .byte 0x00, 0xff, 0x0d, 0xff, 0x0a, 0xff, 0x07, 0xff
+ .byte 0x04, 0xff, 0x01, 0xff, 0x0e, 0xff, 0x0b, 0xff
+.Lsp0044440444044404mask:
+ .long 0xffff0404, 0x0404ff04;
+ .long 0x0d0dff0d, 0x0d0dff0d;
+.Lsp1110111010011110mask:
+ .long 0x000000ff, 0x000000ff;
+ .long 0x0bffff0b, 0x0b0b0bff;
+.Lsp0222022222000222mask:
+ .long 0xff060606, 0xff060606;
+ .long 0x0c0cffff, 0xff0c0c0c;
+.Lsp3033303303303033mask:
+ .long 0x04ff0404, 0x04ff0404;
+ .long 0xff0a0aff, 0x0aff0a0a;
+.Lsbox4_input_mask:
+ .byte 0x00, 0xff, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00;
+.Lsigma1:
+ .long 0x3BCC908B, 0xA09E667F;
+.Lsigma2:
+ .long 0x4CAA73B2, 0xB67AE858;
+.Lsigma3:
+ .long 0xE94F82BE, 0xC6EF372F;
+.Lsigma4:
+ .long 0xF1D36F1C, 0x54FF53A5;
+.Lsigma5:
+ .long 0xDE682D1D, 0x10E527FA;
+.Lsigma6:
+ .long 0xB3E6C1FD, 0xB05688C2;
+
+
+.align 8
+ELF(.type __camellia_avx_setup128,@function;)
+__camellia_avx_setup128:
+ /* input:
+ * %rdi: ctx, CTX; subkey storage at key_table(CTX)
+ * %xmm0: key
+ */
+ CFI_STARTPROC();
+
+#define cmll_sub(n, ctx) (key_table+((n)*8))(ctx)
+#define KL128 %xmm0
+#define KA128 %xmm2
+
+ vpshufb .Lbswap128_mask rRIP, KL128, KL128;
+
+ vmovdqa .Linv_shift_row_and_unpcklbw rRIP, %xmm11;
+ vmovq .Lsbox4_input_mask rRIP, %xmm12;
+ vbroadcastss .L0f0f0f0f rRIP, %xmm13;
+ vmovdqa .Lpre_tf_lo_s1 rRIP, %xmm14;
+ vmovdqa .Lpre_tf_hi_s1 rRIP, %xmm15;
+
+ /*
+ * Generate KA
+ */
+ vpsrldq $8, KL128, %xmm2;
+ vmovdqa KL128, %xmm3;
+ vpslldq $8, %xmm3, %xmm3;
+ vpsrldq $8, %xmm3, %xmm3;
+
+ camellia_f(%xmm2, %xmm4, %xmm1,
+ %xmm5, %xmm6, %xmm7, %xmm8,
+ %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP);
+ vpxor %xmm4, %xmm3, %xmm3;
+ camellia_f(%xmm3, %xmm2, %xmm1,
+ %xmm5, %xmm6, %xmm7, %xmm8,
+ %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP);
+ camellia_f(%xmm2, %xmm3, %xmm1,
+ %xmm5, %xmm6, %xmm7, %xmm8,
+ %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP);
+ vpxor %xmm4, %xmm3, %xmm3;
+ camellia_f(%xmm3, %xmm4, %xmm1,
+ %xmm5, %xmm6, %xmm7, %xmm8,
+ %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP);
+
+ vpslldq $8, %xmm3, %xmm3;
+ vpxor %xmm4, %xmm2, %xmm2;
+ vpsrldq $8, %xmm3, %xmm3;
+ vpslldq $8, %xmm2, KA128;
+ vpor %xmm3, KA128, KA128;
+
+ /*
+ * Generate subkeys
+ */
+ vmovdqu KA128, cmll_sub(24, CTX);
+ vec_rol128(KL128, %xmm3, 15, %xmm15);
+ vec_rol128(KA128, %xmm4, 15, %xmm15);
+ vec_rol128(KA128, %xmm5, 30, %xmm15);
+ vec_rol128(KL128, %xmm6, 45, %xmm15);
+ vec_rol128(KA128, %xmm7, 45, %xmm15);
+ vec_rol128(KL128, %xmm8, 60, %xmm15);
+ vec_rol128(KA128, %xmm9, 60, %xmm15);
+ vec_ror128(KL128, %xmm10, 128-77, %xmm15);
+
+ /* absorb kw2 to other subkeys */
+ vpslldq $8, KL128, %xmm15;
+ vpsrldq $8, %xmm15, %xmm15;
+ vpxor %xmm15, KA128, KA128;
+ vpxor %xmm15, %xmm3, %xmm3;
+ vpxor %xmm15, %xmm4, %xmm4;
+
+ /* subl(1) ^= subr(1) & ~subr(9); */
+ vpandn %xmm15, %xmm5, %xmm13;
+ vpslldq $12, %xmm13, %xmm13;
+ vpsrldq $8, %xmm13, %xmm13;
+ vpxor %xmm13, %xmm15, %xmm15;
+ /* dw = subl(1) & subl(9), subr(1) ^= CAMELLIA_RL1(dw); */
+ vpand %xmm15, %xmm5, %xmm14;
+ vpslld $1, %xmm14, %xmm11;
+ vpsrld $31, %xmm14, %xmm14;
+ vpaddd %xmm11, %xmm14, %xmm14;
+ vpslldq $8, %xmm14, %xmm14;
+ vpsrldq $12, %xmm14, %xmm14;
+ vpxor %xmm14, %xmm15, %xmm15;
+
+ vpxor %xmm15, %xmm6, %xmm6;
+ vpxor %xmm15, %xmm8, %xmm8;
+ vpxor %xmm15, %xmm9, %xmm9;
+
+ /* subl(1) ^= subr(1) & ~subr(17); */
+ vpandn %xmm15, %xmm10, %xmm13;
+ vpslldq $12, %xmm13, %xmm13;
+ vpsrldq $8, %xmm13, %xmm13;
+ vpxor %xmm13, %xmm15, %xmm15;
+ /* dw = subl(1) & subl(17), subr(1) ^= CAMELLIA_RL1(dw); */
+ vpand %xmm15, %xmm10, %xmm14;
+ vpslld $1, %xmm14, %xmm11;
+ vpsrld $31, %xmm14, %xmm14;
+ vpaddd %xmm11, %xmm14, %xmm14;
+ vpslldq $8, %xmm14, %xmm14;
+ vpsrldq $12, %xmm14, %xmm14;
+ vpxor %xmm14, %xmm15, %xmm15;
+
+ vpshufd $0x1b, KL128, KL128;
+ vpshufd $0x1b, KA128, KA128;
+ vpshufd $0x1b, %xmm3, %xmm3;
+ vpshufd $0x1b, %xmm4, %xmm4;
+ vpshufd $0x1b, %xmm5, %xmm5;
+ vpshufd $0x1b, %xmm6, %xmm6;
+ vpshufd $0x1b, %xmm7, %xmm7;
+ vpshufd $0x1b, %xmm8, %xmm8;
+ vpshufd $0x1b, %xmm9, %xmm9;
+ vpshufd $0x1b, %xmm10, %xmm10;
+
+ vmovdqu KL128, cmll_sub(0, CTX);
+ vpshufd $0x1b, KL128, KL128;
+ vmovdqu KA128, cmll_sub(2, CTX);
+ vmovdqu %xmm3, cmll_sub(4, CTX);
+ vmovdqu %xmm4, cmll_sub(6, CTX);
+ vmovdqu %xmm5, cmll_sub(8, CTX);
+ vmovdqu %xmm6, cmll_sub(10, CTX);
+ vpsrldq $8, %xmm8, %xmm8;
+ vmovq %xmm7, cmll_sub(12, CTX);
+ vmovq %xmm8, cmll_sub(13, CTX);
+ vmovdqu %xmm9, cmll_sub(14, CTX);
+ vmovdqu %xmm10, cmll_sub(16, CTX);
+
+ vmovdqu cmll_sub(24, CTX), KA128;
+
+ vec_ror128(KL128, %xmm3, 128 - 94, %xmm7);
+ vec_ror128(KA128, %xmm4, 128 - 94, %xmm7);
+ vec_ror128(KL128, %xmm5, 128 - 111, %xmm7);
+ vec_ror128(KA128, %xmm6, 128 - 111, %xmm7);
+
+ vpxor %xmm15, %xmm3, %xmm3;
+ vpxor %xmm15, %xmm4, %xmm4;
+ vpxor %xmm15, %xmm5, %xmm5;
+ vpslldq $8, %xmm15, %xmm15;
+ vpxor %xmm15, %xmm6, %xmm6;
+
+ /* absorb kw4 to other subkeys */
+ vpslldq $8, %xmm6, %xmm15;
+ vpxor %xmm15, %xmm5, %xmm5;
+ vpxor %xmm15, %xmm4, %xmm4;
+ vpxor %xmm15, %xmm3, %xmm3;
+
+ /* subl(25) ^= subr(25) & ~subr(16); */
+ vpshufd $0x1b, cmll_sub(16, CTX), %xmm10;
+ vpandn %xmm15, %xmm10, %xmm13;
+ vpslldq $4, %xmm13, %xmm13;
+ vpxor %xmm13, %xmm15, %xmm15;
+ /* dw = subl(25) & subl(16), subr(25) ^= CAMELLIA_RL1(dw); */
+ vpand %xmm15, %xmm10, %xmm14;
+ vpslld $1, %xmm14, %xmm11;
+ vpsrld $31, %xmm14, %xmm14;
+ vpaddd %xmm11, %xmm14, %xmm14;
+ vpsrldq $12, %xmm14, %xmm14;
+ vpslldq $8, %xmm14, %xmm14;
+ vpxor %xmm14, %xmm15, %xmm15;
+
+ vpshufd $0x1b, %xmm3, %xmm3;
+ vpshufd $0x1b, %xmm4, %xmm4;
+ vpshufd $0x1b, %xmm5, %xmm5;
+ vpshufd $0x1b, %xmm6, %xmm6;
+
+ vmovdqu %xmm3, cmll_sub(18, CTX);
+ vmovdqu %xmm4, cmll_sub(20, CTX);
+ vmovdqu %xmm5, cmll_sub(22, CTX);
+ vmovdqu %xmm6, cmll_sub(24, CTX);
+
+ vpshufd $0x1b, cmll_sub(14, CTX), %xmm3;
+ vpshufd $0x1b, cmll_sub(12, CTX), %xmm4;
+ vpshufd $0x1b, cmll_sub(10, CTX), %xmm5;
+ vpshufd $0x1b, cmll_sub(8, CTX), %xmm6;
+
+ vpxor %xmm15, %xmm3, %xmm3;
+ vpxor %xmm15, %xmm4, %xmm4;
+ vpxor %xmm15, %xmm5, %xmm5;
+
+ /* subl(25) ^= subr(25) & ~subr(8); */
+ vpandn %xmm15, %xmm6, %xmm13;
+ vpslldq $4, %xmm13, %xmm13;
+ vpxor %xmm13, %xmm15, %xmm15;
+ /* dw = subl(25) & subl(8), subr(25) ^= CAMELLIA_RL1(dw); */
+ vpand %xmm15, %xmm6, %xmm14;
+ vpslld $1, %xmm14, %xmm11;
+ vpsrld $31, %xmm14, %xmm14;
+ vpaddd %xmm11, %xmm14, %xmm14;
+ vpsrldq $12, %xmm14, %xmm14;
+ vpslldq $8, %xmm14, %xmm14;
+ vpxor %xmm14, %xmm15, %xmm15;
+
+ vpshufd $0x1b, %xmm3, %xmm3;
+ vpshufd $0x1b, %xmm4, %xmm4;
+ vpshufd $0x1b, %xmm5, %xmm5;
+
+ vmovdqu %xmm3, cmll_sub(14, CTX);
+ vmovdqu %xmm4, cmll_sub(12, CTX);
+ vmovdqu %xmm5, cmll_sub(10, CTX);
+
+ vpshufd $0x1b, cmll_sub(6, CTX), %xmm6;
+ vpshufd $0x1b, cmll_sub(4, CTX), %xmm4;
+ vpshufd $0x1b, cmll_sub(2, CTX), %xmm2;
+ vpshufd $0x1b, cmll_sub(0, CTX), %xmm0;
+
+ vpxor %xmm15, %xmm6, %xmm6;
+ vpxor %xmm15, %xmm4, %xmm4;
+ vpxor %xmm15, %xmm2, %xmm2;
+ vpxor %xmm15, %xmm0, %xmm0;
+
+ vpshufd $0x1b, %xmm6, %xmm6;
+ vpshufd $0x1b, %xmm4, %xmm4;
+ vpshufd $0x1b, %xmm2, %xmm2;
+ vpshufd $0x1b, %xmm0, %xmm0;
+
+ vpsrldq $8, %xmm2, %xmm3;
+ vpsrldq $8, %xmm4, %xmm5;
+ vpsrldq $8, %xmm6, %xmm7;
+
+ /*
+ * key XOR is end of F-function.
+ */
+ vpxor %xmm2, %xmm0, %xmm0;
+ vpxor %xmm4, %xmm2, %xmm2;
+
+ vmovq %xmm0, cmll_sub(0, CTX);
+ vmovq %xmm3, cmll_sub(2, CTX);
+ vpxor %xmm5, %xmm3, %xmm3;
+ vpxor %xmm6, %xmm4, %xmm4;
+ vpxor %xmm7, %xmm5, %xmm5;
+ vmovq %xmm2, cmll_sub(3, CTX);
+ vmovq %xmm3, cmll_sub(4, CTX);
+ vmovq %xmm4, cmll_sub(5, CTX);
+ vmovq %xmm5, cmll_sub(6, CTX);
+
+ vmovq cmll_sub(7, CTX), %xmm7;
+ vmovq cmll_sub(8, CTX), %xmm8;
+ vmovq cmll_sub(9, CTX), %xmm9;
+ vmovq cmll_sub(10, CTX), %xmm10;
+ /* tl = subl(10) ^ (subr(10) & ~subr(8)); */
+ vpandn %xmm10, %xmm8, %xmm15;
+ vpsrldq $4, %xmm15, %xmm15;
+ vpxor %xmm15, %xmm10, %xmm0;
+ /* dw = tl & subl(8), tr = subr(10) ^ CAMELLIA_RL1(dw); */
+ vpand %xmm8, %xmm0, %xmm15;
+ vpslld $1, %xmm15, %xmm14;
+ vpsrld $31, %xmm15, %xmm15;
+ vpaddd %xmm14, %xmm15, %xmm15;
+ vpslldq $12, %xmm15, %xmm15;
+ vpsrldq $8, %xmm15, %xmm15;
+ vpxor %xmm15, %xmm0, %xmm0;
+
+ vpxor %xmm0, %xmm6, %xmm6;
+ vmovq %xmm6, cmll_sub(7, CTX);
+
+ vmovq cmll_sub(11, CTX), %xmm11;
+ vmovq cmll_sub(12, CTX), %xmm12;
+ vmovq cmll_sub(13, CTX), %xmm13;
+ vmovq cmll_sub(14, CTX), %xmm14;
+ vmovq cmll_sub(15, CTX), %xmm15;
+ /* tl = subl(7) ^ (subr(7) & ~subr(9)); */
+ vpandn %xmm7, %xmm9, %xmm1;
+ vpsrldq $4, %xmm1, %xmm1;
+ vpxor %xmm1, %xmm7, %xmm0;
+ /* dw = tl & subl(9), tr = subr(7) ^ CAMELLIA_RL1(dw); */
+ vpand %xmm9, %xmm0, %xmm1;
+ vpslld $1, %xmm1, %xmm2;
+ vpsrld $31, %xmm1, %xmm1;
+ vpaddd %xmm2, %xmm1, %xmm1;
+ vpslldq $12, %xmm1, %xmm1;
+ vpsrldq $8, %xmm1, %xmm1;
+ vpxor %xmm1, %xmm0, %xmm0;
+
+ vpxor %xmm11, %xmm0, %xmm0;
+ vpxor %xmm12, %xmm10, %xmm10;
+ vpxor %xmm13, %xmm11, %xmm11;
+ vpxor %xmm14, %xmm12, %xmm12;
+ vpxor %xmm15, %xmm13, %xmm13;
+ vmovq %xmm0, cmll_sub(10, CTX);
+ vmovq %xmm10, cmll_sub(11, CTX);
+ vmovq %xmm11, cmll_sub(12, CTX);
+ vmovq %xmm12, cmll_sub(13, CTX);
+ vmovq %xmm13, cmll_sub(14, CTX);
+
+ vmovq cmll_sub(16, CTX), %xmm6;
+ vmovq cmll_sub(17, CTX), %xmm7;
+ vmovq cmll_sub(18, CTX), %xmm8;
+ vmovq cmll_sub(19, CTX), %xmm9;
+ vmovq cmll_sub(20, CTX), %xmm10;
+ /* tl = subl(18) ^ (subr(18) & ~subr(16)); */
+ vpandn %xmm8, %xmm6, %xmm1;
+ vpsrldq $4, %xmm1, %xmm1;
+ vpxor %xmm1, %xmm8, %xmm0;
+ /* dw = tl & subl(16), tr = subr(18) ^ CAMELLIA_RL1(dw); */
+ vpand %xmm6, %xmm0, %xmm1;
+ vpslld $1, %xmm1, %xmm2;
+ vpsrld $31, %xmm1, %xmm1;
+ vpaddd %xmm2, %xmm1, %xmm1;
+ vpslldq $12, %xmm1, %xmm1;
+ vpsrldq $8, %xmm1, %xmm1;
+ vpxor %xmm1, %xmm0, %xmm0;
+
+ vpxor %xmm14, %xmm0, %xmm0;
+ vmovq %xmm0, cmll_sub(15, CTX);
+
+ /* tl = subl(15) ^ (subr(15) & ~subr(17)); */
+ vpandn %xmm15, %xmm7, %xmm1;
+ vpsrldq $4, %xmm1, %xmm1;
+ vpxor %xmm1, %xmm15, %xmm0;
+ /* dw = tl & subl(17), tr = subr(15) ^ CAMELLIA_RL1(dw); */
+ vpand %xmm7, %xmm0, %xmm1;
+ vpslld $1, %xmm1, %xmm2;
+ vpsrld $31, %xmm1, %xmm1;
+ vpaddd %xmm2, %xmm1, %xmm1;
+ vpslldq $12, %xmm1, %xmm1;
+ vpsrldq $8, %xmm1, %xmm1;
+ vpxor %xmm1, %xmm0, %xmm0;
+
+ vmovq cmll_sub(21, CTX), %xmm1;
+ vmovq cmll_sub(22, CTX), %xmm2;
+ vmovq cmll_sub(23, CTX), %xmm3;
+ vmovq cmll_sub(24, CTX), %xmm4;
+
+ vpxor %xmm9, %xmm0, %xmm0;
+ vpxor %xmm10, %xmm8, %xmm8;
+ vpxor %xmm1, %xmm9, %xmm9;
+ vpxor %xmm2, %xmm10, %xmm10;
+ vpxor %xmm3, %xmm1, %xmm1;
+ vpxor %xmm4, %xmm3, %xmm3;
+
+ vmovq %xmm0, cmll_sub(18, CTX);
+ vmovq %xmm8, cmll_sub(19, CTX);
+ vmovq %xmm9, cmll_sub(20, CTX);
+ vmovq %xmm10, cmll_sub(21, CTX);
+ vmovq %xmm1, cmll_sub(22, CTX);
+ vmovq %xmm2, cmll_sub(23, CTX);
+ vmovq %xmm3, cmll_sub(24, CTX);
+
+ /* kw2 and kw4 are unused now. */
+ movq $0, cmll_sub(1, CTX);
+ movq $0, cmll_sub(25, CTX);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size __camellia_avx_setup128,.-__camellia_avx_setup128;)
+
+.align 8
+ELF(.type __camellia_avx_setup256,@function;)
+
+__camellia_avx_setup256:
+ /* input:
+ * %rdi: ctx, CTX; subkey storage at key_table(CTX)
+ * %xmm0 & %xmm1: key
+ */
+ CFI_STARTPROC();
+
+#define KL128 %xmm0
+#define KR128 %xmm1
+#define KA128 %xmm2
+#define KB128 %xmm3
+
+ vpshufb .Lbswap128_mask rRIP, KL128, KL128;
+ vpshufb .Lbswap128_mask rRIP, KR128, KR128;
+
+ vmovdqa .Linv_shift_row_and_unpcklbw rRIP, %xmm11;
+ vmovq .Lsbox4_input_mask rRIP, %xmm12;
+ vbroadcastss .L0f0f0f0f rRIP, %xmm13;
+ vmovdqa .Lpre_tf_lo_s1 rRIP, %xmm14;
+ vmovdqa .Lpre_tf_hi_s1 rRIP, %xmm15;
+
+ /*
+ * Generate KA
+ */
+ vpxor KL128, KR128, %xmm3;
+ vpsrldq $8, KR128, %xmm6;
+ vpsrldq $8, %xmm3, %xmm2;
+ vpslldq $8, %xmm3, %xmm3;
+ vpsrldq $8, %xmm3, %xmm3;
+
+ camellia_f(%xmm2, %xmm4, %xmm5,
+ %xmm7, %xmm8, %xmm9, %xmm10,
+ %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP);
+ vpxor %xmm4, %xmm3, %xmm3;
+ camellia_f(%xmm3, %xmm2, %xmm5,
+ %xmm7, %xmm8, %xmm9, %xmm10,
+ %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP);
+ vpxor %xmm6, %xmm2, %xmm2;
+ camellia_f(%xmm2, %xmm3, %xmm5,
+ %xmm7, %xmm8, %xmm9, %xmm10,
+ %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP);
+ vpxor %xmm4, %xmm3, %xmm3;
+ vpxor KR128, %xmm3, %xmm3;
+ camellia_f(%xmm3, %xmm4, %xmm5,
+ %xmm7, %xmm8, %xmm9, %xmm10,
+ %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP);
+
+ vpslldq $8, %xmm3, %xmm3;
+ vpxor %xmm4, %xmm2, %xmm2;
+ vpsrldq $8, %xmm3, %xmm3;
+ vpslldq $8, %xmm2, KA128;
+ vpor %xmm3, KA128, KA128;
+
+ /*
+ * Generate KB
+ */
+ vpxor KA128, KR128, %xmm3;
+ vpsrldq $8, %xmm3, %xmm4;
+ vpslldq $8, %xmm3, %xmm3;
+ vpsrldq $8, %xmm3, %xmm3;
+
+ camellia_f(%xmm4, %xmm5, %xmm6,
+ %xmm7, %xmm8, %xmm9, %xmm10,
+ %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma5 rRIP);
+ vpxor %xmm5, %xmm3, %xmm3;
+
+ camellia_f(%xmm3, %xmm5, %xmm6,
+ %xmm7, %xmm8, %xmm9, %xmm10,
+ %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma6 rRIP);
+ vpslldq $8, %xmm3, %xmm3;
+ vpxor %xmm5, %xmm4, %xmm4;
+ vpsrldq $8, %xmm3, %xmm3;
+ vpslldq $8, %xmm4, %xmm4;
+ vpor %xmm3, %xmm4, KB128;
+
+ /*
+ * Generate subkeys
+ */
+ vmovdqu KB128, cmll_sub(32, CTX);
+ vec_rol128(KR128, %xmm4, 15, %xmm15);
+ vec_rol128(KA128, %xmm5, 15, %xmm15);
+ vec_rol128(KR128, %xmm6, 30, %xmm15);
+ vec_rol128(KB128, %xmm7, 30, %xmm15);
+ vec_rol128(KL128, %xmm8, 45, %xmm15);
+ vec_rol128(KA128, %xmm9, 45, %xmm15);
+ vec_rol128(KL128, %xmm10, 60, %xmm15);
+ vec_rol128(KR128, %xmm11, 60, %xmm15);
+ vec_rol128(KB128, %xmm12, 60, %xmm15);
+
+ /* absorb kw2 to other subkeys */
+ vpslldq $8, KL128, %xmm15;
+ vpsrldq $8, %xmm15, %xmm15;
+ vpxor %xmm15, KB128, KB128;
+ vpxor %xmm15, %xmm4, %xmm4;
+ vpxor %xmm15, %xmm5, %xmm5;
+
+ /* subl(1) ^= subr(1) & ~subr(9); */
+ vpandn %xmm15, %xmm6, %xmm13;
+ vpslldq $12, %xmm13, %xmm13;
+ vpsrldq $8, %xmm13, %xmm13;
+ vpxor %xmm13, %xmm15, %xmm15;
+ /* dw = subl(1) & subl(9), subr(1) ^= CAMELLIA_RL1(dw); */
+ vpand %xmm15, %xmm6, %xmm14;
+ vpslld $1, %xmm14, %xmm13;
+ vpsrld $31, %xmm14, %xmm14;
+ vpaddd %xmm13, %xmm14, %xmm14;
+ vpslldq $8, %xmm14, %xmm14;
+ vpsrldq $12, %xmm14, %xmm14;
+ vpxor %xmm14, %xmm15, %xmm15;
+
+ vpxor %xmm15, %xmm7, %xmm7;
+ vpxor %xmm15, %xmm8, %xmm8;
+ vpxor %xmm15, %xmm9, %xmm9;
+
+ vpshufd $0x1b, KL128, KL128;
+ vpshufd $0x1b, KB128, KB128;
+ vpshufd $0x1b, %xmm4, %xmm4;
+ vpshufd $0x1b, %xmm5, %xmm5;
+ vpshufd $0x1b, %xmm6, %xmm6;
+ vpshufd $0x1b, %xmm7, %xmm7;
+ vpshufd $0x1b, %xmm8, %xmm8;
+ vpshufd $0x1b, %xmm9, %xmm9;
+
+ vmovdqu KL128, cmll_sub(0, CTX);
+ vpshufd $0x1b, KL128, KL128;
+ vmovdqu KB128, cmll_sub(2, CTX);
+ vmovdqu %xmm4, cmll_sub(4, CTX);
+ vmovdqu %xmm5, cmll_sub(6, CTX);
+ vmovdqu %xmm6, cmll_sub(8, CTX);
+ vmovdqu %xmm7, cmll_sub(10, CTX);
+ vmovdqu %xmm8, cmll_sub(12, CTX);
+ vmovdqu %xmm9, cmll_sub(14, CTX);
+
+ vmovdqu cmll_sub(32, CTX), KB128;
+
+ /* subl(1) ^= subr(1) & ~subr(17); */
+ vpandn %xmm15, %xmm10, %xmm13;
+ vpslldq $12, %xmm13, %xmm13;
+ vpsrldq $8, %xmm13, %xmm13;
+ vpxor %xmm13, %xmm15, %xmm15;
+ /* dw = subl(1) & subl(17), subr(1) ^= CAMELLIA_RL1(dw); */
+ vpand %xmm15, %xmm10, %xmm14;
+ vpslld $1, %xmm14, %xmm13;
+ vpsrld $31, %xmm14, %xmm14;
+ vpaddd %xmm13, %xmm14, %xmm14;
+ vpslldq $8, %xmm14, %xmm14;
+ vpsrldq $12, %xmm14, %xmm14;
+ vpxor %xmm14, %xmm15, %xmm15;
+
+ vpxor %xmm15, %xmm11, %xmm11;
+ vpxor %xmm15, %xmm12, %xmm12;
+
+ vec_ror128(KL128, %xmm4, 128-77, %xmm14);
+ vec_ror128(KA128, %xmm5, 128-77, %xmm14);
+ vec_ror128(KR128, %xmm6, 128-94, %xmm14);
+ vec_ror128(KA128, %xmm7, 128-94, %xmm14);
+ vec_ror128(KL128, %xmm8, 128-111, %xmm14);
+ vec_ror128(KB128, %xmm9, 128-111, %xmm14);
+
+ vpxor %xmm15, %xmm4, %xmm4;
+
+ vpshufd $0x1b, %xmm10, %xmm10;
+ vpshufd $0x1b, %xmm11, %xmm11;
+ vpshufd $0x1b, %xmm12, %xmm12;
+ vpshufd $0x1b, %xmm4, %xmm4;
+
+ vmovdqu %xmm10, cmll_sub(16, CTX);
+ vmovdqu %xmm11, cmll_sub(18, CTX);
+ vmovdqu %xmm12, cmll_sub(20, CTX);
+ vmovdqu %xmm4, cmll_sub(22, CTX);
+
+ /* subl(1) ^= subr(1) & ~subr(25); */
+ vpandn %xmm15, %xmm5, %xmm13;
+ vpslldq $12, %xmm13, %xmm13;
+ vpsrldq $8, %xmm13, %xmm13;
+ vpxor %xmm13, %xmm15, %xmm15;
+ /* dw = subl(1) & subl(25), subr(1) ^= CAMELLIA_RL1(dw); */
+ vpand %xmm15, %xmm5, %xmm14;
+ vpslld $1, %xmm14, %xmm13;
+ vpsrld $31, %xmm14, %xmm14;
+ vpaddd %xmm13, %xmm14, %xmm14;
+ vpslldq $8, %xmm14, %xmm14;
+ vpsrldq $12, %xmm14, %xmm14;
+ vpxor %xmm14, %xmm15, %xmm15;
+
+ vpxor %xmm15, %xmm6, %xmm6;
+ vpxor %xmm15, %xmm7, %xmm7;
+ vpxor %xmm15, %xmm8, %xmm8;
+ vpslldq $8, %xmm15, %xmm15;
+ vpxor %xmm15, %xmm9, %xmm9;
+
+ /* absorb kw4 to other subkeys */
+ vpslldq $8, %xmm9, %xmm15;
+ vpxor %xmm15, %xmm8, %xmm8;
+ vpxor %xmm15, %xmm7, %xmm7;
+ vpxor %xmm15, %xmm6, %xmm6;
+
+ /* subl(33) ^= subr(33) & ~subr(24); */
+ vpandn %xmm15, %xmm5, %xmm14;
+ vpslldq $4, %xmm14, %xmm14;
+ vpxor %xmm14, %xmm15, %xmm15;
+ /* dw = subl(33) & subl(24), subr(33) ^= CAMELLIA_RL1(dw); */
+ vpand %xmm15, %xmm5, %xmm14;
+ vpslld $1, %xmm14, %xmm13;
+ vpsrld $31, %xmm14, %xmm14;
+ vpaddd %xmm13, %xmm14, %xmm14;
+ vpsrldq $12, %xmm14, %xmm14;
+ vpslldq $8, %xmm14, %xmm14;
+ vpxor %xmm14, %xmm15, %xmm15;
+
+ vpshufd $0x1b, %xmm5, %xmm5;
+ vpshufd $0x1b, %xmm6, %xmm6;
+ vpshufd $0x1b, %xmm7, %xmm7;
+ vpshufd $0x1b, %xmm8, %xmm8;
+ vpshufd $0x1b, %xmm9, %xmm9;
+
+ vmovdqu %xmm5, cmll_sub(24, CTX);
+ vmovdqu %xmm6, cmll_sub(26, CTX);
+ vmovdqu %xmm7, cmll_sub(28, CTX);
+ vmovdqu %xmm8, cmll_sub(30, CTX);
+ vmovdqu %xmm9, cmll_sub(32, CTX);
+
+ vpshufd $0x1b, cmll_sub(22, CTX), %xmm0;
+ vpshufd $0x1b, cmll_sub(20, CTX), %xmm1;
+ vpshufd $0x1b, cmll_sub(18, CTX), %xmm2;
+ vpshufd $0x1b, cmll_sub(16, CTX), %xmm3;
+ vpshufd $0x1b, cmll_sub(14, CTX), %xmm4;
+ vpshufd $0x1b, cmll_sub(12, CTX), %xmm5;
+ vpshufd $0x1b, cmll_sub(10, CTX), %xmm6;
+ vpshufd $0x1b, cmll_sub(8, CTX), %xmm7;
+
+ vpxor %xmm15, %xmm0, %xmm0;
+ vpxor %xmm15, %xmm1, %xmm1;
+ vpxor %xmm15, %xmm2, %xmm2;
+
+ /* subl(33) ^= subr(33) & ~subr(16); */
+ vpandn %xmm15, %xmm3, %xmm14;
+ vpslldq $4, %xmm14, %xmm14;
+ vpxor %xmm14, %xmm15, %xmm15;
+ /* dw = subl(33) & subl(16), subr(33) ^= CAMELLIA_RL1(dw); */
+ vpand %xmm15, %xmm3, %xmm14;
+ vpslld $1, %xmm14, %xmm13;
+ vpsrld $31, %xmm14, %xmm14;
+ vpaddd %xmm13, %xmm14, %xmm14;
+ vpsrldq $12, %xmm14, %xmm14;
+ vpslldq $8, %xmm14, %xmm14;
+ vpxor %xmm14, %xmm15, %xmm15;
+
+ vpxor %xmm15, %xmm4, %xmm4;
+ vpxor %xmm15, %xmm5, %xmm5;
+ vpxor %xmm15, %xmm6, %xmm6;
+
+ vpshufd $0x1b, %xmm0, %xmm0;
+ vpshufd $0x1b, %xmm1, %xmm1;
+ vpshufd $0x1b, %xmm2, %xmm2;
+ vpshufd $0x1b, %xmm4, %xmm4;
+ vpshufd $0x1b, %xmm5, %xmm5;
+ vpshufd $0x1b, %xmm6, %xmm6;
+
+ vmovdqu %xmm0, cmll_sub(22, CTX);
+ vmovdqu %xmm1, cmll_sub(20, CTX);
+ vmovdqu %xmm2, cmll_sub(18, CTX);
+ vmovdqu %xmm4, cmll_sub(14, CTX);
+ vmovdqu %xmm5, cmll_sub(12, CTX);
+ vmovdqu %xmm6, cmll_sub(10, CTX);
+
+ vpshufd $0x1b, cmll_sub(6, CTX), %xmm6;
+ vpshufd $0x1b, cmll_sub(4, CTX), %xmm4;
+ vpshufd $0x1b, cmll_sub(2, CTX), %xmm2;
+ vpshufd $0x1b, cmll_sub(0, CTX), %xmm0;
+
+ /* subl(33) ^= subr(33) & ~subr(8); */
+ vpandn %xmm15, %xmm7, %xmm14;
+ vpslldq $4, %xmm14, %xmm14;
+ vpxor %xmm14, %xmm15, %xmm15;
+ /* dw = subl(33) & subl(8), subr(33) ^= CAMELLIA_RL1(dw); */
+ vpand %xmm15, %xmm7, %xmm14;
+ vpslld $1, %xmm14, %xmm13;
+ vpsrld $31, %xmm14, %xmm14;
+ vpaddd %xmm13, %xmm14, %xmm14;
+ vpsrldq $12, %xmm14, %xmm14;
+ vpslldq $8, %xmm14, %xmm14;
+ vpxor %xmm14, %xmm15, %xmm15;
+
+ vpxor %xmm15, %xmm6, %xmm6;
+ vpxor %xmm15, %xmm4, %xmm4;
+ vpxor %xmm15, %xmm2, %xmm2;
+ vpxor %xmm15, %xmm0, %xmm0;
+
+ vpshufd $0x1b, %xmm6, %xmm6;
+ vpshufd $0x1b, %xmm4, %xmm4;
+ vpshufd $0x1b, %xmm2, %xmm2;
+ vpshufd $0x1b, %xmm0, %xmm0;
+
+ vpsrldq $8, %xmm2, %xmm3;
+ vpsrldq $8, %xmm4, %xmm5;
+ vpsrldq $8, %xmm6, %xmm7;
+
+ /*
+ * key XOR is end of F-function.
+ */
+ vpxor %xmm2, %xmm0, %xmm0;
+ vpxor %xmm4, %xmm2, %xmm2;
+
+ vmovq %xmm0, cmll_sub(0, CTX);
+ vmovq %xmm3, cmll_sub(2, CTX);
+ vpxor %xmm5, %xmm3, %xmm3;
+ vpxor %xmm6, %xmm4, %xmm4;
+ vpxor %xmm7, %xmm5, %xmm5;
+ vmovq %xmm2, cmll_sub(3, CTX);
+ vmovq %xmm3, cmll_sub(4, CTX);
+ vmovq %xmm4, cmll_sub(5, CTX);
+ vmovq %xmm5, cmll_sub(6, CTX);
+
+ vmovq cmll_sub(7, CTX), %xmm7;
+ vmovq cmll_sub(8, CTX), %xmm8;
+ vmovq cmll_sub(9, CTX), %xmm9;
+ vmovq cmll_sub(10, CTX), %xmm10;
+ /* tl = subl(10) ^ (subr(10) & ~subr(8)); */
+ vpandn %xmm10, %xmm8, %xmm15;
+ vpsrldq $4, %xmm15, %xmm15;
+ vpxor %xmm15, %xmm10, %xmm0;
+ /* dw = tl & subl(8), tr = subr(10) ^ CAMELLIA_RL1(dw); */
+ vpand %xmm8, %xmm0, %xmm15;
+ vpslld $1, %xmm15, %xmm14;
+ vpsrld $31, %xmm15, %xmm15;
+ vpaddd %xmm14, %xmm15, %xmm15;
+ vpslldq $12, %xmm15, %xmm15;
+ vpsrldq $8, %xmm15, %xmm15;
+ vpxor %xmm15, %xmm0, %xmm0;
+
+ vpxor %xmm0, %xmm6, %xmm6;
+ vmovq %xmm6, cmll_sub(7, CTX);
+
+ vmovq cmll_sub(11, CTX), %xmm11;
+ vmovq cmll_sub(12, CTX), %xmm12;
+ vmovq cmll_sub(13, CTX), %xmm13;
+ vmovq cmll_sub(14, CTX), %xmm14;
+ vmovq cmll_sub(15, CTX), %xmm15;
+ /* tl = subl(7) ^ (subr(7) & ~subr(9)); */
+ vpandn %xmm7, %xmm9, %xmm1;
+ vpsrldq $4, %xmm1, %xmm1;
+ vpxor %xmm1, %xmm7, %xmm0;
+ /* dw = tl & subl(9), tr = subr(7) ^ CAMELLIA_RL1(dw); */
+ vpand %xmm9, %xmm0, %xmm1;
+ vpslld $1, %xmm1, %xmm2;
+ vpsrld $31, %xmm1, %xmm1;
+ vpaddd %xmm2, %xmm1, %xmm1;
+ vpslldq $12, %xmm1, %xmm1;
+ vpsrldq $8, %xmm1, %xmm1;
+ vpxor %xmm1, %xmm0, %xmm0;
+
+ vpxor %xmm11, %xmm0, %xmm0;
+ vpxor %xmm12, %xmm10, %xmm10;
+ vpxor %xmm13, %xmm11, %xmm11;
+ vpxor %xmm14, %xmm12, %xmm12;
+ vpxor %xmm15, %xmm13, %xmm13;
+ vmovq %xmm0, cmll_sub(10, CTX);
+ vmovq %xmm10, cmll_sub(11, CTX);
+ vmovq %xmm11, cmll_sub(12, CTX);
+ vmovq %xmm12, cmll_sub(13, CTX);
+ vmovq %xmm13, cmll_sub(14, CTX);
+
+ vmovq cmll_sub(16, CTX), %xmm6;
+ vmovq cmll_sub(17, CTX), %xmm7;
+ vmovq cmll_sub(18, CTX), %xmm8;
+ vmovq cmll_sub(19, CTX), %xmm9;
+ vmovq cmll_sub(20, CTX), %xmm10;
+ /* tl = subl(18) ^ (subr(18) & ~subr(16)); */
+ vpandn %xmm8, %xmm6, %xmm1;
+ vpsrldq $4, %xmm1, %xmm1;
+ vpxor %xmm1, %xmm8, %xmm0;
+ /* dw = tl & subl(16), tr = subr(18) ^ CAMELLIA_RL1(dw); */
+ vpand %xmm6, %xmm0, %xmm1;
+ vpslld $1, %xmm1, %xmm2;
+ vpsrld $31, %xmm1, %xmm1;
+ vpaddd %xmm2, %xmm1, %xmm1;
+ vpslldq $12, %xmm1, %xmm1;
+ vpsrldq $8, %xmm1, %xmm1;
+ vpxor %xmm1, %xmm0, %xmm0;
+
+ vpxor %xmm14, %xmm0, %xmm0;
+ vmovq %xmm0, cmll_sub(15, CTX);
+
+ /* tl = subl(15) ^ (subr(15) & ~subr(17)); */
+ vpandn %xmm15, %xmm7, %xmm1;
+ vpsrldq $4, %xmm1, %xmm1;
+ vpxor %xmm1, %xmm15, %xmm0;
+ /* dw = tl & subl(17), tr = subr(15) ^ CAMELLIA_RL1(dw); */
+ vpand %xmm7, %xmm0, %xmm1;
+ vpslld $1, %xmm1, %xmm2;
+ vpsrld $31, %xmm1, %xmm1;
+ vpaddd %xmm2, %xmm1, %xmm1;
+ vpslldq $12, %xmm1, %xmm1;
+ vpsrldq $8, %xmm1, %xmm1;
+ vpxor %xmm1, %xmm0, %xmm0;
+
+ vmovq cmll_sub(21, CTX), %xmm1;
+ vmovq cmll_sub(22, CTX), %xmm2;
+ vmovq cmll_sub(23, CTX), %xmm3;
+ vmovq cmll_sub(24, CTX), %xmm4;
+
+ vpxor %xmm9, %xmm0, %xmm0;
+ vpxor %xmm10, %xmm8, %xmm8;
+ vpxor %xmm1, %xmm9, %xmm9;
+ vpxor %xmm2, %xmm10, %xmm10;
+ vpxor %xmm3, %xmm1, %xmm1;
+
+ vmovq %xmm0, cmll_sub(18, CTX);
+ vmovq %xmm8, cmll_sub(19, CTX);
+ vmovq %xmm9, cmll_sub(20, CTX);
+ vmovq %xmm10, cmll_sub(21, CTX);
+ vmovq %xmm1, cmll_sub(22, CTX);
+
+ vmovq cmll_sub(25, CTX), %xmm5;
+ vmovq cmll_sub(26, CTX), %xmm6;
+ vmovq cmll_sub(27, CTX), %xmm7;
+ vmovq cmll_sub(28, CTX), %xmm8;
+ vmovq cmll_sub(29, CTX), %xmm9;
+ vmovq cmll_sub(30, CTX), %xmm10;
+ vmovq cmll_sub(31, CTX), %xmm11;
+ vmovq cmll_sub(32, CTX), %xmm12;
+
+ /* tl = subl(26) ^ (subr(26) & ~subr(24)); */
+ vpandn %xmm6, %xmm4, %xmm15;
+ vpsrldq $4, %xmm15, %xmm15;
+ vpxor %xmm15, %xmm6, %xmm0;
+ /* dw = tl & subl(24), tr = subr(26) ^ CAMELLIA_RL1(dw); */
+ vpand %xmm4, %xmm0, %xmm15;
+ vpslld $1, %xmm15, %xmm14;
+ vpsrld $31, %xmm15, %xmm15;
+ vpaddd %xmm14, %xmm15, %xmm15;
+ vpslldq $12, %xmm15, %xmm15;
+ vpsrldq $8, %xmm15, %xmm15;
+ vpxor %xmm15, %xmm0, %xmm0;
+
+ vpxor %xmm0, %xmm2, %xmm2;
+ vmovq %xmm2, cmll_sub(23, CTX);
+
+ /* tl = subl(23) ^ (subr(23) & ~subr(25)); */
+ vpandn %xmm3, %xmm5, %xmm15;
+ vpsrldq $4, %xmm15, %xmm15;
+ vpxor %xmm15, %xmm3, %xmm0;
+ /* dw = tl & subl(25), tr = subr(23) ^ CAMELLIA_RL1(dw); */
+ vpand %xmm5, %xmm0, %xmm15;
+ vpslld $1, %xmm15, %xmm14;
+ vpsrld $31, %xmm15, %xmm15;
+ vpaddd %xmm14, %xmm15, %xmm15;
+ vpslldq $12, %xmm15, %xmm15;
+ vpsrldq $8, %xmm15, %xmm15;
+ vpxor %xmm15, %xmm0, %xmm0;
+
+ vpxor %xmm7, %xmm0, %xmm0;
+ vpxor %xmm8, %xmm6, %xmm6;
+ vpxor %xmm9, %xmm7, %xmm7;
+ vpxor %xmm10, %xmm8, %xmm8;
+ vpxor %xmm11, %xmm9, %xmm9;
+ vpxor %xmm12, %xmm11, %xmm11;
+
+ vmovq %xmm0, cmll_sub(26, CTX);
+ vmovq %xmm6, cmll_sub(27, CTX);
+ vmovq %xmm7, cmll_sub(28, CTX);
+ vmovq %xmm8, cmll_sub(29, CTX);
+ vmovq %xmm9, cmll_sub(30, CTX);
+ vmovq %xmm10, cmll_sub(31, CTX);
+ vmovq %xmm11, cmll_sub(32, CTX);
+
+ /* kw2 and kw4 are unused now. */
+ movq $0, cmll_sub(1, CTX);
+ movq $0, cmll_sub(33, CTX);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size __camellia_avx_setup256,.-__camellia_avx_setup256;)
+
+.align 8
+.globl _gcry_camellia_aesni_avx_keygen
+ELF(.type _gcry_camellia_aesni_avx_keygen,@function;)
+
+_gcry_camellia_aesni_avx_keygen:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: key
+ * %rdx: keylen
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ vmovdqu (%rsi), %xmm0;
+ cmpl $24, %edx;
+ jb __camellia_avx_setup128;
+ je .Lprepare_key192;
+
+ vmovdqu 16(%rsi), %xmm1;
+ jmp __camellia_avx_setup256;
+
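+ /* 192-bit keys: per the Camellia specification the missing 64 bits of
+ * KR are the complement of the supplied ones (KR = kr || ~kr), after
+ * which the 256-bit setup path is reused unchanged. */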
+.Lprepare_key192:
+ vpcmpeqd %xmm2, %xmm2, %xmm2;
+ vmovq 16(%rsi), %xmm1;
+
+ vpxor %xmm1, %xmm2, %xmm2;
+ vpslldq $8, %xmm2, %xmm2;
+ vpor %xmm2, %xmm1, %xmm1;
+
+ jmp __camellia_avx_setup256;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx_keygen,.-_gcry_camellia_aesni_avx_keygen;)
+
+#endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/
+#endif /*__x86_64*/
diff --git a/comm/third_party/libgcrypt/cipher/camellia-aesni-avx2-amd64.S b/comm/third_party/libgcrypt/cipher/camellia-aesni-avx2-amd64.S
new file mode 100644
index 0000000000..f620f04036
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/camellia-aesni-avx2-amd64.S
@@ -0,0 +1,1782 @@
+/* camellia-aesni-avx2-amd64.S - AES-NI/AVX2 implementation of Camellia cipher
+ *
+ * Copyright (C) 2013-2015,2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct CAMELLIA_context: */
+#define key_table 0
+#define key_bitlength CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+#define RIO %r8
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
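+/*
+ * filter_8bit evaluates f(x) = lo_t[x & 0x0f] ^ hi_t[x >> 4] on every
+ * byte; this nibble-split pshufb form is what lets the affine pre/post
+ * filters around AESENCLAST be stored as two 16-byte tables each.
+ */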
+
+#define ymm0_x xmm0
+#define ymm1_x xmm1
+#define ymm2_x xmm2
+#define ymm3_x xmm3
+#define ymm4_x xmm4
+#define ymm5_x xmm5
+#define ymm6_x xmm6
+#define ymm7_x xmm7
+#define ymm8_x xmm8
+#define ymm9_x xmm9
+#define ymm10_x xmm10
+#define ymm11_x xmm11
+#define ymm12_x xmm12
+#define ymm13_x xmm13
+#define ymm14_x xmm14
+#define ymm15_x xmm15
+
+/**********************************************************************
+ 32-way camellia
+ **********************************************************************/
+
+/*
+ * IN:
+ * x0..x7: byte-sliced AB state
+ * mem_cd: register pointer storing CD state
+ * key: index for key material
+ * OUT:
+ * x0..x7: new byte-sliced CD state
+ */
+#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
+ t7, mem_cd, key) \
+ /* \
+ * S-function with AES subbytes \
+ */ \
+ vbroadcasti128 .Linv_shift_row rRIP, t4; \
+ vpbroadcastd .L0f0f0f0f rRIP, t7; \
+ vbroadcasti128 .Lpre_tf_lo_s1 rRIP, t5; \
+ vbroadcasti128 .Lpre_tf_hi_s1 rRIP, t6; \
+ vbroadcasti128 .Lpre_tf_lo_s4 rRIP, t2; \
+ vbroadcasti128 .Lpre_tf_hi_s4 rRIP, t3; \
+ \
+ /* AES inverse shift rows */ \
+ vpshufb t4, x0, x0; \
+ vpshufb t4, x7, x7; \
+ vpshufb t4, x3, x3; \
+ vpshufb t4, x6, x6; \
+ vpshufb t4, x2, x2; \
+ vpshufb t4, x5, x5; \
+ vpshufb t4, x1, x1; \
+ vpshufb t4, x4, x4; \
+ \
+ /* prefilter sboxes 1, 2 and 3 */ \
+ /* prefilter sbox 4 */ \
+ filter_8bit(x0, t5, t6, t7, t4); \
+ filter_8bit(x7, t5, t6, t7, t4); \
+ vextracti128 $1, x0, t0##_x; \
+ vextracti128 $1, x7, t1##_x; \
+ filter_8bit(x3, t2, t3, t7, t4); \
+ filter_8bit(x6, t2, t3, t7, t4); \
+ vextracti128 $1, x3, t3##_x; \
+ vextracti128 $1, x6, t2##_x; \
+ filter_8bit(x2, t5, t6, t7, t4); \
+ filter_8bit(x5, t5, t6, t7, t4); \
+ filter_8bit(x1, t5, t6, t7, t4); \
+ filter_8bit(x4, t5, t6, t7, t4); \
+ \
+ vpxor t4##_x, t4##_x, t4##_x; \
+ \
+ /* AES subbytes + AES shift rows */ \
+ vextracti128 $1, x2, t6##_x; \
+ vextracti128 $1, x5, t5##_x; \
+ vaesenclast t4##_x, x0##_x, x0##_x; \
+ vaesenclast t4##_x, t0##_x, t0##_x; \
+ vaesenclast t4##_x, x7##_x, x7##_x; \
+ vaesenclast t4##_x, t1##_x, t1##_x; \
+ vaesenclast t4##_x, x3##_x, x3##_x; \
+ vaesenclast t4##_x, t3##_x, t3##_x; \
+ vaesenclast t4##_x, x6##_x, x6##_x; \
+ vaesenclast t4##_x, t2##_x, t2##_x; \
+ vinserti128 $1, t0##_x, x0, x0; \
+ vinserti128 $1, t1##_x, x7, x7; \
+ vinserti128 $1, t3##_x, x3, x3; \
+ vinserti128 $1, t2##_x, x6, x6; \
+ vextracti128 $1, x1, t3##_x; \
+ vextracti128 $1, x4, t2##_x; \
+ vbroadcasti128 .Lpost_tf_lo_s1 rRIP, t0; \
+ vbroadcasti128 .Lpost_tf_hi_s1 rRIP, t1; \
+ vaesenclast t4##_x, x2##_x, x2##_x; \
+ vaesenclast t4##_x, t6##_x, t6##_x; \
+ vaesenclast t4##_x, x5##_x, x5##_x; \
+ vaesenclast t4##_x, t5##_x, t5##_x; \
+ vaesenclast t4##_x, x1##_x, x1##_x; \
+ vaesenclast t4##_x, t3##_x, t3##_x; \
+ vaesenclast t4##_x, x4##_x, x4##_x; \
+ vaesenclast t4##_x, t2##_x, t2##_x; \
+ vinserti128 $1, t6##_x, x2, x2; \
+ vinserti128 $1, t5##_x, x5, x5; \
+ vinserti128 $1, t3##_x, x1, x1; \
+ vinserti128 $1, t2##_x, x4, x4; \
+ \
+ /* postfilter sboxes 1 and 4 */ \
+ vbroadcasti128 .Lpost_tf_lo_s3 rRIP, t2; \
+ vbroadcasti128 .Lpost_tf_hi_s3 rRIP, t3; \
+ filter_8bit(x0, t0, t1, t7, t4); \
+ filter_8bit(x7, t0, t1, t7, t4); \
+ filter_8bit(x3, t0, t1, t7, t6); \
+ filter_8bit(x6, t0, t1, t7, t6); \
+ \
+ /* postfilter sbox 3 */ \
+ vbroadcasti128 .Lpost_tf_lo_s2 rRIP, t4; \
+ vbroadcasti128 .Lpost_tf_hi_s2 rRIP, t5; \
+ filter_8bit(x2, t2, t3, t7, t6); \
+ filter_8bit(x5, t2, t3, t7, t6); \
+ \
+ vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
+ \
+ /* postfilter sbox 2 */ \
+ filter_8bit(x1, t4, t5, t7, t2); \
+ filter_8bit(x4, t4, t5, t7, t2); \
+ vpxor t7, t7, t7; \
+ \
+ vpsrldq $1, t0, t1; \
+ vpsrldq $2, t0, t2; \
+ vpshufb t7, t1, t1; \
+ vpsrldq $3, t0, t3; \
+ \
+ /* P-function */ \
+ vpxor x5, x0, x0; \
+ vpxor x6, x1, x1; \
+ vpxor x7, x2, x2; \
+ vpxor x4, x3, x3; \
+ \
+ vpshufb t7, t2, t2; \
+ vpsrldq $4, t0, t4; \
+ vpshufb t7, t3, t3; \
+ vpsrldq $5, t0, t5; \
+ vpshufb t7, t4, t4; \
+ \
+ vpxor x2, x4, x4; \
+ vpxor x3, x5, x5; \
+ vpxor x0, x6, x6; \
+ vpxor x1, x7, x7; \
+ \
+ vpsrldq $6, t0, t6; \
+ vpshufb t7, t5, t5; \
+ vpshufb t7, t6, t6; \
+ \
+ vpxor x7, x0, x0; \
+ vpxor x4, x1, x1; \
+ vpxor x5, x2, x2; \
+ vpxor x6, x3, x3; \
+ \
+ vpxor x3, x4, x4; \
+ vpxor x0, x5, x5; \
+ vpxor x1, x6, x6; \
+ vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+ \
+ /* Add key material and result to CD (x becomes new CD) */ \
+ \
+ vpxor t6, x1, x1; \
+ vpxor 5 * 32(mem_cd), x1, x1; \
+ \
+ vpsrldq $7, t0, t6; \
+ vpshufb t7, t0, t0; \
+ vpshufb t7, t6, t7; \
+ \
+ vpxor t7, x0, x0; \
+ vpxor 4 * 32(mem_cd), x0, x0; \
+ \
+ vpxor t5, x2, x2; \
+ vpxor 6 * 32(mem_cd), x2, x2; \
+ \
+ vpxor t4, x3, x3; \
+ vpxor 7 * 32(mem_cd), x3, x3; \
+ \
+ vpxor t3, x4, x4; \
+ vpxor 0 * 32(mem_cd), x4, x4; \
+ \
+ vpxor t2, x5, x5; \
+ vpxor 1 * 32(mem_cd), x5, x5; \
+ \
+ vpxor t1, x6, x6; \
+ vpxor 2 * 32(mem_cd), x6, x6; \
+ \
+ vpxor t0, x7, x7; \
+ vpxor 3 * 32(mem_cd), x7, x7;
+
+/*
+ * IN/OUT:
+ * x0..x7: byte-sliced AB state preloaded
+ * mem_ab: byte-sliced AB state in memory
+ * mem_cd: byte-sliced CD state in memory
+ */
+#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
+ roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \
+ \
+ vmovdqu x0, 4 * 32(mem_cd); \
+ vmovdqu x1, 5 * 32(mem_cd); \
+ vmovdqu x2, 6 * 32(mem_cd); \
+ vmovdqu x3, 7 * 32(mem_cd); \
+ vmovdqu x4, 0 * 32(mem_cd); \
+ vmovdqu x5, 1 * 32(mem_cd); \
+ vmovdqu x6, 2 * 32(mem_cd); \
+ vmovdqu x7, 3 * 32(mem_cd); \
+ \
+ roundsm32(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \
+ \
+ store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
+
+#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
+
+#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
+ /* Store new AB state */ \
+ vmovdqu x4, 4 * 32(mem_ab); \
+ vmovdqu x5, 5 * 32(mem_ab); \
+ vmovdqu x6, 6 * 32(mem_ab); \
+ vmovdqu x7, 7 * 32(mem_ab); \
+ vmovdqu x0, 0 * 32(mem_ab); \
+ vmovdqu x1, 1 * 32(mem_ab); \
+ vmovdqu x2, 2 * 32(mem_ab); \
+ vmovdqu x3, 3 * 32(mem_ab);
+
+#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i) \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
+
+#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i) \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
+
+/*
+ * IN:
+ * v0..3: byte-sliced 32-bit integers
+ * OUT:
+ * v0..3: (IN <<< 1)
+ */
+#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
+ vpcmpgtb v0, zero, t0; \
+ vpaddb v0, v0, v0; \
+ vpabsb t0, t0; \
+ \
+ vpcmpgtb v1, zero, t1; \
+ vpaddb v1, v1, v1; \
+ vpabsb t1, t1; \
+ \
+ vpcmpgtb v2, zero, t2; \
+ vpaddb v2, v2, v2; \
+ vpabsb t2, t2; \
+ \
+ vpor t0, v1, v1; \
+ \
+ vpcmpgtb v3, zero, t0; \
+ vpaddb v3, v3, v3; \
+ vpabsb t0, t0; \
+ \
+ vpor t1, v2, v2; \
+ vpor t2, v3, v3; \
+ vpor t0, v0, v0;
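+/*
+ * Per byte plane: vpcmpgtb against zero exposes each byte's top bit,
+ * vpaddb shifts every byte left by one, vpabsb turns the carry mask into
+ * 0/1, and the carries are or'ed into the neighbouring plane (wrapping
+ * around), giving the 1-bit left rotate of the byte-sliced 32-bit words.
+ */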
+
+/*
+ * IN:
+ * r: byte-sliced AB state in memory
+ * l: byte-sliced CD state in memory
+ * OUT:
+ * x0..x7: new byte-sliced CD state
+ */
+#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
+ tt1, tt2, tt3, kll, klr, krl, krr) \
+ /* \
+ * t0 = kll; \
+ * t0 &= ll; \
+ * lr ^= rol32(t0, 1); \
+ */ \
+ vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
+ vpxor tt0, tt0, tt0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpand l0, t0, t0; \
+ vpand l1, t1, t1; \
+ vpand l2, t2, t2; \
+ vpand l3, t3, t3; \
+ \
+ rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+ \
+ vpxor l4, t0, l4; \
+ vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
+ vmovdqu l4, 4 * 32(l); \
+ vpxor l5, t1, l5; \
+ vmovdqu l5, 5 * 32(l); \
+ vpxor l6, t2, l6; \
+ vmovdqu l6, 6 * 32(l); \
+ vpxor l7, t3, l7; \
+ vmovdqu l7, 7 * 32(l); \
+ \
+ /* \
+ * t2 = krr; \
+ * t2 |= rr; \
+ * rl ^= t2; \
+ */ \
+ \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpor 4 * 32(r), t0, t0; \
+ vpor 5 * 32(r), t1, t1; \
+ vpor 6 * 32(r), t2, t2; \
+ vpor 7 * 32(r), t3, t3; \
+ \
+ vpxor 0 * 32(r), t0, t0; \
+ vpxor 1 * 32(r), t1, t1; \
+ vpxor 2 * 32(r), t2, t2; \
+ vpxor 3 * 32(r), t3, t3; \
+ vmovdqu t0, 0 * 32(r); \
+ vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
+ vmovdqu t1, 1 * 32(r); \
+ vmovdqu t2, 2 * 32(r); \
+ vmovdqu t3, 3 * 32(r); \
+ \
+ /* \
+ * t2 = krl; \
+ * t2 &= rl; \
+ * rr ^= rol32(t2, 1); \
+ */ \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpand 0 * 32(r), t0, t0; \
+ vpand 1 * 32(r), t1, t1; \
+ vpand 2 * 32(r), t2, t2; \
+ vpand 3 * 32(r), t3, t3; \
+ \
+ rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+ \
+ vpxor 4 * 32(r), t0, t0; \
+ vpxor 5 * 32(r), t1, t1; \
+ vpxor 6 * 32(r), t2, t2; \
+ vpxor 7 * 32(r), t3, t3; \
+ vmovdqu t0, 4 * 32(r); \
+ vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
+ vmovdqu t1, 5 * 32(r); \
+ vmovdqu t2, 6 * 32(r); \
+ vmovdqu t3, 7 * 32(r); \
+ \
+ /* \
+ * t0 = klr; \
+ * t0 |= lr; \
+ * ll ^= t0; \
+ */ \
+ \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpor l4, t0, t0; \
+ vpor l5, t1, t1; \
+ vpor l6, t2, t2; \
+ vpor l7, t3, t3; \
+ \
+ vpxor l0, t0, l0; \
+ vmovdqu l0, 0 * 32(l); \
+ vpxor l1, t1, l1; \
+ vmovdqu l1, 1 * 32(l); \
+ vpxor l2, t2, l2; \
+ vmovdqu l2, 2 * 32(l); \
+ vpxor l3, t3, l3; \
+ vmovdqu l3, 3 * 32(l);
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
+ a3, b3, c3, d3, st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vbroadcasti128 .Lshufb_16x16b rRIP, a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(a0, b0, c0, d0, d2, d3); \
+ transpose_4x4(a1, b1, c1, d1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(a2, b2, c2, d2, b0, b1); \
+ transpose_4x4(a3, b3, c3, d3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, rio, key) \
+ vpbroadcastq key, x0; \
+ vpshufb .Lpack_bswap rRIP, x0, x0; \
+ \
+ vpxor 0 * 32(rio), x0, y7; \
+ vpxor 1 * 32(rio), x0, y6; \
+ vpxor 2 * 32(rio), x0, y5; \
+ vpxor 3 * 32(rio), x0, y4; \
+ vpxor 4 * 32(rio), x0, y3; \
+ vpxor 5 * 32(rio), x0, y2; \
+ vpxor 6 * 32(rio), x0, y1; \
+ vpxor 7 * 32(rio), x0, y0; \
+ vpxor 8 * 32(rio), x0, x7; \
+ vpxor 9 * 32(rio), x0, x6; \
+ vpxor 10 * 32(rio), x0, x5; \
+ vpxor 11 * 32(rio), x0, x4; \
+ vpxor 12 * 32(rio), x0, x3; \
+ vpxor 13 * 32(rio), x0, x2; \
+ vpxor 14 * 32(rio), x0, x1; \
+ vpxor 15 * 32(rio), x0, x0;
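+/*
+ * The 64-bit pre-whitening key is broadcast to every 128-bit lane,
+ * repacked by .Lpack_bswap (low qword byte-swapped per 32-bit word, high
+ * qword cleared), and xored into all 32 blocks as they are loaded.
+ */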
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd) \
+ byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
+ y4, y5, y6, y7, (mem_ab), (mem_cd)); \
+ \
+ vmovdqu x0, 0 * 32(mem_ab); \
+ vmovdqu x1, 1 * 32(mem_ab); \
+ vmovdqu x2, 2 * 32(mem_ab); \
+ vmovdqu x3, 3 * 32(mem_ab); \
+ vmovdqu x4, 4 * 32(mem_ab); \
+ vmovdqu x5, 5 * 32(mem_ab); \
+ vmovdqu x6, 6 * 32(mem_ab); \
+ vmovdqu x7, 7 * 32(mem_ab); \
+ vmovdqu y0, 0 * 32(mem_cd); \
+ vmovdqu y1, 1 * 32(mem_cd); \
+ vmovdqu y2, 2 * 32(mem_cd); \
+ vmovdqu y3, 3 * 32(mem_cd); \
+ vmovdqu y4, 4 * 32(mem_cd); \
+ vmovdqu y5, 5 * 32(mem_cd); \
+ vmovdqu y6, 6 * 32(mem_cd); \
+ vmovdqu y7, 7 * 32(mem_cd);
+
+/* de-byteslice, apply post-whitening and store blocks */
+#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+ y5, y6, y7, key, stack_tmp0, stack_tmp1) \
+ byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
+ y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
+ \
+ vmovdqu x0, stack_tmp0; \
+ \
+ vpbroadcastq key, x0; \
+ vpshufb .Lpack_bswap rRIP, x0, x0; \
+ \
+ vpxor x0, y7, y7; \
+ vpxor x0, y6, y6; \
+ vpxor x0, y5, y5; \
+ vpxor x0, y4, y4; \
+ vpxor x0, y3, y3; \
+ vpxor x0, y2, y2; \
+ vpxor x0, y1, y1; \
+ vpxor x0, y0, y0; \
+ vpxor x0, x7, x7; \
+ vpxor x0, x6, x6; \
+ vpxor x0, x5, x5; \
+ vpxor x0, x4, x4; \
+ vpxor x0, x3, x3; \
+ vpxor x0, x2, x2; \
+ vpxor x0, x1, x1; \
+ vpxor stack_tmp0, x0, x0;
+
+#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, rio) \
+ vmovdqu x0, 0 * 32(rio); \
+ vmovdqu x1, 1 * 32(rio); \
+ vmovdqu x2, 2 * 32(rio); \
+ vmovdqu x3, 3 * 32(rio); \
+ vmovdqu x4, 4 * 32(rio); \
+ vmovdqu x5, 5 * 32(rio); \
+ vmovdqu x6, 6 * 32(rio); \
+ vmovdqu x7, 7 * 32(rio); \
+ vmovdqu y0, 8 * 32(rio); \
+ vmovdqu y1, 9 * 32(rio); \
+ vmovdqu y2, 10 * 32(rio); \
+ vmovdqu y3, 11 * 32(rio); \
+ vmovdqu y4, 12 * 32(rio); \
+ vmovdqu y5, 13 * 32(rio); \
+ vmovdqu y6, 14 * 32(rio); \
+ vmovdqu y7, 15 * 32(rio);
+
+.text
+.align 32
+
+#define SHUFB_BYTES(idx) \
+ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+.Lshufb_16x16b:
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+.Lpack_bswap:
+ .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+ .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox1, sbox2, sbox3:
+ * swap_bitendianness(
+ * isom_map_camellia_to_aes(
+ * camellia_f(
+ * swap_bitendianness(in)
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s1:
+ .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
+ .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
+.Lpre_tf_hi_s1:
+ .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
+ .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox4:
+ * swap_bitendianness(
+ * isom_map_camellia_to_aes(
+ * camellia_f(
+ * swap_bitendianness(in <<< 1)
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s4:
+ .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
+ .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
+.Lpre_tf_hi_s4:
+ .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
+ .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox1, sbox4:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s1:
+ .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
+ .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
+.Lpost_tf_hi_s1:
+ .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
+ .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox2:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * ) <<< 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s2:
+ .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
+ .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
+.Lpost_tf_hi_s2:
+ .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
+ .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox3:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * ) >>> 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s3:
+ .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
+ .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
+.Lpost_tf_hi_s3:
+ .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
+ .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+
+.align 8
+ELF(.type __camellia_enc_blk32,@function;)
+
+__camellia_enc_blk32:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rax: temporary storage, 512 bytes
+ * %r8d: 24 for 16 byte key, 32 for larger
+ * %ymm0..%ymm15: 32 plaintext blocks
+ * output:
+ * %ymm0..%ymm15: 32 encrypted blocks, order swapped:
+ * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ */
+ CFI_STARTPROC();
+
+ leaq 8 * 32(%rax), %rcx;
+
+ leaq (-8 * 8)(CTX, %r8, 8), %r8;
+
+ inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx);
+
+.align 8
+.Lenc_loop:
+ enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 0);
+
+ cmpq %r8, CTX;
+ je .Lenc_done;
+ leaq (8 * 8)(CTX), CTX;
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table) + 0)(CTX),
+ ((key_table) + 4)(CTX),
+ ((key_table) + 8)(CTX),
+ ((key_table) + 12)(CTX));
+ jmp .Lenc_loop;
+
+.align 8
+.Lenc_done:
+ /* load CD for output */
+ vmovdqu 0 * 32(%rcx), %ymm8;
+ vmovdqu 1 * 32(%rcx), %ymm9;
+ vmovdqu 2 * 32(%rcx), %ymm10;
+ vmovdqu 3 * 32(%rcx), %ymm11;
+ vmovdqu 4 * 32(%rcx), %ymm12;
+ vmovdqu 5 * 32(%rcx), %ymm13;
+ vmovdqu 6 * 32(%rcx), %ymm14;
+ vmovdqu 7 * 32(%rcx), %ymm15;
+
+ outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, ((key_table) + 8 * 8)(%r8), (%rax), 1 * 32(%rax));
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;)
+
+.align 8
+ELF(.type __camellia_dec_blk32,@function;)
+
+__camellia_dec_blk32:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rax: temporary storage, 512 bytes
+ * %r8d: 24 for 16 byte key, 32 for larger
+	 *	%ymm0..%ymm15: 32 encrypted blocks
+	 * output:
+	 *	%ymm0..%ymm15: 32 plaintext blocks, order swapped:
+	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ */
+ CFI_STARTPROC();
+
+ movq %r8, %rcx;
+ movq CTX, %r8
+ leaq (-8 * 8)(CTX, %rcx, 8), CTX;
+
+ leaq 8 * 32(%rax), %rcx;
+
+ inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx);
+
+.align 8
+.Ldec_loop:
+ dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 0);
+
+ cmpq %r8, CTX;
+ je .Ldec_done;
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table) + 8)(CTX),
+ ((key_table) + 12)(CTX),
+ ((key_table) + 0)(CTX),
+ ((key_table) + 4)(CTX));
+
+ leaq (-8 * 8)(CTX), CTX;
+ jmp .Ldec_loop;
+
+.align 8
+.Ldec_done:
+ /* load CD for output */
+ vmovdqu 0 * 32(%rcx), %ymm8;
+ vmovdqu 1 * 32(%rcx), %ymm9;
+ vmovdqu 2 * 32(%rcx), %ymm10;
+ vmovdqu 3 * 32(%rcx), %ymm11;
+ vmovdqu 4 * 32(%rcx), %ymm12;
+ vmovdqu 5 * 32(%rcx), %ymm13;
+ vmovdqu 6 * 32(%rcx), %ymm14;
+ vmovdqu 7 * 32(%rcx), %ymm15;
+
+ outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;)
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
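The inc_le128 macro above increments a 128-bit counter kept in little-endian lane order: vpcmpeqq yields an all-ones low lane exactly when that lane is about to wrap, vpsubq of the -1 constant adds one, and the byte-shifted mask subtraction propagates the carry into the high lane. A scalar sketch of the same logic (illustrative only):

    #include <stdint.h>

    static void inc_le128_sketch (uint64_t lane[2])
    {
      int carry = (lane[0] == UINT64_MAX); /* vpcmpeqq against the -1:0 constant */
      lane[0] += 1;                        /* vpsubq of -1 == add 1 */
      lane[1] += carry;                    /* shifted mask subtract == add carry */
    }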
+.align 8
+.globl _gcry_camellia_aesni_avx2_ctr_enc
+ELF(.type _gcry_camellia_aesni_avx2_ctr_enc,@function;)
+
+_gcry_camellia_aesni_avx2_ctr_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ * %rcx: iv (big endian, 128bit)
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ movq 8(%rcx), %r11;
+ bswapq %r11;
+
+ vzeroupper;
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ subq $(16 * 32), %rsp;
+ andq $~63, %rsp;
+ movq %rsp, %rax;
+
+ vpcmpeqd %ymm15, %ymm15, %ymm15;
+ vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
+
+ /* load IV and byteswap */
+ vmovdqu (%rcx), %xmm0;
+ vpshufb .Lbswap128_mask rRIP, %xmm0, %xmm0;
+ vmovdqa %xmm0, %xmm1;
+ inc_le128(%xmm0, %xmm15, %xmm14);
+ vbroadcasti128 .Lbswap128_mask rRIP, %ymm14;
+ vinserti128 $1, %xmm0, %ymm1, %ymm0;
+ vpshufb %ymm14, %ymm0, %ymm13;
+ vmovdqu %ymm13, 15 * 32(%rax);
+
+ /* check need for handling 64-bit overflow and carry */
+ cmpq $(0xffffffffffffffff - 32), %r11;
+ ja .Lload_ctr_carry;
+
+ /* construct IVs */
+ vpaddq %ymm15, %ymm15, %ymm15; /* ab: -2:0 ; cd: -2:0 */
+ vpsubq %ymm15, %ymm0, %ymm0;
+ vpshufb %ymm14, %ymm0, %ymm13;
+ vmovdqu %ymm13, 14 * 32(%rax);
+ vpsubq %ymm15, %ymm0, %ymm0;
+ vpshufb %ymm14, %ymm0, %ymm13;
+ vmovdqu %ymm13, 13 * 32(%rax);
+ vpsubq %ymm15, %ymm0, %ymm0;
+ vpshufb %ymm14, %ymm0, %ymm12;
+ vpsubq %ymm15, %ymm0, %ymm0;
+ vpshufb %ymm14, %ymm0, %ymm11;
+ vpsubq %ymm15, %ymm0, %ymm0;
+ vpshufb %ymm14, %ymm0, %ymm10;
+ vpsubq %ymm15, %ymm0, %ymm0;
+ vpshufb %ymm14, %ymm0, %ymm9;
+ vpsubq %ymm15, %ymm0, %ymm0;
+ vpshufb %ymm14, %ymm0, %ymm8;
+ vpsubq %ymm15, %ymm0, %ymm0;
+ vpshufb %ymm14, %ymm0, %ymm7;
+ vpsubq %ymm15, %ymm0, %ymm0;
+ vpshufb %ymm14, %ymm0, %ymm6;
+ vpsubq %ymm15, %ymm0, %ymm0;
+ vpshufb %ymm14, %ymm0, %ymm5;
+ vpsubq %ymm15, %ymm0, %ymm0;
+ vpshufb %ymm14, %ymm0, %ymm4;
+ vpsubq %ymm15, %ymm0, %ymm0;
+ vpshufb %ymm14, %ymm0, %ymm3;
+ vpsubq %ymm15, %ymm0, %ymm0;
+ vpshufb %ymm14, %ymm0, %ymm2;
+ vpsubq %ymm15, %ymm0, %ymm0;
+ vpshufb %ymm14, %ymm0, %ymm1;
+ vpsubq %ymm15, %ymm0, %ymm0; /* +30 ; +31 */
+ vpsubq %xmm15, %xmm0, %xmm13; /* +32 */
+ vpshufb %ymm14, %ymm0, %ymm0;
+ vpshufb %xmm14, %xmm13, %xmm13;
+ vmovdqu %xmm13, (%rcx);
+
+ jmp .Lload_ctr_done;
+
+.align 4
+.Lload_ctr_carry:
+ /* construct IVs */
+ inc_le128(%ymm0, %ymm15, %ymm13); /* ab: le1 ; cd: le2 */
+ inc_le128(%ymm0, %ymm15, %ymm13); /* ab: le2 ; cd: le3 */
+ vpshufb %ymm14, %ymm0, %ymm13;
+ vmovdqu %ymm13, 14 * 32(%rax);
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm13;
+ vmovdqu %ymm13, 13 * 32(%rax);
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm12;
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm11;
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm10;
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm9;
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm8;
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm7;
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm6;
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm5;
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm4;
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm3;
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm2;
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm1;
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ inc_le128(%ymm0, %ymm15, %ymm13);
+ vextracti128 $1, %ymm0, %xmm13;
+ vpshufb %ymm14, %ymm0, %ymm0;
+ inc_le128(%xmm13, %xmm15, %xmm14);
+ vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13;
+ vmovdqu %xmm13, (%rcx);
+
+.align 4
+.Lload_ctr_done:
+ /* inpack16_pre: */
+ vpbroadcastq (key_table)(CTX), %ymm15;
+ vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15;
+ vpxor %ymm0, %ymm15, %ymm0;
+ vpxor %ymm1, %ymm15, %ymm1;
+ vpxor %ymm2, %ymm15, %ymm2;
+ vpxor %ymm3, %ymm15, %ymm3;
+ vpxor %ymm4, %ymm15, %ymm4;
+ vpxor %ymm5, %ymm15, %ymm5;
+ vpxor %ymm6, %ymm15, %ymm6;
+ vpxor %ymm7, %ymm15, %ymm7;
+ vpxor %ymm8, %ymm15, %ymm8;
+ vpxor %ymm9, %ymm15, %ymm9;
+ vpxor %ymm10, %ymm15, %ymm10;
+ vpxor %ymm11, %ymm15, %ymm11;
+ vpxor %ymm12, %ymm15, %ymm12;
+ vpxor 13 * 32(%rax), %ymm15, %ymm13;
+ vpxor 14 * 32(%rax), %ymm15, %ymm14;
+ vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+ call __camellia_enc_blk32;
+
+ vpxor 0 * 32(%rdx), %ymm7, %ymm7;
+ vpxor 1 * 32(%rdx), %ymm6, %ymm6;
+ vpxor 2 * 32(%rdx), %ymm5, %ymm5;
+ vpxor 3 * 32(%rdx), %ymm4, %ymm4;
+ vpxor 4 * 32(%rdx), %ymm3, %ymm3;
+ vpxor 5 * 32(%rdx), %ymm2, %ymm2;
+ vpxor 6 * 32(%rdx), %ymm1, %ymm1;
+ vpxor 7 * 32(%rdx), %ymm0, %ymm0;
+ vpxor 8 * 32(%rdx), %ymm15, %ymm15;
+ vpxor 9 * 32(%rdx), %ymm14, %ymm14;
+ vpxor 10 * 32(%rdx), %ymm13, %ymm13;
+ vpxor 11 * 32(%rdx), %ymm12, %ymm12;
+ vpxor 12 * 32(%rdx), %ymm11, %ymm11;
+ vpxor 13 * 32(%rdx), %ymm10, %ymm10;
+ vpxor 14 * 32(%rdx), %ymm9, %ymm9;
+ vpxor 15 * 32(%rdx), %ymm8, %ymm8;
+ leaq 32 * 16(%rdx), %rdx;
+
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ vzeroall;
+
+ leave;
+ CFI_LEAVE();
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx2_ctr_enc,.-_gcry_camellia_aesni_avx2_ctr_enc;)
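In outline, the CTR routine above byteswaps the big-endian IV, uses the cmpq/ja pair to check whether the low 64 bits can take 32 increments without overflowing, and if so derives all 32 counter blocks with plain 64-bit additions (falling back to full 128-bit inc_le128 steps otherwise) before encrypting them and XORing with the source. A scalar per-block sketch of the mode, with camellia_encrypt_block() standing in for the 32-block kernel (not a real symbol in this file):

    #include <stdint.h>
    #include <stddef.h>

    extern void camellia_encrypt_block (const void *ctx, uint8_t out[16],
                                        const uint8_t in[16]); /* stand-in */

    static void ctr_crypt_sketch (const void *ctx, uint8_t *dst,
                                  const uint8_t *src, size_t nblocks,
                                  uint8_t ctr[16])
    {
      uint8_t keystream[16];
      while (nblocks--)
        {
          camellia_encrypt_block (ctx, keystream, ctr);
          for (int i = 0; i < 16; i++)
            dst[i] = src[i] ^ keystream[i];
          /* 128-bit big-endian increment, as the asm does via bswap + add. */
          for (int i = 15; i >= 0; i--)
            if (++ctr[i])
              break;
          dst += 16;
          src += 16;
        }
    }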
+
+.align 8
+.globl _gcry_camellia_aesni_avx2_cbc_dec
+ELF(.type _gcry_camellia_aesni_avx2_cbc_dec,@function;)
+
+_gcry_camellia_aesni_avx2_cbc_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ * %rcx: iv
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ vzeroupper;
+
+ movq %rcx, %r9;
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ subq $(16 * 32), %rsp;
+ andq $~63, %rsp;
+ movq %rsp, %rax;
+
+ inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx, (key_table)(CTX, %r8, 8));
+
+ call __camellia_dec_blk32;
+
+ /* XOR output with IV */
+ vmovdqu %ymm8, (%rax);
+ vmovdqu (%r9), %xmm8;
+ vinserti128 $1, (%rdx), %ymm8, %ymm8;
+ vpxor %ymm8, %ymm7, %ymm7;
+ vmovdqu (%rax), %ymm8;
+ vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
+ vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
+ vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
+ vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
+ vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
+ vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
+ vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
+ vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
+ vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
+ vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
+ vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
+ vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
+ vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
+ vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
+ vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
+ movq (15 * 32 + 16 + 0)(%rdx), %rax;
+ movq (15 * 32 + 16 + 8)(%rdx), %rcx;
+
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ /* store new IV */
+ movq %rax, (0)(%r9);
+ movq %rcx, (8)(%r9);
+
+ vzeroall;
+
+ leave;
+ CFI_LEAVE();
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx2_cbc_dec,.-_gcry_camellia_aesni_avx2_cbc_dec;)
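The CBC routine above decrypts all 32 blocks first and applies the chaining afterwards: each plaintext is the raw decryption XORed with the previous ciphertext (the IV for block 0), and the last ciphertext block is written back as the new IV. A scalar sketch of that dataflow, again with a stand-in block function:

    #include <stdint.h>
    #include <string.h>

    extern void camellia_decrypt_block (const void *ctx, uint8_t out[16],
                                        const uint8_t in[16]); /* stand-in */

    static void cbc_dec_sketch (const void *ctx, uint8_t *dst,
                                const uint8_t *src, size_t nblocks,
                                uint8_t iv[16])
    {
      uint8_t prev[16], tmp[16];
      memcpy (prev, iv, 16);
      while (nblocks--)
        {
          camellia_decrypt_block (ctx, tmp, src);
          for (int i = 0; i < 16; i++)
            tmp[i] ^= prev[i];      /* XOR with previous ciphertext / IV */
          memcpy (prev, src, 16);   /* keep ciphertext in case dst aliases src */
          memcpy (dst, tmp, 16);
          dst += 16;
          src += 16;
        }
      memcpy (iv, prev, 16);        /* last ciphertext becomes the new IV */
    }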
+
+.align 8
+.globl _gcry_camellia_aesni_avx2_cfb_dec
+ELF(.type _gcry_camellia_aesni_avx2_cfb_dec,@function;)
+
+_gcry_camellia_aesni_avx2_cfb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ * %rcx: iv
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ vzeroupper;
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ subq $(16 * 32), %rsp;
+ andq $~63, %rsp;
+ movq %rsp, %rax;
+
+ /* inpack16_pre: */
+ vpbroadcastq (key_table)(CTX), %ymm0;
+ vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0;
+ vmovdqu (%rcx), %xmm15;
+ vinserti128 $1, (%rdx), %ymm15, %ymm15;
+ vpxor %ymm15, %ymm0, %ymm15;
+ vmovdqu (15 * 32 + 16)(%rdx), %xmm1;
+ vmovdqu %xmm1, (%rcx); /* store new IV */
+ vpxor (0 * 32 + 16)(%rdx), %ymm0, %ymm14;
+ vpxor (1 * 32 + 16)(%rdx), %ymm0, %ymm13;
+ vpxor (2 * 32 + 16)(%rdx), %ymm0, %ymm12;
+ vpxor (3 * 32 + 16)(%rdx), %ymm0, %ymm11;
+ vpxor (4 * 32 + 16)(%rdx), %ymm0, %ymm10;
+ vpxor (5 * 32 + 16)(%rdx), %ymm0, %ymm9;
+ vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm8;
+ vpxor (7 * 32 + 16)(%rdx), %ymm0, %ymm7;
+ vpxor (8 * 32 + 16)(%rdx), %ymm0, %ymm6;
+ vpxor (9 * 32 + 16)(%rdx), %ymm0, %ymm5;
+ vpxor (10 * 32 + 16)(%rdx), %ymm0, %ymm4;
+ vpxor (11 * 32 + 16)(%rdx), %ymm0, %ymm3;
+ vpxor (12 * 32 + 16)(%rdx), %ymm0, %ymm2;
+ vpxor (13 * 32 + 16)(%rdx), %ymm0, %ymm1;
+ vpxor (14 * 32 + 16)(%rdx), %ymm0, %ymm0;
+
+ call __camellia_enc_blk32;
+
+ vpxor 0 * 32(%rdx), %ymm7, %ymm7;
+ vpxor 1 * 32(%rdx), %ymm6, %ymm6;
+ vpxor 2 * 32(%rdx), %ymm5, %ymm5;
+ vpxor 3 * 32(%rdx), %ymm4, %ymm4;
+ vpxor 4 * 32(%rdx), %ymm3, %ymm3;
+ vpxor 5 * 32(%rdx), %ymm2, %ymm2;
+ vpxor 6 * 32(%rdx), %ymm1, %ymm1;
+ vpxor 7 * 32(%rdx), %ymm0, %ymm0;
+ vpxor 8 * 32(%rdx), %ymm15, %ymm15;
+ vpxor 9 * 32(%rdx), %ymm14, %ymm14;
+ vpxor 10 * 32(%rdx), %ymm13, %ymm13;
+ vpxor 11 * 32(%rdx), %ymm12, %ymm12;
+ vpxor 12 * 32(%rdx), %ymm11, %ymm11;
+ vpxor 13 * 32(%rdx), %ymm10, %ymm10;
+ vpxor 14 * 32(%rdx), %ymm9, %ymm9;
+ vpxor 15 * 32(%rdx), %ymm8, %ymm8;
+
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ vzeroall;
+
+ leave;
+ CFI_LEAVE();
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec;)
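CFB decryption, as above, reuses the encryption kernel: each keystream block is the encryption of the previous ciphertext block (the IV for block 0), XORed with the current ciphertext. Scalar sketch with a stand-in block function:

    #include <stdint.h>
    #include <string.h>

    extern void camellia_encrypt_block (const void *ctx, uint8_t out[16],
                                        const uint8_t in[16]); /* stand-in */

    static void cfb_dec_sketch (const void *ctx, uint8_t *dst,
                                const uint8_t *src, size_t nblocks,
                                uint8_t iv[16])
    {
      uint8_t keystream[16];
      while (nblocks--)
        {
          camellia_encrypt_block (ctx, keystream, iv); /* E(previous ciphertext) */
          memcpy (iv, src, 16);                        /* current ciphertext -> next IV */
          for (int i = 0; i < 16; i++)
            dst[i] = keystream[i] ^ iv[i];
          dst += 16;
          src += 16;
        }
    }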
+
+.align 8
+.globl _gcry_camellia_aesni_avx2_ocb_enc
+ELF(.type _gcry_camellia_aesni_avx2_ocb_enc,@function;)
+
+_gcry_camellia_aesni_avx2_ocb_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[32])
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ vzeroupper;
+
+ subq $(16 * 32 + 4 * 8), %rsp;
+ andq $~63, %rsp;
+ movq %rsp, %rax;
+
+ movq %r10, (16 * 32 + 0 * 8)(%rsp);
+ movq %r11, (16 * 32 + 1 * 8)(%rsp);
+ movq %r12, (16 * 32 + 2 * 8)(%rsp);
+ movq %r13, (16 * 32 + 3 * 8)(%rsp);
+ CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8);
+ CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8);
+ CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8);
+ CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8);
+
+ vmovdqu (%rcx), %xmm14;
+ vmovdqu (%r8), %xmm13;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rdx), yreg; \
+ vpxor (l0reg), %xmm14, %xmm15; \
+ vpxor (l1reg), %xmm15, %xmm14; \
+ vinserti128 $1, %xmm14, %ymm15, %ymm15; \
+ vpxor yreg, %ymm13, %ymm13; \
+ vpxor yreg, %ymm15, yreg; \
+ vmovdqu %ymm15, (n * 32)(%rsi);
+
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %r11, %ymm0);
+ vmovdqu %ymm0, (15 * 32)(%rax);
+ OCB_INPUT(1, %r12, %r13, %ymm0);
+ vmovdqu %ymm0, (14 * 32)(%rax);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(2, %r10, %r11, %ymm0);
+ vmovdqu %ymm0, (13 * 32)(%rax);
+ OCB_INPUT(3, %r12, %r13, %ymm12);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %r11, %ymm11);
+ OCB_INPUT(5, %r12, %r13, %ymm10);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(6, %r10, %r11, %ymm9);
+ OCB_INPUT(7, %r12, %r13, %ymm8);
+ movq (16 * 8)(%r9), %r10;
+ movq (17 * 8)(%r9), %r11;
+ movq (18 * 8)(%r9), %r12;
+ movq (19 * 8)(%r9), %r13;
+ OCB_INPUT(8, %r10, %r11, %ymm7);
+ OCB_INPUT(9, %r12, %r13, %ymm6);
+ movq (20 * 8)(%r9), %r10;
+ movq (21 * 8)(%r9), %r11;
+ movq (22 * 8)(%r9), %r12;
+ movq (23 * 8)(%r9), %r13;
+ OCB_INPUT(10, %r10, %r11, %ymm5);
+ OCB_INPUT(11, %r12, %r13, %ymm4);
+ movq (24 * 8)(%r9), %r10;
+ movq (25 * 8)(%r9), %r11;
+ movq (26 * 8)(%r9), %r12;
+ movq (27 * 8)(%r9), %r13;
+ OCB_INPUT(12, %r10, %r11, %ymm3);
+ OCB_INPUT(13, %r12, %r13, %ymm2);
+ movq (28 * 8)(%r9), %r10;
+ movq (29 * 8)(%r9), %r11;
+ movq (30 * 8)(%r9), %r12;
+ movq (31 * 8)(%r9), %r13;
+ OCB_INPUT(14, %r10, %r11, %ymm1);
+ OCB_INPUT(15, %r12, %r13, %ymm0);
+#undef OCB_INPUT
+
+ vextracti128 $1, %ymm13, %xmm15;
+ vmovdqu %xmm14, (%rcx);
+ vpxor %xmm13, %xmm15, %xmm15;
+ vmovdqu %xmm15, (%r8);
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %r10d;
+ cmovel %r10d, %r8d; /* max */
+
+ /* inpack16_pre: */
+ vpbroadcastq (key_table)(CTX), %ymm15;
+ vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15;
+ vpxor %ymm0, %ymm15, %ymm0;
+ vpxor %ymm1, %ymm15, %ymm1;
+ vpxor %ymm2, %ymm15, %ymm2;
+ vpxor %ymm3, %ymm15, %ymm3;
+ vpxor %ymm4, %ymm15, %ymm4;
+ vpxor %ymm5, %ymm15, %ymm5;
+ vpxor %ymm6, %ymm15, %ymm6;
+ vpxor %ymm7, %ymm15, %ymm7;
+ vpxor %ymm8, %ymm15, %ymm8;
+ vpxor %ymm9, %ymm15, %ymm9;
+ vpxor %ymm10, %ymm15, %ymm10;
+ vpxor %ymm11, %ymm15, %ymm11;
+ vpxor %ymm12, %ymm15, %ymm12;
+ vpxor 13 * 32(%rax), %ymm15, %ymm13;
+ vpxor 14 * 32(%rax), %ymm15, %ymm14;
+ vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+ call __camellia_enc_blk32;
+
+ vpxor 0 * 32(%rsi), %ymm7, %ymm7;
+ vpxor 1 * 32(%rsi), %ymm6, %ymm6;
+ vpxor 2 * 32(%rsi), %ymm5, %ymm5;
+ vpxor 3 * 32(%rsi), %ymm4, %ymm4;
+ vpxor 4 * 32(%rsi), %ymm3, %ymm3;
+ vpxor 5 * 32(%rsi), %ymm2, %ymm2;
+ vpxor 6 * 32(%rsi), %ymm1, %ymm1;
+ vpxor 7 * 32(%rsi), %ymm0, %ymm0;
+ vpxor 8 * 32(%rsi), %ymm15, %ymm15;
+ vpxor 9 * 32(%rsi), %ymm14, %ymm14;
+ vpxor 10 * 32(%rsi), %ymm13, %ymm13;
+ vpxor 11 * 32(%rsi), %ymm12, %ymm12;
+ vpxor 12 * 32(%rsi), %ymm11, %ymm11;
+ vpxor 13 * 32(%rsi), %ymm10, %ymm10;
+ vpxor 14 * 32(%rsi), %ymm9, %ymm9;
+ vpxor 15 * 32(%rsi), %ymm8, %ymm8;
+
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ vzeroall;
+
+ movq (16 * 32 + 0 * 8)(%rsp), %r10;
+ movq (16 * 32 + 1 * 8)(%rsp), %r11;
+ movq (16 * 32 + 2 * 8)(%rsp), %r12;
+ movq (16 * 32 + 3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ leave;
+ CFI_LEAVE();
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx2_ocb_enc,.-_gcry_camellia_aesni_avx2_ocb_enc;)
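Each OCB_INPUT expansion above covers two blocks per ymm register: it advances the offset twice through the precomputed L table, folds both plaintexts into the running checksum, and parks the paired offsets in the destination buffer so the XOR after __camellia_enc_blk32 can add them back. A scalar per-block sketch of that bookkeeping, following the equations quoted in the comments (encipher() and the L argument are stand-ins):

    #include <stdint.h>

    static void xor16 (uint8_t *d, const uint8_t *a, const uint8_t *b)
    {
      for (int i = 0; i < 16; i++)
        d[i] = a[i] ^ b[i];
    }

    static void ocb_enc_block_sketch (const void *ctx, uint8_t c[16],
                                      const uint8_t p[16], uint8_t offset[16],
                                      uint8_t checksum[16],
                                      const uint8_t l_ntz[16],
                                      void (*encipher) (const void *, uint8_t *,
                                                        const uint8_t *))
    {
      uint8_t tmp[16];
      xor16 (offset, offset, l_ntz);  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
      xor16 (checksum, checksum, p);  /* Checksum_i = Checksum_{i-1} xor P_i */
      xor16 (tmp, p, offset);
      encipher (ctx, c, tmp);
      xor16 (c, c, offset);           /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
    }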
+
+.align 8
+.globl _gcry_camellia_aesni_avx2_ocb_dec
+ELF(.type _gcry_camellia_aesni_avx2_ocb_dec,@function;)
+
+_gcry_camellia_aesni_avx2_ocb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[32])
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ vzeroupper;
+
+ subq $(16 * 32 + 4 * 8), %rsp;
+ andq $~63, %rsp;
+ movq %rsp, %rax;
+
+ movq %r10, (16 * 32 + 0 * 8)(%rsp);
+ movq %r11, (16 * 32 + 1 * 8)(%rsp);
+ movq %r12, (16 * 32 + 2 * 8)(%rsp);
+ movq %r13, (16 * 32 + 3 * 8)(%rsp);
+ CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8);
+ CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8);
+ CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8);
+ CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8);
+
+ vmovdqu (%rcx), %xmm14;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rdx), yreg; \
+ vpxor (l0reg), %xmm14, %xmm15; \
+ vpxor (l1reg), %xmm15, %xmm14; \
+ vinserti128 $1, %xmm14, %ymm15, %ymm15; \
+ vpxor yreg, %ymm15, yreg; \
+ vmovdqu %ymm15, (n * 32)(%rsi);
+
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %r11, %ymm0);
+ vmovdqu %ymm0, (15 * 32)(%rax);
+ OCB_INPUT(1, %r12, %r13, %ymm0);
+ vmovdqu %ymm0, (14 * 32)(%rax);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(2, %r10, %r11, %ymm13);
+ OCB_INPUT(3, %r12, %r13, %ymm12);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %r11, %ymm11);
+ OCB_INPUT(5, %r12, %r13, %ymm10);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(6, %r10, %r11, %ymm9);
+ OCB_INPUT(7, %r12, %r13, %ymm8);
+ movq (16 * 8)(%r9), %r10;
+ movq (17 * 8)(%r9), %r11;
+ movq (18 * 8)(%r9), %r12;
+ movq (19 * 8)(%r9), %r13;
+ OCB_INPUT(8, %r10, %r11, %ymm7);
+ OCB_INPUT(9, %r12, %r13, %ymm6);
+ movq (20 * 8)(%r9), %r10;
+ movq (21 * 8)(%r9), %r11;
+ movq (22 * 8)(%r9), %r12;
+ movq (23 * 8)(%r9), %r13;
+ OCB_INPUT(10, %r10, %r11, %ymm5);
+ OCB_INPUT(11, %r12, %r13, %ymm4);
+ movq (24 * 8)(%r9), %r10;
+ movq (25 * 8)(%r9), %r11;
+ movq (26 * 8)(%r9), %r12;
+ movq (27 * 8)(%r9), %r13;
+ OCB_INPUT(12, %r10, %r11, %ymm3);
+ OCB_INPUT(13, %r12, %r13, %ymm2);
+ movq (28 * 8)(%r9), %r10;
+ movq (29 * 8)(%r9), %r11;
+ movq (30 * 8)(%r9), %r12;
+ movq (31 * 8)(%r9), %r13;
+ OCB_INPUT(14, %r10, %r11, %ymm1);
+ OCB_INPUT(15, %r12, %r13, %ymm0);
+#undef OCB_INPUT
+
+ vmovdqu %xmm14, (%rcx);
+
+ movq %r8, %r10;
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %r9d;
+ cmovel %r9d, %r8d; /* max */
+
+ /* inpack16_pre: */
+ vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
+ vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15;
+ vpxor %ymm0, %ymm15, %ymm0;
+ vpxor %ymm1, %ymm15, %ymm1;
+ vpxor %ymm2, %ymm15, %ymm2;
+ vpxor %ymm3, %ymm15, %ymm3;
+ vpxor %ymm4, %ymm15, %ymm4;
+ vpxor %ymm5, %ymm15, %ymm5;
+ vpxor %ymm6, %ymm15, %ymm6;
+ vpxor %ymm7, %ymm15, %ymm7;
+ vpxor %ymm8, %ymm15, %ymm8;
+ vpxor %ymm9, %ymm15, %ymm9;
+ vpxor %ymm10, %ymm15, %ymm10;
+ vpxor %ymm11, %ymm15, %ymm11;
+ vpxor %ymm12, %ymm15, %ymm12;
+ vpxor %ymm13, %ymm15, %ymm13;
+ vpxor 14 * 32(%rax), %ymm15, %ymm14;
+ vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+ call __camellia_dec_blk32;
+
+ vpxor 0 * 32(%rsi), %ymm7, %ymm7;
+ vpxor 1 * 32(%rsi), %ymm6, %ymm6;
+ vpxor 2 * 32(%rsi), %ymm5, %ymm5;
+ vpxor 3 * 32(%rsi), %ymm4, %ymm4;
+ vpxor 4 * 32(%rsi), %ymm3, %ymm3;
+ vpxor 5 * 32(%rsi), %ymm2, %ymm2;
+ vpxor 6 * 32(%rsi), %ymm1, %ymm1;
+ vpxor 7 * 32(%rsi), %ymm0, %ymm0;
+ vmovdqu %ymm7, (7 * 32)(%rax);
+ vmovdqu %ymm6, (6 * 32)(%rax);
+ vpxor 8 * 32(%rsi), %ymm15, %ymm15;
+ vpxor 9 * 32(%rsi), %ymm14, %ymm14;
+ vpxor 10 * 32(%rsi), %ymm13, %ymm13;
+ vpxor 11 * 32(%rsi), %ymm12, %ymm12;
+ vpxor 12 * 32(%rsi), %ymm11, %ymm11;
+ vpxor 13 * 32(%rsi), %ymm10, %ymm10;
+ vpxor 14 * 32(%rsi), %ymm9, %ymm9;
+ vpxor 15 * 32(%rsi), %ymm8, %ymm8;
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+
+ vpxor %ymm5, %ymm7, %ymm7;
+ vpxor %ymm4, %ymm6, %ymm6;
+ vpxor %ymm3, %ymm7, %ymm7;
+ vpxor %ymm2, %ymm6, %ymm6;
+ vpxor %ymm1, %ymm7, %ymm7;
+ vpxor %ymm0, %ymm6, %ymm6;
+ vpxor %ymm15, %ymm7, %ymm7;
+ vpxor %ymm14, %ymm6, %ymm6;
+ vpxor %ymm13, %ymm7, %ymm7;
+ vpxor %ymm12, %ymm6, %ymm6;
+ vpxor %ymm11, %ymm7, %ymm7;
+ vpxor %ymm10, %ymm6, %ymm6;
+ vpxor %ymm9, %ymm7, %ymm7;
+ vpxor %ymm8, %ymm6, %ymm6;
+ vpxor %ymm7, %ymm6, %ymm7;
+
+ vextracti128 $1, %ymm7, %xmm6;
+ vpxor %xmm6, %xmm7, %xmm7;
+ vpxor (%r10), %xmm7, %xmm7;
+ vmovdqu %xmm7, (%r10);
+
+ vmovdqu 7 * 32(%rax), %ymm7;
+ vmovdqu 6 * 32(%rax), %ymm6;
+
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ vzeroall;
+
+ movq (16 * 32 + 0 * 8)(%rsp), %r10;
+ movq (16 * 32 + 1 * 8)(%rsp), %r11;
+ movq (16 * 32 + 2 * 8)(%rsp), %r12;
+ movq (16 * 32 + 3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ leave;
+ CFI_LEAVE();
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx2_ocb_dec,.-_gcry_camellia_aesni_avx2_ocb_dec;)
+
+.align 8
+.globl _gcry_camellia_aesni_avx2_ocb_auth
+ELF(.type _gcry_camellia_aesni_avx2_ocb_auth,@function;)
+
+_gcry_camellia_aesni_avx2_ocb_auth:
+ /* input:
+ * %rdi: ctx, CTX
+	 * %rsi: abuf (32 blocks)
+ * %rdx: offset
+ * %rcx: checksum
+	 * %r8 : L pointers (void *L[32])
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ vzeroupper;
+
+ subq $(16 * 32 + 4 * 8), %rsp;
+ andq $~63, %rsp;
+ movq %rsp, %rax;
+
+ movq %r10, (16 * 32 + 0 * 8)(%rsp);
+ movq %r11, (16 * 32 + 1 * 8)(%rsp);
+ movq %r12, (16 * 32 + 2 * 8)(%rsp);
+ movq %r13, (16 * 32 + 3 * 8)(%rsp);
+ CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8);
+ CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8);
+ CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8);
+ CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8);
+
+ vmovdqu (%rdx), %xmm14;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rsi), yreg; \
+ vpxor (l0reg), %xmm14, %xmm15; \
+ vpxor (l1reg), %xmm15, %xmm14; \
+ vinserti128 $1, %xmm14, %ymm15, %ymm15; \
+ vpxor yreg, %ymm15, yreg;
+
+ movq (0 * 8)(%r8), %r10;
+ movq (1 * 8)(%r8), %r11;
+ movq (2 * 8)(%r8), %r12;
+ movq (3 * 8)(%r8), %r13;
+ OCB_INPUT(0, %r10, %r11, %ymm0);
+ vmovdqu %ymm0, (15 * 32)(%rax);
+ OCB_INPUT(1, %r12, %r13, %ymm0);
+ vmovdqu %ymm0, (14 * 32)(%rax);
+ movq (4 * 8)(%r8), %r10;
+ movq (5 * 8)(%r8), %r11;
+ movq (6 * 8)(%r8), %r12;
+ movq (7 * 8)(%r8), %r13;
+ OCB_INPUT(2, %r10, %r11, %ymm13);
+ OCB_INPUT(3, %r12, %r13, %ymm12);
+ movq (8 * 8)(%r8), %r10;
+ movq (9 * 8)(%r8), %r11;
+ movq (10 * 8)(%r8), %r12;
+ movq (11 * 8)(%r8), %r13;
+ OCB_INPUT(4, %r10, %r11, %ymm11);
+ OCB_INPUT(5, %r12, %r13, %ymm10);
+ movq (12 * 8)(%r8), %r10;
+ movq (13 * 8)(%r8), %r11;
+ movq (14 * 8)(%r8), %r12;
+ movq (15 * 8)(%r8), %r13;
+ OCB_INPUT(6, %r10, %r11, %ymm9);
+ OCB_INPUT(7, %r12, %r13, %ymm8);
+ movq (16 * 8)(%r8), %r10;
+ movq (17 * 8)(%r8), %r11;
+ movq (18 * 8)(%r8), %r12;
+ movq (19 * 8)(%r8), %r13;
+ OCB_INPUT(8, %r10, %r11, %ymm7);
+ OCB_INPUT(9, %r12, %r13, %ymm6);
+ movq (20 * 8)(%r8), %r10;
+ movq (21 * 8)(%r8), %r11;
+ movq (22 * 8)(%r8), %r12;
+ movq (23 * 8)(%r8), %r13;
+ OCB_INPUT(10, %r10, %r11, %ymm5);
+ OCB_INPUT(11, %r12, %r13, %ymm4);
+ movq (24 * 8)(%r8), %r10;
+ movq (25 * 8)(%r8), %r11;
+ movq (26 * 8)(%r8), %r12;
+ movq (27 * 8)(%r8), %r13;
+ OCB_INPUT(12, %r10, %r11, %ymm3);
+ OCB_INPUT(13, %r12, %r13, %ymm2);
+ movq (28 * 8)(%r8), %r10;
+ movq (29 * 8)(%r8), %r11;
+ movq (30 * 8)(%r8), %r12;
+ movq (31 * 8)(%r8), %r13;
+ OCB_INPUT(14, %r10, %r11, %ymm1);
+ OCB_INPUT(15, %r12, %r13, %ymm0);
+#undef OCB_INPUT
+
+ vmovdqu %xmm14, (%rdx);
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %r10d;
+ cmovel %r10d, %r8d; /* max */
+
+ movq %rcx, %r10;
+
+ /* inpack16_pre: */
+ vpbroadcastq (key_table)(CTX), %ymm15;
+ vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15;
+ vpxor %ymm0, %ymm15, %ymm0;
+ vpxor %ymm1, %ymm15, %ymm1;
+ vpxor %ymm2, %ymm15, %ymm2;
+ vpxor %ymm3, %ymm15, %ymm3;
+ vpxor %ymm4, %ymm15, %ymm4;
+ vpxor %ymm5, %ymm15, %ymm5;
+ vpxor %ymm6, %ymm15, %ymm6;
+ vpxor %ymm7, %ymm15, %ymm7;
+ vpxor %ymm8, %ymm15, %ymm8;
+ vpxor %ymm9, %ymm15, %ymm9;
+ vpxor %ymm10, %ymm15, %ymm10;
+ vpxor %ymm11, %ymm15, %ymm11;
+ vpxor %ymm12, %ymm15, %ymm12;
+ vpxor %ymm13, %ymm15, %ymm13;
+ vpxor 14 * 32(%rax), %ymm15, %ymm14;
+ vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+ call __camellia_enc_blk32;
+
+ vpxor %ymm7, %ymm6, %ymm6;
+ vpxor %ymm5, %ymm4, %ymm4;
+ vpxor %ymm3, %ymm2, %ymm2;
+ vpxor %ymm1, %ymm0, %ymm0;
+ vpxor %ymm15, %ymm14, %ymm14;
+ vpxor %ymm13, %ymm12, %ymm12;
+ vpxor %ymm11, %ymm10, %ymm10;
+ vpxor %ymm9, %ymm8, %ymm8;
+
+ vpxor %ymm6, %ymm4, %ymm4;
+ vpxor %ymm2, %ymm0, %ymm0;
+ vpxor %ymm14, %ymm12, %ymm12;
+ vpxor %ymm10, %ymm8, %ymm8;
+
+ vpxor %ymm4, %ymm0, %ymm0;
+ vpxor %ymm12, %ymm8, %ymm8;
+
+ vpxor %ymm0, %ymm8, %ymm0;
+
+ vextracti128 $1, %ymm0, %xmm1;
+ vpxor (%r10), %xmm0, %xmm0;
+ vpxor %xmm0, %xmm1, %xmm0;
+ vmovdqu %xmm0, (%r10);
+
+ vzeroall;
+
+ movq (16 * 32 + 0 * 8)(%rsp), %r10;
+ movq (16 * 32 + 1 * 8)(%rsp), %r11;
+ movq (16 * 32 + 2 * 8)(%rsp), %r12;
+ movq (16 * 32 + 3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ leave;
+ CFI_LEAVE();
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx2_ocb_auth,.-_gcry_camellia_aesni_avx2_ocb_auth;)
+
+#endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)*/
+#endif /*__x86_64*/
diff --git a/comm/third_party/libgcrypt/cipher/camellia-arm.S b/comm/third_party/libgcrypt/cipher/camellia-arm.S
new file mode 100644
index 0000000000..a3d87d1109
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/camellia-arm.S
@@ -0,0 +1,626 @@
+/* camellia-arm.S - ARM assembly implementation of Camellia cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+#ifdef __PIC__
+# define GET_DATA_POINTER(reg, name, rtmp) \
+ ldr reg, 1f; \
+ ldr rtmp, 2f; \
+ b 3f; \
+ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+ 2: .word name(GOT); \
+ 3: add reg, pc, reg; \
+ ldr reg, [reg, rtmp];
+#else
+# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+/* struct camellia_ctx: */
+#define key_table 0
+
+/* register macros */
+#define CTX %r0
+#define RTAB1 %ip
+#define RTAB3 %r1
+#define RMASK %lr
+
+#define IL %r2
+#define IR %r3
+
+#define XL %r4
+#define XR %r5
+#define YL %r6
+#define YR %r7
+
+#define RT0 %r8
+#define RT1 %r9
+#define RT2 %r10
+#define RT3 %r11
+
+/* helper macros */
+#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
+ ldrb rout, [rsrc, #((offs) + 3)]; \
+ ldrb rtmp, [rsrc, #((offs) + 2)]; \
+ orr rout, rout, rtmp, lsl #8; \
+ ldrb rtmp, [rsrc, #((offs) + 1)]; \
+ orr rout, rout, rtmp, lsl #16; \
+ ldrb rtmp, [rsrc, #((offs) + 0)]; \
+ orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \
+ mov rtmp0, rin, lsr #8; \
+ strb rin, [rdst, #((offs) + 3)]; \
+ mov rtmp1, rin, lsr #16; \
+ strb rtmp0, [rdst, #((offs) + 2)]; \
+ mov rtmp0, rin, lsr #24; \
+ strb rtmp1, [rdst, #((offs) + 1)]; \
+ strb rtmp0, [rdst, #((offs) + 0)];
+
+#ifdef __ARMEL__
+#ifdef HAVE_ARM_ARCH_V6
+ #define host_to_be(reg, rtmp) \
+ rev reg, reg;
+ #define be_to_host(reg, rtmp) \
+ rev reg, reg;
+#else
+ #define host_to_be(reg, rtmp) \
+ eor rtmp, reg, reg, ror #16; \
+ mov rtmp, rtmp, lsr #8; \
+ bic rtmp, rtmp, #65280; \
+ eor reg, rtmp, reg, ror #8;
+ #define be_to_host(reg, rtmp) \
+ eor rtmp, reg, reg, ror #16; \
+ mov rtmp, rtmp, lsr #8; \
+ bic rtmp, rtmp, #65280; \
+ eor reg, rtmp, reg, ror #8;
+#endif
+#else
+ /* nop on big-endian */
+ #define host_to_be(reg, rtmp) /*_*/
+ #define be_to_host(reg, rtmp) /*_*/
+#endif
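On pre-ARMv6 cores there is no rev instruction, so the fallback host_to_be/be_to_host above swaps bytes with an eor/lsr/bic/eor sequence. The same computation in C, for reference:

    #include <stdint.h>

    static inline uint32_t ror32 (uint32_t x, unsigned n)
    {
      return (x >> n) | (x << (32 - n));
    }

    /* Byte swap without `rev', mirroring the macro step for step. */
    static uint32_t bswap32_sketch (uint32_t x)
    {
      uint32_t t = x ^ ror32 (x, 16); /* eor rtmp, reg, reg, ror #16 */
      t >>= 8;                        /* mov rtmp, rtmp, lsr #8 */
      t &= ~0xff00u;                  /* bic rtmp, rtmp, #65280 */
      return t ^ ror32 (x, 8);        /* eor reg, rtmp, reg, ror #8 */
    }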
+
+#define ldr_input_aligned_be(rin, a, b, c, d, rtmp) \
+ ldr a, [rin, #0]; \
+ ldr b, [rin, #4]; \
+ be_to_host(a, rtmp); \
+ ldr c, [rin, #8]; \
+ be_to_host(b, rtmp); \
+ ldr d, [rin, #12]; \
+ be_to_host(c, rtmp); \
+ be_to_host(d, rtmp);
+
+#define str_output_aligned_be(rout, a, b, c, d, rtmp) \
+ be_to_host(a, rtmp); \
+ be_to_host(b, rtmp); \
+ str a, [rout, #0]; \
+ be_to_host(c, rtmp); \
+ str b, [rout, #4]; \
+ be_to_host(d, rtmp); \
+ str c, [rout, #8]; \
+ str d, [rout, #12];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+ /* unaligned word reads/writes allowed */
+ #define ldr_input_be(rin, ra, rb, rc, rd, rtmp) \
+ ldr_input_aligned_be(rin, ra, rb, rc, rd, rtmp)
+
+ #define str_output_be(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
+ str_output_aligned_be(rout, ra, rb, rc, rd, rtmp0)
+#else
+ /* need to handle unaligned reads/writes by byte reads */
+ #define ldr_input_be(rin, ra, rb, rc, rd, rtmp0) \
+ tst rin, #3; \
+ beq 1f; \
+ ldr_unaligned_be(ra, rin, 0, rtmp0); \
+ ldr_unaligned_be(rb, rin, 4, rtmp0); \
+ ldr_unaligned_be(rc, rin, 8, rtmp0); \
+ ldr_unaligned_be(rd, rin, 12, rtmp0); \
+ b 2f; \
+ 1:;\
+ ldr_input_aligned_be(rin, ra, rb, rc, rd, rtmp0); \
+ 2:;
+
+ #define str_output_be(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
+ tst rout, #3; \
+ beq 1f; \
+ str_unaligned_be(ra, rout, 0, rtmp0, rtmp1); \
+ str_unaligned_be(rb, rout, 4, rtmp0, rtmp1); \
+ str_unaligned_be(rc, rout, 8, rtmp0, rtmp1); \
+ str_unaligned_be(rd, rout, 12, rtmp0, rtmp1); \
+ b 2f; \
+ 1:;\
+ str_output_aligned_be(rout, ra, rb, rc, rd, rtmp0); \
+ 2:;
+#endif
+
+/**********************************************************************
+ 1-way camellia
+ **********************************************************************/
+#define roundsm(xl, xr, kl, kr, yl, yr) \
+ ldr RT2, [CTX, #(key_table + ((kl) * 4))]; \
+ and IR, RMASK, xr, lsl#(4); /*sp1110*/ \
+ ldr RT3, [CTX, #(key_table + ((kr) * 4))]; \
+ and IL, RMASK, xl, lsr#(24 - 4); /*sp1110*/ \
+ and RT0, RMASK, xr, lsr#(16 - 4); /*sp3033*/ \
+ ldr IR, [RTAB1, IR]; \
+ and RT1, RMASK, xl, lsr#(8 - 4); /*sp3033*/ \
+ eor yl, RT2; \
+ ldr IL, [RTAB1, IL]; \
+ eor yr, RT3; \
+ \
+ ldr RT0, [RTAB3, RT0]; \
+ add RTAB1, #4; \
+ ldr RT1, [RTAB3, RT1]; \
+ add RTAB3, #4; \
+ \
+ and RT2, RMASK, xr, lsr#(24 - 4); /*sp0222*/ \
+ and RT3, RMASK, xl, lsr#(16 - 4); /*sp0222*/ \
+ \
+ eor IR, RT0; \
+ eor IL, RT1; \
+ \
+ ldr RT2, [RTAB1, RT2]; \
+ and RT0, RMASK, xr, lsr#(8 - 4); /*sp4404*/ \
+ ldr RT3, [RTAB1, RT3]; \
+ and RT1, RMASK, xl, lsl#(4); /*sp4404*/ \
+ \
+ ldr RT0, [RTAB3, RT0]; \
+ sub RTAB1, #4; \
+ ldr RT1, [RTAB3, RT1]; \
+ sub RTAB3, #4; \
+ \
+ eor IR, RT2; \
+ eor IL, RT3; \
+ eor IR, RT0; \
+ eor IL, RT1; \
+ \
+ eor IR, IL; \
+ eor yr, yr, IL, ror#8; \
+ eor yl, IR; \
+ eor yr, IR;
+
+#define enc_rounds(n) \
+ roundsm(XL, XR, ((n) + 2) * 2 + 0, ((n) + 2) * 2 + 1, YL, YR); \
+ roundsm(YL, YR, ((n) + 3) * 2 + 0, ((n) + 3) * 2 + 1, XL, XR); \
+ roundsm(XL, XR, ((n) + 4) * 2 + 0, ((n) + 4) * 2 + 1, YL, YR); \
+ roundsm(YL, YR, ((n) + 5) * 2 + 0, ((n) + 5) * 2 + 1, XL, XR); \
+ roundsm(XL, XR, ((n) + 6) * 2 + 0, ((n) + 6) * 2 + 1, YL, YR); \
+ roundsm(YL, YR, ((n) + 7) * 2 + 0, ((n) + 7) * 2 + 1, XL, XR);
+
+#define dec_rounds(n) \
+ roundsm(XL, XR, ((n) + 7) * 2 + 0, ((n) + 7) * 2 + 1, YL, YR); \
+ roundsm(YL, YR, ((n) + 6) * 2 + 0, ((n) + 6) * 2 + 1, XL, XR); \
+ roundsm(XL, XR, ((n) + 5) * 2 + 0, ((n) + 5) * 2 + 1, YL, YR); \
+ roundsm(YL, YR, ((n) + 4) * 2 + 0, ((n) + 4) * 2 + 1, XL, XR); \
+ roundsm(XL, XR, ((n) + 3) * 2 + 0, ((n) + 3) * 2 + 1, YL, YR); \
+ roundsm(YL, YR, ((n) + 2) * 2 + 0, ((n) + 2) * 2 + 1, XL, XR);
+
+/* perform FL and FL⁻¹ */
+#define fls(ll, lr, rl, rr, kll, klr, krl, krr) \
+ ldr RT0, [CTX, #(key_table + ((kll) * 4))]; \
+ ldr RT2, [CTX, #(key_table + ((krr) * 4))]; \
+ and RT0, ll; \
+ ldr RT3, [CTX, #(key_table + ((krl) * 4))]; \
+ orr RT2, rr; \
+ ldr RT1, [CTX, #(key_table + ((klr) * 4))]; \
+ eor rl, RT2; \
+ eor lr, lr, RT0, ror#31; \
+ and RT3, rl; \
+ orr RT1, lr; \
+ eor ll, RT1; \
+ eor rr, rr, RT3, ror#31;
+
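The fls macro above applies FL to the X half and FL⁻¹ to the Y half in one pass; ror #31 is a rotate left by one. The two primitives in plain C, for reference (a sketch of the relations the interleaved assembly computes):

    #include <stdint.h>

    static inline uint32_t rol1 (uint32_t x)
    {
      return (x << 1) | (x >> 31);
    }

    /* FL: the right word is updated first, then used for the left word. */
    static void fl_sketch (uint32_t *xl, uint32_t *xr, uint32_t kl, uint32_t kr)
    {
      *xr ^= rol1 (*xl & kl);
      *xl ^= (*xr | kr);
    }

    /* FL^-1: the left word is updated first, then used for the right word. */
    static void fl_inv_sketch (uint32_t *yl, uint32_t *yr, uint32_t kl, uint32_t kr)
    {
      *yl ^= (*yr | kr);
      *yr ^= rol1 (*yl & kl);
    }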
+#define enc_fls(n) \
+ fls(XL, XR, YL, YR, \
+ (n) * 2 + 0, (n) * 2 + 1, \
+ (n) * 2 + 2, (n) * 2 + 3);
+
+#define dec_fls(n) \
+ fls(XL, XR, YL, YR, \
+ (n) * 2 + 2, (n) * 2 + 3, \
+ (n) * 2 + 0, (n) * 2 + 1);
+
+#define inpack(n) \
+ ldr_input_be(%r2, XL, XR, YL, YR, RT0); \
+ ldr RT0, [CTX, #(key_table + ((n) * 8) + 0)]; \
+ ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \
+ eor XL, RT0; \
+ eor XR, RT1;
+
+#define outunpack(n) \
+ ldr RT0, [CTX, #(key_table + ((n) * 8) + 0)]; \
+ ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \
+ eor YL, RT0; \
+ eor YR, RT1; \
+ str_output_be(%r1, YL, YR, XL, XR, RT0, RT1);
+
+.align 3
+.globl _gcry_camellia_arm_encrypt_block
+.type _gcry_camellia_arm_encrypt_block,%function;
+
+_gcry_camellia_arm_encrypt_block:
+ /* input:
+ * %r0: keytable
+ * %r1: dst
+ * %r2: src
+ * %r3: keybitlen
+ */
+ push {%r1, %r4-%r11, %ip, %lr};
+
+ GET_DATA_POINTER(RTAB1, .Lcamellia_sp1110, RTAB3);
+ mov RMASK, #0xff;
+ add RTAB3, RTAB1, #(2 * 4);
+ push {%r3};
+ mov RMASK, RMASK, lsl#4 /* byte mask */
+
+ inpack(0);
+
+ enc_rounds(0);
+ enc_fls(8);
+ enc_rounds(8);
+ enc_fls(16);
+ enc_rounds(16);
+
+ pop {RT0};
+ cmp RT0, #(16 * 8);
+ bne .Lenc_256;
+
+ pop {%r1};
+ outunpack(24);
+
+ pop {%r4-%r11, %ip, %pc};
+.ltorg
+
+.Lenc_256:
+ enc_fls(24);
+ enc_rounds(24);
+
+ pop {%r1};
+ outunpack(32);
+
+ pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_camellia_arm_encrypt_block,.-_gcry_camellia_arm_encrypt_block;
+
+.align 3
+.globl _gcry_camellia_arm_decrypt_block
+.type _gcry_camellia_arm_decrypt_block,%function;
+
+_gcry_camellia_arm_decrypt_block:
+ /* input:
+ * %r0: keytable
+ * %r1: dst
+ * %r2: src
+ * %r3: keybitlen
+ */
+ push {%r1, %r4-%r11, %ip, %lr};
+
+ GET_DATA_POINTER(RTAB1, .Lcamellia_sp1110, RTAB3);
+ mov RMASK, #0xff;
+ add RTAB3, RTAB1, #(2 * 4);
+ mov RMASK, RMASK, lsl#4 /* byte mask */
+
+ cmp %r3, #(16 * 8);
+ bne .Ldec_256;
+
+ inpack(24);
+
+.Ldec_128:
+ dec_rounds(16);
+ dec_fls(16);
+ dec_rounds(8);
+ dec_fls(8);
+ dec_rounds(0);
+
+ pop {%r1};
+ outunpack(0);
+
+ pop {%r4-%r11, %ip, %pc};
+.ltorg
+
+.Ldec_256:
+ inpack(32);
+ dec_rounds(24);
+ dec_fls(24);
+
+ b .Ldec_128;
+.ltorg
+.size _gcry_camellia_arm_decrypt_block,.-_gcry_camellia_arm_decrypt_block;
+
+.data
+
+/* Encryption/Decryption tables */
+.align 5
+.Lcamellia_sp1110:
+.long 0x70707000
+.Lcamellia_sp0222:
+ .long 0x00e0e0e0
+.Lcamellia_sp3033:
+ .long 0x38003838
+.Lcamellia_sp4404:
+ .long 0x70700070
+.long 0x82828200, 0x00050505, 0x41004141, 0x2c2c002c
+.long 0x2c2c2c00, 0x00585858, 0x16001616, 0xb3b300b3
+.long 0xececec00, 0x00d9d9d9, 0x76007676, 0xc0c000c0
+.long 0xb3b3b300, 0x00676767, 0xd900d9d9, 0xe4e400e4
+.long 0x27272700, 0x004e4e4e, 0x93009393, 0x57570057
+.long 0xc0c0c000, 0x00818181, 0x60006060, 0xeaea00ea
+.long 0xe5e5e500, 0x00cbcbcb, 0xf200f2f2, 0xaeae00ae
+.long 0xe4e4e400, 0x00c9c9c9, 0x72007272, 0x23230023
+.long 0x85858500, 0x000b0b0b, 0xc200c2c2, 0x6b6b006b
+.long 0x57575700, 0x00aeaeae, 0xab00abab, 0x45450045
+.long 0x35353500, 0x006a6a6a, 0x9a009a9a, 0xa5a500a5
+.long 0xeaeaea00, 0x00d5d5d5, 0x75007575, 0xeded00ed
+.long 0x0c0c0c00, 0x00181818, 0x06000606, 0x4f4f004f
+.long 0xaeaeae00, 0x005d5d5d, 0x57005757, 0x1d1d001d
+.long 0x41414100, 0x00828282, 0xa000a0a0, 0x92920092
+.long 0x23232300, 0x00464646, 0x91009191, 0x86860086
+.long 0xefefef00, 0x00dfdfdf, 0xf700f7f7, 0xafaf00af
+.long 0x6b6b6b00, 0x00d6d6d6, 0xb500b5b5, 0x7c7c007c
+.long 0x93939300, 0x00272727, 0xc900c9c9, 0x1f1f001f
+.long 0x45454500, 0x008a8a8a, 0xa200a2a2, 0x3e3e003e
+.long 0x19191900, 0x00323232, 0x8c008c8c, 0xdcdc00dc
+.long 0xa5a5a500, 0x004b4b4b, 0xd200d2d2, 0x5e5e005e
+.long 0x21212100, 0x00424242, 0x90009090, 0x0b0b000b
+.long 0xededed00, 0x00dbdbdb, 0xf600f6f6, 0xa6a600a6
+.long 0x0e0e0e00, 0x001c1c1c, 0x07000707, 0x39390039
+.long 0x4f4f4f00, 0x009e9e9e, 0xa700a7a7, 0xd5d500d5
+.long 0x4e4e4e00, 0x009c9c9c, 0x27002727, 0x5d5d005d
+.long 0x1d1d1d00, 0x003a3a3a, 0x8e008e8e, 0xd9d900d9
+.long 0x65656500, 0x00cacaca, 0xb200b2b2, 0x5a5a005a
+.long 0x92929200, 0x00252525, 0x49004949, 0x51510051
+.long 0xbdbdbd00, 0x007b7b7b, 0xde00dede, 0x6c6c006c
+.long 0x86868600, 0x000d0d0d, 0x43004343, 0x8b8b008b
+.long 0xb8b8b800, 0x00717171, 0x5c005c5c, 0x9a9a009a
+.long 0xafafaf00, 0x005f5f5f, 0xd700d7d7, 0xfbfb00fb
+.long 0x8f8f8f00, 0x001f1f1f, 0xc700c7c7, 0xb0b000b0
+.long 0x7c7c7c00, 0x00f8f8f8, 0x3e003e3e, 0x74740074
+.long 0xebebeb00, 0x00d7d7d7, 0xf500f5f5, 0x2b2b002b
+.long 0x1f1f1f00, 0x003e3e3e, 0x8f008f8f, 0xf0f000f0
+.long 0xcecece00, 0x009d9d9d, 0x67006767, 0x84840084
+.long 0x3e3e3e00, 0x007c7c7c, 0x1f001f1f, 0xdfdf00df
+.long 0x30303000, 0x00606060, 0x18001818, 0xcbcb00cb
+.long 0xdcdcdc00, 0x00b9b9b9, 0x6e006e6e, 0x34340034
+.long 0x5f5f5f00, 0x00bebebe, 0xaf00afaf, 0x76760076
+.long 0x5e5e5e00, 0x00bcbcbc, 0x2f002f2f, 0x6d6d006d
+.long 0xc5c5c500, 0x008b8b8b, 0xe200e2e2, 0xa9a900a9
+.long 0x0b0b0b00, 0x00161616, 0x85008585, 0xd1d100d1
+.long 0x1a1a1a00, 0x00343434, 0x0d000d0d, 0x04040004
+.long 0xa6a6a600, 0x004d4d4d, 0x53005353, 0x14140014
+.long 0xe1e1e100, 0x00c3c3c3, 0xf000f0f0, 0x3a3a003a
+.long 0x39393900, 0x00727272, 0x9c009c9c, 0xdede00de
+.long 0xcacaca00, 0x00959595, 0x65006565, 0x11110011
+.long 0xd5d5d500, 0x00ababab, 0xea00eaea, 0x32320032
+.long 0x47474700, 0x008e8e8e, 0xa300a3a3, 0x9c9c009c
+.long 0x5d5d5d00, 0x00bababa, 0xae00aeae, 0x53530053
+.long 0x3d3d3d00, 0x007a7a7a, 0x9e009e9e, 0xf2f200f2
+.long 0xd9d9d900, 0x00b3b3b3, 0xec00ecec, 0xfefe00fe
+.long 0x01010100, 0x00020202, 0x80008080, 0xcfcf00cf
+.long 0x5a5a5a00, 0x00b4b4b4, 0x2d002d2d, 0xc3c300c3
+.long 0xd6d6d600, 0x00adadad, 0x6b006b6b, 0x7a7a007a
+.long 0x51515100, 0x00a2a2a2, 0xa800a8a8, 0x24240024
+.long 0x56565600, 0x00acacac, 0x2b002b2b, 0xe8e800e8
+.long 0x6c6c6c00, 0x00d8d8d8, 0x36003636, 0x60600060
+.long 0x4d4d4d00, 0x009a9a9a, 0xa600a6a6, 0x69690069
+.long 0x8b8b8b00, 0x00171717, 0xc500c5c5, 0xaaaa00aa
+.long 0x0d0d0d00, 0x001a1a1a, 0x86008686, 0xa0a000a0
+.long 0x9a9a9a00, 0x00353535, 0x4d004d4d, 0xa1a100a1
+.long 0x66666600, 0x00cccccc, 0x33003333, 0x62620062
+.long 0xfbfbfb00, 0x00f7f7f7, 0xfd00fdfd, 0x54540054
+.long 0xcccccc00, 0x00999999, 0x66006666, 0x1e1e001e
+.long 0xb0b0b000, 0x00616161, 0x58005858, 0xe0e000e0
+.long 0x2d2d2d00, 0x005a5a5a, 0x96009696, 0x64640064
+.long 0x74747400, 0x00e8e8e8, 0x3a003a3a, 0x10100010
+.long 0x12121200, 0x00242424, 0x09000909, 0x00000000
+.long 0x2b2b2b00, 0x00565656, 0x95009595, 0xa3a300a3
+.long 0x20202000, 0x00404040, 0x10001010, 0x75750075
+.long 0xf0f0f000, 0x00e1e1e1, 0x78007878, 0x8a8a008a
+.long 0xb1b1b100, 0x00636363, 0xd800d8d8, 0xe6e600e6
+.long 0x84848400, 0x00090909, 0x42004242, 0x09090009
+.long 0x99999900, 0x00333333, 0xcc00cccc, 0xdddd00dd
+.long 0xdfdfdf00, 0x00bfbfbf, 0xef00efef, 0x87870087
+.long 0x4c4c4c00, 0x00989898, 0x26002626, 0x83830083
+.long 0xcbcbcb00, 0x00979797, 0xe500e5e5, 0xcdcd00cd
+.long 0xc2c2c200, 0x00858585, 0x61006161, 0x90900090
+.long 0x34343400, 0x00686868, 0x1a001a1a, 0x73730073
+.long 0x7e7e7e00, 0x00fcfcfc, 0x3f003f3f, 0xf6f600f6
+.long 0x76767600, 0x00ececec, 0x3b003b3b, 0x9d9d009d
+.long 0x05050500, 0x000a0a0a, 0x82008282, 0xbfbf00bf
+.long 0x6d6d6d00, 0x00dadada, 0xb600b6b6, 0x52520052
+.long 0xb7b7b700, 0x006f6f6f, 0xdb00dbdb, 0xd8d800d8
+.long 0xa9a9a900, 0x00535353, 0xd400d4d4, 0xc8c800c8
+.long 0x31313100, 0x00626262, 0x98009898, 0xc6c600c6
+.long 0xd1d1d100, 0x00a3a3a3, 0xe800e8e8, 0x81810081
+.long 0x17171700, 0x002e2e2e, 0x8b008b8b, 0x6f6f006f
+.long 0x04040400, 0x00080808, 0x02000202, 0x13130013
+.long 0xd7d7d700, 0x00afafaf, 0xeb00ebeb, 0x63630063
+.long 0x14141400, 0x00282828, 0x0a000a0a, 0xe9e900e9
+.long 0x58585800, 0x00b0b0b0, 0x2c002c2c, 0xa7a700a7
+.long 0x3a3a3a00, 0x00747474, 0x1d001d1d, 0x9f9f009f
+.long 0x61616100, 0x00c2c2c2, 0xb000b0b0, 0xbcbc00bc
+.long 0xdedede00, 0x00bdbdbd, 0x6f006f6f, 0x29290029
+.long 0x1b1b1b00, 0x00363636, 0x8d008d8d, 0xf9f900f9
+.long 0x11111100, 0x00222222, 0x88008888, 0x2f2f002f
+.long 0x1c1c1c00, 0x00383838, 0x0e000e0e, 0xb4b400b4
+.long 0x32323200, 0x00646464, 0x19001919, 0x78780078
+.long 0x0f0f0f00, 0x001e1e1e, 0x87008787, 0x06060006
+.long 0x9c9c9c00, 0x00393939, 0x4e004e4e, 0xe7e700e7
+.long 0x16161600, 0x002c2c2c, 0x0b000b0b, 0x71710071
+.long 0x53535300, 0x00a6a6a6, 0xa900a9a9, 0xd4d400d4
+.long 0x18181800, 0x00303030, 0x0c000c0c, 0xabab00ab
+.long 0xf2f2f200, 0x00e5e5e5, 0x79007979, 0x88880088
+.long 0x22222200, 0x00444444, 0x11001111, 0x8d8d008d
+.long 0xfefefe00, 0x00fdfdfd, 0x7f007f7f, 0x72720072
+.long 0x44444400, 0x00888888, 0x22002222, 0xb9b900b9
+.long 0xcfcfcf00, 0x009f9f9f, 0xe700e7e7, 0xf8f800f8
+.long 0xb2b2b200, 0x00656565, 0x59005959, 0xacac00ac
+.long 0xc3c3c300, 0x00878787, 0xe100e1e1, 0x36360036
+.long 0xb5b5b500, 0x006b6b6b, 0xda00dada, 0x2a2a002a
+.long 0x7a7a7a00, 0x00f4f4f4, 0x3d003d3d, 0x3c3c003c
+.long 0x91919100, 0x00232323, 0xc800c8c8, 0xf1f100f1
+.long 0x24242400, 0x00484848, 0x12001212, 0x40400040
+.long 0x08080800, 0x00101010, 0x04000404, 0xd3d300d3
+.long 0xe8e8e800, 0x00d1d1d1, 0x74007474, 0xbbbb00bb
+.long 0xa8a8a800, 0x00515151, 0x54005454, 0x43430043
+.long 0x60606000, 0x00c0c0c0, 0x30003030, 0x15150015
+.long 0xfcfcfc00, 0x00f9f9f9, 0x7e007e7e, 0xadad00ad
+.long 0x69696900, 0x00d2d2d2, 0xb400b4b4, 0x77770077
+.long 0x50505000, 0x00a0a0a0, 0x28002828, 0x80800080
+.long 0xaaaaaa00, 0x00555555, 0x55005555, 0x82820082
+.long 0xd0d0d000, 0x00a1a1a1, 0x68006868, 0xecec00ec
+.long 0xa0a0a000, 0x00414141, 0x50005050, 0x27270027
+.long 0x7d7d7d00, 0x00fafafa, 0xbe00bebe, 0xe5e500e5
+.long 0xa1a1a100, 0x00434343, 0xd000d0d0, 0x85850085
+.long 0x89898900, 0x00131313, 0xc400c4c4, 0x35350035
+.long 0x62626200, 0x00c4c4c4, 0x31003131, 0x0c0c000c
+.long 0x97979700, 0x002f2f2f, 0xcb00cbcb, 0x41410041
+.long 0x54545400, 0x00a8a8a8, 0x2a002a2a, 0xefef00ef
+.long 0x5b5b5b00, 0x00b6b6b6, 0xad00adad, 0x93930093
+.long 0x1e1e1e00, 0x003c3c3c, 0x0f000f0f, 0x19190019
+.long 0x95959500, 0x002b2b2b, 0xca00caca, 0x21210021
+.long 0xe0e0e000, 0x00c1c1c1, 0x70007070, 0x0e0e000e
+.long 0xffffff00, 0x00ffffff, 0xff00ffff, 0x4e4e004e
+.long 0x64646400, 0x00c8c8c8, 0x32003232, 0x65650065
+.long 0xd2d2d200, 0x00a5a5a5, 0x69006969, 0xbdbd00bd
+.long 0x10101000, 0x00202020, 0x08000808, 0xb8b800b8
+.long 0xc4c4c400, 0x00898989, 0x62006262, 0x8f8f008f
+.long 0x00000000, 0x00000000, 0x00000000, 0xebeb00eb
+.long 0x48484800, 0x00909090, 0x24002424, 0xcece00ce
+.long 0xa3a3a300, 0x00474747, 0xd100d1d1, 0x30300030
+.long 0xf7f7f700, 0x00efefef, 0xfb00fbfb, 0x5f5f005f
+.long 0x75757500, 0x00eaeaea, 0xba00baba, 0xc5c500c5
+.long 0xdbdbdb00, 0x00b7b7b7, 0xed00eded, 0x1a1a001a
+.long 0x8a8a8a00, 0x00151515, 0x45004545, 0xe1e100e1
+.long 0x03030300, 0x00060606, 0x81008181, 0xcaca00ca
+.long 0xe6e6e600, 0x00cdcdcd, 0x73007373, 0x47470047
+.long 0xdadada00, 0x00b5b5b5, 0x6d006d6d, 0x3d3d003d
+.long 0x09090900, 0x00121212, 0x84008484, 0x01010001
+.long 0x3f3f3f00, 0x007e7e7e, 0x9f009f9f, 0xd6d600d6
+.long 0xdddddd00, 0x00bbbbbb, 0xee00eeee, 0x56560056
+.long 0x94949400, 0x00292929, 0x4a004a4a, 0x4d4d004d
+.long 0x87878700, 0x000f0f0f, 0xc300c3c3, 0x0d0d000d
+.long 0x5c5c5c00, 0x00b8b8b8, 0x2e002e2e, 0x66660066
+.long 0x83838300, 0x00070707, 0xc100c1c1, 0xcccc00cc
+.long 0x02020200, 0x00040404, 0x01000101, 0x2d2d002d
+.long 0xcdcdcd00, 0x009b9b9b, 0xe600e6e6, 0x12120012
+.long 0x4a4a4a00, 0x00949494, 0x25002525, 0x20200020
+.long 0x90909000, 0x00212121, 0x48004848, 0xb1b100b1
+.long 0x33333300, 0x00666666, 0x99009999, 0x99990099
+.long 0x73737300, 0x00e6e6e6, 0xb900b9b9, 0x4c4c004c
+.long 0x67676700, 0x00cecece, 0xb300b3b3, 0xc2c200c2
+.long 0xf6f6f600, 0x00ededed, 0x7b007b7b, 0x7e7e007e
+.long 0xf3f3f300, 0x00e7e7e7, 0xf900f9f9, 0x05050005
+.long 0x9d9d9d00, 0x003b3b3b, 0xce00cece, 0xb7b700b7
+.long 0x7f7f7f00, 0x00fefefe, 0xbf00bfbf, 0x31310031
+.long 0xbfbfbf00, 0x007f7f7f, 0xdf00dfdf, 0x17170017
+.long 0xe2e2e200, 0x00c5c5c5, 0x71007171, 0xd7d700d7
+.long 0x52525200, 0x00a4a4a4, 0x29002929, 0x58580058
+.long 0x9b9b9b00, 0x00373737, 0xcd00cdcd, 0x61610061
+.long 0xd8d8d800, 0x00b1b1b1, 0x6c006c6c, 0x1b1b001b
+.long 0x26262600, 0x004c4c4c, 0x13001313, 0x1c1c001c
+.long 0xc8c8c800, 0x00919191, 0x64006464, 0x0f0f000f
+.long 0x37373700, 0x006e6e6e, 0x9b009b9b, 0x16160016
+.long 0xc6c6c600, 0x008d8d8d, 0x63006363, 0x18180018
+.long 0x3b3b3b00, 0x00767676, 0x9d009d9d, 0x22220022
+.long 0x81818100, 0x00030303, 0xc000c0c0, 0x44440044
+.long 0x96969600, 0x002d2d2d, 0x4b004b4b, 0xb2b200b2
+.long 0x6f6f6f00, 0x00dedede, 0xb700b7b7, 0xb5b500b5
+.long 0x4b4b4b00, 0x00969696, 0xa500a5a5, 0x91910091
+.long 0x13131300, 0x00262626, 0x89008989, 0x08080008
+.long 0xbebebe00, 0x007d7d7d, 0x5f005f5f, 0xa8a800a8
+.long 0x63636300, 0x00c6c6c6, 0xb100b1b1, 0xfcfc00fc
+.long 0x2e2e2e00, 0x005c5c5c, 0x17001717, 0x50500050
+.long 0xe9e9e900, 0x00d3d3d3, 0xf400f4f4, 0xd0d000d0
+.long 0x79797900, 0x00f2f2f2, 0xbc00bcbc, 0x7d7d007d
+.long 0xa7a7a700, 0x004f4f4f, 0xd300d3d3, 0x89890089
+.long 0x8c8c8c00, 0x00191919, 0x46004646, 0x97970097
+.long 0x9f9f9f00, 0x003f3f3f, 0xcf00cfcf, 0x5b5b005b
+.long 0x6e6e6e00, 0x00dcdcdc, 0x37003737, 0x95950095
+.long 0xbcbcbc00, 0x00797979, 0x5e005e5e, 0xffff00ff
+.long 0x8e8e8e00, 0x001d1d1d, 0x47004747, 0xd2d200d2
+.long 0x29292900, 0x00525252, 0x94009494, 0xc4c400c4
+.long 0xf5f5f500, 0x00ebebeb, 0xfa00fafa, 0x48480048
+.long 0xf9f9f900, 0x00f3f3f3, 0xfc00fcfc, 0xf7f700f7
+.long 0xb6b6b600, 0x006d6d6d, 0x5b005b5b, 0xdbdb00db
+.long 0x2f2f2f00, 0x005e5e5e, 0x97009797, 0x03030003
+.long 0xfdfdfd00, 0x00fbfbfb, 0xfe00fefe, 0xdada00da
+.long 0xb4b4b400, 0x00696969, 0x5a005a5a, 0x3f3f003f
+.long 0x59595900, 0x00b2b2b2, 0xac00acac, 0x94940094
+.long 0x78787800, 0x00f0f0f0, 0x3c003c3c, 0x5c5c005c
+.long 0x98989800, 0x00313131, 0x4c004c4c, 0x02020002
+.long 0x06060600, 0x000c0c0c, 0x03000303, 0x4a4a004a
+.long 0x6a6a6a00, 0x00d4d4d4, 0x35003535, 0x33330033
+.long 0xe7e7e700, 0x00cfcfcf, 0xf300f3f3, 0x67670067
+.long 0x46464600, 0x008c8c8c, 0x23002323, 0xf3f300f3
+.long 0x71717100, 0x00e2e2e2, 0xb800b8b8, 0x7f7f007f
+.long 0xbababa00, 0x00757575, 0x5d005d5d, 0xe2e200e2
+.long 0xd4d4d400, 0x00a9a9a9, 0x6a006a6a, 0x9b9b009b
+.long 0x25252500, 0x004a4a4a, 0x92009292, 0x26260026
+.long 0xababab00, 0x00575757, 0xd500d5d5, 0x37370037
+.long 0x42424200, 0x00848484, 0x21002121, 0x3b3b003b
+.long 0x88888800, 0x00111111, 0x44004444, 0x96960096
+.long 0xa2a2a200, 0x00454545, 0x51005151, 0x4b4b004b
+.long 0x8d8d8d00, 0x001b1b1b, 0xc600c6c6, 0xbebe00be
+.long 0xfafafa00, 0x00f5f5f5, 0x7d007d7d, 0x2e2e002e
+.long 0x72727200, 0x00e4e4e4, 0x39003939, 0x79790079
+.long 0x07070700, 0x000e0e0e, 0x83008383, 0x8c8c008c
+.long 0xb9b9b900, 0x00737373, 0xdc00dcdc, 0x6e6e006e
+.long 0x55555500, 0x00aaaaaa, 0xaa00aaaa, 0x8e8e008e
+.long 0xf8f8f800, 0x00f1f1f1, 0x7c007c7c, 0xf5f500f5
+.long 0xeeeeee00, 0x00dddddd, 0x77007777, 0xb6b600b6
+.long 0xacacac00, 0x00595959, 0x56005656, 0xfdfd00fd
+.long 0x0a0a0a00, 0x00141414, 0x05000505, 0x59590059
+.long 0x36363600, 0x006c6c6c, 0x1b001b1b, 0x98980098
+.long 0x49494900, 0x00929292, 0xa400a4a4, 0x6a6a006a
+.long 0x2a2a2a00, 0x00545454, 0x15001515, 0x46460046
+.long 0x68686800, 0x00d0d0d0, 0x34003434, 0xbaba00ba
+.long 0x3c3c3c00, 0x00787878, 0x1e001e1e, 0x25250025
+.long 0x38383800, 0x00707070, 0x1c001c1c, 0x42420042
+.long 0xf1f1f100, 0x00e3e3e3, 0xf800f8f8, 0xa2a200a2
+.long 0xa4a4a400, 0x00494949, 0x52005252, 0xfafa00fa
+.long 0x40404000, 0x00808080, 0x20002020, 0x07070007
+.long 0x28282800, 0x00505050, 0x14001414, 0x55550055
+.long 0xd3d3d300, 0x00a7a7a7, 0xe900e9e9, 0xeeee00ee
+.long 0x7b7b7b00, 0x00f6f6f6, 0xbd00bdbd, 0x0a0a000a
+.long 0xbbbbbb00, 0x00777777, 0xdd00dddd, 0x49490049
+.long 0xc9c9c900, 0x00939393, 0xe400e4e4, 0x68680068
+.long 0x43434300, 0x00868686, 0xa100a1a1, 0x38380038
+.long 0xc1c1c100, 0x00838383, 0xe000e0e0, 0xa4a400a4
+.long 0x15151500, 0x002a2a2a, 0x8a008a8a, 0x28280028
+.long 0xe3e3e300, 0x00c7c7c7, 0xf100f1f1, 0x7b7b007b
+.long 0xadadad00, 0x005b5b5b, 0xd600d6d6, 0xc9c900c9
+.long 0xf4f4f400, 0x00e9e9e9, 0x7a007a7a, 0xc1c100c1
+.long 0x77777700, 0x00eeeeee, 0xbb00bbbb, 0xe3e300e3
+.long 0xc7c7c700, 0x008f8f8f, 0xe300e3e3, 0xf4f400f4
+.long 0x80808000, 0x00010101, 0x40004040, 0xc7c700c7
+.long 0x9e9e9e00, 0x003d3d3d, 0x4f004f4f, 0x9e9e009e
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*__ARMEL__*/
diff --git a/comm/third_party/libgcrypt/cipher/camellia-glue.c b/comm/third_party/libgcrypt/cipher/camellia-glue.c
new file mode 100644
index 0000000000..6577b6516a
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/camellia-glue.c
@@ -0,0 +1,1097 @@
+/* camellia-glue.c - Glue for the Camellia cipher
+ * Copyright (C) 2007 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+/* I put all the libgcrypt-specific stuff in this file to keep the
+ camellia.c/camellia.h files exactly as provided by NTT. If they
+ update their code, this should make it easier to bring the changes
+ in. - dshaw
+
+ There is one small change which needs to be done: Include the
+ following code at the top of camellia.h: */
+#if 0
+
+/* To use Camellia with libraries it is often useful to keep the name
+ * space of the library clean. The following macro is thus useful:
+ *
+ * #define CAMELLIA_EXT_SYM_PREFIX foo_
+ *
+ * This prefixes all external symbols with "foo_".
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#ifdef CAMELLIA_EXT_SYM_PREFIX
+#define CAMELLIA_PREFIX1(x,y) x ## y
+#define CAMELLIA_PREFIX2(x,y) CAMELLIA_PREFIX1(x,y)
+#define CAMELLIA_PREFIX(x) CAMELLIA_PREFIX2(CAMELLIA_EXT_SYM_PREFIX,x)
+#define Camellia_Ekeygen CAMELLIA_PREFIX(Camellia_Ekeygen)
+#define Camellia_EncryptBlock CAMELLIA_PREFIX(Camellia_EncryptBlock)
+#define Camellia_DecryptBlock CAMELLIA_PREFIX(Camellia_DecryptBlock)
+#define camellia_decrypt128 CAMELLIA_PREFIX(camellia_decrypt128)
+#define camellia_decrypt256 CAMELLIA_PREFIX(camellia_decrypt256)
+#define camellia_encrypt128 CAMELLIA_PREFIX(camellia_encrypt128)
+#define camellia_encrypt256 CAMELLIA_PREFIX(camellia_encrypt256)
+#define camellia_setup128 CAMELLIA_PREFIX(camellia_setup128)
+#define camellia_setup192 CAMELLIA_PREFIX(camellia_setup192)
+#define camellia_setup256 CAMELLIA_PREFIX(camellia_setup256)
+#endif /*CAMELLIA_EXT_SYM_PREFIX*/
+
+#endif /* Code sample. */
+
+
+#include <config.h>
+#include "types.h"
+#include "g10lib.h"
+#include "cipher.h"
+#include "camellia.h"
+#include "bufhelp.h"
+#include "cipher-internal.h"
+#include "cipher-selftest.h"
+
+/* Helper macro to force alignment to 16 bytes. */
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define ATTR_ALIGNED_16 __attribute__ ((aligned (16)))
+#else
+# define ATTR_ALIGNED_16
+#endif
+
+/* USE_AESNI_AVX indicates whether to compile with Intel AES-NI/AVX code. */
+#undef USE_AESNI_AVX
+#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AESNI_AVX 1
+# endif
+#endif
+
+/* USE_AESNI_AVX2 indicates whether to compile with Intel AES-NI/AVX2 code. */
+#undef USE_AESNI_AVX2
+#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AESNI_AVX2 1
+# endif
+#endif
+
+typedef struct
+{
+ KEY_TABLE_TYPE keytable;
+ int keybitlength;
+#ifdef USE_AESNI_AVX
+ unsigned int use_aesni_avx:1; /* AES-NI/AVX implementation shall be used. */
+#endif /*USE_AESNI_AVX*/
+#ifdef USE_AESNI_AVX2
+ unsigned int use_aesni_avx2:1;/* AES-NI/AVX2 implementation shall be used. */
+#endif /*USE_AESNI_AVX2*/
+} CAMELLIA_context;
+
+/* Assembly implementations use SystemV ABI, ABI conversion and additional
+ * stack to store XMM6-XMM15 needed on Win64. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# define ASM_EXTRA_STACK (10 * 16)
+# else
+# define ASM_FUNC_ABI
+# define ASM_EXTRA_STACK 0
+# endif
+#endif
+
+#ifdef USE_AESNI_AVX
+/* Assembler implementations of Camellia using AES-NI and AVX.  These process
+   data in 16 blocks at a time.
+ */
+extern void _gcry_camellia_aesni_avx_ctr_enc(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx_cbc_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx_cfb_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx_ocb_enc(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx_ocb_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx_ocb_auth(CAMELLIA_context *ctx,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx,
+ const unsigned char *key,
+ unsigned int keylen) ASM_FUNC_ABI;
+#endif
+
+#ifdef USE_AESNI_AVX2
+/* Assembler implementations of Camellia using AES-NI and AVX2.  These process
+   data in 32 blocks at a time.
+ */
+extern void _gcry_camellia_aesni_avx2_ctr_enc(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_cbc_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_cfb_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_ocb_enc(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_ocb_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_ocb_auth(CAMELLIA_context *ctx,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[32]) ASM_FUNC_ABI;
+#endif
+
+static const char *selftest(void);
+
+static void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+static void _gcry_camellia_cbc_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+static void _gcry_camellia_cfb_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+static size_t _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
+static size_t _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+ size_t nblocks);
+
+static gcry_err_code_t
+camellia_setkey(void *c, const byte *key, unsigned keylen,
+ cipher_bulk_ops_t *bulk_ops)
+{
+ CAMELLIA_context *ctx=c;
+ static int initialized=0;
+ static const char *selftest_failed=NULL;
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+ unsigned int hwf = _gcry_get_hw_features ();
+#endif
+
+ if(keylen!=16 && keylen!=24 && keylen!=32)
+ return GPG_ERR_INV_KEYLEN;
+
+ if(!initialized)
+ {
+ initialized=1;
+ selftest_failed=selftest();
+ if(selftest_failed)
+ log_error("%s\n",selftest_failed);
+ }
+
+ if(selftest_failed)
+ return GPG_ERR_SELFTEST_FAILED;
+
+#ifdef USE_AESNI_AVX
+ ctx->use_aesni_avx = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX);
+#endif
+#ifdef USE_AESNI_AVX2
+ ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2);
+#endif
+
+ ctx->keybitlength=keylen*8;
+
+ /* Setup bulk encryption routines. */
+ memset (bulk_ops, 0, sizeof(*bulk_ops));
+ bulk_ops->cbc_dec = _gcry_camellia_cbc_dec;
+ bulk_ops->cfb_dec = _gcry_camellia_cfb_dec;
+ bulk_ops->ctr_enc = _gcry_camellia_ctr_enc;
+ bulk_ops->ocb_crypt = _gcry_camellia_ocb_crypt;
+ bulk_ops->ocb_auth = _gcry_camellia_ocb_auth;
+
+ if (0)
+ { }
+#ifdef USE_AESNI_AVX
+ else if (ctx->use_aesni_avx)
+ _gcry_camellia_aesni_avx_keygen(ctx, key, keylen);
+ else
+#endif
+ {
+ Camellia_Ekeygen(ctx->keybitlength,key,ctx->keytable);
+ _gcry_burn_stack
+ ((19+34+34)*sizeof(u32)+2*sizeof(void*) /* camellia_setup256 */
+ +(4+32)*sizeof(u32)+2*sizeof(void*) /* camellia_setup192 */
+ +0+sizeof(int)+2*sizeof(void*) /* Camellia_Ekeygen */
+ +3*2*sizeof(void*) /* Function calls. */
+ );
+ }
+
+ return 0;
+}
+
+#ifdef USE_ARM_ASM
+
+/* Assembly implementations of Camellia. */
+extern void _gcry_camellia_arm_encrypt_block(const KEY_TABLE_TYPE keyTable,
+ byte *outbuf, const byte *inbuf,
+ const int keybits);
+
+extern void _gcry_camellia_arm_decrypt_block(const KEY_TABLE_TYPE keyTable,
+ byte *outbuf, const byte *inbuf,
+ const int keybits);
+
+static void Camellia_EncryptBlock(const int keyBitLength,
+ const unsigned char *plaintext,
+ const KEY_TABLE_TYPE keyTable,
+ unsigned char *cipherText)
+{
+ _gcry_camellia_arm_encrypt_block(keyTable, cipherText, plaintext,
+ keyBitLength);
+}
+
+static void Camellia_DecryptBlock(const int keyBitLength,
+ const unsigned char *cipherText,
+ const KEY_TABLE_TYPE keyTable,
+ unsigned char *plaintext)
+{
+ _gcry_camellia_arm_decrypt_block(keyTable, plaintext, cipherText,
+ keyBitLength);
+}
+
+#ifdef __aarch64__
+# define CAMELLIA_encrypt_stack_burn_size (0)
+# define CAMELLIA_decrypt_stack_burn_size (0)
+#else
+# define CAMELLIA_encrypt_stack_burn_size (15*4)
+# define CAMELLIA_decrypt_stack_burn_size (15*4)
+#endif
+
+static unsigned int
+camellia_encrypt(void *c, byte *outbuf, const byte *inbuf)
+{
+ CAMELLIA_context *ctx = c;
+ Camellia_EncryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf);
+ return /*burn_stack*/ (CAMELLIA_encrypt_stack_burn_size);
+}
+
+static unsigned int
+camellia_decrypt(void *c, byte *outbuf, const byte *inbuf)
+{
+ CAMELLIA_context *ctx=c;
+ Camellia_DecryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf);
+ return /*burn_stack*/ (CAMELLIA_decrypt_stack_burn_size);
+}
+
+#else /*USE_ARM_ASM*/
+
+static unsigned int
+camellia_encrypt(void *c, byte *outbuf, const byte *inbuf)
+{
+ CAMELLIA_context *ctx=c;
+
+ Camellia_EncryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf);
+
+#define CAMELLIA_encrypt_stack_burn_size \
+ (sizeof(int)+2*sizeof(unsigned char *)+sizeof(void*/*KEY_TABLE_TYPE*/) \
+ +4*sizeof(u32)+4*sizeof(u32) \
+ +2*sizeof(u32*)+4*sizeof(u32) \
+ +2*2*sizeof(void*) /* Function calls. */ \
+ )
+
+ return /*burn_stack*/ (CAMELLIA_encrypt_stack_burn_size);
+}
+
+static unsigned int
+camellia_decrypt(void *c, byte *outbuf, const byte *inbuf)
+{
+ CAMELLIA_context *ctx=c;
+
+ Camellia_DecryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf);
+
+#define CAMELLIA_decrypt_stack_burn_size \
+ (sizeof(int)+2*sizeof(unsigned char *)+sizeof(void*/*KEY_TABLE_TYPE*/) \
+ +4*sizeof(u32)+4*sizeof(u32) \
+ +2*sizeof(u32*)+4*sizeof(u32) \
+ +2*2*sizeof(void*) /* Function calls. */ \
+ )
+
+ return /*burn_stack*/ (CAMELLIA_decrypt_stack_burn_size);
+}
+
+#endif /*!USE_ARM_ASM*/
+
+/* Bulk encryption of complete blocks in CTR mode. This function is only
+ intended for the bulk encryption feature of cipher.c. CTR is expected to be
+ of size CAMELLIA_BLOCK_SIZE. */
+static void
+_gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ CAMELLIA_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned char tmpbuf[CAMELLIA_BLOCK_SIZE];
+ int burn_stack_depth = CAMELLIA_encrypt_stack_burn_size;
+
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2)
+ {
+ int did_use_aesni_avx2 = 0;
+
+ /* Process data in 32 block chunks. */
+ while (nblocks >= 32)
+ {
+ _gcry_camellia_aesni_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+ nblocks -= 32;
+ outbuf += 32 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 32 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx2 = 1;
+ }
+
+ if (did_use_aesni_avx2)
+ {
+ int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+ if (burn_stack_depth < avx2_burn_stack_depth)
+ burn_stack_depth = avx2_burn_stack_depth;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ /* TODO: use caching instead? */
+ }
+#endif
+
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx)
+ {
+ int did_use_aesni_avx = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_camellia_aesni_avx_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+ nblocks -= 16;
+ outbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx = 1;
+ }
+
+ if (did_use_aesni_avx)
+ {
+ int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+ if (burn_stack_depth < avx_burn_stack_depth)
+ burn_stack_depth = avx_burn_stack_depth;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ /* TODO: use caching instead? */
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ /* Encrypt the counter. */
+ Camellia_EncryptBlock(ctx->keybitlength, ctr, ctx->keytable, tmpbuf);
+ /* XOR the input with the encrypted counter and store in output. */
+ cipher_block_xor(outbuf, tmpbuf, inbuf, CAMELLIA_BLOCK_SIZE);
+ outbuf += CAMELLIA_BLOCK_SIZE;
+ inbuf += CAMELLIA_BLOCK_SIZE;
+ /* Increment the counter. */
+ cipher_block_add(ctr, 1, CAMELLIA_BLOCK_SIZE);
+ }
+
+ wipememory(tmpbuf, sizeof(tmpbuf));
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk decryption of complete blocks in CBC mode. This function is only
+ intended for the bulk encryption feature of cipher.c. */
+static void
+_gcry_camellia_cbc_dec(void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ CAMELLIA_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned char savebuf[CAMELLIA_BLOCK_SIZE];
+ int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size;
+
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2)
+ {
+ int did_use_aesni_avx2 = 0;
+
+ /* Process data in 32 block chunks. */
+ while (nblocks >= 32)
+ {
+ _gcry_camellia_aesni_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 32;
+ outbuf += 32 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 32 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx2 = 1;
+ }
+
+ if (did_use_aesni_avx2)
+ {
+ int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
+	      2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+ if (burn_stack_depth < avx2_burn_stack_depth)
+ burn_stack_depth = avx2_burn_stack_depth;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx)
+ {
+ int did_use_aesni_avx = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_camellia_aesni_avx_cbc_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx = 1;
+ }
+
+ if (did_use_aesni_avx)
+ {
+ int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+ if (burn_stack_depth < avx_burn_stack_depth)
+ burn_stack_depth = avx_burn_stack_depth;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ /* INBUF is needed later and it may be identical to OUTBUF, so store
+ the intermediate result to SAVEBUF. */
+ Camellia_DecryptBlock(ctx->keybitlength, inbuf, ctx->keytable, savebuf);
+
+ cipher_block_xor_n_copy_2(outbuf, savebuf, iv, inbuf,
+ CAMELLIA_BLOCK_SIZE);
+ inbuf += CAMELLIA_BLOCK_SIZE;
+ outbuf += CAMELLIA_BLOCK_SIZE;
+ }
+
+ wipememory(savebuf, sizeof(savebuf));
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk decryption of complete blocks in CFB mode. This function is only
+ intended for the bulk encryption feature of cipher.c. */
+static void
+_gcry_camellia_cfb_dec(void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ CAMELLIA_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size;
+
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2)
+ {
+ int did_use_aesni_avx2 = 0;
+
+ /* Process data in 32 block chunks. */
+ while (nblocks >= 32)
+ {
+ _gcry_camellia_aesni_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 32;
+ outbuf += 32 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 32 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx2 = 1;
+ }
+
+ if (did_use_aesni_avx2)
+ {
+ int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+ if (burn_stack_depth < avx2_burn_stack_depth)
+ burn_stack_depth = avx2_burn_stack_depth;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx)
+ {
+ int did_use_aesni_avx = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_camellia_aesni_avx_cfb_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx = 1;
+ }
+
+ if (did_use_aesni_avx)
+ {
+ int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+ if (burn_stack_depth < avx_burn_stack_depth)
+ burn_stack_depth = avx_burn_stack_depth;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ Camellia_EncryptBlock(ctx->keybitlength, iv, ctx->keytable, iv);
+ cipher_block_xor_n_copy(outbuf, iv, inbuf, CAMELLIA_BLOCK_SIZE);
+ outbuf += CAMELLIA_BLOCK_SIZE;
+ inbuf += CAMELLIA_BLOCK_SIZE;
+ }
+
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk encryption/decryption of complete blocks in OCB mode. */
+static size_t
+_gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+ CAMELLIA_context *ctx = (void *)&c->context.c;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth;
+ u64 blkn = c->u_mode.ocb.data_nblocks;
+
+ burn_stack_depth = encrypt ? CAMELLIA_encrypt_stack_burn_size :
+ CAMELLIA_decrypt_stack_burn_size;
+#else
+ (void)c;
+ (void)outbuf_arg;
+ (void)inbuf_arg;
+ (void)encrypt;
+#endif
+
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2)
+ {
+ int did_use_aesni_avx2 = 0;
+ u64 Ls[32];
+ unsigned int n = 32 - (blkn % 32);
+ u64 *l;
+ int i;
+
+ if (nblocks >= 32)
+ {
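+	  /* Explanatory note: the Ls table mirrors the OCB offset schedule,
+	     where block number i uses L[ntz(i)]; for 32 consecutive blocks
+	     this gives the pattern L[0],L[1],L[0],L[2],L[0],L[1],L[0],L[3],...
+	     The slot for the last block of each chunk is filled in from
+	     ocb_get_l() inside the processing loop below. */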
+ for (i = 0; i < 32; i += 8)
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ Ls[(i + 0 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ }
+
+ Ls[(7 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ Ls[(15 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
+ Ls[(23 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ l = &Ls[(31 + n) % 32];
+
+ /* Process data in 32 block chunks. */
+ while (nblocks >= 32)
+ {
+ blkn += 32;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32);
+
+ if (encrypt)
+ _gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+ else
+ _gcry_camellia_aesni_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+
+ nblocks -= 32;
+ outbuf += 32 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 32 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx2 = 1;
+ }
+ }
+
+ if (did_use_aesni_avx2)
+ {
+ int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+ if (burn_stack_depth < avx2_burn_stack_depth)
+ burn_stack_depth = avx2_burn_stack_depth;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx)
+ {
+ int did_use_aesni_avx = 0;
+ u64 Ls[16];
+ unsigned int n = 16 - (blkn % 16);
+ u64 *l;
+ int i;
+
+ if (nblocks >= 16)
+ {
+ for (i = 0; i < 16; i += 8)
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ }
+
+ Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ l = &Ls[(15 + n) % 16];
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ blkn += 16;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+ if (encrypt)
+ _gcry_camellia_aesni_avx_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+ else
+ _gcry_camellia_aesni_avx_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+
+ nblocks -= 16;
+ outbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx = 1;
+ }
+ }
+
+ if (did_use_aesni_avx)
+ {
+ int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+ if (burn_stack_depth < avx_burn_stack_depth)
+ burn_stack_depth = avx_burn_stack_depth;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+ c->u_mode.ocb.data_nblocks = blkn;
+
+ if (burn_stack_depth)
+ _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+#endif
+
+ return nblocks;
+}
+
+/* Bulk authentication of complete blocks in OCB mode. */
+static size_t
+_gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+ size_t nblocks)
+{
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+ CAMELLIA_context *ctx = (void *)&c->context.c;
+ const unsigned char *abuf = abuf_arg;
+ int burn_stack_depth;
+ u64 blkn = c->u_mode.ocb.aad_nblocks;
+
+ burn_stack_depth = CAMELLIA_encrypt_stack_burn_size;
+#else
+ (void)c;
+ (void)abuf_arg;
+#endif
+
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2)
+ {
+ int did_use_aesni_avx2 = 0;
+ u64 Ls[32];
+ unsigned int n = 32 - (blkn % 32);
+ u64 *l;
+ int i;
+
+ if (nblocks >= 32)
+ {
+ for (i = 0; i < 32; i += 8)
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ Ls[(i + 0 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ }
+
+ Ls[(7 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ Ls[(15 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
+ Ls[(23 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ l = &Ls[(31 + n) % 32];
+
+ /* Process data in 32 block chunks. */
+ while (nblocks >= 32)
+ {
+ blkn += 32;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32);
+
+ _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf,
+ c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
+
+ nblocks -= 32;
+ abuf += 32 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx2 = 1;
+ }
+ }
+
+ if (did_use_aesni_avx2)
+ {
+ int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+ if (burn_stack_depth < avx2_burn_stack_depth)
+ burn_stack_depth = avx2_burn_stack_depth;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx)
+ {
+ int did_use_aesni_avx = 0;
+ u64 Ls[16];
+ unsigned int n = 16 - (blkn % 16);
+ u64 *l;
+ int i;
+
+ if (nblocks >= 16)
+ {
+ for (i = 0; i < 16; i += 8)
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ }
+
+ Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ l = &Ls[(15 + n) % 16];
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ blkn += 16;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+ _gcry_camellia_aesni_avx_ocb_auth(ctx, abuf,
+ c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
+
+ nblocks -= 16;
+ abuf += 16 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx = 1;
+ }
+ }
+
+ if (did_use_aesni_avx)
+ {
+ int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+ if (burn_stack_depth < avx_burn_stack_depth)
+ burn_stack_depth = avx_burn_stack_depth;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+ c->u_mode.ocb.aad_nblocks = blkn;
+
+ if (burn_stack_depth)
+ _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+#endif
+
+ return nblocks;
+}
+
+/* Run the self-tests for CAMELLIA-CTR-128; this tests the IV increment of
+   bulk CTR encryption.  Returns NULL on success. */
+static const char*
+selftest_ctr_128 (void)
+{
+ const int nblocks = 32+16+1;
+ const int blocksize = CAMELLIA_BLOCK_SIZE;
+ const int context_size = sizeof(CAMELLIA_context);
+
+ return _gcry_selftest_helper_ctr("CAMELLIA", &camellia_setkey,
+ &camellia_encrypt, nblocks, blocksize, context_size);
+}
+
+/* Run the self-tests for CAMELLIA-CBC-128; this tests bulk CBC decryption.
+   Returns NULL on success. */
+static const char*
+selftest_cbc_128 (void)
+{
+ const int nblocks = 32+16+2;
+ const int blocksize = CAMELLIA_BLOCK_SIZE;
+ const int context_size = sizeof(CAMELLIA_context);
+
+ return _gcry_selftest_helper_cbc("CAMELLIA", &camellia_setkey,
+ &camellia_encrypt, nblocks, blocksize, context_size);
+}
+
+/* Run the self-tests for CAMELLIA-CFB-128; this tests bulk CFB decryption.
+   Returns NULL on success. */
+static const char*
+selftest_cfb_128 (void)
+{
+ const int nblocks = 32+16+2;
+ const int blocksize = CAMELLIA_BLOCK_SIZE;
+ const int context_size = sizeof(CAMELLIA_context);
+
+ return _gcry_selftest_helper_cfb("CAMELLIA", &camellia_setkey,
+ &camellia_encrypt, nblocks, blocksize, context_size);
+}
+
+static const char *
+selftest(void)
+{
+ CAMELLIA_context ctx;
+ byte scratch[16];
+ cipher_bulk_ops_t bulk_ops;
+ const char *r;
+
+ /* These test vectors are from RFC-3713 */
+ static const byte plaintext[]=
+ {
+ 0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef,
+ 0xfe,0xdc,0xba,0x98,0x76,0x54,0x32,0x10
+ };
+ static const byte key_128[]=
+ {
+ 0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef,
+ 0xfe,0xdc,0xba,0x98,0x76,0x54,0x32,0x10
+ };
+ static const byte ciphertext_128[]=
+ {
+ 0x67,0x67,0x31,0x38,0x54,0x96,0x69,0x73,
+ 0x08,0x57,0x06,0x56,0x48,0xea,0xbe,0x43
+ };
+ static const byte key_192[]=
+ {
+ 0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef,0xfe,0xdc,0xba,0x98,
+ 0x76,0x54,0x32,0x10,0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77
+ };
+ static const byte ciphertext_192[]=
+ {
+ 0xb4,0x99,0x34,0x01,0xb3,0xe9,0x96,0xf8,
+ 0x4e,0xe5,0xce,0xe7,0xd7,0x9b,0x09,0xb9
+ };
+ static const byte key_256[]=
+ {
+ 0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef,0xfe,0xdc,0xba,
+ 0x98,0x76,0x54,0x32,0x10,0x00,0x11,0x22,0x33,0x44,0x55,
+ 0x66,0x77,0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff
+ };
+ static const byte ciphertext_256[]=
+ {
+ 0x9a,0xcc,0x23,0x7d,0xff,0x16,0xd7,0x6c,
+ 0x20,0xef,0x7c,0x91,0x9e,0x3a,0x75,0x09
+ };
+
+ camellia_setkey(&ctx,key_128,sizeof(key_128),&bulk_ops);
+ camellia_encrypt(&ctx,scratch,plaintext);
+ if(memcmp(scratch,ciphertext_128,sizeof(ciphertext_128))!=0)
+ return "CAMELLIA-128 test encryption failed.";
+ camellia_decrypt(&ctx,scratch,scratch);
+ if(memcmp(scratch,plaintext,sizeof(plaintext))!=0)
+ return "CAMELLIA-128 test decryption failed.";
+
+ camellia_setkey(&ctx,key_192,sizeof(key_192),&bulk_ops);
+ camellia_encrypt(&ctx,scratch,plaintext);
+ if(memcmp(scratch,ciphertext_192,sizeof(ciphertext_192))!=0)
+ return "CAMELLIA-192 test encryption failed.";
+ camellia_decrypt(&ctx,scratch,scratch);
+ if(memcmp(scratch,plaintext,sizeof(plaintext))!=0)
+ return "CAMELLIA-192 test decryption failed.";
+
+ camellia_setkey(&ctx,key_256,sizeof(key_256),&bulk_ops);
+ camellia_encrypt(&ctx,scratch,plaintext);
+ if(memcmp(scratch,ciphertext_256,sizeof(ciphertext_256))!=0)
+ return "CAMELLIA-256 test encryption failed.";
+ camellia_decrypt(&ctx,scratch,scratch);
+ if(memcmp(scratch,plaintext,sizeof(plaintext))!=0)
+ return "CAMELLIA-256 test decryption failed.";
+
+ if ( (r = selftest_ctr_128 ()) )
+ return r;
+
+ if ( (r = selftest_cbc_128 ()) )
+ return r;
+
+ if ( (r = selftest_cfb_128 ()) )
+ return r;
+
+ return NULL;
+}
+
+/* These oids are from
+ <http://info.isl.ntt.co.jp/crypt/eng/camellia/specifications_oid.html>,
+ retrieved May 1, 2007. */
+
+static gcry_cipher_oid_spec_t camellia128_oids[] =
+ {
+ {"1.2.392.200011.61.1.1.1.2", GCRY_CIPHER_MODE_CBC},
+ {"0.3.4401.5.3.1.9.1", GCRY_CIPHER_MODE_ECB},
+ {"0.3.4401.5.3.1.9.3", GCRY_CIPHER_MODE_OFB},
+ {"0.3.4401.5.3.1.9.4", GCRY_CIPHER_MODE_CFB},
+ { NULL }
+ };
+
+static gcry_cipher_oid_spec_t camellia192_oids[] =
+ {
+ {"1.2.392.200011.61.1.1.1.3", GCRY_CIPHER_MODE_CBC},
+ {"0.3.4401.5.3.1.9.21", GCRY_CIPHER_MODE_ECB},
+ {"0.3.4401.5.3.1.9.23", GCRY_CIPHER_MODE_OFB},
+ {"0.3.4401.5.3.1.9.24", GCRY_CIPHER_MODE_CFB},
+ { NULL }
+ };
+
+static gcry_cipher_oid_spec_t camellia256_oids[] =
+ {
+ {"1.2.392.200011.61.1.1.1.4", GCRY_CIPHER_MODE_CBC},
+ {"0.3.4401.5.3.1.9.41", GCRY_CIPHER_MODE_ECB},
+ {"0.3.4401.5.3.1.9.43", GCRY_CIPHER_MODE_OFB},
+ {"0.3.4401.5.3.1.9.44", GCRY_CIPHER_MODE_CFB},
+ { NULL }
+ };
+
+gcry_cipher_spec_t _gcry_cipher_spec_camellia128 =
+ {
+ GCRY_CIPHER_CAMELLIA128, {0, 0},
+ "CAMELLIA128",NULL,camellia128_oids,CAMELLIA_BLOCK_SIZE,128,
+ sizeof(CAMELLIA_context),camellia_setkey,camellia_encrypt,camellia_decrypt
+ };
+
+gcry_cipher_spec_t _gcry_cipher_spec_camellia192 =
+ {
+ GCRY_CIPHER_CAMELLIA192, {0, 0},
+ "CAMELLIA192",NULL,camellia192_oids,CAMELLIA_BLOCK_SIZE,192,
+ sizeof(CAMELLIA_context),camellia_setkey,camellia_encrypt,camellia_decrypt
+ };
+
+gcry_cipher_spec_t _gcry_cipher_spec_camellia256 =
+ {
+ GCRY_CIPHER_CAMELLIA256, {0, 0},
+ "CAMELLIA256",NULL,camellia256_oids,CAMELLIA_BLOCK_SIZE,256,
+ sizeof(CAMELLIA_context),camellia_setkey,camellia_encrypt,camellia_decrypt
+ };
diff --git a/comm/third_party/libgcrypt/cipher/camellia.c b/comm/third_party/libgcrypt/cipher/camellia.c
new file mode 100644
index 0000000000..e7085a7ec8
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/camellia.c
@@ -0,0 +1,1413 @@
+/* camellia.c ver 1.2.0
+ *
+ * Copyright (C) 2006,2007
+ * NTT (Nippon Telegraph and Telephone Corporation).
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Algorithm Specification
+ * http://info.isl.ntt.co.jp/crypt/eng/camellia/specifications.html
+ */
+
+#include <config.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "types.h"
+#include "bufhelp.h"
+#include "camellia.h"
+
+typedef byte u8;
+
+/* key constants */
+
+#define CAMELLIA_SIGMA1L (0xA09E667FL)
+#define CAMELLIA_SIGMA1R (0x3BCC908BL)
+#define CAMELLIA_SIGMA2L (0xB67AE858L)
+#define CAMELLIA_SIGMA2R (0x4CAA73B2L)
+#define CAMELLIA_SIGMA3L (0xC6EF372FL)
+#define CAMELLIA_SIGMA3R (0xE94F82BEL)
+#define CAMELLIA_SIGMA4L (0x54FF53A5L)
+#define CAMELLIA_SIGMA4R (0xF1D36F1CL)
+#define CAMELLIA_SIGMA5L (0x10E527FAL)
+#define CAMELLIA_SIGMA5R (0xDE682D1DL)
+#define CAMELLIA_SIGMA6L (0xB05688C2L)
+#define CAMELLIA_SIGMA6R (0xB3E6C1FDL)
+
+/*
+ * macros
+ */
+
+
+#if defined(_MSC_VER)
+
+# define SWAP(x) (_lrotl(x, 8) & 0x00ff00ff | _lrotr(x, 8) & 0xff00ff00)
+# define GETU32(p) SWAP(*((u32 *)(p)))
+# define PUTU32(ct, st) {*((u32 *)(ct)) = SWAP((st));}
+
+#else /* not MS-VC */
+
+# define GETU32(pt) buf_get_be32(pt)
+# define PUTU32(ct, st) buf_put_be32(ct, st)
+
+#endif
+
+#define CamelliaSubkeyL(INDEX) (subkey[(INDEX)*2])
+#define CamelliaSubkeyR(INDEX) (subkey[(INDEX)*2 + 1])
+
+/* rotate right by 1 byte */
+#define CAMELLIA_RR8(x) (((x) >> 8) + ((x) << 24))
+/* rotate left by 1 bit */
+#define CAMELLIA_RL1(x) (((x) << 1) + ((x) >> 31))
+/* rotate left by 1 byte */
+#define CAMELLIA_RL8(x) (((x) << 8) + ((x) >> 24))
+
+#define CAMELLIA_ROLDQ(ll, lr, rl, rr, w0, w1, bits) \
+ do { \
+ w0 = ll; \
+ ll = (ll << bits) + (lr >> (32 - bits)); \
+ lr = (lr << bits) + (rl >> (32 - bits)); \
+ rl = (rl << bits) + (rr >> (32 - bits)); \
+ rr = (rr << bits) + (w0 >> (32 - bits)); \
+ } while(0)
+
+#define CAMELLIA_ROLDQo32(ll, lr, rl, rr, w0, w1, bits) \
+ do { \
+ w0 = ll; \
+ w1 = lr; \
+ ll = (lr << (bits - 32)) + (rl >> (64 - bits)); \
+ lr = (rl << (bits - 32)) + (rr >> (64 - bits)); \
+ rl = (rr << (bits - 32)) + (w0 >> (64 - bits)); \
+ rr = (w0 << (bits - 32)) + (w1 >> (64 - bits)); \
+ } while(0)
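+/* Explanatory note: CAMELLIA_ROLDQ rotates the 128-bit quantity
+   (ll || lr || rl || rr) left by 'bits' (bits < 32); CAMELLIA_ROLDQo32 does
+   the same for rotation amounts of 32 bits or more.  For example, a rotation
+   by 15 moves the top 15 bits of the old 'll' into the low bits of 'rr'. */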
+
+#define CAMELLIA_SP1110(INDEX) (camellia_sp1110[(INDEX)])
+#define CAMELLIA_SP0222(INDEX) (camellia_sp0222[(INDEX)])
+#define CAMELLIA_SP3033(INDEX) (camellia_sp3033[(INDEX)])
+#define CAMELLIA_SP4404(INDEX) (camellia_sp4404[(INDEX)])
+
+#define CAMELLIA_F(xl, xr, kl, kr, yl, yr, il, ir, t0, t1) \
+ do { \
+ il = xl ^ kl; \
+ ir = xr ^ kr; \
+ t0 = il >> 16; \
+ t1 = ir >> 16; \
+ yl = CAMELLIA_SP1110(ir & 0xff) \
+ ^ CAMELLIA_SP0222((t1 >> 8) & 0xff) \
+ ^ CAMELLIA_SP3033(t1 & 0xff) \
+ ^ CAMELLIA_SP4404((ir >> 8) & 0xff); \
+ yr = CAMELLIA_SP1110((t0 >> 8) & 0xff) \
+ ^ CAMELLIA_SP0222(t0 & 0xff) \
+ ^ CAMELLIA_SP3033((il >> 8) & 0xff) \
+ ^ CAMELLIA_SP4404(il & 0xff); \
+ yl ^= yr; \
+ yr = CAMELLIA_RR8(yr); \
+ yr ^= yl; \
+ } while(0)
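+/* Explanatory note: CAMELLIA_F is the Camellia round function F(X, k): the
+   64-bit input (xl || xr) is XORed with the subkey (kl || kr), each byte is
+   substituted through the SP tables (which combine the s-boxes with the P
+   permutation), and the mixed result is left in (yl || yr). */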
+
+
+/*
+ * macros for speed-up
+ */
+#define CAMELLIA_FLS(ll, lr, rl, rr, kll, klr, krl, krr, t0, t1, t2, t3) \
+ do { \
+ t0 = kll; \
+ t0 &= ll; \
+ lr ^= CAMELLIA_RL1(t0); \
+ t1 = klr; \
+ t1 |= lr; \
+ ll ^= t1; \
+ \
+ t2 = krr; \
+ t2 |= rr; \
+ rl ^= t2; \
+ t3 = krl; \
+ t3 &= rl; \
+ rr ^= CAMELLIA_RL1(t3); \
+ } while(0)
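+/* Explanatory note: CAMELLIA_FLS applies the FL function to the left half
+   (ll || lr) and FL^-1 to the right half (rl || rr); these layers are
+   inserted every six rounds in the encryption/decryption code below. */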
+
+#define CAMELLIA_ROUNDSM(xl, xr, kl, kr, yl, yr, il, ir, t0, t1) \
+ do { \
+ yl ^= kl; \
+ yr ^= kr; \
+ ir = CAMELLIA_SP1110(xr & 0xff) \
+ ^ CAMELLIA_SP0222((xr >> 24) & 0xff) \
+ ^ CAMELLIA_SP3033((xr >> 16) & 0xff) \
+ ^ CAMELLIA_SP4404((xr >> 8) & 0xff); \
+ il = CAMELLIA_SP1110((xl >> 24) & 0xff) \
+ ^ CAMELLIA_SP0222((xl >> 16) & 0xff) \
+ ^ CAMELLIA_SP3033((xl >> 8) & 0xff) \
+ ^ CAMELLIA_SP4404(xl & 0xff); \
+ ir ^= il; \
+ il = CAMELLIA_RR8(il); \
+ il ^= ir; \
+ yl ^= ir; \
+ yr ^= il; \
+ } while(0)
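+/* Explanatory note: CAMELLIA_ROUNDSM is the round function with the subkey
+   XOR moved to the output side; this matches the "key XOR is end of
+   F-function" transformation applied while generating the subkeys. */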
+
+
+static const u32 camellia_sp1110[256] = {
+ 0x70707000,0x82828200,0x2c2c2c00,0xececec00,
+ 0xb3b3b300,0x27272700,0xc0c0c000,0xe5e5e500,
+ 0xe4e4e400,0x85858500,0x57575700,0x35353500,
+ 0xeaeaea00,0x0c0c0c00,0xaeaeae00,0x41414100,
+ 0x23232300,0xefefef00,0x6b6b6b00,0x93939300,
+ 0x45454500,0x19191900,0xa5a5a500,0x21212100,
+ 0xededed00,0x0e0e0e00,0x4f4f4f00,0x4e4e4e00,
+ 0x1d1d1d00,0x65656500,0x92929200,0xbdbdbd00,
+ 0x86868600,0xb8b8b800,0xafafaf00,0x8f8f8f00,
+ 0x7c7c7c00,0xebebeb00,0x1f1f1f00,0xcecece00,
+ 0x3e3e3e00,0x30303000,0xdcdcdc00,0x5f5f5f00,
+ 0x5e5e5e00,0xc5c5c500,0x0b0b0b00,0x1a1a1a00,
+ 0xa6a6a600,0xe1e1e100,0x39393900,0xcacaca00,
+ 0xd5d5d500,0x47474700,0x5d5d5d00,0x3d3d3d00,
+ 0xd9d9d900,0x01010100,0x5a5a5a00,0xd6d6d600,
+ 0x51515100,0x56565600,0x6c6c6c00,0x4d4d4d00,
+ 0x8b8b8b00,0x0d0d0d00,0x9a9a9a00,0x66666600,
+ 0xfbfbfb00,0xcccccc00,0xb0b0b000,0x2d2d2d00,
+ 0x74747400,0x12121200,0x2b2b2b00,0x20202000,
+ 0xf0f0f000,0xb1b1b100,0x84848400,0x99999900,
+ 0xdfdfdf00,0x4c4c4c00,0xcbcbcb00,0xc2c2c200,
+ 0x34343400,0x7e7e7e00,0x76767600,0x05050500,
+ 0x6d6d6d00,0xb7b7b700,0xa9a9a900,0x31313100,
+ 0xd1d1d100,0x17171700,0x04040400,0xd7d7d700,
+ 0x14141400,0x58585800,0x3a3a3a00,0x61616100,
+ 0xdedede00,0x1b1b1b00,0x11111100,0x1c1c1c00,
+ 0x32323200,0x0f0f0f00,0x9c9c9c00,0x16161600,
+ 0x53535300,0x18181800,0xf2f2f200,0x22222200,
+ 0xfefefe00,0x44444400,0xcfcfcf00,0xb2b2b200,
+ 0xc3c3c300,0xb5b5b500,0x7a7a7a00,0x91919100,
+ 0x24242400,0x08080800,0xe8e8e800,0xa8a8a800,
+ 0x60606000,0xfcfcfc00,0x69696900,0x50505000,
+ 0xaaaaaa00,0xd0d0d000,0xa0a0a000,0x7d7d7d00,
+ 0xa1a1a100,0x89898900,0x62626200,0x97979700,
+ 0x54545400,0x5b5b5b00,0x1e1e1e00,0x95959500,
+ 0xe0e0e000,0xffffff00,0x64646400,0xd2d2d200,
+ 0x10101000,0xc4c4c400,0x00000000,0x48484800,
+ 0xa3a3a300,0xf7f7f700,0x75757500,0xdbdbdb00,
+ 0x8a8a8a00,0x03030300,0xe6e6e600,0xdadada00,
+ 0x09090900,0x3f3f3f00,0xdddddd00,0x94949400,
+ 0x87878700,0x5c5c5c00,0x83838300,0x02020200,
+ 0xcdcdcd00,0x4a4a4a00,0x90909000,0x33333300,
+ 0x73737300,0x67676700,0xf6f6f600,0xf3f3f300,
+ 0x9d9d9d00,0x7f7f7f00,0xbfbfbf00,0xe2e2e200,
+ 0x52525200,0x9b9b9b00,0xd8d8d800,0x26262600,
+ 0xc8c8c800,0x37373700,0xc6c6c600,0x3b3b3b00,
+ 0x81818100,0x96969600,0x6f6f6f00,0x4b4b4b00,
+ 0x13131300,0xbebebe00,0x63636300,0x2e2e2e00,
+ 0xe9e9e900,0x79797900,0xa7a7a700,0x8c8c8c00,
+ 0x9f9f9f00,0x6e6e6e00,0xbcbcbc00,0x8e8e8e00,
+ 0x29292900,0xf5f5f500,0xf9f9f900,0xb6b6b600,
+ 0x2f2f2f00,0xfdfdfd00,0xb4b4b400,0x59595900,
+ 0x78787800,0x98989800,0x06060600,0x6a6a6a00,
+ 0xe7e7e700,0x46464600,0x71717100,0xbababa00,
+ 0xd4d4d400,0x25252500,0xababab00,0x42424200,
+ 0x88888800,0xa2a2a200,0x8d8d8d00,0xfafafa00,
+ 0x72727200,0x07070700,0xb9b9b900,0x55555500,
+ 0xf8f8f800,0xeeeeee00,0xacacac00,0x0a0a0a00,
+ 0x36363600,0x49494900,0x2a2a2a00,0x68686800,
+ 0x3c3c3c00,0x38383800,0xf1f1f100,0xa4a4a400,
+ 0x40404000,0x28282800,0xd3d3d300,0x7b7b7b00,
+ 0xbbbbbb00,0xc9c9c900,0x43434300,0xc1c1c100,
+ 0x15151500,0xe3e3e300,0xadadad00,0xf4f4f400,
+ 0x77777700,0xc7c7c700,0x80808000,0x9e9e9e00,
+};
+
+static const u32 camellia_sp0222[256] = {
+ 0x00e0e0e0,0x00050505,0x00585858,0x00d9d9d9,
+ 0x00676767,0x004e4e4e,0x00818181,0x00cbcbcb,
+ 0x00c9c9c9,0x000b0b0b,0x00aeaeae,0x006a6a6a,
+ 0x00d5d5d5,0x00181818,0x005d5d5d,0x00828282,
+ 0x00464646,0x00dfdfdf,0x00d6d6d6,0x00272727,
+ 0x008a8a8a,0x00323232,0x004b4b4b,0x00424242,
+ 0x00dbdbdb,0x001c1c1c,0x009e9e9e,0x009c9c9c,
+ 0x003a3a3a,0x00cacaca,0x00252525,0x007b7b7b,
+ 0x000d0d0d,0x00717171,0x005f5f5f,0x001f1f1f,
+ 0x00f8f8f8,0x00d7d7d7,0x003e3e3e,0x009d9d9d,
+ 0x007c7c7c,0x00606060,0x00b9b9b9,0x00bebebe,
+ 0x00bcbcbc,0x008b8b8b,0x00161616,0x00343434,
+ 0x004d4d4d,0x00c3c3c3,0x00727272,0x00959595,
+ 0x00ababab,0x008e8e8e,0x00bababa,0x007a7a7a,
+ 0x00b3b3b3,0x00020202,0x00b4b4b4,0x00adadad,
+ 0x00a2a2a2,0x00acacac,0x00d8d8d8,0x009a9a9a,
+ 0x00171717,0x001a1a1a,0x00353535,0x00cccccc,
+ 0x00f7f7f7,0x00999999,0x00616161,0x005a5a5a,
+ 0x00e8e8e8,0x00242424,0x00565656,0x00404040,
+ 0x00e1e1e1,0x00636363,0x00090909,0x00333333,
+ 0x00bfbfbf,0x00989898,0x00979797,0x00858585,
+ 0x00686868,0x00fcfcfc,0x00ececec,0x000a0a0a,
+ 0x00dadada,0x006f6f6f,0x00535353,0x00626262,
+ 0x00a3a3a3,0x002e2e2e,0x00080808,0x00afafaf,
+ 0x00282828,0x00b0b0b0,0x00747474,0x00c2c2c2,
+ 0x00bdbdbd,0x00363636,0x00222222,0x00383838,
+ 0x00646464,0x001e1e1e,0x00393939,0x002c2c2c,
+ 0x00a6a6a6,0x00303030,0x00e5e5e5,0x00444444,
+ 0x00fdfdfd,0x00888888,0x009f9f9f,0x00656565,
+ 0x00878787,0x006b6b6b,0x00f4f4f4,0x00232323,
+ 0x00484848,0x00101010,0x00d1d1d1,0x00515151,
+ 0x00c0c0c0,0x00f9f9f9,0x00d2d2d2,0x00a0a0a0,
+ 0x00555555,0x00a1a1a1,0x00414141,0x00fafafa,
+ 0x00434343,0x00131313,0x00c4c4c4,0x002f2f2f,
+ 0x00a8a8a8,0x00b6b6b6,0x003c3c3c,0x002b2b2b,
+ 0x00c1c1c1,0x00ffffff,0x00c8c8c8,0x00a5a5a5,
+ 0x00202020,0x00898989,0x00000000,0x00909090,
+ 0x00474747,0x00efefef,0x00eaeaea,0x00b7b7b7,
+ 0x00151515,0x00060606,0x00cdcdcd,0x00b5b5b5,
+ 0x00121212,0x007e7e7e,0x00bbbbbb,0x00292929,
+ 0x000f0f0f,0x00b8b8b8,0x00070707,0x00040404,
+ 0x009b9b9b,0x00949494,0x00212121,0x00666666,
+ 0x00e6e6e6,0x00cecece,0x00ededed,0x00e7e7e7,
+ 0x003b3b3b,0x00fefefe,0x007f7f7f,0x00c5c5c5,
+ 0x00a4a4a4,0x00373737,0x00b1b1b1,0x004c4c4c,
+ 0x00919191,0x006e6e6e,0x008d8d8d,0x00767676,
+ 0x00030303,0x002d2d2d,0x00dedede,0x00969696,
+ 0x00262626,0x007d7d7d,0x00c6c6c6,0x005c5c5c,
+ 0x00d3d3d3,0x00f2f2f2,0x004f4f4f,0x00191919,
+ 0x003f3f3f,0x00dcdcdc,0x00797979,0x001d1d1d,
+ 0x00525252,0x00ebebeb,0x00f3f3f3,0x006d6d6d,
+ 0x005e5e5e,0x00fbfbfb,0x00696969,0x00b2b2b2,
+ 0x00f0f0f0,0x00313131,0x000c0c0c,0x00d4d4d4,
+ 0x00cfcfcf,0x008c8c8c,0x00e2e2e2,0x00757575,
+ 0x00a9a9a9,0x004a4a4a,0x00575757,0x00848484,
+ 0x00111111,0x00454545,0x001b1b1b,0x00f5f5f5,
+ 0x00e4e4e4,0x000e0e0e,0x00737373,0x00aaaaaa,
+ 0x00f1f1f1,0x00dddddd,0x00595959,0x00141414,
+ 0x006c6c6c,0x00929292,0x00545454,0x00d0d0d0,
+ 0x00787878,0x00707070,0x00e3e3e3,0x00494949,
+ 0x00808080,0x00505050,0x00a7a7a7,0x00f6f6f6,
+ 0x00777777,0x00939393,0x00868686,0x00838383,
+ 0x002a2a2a,0x00c7c7c7,0x005b5b5b,0x00e9e9e9,
+ 0x00eeeeee,0x008f8f8f,0x00010101,0x003d3d3d,
+};
+
+static const u32 camellia_sp3033[256] = {
+ 0x38003838,0x41004141,0x16001616,0x76007676,
+ 0xd900d9d9,0x93009393,0x60006060,0xf200f2f2,
+ 0x72007272,0xc200c2c2,0xab00abab,0x9a009a9a,
+ 0x75007575,0x06000606,0x57005757,0xa000a0a0,
+ 0x91009191,0xf700f7f7,0xb500b5b5,0xc900c9c9,
+ 0xa200a2a2,0x8c008c8c,0xd200d2d2,0x90009090,
+ 0xf600f6f6,0x07000707,0xa700a7a7,0x27002727,
+ 0x8e008e8e,0xb200b2b2,0x49004949,0xde00dede,
+ 0x43004343,0x5c005c5c,0xd700d7d7,0xc700c7c7,
+ 0x3e003e3e,0xf500f5f5,0x8f008f8f,0x67006767,
+ 0x1f001f1f,0x18001818,0x6e006e6e,0xaf00afaf,
+ 0x2f002f2f,0xe200e2e2,0x85008585,0x0d000d0d,
+ 0x53005353,0xf000f0f0,0x9c009c9c,0x65006565,
+ 0xea00eaea,0xa300a3a3,0xae00aeae,0x9e009e9e,
+ 0xec00ecec,0x80008080,0x2d002d2d,0x6b006b6b,
+ 0xa800a8a8,0x2b002b2b,0x36003636,0xa600a6a6,
+ 0xc500c5c5,0x86008686,0x4d004d4d,0x33003333,
+ 0xfd00fdfd,0x66006666,0x58005858,0x96009696,
+ 0x3a003a3a,0x09000909,0x95009595,0x10001010,
+ 0x78007878,0xd800d8d8,0x42004242,0xcc00cccc,
+ 0xef00efef,0x26002626,0xe500e5e5,0x61006161,
+ 0x1a001a1a,0x3f003f3f,0x3b003b3b,0x82008282,
+ 0xb600b6b6,0xdb00dbdb,0xd400d4d4,0x98009898,
+ 0xe800e8e8,0x8b008b8b,0x02000202,0xeb00ebeb,
+ 0x0a000a0a,0x2c002c2c,0x1d001d1d,0xb000b0b0,
+ 0x6f006f6f,0x8d008d8d,0x88008888,0x0e000e0e,
+ 0x19001919,0x87008787,0x4e004e4e,0x0b000b0b,
+ 0xa900a9a9,0x0c000c0c,0x79007979,0x11001111,
+ 0x7f007f7f,0x22002222,0xe700e7e7,0x59005959,
+ 0xe100e1e1,0xda00dada,0x3d003d3d,0xc800c8c8,
+ 0x12001212,0x04000404,0x74007474,0x54005454,
+ 0x30003030,0x7e007e7e,0xb400b4b4,0x28002828,
+ 0x55005555,0x68006868,0x50005050,0xbe00bebe,
+ 0xd000d0d0,0xc400c4c4,0x31003131,0xcb00cbcb,
+ 0x2a002a2a,0xad00adad,0x0f000f0f,0xca00caca,
+ 0x70007070,0xff00ffff,0x32003232,0x69006969,
+ 0x08000808,0x62006262,0x00000000,0x24002424,
+ 0xd100d1d1,0xfb00fbfb,0xba00baba,0xed00eded,
+ 0x45004545,0x81008181,0x73007373,0x6d006d6d,
+ 0x84008484,0x9f009f9f,0xee00eeee,0x4a004a4a,
+ 0xc300c3c3,0x2e002e2e,0xc100c1c1,0x01000101,
+ 0xe600e6e6,0x25002525,0x48004848,0x99009999,
+ 0xb900b9b9,0xb300b3b3,0x7b007b7b,0xf900f9f9,
+ 0xce00cece,0xbf00bfbf,0xdf00dfdf,0x71007171,
+ 0x29002929,0xcd00cdcd,0x6c006c6c,0x13001313,
+ 0x64006464,0x9b009b9b,0x63006363,0x9d009d9d,
+ 0xc000c0c0,0x4b004b4b,0xb700b7b7,0xa500a5a5,
+ 0x89008989,0x5f005f5f,0xb100b1b1,0x17001717,
+ 0xf400f4f4,0xbc00bcbc,0xd300d3d3,0x46004646,
+ 0xcf00cfcf,0x37003737,0x5e005e5e,0x47004747,
+ 0x94009494,0xfa00fafa,0xfc00fcfc,0x5b005b5b,
+ 0x97009797,0xfe00fefe,0x5a005a5a,0xac00acac,
+ 0x3c003c3c,0x4c004c4c,0x03000303,0x35003535,
+ 0xf300f3f3,0x23002323,0xb800b8b8,0x5d005d5d,
+ 0x6a006a6a,0x92009292,0xd500d5d5,0x21002121,
+ 0x44004444,0x51005151,0xc600c6c6,0x7d007d7d,
+ 0x39003939,0x83008383,0xdc00dcdc,0xaa00aaaa,
+ 0x7c007c7c,0x77007777,0x56005656,0x05000505,
+ 0x1b001b1b,0xa400a4a4,0x15001515,0x34003434,
+ 0x1e001e1e,0x1c001c1c,0xf800f8f8,0x52005252,
+ 0x20002020,0x14001414,0xe900e9e9,0xbd00bdbd,
+ 0xdd00dddd,0xe400e4e4,0xa100a1a1,0xe000e0e0,
+ 0x8a008a8a,0xf100f1f1,0xd600d6d6,0x7a007a7a,
+ 0xbb00bbbb,0xe300e3e3,0x40004040,0x4f004f4f,
+};
+
+static const u32 camellia_sp4404[256] = {
+ 0x70700070,0x2c2c002c,0xb3b300b3,0xc0c000c0,
+ 0xe4e400e4,0x57570057,0xeaea00ea,0xaeae00ae,
+ 0x23230023,0x6b6b006b,0x45450045,0xa5a500a5,
+ 0xeded00ed,0x4f4f004f,0x1d1d001d,0x92920092,
+ 0x86860086,0xafaf00af,0x7c7c007c,0x1f1f001f,
+ 0x3e3e003e,0xdcdc00dc,0x5e5e005e,0x0b0b000b,
+ 0xa6a600a6,0x39390039,0xd5d500d5,0x5d5d005d,
+ 0xd9d900d9,0x5a5a005a,0x51510051,0x6c6c006c,
+ 0x8b8b008b,0x9a9a009a,0xfbfb00fb,0xb0b000b0,
+ 0x74740074,0x2b2b002b,0xf0f000f0,0x84840084,
+ 0xdfdf00df,0xcbcb00cb,0x34340034,0x76760076,
+ 0x6d6d006d,0xa9a900a9,0xd1d100d1,0x04040004,
+ 0x14140014,0x3a3a003a,0xdede00de,0x11110011,
+ 0x32320032,0x9c9c009c,0x53530053,0xf2f200f2,
+ 0xfefe00fe,0xcfcf00cf,0xc3c300c3,0x7a7a007a,
+ 0x24240024,0xe8e800e8,0x60600060,0x69690069,
+ 0xaaaa00aa,0xa0a000a0,0xa1a100a1,0x62620062,
+ 0x54540054,0x1e1e001e,0xe0e000e0,0x64640064,
+ 0x10100010,0x00000000,0xa3a300a3,0x75750075,
+ 0x8a8a008a,0xe6e600e6,0x09090009,0xdddd00dd,
+ 0x87870087,0x83830083,0xcdcd00cd,0x90900090,
+ 0x73730073,0xf6f600f6,0x9d9d009d,0xbfbf00bf,
+ 0x52520052,0xd8d800d8,0xc8c800c8,0xc6c600c6,
+ 0x81810081,0x6f6f006f,0x13130013,0x63630063,
+ 0xe9e900e9,0xa7a700a7,0x9f9f009f,0xbcbc00bc,
+ 0x29290029,0xf9f900f9,0x2f2f002f,0xb4b400b4,
+ 0x78780078,0x06060006,0xe7e700e7,0x71710071,
+ 0xd4d400d4,0xabab00ab,0x88880088,0x8d8d008d,
+ 0x72720072,0xb9b900b9,0xf8f800f8,0xacac00ac,
+ 0x36360036,0x2a2a002a,0x3c3c003c,0xf1f100f1,
+ 0x40400040,0xd3d300d3,0xbbbb00bb,0x43430043,
+ 0x15150015,0xadad00ad,0x77770077,0x80800080,
+ 0x82820082,0xecec00ec,0x27270027,0xe5e500e5,
+ 0x85850085,0x35350035,0x0c0c000c,0x41410041,
+ 0xefef00ef,0x93930093,0x19190019,0x21210021,
+ 0x0e0e000e,0x4e4e004e,0x65650065,0xbdbd00bd,
+ 0xb8b800b8,0x8f8f008f,0xebeb00eb,0xcece00ce,
+ 0x30300030,0x5f5f005f,0xc5c500c5,0x1a1a001a,
+ 0xe1e100e1,0xcaca00ca,0x47470047,0x3d3d003d,
+ 0x01010001,0xd6d600d6,0x56560056,0x4d4d004d,
+ 0x0d0d000d,0x66660066,0xcccc00cc,0x2d2d002d,
+ 0x12120012,0x20200020,0xb1b100b1,0x99990099,
+ 0x4c4c004c,0xc2c200c2,0x7e7e007e,0x05050005,
+ 0xb7b700b7,0x31310031,0x17170017,0xd7d700d7,
+ 0x58580058,0x61610061,0x1b1b001b,0x1c1c001c,
+ 0x0f0f000f,0x16160016,0x18180018,0x22220022,
+ 0x44440044,0xb2b200b2,0xb5b500b5,0x91910091,
+ 0x08080008,0xa8a800a8,0xfcfc00fc,0x50500050,
+ 0xd0d000d0,0x7d7d007d,0x89890089,0x97970097,
+ 0x5b5b005b,0x95950095,0xffff00ff,0xd2d200d2,
+ 0xc4c400c4,0x48480048,0xf7f700f7,0xdbdb00db,
+ 0x03030003,0xdada00da,0x3f3f003f,0x94940094,
+ 0x5c5c005c,0x02020002,0x4a4a004a,0x33330033,
+ 0x67670067,0xf3f300f3,0x7f7f007f,0xe2e200e2,
+ 0x9b9b009b,0x26260026,0x37370037,0x3b3b003b,
+ 0x96960096,0x4b4b004b,0xbebe00be,0x2e2e002e,
+ 0x79790079,0x8c8c008c,0x6e6e006e,0x8e8e008e,
+ 0xf5f500f5,0xb6b600b6,0xfdfd00fd,0x59590059,
+ 0x98980098,0x6a6a006a,0x46460046,0xbaba00ba,
+ 0x25250025,0x42420042,0xa2a200a2,0xfafa00fa,
+ 0x07070007,0x55550055,0xeeee00ee,0x0a0a000a,
+ 0x49490049,0x68680068,0x38380038,0xa4a400a4,
+ 0x28280028,0x7b7b007b,0xc9c900c9,0xc1c100c1,
+ 0xe3e300e3,0xf4f400f4,0xc7c700c7,0x9e9e009e,
+};
+
+
+/**
+ * Stuff related to the Camellia key schedule
+ */
+#define subl(x) subL[(x)]
+#define subr(x) subR[(x)]
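+
+/* Explanatory note: subL[]/subR[] hold the raw subkey halves, indexed by
+   subkey number, while a key schedule is being computed; they are folded
+   into the final key table via CamelliaSubkeyL()/CamelliaSubkeyR() at the
+   end of each setup routine. */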
+
+void camellia_setup128(const unsigned char *key, u32 *subkey)
+{
+ u32 kll, klr, krl, krr;
+ u32 il, ir, t0, t1, w0, w1;
+ u32 kw4l, kw4r, dw, tl, tr;
+ u32 subL[26];
+ u32 subR[26];
+
+ /**
+   * k == kll || klr || krl || krr (|| is concatenation)
+ */
+ kll = GETU32(key );
+ klr = GETU32(key + 4);
+ krl = GETU32(key + 8);
+ krr = GETU32(key + 12);
+ /**
+ * generate KL dependent subkeys
+ */
+ subl(0) = kll; subr(0) = klr;
+ subl(1) = krl; subr(1) = krr;
+ CAMELLIA_ROLDQ(kll, klr, krl, krr, w0, w1, 15);
+ subl(4) = kll; subr(4) = klr;
+ subl(5) = krl; subr(5) = krr;
+ CAMELLIA_ROLDQ(kll, klr, krl, krr, w0, w1, 30);
+ subl(10) = kll; subr(10) = klr;
+ subl(11) = krl; subr(11) = krr;
+ CAMELLIA_ROLDQ(kll, klr, krl, krr, w0, w1, 15);
+ subl(13) = krl; subr(13) = krr;
+ CAMELLIA_ROLDQ(kll, klr, krl, krr, w0, w1, 17);
+ subl(16) = kll; subr(16) = klr;
+ subl(17) = krl; subr(17) = krr;
+ CAMELLIA_ROLDQ(kll, klr, krl, krr, w0, w1, 17);
+ subl(18) = kll; subr(18) = klr;
+ subl(19) = krl; subr(19) = krr;
+ CAMELLIA_ROLDQ(kll, klr, krl, krr, w0, w1, 17);
+ subl(22) = kll; subr(22) = klr;
+ subl(23) = krl; subr(23) = krr;
+
+ /* generate KA */
+ kll = subl(0); klr = subr(0);
+ krl = subl(1); krr = subr(1);
+ CAMELLIA_F(kll, klr,
+ CAMELLIA_SIGMA1L, CAMELLIA_SIGMA1R,
+ w0, w1, il, ir, t0, t1);
+ krl ^= w0; krr ^= w1;
+ CAMELLIA_F(krl, krr,
+ CAMELLIA_SIGMA2L, CAMELLIA_SIGMA2R,
+ kll, klr, il, ir, t0, t1);
+ CAMELLIA_F(kll, klr,
+ CAMELLIA_SIGMA3L, CAMELLIA_SIGMA3R,
+ krl, krr, il, ir, t0, t1);
+ krl ^= w0; krr ^= w1;
+ CAMELLIA_F(krl, krr,
+ CAMELLIA_SIGMA4L, CAMELLIA_SIGMA4R,
+ w0, w1, il, ir, t0, t1);
+ kll ^= w0; klr ^= w1;
+
+ /* generate KA dependent subkeys */
+ subl(2) = kll; subr(2) = klr;
+ subl(3) = krl; subr(3) = krr;
+ CAMELLIA_ROLDQ(kll, klr, krl, krr, w0, w1, 15);
+ subl(6) = kll; subr(6) = klr;
+ subl(7) = krl; subr(7) = krr;
+ CAMELLIA_ROLDQ(kll, klr, krl, krr, w0, w1, 15);
+ subl(8) = kll; subr(8) = klr;
+ subl(9) = krl; subr(9) = krr;
+ CAMELLIA_ROLDQ(kll, klr, krl, krr, w0, w1, 15);
+ subl(12) = kll; subr(12) = klr;
+ CAMELLIA_ROLDQ(kll, klr, krl, krr, w0, w1, 15);
+ subl(14) = kll; subr(14) = klr;
+ subl(15) = krl; subr(15) = krr;
+ CAMELLIA_ROLDQo32(kll, klr, krl, krr, w0, w1, 34);
+ subl(20) = kll; subr(20) = klr;
+ subl(21) = krl; subr(21) = krr;
+ CAMELLIA_ROLDQ(kll, klr, krl, krr, w0, w1, 17);
+ subl(24) = kll; subr(24) = klr;
+ subl(25) = krl; subr(25) = krr;
+
+
+ /* absorb kw2 to other subkeys */
+ subl(3) ^= subl(1); subr(3) ^= subr(1);
+ subl(5) ^= subl(1); subr(5) ^= subr(1);
+ subl(7) ^= subl(1); subr(7) ^= subr(1);
+ subl(1) ^= subr(1) & ~subr(9);
+ dw = subl(1) & subl(9), subr(1) ^= CAMELLIA_RL1(dw);
+ subl(11) ^= subl(1); subr(11) ^= subr(1);
+ subl(13) ^= subl(1); subr(13) ^= subr(1);
+ subl(15) ^= subl(1); subr(15) ^= subr(1);
+ subl(1) ^= subr(1) & ~subr(17);
+ dw = subl(1) & subl(17), subr(1) ^= CAMELLIA_RL1(dw);
+ subl(19) ^= subl(1); subr(19) ^= subr(1);
+ subl(21) ^= subl(1); subr(21) ^= subr(1);
+ subl(23) ^= subl(1); subr(23) ^= subr(1);
+ subl(24) ^= subl(1); subr(24) ^= subr(1);
+
+ /* absorb kw4 to other subkeys */
+ kw4l = subl(25); kw4r = subr(25);
+ subl(22) ^= kw4l; subr(22) ^= kw4r;
+ subl(20) ^= kw4l; subr(20) ^= kw4r;
+ subl(18) ^= kw4l; subr(18) ^= kw4r;
+ kw4l ^= kw4r & ~subr(16);
+ dw = kw4l & subl(16), kw4r ^= CAMELLIA_RL1(dw);
+ subl(14) ^= kw4l; subr(14) ^= kw4r;
+ subl(12) ^= kw4l; subr(12) ^= kw4r;
+ subl(10) ^= kw4l; subr(10) ^= kw4r;
+ kw4l ^= kw4r & ~subr(8);
+ dw = kw4l & subl(8), kw4r ^= CAMELLIA_RL1(dw);
+ subl(6) ^= kw4l; subr(6) ^= kw4r;
+ subl(4) ^= kw4l; subr(4) ^= kw4r;
+ subl(2) ^= kw4l; subr(2) ^= kw4r;
+ subl(0) ^= kw4l; subr(0) ^= kw4r;
+
+ /* key XOR is end of F-function */
+ CamelliaSubkeyL(0) = subl(0) ^ subl(2);
+ CamelliaSubkeyR(0) = subr(0) ^ subr(2);
+ CamelliaSubkeyL(2) = subl(3);
+ CamelliaSubkeyR(2) = subr(3);
+ CamelliaSubkeyL(3) = subl(2) ^ subl(4);
+ CamelliaSubkeyR(3) = subr(2) ^ subr(4);
+ CamelliaSubkeyL(4) = subl(3) ^ subl(5);
+ CamelliaSubkeyR(4) = subr(3) ^ subr(5);
+ CamelliaSubkeyL(5) = subl(4) ^ subl(6);
+ CamelliaSubkeyR(5) = subr(4) ^ subr(6);
+ CamelliaSubkeyL(6) = subl(5) ^ subl(7);
+ CamelliaSubkeyR(6) = subr(5) ^ subr(7);
+ tl = subl(10) ^ (subr(10) & ~subr(8));
+ dw = tl & subl(8), tr = subr(10) ^ CAMELLIA_RL1(dw);
+ CamelliaSubkeyL(7) = subl(6) ^ tl;
+ CamelliaSubkeyR(7) = subr(6) ^ tr;
+ CamelliaSubkeyL(8) = subl(8);
+ CamelliaSubkeyR(8) = subr(8);
+ CamelliaSubkeyL(9) = subl(9);
+ CamelliaSubkeyR(9) = subr(9);
+ tl = subl(7) ^ (subr(7) & ~subr(9));
+ dw = tl & subl(9), tr = subr(7) ^ CAMELLIA_RL1(dw);
+ CamelliaSubkeyL(10) = tl ^ subl(11);
+ CamelliaSubkeyR(10) = tr ^ subr(11);
+ CamelliaSubkeyL(11) = subl(10) ^ subl(12);
+ CamelliaSubkeyR(11) = subr(10) ^ subr(12);
+ CamelliaSubkeyL(12) = subl(11) ^ subl(13);
+ CamelliaSubkeyR(12) = subr(11) ^ subr(13);
+ CamelliaSubkeyL(13) = subl(12) ^ subl(14);
+ CamelliaSubkeyR(13) = subr(12) ^ subr(14);
+ CamelliaSubkeyL(14) = subl(13) ^ subl(15);
+ CamelliaSubkeyR(14) = subr(13) ^ subr(15);
+ tl = subl(18) ^ (subr(18) & ~subr(16));
+ dw = tl & subl(16), tr = subr(18) ^ CAMELLIA_RL1(dw);
+ CamelliaSubkeyL(15) = subl(14) ^ tl;
+ CamelliaSubkeyR(15) = subr(14) ^ tr;
+ CamelliaSubkeyL(16) = subl(16);
+ CamelliaSubkeyR(16) = subr(16);
+ CamelliaSubkeyL(17) = subl(17);
+ CamelliaSubkeyR(17) = subr(17);
+ tl = subl(15) ^ (subr(15) & ~subr(17));
+ dw = tl & subl(17), tr = subr(15) ^ CAMELLIA_RL1(dw);
+ CamelliaSubkeyL(18) = tl ^ subl(19);
+ CamelliaSubkeyR(18) = tr ^ subr(19);
+ CamelliaSubkeyL(19) = subl(18) ^ subl(20);
+ CamelliaSubkeyR(19) = subr(18) ^ subr(20);
+ CamelliaSubkeyL(20) = subl(19) ^ subl(21);
+ CamelliaSubkeyR(20) = subr(19) ^ subr(21);
+ CamelliaSubkeyL(21) = subl(20) ^ subl(22);
+ CamelliaSubkeyR(21) = subr(20) ^ subr(22);
+ CamelliaSubkeyL(22) = subl(21) ^ subl(23);
+ CamelliaSubkeyR(22) = subr(21) ^ subr(23);
+ CamelliaSubkeyL(23) = subl(22);
+ CamelliaSubkeyR(23) = subr(22);
+ CamelliaSubkeyL(24) = subl(24) ^ subl(23);
+ CamelliaSubkeyR(24) = subr(24) ^ subr(23);
+
+ return;
+}
+
+void camellia_setup256(const unsigned char *key, u32 *subkey)
+{
+ u32 kll,klr,krl,krr; /* left half of key */
+ u32 krll,krlr,krrl,krrr; /* right half of key */
+ u32 il, ir, t0, t1, w0, w1; /* temporary variables */
+ u32 kw4l, kw4r, dw, tl, tr;
+ u32 subL[34];
+ u32 subR[34];
+
+ /**
+ * key = (kll || klr || krl || krr || krll || krlr || krrl || krrr)
+   *  (|| is concatenation)
+ */
+
+ kll = GETU32(key );
+ klr = GETU32(key + 4);
+ krl = GETU32(key + 8);
+ krr = GETU32(key + 12);
+ krll = GETU32(key + 16);
+ krlr = GETU32(key + 20);
+ krrl = GETU32(key + 24);
+ krrr = GETU32(key + 28);
+
+ /* generate KL dependent subkeys */
+ subl(0) = kll; subr(0) = klr;
+ subl(1) = krl; subr(1) = krr;
+ CAMELLIA_ROLDQo32(kll, klr, krl, krr, w0, w1, 45);
+ subl(12) = kll; subr(12) = klr;
+ subl(13) = krl; subr(13) = krr;
+ CAMELLIA_ROLDQ(kll, klr, krl, krr, w0, w1, 15);
+ subl(16) = kll; subr(16) = klr;
+ subl(17) = krl; subr(17) = krr;
+ CAMELLIA_ROLDQ(kll, klr, krl, krr, w0, w1, 17);
+ subl(22) = kll; subr(22) = klr;
+ subl(23) = krl; subr(23) = krr;
+ CAMELLIA_ROLDQo32(kll, klr, krl, krr, w0, w1, 34);
+ subl(30) = kll; subr(30) = klr;
+ subl(31) = krl; subr(31) = krr;
+
+ /* generate KR dependent subkeys */
+ CAMELLIA_ROLDQ(krll, krlr, krrl, krrr, w0, w1, 15);
+ subl(4) = krll; subr(4) = krlr;
+ subl(5) = krrl; subr(5) = krrr;
+ CAMELLIA_ROLDQ(krll, krlr, krrl, krrr, w0, w1, 15);
+ subl(8) = krll; subr(8) = krlr;
+ subl(9) = krrl; subr(9) = krrr;
+ CAMELLIA_ROLDQ(krll, krlr, krrl, krrr, w0, w1, 30);
+ subl(18) = krll; subr(18) = krlr;
+ subl(19) = krrl; subr(19) = krrr;
+ CAMELLIA_ROLDQo32(krll, krlr, krrl, krrr, w0, w1, 34);
+ subl(26) = krll; subr(26) = krlr;
+ subl(27) = krrl; subr(27) = krrr;
+ CAMELLIA_ROLDQo32(krll, krlr, krrl, krrr, w0, w1, 34);
+
+ /* generate KA */
+ kll = subl(0) ^ krll; klr = subr(0) ^ krlr;
+ krl = subl(1) ^ krrl; krr = subr(1) ^ krrr;
+ CAMELLIA_F(kll, klr,
+ CAMELLIA_SIGMA1L, CAMELLIA_SIGMA1R,
+ w0, w1, il, ir, t0, t1);
+ krl ^= w0; krr ^= w1;
+ CAMELLIA_F(krl, krr,
+ CAMELLIA_SIGMA2L, CAMELLIA_SIGMA2R,
+ kll, klr, il, ir, t0, t1);
+ kll ^= krll; klr ^= krlr;
+ CAMELLIA_F(kll, klr,
+ CAMELLIA_SIGMA3L, CAMELLIA_SIGMA3R,
+ krl, krr, il, ir, t0, t1);
+ krl ^= w0 ^ krrl; krr ^= w1 ^ krrr;
+ CAMELLIA_F(krl, krr,
+ CAMELLIA_SIGMA4L, CAMELLIA_SIGMA4R,
+ w0, w1, il, ir, t0, t1);
+ kll ^= w0; klr ^= w1;
+
+ /* generate KB */
+ krll ^= kll; krlr ^= klr;
+ krrl ^= krl; krrr ^= krr;
+ CAMELLIA_F(krll, krlr,
+ CAMELLIA_SIGMA5L, CAMELLIA_SIGMA5R,
+ w0, w1, il, ir, t0, t1);
+ krrl ^= w0; krrr ^= w1;
+ CAMELLIA_F(krrl, krrr,
+ CAMELLIA_SIGMA6L, CAMELLIA_SIGMA6R,
+ w0, w1, il, ir, t0, t1);
+ krll ^= w0; krlr ^= w1;
+
+ /* generate KA dependent subkeys */
+ CAMELLIA_ROLDQ(kll, klr, krl, krr, w0, w1, 15);
+ subl(6) = kll; subr(6) = klr;
+ subl(7) = krl; subr(7) = krr;
+ CAMELLIA_ROLDQ(kll, klr, krl, krr, w0, w1, 30);
+ subl(14) = kll; subr(14) = klr;
+ subl(15) = krl; subr(15) = krr;
+ subl(24) = klr; subr(24) = krl;
+ subl(25) = krr; subr(25) = kll;
+ CAMELLIA_ROLDQo32(kll, klr, krl, krr, w0, w1, 49);
+ subl(28) = kll; subr(28) = klr;
+ subl(29) = krl; subr(29) = krr;
+
+ /* generate KB dependent subkeys */
+ subl(2) = krll; subr(2) = krlr;
+ subl(3) = krrl; subr(3) = krrr;
+ CAMELLIA_ROLDQ(krll, krlr, krrl, krrr, w0, w1, 30);
+ subl(10) = krll; subr(10) = krlr;
+ subl(11) = krrl; subr(11) = krrr;
+ CAMELLIA_ROLDQ(krll, krlr, krrl, krrr, w0, w1, 30);
+ subl(20) = krll; subr(20) = krlr;
+ subl(21) = krrl; subr(21) = krrr;
+ CAMELLIA_ROLDQo32(krll, krlr, krrl, krrr, w0, w1, 51);
+ subl(32) = krll; subr(32) = krlr;
+ subl(33) = krrl; subr(33) = krrr;
+
+ /* absorb kw2 to other subkeys */
+ subl(3) ^= subl(1); subr(3) ^= subr(1);
+ subl(5) ^= subl(1); subr(5) ^= subr(1);
+ subl(7) ^= subl(1); subr(7) ^= subr(1);
+ subl(1) ^= subr(1) & ~subr(9);
+ dw = subl(1) & subl(9), subr(1) ^= CAMELLIA_RL1(dw);
+ subl(11) ^= subl(1); subr(11) ^= subr(1);
+ subl(13) ^= subl(1); subr(13) ^= subr(1);
+ subl(15) ^= subl(1); subr(15) ^= subr(1);
+ subl(1) ^= subr(1) & ~subr(17);
+ dw = subl(1) & subl(17), subr(1) ^= CAMELLIA_RL1(dw);
+ subl(19) ^= subl(1); subr(19) ^= subr(1);
+ subl(21) ^= subl(1); subr(21) ^= subr(1);
+ subl(23) ^= subl(1); subr(23) ^= subr(1);
+ subl(1) ^= subr(1) & ~subr(25);
+ dw = subl(1) & subl(25), subr(1) ^= CAMELLIA_RL1(dw);
+ subl(27) ^= subl(1); subr(27) ^= subr(1);
+ subl(29) ^= subl(1); subr(29) ^= subr(1);
+ subl(31) ^= subl(1); subr(31) ^= subr(1);
+ subl(32) ^= subl(1); subr(32) ^= subr(1);
+
+ /* absorb kw4 to other subkeys */
+ kw4l = subl(33); kw4r = subr(33);
+ subl(30) ^= kw4l; subr(30) ^= kw4r;
+ subl(28) ^= kw4l; subr(28) ^= kw4r;
+ subl(26) ^= kw4l; subr(26) ^= kw4r;
+ kw4l ^= kw4r & ~subr(24);
+ dw = kw4l & subl(24), kw4r ^= CAMELLIA_RL1(dw);
+ subl(22) ^= kw4l; subr(22) ^= kw4r;
+ subl(20) ^= kw4l; subr(20) ^= kw4r;
+ subl(18) ^= kw4l; subr(18) ^= kw4r;
+ kw4l ^= kw4r & ~subr(16);
+ dw = kw4l & subl(16), kw4r ^= CAMELLIA_RL1(dw);
+ subl(14) ^= kw4l; subr(14) ^= kw4r;
+ subl(12) ^= kw4l; subr(12) ^= kw4r;
+ subl(10) ^= kw4l; subr(10) ^= kw4r;
+ kw4l ^= kw4r & ~subr(8);
+ dw = kw4l & subl(8), kw4r ^= CAMELLIA_RL1(dw);
+ subl(6) ^= kw4l; subr(6) ^= kw4r;
+ subl(4) ^= kw4l; subr(4) ^= kw4r;
+ subl(2) ^= kw4l; subr(2) ^= kw4r;
+ subl(0) ^= kw4l; subr(0) ^= kw4r;
+
+ /* key XOR is end of F-function */
+ CamelliaSubkeyL(0) = subl(0) ^ subl(2);
+ CamelliaSubkeyR(0) = subr(0) ^ subr(2);
+ CamelliaSubkeyL(2) = subl(3);
+ CamelliaSubkeyR(2) = subr(3);
+ CamelliaSubkeyL(3) = subl(2) ^ subl(4);
+ CamelliaSubkeyR(3) = subr(2) ^ subr(4);
+ CamelliaSubkeyL(4) = subl(3) ^ subl(5);
+ CamelliaSubkeyR(4) = subr(3) ^ subr(5);
+ CamelliaSubkeyL(5) = subl(4) ^ subl(6);
+ CamelliaSubkeyR(5) = subr(4) ^ subr(6);
+ CamelliaSubkeyL(6) = subl(5) ^ subl(7);
+ CamelliaSubkeyR(6) = subr(5) ^ subr(7);
+ tl = subl(10) ^ (subr(10) & ~subr(8));
+ dw = tl & subl(8), tr = subr(10) ^ CAMELLIA_RL1(dw);
+ CamelliaSubkeyL(7) = subl(6) ^ tl;
+ CamelliaSubkeyR(7) = subr(6) ^ tr;
+ CamelliaSubkeyL(8) = subl(8);
+ CamelliaSubkeyR(8) = subr(8);
+ CamelliaSubkeyL(9) = subl(9);
+ CamelliaSubkeyR(9) = subr(9);
+ tl = subl(7) ^ (subr(7) & ~subr(9));
+ dw = tl & subl(9), tr = subr(7) ^ CAMELLIA_RL1(dw);
+ CamelliaSubkeyL(10) = tl ^ subl(11);
+ CamelliaSubkeyR(10) = tr ^ subr(11);
+ CamelliaSubkeyL(11) = subl(10) ^ subl(12);
+ CamelliaSubkeyR(11) = subr(10) ^ subr(12);
+ CamelliaSubkeyL(12) = subl(11) ^ subl(13);
+ CamelliaSubkeyR(12) = subr(11) ^ subr(13);
+ CamelliaSubkeyL(13) = subl(12) ^ subl(14);
+ CamelliaSubkeyR(13) = subr(12) ^ subr(14);
+ CamelliaSubkeyL(14) = subl(13) ^ subl(15);
+ CamelliaSubkeyR(14) = subr(13) ^ subr(15);
+ tl = subl(18) ^ (subr(18) & ~subr(16));
+ dw = tl & subl(16), tr = subr(18) ^ CAMELLIA_RL1(dw);
+ CamelliaSubkeyL(15) = subl(14) ^ tl;
+ CamelliaSubkeyR(15) = subr(14) ^ tr;
+ CamelliaSubkeyL(16) = subl(16);
+ CamelliaSubkeyR(16) = subr(16);
+ CamelliaSubkeyL(17) = subl(17);
+ CamelliaSubkeyR(17) = subr(17);
+ tl = subl(15) ^ (subr(15) & ~subr(17));
+ dw = tl & subl(17), tr = subr(15) ^ CAMELLIA_RL1(dw);
+ CamelliaSubkeyL(18) = tl ^ subl(19);
+ CamelliaSubkeyR(18) = tr ^ subr(19);
+ CamelliaSubkeyL(19) = subl(18) ^ subl(20);
+ CamelliaSubkeyR(19) = subr(18) ^ subr(20);
+ CamelliaSubkeyL(20) = subl(19) ^ subl(21);
+ CamelliaSubkeyR(20) = subr(19) ^ subr(21);
+ CamelliaSubkeyL(21) = subl(20) ^ subl(22);
+ CamelliaSubkeyR(21) = subr(20) ^ subr(22);
+ CamelliaSubkeyL(22) = subl(21) ^ subl(23);
+ CamelliaSubkeyR(22) = subr(21) ^ subr(23);
+ tl = subl(26) ^ (subr(26) & ~subr(24));
+ dw = tl & subl(24), tr = subr(26) ^ CAMELLIA_RL1(dw);
+ CamelliaSubkeyL(23) = subl(22) ^ tl;
+ CamelliaSubkeyR(23) = subr(22) ^ tr;
+ CamelliaSubkeyL(24) = subl(24);
+ CamelliaSubkeyR(24) = subr(24);
+ CamelliaSubkeyL(25) = subl(25);
+ CamelliaSubkeyR(25) = subr(25);
+ tl = subl(23) ^ (subr(23) & ~subr(25));
+ dw = tl & subl(25), tr = subr(23) ^ CAMELLIA_RL1(dw);
+ CamelliaSubkeyL(26) = tl ^ subl(27);
+ CamelliaSubkeyR(26) = tr ^ subr(27);
+ CamelliaSubkeyL(27) = subl(26) ^ subl(28);
+ CamelliaSubkeyR(27) = subr(26) ^ subr(28);
+ CamelliaSubkeyL(28) = subl(27) ^ subl(29);
+ CamelliaSubkeyR(28) = subr(27) ^ subr(29);
+ CamelliaSubkeyL(29) = subl(28) ^ subl(30);
+ CamelliaSubkeyR(29) = subr(28) ^ subr(30);
+ CamelliaSubkeyL(30) = subl(29) ^ subl(31);
+ CamelliaSubkeyR(30) = subr(29) ^ subr(31);
+ CamelliaSubkeyL(31) = subl(30);
+ CamelliaSubkeyR(31) = subr(30);
+ CamelliaSubkeyL(32) = subl(32) ^ subl(31);
+ CamelliaSubkeyR(32) = subr(32) ^ subr(31);
+
+ return;
+}
+
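+/* For 192-bit keys the Camellia key schedule (RFC 3713) defines the right
+ * half of the 256-bit working key as the bitwise complement of the key's
+ * last 64 bits, so this routine just builds that 32-byte key and reuses
+ * the 256-bit setup. */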
+void camellia_setup192(const unsigned char *key, u32 *subkey)
+{
+ unsigned char kk[32];
+ u32 krll, krlr, krrl, krrr;
+
+ memcpy(kk, key, 24);
+ memcpy((unsigned char *)&krll, key+16,4);
+ memcpy((unsigned char *)&krlr, key+20,4);
+ krrl = ~krll;
+ krrr = ~krlr;
+ memcpy(kk+24, (unsigned char *)&krrl, 4);
+ memcpy(kk+28, (unsigned char *)&krrr, 4);
+ camellia_setup256(kk, subkey);
+ return;
+}
+
+
+#ifndef USE_ARM_ASM
+/**
+ * Camellia encryption/decryption routines.
+ *
+ * "io" must be 4-byte aligned, big-endian data.
+ */
+void camellia_encrypt128(const u32 *subkey, u32 *blocks)
+{
+ u32 il, ir, t0, t1;
+ u32 io[4];
+
+ io[0] = blocks[0];
+ io[1] = blocks[1];
+ io[2] = blocks[2];
+ io[3] = blocks[3];
+
+ /* pre whitening but absorb kw2 */
+ io[0] ^= CamelliaSubkeyL(0);
+ io[1] ^= CamelliaSubkeyR(0);
+ /* main iteration */
+
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(2),CamelliaSubkeyR(2),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(3),CamelliaSubkeyR(3),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(4),CamelliaSubkeyR(4),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(5),CamelliaSubkeyR(5),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(6),CamelliaSubkeyR(6),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(7),CamelliaSubkeyR(7),
+ io[0],io[1],il,ir,t0,t1);
+
+ CAMELLIA_FLS(io[0],io[1],io[2],io[3],
+ CamelliaSubkeyL(8),CamelliaSubkeyR(8),
+ CamelliaSubkeyL(9),CamelliaSubkeyR(9),
+ t0,t1,il,ir);
+
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(10),CamelliaSubkeyR(10),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(11),CamelliaSubkeyR(11),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(12),CamelliaSubkeyR(12),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(13),CamelliaSubkeyR(13),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(14),CamelliaSubkeyR(14),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(15),CamelliaSubkeyR(15),
+ io[0],io[1],il,ir,t0,t1);
+
+ CAMELLIA_FLS(io[0],io[1],io[2],io[3],
+ CamelliaSubkeyL(16),CamelliaSubkeyR(16),
+ CamelliaSubkeyL(17),CamelliaSubkeyR(17),
+ t0,t1,il,ir);
+
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(18),CamelliaSubkeyR(18),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(19),CamelliaSubkeyR(19),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(20),CamelliaSubkeyR(20),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(21),CamelliaSubkeyR(21),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(22),CamelliaSubkeyR(22),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(23),CamelliaSubkeyR(23),
+ io[0],io[1],il,ir,t0,t1);
+
+ /* post whitening but kw4 */
+ io[2] ^= CamelliaSubkeyL(24);
+ io[3] ^= CamelliaSubkeyR(24);
+
+ t0 = io[0];
+ t1 = io[1];
+ io[0] = io[2];
+ io[1] = io[3];
+ io[2] = t0;
+ io[3] = t1;
+
+ blocks[0] = io[0];
+ blocks[1] = io[1];
+ blocks[2] = io[2];
+ blocks[3] = io[3];
+
+ return;
+}
+
+void camellia_decrypt128(const u32 *subkey, u32 *blocks)
+{
+ u32 il, ir, t0, t1; /* temporary variables */
+ u32 io[4];
+
+ io[0] = blocks[0];
+ io[1] = blocks[1];
+ io[2] = blocks[2];
+ io[3] = blocks[3];
+
+ /* pre whitening but absorb kw2 */
+ io[0] ^= CamelliaSubkeyL(24);
+ io[1] ^= CamelliaSubkeyR(24);
+
+ /* main iteration */
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(23),CamelliaSubkeyR(23),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(22),CamelliaSubkeyR(22),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(21),CamelliaSubkeyR(21),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(20),CamelliaSubkeyR(20),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(19),CamelliaSubkeyR(19),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(18),CamelliaSubkeyR(18),
+ io[0],io[1],il,ir,t0,t1);
+
+ CAMELLIA_FLS(io[0],io[1],io[2],io[3],
+ CamelliaSubkeyL(17),CamelliaSubkeyR(17),
+ CamelliaSubkeyL(16),CamelliaSubkeyR(16),
+ t0,t1,il,ir);
+
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(15),CamelliaSubkeyR(15),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(14),CamelliaSubkeyR(14),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(13),CamelliaSubkeyR(13),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(12),CamelliaSubkeyR(12),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(11),CamelliaSubkeyR(11),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(10),CamelliaSubkeyR(10),
+ io[0],io[1],il,ir,t0,t1);
+
+ CAMELLIA_FLS(io[0],io[1],io[2],io[3],
+ CamelliaSubkeyL(9),CamelliaSubkeyR(9),
+ CamelliaSubkeyL(8),CamelliaSubkeyR(8),
+ t0,t1,il,ir);
+
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(7),CamelliaSubkeyR(7),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(6),CamelliaSubkeyR(6),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(5),CamelliaSubkeyR(5),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(4),CamelliaSubkeyR(4),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(3),CamelliaSubkeyR(3),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(2),CamelliaSubkeyR(2),
+ io[0],io[1],il,ir,t0,t1);
+
+ /* post whitening but kw4 */
+ io[2] ^= CamelliaSubkeyL(0);
+ io[3] ^= CamelliaSubkeyR(0);
+
+ t0 = io[0];
+ t1 = io[1];
+ io[0] = io[2];
+ io[1] = io[3];
+ io[2] = t0;
+ io[3] = t1;
+
+ blocks[0] = io[0];
+ blocks[1] = io[1];
+ blocks[2] = io[2];
+ blocks[3] = io[3];
+
+ return;
+}
+
+/**
+ * Stuff for 192- and 256-bit encryption/decryption
+ */
+void camellia_encrypt256(const u32 *subkey, u32 *blocks)
+{
+ u32 il, ir, t0, t1; /* temporary variables */
+ u32 io[4];
+
+ io[0] = blocks[0];
+ io[1] = blocks[1];
+ io[2] = blocks[2];
+ io[3] = blocks[3];
+
+ /* pre whitening but absorb kw2 */
+ io[0] ^= CamelliaSubkeyL(0);
+ io[1] ^= CamelliaSubkeyR(0);
+
+ /* main iteration */
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(2),CamelliaSubkeyR(2),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(3),CamelliaSubkeyR(3),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(4),CamelliaSubkeyR(4),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(5),CamelliaSubkeyR(5),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(6),CamelliaSubkeyR(6),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(7),CamelliaSubkeyR(7),
+ io[0],io[1],il,ir,t0,t1);
+
+ CAMELLIA_FLS(io[0],io[1],io[2],io[3],
+ CamelliaSubkeyL(8),CamelliaSubkeyR(8),
+ CamelliaSubkeyL(9),CamelliaSubkeyR(9),
+ t0,t1,il,ir);
+
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(10),CamelliaSubkeyR(10),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(11),CamelliaSubkeyR(11),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(12),CamelliaSubkeyR(12),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(13),CamelliaSubkeyR(13),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(14),CamelliaSubkeyR(14),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(15),CamelliaSubkeyR(15),
+ io[0],io[1],il,ir,t0,t1);
+
+ CAMELLIA_FLS(io[0],io[1],io[2],io[3],
+ CamelliaSubkeyL(16),CamelliaSubkeyR(16),
+ CamelliaSubkeyL(17),CamelliaSubkeyR(17),
+ t0,t1,il,ir);
+
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(18),CamelliaSubkeyR(18),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(19),CamelliaSubkeyR(19),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(20),CamelliaSubkeyR(20),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(21),CamelliaSubkeyR(21),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(22),CamelliaSubkeyR(22),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(23),CamelliaSubkeyR(23),
+ io[0],io[1],il,ir,t0,t1);
+
+ CAMELLIA_FLS(io[0],io[1],io[2],io[3],
+ CamelliaSubkeyL(24),CamelliaSubkeyR(24),
+ CamelliaSubkeyL(25),CamelliaSubkeyR(25),
+ t0,t1,il,ir);
+
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(26),CamelliaSubkeyR(26),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(27),CamelliaSubkeyR(27),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(28),CamelliaSubkeyR(28),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(29),CamelliaSubkeyR(29),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(30),CamelliaSubkeyR(30),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(31),CamelliaSubkeyR(31),
+ io[0],io[1],il,ir,t0,t1);
+
+ /* post whitening but kw4 */
+ io[2] ^= CamelliaSubkeyL(32);
+ io[3] ^= CamelliaSubkeyR(32);
+
+ t0 = io[0];
+ t1 = io[1];
+ io[0] = io[2];
+ io[1] = io[3];
+ io[2] = t0;
+ io[3] = t1;
+
+ blocks[0] = io[0];
+ blocks[1] = io[1];
+ blocks[2] = io[2];
+ blocks[3] = io[3];
+
+ return;
+}
+
+void camellia_decrypt256(const u32 *subkey, u32 *blocks)
+{
+ u32 il, ir, t0, t1; /* temporary variables */
+ u32 io[4];
+
+ io[0] = blocks[0];
+ io[1] = blocks[1];
+ io[2] = blocks[2];
+ io[3] = blocks[3];
+
+ /* pre whitening but absorb kw2 */
+ io[0] ^= CamelliaSubkeyL(32);
+ io[1] ^= CamelliaSubkeyR(32);
+
+ /* main iteration */
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(31),CamelliaSubkeyR(31),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(30),CamelliaSubkeyR(30),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(29),CamelliaSubkeyR(29),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(28),CamelliaSubkeyR(28),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(27),CamelliaSubkeyR(27),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(26),CamelliaSubkeyR(26),
+ io[0],io[1],il,ir,t0,t1);
+
+ CAMELLIA_FLS(io[0],io[1],io[2],io[3],
+ CamelliaSubkeyL(25),CamelliaSubkeyR(25),
+ CamelliaSubkeyL(24),CamelliaSubkeyR(24),
+ t0,t1,il,ir);
+
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(23),CamelliaSubkeyR(23),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(22),CamelliaSubkeyR(22),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(21),CamelliaSubkeyR(21),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(20),CamelliaSubkeyR(20),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(19),CamelliaSubkeyR(19),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(18),CamelliaSubkeyR(18),
+ io[0],io[1],il,ir,t0,t1);
+
+ CAMELLIA_FLS(io[0],io[1],io[2],io[3],
+ CamelliaSubkeyL(17),CamelliaSubkeyR(17),
+ CamelliaSubkeyL(16),CamelliaSubkeyR(16),
+ t0,t1,il,ir);
+
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(15),CamelliaSubkeyR(15),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(14),CamelliaSubkeyR(14),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(13),CamelliaSubkeyR(13),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(12),CamelliaSubkeyR(12),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(11),CamelliaSubkeyR(11),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(10),CamelliaSubkeyR(10),
+ io[0],io[1],il,ir,t0,t1);
+
+ CAMELLIA_FLS(io[0],io[1],io[2],io[3],
+ CamelliaSubkeyL(9),CamelliaSubkeyR(9),
+ CamelliaSubkeyL(8),CamelliaSubkeyR(8),
+ t0,t1,il,ir);
+
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(7),CamelliaSubkeyR(7),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(6),CamelliaSubkeyR(6),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(5),CamelliaSubkeyR(5),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(4),CamelliaSubkeyR(4),
+ io[0],io[1],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[0],io[1],
+ CamelliaSubkeyL(3),CamelliaSubkeyR(3),
+ io[2],io[3],il,ir,t0,t1);
+ CAMELLIA_ROUNDSM(io[2],io[3],
+ CamelliaSubkeyL(2),CamelliaSubkeyR(2),
+ io[0],io[1],il,ir,t0,t1);
+
+ /* post whitening but kw4 */
+ io[2] ^= CamelliaSubkeyL(0);
+ io[3] ^= CamelliaSubkeyR(0);
+
+ t0 = io[0];
+ t1 = io[1];
+ io[0] = io[2];
+ io[1] = io[3];
+ io[2] = t0;
+ io[3] = t1;
+
+ blocks[0] = io[0];
+ blocks[1] = io[1];
+ blocks[2] = io[2];
+ blocks[3] = io[3];
+
+ return;
+}
+#endif /*!USE_ARM_ASM*/
+
+
+/***
+ *
+ * API for compatibility
+ */
+
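+/* Example use of this compatibility API (a minimal sketch; the buffer
+ * names are illustrative only):
+ *
+ * KEY_TABLE_TYPE keyTable;
+ * unsigned char key[16], pt[CAMELLIA_BLOCK_SIZE], ct[CAMELLIA_BLOCK_SIZE];
+ *
+ * Camellia_Ekeygen(128, key, keyTable);
+ * Camellia_EncryptBlock(128, pt, keyTable, ct);
+ * Camellia_DecryptBlock(128, ct, keyTable, pt);
+ */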
+void Camellia_Ekeygen(const int keyBitLength,
+ const unsigned char *rawKey,
+ KEY_TABLE_TYPE keyTable)
+{
+ switch(keyBitLength) {
+ case 128:
+ camellia_setup128(rawKey, keyTable);
+ break;
+ case 192:
+ camellia_setup192(rawKey, keyTable);
+ break;
+ case 256:
+ camellia_setup256(rawKey, keyTable);
+ break;
+ default:
+ break;
+ }
+}
+
+
+#ifndef USE_ARM_ASM
+void Camellia_EncryptBlock(const int keyBitLength,
+ const unsigned char *plaintext,
+ const KEY_TABLE_TYPE keyTable,
+ unsigned char *ciphertext)
+{
+ u32 tmp[4];
+
+ tmp[0] = GETU32(plaintext);
+ tmp[1] = GETU32(plaintext + 4);
+ tmp[2] = GETU32(plaintext + 8);
+ tmp[3] = GETU32(plaintext + 12);
+
+ switch (keyBitLength) {
+ case 128:
+ camellia_encrypt128(keyTable, tmp);
+ break;
+ case 192:
+ /* fall through */
+ case 256:
+ camellia_encrypt256(keyTable, tmp);
+ break;
+ default:
+ break;
+ }
+
+ PUTU32(ciphertext, tmp[0]);
+ PUTU32(ciphertext + 4, tmp[1]);
+ PUTU32(ciphertext + 8, tmp[2]);
+ PUTU32(ciphertext + 12, tmp[3]);
+}
+
+void Camellia_DecryptBlock(const int keyBitLength,
+ const unsigned char *ciphertext,
+ const KEY_TABLE_TYPE keyTable,
+ unsigned char *plaintext)
+{
+ u32 tmp[4];
+
+ tmp[0] = GETU32(ciphertext);
+ tmp[1] = GETU32(ciphertext + 4);
+ tmp[2] = GETU32(ciphertext + 8);
+ tmp[3] = GETU32(ciphertext + 12);
+
+ switch (keyBitLength) {
+ case 128:
+ camellia_decrypt128(keyTable, tmp);
+ break;
+ case 192:
+ /* fall through */
+ case 256:
+ camellia_decrypt256(keyTable, tmp);
+ break;
+ default:
+ break;
+ }
+ PUTU32(plaintext, tmp[0]);
+ PUTU32(plaintext + 4, tmp[1]);
+ PUTU32(plaintext + 8, tmp[2]);
+ PUTU32(plaintext + 12, tmp[3]);
+}
+#endif /*!USE_ARM_ASM*/
diff --git a/comm/third_party/libgcrypt/cipher/camellia.h b/comm/third_party/libgcrypt/cipher/camellia.h
new file mode 100644
index 0000000000..d7a1e6f4a0
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/camellia.h
@@ -0,0 +1,95 @@
+/* camellia.h ver 1.2.0
+ *
+ * Copyright (C) 2006,2007
+ * NTT (Nippon Telegraph and Telephone Corporation).
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef HEADER_CAMELLIA_H
+#define HEADER_CAMELLIA_H
+
+/* To use Camellia with libraries it is often useful to keep the name
+ * space of the library clean. The following macro is thus useful:
+ *
+ * #define CAMELLIA_EXT_SYM_PREFIX foo_
+ *
+ * This prefixes all external symbols with "foo_".
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+/* USE_ARM_ASM indicates whether to use ARM assembly code. */
+# undef USE_ARM_ASM
+# if defined(__ARMEL__)
+# ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+# define USE_ARM_ASM 1
+# endif
+# endif
+# if defined(__AARCH64EL__)
+# ifdef HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS
+# define USE_ARM_ASM 1
+# endif
+# endif
+#endif
+#ifdef CAMELLIA_EXT_SYM_PREFIX
+#define CAMELLIA_PREFIX1(x,y) x ## y
+#define CAMELLIA_PREFIX2(x,y) CAMELLIA_PREFIX1(x,y)
+#define CAMELLIA_PREFIX(x) CAMELLIA_PREFIX2(CAMELLIA_EXT_SYM_PREFIX,x)
+#define Camellia_Ekeygen CAMELLIA_PREFIX(Camellia_Ekeygen)
+#define Camellia_EncryptBlock CAMELLIA_PREFIX(Camellia_EncryptBlock)
+#define Camellia_DecryptBlock CAMELLIA_PREFIX(Camellia_DecryptBlock)
+#define camellia_decrypt128 CAMELLIA_PREFIX(camellia_decrypt128)
+#define camellia_decrypt256 CAMELLIA_PREFIX(camellia_decrypt256)
+#define camellia_encrypt128 CAMELLIA_PREFIX(camellia_encrypt128)
+#define camellia_encrypt256 CAMELLIA_PREFIX(camellia_encrypt256)
+#define camellia_setup128 CAMELLIA_PREFIX(camellia_setup128)
+#define camellia_setup192 CAMELLIA_PREFIX(camellia_setup192)
+#define camellia_setup256 CAMELLIA_PREFIX(camellia_setup256)
+#endif /*CAMELLIA_EXT_SYM_PREFIX*/
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CAMELLIA_BLOCK_SIZE 16
+#define CAMELLIA_TABLE_BYTE_LEN 272
+#define CAMELLIA_TABLE_WORD_LEN (CAMELLIA_TABLE_BYTE_LEN / 4)
+
+typedef unsigned int KEY_TABLE_TYPE[CAMELLIA_TABLE_WORD_LEN];
+
+
+void Camellia_Ekeygen(const int keyBitLength,
+ const unsigned char *rawKey,
+ KEY_TABLE_TYPE keyTable);
+
+#ifndef USE_ARM_ASM
+void Camellia_EncryptBlock(const int keyBitLength,
+ const unsigned char *plaintext,
+ const KEY_TABLE_TYPE keyTable,
+ unsigned char *cipherText);
+
+void Camellia_DecryptBlock(const int keyBitLength,
+ const unsigned char *cipherText,
+ const KEY_TABLE_TYPE keyTable,
+ unsigned char *plaintext);
+#endif /*!USE_ARM_ASM*/
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* HEADER_CAMELLIA_H */
diff --git a/comm/third_party/libgcrypt/cipher/cast5-amd64.S b/comm/third_party/libgcrypt/cipher/cast5-amd64.S
new file mode 100644
index 0000000000..82f678901d
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cast5-amd64.S
@@ -0,0 +1,663 @@
+/* cast5-amd64.S - AMD64 assembly implementation of CAST5 cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_CAST5)
+
+#include "asm-common-amd64.h"
+
+.text
+
+.extern _gcry_cast5_s1to4;
+
+#define s1 0
+#define s2 (s1 + (4 * 256))
+#define s3 (s2 + (4 * 256))
+#define s4 (s3 + (4 * 256))
+
+/* structure of CAST5_context: */
+#define Km 0
+#define Kr (Km + (16 * 4))
+
+/* register macros */
+#define CTX %rdi
+#define RIO %rsi
+#define RTAB %r8
+
+#define RLR0 %r9
+#define RLR1 %r10
+#define RLR2 %r11
+#define RLR3 %r12
+
+#define RLR0d %r9d
+#define RLR1d %r10d
+#define RLR2d %r11d
+#define RLR3d %r12d
+
+#define RX0 %rax
+#define RX1 %rbx
+#define RX2 %rdx
+
+#define RX0d %eax
+#define RX1d %ebx
+#define RX2d %edx
+
+#define RX0bl %al
+#define RX1bl %bl
+#define RX2bl %dl
+
+#define RX0bh %ah
+#define RX1bh %bh
+#define RX2bh %dh
+
+#define RKR %rcx
+#define RKRd %ecx
+#define RKRbl %cl
+
+#define RT0 %rbp
+#define RT1 %rsi
+
+#define RT0d %ebp
+#define RT1d %esi
+
+#define RKM0d %r13d
+#define RKM1d %r14d
+
+/***********************************************************************
+ * 1-way cast5
+ ***********************************************************************/
+#define dummy(x)
+
+#define shr_kr(none) \
+ shrq $8, RKR;
+
+#define F(km, load_next_kr, op0, op1, op2, op3) \
+ op0 ## l RLR0d, km ## d; \
+ roll RKRbl, km ## d; \
+ rorq $32, RLR0; \
+ movzbl km ## bh, RT0d; \
+ movzbl km ## bl, RT1d; \
+ roll $16, km ## d; \
+ movl s1(RTAB,RT0,4), RT0d; \
+ op1 ## l s2(RTAB,RT1,4), RT0d; \
+ load_next_kr(kr_next); \
+ movzbl km ## bh, RT1d; \
+ movzbl km ## bl, km ## d; \
+ op2 ## l s3(RTAB,RT1,4), RT0d; \
+ op3 ## l s4(RTAB,km,4), RT0d; \
+ xorq RT0, RLR0;
+
+#define F1(km, load_next_kr) \
+ F(##km, load_next_kr, add, xor, sub, add)
+#define F2(km, load_next_kr) \
+ F(##km, load_next_kr, xor, sub, add, xor)
+#define F3(km, load_next_kr) \
+ F(##km, load_next_kr, sub, add, xor, sub)
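+/* F1/F2/F3 correspond to the three CAST5 round function types of RFC 2144:
+ * type 1: I = ((Km + D) <<< Kr), f = ((S1[Ia] ^ S2[Ib]) - S3[Ic]) + S4[Id]
+ * type 2: I = ((Km ^ D) <<< Kr), f = ((S1[Ia] - S2[Ib]) + S3[Ic]) ^ S4[Id]
+ * type 3: I = ((Km - D) <<< Kr), f = ((S1[Ia] + S2[Ib]) ^ S3[Ic]) - S4[Id]
+ */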
+
+#define get_round_km(n, km) \
+ movl Km+4*(n)(CTX), km;
+
+#define get_round_kr_enc(n) \
+ movq $0x1010101010101010, RKR; \
+ \
+ /* merge rorl rk and rorl $16 */ \
+ xorq Kr+(n)(CTX), RKR;
+
+#define get_round_kr_dec(n) \
+ movq $0x1010101010101010, RKR; \
+ \
+ /* merge rorl rk and rorl $16 */ \
+ xorq Kr+(n - 7)(CTX), RKR; \
+ bswapq RKR;
+
+#define round_enc(n, FA, FB, fn1, fn2) \
+ get_round_km(n + 1, RX2d); \
+ FA(RX0, fn1); \
+ get_round_km(n + 2, RX0d); \
+ FB(RX2, fn2);
+
+#define round_enc_last(n, FXA, FXB) \
+ get_round_km(n + 1, RX2d); \
+ \
+ FXA(RX0, shr_kr); \
+ FXB(RX2, dummy);
+
+#define round_enc_1(n, FA, FB) \
+ round_enc(n, FA, FB, shr_kr, shr_kr)
+
+#define round_enc_2(n, FA, FB) \
+ round_enc(n, FA, FB, shr_kr, dummy)
+
+#define round_dec(n, FA, FB, fn1, fn2) \
+ get_round_km(n - 1, RX2d); \
+ FA(RX0, fn1); \
+ get_round_km(n - 2, RX0d); \
+ FB(RX2, fn2);
+
+#define round_dec_last(n, FXA, FXB) \
+ get_round_km(n - 1, RX2d); \
+ FXA(RX0, shr_kr); \
+ FXB(RX2, dummy);
+
+#define round_dec_1(n, FA, FB) \
+ round_dec(n, FA, FB, shr_kr, shr_kr)
+
+#define round_dec_2(n, FA, FB) \
+ round_dec(n, FA, FB, shr_kr, dummy)
+
+#define read_block() \
+ movq (RIO), RLR0; \
+ bswapq RLR0;
+
+#define write_block() \
+ bswapq RLR0; \
+ rorq $32, RLR0; \
+ movq RLR0, (RIO);
+
+.align 8
+.globl _gcry_cast5_amd64_encrypt_block
+ELF(.type _gcry_cast5_amd64_encrypt_block,@function;)
+
+_gcry_cast5_amd64_encrypt_block:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ pushq %rbx;
+ CFI_PUSH(%rbx);
+
+ movq %rsi, %r10;
+
+ GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB);
+
+ movq %rdx, RIO;
+ read_block();
+
+ get_round_km(0, RX0d);
+ get_round_kr_enc(0);
+ round_enc_1(0, F1, F2);
+ round_enc_1(2, F3, F1);
+ round_enc_1(4, F2, F3);
+ round_enc_2(6, F1, F2);
+ get_round_kr_enc(8);
+ round_enc_1(8, F3, F1);
+ round_enc_1(10, F2, F3);
+ round_enc_1(12, F1, F2);
+ round_enc_last(14, F3, F1);
+
+ movq %r10, RIO;
+ write_block();
+
+ popq %rbx;
+ CFI_POP(%rbx);
+ popq %rbp;
+ CFI_POP(%rbp);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;)
+
+.align 8
+.globl _gcry_cast5_amd64_decrypt_block
+ELF(.type _gcry_cast5_amd64_decrypt_block,@function;)
+
+_gcry_cast5_amd64_decrypt_block:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ pushq %rbx;
+ CFI_PUSH(%rbx);
+
+ movq %rsi, %r10;
+
+ GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB);
+
+ movq %rdx, RIO;
+ read_block();
+
+ get_round_km(15, RX0d);
+ get_round_kr_dec(15);
+ round_dec_1(15, F1, F3);
+ round_dec_1(13, F2, F1);
+ round_dec_1(11, F3, F2);
+ round_dec_2(9, F1, F3);
+ get_round_kr_dec(7);
+ round_dec_1(7, F2, F1);
+ round_dec_1(5, F3, F2);
+ round_dec_1(3, F1, F3);
+ round_dec_last(1, F2, F1);
+
+ movq %r10, RIO;
+ write_block();
+
+ popq %rbx;
+ CFI_POP(%rbx);
+ popq %rbp;
+ CFI_POP(%rbp);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;)
+
+/**********************************************************************
+ 4-way cast5, four blocks parallel
+ **********************************************************************/
+#define F_tail(rlr, rx, op1, op2, op3) \
+ movzbl rx ## bh, RT0d; \
+ movzbl rx ## bl, RT1d; \
+ roll $16, rx ## d; \
+ movl s1(RTAB,RT0,4), RT0d; \
+ op1 ## l s2(RTAB,RT1,4), RT0d; \
+ movzbl rx ## bh, RT1d; \
+ movzbl rx ## bl, rx ## d; \
+ op2 ## l s3(RTAB,RT1,4), RT0d; \
+ op3 ## l s4(RTAB,rx,4), RT0d; \
+ xorq RT0, rlr;
+
+#define F4(km, load_next_kr, op0, op1, op2, op3) \
+ movl km, RX0d; \
+ op0 ## l RLR0d, RX0d; \
+ roll RKRbl, RX0d; \
+ rorq $32, RLR0; \
+ \
+ movl km, RX1d; \
+ op0 ## l RLR1d, RX1d; \
+ roll RKRbl, RX1d; \
+ rorq $32, RLR1; \
+ \
+ movl km, RX2d; \
+ op0 ## l RLR2d, RX2d; \
+ roll RKRbl, RX2d; \
+ rorq $32, RLR2; \
+ \
+ F_tail(RLR0, RX0, op1, op2, op3); \
+ F_tail(RLR1, RX1, op1, op2, op3); \
+ F_tail(RLR2, RX2, op1, op2, op3); \
+ \
+ movl km, RX0d; \
+ op0 ## l RLR3d, RX0d; \
+ roll RKRbl, RX0d; \
+ load_next_kr(); \
+ rorq $32, RLR3; \
+ \
+ F_tail(RLR3, RX0, op1, op2, op3);
+
+#define F4_1(km, load_next_kr) \
+ F4(km, load_next_kr, add, xor, sub, add)
+#define F4_2(km, load_next_kr) \
+ F4(km, load_next_kr, xor, sub, add, xor)
+#define F4_3(km, load_next_kr) \
+ F4(km, load_next_kr, sub, add, xor, sub)
+
+#define round_enc4(n, FA, FB, fn1, fn2) \
+ get_round_km(n + 1, RKM1d); \
+ FA(RKM0d, fn1); \
+ get_round_km(n + 2, RKM0d); \
+ FB(RKM1d, fn2);
+
+#define round_enc_last4(n, FXA, FXB) \
+ get_round_km(n + 1, RKM1d); \
+ FXA(RKM0d, shr_kr); \
+ FXB(RKM1d, dummy);
+
+#define round_enc4_1(n, FA, FB) \
+ round_enc4(n, FA, FB, shr_kr, shr_kr);
+
+#define round_enc4_2(n, FA, FB) \
+ round_enc4(n, FA, FB, shr_kr, dummy);
+
+#define round_dec4(n, FA, FB, fn1, fn2) \
+ get_round_km(n - 1, RKM1d); \
+ FA(RKM0d, fn1); \
+ get_round_km(n - 2, RKM0d); \
+ FB(RKM1d, fn2);
+
+#define round_dec_last4(n, FXA, FXB) \
+ get_round_km(n - 1, RKM1d); \
+ FXA(RKM0d, shr_kr); \
+ FXB(RKM1d, dummy);
+
+#define round_dec4_1(n, FA, FB) \
+ round_dec4(n, FA, FB, shr_kr, shr_kr);
+
+#define round_dec4_2(n, FA, FB) \
+ round_dec4(n, FA, FB, shr_kr, dummy);
+
+#define inbswap_block4(a, b, c, d) \
+ bswapq a; \
+ bswapq b; \
+ bswapq c; \
+ bswapq d;
+
+#define outbswap_block4(a, b, c, d) \
+ bswapq a; \
+ bswapq b; \
+ bswapq c; \
+ bswapq d; \
+ rorq $32, a; \
+ rorq $32, b; \
+ rorq $32, c; \
+ rorq $32, d;
+
+.align 8
+ELF(.type __cast5_enc_blk4,@function;)
+
+__cast5_enc_blk4:
+ /* input:
+ * %rdi: ctx, CTX
+ * RLR0,RLR1,RLR2,RLR3: four input plaintext blocks
+ * output:
+ * RLR0,RLR1,RLR2,RLR3: four output ciphertext blocks
+ */
+ CFI_STARTPROC();
+ GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB);
+
+ get_round_km(0, RKM0d);
+ get_round_kr_enc(0);
+ round_enc4_1(0, F4_1, F4_2);
+ round_enc4_1(2, F4_3, F4_1);
+ round_enc4_1(4, F4_2, F4_3);
+ round_enc4_2(6, F4_1, F4_2);
+ get_round_kr_enc(8);
+ round_enc4_1(8, F4_3, F4_1);
+ round_enc4_1(10, F4_2, F4_3);
+ round_enc4_1(12, F4_1, F4_2);
+ round_enc_last4(14, F4_3, F4_1);
+
+ outbswap_block4(RLR0, RLR1, RLR2, RLR3);
+ ret;
+ CFI_ENDPROC();
+ELF(.size __cast5_enc_blk4,.-__cast5_enc_blk4;)
+
+.align 8
+ELF(.type __cast5_dec_blk4,@function;)
+
+__cast5_dec_blk4:
+ /* input:
+ * %rdi: ctx, CTX
+ * RLR0,RLR1,RLR2,RLR3: four input ciphertext blocks
+ * output:
+ * RLR0,RLR1,RLR2,RLR3: four output plaintext blocks
+ */
+ CFI_STARTPROC();
+ GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB);
+
+ inbswap_block4(RLR0, RLR1, RLR2, RLR3);
+
+ get_round_km(15, RKM0d);
+ get_round_kr_dec(15);
+ round_dec4_1(15, F4_1, F4_3);
+ round_dec4_1(13, F4_2, F4_1);
+ round_dec4_1(11, F4_3, F4_2);
+ round_dec4_2(9, F4_1, F4_3);
+ get_round_kr_dec(7);
+ round_dec4_1(7, F4_2, F4_1);
+ round_dec4_1(5, F4_3, F4_2);
+ round_dec4_1(3, F4_1, F4_3);
+ round_dec_last4(1, F4_2, F4_1);
+
+ outbswap_block4(RLR0, RLR1, RLR2, RLR3);
+ ret;
+ CFI_ENDPROC();
+ELF(.size __cast5_dec_blk4,.-__cast5_dec_blk4;)
+
+.align 8
+.globl _gcry_cast5_amd64_ctr_enc
+ELF(.type _gcry_cast5_amd64_ctr_enc,@function;)
+_gcry_cast5_amd64_ctr_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (4 blocks)
+ * %rdx: src (4 blocks)
+ * %rcx: iv (big endian, 64bit)
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ pushq %rbx;
+ CFI_PUSH(%rbx);
+ pushq %r12;
+ CFI_PUSH(%r12);
+ pushq %r13;
+ CFI_PUSH(%r13);
+ pushq %r14;
+ CFI_PUSH(%r14);
+
+ pushq %rsi;
+ CFI_PUSH(%rsi);
+ pushq %rdx;
+ CFI_PUSH(%rdx);
+
+ /* load IV and byteswap */
+ movq (%rcx), RX0;
+ bswapq RX0;
+ movq RX0, RLR0;
+
+ /* construct IVs */
+ leaq 1(RX0), RLR1;
+ leaq 2(RX0), RLR2;
+ leaq 3(RX0), RLR3;
+ leaq 4(RX0), RX0;
+ bswapq RX0;
+
+ /* store new IV */
+ movq RX0, (%rcx);
+
+ call __cast5_enc_blk4;
+
+ popq %r14; /*src*/
+ CFI_POP_TMP_REG();
+ popq %r13; /*dst*/
+ CFI_POP_TMP_REG();
+
+ /* XOR key-stream with plaintext */
+ xorq 0 * 8(%r14), RLR0;
+ xorq 1 * 8(%r14), RLR1;
+ xorq 2 * 8(%r14), RLR2;
+ xorq 3 * 8(%r14), RLR3;
+ movq RLR0, 0 * 8(%r13);
+ movq RLR1, 1 * 8(%r13);
+ movq RLR2, 2 * 8(%r13);
+ movq RLR3, 3 * 8(%r13);
+
+ popq %r14;
+ CFI_POP(%r14);
+ popq %r13;
+ CFI_POP(%r13);
+ popq %r12;
+ CFI_POP(%r12);
+ popq %rbx;
+ CFI_POP(%rbx);
+ popq %rbp;
+ CFI_POP(%rbp);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;)
+
+.align 8
+.globl _gcry_cast5_amd64_cbc_dec
+ELF(.type _gcry_cast5_amd64_cbc_dec,@function;)
+_gcry_cast5_amd64_cbc_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (4 blocks)
+ * %rdx: src (4 blocks)
+ * %rcx: iv (64bit)
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ pushq %rbx;
+ CFI_PUSH(%rbx);
+ pushq %r12;
+ CFI_PUSH(%r12);
+ pushq %r13;
+ CFI_PUSH(%r13);
+ pushq %r14;
+ CFI_PUSH(%r14);
+
+ pushq %rcx;
+ CFI_PUSH(%rcx);
+ pushq %rsi;
+ CFI_PUSH(%rsi);
+ pushq %rdx;
+ CFI_PUSH(%rdx);
+
+ /* load input */
+ movq 0 * 8(%rdx), RLR0;
+ movq 1 * 8(%rdx), RLR1;
+ movq 2 * 8(%rdx), RLR2;
+ movq 3 * 8(%rdx), RLR3;
+
+ call __cast5_dec_blk4;
+
+ popq RX0; /*src*/
+ CFI_POP_TMP_REG();
+ popq RX1; /*dst*/
+ CFI_POP_TMP_REG();
+ popq RX2; /*iv*/
+ CFI_POP_TMP_REG();
+
+ movq 3 * 8(RX0), %r14;
+ xorq (RX2), RLR0;
+ xorq 0 * 8(RX0), RLR1;
+ xorq 1 * 8(RX0), RLR2;
+ xorq 2 * 8(RX0), RLR3;
+ movq %r14, (RX2); /* store new IV */
+
+ movq RLR0, 0 * 8(RX1);
+ movq RLR1, 1 * 8(RX1);
+ movq RLR2, 2 * 8(RX1);
+ movq RLR3, 3 * 8(RX1);
+
+ popq %r14;
+ CFI_POP(%r14);
+ popq %r13;
+ CFI_POP(%r13);
+ popq %r12;
+ CFI_POP(%r12);
+ popq %rbx;
+ CFI_POP(%rbx);
+ popq %rbp;
+ CFI_POP(%rbp);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;)
+
+.align 8
+.globl _gcry_cast5_amd64_cfb_dec
+ELF(.type _gcry_cast5_amd64_cfb_dec,@function;)
+_gcry_cast5_amd64_cfb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (4 blocks)
+ * %rdx: src (4 blocks)
+ * %rcx: iv (64bit)
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ pushq %rbx;
+ CFI_PUSH(%rbx);
+ pushq %r12;
+ CFI_PUSH(%r12);
+ pushq %r13;
+ CFI_PUSH(%r13);
+ pushq %r14;
+ CFI_PUSH(%r14);
+
+ pushq %rsi;
+ CFI_PUSH(%rsi);
+ pushq %rdx;
+ CFI_PUSH(%rdx);
+
+ /* Load input */
+ movq (%rcx), RLR0;
+ movq 0 * 8(%rdx), RLR1;
+ movq 1 * 8(%rdx), RLR2;
+ movq 2 * 8(%rdx), RLR3;
+
+ inbswap_block4(RLR0, RLR1, RLR2, RLR3);
+
+ /* Update IV */
+ movq 3 * 8(%rdx), %rdx;
+ movq %rdx, (%rcx);
+
+ call __cast5_enc_blk4;
+
+ popq %rdx; /*src*/
+ CFI_POP_TMP_REG();
+ popq %rcx; /*dst*/
+ CFI_POP_TMP_REG();
+
+ xorq 0 * 8(%rdx), RLR0;
+ xorq 1 * 8(%rdx), RLR1;
+ xorq 2 * 8(%rdx), RLR2;
+ xorq 3 * 8(%rdx), RLR3;
+ movq RLR0, 0 * 8(%rcx);
+ movq RLR1, 1 * 8(%rcx);
+ movq RLR2, 2 * 8(%rcx);
+ movq RLR3, 3 * 8(%rcx);
+
+ popq %r14;
+ CFI_POP(%r14);
+ popq %r13;
+ CFI_POP(%r13);
+ popq %r12;
+ CFI_POP(%r12);
+ popq %rbx;
+ CFI_POP(%rbx);
+ popq %rbp;
+ CFI_POP(%rbp);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec;)
+
+#endif /*defined(USE_CAST5)*/
+#endif /*__x86_64*/
diff --git a/comm/third_party/libgcrypt/cipher/cast5-arm.S b/comm/third_party/libgcrypt/cipher/cast5-arm.S
new file mode 100644
index 0000000000..76ddd2e335
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cast5-arm.S
@@ -0,0 +1,728 @@
+/* cast5-arm.S - ARM assembly implementation of CAST5 cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+.extern _gcry_cast5_s1to4;
+
+#ifdef __PIC__
+# define GET_DATA_POINTER(reg, name, rtmp) \
+ ldr reg, 1f; \
+ ldr rtmp, 2f; \
+ b 3f; \
+ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+ 2: .word name(GOT); \
+ 3: add reg, pc, reg; \
+ ldr reg, [reg, rtmp];
+#else
+# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+/* structure of crypto context */
+#define Km 0
+#define Kr (Km + (16 * 4))
+#define Kr_arm_enc (Kr + (16))
+#define Kr_arm_dec (Kr_arm_enc + (16))
+
+/* register macros */
+#define CTX %r0
+#define Rs1 %r7
+#define Rs2 %r8
+#define Rs3 %r9
+#define Rs4 %r10
+#define RMASK %r11
+#define RKM %r1
+#define RKR %r2
+
+#define RL0 %r3
+#define RR0 %r4
+
+#define RL1 %r9
+#define RR1 %r10
+
+#define RT0 %lr
+#define RT1 %ip
+#define RT2 %r5
+#define RT3 %r6
+
+/* helper macros */
+#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
+ ldrb rout, [rsrc, #((offs) + 0)]; \
+ ldrb rtmp, [rsrc, #((offs) + 1)]; \
+ orr rout, rout, rtmp, lsl #8; \
+ ldrb rtmp, [rsrc, #((offs) + 2)]; \
+ orr rout, rout, rtmp, lsl #16; \
+ ldrb rtmp, [rsrc, #((offs) + 3)]; \
+ orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
+ mov rtmp0, rin, lsr #8; \
+ strb rin, [rdst, #((offs) + 0)]; \
+ mov rtmp1, rin, lsr #16; \
+ strb rtmp0, [rdst, #((offs) + 1)]; \
+ mov rtmp0, rin, lsr #24; \
+ strb rtmp1, [rdst, #((offs) + 2)]; \
+ strb rtmp0, [rdst, #((offs) + 3)];
+
+#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
+ ldrb rout, [rsrc, #((offs) + 3)]; \
+ ldrb rtmp, [rsrc, #((offs) + 2)]; \
+ orr rout, rout, rtmp, lsl #8; \
+ ldrb rtmp, [rsrc, #((offs) + 1)]; \
+ orr rout, rout, rtmp, lsl #16; \
+ ldrb rtmp, [rsrc, #((offs) + 0)]; \
+ orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \
+ mov rtmp0, rin, lsr #8; \
+ strb rin, [rdst, #((offs) + 3)]; \
+ mov rtmp1, rin, lsr #16; \
+ strb rtmp0, [rdst, #((offs) + 2)]; \
+ mov rtmp0, rin, lsr #24; \
+ strb rtmp1, [rdst, #((offs) + 1)]; \
+ strb rtmp0, [rdst, #((offs) + 0)];
+
+#ifdef __ARMEL__
+ #define ldr_unaligned_host ldr_unaligned_le
+ #define str_unaligned_host str_unaligned_le
+
+ /* bswap on little-endian */
+#ifdef HAVE_ARM_ARCH_V6
+ #define host_to_be(reg, rtmp) \
+ rev reg, reg;
+ #define be_to_host(reg, rtmp) \
+ rev reg, reg;
+#else
+ #define host_to_be(reg, rtmp) \
+ eor rtmp, reg, reg, ror #16; \
+ mov rtmp, rtmp, lsr #8; \
+ bic rtmp, rtmp, #65280; \
+ eor reg, rtmp, reg, ror #8;
+ #define be_to_host(reg, rtmp) \
+ eor rtmp, reg, reg, ror #16; \
+ mov rtmp, rtmp, lsr #8; \
+ bic rtmp, rtmp, #65280; \
+ eor reg, rtmp, reg, ror #8;
+#endif
+#else
+ #define ldr_unaligned_host ldr_unaligned_be
+ #define str_unaligned_host str_unaligned_be
+
+ /* nop on big-endian */
+ #define host_to_be(reg, rtmp) /*_*/
+ #define be_to_host(reg, rtmp) /*_*/
+#endif
+
+#define host_to_host(x, y) /*_*/
+
+/**********************************************************************
+ 1-way cast5
+ **********************************************************************/
+
+#define dummy(n) /*_*/
+
+#define load_kr(n) \
+ ldr RKR, [CTX, #(Kr_arm_enc + (n))]; /* Kr[n] */
+
+#define load_dec_kr(n) \
+ ldr RKR, [CTX, #(Kr_arm_dec + (n) - 3)]; /* Kr[n] */
+
+#define load_km(n) \
+ ldr RKM, [CTX, #(Km + (n) * 4)]; /* Km[n] */
+
+#define shift_kr(dummy) \
+ mov RKR, RKR, lsr #8;
+
+#define F(n, rl, rr, op1, op2, op3, op4, dec, loadkm, shiftkr, loadkr) \
+ op1 RKM, rr; \
+ mov RKM, RKM, ror RKR; \
+ \
+ and RT0, RMASK, RKM, ror #(24); \
+ and RT1, RMASK, RKM, lsr #(16); \
+ and RT2, RMASK, RKM, lsr #(8); \
+ ldr RT0, [Rs1, RT0]; \
+ and RT3, RMASK, RKM; \
+ ldr RT1, [Rs2, RT1]; \
+ shiftkr(RKR); \
+ \
+ ldr RT2, [Rs3, RT2]; \
+ \
+ op2 RT0, RT1; \
+ ldr RT3, [Rs4, RT3]; \
+ op3 RT0, RT2; \
+ loadkm((n) + (1 - ((dec) * 2))); \
+ op4 RT0, RT3; \
+ loadkr((n) + (1 - ((dec) * 2))); \
+ eor rl, RT0;
+
+#define F1(n, rl, rr, dec, loadkm, shiftkr, loadkr) \
+ F(n, rl, rr, add, eor, sub, add, dec, loadkm, shiftkr, loadkr)
+#define F2(n, rl, rr, dec, loadkm, shiftkr, loadkr) \
+ F(n, rl, rr, eor, sub, add, eor, dec, loadkm, shiftkr, loadkr)
+#define F3(n, rl, rr, dec, loadkm, shiftkr, loadkr) \
+ F(n, rl, rr, sub, add, eor, sub, dec, loadkm, shiftkr, loadkr)
+
+#define enc_round(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
+ Fx(n, rl, rr, 0, loadkm, shiftkr, loadkr)
+
+#define dec_round(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
+ Fx(n, rl, rr, 1, loadkm, shiftkr, loadkr)
+
+#define read_block_aligned(rin, offs, l0, r0, convert, rtmp) \
+ ldr l0, [rin, #((offs) + 0)]; \
+ ldr r0, [rin, #((offs) + 4)]; \
+ convert(l0, rtmp); \
+ convert(r0, rtmp);
+
+#define write_block_aligned(rout, offs, l0, r0, convert, rtmp) \
+ convert(l0, rtmp); \
+ convert(r0, rtmp); \
+ str l0, [rout, #((offs) + 0)]; \
+ str r0, [rout, #((offs) + 4)];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+ /* unaligned word reads allowed */
+ #define read_block(rin, offs, l0, r0, rtmp0) \
+ read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0)
+
+ #define write_block(rout, offs, r0, l0, rtmp0, rtmp1) \
+ write_block_aligned(rout, offs, r0, l0, be_to_host, rtmp0)
+
+ #define read_block_host(rin, offs, l0, r0, rtmp0) \
+ read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0)
+
+ #define write_block_host(rout, offs, r0, l0, rtmp0, rtmp1) \
+ write_block_aligned(rout, offs, r0, l0, host_to_host, rtmp0)
+#else
+ /* need to handle unaligned reads by byte reads */
+ #define read_block(rin, offs, l0, r0, rtmp0) \
+ tst rin, #3; \
+ beq 1f; \
+ ldr_unaligned_be(l0, rin, (offs) + 0, rtmp0); \
+ ldr_unaligned_be(r0, rin, (offs) + 4, rtmp0); \
+ b 2f; \
+ 1:;\
+ read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0); \
+ 2:;
+
+ #define write_block(rout, offs, l0, r0, rtmp0, rtmp1) \
+ tst rout, #3; \
+ beq 1f; \
+ str_unaligned_be(l0, rout, (offs) + 0, rtmp0, rtmp1); \
+ str_unaligned_be(r0, rout, (offs) + 4, rtmp0, rtmp1); \
+ b 2f; \
+ 1:;\
+ write_block_aligned(rout, offs, l0, r0, be_to_host, rtmp0); \
+ 2:;
+
+ #define read_block_host(rin, offs, l0, r0, rtmp0) \
+ tst rin, #3; \
+ beq 1f; \
+ ldr_unaligned_host(l0, rin, (offs) + 0, rtmp0); \
+ ldr_unaligned_host(r0, rin, (offs) + 4, rtmp0); \
+ b 2f; \
+ 1:;\
+ read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0); \
+ 2:;
+
+ #define write_block_host(rout, offs, l0, r0, rtmp0, rtmp1) \
+ tst rout, #3; \
+ beq 1f; \
+ str_unaligned_host(l0, rout, (offs) + 0, rtmp0, rtmp1); \
+ str_unaligned_host(r0, rout, (offs) + 4, rtmp0, rtmp1); \
+ b 2f; \
+ 1:;\
+ write_block_aligned(rout, offs, l0, r0, host_to_host, rtmp0); \
+ 2:;
+#endif
+
+.align 3
+.globl _gcry_cast5_arm_encrypt_block
+.type _gcry_cast5_arm_encrypt_block,%function;
+
+_gcry_cast5_arm_encrypt_block:
+ /* input:
+ * %r0: CTX
+ * %r1: dst
+ * %r2: src
+ */
+ push {%r1, %r4-%r11, %ip, %lr};
+
+ GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2);
+ mov RMASK, #(0xff << 2);
+ add Rs2, Rs1, #(0x100*4);
+ add Rs3, Rs1, #(0x100*4*2);
+ add Rs4, Rs1, #(0x100*4*3);
+
+ read_block(%r2, 0, RL0, RR0, RT0);
+
+ load_km(0);
+ load_kr(0);
+ enc_round(0, F1, RL0, RR0, load_km, shift_kr, dummy);
+ enc_round(1, F2, RR0, RL0, load_km, shift_kr, dummy);
+ enc_round(2, F3, RL0, RR0, load_km, shift_kr, dummy);
+ enc_round(3, F1, RR0, RL0, load_km, dummy, load_kr);
+ enc_round(4, F2, RL0, RR0, load_km, shift_kr, dummy);
+ enc_round(5, F3, RR0, RL0, load_km, shift_kr, dummy);
+ enc_round(6, F1, RL0, RR0, load_km, shift_kr, dummy);
+ enc_round(7, F2, RR0, RL0, load_km, dummy, load_kr);
+ enc_round(8, F3, RL0, RR0, load_km, shift_kr, dummy);
+ enc_round(9, F1, RR0, RL0, load_km, shift_kr, dummy);
+ enc_round(10, F2, RL0, RR0, load_km, shift_kr, dummy);
+ enc_round(11, F3, RR0, RL0, load_km, dummy, load_kr);
+ enc_round(12, F1, RL0, RR0, load_km, shift_kr, dummy);
+ enc_round(13, F2, RR0, RL0, load_km, shift_kr, dummy);
+ enc_round(14, F3, RL0, RR0, load_km, shift_kr, dummy);
+ enc_round(15, F1, RR0, RL0, dummy, dummy, dummy);
+
+ ldr %r1, [%sp], #4;
+ write_block(%r1, 0, RR0, RL0, RT0, RT1);
+
+ pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_cast5_arm_encrypt_block,.-_gcry_cast5_arm_encrypt_block;
+
+.align 3
+.globl _gcry_cast5_arm_decrypt_block
+.type _gcry_cast5_arm_decrypt_block,%function;
+
+_gcry_cast5_arm_decrypt_block:
+ /* input:
+ * %r0: CTX
+ * %r1: dst
+ * %r2: src
+ */
+ push {%r1, %r4-%r11, %ip, %lr};
+
+ GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2);
+ mov RMASK, #(0xff << 2);
+ add Rs2, Rs1, #(0x100 * 4);
+ add Rs3, Rs1, #(0x100 * 4 * 2);
+ add Rs4, Rs1, #(0x100 * 4 * 3);
+
+ read_block(%r2, 0, RL0, RR0, RT0);
+
+ load_km(15);
+ load_dec_kr(15);
+ dec_round(15, F1, RL0, RR0, load_km, shift_kr, dummy);
+ dec_round(14, F3, RR0, RL0, load_km, shift_kr, dummy);
+ dec_round(13, F2, RL0, RR0, load_km, shift_kr, dummy);
+ dec_round(12, F1, RR0, RL0, load_km, dummy, load_dec_kr);
+ dec_round(11, F3, RL0, RR0, load_km, shift_kr, dummy);
+ dec_round(10, F2, RR0, RL0, load_km, shift_kr, dummy);
+ dec_round(9, F1, RL0, RR0, load_km, shift_kr, dummy);
+ dec_round(8, F3, RR0, RL0, load_km, dummy, load_dec_kr);
+ dec_round(7, F2, RL0, RR0, load_km, shift_kr, dummy);
+ dec_round(6, F1, RR0, RL0, load_km, shift_kr, dummy);
+ dec_round(5, F3, RL0, RR0, load_km, shift_kr, dummy);
+ dec_round(4, F2, RR0, RL0, load_km, dummy, load_dec_kr);
+ dec_round(3, F1, RL0, RR0, load_km, shift_kr, dummy);
+ dec_round(2, F3, RR0, RL0, load_km, shift_kr, dummy);
+ dec_round(1, F2, RL0, RR0, load_km, shift_kr, dummy);
+ dec_round(0, F1, RR0, RL0, dummy, dummy, dummy);
+
+ ldr %r1, [%sp], #4;
+ write_block(%r1, 0, RR0, RL0, RT0, RT1);
+
+ pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_cast5_arm_decrypt_block,.-_gcry_cast5_arm_decrypt_block;
+
+/**********************************************************************
+ 2-way cast5
+ **********************************************************************/
+
+#define F_2w(n, rl0, rr0, rl1, rr1, op1, op2, op3, op4, dec, loadkm, shiftkr, \
+ loadkr) \
+ op1 RT3, RKM, rr0; \
+ op1 RKM, RKM, rr1; \
+ mov RT3, RT3, ror RKR; \
+ mov RKM, RKM, ror RKR; \
+ \
+ and RT0, RMASK, RT3, ror #(24); \
+ and RT1, RMASK, RT3, lsr #(16); \
+ and RT2, RMASK, RT3, lsr #(8); \
+ and RT3, RMASK, RT3; \
+ \
+ ldr RT0, [Rs1, RT0]; \
+ add RT2, #(0x100 * 4); \
+ ldr RT1, [Rs2, RT1]; \
+ add RT3, #(0x100 * 4 * 2); \
+ \
+ ldr RT2, [Rs2, RT2]; \
+ \
+ op2 RT0, RT1; \
+ ldr RT3, [Rs2, RT3]; \
+ and RT1, RMASK, RKM, ror #(24); \
+ op3 RT0, RT2; \
+ and RT2, RMASK, RKM, lsr #(16); \
+ op4 RT0, RT3; \
+ and RT3, RMASK, RKM, lsr #(8); \
+ eor rl0, RT0; \
+ add RT3, #(0x100 * 4); \
+ ldr RT1, [Rs1, RT1]; \
+ and RT0, RMASK, RKM; \
+ ldr RT2, [Rs2, RT2]; \
+ add RT0, #(0x100 * 4 * 2); \
+ \
+ ldr RT3, [Rs2, RT3]; \
+ \
+ op2 RT1, RT2; \
+ ldr RT0, [Rs2, RT0]; \
+ op3 RT1, RT3; \
+ loadkm((n) + (1 - ((dec) * 2))); \
+ op4 RT1, RT0; \
+ loadkr((n) + (1 - ((dec) * 2))); \
+ shiftkr(RKR); \
+ eor rl1, RT1;
+
+#define F1_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \
+ F_2w(n, rl0, rr0, rl1, rr1, add, eor, sub, add, dec, \
+ loadkm, shiftkr, loadkr)
+#define F2_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \
+ F_2w(n, rl0, rr0, rl1, rr1, eor, sub, add, eor, dec, \
+ loadkm, shiftkr, loadkr)
+#define F3_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \
+ F_2w(n, rl0, rr0, rl1, rr1, sub, add, eor, sub, dec, \
+ loadkm, shiftkr, loadkr)
+
+#define enc_round2(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
+ Fx##_2w(n, rl##0, rr##0, rl##1, rr##1, 0, loadkm, shiftkr, loadkr)
+
+#define dec_round2(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
+ Fx##_2w(n, rl##0, rr##0, rl##1, rr##1, 1, loadkm, shiftkr, loadkr)
+
+#define read_block2_aligned(rin, l0, r0, l1, r1, convert, rtmp) \
+ ldr l0, [rin, #(0)]; \
+ ldr r0, [rin, #(4)]; \
+ convert(l0, rtmp); \
+ ldr l1, [rin, #(8)]; \
+ convert(r0, rtmp); \
+ ldr r1, [rin, #(12)]; \
+ convert(l1, rtmp); \
+ convert(r1, rtmp);
+
+#define write_block2_aligned(rout, l0, r0, l1, r1, convert, rtmp) \
+ convert(l0, rtmp); \
+ convert(r0, rtmp); \
+ convert(l1, rtmp); \
+ str l0, [rout, #(0)]; \
+ convert(r1, rtmp); \
+ str r0, [rout, #(4)]; \
+ str l1, [rout, #(8)]; \
+ str r1, [rout, #(12)];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+ /* unaligned word reads allowed */
+ #define read_block2(rin, l0, r0, l1, r1, rtmp0) \
+ read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0)
+
+ #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+ write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0)
+
+ #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
+ read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0)
+
+ #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+ write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0)
+#else
+ /* need to handle unaligned reads by byte reads */
+ #define read_block2(rin, l0, r0, l1, r1, rtmp0) \
+ tst rin, #3; \
+ beq 1f; \
+ ldr_unaligned_be(l0, rin, 0, rtmp0); \
+ ldr_unaligned_be(r0, rin, 4, rtmp0); \
+ ldr_unaligned_be(l1, rin, 8, rtmp0); \
+ ldr_unaligned_be(r1, rin, 12, rtmp0); \
+ b 2f; \
+ 1:;\
+ read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0); \
+ 2:;
+
+ #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+ tst rout, #3; \
+ beq 1f; \
+ str_unaligned_be(l0, rout, 0, rtmp0, rtmp1); \
+ str_unaligned_be(r0, rout, 4, rtmp0, rtmp1); \
+ str_unaligned_be(l1, rout, 8, rtmp0, rtmp1); \
+ str_unaligned_be(r1, rout, 12, rtmp0, rtmp1); \
+ b 2f; \
+ 1:;\
+ write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0); \
+ 2:;
+
+ #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
+ tst rin, #3; \
+ beq 1f; \
+ ldr_unaligned_host(l0, rin, 0, rtmp0); \
+ ldr_unaligned_host(r0, rin, 4, rtmp0); \
+ ldr_unaligned_host(l1, rin, 8, rtmp0); \
+ ldr_unaligned_host(r1, rin, 12, rtmp0); \
+ b 2f; \
+ 1:;\
+ read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0); \
+ 2:;
+
+ #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+ tst rout, #3; \
+ beq 1f; \
+ str_unaligned_host(l0, rout, 0, rtmp0, rtmp1); \
+ str_unaligned_host(r0, rout, 4, rtmp0, rtmp1); \
+ str_unaligned_host(l1, rout, 8, rtmp0, rtmp1); \
+ str_unaligned_host(r1, rout, 12, rtmp0, rtmp1); \
+ b 2f; \
+ 1:;\
+ write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0); \
+ 2:;
+#endif
+
+.align 3
+.type _gcry_cast5_arm_enc_blk2,%function;
+
+_gcry_cast5_arm_enc_blk2:
+ /* input:
+ * preloaded: CTX
+ * [RL0, RR0], [RL1, RR1]: src
+ * output:
+ * [RR0, RL0], [RR1, RL1]: dst
+ */
+ push {%lr};
+
+ GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2);
+ mov RMASK, #(0xff << 2);
+ add Rs2, Rs1, #(0x100 * 4);
+
+ load_km(0);
+ load_kr(0);
+ enc_round2(0, F1, RL, RR, load_km, shift_kr, dummy);
+ enc_round2(1, F2, RR, RL, load_km, shift_kr, dummy);
+ enc_round2(2, F3, RL, RR, load_km, shift_kr, dummy);
+ enc_round2(3, F1, RR, RL, load_km, dummy, load_kr);
+ enc_round2(4, F2, RL, RR, load_km, shift_kr, dummy);
+ enc_round2(5, F3, RR, RL, load_km, shift_kr, dummy);
+ enc_round2(6, F1, RL, RR, load_km, shift_kr, dummy);
+ enc_round2(7, F2, RR, RL, load_km, dummy, load_kr);
+ enc_round2(8, F3, RL, RR, load_km, shift_kr, dummy);
+ enc_round2(9, F1, RR, RL, load_km, shift_kr, dummy);
+ enc_round2(10, F2, RL, RR, load_km, shift_kr, dummy);
+ enc_round2(11, F3, RR, RL, load_km, dummy, load_kr);
+ enc_round2(12, F1, RL, RR, load_km, shift_kr, dummy);
+ enc_round2(13, F2, RR, RL, load_km, shift_kr, dummy);
+ enc_round2(14, F3, RL, RR, load_km, shift_kr, dummy);
+ enc_round2(15, F1, RR, RL, dummy, dummy, dummy);
+
+ host_to_be(RR0, RT0);
+ host_to_be(RL0, RT0);
+ host_to_be(RR1, RT0);
+ host_to_be(RL1, RT0);
+
+ pop {%pc};
+.ltorg
+.size _gcry_cast5_arm_enc_blk2,.-_gcry_cast5_arm_enc_blk2;
+
+.align 3
+.globl _gcry_cast5_arm_cfb_dec;
+.type _gcry_cast5_arm_cfb_dec,%function;
+
+_gcry_cast5_arm_cfb_dec:
+ /* input:
+ * %r0: CTX
+ * %r1: dst (2 blocks)
+ * %r2: src (2 blocks)
+ * %r3: iv (64bit)
+ */
+ push {%r1, %r2, %r4-%r11, %ip, %lr};
+
+ mov %lr, %r3;
+
+ /* Load input (iv/%r3 is aligned, src/%r2 might not be) */
+ ldm %r3, {RL0, RR0};
+ host_to_be(RL0, RT1);
+ host_to_be(RR0, RT1);
+ read_block(%r2, 0, RL1, RR1, %ip);
+
+ /* Update IV, load src[1] and save to iv[0] */
+ read_block_host(%r2, 8, %r5, %r6, %r7);
+ stm %lr, {%r5, %r6};
+
+ bl _gcry_cast5_arm_enc_blk2;
+ /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+ /* %r0: dst, %r1: src */
+ pop {%r0, %r1};
+
+ /* dst = src ^ result */
+ read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr);
+ eor %r5, %r4;
+ eor %r6, %r3;
+ eor %r7, %r10;
+ eor %r8, %r9;
+ write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2);
+
+ pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_cast5_arm_cfb_dec,.-_gcry_cast5_arm_cfb_dec;
+
+.align 3
+.globl _gcry_cast5_arm_ctr_enc;
+.type _gcry_cast5_arm_ctr_enc,%function;
+
+_gcry_cast5_arm_ctr_enc:
+ /* input:
+ * %r0: CTX
+ * %r1: dst (2 blocks)
+ * %r2: src (2 blocks)
+ * %r3: iv (64bit, big-endian)
+ */
+ push {%r1, %r2, %r4-%r11, %ip, %lr};
+
+ mov %lr, %r3;
+
+ /* Load IV (big => host endian) */
+ read_block_aligned(%lr, 0, RL0, RR0, be_to_host, RT1);
+
+ /* Construct IVs */
+ adds RR1, RR0, #1; /* +1 */
+ adc RL1, RL0, #0;
+ adds %r6, RR1, #1; /* +2 */
+ adc %r5, RL1, #0;
+
+ /* Store new IV (host => big-endian) */
+ write_block_aligned(%lr, 0, %r5, %r6, host_to_be, RT1);
+
+ bl _gcry_cast5_arm_enc_blk2;
+ /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+ /* %r0: dst, %r1: src */
+ pop {%r0, %r1};
+
+ /* XOR key-stream with plaintext */
+ read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr);
+ eor %r5, %r4;
+ eor %r6, %r3;
+ eor %r7, %r10;
+ eor %r8, %r9;
+ write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2);
+
+ pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_cast5_arm_ctr_enc,.-_gcry_cast5_arm_ctr_enc;
+
+.align 3
+.type _gcry_cast5_arm_dec_blk2,%function;
+
+_gcry_cast5_arm_dec_blk2:
+ /* input:
+ * preloaded: CTX
+ * [RL0, RR0], [RL1, RR1]: src
+ * output:
+ * [RR0, RL0], [RR1, RL1]: dst
+ */
+
+ GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2);
+ mov RMASK, #(0xff << 2);
+ add Rs2, Rs1, #(0x100 * 4);
+
+ load_km(15);
+ load_dec_kr(15);
+ dec_round2(15, F1, RL, RR, load_km, shift_kr, dummy);
+ dec_round2(14, F3, RR, RL, load_km, shift_kr, dummy);
+ dec_round2(13, F2, RL, RR, load_km, shift_kr, dummy);
+ dec_round2(12, F1, RR, RL, load_km, dummy, load_dec_kr);
+ dec_round2(11, F3, RL, RR, load_km, shift_kr, dummy);
+ dec_round2(10, F2, RR, RL, load_km, shift_kr, dummy);
+ dec_round2(9, F1, RL, RR, load_km, shift_kr, dummy);
+ dec_round2(8, F3, RR, RL, load_km, dummy, load_dec_kr);
+ dec_round2(7, F2, RL, RR, load_km, shift_kr, dummy);
+ dec_round2(6, F1, RR, RL, load_km, shift_kr, dummy);
+ dec_round2(5, F3, RL, RR, load_km, shift_kr, dummy);
+ dec_round2(4, F2, RR, RL, load_km, dummy, load_dec_kr);
+ dec_round2(3, F1, RL, RR, load_km, shift_kr, dummy);
+ dec_round2(2, F3, RR, RL, load_km, shift_kr, dummy);
+ dec_round2(1, F2, RL, RR, load_km, shift_kr, dummy);
+ dec_round2(0, F1, RR, RL, dummy, dummy, dummy);
+
+ host_to_be(RR0, RT0);
+ host_to_be(RL0, RT0);
+ host_to_be(RR1, RT0);
+ host_to_be(RL1, RT0);
+
+ b .Ldec_cbc_tail;
+.ltorg
+.size _gcry_cast5_arm_dec_blk2,.-_gcry_cast5_arm_dec_blk2;
+
+.align 3
+.globl _gcry_cast5_arm_cbc_dec;
+.type _gcry_cast5_arm_cbc_dec,%function;
+
+_gcry_cast5_arm_cbc_dec:
+ /* input:
+ * %r0: CTX
+ * %r1: dst (2 blocks)
+ * %r2: src (2 blocks)
+ * %r3: iv (64bit)
+ */
+ push {%r1-%r11, %ip, %lr};
+
+ read_block2(%r2, RL0, RR0, RL1, RR1, RT0);
+
+ /* dec_blk2 is only used by cbc_dec, jump directly in/out instead
+ * of function call. */
+ b _gcry_cast5_arm_dec_blk2;
+.Ldec_cbc_tail:
+ /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+ /* %r0: dst, %r1: src, %r2: iv */
+ pop {%r0-%r2};
+
+ /* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */
+ read_block_host(%r1, 0, %r7, %r8, %r5);
+ /* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */
+ ldm %r2, {%r5, %r6};
+
+ /* out[1] ^= IV+1 */
+ eor %r10, %r7;
+ eor %r9, %r8;
+ /* out[0] ^= IV */
+ eor %r4, %r5;
+ eor %r3, %r6;
+
+ /* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */
+ read_block_host(%r1, 8, %r7, %r8, %r5);
+ /* store IV+2 to iv[0] (aligned). */
+ stm %r2, {%r7, %r8};
+
+ /* store result to dst[0-3]. Might be unaligned. */
+ write_block2_host(%r0, %r4, %r3, %r10, %r9, %r5, %r6);
+
+ pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_cast5_arm_cbc_dec,.-_gcry_cast5_arm_cbc_dec;
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*__ARMEL__*/
diff --git a/comm/third_party/libgcrypt/cipher/cast5.c b/comm/third_party/libgcrypt/cipher/cast5.c
new file mode 100644
index 0000000000..837ea0fe57
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cast5.c
@@ -0,0 +1,1238 @@
+/* cast5.c - CAST5 cipher (RFC2144)
+ * Copyright (C) 1998, 2001, 2002, 2003 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+/* Test vectors:
+ *
+ * 128-bit key = 01 23 45 67 12 34 56 78 23 45 67 89 34 56 78 9A
+ * plaintext = 01 23 45 67 89 AB CD EF
+ * ciphertext = 23 8B 4F E5 84 7E 44 B2
+ *
+ * 80-bit key = 01 23 45 67 12 34 56 78 23 45
+ * = 01 23 45 67 12 34 56 78 23 45 00 00 00 00 00 00
+ * plaintext = 01 23 45 67 89 AB CD EF
+ * ciphertext = EB 6A 71 1A 2C 02 27 1B
+ *
+ * 40-bit key = 01 23 45 67 12
+ * = 01 23 45 67 12 00 00 00 00 00 00 00 00 00 00 00
+ * plaintext = 01 23 45 67 89 AB CD EF
+ * ciphertext = 7A C8 16 D1 6E 9B 30 2E
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "g10lib.h"
+#include "types.h"
+#include "cipher.h"
+#include "bithelp.h"
+#include "bufhelp.h"
+#include "cipher-internal.h"
+#include "cipher-selftest.h"
+
+/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
+#undef USE_AMD64_ASM
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AMD64_ASM 1
+#endif
+
+/* USE_ARM_ASM indicates whether to use ARM assembly code. */
+#undef USE_ARM_ASM
+#if defined(__ARMEL__)
+# ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+# define USE_ARM_ASM 1
+# endif
+#endif
+
+#define CAST5_BLOCKSIZE 8
+
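+/* Km holds the sixteen 32-bit masking subkeys and Kr the sixteen rotation
+ * subkeys defined by RFC 2144; the Kr_arm_enc/Kr_arm_dec members are extra
+ * copies of the rotation subkeys laid out for the ARM assembly code. */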
+typedef struct {
+ u32 Km[16];
+ byte Kr[16];
+#ifdef USE_ARM_ASM
+ u32 Kr_arm_enc[16 / sizeof(u32)];
+ u32 Kr_arm_dec[16 / sizeof(u32)];
+#endif
+} CAST5_context;
+
+static gcry_err_code_t cast_setkey (void *c, const byte *key, unsigned keylen,
+ cipher_bulk_ops_t *bulk_ops);
+static unsigned int encrypt_block (void *c, byte *outbuf, const byte *inbuf);
+static unsigned int decrypt_block (void *c, byte *outbuf, const byte *inbuf);
+
+
+
+#define s1 _gcry_cast5_s1to4[0]
+#define s2 _gcry_cast5_s1to4[1]
+#define s3 _gcry_cast5_s1to4[2]
+#define s4 _gcry_cast5_s1to4[3]
+
+const u32 _gcry_cast5_s1to4[4][256] = { {
+0x30fb40d4, 0x9fa0ff0b, 0x6beccd2f, 0x3f258c7a, 0x1e213f2f, 0x9c004dd3, 0x6003e540, 0xcf9fc949,
+0xbfd4af27, 0x88bbbdb5, 0xe2034090, 0x98d09675, 0x6e63a0e0, 0x15c361d2, 0xc2e7661d, 0x22d4ff8e,
+0x28683b6f, 0xc07fd059, 0xff2379c8, 0x775f50e2, 0x43c340d3, 0xdf2f8656, 0x887ca41a, 0xa2d2bd2d,
+0xa1c9e0d6, 0x346c4819, 0x61b76d87, 0x22540f2f, 0x2abe32e1, 0xaa54166b, 0x22568e3a, 0xa2d341d0,
+0x66db40c8, 0xa784392f, 0x004dff2f, 0x2db9d2de, 0x97943fac, 0x4a97c1d8, 0x527644b7, 0xb5f437a7,
+0xb82cbaef, 0xd751d159, 0x6ff7f0ed, 0x5a097a1f, 0x827b68d0, 0x90ecf52e, 0x22b0c054, 0xbc8e5935,
+0x4b6d2f7f, 0x50bb64a2, 0xd2664910, 0xbee5812d, 0xb7332290, 0xe93b159f, 0xb48ee411, 0x4bff345d,
+0xfd45c240, 0xad31973f, 0xc4f6d02e, 0x55fc8165, 0xd5b1caad, 0xa1ac2dae, 0xa2d4b76d, 0xc19b0c50,
+0x882240f2, 0x0c6e4f38, 0xa4e4bfd7, 0x4f5ba272, 0x564c1d2f, 0xc59c5319, 0xb949e354, 0xb04669fe,
+0xb1b6ab8a, 0xc71358dd, 0x6385c545, 0x110f935d, 0x57538ad5, 0x6a390493, 0xe63d37e0, 0x2a54f6b3,
+0x3a787d5f, 0x6276a0b5, 0x19a6fcdf, 0x7a42206a, 0x29f9d4d5, 0xf61b1891, 0xbb72275e, 0xaa508167,
+0x38901091, 0xc6b505eb, 0x84c7cb8c, 0x2ad75a0f, 0x874a1427, 0xa2d1936b, 0x2ad286af, 0xaa56d291,
+0xd7894360, 0x425c750d, 0x93b39e26, 0x187184c9, 0x6c00b32d, 0x73e2bb14, 0xa0bebc3c, 0x54623779,
+0x64459eab, 0x3f328b82, 0x7718cf82, 0x59a2cea6, 0x04ee002e, 0x89fe78e6, 0x3fab0950, 0x325ff6c2,
+0x81383f05, 0x6963c5c8, 0x76cb5ad6, 0xd49974c9, 0xca180dcf, 0x380782d5, 0xc7fa5cf6, 0x8ac31511,
+0x35e79e13, 0x47da91d0, 0xf40f9086, 0xa7e2419e, 0x31366241, 0x051ef495, 0xaa573b04, 0x4a805d8d,
+0x548300d0, 0x00322a3c, 0xbf64cddf, 0xba57a68e, 0x75c6372b, 0x50afd341, 0xa7c13275, 0x915a0bf5,
+0x6b54bfab, 0x2b0b1426, 0xab4cc9d7, 0x449ccd82, 0xf7fbf265, 0xab85c5f3, 0x1b55db94, 0xaad4e324,
+0xcfa4bd3f, 0x2deaa3e2, 0x9e204d02, 0xc8bd25ac, 0xeadf55b3, 0xd5bd9e98, 0xe31231b2, 0x2ad5ad6c,
+0x954329de, 0xadbe4528, 0xd8710f69, 0xaa51c90f, 0xaa786bf6, 0x22513f1e, 0xaa51a79b, 0x2ad344cc,
+0x7b5a41f0, 0xd37cfbad, 0x1b069505, 0x41ece491, 0xb4c332e6, 0x032268d4, 0xc9600acc, 0xce387e6d,
+0xbf6bb16c, 0x6a70fb78, 0x0d03d9c9, 0xd4df39de, 0xe01063da, 0x4736f464, 0x5ad328d8, 0xb347cc96,
+0x75bb0fc3, 0x98511bfb, 0x4ffbcc35, 0xb58bcf6a, 0xe11f0abc, 0xbfc5fe4a, 0xa70aec10, 0xac39570a,
+0x3f04442f, 0x6188b153, 0xe0397a2e, 0x5727cb79, 0x9ceb418f, 0x1cacd68d, 0x2ad37c96, 0x0175cb9d,
+0xc69dff09, 0xc75b65f0, 0xd9db40d8, 0xec0e7779, 0x4744ead4, 0xb11c3274, 0xdd24cb9e, 0x7e1c54bd,
+0xf01144f9, 0xd2240eb1, 0x9675b3fd, 0xa3ac3755, 0xd47c27af, 0x51c85f4d, 0x56907596, 0xa5bb15e6,
+0x580304f0, 0xca042cf1, 0x011a37ea, 0x8dbfaadb, 0x35ba3e4a, 0x3526ffa0, 0xc37b4d09, 0xbc306ed9,
+0x98a52666, 0x5648f725, 0xff5e569d, 0x0ced63d0, 0x7c63b2cf, 0x700b45e1, 0xd5ea50f1, 0x85a92872,
+0xaf1fbda7, 0xd4234870, 0xa7870bf3, 0x2d3b4d79, 0x42e04198, 0x0cd0ede7, 0x26470db8, 0xf881814c,
+0x474d6ad7, 0x7c0c5e5c, 0xd1231959, 0x381b7298, 0xf5d2f4db, 0xab838653, 0x6e2f1e23, 0x83719c9e,
+0xbd91e046, 0x9a56456e, 0xdc39200c, 0x20c8c571, 0x962bda1c, 0xe1e696ff, 0xb141ab08, 0x7cca89b9,
+0x1a69e783, 0x02cc4843, 0xa2f7c579, 0x429ef47d, 0x427b169c, 0x5ac9f049, 0xdd8f0f00, 0x5c8165bf
+}, {
+0x1f201094, 0xef0ba75b, 0x69e3cf7e, 0x393f4380, 0xfe61cf7a, 0xeec5207a, 0x55889c94, 0x72fc0651,
+0xada7ef79, 0x4e1d7235, 0xd55a63ce, 0xde0436ba, 0x99c430ef, 0x5f0c0794, 0x18dcdb7d, 0xa1d6eff3,
+0xa0b52f7b, 0x59e83605, 0xee15b094, 0xe9ffd909, 0xdc440086, 0xef944459, 0xba83ccb3, 0xe0c3cdfb,
+0xd1da4181, 0x3b092ab1, 0xf997f1c1, 0xa5e6cf7b, 0x01420ddb, 0xe4e7ef5b, 0x25a1ff41, 0xe180f806,
+0x1fc41080, 0x179bee7a, 0xd37ac6a9, 0xfe5830a4, 0x98de8b7f, 0x77e83f4e, 0x79929269, 0x24fa9f7b,
+0xe113c85b, 0xacc40083, 0xd7503525, 0xf7ea615f, 0x62143154, 0x0d554b63, 0x5d681121, 0xc866c359,
+0x3d63cf73, 0xcee234c0, 0xd4d87e87, 0x5c672b21, 0x071f6181, 0x39f7627f, 0x361e3084, 0xe4eb573b,
+0x602f64a4, 0xd63acd9c, 0x1bbc4635, 0x9e81032d, 0x2701f50c, 0x99847ab4, 0xa0e3df79, 0xba6cf38c,
+0x10843094, 0x2537a95e, 0xf46f6ffe, 0xa1ff3b1f, 0x208cfb6a, 0x8f458c74, 0xd9e0a227, 0x4ec73a34,
+0xfc884f69, 0x3e4de8df, 0xef0e0088, 0x3559648d, 0x8a45388c, 0x1d804366, 0x721d9bfd, 0xa58684bb,
+0xe8256333, 0x844e8212, 0x128d8098, 0xfed33fb4, 0xce280ae1, 0x27e19ba5, 0xd5a6c252, 0xe49754bd,
+0xc5d655dd, 0xeb667064, 0x77840b4d, 0xa1b6a801, 0x84db26a9, 0xe0b56714, 0x21f043b7, 0xe5d05860,
+0x54f03084, 0x066ff472, 0xa31aa153, 0xdadc4755, 0xb5625dbf, 0x68561be6, 0x83ca6b94, 0x2d6ed23b,
+0xeccf01db, 0xa6d3d0ba, 0xb6803d5c, 0xaf77a709, 0x33b4a34c, 0x397bc8d6, 0x5ee22b95, 0x5f0e5304,
+0x81ed6f61, 0x20e74364, 0xb45e1378, 0xde18639b, 0x881ca122, 0xb96726d1, 0x8049a7e8, 0x22b7da7b,
+0x5e552d25, 0x5272d237, 0x79d2951c, 0xc60d894c, 0x488cb402, 0x1ba4fe5b, 0xa4b09f6b, 0x1ca815cf,
+0xa20c3005, 0x8871df63, 0xb9de2fcb, 0x0cc6c9e9, 0x0beeff53, 0xe3214517, 0xb4542835, 0x9f63293c,
+0xee41e729, 0x6e1d2d7c, 0x50045286, 0x1e6685f3, 0xf33401c6, 0x30a22c95, 0x31a70850, 0x60930f13,
+0x73f98417, 0xa1269859, 0xec645c44, 0x52c877a9, 0xcdff33a6, 0xa02b1741, 0x7cbad9a2, 0x2180036f,
+0x50d99c08, 0xcb3f4861, 0xc26bd765, 0x64a3f6ab, 0x80342676, 0x25a75e7b, 0xe4e6d1fc, 0x20c710e6,
+0xcdf0b680, 0x17844d3b, 0x31eef84d, 0x7e0824e4, 0x2ccb49eb, 0x846a3bae, 0x8ff77888, 0xee5d60f6,
+0x7af75673, 0x2fdd5cdb, 0xa11631c1, 0x30f66f43, 0xb3faec54, 0x157fd7fa, 0xef8579cc, 0xd152de58,
+0xdb2ffd5e, 0x8f32ce19, 0x306af97a, 0x02f03ef8, 0x99319ad5, 0xc242fa0f, 0xa7e3ebb0, 0xc68e4906,
+0xb8da230c, 0x80823028, 0xdcdef3c8, 0xd35fb171, 0x088a1bc8, 0xbec0c560, 0x61a3c9e8, 0xbca8f54d,
+0xc72feffa, 0x22822e99, 0x82c570b4, 0xd8d94e89, 0x8b1c34bc, 0x301e16e6, 0x273be979, 0xb0ffeaa6,
+0x61d9b8c6, 0x00b24869, 0xb7ffce3f, 0x08dc283b, 0x43daf65a, 0xf7e19798, 0x7619b72f, 0x8f1c9ba4,
+0xdc8637a0, 0x16a7d3b1, 0x9fc393b7, 0xa7136eeb, 0xc6bcc63e, 0x1a513742, 0xef6828bc, 0x520365d6,
+0x2d6a77ab, 0x3527ed4b, 0x821fd216, 0x095c6e2e, 0xdb92f2fb, 0x5eea29cb, 0x145892f5, 0x91584f7f,
+0x5483697b, 0x2667a8cc, 0x85196048, 0x8c4bacea, 0x833860d4, 0x0d23e0f9, 0x6c387e8a, 0x0ae6d249,
+0xb284600c, 0xd835731d, 0xdcb1c647, 0xac4c56ea, 0x3ebd81b3, 0x230eabb0, 0x6438bc87, 0xf0b5b1fa,
+0x8f5ea2b3, 0xfc184642, 0x0a036b7a, 0x4fb089bd, 0x649da589, 0xa345415e, 0x5c038323, 0x3e5d3bb9,
+0x43d79572, 0x7e6dd07c, 0x06dfdf1e, 0x6c6cc4ef, 0x7160a539, 0x73bfbe70, 0x83877605, 0x4523ecf1
+}, {
+0x8defc240, 0x25fa5d9f, 0xeb903dbf, 0xe810c907, 0x47607fff, 0x369fe44b, 0x8c1fc644, 0xaececa90,
+0xbeb1f9bf, 0xeefbcaea, 0xe8cf1950, 0x51df07ae, 0x920e8806, 0xf0ad0548, 0xe13c8d83, 0x927010d5,
+0x11107d9f, 0x07647db9, 0xb2e3e4d4, 0x3d4f285e, 0xb9afa820, 0xfade82e0, 0xa067268b, 0x8272792e,
+0x553fb2c0, 0x489ae22b, 0xd4ef9794, 0x125e3fbc, 0x21fffcee, 0x825b1bfd, 0x9255c5ed, 0x1257a240,
+0x4e1a8302, 0xbae07fff, 0x528246e7, 0x8e57140e, 0x3373f7bf, 0x8c9f8188, 0xa6fc4ee8, 0xc982b5a5,
+0xa8c01db7, 0x579fc264, 0x67094f31, 0xf2bd3f5f, 0x40fff7c1, 0x1fb78dfc, 0x8e6bd2c1, 0x437be59b,
+0x99b03dbf, 0xb5dbc64b, 0x638dc0e6, 0x55819d99, 0xa197c81c, 0x4a012d6e, 0xc5884a28, 0xccc36f71,
+0xb843c213, 0x6c0743f1, 0x8309893c, 0x0feddd5f, 0x2f7fe850, 0xd7c07f7e, 0x02507fbf, 0x5afb9a04,
+0xa747d2d0, 0x1651192e, 0xaf70bf3e, 0x58c31380, 0x5f98302e, 0x727cc3c4, 0x0a0fb402, 0x0f7fef82,
+0x8c96fdad, 0x5d2c2aae, 0x8ee99a49, 0x50da88b8, 0x8427f4a0, 0x1eac5790, 0x796fb449, 0x8252dc15,
+0xefbd7d9b, 0xa672597d, 0xada840d8, 0x45f54504, 0xfa5d7403, 0xe83ec305, 0x4f91751a, 0x925669c2,
+0x23efe941, 0xa903f12e, 0x60270df2, 0x0276e4b6, 0x94fd6574, 0x927985b2, 0x8276dbcb, 0x02778176,
+0xf8af918d, 0x4e48f79e, 0x8f616ddf, 0xe29d840e, 0x842f7d83, 0x340ce5c8, 0x96bbb682, 0x93b4b148,
+0xef303cab, 0x984faf28, 0x779faf9b, 0x92dc560d, 0x224d1e20, 0x8437aa88, 0x7d29dc96, 0x2756d3dc,
+0x8b907cee, 0xb51fd240, 0xe7c07ce3, 0xe566b4a1, 0xc3e9615e, 0x3cf8209d, 0x6094d1e3, 0xcd9ca341,
+0x5c76460e, 0x00ea983b, 0xd4d67881, 0xfd47572c, 0xf76cedd9, 0xbda8229c, 0x127dadaa, 0x438a074e,
+0x1f97c090, 0x081bdb8a, 0x93a07ebe, 0xb938ca15, 0x97b03cff, 0x3dc2c0f8, 0x8d1ab2ec, 0x64380e51,
+0x68cc7bfb, 0xd90f2788, 0x12490181, 0x5de5ffd4, 0xdd7ef86a, 0x76a2e214, 0xb9a40368, 0x925d958f,
+0x4b39fffa, 0xba39aee9, 0xa4ffd30b, 0xfaf7933b, 0x6d498623, 0x193cbcfa, 0x27627545, 0x825cf47a,
+0x61bd8ba0, 0xd11e42d1, 0xcead04f4, 0x127ea392, 0x10428db7, 0x8272a972, 0x9270c4a8, 0x127de50b,
+0x285ba1c8, 0x3c62f44f, 0x35c0eaa5, 0xe805d231, 0x428929fb, 0xb4fcdf82, 0x4fb66a53, 0x0e7dc15b,
+0x1f081fab, 0x108618ae, 0xfcfd086d, 0xf9ff2889, 0x694bcc11, 0x236a5cae, 0x12deca4d, 0x2c3f8cc5,
+0xd2d02dfe, 0xf8ef5896, 0xe4cf52da, 0x95155b67, 0x494a488c, 0xb9b6a80c, 0x5c8f82bc, 0x89d36b45,
+0x3a609437, 0xec00c9a9, 0x44715253, 0x0a874b49, 0xd773bc40, 0x7c34671c, 0x02717ef6, 0x4feb5536,
+0xa2d02fff, 0xd2bf60c4, 0xd43f03c0, 0x50b4ef6d, 0x07478cd1, 0x006e1888, 0xa2e53f55, 0xb9e6d4bc,
+0xa2048016, 0x97573833, 0xd7207d67, 0xde0f8f3d, 0x72f87b33, 0xabcc4f33, 0x7688c55d, 0x7b00a6b0,
+0x947b0001, 0x570075d2, 0xf9bb88f8, 0x8942019e, 0x4264a5ff, 0x856302e0, 0x72dbd92b, 0xee971b69,
+0x6ea22fde, 0x5f08ae2b, 0xaf7a616d, 0xe5c98767, 0xcf1febd2, 0x61efc8c2, 0xf1ac2571, 0xcc8239c2,
+0x67214cb8, 0xb1e583d1, 0xb7dc3e62, 0x7f10bdce, 0xf90a5c38, 0x0ff0443d, 0x606e6dc6, 0x60543a49,
+0x5727c148, 0x2be98a1d, 0x8ab41738, 0x20e1be24, 0xaf96da0f, 0x68458425, 0x99833be5, 0x600d457d,
+0x282f9350, 0x8334b362, 0xd91d1120, 0x2b6d8da0, 0x642b1e31, 0x9c305a00, 0x52bce688, 0x1b03588a,
+0xf7baefd5, 0x4142ed9c, 0xa4315c11, 0x83323ec5, 0xdfef4636, 0xa133c501, 0xe9d3531c, 0xee353783
+}, {
+0x9db30420, 0x1fb6e9de, 0xa7be7bef, 0xd273a298, 0x4a4f7bdb, 0x64ad8c57, 0x85510443, 0xfa020ed1,
+0x7e287aff, 0xe60fb663, 0x095f35a1, 0x79ebf120, 0xfd059d43, 0x6497b7b1, 0xf3641f63, 0x241e4adf,
+0x28147f5f, 0x4fa2b8cd, 0xc9430040, 0x0cc32220, 0xfdd30b30, 0xc0a5374f, 0x1d2d00d9, 0x24147b15,
+0xee4d111a, 0x0fca5167, 0x71ff904c, 0x2d195ffe, 0x1a05645f, 0x0c13fefe, 0x081b08ca, 0x05170121,
+0x80530100, 0xe83e5efe, 0xac9af4f8, 0x7fe72701, 0xd2b8ee5f, 0x06df4261, 0xbb9e9b8a, 0x7293ea25,
+0xce84ffdf, 0xf5718801, 0x3dd64b04, 0xa26f263b, 0x7ed48400, 0x547eebe6, 0x446d4ca0, 0x6cf3d6f5,
+0x2649abdf, 0xaea0c7f5, 0x36338cc1, 0x503f7e93, 0xd3772061, 0x11b638e1, 0x72500e03, 0xf80eb2bb,
+0xabe0502e, 0xec8d77de, 0x57971e81, 0xe14f6746, 0xc9335400, 0x6920318f, 0x081dbb99, 0xffc304a5,
+0x4d351805, 0x7f3d5ce3, 0xa6c866c6, 0x5d5bcca9, 0xdaec6fea, 0x9f926f91, 0x9f46222f, 0x3991467d,
+0xa5bf6d8e, 0x1143c44f, 0x43958302, 0xd0214eeb, 0x022083b8, 0x3fb6180c, 0x18f8931e, 0x281658e6,
+0x26486e3e, 0x8bd78a70, 0x7477e4c1, 0xb506e07c, 0xf32d0a25, 0x79098b02, 0xe4eabb81, 0x28123b23,
+0x69dead38, 0x1574ca16, 0xdf871b62, 0x211c40b7, 0xa51a9ef9, 0x0014377b, 0x041e8ac8, 0x09114003,
+0xbd59e4d2, 0xe3d156d5, 0x4fe876d5, 0x2f91a340, 0x557be8de, 0x00eae4a7, 0x0ce5c2ec, 0x4db4bba6,
+0xe756bdff, 0xdd3369ac, 0xec17b035, 0x06572327, 0x99afc8b0, 0x56c8c391, 0x6b65811c, 0x5e146119,
+0x6e85cb75, 0xbe07c002, 0xc2325577, 0x893ff4ec, 0x5bbfc92d, 0xd0ec3b25, 0xb7801ab7, 0x8d6d3b24,
+0x20c763ef, 0xc366a5fc, 0x9c382880, 0x0ace3205, 0xaac9548a, 0xeca1d7c7, 0x041afa32, 0x1d16625a,
+0x6701902c, 0x9b757a54, 0x31d477f7, 0x9126b031, 0x36cc6fdb, 0xc70b8b46, 0xd9e66a48, 0x56e55a79,
+0x026a4ceb, 0x52437eff, 0x2f8f76b4, 0x0df980a5, 0x8674cde3, 0xedda04eb, 0x17a9be04, 0x2c18f4df,
+0xb7747f9d, 0xab2af7b4, 0xefc34d20, 0x2e096b7c, 0x1741a254, 0xe5b6a035, 0x213d42f6, 0x2c1c7c26,
+0x61c2f50f, 0x6552daf9, 0xd2c231f8, 0x25130f69, 0xd8167fa2, 0x0418f2c8, 0x001a96a6, 0x0d1526ab,
+0x63315c21, 0x5e0a72ec, 0x49bafefd, 0x187908d9, 0x8d0dbd86, 0x311170a7, 0x3e9b640c, 0xcc3e10d7,
+0xd5cad3b6, 0x0caec388, 0xf73001e1, 0x6c728aff, 0x71eae2a1, 0x1f9af36e, 0xcfcbd12f, 0xc1de8417,
+0xac07be6b, 0xcb44a1d8, 0x8b9b0f56, 0x013988c3, 0xb1c52fca, 0xb4be31cd, 0xd8782806, 0x12a3a4e2,
+0x6f7de532, 0x58fd7eb6, 0xd01ee900, 0x24adffc2, 0xf4990fc5, 0x9711aac5, 0x001d7b95, 0x82e5e7d2,
+0x109873f6, 0x00613096, 0xc32d9521, 0xada121ff, 0x29908415, 0x7fbb977f, 0xaf9eb3db, 0x29c9ed2a,
+0x5ce2a465, 0xa730f32c, 0xd0aa3fe8, 0x8a5cc091, 0xd49e2ce7, 0x0ce454a9, 0xd60acd86, 0x015f1919,
+0x77079103, 0xdea03af6, 0x78a8565e, 0xdee356df, 0x21f05cbe, 0x8b75e387, 0xb3c50651, 0xb8a5c3ef,
+0xd8eeb6d2, 0xe523be77, 0xc2154529, 0x2f69efdf, 0xafe67afb, 0xf470c4b2, 0xf3e0eb5b, 0xd6cc9876,
+0x39e4460c, 0x1fda8538, 0x1987832f, 0xca007367, 0xa99144f8, 0x296b299e, 0x492fc295, 0x9266beab,
+0xb5676e69, 0x9bd3ddda, 0xdf7e052f, 0xdb25701c, 0x1b5e51ee, 0xf65324e6, 0x6afce36c, 0x0316cc04,
+0x8644213e, 0xb7dc59d0, 0x7965291f, 0xccd6fd43, 0x41823979, 0x932bcdf6, 0xb657c34d, 0x4edfd282,
+0x7ae5290c, 0x3cb9536b, 0x851e20fe, 0x9833557e, 0x13ecf0b0, 0xd3ffb372, 0x3f85c5c1, 0x0aef7ed2
+} };
+static const u32 s5[256] = {
+0x7ec90c04, 0x2c6e74b9, 0x9b0e66df, 0xa6337911, 0xb86a7fff, 0x1dd358f5, 0x44dd9d44, 0x1731167f,
+0x08fbf1fa, 0xe7f511cc, 0xd2051b00, 0x735aba00, 0x2ab722d8, 0x386381cb, 0xacf6243a, 0x69befd7a,
+0xe6a2e77f, 0xf0c720cd, 0xc4494816, 0xccf5c180, 0x38851640, 0x15b0a848, 0xe68b18cb, 0x4caadeff,
+0x5f480a01, 0x0412b2aa, 0x259814fc, 0x41d0efe2, 0x4e40b48d, 0x248eb6fb, 0x8dba1cfe, 0x41a99b02,
+0x1a550a04, 0xba8f65cb, 0x7251f4e7, 0x95a51725, 0xc106ecd7, 0x97a5980a, 0xc539b9aa, 0x4d79fe6a,
+0xf2f3f763, 0x68af8040, 0xed0c9e56, 0x11b4958b, 0xe1eb5a88, 0x8709e6b0, 0xd7e07156, 0x4e29fea7,
+0x6366e52d, 0x02d1c000, 0xc4ac8e05, 0x9377f571, 0x0c05372a, 0x578535f2, 0x2261be02, 0xd642a0c9,
+0xdf13a280, 0x74b55bd2, 0x682199c0, 0xd421e5ec, 0x53fb3ce8, 0xc8adedb3, 0x28a87fc9, 0x3d959981,
+0x5c1ff900, 0xfe38d399, 0x0c4eff0b, 0x062407ea, 0xaa2f4fb1, 0x4fb96976, 0x90c79505, 0xb0a8a774,
+0xef55a1ff, 0xe59ca2c2, 0xa6b62d27, 0xe66a4263, 0xdf65001f, 0x0ec50966, 0xdfdd55bc, 0x29de0655,
+0x911e739a, 0x17af8975, 0x32c7911c, 0x89f89468, 0x0d01e980, 0x524755f4, 0x03b63cc9, 0x0cc844b2,
+0xbcf3f0aa, 0x87ac36e9, 0xe53a7426, 0x01b3d82b, 0x1a9e7449, 0x64ee2d7e, 0xcddbb1da, 0x01c94910,
+0xb868bf80, 0x0d26f3fd, 0x9342ede7, 0x04a5c284, 0x636737b6, 0x50f5b616, 0xf24766e3, 0x8eca36c1,
+0x136e05db, 0xfef18391, 0xfb887a37, 0xd6e7f7d4, 0xc7fb7dc9, 0x3063fcdf, 0xb6f589de, 0xec2941da,
+0x26e46695, 0xb7566419, 0xf654efc5, 0xd08d58b7, 0x48925401, 0xc1bacb7f, 0xe5ff550f, 0xb6083049,
+0x5bb5d0e8, 0x87d72e5a, 0xab6a6ee1, 0x223a66ce, 0xc62bf3cd, 0x9e0885f9, 0x68cb3e47, 0x086c010f,
+0xa21de820, 0xd18b69de, 0xf3f65777, 0xfa02c3f6, 0x407edac3, 0xcbb3d550, 0x1793084d, 0xb0d70eba,
+0x0ab378d5, 0xd951fb0c, 0xded7da56, 0x4124bbe4, 0x94ca0b56, 0x0f5755d1, 0xe0e1e56e, 0x6184b5be,
+0x580a249f, 0x94f74bc0, 0xe327888e, 0x9f7b5561, 0xc3dc0280, 0x05687715, 0x646c6bd7, 0x44904db3,
+0x66b4f0a3, 0xc0f1648a, 0x697ed5af, 0x49e92ff6, 0x309e374f, 0x2cb6356a, 0x85808573, 0x4991f840,
+0x76f0ae02, 0x083be84d, 0x28421c9a, 0x44489406, 0x736e4cb8, 0xc1092910, 0x8bc95fc6, 0x7d869cf4,
+0x134f616f, 0x2e77118d, 0xb31b2be1, 0xaa90b472, 0x3ca5d717, 0x7d161bba, 0x9cad9010, 0xaf462ba2,
+0x9fe459d2, 0x45d34559, 0xd9f2da13, 0xdbc65487, 0xf3e4f94e, 0x176d486f, 0x097c13ea, 0x631da5c7,
+0x445f7382, 0x175683f4, 0xcdc66a97, 0x70be0288, 0xb3cdcf72, 0x6e5dd2f3, 0x20936079, 0x459b80a5,
+0xbe60e2db, 0xa9c23101, 0xeba5315c, 0x224e42f2, 0x1c5c1572, 0xf6721b2c, 0x1ad2fff3, 0x8c25404e,
+0x324ed72f, 0x4067b7fd, 0x0523138e, 0x5ca3bc78, 0xdc0fd66e, 0x75922283, 0x784d6b17, 0x58ebb16e,
+0x44094f85, 0x3f481d87, 0xfcfeae7b, 0x77b5ff76, 0x8c2302bf, 0xaaf47556, 0x5f46b02a, 0x2b092801,
+0x3d38f5f7, 0x0ca81f36, 0x52af4a8a, 0x66d5e7c0, 0xdf3b0874, 0x95055110, 0x1b5ad7a8, 0xf61ed5ad,
+0x6cf6e479, 0x20758184, 0xd0cefa65, 0x88f7be58, 0x4a046826, 0x0ff6f8f3, 0xa09c7f70, 0x5346aba0,
+0x5ce96c28, 0xe176eda3, 0x6bac307f, 0x376829d2, 0x85360fa9, 0x17e3fe2a, 0x24b79767, 0xf5a96b20,
+0xd6cd2595, 0x68ff1ebf, 0x7555442c, 0xf19f06be, 0xf9e0659a, 0xeeb9491d, 0x34010718, 0xbb30cab8,
+0xe822fe15, 0x88570983, 0x750e6249, 0xda627e55, 0x5e76ffa8, 0xb1534546, 0x6d47de08, 0xefe9e7d4
+};
+static const u32 s6[256] = {
+0xf6fa8f9d, 0x2cac6ce1, 0x4ca34867, 0xe2337f7c, 0x95db08e7, 0x016843b4, 0xeced5cbc, 0x325553ac,
+0xbf9f0960, 0xdfa1e2ed, 0x83f0579d, 0x63ed86b9, 0x1ab6a6b8, 0xde5ebe39, 0xf38ff732, 0x8989b138,
+0x33f14961, 0xc01937bd, 0xf506c6da, 0xe4625e7e, 0xa308ea99, 0x4e23e33c, 0x79cbd7cc, 0x48a14367,
+0xa3149619, 0xfec94bd5, 0xa114174a, 0xeaa01866, 0xa084db2d, 0x09a8486f, 0xa888614a, 0x2900af98,
+0x01665991, 0xe1992863, 0xc8f30c60, 0x2e78ef3c, 0xd0d51932, 0xcf0fec14, 0xf7ca07d2, 0xd0a82072,
+0xfd41197e, 0x9305a6b0, 0xe86be3da, 0x74bed3cd, 0x372da53c, 0x4c7f4448, 0xdab5d440, 0x6dba0ec3,
+0x083919a7, 0x9fbaeed9, 0x49dbcfb0, 0x4e670c53, 0x5c3d9c01, 0x64bdb941, 0x2c0e636a, 0xba7dd9cd,
+0xea6f7388, 0xe70bc762, 0x35f29adb, 0x5c4cdd8d, 0xf0d48d8c, 0xb88153e2, 0x08a19866, 0x1ae2eac8,
+0x284caf89, 0xaa928223, 0x9334be53, 0x3b3a21bf, 0x16434be3, 0x9aea3906, 0xefe8c36e, 0xf890cdd9,
+0x80226dae, 0xc340a4a3, 0xdf7e9c09, 0xa694a807, 0x5b7c5ecc, 0x221db3a6, 0x9a69a02f, 0x68818a54,
+0xceb2296f, 0x53c0843a, 0xfe893655, 0x25bfe68a, 0xb4628abc, 0xcf222ebf, 0x25ac6f48, 0xa9a99387,
+0x53bddb65, 0xe76ffbe7, 0xe967fd78, 0x0ba93563, 0x8e342bc1, 0xe8a11be9, 0x4980740d, 0xc8087dfc,
+0x8de4bf99, 0xa11101a0, 0x7fd37975, 0xda5a26c0, 0xe81f994f, 0x9528cd89, 0xfd339fed, 0xb87834bf,
+0x5f04456d, 0x22258698, 0xc9c4c83b, 0x2dc156be, 0x4f628daa, 0x57f55ec5, 0xe2220abe, 0xd2916ebf,
+0x4ec75b95, 0x24f2c3c0, 0x42d15d99, 0xcd0d7fa0, 0x7b6e27ff, 0xa8dc8af0, 0x7345c106, 0xf41e232f,
+0x35162386, 0xe6ea8926, 0x3333b094, 0x157ec6f2, 0x372b74af, 0x692573e4, 0xe9a9d848, 0xf3160289,
+0x3a62ef1d, 0xa787e238, 0xf3a5f676, 0x74364853, 0x20951063, 0x4576698d, 0xb6fad407, 0x592af950,
+0x36f73523, 0x4cfb6e87, 0x7da4cec0, 0x6c152daa, 0xcb0396a8, 0xc50dfe5d, 0xfcd707ab, 0x0921c42f,
+0x89dff0bb, 0x5fe2be78, 0x448f4f33, 0x754613c9, 0x2b05d08d, 0x48b9d585, 0xdc049441, 0xc8098f9b,
+0x7dede786, 0xc39a3373, 0x42410005, 0x6a091751, 0x0ef3c8a6, 0x890072d6, 0x28207682, 0xa9a9f7be,
+0xbf32679d, 0xd45b5b75, 0xb353fd00, 0xcbb0e358, 0x830f220a, 0x1f8fb214, 0xd372cf08, 0xcc3c4a13,
+0x8cf63166, 0x061c87be, 0x88c98f88, 0x6062e397, 0x47cf8e7a, 0xb6c85283, 0x3cc2acfb, 0x3fc06976,
+0x4e8f0252, 0x64d8314d, 0xda3870e3, 0x1e665459, 0xc10908f0, 0x513021a5, 0x6c5b68b7, 0x822f8aa0,
+0x3007cd3e, 0x74719eef, 0xdc872681, 0x073340d4, 0x7e432fd9, 0x0c5ec241, 0x8809286c, 0xf592d891,
+0x08a930f6, 0x957ef305, 0xb7fbffbd, 0xc266e96f, 0x6fe4ac98, 0xb173ecc0, 0xbc60b42a, 0x953498da,
+0xfba1ae12, 0x2d4bd736, 0x0f25faab, 0xa4f3fceb, 0xe2969123, 0x257f0c3d, 0x9348af49, 0x361400bc,
+0xe8816f4a, 0x3814f200, 0xa3f94043, 0x9c7a54c2, 0xbc704f57, 0xda41e7f9, 0xc25ad33a, 0x54f4a084,
+0xb17f5505, 0x59357cbe, 0xedbd15c8, 0x7f97c5ab, 0xba5ac7b5, 0xb6f6deaf, 0x3a479c3a, 0x5302da25,
+0x653d7e6a, 0x54268d49, 0x51a477ea, 0x5017d55b, 0xd7d25d88, 0x44136c76, 0x0404a8c8, 0xb8e5a121,
+0xb81a928a, 0x60ed5869, 0x97c55b96, 0xeaec991b, 0x29935913, 0x01fdb7f1, 0x088e8dfa, 0x9ab6f6f5,
+0x3b4cbf9f, 0x4a5de3ab, 0xe6051d35, 0xa0e1d855, 0xd36b4cf1, 0xf544edeb, 0xb0e93524, 0xbebb8fbd,
+0xa2d762cf, 0x49c92f54, 0x38b5f331, 0x7128a454, 0x48392905, 0xa65b1db8, 0x851c97bd, 0xd675cf2f
+};
+static const u32 s7[256] = {
+0x85e04019, 0x332bf567, 0x662dbfff, 0xcfc65693, 0x2a8d7f6f, 0xab9bc912, 0xde6008a1, 0x2028da1f,
+0x0227bce7, 0x4d642916, 0x18fac300, 0x50f18b82, 0x2cb2cb11, 0xb232e75c, 0x4b3695f2, 0xb28707de,
+0xa05fbcf6, 0xcd4181e9, 0xe150210c, 0xe24ef1bd, 0xb168c381, 0xfde4e789, 0x5c79b0d8, 0x1e8bfd43,
+0x4d495001, 0x38be4341, 0x913cee1d, 0x92a79c3f, 0x089766be, 0xbaeeadf4, 0x1286becf, 0xb6eacb19,
+0x2660c200, 0x7565bde4, 0x64241f7a, 0x8248dca9, 0xc3b3ad66, 0x28136086, 0x0bd8dfa8, 0x356d1cf2,
+0x107789be, 0xb3b2e9ce, 0x0502aa8f, 0x0bc0351e, 0x166bf52a, 0xeb12ff82, 0xe3486911, 0xd34d7516,
+0x4e7b3aff, 0x5f43671b, 0x9cf6e037, 0x4981ac83, 0x334266ce, 0x8c9341b7, 0xd0d854c0, 0xcb3a6c88,
+0x47bc2829, 0x4725ba37, 0xa66ad22b, 0x7ad61f1e, 0x0c5cbafa, 0x4437f107, 0xb6e79962, 0x42d2d816,
+0x0a961288, 0xe1a5c06e, 0x13749e67, 0x72fc081a, 0xb1d139f7, 0xf9583745, 0xcf19df58, 0xbec3f756,
+0xc06eba30, 0x07211b24, 0x45c28829, 0xc95e317f, 0xbc8ec511, 0x38bc46e9, 0xc6e6fa14, 0xbae8584a,
+0xad4ebc46, 0x468f508b, 0x7829435f, 0xf124183b, 0x821dba9f, 0xaff60ff4, 0xea2c4e6d, 0x16e39264,
+0x92544a8b, 0x009b4fc3, 0xaba68ced, 0x9ac96f78, 0x06a5b79a, 0xb2856e6e, 0x1aec3ca9, 0xbe838688,
+0x0e0804e9, 0x55f1be56, 0xe7e5363b, 0xb3a1f25d, 0xf7debb85, 0x61fe033c, 0x16746233, 0x3c034c28,
+0xda6d0c74, 0x79aac56c, 0x3ce4e1ad, 0x51f0c802, 0x98f8f35a, 0x1626a49f, 0xeed82b29, 0x1d382fe3,
+0x0c4fb99a, 0xbb325778, 0x3ec6d97b, 0x6e77a6a9, 0xcb658b5c, 0xd45230c7, 0x2bd1408b, 0x60c03eb7,
+0xb9068d78, 0xa33754f4, 0xf430c87d, 0xc8a71302, 0xb96d8c32, 0xebd4e7be, 0xbe8b9d2d, 0x7979fb06,
+0xe7225308, 0x8b75cf77, 0x11ef8da4, 0xe083c858, 0x8d6b786f, 0x5a6317a6, 0xfa5cf7a0, 0x5dda0033,
+0xf28ebfb0, 0xf5b9c310, 0xa0eac280, 0x08b9767a, 0xa3d9d2b0, 0x79d34217, 0x021a718d, 0x9ac6336a,
+0x2711fd60, 0x438050e3, 0x069908a8, 0x3d7fedc4, 0x826d2bef, 0x4eeb8476, 0x488dcf25, 0x36c9d566,
+0x28e74e41, 0xc2610aca, 0x3d49a9cf, 0xbae3b9df, 0xb65f8de6, 0x92aeaf64, 0x3ac7d5e6, 0x9ea80509,
+0xf22b017d, 0xa4173f70, 0xdd1e16c3, 0x15e0d7f9, 0x50b1b887, 0x2b9f4fd5, 0x625aba82, 0x6a017962,
+0x2ec01b9c, 0x15488aa9, 0xd716e740, 0x40055a2c, 0x93d29a22, 0xe32dbf9a, 0x058745b9, 0x3453dc1e,
+0xd699296e, 0x496cff6f, 0x1c9f4986, 0xdfe2ed07, 0xb87242d1, 0x19de7eae, 0x053e561a, 0x15ad6f8c,
+0x66626c1c, 0x7154c24c, 0xea082b2a, 0x93eb2939, 0x17dcb0f0, 0x58d4f2ae, 0x9ea294fb, 0x52cf564c,
+0x9883fe66, 0x2ec40581, 0x763953c3, 0x01d6692e, 0xd3a0c108, 0xa1e7160e, 0xe4f2dfa6, 0x693ed285,
+0x74904698, 0x4c2b0edd, 0x4f757656, 0x5d393378, 0xa132234f, 0x3d321c5d, 0xc3f5e194, 0x4b269301,
+0xc79f022f, 0x3c997e7e, 0x5e4f9504, 0x3ffafbbd, 0x76f7ad0e, 0x296693f4, 0x3d1fce6f, 0xc61e45be,
+0xd3b5ab34, 0xf72bf9b7, 0x1b0434c0, 0x4e72b567, 0x5592a33d, 0xb5229301, 0xcfd2a87f, 0x60aeb767,
+0x1814386b, 0x30bcc33d, 0x38a0c07d, 0xfd1606f2, 0xc363519b, 0x589dd390, 0x5479f8e6, 0x1cb8d647,
+0x97fd61a9, 0xea7759f4, 0x2d57539d, 0x569a58cf, 0xe84e63ad, 0x462e1b78, 0x6580f87e, 0xf3817914,
+0x91da55f4, 0x40a230f3, 0xd1988f35, 0xb6e318d2, 0x3ffa50bc, 0x3d40f021, 0xc3c0bdae, 0x4958c24c,
+0x518f36b2, 0x84b1d370, 0x0fedce83, 0x878ddada, 0xf2a279c7, 0x94e01be8, 0x90716f4b, 0x954b8aa3
+};
+static const u32 s8[256] = {
+0xe216300d, 0xbbddfffc, 0xa7ebdabd, 0x35648095, 0x7789f8b7, 0xe6c1121b, 0x0e241600, 0x052ce8b5,
+0x11a9cfb0, 0xe5952f11, 0xece7990a, 0x9386d174, 0x2a42931c, 0x76e38111, 0xb12def3a, 0x37ddddfc,
+0xde9adeb1, 0x0a0cc32c, 0xbe197029, 0x84a00940, 0xbb243a0f, 0xb4d137cf, 0xb44e79f0, 0x049eedfd,
+0x0b15a15d, 0x480d3168, 0x8bbbde5a, 0x669ded42, 0xc7ece831, 0x3f8f95e7, 0x72df191b, 0x7580330d,
+0x94074251, 0x5c7dcdfa, 0xabbe6d63, 0xaa402164, 0xb301d40a, 0x02e7d1ca, 0x53571dae, 0x7a3182a2,
+0x12a8ddec, 0xfdaa335d, 0x176f43e8, 0x71fb46d4, 0x38129022, 0xce949ad4, 0xb84769ad, 0x965bd862,
+0x82f3d055, 0x66fb9767, 0x15b80b4e, 0x1d5b47a0, 0x4cfde06f, 0xc28ec4b8, 0x57e8726e, 0x647a78fc,
+0x99865d44, 0x608bd593, 0x6c200e03, 0x39dc5ff6, 0x5d0b00a3, 0xae63aff2, 0x7e8bd632, 0x70108c0c,
+0xbbd35049, 0x2998df04, 0x980cf42a, 0x9b6df491, 0x9e7edd53, 0x06918548, 0x58cb7e07, 0x3b74ef2e,
+0x522fffb1, 0xd24708cc, 0x1c7e27cd, 0xa4eb215b, 0x3cf1d2e2, 0x19b47a38, 0x424f7618, 0x35856039,
+0x9d17dee7, 0x27eb35e6, 0xc9aff67b, 0x36baf5b8, 0x09c467cd, 0xc18910b1, 0xe11dbf7b, 0x06cd1af8,
+0x7170c608, 0x2d5e3354, 0xd4de495a, 0x64c6d006, 0xbcc0c62c, 0x3dd00db3, 0x708f8f34, 0x77d51b42,
+0x264f620f, 0x24b8d2bf, 0x15c1b79e, 0x46a52564, 0xf8d7e54e, 0x3e378160, 0x7895cda5, 0x859c15a5,
+0xe6459788, 0xc37bc75f, 0xdb07ba0c, 0x0676a3ab, 0x7f229b1e, 0x31842e7b, 0x24259fd7, 0xf8bef472,
+0x835ffcb8, 0x6df4c1f2, 0x96f5b195, 0xfd0af0fc, 0xb0fe134c, 0xe2506d3d, 0x4f9b12ea, 0xf215f225,
+0xa223736f, 0x9fb4c428, 0x25d04979, 0x34c713f8, 0xc4618187, 0xea7a6e98, 0x7cd16efc, 0x1436876c,
+0xf1544107, 0xbedeee14, 0x56e9af27, 0xa04aa441, 0x3cf7c899, 0x92ecbae6, 0xdd67016d, 0x151682eb,
+0xa842eedf, 0xfdba60b4, 0xf1907b75, 0x20e3030f, 0x24d8c29e, 0xe139673b, 0xefa63fb8, 0x71873054,
+0xb6f2cf3b, 0x9f326442, 0xcb15a4cc, 0xb01a4504, 0xf1e47d8d, 0x844a1be5, 0xbae7dfdc, 0x42cbda70,
+0xcd7dae0a, 0x57e85b7a, 0xd53f5af6, 0x20cf4d8c, 0xcea4d428, 0x79d130a4, 0x3486ebfb, 0x33d3cddc,
+0x77853b53, 0x37effcb5, 0xc5068778, 0xe580b3e6, 0x4e68b8f4, 0xc5c8b37e, 0x0d809ea2, 0x398feb7c,
+0x132a4f94, 0x43b7950e, 0x2fee7d1c, 0x223613bd, 0xdd06caa2, 0x37df932b, 0xc4248289, 0xacf3ebc3,
+0x5715f6b7, 0xef3478dd, 0xf267616f, 0xc148cbe4, 0x9052815e, 0x5e410fab, 0xb48a2465, 0x2eda7fa4,
+0xe87b40e4, 0xe98ea084, 0x5889e9e1, 0xefd390fc, 0xdd07d35b, 0xdb485694, 0x38d7e5b2, 0x57720101,
+0x730edebc, 0x5b643113, 0x94917e4f, 0x503c2fba, 0x646f1282, 0x7523d24a, 0xe0779695, 0xf9c17a8f,
+0x7a5b2121, 0xd187b896, 0x29263a4d, 0xba510cdf, 0x81f47c9f, 0xad1163ed, 0xea7b5965, 0x1a00726e,
+0x11403092, 0x00da6d77, 0x4a0cdd61, 0xad1f4603, 0x605bdfb0, 0x9eedc364, 0x22ebe6a8, 0xcee7d28a,
+0xa0e736a0, 0x5564a6b9, 0x10853209, 0xc7eb8f37, 0x2de705ca, 0x8951570f, 0xdf09822b, 0xbd691a6c,
+0xaa12e4f2, 0x87451c0f, 0xe0f6a27a, 0x3ada4819, 0x4cf1764f, 0x0d771c2b, 0x67cdb156, 0x350d8384,
+0x5938fa0f, 0x42399ef3, 0x36997b07, 0x0e84093d, 0x4aa93e61, 0x8360d87b, 0x1fa98b0c, 0x1149382c,
+0xe97625a5, 0x0614d1b7, 0x0e25244b, 0x0c768347, 0x589e8d82, 0x0d2059d1, 0xa466bb1e, 0xf8da0a82,
+0x04f19130, 0xba6e4ec0, 0x99265164, 0x1ee7230d, 0x50b2ad80, 0xeaee6801, 0x8db2a283, 0xea8bf59e
+};
+
+
+#ifdef USE_AMD64_ASM
+
+/* Assembly implementations of CAST5. */
+extern void _gcry_cast5_amd64_encrypt_block(CAST5_context *c, byte *outbuf,
+ const byte *inbuf);
+
+extern void _gcry_cast5_amd64_decrypt_block(CAST5_context *c, byte *outbuf,
+ const byte *inbuf);
+
+/* These assembly implementations process four blocks in parallel. */
+extern void _gcry_cast5_amd64_ctr_enc(CAST5_context *ctx, byte *out,
+ const byte *in, byte *ctr);
+
+extern void _gcry_cast5_amd64_cbc_dec(CAST5_context *ctx, byte *out,
+ const byte *in, byte *iv);
+
+extern void _gcry_cast5_amd64_cfb_dec(CAST5_context *ctx, byte *out,
+ const byte *in, byte *iv);
+
+static void
+do_encrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf)
+{
+ _gcry_cast5_amd64_encrypt_block (context, outbuf, inbuf);
+}
+
+static void
+do_decrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf)
+{
+ _gcry_cast5_amd64_decrypt_block (context, outbuf, inbuf);
+}
+
+static void
+cast5_amd64_ctr_enc(CAST5_context *ctx, byte *out, const byte *in, byte *ctr)
+{
+ _gcry_cast5_amd64_ctr_enc (ctx, out, in, ctr);
+}
+
+static void
+cast5_amd64_cbc_dec(CAST5_context *ctx, byte *out, const byte *in, byte *iv)
+{
+ _gcry_cast5_amd64_cbc_dec (ctx, out, in, iv);
+}
+
+static void
+cast5_amd64_cfb_dec(CAST5_context *ctx, byte *out, const byte *in, byte *iv)
+{
+ _gcry_cast5_amd64_cfb_dec (ctx, out, in, iv);
+}
+
+static unsigned int
+encrypt_block (void *context, byte *outbuf, const byte *inbuf)
+{
+ CAST5_context *c = (CAST5_context *) context;
+ do_encrypt_block (c, outbuf, inbuf);
+ return /*burn_stack*/ (2*8);
+}
+
+static unsigned int
+decrypt_block (void *context, byte *outbuf, const byte *inbuf)
+{
+ CAST5_context *c = (CAST5_context *) context;
+ do_decrypt_block (c, outbuf, inbuf);
+ return /*burn_stack*/ (2*8);
+}
+
+#elif defined(USE_ARM_ASM)
+
+/* ARM assembly implementations of CAST5. */
+extern void _gcry_cast5_arm_encrypt_block(CAST5_context *c, byte *outbuf,
+ const byte *inbuf);
+
+extern void _gcry_cast5_arm_decrypt_block(CAST5_context *c, byte *outbuf,
+ const byte *inbuf);
+
+/* These assembly implementations process two blocks in parallel. */
+extern void _gcry_cast5_arm_ctr_enc(CAST5_context *ctx, byte *out,
+ const byte *in, byte *ctr);
+
+extern void _gcry_cast5_arm_cbc_dec(CAST5_context *ctx, byte *out,
+ const byte *in, byte *iv);
+
+extern void _gcry_cast5_arm_cfb_dec(CAST5_context *ctx, byte *out,
+ const byte *in, byte *iv);
+
+static void
+do_encrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf)
+{
+ _gcry_cast5_arm_encrypt_block (context, outbuf, inbuf);
+}
+
+static void
+do_decrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf)
+{
+ _gcry_cast5_arm_decrypt_block (context, outbuf, inbuf);
+}
+
+static unsigned int
+encrypt_block (void *context, byte *outbuf, const byte *inbuf)
+{
+ CAST5_context *c = (CAST5_context *) context;
+ do_encrypt_block (c, outbuf, inbuf);
+ return /*burn_stack*/ (10*4);
+}
+
+static unsigned int
+decrypt_block (void *context, byte *outbuf, const byte *inbuf)
+{
+ CAST5_context *c = (CAST5_context *) context;
+ do_decrypt_block (c, outbuf, inbuf);
+ return /*burn_stack*/ (10*4);
+}
+
+#else /*USE_ARM_ASM*/
+
+#define F1(D,m,r) ( (I = ((m) + (D))), (I=rol(I,(r))), \
+ (((s1[I >> 24] ^ s2[(I>>16)&0xff]) - s3[(I>>8)&0xff]) + s4[I&0xff]) )
+#define F2(D,m,r) ( (I = ((m) ^ (D))), (I=rol(I,(r))), \
+ (((s1[I >> 24] - s2[(I>>16)&0xff]) + s3[(I>>8)&0xff]) ^ s4[I&0xff]) )
+#define F3(D,m,r) ( (I = ((m) - (D))), (I=rol(I,(r))), \
+ (((s1[I >> 24] + s2[(I>>16)&0xff]) ^ s3[(I>>8)&0xff]) - s4[I&0xff]) )
+
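For readers following RFC 2144, Section 2.2: F1 is the Type 1 round function, and F2/F3 are Types 2 and 3, which differ only in how +, ^ and - are distributed over the initial key/data combination and the four S-box lookups. As a sketch (the helper name cast5_f1 is hypothetical; it relies on the s1..s4 macros defined above and rol from bithelp.h), Type 1 expands to:

/* Scalar form of the Type 1 round function: D is one 32-bit data half,
 * Km the masking subkey and Kr the 5-bit rotation subkey of the round. */
static u32
cast5_f1 (u32 D, u32 Km, byte Kr)
{
  u32 I = rol (Km + D, Kr);

  return ((s1[I >> 24] ^ s2[(I >> 16) & 0xff])
          - s3[(I >> 8) & 0xff]) + s4[I & 0xff];
}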
+static void
+do_encrypt_block( CAST5_context *c, byte *outbuf, const byte *inbuf )
+{
+ u32 l, r, t;
+ u32 I; /* used by the Fx macros */
+ u32 *Km;
+ u32 Kr;
+
+ Km = c->Km;
+ Kr = buf_get_le32(c->Kr + 0);
+
+ /* (L0,R0) <-- (m1...m64). (Split the plaintext into left and
+ * right 32-bit halves L0 = m1...m32 and R0 = m33...m64.)
+ */
+ l = buf_get_be32(inbuf + 0);
+ r = buf_get_be32(inbuf + 4);
+
+ /* (16 rounds) for i from 1 to 16, compute Li and Ri as follows:
+ * Li = Ri-1;
+ * Ri = Li-1 ^ f(Ri-1,Kmi,Kri), where f is defined in Section 2.2
+ * Rounds 1, 4, 7, 10, 13, and 16 use f function Type 1.
+ * Rounds 2, 5, 8, 11, and 14 use f function Type 2.
+ * Rounds 3, 6, 9, 12, and 15 use f function Type 3.
+ */
+
+ t = l; l = r; r = t ^ F1(r, Km[ 0], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F2(r, Km[ 1], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F3(r, Km[ 2], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F1(r, Km[ 3], Kr & 31); Kr = buf_get_le32(c->Kr + 4);
+ t = l; l = r; r = t ^ F2(r, Km[ 4], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F3(r, Km[ 5], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F1(r, Km[ 6], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F2(r, Km[ 7], Kr & 31); Kr = buf_get_le32(c->Kr + 8);
+ t = l; l = r; r = t ^ F3(r, Km[ 8], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F1(r, Km[ 9], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F2(r, Km[10], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F3(r, Km[11], Kr & 31); Kr = buf_get_le32(c->Kr + 12);
+ t = l; l = r; r = t ^ F1(r, Km[12], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F2(r, Km[13], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F3(r, Km[14], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F1(r, Km[15], Kr & 31);
+
+ /* c1...c64 <-- (R16,L16). (Exchange final blocks L16, R16 and
+ * concatenate to form the ciphertext.) */
+ buf_put_be32(outbuf + 0, r);
+ buf_put_be32(outbuf + 4, l);
+}
+
+static unsigned int
+encrypt_block (void *context, byte *outbuf, const byte *inbuf)
+{
+ CAST5_context *c = (CAST5_context *) context;
+ do_encrypt_block (c, outbuf, inbuf);
+ return /*burn_stack*/ (20+4*sizeof(void*));
+}
+
+
+static void
+do_encrypt_block_3( CAST5_context *c, byte *outbuf, const byte *inbuf )
+{
+ u32 l0, r0, t0, l1, r1, t1, l2, r2, t2;
+ u32 I; /* used by the Fx macros */
+ u32 *Km;
+ u32 Kr;
+
+ Km = c->Km;
+ Kr = buf_get_le32(c->Kr + 0);
+
+ l0 = buf_get_be32(inbuf + 0);
+ r0 = buf_get_be32(inbuf + 4);
+ l1 = buf_get_be32(inbuf + 8);
+ r1 = buf_get_be32(inbuf + 12);
+ l2 = buf_get_be32(inbuf + 16);
+ r2 = buf_get_be32(inbuf + 20);
+
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[ 0], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[ 0], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[ 0], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[ 1], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[ 1], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[ 1], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[ 2], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[ 2], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[ 2], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[ 3], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[ 3], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[ 3], Kr & 31);
+ Kr = buf_get_le32(c->Kr + 4);
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[ 4], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[ 4], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[ 4], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[ 5], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[ 5], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[ 5], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[ 6], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[ 6], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[ 6], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[ 7], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[ 7], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[ 7], Kr & 31);
+ Kr = buf_get_le32(c->Kr + 8);
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[ 8], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[ 8], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[ 8], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[ 9], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[ 9], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[ 9], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[10], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[10], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[10], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[11], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[11], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[11], Kr & 31);
+ Kr = buf_get_le32(c->Kr + 12);
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[12], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[12], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[12], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[13], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[13], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[13], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[14], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[14], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[14], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[15], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[15], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[15], Kr & 31);
+
+ buf_put_be32(outbuf + 0, r0);
+ buf_put_be32(outbuf + 4, l0);
+ buf_put_be32(outbuf + 8, r1);
+ buf_put_be32(outbuf + 12, l1);
+ buf_put_be32(outbuf + 16, r2);
+ buf_put_be32(outbuf + 20, l2);
+}
+
+
+static void
+do_decrypt_block (CAST5_context *c, byte *outbuf, const byte *inbuf )
+{
+ u32 l, r, t;
+ u32 I;
+ u32 *Km;
+ u32 Kr;
+
+ Km = c->Km;
+ Kr = buf_get_be32(c->Kr + 12);
+
+ l = buf_get_be32(inbuf + 0);
+ r = buf_get_be32(inbuf + 4);
+
+ t = l; l = r; r = t ^ F1(r, Km[15], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F3(r, Km[14], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F2(r, Km[13], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F1(r, Km[12], Kr & 31); Kr = buf_get_be32(c->Kr + 8);
+ t = l; l = r; r = t ^ F3(r, Km[11], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F2(r, Km[10], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F1(r, Km[ 9], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F3(r, Km[ 8], Kr & 31); Kr = buf_get_be32(c->Kr + 4);
+ t = l; l = r; r = t ^ F2(r, Km[ 7], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F1(r, Km[ 6], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F3(r, Km[ 5], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F2(r, Km[ 4], Kr & 31); Kr = buf_get_be32(c->Kr + 0);
+ t = l; l = r; r = t ^ F1(r, Km[ 3], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F3(r, Km[ 2], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F2(r, Km[ 1], Kr & 31); Kr >>= 8;
+ t = l; l = r; r = t ^ F1(r, Km[ 0], Kr & 31);
+
+ buf_put_be32(outbuf + 0, r);
+ buf_put_be32(outbuf + 4, l);
+}
+
+static unsigned int
+decrypt_block (void *context, byte *outbuf, const byte *inbuf)
+{
+ CAST5_context *c = (CAST5_context *) context;
+ do_decrypt_block (c, outbuf, inbuf);
+ return /*burn_stack*/ (20+4*sizeof(void*));
+}
+
+
+static void
+do_decrypt_block_3 (CAST5_context *c, byte *outbuf, const byte *inbuf )
+{
+ u32 l0, r0, t0, l1, r1, t1, l2, r2, t2;
+ u32 I;
+ u32 *Km;
+ u32 Kr;
+
+ Km = c->Km;
+ Kr = buf_get_be32(c->Kr + 12);
+
+ l0 = buf_get_be32(inbuf + 0);
+ r0 = buf_get_be32(inbuf + 4);
+ l1 = buf_get_be32(inbuf + 8);
+ r1 = buf_get_be32(inbuf + 12);
+ l2 = buf_get_be32(inbuf + 16);
+ r2 = buf_get_be32(inbuf + 20);
+
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[15], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[15], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[15], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[14], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[14], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[14], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[13], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[13], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[13], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[12], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[12], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[12], Kr & 31);
+ Kr = buf_get_be32(c->Kr + 8);
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[11], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[11], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[11], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[10], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[10], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[10], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[ 9], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[ 9], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[ 9], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[ 8], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[ 8], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[ 8], Kr & 31);
+ Kr = buf_get_be32(c->Kr + 4);
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[ 7], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[ 7], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[ 7], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[ 6], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[ 6], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[ 6], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[ 5], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[ 5], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[ 5], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[ 4], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[ 4], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[ 4], Kr & 31);
+ Kr = buf_get_be32(c->Kr + 0);
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[ 3], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[ 3], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[ 3], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F3(r0, Km[ 2], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F3(r1, Km[ 2], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F3(r2, Km[ 2], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F2(r0, Km[ 1], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F2(r1, Km[ 1], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F2(r2, Km[ 1], Kr & 31);
+ Kr >>= 8;
+ t0 = l0; l0 = r0; r0 = t0 ^ F1(r0, Km[ 0], Kr & 31);
+ t1 = l1; l1 = r1; r1 = t1 ^ F1(r1, Km[ 0], Kr & 31);
+ t2 = l2; l2 = r2; r2 = t2 ^ F1(r2, Km[ 0], Kr & 31);
+
+ buf_put_be32(outbuf + 0, r0);
+ buf_put_be32(outbuf + 4, l0);
+ buf_put_be32(outbuf + 8, r1);
+ buf_put_be32(outbuf + 12, l1);
+ buf_put_be32(outbuf + 16, r2);
+ buf_put_be32(outbuf + 20, l2);
+}
+
+#endif /*!USE_ARM_ASM*/
+
+
+/* Bulk encryption of complete blocks in CTR mode. This function is only
+ intended for the bulk encryption feature of cipher.c. CTR is expected to be
+ of size CAST5_BLOCKSIZE. */
+static void
+_gcry_cast5_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ CAST5_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned char tmpbuf[CAST5_BLOCKSIZE * 3];
+ int burn_stack_depth = (20 + 4 * sizeof(void*)) + 4 * CAST5_BLOCKSIZE;
+
+#ifdef USE_AMD64_ASM
+ {
+ if (nblocks >= 4)
+ burn_stack_depth += 8 * sizeof(void*);
+
+ /* Process data in 4 block chunks. */
+ while (nblocks >= 4)
+ {
+ cast5_amd64_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+ nblocks -= 4;
+ outbuf += 4 * CAST5_BLOCKSIZE;
+ inbuf += 4 * CAST5_BLOCKSIZE;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#elif defined(USE_ARM_ASM)
+ {
+ /* Process data in 2 block chunks. */
+ while (nblocks >= 2)
+ {
+ _gcry_cast5_arm_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+ nblocks -= 2;
+ outbuf += 2 * CAST5_BLOCKSIZE;
+ inbuf += 2 * CAST5_BLOCKSIZE;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
+ for ( ;nblocks >= 3; nblocks -= 3)
+ {
+ /* Prepare the counter blocks. */
+ cipher_block_cpy (tmpbuf + 0, ctr, CAST5_BLOCKSIZE);
+ cipher_block_cpy (tmpbuf + 8, ctr, CAST5_BLOCKSIZE);
+ cipher_block_cpy (tmpbuf + 16, ctr, CAST5_BLOCKSIZE);
+ cipher_block_add (tmpbuf + 8, 1, CAST5_BLOCKSIZE);
+ cipher_block_add (tmpbuf + 16, 2, CAST5_BLOCKSIZE);
+ cipher_block_add (ctr, 3, CAST5_BLOCKSIZE);
+ /* Encrypt the counter. */
+ do_encrypt_block_3(ctx, tmpbuf, tmpbuf);
+ /* XOR the input with the encrypted counter and store in output. */
+ buf_xor(outbuf, tmpbuf, inbuf, CAST5_BLOCKSIZE * 3);
+ outbuf += CAST5_BLOCKSIZE * 3;
+ inbuf += CAST5_BLOCKSIZE * 3;
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ /* Encrypt the counter. */
+ do_encrypt_block(ctx, tmpbuf, ctr);
+ /* XOR the input with the encrypted counter and store in output. */
+ cipher_block_xor(outbuf, tmpbuf, inbuf, CAST5_BLOCKSIZE);
+ outbuf += CAST5_BLOCKSIZE;
+ inbuf += CAST5_BLOCKSIZE;
+ /* Increment the counter. */
+ cipher_block_add (ctr, 1, CAST5_BLOCKSIZE);
+ }
+
+ wipememory(tmpbuf, sizeof(tmpbuf));
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Bulk decryption of complete blocks in CBC mode. This function is only
+ intended for the bulk encryption feature of cipher.c. */
+static void
+_gcry_cast5_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ CAST5_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned char savebuf[CAST5_BLOCKSIZE * 3];
+ int burn_stack_depth = (20 + 4 * sizeof(void*)) + 4 * CAST5_BLOCKSIZE;
+
+#ifdef USE_AMD64_ASM
+ {
+ if (nblocks >= 4)
+ burn_stack_depth += 8 * sizeof(void*);
+
+ /* Process data in 4 block chunks. */
+ while (nblocks >= 4)
+ {
+ cast5_amd64_cbc_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 4;
+ outbuf += 4 * CAST5_BLOCKSIZE;
+ inbuf += 4 * CAST5_BLOCKSIZE;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#elif defined(USE_ARM_ASM)
+ {
+ /* Process data in 2 block chunks. */
+ while (nblocks >= 2)
+ {
+ _gcry_cast5_arm_cbc_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 2;
+ outbuf += 2 * CAST5_BLOCKSIZE;
+ inbuf += 2 * CAST5_BLOCKSIZE;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
+ for ( ;nblocks >= 3; nblocks -= 3)
+ {
+ /* INBUF is needed later and it may be identical to OUTBUF, so store
+ the intermediate result to SAVEBUF. */
+ do_decrypt_block_3 (ctx, savebuf, inbuf);
+
+ cipher_block_xor_1 (savebuf + 0, iv, CAST5_BLOCKSIZE);
+ cipher_block_xor_1 (savebuf + 8, inbuf, CAST5_BLOCKSIZE * 2);
+ cipher_block_cpy (iv, inbuf + 16, CAST5_BLOCKSIZE);
+ buf_cpy (outbuf, savebuf, CAST5_BLOCKSIZE * 3);
+ inbuf += CAST5_BLOCKSIZE * 3;
+ outbuf += CAST5_BLOCKSIZE * 3;
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ /* INBUF is needed later and it may be identical to OUTBUF, so store
+ the intermediate result to SAVEBUF. */
+ do_decrypt_block (ctx, savebuf, inbuf);
+
+ cipher_block_xor_n_copy_2(outbuf, savebuf, iv, inbuf, CAST5_BLOCKSIZE);
+ inbuf += CAST5_BLOCKSIZE;
+ outbuf += CAST5_BLOCKSIZE;
+ }
+
+ wipememory(savebuf, sizeof(savebuf));
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk decryption of complete blocks in CFB mode. This function is only
+ intended for the bulk encryption feature of cipher.c. */
+static void
+_gcry_cast5_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ CAST5_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned char tmpbuf[CAST5_BLOCKSIZE * 3];
+ int burn_stack_depth = (20 + 4 * sizeof(void*)) + 4 * CAST5_BLOCKSIZE;
+
+#ifdef USE_AMD64_ASM
+ {
+ if (nblocks >= 4)
+ burn_stack_depth += 8 * sizeof(void*);
+
+ /* Process data in 4 block chunks. */
+ while (nblocks >= 4)
+ {
+ cast5_amd64_cfb_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 4;
+ outbuf += 4 * CAST5_BLOCKSIZE;
+ inbuf += 4 * CAST5_BLOCKSIZE;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#elif defined(USE_ARM_ASM)
+ {
+ /* Process data in 2 block chunks. */
+ while (nblocks >= 2)
+ {
+ _gcry_cast5_arm_cfb_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 2;
+ outbuf += 2 * CAST5_BLOCKSIZE;
+ inbuf += 2 * CAST5_BLOCKSIZE;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
+ for ( ;nblocks >= 3; nblocks -= 3 )
+ {
+ cipher_block_cpy (tmpbuf + 0, iv, CAST5_BLOCKSIZE);
+ cipher_block_cpy (tmpbuf + 8, inbuf + 0, CAST5_BLOCKSIZE * 2);
+ cipher_block_cpy (iv, inbuf + 16, CAST5_BLOCKSIZE);
+ do_encrypt_block_3 (ctx, tmpbuf, tmpbuf);
+ buf_xor (outbuf, inbuf, tmpbuf, CAST5_BLOCKSIZE * 3);
+ outbuf += CAST5_BLOCKSIZE * 3;
+ inbuf += CAST5_BLOCKSIZE * 3;
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ do_encrypt_block(ctx, iv, iv);
+ cipher_block_xor_n_copy(outbuf, iv, inbuf, CAST5_BLOCKSIZE);
+ outbuf += CAST5_BLOCKSIZE;
+ inbuf += CAST5_BLOCKSIZE;
+ }
+
+ wipememory(tmpbuf, sizeof(tmpbuf));
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Run the self-tests for CAST5-CTR, tests IV increment of bulk CTR
+ encryption. Returns NULL on success. */
+static const char *
+selftest_ctr (void)
+{
+ const int nblocks = 4+1;
+ const int blocksize = CAST5_BLOCKSIZE;
+ const int context_size = sizeof(CAST5_context);
+
+ return _gcry_selftest_helper_ctr("CAST5", &cast_setkey,
+ &encrypt_block, nblocks, blocksize, context_size);
+}
+
+
+/* Run the self-tests for CAST5-CBC, tests bulk CBC decryption.
+ Returns NULL on success. */
+static const char *
+selftest_cbc (void)
+{
+ const int nblocks = 4+2;
+ const int blocksize = CAST5_BLOCKSIZE;
+ const int context_size = sizeof(CAST5_context);
+
+ return _gcry_selftest_helper_cbc("CAST5", &cast_setkey,
+ &encrypt_block, nblocks, blocksize, context_size);
+}
+
+
+/* Run the self-tests for CAST5-CFB, tests bulk CFB decryption.
+ Returns NULL on success. */
+static const char *
+selftest_cfb (void)
+{
+ const int nblocks = 4+2;
+ const int blocksize = CAST5_BLOCKSIZE;
+ const int context_size = sizeof(CAST5_context);
+
+ return _gcry_selftest_helper_cfb("CAST5", &cast_setkey,
+ &encrypt_block, nblocks, blocksize, context_size);
+}
+
+
+static const char*
+selftest(void)
+{
+ CAST5_context c;
+ cipher_bulk_ops_t bulk_ops;
+ static const byte key[16] =
+ { 0x01, 0x23, 0x45, 0x67, 0x12, 0x34, 0x56, 0x78,
+ 0x23, 0x45, 0x67, 0x89, 0x34, 0x56, 0x78, 0x9A };
+ static const byte plain[8] =
+ { 0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF };
+ static const byte cipher[8] =
+ { 0x23, 0x8B, 0x4F, 0xE5, 0x84, 0x7E, 0x44, 0xB2 };
+ byte buffer[8];
+ const char *r;
+
+ cast_setkey( &c, key, 16, &bulk_ops );
+ encrypt_block( &c, buffer, plain );
+ if( memcmp( buffer, cipher, 8 ) )
+ return "1";
+ decrypt_block( &c, buffer, buffer );
+ if( memcmp( buffer, plain, 8 ) )
+ return "2";
+
+#if 0 /* full maintenance test */
+ {
+ int i;
+ byte a0[16] = { 0x01,0x23,0x45,0x67,0x12,0x34,0x56,0x78,
+ 0x23,0x45,0x67,0x89,0x34,0x56,0x78,0x9A };
+ byte b0[16] = { 0x01,0x23,0x45,0x67,0x12,0x34,0x56,0x78,
+ 0x23,0x45,0x67,0x89,0x34,0x56,0x78,0x9A };
+ byte a1[16] = { 0xEE,0xA9,0xD0,0xA2,0x49,0xFD,0x3B,0xA6,
+ 0xB3,0x43,0x6F,0xB8,0x9D,0x6D,0xCA,0x92 };
+ byte b1[16] = { 0xB2,0xC9,0x5E,0xB0,0x0C,0x31,0xAD,0x71,
+ 0x80,0xAC,0x05,0xB8,0xE8,0x3D,0x69,0x6E };
+
+ for(i=0; i < 1000000; i++ ) {
+ cast_setkey( &c, b0, 16, &bulk_ops );
+ encrypt_block( &c, a0, a0 );
+ encrypt_block( &c, a0+8, a0+8 );
+ cast_setkey( &c, a0, 16, &bulk_ops );
+ encrypt_block( &c, b0, b0 );
+ encrypt_block( &c, b0+8, b0+8 );
+ }
+ if( memcmp( a0, a1, 16 ) || memcmp( b0, b1, 16 ) )
+ return "3";
+
+ }
+#endif
+
+ if ( (r = selftest_cbc ()) )
+ return r;
+
+ if ( (r = selftest_cfb ()) )
+ return r;
+
+ if ( (r = selftest_ctr ()) )
+ return r;
+
+ return NULL;
+}
+
+
+static void
+key_schedule( u32 *x, u32 *z, u32 *k )
+{
+
+#define xi(i) ((x[(i)/4] >> (8*(3-((i)%4)))) & 0xff)
+#define zi(i) ((z[(i)/4] >> (8*(3-((i)%4)))) & 0xff)
+
+ z[0] = x[0] ^ s5[xi(13)]^s6[xi(15)]^s7[xi(12)]^s8[xi(14)]^s7[xi( 8)];
+ z[1] = x[2] ^ s5[zi( 0)]^s6[zi( 2)]^s7[zi( 1)]^s8[zi( 3)]^s8[xi(10)];
+ z[2] = x[3] ^ s5[zi( 7)]^s6[zi( 6)]^s7[zi( 5)]^s8[zi( 4)]^s5[xi( 9)];
+ z[3] = x[1] ^ s5[zi(10)]^s6[zi( 9)]^s7[zi(11)]^s8[zi( 8)]^s6[xi(11)];
+ k[0] = s5[zi( 8)]^s6[zi( 9)]^s7[zi( 7)]^s8[zi( 6)]^s5[zi( 2)];
+ k[1] = s5[zi(10)]^s6[zi(11)]^s7[zi( 5)]^s8[zi( 4)]^s6[zi( 6)];
+ k[2] = s5[zi(12)]^s6[zi(13)]^s7[zi( 3)]^s8[zi( 2)]^s7[zi( 9)];
+ k[3] = s5[zi(14)]^s6[zi(15)]^s7[zi( 1)]^s8[zi( 0)]^s8[zi(12)];
+
+ x[0] = z[2] ^ s5[zi( 5)]^s6[zi( 7)]^s7[zi( 4)]^s8[zi( 6)]^s7[zi( 0)];
+ x[1] = z[0] ^ s5[xi( 0)]^s6[xi( 2)]^s7[xi( 1)]^s8[xi( 3)]^s8[zi( 2)];
+ x[2] = z[1] ^ s5[xi( 7)]^s6[xi( 6)]^s7[xi( 5)]^s8[xi( 4)]^s5[zi( 1)];
+ x[3] = z[3] ^ s5[xi(10)]^s6[xi( 9)]^s7[xi(11)]^s8[xi( 8)]^s6[zi( 3)];
+ k[4] = s5[xi( 3)]^s6[xi( 2)]^s7[xi(12)]^s8[xi(13)]^s5[xi( 8)];
+ k[5] = s5[xi( 1)]^s6[xi( 0)]^s7[xi(14)]^s8[xi(15)]^s6[xi(13)];
+ k[6] = s5[xi( 7)]^s6[xi( 6)]^s7[xi( 8)]^s8[xi( 9)]^s7[xi( 3)];
+ k[7] = s5[xi( 5)]^s6[xi( 4)]^s7[xi(10)]^s8[xi(11)]^s8[xi( 7)];
+
+ z[0] = x[0] ^ s5[xi(13)]^s6[xi(15)]^s7[xi(12)]^s8[xi(14)]^s7[xi( 8)];
+ z[1] = x[2] ^ s5[zi( 0)]^s6[zi( 2)]^s7[zi( 1)]^s8[zi( 3)]^s8[xi(10)];
+ z[2] = x[3] ^ s5[zi( 7)]^s6[zi( 6)]^s7[zi( 5)]^s8[zi( 4)]^s5[xi( 9)];
+ z[3] = x[1] ^ s5[zi(10)]^s6[zi( 9)]^s7[zi(11)]^s8[zi( 8)]^s6[xi(11)];
+ k[8] = s5[zi( 3)]^s6[zi( 2)]^s7[zi(12)]^s8[zi(13)]^s5[zi( 9)];
+ k[9] = s5[zi( 1)]^s6[zi( 0)]^s7[zi(14)]^s8[zi(15)]^s6[zi(12)];
+ k[10]= s5[zi( 7)]^s6[zi( 6)]^s7[zi( 8)]^s8[zi( 9)]^s7[zi( 2)];
+ k[11]= s5[zi( 5)]^s6[zi( 4)]^s7[zi(10)]^s8[zi(11)]^s8[zi( 6)];
+
+ x[0] = z[2] ^ s5[zi( 5)]^s6[zi( 7)]^s7[zi( 4)]^s8[zi( 6)]^s7[zi( 0)];
+ x[1] = z[0] ^ s5[xi( 0)]^s6[xi( 2)]^s7[xi( 1)]^s8[xi( 3)]^s8[zi( 2)];
+ x[2] = z[1] ^ s5[xi( 7)]^s6[xi( 6)]^s7[xi( 5)]^s8[xi( 4)]^s5[zi( 1)];
+ x[3] = z[3] ^ s5[xi(10)]^s6[xi( 9)]^s7[xi(11)]^s8[xi( 8)]^s6[zi( 3)];
+ k[12]= s5[xi( 8)]^s6[xi( 9)]^s7[xi( 7)]^s8[xi( 6)]^s5[xi( 3)];
+ k[13]= s5[xi(10)]^s6[xi(11)]^s7[xi( 5)]^s8[xi( 4)]^s6[xi( 7)];
+ k[14]= s5[xi(12)]^s6[xi(13)]^s7[xi( 3)]^s8[xi( 2)]^s7[xi( 8)];
+ k[15]= s5[xi(14)]^s6[xi(15)]^s7[xi( 1)]^s8[xi( 0)]^s8[xi(13)];
+
+#undef xi
+#undef zi
+}
+
+
+static gcry_err_code_t
+do_cast_setkey( CAST5_context *c, const byte *key, unsigned keylen )
+{
+ static int initialized;
+ static const char* selftest_failed;
+ int i;
+ u32 x[4];
+ u32 z[4];
+ u32 k[16];
+
+ if( !initialized )
+ {
+ initialized = 1;
+ selftest_failed = selftest();
+ if( selftest_failed )
+ log_error ("CAST5 selftest failed (%s).\n", selftest_failed );
+ }
+ if( selftest_failed )
+ return GPG_ERR_SELFTEST_FAILED;
+
+ if( keylen != 16 )
+ return GPG_ERR_INV_KEYLEN;
+
+ x[0] = buf_get_be32(key + 0);
+ x[1] = buf_get_be32(key + 4);
+ x[2] = buf_get_be32(key + 8);
+ x[3] = buf_get_be32(key + 12);
+
+ key_schedule( x, z, k );
+ for(i=0; i < 16; i++ )
+ c->Km[i] = k[i];
+ key_schedule( x, z, k );
+ for(i=0; i < 16; i++ )
+ c->Kr[i] = k[i] & 0x1f;
+
+#ifdef USE_ARM_ASM
+ for (i = 0; i < 4; i++)
+ {
+ byte Kr_arm[4];
+
+ /* Convert rotate left to rotate right and add shift left
+ * by 2. */
+ Kr_arm[0] = ((32 - c->Kr[4 * i + 0]) - 2) & 0x1f;
+ Kr_arm[1] = ((32 - c->Kr[4 * i + 1]) - 2) & 0x1f;
+ Kr_arm[2] = ((32 - c->Kr[4 * i + 2]) - 2) & 0x1f;
+ Kr_arm[3] = ((32 - c->Kr[4 * i + 3]) - 2) & 0x1f;
+
+ /* Endian friendly store. */
+ c->Kr_arm_enc[i] = Kr_arm[0] |
+ (Kr_arm[1] << 8) |
+ (Kr_arm[2] << 16) |
+ (Kr_arm[3] << 24);
+ c->Kr_arm_dec[i] = Kr_arm[3] |
+ (Kr_arm[2] << 8) |
+ (Kr_arm[1] << 16) |
+ (Kr_arm[0] << 24);
+
+ wipememory(Kr_arm, sizeof(Kr_arm));
+ }
+#endif
+
+ wipememory(x, sizeof x);
+ wipememory(z, sizeof z);
+ wipememory(k, sizeof k);
+
+#undef xi
+#undef zi
+ return GPG_ERR_NO_ERROR;
+}
+
+static gcry_err_code_t
+cast_setkey (void *context, const byte *key, unsigned keylen,
+ cipher_bulk_ops_t *bulk_ops)
+{
+ CAST5_context *c = (CAST5_context *) context;
+ gcry_err_code_t rc = do_cast_setkey (c, key, keylen);
+
+ /* Setup bulk encryption routines. */
+ memset (bulk_ops, 0, sizeof(*bulk_ops));
+ bulk_ops->cfb_dec = _gcry_cast5_cfb_dec;
+ bulk_ops->cbc_dec = _gcry_cast5_cbc_dec;
+ bulk_ops->ctr_enc = _gcry_cast5_ctr_enc;
+
+ return rc;
+}
+
+
+gcry_cipher_spec_t _gcry_cipher_spec_cast5 =
+ {
+ GCRY_CIPHER_CAST5, {0, 0},
+ "CAST5", NULL, NULL, CAST5_BLOCKSIZE, 128, sizeof (CAST5_context),
+ cast_setkey, encrypt_block, decrypt_block
+ };
diff --git a/comm/third_party/libgcrypt/cipher/chacha20-aarch64.S b/comm/third_party/libgcrypt/cipher/chacha20-aarch64.S
new file mode 100644
index 0000000000..b8f9724a37
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/chacha20-aarch64.S
@@ -0,0 +1,648 @@
+/* chacha20-aarch64.S - ARMv8/AArch64 accelerated chacha20 blocks function
+ *
+ * Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on D. J. Bernstein reference implementation at
+ * http://cr.yp.to/chacha.html:
+ *
+ * chacha-regs.c version 20080118
+ * D. J. Bernstein
+ * Public domain.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON) && \
+ defined(USE_CHACHA20)
+
+.cpu generic+simd
+
+.text
+
+#include "asm-poly1305-aarch64.h"
+
+/* register macros */
+#define INPUT x0
+#define DST x1
+#define SRC x2
+#define NBLKS x3
+#define ROUND x4
+#define INPUT_CTR x5
+#define INPUT_POS x6
+#define CTR x7
+
+/* vector registers */
+#define X0 v16
+#define X1 v17
+#define X2 v18
+#define X3 v19
+#define X4 v20
+#define X5 v21
+#define X6 v22
+#define X7 v23
+#define X8 v24
+#define X9 v25
+#define X10 v26
+#define X11 v27
+#define X12 v28
+#define X13 v29
+#define X14 v30
+#define X15 v31
+
+#define VCTR v0
+#define VTMP0 v1
+#define VTMP1 v2
+#define VTMP2 v3
+#define VTMP3 v4
+#define X12_TMP v5
+#define X13_TMP v6
+#define ROT8 v7
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+
+#define _(...) __VA_ARGS__
+
+#define vpunpckldq(s1, s2, dst) \
+ zip1 dst.4s, s2.4s, s1.4s;
+
+#define vpunpckhdq(s1, s2, dst) \
+ zip2 dst.4s, s2.4s, s1.4s;
+
+#define vpunpcklqdq(s1, s2, dst) \
+ zip1 dst.2d, s2.2d, s1.2d;
+
+#define vpunpckhqdq(s1, s2, dst) \
+ zip2 dst.2d, s2.2d, s1.2d;
+
+/* 4x4 32-bit integer matrix transpose */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
+ vpunpckhdq(x1, x0, t2); \
+ vpunpckldq(x1, x0, x0); \
+ \
+ vpunpckldq(x3, x2, t1); \
+ vpunpckhdq(x3, x2, x2); \
+ \
+ vpunpckhqdq(t1, x0, x1); \
+ vpunpcklqdq(t1, x0, x0); \
+ \
+ vpunpckhqdq(x2, t2, x3); \
+ vpunpcklqdq(x2, t2, x2);
+
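The vpunpck* wrappers above map the x86-style unpack naming onto AArch64 zip1/zip2, and transpose_4x4 amounts to a plain 4x4 transpose of 32-bit lanes, which the 4-way code uses to regroup lane-sliced state words (word i of four blocks held in one vector) back into per-block order. A scalar sketch of the same operation, shown only for clarity:

#include <stdint.h>

/* 4x4 transpose of 32-bit words; m[i][j] corresponds to lane j of
 * vector xi in transpose_4x4. */
static void
transpose_4x4_scalar (uint32_t m[4][4])
{
  unsigned int i, j;

  for (i = 0; i < 4; i++)
    for (j = i + 1; j < 4; j++)
      {
        uint32_t t = m[i][j];
        m[i][j] = m[j][i];
        m[j][i] = t;
      }
}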
+#define clear(x) \
+ eor x.16b, x.16b, x.16b;
+
+/**********************************************************************
+ 4-way chacha20
+ **********************************************************************/
+
+#define XOR(d,s1,s2) \
+ eor d.16b, s2.16b, s1.16b;
+
+#define PLUS(ds,s) \
+ add ds.4s, ds.4s, s.4s;
+
+#define ROTATE4(dst1,dst2,dst3,dst4,c,src1,src2,src3,src4,iop1,iop2,iop3) \
+ shl dst1.4s, src1.4s, #(c); \
+ shl dst2.4s, src2.4s, #(c); \
+ iop1; \
+ shl dst3.4s, src3.4s, #(c); \
+ shl dst4.4s, src4.4s, #(c); \
+ iop2; \
+ sri dst1.4s, src1.4s, #(32 - (c)); \
+ sri dst2.4s, src2.4s, #(32 - (c)); \
+ iop3; \
+ sri dst3.4s, src3.4s, #(32 - (c)); \
+ sri dst4.4s, src4.4s, #(32 - (c));
+
+#define ROTATE4_8(dst1,dst2,dst3,dst4,src1,src2,src3,src4,iop1,iop2,iop3) \
+ tbl dst1.16b, {src1.16b}, ROT8.16b; \
+ iop1; \
+ tbl dst2.16b, {src2.16b}, ROT8.16b; \
+ iop2; \
+ tbl dst3.16b, {src3.16b}, ROT8.16b; \
+ iop3; \
+ tbl dst4.16b, {src4.16b}, ROT8.16b;
+
+#define ROTATE4_16(dst1,dst2,dst3,dst4,src1,src2,src3,src4,iop1) \
+ rev32 dst1.8h, src1.8h; \
+ rev32 dst2.8h, src2.8h; \
+ iop1; \
+ rev32 dst3.8h, src3.8h; \
+ rev32 dst4.8h, src4.8h;
+
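+/* Rotations pick the cheapest NEON sequence per amount: shl+sri pairs for
+ * the 12- and 7-bit rotates, rev32 on 16-bit halves for the 16-bit rotate,
+ * and a tbl byte shuffle through ROT8 for the 8-bit rotate.  The iop
+ * arguments let callers slot independent instructions between the vector
+ * operations. */
+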
+#define QUARTERROUND4(a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,a4,b4,c4,d4,ign,tmp1,tmp2,tmp3,tmp4,\
+ iop1,iop2,iop3,iop4,iop5,iop6,iop7,iop8,iop9,iop10,iop11,iop12,iop13,iop14,\
+ iop15,iop16,iop17,iop18,iop19,iop20,iop21,iop22,iop23,iop24,iop25,iop26,\
+ iop27,iop28,iop29) \
+ PLUS(a1,b1); PLUS(a2,b2); iop1; \
+ PLUS(a3,b3); PLUS(a4,b4); iop2; \
+ XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); iop3; \
+ XOR(tmp3,d3,a3); XOR(tmp4,d4,a4); iop4; \
+ ROTATE4_16(d1, d2, d3, d4, tmp1, tmp2, tmp3, tmp4, _(iop5)); \
+ iop6; \
+ PLUS(c1,d1); PLUS(c2,d2); iop7; \
+ PLUS(c3,d3); PLUS(c4,d4); iop8; \
+ XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); iop9; \
+ XOR(tmp3,b3,c3); XOR(tmp4,b4,c4); iop10; \
+ ROTATE4(b1, b2, b3, b4, 12, tmp1, tmp2, tmp3, tmp4, \
+ _(iop11), _(iop12), _(iop13)); iop14; \
+ PLUS(a1,b1); PLUS(a2,b2); iop15; \
+ PLUS(a3,b3); PLUS(a4,b4); iop16; \
+ XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); iop17; \
+ XOR(tmp3,d3,a3); XOR(tmp4,d4,a4); iop18; \
+ ROTATE4_8(d1, d2, d3, d4, tmp1, tmp2, tmp3, tmp4, \
+ _(iop19), _(iop20), _(iop21)); iop22; \
+ PLUS(c1,d1); PLUS(c2,d2); iop23; \
+ PLUS(c3,d3); PLUS(c4,d4); iop24; \
+ XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); iop25; \
+ XOR(tmp3,b3,c3); XOR(tmp4,b4,c4); iop26; \
+ ROTATE4(b1, b2, b3, b4, 7, tmp1, tmp2, tmp3, tmp4, \
+ _(iop27), _(iop28), _(iop29));
+
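+/* One QUARTERROUND4 invocation performs a full column (or diagonal) round:
+ * every Xn register holds word n of four independent blocks, so two
+ * invocations advance all four blocks by one double round.  The iop* slots
+ * are filled with Poly1305 instructions by the stitched entry point below. */
+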
+.align 4
+.globl _gcry_chacha20_aarch64_blocks4_data_inc_counter
+_gcry_chacha20_aarch64_blocks4_data_inc_counter:
+ .long 0,1,2,3
+
+.align 4
+.globl _gcry_chacha20_aarch64_blocks4_data_rot8
+_gcry_chacha20_aarch64_blocks4_data_rot8:
+ .byte 3,0,1,2
+ .byte 7,4,5,6
+ .byte 11,8,9,10
+ .byte 15,12,13,14
+
+.align 3
+.globl _gcry_chacha20_aarch64_blocks4
+ELF(.type _gcry_chacha20_aarch64_blocks4,%function;)
+
+_gcry_chacha20_aarch64_blocks4:
+ /* input:
+ * x0: input
+ * x1: dst
+ * x2: src
+ * x3: nblks (multiple of 4)
+ */
+ CFI_STARTPROC()
+
+ GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_rot8);
+ add INPUT_CTR, INPUT, #(12*4);
+ ld1 {ROT8.16b}, [CTR];
+ GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_inc_counter);
+ mov INPUT_POS, INPUT;
+ ld1 {VCTR.16b}, [CTR];
+
+.Loop4:
+ /* Construct counter vectors X12 and X13 */
+
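+	/* The 32-bit low counter word may wrap when lane offsets 0..3 are
+	 * added: cmhi flags lanes where the sum became smaller than the
+	 * offset, and subtracting that all-ones mask from X13 carries +1
+	 * into the high counter word. */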
+ ld1 {X15.16b}, [INPUT_CTR];
+ mov ROUND, #20;
+ ld1 {VTMP1.16b-VTMP3.16b}, [INPUT_POS];
+
+ dup X12.4s, X15.s[0];
+ dup X13.4s, X15.s[1];
+ ldr CTR, [INPUT_CTR];
+ add X12.4s, X12.4s, VCTR.4s;
+ dup X0.4s, VTMP1.s[0];
+ dup X1.4s, VTMP1.s[1];
+ dup X2.4s, VTMP1.s[2];
+ dup X3.4s, VTMP1.s[3];
+ dup X14.4s, X15.s[2];
+ cmhi VTMP0.4s, VCTR.4s, X12.4s;
+ dup X15.4s, X15.s[3];
+ add CTR, CTR, #4; /* Update counter */
+ dup X4.4s, VTMP2.s[0];
+ dup X5.4s, VTMP2.s[1];
+ dup X6.4s, VTMP2.s[2];
+ dup X7.4s, VTMP2.s[3];
+ sub X13.4s, X13.4s, VTMP0.4s;
+ dup X8.4s, VTMP3.s[0];
+ dup X9.4s, VTMP3.s[1];
+ dup X10.4s, VTMP3.s[2];
+ dup X11.4s, VTMP3.s[3];
+ mov X12_TMP.16b, X12.16b;
+ mov X13_TMP.16b, X13.16b;
+ str CTR, [INPUT_CTR];
+
+.Lround2:
+ subs ROUND, ROUND, #2
+ QUARTERROUND4(X0, X4, X8, X12, X1, X5, X9, X13,
+ X2, X6, X10, X14, X3, X7, X11, X15,
+ tmp:=,VTMP0,VTMP1,VTMP2,VTMP3,
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,)
+ QUARTERROUND4(X0, X5, X10, X15, X1, X6, X11, X12,
+ X2, X7, X8, X13, X3, X4, X9, X14,
+ tmp:=,VTMP0,VTMP1,VTMP2,VTMP3,
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,)
+ b.ne .Lround2;
+
+ ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS], #32;
+
+ PLUS(X12, X12_TMP); /* INPUT + 12 * 4 + counter */
+ PLUS(X13, X13_TMP); /* INPUT + 13 * 4 + counter */
+
+ dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 0 * 4 */
+ dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 1 * 4 */
+ dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 2 * 4 */
+ dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 3 * 4 */
+ PLUS(X0, VTMP2);
+ PLUS(X1, VTMP3);
+ PLUS(X2, X12_TMP);
+ PLUS(X3, X13_TMP);
+
+ dup VTMP2.4s, VTMP1.s[0]; /* INPUT + 4 * 4 */
+ dup VTMP3.4s, VTMP1.s[1]; /* INPUT + 5 * 4 */
+ dup X12_TMP.4s, VTMP1.s[2]; /* INPUT + 6 * 4 */
+ dup X13_TMP.4s, VTMP1.s[3]; /* INPUT + 7 * 4 */
+ ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS];
+ mov INPUT_POS, INPUT;
+ PLUS(X4, VTMP2);
+ PLUS(X5, VTMP3);
+ PLUS(X6, X12_TMP);
+ PLUS(X7, X13_TMP);
+
+ dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 8 * 4 */
+ dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 9 * 4 */
+ dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 10 * 4 */
+ dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 11 * 4 */
+ dup VTMP0.4s, VTMP1.s[2]; /* INPUT + 14 * 4 */
+ dup VTMP1.4s, VTMP1.s[3]; /* INPUT + 15 * 4 */
+ PLUS(X8, VTMP2);
+ PLUS(X9, VTMP3);
+ PLUS(X10, X12_TMP);
+ PLUS(X11, X13_TMP);
+ PLUS(X14, VTMP0);
+ PLUS(X15, VTMP1);
+
+ transpose_4x4(X0, X1, X2, X3, VTMP0, VTMP1, VTMP2);
+ transpose_4x4(X4, X5, X6, X7, VTMP0, VTMP1, VTMP2);
+ transpose_4x4(X8, X9, X10, X11, VTMP0, VTMP1, VTMP2);
+ transpose_4x4(X12, X13, X14, X15, VTMP0, VTMP1, VTMP2);
+
+ subs NBLKS, NBLKS, #4;
+
+ ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64;
+ ld1 {X12_TMP.16b-X13_TMP.16b}, [SRC], #32;
+ eor VTMP0.16b, X0.16b, VTMP0.16b;
+ eor VTMP1.16b, X4.16b, VTMP1.16b;
+ eor VTMP2.16b, X8.16b, VTMP2.16b;
+ eor VTMP3.16b, X12.16b, VTMP3.16b;
+ eor X12_TMP.16b, X1.16b, X12_TMP.16b;
+ eor X13_TMP.16b, X5.16b, X13_TMP.16b;
+ st1 {VTMP0.16b-VTMP3.16b}, [DST], #64;
+ ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64;
+ st1 {X12_TMP.16b-X13_TMP.16b}, [DST], #32;
+ ld1 {X12_TMP.16b-X13_TMP.16b}, [SRC], #32;
+ eor VTMP0.16b, X9.16b, VTMP0.16b;
+ eor VTMP1.16b, X13.16b, VTMP1.16b;
+ eor VTMP2.16b, X2.16b, VTMP2.16b;
+ eor VTMP3.16b, X6.16b, VTMP3.16b;
+ eor X12_TMP.16b, X10.16b, X12_TMP.16b;
+ eor X13_TMP.16b, X14.16b, X13_TMP.16b;
+ st1 {VTMP0.16b-VTMP3.16b}, [DST], #64;
+ ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64;
+ st1 {X12_TMP.16b-X13_TMP.16b}, [DST], #32;
+ eor VTMP0.16b, X3.16b, VTMP0.16b;
+ eor VTMP1.16b, X7.16b, VTMP1.16b;
+ eor VTMP2.16b, X11.16b, VTMP2.16b;
+ eor VTMP3.16b, X15.16b, VTMP3.16b;
+ st1 {VTMP0.16b-VTMP3.16b}, [DST], #64;
+
+ b.ne .Loop4;
+
+ /* clear the used vector registers and stack */
+ clear(VTMP0);
+ clear(VTMP1);
+ clear(VTMP2);
+ clear(VTMP3);
+ clear(X12_TMP);
+ clear(X13_TMP);
+ clear(X0);
+ clear(X1);
+ clear(X2);
+ clear(X3);
+ clear(X4);
+ clear(X5);
+ clear(X6);
+ clear(X7);
+ clear(X8);
+ clear(X9);
+ clear(X10);
+ clear(X11);
+ clear(X12);
+ clear(X13);
+ clear(X14);
+ clear(X15);
+
+ eor x0, x0, x0
+ ret
+ CFI_ENDPROC()
+ELF(.size _gcry_chacha20_aarch64_blocks4, .-_gcry_chacha20_aarch64_blocks4;)
+
+/**********************************************************************
+ 4-way stitched chacha20-poly1305
+ **********************************************************************/
+
+.align 3
+.globl _gcry_chacha20_poly1305_aarch64_blocks4
+ELF(.type _gcry_chacha20_poly1305_aarch64_blocks4,%function;)
+
+_gcry_chacha20_poly1305_aarch64_blocks4:
+ /* input:
+ * x0: input
+ * x1: dst
+ * x2: src
+ * x3: nblks (multiple of 4)
+ * x4: poly1305-state
+ * x5: poly1305-src
+ */
+ CFI_STARTPROC()
+ POLY1305_PUSH_REGS()
+
+ mov POLY_RSTATE, x4;
+ mov POLY_RSRC, x5;
+
+ GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_rot8);
+ add INPUT_CTR, INPUT, #(12*4);
+ ld1 {ROT8.16b}, [CTR];
+ GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_inc_counter);
+ mov INPUT_POS, INPUT;
+ ld1 {VCTR.16b}, [CTR];
+
+ POLY1305_LOAD_STATE()
+
+.Loop_poly4:
+ /* Construct counter vectors X12 and X13 */
+
+ ld1 {X15.16b}, [INPUT_CTR];
+ ld1 {VTMP1.16b-VTMP3.16b}, [INPUT_POS];
+
+ dup X12.4s, X15.s[0];
+ dup X13.4s, X15.s[1];
+ ldr CTR, [INPUT_CTR];
+ add X12.4s, X12.4s, VCTR.4s;
+ dup X0.4s, VTMP1.s[0];
+ dup X1.4s, VTMP1.s[1];
+ dup X2.4s, VTMP1.s[2];
+ dup X3.4s, VTMP1.s[3];
+ dup X14.4s, X15.s[2];
+ cmhi VTMP0.4s, VCTR.4s, X12.4s;
+ dup X15.4s, X15.s[3];
+ add CTR, CTR, #4; /* Update counter */
+ dup X4.4s, VTMP2.s[0];
+ dup X5.4s, VTMP2.s[1];
+ dup X6.4s, VTMP2.s[2];
+ dup X7.4s, VTMP2.s[3];
+ sub X13.4s, X13.4s, VTMP0.4s;
+ dup X8.4s, VTMP3.s[0];
+ dup X9.4s, VTMP3.s[1];
+ dup X10.4s, VTMP3.s[2];
+ dup X11.4s, VTMP3.s[3];
+ mov X12_TMP.16b, X12.16b;
+ mov X13_TMP.16b, X13.16b;
+ str CTR, [INPUT_CTR];
+
+ mov ROUND, #20
+.Lround4_with_poly1305_outer:
+ mov POLY_CHACHA_ROUND, #6;
+.Lround4_with_poly1305_inner1:
+ POLY1305_BLOCK_PART1(0 * 16)
+ QUARTERROUND4(X0, X4, X8, X12, X1, X5, X9, X13,
+ X2, X6, X10, X14, X3, X7, X11, X15,
+ tmp:=,VTMP0,VTMP1,VTMP2,VTMP3,
+ POLY1305_BLOCK_PART2(0 * 16),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6(),
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8(),
+ POLY1305_BLOCK_PART9(),
+ POLY1305_BLOCK_PART10(),
+ POLY1305_BLOCK_PART11(),
+ POLY1305_BLOCK_PART12(),
+ POLY1305_BLOCK_PART13(),
+ POLY1305_BLOCK_PART14(),
+ POLY1305_BLOCK_PART15(),
+ POLY1305_BLOCK_PART16(),
+ POLY1305_BLOCK_PART17(),
+ POLY1305_BLOCK_PART18(),
+ POLY1305_BLOCK_PART19(),
+ POLY1305_BLOCK_PART20(),
+ POLY1305_BLOCK_PART21(),
+ POLY1305_BLOCK_PART22(),
+ POLY1305_BLOCK_PART23(),
+ POLY1305_BLOCK_PART24(),
+ POLY1305_BLOCK_PART25(),
+ POLY1305_BLOCK_PART26(),
+ POLY1305_BLOCK_PART27(),
+ POLY1305_BLOCK_PART28(),
+ POLY1305_BLOCK_PART29(),
+ POLY1305_BLOCK_PART1(1 * 16))
+ POLY1305_BLOCK_PART2(1 * 16)
+ QUARTERROUND4(X0, X5, X10, X15, X1, X6, X11, X12,
+ X2, X7, X8, X13, X3, X4, X9, X14,
+ tmp:=,VTMP0,VTMP1,VTMP2,VTMP3,
+ _(add POLY_RSRC, POLY_RSRC, #(2*16)),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6(),
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8(),
+ POLY1305_BLOCK_PART9(),
+ POLY1305_BLOCK_PART10(),
+ POLY1305_BLOCK_PART11(),
+ POLY1305_BLOCK_PART12(),
+ POLY1305_BLOCK_PART13(),
+ POLY1305_BLOCK_PART14(),
+ POLY1305_BLOCK_PART15(),
+ POLY1305_BLOCK_PART16(),
+ POLY1305_BLOCK_PART17(),
+ POLY1305_BLOCK_PART18(),
+ POLY1305_BLOCK_PART19(),
+ POLY1305_BLOCK_PART20(),
+ POLY1305_BLOCK_PART21(),
+ POLY1305_BLOCK_PART22(),
+ POLY1305_BLOCK_PART23(),
+ POLY1305_BLOCK_PART24(),
+ POLY1305_BLOCK_PART25(),
+ POLY1305_BLOCK_PART26(),
+ POLY1305_BLOCK_PART27(),
+ POLY1305_BLOCK_PART28(),
+ POLY1305_BLOCK_PART29(),
+ _(subs POLY_CHACHA_ROUND, POLY_CHACHA_ROUND, #2));
+ b.ne .Lround4_with_poly1305_inner1;
+
+ mov POLY_CHACHA_ROUND, #4;
+.Lround4_with_poly1305_inner2:
+ POLY1305_BLOCK_PART1(0 * 16)
+ QUARTERROUND4(X0, X4, X8, X12, X1, X5, X9, X13,
+ X2, X6, X10, X14, X3, X7, X11, X15,
+ tmp:=,VTMP0,VTMP1,VTMP2,VTMP3,,
+ POLY1305_BLOCK_PART2(0 * 16),,
+ _(add POLY_RSRC, POLY_RSRC, #(1*16)),,
+ POLY1305_BLOCK_PART3(),,
+ POLY1305_BLOCK_PART4(),,
+ POLY1305_BLOCK_PART5(),,
+ POLY1305_BLOCK_PART6(),,
+ POLY1305_BLOCK_PART7(),,
+ POLY1305_BLOCK_PART8(),,
+ POLY1305_BLOCK_PART9(),,
+ POLY1305_BLOCK_PART10(),,
+ POLY1305_BLOCK_PART11(),,
+ POLY1305_BLOCK_PART12(),,
+ POLY1305_BLOCK_PART13(),,
+ POLY1305_BLOCK_PART14(),)
+ POLY1305_BLOCK_PART15()
+ QUARTERROUND4(X0, X5, X10, X15, X1, X6, X11, X12,
+ X2, X7, X8, X13, X3, X4, X9, X14,
+ tmp:=,VTMP0,VTMP1,VTMP2,VTMP3,
+ POLY1305_BLOCK_PART16(),,
+ POLY1305_BLOCK_PART17(),,
+ POLY1305_BLOCK_PART18(),,
+ POLY1305_BLOCK_PART19(),,
+ POLY1305_BLOCK_PART20(),,
+ POLY1305_BLOCK_PART21(),,
+ POLY1305_BLOCK_PART22(),,
+ POLY1305_BLOCK_PART23(),,
+ POLY1305_BLOCK_PART24(),,
+ POLY1305_BLOCK_PART25(),,
+ POLY1305_BLOCK_PART26(),,
+ POLY1305_BLOCK_PART27(),,
+ POLY1305_BLOCK_PART28(),,
+ POLY1305_BLOCK_PART29(),
+ _(subs POLY_CHACHA_ROUND, POLY_CHACHA_ROUND, #2),)
+ b.ne .Lround4_with_poly1305_inner2;
+
+ subs ROUND, ROUND, #10
+ b.ne .Lround4_with_poly1305_outer;
+
+ ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS], #32;
+
+ PLUS(X12, X12_TMP); /* INPUT + 12 * 4 + counter */
+ PLUS(X13, X13_TMP); /* INPUT + 13 * 4 + counter */
+
+ dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 0 * 4 */
+ dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 1 * 4 */
+ dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 2 * 4 */
+ dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 3 * 4 */
+ PLUS(X0, VTMP2);
+ PLUS(X1, VTMP3);
+ PLUS(X2, X12_TMP);
+ PLUS(X3, X13_TMP);
+
+ dup VTMP2.4s, VTMP1.s[0]; /* INPUT + 4 * 4 */
+ dup VTMP3.4s, VTMP1.s[1]; /* INPUT + 5 * 4 */
+ dup X12_TMP.4s, VTMP1.s[2]; /* INPUT + 6 * 4 */
+ dup X13_TMP.4s, VTMP1.s[3]; /* INPUT + 7 * 4 */
+ ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS];
+ mov INPUT_POS, INPUT;
+ PLUS(X4, VTMP2);
+ PLUS(X5, VTMP3);
+ PLUS(X6, X12_TMP);
+ PLUS(X7, X13_TMP);
+
+ dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 8 * 4 */
+ dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 9 * 4 */
+ dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 10 * 4 */
+ dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 11 * 4 */
+ dup VTMP0.4s, VTMP1.s[2]; /* INPUT + 14 * 4 */
+ dup VTMP1.4s, VTMP1.s[3]; /* INPUT + 15 * 4 */
+ PLUS(X8, VTMP2);
+ PLUS(X9, VTMP3);
+ PLUS(X10, X12_TMP);
+ PLUS(X11, X13_TMP);
+ PLUS(X14, VTMP0);
+ PLUS(X15, VTMP1);
+
+ transpose_4x4(X0, X1, X2, X3, VTMP0, VTMP1, VTMP2);
+ transpose_4x4(X4, X5, X6, X7, VTMP0, VTMP1, VTMP2);
+ transpose_4x4(X8, X9, X10, X11, VTMP0, VTMP1, VTMP2);
+ transpose_4x4(X12, X13, X14, X15, VTMP0, VTMP1, VTMP2);
+
+ subs NBLKS, NBLKS, #4;
+
+ ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64;
+ ld1 {X12_TMP.16b-X13_TMP.16b}, [SRC], #32;
+ eor VTMP0.16b, X0.16b, VTMP0.16b;
+ eor VTMP1.16b, X4.16b, VTMP1.16b;
+ eor VTMP2.16b, X8.16b, VTMP2.16b;
+ eor VTMP3.16b, X12.16b, VTMP3.16b;
+ eor X12_TMP.16b, X1.16b, X12_TMP.16b;
+ eor X13_TMP.16b, X5.16b, X13_TMP.16b;
+ st1 {VTMP0.16b-VTMP3.16b}, [DST], #64;
+ ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64;
+ st1 {X12_TMP.16b-X13_TMP.16b}, [DST], #32;
+ ld1 {X12_TMP.16b-X13_TMP.16b}, [SRC], #32;
+ eor VTMP0.16b, X9.16b, VTMP0.16b;
+ eor VTMP1.16b, X13.16b, VTMP1.16b;
+ eor VTMP2.16b, X2.16b, VTMP2.16b;
+ eor VTMP3.16b, X6.16b, VTMP3.16b;
+ eor X12_TMP.16b, X10.16b, X12_TMP.16b;
+ eor X13_TMP.16b, X14.16b, X13_TMP.16b;
+ st1 {VTMP0.16b-VTMP3.16b}, [DST], #64;
+ ld1 {VTMP0.16b-VTMP3.16b}, [SRC], #64;
+ st1 {X12_TMP.16b-X13_TMP.16b}, [DST], #32;
+ eor VTMP0.16b, X3.16b, VTMP0.16b;
+ eor VTMP1.16b, X7.16b, VTMP1.16b;
+ eor VTMP2.16b, X11.16b, VTMP2.16b;
+ eor VTMP3.16b, X15.16b, VTMP3.16b;
+ st1 {VTMP0.16b-VTMP3.16b}, [DST], #64;
+
+ b.ne .Loop_poly4;
+
+ POLY1305_STORE_STATE()
+
+ /* clear the used vector registers and stack */
+ clear(VTMP0);
+ clear(VTMP1);
+ clear(VTMP2);
+ clear(VTMP3);
+ clear(X12_TMP);
+ clear(X13_TMP);
+ clear(X0);
+ clear(X1);
+ clear(X2);
+ clear(X3);
+ clear(X4);
+ clear(X5);
+ clear(X6);
+ clear(X7);
+ clear(X8);
+ clear(X9);
+ clear(X10);
+ clear(X11);
+ clear(X12);
+ clear(X13);
+ clear(X14);
+ clear(X15);
+
+ eor x0, x0, x0
+ POLY1305_POP_REGS()
+ ret
+ CFI_ENDPROC()
+ELF(.size _gcry_chacha20_poly1305_aarch64_blocks4, .-_gcry_chacha20_poly1305_aarch64_blocks4;)
+
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/chacha20-amd64-avx2.S b/comm/third_party/libgcrypt/cipher/chacha20-amd64-avx2.S
new file mode 100644
index 0000000000..51e107be83
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/chacha20-amd64-avx2.S
@@ -0,0 +1,601 @@
+/* chacha20-amd64-avx2.S - AVX2 implementation of ChaCha20 cipher
+ *
+ * Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on D. J. Bernstein reference implementation at
+ * http://cr.yp.to/chacha.html:
+ *
+ * chacha-regs.c version 20080118
+ * D. J. Bernstein
+ * Public domain.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX2) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+.text
+
+#include "asm-common-amd64.h"
+#include "asm-poly1305-amd64.h"
+
+/* register macros */
+#define INPUT %rdi
+#define DST %rsi
+#define SRC %rdx
+#define NBLKS %rcx
+#define ROUND %eax
+
+/* stack structure */
+#define STACK_VEC_X12 (32)
+#define STACK_VEC_X13 (32 + STACK_VEC_X12)
+#define STACK_TMP (32 + STACK_VEC_X13)
+#define STACK_TMP1 (32 + STACK_TMP)
+
+#define STACK_MAX (32 + STACK_TMP1)
+
+/* vector registers */
+#define X0 %ymm0
+#define X1 %ymm1
+#define X2 %ymm2
+#define X3 %ymm3
+#define X4 %ymm4
+#define X5 %ymm5
+#define X6 %ymm6
+#define X7 %ymm7
+#define X8 %ymm8
+#define X9 %ymm9
+#define X10 %ymm10
+#define X11 %ymm11
+#define X12 %ymm12
+#define X13 %ymm13
+#define X14 %ymm14
+#define X15 %ymm15
+
+#define X0h %xmm0
+#define X1h %xmm1
+#define X2h %xmm2
+#define X3h %xmm3
+#define X4h %xmm4
+#define X5h %xmm5
+#define X6h %xmm6
+#define X7h %xmm7
+#define X8h %xmm8
+#define X9h %xmm9
+#define X10h %xmm10
+#define X11h %xmm11
+#define X12h %xmm12
+#define X13h %xmm13
+#define X14h %xmm14
+#define X15h %xmm15
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+
+/* 4x4 32-bit integer matrix transpose */
+#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+/* 2x2 128-bit matrix transpose */
+#define transpose_16byte_2x2(x0,x1,t1) \
+ vmovdqa x0, t1; \
+ vperm2i128 $0x20, x1, x0, x0; \
+ vperm2i128 $0x31, x1, t1, x1;
+
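+/* In the 8-way code each ymm register holds one state word from eight
+ * blocks (four per 128-bit lane), so output is rearranged in two steps:
+ * transpose_4x4 reorders 32-bit words within each 128-bit lane and
+ * transpose_16byte_2x2 then recombines the 128-bit lanes with vperm2i128. */
+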
+/* xor register with unaligned src and save to unaligned dst */
+#define xor_src_dst(dst, src, offset, xreg) \
+ vpxor offset(src), xreg, xreg; \
+ vmovdqu xreg, offset(dst);
+
+/**********************************************************************
+ 8-way chacha20
+ **********************************************************************/
+
+#define ROTATE2(v1,v2,c,tmp) \
+ vpsrld $(32 - (c)), v1, tmp; \
+ vpslld $(c), v1, v1; \
+ vpaddb tmp, v1, v1; \
+ vpsrld $(32 - (c)), v2, tmp; \
+ vpslld $(c), v2, v2; \
+ vpaddb tmp, v2, v2;
+
+#define ROTATE_SHUF_2(v1,v2,shuf) \
+ vpshufb shuf, v1, v1; \
+ vpshufb shuf, v2, v2;
+
+#define XOR(ds,s) \
+ vpxor s, ds, ds;
+
+#define PLUS(ds,s) \
+ vpaddd s, ds, ds;
+
+#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,\
+ interleave_op1,interleave_op2,\
+ interleave_op3,interleave_op4) \
+ vbroadcasti128 .Lshuf_rol16 rRIP, tmp1; \
+ interleave_op1; \
+ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
+ ROTATE_SHUF_2(d1, d2, tmp1); \
+ interleave_op2; \
+ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
+ ROTATE2(b1, b2, 12, tmp1); \
+ vbroadcasti128 .Lshuf_rol8 rRIP, tmp1; \
+ interleave_op3; \
+ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
+ ROTATE_SHUF_2(d1, d2, tmp1); \
+ interleave_op4; \
+ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
+ ROTATE2(b1, b2, 7, tmp1);
+
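+/* QUARTERROUND2 advances two column/diagonal groups at once.  The 16- and
+ * 8-bit rotates use vpshufb byte-permutation tables (.Lshuf_rol16 and
+ * .Lshuf_rol8); the 12- and 7-bit rotates use a shift pair whose halves are
+ * merged with an add, which acts as OR because the shifted bit ranges do
+ * not overlap.  The interleave_op slots take Poly1305 instructions in the
+ * stitched variant. */
+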
+.align 32
+chacha20_data:
+.Lshuf_rol16:
+ .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+.Lshuf_rol8:
+ .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+.Linc_counter:
+ .byte 0,1,2,3,4,5,6,7
+.Lunsigned_cmp:
+ .long 0x80000000
+
+.align 8
+.globl _gcry_chacha20_amd64_avx2_blocks8
+ELF(.type _gcry_chacha20_amd64_avx2_blocks8,@function;)
+
+_gcry_chacha20_amd64_avx2_blocks8:
+ /* input:
+ * %rdi: input
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: nblks (multiple of 8)
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ subq $STACK_MAX, %rsp;
+ andq $~31, %rsp;
+
+.Loop8:
+ mov $20, ROUND;
+
+ /* Construct counter vectors X12 and X13 */
+ vpmovzxbd .Linc_counter rRIP, X0;
+ vpbroadcastd .Lunsigned_cmp rRIP, X2;
+ vpbroadcastd (12 * 4)(INPUT), X12;
+ vpbroadcastd (13 * 4)(INPUT), X13;
+ vpaddd X0, X12, X12;
+ vpxor X2, X0, X0;
+ vpxor X2, X12, X1;
+ vpcmpgtd X1, X0, X0;
+ vpsubd X0, X13, X13;
+ vmovdqa X12, (STACK_VEC_X12)(%rsp);
+ vmovdqa X13, (STACK_VEC_X13)(%rsp);
+
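+	/* Overflow of the low counter word is detected by flipping the sign
+	 * bit (.Lunsigned_cmp) of both operands and using a signed compare,
+	 * which is equivalent to an unsigned compare; subtracting the
+	 * resulting all-ones mask carries +1 into the high word in X13. */
+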
+ /* Load vectors */
+ vpbroadcastd (0 * 4)(INPUT), X0;
+ vpbroadcastd (1 * 4)(INPUT), X1;
+ vpbroadcastd (2 * 4)(INPUT), X2;
+ vpbroadcastd (3 * 4)(INPUT), X3;
+ vpbroadcastd (4 * 4)(INPUT), X4;
+ vpbroadcastd (5 * 4)(INPUT), X5;
+ vpbroadcastd (6 * 4)(INPUT), X6;
+ vpbroadcastd (7 * 4)(INPUT), X7;
+ vpbroadcastd (8 * 4)(INPUT), X8;
+ vpbroadcastd (9 * 4)(INPUT), X9;
+ vpbroadcastd (10 * 4)(INPUT), X10;
+ vpbroadcastd (11 * 4)(INPUT), X11;
+ vpbroadcastd (14 * 4)(INPUT), X14;
+ vpbroadcastd (15 * 4)(INPUT), X15;
+ vmovdqa X15, (STACK_TMP)(%rsp);
+
+.Lround2:
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,,,,)
+ vmovdqa (STACK_TMP)(%rsp), X15;
+ vmovdqa X8, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,,,,)
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,,,,)
+ vmovdqa (STACK_TMP)(%rsp), X8;
+ vmovdqa X15, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,,,,)
+ sub $2, ROUND;
+ jnz .Lround2;
+
+ vmovdqa X8, (STACK_TMP1)(%rsp);
+
+ /* tmp := X15 */
+ vpbroadcastd (0 * 4)(INPUT), X15;
+ PLUS(X0, X15);
+ vpbroadcastd (1 * 4)(INPUT), X15;
+ PLUS(X1, X15);
+ vpbroadcastd (2 * 4)(INPUT), X15;
+ PLUS(X2, X15);
+ vpbroadcastd (3 * 4)(INPUT), X15;
+ PLUS(X3, X15);
+ vpbroadcastd (4 * 4)(INPUT), X15;
+ PLUS(X4, X15);
+ vpbroadcastd (5 * 4)(INPUT), X15;
+ PLUS(X5, X15);
+ vpbroadcastd (6 * 4)(INPUT), X15;
+ PLUS(X6, X15);
+ vpbroadcastd (7 * 4)(INPUT), X15;
+ PLUS(X7, X15);
+ transpose_4x4(X0, X1, X2, X3, X8, X15);
+ transpose_4x4(X4, X5, X6, X7, X8, X15);
+ vmovdqa (STACK_TMP1)(%rsp), X8;
+ transpose_16byte_2x2(X0, X4, X15);
+ transpose_16byte_2x2(X1, X5, X15);
+ transpose_16byte_2x2(X2, X6, X15);
+ transpose_16byte_2x2(X3, X7, X15);
+ vmovdqa (STACK_TMP)(%rsp), X15;
+ xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0);
+ xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1);
+ vpbroadcastd (8 * 4)(INPUT), X0;
+ PLUS(X8, X0);
+ vpbroadcastd (9 * 4)(INPUT), X0;
+ PLUS(X9, X0);
+ vpbroadcastd (10 * 4)(INPUT), X0;
+ PLUS(X10, X0);
+ vpbroadcastd (11 * 4)(INPUT), X0;
+ PLUS(X11, X0);
+ vmovdqa (STACK_VEC_X12)(%rsp), X0;
+ PLUS(X12, X0);
+ vmovdqa (STACK_VEC_X13)(%rsp), X0;
+ PLUS(X13, X0);
+ vpbroadcastd (14 * 4)(INPUT), X0;
+ PLUS(X14, X0);
+ vpbroadcastd (15 * 4)(INPUT), X0;
+ PLUS(X15, X0);
+ xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2);
+ xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3);
+
+ /* Update counter */
+ addq $8, (12 * 4)(INPUT);
+
+ transpose_4x4(X8, X9, X10, X11, X0, X1);
+ transpose_4x4(X12, X13, X14, X15, X0, X1);
+ xor_src_dst(DST, SRC, (64 * 4 + 16 * 0), X4);
+ xor_src_dst(DST, SRC, (64 * 5 + 16 * 0), X5);
+ transpose_16byte_2x2(X8, X12, X0);
+ transpose_16byte_2x2(X9, X13, X0);
+ transpose_16byte_2x2(X10, X14, X0);
+ transpose_16byte_2x2(X11, X15, X0);
+ xor_src_dst(DST, SRC, (64 * 6 + 16 * 0), X6);
+ xor_src_dst(DST, SRC, (64 * 7 + 16 * 0), X7);
+ xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8);
+ xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9);
+ xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10);
+ xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11);
+ xor_src_dst(DST, SRC, (64 * 4 + 16 * 2), X12);
+ xor_src_dst(DST, SRC, (64 * 5 + 16 * 2), X13);
+ xor_src_dst(DST, SRC, (64 * 6 + 16 * 2), X14);
+ xor_src_dst(DST, SRC, (64 * 7 + 16 * 2), X15);
+
+ sub $8, NBLKS;
+ lea (8 * 64)(DST), DST;
+ lea (8 * 64)(SRC), SRC;
+ jnz .Loop8;
+
+ /* clear the used vector registers and stack */
+ vpxor X0, X0, X0;
+ vmovdqa X0, (STACK_VEC_X12)(%rsp);
+ vmovdqa X0, (STACK_VEC_X13)(%rsp);
+ vmovdqa X0, (STACK_TMP)(%rsp);
+ vmovdqa X0, (STACK_TMP1)(%rsp);
+ vzeroall;
+
+ /* eax zeroed by round loop. */
+ leave;
+ CFI_LEAVE();
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_chacha20_amd64_avx2_blocks8,
+ .-_gcry_chacha20_amd64_avx2_blocks8;)
+
+/**********************************************************************
+ 8-way stitched chacha20-poly1305
+ **********************************************************************/
+
+#define _ /*_*/
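+
+/* '_' expands to nothing; it is passed for interleave slots that carry no
+ * Poly1305 instruction in a given round. */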
+
+.align 8
+.globl _gcry_chacha20_poly1305_amd64_avx2_blocks8
+ELF(.type _gcry_chacha20_poly1305_amd64_avx2_blocks8,@function;)
+
+_gcry_chacha20_poly1305_amd64_avx2_blocks8:
+ /* input:
+ * %rdi: input
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: nblks (multiple of 8)
+ * %r9: poly1305-state
+ * %r8: poly1305-src
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ vzeroupper;
+
+ subq $(9 * 8) + STACK_MAX + 32, %rsp;
+ andq $~31, %rsp;
+
+ movq %rbx, (STACK_MAX + 0 * 8)(%rsp);
+ movq %r12, (STACK_MAX + 1 * 8)(%rsp);
+ movq %r13, (STACK_MAX + 2 * 8)(%rsp);
+ movq %r14, (STACK_MAX + 3 * 8)(%rsp);
+ movq %r15, (STACK_MAX + 4 * 8)(%rsp);
+ CFI_REG_ON_STACK(rbx, STACK_MAX + 0 * 8);
+ CFI_REG_ON_STACK(r12, STACK_MAX + 1 * 8);
+ CFI_REG_ON_STACK(r13, STACK_MAX + 2 * 8);
+ CFI_REG_ON_STACK(r14, STACK_MAX + 3 * 8);
+ CFI_REG_ON_STACK(r15, STACK_MAX + 4 * 8);
+
+ movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC
+ movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST
+ movq %rcx, (STACK_MAX + 7 * 8)(%rsp); # NBLKS
+
+ /* Load state */
+ POLY1305_LOAD_STATE();
+
+.Loop_poly8:
+
+ /* Construct counter vectors X12 and X13 */
+ vpmovzxbd .Linc_counter rRIP, X0;
+ vpbroadcastd .Lunsigned_cmp rRIP, X2;
+ vpbroadcastd (12 * 4)(INPUT), X12;
+ vpbroadcastd (13 * 4)(INPUT), X13;
+ vpaddd X0, X12, X12;
+ vpxor X2, X0, X0;
+ vpxor X2, X12, X1;
+ vpcmpgtd X1, X0, X0;
+ vpsubd X0, X13, X13;
+ vmovdqa X12, (STACK_VEC_X12)(%rsp);
+ vmovdqa X13, (STACK_VEC_X13)(%rsp);
+
+ /* Load vectors */
+ vpbroadcastd (0 * 4)(INPUT), X0;
+ vpbroadcastd (1 * 4)(INPUT), X1;
+ vpbroadcastd (2 * 4)(INPUT), X2;
+ vpbroadcastd (3 * 4)(INPUT), X3;
+ vpbroadcastd (4 * 4)(INPUT), X4;
+ vpbroadcastd (5 * 4)(INPUT), X5;
+ vpbroadcastd (6 * 4)(INPUT), X6;
+ vpbroadcastd (7 * 4)(INPUT), X7;
+ vpbroadcastd (8 * 4)(INPUT), X8;
+ vpbroadcastd (9 * 4)(INPUT), X9;
+ vpbroadcastd (10 * 4)(INPUT), X10;
+ vpbroadcastd (11 * 4)(INPUT), X11;
+ vpbroadcastd (14 * 4)(INPUT), X14;
+ vpbroadcastd (15 * 4)(INPUT), X15;
+ vmovdqa X15, (STACK_TMP)(%rsp);
+
+ /* Process eight ChaCha20 blocks and 32 Poly1305 blocks. */
+
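+	/* Each 64-byte ChaCha20 block covers four 16-byte Poly1305 blocks.
+	 * Per outer iteration the first inner loop runs three passes of two
+	 * rounds and four Poly1305 blocks, the second runs two passes of two
+	 * rounds and two Poly1305 blocks; two outer iterations give the full
+	 * 20 rounds and 32 Poly1305 blocks. */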
+ movl $20, (STACK_MAX + 8 * 8 + 4)(%rsp);
+.Lround8_with_poly1305_outer:
+ movl $6, (STACK_MAX + 8 * 8)(%rsp);
+.Lround8_with_poly1305_inner1:
+ /* rounds 0-5 & 10-15 */
+ POLY1305_BLOCK_PART1(0 * 16)
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+ vmovdqa (STACK_TMP)(%rsp), X15;
+ vmovdqa X8, (STACK_TMP)(%rsp);
+ POLY1305_BLOCK_PART1(1 * 16)
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+ POLY1305_BLOCK_PART1(2 * 16)
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+ vmovdqa (STACK_TMP)(%rsp), X8;
+ vmovdqa X15, (STACK_TMP)(%rsp);
+ POLY1305_BLOCK_PART1(3 * 16)
+ lea (4 * 16)(POLY_RSRC), POLY_RSRC;
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+
+ subl $2, (STACK_MAX + 8 * 8)(%rsp);
+ jnz .Lround8_with_poly1305_inner1;
+
+ movl $4, (STACK_MAX + 8 * 8)(%rsp);
+.Lround8_with_poly1305_inner2:
+ /* rounds 6-9 & 16-19 */
+ POLY1305_BLOCK_PART1(0 * 16)
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
+ POLY1305_BLOCK_PART2(),
+ _,
+ POLY1305_BLOCK_PART3(),
+ _)
+ vmovdqa (STACK_TMP)(%rsp), X15;
+ vmovdqa X8, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
+ _,
+ POLY1305_BLOCK_PART4(),
+ _,
+ POLY1305_BLOCK_PART5())
+ POLY1305_BLOCK_PART1(1 * 16);
+ lea (2 * 16)(POLY_RSRC), POLY_RSRC;
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
+ _,
+ POLY1305_BLOCK_PART2(),
+ _,
+ POLY1305_BLOCK_PART3())
+ vmovdqa (STACK_TMP)(%rsp), X8;
+ vmovdqa X15, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
+ POLY1305_BLOCK_PART4(),
+ _,
+ POLY1305_BLOCK_PART5(),
+ _)
+
+ subl $2, (STACK_MAX + 8 * 8)(%rsp);
+ jnz .Lround8_with_poly1305_inner2;
+
+ subl $10, (STACK_MAX + 8 * 8 + 4)(%rsp);
+ jnz .Lround8_with_poly1305_outer;
+
+ movq (STACK_MAX + 5 * 8)(%rsp), SRC;
+ movq (STACK_MAX + 6 * 8)(%rsp), DST;
+
+ vmovdqa X8, (STACK_TMP1)(%rsp);
+
+ /* tmp := X15 */
+ vpbroadcastd (0 * 4)(INPUT), X15;
+ PLUS(X0, X15);
+ vpbroadcastd (1 * 4)(INPUT), X15;
+ PLUS(X1, X15);
+ vpbroadcastd (2 * 4)(INPUT), X15;
+ PLUS(X2, X15);
+ vpbroadcastd (3 * 4)(INPUT), X15;
+ PLUS(X3, X15);
+ vpbroadcastd (4 * 4)(INPUT), X15;
+ PLUS(X4, X15);
+ vpbroadcastd (5 * 4)(INPUT), X15;
+ PLUS(X5, X15);
+ vpbroadcastd (6 * 4)(INPUT), X15;
+ PLUS(X6, X15);
+ vpbroadcastd (7 * 4)(INPUT), X15;
+ PLUS(X7, X15);
+ transpose_4x4(X0, X1, X2, X3, X8, X15);
+ transpose_4x4(X4, X5, X6, X7, X8, X15);
+ vmovdqa (STACK_TMP1)(%rsp), X8;
+ transpose_16byte_2x2(X0, X4, X15);
+ transpose_16byte_2x2(X1, X5, X15);
+ transpose_16byte_2x2(X2, X6, X15);
+ transpose_16byte_2x2(X3, X7, X15);
+ vmovdqa (STACK_TMP)(%rsp), X15;
+ xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0);
+ xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1);
+ vpbroadcastd (8 * 4)(INPUT), X0;
+ PLUS(X8, X0);
+ vpbroadcastd (9 * 4)(INPUT), X0;
+ PLUS(X9, X0);
+ vpbroadcastd (10 * 4)(INPUT), X0;
+ PLUS(X10, X0);
+ vpbroadcastd (11 * 4)(INPUT), X0;
+ PLUS(X11, X0);
+ vmovdqa (STACK_VEC_X12)(%rsp), X0;
+ PLUS(X12, X0);
+ vmovdqa (STACK_VEC_X13)(%rsp), X0;
+ PLUS(X13, X0);
+ vpbroadcastd (14 * 4)(INPUT), X0;
+ PLUS(X14, X0);
+ vpbroadcastd (15 * 4)(INPUT), X0;
+ PLUS(X15, X0);
+ xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2);
+ xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3);
+
+ /* Update counter */
+ addq $8, (12 * 4)(INPUT);
+
+ transpose_4x4(X8, X9, X10, X11, X0, X1);
+ transpose_4x4(X12, X13, X14, X15, X0, X1);
+ xor_src_dst(DST, SRC, (64 * 4 + 16 * 0), X4);
+ xor_src_dst(DST, SRC, (64 * 5 + 16 * 0), X5);
+ transpose_16byte_2x2(X8, X12, X0);
+ transpose_16byte_2x2(X9, X13, X0);
+ transpose_16byte_2x2(X10, X14, X0);
+ transpose_16byte_2x2(X11, X15, X0);
+ xor_src_dst(DST, SRC, (64 * 6 + 16 * 0), X6);
+ xor_src_dst(DST, SRC, (64 * 7 + 16 * 0), X7);
+ xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8);
+ xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9);
+ xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10);
+ xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11);
+ xor_src_dst(DST, SRC, (64 * 4 + 16 * 2), X12);
+ xor_src_dst(DST, SRC, (64 * 5 + 16 * 2), X13);
+ xor_src_dst(DST, SRC, (64 * 6 + 16 * 2), X14);
+ xor_src_dst(DST, SRC, (64 * 7 + 16 * 2), X15);
+
+ subq $8, (STACK_MAX + 7 * 8)(%rsp); # NBLKS
+
+ lea (8 * 64)(DST), DST;
+ lea (8 * 64)(SRC), SRC;
+ movq SRC, (STACK_MAX + 5 * 8)(%rsp);
+ movq DST, (STACK_MAX + 6 * 8)(%rsp);
+
+ jnz .Loop_poly8;
+
+ /* Store state */
+ POLY1305_STORE_STATE();
+
+ /* clear the used vector registers and stack */
+ vpxor X0, X0, X0;
+ vmovdqa X0, (STACK_VEC_X12)(%rsp);
+ vmovdqa X0, (STACK_VEC_X13)(%rsp);
+ vmovdqa X0, (STACK_TMP)(%rsp);
+ vmovdqa X0, (STACK_TMP1)(%rsp);
+ vzeroall;
+
+ movq (STACK_MAX + 0 * 8)(%rsp), %rbx;
+ movq (STACK_MAX + 1 * 8)(%rsp), %r12;
+ movq (STACK_MAX + 2 * 8)(%rsp), %r13;
+ movq (STACK_MAX + 3 * 8)(%rsp), %r14;
+ movq (STACK_MAX + 4 * 8)(%rsp), %r15;
+ CFI_RESTORE(%rbx);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+ CFI_RESTORE(%r14);
+ CFI_RESTORE(%r15);
+
+ xorl %eax, %eax;
+ leave;
+ CFI_LEAVE();
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_chacha20_poly1305_amd64_avx2_blocks8,
+ .-_gcry_chacha20_poly1305_amd64_avx2_blocks8;)
+
+#endif /*HAVE_GCC_INLINE_ASM_AVX2 && (HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS || HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)*/
+#endif /*__x86_64*/
diff --git a/comm/third_party/libgcrypt/cipher/chacha20-amd64-ssse3.S b/comm/third_party/libgcrypt/cipher/chacha20-amd64-ssse3.S
new file mode 100644
index 0000000000..9cdb69ae6d
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/chacha20-amd64-ssse3.S
@@ -0,0 +1,1012 @@
+/* chacha20-amd64-ssse3.S - SSSE3 implementation of ChaCha20 cipher
+ *
+ * Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on D. J. Bernstein reference implementation at
+ * http://cr.yp.to/chacha.html:
+ *
+ * chacha-regs.c version 20080118
+ * D. J. Bernstein
+ * Public domain.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+.text
+
+#include "asm-common-amd64.h"
+#include "asm-poly1305-amd64.h"
+
+/* register macros */
+#define INPUT %rdi
+#define DST %rsi
+#define SRC %rdx
+#define NBLKS %rcx
+#define ROUND %eax
+
+/* stack structure */
+#define STACK_VEC_X12 (16)
+#define STACK_VEC_X13 (16 + STACK_VEC_X12)
+#define STACK_TMP (16 + STACK_VEC_X13)
+#define STACK_TMP1 (16 + STACK_TMP)
+#define STACK_TMP2 (16 + STACK_TMP1)
+
+#define STACK_MAX (16 + STACK_TMP2)
+
+/* vector registers */
+#define X0 %xmm0
+#define X1 %xmm1
+#define X2 %xmm2
+#define X3 %xmm3
+#define X4 %xmm4
+#define X5 %xmm5
+#define X6 %xmm6
+#define X7 %xmm7
+#define X8 %xmm8
+#define X9 %xmm9
+#define X10 %xmm10
+#define X11 %xmm11
+#define X12 %xmm12
+#define X13 %xmm13
+#define X14 %xmm14
+#define X15 %xmm15
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+
+/* 4x4 32-bit integer matrix transpose */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
+ movdqa x0, t2; \
+ punpckhdq x1, t2; \
+ punpckldq x1, x0; \
+ \
+ movdqa x2, t1; \
+ punpckldq x3, t1; \
+ punpckhdq x3, x2; \
+ \
+ movdqa x0, x1; \
+ punpckhqdq t1, x1; \
+ punpcklqdq t1, x0; \
+ \
+ movdqa t2, x3; \
+ punpckhqdq x2, x3; \
+ punpcklqdq x2, t2; \
+ movdqa t2, x2;
+
+/* fill xmm register with 32-bit value from memory */
+#define pbroadcastd(mem32, xreg) \
+ movd mem32, xreg; \
+ pshufd $0, xreg, xreg;
+
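+/* SSSE3 lacks a broadcast instruction, so pbroadcastd loads the 32-bit word
+ * with movd and replicates it across all four lanes with pshufd. */
+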
+/* xor with unaligned memory operand */
+#define pxor_u(umem128, xreg, t) \
+ movdqu umem128, t; \
+ pxor t, xreg;
+
+/* xor register with unaligned src and save to unaligned dst */
+#define xor_src_dst(dst, src, offset, xreg, t) \
+ pxor_u(offset(src), xreg, t); \
+ movdqu xreg, offset(dst);
+
+#define clear(x) pxor x,x;
+
+/**********************************************************************
+ 4-way chacha20
+ **********************************************************************/
+
+#define ROTATE2(v1,v2,c,tmp1,tmp2) \
+ movdqa v1, tmp1; \
+ movdqa v2, tmp2; \
+ psrld $(32 - (c)), v1; \
+ pslld $(c), tmp1; \
+ paddb tmp1, v1; \
+ psrld $(32 - (c)), v2; \
+ pslld $(c), tmp2; \
+ paddb tmp2, v2;
+
+#define ROTATE_SHUF_2(v1,v2,shuf) \
+ pshufb shuf, v1; \
+ pshufb shuf, v2;
+
+#define XOR(ds,s) \
+ pxor s, ds;
+
+#define PLUS(ds,s) \
+ paddd s, ds;
+
+#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2,\
+ interleave_op1,interleave_op2) \
+ movdqa .Lshuf_rol16 rRIP, tmp1; \
+ interleave_op1; \
+ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
+ ROTATE_SHUF_2(d1, d2, tmp1); \
+ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
+ ROTATE2(b1, b2, 12, tmp1, tmp2); \
+ movdqa .Lshuf_rol8 rRIP, tmp1; \
+ interleave_op2; \
+ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
+ ROTATE_SHUF_2(d1, d2, tmp1); \
+ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
+ ROTATE2(b1, b2, 7, tmp1, tmp2);
+
+chacha20_data:
+.align 16
+.Lshuf_rol16:
+ .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+.Lshuf_rol8:
+ .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+.Lcounter1:
+ .long 1,0,0,0
+.Linc_counter:
+ .long 0,1,2,3
+.Lunsigned_cmp:
+ .long 0x80000000,0x80000000,0x80000000,0x80000000
+
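+/* .Lshuf_rol16 and .Lshuf_rol8 are pshufb byte-permutation tables for the
+ * 16- and 8-bit rotates, .Lcounter1 adds one to the 64-bit block counter,
+ * .Linc_counter offsets the four parallel block counters, and
+ * .Lunsigned_cmp is the sign bias used to emulate unsigned compares with
+ * pcmpgtd. */
+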
+.align 8
+.globl _gcry_chacha20_amd64_ssse3_blocks4
+ELF(.type _gcry_chacha20_amd64_ssse3_blocks4,@function;)
+
+_gcry_chacha20_amd64_ssse3_blocks4:
+ /* input:
+ * %rdi: input
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: nblks (multiple of 4)
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ subq $STACK_MAX, %rsp;
+ andq $~15, %rsp;
+
+.Loop4:
+ mov $20, ROUND;
+
+ /* Construct counter vectors X12 and X13 */
+ movdqa .Linc_counter rRIP, X0;
+ movdqa .Lunsigned_cmp rRIP, X2;
+ pbroadcastd((12 * 4)(INPUT), X12);
+ pbroadcastd((13 * 4)(INPUT), X13);
+ paddd X0, X12;
+ movdqa X12, X1;
+ pxor X2, X0;
+ pxor X2, X1;
+ pcmpgtd X1, X0;
+ psubd X0, X13;
+ movdqa X12, (STACK_VEC_X12)(%rsp);
+ movdqa X13, (STACK_VEC_X13)(%rsp);
+
+ /* Load vectors */
+ pbroadcastd((0 * 4)(INPUT), X0);
+ pbroadcastd((1 * 4)(INPUT), X1);
+ pbroadcastd((2 * 4)(INPUT), X2);
+ pbroadcastd((3 * 4)(INPUT), X3);
+ pbroadcastd((4 * 4)(INPUT), X4);
+ pbroadcastd((5 * 4)(INPUT), X5);
+ pbroadcastd((6 * 4)(INPUT), X6);
+ pbroadcastd((7 * 4)(INPUT), X7);
+ pbroadcastd((8 * 4)(INPUT), X8);
+ pbroadcastd((9 * 4)(INPUT), X9);
+ pbroadcastd((10 * 4)(INPUT), X10);
+ pbroadcastd((11 * 4)(INPUT), X11);
+ pbroadcastd((14 * 4)(INPUT), X14);
+ pbroadcastd((15 * 4)(INPUT), X15);
+ movdqa X11, (STACK_TMP)(%rsp);
+ movdqa X15, (STACK_TMP1)(%rsp);
+
+.Lround2_4:
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,,)
+ movdqa (STACK_TMP)(%rsp), X11;
+ movdqa (STACK_TMP1)(%rsp), X15;
+ movdqa X8, (STACK_TMP)(%rsp);
+ movdqa X9, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,,)
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,,)
+ movdqa (STACK_TMP)(%rsp), X8;
+ movdqa (STACK_TMP1)(%rsp), X9;
+ movdqa X11, (STACK_TMP)(%rsp);
+ movdqa X15, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,,)
+ sub $2, ROUND;
+ jnz .Lround2_4;
+
+ /* tmp := X15 */
+ movdqa (STACK_TMP)(%rsp), X11;
+ pbroadcastd((0 * 4)(INPUT), X15);
+ PLUS(X0, X15);
+ pbroadcastd((1 * 4)(INPUT), X15);
+ PLUS(X1, X15);
+ pbroadcastd((2 * 4)(INPUT), X15);
+ PLUS(X2, X15);
+ pbroadcastd((3 * 4)(INPUT), X15);
+ PLUS(X3, X15);
+ pbroadcastd((4 * 4)(INPUT), X15);
+ PLUS(X4, X15);
+ pbroadcastd((5 * 4)(INPUT), X15);
+ PLUS(X5, X15);
+ pbroadcastd((6 * 4)(INPUT), X15);
+ PLUS(X6, X15);
+ pbroadcastd((7 * 4)(INPUT), X15);
+ PLUS(X7, X15);
+ pbroadcastd((8 * 4)(INPUT), X15);
+ PLUS(X8, X15);
+ pbroadcastd((9 * 4)(INPUT), X15);
+ PLUS(X9, X15);
+ pbroadcastd((10 * 4)(INPUT), X15);
+ PLUS(X10, X15);
+ pbroadcastd((11 * 4)(INPUT), X15);
+ PLUS(X11, X15);
+ movdqa (STACK_VEC_X12)(%rsp), X15;
+ PLUS(X12, X15);
+ movdqa (STACK_VEC_X13)(%rsp), X15;
+ PLUS(X13, X15);
+ movdqa X13, (STACK_TMP)(%rsp);
+ pbroadcastd((14 * 4)(INPUT), X15);
+ PLUS(X14, X15);
+ movdqa (STACK_TMP1)(%rsp), X15;
+ movdqa X14, (STACK_TMP1)(%rsp);
+ pbroadcastd((15 * 4)(INPUT), X13);
+ PLUS(X15, X13);
+ movdqa X15, (STACK_TMP2)(%rsp);
+
+ /* Update counter */
+ addq $4, (12 * 4)(INPUT);
+
+ transpose_4x4(X0, X1, X2, X3, X13, X14, X15);
+ xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0, X15);
+ xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1, X15);
+ xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2, X15);
+ xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3, X15);
+ transpose_4x4(X4, X5, X6, X7, X0, X1, X2);
+ movdqa (STACK_TMP)(%rsp), X13;
+ movdqa (STACK_TMP1)(%rsp), X14;
+ movdqa (STACK_TMP2)(%rsp), X15;
+ xor_src_dst(DST, SRC, (64 * 0 + 16 * 1), X4, X0);
+ xor_src_dst(DST, SRC, (64 * 1 + 16 * 1), X5, X0);
+ xor_src_dst(DST, SRC, (64 * 2 + 16 * 1), X6, X0);
+ xor_src_dst(DST, SRC, (64 * 3 + 16 * 1), X7, X0);
+ transpose_4x4(X8, X9, X10, X11, X0, X1, X2);
+ xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8, X0);
+ xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9, X0);
+ xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10, X0);
+ xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11, X0);
+ transpose_4x4(X12, X13, X14, X15, X0, X1, X2);
+ xor_src_dst(DST, SRC, (64 * 0 + 16 * 3), X12, X0);
+ xor_src_dst(DST, SRC, (64 * 1 + 16 * 3), X13, X0);
+ xor_src_dst(DST, SRC, (64 * 2 + 16 * 3), X14, X0);
+ xor_src_dst(DST, SRC, (64 * 3 + 16 * 3), X15, X0);
+
+ sub $4, NBLKS;
+ lea (4 * 64)(DST), DST;
+ lea (4 * 64)(SRC), SRC;
+ jnz .Loop4;
+
+ /* clear the used vector registers and stack */
+ clear(X0);
+ movdqa X0, (STACK_VEC_X12)(%rsp);
+ movdqa X0, (STACK_VEC_X13)(%rsp);
+ movdqa X0, (STACK_TMP)(%rsp);
+ movdqa X0, (STACK_TMP1)(%rsp);
+ movdqa X0, (STACK_TMP2)(%rsp);
+ clear(X1);
+ clear(X2);
+ clear(X3);
+ clear(X4);
+ clear(X5);
+ clear(X6);
+ clear(X7);
+ clear(X8);
+ clear(X9);
+ clear(X10);
+ clear(X11);
+ clear(X12);
+ clear(X13);
+ clear(X14);
+ clear(X15);
+
+ /* eax zeroed by round loop. */
+ leave;
+ CFI_LEAVE();
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_chacha20_amd64_ssse3_blocks4,
+ .-_gcry_chacha20_amd64_ssse3_blocks4;)
+
+/**********************************************************************
+ 2-way && 1-way chacha20
+ **********************************************************************/
+
+#define ROTATE_SHUF(v1,shuf) \
+ pshufb shuf, v1;
+
+#define ROTATE(v1,c,tmp1) \
+ movdqa v1, tmp1; \
+ psrld $(32 - (c)), v1; \
+ pslld $(c), tmp1; \
+ paddb tmp1, v1;
+
+#define WORD_SHUF(v1,shuf) \
+ pshufd $shuf, v1, v1;
+
+#define QUARTERROUND4(x0,x1,x2,x3,shuf_rol8,shuf_rol16,tmp1,shuf_x1,\
+ shuf_x2,shuf_x3) \
+ PLUS(x0, x1); XOR(x3, x0); ROTATE_SHUF(x3, shuf_rol16); \
+ PLUS(x2, x3); XOR(x1, x2); ROTATE(x1, 12, tmp1); \
+ PLUS(x0, x1); XOR(x3, x0); ROTATE_SHUF(x3, shuf_rol8); \
+ PLUS(x2, x3); \
+ WORD_SHUF(x3, shuf_x3); \
+ XOR(x1, x2); \
+ WORD_SHUF(x2, shuf_x2); \
+ ROTATE(x1, 7, tmp1); \
+ WORD_SHUF(x1, shuf_x1);
+
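+/* In the 1/2-way code the whole state lives in four registers, one row
+ * each, so this QUARTERROUND4 works on full rows; the pshufd word rotations
+ * (0x39, 0x4e, 0x93) move the rows between column and diagonal orientation
+ * instead of switching register operands. */
+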
+.align 8
+.globl _gcry_chacha20_amd64_ssse3_blocks1
+ELF(.type _gcry_chacha20_amd64_ssse3_blocks1,@function;)
+
+_gcry_chacha20_amd64_ssse3_blocks1:
+ /* input:
+ * %rdi: input
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: nblks
+ */
+ CFI_STARTPROC();
+
+ /* Load constants */
+ movdqa .Lcounter1 rRIP, X4;
+ movdqa .Lshuf_rol8 rRIP, X5;
+ movdqa .Lshuf_rol16 rRIP, X6;
+
+ /* Load state */
+ movdqu (0 * 4)(INPUT), X10;
+ movdqu (4 * 4)(INPUT), X11;
+ movdqu (8 * 4)(INPUT), X12;
+ movdqu (12 * 4)(INPUT), X13;
+
+ cmp $2, NBLKS;
+ jb .Loop1;
+
+ mov $20, ROUND;
+
+ movdqa X10, X0;
+ movdqa X11, X1;
+ movdqa X12, X2;
+ movdqa X13, X3;
+
+ movdqa X10, X8;
+ movdqa X11, X9;
+ movdqa X12, X14;
+ movdqa X13, X15;
+ paddq X4, X15;
+
+.Lround2_2:
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+ QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+ QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
+ sub $2, ROUND;
+ jnz .Lround2_2;
+
+ PLUS(X0, X10);
+ PLUS(X1, X11);
+ PLUS(X2, X12);
+ PLUS(X3, X13);
+
+ /* Update counter */
+ paddq X4, X13;
+
+ PLUS(X8, X10);
+ PLUS(X9, X11);
+ PLUS(X14, X12);
+ PLUS(X15, X13);
+
+ /* Update counter */
+ paddq X4, X13;
+
+ xor_src_dst(DST, SRC, 0 * 4, X0, X7);
+ xor_src_dst(DST, SRC, 4 * 4, X1, X7);
+ xor_src_dst(DST, SRC, 8 * 4, X2, X7);
+ xor_src_dst(DST, SRC, 12 * 4, X3, X7);
+ xor_src_dst(DST, SRC, 16 * 4, X8, X7);
+ xor_src_dst(DST, SRC, 20 * 4, X9, X7);
+ xor_src_dst(DST, SRC, 24 * 4, X14, X7);
+ xor_src_dst(DST, SRC, 28 * 4, X15, X7);
+
+ lea (2 * 64)(DST), DST;
+ lea (2 * 64)(SRC), SRC;
+
+ clear(X8);
+ clear(X9);
+ clear(X14);
+ clear(X15);
+
+ sub $2, NBLKS;
+ jz .Ldone1;
+
+.Loop1:
+ mov $20, ROUND;
+
+ movdqa X10, X0;
+ movdqa X11, X1;
+ movdqa X12, X2;
+ movdqa X13, X3;
+
+.Lround2_1:
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+ sub $2, ROUND;
+ jnz .Lround2_1;
+
+ PLUS(X0, X10);
+ PLUS(X1, X11);
+ PLUS(X2, X12);
+ PLUS(X3, X13);
+
+ /* Update counter */
+ paddq X4, X13;
+
+ xor_src_dst(DST, SRC, 0 * 4, X0, X7);
+ xor_src_dst(DST, SRC, 4 * 4, X1, X7);
+ xor_src_dst(DST, SRC, 8 * 4, X2, X7);
+ xor_src_dst(DST, SRC, 12 * 4, X3, X7);
+
+ lea (64)(DST), DST;
+ lea (64)(SRC), SRC;
+
+ sub $1, NBLKS;
+ jnz .Loop1;
+
+.Ldone1:
+ /* Store counter */
+ movdqu X13, (12 * 4)(INPUT);
+
+ /* clear the used vector registers */
+ clear(X0);
+ clear(X1);
+ clear(X2);
+ clear(X3);
+ clear(X4);
+ clear(X5);
+ clear(X6);
+ clear(X7);
+ clear(X10);
+ clear(X11);
+ clear(X12);
+ clear(X13);
+
+ /* eax zeroed by round loop. */
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_chacha20_amd64_ssse3_blocks1,
+ .-_gcry_chacha20_amd64_ssse3_blocks1;)
+
+/**********************************************************************
+ 4-way stitched chacha20-poly1305
+ **********************************************************************/
+
+#define _ /*_*/
+
+.align 8
+.globl _gcry_chacha20_poly1305_amd64_ssse3_blocks4
+ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks4,@function;)
+
+_gcry_chacha20_poly1305_amd64_ssse3_blocks4:
+ /* input:
+ * %rdi: input
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: nblks (multiple of 4)
+ * %r9: poly1305-state
+ * %r8: poly1305-src
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ subq $(9 * 8) + STACK_MAX + 16, %rsp;
+ andq $~15, %rsp;
+
+ movq %rbx, (STACK_MAX + 0 * 8)(%rsp);
+ movq %r12, (STACK_MAX + 1 * 8)(%rsp);
+ movq %r13, (STACK_MAX + 2 * 8)(%rsp);
+ movq %r14, (STACK_MAX + 3 * 8)(%rsp);
+ movq %r15, (STACK_MAX + 4 * 8)(%rsp);
+ CFI_REG_ON_STACK(rbx, STACK_MAX + 0 * 8);
+ CFI_REG_ON_STACK(r12, STACK_MAX + 1 * 8);
+ CFI_REG_ON_STACK(r13, STACK_MAX + 2 * 8);
+ CFI_REG_ON_STACK(r14, STACK_MAX + 3 * 8);
+ CFI_REG_ON_STACK(r15, STACK_MAX + 4 * 8);
+
+ movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC
+ movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST
+ movq %rcx, (STACK_MAX + 7 * 8)(%rsp); # NBLKS
+
+ /* Load state */
+ POLY1305_LOAD_STATE();
+
+.Loop_poly4:
+
+ /* Construct counter vectors X12 and X13 */
+ movdqa .Linc_counter rRIP, X0;
+ movdqa .Lunsigned_cmp rRIP, X2;
+ pbroadcastd((12 * 4)(INPUT), X12);
+ pbroadcastd((13 * 4)(INPUT), X13);
+ paddd X0, X12;
+ movdqa X12, X1;
+ pxor X2, X0;
+ pxor X2, X1;
+ pcmpgtd X1, X0;
+ psubd X0, X13;
+ movdqa X12, (STACK_VEC_X12)(%rsp);
+ movdqa X13, (STACK_VEC_X13)(%rsp);
+
+ /* Load vectors */
+ pbroadcastd((0 * 4)(INPUT), X0);
+ pbroadcastd((1 * 4)(INPUT), X1);
+ pbroadcastd((2 * 4)(INPUT), X2);
+ pbroadcastd((3 * 4)(INPUT), X3);
+ pbroadcastd((4 * 4)(INPUT), X4);
+ pbroadcastd((5 * 4)(INPUT), X5);
+ pbroadcastd((6 * 4)(INPUT), X6);
+ pbroadcastd((7 * 4)(INPUT), X7);
+ pbroadcastd((8 * 4)(INPUT), X8);
+ pbroadcastd((9 * 4)(INPUT), X9);
+ pbroadcastd((10 * 4)(INPUT), X10);
+ pbroadcastd((11 * 4)(INPUT), X11);
+ pbroadcastd((14 * 4)(INPUT), X14);
+ pbroadcastd((15 * 4)(INPUT), X15);
+ movdqa X11, (STACK_TMP)(%rsp);
+ movdqa X15, (STACK_TMP1)(%rsp);
+
+ /* Process four ChaCha20 blocks and sixteen Poly1305 blocks. */
+
+ movl $20, (STACK_MAX + 8 * 8 + 4)(%rsp);
+.Lround4_with_poly1305_outer:
+ movl $6, (STACK_MAX + 8 * 8)(%rsp);
+.Lround4_with_poly1305_inner1:
+ /* rounds 0-5 & 10-15 */
+ POLY1305_BLOCK_PART1(0 * 16)
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3())
+ movdqa (STACK_TMP)(%rsp), X11;
+ movdqa (STACK_TMP1)(%rsp), X15;
+ movdqa X8, (STACK_TMP)(%rsp);
+ movdqa X9, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+ POLY1305_BLOCK_PART1(1 * 16)
+ lea (2 * 16)(POLY_RSRC), POLY_RSRC;
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3())
+ movdqa (STACK_TMP)(%rsp), X8;
+ movdqa (STACK_TMP1)(%rsp), X9;
+ movdqa X11, (STACK_TMP)(%rsp);
+ movdqa X15, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+
+ subl $2, (STACK_MAX + 8 * 8)(%rsp);
+ jnz .Lround4_with_poly1305_inner1;
+
+ movl $4, (STACK_MAX + 8 * 8)(%rsp);
+.Lround4_with_poly1305_inner2:
+ /* rounds 6-9 & 16-19 */
+ POLY1305_BLOCK_PART1(0 * 16)
+ lea (1 * 16)(POLY_RSRC), POLY_RSRC;
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART2(),
+ _)
+ movdqa (STACK_TMP)(%rsp), X11;
+ movdqa (STACK_TMP1)(%rsp), X15;
+ movdqa X8, (STACK_TMP)(%rsp);
+ movdqa X9, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART3(),
+ _)
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART4(),
+ _)
+ movdqa (STACK_TMP)(%rsp), X8;
+ movdqa (STACK_TMP1)(%rsp), X9;
+ movdqa X11, (STACK_TMP)(%rsp);
+ movdqa X15, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART5(),
+ _)
+
+ subl $2, (STACK_MAX + 8 * 8)(%rsp);
+ jnz .Lround4_with_poly1305_inner2;
+
+ subl $10, (STACK_MAX + 8 * 8 + 4)(%rsp);
+ jnz .Lround4_with_poly1305_outer;
+
+ /* tmp := X15 */
+ movdqa (STACK_TMP)(%rsp), X11;
+ pbroadcastd((0 * 4)(INPUT), X15);
+ PLUS(X0, X15);
+ pbroadcastd((1 * 4)(INPUT), X15);
+ PLUS(X1, X15);
+ pbroadcastd((2 * 4)(INPUT), X15);
+ PLUS(X2, X15);
+ pbroadcastd((3 * 4)(INPUT), X15);
+ PLUS(X3, X15);
+ pbroadcastd((4 * 4)(INPUT), X15);
+ PLUS(X4, X15);
+ pbroadcastd((5 * 4)(INPUT), X15);
+ PLUS(X5, X15);
+ pbroadcastd((6 * 4)(INPUT), X15);
+ PLUS(X6, X15);
+ pbroadcastd((7 * 4)(INPUT), X15);
+ PLUS(X7, X15);
+ pbroadcastd((8 * 4)(INPUT), X15);
+ PLUS(X8, X15);
+ pbroadcastd((9 * 4)(INPUT), X15);
+ PLUS(X9, X15);
+ pbroadcastd((10 * 4)(INPUT), X15);
+ PLUS(X10, X15);
+ pbroadcastd((11 * 4)(INPUT), X15);
+ PLUS(X11, X15);
+ movdqa (STACK_VEC_X12)(%rsp), X15;
+ PLUS(X12, X15);
+ movdqa (STACK_VEC_X13)(%rsp), X15;
+ PLUS(X13, X15);
+ movdqa X13, (STACK_TMP)(%rsp);
+ pbroadcastd((14 * 4)(INPUT), X15);
+ PLUS(X14, X15);
+ movdqa (STACK_TMP1)(%rsp), X15;
+ movdqa X14, (STACK_TMP1)(%rsp);
+ pbroadcastd((15 * 4)(INPUT), X13);
+ PLUS(X15, X13);
+ movdqa X15, (STACK_TMP2)(%rsp);
+
+ /* Update counter */
+ addq $4, (12 * 4)(INPUT);
+
+ movq (STACK_MAX + 5 * 8)(%rsp), SRC;
+ movq (STACK_MAX + 6 * 8)(%rsp), DST;
+
+ transpose_4x4(X0, X1, X2, X3, X13, X14, X15);
+ xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0, X15);
+ xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1, X15);
+ xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2, X15);
+ xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3, X15);
+ transpose_4x4(X4, X5, X6, X7, X0, X1, X2);
+ movdqa (STACK_TMP)(%rsp), X13;
+ movdqa (STACK_TMP1)(%rsp), X14;
+ movdqa (STACK_TMP2)(%rsp), X15;
+ xor_src_dst(DST, SRC, (64 * 0 + 16 * 1), X4, X0);
+ xor_src_dst(DST, SRC, (64 * 1 + 16 * 1), X5, X0);
+ xor_src_dst(DST, SRC, (64 * 2 + 16 * 1), X6, X0);
+ xor_src_dst(DST, SRC, (64 * 3 + 16 * 1), X7, X0);
+ transpose_4x4(X8, X9, X10, X11, X0, X1, X2);
+ xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8, X0);
+ xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9, X0);
+ xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10, X0);
+ xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11, X0);
+ transpose_4x4(X12, X13, X14, X15, X0, X1, X2);
+ xor_src_dst(DST, SRC, (64 * 0 + 16 * 3), X12, X0);
+ xor_src_dst(DST, SRC, (64 * 1 + 16 * 3), X13, X0);
+ xor_src_dst(DST, SRC, (64 * 2 + 16 * 3), X14, X0);
+ xor_src_dst(DST, SRC, (64 * 3 + 16 * 3), X15, X0);
+
+ subq $4, (STACK_MAX + 7 * 8)(%rsp); # NBLKS
+
+ lea (4 * 64)(DST), DST;
+ lea (4 * 64)(SRC), SRC;
+ movq SRC, (STACK_MAX + 5 * 8)(%rsp);
+ movq DST, (STACK_MAX + 6 * 8)(%rsp);
+
+ jnz .Loop_poly4;
+
+ /* Store state */
+ POLY1305_STORE_STATE();
+
+ /* clear the used vector registers and stack */
+ clear(X0);
+ movdqa X0, (STACK_VEC_X12)(%rsp);
+ movdqa X0, (STACK_VEC_X13)(%rsp);
+ movdqa X0, (STACK_TMP)(%rsp);
+ movdqa X0, (STACK_TMP1)(%rsp);
+ movdqa X0, (STACK_TMP2)(%rsp);
+ clear(X1);
+ clear(X2);
+ clear(X3);
+ clear(X4);
+ clear(X5);
+ clear(X6);
+ clear(X7);
+ clear(X8);
+ clear(X9);
+ clear(X10);
+ clear(X11);
+ clear(X12);
+ clear(X13);
+ clear(X14);
+ clear(X15);
+
+ movq (STACK_MAX + 0 * 8)(%rsp), %rbx;
+ movq (STACK_MAX + 1 * 8)(%rsp), %r12;
+ movq (STACK_MAX + 2 * 8)(%rsp), %r13;
+ movq (STACK_MAX + 3 * 8)(%rsp), %r14;
+ movq (STACK_MAX + 4 * 8)(%rsp), %r15;
+ CFI_RESTORE(%rbx);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+ CFI_RESTORE(%r14);
+ CFI_RESTORE(%r15);
+
+ xorl %eax, %eax;
+ leave;
+ CFI_LEAVE();
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks4,
+ .-_gcry_chacha20_poly1305_amd64_ssse3_blocks4;)
+
+/**********************************************************************
+ 2-way && 1-way stitched chacha20-poly1305
+ **********************************************************************/
+
+.align 8
+.globl _gcry_chacha20_poly1305_amd64_ssse3_blocks1
+ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks1,@function;)
+
+_gcry_chacha20_poly1305_amd64_ssse3_blocks1:
+ /* input:
+ * %rdi: chacha20-state
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: nblks
+ * %r9: poly1305-state
+ * %r8: poly1305-src
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ subq $(9 * 8), %rsp;
+ movq %rbx, (0 * 8)(%rsp);
+ movq %r12, (1 * 8)(%rsp);
+ movq %r13, (2 * 8)(%rsp);
+ movq %r14, (3 * 8)(%rsp);
+ movq %r15, (4 * 8)(%rsp);
+ CFI_REG_ON_STACK(rbx, 0 * 8);
+ CFI_REG_ON_STACK(r12, 1 * 8);
+ CFI_REG_ON_STACK(r13, 2 * 8);
+ CFI_REG_ON_STACK(r14, 3 * 8);
+ CFI_REG_ON_STACK(r15, 4 * 8);
+
+ movq %rdx, (5 * 8)(%rsp); # SRC
+ movq %rsi, (6 * 8)(%rsp); # DST
+ movq %rcx, (7 * 8)(%rsp); # NBLKS
+
+ /* Load constants */
+ movdqa .Lcounter1 rRIP, X4;
+ movdqa .Lshuf_rol8 rRIP, X5;
+ movdqa .Lshuf_rol16 rRIP, X6;
+
+ /* Load state */
+ movdqu (0 * 4)(INPUT), X10;
+ movdqu (4 * 4)(INPUT), X11;
+ movdqu (8 * 4)(INPUT), X12;
+ movdqu (12 * 4)(INPUT), X13;
+
+ POLY1305_LOAD_STATE();
+
+ cmpq $2, (7 * 8)(%rsp); #NBLKS
+ jb .Loop_poly1;
+
+ movdqa X10, X0;
+ movdqa X11, X1;
+ movdqa X12, X2;
+ movdqa X13, X3;
+
+ movdqa X10, X8;
+ movdqa X11, X9;
+ movdqa X12, X14;
+ movdqa X13, X15;
+ paddq X4, X15;
+
+ /* Process two ChaCha20 blocks and eight Poly1305 blocks. */
+
+ movl $20, (8 * 8 + 4)(%rsp);
+.Lround2_with_poly1305_outer:
+ movl $8, (8 * 8)(%rsp);
+.Lround2_with_poly1305_inner:
+ POLY1305_BLOCK_PART1(0 * 16);
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+ lea (1 * 16)(POLY_RSRC), POLY_RSRC;
+ POLY1305_BLOCK_PART2();
+ QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
+ POLY1305_BLOCK_PART3();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+ POLY1305_BLOCK_PART4();
+ QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
+ POLY1305_BLOCK_PART5();
+
+ subl $2, (8 * 8)(%rsp);
+ jnz .Lround2_with_poly1305_inner;
+
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+ QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+ QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+ subl $10, (8 * 8 + 4)(%rsp);
+ jnz .Lround2_with_poly1305_outer;
+
+ movq (5 * 8)(%rsp), SRC;
+ movq (6 * 8)(%rsp), DST;
+
+ PLUS(X0, X10);
+ PLUS(X1, X11);
+ PLUS(X2, X12);
+ PLUS(X3, X13);
+
+ /* Update counter */
+ paddq X4, X13;
+
+ PLUS(X8, X10);
+ PLUS(X9, X11);
+ PLUS(X14, X12);
+ PLUS(X15, X13);
+
+ /* Update counter */
+ paddq X4, X13;
+
+ xor_src_dst(DST, SRC, 0 * 4, X0, X7);
+ xor_src_dst(DST, SRC, 4 * 4, X1, X7);
+ xor_src_dst(DST, SRC, 8 * 4, X2, X7);
+ xor_src_dst(DST, SRC, 12 * 4, X3, X7);
+ xor_src_dst(DST, SRC, 16 * 4, X8, X7);
+ xor_src_dst(DST, SRC, 20 * 4, X9, X7);
+ xor_src_dst(DST, SRC, 24 * 4, X14, X7);
+ xor_src_dst(DST, SRC, 28 * 4, X15, X7);
+
+ clear(X8);
+ clear(X9);
+ clear(X14);
+ clear(X15);
+
+ subq $2, (7 * 8)(%rsp); # NBLKS
+ lea (2 * 64)(SRC), SRC;
+ lea (2 * 64)(DST), DST;
+ movq SRC, (5 * 8)(%rsp);
+ movq DST, (6 * 8)(%rsp);
+ jz .Ldone_poly1;
+
+.Loop_poly1:
+ movdqa X10, X0;
+ movdqa X11, X1;
+ movdqa X12, X2;
+ movdqa X13, X3;
+
+ /* Process one ChaCha20 block and four Poly1305 blocks. */
+
+ movl $20, (8 * 8 + 4)(%rsp);
+.Lround1_with_poly1305_outer:
+ movl $8, (8 * 8)(%rsp);
+.Lround1_with_poly1305_inner:
+ POLY1305_BLOCK_PART1(0 * 16);
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+ POLY1305_BLOCK_PART2();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+ lea (1 * 16)(POLY_RSRC), POLY_RSRC;
+
+ POLY1305_BLOCK_PART3();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+ POLY1305_BLOCK_PART4();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+ POLY1305_BLOCK_PART5();
+
+ subl $4, (8 * 8)(%rsp);
+ jnz .Lround1_with_poly1305_inner;
+
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+ subl $10, (8 * 8 + 4)(%rsp);
+ jnz .Lround1_with_poly1305_outer;
+
+ movq (5 * 8)(%rsp), SRC;
+ movq (6 * 8)(%rsp), DST;
+
+ PLUS(X0, X10);
+ PLUS(X1, X11);
+ PLUS(X2, X12);
+ PLUS(X3, X13);
+
+ /* Update counter */
+ paddq X4, X13;
+
+ xor_src_dst(DST, SRC, 0 * 4, X0, X7);
+ xor_src_dst(DST, SRC, 4 * 4, X1, X7);
+ xor_src_dst(DST, SRC, 8 * 4, X2, X7);
+ xor_src_dst(DST, SRC, 12 * 4, X3, X7);
+
+ subq $1, (7 * 8)(%rsp); # NBLKS
+ lea (64)(SRC), SRC;
+ lea (64)(DST), DST;
+ movq SRC, (5 * 8)(%rsp);
+ movq DST, (6 * 8)(%rsp);
+
+ jnz .Loop_poly1;
+
+.Ldone_poly1:
+ /* Store state */
+ POLY1305_STORE_STATE();
+
+ movdqu X13, (12 * 4)(INPUT);
+
+ /* clear the used vector registers */
+ clear(X0);
+ clear(X1);
+ clear(X2);
+ clear(X3);
+ clear(X4);
+ clear(X5);
+ clear(X6);
+ clear(X7);
+ clear(X10);
+ clear(X11);
+ clear(X12);
+ clear(X13);
+
+ movq (0 * 8)(%rsp), %rbx;
+ movq (1 * 8)(%rsp), %r12;
+ movq (2 * 8)(%rsp), %r13;
+ movq (3 * 8)(%rsp), %r14;
+ movq (4 * 8)(%rsp), %r15;
+ CFI_RESTORE(%rbx);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+ CFI_RESTORE(%r14);
+ CFI_RESTORE(%r15);
+
+ xorl %eax, %eax;
+ leave;
+ CFI_LEAVE();
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks1,
+ .-_gcry_chacha20_poly1305_amd64_ssse3_blocks1;)
+
+#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
+#endif /*__x86_64*/
diff --git a/comm/third_party/libgcrypt/cipher/chacha20-armv7-neon.S b/comm/third_party/libgcrypt/cipher/chacha20-armv7-neon.S
new file mode 100644
index 0000000000..33a43df1f3
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/chacha20-armv7-neon.S
@@ -0,0 +1,393 @@
+/* chacha20-armv7-neon.S - ARMv7 NEON implementation of ChaCha20 cipher
+ *
+ * Copyright (C) 2017,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on D. J. Bernstein reference implementation at
+ * http://cr.yp.to/chacha.html:
+ *
+ * chacha-regs.c version 20080118
+ * D. J. Bernstein
+ * Public domain.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_NEON)
+
+.syntax unified
+.fpu neon
+.arm
+
+.text
+
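+/* Load the address of a local data symbol: through the GOT when building
+ * position-independent code, otherwise via a literal-pool load. */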
+#ifdef __PIC__
+# define GET_DATA_POINTER(reg, name, rtmp) \
+ ldr reg, 1f; \
+ ldr rtmp, 2f; \
+ b 3f; \
+ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+ 2: .word name(GOT); \
+ 3: add reg, pc, reg; \
+ ldr reg, [reg, rtmp];
+#else
+# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+/* register macros */
+#define INPUT r0
+#define DST r1
+#define SRC r2
+#define NBLKS r3
+#define ROUND r4
+
+/* stack structure */
+#define STACK_VEC_X12 (16)
+#define STACK_VEC_X13 (STACK_VEC_X12 + 16)
+#define STACK_TMP (STACK_VEC_X13 + 16)
+#define STACK_TMP1 (16 + STACK_TMP)
+#define STACK_TMP2 (16 + STACK_TMP1)
+
+#define STACK_MAX (16 + STACK_TMP2)
+
+/* vector registers */
+#define X0 q0
+#define X1 q1
+#define X2 q2
+#define X3 q3
+#define X4 q4
+#define X5 q5
+#define X6 q6
+#define X7 q7
+#define X8 q8
+#define X9 q9
+#define X10 q10
+#define X11 q11
+#define X12 q12
+#define X13 q13
+#define X14 q14
+#define X15 q15
+
+#define X0l d0
+#define X1l d2
+#define X2l d4
+#define X3l d6
+#define X4l d8
+#define X5l d10
+#define X6l d12
+#define X7l d14
+#define X8l d16
+#define X9l d18
+#define X10l d20
+#define X11l d22
+#define X12l d24
+#define X13l d26
+#define X14l d28
+#define X15l d30
+
+#define X0h d1
+#define X1h d3
+#define X2h d5
+#define X3h d7
+#define X4h d9
+#define X5h d11
+#define X6h d13
+#define X7h d15
+#define X8h d17
+#define X9h d19
+#define X10h d21
+#define X11h d23
+#define X12h d25
+#define X13h d27
+#define X14h d29
+#define X15h d31
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+
+/* 4x4 32-bit integer matrix transpose */
+#define transpose_4x4_part1(_q0, _q1, _q2, _q3) \
+ vtrn.32 _q0, _q1; \
+ vtrn.32 _q2, _q3;
+#define transpose_4x4_part2(_q0, _q1, _q2, _q3) \
+ vswp _q0##h, _q2##l; \
+ vswp _q1##h, _q3##l;
+
+#define clear(x) veor x,x,x;
+
+/**********************************************************************
+ 4-way chacha20
+ **********************************************************************/
+
+#define ROTATE2(dst1,dst2,c,src1,src2) \
+ vshl.u32 dst1, src1, #(c); \
+ vshl.u32 dst2, src2, #(c); \
+ vsri.u32 dst1, src1, #(32 - (c)); \
+ vsri.u32 dst2, src2, #(32 - (c));
+
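+/* Rotation by 16 of a 32-bit word is just a halfword swap, so vrev32.16
+ * does it in one instruction instead of the shift/insert pair used above. */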
+#define ROTATE2_16(dst1,dst2,src1,src2) \
+ vrev32.16 dst1, src1; \
+ vrev32.16 dst2, src2;
+
+#define XOR(d,s1,s2) \
+ veor d, s2, s1;
+
+#define PLUS(ds,s) \
+ vadd.u32 ds, ds, s;
+
+#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \
+ PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \
+ ROTATE2_16(d1, d2, tmp1, tmp2); \
+ PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \
+ ROTATE2(b1, b2, 12, tmp1, tmp2); \
+ PLUS(a1,b1); PLUS(a2,b2); XOR(tmp1,d1,a1); XOR(tmp2,d2,a2); \
+ ROTATE2(d1, d2, 8, tmp1, tmp2); \
+ PLUS(c1,d1); PLUS(c2,d2); XOR(tmp1,b1,c1); XOR(tmp2,b2,c2); \
+ ROTATE2(b1, b2, 7, tmp1, tmp2);
+
+chacha20_data:
+.align 4
+.Linc_counter:
+ .long 0,1,2,3
+
+.align 3
+.globl _gcry_chacha20_armv7_neon_blocks4
+.type _gcry_chacha20_armv7_neon_blocks4,%function;
+
+_gcry_chacha20_armv7_neon_blocks4:
+ /* input:
+ * r0: input
+ * r1: dst
+ * r2: src
+ * r3: nblks (multiple of 4)
+ */
+
+ vpush {q4-q7};
+ push {r4-r12,lr};
+
+ mov r12, sp
+
+ mov r6, sp;
+ sub r6, r6, #(STACK_MAX);
+ and r6, r6, #(~15);
+ mov sp, r6;
+ GET_DATA_POINTER(r9, .Linc_counter, lr);
+ add lr, INPUT, #(12*4);
+ add r8, sp, #STACK_VEC_X12;
+
+.Loop4:
+ mov ROUND, #20;
+
+ /* Construct counter vectors X12 and X13 */
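+ /* X12 holds the per-block low counter words (base + 0..3) and X13 the
+  * high words; vcgt.u32 flags the lanes where the 32-bit add wrapped so
+  * the carry is added into X13 by subtracting the all-ones mask. */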
+
+ vld1.8 {X15}, [lr];
+ mov lr, INPUT;
+ vld1.8 {X8}, [r9];
+ vdup.32 X12, X15l[0];
+ vdup.32 X13, X15l[1];
+ vld1.8 {X3}, [lr]!;
+ vadd.u32 X12, X12, X8;
+ vdup.32 X0, X3l[0];
+ vdup.32 X1, X3l[1];
+ vdup.32 X2, X3h[0];
+ vcgt.u32 X8, X8, X12;
+ vdup.32 X3, X3h[1];
+ vdup.32 X14, X15h[0];
+ vdup.32 X15, X15h[1];
+ vsub.u32 X13, X13, X8;
+ vld1.8 {X7}, [lr]!;
+ vld1.8 {X11}, [lr];
+ vst1.8 {X12, X13}, [r8];
+ vdup.32 X4, X7l[0];
+ vdup.32 X5, X7l[1];
+ vdup.32 X6, X7h[0];
+ vdup.32 X7, X7h[1];
+ vdup.32 X8, X11l[0];
+ vdup.32 X9, X11l[1];
+ vdup.32 X10, X11h[0];
+ vdup.32 X11, X11h[1];
+
+ add r7, sp, #STACK_TMP2;
+ add r6, sp, #STACK_TMP1;
+ add r5, sp, #STACK_TMP;
+ vst1.8 {X15}, [r6];
+ vst1.8 {X11}, [r5];
+
+ mov lr, INPUT;
+.Lround2:
+ subs ROUND, ROUND, #2
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15)
+ vld1.8 {X11}, [r5];
+ vld1.8 {X15}, [r6];
+ vst1.8 {X8}, [r5];
+ vst1.8 {X9}, [r6];
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9)
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9)
+ vld1.8 {X8}, [r5];
+ vld1.8 {X9}, [r6];
+ vst1.8 {X11}, [r5];
+ vst1.8 {X15}, [r6];
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15)
+ bne .Lround2;
+
+ vld1.8 {X11}, [lr]!;
+ vst1.8 {X14}, [r7];
+
+ vdup.32 X14, X11l[0]; /* INPUT + 0 * 4 */
+ vdup.32 X15, X11l[1]; /* INPUT + 1 * 4 */
+ PLUS(X0, X14);
+ PLUS(X1, X15);
+ vdup.32 X14, X11h[0]; /* INPUT + 2 * 4 */
+ vdup.32 X15, X11h[1]; /* INPUT + 3 * 4 */
+ PLUS(X2, X14);
+ PLUS(X3, X15);
+
+ vld1.8 {X11}, [r5];
+ vld1.8 {X15}, [r6];
+ vst1.8 {X0}, [r5];
+ vld1.8 {X0}, [lr]!;
+ vst1.8 {X1}, [r6];
+
+ vdup.32 X14, X0l[0]; /* INPUT + 4 * 4 */
+ vdup.32 X1, X0l[1]; /* INPUT + 5 * 4 */
+ PLUS(X4, X14);
+ PLUS(X5, X1);
+ vdup.32 X14, X0h[0]; /* INPUT + 6 * 4 */
+ vdup.32 X1, X0h[1]; /* INPUT + 7 * 4 */
+ PLUS(X6, X14);
+ PLUS(X7, X1);
+
+ vld1.8 {X0}, [lr]!;
+
+ vdup.32 X14, X0l[0]; /* INPUT + 8 * 4 */
+ vdup.32 X1, X0l[1]; /* INPUT + 9 * 4 */
+ PLUS(X8, X14);
+ PLUS(X9, X1);
+ vdup.32 X14, X0h[0]; /* INPUT + 10 * 4 */
+ vdup.32 X1, X0h[1]; /* INPUT + 11 * 4 */
+ PLUS(X10, X14);
+ PLUS(X11, X1);
+
+ vld1.8 {X0}, [lr];
+ add lr, INPUT, #(12*4)
+ vld1.8 {X14}, [r7];
+
+ vdup.32 X1, X0h[0]; /* INPUT + 10 * 4 */
+ ldm lr, {r10, r11}; /* Update counter */
+ vdup.32 X0, X0h[1]; /* INPUT + 11 * 4 */
+ PLUS(X14, X1);
+ PLUS(X15, X0);
+ adds r10, r10, #4; /* Update counter */
+ vld1.8 {X0, X1}, [r8];
+
+ PLUS(X12, X0);
+ vld1.8 {X0}, [r5];
+ PLUS(X13, X1);
+ adc r11, r11, #0; /* Update counter */
+
+ vld1.8 {X1}, [r6];
+ stm lr, {r10, r11}; /* Update counter */
+ transpose_4x4_part1(X0, X1, X2, X3);
+ transpose_4x4_part1(X4, X5, X6, X7);
+ transpose_4x4_part1(X8, X9, X10, X11);
+ transpose_4x4_part1(X12, X13, X14, X15);
+ transpose_4x4_part2(X0, X1, X2, X3);
+ transpose_4x4_part2(X4, X5, X6, X7);
+ transpose_4x4_part2(X8, X9, X10, X11);
+ transpose_4x4_part2(X12, X13, X14, X15);
+
+ subs NBLKS, NBLKS, #4;
+
+ vst1.8 {X10}, [r5];
+ add lr, INPUT, #(12*4)
+ vst1.8 {X11}, [r6];
+ vld1.8 {X10, X11}, [SRC]!;
+ veor X10, X0, X10;
+ vld1.8 {X0}, [SRC]!;
+ veor X11, X4, X11;
+ vld1.8 {X4}, [SRC]!;
+ vst1.8 {X10, X11}, [DST]!;
+ vld1.8 {X10, X11}, [SRC]!;
+ veor X0, X8, X0;
+ veor X4, X12, X4;
+ veor X10, X1, X10;
+ veor X11, X5, X11;
+ vst1.8 {X0}, [DST]!;
+ vld1.8 {X0, X1}, [SRC]!;
+ vst1.8 {X4}, [DST]!;
+ vld1.8 {X4, X5}, [SRC]!;
+ vst1.8 {X10, X11}, [DST]!;
+ vld1.8 {X10}, [r5];
+ vld1.8 {X11}, [r6];
+ veor X0, X9, X0;
+ vld1.8 {X8, X9}, [SRC]!;
+ veor X1, X13, X1;
+ vld1.8 {X12, X13}, [SRC]!;
+ veor X4, X2, X4;
+ veor X5, X6, X5;
+ vst1.8 {X0, X1}, [DST]!;
+ vld1.8 {X0, X1}, [SRC]!;
+ vst1.8 {X4, X5}, [DST]!;
+ veor X8, X10, X8;
+ veor X9, X14, X9;
+ veor X12, X3, X12;
+ veor X13, X7, X13;
+ veor X0, X11, X0;
+ veor X1, X15, X1;
+ vst1.8 {X8, X9}, [DST]!;
+ vst1.8 {X12, X13}, [DST]!;
+ vst1.8 {X0, X1}, [DST]!;
+
+ bne .Loop4;
+
+ /* clear the used vector registers and stack */
+ clear(X0);
+ vst1.8 {X0}, [r5];
+ vst1.8 {X0}, [r6];
+ vst1.8 {X0}, [r7];
+ vst1.8 {X0}, [r8]!;
+ vst1.8 {X0}, [r8];
+
+ mov sp, r12
+ clear(X1);
+ clear(X2);
+ clear(X3);
+ clear(X4);
+ clear(X5);
+ clear(X6);
+ clear(X7);
+ clear(X8);
+ clear(X9);
+ clear(X10);
+ clear(X11);
+ clear(X12);
+ clear(X13);
+ clear(X14);
+ clear(X15);
+
+ pop {r4-r12,lr}
+ vpop {q4-q7}
+ eor r0, r0, r0
+ bx lr
+.size _gcry_chacha20_armv7_neon_blocks4, .-_gcry_chacha20_armv7_neon_blocks4;
+
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/chacha20-ppc.c b/comm/third_party/libgcrypt/cipher/chacha20-ppc.c
new file mode 100644
index 0000000000..4a21b837d1
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/chacha20-ppc.c
@@ -0,0 +1,646 @@
+/* chacha20-ppc.c - PowerPC vector implementation of ChaCha20
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(ENABLE_PPC_CRYPTO_SUPPORT) && \
+ defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+ defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \
+ defined(USE_CHACHA20) && \
+ __GNUC__ >= 4
+
+#include <altivec.h>
+#include "bufhelp.h"
+#include "poly1305-internal.h"
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+
+typedef vector unsigned char vector16x_u8;
+typedef vector unsigned int vector4x_u32;
+typedef vector unsigned long long vector2x_u64;
+
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE
+#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
+
+
+#ifdef WORDS_BIGENDIAN
+static const vector16x_u8 le_bswap_const =
+ { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+#endif
+
+
+static ASM_FUNC_ATTR_INLINE vector4x_u32
+vec_rol_elems(vector4x_u32 v, unsigned int idx)
+{
+#ifndef WORDS_BIGENDIAN
+ return vec_sld (v, v, (16 - (4 * idx)) & 15);
+#else
+ return vec_sld (v, v, (4 * idx) & 15);
+#endif
+}
+
+
+static ASM_FUNC_ATTR_INLINE vector4x_u32
+vec_load_le(unsigned long offset, const unsigned char *ptr)
+{
+ vector4x_u32 vec;
+ vec = vec_vsx_ld (offset, (const u32 *)ptr);
+#ifdef WORDS_BIGENDIAN
+ vec = (vector4x_u32)vec_perm((vector16x_u8)vec, (vector16x_u8)vec,
+ le_bswap_const);
+#endif
+ return vec;
+}
+
+
+static ASM_FUNC_ATTR_INLINE void
+vec_store_le(vector4x_u32 vec, unsigned long offset, unsigned char *ptr)
+{
+#ifdef WORDS_BIGENDIAN
+ vec = (vector4x_u32)vec_perm((vector16x_u8)vec, (vector16x_u8)vec,
+ le_bswap_const);
+#endif
+ vec_vsx_st (vec, offset, (u32 *)ptr);
+}
+
+
+static ASM_FUNC_ATTR_INLINE vector4x_u32
+vec_add_ctr_u64(vector4x_u32 v, vector4x_u32 a)
+{
+#ifdef WORDS_BIGENDIAN
+ static const vector16x_u8 swap32 =
+ { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 };
+ vector2x_u64 vec, add, sum;
+
+ vec = (vector2x_u64)vec_perm((vector16x_u8)v, (vector16x_u8)v, swap32);
+ add = (vector2x_u64)vec_perm((vector16x_u8)a, (vector16x_u8)a, swap32);
+ sum = vec + add;
+ return (vector4x_u32)vec_perm((vector16x_u8)sum, (vector16x_u8)sum, swap32);
+#else
+ return (vector4x_u32)((vector2x_u64)(v) + (vector2x_u64)(a));
+#endif
+}
+
+
+/**********************************************************************
+ 2-way && 1-way chacha20
+ **********************************************************************/
+
+#define ROTATE(v1,rolv) \
+ __asm__ ("vrlw %0,%1,%2\n\t" : "=v" (v1) : "v" (v1), "v" (rolv))
+
+#define WORD_ROL(v1,c) \
+ ((v1) = vec_rol_elems((v1), (c)))
+
+#define XOR(ds,s) \
+ ((ds) ^= (s))
+
+#define PLUS(ds,s) \
+ ((ds) += (s))
+
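+/* In this 1-way/2-way code each vector register holds one row of the
+ * ChaCha20 state.  The WORD_ROL element rotations at the end of the
+ * quarter-round move the state between column and diagonal form: a call
+ * with (1, 2, 3) leaves the rows diagonalised and the following call with
+ * (3, 2, 1) restores the original order. */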
+#define QUARTERROUND4(x0,x1,x2,x3,rol_x1,rol_x2,rol_x3) \
+ PLUS(x0, x1); XOR(x3, x0); ROTATE(x3, rotate_16); \
+ PLUS(x2, x3); XOR(x1, x2); ROTATE(x1, rotate_12); \
+ PLUS(x0, x1); XOR(x3, x0); ROTATE(x3, rotate_8); \
+ PLUS(x2, x3); \
+ WORD_ROL(x3, rol_x3); \
+ XOR(x1, x2); \
+ WORD_ROL(x2, rol_x2); \
+ ROTATE(x1, rotate_7); \
+ WORD_ROL(x1, rol_x1);
+
+#define ADD_U64(v,a) \
+ (v = vec_add_ctr_u64(v, a))
+
+unsigned int ASM_FUNC_ATTR
+_gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src,
+ size_t nblks)
+{
+ vector4x_u32 counter_1 = { 1, 0, 0, 0 };
+ vector4x_u32 rotate_16 = { 16, 16, 16, 16 };
+ vector4x_u32 rotate_12 = { 12, 12, 12, 12 };
+ vector4x_u32 rotate_8 = { 8, 8, 8, 8 };
+ vector4x_u32 rotate_7 = { 7, 7, 7, 7 };
+ vector4x_u32 state0, state1, state2, state3;
+ vector4x_u32 v0, v1, v2, v3;
+ vector4x_u32 v4, v5, v6, v7;
+ int i;
+
+ /* force preload of constants to vector registers */
+ __asm__ ("": "+v" (counter_1) :: "memory");
+ __asm__ ("": "+v" (rotate_16) :: "memory");
+ __asm__ ("": "+v" (rotate_12) :: "memory");
+ __asm__ ("": "+v" (rotate_8) :: "memory");
+ __asm__ ("": "+v" (rotate_7) :: "memory");
+
+ state0 = vec_vsx_ld(0 * 16, state);
+ state1 = vec_vsx_ld(1 * 16, state);
+ state2 = vec_vsx_ld(2 * 16, state);
+ state3 = vec_vsx_ld(3 * 16, state);
+
+ while (nblks >= 2)
+ {
+ v0 = state0;
+ v1 = state1;
+ v2 = state2;
+ v3 = state3;
+
+ v4 = state0;
+ v5 = state1;
+ v6 = state2;
+ v7 = state3;
+ ADD_U64(v7, counter_1);
+
+ for (i = 20; i > 0; i -= 2)
+ {
+ QUARTERROUND4(v0, v1, v2, v3, 1, 2, 3);
+ QUARTERROUND4(v4, v5, v6, v7, 1, 2, 3);
+ QUARTERROUND4(v0, v1, v2, v3, 3, 2, 1);
+ QUARTERROUND4(v4, v5, v6, v7, 3, 2, 1);
+ }
+
+ v0 += state0;
+ v1 += state1;
+ v2 += state2;
+ v3 += state3;
+ ADD_U64(state3, counter_1); /* update counter */
+ v4 += state0;
+ v5 += state1;
+ v6 += state2;
+ v7 += state3;
+ ADD_U64(state3, counter_1); /* update counter */
+
+ v0 ^= vec_load_le(0 * 16, src);
+ v1 ^= vec_load_le(1 * 16, src);
+ v2 ^= vec_load_le(2 * 16, src);
+ v3 ^= vec_load_le(3 * 16, src);
+ vec_store_le(v0, 0 * 16, dst);
+ vec_store_le(v1, 1 * 16, dst);
+ vec_store_le(v2, 2 * 16, dst);
+ vec_store_le(v3, 3 * 16, dst);
+ src += 64;
+ dst += 64;
+ v4 ^= vec_load_le(0 * 16, src);
+ v5 ^= vec_load_le(1 * 16, src);
+ v6 ^= vec_load_le(2 * 16, src);
+ v7 ^= vec_load_le(3 * 16, src);
+ vec_store_le(v4, 0 * 16, dst);
+ vec_store_le(v5, 1 * 16, dst);
+ vec_store_le(v6, 2 * 16, dst);
+ vec_store_le(v7, 3 * 16, dst);
+ src += 64;
+ dst += 64;
+
+ nblks -= 2;
+ }
+
+ while (nblks)
+ {
+ v0 = state0;
+ v1 = state1;
+ v2 = state2;
+ v3 = state3;
+
+ for (i = 20; i > 0; i -= 2)
+ {
+ QUARTERROUND4(v0, v1, v2, v3, 1, 2, 3);
+ QUARTERROUND4(v0, v1, v2, v3, 3, 2, 1);
+ }
+
+ v0 += state0;
+ v1 += state1;
+ v2 += state2;
+ v3 += state3;
+ ADD_U64(state3, counter_1); /* update counter */
+
+ v0 ^= vec_load_le(0 * 16, src);
+ v1 ^= vec_load_le(1 * 16, src);
+ v2 ^= vec_load_le(2 * 16, src);
+ v3 ^= vec_load_le(3 * 16, src);
+ vec_store_le(v0, 0 * 16, dst);
+ vec_store_le(v1, 1 * 16, dst);
+ vec_store_le(v2, 2 * 16, dst);
+ vec_store_le(v3, 3 * 16, dst);
+ src += 64;
+ dst += 64;
+
+ nblks--;
+ }
+
+ vec_vsx_st(state3, 3 * 16, state); /* store counter */
+
+ return 0;
+}
+
+
+/**********************************************************************
+ 4-way chacha20
+ **********************************************************************/
+
+/* 4x4 32-bit integer matrix transpose */
+#define transpose_4x4(x0, x1, x2, x3) ({ \
+ vector4x_u32 t1 = vec_mergeh(x0, x2); \
+ vector4x_u32 t2 = vec_mergel(x0, x2); \
+ vector4x_u32 t3 = vec_mergeh(x1, x3); \
+ x3 = vec_mergel(x1, x3); \
+ x0 = vec_mergeh(t1, t3); \
+ x1 = vec_mergel(t1, t3); \
+ x2 = vec_mergeh(t2, x3); \
+ x3 = vec_mergel(t2, x3); \
+ })
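+/* Used after the rounds of the 4-way code: going in, each vector holds the
+ * same state word for all four blocks; coming out, each vector holds four
+ * consecutive words of a single block, matching the 16-byte chunks that
+ * are XORed with src below. */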
+
+#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2) \
+ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
+ ROTATE(d1, rotate_16); ROTATE(d2, rotate_16); \
+ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
+ ROTATE(b1, rotate_12); ROTATE(b2, rotate_12); \
+ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
+ ROTATE(d1, rotate_8); ROTATE(d2, rotate_8); \
+ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
+ ROTATE(b1, rotate_7); ROTATE(b2, rotate_7);
+
+unsigned int ASM_FUNC_ATTR
+_gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
+ size_t nblks)
+{
+ vector4x_u32 counters_0123 = { 0, 1, 2, 3 };
+ vector4x_u32 counter_4 = { 4, 0, 0, 0 };
+ vector4x_u32 rotate_16 = { 16, 16, 16, 16 };
+ vector4x_u32 rotate_12 = { 12, 12, 12, 12 };
+ vector4x_u32 rotate_8 = { 8, 8, 8, 8 };
+ vector4x_u32 rotate_7 = { 7, 7, 7, 7 };
+ vector4x_u32 state0, state1, state2, state3;
+ vector4x_u32 v0, v1, v2, v3, v4, v5, v6, v7;
+ vector4x_u32 v8, v9, v10, v11, v12, v13, v14, v15;
+ vector4x_u32 tmp;
+ int i;
+
+ /* force preload of constants to vector registers */
+ __asm__ ("": "+v" (counters_0123) :: "memory");
+ __asm__ ("": "+v" (counter_4) :: "memory");
+ __asm__ ("": "+v" (rotate_16) :: "memory");
+ __asm__ ("": "+v" (rotate_12) :: "memory");
+ __asm__ ("": "+v" (rotate_8) :: "memory");
+ __asm__ ("": "+v" (rotate_7) :: "memory");
+
+ state0 = vec_vsx_ld(0 * 16, state);
+ state1 = vec_vsx_ld(1 * 16, state);
+ state2 = vec_vsx_ld(2 * 16, state);
+ state3 = vec_vsx_ld(3 * 16, state);
+
+ do
+ {
+ v0 = vec_splat(state0, 0);
+ v1 = vec_splat(state0, 1);
+ v2 = vec_splat(state0, 2);
+ v3 = vec_splat(state0, 3);
+ v4 = vec_splat(state1, 0);
+ v5 = vec_splat(state1, 1);
+ v6 = vec_splat(state1, 2);
+ v7 = vec_splat(state1, 3);
+ v8 = vec_splat(state2, 0);
+ v9 = vec_splat(state2, 1);
+ v10 = vec_splat(state2, 2);
+ v11 = vec_splat(state2, 3);
+ v12 = vec_splat(state3, 0);
+ v13 = vec_splat(state3, 1);
+ v14 = vec_splat(state3, 2);
+ v15 = vec_splat(state3, 3);
+
+ v12 += counters_0123;
+ v13 -= vec_cmplt(v12, counters_0123);
+
+ for (i = 20; i > 0; i -= 2)
+ {
+ QUARTERROUND2(v0, v4, v8, v12, v1, v5, v9, v13)
+ QUARTERROUND2(v2, v6, v10, v14, v3, v7, v11, v15)
+ QUARTERROUND2(v0, v5, v10, v15, v1, v6, v11, v12)
+ QUARTERROUND2(v2, v7, v8, v13, v3, v4, v9, v14)
+ }
+
+ v0 += vec_splat(state0, 0);
+ v1 += vec_splat(state0, 1);
+ v2 += vec_splat(state0, 2);
+ v3 += vec_splat(state0, 3);
+ v4 += vec_splat(state1, 0);
+ v5 += vec_splat(state1, 1);
+ v6 += vec_splat(state1, 2);
+ v7 += vec_splat(state1, 3);
+ v8 += vec_splat(state2, 0);
+ v9 += vec_splat(state2, 1);
+ v10 += vec_splat(state2, 2);
+ v11 += vec_splat(state2, 3);
+ tmp = vec_splat(state3, 0);
+ tmp += counters_0123;
+ v12 += tmp;
+ v13 += vec_splat(state3, 1) - vec_cmplt(tmp, counters_0123);
+ v14 += vec_splat(state3, 2);
+ v15 += vec_splat(state3, 3);
+ ADD_U64(state3, counter_4); /* update counter */
+
+ transpose_4x4(v0, v1, v2, v3);
+ transpose_4x4(v4, v5, v6, v7);
+ transpose_4x4(v8, v9, v10, v11);
+ transpose_4x4(v12, v13, v14, v15);
+
+ v0 ^= vec_load_le((64 * 0 + 16 * 0), src);
+ v1 ^= vec_load_le((64 * 1 + 16 * 0), src);
+ v2 ^= vec_load_le((64 * 2 + 16 * 0), src);
+ v3 ^= vec_load_le((64 * 3 + 16 * 0), src);
+
+ v4 ^= vec_load_le((64 * 0 + 16 * 1), src);
+ v5 ^= vec_load_le((64 * 1 + 16 * 1), src);
+ v6 ^= vec_load_le((64 * 2 + 16 * 1), src);
+ v7 ^= vec_load_le((64 * 3 + 16 * 1), src);
+
+ v8 ^= vec_load_le((64 * 0 + 16 * 2), src);
+ v9 ^= vec_load_le((64 * 1 + 16 * 2), src);
+ v10 ^= vec_load_le((64 * 2 + 16 * 2), src);
+ v11 ^= vec_load_le((64 * 3 + 16 * 2), src);
+
+ v12 ^= vec_load_le((64 * 0 + 16 * 3), src);
+ v13 ^= vec_load_le((64 * 1 + 16 * 3), src);
+ v14 ^= vec_load_le((64 * 2 + 16 * 3), src);
+ v15 ^= vec_load_le((64 * 3 + 16 * 3), src);
+
+ vec_store_le(v0, (64 * 0 + 16 * 0), dst);
+ vec_store_le(v1, (64 * 1 + 16 * 0), dst);
+ vec_store_le(v2, (64 * 2 + 16 * 0), dst);
+ vec_store_le(v3, (64 * 3 + 16 * 0), dst);
+
+ vec_store_le(v4, (64 * 0 + 16 * 1), dst);
+ vec_store_le(v5, (64 * 1 + 16 * 1), dst);
+ vec_store_le(v6, (64 * 2 + 16 * 1), dst);
+ vec_store_le(v7, (64 * 3 + 16 * 1), dst);
+
+ vec_store_le(v8, (64 * 0 + 16 * 2), dst);
+ vec_store_le(v9, (64 * 1 + 16 * 2), dst);
+ vec_store_le(v10, (64 * 2 + 16 * 2), dst);
+ vec_store_le(v11, (64 * 3 + 16 * 2), dst);
+
+ vec_store_le(v12, (64 * 0 + 16 * 3), dst);
+ vec_store_le(v13, (64 * 1 + 16 * 3), dst);
+ vec_store_le(v14, (64 * 2 + 16 * 3), dst);
+ vec_store_le(v15, (64 * 3 + 16 * 3), dst);
+
+ src += 4*64;
+ dst += 4*64;
+
+ nblks -= 4;
+ }
+ while (nblks);
+
+ vec_vsx_st(state3, 3 * 16, state); /* store counter */
+
+ return 0;
+}
+
+
+#if SIZEOF_UNSIGNED_LONG == 8
+
+/**********************************************************************
+ 4-way stitched chacha20-poly1305
+ **********************************************************************/
+
+#define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
+ __asm__ ("addc %0, %3, %0\n" \
+ "adde %1, %4, %1\n" \
+ "adde %2, %5, %2\n" \
+ : "+r" (A0), "+r" (A1), "+r" (A2) \
+ : "r" (B0), "r" (B1), "r" (B2) \
+ : "cc" )
+
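+/* h is kept as two 64-bit limbs plus a small top limb h2; products with
+ * weight 2^128 and above are folded back using 2^130 == 5 (mod 2^130-5).
+ * Poly1305 clamping zeroes the low two bits of r1, so r1_mult5 =
+ * (r1 >> 2) + r1 equals 5 * (r1 / 4) and implements that folding. */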
+#define MUL_MOD_1305_64_PART1(H2, H1, H0, R1, R0, R1_MULT5) do { \
+ /* x = a * r (partial mod 2^130-5) */ \
+ umul_ppmm(x0_hi, x0_lo, H0, R0); /* h0 * r0 */ \
+ umul_ppmm(x1_hi, x1_lo, H0, R1); /* h0 * r1 */ \
+ \
+ umul_ppmm(t0_hi, t0_lo, H1, R1_MULT5); /* h1 * r1 mod 2^130-5 */ \
+ } while (0)
+
+#define MUL_MOD_1305_64_PART2(H2, H1, H0, R1, R0, R1_MULT5) do { \
+ add_ssaaaa(x0_hi, x0_lo, x0_hi, x0_lo, t0_hi, t0_lo); \
+ umul_ppmm(t1_hi, t1_lo, H1, R0); /* h1 * r0 */ \
+ add_ssaaaa(x1_hi, x1_lo, x1_hi, x1_lo, t1_hi, t1_lo); \
+ \
+ t1_lo = H2 * R1_MULT5; /* h2 * r1 mod 2^130-5 */ \
+ t1_hi = H2 * R0; /* h2 * r0 */ \
+ add_ssaaaa(H0, H1, x1_hi, x1_lo, t1_hi, t1_lo); \
+ \
+ /* carry propagation */ \
+ H2 = H0 & 3; \
+ H0 = (H0 >> 2) * 5; /* msb mod 2^130-5 */ \
+ ADD_1305_64(H2, H1, H0, (u64)0, x0_hi, x0_lo); \
+ } while (0)
+
+#define POLY1305_BLOCK_PART1(in_pos) do { \
+ m0 = buf_get_le64(poly1305_src + (in_pos) + 0); \
+ m1 = buf_get_le64(poly1305_src + (in_pos) + 8); \
+ /* a = h + m */ \
+ ADD_1305_64(h2, h1, h0, m2, m1, m0); \
+ /* h = a * r (partial mod 2^130-5) */ \
+ MUL_MOD_1305_64_PART1(h2, h1, h0, r1, r0, r1_mult5); \
+ } while (0)
+
+#define POLY1305_BLOCK_PART2(in_pos) do { \
+ MUL_MOD_1305_64_PART2(h2, h1, h0, r1, r0, r1_mult5); \
+ } while (0)
+
+unsigned int ASM_FUNC_ATTR
+_gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
+ size_t nblks, POLY1305_STATE *st,
+ const byte *poly1305_src)
+{
+ vector4x_u32 counters_0123 = { 0, 1, 2, 3 };
+ vector4x_u32 counter_4 = { 4, 0, 0, 0 };
+ vector4x_u32 rotate_16 = { 16, 16, 16, 16 };
+ vector4x_u32 rotate_12 = { 12, 12, 12, 12 };
+ vector4x_u32 rotate_8 = { 8, 8, 8, 8 };
+ vector4x_u32 rotate_7 = { 7, 7, 7, 7 };
+ vector4x_u32 state0, state1, state2, state3;
+ vector4x_u32 v0, v1, v2, v3, v4, v5, v6, v7;
+ vector4x_u32 v8, v9, v10, v11, v12, v13, v14, v15;
+ vector4x_u32 tmp;
+ u64 r0, r1, r1_mult5;
+ u64 h0, h1, h2;
+ u64 m0, m1, m2;
+ u64 x0_lo, x0_hi, x1_lo, x1_hi;
+ u64 t0_lo, t0_hi, t1_lo, t1_hi;
+ unsigned int i, o;
+
+ /* load poly1305 state */
+ m2 = 1;
+ h0 = st->h[0] + ((u64)st->h[1] << 32);
+ h1 = st->h[2] + ((u64)st->h[3] << 32);
+ h2 = st->h[4];
+ r0 = st->r[0] + ((u64)st->r[1] << 32);
+ r1 = st->r[2] + ((u64)st->r[3] << 32);
+ r1_mult5 = (r1 >> 2) + r1;
+
+ /* force preload of constants to vector registers */
+ __asm__ ("": "+v" (counters_0123) :: "memory");
+ __asm__ ("": "+v" (counter_4) :: "memory");
+ __asm__ ("": "+v" (rotate_16) :: "memory");
+ __asm__ ("": "+v" (rotate_12) :: "memory");
+ __asm__ ("": "+v" (rotate_8) :: "memory");
+ __asm__ ("": "+v" (rotate_7) :: "memory");
+
+ state0 = vec_vsx_ld(0 * 16, state);
+ state1 = vec_vsx_ld(1 * 16, state);
+ state2 = vec_vsx_ld(2 * 16, state);
+ state3 = vec_vsx_ld(3 * 16, state);
+
+ do
+ {
+ v0 = vec_splat(state0, 0);
+ v1 = vec_splat(state0, 1);
+ v2 = vec_splat(state0, 2);
+ v3 = vec_splat(state0, 3);
+ v4 = vec_splat(state1, 0);
+ v5 = vec_splat(state1, 1);
+ v6 = vec_splat(state1, 2);
+ v7 = vec_splat(state1, 3);
+ v8 = vec_splat(state2, 0);
+ v9 = vec_splat(state2, 1);
+ v10 = vec_splat(state2, 2);
+ v11 = vec_splat(state2, 3);
+ v12 = vec_splat(state3, 0);
+ v13 = vec_splat(state3, 1);
+ v14 = vec_splat(state3, 2);
+ v15 = vec_splat(state3, 3);
+
+ v12 += counters_0123;
+ v13 -= vec_cmplt(v12, counters_0123);
+
+ for (o = 20; o; o -= 10)
+ {
+ for (i = 8; i; i -= 2)
+ {
+ POLY1305_BLOCK_PART1(0 * 16);
+ QUARTERROUND2(v0, v4, v8, v12, v1, v5, v9, v13)
+ POLY1305_BLOCK_PART2();
+ QUARTERROUND2(v2, v6, v10, v14, v3, v7, v11, v15)
+ POLY1305_BLOCK_PART1(1 * 16);
+ poly1305_src += 2 * 16;
+ QUARTERROUND2(v0, v5, v10, v15, v1, v6, v11, v12)
+ POLY1305_BLOCK_PART2();
+ QUARTERROUND2(v2, v7, v8, v13, v3, v4, v9, v14)
+ }
+
+ QUARTERROUND2(v0, v4, v8, v12, v1, v5, v9, v13)
+ QUARTERROUND2(v2, v6, v10, v14, v3, v7, v11, v15)
+ QUARTERROUND2(v0, v5, v10, v15, v1, v6, v11, v12)
+ QUARTERROUND2(v2, v7, v8, v13, v3, v4, v9, v14)
+ }
+
+ v0 += vec_splat(state0, 0);
+ v1 += vec_splat(state0, 1);
+ v2 += vec_splat(state0, 2);
+ v3 += vec_splat(state0, 3);
+ v4 += vec_splat(state1, 0);
+ v5 += vec_splat(state1, 1);
+ v6 += vec_splat(state1, 2);
+ v7 += vec_splat(state1, 3);
+ v8 += vec_splat(state2, 0);
+ v9 += vec_splat(state2, 1);
+ v10 += vec_splat(state2, 2);
+ v11 += vec_splat(state2, 3);
+ tmp = vec_splat(state3, 0);
+ tmp += counters_0123;
+ v12 += tmp;
+ v13 += vec_splat(state3, 1) - vec_cmplt(tmp, counters_0123);
+ v14 += vec_splat(state3, 2);
+ v15 += vec_splat(state3, 3);
+ ADD_U64(state3, counter_4); /* update counter */
+
+ transpose_4x4(v0, v1, v2, v3);
+ transpose_4x4(v4, v5, v6, v7);
+ transpose_4x4(v8, v9, v10, v11);
+ transpose_4x4(v12, v13, v14, v15);
+
+ v0 ^= vec_load_le((64 * 0 + 16 * 0), src);
+ v1 ^= vec_load_le((64 * 1 + 16 * 0), src);
+ v2 ^= vec_load_le((64 * 2 + 16 * 0), src);
+ v3 ^= vec_load_le((64 * 3 + 16 * 0), src);
+
+ v4 ^= vec_load_le((64 * 0 + 16 * 1), src);
+ v5 ^= vec_load_le((64 * 1 + 16 * 1), src);
+ v6 ^= vec_load_le((64 * 2 + 16 * 1), src);
+ v7 ^= vec_load_le((64 * 3 + 16 * 1), src);
+
+ v8 ^= vec_load_le((64 * 0 + 16 * 2), src);
+ v9 ^= vec_load_le((64 * 1 + 16 * 2), src);
+ v10 ^= vec_load_le((64 * 2 + 16 * 2), src);
+ v11 ^= vec_load_le((64 * 3 + 16 * 2), src);
+
+ v12 ^= vec_load_le((64 * 0 + 16 * 3), src);
+ v13 ^= vec_load_le((64 * 1 + 16 * 3), src);
+ v14 ^= vec_load_le((64 * 2 + 16 * 3), src);
+ v15 ^= vec_load_le((64 * 3 + 16 * 3), src);
+
+ vec_store_le(v0, (64 * 0 + 16 * 0), dst);
+ vec_store_le(v1, (64 * 1 + 16 * 0), dst);
+ vec_store_le(v2, (64 * 2 + 16 * 0), dst);
+ vec_store_le(v3, (64 * 3 + 16 * 0), dst);
+
+ vec_store_le(v4, (64 * 0 + 16 * 1), dst);
+ vec_store_le(v5, (64 * 1 + 16 * 1), dst);
+ vec_store_le(v6, (64 * 2 + 16 * 1), dst);
+ vec_store_le(v7, (64 * 3 + 16 * 1), dst);
+
+ vec_store_le(v8, (64 * 0 + 16 * 2), dst);
+ vec_store_le(v9, (64 * 1 + 16 * 2), dst);
+ vec_store_le(v10, (64 * 2 + 16 * 2), dst);
+ vec_store_le(v11, (64 * 3 + 16 * 2), dst);
+
+ vec_store_le(v12, (64 * 0 + 16 * 3), dst);
+ vec_store_le(v13, (64 * 1 + 16 * 3), dst);
+ vec_store_le(v14, (64 * 2 + 16 * 3), dst);
+ vec_store_le(v15, (64 * 3 + 16 * 3), dst);
+
+ src += 4*64;
+ dst += 4*64;
+
+ nblks -= 4;
+ }
+ while (nblks);
+
+ vec_vsx_st(state3, 3 * 16, state); /* store counter */
+
+ /* store poly1305 state */
+ st->h[0] = h0;
+ st->h[1] = h0 >> 32;
+ st->h[2] = h1;
+ st->h[3] = h1 >> 32;
+ st->h[4] = h2;
+
+ return 0;
+}
+
+#endif /* SIZEOF_UNSIGNED_LONG == 8 */
+
+#endif /* ENABLE_PPC_CRYPTO_SUPPORT */
diff --git a/comm/third_party/libgcrypt/cipher/chacha20-s390x.S b/comm/third_party/libgcrypt/cipher/chacha20-s390x.S
new file mode 100644
index 0000000000..9b1d59c6ad
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/chacha20-s390x.S
@@ -0,0 +1,1561 @@
+/* chacha20-s390x.S - zSeries implementation of ChaCha20 cipher
+ *
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_S390X_VX)
+
+#include "asm-common-s390x.h"
+#include "asm-poly1305-s390x.h"
+
+.machine "z13+vx"
+.text
+
+.balign 16
+.Lconsts:
+.Lwordswap:
+ .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
+.Lbswap128:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lbswap32:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.Lone:
+ .long 0, 0, 0, 1
+.Ladd_counter_0123:
+ .long 0, 1, 2, 3
+.Ladd_counter_4567:
+ .long 4, 5, 6, 7
+
+/* register macros */
+#define INPUT %r2
+#define DST %r3
+#define SRC %r4
+#define NBLKS %r0
+#define ROUND %r1
+
+/* stack structure */
+
+#define STACK_FRAME_STD (8 * 16 + 8 * 4)
+#define STACK_FRAME_F8_F15 (8 * 8)
+#define STACK_FRAME_Y0_Y15 (16 * 16)
+#define STACK_FRAME_CTR (4 * 16)
+#define STACK_FRAME_PARAMS (6 * 8)
+
+#define STACK_MAX (STACK_FRAME_STD + STACK_FRAME_F8_F15 + \
+ STACK_FRAME_Y0_Y15 + STACK_FRAME_CTR + \
+ STACK_FRAME_PARAMS)
+
+#define STACK_F8 (STACK_MAX - STACK_FRAME_F8_F15)
+#define STACK_F9 (STACK_F8 + 8)
+#define STACK_F10 (STACK_F9 + 8)
+#define STACK_F11 (STACK_F10 + 8)
+#define STACK_F12 (STACK_F11 + 8)
+#define STACK_F13 (STACK_F12 + 8)
+#define STACK_F14 (STACK_F13 + 8)
+#define STACK_F15 (STACK_F14 + 8)
+#define STACK_Y0_Y15 (STACK_F8 - STACK_FRAME_Y0_Y15)
+#define STACK_CTR (STACK_Y0_Y15 - STACK_FRAME_CTR)
+#define STACK_INPUT (STACK_CTR - STACK_FRAME_PARAMS)
+#define STACK_DST (STACK_INPUT + 8)
+#define STACK_SRC (STACK_DST + 8)
+#define STACK_NBLKS (STACK_SRC + 8)
+#define STACK_POCTX (STACK_NBLKS + 8)
+#define STACK_POSRC (STACK_POCTX + 8)
+
+#define STACK_G0_H3 STACK_Y0_Y15
+
+/* vector registers */
+#define A0 %v0
+#define A1 %v1
+#define A2 %v2
+#define A3 %v3
+
+#define B0 %v4
+#define B1 %v5
+#define B2 %v6
+#define B3 %v7
+
+#define C0 %v8
+#define C1 %v9
+#define C2 %v10
+#define C3 %v11
+
+#define D0 %v12
+#define D1 %v13
+#define D2 %v14
+#define D3 %v15
+
+#define E0 %v16
+#define E1 %v17
+#define E2 %v18
+#define E3 %v19
+
+#define F0 %v20
+#define F1 %v21
+#define F2 %v22
+#define F3 %v23
+
+#define G0 %v24
+#define G1 %v25
+#define G2 %v26
+#define G3 %v27
+
+#define H0 %v28
+#define H1 %v29
+#define H2 %v30
+#define H3 %v31
+
+#define IO0 E0
+#define IO1 E1
+#define IO2 E2
+#define IO3 E3
+#define IO4 F0
+#define IO5 F1
+#define IO6 F2
+#define IO7 F3
+
+#define S0 G0
+#define S1 G1
+#define S2 G2
+#define S3 G3
+
+#define TMP0 H0
+#define TMP1 H1
+#define TMP2 H2
+#define TMP3 H3
+
+#define X0 A0
+#define X1 A1
+#define X2 A2
+#define X3 A3
+#define X4 B0
+#define X5 B1
+#define X6 B2
+#define X7 B3
+#define X8 C0
+#define X9 C1
+#define X10 C2
+#define X11 C3
+#define X12 D0
+#define X13 D1
+#define X14 D2
+#define X15 D3
+
+#define Y0 E0
+#define Y1 E1
+#define Y2 E2
+#define Y3 E3
+#define Y4 F0
+#define Y5 F1
+#define Y6 F2
+#define Y7 F3
+#define Y8 G0
+#define Y9 G1
+#define Y10 G2
+#define Y11 G3
+#define Y12 H0
+#define Y13 H1
+#define Y14 H2
+#define Y15 H3
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+
+#define _ /*_*/
+
+#define CLEAR(x,...) vzero x;
+
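+/* START_STACK/END_STACK set up and tear down a 16-byte aligned stack frame
+ * below the caller's, saving %r6..last_r and the call-saved FPRs %f8-%f15;
+ * the caller's stack pointer is kept at 0(%r15) for END_STACK to restore. */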
+#define START_STACK(last_r) \
+ lgr %r0, %r15; \
+ lghi %r1, ~15; \
+ stmg %r6, last_r, 6 * 8(%r15); \
+ aghi %r0, -STACK_MAX; \
+ ngr %r0, %r1; \
+ lgr %r1, %r15; \
+ CFI_DEF_CFA_REGISTER(1); \
+ lgr %r15, %r0; \
+ stg %r1, 0(%r15); \
+ CFI_CFA_ON_STACK(0, 0); \
+ std %f8, STACK_F8(%r15); \
+ std %f9, STACK_F9(%r15); \
+ std %f10, STACK_F10(%r15); \
+ std %f11, STACK_F11(%r15); \
+ std %f12, STACK_F12(%r15); \
+ std %f13, STACK_F13(%r15); \
+ std %f14, STACK_F14(%r15); \
+ std %f15, STACK_F15(%r15);
+
+#define END_STACK(last_r) \
+ lg %r1, 0(%r15); \
+ ld %f8, STACK_F8(%r15); \
+ ld %f9, STACK_F9(%r15); \
+ ld %f10, STACK_F10(%r15); \
+ ld %f11, STACK_F11(%r15); \
+ ld %f12, STACK_F12(%r15); \
+ ld %f13, STACK_F13(%r15); \
+ ld %f14, STACK_F14(%r15); \
+ ld %f15, STACK_F15(%r15); \
+ lmg %r6, last_r, 6 * 8(%r1); \
+ lgr %r15, %r1; \
+ CFI_DEF_CFA_REGISTER(DW_REGNO_SP);
+
+#define PLUS(dst,src) \
+ vaf dst, dst, src;
+
+#define XOR(dst,src) \
+ vx dst, dst, src;
+
+#define ROTATE(v1,c) \
+ verllf v1, v1, (c)(0);
+
+#define WORD_ROTATE(v1,s) \
+ vsldb v1, v1, v1, ((s) * 4);
+
+#define DST_1(OPER, I, J) \
+ OPER(A##I, J);
+
+#define DST_2(OPER, I, J) \
+ OPER(A##I, J); OPER(B##I, J);
+
+#define DST_4(OPER, I, J) \
+ OPER(A##I, J); OPER(B##I, J); OPER(C##I, J); OPER(D##I, J);
+
+#define DST_8(OPER, I, J) \
+ OPER(A##I, J); OPER(B##I, J); OPER(C##I, J); OPER(D##I, J); \
+ OPER(E##I, J); OPER(F##I, J); OPER(G##I, J); OPER(H##I, J);
+
+#define DST_SRC_1(OPER, I, J) \
+ OPER(A##I, A##J);
+
+#define DST_SRC_2(OPER, I, J) \
+ OPER(A##I, A##J); OPER(B##I, B##J);
+
+#define DST_SRC_4(OPER, I, J) \
+ OPER(A##I, A##J); OPER(B##I, B##J); OPER(C##I, C##J); \
+ OPER(D##I, D##J);
+
+#define DST_SRC_8(OPER, I, J) \
+ OPER(A##I, A##J); OPER(B##I, B##J); OPER(C##I, C##J); \
+ OPER(D##I, D##J); OPER(E##I, E##J); OPER(F##I, F##J); \
+ OPER(G##I, G##J); OPER(H##I, H##J);
+
+/**********************************************************************
+ round macros
+ **********************************************************************/
+
+#define QUARTERROUND4_POLY(wrot_1,wrot_2,wrot_3,op1,op2) \
+ op1; DST_SRC_1(PLUS, 0, 1); DST_SRC_1(XOR, 3, 0); DST_1(ROTATE, 3, 16); \
+ DST_SRC_1(PLUS, 2, 3); DST_SRC_1(XOR, 1, 2); DST_1(ROTATE, 1, 12); \
+ DST_SRC_1(PLUS, 0, 1); DST_SRC_1(XOR, 3, 0); DST_1(ROTATE, 3, 8); \
+ op2; DST_SRC_1(PLUS, 2, 3); DST_SRC_1(XOR, 1, 2); DST_1(ROTATE, 1, 7); \
+ DST_1(WORD_ROTATE, 3, wrot_3); \
+ DST_1(WORD_ROTATE, 2, wrot_2); \
+ DST_1(WORD_ROTATE, 1, wrot_1);
+
+#define QUARTERROUND4(wrot_1,wrot_2,wrot_3) \
+ QUARTERROUND4_POLY(wrot_1,wrot_2,wrot_3,,)
+
+#define QUARTERROUND4_2_POLY(wrot_1,wrot_2,wrot_3,op1,op2,op3,op4) \
+ op1; DST_SRC_2(PLUS, 0, 1); DST_SRC_2(XOR, 3, 0); DST_2(ROTATE, 3, 16); \
+ DST_SRC_2(PLUS, 2, 3); op2; DST_SRC_2(XOR, 1, 2); DST_2(ROTATE, 1, 12); \
+ DST_SRC_2(PLUS, 0, 1); DST_SRC_2(XOR, 3, 0); op3; DST_2(ROTATE, 3, 8); \
+ DST_SRC_2(PLUS, 2, 3); DST_SRC_2(XOR, 1, 2); DST_2(ROTATE, 1, 7); op4; \
+ DST_2(WORD_ROTATE, 3, wrot_3); \
+ DST_2(WORD_ROTATE, 2, wrot_2); \
+ DST_2(WORD_ROTATE, 1, wrot_1);
+
+#define QUARTERROUND4_2(wrot_1,wrot_2,wrot_3) \
+ QUARTERROUND4_2_POLY(wrot_1,wrot_2,wrot_3,,,,)
+
+#define QUARTERROUND4_4_POLY(wrot_1,wrot_2,wrot_3,op1,op2,op3,op4,op5,op6) \
+ DST_SRC_4(PLUS, 0, 1); DST_SRC_4(XOR, 3, 0); op1; DST_4(ROTATE, 3, 16); \
+ DST_SRC_4(PLUS, 2, 3); op2; DST_SRC_4(XOR, 1, 2); DST_4(ROTATE, 1, 12); \
+ op3; DST_SRC_4(PLUS, 0, 1); DST_SRC_4(XOR, 3, 0); op4; DST_4(ROTATE, 3, 8); \
+ DST_SRC_4(PLUS, 2, 3); op5; DST_SRC_4(XOR, 1, 2); DST_4(ROTATE, 1, 7); \
+ op6; \
+ DST_4(WORD_ROTATE, 3, wrot_3); \
+ DST_4(WORD_ROTATE, 2, wrot_2); \
+ DST_4(WORD_ROTATE, 1, wrot_1);
+
+#define QUARTERROUND4_4(wrot_1,wrot_2,wrot_3) \
+ QUARTERROUND4_4_POLY(wrot_1,wrot_2,wrot_3,,,,,,)
+
+/**********************************************************************
+ 4-way && 2-way && 1-way chacha20 ("horizontal")
+ **********************************************************************/
+
+.balign 8
+.globl _gcry_chacha20_s390x_vx_blocks4_2_1
+ELF(.type _gcry_chacha20_s390x_vx_blocks4_2_1,@function;)
+
+_gcry_chacha20_s390x_vx_blocks4_2_1:
+ /* input:
+ * %r2: input
+ * %r3: dst
+ * %r4: src
+ * %r5: nblks
+ */
+ CFI_STARTPROC();
+
+ START_STACK(%r7);
+ lgr NBLKS, %r5;
+
+ /* Load constants. */
+ larl %r7, .Lconsts;
+ vl TMP0, (.Lwordswap - .Lconsts)(%r7);
+ vl TMP1, (.Lone - .Lconsts)(%r7);
+ vl TMP2, (.Lbswap128 - .Lconsts)(%r7);
+
+ /* Load state. */
+ vlm S0, S3, 0(INPUT);
+ vperm S0, S0, S0, TMP0;
+ vperm S1, S1, S1, TMP0;
+ vperm S2, S2, S2, TMP0;
+ vperm S3, S3, S3, TMP0;
+
+ clgijl NBLKS, 4, .Lloop2;
+
+.balign 4
+.Lloop4:
+ /* Process four chacha20 blocks. */
+ vlr TMP3, S3;
+ lghi ROUND, (20 / 2);
+ vlr A0, S0;
+ vlr A1, S1;
+ vlr A2, S2;
+ vlr A3, TMP3;
+ vag TMP3, TMP3, TMP1;
+ vlr B0, S0;
+ vlr B1, S1;
+ vlr B2, S2;
+ vlr B3, TMP3;
+ vag TMP3, TMP3, TMP1;
+ vlr C0, S0;
+ vlr C1, S1;
+ vlr C2, S2;
+ vlr C3, TMP3;
+ vlr D0, S0;
+ vlr D1, S1;
+ vlr D2, S2;
+ vag D3, TMP3, TMP1;
+
+ slgfi NBLKS, 4;
+
+.balign 4
+.Lround2_4:
+ QUARTERROUND4_4(3, 2, 1);
+ QUARTERROUND4_4(1, 2, 3);
+ brctg ROUND, .Lround2_4;
+
+ vlm IO0, IO7, 0(SRC);
+
+ PLUS(A0, S0);
+ PLUS(A1, S1);
+ PLUS(A2, S2);
+ PLUS(A3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+ PLUS(B0, S0);
+ PLUS(B1, S1);
+ PLUS(B2, S2);
+ PLUS(B3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+ vperm A0, A0, A0, TMP2;
+ vperm A1, A1, A1, TMP2;
+ vperm A2, A2, A2, TMP2;
+ vperm A3, A3, A3, TMP2;
+ vperm B0, B0, B0, TMP2;
+ vperm B1, B1, B1, TMP2;
+ vperm B2, B2, B2, TMP2;
+ vperm B3, B3, B3, TMP2;
+ PLUS(C0, S0);
+ PLUS(C1, S1);
+ PLUS(C2, S2);
+ PLUS(C3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+ PLUS(D0, S0);
+ PLUS(D1, S1);
+ PLUS(D2, S2);
+ PLUS(D3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+ vperm C0, C0, C0, TMP2;
+ vperm C1, C1, C1, TMP2;
+ vperm C2, C2, C2, TMP2;
+ vperm C3, C3, C3, TMP2;
+ vperm D0, D0, D0, TMP2;
+ vperm D1, D1, D1, TMP2;
+ vperm D2, D2, D2, TMP2;
+ vperm D3, D3, D3, TMP2;
+
+ XOR(IO0, A0);
+ XOR(IO1, A1);
+ XOR(IO2, A2);
+ XOR(IO3, A3);
+ XOR(IO4, B0);
+ XOR(IO5, B1);
+ XOR(IO6, B2);
+ XOR(IO7, B3);
+ vlm A0, B3, 128(SRC);
+ vstm IO0, IO7, 0(DST);
+ XOR(A0, C0);
+ XOR(A1, C1);
+ XOR(A2, C2);
+ XOR(A3, C3);
+ XOR(B0, D0);
+ XOR(B1, D1);
+ XOR(B2, D2);
+ XOR(B3, D3);
+ vstm A0, B3, 128(DST);
+
+ aghi SRC, 256;
+ aghi DST, 256;
+
+ clgijhe NBLKS, 4, .Lloop4;
+
+ CLEAR(C0);
+ CLEAR(C1);
+ CLEAR(C2);
+ CLEAR(C3);
+ CLEAR(D0);
+ CLEAR(D1);
+ CLEAR(D2);
+ CLEAR(D3);
+
+.balign 4
+.Lloop2:
+ clgijl NBLKS, 2, .Lloop1;
+
+ /* Process two chacha20 blocks. */
+ lghi ROUND, (20 / 2);
+ vlr A0, S0;
+ vlr A1, S1;
+ vlr A2, S2;
+ vlr A3, S3;
+ vlr B0, S0;
+ vlr B1, S1;
+ vlr B2, S2;
+ vag B3, S3, TMP1;
+
+ slgfi NBLKS, 2;
+
+.balign 4
+.Lround2_2:
+ QUARTERROUND4_2(3, 2, 1);
+ QUARTERROUND4_2(1, 2, 3);
+ brctg ROUND, .Lround2_2;
+
+ vlm IO0, IO7, 0(SRC);
+
+ PLUS(A0, S0);
+ PLUS(A1, S1);
+ PLUS(A2, S2);
+ PLUS(A3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+ PLUS(B0, S0);
+ PLUS(B1, S1);
+ PLUS(B2, S2);
+ PLUS(B3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+ vperm A0, A0, A0, TMP2;
+ vperm A1, A1, A1, TMP2;
+ vperm A2, A2, A2, TMP2;
+ vperm A3, A3, A3, TMP2;
+ vperm B0, B0, B0, TMP2;
+ vperm B1, B1, B1, TMP2;
+ vperm B2, B2, B2, TMP2;
+ vperm B3, B3, B3, TMP2;
+
+ XOR(IO0, A0);
+ XOR(IO1, A1);
+ XOR(IO2, A2);
+ XOR(IO3, A3);
+ XOR(IO4, B0);
+ XOR(IO5, B1);
+ XOR(IO6, B2);
+ XOR(IO7, B3);
+ vstm IO0, IO7, 0(DST);
+
+ aghi SRC, 128;
+ aghi DST, 128;
+
+ clgijhe NBLKS, 2, .Lloop2;
+
+ CLEAR(B0);
+ CLEAR(B1);
+ CLEAR(B2);
+ CLEAR(B3);
+
+.balign 4
+.Lloop1:
+ clgijl NBLKS, 1, .Ldone;
+
+ /* Process one chacha20 block.*/
+ lghi ROUND, (20 / 2);
+ vlr A0, S0;
+ vlr A1, S1;
+ vlr A2, S2;
+ vlr A3, S3;
+
+ slgfi NBLKS, 1;
+
+.balign 4
+.Lround2_1:
+ QUARTERROUND4(3, 2, 1);
+ QUARTERROUND4(1, 2, 3);
+ brct ROUND, .Lround2_1;
+
+ vlm IO0, IO3, 0(SRC);
+
+ PLUS(A0, S0);
+ PLUS(A1, S1);
+ PLUS(A2, S2);
+ PLUS(A3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+
+ vperm A0, A0, A0, TMP2;
+ vperm A1, A1, A1, TMP2;
+ vperm A2, A2, A2, TMP2;
+ vperm A3, A3, A3, TMP2;
+ XOR(IO0, A0);
+ XOR(IO1, A1);
+ XOR(IO2, A2);
+ XOR(IO3, A3);
+ vstm IO0, IO3, 0(DST);
+
+ aghi SRC, 64;
+ aghi DST, 64;
+
+ clgijhe NBLKS, 1, .Lloop1;
+
+.balign 4
+.Ldone:
+ /* Store counter. */
+ vperm S3, S3, S3, TMP0;
+ vst S3, (48)(INPUT);
+
+ /* Clear the used vector registers. */
+ CLEAR(A0);
+ CLEAR(A1);
+ CLEAR(A2);
+ CLEAR(A3);
+ CLEAR(IO0);
+ CLEAR(IO1);
+ CLEAR(IO2);
+ CLEAR(IO3);
+ CLEAR(IO4);
+ CLEAR(IO5);
+ CLEAR(IO6);
+ CLEAR(IO7);
+ CLEAR(TMP0);
+ CLEAR(TMP1);
+ CLEAR(TMP2);
+
+ END_STACK(%r7);
+ xgr %r2, %r2;
+ br %r14;
+ CFI_ENDPROC();
+ELF(.size _gcry_chacha20_s390x_vx_blocks4_2_1,
+ .-_gcry_chacha20_s390x_vx_blocks4_2_1;)
+
+/**********************************************************************
+ 4-way && 2-way && 1-way stitched chacha20-poly1305 ("horizontal")
+ **********************************************************************/
+
+.balign 8
+.globl _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1
+ELF(.type _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1,@function;)
+
+_gcry_chacha20_poly1305_s390x_vx_blocks4_2_1:
+ /* input:
+ * %r2: input
+ * %r3: dst
+ * %r4: src
+ * %r5: nblks
+ * %r6: poly1305 state
+ * 160(%r15): poly1305 src
+ */
+ CFI_STARTPROC();
+
+ START_STACK(%r14);
+ lgr NBLKS, %r5;
+
+ /* Load constants. */
+ larl %r8, .Lconsts;
+ vl TMP0, (.Lwordswap - .Lconsts)(%r8);
+ vl TMP1, (.Lone - .Lconsts)(%r8);
+ vl TMP2, (.Lbswap128 - .Lconsts)(%r8);
+
+ /* Load state. */
+ vlm S0, S3, 0(INPUT);
+ vperm S0, S0, S0, TMP0;
+ vperm S1, S1, S1, TMP0;
+ vperm S2, S2, S2, TMP0;
+ vperm S3, S3, S3, TMP0;
+
+ /* Store parameters to stack. */
+ stmg %r2, %r6, STACK_INPUT(%r15);
+
+ lgr POLY_RSTATE, %r6;
+ lgr NBLKS, %r5;
+
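+ /* The sixth argument (poly1305 src) is passed on the stack; 0(%r15)
+  * holds the caller's stack pointer saved by START_STACK, and the
+  * argument lives at offset 160 in that frame. */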
+ lg POLY_RSRC, 0(%r15);
+ lg POLY_RSRC, 160(POLY_RSRC);
+ stg POLY_RSRC, STACK_POSRC(%r15);
+
+ /* Load poly1305 state */
+ POLY1305_LOAD_STATE();
+
+ clgijl NBLKS, 4, .Lloop2_poly;
+
+.balign 4
+.Lloop4_poly:
+ /* Process four chacha20 blocks and 16 poly1305 blocks. */
+ vlr TMP3, S3;
+ lghi ROUND, (20 / 4);
+ vlr A0, S0;
+ vlr A1, S1;
+ vlr A2, S2;
+ vlr A3, TMP3;
+ vag TMP3, TMP3, TMP1;
+ vlr B0, S0;
+ vlr B1, S1;
+ vlr B2, S2;
+ vlr B3, TMP3;
+ vag TMP3, TMP3, TMP1;
+ vlr C0, S0;
+ vlr C1, S1;
+ vlr C2, S2;
+ vlr C3, TMP3;
+ vlr D0, S0;
+ vlr D1, S1;
+ vlr D2, S2;
+ vag D3, TMP3, TMP1;
+
+ slgfi NBLKS, 4;
+
+.balign 4
+.Lround4_4_poly:
+ /* Total 15 poly1305 blocks processed by this loop. */
+ QUARTERROUND4_4_POLY(3, 2, 1,
+ POLY1305_BLOCK_PART1(0 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6());
+ QUARTERROUND4_4_POLY(1, 2, 3,
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8(),
+ POLY1305_BLOCK_PART1(1 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4());
+ QUARTERROUND4_4_POLY(3, 2, 1,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6(),
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8(),
+ POLY1305_BLOCK_PART1(2 * 16);
+ INC_POLY1305_SRC(3 * 16),
+ POLY1305_BLOCK_PART2());
+ QUARTERROUND4_4_POLY(1, 2, 3,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6(),
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8());
+ brctg ROUND, .Lround4_4_poly;
+
+ POLY1305_BLOCK_PART1(0 * 16);
+ INC_POLY1305_SRC(1 * 16);
+ stg POLY_RSRC, STACK_POSRC(%r15);
+
+ lg %r14, STACK_SRC(%r15);
+ vlm IO0, IO7, 0(%r14);
+
+ PLUS(A0, S0);
+ PLUS(A1, S1);
+ PLUS(A2, S2);
+ PLUS(A3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+ POLY1305_BLOCK_PART2();
+ PLUS(B0, S0);
+ PLUS(B1, S1);
+ PLUS(B2, S2);
+ PLUS(B3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+ POLY1305_BLOCK_PART3();
+ vperm A0, A0, A0, TMP2;
+ vperm A1, A1, A1, TMP2;
+ vperm A2, A2, A2, TMP2;
+ vperm A3, A3, A3, TMP2;
+ vperm B0, B0, B0, TMP2;
+ vperm B1, B1, B1, TMP2;
+ vperm B2, B2, B2, TMP2;
+ vperm B3, B3, B3, TMP2;
+ POLY1305_BLOCK_PART4();
+ PLUS(C0, S0);
+ PLUS(C1, S1);
+ PLUS(C2, S2);
+ PLUS(C3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+ PLUS(D0, S0);
+ PLUS(D1, S1);
+ PLUS(D2, S2);
+ PLUS(D3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+ POLY1305_BLOCK_PART5();
+ vperm C0, C0, C0, TMP2;
+ vperm C1, C1, C1, TMP2;
+ vperm C2, C2, C2, TMP2;
+ vperm C3, C3, C3, TMP2;
+ vperm D0, D0, D0, TMP2;
+ vperm D1, D1, D1, TMP2;
+ vperm D2, D2, D2, TMP2;
+ vperm D3, D3, D3, TMP2;
+
+ POLY1305_BLOCK_PART6();
+ XOR(IO0, A0);
+ XOR(IO1, A1);
+ XOR(IO2, A2);
+ XOR(IO3, A3);
+ XOR(IO4, B0);
+ XOR(IO5, B1);
+ XOR(IO6, B2);
+ XOR(IO7, B3);
+ vlm A0, B3, 128(%r14);
+ aghi %r14, 256;
+ stg %r14, STACK_SRC(%r15);
+
+ lg %r14, STACK_DST(%r15);
+ POLY1305_BLOCK_PART7();
+ vstm IO0, IO7, 0(%r14);
+ XOR(A0, C0);
+ XOR(A1, C1);
+ XOR(A2, C2);
+ XOR(A3, C3);
+ XOR(B0, D0);
+ XOR(B1, D1);
+ XOR(B2, D2);
+ XOR(B3, D3);
+ POLY1305_BLOCK_PART8();
+ vstm A0, B3, 128(%r14);
+ aghi %r14, 256;
+ stg %r14, STACK_DST(%r15);
+
+ lg POLY_RSRC, STACK_POSRC(%r15);
+
+ clgijhe NBLKS, 4, .Lloop4_poly;
+
+ CLEAR(C0);
+ CLEAR(C1);
+ CLEAR(C2);
+ CLEAR(C3);
+ CLEAR(D0);
+ CLEAR(D1);
+ CLEAR(D2);
+ CLEAR(D3);
+
+.balign 4
+.Lloop2_poly:
+ clgijl NBLKS, 2, .Lloop1_poly;
+
+ /* Process two chacha20 and eight poly1305 blocks. */
+ lghi ROUND, ((20 - 4) / 2);
+ vlr A0, S0;
+ vlr A1, S1;
+ vlr A2, S2;
+ vlr A3, S3;
+ vlr B0, S0;
+ vlr B1, S1;
+ vlr B2, S2;
+ vag B3, S3, TMP1;
+
+ slgfi NBLKS, 2;
+
+.balign 4
+.Lround4_2_poly:
+ /* Total eight poly1305 blocks processed by this loop. */
+ QUARTERROUND4_2_POLY(3, 2, 1,
+ POLY1305_BLOCK_PART1(0 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4());
+ INC_POLY1305_SRC(1 * 16);
+ QUARTERROUND4_2_POLY(1, 2, 3,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6(),
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8());
+ brctg ROUND, .Lround4_2_poly;
+
+ stg POLY_RSRC, STACK_POSRC(%r15);
+ lg %r14, STACK_SRC(%r15);
+
+ QUARTERROUND4_2(3, 2, 1);
+ QUARTERROUND4_2(1, 2, 3);
+ QUARTERROUND4_2(3, 2, 1);
+ QUARTERROUND4_2(1, 2, 3);
+
+ vlm IO0, IO7, 0(%r14);
+ aghi %r14, 128;
+ stg %r14, STACK_SRC(%r15);
+
+ PLUS(A0, S0);
+ PLUS(A1, S1);
+ PLUS(A2, S2);
+ PLUS(A3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+ PLUS(B0, S0);
+ PLUS(B1, S1);
+ PLUS(B2, S2);
+ PLUS(B3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+ vperm A0, A0, A0, TMP2;
+ vperm A1, A1, A1, TMP2;
+ vperm A2, A2, A2, TMP2;
+ vperm A3, A3, A3, TMP2;
+ vperm B0, B0, B0, TMP2;
+ vperm B1, B1, B1, TMP2;
+ vperm B2, B2, B2, TMP2;
+ vperm B3, B3, B3, TMP2;
+
+ lg %r14, STACK_DST(%r15);
+ XOR(IO0, A0);
+ XOR(IO1, A1);
+ XOR(IO2, A2);
+ XOR(IO3, A3);
+ XOR(IO4, B0);
+ XOR(IO5, B1);
+ XOR(IO6, B2);
+ XOR(IO7, B3);
+ vstm IO0, IO7, 0(%r14);
+ aghi %r14, 128;
+ stg %r14, STACK_DST(%r15);
+
+ lg POLY_RSRC, STACK_POSRC(%r15);
+
+ clgijhe NBLKS, 2, .Lloop2_poly;
+
+ CLEAR(B0);
+ CLEAR(B1);
+ CLEAR(B2);
+ CLEAR(B3);
+
+.balign 4
+.Lloop1_poly:
+ clgijl NBLKS, 1, .Ldone_poly;
+
+ /* Process one chacha20 block and four poly1305 blocks.*/
+ lghi ROUND, ((20 - 4) / 4);
+ vlr A0, S0;
+ vlr A1, S1;
+ vlr A2, S2;
+ vlr A3, S3;
+
+ slgfi NBLKS, 1;
+
+.balign 4
+.Lround4_1_poly:
+ /* Total four poly1305 blocks processed by this loop. */
+ QUARTERROUND4_POLY(3, 2, 1,
+ POLY1305_BLOCK_PART1(0 * 16),
+ POLY1305_BLOCK_PART2());
+ INC_POLY1305_SRC(1 * 16);
+ QUARTERROUND4_POLY(1, 2, 3,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4());
+ QUARTERROUND4_POLY(3, 2, 1,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6());
+ QUARTERROUND4_POLY(1, 2, 3,
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8());
+ brct ROUND, .Lround4_1_poly;
+
+ stg POLY_RSRC, STACK_POSRC(%r15);
+ lg %r14, STACK_SRC(%r15);
+
+ QUARTERROUND4(3, 2, 1);
+ QUARTERROUND4(1, 2, 3);
+ QUARTERROUND4(3, 2, 1);
+ QUARTERROUND4(1, 2, 3);
+
+ vlm IO0, IO3, 0(%r14);
+ aghi %r14, 64;
+ stg %r14, STACK_SRC(%r15);
+
+ PLUS(A0, S0);
+ PLUS(A1, S1);
+ PLUS(A2, S2);
+ PLUS(A3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+
+ lg %r14, STACK_DST(%r15);
+ vperm A0, A0, A0, TMP2;
+ vperm A1, A1, A1, TMP2;
+ vperm A2, A2, A2, TMP2;
+ vperm A3, A3, A3, TMP2;
+ XOR(IO0, A0);
+ XOR(IO1, A1);
+ XOR(IO2, A2);
+ XOR(IO3, A3);
+ vstm IO0, IO3, 0(%r14);
+ aghi %r14, 64;
+ stg %r14, STACK_DST(%r15);
+
+ lg POLY_RSRC, STACK_POSRC(%r15);
+
+ clgijhe NBLKS, 1, .Lloop1_poly;
+
+.balign 4
+.Ldone_poly:
+ /* Store poly1305 state */
+ lg POLY_RSTATE, STACK_POCTX(%r15);
+ POLY1305_STORE_STATE();
+
+ /* Store counter. */
+ lg INPUT, STACK_INPUT(%r15);
+ vperm S3, S3, S3, TMP0;
+ vst S3, (48)(INPUT);
+
+ /* Clear the used vector registers. */
+ CLEAR(A0);
+ CLEAR(A1);
+ CLEAR(A2);
+ CLEAR(A3);
+ CLEAR(IO0);
+ CLEAR(IO1);
+ CLEAR(IO2);
+ CLEAR(IO3);
+ CLEAR(IO4);
+ CLEAR(IO5);
+ CLEAR(IO6);
+ CLEAR(IO7);
+ CLEAR(TMP0);
+ CLEAR(TMP1);
+ CLEAR(TMP2);
+
+ END_STACK(%r14);
+ xgr %r2, %r2;
+ br %r14;
+ CFI_ENDPROC();
+ELF(.size _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1,
+ .-_gcry_chacha20_poly1305_s390x_vx_blocks4_2_1;)
+
+/**********************************************************************
+ 8-way chacha20 ("vertical")
+ **********************************************************************/
+
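+/* "Vertical" processing: each vector register holds the same state word
+ * from four different blocks (X* for blocks 0-3, Y* for blocks 4-7), so a
+ * single pass over the round function advances eight blocks at once. */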
+#define QUARTERROUND4_V8_POLY(x0,x1,x2,x3,x4,x5,x6,x7,\
+ x8,x9,x10,x11,x12,x13,x14,x15,\
+ y0,y1,y2,y3,y4,y5,y6,y7,\
+ y8,y9,y10,y11,y12,y13,y14,y15,\
+ op1,op2,op3,op4,op5,op6,op7,op8,\
+ op9,op10,op11,op12) \
+ op1; \
+ PLUS(x0, x1); PLUS(x4, x5); \
+ PLUS(x8, x9); PLUS(x12, x13); \
+ PLUS(y0, y1); PLUS(y4, y5); \
+ PLUS(y8, y9); PLUS(y12, y13); \
+ op2; \
+ XOR(x3, x0); XOR(x7, x4); \
+ XOR(x11, x8); XOR(x15, x12); \
+ XOR(y3, y0); XOR(y7, y4); \
+ XOR(y11, y8); XOR(y15, y12); \
+ op3; \
+ ROTATE(x3, 16); ROTATE(x7, 16); \
+ ROTATE(x11, 16); ROTATE(x15, 16); \
+ ROTATE(y3, 16); ROTATE(y7, 16); \
+ ROTATE(y11, 16); ROTATE(y15, 16); \
+ op4; \
+ PLUS(x2, x3); PLUS(x6, x7); \
+ PLUS(x10, x11); PLUS(x14, x15); \
+ PLUS(y2, y3); PLUS(y6, y7); \
+ PLUS(y10, y11); PLUS(y14, y15); \
+ op5; \
+ XOR(x1, x2); XOR(x5, x6); \
+ XOR(x9, x10); XOR(x13, x14); \
+ XOR(y1, y2); XOR(y5, y6); \
+ XOR(y9, y10); XOR(y13, y14); \
+ op6; \
+ ROTATE(x1,12); ROTATE(x5,12); \
+ ROTATE(x9,12); ROTATE(x13,12); \
+ ROTATE(y1,12); ROTATE(y5,12); \
+ ROTATE(y9,12); ROTATE(y13,12); \
+ op7; \
+ PLUS(x0, x1); PLUS(x4, x5); \
+ PLUS(x8, x9); PLUS(x12, x13); \
+ PLUS(y0, y1); PLUS(y4, y5); \
+ PLUS(y8, y9); PLUS(y12, y13); \
+ op8; \
+ XOR(x3, x0); XOR(x7, x4); \
+ XOR(x11, x8); XOR(x15, x12); \
+ XOR(y3, y0); XOR(y7, y4); \
+ XOR(y11, y8); XOR(y15, y12); \
+ op9; \
+ ROTATE(x3,8); ROTATE(x7,8); \
+ ROTATE(x11,8); ROTATE(x15,8); \
+ ROTATE(y3,8); ROTATE(y7,8); \
+ ROTATE(y11,8); ROTATE(y15,8); \
+ op10; \
+ PLUS(x2, x3); PLUS(x6, x7); \
+ PLUS(x10, x11); PLUS(x14, x15); \
+ PLUS(y2, y3); PLUS(y6, y7); \
+ PLUS(y10, y11); PLUS(y14, y15); \
+ op11; \
+ XOR(x1, x2); XOR(x5, x6); \
+ XOR(x9, x10); XOR(x13, x14); \
+ XOR(y1, y2); XOR(y5, y6); \
+ XOR(y9, y10); XOR(y13, y14); \
+ op12; \
+ ROTATE(x1,7); ROTATE(x5,7); \
+ ROTATE(x9,7); ROTATE(x13,7); \
+ ROTATE(y1,7); ROTATE(y5,7); \
+ ROTATE(y9,7); ROTATE(y13,7);
+
+#define QUARTERROUND4_V8(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,\
+ y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15) \
+ QUARTERROUND4_V8_POLY(x0,x1,x2,x3,x4,x5,x6,x7,\
+ x8,x9,x10,x11,x12,x13,x14,x15,\
+ y0,y1,y2,y3,y4,y5,y6,y7,\
+ y8,y9,y10,y11,y12,y13,y14,y15,\
+ ,,,,,,,,,,,)
+
+#define TRANSPOSE_4X4_2(v0,v1,v2,v3,va,vb,vc,vd,tmp0,tmp1,tmp2,tmpa,tmpb,tmpc) \
+ vmrhf tmp0, v0, v1; \
+ vmrhf tmp1, v2, v3; \
+ vmrlf tmp2, v0, v1; \
+ vmrlf v3, v2, v3; \
+ vmrhf tmpa, va, vb; \
+ vmrhf tmpb, vc, vd; \
+ vmrlf tmpc, va, vb; \
+ vmrlf vd, vc, vd; \
+ vpdi v0, tmp0, tmp1, 0; \
+ vpdi v1, tmp0, tmp1, 5; \
+ vpdi v2, tmp2, v3, 0; \
+ vpdi v3, tmp2, v3, 5; \
+ vpdi va, tmpa, tmpb, 0; \
+ vpdi vb, tmpa, tmpb, 5; \
+ vpdi vc, tmpc, vd, 0; \
+ vpdi vd, tmpc, vd, 5;
+
+.balign 8
+.globl _gcry_chacha20_s390x_vx_blocks8
+ELF(.type _gcry_chacha20_s390x_vx_blocks8,@function;)
+
+_gcry_chacha20_s390x_vx_blocks8:
+ /* input:
+ * %r2: input
+ * %r3: dst
+ * %r4: src
+ * %r5: nblks (multiple of 8)
+ */
+ CFI_STARTPROC();
+
+ START_STACK(%r8);
+ lgr NBLKS, %r5;
+
+ larl %r7, .Lconsts;
+
+ /* Load counter. */
+ lg %r8, (12 * 4)(INPUT);
+ rllg %r8, %r8, 32;
+
+.balign 4
+ /* Process eight chacha20 blocks per loop. */
+.Lloop8:
+ vlm Y0, Y3, 0(INPUT);
+
+ slgfi NBLKS, 8;
+ lghi ROUND, (20 / 2);
+
+ /* Construct counter vectors X12/X13 & Y12/Y13. */
+ vl X4, (.Ladd_counter_0123 - .Lconsts)(%r7);
+ vl Y4, (.Ladd_counter_4567 - .Lconsts)(%r7);
+ vrepf Y12, Y3, 0;
+ vrepf Y13, Y3, 1;
+ vaccf X5, Y12, X4;
+ vaccf Y5, Y12, Y4;
+ vaf X12, Y12, X4;
+ vaf Y12, Y12, Y4;
+ vaf X13, Y13, X5;
+ vaf Y13, Y13, Y5;
+
+ vrepf X0, Y0, 0;
+ vrepf X1, Y0, 1;
+ vrepf X2, Y0, 2;
+ vrepf X3, Y0, 3;
+ vrepf X4, Y1, 0;
+ vrepf X5, Y1, 1;
+ vrepf X6, Y1, 2;
+ vrepf X7, Y1, 3;
+ vrepf X8, Y2, 0;
+ vrepf X9, Y2, 1;
+ vrepf X10, Y2, 2;
+ vrepf X11, Y2, 3;
+ vrepf X14, Y3, 2;
+ vrepf X15, Y3, 3;
+
+ /* Store counters for blocks 0-7. */
+ vstm X12, X13, (STACK_CTR + 0 * 16)(%r15);
+ vstm Y12, Y13, (STACK_CTR + 2 * 16)(%r15);
+
+ vlr Y0, X0;
+ vlr Y1, X1;
+ vlr Y2, X2;
+ vlr Y3, X3;
+ vlr Y4, X4;
+ vlr Y5, X5;
+ vlr Y6, X6;
+ vlr Y7, X7;
+ vlr Y8, X8;
+ vlr Y9, X9;
+ vlr Y10, X10;
+ vlr Y11, X11;
+ vlr Y14, X14;
+ vlr Y15, X15;
+
+ /* Update and store counter. */
+ agfi %r8, 8;
+ rllg %r5, %r8, 32;
+ stg %r5, (12 * 4)(INPUT);
+
+.balign 4
+.Lround2_8:
+ QUARTERROUND4_V8(X0, X4, X8, X12, X1, X5, X9, X13,
+ X2, X6, X10, X14, X3, X7, X11, X15,
+ Y0, Y4, Y8, Y12, Y1, Y5, Y9, Y13,
+ Y2, Y6, Y10, Y14, Y3, Y7, Y11, Y15);
+ QUARTERROUND4_V8(X0, X5, X10, X15, X1, X6, X11, X12,
+ X2, X7, X8, X13, X3, X4, X9, X14,
+ Y0, Y5, Y10, Y15, Y1, Y6, Y11, Y12,
+ Y2, Y7, Y8, Y13, Y3, Y4, Y9, Y14);
+ brctg ROUND, .Lround2_8;
+
+ /* Store blocks 4-7. */
+ vstm Y0, Y15, STACK_Y0_Y15(%r15);
+
+ /* Load counters for blocks 0-3. */
+ vlm Y0, Y1, (STACK_CTR + 0 * 16)(%r15);
+
+ lghi ROUND, 1;
+ j .Lfirst_output_4blks_8;
+
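+/* Blocks 0-3 are finalized from the X registers first; ROUND is reused as a
+ * flag so that the same output code runs a second time for blocks 4-7, which
+ * were saved to the stack above. */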
+.balign 4
+.Lsecond_output_4blks_8:
+ /* Load blocks 4-7. */
+ vlm X0, X15, STACK_Y0_Y15(%r15);
+
+ /* Load counters for blocks 4-7. */
+ vlm Y0, Y1, (STACK_CTR + 2 * 16)(%r15);
+
+ lghi ROUND, 0;
+
+.balign 4
+ /* Output four chacha20 blocks per loop. */
+.Lfirst_output_4blks_8:
+ vlm Y12, Y15, 0(INPUT);
+ PLUS(X12, Y0);
+ PLUS(X13, Y1);
+ vrepf Y0, Y12, 0;
+ vrepf Y1, Y12, 1;
+ vrepf Y2, Y12, 2;
+ vrepf Y3, Y12, 3;
+ vrepf Y4, Y13, 0;
+ vrepf Y5, Y13, 1;
+ vrepf Y6, Y13, 2;
+ vrepf Y7, Y13, 3;
+ vrepf Y8, Y14, 0;
+ vrepf Y9, Y14, 1;
+ vrepf Y10, Y14, 2;
+ vrepf Y11, Y14, 3;
+ vrepf Y14, Y15, 2;
+ vrepf Y15, Y15, 3;
+ PLUS(X0, Y0);
+ PLUS(X1, Y1);
+ PLUS(X2, Y2);
+ PLUS(X3, Y3);
+ PLUS(X4, Y4);
+ PLUS(X5, Y5);
+ PLUS(X6, Y6);
+ PLUS(X7, Y7);
+ PLUS(X8, Y8);
+ PLUS(X9, Y9);
+ PLUS(X10, Y10);
+ PLUS(X11, Y11);
+ PLUS(X14, Y14);
+ PLUS(X15, Y15);
+
+ vl Y15, (.Lbswap32 - .Lconsts)(%r7);
+ TRANSPOSE_4X4_2(X0, X1, X2, X3, X4, X5, X6, X7,
+ Y9, Y10, Y11, Y12, Y13, Y14);
+ TRANSPOSE_4X4_2(X8, X9, X10, X11, X12, X13, X14, X15,
+ Y9, Y10, Y11, Y12, Y13, Y14);
+
+ vlm Y0, Y14, 0(SRC);
+ vperm X0, X0, X0, Y15;
+ vperm X1, X1, X1, Y15;
+ vperm X2, X2, X2, Y15;
+ vperm X3, X3, X3, Y15;
+ vperm X4, X4, X4, Y15;
+ vperm X5, X5, X5, Y15;
+ vperm X6, X6, X6, Y15;
+ vperm X7, X7, X7, Y15;
+ vperm X8, X8, X8, Y15;
+ vperm X9, X9, X9, Y15;
+ vperm X10, X10, X10, Y15;
+ vperm X11, X11, X11, Y15;
+ vperm X12, X12, X12, Y15;
+ vperm X13, X13, X13, Y15;
+ vperm X14, X14, X14, Y15;
+ vperm X15, X15, X15, Y15;
+ vl Y15, (15 * 16)(SRC);
+
+ XOR(Y0, X0);
+ XOR(Y1, X4);
+ XOR(Y2, X8);
+ XOR(Y3, X12);
+ XOR(Y4, X1);
+ XOR(Y5, X5);
+ XOR(Y6, X9);
+ XOR(Y7, X13);
+ XOR(Y8, X2);
+ XOR(Y9, X6);
+ XOR(Y10, X10);
+ XOR(Y11, X14);
+ XOR(Y12, X3);
+ XOR(Y13, X7);
+ XOR(Y14, X11);
+ XOR(Y15, X15);
+ vstm Y0, Y15, 0(DST);
+
+ aghi SRC, 256;
+ aghi DST, 256;
+
+ clgije ROUND, 1, .Lsecond_output_4blks_8;
+
+ clgijhe NBLKS, 8, .Lloop8;
+
+ /* Clear the used vector registers. */
+ DST_8(CLEAR, 0, _);
+ DST_8(CLEAR, 1, _);
+ DST_8(CLEAR, 2, _);
+ DST_8(CLEAR, 3, _);
+
+ /* Clear sensitive data in stack. */
+ vlm Y0, Y15, STACK_Y0_Y15(%r15);
+ vlm Y0, Y3, STACK_CTR(%r15);
+
+ END_STACK(%r8);
+ xgr %r2, %r2;
+ br %r14;
+ CFI_ENDPROC();
+ELF(.size _gcry_chacha20_s390x_vx_blocks8,
+ .-_gcry_chacha20_s390x_vx_blocks8;)
+
+/**********************************************************************
+ 8-way stitched chacha20-poly1305 ("vertical")
+ **********************************************************************/
+
+.balign 8
+.globl _gcry_chacha20_poly1305_s390x_vx_blocks8
+ELF(.type _gcry_chacha20_poly1305_s390x_vx_blocks8,@function;)
+
+_gcry_chacha20_poly1305_s390x_vx_blocks8:
+ /* input:
+ * %r2: input
+ * %r3: dst
+ * %r4: src
+ * %r5: nblks (multiple of 8)
+ * %r6: poly1305 state
+ * 160(%r15): poly1305 src
+ */
+ CFI_STARTPROC();
+
+ START_STACK(%r14);
+
+ /* Store parameters to stack. */
+ stmg %r2, %r6, STACK_INPUT(%r15);
+
+ lgr POLY_RSTATE, %r6;
+ lgr NBLKS, %r5;
+
+ lg POLY_RSRC, 0(%r15);
+ lg POLY_RSRC, 160(POLY_RSRC);
+ stg POLY_RSRC, STACK_POSRC(%r15);
+
+ /* Load poly1305 state */
+ POLY1305_LOAD_STATE();
+
+.balign 4
+ /* Process eight chacha20 blocks and 32 poly1305 blocks per loop. */
+.Lloop8_poly:
+ lg INPUT, STACK_INPUT(%r15);
+ larl %r8, .Lconsts;
+
+ vlm Y0, Y3, 0(INPUT);
+
+ slgfi NBLKS, 8;
+ lghi ROUND, (20 / 2);
+
+ /* Construct counter vectors X12/X13 & Y12/Y13. */
+ vl X4, (.Ladd_counter_0123 - .Lconsts)(%r8);
+ vl Y4, (.Ladd_counter_4567 - .Lconsts)(%r8);
+ lg %r8, (12 * 4)(INPUT); /* Update counter. */
+ vrepf Y12, Y3, 0;
+ vrepf Y13, Y3, 1;
+ vaccf X5, Y12, X4;
+ vaccf Y5, Y12, Y4;
+ vaf X12, Y12, X4;
+ vaf Y12, Y12, Y4;
+ vaf X13, Y13, X5;
+ vaf Y13, Y13, Y5;
+ rllg %r8, %r8, 32;
+
+ vrepf X0, Y0, 0;
+ vrepf X1, Y0, 1;
+ vrepf X2, Y0, 2;
+ vrepf X3, Y0, 3;
+ vrepf X4, Y1, 0;
+ vrepf X5, Y1, 1;
+ vrepf X6, Y1, 2;
+ vrepf X7, Y1, 3;
+ vrepf X8, Y2, 0;
+ vrepf X9, Y2, 1;
+ vrepf X10, Y2, 2;
+ vrepf X11, Y2, 3;
+ vrepf X14, Y3, 2;
+ vrepf X15, Y3, 3;
+ agfi %r8, 8;
+
+ /* Store counters for blocks 0-7. */
+ vstm X12, X13, (STACK_CTR + 0 * 16)(%r15);
+ vstm Y12, Y13, (STACK_CTR + 2 * 16)(%r15);
+ rllg %r8, %r8, 32;
+
+ vlr Y0, X0;
+ vlr Y1, X1;
+ vlr Y2, X2;
+ vlr Y3, X3;
+ vlr Y4, X4;
+ vlr Y5, X5;
+ vlr Y6, X6;
+ vlr Y7, X7;
+ vlr Y8, X8;
+ vlr Y9, X9;
+ vlr Y10, X10;
+ vlr Y11, X11;
+ vlr Y14, X14;
+ vlr Y15, X15;
+ stg %r8, (12 * 4)(INPUT);
+
+.balign 4
+.Lround2_8_poly:
+ /* Total 30 poly1305 blocks processed by this loop. */
+ QUARTERROUND4_V8_POLY(X0, X4, X8, X12, X1, X5, X9, X13,
+ X2, X6, X10, X14, X3, X7, X11, X15,
+ Y0, Y4, Y8, Y12, Y1, Y5, Y9, Y13,
+ Y2, Y6, Y10, Y14, Y3, Y7, Y11, Y15,
+ POLY1305_BLOCK_PART1(0 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6(),
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8(),
+ POLY1305_BLOCK_PART1(1 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4());
+ QUARTERROUND4_V8_POLY(X0, X5, X10, X15, X1, X6, X11, X12,
+ X2, X7, X8, X13, X3, X4, X9, X14,
+ Y0, Y5, Y10, Y15, Y1, Y6, Y11, Y12,
+ Y2, Y7, Y8, Y13, Y3, Y4, Y9, Y14,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6(),
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8(),
+ POLY1305_BLOCK_PART1(2 * 16);
+ INC_POLY1305_SRC(3 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6(),
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8());
+ brctg ROUND, .Lround2_8_poly;
+
+ POLY1305_BLOCK_PART1(0 * 16);
+
+ /* Store blocks 4-7. */
+ vstm Y0, Y15, STACK_Y0_Y15(%r15);
+
+ /* Load counters for blocks 0-3. */
+ vlm Y0, Y1, (STACK_CTR + 0 * 16)(%r15);
+
+ stg POLY_RSRC, STACK_POSRC(%r15); /* %r14 used for INPUT/SRC/DST pointer. */
+
+ lghi ROUND, 1;
+ j .Lfirst_output_4blks_8_poly;
+
+.balign 4
+.Lsecond_output_4blks_8_poly:
+
+ POLY1305_BLOCK_PART1(1 * 16);
+
+ /* Load blocks 4-7. */
+ vlm X0, X15, STACK_Y0_Y15(%r15);
+
+ /* Load counters for blocks 4-7. */
+ vlm Y0, Y1, (STACK_CTR + 2 * 16)(%r15);
+
+ INC_POLY1305_SRC(2 * 16);
+ stg POLY_RSRC, STACK_POSRC(%r15); /* %r14 used for INPUT/SRC/DST pointer. */
+
+ lghi ROUND, 0;
+
+.balign 4
+ /* Output four chacha20 blocks and one poly1305 block per loop. */
+.Lfirst_output_4blks_8_poly:
+ lg %r14, STACK_INPUT(%r15);
+ vlm Y12, Y15, 0(%r14);
+ POLY1305_BLOCK_PART2();
+ PLUS(X12, Y0);
+ PLUS(X13, Y1);
+ vrepf Y0, Y12, 0;
+ vrepf Y1, Y12, 1;
+ vrepf Y2, Y12, 2;
+ vrepf Y3, Y12, 3;
+ vrepf Y4, Y13, 0;
+ vrepf Y5, Y13, 1;
+ vrepf Y6, Y13, 2;
+ vrepf Y7, Y13, 3;
+ vrepf Y8, Y14, 0;
+ vrepf Y9, Y14, 1;
+ vrepf Y10, Y14, 2;
+ vrepf Y11, Y14, 3;
+ vrepf Y14, Y15, 2;
+ vrepf Y15, Y15, 3;
+ POLY1305_BLOCK_PART3();
+ PLUS(X0, Y0);
+ PLUS(X1, Y1);
+ PLUS(X2, Y2);
+ PLUS(X3, Y3);
+ PLUS(X4, Y4);
+ PLUS(X5, Y5);
+ PLUS(X6, Y6);
+ PLUS(X7, Y7);
+ PLUS(X8, Y8);
+ PLUS(X9, Y9);
+ PLUS(X10, Y10);
+ PLUS(X11, Y11);
+ PLUS(X14, Y14);
+ PLUS(X15, Y15);
+ POLY1305_BLOCK_PART4();
+
+ larl %r14, .Lconsts;
+ vl Y15, (.Lbswap32 - .Lconsts)(%r14);
+ TRANSPOSE_4X4_2(X0, X1, X2, X3, X4, X5, X6, X7,
+ Y9, Y10, Y11, Y12, Y13, Y14);
+ lg %r14, STACK_SRC(%r15);
+ POLY1305_BLOCK_PART5();
+ TRANSPOSE_4X4_2(X8, X9, X10, X11, X12, X13, X14, X15,
+ Y9, Y10, Y11, Y12, Y13, Y14);
+
+ vlm Y0, Y14, 0(%r14);
+ POLY1305_BLOCK_PART6();
+ vperm X0, X0, X0, Y15;
+ vperm X1, X1, X1, Y15;
+ vperm X2, X2, X2, Y15;
+ vperm X3, X3, X3, Y15;
+ vperm X4, X4, X4, Y15;
+ vperm X5, X5, X5, Y15;
+ vperm X6, X6, X6, Y15;
+ vperm X7, X7, X7, Y15;
+ vperm X8, X8, X8, Y15;
+ vperm X9, X9, X9, Y15;
+ vperm X10, X10, X10, Y15;
+ vperm X11, X11, X11, Y15;
+ vperm X12, X12, X12, Y15;
+ vperm X13, X13, X13, Y15;
+ vperm X14, X14, X14, Y15;
+ vperm X15, X15, X15, Y15;
+ vl Y15, (15 * 16)(%r14);
+ POLY1305_BLOCK_PART7();
+
+ aghi %r14, 256;
+ stg %r14, STACK_SRC(%r15);
+ lg %r14, STACK_DST(%r15);
+
+ XOR(Y0, X0);
+ XOR(Y1, X4);
+ XOR(Y2, X8);
+ XOR(Y3, X12);
+ XOR(Y4, X1);
+ XOR(Y5, X5);
+ XOR(Y6, X9);
+ XOR(Y7, X13);
+ XOR(Y8, X2);
+ XOR(Y9, X6);
+ XOR(Y10, X10);
+ XOR(Y11, X14);
+ XOR(Y12, X3);
+ XOR(Y13, X7);
+ XOR(Y14, X11);
+ XOR(Y15, X15);
+ POLY1305_BLOCK_PART8();
+ vstm Y0, Y15, 0(%r14);
+
+ aghi %r14, 256;
+ stg %r14, STACK_DST(%r15);
+
+ lg POLY_RSRC, STACK_POSRC(%r15);
+
+ clgije ROUND, 1, .Lsecond_output_4blks_8_poly;
+
+ clgijhe NBLKS, 8, .Lloop8_poly;
+
+ /* Store poly1305 state */
+ lg POLY_RSTATE, STACK_POCTX(%r15);
+ POLY1305_STORE_STATE();
+
+ /* Clear the used vector registers */
+ DST_8(CLEAR, 0, _);
+ DST_8(CLEAR, 1, _);
+ DST_8(CLEAR, 2, _);
+ DST_8(CLEAR, 3, _);
+
+ /* Clear sensitive data in stack. */
+ vlm Y0, Y15, STACK_Y0_Y15(%r15);
+ vlm Y0, Y3, STACK_CTR(%r15);
+
+ END_STACK(%r14);
+ xgr %r2, %r2;
+ br %r14;
+ CFI_ENDPROC();
+ELF(.size _gcry_chacha20_poly1305_s390x_vx_blocks8,
+ .-_gcry_chacha20_poly1305_s390x_vx_blocks8;)
+
+#endif /*HAVE_GCC_INLINE_ASM_S390X_VX*/
+#endif /*__s390x__*/
diff --git a/comm/third_party/libgcrypt/cipher/chacha20.c b/comm/third_party/libgcrypt/cipher/chacha20.c
new file mode 100644
index 0000000000..497594a0bb
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/chacha20.c
@@ -0,0 +1,1306 @@
+/* chacha20.c - Bernstein's ChaCha20 cipher
+ * Copyright (C) 2014,2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * For a description of the algorithm, see:
+ * http://cr.yp.to/chacha.html
+ */
+
+/*
+ * Based on D. J. Bernstein reference implementation at
+ * http://cr.yp.to/chacha.html:
+ *
+ * chacha-regs.c version 20080118
+ * D. J. Bernstein
+ * Public domain.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "types.h"
+#include "g10lib.h"
+#include "cipher.h"
+#include "cipher-internal.h"
+#include "bufhelp.h"
+
+
+#define CHACHA20_MIN_KEY_SIZE 16 /* Bytes. */
+#define CHACHA20_MAX_KEY_SIZE 32 /* Bytes. */
+#define CHACHA20_BLOCK_SIZE 64 /* Bytes. */
+#define CHACHA20_MIN_IV_SIZE 8 /* Bytes. */
+#define CHACHA20_MAX_IV_SIZE 12 /* Bytes. */
+#define CHACHA20_CTR_SIZE 16 /* Bytes. */
+
+
+/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
+#undef USE_SSSE3
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_SSSE3 1
+#endif
+
+/* USE_AVX2 indicates whether to compile with Intel AVX2 code. */
+#undef USE_AVX2
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX2 1
+#endif
+
+/* USE_ARMV7_NEON indicates whether to enable ARMv7 NEON assembly code. */
+#undef USE_ARMV7_NEON
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_NEON)
+# define USE_ARMV7_NEON 1
+# endif
+#endif
+
+/* USE_AARCH64_SIMD indicates whether to enable ARMv8 SIMD assembly
+ * code. */
+#undef USE_AARCH64_SIMD
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(__AARCH64EL__) \
+ && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON)
+# define USE_AARCH64_SIMD 1
+# endif
+#endif
+
+/* USE_PPC_VEC indicates whether to enable PowerPC vector
+ * accelerated code. */
+#undef USE_PPC_VEC
+#ifdef ENABLE_PPC_CRYPTO_SUPPORT
+# if defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+ defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC)
+# if __GNUC__ >= 4
+# define USE_PPC_VEC 1
+# endif
+# endif
+#endif
+
+/* USE_S390X_VX indicates whether to enable zSeries code. */
+#undef USE_S390X_VX
+#if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9
+# if defined(HAVE_GCC_INLINE_ASM_S390X_VX)
+# define USE_S390X_VX 1
+# endif /* USE_S390X_VX */
+#endif
+
+/* Assembly implementations use the SystemV ABI; on Win64 an ABI conversion
+ * and additional stack space to store XMM6-XMM15 are needed. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+#else
+# define ASM_FUNC_ABI
+#endif
+
+
+typedef struct CHACHA20_context_s
+{
+ u32 input[16];
+ unsigned char pad[CHACHA20_BLOCK_SIZE];
+ unsigned int unused; /* bytes in the pad. */
+ unsigned int use_ssse3:1;
+ unsigned int use_avx2:1;
+ unsigned int use_neon:1;
+ unsigned int use_ppc:1;
+ unsigned int use_s390x:1;
+} CHACHA20_context_t;
+
+
+#ifdef USE_SSSE3
+
+unsigned int _gcry_chacha20_amd64_ssse3_blocks4(u32 *state, byte *dst,
+ const byte *src,
+ size_t nblks) ASM_FUNC_ABI;
+
+unsigned int _gcry_chacha20_amd64_ssse3_blocks1(u32 *state, byte *dst,
+ const byte *src,
+ size_t nblks) ASM_FUNC_ABI;
+
+unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks4(
+ u32 *state, byte *dst, const byte *src, size_t nblks,
+ void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI;
+
+unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks1(
+ u32 *state, byte *dst, const byte *src, size_t nblks,
+ void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI;
+
+#endif /* USE_SSSE3 */
+
+#ifdef USE_AVX2
+
+unsigned int _gcry_chacha20_amd64_avx2_blocks8(u32 *state, byte *dst,
+ const byte *src,
+ size_t nblks) ASM_FUNC_ABI;
+
+unsigned int _gcry_chacha20_poly1305_amd64_avx2_blocks8(
+ u32 *state, byte *dst, const byte *src, size_t nblks,
+ void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI;
+
+#endif /* USE_AVX2 */
+
+#ifdef USE_PPC_VEC
+
+unsigned int _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst,
+ const byte *src,
+ size_t nblks);
+
+unsigned int _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst,
+ const byte *src,
+ size_t nblks);
+
+#undef USE_PPC_VEC_POLY1305
+#if SIZEOF_UNSIGNED_LONG == 8
+#define USE_PPC_VEC_POLY1305 1
+unsigned int _gcry_chacha20_poly1305_ppc8_blocks4(
+ u32 *state, byte *dst, const byte *src, size_t nblks,
+ POLY1305_STATE *st, const byte *poly1305_src);
+#endif /* SIZEOF_UNSIGNED_LONG == 8 */
+
+#endif /* USE_PPC_VEC */
+
+#ifdef USE_S390X_VX
+
+unsigned int _gcry_chacha20_s390x_vx_blocks8(u32 *state, byte *dst,
+ const byte *src, size_t nblks);
+
+unsigned int _gcry_chacha20_s390x_vx_blocks4_2_1(u32 *state, byte *dst,
+ const byte *src, size_t nblks);
+
+#undef USE_S390X_VX_POLY1305
+#if SIZEOF_UNSIGNED_LONG == 8
+#define USE_S390X_VX_POLY1305 1
+unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks8(
+ u32 *state, byte *dst, const byte *src, size_t nblks,
+ POLY1305_STATE *st, const byte *poly1305_src);
+
+unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1(
+ u32 *state, byte *dst, const byte *src, size_t nblks,
+ POLY1305_STATE *st, const byte *poly1305_src);
+#endif /* SIZEOF_UNSIGNED_LONG == 8 */
+
+#endif /* USE_S390X_VX */
+
+#ifdef USE_ARMV7_NEON
+
+unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst,
+ const byte *src,
+ size_t nblks);
+
+#endif /* USE_ARMV7_NEON */
+
+#ifdef USE_AARCH64_SIMD
+
+unsigned int _gcry_chacha20_aarch64_blocks4(u32 *state, byte *dst,
+ const byte *src, size_t nblks);
+
+unsigned int _gcry_chacha20_poly1305_aarch64_blocks4(
+ u32 *state, byte *dst, const byte *src, size_t nblks,
+ void *poly1305_state, const byte *poly1305_src);
+
+#endif /* USE_AARCH64_SIMD */
+
+
+static const char *selftest (void);
+
+
+#define ROTATE(v,c) (rol(v,c))
+#define XOR(v,w) ((v) ^ (w))
+#define PLUS(v,w) ((u32)((v) + (w)))
+#define PLUSONE(v) (PLUS((v),1))
+
+#define QUARTERROUND(a,b,c,d) \
+ a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \
+ c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \
+ a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \
+ c = PLUS(c,d); b = ROTATE(XOR(b,c), 7);
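+
+/* One ChaCha20 quarter-round: four add/xor/rotate steps with rotations of
+ * 16, 12, 8 and 7 bits.  do_chacha20_blocks() below runs ten double rounds
+ * (four column rounds followed by four diagonal rounds) for 20 rounds. */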
+
+#define BUF_XOR_LE32(dst, src, offset, x) \
+ buf_put_le32((dst) + (offset), buf_get_le32((src) + (offset)) ^ (x))
+
+static unsigned int
+do_chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks)
+{
+ u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+ unsigned int i;
+
+ while (nblks)
+ {
+ x0 = input[0];
+ x1 = input[1];
+ x2 = input[2];
+ x3 = input[3];
+ x4 = input[4];
+ x5 = input[5];
+ x6 = input[6];
+ x7 = input[7];
+ x8 = input[8];
+ x9 = input[9];
+ x10 = input[10];
+ x11 = input[11];
+ x12 = input[12];
+ x13 = input[13];
+ x14 = input[14];
+ x15 = input[15];
+
+ for (i = 20; i > 0; i -= 2)
+ {
+ QUARTERROUND(x0, x4, x8, x12)
+ QUARTERROUND(x1, x5, x9, x13)
+ QUARTERROUND(x2, x6, x10, x14)
+ QUARTERROUND(x3, x7, x11, x15)
+ QUARTERROUND(x0, x5, x10, x15)
+ QUARTERROUND(x1, x6, x11, x12)
+ QUARTERROUND(x2, x7, x8, x13)
+ QUARTERROUND(x3, x4, x9, x14)
+ }
+
+ x0 = PLUS(x0, input[0]);
+ x1 = PLUS(x1, input[1]);
+ x2 = PLUS(x2, input[2]);
+ x3 = PLUS(x3, input[3]);
+ x4 = PLUS(x4, input[4]);
+ x5 = PLUS(x5, input[5]);
+ x6 = PLUS(x6, input[6]);
+ x7 = PLUS(x7, input[7]);
+ x8 = PLUS(x8, input[8]);
+ x9 = PLUS(x9, input[9]);
+ x10 = PLUS(x10, input[10]);
+ x11 = PLUS(x11, input[11]);
+ x12 = PLUS(x12, input[12]);
+ x13 = PLUS(x13, input[13]);
+ x14 = PLUS(x14, input[14]);
+ x15 = PLUS(x15, input[15]);
+
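+      /* Words 12 and 13 form the 64-bit block counter; the second line
+         propagates the carry when the low word wraps around to zero. */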
+ input[12] = PLUSONE(input[12]);
+ input[13] = PLUS(input[13], !input[12]);
+
+ BUF_XOR_LE32(dst, src, 0, x0);
+ BUF_XOR_LE32(dst, src, 4, x1);
+ BUF_XOR_LE32(dst, src, 8, x2);
+ BUF_XOR_LE32(dst, src, 12, x3);
+ BUF_XOR_LE32(dst, src, 16, x4);
+ BUF_XOR_LE32(dst, src, 20, x5);
+ BUF_XOR_LE32(dst, src, 24, x6);
+ BUF_XOR_LE32(dst, src, 28, x7);
+ BUF_XOR_LE32(dst, src, 32, x8);
+ BUF_XOR_LE32(dst, src, 36, x9);
+ BUF_XOR_LE32(dst, src, 40, x10);
+ BUF_XOR_LE32(dst, src, 44, x11);
+ BUF_XOR_LE32(dst, src, 48, x12);
+ BUF_XOR_LE32(dst, src, 52, x13);
+ BUF_XOR_LE32(dst, src, 56, x14);
+ BUF_XOR_LE32(dst, src, 60, x15);
+
+ src += CHACHA20_BLOCK_SIZE;
+ dst += CHACHA20_BLOCK_SIZE;
+ nblks--;
+ }
+
+ /* burn_stack */
+ return (17 * sizeof(u32) + 6 * sizeof(void *));
+}
+
+
+static unsigned int
+chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src,
+ size_t nblks)
+{
+#ifdef USE_SSSE3
+ if (ctx->use_ssse3)
+ {
+ return _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, dst, src, nblks);
+ }
+#endif
+
+#ifdef USE_PPC_VEC
+ if (ctx->use_ppc)
+ {
+ return _gcry_chacha20_ppc8_blocks1(ctx->input, dst, src, nblks);
+ }
+#endif
+
+#ifdef USE_S390X_VX
+ if (ctx->use_s390x)
+ {
+ return _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, dst, src, nblks);
+ }
+#endif
+
+ return do_chacha20_blocks (ctx->input, dst, src, nblks);
+}
+
+
+static void
+chacha20_keysetup (CHACHA20_context_t *ctx, const byte *key,
+ unsigned int keylen)
+{
+ static const char sigma[16] = "expand 32-byte k";
+ static const char tau[16] = "expand 16-byte k";
+ const char *constants;
+
+ ctx->input[4] = buf_get_le32(key + 0);
+ ctx->input[5] = buf_get_le32(key + 4);
+ ctx->input[6] = buf_get_le32(key + 8);
+ ctx->input[7] = buf_get_le32(key + 12);
+ if (keylen == CHACHA20_MAX_KEY_SIZE) /* 256 bits */
+ {
+ key += 16;
+ constants = sigma;
+ }
+ else /* 128 bits */
+ {
+ constants = tau;
+ }
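+  /* With a 128-bit key the same 16 key bytes are loaded again below, as the
+     key pointer is only advanced for 256-bit keys. */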
+ ctx->input[8] = buf_get_le32(key + 0);
+ ctx->input[9] = buf_get_le32(key + 4);
+ ctx->input[10] = buf_get_le32(key + 8);
+ ctx->input[11] = buf_get_le32(key + 12);
+ ctx->input[0] = buf_get_le32(constants + 0);
+ ctx->input[1] = buf_get_le32(constants + 4);
+ ctx->input[2] = buf_get_le32(constants + 8);
+ ctx->input[3] = buf_get_le32(constants + 12);
+}
+
+
+static void
+chacha20_ivsetup (CHACHA20_context_t * ctx, const byte *iv, size_t ivlen)
+{
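+  /* Accepted layouts: 16 bytes set the full counter block (64-bit counter
+     plus 64-bit nonce), 12 bytes set a 96-bit IETF nonce, 8 bytes set the
+     original 64-bit nonce; anything else clears counter and nonce. */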
+ if (ivlen == CHACHA20_CTR_SIZE)
+ {
+ ctx->input[12] = buf_get_le32 (iv + 0);
+ ctx->input[13] = buf_get_le32 (iv + 4);
+ ctx->input[14] = buf_get_le32 (iv + 8);
+ ctx->input[15] = buf_get_le32 (iv + 12);
+ }
+ else if (ivlen == CHACHA20_MAX_IV_SIZE)
+ {
+ ctx->input[12] = 0;
+ ctx->input[13] = buf_get_le32 (iv + 0);
+ ctx->input[14] = buf_get_le32 (iv + 4);
+ ctx->input[15] = buf_get_le32 (iv + 8);
+ }
+ else if (ivlen == CHACHA20_MIN_IV_SIZE)
+ {
+ ctx->input[12] = 0;
+ ctx->input[13] = 0;
+ ctx->input[14] = buf_get_le32 (iv + 0);
+ ctx->input[15] = buf_get_le32 (iv + 4);
+ }
+ else
+ {
+ ctx->input[12] = 0;
+ ctx->input[13] = 0;
+ ctx->input[14] = 0;
+ ctx->input[15] = 0;
+ }
+}
+
+
+static void
+chacha20_setiv (void *context, const byte *iv, size_t ivlen)
+{
+ CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
+
+  /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonces. */
+ if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE
+ && ivlen != CHACHA20_CTR_SIZE)
+ log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen);
+
+ if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE
+ || ivlen == CHACHA20_CTR_SIZE))
+ chacha20_ivsetup (ctx, iv, ivlen);
+ else
+ chacha20_ivsetup (ctx, NULL, 0);
+
+ /* Reset the unused pad bytes counter. */
+ ctx->unused = 0;
+}
+
+
+static gcry_err_code_t
+chacha20_do_setkey (CHACHA20_context_t *ctx,
+ const byte *key, unsigned int keylen)
+{
+ static int initialized;
+ static const char *selftest_failed;
+ unsigned int features = _gcry_get_hw_features ();
+
+ if (!initialized)
+ {
+ initialized = 1;
+ selftest_failed = selftest ();
+ if (selftest_failed)
+ log_error ("CHACHA20 selftest failed (%s)\n", selftest_failed);
+ }
+ if (selftest_failed)
+ return GPG_ERR_SELFTEST_FAILED;
+
+ if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE)
+ return GPG_ERR_INV_KEYLEN;
+
+#ifdef USE_SSSE3
+ ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0;
+#endif
+#ifdef USE_AVX2
+ ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0;
+#endif
+#ifdef USE_ARMV7_NEON
+ ctx->use_neon = (features & HWF_ARM_NEON) != 0;
+#endif
+#ifdef USE_AARCH64_SIMD
+ ctx->use_neon = (features & HWF_ARM_NEON) != 0;
+#endif
+#ifdef USE_PPC_VEC
+ ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0;
+#endif
+#ifdef USE_S390X_VX
+ ctx->use_s390x = (features & HWF_S390X_VX) != 0;
+#endif
+
+ (void)features;
+
+ chacha20_keysetup (ctx, key, keylen);
+
+ /* We default to a zero nonce. */
+ chacha20_setiv (ctx, NULL, 0);
+
+ return 0;
+}
+
+
+static gcry_err_code_t
+chacha20_setkey (void *context, const byte *key, unsigned int keylen,
+ cipher_bulk_ops_t *bulk_ops)
+{
+ CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
+ gcry_err_code_t rc = chacha20_do_setkey (ctx, key, keylen);
+ (void)bulk_ops;
+ _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *));
+ return rc;
+}
+
+
+static unsigned int
+do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf,
+ const byte *inbuf, size_t length)
+{
+ static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, };
+ unsigned int nburn, burn = 0;
+
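+  /* Process as much as possible with the widest available implementation,
+     then hand the remainder to narrower ones and finally to the generic
+     block function. */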
+#ifdef USE_AVX2
+ if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 8;
+ nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf,
+ nblocks);
+ burn = nburn > burn ? nburn : burn;
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
+#ifdef USE_SSSE3
+ if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 4;
+ nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf,
+ nblocks);
+ burn = nburn > burn ? nburn : burn;
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
+#ifdef USE_ARMV7_NEON
+ if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 4;
+ nburn = _gcry_chacha20_armv7_neon_blocks4(ctx->input, outbuf, inbuf,
+ nblocks);
+ burn = nburn > burn ? nburn : burn;
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
+#ifdef USE_AARCH64_SIMD
+ if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 4;
+ nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf,
+ nblocks);
+ burn = nburn > burn ? nburn : burn;
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
+#ifdef USE_PPC_VEC
+ if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 4;
+ nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, nblocks);
+ burn = nburn > burn ? nburn : burn;
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
+#ifdef USE_S390X_VX
+ if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 8)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 8;
+ nburn = _gcry_chacha20_s390x_vx_blocks8(ctx->input, outbuf, inbuf,
+ nblocks);
+ burn = nburn > burn ? nburn : burn;
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
+ if (length >= CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nburn = chacha20_blocks(ctx, outbuf, inbuf, nblocks);
+ burn = nburn > burn ? nburn : burn;
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+
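+  /* Partial final block: generate one block of keystream into ctx->pad and
+     remember how many unused keystream bytes remain for the next call. */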
+ if (length > 0)
+ {
+ nburn = chacha20_blocks(ctx, ctx->pad, zero_pad, 1);
+ burn = nburn > burn ? nburn : burn;
+
+ buf_xor (outbuf, inbuf, ctx->pad, length);
+ ctx->unused = CHACHA20_BLOCK_SIZE - length;
+ }
+
+ if (burn)
+ burn += 5 * sizeof(void *);
+
+ return burn;
+}
+
+
+static void
+chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf,
+ size_t length)
+{
+ CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
+ unsigned int nburn, burn = 0;
+
+ if (!length)
+ return;
+
+ if (ctx->unused)
+ {
+ unsigned char *p = ctx->pad;
+ size_t n;
+
+ gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
+
+ n = ctx->unused;
+ if (n > length)
+ n = length;
+
+ buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
+ length -= n;
+ outbuf += n;
+ inbuf += n;
+ ctx->unused -= n;
+
+ if (!length)
+ return;
+ gcry_assert (!ctx->unused);
+ }
+
+ nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, length);
+ burn = nburn > burn ? nburn : burn;
+
+ if (burn)
+ _gcry_burn_stack (burn);
+}
+
+
+gcry_err_code_t
+_gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
+ const byte *inbuf, size_t length)
+{
+ CHACHA20_context_t *ctx = (void *) &c->context.c;
+ unsigned int nburn, burn = 0;
+ byte *authptr = NULL;
+
+ if (!length)
+ return 0;
+
+ if (ctx->unused)
+ {
+ unsigned char *p = ctx->pad;
+ size_t n;
+
+ gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
+
+ n = ctx->unused;
+ if (n > length)
+ n = length;
+
+ buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
+ nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf, n);
+ burn = nburn > burn ? nburn : burn;
+ length -= n;
+ outbuf += n;
+ inbuf += n;
+ ctx->unused -= n;
+
+ if (!length)
+ {
+ if (burn)
+ _gcry_burn_stack (burn);
+
+ return 0;
+ }
+ gcry_assert (!ctx->unused);
+ }
+
+ gcry_assert (c->u_mode.poly1305.ctx.leftover == 0);
+
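+  /* Encrypt the first chunk with a plain ChaCha20 implementation and keep
+     authptr pointing at it; the stitched ChaCha20-Poly1305 functions below
+     can then authenticate the previously produced ciphertext while they
+     encrypt the next chunk. */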
+ if (0)
+ { }
+#ifdef USE_AVX2
+ else if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
+ {
+ nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf, 8);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 8 * CHACHA20_BLOCK_SIZE;
+ outbuf += 8 * CHACHA20_BLOCK_SIZE;
+ inbuf += 8 * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+#ifdef USE_SSSE3
+ else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4)
+ {
+ nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf, 4);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 4 * CHACHA20_BLOCK_SIZE;
+ outbuf += 4 * CHACHA20_BLOCK_SIZE;
+ inbuf += 4 * CHACHA20_BLOCK_SIZE;
+ }
+ else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 2)
+ {
+ nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 2);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 2 * CHACHA20_BLOCK_SIZE;
+ outbuf += 2 * CHACHA20_BLOCK_SIZE;
+ inbuf += 2 * CHACHA20_BLOCK_SIZE;
+ }
+ else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE)
+ {
+ nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 1);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 1 * CHACHA20_BLOCK_SIZE;
+ outbuf += 1 * CHACHA20_BLOCK_SIZE;
+ inbuf += 1 * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+#ifdef USE_AARCH64_SIMD
+ else if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
+ {
+ nburn = _gcry_chacha20_aarch64_blocks4(ctx->input, outbuf, inbuf, 4);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 4 * CHACHA20_BLOCK_SIZE;
+ outbuf += 4 * CHACHA20_BLOCK_SIZE;
+ inbuf += 4 * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+#ifdef USE_PPC_VEC_POLY1305
+ else if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4)
+ {
+ nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 4 * CHACHA20_BLOCK_SIZE;
+ outbuf += 4 * CHACHA20_BLOCK_SIZE;
+ inbuf += 4 * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+#ifdef USE_S390X_VX_POLY1305
+ else if (ctx->use_s390x && length >= 2 * CHACHA20_BLOCK_SIZE * 8)
+ {
+ nburn = _gcry_chacha20_s390x_vx_blocks8(ctx->input, outbuf, inbuf, 8);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 8 * CHACHA20_BLOCK_SIZE;
+ outbuf += 8 * CHACHA20_BLOCK_SIZE;
+ inbuf += 8 * CHACHA20_BLOCK_SIZE;
+ }
+ else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 4)
+ {
+ nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 4);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 4 * CHACHA20_BLOCK_SIZE;
+ outbuf += 4 * CHACHA20_BLOCK_SIZE;
+ inbuf += 4 * CHACHA20_BLOCK_SIZE;
+ }
+ else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 2)
+ {
+ nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 2);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 2 * CHACHA20_BLOCK_SIZE;
+ outbuf += 2 * CHACHA20_BLOCK_SIZE;
+ inbuf += 2 * CHACHA20_BLOCK_SIZE;
+ }
+ else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE)
+ {
+ nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 1);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 1 * CHACHA20_BLOCK_SIZE;
+ outbuf += 1 * CHACHA20_BLOCK_SIZE;
+ inbuf += 1 * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
+ if (authptr)
+ {
+ size_t authoffset = outbuf - authptr;
+
+#ifdef USE_AVX2
+ if (ctx->use_avx2 &&
+ length >= 8 * CHACHA20_BLOCK_SIZE &&
+ authoffset >= 8 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 8;
+
+ nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, authptr);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ authptr += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
+#ifdef USE_SSSE3
+ if (ctx->use_ssse3)
+ {
+ if (length >= 4 * CHACHA20_BLOCK_SIZE &&
+ authoffset >= 4 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 4;
+
+ nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, authptr);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ authptr += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+
+ if (length >= CHACHA20_BLOCK_SIZE &&
+ authoffset >= CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+
+ nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, authptr);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ authptr += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+ }
+#endif
+
+#ifdef USE_AARCH64_SIMD
+ if (ctx->use_neon &&
+ length >= 4 * CHACHA20_BLOCK_SIZE &&
+ authoffset >= 4 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 4;
+
+ nburn = _gcry_chacha20_poly1305_aarch64_blocks4(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, authptr);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ authptr += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
+#ifdef USE_PPC_VEC_POLY1305
+ if (ctx->use_ppc &&
+ length >= 4 * CHACHA20_BLOCK_SIZE &&
+ authoffset >= 4 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 4;
+
+ nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, authptr);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ authptr += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
+#ifdef USE_S390X_VX_POLY1305
+ if (ctx->use_s390x)
+ {
+ if (length >= 8 * CHACHA20_BLOCK_SIZE &&
+ authoffset >= 8 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 8;
+
+          nburn = _gcry_chacha20_poly1305_s390x_vx_blocks8(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, authptr);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ authptr += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+
+ if (length >= CHACHA20_BLOCK_SIZE &&
+ authoffset >= CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+
+          nburn = _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, authptr);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ authptr += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+ }
+#endif
+
+ if (authoffset > 0)
+ {
+ _gcry_poly1305_update (&c->u_mode.poly1305.ctx, authptr, authoffset);
+ authptr += authoffset;
+ authoffset = 0;
+ }
+
+ gcry_assert(authptr == outbuf);
+ }
+
+ while (length)
+ {
+ size_t currlen = length;
+
+ /* Since checksumming is done after encryption, process input in 24KiB
+ * chunks to keep data loaded in L1 cache for checksumming. */
+ if (currlen > 24 * 1024)
+ currlen = 24 * 1024;
+
+ nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen);
+ burn = nburn > burn ? nburn : burn;
+
+ nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf,
+ currlen);
+ burn = nburn > burn ? nburn : burn;
+
+ outbuf += currlen;
+ inbuf += currlen;
+ length -= currlen;
+ }
+
+ if (burn)
+ _gcry_burn_stack (burn);
+
+ return 0;
+}
+
+
+gcry_err_code_t
+_gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
+ const byte *inbuf, size_t length)
+{
+ CHACHA20_context_t *ctx = (void *) &c->context.c;
+ unsigned int nburn, burn = 0;
+
+ if (!length)
+ return 0;
+
+ if (ctx->unused)
+ {
+ unsigned char *p = ctx->pad;
+ size_t n;
+
+ gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
+
+ n = ctx->unused;
+ if (n > length)
+ n = length;
+
+ nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf, n);
+ burn = nburn > burn ? nburn : burn;
+ buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
+ length -= n;
+ outbuf += n;
+ inbuf += n;
+ ctx->unused -= n;
+
+ if (!length)
+ {
+ if (burn)
+ _gcry_burn_stack (burn);
+
+ return 0;
+ }
+ gcry_assert (!ctx->unused);
+ }
+
+ gcry_assert (c->u_mode.poly1305.ctx.leftover == 0);
+
+#ifdef USE_AVX2
+ if (ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 8;
+
+ nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, inbuf);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
+#ifdef USE_SSSE3
+ if (ctx->use_ssse3)
+ {
+ if (length >= 4 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 4;
+
+ nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, inbuf);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+
+ if (length >= CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+
+ nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, inbuf);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+ }
+#endif
+
+#ifdef USE_AARCH64_SIMD
+ if (ctx->use_neon && length >= 4 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 4;
+
+ nburn = _gcry_chacha20_poly1305_aarch64_blocks4(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, inbuf);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
+#ifdef USE_PPC_VEC_POLY1305
+ if (ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 4;
+
+ nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, inbuf);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
+#ifdef USE_S390X_VX_POLY1305
+ if (ctx->use_s390x)
+ {
+ if (length >= 8 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 8;
+
+ nburn = _gcry_chacha20_poly1305_s390x_vx_blocks8(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, inbuf);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+
+ if (length >= CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+
+ nburn = _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, inbuf);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+ }
+#endif
+
+ while (length)
+ {
+ size_t currlen = length;
+
+ /* Since checksumming is done before decryption, process input in 24KiB
+ * chunks to keep data loaded in L1 cache for decryption. */
+ if (currlen > 24 * 1024)
+ currlen = 24 * 1024;
+
+ nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf,
+ currlen);
+ burn = nburn > burn ? nburn : burn;
+
+ nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen);
+ burn = nburn > burn ? nburn : burn;
+
+ outbuf += currlen;
+ inbuf += currlen;
+ length -= currlen;
+ }
+
+ if (burn)
+ _gcry_burn_stack (burn);
+
+ return 0;
+}
+
+
+static const char *
+selftest (void)
+{
+ byte ctxbuf[sizeof(CHACHA20_context_t) + 15];
+ CHACHA20_context_t *ctx;
+ byte scratch[127 + 1];
+ byte buf[512 + 64 + 4];
+ int i;
+
+ /* From draft-strombergson-chacha-test-vectors */
+ static byte key_1[] = {
+ 0xc4, 0x6e, 0xc1, 0xb1, 0x8c, 0xe8, 0xa8, 0x78,
+ 0x72, 0x5a, 0x37, 0xe7, 0x80, 0xdf, 0xb7, 0x35,
+ 0x1f, 0x68, 0xed, 0x2e, 0x19, 0x4c, 0x79, 0xfb,
+ 0xc6, 0xae, 0xbe, 0xe1, 0xa6, 0x67, 0x97, 0x5d
+ };
+ static const byte nonce_1[] =
+ { 0x1a, 0xda, 0x31, 0xd5, 0xcf, 0x68, 0x82, 0x21 };
+ static const byte plaintext_1[127] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ };
+ static const byte ciphertext_1[127] = {
+ 0xf6, 0x3a, 0x89, 0xb7, 0x5c, 0x22, 0x71, 0xf9,
+ 0x36, 0x88, 0x16, 0x54, 0x2b, 0xa5, 0x2f, 0x06,
+ 0xed, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2b, 0x00,
+ 0xb5, 0xe8, 0xf8, 0x0a, 0xe9, 0xa4, 0x73, 0xaf,
+ 0xc2, 0x5b, 0x21, 0x8f, 0x51, 0x9a, 0xf0, 0xfd,
+ 0xd4, 0x06, 0x36, 0x2e, 0x8d, 0x69, 0xde, 0x7f,
+ 0x54, 0xc6, 0x04, 0xa6, 0xe0, 0x0f, 0x35, 0x3f,
+ 0x11, 0x0f, 0x77, 0x1b, 0xdc, 0xa8, 0xab, 0x92,
+ 0xe5, 0xfb, 0xc3, 0x4e, 0x60, 0xa1, 0xd9, 0xa9,
+ 0xdb, 0x17, 0x34, 0x5b, 0x0a, 0x40, 0x27, 0x36,
+ 0x85, 0x3b, 0xf9, 0x10, 0xb0, 0x60, 0xbd, 0xf1,
+ 0xf8, 0x97, 0xb6, 0x29, 0x0f, 0x01, 0xd1, 0x38,
+ 0xae, 0x2c, 0x4c, 0x90, 0x22, 0x5b, 0xa9, 0xea,
+ 0x14, 0xd5, 0x18, 0xf5, 0x59, 0x29, 0xde, 0xa0,
+ 0x98, 0xca, 0x7a, 0x6c, 0xcf, 0xe6, 0x12, 0x27,
+ 0x05, 0x3c, 0x84, 0xe4, 0x9a, 0x4a, 0x33
+ };
+
+ /* 16-byte alignment required for amd64 implementation. */
+ ctx = (CHACHA20_context_t *)((uintptr_t)(ctxbuf + 15) & ~(uintptr_t)15);
+
+ chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
+ chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
+ scratch[sizeof (scratch) - 1] = 0;
+ chacha20_encrypt_stream (ctx, scratch, plaintext_1, sizeof plaintext_1);
+ if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1))
+ return "ChaCha20 encryption test 1 failed.";
+ if (scratch[sizeof (scratch) - 1])
+ return "ChaCha20 wrote too much.";
+ chacha20_setkey (ctx, key_1, sizeof (key_1), NULL);
+ chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
+ chacha20_encrypt_stream (ctx, scratch, scratch, sizeof plaintext_1);
+ if (memcmp (scratch, plaintext_1, sizeof plaintext_1))
+ return "ChaCha20 decryption test 1 failed.";
+
+ for (i = 0; i < sizeof buf; i++)
+ buf[i] = i;
+ chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
+ chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
+  /* encrypt */
+ chacha20_encrypt_stream (ctx, buf, buf, sizeof buf);
+  /* decrypt */
+ chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
+ chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
+ chacha20_encrypt_stream (ctx, buf, buf, 1);
+ chacha20_encrypt_stream (ctx, buf + 1, buf + 1, (sizeof buf) - 1 - 1);
+ chacha20_encrypt_stream (ctx, buf + (sizeof buf) - 1,
+ buf + (sizeof buf) - 1, 1);
+ for (i = 0; i < sizeof buf; i++)
+ if (buf[i] != (byte) i)
+ return "ChaCha20 encryption test 2 failed.";
+
+ chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
+ chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
+ /* encrypt */
+ for (i = 0; i < sizeof buf; i++)
+ chacha20_encrypt_stream (ctx, &buf[i], &buf[i], 1);
+ /* decrypt */
+ chacha20_setkey (ctx, key_1, sizeof key_1, NULL);
+ chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
+ chacha20_encrypt_stream (ctx, buf, buf, sizeof buf);
+ for (i = 0; i < sizeof buf; i++)
+ if (buf[i] != (byte) i)
+ return "ChaCha20 encryption test 3 failed.";
+
+ return NULL;
+}
+
+
+gcry_cipher_spec_t _gcry_cipher_spec_chacha20 = {
+ GCRY_CIPHER_CHACHA20,
+ {0, 0}, /* flags */
+ "CHACHA20", /* name */
+ NULL, /* aliases */
+ NULL, /* oids */
+ 1, /* blocksize in bytes. */
+ CHACHA20_MAX_KEY_SIZE * 8, /* standard key length in bits. */
+ sizeof (CHACHA20_context_t),
+ chacha20_setkey,
+ NULL,
+ NULL,
+ chacha20_encrypt_stream,
+ chacha20_encrypt_stream,
+ NULL,
+ NULL,
+ chacha20_setiv
+};
diff --git a/comm/third_party/libgcrypt/cipher/cipher-aeswrap.c b/comm/third_party/libgcrypt/cipher/cipher-aeswrap.c
new file mode 100644
index 0000000000..c182657e1f
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-aeswrap.c
@@ -0,0 +1,209 @@
+/* cipher-aeswrap.c - Generic AESWRAP mode implementation
+ * Copyright (C) 2009, 2011 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "./cipher-internal.h"
+
+
+/* Perform the AES-Wrap algorithm as specified by RFC 3394.  We
+   implement this as a mode usable with any cipher algorithm with a
+   128-bit block size. */
+gcry_err_code_t
+_gcry_cipher_aeswrap_encrypt (gcry_cipher_hd_t c,
+ byte *outbuf, size_t outbuflen,
+ const byte *inbuf, size_t inbuflen )
+{
+ int j, x;
+ size_t n, i;
+ unsigned char *r, *a, *b;
+ unsigned char t[8];
+ unsigned int burn, nburn;
+
+#if MAX_BLOCKSIZE < 8
+#error Invalid block size
+#endif
+ /* We require a cipher with a 128 bit block length. */
+ if (c->spec->blocksize != 16)
+ return GPG_ERR_INV_LENGTH;
+
+ /* The output buffer must be able to hold the input data plus one
+ additional block. */
+ if (outbuflen < inbuflen + 8)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+ /* Input data must be multiple of 64 bits. */
+ if (inbuflen % 8)
+ return GPG_ERR_INV_ARG;
+
+ n = inbuflen / 8;
+
+ /* We need at least two 64 bit blocks. */
+ if (n < 2)
+ return GPG_ERR_INV_ARG;
+
+ burn = 0;
+
+ r = outbuf;
+ a = outbuf; /* We store A directly in OUTBUF. */
+ b = c->u_ctr.ctr; /* B is also used to concatenate stuff. */
+
+ /* Copy the inbuf to the outbuf. */
+ memmove (r+8, inbuf, inbuflen);
+
+ /* If an IV has been set we use that IV as the Alternative Initial
+ Value; if it has not been set we use the standard value. */
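+  /* (The standard value is the RFC 3394 default IV, eight 0xA6 bytes.) */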
+ if (c->marks.iv)
+ memcpy (a, c->u_iv.iv, 8);
+ else
+ memset (a, 0xa6, 8);
+
+ memset (t, 0, sizeof t); /* t := 0. */
+
+ for (j = 0; j <= 5; j++)
+ {
+ for (i = 1; i <= n; i++)
+ {
+ /* B := AES_k( A | R[i] ) */
+ memcpy (b, a, 8);
+ memcpy (b+8, r+i*8, 8);
+ nburn = c->spec->encrypt (&c->context.c, b, b);
+ burn = nburn > burn ? nburn : burn;
+ /* t := t + 1 */
+ for (x = 7; x >= 0; x--)
+ {
+ t[x]++;
+ if (t[x])
+ break;
+ }
+ /* A := MSB_64(B) ^ t */
+ cipher_block_xor(a, b, t, 8);
+ /* R[i] := LSB_64(B) */
+ memcpy (r+i*8, b+8, 8);
+ }
+ }
+
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4 * sizeof(void *));
+
+ return 0;
+}
+
+/* Perform the AES-Unwrap algorithm as specified by RFC 3394.  We
+   implement this as a mode usable with any cipher algorithm with a
+   128-bit block size. */
+gcry_err_code_t
+_gcry_cipher_aeswrap_decrypt (gcry_cipher_hd_t c,
+ byte *outbuf, size_t outbuflen,
+ const byte *inbuf, size_t inbuflen)
+{
+ int j, x;
+ size_t n, i;
+ unsigned char *r, *a, *b;
+ unsigned char t[8];
+ unsigned int burn, nburn;
+
+#if MAX_BLOCKSIZE < 8
+#error Invalid block size
+#endif
+ /* We require a cipher with a 128 bit block length. */
+ if (c->spec->blocksize != 16)
+ return GPG_ERR_INV_LENGTH;
+
+ /* The output buffer must be able to hold the input data minus one
+ additional block. Fixme: The caller has more restrictive checks
+ - we may want to fix them for this mode. */
+ if (outbuflen + 8 < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+ /* Input data must be multiple of 64 bits. */
+ if (inbuflen % 8)
+ return GPG_ERR_INV_ARG;
+
+ n = inbuflen / 8;
+
+ /* We need at least three 64 bit blocks. */
+ if (n < 3)
+ return GPG_ERR_INV_ARG;
+
+ burn = 0;
+
+ r = outbuf;
+ a = c->lastiv; /* We use c->LASTIV as buffer for A. */
+ b = c->u_ctr.ctr; /* B is also used to concatenate stuff. */
+
+ /* Copy the inbuf to the outbuf and save A. */
+ memcpy (a, inbuf, 8);
+ memmove (r, inbuf+8, inbuflen-8);
+ n--; /* Reduce to actual number of data blocks. */
+
+ /* t := 6 * n */
+ i = n * 6; /* The range is valid because: n = inbuflen / 8 - 1. */
+ for (x=0; x < 8 && x < sizeof (i); x++)
+ t[7-x] = i >> (8*x);
+ for (; x < 8; x++)
+ t[7-x] = 0;
+
+ for (j = 5; j >= 0; j--)
+ {
+ for (i = n; i >= 1; i--)
+ {
+          /* B := AES_k^-1( (A ^ t) | R[i] ) */
+ cipher_block_xor(b, a, t, 8);
+ memcpy (b+8, r+(i-1)*8, 8);
+ nburn = c->spec->decrypt (&c->context.c, b, b);
+ burn = nburn > burn ? nburn : burn;
+ /* t := t - 1 */
+ for (x = 7; x >= 0; x--)
+ {
+ t[x]--;
+ if (t[x] != 0xff)
+ break;
+ }
+ /* A := MSB_64(B) */
+ memcpy (a, b, 8);
+ /* R[i] := LSB_64(B) */
+ memcpy (r+(i-1)*8, b+8, 8);
+ }
+ }
+
+ /* If an IV has been set we compare against this Alternative Initial
+ Value; if it has not been set we compare against the standard IV. */
+ if (c->marks.iv)
+ j = memcmp (a, c->u_iv.iv, 8);
+ else
+ {
+ for (j=0, x=0; x < 8; x++)
+ if (a[x] != 0xa6)
+ {
+ j=1;
+ break;
+ }
+ }
+
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4 * sizeof(void *));
+
+ return j? GPG_ERR_CHECKSUM : 0;
+}
diff --git a/comm/third_party/libgcrypt/cipher/cipher-cbc.c b/comm/third_party/libgcrypt/cipher/cipher-cbc.c
new file mode 100644
index 0000000000..d4df1e72aa
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-cbc.c
@@ -0,0 +1,292 @@
+/* cipher-cbc.c - Generic CBC mode implementation
+ * Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003
+ * 2005, 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "./cipher-internal.h"
+#include "bufhelp.h"
+
+
+
+static inline unsigned int
+cbc_encrypt_inner(gcry_cipher_hd_t c, unsigned char *outbuf,
+ const unsigned char *inbuf, size_t nblocks, size_t blocksize,
+ int is_cbc_cmac)
+{
+
+ unsigned int burn, nburn;
+ size_t n;
+
+ burn = 0;
+
+ if (c->bulk.cbc_enc)
+ {
+ c->bulk.cbc_enc (&c->context.c, c->u_iv.iv, outbuf, inbuf, nblocks,
+ is_cbc_cmac);
+ }
+ else
+ {
+ gcry_cipher_encrypt_t enc_fn = c->spec->encrypt;
+ unsigned char *ivp;
+
+ ivp = c->u_iv.iv;
+
+ for (n=0; n < nblocks; n++ )
+ {
+ cipher_block_xor (outbuf, inbuf, ivp, blocksize);
+ nburn = enc_fn ( &c->context.c, outbuf, outbuf );
+ burn = nburn > burn ? nburn : burn;
+ ivp = outbuf;
+ inbuf += blocksize;
+ if (!is_cbc_cmac)
+ outbuf += blocksize;
+ }
+
+ if (ivp != c->u_iv.iv)
+ cipher_block_cpy (c->u_iv.iv, ivp, blocksize);
+ }
+
+ return burn;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_cbc_encrypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen)
+{
+ size_t blocksize_shift = _gcry_blocksize_shift(c);
+ size_t blocksize = 1 << blocksize_shift;
+ size_t blocksize_mask = blocksize - 1;
+ size_t nblocks = inbuflen >> blocksize_shift;
+ int is_cbc_cmac = !!(c->flags & GCRY_CIPHER_CBC_MAC);
+ unsigned int burn;
+
+ if (outbuflen < (is_cbc_cmac ? blocksize : inbuflen))
+ return GPG_ERR_BUFFER_TOO_SHORT;
+
+ if (inbuflen & blocksize_mask)
+ return GPG_ERR_INV_LENGTH;
+
+ burn = cbc_encrypt_inner(c, outbuf, inbuf, nblocks, blocksize, is_cbc_cmac);
+
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4 * sizeof(void *));
+
+ return 0;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_cbc_cts_encrypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen)
+{
+ size_t blocksize_shift = _gcry_blocksize_shift(c);
+ size_t blocksize = 1 << blocksize_shift;
+ size_t blocksize_mask = blocksize - 1;
+ gcry_cipher_encrypt_t enc_fn = c->spec->encrypt;
+ size_t nblocks = inbuflen >> blocksize_shift;
+ unsigned int burn, nburn;
+ unsigned char *ivp;
+ int i;
+
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+
+ if ((inbuflen & blocksize_mask) && !(inbuflen > blocksize))
+ return GPG_ERR_INV_LENGTH;
+
+ burn = 0;
+
+ if (inbuflen > blocksize)
+ {
+ if ((inbuflen & blocksize_mask) == 0)
+ nblocks--;
+ }
+
+ burn = cbc_encrypt_inner(c, outbuf, inbuf, nblocks, blocksize, 0);
+ inbuf += nblocks << blocksize_shift;
+ outbuf += nblocks << blocksize_shift;
+
+ if (inbuflen > blocksize)
+ {
+ /* We have to be careful here, since outbuf might be equal to
+ inbuf. */
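+      /* Ciphertext stealing: the short final plaintext block is zero-padded,
+         XORed into the last full ciphertext block (the current IV) and
+         re-encrypted in place; the leading bytes of that last block are
+         moved to the end to become the final, short ciphertext block. */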
+ size_t restbytes;
+ unsigned char b;
+
+ if ((inbuflen & blocksize_mask) == 0)
+ restbytes = blocksize;
+ else
+ restbytes = inbuflen & blocksize_mask;
+
+ outbuf -= blocksize;
+ for (ivp = c->u_iv.iv, i = 0; i < restbytes; i++)
+ {
+ b = inbuf[i];
+ outbuf[blocksize + i] = outbuf[i];
+ outbuf[i] = b ^ *ivp++;
+ }
+ for (; i < blocksize; i++)
+ outbuf[i] = 0 ^ *ivp++;
+
+ nburn = enc_fn (&c->context.c, outbuf, outbuf);
+ burn = nburn > burn ? nburn : burn;
+ cipher_block_cpy (c->u_iv.iv, outbuf, blocksize);
+ }
+
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4 * sizeof(void *));
+
+ return 0;
+}
+
+
+static inline unsigned int
+cbc_decrypt_inner(gcry_cipher_hd_t c, unsigned char *outbuf,
+ const unsigned char *inbuf, size_t nblocks, size_t blocksize)
+{
+ unsigned int burn, nburn;
+ size_t n;
+
+ burn = 0;
+
+ if (c->bulk.cbc_dec)
+ {
+ c->bulk.cbc_dec (&c->context.c, c->u_iv.iv, outbuf, inbuf, nblocks);
+ }
+ else
+ {
+ gcry_cipher_decrypt_t dec_fn = c->spec->decrypt;
+
+ for (n = 0; n < nblocks; n++)
+ {
+ /* Because outbuf and inbuf might be the same, we must not overwrite
+ the original ciphertext block. We use LASTIV as intermediate
+ storage here because it is not used otherwise. */
+ nburn = dec_fn ( &c->context.c, c->lastiv, inbuf );
+ burn = nburn > burn ? nburn : burn;
+ cipher_block_xor_n_copy_2 (outbuf, c->lastiv, c->u_iv.iv, inbuf,
+ blocksize);
+ inbuf += blocksize;
+ outbuf += blocksize;
+ }
+ }
+
+ return burn;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_cbc_decrypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen)
+{
+ size_t blocksize_shift = _gcry_blocksize_shift(c);
+ size_t blocksize = 1 << blocksize_shift;
+ size_t blocksize_mask = blocksize - 1;
+ size_t nblocks = inbuflen >> blocksize_shift;
+ unsigned int burn;
+
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+
+ if (inbuflen & blocksize_mask)
+ return GPG_ERR_INV_LENGTH;
+
+ burn = cbc_decrypt_inner(c, outbuf, inbuf, nblocks, blocksize);
+
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4 * sizeof(void *));
+
+ return 0;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_cbc_cts_decrypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen)
+{
+ size_t blocksize_shift = _gcry_blocksize_shift(c);
+ size_t blocksize = 1 << blocksize_shift;
+ size_t blocksize_mask = blocksize - 1;
+ gcry_cipher_decrypt_t dec_fn = c->spec->decrypt;
+ size_t nblocks = inbuflen >> blocksize_shift;
+ unsigned int burn, nburn;
+ int i;
+
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+
+ if ((inbuflen & blocksize_mask) && !(inbuflen > blocksize))
+ return GPG_ERR_INV_LENGTH;
+
+ burn = 0;
+
+ if (inbuflen > blocksize)
+ {
+ nblocks--;
+ if ((inbuflen & blocksize_mask) == 0)
+ nblocks--;
+ cipher_block_cpy (c->lastiv, c->u_iv.iv, blocksize);
+ }
+
+ burn = cbc_decrypt_inner(c, outbuf, inbuf, nblocks, blocksize);
+ inbuf += nblocks << blocksize_shift;
+ outbuf += nblocks << blocksize_shift;
+
+ if (inbuflen > blocksize)
+ {
+ size_t restbytes;
+
+ if ((inbuflen & blocksize_mask) == 0)
+ restbytes = blocksize;
+ else
+ restbytes = inbuflen & blocksize_mask;
+
+ cipher_block_cpy (c->lastiv, c->u_iv.iv, blocksize ); /* Save Cn-2. */
+ buf_cpy (c->u_iv.iv, inbuf + blocksize, restbytes ); /* Save Cn. */
+
+ nburn = dec_fn ( &c->context.c, outbuf, inbuf );
+ burn = nburn > burn ? nburn : burn;
+ buf_xor(outbuf, outbuf, c->u_iv.iv, restbytes);
+
+ buf_cpy (outbuf + blocksize, outbuf, restbytes);
+ for(i=restbytes; i < blocksize; i++)
+ c->u_iv.iv[i] = outbuf[i];
+ nburn = dec_fn (&c->context.c, outbuf, c->u_iv.iv);
+ burn = nburn > burn ? nburn : burn;
+ cipher_block_xor(outbuf, outbuf, c->lastiv, blocksize);
+ /* c->lastiv is now really lastlastiv, does this matter? */
+ }
+
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4 * sizeof(void *));
+
+ return 0;
+}
diff --git a/comm/third_party/libgcrypt/cipher/cipher-ccm.c b/comm/third_party/libgcrypt/cipher/cipher-ccm.c
new file mode 100644
index 0000000000..dcb268d084
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-ccm.c
@@ -0,0 +1,415 @@
+/* cipher-ccm.c - CTR mode with CBC-MAC mode implementation
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "./cipher-internal.h"
+
+
+#define set_burn(burn, nburn) do { \
+ unsigned int __nburn = (nburn); \
+ (burn) = (burn) > __nburn ? (burn) : __nburn; } while (0)
+
+
+static unsigned int
+do_cbc_mac (gcry_cipher_hd_t c, const unsigned char *inbuf, size_t inlen,
+ int do_padding)
+{
+ const unsigned int blocksize = 16;
+ gcry_cipher_encrypt_t enc_fn = c->spec->encrypt;
+ unsigned char tmp[blocksize];
+ unsigned int burn = 0;
+ unsigned int unused = c->u_mode.ccm.mac_unused;
+ size_t nblocks;
+ size_t n;
+
+ if (inlen == 0 && (unused == 0 || !do_padding))
+ return 0;
+
+ do
+ {
+ if (inlen + unused < blocksize || unused > 0)
+ {
+ n = (inlen > blocksize - unused) ? blocksize - unused : inlen;
+
+ buf_cpy (&c->u_mode.ccm.macbuf[unused], inbuf, n);
+ unused += n;
+ inlen -= n;
+ inbuf += n;
+ }
+ if (!inlen)
+ {
+ if (!do_padding)
+ break;
+
+ n = blocksize - unused;
+ if (n > 0)
+ {
+ memset (&c->u_mode.ccm.macbuf[unused], 0, n);
+ unused = blocksize;
+ }
+ }
+
+ if (unused > 0)
+ {
+ /* Process one block from macbuf. */
+ cipher_block_xor(c->u_iv.iv, c->u_iv.iv, c->u_mode.ccm.macbuf,
+ blocksize);
+ set_burn (burn, enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv ));
+
+ unused = 0;
+ }
+
+ if (c->bulk.cbc_enc)
+ {
+ nblocks = inlen / blocksize;
+ c->bulk.cbc_enc (&c->context.c, c->u_iv.iv, tmp, inbuf, nblocks, 1);
+ inbuf += nblocks * blocksize;
+ inlen -= nblocks * blocksize;
+
+ wipememory (tmp, sizeof(tmp));
+ }
+ else
+ {
+ while (inlen >= blocksize)
+ {
+ cipher_block_xor(c->u_iv.iv, c->u_iv.iv, inbuf, blocksize);
+
+ set_burn (burn, enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv ));
+
+ inlen -= blocksize;
+ inbuf += blocksize;
+ }
+ }
+ }
+ while (inlen > 0);
+
+ c->u_mode.ccm.mac_unused = unused;
+
+ if (burn)
+ burn += 4 * sizeof(void *);
+
+ return burn;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_ccm_set_nonce (gcry_cipher_hd_t c, const unsigned char *nonce,
+ size_t noncelen)
+{
+ unsigned int marks_key;
+ size_t L = 15 - noncelen;
+ size_t L_;
+
+ L_ = L - 1;
+
+ if (!nonce)
+ return GPG_ERR_INV_ARG;
+ /* Length field must be 2, 3, ..., or 8. */
+ if (L < 2 || L > 8)
+ return GPG_ERR_INV_LENGTH;
+
+ /* Reset state */
+ marks_key = c->marks.key;
+ memset (&c->u_mode, 0, sizeof(c->u_mode));
+ memset (&c->marks, 0, sizeof(c->marks));
+ memset (&c->u_iv, 0, sizeof(c->u_iv));
+ memset (&c->u_ctr, 0, sizeof(c->u_ctr));
+ memset (c->lastiv, 0, sizeof(c->lastiv));
+ c->unused = 0;
+ c->marks.key = marks_key;
+
+ /* Setup CTR */
+ c->u_ctr.ctr[0] = L_;
+ memcpy (&c->u_ctr.ctr[1], nonce, noncelen);
+ memset (&c->u_ctr.ctr[1 + noncelen], 0, L);
+
+ /* Setup IV */
+ c->u_iv.iv[0] = L_;
+ memcpy (&c->u_iv.iv[1], nonce, noncelen);
+  /* Add (8 * M_ + 64 * flags) to iv[0] and set iv[noncelen + 1 ... 15] later
+     in _gcry_cipher_ccm_set_lengths.  */
+ memset (&c->u_iv.iv[1 + noncelen], 0, L);
+
+ c->u_mode.ccm.nonce = 1;
+
+ return GPG_ERR_NO_ERROR;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_ccm_set_lengths (gcry_cipher_hd_t c, u64 encryptlen, u64 aadlen,
+ u64 taglen)
+{
+ unsigned int burn = 0;
+ unsigned char b0[16];
+ size_t noncelen = 15 - (c->u_iv.iv[0] + 1);
+ u64 M = taglen;
+ u64 M_;
+ int i;
+
+ M_ = (M - 2) / 2;
+
+ /* Authentication field must be 4, 6, 8, 10, 12, 14 or 16. */
+ if ((M_ * 2 + 2) != M || M < 4 || M > 16)
+ return GPG_ERR_INV_LENGTH;
+ if (!c->u_mode.ccm.nonce || c->marks.tag)
+ return GPG_ERR_INV_STATE;
+ if (c->u_mode.ccm.lengths)
+ return GPG_ERR_INV_STATE;
+
+ c->u_mode.ccm.authlen = taglen;
+ c->u_mode.ccm.encryptlen = encryptlen;
+ c->u_mode.ccm.aadlen = aadlen;
+
+ /* Complete IV setup. */
+ c->u_iv.iv[0] += (aadlen > 0) * 64 + M_ * 8;
+ for (i = 16 - 1; i >= 1 + noncelen; i--)
+ {
+ c->u_iv.iv[i] = encryptlen & 0xff;
+ encryptlen >>= 8;
+ }
+
+ memcpy (b0, c->u_iv.iv, 16);
+ memset (c->u_iv.iv, 0, 16);
+
+ set_burn (burn, do_cbc_mac (c, b0, 16, 0));
+
+ if (aadlen == 0)
+ {
+ /* Do nothing. */
+ }
+ else if (aadlen > 0 && aadlen <= (unsigned int)0xfeff)
+ {
+ b0[0] = (aadlen >> 8) & 0xff;
+ b0[1] = aadlen & 0xff;
+ set_burn (burn, do_cbc_mac (c, b0, 2, 0));
+ }
+ else if (aadlen > 0xfeff && aadlen <= (unsigned int)0xffffffff)
+ {
+ b0[0] = 0xff;
+ b0[1] = 0xfe;
+ buf_put_be32(&b0[2], aadlen);
+ set_burn (burn, do_cbc_mac (c, b0, 6, 0));
+ }
+ else if (aadlen > (unsigned int)0xffffffff)
+ {
+ b0[0] = 0xff;
+ b0[1] = 0xff;
+ buf_put_be64(&b0[2], aadlen);
+ set_burn (burn, do_cbc_mac (c, b0, 10, 0));
+ }
+
+ /* Generate S_0 and increase counter. */
+ set_burn (burn, c->spec->encrypt ( &c->context.c, c->u_mode.ccm.s0,
+ c->u_ctr.ctr ));
+ c->u_ctr.ctr[15]++;
+
+ if (burn)
+ _gcry_burn_stack (burn + sizeof(void *) * 5);
+
+ c->u_mode.ccm.lengths = 1;
+
+ return GPG_ERR_NO_ERROR;
+}
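+
+/* Worked example for the flag octet completed above, assuming a 12-byte
+ * nonce and a 16-byte tag with non-empty AAD: L = 3 so L_ = 2 was stored by
+ * set_nonce, M_ = 7, and iv[0] becomes 64 + 7*8 + 2 = 0x7a; the last three
+ * IV octets then receive the big-endian message length, giving the B_0
+ * block of RFC 3610.  */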
+
+
+gcry_err_code_t
+_gcry_cipher_ccm_authenticate (gcry_cipher_hd_t c, const unsigned char *abuf,
+ size_t abuflen)
+{
+ unsigned int burn;
+
+ if (abuflen > 0 && !abuf)
+ return GPG_ERR_INV_ARG;
+ if (!c->u_mode.ccm.nonce || !c->u_mode.ccm.lengths || c->marks.tag)
+ return GPG_ERR_INV_STATE;
+ if (abuflen > c->u_mode.ccm.aadlen)
+ return GPG_ERR_INV_LENGTH;
+
+ c->u_mode.ccm.aadlen -= abuflen;
+ burn = do_cbc_mac (c, abuf, abuflen, c->u_mode.ccm.aadlen == 0);
+
+ if (burn)
+ _gcry_burn_stack (burn + sizeof(void *) * 5);
+
+ return GPG_ERR_NO_ERROR;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_ccm_tag (gcry_cipher_hd_t c, unsigned char *outbuf,
+ size_t outbuflen, int check)
+{
+ unsigned int burn;
+
+ if (!outbuf || outbuflen == 0)
+ return GPG_ERR_INV_ARG;
+  /* Tag length must be the same as the initial authlen. */
+ if (c->u_mode.ccm.authlen != outbuflen)
+ return GPG_ERR_INV_LENGTH;
+ if (!c->u_mode.ccm.nonce || !c->u_mode.ccm.lengths || c->u_mode.ccm.aadlen > 0)
+ return GPG_ERR_INV_STATE;
+  /* Initial encrypt length must match the length of the actual data processed. */
+ if (c->u_mode.ccm.encryptlen > 0)
+ return GPG_ERR_UNFINISHED;
+
+ if (!c->marks.tag)
+ {
+ burn = do_cbc_mac (c, NULL, 0, 1); /* Perform final padding. */
+
+ /* Add S_0 */
+ cipher_block_xor (c->u_iv.iv, c->u_iv.iv, c->u_mode.ccm.s0, 16);
+
+ wipememory (c->u_ctr.ctr, 16);
+ wipememory (c->u_mode.ccm.s0, 16);
+ wipememory (c->u_mode.ccm.macbuf, 16);
+
+ if (burn)
+ _gcry_burn_stack (burn + sizeof(void *) * 5);
+
+ c->marks.tag = 1;
+ }
+
+ if (!check)
+ {
+ memcpy (outbuf, c->u_iv.iv, outbuflen);
+ return GPG_ERR_NO_ERROR;
+ }
+ else
+ {
+ return buf_eq_const(outbuf, c->u_iv.iv, outbuflen) ?
+ GPG_ERR_NO_ERROR : GPG_ERR_CHECKSUM;
+ }
+}
+
+
+gcry_err_code_t
+_gcry_cipher_ccm_get_tag (gcry_cipher_hd_t c, unsigned char *outtag,
+ size_t taglen)
+{
+ return _gcry_cipher_ccm_tag (c, outtag, taglen, 0);
+}
+
+
+gcry_err_code_t
+_gcry_cipher_ccm_check_tag (gcry_cipher_hd_t c, const unsigned char *intag,
+ size_t taglen)
+{
+ return _gcry_cipher_ccm_tag (c, (unsigned char *)intag, taglen, 1);
+}
+
+
+gcry_err_code_t
+_gcry_cipher_ccm_encrypt (gcry_cipher_hd_t c, unsigned char *outbuf,
+ size_t outbuflen, const unsigned char *inbuf,
+ size_t inbuflen)
+{
+ gcry_err_code_t err = 0;
+ unsigned int burn = 0;
+ unsigned int nburn;
+
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+ if (!c->u_mode.ccm.nonce || c->marks.tag || !c->u_mode.ccm.lengths ||
+ c->u_mode.ccm.aadlen > 0)
+ return GPG_ERR_INV_STATE;
+ if (inbuflen > c->u_mode.ccm.encryptlen)
+ return GPG_ERR_INV_LENGTH;
+
+ while (inbuflen)
+ {
+ size_t currlen = inbuflen;
+
+ /* Since checksumming is done before encryption, process input in 24KiB
+ * chunks to keep data loaded in L1 cache for encryption. */
+ if (currlen > 24 * 1024)
+ currlen = 24 * 1024;
+
+ c->u_mode.ccm.encryptlen -= currlen;
+ nburn = do_cbc_mac (c, inbuf, currlen, 0);
+ burn = nburn > burn ? nburn : burn;
+
+ err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, currlen);
+ if (err)
+ break;
+
+ outbuf += currlen;
+ inbuf += currlen;
+ outbuflen -= currlen;
+ inbuflen -= currlen;
+ }
+
+ if (burn)
+ _gcry_burn_stack (burn + sizeof(void *) * 5);
+ return err;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_ccm_decrypt (gcry_cipher_hd_t c, unsigned char *outbuf,
+ size_t outbuflen, const unsigned char *inbuf,
+ size_t inbuflen)
+{
+ gcry_err_code_t err = 0;
+ unsigned int burn = 0;
+ unsigned int nburn;
+
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+ if (!c->u_mode.ccm.nonce || c->marks.tag || !c->u_mode.ccm.lengths ||
+ c->u_mode.ccm.aadlen > 0)
+ return GPG_ERR_INV_STATE;
+ if (inbuflen > c->u_mode.ccm.encryptlen)
+ return GPG_ERR_INV_LENGTH;
+
+ while (inbuflen)
+ {
+ size_t currlen = inbuflen;
+
+ /* Since checksumming is done after decryption, process input in 24KiB
+ * chunks to keep data loaded in L1 cache for checksumming. */
+ if (currlen > 24 * 1024)
+ currlen = 24 * 1024;
+
+ err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, currlen);
+ if (err)
+ break;
+
+ c->u_mode.ccm.encryptlen -= currlen;
+ nburn = do_cbc_mac (c, outbuf, currlen, 0);
+ burn = nburn > burn ? nburn : burn;
+
+ outbuf += currlen;
+ inbuf += currlen;
+ outbuflen -= currlen;
+ inbuflen -= currlen;
+ }
+
+ if (burn)
+ _gcry_burn_stack (burn + sizeof(void *) * 5);
+ return err;
+}
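+
+/* A minimal usage sketch for the CCM mode implemented above, assuming the
+ * public gcry_cipher_* front-end (#include <gcrypt.h>); the helper name and
+ * fixed sizes are illustrative only.  CCM requires the total lengths to be
+ * declared up front via GCRYCTL_SET_CCM_LENGTHS before any data is fed.  */
+#if 0
+static gcry_error_t
+example_ccm_encrypt (gcry_cipher_hd_t hd,        /* opened with MODE_CCM */
+                     const unsigned char *nonce, size_t noncelen,
+                     const unsigned char *aad, size_t aadlen,
+                     unsigned char *data, size_t datalen,
+                     unsigned char tag[16])
+{
+  u64 params[3] = { datalen, aadlen, 16 }; /* encryptlen, aadlen, taglen */
+  gcry_error_t err;
+
+  err = gcry_cipher_setiv (hd, nonce, noncelen);
+  if (!err)
+    err = gcry_cipher_ctl (hd, GCRYCTL_SET_CCM_LENGTHS, params,
+                           sizeof (params));
+  if (!err)
+    err = gcry_cipher_authenticate (hd, aad, aadlen);
+  if (!err)
+    err = gcry_cipher_encrypt (hd, data, datalen, NULL, 0); /* in-place */
+  if (!err)
+    err = gcry_cipher_gettag (hd, tag, 16);
+  return err;
+}
+#endif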
diff --git a/comm/third_party/libgcrypt/cipher/cipher-cfb.c b/comm/third_party/libgcrypt/cipher/cipher-cfb.c
new file mode 100644
index 0000000000..012c6c13c3
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-cfb.c
@@ -0,0 +1,317 @@
+/* cipher-cfb.c - Generic CFB mode implementation
+ * Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003
+ * 2005, 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "./cipher-internal.h"
+
+
+gcry_err_code_t
+_gcry_cipher_cfb_encrypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen)
+{
+ unsigned char *ivp;
+ gcry_cipher_encrypt_t enc_fn = c->spec->encrypt;
+ size_t blocksize_shift = _gcry_blocksize_shift(c);
+ size_t blocksize = 1 << blocksize_shift;
+ size_t blocksize_x_2 = blocksize + blocksize;
+ unsigned int burn, nburn;
+
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+
+ if ( inbuflen <= c->unused )
+ {
+ /* Short enough to be encoded by the remaining XOR mask. */
+ /* XOR the input with the IV and store input into IV. */
+ ivp = c->u_iv.iv + blocksize - c->unused;
+ buf_xor_2dst(outbuf, ivp, inbuf, inbuflen);
+ c->unused -= inbuflen;
+ return 0;
+ }
+
+ burn = 0;
+
+ if ( c->unused )
+ {
+ /* XOR the input with the IV and store input into IV */
+ inbuflen -= c->unused;
+ ivp = c->u_iv.iv + blocksize - c->unused;
+ buf_xor_2dst(outbuf, ivp, inbuf, c->unused);
+ outbuf += c->unused;
+ inbuf += c->unused;
+ c->unused = 0;
+ }
+
+  /* Now we can process complete blocks.  We use a loop as long as we
+     have at least 2 blocks and use conditions for the rest.  This
+     also allows us to use a bulk encryption function if available.  */
+ if (inbuflen >= blocksize_x_2 && c->bulk.cfb_enc)
+ {
+ size_t nblocks = inbuflen >> blocksize_shift;
+ c->bulk.cfb_enc (&c->context.c, c->u_iv.iv, outbuf, inbuf, nblocks);
+ outbuf += nblocks << blocksize_shift;
+ inbuf += nblocks << blocksize_shift;
+ inbuflen -= nblocks << blocksize_shift;
+ }
+ else
+ {
+ while ( inbuflen >= blocksize_x_2 )
+ {
+ /* Encrypt the IV. */
+ nburn = enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
+ burn = nburn > burn ? nburn : burn;
+ /* XOR the input with the IV and store input into IV. */
+ cipher_block_xor_2dst(outbuf, c->u_iv.iv, inbuf, blocksize);
+ outbuf += blocksize;
+ inbuf += blocksize;
+ inbuflen -= blocksize;
+ }
+ }
+
+ if ( inbuflen >= blocksize )
+ {
+ /* Save the current IV and then encrypt the IV. */
+ cipher_block_cpy( c->lastiv, c->u_iv.iv, blocksize );
+ nburn = enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
+ burn = nburn > burn ? nburn : burn;
+ /* XOR the input with the IV and store input into IV */
+ cipher_block_xor_2dst(outbuf, c->u_iv.iv, inbuf, blocksize);
+ outbuf += blocksize;
+ inbuf += blocksize;
+ inbuflen -= blocksize;
+ }
+ if ( inbuflen )
+ {
+ /* Save the current IV and then encrypt the IV. */
+ cipher_block_cpy( c->lastiv, c->u_iv.iv, blocksize );
+ nburn = enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
+ burn = nburn > burn ? nburn : burn;
+ c->unused = blocksize;
+ /* Apply the XOR. */
+ c->unused -= inbuflen;
+ buf_xor_2dst(outbuf, c->u_iv.iv, inbuf, inbuflen);
+ outbuf += inbuflen;
+ inbuf += inbuflen;
+ inbuflen = 0;
+ }
+
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4 * sizeof(void *));
+
+ return 0;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_cfb_decrypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen)
+{
+ unsigned char *ivp;
+ gcry_cipher_encrypt_t enc_fn = c->spec->encrypt;
+ size_t blocksize_shift = _gcry_blocksize_shift(c);
+ size_t blocksize = 1 << blocksize_shift;
+ size_t blocksize_x_2 = blocksize + blocksize;
+ unsigned int burn, nburn;
+
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+
+ if (inbuflen <= c->unused)
+ {
+ /* Short enough to be encoded by the remaining XOR mask. */
+ /* XOR the input with the IV and store input into IV. */
+ ivp = c->u_iv.iv + blocksize - c->unused;
+ buf_xor_n_copy(outbuf, ivp, inbuf, inbuflen);
+ c->unused -= inbuflen;
+ return 0;
+ }
+
+ burn = 0;
+
+ if (c->unused)
+ {
+ /* XOR the input with the IV and store input into IV. */
+ inbuflen -= c->unused;
+ ivp = c->u_iv.iv + blocksize - c->unused;
+ buf_xor_n_copy(outbuf, ivp, inbuf, c->unused);
+ outbuf += c->unused;
+ inbuf += c->unused;
+ c->unused = 0;
+ }
+
+  /* Now we can process complete blocks.  We use a loop as long as we
+     have at least 2 blocks and use conditions for the rest.  This
+     also allows us to use a bulk encryption function if available.  */
+ if (inbuflen >= blocksize_x_2 && c->bulk.cfb_dec)
+ {
+ size_t nblocks = inbuflen >> blocksize_shift;
+ c->bulk.cfb_dec (&c->context.c, c->u_iv.iv, outbuf, inbuf, nblocks);
+ outbuf += nblocks << blocksize_shift;
+ inbuf += nblocks << blocksize_shift;
+ inbuflen -= nblocks << blocksize_shift;
+ }
+ else
+ {
+ while (inbuflen >= blocksize_x_2 )
+ {
+ /* Encrypt the IV. */
+ nburn = enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
+ burn = nburn > burn ? nburn : burn;
+ /* XOR the input with the IV and store input into IV. */
+ cipher_block_xor_n_copy(outbuf, c->u_iv.iv, inbuf, blocksize);
+ outbuf += blocksize;
+ inbuf += blocksize;
+ inbuflen -= blocksize;
+ }
+ }
+
+ if (inbuflen >= blocksize )
+ {
+ /* Save the current IV and then encrypt the IV. */
+ cipher_block_cpy ( c->lastiv, c->u_iv.iv, blocksize);
+ nburn = enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
+ burn = nburn > burn ? nburn : burn;
+ /* XOR the input with the IV and store input into IV */
+ cipher_block_xor_n_copy(outbuf, c->u_iv.iv, inbuf, blocksize);
+ outbuf += blocksize;
+ inbuf += blocksize;
+ inbuflen -= blocksize;
+ }
+
+ if (inbuflen)
+ {
+ /* Save the current IV and then encrypt the IV. */
+ cipher_block_cpy ( c->lastiv, c->u_iv.iv, blocksize );
+ nburn = enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
+ burn = nburn > burn ? nburn : burn;
+ c->unused = blocksize;
+ /* Apply the XOR. */
+ c->unused -= inbuflen;
+ buf_xor_n_copy(outbuf, c->u_iv.iv, inbuf, inbuflen);
+ outbuf += inbuflen;
+ inbuf += inbuflen;
+ inbuflen = 0;
+ }
+
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4 * sizeof(void *));
+
+ return 0;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_cfb8_encrypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen)
+{
+ gcry_cipher_encrypt_t enc_fn = c->spec->encrypt;
+ size_t blocksize = c->spec->blocksize;
+ unsigned int burn, nburn;
+
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+
+ burn = 0;
+
+ while ( inbuflen > 0)
+ {
+ int i;
+
+ /* Encrypt the IV. */
+ nburn = enc_fn ( &c->context.c, c->lastiv, c->u_iv.iv );
+ burn = nburn > burn ? nburn : burn;
+
+ outbuf[0] = c->lastiv[0] ^ inbuf[0];
+
+      /* Shift the IV left by 8 bits. */
+ for (i = 0; i < blocksize-1; i++)
+ c->u_iv.iv[i] = c->u_iv.iv[i+1];
+
+      /* Append the ciphertext byte to the IV. */
+ c->u_iv.iv[blocksize-1] = outbuf[0];
+
+ outbuf += 1;
+ inbuf += 1;
+ inbuflen -= 1;
+ }
+
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4 * sizeof(void *));
+
+ return 0;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_cfb8_decrypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen)
+{
+ gcry_cipher_encrypt_t enc_fn = c->spec->encrypt;
+ size_t blocksize = c->spec->blocksize;
+ unsigned int burn, nburn;
+ unsigned char appendee;
+
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+
+ burn = 0;
+
+ while (inbuflen > 0)
+ {
+ int i;
+
+ /* Encrypt the IV. */
+ nburn = enc_fn ( &c->context.c, c->lastiv, c->u_iv.iv );
+ burn = nburn > burn ? nburn : burn;
+
+      /* INBUF might equal OUTBUF, so keep the input byte around so that
+         we can append it to the IV later. */
+ appendee = inbuf[0];
+
+ outbuf[0] = inbuf[0] ^ c->lastiv[0];
+
+      /* Shift the IV left by 8 bits. */
+ for (i = 0; i < blocksize-1; i++)
+ c->u_iv.iv[i] = c->u_iv.iv[i+1];
+
+ c->u_iv.iv[blocksize-1] = appendee;
+
+ outbuf += 1;
+ inbuf += 1;
+ inbuflen -= 1;
+ }
+
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4 * sizeof(void *));
+
+ return 0;
+}
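+
+/* In other words, CFB-8 above runs the full block cipher once per input
+ * byte: the leading byte of the encrypted IV is XORed with one data byte,
+ * then the IV is shifted left by eight bits and the resulting ciphertext
+ * byte is appended, so the shift register always holds the most recent
+ * `blocksize' ciphertext bytes.  */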
diff --git a/comm/third_party/libgcrypt/cipher/cipher-cmac.c b/comm/third_party/libgcrypt/cipher/cipher-cmac.c
new file mode 100644
index 0000000000..4efd1e19b4
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-cmac.c
@@ -0,0 +1,292 @@
+/* cipher-cmac.c - CMAC, Cipher-based MAC.
+ * Copyright (C) 2013,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "cipher-internal.h"
+#include "bufhelp.h"
+
+
+#define set_burn(burn, nburn) do { \
+ unsigned int __nburn = (nburn); \
+ (burn) = (burn) > __nburn ? (burn) : __nburn; } while (0)
+
+
+gcry_err_code_t
+_gcry_cmac_write (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx,
+ const byte * inbuf, size_t inlen)
+{
+ gcry_cipher_encrypt_t enc_fn = c->spec->encrypt;
+ size_t blocksize_shift = _gcry_blocksize_shift(c);
+ size_t blocksize = 1 << blocksize_shift;
+ byte outbuf[MAX_BLOCKSIZE];
+ unsigned int burn = 0;
+ unsigned int nblocks;
+ size_t n;
+
+ if (ctx->tag)
+ return GPG_ERR_INV_STATE;
+
+ if (!inbuf)
+ return GPG_ERR_INV_ARG;
+
+ if (inlen == 0)
+ return 0;
+
+  /* The last block is needed for cmac_final. */
+ if (ctx->mac_unused + inlen <= blocksize)
+ {
+ buf_cpy (&ctx->macbuf[ctx->mac_unused], inbuf, inlen);
+ ctx->mac_unused += inlen;
+ inbuf += inlen;
+ inlen -= inlen;
+
+ return 0;
+ }
+
+ if (ctx->mac_unused)
+ {
+ n = inlen;
+ if (n > blocksize - ctx->mac_unused)
+ n = blocksize - ctx->mac_unused;
+
+ buf_cpy (&ctx->macbuf[ctx->mac_unused], inbuf, n);
+ ctx->mac_unused += n;
+ inbuf += n;
+ inlen -= n;
+
+ cipher_block_xor (ctx->u_iv.iv, ctx->u_iv.iv, ctx->macbuf, blocksize);
+ set_burn (burn, enc_fn (&c->context.c, ctx->u_iv.iv, ctx->u_iv.iv));
+
+ ctx->mac_unused = 0;
+ }
+
+ if (c->bulk.cbc_enc && inlen > blocksize)
+ {
+ nblocks = inlen >> blocksize_shift;
+ nblocks -= ((nblocks << blocksize_shift) == inlen);
+
+ c->bulk.cbc_enc (&c->context.c, ctx->u_iv.iv, outbuf, inbuf, nblocks, 1);
+ inbuf += nblocks << blocksize_shift;
+ inlen -= nblocks << blocksize_shift;
+
+ wipememory (outbuf, sizeof (outbuf));
+ }
+ else
+ while (inlen > blocksize)
+ {
+ cipher_block_xor (ctx->u_iv.iv, ctx->u_iv.iv, inbuf, blocksize);
+ set_burn (burn, enc_fn (&c->context.c, ctx->u_iv.iv, ctx->u_iv.iv));
+ inlen -= blocksize;
+ inbuf += blocksize;
+ }
+
+  /* Make sure that the last block is passed to cmac_final. */
+ if (inlen == 0)
+ BUG ();
+
+ n = inlen;
+ if (n > blocksize - ctx->mac_unused)
+ n = blocksize - ctx->mac_unused;
+
+ buf_cpy (&ctx->macbuf[ctx->mac_unused], inbuf, n);
+ ctx->mac_unused += n;
+ inbuf += n;
+ inlen -= n;
+
+ if (burn)
+ _gcry_burn_stack (burn + 4 * sizeof (void *));
+
+ return 0;
+}
+
+
+gcry_err_code_t
+_gcry_cmac_generate_subkeys (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx)
+{
+ const unsigned int blocksize = c->spec->blocksize;
+ byte rb, carry, t, bi;
+ unsigned int burn;
+ int i, j;
+ union
+ {
+ size_t _aligned;
+ byte buf[MAX_BLOCKSIZE];
+ } u;
+
+  /* Tell the compiler that we require a cipher with a 64-bit or 128-bit
+   * block length, to allow better optimization of this function. */
+ if (blocksize > 16 || blocksize < 8 || blocksize & (8 - 1))
+ return GPG_ERR_INV_CIPHER_MODE;
+
+ if (MAX_BLOCKSIZE < blocksize)
+ BUG ();
+
+ /* encrypt zero block */
+ memset (u.buf, 0, blocksize);
+ burn = c->spec->encrypt (&c->context.c, u.buf, u.buf);
+
+ /* Currently supported blocksizes are 16 and 8. */
+ rb = blocksize == 16 ? 0x87 : 0x1B /* blocksize == 8 */ ;
+
+ for (j = 0; j < 2; j++)
+ {
+ /* Generate subkeys K1 and K2 */
+ carry = 0;
+ for (i = blocksize - 1; i >= 0; i--)
+ {
+ bi = u.buf[i];
+ t = carry | (bi << 1);
+ carry = bi >> 7;
+ u.buf[i] = t & 0xff;
+ ctx->subkeys[j][i] = u.buf[i];
+ }
+ u.buf[blocksize - 1] ^= carry ? rb : 0;
+ ctx->subkeys[j][blocksize - 1] = u.buf[blocksize - 1];
+ }
+
+ wipememory (&u, sizeof (u));
+ if (burn)
+ _gcry_burn_stack (burn + 4 * sizeof (void *));
+
+ return 0;
+}
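+
+/* Worked example of the doubling above for blocksize 16: with
+ * L = E_K(0^128), K1 = (L << 1) XOR 0x87 if the top bit of L was set, and
+ * K1 = L << 1 otherwise; K2 is derived from K1 in the same way.  This is
+ * the GF(2^128) doubling specified for CMAC in NIST SP 800-38B / RFC 4493. */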
+
+
+gcry_err_code_t
+_gcry_cmac_final (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx)
+{
+ const unsigned int blocksize = c->spec->blocksize;
+ unsigned int count = ctx->mac_unused;
+ unsigned int burn;
+ byte *subkey;
+
+  /* Tell the compiler that we require a cipher with a 64-bit or 128-bit
+   * block length, to allow better optimization of this function. */
+ if (blocksize > 16 || blocksize < 8 || blocksize & (8 - 1))
+ return GPG_ERR_INV_CIPHER_MODE;
+
+ if (count == blocksize)
+ subkey = ctx->subkeys[0]; /* K1 */
+ else
+ {
+ subkey = ctx->subkeys[1]; /* K2 */
+ ctx->macbuf[count++] = 0x80;
+ while (count < blocksize)
+ ctx->macbuf[count++] = 0;
+ }
+
+ cipher_block_xor (ctx->macbuf, ctx->macbuf, subkey, blocksize);
+
+ cipher_block_xor (ctx->u_iv.iv, ctx->u_iv.iv, ctx->macbuf, blocksize);
+ burn = c->spec->encrypt (&c->context.c, ctx->u_iv.iv, ctx->u_iv.iv);
+ if (burn)
+ _gcry_burn_stack (burn + 4 * sizeof (void *));
+
+ ctx->mac_unused = 0;
+
+ return 0;
+}
+
+
+static gcry_err_code_t
+cmac_tag (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx,
+ unsigned char *tag, size_t taglen, int check)
+{
+ gcry_err_code_t ret;
+
+ if (!tag || taglen == 0 || taglen > c->spec->blocksize)
+ return GPG_ERR_INV_ARG;
+
+ if (!ctx->tag)
+ {
+ ret = _gcry_cmac_final (c, ctx);
+ if (ret != 0)
+ return ret;
+
+ ctx->tag = 1;
+ }
+
+ if (!check)
+ {
+ memcpy (tag, ctx->u_iv.iv, taglen);
+ return GPG_ERR_NO_ERROR;
+ }
+ else
+ {
+ return buf_eq_const (tag, ctx->u_iv.iv, taglen) ?
+ GPG_ERR_NO_ERROR : GPG_ERR_CHECKSUM;
+ }
+}
+
+
+void
+_gcry_cmac_reset (gcry_cmac_context_t *ctx)
+{
+ char tmp_buf[sizeof(ctx->subkeys)];
+
+  /* Only keep the subkeys when resetting the context. */
+
+ buf_cpy (tmp_buf, ctx->subkeys, sizeof(ctx->subkeys));
+ memset (ctx, 0, sizeof(*ctx));
+ buf_cpy (ctx->subkeys, tmp_buf, sizeof(ctx->subkeys));
+ wipememory (tmp_buf, sizeof(tmp_buf));
+}
+
+
+gcry_err_code_t
+_gcry_cipher_cmac_authenticate (gcry_cipher_hd_t c,
+ const unsigned char *abuf, size_t abuflen)
+{
+ if (abuflen > 0 && !abuf)
+ return GPG_ERR_INV_ARG;
+  /* To support a new blocksize, update cmac_generate_subkeys() and then add
+     the new blocksize here. */
+ if (c->spec->blocksize != 16 && c->spec->blocksize != 8)
+ return GPG_ERR_INV_CIPHER_MODE;
+
+ return _gcry_cmac_write (c, &c->u_mode.cmac, abuf, abuflen);
+}
+
+
+gcry_err_code_t
+_gcry_cipher_cmac_get_tag (gcry_cipher_hd_t c,
+ unsigned char *outtag, size_t taglen)
+{
+ return cmac_tag (c, &c->u_mode.cmac, outtag, taglen, 0);
+}
+
+
+gcry_err_code_t
+_gcry_cipher_cmac_check_tag (gcry_cipher_hd_t c,
+ const unsigned char *intag, size_t taglen)
+{
+ return cmac_tag (c, &c->u_mode.cmac, (unsigned char *) intag, taglen, 1);
+}
+
+gcry_err_code_t
+_gcry_cipher_cmac_set_subkeys (gcry_cipher_hd_t c)
+{
+ return _gcry_cmac_generate_subkeys (c, &c->u_mode.cmac);
+}
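+
+/* A minimal usage sketch, assuming the public gcry_mac_* front-end
+ * (#include <gcrypt.h>), which drives the CMAC routines above through the
+ * MAC API; the helper name and AES-128 key size are illustrative only.  */
+#if 0
+static gcry_error_t
+example_cmac_aes (const unsigned char key[16],
+                  const void *data, size_t datalen,
+                  unsigned char mac[16])
+{
+  gcry_mac_hd_t hd;
+  size_t maclen = 16;
+  gcry_error_t err;
+
+  err = gcry_mac_open (&hd, GCRY_MAC_CMAC_AES, 0, NULL);
+  if (err)
+    return err;
+  err = gcry_mac_setkey (hd, key, 16);
+  if (!err)
+    err = gcry_mac_write (hd, data, datalen);
+  if (!err)
+    err = gcry_mac_read (hd, mac, &maclen);
+  gcry_mac_close (hd);
+  return err;
+}
+#endif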
diff --git a/comm/third_party/libgcrypt/cipher/cipher-ctr.c b/comm/third_party/libgcrypt/cipher/cipher-ctr.c
new file mode 100644
index 0000000000..5f0afc2f88
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-ctr.c
@@ -0,0 +1,120 @@
+/* cipher-ctr.c - Generic CTR mode implementation
+ * Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003
+ * 2005, 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "./cipher-internal.h"
+
+
+gcry_err_code_t
+_gcry_cipher_ctr_encrypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen)
+{
+ size_t n;
+ int i;
+ gcry_cipher_encrypt_t enc_fn = c->spec->encrypt;
+ size_t blocksize_shift = _gcry_blocksize_shift(c);
+ size_t blocksize = 1 << blocksize_shift;
+ size_t nblocks;
+ unsigned int burn, nburn;
+
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+
+ burn = 0;
+
+  /* First process a left-over encrypted counter.  */
+ if (c->unused)
+ {
+ gcry_assert (c->unused < blocksize);
+ i = blocksize - c->unused;
+ n = c->unused > inbuflen ? inbuflen : c->unused;
+ buf_xor(outbuf, inbuf, &c->lastiv[i], n);
+ c->unused -= n;
+ inbuf += n;
+ outbuf += n;
+ inbuflen -= n;
+ }
+
+ /* Use a bulk method if available. */
+ nblocks = inbuflen >> blocksize_shift;
+ if (nblocks && c->bulk.ctr_enc)
+ {
+ c->bulk.ctr_enc (&c->context.c, c->u_ctr.ctr, outbuf, inbuf, nblocks);
+ inbuf += nblocks << blocksize_shift;
+ outbuf += nblocks << blocksize_shift;
+ inbuflen -= nblocks << blocksize_shift;
+ }
+
+  /* If we don't have a bulk method, use the standard method.  We also
+     use this method for a remaining partial block.  */
+ if (inbuflen)
+ {
+ unsigned char tmp[MAX_BLOCKSIZE];
+
+ n = blocksize;
+ do
+ {
+ nburn = enc_fn (&c->context.c, tmp, c->u_ctr.ctr);
+ burn = nburn > burn ? nburn : burn;
+
+ cipher_block_add(c->u_ctr.ctr, 1, blocksize);
+
+ if (inbuflen < blocksize)
+ break;
+ cipher_block_xor(outbuf, inbuf, tmp, blocksize);
+
+ inbuflen -= n;
+ outbuf += n;
+ inbuf += n;
+ }
+ while (inbuflen);
+
+ if (inbuflen)
+ {
+ n = inbuflen;
+ buf_xor(outbuf, inbuf, tmp, inbuflen);
+
+ inbuflen -= n;
+ outbuf += n;
+ inbuf += n;
+ }
+
+ /* Save the unused bytes of the counter. */
+ c->unused = blocksize - n;
+ if (c->unused)
+ buf_cpy (c->lastiv+n, tmp+n, c->unused);
+
+ wipememory (tmp, sizeof tmp);
+ }
+
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4 * sizeof(void *));
+
+ return 0;
+}
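+
+/* A minimal standalone sketch of the big-endian counter addition performed
+ * by cipher_block_add() above (illustrative only; the real helper lives in
+ * cipher-internal.h and may be implemented differently).  */
+#if 0
+static void
+example_ctr_add (unsigned char *ctr, size_t blocksize, unsigned int add)
+{
+  unsigned int carry = add;
+  int i;
+
+  /* Propagate the addition from the least significant (last) byte.  */
+  for (i = (int)blocksize - 1; i >= 0 && carry; i--)
+    {
+      carry += ctr[i];
+      ctr[i] = carry & 0xff;
+      carry >>= 8;
+    }
+}
+#endif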
diff --git a/comm/third_party/libgcrypt/cipher/cipher-eax.c b/comm/third_party/libgcrypt/cipher/cipher-eax.c
new file mode 100644
index 0000000000..08f815a9e4
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-eax.c
@@ -0,0 +1,289 @@
+/* cipher-eax.c - EAX implementation
+ * Copyright (C) 2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "./cipher-internal.h"
+
+
+gcry_err_code_t
+_gcry_cipher_eax_encrypt (gcry_cipher_hd_t c,
+ byte *outbuf, size_t outbuflen,
+ const byte *inbuf, size_t inbuflen)
+{
+ gcry_err_code_t err;
+
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+ if (c->marks.tag)
+ return GPG_ERR_INV_STATE;
+
+ if (!c->marks.iv)
+ {
+ err = _gcry_cipher_eax_set_nonce (c, NULL, 0);
+ if (err != 0)
+ return err;
+ }
+
+ while (inbuflen)
+ {
+ size_t currlen = inbuflen;
+
+ /* Since checksumming is done after encryption, process input in 24KiB
+ * chunks to keep data loaded in L1 cache for checksumming. */
+ if (currlen > 24 * 1024)
+ currlen = 24 * 1024;
+
+ err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, currlen);
+ if (err != 0)
+ return err;
+
+ err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, outbuf,
+ currlen);
+ if (err != 0)
+ return err;
+
+ outbuf += currlen;
+ inbuf += currlen;
+ outbuflen -= currlen;
+ inbuflen -= currlen;
+ }
+
+ return 0;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_eax_decrypt (gcry_cipher_hd_t c,
+ byte *outbuf, size_t outbuflen,
+ const byte *inbuf, size_t inbuflen)
+{
+ gcry_err_code_t err;
+
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+ if (c->marks.tag)
+ return GPG_ERR_INV_STATE;
+
+ if (!c->marks.iv)
+ {
+ err = _gcry_cipher_eax_set_nonce (c, NULL, 0);
+ if (err != 0)
+ return err;
+ }
+
+ while (inbuflen)
+ {
+ size_t currlen = inbuflen;
+
+ /* Since checksumming is done before decryption, process input in 24KiB
+ * chunks to keep data loaded in L1 cache for decryption. */
+ if (currlen > 24 * 1024)
+ currlen = 24 * 1024;
+
+ err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, inbuf,
+ currlen);
+ if (err != 0)
+ return err;
+
+ err = _gcry_cipher_ctr_encrypt (c, outbuf, outbuflen, inbuf, currlen);
+ if (err != 0)
+ return err;
+
+ outbuf += currlen;
+ inbuf += currlen;
+ outbuflen -= currlen;
+ inbuflen -= currlen;
+ }
+
+ return 0;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_eax_authenticate (gcry_cipher_hd_t c,
+ const byte * aadbuf, size_t aadbuflen)
+{
+ gcry_err_code_t err;
+
+ if (c->marks.tag)
+ return GPG_ERR_INV_STATE;
+
+ if (!c->marks.iv)
+ {
+ err = _gcry_cipher_eax_set_nonce (c, NULL, 0);
+ if (err != 0)
+ return err;
+ }
+
+ return _gcry_cmac_write (c, &c->u_mode.eax.cmac_header, aadbuf, aadbuflen);
+}
+
+
+gcry_err_code_t
+_gcry_cipher_eax_setkey (gcry_cipher_hd_t c)
+{
+ gcry_err_code_t err;
+
+ err = _gcry_cmac_generate_subkeys (c, &c->u_mode.eax.cmac_header);
+ if (err != 0)
+ return err;
+
+ buf_cpy (c->u_mode.eax.cmac_ciphertext.subkeys,
+ c->u_mode.eax.cmac_header.subkeys,
+ sizeof(c->u_mode.eax.cmac_header.subkeys));
+
+ return 0;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_eax_set_nonce (gcry_cipher_hd_t c, const byte *nonce,
+ size_t noncelen)
+{
+ gcry_cmac_context_t nonce_cmac;
+ unsigned char initbuf[MAX_BLOCKSIZE];
+ gcry_err_code_t err;
+
+ c->marks.iv = 0;
+ c->marks.tag = 0;
+
+ _gcry_cmac_reset (&c->u_mode.eax.cmac_header);
+ _gcry_cmac_reset (&c->u_mode.eax.cmac_ciphertext);
+
+ /* Calculate nonce CMAC */
+
+ memset(&nonce_cmac, 0, sizeof(nonce_cmac));
+ memset(&initbuf, 0, sizeof(initbuf));
+
+ buf_cpy (&nonce_cmac.subkeys, c->u_mode.eax.cmac_header.subkeys,
+ sizeof(c->u_mode.eax.cmac_header.subkeys));
+
+ err = _gcry_cmac_write (c, &nonce_cmac, initbuf, c->spec->blocksize);
+ if (err != 0)
+ return err;
+
+ if (noncelen != 0)
+ {
+ err = _gcry_cmac_write (c, &nonce_cmac, nonce, noncelen);
+ if (err != 0)
+ return err;
+ }
+
+ err = _gcry_cmac_final (c, &nonce_cmac);
+ if (err != 0)
+ return err;
+
+ cipher_block_cpy (c->u_iv.iv, nonce_cmac.u_iv.iv, MAX_BLOCKSIZE);
+ cipher_block_cpy (c->u_ctr.ctr, nonce_cmac.u_iv.iv, MAX_BLOCKSIZE);
+
+ wipememory (&nonce_cmac, sizeof(nonce_cmac));
+
+ /* Prepare header CMAC */
+
+ initbuf[c->spec->blocksize - 1] = 1;
+ err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_header, initbuf,
+ c->spec->blocksize);
+ if (err != 0)
+ return err;
+
+ /* Prepare ciphertext CMAC */
+
+ initbuf[c->spec->blocksize - 1] = 2;
+ err = _gcry_cmac_write (c, &c->u_mode.eax.cmac_ciphertext, initbuf,
+ c->spec->blocksize);
+ if (err != 0)
+ return err;
+
+ c->marks.iv = 1;
+ c->marks.tag = 0;
+
+ return 0;
+}
+
+
+static gcry_err_code_t
+_gcry_cipher_eax_tag (gcry_cipher_hd_t c,
+ byte *outbuf, size_t outbuflen, int check)
+{
+ gcry_err_code_t err;
+
+ if (!c->marks.tag)
+ {
+ err = _gcry_cmac_final (c, &c->u_mode.eax.cmac_header);
+ if (err != 0)
+ return err;
+
+ err = _gcry_cmac_final (c, &c->u_mode.eax.cmac_ciphertext);
+ if (err != 0)
+ return err;
+
+ cipher_block_xor_1 (c->u_iv.iv, c->u_mode.eax.cmac_header.u_iv.iv,
+ MAX_BLOCKSIZE);
+ cipher_block_xor_1 (c->u_iv.iv, c->u_mode.eax.cmac_ciphertext.u_iv.iv,
+ MAX_BLOCKSIZE);
+
+ _gcry_cmac_reset (&c->u_mode.eax.cmac_header);
+ _gcry_cmac_reset (&c->u_mode.eax.cmac_ciphertext);
+
+ c->marks.tag = 1;
+ }
+
+ if (!check)
+ {
+ if (outbuflen > c->spec->blocksize)
+ outbuflen = c->spec->blocksize;
+
+      /* NB: We already checked that OUTBUF is large enough to hold
+       * the result or has a valid truncated length.  */
+ memcpy (outbuf, c->u_iv.iv, outbuflen);
+ }
+ else
+ {
+      /* OUTBUFLEN gives the length of the user-supplied tag in OUTBUF,
+       * so we need to check its length first.  */
+ if (!(outbuflen <= c->spec->blocksize)
+ || !buf_eq_const (outbuf, c->u_iv.iv, outbuflen))
+ return GPG_ERR_CHECKSUM;
+ }
+
+ return 0;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_eax_get_tag (gcry_cipher_hd_t c, unsigned char *outtag,
+ size_t taglen)
+{
+ return _gcry_cipher_eax_tag (c, outtag, taglen, 0);
+}
+
+gcry_err_code_t
+_gcry_cipher_eax_check_tag (gcry_cipher_hd_t c, const unsigned char *intag,
+ size_t taglen)
+{
+ return _gcry_cipher_eax_tag (c, (unsigned char *) intag, taglen, 1);
+}
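+
+/* Taken together, the routines above implement EAX as three CMAC (OMAC)
+ * computations with distinct one-block tweaks: the nonce is MACed under
+ * tweak 0 (and doubles as the initial CTR value), the header under tweak 1,
+ * and the ciphertext under tweak 2; the tag is the XOR of the three
+ * results, which is what _gcry_cipher_eax_tag computes.  */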
diff --git a/comm/third_party/libgcrypt/cipher/cipher-gcm-armv7-neon.S b/comm/third_party/libgcrypt/cipher/cipher-gcm-armv7-neon.S
new file mode 100644
index 0000000000..a801a5e57b
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-gcm-armv7-neon.S
@@ -0,0 +1,341 @@
+/* cipher-gcm-armv7-neon.S - ARM/NEON accelerated GHASH
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_NEON)
+
+.syntax unified
+.fpu neon
+.arm
+
+.text
+
+#ifdef __PIC__
+# define GET_DATA_POINTER(reg, name, rtmp) \
+ ldr reg, 1f; \
+ ldr rtmp, 2f; \
+ b 3f; \
+ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+ 2: .word name(GOT); \
+ 3: add reg, pc, reg; \
+ ldr reg, [reg, rtmp];
+#else
+# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+
+/* Constants */
+
+.align 4
+gcry_gcm_reduction_constant:
+.Lrconst64:
+ .quad 0xc200000000000000
+
+/* Register macros */
+
+#define rhash q0
+#define rhash_l d0
+#define rhash_h d1
+
+#define rh1 q1
+#define rh1_l d2
+#define rh1_h d3
+
+#define rbuf q2
+#define rbuf_l d4
+#define rbuf_h d5
+
+#define rbuf1 q3
+#define rbuf1_l d6
+#define rbuf1_h d7
+
+#define t0q q4
+#define t0l d8
+#define t0h d9
+
+#define t1q q5
+#define t1l d10
+#define t1h d11
+
+#define t2q q6
+#define t2l d12
+#define t2h d13
+
+#define t3q q7
+#define t3l d14
+#define t3h d15
+
+/* q8 */
+#define k16 d16
+#define k32 d17
+
+/* q9 */
+#define k48 d18
+
+#define k0 q10
+
+#define rr0 q11
+#define rr0_l d22
+#define rr0_h d23
+
+#define rr1 q12
+#define rr1_l d24
+#define rr1_h d25
+
+#define rt0 q13
+#define rt0_l d26
+#define rt0_h d27
+
+#define rt1 q14
+#define rt1_l d28
+#define rt1_h d29
+
+#define rrconst q15
+#define rrconst_l d30
+#define rrconst_h d31
+
+/* Macro for 64x64=>128 carry-less multiplication using vmull.p8 instruction.
+ *
+ * From "Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R. Fast Software
+ * Polynomial Multiplication on ARM Processors using the NEON Engine. The
+ * Second International Workshop on Modern Cryptography and Security
+ * Engineering — MoCrySEn, 2013". */
+
+#define vmull_p64(rq, rl, rh, ad, bd) \
+ vext.8 t0l, ad, ad, $1; \
+ vmull.p8 t0q, t0l, bd; \
+ vext.8 rl, bd, bd, $1; \
+ vmull.p8 rq, ad, rl; \
+ vext.8 t1l, ad, ad, $2; \
+ vmull.p8 t1q, t1l, bd; \
+ vext.8 t3l, bd, bd, $2; \
+ vmull.p8 t3q, ad, t3l; \
+ vext.8 t2l, ad, ad, $3; \
+ vmull.p8 t2q, t2l, bd; \
+ veor t0q, t0q, rq; \
+ vext.8 rl, bd, bd, $3; \
+ vmull.p8 rq, ad, rl; \
+ veor t1q, t1q, t3q; \
+ vext.8 t3l, bd, bd, $4; \
+ vmull.p8 t3q, ad, t3l; \
+ veor t0l, t0l, t0h; \
+ vand t0h, t0h, k48; \
+ veor t1l, t1l, t1h; \
+ vand t1h, t1h, k32; \
+ veor t2q, t2q, rq; \
+ veor t0l, t0l, t0h; \
+ veor t1l, t1l, t1h; \
+ veor t2l, t2l, t2h; \
+ vand t2h, t2h, k16; \
+ veor t3l, t3l, t3h; \
+ vmov.i64 t3h, $0; \
+ vext.8 t0q, t0q, t0q, $15; \
+ veor t2l, t2l, t2h; \
+ vext.8 t1q, t1q, t1q, $14; \
+ vmull.p8 rq, ad, bd; \
+ vext.8 t2q, t2q, t2q, $13; \
+ vext.8 t3q, t3q, t3q, $12; \
+ veor t0q, t0q, t1q; \
+ veor t2q, t2q, t3q; \
+ veor rq, rq, t0q; \
+ veor rq, rq, t2q;
+
+/* GHASH macros.
+ *
+ * See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
+ * Cryptology — CT-RSA 2015" for details.
+ */
+
+/* Input: 'a' and 'b', Output: 'r0:r1' (low 128-bits in r0, high in r1)
+ * Note: 'r1' may be 'a' or 'b', 'r0' must not be either 'a' or 'b'.
+ */
+#define PMUL_128x128(r0, r1, a, b, t1, t2, interleave_op) \
+ veor t1##_h, b##_l, b##_h; \
+ veor t1##_l, a##_l, a##_h; \
+ vmull_p64( r0, r0##_l, r0##_h, a##_l, b##_l ); \
+ vmull_p64( r1, r1##_l, r1##_h, a##_h, b##_h ); \
+ vmull_p64( t2, t2##_h, t2##_l, t1##_h, t1##_l ); \
+ interleave_op; \
+ veor t2, r0; \
+ veor t2, r1; \
+ veor r0##_h, t2##_l; \
+ veor r1##_l, t2##_h;
+
+/* Reduction using Xor and Shift.
+ * Input: 'r0:r1', Output: 'a'
+ *
+ * See "Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication
+ * Instruction and its Usage for Computing the GCM Mode" for details.
+ */
+#define REDUCTION(a, r0, r1, t, interleave_op) \
+ vshl.u32 t0q, r0, #31; \
+ vshl.u32 t1q, r0, #30; \
+ vshl.u32 t2q, r0, #25; \
+ veor t0q, t0q, t1q; \
+ veor t0q, t0q, t2q; \
+ vext.8 t, t0q, k0, #4; \
+ vext.8 t0q, k0, t0q, #(16-12); \
+ veor r0, r0, t0q; \
+ interleave_op; \
+ vshr.u32 t0q, r0, #1; \
+ vshr.u32 t1q, r0, #2; \
+ vshr.u32 t2q, r0, #7; \
+ veor t0q, t0q, t1q; \
+ veor t0q, t0q, t2q; \
+ veor t0q, t0q, t; \
+ veor r0, r0, t0q; \
+ veor a, r0, r1;
+
+#define _(...) __VA_ARGS__
+#define __ _()
+
+/* Other functional macros */
+
+#define CLEAR_REG(reg) veor reg, reg;
+
+
+/*
+ * unsigned int _gcry_ghash_armv7_neon (void *gcm_key, byte *result,
+ * const byte *buf, size_t nblocks);
+ */
+.align 3
+.globl _gcry_ghash_armv7_neon
+.type _gcry_ghash_armv7_neon,%function;
+_gcry_ghash_armv7_neon:
+ /* input:
+ * r0: gcm_key
+ * r1: result/hash
+ * r2: buf
+ * r3: nblocks
+ */
+ push {r4-r6, lr}
+
+ cmp r3, #0
+ beq .Ldo_nothing
+
+ vpush {q4-q7}
+
+ vld1.64 {rhash}, [r1]
+ vld1.64 {rh1}, [r0]
+
+ vrev64.8 rhash, rhash /* byte-swap */
+
+ vmov.i64 k0, #0x0
+ vmov.i64 k16, #0xffff
+ vmov.i64 k32, #0xffffffff
+ vmov.i64 k48, #0xffffffffffff
+
+ vext.8 rhash, rhash, rhash, #8
+
+ /* Handle remaining blocks. */
+
+ vld1.64 {rbuf}, [r2]!
+ subs r3, r3, #1
+
+ vrev64.8 rbuf, rbuf /* byte-swap */
+ vext.8 rbuf, rbuf, rbuf, #8
+
+ veor rhash, rhash, rbuf
+
+ beq .Lend
+
+.Loop:
+ vld1.64 {rbuf}, [r2]!
+ PMUL_128x128(rr0, rr1, rhash, rh1, rt0, rt1, _(vrev64.8 rbuf, rbuf))
+ REDUCTION(rhash, rr0, rr1, rt0, _(vext.8 rbuf, rbuf, rbuf, #8))
+ subs r3, r3, #1
+ veor rhash, rhash, rbuf
+
+ bne .Loop
+
+.Lend:
+ PMUL_128x128(rr0, rr1, rhash, rh1, rt0, rt1, _(CLEAR_REG(rbuf)))
+ REDUCTION(rhash, rr0, rr1, rt0, _(CLEAR_REG(rh1)))
+
+.Ldone:
+ CLEAR_REG(rr1)
+ vrev64.8 rhash, rhash /* byte-swap */
+ CLEAR_REG(rt0)
+ CLEAR_REG(rr0)
+ vext.8 rhash, rhash, rhash, #8
+ CLEAR_REG(rt1)
+ CLEAR_REG(t0q)
+ CLEAR_REG(t1q)
+ CLEAR_REG(t2q)
+ CLEAR_REG(t3q)
+ vst1.64 {rhash}, [r1]
+ CLEAR_REG(rhash)
+
+ vpop {q4-q7}
+
+.Ldo_nothing:
+ mov r0, #0
+ pop {r4-r6, pc}
+.size _gcry_ghash_armv7_neon,.-_gcry_ghash_armv7_neon;
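+
+/* At C level the loop above computes, for each 16-byte block M_i,
+ *   X <- (X xor M_i) * H   in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1,
+ * with vmull.p8 used to synthesize the 64x64-bit carry-less multiplications
+ * that plain NEON lacks.  */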
+
+
+/*
+ * void _gcry_ghash_setup_armv7_neon (void *gcm_key);
+ */
+.align 3
+.globl _gcry_ghash_setup_armv7_neon
+.type _gcry_ghash_setup_armv7_neon,%function;
+_gcry_ghash_setup_armv7_neon:
+ /* input:
+ * r0: gcm_key
+ */
+
+ vpush {q4-q7}
+
+ GET_DATA_POINTER(r2, .Lrconst64, r3)
+
+ vld1.64 {rrconst_h}, [r2]
+
+#define GCM_LSH_1(r_out, ia, ib, const_d, oa, ob, ma) \
+ /* H <<< 1 */ \
+ vshr.s64 ma, ib, #63; \
+ vshr.u64 oa, ib, #63; \
+ vshr.u64 ob, ia, #63; \
+ vand ma, const_d; \
+ vshl.u64 ib, ib, #1; \
+ vshl.u64 ia, ia, #1; \
+ vorr ob, ib; \
+ vorr oa, ia; \
+ veor ob, ma; \
+ vst1.64 {oa, ob}, [r_out]
+
+ vld1.64 {rhash}, [r0]
+ vrev64.8 rhash, rhash /* byte-swap */
+ vext.8 rhash, rhash, rhash, #8
+
+ vmov rbuf1, rhash
+ GCM_LSH_1(r0, rhash_l, rhash_h, rrconst_h, rh1_l, rh1_h, rt1_l) /* H<<<1 */
+
+ CLEAR_REG(rh1)
+ CLEAR_REG(rhash)
+ CLEAR_REG(rbuf1)
+ CLEAR_REG(rrconst)
+ vpop {q4-q7}
+ bx lr
+.size _gcry_ghash_setup_armv7_neon,.-_gcry_ghash_setup_armv7_neon;
+
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/cipher-gcm-armv8-aarch32-ce.S b/comm/third_party/libgcrypt/cipher/cipher-gcm-armv8-aarch32-ce.S
new file mode 100644
index 0000000000..1de66a1626
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-gcm-armv8-aarch32-ce.S
@@ -0,0 +1,433 @@
+/* cipher-gcm-armv8-aarch32-ce.S - ARM/CE accelerated GHASH
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO)
+
+.syntax unified
+.arch armv8-a
+.fpu crypto-neon-fp-armv8
+.arm
+
+.text
+
+#ifdef __PIC__
+# define GET_DATA_POINTER(reg, name, rtmp) \
+ ldr reg, 1f; \
+ ldr rtmp, 2f; \
+ b 3f; \
+ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+ 2: .word name(GOT); \
+ 3: add reg, pc, reg; \
+ ldr reg, [reg, rtmp];
+#else
+# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+
+/* Constants */
+
+.align 4
+gcry_gcm_reduction_constant:
+.Lrconst64:
+ .quad 0xc200000000000000
+
+
+/* Register macros */
+
+#define rhash q0
+#define rhash_l d0
+#define rhash_h d1
+
+#define rh1 q1
+#define rh1_l d2
+#define rh1_h d3
+
+#define rbuf q2
+#define rbuf_l d4
+#define rbuf_h d5
+
+#define rbuf1 q3
+#define rbuf1_l d6
+#define rbuf1_h d7
+
+#define rbuf2 q4
+#define rbuf2_l d8
+#define rbuf2_h d9
+
+#define rbuf3 q5
+#define rbuf3_l d10
+#define rbuf3_h d11
+
+#define rh2 q6
+#define rh2_l d12
+#define rh2_h d13
+
+#define rh3 q7
+#define rh3_l d14
+#define rh3_h d15
+
+#define rh4 q8
+#define rh4_l d16
+#define rh4_h d17
+
+#define rr2 q9
+#define rr2_l d18
+#define rr2_h d19
+
+#define rr3 q10
+#define rr3_l d20
+#define rr3_h d21
+
+#define rr0 q11
+#define rr0_l d22
+#define rr0_h d23
+
+#define rr1 q12
+#define rr1_l d24
+#define rr1_h d25
+
+#define rt0 q13
+#define rt0_l d26
+#define rt0_h d27
+
+#define rt1 q14
+#define rt1_l d28
+#define rt1_h d29
+
+#define rrconst q15
+#define rrconst_l d30
+#define rrconst_h d31
+
+/* GHASH macros */
+
+/* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
+ * Cryptology — CT-RSA 2015" for details.
+ */
+
+/* Input: 'a' and 'b', Output: 'r0:r1' (low 128-bits in r0, high in r1)
+ * Note: 'r1' may be 'a' or 'b', 'r0' must not be either 'a' or 'b'.
+ */
+#define PMUL_128x128(r0, r1, a, b, t, interleave_op) \
+ veor t##_h, b##_l, b##_h; \
+ veor t##_l, a##_l, a##_h; \
+ vmull.p64 r0, a##_l, b##_l; \
+ vmull.p64 r1, a##_h, b##_h; \
+ vmull.p64 t, t##_h, t##_l; \
+ interleave_op; \
+ veor t, r0; \
+ veor t, r1; \
+ veor r0##_h, t##_l; \
+ veor r1##_l, t##_h;
+
+/* Input: 'aA' and 'bA', Output: 'r0A:r1A' (low 128-bits in r0A, high in r1A)
+ * Note: 'r1A' may be 'aA' or 'bA', 'r0A' must not be either 'aA' or 'bA'.
+ * Input: 'aB' and 'bB', Output: 'r0B:r1B' (low 128-bits in r0B, high in r1B)
+ * Note: 'r1B' may be 'aB' or 'bB', 'r0B' must not be either 'aB' or 'bB'.
+ */
+#define PMUL_128x128_2(r0A, r1A, aA, bA, r0B, r1B, aB, bB, tA, tB, interleave_op) \
+ veor tA##_h, bA##_l, bA##_h; \
+ veor tA##_l, aA##_l, aA##_h; \
+ veor tB##_h, bB##_l, bB##_h; \
+ veor tB##_l, aB##_l, aB##_h; \
+ vmull.p64 r0A, aA##_l, bA##_l; \
+ vmull.p64 r1A, aA##_h, bA##_h; \
+ vmull.p64 tA, tA##_h, tA##_l; \
+ vmull.p64 r0B, aB##_l, bB##_l; \
+ vmull.p64 r1B, aB##_h, bB##_h; \
+ vmull.p64 tB, tB##_h, tB##_l; \
+ interleave_op; \
+ veor tA, r0A; \
+ veor tA, r1A; \
+ veor tB, r0B; \
+ veor tB, r1B; \
+ veor r0A##_h, tA##_l; \
+ veor r1A##_l, tA##_h; \
+ veor r0B##_h, tB##_l; \
+ veor r1B##_l, tB##_h; \
+
+/* Input: 'r0:r1', Output: 'a' */
+#define REDUCTION(a, r0, r1, rconst, t, interleave_op) \
+ vmull.p64 t, r0##_l, rconst; \
+ veor r0##_h, t##_l; \
+ veor r1##_l, t##_h; \
+ interleave_op; \
+ vmull.p64 t, r0##_h, rconst; \
+ veor r1, t; \
+ veor a, r0, r1;
+
+#define _(...) __VA_ARGS__
+#define __ _()
+
+/* Other functional macros */
+
+#define CLEAR_REG(reg) veor reg, reg;
+
+
+/*
+ * unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result,
+ * const byte *buf, size_t nblocks,
+ * void *gcm_table);
+ */
+.align 3
+.globl _gcry_ghash_armv8_ce_pmull
+.type _gcry_ghash_armv8_ce_pmull,%function;
+_gcry_ghash_armv8_ce_pmull:
+ /* input:
+ * r0: gcm_key
+ * r1: result/hash
+ * r2: buf
+ * r3: nblocks
+ * %st+0: gcm_table
+ */
+ push {r4-r6, lr}
+
+ cmp r3, #0
+ beq .Ldo_nothing
+
+ GET_DATA_POINTER(r4, .Lrconst64, lr)
+
+ vld1.64 {rhash}, [r1]
+ vld1.64 {rh1}, [r0]
+
+ vrev64.8 rhash, rhash /* byte-swap */
+ vld1.64 {rrconst_h}, [r4]
+ vext.8 rhash, rhash, rhash, #8
+
+ cmp r3, #4
+ blo .Less_than_4
+
+ /* Bulk processing of 4 blocks per loop iteration. */
+
+ ldr r5, [sp, #(4*4)];
+ add r6, r5, #32
+
+ vpush {q4-q7}
+
+ vld1.64 {rh2-rh3}, [r5]
+ vld1.64 {rh4}, [r6]
+
+ vld1.64 {rbuf-rbuf1}, [r2]!
+ sub r3, r3, #4
+ vld1.64 {rbuf2-rbuf3}, [r2]!
+
+ cmp r3, #4
+ vrev64.8 rbuf, rbuf /* byte-swap */
+ vrev64.8 rbuf1, rbuf1 /* byte-swap */
+ vrev64.8 rbuf2, rbuf2 /* byte-swap */
+ vrev64.8 rbuf3, rbuf3 /* byte-swap */
+
+ vext.8 rbuf, rbuf, rbuf, #8
+ vext.8 rbuf1, rbuf1, rbuf1, #8
+ vext.8 rbuf2, rbuf2, rbuf2, #8
+ vext.8 rbuf3, rbuf3, rbuf3, #8
+ veor rhash, rhash, rbuf /* in0 ^ hash */
+
+ blo .Lend_4
+
+.Loop_4:
+ /* (in0 ^ hash) * H⁴ => rr2:rr3 */
+ /* (in1) * H³ => rr0:rr1 */
+ PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __)
+
+ vld1.64 {rbuf-rbuf1}, [r2]!
+ sub r3, r3, #4
+ veor rr0, rr0, rr2
+ veor rr1, rr1, rr3
+
+ /* (in2) * H² => rr2:rr3 */
+ /* (in3) * H¹ => rhash:rbuf3 */
+ PMUL_128x128_2(rr2, rr3, rbuf2, rh2, rhash, rbuf3, rbuf3, rh1, rt0, rt1,
+ _(vrev64.8 rbuf, rbuf))
+
+ vld1.64 {rbuf2}, [r2]!
+
+ vrev64.8 rbuf1, rbuf1
+ veor rr0, rr0, rr2
+ veor rr1, rr1, rr3
+
+ cmp r3, #4
+ vext.8 rbuf, rbuf, rbuf, #8
+ vext.8 rbuf1, rbuf1, rbuf1, #8
+
+ veor rr0, rr0, rhash
+ veor rr1, rr1, rbuf3
+
+ vld1.64 {rbuf3}, [r2]!
+
+ REDUCTION(rhash, rr0, rr1, rrconst_h, rt1,
+ _(vrev64.8 rbuf2, rbuf2;
+ vrev64.8 rbuf3, rbuf3))
+
+ vext.8 rbuf2, rbuf2, rbuf2, #8
+ vext.8 rbuf3, rbuf3, rbuf3, #8
+ veor rhash, rhash, rbuf /* in0 ^ hash */
+
+ bhs .Loop_4
+
+.Lend_4:
+ /* (in0 ^ hash) * H⁴ => rr2:rr3 */
+ /* (in1) * H³ => rr0:rr1 */
+ PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __)
+
+ /* (in2) * H² => rhash:rbuf */
+ /* (in3) * H¹ => rbuf1:rbuf2 */
+ PMUL_128x128_2(rhash, rbuf, rbuf2, rh2, rbuf1, rbuf2, rbuf3, rh1, rt0, rt1,
+ _(veor rr0, rr0, rr2;
+ veor rr1, rr1, rr3))
+
+ veor rr0, rr0, rhash
+ veor rr1, rr1, rbuf
+
+ veor rr0, rr0, rbuf1
+ veor rr1, rr1, rbuf2
+
+ REDUCTION(rhash, rr0, rr1, rrconst_h, rt1,
+ _(CLEAR_REG(rr2);
+ CLEAR_REG(rr3);
+ CLEAR_REG(rbuf1);
+ CLEAR_REG(rbuf2);
+ CLEAR_REG(rbuf3);
+ CLEAR_REG(rh2);
+ CLEAR_REG(rh3);
+ CLEAR_REG(rh4)))
+
+ vpop {q4-q7}
+
+ cmp r3, #0
+ beq .Ldone
+
+.Less_than_4:
+ /* Handle remaining blocks. */
+
+ vld1.64 {rbuf}, [r2]!
+ subs r3, r3, #1
+
+ vrev64.8 rbuf, rbuf /* byte-swap */
+ vext.8 rbuf, rbuf, rbuf, #8
+
+ veor rhash, rhash, rbuf
+
+ beq .Lend
+
+.Loop:
+ vld1.64 {rbuf}, [r2]!
+ subs r3, r3, #1
+ PMUL_128x128(rr0, rr1, rhash, rh1, rt0, _(vrev64.8 rbuf, rbuf))
+ REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _(vext.8 rbuf, rbuf, rbuf, #8))
+ veor rhash, rhash, rbuf
+
+ bne .Loop
+
+.Lend:
+ PMUL_128x128(rr0, rr1, rhash, rh1, rt0, _(CLEAR_REG(rbuf)))
+ REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _(CLEAR_REG(rh1)))
+
+.Ldone:
+ CLEAR_REG(rr1)
+ vrev64.8 rhash, rhash /* byte-swap */
+ CLEAR_REG(rt0)
+ CLEAR_REG(rr0)
+ vext.8 rhash, rhash, rhash, #8
+ CLEAR_REG(rt1)
+ vst1.64 {rhash}, [r1]
+ CLEAR_REG(rhash)
+
+.Ldo_nothing:
+ mov r0, #0
+ pop {r4-r6, pc}
+.size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull;
+
+
+/*
+ * void _gcry_ghash_setup_armv8_ce_pmull (void *gcm_key, void *gcm_table);
+ */
+.align 3
+.globl _gcry_ghash_setup_armv8_ce_pmull
+.type _gcry_ghash_setup_armv8_ce_pmull,%function;
+_gcry_ghash_setup_armv8_ce_pmull:
+ /* input:
+ * r0: gcm_key
+ * r1: gcm_table
+ */
+
+ vpush {q4-q7}
+
+ GET_DATA_POINTER(r2, .Lrconst64, r3)
+
+ vld1.64 {rrconst_h}, [r2]
+
+#define GCM_LSH_1(r_out, ia, ib, const_d, oa, ob, ma) \
+ /* H <<< 1 */ \
+ vshr.s64 ma, ib, #63; \
+ vshr.u64 oa, ib, #63; \
+ vshr.u64 ob, ia, #63; \
+ vand ma, const_d; \
+ vshl.u64 ib, ib, #1; \
+ vshl.u64 ia, ia, #1; \
+ vorr ob, ib; \
+ vorr oa, ia; \
+ veor ob, ma; \
+ vst1.64 {oa, ob}, [r_out]
+
+ vld1.64 {rhash}, [r0]
+ vrev64.8 rhash, rhash /* byte-swap */
+ vext.8 rhash, rhash, rhash, #8
+
+ vmov rbuf1, rhash
+ GCM_LSH_1(r0, rhash_l, rhash_h, rrconst_h, rh1_l, rh1_h, rt1_l) /* H<<<1 */
+
+ /* H² */
+ PMUL_128x128(rr0, rr1, rbuf1, rh1, rt0, __)
+ REDUCTION(rh2, rr0, rr1, rrconst_h, rt0, __)
+ vmov rhash, rh2
+ GCM_LSH_1(r1, rh2_l, rh2_h, rrconst_h, rbuf1_l, rbuf1_h, rt1_l) /* H²<<<1 */
+ add r1, r1, #16
+
+ /* H³ */
+ PMUL_128x128(rr0, rr1, rhash, rh1, rt1, __)
+ REDUCTION(rh3, rr0, rr1, rrconst_h, rt1, __)
+
+ /* H⁴ */
+ PMUL_128x128(rr0, rr1, rhash, rbuf1, rt0, __)
+ REDUCTION(rh4, rr0, rr1, rrconst_h, rt0, __)
+
+ GCM_LSH_1(r1, rh3_l, rh3_h, rrconst_h, rt0_l, rt0_h, rt1_l) /* H³<<<1 */
+ add r1, r1, #16
+ GCM_LSH_1(r1, rh4_l, rh4_h, rrconst_h, rt0_l, rt0_h, rt1_l) /* H⁴<<<1 */
+
+ CLEAR_REG(rt0)
+ CLEAR_REG(rt1)
+ CLEAR_REG(rr1)
+ CLEAR_REG(rr0)
+ CLEAR_REG(rh1)
+ CLEAR_REG(rh2)
+ CLEAR_REG(rh3)
+ CLEAR_REG(rh4)
+ CLEAR_REG(rhash)
+ CLEAR_REG(rbuf1)
+ CLEAR_REG(rrconst)
+ vpop {q4-q7}
+ bx lr
+.size _gcry_ghash_setup_armv8_ce_pmull,.-_gcry_ghash_setup_armv8_ce_pmull;
+
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/cipher-gcm-armv8-aarch64-ce.S b/comm/third_party/libgcrypt/cipher/cipher-gcm-armv8-aarch64-ce.S
new file mode 100644
index 0000000000..877207d3e5
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-gcm-armv8-aarch64-ce.S
@@ -0,0 +1,424 @@
+/* cipher-gcm-armv8-aarch64-ce.S - ARM/CE accelerated GHASH
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+
+.cpu generic+simd+crypto
+
+.text
+
+
+/* Constants */
+
+.align 4
+gcry_gcm_reduction_constant:
+.Lrconst:
+ .quad 0x87
+
+
+/* Register macros */
+
+#define rhash v0
+#define rr0 v1
+#define rr1 v2
+#define rbuf v3
+#define rbuf1 v4
+#define rbuf2 v5
+#define rbuf3 v6
+#define rbuf4 v7
+#define rbuf5 v8
+#define rr2 v9
+#define rr3 v10
+#define rr4 v11
+#define rr5 v12
+#define rr6 v13
+#define rr7 v14
+#define rr8 v15
+#define rr9 v16
+
+#define rrconst v18
+#define rh1 v19
+#define rh2 v20
+#define rh3 v21
+#define rh4 v22
+#define rh5 v23
+#define rh6 v24
+#define t0 v25
+#define t1 v26
+#define t2 v27
+#define t3 v28
+#define t4 v29
+#define t5 v30
+#define vZZ v31
+
+/* GHASH macros */
+
+/* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
+ * Cryptology — CT-RSA 2015" for details.
+ */
+
+/* Input: 'a' and 'b', Output: 'r0:r1' (low 128-bits in r0, high in r1) */
+#define PMUL_128x128(r0, r1, a, b, T0, T1, interleave_op) \
+ ext T0.16b, b.16b, b.16b, #8; \
+ pmull r0.1q, a.1d, b.1d; \
+ pmull2 r1.1q, a.2d, b.2d; \
+ pmull T1.1q, a.1d, T0.1d; \
+ pmull2 T0.1q, a.2d, T0.2d; \
+ interleave_op; \
+ eor T0.16b, T0.16b, T1.16b; \
+ ext T1.16b, vZZ.16b, T0.16b, #8; \
+ ext T0.16b, T0.16b, vZZ.16b, #8; \
+ eor r0.16b, r0.16b, T1.16b; \
+ eor r1.16b, r1.16b, T0.16b;
+
+/* Input: 'aA' and 'bA', Output: 'r0A:r1A' (low 128-bits in r0A, high in r1A)
+ * Input: 'aB' and 'bB', Output: 'r0B:r1B' (low 128-bits in r0B, high in r1B)
+ * Input: 'aC' and 'bC', Output: 'r0C:r1C' (low 128-bits in r0C, high in r1C)
+ */
+#define PMUL_128x128_3(r0A, r1A, aA, bA, t0A, t1A, \
+ r0B, r1B, aB, bB, t0B, t1B, \
+ r0C, r1C, aC, bC, t0C, t1C, interleave_op) \
+ ext t0A.16b, bA.16b, bA.16b, #8; \
+ pmull r0A.1q, aA.1d, bA.1d; \
+ pmull2 r1A.1q, aA.2d, bA.2d; \
+ ext t0B.16b, bB.16b, bB.16b, #8; \
+ pmull r0B.1q, aB.1d, bB.1d; \
+ pmull2 r1B.1q, aB.2d, bB.2d; \
+ ext t0C.16b, bC.16b, bC.16b, #8; \
+ pmull r0C.1q, aC.1d, bC.1d; \
+ pmull2 r1C.1q, aC.2d, bC.2d; \
+ pmull t1A.1q, aA.1d, t0A.1d; \
+ pmull2 t0A.1q, aA.2d, t0A.2d; \
+ pmull t1B.1q, aB.1d, t0B.1d; \
+ pmull2 t0B.1q, aB.2d, t0B.2d; \
+ pmull t1C.1q, aC.1d, t0C.1d; \
+ pmull2 t0C.1q, aC.2d, t0C.2d; \
+ eor t0A.16b, t0A.16b, t1A.16b; \
+ eor t0B.16b, t0B.16b, t1B.16b; \
+ eor t0C.16b, t0C.16b, t1C.16b; \
+ interleave_op; \
+ ext t1A.16b, vZZ.16b, t0A.16b, #8; \
+ ext t0A.16b, t0A.16b, vZZ.16b, #8; \
+ ext t1B.16b, vZZ.16b, t0B.16b, #8; \
+ ext t0B.16b, t0B.16b, vZZ.16b, #8; \
+ ext t1C.16b, vZZ.16b, t0C.16b, #8; \
+ ext t0C.16b, t0C.16b, vZZ.16b, #8; \
+ eor r0A.16b, r0A.16b, t1A.16b; \
+ eor r1A.16b, r1A.16b, t0A.16b; \
+ eor r0B.16b, r0B.16b, t1B.16b; \
+ eor r1B.16b, r1B.16b, t0B.16b; \
+ eor r0C.16b, r0C.16b, t1C.16b; \
+ eor r1C.16b, r1C.16b, t0C.16b; \
+
+/* Input: 'r0:r1', Output: 'a' */
+#define REDUCTION(a, r0, r1, rconst, T0, T1, interleave_op1, interleave_op2, \
+ interleave_op3) \
+ pmull2 T0.1q, r1.2d, rconst.2d; \
+ interleave_op1; \
+ ext T1.16b, T0.16b, vZZ.16b, #8; \
+ ext T0.16b, vZZ.16b, T0.16b, #8; \
+ interleave_op2; \
+ eor r1.16b, r1.16b, T1.16b; \
+ eor r0.16b, r0.16b, T0.16b; \
+ pmull T0.1q, r1.1d, rconst.1d; \
+ interleave_op3; \
+ eor a.16b, r0.16b, T0.16b;
+
+/* Other functional macros */
+
+#define _(...) __VA_ARGS__
+#define __ _()
+
+#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b;
+
+#define VPUSH_ABI \
+ stp d8, d9, [sp, #-16]!; \
+ CFI_ADJUST_CFA_OFFSET(16); \
+ stp d10, d11, [sp, #-16]!; \
+ CFI_ADJUST_CFA_OFFSET(16); \
+ stp d12, d13, [sp, #-16]!; \
+ CFI_ADJUST_CFA_OFFSET(16); \
+ stp d14, d15, [sp, #-16]!; \
+ CFI_ADJUST_CFA_OFFSET(16);
+
+#define VPOP_ABI \
+ ldp d14, d15, [sp], #16; \
+ CFI_ADJUST_CFA_OFFSET(-16); \
+ ldp d12, d13, [sp], #16; \
+ CFI_ADJUST_CFA_OFFSET(-16); \
+ ldp d10, d11, [sp], #16; \
+ CFI_ADJUST_CFA_OFFSET(-16); \
+ ldp d8, d9, [sp], #16; \
+ CFI_ADJUST_CFA_OFFSET(-16);
+
+/*
+ * unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result,
+ * const byte *buf, size_t nblocks,
+ * void *gcm_table);
+ */
+.align 3
+.globl _gcry_ghash_armv8_ce_pmull
+ELF(.type _gcry_ghash_armv8_ce_pmull,%function;)
+_gcry_ghash_armv8_ce_pmull:
+ /* input:
+ * x0: gcm_key
+ * x1: result/hash
+ * x2: buf
+ * x3: nblocks
+ * x4: gcm_table
+ */
+ CFI_STARTPROC();
+
+ cbz x3, .Ldo_nothing;
+
+ GET_DATA_POINTER(x5, .Lrconst)
+
+ eor vZZ.16b, vZZ.16b, vZZ.16b
+ ld1 {rhash.16b}, [x1]
+ ld1 {rh1.16b}, [x0]
+
+ rbit rhash.16b, rhash.16b /* bit-swap */
+ ld1r {rrconst.2d}, [x5]
+
+ cmp x3, #6
+ b.lo .Less_than_6
+
+ add x6, x4, #64
+ VPUSH_ABI
+
+ ld1 {rh2.16b-rh5.16b}, [x4]
+ ld1 {rh6.16b}, [x6]
+
+ sub x3, x3, #6
+
+ ld1 {rbuf.16b-rbuf2.16b}, [x2], #(3*16)
+ ld1 {rbuf3.16b-rbuf5.16b}, [x2], #(3*16)
+
+ rbit rbuf.16b, rbuf.16b /* bit-swap */
+ rbit rbuf1.16b, rbuf1.16b /* bit-swap */
+ rbit rbuf2.16b, rbuf2.16b /* bit-swap */
+ rbit rbuf3.16b, rbuf3.16b /* bit-swap */
+ rbit rbuf4.16b, rbuf4.16b /* bit-swap */
+ rbit rbuf5.16b, rbuf5.16b /* bit-swap */
+ eor rhash.16b, rhash.16b, rbuf.16b
+
+ cmp x3, #6
+ b.lo .Lend_6
+
+.Loop_6:
+
+ /* (in1) * H⁵ => rr0:rr1 */
+ /* (in2) * H⁴ => rr2:rr3 */
+ /* (in0 ^ hash) * H⁶ => rr4:rr5 */
+ PMUL_128x128_3(rr0, rr1, rbuf1, rh5, t0, t1,
+ rr2, rr3, rbuf2, rh4, t2, t3,
+ rr4, rr5, rhash, rh6, t4, t5,
+ _(sub x3, x3, #6))
+
+ ld1 {rbuf.16b-rbuf2.16b}, [x2], #(3*16)
+ cmp x3, #6
+
+ eor rr0.16b, rr0.16b, rr2.16b
+ eor rr1.16b, rr1.16b, rr3.16b
+
+ /* (in3) * H³ => rr2:rr3 */
+ /* (in4) * H² => rr6:rr7 */
+ /* (in5) * H¹ => rr8:rr9 */
+ PMUL_128x128_3(rr2, rr3, rbuf3, rh3, t0, t1,
+ rr6, rr7, rbuf4, rh2, t2, t3,
+ rr8, rr9, rbuf5, rh1, t4, t5,
+ _(eor rr0.16b, rr0.16b, rr4.16b;
+ eor rr1.16b, rr1.16b, rr5.16b))
+
+ eor rr0.16b, rr0.16b, rr2.16b
+ eor rr1.16b, rr1.16b, rr3.16b
+ rbit rbuf.16b, rbuf.16b
+ eor rr0.16b, rr0.16b, rr6.16b
+ eor rr1.16b, rr1.16b, rr7.16b
+ rbit rbuf1.16b, rbuf1.16b
+ eor rr0.16b, rr0.16b, rr8.16b
+ eor rr1.16b, rr1.16b, rr9.16b
+ ld1 {rbuf3.16b-rbuf5.16b}, [x2], #(3*16)
+
+ REDUCTION(rhash, rr0, rr1, rrconst, t0, t1,
+ _(rbit rbuf2.16b, rbuf2.16b),
+ _(rbit rbuf3.16b, rbuf3.16b),
+ _(rbit rbuf4.16b, rbuf4.16b))
+
+ rbit rbuf5.16b, rbuf5.16b
+ eor rhash.16b, rhash.16b, rbuf.16b
+
+ b.hs .Loop_6
+
+.Lend_6:
+
+ /* (in1) * H⁵ => rr0:rr1 */
+ /* (in0 ^ hash) * H⁶ => rr2:rr3 */
+ /* (in2) * H⁴ => rr4:rr5 */
+ PMUL_128x128_3(rr0, rr1, rbuf1, rh5, t0, t1,
+ rr2, rr3, rhash, rh6, t2, t3,
+ rr4, rr5, rbuf2, rh4, t4, t5,
+ __)
+ eor rr0.16b, rr0.16b, rr2.16b
+ eor rr1.16b, rr1.16b, rr3.16b
+ eor rr0.16b, rr0.16b, rr4.16b
+ eor rr1.16b, rr1.16b, rr5.16b
+
+ /* (in3) * H³ => rhash:rbuf */
+ /* (in4) * H² => rr6:rr7 */
+ /* (in5) * H¹ => rr8:rr9 */
+ PMUL_128x128_3(rhash, rbuf, rbuf3, rh3, t0, t1,
+ rr6, rr7, rbuf4, rh2, t2, t3,
+ rr8, rr9, rbuf5, rh1, t4, t5,
+ _(CLEAR_REG(rh4);
+ CLEAR_REG(rh5);
+ CLEAR_REG(rh6)))
+ eor rr0.16b, rr0.16b, rhash.16b
+ eor rr1.16b, rr1.16b, rbuf.16b
+ eor rr0.16b, rr0.16b, rr6.16b
+ eor rr1.16b, rr1.16b, rr7.16b
+ eor rr0.16b, rr0.16b, rr8.16b
+ eor rr1.16b, rr1.16b, rr9.16b
+
+ REDUCTION(rhash, rr0, rr1, rrconst, t0, t1,
+ _(CLEAR_REG(rh2);
+ CLEAR_REG(rh3);
+ CLEAR_REG(rr2);
+ CLEAR_REG(rbuf2);
+ CLEAR_REG(rbuf3)),
+ _(CLEAR_REG(rr3);
+ CLEAR_REG(rr4);
+ CLEAR_REG(rr5);
+ CLEAR_REG(rr6);
+ CLEAR_REG(rr7)),
+ _(CLEAR_REG(rr8);
+ CLEAR_REG(rr9);
+ CLEAR_REG(rbuf1);
+ CLEAR_REG(rbuf2)))
+
+ CLEAR_REG(rbuf4)
+ CLEAR_REG(rbuf5)
+ CLEAR_REG(t2)
+ CLEAR_REG(t3)
+ CLEAR_REG(t4)
+ CLEAR_REG(t5)
+
+ VPOP_ABI
+
+ cbz x3, .Ldone
+
+.Less_than_6:
+ /* Handle remaining blocks. */
+
+ ld1 {rbuf.16b}, [x2], #16
+ sub x3, x3, #1
+
+ rbit rbuf.16b, rbuf.16b /* bit-swap */
+
+ eor rhash.16b, rhash.16b, rbuf.16b
+
+ cbz x3, .Lend
+
+.Loop:
+ PMUL_128x128(rr0, rr1, rh1, rhash, t0, t1, _(ld1 {rbuf.16b}, [x2], #16))
+ REDUCTION(rhash, rr0, rr1, rrconst, t0, t1,
+ _(sub x3, x3, #1),
+ _(rbit rbuf.16b, rbuf.16b),
+ __)
+ eor rhash.16b, rhash.16b, rbuf.16b
+
+ cbnz x3, .Loop
+
+.Lend:
+ PMUL_128x128(rr0, rr1, rh1, rhash, t0, t1, _(CLEAR_REG(rbuf)))
+ REDUCTION(rhash, rr0, rr1, rrconst, t0, t1, __, _(CLEAR_REG(rh1)), __)
+
+.Ldone:
+ CLEAR_REG(rr1)
+ CLEAR_REG(rr0)
+ rbit rhash.16b, rhash.16b /* bit-swap */
+ CLEAR_REG(t0)
+ CLEAR_REG(t1)
+
+ st1 {rhash.2d}, [x1]
+ CLEAR_REG(rhash)
+
+.Ldo_nothing:
+ mov x0, #0
+ ret
+ CFI_ENDPROC()
+ELF(.size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull;)
+
+
+/*
+ * void _gcry_ghash_setup_armv8_ce_pmull (void *gcm_key, void *gcm_table);
+ */
+.align 3
+.globl _gcry_ghash_setup_armv8_ce_pmull
+ELF(.type _gcry_ghash_setup_armv8_ce_pmull,%function;)
+_gcry_ghash_setup_armv8_ce_pmull:
+ /* input:
+ * x0: gcm_key
+ * x1: gcm_table
+ */
+ CFI_STARTPROC()
+
+ GET_DATA_POINTER(x2, .Lrconst)
+
+ eor vZZ.16b, vZZ.16b, vZZ.16b
+
+ /* H¹ */
+ ld1 {rh1.16b}, [x0]
+ rbit rh1.16b, rh1.16b
+ st1 {rh1.16b}, [x0]
+
+ ld1r {rrconst.2d}, [x2]
+
+ /* H² */
+ PMUL_128x128(rr0, rr1, rh1, rh1, t0, t1, __)
+ REDUCTION(rh2, rr0, rr1, rrconst, t0, t1, __, __, __)
+
+ /* H³ */
+ PMUL_128x128(rr0, rr1, rh2, rh1, t0, t1, __)
+ REDUCTION(rh3, rr0, rr1, rrconst, t0, t1, __, __, __)
+
+ /* H⁴ */
+ PMUL_128x128(rr0, rr1, rh2, rh2, t0, t1, __)
+ REDUCTION(rh4, rr0, rr1, rrconst, t0, t1, __, __, __)
+
+ /* H⁵ */
+ PMUL_128x128(rr0, rr1, rh2, rh3, t0, t1, __)
+ REDUCTION(rh5, rr0, rr1, rrconst, t0, t1, __, __, __)
+
+ /* H⁶ */
+ PMUL_128x128(rr0, rr1, rh3, rh3, t0, t1, __)
+ REDUCTION(rh6, rr0, rr1, rrconst, t0, t1, __, __, __)
+
+ st1 {rh2.16b-rh4.16b}, [x1], #(3*16)
+ st1 {rh5.16b-rh6.16b}, [x1]
+
+ ret
+ CFI_ENDPROC()
+ELF(.size _gcry_ghash_setup_armv8_ce_pmull,.-_gcry_ghash_setup_armv8_ce_pmull;)
+
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/cipher-gcm-intel-pclmul.c b/comm/third_party/libgcrypt/cipher/cipher-gcm-intel-pclmul.c
new file mode 100644
index 0000000000..28165c653f
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-gcm-intel-pclmul.c
@@ -0,0 +1,712 @@
+/* cipher-gcm-intel-pclmul.c - Intel PCLMUL accelerated Galois Counter Mode
+ * implementation
+ * Copyright (C) 2013-2014,2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "./cipher-internal.h"
+
+
+#ifdef GCM_USE_INTEL_PCLMUL
+
+
+#if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
+/* Prevent compiler from issuing SSE instructions between asm blocks. */
+# pragma GCC target("no-sse")
+#endif
+#if __clang__
+# pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
+#endif
+
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE
+
+
+/*
+ Intel PCLMUL ghash based on white paper:
+ "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the
+ GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis.
+ */
+static ASM_FUNC_ATTR_INLINE void reduction(void)
+{
+ /* input: <xmm1:xmm3> */
+
+ asm volatile (/* first phase of the reduction */
+ "movdqa %%xmm3, %%xmm6\n\t"
+ "movdqa %%xmm3, %%xmm5\n\t"
+ "psllq $1, %%xmm6\n\t" /* packed right shifting << 63 */
+ "pxor %%xmm3, %%xmm6\n\t"
+ "psllq $57, %%xmm5\n\t" /* packed right shifting << 57 */
+ "psllq $62, %%xmm6\n\t" /* packed right shifting << 62 */
+ "pxor %%xmm5, %%xmm6\n\t" /* xor the shifted versions */
+ "pshufd $0x6a, %%xmm6, %%xmm5\n\t"
+ "pshufd $0xae, %%xmm6, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm3\n\t" /* first phase of the reduction
+ complete */
+
+ /* second phase of the reduction */
+ "pxor %%xmm3, %%xmm1\n\t" /* xor the shifted versions */
+ "psrlq $1, %%xmm3\n\t" /* packed left shifting >> 1 */
+ "pxor %%xmm3, %%xmm6\n\t"
+ "psrlq $1, %%xmm3\n\t" /* packed left shifting >> 2 */
+ "pxor %%xmm3, %%xmm1\n\t"
+ "psrlq $5, %%xmm3\n\t" /* packed left shifting >> 7 */
+ "pxor %%xmm3, %%xmm6\n\t"
+ "pxor %%xmm6, %%xmm1\n\t" /* the result is in xmm1 */
+ ::: "memory" );
+}
+
+static ASM_FUNC_ATTR_INLINE void gfmul_pclmul(void)
+{
+ /* Input: XMM0 and XMM1, Output: XMM1. Input XMM0 stays unmodified.
+ Input must be converted to little-endian.
+ */
+ asm volatile (/* gfmul, xmm0 has operator a and xmm1 has operator b. */
+ "pshufd $78, %%xmm0, %%xmm2\n\t"
+ "pshufd $78, %%xmm1, %%xmm4\n\t"
+ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 holds a0+a1 */
+ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds b0+b1 */
+
+ "movdqa %%xmm0, %%xmm3\n\t"
+ "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds a0*b0 */
+ "pclmulqdq $17, %%xmm0, %%xmm1\n\t" /* xmm6 holds a1*b1 */
+ "movdqa %%xmm3, %%xmm5\n\t"
+ "pclmulqdq $0, %%xmm2, %%xmm4\n\t" /* xmm4 holds (a0+a1)*(b0+b1) */
+
+ "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
+ "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
+ "movdqa %%xmm4, %%xmm5\n\t"
+ "psrldq $8, %%xmm4\n\t"
+ "pslldq $8, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm1\n\t" /* <xmm1:xmm3> holds the result of the
+ carry-less multiplication of xmm0
+ by xmm1 */
+ ::: "memory" );
+
+ reduction();
+}
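
The inline asm above is the usual three-multiplication Karatsuba split of a 128x128-bit carry-less multiply (a0*b0, a1*b1 and (a0^a1)*(b0^b1), recombined into a 256-bit product). A minimal, unoptimized C sketch of the same split is shown below for reference; u128s, clmul64() and clmul128() are illustrative names invented here and are not part of libgcrypt.

#include <stdint.h>

typedef struct { uint64_t lo, hi; } u128s;

/* Slow bitwise 64x64 -> 128-bit carry-less multiply (reference only). */
static u128s clmul64 (uint64_t a, uint64_t b)
{
  u128s r = { 0, 0 };
  int i;
  for (i = 0; i < 64; i++)
    if ((b >> i) & 1)
      {
        r.lo ^= a << i;
        r.hi ^= i ? (a >> (64 - i)) : 0;
      }
  return r;
}

/* 128x128 -> 256-bit carry-less multiply using three 64-bit multiplies:
 * (a1*x^64 + a0)*(b1*x^64 + b0)
 *   = a1b1*x^128 + ((a0^a1)(b0^b1) ^ a0b0 ^ a1b1)*x^64 + a0b0        */
static void clmul128 (u128s *r_lo, u128s *r_hi,
                      uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
  u128s p0 = clmul64 (a0, b0);            /* a0*b0 */
  u128s p1 = clmul64 (a1, b1);            /* a1*b1 */
  u128s pm = clmul64 (a0 ^ a1, b0 ^ b1);  /* (a0+a1)*(b0+b1) */
  uint64_t mid_lo = pm.lo ^ p0.lo ^ p1.lo;
  uint64_t mid_hi = pm.hi ^ p0.hi ^ p1.hi;

  r_lo->lo = p0.lo;                       /* bits   0..63  */
  r_lo->hi = p0.hi ^ mid_lo;              /* bits  64..127 */
  r_hi->lo = p1.lo ^ mid_hi;              /* bits 128..191 */
  r_hi->hi = p1.hi;                       /* bits 192..255 */
}
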
+
+static ASM_FUNC_ATTR_INLINE void
+gfmul_pclmul_aggr4(const void *buf, const void *h_1, const void *h_table,
+ const unsigned char *be_mask)
+{
+ /* Input:
+ Hash: XMM1
+ Output:
+ Hash: XMM1
+ */
+ asm volatile (/* perform clmul and merge results... */
+ "movdqu 2*16(%[h_table]), %%xmm2\n\t" /* Load H4 */
+ "movdqu 0*16(%[buf]), %%xmm5\n\t"
+ "pshufb %[be_mask], %%xmm5\n\t" /* be => le */
+ "pxor %%xmm5, %%xmm1\n\t"
+
+ "pshufd $78, %%xmm2, %%xmm5\n\t"
+ "pshufd $78, %%xmm1, %%xmm4\n\t"
+ "pxor %%xmm2, %%xmm5\n\t" /* xmm5 holds 4:a0+a1 */
+ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:b0+b1 */
+ "movdqa %%xmm2, %%xmm3\n\t"
+ "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds 4:a0*b0 */
+ "pclmulqdq $17, %%xmm2, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */
+ "pclmulqdq $0, %%xmm5, %%xmm4\n\t" /* xmm4 holds 4:(a0+a1)*(b0+b1) */
+
+ "movdqu 1*16(%[h_table]), %%xmm5\n\t" /* Load H3 */
+ "movdqu 1*16(%[buf]), %%xmm2\n\t"
+ "pshufb %[be_mask], %%xmm2\n\t" /* be => le */
+
+ "pshufd $78, %%xmm5, %%xmm0\n\t"
+ "pshufd $78, %%xmm2, %%xmm7\n\t"
+ "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 3:a0+a1 */
+ "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */
+ "movdqa %%xmm5, %%xmm6\n\t"
+ "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */
+ "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */
+ "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */
+
+ "movdqu 2*16(%[buf]), %%xmm5\n\t"
+ "pshufb %[be_mask], %%xmm5\n\t" /* be => le */
+
+ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4:a0*b0 */
+ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */
+ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4:(a0+a1)*(b0+b1) */
+
+ "movdqu 0*16(%[h_table]), %%xmm2\n\t" /* Load H2 */
+
+ "pshufd $78, %%xmm2, %%xmm0\n\t"
+ "pshufd $78, %%xmm5, %%xmm7\n\t"
+ "pxor %%xmm2, %%xmm0\n\t" /* xmm0 holds 2:a0+a1 */
+ "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 2:b0+b1 */
+ "movdqa %%xmm2, %%xmm6\n\t"
+ "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 2:a0*b0 */
+ "pclmulqdq $17, %%xmm2, %%xmm5\n\t" /* xmm5 holds 2:a1*b1 */
+ "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 2:(a0+a1)*(b0+b1) */
+
+ "movdqu 3*16(%[buf]), %%xmm2\n\t"
+ "pshufb %[be_mask], %%xmm2\n\t" /* be => le */
+ :
+ : [buf] "r" (buf),
+ [h_table] "r" (h_table),
+ [be_mask] "m" (*be_mask)
+ : "memory" );
+
+ asm volatile ("pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4:a0*b0 */
+ "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */
+ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4:(a0+a1)*(b0+b1) */
+
+ "movdqu %[h_1], %%xmm5\n\t" /* Load H1 */
+
+ "pshufd $78, %%xmm5, %%xmm0\n\t"
+ "pshufd $78, %%xmm2, %%xmm7\n\t"
+ "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 1:a0+a1 */
+ "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 1:b0+b1 */
+ "movdqa %%xmm5, %%xmm6\n\t"
+ "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 1:a0*b0 */
+ "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 1:a1*b1 */
+ "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 1:(a0+a1)*(b0+b1) */
+
+ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */
+ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+4:a1*b1 */
+ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */
+
+ /* aggregated reduction... */
+ "movdqa %%xmm3, %%xmm5\n\t"
+ "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
+ "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
+ "movdqa %%xmm4, %%xmm5\n\t"
+ "psrldq $8, %%xmm4\n\t"
+ "pslldq $8, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm1\n\t" /* <xmm1:xmm3> holds the result of the
+ carry-less multiplication of xmm0
+ by xmm1 */
+ :
+ : [h_1] "m" (*(const unsigned char *)h_1)
+ : "memory" );
+
+ reduction();
+}
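
The 4-block aggregation above is the standard rearrangement of the GHASH recurrence: with running hash Y and input blocks C1..C4,

  Y' = ((((Y ^ C1)·H ^ C2)·H ^ C3)·H ^ C4)·H
     = (Y ^ C1)·H⁴ ^ C2·H³ ^ C3·H² ^ C4·H¹

so the four carry-less products can be computed independently against H⁴, H³, H², H¹ (the higher powers cached in gcm_table), and only one reduction is needed per four blocks.
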
+
+#ifdef __x86_64__
+static ASM_FUNC_ATTR_INLINE void
+gfmul_pclmul_aggr8(const void *buf, const void *h_table)
+{
+ /* Input:
+ H¹: XMM0
+ bemask: XMM15
+ Hash: XMM1
+ Output:
+ Hash: XMM1
+ Inputs XMM0 and XMM15 stay unmodified.
+ */
+ asm volatile (/* Load H6, H7, H8. */
+ "movdqu 6*16(%[h_table]), %%xmm10\n\t"
+ "movdqu 5*16(%[h_table]), %%xmm9\n\t"
+ "movdqu 4*16(%[h_table]), %%xmm8\n\t"
+
+ /* perform clmul and merge results... */
+ "movdqu 0*16(%[buf]), %%xmm5\n\t"
+ "movdqu 1*16(%[buf]), %%xmm2\n\t"
+ "pshufb %%xmm15, %%xmm5\n\t" /* be => le */
+ "pshufb %%xmm15, %%xmm2\n\t" /* be => le */
+ "pxor %%xmm5, %%xmm1\n\t"
+
+ "pshufd $78, %%xmm10, %%xmm5\n\t"
+ "pshufd $78, %%xmm1, %%xmm4\n\t"
+ "pxor %%xmm10, %%xmm5\n\t" /* xmm5 holds 8:a0+a1 */
+ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 8:b0+b1 */
+ "movdqa %%xmm10, %%xmm3\n\t"
+ "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds 8:a0*b0 */
+ "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 8:a1*b1 */
+ "pclmulqdq $0, %%xmm5, %%xmm4\n\t" /* xmm4 holds 8:(a0+a1)*(b0+b1) */
+
+ "pshufd $78, %%xmm9, %%xmm11\n\t"
+ "pshufd $78, %%xmm2, %%xmm7\n\t"
+ "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 7:a0+a1 */
+ "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 7:b0+b1 */
+ "movdqa %%xmm9, %%xmm6\n\t"
+ "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 7:a0*b0 */
+ "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm2 holds 7:a1*b1 */
+ "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 7:(a0+a1)*(b0+b1) */
+
+ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 7+8:a0*b0 */
+ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 7+8:a1*b1 */
+ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 7+8:(a0+a1)*(b0+b1) */
+
+ "movdqu 2*16(%[buf]), %%xmm5\n\t"
+ "movdqu 3*16(%[buf]), %%xmm2\n\t"
+ "pshufb %%xmm15, %%xmm5\n\t" /* be => le */
+ "pshufb %%xmm15, %%xmm2\n\t" /* be => le */
+
+ "pshufd $78, %%xmm8, %%xmm11\n\t"
+ "pshufd $78, %%xmm5, %%xmm7\n\t"
+ "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 6:a0+a1 */
+ "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 6:b0+b1 */
+ "movdqa %%xmm8, %%xmm6\n\t"
+ "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 6:a0*b0 */
+ "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 6:a1*b1 */
+ "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 6:(a0+a1)*(b0+b1) */
+
+ /* Load H3, H4, H5. */
+ "movdqu 3*16(%[h_table]), %%xmm10\n\t"
+ "movdqu 2*16(%[h_table]), %%xmm9\n\t"
+ "movdqu 1*16(%[h_table]), %%xmm8\n\t"
+
+ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 6+7+8:a0*b0 */
+ "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 6+7+8:a1*b1 */
+ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 6+7+8:(a0+a1)*(b0+b1) */
+
+ "pshufd $78, %%xmm10, %%xmm11\n\t"
+ "pshufd $78, %%xmm2, %%xmm7\n\t"
+ "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 5:a0+a1 */
+ "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 5:b0+b1 */
+ "movdqa %%xmm10, %%xmm6\n\t"
+ "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 5:a0*b0 */
+ "pclmulqdq $17, %%xmm10, %%xmm2\n\t" /* xmm2 holds 5:a1*b1 */
+ "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 5:(a0+a1)*(b0+b1) */
+
+ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 5+6+7+8:a0*b0 */
+ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 5+6+7+8:a1*b1 */
+ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 5+6+7+8:(a0+a1)*(b0+b1) */
+
+ "movdqu 4*16(%[buf]), %%xmm5\n\t"
+ "movdqu 5*16(%[buf]), %%xmm2\n\t"
+ "pshufb %%xmm15, %%xmm5\n\t" /* be => le */
+ "pshufb %%xmm15, %%xmm2\n\t" /* be => le */
+
+ "pshufd $78, %%xmm9, %%xmm11\n\t"
+ "pshufd $78, %%xmm5, %%xmm7\n\t"
+ "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */
+ "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 4:b0+b1 */
+ "movdqa %%xmm9, %%xmm6\n\t"
+ "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 4:a0*b0 */
+ "pclmulqdq $17, %%xmm9, %%xmm5\n\t" /* xmm5 holds 4:a1*b1 */
+ "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 4:(a0+a1)*(b0+b1) */
+
+ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 4+5+6+7+8:a0*b0 */
+ "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 4+5+6+7+8:a1*b1 */
+ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 4+5+6+7+8:(a0+a1)*(b0+b1) */
+
+ "pshufd $78, %%xmm8, %%xmm11\n\t"
+ "pshufd $78, %%xmm2, %%xmm7\n\t"
+ "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */
+ "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */
+ "movdqa %%xmm8, %%xmm6\n\t"
+ "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */
+ "pclmulqdq $17, %%xmm8, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */
+ "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */
+
+ "movdqu 0*16(%[h_table]), %%xmm8\n\t" /* Load H2 */
+
+ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4+5+6+7+8:a0*b0 */
+ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4+5+6+7+8:a1*b1 */
+ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4+5+6+7+8:(a0+a1)*(b0+b1) */
+
+ "movdqu 6*16(%[buf]), %%xmm5\n\t"
+ "movdqu 7*16(%[buf]), %%xmm2\n\t"
+ "pshufb %%xmm15, %%xmm5\n\t" /* be => le */
+ "pshufb %%xmm15, %%xmm2\n\t" /* be => le */
+
+ "pshufd $78, %%xmm8, %%xmm11\n\t"
+ "pshufd $78, %%xmm5, %%xmm7\n\t"
+ "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */
+ "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 4:b0+b1 */
+ "movdqa %%xmm8, %%xmm6\n\t"
+ "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 4:a0*b0 */
+ "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 4:a1*b1 */
+ "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 4:(a0+a1)*(b0+b1) */
+
+ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4+5+6+7+8:a0*b0 */
+ "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4+5+6+7+8:a1*b1 */
+ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4+5+6+7+8:(a0+a1)*(b0+b1) */
+
+ "pshufd $78, %%xmm0, %%xmm11\n\t"
+ "pshufd $78, %%xmm2, %%xmm7\n\t"
+ "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */
+ "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */
+ "movdqa %%xmm0, %%xmm6\n\t"
+ "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */
+ "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */
+ "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */
+
+ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+3+4+5+6+7+8:a0*b0 */
+ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+3+4+5+6+7+8:a1*b1 */
+ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 1+2+3+3+4+5+6+7+8:(a0+a1)*(b0+b1) */
+
+ /* aggregated reduction... */
+ "movdqa %%xmm3, %%xmm5\n\t"
+ "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
+ "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
+ "movdqa %%xmm4, %%xmm5\n\t"
+ "psrldq $8, %%xmm4\n\t"
+ "pslldq $8, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm1\n\t" /* <xmm1:xmm3> holds the result of the
+ carry-less multiplication of xmm0
+ by xmm1 */
+ :
+ : [buf] "r" (buf),
+ [h_table] "r" (h_table)
+ : "memory" );
+
+ reduction();
+}
+#endif
+
+static ASM_FUNC_ATTR_INLINE void gcm_lsh(void *h, unsigned int hoffs)
+{
+ static const u64 pconst[2] __attribute__ ((aligned (16))) =
+ { U64_C(0x0000000000000001), U64_C(0xc200000000000000) };
+
+ asm volatile ("movdqu (%[h]), %%xmm2\n\t"
+ "pshufd $0xff, %%xmm2, %%xmm3\n\t"
+ "movdqa %%xmm2, %%xmm4\n\t"
+ "psrad $31, %%xmm3\n\t"
+ "pslldq $8, %%xmm4\n\t"
+ "pand %[pconst], %%xmm3\n\t"
+ "paddq %%xmm2, %%xmm2\n\t"
+ "psrlq $63, %%xmm4\n\t"
+ "pxor %%xmm3, %%xmm2\n\t"
+ "pxor %%xmm4, %%xmm2\n\t"
+ "movdqu %%xmm2, (%[h])\n\t"
+ :
+ : [pconst] "m" (pconst),
+ [h] "r" ((byte *)h + hoffs)
+ : "memory" );
+}
+
+void ASM_FUNC_ATTR
+_gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
+{
+ static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+#if defined(__x86_64__) && defined(__WIN64__)
+ char win64tmp[10 * 16];
+
+ /* XMM6-XMM15 need to be restored after use. */
+ asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t"
+ "movdqu %%xmm7, 1*16(%0)\n\t"
+ "movdqu %%xmm8, 2*16(%0)\n\t"
+ "movdqu %%xmm9, 3*16(%0)\n\t"
+ "movdqu %%xmm10, 4*16(%0)\n\t"
+ "movdqu %%xmm11, 5*16(%0)\n\t"
+ "movdqu %%xmm12, 6*16(%0)\n\t"
+ "movdqu %%xmm13, 7*16(%0)\n\t"
+ "movdqu %%xmm14, 8*16(%0)\n\t"
+ "movdqu %%xmm15, 9*16(%0)\n\t"
+ :
+ : "r" (win64tmp)
+ : "memory" );
+#endif
+
+ /* Swap endianness of hsub. */
+ asm volatile ("movdqu (%[key]), %%xmm0\n\t"
+ "pshufb %[be_mask], %%xmm0\n\t"
+ "movdqu %%xmm0, (%[key])\n\t"
+ :
+ : [key] "r" (c->u_mode.gcm.u_ghash_key.key),
+ [be_mask] "m" (*be_mask)
+ : "memory");
+
+ gcm_lsh(c->u_mode.gcm.u_ghash_key.key, 0); /* H <<< 1 */
+
+ asm volatile ("movdqa %%xmm0, %%xmm1\n\t"
+ "movdqu (%[key]), %%xmm0\n\t" /* load H <<< 1 */
+ :
+ : [key] "r" (c->u_mode.gcm.u_ghash_key.key)
+ : "memory");
+
+ gfmul_pclmul (); /* H<<<1•H => H² */
+
+ asm volatile ("movdqu %%xmm1, 0*16(%[h_table])\n\t"
+ "movdqa %%xmm1, %%xmm7\n\t"
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gcm_lsh(c->u_mode.gcm.gcm_table, 0 * 16); /* H² <<< 1 */
+ gfmul_pclmul (); /* H<<<1•H² => H³ */
+
+ asm volatile ("movdqa %%xmm7, %%xmm0\n\t"
+ "movdqu %%xmm1, 1*16(%[h_table])\n\t"
+ "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gfmul_pclmul (); /* H²<<<1•H² => H⁴ */
+
+ asm volatile ("movdqu %%xmm1, 2*16(%[h_table])\n\t"
+ "movdqa %%xmm1, %%xmm0\n\t"
+ "movdqu (%[key]), %%xmm1\n\t" /* load H <<< 1 */
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table),
+ [key] "r" (c->u_mode.gcm.u_ghash_key.key)
+ : "memory");
+
+ gcm_lsh(c->u_mode.gcm.gcm_table, 1 * 16); /* H³ <<< 1 */
+ gcm_lsh(c->u_mode.gcm.gcm_table, 2 * 16); /* H⁴ <<< 1 */
+
+#ifdef __x86_64__
+ gfmul_pclmul (); /* H<<<1•H⁴ => H⁵ */
+
+ asm volatile ("movdqu %%xmm1, 3*16(%[h_table])\n\t"
+ "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gfmul_pclmul (); /* H²<<<1•H⁴ => H⁶ */
+
+ asm volatile ("movdqu %%xmm1, 4*16(%[h_table])\n\t"
+ "movdqu 1*16(%[h_table]), %%xmm1\n\t" /* load H³ <<< 1 */
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gfmul_pclmul (); /* H³<<<1•H⁴ => H⁷ */
+
+ asm volatile ("movdqu %%xmm1, 5*16(%[h_table])\n\t"
+ "movdqu 2*16(%[h_table]), %%xmm1\n\t" /* load H⁴ <<< 1 */
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gfmul_pclmul (); /* H⁴<<<1•H⁴ => H⁸ */
+
+ asm volatile ("movdqu %%xmm1, 6*16(%[h_table])\n\t"
+ :
+ : [h_table] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gcm_lsh(c->u_mode.gcm.gcm_table, 3 * 16); /* H⁵ <<< 1 */
+ gcm_lsh(c->u_mode.gcm.gcm_table, 4 * 16); /* H⁶ <<< 1 */
+ gcm_lsh(c->u_mode.gcm.gcm_table, 5 * 16); /* H⁷ <<< 1 */
+ gcm_lsh(c->u_mode.gcm.gcm_table, 6 * 16); /* H⁸ <<< 1 */
+
+#ifdef __WIN64__
+ /* Clear/restore used registers. */
+ asm volatile( "pxor %%xmm0, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm2\n\t"
+ "pxor %%xmm3, %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm4\n\t"
+ "pxor %%xmm5, %%xmm5\n\t"
+ "movdqu 0*16(%0), %%xmm6\n\t"
+ "movdqu 1*16(%0), %%xmm7\n\t"
+ "movdqu 2*16(%0), %%xmm8\n\t"
+ "movdqu 3*16(%0), %%xmm9\n\t"
+ "movdqu 4*16(%0), %%xmm10\n\t"
+ "movdqu 5*16(%0), %%xmm11\n\t"
+ "movdqu 6*16(%0), %%xmm12\n\t"
+ "movdqu 7*16(%0), %%xmm13\n\t"
+ "movdqu 8*16(%0), %%xmm14\n\t"
+ "movdqu 9*16(%0), %%xmm15\n\t"
+ :
+ : "r" (win64tmp)
+ : "memory" );
+#else
+ /* Clear used registers. */
+ asm volatile( "pxor %%xmm0, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm2\n\t"
+ "pxor %%xmm3, %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm4\n\t"
+ "pxor %%xmm5, %%xmm5\n\t"
+ "pxor %%xmm6, %%xmm6\n\t"
+ "pxor %%xmm7, %%xmm7\n\t"
+ "pxor %%xmm8, %%xmm8\n\t"
+ "pxor %%xmm9, %%xmm9\n\t"
+ "pxor %%xmm10, %%xmm10\n\t"
+ "pxor %%xmm11, %%xmm11\n\t"
+ "pxor %%xmm12, %%xmm12\n\t"
+ "pxor %%xmm13, %%xmm13\n\t"
+ "pxor %%xmm14, %%xmm14\n\t"
+ "pxor %%xmm15, %%xmm15\n\t"
+ ::: "memory" );
+#endif
+#endif
+}
+
+
+unsigned int ASM_FUNC_ATTR
+_gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
+ size_t nblocks)
+{
+ static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+ const unsigned int blocksize = GCRY_GCM_BLOCK_LEN;
+#if defined(__x86_64__) && defined(__WIN64__)
+ char win64tmp[10 * 16];
+#endif
+
+ if (nblocks == 0)
+ return 0;
+
+#if defined(__x86_64__) && defined(__WIN64__)
+ /* XMM6-XMM15 need to be restored after use. */
+ asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t"
+ "movdqu %%xmm7, 1*16(%0)\n\t"
+ "movdqu %%xmm8, 2*16(%0)\n\t"
+ "movdqu %%xmm9, 3*16(%0)\n\t"
+ "movdqu %%xmm10, 4*16(%0)\n\t"
+ "movdqu %%xmm11, 5*16(%0)\n\t"
+ "movdqu %%xmm12, 6*16(%0)\n\t"
+ "movdqu %%xmm13, 7*16(%0)\n\t"
+ "movdqu %%xmm14, 8*16(%0)\n\t"
+ "movdqu %%xmm15, 9*16(%0)\n\t"
+ :
+ : "r" (win64tmp)
+ : "memory" );
+#endif
+
+ /* Preload hash. */
+ asm volatile ("movdqa %[be_mask], %%xmm7\n\t"
+ "movdqu %[hash], %%xmm1\n\t"
+ "pshufb %%xmm7, %%xmm1\n\t" /* be => le */
+ :
+ : [hash] "m" (*result),
+ [be_mask] "m" (*be_mask)
+ : "memory" );
+
+#ifdef __x86_64__
+ if (nblocks >= 8)
+ {
+ /* Preload H1. */
+ asm volatile ("movdqa %%xmm7, %%xmm15\n\t"
+ "movdqa %[h_1], %%xmm0\n\t"
+ :
+ : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key)
+ : "memory" );
+
+ while (nblocks >= 8)
+ {
+ gfmul_pclmul_aggr8 (buf, c->u_mode.gcm.gcm_table);
+
+ buf += 8 * blocksize;
+ nblocks -= 8;
+ }
+#ifndef __WIN64__
+ /* Clear used x86-64/XMM registers. */
+ asm volatile( "pxor %%xmm8, %%xmm8\n\t"
+ "pxor %%xmm9, %%xmm9\n\t"
+ "pxor %%xmm10, %%xmm10\n\t"
+ "pxor %%xmm11, %%xmm11\n\t"
+ "pxor %%xmm12, %%xmm12\n\t"
+ "pxor %%xmm13, %%xmm13\n\t"
+ "pxor %%xmm14, %%xmm14\n\t"
+ "pxor %%xmm15, %%xmm15\n\t"
+ ::: "memory" );
+#endif
+ }
+#endif
+
+ while (nblocks >= 4)
+ {
+ gfmul_pclmul_aggr4 (buf, c->u_mode.gcm.u_ghash_key.key,
+ c->u_mode.gcm.gcm_table, be_mask);
+
+ buf += 4 * blocksize;
+ nblocks -= 4;
+ }
+
+ if (nblocks)
+ {
+ /* Preload H1. */
+ asm volatile ("movdqa %[h_1], %%xmm0\n\t"
+ :
+ : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key)
+ : "memory" );
+
+ while (nblocks)
+ {
+ asm volatile ("movdqu %[buf], %%xmm2\n\t"
+ "pshufb %[be_mask], %%xmm2\n\t" /* be => le */
+ "pxor %%xmm2, %%xmm1\n\t"
+ :
+ : [buf] "m" (*buf), [be_mask] "m" (*be_mask)
+ : "memory" );
+
+ gfmul_pclmul ();
+
+ buf += blocksize;
+ nblocks--;
+ }
+ }
+
+ /* Store hash. */
+ asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */
+ "movdqu %%xmm1, %[hash]\n\t"
+ : [hash] "=m" (*result)
+ : [be_mask] "m" (*be_mask)
+ : "memory" );
+
+#if defined(__x86_64__) && defined(__WIN64__)
+ /* Clear/restore used registers. */
+ asm volatile( "pxor %%xmm0, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm2\n\t"
+ "pxor %%xmm3, %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm4\n\t"
+ "pxor %%xmm5, %%xmm5\n\t"
+ "movdqu 0*16(%0), %%xmm6\n\t"
+ "movdqu 1*16(%0), %%xmm7\n\t"
+ "movdqu 2*16(%0), %%xmm8\n\t"
+ "movdqu 3*16(%0), %%xmm9\n\t"
+ "movdqu 4*16(%0), %%xmm10\n\t"
+ "movdqu 5*16(%0), %%xmm11\n\t"
+ "movdqu 6*16(%0), %%xmm12\n\t"
+ "movdqu 7*16(%0), %%xmm13\n\t"
+ "movdqu 8*16(%0), %%xmm14\n\t"
+ "movdqu 9*16(%0), %%xmm15\n\t"
+ :
+ : "r" (win64tmp)
+ : "memory" );
+#else
+ /* Clear used registers. */
+ asm volatile( "pxor %%xmm0, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm2\n\t"
+ "pxor %%xmm3, %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm4\n\t"
+ "pxor %%xmm5, %%xmm5\n\t"
+ "pxor %%xmm6, %%xmm6\n\t"
+ "pxor %%xmm7, %%xmm7\n\t"
+ ::: "memory" );
+#endif
+
+ return 0;
+}
+
+#if __clang__
+# pragma clang attribute pop
+#endif
+
+#endif /* GCM_USE_INTEL_PCLMUL */
diff --git a/comm/third_party/libgcrypt/cipher/cipher-gcm.c b/comm/third_party/libgcrypt/cipher/cipher-gcm.c
new file mode 100644
index 0000000000..7aad12776f
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-gcm.c
@@ -0,0 +1,1207 @@
+/* cipher-gcm.c - Generic Galois Counter Mode implementation
+ * Copyright (C) 2013 Dmitry Eremin-Solenikov
+ * Copyright (C) 2013, 2018-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "./cipher-internal.h"
+
+
+/* Helper macro to force alignment to 64 bytes. */
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define ATTR_ALIGNED_64 __attribute__ ((aligned (64)))
+#else
+# define ATTR_ALIGNED_64
+#endif
+
+
+#ifdef GCM_USE_INTEL_PCLMUL
+extern void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c);
+
+extern unsigned int _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result,
+ const byte *buf, size_t nblocks);
+#endif
+
+#ifdef GCM_USE_ARM_PMULL
+extern void _gcry_ghash_setup_armv8_ce_pmull (void *gcm_key, void *gcm_table);
+
+extern unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result,
+ const byte *buf, size_t nblocks,
+ void *gcm_table);
+
+static void
+ghash_setup_armv8_ce_pmull (gcry_cipher_hd_t c)
+{
+ _gcry_ghash_setup_armv8_ce_pmull(c->u_mode.gcm.u_ghash_key.key,
+ c->u_mode.gcm.gcm_table);
+}
+
+static unsigned int
+ghash_armv8_ce_pmull (gcry_cipher_hd_t c, byte *result, const byte *buf,
+ size_t nblocks)
+{
+ return _gcry_ghash_armv8_ce_pmull(c->u_mode.gcm.u_ghash_key.key, result, buf,
+ nblocks, c->u_mode.gcm.gcm_table);
+}
+#endif /* GCM_USE_ARM_PMULL */
+
+#ifdef GCM_USE_ARM_NEON
+extern void _gcry_ghash_setup_armv7_neon (void *gcm_key);
+
+extern unsigned int _gcry_ghash_armv7_neon (void *gcm_key, byte *result,
+ const byte *buf, size_t nblocks);
+
+static void
+ghash_setup_armv7_neon (gcry_cipher_hd_t c)
+{
+ _gcry_ghash_setup_armv7_neon(c->u_mode.gcm.u_ghash_key.key);
+}
+
+static unsigned int
+ghash_armv7_neon (gcry_cipher_hd_t c, byte *result, const byte *buf,
+ size_t nblocks)
+{
+ return _gcry_ghash_armv7_neon(c->u_mode.gcm.u_ghash_key.key, result, buf,
+ nblocks);
+}
+#endif /* GCM_USE_ARM_NEON */
+
+#ifdef GCM_USE_S390X_CRYPTO
+#include "asm-inline-s390x.h"
+
+static unsigned int
+ghash_s390x_kimd (gcry_cipher_hd_t c, byte *result, const byte *buf,
+ size_t nblocks)
+{
+ u128_t params[2];
+
+ memcpy (&params[0], result, 16);
+ memcpy (&params[1], c->u_mode.gcm.u_ghash_key.key, 16);
+
+ kimd_execute (KMID_FUNCTION_GHASH, &params, buf, nblocks * 16);
+
+ memcpy (result, &params[0], 16);
+ wipememory (params, sizeof(params));
+ return 0;
+}
+#endif /* GCM_USE_S390X_CRYPTO */
+
+
+#ifdef GCM_USE_TABLES
+static struct
+{
+ volatile u32 counter_head;
+ u32 cacheline_align[64 / 4 - 1];
+ u16 R[256];
+ volatile u32 counter_tail;
+} gcm_table ATTR_ALIGNED_64 =
+ {
+ 0,
+ { 0, },
+ {
+ 0x0000, 0x01c2, 0x0384, 0x0246, 0x0708, 0x06ca, 0x048c, 0x054e,
+ 0x0e10, 0x0fd2, 0x0d94, 0x0c56, 0x0918, 0x08da, 0x0a9c, 0x0b5e,
+ 0x1c20, 0x1de2, 0x1fa4, 0x1e66, 0x1b28, 0x1aea, 0x18ac, 0x196e,
+ 0x1230, 0x13f2, 0x11b4, 0x1076, 0x1538, 0x14fa, 0x16bc, 0x177e,
+ 0x3840, 0x3982, 0x3bc4, 0x3a06, 0x3f48, 0x3e8a, 0x3ccc, 0x3d0e,
+ 0x3650, 0x3792, 0x35d4, 0x3416, 0x3158, 0x309a, 0x32dc, 0x331e,
+ 0x2460, 0x25a2, 0x27e4, 0x2626, 0x2368, 0x22aa, 0x20ec, 0x212e,
+ 0x2a70, 0x2bb2, 0x29f4, 0x2836, 0x2d78, 0x2cba, 0x2efc, 0x2f3e,
+ 0x7080, 0x7142, 0x7304, 0x72c6, 0x7788, 0x764a, 0x740c, 0x75ce,
+ 0x7e90, 0x7f52, 0x7d14, 0x7cd6, 0x7998, 0x785a, 0x7a1c, 0x7bde,
+ 0x6ca0, 0x6d62, 0x6f24, 0x6ee6, 0x6ba8, 0x6a6a, 0x682c, 0x69ee,
+ 0x62b0, 0x6372, 0x6134, 0x60f6, 0x65b8, 0x647a, 0x663c, 0x67fe,
+ 0x48c0, 0x4902, 0x4b44, 0x4a86, 0x4fc8, 0x4e0a, 0x4c4c, 0x4d8e,
+ 0x46d0, 0x4712, 0x4554, 0x4496, 0x41d8, 0x401a, 0x425c, 0x439e,
+ 0x54e0, 0x5522, 0x5764, 0x56a6, 0x53e8, 0x522a, 0x506c, 0x51ae,
+ 0x5af0, 0x5b32, 0x5974, 0x58b6, 0x5df8, 0x5c3a, 0x5e7c, 0x5fbe,
+ 0xe100, 0xe0c2, 0xe284, 0xe346, 0xe608, 0xe7ca, 0xe58c, 0xe44e,
+ 0xef10, 0xeed2, 0xec94, 0xed56, 0xe818, 0xe9da, 0xeb9c, 0xea5e,
+ 0xfd20, 0xfce2, 0xfea4, 0xff66, 0xfa28, 0xfbea, 0xf9ac, 0xf86e,
+ 0xf330, 0xf2f2, 0xf0b4, 0xf176, 0xf438, 0xf5fa, 0xf7bc, 0xf67e,
+ 0xd940, 0xd882, 0xdac4, 0xdb06, 0xde48, 0xdf8a, 0xddcc, 0xdc0e,
+ 0xd750, 0xd692, 0xd4d4, 0xd516, 0xd058, 0xd19a, 0xd3dc, 0xd21e,
+ 0xc560, 0xc4a2, 0xc6e4, 0xc726, 0xc268, 0xc3aa, 0xc1ec, 0xc02e,
+ 0xcb70, 0xcab2, 0xc8f4, 0xc936, 0xcc78, 0xcdba, 0xcffc, 0xce3e,
+ 0x9180, 0x9042, 0x9204, 0x93c6, 0x9688, 0x974a, 0x950c, 0x94ce,
+ 0x9f90, 0x9e52, 0x9c14, 0x9dd6, 0x9898, 0x995a, 0x9b1c, 0x9ade,
+ 0x8da0, 0x8c62, 0x8e24, 0x8fe6, 0x8aa8, 0x8b6a, 0x892c, 0x88ee,
+ 0x83b0, 0x8272, 0x8034, 0x81f6, 0x84b8, 0x857a, 0x873c, 0x86fe,
+ 0xa9c0, 0xa802, 0xaa44, 0xab86, 0xaec8, 0xaf0a, 0xad4c, 0xac8e,
+ 0xa7d0, 0xa612, 0xa454, 0xa596, 0xa0d8, 0xa11a, 0xa35c, 0xa29e,
+ 0xb5e0, 0xb422, 0xb664, 0xb7a6, 0xb2e8, 0xb32a, 0xb16c, 0xb0ae,
+ 0xbbf0, 0xba32, 0xb874, 0xb9b6, 0xbcf8, 0xbd3a, 0xbf7c, 0xbebe,
+ },
+ 0
+ };
+
+#define gcmR gcm_table.R
+
+static inline
+void prefetch_table(const void *tab, size_t len)
+{
+ const volatile byte *vtab = tab;
+ size_t i;
+
+ for (i = 0; len - i >= 8 * 32; i += 8 * 32)
+ {
+ (void)vtab[i + 0 * 32];
+ (void)vtab[i + 1 * 32];
+ (void)vtab[i + 2 * 32];
+ (void)vtab[i + 3 * 32];
+ (void)vtab[i + 4 * 32];
+ (void)vtab[i + 5 * 32];
+ (void)vtab[i + 6 * 32];
+ (void)vtab[i + 7 * 32];
+ }
+ for (; i < len; i += 32)
+ {
+ (void)vtab[i];
+ }
+
+ (void)vtab[len - 1];
+}
+
+static inline void
+do_prefetch_tables (const void *gcmM, size_t gcmM_size)
+{
+ /* Modify the counters to trigger copy-on-write and unsharing if the
+ * physical pages of the look-up table are shared between processes.
+ * Modifying the counters also causes the page checksums to change, which
+ * hints to the same-page merging algorithm that these pages change often. */
+ gcm_table.counter_head++;
+ gcm_table.counter_tail++;
+
+ /* Prefetch look-up tables to cache. */
+ prefetch_table(gcmM, gcmM_size);
+ prefetch_table(&gcm_table, sizeof(gcm_table));
+}
+
+#ifdef GCM_TABLES_USE_U64
+static void
+bshift (u64 * b0, u64 * b1)
+{
+ u64 t[2], mask;
+
+ t[0] = *b0;
+ t[1] = *b1;
+ mask = -(t[1] & 1) & 0xe1;
+ mask <<= 56;
+
+ *b1 = (t[1] >> 1) ^ (t[0] << 63);
+ *b0 = (t[0] >> 1) ^ mask;
+}
+
+static void
+do_fillM (unsigned char *h, u64 *M)
+{
+ int i, j;
+
+ M[0 + 0] = 0;
+ M[0 + 16] = 0;
+
+ M[8 + 0] = buf_get_be64 (h + 0);
+ M[8 + 16] = buf_get_be64 (h + 8);
+
+ for (i = 4; i > 0; i /= 2)
+ {
+ M[i + 0] = M[2 * i + 0];
+ M[i + 16] = M[2 * i + 16];
+
+ bshift (&M[i], &M[i + 16]);
+ }
+
+ for (i = 2; i < 16; i *= 2)
+ for (j = 1; j < i; j++)
+ {
+ M[(i + j) + 0] = M[i + 0] ^ M[j + 0];
+ M[(i + j) + 16] = M[i + 16] ^ M[j + 16];
+ }
+
+ for (i = 0; i < 16; i++)
+ {
+ M[i + 32] = (M[i + 0] >> 4) ^ ((u64) gcmR[(M[i + 16] & 0xf) << 4] << 48);
+ M[i + 48] = (M[i + 16] >> 4) ^ (M[i + 0] << 60);
+ }
+}
+
+static inline unsigned int
+do_ghash (unsigned char *result, const unsigned char *buf, const u64 *gcmM)
+{
+ u64 V[2];
+ u64 tmp[2];
+ const u64 *M;
+ u64 T;
+ u32 A;
+ int i;
+
+ cipher_block_xor (V, result, buf, 16);
+ V[0] = be_bswap64 (V[0]);
+ V[1] = be_bswap64 (V[1]);
+
+ /* First round can be manually tweaked based on the fact that 'tmp' is zero. */
+ M = &gcmM[(V[1] & 0xf) + 32];
+ V[1] >>= 4;
+ tmp[0] = M[0];
+ tmp[1] = M[16];
+ tmp[0] ^= gcmM[(V[1] & 0xf) + 0];
+ tmp[1] ^= gcmM[(V[1] & 0xf) + 16];
+ V[1] >>= 4;
+
+ i = 6;
+ while (1)
+ {
+ M = &gcmM[(V[1] & 0xf) + 32];
+ V[1] >>= 4;
+
+ A = tmp[1] & 0xff;
+ T = tmp[0];
+ tmp[0] = (T >> 8) ^ ((u64) gcmR[A] << 48) ^ gcmM[(V[1] & 0xf) + 0];
+ tmp[1] = (T << 56) ^ (tmp[1] >> 8) ^ gcmM[(V[1] & 0xf) + 16];
+
+ tmp[0] ^= M[0];
+ tmp[1] ^= M[16];
+
+ if (i == 0)
+ break;
+
+ V[1] >>= 4;
+ --i;
+ }
+
+ i = 7;
+ while (1)
+ {
+ M = &gcmM[(V[0] & 0xf) + 32];
+ V[0] >>= 4;
+
+ A = tmp[1] & 0xff;
+ T = tmp[0];
+ tmp[0] = (T >> 8) ^ ((u64) gcmR[A] << 48) ^ gcmM[(V[0] & 0xf) + 0];
+ tmp[1] = (T << 56) ^ (tmp[1] >> 8) ^ gcmM[(V[0] & 0xf) + 16];
+
+ tmp[0] ^= M[0];
+ tmp[1] ^= M[16];
+
+ if (i == 0)
+ break;
+
+ V[0] >>= 4;
+ --i;
+ }
+
+ buf_put_be64 (result + 0, tmp[0]);
+ buf_put_be64 (result + 8, tmp[1]);
+
+ return (sizeof(V) + sizeof(T) + sizeof(tmp) +
+ sizeof(int)*2 + sizeof(void*)*5);
+}
+
+#else /*!GCM_TABLES_USE_U64*/
+
+static void
+bshift (u32 * M, int i)
+{
+ u32 t[4], mask;
+
+ t[0] = M[i * 4 + 0];
+ t[1] = M[i * 4 + 1];
+ t[2] = M[i * 4 + 2];
+ t[3] = M[i * 4 + 3];
+ mask = -(t[3] & 1) & 0xe1;
+
+ M[i * 4 + 3] = (t[3] >> 1) ^ (t[2] << 31);
+ M[i * 4 + 2] = (t[2] >> 1) ^ (t[1] << 31);
+ M[i * 4 + 1] = (t[1] >> 1) ^ (t[0] << 31);
+ M[i * 4 + 0] = (t[0] >> 1) ^ (mask << 24);
+}
+
+static void
+do_fillM (unsigned char *h, u32 *M)
+{
+ int i, j;
+
+ M[0 * 4 + 0] = 0;
+ M[0 * 4 + 1] = 0;
+ M[0 * 4 + 2] = 0;
+ M[0 * 4 + 3] = 0;
+
+ M[8 * 4 + 0] = buf_get_be32 (h + 0);
+ M[8 * 4 + 1] = buf_get_be32 (h + 4);
+ M[8 * 4 + 2] = buf_get_be32 (h + 8);
+ M[8 * 4 + 3] = buf_get_be32 (h + 12);
+
+ for (i = 4; i > 0; i /= 2)
+ {
+ M[i * 4 + 0] = M[2 * i * 4 + 0];
+ M[i * 4 + 1] = M[2 * i * 4 + 1];
+ M[i * 4 + 2] = M[2 * i * 4 + 2];
+ M[i * 4 + 3] = M[2 * i * 4 + 3];
+
+ bshift (M, i);
+ }
+
+ for (i = 2; i < 16; i *= 2)
+ for (j = 1; j < i; j++)
+ {
+ M[(i + j) * 4 + 0] = M[i * 4 + 0] ^ M[j * 4 + 0];
+ M[(i + j) * 4 + 1] = M[i * 4 + 1] ^ M[j * 4 + 1];
+ M[(i + j) * 4 + 2] = M[i * 4 + 2] ^ M[j * 4 + 2];
+ M[(i + j) * 4 + 3] = M[i * 4 + 3] ^ M[j * 4 + 3];
+ }
+
+ for (i = 0; i < 4 * 16; i += 4)
+ {
+ M[i + 0 + 64] = (M[i + 0] >> 4)
+ ^ ((u64) gcmR[(M[i + 3] << 4) & 0xf0] << 16);
+ M[i + 1 + 64] = (M[i + 1] >> 4) ^ (M[i + 0] << 28);
+ M[i + 2 + 64] = (M[i + 2] >> 4) ^ (M[i + 1] << 28);
+ M[i + 3 + 64] = (M[i + 3] >> 4) ^ (M[i + 2] << 28);
+ }
+}
+
+static inline unsigned int
+do_ghash (unsigned char *result, const unsigned char *buf, const u32 *gcmM)
+{
+ byte V[16];
+ u32 tmp[4];
+ u32 v;
+ const u32 *M, *m;
+ u32 T[3];
+ int i;
+
+ cipher_block_xor (V, result, buf, 16); /* V is big-endian */
+
+ /* First round can be manually tweaked based on the fact that 'tmp' is zero. */
+ i = 15;
+
+ v = V[i];
+ M = &gcmM[(v & 0xf) * 4 + 64];
+ v = (v & 0xf0) >> 4;
+ m = &gcmM[v * 4];
+ v = V[--i];
+
+ tmp[0] = M[0] ^ m[0];
+ tmp[1] = M[1] ^ m[1];
+ tmp[2] = M[2] ^ m[2];
+ tmp[3] = M[3] ^ m[3];
+
+ while (1)
+ {
+ M = &gcmM[(v & 0xf) * 4 + 64];
+ v = (v & 0xf0) >> 4;
+ m = &gcmM[v * 4];
+
+ T[0] = tmp[0];
+ T[1] = tmp[1];
+ T[2] = tmp[2];
+ tmp[0] = (T[0] >> 8) ^ ((u32) gcmR[tmp[3] & 0xff] << 16) ^ m[0];
+ tmp[1] = (T[0] << 24) ^ (tmp[1] >> 8) ^ m[1];
+ tmp[2] = (T[1] << 24) ^ (tmp[2] >> 8) ^ m[2];
+ tmp[3] = (T[2] << 24) ^ (tmp[3] >> 8) ^ m[3];
+
+ tmp[0] ^= M[0];
+ tmp[1] ^= M[1];
+ tmp[2] ^= M[2];
+ tmp[3] ^= M[3];
+
+ if (i == 0)
+ break;
+
+ v = V[--i];
+ }
+
+ buf_put_be32 (result + 0, tmp[0]);
+ buf_put_be32 (result + 4, tmp[1]);
+ buf_put_be32 (result + 8, tmp[2]);
+ buf_put_be32 (result + 12, tmp[3]);
+
+ return (sizeof(V) + sizeof(T) + sizeof(tmp) +
+ sizeof(int)*2 + sizeof(void*)*6);
+}
+#endif /*!GCM_TABLES_USE_U64*/
+
+#define fillM(c) \
+ do_fillM (c->u_mode.gcm.u_ghash_key.key, c->u_mode.gcm.gcm_table)
+#define GHASH(c, result, buf) do_ghash (result, buf, c->u_mode.gcm.gcm_table)
+#define prefetch_tables(c) \
+ do_prefetch_tables(c->u_mode.gcm.gcm_table, sizeof(c->u_mode.gcm.gcm_table))
+
+#else
+
+static unsigned long
+bshift (unsigned long *b)
+{
+ unsigned long c;
+ int i;
+ c = b[3] & 1;
+ for (i = 3; i > 0; i--)
+ {
+ b[i] = (b[i] >> 1) | (b[i - 1] << 31);
+ }
+ b[i] >>= 1;
+ return c;
+}
+
+static unsigned int
+do_ghash (unsigned char *hsub, unsigned char *result, const unsigned char *buf)
+{
+ unsigned long V[4];
+ int i, j;
+ byte *p;
+
+#ifdef WORDS_BIGENDIAN
+ p = result;
+#else
+ unsigned long T[4];
+
+ cipher_block_xor (V, result, buf, 16);
+ for (i = 0; i < 4; i++)
+ {
+ V[i] = (V[i] & 0x00ff00ff) << 8 | (V[i] & 0xff00ff00) >> 8;
+ V[i] = (V[i] & 0x0000ffff) << 16 | (V[i] & 0xffff0000) >> 16;
+ }
+ p = (byte *) T;
+#endif
+
+ memset (p, 0, 16);
+
+ for (i = 0; i < 16; i++)
+ {
+ for (j = 0x80; j; j >>= 1)
+ {
+ if (hsub[i] & j)
+ cipher_block_xor (p, p, V, 16);
+ if (bshift (V))
+ V[0] ^= 0xe1000000;
+ }
+ }
+#ifndef WORDS_BIGENDIAN
+ for (i = 0, p = (byte *) T; i < 16; i += 4, p += 4)
+ {
+ result[i + 0] = p[3];
+ result[i + 1] = p[2];
+ result[i + 2] = p[1];
+ result[i + 3] = p[0];
+ }
+#endif
+
+ return (sizeof(V) + sizeof(T) + sizeof(int)*2 + sizeof(void*)*5);
+}
+
+#define fillM(c) do { } while (0)
+#define GHASH(c, result, buf) do_ghash (c->u_mode.gcm.u_ghash_key.key, result, buf)
+#define prefetch_tables(c) do {} while (0)
+
+#endif /* !GCM_USE_TABLES */
+
+
+static unsigned int
+ghash_internal (gcry_cipher_hd_t c, byte *result, const byte *buf,
+ size_t nblocks)
+{
+ const unsigned int blocksize = GCRY_GCM_BLOCK_LEN;
+ unsigned int burn = 0;
+
+ prefetch_tables (c);
+
+ while (nblocks)
+ {
+ burn = GHASH (c, result, buf);
+ buf += blocksize;
+ nblocks--;
+ }
+
+ return burn + (burn ? 5*sizeof(void*) : 0);
+}
+
+
+static void
+setupM (gcry_cipher_hd_t c)
+{
+#if defined(GCM_USE_INTEL_PCLMUL) || defined(GCM_USE_ARM_PMULL) || \
+ defined(GCM_USE_S390X_CRYPTO)
+ unsigned int features = _gcry_get_hw_features ();
+#endif
+
+ c->u_mode.gcm.ghash_fn = NULL;
+
+ if (0)
+ ;
+#ifdef GCM_USE_INTEL_PCLMUL
+ else if (features & HWF_INTEL_PCLMUL)
+ {
+ c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul;
+ _gcry_ghash_setup_intel_pclmul (c);
+ }
+#endif
+#ifdef GCM_USE_ARM_PMULL
+ else if (features & HWF_ARM_PMULL)
+ {
+ c->u_mode.gcm.ghash_fn = ghash_armv8_ce_pmull;
+ ghash_setup_armv8_ce_pmull (c);
+ }
+#endif
+#ifdef GCM_USE_ARM_NEON
+ else if (features & HWF_ARM_NEON)
+ {
+ c->u_mode.gcm.ghash_fn = ghash_armv7_neon;
+ ghash_setup_armv7_neon (c);
+ }
+#endif
+#ifdef GCM_USE_S390X_CRYPTO
+ else if (features & HWF_S390X_MSA)
+ {
+ if (kimd_query () & km_function_to_mask (KMID_FUNCTION_GHASH))
+ {
+ c->u_mode.gcm.ghash_fn = ghash_s390x_kimd;
+ }
+ }
+#endif
+
+ if (c->u_mode.gcm.ghash_fn == NULL)
+ {
+ c->u_mode.gcm.ghash_fn = ghash_internal;
+ fillM (c);
+ }
+}
+
+
+static inline void
+gcm_bytecounter_add (u32 ctr[2], size_t add)
+{
+ if (sizeof(add) > sizeof(u32))
+ {
+ u32 high_add = ((add >> 31) >> 1) & 0xffffffff;
+ ctr[1] += high_add;
+ }
+
+ ctr[0] += add;
+ if (ctr[0] >= add)
+ return;
+ ++ctr[1];
+}
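
The split-word update above is equivalent to a single 64-bit addition; the `((add >> 31) >> 1)` form merely avoids an out-of-range shift when size_t is only 32 bits wide. A sketch of the plain 64-bit formulation follows, for reference only; bytecounter_add_u64 is an illustrative name, not a libgcrypt function.

#include <stdint.h>

/* Illustrative only: same byte-counter update done with one 64-bit add. */
static inline void bytecounter_add_u64 (uint32_t ctr[2], uint64_t add)
{
  uint64_t total = ((uint64_t)ctr[1] << 32) | ctr[0];
  total += add;
  ctr[0] = (uint32_t)(total & 0xffffffffU);
  ctr[1] = (uint32_t)(total >> 32);
}
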
+
+
+static inline u32
+gcm_add32_be128 (byte *ctr, unsigned int add)
+{
+ /* 'ctr' must be aligned to four bytes. */
+ const unsigned int blocksize = GCRY_GCM_BLOCK_LEN;
+ u32 *pval = (u32 *)(void *)(ctr + blocksize - sizeof(u32));
+ u32 val;
+
+ val = be_bswap32(*pval) + add;
+ *pval = be_bswap32(val);
+
+ return val; /* return result as host-endian value */
+}
+
+
+static inline int
+gcm_check_datalen (u32 ctr[2])
+{
+ /* len(plaintext) <= 2^39-256 bits == 2^36-32 bytes == 2^32-2 blocks */
+ if (ctr[1] > 0xfU)
+ return 0;
+ if (ctr[1] < 0xfU)
+ return 1;
+
+ if (ctr[0] <= 0xffffffe0U)
+ return 1;
+
+ return 0;
+}
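
The constants checked above encode the GCM plaintext limit. A throwaway sketch verifying the arithmetic (the main() wrapper exists only for illustration): the largest allowed byte count, ctr[1] == 0xf with ctr[0] == 0xffffffe0, equals 2^36 - 32 bytes, i.e. 2^39 - 256 bits or 2^32 - 2 blocks.

#include <stdint.h>
#include <stdio.h>

int main (void)
{
  /* ctr[1] == 0xf and ctr[0] == 0xffffffe0 is the largest allowed count. */
  uint64_t max_bytes = ((uint64_t)0xf << 32) + 0xffffffe0U;

  printf ("%llu bytes == %llu bytes == %llu blocks\n",
          (unsigned long long)max_bytes,
          (unsigned long long)((UINT64_C(1) << 36) - 32),
          (unsigned long long)(max_bytes / 16));
  return 0;
}
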
+
+
+static inline int
+gcm_check_aadlen_or_ivlen (u32 ctr[2])
+{
+ /* len(aad/iv) <= 2^64-1 bits ~= 2^61-1 bytes */
+ if (ctr[1] > 0x1fffffffU)
+ return 0;
+ if (ctr[1] < 0x1fffffffU)
+ return 1;
+
+ if (ctr[0] <= 0xffffffffU)
+ return 1;
+
+ return 0;
+}
+
+
+static void
+do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf,
+ size_t buflen, int do_padding)
+{
+ unsigned int blocksize = GCRY_GCM_BLOCK_LEN;
+ unsigned int unused = c->u_mode.gcm.mac_unused;
+ ghash_fn_t ghash_fn = c->u_mode.gcm.ghash_fn;
+ size_t nblocks, n;
+ unsigned int burn = 0;
+
+ if (buflen == 0 && (unused == 0 || !do_padding))
+ return;
+
+ do
+ {
+ if (buflen > 0 && (buflen + unused < blocksize || unused > 0))
+ {
+ n = blocksize - unused;
+ n = n < buflen ? n : buflen;
+
+ buf_cpy (&c->u_mode.gcm.macbuf[unused], buf, n);
+
+ unused += n;
+ buf += n;
+ buflen -= n;
+ }
+ if (!buflen)
+ {
+ if (!do_padding && unused < blocksize)
+ {
+ break;
+ }
+
+ n = blocksize - unused;
+ if (n > 0)
+ {
+ memset (&c->u_mode.gcm.macbuf[unused], 0, n);
+ unused = blocksize;
+ }
+ }
+
+ if (unused > 0)
+ {
+ gcry_assert (unused == blocksize);
+
+ /* Process one block from macbuf. */
+ burn = ghash_fn (c, hash, c->u_mode.gcm.macbuf, 1);
+ unused = 0;
+ }
+
+ nblocks = buflen / blocksize;
+
+ if (nblocks)
+ {
+ burn = ghash_fn (c, hash, buf, nblocks);
+ buf += blocksize * nblocks;
+ buflen -= blocksize * nblocks;
+ }
+ }
+ while (buflen > 0);
+
+ c->u_mode.gcm.mac_unused = unused;
+
+ if (burn)
+ _gcry_burn_stack (burn);
+}
+
+
+static gcry_err_code_t
+gcm_ctr_encrypt (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen,
+ const byte *inbuf, size_t inbuflen)
+{
+ gcry_err_code_t err = 0;
+
+ while (inbuflen)
+ {
+ u32 nblocks_to_overflow;
+ u32 num_ctr_increments;
+ u32 curr_ctr_low;
+ size_t currlen = inbuflen;
+ byte ctr_copy[GCRY_GCM_BLOCK_LEN];
+ int fix_ctr = 0;
+
+ /* GCM CTR increments only the least significant 32 bits of the counter,
+ * without carry into the upper 96 bits. Using the generic CTR
+ * implementation directly would carry a 32-bit overflow into the upper
+ * 96 bits. Detect whether the input is long enough to cause an overflow
+ * and limit the input length so that the CTR overflow happens, but the
+ * updated CTR value is not used to encrypt further input. After the
+ * overflow, the upper 96 bits of the CTR are restored to cancel out the
+ * modification done by the generic CTR encryption. */
+
+ if (inbuflen > c->unused)
+ {
+ curr_ctr_low = gcm_add32_be128 (c->u_ctr.ctr, 0);
+
+ /* Number of CTR increments this inbuflen would cause. */
+ num_ctr_increments = (inbuflen - c->unused) / GCRY_GCM_BLOCK_LEN +
+ !!((inbuflen - c->unused) % GCRY_GCM_BLOCK_LEN);
+
+ if ((u32)(num_ctr_increments + curr_ctr_low) < curr_ctr_low)
+ {
+ nblocks_to_overflow = 0xffffffffU - curr_ctr_low + 1;
+ currlen = nblocks_to_overflow * GCRY_GCM_BLOCK_LEN + c->unused;
+ if (currlen > inbuflen)
+ {
+ currlen = inbuflen;
+ }
+
+ fix_ctr = 1;
+ cipher_block_cpy(ctr_copy, c->u_ctr.ctr, GCRY_GCM_BLOCK_LEN);
+ }
+ }
+
+ err = _gcry_cipher_ctr_encrypt(c, outbuf, outbuflen, inbuf, currlen);
+ if (err != 0)
+ return err;
+
+ if (fix_ctr)
+ {
+ /* Lower 32-bits of CTR should now be zero. */
+ gcry_assert(gcm_add32_be128 (c->u_ctr.ctr, 0) == 0);
+
+ /* Restore upper part of CTR. */
+ buf_cpy(c->u_ctr.ctr, ctr_copy, GCRY_GCM_BLOCK_LEN - sizeof(u32));
+
+ wipememory(ctr_copy, sizeof(ctr_copy));
+ }
+
+ inbuflen -= currlen;
+ inbuf += currlen;
+ outbuflen -= currlen;
+ outbuf += currlen;
+ }
+
+ return err;
+}
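
As a concrete, hypothetical example of the wrap handling above: if the low 32 bits of the counter are currently 0xfffffffe and there is no cached keystream (c->unused == 0), then nblocks_to_overflow = 0xffffffff - 0xfffffffe + 1 = 2, so currlen = 2 * GCRY_GCM_BLOCK_LEN = 32 bytes are handed to the generic CTR code; the counter then wraps to zero and the saved upper 96 bits are copied back from ctr_copy before the remaining input is processed.
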
+
+
+static gcry_err_code_t
+gcm_crypt_inner (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen,
+ const byte *inbuf, size_t inbuflen, int encrypt)
+{
+ gcry_err_code_t err;
+
+ while (inbuflen)
+ {
+ size_t currlen = inbuflen;
+
+ /* Use a bulk method if available. */
+ if (c->bulk.gcm_crypt)
+ {
+ /* Bulk method requires that there is no cached data. */
+ if (inbuflen >= GCRY_GCM_BLOCK_LEN && c->u_mode.gcm.mac_unused == 0)
+ {
+ size_t nblks = inbuflen / GCRY_GCM_BLOCK_LEN;
+ size_t nleft;
+ size_t ndone;
+
+ nleft = c->bulk.gcm_crypt (c, outbuf, inbuf, nblks, encrypt);
+ ndone = nblks - nleft;
+
+ inbuf += ndone * GCRY_GCM_BLOCK_LEN;
+ outbuf += ndone * GCRY_GCM_BLOCK_LEN;
+ inbuflen -= ndone * GCRY_GCM_BLOCK_LEN;
+ outbuflen -= ndone * GCRY_GCM_BLOCK_LEN;
+
+ if (inbuflen == 0)
+ break;
+
+ currlen = inbuflen;
+ }
+ else if (c->u_mode.gcm.mac_unused > 0
+ && inbuflen >= GCRY_GCM_BLOCK_LEN
+ + (16 - c->u_mode.gcm.mac_unused))
+ {
+ /* Handle just enough data so that cache is depleted, and on
+ * next loop iteration use bulk method. */
+ currlen = 16 - c->u_mode.gcm.mac_unused;
+
+ gcry_assert(currlen);
+ }
+ }
+
+ /* Since checksumming is done after/before encryption/decryption,
+ * process input in 24KiB chunks to keep data loaded in L1 cache for
+ * checksumming/decryption. */
+ if (currlen > 24 * 1024)
+ currlen = 24 * 1024;
+
+ if (!encrypt)
+ do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, inbuf, currlen, 0);
+
+ err = gcm_ctr_encrypt(c, outbuf, outbuflen, inbuf, currlen);
+ if (err != 0)
+ return err;
+
+ if (encrypt)
+ do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, outbuf, currlen, 0);
+
+ outbuf += currlen;
+ inbuf += currlen;
+ outbuflen -= currlen;
+ inbuflen -= currlen;
+ }
+
+ return 0;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_gcm_encrypt (gcry_cipher_hd_t c,
+ byte *outbuf, size_t outbuflen,
+ const byte *inbuf, size_t inbuflen)
+{
+ static const unsigned char zerobuf[MAX_BLOCKSIZE];
+
+ if (c->spec->blocksize != GCRY_GCM_BLOCK_LEN)
+ return GPG_ERR_CIPHER_ALGO;
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+ if (c->u_mode.gcm.datalen_over_limits)
+ return GPG_ERR_INV_LENGTH;
+ if (c->marks.tag
+ || c->u_mode.gcm.ghash_data_finalized
+ || !c->u_mode.gcm.ghash_fn)
+ return GPG_ERR_INV_STATE;
+
+ if (!c->marks.iv)
+ _gcry_cipher_gcm_setiv (c, zerobuf, GCRY_GCM_BLOCK_LEN);
+
+ if (c->u_mode.gcm.disallow_encryption_because_of_setiv_in_fips_mode)
+ return GPG_ERR_INV_STATE;
+
+ if (!c->u_mode.gcm.ghash_aad_finalized)
+ {
+ /* Start of encryption marks end of AAD stream. */
+ do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, NULL, 0, 1);
+ c->u_mode.gcm.ghash_aad_finalized = 1;
+ }
+
+ gcm_bytecounter_add(c->u_mode.gcm.datalen, inbuflen);
+ if (!gcm_check_datalen(c->u_mode.gcm.datalen))
+ {
+ c->u_mode.gcm.datalen_over_limits = 1;
+ return GPG_ERR_INV_LENGTH;
+ }
+
+ return gcm_crypt_inner (c, outbuf, outbuflen, inbuf, inbuflen, 1);
+}
+
+
+gcry_err_code_t
+_gcry_cipher_gcm_decrypt (gcry_cipher_hd_t c,
+ byte *outbuf, size_t outbuflen,
+ const byte *inbuf, size_t inbuflen)
+{
+ static const unsigned char zerobuf[MAX_BLOCKSIZE];
+
+ if (c->spec->blocksize != GCRY_GCM_BLOCK_LEN)
+ return GPG_ERR_CIPHER_ALGO;
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+ if (c->u_mode.gcm.datalen_over_limits)
+ return GPG_ERR_INV_LENGTH;
+ if (c->marks.tag
+ || c->u_mode.gcm.ghash_data_finalized
+ || !c->u_mode.gcm.ghash_fn)
+ return GPG_ERR_INV_STATE;
+
+ if (!c->marks.iv)
+ _gcry_cipher_gcm_setiv (c, zerobuf, GCRY_GCM_BLOCK_LEN);
+
+ if (!c->u_mode.gcm.ghash_aad_finalized)
+ {
+ /* Start of decryption marks end of AAD stream. */
+ do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, NULL, 0, 1);
+ c->u_mode.gcm.ghash_aad_finalized = 1;
+ }
+
+ gcm_bytecounter_add(c->u_mode.gcm.datalen, inbuflen);
+ if (!gcm_check_datalen(c->u_mode.gcm.datalen))
+ {
+ c->u_mode.gcm.datalen_over_limits = 1;
+ return GPG_ERR_INV_LENGTH;
+ }
+
+ return gcm_crypt_inner (c, outbuf, outbuflen, inbuf, inbuflen, 0);
+}
+
+
+gcry_err_code_t
+_gcry_cipher_gcm_authenticate (gcry_cipher_hd_t c,
+ const byte * aadbuf, size_t aadbuflen)
+{
+ static const unsigned char zerobuf[MAX_BLOCKSIZE];
+
+ if (c->spec->blocksize != GCRY_GCM_BLOCK_LEN)
+ return GPG_ERR_CIPHER_ALGO;
+ if (c->u_mode.gcm.datalen_over_limits)
+ return GPG_ERR_INV_LENGTH;
+ if (c->marks.tag
+ || c->u_mode.gcm.ghash_aad_finalized
+ || c->u_mode.gcm.ghash_data_finalized
+ || !c->u_mode.gcm.ghash_fn)
+ return GPG_ERR_INV_STATE;
+
+ if (!c->marks.iv)
+ _gcry_cipher_gcm_setiv (c, zerobuf, GCRY_GCM_BLOCK_LEN);
+
+ gcm_bytecounter_add(c->u_mode.gcm.aadlen, aadbuflen);
+ if (!gcm_check_aadlen_or_ivlen(c->u_mode.gcm.aadlen))
+ {
+ c->u_mode.gcm.datalen_over_limits = 1;
+ return GPG_ERR_INV_LENGTH;
+ }
+
+ do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, aadbuf, aadbuflen, 0);
+
+ return 0;
+}
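+
+/* The three entry points above are normally reached through libgcrypt's
+ * public API: gcry_cipher_open with GCRY_CIPHER_MODE_GCM, followed by
+ * the generic setkey/setiv/authenticate/encrypt/gettag calls, which
+ * dispatch here through the mode_ops table.  A minimal caller-side
+ * sketch with placeholder key, IV and buffers; error handling omitted.
+ * It is illustrative only and therefore kept out of the build.  */
+#if 0 /* usage sketch */
+static void
+gcm_usage_sketch (void)
+{
+  gcry_cipher_hd_t hd;
+  unsigned char key[16] = { 0 };   /* placeholder key */
+  unsigned char iv[12] = { 0 };    /* placeholder 96-bit IV */
+  unsigned char aad[4] = "aad";
+  unsigned char buf[32] = { 0 };   /* plaintext, encrypted in place */
+  unsigned char tag[16];
+
+  gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_GCM, 0);
+  gcry_cipher_setkey (hd, key, sizeof key);
+  gcry_cipher_setiv (hd, iv, sizeof iv);
+  gcry_cipher_authenticate (hd, aad, sizeof aad);
+  gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0);
+  gcry_cipher_gettag (hd, tag, sizeof tag);
+  gcry_cipher_close (hd);
+}
+#endif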
+
+
+void
+_gcry_cipher_gcm_setkey (gcry_cipher_hd_t c)
+{
+ memset (c->u_mode.gcm.u_ghash_key.key, 0, GCRY_GCM_BLOCK_LEN);
+
+ c->spec->encrypt (&c->context.c, c->u_mode.gcm.u_ghash_key.key,
+ c->u_mode.gcm.u_ghash_key.key);
+ setupM (c);
+}
+
+
+static gcry_err_code_t
+_gcry_cipher_gcm_initiv (gcry_cipher_hd_t c, const byte *iv, size_t ivlen)
+{
+ memset (c->u_mode.gcm.aadlen, 0, sizeof(c->u_mode.gcm.aadlen));
+ memset (c->u_mode.gcm.datalen, 0, sizeof(c->u_mode.gcm.datalen));
+ memset (c->u_mode.gcm.u_tag.tag, 0, GCRY_GCM_BLOCK_LEN);
+ c->u_mode.gcm.datalen_over_limits = 0;
+ c->u_mode.gcm.ghash_data_finalized = 0;
+ c->u_mode.gcm.ghash_aad_finalized = 0;
+
+ if (ivlen == 0)
+ return GPG_ERR_INV_LENGTH;
+
+ if (ivlen != GCRY_GCM_BLOCK_LEN - 4)
+ {
+ u32 iv_bytes[2] = {0, 0};
+ u32 bitlengths[2][2];
+
+ if (!c->u_mode.gcm.ghash_fn)
+ return GPG_ERR_INV_STATE;
+
+ memset(c->u_ctr.ctr, 0, GCRY_GCM_BLOCK_LEN);
+
+ gcm_bytecounter_add(iv_bytes, ivlen);
+ if (!gcm_check_aadlen_or_ivlen(iv_bytes))
+ {
+ c->u_mode.gcm.datalen_over_limits = 1;
+ return GPG_ERR_INV_LENGTH;
+ }
+
+ do_ghash_buf(c, c->u_ctr.ctr, iv, ivlen, 1);
+
+ /* iv length, 64-bit */
+ bitlengths[1][1] = be_bswap32(iv_bytes[0] << 3);
+ bitlengths[1][0] = be_bswap32((iv_bytes[0] >> 29) |
+ (iv_bytes[1] << 3));
+ /* zeros, 64-bit */
+ bitlengths[0][1] = 0;
+ bitlengths[0][0] = 0;
+
+ do_ghash_buf(c, c->u_ctr.ctr, (byte*)bitlengths, GCRY_GCM_BLOCK_LEN, 1);
+
+ wipememory (iv_bytes, sizeof iv_bytes);
+ wipememory (bitlengths, sizeof bitlengths);
+ }
+ else
+ {
+ /* 96-bit IV is handled differently. */
+ memcpy (c->u_ctr.ctr, iv, ivlen);
+ c->u_ctr.ctr[12] = c->u_ctr.ctr[13] = c->u_ctr.ctr[14] = 0;
+ c->u_ctr.ctr[15] = 1;
+ }
+
+ c->spec->encrypt (&c->context.c, c->u_mode.gcm.tagiv, c->u_ctr.ctr);
+
+ gcm_add32_be128 (c->u_ctr.ctr, 1);
+
+ c->unused = 0;
+ c->marks.iv = 1;
+ c->marks.tag = 0;
+
+ return 0;
+}
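+
+/* The two IV cases above follow NIST SP 800-38D: for the special 96-bit
+ * (12 byte) IV the initial counter block J0 is simply IV || 0^31 || 1,
+ * while any other IV length is run through GHASH together with its bit
+ * length.  A minimal sketch of the 96-bit layout, mirroring the
+ * assignments above; the helper name is hypothetical and the code is
+ * kept out of the build.  */
+#if 0 /* illustrative sketch */
+static void
+gcm_j0_from_96bit_iv (unsigned char j0[16], const unsigned char iv[12])
+{
+  memcpy (j0, iv, 12);           /* leftmost 96 bits are the IV       */
+  j0[12] = j0[13] = j0[14] = 0;  /* 31 zero bits ...                  */
+  j0[15] = 1;                    /* ... followed by a final 1 bit     */
+}
+#endif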
+
+
+gcry_err_code_t
+_gcry_cipher_gcm_setiv (gcry_cipher_hd_t c, const byte *iv, size_t ivlen)
+{
+ c->marks.iv = 0;
+ c->marks.tag = 0;
+ c->u_mode.gcm.disallow_encryption_because_of_setiv_in_fips_mode = 0;
+
+ if (fips_mode ())
+ {
+ /* Direct invocation of GCM setiv in FIPS mode disables encryption. */
+ c->u_mode.gcm.disallow_encryption_because_of_setiv_in_fips_mode = 1;
+ }
+
+ return _gcry_cipher_gcm_initiv (c, iv, ivlen);
+}
+
+
+#if 0 && TODO
+void
+_gcry_cipher_gcm_geniv (gcry_cipher_hd_t c,
+ byte *ivout, size_t ivoutlen, const byte *nonce,
+ size_t noncelen)
+{
+ /* nonce: user provided part (might be null) */
+ /* noncelen: check if proper length (if nonce not null) */
+ /* ivout: iv used to initialize gcm, output to user */
+ /* ivoutlen: check correct size */
+ byte iv[IVLEN];
+
+ if (!ivout)
+ return GPG_ERR_INV_ARG;
+ if (ivoutlen != IVLEN)
+ return GPG_ERR_INV_LENGTH;
+ if (nonce != NULL && !is_nonce_ok_len(noncelen))
+ return GPG_ERR_INV_ARG;
+
+ gcm_generate_iv(iv, nonce, noncelen);
+
+ c->marks.iv = 0;
+ c->marks.tag = 0;
+ c->u_mode.gcm.disallow_encryption_because_of_setiv_in_fips_mode = 0;
+
+ _gcry_cipher_gcm_initiv (c, iv, IVLEN);
+
+ buf_cpy(ivout, iv, IVLEN);
+ wipememory(iv, sizeof(iv));
+}
+#endif
+
+
+static int
+is_tag_length_valid(size_t taglen)
+{
+ switch (taglen)
+ {
+ /* Allowed tag lengths from NIST SP 800-38D. */
+ case 128 / 8: /* GCRY_GCM_BLOCK_LEN */
+ case 120 / 8:
+ case 112 / 8:
+ case 104 / 8:
+ case 96 / 8:
+ case 64 / 8:
+ case 32 / 8:
+ return 1;
+
+ default:
+ return 0;
+ }
+}
+
+static gcry_err_code_t
+_gcry_cipher_gcm_tag (gcry_cipher_hd_t c,
+ byte * outbuf, size_t outbuflen, int check)
+{
+ if (!(is_tag_length_valid (outbuflen) || outbuflen >= GCRY_GCM_BLOCK_LEN))
+ return GPG_ERR_INV_LENGTH;
+ if (c->u_mode.gcm.datalen_over_limits)
+ return GPG_ERR_INV_LENGTH;
+
+ if (!c->marks.tag)
+ {
+ u32 bitlengths[2][2];
+
+ if (!c->u_mode.gcm.ghash_fn)
+ return GPG_ERR_INV_STATE;
+
+ /* aad length */
+ bitlengths[0][1] = be_bswap32(c->u_mode.gcm.aadlen[0] << 3);
+ bitlengths[0][0] = be_bswap32((c->u_mode.gcm.aadlen[0] >> 29) |
+ (c->u_mode.gcm.aadlen[1] << 3));
+ /* data length */
+ bitlengths[1][1] = be_bswap32(c->u_mode.gcm.datalen[0] << 3);
+ bitlengths[1][0] = be_bswap32((c->u_mode.gcm.datalen[0] >> 29) |
+ (c->u_mode.gcm.datalen[1] << 3));
+
+ /* Finalize data-stream. */
+ do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, NULL, 0, 1);
+ c->u_mode.gcm.ghash_aad_finalized = 1;
+ c->u_mode.gcm.ghash_data_finalized = 1;
+
+ /* Add bitlengths to tag. */
+ do_ghash_buf(c, c->u_mode.gcm.u_tag.tag, (byte*)bitlengths,
+ GCRY_GCM_BLOCK_LEN, 1);
+ cipher_block_xor (c->u_mode.gcm.u_tag.tag, c->u_mode.gcm.tagiv,
+ c->u_mode.gcm.u_tag.tag, GCRY_GCM_BLOCK_LEN);
+ c->marks.tag = 1;
+
+ wipememory (bitlengths, sizeof (bitlengths));
+ wipememory (c->u_mode.gcm.macbuf, GCRY_GCM_BLOCK_LEN);
+ wipememory (c->u_mode.gcm.tagiv, GCRY_GCM_BLOCK_LEN);
+ wipememory (c->u_mode.gcm.aadlen, sizeof (c->u_mode.gcm.aadlen));
+ wipememory (c->u_mode.gcm.datalen, sizeof (c->u_mode.gcm.datalen));
+ }
+
+ if (!check)
+ {
+ if (outbuflen > GCRY_GCM_BLOCK_LEN)
+ outbuflen = GCRY_GCM_BLOCK_LEN;
+
+ /* NB: We already checked that OUTBUF is large enough to hold
+ * the result or has valid truncated length. */
+ memcpy (outbuf, c->u_mode.gcm.u_tag.tag, outbuflen);
+ }
+ else
+ {
+ /* OUTBUFLEN gives the length of the user supplied tag in OUTBUF
+ * and thus we need to compare its length first. */
+ if (!is_tag_length_valid (outbuflen)
+ || !buf_eq_const (outbuf, c->u_mode.gcm.u_tag.tag, outbuflen))
+ return GPG_ERR_CHECKSUM;
+ }
+
+ return 0;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_gcm_get_tag (gcry_cipher_hd_t c, unsigned char *outtag,
+ size_t taglen)
+{
+  /* Outputting the authentication tag is part of encryption. */
+ if (c->u_mode.gcm.disallow_encryption_because_of_setiv_in_fips_mode)
+ return GPG_ERR_INV_STATE;
+
+ return _gcry_cipher_gcm_tag (c, outtag, taglen, 0);
+}
+
+gcry_err_code_t
+_gcry_cipher_gcm_check_tag (gcry_cipher_hd_t c, const unsigned char *intag,
+ size_t taglen)
+{
+ return _gcry_cipher_gcm_tag (c, (unsigned char *) intag, taglen, 1);
+}
diff --git a/comm/third_party/libgcrypt/cipher/cipher-internal.h b/comm/third_party/libgcrypt/cipher/cipher-internal.h
new file mode 100644
index 0000000000..59b36ce78b
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-internal.h
@@ -0,0 +1,809 @@
+/* cipher-internal.h - Internal defs for cipher.c
+ * Copyright (C) 2011 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser general Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef G10_CIPHER_INTERNAL_H
+#define G10_CIPHER_INTERNAL_H
+
+#include "./poly1305-internal.h"
+
+
+/* The maximum supported size of a block in bytes. */
+#define MAX_BLOCKSIZE 16
+
+/* The length for an OCB block. Although OCB supports any block
+ length it does not make sense to use a 64 bit blocklen (and cipher)
+ because this reduces the security margin to an unacceptable state.
+ Thus we require a cipher with 128 bit blocklength. */
+#define OCB_BLOCK_LEN (128/8)
+
+/* The size of the pre-computed L table for OCB. This takes the same
+ size as the table used for GCM and thus we don't save anything by
+ not using such a table. */
+#define OCB_L_TABLE_SIZE 16
+
+
+/* Check the above constants. */
+#if OCB_BLOCK_LEN > MAX_BLOCKSIZE
+# error OCB_BLOCK_LEN > MAX_BLOCKSIZE
+#endif
+
+
+
+/* Magic values for the context structure. */
+#define CTX_MAGIC_NORMAL 0x24091964
+#define CTX_MAGIC_SECURE 0x46919042
+
+/* Try to use 16 byte aligned cipher context for better performance.
+ We use the aligned attribute, thus it is only possible to implement
+ this with gcc. */
+#undef NEED_16BYTE_ALIGNED_CONTEXT
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define NEED_16BYTE_ALIGNED_CONTEXT 1
+#endif
+
+/* Undef this symbol to trade GCM speed for 256 bytes of memory per context */
+#define GCM_USE_TABLES 1
+
+
+/* GCM_USE_INTEL_PCLMUL indicates whether to compile GCM with Intel PCLMUL
+ code. */
+#undef GCM_USE_INTEL_PCLMUL
+#if defined(ENABLE_PCLMUL_SUPPORT) && defined(GCM_USE_TABLES)
+# if ((defined(__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__))
+# if __GNUC__ >= 4
+# define GCM_USE_INTEL_PCLMUL 1
+# endif
+# endif
+#endif /* GCM_USE_INTEL_PCLMUL */
+
+/* GCM_USE_ARM_PMULL indicates whether to compile GCM with ARMv8 PMULL code. */
+#undef GCM_USE_ARM_PMULL
+#if defined(ENABLE_ARM_CRYPTO_SUPPORT) && defined(GCM_USE_TABLES)
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO)
+# define GCM_USE_ARM_PMULL 1
+# elif defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+# define GCM_USE_ARM_PMULL 1
+# endif
+#endif /* GCM_USE_ARM_PMULL */
+
+/* GCM_USE_ARM_NEON indicates whether to compile GCM with ARMv7 NEON code. */
+#undef GCM_USE_ARM_NEON
+#if defined(GCM_USE_TABLES)
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_NEON)
+# define GCM_USE_ARM_NEON 1
+#endif
+#endif /* GCM_USE_ARM_NEON */
+
+/* GCM_USE_S390X_CRYPTO indicates whether to enable zSeries code. */
+#undef GCM_USE_S390X_CRYPTO
+#if defined(HAVE_GCC_INLINE_ASM_S390X)
+# define GCM_USE_S390X_CRYPTO 1
+#endif /* GCM_USE_S390X_CRYPTO */
+
+typedef unsigned int (*ghash_fn_t) (gcry_cipher_hd_t c, byte *result,
+ const byte *buf, size_t nblocks);
+
+
+/* A structure with function pointers for mode operations. */
+typedef struct cipher_mode_ops
+{
+ gcry_err_code_t (*encrypt)(gcry_cipher_hd_t c, unsigned char *outbuf,
+ size_t outbuflen, const unsigned char *inbuf,
+ size_t inbuflen);
+ gcry_err_code_t (*decrypt)(gcry_cipher_hd_t c, unsigned char *outbuf,
+ size_t outbuflen, const unsigned char *inbuf,
+ size_t inbuflen);
+ gcry_err_code_t (*setiv)(gcry_cipher_hd_t c, const unsigned char *iv,
+ size_t ivlen);
+
+ gcry_err_code_t (*authenticate)(gcry_cipher_hd_t c,
+ const unsigned char *abuf, size_t abuflen);
+ gcry_err_code_t (*get_tag)(gcry_cipher_hd_t c, unsigned char *outtag,
+ size_t taglen);
+ gcry_err_code_t (*check_tag)(gcry_cipher_hd_t c, const unsigned char *intag,
+ size_t taglen);
+} cipher_mode_ops_t;
+
+
+/* A structure with function pointers for bulk operations. The cipher
+ algorithm setkey function initializes them when bulk operations are
+ available and the actual encryption routines use them if they are
+ not NULL. */
+typedef struct cipher_bulk_ops
+{
+ void (*cfb_enc)(void *context, unsigned char *iv, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks);
+ void (*cfb_dec)(void *context, unsigned char *iv, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks);
+ void (*cbc_enc)(void *context, unsigned char *iv, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int cbc_mac);
+ void (*cbc_dec)(void *context, unsigned char *iv, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks);
+ void (*ofb_enc)(void *context, unsigned char *iv, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks);
+ void (*ctr_enc)(void *context, unsigned char *iv, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks);
+ size_t (*ocb_crypt)(gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int encrypt);
+ size_t (*ocb_auth)(gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks);
+ void (*xts_crypt)(void *context, unsigned char *tweak, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int encrypt);
+ size_t (*gcm_crypt)(gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int encrypt);
+} cipher_bulk_ops_t;
+
+
+/* A VIA processor with the Padlock engine as well as the Intel AES_NI
+ instructions require an alignment of most data on a 16 byte
+ boundary. Because we trick out the compiler while allocating the
+ context, the align attribute as used in rijndael.c does not work on
+ its own. Thus we need to make sure that the entire context
+   structure is aligned on that boundary.  We achieve this by
+   defining a new type and using that instead of our usual alignment
+ type. */
+typedef union
+{
+ PROPERLY_ALIGNED_TYPE foo;
+#ifdef NEED_16BYTE_ALIGNED_CONTEXT
+ char bar[16] __attribute__ ((aligned (16)));
+#endif
+ char c[1];
+} cipher_context_alignment_t;
+
+
+/* Storage structure for CMAC, for CMAC and EAX modes. */
+typedef struct {
+ /* The initialization vector. Also contains tag after finalization. */
+ union {
+ cipher_context_alignment_t iv_align;
+ unsigned char iv[MAX_BLOCKSIZE];
+ } u_iv;
+
+ /* Subkeys for tag creation, not cleared by gcry_cipher_reset. */
+ unsigned char subkeys[2][MAX_BLOCKSIZE];
+
+ /* Space to save partial input lengths for MAC. */
+ unsigned char macbuf[MAX_BLOCKSIZE];
+
+ int mac_unused; /* Number of unprocessed bytes in MACBUF. */
+ unsigned int tag:1; /* Set to 1 if tag has been finalized. */
+} gcry_cmac_context_t;
+
+
+/* The handle structure. */
+struct gcry_cipher_handle
+{
+ int magic;
+ size_t actual_handle_size; /* Allocated size of this handle. */
+ size_t handle_offset; /* Offset to the malloced block. */
+ gcry_cipher_spec_t *spec;
+
+ /* The algorithm id. This is a hack required because the module
+     interface does not easily allow retrieving this value. */
+ int algo;
+
+ /* A structure with function pointers for mode operations. */
+ cipher_mode_ops_t mode_ops;
+
+ /* A structure with function pointers for bulk operations. Due to
+ limitations of the module system (we don't want to change the
+ API) we need to keep these function pointers here. */
+ cipher_bulk_ops_t bulk;
+
+ int mode;
+ unsigned int flags;
+
+ struct {
+ unsigned int key:1; /* Set to 1 if a key has been set. */
+    unsigned int iv:1;  /* Set to 1 if an IV has been set.  */
+ unsigned int tag:1; /* Set to 1 if a tag is finalized. */
+ unsigned int finalize:1; /* Next encrypt/decrypt has the final data. */
+ unsigned int allow_weak_key:1; /* Set to 1 if weak keys are allowed. */
+ } marks;
+
+ /* The initialization vector. For best performance we make sure
+ that it is properly aligned. In particular some implementations
+     of bulk operations expect a 16 byte aligned IV. IV is also used
+ to store CBC-MAC in CCM mode; counter IV is stored in U_CTR. For
+ OCB mode it is used for the offset value. */
+ union {
+ cipher_context_alignment_t iv_align;
+ unsigned char iv[MAX_BLOCKSIZE];
+ } u_iv;
+
+ /* The counter for CTR mode. This field is also used by AESWRAP and
+ thus we can't use the U_IV union. For OCB mode it is used for
+ the checksum. */
+ union {
+ cipher_context_alignment_t iv_align;
+ unsigned char ctr[MAX_BLOCKSIZE];
+ } u_ctr;
+
+ /* Space to save an IV or CTR for chaining operations. */
+ unsigned char lastiv[MAX_BLOCKSIZE];
+ int unused; /* Number of unused bytes in LASTIV. */
+
+ union {
+ /* Mode specific storage for CCM mode. */
+ struct {
+ u64 encryptlen;
+ u64 aadlen;
+ unsigned int authlen;
+
+ /* Space to save partial input lengths for MAC. */
+ unsigned char macbuf[GCRY_CCM_BLOCK_LEN];
+ int mac_unused; /* Number of unprocessed bytes in MACBUF. */
+
+ unsigned char s0[GCRY_CCM_BLOCK_LEN];
+
+ unsigned int nonce:1; /* Set to 1 if nonce has been set. */
+      unsigned int lengths:1; /* Set to 1 if CCM length parameters have been
+ processed. */
+ } ccm;
+
+ /* Mode specific storage for Poly1305 mode. */
+ struct {
+ /* byte counter for AAD. */
+ u32 aadcount[2];
+
+ /* byte counter for data. */
+ u32 datacount[2];
+
+ unsigned int aad_finalized:1;
+ unsigned int bytecount_over_limits:1;
+
+ poly1305_context_t ctx;
+ } poly1305;
+
+ /* Mode specific storage for CMAC mode. */
+ gcry_cmac_context_t cmac;
+
+ /* Mode specific storage for EAX mode. */
+ struct {
+ /* CMAC for header (AAD). */
+ gcry_cmac_context_t cmac_header;
+
+ /* CMAC for ciphertext. */
+ gcry_cmac_context_t cmac_ciphertext;
+ } eax;
+
+ /* Mode specific storage for GCM mode. */
+ struct {
+ /* The interim tag for GCM mode. */
+ union {
+ cipher_context_alignment_t iv_align;
+ unsigned char tag[MAX_BLOCKSIZE];
+ } u_tag;
+
+ /* Space to save partial input lengths for MAC. */
+ unsigned char macbuf[GCRY_CCM_BLOCK_LEN];
+ int mac_unused; /* Number of unprocessed bytes in MACBUF. */
+
+ /* byte counters for GCM */
+ u32 aadlen[2];
+ u32 datalen[2];
+
+ /* encrypted tag counter */
+ unsigned char tagiv[MAX_BLOCKSIZE];
+
+ unsigned int ghash_data_finalized:1;
+ unsigned int ghash_aad_finalized:1;
+
+ unsigned int datalen_over_limits:1;
+ unsigned int disallow_encryption_because_of_setiv_in_fips_mode:1;
+
+ /* --- Following members are not cleared in gcry_cipher_reset --- */
+
+ /* GHASH multiplier from key. */
+ union {
+ cipher_context_alignment_t iv_align;
+ unsigned char key[MAX_BLOCKSIZE];
+ } u_ghash_key;
+
+ /* GHASH implementation in use. */
+ ghash_fn_t ghash_fn;
+
+ /* Pre-calculated table for GCM. */
+#ifdef GCM_USE_TABLES
+ #if (SIZEOF_UNSIGNED_LONG == 8 || defined(__x86_64__))
+ #define GCM_TABLES_USE_U64 1
+ u64 gcm_table[4 * 16];
+ #else
+ #undef GCM_TABLES_USE_U64
+ u32 gcm_table[8 * 16];
+ #endif
+#endif
+ } gcm;
+
+ /* Mode specific storage for OCB mode. */
+ struct {
+ /* --- Following members are not cleared in gcry_cipher_reset --- */
+
+ /* Helper variables and pre-computed table of L values. */
+ unsigned char L_star[OCB_BLOCK_LEN];
+ unsigned char L_dollar[OCB_BLOCK_LEN];
+ unsigned char L0L1[OCB_BLOCK_LEN];
+ unsigned char L[OCB_L_TABLE_SIZE][OCB_BLOCK_LEN];
+
+ /* --- Following members are cleared in gcry_cipher_reset --- */
+
+ /* The tag is valid if marks.tag has been set. */
+ unsigned char tag[OCB_BLOCK_LEN];
+
+ /* A buffer to hold the offset for the AAD processing. */
+ unsigned char aad_offset[OCB_BLOCK_LEN];
+
+ /* A buffer to hold the current sum of AAD processing. We can't
+ use tag here because tag may already hold the preprocessed
+ checksum of the data. */
+ unsigned char aad_sum[OCB_BLOCK_LEN];
+
+ /* A buffer to store AAD data not yet processed. */
+ unsigned char aad_leftover[OCB_BLOCK_LEN];
+
+ /* Number of data/aad blocks processed so far. */
+ u64 data_nblocks;
+ u64 aad_nblocks;
+
+ /* Number of valid bytes in AAD_LEFTOVER. */
+ unsigned char aad_nleftover;
+
+ /* Length of the tag. Fixed for now but may eventually be
+ specified using a set of gcry_cipher_flags. */
+ unsigned char taglen;
+
+ /* Flags indicating that the final data/aad block has been
+ processed. */
+ unsigned int data_finalized:1;
+ unsigned int aad_finalized:1;
+ } ocb;
+
+ /* Mode specific storage for XTS mode. */
+ struct {
+ /* Pointer to tweak cipher context, allocated after actual
+ * cipher context. */
+ char *tweak_context;
+ } xts;
+ } u_mode;
+
+ /* What follows are two contexts of the cipher in use. The first
+ one needs to be aligned well enough for the cipher operation
+ whereas the second one is a copy created by cipher_setkey and
+ used by cipher_reset. That second copy has no need for proper
+     alignment because it is only accessed by memcpy. */
+ cipher_context_alignment_t context;
+};
+
+
+/*-- cipher-cbc.c --*/
+gcry_err_code_t _gcry_cipher_cbc_encrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_cbc_decrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_cbc_cts_encrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_cbc_cts_decrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+
+/*-- cipher-cfb.c --*/
+gcry_err_code_t _gcry_cipher_cfb_encrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_cfb_decrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_cfb8_encrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_cfb8_decrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+
+
+/*-- cipher-ofb.c --*/
+gcry_err_code_t _gcry_cipher_ofb_encrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+
+/*-- cipher-ctr.c --*/
+gcry_err_code_t _gcry_cipher_ctr_encrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+
+
+/*-- cipher-aeswrap.c --*/
+gcry_err_code_t _gcry_cipher_aeswrap_encrypt
+/* */ (gcry_cipher_hd_t c,
+ byte *outbuf, size_t outbuflen,
+ const byte *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_aeswrap_decrypt
+/* */ (gcry_cipher_hd_t c,
+ byte *outbuf, size_t outbuflen,
+ const byte *inbuf, size_t inbuflen);
+
+
+/*-- cipher-ccm.c --*/
+gcry_err_code_t _gcry_cipher_ccm_encrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_ccm_decrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_ccm_set_nonce
+/* */ (gcry_cipher_hd_t c, const unsigned char *nonce,
+ size_t noncelen);
+gcry_err_code_t _gcry_cipher_ccm_authenticate
+/* */ (gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen);
+gcry_err_code_t _gcry_cipher_ccm_set_lengths
+/* */ (gcry_cipher_hd_t c, u64 encryptedlen, u64 aadlen, u64 taglen);
+gcry_err_code_t _gcry_cipher_ccm_get_tag
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outtag, size_t taglen);
+gcry_err_code_t _gcry_cipher_ccm_check_tag
+/* */ (gcry_cipher_hd_t c,
+ const unsigned char *intag, size_t taglen);
+
+
+/*-- cipher-cmac.c --*/
+gcry_err_code_t _gcry_cmac_generate_subkeys
+/* */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx);
+gcry_err_code_t _gcry_cmac_write
+/* */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx,
+ const byte * inbuf, size_t inlen);
+gcry_err_code_t _gcry_cmac_final
+/* */ (gcry_cipher_hd_t c, gcry_cmac_context_t *ctx);
+void _gcry_cmac_reset (gcry_cmac_context_t *ctx);
+
+
+/*-- cipher-eax.c --*/
+gcry_err_code_t _gcry_cipher_eax_encrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_eax_decrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_eax_set_nonce
+/* */ (gcry_cipher_hd_t c,
+ const unsigned char *nonce, size_t noncelen);
+gcry_err_code_t _gcry_cipher_eax_authenticate
+/* */ (gcry_cipher_hd_t c,
+ const unsigned char *aadbuf, size_t aadbuflen);
+gcry_err_code_t _gcry_cipher_eax_get_tag
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outtag, size_t taglen);
+gcry_err_code_t _gcry_cipher_eax_check_tag
+/* */ (gcry_cipher_hd_t c,
+ const unsigned char *intag, size_t taglen);
+gcry_err_code_t _gcry_cipher_eax_setkey
+/* */ (gcry_cipher_hd_t c);
+
+
+/*-- cipher-gcm.c --*/
+gcry_err_code_t _gcry_cipher_gcm_encrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_gcm_decrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_gcm_setiv
+/* */ (gcry_cipher_hd_t c,
+ const unsigned char *iv, size_t ivlen);
+gcry_err_code_t _gcry_cipher_gcm_authenticate
+/* */ (gcry_cipher_hd_t c,
+ const unsigned char *aadbuf, size_t aadbuflen);
+gcry_err_code_t _gcry_cipher_gcm_get_tag
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outtag, size_t taglen);
+gcry_err_code_t _gcry_cipher_gcm_check_tag
+/* */ (gcry_cipher_hd_t c,
+ const unsigned char *intag, size_t taglen);
+void _gcry_cipher_gcm_setkey
+/* */ (gcry_cipher_hd_t c);
+
+
+/*-- cipher-poly1305.c --*/
+gcry_err_code_t _gcry_cipher_poly1305_encrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_poly1305_decrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_poly1305_setiv
+/* */ (gcry_cipher_hd_t c,
+ const unsigned char *iv, size_t ivlen);
+gcry_err_code_t _gcry_cipher_poly1305_authenticate
+/* */ (gcry_cipher_hd_t c,
+ const unsigned char *aadbuf, size_t aadbuflen);
+gcry_err_code_t _gcry_cipher_poly1305_get_tag
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outtag, size_t taglen);
+gcry_err_code_t _gcry_cipher_poly1305_check_tag
+/* */ (gcry_cipher_hd_t c,
+ const unsigned char *intag, size_t taglen);
+void _gcry_cipher_poly1305_setkey
+/* */ (gcry_cipher_hd_t c);
+
+
+/*-- chacha20.c --*/
+gcry_err_code_t _gcry_chacha20_poly1305_encrypt
+/* */ (gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf,
+ size_t length);
+gcry_err_code_t _gcry_chacha20_poly1305_decrypt
+/* */ (gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf,
+ size_t length);
+
+
+/*-- cipher-ocb.c --*/
+gcry_err_code_t _gcry_cipher_ocb_encrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_ocb_decrypt
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_ocb_set_nonce
+/* */ (gcry_cipher_hd_t c, const unsigned char *nonce,
+ size_t noncelen);
+gcry_err_code_t _gcry_cipher_ocb_authenticate
+/* */ (gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen);
+gcry_err_code_t _gcry_cipher_ocb_get_tag
+/* */ (gcry_cipher_hd_t c,
+ unsigned char *outtag, size_t taglen);
+gcry_err_code_t _gcry_cipher_ocb_check_tag
+/* */ (gcry_cipher_hd_t c,
+ const unsigned char *intag, size_t taglen);
+void _gcry_cipher_ocb_setkey
+/* */ (gcry_cipher_hd_t c);
+
+
+/*-- cipher-xts.c --*/
+gcry_err_code_t _gcry_cipher_xts_encrypt
+/* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+gcry_err_code_t _gcry_cipher_xts_decrypt
+/* */ (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen);
+
+
+/* Return the L-value for block N.  Note: 'cipher-ocb.c' ensures that N
+ * will never be a multiple of 65536 (1 << OCB_L_TABLE_SIZE), thus N can
+ * be passed directly to the _gcry_ctz() function and the resulting
+ * index will never overflow the table. */
+static inline const unsigned char *
+ocb_get_l (gcry_cipher_hd_t c, u64 n)
+{
+ unsigned long ntz;
+
+#if ((defined(__i386__) || defined(__x86_64__)) && __GNUC__ >= 4)
+ /* Assumes that N != 0. */
+ asm ("rep;bsfl %k[low], %k[ntz]\n\t"
+ : [ntz] "=r" (ntz)
+ : [low] "r" ((unsigned long)n)
+ : "cc");
+#else
+ ntz = _gcry_ctz (n);
+#endif
+
+ return c->u_mode.ocb.L[ntz];
+}
+
+
+/* Return bit-shift of blocksize. */
+static inline unsigned int _gcry_blocksize_shift(gcry_cipher_hd_t c)
+{
+  /* Only blocksizes 8 and 16 are used.  Return the value in such a way
+   * that the compiler can optimize calling functions based on it. */
+ return c->spec->blocksize == 8 ? 3 : 4;
+}
+
+
+/* Optimized function for adding value to cipher block. */
+static inline void
+cipher_block_add(void *_dstsrc, unsigned int add, size_t blocksize)
+{
+ byte *dstsrc = _dstsrc;
+ u64 s[2];
+
+ if (blocksize == 8)
+ {
+ buf_put_be64(dstsrc + 0, buf_get_be64(dstsrc + 0) + add);
+ }
+ else /* blocksize == 16 */
+ {
+ s[0] = buf_get_be64(dstsrc + 8);
+ s[1] = buf_get_be64(dstsrc + 0);
+ s[0] += add;
+ s[1] += (s[0] < add);
+ buf_put_be64(dstsrc + 8, s[0]);
+ buf_put_be64(dstsrc + 0, s[1]);
+ }
+}
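+
+/* For the 16 byte case the function above is plain big-endian 128-bit
+ * addition of ADD to the block.  A byte-wise sketch of the same
+ * semantics; the helper is hypothetical and kept out of the build.  */
+#if 0 /* illustrative sketch */
+static inline void
+cipher_block16_add_be_ref (unsigned char *block, unsigned int add)
+{
+  u64 carry = add;
+  int i;
+
+  /* Propagate the addition from the least significant (last) byte. */
+  for (i = 15; i >= 0 && carry; i--)
+    {
+      carry += block[i];
+      block[i] = carry & 0xff;
+      carry >>= 8;
+    }
+}
+#endif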
+
+
+/* Optimized function for cipher block copying */
+static inline void
+cipher_block_cpy(void *_dst, const void *_src, size_t blocksize)
+{
+ byte *dst = _dst;
+ const byte *src = _src;
+ u64 s[2];
+
+ if (blocksize == 8)
+ {
+ buf_put_he64(dst + 0, buf_get_he64(src + 0));
+ }
+ else /* blocksize == 16 */
+ {
+ s[0] = buf_get_he64(src + 0);
+ s[1] = buf_get_he64(src + 8);
+ buf_put_he64(dst + 0, s[0]);
+ buf_put_he64(dst + 8, s[1]);
+ }
+}
+
+
+/* Optimized function for cipher block xoring */
+static inline void
+cipher_block_xor(void *_dst, const void *_src1, const void *_src2,
+ size_t blocksize)
+{
+ byte *dst = _dst;
+ const byte *src1 = _src1;
+ const byte *src2 = _src2;
+ u64 s1[2];
+ u64 s2[2];
+
+ if (blocksize == 8)
+ {
+ buf_put_he64(dst + 0, buf_get_he64(src1 + 0) ^ buf_get_he64(src2 + 0));
+ }
+ else /* blocksize == 16 */
+ {
+ s1[0] = buf_get_he64(src1 + 0);
+ s1[1] = buf_get_he64(src1 + 8);
+ s2[0] = buf_get_he64(src2 + 0);
+ s2[1] = buf_get_he64(src2 + 8);
+ buf_put_he64(dst + 0, s1[0] ^ s2[0]);
+ buf_put_he64(dst + 8, s1[1] ^ s2[1]);
+ }
+}
+
+
+/* Optimized function for in-place cipher block xoring */
+static inline void
+cipher_block_xor_1(void *_dst, const void *_src, size_t blocksize)
+{
+ cipher_block_xor (_dst, _dst, _src, blocksize);
+}
+
+
+/* Optimized function for cipher block xoring with two destination cipher
+ blocks. Used mainly by CFB mode encryption. */
+static inline void
+cipher_block_xor_2dst(void *_dst1, void *_dst2, const void *_src,
+ size_t blocksize)
+{
+ byte *dst1 = _dst1;
+ byte *dst2 = _dst2;
+ const byte *src = _src;
+ u64 d2[2];
+ u64 s[2];
+
+ if (blocksize == 8)
+ {
+ d2[0] = buf_get_he64(dst2 + 0) ^ buf_get_he64(src + 0);
+ buf_put_he64(dst2 + 0, d2[0]);
+ buf_put_he64(dst1 + 0, d2[0]);
+ }
+ else /* blocksize == 16 */
+ {
+ s[0] = buf_get_he64(src + 0);
+ s[1] = buf_get_he64(src + 8);
+ d2[0] = buf_get_he64(dst2 + 0);
+ d2[1] = buf_get_he64(dst2 + 8);
+ d2[0] = d2[0] ^ s[0];
+ d2[1] = d2[1] ^ s[1];
+ buf_put_he64(dst2 + 0, d2[0]);
+ buf_put_he64(dst2 + 8, d2[1]);
+ buf_put_he64(dst1 + 0, d2[0]);
+ buf_put_he64(dst1 + 8, d2[1]);
+ }
+}
+
+
+/* Optimized function for combined cipher block xoring and copying.
+   Used mainly by CBC mode decryption. */
+static inline void
+cipher_block_xor_n_copy_2(void *_dst_xor, const void *_src_xor,
+ void *_srcdst_cpy, const void *_src_cpy,
+ size_t blocksize)
+{
+ byte *dst_xor = _dst_xor;
+ byte *srcdst_cpy = _srcdst_cpy;
+ const byte *src_xor = _src_xor;
+ const byte *src_cpy = _src_cpy;
+ u64 sc[2];
+ u64 sx[2];
+ u64 sdc[2];
+
+ if (blocksize == 8)
+ {
+ sc[0] = buf_get_he64(src_cpy + 0);
+ buf_put_he64(dst_xor + 0,
+ buf_get_he64(srcdst_cpy + 0) ^ buf_get_he64(src_xor + 0));
+ buf_put_he64(srcdst_cpy + 0, sc[0]);
+ }
+ else /* blocksize == 16 */
+ {
+ sc[0] = buf_get_he64(src_cpy + 0);
+ sc[1] = buf_get_he64(src_cpy + 8);
+ sx[0] = buf_get_he64(src_xor + 0);
+ sx[1] = buf_get_he64(src_xor + 8);
+ sdc[0] = buf_get_he64(srcdst_cpy + 0);
+ sdc[1] = buf_get_he64(srcdst_cpy + 8);
+ sx[0] ^= sdc[0];
+ sx[1] ^= sdc[1];
+ buf_put_he64(dst_xor + 0, sx[0]);
+ buf_put_he64(dst_xor + 8, sx[1]);
+ buf_put_he64(srcdst_cpy + 0, sc[0]);
+ buf_put_he64(srcdst_cpy + 8, sc[1]);
+ }
+}
+
+
+/* Optimized function for combined cipher block xoring and copying.
+   Used mainly by CFB mode decryption. */
+static inline void
+cipher_block_xor_n_copy(void *_dst_xor, void *_srcdst_cpy, const void *_src,
+ size_t blocksize)
+{
+ cipher_block_xor_n_copy_2(_dst_xor, _src, _srcdst_cpy, _src, blocksize);
+}
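+
+/* A sketch of how CFB decryption typically drives the helper above: the
+ * IV buffer holds the keystream block E_K(previous IV); a single call
+ * produces the plaintext and at the same time stores the ciphertext
+ * block as the next IV.  The wrapper name is hypothetical and the code
+ * is kept out of the build.  */
+#if 0 /* illustrative sketch */
+static inline void
+cfb_dec_block_sketch (unsigned char *plain, unsigned char *iv,
+                      const unsigned char *cipher, size_t blocksize)
+{
+  /* plain = iv ^ cipher;  iv = cipher  -- done in one pass. */
+  cipher_block_xor_n_copy (plain, iv, cipher, blocksize);
+}
+#endif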
+
+
+#endif /*G10_CIPHER_INTERNAL_H*/
diff --git a/comm/third_party/libgcrypt/cipher/cipher-ocb.c b/comm/third_party/libgcrypt/cipher/cipher-ocb.c
new file mode 100644
index 0000000000..24db6a9e2c
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-ocb.c
@@ -0,0 +1,761 @@
+/* cipher-ocb.c - OCB cipher mode
+ * Copyright (C) 2015, 2016 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser general Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * OCB is covered by several patents but may be used freely by most
+ * software. See http://web.cs.ucdavis.edu/~rogaway/ocb/license.htm .
+ * In particular license 1 is suitable for Libgcrypt: See
+ * http://web.cs.ucdavis.edu/~rogaway/ocb/license1.pdf for the full
+ * license document; it basically says:
+ *
+ * License 1 — License for Open-Source Software Implementations of OCB
+ * (Jan 9, 2013)
+ *
+ * Under this license, you are authorized to make, use, and
+ * distribute open-source software implementations of OCB. This
+ * license terminates for you if you sue someone over their
+ * open-source software implementation of OCB claiming that you have
+ * a patent covering their implementation.
+ */
+
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "./cipher-internal.h"
+
+
+/* Double the OCB_BLOCK_LEN sized block B in-place. */
+static inline void
+double_block (u64 b[2])
+{
+ u64 l_0, l, r;
+
+ l = b[1];
+ r = b[0];
+
+ l_0 = -(l >> 63);
+ l = (l + l) ^ (r >> 63);
+ r = (r + r) ^ (l_0 & 135);
+
+ b[1] = l;
+ b[0] = r;
+}
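+
+/* The word-based doubling above is multiplication by x in GF(2^128)
+ * using OCB's reduction constant 135 (0x87): shift the 128-bit
+ * big-endian block left by one bit and, if the dropped MSB was set,
+ * xor 0x87 into the low byte.  A byte-wise sketch of the same
+ * operation; illustrative only and kept out of the build.  */
+#if 0 /* illustrative sketch */
+static void
+double_block_bytes_ref (unsigned char b[16])
+{
+  unsigned char carry = b[0] >> 7;  /* MSB of the 128-bit value */
+  int i;
+
+  for (i = 0; i < 15; i++)
+    b[i] = (b[i] << 1) | (b[i + 1] >> 7);
+  b[15] = (b[15] << 1) ^ (carry ? 0x87 : 0);
+}
+#endif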
+
+
+/* Copy OCB_BLOCK_LEN bytes from buffer S, starting at bit offset
+ * BITOFF, to buffer D. */
+static void
+bit_copy (unsigned char d[16], const unsigned char s[24], unsigned int bitoff)
+{
+ u64 s0l, s1l, s1r, s2r;
+ unsigned int shift;
+ unsigned int byteoff;
+
+ byteoff = bitoff / 8;
+ shift = bitoff % 8;
+
+ s0l = buf_get_be64 (s + byteoff + 0);
+ s1l = buf_get_be64 (s + byteoff + 8);
+ s1r = shift ? s1l : 0;
+ s2r = shift ? buf_get_be64 (s + 16) << (8 * byteoff) : 0;
+
+ buf_put_be64 (d + 0, (s0l << shift) | (s1r >> ((64 - shift) & 63)));
+ buf_put_be64 (d + 8, (s1l << shift) | (s2r >> ((64 - shift) & 63)));
+}
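+
+/* bit_copy() extracts 128 bits of S starting at bit offset BITOFF
+ * (0..63); this is the "Offset_0 = Stretch[1+bottom..128+bottom]" step
+ * of OCB nonce setup below.  A simple byte-wise sketch of the same
+ * extraction; illustrative only and kept out of the build.  */
+#if 0 /* illustrative sketch */
+static void
+bit_copy_bytes_ref (unsigned char d[16], const unsigned char s[24],
+                    unsigned int bitoff)
+{
+  unsigned int byteoff = bitoff / 8;
+  unsigned int shift = bitoff % 8;
+  int i;
+
+  for (i = 0; i < 16; i++)
+    {
+      unsigned int v = ((unsigned int)s[byteoff + i] << 8)
+                       | s[byteoff + i + 1];
+      d[i] = (v >> (8 - shift)) & 0xff;
+    }
+}
+#endif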
+
+
+/* Get L_big value for block N, where N is multiple of 65536. */
+static void
+ocb_get_L_big (gcry_cipher_hd_t c, u64 n, unsigned char *l_buf)
+{
+ int ntz = _gcry_ctz64 (n);
+ u64 L[2];
+
+ gcry_assert(ntz >= OCB_L_TABLE_SIZE);
+
+ L[1] = buf_get_be64 (c->u_mode.ocb.L[OCB_L_TABLE_SIZE - 1]);
+ L[0] = buf_get_be64 (c->u_mode.ocb.L[OCB_L_TABLE_SIZE - 1] + 8);
+
+ for (ntz -= OCB_L_TABLE_SIZE - 1; ntz; ntz--)
+ double_block (L);
+
+ buf_put_be64 (l_buf + 0, L[1]);
+ buf_put_be64 (l_buf + 8, L[0]);
+}
+
+
+/* Called after key has been set. Sets up L table. */
+void _gcry_cipher_ocb_setkey (gcry_cipher_hd_t c)
+{
+ unsigned char ktop[OCB_BLOCK_LEN];
+ unsigned int burn = 0;
+ unsigned int nburn;
+ u64 L[2];
+ int i;
+
+ /* L_star = E(zero_128) */
+ memset (ktop, 0, OCB_BLOCK_LEN);
+ nburn = c->spec->encrypt (&c->context.c, c->u_mode.ocb.L_star, ktop);
+ burn = nburn > burn ? nburn : burn;
+ /* L_dollar = double(L_star) */
+ L[1] = buf_get_be64 (c->u_mode.ocb.L_star);
+ L[0] = buf_get_be64 (c->u_mode.ocb.L_star + 8);
+ double_block (L);
+ buf_put_be64 (c->u_mode.ocb.L_dollar + 0, L[1]);
+ buf_put_be64 (c->u_mode.ocb.L_dollar + 8, L[0]);
+ /* L_0 = double(L_dollar), ... */
+ double_block (L);
+ buf_put_be64 (c->u_mode.ocb.L[0] + 0, L[1]);
+ buf_put_be64 (c->u_mode.ocb.L[0] + 8, L[0]);
+ for (i = 1; i < OCB_L_TABLE_SIZE; i++)
+ {
+ double_block (L);
+ buf_put_be64 (c->u_mode.ocb.L[i] + 0, L[1]);
+ buf_put_be64 (c->u_mode.ocb.L[i] + 8, L[0]);
+ }
+ /* Precalculated offset L0+L1 */
+ cipher_block_xor (c->u_mode.ocb.L0L1,
+ c->u_mode.ocb.L[0], c->u_mode.ocb.L[1], OCB_BLOCK_LEN);
+
+ /* Cleanup */
+ wipememory (ktop, sizeof ktop);
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4*sizeof(void*));
+}
+
+
+/* Set the nonce for OCB. This requires that the key has been set.
+   Using it again starts a new encryption cycle using the same
+ key. */
+gcry_err_code_t
+_gcry_cipher_ocb_set_nonce (gcry_cipher_hd_t c, const unsigned char *nonce,
+ size_t noncelen)
+{
+ unsigned char ktop[OCB_BLOCK_LEN];
+ unsigned char stretch[OCB_BLOCK_LEN + 8];
+ unsigned int bottom;
+ unsigned int burn = 0;
+ unsigned int nburn;
+
+ /* Check args. */
+ if (!c->marks.key)
+ return GPG_ERR_INV_STATE; /* Key must have been set first. */
+ switch (c->u_mode.ocb.taglen)
+ {
+ case 8:
+ case 12:
+ case 16:
+ break;
+ default:
+ return GPG_ERR_BUG; /* Invalid tag length. */
+ }
+
+ if (c->spec->blocksize != OCB_BLOCK_LEN)
+ return GPG_ERR_CIPHER_ALGO;
+ if (!nonce)
+ return GPG_ERR_INV_ARG;
+ /* 120 bit is the allowed maximum. In addition we impose a minimum
+ of 64 bit. */
+ if (noncelen > (120/8) || noncelen < (64/8) || noncelen >= OCB_BLOCK_LEN)
+ return GPG_ERR_INV_LENGTH;
+
+ /* Prepare the nonce. */
+ memset (ktop, 0, OCB_BLOCK_LEN);
+ buf_cpy (ktop + (OCB_BLOCK_LEN - noncelen), nonce, noncelen);
+ ktop[0] = ((c->u_mode.ocb.taglen * 8) % 128) << 1;
+ ktop[OCB_BLOCK_LEN - noncelen - 1] |= 1;
+ bottom = ktop[OCB_BLOCK_LEN - 1] & 0x3f;
+ ktop[OCB_BLOCK_LEN - 1] &= 0xc0; /* Zero the bottom bits. */
+ nburn = c->spec->encrypt (&c->context.c, ktop, ktop);
+ burn = nburn > burn ? nburn : burn;
+ /* Stretch = Ktop || (Ktop[1..64] xor Ktop[9..72]) */
+ cipher_block_cpy (stretch, ktop, OCB_BLOCK_LEN);
+ cipher_block_xor (stretch + OCB_BLOCK_LEN, ktop, ktop + 1, 8);
+ /* Offset_0 = Stretch[1+bottom..128+bottom]
+ (We use the IV field to store the offset) */
+ bit_copy (c->u_iv.iv, stretch, bottom);
+ c->marks.iv = 1;
+
+ /* Checksum_0 = zeros(128)
+ (We use the CTR field to store the checksum) */
+ memset (c->u_ctr.ctr, 0, OCB_BLOCK_LEN);
+
+ /* Clear AAD buffer. */
+ memset (c->u_mode.ocb.aad_offset, 0, OCB_BLOCK_LEN);
+ memset (c->u_mode.ocb.aad_sum, 0, OCB_BLOCK_LEN);
+
+ /* Setup other values. */
+ memset (c->lastiv, 0, sizeof(c->lastiv));
+ c->unused = 0;
+ c->marks.tag = 0;
+ c->marks.finalize = 0;
+ c->u_mode.ocb.data_nblocks = 0;
+ c->u_mode.ocb.aad_nblocks = 0;
+ c->u_mode.ocb.aad_nleftover = 0;
+ c->u_mode.ocb.data_finalized = 0;
+ c->u_mode.ocb.aad_finalized = 0;
+
+ /* log_printhex ("L_* ", c->u_mode.ocb.L_star, OCB_BLOCK_LEN); */
+ /* log_printhex ("L_$ ", c->u_mode.ocb.L_dollar, OCB_BLOCK_LEN); */
+ /* log_printhex ("L_0 ", c->u_mode.ocb.L[0], OCB_BLOCK_LEN); */
+ /* log_printhex ("L_1 ", c->u_mode.ocb.L[1], OCB_BLOCK_LEN); */
+ /* log_debug ( "bottom : %u (decimal)\n", bottom); */
+ /* log_printhex ("Ktop ", ktop, OCB_BLOCK_LEN); */
+ /* log_printhex ("Stretch ", stretch, sizeof stretch); */
+ /* log_printhex ("Offset_0 ", c->u_iv.iv, OCB_BLOCK_LEN); */
+
+ /* Cleanup */
+ wipememory (ktop, sizeof ktop);
+ wipememory (stretch, sizeof stretch);
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4*sizeof(void*));
+
+ return 0;
+}
+
+
+/* Process additional authentication data.  This implementation allows
+   adding additional authentication data at any time before the final
+ gcry_cipher_gettag. */
+gcry_err_code_t
+_gcry_cipher_ocb_authenticate (gcry_cipher_hd_t c, const unsigned char *abuf,
+ size_t abuflen)
+{
+ const size_t table_maxblks = 1 << OCB_L_TABLE_SIZE;
+ const u32 table_size_mask = ((1 << OCB_L_TABLE_SIZE) - 1);
+ unsigned char l_tmp[OCB_BLOCK_LEN];
+ unsigned int burn = 0;
+ unsigned int nburn;
+ size_t n;
+
+ /* Check that a nonce and thus a key has been set and that we have
+ not yet computed the tag. We also return an error if the aad has
+ been finalized (i.e. a short block has been processed). */
+ if (!c->marks.iv || c->marks.tag || c->u_mode.ocb.aad_finalized)
+ return GPG_ERR_INV_STATE;
+
+ /* Check correct usage and arguments. */
+ if (c->spec->blocksize != OCB_BLOCK_LEN)
+ return GPG_ERR_CIPHER_ALGO;
+
+ /* Process remaining data from the last call first. */
+ if (c->u_mode.ocb.aad_nleftover)
+ {
+ n = abuflen;
+ if (n > OCB_BLOCK_LEN - c->u_mode.ocb.aad_nleftover)
+ n = OCB_BLOCK_LEN - c->u_mode.ocb.aad_nleftover;
+
+ buf_cpy (&c->u_mode.ocb.aad_leftover[c->u_mode.ocb.aad_nleftover],
+ abuf, n);
+ c->u_mode.ocb.aad_nleftover += n;
+ abuf += n;
+ abuflen -= n;
+
+ if (c->u_mode.ocb.aad_nleftover == OCB_BLOCK_LEN)
+ {
+ c->u_mode.ocb.aad_nblocks++;
+
+ if ((c->u_mode.ocb.aad_nblocks % table_maxblks) == 0)
+ {
+ /* Table overflow, L needs to be generated. */
+ ocb_get_L_big(c, c->u_mode.ocb.aad_nblocks + 1, l_tmp);
+ }
+ else
+ {
+ cipher_block_cpy (l_tmp, ocb_get_l (c, c->u_mode.ocb.aad_nblocks),
+ OCB_BLOCK_LEN);
+ }
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ cipher_block_xor_1 (c->u_mode.ocb.aad_offset, l_tmp, OCB_BLOCK_LEN);
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ cipher_block_xor (l_tmp, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_leftover, OCB_BLOCK_LEN);
+ nburn = c->spec->encrypt (&c->context.c, l_tmp, l_tmp);
+ burn = nburn > burn ? nburn : burn;
+ cipher_block_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, OCB_BLOCK_LEN);
+
+ c->u_mode.ocb.aad_nleftover = 0;
+ }
+ }
+
+ if (!abuflen)
+ {
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4*sizeof(void*));
+
+ return 0;
+ }
+
+ /* Full blocks handling. */
+ while (abuflen >= OCB_BLOCK_LEN)
+ {
+ size_t nblks = abuflen / OCB_BLOCK_LEN;
+ size_t nmaxblks;
+
+ /* Check how many blocks to process till table overflow. */
+ nmaxblks = (c->u_mode.ocb.aad_nblocks + 1) % table_maxblks;
+ nmaxblks = (table_maxblks - nmaxblks) % table_maxblks;
+
+ if (nmaxblks == 0)
+ {
+ /* Table overflow, generate L and process one block. */
+ c->u_mode.ocb.aad_nblocks++;
+ ocb_get_L_big(c, c->u_mode.ocb.aad_nblocks, l_tmp);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ cipher_block_xor_1 (c->u_mode.ocb.aad_offset, l_tmp, OCB_BLOCK_LEN);
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ cipher_block_xor (l_tmp, c->u_mode.ocb.aad_offset, abuf,
+ OCB_BLOCK_LEN);
+ nburn = c->spec->encrypt (&c->context.c, l_tmp, l_tmp);
+ burn = nburn > burn ? nburn : burn;
+ cipher_block_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, OCB_BLOCK_LEN);
+
+ abuf += OCB_BLOCK_LEN;
+ abuflen -= OCB_BLOCK_LEN;
+ nblks--;
+
+ /* With overflow handled, retry loop again. Next overflow will
+ * happen after 65535 blocks. */
+ continue;
+ }
+
+ nblks = nblks < nmaxblks ? nblks : nmaxblks;
+
+ /* Use a bulk method if available. */
+ if (nblks && c->bulk.ocb_auth)
+ {
+ size_t nleft;
+ size_t ndone;
+
+ nleft = c->bulk.ocb_auth (c, abuf, nblks);
+ ndone = nblks - nleft;
+
+ abuf += ndone * OCB_BLOCK_LEN;
+ abuflen -= ndone * OCB_BLOCK_LEN;
+ nblks = nleft;
+ }
+
+ /* Hash all full blocks. */
+ while (nblks)
+ {
+ c->u_mode.ocb.aad_nblocks++;
+
+ gcry_assert(c->u_mode.ocb.aad_nblocks & table_size_mask);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ cipher_block_xor_1 (c->u_mode.ocb.aad_offset,
+ ocb_get_l (c, c->u_mode.ocb.aad_nblocks),
+ OCB_BLOCK_LEN);
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ cipher_block_xor (l_tmp, c->u_mode.ocb.aad_offset, abuf,
+ OCB_BLOCK_LEN);
+ nburn = c->spec->encrypt (&c->context.c, l_tmp, l_tmp);
+ burn = nburn > burn ? nburn : burn;
+ cipher_block_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, OCB_BLOCK_LEN);
+
+ abuf += OCB_BLOCK_LEN;
+ abuflen -= OCB_BLOCK_LEN;
+ nblks--;
+ }
+ }
+
+ /* Store away the remaining data. */
+ if (abuflen)
+ {
+ n = abuflen;
+ if (n > OCB_BLOCK_LEN - c->u_mode.ocb.aad_nleftover)
+ n = OCB_BLOCK_LEN - c->u_mode.ocb.aad_nleftover;
+
+ buf_cpy (&c->u_mode.ocb.aad_leftover[c->u_mode.ocb.aad_nleftover],
+ abuf, n);
+ c->u_mode.ocb.aad_nleftover += n;
+ abuf += n;
+ abuflen -= n;
+ }
+
+ gcry_assert (!abuflen);
+
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4*sizeof(void*));
+
+ return 0;
+}
+
+
+/* Hash final partial AAD block. */
+static void
+ocb_aad_finalize (gcry_cipher_hd_t c)
+{
+ unsigned char l_tmp[OCB_BLOCK_LEN];
+ unsigned int burn = 0;
+ unsigned int nburn;
+
+ /* Check that a nonce and thus a key has been set and that we have
+ not yet computed the tag. We also skip this if the aad has been
+ finalized. */
+ if (!c->marks.iv || c->marks.tag || c->u_mode.ocb.aad_finalized)
+ return;
+ if (c->spec->blocksize != OCB_BLOCK_LEN)
+ return; /* Ooops. */
+
+ /* Hash final partial block if any. */
+ if (c->u_mode.ocb.aad_nleftover)
+ {
+ /* Offset_* = Offset_m xor L_* */
+ cipher_block_xor_1 (c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.L_star, OCB_BLOCK_LEN);
+ /* CipherInput = (A_* || 1 || zeros(127-bitlen(A_*))) xor Offset_* */
+ buf_cpy (l_tmp, c->u_mode.ocb.aad_leftover, c->u_mode.ocb.aad_nleftover);
+ memset (l_tmp + c->u_mode.ocb.aad_nleftover, 0,
+ OCB_BLOCK_LEN - c->u_mode.ocb.aad_nleftover);
+ l_tmp[c->u_mode.ocb.aad_nleftover] = 0x80;
+ cipher_block_xor_1 (l_tmp, c->u_mode.ocb.aad_offset, OCB_BLOCK_LEN);
+ /* Sum = Sum_m xor ENCIPHER(K, CipherInput) */
+ nburn = c->spec->encrypt (&c->context.c, l_tmp, l_tmp);
+ burn = nburn > burn ? nburn : burn;
+ cipher_block_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, OCB_BLOCK_LEN);
+
+ c->u_mode.ocb.aad_nleftover = 0;
+ }
+
+ /* Mark AAD as finalized so that gcry_cipher_ocb_authenticate can
+   * return an error when called again.  */
+ c->u_mode.ocb.aad_finalized = 1;
+
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4*sizeof(void*));
+}
+
+
+
+/* Checksumming for encrypt and decrypt. */
+static void
+ocb_checksum (unsigned char *chksum, const unsigned char *plainbuf,
+ size_t nblks)
+{
+ while (nblks > 0)
+ {
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ cipher_block_xor_1(chksum, plainbuf, OCB_BLOCK_LEN);
+
+ plainbuf += OCB_BLOCK_LEN;
+ nblks--;
+ }
+}
+
+
+/* Common code for encrypt and decrypt. */
+static gcry_err_code_t
+ocb_crypt (gcry_cipher_hd_t c, int encrypt,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen)
+{
+ const size_t table_maxblks = 1 << OCB_L_TABLE_SIZE;
+ const u32 table_size_mask = ((1 << OCB_L_TABLE_SIZE) - 1);
+ unsigned char l_tmp[OCB_BLOCK_LEN];
+ unsigned int burn = 0;
+ unsigned int nburn;
+ gcry_cipher_encrypt_t crypt_fn =
+ encrypt ? c->spec->encrypt : c->spec->decrypt;
+
+ /* Check that a nonce and thus a key has been set and that we are
+ not yet in end of data state. */
+ if (!c->marks.iv || c->u_mode.ocb.data_finalized)
+ return GPG_ERR_INV_STATE;
+
+ /* Check correct usage and arguments. */
+ if (c->spec->blocksize != OCB_BLOCK_LEN)
+ return GPG_ERR_CIPHER_ALGO;
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+ if (c->marks.finalize)
+    ; /* Allow arbitrary length. */
+ else if ((inbuflen % OCB_BLOCK_LEN))
+ return GPG_ERR_INV_LENGTH; /* We support only full blocks for now. */
+
+ /* Full blocks handling. */
+ while (inbuflen >= OCB_BLOCK_LEN)
+ {
+ size_t nblks = inbuflen / OCB_BLOCK_LEN;
+ size_t nmaxblks;
+
+ /* Check how many blocks to process till table overflow. */
+ nmaxblks = (c->u_mode.ocb.data_nblocks + 1) % table_maxblks;
+ nmaxblks = (table_maxblks - nmaxblks) % table_maxblks;
+
+ if (nmaxblks == 0)
+ {
+ /* Table overflow, generate L and process one block. */
+ c->u_mode.ocb.data_nblocks++;
+ ocb_get_L_big(c, c->u_mode.ocb.data_nblocks, l_tmp);
+
+ if (encrypt)
+ {
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ ocb_checksum (c->u_ctr.ctr, inbuf, 1);
+ }
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ cipher_block_xor_1 (c->u_iv.iv, l_tmp, OCB_BLOCK_LEN);
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ cipher_block_xor (outbuf, c->u_iv.iv, inbuf, OCB_BLOCK_LEN);
+ nburn = crypt_fn (&c->context.c, outbuf, outbuf);
+ burn = nburn > burn ? nburn : burn;
+ cipher_block_xor_1 (outbuf, c->u_iv.iv, OCB_BLOCK_LEN);
+
+ if (!encrypt)
+ {
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ ocb_checksum (c->u_ctr.ctr, outbuf, 1);
+ }
+
+ inbuf += OCB_BLOCK_LEN;
+ inbuflen -= OCB_BLOCK_LEN;
+ outbuf += OCB_BLOCK_LEN;
+          outbuflen -= OCB_BLOCK_LEN;
+ nblks--;
+
+ /* With overflow handled, retry loop again. Next overflow will
+ * happen after 65535 blocks. */
+ continue;
+ }
+
+ nblks = nblks < nmaxblks ? nblks : nmaxblks;
+
+ /* Since checksum xoring is done before/after encryption/decryption,
+ process input in 24KiB chunks to keep data loaded in L1 cache for
+ checksumming. */
+ if (nblks > 24 * 1024 / OCB_BLOCK_LEN)
+ nblks = 24 * 1024 / OCB_BLOCK_LEN;
+
+ /* Use a bulk method if available. */
+ if (nblks && c->bulk.ocb_crypt)
+ {
+ size_t nleft;
+ size_t ndone;
+
+ nleft = c->bulk.ocb_crypt (c, outbuf, inbuf, nblks, encrypt);
+ ndone = nblks - nleft;
+
+ inbuf += ndone * OCB_BLOCK_LEN;
+ outbuf += ndone * OCB_BLOCK_LEN;
+ inbuflen -= ndone * OCB_BLOCK_LEN;
+ outbuflen -= ndone * OCB_BLOCK_LEN;
+ nblks = nleft;
+ }
+
+ if (nblks)
+ {
+ size_t nblks_chksum = nblks;
+
+ if (encrypt)
+ {
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ ocb_checksum (c->u_ctr.ctr, inbuf, nblks_chksum);
+ }
+
+ /* Encrypt all full blocks. */
+ while (nblks)
+ {
+ c->u_mode.ocb.data_nblocks++;
+
+ gcry_assert(c->u_mode.ocb.data_nblocks & table_size_mask);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ cipher_block_xor_1 (c->u_iv.iv,
+ ocb_get_l (c, c->u_mode.ocb.data_nblocks),
+ OCB_BLOCK_LEN);
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ cipher_block_xor (outbuf, c->u_iv.iv, inbuf, OCB_BLOCK_LEN);
+ nburn = crypt_fn (&c->context.c, outbuf, outbuf);
+ burn = nburn > burn ? nburn : burn;
+ cipher_block_xor_1 (outbuf, c->u_iv.iv, OCB_BLOCK_LEN);
+
+ inbuf += OCB_BLOCK_LEN;
+ inbuflen -= OCB_BLOCK_LEN;
+ outbuf += OCB_BLOCK_LEN;
+              outbuflen -= OCB_BLOCK_LEN;
+ nblks--;
+ }
+
+ if (!encrypt)
+ {
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ ocb_checksum (c->u_ctr.ctr,
+ outbuf - nblks_chksum * OCB_BLOCK_LEN,
+ nblks_chksum);
+ }
+ }
+ }
+
+ /* Encrypt final partial block. Note that we expect INBUFLEN to be
+ shorter than OCB_BLOCK_LEN (see above). */
+ if (inbuflen)
+ {
+ unsigned char pad[OCB_BLOCK_LEN];
+
+ /* Offset_* = Offset_m xor L_* */
+ cipher_block_xor_1 (c->u_iv.iv, c->u_mode.ocb.L_star, OCB_BLOCK_LEN);
+ /* Pad = ENCIPHER(K, Offset_*) */
+ nburn = c->spec->encrypt (&c->context.c, pad, c->u_iv.iv);
+ burn = nburn > burn ? nburn : burn;
+
+ if (encrypt)
+ {
+ /* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
+ /* Note that INBUFLEN is less than OCB_BLOCK_LEN. */
+ buf_cpy (l_tmp, inbuf, inbuflen);
+ memset (l_tmp + inbuflen, 0, OCB_BLOCK_LEN - inbuflen);
+ l_tmp[inbuflen] = 0x80;
+ cipher_block_xor_1 (c->u_ctr.ctr, l_tmp, OCB_BLOCK_LEN);
+ /* C_* = P_* xor Pad[1..bitlen(P_*)] */
+ buf_xor (outbuf, inbuf, pad, inbuflen);
+ }
+ else
+ {
+ /* P_* = C_* xor Pad[1..bitlen(C_*)] */
+ /* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
+ cipher_block_cpy (l_tmp, pad, OCB_BLOCK_LEN);
+ buf_cpy (l_tmp, inbuf, inbuflen);
+ cipher_block_xor_1 (l_tmp, pad, OCB_BLOCK_LEN);
+ l_tmp[inbuflen] = 0x80;
+ buf_cpy (outbuf, l_tmp, inbuflen);
+
+ cipher_block_xor_1 (c->u_ctr.ctr, l_tmp, OCB_BLOCK_LEN);
+ }
+ }
+
+ /* Compute the tag if the finalize flag has been set. */
+ if (c->marks.finalize)
+ {
+ /* Tag = ENCIPHER(K, Checksum xor Offset xor L_$) xor HASH(K,A) */
+ cipher_block_xor (c->u_mode.ocb.tag, c->u_ctr.ctr, c->u_iv.iv,
+ OCB_BLOCK_LEN);
+ cipher_block_xor_1 (c->u_mode.ocb.tag, c->u_mode.ocb.L_dollar,
+ OCB_BLOCK_LEN);
+ nburn = c->spec->encrypt (&c->context.c,
+ c->u_mode.ocb.tag, c->u_mode.ocb.tag);
+ burn = nburn > burn ? nburn : burn;
+
+ c->u_mode.ocb.data_finalized = 1;
+      /* Note that the final part of the tag computation is done
+ by _gcry_cipher_ocb_get_tag. */
+ }
+
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4*sizeof(void*));
+
+ return 0;
+}
+
+
+/* Encrypt (INBUF,INBUFLEN) in OCB mode to OUTBUF. OUTBUFLEN gives
+ the allocated size of OUTBUF. This function accepts only multiples
+ of a full block unless gcry_cipher_final has been called in which
+ case the next block may have any length. */
+gcry_err_code_t
+_gcry_cipher_ocb_encrypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen)
+
+{
+ return ocb_crypt (c, 1, outbuf, outbuflen, inbuf, inbuflen);
+}
+
+
+/* Decrypt (INBUF,INBUFLEN) in OCB mode to OUTBUF. OUTBUFLEN gives
+ the allocated size of OUTBUF. This function accepts only multiples
+ of a full block unless gcry_cipher_final has been called in which
+ case the next block may have any length. */
+gcry_err_code_t
+_gcry_cipher_ocb_decrypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen)
+{
+ return ocb_crypt (c, 0, outbuf, outbuflen, inbuf, inbuflen);
+}
+
+
+/* Compute the tag. The last data operation has already done some
+ part of it. To allow adding AAD even after having done all data,
+ we finish the tag computation only here. */
+static void
+compute_tag_if_needed (gcry_cipher_hd_t c)
+{
+ if (!c->marks.tag)
+ {
+ ocb_aad_finalize (c);
+ cipher_block_xor_1 (c->u_mode.ocb.tag, c->u_mode.ocb.aad_sum,
+ OCB_BLOCK_LEN);
+ c->marks.tag = 1;
+ }
+}
+
+
+/* Copy the already computed tag to OUTTAG. OUTTAGSIZE is the
+ allocated size of OUTTAG; the function returns an error if that is
+ too short to hold the tag. */
+gcry_err_code_t
+_gcry_cipher_ocb_get_tag (gcry_cipher_hd_t c,
+ unsigned char *outtag, size_t outtagsize)
+{
+ if (c->u_mode.ocb.taglen > outtagsize)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+ if (!c->u_mode.ocb.data_finalized)
+ return GPG_ERR_INV_STATE; /* Data has not yet been finalized. */
+
+ compute_tag_if_needed (c);
+
+ memcpy (outtag, c->u_mode.ocb.tag, c->u_mode.ocb.taglen);
+
+ return 0;
+}
+
+
+/* Check that the tag (INTAG,TAGLEN) matches the computed tag for the
+ handle C. */
+gcry_err_code_t
+_gcry_cipher_ocb_check_tag (gcry_cipher_hd_t c, const unsigned char *intag,
+ size_t taglen)
+{
+ size_t n;
+
+ if (!c->u_mode.ocb.data_finalized)
+ return GPG_ERR_INV_STATE; /* Data has not yet been finalized. */
+
+ compute_tag_if_needed (c);
+
+ n = c->u_mode.ocb.taglen;
+ if (taglen < n)
+ n = taglen;
+
+ if (!buf_eq_const (intag, c->u_mode.ocb.tag, n)
+ || c->u_mode.ocb.taglen != taglen)
+ return GPG_ERR_CHECKSUM;
+
+ return 0;
+}
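+
+/* As with the other AEAD modes, these functions are reached through the
+ * public API: gcry_cipher_open with GCRY_CIPHER_MODE_OCB, the nonce set
+ * via gcry_cipher_setiv, and gcry_cipher_final before the last chunk so
+ * that a trailing partial block is accepted (see ocb_crypt above).  A
+ * minimal caller-side sketch with placeholder key and nonce, error
+ * handling omitted, kept out of the build.  */
+#if 0 /* usage sketch */
+static void
+ocb_usage_sketch (void)
+{
+  gcry_cipher_hd_t hd;
+  unsigned char key[16] = { 0 };    /* placeholder key */
+  unsigned char nonce[12] = { 0 };  /* placeholder 96-bit nonce */
+  unsigned char buf[20] = { 0 };    /* deliberately not a block multiple */
+  unsigned char tag[16];
+
+  gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_OCB, 0);
+  gcry_cipher_setkey (hd, key, sizeof key);
+  gcry_cipher_setiv (hd, nonce, sizeof nonce);
+  gcry_cipher_final (hd);           /* this is the final (partial) chunk */
+  gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0);
+  gcry_cipher_gettag (hd, tag, sizeof tag);
+  gcry_cipher_close (hd);
+}
+#endif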
diff --git a/comm/third_party/libgcrypt/cipher/cipher-ofb.c b/comm/third_party/libgcrypt/cipher/cipher-ofb.c
new file mode 100644
index 0000000000..09db397e65
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-ofb.c
@@ -0,0 +1,108 @@
+/* cipher-ofb.c - Generic OFB mode implementation
+ * Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003
+ * 2005, 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser general Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "./cipher-internal.h"
+
+
+gcry_err_code_t
+_gcry_cipher_ofb_encrypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen)
+{
+ unsigned char *ivp;
+ gcry_cipher_encrypt_t enc_fn = c->spec->encrypt;
+ size_t blocksize_shift = _gcry_blocksize_shift(c);
+ size_t blocksize = 1 << blocksize_shift;
+ unsigned int burn, nburn;
+
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+
+ if ( inbuflen <= c->unused )
+ {
+ /* Short enough to be encoded by the remaining XOR mask. */
+ /* XOR the input with the IV */
+ ivp = c->u_iv.iv + blocksize - c->unused;
+ buf_xor(outbuf, ivp, inbuf, inbuflen);
+ c->unused -= inbuflen;
+ return 0;
+ }
+
+ burn = 0;
+
+ if( c->unused )
+ {
+ inbuflen -= c->unused;
+ ivp = c->u_iv.iv + blocksize - c->unused;
+ buf_xor(outbuf, ivp, inbuf, c->unused);
+ outbuf += c->unused;
+ inbuf += c->unused;
+ c->unused = 0;
+ }
+
+ /* Now we can process complete blocks. */
+ if (c->bulk.ofb_enc)
+ {
+ size_t nblocks = inbuflen >> blocksize_shift;
+ c->bulk.ofb_enc (&c->context.c, c->u_iv.iv, outbuf, inbuf, nblocks);
+ outbuf += nblocks << blocksize_shift;
+ inbuf += nblocks << blocksize_shift;
+ inbuflen -= nblocks << blocksize_shift;
+ }
+ else
+ {
+ while ( inbuflen >= blocksize )
+ {
+ /* Encrypt the IV (and save the current one). */
+ nburn = enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
+ burn = nburn > burn ? nburn : burn;
+ cipher_block_xor(outbuf, c->u_iv.iv, inbuf, blocksize);
+ outbuf += blocksize;
+ inbuf += blocksize;
+ inbuflen -= blocksize;
+ }
+ }
+
+ if ( inbuflen )
+ { /* process the remaining bytes */
+ nburn = enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
+ burn = nburn > burn ? nburn : burn;
+ c->unused = blocksize;
+ c->unused -= inbuflen;
+ buf_xor(outbuf, c->u_iv.iv, inbuf, inbuflen);
+ outbuf += inbuflen;
+ inbuf += inbuflen;
+ inbuflen = 0;
+ }
+
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4 * sizeof(void *));
+
+ return 0;
+}
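
Because OFB only XORs the data with the keystream E(IV), E(E(IV)), ..., decryption is the same operation as encryption. A short round-trip sketch over the public API (illustrative; ofb_roundtrip_example is a hypothetical name, error checking omitted); the 5-byte message also exercises the partial-block path kept in c->unused above:

    #include <string.h>
    #include <gcrypt.h>

    static int
    ofb_roundtrip_example (void)
    {
      static const unsigned char key[16] = "0123456789abcdef";
      static const unsigned char iv[16]  = "fedcba9876543210";
      unsigned char msg[5] = { 'h', 'e', 'l', 'l', 'o' };
      unsigned char saved[5];
      gcry_cipher_hd_t hd;

      memcpy (saved, msg, sizeof msg);
      gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_OFB, 0);
      gcry_cipher_setkey (hd, key, sizeof key);
      gcry_cipher_setiv (hd, iv, sizeof iv);
      gcry_cipher_encrypt (hd, msg, sizeof msg, NULL, 0);   /* in place */

      /* Restart the keystream and apply the same XOR to decrypt.  */
      gcry_cipher_setiv (hd, iv, sizeof iv);
      gcry_cipher_encrypt (hd, msg, sizeof msg, NULL, 0);
      gcry_cipher_close (hd);

      return memcmp (msg, saved, sizeof msg) == 0;          /* 1 on success */
    }
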
diff --git a/comm/third_party/libgcrypt/cipher/cipher-poly1305.c b/comm/third_party/libgcrypt/cipher/cipher-poly1305.c
new file mode 100644
index 0000000000..bb475236b8
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-poly1305.c
@@ -0,0 +1,375 @@
+/* cipher-poly1305.c - Poly1305 based AEAD cipher mode, RFC-8439
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "./cipher-internal.h"
+#include "./poly1305-internal.h"
+
+
+static inline int
+poly1305_bytecounter_add (u32 ctr[2], size_t add)
+{
+ int overflow = 0;
+
+ if (sizeof(add) > sizeof(u32))
+ {
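+ /* On a 64-bit size_t this folds ADD >> 32 into the high counter word;
+ the shift is split in two so the expression stays valid when size_t
+ is only 32 bits wide. */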
+ u32 high_add = ((add >> 31) >> 1) & 0xffffffff;
+ ctr[1] += high_add;
+ if (ctr[1] < high_add)
+ overflow = 1;
+ }
+
+ ctr[0] += add;
+ if (ctr[0] >= add)
+ return overflow;
+
+ ctr[1] += 1;
+ return (ctr[1] < 1) || overflow;
+}
+
+
+static void
+poly1305_fill_bytecounts (gcry_cipher_hd_t c)
+{
+ u32 lenbuf[4];
+
+ lenbuf[0] = le_bswap32(c->u_mode.poly1305.aadcount[0]);
+ lenbuf[1] = le_bswap32(c->u_mode.poly1305.aadcount[1]);
+ lenbuf[2] = le_bswap32(c->u_mode.poly1305.datacount[0]);
+ lenbuf[3] = le_bswap32(c->u_mode.poly1305.datacount[1]);
+ _gcry_poly1305_update (&c->u_mode.poly1305.ctx, (byte*)lenbuf,
+ sizeof(lenbuf));
+
+ wipememory(lenbuf, sizeof(lenbuf));
+}
+
+
+static void
+poly1305_do_padding (gcry_cipher_hd_t c, u32 ctr[2])
+{
+ static const byte zero_padding_buf[15] = {};
+ u32 padding_count;
+
+ /* Padding to 16 byte boundary. */
+ if (ctr[0] % 16 > 0)
+ {
+ padding_count = 16 - ctr[0] % 16;
+
+ _gcry_poly1305_update (&c->u_mode.poly1305.ctx, zero_padding_buf,
+ padding_count);
+ }
+}
+
+
+static void
+poly1305_aad_finish (gcry_cipher_hd_t c)
+{
+ /* After AAD, feed padding bytes so we get 16 byte alignment. */
+ poly1305_do_padding (c, c->u_mode.poly1305.aadcount);
+
+ /* Start of encryption marks end of AAD stream. */
+ c->u_mode.poly1305.aad_finalized = 1;
+
+ c->u_mode.poly1305.datacount[0] = 0;
+ c->u_mode.poly1305.datacount[1] = 0;
+}
+
+
+static gcry_err_code_t
+poly1305_set_zeroiv (gcry_cipher_hd_t c)
+{
+ byte zero[8] = { 0, };
+
+ return _gcry_cipher_poly1305_setiv (c, zero, sizeof(zero));
+}
+
+
+gcry_err_code_t
+_gcry_cipher_poly1305_authenticate (gcry_cipher_hd_t c,
+ const byte * aadbuf, size_t aadbuflen)
+{
+ if (c->u_mode.poly1305.bytecount_over_limits)
+ return GPG_ERR_INV_LENGTH;
+ if (c->u_mode.poly1305.aad_finalized)
+ return GPG_ERR_INV_STATE;
+ if (c->marks.tag)
+ return GPG_ERR_INV_STATE;
+
+ if (!c->marks.iv)
+ poly1305_set_zeroiv(c);
+
+ if (poly1305_bytecounter_add(c->u_mode.poly1305.aadcount, aadbuflen))
+ {
+ c->u_mode.poly1305.bytecount_over_limits = 1;
+ return GPG_ERR_INV_LENGTH;
+ }
+
+ _gcry_poly1305_update (&c->u_mode.poly1305.ctx, aadbuf, aadbuflen);
+
+ return 0;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_poly1305_encrypt (gcry_cipher_hd_t c,
+ byte *outbuf, size_t outbuflen,
+ const byte *inbuf, size_t inbuflen)
+{
+ gcry_err_code_t err;
+
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+ if (c->marks.tag)
+ return GPG_ERR_INV_STATE;
+ if (c->u_mode.poly1305.bytecount_over_limits)
+ return GPG_ERR_INV_LENGTH;
+
+ if (!c->marks.iv)
+ {
+ err = poly1305_set_zeroiv(c);
+ if (err)
+ return err;
+ }
+
+ if (!c->u_mode.poly1305.aad_finalized)
+ poly1305_aad_finish(c);
+
+ if (poly1305_bytecounter_add(c->u_mode.poly1305.datacount, inbuflen))
+ {
+ c->u_mode.poly1305.bytecount_over_limits = 1;
+ return GPG_ERR_INV_LENGTH;
+ }
+
+ if (LIKELY(inbuflen > 0) && LIKELY(c->spec->algo == GCRY_CIPHER_CHACHA20))
+ {
+ return _gcry_chacha20_poly1305_encrypt (c, outbuf, inbuf, inbuflen);
+ }
+
+ while (inbuflen)
+ {
+ size_t currlen = inbuflen;
+
+ /* Since checksumming is done after encryption, process input in 24KiB
+ * chunks to keep data loaded in L1 cache for checksumming. */
+ if (currlen > 24 * 1024)
+ currlen = 24 * 1024;
+
+ c->spec->stencrypt(&c->context.c, outbuf, (byte*)inbuf, currlen);
+
+ _gcry_poly1305_update (&c->u_mode.poly1305.ctx, outbuf, currlen);
+
+ outbuf += currlen;
+ inbuf += currlen;
+ outbuflen -= currlen;
+ inbuflen -= currlen;
+ }
+
+ return 0;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_poly1305_decrypt (gcry_cipher_hd_t c,
+ byte *outbuf, size_t outbuflen,
+ const byte *inbuf, size_t inbuflen)
+{
+ gcry_err_code_t err;
+
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+ if (c->marks.tag)
+ return GPG_ERR_INV_STATE;
+ if (c->u_mode.poly1305.bytecount_over_limits)
+ return GPG_ERR_INV_LENGTH;
+
+ if (!c->marks.iv)
+ {
+ err = poly1305_set_zeroiv(c);
+ if (err)
+ return err;
+ }
+
+ if (!c->u_mode.poly1305.aad_finalized)
+ poly1305_aad_finish(c);
+
+ if (poly1305_bytecounter_add(c->u_mode.poly1305.datacount, inbuflen))
+ {
+ c->u_mode.poly1305.bytecount_over_limits = 1;
+ return GPG_ERR_INV_LENGTH;
+ }
+
+ if (LIKELY(inbuflen > 0) && LIKELY(c->spec->algo == GCRY_CIPHER_CHACHA20))
+ {
+ return _gcry_chacha20_poly1305_decrypt (c, outbuf, inbuf, inbuflen);
+ }
+
+ while (inbuflen)
+ {
+ size_t currlen = inbuflen;
+
+ /* Since checksumming is done before decryption, process input in 24KiB
+ * chunks to keep data loaded in L1 cache for decryption. */
+ if (currlen > 24 * 1024)
+ currlen = 24 * 1024;
+
+ _gcry_poly1305_update (&c->u_mode.poly1305.ctx, inbuf, currlen);
+
+ c->spec->stdecrypt(&c->context.c, outbuf, (byte*)inbuf, currlen);
+
+ outbuf += currlen;
+ inbuf += currlen;
+ outbuflen -= currlen;
+ inbuflen -= currlen;
+ }
+
+ return 0;
+}
+
+
+static gcry_err_code_t
+_gcry_cipher_poly1305_tag (gcry_cipher_hd_t c,
+ byte * outbuf, size_t outbuflen, int check)
+{
+ gcry_err_code_t err;
+
+ if (outbuflen < POLY1305_TAGLEN)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+ if (c->u_mode.poly1305.bytecount_over_limits)
+ return GPG_ERR_INV_LENGTH;
+
+ if (!c->marks.iv)
+ {
+ err = poly1305_set_zeroiv(c);
+ if (err)
+ return err;
+ }
+
+ if (!c->u_mode.poly1305.aad_finalized)
+ poly1305_aad_finish(c);
+
+ if (!c->marks.tag)
+ {
+ /* After data, feed padding bytes so we get 16 byte alignment. */
+ poly1305_do_padding (c, c->u_mode.poly1305.datacount);
+
+ /* Write byte counts to poly1305. */
+ poly1305_fill_bytecounts(c);
+
+ _gcry_poly1305_finish(&c->u_mode.poly1305.ctx, c->u_iv.iv);
+
+ c->marks.tag = 1;
+ }
+
+ if (!check)
+ {
+ memcpy (outbuf, c->u_iv.iv, POLY1305_TAGLEN);
+ }
+ else
+ {
+ /* OUTBUFLEN gives the length of the user supplied tag in OUTBUF
+ * and thus we need to compare its length first. */
+ if (outbuflen != POLY1305_TAGLEN
+ || !buf_eq_const (outbuf, c->u_iv.iv, POLY1305_TAGLEN))
+ return GPG_ERR_CHECKSUM;
+ }
+
+ return 0;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_poly1305_get_tag (gcry_cipher_hd_t c, unsigned char *outtag,
+ size_t taglen)
+{
+ return _gcry_cipher_poly1305_tag (c, outtag, taglen, 0);
+}
+
+gcry_err_code_t
+_gcry_cipher_poly1305_check_tag (gcry_cipher_hd_t c, const unsigned char *intag,
+ size_t taglen)
+{
+ return _gcry_cipher_poly1305_tag (c, (unsigned char *) intag, taglen, 1);
+}
+
+
+void
+_gcry_cipher_poly1305_setkey (gcry_cipher_hd_t c)
+{
+ c->u_mode.poly1305.aadcount[0] = 0;
+ c->u_mode.poly1305.aadcount[1] = 0;
+
+ c->u_mode.poly1305.datacount[0] = 0;
+ c->u_mode.poly1305.datacount[1] = 0;
+
+ c->u_mode.poly1305.bytecount_over_limits = 0;
+ c->u_mode.poly1305.aad_finalized = 0;
+ c->marks.tag = 0;
+ c->marks.iv = 0;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_poly1305_setiv (gcry_cipher_hd_t c, const byte *iv, size_t ivlen)
+{
+ byte tmpbuf[64]; /* size of ChaCha20 block */
+ gcry_err_code_t err;
+
+ /* IV must be 96 bits. */
+ if (!iv && ivlen != (96 / 8))
+ return GPG_ERR_INV_ARG;
+
+ memset(&c->u_mode.poly1305.ctx, 0, sizeof(c->u_mode.poly1305.ctx));
+
+ c->u_mode.poly1305.aadcount[0] = 0;
+ c->u_mode.poly1305.aadcount[1] = 0;
+
+ c->u_mode.poly1305.datacount[0] = 0;
+ c->u_mode.poly1305.datacount[1] = 0;
+
+ c->u_mode.poly1305.bytecount_over_limits = 0;
+ c->u_mode.poly1305.aad_finalized = 0;
+ c->marks.tag = 0;
+ c->marks.iv = 0;
+
+ /* Set up IV for stream cipher. */
+ c->spec->setiv (&c->context.c, iv, ivlen);
+
+ /* Get the first block from ChaCha20. */
+ memset(tmpbuf, 0, sizeof(tmpbuf));
+ c->spec->stencrypt(&c->context.c, tmpbuf, tmpbuf, sizeof(tmpbuf));
+
+ /* Use the first 32 bytes as the Poly1305 key. */
+ err = _gcry_poly1305_init (&c->u_mode.poly1305.ctx, tmpbuf, POLY1305_KEYLEN);
+
+ wipememory(tmpbuf, sizeof(tmpbuf));
+
+ if (err)
+ return err;
+
+ c->marks.iv = 1;
+ return 0;
+}
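
Only ChaCha20 is accepted for this mode (enforced in _gcry_cipher_open_internal further below), so this file implements the RFC 8439 ChaCha20-Poly1305 AEAD: _gcry_cipher_poly1305_setiv above consumes one ChaCha20 keystream block and takes its first 32 bytes as the one-time Poly1305 key. A decrypt-and-verify sketch over the public API (illustrative; chacha20_poly1305_open_example is a hypothetical name, error paths abbreviated):

    #include <gcrypt.h>

    static gcry_error_t
    chacha20_poly1305_open_example (const unsigned char key[32],
                                    const unsigned char nonce[12],
                                    const unsigned char *aad, size_t aadlen,
                                    unsigned char *buf, size_t buflen, /* in place */
                                    const unsigned char tag[16])
    {
      gcry_cipher_hd_t hd;
      gcry_error_t err;

      err = gcry_cipher_open (&hd, GCRY_CIPHER_CHACHA20,
                              GCRY_CIPHER_MODE_POLY1305, 0);
      if (err)
        return err;

      if (!(err = gcry_cipher_setkey (hd, key, 32))
          && !(err = gcry_cipher_setiv (hd, nonce, 12)) /* derives the Poly1305 key */
          && !(err = gcry_cipher_authenticate (hd, aad, aadlen))
          && !(err = gcry_cipher_decrypt (hd, buf, buflen, NULL, 0)))
        err = gcry_cipher_checktag (hd, tag, 16);  /* GPG_ERR_CHECKSUM on mismatch */

      gcry_cipher_close (hd);
      return err;
    }

Note that the AAD has to be supplied before the data here; once data processing has started, _gcry_cipher_poly1305_authenticate above rejects further AAD with GPG_ERR_INV_STATE.
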
diff --git a/comm/third_party/libgcrypt/cipher/cipher-selftest.c b/comm/third_party/libgcrypt/cipher/cipher-selftest.c
new file mode 100644
index 0000000000..d7f38a4261
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-selftest.c
@@ -0,0 +1,512 @@
+/* cipher-selftest.c - Helper functions for bulk encryption selftests.
+ * Copyright (C) 2013,2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#ifdef HAVE_SYSLOG
+# include <syslog.h>
+#endif /*HAVE_SYSLOG*/
+
+#include "types.h"
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-selftest.h"
+#include "cipher-internal.h"
+
+#ifdef HAVE_STDINT_H
+# include <stdint.h> /* uintptr_t */
+#elif defined(HAVE_INTTYPES_H)
+# include <inttypes.h>
+#else
+/* In this case, uintptr_t is provided by config.h. */
+#endif
+
+/* Helper macro to force alignment to 16 bytes. */
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define ATTR_ALIGNED_16 __attribute__ ((aligned (16)))
+#else
+# define ATTR_ALIGNED_16
+#endif
+
+
+/* Return an allocated buffer of size CONTEXT_SIZE with an alignment
+ of 16. The caller must free that buffer using the address returned
+ at R_MEM. Returns NULL and sets ERRNO on failure. */
+void *
+_gcry_cipher_selftest_alloc_ctx (const int context_size, unsigned char **r_mem)
+{
+ int offs;
+ unsigned int ctx_aligned_size, memsize;
+
+ ctx_aligned_size = context_size + 15;
+ ctx_aligned_size -= ctx_aligned_size & 0xf;
+
+ memsize = ctx_aligned_size + 16;
+
+ *r_mem = xtrycalloc (1, memsize);
+ if (!*r_mem)
+ return NULL;
+
+ offs = (16 - ((uintptr_t)*r_mem & 15)) & 15;
+ return (void*)(*r_mem + offs);
+}
+
+
+/* Run the self-tests for <block cipher>-CBC-<block size>, testing bulk CBC
+ decryption. Returns NULL on success. */
+const char *
+_gcry_selftest_helper_cbc (const char *cipher, gcry_cipher_setkey_t setkey_func,
+ gcry_cipher_encrypt_t encrypt_one,
+ const int nblocks, const int blocksize,
+ const int context_size)
+{
+ cipher_bulk_ops_t bulk_ops = { 0, };
+ int i, offs;
+ unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem;
+ unsigned int ctx_aligned_size, memsize;
+
+ static const unsigned char key[16] ATTR_ALIGNED_16 = {
+ 0x66,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+ 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x22
+ };
+
+ /* Allocate buffers, aligning the first two elements to 16 bytes and the
+ rest to the block size. */
+ ctx_aligned_size = context_size + 15;
+ ctx_aligned_size -= ctx_aligned_size & 0xf;
+
+ memsize = ctx_aligned_size + (blocksize * 2) + (blocksize * nblocks * 3) + 16;
+
+ mem = xtrycalloc (1, memsize);
+ if (!mem)
+ return "failed to allocate memory";
+
+ offs = (16 - ((uintptr_t)mem & 15)) & 15;
+ ctx = (void*)(mem + offs);
+ iv = ctx + ctx_aligned_size;
+ iv2 = iv + blocksize;
+ plaintext = iv2 + blocksize;
+ plaintext2 = plaintext + nblocks * blocksize;
+ ciphertext = plaintext2 + nblocks * blocksize;
+
+ /* Initialize ctx */
+ if (setkey_func (ctx, key, sizeof(key), &bulk_ops) != GPG_ERR_NO_ERROR)
+ {
+ xfree(mem);
+ return "setkey failed";
+ }
+
+ /* Test single block code path */
+ memset (iv, 0x4e, blocksize);
+ memset (iv2, 0x4e, blocksize);
+ for (i = 0; i < blocksize; i++)
+ plaintext[i] = i;
+
+ /* CBC manually. */
+ buf_xor (ciphertext, iv, plaintext, blocksize);
+ encrypt_one (ctx, ciphertext, ciphertext);
+ memcpy (iv, ciphertext, blocksize);
+
+ /* CBC decrypt. */
+ bulk_ops.cbc_dec (ctx, iv2, plaintext2, ciphertext, 1);
+ if (memcmp (plaintext2, plaintext, blocksize))
+ {
+ xfree (mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-CBC-%d test failed (plaintext mismatch)", cipher,
+ blocksize * 8);
+#else
+ (void)cipher; /* Not used. */
+#endif
+ return "selftest for CBC failed - see syslog for details";
+ }
+
+ if (memcmp (iv2, iv, blocksize))
+ {
+ xfree (mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-CBC-%d test failed (IV mismatch)", cipher, blocksize * 8);
+#endif
+ return "selftest for CBC failed - see syslog for details";
+ }
+
+ /* Test parallelized code paths */
+ memset (iv, 0x5f, blocksize);
+ memset (iv2, 0x5f, blocksize);
+
+ for (i = 0; i < nblocks * blocksize; i++)
+ plaintext[i] = i;
+
+ /* Create CBC ciphertext manually. */
+ for (i = 0; i < nblocks * blocksize; i+=blocksize)
+ {
+ buf_xor (&ciphertext[i], iv, &plaintext[i], blocksize);
+ encrypt_one (ctx, &ciphertext[i], &ciphertext[i]);
+ memcpy (iv, &ciphertext[i], blocksize);
+ }
+
+ /* Decrypt using bulk CBC and compare result. */
+ bulk_ops.cbc_dec (ctx, iv2, plaintext2, ciphertext, nblocks);
+
+ if (memcmp (plaintext2, plaintext, nblocks * blocksize))
+ {
+ xfree (mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-CBC-%d test failed (plaintext mismatch, parallel path)",
+ cipher, blocksize * 8);
+#endif
+ return "selftest for CBC failed - see syslog for details";
+ }
+ if (memcmp (iv2, iv, blocksize))
+ {
+ xfree (mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-CBC-%d test failed (IV mismatch, parallel path)",
+ cipher, blocksize * 8);
+#endif
+ return "selftest for CBC failed - see syslog for details";
+ }
+
+ xfree (mem);
+ return NULL;
+}
+
+/* Run the self-tests for <block cipher>-CFB-<block size>, testing bulk CFB
+ decryption. Returns NULL on success. */
+const char *
+_gcry_selftest_helper_cfb (const char *cipher, gcry_cipher_setkey_t setkey_func,
+ gcry_cipher_encrypt_t encrypt_one,
+ const int nblocks, const int blocksize,
+ const int context_size)
+{
+ cipher_bulk_ops_t bulk_ops = { 0, };
+ int i, offs;
+ unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem;
+ unsigned int ctx_aligned_size, memsize;
+
+ static const unsigned char key[16] ATTR_ALIGNED_16 = {
+ 0x11,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+ 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x33
+ };
+
+ /* Allocate buffers, aligning the first two elements to 16 bytes and the
+ rest to the block size. */
+ ctx_aligned_size = context_size + 15;
+ ctx_aligned_size -= ctx_aligned_size & 0xf;
+
+ memsize = ctx_aligned_size + (blocksize * 2) + (blocksize * nblocks * 3) + 16;
+
+ mem = xtrycalloc (1, memsize);
+ if (!mem)
+ return "failed to allocate memory";
+
+ offs = (16 - ((uintptr_t)mem & 15)) & 15;
+ ctx = (void*)(mem + offs);
+ iv = ctx + ctx_aligned_size;
+ iv2 = iv + blocksize;
+ plaintext = iv2 + blocksize;
+ plaintext2 = plaintext + nblocks * blocksize;
+ ciphertext = plaintext2 + nblocks * blocksize;
+
+ /* Initialize ctx */
+ if (setkey_func (ctx, key, sizeof(key), &bulk_ops) != GPG_ERR_NO_ERROR)
+ {
+ xfree(mem);
+ return "setkey failed";
+ }
+
+ /* Test single block code path */
+ memset(iv, 0xd3, blocksize);
+ memset(iv2, 0xd3, blocksize);
+ for (i = 0; i < blocksize; i++)
+ plaintext[i] = i;
+
+ /* CFB manually. */
+ encrypt_one (ctx, ciphertext, iv);
+ buf_xor_2dst (iv, ciphertext, plaintext, blocksize);
+
+ /* CFB decrypt. */
+ bulk_ops.cfb_dec (ctx, iv2, plaintext2, ciphertext, 1);
+ if (memcmp(plaintext2, plaintext, blocksize))
+ {
+ xfree(mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-CFB-%d test failed (plaintext mismatch)", cipher,
+ blocksize * 8);
+#else
+ (void)cipher; /* Not used. */
+#endif
+ return "selftest for CFB failed - see syslog for details";
+ }
+
+ if (memcmp(iv2, iv, blocksize))
+ {
+ xfree(mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-CFB-%d test failed (IV mismatch)", cipher, blocksize * 8);
+#endif
+ return "selftest for CFB failed - see syslog for details";
+ }
+
+ /* Test parallelized code paths */
+ memset(iv, 0xe6, blocksize);
+ memset(iv2, 0xe6, blocksize);
+
+ for (i = 0; i < nblocks * blocksize; i++)
+ plaintext[i] = i;
+
+ /* Create CFB ciphertext manually. */
+ for (i = 0; i < nblocks * blocksize; i+=blocksize)
+ {
+ encrypt_one (ctx, &ciphertext[i], iv);
+ buf_xor_2dst (iv, &ciphertext[i], &plaintext[i], blocksize);
+ }
+
+ /* Decrypt using bulk CFB and compare result. */
+ bulk_ops.cfb_dec (ctx, iv2, plaintext2, ciphertext, nblocks);
+
+ if (memcmp(plaintext2, plaintext, nblocks * blocksize))
+ {
+ xfree(mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-CFB-%d test failed (plaintext mismatch, parallel path)",
+ cipher, blocksize * 8);
+#endif
+ return "selftest for CFB failed - see syslog for details";
+ }
+ if (memcmp(iv2, iv, blocksize))
+ {
+ xfree(mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-CFB-%d test failed (IV mismatch, parallel path)", cipher,
+ blocksize * 8);
+#endif
+ return "selftest for CFB failed - see syslog for details";
+ }
+
+ xfree(mem);
+ return NULL;
+}
+
+/* Run the self-tests for <block cipher>-CTR-<block size>, testing the IV
+ increment of bulk CTR encryption. Returns NULL on success. */
+const char *
+_gcry_selftest_helper_ctr (const char *cipher, gcry_cipher_setkey_t setkey_func,
+ gcry_cipher_encrypt_t encrypt_one,
+ const int nblocks, const int blocksize,
+ const int context_size)
+{
+ cipher_bulk_ops_t bulk_ops = { 0, };
+ int i, j, offs, diff;
+ unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *ciphertext2,
+ *iv, *iv2, *mem;
+ unsigned int ctx_aligned_size, memsize;
+
+ static const unsigned char key[16] ATTR_ALIGNED_16 = {
+ 0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+ 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21
+ };
+
+ /* Allocate buffers, aligning the first two elements to 16 bytes and the
+ rest to the block size. */
+ ctx_aligned_size = context_size + 15;
+ ctx_aligned_size -= ctx_aligned_size & 0xf;
+
+ memsize = ctx_aligned_size + (blocksize * 2) + (blocksize * nblocks * 4) + 16;
+
+ mem = xtrycalloc (1, memsize);
+ if (!mem)
+ return "failed to allocate memory";
+
+ offs = (16 - ((uintptr_t)mem & 15)) & 15;
+ ctx = (void*)(mem + offs);
+ iv = ctx + ctx_aligned_size;
+ iv2 = iv + blocksize;
+ plaintext = iv2 + blocksize;
+ plaintext2 = plaintext + nblocks * blocksize;
+ ciphertext = plaintext2 + nblocks * blocksize;
+ ciphertext2 = ciphertext + nblocks * blocksize;
+
+ /* Initialize ctx */
+ if (setkey_func (ctx, key, sizeof(key), &bulk_ops) != GPG_ERR_NO_ERROR)
+ {
+ xfree(mem);
+ return "setkey failed";
+ }
+
+ /* Test single block code path */
+ memset (iv, 0xff, blocksize);
+ for (i = 0; i < blocksize; i++)
+ plaintext[i] = i;
+
+ /* CTR manually. */
+ encrypt_one (ctx, ciphertext, iv);
+ for (i = 0; i < blocksize; i++)
+ ciphertext[i] ^= plaintext[i];
+ for (i = blocksize; i > 0; i--)
+ {
+ iv[i-1]++;
+ if (iv[i-1])
+ break;
+ }
+
+ memset (iv2, 0xff, blocksize);
+ bulk_ops.ctr_enc (ctx, iv2, plaintext2, ciphertext, 1);
+
+ if (memcmp (plaintext2, plaintext, blocksize))
+ {
+ xfree (mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-CTR-%d test failed (plaintext mismatch)", cipher,
+ blocksize * 8);
+#else
+ (void)cipher; /* Not used. */
+#endif
+ return "selftest for CTR failed - see syslog for details";
+ }
+
+ if (memcmp (iv2, iv, blocksize))
+ {
+ xfree (mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-CTR-%d test failed (IV mismatch)", cipher,
+ blocksize * 8);
+#endif
+ return "selftest for CTR failed - see syslog for details";
+ }
+
+ /* Test bulk encryption with typical IV. */
+ memset(iv, 0x57, blocksize-4);
+ iv[blocksize-1] = 1;
+ iv[blocksize-2] = 0;
+ iv[blocksize-3] = 0;
+ iv[blocksize-4] = 0;
+ memset(iv2, 0x57, blocksize-4);
+ iv2[blocksize-1] = 1;
+ iv2[blocksize-2] = 0;
+ iv2[blocksize-3] = 0;
+ iv2[blocksize-4] = 0;
+
+ for (i = 0; i < blocksize * nblocks; i++)
+ plaintext2[i] = plaintext[i] = i;
+
+ /* Create CTR ciphertext manually. */
+ for (i = 0; i < blocksize * nblocks; i+=blocksize)
+ {
+ encrypt_one (ctx, &ciphertext[i], iv);
+ for (j = 0; j < blocksize; j++)
+ ciphertext[i+j] ^= plaintext[i+j];
+ for (j = blocksize; j > 0; j--)
+ {
+ iv[j-1]++;
+ if (iv[j-1])
+ break;
+ }
+ }
+
+ bulk_ops.ctr_enc (ctx, iv2, ciphertext2, plaintext2, nblocks);
+
+ if (memcmp (ciphertext2, ciphertext, blocksize * nblocks))
+ {
+ xfree (mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-CTR-%d test failed (ciphertext mismatch, bulk)", cipher,
+ blocksize * 8);
+#endif
+ return "selftest for CTR failed - see syslog for details";
+ }
+ if (memcmp(iv2, iv, blocksize))
+ {
+ xfree (mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-CTR-%d test failed (IV mismatch, bulk)", cipher,
+ blocksize * 8);
+#endif
+ return "selftest for CTR failed - see syslog for details";
+ }
+
+ /* Test parallelized code paths (check counter overflow handling) */
+ for (diff = 0; diff < nblocks; diff++) {
+ memset(iv, 0xff, blocksize);
+ iv[blocksize-1] -= diff;
+ iv[0] = iv[1] = 0;
+ iv[2] = 0x07;
+
+ for (i = 0; i < blocksize * nblocks; i++)
+ plaintext[i] = i;
+
+ /* Create CTR ciphertext manually. */
+ for (i = 0; i < blocksize * nblocks; i+=blocksize)
+ {
+ encrypt_one (ctx, &ciphertext[i], iv);
+ for (j = 0; j < blocksize; j++)
+ ciphertext[i+j] ^= plaintext[i+j];
+ for (j = blocksize; j > 0; j--)
+ {
+ iv[j-1]++;
+ if (iv[j-1])
+ break;
+ }
+ }
+
+ /* Decrypt using bulk CTR and compare result. */
+ memset(iv2, 0xff, blocksize);
+ iv2[blocksize-1] -= diff;
+ iv2[0] = iv2[1] = 0;
+ iv2[2] = 0x07;
+
+ bulk_ops.ctr_enc (ctx, iv2, plaintext2, ciphertext, nblocks);
+
+ if (memcmp (plaintext2, plaintext, blocksize * nblocks))
+ {
+ xfree (mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-CTR-%d test failed (plaintext mismatch, diff: %d)", cipher,
+ blocksize * 8, diff);
+#endif
+ return "selftest for CTR failed - see syslog for details";
+ }
+ if (memcmp(iv2, iv, blocksize))
+ {
+ xfree (mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-CTR-%d test failed (IV mismatch, diff: %d)", cipher,
+ blocksize * 8, diff);
+#endif
+ return "selftest for CTR failed - see syslog for details";
+ }
+ }
+
+ xfree (mem);
+ return NULL;
+}
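
A cipher module typically calls these helpers from its own selftest hook, passing its setkey and single-block encrypt functions together with the block count its widest bulk path handles. A sketch of such a call site (illustrative; my_cipher_context_t, my_setkey and my_encrypt_block are hypothetical stand-ins for the module's real type and functions):

    #include "cipher-selftest.h"

    static const char *
    selftest_cbc_bulk (void)
    {
      const int nblocks   = 8;   /* widest parallel code path to cover */
      const int blocksize = 16;
      const int ctxsize   = sizeof (my_cipher_context_t); /* hypothetical type */

      /* Returns NULL on success, or a static error string on failure.  */
      return _gcry_selftest_helper_cbc ("MYCIPHER",
                                        my_setkey, my_encrypt_block,
                                        nblocks, blocksize, ctxsize);
    }
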
diff --git a/comm/third_party/libgcrypt/cipher/cipher-selftest.h b/comm/third_party/libgcrypt/cipher/cipher-selftest.h
new file mode 100644
index 0000000000..c3090ad122
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-selftest.h
@@ -0,0 +1,69 @@
+/* cipher-selftest.h - Helper functions for bulk encryption selftests.
+ * Copyright (C) 2013,2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef G10_SELFTEST_HELP_H
+#define G10_SELFTEST_HELP_H
+
+#include <config.h>
+#include "types.h"
+#include "g10lib.h"
+#include "cipher.h"
+
+typedef void (*gcry_cipher_bulk_cbc_dec_t)(void *context, unsigned char *iv,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks);
+
+typedef void (*gcry_cipher_bulk_cfb_dec_t)(void *context, unsigned char *iv,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks);
+
+typedef void (*gcry_cipher_bulk_ctr_enc_t)(void *context, unsigned char *iv,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks);
+
+/* Helper function to allocate an aligned context for selftests. */
+void *_gcry_cipher_selftest_alloc_ctx (const int context_size,
+ unsigned char **r_mem);
+
+
+/* Helper function for bulk CBC decryption selftest */
+const char *
+_gcry_selftest_helper_cbc (const char *cipher, gcry_cipher_setkey_t setkey,
+ gcry_cipher_encrypt_t encrypt_one,
+ const int nblocks, const int blocksize,
+ const int context_size);
+
+/* Helper function for bulk CFB decryption selftest */
+const char *
+_gcry_selftest_helper_cfb (const char *cipher, gcry_cipher_setkey_t setkey,
+ gcry_cipher_encrypt_t encrypt_one,
+ const int nblocks, const int blocksize,
+ const int context_size);
+
+/* Helper function for bulk CTR encryption selftest */
+const char *
+_gcry_selftest_helper_ctr (const char *cipher, gcry_cipher_setkey_t setkey,
+ gcry_cipher_encrypt_t encrypt_one,
+ const int nblocks, const int blocksize,
+ const int context_size);
+
+#endif /*G10_SELFTEST_HELP_H*/
diff --git a/comm/third_party/libgcrypt/cipher/cipher-xts.c b/comm/third_party/libgcrypt/cipher/cipher-xts.c
new file mode 100644
index 0000000000..0522a271a1
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher-xts.c
@@ -0,0 +1,189 @@
+/* cipher-xts.c - XTS mode implementation
+ * Copyright (C) 2017 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "./cipher-internal.h"
+
+
+static inline void xts_gfmul_byA (unsigned char *out, const unsigned char *in)
+{
+ u64 hi = buf_get_le64 (in + 8);
+ u64 lo = buf_get_le64 (in + 0);
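+ /* If the top bit of the 128-bit value is set, doubling it wraps and must
+ be reduced by the XTS polynomial x^128 + x^7 + x^2 + x + 1, whose low
+ byte is 0x87. */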
+ u64 carry = -(hi >> 63) & 0x87;
+
+ hi = (hi << 1) + (lo >> 63);
+ lo = (lo << 1) ^ carry;
+
+ buf_put_le64 (out + 8, hi);
+ buf_put_le64 (out + 0, lo);
+}
+
+
+static inline void xts_inc128 (unsigned char *seqno)
+{
+ u64 lo = buf_get_le64 (seqno + 0);
+ u64 hi = buf_get_le64 (seqno + 8);
+
+ hi += !(++lo);
+
+ buf_put_le64 (seqno + 0, lo);
+ buf_put_le64 (seqno + 8, hi);
+}
+
+
+gcry_err_code_t
+_gcry_cipher_xts_crypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen,
+ int encrypt)
+{
+ gcry_cipher_encrypt_t tweak_fn = c->spec->encrypt;
+ gcry_cipher_encrypt_t crypt_fn =
+ encrypt ? c->spec->encrypt : c->spec->decrypt;
+ union
+ {
+ cipher_context_alignment_t xcx;
+ byte x1[GCRY_XTS_BLOCK_LEN];
+ u64 x64[GCRY_XTS_BLOCK_LEN / sizeof(u64)];
+ } tmp;
+ unsigned int burn, nburn;
+ size_t nblocks;
+
+ if (c->spec->blocksize != GCRY_XTS_BLOCK_LEN)
+ return GPG_ERR_CIPHER_ALGO;
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+ if (inbuflen < GCRY_XTS_BLOCK_LEN)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+
+ /* Data-unit max length: 2^20 blocks. */
+ if (inbuflen > GCRY_XTS_BLOCK_LEN << 20)
+ return GPG_ERR_INV_LENGTH;
+
+ nblocks = inbuflen / GCRY_XTS_BLOCK_LEN;
+ nblocks -= !encrypt && (inbuflen % GCRY_XTS_BLOCK_LEN) != 0;
+
+ /* Generate first tweak value. */
+ burn = tweak_fn (c->u_mode.xts.tweak_context, c->u_ctr.ctr, c->u_iv.iv);
+
+ /* Use a bulk method if available. */
+ if (nblocks && c->bulk.xts_crypt)
+ {
+ c->bulk.xts_crypt (&c->context.c, c->u_ctr.ctr, outbuf, inbuf, nblocks,
+ encrypt);
+ inbuf += nblocks * GCRY_XTS_BLOCK_LEN;
+ outbuf += nblocks * GCRY_XTS_BLOCK_LEN;
+ inbuflen -= nblocks * GCRY_XTS_BLOCK_LEN;
+ nblocks = 0;
+ }
+
+ /* If we don't have a bulk method, use the standard method. We also
+ use this method for a remaining partial block. */
+
+ while (nblocks)
+ {
+ /* Xor-Encrypt/Decrypt-Xor block. */
+ cipher_block_xor (tmp.x64, inbuf, c->u_ctr.ctr, GCRY_XTS_BLOCK_LEN);
+ nburn = crypt_fn (&c->context.c, tmp.x1, tmp.x1);
+ burn = nburn > burn ? nburn : burn;
+ cipher_block_xor (outbuf, tmp.x64, c->u_ctr.ctr, GCRY_XTS_BLOCK_LEN);
+
+ outbuf += GCRY_XTS_BLOCK_LEN;
+ inbuf += GCRY_XTS_BLOCK_LEN;
+ inbuflen -= GCRY_XTS_BLOCK_LEN;
+ nblocks--;
+
+ /* Generate next tweak. */
+ xts_gfmul_byA (c->u_ctr.ctr, c->u_ctr.ctr);
+ }
+
+ /* Handle remaining data with ciphertext stealing. */
+ if (inbuflen)
+ {
+ if (!encrypt)
+ {
+ gcry_assert (inbuflen > GCRY_XTS_BLOCK_LEN);
+ gcry_assert (inbuflen < GCRY_XTS_BLOCK_LEN * 2);
+
+ /* Generate last tweak. */
+ xts_gfmul_byA (tmp.x1, c->u_ctr.ctr);
+
+ /* Decrypt last block first. */
+ cipher_block_xor (outbuf, inbuf, tmp.x64, GCRY_XTS_BLOCK_LEN);
+ nburn = crypt_fn (&c->context.c, outbuf, outbuf);
+ burn = nburn > burn ? nburn : burn;
+ cipher_block_xor (outbuf, outbuf, tmp.x64, GCRY_XTS_BLOCK_LEN);
+
+ inbuflen -= GCRY_XTS_BLOCK_LEN;
+ inbuf += GCRY_XTS_BLOCK_LEN;
+ outbuf += GCRY_XTS_BLOCK_LEN;
+ }
+
+ gcry_assert (inbuflen < GCRY_XTS_BLOCK_LEN);
+ outbuf -= GCRY_XTS_BLOCK_LEN;
+
+ /* Steal ciphertext from previous block. */
+ cipher_block_cpy (tmp.x64, outbuf, GCRY_XTS_BLOCK_LEN);
+ buf_cpy (tmp.x64, inbuf, inbuflen);
+ buf_cpy (outbuf + GCRY_XTS_BLOCK_LEN, outbuf, inbuflen);
+
+ /* Decrypt/Encrypt last block. */
+ cipher_block_xor (tmp.x64, tmp.x64, c->u_ctr.ctr, GCRY_XTS_BLOCK_LEN);
+ nburn = crypt_fn (&c->context.c, tmp.x1, tmp.x1);
+ burn = nburn > burn ? nburn : burn;
+ cipher_block_xor (outbuf, tmp.x64, c->u_ctr.ctr, GCRY_XTS_BLOCK_LEN);
+ }
+
+ /* Auto-increment data-unit sequence number */
+ xts_inc128 (c->u_iv.iv);
+
+ wipememory (&tmp, sizeof(tmp));
+ wipememory (c->u_ctr.ctr, sizeof(c->u_ctr.ctr));
+
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4 * sizeof(void *));
+
+ return 0;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_xts_encrypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen)
+{
+ return _gcry_cipher_xts_crypt (c, outbuf, outbuflen, inbuf, inbuflen, 1);
+}
+
+
+gcry_err_code_t
+_gcry_cipher_xts_decrypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen)
+{
+ return _gcry_cipher_xts_crypt (c, outbuf, outbuflen, inbuf, inbuflen, 0);
+}
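
Seen from the caller, the XTS tweak key is simply appended to the data key, and the 16-byte IV carries the data-unit (sector) number that xts_inc128 above auto-increments after each call. A sector-encryption sketch with AES-256-XTS (illustrative; xts_encrypt_sector_example is a hypothetical name, sector_len must be at least one 16-byte block, error handling abbreviated):

    #include <gcrypt.h>

    static gcry_error_t
    xts_encrypt_sector_example (const unsigned char key[64], /* data key || tweak key */
                                unsigned long long sector_no,
                                unsigned char *sector, size_t sector_len)
    {
      unsigned char tweak[16] = { 0 };
      gcry_cipher_hd_t hd;
      gcry_error_t err;
      int i;

      /* Little-endian data-unit number, matching xts_inc128 above.  */
      for (i = 0; i < 8; i++)
        tweak[i] = (sector_no >> (8 * i)) & 0xff;

      err = gcry_cipher_open (&hd, GCRY_CIPHER_AES256, GCRY_CIPHER_MODE_XTS, 0);
      if (err)
        return err;

      if (!(err = gcry_cipher_setkey (hd, key, 64))
          && !(err = gcry_cipher_setiv (hd, tweak, sizeof tweak)))
        err = gcry_cipher_encrypt (hd, sector, sector_len, NULL, 0);

      gcry_cipher_close (hd);
      return err;
    }
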
diff --git a/comm/third_party/libgcrypt/cipher/cipher.c b/comm/third_party/libgcrypt/cipher/cipher.c
new file mode 100644
index 0000000000..1039dff728
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/cipher.c
@@ -0,0 +1,1767 @@
+/* cipher.c - cipher dispatcher
+ * Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003
+ * 2005, 2007, 2008, 2009, 2011 Free Software Foundation, Inc.
+ * Copyright (C) 2013 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "../src/gcrypt-testapi.h"
+#include "cipher.h"
+#include "./cipher-internal.h"
+
+
+/* This is the list of the default ciphers, which are included in
+ libgcrypt. */
+static gcry_cipher_spec_t * const cipher_list[] =
+ {
+#if USE_BLOWFISH
+ &_gcry_cipher_spec_blowfish,
+#endif
+#if USE_DES
+ &_gcry_cipher_spec_des,
+ &_gcry_cipher_spec_tripledes,
+#endif
+#if USE_ARCFOUR
+ &_gcry_cipher_spec_arcfour,
+#endif
+#if USE_CAST5
+ &_gcry_cipher_spec_cast5,
+#endif
+#if USE_AES
+ &_gcry_cipher_spec_aes,
+ &_gcry_cipher_spec_aes192,
+ &_gcry_cipher_spec_aes256,
+#endif
+#if USE_TWOFISH
+ &_gcry_cipher_spec_twofish,
+ &_gcry_cipher_spec_twofish128,
+#endif
+#if USE_SERPENT
+ &_gcry_cipher_spec_serpent128,
+ &_gcry_cipher_spec_serpent192,
+ &_gcry_cipher_spec_serpent256,
+#endif
+#if USE_RFC2268
+ &_gcry_cipher_spec_rfc2268_40,
+ &_gcry_cipher_spec_rfc2268_128,
+#endif
+#if USE_SEED
+ &_gcry_cipher_spec_seed,
+#endif
+#if USE_CAMELLIA
+ &_gcry_cipher_spec_camellia128,
+ &_gcry_cipher_spec_camellia192,
+ &_gcry_cipher_spec_camellia256,
+#endif
+#ifdef USE_IDEA
+ &_gcry_cipher_spec_idea,
+#endif
+#if USE_SALSA20
+ &_gcry_cipher_spec_salsa20,
+ &_gcry_cipher_spec_salsa20r12,
+#endif
+#if USE_GOST28147
+ &_gcry_cipher_spec_gost28147,
+ &_gcry_cipher_spec_gost28147_mesh,
+#endif
+#if USE_CHACHA20
+ &_gcry_cipher_spec_chacha20,
+#endif
+#if USE_SM4
+ &_gcry_cipher_spec_sm4,
+#endif
+ NULL
+ };
+
+/* Cipher implementations starting with index 0 (enum gcry_cipher_algos) */
+static gcry_cipher_spec_t * const cipher_list_algo0[] =
+ {
+ NULL, /* GCRY_CIPHER_NONE */
+#ifdef USE_IDEA
+ &_gcry_cipher_spec_idea,
+#else
+ NULL,
+#endif
+#if USE_DES
+ &_gcry_cipher_spec_tripledes,
+#else
+ NULL,
+#endif
+#if USE_CAST5
+ &_gcry_cipher_spec_cast5,
+#else
+ NULL,
+#endif
+#if USE_BLOWFISH
+ &_gcry_cipher_spec_blowfish,
+#else
+ NULL,
+#endif
+ NULL, /* GCRY_CIPHER_SAFER_SK128 */
+ NULL, /* GCRY_CIPHER_DES_SK */
+#if USE_AES
+ &_gcry_cipher_spec_aes,
+ &_gcry_cipher_spec_aes192,
+ &_gcry_cipher_spec_aes256,
+#else
+ NULL,
+ NULL,
+ NULL,
+#endif
+#if USE_TWOFISH
+ &_gcry_cipher_spec_twofish
+#else
+ NULL
+#endif
+ };
+
+/* Cipher implementations starting with index 301 (enum gcry_cipher_algos) */
+static gcry_cipher_spec_t * const cipher_list_algo301[] =
+ {
+#if USE_ARCFOUR
+ &_gcry_cipher_spec_arcfour,
+#else
+ NULL,
+#endif
+#if USE_DES
+ &_gcry_cipher_spec_des,
+#else
+ NULL,
+#endif
+#if USE_TWOFISH
+ &_gcry_cipher_spec_twofish128,
+#else
+ NULL,
+#endif
+#if USE_SERPENT
+ &_gcry_cipher_spec_serpent128,
+ &_gcry_cipher_spec_serpent192,
+ &_gcry_cipher_spec_serpent256,
+#else
+ NULL,
+ NULL,
+ NULL,
+#endif
+#if USE_RFC2268
+ &_gcry_cipher_spec_rfc2268_40,
+ &_gcry_cipher_spec_rfc2268_128,
+#else
+ NULL,
+ NULL,
+#endif
+#if USE_SEED
+ &_gcry_cipher_spec_seed,
+#else
+ NULL,
+#endif
+#if USE_CAMELLIA
+ &_gcry_cipher_spec_camellia128,
+ &_gcry_cipher_spec_camellia192,
+ &_gcry_cipher_spec_camellia256,
+#else
+ NULL,
+ NULL,
+ NULL,
+#endif
+#if USE_SALSA20
+ &_gcry_cipher_spec_salsa20,
+ &_gcry_cipher_spec_salsa20r12,
+#else
+ NULL,
+ NULL,
+#endif
+#if USE_GOST28147
+ &_gcry_cipher_spec_gost28147,
+#else
+ NULL,
+#endif
+#if USE_CHACHA20
+ &_gcry_cipher_spec_chacha20,
+#else
+ NULL,
+#endif
+#if USE_GOST28147
+ &_gcry_cipher_spec_gost28147_mesh,
+#else
+ NULL,
+#endif
+#if USE_SM4
+ &_gcry_cipher_spec_sm4,
+#else
+ NULL,
+#endif
+ };
+
+
+static void _gcry_cipher_setup_mode_ops(gcry_cipher_hd_t c, int mode);
+
+
+static int
+map_algo (int algo)
+{
+ return algo;
+}
+
+
+/* Return the spec structure for the cipher algorithm ALGO. For
+ an unknown algorithm NULL is returned. */
+static gcry_cipher_spec_t *
+spec_from_algo (int algo)
+{
+ gcry_cipher_spec_t *spec = NULL;
+
+ algo = map_algo (algo);
+
+ if (algo >= 0 && algo < DIM(cipher_list_algo0))
+ spec = cipher_list_algo0[algo];
+ else if (algo >= 301 && algo < 301 + DIM(cipher_list_algo301))
+ spec = cipher_list_algo301[algo - 301];
+
+ if (spec)
+ gcry_assert (spec->algo == algo);
+
+ return spec;
+}
+
+
+/* Lookup a cipher's spec by its name. */
+static gcry_cipher_spec_t *
+spec_from_name (const char *name)
+{
+ gcry_cipher_spec_t *spec;
+ int idx;
+ const char **aliases;
+
+ for (idx=0; (spec = cipher_list[idx]); idx++)
+ {
+ if (!stricmp (name, spec->name))
+ return spec;
+ if (spec->aliases)
+ {
+ for (aliases = spec->aliases; *aliases; aliases++)
+ if (!stricmp (name, *aliases))
+ return spec;
+ }
+ }
+
+ return NULL;
+}
+
+
+/* Lookup a cipher's spec by its OID. */
+static gcry_cipher_spec_t *
+spec_from_oid (const char *oid)
+{
+ gcry_cipher_spec_t *spec;
+ gcry_cipher_oid_spec_t *oid_specs;
+ int idx, j;
+
+ for (idx=0; (spec = cipher_list[idx]); idx++)
+ {
+ oid_specs = spec->oids;
+ if (oid_specs)
+ {
+ for (j = 0; oid_specs[j].oid; j++)
+ if (!stricmp (oid, oid_specs[j].oid))
+ return spec;
+ }
+ }
+
+ return NULL;
+}
+
+
+/* Locate the OID in the oid table and return the spec or NULL if not
+ found. An optional "oid." or "OID." prefix in OID is ignored; the
+ OID is expected to be in standard IETF dotted notation. A pointer
+ to the OID specification of the module implementing this algorithm
+ is returned in OID_SPEC unless that is passed as NULL. */
+static gcry_cipher_spec_t *
+search_oid (const char *oid, gcry_cipher_oid_spec_t *oid_spec)
+{
+ gcry_cipher_spec_t *spec;
+ int i;
+
+ if (!oid)
+ return NULL;
+
+ if (!strncmp (oid, "oid.", 4) || !strncmp (oid, "OID.", 4))
+ oid += 4;
+
+ spec = spec_from_oid (oid);
+ if (spec && spec->oids)
+ {
+ for (i = 0; spec->oids[i].oid; i++)
+ if (!stricmp (oid, spec->oids[i].oid))
+ {
+ if (oid_spec)
+ *oid_spec = spec->oids[i];
+ return spec;
+ }
+ }
+
+ return NULL;
+}
+
+
+/* Map STRING to the cipher algorithm identifier. Returns the
+ algorithm ID of the cipher for the given name or 0 if the name is
+ not known. It is valid to pass NULL for STRING which results in a
+ return value of 0. */
+int
+_gcry_cipher_map_name (const char *string)
+{
+ gcry_cipher_spec_t *spec;
+
+ if (!string)
+ return 0;
+
+ /* If the string starts with a digit (optionally prefixed with
+ either "OID." or "oid."), we first look into our table of ASN.1
+ object identifiers to figure out the algorithm */
+
+ spec = search_oid (string, NULL);
+ if (spec)
+ return spec->algo;
+
+ spec = spec_from_name (string);
+ if (spec)
+ return spec->algo;
+
+ return 0;
+}
+
+
+/* Given a STRING with an OID in dotted decimal notation, this
+ function returns the cipher mode (GCRY_CIPHER_MODE_*) associated
+ with that OID or 0 if no mode is known. Passing NULL for string
+ yields a return value of 0. */
+int
+_gcry_cipher_mode_from_oid (const char *string)
+{
+ gcry_cipher_spec_t *spec;
+ gcry_cipher_oid_spec_t oid_spec;
+
+ if (!string)
+ return 0;
+
+ spec = search_oid (string, &oid_spec);
+ if (spec)
+ return oid_spec.mode;
+
+ return 0;
+}
+
+
+/* Map the cipher algorithm identifier ALGORITHM to a string
+ representing this algorithm. This string is the default name as
+ used by Libgcrypt. A "?" is returned for an unknown algorithm.
+ NULL is never returned. */
+const char *
+_gcry_cipher_algo_name (int algorithm)
+{
+ gcry_cipher_spec_t *spec;
+
+ spec = spec_from_algo (algorithm);
+ return spec? spec->name : "?";
+}
+
+
+/* Flag the cipher algorithm with the identifier ALGORITHM as
+ disabled. There is no error return, the function does nothing for
+ unknown algorithms. Disabled algorithms are virtually not
+ available in Libgcrypt. This is not thread safe and should thus be
+ called early. */
+static void
+disable_cipher_algo (int algo)
+{
+ gcry_cipher_spec_t *spec = spec_from_algo (algo);
+
+ if (spec)
+ spec->flags.disabled = 1;
+}
+
+
+/* Return 0 if the cipher algorithm with identifier ALGORITHM is
+ available. Returns a basic error code value if it is not
+ available. */
+static gcry_err_code_t
+check_cipher_algo (int algorithm)
+{
+ gcry_cipher_spec_t *spec;
+
+ spec = spec_from_algo (algorithm);
+ if (spec && !spec->flags.disabled)
+ return 0;
+
+ return GPG_ERR_CIPHER_ALGO;
+}
+
+
+/* Return the standard length in bits of the key for the cipher
+ algorithm with the identifier ALGORITHM. */
+static unsigned int
+cipher_get_keylen (int algorithm)
+{
+ gcry_cipher_spec_t *spec;
+ unsigned len = 0;
+
+ spec = spec_from_algo (algorithm);
+ if (spec)
+ {
+ len = spec->keylen;
+ if (!len)
+ log_bug ("cipher %d w/o key length\n", algorithm);
+ }
+
+ return len;
+}
+
+
+/* Return the block length of the cipher algorithm with the identifier
+ ALGORITHM. This function returns 0 for an invalid algorithm. */
+static unsigned int
+cipher_get_blocksize (int algorithm)
+{
+ gcry_cipher_spec_t *spec;
+ unsigned len = 0;
+
+ spec = spec_from_algo (algorithm);
+ if (spec)
+ {
+ len = spec->blocksize;
+ if (!len)
+ log_bug ("cipher %d w/o blocksize\n", algorithm);
+ }
+
+ return len;
+}
+
+
+/*
+ Open a cipher handle for use with cipher algorithm ALGORITHM, using
+ the cipher mode MODE (one of the GCRY_CIPHER_MODE_*) and return a
+ handle in HANDLE. Put NULL into HANDLE and return an error code if
+ something goes wrong. FLAGS may be used to modify the
+ operation. The defined flags are:
+
+ GCRY_CIPHER_SECURE: allocate all internal buffers in secure memory.
+ GCRY_CIPHER_ENABLE_SYNC: Enable the sync operation as used in OpenPGP.
+ GCRY_CIPHER_CBC_CTS: Enable CTS mode.
+ GCRY_CIPHER_CBC_MAC: Enable MAC mode.
+
+ Values for these flags may be combined using OR.
+ */
+gcry_err_code_t
+_gcry_cipher_open (gcry_cipher_hd_t *handle,
+ int algo, int mode, unsigned int flags)
+{
+ gcry_err_code_t rc;
+ gcry_cipher_hd_t h = NULL;
+
+ if (mode >= GCRY_CIPHER_MODE_INTERNAL)
+ rc = GPG_ERR_INV_CIPHER_MODE;
+ else
+ rc = _gcry_cipher_open_internal (&h, algo, mode, flags);
+
+ *handle = rc ? NULL : h;
+
+ return rc;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_open_internal (gcry_cipher_hd_t *handle,
+ int algo, int mode, unsigned int flags)
+{
+ int secure = (flags & GCRY_CIPHER_SECURE);
+ gcry_cipher_spec_t *spec;
+ gcry_cipher_hd_t h = NULL;
+ gcry_err_code_t err;
+
+ /* If the application forgot to call the random poll function, we do
+ it here to ensure that it is run once in a while. */
+ _gcry_fast_random_poll ();
+
+ spec = spec_from_algo (algo);
+ if (!spec)
+ err = GPG_ERR_CIPHER_ALGO;
+ else if (spec->flags.disabled)
+ err = GPG_ERR_CIPHER_ALGO;
+ else
+ err = 0;
+
+ /* check flags */
+ if ((! err)
+ && ((flags & ~(0
+ | GCRY_CIPHER_SECURE
+ | GCRY_CIPHER_ENABLE_SYNC
+ | GCRY_CIPHER_CBC_CTS
+ | GCRY_CIPHER_CBC_MAC))
+ || ((flags & GCRY_CIPHER_CBC_CTS) && (flags & GCRY_CIPHER_CBC_MAC))))
+ err = GPG_ERR_CIPHER_ALGO;
+
+ /* check that a valid mode has been requested */
+ if (! err)
+ switch (mode)
+ {
+ case GCRY_CIPHER_MODE_CCM:
+ if (spec->blocksize != GCRY_CCM_BLOCK_LEN)
+ err = GPG_ERR_INV_CIPHER_MODE;
+ if (!spec->encrypt || !spec->decrypt)
+ err = GPG_ERR_INV_CIPHER_MODE;
+ break;
+
+ case GCRY_CIPHER_MODE_XTS:
+ if (spec->blocksize != GCRY_XTS_BLOCK_LEN)
+ err = GPG_ERR_INV_CIPHER_MODE;
+ if (!spec->encrypt || !spec->decrypt)
+ err = GPG_ERR_INV_CIPHER_MODE;
+ break;
+
+ case GCRY_CIPHER_MODE_ECB:
+ case GCRY_CIPHER_MODE_CBC:
+ case GCRY_CIPHER_MODE_CFB:
+ case GCRY_CIPHER_MODE_CFB8:
+ case GCRY_CIPHER_MODE_OFB:
+ case GCRY_CIPHER_MODE_CTR:
+ case GCRY_CIPHER_MODE_AESWRAP:
+ case GCRY_CIPHER_MODE_CMAC:
+ case GCRY_CIPHER_MODE_EAX:
+ case GCRY_CIPHER_MODE_GCM:
+ if (!spec->encrypt || !spec->decrypt)
+ err = GPG_ERR_INV_CIPHER_MODE;
+ break;
+
+ case GCRY_CIPHER_MODE_POLY1305:
+ if (!spec->stencrypt || !spec->stdecrypt || !spec->setiv)
+ err = GPG_ERR_INV_CIPHER_MODE;
+ else if (spec->algo != GCRY_CIPHER_CHACHA20)
+ err = GPG_ERR_INV_CIPHER_MODE;
+ break;
+
+ case GCRY_CIPHER_MODE_OCB:
+ /* Note that our implementation allows only for 128 bit block
+ length algorithms. Lower block lengths would be possible
+ but we do not implement them because they limit the
+ security too much. */
+ if (!spec->encrypt || !spec->decrypt)
+ err = GPG_ERR_INV_CIPHER_MODE;
+ else if (spec->blocksize != (128/8))
+ err = GPG_ERR_INV_CIPHER_MODE;
+ break;
+
+ case GCRY_CIPHER_MODE_STREAM:
+ if (!spec->stencrypt || !spec->stdecrypt)
+ err = GPG_ERR_INV_CIPHER_MODE;
+ break;
+
+ case GCRY_CIPHER_MODE_NONE:
+ /* This mode may be used for debugging. It copies the plaintext
+ verbatim to the ciphertext. We do not allow this in fips mode
+ or if no debug flag has been set. */
+ if (fips_mode () || !_gcry_get_debug_flag (0))
+ err = GPG_ERR_INV_CIPHER_MODE;
+ break;
+
+ default:
+ err = GPG_ERR_INV_CIPHER_MODE;
+ }
+
+ /* Perform selftest here and mark this with a flag in cipher_table?
+ No, we should not do this as it takes too long. Further it does
+ not make sense to exclude algorithms with failing selftests at
+ runtime: If a selftest fails there is something seriously wrong
+ with the system and thus we better die immediately. */
+
+ if (! err)
+ {
+ size_t size = (sizeof (*h)
+ + 2 * spec->contextsize
+ - sizeof (cipher_context_alignment_t)
+#ifdef NEED_16BYTE_ALIGNED_CONTEXT
+ + 15 /* Space for leading alignment gap. */
+#endif /*NEED_16BYTE_ALIGNED_CONTEXT*/
+ );
+
+ /* Space needed per mode. */
+ switch (mode)
+ {
+ case GCRY_CIPHER_MODE_XTS:
+ /* Additional cipher context for tweak. */
+ size += 2 * spec->contextsize + 15;
+ break;
+
+ default:
+ break;
+ }
+
+ if (secure)
+ h = xtrycalloc_secure (1, size);
+ else
+ h = xtrycalloc (1, size);
+
+ if (! h)
+ err = gpg_err_code_from_syserror ();
+ else
+ {
+ size_t off = 0;
+ char *tc;
+
+#ifdef NEED_16BYTE_ALIGNED_CONTEXT
+ if ( ((uintptr_t)h & 0x0f) )
+ {
+ /* The malloced block is not aligned on a 16 byte
+ boundary. Correct for this. */
+ off = 16 - ((uintptr_t)h & 0x0f);
+ h = (void*)((char*)h + off);
+ }
+#endif /*NEED_16BYTE_ALIGNED_CONTEXT*/
+
+ h->magic = secure ? CTX_MAGIC_SECURE : CTX_MAGIC_NORMAL;
+ h->actual_handle_size = size - off;
+ h->handle_offset = off;
+ h->spec = spec;
+ h->algo = algo;
+ h->mode = mode;
+ h->flags = flags;
+
+ /* Setup mode routines. */
+ _gcry_cipher_setup_mode_ops(h, mode);
+
+ /* Setup defaults depending on the mode. */
+ switch (mode)
+ {
+ case GCRY_CIPHER_MODE_OCB:
+ h->u_mode.ocb.taglen = 16; /* Bytes. */
+ break;
+
+ case GCRY_CIPHER_MODE_XTS:
+ tc = h->context.c + spec->contextsize * 2;
+ tc += (16 - (uintptr_t)tc % 16) % 16;
+ h->u_mode.xts.tweak_context = tc;
+
+ break;
+
+ default:
+ break;
+ }
+ }
+ }
+
+ /* Done. */
+
+ *handle = err ? NULL : h;
+
+ return err;
+}
+
+
+/* Release all resources associated with the cipher handle H. H may be
+ NULL in which case this is a no-operation. */
+void
+_gcry_cipher_close (gcry_cipher_hd_t h)
+{
+ size_t off;
+
+ if (!h)
+ return;
+
+ if ((h->magic != CTX_MAGIC_SECURE)
+ && (h->magic != CTX_MAGIC_NORMAL))
+ _gcry_fatal_error(GPG_ERR_INTERNAL,
+ "gcry_cipher_close: already closed/invalid handle");
+ else
+ h->magic = 0;
+
+ /* We always want to wipe out the memory even when the context has
+ been allocated in secure memory. The user might have disabled
+ secure memory or is using his own implementation which does not
+ do the wiping. To accomplish this we need to keep track of the
+ actual size of this structure because we have no way to know
+ how large the allocated area was when using a standard malloc. */
+ off = h->handle_offset;
+ wipememory (h, h->actual_handle_size);
+
+ xfree ((char*)h - off);
+}
+
+
+/* Set the key to be used for the encryption context C to KEY with
+ length KEYLEN. The length should match the required length. */
+static gcry_err_code_t
+cipher_setkey (gcry_cipher_hd_t c, byte *key, size_t keylen)
+{
+ gcry_err_code_t rc;
+
+ if (c->mode == GCRY_CIPHER_MODE_XTS)
+ {
+ /* XTS uses two keys. */
+ if (keylen % 2)
+ return GPG_ERR_INV_KEYLEN;
+ keylen /= 2;
+
+ if (fips_mode ())
+ {
+ /* Reject key if subkeys Key_1 and Key_2 are equal.
+ See "Implementation Guidance for FIPS 140-2, A.9 XTS-AES
+ Key Generation Requirements" for details. */
+ if (buf_eq_const (key, key + keylen, keylen))
+ return GPG_ERR_WEAK_KEY;
+ }
+ }
+
+ rc = c->spec->setkey (&c->context.c, key, keylen, &c->bulk);
+ if (!rc || (c->marks.allow_weak_key && rc == GPG_ERR_WEAK_KEY))
+ {
+ /* Duplicate initial context. */
+ memcpy ((void *) ((char *) &c->context.c + c->spec->contextsize),
+ (void *) &c->context.c,
+ c->spec->contextsize);
+ c->marks.key = 1;
+
+ switch (c->mode)
+ {
+ case GCRY_CIPHER_MODE_CMAC:
+ rc = _gcry_cipher_cmac_set_subkeys (c);
+ break;
+
+ case GCRY_CIPHER_MODE_EAX:
+ rc = _gcry_cipher_eax_setkey (c);
+ break;
+
+ case GCRY_CIPHER_MODE_GCM:
+ _gcry_cipher_gcm_setkey (c);
+ break;
+
+ case GCRY_CIPHER_MODE_OCB:
+ _gcry_cipher_ocb_setkey (c);
+ break;
+
+ case GCRY_CIPHER_MODE_POLY1305:
+ _gcry_cipher_poly1305_setkey (c);
+ break;
+
+ case GCRY_CIPHER_MODE_XTS:
+ /* Setup tweak cipher with second part of XTS key. */
+ rc = c->spec->setkey (c->u_mode.xts.tweak_context, key + keylen,
+ keylen, &c->bulk);
+ if (!rc || (c->marks.allow_weak_key && rc == GPG_ERR_WEAK_KEY))
+ {
+ /* Duplicate initial tweak context. */
+ memcpy (c->u_mode.xts.tweak_context + c->spec->contextsize,
+ c->u_mode.xts.tweak_context, c->spec->contextsize);
+ }
+ else
+ c->marks.key = 0;
+ break;
+
+ default:
+ break;
+ };
+ }
+ else
+ c->marks.key = 0;
+
+ return rc;
+}
+
+
+/* Set the IV to be used for the encryption context C to IV with
+ length IVLEN. The length should match the required length. */
+static gcry_err_code_t
+cipher_setiv (gcry_cipher_hd_t c, const byte *iv, size_t ivlen)
+{
+ /* If the cipher has its own IV handler, we use only this one. This
+ is currently used for stream ciphers requiring a nonce. */
+ if (c->spec->setiv)
+ {
+ c->spec->setiv (&c->context.c, iv, ivlen);
+ return 0;
+ }
+
+ memset (c->u_iv.iv, 0, c->spec->blocksize);
+ if (iv)
+ {
+ if (ivlen != c->spec->blocksize)
+ {
+ log_info ("WARNING: cipher_setiv: ivlen=%u blklen=%u\n",
+ (unsigned int)ivlen, (unsigned int)c->spec->blocksize);
+ fips_signal_error ("IV length does not match blocklength");
+ }
+ if (ivlen > c->spec->blocksize)
+ ivlen = c->spec->blocksize;
+ memcpy (c->u_iv.iv, iv, ivlen);
+ c->marks.iv = 1;
+ }
+ else
+ c->marks.iv = 0;
+ c->unused = 0;
+
+ return 0;
+}
+
+
+/* Reset the cipher context to the initial context. This is basically
+ the same as a release followed by a new open. */
+static void
+cipher_reset (gcry_cipher_hd_t c)
+{
+ unsigned int marks_key, marks_allow_weak_key;
+
+ marks_key = c->marks.key;
+ marks_allow_weak_key = c->marks.allow_weak_key;
+
+ memcpy (&c->context.c,
+ (char *) &c->context.c + c->spec->contextsize,
+ c->spec->contextsize);
+ memset (&c->marks, 0, sizeof c->marks);
+ memset (c->u_iv.iv, 0, c->spec->blocksize);
+ memset (c->lastiv, 0, c->spec->blocksize);
+ memset (c->u_ctr.ctr, 0, c->spec->blocksize);
+ c->unused = 0;
+
+ c->marks.key = marks_key;
+ c->marks.allow_weak_key = marks_allow_weak_key;
+
+ switch (c->mode)
+ {
+ case GCRY_CIPHER_MODE_CMAC:
+ _gcry_cmac_reset(&c->u_mode.cmac);
+ break;
+
+ case GCRY_CIPHER_MODE_EAX:
+ _gcry_cmac_reset(&c->u_mode.eax.cmac_header);
+ _gcry_cmac_reset(&c->u_mode.eax.cmac_ciphertext);
+ break;
+
+ case GCRY_CIPHER_MODE_GCM:
+ /* Only clear head of u_mode, keep ghash_key and gcm_table. */
+ {
+ byte *u_mode_pos = (void *)&c->u_mode;
+ byte *ghash_key_pos = c->u_mode.gcm.u_ghash_key.key;
+ size_t u_mode_head_length = ghash_key_pos - u_mode_pos;
+
+ memset (&c->u_mode, 0, u_mode_head_length);
+ }
+ break;
+
+ case GCRY_CIPHER_MODE_POLY1305:
+ memset (&c->u_mode.poly1305, 0, sizeof c->u_mode.poly1305);
+ break;
+
+ case GCRY_CIPHER_MODE_CCM:
+ memset (&c->u_mode.ccm, 0, sizeof c->u_mode.ccm);
+ break;
+
+ case GCRY_CIPHER_MODE_OCB:
+ /* Do not clear precalculated L-values */
+ {
+ byte *u_mode_head_pos = (void *)&c->u_mode.ocb;
+ byte *u_mode_tail_pos = (void *)&c->u_mode.ocb.tag;
+ size_t u_mode_head_length = u_mode_tail_pos - u_mode_head_pos;
+ size_t u_mode_tail_length = sizeof(c->u_mode.ocb) - u_mode_head_length;
+
+ memset (u_mode_tail_pos, 0, u_mode_tail_length);
+
+ /* Setup default taglen. */
+ c->u_mode.ocb.taglen = 16;
+ }
+ break;
+
+ case GCRY_CIPHER_MODE_XTS:
+ memcpy (c->u_mode.xts.tweak_context,
+ c->u_mode.xts.tweak_context + c->spec->contextsize,
+ c->spec->contextsize);
+ break;
+
+ default:
+ break; /* u_mode unused by other modes. */
+ }
+}
+
+
+
+static gcry_err_code_t
+do_ecb_crypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen,
+ gcry_cipher_encrypt_t crypt_fn)
+{
+ unsigned int blocksize = c->spec->blocksize;
+ size_t n, nblocks;
+ unsigned int burn, nburn;
+
+ if (outbuflen < inbuflen)
+ return GPG_ERR_BUFFER_TOO_SHORT;
+ if ((inbuflen % blocksize))
+ return GPG_ERR_INV_LENGTH;
+
+ nblocks = inbuflen / blocksize;
+ burn = 0;
+
+ for (n=0; n < nblocks; n++ )
+ {
+ nburn = crypt_fn (&c->context.c, outbuf, inbuf);
+ burn = nburn > burn ? nburn : burn;
+ inbuf += blocksize;
+ outbuf += blocksize;
+ }
+
+ if (burn > 0)
+ _gcry_burn_stack (burn + 4 * sizeof(void *));
+
+ return 0;
+}
+
+static gcry_err_code_t
+do_ecb_encrypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen)
+{
+ return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, c->spec->encrypt);
+}
+
+static gcry_err_code_t
+do_ecb_decrypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen)
+{
+ return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, c->spec->decrypt);
+}
+
+
+static gcry_err_code_t
+do_stream_encrypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen)
+{
+ (void)outbuflen;
+ c->spec->stencrypt (&c->context.c, outbuf, (void *)inbuf, inbuflen);
+ return 0;
+}
+
+static gcry_err_code_t
+do_stream_decrypt (gcry_cipher_hd_t c,
+ unsigned char *outbuf, size_t outbuflen,
+ const unsigned char *inbuf, size_t inbuflen)
+{
+ (void)outbuflen;
+ c->spec->stdecrypt (&c->context.c, outbuf, (void *)inbuf, inbuflen);
+ return 0;
+}
+
+
+static gcry_err_code_t
+do_encrypt_none_unknown (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen,
+ const byte *inbuf, size_t inbuflen)
+{
+ gcry_err_code_t rc;
+
+ (void)outbuflen;
+
+ switch (c->mode)
+ {
+ case GCRY_CIPHER_MODE_CMAC:
+ rc = GPG_ERR_INV_CIPHER_MODE;
+ break;
+
+ case GCRY_CIPHER_MODE_NONE:
+ if (fips_mode () || !_gcry_get_debug_flag (0))
+ {
+ fips_signal_error ("cipher mode NONE used");
+ rc = GPG_ERR_INV_CIPHER_MODE;
+ }
+ else
+ {
+ if (inbuf != outbuf)
+ memmove (outbuf, inbuf, inbuflen);
+ rc = 0;
+ }
+ break;
+
+ default:
+ log_fatal ("cipher_encrypt: invalid mode %d\n", c->mode );
+ rc = GPG_ERR_INV_CIPHER_MODE;
+ break;
+ }
+
+ return rc;
+}
+
+static gcry_err_code_t
+do_decrypt_none_unknown (gcry_cipher_hd_t c, byte *outbuf, size_t outbuflen,
+ const byte *inbuf, size_t inbuflen)
+{
+ gcry_err_code_t rc;
+
+ (void)outbuflen;
+
+ switch (c->mode)
+ {
+ case GCRY_CIPHER_MODE_CMAC:
+ rc = GPG_ERR_INV_CIPHER_MODE;
+ break;
+
+ case GCRY_CIPHER_MODE_NONE:
+ if (fips_mode () || !_gcry_get_debug_flag (0))
+ {
+ fips_signal_error ("cipher mode NONE used");
+ rc = GPG_ERR_INV_CIPHER_MODE;
+ }
+ else
+ {
+ if (inbuf != outbuf)
+ memmove (outbuf, inbuf, inbuflen);
+ rc = 0;
+ }
+ break;
+
+ default:
+ log_fatal ("cipher_decrypt: invalid mode %d\n", c->mode );
+ rc = GPG_ERR_INV_CIPHER_MODE;
+ break;
+ }
+
+ return rc;
+}
+
+
+/****************
+ * Encrypt IN and write it to OUT. If IN is NULL, in-place encryption has
+ * been requested.
+ */
+gcry_err_code_t
+_gcry_cipher_encrypt (gcry_cipher_hd_t h, void *out, size_t outsize,
+ const void *in, size_t inlen)
+{
+ gcry_err_code_t rc;
+
+ if (!in) /* Caller requested in-place encryption. */
+ {
+ in = out;
+ inlen = outsize;
+ }
+
+ if (h->mode != GCRY_CIPHER_MODE_NONE && !h->marks.key)
+ {
+ log_error ("cipher_encrypt: key not set\n");
+ return GPG_ERR_MISSING_KEY;
+ }
+
+ rc = h->mode_ops.encrypt (h, out, outsize, in, inlen);
+
+ /* Failsafe: Make sure that the plaintext will never make it into
+ OUT if the encryption returned an error. */
+ if (rc && out)
+ memset (out, 0x42, outsize);
+
+ return rc;
+}
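+
+/* Usage sketch (illustrative only): passing NULL for IN requests in-place
+   encryption, i.e. OUT already holds the plaintext and is overwritten
+   with the ciphertext.  */
+#if 0
+static gcry_err_code_t
+example_encrypt_in_place (gcry_cipher_hd_t hd, void *buf, size_t buflen)
+{
+  return _gcry_cipher_encrypt (hd, buf, buflen, NULL, 0);
+}
+#endif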
+
+
+/****************
+ * Decrypt IN and write it to OUT.  If IN is NULL, in-place decryption has
+ * been requested.
+ */
+gcry_err_code_t
+_gcry_cipher_decrypt (gcry_cipher_hd_t h, void *out, size_t outsize,
+ const void *in, size_t inlen)
+{
+  if (!in) /* Caller requested in-place decryption. */
+ {
+ in = out;
+ inlen = outsize;
+ }
+
+ if (h->mode != GCRY_CIPHER_MODE_NONE && !h->marks.key)
+ {
+ log_error ("cipher_decrypt: key not set\n");
+ return GPG_ERR_MISSING_KEY;
+ }
+
+ return h->mode_ops.decrypt (h, out, outsize, in, inlen);
+}
+
+
+/****************
+ * Used for PGP's somewhat strange CFB mode. Only works if
+ * the corresponding flag is set.
+ */
+static void
+cipher_sync (gcry_cipher_hd_t c)
+{
+ if ((c->flags & GCRY_CIPHER_ENABLE_SYNC) && c->unused)
+ {
+ memmove (c->u_iv.iv + c->unused,
+ c->u_iv.iv, c->spec->blocksize - c->unused);
+ memcpy (c->u_iv.iv,
+ c->lastiv + c->spec->blocksize - c->unused, c->unused);
+ c->unused = 0;
+ }
+}
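+
+/* Usage sketch (illustrative only): the sync operation is reached through
+   GCRYCTL_CFB_SYNC and only has an effect if the handle was opened with
+   the GCRY_CIPHER_ENABLE_SYNC flag.  */
+#if 0
+static gcry_err_code_t
+example_cfb_sync (gcry_cipher_hd_t hd)
+{
+  return _gcry_cipher_ctl (hd, GCRYCTL_CFB_SYNC, NULL, 0);
+}
+#endif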
+
+
+gcry_err_code_t
+_gcry_cipher_setkey (gcry_cipher_hd_t hd, const void *key, size_t keylen)
+{
+ return cipher_setkey (hd, (void*)key, keylen);
+}
+
+
+gcry_err_code_t
+_gcry_cipher_setiv (gcry_cipher_hd_t hd, const void *iv, size_t ivlen)
+{
+ return hd->mode_ops.setiv (hd, iv, ivlen);
+}
+
+
+/* Set counter for CTR mode. (CTR,CTRLEN) must denote a buffer of
+ block size length, or (NULL,0) to set the CTR to the all-zero
+ block. */
+gpg_err_code_t
+_gcry_cipher_setctr (gcry_cipher_hd_t hd, const void *ctr, size_t ctrlen)
+{
+ if (ctr && ctrlen == hd->spec->blocksize)
+ {
+ memcpy (hd->u_ctr.ctr, ctr, hd->spec->blocksize);
+ hd->unused = 0;
+ }
+ else if (!ctr || !ctrlen)
+ {
+ memset (hd->u_ctr.ctr, 0, hd->spec->blocksize);
+ hd->unused = 0;
+ }
+ else
+ return GPG_ERR_INV_ARG;
+
+ return 0;
+}
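+
+/* Usage sketch (illustrative only): the counter buffer must match the
+   block size of the algorithm; (NULL, 0) resets it to the all-zero
+   block.  For a 16 byte block cipher such as AES:  */
+#if 0
+static gpg_err_code_t
+example_set_ctr (gcry_cipher_hd_t hd, const unsigned char ctr[16])
+{
+  return _gcry_cipher_setctr (hd, ctr, 16);
+}
+#endif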
+
+gpg_err_code_t
+_gcry_cipher_getctr (gcry_cipher_hd_t hd, void *ctr, size_t ctrlen)
+{
+ if (ctr && ctrlen == hd->spec->blocksize)
+ memcpy (ctr, hd->u_ctr.ctr, hd->spec->blocksize);
+ else
+ return GPG_ERR_INV_ARG;
+
+ return 0;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_authenticate (gcry_cipher_hd_t hd, const void *abuf,
+ size_t abuflen)
+{
+ gcry_err_code_t rc;
+
+ if (hd->mode_ops.authenticate)
+ {
+ rc = hd->mode_ops.authenticate (hd, abuf, abuflen);
+ }
+ else
+ {
+ log_error ("gcry_cipher_authenticate: invalid mode %d\n", hd->mode);
+ rc = GPG_ERR_INV_CIPHER_MODE;
+ }
+
+ return rc;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_gettag (gcry_cipher_hd_t hd, void *outtag, size_t taglen)
+{
+ gcry_err_code_t rc;
+
+ if (hd->mode_ops.get_tag)
+ {
+ rc = hd->mode_ops.get_tag (hd, outtag, taglen);
+ }
+ else
+ {
+ log_error ("gcry_cipher_gettag: invalid mode %d\n", hd->mode);
+ rc = GPG_ERR_INV_CIPHER_MODE;
+ }
+
+ return rc;
+}
+
+
+gcry_err_code_t
+_gcry_cipher_checktag (gcry_cipher_hd_t hd, const void *intag, size_t taglen)
+{
+ gcry_err_code_t rc;
+
+ if (hd->mode_ops.check_tag)
+ {
+ rc = hd->mode_ops.check_tag (hd, intag, taglen);
+ }
+ else
+ {
+ log_error ("gcry_cipher_checktag: invalid mode %d\n", hd->mode);
+ rc = GPG_ERR_INV_CIPHER_MODE;
+ }
+
+ return rc;
+}
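+
+/* Usage sketch (illustrative only): a typical sequence for the AEAD modes
+   wired up below -- set the nonce, feed the additional data, encrypt in
+   place and finally fetch the tag.  (CCM additionally requires
+   GCRYCTL_SET_CCM_LENGTHS before any data is processed.)  */
+#if 0
+static gcry_err_code_t
+example_aead_seal (gcry_cipher_hd_t hd,
+                   const void *nonce, size_t noncelen,
+                   const void *aad, size_t aadlen,
+                   void *buf, size_t buflen,
+                   void *tag, size_t taglen)
+{
+  gcry_err_code_t rc;
+
+  rc = _gcry_cipher_setiv (hd, nonce, noncelen);
+  if (!rc)
+    rc = _gcry_cipher_authenticate (hd, aad, aadlen);
+  if (!rc)
+    rc = _gcry_cipher_encrypt (hd, buf, buflen, NULL, 0);
+  if (!rc)
+    rc = _gcry_cipher_gettag (hd, tag, taglen);
+  return rc;
+}
+#endif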
+
+
+
+static void
+_gcry_cipher_setup_mode_ops(gcry_cipher_hd_t c, int mode)
+{
+ /* Setup encryption and decryption routines. */
+ switch (mode)
+ {
+ case GCRY_CIPHER_MODE_STREAM:
+ c->mode_ops.encrypt = do_stream_encrypt;
+ c->mode_ops.decrypt = do_stream_decrypt;
+ break;
+
+ case GCRY_CIPHER_MODE_ECB:
+ c->mode_ops.encrypt = do_ecb_encrypt;
+ c->mode_ops.decrypt = do_ecb_decrypt;
+ break;
+
+ case GCRY_CIPHER_MODE_CBC:
+ if (!(c->flags & GCRY_CIPHER_CBC_CTS))
+ {
+ c->mode_ops.encrypt = _gcry_cipher_cbc_encrypt;
+ c->mode_ops.decrypt = _gcry_cipher_cbc_decrypt;
+ }
+ else
+ {
+ c->mode_ops.encrypt = _gcry_cipher_cbc_cts_encrypt;
+ c->mode_ops.decrypt = _gcry_cipher_cbc_cts_decrypt;
+ }
+ break;
+
+ case GCRY_CIPHER_MODE_CFB:
+ c->mode_ops.encrypt = _gcry_cipher_cfb_encrypt;
+ c->mode_ops.decrypt = _gcry_cipher_cfb_decrypt;
+ break;
+
+ case GCRY_CIPHER_MODE_CFB8:
+ c->mode_ops.encrypt = _gcry_cipher_cfb8_encrypt;
+ c->mode_ops.decrypt = _gcry_cipher_cfb8_decrypt;
+ break;
+
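+    /* OFB and CTR turn the block cipher into an XOR stream, hence the
+       same routine serves for both encryption and decryption.  */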
+ case GCRY_CIPHER_MODE_OFB:
+ c->mode_ops.encrypt = _gcry_cipher_ofb_encrypt;
+ c->mode_ops.decrypt = _gcry_cipher_ofb_encrypt;
+ break;
+
+ case GCRY_CIPHER_MODE_CTR:
+ c->mode_ops.encrypt = _gcry_cipher_ctr_encrypt;
+ c->mode_ops.decrypt = _gcry_cipher_ctr_encrypt;
+ break;
+
+ case GCRY_CIPHER_MODE_AESWRAP:
+ c->mode_ops.encrypt = _gcry_cipher_aeswrap_encrypt;
+ c->mode_ops.decrypt = _gcry_cipher_aeswrap_decrypt;
+ break;
+
+ case GCRY_CIPHER_MODE_CCM:
+ c->mode_ops.encrypt = _gcry_cipher_ccm_encrypt;
+ c->mode_ops.decrypt = _gcry_cipher_ccm_decrypt;
+ break;
+
+ case GCRY_CIPHER_MODE_EAX:
+ c->mode_ops.encrypt = _gcry_cipher_eax_encrypt;
+ c->mode_ops.decrypt = _gcry_cipher_eax_decrypt;
+ break;
+
+ case GCRY_CIPHER_MODE_GCM:
+ c->mode_ops.encrypt = _gcry_cipher_gcm_encrypt;
+ c->mode_ops.decrypt = _gcry_cipher_gcm_decrypt;
+ break;
+
+ case GCRY_CIPHER_MODE_POLY1305:
+ c->mode_ops.encrypt = _gcry_cipher_poly1305_encrypt;
+ c->mode_ops.decrypt = _gcry_cipher_poly1305_decrypt;
+ break;
+
+ case GCRY_CIPHER_MODE_OCB:
+ c->mode_ops.encrypt = _gcry_cipher_ocb_encrypt;
+ c->mode_ops.decrypt = _gcry_cipher_ocb_decrypt;
+ break;
+
+ case GCRY_CIPHER_MODE_XTS:
+ c->mode_ops.encrypt = _gcry_cipher_xts_encrypt;
+ c->mode_ops.decrypt = _gcry_cipher_xts_decrypt;
+ break;
+
+ default:
+ c->mode_ops.encrypt = do_encrypt_none_unknown;
+ c->mode_ops.decrypt = do_decrypt_none_unknown;
+ break;
+ }
+
+ /* Setup IV setting routine. */
+ switch (mode)
+ {
+ case GCRY_CIPHER_MODE_CCM:
+ c->mode_ops.setiv = _gcry_cipher_ccm_set_nonce;
+ break;
+
+ case GCRY_CIPHER_MODE_EAX:
+ c->mode_ops.setiv = _gcry_cipher_eax_set_nonce;
+ break;
+
+ case GCRY_CIPHER_MODE_GCM:
+ c->mode_ops.setiv = _gcry_cipher_gcm_setiv;
+ break;
+
+ case GCRY_CIPHER_MODE_POLY1305:
+ c->mode_ops.setiv = _gcry_cipher_poly1305_setiv;
+ break;
+
+ case GCRY_CIPHER_MODE_OCB:
+ c->mode_ops.setiv = _gcry_cipher_ocb_set_nonce;
+ break;
+
+ default:
+ c->mode_ops.setiv = cipher_setiv;
+ break;
+ }
+
+
+ /* Setup authentication routines for AEAD modes. */
+ switch (mode)
+ {
+ case GCRY_CIPHER_MODE_CCM:
+ c->mode_ops.authenticate = _gcry_cipher_ccm_authenticate;
+ c->mode_ops.get_tag = _gcry_cipher_ccm_get_tag;
+ c->mode_ops.check_tag = _gcry_cipher_ccm_check_tag;
+ break;
+
+ case GCRY_CIPHER_MODE_CMAC:
+ c->mode_ops.authenticate = _gcry_cipher_cmac_authenticate;
+ c->mode_ops.get_tag = _gcry_cipher_cmac_get_tag;
+ c->mode_ops.check_tag = _gcry_cipher_cmac_check_tag;
+ break;
+
+ case GCRY_CIPHER_MODE_EAX:
+ c->mode_ops.authenticate = _gcry_cipher_eax_authenticate;
+ c->mode_ops.get_tag = _gcry_cipher_eax_get_tag;
+ c->mode_ops.check_tag = _gcry_cipher_eax_check_tag;
+ break;
+
+ case GCRY_CIPHER_MODE_GCM:
+ c->mode_ops.authenticate = _gcry_cipher_gcm_authenticate;
+ c->mode_ops.get_tag = _gcry_cipher_gcm_get_tag;
+ c->mode_ops.check_tag = _gcry_cipher_gcm_check_tag;
+ break;
+
+ case GCRY_CIPHER_MODE_POLY1305:
+ c->mode_ops.authenticate = _gcry_cipher_poly1305_authenticate;
+ c->mode_ops.get_tag = _gcry_cipher_poly1305_get_tag;
+ c->mode_ops.check_tag = _gcry_cipher_poly1305_check_tag;
+ break;
+
+ case GCRY_CIPHER_MODE_OCB:
+ c->mode_ops.authenticate = _gcry_cipher_ocb_authenticate;
+ c->mode_ops.get_tag = _gcry_cipher_ocb_get_tag;
+ c->mode_ops.check_tag = _gcry_cipher_ocb_check_tag;
+ break;
+
+ default:
+ c->mode_ops.authenticate = NULL;
+ c->mode_ops.get_tag = NULL;
+ c->mode_ops.check_tag = NULL;
+ break;
+ }
+}
+
+
+gcry_err_code_t
+_gcry_cipher_ctl (gcry_cipher_hd_t h, int cmd, void *buffer, size_t buflen)
+{
+ gcry_err_code_t rc = 0;
+
+ switch (cmd)
+ {
+ case GCRYCTL_RESET:
+ cipher_reset (h);
+ break;
+
+ case GCRYCTL_FINALIZE:
+ if (!h || buffer || buflen)
+ return GPG_ERR_INV_ARG;
+ h->marks.finalize = 1;
+ break;
+
+ case GCRYCTL_CFB_SYNC:
+ cipher_sync( h );
+ break;
+
+    case GCRYCTL_SET_CBC_CTS:
+      if (buflen)
+        {
+          if (h->flags & GCRY_CIPHER_CBC_MAC)
+            rc = GPG_ERR_INV_FLAG;
+          else
+            h->flags |= GCRY_CIPHER_CBC_CTS;
+        }
+      else
+        h->flags &= ~GCRY_CIPHER_CBC_CTS;
+      break;
+
+    case GCRYCTL_SET_CBC_MAC:
+      if (buflen)
+        {
+          if (h->flags & GCRY_CIPHER_CBC_CTS)
+            rc = GPG_ERR_INV_FLAG;
+          else
+            h->flags |= GCRY_CIPHER_CBC_MAC;
+        }
+      else
+        h->flags &= ~GCRY_CIPHER_CBC_MAC;
+      break;
+
+ case GCRYCTL_SET_CCM_LENGTHS:
+ {
+ u64 params[3];
+ size_t encryptedlen;
+ size_t aadlen;
+ size_t authtaglen;
+
+ if (h->mode != GCRY_CIPHER_MODE_CCM)
+ return GPG_ERR_INV_CIPHER_MODE;
+
+ if (!buffer || buflen != 3 * sizeof(u64))
+ return GPG_ERR_INV_ARG;
+
+ /* This command is used to pass additional length parameters needed
+ by CCM mode to initialize CBC-MAC. */
+ memcpy (params, buffer, sizeof(params));
+ encryptedlen = params[0];
+ aadlen = params[1];
+ authtaglen = params[2];
+
+ rc = _gcry_cipher_ccm_set_lengths (h, encryptedlen, aadlen, authtaglen);
+ }
+ break;
+
+ case GCRYCTL_SET_TAGLEN:
+ if (!h || !buffer || buflen != sizeof(int) )
+ return GPG_ERR_INV_ARG;
+ switch (h->mode)
+ {
+ case GCRY_CIPHER_MODE_OCB:
+ switch (*(int*)buffer)
+ {
+ case 8: case 12: case 16:
+ h->u_mode.ocb.taglen = *(int*)buffer;
+ break;
+ default:
+ rc = GPG_ERR_INV_LENGTH; /* Invalid tag length. */
+ break;
+ }
+ break;
+
+ default:
+          rc = GPG_ERR_INV_CIPHER_MODE;
+ break;
+ }
+ break;
+
+ case GCRYCTL_DISABLE_ALGO:
+ /* This command expects NULL for H and BUFFER to point to an
+ integer with the algo number. */
+ if( h || !buffer || buflen != sizeof(int) )
+ return GPG_ERR_CIPHER_ALGO;
+ disable_cipher_algo( *(int*)buffer );
+ break;
+
+ case PRIV_CIPHERCTL_DISABLE_WEAK_KEY: /* (private) */
+ if (h->spec->set_extra_info)
+ rc = h->spec->set_extra_info
+ (&h->context.c, CIPHER_INFO_NO_WEAK_KEY, NULL, 0);
+ else
+ rc = GPG_ERR_NOT_SUPPORTED;
+ break;
+
+ case PRIV_CIPHERCTL_GET_INPUT_VECTOR: /* (private) */
+ /* This is the input block as used in CFB and OFB mode which has
+ initially been set as IV. The returned format is:
+ 1 byte Actual length of the block in bytes.
+ n byte The block.
+ If the provided buffer is too short, an error is returned. */
+ if (buflen < (1 + h->spec->blocksize))
+ rc = GPG_ERR_TOO_SHORT;
+ else
+ {
+ unsigned char *ivp;
+ unsigned char *dst = buffer;
+ int n = h->unused;
+
+ if (!n)
+ n = h->spec->blocksize;
+ gcry_assert (n <= h->spec->blocksize);
+ *dst++ = n;
+ ivp = h->u_iv.iv + h->spec->blocksize - n;
+ while (n--)
+ *dst++ = *ivp++;
+ }
+ break;
+
+ case GCRYCTL_SET_SBOX:
+ if (h->spec->set_extra_info)
+ rc = h->spec->set_extra_info
+ (&h->context.c, GCRYCTL_SET_SBOX, buffer, buflen);
+ else
+ rc = GPG_ERR_NOT_SUPPORTED;
+ break;
+
+ case GCRYCTL_SET_ALLOW_WEAK_KEY:
+      /* Expecting BUFFER to be NULL and BUFLEN to be the on/off flag
+         (0 or 1).  */
+ if (!h || buffer || buflen > 1)
+ return GPG_ERR_CIPHER_ALGO;
+ h->marks.allow_weak_key = buflen ? 1 : 0;
+ break;
+
+ default:
+ rc = GPG_ERR_INV_OP;
+ }
+
+ return rc;
+}
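+
+/* Usage sketch (illustrative only): GCRYCTL_SET_CCM_LENGTHS expects a
+   buffer with three u64 values -- the length of the data to be encrypted,
+   the length of the additional data and the length of the tag.  */
+#if 0
+static gcry_err_code_t
+example_set_ccm_lengths (gcry_cipher_hd_t hd, u64 encryptedlen, u64 aadlen,
+                         u64 authtaglen)
+{
+  u64 params[3];
+
+  params[0] = encryptedlen;
+  params[1] = aadlen;
+  params[2] = authtaglen;
+  return _gcry_cipher_ctl (hd, GCRYCTL_SET_CCM_LENGTHS,
+                           params, sizeof params);
+}
+#endif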
+
+
+/* Return information about the cipher handle H. CMD is the kind of
+ * information requested.
+ *
+ * CMD may be one of:
+ *
+ * GCRYCTL_GET_TAGLEN:
+ * Return the length of the tag for an AE algorithm mode. An
+ * error is returned for modes which do not support a tag.
+ * BUFFER must be given as NULL. On success the result is stored
+ * at NBYTES. The taglen is returned in bytes.
+ *
+ * The function returns 0 on success or an error code.
+ */
+gcry_err_code_t
+_gcry_cipher_info (gcry_cipher_hd_t h, int cmd, void *buffer, size_t *nbytes)
+{
+ gcry_err_code_t rc = 0;
+
+ switch (cmd)
+ {
+ case GCRYCTL_GET_TAGLEN:
+ if (!h || buffer || !nbytes)
+ rc = GPG_ERR_INV_ARG;
+ else
+ {
+ switch (h->mode)
+ {
+ case GCRY_CIPHER_MODE_OCB:
+ *nbytes = h->u_mode.ocb.taglen;
+ break;
+
+ case GCRY_CIPHER_MODE_CCM:
+ *nbytes = h->u_mode.ccm.authlen;
+ break;
+
+ case GCRY_CIPHER_MODE_EAX:
+ *nbytes = h->spec->blocksize;
+ break;
+
+ case GCRY_CIPHER_MODE_GCM:
+ *nbytes = GCRY_GCM_BLOCK_LEN;
+ break;
+
+ case GCRY_CIPHER_MODE_POLY1305:
+ *nbytes = POLY1305_TAGLEN;
+ break;
+
+ default:
+ rc = GPG_ERR_INV_CIPHER_MODE;
+ break;
+ }
+ }
+ break;
+
+ default:
+ rc = GPG_ERR_INV_OP;
+ }
+
+ return rc;
+}
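+
+/* Usage sketch (illustrative only): query the tag length of an AEAD
+   handle; BUFFER must be NULL and the length is returned through
+   NBYTES.  */
+#if 0
+static gcry_err_code_t
+example_get_taglen (gcry_cipher_hd_t hd, size_t *r_taglen)
+{
+  return _gcry_cipher_info (hd, GCRYCTL_GET_TAGLEN, NULL, r_taglen);
+}
+#endif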
+
+/* Return information about the given cipher algorithm ALGO.
+
+   WHAT selects the kind of information returned:
+
+ GCRYCTL_GET_KEYLEN:
+ Return the length of the key. If the algorithm ALGO
+ supports multiple key lengths, the maximum supported key length
+ is returned. The key length is returned as number of octets.
+ BUFFER and NBYTES must be zero.
+
+ GCRYCTL_GET_BLKLEN:
+ Return the blocklength of the algorithm ALGO counted in octets.
+ BUFFER and NBYTES must be zero.
+
+ GCRYCTL_TEST_ALGO:
+ Returns 0 if the specified algorithm ALGO is available for use.
+ BUFFER and NBYTES must be zero.
+
+   Note: Because this function is in most cases used to return an
+   integer value, we can make it easier for the caller to just look at
+   the return value.  The caller will in all cases consult the value
+   and thereby detect whether an error occurred or not (e.g. while
+   checking the block size).
+ */
+gcry_err_code_t
+_gcry_cipher_algo_info (int algo, int what, void *buffer, size_t *nbytes)
+{
+ gcry_err_code_t rc = 0;
+ unsigned int ui;
+
+ switch (what)
+ {
+ case GCRYCTL_GET_KEYLEN:
+ if (buffer || (! nbytes))
+ rc = GPG_ERR_CIPHER_ALGO;
+ else
+ {
+ ui = cipher_get_keylen (algo);
+ if ((ui > 0) && (ui <= 512))
+ *nbytes = (size_t) ui / 8;
+ else
+ /* The only reason for an error is an invalid algo. */
+ rc = GPG_ERR_CIPHER_ALGO;
+ }
+ break;
+
+ case GCRYCTL_GET_BLKLEN:
+ if (buffer || (! nbytes))
+ rc = GPG_ERR_CIPHER_ALGO;
+ else
+ {
+ ui = cipher_get_blocksize (algo);
+ if ((ui > 0) && (ui < 10000))
+ *nbytes = ui;
+ else
+ {
+ /* The only reason is an invalid algo or a strange
+ blocksize. */
+ rc = GPG_ERR_CIPHER_ALGO;
+ }
+ }
+ break;
+
+ case GCRYCTL_TEST_ALGO:
+ if (buffer || nbytes)
+ rc = GPG_ERR_INV_ARG;
+ else
+ rc = check_cipher_algo (algo);
+ break;
+
+ default:
+ rc = GPG_ERR_INV_OP;
+ }
+
+ return rc;
+}
+
+
+/* This function returns the length of the key for algorithm ALGO.  If the
+ algorithm supports multiple key lengths, the maximum supported key
+ length is returned. On error 0 is returned. The key length is
+ returned as number of octets.
+
+   This is a convenience function which should be preferred over
+ gcry_cipher_algo_info because it allows for proper type
+ checking. */
+size_t
+_gcry_cipher_get_algo_keylen (int algo)
+{
+ size_t n;
+
+ if (_gcry_cipher_algo_info (algo, GCRYCTL_GET_KEYLEN, NULL, &n))
+ n = 0;
+ return n;
+}
+
+
+/* This function returns the blocklength of the algorithm ALGO
+ counted in octets. On error 0 is returned.
+
+   This is a convenience function which should be preferred over
+ gcry_cipher_algo_info because it allows for proper type
+ checking. */
+size_t
+_gcry_cipher_get_algo_blklen (int algo)
+{
+ size_t n;
+
+ if (_gcry_cipher_algo_info( algo, GCRYCTL_GET_BLKLEN, NULL, &n))
+ n = 0;
+ return n;
+}
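+
+/* Illustrative example only: for AES-256 the convenience wrappers return
+   a key length of 32 octets and a block length of 16 octets.  */
+#if 0
+static void
+example_query_aes256 (void)
+{
+  size_t keylen = _gcry_cipher_get_algo_keylen (GCRY_CIPHER_AES256); /* 32 */
+  size_t blklen = _gcry_cipher_get_algo_blklen (GCRY_CIPHER_AES256); /* 16 */
+
+  (void)keylen;
+  (void)blklen;
+}
+#endif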
+
+
+/* Explicitly initialize this module. */
+gcry_err_code_t
+_gcry_cipher_init (void)
+{
+ if (fips_mode())
+ {
+      /* Disable algorithms that are disallowed in FIPS mode.  */
+ int idx;
+ gcry_cipher_spec_t *spec;
+
+ for (idx = 0; (spec = cipher_list[idx]); idx++)
+ if (!spec->flags.fips)
+ spec->flags.disabled = 1;
+ }
+
+ return 0;
+}
+
+
+/* Run the selftests for cipher algorithm ALGO with optional reporting
+ function REPORT. */
+gpg_error_t
+_gcry_cipher_selftest (int algo, int extended, selftest_report_func_t report)
+{
+ gcry_err_code_t ec = 0;
+ gcry_cipher_spec_t *spec;
+
+ spec = spec_from_algo (algo);
+ if (spec && !spec->flags.disabled && spec->selftest)
+ ec = spec->selftest (algo, extended, report);
+ else
+ {
+ ec = GPG_ERR_CIPHER_ALGO;
+ if (report)
+ report ("cipher", algo, "module",
+ (spec && !spec->flags.disabled)?
+ "no selftest available" :
+ spec? "algorithm disabled" : "algorithm not found");
+ }
+
+ return gpg_error (ec);
+}
diff --git a/comm/third_party/libgcrypt/cipher/crc-armv8-aarch64-ce.S b/comm/third_party/libgcrypt/cipher/crc-armv8-aarch64-ce.S
new file mode 100644
index 0000000000..060abdfe9a
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/crc-armv8-aarch64-ce.S
@@ -0,0 +1,497 @@
+/* crc-armv8-aarch64-ce.S - ARMv8/CE PMULL accelerated CRC implementation
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+
+.cpu generic+simd+crypto
+
+.text
+
+
+/* Structure of crc32_consts_s */
+
+#define consts_k(idx) ((idx) * 8)
+#define consts_my_p(idx) (consts_k(6) + (idx) * 8)
+
+/* Constants */
+
+.align 6
+.Lcrc32_constants:
+.Lcrc32_partial_fold_input_mask:
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+.Lcrc32_refl_shuf_shift:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+.Lcrc32_shuf_shift:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+.Lcrc32_bswap_shuf:
+ .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
+ .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
+
+/*
+ * void _gcry_crc32r_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
+ * const struct crc32_consts_s *consts);
+ */
+.align 3
+.globl _gcry_crc32r_armv8_ce_bulk
+ELF(.type _gcry_crc32r_armv8_ce_bulk,%function;)
+_gcry_crc32r_armv8_ce_bulk:
+ /* input:
+ * x0: pcrc
+ * x1: inbuf
+ * x2: inlen
+ * x3: consts
+ */
+ CFI_STARTPROC()
+
+ GET_DATA_POINTER(x7, .Lcrc32_constants)
+ add x9, x3, #consts_k(5 - 1)
+ cmp x2, #128
+
+ b.lo .Lcrc32r_fold_by_one_setup
+
+ eor v4.16b, v4.16b, v4.16b
+ add x4, x3, #consts_k(1 - 1)
+ ld1 {v4.s}[0], [x0] /* load pcrc */
+ ld1 {v0.16b-v3.16b}, [x1], #64 /* load 64 bytes of input */
+ sub x2, x2, #64
+ ld1 {v6.16b}, [x4]
+ eor v0.16b, v0.16b, v4.16b
+
+ add x4, x3, #consts_k(3 - 1)
+ add x5, x3, #consts_my_p(0)
+
+.Lcrc32r_fold_by_four:
+
+ /* Fold by 4. */
+ ld1 {v16.16b-v19.16b}, [x1], #64 /* load 64 bytes of input */
+ sub x2, x2, #64
+ pmull v20.1q, v0.1d, v6.1d
+ pmull v21.1q, v1.1d, v6.1d
+ pmull v22.1q, v2.1d, v6.1d
+ pmull v23.1q, v3.1d, v6.1d
+ cmp x2, #64
+ pmull2 v24.1q, v0.2d, v6.2d
+ pmull2 v25.1q, v1.2d, v6.2d
+ pmull2 v26.1q, v2.2d, v6.2d
+ pmull2 v27.1q, v3.2d, v6.2d
+ eor v0.16b, v20.16b, v16.16b
+ eor v1.16b, v21.16b, v17.16b
+ eor v2.16b, v22.16b, v18.16b
+ eor v3.16b, v23.16b, v19.16b
+ eor v0.16b, v0.16b, v24.16b
+ eor v1.16b, v1.16b, v25.16b
+ eor v2.16b, v2.16b, v26.16b
+ eor v3.16b, v3.16b, v27.16b
+ b.hs .Lcrc32r_fold_by_four
+
+ ld1 {v6.16b}, [x4]
+ ld1 {v5.16b}, [x5]
+
+ cmp x2, #16
+
+ /* Fold 4 to 1. */
+
+ pmull v16.1q, v0.1d, v6.1d
+ pmull2 v4.1q, v0.2d, v6.2d
+ eor v0.16b, v16.16b, v1.16b
+ eor v0.16b, v0.16b, v4.16b
+
+ pmull v16.1q, v0.1d, v6.1d
+ pmull2 v4.1q, v0.2d, v6.2d
+ eor v0.16b, v16.16b, v2.16b
+ eor v0.16b, v0.16b, v4.16b
+
+ pmull v16.1q, v0.1d, v6.1d
+ pmull2 v4.1q, v0.2d, v6.2d
+ eor v0.16b, v16.16b, v3.16b
+ eor v0.16b, v0.16b, v4.16b
+
+ b.lo .Lcrc32r_fold_by_one_done
+ b .Lcrc32r_fold_by_one
+
+.Lcrc32r_fold_by_one_setup:
+
+ eor v1.16b, v1.16b, v1.16b
+ add x4, x3, #consts_k(3 - 1)
+ add x5, x3, #consts_my_p(0)
+ sub x2, x2, #16
+ ld1 {v1.s}[0], [x0] /* load pcrc */
+ ld1 {v0.16b}, [x1], #16 /* load 16 bytes of input */
+ cmp x2, #16
+ ld1 {v6.16b}, [x4] /* load k3k4 */
+ ld1 {v5.16b}, [x5] /* load my_p */
+ eor v0.16b, v0.16b, v1.16b
+ b.lo .Lcrc32r_fold_by_one_done
+
+.Lcrc32r_fold_by_one:
+ sub x2, x2, #16
+ ld1 {v2.16b}, [x1], #16 /* load 16 bytes of input */
+ pmull v3.1q, v0.1d, v6.1d
+ pmull2 v1.1q, v0.2d, v6.2d
+ cmp x2, #16
+ eor v0.16b, v3.16b, v2.16b
+ eor v0.16b, v0.16b, v1.16b
+
+ b.hs .Lcrc32r_fold_by_one
+
+.Lcrc32r_fold_by_one_done:
+
+ cmp x2, #0
+ b.eq .Lcrc32r_final_fold
+
+ /* Partial fold. */
+
+ add x4, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants
+ add x5, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants + 16
+ add x6, x7, #.Lcrc32_partial_fold_input_mask - .Lcrc32_constants
+ sub x8, x2, #16
+ add x4, x4, x2
+ add x5, x5, x2
+ add x6, x6, x2
+ add x8, x1, x8
+
+ /* Load last input and add padding zeros. */
+ ld1 {v4.16b}, [x4]
+ eor x2, x2, x2
+ ld1 {v3.16b}, [x5]
+ ld1 {v2.16b}, [x6]
+ tbl v30.16b, {v0.16b}, v4.16b
+ ld1 {v4.16b}, [x8]
+ tbl v1.16b, {v0.16b}, v3.16b
+
+ pmull v0.1q, v30.1d, v6.1d
+ and v2.16b, v2.16b, v4.16b
+ pmull2 v31.1q, v30.2d, v6.2d
+ orr v2.16b, v2.16b, v1.16b
+ eor v0.16b, v0.16b, v31.16b
+ eor v0.16b, v0.16b, v2.16b
+
+.Lcrc32r_final_fold:
+
+ /* Final fold. */
+
+ eor v2.16b, v2.16b, v2.16b /* zero reg */
+ ld1 {v7.16b}, [x9]
+
+ /* reduce 128-bits to 96-bits */
+ ext v6.16b, v6.16b, v6.16b, #8 /* swap high and low parts */
+ mov v1.16b, v0.16b
+ pmull v0.1q, v0.1d, v6.1d
+ ext v6.16b, v5.16b, v5.16b, #8 /* swap high and low parts */
+ ext v1.16b, v1.16b, v2.16b, #8 /* high to low, high zeroed */
+ eor v3.16b, v0.16b, v1.16b
+
+ /* reduce 96-bits to 64-bits */
+ eor v1.16b, v1.16b, v1.16b
+ ext v0.16b, v3.16b, v2.16b, #4 /* [00][00][x2][x1] */
+ mov v1.s[0], v3.s[0] /* [00][00][00][x0] */
+ eor v3.16b, v3.16b, v3.16b
+ pmull v1.1q, v1.1d, v7.1d /* [00][00][xx][xx] */
+ eor v0.16b, v0.16b, v1.16b /* top 64-bit are zero */
+
+ /* barrett reduction */
+ mov v3.s[1], v0.s[0] /* [00][00][x1][00] */
+ ext v0.16b, v2.16b, v0.16b, #12 /* [??][x1][??][00] */
+ pmull v1.1q, v3.1d, v5.1d /* [00][xx][xx][00] */
+ pmull v1.1q, v1.1d, v6.1d /* [00][xx][xx][00] */
+ eor v0.16b, v0.16b, v1.16b
+
+ /* store CRC */
+ st1 {v0.s}[2], [x0]
+
+ ret
+ CFI_ENDPROC()
+ELF(.size _gcry_crc32r_armv8_ce_bulk,.-_gcry_crc32r_armv8_ce_bulk;)
+
+/*
+ * u32 _gcry_crc32r_armv8_ce_reduction_4 (u32 data, u32 crc,
+ *                                        const struct crc32_consts_s *consts);
+ */
+.align 3
+.globl _gcry_crc32r_armv8_ce_reduction_4
+ELF(.type _gcry_crc32r_armv8_ce_reduction_4,%function;)
+_gcry_crc32r_armv8_ce_reduction_4:
+ /* input:
+ * w0: data
+ * w1: crc
+ * x2: crc32 constants
+ */
+ CFI_STARTPROC()
+
+ eor v0.16b, v0.16b, v0.16b
+ add x2, x2, #consts_my_p(0)
+ eor v1.16b, v1.16b, v1.16b
+ ld1 {v5.16b}, [x2]
+
+ mov v0.s[0], w0
+ pmull v0.1q, v0.1d, v5.1d /* [00][00][xx][xx] */
+ mov v1.s[1], w1
+ mov v0.s[2], v0.s[0] /* [00][x0][x1][x0] */
+ pmull2 v0.1q, v0.2d, v5.2d /* [00][00][xx][xx] */
+ eor v0.16b, v0.16b, v1.16b
+
+ mov w0, v0.s[1]
+
+ ret
+ CFI_ENDPROC()
+ELF(.size _gcry_crc32r_armv8_ce_reduction_4,.-_gcry_crc32r_armv8_ce_reduction_4;)
+
+/*
+ * void _gcry_crc32_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
+ * const struct crc32_consts_s *consts);
+ */
+.align 3
+.globl _gcry_crc32_armv8_ce_bulk
+ELF(.type _gcry_crc32_armv8_ce_bulk,%function;)
+_gcry_crc32_armv8_ce_bulk:
+ /* input:
+ * x0: pcrc
+ * x1: inbuf
+ * x2: inlen
+ * x3: consts
+ */
+ CFI_STARTPROC()
+
+ GET_DATA_POINTER(x7, .Lcrc32_constants)
+ add x4, x7, #.Lcrc32_bswap_shuf - .Lcrc32_constants
+ cmp x2, #128
+ ld1 {v7.16b}, [x4]
+
+ b.lo .Lcrc32_fold_by_one_setup
+
+ eor v4.16b, v4.16b, v4.16b
+ add x4, x3, #consts_k(1 - 1)
+ ld1 {v4.s}[0], [x0] /* load pcrc */
+ ld1 {v0.16b-v3.16b}, [x1], #64 /* load 64 bytes of input */
+ sub x2, x2, #64
+ ld1 {v6.16b}, [x4]
+ eor v0.16b, v0.16b, v4.16b
+ ext v4.16b, v6.16b, v6.16b, #8
+ tbl v0.16b, { v0.16b }, v7.16b /* byte swap */
+ tbl v1.16b, { v1.16b }, v7.16b /* byte swap */
+ tbl v2.16b, { v2.16b }, v7.16b /* byte swap */
+ tbl v3.16b, { v3.16b }, v7.16b /* byte swap */
+
+ add x4, x3, #consts_k(3 - 1)
+ add x5, x3, #consts_my_p(0)
+
+.Lcrc32_fold_by_four:
+
+ /* Fold by 4. */
+ ld1 {v16.16b-v19.16b}, [x1], #64 /* load 64 bytes of input */
+ sub x2, x2, #64
+ tbl v16.16b, { v16.16b }, v7.16b /* byte swap */
+ tbl v17.16b, { v17.16b }, v7.16b /* byte swap */
+ tbl v18.16b, { v18.16b }, v7.16b /* byte swap */
+ tbl v19.16b, { v19.16b }, v7.16b /* byte swap */
+ cmp x2, #64
+ pmull2 v20.1q, v0.2d, v4.2d
+ pmull2 v21.1q, v1.2d, v4.2d
+ pmull2 v22.1q, v2.2d, v4.2d
+ pmull2 v23.1q, v3.2d, v4.2d
+ pmull v24.1q, v0.1d, v4.1d
+ pmull v25.1q, v1.1d, v4.1d
+ pmull v26.1q, v2.1d, v4.1d
+ pmull v27.1q, v3.1d, v4.1d
+ eor v0.16b, v20.16b, v16.16b
+ eor v1.16b, v21.16b, v17.16b
+ eor v2.16b, v22.16b, v18.16b
+ eor v3.16b, v23.16b, v19.16b
+ eor v0.16b, v0.16b, v24.16b
+ eor v1.16b, v1.16b, v25.16b
+ eor v2.16b, v2.16b, v26.16b
+ eor v3.16b, v3.16b, v27.16b
+ b.hs .Lcrc32_fold_by_four
+
+ ld1 {v6.16b}, [x4]
+ ld1 {v5.16b}, [x5]
+ ext v6.16b, v6.16b, v6.16b, #8
+ ext v5.16b, v5.16b, v5.16b, #8
+
+ cmp x2, #16
+
+ /* Fold 4 to 1. */
+
+ pmull2 v16.1q, v0.2d, v6.2d
+ pmull v4.1q, v0.1d, v6.1d
+ eor v0.16b, v16.16b, v1.16b
+ eor v0.16b, v0.16b, v4.16b
+
+ pmull2 v16.1q, v0.2d, v6.2d
+ pmull v4.1q, v0.1d, v6.1d
+ eor v0.16b, v16.16b, v2.16b
+ eor v0.16b, v0.16b, v4.16b
+
+ pmull2 v16.1q, v0.2d, v6.2d
+ pmull v4.1q, v0.1d, v6.1d
+ eor v0.16b, v16.16b, v3.16b
+ eor v0.16b, v0.16b, v4.16b
+
+ b.lo .Lcrc32_fold_by_one_done
+ b .Lcrc32_fold_by_one
+
+.Lcrc32_fold_by_one_setup:
+
+ eor v1.16b, v1.16b, v1.16b
+ add x4, x3, #consts_k(3 - 1)
+ add x5, x3, #consts_my_p(0)
+ ld1 {v1.s}[0], [x0] /* load pcrc */
+ sub x2, x2, #16
+ ld1 {v0.16b}, [x1], #16 /* load 16 bytes of input */
+ ld1 {v6.16b}, [x4] /* load k3k4 */
+ ld1 {v5.16b}, [x5] /* load my_p */
+ eor v0.16b, v0.16b, v1.16b
+ cmp x2, #16
+ ext v6.16b, v6.16b, v6.16b, #8 /* swap high and low parts */
+ ext v5.16b, v5.16b, v5.16b, #8 /* swap high and low parts */
+ tbl v0.16b, { v0.16b }, v7.16b /* byte swap */
+ b.lo .Lcrc32_fold_by_one_done
+
+.Lcrc32_fold_by_one:
+ sub x2, x2, #16
+ ld1 {v2.16b}, [x1], #16 /* load 16 bytes of input */
+ pmull2 v3.1q, v0.2d, v6.2d
+ tbl v2.16b, { v2.16b }, v7.16b /* byte swap */
+ pmull v1.1q, v0.1d, v6.1d
+ cmp x2, #16
+ eor v0.16b, v3.16b, v2.16b
+ eor v0.16b, v0.16b, v1.16b
+
+ b.hs .Lcrc32_fold_by_one
+
+.Lcrc32_fold_by_one_done:
+
+ cmp x2, #0
+ b.eq .Lcrc32_final_fold
+
+ /* Partial fold. */
+
+ add x4, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants + 32
+ add x5, x7, #.Lcrc32_shuf_shift - .Lcrc32_constants + 16
+ add x6, x7, #.Lcrc32_partial_fold_input_mask - .Lcrc32_constants
+ sub x8, x2, #16
+ sub x4, x4, x2
+ add x5, x5, x2
+ add x6, x6, x2
+ add x8, x1, x8
+
+ /* Load last input and add padding zeros. */
+ ld1 {v4.16b}, [x4]
+ eor x2, x2, x2
+ ld1 {v3.16b}, [x5]
+ ld1 {v2.16b}, [x6]
+ tbl v30.16b, {v0.16b}, v4.16b
+ ld1 {v4.16b}, [x8]
+ tbl v1.16b, {v0.16b}, v3.16b
+ and v2.16b, v2.16b, v4.16b
+
+ pmull2 v0.1q, v30.2d, v6.2d
+ orr v2.16b, v2.16b, v1.16b
+ pmull v1.1q, v30.1d, v6.1d
+ tbl v2.16b, {v2.16b}, v7.16b /* byte swap */
+ eor v0.16b, v0.16b, v1.16b
+ eor v0.16b, v0.16b, v2.16b
+
+.Lcrc32_final_fold:
+
+ /* Final fold. */
+
+ eor v2.16b, v2.16b, v2.16b /* zero reg */
+
+ /* reduce 128-bits to 96-bits */
+ add x4, x3, #consts_k(4)
+ ext v3.16b, v6.16b, v6.16b, #8 /* swap high and low parts */
+ eor v6.16b, v6.16b, v6.16b
+ mov v1.16b, v0.16b
+ pmull2 v0.1q, v0.2d, v3.2d
+ ld1 {v6.d}[1], [x4] /* load k4 */
+ ext v1.16b, v2.16b, v1.16b, #8 /* low to high, low zeroed */
+ eor v3.16b, v0.16b, v1.16b /* bottom 32-bit are zero */
+
+ /* reduce 96-bits to 64-bits */
+ eor v0.16b, v0.16b, v0.16b
+ eor v1.16b, v1.16b, v1.16b
+ mov v0.s[1], v3.s[1] /* [00][00][x1][00] */
+ mov v1.s[2], v3.s[3] /* [00][x3][00][00] */
+ mov v0.s[2], v3.s[2] /* [00][x2][x1][00] */
+ eor v3.16b, v3.16b, v3.16b
+ pmull2 v1.1q, v1.2d, v6.2d /* [00][xx][xx][00] */
+ eor v0.16b, v0.16b, v1.16b /* top and bottom 32-bit are zero */
+
+ /* barrett reduction */
+ mov v3.s[0], v0.s[1] /* [00][00][00][x1] */
+ pmull2 v0.1q, v0.2d, v5.2d /* [00][xx][xx][xx] */
+ ext v0.16b, v0.16b, v2.16b, #4 /* [00][00][xx][xx] */
+ pmull v0.1q, v0.1d, v5.1d
+ eor v0.16b, v0.16b, v3.16b
+
+ /* store CRC in input endian */
+ rev32 v0.8b, v0.8b /* byte swap */
+ st1 {v0.s}[0], [x0]
+
+ ret
+ CFI_ENDPROC()
+ELF(.size _gcry_crc32_armv8_ce_bulk,.-_gcry_crc32_armv8_ce_bulk;)
+
+/*
+ * u32 _gcry_crc32_armv8_ce_reduction_4 (u32 data, u32 crc,
+ *                                       const struct crc32_consts_s *consts);
+ */
+.align 3
+.globl _gcry_crc32_armv8_ce_reduction_4
+ELF(.type _gcry_crc32_armv8_ce_reduction_4,%function;)
+_gcry_crc32_armv8_ce_reduction_4:
+ /* input:
+ * w0: data
+ * w1: crc
+ * x2: crc32 constants
+ */
+ CFI_STARTPROC()
+
+ eor v0.16b, v0.16b, v0.16b
+ add x2, x2, #consts_my_p(0)
+ eor v1.16b, v1.16b, v1.16b
+ ld1 {v5.16b}, [x2]
+
+ mov v0.s[1], w0
+ pmull v0.1q, v0.1d, v5.1d /* [00][xx][xx][00] */
+ mov v1.s[0], w1
+ pmull2 v0.1q, v0.2d, v5.2d /* [00][00][xx][xx] */
+ eor v0.16b, v0.16b, v1.16b
+
+ rev32 v0.8b, v0.8b /* Return in input endian */
+ mov w0, v0.s[0]
+
+ ret
+ CFI_ENDPROC()
+ELF(.size _gcry_crc32_armv8_ce_reduction_4,.-_gcry_crc32_armv8_ce_reduction_4;)
+
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/crc-armv8-ce.c b/comm/third_party/libgcrypt/cipher/crc-armv8-ce.c
new file mode 100644
index 0000000000..17e5554821
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/crc-armv8-ce.c
@@ -0,0 +1,229 @@
+/* crc-armv8-ce.c - ARMv8-CE PMULL accelerated CRC implementation
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+
+#include "bithelp.h"
+#include "bufhelp.h"
+
+
+#if defined(ENABLE_ARM_CRYPTO_SUPPORT) && defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+
+
+#define ALIGNED_16 __attribute__ ((aligned (16)))
+
+
+struct u16_unaligned_s
+{
+ u16 a;
+} __attribute__((packed, aligned (1), may_alias));
+
+struct u32_unaligned_s
+{
+ u32 a;
+} __attribute__((packed, aligned (1), may_alias));
+
+
+/* Constants structure for generic reflected/non-reflected CRC32 PMULL
+ * functions. */
+struct crc32_consts_s
+{
+ /* k: { x^(32*17), x^(32*15), x^(32*5), x^(32*3), x^(32*2), 0 } mod P(x) */
+ u64 k[6];
+ /* my_p: { floor(x^64 / P(x)), P(x) } */
+ u64 my_p[2];
+};
+
+/* PMULL constants for CRC32 and CRC32RFC1510. */
+static const struct crc32_consts_s crc32_consts ALIGNED_16 =
+{
+ { /* k[6] = reverse_33bits( x^(32*y) mod P(x) ) */
+ U64_C(0x154442bd4), U64_C(0x1c6e41596), /* y = { 17, 15 } */
+ U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */
+ U64_C(0x163cd6124), 0 /* y = 2 */
+ },
+ { /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */
+ U64_C(0x1f7011641), U64_C(0x1db710641)
+ }
+};
+
+/* PMULL constants for CRC24RFC2440 (polynomial multiplied with x⁸). */
+static const struct crc32_consts_s crc24rfc2440_consts ALIGNED_16 =
+{
+  { /* k[6] = x^(32*y) mod P(x) << 32 */
+ U64_C(0x08289a00) << 32, U64_C(0x74b44a00) << 32, /* y = { 17, 15 } */
+ U64_C(0xc4b14d00) << 32, U64_C(0xfd7e0c00) << 32, /* y = { 5, 3 } */
+ U64_C(0xd9fe8c00) << 32, 0 /* y = 2 */
+ },
+ { /* my_p[2] = { floor(x^64 / P(x)), P(x) } */
+ U64_C(0x1f845fe24), U64_C(0x1864cfb00)
+ }
+};
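+
+/* Sanity note (illustrative only): the second my_p entry of crc32_consts
+   is the familiar reflected CRC-32 polynomial 0xedb88320 carried with its
+   implicit top bit, i.e. 0x1db710641 == (0xedb88320 << 1) | 1.  */
+#if 0
+static void
+example_check_crc32_poly (void)
+{
+  u64 p = (U64_C(0xedb88320) << 1) | 1;
+
+  gcry_assert (p == U64_C(0x1db710641));
+}
+#endif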
+
+
+u32 _gcry_crc32r_armv8_ce_reduction_4 (u32 data, u32 crc,
+ const struct crc32_consts_s *consts);
+void _gcry_crc32r_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
+ const struct crc32_consts_s *consts);
+
+u32 _gcry_crc32_armv8_ce_reduction_4 (u32 data, u32 crc,
+ const struct crc32_consts_s *consts);
+void _gcry_crc32_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
+ const struct crc32_consts_s *consts);
+
+
+static inline void
+crc32r_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
+ const struct crc32_consts_s *consts)
+{
+ u32 crc = *pcrc;
+ u32 data;
+
+ while (inlen >= 4)
+ {
+ data = ((const struct u32_unaligned_s *)inbuf)->a;
+ data ^= crc;
+
+ inlen -= 4;
+ inbuf += 4;
+
+ crc = _gcry_crc32r_armv8_ce_reduction_4 (data, 0, consts);
+ }
+
+ switch (inlen)
+ {
+ case 0:
+ break;
+ case 1:
+ data = inbuf[0];
+ data ^= crc;
+ data <<= 24;
+ crc >>= 8;
+ crc = _gcry_crc32r_armv8_ce_reduction_4 (data, crc, consts);
+ break;
+ case 2:
+ data = ((const struct u16_unaligned_s *)inbuf)->a;
+ data ^= crc;
+ data <<= 16;
+ crc >>= 16;
+ crc = _gcry_crc32r_armv8_ce_reduction_4 (data, crc, consts);
+ break;
+ case 3:
+ data = ((const struct u16_unaligned_s *)inbuf)->a;
+ data |= inbuf[2] << 16;
+ data ^= crc;
+ data <<= 8;
+ crc >>= 24;
+ crc = _gcry_crc32r_armv8_ce_reduction_4 (data, crc, consts);
+ break;
+ }
+
+ *pcrc = crc;
+}
+
+static inline void
+crc32_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
+ const struct crc32_consts_s *consts)
+{
+ u32 crc = *pcrc;
+ u32 data;
+
+ while (inlen >= 4)
+ {
+ data = ((const struct u32_unaligned_s *)inbuf)->a;
+ data ^= crc;
+ data = _gcry_bswap32(data);
+
+ inlen -= 4;
+ inbuf += 4;
+
+ crc = _gcry_crc32_armv8_ce_reduction_4 (data, 0, consts);
+ }
+
+ switch (inlen)
+ {
+ case 0:
+ break;
+ case 1:
+ data = inbuf[0];
+ data ^= crc;
+ data = data & 0xffU;
+ crc = _gcry_bswap32(crc >> 8);
+ crc = _gcry_crc32_armv8_ce_reduction_4 (data, crc, consts);
+ break;
+ case 2:
+ data = ((const struct u16_unaligned_s *)inbuf)->a;
+ data ^= crc;
+ data = _gcry_bswap32(data << 16);
+ crc = _gcry_bswap32(crc >> 16);
+ crc = _gcry_crc32_armv8_ce_reduction_4 (data, crc, consts);
+ break;
+ case 3:
+ data = ((const struct u16_unaligned_s *)inbuf)->a;
+ data |= inbuf[2] << 16;
+ data ^= crc;
+ data = _gcry_bswap32(data << 8);
+ crc = crc & 0xff000000U;
+ crc = _gcry_crc32_armv8_ce_reduction_4 (data, crc, consts);
+ break;
+ }
+
+ *pcrc = crc;
+}
+
+void
+_gcry_crc32_armv8_ce_pmull (u32 *pcrc, const byte *inbuf, size_t inlen)
+{
+ const struct crc32_consts_s *consts = &crc32_consts;
+
+ if (!inlen)
+ return;
+
+ if (inlen >= 16)
+ _gcry_crc32r_armv8_ce_bulk (pcrc, inbuf, inlen, consts);
+ else
+ crc32r_less_than_16 (pcrc, inbuf, inlen, consts);
+}
+
+void
+_gcry_crc24rfc2440_armv8_ce_pmull (u32 *pcrc, const byte *inbuf, size_t inlen)
+{
+ const struct crc32_consts_s *consts = &crc24rfc2440_consts;
+
+ if (!inlen)
+ return;
+
+ /* Note: *pcrc in input endian. */
+
+ if (inlen >= 16)
+ _gcry_crc32_armv8_ce_bulk (pcrc, inbuf, inlen, consts);
+ else
+ crc32_less_than_16 (pcrc, inbuf, inlen, consts);
+}
+
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/crc-intel-pclmul.c b/comm/third_party/libgcrypt/cipher/crc-intel-pclmul.c
new file mode 100644
index 0000000000..8c8b1915ab
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/crc-intel-pclmul.c
@@ -0,0 +1,939 @@
+/* crc-intel-pclmul.c - Intel PCLMUL accelerated CRC implementation
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+
+#include "bithelp.h"
+#include "bufhelp.h"
+
+
+#if defined(ENABLE_PCLMUL_SUPPORT) && defined(ENABLE_SSE41_SUPPORT) && \
+ __GNUC__ >= 4 && \
+ ((defined(__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__))
+
+
+#if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
+/* Prevent compiler from issuing SSE instructions between asm blocks. */
+# pragma GCC target("no-sse")
+#endif
+#if __clang__
+# pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
+#endif
+
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE
+
+
+#define ALIGNED_16 __attribute__ ((aligned (16)))
+
+
+struct u16_unaligned_s
+{
+ u16 a;
+} __attribute__((packed, aligned (1), may_alias));
+
+
+/* Constants structure for generic reflected/non-reflected CRC32 CLMUL
+ * functions. */
+struct crc32_consts_s
+{
+ /* k: { x^(32*17), x^(32*15), x^(32*5), x^(32*3), x^(32*2), 0 } mod P(x) */
+ u64 k[6];
+ /* my_p: { floor(x^64 / P(x)), P(x) } */
+ u64 my_p[2];
+};
+
+
+/* CLMUL constants for CRC32 and CRC32RFC1510. */
+static const struct crc32_consts_s crc32_consts ALIGNED_16 =
+{
+ { /* k[6] = reverse_33bits( x^(32*y) mod P(x) ) */
+ U64_C(0x154442bd4), U64_C(0x1c6e41596), /* y = { 17, 15 } */
+ U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */
+ U64_C(0x163cd6124), 0 /* y = 2 */
+ },
+ { /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */
+ U64_C(0x1f7011641), U64_C(0x1db710641)
+ }
+};
+
+/* CLMUL constants for CRC24RFC2440 (polynomial multiplied with x⁸). */
+static const struct crc32_consts_s crc24rfc2440_consts ALIGNED_16 =
+{
+  { /* k[6] = x^(32*y) mod P(x) << 32 */
+ U64_C(0x08289a00) << 32, U64_C(0x74b44a00) << 32, /* y = { 17, 15 } */
+ U64_C(0xc4b14d00) << 32, U64_C(0xfd7e0c00) << 32, /* y = { 5, 3 } */
+ U64_C(0xd9fe8c00) << 32, 0 /* y = 2 */
+ },
+ { /* my_p[2] = { floor(x^64 / P(x)), P(x) } */
+ U64_C(0x1f845fe24), U64_C(0x1864cfb00)
+ }
+};
+
+/* Common constants for CRC32 algorithms. */
+static const byte crc32_refl_shuf_shift[3 * 16] ALIGNED_16 =
+ {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ };
+static const byte crc32_shuf_shift[3 * 16] ALIGNED_16 =
+ {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
+ 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ };
+static const byte *crc32_bswap_shuf = &crc32_shuf_shift[16];
+static const byte crc32_partial_fold_input_mask[16 + 16] ALIGNED_16 =
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ };
+static const u64 crc32_merge9to15_shuf[15 - 9 + 1][2] ALIGNED_16 =
+ {
+ { U64_C(0x0706050403020100), U64_C(0xffffffffffffff0f) }, /* 9 */
+ { U64_C(0x0706050403020100), U64_C(0xffffffffffff0f0e) },
+ { U64_C(0x0706050403020100), U64_C(0xffffffffff0f0e0d) },
+ { U64_C(0x0706050403020100), U64_C(0xffffffff0f0e0d0c) },
+ { U64_C(0x0706050403020100), U64_C(0xffffff0f0e0d0c0b) },
+ { U64_C(0x0706050403020100), U64_C(0xffff0f0e0d0c0b0a) },
+ { U64_C(0x0706050403020100), U64_C(0xff0f0e0d0c0b0a09) }, /* 15 */
+ };
+static const u64 crc32_merge5to7_shuf[7 - 5 + 1][2] ALIGNED_16 =
+ {
+ { U64_C(0xffffff0703020100), U64_C(0xffffffffffffffff) }, /* 5 */
+ { U64_C(0xffff070603020100), U64_C(0xffffffffffffffff) },
+ { U64_C(0xff07060503020100), U64_C(0xffffffffffffffff) }, /* 7 */
+ };
+
+/* PCLMUL functions for reflected CRC32. */
+static ASM_FUNC_ATTR_INLINE void
+crc32_reflected_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
+ const struct crc32_consts_s *consts)
+{
+ if (inlen >= 8 * 16)
+ {
+ asm volatile ("movd %[crc], %%xmm4\n\t"
+ "movdqu %[inbuf_0], %%xmm0\n\t"
+ "movdqu %[inbuf_1], %%xmm1\n\t"
+ "movdqu %[inbuf_2], %%xmm2\n\t"
+ "movdqu %[inbuf_3], %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+ :
+ : [inbuf_0] "m" (inbuf[0 * 16]),
+ [inbuf_1] "m" (inbuf[1 * 16]),
+ [inbuf_2] "m" (inbuf[2 * 16]),
+ [inbuf_3] "m" (inbuf[3 * 16]),
+ [crc] "m" (*pcrc)
+ );
+
+ inbuf += 4 * 16;
+ inlen -= 4 * 16;
+
+ asm volatile ("movdqa %[k1k2], %%xmm4\n\t"
+ :
+ : [k1k2] "m" (consts->k[1 - 1])
+ );
+
+ /* Fold by 4. */
+ while (inlen >= 4 * 16)
+ {
+ asm volatile ("movdqu %[inbuf_0], %%xmm5\n\t"
+ "movdqa %%xmm0, %%xmm6\n\t"
+ "pclmulqdq $0x00, %%xmm4, %%xmm0\n\t"
+ "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm6, %%xmm0\n\t"
+
+ "movdqu %[inbuf_1], %%xmm5\n\t"
+ "movdqa %%xmm1, %%xmm6\n\t"
+ "pclmulqdq $0x00, %%xmm4, %%xmm1\n\t"
+ "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "pxor %%xmm6, %%xmm1\n\t"
+
+ "movdqu %[inbuf_2], %%xmm5\n\t"
+ "movdqa %%xmm2, %%xmm6\n\t"
+ "pclmulqdq $0x00, %%xmm4, %%xmm2\n\t"
+ "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "pxor %%xmm6, %%xmm2\n\t"
+
+ "movdqu %[inbuf_3], %%xmm5\n\t"
+ "movdqa %%xmm3, %%xmm6\n\t"
+ "pclmulqdq $0x00, %%xmm4, %%xmm3\n\t"
+ "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "pxor %%xmm6, %%xmm3\n\t"
+ :
+ : [inbuf_0] "m" (inbuf[0 * 16]),
+ [inbuf_1] "m" (inbuf[1 * 16]),
+ [inbuf_2] "m" (inbuf[2 * 16]),
+ [inbuf_3] "m" (inbuf[3 * 16])
+ );
+
+ inbuf += 4 * 16;
+ inlen -= 4 * 16;
+ }
+
+ asm volatile ("movdqa %[k3k4], %%xmm6\n\t"
+ "movdqa %[my_p], %%xmm5\n\t"
+ :
+ : [k3k4] "m" (consts->k[3 - 1]),
+ [my_p] "m" (consts->my_p[0])
+ );
+
+ /* Fold 4 to 1. */
+
+ asm volatile ("movdqa %%xmm0, %%xmm4\n\t"
+ "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
+ "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+
+ "movdqa %%xmm0, %%xmm4\n\t"
+ "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
+ "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
+ "pxor %%xmm2, %%xmm0\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+
+ "movdqa %%xmm0, %%xmm4\n\t"
+ "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
+ "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
+ "pxor %%xmm3, %%xmm0\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+ :
+ :
+ );
+ }
+ else
+ {
+ asm volatile ("movd %[crc], %%xmm1\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "movdqa %[k3k4], %%xmm6\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ "movdqa %[my_p], %%xmm5\n\t"
+ :
+ : [inbuf] "m" (*inbuf),
+ [crc] "m" (*pcrc),
+ [k3k4] "m" (consts->k[3 - 1]),
+ [my_p] "m" (consts->my_p[0])
+ );
+
+ inbuf += 16;
+ inlen -= 16;
+ }
+
+ /* Fold by 1. */
+ if (inlen >= 16)
+ {
+ while (inlen >= 16)
+ {
+ /* Load next block to XMM2. Fold XMM0 to XMM0:XMM1. */
+ asm volatile ("movdqu %[inbuf], %%xmm2\n\t"
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
+ "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ :
+ : [inbuf] "m" (*inbuf)
+ );
+
+ inbuf += 16;
+ inlen -= 16;
+ }
+ }
+
+ /* Partial fold. */
+ if (inlen)
+ {
+ /* Load last input and add padding zeros. */
+ asm volatile ("movdqu %[shr_shuf], %%xmm3\n\t"
+ "movdqu %[shl_shuf], %%xmm4\n\t"
+ "movdqu %[mask], %%xmm2\n\t"
+
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pshufb %%xmm4, %%xmm0\n\t"
+ "movdqu %[inbuf], %%xmm4\n\t"
+ "pshufb %%xmm3, %%xmm1\n\t"
+ "pand %%xmm4, %%xmm2\n\t"
+ "por %%xmm1, %%xmm2\n\t"
+
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
+ "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ :
+ : [inbuf] "m" (*(inbuf - 16 + inlen)),
+ [mask] "m" (crc32_partial_fold_input_mask[inlen]),
+ [shl_shuf] "m" (crc32_refl_shuf_shift[inlen]),
+ [shr_shuf] "m" (crc32_refl_shuf_shift[inlen + 16])
+ );
+
+ inbuf += inlen;
+ inlen -= inlen;
+ }
+
+ /* Final fold. */
+ asm volatile (/* reduce 128-bits to 96-bits */
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
+ "psrldq $8, %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+
+ /* reduce 96-bits to 64-bits */
+ "pshufd $0xfc, %%xmm0, %%xmm1\n\t" /* [00][00][00][x] */
+ "pshufd $0xf9, %%xmm0, %%xmm0\n\t" /* [00][00][x>>64][x>>32] */
+ "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
+ "pxor %%xmm1, %%xmm0\n\t" /* top 64-bit are zero */
+
+ /* barrett reduction */
+ "pshufd $0xf3, %%xmm0, %%xmm1\n\t" /* [00][00][x>>32][00] */
+ "pslldq $4, %%xmm0\n\t" /* [??][x>>32][??][??] */
+ "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
+ "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
+ "pxor %%xmm1, %%xmm0\n\t"
+
+ /* store CRC */
+ "pextrd $2, %%xmm0, %[out]\n\t"
+ : [out] "=m" (*pcrc)
+ : [k5] "m" (consts->k[5 - 1])
+ );
+}
+
+static ASM_FUNC_ATTR_INLINE void
+crc32_reflected_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
+ const struct crc32_consts_s *consts)
+{
+ if (inlen < 4)
+ {
+ u32 crc = *pcrc;
+ u32 data;
+
+ asm volatile ("movdqa %[my_p], %%xmm5\n\t"
+ :
+ : [my_p] "m" (consts->my_p[0])
+ );
+
+ if (inlen == 1)
+ {
+ data = inbuf[0];
+ data ^= crc;
+ data <<= 24;
+ crc >>= 8;
+ }
+ else if (inlen == 2)
+ {
+ data = ((const struct u16_unaligned_s *)inbuf)->a;
+ data ^= crc;
+ data <<= 16;
+ crc >>= 16;
+ }
+ else
+ {
+ data = ((const struct u16_unaligned_s *)inbuf)->a;
+ data |= inbuf[2] << 16;
+ data ^= crc;
+ data <<= 8;
+ crc >>= 24;
+ }
+
+ /* Barrett reduction */
+ asm volatile ("movd %[in], %%xmm0\n\t"
+ "movd %[crc], %%xmm1\n\t"
+
+ "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
+ "psllq $32, %%xmm1\n\t"
+ "pshufd $0xfc, %%xmm0, %%xmm0\n\t" /* [00][00][00][x] */
+ "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
+ "pxor %%xmm1, %%xmm0\n\t"
+
+ "pextrd $1, %%xmm0, %[out]\n\t"
+ : [out] "=m" (*pcrc)
+ : [in] "rm" (data),
+ [crc] "rm" (crc)
+ );
+ }
+ else if (inlen == 4)
+ {
+ /* Barrett reduction */
+ asm volatile ("movd %[crc], %%xmm1\n\t"
+ "movd %[in], %%xmm0\n\t"
+ "movdqa %[my_p], %%xmm5\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+
+ "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
+ "pshufd $0xfc, %%xmm0, %%xmm0\n\t" /* [00][00][00][x] */
+ "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
+
+ "pextrd $1, %%xmm0, %[out]\n\t"
+ : [out] "=m" (*pcrc)
+ : [in] "m" (*inbuf),
+ [crc] "m" (*pcrc),
+ [my_p] "m" (consts->my_p[0])
+ );
+ }
+ else
+ {
+ asm volatile ("movdqu %[shuf], %%xmm4\n\t"
+ "movd %[crc], %%xmm1\n\t"
+ "movdqa %[my_p], %%xmm5\n\t"
+ "movdqa %[k3k4], %%xmm6\n\t"
+ :
+ : [shuf] "m" (crc32_refl_shuf_shift[inlen]),
+ [crc] "m" (*pcrc),
+ [my_p] "m" (consts->my_p[0]),
+ [k3k4] "m" (consts->k[3 - 1])
+ );
+
+ if (inlen >= 8)
+ {
+ asm volatile ("movq %[inbuf], %%xmm0\n\t"
+ :
+ : [inbuf] "m" (*inbuf)
+ );
+ if (inlen > 8)
+ {
+ asm volatile (/*"pinsrq $1, %[inbuf_tail], %%xmm0\n\t"*/
+ "movq %[inbuf_tail], %%xmm2\n\t"
+ "punpcklqdq %%xmm2, %%xmm0\n\t"
+ "pshufb %[merge_shuf], %%xmm0\n\t"
+ :
+ : [inbuf_tail] "m" (inbuf[inlen - 8]),
+ [merge_shuf] "m"
+ (*crc32_merge9to15_shuf[inlen - 9])
+ );
+ }
+ }
+ else
+ {
+ asm volatile ("movd %[inbuf], %%xmm0\n\t"
+ "pinsrd $1, %[inbuf_tail], %%xmm0\n\t"
+ "pshufb %[merge_shuf], %%xmm0\n\t"
+ :
+ : [inbuf] "m" (*inbuf),
+ [inbuf_tail] "m" (inbuf[inlen - 4]),
+ [merge_shuf] "m"
+ (*crc32_merge5to7_shuf[inlen - 5])
+ );
+ }
+
+ /* Final fold. */
+ asm volatile ("pxor %%xmm1, %%xmm0\n\t"
+ "pshufb %%xmm4, %%xmm0\n\t"
+
+ /* reduce 128-bits to 96-bits */
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
+ "psrldq $8, %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm0\n\t" /* top 32-bit are zero */
+
+ /* reduce 96-bits to 64-bits */
+ "pshufd $0xfc, %%xmm0, %%xmm1\n\t" /* [00][00][00][x] */
+ "pshufd $0xf9, %%xmm0, %%xmm0\n\t" /* [00][00][x>>64][x>>32] */
+ "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
+ "pxor %%xmm1, %%xmm0\n\t" /* top 64-bit are zero */
+
+ /* barrett reduction */
+ "pshufd $0xf3, %%xmm0, %%xmm1\n\t" /* [00][00][x>>32][00] */
+ "pslldq $4, %%xmm0\n\t" /* [??][x>>32][??][??] */
+ "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
+ "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
+ "pxor %%xmm1, %%xmm0\n\t"
+
+ /* store CRC */
+ "pextrd $2, %%xmm0, %[out]\n\t"
+ : [out] "=m" (*pcrc)
+ : [k5] "m" (consts->k[5 - 1])
+ );
+ }
+}
+
+/* PCLMUL functions for non-reflected CRC32. */
+static ASM_FUNC_ATTR_INLINE void
+crc32_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
+ const struct crc32_consts_s *consts)
+{
+ asm volatile ("movdqa %[bswap], %%xmm7\n\t"
+ :
+ : [bswap] "m" (*crc32_bswap_shuf)
+ );
+
+ if (inlen >= 8 * 16)
+ {
+ asm volatile ("movd %[crc], %%xmm4\n\t"
+ "movdqu %[inbuf_0], %%xmm0\n\t"
+ "movdqu %[inbuf_1], %%xmm1\n\t"
+ "movdqu %[inbuf_2], %%xmm2\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+ "movdqu %[inbuf_3], %%xmm3\n\t"
+ "pshufb %%xmm7, %%xmm0\n\t"
+ "pshufb %%xmm7, %%xmm1\n\t"
+ "pshufb %%xmm7, %%xmm2\n\t"
+ "pshufb %%xmm7, %%xmm3\n\t"
+ :
+ : [inbuf_0] "m" (inbuf[0 * 16]),
+ [inbuf_1] "m" (inbuf[1 * 16]),
+ [inbuf_2] "m" (inbuf[2 * 16]),
+ [inbuf_3] "m" (inbuf[3 * 16]),
+ [crc] "m" (*pcrc)
+ );
+
+ inbuf += 4 * 16;
+ inlen -= 4 * 16;
+
+ asm volatile ("movdqa %[k1k2], %%xmm4\n\t"
+ :
+ : [k1k2] "m" (consts->k[1 - 1])
+ );
+
+ /* Fold by 4. */
+ while (inlen >= 4 * 16)
+ {
+ asm volatile ("movdqu %[inbuf_0], %%xmm5\n\t"
+ "movdqa %%xmm0, %%xmm6\n\t"
+ "pshufb %%xmm7, %%xmm5\n\t"
+ "pclmulqdq $0x01, %%xmm4, %%xmm0\n\t"
+ "pclmulqdq $0x10, %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm6, %%xmm0\n\t"
+
+ "movdqu %[inbuf_1], %%xmm5\n\t"
+ "movdqa %%xmm1, %%xmm6\n\t"
+ "pshufb %%xmm7, %%xmm5\n\t"
+ "pclmulqdq $0x01, %%xmm4, %%xmm1\n\t"
+ "pclmulqdq $0x10, %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "pxor %%xmm6, %%xmm1\n\t"
+
+ "movdqu %[inbuf_2], %%xmm5\n\t"
+ "movdqa %%xmm2, %%xmm6\n\t"
+ "pshufb %%xmm7, %%xmm5\n\t"
+ "pclmulqdq $0x01, %%xmm4, %%xmm2\n\t"
+ "pclmulqdq $0x10, %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "pxor %%xmm6, %%xmm2\n\t"
+
+ "movdqu %[inbuf_3], %%xmm5\n\t"
+ "movdqa %%xmm3, %%xmm6\n\t"
+ "pshufb %%xmm7, %%xmm5\n\t"
+ "pclmulqdq $0x01, %%xmm4, %%xmm3\n\t"
+ "pclmulqdq $0x10, %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "pxor %%xmm6, %%xmm3\n\t"
+ :
+ : [inbuf_0] "m" (inbuf[0 * 16]),
+ [inbuf_1] "m" (inbuf[1 * 16]),
+ [inbuf_2] "m" (inbuf[2 * 16]),
+ [inbuf_3] "m" (inbuf[3 * 16])
+ );
+
+ inbuf += 4 * 16;
+ inlen -= 4 * 16;
+ }
+
+ asm volatile ("movdqa %[k3k4], %%xmm6\n\t"
+ "movdqa %[my_p], %%xmm5\n\t"
+ :
+ : [k3k4] "m" (consts->k[3 - 1]),
+ [my_p] "m" (consts->my_p[0])
+ );
+
+ /* Fold 4 to 1. */
+
+ asm volatile ("movdqa %%xmm0, %%xmm4\n\t"
+ "pclmulqdq $0x01, %%xmm6, %%xmm0\n\t"
+ "pclmulqdq $0x10, %%xmm6, %%xmm4\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+
+ "movdqa %%xmm0, %%xmm4\n\t"
+ "pclmulqdq $0x01, %%xmm6, %%xmm0\n\t"
+ "pclmulqdq $0x10, %%xmm6, %%xmm4\n\t"
+ "pxor %%xmm2, %%xmm0\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+
+ "movdqa %%xmm0, %%xmm4\n\t"
+ "pclmulqdq $0x01, %%xmm6, %%xmm0\n\t"
+ "pclmulqdq $0x10, %%xmm6, %%xmm4\n\t"
+ "pxor %%xmm3, %%xmm0\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+ :
+ :
+ );
+ }
+ else
+ {
+ asm volatile ("movd %[crc], %%xmm1\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "movdqa %[k3k4], %%xmm6\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ "movdqa %[my_p], %%xmm5\n\t"
+ "pshufb %%xmm7, %%xmm0\n\t"
+ :
+ : [inbuf] "m" (*inbuf),
+ [crc] "m" (*pcrc),
+ [k3k4] "m" (consts->k[3 - 1]),
+ [my_p] "m" (consts->my_p[0])
+ );
+
+ inbuf += 16;
+ inlen -= 16;
+ }
+
+ /* Fold by 1. */
+ if (inlen >= 16)
+ {
+ while (inlen >= 16)
+ {
+ /* Load next block to XMM2. Fold XMM0 to XMM0:XMM1. */
+ asm volatile ("movdqu %[inbuf], %%xmm2\n\t"
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pclmulqdq $0x01, %%xmm6, %%xmm0\n\t"
+ "pshufb %%xmm7, %%xmm2\n\t"
+ "pclmulqdq $0x10, %%xmm6, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ :
+ : [inbuf] "m" (*inbuf)
+ );
+
+ inbuf += 16;
+ inlen -= 16;
+ }
+ }
+
+ /* Partial fold. */
+ if (inlen)
+ {
+ /* Load last input and add padding zeros. */
+ asm volatile ("movdqu %[shl_shuf], %%xmm4\n\t"
+ "movdqu %[shr_shuf], %%xmm3\n\t"
+ "movdqu %[mask], %%xmm2\n\t"
+
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pshufb %%xmm4, %%xmm0\n\t"
+ "movdqu %[inbuf], %%xmm4\n\t"
+ "pshufb %%xmm3, %%xmm1\n\t"
+ "pand %%xmm4, %%xmm2\n\t"
+ "por %%xmm1, %%xmm2\n\t"
+
+ "pshufb %%xmm7, %%xmm2\n\t"
+
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pclmulqdq $0x01, %%xmm6, %%xmm0\n\t"
+ "pclmulqdq $0x10, %%xmm6, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ :
+ : [inbuf] "m" (*(inbuf - 16 + inlen)),
+ [mask] "m" (crc32_partial_fold_input_mask[inlen]),
+ [shl_shuf] "m" (crc32_refl_shuf_shift[32 - inlen]),
+ [shr_shuf] "m" (crc32_shuf_shift[inlen + 16])
+ );
+
+ inbuf += inlen;
+ inlen -= inlen;
+ }
+
+ /* Final fold. */
+ asm volatile (/* reduce 128-bits to 96-bits */
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pclmulqdq $0x11, %%xmm6, %%xmm0\n\t"
+ "pslldq $8, %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm0\n\t" /* bottom 32-bit are zero */
+
+ /* reduce 96-bits to 64-bits */
+ "pshufd $0x30, %%xmm0, %%xmm1\n\t" /* [00][x>>96][00][00] */
+ "pshufd $0x24, %%xmm0, %%xmm0\n\t" /* [00][xx][xx][00] */
+ "pclmulqdq $0x01, %[k5], %%xmm1\n\t" /* [00][xx][xx][00] */
+ "pxor %%xmm1, %%xmm0\n\t" /* top and bottom 32-bit are zero */
+
+ /* barrett reduction */
+ "pshufd $0x01, %%xmm0, %%xmm1\n\t" /* [00][00][00][x>>32] */
+ "pclmulqdq $0x01, %%xmm5, %%xmm0\n\t" /* [00][xx][xx][xx] */
+ "psrldq $4, %%xmm0\n\t" /* [00][00][xx][xx] */
+ "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+
+ /* store CRC in input endian */
+ "movd %%xmm0, %%eax\n\t"
+ "bswapl %%eax\n\t"
+ "movl %%eax, %[out]\n\t"
+ : [out] "=m" (*pcrc)
+ : [k5] "m" (consts->k[5 - 1])
+ : "eax" );
+}
+
+static ASM_FUNC_ATTR_INLINE void
+crc32_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
+ const struct crc32_consts_s *consts)
+{
+ if (inlen < 4)
+ {
+ u32 crc = *pcrc;
+ u32 data;
+
+ asm volatile ("movdqa %[my_p], %%xmm5\n\t"
+ :
+ : [my_p] "m" (consts->my_p[0])
+ );
+
+ if (inlen == 1)
+ {
+ data = inbuf[0];
+ data ^= crc;
+ data = _gcry_bswap32(data << 24);
+ crc = _gcry_bswap32(crc >> 8);
+ }
+ else if (inlen == 2)
+ {
+ data = ((const struct u16_unaligned_s *)inbuf)->a;
+ data ^= crc;
+ data = _gcry_bswap32(data << 16);
+ crc = _gcry_bswap32(crc >> 16);
+ }
+ else
+ {
+ data = ((const struct u16_unaligned_s *)inbuf)->a;
+ data |= inbuf[2] << 16;
+ data ^= crc;
+ data = _gcry_bswap32(data << 8);
+ crc = _gcry_bswap32(crc >> 24);
+ }
+
+ /* Barrett reduction */
+ asm volatile ("movd %[in], %%xmm0\n\t"
+ "psllq $32, %%xmm0\n\t" /* [00][00][xx][00] */
+ "movd %[crc], %%xmm1\n\t"
+
+ "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][xx][xx][00] */
+ "pclmulqdq $0x11, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
+ "pxor %%xmm1, %%xmm0\n\t"
+
+ /* store CRC in input endian */
+ "movd %%xmm0, %%eax\n\t"
+ "bswapl %%eax\n\t"
+ "movl %%eax, %[out]\n\t"
+ : [out] "=m" (*pcrc)
+ : [in] "r" (data),
+ [crc] "r" (crc)
+ : "eax" );
+ }
+ else if (inlen == 4)
+ {
+ /* Barrett reduction */
+ asm volatile ("movd %[crc], %%xmm0\n\t"
+ "movd %[in], %%xmm1\n\t"
+ "movdqa %[my_p], %%xmm5\n\t"
+ :
+ : [in] "m" (*inbuf),
+ [crc] "m" (*pcrc),
+ [my_p] "m" (consts->my_p[0])
+ : "cc" );
+
+ asm volatile ("pxor %%xmm1, %%xmm0\n\t"
+ "pshufb %[bswap], %%xmm0\n\t" /* [xx][00][00][00] */
+
+ "pclmulqdq $0x01, %%xmm5, %%xmm0\n\t" /* [00][xx][xx][00] */
+ "pclmulqdq $0x11, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
+ :
+ : [bswap] "m" (*crc32_bswap_shuf)
+ : "cc" );
+
+ asm volatile (/* store CRC in input endian */
+ "movd %%xmm0, %%eax\n\t"
+ "bswapl %%eax\n\t"
+ "movl %%eax, %[out]\n\t"
+ : [out] "=m" (*pcrc)
+ :
+ : "eax", "cc" );
+ }
+ else
+ {
+ asm volatile ("movdqu %[shuf], %%xmm7\n\t"
+ "movd %[crc], %%xmm1\n\t"
+ "movdqa %[my_p], %%xmm5\n\t"
+ "movdqa %[k3k4], %%xmm6\n\t"
+ :
+ : [shuf] "m" (crc32_shuf_shift[32 - inlen]),
+ [crc] "m" (*pcrc),
+ [my_p] "m" (consts->my_p[0]),
+ [k3k4] "m" (consts->k[3 - 1])
+ );
+
+ if (inlen >= 8)
+ {
+ asm volatile ("movq %[inbuf], %%xmm0\n\t"
+ :
+ : [inbuf] "m" (*inbuf)
+ );
+ if (inlen > 8)
+ {
+ asm volatile (/*"pinsrq $1, %[inbuf_tail], %%xmm0\n\t"*/
+ "movq %[inbuf_tail], %%xmm2\n\t"
+ "punpcklqdq %%xmm2, %%xmm0\n\t"
+ "pshufb %[merge_shuf], %%xmm0\n\t"
+ :
+ : [inbuf_tail] "m" (inbuf[inlen - 8]),
+ [merge_shuf] "m"
+ (*crc32_merge9to15_shuf[inlen - 9])
+ );
+ }
+ }
+ else
+ {
+ asm volatile ("movd %[inbuf], %%xmm0\n\t"
+ "pinsrd $1, %[inbuf_tail], %%xmm0\n\t"
+ "pshufb %[merge_shuf], %%xmm0\n\t"
+ :
+ : [inbuf] "m" (*inbuf),
+ [inbuf_tail] "m" (inbuf[inlen - 4]),
+ [merge_shuf] "m"
+ (*crc32_merge5to7_shuf[inlen - 5])
+ );
+ }
+
+ /* Final fold. */
+ asm volatile ("pxor %%xmm1, %%xmm0\n\t"
+ "pshufb %%xmm7, %%xmm0\n\t"
+
+ /* reduce 128-bits to 96-bits */
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pclmulqdq $0x11, %%xmm6, %%xmm0\n\t"
+ "pslldq $8, %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm0\n\t" /* bottom 32-bit are zero */
+
+ /* reduce 96-bits to 64-bits */
+ "pshufd $0x30, %%xmm0, %%xmm1\n\t" /* [00][x>>96][00][00] */
+ "pshufd $0x24, %%xmm0, %%xmm0\n\t" /* [00][xx][xx][00] */
+ "pclmulqdq $0x01, %[k5], %%xmm1\n\t" /* [00][xx][xx][00] */
+ "pxor %%xmm1, %%xmm0\n\t" /* top and bottom 32-bit are zero */
+
+ /* barrett reduction */
+ "pshufd $0x01, %%xmm0, %%xmm1\n\t" /* [00][00][00][x>>32] */
+ "pclmulqdq $0x01, %%xmm5, %%xmm0\n\t" /* [00][xx][xx][xx] */
+ "psrldq $4, %%xmm0\n\t" /* [00][00][xx][xx] */
+ "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+
+ /* store CRC in input endian */
+ "movd %%xmm0, %%eax\n\t"
+ "bswapl %%eax\n\t"
+ "movl %%eax, %[out]\n\t"
+ : [out] "=m" (*pcrc)
+ : [k5] "m" (consts->k[5 - 1])
+ : "eax" );
+ }
+}
+
+void ASM_FUNC_ATTR
+_gcry_crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen)
+{
+ const struct crc32_consts_s *consts = &crc32_consts;
+#if defined(__x86_64__) && defined(__WIN64__)
+ char win64tmp[2 * 16];
+
+ /* XMM6-XMM7 need to be restored after use. */
+ asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t"
+ "movdqu %%xmm7, 1*16(%0)\n\t"
+ :
+ : "r" (win64tmp)
+ : "memory");
+#endif
+
+ if (!inlen)
+ return;
+
+ if (inlen >= 16)
+ crc32_reflected_bulk(pcrc, inbuf, inlen, consts);
+ else
+ crc32_reflected_less_than_16(pcrc, inbuf, inlen, consts);
+
+#if defined(__x86_64__) && defined(__WIN64__)
+ /* Restore used registers. */
+ asm volatile("movdqu 0*16(%0), %%xmm6\n\t"
+ "movdqu 1*16(%0), %%xmm7\n\t"
+ :
+ : "r" (win64tmp)
+ : "memory");
+#endif
+}
+
+void ASM_FUNC_ATTR
+_gcry_crc24rfc2440_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen)
+{
+ const struct crc32_consts_s *consts = &crc24rfc2440_consts;
+#if defined(__x86_64__) && defined(__WIN64__)
+ char win64tmp[2 * 16];
+
+ /* XMM6-XMM7 need to be restored after use. */
+ asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t"
+ "movdqu %%xmm7, 1*16(%0)\n\t"
+ :
+ : "r" (win64tmp)
+ : "memory");
+#endif
+
+ if (!inlen)
+ return;
+
+ /* Note: *pcrc in input endian. */
+
+ if (inlen >= 16)
+ crc32_bulk(pcrc, inbuf, inlen, consts);
+ else
+ crc32_less_than_16(pcrc, inbuf, inlen, consts);
+
+#if defined(__x86_64__) && defined(__WIN64__)
+ /* Restore used registers. */
+ asm volatile("movdqu 0*16(%0), %%xmm6\n\t"
+ "movdqu 1*16(%0), %%xmm7\n\t"
+ :
+ : "r" (win64tmp)
+ : "memory");
+#endif
+}
+
+#if __clang__
+# pragma clang attribute pop
+#endif
+
+#endif /* USE_INTEL_PCLMUL */
diff --git a/comm/third_party/libgcrypt/cipher/crc-ppc.c b/comm/third_party/libgcrypt/cipher/crc-ppc.c
new file mode 100644
index 0000000000..b9a40130ce
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/crc-ppc.c
@@ -0,0 +1,656 @@
+/* crc-ppc.c - POWER8 vpmsum accelerated CRC implementation
+ * Copyright (C) 2019-2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+
+#include "bithelp.h"
+#include "bufhelp.h"
+
+
+#if defined(ENABLE_PPC_CRYPTO_SUPPORT) && \
+ defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+ defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \
+ __GNUC__ >= 4
+
+#include <altivec.h>
+#include "bufhelp.h"
+
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE
+#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
+
+#define ALIGNED_64 __attribute__ ((aligned (64)))
+
+
+typedef vector unsigned char vector16x_u8;
+typedef vector unsigned int vector4x_u32;
+typedef vector unsigned long long vector2x_u64;
+
+
+/* Constants structure for generic reflected/non-reflected CRC32 PMULL
+ * functions. */
+struct crc32_consts_s
+{
+ /* k: { x^(32*17), x^(32*15), x^(32*5), x^(32*3), x^(32*2), 0 } mod P(x) */
+ unsigned long long k[6];
+ /* my_p: { floor(x^64 / P(x)), P(x) } */
+ unsigned long long my_p[2];
+};
+
+/* PMULL constants for CRC32 and CRC32RFC1510. */
+static const struct crc32_consts_s crc32_consts ALIGNED_64 =
+{
+ { /* k[6] = reverse_33bits( x^(32*y) mod P(x) ) */
+ U64_C(0x154442bd4), U64_C(0x1c6e41596), /* y = { 17, 15 } */
+ U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */
+ U64_C(0x163cd6124), 0 /* y = 2 */
+ },
+ { /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */
+ U64_C(0x1f7011641), U64_C(0x1db710641)
+ }
+};
+
+/* PMULL constants for CRC24RFC2440 (polynomial multiplied by x⁸). */
+static const struct crc32_consts_s crc24rfc2440_consts ALIGNED_64 =
+{
+  { /* k[6] = x^(32*y) mod P(x) << 32 */
+ U64_C(0x08289a00) << 32, U64_C(0x74b44a00) << 32, /* y = { 17, 15 } */
+ U64_C(0xc4b14d00) << 32, U64_C(0xfd7e0c00) << 32, /* y = { 5, 3 } */
+ U64_C(0xd9fe8c00) << 32, 0 /* y = 2 */
+ },
+ { /* my_p[2] = { floor(x^64 / P(x)), P(x) } */
+ U64_C(0x1f845fe24), U64_C(0x1864cfb00)
+ }
+};
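+
+/* Illustrative sketch, not part of the original libgcrypt source: the
+ * folding constants above are powers of x reduced modulo the CRC
+ * polynomial over GF(2).  A generic (and slow) way to derive such a
+ * constant is to shift in one bit at a time and subtract P(x) whenever
+ * the degree reaches 32.  The hypothetical helper below assumes the
+ * polynomial is passed as the 33-bit value "(1 << 32) | poly"; for the
+ * reflected CRC32 table the result would additionally be passed through
+ * reverse_33bits(), as the comments above note. */
+static u64
+gf2_pow_x_mod_p (unsigned int n, u64 poly33)
+{
+  u64 r = 1;  /* x^0 */
+
+  while (n--)
+    {
+      r <<= 1;                    /* multiply by x */
+      if (r & ((u64)1 << 32))
+        r ^= poly33;              /* reduce modulo P(x) */
+    }
+
+  return r;                       /* x^n mod P(x), degree <= 31 */
+}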
+
+
+static ASM_FUNC_ATTR_INLINE vector2x_u64
+asm_vpmsumd(vector2x_u64 a, vector2x_u64 b)
+{
+ __asm__("vpmsumd %0, %1, %2"
+ : "=v" (a)
+ : "v" (a), "v" (b));
+ return a;
+}
+
+
+static ASM_FUNC_ATTR_INLINE vector2x_u64
+asm_swap_u64(vector2x_u64 a)
+{
+ __asm__("xxswapd %x0, %x1"
+ : "=wa" (a)
+ : "wa" (a));
+ return a;
+}
+
+
+static ASM_FUNC_ATTR_INLINE vector4x_u32
+vec_sld_u32(vector4x_u32 a, vector4x_u32 b, unsigned int idx)
+{
+ return vec_sld (a, b, (4 * idx) & 15);
+}
+
+
+static const byte crc32_partial_fold_input_mask[16 + 16] ALIGNED_64 =
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ };
+static const byte crc32_shuf_shift[3 * 16] ALIGNED_64 =
+ {
+ 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
+ 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
+ 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
+ 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+ 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
+ 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
+ };
+static const byte crc32_refl_shuf_shift[3 * 16] ALIGNED_64 =
+ {
+ 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
+ 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
+ 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
+ };
+static const vector16x_u8 bswap_const ALIGNED_64 =
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+
+
+#define CRC_VEC_SWAP(v) ({ vector2x_u64 __vecu64 = (v); \
+ vec_perm(__vecu64, __vecu64, bswap_const); })
+
+#ifdef WORDS_BIGENDIAN
+# define CRC_VEC_U64_DEF(lo, hi) { (hi), (lo) }
+# define CRC_VEC_U64_LOAD(offs, ptr) \
+ asm_swap_u64(asm_vec_u64_load(offs, ptr))
+# define CRC_VEC_U64_LOAD_LE(offs, ptr) \
+ CRC_VEC_SWAP(asm_vec_u64_load(offs, ptr))
+# define CRC_VEC_U64_LOAD_BE(offs, ptr) \
+ asm_vec_u64_load(offs, ptr)
+# define CRC_VEC_SWAP_TO_LE(v) CRC_VEC_SWAP(v)
+# define CRC_VEC_SWAP_TO_BE(v) (v)
+# define VEC_U64_LO 1
+# define VEC_U64_HI 0
+
+static ASM_FUNC_ATTR_INLINE vector2x_u64
+asm_vec_u64_load(unsigned long offset, const void *ptr)
+{
+ vector2x_u64 vecu64;
+#if __GNUC__ >= 4
+ if (__builtin_constant_p (offset) && offset == 0)
+ __asm__ volatile ("lxvd2x %x0,0,%1\n\t"
+ : "=wa" (vecu64)
+ : "r" ((uintptr_t)ptr)
+ : "memory");
+ else
+#endif
+ __asm__ volatile ("lxvd2x %x0,%1,%2\n\t"
+ : "=wa" (vecu64)
+ : "r" (offset), "r" ((uintptr_t)ptr)
+ : "memory", "r0");
+ return vecu64;
+}
+#else
+# define CRC_VEC_U64_DEF(lo, hi) { (lo), (hi) }
+# define CRC_VEC_U64_LOAD(offs, ptr) asm_vec_u64_load_le(offs, ptr)
+# define CRC_VEC_U64_LOAD_LE(offs, ptr) asm_vec_u64_load_le(offs, ptr)
+# define CRC_VEC_U64_LOAD_BE(offs, ptr) asm_vec_u64_load_be(offs, ptr)
+# define CRC_VEC_SWAP_TO_LE(v) (v)
+# define CRC_VEC_SWAP_TO_BE(v) CRC_VEC_SWAP(v)
+# define VEC_U64_LO 0
+# define VEC_U64_HI 1
+
+static ASM_FUNC_ATTR_INLINE vector2x_u64
+asm_vec_u64_load_le(unsigned long offset, const void *ptr)
+{
+ vector2x_u64 vecu64;
+#if __GNUC__ >= 4
+ if (__builtin_constant_p (offset) && offset == 0)
+ __asm__ volatile ("lxvd2x %x0,0,%1\n\t"
+ : "=wa" (vecu64)
+ : "r" ((uintptr_t)ptr)
+ : "memory");
+ else
+#endif
+ __asm__ volatile ("lxvd2x %x0,%1,%2\n\t"
+ : "=wa" (vecu64)
+ : "r" (offset), "r" ((uintptr_t)ptr)
+ : "memory", "r0");
+ return asm_swap_u64(vecu64);
+}
+
+static ASM_FUNC_ATTR_INLINE vector2x_u64
+asm_vec_u64_load_be(unsigned int offset, const void *ptr)
+{
+ static const vector16x_u8 vec_load_le_const =
+ { ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0, ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8 };
+ vector2x_u64 vecu64;
+
+#if __GNUC__ >= 4
+ if (__builtin_constant_p (offset) && offset == 0)
+ __asm__ ("lxvd2x %%vs32,0,%1\n\t"
+ "vperm %0,%%v0,%%v0,%2\n\t"
+ : "=v" (vecu64)
+ : "r" ((uintptr_t)(ptr)), "v" (vec_load_le_const)
+ : "memory", "v0");
+#endif
+ else
+ __asm__ ("lxvd2x %%vs32,%1,%2\n\t"
+ "vperm %0,%%v0,%%v0,%3\n\t"
+ : "=v" (vecu64)
+ : "r" (offset), "r" ((uintptr_t)(ptr)),
+ "v" (vec_load_le_const)
+ : "memory", "r0", "v0");
+
+ return vecu64;
+}
+#endif
+
+
+static ASM_FUNC_ATTR_INLINE void
+crc32r_ppc8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
+ const struct crc32_consts_s *consts)
+{
+ vector4x_u32 zero = { 0, 0, 0, 0 };
+ vector2x_u64 low_64bit_mask = CRC_VEC_U64_DEF((u64)-1, 0);
+ vector2x_u64 low_32bit_mask = CRC_VEC_U64_DEF((u32)-1, 0);
+ vector2x_u64 my_p = CRC_VEC_U64_LOAD(0, &consts->my_p[0]);
+ vector2x_u64 k1k2 = CRC_VEC_U64_LOAD(0, &consts->k[1 - 1]);
+ vector2x_u64 k3k4 = CRC_VEC_U64_LOAD(0, &consts->k[3 - 1]);
+ vector2x_u64 k4lo = CRC_VEC_U64_DEF(k3k4[VEC_U64_HI], 0);
+ vector2x_u64 k5lo = CRC_VEC_U64_LOAD(0, &consts->k[5 - 1]);
+ vector2x_u64 crc = CRC_VEC_U64_DEF(*pcrc, 0);
+ vector2x_u64 crc0, crc1, crc2, crc3;
+ vector2x_u64 v0;
+
+ if (inlen >= 8 * 16)
+ {
+ crc0 = CRC_VEC_U64_LOAD_LE(0 * 16, inbuf);
+ crc0 ^= crc;
+ crc1 = CRC_VEC_U64_LOAD_LE(1 * 16, inbuf);
+ crc2 = CRC_VEC_U64_LOAD_LE(2 * 16, inbuf);
+ crc3 = CRC_VEC_U64_LOAD_LE(3 * 16, inbuf);
+
+ inbuf += 4 * 16;
+ inlen -= 4 * 16;
+
+ /* Fold by 4. */
+ while (inlen >= 4 * 16)
+ {
+ v0 = CRC_VEC_U64_LOAD_LE(0 * 16, inbuf);
+ crc0 = asm_vpmsumd(crc0, k1k2) ^ v0;
+
+ v0 = CRC_VEC_U64_LOAD_LE(1 * 16, inbuf);
+ crc1 = asm_vpmsumd(crc1, k1k2) ^ v0;
+
+ v0 = CRC_VEC_U64_LOAD_LE(2 * 16, inbuf);
+ crc2 = asm_vpmsumd(crc2, k1k2) ^ v0;
+
+ v0 = CRC_VEC_U64_LOAD_LE(3 * 16, inbuf);
+ crc3 = asm_vpmsumd(crc3, k1k2) ^ v0;
+
+ inbuf += 4 * 16;
+ inlen -= 4 * 16;
+ }
+
+ /* Fold 4 to 1. */
+ crc1 ^= asm_vpmsumd(crc0, k3k4);
+ crc2 ^= asm_vpmsumd(crc1, k3k4);
+ crc3 ^= asm_vpmsumd(crc2, k3k4);
+ crc = crc3;
+ }
+ else
+ {
+ v0 = CRC_VEC_U64_LOAD_LE(0, inbuf);
+ crc ^= v0;
+
+ inbuf += 16;
+ inlen -= 16;
+ }
+
+ /* Fold by 1. */
+ while (inlen >= 16)
+ {
+ v0 = CRC_VEC_U64_LOAD_LE(0, inbuf);
+ crc = asm_vpmsumd(k3k4, crc);
+ crc ^= v0;
+
+ inbuf += 16;
+ inlen -= 16;
+ }
+
+ /* Partial fold. */
+ if (inlen)
+ {
+ /* Load last input and add padding zeros. */
+ vector2x_u64 mask = CRC_VEC_U64_LOAD_LE(inlen, crc32_partial_fold_input_mask);
+ vector2x_u64 shl_shuf = CRC_VEC_U64_LOAD_LE(inlen, crc32_refl_shuf_shift);
+ vector2x_u64 shr_shuf = CRC_VEC_U64_LOAD_LE(inlen + 16, crc32_refl_shuf_shift);
+
+ v0 = CRC_VEC_U64_LOAD_LE(inlen - 16, inbuf);
+ v0 &= mask;
+
+ crc = CRC_VEC_SWAP_TO_LE(crc);
+ v0 |= (vector2x_u64)vec_perm((vector16x_u8)crc, (vector16x_u8)zero,
+ (vector16x_u8)shr_shuf);
+ crc = (vector2x_u64)vec_perm((vector16x_u8)crc, (vector16x_u8)zero,
+ (vector16x_u8)shl_shuf);
+ crc = asm_vpmsumd(k3k4, crc);
+ crc ^= v0;
+
+ inbuf += inlen;
+ inlen -= inlen;
+ }
+
+ /* Final fold. */
+
+ /* reduce 128-bits to 96-bits */
+ v0 = asm_swap_u64(crc);
+ v0 &= low_64bit_mask;
+ crc = asm_vpmsumd(k4lo, crc);
+ crc ^= v0;
+
+ /* reduce 96-bits to 64-bits */
+ v0 = (vector2x_u64)vec_sld_u32((vector4x_u32)crc,
+ (vector4x_u32)crc, 3); /* [x0][x3][x2][x1] */
+ v0 &= low_64bit_mask; /* [00][00][x2][x1] */
+ crc = crc & low_32bit_mask; /* [00][00][00][x0] */
+ crc = v0 ^ asm_vpmsumd(k5lo, crc); /* [00][00][xx][xx] */
+
+ /* barrett reduction */
+ v0 = crc << 32; /* [00][00][x0][00] */
+ v0 = asm_vpmsumd(my_p, v0);
+ v0 = asm_swap_u64(v0);
+ v0 = asm_vpmsumd(my_p, v0);
+ crc = (vector2x_u64)vec_sld_u32((vector4x_u32)crc,
+ zero, 1); /* [00][x1][x0][00] */
+ crc ^= v0;
+
+ *pcrc = (u32)crc[VEC_U64_HI];
+}
+
+
+static ASM_FUNC_ATTR_INLINE u32
+crc32r_ppc8_ce_reduction_4 (u32 data, u32 crc,
+ const struct crc32_consts_s *consts)
+{
+ vector4x_u32 zero = { 0, 0, 0, 0 };
+ vector2x_u64 my_p = CRC_VEC_U64_LOAD(0, &consts->my_p[0]);
+ vector2x_u64 v0 = CRC_VEC_U64_DEF((u64)data, 0);
+ v0 = asm_vpmsumd(v0, my_p); /* [00][00][xx][xx] */
+ v0 = (vector2x_u64)vec_sld_u32((vector4x_u32)v0,
+ zero, 3); /* [x0][00][00][00] */
+ v0 = (vector2x_u64)vec_sld_u32((vector4x_u32)v0,
+ (vector4x_u32)v0, 3); /* [00][x0][00][00] */
+ v0 = asm_vpmsumd(v0, my_p); /* [00][00][xx][xx] */
+ return (v0[VEC_U64_LO] >> 32) ^ crc;
+}
+
+
+static ASM_FUNC_ATTR_INLINE void
+crc32r_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
+ const struct crc32_consts_s *consts)
+{
+ u32 crc = *pcrc;
+ u32 data;
+
+ while (inlen >= 4)
+ {
+ data = buf_get_le32(inbuf);
+ data ^= crc;
+
+ inlen -= 4;
+ inbuf += 4;
+
+ crc = crc32r_ppc8_ce_reduction_4 (data, 0, consts);
+ }
+
+ switch (inlen)
+ {
+ case 0:
+ break;
+ case 1:
+ data = inbuf[0];
+ data ^= crc;
+ data <<= 24;
+ crc >>= 8;
+ crc = crc32r_ppc8_ce_reduction_4 (data, crc, consts);
+ break;
+ case 2:
+ data = inbuf[0] << 0;
+ data |= inbuf[1] << 8;
+ data ^= crc;
+ data <<= 16;
+ crc >>= 16;
+ crc = crc32r_ppc8_ce_reduction_4 (data, crc, consts);
+ break;
+ case 3:
+ data = inbuf[0] << 0;
+ data |= inbuf[1] << 8;
+ data |= inbuf[2] << 16;
+ data ^= crc;
+ data <<= 8;
+ crc >>= 24;
+ crc = crc32r_ppc8_ce_reduction_4 (data, crc, consts);
+ break;
+ }
+
+ *pcrc = crc;
+}
+
+
+static ASM_FUNC_ATTR_INLINE void
+crc32_ppc8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
+ const struct crc32_consts_s *consts)
+{
+ vector4x_u32 zero = { 0, 0, 0, 0 };
+ vector2x_u64 low_96bit_mask = CRC_VEC_U64_DEF(~0, ~((u64)(u32)-1 << 32));
+ vector2x_u64 p_my = asm_swap_u64(CRC_VEC_U64_LOAD(0, &consts->my_p[0]));
+ vector2x_u64 p_my_lo, p_my_hi;
+ vector2x_u64 k2k1 = asm_swap_u64(CRC_VEC_U64_LOAD(0, &consts->k[1 - 1]));
+ vector2x_u64 k4k3 = asm_swap_u64(CRC_VEC_U64_LOAD(0, &consts->k[3 - 1]));
+ vector2x_u64 k4hi = CRC_VEC_U64_DEF(0, consts->k[4 - 1]);
+ vector2x_u64 k5hi = CRC_VEC_U64_DEF(0, consts->k[5 - 1]);
+ vector2x_u64 crc = CRC_VEC_U64_DEF(0, _gcry_bswap64(*pcrc));
+ vector2x_u64 crc0, crc1, crc2, crc3;
+ vector2x_u64 v0;
+
+ if (inlen >= 8 * 16)
+ {
+ crc0 = CRC_VEC_U64_LOAD_BE(0 * 16, inbuf);
+ crc0 ^= crc;
+ crc1 = CRC_VEC_U64_LOAD_BE(1 * 16, inbuf);
+ crc2 = CRC_VEC_U64_LOAD_BE(2 * 16, inbuf);
+ crc3 = CRC_VEC_U64_LOAD_BE(3 * 16, inbuf);
+
+ inbuf += 4 * 16;
+ inlen -= 4 * 16;
+
+ /* Fold by 4. */
+ while (inlen >= 4 * 16)
+ {
+ v0 = CRC_VEC_U64_LOAD_BE(0 * 16, inbuf);
+ crc0 = asm_vpmsumd(crc0, k2k1) ^ v0;
+
+ v0 = CRC_VEC_U64_LOAD_BE(1 * 16, inbuf);
+ crc1 = asm_vpmsumd(crc1, k2k1) ^ v0;
+
+ v0 = CRC_VEC_U64_LOAD_BE(2 * 16, inbuf);
+ crc2 = asm_vpmsumd(crc2, k2k1) ^ v0;
+
+ v0 = CRC_VEC_U64_LOAD_BE(3 * 16, inbuf);
+ crc3 = asm_vpmsumd(crc3, k2k1) ^ v0;
+
+ inbuf += 4 * 16;
+ inlen -= 4 * 16;
+ }
+
+ /* Fold 4 to 1. */
+ crc1 ^= asm_vpmsumd(crc0, k4k3);
+ crc2 ^= asm_vpmsumd(crc1, k4k3);
+ crc3 ^= asm_vpmsumd(crc2, k4k3);
+ crc = crc3;
+ }
+ else
+ {
+ v0 = CRC_VEC_U64_LOAD_BE(0, inbuf);
+ crc ^= v0;
+
+ inbuf += 16;
+ inlen -= 16;
+ }
+
+ /* Fold by 1. */
+ while (inlen >= 16)
+ {
+ v0 = CRC_VEC_U64_LOAD_BE(0, inbuf);
+ crc = asm_vpmsumd(k4k3, crc);
+ crc ^= v0;
+
+ inbuf += 16;
+ inlen -= 16;
+ }
+
+ /* Partial fold. */
+ if (inlen)
+ {
+ /* Load last input and add padding zeros. */
+ vector2x_u64 mask = CRC_VEC_U64_LOAD_LE(inlen, crc32_partial_fold_input_mask);
+ vector2x_u64 shl_shuf = CRC_VEC_U64_LOAD_LE(32 - inlen, crc32_refl_shuf_shift);
+ vector2x_u64 shr_shuf = CRC_VEC_U64_LOAD_LE(inlen + 16, crc32_shuf_shift);
+
+ v0 = CRC_VEC_U64_LOAD_LE(inlen - 16, inbuf);
+ v0 &= mask;
+
+ crc = CRC_VEC_SWAP_TO_LE(crc);
+ crc2 = (vector2x_u64)vec_perm((vector16x_u8)crc, (vector16x_u8)zero,
+ (vector16x_u8)shr_shuf);
+ v0 |= crc2;
+ v0 = CRC_VEC_SWAP(v0);
+ crc = (vector2x_u64)vec_perm((vector16x_u8)crc, (vector16x_u8)zero,
+ (vector16x_u8)shl_shuf);
+ crc = asm_vpmsumd(k4k3, crc);
+ crc ^= v0;
+
+ inbuf += inlen;
+ inlen -= inlen;
+ }
+
+ /* Final fold. */
+
+ /* reduce 128-bits to 96-bits */
+ v0 = (vector2x_u64)vec_sld_u32((vector4x_u32)crc,
+ (vector4x_u32)zero, 2);
+ crc = asm_vpmsumd(k4hi, crc);
+ crc ^= v0; /* bottom 32-bit are zero */
+
+ /* reduce 96-bits to 64-bits */
+ v0 = crc & low_96bit_mask; /* [00][x2][x1][00] */
+ crc >>= 32; /* [00][x3][00][x0] */
+ crc = asm_vpmsumd(k5hi, crc); /* [00][xx][xx][00] */
+ crc ^= v0; /* top and bottom 32-bit are zero */
+
+ /* barrett reduction */
+ p_my_hi = p_my;
+ p_my_lo = p_my;
+ p_my_hi[VEC_U64_LO] = 0;
+ p_my_lo[VEC_U64_HI] = 0;
+ v0 = crc >> 32; /* [00][00][00][x1] */
+ crc = asm_vpmsumd(p_my_hi, crc); /* [00][xx][xx][xx] */
+ crc = (vector2x_u64)vec_sld_u32((vector4x_u32)crc,
+ (vector4x_u32)crc, 3); /* [x0][00][x2][x1] */
+ crc = asm_vpmsumd(p_my_lo, crc); /* [00][xx][xx][xx] */
+ crc ^= v0;
+
+ *pcrc = _gcry_bswap32(crc[VEC_U64_LO]);
+}
+
+
+static ASM_FUNC_ATTR_INLINE u32
+crc32_ppc8_ce_reduction_4 (u32 data, u32 crc,
+ const struct crc32_consts_s *consts)
+{
+ vector2x_u64 my_p = CRC_VEC_U64_LOAD(0, &consts->my_p[0]);
+ vector2x_u64 v0 = CRC_VEC_U64_DEF((u64)data << 32, 0);
+ v0 = asm_vpmsumd(v0, my_p); /* [00][x1][x0][00] */
+ v0[VEC_U64_LO] = 0; /* [00][x1][00][00] */
+ v0 = asm_vpmsumd(v0, my_p); /* [00][00][xx][xx] */
+ return _gcry_bswap32(v0[VEC_U64_LO]) ^ crc;
+}
+
+
+static ASM_FUNC_ATTR_INLINE void
+crc32_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
+ const struct crc32_consts_s *consts)
+{
+ u32 crc = *pcrc;
+ u32 data;
+
+ while (inlen >= 4)
+ {
+ data = buf_get_le32(inbuf);
+ data ^= crc;
+ data = _gcry_bswap32(data);
+
+ inlen -= 4;
+ inbuf += 4;
+
+ crc = crc32_ppc8_ce_reduction_4 (data, 0, consts);
+ }
+
+ switch (inlen)
+ {
+ case 0:
+ break;
+ case 1:
+ data = inbuf[0];
+ data ^= crc;
+ data = data & 0xffU;
+ crc = crc >> 8;
+ crc = crc32_ppc8_ce_reduction_4 (data, crc, consts);
+ break;
+ case 2:
+ data = inbuf[0] << 0;
+ data |= inbuf[1] << 8;
+ data ^= crc;
+ data = _gcry_bswap32(data << 16);
+ crc = crc >> 16;
+ crc = crc32_ppc8_ce_reduction_4 (data, crc, consts);
+ break;
+ case 3:
+ data = inbuf[0] << 0;
+ data |= inbuf[1] << 8;
+ data |= inbuf[2] << 16;
+ data ^= crc;
+ data = _gcry_bswap32(data << 8);
+ crc = crc >> 24;
+ crc = crc32_ppc8_ce_reduction_4 (data, crc, consts);
+ break;
+ }
+
+ *pcrc = crc;
+}
+
+void ASM_FUNC_ATTR
+_gcry_crc32_ppc8_vpmsum (u32 *pcrc, const byte *inbuf, size_t inlen)
+{
+ const struct crc32_consts_s *consts = &crc32_consts;
+
+ if (!inlen)
+ return;
+
+ if (inlen >= 16)
+ crc32r_ppc8_ce_bulk (pcrc, inbuf, inlen, consts);
+ else
+ crc32r_less_than_16 (pcrc, inbuf, inlen, consts);
+}
+
+void ASM_FUNC_ATTR
+_gcry_crc24rfc2440_ppc8_vpmsum (u32 *pcrc, const byte *inbuf, size_t inlen)
+{
+ const struct crc32_consts_s *consts = &crc24rfc2440_consts;
+
+ if (!inlen)
+ return;
+
+ /* Note: *pcrc in input endian. */
+
+ if (inlen >= 16)
+ crc32_ppc8_ce_bulk (pcrc, inbuf, inlen, consts);
+ else
+ crc32_less_than_16 (pcrc, inbuf, inlen, consts);
+}
+
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/crc.c b/comm/third_party/libgcrypt/cipher/crc.c
new file mode 100644
index 0000000000..6d70f644f7
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/crc.c
@@ -0,0 +1,955 @@
+/* crc.c - Cyclic redundancy checks.
+ * Copyright (C) 2003 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+
+#include "bithelp.h"
+#include "bufhelp.h"
+
+
+/* USE_INTEL_PCLMUL indicates whether to compile CRC with Intel PCLMUL/SSE4.1
+ * code. */
+#undef USE_INTEL_PCLMUL
+#if defined(ENABLE_PCLMUL_SUPPORT) && defined(ENABLE_SSE41_SUPPORT)
+# if ((defined(__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__))
+# if __GNUC__ >= 4
+# define USE_INTEL_PCLMUL 1
+# endif
+# endif
+#endif /* USE_INTEL_PCLMUL */
+
+/* USE_ARM_PMULL indicates whether to compile CRC with ARMv8 PMULL code. */
+#undef USE_ARM_PMULL
+#if defined(ENABLE_ARM_CRYPTO_SUPPORT)
+# if defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+# define USE_ARM_PMULL 1
+# endif
+#endif /* USE_ARM_PMULL */
+
+/* USE_PPC_VPMSUM indicates whether to enable PowerPC vector
+ * accelerated code. */
+#undef USE_PPC_VPMSUM
+#ifdef ENABLE_PPC_CRYPTO_SUPPORT
+# if defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+ defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC)
+# if __GNUC__ >= 4
+# define USE_PPC_VPMSUM 1
+# endif
+# endif
+#endif /* USE_PPC_VPMSUM */
+
+
+typedef struct
+{
+ u32 CRC;
+#ifdef USE_INTEL_PCLMUL
+ unsigned int use_pclmul:1; /* Intel PCLMUL shall be used. */
+#endif
+#ifdef USE_ARM_PMULL
+ unsigned int use_pmull:1; /* ARMv8 PMULL shall be used. */
+#endif
+#ifdef USE_PPC_VPMSUM
+ unsigned int use_vpmsum:1; /* POWER vpmsum shall be used. */
+#endif
+ byte buf[4];
+}
+CRC_CONTEXT;
+
+
+#ifdef USE_INTEL_PCLMUL
+/*-- crc-intel-pclmul.c --*/
+void _gcry_crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen);
+void _gcry_crc24rfc2440_intel_pclmul (u32 *pcrc, const byte *inbuf,
+ size_t inlen);
+#endif
+
+#ifdef USE_ARM_PMULL
+/*-- crc-armv8-ce.c --*/
+void _gcry_crc32_armv8_ce_pmull (u32 *pcrc, const byte *inbuf, size_t inlen);
+void _gcry_crc24rfc2440_armv8_ce_pmull (u32 *pcrc, const byte *inbuf,
+ size_t inlen);
+#endif
+
+#ifdef USE_PPC_VPMSUM
+/*-- crc-ppc.c --*/
+void _gcry_crc32_ppc8_vpmsum (u32 *pcrc, const byte *inbuf, size_t inlen);
+void _gcry_crc24rfc2440_ppc8_vpmsum (u32 *pcrc, const byte *inbuf,
+ size_t inlen);
+#endif
+
+
+/*
+ * Code generated by universal_crc by Danjel McGougan
+ *
+ * CRC parameters used:
+ * bits: 32
+ * poly: 0x04c11db7
+ * init: 0xffffffff
+ * xor: 0xffffffff
+ * reverse: true
+ * non-direct: false
+ *
+ * CRC of the string "123456789" is 0xcbf43926
+ */
+
+static const u32 crc32_table[1024] = {
+ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
+ 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
+ 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
+ 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
+ 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
+ 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
+ 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
+ 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
+ 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
+ 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
+ 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940,
+ 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
+ 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116,
+ 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
+ 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+ 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
+ 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a,
+ 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
+ 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818,
+ 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
+ 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
+ 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
+ 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c,
+ 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
+ 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
+ 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
+ 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
+ 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
+ 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086,
+ 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4,
+ 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
+ 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
+ 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
+ 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
+ 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
+ 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe,
+ 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
+ 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
+ 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
+ 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252,
+ 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
+ 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60,
+ 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
+ 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+ 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
+ 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04,
+ 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
+ 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a,
+ 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
+ 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
+ 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
+ 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e,
+ 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
+ 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
+ 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
+ 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
+ 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
+ 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0,
+ 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6,
+ 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
+ 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
+ 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d,
+ 0x00000000, 0x191b3141, 0x32366282, 0x2b2d53c3,
+ 0x646cc504, 0x7d77f445, 0x565aa786, 0x4f4196c7,
+ 0xc8d98a08, 0xd1c2bb49, 0xfaefe88a, 0xe3f4d9cb,
+ 0xacb54f0c, 0xb5ae7e4d, 0x9e832d8e, 0x87981ccf,
+ 0x4ac21251, 0x53d92310, 0x78f470d3, 0x61ef4192,
+ 0x2eaed755, 0x37b5e614, 0x1c98b5d7, 0x05838496,
+ 0x821b9859, 0x9b00a918, 0xb02dfadb, 0xa936cb9a,
+ 0xe6775d5d, 0xff6c6c1c, 0xd4413fdf, 0xcd5a0e9e,
+ 0x958424a2, 0x8c9f15e3, 0xa7b24620, 0xbea97761,
+ 0xf1e8e1a6, 0xe8f3d0e7, 0xc3de8324, 0xdac5b265,
+ 0x5d5daeaa, 0x44469feb, 0x6f6bcc28, 0x7670fd69,
+ 0x39316bae, 0x202a5aef, 0x0b07092c, 0x121c386d,
+ 0xdf4636f3, 0xc65d07b2, 0xed705471, 0xf46b6530,
+ 0xbb2af3f7, 0xa231c2b6, 0x891c9175, 0x9007a034,
+ 0x179fbcfb, 0x0e848dba, 0x25a9de79, 0x3cb2ef38,
+ 0x73f379ff, 0x6ae848be, 0x41c51b7d, 0x58de2a3c,
+ 0xf0794f05, 0xe9627e44, 0xc24f2d87, 0xdb541cc6,
+ 0x94158a01, 0x8d0ebb40, 0xa623e883, 0xbf38d9c2,
+ 0x38a0c50d, 0x21bbf44c, 0x0a96a78f, 0x138d96ce,
+ 0x5ccc0009, 0x45d73148, 0x6efa628b, 0x77e153ca,
+ 0xbabb5d54, 0xa3a06c15, 0x888d3fd6, 0x91960e97,
+ 0xded79850, 0xc7cca911, 0xece1fad2, 0xf5facb93,
+ 0x7262d75c, 0x6b79e61d, 0x4054b5de, 0x594f849f,
+ 0x160e1258, 0x0f152319, 0x243870da, 0x3d23419b,
+ 0x65fd6ba7, 0x7ce65ae6, 0x57cb0925, 0x4ed03864,
+ 0x0191aea3, 0x188a9fe2, 0x33a7cc21, 0x2abcfd60,
+ 0xad24e1af, 0xb43fd0ee, 0x9f12832d, 0x8609b26c,
+ 0xc94824ab, 0xd05315ea, 0xfb7e4629, 0xe2657768,
+ 0x2f3f79f6, 0x362448b7, 0x1d091b74, 0x04122a35,
+ 0x4b53bcf2, 0x52488db3, 0x7965de70, 0x607eef31,
+ 0xe7e6f3fe, 0xfefdc2bf, 0xd5d0917c, 0xcccba03d,
+ 0x838a36fa, 0x9a9107bb, 0xb1bc5478, 0xa8a76539,
+ 0x3b83984b, 0x2298a90a, 0x09b5fac9, 0x10aecb88,
+ 0x5fef5d4f, 0x46f46c0e, 0x6dd93fcd, 0x74c20e8c,
+ 0xf35a1243, 0xea412302, 0xc16c70c1, 0xd8774180,
+ 0x9736d747, 0x8e2de606, 0xa500b5c5, 0xbc1b8484,
+ 0x71418a1a, 0x685abb5b, 0x4377e898, 0x5a6cd9d9,
+ 0x152d4f1e, 0x0c367e5f, 0x271b2d9c, 0x3e001cdd,
+ 0xb9980012, 0xa0833153, 0x8bae6290, 0x92b553d1,
+ 0xddf4c516, 0xc4eff457, 0xefc2a794, 0xf6d996d5,
+ 0xae07bce9, 0xb71c8da8, 0x9c31de6b, 0x852aef2a,
+ 0xca6b79ed, 0xd37048ac, 0xf85d1b6f, 0xe1462a2e,
+ 0x66de36e1, 0x7fc507a0, 0x54e85463, 0x4df36522,
+ 0x02b2f3e5, 0x1ba9c2a4, 0x30849167, 0x299fa026,
+ 0xe4c5aeb8, 0xfdde9ff9, 0xd6f3cc3a, 0xcfe8fd7b,
+ 0x80a96bbc, 0x99b25afd, 0xb29f093e, 0xab84387f,
+ 0x2c1c24b0, 0x350715f1, 0x1e2a4632, 0x07317773,
+ 0x4870e1b4, 0x516bd0f5, 0x7a468336, 0x635db277,
+ 0xcbfad74e, 0xd2e1e60f, 0xf9ccb5cc, 0xe0d7848d,
+ 0xaf96124a, 0xb68d230b, 0x9da070c8, 0x84bb4189,
+ 0x03235d46, 0x1a386c07, 0x31153fc4, 0x280e0e85,
+ 0x674f9842, 0x7e54a903, 0x5579fac0, 0x4c62cb81,
+ 0x8138c51f, 0x9823f45e, 0xb30ea79d, 0xaa1596dc,
+ 0xe554001b, 0xfc4f315a, 0xd7626299, 0xce7953d8,
+ 0x49e14f17, 0x50fa7e56, 0x7bd72d95, 0x62cc1cd4,
+ 0x2d8d8a13, 0x3496bb52, 0x1fbbe891, 0x06a0d9d0,
+ 0x5e7ef3ec, 0x4765c2ad, 0x6c48916e, 0x7553a02f,
+ 0x3a1236e8, 0x230907a9, 0x0824546a, 0x113f652b,
+ 0x96a779e4, 0x8fbc48a5, 0xa4911b66, 0xbd8a2a27,
+ 0xf2cbbce0, 0xebd08da1, 0xc0fdde62, 0xd9e6ef23,
+ 0x14bce1bd, 0x0da7d0fc, 0x268a833f, 0x3f91b27e,
+ 0x70d024b9, 0x69cb15f8, 0x42e6463b, 0x5bfd777a,
+ 0xdc656bb5, 0xc57e5af4, 0xee530937, 0xf7483876,
+ 0xb809aeb1, 0xa1129ff0, 0x8a3fcc33, 0x9324fd72,
+ 0x00000000, 0x01c26a37, 0x0384d46e, 0x0246be59,
+ 0x0709a8dc, 0x06cbc2eb, 0x048d7cb2, 0x054f1685,
+ 0x0e1351b8, 0x0fd13b8f, 0x0d9785d6, 0x0c55efe1,
+ 0x091af964, 0x08d89353, 0x0a9e2d0a, 0x0b5c473d,
+ 0x1c26a370, 0x1de4c947, 0x1fa2771e, 0x1e601d29,
+ 0x1b2f0bac, 0x1aed619b, 0x18abdfc2, 0x1969b5f5,
+ 0x1235f2c8, 0x13f798ff, 0x11b126a6, 0x10734c91,
+ 0x153c5a14, 0x14fe3023, 0x16b88e7a, 0x177ae44d,
+ 0x384d46e0, 0x398f2cd7, 0x3bc9928e, 0x3a0bf8b9,
+ 0x3f44ee3c, 0x3e86840b, 0x3cc03a52, 0x3d025065,
+ 0x365e1758, 0x379c7d6f, 0x35dac336, 0x3418a901,
+ 0x3157bf84, 0x3095d5b3, 0x32d36bea, 0x331101dd,
+ 0x246be590, 0x25a98fa7, 0x27ef31fe, 0x262d5bc9,
+ 0x23624d4c, 0x22a0277b, 0x20e69922, 0x2124f315,
+ 0x2a78b428, 0x2bbade1f, 0x29fc6046, 0x283e0a71,
+ 0x2d711cf4, 0x2cb376c3, 0x2ef5c89a, 0x2f37a2ad,
+ 0x709a8dc0, 0x7158e7f7, 0x731e59ae, 0x72dc3399,
+ 0x7793251c, 0x76514f2b, 0x7417f172, 0x75d59b45,
+ 0x7e89dc78, 0x7f4bb64f, 0x7d0d0816, 0x7ccf6221,
+ 0x798074a4, 0x78421e93, 0x7a04a0ca, 0x7bc6cafd,
+ 0x6cbc2eb0, 0x6d7e4487, 0x6f38fade, 0x6efa90e9,
+ 0x6bb5866c, 0x6a77ec5b, 0x68315202, 0x69f33835,
+ 0x62af7f08, 0x636d153f, 0x612bab66, 0x60e9c151,
+ 0x65a6d7d4, 0x6464bde3, 0x662203ba, 0x67e0698d,
+ 0x48d7cb20, 0x4915a117, 0x4b531f4e, 0x4a917579,
+ 0x4fde63fc, 0x4e1c09cb, 0x4c5ab792, 0x4d98dda5,
+ 0x46c49a98, 0x4706f0af, 0x45404ef6, 0x448224c1,
+ 0x41cd3244, 0x400f5873, 0x4249e62a, 0x438b8c1d,
+ 0x54f16850, 0x55330267, 0x5775bc3e, 0x56b7d609,
+ 0x53f8c08c, 0x523aaabb, 0x507c14e2, 0x51be7ed5,
+ 0x5ae239e8, 0x5b2053df, 0x5966ed86, 0x58a487b1,
+ 0x5deb9134, 0x5c29fb03, 0x5e6f455a, 0x5fad2f6d,
+ 0xe1351b80, 0xe0f771b7, 0xe2b1cfee, 0xe373a5d9,
+ 0xe63cb35c, 0xe7fed96b, 0xe5b86732, 0xe47a0d05,
+ 0xef264a38, 0xeee4200f, 0xeca29e56, 0xed60f461,
+ 0xe82fe2e4, 0xe9ed88d3, 0xebab368a, 0xea695cbd,
+ 0xfd13b8f0, 0xfcd1d2c7, 0xfe976c9e, 0xff5506a9,
+ 0xfa1a102c, 0xfbd87a1b, 0xf99ec442, 0xf85cae75,
+ 0xf300e948, 0xf2c2837f, 0xf0843d26, 0xf1465711,
+ 0xf4094194, 0xf5cb2ba3, 0xf78d95fa, 0xf64fffcd,
+ 0xd9785d60, 0xd8ba3757, 0xdafc890e, 0xdb3ee339,
+ 0xde71f5bc, 0xdfb39f8b, 0xddf521d2, 0xdc374be5,
+ 0xd76b0cd8, 0xd6a966ef, 0xd4efd8b6, 0xd52db281,
+ 0xd062a404, 0xd1a0ce33, 0xd3e6706a, 0xd2241a5d,
+ 0xc55efe10, 0xc49c9427, 0xc6da2a7e, 0xc7184049,
+ 0xc25756cc, 0xc3953cfb, 0xc1d382a2, 0xc011e895,
+ 0xcb4dafa8, 0xca8fc59f, 0xc8c97bc6, 0xc90b11f1,
+ 0xcc440774, 0xcd866d43, 0xcfc0d31a, 0xce02b92d,
+ 0x91af9640, 0x906dfc77, 0x922b422e, 0x93e92819,
+ 0x96a63e9c, 0x976454ab, 0x9522eaf2, 0x94e080c5,
+ 0x9fbcc7f8, 0x9e7eadcf, 0x9c381396, 0x9dfa79a1,
+ 0x98b56f24, 0x99770513, 0x9b31bb4a, 0x9af3d17d,
+ 0x8d893530, 0x8c4b5f07, 0x8e0de15e, 0x8fcf8b69,
+ 0x8a809dec, 0x8b42f7db, 0x89044982, 0x88c623b5,
+ 0x839a6488, 0x82580ebf, 0x801eb0e6, 0x81dcdad1,
+ 0x8493cc54, 0x8551a663, 0x8717183a, 0x86d5720d,
+ 0xa9e2d0a0, 0xa820ba97, 0xaa6604ce, 0xaba46ef9,
+ 0xaeeb787c, 0xaf29124b, 0xad6fac12, 0xacadc625,
+ 0xa7f18118, 0xa633eb2f, 0xa4755576, 0xa5b73f41,
+ 0xa0f829c4, 0xa13a43f3, 0xa37cfdaa, 0xa2be979d,
+ 0xb5c473d0, 0xb40619e7, 0xb640a7be, 0xb782cd89,
+ 0xb2cddb0c, 0xb30fb13b, 0xb1490f62, 0xb08b6555,
+ 0xbbd72268, 0xba15485f, 0xb853f606, 0xb9919c31,
+ 0xbcde8ab4, 0xbd1ce083, 0xbf5a5eda, 0xbe9834ed,
+ 0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee,
+ 0x8f629757, 0x37def032, 0x256b5fdc, 0x9dd738b9,
+ 0xc5b428ef, 0x7d084f8a, 0x6fbde064, 0xd7018701,
+ 0x4ad6bfb8, 0xf26ad8dd, 0xe0df7733, 0x58631056,
+ 0x5019579f, 0xe8a530fa, 0xfa109f14, 0x42acf871,
+ 0xdf7bc0c8, 0x67c7a7ad, 0x75720843, 0xcdce6f26,
+ 0x95ad7f70, 0x2d111815, 0x3fa4b7fb, 0x8718d09e,
+ 0x1acfe827, 0xa2738f42, 0xb0c620ac, 0x087a47c9,
+ 0xa032af3e, 0x188ec85b, 0x0a3b67b5, 0xb28700d0,
+ 0x2f503869, 0x97ec5f0c, 0x8559f0e2, 0x3de59787,
+ 0x658687d1, 0xdd3ae0b4, 0xcf8f4f5a, 0x7733283f,
+ 0xeae41086, 0x525877e3, 0x40edd80d, 0xf851bf68,
+ 0xf02bf8a1, 0x48979fc4, 0x5a22302a, 0xe29e574f,
+ 0x7f496ff6, 0xc7f50893, 0xd540a77d, 0x6dfcc018,
+ 0x359fd04e, 0x8d23b72b, 0x9f9618c5, 0x272a7fa0,
+ 0xbafd4719, 0x0241207c, 0x10f48f92, 0xa848e8f7,
+ 0x9b14583d, 0x23a83f58, 0x311d90b6, 0x89a1f7d3,
+ 0x1476cf6a, 0xaccaa80f, 0xbe7f07e1, 0x06c36084,
+ 0x5ea070d2, 0xe61c17b7, 0xf4a9b859, 0x4c15df3c,
+ 0xd1c2e785, 0x697e80e0, 0x7bcb2f0e, 0xc377486b,
+ 0xcb0d0fa2, 0x73b168c7, 0x6104c729, 0xd9b8a04c,
+ 0x446f98f5, 0xfcd3ff90, 0xee66507e, 0x56da371b,
+ 0x0eb9274d, 0xb6054028, 0xa4b0efc6, 0x1c0c88a3,
+ 0x81dbb01a, 0x3967d77f, 0x2bd27891, 0x936e1ff4,
+ 0x3b26f703, 0x839a9066, 0x912f3f88, 0x299358ed,
+ 0xb4446054, 0x0cf80731, 0x1e4da8df, 0xa6f1cfba,
+ 0xfe92dfec, 0x462eb889, 0x549b1767, 0xec277002,
+ 0x71f048bb, 0xc94c2fde, 0xdbf98030, 0x6345e755,
+ 0x6b3fa09c, 0xd383c7f9, 0xc1366817, 0x798a0f72,
+ 0xe45d37cb, 0x5ce150ae, 0x4e54ff40, 0xf6e89825,
+ 0xae8b8873, 0x1637ef16, 0x048240f8, 0xbc3e279d,
+ 0x21e91f24, 0x99557841, 0x8be0d7af, 0x335cb0ca,
+ 0xed59b63b, 0x55e5d15e, 0x47507eb0, 0xffec19d5,
+ 0x623b216c, 0xda874609, 0xc832e9e7, 0x708e8e82,
+ 0x28ed9ed4, 0x9051f9b1, 0x82e4565f, 0x3a58313a,
+ 0xa78f0983, 0x1f336ee6, 0x0d86c108, 0xb53aa66d,
+ 0xbd40e1a4, 0x05fc86c1, 0x1749292f, 0xaff54e4a,
+ 0x322276f3, 0x8a9e1196, 0x982bbe78, 0x2097d91d,
+ 0x78f4c94b, 0xc048ae2e, 0xd2fd01c0, 0x6a4166a5,
+ 0xf7965e1c, 0x4f2a3979, 0x5d9f9697, 0xe523f1f2,
+ 0x4d6b1905, 0xf5d77e60, 0xe762d18e, 0x5fdeb6eb,
+ 0xc2098e52, 0x7ab5e937, 0x680046d9, 0xd0bc21bc,
+ 0x88df31ea, 0x3063568f, 0x22d6f961, 0x9a6a9e04,
+ 0x07bda6bd, 0xbf01c1d8, 0xadb46e36, 0x15080953,
+ 0x1d724e9a, 0xa5ce29ff, 0xb77b8611, 0x0fc7e174,
+ 0x9210d9cd, 0x2aacbea8, 0x38191146, 0x80a57623,
+ 0xd8c66675, 0x607a0110, 0x72cfaefe, 0xca73c99b,
+ 0x57a4f122, 0xef189647, 0xfdad39a9, 0x45115ecc,
+ 0x764dee06, 0xcef18963, 0xdc44268d, 0x64f841e8,
+ 0xf92f7951, 0x41931e34, 0x5326b1da, 0xeb9ad6bf,
+ 0xb3f9c6e9, 0x0b45a18c, 0x19f00e62, 0xa14c6907,
+ 0x3c9b51be, 0x842736db, 0x96929935, 0x2e2efe50,
+ 0x2654b999, 0x9ee8defc, 0x8c5d7112, 0x34e11677,
+ 0xa9362ece, 0x118a49ab, 0x033fe645, 0xbb838120,
+ 0xe3e09176, 0x5b5cf613, 0x49e959fd, 0xf1553e98,
+ 0x6c820621, 0xd43e6144, 0xc68bceaa, 0x7e37a9cf,
+ 0xd67f4138, 0x6ec3265d, 0x7c7689b3, 0xc4caeed6,
+ 0x591dd66f, 0xe1a1b10a, 0xf3141ee4, 0x4ba87981,
+ 0x13cb69d7, 0xab770eb2, 0xb9c2a15c, 0x017ec639,
+ 0x9ca9fe80, 0x241599e5, 0x36a0360b, 0x8e1c516e,
+ 0x866616a7, 0x3eda71c2, 0x2c6fde2c, 0x94d3b949,
+ 0x090481f0, 0xb1b8e695, 0xa30d497b, 0x1bb12e1e,
+ 0x43d23e48, 0xfb6e592d, 0xe9dbf6c3, 0x516791a6,
+ 0xccb0a91f, 0x740cce7a, 0x66b96194, 0xde0506f1
+};
+
+/* CRC32 */
+
+static inline u32
+crc32_next (u32 crc, byte data)
+{
+ return (crc >> 8) ^ crc32_table[(crc & 0xff) ^ data];
+}
+
+/*
+ * Process 4 bytes in one go
+ */
+static inline u32
+crc32_next4 (u32 crc, u32 data)
+{
+ crc ^= data;
+ crc = crc32_table[(crc & 0xff) + 0x300] ^
+ crc32_table[((crc >> 8) & 0xff) + 0x200] ^
+ crc32_table[((crc >> 16) & 0xff) + 0x100] ^
+ crc32_table[(crc >> 24) & 0xff];
+ return crc;
+}
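+
+/* Hypothetical self-check, not present in the original file: it verifies
+ * the check value quoted in the comment above ("123456789" -> 0xcbf43926)
+ * with the plain byte-wise routine, applying the documented init and xor
+ * values of 0xffffffff. */
+static int
+crc32_check_value (void)
+{
+  static const char msg[9] = { '1','2','3','4','5','6','7','8','9' };
+  u32 crc = 0xffffffffL;  /* init */
+  unsigned int i;
+
+  for (i = 0; i < sizeof (msg); i++)
+    crc = crc32_next (crc, msg[i]);
+
+  return (crc ^ 0xffffffffL) == 0xcbf43926;  /* apply xor, then compare */
+}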
+
+static void
+crc32_init (void *context, unsigned int flags)
+{
+ CRC_CONTEXT *ctx = (CRC_CONTEXT *) context;
+ u32 hwf = _gcry_get_hw_features ();
+
+#ifdef USE_INTEL_PCLMUL
+ ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL);
+#endif
+#ifdef USE_ARM_PMULL
+ ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL);
+#endif
+#ifdef USE_PPC_VPMSUM
+ ctx->use_vpmsum = !!(hwf & HWF_PPC_ARCH_2_07);
+#endif
+
+ (void)flags;
+ (void)hwf;
+
+ ctx->CRC = 0 ^ 0xffffffffL;
+}
+
+static void
+crc32_write (void *context, const void *inbuf_arg, size_t inlen)
+{
+ CRC_CONTEXT *ctx = (CRC_CONTEXT *) context;
+ const byte *inbuf = inbuf_arg;
+ u32 crc;
+
+#ifdef USE_INTEL_PCLMUL
+ if (ctx->use_pclmul)
+ {
+ _gcry_crc32_intel_pclmul(&ctx->CRC, inbuf, inlen);
+ return;
+ }
+#endif
+#ifdef USE_ARM_PMULL
+ if (ctx->use_pmull)
+ {
+ _gcry_crc32_armv8_ce_pmull(&ctx->CRC, inbuf, inlen);
+ return;
+ }
+#endif
+#ifdef USE_PPC_VPMSUM
+ if (ctx->use_vpmsum)
+ {
+ _gcry_crc32_ppc8_vpmsum(&ctx->CRC, inbuf, inlen);
+ return;
+ }
+#endif
+
+ if (!inbuf || !inlen)
+ return;
+
+ crc = ctx->CRC;
+
+ while (inlen >= 16)
+ {
+ inlen -= 16;
+ crc = crc32_next4(crc, buf_get_le32(&inbuf[0]));
+ crc = crc32_next4(crc, buf_get_le32(&inbuf[4]));
+ crc = crc32_next4(crc, buf_get_le32(&inbuf[8]));
+ crc = crc32_next4(crc, buf_get_le32(&inbuf[12]));
+ inbuf += 16;
+ }
+
+ while (inlen >= 4)
+ {
+ inlen -= 4;
+ crc = crc32_next4(crc, buf_get_le32(inbuf));
+ inbuf += 4;
+ }
+
+ while (inlen--)
+ {
+ crc = crc32_next(crc, *inbuf++);
+ }
+
+ ctx->CRC = crc;
+}
+
+static byte *
+crc32_read (void *context)
+{
+ CRC_CONTEXT *ctx = (CRC_CONTEXT *) context;
+ return ctx->buf;
+}
+
+static void
+crc32_final (void *context)
+{
+ CRC_CONTEXT *ctx = (CRC_CONTEXT *) context;
+ ctx->CRC ^= 0xffffffffL;
+ buf_put_be32 (ctx->buf, ctx->CRC);
+}
+
+/* CRC32 a'la RFC 1510 */
+/* CRC of the string "123456789" is 0x2dfd2d88 */
+
+static void
+crc32rfc1510_init (void *context, unsigned int flags)
+{
+ CRC_CONTEXT *ctx = (CRC_CONTEXT *) context;
+ u32 hwf = _gcry_get_hw_features ();
+
+#ifdef USE_INTEL_PCLMUL
+ ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL);
+#endif
+#ifdef USE_ARM_PMULL
+ ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL);
+#endif
+#ifdef USE_PPC_VPMSUM
+ ctx->use_vpmsum = !!(hwf & HWF_PPC_ARCH_2_07);
+#endif
+
+ (void)flags;
+ (void)hwf;
+
+ ctx->CRC = 0;
+}
+
+static void
+crc32rfc1510_final (void *context)
+{
+ CRC_CONTEXT *ctx = (CRC_CONTEXT *) context;
+ buf_put_be32(ctx->buf, ctx->CRC);
+}
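+
+/* Hypothetical self-check, not present in the original file: the RFC 1510
+ * variant reuses crc32_write()/crc32_next() and only differs in starting
+ * from 0 and skipping the final xor, which is why just the init and final
+ * functions above are replaced.  Per the comment above, "123456789" then
+ * yields 0x2dfd2d88. */
+static int
+crc32rfc1510_check_value (void)
+{
+  static const char msg[9] = { '1','2','3','4','5','6','7','8','9' };
+  u32 crc = 0;  /* RFC 1510 init value */
+  unsigned int i;
+
+  for (i = 0; i < sizeof (msg); i++)
+    crc = crc32_next (crc, msg[i]);
+
+  return crc == 0x2dfd2d88;  /* no final xor */
+}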
+
+/* CRC24 a'la RFC 2440 */
+/*
+ * Code generated by universal_crc by Danjel McGougan
+ *
+ * CRC parameters used:
+ * bits: 24
+ * poly: 0x864cfb
+ * init: 0xb704ce
+ * xor: 0x000000
+ * reverse: false
+ * non-direct: false
+ *
+ * CRC of the string "123456789" is 0x21cf02
+ */
+
+static const u32 crc24_table[1024] =
+{
+ 0x00000000, 0x00fb4c86, 0x000dd58a, 0x00f6990c,
+ 0x00e1e693, 0x001aaa15, 0x00ec3319, 0x00177f9f,
+ 0x003981a1, 0x00c2cd27, 0x0034542b, 0x00cf18ad,
+ 0x00d86732, 0x00232bb4, 0x00d5b2b8, 0x002efe3e,
+ 0x00894ec5, 0x00720243, 0x00849b4f, 0x007fd7c9,
+ 0x0068a856, 0x0093e4d0, 0x00657ddc, 0x009e315a,
+ 0x00b0cf64, 0x004b83e2, 0x00bd1aee, 0x00465668,
+ 0x005129f7, 0x00aa6571, 0x005cfc7d, 0x00a7b0fb,
+ 0x00e9d10c, 0x00129d8a, 0x00e40486, 0x001f4800,
+ 0x0008379f, 0x00f37b19, 0x0005e215, 0x00feae93,
+ 0x00d050ad, 0x002b1c2b, 0x00dd8527, 0x0026c9a1,
+ 0x0031b63e, 0x00cafab8, 0x003c63b4, 0x00c72f32,
+ 0x00609fc9, 0x009bd34f, 0x006d4a43, 0x009606c5,
+ 0x0081795a, 0x007a35dc, 0x008cacd0, 0x0077e056,
+ 0x00591e68, 0x00a252ee, 0x0054cbe2, 0x00af8764,
+ 0x00b8f8fb, 0x0043b47d, 0x00b52d71, 0x004e61f7,
+ 0x00d2a319, 0x0029ef9f, 0x00df7693, 0x00243a15,
+ 0x0033458a, 0x00c8090c, 0x003e9000, 0x00c5dc86,
+ 0x00eb22b8, 0x00106e3e, 0x00e6f732, 0x001dbbb4,
+ 0x000ac42b, 0x00f188ad, 0x000711a1, 0x00fc5d27,
+ 0x005beddc, 0x00a0a15a, 0x00563856, 0x00ad74d0,
+ 0x00ba0b4f, 0x004147c9, 0x00b7dec5, 0x004c9243,
+ 0x00626c7d, 0x009920fb, 0x006fb9f7, 0x0094f571,
+ 0x00838aee, 0x0078c668, 0x008e5f64, 0x007513e2,
+ 0x003b7215, 0x00c03e93, 0x0036a79f, 0x00cdeb19,
+ 0x00da9486, 0x0021d800, 0x00d7410c, 0x002c0d8a,
+ 0x0002f3b4, 0x00f9bf32, 0x000f263e, 0x00f46ab8,
+ 0x00e31527, 0x001859a1, 0x00eec0ad, 0x00158c2b,
+ 0x00b23cd0, 0x00497056, 0x00bfe95a, 0x0044a5dc,
+ 0x0053da43, 0x00a896c5, 0x005e0fc9, 0x00a5434f,
+ 0x008bbd71, 0x0070f1f7, 0x008668fb, 0x007d247d,
+ 0x006a5be2, 0x00911764, 0x00678e68, 0x009cc2ee,
+ 0x00a44733, 0x005f0bb5, 0x00a992b9, 0x0052de3f,
+ 0x0045a1a0, 0x00beed26, 0x0048742a, 0x00b338ac,
+ 0x009dc692, 0x00668a14, 0x00901318, 0x006b5f9e,
+ 0x007c2001, 0x00876c87, 0x0071f58b, 0x008ab90d,
+ 0x002d09f6, 0x00d64570, 0x0020dc7c, 0x00db90fa,
+ 0x00ccef65, 0x0037a3e3, 0x00c13aef, 0x003a7669,
+ 0x00148857, 0x00efc4d1, 0x00195ddd, 0x00e2115b,
+ 0x00f56ec4, 0x000e2242, 0x00f8bb4e, 0x0003f7c8,
+ 0x004d963f, 0x00b6dab9, 0x004043b5, 0x00bb0f33,
+ 0x00ac70ac, 0x00573c2a, 0x00a1a526, 0x005ae9a0,
+ 0x0074179e, 0x008f5b18, 0x0079c214, 0x00828e92,
+ 0x0095f10d, 0x006ebd8b, 0x00982487, 0x00636801,
+ 0x00c4d8fa, 0x003f947c, 0x00c90d70, 0x003241f6,
+ 0x00253e69, 0x00de72ef, 0x0028ebe3, 0x00d3a765,
+ 0x00fd595b, 0x000615dd, 0x00f08cd1, 0x000bc057,
+ 0x001cbfc8, 0x00e7f34e, 0x00116a42, 0x00ea26c4,
+ 0x0076e42a, 0x008da8ac, 0x007b31a0, 0x00807d26,
+ 0x009702b9, 0x006c4e3f, 0x009ad733, 0x00619bb5,
+ 0x004f658b, 0x00b4290d, 0x0042b001, 0x00b9fc87,
+ 0x00ae8318, 0x0055cf9e, 0x00a35692, 0x00581a14,
+ 0x00ffaaef, 0x0004e669, 0x00f27f65, 0x000933e3,
+ 0x001e4c7c, 0x00e500fa, 0x001399f6, 0x00e8d570,
+ 0x00c62b4e, 0x003d67c8, 0x00cbfec4, 0x0030b242,
+ 0x0027cddd, 0x00dc815b, 0x002a1857, 0x00d154d1,
+ 0x009f3526, 0x006479a0, 0x0092e0ac, 0x0069ac2a,
+ 0x007ed3b5, 0x00859f33, 0x0073063f, 0x00884ab9,
+ 0x00a6b487, 0x005df801, 0x00ab610d, 0x00502d8b,
+ 0x00475214, 0x00bc1e92, 0x004a879e, 0x00b1cb18,
+ 0x00167be3, 0x00ed3765, 0x001bae69, 0x00e0e2ef,
+ 0x00f79d70, 0x000cd1f6, 0x00fa48fa, 0x0001047c,
+ 0x002ffa42, 0x00d4b6c4, 0x00222fc8, 0x00d9634e,
+ 0x00ce1cd1, 0x00355057, 0x00c3c95b, 0x003885dd,
+ 0x00000000, 0x00488f66, 0x00901ecd, 0x00d891ab,
+ 0x00db711c, 0x0093fe7a, 0x004b6fd1, 0x0003e0b7,
+ 0x00b6e338, 0x00fe6c5e, 0x0026fdf5, 0x006e7293,
+ 0x006d9224, 0x00251d42, 0x00fd8ce9, 0x00b5038f,
+ 0x006cc771, 0x00244817, 0x00fcd9bc, 0x00b456da,
+ 0x00b7b66d, 0x00ff390b, 0x0027a8a0, 0x006f27c6,
+ 0x00da2449, 0x0092ab2f, 0x004a3a84, 0x0002b5e2,
+ 0x00015555, 0x0049da33, 0x00914b98, 0x00d9c4fe,
+ 0x00d88ee3, 0x00900185, 0x0048902e, 0x00001f48,
+ 0x0003ffff, 0x004b7099, 0x0093e132, 0x00db6e54,
+ 0x006e6ddb, 0x0026e2bd, 0x00fe7316, 0x00b6fc70,
+ 0x00b51cc7, 0x00fd93a1, 0x0025020a, 0x006d8d6c,
+ 0x00b44992, 0x00fcc6f4, 0x0024575f, 0x006cd839,
+ 0x006f388e, 0x0027b7e8, 0x00ff2643, 0x00b7a925,
+ 0x0002aaaa, 0x004a25cc, 0x0092b467, 0x00da3b01,
+ 0x00d9dbb6, 0x009154d0, 0x0049c57b, 0x00014a1d,
+ 0x004b5141, 0x0003de27, 0x00db4f8c, 0x0093c0ea,
+ 0x0090205d, 0x00d8af3b, 0x00003e90, 0x0048b1f6,
+ 0x00fdb279, 0x00b53d1f, 0x006dacb4, 0x002523d2,
+ 0x0026c365, 0x006e4c03, 0x00b6dda8, 0x00fe52ce,
+ 0x00279630, 0x006f1956, 0x00b788fd, 0x00ff079b,
+ 0x00fce72c, 0x00b4684a, 0x006cf9e1, 0x00247687,
+ 0x00917508, 0x00d9fa6e, 0x00016bc5, 0x0049e4a3,
+ 0x004a0414, 0x00028b72, 0x00da1ad9, 0x009295bf,
+ 0x0093dfa2, 0x00db50c4, 0x0003c16f, 0x004b4e09,
+ 0x0048aebe, 0x000021d8, 0x00d8b073, 0x00903f15,
+ 0x00253c9a, 0x006db3fc, 0x00b52257, 0x00fdad31,
+ 0x00fe4d86, 0x00b6c2e0, 0x006e534b, 0x0026dc2d,
+ 0x00ff18d3, 0x00b797b5, 0x006f061e, 0x00278978,
+ 0x002469cf, 0x006ce6a9, 0x00b47702, 0x00fcf864,
+ 0x0049fbeb, 0x0001748d, 0x00d9e526, 0x00916a40,
+ 0x00928af7, 0x00da0591, 0x0002943a, 0x004a1b5c,
+ 0x0096a282, 0x00de2de4, 0x0006bc4f, 0x004e3329,
+ 0x004dd39e, 0x00055cf8, 0x00ddcd53, 0x00954235,
+ 0x002041ba, 0x0068cedc, 0x00b05f77, 0x00f8d011,
+ 0x00fb30a6, 0x00b3bfc0, 0x006b2e6b, 0x0023a10d,
+ 0x00fa65f3, 0x00b2ea95, 0x006a7b3e, 0x0022f458,
+ 0x002114ef, 0x00699b89, 0x00b10a22, 0x00f98544,
+ 0x004c86cb, 0x000409ad, 0x00dc9806, 0x00941760,
+ 0x0097f7d7, 0x00df78b1, 0x0007e91a, 0x004f667c,
+ 0x004e2c61, 0x0006a307, 0x00de32ac, 0x0096bdca,
+ 0x00955d7d, 0x00ddd21b, 0x000543b0, 0x004dccd6,
+ 0x00f8cf59, 0x00b0403f, 0x0068d194, 0x00205ef2,
+ 0x0023be45, 0x006b3123, 0x00b3a088, 0x00fb2fee,
+ 0x0022eb10, 0x006a6476, 0x00b2f5dd, 0x00fa7abb,
+ 0x00f99a0c, 0x00b1156a, 0x006984c1, 0x00210ba7,
+ 0x00940828, 0x00dc874e, 0x000416e5, 0x004c9983,
+ 0x004f7934, 0x0007f652, 0x00df67f9, 0x0097e89f,
+ 0x00ddf3c3, 0x00957ca5, 0x004ded0e, 0x00056268,
+ 0x000682df, 0x004e0db9, 0x00969c12, 0x00de1374,
+ 0x006b10fb, 0x00239f9d, 0x00fb0e36, 0x00b38150,
+ 0x00b061e7, 0x00f8ee81, 0x00207f2a, 0x0068f04c,
+ 0x00b134b2, 0x00f9bbd4, 0x00212a7f, 0x0069a519,
+ 0x006a45ae, 0x0022cac8, 0x00fa5b63, 0x00b2d405,
+ 0x0007d78a, 0x004f58ec, 0x0097c947, 0x00df4621,
+ 0x00dca696, 0x009429f0, 0x004cb85b, 0x0004373d,
+ 0x00057d20, 0x004df246, 0x009563ed, 0x00ddec8b,
+ 0x00de0c3c, 0x0096835a, 0x004e12f1, 0x00069d97,
+ 0x00b39e18, 0x00fb117e, 0x002380d5, 0x006b0fb3,
+ 0x0068ef04, 0x00206062, 0x00f8f1c9, 0x00b07eaf,
+ 0x0069ba51, 0x00213537, 0x00f9a49c, 0x00b12bfa,
+ 0x00b2cb4d, 0x00fa442b, 0x0022d580, 0x006a5ae6,
+ 0x00df5969, 0x0097d60f, 0x004f47a4, 0x0007c8c2,
+ 0x00042875, 0x004ca713, 0x009436b8, 0x00dcb9de,
+ 0x00000000, 0x00d70983, 0x00555f80, 0x00825603,
+ 0x0051f286, 0x0086fb05, 0x0004ad06, 0x00d3a485,
+ 0x0059a88b, 0x008ea108, 0x000cf70b, 0x00dbfe88,
+ 0x00085a0d, 0x00df538e, 0x005d058d, 0x008a0c0e,
+ 0x00491c91, 0x009e1512, 0x001c4311, 0x00cb4a92,
+ 0x0018ee17, 0x00cfe794, 0x004db197, 0x009ab814,
+ 0x0010b41a, 0x00c7bd99, 0x0045eb9a, 0x0092e219,
+ 0x0041469c, 0x00964f1f, 0x0014191c, 0x00c3109f,
+ 0x006974a4, 0x00be7d27, 0x003c2b24, 0x00eb22a7,
+ 0x00388622, 0x00ef8fa1, 0x006dd9a2, 0x00bad021,
+ 0x0030dc2f, 0x00e7d5ac, 0x006583af, 0x00b28a2c,
+ 0x00612ea9, 0x00b6272a, 0x00347129, 0x00e378aa,
+ 0x00206835, 0x00f761b6, 0x007537b5, 0x00a23e36,
+ 0x00719ab3, 0x00a69330, 0x0024c533, 0x00f3ccb0,
+ 0x0079c0be, 0x00aec93d, 0x002c9f3e, 0x00fb96bd,
+ 0x00283238, 0x00ff3bbb, 0x007d6db8, 0x00aa643b,
+ 0x0029a4ce, 0x00fead4d, 0x007cfb4e, 0x00abf2cd,
+ 0x00785648, 0x00af5fcb, 0x002d09c8, 0x00fa004b,
+ 0x00700c45, 0x00a705c6, 0x002553c5, 0x00f25a46,
+ 0x0021fec3, 0x00f6f740, 0x0074a143, 0x00a3a8c0,
+ 0x0060b85f, 0x00b7b1dc, 0x0035e7df, 0x00e2ee5c,
+ 0x00314ad9, 0x00e6435a, 0x00641559, 0x00b31cda,
+ 0x003910d4, 0x00ee1957, 0x006c4f54, 0x00bb46d7,
+ 0x0068e252, 0x00bfebd1, 0x003dbdd2, 0x00eab451,
+ 0x0040d06a, 0x0097d9e9, 0x00158fea, 0x00c28669,
+ 0x001122ec, 0x00c62b6f, 0x00447d6c, 0x009374ef,
+ 0x001978e1, 0x00ce7162, 0x004c2761, 0x009b2ee2,
+ 0x00488a67, 0x009f83e4, 0x001dd5e7, 0x00cadc64,
+ 0x0009ccfb, 0x00dec578, 0x005c937b, 0x008b9af8,
+ 0x00583e7d, 0x008f37fe, 0x000d61fd, 0x00da687e,
+ 0x00506470, 0x00876df3, 0x00053bf0, 0x00d23273,
+ 0x000196f6, 0x00d69f75, 0x0054c976, 0x0083c0f5,
+ 0x00a9041b, 0x007e0d98, 0x00fc5b9b, 0x002b5218,
+ 0x00f8f69d, 0x002fff1e, 0x00ada91d, 0x007aa09e,
+ 0x00f0ac90, 0x0027a513, 0x00a5f310, 0x0072fa93,
+ 0x00a15e16, 0x00765795, 0x00f40196, 0x00230815,
+ 0x00e0188a, 0x00371109, 0x00b5470a, 0x00624e89,
+ 0x00b1ea0c, 0x0066e38f, 0x00e4b58c, 0x0033bc0f,
+ 0x00b9b001, 0x006eb982, 0x00ecef81, 0x003be602,
+ 0x00e84287, 0x003f4b04, 0x00bd1d07, 0x006a1484,
+ 0x00c070bf, 0x0017793c, 0x00952f3f, 0x004226bc,
+ 0x00918239, 0x00468bba, 0x00c4ddb9, 0x0013d43a,
+ 0x0099d834, 0x004ed1b7, 0x00cc87b4, 0x001b8e37,
+ 0x00c82ab2, 0x001f2331, 0x009d7532, 0x004a7cb1,
+ 0x00896c2e, 0x005e65ad, 0x00dc33ae, 0x000b3a2d,
+ 0x00d89ea8, 0x000f972b, 0x008dc128, 0x005ac8ab,
+ 0x00d0c4a5, 0x0007cd26, 0x00859b25, 0x005292a6,
+ 0x00813623, 0x00563fa0, 0x00d469a3, 0x00036020,
+ 0x0080a0d5, 0x0057a956, 0x00d5ff55, 0x0002f6d6,
+ 0x00d15253, 0x00065bd0, 0x00840dd3, 0x00530450,
+ 0x00d9085e, 0x000e01dd, 0x008c57de, 0x005b5e5d,
+ 0x0088fad8, 0x005ff35b, 0x00dda558, 0x000aacdb,
+ 0x00c9bc44, 0x001eb5c7, 0x009ce3c4, 0x004bea47,
+ 0x00984ec2, 0x004f4741, 0x00cd1142, 0x001a18c1,
+ 0x009014cf, 0x00471d4c, 0x00c54b4f, 0x001242cc,
+ 0x00c1e649, 0x0016efca, 0x0094b9c9, 0x0043b04a,
+ 0x00e9d471, 0x003eddf2, 0x00bc8bf1, 0x006b8272,
+ 0x00b826f7, 0x006f2f74, 0x00ed7977, 0x003a70f4,
+ 0x00b07cfa, 0x00677579, 0x00e5237a, 0x00322af9,
+ 0x00e18e7c, 0x003687ff, 0x00b4d1fc, 0x0063d87f,
+ 0x00a0c8e0, 0x0077c163, 0x00f59760, 0x00229ee3,
+ 0x00f13a66, 0x002633e5, 0x00a465e6, 0x00736c65,
+ 0x00f9606b, 0x002e69e8, 0x00ac3feb, 0x007b3668,
+ 0x00a892ed, 0x007f9b6e, 0x00fdcd6d, 0x002ac4ee,
+ 0x00000000, 0x00520936, 0x00a4126c, 0x00f61b5a,
+ 0x004825d8, 0x001a2cee, 0x00ec37b4, 0x00be3e82,
+ 0x006b0636, 0x00390f00, 0x00cf145a, 0x009d1d6c,
+ 0x002323ee, 0x00712ad8, 0x00873182, 0x00d538b4,
+ 0x00d60c6c, 0x0084055a, 0x00721e00, 0x00201736,
+ 0x009e29b4, 0x00cc2082, 0x003a3bd8, 0x006832ee,
+ 0x00bd0a5a, 0x00ef036c, 0x00191836, 0x004b1100,
+ 0x00f52f82, 0x00a726b4, 0x00513dee, 0x000334d8,
+ 0x00ac19d8, 0x00fe10ee, 0x00080bb4, 0x005a0282,
+ 0x00e43c00, 0x00b63536, 0x00402e6c, 0x0012275a,
+ 0x00c71fee, 0x009516d8, 0x00630d82, 0x003104b4,
+ 0x008f3a36, 0x00dd3300, 0x002b285a, 0x0079216c,
+ 0x007a15b4, 0x00281c82, 0x00de07d8, 0x008c0eee,
+ 0x0032306c, 0x0060395a, 0x00962200, 0x00c42b36,
+ 0x00111382, 0x00431ab4, 0x00b501ee, 0x00e708d8,
+ 0x0059365a, 0x000b3f6c, 0x00fd2436, 0x00af2d00,
+ 0x00a37f36, 0x00f17600, 0x00076d5a, 0x0055646c,
+ 0x00eb5aee, 0x00b953d8, 0x004f4882, 0x001d41b4,
+ 0x00c87900, 0x009a7036, 0x006c6b6c, 0x003e625a,
+ 0x00805cd8, 0x00d255ee, 0x00244eb4, 0x00764782,
+ 0x0075735a, 0x00277a6c, 0x00d16136, 0x00836800,
+ 0x003d5682, 0x006f5fb4, 0x009944ee, 0x00cb4dd8,
+ 0x001e756c, 0x004c7c5a, 0x00ba6700, 0x00e86e36,
+ 0x005650b4, 0x00045982, 0x00f242d8, 0x00a04bee,
+ 0x000f66ee, 0x005d6fd8, 0x00ab7482, 0x00f97db4,
+ 0x00474336, 0x00154a00, 0x00e3515a, 0x00b1586c,
+ 0x006460d8, 0x003669ee, 0x00c072b4, 0x00927b82,
+ 0x002c4500, 0x007e4c36, 0x0088576c, 0x00da5e5a,
+ 0x00d96a82, 0x008b63b4, 0x007d78ee, 0x002f71d8,
+ 0x00914f5a, 0x00c3466c, 0x00355d36, 0x00675400,
+ 0x00b26cb4, 0x00e06582, 0x00167ed8, 0x004477ee,
+ 0x00fa496c, 0x00a8405a, 0x005e5b00, 0x000c5236,
+ 0x0046ff6c, 0x0014f65a, 0x00e2ed00, 0x00b0e436,
+ 0x000edab4, 0x005cd382, 0x00aac8d8, 0x00f8c1ee,
+ 0x002df95a, 0x007ff06c, 0x0089eb36, 0x00dbe200,
+ 0x0065dc82, 0x0037d5b4, 0x00c1ceee, 0x0093c7d8,
+ 0x0090f300, 0x00c2fa36, 0x0034e16c, 0x0066e85a,
+ 0x00d8d6d8, 0x008adfee, 0x007cc4b4, 0x002ecd82,
+ 0x00fbf536, 0x00a9fc00, 0x005fe75a, 0x000dee6c,
+ 0x00b3d0ee, 0x00e1d9d8, 0x0017c282, 0x0045cbb4,
+ 0x00eae6b4, 0x00b8ef82, 0x004ef4d8, 0x001cfdee,
+ 0x00a2c36c, 0x00f0ca5a, 0x0006d100, 0x0054d836,
+ 0x0081e082, 0x00d3e9b4, 0x0025f2ee, 0x0077fbd8,
+ 0x00c9c55a, 0x009bcc6c, 0x006dd736, 0x003fde00,
+ 0x003cead8, 0x006ee3ee, 0x0098f8b4, 0x00caf182,
+ 0x0074cf00, 0x0026c636, 0x00d0dd6c, 0x0082d45a,
+ 0x0057ecee, 0x0005e5d8, 0x00f3fe82, 0x00a1f7b4,
+ 0x001fc936, 0x004dc000, 0x00bbdb5a, 0x00e9d26c,
+ 0x00e5805a, 0x00b7896c, 0x00419236, 0x00139b00,
+ 0x00ada582, 0x00ffacb4, 0x0009b7ee, 0x005bbed8,
+ 0x008e866c, 0x00dc8f5a, 0x002a9400, 0x00789d36,
+ 0x00c6a3b4, 0x0094aa82, 0x0062b1d8, 0x0030b8ee,
+ 0x00338c36, 0x00618500, 0x00979e5a, 0x00c5976c,
+ 0x007ba9ee, 0x0029a0d8, 0x00dfbb82, 0x008db2b4,
+ 0x00588a00, 0x000a8336, 0x00fc986c, 0x00ae915a,
+ 0x0010afd8, 0x0042a6ee, 0x00b4bdb4, 0x00e6b482,
+ 0x00499982, 0x001b90b4, 0x00ed8bee, 0x00bf82d8,
+ 0x0001bc5a, 0x0053b56c, 0x00a5ae36, 0x00f7a700,
+ 0x00229fb4, 0x00709682, 0x00868dd8, 0x00d484ee,
+ 0x006aba6c, 0x0038b35a, 0x00cea800, 0x009ca136,
+ 0x009f95ee, 0x00cd9cd8, 0x003b8782, 0x00698eb4,
+ 0x00d7b036, 0x0085b900, 0x0073a25a, 0x0021ab6c,
+ 0x00f493d8, 0x00a69aee, 0x005081b4, 0x00028882,
+ 0x00bcb600, 0x00eebf36, 0x0018a46c, 0x004aad5a
+};
+
+static inline
+u32 crc24_init (void)
+{
+  /* Transformed to a 32-bit CRC by multiplying by x⁸ and then byte swapping. */
+  return 0xce04b7; /* _gcry_bswap32(0xb704ce << 8) */
+}
+
+static inline
+u32 crc24_next (u32 crc, byte data)
+{
+ return (crc >> 8) ^ crc24_table[(crc & 0xff) ^ data];
+}
+
+/*
+ * Process 4 bytes in one go
+ */
+static inline
+u32 crc24_next4 (u32 crc, u32 data)
+{
+ crc ^= data;
+ crc = crc24_table[(crc & 0xff) + 0x300] ^
+ crc24_table[((crc >> 8) & 0xff) + 0x200] ^
+ crc24_table[((crc >> 16) & 0xff) + 0x100] ^
+ crc24_table[(data >> 24) & 0xff];
+ return crc;
+}
+
+static inline
+u32 crc24_final (u32 crc)
+{
+ return crc & 0xffffff;
+}
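For reference, the crc24_* helpers above are a table-driven, byte-swapped form of the CRC-24 that RFC 2440/4880 defines for OpenPGP ASCII armor (initial value 0xb704ce, generator 0x1864cfb). A minimal bit-at-a-time sketch of that reference definition, useful only for cross-checking and not part of libgcrypt (the function name is illustrative):

    #include <stddef.h>
    #include <stdint.h>

    #define CRC24_REF_INIT 0xb704ceUL   /* initial value from RFC 2440/4880 */
    #define CRC24_REF_POLY 0x1864cfbUL  /* CRC-24 generator polynomial */

    /* Bitwise CRC-24 over a buffer; the optimized code above is designed to
       produce the same 24-bit value, modulo its internal byte-swapped
       representation. */
    static uint32_t
    crc24_reference (const unsigned char *buf, size_t len)
    {
      uint32_t crc = CRC24_REF_INIT;
      int i;

      while (len--)
        {
          crc ^= (uint32_t)(*buf++) << 16;
          for (i = 0; i < 8; i++)
            {
              crc <<= 1;
              if (crc & 0x1000000UL)
                crc ^= CRC24_REF_POLY;
            }
        }
      return crc & 0xffffffUL;
    }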
+
+static void
+crc24rfc2440_init (void *context, unsigned int flags)
+{
+ CRC_CONTEXT *ctx = (CRC_CONTEXT *) context;
+ u32 hwf = _gcry_get_hw_features ();
+
+#ifdef USE_INTEL_PCLMUL
+ ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL);
+#endif
+#ifdef USE_ARM_PMULL
+ ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL);
+#endif
+#ifdef USE_PPC_VPMSUM
+ ctx->use_vpmsum = !!(hwf & HWF_PPC_ARCH_2_07);
+#endif
+
+ (void)hwf;
+ (void)flags;
+
+ ctx->CRC = crc24_init();
+}
+
+static void
+crc24rfc2440_write (void *context, const void *inbuf_arg, size_t inlen)
+{
+ const unsigned char *inbuf = inbuf_arg;
+ CRC_CONTEXT *ctx = (CRC_CONTEXT *) context;
+ u32 crc;
+
+#ifdef USE_INTEL_PCLMUL
+ if (ctx->use_pclmul)
+ {
+ _gcry_crc24rfc2440_intel_pclmul(&ctx->CRC, inbuf, inlen);
+ return;
+ }
+#endif
+#ifdef USE_ARM_PMULL
+ if (ctx->use_pmull)
+ {
+ _gcry_crc24rfc2440_armv8_ce_pmull(&ctx->CRC, inbuf, inlen);
+ return;
+ }
+#endif
+#ifdef USE_PPC_VPMSUM
+ if (ctx->use_vpmsum)
+ {
+ _gcry_crc24rfc2440_ppc8_vpmsum(&ctx->CRC, inbuf, inlen);
+ return;
+ }
+#endif
+
+ if (!inbuf || !inlen)
+ return;
+
+ crc = ctx->CRC;
+
+ while (inlen >= 16)
+ {
+ inlen -= 16;
+ crc = crc24_next4(crc, buf_get_le32(&inbuf[0]));
+ crc = crc24_next4(crc, buf_get_le32(&inbuf[4]));
+ crc = crc24_next4(crc, buf_get_le32(&inbuf[8]));
+ crc = crc24_next4(crc, buf_get_le32(&inbuf[12]));
+ inbuf += 16;
+ }
+
+ while (inlen >= 4)
+ {
+ inlen -= 4;
+ crc = crc24_next4(crc, buf_get_le32(inbuf));
+ inbuf += 4;
+ }
+
+ while (inlen--)
+ {
+ crc = crc24_next(crc, *inbuf++);
+ }
+
+ ctx->CRC = crc;
+}
+
+static void
+crc24rfc2440_final (void *context)
+{
+ CRC_CONTEXT *ctx = (CRC_CONTEXT *) context;
+ ctx->CRC = crc24_final(ctx->CRC);
+ buf_put_le32 (ctx->buf, ctx->CRC);
+}
+
+/* We allow the CRC algorithms even in FIPS mode because they are
+ not actually cryptographic primitives. */
+
+gcry_md_spec_t _gcry_digest_spec_crc32 =
+ {
+ GCRY_MD_CRC32, {0, 1},
+ "CRC32", NULL, 0, NULL, 4,
+ crc32_init, crc32_write, crc32_final, crc32_read, NULL,
+ NULL, NULL,
+ sizeof (CRC_CONTEXT)
+ };
+
+gcry_md_spec_t _gcry_digest_spec_crc32_rfc1510 =
+ {
+ GCRY_MD_CRC32_RFC1510, {0, 1},
+ "CRC32RFC1510", NULL, 0, NULL, 4,
+ crc32rfc1510_init, crc32_write, crc32rfc1510_final, crc32_read, NULL,
+ NULL, NULL,
+ sizeof (CRC_CONTEXT)
+ };
+
+gcry_md_spec_t _gcry_digest_spec_crc24_rfc2440 =
+ {
+ GCRY_MD_CRC24_RFC2440, {0, 1},
+ "CRC24RFC2440", NULL, 0, NULL, 3,
+ crc24rfc2440_init, crc24rfc2440_write, crc24rfc2440_final, crc32_read, NULL,
+ NULL, NULL,
+ sizeof (CRC_CONTEXT)
+ };
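Applications do not use these spec structures directly; they are registered with the message-digest subsystem and reached through the generic gcry_md interface. A short usage sketch for the CRC-24 spec registered above (error handling trimmed to a single check):

    #include <stdio.h>
    #include <gcrypt.h>

    int
    main (void)
    {
      gcry_md_hd_t hd;
      unsigned char *digest;
      unsigned int dlen, i;

      if (gcry_md_open (&hd, GCRY_MD_CRC24_RFC2440, 0))
        return 1;

      gcry_md_write (hd, "hello", 5);
      digest = gcry_md_read (hd, GCRY_MD_CRC24_RFC2440);    /* 3-byte CRC */
      dlen = gcry_md_get_algo_dlen (GCRY_MD_CRC24_RFC2440);

      for (i = 0; i < dlen; i++)
        printf ("%02x", digest[i]);
      printf ("\n");

      gcry_md_close (hd);
      return 0;
    }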
diff --git a/comm/third_party/libgcrypt/cipher/des-amd64.S b/comm/third_party/libgcrypt/cipher/des-amd64.S
new file mode 100644
index 0000000000..a211dac38a
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/des-amd64.S
@@ -0,0 +1,1111 @@
+/* des-amd64.S - AMD64 assembly implementation of 3DES cipher
+ *
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(USE_DES) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+.text
+
+#define s1 0
+#define s2 ((s1) + (64*8))
+#define s3 ((s2) + (64*8))
+#define s4 ((s3) + (64*8))
+#define s5 ((s4) + (64*8))
+#define s6 ((s5) + (64*8))
+#define s7 ((s6) + (64*8))
+#define s8 ((s7) + (64*8))
+
+/* register macros */
+#define CTX %rdi
+#define SBOXES %rbp
+
+#define RL0 %r8
+#define RL1 %r9
+#define RL2 %r10
+
+#define RL0d %r8d
+#define RL1d %r9d
+#define RL2d %r10d
+
+#define RR0 %r11
+#define RR1 %r12
+#define RR2 %r13
+
+#define RR0d %r11d
+#define RR1d %r12d
+#define RR2d %r13d
+
+#define RW0 %rax
+#define RW1 %rbx
+#define RW2 %rcx
+
+#define RW0d %eax
+#define RW1d %ebx
+#define RW2d %ecx
+
+#define RW0bl %al
+#define RW1bl %bl
+#define RW2bl %cl
+
+#define RW0bh %ah
+#define RW1bh %bh
+#define RW2bh %ch
+
+#define RT0 %r15
+#define RT1 %rsi
+#define RT2 %r14
+#define RT3 %rdx
+
+#define RT0d %r15d
+#define RT1d %esi
+#define RT2d %r14d
+#define RT3d %edx
+
+/***********************************************************************
+ * 1-way 3DES
+ ***********************************************************************/
+#define do_permutation(a, b, offset, mask) \
+ movl a, RT0d; \
+ shrl $(offset), RT0d; \
+ xorl b, RT0d; \
+ andl $(mask), RT0d; \
+ xorl RT0d, b; \
+ shll $(offset), RT0d; \
+ xorl RT0d, a;
+
+#define expand_to_64bits(val, mask) \
+ movl val##d, RT0d; \
+ rorl $4, RT0d; \
+ shlq $32, RT0; \
+ orq RT0, val; \
+ andq mask, val;
+
+#define compress_to_64bits(val) \
+ movq val, RT0; \
+ shrq $32, RT0; \
+ roll $4, RT0d; \
+ orl RT0d, val##d;
+
+#define initial_permutation(left, right) \
+ do_permutation(left##d, right##d, 4, 0x0f0f0f0f); \
+ do_permutation(left##d, right##d, 16, 0x0000ffff); \
+ do_permutation(right##d, left##d, 2, 0x33333333); \
+ do_permutation(right##d, left##d, 8, 0x00ff00ff); \
+ movabs $0x3f3f3f3f3f3f3f3f, RT3; \
+ movl left##d, RW0d; \
+ roll $1, right##d; \
+ xorl right##d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, left##d; \
+ xorl RW0d, right##d; \
+ roll $1, left##d; \
+ expand_to_64bits(right, RT3); \
+ expand_to_64bits(left, RT3);
+
+#define final_permutation(left, right) \
+ compress_to_64bits(right); \
+ compress_to_64bits(left); \
+ movl right##d, RW0d; \
+ rorl $1, left##d; \
+ xorl left##d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, right##d; \
+ xorl RW0d, left##d; \
+ rorl $1, right##d; \
+ do_permutation(right##d, left##d, 8, 0x00ff00ff); \
+ do_permutation(right##d, left##d, 2, 0x33333333); \
+ do_permutation(left##d, right##d, 16, 0x0000ffff); \
+ do_permutation(left##d, right##d, 4, 0x0f0f0f0f);
+
+#define round1(n, from, to, load_next_key) \
+ xorq from, RW0; \
+ \
+ movzbl RW0bl, RT0d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ movzbl RW0bl, RT2d; \
+ movzbl RW0bh, RT3d; \
+ shrq $16, RW0; \
+ movq s8(SBOXES, RT0, 8), RT0; \
+ xorq s6(SBOXES, RT1, 8), to; \
+ movzbl RW0bl, RL1d; \
+ movzbl RW0bh, RT1d; \
+ shrl $16, RW0d; \
+ xorq s4(SBOXES, RT2, 8), RT0; \
+ xorq s2(SBOXES, RT3, 8), to; \
+ movzbl RW0bl, RT2d; \
+ movzbl RW0bh, RT3d; \
+ xorq s7(SBOXES, RL1, 8), RT0; \
+ xorq s5(SBOXES, RT1, 8), to; \
+ xorq s3(SBOXES, RT2, 8), RT0; \
+ load_next_key(n, RW0); \
+ xorq RT0, to; \
+ xorq s1(SBOXES, RT3, 8), to; \
+
+#define load_next_key(n, RWx) \
+ movq (((n) + 1) * 8)(CTX), RWx;
+
+#define dummy2(a, b) /*_*/
+
+#define read_block(io, left, right) \
+ movl (io), left##d; \
+ movl 4(io), right##d; \
+ bswapl left##d; \
+ bswapl right##d;
+
+#define write_block(io, left, right) \
+ bswapl left##d; \
+ bswapl right##d; \
+ movl left##d, (io); \
+ movl right##d, 4(io);
+
+.align 8
+.globl _gcry_3des_amd64_crypt_block
+ELF(.type _gcry_3des_amd64_crypt_block,@function;)
+
+_gcry_3des_amd64_crypt_block:
+ /* input:
+ * %rdi: round keys, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ pushq %rbx;
+ CFI_PUSH(%rbx);
+ pushq %r12;
+ CFI_PUSH(%r12);
+ pushq %r13;
+ CFI_PUSH(%r13);
+ pushq %r14;
+ CFI_PUSH(%r14);
+ pushq %r15;
+ CFI_PUSH(%r15);
+ pushq %rsi; /*dst*/
+ CFI_PUSH(%rsi);
+
+ leaq .L_s1 rRIP, SBOXES;
+
+ read_block(%rdx, RL0, RR0);
+ initial_permutation(RL0, RR0);
+
+ movq (CTX), RW0;
+
+ round1(0, RR0, RL0, load_next_key);
+ round1(1, RL0, RR0, load_next_key);
+ round1(2, RR0, RL0, load_next_key);
+ round1(3, RL0, RR0, load_next_key);
+ round1(4, RR0, RL0, load_next_key);
+ round1(5, RL0, RR0, load_next_key);
+ round1(6, RR0, RL0, load_next_key);
+ round1(7, RL0, RR0, load_next_key);
+ round1(8, RR0, RL0, load_next_key);
+ round1(9, RL0, RR0, load_next_key);
+ round1(10, RR0, RL0, load_next_key);
+ round1(11, RL0, RR0, load_next_key);
+ round1(12, RR0, RL0, load_next_key);
+ round1(13, RL0, RR0, load_next_key);
+ round1(14, RR0, RL0, load_next_key);
+ round1(15, RL0, RR0, load_next_key);
+
+ round1(16+0, RL0, RR0, load_next_key);
+ round1(16+1, RR0, RL0, load_next_key);
+ round1(16+2, RL0, RR0, load_next_key);
+ round1(16+3, RR0, RL0, load_next_key);
+ round1(16+4, RL0, RR0, load_next_key);
+ round1(16+5, RR0, RL0, load_next_key);
+ round1(16+6, RL0, RR0, load_next_key);
+ round1(16+7, RR0, RL0, load_next_key);
+ round1(16+8, RL0, RR0, load_next_key);
+ round1(16+9, RR0, RL0, load_next_key);
+ round1(16+10, RL0, RR0, load_next_key);
+ round1(16+11, RR0, RL0, load_next_key);
+ round1(16+12, RL0, RR0, load_next_key);
+ round1(16+13, RR0, RL0, load_next_key);
+ round1(16+14, RL0, RR0, load_next_key);
+ round1(16+15, RR0, RL0, load_next_key);
+
+ round1(32+0, RR0, RL0, load_next_key);
+ round1(32+1, RL0, RR0, load_next_key);
+ round1(32+2, RR0, RL0, load_next_key);
+ round1(32+3, RL0, RR0, load_next_key);
+ round1(32+4, RR0, RL0, load_next_key);
+ round1(32+5, RL0, RR0, load_next_key);
+ round1(32+6, RR0, RL0, load_next_key);
+ round1(32+7, RL0, RR0, load_next_key);
+ round1(32+8, RR0, RL0, load_next_key);
+ round1(32+9, RL0, RR0, load_next_key);
+ round1(32+10, RR0, RL0, load_next_key);
+ round1(32+11, RL0, RR0, load_next_key);
+ round1(32+12, RR0, RL0, load_next_key);
+ round1(32+13, RL0, RR0, load_next_key);
+ round1(32+14, RR0, RL0, load_next_key);
+ round1(32+15, RL0, RR0, dummy2);
+
+ popq RW2; /*dst*/
+ CFI_POP_TMP_REG();
+ final_permutation(RR0, RL0);
+ write_block(RW2, RR0, RL0);
+
+ popq %r15;
+ CFI_POP(%r15);
+ popq %r14;
+ CFI_POP(%r14);
+ popq %r13;
+ CFI_POP(%r13);
+ popq %r12;
+ CFI_POP(%r12);
+ popq %rbx;
+ CFI_POP(%rbx);
+ popq %rbp;
+ CFI_POP(%rbp);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_3des_amd64_crypt_block,.-_gcry_3des_amd64_crypt_block;)
+
+/***********************************************************************
+ * 3-way 3DES
+ ***********************************************************************/
+#define expand_to_64bits(val, mask) \
+ movl val##d, RT0d; \
+ rorl $4, RT0d; \
+ shlq $32, RT0; \
+ orq RT0, val; \
+ andq mask, val;
+
+#define compress_to_64bits(val) \
+ movq val, RT0; \
+ shrq $32, RT0; \
+ roll $4, RT0d; \
+ orl RT0d, val##d;
+
+#define initial_permutation3(left, right) \
+ do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \
+ do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
+ do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \
+ do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
+ do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f); \
+ do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
+ \
+ do_permutation(right##0d, left##0d, 2, 0x33333333); \
+ do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \
+ do_permutation(right##1d, left##1d, 2, 0x33333333); \
+ do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \
+ do_permutation(right##2d, left##2d, 2, 0x33333333); \
+ do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \
+ \
+ movabs $0x3f3f3f3f3f3f3f3f, RT3; \
+ \
+ movl left##0d, RW0d; \
+ roll $1, right##0d; \
+ xorl right##0d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, left##0d; \
+ xorl RW0d, right##0d; \
+ roll $1, left##0d; \
+ expand_to_64bits(right##0, RT3); \
+ expand_to_64bits(left##0, RT3); \
+ movl left##1d, RW1d; \
+ roll $1, right##1d; \
+ xorl right##1d, RW1d; \
+ andl $0xaaaaaaaa, RW1d; \
+ xorl RW1d, left##1d; \
+ xorl RW1d, right##1d; \
+ roll $1, left##1d; \
+ expand_to_64bits(right##1, RT3); \
+ expand_to_64bits(left##1, RT3); \
+ movl left##2d, RW2d; \
+ roll $1, right##2d; \
+ xorl right##2d, RW2d; \
+ andl $0xaaaaaaaa, RW2d; \
+ xorl RW2d, left##2d; \
+ xorl RW2d, right##2d; \
+ roll $1, left##2d; \
+ expand_to_64bits(right##2, RT3); \
+ expand_to_64bits(left##2, RT3);
+
+#define final_permutation3(left, right) \
+ compress_to_64bits(right##0); \
+ compress_to_64bits(left##0); \
+ movl right##0d, RW0d; \
+ rorl $1, left##0d; \
+ xorl left##0d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, right##0d; \
+ xorl RW0d, left##0d; \
+ rorl $1, right##0d; \
+ compress_to_64bits(right##1); \
+ compress_to_64bits(left##1); \
+ movl right##1d, RW1d; \
+ rorl $1, left##1d; \
+ xorl left##1d, RW1d; \
+ andl $0xaaaaaaaa, RW1d; \
+ xorl RW1d, right##1d; \
+ xorl RW1d, left##1d; \
+ rorl $1, right##1d; \
+ compress_to_64bits(right##2); \
+ compress_to_64bits(left##2); \
+ movl right##2d, RW2d; \
+ rorl $1, left##2d; \
+ xorl left##2d, RW2d; \
+ andl $0xaaaaaaaa, RW2d; \
+ xorl RW2d, right##2d; \
+ xorl RW2d, left##2d; \
+ rorl $1, right##2d; \
+ \
+ do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \
+ do_permutation(right##0d, left##0d, 2, 0x33333333); \
+ do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \
+ do_permutation(right##1d, left##1d, 2, 0x33333333); \
+ do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \
+ do_permutation(right##2d, left##2d, 2, 0x33333333); \
+ \
+ do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
+ do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \
+ do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
+ do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \
+ do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
+ do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f);
+
+#define round3(n, from, to, load_next_key, do_movq) \
+ xorq from##0, RW0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ xorq s8(SBOXES, RT3, 8), to##0; \
+ xorq s6(SBOXES, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ xorq s4(SBOXES, RT3, 8), to##0; \
+ xorq s2(SBOXES, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrl $16, RW0d; \
+ xorq s7(SBOXES, RT3, 8), to##0; \
+ xorq s5(SBOXES, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ load_next_key(n, RW0); \
+ xorq s3(SBOXES, RT3, 8), to##0; \
+ xorq s1(SBOXES, RT1, 8), to##0; \
+ xorq from##1, RW1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrq $16, RW1; \
+ xorq s8(SBOXES, RT3, 8), to##1; \
+ xorq s6(SBOXES, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrq $16, RW1; \
+ xorq s4(SBOXES, RT3, 8), to##1; \
+ xorq s2(SBOXES, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrl $16, RW1d; \
+ xorq s7(SBOXES, RT3, 8), to##1; \
+ xorq s5(SBOXES, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ do_movq(RW0, RW1); \
+ xorq s3(SBOXES, RT3, 8), to##1; \
+ xorq s1(SBOXES, RT1, 8), to##1; \
+ xorq from##2, RW2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrq $16, RW2; \
+ xorq s8(SBOXES, RT3, 8), to##2; \
+ xorq s6(SBOXES, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrq $16, RW2; \
+ xorq s4(SBOXES, RT3, 8), to##2; \
+ xorq s2(SBOXES, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrl $16, RW2d; \
+ xorq s7(SBOXES, RT3, 8), to##2; \
+ xorq s5(SBOXES, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ do_movq(RW0, RW2); \
+ xorq s3(SBOXES, RT3, 8), to##2; \
+ xorq s1(SBOXES, RT1, 8), to##2;
+
+#define __movq(src, dst) \
+ movq src, dst;
+
+#define read_block(io, left, right) \
+ movl (io), left##d; \
+ movl 4(io), right##d; \
+ bswapl left##d; \
+ bswapl right##d;
+
+#define write_block(io, left, right) \
+ bswapl left##d; \
+ bswapl right##d; \
+ movl left##d, (io); \
+ movl right##d, 4(io);
+
+.align 8
+ELF(.type _gcry_3des_amd64_crypt_blk3,@function;)
+_gcry_3des_amd64_crypt_blk3:
+ /* input:
+ * %rdi: round keys, CTX
+ * RL0d, RR0d, RL1d, RR1d, RL2d, RR2d: 3 input blocks
+ * RR0d, RL0d, RR1d, RL1d, RR2d, RL2d: 3 output blocks
+ */
+ CFI_STARTPROC();
+
+ leaq .L_s1 rRIP, SBOXES;
+
+ initial_permutation3(RL, RR);
+
+ movq 0(CTX), RW0;
+ movq RW0, RW1;
+ movq RW0, RW2;
+
+ round3(0, RR, RL, load_next_key, __movq);
+ round3(1, RL, RR, load_next_key, __movq);
+ round3(2, RR, RL, load_next_key, __movq);
+ round3(3, RL, RR, load_next_key, __movq);
+ round3(4, RR, RL, load_next_key, __movq);
+ round3(5, RL, RR, load_next_key, __movq);
+ round3(6, RR, RL, load_next_key, __movq);
+ round3(7, RL, RR, load_next_key, __movq);
+ round3(8, RR, RL, load_next_key, __movq);
+ round3(9, RL, RR, load_next_key, __movq);
+ round3(10, RR, RL, load_next_key, __movq);
+ round3(11, RL, RR, load_next_key, __movq);
+ round3(12, RR, RL, load_next_key, __movq);
+ round3(13, RL, RR, load_next_key, __movq);
+ round3(14, RR, RL, load_next_key, __movq);
+ round3(15, RL, RR, load_next_key, __movq);
+
+ round3(16+0, RL, RR, load_next_key, __movq);
+ round3(16+1, RR, RL, load_next_key, __movq);
+ round3(16+2, RL, RR, load_next_key, __movq);
+ round3(16+3, RR, RL, load_next_key, __movq);
+ round3(16+4, RL, RR, load_next_key, __movq);
+ round3(16+5, RR, RL, load_next_key, __movq);
+ round3(16+6, RL, RR, load_next_key, __movq);
+ round3(16+7, RR, RL, load_next_key, __movq);
+ round3(16+8, RL, RR, load_next_key, __movq);
+ round3(16+9, RR, RL, load_next_key, __movq);
+ round3(16+10, RL, RR, load_next_key, __movq);
+ round3(16+11, RR, RL, load_next_key, __movq);
+ round3(16+12, RL, RR, load_next_key, __movq);
+ round3(16+13, RR, RL, load_next_key, __movq);
+ round3(16+14, RL, RR, load_next_key, __movq);
+ round3(16+15, RR, RL, load_next_key, __movq);
+
+ round3(32+0, RR, RL, load_next_key, __movq);
+ round3(32+1, RL, RR, load_next_key, __movq);
+ round3(32+2, RR, RL, load_next_key, __movq);
+ round3(32+3, RL, RR, load_next_key, __movq);
+ round3(32+4, RR, RL, load_next_key, __movq);
+ round3(32+5, RL, RR, load_next_key, __movq);
+ round3(32+6, RR, RL, load_next_key, __movq);
+ round3(32+7, RL, RR, load_next_key, __movq);
+ round3(32+8, RR, RL, load_next_key, __movq);
+ round3(32+9, RL, RR, load_next_key, __movq);
+ round3(32+10, RR, RL, load_next_key, __movq);
+ round3(32+11, RL, RR, load_next_key, __movq);
+ round3(32+12, RR, RL, load_next_key, __movq);
+ round3(32+13, RL, RR, load_next_key, __movq);
+ round3(32+14, RR, RL, load_next_key, __movq);
+ round3(32+15, RL, RR, dummy2, dummy2);
+
+ final_permutation3(RR, RL);
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_3des_amd64_crypt_blk3,.-_gcry_3des_amd64_crypt_blk3;)
+
+.align 8
+.globl _gcry_3des_amd64_cbc_dec
+ELF(.type _gcry_3des_amd64_cbc_dec,@function;)
+_gcry_3des_amd64_cbc_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (3 blocks)
+ * %rdx: src (3 blocks)
+ * %rcx: iv (64bit)
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ pushq %rbx;
+ CFI_PUSH(%rbx);
+ pushq %r12;
+ CFI_PUSH(%r12);
+ pushq %r13;
+ CFI_PUSH(%r13);
+ pushq %r14;
+ CFI_PUSH(%r14);
+ pushq %r15;
+ CFI_PUSH(%r15);
+
+ pushq %rsi; /*dst*/
+ CFI_PUSH(%rsi);
+ pushq %rdx; /*src*/
+ CFI_PUSH(%rdx);
+ pushq %rcx; /*iv*/
+ CFI_PUSH(%rcx);
+
+ /* load input */
+ movl 0 * 4(%rdx), RL0d;
+ movl 1 * 4(%rdx), RR0d;
+ movl 2 * 4(%rdx), RL1d;
+ movl 3 * 4(%rdx), RR1d;
+ movl 4 * 4(%rdx), RL2d;
+ movl 5 * 4(%rdx), RR2d;
+
+ bswapl RL0d;
+ bswapl RR0d;
+ bswapl RL1d;
+ bswapl RR1d;
+ bswapl RL2d;
+ bswapl RR2d;
+
+ call _gcry_3des_amd64_crypt_blk3;
+
+ popq %rcx; /*iv*/
+ CFI_POP_TMP_REG();
+ popq %rdx; /*src*/
+ CFI_POP_TMP_REG();
+ popq %rsi; /*dst*/
+ CFI_POP_TMP_REG();
+
+ bswapl RR0d;
+ bswapl RL0d;
+ bswapl RR1d;
+ bswapl RL1d;
+ bswapl RR2d;
+ bswapl RL2d;
+
+ movq 2 * 8(%rdx), RT0;
+ xorl 0 * 4(%rcx), RR0d;
+ xorl 1 * 4(%rcx), RL0d;
+ xorl 0 * 4(%rdx), RR1d;
+ xorl 1 * 4(%rdx), RL1d;
+ xorl 2 * 4(%rdx), RR2d;
+ xorl 3 * 4(%rdx), RL2d;
+ movq RT0, (%rcx); /* store new IV */
+
+ movl RR0d, 0 * 4(%rsi);
+ movl RL0d, 1 * 4(%rsi);
+ movl RR1d, 2 * 4(%rsi);
+ movl RL1d, 3 * 4(%rsi);
+ movl RR2d, 4 * 4(%rsi);
+ movl RL2d, 5 * 4(%rsi);
+
+ popq %r15;
+ CFI_POP(%r15);
+ popq %r14;
+ CFI_POP(%r14);
+ popq %r13;
+ CFI_POP(%r13);
+ popq %r12;
+ CFI_POP(%r12);
+ popq %rbx;
+ CFI_POP(%rbx);
+ popq %rbp;
+ CFI_POP(%rbp);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;)
+
+.align 8
+.globl _gcry_3des_amd64_ctr_enc
+ELF(.type _gcry_3des_amd64_ctr_enc,@function;)
+_gcry_3des_amd64_ctr_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (3 blocks)
+ * %rdx: src (3 blocks)
+ * %rcx: iv (64bit)
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ pushq %rbx;
+ CFI_PUSH(%rbx);
+ pushq %r12;
+ CFI_PUSH(%r12);
+ pushq %r13;
+ CFI_PUSH(%r13);
+ pushq %r14;
+ CFI_PUSH(%r14);
+ pushq %r15;
+ CFI_PUSH(%r15);
+
+ pushq %rsi; /*dst*/
+ CFI_PUSH(%rsi);
+ pushq %rdx; /*src*/
+ CFI_PUSH(%rdx);
+ movq %rcx, RW2;
+
+ /* load IV and byteswap */
+ movq (RW2), RT0;
+ bswapq RT0;
+ movq RT0, RR0;
+
+ /* construct IVs */
+ leaq 1(RT0), RR1;
+ leaq 2(RT0), RR2;
+ leaq 3(RT0), RT0;
+ movq RR0, RL0;
+ movq RR1, RL1;
+ movq RR2, RL2;
+ bswapq RT0;
+ shrq $32, RL0;
+ shrq $32, RL1;
+ shrq $32, RL2;
+
+ /* store new IV */
+ movq RT0, (RW2);
+
+ call _gcry_3des_amd64_crypt_blk3;
+
+ popq %rdx; /*src*/
+ CFI_POP_TMP_REG();
+ popq %rsi; /*dst*/
+ CFI_POP_TMP_REG();
+
+ bswapl RR0d;
+ bswapl RL0d;
+ bswapl RR1d;
+ bswapl RL1d;
+ bswapl RR2d;
+ bswapl RL2d;
+
+ xorl 0 * 4(%rdx), RR0d;
+ xorl 1 * 4(%rdx), RL0d;
+ xorl 2 * 4(%rdx), RR1d;
+ xorl 3 * 4(%rdx), RL1d;
+ xorl 4 * 4(%rdx), RR2d;
+ xorl 5 * 4(%rdx), RL2d;
+
+ movl RR0d, 0 * 4(%rsi);
+ movl RL0d, 1 * 4(%rsi);
+ movl RR1d, 2 * 4(%rsi);
+ movl RL1d, 3 * 4(%rsi);
+ movl RR2d, 4 * 4(%rsi);
+ movl RL2d, 5 * 4(%rsi);
+
+ popq %r15;
+ CFI_POP(%r15);
+ popq %r14;
+ CFI_POP(%r14);
+ popq %r13;
+ CFI_POP(%r13);
+ popq %r12;
+ CFI_POP(%r12);
+ popq %rbx;
+ CFI_POP(%rbx);
+ popq %rbp;
+ CFI_POP(%rbp);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_3des_amd64_ctr_enc,.-_gcry_3des_amd64_ctr_enc;)
+
+.align 8
+.globl _gcry_3des_amd64_cfb_dec
+ELF(.type _gcry_3des_amd64_cfb_dec,@function;)
+_gcry_3des_amd64_cfb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (3 blocks)
+ * %rdx: src (3 blocks)
+ * %rcx: iv (64bit)
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ pushq %rbx;
+ CFI_PUSH(%rbx);
+ pushq %r12;
+ CFI_PUSH(%r12);
+ pushq %r13;
+ CFI_PUSH(%r13);
+ pushq %r14;
+ CFI_PUSH(%r14);
+ pushq %r15;
+ CFI_PUSH(%r15);
+
+ pushq %rsi; /*dst*/
+ CFI_PUSH(%rsi);
+ pushq %rdx; /*src*/
+ CFI_PUSH(%rdx);
+ movq %rcx, RW2;
+
+ /* Load input */
+ movl 0 * 4(RW2), RL0d;
+ movl 1 * 4(RW2), RR0d;
+ movl 0 * 4(%rdx), RL1d;
+ movl 1 * 4(%rdx), RR1d;
+ movl 2 * 4(%rdx), RL2d;
+ movl 3 * 4(%rdx), RR2d;
+
+ bswapl RL0d;
+ bswapl RR0d;
+ bswapl RL1d;
+ bswapl RR1d;
+ bswapl RL2d;
+ bswapl RR2d;
+
+ /* Update IV */
+ movq 4 * 4(%rdx), RW0;
+ movq RW0, (RW2);
+
+ call _gcry_3des_amd64_crypt_blk3;
+
+ popq %rdx; /*src*/
+ CFI_POP_TMP_REG();
+ popq %rsi; /*dst*/
+ CFI_POP_TMP_REG();
+
+ bswapl RR0d;
+ bswapl RL0d;
+ bswapl RR1d;
+ bswapl RL1d;
+ bswapl RR2d;
+ bswapl RL2d;
+
+ xorl 0 * 4(%rdx), RR0d;
+ xorl 1 * 4(%rdx), RL0d;
+ xorl 2 * 4(%rdx), RR1d;
+ xorl 3 * 4(%rdx), RL1d;
+ xorl 4 * 4(%rdx), RR2d;
+ xorl 5 * 4(%rdx), RL2d;
+
+ movl RR0d, 0 * 4(%rsi);
+ movl RL0d, 1 * 4(%rsi);
+ movl RR1d, 2 * 4(%rsi);
+ movl RL1d, 3 * 4(%rsi);
+ movl RR2d, 4 * 4(%rsi);
+ movl RL2d, 5 * 4(%rsi);
+
+ popq %r15;
+ CFI_POP(%r15);
+ popq %r14;
+ CFI_POP(%r14);
+ popq %r13;
+ CFI_POP(%r13);
+ popq %r12;
+ CFI_POP(%r12);
+ popq %rbx;
+ CFI_POP(%rbx);
+ popq %rbp;
+ CFI_POP(%rbp);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_3des_amd64_cfb_dec,.-_gcry_3des_amd64_cfb_dec;)
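The register comments on the four exported routines above map directly onto SysV C prototypes (arguments in %rdi, %rsi, %rdx, %rcx). A hedged sketch of the C-level view, with parameter names chosen here for illustration and possibly spelled differently in des.c:

    /* One 8-byte block. */
    extern void _gcry_3des_amd64_crypt_block (const void *ctx,
                                              unsigned char *dst,
                                              const unsigned char *src);

    /* Three 8-byte blocks per call; iv/ctr point to one 8-byte block. */
    extern void _gcry_3des_amd64_cbc_dec (const void *ctx, unsigned char *dst,
                                          const unsigned char *src,
                                          unsigned char *iv);
    extern void _gcry_3des_amd64_ctr_enc (const void *ctx, unsigned char *dst,
                                          const unsigned char *src,
                                          unsigned char *ctr);
    extern void _gcry_3des_amd64_cfb_dec (const void *ctx, unsigned char *dst,
                                          const unsigned char *src,
                                          unsigned char *iv);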
+
+.align 16
+.L_s1:
+ .quad 0x0010100001010400, 0x0000000000000000
+ .quad 0x0000100000010000, 0x0010100001010404
+ .quad 0x0010100001010004, 0x0000100000010404
+ .quad 0x0000000000000004, 0x0000100000010000
+ .quad 0x0000000000000400, 0x0010100001010400
+ .quad 0x0010100001010404, 0x0000000000000400
+ .quad 0x0010000001000404, 0x0010100001010004
+ .quad 0x0010000001000000, 0x0000000000000004
+ .quad 0x0000000000000404, 0x0010000001000400
+ .quad 0x0010000001000400, 0x0000100000010400
+ .quad 0x0000100000010400, 0x0010100001010000
+ .quad 0x0010100001010000, 0x0010000001000404
+ .quad 0x0000100000010004, 0x0010000001000004
+ .quad 0x0010000001000004, 0x0000100000010004
+ .quad 0x0000000000000000, 0x0000000000000404
+ .quad 0x0000100000010404, 0x0010000001000000
+ .quad 0x0000100000010000, 0x0010100001010404
+ .quad 0x0000000000000004, 0x0010100001010000
+ .quad 0x0010100001010400, 0x0010000001000000
+ .quad 0x0010000001000000, 0x0000000000000400
+ .quad 0x0010100001010004, 0x0000100000010000
+ .quad 0x0000100000010400, 0x0010000001000004
+ .quad 0x0000000000000400, 0x0000000000000004
+ .quad 0x0010000001000404, 0x0000100000010404
+ .quad 0x0010100001010404, 0x0000100000010004
+ .quad 0x0010100001010000, 0x0010000001000404
+ .quad 0x0010000001000004, 0x0000000000000404
+ .quad 0x0000100000010404, 0x0010100001010400
+ .quad 0x0000000000000404, 0x0010000001000400
+ .quad 0x0010000001000400, 0x0000000000000000
+ .quad 0x0000100000010004, 0x0000100000010400
+ .quad 0x0000000000000000, 0x0010100001010004
+.L_s2:
+ .quad 0x0801080200100020, 0x0800080000000000
+ .quad 0x0000080000000000, 0x0001080200100020
+ .quad 0x0001000000100000, 0x0000000200000020
+ .quad 0x0801000200100020, 0x0800080200000020
+ .quad 0x0800000200000020, 0x0801080200100020
+ .quad 0x0801080000100000, 0x0800000000000000
+ .quad 0x0800080000000000, 0x0001000000100000
+ .quad 0x0000000200000020, 0x0801000200100020
+ .quad 0x0001080000100000, 0x0001000200100020
+ .quad 0x0800080200000020, 0x0000000000000000
+ .quad 0x0800000000000000, 0x0000080000000000
+ .quad 0x0001080200100020, 0x0801000000100000
+ .quad 0x0001000200100020, 0x0800000200000020
+ .quad 0x0000000000000000, 0x0001080000100000
+ .quad 0x0000080200000020, 0x0801080000100000
+ .quad 0x0801000000100000, 0x0000080200000020
+ .quad 0x0000000000000000, 0x0001080200100020
+ .quad 0x0801000200100020, 0x0001000000100000
+ .quad 0x0800080200000020, 0x0801000000100000
+ .quad 0x0801080000100000, 0x0000080000000000
+ .quad 0x0801000000100000, 0x0800080000000000
+ .quad 0x0000000200000020, 0x0801080200100020
+ .quad 0x0001080200100020, 0x0000000200000020
+ .quad 0x0000080000000000, 0x0800000000000000
+ .quad 0x0000080200000020, 0x0801080000100000
+ .quad 0x0001000000100000, 0x0800000200000020
+ .quad 0x0001000200100020, 0x0800080200000020
+ .quad 0x0800000200000020, 0x0001000200100020
+ .quad 0x0001080000100000, 0x0000000000000000
+ .quad 0x0800080000000000, 0x0000080200000020
+ .quad 0x0800000000000000, 0x0801000200100020
+ .quad 0x0801080200100020, 0x0001080000100000
+.L_s3:
+ .quad 0x0000002000000208, 0x0000202008020200
+ .quad 0x0000000000000000, 0x0000200008020008
+ .quad 0x0000002008000200, 0x0000000000000000
+ .quad 0x0000202000020208, 0x0000002008000200
+ .quad 0x0000200000020008, 0x0000000008000008
+ .quad 0x0000000008000008, 0x0000200000020000
+ .quad 0x0000202008020208, 0x0000200000020008
+ .quad 0x0000200008020000, 0x0000002000000208
+ .quad 0x0000000008000000, 0x0000000000000008
+ .quad 0x0000202008020200, 0x0000002000000200
+ .quad 0x0000202000020200, 0x0000200008020000
+ .quad 0x0000200008020008, 0x0000202000020208
+ .quad 0x0000002008000208, 0x0000202000020200
+ .quad 0x0000200000020000, 0x0000002008000208
+ .quad 0x0000000000000008, 0x0000202008020208
+ .quad 0x0000002000000200, 0x0000000008000000
+ .quad 0x0000202008020200, 0x0000000008000000
+ .quad 0x0000200000020008, 0x0000002000000208
+ .quad 0x0000200000020000, 0x0000202008020200
+ .quad 0x0000002008000200, 0x0000000000000000
+ .quad 0x0000002000000200, 0x0000200000020008
+ .quad 0x0000202008020208, 0x0000002008000200
+ .quad 0x0000000008000008, 0x0000002000000200
+ .quad 0x0000000000000000, 0x0000200008020008
+ .quad 0x0000002008000208, 0x0000200000020000
+ .quad 0x0000000008000000, 0x0000202008020208
+ .quad 0x0000000000000008, 0x0000202000020208
+ .quad 0x0000202000020200, 0x0000000008000008
+ .quad 0x0000200008020000, 0x0000002008000208
+ .quad 0x0000002000000208, 0x0000200008020000
+ .quad 0x0000202000020208, 0x0000000000000008
+ .quad 0x0000200008020008, 0x0000202000020200
+.L_s4:
+ .quad 0x1008020000002001, 0x1000020800002001
+ .quad 0x1000020800002001, 0x0000000800000000
+ .quad 0x0008020800002000, 0x1008000800000001
+ .quad 0x1008000000000001, 0x1000020000002001
+ .quad 0x0000000000000000, 0x0008020000002000
+ .quad 0x0008020000002000, 0x1008020800002001
+ .quad 0x1000000800000001, 0x0000000000000000
+ .quad 0x0008000800000000, 0x1008000000000001
+ .quad 0x1000000000000001, 0x0000020000002000
+ .quad 0x0008000000000000, 0x1008020000002001
+ .quad 0x0000000800000000, 0x0008000000000000
+ .quad 0x1000020000002001, 0x0000020800002000
+ .quad 0x1008000800000001, 0x1000000000000001
+ .quad 0x0000020800002000, 0x0008000800000000
+ .quad 0x0000020000002000, 0x0008020800002000
+ .quad 0x1008020800002001, 0x1000000800000001
+ .quad 0x0008000800000000, 0x1008000000000001
+ .quad 0x0008020000002000, 0x1008020800002001
+ .quad 0x1000000800000001, 0x0000000000000000
+ .quad 0x0000000000000000, 0x0008020000002000
+ .quad 0x0000020800002000, 0x0008000800000000
+ .quad 0x1008000800000001, 0x1000000000000001
+ .quad 0x1008020000002001, 0x1000020800002001
+ .quad 0x1000020800002001, 0x0000000800000000
+ .quad 0x1008020800002001, 0x1000000800000001
+ .quad 0x1000000000000001, 0x0000020000002000
+ .quad 0x1008000000000001, 0x1000020000002001
+ .quad 0x0008020800002000, 0x1008000800000001
+ .quad 0x1000020000002001, 0x0000020800002000
+ .quad 0x0008000000000000, 0x1008020000002001
+ .quad 0x0000000800000000, 0x0008000000000000
+ .quad 0x0000020000002000, 0x0008020800002000
+.L_s5:
+ .quad 0x0000001000000100, 0x0020001002080100
+ .quad 0x0020000002080000, 0x0420001002000100
+ .quad 0x0000000000080000, 0x0000001000000100
+ .quad 0x0400000000000000, 0x0020000002080000
+ .quad 0x0400001000080100, 0x0000000000080000
+ .quad 0x0020001002000100, 0x0400001000080100
+ .quad 0x0420001002000100, 0x0420000002080000
+ .quad 0x0000001000080100, 0x0400000000000000
+ .quad 0x0020000002000000, 0x0400000000080000
+ .quad 0x0400000000080000, 0x0000000000000000
+ .quad 0x0400001000000100, 0x0420001002080100
+ .quad 0x0420001002080100, 0x0020001002000100
+ .quad 0x0420000002080000, 0x0400001000000100
+ .quad 0x0000000000000000, 0x0420000002000000
+ .quad 0x0020001002080100, 0x0020000002000000
+ .quad 0x0420000002000000, 0x0000001000080100
+ .quad 0x0000000000080000, 0x0420001002000100
+ .quad 0x0000001000000100, 0x0020000002000000
+ .quad 0x0400000000000000, 0x0020000002080000
+ .quad 0x0420001002000100, 0x0400001000080100
+ .quad 0x0020001002000100, 0x0400000000000000
+ .quad 0x0420000002080000, 0x0020001002080100
+ .quad 0x0400001000080100, 0x0000001000000100
+ .quad 0x0020000002000000, 0x0420000002080000
+ .quad 0x0420001002080100, 0x0000001000080100
+ .quad 0x0420000002000000, 0x0420001002080100
+ .quad 0x0020000002080000, 0x0000000000000000
+ .quad 0x0400000000080000, 0x0420000002000000
+ .quad 0x0000001000080100, 0x0020001002000100
+ .quad 0x0400001000000100, 0x0000000000080000
+ .quad 0x0000000000000000, 0x0400000000080000
+ .quad 0x0020001002080100, 0x0400001000000100
+.L_s6:
+ .quad 0x0200000120000010, 0x0204000020000000
+ .quad 0x0000040000000000, 0x0204040120000010
+ .quad 0x0204000020000000, 0x0000000100000010
+ .quad 0x0204040120000010, 0x0004000000000000
+ .quad 0x0200040020000000, 0x0004040100000010
+ .quad 0x0004000000000000, 0x0200000120000010
+ .quad 0x0004000100000010, 0x0200040020000000
+ .quad 0x0200000020000000, 0x0000040100000010
+ .quad 0x0000000000000000, 0x0004000100000010
+ .quad 0x0200040120000010, 0x0000040000000000
+ .quad 0x0004040000000000, 0x0200040120000010
+ .quad 0x0000000100000010, 0x0204000120000010
+ .quad 0x0204000120000010, 0x0000000000000000
+ .quad 0x0004040100000010, 0x0204040020000000
+ .quad 0x0000040100000010, 0x0004040000000000
+ .quad 0x0204040020000000, 0x0200000020000000
+ .quad 0x0200040020000000, 0x0000000100000010
+ .quad 0x0204000120000010, 0x0004040000000000
+ .quad 0x0204040120000010, 0x0004000000000000
+ .quad 0x0000040100000010, 0x0200000120000010
+ .quad 0x0004000000000000, 0x0200040020000000
+ .quad 0x0200000020000000, 0x0000040100000010
+ .quad 0x0200000120000010, 0x0204040120000010
+ .quad 0x0004040000000000, 0x0204000020000000
+ .quad 0x0004040100000010, 0x0204040020000000
+ .quad 0x0000000000000000, 0x0204000120000010
+ .quad 0x0000000100000010, 0x0000040000000000
+ .quad 0x0204000020000000, 0x0004040100000010
+ .quad 0x0000040000000000, 0x0004000100000010
+ .quad 0x0200040120000010, 0x0000000000000000
+ .quad 0x0204040020000000, 0x0200000020000000
+ .quad 0x0004000100000010, 0x0200040120000010
+.L_s7:
+ .quad 0x0002000000200000, 0x2002000004200002
+ .quad 0x2000000004000802, 0x0000000000000000
+ .quad 0x0000000000000800, 0x2000000004000802
+ .quad 0x2002000000200802, 0x0002000004200800
+ .quad 0x2002000004200802, 0x0002000000200000
+ .quad 0x0000000000000000, 0x2000000004000002
+ .quad 0x2000000000000002, 0x0000000004000000
+ .quad 0x2002000004200002, 0x2000000000000802
+ .quad 0x0000000004000800, 0x2002000000200802
+ .quad 0x2002000000200002, 0x0000000004000800
+ .quad 0x2000000004000002, 0x0002000004200000
+ .quad 0x0002000004200800, 0x2002000000200002
+ .quad 0x0002000004200000, 0x0000000000000800
+ .quad 0x2000000000000802, 0x2002000004200802
+ .quad 0x0002000000200800, 0x2000000000000002
+ .quad 0x0000000004000000, 0x0002000000200800
+ .quad 0x0000000004000000, 0x0002000000200800
+ .quad 0x0002000000200000, 0x2000000004000802
+ .quad 0x2000000004000802, 0x2002000004200002
+ .quad 0x2002000004200002, 0x2000000000000002
+ .quad 0x2002000000200002, 0x0000000004000000
+ .quad 0x0000000004000800, 0x0002000000200000
+ .quad 0x0002000004200800, 0x2000000000000802
+ .quad 0x2002000000200802, 0x0002000004200800
+ .quad 0x2000000000000802, 0x2000000004000002
+ .quad 0x2002000004200802, 0x0002000004200000
+ .quad 0x0002000000200800, 0x0000000000000000
+ .quad 0x2000000000000002, 0x2002000004200802
+ .quad 0x0000000000000000, 0x2002000000200802
+ .quad 0x0002000004200000, 0x0000000000000800
+ .quad 0x2000000004000002, 0x0000000004000800
+ .quad 0x0000000000000800, 0x2002000000200002
+.L_s8:
+ .quad 0x0100010410001000, 0x0000010000001000
+ .quad 0x0000000000040000, 0x0100010410041000
+ .quad 0x0100000010000000, 0x0100010410001000
+ .quad 0x0000000400000000, 0x0100000010000000
+ .quad 0x0000000400040000, 0x0100000010040000
+ .quad 0x0100010410041000, 0x0000010000041000
+ .quad 0x0100010010041000, 0x0000010400041000
+ .quad 0x0000010000001000, 0x0000000400000000
+ .quad 0x0100000010040000, 0x0100000410000000
+ .quad 0x0100010010001000, 0x0000010400001000
+ .quad 0x0000010000041000, 0x0000000400040000
+ .quad 0x0100000410040000, 0x0100010010041000
+ .quad 0x0000010400001000, 0x0000000000000000
+ .quad 0x0000000000000000, 0x0100000410040000
+ .quad 0x0100000410000000, 0x0100010010001000
+ .quad 0x0000010400041000, 0x0000000000040000
+ .quad 0x0000010400041000, 0x0000000000040000
+ .quad 0x0100010010041000, 0x0000010000001000
+ .quad 0x0000000400000000, 0x0100000410040000
+ .quad 0x0000010000001000, 0x0000010400041000
+ .quad 0x0100010010001000, 0x0000000400000000
+ .quad 0x0100000410000000, 0x0100000010040000
+ .quad 0x0100000410040000, 0x0100000010000000
+ .quad 0x0000000000040000, 0x0100010410001000
+ .quad 0x0000000000000000, 0x0100010410041000
+ .quad 0x0000000400040000, 0x0100000410000000
+ .quad 0x0100000010040000, 0x0100010010001000
+ .quad 0x0100010410001000, 0x0000000000000000
+ .quad 0x0100010410041000, 0x0000010000041000
+ .quad 0x0000010000041000, 0x0000010400001000
+ .quad 0x0000010400001000, 0x0000000400040000
+ .quad 0x0100000010000000, 0x0100010010041000
+
+#endif
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/des.c b/comm/third_party/libgcrypt/cipher/des.c
new file mode 100644
index 0000000000..1580ea4ec5
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/des.c
@@ -0,0 +1,1507 @@
+/* des.c - DES and Triple-DES encryption/decryption Algorithm
+ * Copyright (C) 1998, 1999, 2001, 2002, 2003,
+ * 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ * For a description of triple encryption, see:
+ * Bruce Schneier: Applied Cryptography. Second Edition.
+ * John Wiley & Sons, 1996. ISBN 0-471-12845-7. Pages 358 ff.
+ * This implementation is according to the definition of DES in FIPS
+ * PUB 46-2 from December 1993.
+ */
+
+
+/*
+ * Written by Michael Roth <mroth@nessie.de>, September 1998
+ */
+
+
+/*
+ * U S A G E
+ * ===========
+ *
+ * For DES or Triple-DES encryption/decryption you must initialize a proper
+ * encryption context with a key.
+ *
+ * A DES key is 64bit wide but only 56bits of the key are used. The remaining
+ * bits are parity bits; they are _not_ checked in this implementation, but
+ * simply ignored.
+ *
+ * For Triple-DES you can use either two 64bit keys or three 64bit keys.
+ * The parity bits are _not_ checked here either.
+ *
+ * After initializing a context with a key you could use this context to
+ * encrypt or decrypt data in 64bit blocks in Electronic Codebook Mode.
+ *
+ * (In the examples below the slashes at the beginning and ending of comments
+ * are omitted.)
+ *
+ * DES Example
+ * -----------
+ * unsigned char key[8];
+ * unsigned char plaintext[8];
+ * unsigned char ciphertext[8];
+ * unsigned char recoverd[8];
+ * des_ctx context;
+ *
+ * * Fill 'key' and 'plaintext' with some data *
+ * ....
+ *
+ * * Set up the DES encryption context *
+ * des_setkey(context, key);
+ *
+ * * Encrypt the plaintext *
+ * des_ecb_encrypt(context, plaintext, ciphertext);
+ *
+ * * To recover the original plaintext from ciphertext use: *
+ * des_ecb_decrypt(context, ciphertext, recoverd);
+ *
+ *
+ * Triple-DES Example
+ * ------------------
+ * unsigned char key1[8];
+ * unsigned char key2[8];
+ * unsigned char key3[8];
+ * unsigned char plaintext[8];
+ * unsigned char ciphertext[8];
+ * unsigned char recoverd[8];
+ * tripledes_ctx context;
+ *
+ * * If you would like to use two 64bit keys, fill 'key1' and 'key2'
+ * then set up the encryption context: *
+ * tripledes_set2keys(context, key1, key2);
+ *
+ * * To use three 64bit keys with Triple-DES use: *
+ * tripledes_set3keys(context, key1, key2, key3);
+ *
+ * * Encrypting plaintext with Triple-DES *
+ * tripledes_ecb_encrypt(context, plaintext, ciphertext);
+ *
+ * * Decrypting ciphertext to recover the plaintext with Triple-DES *
+ * tripledes_ecb_decrypt(context, ciphertext, recoverd);
+ *
+ *
+ * Selftest
+ * --------
+ * char *error_msg;
+ *
+ * * To perform a selftest of this DES/Triple-DES implementation use the
+ * function selftest(). It will return an error string if there is
+ * a problem with this library. *
+ *
+ * if ( (error_msg = selftest()) )
+ * {
+ * fprintf(stderr, "An error in the DES/Triple-DES implementation occurred: %s\n", error_msg);
+ * abort();
+ * }
+ */
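The block comment above documents the file-local helpers (des_setkey, des_ecb_encrypt, ...), which are static to this file. Applications normally reach this code through libgcrypt's generic cipher API instead; a minimal sketch of three-key Triple-DES in ECB mode through that interface (the helper name and its reduced error handling are illustrative):

    #include <gcrypt.h>

    /* Encrypt one 8-byte block with 3-key Triple-DES (24-byte key). */
    static int
    tdes_ecb_example (const unsigned char key[24],
                      const unsigned char in[8], unsigned char out[8])
    {
      gcry_cipher_hd_t hd;

      if (gcry_cipher_open (&hd, GCRY_CIPHER_3DES, GCRY_CIPHER_MODE_ECB, 0))
        return -1;
      if (gcry_cipher_setkey (hd, key, 24)
          || gcry_cipher_encrypt (hd, out, 8, in, 8))
        {
          gcry_cipher_close (hd);
          return -1;
        }
      gcry_cipher_close (hd);
      return 0;
    }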
+
+
+#include <config.h>
+#include <stdio.h>
+#include <string.h> /* memcpy, memcmp */
+#include "types.h" /* for byte and u32 typedefs */
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-internal.h"
+#include "cipher-selftest.h"
+
+
+#define DES_BLOCKSIZE 8
+
+
+/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
+#undef USE_AMD64_ASM
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AMD64_ASM 1
+#endif
+
+/* Helper macro to force alignment to 16 bytes. */
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define ATTR_ALIGNED_16 __attribute__ ((aligned (16)))
+#else
+# define ATTR_ALIGNED_16
+#endif
+
+#if defined(__GNUC__) && defined(__GNU_LIBRARY__)
+# define working_memcmp memcmp
+#else
+/*
+ * According to the SunOS man page, memcmp returns indeterminate sign
+ * depending on whether characters are signed or not.
+ */
+static int
+working_memcmp( const void *_a, const void *_b, size_t n )
+{
+ const char *a = _a;
+ const char *b = _b;
+ for( ; n; n--, a++, b++ )
+ if( *a != *b )
+ return (int)(*(byte*)a) - (int)(*(byte*)b);
+ return 0;
+}
+#endif
+
+/*
+ * Encryption/Decryption context of DES
+ */
+typedef struct _des_ctx
+ {
+ u32 encrypt_subkeys[32];
+ u32 decrypt_subkeys[32];
+ }
+des_ctx[1];
+
+/*
+ * Encryption/Decryption context of Triple-DES
+ */
+typedef struct _tripledes_ctx
+ {
+ u32 encrypt_subkeys[96];
+ u32 decrypt_subkeys[96];
+ struct {
+ int no_weak_key;
+ } flags;
+ }
+tripledes_ctx[1];
+
+static void des_key_schedule (const byte *, u32 *);
+static int des_setkey (struct _des_ctx *, const byte *);
+static int des_ecb_crypt (struct _des_ctx *, const byte *, byte *, int);
+static int tripledes_set2keys (struct _tripledes_ctx *,
+ const byte *, const byte *);
+static int tripledes_set3keys (struct _tripledes_ctx *,
+ const byte *, const byte *, const byte *);
+static int tripledes_ecb_crypt (struct _tripledes_ctx *,
+ const byte *, byte *, int);
+static int is_weak_key ( const byte *key );
+static const char *selftest (void);
+static unsigned int do_tripledes_encrypt(void *context, byte *outbuf,
+ const byte *inbuf );
+static unsigned int do_tripledes_decrypt(void *context, byte *outbuf,
+ const byte *inbuf );
+static gcry_err_code_t do_tripledes_setkey(void *context, const byte *key,
+ unsigned keylen,
+ cipher_bulk_ops_t *bulk_ops);
+
+static int initialized;
+
+
+
+
+/*
+ * The s-box values are permuted according to the 'primitive function P'
+ * and are rotated one bit to the left.
+ */
+static u32 sbox1[64] =
+{
+ 0x01010400, 0x00000000, 0x00010000, 0x01010404, 0x01010004, 0x00010404, 0x00000004, 0x00010000,
+ 0x00000400, 0x01010400, 0x01010404, 0x00000400, 0x01000404, 0x01010004, 0x01000000, 0x00000004,
+ 0x00000404, 0x01000400, 0x01000400, 0x00010400, 0x00010400, 0x01010000, 0x01010000, 0x01000404,
+ 0x00010004, 0x01000004, 0x01000004, 0x00010004, 0x00000000, 0x00000404, 0x00010404, 0x01000000,
+ 0x00010000, 0x01010404, 0x00000004, 0x01010000, 0x01010400, 0x01000000, 0x01000000, 0x00000400,
+ 0x01010004, 0x00010000, 0x00010400, 0x01000004, 0x00000400, 0x00000004, 0x01000404, 0x00010404,
+ 0x01010404, 0x00010004, 0x01010000, 0x01000404, 0x01000004, 0x00000404, 0x00010404, 0x01010400,
+ 0x00000404, 0x01000400, 0x01000400, 0x00000000, 0x00010004, 0x00010400, 0x00000000, 0x01010004
+};
+
+static u32 sbox2[64] =
+{
+ 0x80108020, 0x80008000, 0x00008000, 0x00108020, 0x00100000, 0x00000020, 0x80100020, 0x80008020,
+ 0x80000020, 0x80108020, 0x80108000, 0x80000000, 0x80008000, 0x00100000, 0x00000020, 0x80100020,
+ 0x00108000, 0x00100020, 0x80008020, 0x00000000, 0x80000000, 0x00008000, 0x00108020, 0x80100000,
+ 0x00100020, 0x80000020, 0x00000000, 0x00108000, 0x00008020, 0x80108000, 0x80100000, 0x00008020,
+ 0x00000000, 0x00108020, 0x80100020, 0x00100000, 0x80008020, 0x80100000, 0x80108000, 0x00008000,
+ 0x80100000, 0x80008000, 0x00000020, 0x80108020, 0x00108020, 0x00000020, 0x00008000, 0x80000000,
+ 0x00008020, 0x80108000, 0x00100000, 0x80000020, 0x00100020, 0x80008020, 0x80000020, 0x00100020,
+ 0x00108000, 0x00000000, 0x80008000, 0x00008020, 0x80000000, 0x80100020, 0x80108020, 0x00108000
+};
+
+static u32 sbox3[64] =
+{
+ 0x00000208, 0x08020200, 0x00000000, 0x08020008, 0x08000200, 0x00000000, 0x00020208, 0x08000200,
+ 0x00020008, 0x08000008, 0x08000008, 0x00020000, 0x08020208, 0x00020008, 0x08020000, 0x00000208,
+ 0x08000000, 0x00000008, 0x08020200, 0x00000200, 0x00020200, 0x08020000, 0x08020008, 0x00020208,
+ 0x08000208, 0x00020200, 0x00020000, 0x08000208, 0x00000008, 0x08020208, 0x00000200, 0x08000000,
+ 0x08020200, 0x08000000, 0x00020008, 0x00000208, 0x00020000, 0x08020200, 0x08000200, 0x00000000,
+ 0x00000200, 0x00020008, 0x08020208, 0x08000200, 0x08000008, 0x00000200, 0x00000000, 0x08020008,
+ 0x08000208, 0x00020000, 0x08000000, 0x08020208, 0x00000008, 0x00020208, 0x00020200, 0x08000008,
+ 0x08020000, 0x08000208, 0x00000208, 0x08020000, 0x00020208, 0x00000008, 0x08020008, 0x00020200
+};
+
+static u32 sbox4[64] =
+{
+ 0x00802001, 0x00002081, 0x00002081, 0x00000080, 0x00802080, 0x00800081, 0x00800001, 0x00002001,
+ 0x00000000, 0x00802000, 0x00802000, 0x00802081, 0x00000081, 0x00000000, 0x00800080, 0x00800001,
+ 0x00000001, 0x00002000, 0x00800000, 0x00802001, 0x00000080, 0x00800000, 0x00002001, 0x00002080,
+ 0x00800081, 0x00000001, 0x00002080, 0x00800080, 0x00002000, 0x00802080, 0x00802081, 0x00000081,
+ 0x00800080, 0x00800001, 0x00802000, 0x00802081, 0x00000081, 0x00000000, 0x00000000, 0x00802000,
+ 0x00002080, 0x00800080, 0x00800081, 0x00000001, 0x00802001, 0x00002081, 0x00002081, 0x00000080,
+ 0x00802081, 0x00000081, 0x00000001, 0x00002000, 0x00800001, 0x00002001, 0x00802080, 0x00800081,
+ 0x00002001, 0x00002080, 0x00800000, 0x00802001, 0x00000080, 0x00800000, 0x00002000, 0x00802080
+};
+
+static u32 sbox5[64] =
+{
+ 0x00000100, 0x02080100, 0x02080000, 0x42000100, 0x00080000, 0x00000100, 0x40000000, 0x02080000,
+ 0x40080100, 0x00080000, 0x02000100, 0x40080100, 0x42000100, 0x42080000, 0x00080100, 0x40000000,
+ 0x02000000, 0x40080000, 0x40080000, 0x00000000, 0x40000100, 0x42080100, 0x42080100, 0x02000100,
+ 0x42080000, 0x40000100, 0x00000000, 0x42000000, 0x02080100, 0x02000000, 0x42000000, 0x00080100,
+ 0x00080000, 0x42000100, 0x00000100, 0x02000000, 0x40000000, 0x02080000, 0x42000100, 0x40080100,
+ 0x02000100, 0x40000000, 0x42080000, 0x02080100, 0x40080100, 0x00000100, 0x02000000, 0x42080000,
+ 0x42080100, 0x00080100, 0x42000000, 0x42080100, 0x02080000, 0x00000000, 0x40080000, 0x42000000,
+ 0x00080100, 0x02000100, 0x40000100, 0x00080000, 0x00000000, 0x40080000, 0x02080100, 0x40000100
+};
+
+static u32 sbox6[64] =
+{
+ 0x20000010, 0x20400000, 0x00004000, 0x20404010, 0x20400000, 0x00000010, 0x20404010, 0x00400000,
+ 0x20004000, 0x00404010, 0x00400000, 0x20000010, 0x00400010, 0x20004000, 0x20000000, 0x00004010,
+ 0x00000000, 0x00400010, 0x20004010, 0x00004000, 0x00404000, 0x20004010, 0x00000010, 0x20400010,
+ 0x20400010, 0x00000000, 0x00404010, 0x20404000, 0x00004010, 0x00404000, 0x20404000, 0x20000000,
+ 0x20004000, 0x00000010, 0x20400010, 0x00404000, 0x20404010, 0x00400000, 0x00004010, 0x20000010,
+ 0x00400000, 0x20004000, 0x20000000, 0x00004010, 0x20000010, 0x20404010, 0x00404000, 0x20400000,
+ 0x00404010, 0x20404000, 0x00000000, 0x20400010, 0x00000010, 0x00004000, 0x20400000, 0x00404010,
+ 0x00004000, 0x00400010, 0x20004010, 0x00000000, 0x20404000, 0x20000000, 0x00400010, 0x20004010
+};
+
+static u32 sbox7[64] =
+{
+ 0x00200000, 0x04200002, 0x04000802, 0x00000000, 0x00000800, 0x04000802, 0x00200802, 0x04200800,
+ 0x04200802, 0x00200000, 0x00000000, 0x04000002, 0x00000002, 0x04000000, 0x04200002, 0x00000802,
+ 0x04000800, 0x00200802, 0x00200002, 0x04000800, 0x04000002, 0x04200000, 0x04200800, 0x00200002,
+ 0x04200000, 0x00000800, 0x00000802, 0x04200802, 0x00200800, 0x00000002, 0x04000000, 0x00200800,
+ 0x04000000, 0x00200800, 0x00200000, 0x04000802, 0x04000802, 0x04200002, 0x04200002, 0x00000002,
+ 0x00200002, 0x04000000, 0x04000800, 0x00200000, 0x04200800, 0x00000802, 0x00200802, 0x04200800,
+ 0x00000802, 0x04000002, 0x04200802, 0x04200000, 0x00200800, 0x00000000, 0x00000002, 0x04200802,
+ 0x00000000, 0x00200802, 0x04200000, 0x00000800, 0x04000002, 0x04000800, 0x00000800, 0x00200002
+};
+
+static u32 sbox8[64] =
+{
+ 0x10001040, 0x00001000, 0x00040000, 0x10041040, 0x10000000, 0x10001040, 0x00000040, 0x10000000,
+ 0x00040040, 0x10040000, 0x10041040, 0x00041000, 0x10041000, 0x00041040, 0x00001000, 0x00000040,
+ 0x10040000, 0x10000040, 0x10001000, 0x00001040, 0x00041000, 0x00040040, 0x10040040, 0x10041000,
+ 0x00001040, 0x00000000, 0x00000000, 0x10040040, 0x10000040, 0x10001000, 0x00041040, 0x00040000,
+ 0x00041040, 0x00040000, 0x10041000, 0x00001000, 0x00000040, 0x10040040, 0x00001000, 0x00041040,
+ 0x10001000, 0x00000040, 0x10000040, 0x10040000, 0x10040040, 0x10000000, 0x00040000, 0x10001040,
+ 0x00000000, 0x10041040, 0x00040040, 0x10000040, 0x10040000, 0x10001000, 0x10001040, 0x00000000,
+ 0x10041040, 0x00041000, 0x00041000, 0x00001040, 0x00001040, 0x00040040, 0x10000000, 0x10041000
+};
+
+
+/*
+ * These two tables are part of the 'permuted choice 1' function.
+ * In this implementation several speed improvements have been made.
+ */
+static u32 leftkey_swap[16] =
+{
+ 0x00000000, 0x00000001, 0x00000100, 0x00000101,
+ 0x00010000, 0x00010001, 0x00010100, 0x00010101,
+ 0x01000000, 0x01000001, 0x01000100, 0x01000101,
+ 0x01010000, 0x01010001, 0x01010100, 0x01010101
+};
+
+static u32 rightkey_swap[16] =
+{
+ 0x00000000, 0x01000000, 0x00010000, 0x01010000,
+ 0x00000100, 0x01000100, 0x00010100, 0x01010100,
+ 0x00000001, 0x01000001, 0x00010001, 0x01010001,
+ 0x00000101, 0x01000101, 0x00010101, 0x01010101,
+};
+
+
+
+/*
+ * Numbers of left shifts per round for encryption subkeys.
+ * To calculate the decryption subkeys we just reverse the
+ * ordering of the calculated encryption subkeys. So there
+ * is no need for a decryption rotate tab.
+ */
+static byte encrypt_rotate_tab[16] =
+{
+ 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1
+};
+
+
+
+/*
+ * Table with weak DES keys sorted in ascending order.
+ * In DES there are 64 known keys which are weak. They are weak
+ * because they produce only one, two or four different
+ * subkeys in the subkey scheduling process.
+ * The keys in this table have all their parity bits cleared.
+ */
+static byte weak_keys[64][8] =
+{
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, /*w*/
+ { 0x00, 0x00, 0x1e, 0x1e, 0x00, 0x00, 0x0e, 0x0e },
+ { 0x00, 0x00, 0xe0, 0xe0, 0x00, 0x00, 0xf0, 0xf0 },
+ { 0x00, 0x00, 0xfe, 0xfe, 0x00, 0x00, 0xfe, 0xfe },
+ { 0x00, 0x1e, 0x00, 0x1e, 0x00, 0x0e, 0x00, 0x0e }, /*sw*/
+ { 0x00, 0x1e, 0x1e, 0x00, 0x00, 0x0e, 0x0e, 0x00 },
+ { 0x00, 0x1e, 0xe0, 0xfe, 0x00, 0x0e, 0xf0, 0xfe },
+ { 0x00, 0x1e, 0xfe, 0xe0, 0x00, 0x0e, 0xfe, 0xf0 },
+ { 0x00, 0xe0, 0x00, 0xe0, 0x00, 0xf0, 0x00, 0xf0 }, /*sw*/
+ { 0x00, 0xe0, 0x1e, 0xfe, 0x00, 0xf0, 0x0e, 0xfe },
+ { 0x00, 0xe0, 0xe0, 0x00, 0x00, 0xf0, 0xf0, 0x00 },
+ { 0x00, 0xe0, 0xfe, 0x1e, 0x00, 0xf0, 0xfe, 0x0e },
+ { 0x00, 0xfe, 0x00, 0xfe, 0x00, 0xfe, 0x00, 0xfe }, /*sw*/
+ { 0x00, 0xfe, 0x1e, 0xe0, 0x00, 0xfe, 0x0e, 0xf0 },
+ { 0x00, 0xfe, 0xe0, 0x1e, 0x00, 0xfe, 0xf0, 0x0e },
+ { 0x00, 0xfe, 0xfe, 0x00, 0x00, 0xfe, 0xfe, 0x00 },
+ { 0x1e, 0x00, 0x00, 0x1e, 0x0e, 0x00, 0x00, 0x0e },
+ { 0x1e, 0x00, 0x1e, 0x00, 0x0e, 0x00, 0x0e, 0x00 }, /*sw*/
+ { 0x1e, 0x00, 0xe0, 0xfe, 0x0e, 0x00, 0xf0, 0xfe },
+ { 0x1e, 0x00, 0xfe, 0xe0, 0x0e, 0x00, 0xfe, 0xf0 },
+ { 0x1e, 0x1e, 0x00, 0x00, 0x0e, 0x0e, 0x00, 0x00 },
+ { 0x1e, 0x1e, 0x1e, 0x1e, 0x0e, 0x0e, 0x0e, 0x0e }, /*w*/
+ { 0x1e, 0x1e, 0xe0, 0xe0, 0x0e, 0x0e, 0xf0, 0xf0 },
+ { 0x1e, 0x1e, 0xfe, 0xfe, 0x0e, 0x0e, 0xfe, 0xfe },
+ { 0x1e, 0xe0, 0x00, 0xfe, 0x0e, 0xf0, 0x00, 0xfe },
+ { 0x1e, 0xe0, 0x1e, 0xe0, 0x0e, 0xf0, 0x0e, 0xf0 }, /*sw*/
+ { 0x1e, 0xe0, 0xe0, 0x1e, 0x0e, 0xf0, 0xf0, 0x0e },
+ { 0x1e, 0xe0, 0xfe, 0x00, 0x0e, 0xf0, 0xfe, 0x00 },
+ { 0x1e, 0xfe, 0x00, 0xe0, 0x0e, 0xfe, 0x00, 0xf0 },
+ { 0x1e, 0xfe, 0x1e, 0xfe, 0x0e, 0xfe, 0x0e, 0xfe }, /*sw*/
+ { 0x1e, 0xfe, 0xe0, 0x00, 0x0e, 0xfe, 0xf0, 0x00 },
+ { 0x1e, 0xfe, 0xfe, 0x1e, 0x0e, 0xfe, 0xfe, 0x0e },
+ { 0xe0, 0x00, 0x00, 0xe0, 0xf0, 0x00, 0x00, 0xf0 },
+ { 0xe0, 0x00, 0x1e, 0xfe, 0xf0, 0x00, 0x0e, 0xfe },
+ { 0xe0, 0x00, 0xe0, 0x00, 0xf0, 0x00, 0xf0, 0x00 }, /*sw*/
+ { 0xe0, 0x00, 0xfe, 0x1e, 0xf0, 0x00, 0xfe, 0x0e },
+ { 0xe0, 0x1e, 0x00, 0xfe, 0xf0, 0x0e, 0x00, 0xfe },
+ { 0xe0, 0x1e, 0x1e, 0xe0, 0xf0, 0x0e, 0x0e, 0xf0 },
+ { 0xe0, 0x1e, 0xe0, 0x1e, 0xf0, 0x0e, 0xf0, 0x0e }, /*sw*/
+ { 0xe0, 0x1e, 0xfe, 0x00, 0xf0, 0x0e, 0xfe, 0x00 },
+ { 0xe0, 0xe0, 0x00, 0x00, 0xf0, 0xf0, 0x00, 0x00 },
+ { 0xe0, 0xe0, 0x1e, 0x1e, 0xf0, 0xf0, 0x0e, 0x0e },
+ { 0xe0, 0xe0, 0xe0, 0xe0, 0xf0, 0xf0, 0xf0, 0xf0 }, /*w*/
+ { 0xe0, 0xe0, 0xfe, 0xfe, 0xf0, 0xf0, 0xfe, 0xfe },
+ { 0xe0, 0xfe, 0x00, 0x1e, 0xf0, 0xfe, 0x00, 0x0e },
+ { 0xe0, 0xfe, 0x1e, 0x00, 0xf0, 0xfe, 0x0e, 0x00 },
+ { 0xe0, 0xfe, 0xe0, 0xfe, 0xf0, 0xfe, 0xf0, 0xfe }, /*sw*/
+ { 0xe0, 0xfe, 0xfe, 0xe0, 0xf0, 0xfe, 0xfe, 0xf0 },
+ { 0xfe, 0x00, 0x00, 0xfe, 0xfe, 0x00, 0x00, 0xfe },
+ { 0xfe, 0x00, 0x1e, 0xe0, 0xfe, 0x00, 0x0e, 0xf0 },
+ { 0xfe, 0x00, 0xe0, 0x1e, 0xfe, 0x00, 0xf0, 0x0e },
+ { 0xfe, 0x00, 0xfe, 0x00, 0xfe, 0x00, 0xfe, 0x00 }, /*sw*/
+ { 0xfe, 0x1e, 0x00, 0xe0, 0xfe, 0x0e, 0x00, 0xf0 },
+ { 0xfe, 0x1e, 0x1e, 0xfe, 0xfe, 0x0e, 0x0e, 0xfe },
+ { 0xfe, 0x1e, 0xe0, 0x00, 0xfe, 0x0e, 0xf0, 0x00 },
+ { 0xfe, 0x1e, 0xfe, 0x1e, 0xfe, 0x0e, 0xfe, 0x0e }, /*sw*/
+ { 0xfe, 0xe0, 0x00, 0x1e, 0xfe, 0xf0, 0x00, 0x0e },
+ { 0xfe, 0xe0, 0x1e, 0x00, 0xfe, 0xf0, 0x0e, 0x00 },
+ { 0xfe, 0xe0, 0xe0, 0xfe, 0xfe, 0xf0, 0xf0, 0xfe },
+ { 0xfe, 0xe0, 0xfe, 0xe0, 0xfe, 0xf0, 0xfe, 0xf0 }, /*sw*/
+ { 0xfe, 0xfe, 0x00, 0x00, 0xfe, 0xfe, 0x00, 0x00 },
+ { 0xfe, 0xfe, 0x1e, 0x1e, 0xfe, 0xfe, 0x0e, 0x0e },
+ { 0xfe, 0xfe, 0xe0, 0xe0, 0xfe, 0xfe, 0xf0, 0xf0 },
+ { 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe } /*w*/
+};
+static unsigned char weak_keys_chksum[20] = {
+ 0xD0, 0xCF, 0x07, 0x38, 0x93, 0x70, 0x8A, 0x83, 0x7D, 0xD7,
+ 0x8A, 0x36, 0x65, 0x29, 0x6C, 0x1F, 0x7C, 0x3F, 0xD3, 0x41
+};
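A weak-key check against a table like this is typically a binary search after clearing the parity bit of each key byte (the table entries above already have their parity bits cleared). The following sketch shows that lookup pattern using the names defined in this file; it is illustrative, and the real is_weak_key() further below may differ in detail:

    static int
    weak_key_lookup_sketch (const byte *key)
    {
      byte work[8];
      int i, lo = 0, hi = 63;

      for (i = 0; i < 8; i++)
        work[i] = key[i] & 0xfe;        /* drop the parity bit */

      while (lo <= hi)
        {
          int mid = (lo + hi) / 2;
          int cmp = working_memcmp (work, weak_keys[mid], 8);

          if (cmp == 0)
            return 1;                   /* key is in the weak-key table */
          else if (cmp < 0)
            hi = mid - 1;
          else
            lo = mid + 1;
        }
      return 0;
    }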
+
+
+
+/*
+ * Macro to swap bits across two words.
+ */
+#define DO_PERMUTATION(a, temp, b, offset, mask) \
+ temp = ((a>>offset) ^ b) & mask; \
+ b ^= temp; \
+ a ^= temp<<offset;
+
+
+/*
+ * This performs the 'initial permutation' of the data to be encrypted
+ * or decrypted. Additionally the resulting two words are rotated one bit
+ * to the left.
+ */
+#define INITIAL_PERMUTATION(left, temp, right) \
+ DO_PERMUTATION(left, temp, right, 4, 0x0f0f0f0f) \
+ DO_PERMUTATION(left, temp, right, 16, 0x0000ffff) \
+ DO_PERMUTATION(right, temp, left, 2, 0x33333333) \
+ DO_PERMUTATION(right, temp, left, 8, 0x00ff00ff) \
+ right = (right << 1) | (right >> 31); \
+ temp = (left ^ right) & 0xaaaaaaaa; \
+ right ^= temp; \
+ left ^= temp; \
+ left = (left << 1) | (left >> 31);
+
+/*
+ * The 'inverse initial permutation'.
+ */
+#define FINAL_PERMUTATION(left, temp, right) \
+ left = (left << 31) | (left >> 1); \
+ temp = (left ^ right) & 0xaaaaaaaa; \
+ left ^= temp; \
+ right ^= temp; \
+ right = (right << 31) | (right >> 1); \
+ DO_PERMUTATION(right, temp, left, 8, 0x00ff00ff) \
+ DO_PERMUTATION(right, temp, left, 2, 0x33333333) \
+ DO_PERMUTATION(left, temp, right, 16, 0x0000ffff) \
+ DO_PERMUTATION(left, temp, right, 4, 0x0f0f0f0f)
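All of the permutation macros above are built from the same bit-exchange step: DO_PERMUTATION swaps the bits of 'a' selected by (mask << offset) with the bits of 'b' selected by mask. A standalone illustration of that single step (the function name is illustrative):

    /* With offset 4 and mask 0x0f0f0f0f this exchanges the high nibble of
       every byte of *a with the low nibble of the corresponding byte of *b,
       exactly as the first DO_PERMUTATION of INITIAL_PERMUTATION does. */
    static void
    bit_exchange_step (u32 *a, u32 *b, unsigned int offset, u32 mask)
    {
      u32 t = ((*a >> offset) ^ *b) & mask;

      *b ^= t;
      *a ^= t << offset;
    }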
+
+
+/*
+ * A full DES round including 'expansion function', 'sbox substitution'
+ * and 'primitive function P' but without swapping the left and right word.
+ * Please note: The data in 'from' and 'to' is already rotated one bit to
+ * the left, done in the initial permutation.
+ */
+#define DES_ROUND(from, to, work, subkey) \
+ work = from ^ *subkey++; \
+ to ^= sbox8[ work & 0x3f ]; \
+ to ^= sbox6[ (work>>8) & 0x3f ]; \
+ to ^= sbox4[ (work>>16) & 0x3f ]; \
+ to ^= sbox2[ (work>>24) & 0x3f ]; \
+ work = ((from << 28) | (from >> 4)) ^ *subkey++; \
+ to ^= sbox7[ work & 0x3f ]; \
+ to ^= sbox5[ (work>>8) & 0x3f ]; \
+ to ^= sbox3[ (work>>16) & 0x3f ]; \
+ to ^= sbox1[ (work>>24) & 0x3f ];
+
+/*
+ * Macros to convert 8 bytes from/to 32bit words.
+ */
+#define READ_64BIT_DATA(data, left, right) \
+ left = buf_get_be32(data + 0); \
+ right = buf_get_be32(data + 4);
+
+#define WRITE_64BIT_DATA(data, left, right) \
+ buf_put_be32(data + 0, left); \
+ buf_put_be32(data + 4, right);
+
+/*
+ * Handy macros for encryption and decryption of data
+ */
+#define des_ecb_encrypt(ctx, from, to) des_ecb_crypt(ctx, from, to, 0)
+#define des_ecb_decrypt(ctx, from, to) des_ecb_crypt(ctx, from, to, 1)
+#define tripledes_ecb_encrypt(ctx, from, to) tripledes_ecb_crypt(ctx,from,to,0)
+#define tripledes_ecb_decrypt(ctx, from, to) tripledes_ecb_crypt(ctx,from,to,1)
+
+
+
+
+
+
+/*
+ * des_key_schedule(): Calculate 16 subkey pairs (even/odd) for
+ * 16 encryption rounds.
+ * To calculate subkeys for decryption the caller
+ * has to reorder the generated subkeys.
+ *
+ * rawkey: 8 Bytes of key data
+ * subkey: Array of at least 32 u32s. Will be filled
+ * with calculated subkeys.
+ *
+ */
+static void
+des_key_schedule (const byte * rawkey, u32 * subkey)
+{
+ u32 left, right, work;
+ int round;
+
+ READ_64BIT_DATA (rawkey, left, right)
+
+ DO_PERMUTATION (right, work, left, 4, 0x0f0f0f0f)
+ DO_PERMUTATION (right, work, left, 0, 0x10101010)
+
+ left = ((leftkey_swap[(left >> 0) & 0xf] << 3)
+ | (leftkey_swap[(left >> 8) & 0xf] << 2)
+ | (leftkey_swap[(left >> 16) & 0xf] << 1)
+ | (leftkey_swap[(left >> 24) & 0xf])
+ | (leftkey_swap[(left >> 5) & 0xf] << 7)
+ | (leftkey_swap[(left >> 13) & 0xf] << 6)
+ | (leftkey_swap[(left >> 21) & 0xf] << 5)
+ | (leftkey_swap[(left >> 29) & 0xf] << 4));
+
+ left &= 0x0fffffff;
+
+ right = ((rightkey_swap[(right >> 1) & 0xf] << 3)
+ | (rightkey_swap[(right >> 9) & 0xf] << 2)
+ | (rightkey_swap[(right >> 17) & 0xf] << 1)
+ | (rightkey_swap[(right >> 25) & 0xf])
+ | (rightkey_swap[(right >> 4) & 0xf] << 7)
+ | (rightkey_swap[(right >> 12) & 0xf] << 6)
+ | (rightkey_swap[(right >> 20) & 0xf] << 5)
+ | (rightkey_swap[(right >> 28) & 0xf] << 4));
+
+ right &= 0x0fffffff;
+
+ for (round = 0; round < 16; ++round)
+ {
+ left = ((left << encrypt_rotate_tab[round])
+ | (left >> (28 - encrypt_rotate_tab[round]))) & 0x0fffffff;
+ right = ((right << encrypt_rotate_tab[round])
+ | (right >> (28 - encrypt_rotate_tab[round]))) & 0x0fffffff;
+
+ *subkey++ = (((left << 4) & 0x24000000)
+ | ((left << 28) & 0x10000000)
+ | ((left << 14) & 0x08000000)
+ | ((left << 18) & 0x02080000)
+ | ((left << 6) & 0x01000000)
+ | ((left << 9) & 0x00200000)
+ | ((left >> 1) & 0x00100000)
+ | ((left << 10) & 0x00040000)
+ | ((left << 2) & 0x00020000)
+ | ((left >> 10) & 0x00010000)
+ | ((right >> 13) & 0x00002000)
+ | ((right >> 4) & 0x00001000)
+ | ((right << 6) & 0x00000800)
+ | ((right >> 1) & 0x00000400)
+ | ((right >> 14) & 0x00000200)
+ | (right & 0x00000100)
+ | ((right >> 5) & 0x00000020)
+ | ((right >> 10) & 0x00000010)
+ | ((right >> 3) & 0x00000008)
+ | ((right >> 18) & 0x00000004)
+ | ((right >> 26) & 0x00000002)
+ | ((right >> 24) & 0x00000001));
+
+ *subkey++ = (((left << 15) & 0x20000000)
+ | ((left << 17) & 0x10000000)
+ | ((left << 10) & 0x08000000)
+ | ((left << 22) & 0x04000000)
+ | ((left >> 2) & 0x02000000)
+ | ((left << 1) & 0x01000000)
+ | ((left << 16) & 0x00200000)
+ | ((left << 11) & 0x00100000)
+ | ((left << 3) & 0x00080000)
+ | ((left >> 6) & 0x00040000)
+ | ((left << 15) & 0x00020000)
+ | ((left >> 4) & 0x00010000)
+ | ((right >> 2) & 0x00002000)
+ | ((right << 8) & 0x00001000)
+ | ((right >> 14) & 0x00000808)
+ | ((right >> 9) & 0x00000400)
+ | ((right) & 0x00000200)
+ | ((right << 7) & 0x00000100)
+ | ((right >> 7) & 0x00000020)
+ | ((right >> 3) & 0x00000011)
+ | ((right << 2) & 0x00000004)
+ | ((right >> 21) & 0x00000002));
+ }
+}
+
+
+/*
+ * Fill a DES context with subkeys calculated from a 64bit key.
+ * Does not check the parity bits, but simply ignores them.
+ * Does not check for weak keys.
+ */
+static int
+des_setkey (struct _des_ctx *ctx, const byte * key)
+{
+ static const char *selftest_failed;
+ int i;
+
+ if (!fips_mode () && !initialized)
+ {
+ initialized = 1;
+ selftest_failed = selftest ();
+
+ if (selftest_failed)
+ log_error ("%s\n", selftest_failed);
+ }
+ if (selftest_failed)
+ return GPG_ERR_SELFTEST_FAILED;
+
+ des_key_schedule (key, ctx->encrypt_subkeys);
+ _gcry_burn_stack (32);
+
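+  /* Decryption uses the same subkeys in reverse round order: copy the
+     (even,odd) pairs so that pair 15 of the encryption schedule
+     becomes pair 0 of the decryption schedule, and so on. */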
+ for(i=0; i<32; i+=2)
+ {
+ ctx->decrypt_subkeys[i] = ctx->encrypt_subkeys[30-i];
+ ctx->decrypt_subkeys[i+1] = ctx->encrypt_subkeys[31-i];
+ }
+
+ return 0;
+}
+
+
+
+/*
+ * Electronic Codebook Mode DES encryption/decryption of data according
+ * to 'mode'.
+ */
+static int
+des_ecb_crypt (struct _des_ctx *ctx, const byte * from, byte * to, int mode)
+{
+ u32 left, right, work;
+ u32 *keys;
+
+ keys = mode ? ctx->decrypt_subkeys : ctx->encrypt_subkeys;
+
+ READ_64BIT_DATA (from, left, right)
+ INITIAL_PERMUTATION (left, work, right)
+
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+
+ FINAL_PERMUTATION (right, work, left)
+ WRITE_64BIT_DATA (to, right, left)
+
+ return 0;
+}
+
+
+
+/*
+ * Fill a Triple-DES context with subkeys calculated from two 64bit keys.
+ * Does not check the parity bits of the keys, but simply ignores them.
+ * Does not check for weak keys.
+ */
+static int
+tripledes_set2keys (struct _tripledes_ctx *ctx,
+ const byte * key1,
+ const byte * key2)
+{
+ int i;
+
+ des_key_schedule (key1, ctx->encrypt_subkeys);
+ des_key_schedule (key2, &(ctx->decrypt_subkeys[32]));
+ _gcry_burn_stack (32);
+
+ for(i=0; i<32; i+=2)
+ {
+ ctx->decrypt_subkeys[i] = ctx->encrypt_subkeys[30-i];
+ ctx->decrypt_subkeys[i+1] = ctx->encrypt_subkeys[31-i];
+
+ ctx->encrypt_subkeys[i+32] = ctx->decrypt_subkeys[62-i];
+ ctx->encrypt_subkeys[i+33] = ctx->decrypt_subkeys[63-i];
+
+ ctx->encrypt_subkeys[i+64] = ctx->encrypt_subkeys[i];
+ ctx->encrypt_subkeys[i+65] = ctx->encrypt_subkeys[i+1];
+
+ ctx->decrypt_subkeys[i+64] = ctx->decrypt_subkeys[i];
+ ctx->decrypt_subkeys[i+65] = ctx->decrypt_subkeys[i+1];
+ }
+
+ return 0;
+}
+
+
+
+/*
+ * Fill a Triple-DES context with subkeys calculated from three 64bit keys.
+ * Does not check the parity bits of the keys, but simply ignores them.
+ * Does not check for weak keys.
+ */
+static int
+tripledes_set3keys (struct _tripledes_ctx *ctx,
+ const byte * key1,
+ const byte * key2,
+ const byte * key3)
+{
+ static const char *selftest_failed;
+ int i;
+
+ if (!fips_mode () && !initialized)
+ {
+ initialized = 1;
+ selftest_failed = selftest ();
+
+ if (selftest_failed)
+ log_error ("%s\n", selftest_failed);
+ }
+ if (selftest_failed)
+ return GPG_ERR_SELFTEST_FAILED;
+
+ des_key_schedule (key1, ctx->encrypt_subkeys);
+ des_key_schedule (key2, &(ctx->decrypt_subkeys[32]));
+ des_key_schedule (key3, &(ctx->encrypt_subkeys[64]));
+ _gcry_burn_stack (32);
+
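+  /* Resulting layout: encrypt_subkeys[0..31] is the key1 schedule,
+     [32..63] the reversed key2 schedule (a DES decryption pass) and
+     [64..95] the key3 schedule, i.e. the usual EDE chain
+     E_K3(D_K2(E_K1(x))); decrypt_subkeys holds the three passes in
+     the inverse order and direction. */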
+ for(i=0; i<32; i+=2)
+ {
+ ctx->decrypt_subkeys[i] = ctx->encrypt_subkeys[94-i];
+ ctx->decrypt_subkeys[i+1] = ctx->encrypt_subkeys[95-i];
+
+ ctx->encrypt_subkeys[i+32] = ctx->decrypt_subkeys[62-i];
+ ctx->encrypt_subkeys[i+33] = ctx->decrypt_subkeys[63-i];
+
+ ctx->decrypt_subkeys[i+64] = ctx->encrypt_subkeys[30-i];
+ ctx->decrypt_subkeys[i+65] = ctx->encrypt_subkeys[31-i];
+ }
+
+ return 0;
+}
+
+
+
+#ifdef USE_AMD64_ASM
+
+/* Assembly implementation of triple-DES. */
+extern void _gcry_3des_amd64_crypt_block(const void *keys, byte *out,
+ const byte *in);
+
+/* These assembly implementations process three blocks in parallel. */
+extern void _gcry_3des_amd64_ctr_enc(const void *keys, byte *out,
+ const byte *in, byte *ctr);
+
+extern void _gcry_3des_amd64_cbc_dec(const void *keys, byte *out,
+ const byte *in, byte *iv);
+
+extern void _gcry_3des_amd64_cfb_dec(const void *keys, byte *out,
+ const byte *in, byte *iv);
+
+#define TRIPLEDES_ECB_BURN_STACK (8 * sizeof(void *))
+
+
+/*
+ * Electronic Codebook Mode Triple-DES encryption/decryption of data
+ * according to 'mode'. Sometimes this mode is named 'EDE' mode
+ * (Encryption-Decryption-Encryption).
+ */
+static inline int
+tripledes_ecb_crypt (struct _tripledes_ctx *ctx, const byte * from,
+ byte * to, int mode)
+{
+ u32 *keys;
+
+ keys = mode ? ctx->decrypt_subkeys : ctx->encrypt_subkeys;
+
+ _gcry_3des_amd64_crypt_block(keys, to, from);
+
+ return 0;
+}
+
+static inline void
+tripledes_amd64_ctr_enc(const void *keys, byte *out, const byte *in, byte *ctr)
+{
+ _gcry_3des_amd64_ctr_enc(keys, out, in, ctr);
+}
+
+static inline void
+tripledes_amd64_cbc_dec(const void *keys, byte *out, const byte *in, byte *iv)
+{
+ _gcry_3des_amd64_cbc_dec(keys, out, in, iv);
+}
+
+static inline void
+tripledes_amd64_cfb_dec(const void *keys, byte *out, const byte *in, byte *iv)
+{
+ _gcry_3des_amd64_cfb_dec(keys, out, in, iv);
+}
+
+#else /*USE_AMD64_ASM*/
+
+#define TRIPLEDES_ECB_BURN_STACK 32
+
+/*
+ * Electronic Codebook Mode Triple-DES encryption/decryption of data
+ * according to 'mode'. Sometimes this mode is named 'EDE' mode
+ * (Encryption-Decryption-Encryption).
+ */
+static int
+tripledes_ecb_crypt (struct _tripledes_ctx *ctx, const byte * from,
+ byte * to, int mode)
+{
+ u32 left, right, work;
+ u32 *keys;
+
+ keys = mode ? ctx->decrypt_subkeys : ctx->encrypt_subkeys;
+
+ READ_64BIT_DATA (from, left, right)
+ INITIAL_PERMUTATION (left, work, right)
+
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+
+ DES_ROUND (left, right, work, keys) DES_ROUND (right, left, work, keys)
+ DES_ROUND (left, right, work, keys) DES_ROUND (right, left, work, keys)
+ DES_ROUND (left, right, work, keys) DES_ROUND (right, left, work, keys)
+ DES_ROUND (left, right, work, keys) DES_ROUND (right, left, work, keys)
+ DES_ROUND (left, right, work, keys) DES_ROUND (right, left, work, keys)
+ DES_ROUND (left, right, work, keys) DES_ROUND (right, left, work, keys)
+ DES_ROUND (left, right, work, keys) DES_ROUND (right, left, work, keys)
+ DES_ROUND (left, right, work, keys) DES_ROUND (right, left, work, keys)
+
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+ DES_ROUND (right, left, work, keys) DES_ROUND (left, right, work, keys)
+
+ FINAL_PERMUTATION (right, work, left)
+ WRITE_64BIT_DATA (to, right, left)
+
+ return 0;
+}
+
+#endif /*!USE_AMD64_ASM*/
+
+
+
+/* Bulk encryption of complete blocks in CTR mode. This function is only
+ intended for the bulk encryption feature of cipher.c. CTR is expected to be
+ of size DES_BLOCKSIZE. */
+static void
+_gcry_3des_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ struct _tripledes_ctx *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned char tmpbuf[DES_BLOCKSIZE];
+ int burn_stack_depth = TRIPLEDES_ECB_BURN_STACK;
+
+#ifdef USE_AMD64_ASM
+ {
+ int asm_burn_depth = 9 * sizeof(void *);
+
+ if (nblocks >= 3 && burn_stack_depth < asm_burn_depth)
+ burn_stack_depth = asm_burn_depth;
+
+ /* Process data in 3 block chunks. */
+ while (nblocks >= 3)
+ {
+ tripledes_amd64_ctr_enc(ctx->encrypt_subkeys, outbuf, inbuf, ctr);
+
+ nblocks -= 3;
+ outbuf += 3 * DES_BLOCKSIZE;
+ inbuf += 3 * DES_BLOCKSIZE;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ /* Encrypt the counter. */
+ tripledes_ecb_encrypt (ctx, ctr, tmpbuf);
+ /* XOR the input with the encrypted counter and store in output. */
+ cipher_block_xor(outbuf, tmpbuf, inbuf, DES_BLOCKSIZE);
+ outbuf += DES_BLOCKSIZE;
+ inbuf += DES_BLOCKSIZE;
+ /* Increment the counter. */
+ cipher_block_add(ctr, 1, DES_BLOCKSIZE);
+ }
+
+ wipememory(tmpbuf, sizeof(tmpbuf));
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Bulk decryption of complete blocks in CBC mode. This function is only
+ intended for the bulk encryption feature of cipher.c. */
+static void
+_gcry_3des_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ struct _tripledes_ctx *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned char savebuf[DES_BLOCKSIZE];
+ int burn_stack_depth = TRIPLEDES_ECB_BURN_STACK;
+
+#ifdef USE_AMD64_ASM
+ {
+ int asm_burn_depth = 10 * sizeof(void *);
+
+ if (nblocks >= 3 && burn_stack_depth < asm_burn_depth)
+ burn_stack_depth = asm_burn_depth;
+
+ /* Process data in 3 block chunks. */
+ while (nblocks >= 3)
+ {
+ tripledes_amd64_cbc_dec(ctx->decrypt_subkeys, outbuf, inbuf, iv);
+
+ nblocks -= 3;
+ outbuf += 3 * DES_BLOCKSIZE;
+ inbuf += 3 * DES_BLOCKSIZE;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ /* INBUF is needed later and it may be identical to OUTBUF, so store
+ the intermediate result to SAVEBUF. */
+ tripledes_ecb_decrypt (ctx, inbuf, savebuf);
+
+ cipher_block_xor_n_copy_2(outbuf, savebuf, iv, inbuf, DES_BLOCKSIZE);
+ inbuf += DES_BLOCKSIZE;
+ outbuf += DES_BLOCKSIZE;
+ }
+
+ wipememory(savebuf, sizeof(savebuf));
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Bulk decryption of complete blocks in CFB mode. This function is only
+ intended for the bulk encryption feature of cipher.c. */
+static void
+_gcry_3des_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ struct _tripledes_ctx *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = TRIPLEDES_ECB_BURN_STACK;
+
+#ifdef USE_AMD64_ASM
+ {
+ int asm_burn_depth = 9 * sizeof(void *);
+
+ if (nblocks >= 3 && burn_stack_depth < asm_burn_depth)
+ burn_stack_depth = asm_burn_depth;
+
+ /* Process data in 3 block chunks. */
+ while (nblocks >= 3)
+ {
+ tripledes_amd64_cfb_dec(ctx->encrypt_subkeys, outbuf, inbuf, iv);
+
+ nblocks -= 3;
+ outbuf += 3 * DES_BLOCKSIZE;
+ inbuf += 3 * DES_BLOCKSIZE;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ tripledes_ecb_encrypt (ctx, iv, iv);
+ cipher_block_xor_n_copy(outbuf, iv, inbuf, DES_BLOCKSIZE);
+ outbuf += DES_BLOCKSIZE;
+ inbuf += DES_BLOCKSIZE;
+ }
+
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/*
+ * Check whether the 8 byte key is weak.
+ * Does not check the parity bits of the key but simply ignores them.
+ */
+static int
+is_weak_key ( const byte *key )
+{
+ byte work[8];
+ int i, left, right, middle, cmp_result;
+
+ /* clear parity bits */
+ for(i=0; i<8; ++i)
+ work[i] = key[i] & 0xfe;
+
+  /* Binary search in the sorted table of 64 weak keys. */
+ left = 0;
+ right = 63;
+ while(left <= right)
+ {
+ middle = (left + right) / 2;
+
+ if ( !(cmp_result=working_memcmp(work, weak_keys[middle], 8)) )
+ return -1;
+
+ if ( cmp_result > 0 )
+ left = middle + 1;
+ else
+ right = middle - 1;
+ }
+
+ return 0;
+}
+
+
+/* Alternative setkey for selftests; needs a larger key than the default. */
+static gcry_err_code_t
+bulk_selftest_setkey (void *context, const byte *__key, unsigned __keylen,
+ cipher_bulk_ops_t *bulk_ops)
+{
+ static const unsigned char key[24] ATTR_ALIGNED_16 = {
+ 0x66,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+ 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x22,
+ 0x18,0x2A,0x39,0x47,0x5E,0x6F,0x75,0x82
+ };
+
+ (void)__key;
+ (void)__keylen;
+
+ return do_tripledes_setkey(context, key, sizeof(key), bulk_ops);
+}
+
+
+/* Run the self-tests for DES-CTR, tests IV increment of bulk CTR
+ encryption. Returns NULL on success. */
+static const char *
+selftest_ctr (void)
+{
+ const int nblocks = 3+1;
+ const int blocksize = DES_BLOCKSIZE;
+ const int context_size = sizeof(struct _tripledes_ctx);
+
+ return _gcry_selftest_helper_ctr("3DES", &bulk_selftest_setkey,
+ &do_tripledes_encrypt, nblocks, blocksize, context_size);
+}
+
+
+/* Run the self-tests for DES-CBC, tests bulk CBC decryption.
+ Returns NULL on success. */
+static const char *
+selftest_cbc (void)
+{
+ const int nblocks = 3+2;
+ const int blocksize = DES_BLOCKSIZE;
+ const int context_size = sizeof(struct _tripledes_ctx);
+
+ return _gcry_selftest_helper_cbc("3DES", &bulk_selftest_setkey,
+ &do_tripledes_encrypt, nblocks, blocksize, context_size);
+}
+
+
+/* Run the self-tests for DES-CFB, tests bulk CFB decryption.
+ Returns NULL on success. */
+static const char *
+selftest_cfb (void)
+{
+ const int nblocks = 3+2;
+ const int blocksize = DES_BLOCKSIZE;
+ const int context_size = sizeof(struct _tripledes_ctx);
+
+ return _gcry_selftest_helper_cfb("3DES", &bulk_selftest_setkey,
+ &do_tripledes_encrypt, nblocks, blocksize, context_size);
+}
+
+
+/*
+ * Performs a selftest of this DES/Triple-DES implementation.
+ * Returns a string with the error text on failure.
+ * Returns NULL if all is ok.
+ */
+static const char *
+selftest (void)
+{
+ const char *r;
+
+ /*
+ * Check if 'u32' is really 32 bits wide. This DES / 3DES implementation
+   * needs this.
+ */
+ if (sizeof (u32) != 4)
+ return "Wrong word size for DES configured.";
+
+ /*
+ * DES Maintenance Test
+ */
+ {
+ int i;
+ byte key[8] =
+ {0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55};
+ byte input[8] =
+ {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ byte result[8] =
+ {0x24, 0x6e, 0x9d, 0xb9, 0xc5, 0x50, 0x38, 0x1a};
+ byte temp1[8], temp2[8], temp3[8];
+ des_ctx des;
+
+ for (i = 0; i < 64; ++i)
+ {
+ des_setkey (des, key);
+ des_ecb_encrypt (des, input, temp1);
+ des_ecb_encrypt (des, temp1, temp2);
+ des_setkey (des, temp2);
+ des_ecb_decrypt (des, temp1, temp3);
+ memcpy (key, temp3, 8);
+ memcpy (input, temp1, 8);
+ }
+ if (memcmp (temp3, result, 8))
+ return "DES maintenance test failed.";
+ }
+
+
+ /*
+   * Self-made Triple-DES test (does somebody know an official test?)
+ */
+ {
+ int i;
+ byte input[8] =
+ {0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10};
+ byte key1[8] =
+ {0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0};
+ byte key2[8] =
+ {0x11, 0x22, 0x33, 0x44, 0xff, 0xaa, 0xcc, 0xdd};
+ byte result[8] =
+ {0x7b, 0x38, 0x3b, 0x23, 0xa2, 0x7d, 0x26, 0xd3};
+
+ tripledes_ctx des3;
+
+ for (i = 0; i < 16; ++i)
+ {
+ tripledes_set2keys (des3, key1, key2);
+ tripledes_ecb_encrypt (des3, input, key1);
+ tripledes_ecb_decrypt (des3, input, key2);
+ tripledes_set3keys (des3, key1, input, key2);
+ tripledes_ecb_encrypt (des3, input, input);
+ }
+ if (memcmp (input, result, 8))
+ return "Triple-DES test failed.";
+ }
+
+ /*
+   * More Triple-DES tests.  These are test vectors as used by SSLeay,
+ * thanks to Jeroen C. van Gelderen.
+ */
+ {
+ static const struct { byte key[24]; byte plain[8]; byte cipher[8]; }
+ testdata[] = {
+ { { 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,
+ 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,
+ 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01 },
+ { 0x95,0xF8,0xA5,0xE5,0xDD,0x31,0xD9,0x00 },
+ { 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00 }
+ },
+
+ { { 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,
+ 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,
+ 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01 },
+ { 0x9D,0x64,0x55,0x5A,0x9A,0x10,0xB8,0x52, },
+ { 0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x00 }
+ },
+ { { 0x38,0x49,0x67,0x4C,0x26,0x02,0x31,0x9E,
+ 0x38,0x49,0x67,0x4C,0x26,0x02,0x31,0x9E,
+ 0x38,0x49,0x67,0x4C,0x26,0x02,0x31,0x9E },
+ { 0x51,0x45,0x4B,0x58,0x2D,0xDF,0x44,0x0A },
+ { 0x71,0x78,0x87,0x6E,0x01,0xF1,0x9B,0x2A }
+ },
+ { { 0x04,0xB9,0x15,0xBA,0x43,0xFE,0xB5,0xB6,
+ 0x04,0xB9,0x15,0xBA,0x43,0xFE,0xB5,0xB6,
+ 0x04,0xB9,0x15,0xBA,0x43,0xFE,0xB5,0xB6 },
+ { 0x42,0xFD,0x44,0x30,0x59,0x57,0x7F,0xA2 },
+ { 0xAF,0x37,0xFB,0x42,0x1F,0x8C,0x40,0x95 }
+ },
+ { { 0x01,0x23,0x45,0x67,0x89,0xAB,0xCD,0xEF,
+ 0x01,0x23,0x45,0x67,0x89,0xAB,0xCD,0xEF,
+ 0x01,0x23,0x45,0x67,0x89,0xAB,0xCD,0xEF },
+ { 0x73,0x6F,0x6D,0x65,0x64,0x61,0x74,0x61 },
+ { 0x3D,0x12,0x4F,0xE2,0x19,0x8B,0xA3,0x18 }
+ },
+ { { 0x01,0x23,0x45,0x67,0x89,0xAB,0xCD,0xEF,
+ 0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,
+ 0x01,0x23,0x45,0x67,0x89,0xAB,0xCD,0xEF },
+ { 0x73,0x6F,0x6D,0x65,0x64,0x61,0x74,0x61 },
+ { 0xFB,0xAB,0xA1,0xFF,0x9D,0x05,0xE9,0xB1 }
+ },
+ { { 0x01,0x23,0x45,0x67,0x89,0xAB,0xCD,0xEF,
+ 0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55,
+ 0xFE,0xDC,0xBA,0x98,0x76,0x54,0x32,0x10 },
+ { 0x73,0x6F,0x6D,0x65,0x64,0x61,0x74,0x61 },
+ { 0x18,0xd7,0x48,0xe5,0x63,0x62,0x05,0x72 }
+ },
+ { { 0x03,0x52,0x02,0x07,0x67,0x20,0x82,0x17,
+ 0x86,0x02,0x87,0x66,0x59,0x08,0x21,0x98,
+ 0x64,0x05,0x6A,0xBD,0xFE,0xA9,0x34,0x57 },
+ { 0x73,0x71,0x75,0x69,0x67,0x67,0x6C,0x65 },
+ { 0xc0,0x7d,0x2a,0x0f,0xa5,0x66,0xfa,0x30 }
+ },
+ { { 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,
+ 0x80,0x01,0x01,0x01,0x01,0x01,0x01,0x01,
+ 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x02 },
+ { 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 },
+ { 0xe6,0xe6,0xdd,0x5b,0x7e,0x72,0x29,0x74 }
+ },
+ { { 0x10,0x46,0x10,0x34,0x89,0x98,0x80,0x20,
+ 0x91,0x07,0xD0,0x15,0x89,0x19,0x01,0x01,
+ 0x19,0x07,0x92,0x10,0x98,0x1A,0x01,0x01 },
+ { 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 },
+ { 0xe1,0xef,0x62,0xc3,0x32,0xfe,0x82,0x5b }
+ }
+ };
+
+ byte result[8];
+ int i;
+ tripledes_ctx des3;
+
+ for (i=0; i<sizeof(testdata)/sizeof(*testdata); ++i)
+ {
+ tripledes_set3keys (des3, testdata[i].key,
+ testdata[i].key + 8, testdata[i].key + 16);
+
+ tripledes_ecb_encrypt (des3, testdata[i].plain, result);
+ if (memcmp (testdata[i].cipher, result, 8))
+ return "Triple-DES SSLeay test failed on encryption.";
+
+ tripledes_ecb_decrypt (des3, testdata[i].cipher, result);
+ if (memcmp (testdata[i].plain, result, 8))
+          return "Triple-DES SSLeay test failed on decryption.";
+ }
+ }
+
+ /*
+ * Check the weak key detection. We simply assume that the table
+ * with weak keys is ok and check every key in the table if it is
+ * detected... (This test is a little bit stupid).
+ */
+ {
+ int i;
+ unsigned char *p;
+ gcry_md_hd_t h;
+
+ if (_gcry_md_open (&h, GCRY_MD_SHA1, 0))
+ return "SHA1 not available";
+
+ for (i = 0; i < 64; ++i)
+ _gcry_md_write (h, weak_keys[i], 8);
+ p = _gcry_md_read (h, GCRY_MD_SHA1);
+ i = memcmp (p, weak_keys_chksum, 20);
+ _gcry_md_close (h);
+ if (i)
+ return "weak key table defect";
+
+ for (i = 0; i < 64; ++i)
+ if (!is_weak_key(weak_keys[i]))
+ return "DES weak key detection failed";
+ }
+
+ if ( (r = selftest_cbc ()) )
+ return r;
+
+ if ( (r = selftest_cfb ()) )
+ return r;
+
+ if ( (r = selftest_ctr ()) )
+ return r;
+
+ return 0;
+}
+
+
+static gcry_err_code_t
+do_tripledes_setkey ( void *context, const byte *key, unsigned keylen,
+ cipher_bulk_ops_t *bulk_ops )
+{
+ struct _tripledes_ctx *ctx = (struct _tripledes_ctx *) context;
+
+ if( keylen != 24 )
+ return GPG_ERR_INV_KEYLEN;
+
+ /* Setup bulk encryption routines. */
+ memset (bulk_ops, 0, sizeof(*bulk_ops));
+ bulk_ops->cbc_dec = _gcry_3des_cbc_dec;
+ bulk_ops->cfb_dec = _gcry_3des_cfb_dec;
+ bulk_ops->ctr_enc = _gcry_3des_ctr_enc;
+
+ tripledes_set3keys ( ctx, key, key+8, key+16);
+
+ if (ctx->flags.no_weak_key)
+ ; /* Detection has been disabled. */
+ else if (is_weak_key (key) || is_weak_key (key+8) || is_weak_key (key+16))
+ {
+ _gcry_burn_stack (64);
+ return GPG_ERR_WEAK_KEY;
+ }
+ _gcry_burn_stack (64);
+
+ return GPG_ERR_NO_ERROR;
+}
+
+
+static gcry_err_code_t
+do_tripledes_set_extra_info (void *context, int what,
+ const void *buffer, size_t buflen)
+{
+ struct _tripledes_ctx *ctx = (struct _tripledes_ctx *)context;
+ gpg_err_code_t ec = 0;
+
+ (void)buffer;
+ (void)buflen;
+
+ switch (what)
+ {
+ case CIPHER_INFO_NO_WEAK_KEY:
+ ctx->flags.no_weak_key = 1;
+ break;
+
+ default:
+ ec = GPG_ERR_INV_OP;
+ break;
+ }
+ return ec;
+}
+
+
+static unsigned int
+do_tripledes_encrypt( void *context, byte *outbuf, const byte *inbuf )
+{
+ struct _tripledes_ctx *ctx = (struct _tripledes_ctx *) context;
+
+ tripledes_ecb_encrypt ( ctx, inbuf, outbuf );
+ return /*burn_stack*/ TRIPLEDES_ECB_BURN_STACK;
+}
+
+static unsigned int
+do_tripledes_decrypt( void *context, byte *outbuf, const byte *inbuf )
+{
+ struct _tripledes_ctx *ctx = (struct _tripledes_ctx *) context;
+ tripledes_ecb_decrypt ( ctx, inbuf, outbuf );
+ return /*burn_stack*/ TRIPLEDES_ECB_BURN_STACK;
+}
+
+static gcry_err_code_t
+do_des_setkey (void *context, const byte *key, unsigned keylen,
+ cipher_bulk_ops_t *bulk_ops)
+{
+ struct _des_ctx *ctx = (struct _des_ctx *) context;
+
+ (void)bulk_ops;
+
+ if (keylen != 8)
+ return GPG_ERR_INV_KEYLEN;
+
+ des_setkey (ctx, key);
+
+ if (is_weak_key (key)) {
+ _gcry_burn_stack (64);
+ return GPG_ERR_WEAK_KEY;
+ }
+ _gcry_burn_stack (64);
+
+ return GPG_ERR_NO_ERROR;
+}
+
+
+static unsigned int
+do_des_encrypt( void *context, byte *outbuf, const byte *inbuf )
+{
+ struct _des_ctx *ctx = (struct _des_ctx *) context;
+
+ des_ecb_encrypt ( ctx, inbuf, outbuf );
+ return /*burn_stack*/ (32);
+}
+
+static unsigned int
+do_des_decrypt( void *context, byte *outbuf, const byte *inbuf )
+{
+ struct _des_ctx *ctx = (struct _des_ctx *) context;
+
+ des_ecb_decrypt ( ctx, inbuf, outbuf );
+ return /*burn_stack*/ (32);
+}
+
+
+
+
+/*
+ Self-test section.
+ */
+
+
+/* Selftest for TripleDES. */
+static gpg_err_code_t
+selftest_fips (int extended, selftest_report_func_t report)
+{
+ const char *what;
+ const char *errtxt;
+
+ (void)extended; /* No extended tests available. */
+
+ what = "low-level";
+ errtxt = selftest ();
+ if (errtxt)
+ goto failed;
+
+ /* The low-level self-tests are quite extensive and thus we can do
+ without high level tests. This is also justified because we have
+ no custom block code implementation for 3des but always use the
+ standard high level block code. */
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("cipher", GCRY_CIPHER_3DES, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+
+/* Run a full self-test for ALGO and return 0 on success. */
+static gpg_err_code_t
+run_selftests (int algo, int extended, selftest_report_func_t report)
+{
+ gpg_err_code_t ec;
+
+ switch (algo)
+ {
+ case GCRY_CIPHER_3DES:
+ ec = selftest_fips (extended, report);
+ break;
+ default:
+ ec = GPG_ERR_CIPHER_ALGO;
+ break;
+
+ }
+ return ec;
+}
+
+
+
+gcry_cipher_spec_t _gcry_cipher_spec_des =
+ {
+ GCRY_CIPHER_DES, {0, 0},
+ "DES", NULL, NULL, 8, 64, sizeof (struct _des_ctx),
+ do_des_setkey, do_des_encrypt, do_des_decrypt
+ };
+
+static gcry_cipher_oid_spec_t oids_tripledes[] =
+ {
+ { "1.2.840.113549.3.7", GCRY_CIPHER_MODE_CBC },
+ /* Teletrust specific OID for 3DES. */
+ { "1.3.36.3.1.3.2.1", GCRY_CIPHER_MODE_CBC },
+ /* pbeWithSHAAnd3_KeyTripleDES_CBC */
+ { "1.2.840.113549.1.12.1.3", GCRY_CIPHER_MODE_CBC },
+ { NULL }
+ };
+
+gcry_cipher_spec_t _gcry_cipher_spec_tripledes =
+ {
+ GCRY_CIPHER_3DES, {0, 1},
+ "3DES", NULL, oids_tripledes, 8, 192, sizeof (struct _tripledes_ctx),
+ do_tripledes_setkey, do_tripledes_encrypt, do_tripledes_decrypt,
+ NULL, NULL,
+ run_selftests,
+ do_tripledes_set_extra_info
+ };
diff --git a/comm/third_party/libgcrypt/cipher/dsa-common.c b/comm/third_party/libgcrypt/cipher/dsa-common.c
new file mode 100644
index 0000000000..fe49248dd6
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/dsa-common.c
@@ -0,0 +1,418 @@
+/* dsa-common.c - Common code for DSA
+ * Copyright (C) 1998, 1999 Free Software Foundation, Inc.
+ * Copyright (C) 2013 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+#include "mpi.h"
+#include "cipher.h"
+#include "pubkey-internal.h"
+
+
+/*
+ * Modify K, by making it large enough, so that the variation in
+ * computation time stays small.
+ *
+ * Originally, (EC)DSA computation requires k where 0 < k < q.  Here,
+ * we add q (the order) to keep k in the range q < k < 2*q (or, by
+ * adding q once more, in the range 2*q < k < 3*q), so that the timing
+ * difference of the EC multiply (or exponentiation) operation stays
+ * small.  The result of the (EC)DSA computation is the same.
+ */
+void
+_gcry_dsa_modify_k (gcry_mpi_t k, gcry_mpi_t q, int qbits)
+{
+ gcry_mpi_t k1 = mpi_new (qbits+2);
+
+ mpi_resize (k, (qbits+2+BITS_PER_MPI_LIMB-1) / BITS_PER_MPI_LIMB);
+ k->nlimbs = k->alloced;
+ mpi_add (k, k, q);
+ mpi_add (k1, k, q);
+ mpi_set_cond (k, k1, !mpi_test_bit (k, qbits));
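+  /* After the conditional second addition, bit 'qbits' of k is always
+     set, so the value used in the subsequent multiply/exponentiation
+     has a fixed bit length of qbits+1 independent of the original k. */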
+
+ mpi_free (k1);
+}
+
+/*
+ * Generate a random secret exponent K less than Q.
+ * Note that ECDSA uses this code also to generate D.
+ */
+gcry_mpi_t
+_gcry_dsa_gen_k (gcry_mpi_t q, int security_level)
+{
+ gcry_mpi_t k = mpi_alloc_secure (mpi_get_nlimbs (q));
+ unsigned int nbits = mpi_get_nbits (q);
+ unsigned int nbytes = (nbits+7)/8;
+ char *rndbuf = NULL;
+
+ /* To learn why we don't use mpi_mod to get the requested bit size,
+ read the paper: "The Insecurity of the Digital Signature
+ Algorithm with Partially Known Nonces" by Nguyen and Shparlinski.
+ Journal of Cryptology, New York. Vol 15, nr 3 (2003) */
+
+ if (DBG_CIPHER)
+ log_debug ("choosing a random k of %u bits at seclevel %d\n",
+ nbits, security_level);
+ for (;;)
+ {
+ if ( !rndbuf || nbits < 32 )
+ {
+ xfree (rndbuf);
+ rndbuf = _gcry_random_bytes_secure (nbytes, security_level);
+ }
+ else
+ { /* Change only some of the higher bits. We could improve
+ this by directly requesting more memory at the first call
+ to get_random_bytes() and use these extra bytes here.
+ However the required management code is more complex and
+ thus we better use this simple method. */
+ char *pp = _gcry_random_bytes_secure (4, security_level);
+ memcpy (rndbuf, pp, 4);
+ xfree (pp);
+ }
+ _gcry_mpi_set_buffer (k, rndbuf, nbytes, 0);
+
+ /* Make sure we have the requested number of bits. This code
+ looks a bit funny but it is easy to understand if you
+ consider that mpi_set_highbit clears all higher bits. We
+ don't have a clear_highbit, thus we first set the high bit
+ and then clear it again. */
+ if (mpi_test_bit (k, nbits-1))
+ mpi_set_highbit (k, nbits-1);
+ else
+ {
+ mpi_set_highbit (k, nbits-1);
+ mpi_clear_bit (k, nbits-1);
+ }
+
+ if (!(mpi_cmp (k, q) < 0)) /* check: k < q */
+ {
+ if (DBG_CIPHER)
+ log_debug ("\tk too large - again\n");
+ continue; /* no */
+ }
+ if (!(mpi_cmp_ui (k, 0) > 0)) /* check: k > 0 */
+ {
+ if (DBG_CIPHER)
+ log_debug ("\tk is zero - again\n");
+ continue; /* no */
+ }
+ break; /* okay */
+ }
+ xfree (rndbuf);
+
+ return k;
+}
+
+
+/* Turn VALUE into an octet string and store it in an allocated buffer
+ at R_FRAME. If the resulting octet string is shorter than NBYTES
+ the result will be left padded with zeroes. If VALUE does not fit
+ into NBYTES an error code is returned. */
+static gpg_err_code_t
+int2octets (unsigned char **r_frame, gcry_mpi_t value, size_t nbytes)
+{
+ gpg_err_code_t rc;
+ size_t nframe, noff, n;
+ unsigned char *frame;
+
+ rc = _gcry_mpi_print (GCRYMPI_FMT_USG, NULL, 0, &nframe, value);
+ if (rc)
+ return rc;
+ if (nframe > nbytes)
+ return GPG_ERR_TOO_LARGE; /* Value too long to fit into NBYTES. */
+
+ noff = (nframe < nbytes)? nbytes - nframe : 0;
+ n = nframe + noff;
+ frame = mpi_is_secure (value)? xtrymalloc_secure (n) : xtrymalloc (n);
+ if (!frame)
+ return gpg_err_code_from_syserror ();
+ if (noff)
+ memset (frame, 0, noff);
+ nframe += noff;
+ rc = _gcry_mpi_print (GCRYMPI_FMT_USG, frame+noff, nframe-noff, NULL, value);
+ if (rc)
+ {
+ xfree (frame);
+ return rc;
+ }
+
+ *r_frame = frame;
+ return 0;
+}
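+
+/* For example, int2octets of the value 0x01F2 with NBYTES = 4 yields
+   the octet string 00 00 01 F2 (left padded with zeroes), while
+   NBYTES = 1 would fail with GPG_ERR_TOO_LARGE.  */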
+
+
+/* Convert the bit string BITS of length NBITS into an octet string
+ with a length of (QBITS+7)/8 bytes. On success store the result at
+ R_FRAME. */
+static gpg_err_code_t
+bits2octets (unsigned char **r_frame,
+ const void *bits, unsigned int nbits,
+ gcry_mpi_t q, unsigned int qbits)
+{
+ gpg_err_code_t rc;
+ gcry_mpi_t z1;
+
+ /* z1 = bits2int (b) */
+ rc = _gcry_mpi_scan (&z1, GCRYMPI_FMT_USG, bits, (nbits+7)/8, NULL);
+ if (rc)
+ return rc;
+ if (nbits > qbits)
+ mpi_rshift (z1, z1, nbits - qbits);
+
+  /* z2 = z1 mod q */
+ if (mpi_cmp (z1, q) >= 0)
+ mpi_sub (z1, z1, q);
+
+ /* Convert to an octet string. */
+ rc = int2octets (r_frame, z1, (qbits+7)/8);
+
+ mpi_free (z1);
+ return rc;
+}
+
+
+/*
+ * Generate a deterministic secret exponent K less than DSA_Q. H1 is
+ * the to be signed digest with a length of HLEN bytes. HALGO is the
+ * algorithm used to create the hash. On success the value for K is
+ * stored at R_K.
+ */
+gpg_err_code_t
+_gcry_dsa_gen_rfc6979_k (gcry_mpi_t *r_k,
+ gcry_mpi_t dsa_q, gcry_mpi_t dsa_x,
+ const unsigned char *h1, unsigned int hlen,
+ int halgo, unsigned int extraloops)
+{
+ gpg_err_code_t rc;
+ unsigned char *V = NULL;
+ unsigned char *K = NULL;
+ unsigned char *x_buf = NULL;
+ unsigned char *h1_buf = NULL;
+ gcry_md_hd_t hd = NULL;
+ unsigned char *t = NULL;
+ gcry_mpi_t k = NULL;
+ unsigned int tbits, qbits;
+ int i;
+
+ qbits = mpi_get_nbits (dsa_q);
+
+ if (!qbits || !h1 || !hlen)
+ return GPG_ERR_EINVAL;
+
+ if (_gcry_md_get_algo_dlen (halgo) != hlen)
+ return GPG_ERR_DIGEST_ALGO;
+
+ /* Step b: V = 0x01 0x01 0x01 ... 0x01 */
+ V = xtrymalloc (hlen);
+ if (!V)
+ {
+ rc = gpg_err_code_from_syserror ();
+ goto leave;
+ }
+ for (i=0; i < hlen; i++)
+ V[i] = 1;
+
+ /* Step c: K = 0x00 0x00 0x00 ... 0x00 */
+ K = xtrycalloc (1, hlen);
+ if (!K)
+ {
+ rc = gpg_err_code_from_syserror ();
+ goto leave;
+ }
+
+ rc = int2octets (&x_buf, dsa_x, (qbits+7)/8);
+ if (rc)
+ goto leave;
+
+ rc = bits2octets (&h1_buf, h1, hlen*8, dsa_q, qbits);
+ if (rc)
+ goto leave;
+
+ /* Create a handle to compute the HMACs. */
+ rc = _gcry_md_open (&hd, halgo, (GCRY_MD_FLAG_SECURE | GCRY_MD_FLAG_HMAC));
+ if (rc)
+ goto leave;
+
+ /* Step d: K = HMAC_K(V || 0x00 || int2octets(x) || bits2octets(h1) */
+ rc = _gcry_md_setkey (hd, K, hlen);
+ if (rc)
+ goto leave;
+ _gcry_md_write (hd, V, hlen);
+ _gcry_md_write (hd, "", 1);
+ _gcry_md_write (hd, x_buf, (qbits+7)/8);
+ _gcry_md_write (hd, h1_buf, (qbits+7)/8);
+ memcpy (K, _gcry_md_read (hd, 0), hlen);
+
+ /* Step e: V = HMAC_K(V) */
+ rc = _gcry_md_setkey (hd, K, hlen);
+ if (rc)
+ goto leave;
+ _gcry_md_write (hd, V, hlen);
+ memcpy (V, _gcry_md_read (hd, 0), hlen);
+
+ /* Step f: K = HMAC_K(V || 0x01 || int2octets(x) || bits2octets(h1) */
+ rc = _gcry_md_setkey (hd, K, hlen);
+ if (rc)
+ goto leave;
+ _gcry_md_write (hd, V, hlen);
+ _gcry_md_write (hd, "\x01", 1);
+ _gcry_md_write (hd, x_buf, (qbits+7)/8);
+ _gcry_md_write (hd, h1_buf, (qbits+7)/8);
+ memcpy (K, _gcry_md_read (hd, 0), hlen);
+
+ /* Step g: V = HMAC_K(V) */
+ rc = _gcry_md_setkey (hd, K, hlen);
+ if (rc)
+ goto leave;
+ _gcry_md_write (hd, V, hlen);
+ memcpy (V, _gcry_md_read (hd, 0), hlen);
+
+ /* Step h. */
+ t = xtrymalloc_secure ((qbits+7)/8+hlen);
+ if (!t)
+ {
+ rc = gpg_err_code_from_syserror ();
+ goto leave;
+ }
+
+ again:
+ for (tbits = 0; tbits < qbits;)
+ {
+ /* V = HMAC_K(V) */
+ rc = _gcry_md_setkey (hd, K, hlen);
+ if (rc)
+ goto leave;
+ _gcry_md_write (hd, V, hlen);
+ memcpy (V, _gcry_md_read (hd, 0), hlen);
+
+ /* T = T || V */
+ memcpy (t+(tbits+7)/8, V, hlen);
+ tbits += 8*hlen;
+ }
+
+ /* k = bits2int (T) */
+ mpi_free (k);
+ k = NULL;
+ rc = _gcry_mpi_scan (&k, GCRYMPI_FMT_USG, t, (tbits+7)/8, NULL);
+ if (rc)
+ goto leave;
+ if (tbits > qbits)
+ mpi_rshift (k, k, tbits - qbits);
+
+ /* Check: k < q and k > 1 */
+ if (!(mpi_cmp (k, dsa_q) < 0 && mpi_cmp_ui (k, 0) > 0))
+ {
+ /* K = HMAC_K(V || 0x00) */
+ rc = _gcry_md_setkey (hd, K, hlen);
+ if (rc)
+ goto leave;
+ _gcry_md_write (hd, V, hlen);
+ _gcry_md_write (hd, "", 1);
+ memcpy (K, _gcry_md_read (hd, 0), hlen);
+
+ /* V = HMAC_K(V) */
+ rc = _gcry_md_setkey (hd, K, hlen);
+ if (rc)
+ goto leave;
+ _gcry_md_write (hd, V, hlen);
+ memcpy (V, _gcry_md_read (hd, 0), hlen);
+
+ goto again;
+ }
+
+ /* The caller may have requested that we introduce some extra loops.
+ This is for example useful if the caller wants another value for
+ K because the last returned one yielded an R of 0. Because this
+ is very unlikely we implement it in a straightforward way. */
+ if (extraloops)
+ {
+ extraloops--;
+
+ /* K = HMAC_K(V || 0x00) */
+ rc = _gcry_md_setkey (hd, K, hlen);
+ if (rc)
+ goto leave;
+ _gcry_md_write (hd, V, hlen);
+ _gcry_md_write (hd, "", 1);
+ memcpy (K, _gcry_md_read (hd, 0), hlen);
+
+ /* V = HMAC_K(V) */
+ rc = _gcry_md_setkey (hd, K, hlen);
+ if (rc)
+ goto leave;
+ _gcry_md_write (hd, V, hlen);
+ memcpy (V, _gcry_md_read (hd, 0), hlen);
+
+ goto again;
+ }
+
+ /* log_mpidump (" k", k); */
+
+ leave:
+ xfree (t);
+ _gcry_md_close (hd);
+ xfree (h1_buf);
+ xfree (x_buf);
+ xfree (K);
+ xfree (V);
+
+ if (rc)
+ mpi_free (k);
+ else
+ *r_k = k;
+ return rc;
+}
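+
+/* A minimal usage sketch (variable names are illustrative only); this
+   mirrors the call made from the DSA sign() routine below:
+
+     gcry_mpi_t k;
+     gpg_err_code_t rc;
+
+     rc = _gcry_dsa_gen_rfc6979_k (&k, skey->q, skey->x,
+                                   digest, digestlen, GCRY_MD_SHA256, 0);
+     if (!rc)
+       {
+         ... use k as the per-signature nonce, then mpi_free (k); ...
+       }
+ */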
+
+/*
+ * Truncate an opaque hash value to QBITS for DSA.
+ * Non-opaque input is not truncated, in the hope that the caller
+ * knows what is passed; it is not possible to correctly truncate
+ * non-opaque inputs.  For example, a 256-bit opaque SHA-256 value
+ * is right-shifted by 96 bits when QBITS is 160.
+ */
+gpg_err_code_t
+_gcry_dsa_normalize_hash (gcry_mpi_t input,
+ gcry_mpi_t *out,
+ unsigned int qbits)
+{
+ gpg_err_code_t rc = 0;
+ const void *abuf;
+ unsigned int abits;
+ gcry_mpi_t hash;
+
+ if (mpi_is_opaque (input))
+ {
+ abuf = mpi_get_opaque (input, &abits);
+ rc = _gcry_mpi_scan (&hash, GCRYMPI_FMT_USG, abuf, (abits+7)/8, NULL);
+ if (rc)
+ return rc;
+ if (abits > qbits)
+ mpi_rshift (hash, hash, abits - qbits);
+ }
+ else
+ hash = input;
+
+ *out = hash;
+
+ return rc;
+}
diff --git a/comm/third_party/libgcrypt/cipher/dsa.c b/comm/third_party/libgcrypt/cipher/dsa.c
new file mode 100644
index 0000000000..d793b9aaf2
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/dsa.c
@@ -0,0 +1,1394 @@
+/* dsa.c - DSA signature algorithm
+ * Copyright (C) 1998, 2000, 2001, 2002, 2003,
+ * 2006, 2008 Free Software Foundation, Inc.
+ * Copyright (C) 2013 g10 Code GmbH.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+#include "mpi.h"
+#include "cipher.h"
+#include "pubkey-internal.h"
+
+
+typedef struct
+{
+ gcry_mpi_t p; /* prime */
+ gcry_mpi_t q; /* group order */
+ gcry_mpi_t g; /* group generator */
+ gcry_mpi_t y; /* g^x mod p */
+} DSA_public_key;
+
+
+typedef struct
+{
+ gcry_mpi_t p; /* prime */
+ gcry_mpi_t q; /* group order */
+ gcry_mpi_t g; /* group generator */
+ gcry_mpi_t y; /* g^x mod p */
+ gcry_mpi_t x; /* secret exponent */
+} DSA_secret_key;
+
+
+/* A structure used to hold domain parameters. */
+typedef struct
+{
+ gcry_mpi_t p; /* prime */
+ gcry_mpi_t q; /* group order */
+ gcry_mpi_t g; /* group generator */
+} dsa_domain_t;
+
+
+static const char *dsa_names[] =
+ {
+ "dsa",
+ "openpgp-dsa",
+ NULL,
+ };
+
+
+/* A sample 1024 bit DSA key used for the selftests. Not anymore
+ * used, kept only for reference. */
+#if 0
+static const char sample_secret_key_1024[] =
+"(private-key"
+" (dsa"
+" (p #00AD7C0025BA1A15F775F3F2D673718391D00456978D347B33D7B49E7F32EDAB"
+" 96273899DD8B2BB46CD6ECA263FAF04A28903503D59062A8865D2AE8ADFB5191"
+" CF36FFB562D0E2F5809801A1F675DAE59698A9E01EFE8D7DCFCA084F4C6F5A44"
+" 44D499A06FFAEA5E8EF5E01F2FD20A7B7EF3F6968AFBA1FB8D91F1559D52D8777B#)"
+" (q #00EB7B5751D25EBBB7BD59D920315FD840E19AEBF9#)"
+" (g #1574363387FDFD1DDF38F4FBE135BB20C7EE4772FB94C337AF86EA8E49666503"
+" AE04B6BE81A2F8DD095311E0217ACA698A11E6C5D33CCDAE71498ED35D13991E"
+" B02F09AB40BD8F4C5ED8C75DA779D0AE104BC34C960B002377068AB4B5A1F984"
+" 3FBA91F537F1B7CAC4D8DD6D89B0D863AF7025D549F9C765D2FC07EE208F8D15#)"
+" (y #64B11EF8871BE4AB572AA810D5D3CA11A6CDBC637A8014602C72960DB135BF46"
+" A1816A724C34F87330FC9E187C5D66897A04535CC2AC9164A7150ABFA8179827"
+" 6E45831AB811EEE848EBB24D9F5F2883B6E5DDC4C659DEF944DCFD80BF4D0A20"
+" 42CAA7DC289F0C5A9D155F02D3D551DB741A81695B74D4C8F477F9C7838EB0FB#)"
+" (x #11D54E4ADBD3034160F2CED4B7CD292A4EBF3EC0#)))";
+/* A sample 1024 bit DSA key used for the selftests (public only). */
+static const char sample_public_key_1024[] =
+"(public-key"
+" (dsa"
+" (p #00AD7C0025BA1A15F775F3F2D673718391D00456978D347B33D7B49E7F32EDAB"
+" 96273899DD8B2BB46CD6ECA263FAF04A28903503D59062A8865D2AE8ADFB5191"
+" CF36FFB562D0E2F5809801A1F675DAE59698A9E01EFE8D7DCFCA084F4C6F5A44"
+" 44D499A06FFAEA5E8EF5E01F2FD20A7B7EF3F6968AFBA1FB8D91F1559D52D8777B#)"
+" (q #00EB7B5751D25EBBB7BD59D920315FD840E19AEBF9#)"
+" (g #1574363387FDFD1DDF38F4FBE135BB20C7EE4772FB94C337AF86EA8E49666503"
+" AE04B6BE81A2F8DD095311E0217ACA698A11E6C5D33CCDAE71498ED35D13991E"
+" B02F09AB40BD8F4C5ED8C75DA779D0AE104BC34C960B002377068AB4B5A1F984"
+" 3FBA91F537F1B7CAC4D8DD6D89B0D863AF7025D549F9C765D2FC07EE208F8D15#)"
+" (y #64B11EF8871BE4AB572AA810D5D3CA11A6CDBC637A8014602C72960DB135BF46"
+" A1816A724C34F87330FC9E187C5D66897A04535CC2AC9164A7150ABFA8179827"
+" 6E45831AB811EEE848EBB24D9F5F2883B6E5DDC4C659DEF944DCFD80BF4D0A20"
+" 42CAA7DC289F0C5A9D155F02D3D551DB741A81695B74D4C8F477F9C7838EB0FB#)))";
+#endif /*0*/
+
+/* 2048 DSA key from RFC 6979 A.2.2 */
+static const char sample_public_key_2048[] =
+"(public-key"
+" (dsa"
+" (p #9DB6FB5951B66BB6FE1E140F1D2CE5502374161FD6538DF1648218642F0B5C48C8F7A41AADFA187324B87674FA1822B00F1ECF8136943D7C55757264E5A1A44FFE012E9936E00C1D3E9310B01C7D179805D3058B2A9F4BB6F9716BFE6117C6B5B3CC4D9BE341104AD4A80AD6C94E005F4B993E14F091EB51743BF33050C38DE235567E1B34C3D6A5C0CEAA1A0F368213C3D19843D0B4B09DCB9FC72D39C8DE41F1BF14D4BB4563CA28371621CAD3324B6A2D392145BEBFAC748805236F5CA2FE92B871CD8F9C36D3292B5509CA8CAA77A2ADFC7BFD77DDA6F71125A7456FEA153E433256A2261C6A06ED3693797E7995FAD5AABBCFBE3EDA2741E375404AE25B#)"
+" (q #F2C3119374CE76C9356990B465374A17F23F9ED35089BD969F61C6DDE9998C1F#)"
+" (g #5C7FF6B06F8F143FE8288433493E4769C4D988ACE5BE25A0E24809670716C613D7B0CEE6932F8FAA7C44D2CB24523DA53FBE4F6EC3595892D1AA58C4328A06C46A15662E7EAA703A1DECF8BBB2D05DBE2EB956C142A338661D10461C0D135472085057F3494309FFA73C611F78B32ADBB5740C361C9F35BE90997DB2014E2EF5AA61782F52ABEB8BD6432C4DD097BC5423B285DAFB60DC364E8161F4A2A35ACA3A10B1C4D203CC76A470A33AFDCBDD92959859ABD8B56E1725252D78EAC66E71BA9AE3F1DD2487199874393CD4D832186800654760E1E34C09E4D155179F9EC0DC4473F996BDCE6EED1CABED8B6F116F7AD9CF505DF0F998E34AB27514B0FFE7#)"
+" (y #667098C654426C78D7F8201EAC6C203EF030D43605032C2F1FA937E5237DBD949F34A0A2564FE126DC8B715C5141802CE0979C8246463C40E6B6BDAA2513FA611728716C2E4FD53BC95B89E69949D96512E873B9C8F8DFD499CC312882561ADECB31F658E934C0C197F2C4D96B05CBAD67381E7B768891E4DA3843D24D94CDFB5126E9B8BF21E8358EE0E0A30EF13FD6A664C0DCE3731F7FB49A4845A4FD8254687972A2D382599C9BAC4E0ED7998193078913032558134976410B89D2C171D123AC35FD977219597AA7D15C1A9A428E59194F75C721EBCBCFAE44696A499AFA74E04299F132026601638CB87AB79190D4A0986315DA8EEC6561C938996BEADF#)))";
+
+static const char sample_secret_key_2048[] =
+"(private-key"
+" (dsa"
+" (p #9DB6FB5951B66BB6FE1E140F1D2CE5502374161FD6538DF1648218642F0B5C48C8F7A41AADFA187324B87674FA1822B00F1ECF8136943D7C55757264E5A1A44FFE012E9936E00C1D3E9310B01C7D179805D3058B2A9F4BB6F9716BFE6117C6B5B3CC4D9BE341104AD4A80AD6C94E005F4B993E14F091EB51743BF33050C38DE235567E1B34C3D6A5C0CEAA1A0F368213C3D19843D0B4B09DCB9FC72D39C8DE41F1BF14D4BB4563CA28371621CAD3324B6A2D392145BEBFAC748805236F5CA2FE92B871CD8F9C36D3292B5509CA8CAA77A2ADFC7BFD77DDA6F71125A7456FEA153E433256A2261C6A06ED3693797E7995FAD5AABBCFBE3EDA2741E375404AE25B#)"
+" (q #F2C3119374CE76C9356990B465374A17F23F9ED35089BD969F61C6DDE9998C1F#)"
+" (g #5C7FF6B06F8F143FE8288433493E4769C4D988ACE5BE25A0E24809670716C613D7B0CEE6932F8FAA7C44D2CB24523DA53FBE4F6EC3595892D1AA58C4328A06C46A15662E7EAA703A1DECF8BBB2D05DBE2EB956C142A338661D10461C0D135472085057F3494309FFA73C611F78B32ADBB5740C361C9F35BE90997DB2014E2EF5AA61782F52ABEB8BD6432C4DD097BC5423B285DAFB60DC364E8161F4A2A35ACA3A10B1C4D203CC76A470A33AFDCBDD92959859ABD8B56E1725252D78EAC66E71BA9AE3F1DD2487199874393CD4D832186800654760E1E34C09E4D155179F9EC0DC4473F996BDCE6EED1CABED8B6F116F7AD9CF505DF0F998E34AB27514B0FFE7#)"
+" (y #667098C654426C78D7F8201EAC6C203EF030D43605032C2F1FA937E5237DBD949F34A0A2564FE126DC8B715C5141802CE0979C8246463C40E6B6BDAA2513FA611728716C2E4FD53BC95B89E69949D96512E873B9C8F8DFD499CC312882561ADECB31F658E934C0C197F2C4D96B05CBAD67381E7B768891E4DA3843D24D94CDFB5126E9B8BF21E8358EE0E0A30EF13FD6A664C0DCE3731F7FB49A4845A4FD8254687972A2D382599C9BAC4E0ED7998193078913032558134976410B89D2C171D123AC35FD977219597AA7D15C1A9A428E59194F75C721EBCBCFAE44696A499AFA74E04299F132026601638CB87AB79190D4A0986315DA8EEC6561C938996BEADF#)"
+" (x #69C7548C21D0DFEA6B9A51C9EAD4E27C33D3B3F180316E5BCAB92C933F0E4DBC#)))";
+
+
+
+static int test_keys (DSA_secret_key *sk, unsigned int qbits);
+static int check_secret_key (DSA_secret_key *sk);
+static gpg_err_code_t generate (DSA_secret_key *sk,
+ unsigned int nbits,
+ unsigned int qbits,
+ int transient_key,
+ dsa_domain_t *domain,
+ gcry_mpi_t **ret_factors);
+static gpg_err_code_t sign (gcry_mpi_t r, gcry_mpi_t s, gcry_mpi_t input,
+ DSA_secret_key *skey, int flags, int hashalgo);
+static gpg_err_code_t verify (gcry_mpi_t r, gcry_mpi_t s, gcry_mpi_t input,
+ DSA_public_key *pkey);
+static unsigned int dsa_get_nbits (gcry_sexp_t parms);
+
+
+static void (*progress_cb) (void *,const char *, int, int, int );
+static void *progress_cb_data;
+
+
+void
+_gcry_register_pk_dsa_progress (void (*cb) (void *, const char *,
+ int, int, int),
+ void *cb_data)
+{
+ progress_cb = cb;
+ progress_cb_data = cb_data;
+}
+
+
+static void
+progress (int c)
+{
+ if (progress_cb)
+ progress_cb (progress_cb_data, "pk_dsa", c, 0, 0);
+}
+
+
+/* Check that a freshly generated key actually works. Returns 0 on success. */
+static int
+test_keys (DSA_secret_key *sk, unsigned int qbits)
+{
+ int result = -1; /* Default to failure. */
+ DSA_public_key pk;
+ gcry_mpi_t data = mpi_new (qbits);
+ gcry_mpi_t sig_a = mpi_new (qbits);
+ gcry_mpi_t sig_b = mpi_new (qbits);
+
+ /* Put the relevant parameters into a public key structure. */
+ pk.p = sk->p;
+ pk.q = sk->q;
+ pk.g = sk->g;
+ pk.y = sk->y;
+
+ /* Create a random plaintext. */
+ _gcry_mpi_randomize (data, qbits, GCRY_WEAK_RANDOM);
+
+ /* Sign DATA using the secret key. */
+ sign (sig_a, sig_b, data, sk, 0, 0);
+
+ /* Verify the signature using the public key. */
+ if ( verify (sig_a, sig_b, data, &pk) )
+ goto leave; /* Signature does not match. */
+
+  /* Modify the data and check that the verification now fails.  */
+ mpi_add_ui (data, data, 1);
+ if ( !verify (sig_a, sig_b, data, &pk) )
+ goto leave; /* Signature matches but should not. */
+
+ result = 0; /* The test succeeded. */
+
+ leave:
+ _gcry_mpi_release (sig_b);
+ _gcry_mpi_release (sig_a);
+ _gcry_mpi_release (data);
+ return result;
+}
+
+
+
+/*
+ Generate a DSA key pair with a key of size NBITS. If transient_key
+ is true the key is generated using the standard RNG and not the
+ very secure one.
+
+ Returns: 2 structures filled with all needed values
+ and an array with the n-1 factors of (p-1)
+ */
+static gpg_err_code_t
+generate (DSA_secret_key *sk, unsigned int nbits, unsigned int qbits,
+ int transient_key, dsa_domain_t *domain, gcry_mpi_t **ret_factors )
+{
+ gpg_err_code_t rc;
+ gcry_mpi_t p; /* the prime */
+ gcry_mpi_t q; /* the 160 bit prime factor */
+ gcry_mpi_t g; /* the generator */
+ gcry_mpi_t y; /* g^x mod p */
+ gcry_mpi_t x; /* the secret exponent */
+ gcry_mpi_t h, e; /* helper */
+ unsigned char *rndbuf;
+ gcry_random_level_t random_level;
+
+ if (qbits)
+ ; /* Caller supplied qbits. Use this value. */
+ else if ( nbits >= 512 && nbits <= 1024 )
+ qbits = 160;
+ else if ( nbits == 2048 )
+ qbits = 224;
+ else if ( nbits == 3072 )
+ qbits = 256;
+ else if ( nbits == 7680 )
+ qbits = 384;
+ else if ( nbits == 15360 )
+ qbits = 512;
+ else
+ return GPG_ERR_INV_VALUE;
+
+ if (qbits < 160 || qbits > 512 || (qbits%8) )
+ return GPG_ERR_INV_VALUE;
+ if (nbits < 2*qbits || nbits > 15360)
+ return GPG_ERR_INV_VALUE;
+
+ if (fips_mode ())
+ {
+ if (nbits < 1024)
+ return GPG_ERR_INV_VALUE;
+ if (transient_key)
+ return GPG_ERR_INV_VALUE;
+ }
+
+ if (domain->p && domain->q && domain->g)
+ {
+ /* Domain parameters are given; use them. */
+ p = mpi_copy (domain->p);
+ q = mpi_copy (domain->q);
+ g = mpi_copy (domain->g);
+ gcry_assert (mpi_get_nbits (p) == nbits);
+ gcry_assert (mpi_get_nbits (q) == qbits);
+ h = mpi_alloc (0);
+ e = NULL;
+ }
+ else
+ {
+ /* Generate new domain parameters. */
+ rc = _gcry_generate_elg_prime (1, nbits, qbits, NULL, &p, ret_factors);
+ if (rc)
+ return rc;
+
+ /* Get q out of factors. */
+ q = mpi_copy ((*ret_factors)[0]);
+ gcry_assert (mpi_get_nbits (q) == qbits);
+
+ /* Find a generator g (h and e are helpers).
+ e = (p-1)/q */
+ e = mpi_alloc (mpi_get_nlimbs (p));
+ mpi_sub_ui (e, p, 1);
+ mpi_fdiv_q (e, e, q);
+ g = mpi_alloc (mpi_get_nlimbs (p));
+ h = mpi_alloc_set_ui (1); /* (We start with 2.) */
+ do
+ {
+ mpi_add_ui (h, h, 1);
+ /* g = h^e mod p */
+ mpi_powm (g, h, e, p);
+ }
+ while (!mpi_cmp_ui (g, 1)); /* Continue until g != 1. */
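+      /* Since e = (p-1)/q and q is prime, any g = h^e mod p with
+         g != 1 has order exactly q, i.e. it generates the subgroup
+         of order q.  */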
+ }
+
+ /* Select a random number X with the property:
+ * 0 < x < q-1
+ *
+ * FIXME: Why do we use the requirement x < q-1 ? It should be
+   * sufficient to test for x < q.  FIPS-186-3 checks x < q-1 but it
+ * does not check for 0 < x because it makes sure that Q is unsigned
+ * and finally adds one to the result so that 0 will never be
+ * returned. We should replace the code below with _gcry_dsa_gen_k.
+ *
+ * This must be a very good random number because this is the secret
+ * part. The random quality depends on the transient_key flag. */
+ random_level = transient_key ? GCRY_STRONG_RANDOM : GCRY_VERY_STRONG_RANDOM;
+ if (DBG_CIPHER)
+ log_debug("choosing a random x%s\n", transient_key? " (transient-key)":"");
+ gcry_assert( qbits >= 160 );
+ x = mpi_alloc_secure( mpi_get_nlimbs(q) );
+ mpi_sub_ui( h, q, 1 ); /* put q-1 into h */
+ rndbuf = NULL;
+ do
+ {
+ if( DBG_CIPHER )
+ progress('.');
+ if( !rndbuf )
+ rndbuf = _gcry_random_bytes_secure ((qbits+7)/8, random_level);
+ else
+ { /* Change only some of the higher bits (= 2 bytes)*/
+ char *r = _gcry_random_bytes_secure (2, random_level);
+ memcpy(rndbuf, r, 2 );
+ xfree(r);
+ }
+
+ _gcry_mpi_set_buffer( x, rndbuf, (qbits+7)/8, 0 );
+ mpi_clear_highbit( x, qbits+1 );
+ }
+ while ( !( mpi_cmp_ui( x, 0 )>0 && mpi_cmp( x, h )<0 ) );
+ xfree(rndbuf);
+ mpi_free( e );
+ mpi_free( h );
+
+ /* y = g^x mod p */
+ y = mpi_alloc( mpi_get_nlimbs(p) );
+ mpi_powm (y, g, x, p);
+
+ if( DBG_CIPHER )
+ {
+ progress('\n');
+ log_mpidump("dsa p", p );
+ log_mpidump("dsa q", q );
+ log_mpidump("dsa g", g );
+ log_mpidump("dsa y", y );
+ log_mpidump("dsa x", x );
+ }
+
+ /* Copy the stuff to the key structures. */
+ sk->p = p;
+ sk->q = q;
+ sk->g = g;
+ sk->y = y;
+ sk->x = x;
+
+ /* Now we can test our keys (this should never fail!). */
+ if ( test_keys (sk, qbits) )
+ {
+ _gcry_mpi_release (sk->p); sk->p = NULL;
+ _gcry_mpi_release (sk->q); sk->q = NULL;
+ _gcry_mpi_release (sk->g); sk->g = NULL;
+ _gcry_mpi_release (sk->y); sk->y = NULL;
+ _gcry_mpi_release (sk->x); sk->x = NULL;
+ fips_signal_error ("self-test after key generation failed");
+ return GPG_ERR_SELFTEST_FAILED;
+ }
+ return 0;
+}
+
+
+/* Generate a DSA key pair with a key of size NBITS using the
+ algorithm given in FIPS-186-3. If USE_FIPS186_2 is true,
+ FIPS-186-2 is used and thus the length is restricted to 1024/160.
+ If DERIVEPARMS is not NULL it may contain a seed value. If domain
+ parameters are specified in DOMAIN, DERIVEPARMS may not be given
+ and NBITS and QBITS must match the specified domain parameters. */
+static gpg_err_code_t
+generate_fips186 (DSA_secret_key *sk, unsigned int nbits, unsigned int qbits,
+ gcry_sexp_t deriveparms, int use_fips186_2,
+ dsa_domain_t *domain,
+ int *r_counter, void **r_seed, size_t *r_seedlen,
+ gcry_mpi_t *r_h)
+{
+ gpg_err_code_t ec;
+ struct {
+ gcry_sexp_t sexp;
+ const void *seed;
+ size_t seedlen;
+ } initial_seed = { NULL, NULL, 0 };
+ gcry_mpi_t prime_q = NULL;
+ gcry_mpi_t prime_p = NULL;
+ gcry_mpi_t value_g = NULL; /* The generator. */
+ gcry_mpi_t value_y = NULL; /* g^x mod p */
+ gcry_mpi_t value_x = NULL; /* The secret exponent. */
+ gcry_mpi_t value_h = NULL; /* Helper. */
+ gcry_mpi_t value_e = NULL; /* Helper. */
+ gcry_mpi_t value_c = NULL; /* helper for x */
+ gcry_mpi_t value_qm2 = NULL; /* q - 2 */
+
+ /* Preset return values. */
+ *r_counter = 0;
+ *r_seed = NULL;
+ *r_seedlen = 0;
+ *r_h = NULL;
+
+ /* Derive QBITS from NBITS if requested */
+ if (!qbits)
+ {
+ if (nbits == 1024)
+ qbits = 160;
+ else if (nbits == 2048)
+ qbits = 224;
+ else if (nbits == 3072)
+ qbits = 256;
+ }
+
+ /* Check that QBITS and NBITS match the standard. Note that FIPS
+ 186-3 uses N for QBITS and L for NBITS. */
+ if (nbits == 1024 && qbits == 160 && use_fips186_2)
+ ; /* Allowed in FIPS 186-2 mode. */
+ else if (nbits == 2048 && qbits == 224)
+ ;
+ else if (nbits == 2048 && qbits == 256)
+ ;
+ else if (nbits == 3072 && qbits == 256)
+ ;
+ else
+ return GPG_ERR_INV_VALUE;
+
+ if (domain->p && domain->q && domain->g)
+ {
+ /* Domain parameters are given; use them. */
+ prime_p = mpi_copy (domain->p);
+ prime_q = mpi_copy (domain->q);
+ value_g = mpi_copy (domain->g);
+ gcry_assert (mpi_get_nbits (prime_p) == nbits);
+ gcry_assert (mpi_get_nbits (prime_q) == qbits);
+ gcry_assert (!deriveparms);
+ ec = 0;
+ }
+ else
+ {
+ /* Generate new domain parameters. */
+
+ /* Get an initial seed value. */
+ if (deriveparms)
+ {
+ initial_seed.sexp = sexp_find_token (deriveparms, "seed", 0);
+ if (initial_seed.sexp)
+ initial_seed.seed = sexp_nth_data (initial_seed.sexp, 1,
+ &initial_seed.seedlen);
+ }
+
+ if (use_fips186_2)
+ ec = _gcry_generate_fips186_2_prime (nbits, qbits,
+ initial_seed.seed,
+ initial_seed.seedlen,
+ &prime_q, &prime_p,
+ r_counter,
+ r_seed, r_seedlen);
+ else
+ ec = _gcry_generate_fips186_3_prime (nbits, qbits,
+ initial_seed.seed,
+ initial_seed.seedlen,
+ &prime_q, &prime_p,
+ r_counter,
+ r_seed, r_seedlen, NULL);
+ sexp_release (initial_seed.sexp);
+ if (ec)
+ goto leave;
+
+ /* Find a generator g (h and e are helpers).
+ * e = (p-1)/q
+ */
+ value_e = mpi_alloc_like (prime_p);
+ mpi_sub_ui (value_e, prime_p, 1);
+ mpi_fdiv_q (value_e, value_e, prime_q );
+ value_g = mpi_alloc_like (prime_p);
+ value_h = mpi_alloc_set_ui (1);
+ do
+ {
+ mpi_add_ui (value_h, value_h, 1);
+ /* g = h^e mod p */
+ mpi_powm (value_g, value_h, value_e, prime_p);
+ }
+ while (!mpi_cmp_ui (value_g, 1)); /* Continue until g != 1. */
+ }
+
+ value_c = mpi_snew (qbits);
+ value_x = mpi_snew (qbits);
+ value_qm2 = mpi_snew (qbits);
+ mpi_sub_ui (value_qm2, prime_q, 2);
+
+ /* FIPS 186-4 B.1.2 steps 4-6 */
+ do
+ {
+ if( DBG_CIPHER )
+ progress('.');
+ _gcry_mpi_randomize (value_c, qbits, GCRY_VERY_STRONG_RANDOM);
+ mpi_clear_highbit (value_c, qbits+1);
+ }
+ while (!(mpi_cmp_ui (value_c, 0) > 0 && mpi_cmp (value_c, value_qm2) < 0));
+ /* while (mpi_cmp (value_c, value_qm2) > 0); */
+
+ /* x = c + 1 */
+ mpi_add_ui(value_x, value_c, 1);
+
+ /* y = g^x mod p */
+ value_y = mpi_alloc_like (prime_p);
+ mpi_powm (value_y, value_g, value_x, prime_p);
+
+ if (DBG_CIPHER)
+ {
+ progress('\n');
+ log_mpidump("dsa p", prime_p );
+ log_mpidump("dsa q", prime_q );
+ log_mpidump("dsa g", value_g );
+ log_mpidump("dsa y", value_y );
+ log_mpidump("dsa x", value_x );
+ log_mpidump("dsa h", value_h );
+ }
+
+ /* Copy the stuff to the key structures. */
+ sk->p = prime_p; prime_p = NULL;
+ sk->q = prime_q; prime_q = NULL;
+ sk->g = value_g; value_g = NULL;
+ sk->y = value_y; value_y = NULL;
+ sk->x = value_x; value_x = NULL;
+ *r_h = value_h; value_h = NULL;
+
+ leave:
+ _gcry_mpi_release (prime_p);
+ _gcry_mpi_release (prime_q);
+ _gcry_mpi_release (value_g);
+ _gcry_mpi_release (value_y);
+ _gcry_mpi_release (value_x);
+ _gcry_mpi_release (value_h);
+ _gcry_mpi_release (value_e);
+ _gcry_mpi_release (value_c);
+ _gcry_mpi_release (value_qm2);
+
+ /* As a last step, test the key (this should of course never fail). */
+ if (!ec && test_keys (sk, qbits) )
+ {
+ _gcry_mpi_release (sk->p); sk->p = NULL;
+ _gcry_mpi_release (sk->q); sk->q = NULL;
+ _gcry_mpi_release (sk->g); sk->g = NULL;
+ _gcry_mpi_release (sk->y); sk->y = NULL;
+ _gcry_mpi_release (sk->x); sk->x = NULL;
+ fips_signal_error ("self-test after key generation failed");
+ ec = GPG_ERR_SELFTEST_FAILED;
+ }
+
+ if (ec)
+ {
+ *r_counter = 0;
+ xfree (*r_seed); *r_seed = NULL;
+ *r_seedlen = 0;
+ _gcry_mpi_release (*r_h); *r_h = NULL;
+ }
+
+ return ec;
+}
+
+
+
+/*
+ Test whether the secret key is valid.
+ Returns: true if this is a valid key.
+ */
+static int
+check_secret_key( DSA_secret_key *sk )
+{
+ int rc;
+ gcry_mpi_t y = mpi_alloc( mpi_get_nlimbs(sk->y) );
+
+ mpi_powm( y, sk->g, sk->x, sk->p );
+ rc = !mpi_cmp( y, sk->y );
+ mpi_free( y );
+ return rc;
+}
+
+
+
+/*
+ Make a DSA signature from INPUT and put it into r and s.
+
+ INPUT may either be a plain MPI or an opaque MPI which is then
+ internally converted to a plain MPI. FLAGS and HASHALGO may both
+ be 0 for standard operation mode.
+
+ The return value is 0 on success or an error code. Note that for
+ backward compatibility the function will not return any error if
+ FLAGS and HASHALGO are both 0 and INPUT is a plain MPI.
+ */
+static gpg_err_code_t
+sign (gcry_mpi_t r, gcry_mpi_t s, gcry_mpi_t input, DSA_secret_key *skey,
+ int flags, int hashalgo)
+{
+ gpg_err_code_t rc;
+ gcry_mpi_t hash;
+ gcry_mpi_t k;
+ gcry_mpi_t kinv;
+ gcry_mpi_t tmp;
+ const void *abuf;
+ unsigned int abits, qbits;
+ int extraloops = 0;
+
+ qbits = mpi_get_nbits (skey->q);
+
+ /* Convert the INPUT into an MPI. */
+ rc = _gcry_dsa_normalize_hash (input, &hash, qbits);
+ if (rc)
+ return rc;
+
+ again:
+ /* Create the K value. */
+ if ((flags & PUBKEY_FLAG_RFC6979) && hashalgo)
+ {
+ /* Use Pornin's method for deterministic DSA. If this flag is
+ set, it is expected that INPUT is an opaque MPI holding the
+ hash to be signed. That hash is also used as h1 from step
+ 3.2.a of RFC 6979. */
+ if (!mpi_is_opaque (input))
+ {
+ rc = GPG_ERR_CONFLICT;
+ goto leave;
+ }
+
+ abuf = mpi_get_opaque (input, &abits);
+ rc = _gcry_dsa_gen_rfc6979_k (&k, skey->q, skey->x,
+ abuf, (abits+7)/8, hashalgo, extraloops);
+ if (rc)
+ goto leave;
+ }
+ else
+ {
+ /* Select a random k with 0 < k < q */
+ k = _gcry_dsa_gen_k (skey->q, GCRY_STRONG_RANDOM);
+ }
+
+ /* kinv = k^(-1) mod q */
+ kinv = mpi_alloc( mpi_get_nlimbs(k) );
+ mpi_invm(kinv, k, skey->q );
+
+ _gcry_dsa_modify_k (k, skey->q, qbits);
+
+ /* r = (a^k mod p) mod q */
+ mpi_powm( r, skey->g, k, skey->p );
+ mpi_fdiv_r( r, r, skey->q );
+
+ /* s = (kinv * ( hash + x * r)) mod q */
+ tmp = mpi_alloc( mpi_get_nlimbs(skey->p) );
+ mpi_mul( tmp, skey->x, r );
+ mpi_add( tmp, tmp, hash );
+ mpi_mulm( s , kinv, tmp, skey->q );
+
+ mpi_free(k);
+ mpi_free(kinv);
+ mpi_free(tmp);
+
+ if (!mpi_cmp_ui (r, 0))
+ {
+ /* This is a highly unlikely code path. */
+ extraloops++;
+ goto again;
+ }
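+ /* In the RFC 6979 case the incremented EXTRALOOPS lets the k
+ derivation produce a different deterministic k on the retry;
+ with a random k a fresh value is simply drawn. */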
+
+ rc = 0;
+
+ leave:
+ if (hash != input)
+ mpi_free (hash);
+
+ return rc;
+}
+
+
+/*
+ Returns true if the signature composed from R and S is valid.
+ */
+static gpg_err_code_t
+verify (gcry_mpi_t r, gcry_mpi_t s, gcry_mpi_t input, DSA_public_key *pkey )
+{
+ gpg_err_code_t rc = 0;
+ gcry_mpi_t w, u1, u2, v;
+ gcry_mpi_t base[3];
+ gcry_mpi_t ex[3];
+ gcry_mpi_t hash;
+ unsigned int nbits;
+
+ if( !(mpi_cmp_ui( r, 0 ) > 0 && mpi_cmp( r, pkey->q ) < 0) )
+ return GPG_ERR_BAD_SIGNATURE; /* Assertion 0 < r < n failed. */
+ if( !(mpi_cmp_ui( s, 0 ) > 0 && mpi_cmp( s, pkey->q ) < 0) )
+ return GPG_ERR_BAD_SIGNATURE; /* Assertion 0 < s < n failed. */
+
+ nbits = mpi_get_nbits (pkey->q);
+ rc = _gcry_dsa_normalize_hash (input, &hash, nbits);
+ if (rc)
+ return rc;
+
+ w = mpi_alloc( mpi_get_nlimbs(pkey->q) );
+ u1 = mpi_alloc( mpi_get_nlimbs(pkey->q) );
+ u2 = mpi_alloc( mpi_get_nlimbs(pkey->q) );
+ v = mpi_alloc( mpi_get_nlimbs(pkey->p) );
+
+ /* w = s^(-1) mod q */
+ mpi_invm( w, s, pkey->q );
+
+ /* u1 = (hash * w) mod q */
+ mpi_mulm( u1, hash, w, pkey->q );
+
+ /* u2 = r * w mod q */
+ mpi_mulm( u2, r, w, pkey->q );
+
+ /* v = g^u1 * y^u2 mod p mod q */
+ base[0] = pkey->g; ex[0] = u1;
+ base[1] = pkey->y; ex[1] = u2;
+ base[2] = NULL; ex[2] = NULL;
+ mpi_mulpowm( v, base, ex, pkey->p );
+ mpi_fdiv_r( v, v, pkey->q );
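+ /* For a valid signature s = k^-1 * (hash + x*r) mod q, so
+ u1 + x*u2 = w * (hash + x*r) = k (mod q) and therefore
+ g^u1 * y^u2 = g^k (mod p); hence V must equal R. */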
+
+ if (mpi_cmp( v, r ))
+ {
+ if (DBG_CIPHER)
+ {
+ log_mpidump (" i", input);
+ log_mpidump (" h", hash);
+ log_mpidump (" v", v);
+ log_mpidump (" r", r);
+ log_mpidump (" s", s);
+ }
+ rc = GPG_ERR_BAD_SIGNATURE;
+ }
+
+ mpi_free(w);
+ mpi_free(u1);
+ mpi_free(u2);
+ mpi_free(v);
+ if (hash != input)
+ mpi_free (hash);
+
+ return rc;
+}
+
+
+/*********************************************
+ ************** interface ******************
+ *********************************************/
+
+static gcry_err_code_t
+dsa_generate (const gcry_sexp_t genparms, gcry_sexp_t *r_skey)
+{
+ gpg_err_code_t rc;
+ unsigned int nbits;
+ gcry_sexp_t domainsexp;
+ DSA_secret_key sk;
+ gcry_sexp_t l1;
+ unsigned int qbits = 0;
+ gcry_sexp_t deriveparms = NULL;
+ gcry_sexp_t seedinfo = NULL;
+ gcry_sexp_t misc_info = NULL;
+ int flags = 0;
+ dsa_domain_t domain;
+ gcry_mpi_t *factors = NULL;
+
+ memset (&sk, 0, sizeof sk);
+ memset (&domain, 0, sizeof domain);
+
+ rc = _gcry_pk_util_get_nbits (genparms, &nbits);
+ if (rc)
+ return rc;
+
+ /* Parse the optional flags list. */
+ l1 = sexp_find_token (genparms, "flags", 0);
+ if (l1)
+ {
+ rc = _gcry_pk_util_parse_flaglist (l1, &flags, NULL);
+ sexp_release (l1);
+ if (rc)
+ return rc;
+ }
+
+ /* Parse the optional qbits element. */
+ l1 = sexp_find_token (genparms, "qbits", 0);
+ if (l1)
+ {
+ char buf[50];
+ const char *s;
+ size_t n;
+
+ s = sexp_nth_data (l1, 1, &n);
+ if (!s || n >= DIM (buf) - 1 )
+ {
+ sexp_release (l1);
+ return GPG_ERR_INV_OBJ; /* No value or value too large. */
+ }
+ memcpy (buf, s, n);
+ buf[n] = 0;
+ qbits = (unsigned int)strtoul (buf, NULL, 0);
+ sexp_release (l1);
+ }
+
+ /* Parse the optional transient-key flag. */
+ if (!(flags & PUBKEY_FLAG_TRANSIENT_KEY))
+ {
+ l1 = sexp_find_token (genparms, "transient-key", 0);
+ if (l1)
+ {
+ flags |= PUBKEY_FLAG_TRANSIENT_KEY;
+ sexp_release (l1);
+ }
+ }
+
+ /* Get the optional derive parameters. */
+ deriveparms = sexp_find_token (genparms, "derive-parms", 0);
+
+ /* Parse the optional "use-fips186" flags. */
+ if (!(flags & PUBKEY_FLAG_USE_FIPS186))
+ {
+ l1 = sexp_find_token (genparms, "use-fips186", 0);
+ if (l1)
+ {
+ flags |= PUBKEY_FLAG_USE_FIPS186;
+ sexp_release (l1);
+ }
+ }
+ if (!(flags & PUBKEY_FLAG_USE_FIPS186_2))
+ {
+ l1 = sexp_find_token (genparms, "use-fips186-2", 0);
+ if (l1)
+ {
+ flags |= PUBKEY_FLAG_USE_FIPS186_2;
+ sexp_release (l1);
+ }
+ }
+
+ /* Check whether domain parameters are given. */
+ domainsexp = sexp_find_token (genparms, "domain", 0);
+ if (domainsexp)
+ {
+ /* DERIVEPARMS can't be used together with domain parameters.
+ NBITS and QBITS may not be specified because their values
+ are derived from the domain parameters. */
+ if (deriveparms || qbits || nbits)
+ {
+ sexp_release (domainsexp);
+ sexp_release (deriveparms);
+ return GPG_ERR_INV_VALUE;
+ }
+
+ /* Put all domain parameters into the domain object. */
+ l1 = sexp_find_token (domainsexp, "p", 0);
+ domain.p = sexp_nth_mpi (l1, 1, GCRYMPI_FMT_USG);
+ sexp_release (l1);
+ l1 = sexp_find_token (domainsexp, "q", 0);
+ domain.q = sexp_nth_mpi (l1, 1, GCRYMPI_FMT_USG);
+ sexp_release (l1);
+ l1 = sexp_find_token (domainsexp, "g", 0);
+ domain.g = sexp_nth_mpi (l1, 1, GCRYMPI_FMT_USG);
+ sexp_release (l1);
+ sexp_release (domainsexp);
+
+ /* Check that all domain parameters are available. */
+ if (!domain.p || !domain.q || !domain.g)
+ {
+ _gcry_mpi_release (domain.p);
+ _gcry_mpi_release (domain.q);
+ _gcry_mpi_release (domain.g);
+ sexp_release (deriveparms);
+ return GPG_ERR_MISSING_VALUE;
+ }
+
+ /* Get NBITS and QBITS from the domain parameters. */
+ nbits = mpi_get_nbits (domain.p);
+ qbits = mpi_get_nbits (domain.q);
+ }
+
+ if (deriveparms
+ || (flags & PUBKEY_FLAG_USE_FIPS186)
+ || (flags & PUBKEY_FLAG_USE_FIPS186_2)
+ || fips_mode ())
+ {
+ int counter;
+ void *seed;
+ size_t seedlen;
+ gcry_mpi_t h_value;
+
+ rc = generate_fips186 (&sk, nbits, qbits, deriveparms,
+ !!(flags & PUBKEY_FLAG_USE_FIPS186_2),
+ &domain,
+ &counter, &seed, &seedlen, &h_value);
+ if (!rc && h_value)
+ {
+ /* Format the seed values unless domain parameters were used,
+ which is indicated by an H_VALUE of NULL. */
+ rc = sexp_build (&seedinfo, NULL,
+ "(seed-values(counter %d)(seed %b)(h %m))",
+ counter, (int)seedlen, seed, h_value);
+ xfree (seed);
+ _gcry_mpi_release (h_value);
+ }
+ }
+ else
+ {
+ rc = generate (&sk, nbits, qbits,
+ !!(flags & PUBKEY_FLAG_TRANSIENT_KEY),
+ &domain, &factors);
+ }
+
+ if (!rc)
+ {
+ /* Put the factors into MISC_INFO. Note that the factors are
+ not confidential thus we can store them in standard memory. */
+ int nfactors, i, j;
+ char *p;
+ char *format = NULL;
+ void **arg_list = NULL;
+
+ for (nfactors=0; factors && factors[nfactors]; nfactors++)
+ ;
+ /* Allocate space for the format string:
+ "(misc-key-info%S(pm1-factors%m))"
+ with one "%m" for each factor and construct it. */
+ format = xtrymalloc (50 + 2*nfactors);
+ if (!format)
+ rc = gpg_err_code_from_syserror ();
+ else
+ {
+ p = stpcpy (format, "(misc-key-info");
+ if (seedinfo)
+ p = stpcpy (p, "%S");
+ if (nfactors)
+ {
+ p = stpcpy (p, "(pm1-factors");
+ for (i=0; i < nfactors; i++)
+ p = stpcpy (p, "%m");
+ p = stpcpy (p, ")");
+ }
+ p = stpcpy (p, ")");
+
+ /* Allocate space for the list of factors plus one for the
+ seedinfo s-exp plus an extra NULL entry for safety and
+ fill it with the factors. */
+ arg_list = xtrycalloc (nfactors+1+1, sizeof *arg_list);
+ if (!arg_list)
+ rc = gpg_err_code_from_syserror ();
+ else
+ {
+ i = 0;
+ if (seedinfo)
+ arg_list[i++] = &seedinfo;
+ for (j=0; j < nfactors; j++)
+ arg_list[i++] = factors + j;
+ arg_list[i] = NULL;
+
+ rc = sexp_build_array (&misc_info, NULL, format, arg_list);
+ }
+ }
+
+ xfree (arg_list);
+ xfree (format);
+ }
+
+ if (!rc)
+ rc = sexp_build (r_skey, NULL,
+ "(key-data"
+ " (public-key"
+ " (dsa(p%m)(q%m)(g%m)(y%m)))"
+ " (private-key"
+ " (dsa(p%m)(q%m)(g%m)(y%m)(x%m)))"
+ " %S)",
+ sk.p, sk.q, sk.g, sk.y,
+ sk.p, sk.q, sk.g, sk.y, sk.x,
+ misc_info);
+
+
+ _gcry_mpi_release (sk.p);
+ _gcry_mpi_release (sk.q);
+ _gcry_mpi_release (sk.g);
+ _gcry_mpi_release (sk.y);
+ _gcry_mpi_release (sk.x);
+
+ _gcry_mpi_release (domain.p);
+ _gcry_mpi_release (domain.q);
+ _gcry_mpi_release (domain.g);
+
+ sexp_release (seedinfo);
+ sexp_release (misc_info);
+ sexp_release (deriveparms);
+ if (factors)
+ {
+ gcry_mpi_t *mp;
+ for (mp = factors; *mp; mp++)
+ mpi_free (*mp);
+ xfree (factors);
+ }
+ return rc;
+}
+
+
+
+static gcry_err_code_t
+dsa_check_secret_key (gcry_sexp_t keyparms)
+{
+ gcry_err_code_t rc;
+ DSA_secret_key sk = {NULL, NULL, NULL, NULL, NULL};
+
+ rc = _gcry_sexp_extract_param (keyparms, NULL, "pqgyx",
+ &sk.p, &sk.q, &sk.g, &sk.y, &sk.x,
+ NULL);
+ if (rc)
+ goto leave;
+
+ if (!check_secret_key (&sk))
+ rc = GPG_ERR_BAD_SECKEY;
+
+ leave:
+ _gcry_mpi_release (sk.p);
+ _gcry_mpi_release (sk.q);
+ _gcry_mpi_release (sk.g);
+ _gcry_mpi_release (sk.y);
+ _gcry_mpi_release (sk.x);
+ if (DBG_CIPHER)
+ log_debug ("dsa_testkey => %s\n", gpg_strerror (rc));
+ return rc;
+}
+
+
+static gcry_err_code_t
+dsa_sign (gcry_sexp_t *r_sig, gcry_sexp_t s_data, gcry_sexp_t keyparms)
+{
+ gcry_err_code_t rc;
+ struct pk_encoding_ctx ctx;
+ gcry_mpi_t data = NULL;
+ DSA_secret_key sk = {NULL, NULL, NULL, NULL, NULL};
+ gcry_mpi_t sig_r = NULL;
+ gcry_mpi_t sig_s = NULL;
+
+ _gcry_pk_util_init_encoding_ctx (&ctx, PUBKEY_OP_SIGN,
+ dsa_get_nbits (keyparms));
+
+ /* Extract the data. */
+ rc = _gcry_pk_util_data_to_mpi (s_data, &data, &ctx);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ log_mpidump ("dsa_sign data", data);
+
+ /* Extract the key. */
+ rc = _gcry_sexp_extract_param (keyparms, NULL, "pqgyx",
+ &sk.p, &sk.q, &sk.g, &sk.y, &sk.x, NULL);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ {
+ log_mpidump ("dsa_sign p", sk.p);
+ log_mpidump ("dsa_sign q", sk.q);
+ log_mpidump ("dsa_sign g", sk.g);
+ log_mpidump ("dsa_sign y", sk.y);
+ if (!fips_mode ())
+ log_mpidump ("dsa_sign x", sk.x);
+ }
+
+ sig_r = mpi_new (0);
+ sig_s = mpi_new (0);
+ rc = sign (sig_r, sig_s, data, &sk, ctx.flags, ctx.hash_algo);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ {
+ log_mpidump ("dsa_sign sig_r", sig_r);
+ log_mpidump ("dsa_sign sig_s", sig_s);
+ }
+ rc = sexp_build (r_sig, NULL, "(sig-val(dsa(r%M)(s%M)))", sig_r, sig_s);
+
+ leave:
+ _gcry_mpi_release (sig_r);
+ _gcry_mpi_release (sig_s);
+ _gcry_mpi_release (sk.p);
+ _gcry_mpi_release (sk.q);
+ _gcry_mpi_release (sk.g);
+ _gcry_mpi_release (sk.y);
+ _gcry_mpi_release (sk.x);
+ _gcry_mpi_release (data);
+ _gcry_pk_util_free_encoding_ctx (&ctx);
+ if (DBG_CIPHER)
+ log_debug ("dsa_sign => %s\n", gpg_strerror (rc));
+ return rc;
+}
+
+
+static gcry_err_code_t
+dsa_verify (gcry_sexp_t s_sig, gcry_sexp_t s_data, gcry_sexp_t s_keyparms)
+{
+ gcry_err_code_t rc;
+ struct pk_encoding_ctx ctx;
+ gcry_sexp_t l1 = NULL;
+ gcry_mpi_t sig_r = NULL;
+ gcry_mpi_t sig_s = NULL;
+ gcry_mpi_t data = NULL;
+ DSA_public_key pk = { NULL, NULL, NULL, NULL };
+
+ _gcry_pk_util_init_encoding_ctx (&ctx, PUBKEY_OP_VERIFY,
+ dsa_get_nbits (s_keyparms));
+
+ /* Extract the data. */
+ rc = _gcry_pk_util_data_to_mpi (s_data, &data, &ctx);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ log_mpidump ("dsa_verify data", data);
+
+ /* Extract the signature value. */
+ rc = _gcry_pk_util_preparse_sigval (s_sig, dsa_names, &l1, NULL);
+ if (rc)
+ goto leave;
+ rc = _gcry_sexp_extract_param (l1, NULL, "rs", &sig_r, &sig_s, NULL);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ {
+ log_mpidump ("dsa_verify s_r", sig_r);
+ log_mpidump ("dsa_verify s_s", sig_s);
+ }
+
+ /* Extract the key. */
+ rc = _gcry_sexp_extract_param (s_keyparms, NULL, "pqgy",
+ &pk.p, &pk.q, &pk.g, &pk.y, NULL);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ {
+ log_mpidump ("dsa_verify p", pk.p);
+ log_mpidump ("dsa_verify q", pk.q);
+ log_mpidump ("dsa_verify g", pk.g);
+ log_mpidump ("dsa_verify y", pk.y);
+ }
+
+ /* Verify the signature. */
+ rc = verify (sig_r, sig_s, data, &pk);
+
+ leave:
+ _gcry_mpi_release (pk.p);
+ _gcry_mpi_release (pk.q);
+ _gcry_mpi_release (pk.g);
+ _gcry_mpi_release (pk.y);
+ _gcry_mpi_release (data);
+ _gcry_mpi_release (sig_r);
+ _gcry_mpi_release (sig_s);
+ sexp_release (l1);
+ _gcry_pk_util_free_encoding_ctx (&ctx);
+ if (DBG_CIPHER)
+ log_debug ("dsa_verify => %s\n", rc?gpg_strerror (rc):"Good");
+ return rc;
+}
+
+
+/* Return the number of bits for the key described by PARMS. On error
+ * 0 is returned. The format of PARMS starts with the algorithm name;
+ * for example:
+ *
+ * (dsa
+ * (p <mpi>)
+ * (q <mpi>)
+ * (g <mpi>)
+ * (y <mpi>))
+ *
+ * More parameters may be given but we only need P here.
+ */
+static unsigned int
+dsa_get_nbits (gcry_sexp_t parms)
+{
+ gcry_sexp_t l1;
+ gcry_mpi_t p;
+ unsigned int nbits;
+
+ l1 = sexp_find_token (parms, "p", 1);
+ if (!l1)
+ return 0; /* Parameter P not found. */
+
+ p = sexp_nth_mpi (l1, 1, GCRYMPI_FMT_USG);
+ sexp_release (l1);
+ nbits = p? mpi_get_nbits (p) : 0;
+ _gcry_mpi_release (p);
+ return nbits;
+}
+
+
+
+/*
+ Self-test section.
+ */
+
+static const char *
+selftest_sign (gcry_sexp_t pkey, gcry_sexp_t skey)
+{
+ /* Sample data from RFC 6979 section A.2.2, hash is of message "sample" */
+ static const char sample_data[] =
+ "(data (flags rfc6979)"
+ " (hash sha256 #af2bdbe1aa9b6ec1e2ade1d694f41fc71a831d0268e9891562113d8a62add1bf#))";
+ static const char sample_data_bad[] =
+ "(data (flags rfc6979)"
+ " (hash sha256 #bf2bdbe1aa9b6ec1e2ade1d694f41fc71a831d0268e9891562113d8a62add1bf#))";
+ static const char signature_r[] =
+ "eace8bdbbe353c432a795d9ec556c6d021f7a03f42c36e9bc87e4ac7932cc809";
+ static const char signature_s[] =
+ "7081e175455f9247b812b74583e9e94f9ea79bd640dc962533b0680793a38d53";
+
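+ /* Because the rfc6979 flag makes the nonce k a deterministic
+ function of the private key and the message hash, the signature
+ computed below is reproducible and can be compared against the
+ fixed R and S values above. */
+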
+ const char *errtxt = NULL;
+ gcry_error_t err;
+ gcry_sexp_t data = NULL;
+ gcry_sexp_t data_bad = NULL;
+ gcry_sexp_t sig = NULL;
+ gcry_sexp_t l1 = NULL;
+ gcry_sexp_t l2 = NULL;
+ gcry_mpi_t r = NULL;
+ gcry_mpi_t s = NULL;
+ gcry_mpi_t calculated_r = NULL;
+ gcry_mpi_t calculated_s = NULL;
+ int cmp;
+
+ err = sexp_sscan (&data, NULL, sample_data, strlen (sample_data));
+ if (!err)
+ err = sexp_sscan (&data_bad, NULL,
+ sample_data_bad, strlen (sample_data_bad));
+ if (!err)
+ err = _gcry_mpi_scan (&r, GCRYMPI_FMT_HEX, signature_r, 0, NULL);
+ if (!err)
+ err = _gcry_mpi_scan (&s, GCRYMPI_FMT_HEX, signature_s, 0, NULL);
+
+ if (err)
+ {
+ errtxt = "converting data failed";
+ goto leave;
+ }
+
+ err = _gcry_pk_sign (&sig, data, skey);
+ if (err)
+ {
+ errtxt = "signing failed";
+ goto leave;
+ }
+
+ /* check against known signature */
+ errtxt = "signature validity failed";
+ l1 = _gcry_sexp_find_token (sig, "sig-val", 0);
+ if (!l1)
+ goto leave;
+ l2 = _gcry_sexp_find_token (l1, "dsa", 0);
+ if (!l2)
+ goto leave;
+
+ sexp_release (l1);
+ l1 = l2;
+
+ l2 = _gcry_sexp_find_token (l1, "r", 0);
+ if (!l2)
+ goto leave;
+ calculated_r = _gcry_sexp_nth_mpi (l2, 1, GCRYMPI_FMT_USG);
+ if (!calculated_r)
+ goto leave;
+
+ sexp_release (l2);
+ l2 = _gcry_sexp_find_token (l1, "s", 0);
+ if (!l2)
+ goto leave;
+ calculated_s = _gcry_sexp_nth_mpi (l2, 1, GCRYMPI_FMT_USG);
+ if (!calculated_s)
+ goto leave;
+
+ errtxt = "known sig check failed";
+
+ cmp = _gcry_mpi_cmp (r, calculated_r);
+ if (cmp)
+ goto leave;
+ cmp = _gcry_mpi_cmp (s, calculated_s);
+ if (cmp)
+ goto leave;
+
+ errtxt = NULL;
+
+
+ err = _gcry_pk_verify (sig, data, pkey);
+ if (err)
+ {
+ errtxt = "verify failed";
+ goto leave;
+ }
+ err = _gcry_pk_verify (sig, data_bad, pkey);
+ if (gcry_err_code (err) != GPG_ERR_BAD_SIGNATURE)
+ {
+ errtxt = "bad signature not detected";
+ goto leave;
+ }
+
+
+ leave:
+ _gcry_mpi_release (calculated_s);
+ _gcry_mpi_release (calculated_r);
+ _gcry_mpi_release (s);
+ _gcry_mpi_release (r);
+ sexp_release (l2);
+ sexp_release (l1);
+ sexp_release (sig);
+ sexp_release (data_bad);
+ sexp_release (data);
+ return errtxt;
+}
+
+
+static gpg_err_code_t
+selftests_dsa_2048 (selftest_report_func_t report)
+{
+ const char *what;
+ const char *errtxt;
+ gcry_error_t err;
+ gcry_sexp_t skey = NULL;
+ gcry_sexp_t pkey = NULL;
+
+ /* Convert the S-expressions into the internal representation. */
+ what = "convert";
+ err = sexp_sscan (&skey, NULL, sample_secret_key_2048, strlen (sample_secret_key_2048));
+ if (!err)
+ err = sexp_sscan (&pkey, NULL,
+ sample_public_key_2048, strlen (sample_public_key_2048));
+ if (err)
+ {
+ errtxt = _gcry_strerror (err);
+ goto failed;
+ }
+
+ what = "key consistency";
+ err = _gcry_pk_testkey (skey);
+ if (err)
+ {
+ errtxt = _gcry_strerror (err);
+ goto failed;
+ }
+
+ what = "sign";
+ errtxt = selftest_sign (pkey, skey);
+ if (errtxt)
+ goto failed;
+
+ sexp_release (pkey);
+ sexp_release (skey);
+ return 0; /* Succeeded. */
+
+ failed:
+ sexp_release (pkey);
+ sexp_release (skey);
+ if (report)
+ report ("pubkey", GCRY_PK_DSA, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+/* Run a full self-test for ALGO and return 0 on success. */
+static gpg_err_code_t
+run_selftests (int algo, int extended, selftest_report_func_t report)
+{
+ gpg_err_code_t ec;
+
+ (void)extended;
+
+ switch (algo)
+ {
+ case GCRY_PK_DSA:
+ ec = selftests_dsa_2048 (report);
+ break;
+ default:
+ ec = GPG_ERR_PUBKEY_ALGO;
+ break;
+
+ }
+ return ec;
+}
+
+
+
+gcry_pk_spec_t _gcry_pubkey_spec_dsa =
+ {
+ GCRY_PK_DSA, { 0, 1 },
+ GCRY_PK_USAGE_SIGN,
+ "DSA", dsa_names,
+ "pqgy", "pqgyx", "", "rs", "pqgy",
+ dsa_generate,
+ dsa_check_secret_key,
+ NULL,
+ NULL,
+ dsa_sign,
+ dsa_verify,
+ dsa_get_nbits,
+ run_selftests
+ };
diff --git a/comm/third_party/libgcrypt/cipher/ecc-common.h b/comm/third_party/libgcrypt/cipher/ecc-common.h
new file mode 100644
index 0000000000..25c3111263
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/ecc-common.h
@@ -0,0 +1,140 @@
+/* ecc-common.h - Declarations of common ECC code
+ * Copyright (C) 2013 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ECC_COMMON_H
+#define GCRY_ECC_COMMON_H
+
+
+/* Definition of a curve. */
+typedef struct
+{
+ enum gcry_mpi_ec_models model;/* The model describing this curve. */
+ enum ecc_dialects dialect; /* The dialect used with the curve. */
+ gcry_mpi_t p; /* Prime specifying the field GF(p). */
+ gcry_mpi_t a; /* First coefficient of the Weierstrass equation. */
+ gcry_mpi_t b; /* Second coefficient of the Weierstrass equation.
+ or d as used by Twisted Edwards curves. */
+ mpi_point_struct G; /* Base point (generator). */
+ gcry_mpi_t n; /* Order of G. */
+ unsigned int h; /* Cofactor. */
+ const char *name; /* Name of the curve or NULL. */
+} elliptic_curve_t;
+
+
+
+/* Set the value from S into D. */
+static inline void
+point_set (mpi_point_t d, mpi_point_t s)
+{
+ mpi_set (d->x, s->x);
+ mpi_set (d->y, s->y);
+ mpi_set (d->z, s->z);
+}
+
+#define point_init(a) _gcry_mpi_point_init ((a))
+#define point_free(a) _gcry_mpi_point_free_parts ((a))
+
+
+/*-- ecc-curves.c --*/
+gpg_err_code_t _gcry_ecc_fill_in_curve (unsigned int nbits,
+ const char *name,
+ elliptic_curve_t *curve,
+ unsigned int *r_nbits);
+gpg_err_code_t _gcry_ecc_update_curve_param (const char *name,
+ enum gcry_mpi_ec_models *model,
+ enum ecc_dialects *dialect,
+ gcry_mpi_t *p, gcry_mpi_t *a,
+ gcry_mpi_t *b, gcry_mpi_t *g,
+ gcry_mpi_t *n);
+
+const char *_gcry_ecc_get_curve (gcry_sexp_t keyparms,
+ int iterator,
+ unsigned int *r_nbits);
+gcry_sexp_t _gcry_ecc_get_param_sexp (const char *name);
+
+/*-- ecc-misc.c --*/
+void _gcry_ecc_curve_free (elliptic_curve_t *E);
+elliptic_curve_t _gcry_ecc_curve_copy (elliptic_curve_t E);
+const char *_gcry_ecc_model2str (enum gcry_mpi_ec_models model);
+const char *_gcry_ecc_dialect2str (enum ecc_dialects dialect);
+gcry_mpi_t _gcry_ecc_ec2os (gcry_mpi_t x, gcry_mpi_t y, gcry_mpi_t p);
+
+mpi_point_t _gcry_ecc_compute_public (mpi_point_t Q, mpi_ec_t ec);
+gpg_err_code_t _gcry_ecc_mont_encodepoint (gcry_mpi_t x, unsigned int nbits,
+ int with_prefix,
+ unsigned char **r_buffer,
+ unsigned int *r_buflen);
+
+
+/*-- ecc.c --*/
+
+/*-- ecc-ecdsa.c --*/
+gpg_err_code_t _gcry_ecc_ecdsa_sign (gcry_mpi_t input, mpi_ec_t ec,
+ gcry_mpi_t r, gcry_mpi_t s,
+ int flags, int hashalgo);
+gpg_err_code_t _gcry_ecc_ecdsa_verify (gcry_mpi_t input, mpi_ec_t ec,
+ gcry_mpi_t r, gcry_mpi_t s);
+
+/*-- ecc-eddsa.c --*/
+gpg_err_code_t _gcry_ecc_eddsa_recover_x (gcry_mpi_t x, gcry_mpi_t y, int sign,
+ mpi_ec_t ec);
+gpg_err_code_t _gcry_ecc_eddsa_encodepoint (mpi_point_t point, mpi_ec_t ctx,
+ gcry_mpi_t x, gcry_mpi_t y,
+ int with_prefix,
+ unsigned char **r_buffer,
+ unsigned int *r_buflen);
+gpg_err_code_t _gcry_ecc_eddsa_ensure_compact (gcry_mpi_t value,
+ unsigned int nbits);
+
+
+gpg_err_code_t _gcry_ecc_eddsa_compute_h_d (unsigned char **r_digest,
+ mpi_ec_t ec);
+
+gpg_err_code_t _gcry_ecc_eddsa_genkey (mpi_ec_t ec, int flags);
+gpg_err_code_t _gcry_ecc_eddsa_sign (gcry_mpi_t input,
+ mpi_ec_t ec,
+ gcry_mpi_t r_r, gcry_mpi_t s,
+ struct pk_encoding_ctx *ctx);
+gpg_err_code_t _gcry_ecc_eddsa_verify (gcry_mpi_t input,
+ mpi_ec_t ec,
+ gcry_mpi_t r, gcry_mpi_t s,
+ struct pk_encoding_ctx *ctx);
+void reverse_buffer (unsigned char *buffer, unsigned int length);
+
+
+/*-- ecc-gost.c --*/
+gpg_err_code_t _gcry_ecc_gost_sign (gcry_mpi_t input, mpi_ec_t ec,
+ gcry_mpi_t r, gcry_mpi_t s);
+gpg_err_code_t _gcry_ecc_gost_verify (gcry_mpi_t input, mpi_ec_t ec,
+ gcry_mpi_t r, gcry_mpi_t s);
+
+
+/*-- ecc-sm2.c --*/
+gpg_err_code_t _gcry_ecc_sm2_encrypt (gcry_sexp_t *r_ciph,
+ gcry_mpi_t input, mpi_ec_t ec);
+gpg_err_code_t _gcry_ecc_sm2_decrypt (gcry_sexp_t *r_plain,
+ gcry_sexp_t data_list, mpi_ec_t ec);
+gpg_err_code_t _gcry_ecc_sm2_sign (gcry_mpi_t input, mpi_ec_t ec,
+ gcry_mpi_t r, gcry_mpi_t s,
+ int flags, int hashalgo);
+gpg_err_code_t _gcry_ecc_sm2_verify (gcry_mpi_t input, mpi_ec_t ec,
+ gcry_mpi_t r, gcry_mpi_t s);
+
+
+#endif /*GCRY_ECC_COMMON_H*/
diff --git a/comm/third_party/libgcrypt/cipher/ecc-curves.c b/comm/third_party/libgcrypt/cipher/ecc-curves.c
new file mode 100644
index 0000000000..900b668aac
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/ecc-curves.c
@@ -0,0 +1,1603 @@
+/* ecc-curves.c - Elliptic Curve parameter management
+ * Copyright (C) 2007, 2008, 2010, 2011 Free Software Foundation, Inc.
+ * Copyright (C) 2013 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "mpi.h"
+#include "mpi-internal.h"
+#include "cipher.h"
+#include "context.h"
+#include "ec-context.h"
+#include "pubkey-internal.h"
+#include "ecc-common.h"
+
+
+static gpg_err_code_t
+point_from_keyparam (gcry_mpi_point_t *r_a,
+ gcry_sexp_t keyparam, const char *name, mpi_ec_t ec);
+
+/* This table defines aliases for curve names. */
+static const struct
+{
+ const char *name; /* Our name. */
+ const char *other; /* Other name. */
+} curve_aliases[] =
+ {
+ { "Ed25519", "1.3.6.1.4.1.11591.15.1" }, /* OpenPGP */
+ { "Ed25519", "1.3.101.112" }, /* rfc8410 */
+
+ { "Curve25519", "1.3.6.1.4.1.3029.1.5.1" }, /* OpenPGP */
+ { "Curve25519", "1.3.101.110" }, /* rfc8410 */
+ { "Curve25519", "X25519" }, /* rfc8410 */
+
+ { "Ed448", "1.3.101.113" }, /* rfc8410 */
+ { "X448", "1.3.101.111" }, /* rfc8410 */
+
+ { "NIST P-192", "1.2.840.10045.3.1.1" }, /* X9.62 OID */
+ { "NIST P-192", "prime192v1" }, /* X9.62 name. */
+ { "NIST P-192", "secp192r1" }, /* SECP name. */
+ { "NIST P-192", "nistp192" }, /* rfc5656. */
+
+ { "NIST P-224", "secp224r1" },
+ { "NIST P-224", "1.3.132.0.33" }, /* SECP OID. */
+ { "NIST P-224", "nistp224" }, /* rfc5656. */
+
+ { "NIST P-256", "1.2.840.10045.3.1.7" }, /* From NIST SP 800-78-1. */
+ { "NIST P-256", "prime256v1" },
+ { "NIST P-256", "secp256r1" },
+ { "NIST P-256", "nistp256" }, /* rfc5656. */
+
+ { "NIST P-384", "secp384r1" },
+ { "NIST P-384", "1.3.132.0.34" },
+ { "NIST P-384", "nistp384" }, /* rfc5656. */
+
+ { "NIST P-521", "secp521r1" },
+ { "NIST P-521", "1.3.132.0.35" },
+ { "NIST P-521", "nistp521" }, /* rfc5656. */
+
+ { "brainpoolP160r1", "1.3.36.3.3.2.8.1.1.1" },
+ { "brainpoolP192r1", "1.3.36.3.3.2.8.1.1.3" },
+ { "brainpoolP224r1", "1.3.36.3.3.2.8.1.1.5" },
+ { "brainpoolP256r1", "1.3.36.3.3.2.8.1.1.7" },
+ { "brainpoolP320r1", "1.3.36.3.3.2.8.1.1.9" },
+ { "brainpoolP384r1", "1.3.36.3.3.2.8.1.1.11"},
+ { "brainpoolP512r1", "1.3.36.3.3.2.8.1.1.13"},
+
+ { "GOST2001-test", "1.2.643.2.2.35.0" },
+ { "GOST2001-CryptoPro-A", "1.2.643.2.2.35.1" },
+ { "GOST2001-CryptoPro-B", "1.2.643.2.2.35.2" },
+ { "GOST2001-CryptoPro-C", "1.2.643.2.2.35.3" },
+ { "GOST2001-CryptoPro-A", "GOST2001-CryptoPro-XchA" },
+ { "GOST2001-CryptoPro-C", "GOST2001-CryptoPro-XchB" },
+ { "GOST2001-CryptoPro-A", "1.2.643.2.2.36.0" },
+ { "GOST2001-CryptoPro-C", "1.2.643.2.2.36.1" },
+
+ { "GOST2012-256-tc26-A", "1.2.643.7.1.2.1.1.1" },
+ { "GOST2001-CryptoPro-A", "1.2.643.7.1.2.1.1.2" },
+ { "GOST2001-CryptoPro-A", "GOST2012-256-tc26-B" },
+ { "GOST2001-CryptoPro-B", "1.2.643.7.1.2.1.1.3" },
+ { "GOST2001-CryptoPro-B", "GOST2012-256-tc26-C" },
+ { "GOST2001-CryptoPro-C", "1.2.643.7.1.2.1.1.4" },
+ { "GOST2001-CryptoPro-C", "GOST2012-256-tc26-D" },
+
+ { "GOST2012-512-test", "GOST2012-test" },
+ { "GOST2012-512-test", "1.2.643.7.1.2.1.2.0" },
+ { "GOST2012-512-tc26-A", "GOST2012-tc26-A" },
+ { "GOST2012-512-tc26-B", "GOST2012-tc26-B" },
+ { "GOST2012-512-tc26-A", "1.2.643.7.1.2.1.2.1" },
+ { "GOST2012-512-tc26-B", "1.2.643.7.1.2.1.2.2" },
+ { "GOST2012-512-tc26-C", "1.2.643.7.1.2.1.2.3" },
+
+ { "secp256k1", "1.3.132.0.10" },
+
+ { "sm2p256v1", "1.2.156.10197.1.301" },
+
+ { NULL, NULL}
+ };
+
+
+typedef struct
+{
+ const char *desc; /* Description of the curve. */
+ unsigned int nbits; /* Number of bits. */
+ unsigned int fips:1; /* True if this is a FIPS140-2 approved curve. */
+
+ /* The model describing this curve. This is mainly used to select
+ the group equation. */
+ enum gcry_mpi_ec_models model;
+
+ /* The actual ECC dialect used. This is used for curve specific
+ optimizations and to select encodings etc. */
+ enum ecc_dialects dialect;
+
+ const char *p; /* The prime defining the field. */
+ const char *a, *b; /* The coefficients. For Twisted Edwards
+ Curves b is used for d. For Montgomery
+ Curves (a,b) holds ((A-2)/4, B^-1). */
+ const char *n; /* The order of the base point. */
+ const char *g_x, *g_y; /* Base point. */
+ unsigned int h; /* Cofactor. */
+} ecc_domain_parms_t;
+
+
+/* This static table defines all available curves. */
+static const ecc_domain_parms_t domain_parms[] =
+ {
+ {
+ /* (-x^2 + y^2 = 1 + dx^2y^2) */
+ "Ed25519", 255, 0,
+ MPI_EC_EDWARDS, ECC_DIALECT_ED25519,
+ "0x7FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFED",
+ "-0x01",
+ "-0x2DFC9311D490018C7338BF8688861767FF8FF5B2BEBE27548A14B235ECA6874A",
+ "0x1000000000000000000000000000000014DEF9DEA2F79CD65812631A5CF5D3ED",
+ "0x216936D3CD6E53FEC0A4E231FDD6DC5C692CC7609525A7B2C9562D608F25D51A",
+ "0x6666666666666666666666666666666666666666666666666666666666666658",
+ 8
+ },
+ {
+ /* (y^2 = x^3 + 486662*x^2 + x) */
+ "Curve25519", 255, 0,
+ MPI_EC_MONTGOMERY, ECC_DIALECT_STANDARD,
+ "0x7FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFED",
+ "0x01DB41",
+ "0x01",
+ "0x1000000000000000000000000000000014DEF9DEA2F79CD65812631A5CF5D3ED",
+ "0x0000000000000000000000000000000000000000000000000000000000000009",
+ "0x20AE19A1B8A086B4E01EDD2C7748D14C923D4D7E6D7C61B229E9C5A27ECED3D9",
+ 8
+ /* Note: As per RFC-7748 errata eid4730 the g_y value should be
+ * "0x5F51E65E475F794B1FE122D388B72EB36DC2B28192839E4DD6163A5D81312C14"
+ * but that breaks the keygrip. The new value is recovered in
+ * the function _gcry_ecc_fill_in_curve. See bug #4712.
+ */
+ },
+ {
+ /* (x^2 + y^2 = 1 + dx^2y^2) */
+ "Ed448", 448, 0,
+ MPI_EC_EDWARDS, ECC_DIALECT_SAFECURVE,
+ "0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE"
+ "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF",
+ "0x01",
+ "0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE"
+ "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF6756",
+ "0x3FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"
+ "7CCA23E9C44EDB49AED63690216CC2728DC58F552378C292AB5844F3",
+ "0x4F1970C66BED0DED221D15A622BF36DA9E146570470F1767EA6DE324"
+ "A3D3A46412AE1AF72AB66511433B80E18B00938E2626A82BC70CC05E",
+ "0x693F46716EB6BC248876203756C9C7624BEA73736CA3984087789C1E"
+ "05A0C2D73AD3FF1CE67C39C4FDBD132C4ED7C8AD9808795BF230FA14",
+ 4,
+ },
+ {
+ /* (y^2 = x^3 + 156326*x^2 + x) */
+ "X448", 448, 0,
+ MPI_EC_MONTGOMERY, ECC_DIALECT_SAFECURVE,
+ "0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE"
+ "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF",
+ "0x98A9",
+ "0x01",
+ "0x3FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"
+ "7CCA23E9C44EDB49AED63690216CC2728DC58F552378C292AB5844F3",
+ "0x00000000000000000000000000000000000000000000000000000000"
+ "00000000000000000000000000000000000000000000000000000005",
+ "0x7D235D1295F5B1F66C98AB6E58326FCECBAE5D34F55545D060F75DC2"
+ "8DF3F6EDB8027E2346430D211312C4B150677AF76FD7223D457B5B1A",
+ 4,
+ },
+#if 0 /* No real specs yet found. */
+ {
+ /* x^2 + y^2 = 1 + 3617x^2y^2 mod 2^414 - 17 */
+ "Curve3617",
+ "0x3FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"
+ "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEF",
+ MPI_EC_EDWARDS, 0,
+ "0x01",
+ "0x0e21",
+ "0x07FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEB3CC92414CF"
+ "706022B36F1C0338AD63CF181B0E71A5E106AF79",
+ "0x1A334905141443300218C0631C326E5FCD46369F44C03EC7F57FF35498A4AB4D"
+ "6D6BA111301A73FAA8537C64C4FD3812F3CBC595",
+ "0x22",
+ 8
+ },
+#endif /*0*/
+ {
+ "NIST P-192", 192, 0,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0xfffffffffffffffffffffffffffffffeffffffffffffffff",
+ "0xfffffffffffffffffffffffffffffffefffffffffffffffc",
+ "0x64210519e59c80e70fa7e9ab72243049feb8deecc146b9b1",
+ "0xffffffffffffffffffffffff99def836146bc9b1b4d22831",
+
+ "0x188da80eb03090f67cbf20eb43a18800f4ff0afd82ff1012",
+ "0x07192b95ffc8da78631011ed6b24cdd573f977a11e794811",
+ 1
+ },
+ {
+ "NIST P-224", 224, 1,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0xffffffffffffffffffffffffffffffff000000000000000000000001",
+ "0xfffffffffffffffffffffffffffffffefffffffffffffffffffffffe",
+ "0xb4050a850c04b3abf54132565044b0b7d7bfd8ba270b39432355ffb4",
+ "0xffffffffffffffffffffffffffff16a2e0b8f03e13dd29455c5c2a3d" ,
+
+ "0xb70e0cbd6bb4bf7f321390b94a03c1d356c21122343280d6115c1d21",
+ "0xbd376388b5f723fb4c22dfe6cd4375a05a07476444d5819985007e34",
+ 1
+ },
+ {
+ "NIST P-256", 256, 1,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff",
+ "0xffffffff00000001000000000000000000000000fffffffffffffffffffffffc",
+ "0x5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b",
+ "0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551",
+
+ "0x6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296",
+ "0x4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5",
+ 1
+ },
+ {
+ "NIST P-384", 384, 1,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffe"
+ "ffffffff0000000000000000ffffffff",
+ "0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffe"
+ "ffffffff0000000000000000fffffffc",
+ "0xb3312fa7e23ee7e4988e056be3f82d19181d9c6efe8141120314088f5013875a"
+ "c656398d8a2ed19d2a85c8edd3ec2aef",
+ "0xffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf"
+ "581a0db248b0a77aecec196accc52973",
+
+ "0xaa87ca22be8b05378eb1c71ef320ad746e1d3b628ba79b9859f741e082542a38"
+ "5502f25dbf55296c3a545e3872760ab7",
+ "0x3617de4a96262c6f5d9e98bf9292dc29f8f41dbd289a147ce9da3113b5f0b8c0"
+ "0a60b1ce1d7e819d7a431d7c90ea0e5f",
+ 1
+ },
+ {
+ "NIST P-521", 521, 1,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0x01ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+ "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+ "0x01ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+ "fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffc",
+ "0x051953eb9618e1c9a1f929a21a0b68540eea2da725b99b315f3b8b489918ef10"
+ "9e156193951ec7e937b1652c0bd3bb1bf073573df883d2c34f1ef451fd46b503f00",
+ "0x01ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+ "fffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb71e91386409",
+
+ "0x00c6858e06b70404e9cd9e3ecb662395b4429c648139053fb521f828af606b4d"
+ "3dbaa14b5e77efe75928fe1dc127a2ffa8de3348b3c1856a429bf97e7e31c2e5bd66",
+ "0x011839296a789a3bc0045c8a5fb42c7d1bd998f54449579b446817afbd17273e"
+ "662c97ee72995ef42640c550b9013fad0761353c7086a272c24088be94769fd16650",
+ 1
+ },
+
+ { "brainpoolP160r1", 160, 0,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0xe95e4a5f737059dc60dfc7ad95b3d8139515620f",
+ "0x340e7be2a280eb74e2be61bada745d97e8f7c300",
+ "0x1e589a8595423412134faa2dbdec95c8d8675e58",
+ "0xe95e4a5f737059dc60df5991d45029409e60fc09",
+ "0xbed5af16ea3f6a4f62938c4631eb5af7bdbcdbc3",
+ "0x1667cb477a1a8ec338f94741669c976316da6321",
+ 1
+ },
+
+ { "brainpoolP192r1", 192, 0,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0xc302f41d932a36cda7a3463093d18db78fce476de1a86297",
+ "0x6a91174076b1e0e19c39c031fe8685c1cae040e5c69a28ef",
+ "0x469a28ef7c28cca3dc721d044f4496bcca7ef4146fbf25c9",
+ "0xc302f41d932a36cda7a3462f9e9e916b5be8f1029ac4acc1",
+ "0xc0a0647eaab6a48753b033c56cb0f0900a2f5c4853375fd6",
+ "0x14b690866abd5bb88b5f4828c1490002e6773fa2fa299b8f",
+ 1
+ },
+
+ { "brainpoolP224r1", 224, 0,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0xd7c134aa264366862a18302575d1d787b09f075797da89f57ec8c0ff",
+ "0x68a5e62ca9ce6c1c299803a6c1530b514e182ad8b0042a59cad29f43",
+ "0x2580f63ccfe44138870713b1a92369e33e2135d266dbb372386c400b",
+ "0xd7c134aa264366862a18302575d0fb98d116bc4b6ddebca3a5a7939f",
+ "0x0d9029ad2c7e5cf4340823b2a87dc68c9e4ce3174c1e6efdee12c07d",
+ "0x58aa56f772c0726f24c6b89e4ecdac24354b9e99caa3f6d3761402cd",
+ 1
+ },
+
+ { "brainpoolP256r1", 256, 0,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0xa9fb57dba1eea9bc3e660a909d838d726e3bf623d52620282013481d1f6e5377",
+ "0x7d5a0975fc2c3057eef67530417affe7fb8055c126dc5c6ce94a4b44f330b5d9",
+ "0x26dc5c6ce94a4b44f330b5d9bbd77cbf958416295cf7e1ce6bccdc18ff8c07b6",
+ "0xa9fb57dba1eea9bc3e660a909d838d718c397aa3b561a6f7901e0e82974856a7",
+ "0x8bd2aeb9cb7e57cb2c4b482ffc81b7afb9de27e1e3bd23c23a4453bd9ace3262",
+ "0x547ef835c3dac4fd97f8461a14611dc9c27745132ded8e545c1d54c72f046997",
+ 1
+ },
+
+ { "brainpoolP320r1", 320, 0,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0xd35e472036bc4fb7e13c785ed201e065f98fcfa6f6f40def4f92b9ec7893ec28"
+ "fcd412b1f1b32e27",
+ "0x3ee30b568fbab0f883ccebd46d3f3bb8a2a73513f5eb79da66190eb085ffa9f4"
+ "92f375a97d860eb4",
+ "0x520883949dfdbc42d3ad198640688a6fe13f41349554b49acc31dccd88453981"
+ "6f5eb4ac8fb1f1a6",
+ "0xd35e472036bc4fb7e13c785ed201e065f98fcfa5b68f12a32d482ec7ee8658e9"
+ "8691555b44c59311",
+ "0x43bd7e9afb53d8b85289bcc48ee5bfe6f20137d10a087eb6e7871e2a10a599c7"
+ "10af8d0d39e20611",
+ "0x14fdd05545ec1cc8ab4093247f77275e0743ffed117182eaa9c77877aaac6ac7"
+ "d35245d1692e8ee1",
+ 1
+ },
+
+ { "brainpoolP384r1", 384, 0,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0x8cb91e82a3386d280f5d6f7e50e641df152f7109ed5456b412b1da197fb71123"
+ "acd3a729901d1a71874700133107ec53",
+ "0x7bc382c63d8c150c3c72080ace05afa0c2bea28e4fb22787139165efba91f90f"
+ "8aa5814a503ad4eb04a8c7dd22ce2826",
+ "0x04a8c7dd22ce28268b39b55416f0447c2fb77de107dcd2a62e880ea53eeb62d5"
+ "7cb4390295dbc9943ab78696fa504c11",
+ "0x8cb91e82a3386d280f5d6f7e50e641df152f7109ed5456b31f166e6cac0425a7"
+ "cf3ab6af6b7fc3103b883202e9046565",
+ "0x1d1c64f068cf45ffa2a63a81b7c13f6b8847a3e77ef14fe3db7fcafe0cbd10e8"
+ "e826e03436d646aaef87b2e247d4af1e",
+ "0x8abe1d7520f9c2a45cb1eb8e95cfd55262b70b29feec5864e19c054ff9912928"
+ "0e4646217791811142820341263c5315",
+ 1
+ },
+
+ { "brainpoolP512r1", 512, 0,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0xaadd9db8dbe9c48b3fd4e6ae33c9fc07cb308db3b3c9d20ed6639cca70330871"
+ "7d4d9b009bc66842aecda12ae6a380e62881ff2f2d82c68528aa6056583a48f3",
+ "0x7830a3318b603b89e2327145ac234cc594cbdd8d3df91610a83441caea9863bc"
+ "2ded5d5aa8253aa10a2ef1c98b9ac8b57f1117a72bf2c7b9e7c1ac4d77fc94ca",
+ "0x3df91610a83441caea9863bc2ded5d5aa8253aa10a2ef1c98b9ac8b57f1117a7"
+ "2bf2c7b9e7c1ac4d77fc94cadc083e67984050b75ebae5dd2809bd638016f723",
+ "0xaadd9db8dbe9c48b3fd4e6ae33c9fc07cb308db3b3c9d20ed6639cca70330870"
+ "553e5c414ca92619418661197fac10471db1d381085ddaddb58796829ca90069",
+ "0x81aee4bdd82ed9645a21322e9c4c6a9385ed9f70b5d916c1b43b62eef4d0098e"
+ "ff3b1f78e2d0d48d50d1687b93b97d5f7c6d5047406a5e688b352209bcb9f822",
+ "0x7dde385d566332ecc0eabfa9cf7822fdf209f70024a57b1aa000c55b881f8111"
+ "b2dcde494a5f485e5bca4bd88a2763aed1ca2b2fa8f0540678cd1e0f3ad80892",
+ 1
+ },
+ {
+ "GOST2001-test", 256, 0,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0x8000000000000000000000000000000000000000000000000000000000000431",
+ "0x0000000000000000000000000000000000000000000000000000000000000007",
+ "0x5fbff498aa938ce739b8e022fbafef40563f6e6a3472fc2a514c0ce9dae23b7e",
+ "0x8000000000000000000000000000000150fe8a1892976154c59cfc193accf5b3",
+
+ "0x0000000000000000000000000000000000000000000000000000000000000002",
+ "0x08e2a8a0e65147d4bd6316030e16d19c85c97f0a9ca267122b96abbcea7e8fc8",
+ 1
+ },
+ {
+ "GOST2001-CryptoPro-A", 256, 0,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffd97",
+ "0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffd94",
+ "0x00000000000000000000000000000000000000000000000000000000000000a6",
+ "0xffffffffffffffffffffffffffffffff6c611070995ad10045841b09b761b893",
+ "0x0000000000000000000000000000000000000000000000000000000000000001",
+ "0x8d91e471e0989cda27df505a453f2b7635294f2ddf23e3b122acc99c9e9f1e14",
+ 1
+ },
+ {
+ "GOST2001-CryptoPro-B", 256, 0,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0x8000000000000000000000000000000000000000000000000000000000000c99",
+ "0x8000000000000000000000000000000000000000000000000000000000000c96",
+ "0x3e1af419a269a5f866a7d3c25c3df80ae979259373ff2b182f49d4ce7e1bbc8b",
+ "0x800000000000000000000000000000015f700cfff1a624e5e497161bcc8a198f",
+ "0x0000000000000000000000000000000000000000000000000000000000000001",
+ "0x3fa8124359f96680b83d1c3eb2c070e5c545c9858d03ecfb744bf8d717717efc",
+ 1
+ },
+ {
+ "GOST2001-CryptoPro-C", 256, 0,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0x9b9f605f5a858107ab1ec85e6b41c8aacf846e86789051d37998f7b9022d759b",
+ "0x9b9f605f5a858107ab1ec85e6b41c8aacf846e86789051d37998f7b9022d7598",
+ "0x000000000000000000000000000000000000000000000000000000000000805a",
+ "0x9b9f605f5a858107ab1ec85e6b41c8aa582ca3511eddfb74f02f3a6598980bb9",
+ "0x0000000000000000000000000000000000000000000000000000000000000000",
+ "0x41ece55743711a8c3cbf3783cd08c0ee4d4dc440d4641a8f366e550dfdb3bb67",
+ 1
+ },
+ {
+ "GOST2012-256-A", 256, 0,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffd97",
+ "0xc2173f1513981673af4892c23035a27ce25e2013bf95aa33b22c656f277e7335",
+ "0x295f9bae7428ed9ccc20e7c359a9d41a22fccd9108e17bf7ba9337a6f8ae9513",
+ "0x400000000000000000000000000000000fd8cddfc87b6635c115af556c360c67",
+ "0x91e38443a5e82c0d880923425712b2bb658b9196932e02c78b2582fe742daa28",
+ "0x32879423ab1a0375895786c4bb46e9565fde0b5344766740af268adb32322e5c",
+ 4
+ },
+ {
+ "GOST2012-512-test", 511, 0,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0x4531acd1fe0023c7550d267b6b2fee80922b14b2ffb90f04d4eb7c09b5d2d15d"
+ "f1d852741af4704a0458047e80e4546d35b8336fac224dd81664bbf528be6373",
+ "0x0000000000000000000000000000000000000000000000000000000000000007",
+ "0x1cff0806a31116da29d8cfa54e57eb748bc5f377e49400fdd788b649eca1ac4"
+ "361834013b2ad7322480a89ca58e0cf74bc9e540c2add6897fad0a3084f302adc",
+ "0x4531acd1fe0023c7550d267b6b2fee80922b14b2ffb90f04d4eb7c09b5d2d15d"
+ "a82f2d7ecb1dbac719905c5eecc423f1d86e25edbe23c595d644aaf187e6e6df",
+
+ "0x24d19cc64572ee30f396bf6ebbfd7a6c5213b3b3d7057cc825f91093a68cd762"
+ "fd60611262cd838dc6b60aa7eee804e28bc849977fac33b4b530f1b120248a9a",
+ "0x2bb312a43bd2ce6e0d020613c857acddcfbf061e91e5f2c3f32447c259f39b2"
+ "c83ab156d77f1496bf7eb3351e1ee4e43dc1a18b91b24640b6dbb92cb1add371e",
+ 1
+ },
+ {
+ "GOST2012-512-tc26-A", 512, 0,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+ "fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffdc7",
+ "0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+ "fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffdc4",
+ "0xe8c2505dedfc86ddc1bd0b2b6667f1da34b82574761cb0e879bd081cfd0b6265"
+ "ee3cb090f30d27614cb4574010da90dd862ef9d4ebee4761503190785a71c760",
+ "0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+ "27e69532f48d89116ff22b8d4e0560609b4b38abfad2b85dcacdb1411f10b275",
+ "0x0000000000000000000000000000000000000000000000000000000000000000"
+ "0000000000000000000000000000000000000000000000000000000000000003",
+ "0x7503cfe87a836ae3a61b8816e25450e6ce5e1c93acf1abc1778064fdcbefa921"
+ "df1626be4fd036e93d75e6a50e3a41e98028fe5fc235f5b889a589cb5215f2a4",
+ 1
+ },
+ {
+ "GOST2012-512-tc26-B", 512, 0,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0x8000000000000000000000000000000000000000000000000000000000000000"
+ "000000000000000000000000000000000000000000000000000000000000006f",
+ "0x8000000000000000000000000000000000000000000000000000000000000000"
+ "000000000000000000000000000000000000000000000000000000000000006c",
+ "0x687d1b459dc841457e3e06cf6f5e2517b97c7d614af138bcbf85dc806c4b289f"
+ "3e965d2db1416d217f8b276fad1ab69c50f78bee1fa3106efb8ccbc7c5140116",
+ "0x8000000000000000000000000000000000000000000000000000000000000001"
+ "49a1ec142565a545acfdb77bd9d40cfa8b996712101bea0ec6346c54374f25bd",
+ "0x0000000000000000000000000000000000000000000000000000000000000000"
+ "0000000000000000000000000000000000000000000000000000000000000002",
+ "0x1a8f7eda389b094c2c071e3647a8940f3c123b697578c213be6dd9e6c8ec7335"
+ "dcb228fd1edf4a39152cbcaaf8c0398828041055f94ceeec7e21340780fe41bd",
+ 1
+ },
+ {
+ "GOST2012-512-tc26-C", 512, 0,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+ "fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffdc7",
+ "0xdc9203e514a721875485a529d2c722fb187bc8980eb866644de41c68e1430645"
+ "46e861c0e2c9edd92ade71f46fcf50ff2ad97f951fda9f2a2eb6546f39689bd3",
+ "0xb4c4ee28cebc6c2c8ac12952cf37f16ac7efb6a9f69f4b57ffda2e4f0de5ade0"
+ "38cbc2fff719d2c18de0284b8bfef3b52b8cc7a5f5bf0a3c8d2319a5312557e1",
+ "0x3fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+ "c98cdba46506ab004c33a9ff5147502cc8eda9e7a769a12694623cef47f023ed",
+ "0xe2e31edfc23de7bdebe241ce593ef5de2295b7a9cbaef021d385f7074cea043a"
+ "a27272a7ae602bf2a7b9033db9ed3610c6fb85487eae97aac5bc7928c1950148",
+ "0xf5ce40d95b5eb899abbccff5911cb8577939804d6527378b8c108c3d2090ff9be"
+ "18e2d33e3021ed2ef32d85822423b6304f726aa854bae07d0396e9a9addc40f",
+ 4
+ },
+
+ {
+ "secp256k1", 256, 0,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F",
+ "0x0000000000000000000000000000000000000000000000000000000000000000",
+ "0x0000000000000000000000000000000000000000000000000000000000000007",
+ "0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141",
+ "0x79BE667EF9DCBBAC55A06295CE870B07029BFCDB2DCE28D959F2815B16F81798",
+ "0x483ADA7726A3C4655DA4FBFC0E1108A8FD17B448A68554199C47D08FFB10D4B8",
+ 1
+ },
+
+ {
+ "sm2p256v1", 256, 0,
+ MPI_EC_WEIERSTRASS, ECC_DIALECT_STANDARD,
+ "0xfffffffeffffffffffffffffffffffffffffffff00000000ffffffffffffffff",
+ "0xfffffffeffffffffffffffffffffffffffffffff00000000fffffffffffffffc",
+ "0x28e9fa9e9d9f5e344d5a9e4bcf6509a7f39789f515ab8f92ddbcbd414d940e93",
+ "0xfffffffeffffffffffffffffffffffff7203df6b21c6052b53bbf40939d54123",
+ "0x32c4ae2c1f1981195f9904466a39c9948fe30bbff2660be1715a4589334c74c7",
+ "0xbc3736a2f4f6779c59bdcee36b692153d0a9877cc62a474002df32e52139f0a0",
+ 1
+ },
+
+ { NULL, 0, 0, 0, 0, NULL, NULL, NULL, NULL, NULL }
+ };
+
+
+
+
+/* Return a copy of POINT. */
+static gcry_mpi_point_t
+point_copy (gcry_mpi_point_t point)
+{
+ gcry_mpi_point_t newpoint;
+
+ if (point)
+ {
+ newpoint = mpi_point_new (0);
+ point_set (newpoint, point);
+ }
+ else
+ newpoint = NULL;
+ return newpoint;
+}
+
+
+/* Helper to scan a hex string. */
+static gcry_mpi_t
+scanval (const char *string)
+{
+ gpg_err_code_t rc;
+ gcry_mpi_t val;
+
+ rc = _gcry_mpi_scan (&val, GCRYMPI_FMT_HEX, string, 0, NULL);
+ if (rc)
+ log_fatal ("scanning ECC parameter failed: %s\n", gpg_strerror (rc));
+ return val;
+}
+
+
+/* Return the index of the domain_parms table for a curve with NAME.
+ Return -1 if not found. */
+static int
+find_domain_parms_idx (const char *name)
+{
+ int idx, aliasno;
+
+ /* First check our native curves. */
+ for (idx = 0; domain_parms[idx].desc; idx++)
+ if (!strcmp (name, domain_parms[idx].desc))
+ return idx;
+
+ /* If not found consult the alias table. */
+ if (!domain_parms[idx].desc)
+ {
+ for (aliasno = 0; curve_aliases[aliasno].name; aliasno++)
+ if (!strcmp (name, curve_aliases[aliasno].other))
+ break;
+ if (curve_aliases[aliasno].name)
+ {
+ for (idx = 0; domain_parms[idx].desc; idx++)
+ if (!strcmp (curve_aliases[aliasno].name, domain_parms[idx].desc))
+ return idx;
+ }
+ }
+
+ return -1;
+}
+
+
+/* Generate the crypto system setup. This function takes the NAME of
+ a curve or the desired number of bits and stores at R_CURVE the
+ parameters of the named curve or those of a suitable curve. If
+ R_NBITS is not NULL, the chosen number of bits is stored there.
+ NULL may be given for R_CURVE, if the value is not required and for
+ example only a quick test for availability is desired. Note that
+ the curve fields should be initialized to zero because fields which
+ are not NULL are skipped. */
+gpg_err_code_t
+_gcry_ecc_fill_in_curve (unsigned int nbits, const char *name,
+ elliptic_curve_t *curve, unsigned int *r_nbits)
+{
+ int idx;
+ const char *resname = NULL; /* Set to a found curve name. */
+
+ if (name)
+ idx = find_domain_parms_idx (name);
+ else
+ {
+ for (idx = 0; domain_parms[idx].desc; idx++)
+ if (nbits == domain_parms[idx].nbits
+ && domain_parms[idx].model == MPI_EC_WEIERSTRASS)
+ break;
+ if (!domain_parms[idx].desc)
+ idx = -1;
+ }
+ if (idx < 0)
+ return GPG_ERR_UNKNOWN_CURVE;
+
+ resname = domain_parms[idx].desc;
+
+ /* In fips mode we only support NIST curves. Note that it is
+ possible to bypass this check by specifying the curve parameters
+ directly. */
+ if (fips_mode () && !domain_parms[idx].fips )
+ return GPG_ERR_NOT_SUPPORTED;
+
+ switch (domain_parms[idx].model)
+ {
+ case MPI_EC_WEIERSTRASS:
+ case MPI_EC_EDWARDS:
+ case MPI_EC_MONTGOMERY:
+ break;
+ default:
+ return GPG_ERR_BUG;
+ }
+
+
+ if (r_nbits)
+ *r_nbits = domain_parms[idx].nbits;
+
+ if (curve)
+ {
+ curve->model = domain_parms[idx].model;
+ curve->dialect = domain_parms[idx].dialect;
+ if (!curve->p)
+ curve->p = scanval (domain_parms[idx].p);
+ if (!curve->a)
+ {
+ curve->a = scanval (domain_parms[idx].a);
+ if (curve->a->sign)
+ {
+ mpi_resize (curve->a, curve->p->nlimbs);
+ _gcry_mpih_sub_n (curve->a->d, curve->p->d,
+ curve->a->d, curve->p->nlimbs);
+ curve->a->nlimbs = curve->p->nlimbs;
+ curve->a->sign = 0;
+ }
+ }
+ if (!curve->b)
+ {
+ curve->b = scanval (domain_parms[idx].b);
+ if (curve->b->sign)
+ {
+ mpi_resize (curve->b, curve->p->nlimbs);
+ _gcry_mpih_sub_n (curve->b->d, curve->p->d,
+ curve->b->d, curve->p->nlimbs);
+ curve->b->nlimbs = curve->p->nlimbs;
+ curve->b->sign = 0;
+ }
+ }
+ if (!curve->n)
+ curve->n = scanval (domain_parms[idx].n);
+ if (!curve->G.x)
+ curve->G.x = scanval (domain_parms[idx].g_x);
+ if (!curve->G.y)
+ curve->G.y = scanval (domain_parms[idx].g_y);
+ curve->h = domain_parms[idx].h;
+
+ /*
+ * In the constants of domain_parms, we defined Curve25519
+ * domain parameters as the ones in RFC-7748 before the errata
+ * (eid4730). To keep the computation having exact same values,
+ * we recover the new value of g_y, here.
+ */
+ if (!strcmp (resname, "Curve25519"))
+ mpi_sub (curve->G.y, curve->p, curve->G.y);
+
+ if (!curve->G.z)
+ curve->G.z = mpi_alloc_set_ui (1);
+ if (!curve->name)
+ curve->name = resname;
+ }
+
+ return 0;
+}
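+
+/* Usage sketch (illustrative only, not part of the original code):
+ callers zero-initialize the curve structure so that all fields get
+ filled in, e.g.
+
+ elliptic_curve_t E;
+ unsigned int nbits;
+ memset (&E, 0, sizeof E);
+ if (!_gcry_ecc_fill_in_curve (0, "NIST P-256", &E, &nbits))
+ {
+ ... use E.p, E.a, E.b, E.G, E.n ...
+ _gcry_ecc_curve_free (&E);
+ }
+ */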
+
+
+/* Give the name of the curve NAME, store the curve parameters into P,
+ A, B, G, and N if they point to NULL value. Note that G is
+ returned in standard uncompressed format. Also update MODEL and
+ DIALECT if they are not NULL. */
+gpg_err_code_t
+_gcry_ecc_update_curve_param (const char *name,
+ enum gcry_mpi_ec_models *model,
+ enum ecc_dialects *dialect,
+ gcry_mpi_t *p, gcry_mpi_t *a, gcry_mpi_t *b,
+ gcry_mpi_t *g, gcry_mpi_t *n)
+{
+ int idx;
+
+ idx = find_domain_parms_idx (name);
+ if (idx < 0)
+ return GPG_ERR_UNKNOWN_CURVE;
+
+ if (g)
+ {
+ char *buf;
+ size_t len;
+
+ len = 4;
+ len += strlen (domain_parms[idx].g_x+2);
+ len += strlen (domain_parms[idx].g_y+2);
+ len++;
+ buf = xtrymalloc (len);
+ if (!buf)
+ return gpg_err_code_from_syserror ();
+ strcpy (stpcpy (stpcpy (buf, "0x04"), domain_parms[idx].g_x+2),
+ domain_parms[idx].g_y+2);
+ _gcry_mpi_release (*g);
+ *g = scanval (buf);
+ xfree (buf);
+ }
+ if (model)
+ *model = domain_parms[idx].model;
+ if (dialect)
+ *dialect = domain_parms[idx].dialect;
+ if (p)
+ {
+ _gcry_mpi_release (*p);
+ *p = scanval (domain_parms[idx].p);
+ }
+ if (a)
+ {
+ _gcry_mpi_release (*a);
+ *a = scanval (domain_parms[idx].a);
+ }
+ if (b)
+ {
+ _gcry_mpi_release (*b);
+ *b = scanval (domain_parms[idx].b);
+ }
+ if (n)
+ {
+ _gcry_mpi_release (*n);
+ *n = scanval (domain_parms[idx].n);
+ }
+ return 0;
+}
+
+
+/* Return the name matching the parameters in PKEY. This works only
+ with curves described by the Weierstrass equation. */
+const char *
+_gcry_ecc_get_curve (gcry_sexp_t keyparms, int iterator, unsigned int *r_nbits)
+{
+ gpg_err_code_t rc;
+ const char *result = NULL;
+ elliptic_curve_t E;
+ gcry_mpi_point_t G = NULL;
+ gcry_mpi_t tmp = NULL;
+ int idx;
+
+ memset (&E, 0, sizeof E);
+
+ if (r_nbits)
+ *r_nbits = 0;
+
+ if (!keyparms)
+ {
+ idx = iterator;
+ if (idx >= 0 && idx < DIM (domain_parms))
+ {
+ result = domain_parms[idx].desc;
+ if (r_nbits)
+ *r_nbits = domain_parms[idx].nbits;
+ }
+ return result;
+ }
+
+
+ /*
+ * Extract the curve parameters.
+ */
+ rc = gpg_err_code (sexp_extract_param (keyparms, NULL, "pabn",
+ &E.p, &E.a, &E.b, &E.n, NULL));
+ if (rc == GPG_ERR_NO_OBJ)
+ {
+ /* This might be the second use case of checking whether a
+ specific curve given by name is supported. */
+ gcry_sexp_t l1;
+ char *name;
+
+ l1 = sexp_find_token (keyparms, "curve", 5);
+ if (!l1)
+ goto leave; /* No curve name parameter. */
+
+ name = sexp_nth_string (l1, 1);
+ sexp_release (l1);
+ if (!name)
+ goto leave; /* Name missing or out of core. */
+
+ idx = find_domain_parms_idx (name);
+ xfree (name);
+ if (idx >= 0) /* Curve found. */
+ {
+ result = domain_parms[idx].desc;
+ if (r_nbits)
+ *r_nbits = domain_parms[idx].nbits;
+ }
+ return result;
+ }
+
+ if (rc)
+ goto leave;
+
+ rc = point_from_keyparam (&G, keyparms, "g", NULL);
+ if (rc)
+ goto leave;
+
+ _gcry_mpi_point_init (&E.G);
+ _gcry_mpi_point_set (&E.G, G->x, G->y, G->z);
+
+ for (idx = 0; domain_parms[idx].desc; idx++)
+ {
+ mpi_free (tmp);
+ tmp = scanval (domain_parms[idx].p);
+ if (mpi_cmp (tmp, E.p))
+ continue;
+
+ mpi_free (tmp);
+ tmp = scanval (domain_parms[idx].a);
+ if (tmp->sign)
+ {
+ if (!mpi_cmpabs (tmp, E.a))
+ /* For backward compatibility to <= libgcrypt 1.8, we
+ allow this match to support existing keys in SEXP. */
+ ;
+ else
+ {
+ mpi_resize (tmp, E.p->nlimbs);
+ _gcry_mpih_sub_n (tmp->d, E.p->d,
+ tmp->d, E.p->nlimbs);
+ tmp->nlimbs = E.p->nlimbs;
+ tmp->sign = 0;
+ if (mpi_cmp (tmp, E.a))
+ continue;
+ }
+ }
+ else if (mpi_cmp (tmp, E.a))
+ continue;
+
+ mpi_free (tmp);
+ tmp = scanval (domain_parms[idx].b);
+ if (tmp->sign)
+ {
+ if (!mpi_cmpabs (tmp, E.b))
+ /* Same for backward compatibility, see above. */
+ ;
+ else
+ {
+ mpi_resize (tmp, E.p->nlimbs);
+ _gcry_mpih_sub_n (tmp->d, E.p->d,
+ tmp->d, E.p->nlimbs);
+ tmp->nlimbs = E.p->nlimbs;
+ tmp->sign = 0;
+ if (mpi_cmp (tmp, E.b))
+ continue;
+ }
+ }
+ else if (mpi_cmp (tmp, E.b))
+ continue;
+
+ mpi_free (tmp);
+ tmp = scanval (domain_parms[idx].n);
+ if (mpi_cmp (tmp, E.n))
+ continue;
+
+ mpi_free (tmp);
+ tmp = scanval (domain_parms[idx].g_x);
+ if (mpi_cmp (tmp, E.G.x))
+ continue;
+
+ mpi_free (tmp);
+ tmp = scanval (domain_parms[idx].g_y);
+ if (mpi_cmp (tmp, E.G.y))
+ continue;
+
+ result = domain_parms[idx].desc;
+ if (r_nbits)
+ *r_nbits = domain_parms[idx].nbits;
+ break;
+ }
+
+ leave:
+ _gcry_mpi_point_release (G);
+ _gcry_mpi_release (tmp);
+ _gcry_mpi_release (E.p);
+ _gcry_mpi_release (E.a);
+ _gcry_mpi_release (E.b);
+ _gcry_mpi_point_free_parts (&E.G);
+ _gcry_mpi_release (E.n);
+ return result;
+}
+
+
+/* Helper to extract an MPI from key parameters. */
+static gpg_err_code_t
+mpi_from_keyparam (gcry_mpi_t *r_a, gcry_sexp_t keyparam, const char *name,
+ int opaque)
+{
+ gcry_err_code_t ec = 0;
+ gcry_sexp_t l1;
+
+ l1 = sexp_find_token (keyparam, name, 0);
+ if (l1)
+ {
+ *r_a = sexp_nth_mpi (l1, 1, opaque? GCRYMPI_FMT_OPAQUE : GCRYMPI_FMT_USG);
+ sexp_release (l1);
+ if (!*r_a)
+ ec = GPG_ERR_INV_OBJ;
+ }
+ return ec;
+}
+
+/* Helper to extract a point from key parameters. If no parameter
+ with NAME is found, the function tries to find a non-encoded point
+ by appending ".x", ".y" and ".z" to NAME. ".z" is in this case
+ optional and defaults to 1. EC is the context which at this point
+ may not be fully initialized. */
+static gpg_err_code_t
+point_from_keyparam (gcry_mpi_point_t *r_a,
+ gcry_sexp_t keyparam, const char *name, mpi_ec_t ec)
+{
+ gcry_err_code_t rc;
+ gcry_sexp_t l1;
+ gcry_mpi_point_t point;
+
+ l1 = sexp_find_token (keyparam, name, 0);
+ if (l1)
+ {
+ gcry_mpi_t a;
+
+ a = sexp_nth_mpi (l1, 1, GCRYMPI_FMT_OPAQUE);
+ sexp_release (l1);
+ if (!a)
+ return GPG_ERR_INV_OBJ;
+
+ point = mpi_point_new (0);
+ rc = _gcry_mpi_ec_decode_point (point, a, ec);
+ mpi_free (a);
+ if (rc)
+ {
+ mpi_point_release (point);
+ return rc;
+ }
+ }
+ else
+ {
+ char *tmpname;
+ gcry_mpi_t x = NULL;
+ gcry_mpi_t y = NULL;
+ gcry_mpi_t z = NULL;
+
+ tmpname = xtrymalloc (strlen (name) + 2 + 1);
+ if (!tmpname)
+ return gpg_err_code_from_syserror ();
+ strcpy (stpcpy (tmpname, name), ".x");
+ rc = mpi_from_keyparam (&x, keyparam, tmpname, 0);
+ if (rc)
+ {
+ xfree (tmpname);
+ return rc;
+ }
+ strcpy (stpcpy (tmpname, name), ".y");
+ rc = mpi_from_keyparam (&y, keyparam, tmpname, 0);
+ if (rc)
+ {
+ mpi_free (x);
+ xfree (tmpname);
+ return rc;
+ }
+ strcpy (stpcpy (tmpname, name), ".z");
+ rc = mpi_from_keyparam (&z, keyparam, tmpname, 0);
+ if (rc)
+ {
+ mpi_free (y);
+ mpi_free (x);
+ xfree (tmpname);
+ return rc;
+ }
+ if (!z)
+ z = mpi_set_ui (NULL, 1);
+ if (x && y)
+ point = mpi_point_snatch_set (NULL, x, y, z);
+ else
+ {
+ mpi_free (x);
+ mpi_free (y);
+ mpi_free (z);
+ point = NULL;
+ }
+ xfree (tmpname);
+ }
+
+ if (point)
+ *r_a = point;
+ return 0;
+}
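The fallback path above searches for tokens literally named NAME.x, NAME.y and NAME.z. A hedged sketch of key parameters in that split-component shape (all MPI variables are placeholders assumed to be initialized by the caller; whether a particular front end accepts this form depends on how it feeds KEYPARAM into the code above):

    #include <gcrypt.h>

    /* Sketch: supply the base point as separate affine components
       instead of an encoded point.  Every MPI argument is assumed to
       be set up elsewhere. */
    static gcry_error_t
    build_split_point_params (gcry_sexp_t *r_keyparam,
                              gcry_mpi_t p, gcry_mpi_t a, gcry_mpi_t b,
                              gcry_mpi_t gx, gcry_mpi_t gy, gcry_mpi_t n)
    {
      return gcry_sexp_build (r_keyparam, NULL,
                              "(ecc (p %m)(a %m)(b %m)"
                              "(g.x %m)(g.y %m)(n %m))",
                              p, a, b, gx, gy, n);
    }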
+
+
+
+static gpg_err_code_t
+mpi_ec_get_elliptic_curve (elliptic_curve_t *E, int *r_flags,
+ gcry_sexp_t keyparam, const char *curvename)
+{
+ gpg_err_code_t errc;
+ unsigned int nbits;
+ gcry_sexp_t l1;
+
+ errc = _gcry_pk_util_get_nbits (keyparam, &nbits);
+ if (errc)
+ return errc;
+
+ E->model = MPI_EC_WEIERSTRASS;
+ E->dialect = ECC_DIALECT_STANDARD;
+ E->h = 1;
+
+ if (keyparam)
+ {
+ /* Parse an optional flags list. */
+ l1 = sexp_find_token (keyparam, "flags", 0);
+ if (l1)
+ {
+ int flags = 0;
+
+ errc = _gcry_pk_util_parse_flaglist (l1, &flags, NULL);
+ sexp_release (l1);
+ l1 = NULL;
+ if (errc)
+ goto leave;
+
+ *r_flags |= flags;
+ }
+
+ /* Parse the deprecated optional transient-key flag. */
+ l1 = sexp_find_token (keyparam, "transient-key", 0);
+ if (l1)
+ {
+ *r_flags |= PUBKEY_FLAG_TRANSIENT_KEY;
+ sexp_release (l1);
+ }
+
+ /* Check whether a curve name was given. */
+ l1 = sexp_find_token (keyparam, "curve", 5);
+
+ /* If we don't have a curve name or if override parameters have
+ explicitly been requested, parse them. */
+ if (!l1 || (*r_flags & PUBKEY_FLAG_PARAM))
+ {
+ gcry_mpi_point_t G = NULL;
+ gcry_mpi_t cofactor = NULL;
+
+ errc = mpi_from_keyparam (&E->p, keyparam, "p", 0);
+ if (errc)
+ goto leave;
+ errc = mpi_from_keyparam (&E->a, keyparam, "a", 0);
+ if (errc)
+ goto leave;
+ errc = mpi_from_keyparam (&E->b, keyparam, "b", 0);
+ if (errc)
+ goto leave;
+ errc = point_from_keyparam (&G, keyparam, "g", NULL);
+ if (errc)
+ goto leave;
+ if (G)
+ {
+ _gcry_mpi_point_init (&E->G);
+ mpi_point_set (&E->G, G->x, G->y, G->z);
+ mpi_point_set (G, NULL, NULL, NULL);
+ mpi_point_release (G);
+ }
+ errc = mpi_from_keyparam (&E->n, keyparam, "n", 0);
+ if (errc)
+ goto leave;
+ errc = mpi_from_keyparam (&cofactor, keyparam, "h", 0);
+ if (errc)
+ goto leave;
+ if (cofactor)
+ {
+ mpi_get_ui (&E->h, cofactor);
+ mpi_free (cofactor);
+ }
+ }
+ }
+ else
+ l1 = NULL; /* No curvename. */
+
+ /* Check whether a curve name parameter is available and use that to
+ fill in missing values.  If no such parameter is available, try an
+ optionally provided CURVENAME.  If only the curve name has been
+ given, use that one. */
+ if (l1 || curvename || nbits)
+ {
+ char *name;
+
+ if (l1)
+ {
+ name = sexp_nth_string (l1, 1);
+ sexp_release (l1);
+ if (!name)
+ {
+ errc = GPG_ERR_INV_OBJ; /* Name missing or out of core. */
+ goto leave;
+ }
+ }
+ else
+ name = NULL;
+
+ errc = _gcry_ecc_fill_in_curve (nbits, name? name : curvename, E, NULL);
+ xfree (name);
+ if (errc)
+ goto leave;
+ }
+
+ leave:
+ return errc;
+}
+
+static gpg_err_code_t
+mpi_ec_setup_elliptic_curve (mpi_ec_t ec, int flags,
+ elliptic_curve_t *E, gcry_sexp_t keyparam)
+{
+ gpg_err_code_t errc = 0;
+
+ ec->G = mpi_point_snatch_set (NULL, E->G.x, E->G.y, E->G.z);
+ E->G.x = NULL;
+ E->G.y = NULL;
+ E->G.z = NULL;
+ ec->n = E->n;
+ E->n = NULL;
+ ec->h = E->h;
+ ec->name = E->name;
+
+ /* Now that we know the curve name we can look for the public key
+ Q. point_from_keyparam needs to know the curve parameters so
+ that it is able to use the correct decompression. Parsing
+ the private key D could have been done earlier but it is less
+ surprising if we do it here as well. */
+ if (keyparam)
+ {
+ int is_opaque_bytes = ((ec->dialect == ECC_DIALECT_ED25519
+ && (flags & PUBKEY_FLAG_EDDSA))
+ || (ec->dialect == ECC_DIALECT_SAFECURVE));
+
+ errc = point_from_keyparam (&ec->Q, keyparam, "q", ec);
+ if (errc)
+ return errc;
+ errc = mpi_from_keyparam (&ec->d, keyparam, "d", is_opaque_bytes);
+
+ /* Size of opaque bytes should match size of P. */
+ if (!errc && ec->d && is_opaque_bytes)
+ {
+ unsigned int n = mpi_get_nbits (ec->d);
+ unsigned int len;
+
+ len = (ec->nbits+7)/8;
+ /* EdDSA requires additional bit for sign. */
+ if ((ec->nbits%8) == 0 && ec->model == MPI_EC_EDWARDS)
+ len++;
+
+ if ((n+7)/8 != len)
+ {
+ if (ec->dialect == ECC_DIALECT_ED25519)
+ {
+ /*
+ * GnuPG (<= 2.2) or OpenPGP implementations with no
+ * SOS support may remove zeros at the beginning.
+ * Recover those zeros.
+ */
+ /*
+ * Also, GnuPG (<= 2.2) may add additional zero at
+ * the beginning, when private key is moved from
+ * OpenPGP to gpg-agent. Remove such a zero-prefix.
+ */
+ const unsigned char *buf;
+ unsigned char *value;
+
+ buf = mpi_get_opaque (ec->d, &n);
+ if (!buf)
+ return GPG_ERR_INV_OBJ;
+
+ value = xtrymalloc_secure (len);
+ if (!value)
+ return gpg_err_code_from_syserror ();
+
+ if ((n+7)/8 < len)
+ /* Recover zeros. */
+ {
+ memset (value, 0, len - (n+7)/8);
+ memcpy (value + len - (n+7)/8, buf, (n+7)/8);
+ }
+ else if ((n+7)/8 == len + 1)
+ /* Remove a zero. */
+ memcpy (value, buf+1, len);
+ else
+ {
+ xfree (value);
+ return GPG_ERR_INV_OBJ;
+ }
+
+ mpi_set_opaque (ec->d, value, len*8);
+ }
+ else
+ {
+ if (DBG_CIPHER)
+ log_debug ("scalar size (%d) != prime size (%d)",
+ (n+7)/8, len);
+
+ errc = GPG_ERR_INV_OBJ;
+ }
+ }
+ }
+ }
+
+ return errc;
+}
+
+gpg_err_code_t
+_gcry_mpi_ec_internal_new (mpi_ec_t *r_ec, int *r_flags, const char *name_op,
+ gcry_sexp_t keyparam, const char *curvename)
+{
+ gpg_err_code_t errc;
+ elliptic_curve_t E;
+ mpi_ec_t ec;
+
+ *r_ec = NULL;
+
+ memset (&E, 0, sizeof E);
+ errc = mpi_ec_get_elliptic_curve (&E, r_flags, keyparam, curvename);
+ if (errc)
+ goto leave;
+
+ ec = _gcry_mpi_ec_p_internal_new (E.model, E.dialect, *r_flags,
+ E.p, E.a, E.b);
+ if (!ec)
+ goto leave;
+
+ errc = mpi_ec_setup_elliptic_curve (ec, *r_flags, &E, keyparam);
+ if (errc)
+ {
+ _gcry_mpi_ec_free (ec);
+ goto leave;
+ }
+ else
+ *r_ec = ec;
+
+ if (!errc && DBG_CIPHER)
+ {
+ gcry_mpi_t mpi_q = NULL;
+ gcry_sexp_t l1;
+ char msg[80];
+
+ l1 = sexp_find_token (keyparam, "q", 0);
+ if (l1)
+ {
+ mpi_q = sexp_nth_mpi (l1, 1, GCRYMPI_FMT_OPAQUE);
+ sexp_release (l1);
+ }
+
+ log_debug ("%s info: %s/%s%s\n", name_op,
+ _gcry_ecc_model2str (ec->model),
+ _gcry_ecc_dialect2str (ec->dialect),
+ (*r_flags & PUBKEY_FLAG_EDDSA)? "+EdDSA" : "");
+ if (ec->name)
+ log_debug ("%s name: %s\n", name_op, ec->name);
+ snprintf (msg, sizeof msg, "%s p", name_op);
+ log_printmpi (msg, ec->p);
+ snprintf (msg, sizeof msg, "%s a", name_op);
+ log_printmpi (msg, ec->a);
+ snprintf (msg, sizeof msg, "%s b", name_op);
+ log_printmpi (msg, ec->b);
+ snprintf (msg, sizeof msg, "%s g", name_op);
+ log_printpnt (msg, ec->G, NULL);
+ snprintf (msg, sizeof msg, "%s n", name_op);
+ log_printmpi (msg, ec->n);
+ log_debug ("%s h:+%02x\n", name_op, ec->h);
+ if (mpi_q)
+ {
+ snprintf (msg, sizeof msg, "%s q", name_op);
+ log_printmpi (msg, mpi_q);
+ mpi_free (mpi_q);
+ }
+ if (!fips_mode () && ec->d)
+ {
+ snprintf (msg, sizeof msg, "%s d", name_op);
+ log_printmpi (msg, ec->d);
+ }
+ }
+
+ leave:
+ _gcry_ecc_curve_free (&E);
+ return errc;
+}
+
+/* This function creates a new context for elliptic curve operations.
+ Either KEYPARAM or CURVENAME must be given. If both are given and
+ KEYPARAM has no curve parameter, CURVENAME is used to add missing
+ parameters. On success 0 is returned and the new context stored at
+ R_CTX. On error NULL is stored at R_CTX and an error code is
+ returned. The context needs to be released using
+ gcry_ctx_release. */
+gpg_err_code_t
+_gcry_mpi_ec_new (gcry_ctx_t *r_ctx,
+ gcry_sexp_t keyparam, const char *curvename)
+{
+ gpg_err_code_t errc;
+ elliptic_curve_t E;
+ gcry_ctx_t ctx = NULL;
+ int flags = 0;
+ mpi_ec_t ec;
+
+ *r_ctx = NULL;
+
+ memset (&E, 0, sizeof E);
+ errc = mpi_ec_get_elliptic_curve (&E, &flags, keyparam, curvename);
+ if (errc)
+ goto leave;
+
+ errc = _gcry_mpi_ec_p_new (&ctx, E.model, E.dialect, flags, E.p, E.a, E.b);
+ if (errc)
+ goto leave;
+
+ ec = _gcry_ctx_get_pointer (ctx, CONTEXT_TYPE_EC);
+ errc = mpi_ec_setup_elliptic_curve (ec, flags, &E, keyparam);
+ if (errc)
+ goto leave;
+
+ *r_ctx = ctx;
+ ctx = NULL;
+
+ leave:
+ _gcry_ecc_curve_free (&E);
+ _gcry_ctx_release (ctx);
+ return errc;
+}
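The public counterpart of this constructor is gcry_mpi_ec_new. A minimal sketch creating a context from a curve name only and releasing it again:

    #include <gcrypt.h>

    /* Sketch: set up and tear down an EC context for NIST P-256. */
    static gcry_error_t
    with_p256_context (void)
    {
      gcry_ctx_t ctx;
      gcry_error_t err = gcry_mpi_ec_new (&ctx, NULL, "NIST P-256");

      if (err)
        return err;              /* On error NULL is stored at CTX. */
      /* ... use the context ... */
      gcry_ctx_release (ctx);
      return 0;
    }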
+
+
+/* Return the parameters of the curve NAME as an S-expression. */
+gcry_sexp_t
+_gcry_ecc_get_param_sexp (const char *name)
+{
+ unsigned int nbits;
+ elliptic_curve_t E;
+ mpi_ec_t ctx;
+ gcry_mpi_t g_x, g_y;
+ gcry_mpi_t pkey[5];
+ gcry_sexp_t result;
+ int i;
+
+ memset (&E, 0, sizeof E);
+ if (_gcry_ecc_fill_in_curve (0, name, &E, &nbits))
+ return NULL;
+
+ g_x = mpi_new (0);
+ g_y = mpi_new (0);
+ ctx = _gcry_mpi_ec_p_internal_new (E.model,
+ E.dialect,
+ 0,
+ E.p, E.a, E.b);
+ if (_gcry_mpi_ec_get_affine (g_x, g_y, &E.G, ctx))
+ log_fatal ("ecc get param: Failed to get affine coordinates\n");
+ _gcry_mpi_ec_free (ctx);
+ _gcry_mpi_point_free_parts (&E.G);
+
+ pkey[0] = E.p;
+ pkey[1] = E.a;
+ pkey[2] = E.b;
+ pkey[3] = _gcry_ecc_ec2os (g_x, g_y, E.p);
+ pkey[4] = E.n;
+
+ mpi_free (g_x);
+ mpi_free (g_y);
+
+ if (sexp_build (&result, NULL,
+ "(public-key(ecc(p%m)(a%m)(b%m)(g%m)(n%m)(h%u)))",
+ pkey[0], pkey[1], pkey[2], pkey[3], pkey[4], E.h))
+ result = NULL;
+
+ for (i=0; i < DIM (pkey); i++)
+ _gcry_mpi_release (pkey[i]);
+
+ return result;
+}
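Applications get at the same parameter S-expression through gcry_pk_get_param. A short sketch printing the domain parameters of a named curve in advanced format:

    #include <stdio.h>
    #include <stdlib.h>
    #include <gcrypt.h>

    /* Sketch: dump the domain parameters of a named curve. */
    static void
    dump_curve_params (const char *name)
    {
      gcry_sexp_t params = gcry_pk_get_param (GCRY_PK_ECC, name);
      size_t len;
      char *buf;

      if (!params)
        return;
      len = gcry_sexp_sprint (params, GCRYSEXP_FMT_ADVANCED, NULL, 0);
      buf = malloc (len);
      if (buf)
        {
          gcry_sexp_sprint (params, GCRYSEXP_FMT_ADVANCED, buf, len);
          fputs (buf, stdout);
          free (buf);
        }
      gcry_sexp_release (params);
    }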
+
+
+/* Return an MPI (or opaque MPI) described by NAME and the context EC.
+ If COPY is true a copy is returned, if not a const MPI may be
+ returned. In any case mpi_free must be used. */
+gcry_mpi_t
+_gcry_ecc_get_mpi (const char *name, mpi_ec_t ec, int copy)
+{
+ if (!*name)
+ return NULL;
+
+ if (!strcmp (name, "p") && ec->p)
+ return mpi_is_const (ec->p) && !copy? ec->p : mpi_copy (ec->p);
+ if (!strcmp (name, "a") && ec->a)
+ return mpi_is_const (ec->a) && !copy? ec->a : mpi_copy (ec->a);
+ if (!strcmp (name, "b") && ec->b)
+ return mpi_is_const (ec->b) && !copy? ec->b : mpi_copy (ec->b);
+ if (!strcmp (name, "n") && ec->n)
+ return mpi_is_const (ec->n) && !copy? ec->n : mpi_copy (ec->n);
+ if (!strcmp (name, "h"))
+ {
+ gcry_mpi_t h = _gcry_mpi_get_const (ec->h);
+
+ return !copy? h : mpi_set (NULL, h);
+ }
+ if (!strcmp (name, "d") && ec->d)
+ return mpi_is_const (ec->d) && !copy? ec->d : mpi_copy (ec->d);
+
+ /* Return a requested point coordinate. */
+ if (!strcmp (name, "g.x") && ec->G && ec->G->x)
+ return mpi_is_const (ec->G->x) && !copy? ec->G->x : mpi_copy (ec->G->x);
+ if (!strcmp (name, "g.y") && ec->G && ec->G->y)
+ return mpi_is_const (ec->G->y) && !copy? ec->G->y : mpi_copy (ec->G->y);
+ if (!strcmp (name, "q.x") && ec->Q && ec->Q->x)
+ return mpi_is_const (ec->Q->x) && !copy? ec->Q->x : mpi_copy (ec->Q->x);
+ if (!strcmp (name, "q.y") && ec->Q && ec->Q->y)
+ return mpi_is_const (ec->Q->y) && !copy? ec->Q->y : mpi_copy (ec->Q->y);
+
+ /* If the base point has been requested, return it in standard
+ encoding. */
+ if (!strcmp (name, "g") && ec->G)
+ return _gcry_mpi_ec_ec2os (ec->G, ec);
+
+ /* If the public key has been requested, return it by default in
+ standard uncompressed encoding or if requested in other
+ encodings. */
+ if (*name == 'q' && (!name[1] || name[1] == '@'))
+ {
+ /* If only the private key is given, compute the public key. */
+ if (!ec->Q)
+ ec->Q = _gcry_ecc_compute_public (NULL, ec);
+
+ if (!ec->Q)
+ return NULL;
+
+ if (name[1] != '@')
+ return _gcry_mpi_ec_ec2os (ec->Q, ec);
+
+ if (!strcmp (name+2, "eddsa") && ec->model == MPI_EC_EDWARDS)
+ {
+ unsigned char *encpk;
+ unsigned int encpklen;
+
+ if (!_gcry_ecc_eddsa_encodepoint (ec->Q, ec, NULL, NULL, 0,
+ &encpk, &encpklen))
+ return mpi_set_opaque (NULL, encpk, encpklen*8);
+ }
+ }
+
+ return NULL;
+}
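From application code these lookups go through gcry_mpi_ec_get_mpi on a gcry_ctx_t. A small sketch fetching the public point, which per the branch above is computed from the secret scalar on the fly when it is not yet present:

    #include <gcrypt.h>

    /* Sketch: obtain the encoded public point "q" as a fresh MPI copy. */
    static gcry_mpi_t
    get_public_point (gcry_ctx_t ctx)
    {
      return gcry_mpi_ec_get_mpi ("q", ctx, 1);   /* 1 = always copy */
    }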
+
+
+/* Return a point described by NAME and the context EC. */
+gcry_mpi_point_t
+_gcry_ecc_get_point (const char *name, mpi_ec_t ec)
+{
+ if (!strcmp (name, "g") && ec->G)
+ return point_copy (ec->G);
+ if (!strcmp (name, "q"))
+ {
+ /* If only the private key is given, compute the public key. */
+ if (!ec->Q)
+ ec->Q = _gcry_ecc_compute_public (NULL, ec);
+
+ if (ec->Q)
+ return point_copy (ec->Q);
+ }
+
+ return NULL;
+}
+
+
+/* Store the MPI NEWVALUE into the context EC under NAME. */
+gpg_err_code_t
+_gcry_ecc_set_mpi (const char *name, gcry_mpi_t newvalue, mpi_ec_t ec)
+{
+ gpg_err_code_t rc = 0;
+
+ if (!*name)
+ ;
+ else if (!strcmp (name, "p"))
+ {
+ mpi_free (ec->p);
+ ec->p = mpi_copy (newvalue);
+ _gcry_mpi_ec_get_reset (ec);
+ }
+ else if (!strcmp (name, "a"))
+ {
+ mpi_free (ec->a);
+ ec->a = mpi_copy (newvalue);
+ _gcry_mpi_ec_get_reset (ec);
+ }
+ else if (!strcmp (name, "b"))
+ {
+ mpi_free (ec->b);
+ ec->b = mpi_copy (newvalue);
+ }
+ else if (!strcmp (name, "n"))
+ {
+ mpi_free (ec->n);
+ ec->n = mpi_copy (newvalue);
+ }
+ else if (!strcmp (name, "h"))
+ {
+ mpi_get_ui (&ec->h, newvalue);
+ }
+ else if (*name == 'q' && (!name[1] || name[1] == '@'))
+ {
+ if (newvalue)
+ {
+ if (!ec->Q)
+ ec->Q = mpi_point_new (0);
+ rc = _gcry_mpi_ec_decode_point (ec->Q, newvalue, ec);
+ }
+ if (rc || !newvalue)
+ {
+ _gcry_mpi_point_release (ec->Q);
+ ec->Q = NULL;
+ }
+ /* Note: We assume that Q matches d and thus do not reset d. */
+ }
+ else if (!strcmp (name, "d"))
+ {
+ mpi_free (ec->d);
+ ec->d = mpi_copy (newvalue);
+ if (ec->d)
+ {
+ /* We need to reset the public key because it may not
+ anymore match. */
+ _gcry_mpi_point_release (ec->Q);
+ ec->Q = NULL;
+ }
+ }
+ else
+ rc = GPG_ERR_UNKNOWN_NAME;
+
+ return rc;
+}
+
+
+/* Store the point NEWVALUE into the context EC under NAME. */
+gpg_err_code_t
+_gcry_ecc_set_point (const char *name, gcry_mpi_point_t newvalue, mpi_ec_t ec)
+{
+ if (!strcmp (name, "g"))
+ {
+ _gcry_mpi_point_release (ec->G);
+ ec->G = point_copy (newvalue);
+ }
+ else if (!strcmp (name, "q"))
+ {
+ _gcry_mpi_point_release (ec->Q);
+ ec->Q = point_copy (newvalue);
+ }
+ else
+ return GPG_ERR_UNKNOWN_NAME;
+
+ return 0;
+}
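The matching public setters are gcry_mpi_ec_set_mpi and gcry_mpi_ec_set_point. A minimal sketch installing a new secret scalar; as the code above shows, this drops any cached public point so that it is recomputed on demand:

    #include <gcrypt.h>

    /* Sketch: replace the secret scalar "d" in an existing EC context. */
    static gcry_error_t
    set_secret_scalar (gcry_ctx_t ctx, gcry_mpi_t d)
    {
      return gcry_mpi_ec_set_mpi ("d", d, ctx);
    }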
diff --git a/comm/third_party/libgcrypt/cipher/ecc-ecdh.c b/comm/third_party/libgcrypt/cipher/ecc-ecdh.c
new file mode 100644
index 0000000000..d6b8991af6
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/ecc-ecdh.c
@@ -0,0 +1,127 @@
+/* ecc-ecdh.c - Elliptic Curve Diffie-Hellman key agreement
+ * Copyright (C) 2019 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1+
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "mpi.h"
+#include "cipher.h"
+#include "context.h"
+#include "ec-context.h"
+#include "ecc-common.h"
+
+#define ECC_CURVE25519_BYTES 32
+#define ECC_CURVE448_BYTES 56
+
+static gpg_err_code_t
+prepare_ec (mpi_ec_t *r_ec, const char *name)
+{
+ int flags = 0;
+
+ if (!strcmp (name, "Curve25519"))
+ flags = PUBKEY_FLAG_DJB_TWEAK;
+
+ return _gcry_mpi_ec_internal_new (r_ec, &flags, "ecc_mul_point", NULL, name);
+}
+
+unsigned int
+_gcry_ecc_get_algo_keylen (int curveid)
+{
+ unsigned int len = 0;
+
+ if (curveid == GCRY_ECC_CURVE25519)
+ len = ECC_CURVE25519_BYTES;
+ else if (curveid == GCRY_ECC_CURVE448)
+ len = ECC_CURVE448_BYTES;
+
+ return len;
+}
+
+gpg_error_t
+_gcry_ecc_mul_point (int curveid, unsigned char *result,
+ const unsigned char *scalar, const unsigned char *point)
+{
+ unsigned int nbits;
+ unsigned int nbytes;
+ const char *curve;
+ gpg_err_code_t err;
+ gcry_mpi_t mpi_k;
+ mpi_ec_t ec;
+ mpi_point_struct Q;
+ gcry_mpi_t x;
+ unsigned int len;
+ unsigned char *buf;
+
+ if (curveid == GCRY_ECC_CURVE25519)
+ curve = "Curve25519";
+ else if (curveid == GCRY_ECC_CURVE448)
+ curve = "X448";
+ else
+ return gpg_error (GPG_ERR_UNKNOWN_CURVE);
+
+ err = prepare_ec (&ec, curve);
+ if (err)
+ return err;
+
+ nbits = ec->nbits;
+ nbytes = (nbits + 7)/8;
+
+ mpi_k = _gcry_mpi_set_opaque_copy (NULL, scalar, nbytes*8);
+ x = mpi_new (nbits);
+ point_init (&Q);
+
+ if (point)
+ {
+ gcry_mpi_t mpi_u = _gcry_mpi_set_opaque_copy (NULL, point, nbytes*8);
+ mpi_point_struct P;
+
+ point_init (&P);
+ err = _gcry_ecc_mont_decodepoint (mpi_u, ec, &P);
+ _gcry_mpi_release (mpi_u);
+ if (err)
+ goto leave;
+ _gcry_mpi_ec_mul_point (&Q, mpi_k, &P, ec);
+ point_free (&P);
+ }
+ else
+ _gcry_mpi_ec_mul_point (&Q, mpi_k, ec->G, ec);
+
+ _gcry_mpi_ec_get_affine (x, NULL, &Q, ec);
+
+ buf = _gcry_mpi_get_buffer (x, nbytes, &len, NULL);
+ if (!buf)
+ err = gpg_error_from_syserror ();
+ else
+ {
+ memcpy (result, buf, nbytes);
+ xfree (buf);
+ }
+
+ leave:
+ _gcry_mpi_release (x);
+ point_free (&Q);
+ _gcry_mpi_release (mpi_k);
+ _gcry_mpi_ec_free (ec);
+ return err;
+}
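Assuming this Libgcrypt version also exports the public wrappers gcry_ecc_get_algo_keylen and gcry_ecc_mul_point introduced alongside this file, an X25519 shared-secret computation looks roughly like this (all buffers are 32 bytes):

    #include <gcrypt.h>

    /* Sketch: X25519 key agreement.  Passing NULL as PEER_POINT would
       multiply the scalar with the curve's base point instead. */
    static gcry_error_t
    x25519_shared (unsigned char *result,
                   const unsigned char *scalar,
                   const unsigned char *peer_point)
    {
      if (gcry_ecc_get_algo_keylen (GCRY_ECC_CURVE25519) != 32)
        return gpg_error (GPG_ERR_INV_LENGTH);
      return gcry_ecc_mul_point (GCRY_ECC_CURVE25519, result,
                                 scalar, peer_point);
    }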
diff --git a/comm/third_party/libgcrypt/cipher/ecc-ecdsa.c b/comm/third_party/libgcrypt/cipher/ecc-ecdsa.c
new file mode 100644
index 0000000000..30103f1417
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/ecc-ecdsa.c
@@ -0,0 +1,248 @@
+/* ecc-ecdsa.c - Elliptic Curve ECDSA signatures
+ * Copyright (C) 2007, 2008, 2010, 2011 Free Software Foundation, Inc.
+ * Copyright (C) 2013 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "mpi.h"
+#include "cipher.h"
+#include "context.h"
+#include "ec-context.h"
+#include "pubkey-internal.h"
+#include "ecc-common.h"
+
+
+/* Compute an ECDSA signature.
+ * Return the signature struct (r,s) from the message hash. The caller
+ * must have allocated R and S.
+ */
+gpg_err_code_t
+_gcry_ecc_ecdsa_sign (gcry_mpi_t input, mpi_ec_t ec,
+ gcry_mpi_t r, gcry_mpi_t s,
+ int flags, int hashalgo)
+{
+ gpg_err_code_t rc = 0;
+ int extraloops = 0;
+ gcry_mpi_t k, dr, sum, k_1, x;
+ mpi_point_struct I;
+ gcry_mpi_t hash;
+ const void *abuf;
+ unsigned int abits, qbits;
+ gcry_mpi_t b; /* Random number needed for blinding. */
+ gcry_mpi_t bi; /* multiplicative inverse of B. */
+
+ if (DBG_CIPHER)
+ log_mpidump ("ecdsa sign hash ", input );
+
+ qbits = mpi_get_nbits (ec->n);
+
+ /* Convert the INPUT into an MPI if needed. */
+ rc = _gcry_dsa_normalize_hash (input, &hash, qbits);
+ if (rc)
+ return rc;
+
+ b = mpi_snew (qbits);
+ bi = mpi_snew (qbits);
+ do
+ {
+ _gcry_mpi_randomize (b, qbits, GCRY_WEAK_RANDOM);
+ mpi_mod (b, b, ec->n);
+ }
+ while (!mpi_invm (bi, b, ec->n));
+
+ k = NULL;
+ dr = mpi_alloc (0);
+ sum = mpi_alloc (0);
+ k_1 = mpi_alloc (0);
+ x = mpi_alloc (0);
+ point_init (&I);
+
+ /* Two loops to avoid R or S being zero.  This is more of a joke than
+ a real demand because the probability of them being zero is less
+ than any hardware failure. Some specs however require it. */
+ do
+ {
+ do
+ {
+ mpi_free (k);
+ k = NULL;
+ if ((flags & PUBKEY_FLAG_RFC6979) && hashalgo)
+ {
+ /* Use Pornin's method for deterministic DSA. If this
+ flag is set, it is expected that HASH is an opaque
+ MPI with the to be signed hash. That hash is also
+ used as h1 from 3.2.a. */
+ if (!mpi_is_opaque (input))
+ {
+ rc = GPG_ERR_CONFLICT;
+ goto leave;
+ }
+
+ abuf = mpi_get_opaque (input, &abits);
+ rc = _gcry_dsa_gen_rfc6979_k (&k, ec->n, ec->d,
+ abuf, (abits+7)/8,
+ hashalgo, extraloops);
+ if (rc)
+ goto leave;
+ extraloops++;
+ }
+ else
+ k = _gcry_dsa_gen_k (ec->n, GCRY_STRONG_RANDOM);
+
+ mpi_invm (k_1, k, ec->n); /* k_1 = k^(-1) mod n */
+
+ _gcry_dsa_modify_k (k, ec->n, qbits);
+
+ _gcry_mpi_ec_mul_point (&I, k, ec->G, ec);
+ if (_gcry_mpi_ec_get_affine (x, NULL, &I, ec))
+ {
+ if (DBG_CIPHER)
+ log_debug ("ecc sign: Failed to get affine coordinates\n");
+ rc = GPG_ERR_BAD_SIGNATURE;
+ goto leave;
+ }
+ mpi_mod (r, x, ec->n); /* r = x mod n */
+ }
+ while (!mpi_cmp_ui (r, 0));
+
+ /* Computation of dr, sum, and s are blinded with b. */
+ mpi_mulm (dr, b, ec->d, ec->n);
+ mpi_mulm (dr, dr, r, ec->n); /* dr = d*r mod n */
+ mpi_mulm (sum, b, hash, ec->n);
+ mpi_addm (sum, sum, dr, ec->n); /* sum = hash + (d*r) mod n */
+ mpi_mulm (s, k_1, sum, ec->n); /* s = k^(-1)*(hash+(d*r)) mod n */
+ /* Undo blinding by b^-1 */
+ mpi_mulm (s, bi, s, ec->n);
+ }
+ while (!mpi_cmp_ui (s, 0));
+
+ if (DBG_CIPHER)
+ {
+ log_mpidump ("ecdsa sign result r ", r);
+ log_mpidump ("ecdsa sign result s ", s);
+ }
+
+ leave:
+ mpi_free (b);
+ mpi_free (bi);
+ point_free (&I);
+ mpi_free (x);
+ mpi_free (k_1);
+ mpi_free (sum);
+ mpi_free (dr);
+ mpi_free (k);
+
+ if (hash != input)
+ mpi_free (hash);
+
+ return rc;
+}
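At the public API level this signing routine is reached via gcry_pk_sign. A hedged sketch of deterministic (RFC 6979) signing of a SHA-256 digest, assuming SKEY is a private ECC key S-expression:

    #include <gcrypt.h>

    /* Sketch: deterministic ECDSA over a 32-byte SHA-256 digest. */
    static gcry_error_t
    ecdsa_sign_digest (gcry_sexp_t *r_sig, gcry_sexp_t skey,
                       const unsigned char *digest /* 32 bytes */)
    {
      gcry_sexp_t data;
      gcry_error_t err;

      err = gcry_sexp_build (&data, NULL,
                             "(data (flags rfc6979) (hash sha256 %b))",
                             32, digest);
      if (err)
        return err;
      err = gcry_pk_sign (r_sig, data, skey);
      gcry_sexp_release (data);
      return err;
    }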
+
+
+/* Verify an ECDSA signature.
+ * Check whether R and S verify INPUT.
+ */
+gpg_err_code_t
+_gcry_ecc_ecdsa_verify (gcry_mpi_t input, mpi_ec_t ec,
+ gcry_mpi_t r, gcry_mpi_t s)
+{
+ gpg_err_code_t err = 0;
+ gcry_mpi_t hash, h, h1, h2, x;
+ mpi_point_struct Q, Q1, Q2;
+ unsigned int nbits;
+
+ if (!_gcry_mpi_ec_curve_point (ec->Q, ec))
+ return GPG_ERR_BROKEN_PUBKEY;
+
+ if( !(mpi_cmp_ui (r, 0) > 0 && mpi_cmp (r, ec->n) < 0) )
+ return GPG_ERR_BAD_SIGNATURE; /* Assertion 0 < r < n failed. */
+ if( !(mpi_cmp_ui (s, 0) > 0 && mpi_cmp (s, ec->n) < 0) )
+ return GPG_ERR_BAD_SIGNATURE; /* Assertion 0 < s < n failed. */
+
+ nbits = mpi_get_nbits (ec->n);
+ err = _gcry_dsa_normalize_hash (input, &hash, nbits);
+ if (err)
+ return err;
+
+ h = mpi_alloc (0);
+ h1 = mpi_alloc (0);
+ h2 = mpi_alloc (0);
+ x = mpi_alloc (0);
+ point_init (&Q);
+ point_init (&Q1);
+ point_init (&Q2);
+
+ /* h = s^(-1) (mod n) */
+ mpi_invm (h, s, ec->n);
+ /* h1 = hash * s^(-1) (mod n) */
+ mpi_mulm (h1, hash, h, ec->n);
+ /* Q1 = [ hash * s^(-1) ]G */
+ _gcry_mpi_ec_mul_point (&Q1, h1, ec->G, ec);
+ /* h2 = r * s^(-1) (mod n) */
+ mpi_mulm (h2, r, h, ec->n);
+ /* Q2 = [ r * s^(-1) ]Q */
+ _gcry_mpi_ec_mul_point (&Q2, h2, ec->Q, ec);
+ /* Q = ([hash * s^(-1)]G) + ([r * s^(-1)]Q) */
+ _gcry_mpi_ec_add_points (&Q, &Q1, &Q2, ec);
+
+ if (!mpi_cmp_ui (Q.z, 0))
+ {
+ if (DBG_CIPHER)
+ log_debug ("ecc verify: Rejected\n");
+ err = GPG_ERR_BAD_SIGNATURE;
+ goto leave;
+ }
+ if (_gcry_mpi_ec_get_affine (x, NULL, &Q, ec))
+ {
+ if (DBG_CIPHER)
+ log_debug ("ecc verify: Failed to get affine coordinates\n");
+ err = GPG_ERR_BAD_SIGNATURE;
+ goto leave;
+ }
+ mpi_mod (x, x, ec->n); /* x = x mod E_n */
+ if (mpi_cmp (x, r)) /* x != r */
+ {
+ if (DBG_CIPHER)
+ {
+ log_mpidump (" x", x);
+ log_mpidump (" r", r);
+ log_mpidump (" s", s);
+ }
+ err = GPG_ERR_BAD_SIGNATURE;
+ goto leave;
+ }
+
+ leave:
+ point_free (&Q2);
+ point_free (&Q1);
+ point_free (&Q);
+ mpi_free (x);
+ mpi_free (h2);
+ mpi_free (h1);
+ mpi_free (h);
+ if (hash != input)
+ mpi_free (hash);
+
+ return err;
+}
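A worked restatement of why the final comparison is sufficient: the signer produced s = k^(-1)*(h + d*r) mod n, so the verifier's point is (h*s^(-1))*G + (r*s^(-1))*Q = s^(-1)*(h + r*d)*G = k*G, because Q = d*G. Its affine x-coordinate reduced mod n is therefore exactly the r computed during signing, and any mismatch indicates an invalid signature.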
diff --git a/comm/third_party/libgcrypt/cipher/ecc-eddsa.c b/comm/third_party/libgcrypt/cipher/ecc-eddsa.c
new file mode 100644
index 0000000000..2a1a89073c
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/ecc-eddsa.c
@@ -0,0 +1,1182 @@
+/* ecc-eddsa.c - Elliptic Curve EdDSA signatures
+ * Copyright (C) 2013, 2014 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "mpi.h"
+#include "cipher.h"
+#include "context.h"
+#include "ec-context.h"
+#include "ecc-common.h"
+
+
+
+void
+reverse_buffer (unsigned char *buffer, unsigned int length)
+{
+ unsigned int tmp, i;
+
+ for (i=0; i < length/2; i++)
+ {
+ tmp = buffer[i];
+ buffer[i] = buffer[length-1-i];
+ buffer[length-1-i] = tmp;
+ }
+}
+
+
+/* Helper to scan a hex string. */
+static gcry_mpi_t
+scanval (const char *string)
+{
+ gpg_err_code_t rc;
+ gcry_mpi_t val;
+
+ rc = _gcry_mpi_scan (&val, GCRYMPI_FMT_HEX, string, 0, NULL);
+ if (rc)
+ log_fatal ("scanning ECC parameter failed: %s\n", gpg_strerror (rc));
+ return val;
+}
+
+
+
+/* Encode MPI using the EdDSA scheme. MINLEN specifies the required
+ length of the buffer in bytes.  On success 0 is returned and a
+ malloced buffer with the encoded point is stored at R_BUFFER; the
+ length of this buffer is stored at R_BUFLEN. */
+static gpg_err_code_t
+eddsa_encodempi (gcry_mpi_t mpi, unsigned int nbits,
+ unsigned char **r_buffer, unsigned int *r_buflen)
+{
+ unsigned char *rawmpi;
+ unsigned int rawmpilen;
+ unsigned int minlen = (nbits%8) == 0 ? (nbits/8 + 1): (nbits+7)/8;
+
+ rawmpi = _gcry_mpi_get_buffer (mpi, minlen, &rawmpilen, NULL);
+ if (!rawmpi)
+ return gpg_err_code_from_syserror ();
+
+ *r_buffer = rawmpi;
+ *r_buflen = rawmpilen;
+ return 0;
+}
+
+
+/* Encode (X,Y) using the EdDSA scheme. NBITS is the number of bits
+ of the field of the curve. If WITH_PREFIX is set the returned
+ buffer is prefixed with a 0x40 byte. On success 0 is returned and
+ a malloced buffer with the encoded point is stored at R_BUFFER; the
+ length of this buffer is stored at R_BUFLEN. */
+static gpg_err_code_t
+eddsa_encode_x_y (gcry_mpi_t x, gcry_mpi_t y, unsigned int nbits,
+ int with_prefix,
+ unsigned char **r_buffer, unsigned int *r_buflen)
+{
+ unsigned char *rawmpi;
+ unsigned int rawmpilen;
+ int off = with_prefix? 1:0;
+ unsigned int minlen = (nbits%8) == 0 ? (nbits/8 + 1): (nbits+7)/8;
+
+ rawmpi = _gcry_mpi_get_buffer_extra (y, minlen, off?-1:0, &rawmpilen, NULL);
+ if (!rawmpi)
+ return gpg_err_code_from_syserror ();
+ if (mpi_test_bit (x, 0) && rawmpilen)
+ rawmpi[off + rawmpilen - 1] |= 0x80; /* Set sign bit. */
+ if (off)
+ rawmpi[0] = 0x40;
+
+ *r_buffer = rawmpi;
+ *r_buflen = rawmpilen + off;
+ return 0;
+}
+
+/* Encode POINT using the EdDSA scheme. X and Y are either scratch
+ variables supplied by the caller or NULL. CTX is the usual
+ context. If WITH_PREFIX is set the returned buffer is prefixed
+ with a 0x40 byte. On success 0 is returned and a malloced buffer
+ with the encoded point is stored at R_BUFFER; the length of this
+ buffer is stored at R_BUFLEN. */
+gpg_err_code_t
+_gcry_ecc_eddsa_encodepoint (mpi_point_t point, mpi_ec_t ec,
+ gcry_mpi_t x_in, gcry_mpi_t y_in,
+ int with_prefix,
+ unsigned char **r_buffer, unsigned int *r_buflen)
+{
+ gpg_err_code_t rc;
+ gcry_mpi_t x, y;
+
+ x = x_in? x_in : mpi_new (0);
+ y = y_in? y_in : mpi_new (0);
+
+ if (_gcry_mpi_ec_get_affine (x, y, point, ec))
+ {
+ log_error ("eddsa_encodepoint: Failed to get affine coordinates\n");
+ rc = GPG_ERR_INTERNAL;
+ }
+ else
+ rc = eddsa_encode_x_y (x, y, ec->nbits, with_prefix, r_buffer, r_buflen);
+
+ if (!x_in)
+ mpi_free (x);
+ if (!y_in)
+ mpi_free (y);
+ return rc;
+}
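In the resulting RFC 8032 wire format, y is serialized little-endian and the parity of x travels in the most significant bit of the final byte. A tiny hedged sketch of splitting those two pieces out of an encoded point ENC of LEN bytes (without the optional 0x40 prefix):

    /* Sketch: separate the x-parity bit from an RFC 8032 encoded point. */
    static void
    split_encoded_point (unsigned char *enc, unsigned int len, int *r_x_parity)
    {
      *r_x_parity = (enc[len - 1] & 0x80) != 0;  /* parity of x */
      enc[len - 1] &= 0x7f;                      /* what remains is y, LE */
    }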
+
+
+/* Make sure that the opaque MPI VALUE is in compact EdDSA format.
+ This function updates VALUE in place if needed.  */
+gpg_err_code_t
+_gcry_ecc_eddsa_ensure_compact (gcry_mpi_t value, unsigned int nbits)
+{
+ gpg_err_code_t rc;
+ const unsigned char *buf;
+ unsigned int rawmpilen;
+ gcry_mpi_t x, y;
+ unsigned char *enc;
+ unsigned int enclen;
+
+ if (!mpi_is_opaque (value))
+ return GPG_ERR_INV_OBJ;
+ buf = mpi_get_opaque (value, &rawmpilen);
+ if (!buf)
+ return GPG_ERR_INV_OBJ;
+ rawmpilen = (rawmpilen + 7)/8;
+
+ if (rawmpilen > 1 && (rawmpilen%2))
+ {
+ if (buf[0] == 0x04)
+ {
+ /* Buffer is in SEC1 uncompressed format. Extract y and
+ compress. */
+ rc = _gcry_mpi_scan (&x, GCRYMPI_FMT_USG,
+ buf+1, (rawmpilen-1)/2, NULL);
+ if (rc)
+ return rc;
+ rc = _gcry_mpi_scan (&y, GCRYMPI_FMT_USG,
+ buf+1+(rawmpilen-1)/2, (rawmpilen-1)/2, NULL);
+ if (rc)
+ {
+ mpi_free (x);
+ return rc;
+ }
+
+ rc = eddsa_encode_x_y (x, y, nbits, 0, &enc, &enclen);
+ mpi_free (x);
+ mpi_free (y);
+ if (rc)
+ return rc;
+
+ mpi_set_opaque (value, enc, 8*enclen);
+ }
+ else if (buf[0] == 0x40)
+ {
+ /* Buffer is compressed but with our SEC1 alike compression
+ indicator. Remove that byte. FIXME: We should write and
+ use a function to manipulate an opaque MPI in place. */
+ if (!_gcry_mpi_set_opaque_copy (value, buf + 1, (rawmpilen - 1)*8))
+ return gpg_err_code_from_syserror ();
+ }
+ }
+
+ return 0;
+}
+
+
+static gpg_err_code_t
+ecc_ed448_recover_x (gcry_mpi_t x, gcry_mpi_t y, int x_0, mpi_ec_t ec)
+{
+ gpg_err_code_t rc = 0;
+ gcry_mpi_t u, v, u3, v3, t;
+ static gcry_mpi_t p34; /* Hard coded (P-3)/4 */
+
+ if (mpi_cmp (y, ec->p) >= 0)
+ rc = GPG_ERR_INV_OBJ;
+
+ if (!p34)
+ p34 = scanval ("3FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"
+ "BFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF");
+
+ u = mpi_new (0);
+ v = mpi_new (0);
+ u3 = mpi_new (0);
+ v3 = mpi_new (0);
+ t = mpi_new (0);
+
+ /* Compute u and v */
+ /* u = y^2 */
+ mpi_mulm (u, y, y, ec->p);
+ /* v = b*y^2 */
+ mpi_mulm (v, ec->b, u, ec->p);
+ /* u = y^2-1 */
+ mpi_sub_ui (u, u, 1);
+ /* v = b*y^2-1 */
+ mpi_sub_ui (v, v, 1);
+
+ /* Compute sqrt(u/v) */
+ /* u3 = u^3 */
+ mpi_powm (u3, u, mpi_const (MPI_C_THREE), ec->p);
+ mpi_powm (v3, v, mpi_const (MPI_C_THREE), ec->p);
+ /* t = u^4 * u * v3 = u^5 * v^3 */
+ mpi_powm (t, u, mpi_const (MPI_C_FOUR), ec->p);
+ mpi_mulm (t, t, u, ec->p);
+ mpi_mulm (t, t, v3, ec->p);
+ /* t = t^((p-3)/4) = (u^5 * v^3)^((p-3)/4) */
+ mpi_powm (t, t, p34, ec->p);
+ /* x = t * u^3 * v = (u^3 * v) * (u^5 * v^3)^((p-3)/4) */
+ mpi_mulm (t, t, u3, ec->p);
+ mpi_mulm (x, t, v, ec->p);
+
+ /* t = v * x^2 */
+ mpi_mulm (t, x, x, ec->p);
+ mpi_mulm (t, t, v, ec->p);
+
+ if (mpi_cmp (t, u) != 0)
+ rc = GPG_ERR_INV_OBJ;
+ else
+ {
+ if (!mpi_cmp_ui (x, 0) && x_0)
+ rc = GPG_ERR_INV_OBJ;
+
+ /* Choose the desired square root according to parity */
+ if (mpi_test_bit (x, 0) != !!x_0)
+ mpi_sub (x, ec->p, x);
+ }
+
+ mpi_free (t);
+ mpi_free (u3);
+ mpi_free (v3);
+ mpi_free (v);
+ mpi_free (u);
+
+ return rc;
+}
+
+
+/* Recover X from Y and SIGN (which actually is a parity bit). */
+gpg_err_code_t
+_gcry_ecc_eddsa_recover_x (gcry_mpi_t x, gcry_mpi_t y, int sign, mpi_ec_t ec)
+{
+ gpg_err_code_t rc = 0;
+ gcry_mpi_t u, v, v3, t;
+ static gcry_mpi_t p58, seven;
+
+ /*
+ * This routine is actually curve specific.  Currently only Ed25519
+ * and Ed448 are supported.
+ */
+
+ if (ec->dialect != ECC_DIALECT_ED25519)
+ /* For now, it's only Ed448. */
+ return ecc_ed448_recover_x (x, y, sign, ec);
+
+ /* It's Ed25519. */
+
+ if (!p58)
+ p58 = scanval ("0FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"
+ "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFD");
+ if (!seven)
+ seven = mpi_set_ui (NULL, 7);
+
+ u = mpi_new (0);
+ v = mpi_new (0);
+ v3 = mpi_new (0);
+ t = mpi_new (0);
+
+ /* Compute u and v */
+ /* u = y^2 */
+ mpi_mulm (u, y, y, ec->p);
+ /* v = b*y^2 */
+ mpi_mulm (v, ec->b, u, ec->p);
+ /* u = y^2-1 */
+ mpi_sub_ui (u, u, 1);
+ /* v = b*y^2+1 */
+ mpi_add_ui (v, v, 1);
+
+ /* Compute sqrt(u/v) */
+ /* v3 = v^3 */
+ mpi_powm (v3, v, mpi_const (MPI_C_THREE), ec->p);
+ /* t = v3 * v3 * u * v = u * v^7 */
+ mpi_powm (t, v, seven, ec->p);
+ mpi_mulm (t, t, u, ec->p);
+ /* t = t^((p-5)/8) = (u * v^7)^((p-5)/8) */
+ mpi_powm (t, t, p58, ec->p);
+ /* x = t * u * v^3 = (u * v^3) * (u * v^7)^((p-5)/8) */
+ mpi_mulm (t, t, u, ec->p);
+ mpi_mulm (x, t, v3, ec->p);
+
+ /* Adjust if needed. */
+ /* t = v * x^2 */
+ mpi_mulm (t, x, x, ec->p);
+ mpi_mulm (t, t, v, ec->p);
+ /* -t == u ? x = x * sqrt(-1) */
+ mpi_sub (t, ec->p, t);
+ if (!mpi_cmp (t, u))
+ {
+ static gcry_mpi_t m1; /* Fixme: this is not thread-safe. */
+ if (!m1)
+ m1 = scanval ("2B8324804FC1DF0B2B4D00993DFBD7A7"
+ "2F431806AD2FE478C4EE1B274A0EA0B0");
+ mpi_mulm (x, x, m1, ec->p);
+ /* t = v * x^2 */
+ mpi_mulm (t, x, x, ec->p);
+ mpi_mulm (t, t, v, ec->p);
+ /* -t == u ? x = x * sqrt(-1) */
+ mpi_sub (t, ec->p, t);
+ if (!mpi_cmp (t, u))
+ rc = GPG_ERR_INV_OBJ;
+ }
+
+ /* Choose the desired square root according to parity */
+ if (mpi_test_bit (x, 0) != !!sign)
+ mpi_sub (x, ec->p, x);
+
+ mpi_free (t);
+ mpi_free (v3);
+ mpi_free (v);
+ mpi_free (u);
+
+ return rc;
+}
+
+
+/* Decode the EdDSA style encoded PK and set it into RESULT. CTX is
+ the usual curve context. If R_ENCPK is not NULL, the encoded PK is
+ stored at that address; this is a new copy to be released by the
+ caller. In contrast to the supplied PK, this is not an MPI and
+ thus guaranteed to be properly padded. R_ENCPKLEN receives the
+ length of that encoded key. */
+gpg_err_code_t
+_gcry_ecc_eddsa_decodepoint (gcry_mpi_t pk, mpi_ec_t ctx, mpi_point_t result,
+ unsigned char **r_encpk, unsigned int *r_encpklen)
+{
+ gpg_err_code_t rc;
+ unsigned char *rawmpi;
+ unsigned int rawmpilen;
+ int sign;
+
+ if (mpi_is_opaque (pk))
+ {
+ const unsigned char *buf;
+ unsigned int len;
+
+ len = (ctx->nbits%8) == 0 ? (ctx->nbits/8 + 1): (ctx->nbits+7)/8;
+
+ buf = mpi_get_opaque (pk, &rawmpilen);
+ if (!buf)
+ return GPG_ERR_INV_OBJ;
+ rawmpilen = (rawmpilen + 7)/8;
+
+ if (!(rawmpilen == len
+ || rawmpilen == len + 1
+ || rawmpilen == len * 2 + 1))
+ return GPG_ERR_INV_OBJ;
+
+ /* Handle compression prefixes. The size of the buffer will be
+ odd in this case. */
+ if (rawmpilen > 1 && (rawmpilen == len + 1 || rawmpilen == len * 2 + 1))
+ {
+ /* First check whether the public key has been given in
+ standard uncompressed format (SEC1). No need to recover
+ x in this case. */
+ if (buf[0] == 0x04)
+ {
+ gcry_mpi_t x, y;
+
+ rc = _gcry_mpi_scan (&x, GCRYMPI_FMT_USG,
+ buf+1, (rawmpilen-1)/2, NULL);
+ if (rc)
+ return rc;
+ rc = _gcry_mpi_scan (&y, GCRYMPI_FMT_USG,
+ buf+1+(rawmpilen-1)/2, (rawmpilen-1)/2,NULL);
+ if (rc)
+ {
+ mpi_free (x);
+ return rc;
+ }
+
+ if (r_encpk)
+ {
+ rc = eddsa_encode_x_y (x, y, ctx->nbits, 0,
+ r_encpk, r_encpklen);
+ if (rc)
+ {
+ mpi_free (x);
+ mpi_free (y);
+ return rc;
+ }
+ }
+ mpi_snatch (result->x, x);
+ mpi_snatch (result->y, y);
+ mpi_set_ui (result->z, 1);
+ return 0;
+ }
+
+ /* Check whether the public key has been prefixed with a 0x40
+ byte to explicitly indicate compressed format using a SEC1
+ alike prefix byte. This is a Libgcrypt extension. */
+ if (buf[0] == 0x40)
+ {
+ rawmpilen--;
+ buf++;
+ }
+ }
+
+ /* EdDSA compressed point. */
+ rawmpi = xtrymalloc (rawmpilen);
+ if (!rawmpi)
+ return gpg_err_code_from_syserror ();
+ memcpy (rawmpi, buf, rawmpilen);
+ reverse_buffer (rawmpi, rawmpilen);
+ }
+ else
+ {
+ /* Note: Without using an opaque MPI it is not reliably possible
+ to find out whether the public key has been given in
+ uncompressed format. Thus we expect native EdDSA format. */
+ rawmpi = _gcry_mpi_get_buffer (pk, (ctx->nbits+7)/8, &rawmpilen, NULL);
+ if (!rawmpi)
+ return gpg_err_code_from_syserror ();
+ }
+
+ if (rawmpilen)
+ {
+ sign = !!(rawmpi[0] & 0x80);
+ rawmpi[0] &= 0x7f;
+ }
+ else
+ sign = 0;
+ _gcry_mpi_set_buffer (result->y, rawmpi, rawmpilen, 0);
+ if (r_encpk)
+ {
+ /* Revert to little endian. */
+ if (sign && rawmpilen)
+ rawmpi[0] |= 0x80;
+ reverse_buffer (rawmpi, rawmpilen);
+ *r_encpk = rawmpi;
+ if (r_encpklen)
+ *r_encpklen = rawmpilen;
+ }
+ else
+ xfree (rawmpi);
+
+ rc = _gcry_ecc_eddsa_recover_x (result->x, result->y, sign, ctx);
+ mpi_set_ui (result->z, 1);
+
+ return rc;
+}
+
+
+/* Compute the A value as used by EdDSA.  The caller needs to provide
+ the context EC with the secret D stored as an MPI.  On success a
+ newly allocated buffer of twice the (padded) field size (64 bytes
+ for Ed25519) is stored at R_DIGEST; the first half represents the A
+ value.  On error an error code is returned and NULL is stored at
+ R_DIGEST.  */
+gpg_err_code_t
+_gcry_ecc_eddsa_compute_h_d (unsigned char **r_digest, mpi_ec_t ec)
+{
+ gpg_err_code_t rc;
+ unsigned char *rawmpi = NULL;
+ unsigned int rawmpilen;
+ unsigned char *digest;
+ int hashalgo, b;
+
+ *r_digest = NULL;
+
+ b = (ec->nbits+7)/8;
+
+ /*
+ * Choice of hashalgo is curve specific.
+ * For now, it's determined by the bit size of the field.
+ */
+ if (ec->nbits == 255)
+ hashalgo = GCRY_MD_SHA512;
+ else if (ec->nbits == 448)
+ {
+ b++;
+ hashalgo = GCRY_MD_SHAKE256;
+ }
+ else
+ return GPG_ERR_NOT_IMPLEMENTED;
+
+ /* Note that we clear DIGEST so we can use it as input to left pad
+ the key with zeroes for hashing. */
+ digest = xtrycalloc_secure (2, b);
+ if (!digest)
+ return gpg_err_code_from_syserror ();
+
+ rawmpi = _gcry_mpi_get_buffer (ec->d, 0, &rawmpilen, NULL);
+ if (!rawmpi)
+ {
+ xfree (digest);
+ return gpg_err_code_from_syserror ();
+ }
+
+ if (hashalgo == GCRY_MD_SHAKE256)
+ {
+ gcry_error_t err;
+ gcry_md_hd_t hd;
+
+ err = _gcry_md_open (&hd, hashalgo, 0);
+ if (err)
+ rc = gcry_err_code (err);
+ else
+ {
+ _gcry_md_write (hd, rawmpi, rawmpilen);
+ _gcry_md_ctl (hd, GCRYCTL_FINALIZE, NULL, 0);
+ _gcry_md_extract (hd, GCRY_MD_SHAKE256, digest, 2*b);
+ _gcry_md_close (hd);
+ rc = 0;
+ }
+ }
+ else
+ {
+ gcry_buffer_t hvec[2];
+
+ memset (hvec, 0, sizeof hvec);
+
+ hvec[0].data = digest;
+ hvec[0].len = b > rawmpilen? b - rawmpilen : 0;
+ hvec[1].data = rawmpi;
+ hvec[1].len = rawmpilen;
+ rc = _gcry_md_hash_buffers (hashalgo, 0, digest, hvec, 2);
+ }
+
+ xfree (rawmpi);
+ if (rc)
+ {
+ xfree (digest);
+ return rc;
+ }
+
+ /* Compute the A value. */
+ reverse_buffer (digest, b); /* Only the first half of the hash. */
+
+ /* Field specific handling of clearing/setting bits. */
+ if (ec->nbits == 255)
+ {
+ digest[0] = (digest[0] & 0x7f) | 0x40;
+ digest[31] &= 0xf8;
+ }
+ else
+ {
+ digest[0] = 0;
+ digest[1] |= 0x80;
+ digest[56] &= 0xfc;
+ }
+
+ *r_digest = digest;
+ return 0;
+}
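The bit fiddling above is applied to a byte-reversed (big-endian) copy of the hash; expressed directly on the little-endian secret scalar, it is the usual RFC 8032 Ed25519 clamping, sketched here for reference:

    /* Sketch: RFC 8032 clamping of a 32-byte little-endian Ed25519 scalar. */
    static void
    clamp_ed25519_scalar (unsigned char a[32])
    {
      a[0]  &= 0xf8;   /* clear the three lowest bits */
      a[31] &= 0x7f;   /* clear the highest bit       */
      a[31] |= 0x40;   /* set the second-highest bit  */
    }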
+
+
+/**
+ * _gcry_ecc_eddsa_genkey - EdDSA version of the key generation.
+ *
+ * @ec: Elliptic curve computation context.
+ * @flags: Flags controlling aspects of the creation.
+ *
+ * Return: An error code.
+ *
+ * The only @flags bit used by this function is %PUBKEY_FLAG_TRANSIENT_KEY
+ * to use a faster RNG.
+ */
+gpg_err_code_t
+_gcry_ecc_eddsa_genkey (mpi_ec_t ec, int flags)
+{
+ gpg_err_code_t rc;
+ int b;
+ gcry_mpi_t a, x, y;
+ mpi_point_struct Q;
+ gcry_random_level_t random_level;
+ char *dbuf;
+ size_t dlen;
+ unsigned char *hash_d = NULL;
+
+ point_init (&Q);
+
+ if ((flags & PUBKEY_FLAG_TRANSIENT_KEY))
+ random_level = GCRY_STRONG_RANDOM;
+ else
+ random_level = GCRY_VERY_STRONG_RANDOM;
+
+ b = (ec->nbits+7)/8;
+
+ if (ec->nbits == 255)
+ ;
+ else if (ec->nbits == 448)
+ b++;
+ else
+ return GPG_ERR_NOT_IMPLEMENTED;
+
+ dlen = b;
+
+ a = mpi_snew (0);
+ x = mpi_new (0);
+ y = mpi_new (0);
+
+ /* Generate a secret. */
+ dbuf = _gcry_random_bytes_secure (dlen, random_level);
+ ec->d = _gcry_mpi_set_opaque (NULL, dbuf, dlen*8);
+ rc = _gcry_ecc_eddsa_compute_h_d (&hash_d, ec);
+ if (rc)
+ goto leave;
+
+ _gcry_mpi_set_buffer (a, hash_d, b, 0);
+ xfree (hash_d);
+ /* log_printmpi ("ecgen a", a); */
+
+ /* Compute Q. */
+ _gcry_mpi_ec_mul_point (&Q, a, ec->G, ec);
+ if (DBG_CIPHER)
+ log_printpnt ("ecgen pk", &Q, ec);
+
+ ec->Q = mpi_point_snatch_set (NULL, Q.x, Q.y, Q.z);
+ Q.x = NULL;
+ Q.y = NULL;
+ Q.z = NULL;
+
+ leave:
+ _gcry_mpi_release (a);
+ _gcry_mpi_release (x);
+ _gcry_mpi_release (y);
+ return rc;
+}
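Through the public interface this corresponds to gcry_pk_genkey with the eddsa flag. A minimal sketch generating an Ed25519 key pair:

    #include <gcrypt.h>

    /* Sketch: generate an Ed25519 key pair as an S-expression. */
    static gcry_error_t
    gen_ed25519_key (gcry_sexp_t *r_key)
    {
      gcry_sexp_t parms;
      gcry_error_t err;

      err = gcry_sexp_build (&parms, NULL,
                             "(genkey (ecc (curve Ed25519) (flags eddsa)))");
      if (err)
        return err;
      err = gcry_pk_genkey (r_key, parms);
      gcry_sexp_release (parms);
      return err;
    }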
+
+
+/* Compute an EdDSA signature. See:
+ * [ed25519] 23pp. (PDF) Daniel J. Bernstein, Niels Duif, Tanja
+ * Lange, Peter Schwabe, Bo-Yin Yang. High-speed high-security
+ * signatures. Journal of Cryptographic Engineering 2 (2012), 77-89.
+ * Document ID: a1a62a2f76d23f65d622484ddd09caf8.
+ * URL: http://cr.yp.to/papers.html#ed25519. Date: 2011.09.26.
+ *
+ * Although this function requires the specification of a hash
+ * algorithm, only the algorithm specified by the paper is supported.
+ * This may change in the future.
+ *
+ * Return the signature struct (r,s) from the message hash. The caller
+ * must have allocated R_R and S.
+ */
+
+/* Domain separation strings used for Ed25519 (dom2) and Ed448 (dom4). */
+#define DOM25519 "SigEd25519 no Ed25519 collisions"
+#define DOM25519_LEN 32
+#define DOM448 "SigEd448"
+#define DOM448_LEN 8
+
+gpg_err_code_t
+_gcry_ecc_eddsa_sign (gcry_mpi_t input, mpi_ec_t ec,
+ gcry_mpi_t r_r, gcry_mpi_t s,
+ struct pk_encoding_ctx *ctx)
+{
+ int rc;
+ unsigned int tmp;
+ unsigned char *digest = NULL;
+ const void *mbuf;
+ size_t mlen;
+ unsigned char *rawmpi = NULL;
+ unsigned int rawmpilen;
+ unsigned char *encpk = NULL; /* Encoded public key. */
+ unsigned int encpklen;
+ mpi_point_struct I; /* Intermediate value. */
+ gcry_mpi_t a, x, y, r;
+ int b;
+ unsigned char x_olen[2];
+ unsigned char prehashed_msg[64];
+
+ b = (ec->nbits+7)/8;
+
+ if (ec->nbits == 255)
+ ;
+ else if (ec->nbits == 448)
+ b++;
+ else
+ return GPG_ERR_NOT_IMPLEMENTED;
+
+ if (!mpi_is_opaque (input))
+ return GPG_ERR_INV_DATA;
+
+ /* Initialize some helpers. */
+ point_init (&I);
+ a = mpi_snew (0);
+ x = mpi_new (0);
+ y = mpi_new (0);
+ r = mpi_snew (0);
+
+ rc = _gcry_ecc_eddsa_compute_h_d (&digest, ec);
+ if (rc)
+ goto leave;
+ _gcry_mpi_set_buffer (a, digest, b, 0);
+
+ /* Compute the public key if it's not available (only secret part). */
+ if (ec->Q == NULL)
+ {
+ mpi_point_struct Q;
+
+ point_init (&Q);
+ _gcry_mpi_ec_mul_point (&Q, a, ec->G, ec);
+ ec->Q = mpi_point_snatch_set (NULL, Q.x, Q.y, Q.z);
+ }
+ rc = _gcry_ecc_eddsa_encodepoint (ec->Q, ec, x, y, 0, &encpk, &encpklen);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ log_printhex (" e_pk", encpk, encpklen);
+
+ /* Compute R. */
+ mbuf = mpi_get_opaque (input, &tmp);
+ mlen = (tmp +7)/8;
+ if (DBG_CIPHER)
+ log_printhex (" m", mbuf, mlen);
+
+ if (ctx->hash_algo == GCRY_MD_SHAKE256)
+ {
+ gcry_error_t err;
+ gcry_md_hd_t hd;
+
+ err = _gcry_md_open (&hd, ctx->hash_algo, 0);
+ if (err)
+ rc = gcry_err_code (err);
+ else
+ {
+ _gcry_md_write (hd, DOM448, DOM448_LEN);
+ x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH);
+ x_olen[1] = ctx->labellen;
+ _gcry_md_write (hd, x_olen, 2);
+ if (ctx->labellen)
+ _gcry_md_write (hd, ctx->label, ctx->labellen);
+ _gcry_md_write (hd, digest+b, b);
+ if ((ctx->flags & PUBKEY_FLAG_PREHASH))
+ {
+ gcry_md_hd_t hd2;
+
+ err = _gcry_md_open (&hd2, ctx->hash_algo, 0);
+ if (err)
+ {
+ rc = gcry_err_code (err);
+ _gcry_md_close (hd);
+ goto leave;
+ }
+ _gcry_md_write (hd2, mbuf, mlen);
+ _gcry_md_ctl (hd2, GCRYCTL_FINALIZE, NULL, 0);
+ _gcry_md_extract (hd2, GCRY_MD_SHAKE256, prehashed_msg, 64);
+ _gcry_md_close (hd2);
+ _gcry_md_write (hd, prehashed_msg, 64);
+ }
+ else
+ _gcry_md_write (hd, mbuf, mlen);
+ _gcry_md_ctl (hd, GCRYCTL_FINALIZE, NULL, 0);
+ _gcry_md_extract (hd, GCRY_MD_SHAKE256, digest, 2*b);
+ _gcry_md_close (hd);
+ rc = 0;
+ }
+ }
+ else
+ {
+ gcry_buffer_t hvec[6];
+ int i = 0;
+
+ memset (hvec, 0, sizeof hvec);
+
+ if ((ctx->flags & PUBKEY_FLAG_PREHASH) || ctx->labellen)
+ {
+ hvec[i].data = (void *)DOM25519;
+ hvec[i].len = DOM25519_LEN;
+ i++;
+ x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH);
+ x_olen[1] = ctx->labellen;
+ hvec[i].data = x_olen;
+ hvec[i].len = 2;
+ i++;
+ if (ctx->labellen)
+ {
+ hvec[i].data = ctx->label;
+ hvec[i].len = ctx->labellen;
+ i++;
+ }
+ }
+
+ hvec[i].data = digest;
+ hvec[i].off = b;
+ hvec[i].len = b;
+ i++;
+ if ((ctx->flags & PUBKEY_FLAG_PREHASH))
+ {
+ _gcry_md_hash_buffer (ctx->hash_algo, prehashed_msg, mbuf, mlen);
+ hvec[i].data = (char*)prehashed_msg;
+ hvec[i].len = 64;
+ }
+ else
+ {
+ hvec[i].data = (char*)mbuf;
+ hvec[i].len = mlen;
+ }
+ i++;
+ rc = _gcry_md_hash_buffers (ctx->hash_algo, 0, digest, hvec, i);
+ }
+
+ if (rc)
+ goto leave;
+ reverse_buffer (digest, 2*b);
+ if (DBG_CIPHER)
+ log_printhex (" r", digest, 2*b);
+ _gcry_mpi_set_buffer (r, digest, 2*b, 0);
+ mpi_mod (r, r, ec->n);
+ _gcry_mpi_ec_mul_point (&I, r, ec->G, ec);
+ if (DBG_CIPHER)
+ log_printpnt (" r", &I, ec);
+
+ /* Convert R into affine coordinates and apply encoding. */
+ rc = _gcry_ecc_eddsa_encodepoint (&I, ec, x, y, 0, &rawmpi, &rawmpilen);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ log_printhex (" e_r", rawmpi, rawmpilen);
+
+ if (ctx->hash_algo == GCRY_MD_SHAKE256)
+ {
+ gcry_error_t err;
+ gcry_md_hd_t hd;
+
+ err = _gcry_md_open (&hd, ctx->hash_algo, 0);
+ if (err)
+ rc = gcry_err_code (err);
+ else
+ {
+ _gcry_md_write (hd, DOM448, DOM448_LEN);
+ x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH);
+ x_olen[1] = ctx->labellen;
+ _gcry_md_write (hd, x_olen, 2);
+ if (ctx->labellen)
+ _gcry_md_write (hd, ctx->label, ctx->labellen);
+ _gcry_md_write (hd, rawmpi, rawmpilen);
+ _gcry_md_write (hd, encpk, encpklen);
+ if ((ctx->flags & PUBKEY_FLAG_PREHASH))
+ _gcry_md_write (hd, prehashed_msg, 64);
+ else
+ _gcry_md_write (hd, mbuf, mlen);
+ _gcry_md_ctl (hd, GCRYCTL_FINALIZE, NULL, 0);
+ _gcry_md_extract (hd, GCRY_MD_SHAKE256, digest, 2*b);
+ _gcry_md_close (hd);
+ rc = 0;
+ }
+ }
+ else
+ {
+ gcry_buffer_t hvec[6];
+ int i = 0;
+
+ memset (hvec, 0, sizeof hvec);
+
+ if ((ctx->flags & PUBKEY_FLAG_PREHASH) || ctx->labellen)
+ {
+ hvec[i].data = (void *)DOM25519;
+ hvec[i].len = DOM25519_LEN;
+ i++;
+ x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH);
+ x_olen[1] = ctx->labellen;
+ hvec[i].data = x_olen;
+ hvec[i].len = 2;
+ i++;
+ if (ctx->labellen)
+ {
+ hvec[i].data = ctx->label;
+ hvec[i].len = ctx->labellen;
+ i++;
+ }
+ }
+
+ /* S = r + a * H(dom2(F,C)+encodepoint(R)+encodepoint(pk)+m) mod n */
+ hvec[i].data = rawmpi; /* (this is R) */
+ hvec[i].len = rawmpilen;
+ i++;
+ hvec[i].data = encpk;
+ hvec[i].len = encpklen;
+ i++;
+ if ((ctx->flags & PUBKEY_FLAG_PREHASH))
+ {
+ hvec[i].data = (char*)prehashed_msg;
+ hvec[i].len = 64;
+ }
+ else
+ {
+ hvec[i].data = (char*)mbuf;
+ hvec[i].len = mlen;
+ }
+ i++;
+ rc = _gcry_md_hash_buffers (ctx->hash_algo, 0, digest, hvec, i);
+ }
+
+ if (rc)
+ goto leave;
+
+ /* No more need for RAWMPI thus we now transfer it to R_R. */
+ mpi_set_opaque (r_r, rawmpi, rawmpilen*8);
+ rawmpi = NULL;
+
+ reverse_buffer (digest, 2*b);
+ if (DBG_CIPHER)
+ log_printhex (" H(R+)", digest, 2*b);
+ _gcry_mpi_set_buffer (s, digest, 2*b, 0);
+ mpi_mulm (s, s, a, ec->n);
+ mpi_addm (s, s, r, ec->n);
+ rc = eddsa_encodempi (s, ec->nbits, &rawmpi, &rawmpilen);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ log_printhex (" e_s", rawmpi, rawmpilen);
+ mpi_set_opaque (s, rawmpi, rawmpilen*8);
+ rawmpi = NULL;
+
+ rc = 0;
+
+ leave:
+ _gcry_mpi_release (a);
+ _gcry_mpi_release (x);
+ _gcry_mpi_release (y);
+ _gcry_mpi_release (r);
+ xfree (digest);
+ point_free (&I);
+ xfree (encpk);
+ xfree (rawmpi);
+ return rc;
+}
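Callers reach this through gcry_pk_sign with a data S-expression carrying the eddsa flag; note that the full message is passed, not a digest. A hedged sketch for Ed25519 (SKEY as produced by the key-generation sketch above); verification uses the same data object with gcry_pk_verify:

    #include <gcrypt.h>

    /* Sketch: PureEdDSA (Ed25519) signature over MSG of MSGLEN bytes. */
    static gcry_error_t
    ed25519_sign (gcry_sexp_t *r_sig, gcry_sexp_t skey,
                  const void *msg, size_t msglen)
    {
      gcry_sexp_t data;
      gcry_error_t err;

      err = gcry_sexp_build (&data, NULL,
                             "(data (flags eddsa) (hash-algo sha512) (value %b))",
                             (int)msglen, msg);
      if (err)
        return err;
      err = gcry_pk_sign (r_sig, data, skey);
      gcry_sexp_release (data);
      return err;
    }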
+
+
+/* Verify an EdDSA signature.  See _gcry_ecc_eddsa_sign for references.
+ * Check whether R_IN and S_IN verify INPUT.
+ */
+gpg_err_code_t
+_gcry_ecc_eddsa_verify (gcry_mpi_t input, mpi_ec_t ec,
+ gcry_mpi_t r_in, gcry_mpi_t s_in,
+ struct pk_encoding_ctx *ctx)
+{
+ int rc;
+ int b;
+ unsigned int tmp;
+ unsigned char *encpk = NULL; /* Encoded public key. */
+ unsigned int encpklen;
+ const void *mbuf, *rbuf;
+ unsigned char *tbuf = NULL;
+ size_t mlen, rlen;
+ unsigned int tlen;
+ unsigned char digest[114];
+ gcry_mpi_t h, s;
+ mpi_point_struct Ia, Ib;
+ unsigned char x_olen[2];
+ unsigned char prehashed_msg[64];
+
+ if (!mpi_is_opaque (input) || !mpi_is_opaque (r_in) || !mpi_is_opaque (s_in))
+ return GPG_ERR_INV_DATA;
+
+ point_init (&Ia);
+ point_init (&Ib);
+ h = mpi_new (0);
+ s = mpi_new (0);
+
+ b = (ec->nbits+7)/8;
+
+ if (ec->nbits == 255)
+ ;
+ else if (ec->nbits == 448)
+ b++;
+ else
+ return GPG_ERR_NOT_IMPLEMENTED;
+
+ /* Encode and check the public key. */
+ rc = _gcry_ecc_eddsa_encodepoint (ec->Q, ec, NULL, NULL, 0,
+ &encpk, &encpklen);
+ if (rc)
+ goto leave;
+ if (!_gcry_mpi_ec_curve_point (ec->Q, ec))
+ {
+ rc = GPG_ERR_BROKEN_PUBKEY;
+ goto leave;
+ }
+ if (DBG_CIPHER)
+ log_printhex (" e_pk", encpk, encpklen);
+ if (encpklen != b)
+ {
+ rc = GPG_ERR_INV_LENGTH;
+ goto leave;
+ }
+
+ /* Convert the other input parameters. */
+ mbuf = mpi_get_opaque (input, &tmp);
+ mlen = (tmp +7)/8;
+ if (DBG_CIPHER)
+ log_printhex (" m", mbuf, mlen);
+ rbuf = mpi_get_opaque (r_in, &tmp);
+ rlen = (tmp +7)/8;
+ if (DBG_CIPHER)
+ log_printhex (" r", rbuf, rlen);
+ if (rlen != b)
+ {
+ rc = GPG_ERR_INV_LENGTH;
+ goto leave;
+ }
+
+ if (ctx->hash_algo == GCRY_MD_SHAKE256)
+ {
+ gcry_error_t err;
+ gcry_md_hd_t hd;
+
+ err = _gcry_md_open (&hd, ctx->hash_algo, 0);
+ if (err)
+ rc = gcry_err_code (err);
+ else
+ {
+ _gcry_md_write (hd, DOM448, DOM448_LEN);
+ x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH);
+ x_olen[1] = ctx->labellen;
+ _gcry_md_write (hd, x_olen, 2);
+ if (ctx->labellen)
+ _gcry_md_write (hd, ctx->label, ctx->labellen);
+ _gcry_md_write (hd, rbuf, rlen);
+ _gcry_md_write (hd, encpk, encpklen);
+ if ((ctx->flags & PUBKEY_FLAG_PREHASH))
+ {
+ gcry_md_hd_t hd2;
+
+ err = _gcry_md_open (&hd2, ctx->hash_algo, 0);
+ if (err)
+ {
+ rc = gcry_err_code (err);
+ _gcry_md_close (hd);
+ goto leave;
+ }
+ _gcry_md_write (hd2, mbuf, mlen);
+ _gcry_md_ctl (hd2, GCRYCTL_FINALIZE, NULL, 0);
+ _gcry_md_extract (hd2, GCRY_MD_SHAKE256, prehashed_msg, 64);
+ _gcry_md_close (hd2);
+ _gcry_md_write (hd, prehashed_msg, 64);
+ }
+ else
+ _gcry_md_write (hd, mbuf, mlen);
+ _gcry_md_ctl (hd, GCRYCTL_FINALIZE, NULL, 0);
+ _gcry_md_extract (hd, GCRY_MD_SHAKE256, digest, 2*b);
+ _gcry_md_close (hd);
+ rc = 0;
+ }
+ }
+ else
+ {
+ gcry_buffer_t hvec[6];
+ int i = 0;
+
+ memset (hvec, 0, sizeof hvec);
+
+ /* h = H(dom2(F,C)+encodepoint(R)+encodepoint(pk)+m) */
+ if ((ctx->flags & PUBKEY_FLAG_PREHASH) || ctx->labellen)
+ {
+ hvec[i].data = (void *)DOM25519;
+ hvec[i].len = DOM25519_LEN;
+ i++;
+ x_olen[0] = !!(ctx->flags & PUBKEY_FLAG_PREHASH);
+ x_olen[1] = ctx->labellen;
+ hvec[i].data = x_olen;
+ hvec[i].len = 2;
+ i++;
+ if (ctx->labellen)
+ {
+ hvec[i].data = ctx->label;
+ hvec[i].len = ctx->labellen;
+ i++;
+ }
+ }
+
+ hvec[i].data = (char*)rbuf;
+ hvec[i].len = rlen;
+ i++;
+ hvec[i].data = encpk;
+ hvec[i].len = encpklen;
+ i++;
+ if ((ctx->flags & PUBKEY_FLAG_PREHASH))
+ {
+ _gcry_md_hash_buffer (ctx->hash_algo, prehashed_msg, mbuf, mlen);
+ hvec[i].data = (char*)prehashed_msg;
+ hvec[i].len = 64;
+ }
+ else
+ {
+ hvec[i].data = (char*)mbuf;
+ hvec[i].len = mlen;
+ }
+ i++;
+ rc = _gcry_md_hash_buffers (ctx->hash_algo, 0, digest, hvec, i);
+ }
+
+ if (rc)
+ goto leave;
+ reverse_buffer (digest, 2*b);
+ if (DBG_CIPHER)
+ log_printhex (" H(R+)", digest, 2*b);
+ _gcry_mpi_set_buffer (h, digest, 2*b, 0);
+
+ /* According to the paper the best way for verification is:
+ encodepoint(sG - h·Q) = encodepoint(r)
+ because we don't need to decode R. */
+ {
+ void *sbuf;
+ unsigned int slen;
+
+ sbuf = _gcry_mpi_get_opaque_copy (s_in, &tmp);
+ slen = (tmp +7)/8;
+ reverse_buffer (sbuf, slen);
+ if (DBG_CIPHER)
+ log_printhex (" s", sbuf, slen);
+ _gcry_mpi_set_buffer (s, sbuf, slen, 0);
+ xfree (sbuf);
+ if (slen != b)
+ {
+ rc = GPG_ERR_INV_LENGTH;
+ goto leave;
+ }
+ }
+
+ _gcry_mpi_ec_mul_point (&Ia, s, ec->G, ec);
+ _gcry_mpi_ec_mul_point (&Ib, h, ec->Q, ec);
+ _gcry_mpi_sub (Ib.x, ec->p, Ib.x);
+ _gcry_mpi_ec_add_points (&Ia, &Ia, &Ib, ec);
+ rc = _gcry_ecc_eddsa_encodepoint (&Ia, ec, s, h, 0, &tbuf, &tlen);
+ if (rc)
+ goto leave;
+ if (tlen != rlen || memcmp (tbuf, rbuf, tlen))
+ {
+ rc = GPG_ERR_BAD_SIGNATURE;
+ goto leave;
+ }
+
+ rc = 0;
+
+ leave:
+ xfree (encpk);
+ xfree (tbuf);
+ _gcry_mpi_release (s);
+ _gcry_mpi_release (h);
+ point_free (&Ia);
+ point_free (&Ib);
+ return rc;
+}
diff --git a/comm/third_party/libgcrypt/cipher/ecc-gost.c b/comm/third_party/libgcrypt/cipher/ecc-gost.c
new file mode 100644
index 0000000000..36230f8a32
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/ecc-gost.c
@@ -0,0 +1,218 @@
+/* ecc-gost.c - Elliptic Curve GOST signatures
+ * Copyright (C) 2007, 2008, 2010, 2011 Free Software Foundation, Inc.
+ * Copyright (C) 2013 Dmitry Eremin-Solenikov
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "mpi.h"
+#include "cipher.h"
+#include "context.h"
+#include "ec-context.h"
+#include "ecc-common.h"
+#include "pubkey-internal.h"
+
+
+/* Compute a GOST R 34.10-01/-12 signature.
+ * Return the signature struct (r,s) from the message hash. The caller
+ * must have allocated R and S.
+ */
+gpg_err_code_t
+_gcry_ecc_gost_sign (gcry_mpi_t input, mpi_ec_t ec,
+ gcry_mpi_t r, gcry_mpi_t s)
+{
+ gpg_err_code_t rc = 0;
+ gcry_mpi_t k, dr, sum, ke, x, e;
+ mpi_point_struct I;
+ gcry_mpi_t hash;
+ unsigned int qbits;
+
+ if (DBG_CIPHER)
+ log_mpidump ("gost sign hash ", input );
+
+ qbits = mpi_get_nbits (ec->n);
+
+ /* Convert the INPUT into an MPI if needed. */
+ rc = _gcry_dsa_normalize_hash (input, &hash, qbits);
+ if (rc)
+ return rc;
+
+ k = NULL;
+ dr = mpi_alloc (0);
+ sum = mpi_alloc (0);
+ ke = mpi_alloc (0);
+ e = mpi_alloc (0);
+ x = mpi_alloc (0);
+ point_init (&I);
+
+ mpi_mod (e, input, ec->n); /* e = hash mod n */
+
+ if (!mpi_cmp_ui (e, 0))
+ mpi_set_ui (e, 1);
+
+ /* Two loops to avoid R or S being zero.  This is more of a joke than
+ a real demand because the probability of them being zero is less
+ than any hardware failure. Some specs however require it. */
+ do
+ {
+ do
+ {
+ mpi_free (k);
+ k = _gcry_dsa_gen_k (ec->n, GCRY_STRONG_RANDOM);
+
+ _gcry_dsa_modify_k (k, ec->n, qbits);
+
+ _gcry_mpi_ec_mul_point (&I, k, ec->G, ec);
+ if (_gcry_mpi_ec_get_affine (x, NULL, &I, ec))
+ {
+ if (DBG_CIPHER)
+ log_debug ("ecc sign: Failed to get affine coordinates\n");
+ rc = GPG_ERR_BAD_SIGNATURE;
+ goto leave;
+ }
+ mpi_mod (r, x, ec->n); /* r = x mod n */
+ }
+ while (!mpi_cmp_ui (r, 0));
+ mpi_mulm (dr, ec->d, r, ec->n); /* dr = d*r mod n */
+ mpi_mulm (ke, k, e, ec->n); /* ke = k*e mod n */
+ mpi_addm (s, ke, dr, ec->n); /* s = (k*e + d*r) mod n */
+ }
+ while (!mpi_cmp_ui (s, 0));
+
+ if (DBG_CIPHER)
+ {
+ log_mpidump ("gost sign result r ", r);
+ log_mpidump ("gost sign result s ", s);
+ }
+
+ leave:
+ point_free (&I);
+ mpi_free (x);
+ mpi_free (e);
+ mpi_free (ke);
+ mpi_free (sum);
+ mpi_free (dr);
+ mpi_free (k);
+
+ if (hash != input)
+ mpi_free (hash);
+
+ return rc;
+}
+
+
+/* Verify a GOST R 34.10-01/-12 signature.
+ * Check whether R and S verify INPUT.
+ */
+gpg_err_code_t
+_gcry_ecc_gost_verify (gcry_mpi_t input, mpi_ec_t ec,
+ gcry_mpi_t r, gcry_mpi_t s)
+{
+ gpg_err_code_t err = 0;
+ gcry_mpi_t e, x, z1, z2, v, rv, zero;
+ mpi_point_struct Q, Q1, Q2;
+
+ if (!_gcry_mpi_ec_curve_point (ec->Q, ec))
+ return GPG_ERR_BROKEN_PUBKEY;
+
+ if( !(mpi_cmp_ui (r, 0) > 0 && mpi_cmp (r, ec->n) < 0) )
+ return GPG_ERR_BAD_SIGNATURE; /* Assertion 0 < r < n failed. */
+ if( !(mpi_cmp_ui (s, 0) > 0 && mpi_cmp (s, ec->n) < 0) )
+ return GPG_ERR_BAD_SIGNATURE; /* Assertion 0 < s < n failed. */
+
+ x = mpi_alloc (0);
+ e = mpi_alloc (0);
+ z1 = mpi_alloc (0);
+ z2 = mpi_alloc (0);
+ v = mpi_alloc (0);
+ rv = mpi_alloc (0);
+ zero = mpi_alloc (0);
+
+ point_init (&Q);
+ point_init (&Q1);
+ point_init (&Q2);
+
+ mpi_mod (e, input, ec->n); /* e = hash mod n */
+ if (!mpi_cmp_ui (e, 0))
+ mpi_set_ui (e, 1);
+ mpi_invm (v, e, ec->n); /* v = e^(-1) (mod n) */
+ mpi_mulm (z1, s, v, ec->n); /* z1 = s*v (mod n) */
+ mpi_mulm (rv, r, v, ec->n); /* rv = r*v (mod n) */
+ mpi_subm (z2, zero, rv, ec->n); /* z2 = -r*v (mod n) */
+
+ _gcry_mpi_ec_mul_point (&Q1, z1, ec->G, ec);
+/* log_mpidump ("Q1.x", Q1.x); */
+/* log_mpidump ("Q1.y", Q1.y); */
+/* log_mpidump ("Q1.z", Q1.z); */
+ _gcry_mpi_ec_mul_point (&Q2, z2, ec->Q, ec);
+/* log_mpidump ("Q2.x", Q2.x); */
+/* log_mpidump ("Q2.y", Q2.y); */
+/* log_mpidump ("Q2.z", Q2.z); */
+ _gcry_mpi_ec_add_points (&Q, &Q1, &Q2, ec);
+/* log_mpidump (" Q.x", Q.x); */
+/* log_mpidump (" Q.y", Q.y); */
+/* log_mpidump (" Q.z", Q.z); */
+
+ if (!mpi_cmp_ui (Q.z, 0))
+ {
+ if (DBG_CIPHER)
+ log_debug ("ecc verify: Rejected\n");
+ err = GPG_ERR_BAD_SIGNATURE;
+ goto leave;
+ }
+ if (_gcry_mpi_ec_get_affine (x, NULL, &Q, ec))
+ {
+ if (DBG_CIPHER)
+ log_debug ("ecc verify: Failed to get affine coordinates\n");
+ err = GPG_ERR_BAD_SIGNATURE;
+ goto leave;
+ }
+ mpi_mod (x, x, ec->n); /* x = x mod E_n */
+ if (mpi_cmp (x, r)) /* x != r */
+ {
+ if (DBG_CIPHER)
+ {
+ log_mpidump (" x", x);
+ log_mpidump (" r", r);
+ log_mpidump (" s", s);
+ log_debug ("ecc verify: Not verified\n");
+ }
+ err = GPG_ERR_BAD_SIGNATURE;
+ goto leave;
+ }
+ if (DBG_CIPHER)
+ log_debug ("ecc verify: Accepted\n");
+
+ leave:
+ point_free (&Q2);
+ point_free (&Q1);
+ point_free (&Q);
+ mpi_free (zero);
+ mpi_free (rv);
+ mpi_free (v);
+ mpi_free (z2);
+ mpi_free (z1);
+ mpi_free (x);
+ mpi_free (e);
+ return err;
+}
diff --git a/comm/third_party/libgcrypt/cipher/ecc-misc.c b/comm/third_party/libgcrypt/cipher/ecc-misc.c
new file mode 100644
index 0000000000..6470a83bf4
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/ecc-misc.c
@@ -0,0 +1,438 @@
+/* ecc-misc.c - Elliptic Curve miscellaneous functions
+ * Copyright (C) 2007, 2008, 2010, 2011 Free Software Foundation, Inc.
+ * Copyright (C) 2013 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "mpi.h"
+#include "cipher.h"
+#include "context.h"
+#include "ec-context.h"
+#include "ecc-common.h"
+
+
+/*
+ * Release a curve object.
+ */
+void
+_gcry_ecc_curve_free (elliptic_curve_t *E)
+{
+ mpi_free (E->p); E->p = NULL;
+ mpi_free (E->a); E->a = NULL;
+ mpi_free (E->b); E->b = NULL;
+ _gcry_mpi_point_free_parts (&E->G);
+ mpi_free (E->n); E->n = NULL;
+}
+
+
+/*
+ * Return a copy of a curve object.
+ */
+elliptic_curve_t
+_gcry_ecc_curve_copy (elliptic_curve_t E)
+{
+ elliptic_curve_t R;
+
+ R.model = E.model;
+ R.dialect = E.dialect;
+ R.name = E.name;
+ R.p = mpi_copy (E.p);
+ R.a = mpi_copy (E.a);
+ R.b = mpi_copy (E.b);
+ _gcry_mpi_point_init (&R.G);
+ point_set (&R.G, &E.G);
+ R.n = mpi_copy (E.n);
+ R.h = E.h;
+
+ return R;
+}
+
+
+/*
+ * Return a description of the curve model.
+ */
+const char *
+_gcry_ecc_model2str (enum gcry_mpi_ec_models model)
+{
+ const char *str = "?";
+ switch (model)
+ {
+ case MPI_EC_WEIERSTRASS: str = "Weierstrass"; break;
+ case MPI_EC_MONTGOMERY: str = "Montgomery"; break;
+ case MPI_EC_EDWARDS: str = "Edwards"; break;
+ }
+ return str;
+}
+
+
+/*
+ * Return a description of the curve dialect.
+ */
+const char *
+_gcry_ecc_dialect2str (enum ecc_dialects dialect)
+{
+ const char *str = "?";
+ switch (dialect)
+ {
+ case ECC_DIALECT_STANDARD: str = "Standard"; break;
+ case ECC_DIALECT_ED25519: str = "Ed25519"; break;
+ case ECC_DIALECT_SAFECURVE: str = "SafeCurve"; break;
+ }
+ return str;
+}
+
+
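+/* Encode the affine point (X,Y) over the prime field of size P as the
+   SEC1 uncompressed octet string 0x04 || X || Y, with each coordinate
+   zero-padded to the field size, and return it as a new opaque MPI.  */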
+gcry_mpi_t
+_gcry_ecc_ec2os (gcry_mpi_t x, gcry_mpi_t y, gcry_mpi_t p)
+{
+ gpg_err_code_t rc;
+ int pbytes = (mpi_get_nbits (p)+7)/8;
+ size_t n;
+ unsigned char *buf, *ptr;
+
+ buf = xmalloc ( 1 + 2*pbytes );
+ *buf = 04; /* Uncompressed point. */
+ ptr = buf+1;
+ rc = _gcry_mpi_print (GCRYMPI_FMT_USG, ptr, pbytes, &n, x);
+ if (rc)
+ log_fatal ("mpi_print failed: %s\n", gpg_strerror (rc));
+ if (n < pbytes)
+ {
+ memmove (ptr+(pbytes-n), ptr, n);
+ memset (ptr, 0, (pbytes-n));
+ }
+ ptr += pbytes;
+ rc = _gcry_mpi_print (GCRYMPI_FMT_USG, ptr, pbytes, &n, y);
+ if (rc)
+ log_fatal ("mpi_print failed: %s\n", gpg_strerror (rc));
+ if (n < pbytes)
+ {
+ memmove (ptr+(pbytes-n), ptr, n);
+ memset (ptr, 0, (pbytes-n));
+ }
+
+ return mpi_set_opaque (NULL, buf, (1+2*pbytes)*8);
+}
+
+
+/* Convert POINT into affine coordinates using the context EC and
+ return a newly allocated MPI. If the conversion is not possible
+ NULL is returned. This function won't print an error message. */
+gcry_mpi_t
+_gcry_mpi_ec_ec2os (gcry_mpi_point_t point, mpi_ec_t ec)
+{
+ gcry_mpi_t g_x, g_y, result;
+
+ g_x = mpi_new (0);
+ g_y = mpi_new (0);
+ if (_gcry_mpi_ec_get_affine (g_x, g_y, point, ec))
+ result = NULL;
+ else
+ result = _gcry_ecc_ec2os (g_x, g_y, ec->p);
+ mpi_free (g_x);
+ mpi_free (g_y);
+
+ return result;
+}
+
+
+/* Decode octet string in VALUE into RESULT, in the format defined by SEC 1.
+ RESULT must have been initialized and is set on success to the
+ point given by VALUE. */
+gpg_err_code_t
+_gcry_ecc_sec_decodepoint (gcry_mpi_t value, mpi_ec_t ec, mpi_point_t result)
+{
+ gpg_err_code_t rc;
+ size_t n;
+ const unsigned char *buf;
+ unsigned char *buf_memory;
+ gcry_mpi_t x, y;
+
+ if (mpi_is_opaque (value))
+ {
+ unsigned int nbits;
+
+ buf = mpi_get_opaque (value, &nbits);
+ if (!buf)
+ return GPG_ERR_INV_OBJ;
+ n = (nbits + 7)/8;
+ buf_memory = NULL;
+ }
+ else
+ {
+ n = (mpi_get_nbits (value)+7)/8;
+ buf_memory = xmalloc (n);
+ rc = _gcry_mpi_print (GCRYMPI_FMT_USG, buf_memory, n, &n, value);
+ if (rc)
+ {
+ xfree (buf_memory);
+ return rc;
+ }
+ buf = buf_memory;
+ }
+
+ if (n < 1)
+ {
+ xfree (buf_memory);
+ return GPG_ERR_INV_OBJ;
+ }
+
+ if (*buf == 2 || *buf == 3)
+ {
+ gcry_mpi_t x3;
+ gcry_mpi_t t;
+ gcry_mpi_t p1_4;
+ int y_bit = (*buf == 3);
+
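+      /* The square-root recovery below only works when p = 3 (mod 4),
+         i.e. when bit 1 of P is set; otherwise point compression is
+         not supported.  */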
+ if (!mpi_test_bit (ec->p, 1))
+ {
+ xfree (buf_memory);
+ return GPG_ERR_NOT_IMPLEMENTED; /* No support for point compression. */
+ }
+
+ n = n - 1;
+ rc = _gcry_mpi_scan (&x, GCRYMPI_FMT_USG, buf+1, n, NULL);
+ xfree (buf_memory);
+ if (rc)
+ return rc;
+
+ /*
+ * Recover Y. The Weierstrass curve: y^2 = x^3 + a*x + b
+ */
+
+ x3 = mpi_new (0);
+ t = mpi_new (0);
+ p1_4 = mpi_new (0);
+ y = mpi_new (0);
+
+ /* Compute right hand side. */
+ mpi_powm (x3, x, mpi_const (MPI_C_THREE), ec->p);
+ mpi_mul (t, ec->a, x);
+ mpi_mod (t, t, ec->p);
+ mpi_add (t, t, ec->b);
+ mpi_mod (t, t, ec->p);
+ mpi_add (t, t, x3);
+ mpi_mod (t, t, ec->p);
+
+ /*
+ * When p mod 4 = 3, modular square root of A can be computed by
+ * A^((p+1)/4) mod p
+ */
+
+ /* Compute (p+1)/4 into p1_4 */
+ mpi_rshift (p1_4, ec->p, 2);
+ _gcry_mpi_add_ui (p1_4, p1_4, 1);
+
+ mpi_powm (y, t, p1_4, ec->p);
+
+ if (y_bit != mpi_test_bit (y, 0))
+ mpi_sub (y, ec->p, y);
+
+ mpi_free (p1_4);
+ mpi_free (t);
+ mpi_free (x3);
+ }
+ else if (*buf == 4)
+ {
+ if ( ((n-1)%2) )
+ {
+ xfree (buf_memory);
+ return GPG_ERR_INV_OBJ;
+ }
+ n = (n-1)/2;
+ rc = _gcry_mpi_scan (&x, GCRYMPI_FMT_USG, buf+1, n, NULL);
+ if (rc)
+ {
+ xfree (buf_memory);
+ return rc;
+ }
+ rc = _gcry_mpi_scan (&y, GCRYMPI_FMT_USG, buf+1+n, n, NULL);
+ xfree (buf_memory);
+ if (rc)
+ {
+ mpi_free (x);
+ return rc;
+ }
+ }
+ else
+ {
+ xfree (buf_memory);
+ return GPG_ERR_INV_OBJ;
+ }
+
+ mpi_set (result->x, x);
+ mpi_set (result->y, y);
+ mpi_set_ui (result->z, 1);
+
+ mpi_free (x);
+ mpi_free (y);
+
+ return 0;
+}
+
+
+/* Compute the public key from the context EC.  Obviously a
+   requirement is that the secret key is available in EC.  On success
+   Q is returned; on error NULL.  If Q is NULL a newly allocated point
+   is returned. */
+mpi_point_t
+_gcry_ecc_compute_public (mpi_point_t Q, mpi_ec_t ec)
+{
+ if (!ec->d || !ec->G || !ec->p || !ec->a)
+ return NULL;
+ if (ec->model == MPI_EC_EDWARDS && !ec->b)
+ return NULL;
+
+ if ((ec->dialect == ECC_DIALECT_ED25519 && (ec->flags & PUBKEY_FLAG_EDDSA))
+ || (ec->model == MPI_EC_EDWARDS && ec->dialect == ECC_DIALECT_SAFECURVE))
+ {
+ gcry_mpi_t a;
+ unsigned char *digest;
+
+ if (_gcry_ecc_eddsa_compute_h_d (&digest, ec))
+ return NULL;
+
+ a = mpi_snew (0);
+ _gcry_mpi_set_buffer (a, digest, 32, 0);
+ xfree (digest);
+
+ /* And finally the public key. */
+ if (!Q)
+ Q = mpi_point_new (0);
+ if (Q)
+ _gcry_mpi_ec_mul_point (Q, a, ec->G, ec);
+ mpi_free (a);
+ }
+ else
+ {
+ if (!Q)
+ Q = mpi_point_new (0);
+ if (Q)
+ _gcry_mpi_ec_mul_point (Q, ec->d, ec->G, ec);
+ }
+
+ return Q;
+}
+
+
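+/* Encode the x-coordinate X of an NBITS-bit Montgomery point as an
+   octet string of (NBITS+7)/8 bytes.  If WITH_PREFIX is set, a 0x40
+   prefix byte marking an x-only point is prepended.  The newly
+   allocated buffer is returned in R_BUFFER and its length in
+   R_BUFLEN.  */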
+gpg_err_code_t
+_gcry_ecc_mont_encodepoint (gcry_mpi_t x, unsigned int nbits,
+ int with_prefix,
+ unsigned char **r_buffer, unsigned int *r_buflen)
+{
+ unsigned char *rawmpi;
+ unsigned int rawmpilen;
+
+ rawmpi = _gcry_mpi_get_buffer_extra (x, (nbits+7)/8,
+ with_prefix? -1 : 0, &rawmpilen, NULL);
+ if (rawmpi == NULL)
+ return gpg_err_code_from_syserror ();
+
+ if (with_prefix)
+ {
+ rawmpi[0] = 0x40;
+ rawmpilen++;
+ }
+
+ *r_buffer = rawmpi;
+ *r_buflen = rawmpilen;
+ return 0;
+}
+
+
+gpg_err_code_t
+_gcry_ecc_mont_decodepoint (gcry_mpi_t pk, mpi_ec_t ec, mpi_point_t result)
+{
+ unsigned char *rawmpi;
+ unsigned int rawmpilen;
+ unsigned int nbytes = (ec->nbits+7)/8;
+
+ /*
+   * It is not reliable to assume that a first byte of 0x40 is
+   * the prefix.
+   *
+   * For newer implementations it is reliable, since we always put
+   * 0x40 for an x-only coordinate.
+   *
+   * For data produced by an older implementation (a non-released
+   * development version in 2015), no 0x40 prefix was added.
+   *
+   * So, the data may be shorter than expected when it was handled
+   * as an MPI, which strips preceding zeros.
+   *
+   * Besides, when the data was parsed as an MPI, it may carry a 0x00
+   * prefix (when the MSB of the first byte is set).
+ */
+
+ if (mpi_is_opaque (pk))
+ {
+ const unsigned char *buf;
+ unsigned char *p;
+
+ buf = mpi_get_opaque (pk, &rawmpilen);
+ if (!buf)
+ return GPG_ERR_INV_OBJ;
+ rawmpilen = (rawmpilen + 7)/8;
+
+ if (rawmpilen > nbytes
+ && (buf[0] == 0x00 || buf[0] == 0x40))
+ {
+ rawmpilen--;
+ buf++;
+ }
+
+ rawmpi = xtrymalloc (nbytes);
+ if (!rawmpi)
+ return gpg_err_code_from_syserror ();
+
+ p = rawmpi + rawmpilen;
+ while (p > rawmpi)
+ *--p = *buf++;
+
+ if (rawmpilen < nbytes)
+ memset (rawmpi + nbytes - rawmpilen, 0, nbytes - rawmpilen);
+ }
+ else
+ {
+ rawmpi = _gcry_mpi_get_buffer (pk, nbytes, &rawmpilen, NULL);
+ if (!rawmpi)
+ return gpg_err_code_from_syserror ();
+ /*
+ * When we have the prefix (0x40 or 0x00), it comes at the end,
+ * since it is taken by _gcry_mpi_get_buffer with little endian.
+ * Just setting RAWMPILEN to NBYTES is enough in this case.
+       * Otherwise, RAWMPILEN is NBYTES already.
+ */
+ rawmpilen = nbytes;
+ }
+
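+  /* Mask off any excess high bits of the most significant byte when
+     the field size is not a multiple of 8; for Curve25519 this clears
+     the top bit as required by RFC 7748.  */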
+ if ((ec->nbits % 8))
+ rawmpi[0] &= (1 << (ec->nbits % 8)) - 1;
+ _gcry_mpi_set_buffer (result->x, rawmpi, rawmpilen, 0);
+ xfree (rawmpi);
+ mpi_set_ui (result->z, 1);
+
+ return 0;
+}
diff --git a/comm/third_party/libgcrypt/cipher/ecc-sm2.c b/comm/third_party/libgcrypt/cipher/ecc-sm2.c
new file mode 100644
index 0000000000..c52629fd3f
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/ecc-sm2.c
@@ -0,0 +1,569 @@
+/* ecc-sm2.c - Elliptic Curve SM2 implementation
+ * Copyright (C) 2020 Tianjia Zhang
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "bithelp.h"
+#include "mpi.h"
+#include "cipher.h"
+#include "context.h"
+#include "ec-context.h"
+#include "pubkey-internal.h"
+#include "ecc-common.h"
+
+#define MPI_NBYTES(m) ((mpi_get_nbits(m) + 7) / 8)
+
+
+/* Key derivation function from X9.63/SECG */
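+/* It produces OUTLEN bytes of output by hashing IN together with a
+   32-bit big-endian counter (1, 2, ...) and concatenating the
+   digests, truncating the final block as needed.  */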
+static gpg_err_code_t
+kdf_x9_63 (int algo, const void *in, size_t inlen, void *out, size_t outlen)
+{
+ gpg_err_code_t rc;
+ gcry_md_hd_t hd;
+ int mdlen;
+ u32 counter = 1;
+ u32 counter_be;
+ unsigned char *dgst;
+ unsigned char *pout = out;
+ size_t rlen = outlen;
+ size_t len;
+
+ rc = _gcry_md_open (&hd, algo, 0);
+ if (rc)
+ return rc;
+
+ mdlen = _gcry_md_get_algo_dlen (algo);
+
+ while (rlen > 0)
+ {
+ counter_be = be_bswap32 (counter); /* cpu_to_be32 */
+ counter++;
+
+ _gcry_md_write (hd, in, inlen);
+ _gcry_md_write (hd, &counter_be, sizeof(counter_be));
+
+ dgst = _gcry_md_read (hd, algo);
+ if (dgst == NULL)
+ {
+ rc = GPG_ERR_DIGEST_ALGO;
+ break;
+ }
+
+ len = mdlen < rlen ? mdlen : rlen; /* min(mdlen, rlen) */
+ memcpy (pout, dgst, len);
+ rlen -= len;
+ pout += len;
+
+ _gcry_md_reset (hd);
+ }
+
+ _gcry_md_close (hd);
+ return rc;
+}
+
+
+/* _gcry_ecc_sm2_encrypt description:
+ * input:
+ * data[0] : octet string
+ * output: A new S-expression with the parameters:
+ * a: c1 : generated ephemeral public key (kG)
+ * b: c3 : Hash(x2 || IN || y2)
+ * c: c2 : cipher
+ *
+ * sm2_decrypt description:
+ * in contrast to encrypt
+ */
+gpg_err_code_t
+_gcry_ecc_sm2_encrypt (gcry_sexp_t *r_ciph, gcry_mpi_t input, mpi_ec_t ec)
+{
+ gpg_err_code_t rc;
+ const int algo = GCRY_MD_SM3;
+ gcry_md_hd_t md = NULL;
+ int mdlen;
+ unsigned char *dgst;
+ gcry_mpi_t k = NULL;
+ mpi_point_struct kG, kP;
+ gcry_mpi_t x1, y1;
+ gcry_mpi_t x2, y2;
+ gcry_mpi_t x2y2 = NULL;
+ unsigned char *in = NULL;
+ unsigned int inlen;
+ unsigned char *raw;
+ unsigned int rawlen;
+ unsigned char *cipher = NULL;
+ int i;
+
+ point_init (&kG);
+ point_init (&kP);
+ x1 = mpi_new (0);
+ y1 = mpi_new (0);
+ x2 = mpi_new (0);
+ y2 = mpi_new (0);
+
+ in = _gcry_mpi_get_buffer (input, 0, &inlen, NULL);
+ if (!in)
+ {
+ rc = gpg_err_code_from_syserror ();
+ goto leave;
+ }
+
+ cipher = xtrymalloc (inlen);
+ if (!cipher)
+ {
+ rc = gpg_err_code_from_syserror ();
+ goto leave;
+ }
+
+ /* rand k in [1, n-1] */
+ k = _gcry_dsa_gen_k (ec->n, GCRY_VERY_STRONG_RANDOM);
+
+ /* [k]G = (x1, y1) */
+ _gcry_mpi_ec_mul_point (&kG, k, ec->G, ec);
+ if (_gcry_mpi_ec_get_affine (x1, y1, &kG, ec))
+ {
+ if (DBG_CIPHER)
+ log_debug ("Bad check: kG can not be a Point at Infinity!\n");
+ rc = GPG_ERR_INV_DATA;
+ goto leave;
+ }
+
+ /* [k]P = (x2, y2) */
+ _gcry_mpi_ec_mul_point (&kP, k, ec->Q, ec);
+ if (_gcry_mpi_ec_get_affine (x2, y2, &kP, ec))
+ {
+ rc = GPG_ERR_INV_DATA;
+ goto leave;
+ }
+
+ /* t = KDF(x2 || y2, klen) */
+ x2y2 = _gcry_mpi_ec_ec2os (&kP, ec);
+ raw = mpi_get_opaque (x2y2, &rawlen);
+ rawlen = (rawlen + 7) / 8;
+
+ /* skip the prefix '0x04' */
+ raw += 1;
+ rawlen -= 1;
+ rc = kdf_x9_63 (algo, raw, rawlen, cipher, inlen);
+ if (rc)
+ goto leave;
+
+ /* cipher = t xor in */
+ for (i = 0; i < inlen; i++)
+ cipher[i] ^= in[i];
+
+ /* hash(x2 || IN || y2) */
+ mdlen = _gcry_md_get_algo_dlen (algo);
+ rc = _gcry_md_open (&md, algo, 0);
+ if (rc)
+ goto leave;
+ _gcry_md_write (md, raw, MPI_NBYTES(x2));
+ _gcry_md_write (md, in, inlen);
+ _gcry_md_write (md, raw + MPI_NBYTES(x2), MPI_NBYTES(y2));
+ dgst = _gcry_md_read (md, algo);
+ if (dgst == NULL)
+ {
+ rc = GPG_ERR_DIGEST_ALGO;
+ goto leave;
+ }
+
+ if (!rc)
+ {
+ gcry_mpi_t c1;
+ gcry_mpi_t c3;
+ gcry_mpi_t c2;
+
+ c3 = mpi_new (0);
+ c2 = mpi_new (0);
+
+ c1 = _gcry_ecc_ec2os (x1, y1, ec->p);
+ _gcry_mpi_set_opaque_copy (c3, dgst, mdlen * 8);
+ _gcry_mpi_set_opaque_copy (c2, cipher, inlen * 8);
+
+ rc = sexp_build (r_ciph, NULL,
+ "(enc-val(flags sm2)(sm2(a%M)(b%M)(c%M)))",
+ c1, c3, c2);
+
+ mpi_free (c1);
+ mpi_free (c3);
+ mpi_free (c2);
+ }
+
+leave:
+ _gcry_md_close (md);
+ mpi_free (x2y2);
+ mpi_free (k);
+
+ point_free (&kG);
+ point_free (&kP);
+ mpi_free (x1);
+ mpi_free (y1);
+ mpi_free (x2);
+ mpi_free (y2);
+
+ xfree (cipher);
+ xfree (in);
+
+ return rc;
+}
+
+
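+/* Decrypt an SM2 ciphertext.  DATA_LIST carries the components C1
+   (the ephemeral point), C3 (the SM3 hash) and C2 (the masked text).
+   The shared point [d]C1 is recomputed, the KDF stream is regenerated
+   to unmask C2, and C3 is verified before the plaintext is returned
+   in R_PLAIN as a (value ...) S-expression.  */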
+gpg_err_code_t
+_gcry_ecc_sm2_decrypt (gcry_sexp_t *r_plain, gcry_sexp_t data_list, mpi_ec_t ec)
+{
+ gpg_err_code_t rc;
+ gcry_mpi_t data_c1 = NULL;
+ gcry_mpi_t data_c3 = NULL;
+ gcry_mpi_t data_c2 = NULL;
+
+ /*
+ * Extract the data.
+ */
+ rc = sexp_extract_param (data_list, NULL, "/a/b/c",
+ &data_c1, &data_c3, &data_c2, NULL);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ {
+ log_printmpi ("ecc_decrypt d_c1", data_c1);
+ log_printmpi ("ecc_decrypt d_c3", data_c3);
+ log_printmpi ("ecc_decrypt d_c2", data_c2);
+ }
+
+ {
+ const int algo = GCRY_MD_SM3;
+ gcry_md_hd_t md = NULL;
+ int mdlen;
+ unsigned char *dgst;
+ mpi_point_struct c1;
+ mpi_point_struct kP;
+ gcry_mpi_t x2, y2;
+ gcry_mpi_t x2y2 = NULL;
+ unsigned char *in = NULL;
+ unsigned int inlen;
+ unsigned char *plain = NULL;
+ unsigned char *raw;
+ unsigned int rawlen;
+ unsigned char *c3 = NULL;
+ unsigned int c3_len;
+ int i;
+
+ point_init (&c1);
+ point_init (&kP);
+ x2 = mpi_new (0);
+ y2 = mpi_new (0);
+
+ in = mpi_get_opaque (data_c2, &inlen);
+ inlen = (inlen + 7) / 8;
+ plain = xtrymalloc (inlen);
+ if (!plain)
+ {
+ rc = gpg_err_code_from_syserror ();
+ goto leave_main;
+ }
+
+ rc = _gcry_ecc_sec_decodepoint (data_c1, ec, &c1);
+ if (rc)
+ goto leave_main;
+
+ if (!_gcry_mpi_ec_curve_point (&c1, ec))
+ {
+ rc = GPG_ERR_INV_DATA;
+ goto leave_main;
+ }
+
+ /* [d]C1 = (x2, y2), C1 = [k]G */
+ _gcry_mpi_ec_mul_point (&kP, ec->d, &c1, ec);
+ if (_gcry_mpi_ec_get_affine (x2, y2, &kP, ec))
+ {
+ rc = GPG_ERR_INV_DATA;
+ goto leave_main;
+ }
+
+ /* t = KDF(x2 || y2, inlen) */
+ x2y2 = _gcry_mpi_ec_ec2os (&kP, ec);
+ raw = mpi_get_opaque (x2y2, &rawlen);
+ rawlen = (rawlen + 7) / 8;
+ /* skip the prefix '0x04' */
+ raw += 1;
+ rawlen -= 1;
+ rc = kdf_x9_63 (algo, raw, rawlen, plain, inlen);
+ if (rc)
+ goto leave_main;
+
+ /* plain = C2 xor t */
+ for (i = 0; i < inlen; i++)
+ plain[i] ^= in[i];
+
+ /* Hash(x2 || IN || y2) == C3 */
+ mdlen = _gcry_md_get_algo_dlen (algo);
+ rc = _gcry_md_open (&md, algo, 0);
+ if (rc)
+ goto leave_main;
+ _gcry_md_write (md, raw, MPI_NBYTES(x2));
+ _gcry_md_write (md, plain, inlen);
+ _gcry_md_write (md, raw + MPI_NBYTES(x2), MPI_NBYTES(y2));
+ dgst = _gcry_md_read (md, algo);
+ if (dgst == NULL)
+ {
+ memset (plain, 0, inlen);
+ rc = GPG_ERR_DIGEST_ALGO;
+ goto leave_main;
+ }
+ c3 = mpi_get_opaque (data_c3, &c3_len);
+ c3_len = (c3_len + 7) / 8;
+ if (c3_len != mdlen || memcmp (dgst, c3, c3_len) != 0)
+ {
+ memset (plain, 0, inlen);
+ rc = GPG_ERR_INV_DATA;
+ goto leave_main;
+ }
+
+ if (!rc)
+ {
+ gcry_mpi_t r;
+
+ r = mpi_new (inlen * 8);
+ _gcry_mpi_set_buffer (r, plain, inlen, 0);
+
+ rc = sexp_build (r_plain, NULL, "(value %m)", r);
+
+ mpi_free (r);
+ }
+
+ leave_main:
+ _gcry_md_close (md);
+ mpi_free (x2y2);
+ xfree (plain);
+
+ point_free (&c1);
+ point_free (&kP);
+ mpi_free (x2);
+ mpi_free (y2);
+ }
+
+ leave:
+ _gcry_mpi_release (data_c1);
+ _gcry_mpi_release (data_c3);
+ _gcry_mpi_release (data_c2);
+
+ return rc;
+}
+
+
+/* Compute an SM2 signature.
+ * Return the signature struct (r,s) from the message hash. The caller
+ * must have allocated R and S.
+ */
+gpg_err_code_t
+_gcry_ecc_sm2_sign (gcry_mpi_t input, mpi_ec_t ec,
+ gcry_mpi_t r, gcry_mpi_t s,
+ int flags, int hashalgo)
+{
+ gpg_err_code_t rc = 0;
+ int extraloops = 0;
+ gcry_mpi_t hash;
+ const void *abuf;
+ unsigned int abits, qbits;
+ gcry_mpi_t tmp = NULL;
+ gcry_mpi_t k = NULL;
+ gcry_mpi_t rk = NULL;
+ mpi_point_struct kG;
+ gcry_mpi_t x1;
+
+ if (DBG_CIPHER)
+ log_mpidump ("sm2 sign hash ", input);
+
+ qbits = mpi_get_nbits (ec->n);
+
+ /* Convert the INPUT into an MPI if needed. */
+ rc = _gcry_dsa_normalize_hash (input, &hash, qbits);
+ if (rc)
+ return rc;
+
+ point_init (&kG);
+ x1 = mpi_new (0);
+ rk = mpi_new (0);
+ tmp = mpi_new (0);
+
+ for (;;)
+ {
+ /* rand k in [1, n-1] */
+ if ((flags & PUBKEY_FLAG_RFC6979) && hashalgo)
+ {
+ /* Use Pornin's method for deterministic DSA. If this
+             flag is set, it is expected that HASH is an opaque
+             MPI with the hash to be signed.  That hash is also
+ used as h1 from 3.2.a. */
+ if (!mpi_is_opaque (input))
+ {
+ rc = GPG_ERR_CONFLICT;
+ goto leave;
+ }
+
+ abuf = mpi_get_opaque (input, &abits);
+ rc = _gcry_dsa_gen_rfc6979_k (&k, ec->n, ec->d,
+ abuf, (abits+7)/8,
+ hashalgo, extraloops);
+ if (rc)
+ goto leave;
+ extraloops++;
+ }
+ else
+ k = _gcry_dsa_gen_k (ec->n, GCRY_VERY_STRONG_RANDOM);
+
+ _gcry_dsa_modify_k (k, ec->n, qbits);
+
+ /* [k]G = (x1, y1) */
+ _gcry_mpi_ec_mul_point (&kG, k, ec->G, ec);
+ if (_gcry_mpi_ec_get_affine (x1, NULL, &kG, ec))
+ {
+ rc = GPG_ERR_INV_DATA;
+ goto leave;
+ }
+
+ /* r = (e + x1) % n */
+ mpi_addm (r, hash, x1, ec->n);
+
+ /* r != 0 && r + k != n */
+ if (mpi_cmp_ui (r, 0) == 0)
+ continue;
+ mpi_add (rk, r, k);
+ if (mpi_cmp (rk, ec->n) == 0)
+ continue;
+
+ /* s = ((d + 1)^-1 * (k - rd)) % n */
+ mpi_addm (s, ec->d, GCRYMPI_CONST_ONE, ec->n);
+ mpi_invm (s, s, ec->n);
+ mpi_mulm (tmp, r, ec->d, ec->n);
+ mpi_subm (tmp, k, tmp, ec->n);
+ mpi_mulm (s, s, tmp, ec->n);
+
+ /* s != 0 */
+ if (mpi_cmp_ui (s, 0) == 0)
+ continue;
+
+ break; /* Okay */
+ }
+
+ if (DBG_CIPHER)
+ {
+ log_mpidump ("sm2 sign result r ", r);
+ log_mpidump ("sm2 sign result s ", s);
+ }
+
+leave:
+ point_free (&kG);
+ mpi_free (k);
+ mpi_free (x1);
+ mpi_free (rk);
+ mpi_free (tmp);
+
+ if (hash != input)
+ mpi_free (hash);
+
+ return rc;
+}
+
+
+/* Verify an SM2 signature.
+ * Check whether R and S verify INPUT.
+ */
+gpg_err_code_t
+_gcry_ecc_sm2_verify (gcry_mpi_t input, mpi_ec_t ec,
+ gcry_mpi_t r, gcry_mpi_t s)
+{
+ gpg_err_code_t err = 0;
+ gcry_mpi_t hash = NULL;
+ gcry_mpi_t t = NULL;
+ mpi_point_struct sG, tP;
+ gcry_mpi_t x1, y1;
+ unsigned int nbits;
+
+ if (!_gcry_mpi_ec_curve_point (ec->Q, ec))
+ return GPG_ERR_BROKEN_PUBKEY;
+
+ /* r, s within [1, n-1] */
+ if (mpi_cmp_ui (r, 1) < 0 || mpi_cmp (r, ec->n) > 0)
+ return GPG_ERR_BAD_SIGNATURE;
+ if (mpi_cmp_ui (s, 1) < 0 || mpi_cmp (s, ec->n) > 0)
+ return GPG_ERR_BAD_SIGNATURE;
+
+ nbits = mpi_get_nbits (ec->n);
+ err = _gcry_dsa_normalize_hash (input, &hash, nbits);
+ if (err)
+ return err;
+
+ point_init (&sG);
+ point_init (&tP);
+ x1 = mpi_new (0);
+ y1 = mpi_new (0);
+ t = mpi_new (0);
+
+ /* t = (r + s) % n, t != 0 */
+ mpi_addm (t, r, s, ec->n);
+ if (mpi_cmp_ui (t, 0) == 0)
+ {
+ err = GPG_ERR_BAD_SIGNATURE;
+ goto leave;
+ }
+
+ /* sG + tP = (x1, y1) */
+ _gcry_mpi_ec_mul_point (&sG, s, ec->G, ec);
+ _gcry_mpi_ec_mul_point (&tP, t, ec->Q, ec);
+ _gcry_mpi_ec_add_points (&sG, &sG, &tP, ec);
+ if (_gcry_mpi_ec_get_affine (x1, y1, &sG, ec))
+ {
+ err = GPG_ERR_INV_DATA;
+ goto leave;
+ }
+
+ /* R = (e + x1) % n */
+ mpi_addm (t, hash, x1, ec->n);
+
+ /* check R == r */
+ if (mpi_cmp (t, r))
+ {
+ if (DBG_CIPHER)
+ {
+ log_mpidump (" R", t);
+ log_mpidump (" r", r);
+ log_mpidump (" s", s);
+ }
+ err = GPG_ERR_BAD_SIGNATURE;
+ goto leave;
+ }
+ if (DBG_CIPHER)
+ log_debug ("sm2 verify: Accepted\n");
+
+ leave:
+ point_free (&sG);
+ point_free (&tP);
+ mpi_free (x1);
+ mpi_free (y1);
+ mpi_free (t);
+ if (hash != input)
+ mpi_free (hash);
+
+ return err;
+}
diff --git a/comm/third_party/libgcrypt/cipher/ecc.c b/comm/third_party/libgcrypt/cipher/ecc.c
new file mode 100644
index 0000000000..5d8c7607ab
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/ecc.c
@@ -0,0 +1,1779 @@
+/* ecc.c - Elliptic Curve Cryptography
+ * Copyright (C) 2007, 2008, 2010, 2011 Free Software Foundation, Inc.
+ * Copyright (C) 2013, 2015 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* This code is originally based on the Patch 0.1.6 for the gnupg
+ 1.4.x branch as retrieved on 2007-03-21 from
+ http://www.calcurco.cat/eccGnuPG/src/gnupg-1.4.6-ecc0.2.0beta1.diff.bz2
+ The original authors are:
+ Written by
+ Sergi Blanch i Torne <d4372211 at alumnes.eup.udl.es>,
+ Ramiro Moreno Chiral <ramiro at eup.udl.es>
+ Maintainers
+ Sergi Blanch i Torne
+ Ramiro Moreno Chiral
+ Mikael Mylnikov (mmr)
+ For use in Libgcrypt the code has been heavily modified and cleaned
+   up. In fact there is not much left of the original code except for
+   some variable names and the textbook implementation of the sign and
+ verification algorithms. The arithmetic functions have entirely
+ been rewritten and moved to mpi/ec.c.
+
+ ECDH encrypt and decrypt code written by Andrey Jivsov.
+*/
+
+
+/* TODO:
+
+ - In mpi/ec.c we use mpi_powm for x^2 mod p: Either implement a
+ special case in mpi_powm or check whether mpi_mulm is faster.
+
+*/
+
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "mpi.h"
+#include "cipher.h"
+#include "context.h"
+#include "ec-context.h"
+#include "pubkey-internal.h"
+#include "ecc-common.h"
+
+
+static const char *ecc_names[] =
+ {
+ "ecc",
+ "ecdsa",
+ "ecdh",
+ "eddsa",
+ "gost",
+ "sm2",
+ NULL,
+ };
+
+
+/* Sample NIST P-256 key from RFC 6979 A.2.5 */
+static const char sample_public_key_secp256[] =
+ "(public-key"
+ " (ecc"
+ " (curve secp256r1)"
+ " (q #04"
+ /**/ "60FED4BA255A9D31C961EB74C6356D68C049B8923B61FA6CE669622E60F29FB6"
+ /**/ "7903FE1008B8BC99A41AE9E95628BC64F2F1B20C2D7E9F5177A3C294D4462299#)))";
+
+static const char sample_secret_key_secp256[] =
+ "(private-key"
+ " (ecc"
+ " (curve secp256r1)"
+ " (d #C9AFA9D845BA75166B5C215767B1D6934E50C3DB36E89B127B8A622B120F6721#)"
+ " (q #04"
+ /**/ "60FED4BA255A9D31C961EB74C6356D68C049B8923B61FA6CE669622E60F29FB6"
+ /**/ "7903FE1008B8BC99A41AE9E95628BC64F2F1B20C2D7E9F5177A3C294D4462299#)))";
+
+
+/* Registered progress function and its callback value. */
+static void (*progress_cb) (void *, const char*, int, int, int);
+static void *progress_cb_data;
+
+
+
+/* Local prototypes. */
+static void test_keys (mpi_ec_t ec, unsigned int nbits);
+static void test_ecdh_only_keys (mpi_ec_t ec, unsigned int nbits, int flags);
+static unsigned int ecc_get_nbits (gcry_sexp_t parms);
+
+
+
+
+void
+_gcry_register_pk_ecc_progress (void (*cb) (void *, const char *,
+ int, int, int),
+ void *cb_data)
+{
+ progress_cb = cb;
+ progress_cb_data = cb_data;
+}
+
+/* static void */
+/* progress (int c) */
+/* { */
+/* if (progress_cb) */
+/* progress_cb (progress_cb_data, "pk_ecc", c, 0, 0); */
+/* } */
+
+
+
+/**
+ * nist_generate_key - Standard version of the ECC key generation.
+ * @ec: Elliptic curve computation context.
+ * @flags: Flags controlling aspects of the creation.
+ * @r_x: On success this receives an allocated MPI with the affine
+ *       x-coordinate of the public key.  On error NULL is stored.
+ * @r_y: Ditto for the y-coordinate.
+ *
+ * Return: An error code.
+ *
+ * The @flags bits used by this function are %PUBKEY_FLAG_TRANSIENT to
+ * use a faster RNG, and %PUBKEY_FLAG_NO_KEYTEST to skip the assertion
+ * that the key works as expected.
+ *
+ * FIXME: Check whether N is needed.
+ */
+static gpg_err_code_t
+nist_generate_key (mpi_ec_t ec, int flags,
+ gcry_mpi_t *r_x, gcry_mpi_t *r_y)
+{
+ mpi_point_struct Q;
+ gcry_random_level_t random_level;
+ gcry_mpi_t x, y;
+ const unsigned int pbits = ec->nbits;
+
+ point_init (&Q);
+
+ if ((flags & PUBKEY_FLAG_TRANSIENT_KEY))
+ random_level = GCRY_STRONG_RANDOM;
+ else
+ random_level = GCRY_VERY_STRONG_RANDOM;
+
+ /* Generate a secret. */
+ if (ec->dialect == ECC_DIALECT_ED25519
+ || ec->dialect == ECC_DIALECT_SAFECURVE
+ || (flags & PUBKEY_FLAG_DJB_TWEAK))
+ {
+ char *rndbuf;
+ int len = (pbits+7)/8;
+
+ rndbuf = _gcry_random_bytes_secure (len, random_level);
+ if (ec->dialect == ECC_DIALECT_SAFECURVE)
+ ec->d = mpi_set_opaque (NULL, rndbuf, len*8);
+ else
+ {
+ ec->d = mpi_snew (pbits);
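+          /* Clamp the random scalar: clear any excess high bits, set
+             the highest valid bit, and clear the low bits covered by
+             the cofactor H (X25519/Ed25519-style clamping).  */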
+ if ((pbits % 8))
+ rndbuf[0] &= (1 << (pbits % 8)) - 1;
+ rndbuf[0] |= (1 << ((pbits + 7) % 8));
+ rndbuf[len-1] &= (256 - ec->h);
+ _gcry_mpi_set_buffer (ec->d, rndbuf, len, 0);
+ xfree (rndbuf);
+ }
+ }
+ else
+ ec->d = _gcry_dsa_gen_k (ec->n, random_level);
+
+ /* Compute Q. */
+ _gcry_mpi_ec_mul_point (&Q, ec->d, ec->G, ec);
+
+ x = mpi_new (pbits);
+ if (r_y == NULL)
+ y = NULL;
+ else
+ y = mpi_new (pbits);
+ if (_gcry_mpi_ec_get_affine (x, y, &Q, ec))
+ log_fatal ("ecgen: Failed to get affine coordinates for %s\n", "Q");
+
+  /* We want the Q=(x,y) to be a "compliant key" in terms of the
+ * http://tools.ietf.org/html/draft-jivsov-ecc-compact, which simply
+ * means that we choose either Q=(x,y) or -Q=(x,p-y) such that we
+ * end up with the min(y,p-y) as the y coordinate. Such a public
+ * key allows the most efficient compression: y can simply be
+ * dropped because we know that it's a minimum of the two
+ * possibilities without any loss of security. Note that we don't
+ * do that for Ed25519 so that we do not violate the special
+ * construction of the secret key. */
+ if (r_y == NULL || ec->dialect == ECC_DIALECT_ED25519)
+ ec->Q = mpi_point_set (NULL, Q.x, Q.y, Q.z);
+ else
+ {
+ gcry_mpi_t negative;
+
+ negative = mpi_new (pbits);
+
+ if (ec->model == MPI_EC_WEIERSTRASS)
+ mpi_sub (negative, ec->p, y); /* negative = p - y */
+ else
+ mpi_sub (negative, ec->p, x); /* negative = p - x */
+
+      if (mpi_cmp (negative, y) < 0)   /* p - y < y */
+ {
+ /* We need to end up with -Q; this assures that new Q's y is
+ the smallest one */
+ if (ec->model == MPI_EC_WEIERSTRASS)
+ {
+ mpi_free (y);
+ y = negative;
+ }
+ else
+ {
+ mpi_free (x);
+ x = negative;
+ }
+ mpi_sub (ec->d, ec->n, ec->d); /* d = order - d */
+ ec->Q = mpi_point_set (NULL, x, y, mpi_const (MPI_C_ONE));
+
+ if (DBG_CIPHER)
+ log_debug ("ecgen converted Q to a compliant point\n");
+ }
+      else /* p - y >= y */
+ {
+ /* No change is needed exactly 50% of the time: just copy. */
+ mpi_free (negative);
+ ec->Q = mpi_point_set (NULL, Q.x, Q.y, Q.z);
+ if (DBG_CIPHER)
+ log_debug ("ecgen didn't need to convert Q to a compliant point\n");
+ }
+ }
+
+ *r_x = x;
+ if (r_y)
+ *r_y = y;
+
+ point_free (&Q);
+ /* Now we can test our keys (this should never fail!). */
+ if ((flags & PUBKEY_FLAG_NO_KEYTEST))
+ ; /* User requested to skip the test. */
+ else if (ec->model == MPI_EC_MONTGOMERY)
+ test_ecdh_only_keys (ec, ec->nbits - 63, flags);
+ else
+ test_keys (ec, ec->nbits - 64);
+
+ return 0;
+}
+
+
+/*
+ * To verify that the secret key works, sign some random dummy data
+ * and check that the resulting signature verifies.
+ */
+static void
+test_keys (mpi_ec_t ec, unsigned int nbits)
+{
+ gcry_mpi_t test = mpi_new (nbits);
+ mpi_point_struct R_;
+ gcry_mpi_t c = mpi_new (nbits);
+ gcry_mpi_t out = mpi_new (nbits);
+ gcry_mpi_t r = mpi_new (nbits);
+ gcry_mpi_t s = mpi_new (nbits);
+
+ if (DBG_CIPHER)
+ log_debug ("Testing key.\n");
+
+ point_init (&R_);
+
+ _gcry_mpi_randomize (test, nbits, GCRY_WEAK_RANDOM);
+
+ if (_gcry_ecc_ecdsa_sign (test, ec, r, s, 0, 0) )
+ log_fatal ("ECDSA operation: sign failed\n");
+
+ if (_gcry_ecc_ecdsa_verify (test, ec, r, s))
+ {
+ log_fatal ("ECDSA operation: sign, verify failed\n");
+ }
+
+ if (DBG_CIPHER)
+ log_debug ("ECDSA operation: sign, verify ok.\n");
+
+ point_free (&R_);
+ mpi_free (s);
+ mpi_free (r);
+ mpi_free (out);
+ mpi_free (c);
+ mpi_free (test);
+}
+
+
+static void
+test_ecdh_only_keys (mpi_ec_t ec, unsigned int nbits, int flags)
+{
+ gcry_mpi_t test;
+ mpi_point_struct R_;
+ gcry_mpi_t x0, x1;
+
+ if (DBG_CIPHER)
+ log_debug ("Testing ECDH only key.\n");
+
+ point_init (&R_);
+
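+  /* Consistency check: for a random scalar TEST, the points [TEST]Q
+     and [d]([TEST]G), each optionally multiplied by the cofactor h,
+     must yield the same affine x-coordinate.  */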
+ if (ec->dialect == ECC_DIALECT_SAFECURVE || (flags & PUBKEY_FLAG_DJB_TWEAK))
+ {
+ char *rndbuf;
+ const unsigned int pbits = ec->nbits;
+ int len = (pbits+7)/8;
+
+ rndbuf = _gcry_random_bytes (len, GCRY_WEAK_RANDOM);
+ if (ec->dialect == ECC_DIALECT_SAFECURVE)
+ test = mpi_set_opaque (NULL, rndbuf, len*8);
+ else
+ {
+ test = mpi_new (pbits);
+ if ((pbits % 8))
+ rndbuf[0] &= (1 << (pbits % 8)) - 1;
+ rndbuf[0] |= (1 << ((pbits + 7) % 8));
+ rndbuf[len-1] &= (256 - ec->h);
+ _gcry_mpi_set_buffer (test, rndbuf, len, 0);
+ xfree (rndbuf);
+ }
+ }
+ else
+ {
+ test = mpi_new (nbits);
+ _gcry_mpi_randomize (test, nbits, GCRY_WEAK_RANDOM);
+ }
+
+ x0 = mpi_new (0);
+ x1 = mpi_new (0);
+
+ /* R_ = hkQ <=> R_ = hkdG */
+ _gcry_mpi_ec_mul_point (&R_, test, ec->Q, ec);
+ if (ec->dialect == ECC_DIALECT_STANDARD && !(flags & PUBKEY_FLAG_DJB_TWEAK))
+ _gcry_mpi_ec_mul_point (&R_, _gcry_mpi_get_const (ec->h), &R_, ec);
+ if (_gcry_mpi_ec_get_affine (x0, NULL, &R_, ec))
+ log_fatal ("ecdh: Failed to get affine coordinates for hkQ\n");
+
+ _gcry_mpi_ec_mul_point (&R_, test, ec->G, ec);
+ _gcry_mpi_ec_mul_point (&R_, ec->d, &R_, ec);
+ /* R_ = hdkG */
+ if (ec->dialect == ECC_DIALECT_STANDARD && !(flags & PUBKEY_FLAG_DJB_TWEAK))
+ _gcry_mpi_ec_mul_point (&R_, _gcry_mpi_get_const (ec->h), &R_, ec);
+
+ if (_gcry_mpi_ec_get_affine (x1, NULL, &R_, ec))
+ log_fatal ("ecdh: Failed to get affine coordinates for hdkG\n");
+
+ if (mpi_cmp (x0, x1))
+ {
+ log_fatal ("ECDH test failed.\n");
+ }
+
+ mpi_free (x0);
+ mpi_free (x1);
+
+ point_free (&R_);
+ mpi_free (test);
+}
+
+
+/*
+ * To check the validity of the key, recompute the public value from
+ * the secret one and check that the two correspond.
+ */
+static int
+check_secret_key (mpi_ec_t ec, int flags)
+{
+ int rc = 1;
+ mpi_point_struct Q;
+ gcry_mpi_t x1, y1;
+ gcry_mpi_t x2 = NULL;
+ gcry_mpi_t y2 = NULL;
+
+ point_init (&Q);
+ x1 = mpi_new (0);
+ if (ec->model == MPI_EC_MONTGOMERY)
+ y1 = NULL;
+ else
+ y1 = mpi_new (0);
+
+ /* G in E(F_p) */
+ if (!_gcry_mpi_ec_curve_point (ec->G, ec))
+ {
+ if (DBG_CIPHER)
+ log_debug ("Bad check: Point 'G' does not belong to curve 'E'!\n");
+ goto leave;
+ }
+
+ /* G != PaI */
+ if (!mpi_cmp_ui (ec->G->z, 0))
+ {
+ if (DBG_CIPHER)
+ log_debug ("Bad check: 'G' cannot be Point at Infinity!\n");
+ goto leave;
+ }
+
+ /* Check order of curve. */
+ if (ec->dialect == ECC_DIALECT_STANDARD && !(flags & PUBKEY_FLAG_DJB_TWEAK))
+ {
+ _gcry_mpi_ec_mul_point (&Q, ec->n, ec->G, ec);
+ if (mpi_cmp_ui (Q.z, 0))
+ {
+ if (DBG_CIPHER)
+ log_debug ("check_secret_key: E is not a curve of order n\n");
+ goto leave;
+ }
+ }
+
+ /* Pubkey cannot be PaI */
+ if (!mpi_cmp_ui (ec->Q->z, 0))
+ {
+ if (DBG_CIPHER)
+ log_debug ("Bad check: Q can not be a Point at Infinity!\n");
+ goto leave;
+ }
+
+ /* pubkey = [d]G over E */
+ if (!_gcry_ecc_compute_public (&Q, ec))
+ {
+ if (DBG_CIPHER)
+ log_debug ("Bad check: computation of dG failed\n");
+ goto leave;
+ }
+ if (_gcry_mpi_ec_get_affine (x1, y1, &Q, ec))
+ {
+ if (DBG_CIPHER)
+ log_debug ("Bad check: Q can not be a Point at Infinity!\n");
+ goto leave;
+ }
+
+ if ((flags & PUBKEY_FLAG_EDDSA)
+ || (ec->model == MPI_EC_EDWARDS && ec->dialect == ECC_DIALECT_SAFECURVE))
+ ; /* Fixme: EdDSA is special. */
+ else if (!mpi_cmp_ui (ec->Q->z, 1))
+ {
+ /* Fast path if Q is already in affine coordinates. */
+ if (mpi_cmp (x1, ec->Q->x) || (y1 && mpi_cmp (y1, ec->Q->y)))
+ {
+ if (DBG_CIPHER)
+ log_debug
+ ("Bad check: There is NO correspondence between 'd' and 'Q'!\n");
+ goto leave;
+ }
+ }
+ else
+ {
+ x2 = mpi_new (0);
+ y2 = mpi_new (0);
+ if (_gcry_mpi_ec_get_affine (x2, y2, ec->Q, ec))
+ {
+ if (DBG_CIPHER)
+ log_debug ("Bad check: Q can not be a Point at Infinity!\n");
+ goto leave;
+ }
+
+ if (mpi_cmp (x1, x2) || mpi_cmp (y1, y2))
+ {
+ if (DBG_CIPHER)
+ log_debug
+ ("Bad check: There is NO correspondence between 'd' and 'Q'!\n");
+ goto leave;
+ }
+ }
+ rc = 0; /* Okay. */
+
+ leave:
+ mpi_free (x2);
+ mpi_free (x1);
+ mpi_free (y1);
+ mpi_free (y2);
+ point_free (&Q);
+ return rc;
+}
+
+
+
+/*********************************************
+ ************** interface ******************
+ *********************************************/
+
+static gcry_err_code_t
+ecc_generate (const gcry_sexp_t genparms, gcry_sexp_t *r_skey)
+{
+ gpg_err_code_t rc;
+ gcry_mpi_t Gx = NULL;
+ gcry_mpi_t Gy = NULL;
+ gcry_mpi_t Qx = NULL;
+ gcry_mpi_t Qy = NULL;
+ mpi_ec_t ec = NULL;
+ gcry_sexp_t curve_info = NULL;
+ gcry_sexp_t curve_flags = NULL;
+ gcry_mpi_t base = NULL;
+ gcry_mpi_t public = NULL;
+ int flags = 0;
+
+ rc = _gcry_mpi_ec_internal_new (&ec, &flags, "ecgen curve", genparms, NULL);
+ if (rc)
+ goto leave;
+
+ if ((flags & PUBKEY_FLAG_EDDSA)
+ || (ec->model == MPI_EC_EDWARDS && ec->dialect == ECC_DIALECT_SAFECURVE))
+ rc = _gcry_ecc_eddsa_genkey (ec, flags);
+ else if (ec->model == MPI_EC_MONTGOMERY)
+ rc = nist_generate_key (ec, flags, &Qx, NULL);
+ else
+ rc = nist_generate_key (ec, flags, &Qx, &Qy);
+ if (rc)
+ goto leave;
+
+ /* Copy data to the result. */
+ Gx = mpi_new (0);
+ Gy = mpi_new (0);
+ if (ec->model != MPI_EC_MONTGOMERY)
+ {
+ if (_gcry_mpi_ec_get_affine (Gx, Gy, ec->G, ec))
+ log_fatal ("ecgen: Failed to get affine coordinates for %s\n", "G");
+ base = _gcry_ecc_ec2os (Gx, Gy, ec->p);
+ }
+ if (((ec->dialect == ECC_DIALECT_SAFECURVE && ec->model == MPI_EC_EDWARDS)
+ || ec->dialect == ECC_DIALECT_ED25519 || ec->model == MPI_EC_MONTGOMERY)
+ && !(flags & PUBKEY_FLAG_NOCOMP))
+ {
+ unsigned char *encpk;
+ unsigned int encpklen;
+
+ if (ec->model == MPI_EC_MONTGOMERY)
+ rc = _gcry_ecc_mont_encodepoint (Qx, ec->nbits,
+ ec->dialect != ECC_DIALECT_SAFECURVE,
+ &encpk, &encpklen);
+ else
+ /* (Gx and Gy are used as scratch variables) */
+ rc = _gcry_ecc_eddsa_encodepoint (ec->Q, ec, Gx, Gy,
+ (ec->dialect != ECC_DIALECT_SAFECURVE
+ && !!(flags & PUBKEY_FLAG_COMP)),
+ &encpk, &encpklen);
+ if (rc)
+ goto leave;
+ public = mpi_new (0);
+ mpi_set_opaque (public, encpk, encpklen*8);
+ }
+ else
+ {
+ if (!Qx)
+ {
+ /* This is the case for a key from _gcry_ecc_eddsa_generate
+ with no compression. */
+ Qx = mpi_new (0);
+ Qy = mpi_new (0);
+ if (_gcry_mpi_ec_get_affine (Qx, Qy, ec->Q, ec))
+ log_fatal ("ecgen: Failed to get affine coordinates for %s\n", "Q");
+ }
+ public = _gcry_ecc_ec2os (Qx, Qy, ec->p);
+ }
+ if (ec->name)
+ {
+ rc = sexp_build (&curve_info, NULL, "(curve %s)", ec->name);
+ if (rc)
+ goto leave;
+ }
+
+ if ((flags & PUBKEY_FLAG_PARAM) || (flags & PUBKEY_FLAG_EDDSA)
+ || (flags & PUBKEY_FLAG_DJB_TWEAK))
+ {
+ rc = sexp_build
+ (&curve_flags, NULL,
+ ((flags & PUBKEY_FLAG_PARAM) && (flags & PUBKEY_FLAG_EDDSA))?
+ "(flags param eddsa)" :
+ ((flags & PUBKEY_FLAG_PARAM) && (flags & PUBKEY_FLAG_DJB_TWEAK))?
+ "(flags param djb-tweak)" :
+ ((flags & PUBKEY_FLAG_PARAM))?
+ "(flags param)" : ((flags & PUBKEY_FLAG_EDDSA))?
+ "(flags eddsa)" : "(flags djb-tweak)" );
+ if (rc)
+ goto leave;
+ }
+
+ if ((flags & PUBKEY_FLAG_PARAM) && ec->name)
+ rc = sexp_build (r_skey, NULL,
+ "(key-data"
+ " (public-key"
+ " (ecc%S%S(p%m)(a%m)(b%m)(g%m)(n%m)(h%u)(q%m)))"
+ " (private-key"
+ " (ecc%S%S(p%m)(a%m)(b%m)(g%m)(n%m)(h%u)(q%m)(d%m)))"
+ " )",
+ curve_info, curve_flags,
+ ec->p, ec->a, ec->b, base, ec->n, ec->h, public,
+ curve_info, curve_flags,
+ ec->p, ec->a, ec->b, base, ec->n, ec->h, public,
+ ec->d);
+ else
+ rc = sexp_build (r_skey, NULL,
+ "(key-data"
+ " (public-key"
+ " (ecc%S%S(q%m)))"
+ " (private-key"
+ " (ecc%S%S(q%m)(d%m)))"
+ " )",
+ curve_info, curve_flags,
+ public,
+ curve_info, curve_flags,
+ public, ec->d);
+ if (rc)
+ goto leave;
+
+ if (DBG_CIPHER)
+ {
+ log_printmpi ("ecgen result p", ec->p);
+ log_printmpi ("ecgen result a", ec->a);
+ log_printmpi ("ecgen result b", ec->b);
+ log_printmpi ("ecgen result G", base);
+ log_printmpi ("ecgen result n", ec->n);
+ log_debug ("ecgen result h:+%02x\n", ec->h);
+ log_printmpi ("ecgen result Q", public);
+ log_printmpi ("ecgen result d", ec->d);
+ if ((flags & PUBKEY_FLAG_EDDSA))
+ log_debug ("ecgen result using Ed25519+EdDSA\n");
+ }
+
+ leave:
+ mpi_free (public);
+ mpi_free (base);
+ mpi_free (Gx);
+ mpi_free (Gy);
+ mpi_free (Qx);
+ mpi_free (Qy);
+ _gcry_mpi_ec_free (ec);
+ sexp_release (curve_flags);
+ sexp_release (curve_info);
+ return rc;
+}
+
+
+static gcry_err_code_t
+ecc_check_secret_key (gcry_sexp_t keyparms)
+{
+ gcry_err_code_t rc;
+ int flags = 0;
+ mpi_ec_t ec = NULL;
+
+ /*
+ * Extract the key.
+ */
+ rc = _gcry_mpi_ec_internal_new (&ec, &flags, "ecc_testkey", keyparms, NULL);
+ if (rc)
+ goto leave;
+ if (!ec->p || !ec->a || !ec->b || !ec->G || !ec->n || !ec->Q || !ec->d)
+ {
+ rc = GPG_ERR_NO_OBJ;
+ goto leave;
+ }
+
+ if (check_secret_key (ec, flags))
+ rc = GPG_ERR_BAD_SECKEY;
+
+ leave:
+ _gcry_mpi_ec_free (ec);
+ if (DBG_CIPHER)
+ log_debug ("ecc_testkey => %s\n", gpg_strerror (rc));
+ return rc;
+}
+
+
+static gcry_err_code_t
+ecc_sign (gcry_sexp_t *r_sig, gcry_sexp_t s_data, gcry_sexp_t keyparms)
+{
+ gcry_err_code_t rc;
+ struct pk_encoding_ctx ctx;
+ gcry_mpi_t data = NULL;
+ gcry_mpi_t sig_r = NULL;
+ gcry_mpi_t sig_s = NULL;
+ mpi_ec_t ec = NULL;
+ int flags = 0;
+
+ _gcry_pk_util_init_encoding_ctx (&ctx, PUBKEY_OP_SIGN, 0);
+
+ /*
+ * Extract the key.
+ */
+ rc = _gcry_mpi_ec_internal_new (&ec, &flags, "ecc_sign", keyparms, NULL);
+ if (rc)
+ goto leave;
+ if (!ec->p || !ec->a || !ec->b || !ec->G || !ec->n || !ec->d)
+ {
+ rc = GPG_ERR_NO_OBJ;
+ goto leave;
+ }
+
+ ctx.flags |= flags;
+ if (ec->model == MPI_EC_EDWARDS && ec->dialect == ECC_DIALECT_SAFECURVE)
+ ctx.flags |= PUBKEY_FLAG_EDDSA;
+ /* Clear hash algo for EdDSA. */
+ if ((ctx.flags & PUBKEY_FLAG_EDDSA))
+ ctx.hash_algo = GCRY_MD_NONE;
+
+ /* Extract the data. */
+ rc = _gcry_pk_util_data_to_mpi (s_data, &data, &ctx);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ log_mpidump ("ecc_sign data", data);
+
+ /* Hash algo is determined by curve in EdDSA. Fill it if not specified. */
+ if ((ctx.flags & PUBKEY_FLAG_EDDSA) && !ctx.hash_algo)
+ {
+ if (ec->dialect == ECC_DIALECT_ED25519)
+ ctx.hash_algo = GCRY_MD_SHA512;
+ else if (ec->dialect == ECC_DIALECT_SAFECURVE)
+ ctx.hash_algo = GCRY_MD_SHAKE256;
+ }
+
+ sig_r = mpi_new (0);
+ sig_s = mpi_new (0);
+ if ((ctx.flags & PUBKEY_FLAG_EDDSA))
+ {
+ /* EdDSA requires the public key. */
+ rc = _gcry_ecc_eddsa_sign (data, ec, sig_r, sig_s, &ctx);
+ if (!rc)
+ rc = sexp_build (r_sig, NULL,
+ "(sig-val(eddsa(r%M)(s%M)))", sig_r, sig_s);
+ }
+ else if ((ctx.flags & PUBKEY_FLAG_GOST))
+ {
+ rc = _gcry_ecc_gost_sign (data, ec, sig_r, sig_s);
+ if (!rc)
+ rc = sexp_build (r_sig, NULL,
+ "(sig-val(gost(r%M)(s%M)))", sig_r, sig_s);
+ }
+ else if ((ctx.flags & PUBKEY_FLAG_SM2))
+ {
+ rc = _gcry_ecc_sm2_sign (data, ec, sig_r, sig_s,
+ ctx.flags, ctx.hash_algo);
+ if (!rc)
+ rc = sexp_build (r_sig, NULL,
+ "(sig-val(sm2(r%M)(s%M)))", sig_r, sig_s);
+ }
+ else
+ {
+ rc = _gcry_ecc_ecdsa_sign (data, ec, sig_r, sig_s,
+ ctx.flags, ctx.hash_algo);
+ if (!rc)
+ rc = sexp_build (r_sig, NULL,
+ "(sig-val(ecdsa(r%M)(s%M)))", sig_r, sig_s);
+ }
+
+ leave:
+ _gcry_mpi_release (sig_r);
+ _gcry_mpi_release (sig_s);
+ _gcry_mpi_release (data);
+ _gcry_mpi_ec_free (ec);
+ _gcry_pk_util_free_encoding_ctx (&ctx);
+ if (DBG_CIPHER)
+ log_debug ("ecc_sign => %s\n", gpg_strerror (rc));
+ return rc;
+}
+
+
+static gcry_err_code_t
+ecc_verify (gcry_sexp_t s_sig, gcry_sexp_t s_data, gcry_sexp_t s_keyparms)
+{
+ gcry_err_code_t rc;
+ struct pk_encoding_ctx ctx;
+ gcry_sexp_t l1 = NULL;
+ gcry_mpi_t sig_r = NULL;
+ gcry_mpi_t sig_s = NULL;
+ gcry_mpi_t data = NULL;
+ int sigflags;
+ mpi_ec_t ec = NULL;
+ int flags = 0;
+
+ _gcry_pk_util_init_encoding_ctx (&ctx, PUBKEY_OP_VERIFY,
+ ecc_get_nbits (s_keyparms));
+
+ /*
+ * Extract the key.
+ */
+ rc = _gcry_mpi_ec_internal_new (&ec, &flags, "ecc_verify",
+ s_keyparms, NULL);
+ if (rc)
+ goto leave;
+ if (!ec->p || !ec->a || !ec->b || !ec->G || !ec->n || !ec->Q)
+ {
+ rc = GPG_ERR_NO_OBJ;
+ goto leave;
+ }
+
+ if (ec->model == MPI_EC_MONTGOMERY)
+ {
+ if (DBG_CIPHER)
+ log_debug ("ecc_verify: Can't use a Montgomery curve\n");
+ rc = GPG_ERR_INTERNAL;
+ goto leave;
+ }
+
+ ctx.flags |= flags;
+ if (ec->model == MPI_EC_EDWARDS && ec->dialect == ECC_DIALECT_SAFECURVE)
+ ctx.flags |= PUBKEY_FLAG_EDDSA;
+ /* Clear hash algo for EdDSA. */
+ if ((ctx.flags & PUBKEY_FLAG_EDDSA))
+ ctx.hash_algo = GCRY_MD_NONE;
+
+ /* Extract the data. */
+ rc = _gcry_pk_util_data_to_mpi (s_data, &data, &ctx);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ log_mpidump ("ecc_verify data", data);
+
+ /* Hash algo is determined by curve in EdDSA. Fill it if not specified. */
+ if ((ctx.flags & PUBKEY_FLAG_EDDSA) && !ctx.hash_algo)
+ {
+ if (ec->dialect == ECC_DIALECT_ED25519)
+ ctx.hash_algo = GCRY_MD_SHA512;
+ else if (ec->dialect == ECC_DIALECT_SAFECURVE)
+ ctx.hash_algo = GCRY_MD_SHAKE256;
+ }
+
+ /*
+ * Extract the signature value.
+ */
+ rc = _gcry_pk_util_preparse_sigval (s_sig, ecc_names, &l1, &sigflags);
+ if (rc)
+ goto leave;
+ rc = sexp_extract_param (l1, NULL, (sigflags & PUBKEY_FLAG_EDDSA)? "/rs":"rs",
+ &sig_r, &sig_s, NULL);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ {
+ log_mpidump ("ecc_verify s_r", sig_r);
+ log_mpidump ("ecc_verify s_s", sig_s);
+ }
+ if ((ctx.flags & PUBKEY_FLAG_EDDSA) ^ (sigflags & PUBKEY_FLAG_EDDSA))
+ {
+ rc = GPG_ERR_CONFLICT; /* Inconsistent use of flag/algoname. */
+ goto leave;
+ }
+
+ /*
+ * Verify the signature.
+ */
+ if ((sigflags & PUBKEY_FLAG_EDDSA))
+ {
+ rc = _gcry_ecc_eddsa_verify (data, ec, sig_r, sig_s, &ctx);
+ }
+ else if ((sigflags & PUBKEY_FLAG_GOST))
+ {
+ rc = _gcry_ecc_gost_verify (data, ec, sig_r, sig_s);
+ }
+ else if ((sigflags & PUBKEY_FLAG_SM2))
+ {
+ rc = _gcry_ecc_sm2_verify (data, ec, sig_r, sig_s);
+ }
+ else
+ {
+ rc = _gcry_ecc_ecdsa_verify (data, ec, sig_r, sig_s);
+ }
+
+ leave:
+ _gcry_mpi_release (data);
+ _gcry_mpi_release (sig_r);
+ _gcry_mpi_release (sig_s);
+ _gcry_mpi_ec_free (ec);
+ sexp_release (l1);
+ _gcry_pk_util_free_encoding_ctx (&ctx);
+ if (DBG_CIPHER)
+ log_debug ("ecc_verify => %s\n", rc?gpg_strerror (rc):"Good");
+ return rc;
+}
+
+
+/* ecdh raw is the classic 2-round DH protocol published in 1976.
+ *
+ * Overview of ecc_encrypt_raw and ecc_decrypt_raw.
+ *
+ * As with any PK operation, the encrypt operation uses the public key
+ * and decrypt uses the private key.
+ *
+ * Symbols used below:
+ * G - field generator point
+ * d - private long-term scalar
+ * dG - public long-term key
+ * k - ephemeral scalar
+ * kG - ephemeral public key
+ * dkG - shared secret
+ *
+ * ecc_encrypt_raw description:
+ * input:
+ * data[0] : private scalar (k)
+ * output: A new S-expression with the parameters:
+ * s : shared point (kdG)
+ * e : generated ephemeral public key (kG)
+ *
+ * ecc_decrypt_raw description:
+ * input:
+ * data[0] : a point kG (ephemeral public key)
+ * output:
+ * result[0] : shared point (kdG)
+ */
+static gcry_err_code_t
+ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
+{
+ unsigned int nbits;
+ gcry_err_code_t rc;
+ struct pk_encoding_ctx ctx;
+ gcry_mpi_t mpi_s = NULL;
+ gcry_mpi_t mpi_e = NULL;
+ gcry_mpi_t data = NULL;
+ mpi_ec_t ec = NULL;
+ int flags = 0;
+ int no_error_on_infinity;
+
+ _gcry_pk_util_init_encoding_ctx (&ctx, PUBKEY_OP_ENCRYPT,
+ (nbits = ecc_get_nbits (keyparms)));
+
+ /*
+ * Extract the key.
+ */
+ rc = _gcry_mpi_ec_internal_new (&ec, &flags, "ecc_encrypt", keyparms, NULL);
+ if (rc)
+ goto leave;
+
+ if (ec->dialect == ECC_DIALECT_SAFECURVE)
+ {
+ ctx.flags |= PUBKEY_FLAG_RAW_FLAG;
+ no_error_on_infinity = 1;
+ }
+ else if ((flags & PUBKEY_FLAG_DJB_TWEAK))
+ no_error_on_infinity = 1;
+ else
+ no_error_on_infinity = 0;
+
+ /*
+ * Extract the data.
+ */
+ rc = _gcry_pk_util_data_to_mpi (s_data, &data, &ctx);
+ if (rc)
+ goto leave;
+
+ /*
+ * Tweak the scalar bits by cofactor and number of bits of the field.
+ * It assumes the cofactor is a power of 2.
+ */
+ if ((flags & PUBKEY_FLAG_DJB_TWEAK))
+ {
+ int i;
+
+ for (i = 0; (ec->h & (1 << i)) == 0; i++)
+ mpi_clear_bit (data, i);
+ mpi_set_highbit (data, ec->nbits - 1);
+ }
+ if (DBG_CIPHER)
+ log_mpidump ("ecc_encrypt data", data);
+
+ if (!ec->p || !ec->a || !ec->b || !ec->G || !ec->n || !ec->Q)
+ {
+ rc = GPG_ERR_NO_OBJ;
+ goto leave;
+ }
+
+ if ((ctx.flags & PUBKEY_FLAG_SM2))
+ {
+ /* All encryption will be done, return it. */
+ rc = _gcry_ecc_sm2_encrypt (r_ciph, data, ec);
+ goto leave;
+ }
+
+ /* The following is false: assert( mpi_cmp_ui( R.x, 1 )==0 );, so */
+ {
+ mpi_point_struct R; /* Result that we return. */
+ gcry_mpi_t x, y;
+ unsigned char *rawmpi;
+ unsigned int rawmpilen;
+
+ rc = 0;
+ x = mpi_new (0);
+ if (ec->model == MPI_EC_MONTGOMERY)
+ y = NULL;
+ else
+ y = mpi_new (0);
+
+ point_init (&R);
+
+ /* R = kQ <=> R = kdG */
+ _gcry_mpi_ec_mul_point (&R, data, ec->Q, ec);
+
+ if (_gcry_mpi_ec_get_affine (x, y, &R, ec))
+ {
+ /*
+         * Here, X is 0.  In the X25519 computation on Curve25519, the
+         * X0 function maps infinity to zero.  So, when
+         * PUBKEY_FLAG_DJB_TWEAK is enabled, return the result 0 instead
+         * of raising an error.
+         *
+         * This is a corner case.  It never occurs with properly
+         * generated public keys, but it might happen with a blindly
+         * imported public key which might not follow the key
+ * generation procedure.
+ */
+ if (!no_error_on_infinity)
+            { /* This is not the X25519 case, so the input data was simply wrong. */
+ rc = GPG_ERR_INV_DATA;
+ goto leave_main;
+ }
+ }
+ if (y)
+ mpi_s = _gcry_ecc_ec2os (x, y, ec->p);
+ else
+ {
+ rc = _gcry_ecc_mont_encodepoint (x, nbits,
+ ec->dialect != ECC_DIALECT_SAFECURVE,
+ &rawmpi, &rawmpilen);
+ if (rc)
+ goto leave_main;
+ mpi_s = mpi_new (0);
+ mpi_set_opaque (mpi_s, rawmpi, rawmpilen*8);
+ }
+
+ /* R = kG */
+ _gcry_mpi_ec_mul_point (&R, data, ec->G, ec);
+
+ if (_gcry_mpi_ec_get_affine (x, y, &R, ec))
+ {
+ rc = GPG_ERR_INV_DATA;
+ goto leave_main;
+ }
+ if (y)
+ mpi_e = _gcry_ecc_ec2os (x, y, ec->p);
+ else
+ {
+ rc = _gcry_ecc_mont_encodepoint (x, nbits,
+ ec->dialect != ECC_DIALECT_SAFECURVE,
+ &rawmpi, &rawmpilen);
+ if (!rc)
+ {
+ mpi_e = mpi_new (0);
+ mpi_set_opaque (mpi_e, rawmpi, rawmpilen*8);
+ }
+ }
+
+ leave_main:
+ mpi_free (x);
+ mpi_free (y);
+ point_free (&R);
+ if (rc)
+ goto leave;
+ }
+
+ if (!rc)
+ rc = sexp_build (r_ciph, NULL, "(enc-val(ecdh(s%m)(e%m)))", mpi_s, mpi_e);
+
+ leave:
+ _gcry_mpi_release (data);
+ _gcry_mpi_release (mpi_s);
+ _gcry_mpi_release (mpi_e);
+ _gcry_mpi_ec_free (ec);
+ _gcry_pk_util_free_encoding_ctx (&ctx);
+ if (DBG_CIPHER)
+ log_debug ("ecc_encrypt => %s\n", gpg_strerror (rc));
+ return rc;
+}
+
+
+/* input:
+ * data[0] : a point kG (ephemeral public key)
+ * output:
+ * resaddr[0] : shared point kdG
+ *
+ * see ecc_encrypt_raw for details.
+ */
+static gcry_err_code_t
+ecc_decrypt_raw (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
+{
+ unsigned int nbits;
+ gpg_err_code_t rc;
+ struct pk_encoding_ctx ctx;
+ gcry_sexp_t l1 = NULL;
+ gcry_mpi_t data_e = NULL;
+ mpi_ec_t ec = NULL;
+ mpi_point_struct kG;
+ mpi_point_struct R;
+ gcry_mpi_t r = NULL;
+ int flags = 0;
+ int enable_specific_point_validation;
+
+ point_init (&kG);
+ point_init (&R);
+
+ _gcry_pk_util_init_encoding_ctx (&ctx, PUBKEY_OP_DECRYPT,
+ (nbits = ecc_get_nbits (keyparms)));
+
+ /*
+ * Extract the key.
+ */
+ rc = _gcry_mpi_ec_internal_new (&ec, &flags, "ecc_decrypt", keyparms, NULL);
+ if (rc)
+ goto leave;
+
+ if (!ec->p || !ec->a || !ec->b || !ec->G || !ec->n || !ec->d)
+ {
+ rc = GPG_ERR_NO_OBJ;
+ goto leave;
+ }
+
+ /*
+ * Extract the data.
+ */
+ rc = _gcry_pk_util_preparse_encval (s_data, ecc_names, &l1, &ctx);
+ if (rc)
+ goto leave;
+ if ((ctx.flags & PUBKEY_FLAG_SM2))
+ {
+ /* All decryption will be done, return it. */
+ rc = _gcry_ecc_sm2_decrypt (r_plain, l1, ec);
+ goto leave;
+ }
+ else
+ {
+ rc = sexp_extract_param (l1, NULL, "/e", &data_e, NULL);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ log_printmpi ("ecc_decrypt d_e", data_e);
+ }
+
+ if (ec->dialect == ECC_DIALECT_SAFECURVE || (flags & PUBKEY_FLAG_DJB_TWEAK))
+ enable_specific_point_validation = 1;
+ else
+ enable_specific_point_validation = 0;
+
+ /*
+ * Compute the plaintext.
+ */
+ if (ec->model == MPI_EC_MONTGOMERY)
+ rc = _gcry_ecc_mont_decodepoint (data_e, ec, &kG);
+ else
+ rc = _gcry_ecc_sec_decodepoint (data_e, ec, &kG);
+ if (rc)
+ goto leave;
+
+ if (DBG_CIPHER)
+ log_printpnt ("ecc_decrypt kG", &kG, NULL);
+
+ if (enable_specific_point_validation)
+ {
+      /* For X25519, by its definition, input validation should not
+       * be done (instead, we check the output).
+       *
+       * However, to mitigate a secret key leak from our implementation,
+       * we also do input validation here.  For a constant-time
+       * implementation, this input validation can be removed.
+ */
+ if (_gcry_mpi_ec_bad_point (&kG, ec))
+ {
+ rc = GPG_ERR_INV_DATA;
+ goto leave;
+ }
+ }
+ else if (!_gcry_mpi_ec_curve_point (&kG, ec))
+ {
+ rc = GPG_ERR_INV_DATA;
+ goto leave;
+ }
+
+ /* R = dkG */
+ _gcry_mpi_ec_mul_point (&R, ec->d, &kG, ec);
+
+ /* The following is false: assert( mpi_cmp_ui( R.x, 1 )==0 );, so: */
+ {
+ gcry_mpi_t x, y;
+
+ x = mpi_new (0);
+ if (ec->model == MPI_EC_MONTGOMERY)
+ y = NULL;
+ else
+ y = mpi_new (0);
+
+ if (_gcry_mpi_ec_get_affine (x, y, &R, ec))
+ {
+ rc = GPG_ERR_INV_DATA;
+ goto leave;
+ /*
+ * Note for X25519.
+ *
+ * By the definition of X25519, this is the case where X25519
+ * returns 0, mapping infinity to zero. However, we
+ * deliberately let it return an error.
+ *
+           * For X25519 ECDH, coming here means that the message might
+           * be decrypted by anyone with the shared secret of 0 (the
+           * result of this function can be made 0 by scalar values
+           * other than the private key D).
+           *
+           * So, it looks like an encrypted message but it can be
+           * decrypted by anyone, or at least something has gone
+           * wrong.  The recipient should not proceed as if it were a
+           * properly encrypted message.
+ *
+ * This handling is needed for our major usage of GnuPG,
+ * where it does the One-Pass Diffie-Hellman method,
+ * C(1, 1, ECC CDH), with an ephemeral key.
+ */
+ }
+
+ if (y)
+ r = _gcry_ecc_ec2os (x, y, ec->p);
+ else
+ {
+
+ unsigned char *rawmpi;
+ unsigned int rawmpilen;
+
+ rc = _gcry_ecc_mont_encodepoint (x, nbits,
+ ec->dialect != ECC_DIALECT_SAFECURVE,
+ &rawmpi, &rawmpilen);
+ if (rc)
+ goto leave;
+
+ r = mpi_new (0);
+ mpi_set_opaque (r, rawmpi, rawmpilen*8);
+ }
+ if (!r)
+ rc = gpg_err_code_from_syserror ();
+ else
+ rc = 0;
+ mpi_free (x);
+ mpi_free (y);
+ }
+ if (DBG_CIPHER)
+ log_printmpi ("ecc_decrypt res", r);
+
+ if (!rc)
+ rc = sexp_build (r_plain, NULL, "(value %m)", r);
+
+ leave:
+ point_free (&R);
+ point_free (&kG);
+ _gcry_mpi_release (r);
+ _gcry_mpi_release (data_e);
+ sexp_release (l1);
+ _gcry_mpi_ec_free (ec);
+ _gcry_pk_util_free_encoding_ctx (&ctx);
+ if (DBG_CIPHER)
+ log_debug ("ecc_decrypt => %s\n", gpg_strerror (rc));
+ return rc;
+}
+
+
+/* Return the number of bits for the key described by PARMS. On error
+ * 0 is returned. The format of PARMS starts with the algorithm name;
+ * for example:
+ *
+ * (ecc
+ * (curve <name>)
+ * (p <mpi>)
+ * (a <mpi>)
+ * (b <mpi>)
+ * (g <mpi>)
+ * (n <mpi>)
+ * (q <mpi>))
+ *
+ * More parameters may be given. Either P or CURVE is needed.
+ */
+static unsigned int
+ecc_get_nbits (gcry_sexp_t parms)
+{
+ gcry_sexp_t l1;
+ gcry_mpi_t p;
+ unsigned int nbits = 0;
+ char *curve;
+
+ l1 = sexp_find_token (parms, "p", 1);
+ if (!l1)
+ { /* Parameter P not found - check whether we have "curve". */
+ l1 = sexp_find_token (parms, "curve", 5);
+ if (!l1)
+ return 0; /* Neither P nor CURVE found. */
+
+ curve = sexp_nth_string (l1, 1);
+ sexp_release (l1);
+ if (!curve)
+ return 0; /* No curve name given (or out of core). */
+
+ if (_gcry_ecc_fill_in_curve (0, curve, NULL, &nbits))
+ nbits = 0;
+ xfree (curve);
+ }
+ else
+ {
+ p = sexp_nth_mpi (l1, 1, GCRYMPI_FMT_USG);
+ sexp_release (l1);
+ if (p)
+ {
+ nbits = mpi_get_nbits (p);
+ _gcry_mpi_release (p);
+ }
+ }
+ return nbits;
+}
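
A usage sketch, for illustration only: the bit size computed by ecc_get_nbits is what the public gcry_pk_get_nbits call reports once a key s-expression has been built.  The curve name below is just an example, and qbuf/qlen are assumed placeholders for an already encoded public point.

    #include <gcrypt.h>

    /* Sketch: query the key size of an ECC key given by curve name. */
    gcry_sexp_t key;
    unsigned int nbits = 0;

    if (!gcry_sexp_build (&key, NULL,
                          "(public-key (ecc (curve %s) (q %b)))",
                          "NIST P-256", (int)qlen, qbuf))
      {
        nbits = gcry_pk_get_nbits (key);   /* 256 for NIST P-256 */
        gcry_sexp_release (key);
      }
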
+
+
+/* See rsa.c for a description of this function. */
+static gpg_err_code_t
+compute_keygrip (gcry_md_hd_t md, gcry_sexp_t keyparms)
+{
+#define N_COMPONENTS 6
+ static const char names[N_COMPONENTS] = "pabgnq";
+ gpg_err_code_t rc;
+ gcry_sexp_t l1;
+ gcry_mpi_t values[N_COMPONENTS];
+ int idx;
+ char *curvename = NULL;
+ int flags = 0;
+ enum gcry_mpi_ec_models model = 0;
+ enum ecc_dialects dialect = 0;
+ const unsigned char *raw;
+ unsigned int n;
+
+ /* Clear the values first. */
+ for (idx=0; idx < N_COMPONENTS; idx++)
+ values[idx] = NULL;
+
+
+ /* Look for flags. */
+ l1 = sexp_find_token (keyparms, "flags", 0);
+ if (l1)
+ {
+ rc = _gcry_pk_util_parse_flaglist (l1, &flags, NULL);
+ if (rc)
+ goto leave;
+ }
+
+ /* Extract the parameters. */
+ if ((flags & PUBKEY_FLAG_PARAM))
+ rc = sexp_extract_param (keyparms, NULL, "p?a?b?g?n?/q",
+ &values[0], &values[1], &values[2],
+ &values[3], &values[4], &values[5],
+ NULL);
+ else
+ rc = sexp_extract_param (keyparms, NULL, "/q", &values[5], NULL);
+ if (rc)
+ goto leave;
+
+ /* Check whether a curve parameter is available and use that to fill
+ in missing values. */
+ sexp_release (l1);
+ l1 = sexp_find_token (keyparms, "curve", 5);
+ if (l1)
+ {
+ curvename = sexp_nth_string (l1, 1);
+ if (curvename)
+ {
+ rc = _gcry_ecc_update_curve_param (curvename,
+ &model, &dialect,
+ &values[0], &values[1], &values[2],
+ &values[3], &values[4]);
+ if (rc)
+ goto leave;
+ }
+ }
+
+ /* Guess required fields if a curve parameter has not been given.
+     FIXME: This is a crude hack.  We need to fix that.  */
+ if (!curvename)
+ {
+ model = ((flags & PUBKEY_FLAG_EDDSA)
+ ? MPI_EC_EDWARDS
+ : MPI_EC_WEIERSTRASS);
+ dialect = ((flags & PUBKEY_FLAG_EDDSA)
+ ? ECC_DIALECT_ED25519
+ : ECC_DIALECT_STANDARD);
+ }
+
+ /* Check that all parameters are known and normalize all MPIs (that
+ should not be required but we use an internal function later and
+ thus we better make 100% sure that they are normalized). */
+ for (idx = 0; idx < N_COMPONENTS; idx++)
+ if (!values[idx])
+ {
+ rc = GPG_ERR_NO_OBJ;
+ goto leave;
+ }
+ else
+ _gcry_mpi_normalize (values[idx]);
+
+ /* Uncompress the public key with the exception of EdDSA where
+ compression is the default and we thus compute the keygrip using
+ the compressed version. Because we don't support any non-eddsa
+ compression, the only thing we need to do is to compress
+ EdDSA. */
+ if ((flags & PUBKEY_FLAG_EDDSA) && dialect == ECC_DIALECT_ED25519)
+ {
+ const unsigned int pbits = mpi_get_nbits (values[0]);
+
+ rc = _gcry_ecc_eddsa_ensure_compact (values[5], pbits);
+ if (rc)
+ goto leave;
+ }
+ else if ((flags & PUBKEY_FLAG_DJB_TWEAK))
+ {
+ /* Remove the prefix 0x40 for keygrip computation. */
+ raw = mpi_get_opaque (values[5], &n);
+ if (raw)
+ {
+ n = (n + 7)/8;
+
+ if (n > 1 && (n%2) && raw[0] == 0x40)
+ if (!_gcry_mpi_set_opaque_copy (values[5], raw + 1, (n - 1)*8))
+ rc = gpg_err_code_from_syserror ();
+ }
+ else
+ {
+ rc = GPG_ERR_INV_OBJ;
+ goto leave;
+ }
+ }
+
+ /* Hash them all. */
+ for (idx = 0; idx < N_COMPONENTS; idx++)
+ {
+ char buf[30];
+
+ if (mpi_is_opaque (values[idx]))
+ {
+ raw = mpi_get_opaque (values[idx], &n);
+ n = (n + 7)/8;
+ snprintf (buf, sizeof buf, "(1:%c%u:", names[idx], n);
+ _gcry_md_write (md, buf, strlen (buf));
+ _gcry_md_write (md, raw, n);
+ _gcry_md_write (md, ")", 1);
+ }
+ else
+ {
+ unsigned char *rawmpi;
+ unsigned int rawmpilen;
+
+ rawmpi = _gcry_mpi_get_buffer (values[idx], 0, &rawmpilen, NULL);
+ if (!rawmpi)
+ {
+ rc = gpg_err_code_from_syserror ();
+ goto leave;
+ }
+ snprintf (buf, sizeof buf, "(1:%c%u:", names[idx], rawmpilen);
+ _gcry_md_write (md, buf, strlen (buf));
+ _gcry_md_write (md, rawmpi, rawmpilen);
+ _gcry_md_write (md, ")", 1);
+ xfree (rawmpi);
+ }
+ }
+
+ leave:
+ xfree (curvename);
+ sexp_release (l1);
+ for (idx = 0; idx < N_COMPONENTS; idx++)
+ _gcry_mpi_release (values[idx]);
+
+ return rc;
+#undef N_COMPONENTS
+}
+
+
+
+/*
+ Low-level API helper functions.
+ */
+
+/* This is the worker function for gcry_pubkey_get_sexp for ECC
+ algorithms. Note that the caller has already stored NULL at
+ R_SEXP. */
+gpg_err_code_t
+_gcry_pk_ecc_get_sexp (gcry_sexp_t *r_sexp, int mode, mpi_ec_t ec)
+{
+ gpg_err_code_t rc;
+ gcry_mpi_t mpi_G = NULL;
+ gcry_mpi_t mpi_Q = NULL;
+
+ if (!ec->p || !ec->a || !ec->b || !ec->G || !ec->n)
+ return GPG_ERR_BAD_CRYPT_CTX;
+
+ if (mode == GCRY_PK_GET_SECKEY && !ec->d)
+ return GPG_ERR_NO_SECKEY;
+
+ /* Compute the public point if it is missing. */
+ if (!ec->Q && ec->d)
+ ec->Q = _gcry_ecc_compute_public (NULL, ec);
+
+ /* Encode G and Q. */
+ mpi_G = _gcry_mpi_ec_ec2os (ec->G, ec);
+ if (!mpi_G)
+ {
+ rc = GPG_ERR_BROKEN_PUBKEY;
+ goto leave;
+ }
+ if (!ec->Q)
+ {
+ rc = GPG_ERR_BAD_CRYPT_CTX;
+ goto leave;
+ }
+
+ if (ec->dialect == ECC_DIALECT_ED25519)
+ {
+ unsigned char *encpk;
+ unsigned int encpklen;
+
+ rc = _gcry_ecc_eddsa_encodepoint (ec->Q, ec, NULL, NULL, 0,
+ &encpk, &encpklen);
+ if (rc)
+ goto leave;
+ mpi_Q = mpi_set_opaque (NULL, encpk, encpklen*8);
+ encpk = NULL;
+ }
+ else if (ec->model == MPI_EC_MONTGOMERY)
+ {
+ unsigned char *encpk;
+ unsigned int encpklen;
+
+ rc = _gcry_ecc_mont_encodepoint (ec->Q->x, ec->nbits,
+ ec->dialect != ECC_DIALECT_SAFECURVE,
+ &encpk, &encpklen);
+ if (rc)
+ goto leave;
+ mpi_Q = mpi_set_opaque (NULL, encpk, encpklen*8);
+ }
+ else
+ {
+ mpi_Q = _gcry_mpi_ec_ec2os (ec->Q, ec);
+ }
+ if (!mpi_Q)
+ {
+ rc = GPG_ERR_BROKEN_PUBKEY;
+ goto leave;
+ }
+
+ /* Fixme: We should return a curve name instead of the parameters if
+     we know that they match a curve.  */
+
+ if (ec->d && (!mode || mode == GCRY_PK_GET_SECKEY))
+ {
+ /* Let's return a private key. */
+ rc = sexp_build (r_sexp, NULL,
+ "(private-key(ecc(p%m)(a%m)(b%m)(g%m)(n%m)(h%u)(q%m)(d%m)))",
+ ec->p, ec->a, ec->b, mpi_G, ec->n, ec->h, mpi_Q, ec->d);
+ }
+ else if (ec->Q)
+ {
+ /* Let's return a public key. */
+ rc = sexp_build (r_sexp, NULL,
+ "(public-key(ecc(p%m)(a%m)(b%m)(g%m)(n%m)(h%u)(q%m)))",
+ ec->p, ec->a, ec->b, mpi_G, ec->n, ec->h, mpi_Q);
+ }
+ else
+ rc = GPG_ERR_BAD_CRYPT_CTX;
+
+ leave:
+ mpi_free (mpi_Q);
+ mpi_free (mpi_G);
+ return rc;
+}
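
For orientation, a hedged sketch of how this worker is reached from the public API: gcry_pubkey_get_sexp operates on a context created with gcry_mpi_ec_new.  Here keyparms is assumed to be an existing ECC key s-expression and error handling is kept minimal.

    #include <gcrypt.h>

    gcry_ctx_t ctx = NULL;
    gcry_sexp_t pub = NULL;

    if (!gcry_mpi_ec_new (&ctx, keyparms, NULL))
      {
        if (!gcry_pubkey_get_sexp (&pub, GCRY_PK_GET_PUBKEY, ctx))
          {
            /* pub now holds "(public-key (ecc (p ...) ... (q ...)))". */
            gcry_sexp_release (pub);
          }
        gcry_ctx_release (ctx);
      }
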
+
+
+
+/*
+ Self-test section.
+ */
+
+static const char *
+selftest_sign (gcry_sexp_t pkey, gcry_sexp_t skey)
+{
+ /* Sample data from RFC 6979 section A.2.5, hash is of message "sample" */
+ static const char sample_data[] =
+ "(data (flags rfc6979)"
+ " (hash sha256 #af2bdbe1aa9b6ec1e2ade1d694f41fc71a831d0268e98915"
+ /**/ "62113d8a62add1bf#))";
+ static const char sample_data_bad[] =
+ "(data (flags rfc6979)"
+ " (hash sha256 #bf2bdbe1aa9b6ec1e2ade1d694f41fc71a831d0268e98915"
+ /**/ "62113d8a62add1bf#))";
+ static const char signature_r[] =
+ "efd48b2aacb6a8fd1140dd9cd45e81d69d2c877b56aaf991c34d0ea84eaf3716";
+ static const char signature_s[] =
+ "f7cb1c942d657c41d436c7a1b6e29f65f3e900dbb9aff4064dc4ab2f843acda8";
+
+ const char *errtxt = NULL;
+ gcry_error_t err;
+ gcry_sexp_t data = NULL;
+ gcry_sexp_t data_bad = NULL;
+ gcry_sexp_t sig = NULL;
+ gcry_sexp_t l1 = NULL;
+ gcry_sexp_t l2 = NULL;
+ gcry_mpi_t r = NULL;
+ gcry_mpi_t s = NULL;
+ gcry_mpi_t calculated_r = NULL;
+ gcry_mpi_t calculated_s = NULL;
+ int cmp;
+
+ err = sexp_sscan (&data, NULL, sample_data, strlen (sample_data));
+ if (!err)
+ err = sexp_sscan (&data_bad, NULL,
+ sample_data_bad, strlen (sample_data_bad));
+ if (!err)
+ err = _gcry_mpi_scan (&r, GCRYMPI_FMT_HEX, signature_r, 0, NULL);
+ if (!err)
+ err = _gcry_mpi_scan (&s, GCRYMPI_FMT_HEX, signature_s, 0, NULL);
+
+ if (err)
+ {
+ errtxt = "converting data failed";
+ goto leave;
+ }
+
+ err = _gcry_pk_sign (&sig, data, skey);
+ if (err)
+ {
+ errtxt = "signing failed";
+ goto leave;
+ }
+
+ /* check against known signature */
+ errtxt = "signature validity failed";
+ l1 = _gcry_sexp_find_token (sig, "sig-val", 0);
+ if (!l1)
+ goto leave;
+ l2 = _gcry_sexp_find_token (l1, "ecdsa", 0);
+ if (!l2)
+ goto leave;
+
+ sexp_release (l1);
+ l1 = l2;
+
+ l2 = _gcry_sexp_find_token (l1, "r", 0);
+ if (!l2)
+ goto leave;
+ calculated_r = _gcry_sexp_nth_mpi (l2, 1, GCRYMPI_FMT_USG);
+ if (!calculated_r)
+ goto leave;
+
+ sexp_release (l2);
+ l2 = _gcry_sexp_find_token (l1, "s", 0);
+ if (!l2)
+ goto leave;
+ calculated_s = _gcry_sexp_nth_mpi (l2, 1, GCRYMPI_FMT_USG);
+ if (!calculated_s)
+ goto leave;
+
+ errtxt = "known sig check failed";
+
+ cmp = _gcry_mpi_cmp (r, calculated_r);
+ if (cmp)
+ goto leave;
+ cmp = _gcry_mpi_cmp (s, calculated_s);
+ if (cmp)
+ goto leave;
+
+ errtxt = NULL;
+
+ /* verify generated signature */
+ err = _gcry_pk_verify (sig, data, pkey);
+ if (err)
+ {
+ errtxt = "verify failed";
+ goto leave;
+ }
+ err = _gcry_pk_verify (sig, data_bad, pkey);
+ if (gcry_err_code (err) != GPG_ERR_BAD_SIGNATURE)
+ {
+ errtxt = "bad signature not detected";
+ goto leave;
+ }
+
+
+ leave:
+ sexp_release (sig);
+ sexp_release (data_bad);
+ sexp_release (data);
+ sexp_release (l1);
+ sexp_release (l2);
+ mpi_release (r);
+ mpi_release (s);
+ mpi_release (calculated_r);
+ mpi_release (calculated_s);
+ return errtxt;
+}
+
+
+static gpg_err_code_t
+selftests_ecdsa (selftest_report_func_t report)
+{
+ const char *what;
+ const char *errtxt;
+ gcry_error_t err;
+ gcry_sexp_t skey = NULL;
+ gcry_sexp_t pkey = NULL;
+
+ what = "convert";
+ err = sexp_sscan (&skey, NULL, sample_secret_key_secp256,
+ strlen (sample_secret_key_secp256));
+ if (!err)
+ err = sexp_sscan (&pkey, NULL, sample_public_key_secp256,
+ strlen (sample_public_key_secp256));
+ if (err)
+ {
+ errtxt = _gcry_strerror (err);
+ goto failed;
+ }
+
+ what = "key consistency";
+ err = ecc_check_secret_key(skey);
+ if (err)
+ {
+ errtxt = _gcry_strerror (err);
+ goto failed;
+ }
+
+ what = "sign";
+ errtxt = selftest_sign (pkey, skey);
+ if (errtxt)
+ goto failed;
+
+ sexp_release(pkey);
+ sexp_release(skey);
+ return 0; /* Succeeded. */
+
+ failed:
+ sexp_release(pkey);
+ sexp_release(skey);
+ if (report)
+ report ("pubkey", GCRY_PK_ECC, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+/* Run a full self-test for ALGO and return 0 on success. */
+static gpg_err_code_t
+run_selftests (int algo, int extended, selftest_report_func_t report)
+{
+ (void)extended;
+
+ if (algo != GCRY_PK_ECC)
+ return GPG_ERR_PUBKEY_ALGO;
+
+ return selftests_ecdsa (report);
+}
+
+
+
+
+gcry_pk_spec_t _gcry_pubkey_spec_ecc =
+ {
+ GCRY_PK_ECC, { 0, 1 },
+ (GCRY_PK_USAGE_SIGN | GCRY_PK_USAGE_ENCR),
+ "ECC", ecc_names,
+ "pabgnhq", "pabgnhqd", "se", "rs", "pabgnhq",
+ ecc_generate,
+ ecc_check_secret_key,
+ ecc_encrypt_raw,
+ ecc_decrypt_raw,
+ ecc_sign,
+ ecc_verify,
+ ecc_get_nbits,
+ run_selftests,
+ compute_keygrip,
+ _gcry_ecc_get_curve,
+ _gcry_ecc_get_param_sexp
+ };
diff --git a/comm/third_party/libgcrypt/cipher/elgamal.c b/comm/third_party/libgcrypt/cipher/elgamal.c
new file mode 100644
index 0000000000..4eb52d620b
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/elgamal.c
@@ -0,0 +1,1149 @@
+/* Elgamal.c - Elgamal Public Key encryption
+ * Copyright (C) 1998, 2000, 2001, 2002, 2003,
+ * 2008 Free Software Foundation, Inc.
+ * Copyright (C) 2013 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * For a description of the algorithm, see:
+ * Bruce Schneier: Applied Cryptography. John Wiley & Sons, 1996.
+ * ISBN 0-471-11709-9. Pages 476 ff.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "g10lib.h"
+#include "mpi.h"
+#include "cipher.h"
+#include "pubkey-internal.h"
+
+
+/* Blinding is used to mitigate side-channel attacks. You may undef
+ this to speed up the operation in case the system is secured
+   against physical and network-mounted side-channel attacks. */
+#define USE_BLINDING 1
+
+
+typedef struct
+{
+ gcry_mpi_t p; /* prime */
+ gcry_mpi_t g; /* group generator */
+ gcry_mpi_t y; /* g^x mod p */
+} ELG_public_key;
+
+
+typedef struct
+{
+ gcry_mpi_t p; /* prime */
+ gcry_mpi_t g; /* group generator */
+ gcry_mpi_t y; /* g^x mod p */
+ gcry_mpi_t x; /* secret exponent */
+} ELG_secret_key;
+
+
+static const char *elg_names[] =
+ {
+ "elg",
+ "openpgp-elg",
+ "openpgp-elg-sig",
+ NULL,
+ };
+
+
+static int test_keys (ELG_secret_key *sk, unsigned int nbits, int nodie);
+static gcry_mpi_t gen_k (gcry_mpi_t p, int small_k);
+static gcry_err_code_t generate (ELG_secret_key *sk, unsigned nbits,
+ gcry_mpi_t **factors);
+static int check_secret_key (ELG_secret_key *sk);
+static void do_encrypt (gcry_mpi_t a, gcry_mpi_t b, gcry_mpi_t input,
+ ELG_public_key *pkey);
+static void decrypt (gcry_mpi_t output, gcry_mpi_t a, gcry_mpi_t b,
+ ELG_secret_key *skey);
+static void sign (gcry_mpi_t a, gcry_mpi_t b, gcry_mpi_t input,
+ ELG_secret_key *skey);
+static int verify (gcry_mpi_t a, gcry_mpi_t b, gcry_mpi_t input,
+ ELG_public_key *pkey);
+static unsigned int elg_get_nbits (gcry_sexp_t parms);
+
+
+static void (*progress_cb) (void *, const char *, int, int, int);
+static void *progress_cb_data;
+
+void
+_gcry_register_pk_elg_progress (void (*cb) (void *, const char *,
+ int, int, int),
+ void *cb_data)
+{
+ progress_cb = cb;
+ progress_cb_data = cb_data;
+}
+
+
+static void
+progress (int c)
+{
+ if (progress_cb)
+ progress_cb (progress_cb_data, "pk_elg", c, 0, 0);
+}
+
+
+/****************
+ * Michael Wiener's table on subgroup sizes to match field sizes.
+ * (floating around somewhere, probably based on the paper from
+ * Eurocrypt 96, page 332)
+ */
+static unsigned int
+wiener_map( unsigned int n )
+{
+ static struct { unsigned int p_n, q_n; } t[] =
+ { /* p q attack cost */
+ { 512, 119 }, /* 9 x 10^17 */
+ { 768, 145 }, /* 6 x 10^21 */
+ { 1024, 165 }, /* 7 x 10^24 */
+ { 1280, 183 }, /* 3 x 10^27 */
+ { 1536, 198 }, /* 7 x 10^29 */
+ { 1792, 212 }, /* 9 x 10^31 */
+ { 2048, 225 }, /* 8 x 10^33 */
+ { 2304, 237 }, /* 5 x 10^35 */
+ { 2560, 249 }, /* 3 x 10^37 */
+ { 2816, 259 }, /* 1 x 10^39 */
+ { 3072, 269 }, /* 3 x 10^40 */
+ { 3328, 279 }, /* 8 x 10^41 */
+ { 3584, 288 }, /* 2 x 10^43 */
+ { 3840, 296 }, /* 4 x 10^44 */
+ { 4096, 305 }, /* 7 x 10^45 */
+ { 4352, 313 }, /* 1 x 10^47 */
+ { 4608, 320 }, /* 2 x 10^48 */
+ { 4864, 328 }, /* 2 x 10^49 */
+ { 5120, 335 }, /* 3 x 10^50 */
+ { 0, 0 }
+ };
+ int i;
+
+ for(i=0; t[i].p_n; i++ )
+ {
+ if( n <= t[i].p_n )
+ return t[i].q_n;
+ }
+  /* Not in table - use an arbitrarily high number. */
+ return n / 8 + 200;
+}
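
To make the table concrete (an illustrative calculation, not library code): for a 2048-bit prime the table gives a 225-bit subgroup, so gen_k with SMALL_K set picks an exponent of roughly 225 * 3 / 2 = 337 bits instead of 2048, and generate() sizes the secret exponent x similarly (after rounding the subgroup size up to an even 226, x gets about 339 bits).
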
+
+static int
+test_keys ( ELG_secret_key *sk, unsigned int nbits, int nodie )
+{
+ ELG_public_key pk;
+ gcry_mpi_t test = mpi_new ( 0 );
+ gcry_mpi_t out1_a = mpi_new ( nbits );
+ gcry_mpi_t out1_b = mpi_new ( nbits );
+ gcry_mpi_t out2 = mpi_new ( nbits );
+ int failed = 0;
+
+ pk.p = sk->p;
+ pk.g = sk->g;
+ pk.y = sk->y;
+
+ _gcry_mpi_randomize ( test, nbits, GCRY_WEAK_RANDOM );
+
+ do_encrypt ( out1_a, out1_b, test, &pk );
+ decrypt ( out2, out1_a, out1_b, sk );
+ if ( mpi_cmp( test, out2 ) )
+ failed |= 1;
+
+ sign ( out1_a, out1_b, test, sk );
+ if ( !verify( out1_a, out1_b, test, &pk ) )
+ failed |= 2;
+
+ _gcry_mpi_release ( test );
+ _gcry_mpi_release ( out1_a );
+ _gcry_mpi_release ( out1_b );
+ _gcry_mpi_release ( out2 );
+
+ if (failed && !nodie)
+ log_fatal ("Elgamal test key for %s %s failed\n",
+ (failed & 1)? "encrypt+decrypt":"",
+ (failed & 2)? "sign+verify":"");
+ if (failed && DBG_CIPHER)
+ log_debug ("Elgamal test key for %s %s failed\n",
+ (failed & 1)? "encrypt+decrypt":"",
+ (failed & 2)? "sign+verify":"");
+
+ return failed;
+}
+
+
+/****************
+ * Generate a random secret exponent k from prime p, so that k is
+ * relatively prime to p-1. With SMALL_K set, k will be selected for
+ * better encryption performance - this must never be used for signing!
+ */
+static gcry_mpi_t
+gen_k( gcry_mpi_t p, int small_k )
+{
+ gcry_mpi_t k = mpi_alloc_secure( 0 );
+ gcry_mpi_t temp = mpi_alloc( mpi_get_nlimbs(p) );
+ gcry_mpi_t p_1 = mpi_copy(p);
+ unsigned int orig_nbits = mpi_get_nbits(p);
+ unsigned int nbits, nbytes;
+ char *rndbuf = NULL;
+
+ if (small_k)
+ {
+      /* Using a k much smaller than p is sufficient for encryption and
+ * it greatly improves the encryption performance. We use
+ * Wiener's table and add a large safety margin. */
+ nbits = wiener_map( orig_nbits ) * 3 / 2;
+ if( nbits >= orig_nbits )
+ BUG();
+ }
+ else
+ nbits = orig_nbits;
+
+
+ nbytes = (nbits+7)/8;
+ if( DBG_CIPHER )
+ log_debug("choosing a random k\n");
+ mpi_sub_ui( p_1, p, 1);
+ for(;;)
+ {
+ if( !rndbuf || nbits < 32 )
+ {
+ xfree(rndbuf);
+ rndbuf = _gcry_random_bytes_secure( nbytes, GCRY_STRONG_RANDOM );
+ }
+ else
+ {
+          /* Change only some of the higher bits.  We could improve
+             this by directly requesting more memory at the first call
+             to get_random_bytes() and reusing it here; maybe it is
+             easier to do this directly in random.c.  Anyway, it is
+             highly unlikely that we will ever reach this code. */
+ char *pp = _gcry_random_bytes_secure( 4, GCRY_STRONG_RANDOM );
+ memcpy( rndbuf, pp, 4 );
+ xfree(pp);
+ }
+ _gcry_mpi_set_buffer( k, rndbuf, nbytes, 0 );
+
+ for(;;)
+ {
+ if( !(mpi_cmp( k, p_1 ) < 0) ) /* check: k < (p-1) */
+ {
+ if( DBG_CIPHER )
+ progress('+');
+ break; /* no */
+ }
+ if( !(mpi_cmp_ui( k, 0 ) > 0) ) /* check: k > 0 */
+ {
+ if( DBG_CIPHER )
+ progress('-');
+ break; /* no */
+ }
+ if (mpi_gcd( temp, k, p_1 ))
+            goto found; /* okay, k is relatively prime to (p-1) */
+ mpi_add_ui( k, k, 1 );
+ if( DBG_CIPHER )
+ progress('.');
+ }
+ }
+ found:
+ xfree (rndbuf);
+ if( DBG_CIPHER )
+ progress('\n');
+ mpi_free(p_1);
+ mpi_free(temp);
+
+ return k;
+}
+
+/****************
+ * Generate a key pair with a key of size NBITS
+ * Returns: 2 structures filled with all needed values
+ * and an array with n-1 factors of (p-1)
+ */
+static gcry_err_code_t
+generate ( ELG_secret_key *sk, unsigned int nbits, gcry_mpi_t **ret_factors )
+{
+ gcry_err_code_t rc;
+ gcry_mpi_t p; /* the prime */
+ gcry_mpi_t p_min1;
+ gcry_mpi_t g;
+ gcry_mpi_t x; /* the secret exponent */
+ gcry_mpi_t y;
+ unsigned int qbits;
+ unsigned int xbits;
+ byte *rndbuf;
+
+ p_min1 = mpi_new ( nbits );
+ qbits = wiener_map( nbits );
+  if( qbits & 1 ) /* better have an even one */
+ qbits++;
+ g = mpi_alloc(1);
+ rc = _gcry_generate_elg_prime (0, nbits, qbits, g, &p, ret_factors);
+ if (rc)
+ {
+ mpi_free (p_min1);
+ mpi_free (g);
+ return rc;
+ }
+ mpi_sub_ui(p_min1, p, 1);
+
+
+ /* Select a random number which has these properties:
+ * 0 < x < p-1
+ * This must be a very good random number because this is the
+ * secret part. The prime is public and may be shared anyway,
+ * so a random generator level of 1 is used for the prime.
+ *
+   * I don't see a reason to have an x of about the same size
+   * as p.  It should be sufficient to have one about the size
+   * of q or of the later used k, plus a large safety margin.  Decryption
+ * will be much faster with such an x.
+ */
+ xbits = qbits * 3 / 2;
+ if( xbits >= nbits )
+ BUG();
+ x = mpi_snew ( xbits );
+ if( DBG_CIPHER )
+ log_debug("choosing a random x of size %u\n", xbits );
+ rndbuf = NULL;
+ do
+ {
+ if( DBG_CIPHER )
+ progress('.');
+ if( rndbuf )
+ { /* Change only some of the higher bits */
+ if( xbits < 16 ) /* should never happen ... */
+ {
+ xfree(rndbuf);
+ rndbuf = _gcry_random_bytes_secure ((xbits+7)/8,
+ GCRY_VERY_STRONG_RANDOM);
+ }
+ else
+ {
+ char *r = _gcry_random_bytes_secure (2, GCRY_VERY_STRONG_RANDOM);
+ memcpy(rndbuf, r, 2 );
+ xfree (r);
+ }
+ }
+ else
+ {
+ rndbuf = _gcry_random_bytes_secure ((xbits+7)/8,
+ GCRY_VERY_STRONG_RANDOM );
+ }
+ _gcry_mpi_set_buffer( x, rndbuf, (xbits+7)/8, 0 );
+ mpi_clear_highbit( x, xbits+1 );
+ }
+ while( !( mpi_cmp_ui( x, 0 )>0 && mpi_cmp( x, p_min1 )<0 ) );
+ xfree(rndbuf);
+
+ y = mpi_new (nbits);
+ mpi_powm( y, g, x, p );
+
+ if( DBG_CIPHER )
+ {
+ progress ('\n');
+ log_mpidump ("elg p", p );
+ log_mpidump ("elg g", g );
+ log_mpidump ("elg y", y );
+ log_mpidump ("elg x", x );
+ }
+
+ /* Copy the stuff to the key structures */
+ sk->p = p;
+ sk->g = g;
+ sk->y = y;
+ sk->x = x;
+
+ _gcry_mpi_release ( p_min1 );
+
+ /* Now we can test our keys (this should never fail!) */
+ test_keys ( sk, nbits - 64, 0 );
+
+ return 0;
+}
+
+
+/* Generate a key pair with a key of size NBITS not using a random
+ value for the secret key but the one given as X. This is useful to
+   implement passphrase-based decryption for public-key-based
+   encryption.  It has applications in backup systems.
+
+ Returns: A structure filled with all needed values and an array
+ with n-1 factors of (p-1). */
+static gcry_err_code_t
+generate_using_x (ELG_secret_key *sk, unsigned int nbits, gcry_mpi_t x,
+ gcry_mpi_t **ret_factors )
+{
+ gcry_err_code_t rc;
+ gcry_mpi_t p; /* The prime. */
+ gcry_mpi_t p_min1; /* The prime minus 1. */
+ gcry_mpi_t g; /* The generator. */
+ gcry_mpi_t y; /* g^x mod p. */
+ unsigned int qbits;
+ unsigned int xbits;
+
+ sk->p = NULL;
+ sk->g = NULL;
+ sk->y = NULL;
+ sk->x = NULL;
+
+ /* Do a quick check to see whether X is suitable. */
+ xbits = mpi_get_nbits (x);
+ if ( xbits < 64 || xbits >= nbits )
+ return GPG_ERR_INV_VALUE;
+
+ p_min1 = mpi_new ( nbits );
+ qbits = wiener_map ( nbits );
+ if ( (qbits & 1) ) /* Better have an even one. */
+ qbits++;
+ g = mpi_alloc (1);
+ rc = _gcry_generate_elg_prime (0, nbits, qbits, g, &p, ret_factors );
+ if (rc)
+ {
+ mpi_free (p_min1);
+ mpi_free (g);
+ return rc;
+ }
+ mpi_sub_ui (p_min1, p, 1);
+
+ if (DBG_CIPHER)
+ log_debug ("using a supplied x of size %u", xbits );
+ if ( !(mpi_cmp_ui ( x, 0 ) > 0 && mpi_cmp ( x, p_min1 ) <0 ) )
+ {
+ _gcry_mpi_release ( p_min1 );
+ _gcry_mpi_release ( p );
+ _gcry_mpi_release ( g );
+ return GPG_ERR_INV_VALUE;
+ }
+
+ y = mpi_new (nbits);
+ mpi_powm ( y, g, x, p );
+
+ if ( DBG_CIPHER )
+ {
+ progress ('\n');
+ log_mpidump ("elg p", p );
+ log_mpidump ("elg g", g );
+ log_mpidump ("elg y", y );
+ log_mpidump ("elg x", x );
+ }
+
+ /* Copy the stuff to the key structures */
+ sk->p = p;
+ sk->g = g;
+ sk->y = y;
+ sk->x = mpi_copy (x);
+
+ _gcry_mpi_release ( p_min1 );
+
+ /* Now we can test our keys. */
+ if ( test_keys ( sk, nbits - 64, 1 ) )
+ {
+ _gcry_mpi_release ( sk->p ); sk->p = NULL;
+ _gcry_mpi_release ( sk->g ); sk->g = NULL;
+ _gcry_mpi_release ( sk->y ); sk->y = NULL;
+ _gcry_mpi_release ( sk->x ); sk->x = NULL;
+ return GPG_ERR_BAD_SECKEY;
+ }
+
+ return 0;
+}
+
+
+/****************
+ * Test whether the secret key is valid.
+ * Returns: true if this is a valid key.
+ */
+static int
+check_secret_key( ELG_secret_key *sk )
+{
+ int rc;
+ gcry_mpi_t y = mpi_alloc( mpi_get_nlimbs(sk->y) );
+
+ mpi_powm (y, sk->g, sk->x, sk->p);
+ rc = !mpi_cmp( y, sk->y );
+ mpi_free( y );
+ return rc;
+}
+
+
+static void
+do_encrypt(gcry_mpi_t a, gcry_mpi_t b, gcry_mpi_t input, ELG_public_key *pkey )
+{
+ gcry_mpi_t k;
+
+ /* Note: maybe we should change the interface, so that it
+ * is possible to check that input is < p and return an
+ * error code.
+ */
+
+ k = gen_k( pkey->p, 1 );
+ mpi_powm (a, pkey->g, k, pkey->p);
+
+ /* b = (y^k * input) mod p
+ * = ((y^k mod p) * (input mod p)) mod p
+ * and because input is < p
+ * = ((y^k mod p) * input) mod p
+ */
+ mpi_powm (b, pkey->y, k, pkey->p);
+ mpi_mulm (b, b, input, pkey->p);
+#if 0
+ if( DBG_CIPHER )
+ {
+ log_mpidump("elg encrypted y", pkey->y);
+ log_mpidump("elg encrypted p", pkey->p);
+ log_mpidump("elg encrypted k", k);
+ log_mpidump("elg encrypted M", input);
+ log_mpidump("elg encrypted a", a);
+ log_mpidump("elg encrypted b", b);
+ }
+#endif
+ mpi_free(k);
+}
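
A toy walk-through of the equations above (illustration only; real parameters are thousands of bits): with p = 23, g = 5 and secret x = 6, the public key is y = g^x mod p = 8.  Encrypting m = 7 with ephemeral k = 3 gives a = g^k mod p = 10 and b = y^k * m mod p = 6 * 7 mod 23 = 19.  Decryption computes a^x mod p = 6, its inverse 4 (since 6 * 4 = 24 = 1 mod 23), and recovers b * 4 mod 23 = 7 = m.
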
+
+
+
+
+static void
+decrypt (gcry_mpi_t output, gcry_mpi_t a, gcry_mpi_t b, ELG_secret_key *skey )
+{
+ gcry_mpi_t t1, t2, r;
+ unsigned int nbits = mpi_get_nbits (skey->p);
+
+ mpi_normalize (a);
+ mpi_normalize (b);
+
+ t1 = mpi_snew (nbits);
+
+#ifdef USE_BLINDING
+
+ t2 = mpi_snew (nbits);
+ r = mpi_new (nbits);
+
+ /* We need a random number of about the prime size. The random
+ number merely needs to be unpredictable; thus we use level 0. */
+ _gcry_mpi_randomize (r, nbits, GCRY_WEAK_RANDOM);
+
+ /* t1 = r^x mod p */
+ mpi_powm (t1, r, skey->x, skey->p);
+ /* t2 = (a * r)^-x mod p */
+ mpi_mulm (t2, a, r, skey->p);
+ mpi_powm (t2, t2, skey->x, skey->p);
+ mpi_invm (t2, t2, skey->p);
+ /* t1 = (t1 * t2) mod p*/
+ mpi_mulm (t1, t1, t2, skey->p);
+
+ mpi_free (r);
+ mpi_free (t2);
+
+#else /*!USE_BLINDING*/
+
+ /* output = b/(a^x) mod p */
+ mpi_powm (t1, a, skey->x, skey->p);
+ mpi_invm (t1, t1, skey->p);
+
+#endif /*!USE_BLINDING*/
+
+ mpi_mulm (output, b, t1, skey->p);
+
+#if 0
+ if( DBG_CIPHER )
+ {
+ log_mpidump ("elg decrypted x", skey->x);
+ log_mpidump ("elg decrypted p", skey->p);
+ log_mpidump ("elg decrypted a", a);
+ log_mpidump ("elg decrypted b", b);
+ log_mpidump ("elg decrypted M", output);
+ }
+#endif
+ mpi_free (t1);
+}
+
+
+/****************
+ * Make an Elgamal signature out of INPUT
+ */
+
+static void
+sign(gcry_mpi_t a, gcry_mpi_t b, gcry_mpi_t input, ELG_secret_key *skey )
+{
+ gcry_mpi_t k;
+ gcry_mpi_t t = mpi_alloc( mpi_get_nlimbs(a) );
+ gcry_mpi_t inv = mpi_alloc( mpi_get_nlimbs(a) );
+ gcry_mpi_t p_1 = mpi_copy(skey->p);
+
+  /*
+   * b = (t * inv) mod (p-1)
+   * b = (t * inv(k, p-1)) mod (p-1)
+   * b = (((M - x*a) mod (p-1)) * inv(k, p-1)) mod (p-1)
+   *
+   * where inv(k, p-1) is the multiplicative inverse of k modulo p-1.
+   */
+ mpi_sub_ui(p_1, p_1, 1);
+ k = gen_k( skey->p, 0 /* no small K ! */ );
+ mpi_powm( a, skey->g, k, skey->p );
+ mpi_mul(t, skey->x, a );
+ mpi_subm(t, input, t, p_1 );
+ mpi_invm(inv, k, p_1 );
+ mpi_mulm(b, t, inv, p_1 );
+
+#if 0
+ if( DBG_CIPHER )
+ {
+ log_mpidump ("elg sign p", skey->p);
+ log_mpidump ("elg sign g", skey->g);
+ log_mpidump ("elg sign y", skey->y);
+ log_mpidump ("elg sign x", skey->x);
+ log_mpidump ("elg sign k", k);
+ log_mpidump ("elg sign M", input);
+ log_mpidump ("elg sign a", a);
+ log_mpidump ("elg sign b", b);
+ }
+#endif
+ mpi_free(k);
+ mpi_free(t);
+ mpi_free(inv);
+ mpi_free(p_1);
+}
+
+
+/****************
+ * Returns true if the signature composed of A and B is valid.
+ */
+static int
+verify(gcry_mpi_t a, gcry_mpi_t b, gcry_mpi_t input, ELG_public_key *pkey )
+{
+ int rc;
+ gcry_mpi_t t1;
+ gcry_mpi_t t2;
+ gcry_mpi_t base[4];
+ gcry_mpi_t ex[4];
+
+ if( !(mpi_cmp_ui( a, 0 ) > 0 && mpi_cmp( a, pkey->p ) < 0) )
+ return 0; /* assertion 0 < a < p failed */
+
+ t1 = mpi_alloc( mpi_get_nlimbs(a) );
+ t2 = mpi_alloc( mpi_get_nlimbs(a) );
+
+#if 0
+ /* t1 = (y^a mod p) * (a^b mod p) mod p */
+ gcry_mpi_powm( t1, pkey->y, a, pkey->p );
+ gcry_mpi_powm( t2, a, b, pkey->p );
+ mpi_mulm( t1, t1, t2, pkey->p );
+
+ /* t2 = g ^ input mod p */
+ gcry_mpi_powm( t2, pkey->g, input, pkey->p );
+
+ rc = !mpi_cmp( t1, t2 );
+#elif 0
+ /* t1 = (y^a mod p) * (a^b mod p) mod p */
+ base[0] = pkey->y; ex[0] = a;
+ base[1] = a; ex[1] = b;
+ base[2] = NULL; ex[2] = NULL;
+ mpi_mulpowm( t1, base, ex, pkey->p );
+
+ /* t2 = g ^ input mod p */
+ gcry_mpi_powm( t2, pkey->g, input, pkey->p );
+
+ rc = !mpi_cmp( t1, t2 );
+#else
+ /* t1 = g ^ - input * y ^ a * a ^ b mod p */
+ mpi_invm(t2, pkey->g, pkey->p );
+ base[0] = t2 ; ex[0] = input;
+ base[1] = pkey->y; ex[1] = a;
+ base[2] = a; ex[2] = b;
+ base[3] = NULL; ex[3] = NULL;
+ mpi_mulpowm( t1, base, ex, pkey->p );
+ rc = !mpi_cmp_ui( t1, 1 );
+
+#endif
+
+ mpi_free(t1);
+ mpi_free(t2);
+ return rc;
+}
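
Why the default branch works (a short derivation, not in the source): the signature satisfies b*k = M - x*a (mod p-1), hence a^b = g^(k*b) = g^(M - x*a) (mod p), and therefore g^(-M) * y^a * a^b = g^(-M) * g^(x*a) * g^(M - x*a) = 1 (mod p), which is exactly the mpi_mulpowm product compared against 1.  The #if 0 branches check the equivalent classic form y^a * a^b = g^M (mod p).
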
+
+/*********************************************
+ ************** interface ******************
+ *********************************************/
+
+static gpg_err_code_t
+elg_generate (const gcry_sexp_t genparms, gcry_sexp_t *r_skey)
+{
+ gpg_err_code_t rc;
+ unsigned int nbits;
+ ELG_secret_key sk;
+ gcry_mpi_t xvalue = NULL;
+ gcry_sexp_t l1;
+ gcry_mpi_t *factors = NULL;
+ gcry_sexp_t misc_info = NULL;
+
+ memset (&sk, 0, sizeof sk);
+
+ rc = _gcry_pk_util_get_nbits (genparms, &nbits);
+ if (rc)
+ return rc;
+
+ /* Parse the optional xvalue element. */
+ l1 = sexp_find_token (genparms, "xvalue", 0);
+ if (l1)
+ {
+ xvalue = sexp_nth_mpi (l1, 1, 0);
+ sexp_release (l1);
+ if (!xvalue)
+ return GPG_ERR_BAD_MPI;
+ }
+
+ if (xvalue)
+ {
+ rc = generate_using_x (&sk, nbits, xvalue, &factors);
+ mpi_free (xvalue);
+ }
+ else
+ {
+ rc = generate (&sk, nbits, &factors);
+ }
+ if (rc)
+ goto leave;
+
+ if (factors && factors[0])
+ {
+ int nfac;
+ void **arg_list;
+ char *buffer, *p;
+
+ for (nfac = 0; factors[nfac]; nfac++)
+ ;
+ arg_list = xtrycalloc (nfac+1, sizeof *arg_list);
+ if (!arg_list)
+ {
+ rc = gpg_err_code_from_syserror ();
+ goto leave;
+ }
+ buffer = xtrymalloc (30 + nfac*2 + 2 + 1);
+ if (!buffer)
+ {
+ rc = gpg_err_code_from_syserror ();
+ xfree (arg_list);
+ goto leave;
+ }
+ p = stpcpy (buffer, "(misc-key-info(pm1-factors");
+ for(nfac = 0; factors[nfac]; nfac++)
+ {
+ p = stpcpy (p, "%m");
+ arg_list[nfac] = factors + nfac;
+ }
+ p = stpcpy (p, "))");
+ rc = sexp_build_array (&misc_info, NULL, buffer, arg_list);
+ xfree (arg_list);
+ xfree (buffer);
+ if (rc)
+ goto leave;
+ }
+
+ rc = sexp_build (r_skey, NULL,
+ "(key-data"
+ " (public-key"
+ " (elg(p%m)(g%m)(y%m)))"
+ " (private-key"
+ " (elg(p%m)(g%m)(y%m)(x%m)))"
+ " %S)",
+ sk.p, sk.g, sk.y,
+ sk.p, sk.g, sk.y, sk.x,
+ misc_info);
+
+ leave:
+ mpi_free (sk.p);
+ mpi_free (sk.g);
+ mpi_free (sk.y);
+ mpi_free (sk.x);
+ sexp_release (misc_info);
+ if (factors)
+ {
+ gcry_mpi_t *mp;
+ for (mp = factors; *mp; mp++)
+ mpi_free (*mp);
+ xfree (factors);
+ }
+
+ return rc;
+}
+
+
+static gcry_err_code_t
+elg_check_secret_key (gcry_sexp_t keyparms)
+{
+ gcry_err_code_t rc;
+ ELG_secret_key sk = {NULL, NULL, NULL, NULL};
+
+ rc = sexp_extract_param (keyparms, NULL, "pgyx",
+ &sk.p, &sk.g, &sk.y, &sk.x,
+ NULL);
+ if (rc)
+ goto leave;
+
+ if (!check_secret_key (&sk))
+ rc = GPG_ERR_BAD_SECKEY;
+
+ leave:
+ _gcry_mpi_release (sk.p);
+ _gcry_mpi_release (sk.g);
+ _gcry_mpi_release (sk.y);
+ _gcry_mpi_release (sk.x);
+ if (DBG_CIPHER)
+ log_debug ("elg_testkey => %s\n", gpg_strerror (rc));
+ return rc;
+}
+
+
+static gcry_err_code_t
+elg_encrypt (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
+{
+ gcry_err_code_t rc;
+ struct pk_encoding_ctx ctx;
+ gcry_mpi_t mpi_a = NULL;
+ gcry_mpi_t mpi_b = NULL;
+ gcry_mpi_t data = NULL;
+ ELG_public_key pk = { NULL, NULL, NULL };
+
+ _gcry_pk_util_init_encoding_ctx (&ctx, PUBKEY_OP_ENCRYPT,
+ elg_get_nbits (keyparms));
+
+ /* Extract the data. */
+ rc = _gcry_pk_util_data_to_mpi (s_data, &data, &ctx);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ log_mpidump ("elg_encrypt data", data);
+ if (mpi_is_opaque (data))
+ {
+ rc = GPG_ERR_INV_DATA;
+ goto leave;
+ }
+
+ /* Extract the key. */
+ rc = sexp_extract_param (keyparms, NULL, "pgy",
+ &pk.p, &pk.g, &pk.y, NULL);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ {
+ log_mpidump ("elg_encrypt p", pk.p);
+ log_mpidump ("elg_encrypt g", pk.g);
+ log_mpidump ("elg_encrypt y", pk.y);
+ }
+
+ /* Do Elgamal computation and build result. */
+ mpi_a = mpi_new (0);
+ mpi_b = mpi_new (0);
+ do_encrypt (mpi_a, mpi_b, data, &pk);
+ rc = sexp_build (r_ciph, NULL, "(enc-val(elg(a%m)(b%m)))", mpi_a, mpi_b);
+
+ leave:
+ _gcry_mpi_release (mpi_a);
+ _gcry_mpi_release (mpi_b);
+ _gcry_mpi_release (pk.p);
+ _gcry_mpi_release (pk.g);
+ _gcry_mpi_release (pk.y);
+ _gcry_mpi_release (data);
+ _gcry_pk_util_free_encoding_ctx (&ctx);
+ if (DBG_CIPHER)
+ log_debug ("elg_encrypt => %s\n", gpg_strerror (rc));
+ return rc;
+}
+
+
+static gcry_err_code_t
+elg_decrypt (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
+{
+ gpg_err_code_t rc;
+ struct pk_encoding_ctx ctx;
+ gcry_sexp_t l1 = NULL;
+ gcry_mpi_t data_a = NULL;
+ gcry_mpi_t data_b = NULL;
+ ELG_secret_key sk = {NULL, NULL, NULL, NULL};
+ gcry_mpi_t plain = NULL;
+ unsigned char *unpad = NULL;
+ size_t unpadlen = 0;
+
+ _gcry_pk_util_init_encoding_ctx (&ctx, PUBKEY_OP_DECRYPT,
+ elg_get_nbits (keyparms));
+
+ /* Extract the data. */
+ rc = _gcry_pk_util_preparse_encval (s_data, elg_names, &l1, &ctx);
+ if (rc)
+ goto leave;
+ rc = sexp_extract_param (l1, NULL, "ab", &data_a, &data_b, NULL);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ {
+ log_printmpi ("elg_decrypt d_a", data_a);
+ log_printmpi ("elg_decrypt d_b", data_b);
+ }
+ if (mpi_is_opaque (data_a) || mpi_is_opaque (data_b))
+ {
+ rc = GPG_ERR_INV_DATA;
+ goto leave;
+ }
+
+ /* Extract the key. */
+ rc = sexp_extract_param (keyparms, NULL, "pgyx",
+ &sk.p, &sk.g, &sk.y, &sk.x,
+ NULL);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ {
+ log_printmpi ("elg_decrypt p", sk.p);
+ log_printmpi ("elg_decrypt g", sk.g);
+ log_printmpi ("elg_decrypt y", sk.y);
+ if (!fips_mode ())
+ log_printmpi ("elg_decrypt x", sk.x);
+ }
+
+ plain = mpi_snew (ctx.nbits);
+ decrypt (plain, data_a, data_b, &sk);
+ if (DBG_CIPHER)
+ log_printmpi ("elg_decrypt res", plain);
+
+ /* Reverse the encoding and build the s-expression. */
+ switch (ctx.encoding)
+ {
+ case PUBKEY_ENC_PKCS1:
+ rc = _gcry_rsa_pkcs1_decode_for_enc (&unpad, &unpadlen, ctx.nbits, plain);
+ mpi_free (plain); plain = NULL;
+ if (!rc)
+ rc = sexp_build (r_plain, NULL, "(value %b)", (int)unpadlen, unpad);
+ break;
+
+ case PUBKEY_ENC_OAEP:
+ rc = _gcry_rsa_oaep_decode (&unpad, &unpadlen,
+ ctx.nbits, ctx.hash_algo, plain,
+ ctx.label, ctx.labellen);
+ mpi_free (plain); plain = NULL;
+ if (!rc)
+ rc = sexp_build (r_plain, NULL, "(value %b)", (int)unpadlen, unpad);
+ break;
+
+ default:
+ /* Raw format. For backward compatibility we need to assume a
+ signed mpi by using the sexp format string "%m". */
+ rc = sexp_build (r_plain, NULL,
+ (ctx.flags & PUBKEY_FLAG_LEGACYRESULT)
+ ? "%m" : "(value %m)",
+ plain);
+ break;
+ }
+
+
+ leave:
+ xfree (unpad);
+ _gcry_mpi_release (plain);
+ _gcry_mpi_release (sk.p);
+ _gcry_mpi_release (sk.g);
+ _gcry_mpi_release (sk.y);
+ _gcry_mpi_release (sk.x);
+ _gcry_mpi_release (data_a);
+ _gcry_mpi_release (data_b);
+ sexp_release (l1);
+ _gcry_pk_util_free_encoding_ctx (&ctx);
+ if (DBG_CIPHER)
+ log_debug ("elg_decrypt => %s\n", gpg_strerror (rc));
+ return rc;
+}
+
+
+static gcry_err_code_t
+elg_sign (gcry_sexp_t *r_sig, gcry_sexp_t s_data, gcry_sexp_t keyparms)
+{
+ gcry_err_code_t rc;
+ struct pk_encoding_ctx ctx;
+ gcry_mpi_t data = NULL;
+ ELG_secret_key sk = {NULL, NULL, NULL, NULL};
+ gcry_mpi_t sig_r = NULL;
+ gcry_mpi_t sig_s = NULL;
+
+ _gcry_pk_util_init_encoding_ctx (&ctx, PUBKEY_OP_SIGN,
+ elg_get_nbits (keyparms));
+
+ /* Extract the data. */
+ rc = _gcry_pk_util_data_to_mpi (s_data, &data, &ctx);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ log_mpidump ("elg_sign data", data);
+ if (mpi_is_opaque (data))
+ {
+ rc = GPG_ERR_INV_DATA;
+ goto leave;
+ }
+
+ /* Extract the key. */
+ rc = sexp_extract_param (keyparms, NULL, "pgyx",
+ &sk.p, &sk.g, &sk.y, &sk.x, NULL);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ {
+ log_mpidump ("elg_sign p", sk.p);
+ log_mpidump ("elg_sign g", sk.g);
+ log_mpidump ("elg_sign y", sk.y);
+ if (!fips_mode ())
+ log_mpidump ("elg_sign x", sk.x);
+ }
+
+ sig_r = mpi_new (0);
+ sig_s = mpi_new (0);
+ sign (sig_r, sig_s, data, &sk);
+ if (DBG_CIPHER)
+ {
+ log_mpidump ("elg_sign sig_r", sig_r);
+ log_mpidump ("elg_sign sig_s", sig_s);
+ }
+ rc = sexp_build (r_sig, NULL, "(sig-val(elg(r%M)(s%M)))", sig_r, sig_s);
+
+ leave:
+ _gcry_mpi_release (sig_r);
+ _gcry_mpi_release (sig_s);
+ _gcry_mpi_release (sk.p);
+ _gcry_mpi_release (sk.g);
+ _gcry_mpi_release (sk.y);
+ _gcry_mpi_release (sk.x);
+ _gcry_mpi_release (data);
+ _gcry_pk_util_free_encoding_ctx (&ctx);
+ if (DBG_CIPHER)
+ log_debug ("elg_sign => %s\n", gpg_strerror (rc));
+ return rc;
+}
+
+
+static gcry_err_code_t
+elg_verify (gcry_sexp_t s_sig, gcry_sexp_t s_data, gcry_sexp_t s_keyparms)
+{
+ gcry_err_code_t rc;
+ struct pk_encoding_ctx ctx;
+ gcry_sexp_t l1 = NULL;
+ gcry_mpi_t sig_r = NULL;
+ gcry_mpi_t sig_s = NULL;
+ gcry_mpi_t data = NULL;
+ ELG_public_key pk = { NULL, NULL, NULL };
+
+ _gcry_pk_util_init_encoding_ctx (&ctx, PUBKEY_OP_VERIFY,
+ elg_get_nbits (s_keyparms));
+
+ /* Extract the data. */
+ rc = _gcry_pk_util_data_to_mpi (s_data, &data, &ctx);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ log_mpidump ("elg_verify data", data);
+ if (mpi_is_opaque (data))
+ {
+ rc = GPG_ERR_INV_DATA;
+ goto leave;
+ }
+
+ /* Extract the signature value. */
+ rc = _gcry_pk_util_preparse_sigval (s_sig, elg_names, &l1, NULL);
+ if (rc)
+ goto leave;
+ rc = sexp_extract_param (l1, NULL, "rs", &sig_r, &sig_s, NULL);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ {
+ log_mpidump ("elg_verify s_r", sig_r);
+ log_mpidump ("elg_verify s_s", sig_s);
+ }
+
+ /* Extract the key. */
+ rc = sexp_extract_param (s_keyparms, NULL, "pgy",
+ &pk.p, &pk.g, &pk.y, NULL);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ {
+ log_mpidump ("elg_verify p", pk.p);
+ log_mpidump ("elg_verify g", pk.g);
+ log_mpidump ("elg_verify y", pk.y);
+ }
+
+ /* Verify the signature. */
+ if (!verify (sig_r, sig_s, data, &pk))
+ rc = GPG_ERR_BAD_SIGNATURE;
+
+ leave:
+ _gcry_mpi_release (pk.p);
+ _gcry_mpi_release (pk.g);
+ _gcry_mpi_release (pk.y);
+ _gcry_mpi_release (data);
+ _gcry_mpi_release (sig_r);
+ _gcry_mpi_release (sig_s);
+ sexp_release (l1);
+ _gcry_pk_util_free_encoding_ctx (&ctx);
+ if (DBG_CIPHER)
+ log_debug ("elg_verify => %s\n", rc?gpg_strerror (rc):"Good");
+ return rc;
+}
+
+
+/* Return the number of bits for the key described by PARMS. On error
+ * 0 is returned. The format of PARMS starts with the algorithm name;
+ * for example:
+ *
+ *   (elg
+ * (p <mpi>)
+ * (g <mpi>)
+ * (y <mpi>))
+ *
+ * More parameters may be given but we only need P here.
+ */
+static unsigned int
+elg_get_nbits (gcry_sexp_t parms)
+{
+ gcry_sexp_t l1;
+ gcry_mpi_t p;
+ unsigned int nbits;
+
+ l1 = sexp_find_token (parms, "p", 1);
+ if (!l1)
+ return 0; /* Parameter P not found. */
+
+  p = sexp_nth_mpi (l1, 1, GCRYMPI_FMT_USG);
+ sexp_release (l1);
+ nbits = p? mpi_get_nbits (p) : 0;
+ _gcry_mpi_release (p);
+ return nbits;
+}
+
+
+
+gcry_pk_spec_t _gcry_pubkey_spec_elg =
+ {
+ GCRY_PK_ELG, { 0, 0 },
+ (GCRY_PK_USAGE_SIGN | GCRY_PK_USAGE_ENCR),
+ "ELG", elg_names,
+ "pgy", "pgyx", "ab", "rs", "pgy",
+ elg_generate,
+ elg_check_secret_key,
+ elg_encrypt,
+ elg_decrypt,
+ elg_sign,
+ elg_verify,
+ elg_get_nbits,
+ };
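
A hedged end-to-end sketch of how these entry points are exercised through the public libgcrypt API.  The 1024-bit size, the raw-MPI message and the missing cleanup are illustrative shortcuts only, not a recommended usage pattern.

    #include <gcrypt.h>

    /* Sketch: generate an ElGamal key and run an encrypt/decrypt round trip. */
    static void
    elg_demo (void)
    {
      gcry_sexp_t parms, keypair, pub, sec, data, ciph, plain;
      gcry_mpi_t m = gcry_mpi_set_ui (NULL, 42);

      if (gcry_sexp_build (&parms, NULL, "(genkey (elg (nbits 4:1024)))"))
        return;
      if (gcry_pk_genkey (&keypair, parms))
        return;
      pub = gcry_sexp_find_token (keypair, "public-key", 0);
      sec = gcry_sexp_find_token (keypair, "private-key", 0);

      if (!gcry_sexp_build (&data, NULL, "(data (flags raw) (value %m))", m)
          && !gcry_pk_encrypt (&ciph, data, pub)     /* reaches elg_encrypt */
          && !gcry_pk_decrypt (&plain, ciph, sec))   /* reaches elg_decrypt */
        {
          /* plain now holds the recovered value. */
        }
    }
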
diff --git a/comm/third_party/libgcrypt/cipher/gost-s-box.c b/comm/third_party/libgcrypt/cipher/gost-s-box.c
new file mode 100644
index 0000000000..5d5ed7dc44
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/gost-s-box.c
@@ -0,0 +1,266 @@
+/* gost-s-box.c - GOST 28147-89 S-Box expander
+ * Copyright (C) 2013 Dmitry Eremin-Solenikov
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define DIM(v) (sizeof(v)/sizeof((v)[0]))
+
+struct gost_sbox
+{
+ const char *name;
+ const char *oid;
+ unsigned int keymeshing;
+ unsigned char sbox[16*8];
+} gost_sboxes[] = {
+ { "test_3411", "1.2.643.2.2.30.0", 0,
+ {
+ 0x4, 0xE, 0x5, 0x7, 0x6, 0x4, 0xD, 0x1,
+ 0xA, 0xB, 0x8, 0xD, 0xC, 0xB, 0xB, 0xF,
+ 0x9, 0x4, 0x1, 0xA, 0x7, 0xA, 0x4, 0xD,
+ 0x2, 0xC, 0xD, 0x1, 0x1, 0x0, 0x1, 0x0,
+
+ 0xD, 0x6, 0xA, 0x0, 0x5, 0x7, 0x3, 0x5,
+ 0x8, 0xD, 0x3, 0x8, 0xF, 0x2, 0xF, 0x7,
+ 0x0, 0xF, 0x4, 0x9, 0xD, 0x1, 0x5, 0xA,
+ 0xE, 0xA, 0x2, 0xF, 0x8, 0xD, 0x9, 0x4,
+
+ 0x6, 0x2, 0xE, 0xE, 0x4, 0x3, 0x0, 0x9,
+ 0xB, 0x3, 0xF, 0x4, 0xA, 0x6, 0xA, 0x2,
+ 0x1, 0x8, 0xC, 0x6, 0x9, 0x8, 0xE, 0x3,
+ 0xC, 0x1, 0x7, 0xC, 0xE, 0x5, 0x7, 0xE,
+
+ 0x7, 0x0, 0x6, 0xB, 0x0, 0x9, 0x6, 0x6,
+ 0xF, 0x7, 0x0, 0x2, 0x3, 0xC, 0x8, 0xB,
+ 0x5, 0x5, 0x9, 0x5, 0xB, 0xF, 0x2, 0x8,
+ 0x3, 0x9, 0xB, 0x3, 0x2, 0xE, 0xC, 0xC,
+ }
+ },
+ { "CryptoPro_3411", "1.2.643.2.2.30.1", 0,
+ {
+ 0xA, 0x5, 0x7, 0x4, 0x7, 0x7, 0xD, 0x1,
+ 0x4, 0xF, 0xF, 0xA, 0x6, 0x6, 0xE, 0x3,
+ 0x5, 0x4, 0xC, 0x7, 0x4, 0x2, 0x4, 0xA,
+ 0x6, 0x0, 0xE, 0xC, 0xB, 0x4, 0x1, 0x9,
+
+ 0x8, 0x2, 0x9, 0x0, 0x9, 0xD, 0x7, 0x5,
+ 0x1, 0xD, 0x4, 0xF, 0xC, 0x9, 0x0, 0xB,
+ 0x3, 0xB, 0x1, 0x2, 0x2, 0xF, 0x5, 0x4,
+ 0x7, 0x9, 0x0, 0x8, 0xA, 0x0, 0xA, 0xF,
+
+ 0xD, 0x1, 0x3, 0xE, 0x1, 0xA, 0x3, 0x8,
+ 0xC, 0x7, 0xB, 0x1, 0x8, 0x1, 0xC, 0x6,
+ 0xE, 0x6, 0x5, 0x6, 0x0, 0x5, 0x8, 0x7,
+ 0x0, 0x3, 0x2, 0x5, 0xE, 0xB, 0xF, 0xE,
+
+ 0x9, 0xC, 0x6, 0xD, 0xF, 0x8, 0x6, 0xD,
+ 0x2, 0xE, 0xA, 0xB, 0xD, 0xE, 0x2, 0x0,
+ 0xB, 0xA, 0x8, 0x9, 0x3, 0xC, 0x9, 0x2,
+ 0xF, 0x8, 0xD, 0x3, 0x5, 0x3, 0xB, 0xC,
+ }
+ },
+ { "Test_89", "1.2.643.2.2.31.0", 0,
+ {
+ 0x4, 0xC, 0xD, 0xE, 0x3, 0x8, 0x9, 0xC,
+ 0x2, 0x9, 0x8, 0x9, 0xE, 0xF, 0xB, 0x6,
+ 0xF, 0xF, 0xE, 0xB, 0x5, 0x6, 0xC, 0x5,
+ 0x5, 0xE, 0xC, 0x2, 0x9, 0xB, 0x0, 0x2,
+
+ 0x9, 0x8, 0x7, 0x5, 0x6, 0x1, 0x3, 0xB,
+ 0x1, 0x1, 0x3, 0xF, 0x8, 0x9, 0x6, 0x0,
+ 0x0, 0x3, 0x9, 0x7, 0x0, 0xC, 0x7, 0x9,
+ 0x8, 0xA, 0xA, 0x1, 0xD, 0x5, 0x5, 0xD,
+
+ 0xE, 0x2, 0x1, 0x0, 0xA, 0xD, 0x4, 0x3,
+ 0x3, 0x7, 0x5, 0xD, 0xB, 0x3, 0x8, 0xE,
+ 0xB, 0x4, 0x2, 0xC, 0x7, 0x7, 0xE, 0x7,
+ 0xC, 0xD, 0x4, 0x6, 0xC, 0xA, 0xF, 0xA,
+
+ 0xD, 0x6, 0x6, 0xA, 0x2, 0x0, 0x1, 0xF,
+ 0x7, 0x0, 0xF, 0x4, 0x1, 0xE, 0xA, 0x4,
+ 0xA, 0xB, 0x0, 0x3, 0xF, 0x2, 0x2, 0x1,
+ 0x6, 0x5, 0xB, 0x8, 0x4, 0x4, 0xD, 0x8,
+ }
+ },
+ { "CryptoPro_A", "1.2.643.2.2.31.1", 1,
+ {
+ 0x9, 0x3, 0xE, 0xE, 0xB, 0x3, 0x1, 0xB,
+ 0x6, 0x7, 0x4, 0x7, 0x5, 0xA, 0xD, 0xA,
+ 0x3, 0xE, 0x6, 0xA, 0x1, 0xD, 0x2, 0xF,
+ 0x2, 0x9, 0x2, 0xC, 0x9, 0xC, 0x9, 0x5,
+
+ 0x8, 0x8, 0xB, 0xD, 0x8, 0x1, 0x7, 0x0,
+ 0xB, 0xA, 0x3, 0x1, 0xD, 0x2, 0xA, 0xC,
+ 0x1, 0xF, 0xD, 0x3, 0xF, 0x0, 0x6, 0xE,
+ 0x7, 0x0, 0x8, 0x9, 0x0, 0xB, 0x0, 0x8,
+
+ 0xA, 0x5, 0xC, 0x0, 0xE, 0x7, 0x8, 0x6,
+ 0x4, 0x2, 0xF, 0x2, 0x4, 0x5, 0xC, 0x2,
+ 0xE, 0x6, 0x5, 0xB, 0x2, 0x9, 0x4, 0x3,
+ 0xF, 0xC, 0xA, 0x4, 0x3, 0x4, 0x5, 0x9,
+
+ 0xC, 0xB, 0x0, 0xF, 0xC, 0x8, 0xF, 0x1,
+ 0x0, 0x4, 0x7, 0x8, 0x7, 0xF, 0x3, 0x7,
+ 0xD, 0xD, 0x1, 0x5, 0xA, 0xE, 0xB, 0xD,
+ 0x5, 0x1, 0x9, 0x6, 0x6, 0x6, 0xE, 0x4,
+ }
+ },
+ { "CryptoPro_B", "1.2.643.2.2.31.2", 1,
+ {
+ 0x8, 0x0, 0xE, 0x7, 0x2, 0x8, 0x5, 0x0,
+ 0x4, 0x1, 0xC, 0x5, 0x7, 0x3, 0x2, 0x4,
+ 0xB, 0x2, 0x0, 0x0, 0xC, 0x2, 0xA, 0xB,
+ 0x1, 0xA, 0xA, 0xD, 0xF, 0x6, 0xB, 0xE,
+
+ 0x3, 0x4, 0x9, 0xB, 0x9, 0x4, 0x9, 0x8,
+ 0x5, 0xD, 0x2, 0x6, 0x5, 0xD, 0x1, 0x3,
+ 0x0, 0x5, 0xD, 0x1, 0xA, 0xE, 0xC, 0x7,
+ 0x9, 0xC, 0xB, 0x2, 0xB, 0xB, 0x3, 0x1,
+
+ 0x2, 0x9, 0x7, 0x3, 0x1, 0xC, 0x7, 0xA,
+ 0xE, 0x7, 0x5, 0xA, 0x4, 0x1, 0x4, 0x2,
+ 0xA, 0x3, 0x8, 0xC, 0x0, 0x7, 0xD, 0x9,
+ 0xC, 0xF, 0xF, 0xF, 0xD, 0xF, 0x0, 0x6,
+
+ 0xD, 0xB, 0x3, 0x4, 0x6, 0xA, 0x6, 0xF,
+ 0x6, 0x8, 0x6, 0xE, 0x8, 0x0, 0xF, 0xD,
+ 0x7, 0x6, 0x1, 0x9, 0xE, 0x9, 0x8, 0x5,
+ 0xF, 0xE, 0x4, 0x8, 0x3, 0x5, 0xE, 0xC,
+ }
+ },
+ { "CryptoPro_C", "1.2.643.2.2.31.3", 1,
+ {
+ 0x1, 0x0, 0x8, 0x3, 0x8, 0xC, 0xA, 0x7,
+ 0xB, 0x1, 0x2, 0x6, 0xD, 0x9, 0x9, 0x4,
+ 0xC, 0x7, 0x5, 0x0, 0xB, 0xB, 0x6, 0x0,
+ 0x2, 0xD, 0x0, 0x1, 0x0, 0x1, 0x8, 0x5,
+
+ 0x9, 0xB, 0x4, 0x5, 0x4, 0x8, 0xD, 0xA,
+ 0xD, 0x4, 0x9, 0xD, 0x5, 0xE, 0xE, 0x2,
+ 0x0, 0x5, 0xF, 0xA, 0x1, 0x2, 0x2, 0xF,
+ 0xF, 0x2, 0xA, 0x8, 0x2, 0x4, 0x0, 0xE,
+
+ 0x4, 0x8, 0x3, 0xB, 0x9, 0x7, 0xF, 0xC,
+ 0x5, 0xE, 0x7, 0x2, 0x3, 0x3, 0x3, 0x6,
+ 0x8, 0xF, 0xC, 0x9, 0xC, 0x6, 0x5, 0x1,
+ 0xE, 0xC, 0xD, 0x7, 0xE, 0x5, 0xB, 0xB,
+
+ 0xA, 0x9, 0x6, 0xE, 0x6, 0xA, 0x4, 0xD,
+ 0x7, 0xA, 0xE, 0xF, 0xF, 0x0, 0x1, 0x9,
+ 0x6, 0x6, 0x1, 0xC, 0xA, 0xF, 0xC, 0x3,
+ 0x3, 0x3, 0xB, 0x4, 0x7, 0xD, 0x7, 0x8,
+ }
+ },
+ { "CryptoPro_D", "1.2.643.2.2.31.4", 1,
+ {
+ 0xF, 0xB, 0x1, 0x1, 0x0, 0x8, 0x3, 0x1,
+ 0xC, 0x6, 0xC, 0x5, 0xC, 0x0, 0x0, 0xA,
+ 0x2, 0x3, 0xB, 0xE, 0x8, 0xF, 0x6, 0x6,
+ 0xA, 0x4, 0x0, 0xC, 0x9, 0x3, 0xF, 0x8,
+
+ 0x6, 0xC, 0xF, 0xA, 0xD, 0x2, 0x1, 0xF,
+ 0x4, 0xF, 0xE, 0x7, 0x2, 0x5, 0xE, 0xB,
+ 0x5, 0xE, 0x6, 0x0, 0xA, 0xE, 0x9, 0x0,
+ 0x0, 0x2, 0x5, 0xD, 0xB, 0xB, 0x2, 0x4,
+
+ 0x7, 0x7, 0xA, 0x6, 0x7, 0x1, 0xD, 0xC,
+ 0x9, 0xD, 0xD, 0x2, 0x3, 0xA, 0x8, 0x3,
+ 0xE, 0x8, 0x4, 0xB, 0x6, 0x4, 0xC, 0x5,
+ 0xD, 0x0, 0x8, 0x4, 0x5, 0x7, 0x4, 0x9,
+
+ 0x1, 0x5, 0x9, 0x9, 0x4, 0xC, 0xB, 0x7,
+ 0xB, 0xA, 0x3, 0x3, 0xE, 0x9, 0xA, 0xD,
+ 0x8, 0x9, 0x7, 0xF, 0xF, 0xD, 0x5, 0x2,
+ 0x3, 0x1, 0x2, 0x8, 0x1, 0x6, 0x7, 0xE,
+ }
+ },
+ { "TC26_Z", "1.2.643.7.1.2.5.1.1", 1,
+ {
+ 0xc, 0x6, 0xb, 0xc, 0x7, 0x5, 0x8, 0x1,
+ 0x4, 0x8, 0x3, 0x8, 0xf, 0xd, 0xe, 0x7,
+ 0x6, 0x2, 0x5, 0x2, 0x5, 0xf, 0x2, 0xe,
+ 0x2, 0x3, 0x8, 0x1, 0xa, 0x6, 0x5, 0xd,
+
+ 0xa, 0x9, 0x2, 0xd, 0x8, 0x9, 0x6, 0x0,
+ 0x5, 0xa, 0xf, 0x4, 0x1, 0x2, 0x9, 0x5,
+ 0xb, 0x5, 0xa, 0xf, 0x6, 0xc, 0x1, 0x8,
+ 0x9, 0xc, 0xd, 0x6, 0xd, 0xa, 0xc, 0x3,
+
+ 0xe, 0x1, 0xe, 0x7, 0x0, 0xb, 0xf, 0x4,
+ 0x8, 0xe, 0x1, 0x0, 0x9, 0x7, 0x4, 0xf,
+ 0xd, 0x4, 0x7, 0xa, 0x3, 0x8, 0xb, 0xa,
+ 0x7, 0x7, 0x4, 0x5, 0xe, 0x1, 0x0, 0x6,
+
+ 0x0, 0xb, 0xc, 0x3, 0xb, 0x4, 0xd, 0x9,
+ 0x3, 0xd, 0x9, 0xe, 0x4, 0x3, 0xa, 0xc,
+ 0xf, 0x0, 0x6, 0x9, 0x2, 0xe, 0x3, 0xb,
+ 0x1, 0xf, 0x0, 0xb, 0xc, 0x0, 0x7, 0x2,
+ }
+ },
+};
+
+int main(int argc, char **argv)
+{
+ unsigned int i, j, s;
+ FILE *f;
+
+ if (argc == 1)
+ f = stdin;
+ else
+ f = fopen(argv[1], "w");
+
+ if (!f)
+ {
+ perror("fopen");
+ exit(1);
+ }
+
+ for (s = 0; s < DIM(gost_sboxes); s++)
+ {
+ unsigned char *sbox = gost_sboxes[s].sbox;
+ fprintf (f, "static const u32 sbox_%s[4*256] =\n {", gost_sboxes[s].name);
+ for (i = 0; i < 4; i++) {
+ fprintf (f, "\n /* %d */\n ", i);
+ for (j = 0; j < 256; j++) {
+ unsigned int val;
+ if (j % 4 == 0 && j != 0)
+ fprintf (f, "\n ");
+ val = sbox[ (j & 0xf) * 8 + 2 * i + 0] |
+ (sbox[ (j >> 4) * 8 + 2 * i + 1] << 4);
+ val <<= (8*i);
+ val = (val << 11) | (val >> 21);
+ fprintf (f, " 0x%08x,", val);
+ }
+ }
+ fprintf (f, "\n };\n\n");
+ }
+
+ fprintf (f, "static struct\n{\n const char *oid;\n const u32 *sbox;\n const int keymeshing;\n} gost_oid_map[] = {\n");
+
+ for (s = 0; s < DIM(gost_sboxes); s++)
+ {
+ fprintf (f, " { \"%s\", sbox_%s, %d },\n", gost_sboxes[s].oid, gost_sboxes[s].name, gost_sboxes[s].keymeshing );
+ }
+
+ fprintf(f, " { NULL, NULL, 0 }\n};\n");
+
+ fclose (f);
+
+ return 0;
+}
diff --git a/comm/third_party/libgcrypt/cipher/gost.h b/comm/third_party/libgcrypt/cipher/gost.h
new file mode 100644
index 0000000000..53a4050503
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/gost.h
@@ -0,0 +1,34 @@
+/* gost.h - GOST 28147-89 implementation
+ * Copyright (C) 2012 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _GCRY_GOST_H
+#define _GCRY_GOST_H
+
+typedef struct {
+ u32 key[8];
+ const u32 *sbox;
+ unsigned int mesh_counter;
+ unsigned int mesh_limit;
+} GOST28147_context;
+
+/* This is a simple interface that will be used by GOST R 34.11-94 */
+unsigned int _gcry_gost_enc_data (const u32 *key,
+ u32 *o1, u32 *o2, u32 n1, u32 n2, int cryptopro);
+
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/gost28147.c b/comm/third_party/libgcrypt/cipher/gost28147.c
new file mode 100644
index 0000000000..9445b378c4
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/gost28147.c
@@ -0,0 +1,553 @@
+/* gost28147.c - GOST 28147-89 implementation for Libgcrypt
+ * Copyright (C) 2012 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* GOST 28147-89 defines several modes of encryption:
+ * - ECB which should be used only for key transfer
+ * - CFB mode
+ * - OFB-like mode with additional transformation on keystream
+ * RFC 5830 names this 'counter encryption' mode
+ * Original GOST text uses the term 'gammirovanie'
+ * - MAC mode ('imitovstavka')
+ *
+ * This implementation handles ECB and CFB modes via usual libgcrypt handling.
+ * OFB-like modes are unsupported.
+ */
+
+#include <config.h>
+#include "types.h"
+#include "g10lib.h"
+#include "cipher.h"
+#include "mac-internal.h"
+#include "bufhelp.h"
+#include "cipher-internal.h"
+
+#include "gost.h"
+#include "gost-sb.h"
+
+static void
+gost_do_set_sbox (GOST28147_context *ctx, unsigned int index)
+{
+ ctx->sbox = gost_oid_map[index].sbox;
+ ctx->mesh_limit = gost_oid_map[index].keymeshing ? 1024 : 0;
+}
+
+static gcry_err_code_t
+gost_setkey (void *c, const byte *key, unsigned keylen,
+ cipher_bulk_ops_t *bulk_ops)
+{
+ int i;
+ GOST28147_context *ctx = c;
+
+ (void)bulk_ops;
+
+ if (keylen != 256 / 8)
+ return GPG_ERR_INV_KEYLEN;
+
+ if (!ctx->sbox)
+ gost_do_set_sbox (ctx, 0);
+
+ for (i = 0; i < 8; i++)
+ {
+ ctx->key[i] = buf_get_le32(&key[4*i]);
+ }
+
+ ctx->mesh_counter = 0;
+
+ return GPG_ERR_NO_ERROR;
+}
+
+static inline u32
+gost_val (u32 subkey, u32 cm1, const u32 *sbox)
+{
+ cm1 += subkey;
+ cm1 = sbox[0*256 + ((cm1 >> 0) & 0xff)] |
+ sbox[1*256 + ((cm1 >> 8) & 0xff)] |
+ sbox[2*256 + ((cm1 >> 16) & 0xff)] |
+ sbox[3*256 + ((cm1 >> 24) & 0xff)];
+ return cm1;
+}
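
For reference, gost_val relies on tables pre-expanded by gost-s-box.c, which fold the eight 4-bit substitutions and the 11-bit left rotation of the GOST round function into four byte-indexed lookups.  An unoptimized equivalent, assuming the raw 4-bit boxes were available in a hypothetical sbox4[8][16] layout (S-box i applied to nibble i), would look like this:

    static u32
    gost_val_ref (u32 subkey, u32 cm1, const unsigned char sbox4[8][16])
    {
      u32 r = 0;
      int i;

      /* Add the subkey mod 2^32, substitute each 4-bit nibble through
         its S-box, then rotate the 32-bit result left by 11 bits. */
      cm1 += subkey;
      for (i = 0; i < 8; i++)
        r |= (u32) sbox4[i][(cm1 >> (4 * i)) & 0xf] << (4 * i);

      return (r << 11) | (r >> 21);
    }
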
+
+static unsigned int
+_gost_encrypt_data (const u32 *sbox, const u32 *key, u32 *o1, u32 *o2, u32 n1, u32 n2)
+{
+ n2 ^= gost_val (key[0], n1, sbox); n1 ^= gost_val (key[1], n2, sbox);
+ n2 ^= gost_val (key[2], n1, sbox); n1 ^= gost_val (key[3], n2, sbox);
+ n2 ^= gost_val (key[4], n1, sbox); n1 ^= gost_val (key[5], n2, sbox);
+ n2 ^= gost_val (key[6], n1, sbox); n1 ^= gost_val (key[7], n2, sbox);
+
+ n2 ^= gost_val (key[0], n1, sbox); n1 ^= gost_val (key[1], n2, sbox);
+ n2 ^= gost_val (key[2], n1, sbox); n1 ^= gost_val (key[3], n2, sbox);
+ n2 ^= gost_val (key[4], n1, sbox); n1 ^= gost_val (key[5], n2, sbox);
+ n2 ^= gost_val (key[6], n1, sbox); n1 ^= gost_val (key[7], n2, sbox);
+
+ n2 ^= gost_val (key[0], n1, sbox); n1 ^= gost_val (key[1], n2, sbox);
+ n2 ^= gost_val (key[2], n1, sbox); n1 ^= gost_val (key[3], n2, sbox);
+ n2 ^= gost_val (key[4], n1, sbox); n1 ^= gost_val (key[5], n2, sbox);
+ n2 ^= gost_val (key[6], n1, sbox); n1 ^= gost_val (key[7], n2, sbox);
+
+ n2 ^= gost_val (key[7], n1, sbox); n1 ^= gost_val (key[6], n2, sbox);
+ n2 ^= gost_val (key[5], n1, sbox); n1 ^= gost_val (key[4], n2, sbox);
+ n2 ^= gost_val (key[3], n1, sbox); n1 ^= gost_val (key[2], n2, sbox);
+ n2 ^= gost_val (key[1], n1, sbox); n1 ^= gost_val (key[0], n2, sbox);
+
+ *o1 = n2;
+ *o2 = n1;
+
+ return /* burn_stack */ 4*sizeof(void*) /* func call */ +
+ 3*sizeof(void*) /* stack */ +
+ 4*sizeof(void*) /* gost_val call */;
+}
+
+static unsigned int
+gost_encrypt_block (void *c, byte *outbuf, const byte *inbuf)
+{
+ GOST28147_context *ctx = c;
+ u32 n1, n2;
+ unsigned int burn;
+
+ n1 = buf_get_le32 (inbuf);
+ n2 = buf_get_le32 (inbuf+4);
+
+ burn = _gost_encrypt_data(ctx->sbox, ctx->key, &n1, &n2, n1, n2);
+
+ buf_put_le32 (outbuf+0, n1);
+ buf_put_le32 (outbuf+4, n2);
+
+ return /* burn_stack */ burn + 6*sizeof(void*) /* func call */;
+}
+
+unsigned int _gcry_gost_enc_data (const u32 *key,
+ u32 *o1, u32 *o2, u32 n1, u32 n2, int cryptopro)
+{
+ const u32 *sbox;
+ if (cryptopro)
+ sbox = sbox_CryptoPro_3411;
+ else
+ sbox = sbox_test_3411;
+ return _gost_encrypt_data (sbox, key, o1, o2, n1, n2) + 7 * sizeof(void *);
+}
+
+static unsigned int
+gost_decrypt_block (void *c, byte *outbuf, const byte *inbuf)
+{
+ GOST28147_context *ctx = c;
+ u32 n1, n2;
+ const u32 *sbox = ctx->sbox;
+
+ n1 = buf_get_le32 (inbuf);
+ n2 = buf_get_le32 (inbuf+4);
+
+ n2 ^= gost_val (ctx->key[0], n1, sbox); n1 ^= gost_val (ctx->key[1], n2, sbox);
+ n2 ^= gost_val (ctx->key[2], n1, sbox); n1 ^= gost_val (ctx->key[3], n2, sbox);
+ n2 ^= gost_val (ctx->key[4], n1, sbox); n1 ^= gost_val (ctx->key[5], n2, sbox);
+ n2 ^= gost_val (ctx->key[6], n1, sbox); n1 ^= gost_val (ctx->key[7], n2, sbox);
+
+ n2 ^= gost_val (ctx->key[7], n1, sbox); n1 ^= gost_val (ctx->key[6], n2, sbox);
+ n2 ^= gost_val (ctx->key[5], n1, sbox); n1 ^= gost_val (ctx->key[4], n2, sbox);
+ n2 ^= gost_val (ctx->key[3], n1, sbox); n1 ^= gost_val (ctx->key[2], n2, sbox);
+ n2 ^= gost_val (ctx->key[1], n1, sbox); n1 ^= gost_val (ctx->key[0], n2, sbox);
+
+ n2 ^= gost_val (ctx->key[7], n1, sbox); n1 ^= gost_val (ctx->key[6], n2, sbox);
+ n2 ^= gost_val (ctx->key[5], n1, sbox); n1 ^= gost_val (ctx->key[4], n2, sbox);
+ n2 ^= gost_val (ctx->key[3], n1, sbox); n1 ^= gost_val (ctx->key[2], n2, sbox);
+ n2 ^= gost_val (ctx->key[1], n1, sbox); n1 ^= gost_val (ctx->key[0], n2, sbox);
+
+ n2 ^= gost_val (ctx->key[7], n1, sbox); n1 ^= gost_val (ctx->key[6], n2, sbox);
+ n2 ^= gost_val (ctx->key[5], n1, sbox); n1 ^= gost_val (ctx->key[4], n2, sbox);
+ n2 ^= gost_val (ctx->key[3], n1, sbox); n1 ^= gost_val (ctx->key[2], n2, sbox);
+ n2 ^= gost_val (ctx->key[1], n1, sbox); n1 ^= gost_val (ctx->key[0], n2, sbox);
+
+ buf_put_le32 (outbuf+0, n2);
+ buf_put_le32 (outbuf+4, n1);
+
+ return /* burn_stack */ 4*sizeof(void*) /* func call */ +
+ 3*sizeof(void*) /* stack */ +
+ 4*sizeof(void*) /* gost_val call */;
+}
+
+static gpg_err_code_t
+gost_set_sbox (GOST28147_context *ctx, const char *oid)
+{
+ int i;
+
+ for (i = 0; gost_oid_map[i].oid; i++)
+ {
+ if (!strcmp(gost_oid_map[i].oid, oid))
+ {
+ gost_do_set_sbox (ctx, i);
+ return 0;
+ }
+ }
+ return GPG_ERR_VALUE_NOT_FOUND;
+}
+
+static gpg_err_code_t
+gost_set_extra_info (void *c, int what, const void *buffer, size_t buflen)
+{
+ GOST28147_context *ctx = c;
+ gpg_err_code_t ec = 0;
+
+ (void)buffer;
+ (void)buflen;
+
+ switch (what)
+ {
+ case GCRYCTL_SET_SBOX:
+ ec = gost_set_sbox (ctx, buffer);
+ break;
+
+ default:
+ ec = GPG_ERR_INV_OP;
+ break;
+ }
+ return ec;
+}
+
+static const byte CryptoProKeyMeshingKey[] = {
+ 0x69, 0x00, 0x72, 0x22, 0x64, 0xC9, 0x04, 0x23,
+ 0x8D, 0x3A, 0xDB, 0x96, 0x46, 0xE9, 0x2A, 0xC4,
+ 0x18, 0xFE, 0xAC, 0x94, 0x00, 0xED, 0x07, 0x12,
+ 0xC0, 0x86, 0xDC, 0xC2, 0xEF, 0x4C, 0xA9, 0x2B
+};
+
+/* Implements the CryptoPro key meshing algorithm by modifying the key in
+   CTX; the corresponding IV transformation is done by the caller.
+   Thanks to Dmitry Belyavskiy. */
+static void
+cryptopro_key_meshing (GOST28147_context *ctx)
+{
+ unsigned char newkey[32];
+ unsigned int i;
+
+ /* "Decrypt" the static keymeshing key */
+ for (i = 0; i < 4; i++)
+ {
+ gost_decrypt_block (ctx, newkey + i*8, CryptoProKeyMeshingKey + i*8);
+ }
+
+ /* Set new key */
+ for (i = 0; i < 8; i++)
+ {
+ ctx->key[i] = buf_get_le32(&newkey[4*i]);
+ }
+
+ ctx->mesh_counter = 0;
+}
+
+static unsigned int
+gost_encrypt_block_mesh (void *c, byte *outbuf, const byte *inbuf)
+{
+ GOST28147_context *ctx = c;
+ u32 n1, n2;
+ unsigned int burn;
+
+ n1 = buf_get_le32 (inbuf);
+ n2 = buf_get_le32 (inbuf+4);
+
+ if (ctx->mesh_limit && (ctx->mesh_counter == ctx->mesh_limit))
+ {
+ cryptopro_key_meshing (ctx);
+ /* Yes, encrypt twice: once for KeyMeshing procedure per RFC 4357,
+ * once for block encryption */
+ _gost_encrypt_data(ctx->sbox, ctx->key, &n1, &n2, n1, n2);
+ }
+
+ burn = _gost_encrypt_data(ctx->sbox, ctx->key, &n1, &n2, n1, n2);
+
+ ctx->mesh_counter += 8;
+
+ buf_put_le32 (outbuf+0, n1);
+ buf_put_le32 (outbuf+4, n2);
+
+ return /* burn_stack */ burn + 6*sizeof(void*) /* func call */;
+}
+
+static gcry_cipher_oid_spec_t oids_gost28147_mesh[] =
+ {
+ { "1.2.643.2.2.21", GCRY_CIPHER_MODE_CFB },
+ /* { "1.2.643.2.2.31.0", GCRY_CIPHER_MODE_CNTGOST }, */
+ { "1.2.643.2.2.31.1", GCRY_CIPHER_MODE_CFB },
+ { "1.2.643.2.2.31.2", GCRY_CIPHER_MODE_CFB },
+ { "1.2.643.2.2.31.3", GCRY_CIPHER_MODE_CFB },
+ { "1.2.643.2.2.31.4", GCRY_CIPHER_MODE_CFB },
+ { NULL }
+ };
+
+gcry_cipher_spec_t _gcry_cipher_spec_gost28147 =
+ {
+ GCRY_CIPHER_GOST28147, {0, 0},
+ "GOST28147", NULL, NULL, 8, 256,
+ sizeof (GOST28147_context),
+ gost_setkey,
+ gost_encrypt_block,
+ gost_decrypt_block,
+ NULL, NULL, NULL, gost_set_extra_info,
+ };
+
+/* Meshing is used only for CFB, so there is no need for a separate
+ * gost_decrypt_block_mesh.
+ * Moreover key meshing is specified as encrypting the block (IV). Decrypting
+ * it afterwards would be meaningless. */
+gcry_cipher_spec_t _gcry_cipher_spec_gost28147_mesh =
+ {
+ GCRY_CIPHER_GOST28147_MESH, {0, 0},
+ "GOST28147_MESH", NULL, oids_gost28147_mesh, 8, 256,
+ sizeof (GOST28147_context),
+ gost_setkey,
+ gost_encrypt_block_mesh,
+ gost_decrypt_block,
+ NULL, NULL, NULL, gost_set_extra_info,
+ };
+
+static gcry_err_code_t
+gost_imit_open (gcry_mac_hd_t h)
+{
+ memset(&h->u.imit, 0, sizeof(h->u.imit));
+ return 0;
+}
+
+static void
+gost_imit_close (gcry_mac_hd_t h)
+{
+ (void) h;
+}
+
+static gcry_err_code_t
+gost_imit_setkey (gcry_mac_hd_t h, const unsigned char *key, size_t keylen)
+{
+ int i;
+
+ if (keylen != 256 / 8)
+ return GPG_ERR_INV_KEYLEN;
+
+ if (!h->u.imit.ctx.sbox)
+ h->u.imit.ctx.sbox = sbox_CryptoPro_A;
+
+ for (i = 0; i < 8; i++)
+ {
+ h->u.imit.ctx.key[i] = buf_get_le32(&key[4*i]);
+ }
+
+ return 0;
+}
+
+static gcry_err_code_t
+gost_imit_setiv (gcry_mac_hd_t h,
+ const unsigned char *iv,
+ size_t ivlen)
+{
+ if (ivlen != 8)
+ return GPG_ERR_INV_LENGTH;
+
+ h->u.imit.n1 = buf_get_le32 (iv + 0);
+ h->u.imit.n2 = buf_get_le32 (iv + 4);
+
+ return 0;
+}
+
+static gcry_err_code_t
+gost_imit_reset (gcry_mac_hd_t h)
+{
+ h->u.imit.n1 = h->u.imit.n2 = 0;
+ h->u.imit.unused = 0;
+ return 0;
+}
+
+static unsigned int
+_gost_imit_block (const u32 *sbox, const u32 *key, u32 *o1, u32 *o2, u32 n1, u32 n2)
+{
+ n1 ^= *o1;
+ n2 ^= *o2;
+
+ n2 ^= gost_val (key[0], n1, sbox); n1 ^= gost_val (key[1], n2, sbox);
+ n2 ^= gost_val (key[2], n1, sbox); n1 ^= gost_val (key[3], n2, sbox);
+ n2 ^= gost_val (key[4], n1, sbox); n1 ^= gost_val (key[5], n2, sbox);
+ n2 ^= gost_val (key[6], n1, sbox); n1 ^= gost_val (key[7], n2, sbox);
+
+ n2 ^= gost_val (key[0], n1, sbox); n1 ^= gost_val (key[1], n2, sbox);
+ n2 ^= gost_val (key[2], n1, sbox); n1 ^= gost_val (key[3], n2, sbox);
+ n2 ^= gost_val (key[4], n1, sbox); n1 ^= gost_val (key[5], n2, sbox);
+ n2 ^= gost_val (key[6], n1, sbox); n1 ^= gost_val (key[7], n2, sbox);
+
+ *o1 = n1;
+ *o2 = n2;
+
+ return /* burn_stack */ 4*sizeof(void*) /* func call */ +
+ 3*sizeof(void*) /* stack */ +
+ 4*sizeof(void*) /* gost_val call */;
+}
+
+static inline unsigned int
+gost_imit_block (GOST28147_context *ctx, u32 *n1, u32 *n2, const unsigned char *buf)
+{
+ if (ctx->mesh_limit && (ctx->mesh_counter == ctx->mesh_limit))
+ cryptopro_key_meshing (ctx);
+
+ return _gost_imit_block (ctx->sbox, ctx->key,
+ n1, n2,
+ buf_get_le32 (buf+0),
+ buf_get_le32 (buf+4));
+}
+
+static gcry_err_code_t
+gost_imit_write (gcry_mac_hd_t h, const unsigned char *buf, size_t buflen)
+{
+ const int blocksize = 8;
+ unsigned int burn = 0;
+ if (!buflen || !buf)
+ return GPG_ERR_NO_ERROR;
+
+ if (h->u.imit.unused)
+ {
+ for (; buflen && h->u.imit.unused < blocksize; buflen --)
+ h->u.imit.lastiv[h->u.imit.unused++] = *buf++;
+
+ if (h->u.imit.unused < blocksize)
+ return GPG_ERR_NO_ERROR;
+
+ h->u.imit.count ++;
+ burn = gost_imit_block (&h->u.imit.ctx,
+ &h->u.imit.n1, &h->u.imit.n2,
+ h->u.imit.lastiv);
+
+ h->u.imit.unused = 0;
+ }
+
+ while (buflen >= blocksize)
+ {
+ h->u.imit.count ++;
+ burn = gost_imit_block (&h->u.imit.ctx,
+ &h->u.imit.n1, &h->u.imit.n2,
+ buf);
+ buf += blocksize;
+ buflen -= blocksize;
+ }
+
+ for (; buflen; buflen--)
+ h->u.imit.lastiv[h->u.imit.unused++] = *buf++;
+
+ _gcry_burn_stack (burn);
+
+ return GPG_ERR_NO_ERROR;
+}
+
+static void
+gost_imit_finish (gcry_mac_hd_t h)
+{
+ static const unsigned char zero[8] = {0};
+
+ /* Fill till full block */
+ if (h->u.imit.unused)
+ gost_imit_write(h, zero, 8 - h->u.imit.unused);
+
+ if (h->u.imit.count == 1)
+ gost_imit_write(h, zero, 8);
+}
+
+static gcry_err_code_t
+gost_imit_read (gcry_mac_hd_t h, unsigned char *outbuf, size_t * outlen)
+{
+ unsigned int dlen = 8;
+ unsigned char digest[8];
+
+ gost_imit_finish (h);
+
+ buf_put_le32 (digest+0, h->u.imit.n1);
+ buf_put_le32 (digest+4, h->u.imit.n2);
+
+ if (*outlen <= dlen)
+ buf_cpy (outbuf, digest, *outlen);
+ else
+ {
+ buf_cpy (outbuf, digest, dlen);
+ *outlen = dlen;
+ }
+ return 0;
+}
+
+static gcry_err_code_t
+gost_imit_verify (gcry_mac_hd_t h, const unsigned char *buf, size_t buflen)
+{
+ unsigned char tbuf[8];
+
+ gost_imit_finish (h);
+
+ buf_put_le32 (tbuf+0, h->u.imit.n1);
+ buf_put_le32 (tbuf+4, h->u.imit.n2);
+
+ return buf_eq_const(tbuf, buf, buflen) ?
+ GPG_ERR_NO_ERROR : GPG_ERR_CHECKSUM;
+}
+
+static unsigned int
+gost_imit_get_maclen (int algo)
+{
+ (void) algo;
+ return 4; /* or 8 */
+}
+
+
+static unsigned int
+gost_imit_get_keylen (int algo)
+{
+ (void) algo;
+ return 256 / 8;
+}
+
+static gpg_err_code_t
+gost_imit_set_extra_info (gcry_mac_hd_t hd, int what, const void *buffer, size_t buflen)
+{
+ gpg_err_code_t ec = 0;
+
+ (void)buffer;
+ (void)buflen;
+
+ switch (what)
+ {
+ case GCRYCTL_SET_SBOX:
+ ec = gost_set_sbox (&hd->u.imit.ctx, buffer);
+ break;
+
+ default:
+ ec = GPG_ERR_INV_OP;
+ break;
+ }
+ return ec;
+}
+
+
+static gcry_mac_spec_ops_t gost_imit_ops = {
+ gost_imit_open,
+ gost_imit_close,
+ gost_imit_setkey,
+ gost_imit_setiv,
+ gost_imit_reset,
+ gost_imit_write,
+ gost_imit_read,
+ gost_imit_verify,
+ gost_imit_get_maclen,
+ gost_imit_get_keylen,
+ gost_imit_set_extra_info,
+ NULL
+};
+
+gcry_mac_spec_t _gcry_mac_type_spec_gost28147_imit =
+ {
+ GCRY_MAC_GOST28147_IMIT, {0, 0}, "GOST28147_IMIT",
+ &gost_imit_ops
+ };
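For reference, a minimal usage sketch of the GOST28147 IMIT MAC registered above, driven
through the public gcry_mac_* API.  This is illustrative only and not part of the patch;
the key and message are placeholders, error handling is omitted, and the CryptoPro-A
S-box is used by default because gost_imit_setkey falls back to it when no S-box has
been selected.

#include <gcrypt.h>

static void
imit_mac_example (void)
{
  gcry_mac_hd_t hd;
  unsigned char key[32] = { 0 };   /* 256-bit key, as gost_imit_setkey requires.  */
  unsigned char mac[4];
  size_t maclen = sizeof (mac);    /* gost_imit_get_maclen reports 4 bytes.  */

  gcry_mac_open (&hd, GCRY_MAC_GOST28147_IMIT, 0, NULL);
  gcry_mac_setkey (hd, key, sizeof (key));
  gcry_mac_write (hd, "message", 7);
  gcry_mac_read (hd, mac, &maclen);
  gcry_mac_close (hd);
}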
diff --git a/comm/third_party/libgcrypt/cipher/gostr3411-94.c b/comm/third_party/libgcrypt/cipher/gostr3411-94.c
new file mode 100644
index 0000000000..7cf0637e26
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/gostr3411-94.c
@@ -0,0 +1,383 @@
+/* gostr3411-94.c - GOST R 34.11-94 hash function
+ * Copyright (C) 2012 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+#include "bithelp.h"
+#include "bufhelp.h"
+#include "cipher.h"
+#include "hash-common.h"
+
+#include "gost.h"
+
+#define max(a, b) (((a) > (b)) ? (a) : (b))
+
+typedef struct {
+ gcry_md_block_ctx_t bctx;
+ union {
+ u32 h[8];
+ byte result[32];
+ };
+ u32 sigma[8];
+ u32 len;
+ int cryptopro;
+} GOSTR3411_CONTEXT;
+
+static unsigned int
+transform (void *c, const unsigned char *data, size_t nblks);
+
+static void
+gost3411_init (void *context, unsigned int flags)
+{
+ GOSTR3411_CONTEXT *hd = context;
+
+ (void)flags;
+
+ memset (hd->h, 0, 32);
+ memset (hd->sigma, 0, 32);
+
+ hd->bctx.nblocks = 0;
+ hd->bctx.count = 0;
+ hd->bctx.blocksize_shift = _gcry_ctz(32);
+ hd->bctx.bwrite = transform;
+ hd->cryptopro = 0;
+}
+
+static void
+gost3411_cp_init (void *context, unsigned int flags)
+{
+ GOSTR3411_CONTEXT *hd = context;
+ gost3411_init (context, flags);
+ hd->cryptopro = 1;
+}
+
+static void
+do_p (u32 *p, u32 *u, u32 *v)
+{
+ int k;
+ u32 t[8];
+
+ for (k = 0; k < 8; k++)
+ t[k] = u[k] ^ v[k];
+
+ k = 0;
+ p[k+0] = ((t[0] >> (8*k)) & 0xff) << 0 |
+ ((t[2] >> (8*k)) & 0xff) << 8 |
+ ((t[4] >> (8*k)) & 0xff) << 16 |
+ ((t[6] >> (8*k)) & 0xff) << 24;
+ p[k+4] = ((t[1] >> (8*k)) & 0xff) << 0 |
+ ((t[3] >> (8*k)) & 0xff) << 8 |
+ ((t[5] >> (8*k)) & 0xff) << 16 |
+ ((t[7] >> (8*k)) & 0xff) << 24;
+
+ k = 1;
+ p[k+0] = ((t[0] >> (8*k)) & 0xff) << 0 |
+ ((t[2] >> (8*k)) & 0xff) << 8 |
+ ((t[4] >> (8*k)) & 0xff) << 16 |
+ ((t[6] >> (8*k)) & 0xff) << 24;
+ p[k+4] = ((t[1] >> (8*k)) & 0xff) << 0 |
+ ((t[3] >> (8*k)) & 0xff) << 8 |
+ ((t[5] >> (8*k)) & 0xff) << 16 |
+ ((t[7] >> (8*k)) & 0xff) << 24;
+
+ k = 2;
+ p[k+0] = ((t[0] >> (8*k)) & 0xff) << 0 |
+ ((t[2] >> (8*k)) & 0xff) << 8 |
+ ((t[4] >> (8*k)) & 0xff) << 16 |
+ ((t[6] >> (8*k)) & 0xff) << 24;
+ p[k+4] = ((t[1] >> (8*k)) & 0xff) << 0 |
+ ((t[3] >> (8*k)) & 0xff) << 8 |
+ ((t[5] >> (8*k)) & 0xff) << 16 |
+ ((t[7] >> (8*k)) & 0xff) << 24;
+
+ k = 3;
+ p[k+0] = ((t[0] >> (8*k)) & 0xff) << 0 |
+ ((t[2] >> (8*k)) & 0xff) << 8 |
+ ((t[4] >> (8*k)) & 0xff) << 16 |
+ ((t[6] >> (8*k)) & 0xff) << 24;
+ p[k+4] = ((t[1] >> (8*k)) & 0xff) << 0 |
+ ((t[3] >> (8*k)) & 0xff) << 8 |
+ ((t[5] >> (8*k)) & 0xff) << 16 |
+ ((t[7] >> (8*k)) & 0xff) << 24;
+}
+
+static void
+do_a (u32 *u)
+{
+ u32 t[2];
+ int i;
+ memcpy(t, u, 2*4);
+ for (i = 0; i < 6; i++)
+ u[i] = u[i+2];
+ u[6] = u[0] ^ t[0];
+ u[7] = u[1] ^ t[1];
+}
+/* apply do_a twice: 1 2 3 4 -> 3 4 1^2 2^3 */
+static void
+do_a2 (u32 *u)
+{
+ u32 t[4];
+ int i;
+ memcpy (t, u, 16);
+ memcpy (u, u + 4, 16);
+ for (i = 0; i < 2; i++)
+ {
+ u[4+i] = t[i] ^ t[i + 2];
+ u[6+i] = u[i] ^ t[i + 2];
+ }
+}
+
+static void
+do_apply_c2 (u32 *u)
+{
+ u[ 0] ^= 0xff00ff00;
+ u[ 1] ^= 0xff00ff00;
+ u[ 2] ^= 0x00ff00ff;
+ u[ 3] ^= 0x00ff00ff;
+ u[ 4] ^= 0x00ffff00;
+ u[ 5] ^= 0xff0000ff;
+ u[ 6] ^= 0x000000ff;
+ u[ 7] ^= 0xff00ffff;
+}
+
+#define do_chi_step12(e) \
+ e[6] ^= ((e[6] >> 16) ^ e[7] ^ (e[7] >> 16) ^ e[4] ^ (e[5] >>16)) & 0xffff;
+
+#define do_chi_step13(e) \
+ e[6] ^= ((e[7] ^ (e[7] >> 16) ^ e[0] ^ (e[4] >> 16) ^ e[6]) & 0xffff) << 16;
+
+#define do_chi_doublestep(e, i) \
+ e[i] ^= (e[i] >> 16) ^ (e[(i+1)%8] << 16) ^ e[(i+1)%8] ^ (e[(i+1)%8] >> 16) ^ (e[(i+2)%8] << 16) ^ e[(i+6)%8] ^ (e[(i+7)%8] >> 16); \
+ e[i] ^= (e[i] << 16);
+
+static void
+do_chi_submix12 (u32 *e, u32 *x)
+{
+ e[6] ^= x[0];
+ e[7] ^= x[1];
+ e[0] ^= x[2];
+ e[1] ^= x[3];
+ e[2] ^= x[4];
+ e[3] ^= x[5];
+ e[4] ^= x[6];
+ e[5] ^= x[7];
+}
+
+static void
+do_chi_submix13 (u32 *e, u32 *x)
+{
+ e[6] ^= (x[0] << 16) | (x[7] >> 16);
+ e[7] ^= (x[1] << 16) | (x[0] >> 16);
+ e[0] ^= (x[2] << 16) | (x[1] >> 16);
+ e[1] ^= (x[3] << 16) | (x[2] >> 16);
+ e[2] ^= (x[4] << 16) | (x[3] >> 16);
+ e[3] ^= (x[5] << 16) | (x[4] >> 16);
+ e[4] ^= (x[6] << 16) | (x[5] >> 16);
+ e[5] ^= (x[7] << 16) | (x[6] >> 16);
+}
+
+static void
+do_add (u32 *s, u32 *a)
+{
+ u32 carry = 0;
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ u32 op = carry + a[i];
+ s[i] += op;
+ carry = (a[i] > op) || (op > s[i]);
+ }
+}
+
+static unsigned int
+do_hash_step (GOSTR3411_CONTEXT *hd, u32 *h, u32 *m)
+{
+ u32 u[8], v[8];
+ u32 s[8];
+ u32 k[8];
+ unsigned int burn;
+ int i;
+
+ memcpy (u, h, 32);
+ memcpy (v, m, 32);
+
+ for (i = 0; i < 4; i++) {
+ do_p (k, u, v);
+
+ burn = _gcry_gost_enc_data (k, &s[2*i], &s[2*i+1], h[2*i], h[2*i+1], hd->cryptopro);
+
+ do_a (u);
+ if (i == 1)
+ do_apply_c2 (u);
+ do_a2 (v);
+ }
+
+ for (i = 0; i < 5; i++)
+ {
+ do_chi_doublestep (s, 0);
+ do_chi_doublestep (s, 1);
+ do_chi_doublestep (s, 2);
+ do_chi_doublestep (s, 3);
+ do_chi_doublestep (s, 4);
+ /* That is in total 12 + 1 + 61 = 74 = 16 * 4 + 10 rounds */
+ if (i == 4)
+ break;
+ do_chi_doublestep (s, 5);
+ if (i == 0)
+ do_chi_submix12(s, m);
+ do_chi_step12 (s);
+ if (i == 0)
+ do_chi_submix13(s, h);
+ do_chi_step13 (s);
+ do_chi_doublestep (s, 7);
+ }
+
+ memcpy (h, s+5, 12);
+ memcpy (h+3, s, 20);
+
+ return /* burn_stack */ 4 * sizeof(void*) /* func call (ret addr + args) */ +
+ 4 * 32 + 2 * sizeof(int) /* stack */ +
+ max(burn /* _gcry_gost_enc_one */,
+ sizeof(void*) * 2 /* do_a2 call */ +
+ 16 + sizeof(int) /* do_a2 stack */ );
+}
+
+static unsigned int
+transform_blk (void *ctx, const unsigned char *data)
+{
+ GOSTR3411_CONTEXT *hd = ctx;
+ u32 m[8];
+ unsigned int burn;
+ int i;
+
+ for (i = 0; i < 8; i++)
+ m[i] = buf_get_le32(data + i*4);
+ burn = do_hash_step (hd, hd->h, m);
+ do_add (hd->sigma, m);
+
+ return /* burn_stack */ burn + 3 * sizeof(void*) + 32 + 2 * sizeof(void*);
+}
+
+
+static unsigned int
+transform ( void *c, const unsigned char *data, size_t nblks )
+{
+ unsigned int burn;
+
+ do
+ {
+ burn = transform_blk (c, data);
+ data += 32;
+ }
+ while (--nblks);
+
+ return burn;
+}
+
+
+/*
+ The routine finally terminates the computation and returns the
+ digest. The handle is prepared for a new cycle, but adding bytes
+   to the handle will destroy the returned buffer.  Returns: 32
+   bytes with the message digest.  */
+static void
+gost3411_final (void *context)
+{
+ GOSTR3411_CONTEXT *hd = context;
+ size_t padlen = 0;
+ u32 l[8];
+ int i;
+ MD_NBLOCKS_TYPE nblocks;
+
+ if (hd->bctx.count > 0)
+ {
+ padlen = 32 - hd->bctx.count;
+ memset (hd->bctx.buf + hd->bctx.count, 0, padlen);
+ hd->bctx.count += padlen;
+ _gcry_md_block_write (hd, NULL, 0); /* flush */;
+ }
+
+ if (hd->bctx.count != 0)
+ return; /* Something went wrong */
+
+ memset (l, 0, 32);
+
+ nblocks = hd->bctx.nblocks;
+ if (padlen)
+ {
+ nblocks --;
+ l[0] = 256 - padlen * 8;
+ }
+ l[0] |= nblocks << 8;
+ nblocks >>= 24;
+
+ for (i = 1; i < 8 && nblocks != 0; i++)
+ {
+ l[i] = nblocks;
+ nblocks >>= 24;
+ }
+
+ do_hash_step (hd, hd->h, l);
+ do_hash_step (hd, hd->h, hd->sigma);
+ for (i = 0; i < 8; i++)
+ hd->h[i] = le_bswap32(hd->h[i]);
+}
+
+static byte *
+gost3411_read (void *context)
+{
+ GOSTR3411_CONTEXT *hd = context;
+
+ return hd->result;
+}
+
+static unsigned char asn[6] = /* Object ID is 1.2.643.2.2.3 */
+ { 0x2a, 0x85, 0x03, 0x02, 0x02, 0x03 };
+
+static gcry_md_oid_spec_t oid_spec_gostr3411[] =
+ {
+ /* iso.member-body.ru.rans.cryptopro.3 (gostR3411-94-with-gostR3410-2001) */
+ { "1.2.643.2.2.3" },
+ /* iso.member-body.ru.rans.cryptopro.9 (gostR3411-94) */
+ { "1.2.643.2.2.9" },
+ {NULL},
+ };
+
+gcry_md_spec_t _gcry_digest_spec_gost3411_94 =
+ {
+ GCRY_MD_GOSTR3411_94, {0, 0},
+ "GOSTR3411_94", NULL, 0, NULL, 32,
+ gost3411_init, _gcry_md_block_write, gost3411_final, gost3411_read, NULL,
+ NULL, NULL,
+ sizeof (GOSTR3411_CONTEXT)
+ };
+gcry_md_spec_t _gcry_digest_spec_gost3411_cp =
+ {
+ GCRY_MD_GOSTR3411_CP, {0, 0},
+ "GOSTR3411_CP", asn, DIM (asn), oid_spec_gostr3411, 32,
+ gost3411_cp_init, _gcry_md_block_write, gost3411_final, gost3411_read, NULL,
+ NULL, NULL,
+ sizeof (GOSTR3411_CONTEXT)
+ };
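As a usage note for the two digest specs above, here is a minimal sketch of computing a
GOST R 34.11-94 hash through the public gcry_md_* API.  Illustrative only, not part of
the patch; error handling is omitted and the input string is a placeholder.

#include <string.h>
#include <gcrypt.h>

static void
gost3411_example (unsigned char digest[32])
{
  gcry_md_hd_t hd;

  gcry_md_open (&hd, GCRY_MD_GOSTR3411_94, 0);
  gcry_md_write (hd, "message digest", 14);
  memcpy (digest, gcry_md_read (hd, GCRY_MD_GOSTR3411_94), 32);
  gcry_md_close (hd);
}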
diff --git a/comm/third_party/libgcrypt/cipher/hash-common.c b/comm/third_party/libgcrypt/cipher/hash-common.c
new file mode 100644
index 0000000000..ed2d7cacd1
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/hash-common.c
@@ -0,0 +1,193 @@
+/* hash-common.c - Common code for hash algorithms
+ * Copyright (C) 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef HAVE_STDINT_H
+# include <stdint.h>
+#endif
+
+#include "g10lib.h"
+#include "bufhelp.h"
+#include "hash-common.h"
+
+
+/* Run a selftest for hash algorithm ALGO. If the resulting digest
+ matches EXPECT/EXPECTLEN and everything else is fine as well,
+ return NULL. If an error occurs, return a static text string
+ describing the error.
+
+ DATAMODE controls what will be hashed according to this table:
+
+ 0 - Hash the supplied DATA of DATALEN.
+     1 - Hash one million times an 'a'.  DATA and DATALEN are ignored.
+
+*/
+const char *
+_gcry_hash_selftest_check_one (int algo,
+ int datamode, const void *data, size_t datalen,
+ const void *expect, size_t expectlen)
+{
+ const char *result = NULL;
+ gcry_error_t err = 0;
+ gcry_md_hd_t hd;
+ unsigned char *digest;
+ char aaa[1000];
+ int xof = 0;
+
+ if (_gcry_md_get_algo_dlen (algo) == 0)
+ xof = 1;
+ else if (_gcry_md_get_algo_dlen (algo) != expectlen)
+ return "digest size does not match expected size";
+
+ err = _gcry_md_open (&hd, algo, 0);
+ if (err)
+ return "gcry_md_open failed";
+
+ switch (datamode)
+ {
+ case 0:
+ _gcry_md_write (hd, data, datalen);
+ break;
+
+ case 1: /* Hash one million times an "a". */
+ {
+ int i;
+
+ /* Write in odd size chunks so that we test the buffering. */
+ memset (aaa, 'a', 1000);
+ for (i = 0; i < 1000; i++)
+ _gcry_md_write (hd, aaa, 1000);
+ }
+ break;
+
+ default:
+ result = "invalid DATAMODE";
+ }
+
+ if (!result)
+ {
+ if (!xof)
+ {
+ digest = _gcry_md_read (hd, algo);
+
+ if ( memcmp (digest, expect, expectlen) )
+ result = "digest mismatch";
+ }
+ else
+ {
+ gcry_assert(expectlen <= sizeof(aaa));
+
+ err = _gcry_md_extract (hd, algo, aaa, expectlen);
+ if (err)
+ result = "error extracting output from XOF";
+ else if ( memcmp (aaa, expect, expectlen) )
+ result = "digest mismatch";
+ }
+ }
+
+ _gcry_md_close (hd);
+
+ return result;
+}
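To illustrate the DATAMODE table above: a digest module's selftest typically calls this
helper with a known-answer vector.  A hedged sketch with a hypothetical wrapper name,
using the standard FIPS 180 "abc" value for SHA-256 (datamode 0 hashes the supplied
data directly):

static const char *
example_selftest (void)
{
  return _gcry_hash_selftest_check_one
    (GCRY_MD_SHA256, 0, "abc", 3,
     "\xba\x78\x16\xbf\x8f\x01\xcf\xea\x41\x41\x40\xde\x5d\xae\x22\x23"
     "\xb0\x03\x61\xa3\x96\x17\x7a\x9c\xb4\x10\xff\x61\xf2\x00\x15\xad",
     32);
}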
+
+
+/* Common function to write a chunk of data to the transform function
+ of a hash algorithm. Note that the use of the term "block" does
+   not imply a fixed size block.  Note that we explicitly allow using
+ this function after the context has been finalized; the result does
+ not have any meaning but writing after finalize is sometimes
+ helpful to mitigate timing attacks. */
+void
+_gcry_md_block_write (void *context, const void *inbuf_arg, size_t inlen)
+{
+ const unsigned char *inbuf = inbuf_arg;
+ gcry_md_block_ctx_t *hd = context;
+ unsigned int stack_burn = 0;
+ unsigned int nburn;
+ const unsigned int blocksize_shift = hd->blocksize_shift;
+ const unsigned int blocksize = 1 << blocksize_shift;
+ size_t inblocks;
+ size_t copylen;
+
+ if (sizeof(hd->buf) < blocksize)
+ BUG();
+
+ if (!hd->bwrite)
+ return;
+
+ if (hd->count > blocksize)
+ {
+ /* This happens only when gcry_md_write is called after final.
+ * Writing after final is used for mitigating timing attacks. */
+ hd->count = 0;
+ }
+
+ while (hd->count)
+ {
+ if (hd->count == blocksize) /* Flush the buffer. */
+ {
+ nburn = hd->bwrite (hd, hd->buf, 1);
+ stack_burn = nburn > stack_burn ? nburn : stack_burn;
+ hd->count = 0;
+ if (!++hd->nblocks)
+ hd->nblocks_high++;
+ }
+ else
+ {
+ copylen = inlen;
+ if (copylen > blocksize - hd->count)
+ copylen = blocksize - hd->count;
+
+ if (copylen == 0)
+ break;
+
+ buf_cpy (&hd->buf[hd->count], inbuf, copylen);
+ hd->count += copylen;
+ inbuf += copylen;
+ inlen -= copylen;
+ }
+ }
+
+ if (inlen == 0)
+ return;
+
+ if (inlen >= blocksize)
+ {
+ inblocks = inlen >> blocksize_shift;
+ nburn = hd->bwrite (hd, inbuf, inblocks);
+ stack_burn = nburn > stack_burn ? nburn : stack_burn;
+ hd->count = 0;
+ hd->nblocks_high += (hd->nblocks + inblocks < inblocks);
+ hd->nblocks += inblocks;
+ inlen -= inblocks << blocksize_shift;
+ inbuf += inblocks << blocksize_shift;
+ }
+
+ if (inlen)
+ {
+ buf_cpy (hd->buf, inbuf, inlen);
+ hd->count = inlen;
+ }
+
+ if (stack_burn > 0)
+ _gcry_burn_stack (stack_burn);
+}
diff --git a/comm/third_party/libgcrypt/cipher/hash-common.h b/comm/third_party/libgcrypt/cipher/hash-common.h
new file mode 100644
index 0000000000..561e77a7e5
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/hash-common.h
@@ -0,0 +1,62 @@
+/* hash-common.h - Declarations of common code for hash algorithms.
+ * Copyright (C) 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_HASH_COMMON_H
+#define GCRY_HASH_COMMON_H
+
+#include "types.h"
+
+
+const char * _gcry_hash_selftest_check_one
+/**/ (int algo,
+ int datamode, const void *data, size_t datalen,
+ const void *expect, size_t expectlen);
+
+/* Type for the md_write helper function. */
+typedef unsigned int (*_gcry_md_block_write_t) (void *c,
+ const unsigned char *blks,
+ size_t nblks);
+
+#if (defined(USE_SHA512) || defined(USE_WHIRLPOOL))
+/* SHA-512 and Whirlpool need u64.  SHA-512 needs a larger buffer. */
+# define MD_BLOCK_MAX_BLOCKSIZE 128
+# define MD_NBLOCKS_TYPE u64
+#else
+# define MD_BLOCK_MAX_BLOCKSIZE 64
+# define MD_NBLOCKS_TYPE u32
+#endif
+
+/* SHA1 needs 2x64 bytes and SHA-512 needs 128 bytes. */
+#define MD_BLOCK_CTX_BUFFER_SIZE 128
+
+typedef struct gcry_md_block_ctx
+{
+ byte buf[MD_BLOCK_CTX_BUFFER_SIZE];
+ MD_NBLOCKS_TYPE nblocks;
+ MD_NBLOCKS_TYPE nblocks_high;
+ int count;
+ unsigned int blocksize_shift;
+ _gcry_md_block_write_t bwrite;
+} gcry_md_block_ctx_t;
+
+
+void
+_gcry_md_block_write( void *context, const void *inbuf_arg, size_t inlen);
+
+#endif /*GCRY_HASH_COMMON_H*/
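The intended usage pattern for gcry_md_block_ctx_t mirrors what gostr3411-94.c does
above: the module embeds the block context as its first member, points bwrite at its
own transform function and forwards writes to _gcry_md_block_write.  A hedged sketch
with hypothetical names, assuming the usual module includes (string.h, g10lib.h,
hash-common.h):

typedef struct
{
  gcry_md_block_ctx_t bctx;   /* Must be the first member.  */
  u32 state[8];               /* Module specific chaining state.  */
} EXAMPLE_CONTEXT;

static unsigned int example_transform (void *c, const unsigned char *blks,
                                       size_t nblks);

static void
example_init (void *context, unsigned int flags)
{
  EXAMPLE_CONTEXT *hd = context;

  (void)flags;

  memset (hd->state, 0, sizeof hd->state);
  hd->bctx.nblocks = 0;
  hd->bctx.nblocks_high = 0;
  hd->bctx.count = 0;
  hd->bctx.blocksize_shift = _gcry_ctz (64);  /* 64-byte blocks.  */
  hd->bctx.bwrite = example_transform;
}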
diff --git a/comm/third_party/libgcrypt/cipher/idea.c b/comm/third_party/libgcrypt/cipher/idea.c
new file mode 100644
index 0000000000..0a81081810
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/idea.c
@@ -0,0 +1,382 @@
+/* idea.c - IDEA function
+ * Copyright 1997, 1998, 1999, 2001 Werner Koch (dd9jn)
+ * Copyright 2013 g10 Code GmbH
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * WERNER KOCH BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Except as contained in this notice, the name of Werner Koch shall not be
+ * used in advertising or otherwise to promote the sale, use or other dealings
+ * in this Software without prior written authorization from Werner Koch.
+ *
+ * Patents on IDEA have expired:
+ * Europe: EP0482154 on 2011-05-16,
+ * Japan: JP3225440 on 2011-05-16,
+ * U.S.: 5,214,703 on 2012-01-07.
+ */
+
+/*
+ * Please see http://www.noepatents.org/ to learn why software patents
+ * are bad for society and what you can do to fight them.
+ *
+ * The code herein is based on the one from:
+ * Bruce Schneier: Applied Cryptography. John Wiley & Sons, 1996.
+ * ISBN 0-471-11709-9.
+ */
+
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "types.h" /* for byte and u32 typedefs */
+#include "g10lib.h"
+#include "cipher.h"
+#include "cipher-internal.h"
+
+
+#define IDEA_KEYSIZE 16
+#define IDEA_BLOCKSIZE 8
+#define IDEA_ROUNDS 8
+#define IDEA_KEYLEN (6*IDEA_ROUNDS+4)
+
+typedef struct {
+ u16 ek[IDEA_KEYLEN];
+ u16 dk[IDEA_KEYLEN];
+ int have_dk;
+} IDEA_context;
+
+static const char *selftest(void);
+
+
+static u16
+mul_inv( u16 x )
+{
+ u16 t0, t1;
+ u16 q, y;
+
+ if( x < 2 )
+ return x;
+ t1 = 0x10001UL / x;
+ y = 0x10001UL % x;
+ if( y == 1 )
+ return (1-t1) & 0xffff;
+
+ t0 = 1;
+ do {
+ q = x / y;
+ x = x % y;
+ t0 += q * t1;
+ if( x == 1 )
+ return t0;
+ q = y / x;
+ y = y % x;
+ t1 += q * t0;
+ } while( y != 1 );
+ return (1-t1) & 0xffff;
+}
+
+
+
+static void
+expand_key( const byte *userkey, u16 *ek )
+{
+ int i,j;
+
+ for(j=0; j < 8; j++ ) {
+ ek[j] = (*userkey << 8) + userkey[1];
+ userkey += 2;
+ }
+ for(i=0; j < IDEA_KEYLEN; j++ ) {
+ i++;
+ ek[i+7] = ek[i&7] << 9 | ek[(i+1)&7] >> 7;
+ ek += i & 8;
+ i &= 7;
+ }
+}
+
+
+static void
+invert_key( u16 *ek, u16 dk[IDEA_KEYLEN] )
+{
+ int i;
+ u16 t1, t2, t3;
+ u16 temp[IDEA_KEYLEN];
+ u16 *p = temp + IDEA_KEYLEN;
+
+ t1 = mul_inv( *ek++ );
+ t2 = -*ek++;
+ t3 = -*ek++;
+ *--p = mul_inv( *ek++ );
+ *--p = t3;
+ *--p = t2;
+ *--p = t1;
+
+ for(i=0; i < IDEA_ROUNDS-1; i++ ) {
+ t1 = *ek++;
+ *--p = *ek++;
+ *--p = t1;
+
+ t1 = mul_inv( *ek++ );
+ t2 = -*ek++;
+ t3 = -*ek++;
+ *--p = mul_inv( *ek++ );
+ *--p = t2;
+ *--p = t3;
+ *--p = t1;
+ }
+ t1 = *ek++;
+ *--p = *ek++;
+ *--p = t1;
+
+ t1 = mul_inv( *ek++ );
+ t2 = -*ek++;
+ t3 = -*ek++;
+ *--p = mul_inv( *ek++ );
+ *--p = t3;
+ *--p = t2;
+ *--p = t1;
+ memcpy(dk, temp, sizeof(temp) );
+ wipememory(temp, sizeof(temp));
+}
+
+
+static void
+cipher( byte *outbuf, const byte *inbuf, u16 *key )
+{
+ u16 s2, s3;
+ u16 in[4];
+ int r = IDEA_ROUNDS;
+#define x1 (in[0])
+#define x2 (in[1])
+#define x3 (in[2])
+#define x4 (in[3])
+#define MUL(x,y) \
+ do {u16 _t16; u32 _t32; \
+ if( (_t16 = (y)) ) { \
+ if( (x = (x)&0xffff) ) { \
+ _t32 = (u32)x * _t16; \
+ x = _t32 & 0xffff; \
+ _t16 = _t32 >> 16; \
+ x = ((x)-_t16) + (x<_t16?1:0); \
+ } \
+ else { \
+ x = 1 - _t16; \
+ } \
+ } \
+ else { \
+ x = 1 - x; \
+ } \
+ } while(0)
+
+ memcpy (in, inbuf, sizeof in);
+#ifndef WORDS_BIGENDIAN
+ x1 = (x1>>8) | (x1<<8);
+ x2 = (x2>>8) | (x2<<8);
+ x3 = (x3>>8) | (x3<<8);
+ x4 = (x4>>8) | (x4<<8);
+#endif
+ do {
+ MUL(x1, *key++);
+ x2 += *key++;
+ x3 += *key++;
+ MUL(x4, *key++ );
+
+ s3 = x3;
+ x3 ^= x1;
+ MUL(x3, *key++);
+ s2 = x2;
+ x2 ^=x4;
+ x2 += x3;
+ MUL(x2, *key++);
+ x3 += x2;
+
+ x1 ^= x2;
+ x4 ^= x3;
+
+ x2 ^= s3;
+ x3 ^= s2;
+ } while( --r );
+ MUL(x1, *key++);
+ x3 += *key++;
+ x2 += *key++;
+ MUL(x4, *key);
+
+#ifndef WORDS_BIGENDIAN
+ x1 = (x1>>8) | (x1<<8);
+ x2 = (x2>>8) | (x2<<8);
+ x3 = (x3>>8) | (x3<<8);
+ x4 = (x4>>8) | (x4<<8);
+#endif
+ memcpy (outbuf+0, &x1, 2);
+ memcpy (outbuf+2, &x3, 2);
+ memcpy (outbuf+4, &x2, 2);
+ memcpy (outbuf+6, &x4, 2);
+#undef MUL
+#undef x1
+#undef x2
+#undef x3
+#undef x4
+}
+
+
+static int
+do_setkey( IDEA_context *c, const byte *key, unsigned int keylen )
+{
+ static int initialized = 0;
+ static const char *selftest_failed = 0;
+
+ if( !initialized ) {
+ initialized = 1;
+ selftest_failed = selftest();
+ if( selftest_failed )
+ log_error( "%s\n", selftest_failed );
+ }
+ if( selftest_failed )
+ return GPG_ERR_SELFTEST_FAILED;
+
+ assert(keylen == 16);
+ c->have_dk = 0;
+ expand_key( key, c->ek );
+ invert_key( c->ek, c->dk );
+ return 0;
+}
+
+static gcry_err_code_t
+idea_setkey (void *context, const byte *key, unsigned int keylen,
+ cipher_bulk_ops_t *bulk_ops)
+{
+ IDEA_context *ctx = context;
+ int rc = do_setkey (ctx, key, keylen);
+ (void)bulk_ops;
+ _gcry_burn_stack (23+6*sizeof(void*));
+ return rc;
+}
+
+static void
+encrypt_block( IDEA_context *c, byte *outbuf, const byte *inbuf )
+{
+ cipher( outbuf, inbuf, c->ek );
+}
+
+static unsigned int
+idea_encrypt (void *context, byte *out, const byte *in)
+{
+ IDEA_context *ctx = context;
+ encrypt_block (ctx, out, in);
+ return /*burn_stack*/ (24+3*sizeof (void*));
+}
+
+static void
+decrypt_block( IDEA_context *c, byte *outbuf, const byte *inbuf )
+{
+ if( !c->have_dk ) {
+ c->have_dk = 1;
+ invert_key( c->ek, c->dk );
+ }
+ cipher( outbuf, inbuf, c->dk );
+}
+
+static unsigned int
+idea_decrypt (void *context, byte *out, const byte *in)
+{
+ IDEA_context *ctx = context;
+ decrypt_block (ctx, out, in);
+ return /*burn_stack*/ (24+3*sizeof (void*));
+}
+
+
+static const char *
+selftest( void )
+{
+static struct {
+ byte key[16];
+ byte plain[8];
+ byte cipher[8];
+} test_vectors[] = {
+ { { 0x00, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x04,
+ 0x00, 0x05, 0x00, 0x06, 0x00, 0x07, 0x00, 0x08 },
+ { 0x00, 0x00, 0x00, 0x01, 0x00, 0x02, 0x00, 0x03 },
+ { 0x11, 0xFB, 0xED, 0x2B, 0x01, 0x98, 0x6D, 0xE5 } },
+ { { 0x00, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x04,
+ 0x00, 0x05, 0x00, 0x06, 0x00, 0x07, 0x00, 0x08 },
+ { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08 },
+ { 0x54, 0x0E, 0x5F, 0xEA, 0x18, 0xC2, 0xF8, 0xB1 } },
+ { { 0x00, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x04,
+ 0x00, 0x05, 0x00, 0x06, 0x00, 0x07, 0x00, 0x08 },
+ { 0x00, 0x19, 0x32, 0x4B, 0x64, 0x7D, 0x96, 0xAF },
+ { 0x9F, 0x0A, 0x0A, 0xB6, 0xE1, 0x0C, 0xED, 0x78 } },
+ { { 0x00, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x04,
+ 0x00, 0x05, 0x00, 0x06, 0x00, 0x07, 0x00, 0x08 },
+ { 0xF5, 0x20, 0x2D, 0x5B, 0x9C, 0x67, 0x1B, 0x08 },
+ { 0xCF, 0x18, 0xFD, 0x73, 0x55, 0xE2, 0xC5, 0xC5 } },
+ { { 0x00, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x04,
+ 0x00, 0x05, 0x00, 0x06, 0x00, 0x07, 0x00, 0x08 },
+ { 0xFA, 0xE6, 0xD2, 0xBE, 0xAA, 0x96, 0x82, 0x6E },
+ { 0x85, 0xDF, 0x52, 0x00, 0x56, 0x08, 0x19, 0x3D } },
+ { { 0x00, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x04,
+ 0x00, 0x05, 0x00, 0x06, 0x00, 0x07, 0x00, 0x08 },
+ { 0x0A, 0x14, 0x1E, 0x28, 0x32, 0x3C, 0x46, 0x50 },
+ { 0x2F, 0x7D, 0xE7, 0x50, 0x21, 0x2F, 0xB7, 0x34 } },
+ { { 0x00, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x04,
+ 0x00, 0x05, 0x00, 0x06, 0x00, 0x07, 0x00, 0x08 },
+ { 0x05, 0x0A, 0x0F, 0x14, 0x19, 0x1E, 0x23, 0x28 },
+ { 0x7B, 0x73, 0x14, 0x92, 0x5D, 0xE5, 0x9C, 0x09 } },
+ { { 0x00, 0x05, 0x00, 0x0A, 0x00, 0x0F, 0x00, 0x14,
+ 0x00, 0x19, 0x00, 0x1E, 0x00, 0x23, 0x00, 0x28 },
+ { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08 },
+ { 0x3E, 0xC0, 0x47, 0x80, 0xBE, 0xFF, 0x6E, 0x20 } },
+ { { 0x3A, 0x98, 0x4E, 0x20, 0x00, 0x19, 0x5D, 0xB3,
+ 0x2E, 0xE5, 0x01, 0xC8, 0xC4, 0x7C, 0xEA, 0x60 },
+ { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08 },
+ { 0x97, 0xBC, 0xD8, 0x20, 0x07, 0x80, 0xDA, 0x86 } },
+ { { 0x00, 0x64, 0x00, 0xC8, 0x01, 0x2C, 0x01, 0x90,
+ 0x01, 0xF4, 0x02, 0x58, 0x02, 0xBC, 0x03, 0x20 },
+ { 0x05, 0x32, 0x0A, 0x64, 0x14, 0xC8, 0x19, 0xFA },
+ { 0x65, 0xBE, 0x87, 0xE7, 0xA2, 0x53, 0x8A, 0xED } },
+ { { 0x9D, 0x40, 0x75, 0xC1, 0x03, 0xBC, 0x32, 0x2A,
+ 0xFB, 0x03, 0xE7, 0xBE, 0x6A, 0xB3, 0x00, 0x06 },
+ { 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 },
+ { 0xF5, 0xDB, 0x1A, 0xC4, 0x5E, 0x5E, 0xF9, 0xF9 } }
+};
+ IDEA_context c;
+ byte buffer[8];
+ int i;
+
+ for(i=0; i < DIM(test_vectors); i++ ) {
+ do_setkey( &c, test_vectors[i].key, 16 );
+ encrypt_block( &c, buffer, test_vectors[i].plain );
+ if( memcmp( buffer, test_vectors[i].cipher, 8 ) )
+ return "IDEA test encryption failed.";
+ decrypt_block( &c, buffer, test_vectors[i].cipher );
+ if( memcmp( buffer, test_vectors[i].plain, 8 ) )
+ return "IDEA test decryption failed.";
+ }
+
+ return NULL;
+}
+
+
+gcry_cipher_spec_t _gcry_cipher_spec_idea =
+ {
+ GCRY_CIPHER_IDEA, {0, 0},
+ "IDEA", NULL, NULL, IDEA_BLOCKSIZE, 128,
+ sizeof (IDEA_context),
+ idea_setkey, idea_encrypt, idea_decrypt
+ };
diff --git a/comm/third_party/libgcrypt/cipher/kdf-internal.h b/comm/third_party/libgcrypt/cipher/kdf-internal.h
new file mode 100644
index 0000000000..7079860e99
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/kdf-internal.h
@@ -0,0 +1,40 @@
+/* kdf-internal.h - Internal defs for kdf.c
+ * Copyright (C) 2013 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser general Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_KDF_INTERNAL_H
+#define GCRY_KDF_INTERNAL_H
+
+/*-- kdf.c --*/
+gpg_err_code_t
+_gcry_kdf_pkdf2 (const void *passphrase, size_t passphraselen,
+ int hashalgo,
+ const void *salt, size_t saltlen,
+ unsigned long iterations,
+ size_t keysize, void *keybuffer);
+
+/*-- scrypt.c --*/
+gcry_err_code_t
+_gcry_kdf_scrypt (const unsigned char *passwd, size_t passwdlen,
+ int algo, int subalgo,
+ const unsigned char *salt, size_t saltlen,
+ unsigned long iterations,
+ size_t dklen, unsigned char *dk);
+
+
+#endif /*GCRY_KDF_INTERNAL_H*/
diff --git a/comm/third_party/libgcrypt/cipher/kdf.c b/comm/third_party/libgcrypt/cipher/kdf.c
new file mode 100644
index 0000000000..93c2c9f65e
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/kdf.c
@@ -0,0 +1,503 @@
+/* kdf.c - Key Derivation Functions
+ * Copyright (C) 1998, 2008, 2011 Free Software Foundation, Inc.
+ * Copyright (C) 2013 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser general Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "kdf-internal.h"
+
+
+/* Transform a passphrase into a suitable key of length KEYSIZE and
+ store this key in the caller provided buffer KEYBUFFER. The caller
+   must provide a HASHALGO, a valid ALGO and, depending on that algo, a
+ SALT of 8 bytes and the number of ITERATIONS. Code taken from
+ gnupg/agent/protect.c:hash_passphrase. */
+static gpg_err_code_t
+openpgp_s2k (const void *passphrase, size_t passphraselen,
+ int algo, int hashalgo,
+ const void *salt, size_t saltlen,
+ unsigned long iterations,
+ size_t keysize, void *keybuffer)
+{
+ gpg_err_code_t ec;
+ gcry_md_hd_t md;
+ char *key = keybuffer;
+ int pass, i;
+ int used = 0;
+ int secmode;
+
+ if ((algo == GCRY_KDF_SALTED_S2K || algo == GCRY_KDF_ITERSALTED_S2K)
+ && (!salt || saltlen != 8))
+ return GPG_ERR_INV_VALUE;
+
+ secmode = _gcry_is_secure (passphrase) || _gcry_is_secure (keybuffer);
+
+ ec = _gcry_md_open (&md, hashalgo, secmode? GCRY_MD_FLAG_SECURE : 0);
+ if (ec)
+ return ec;
+
+ for (pass=0; used < keysize; pass++)
+ {
+ if (pass)
+ {
+ _gcry_md_reset (md);
+ for (i=0; i < pass; i++) /* Preset the hash context. */
+ _gcry_md_putc (md, 0);
+ }
+
+ if (algo == GCRY_KDF_SALTED_S2K || algo == GCRY_KDF_ITERSALTED_S2K)
+ {
+ int len2 = passphraselen + 8;
+ unsigned long count = len2;
+
+ if (algo == GCRY_KDF_ITERSALTED_S2K)
+ {
+ count = iterations;
+ if (count < len2)
+ count = len2;
+ }
+
+ while (count > len2)
+ {
+ _gcry_md_write (md, salt, saltlen);
+ _gcry_md_write (md, passphrase, passphraselen);
+ count -= len2;
+ }
+ if (count < saltlen)
+ _gcry_md_write (md, salt, count);
+ else
+ {
+ _gcry_md_write (md, salt, saltlen);
+ count -= saltlen;
+ _gcry_md_write (md, passphrase, count);
+ }
+ }
+ else
+ _gcry_md_write (md, passphrase, passphraselen);
+
+ _gcry_md_final (md);
+ i = _gcry_md_get_algo_dlen (hashalgo);
+ if (i > keysize - used)
+ i = keysize - used;
+ memcpy (key+used, _gcry_md_read (md, hashalgo), i);
+ used += i;
+ }
+ _gcry_md_close (md);
+ return 0;
+}
+
+
+/* Transform a passphrase into a suitable key of length KEYSIZE and
+ store this key in the caller provided buffer KEYBUFFER. The caller
+ must provide PRFALGO which indicates the pseudorandom function to
+   use: This shall be the algorithm id of a hash algorithm; it is
+ used in HMAC mode. SALT is a salt of length SALTLEN and ITERATIONS
+ gives the number of iterations. */
+gpg_err_code_t
+_gcry_kdf_pkdf2 (const void *passphrase, size_t passphraselen,
+ int hashalgo,
+ const void *salt, size_t saltlen,
+ unsigned long iterations,
+ size_t keysize, void *keybuffer)
+{
+ gpg_err_code_t ec;
+ gcry_md_hd_t md;
+ int secmode;
+ unsigned long dklen = keysize;
+ char *dk = keybuffer;
+ unsigned int hlen; /* Output length of the digest function. */
+ unsigned int l; /* Rounded up number of blocks. */
+ unsigned int r; /* Number of octets in the last block. */
+ char *sbuf; /* Malloced buffer to concatenate salt and iter
+ as well as space to hold TBUF and UBUF. */
+ char *tbuf; /* Buffer for T; ptr into SBUF, size is HLEN. */
+ char *ubuf; /* Buffer for U; ptr into SBUF, size is HLEN. */
+ unsigned int lidx; /* Current block number. */
+ unsigned long iter; /* Current iteration number. */
+ unsigned int i;
+
+ /* We allow for a saltlen of 0 here to support scrypt. It is not
+     clear whether rfc2898 allows for this, thus we do a test on
+ saltlen > 0 only in gcry_kdf_derive. */
+ if (!salt || !iterations || !dklen)
+ return GPG_ERR_INV_VALUE;
+
+ hlen = _gcry_md_get_algo_dlen (hashalgo);
+ if (!hlen)
+ return GPG_ERR_DIGEST_ALGO;
+
+ secmode = _gcry_is_secure (passphrase) || _gcry_is_secure (keybuffer);
+
+ /* Step 1 */
+ /* If dkLen > (2^32 - 1) * hLen, output "derived key too long" and
+ * stop. We use a stronger inequality but only if our type can hold
+ * a larger value. */
+
+#if SIZEOF_UNSIGNED_LONG > 4
+ if (dklen > 0xffffffffU)
+ return GPG_ERR_INV_VALUE;
+#endif
+
+
+ /* Step 2 */
+ l = ((dklen - 1)/ hlen) + 1;
+ r = dklen - (l - 1) * hlen;
+
+ /* Setup buffers and prepare a hash context. */
+ sbuf = (secmode
+ ? xtrymalloc_secure (saltlen + 4 + hlen + hlen)
+ : xtrymalloc (saltlen + 4 + hlen + hlen));
+ if (!sbuf)
+ return gpg_err_code_from_syserror ();
+ tbuf = sbuf + saltlen + 4;
+ ubuf = tbuf + hlen;
+
+ ec = _gcry_md_open (&md, hashalgo, (GCRY_MD_FLAG_HMAC
+ | (secmode?GCRY_MD_FLAG_SECURE:0)));
+ if (ec)
+ {
+ xfree (sbuf);
+ return ec;
+ }
+
+ ec = _gcry_md_setkey (md, passphrase, passphraselen);
+ if (ec)
+ {
+ _gcry_md_close (md);
+ xfree (sbuf);
+ return ec;
+ }
+
+ /* Step 3 and 4. */
+ memcpy (sbuf, salt, saltlen);
+ for (lidx = 1; lidx <= l; lidx++)
+ {
+ for (iter = 0; iter < iterations; iter++)
+ {
+ _gcry_md_reset (md);
+ if (!iter) /* Compute U_1: */
+ {
+ sbuf[saltlen] = (lidx >> 24);
+ sbuf[saltlen + 1] = (lidx >> 16);
+ sbuf[saltlen + 2] = (lidx >> 8);
+ sbuf[saltlen + 3] = lidx;
+ _gcry_md_write (md, sbuf, saltlen + 4);
+ memcpy (ubuf, _gcry_md_read (md, 0), hlen);
+ memcpy (tbuf, ubuf, hlen);
+ }
+ else /* Compute U_(2..c): */
+ {
+ _gcry_md_write (md, ubuf, hlen);
+ memcpy (ubuf, _gcry_md_read (md, 0), hlen);
+ for (i=0; i < hlen; i++)
+ tbuf[i] ^= ubuf[i];
+ }
+ }
+ if (lidx == l) /* Last block. */
+ memcpy (dk, tbuf, r);
+ else
+ {
+ memcpy (dk, tbuf, hlen);
+ dk += hlen;
+ }
+ }
+
+ _gcry_md_close (md);
+ xfree (sbuf);
+ return 0;
+}
+
+
+/* Derive a key from a passphrase. KEYSIZE gives the requested size
+ of the keys in octets. KEYBUFFER is a caller provided buffer
+ filled on success with the derived key. The input passphrase is
+ taken from (PASSPHRASE,PASSPHRASELEN) which is an arbitrary memory
+ buffer. ALGO specifies the KDF algorithm to use; these are the
+ constants GCRY_KDF_*. SUBALGO specifies an algorithm used
+ internally by the KDF algorithms; this is usually a hash algorithm
+   but certain KDF algorithms may use it differently.  {SALT,SALTLEN}
+ is a salt as needed by most KDF algorithms. ITERATIONS is a
+ positive integer parameter to most KDFs. 0 is returned on success,
+ or an error code on failure. */
+gpg_err_code_t
+_gcry_kdf_derive (const void *passphrase, size_t passphraselen,
+ int algo, int subalgo,
+ const void *salt, size_t saltlen,
+ unsigned long iterations,
+ size_t keysize, void *keybuffer)
+{
+ gpg_err_code_t ec;
+
+ if (!passphrase)
+ {
+ ec = GPG_ERR_INV_DATA;
+ goto leave;
+ }
+
+ if (!keybuffer || !keysize)
+ {
+ ec = GPG_ERR_INV_VALUE;
+ goto leave;
+ }
+
+
+ switch (algo)
+ {
+ case GCRY_KDF_SIMPLE_S2K:
+ case GCRY_KDF_SALTED_S2K:
+ case GCRY_KDF_ITERSALTED_S2K:
+ if (!passphraselen)
+ ec = GPG_ERR_INV_DATA;
+ else
+ ec = openpgp_s2k (passphrase, passphraselen, algo, subalgo,
+ salt, saltlen, iterations, keysize, keybuffer);
+ break;
+
+ case GCRY_KDF_PBKDF1:
+ ec = GPG_ERR_UNSUPPORTED_ALGORITHM;
+ break;
+
+ case GCRY_KDF_PBKDF2:
+ if (!saltlen)
+ ec = GPG_ERR_INV_VALUE;
+ else
+ ec = _gcry_kdf_pkdf2 (passphrase, passphraselen, subalgo,
+ salt, saltlen, iterations, keysize, keybuffer);
+ break;
+
+ case 41:
+ case GCRY_KDF_SCRYPT:
+#if USE_SCRYPT
+ ec = _gcry_kdf_scrypt (passphrase, passphraselen, algo, subalgo,
+ salt, saltlen, iterations, keysize, keybuffer);
+#else
+ ec = GPG_ERR_UNSUPPORTED_ALGORITHM;
+#endif /*USE_SCRYPT*/
+ break;
+
+ default:
+ ec = GPG_ERR_UNKNOWN_ALGORITHM;
+ break;
+ }
+
+ leave:
+ return ec;
+}
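A minimal caller-side sketch of this API for the PBKDF2 case, using the first RFC 6070
SHA-1 vector that also appears in the selftest below ("password"/"salt", 1 iteration,
20-octet key).  Illustrative only, not part of the patch.

#include <gcrypt.h>

static gpg_error_t
pbkdf2_example (unsigned char key[20])
{
  /* Expected output: 0c60c80f961f0e71f3a9b524af6012062fe037a6.  */
  return gcry_kdf_derive ("password", 8, GCRY_KDF_PBKDF2, GCRY_MD_SHA1,
                          "salt", 4, 1, 20, key);
}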
+
+
+/* Check one KDF call with ALGO and HASH_ALGO using the regular KDF
+ * API.  (passphrase,passphraselen) is the password from which the key
+ * is derived, (salt,saltlen) is the salt for the key derivation,
+ * iterations is the number of KDF iterations,
+ * and (expect,expectlen) the expected result. Returns NULL on
+ * success or a string describing the failure. */
+
+static const char *
+check_one (int algo, int hash_algo,
+ const void *passphrase, size_t passphraselen,
+ const void *salt, size_t saltlen,
+ unsigned long iterations,
+ const void *expect, size_t expectlen)
+{
+ unsigned char key[512]; /* hardcoded to avoid allocation */
+ size_t keysize = expectlen;
+
+ if (keysize > sizeof(key))
+ return "invalid tests data";
+
+ if (_gcry_kdf_derive (passphrase, passphraselen, algo,
+ hash_algo, salt, saltlen, iterations,
+ keysize, key))
+ return "gcry_kdf_derive failed";
+
+ if (memcmp (key, expect, expectlen))
+ return "does not match";
+
+ return NULL;
+}
+
+
+static gpg_err_code_t
+selftest_pbkdf2 (int extended, selftest_report_func_t report)
+{
+ static const struct {
+ const char *desc;
+ const char *p; /* Passphrase. */
+ size_t plen; /* Length of P. */
+ const char *salt;
+ size_t saltlen;
+ int hashalgo;
+ unsigned long c; /* Iterations. */
+ int dklen; /* Requested key length. */
+ const char *dk; /* Derived key. */
+ int disabled;
+ } tv[] = {
+#if USE_SHA1
+#define NUM_TEST_VECTORS 9
+ /* SHA1 test vectors are from RFC-6070. */
+ {
+ "Basic PBKDF2 SHA1 #1",
+ "password", 8,
+ "salt", 4,
+ GCRY_MD_SHA1,
+ 1,
+ 20,
+ "\x0c\x60\xc8\x0f\x96\x1f\x0e\x71\xf3\xa9"
+ "\xb5\x24\xaf\x60\x12\x06\x2f\xe0\x37\xa6"
+ },
+ {
+ "Basic PBKDF2 SHA1 #2",
+ "password", 8,
+ "salt", 4,
+ GCRY_MD_SHA1,
+ 2,
+ 20,
+ "\xea\x6c\x01\x4d\xc7\x2d\x6f\x8c\xcd\x1e"
+ "\xd9\x2a\xce\x1d\x41\xf0\xd8\xde\x89\x57"
+ },
+ {
+ "Basic PBKDF2 SHA1 #3",
+ "password", 8,
+ "salt", 4,
+ GCRY_MD_SHA1,
+ 4096,
+ 20,
+ "\x4b\x00\x79\x01\xb7\x65\x48\x9a\xbe\xad"
+ "\x49\xd9\x26\xf7\x21\xd0\x65\xa4\x29\xc1"
+ },
+ {
+ "Basic PBKDF2 SHA1 #4",
+ "password", 8,
+ "salt", 4,
+ GCRY_MD_SHA1,
+ 16777216,
+ 20,
+ "\xee\xfe\x3d\x61\xcd\x4d\xa4\xe4\xe9\x94"
+ "\x5b\x3d\x6b\xa2\x15\x8c\x26\x34\xe9\x84",
+ 1 /* This test takes too long. */
+ },
+ {
+ "Basic PBKDF2 SHA1 #5",
+ "passwordPASSWORDpassword", 24,
+ "saltSALTsaltSALTsaltSALTsaltSALTsalt", 36,
+ GCRY_MD_SHA1,
+ 4096,
+ 25,
+ "\x3d\x2e\xec\x4f\xe4\x1c\x84\x9b\x80\xc8"
+ "\xd8\x36\x62\xc0\xe4\x4a\x8b\x29\x1a\x96"
+ "\x4c\xf2\xf0\x70\x38"
+ },
+ {
+ "Basic PBKDF2 SHA1 #6",
+ "pass\0word", 9,
+ "sa\0lt", 5,
+ GCRY_MD_SHA1,
+ 4096,
+ 16,
+ "\x56\xfa\x6a\xa7\x55\x48\x09\x9d\xcc\x37"
+ "\xd7\xf0\x34\x25\xe0\xc3"
+ },
+ { /* empty password test, not in RFC-6070 */
+ "Basic PBKDF2 SHA1 #7",
+ "", 0,
+ "salt", 4,
+ GCRY_MD_SHA1,
+ 2,
+ 20,
+ "\x13\x3a\x4c\xe8\x37\xb4\xd2\x52\x1e\xe2"
+ "\xbf\x03\xe1\x1c\x71\xca\x79\x4e\x07\x97"
+ },
+#else
+#define NUM_TEST_VECTORS 2
+#endif
+ {
+ "Basic PBKDF2 SHA256",
+ "password", 8,
+ "salt", 4,
+ GCRY_MD_SHA256,
+ 2,
+ 32,
+ "\xae\x4d\x0c\x95\xaf\x6b\x46\xd3\x2d\x0a\xdf\xf9\x28\xf0\x6d\xd0"
+ "\x2a\x30\x3f\x8e\xf3\xc2\x51\xdf\xd6\xe2\xd8\x5a\x95\x47\x4c\x43"
+ },
+ {
+ "Extended PBKDF2 SHA256",
+ "passwordPASSWORDpassword", 24,
+ "saltSALTsaltSALTsaltSALTsaltSALTsalt", 36,
+ GCRY_MD_SHA256,
+ 4096,
+ 40,
+ "\x34\x8c\x89\xdb\xcb\xd3\x2b\x2f\x32\xd8\x14\xb8\x11\x6e\x84\xcf"
+ "\x2b\x17\x34\x7e\xbc\x18\x00\x18\x1c\x4e\x2a\x1f\xb8\xdd\x53\xe1"
+ "\xc6\x35\x51\x8c\x7d\xac\x47\xe9"
+ },
+ { NULL }
+ };
+ const char *what;
+ const char *errtxt;
+ int tvidx;
+
+ for (tvidx=0; tv[tvidx].desc; tvidx++)
+ {
+ what = tv[tvidx].desc;
+ if (tv[tvidx].disabled)
+ continue;
+ errtxt = check_one (GCRY_KDF_PBKDF2, tv[tvidx].hashalgo,
+ tv[tvidx].p, tv[tvidx].plen,
+ tv[tvidx].salt, tv[tvidx].saltlen,
+ tv[tvidx].c,
+ tv[tvidx].dk, tv[tvidx].dklen);
+ if (errtxt)
+ goto failed;
+ if (tvidx >= NUM_TEST_VECTORS - 1 && !extended)
+ break;
+ }
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("kdf", GCRY_KDF_PBKDF2, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+/* Run the selftests for the KDF algorithm ALGO with the optional
+   reporting function REPORT.  */
+gpg_error_t
+_gcry_kdf_selftest (int algo, int extended, selftest_report_func_t report)
+{
+ gcry_err_code_t ec = 0;
+
+ if (algo == GCRY_KDF_PBKDF2)
+ ec = selftest_pbkdf2 (extended, report);
+ else
+ {
+ ec = GPG_ERR_UNSUPPORTED_ALGORITHM;
+ if (report)
+ report ("kdf", algo, "module", "algorithm not available");
+ }
+ return gpg_error (ec);
+}
diff --git a/comm/third_party/libgcrypt/cipher/keccak-armv7-neon.S b/comm/third_party/libgcrypt/cipher/keccak-armv7-neon.S
new file mode 100644
index 0000000000..0bec8d50a9
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/keccak-armv7-neon.S
@@ -0,0 +1,945 @@
+/* keccak-armv7-neon.S - ARMv7/NEON implementation of Keccak
+ *
+ * Copyright (C) 2015 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_NEON)
+
+/* Based on public-domain/CC0 implementation from SUPERCOP package
+ * (keccakc1024/inplace-armv7a-neon/keccak2.s)
+ *
+ * Original copyright header follows:
+ */
+
+@ The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+@ Michaël Peeters and Gilles Van Assche. For more information, feedback or
+@ questions, please refer to our website: http://keccak.noekeon.org/
+@
+@ Implementation by Ronny Van Keer, hereby denoted as "the implementer".
+@
+@ To the extent possible under law, the implementer has waived all copyright
+@ and related or neighboring rights to the source code in this file.
+@ http://creativecommons.org/publicdomain/zero/1.0/
+
+.text
+
+.syntax unified
+.fpu neon
+.arm
+
+
+.extern _gcry_keccak_round_consts_64bit;
+
+#ifdef __PIC__
+# define GET_DATA_POINTER(reg, name, rtmp) \
+ ldr reg, 1f; \
+ ldr rtmp, 2f; \
+ b 3f; \
+ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+ 2: .word name(GOT); \
+ 3: add reg, pc, reg; \
+ ldr reg, [reg, rtmp];
+#else
+# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+
+@// --- offsets in state
+.equ Aba, 0*8
+.equ Aga, 1*8
+.equ Aka, 2*8
+.equ Ama, 3*8
+.equ Asa, 4*8
+
+@// --- macros
+
+.macro KeccakThetaRhoPiChiIota argA1, argA2, argA3, argA4, argA5
+
+ @Prepare Theta
+ @Ca = Aba^Aga^Aka^Ama^Asa@
+ @Ce = Abe^Age^Ake^Ame^Ase@
+ @Ci = Abi^Agi^Aki^Ami^Asi@
+ @Co = Abo^Ago^Ako^Amo^Aso@
+ @Cu = Abu^Agu^Aku^Amu^Asu@
+ @De = Ca^ROL64(Ci, 1)@
+ @Di = Ce^ROL64(Co, 1)@
+ @Do = Ci^ROL64(Cu, 1)@
+ @Du = Co^ROL64(Ca, 1)@
+ @Da = Cu^ROL64(Ce, 1)@
+
+ veor.64 q4, q6, q7
+ veor.64 q5, q9, q10
+ veor.64 d8, d8, d9
+ veor.64 d10, d10, d11
+ veor.64 d1, d8, d16
+ veor.64 d2, d10, d17
+
+ veor.64 q4, q11, q12
+ veor.64 q5, q14, q15
+ veor.64 d8, d8, d9
+ veor.64 d10, d10, d11
+ veor.64 d3, d8, d26
+
+ vadd.u64 q4, q1, q1
+ veor.64 d4, d10, d27
+ vmov.64 d0, d5
+ vsri.64 q4, q1, #63
+
+ vadd.u64 q5, q2, q2
+ veor.64 q4, q4, q0
+ vsri.64 q5, q2, #63
+ vadd.u64 d7, d1, d1
+ veor.64 \argA2, \argA2, d8
+ veor.64 q5, q5, q1
+
+ vsri.64 d7, d1, #63
+ vshl.u64 d1, \argA2, #44
+ veor.64 \argA3, \argA3, d9
+ veor.64 d7, d7, d4
+
+ @Ba = argA1^Da@
+ @Be = ROL64((argA2^De), 44)@
+ @Bi = ROL64((argA3^Di), 43)@
+ @Bo = ROL64((argA4^Do), 21)@
+ @Bu = ROL64((argA5^Du), 14)@
+ @argA2 = Be ^((~Bi)& Bo )@
+ @argA3 = Bi ^((~Bo)& Bu )@
+ @argA4 = Bo ^((~Bu)& Ba )@
+ @argA5 = Bu ^((~Ba)& Be )@
+ @argA1 = Ba ^((~Be)& Bi )@ argA1 ^= KeccakF1600RoundConstants[i+round]@
+ vsri.64 d1, \argA2, #64-44
+ vshl.u64 d2, \argA3, #43
+ vldr.64 d0, [sp, #\argA1]
+ veor.64 \argA4, \argA4, d10
+ vsri.64 d2, \argA3, #64-43
+ vshl.u64 d3, \argA4, #21
+ veor.64 \argA5, \argA5, d11
+ veor.64 d0, d0, d7
+ vsri.64 d3, \argA4, #64-21
+ vbic.64 d5, d2, d1
+ vshl.u64 d4, \argA5, #14
+ vbic.64 \argA2, d3, d2
+ vld1.64 d6, [ip]!
+ veor.64 d5, d0
+ vsri.64 d4, \argA5, #64-14
+ veor.64 d5, d6
+ vbic.64 \argA5, d1, d0
+ vbic.64 \argA3, d4, d3
+ vbic.64 \argA4, d0, d4
+ veor.64 \argA2, d1
+ vstr.64 d5, [sp, #\argA1]
+ veor.64 \argA3, d2
+ veor.64 \argA4, d3
+ veor.64 \argA5, d4
+
+ .endm
+
+.macro KeccakThetaRhoPiChi1 argA1, argA2, argA3, argA4, argA5
+
+ @d2 = ROL64((argA1^Da), 3)@
+ @d3 = ROL64((argA2^De), 45)@
+ @d4 = ROL64((argA3^Di), 61)@
+ @d0 = ROL64((argA4^Do), 28)@
+ @d1 = ROL64((argA5^Du), 20)@
+ @argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@
+ @argA2 = Be ^((~Bi)& Bo )@
+ @argA3 = Bi ^((~Bo)& Bu )@
+ @argA4 = Bo ^((~Bu)& Ba )@
+ @argA5 = Bu ^((~Ba)& Be )@
+
+ veor.64 \argA2, \argA2, d8
+ veor.64 \argA3, \argA3, d9
+ vshl.u64 d3, \argA2, #45
+ vldr.64 d6, [sp, #\argA1]
+ vshl.u64 d4, \argA3, #61
+ veor.64 \argA4, \argA4, d10
+ vsri.64 d3, \argA2, #64-45
+ veor.64 \argA5, \argA5, d11
+ vsri.64 d4, \argA3, #64-61
+ vshl.u64 d0, \argA4, #28
+ veor.64 d6, d6, d7
+ vshl.u64 d1, \argA5, #20
+ vbic.64 \argA3, d4, d3
+ vsri.64 d0, \argA4, #64-28
+ vbic.64 \argA4, d0, d4
+ vshl.u64 d2, d6, #3
+ vsri.64 d1, \argA5, #64-20
+ veor.64 \argA4, d3
+ vsri.64 d2, d6, #64-3
+ vbic.64 \argA5, d1, d0
+ vbic.64 d6, d2, d1
+ vbic.64 \argA2, d3, d2
+ veor.64 d6, d0
+ veor.64 \argA2, d1
+ vstr.64 d6, [sp, #\argA1]
+ veor.64 \argA3, d2
+ veor.64 d5, d6
+ veor.64 \argA5, d4
+
+ .endm
+
+.macro KeccakThetaRhoPiChi2 argA1, argA2, argA3, argA4, argA5
+
+ @d4 = ROL64((argA1^Da), 18)@
+ @d0 = ROL64((argA2^De), 1)@
+ @d1 = ROL64((argA3^Di), 6)@
+ @d2 = ROL64((argA4^Do), 25)@
+ @d3 = ROL64((argA5^Du), 8)@
+ @argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@
+ @argA2 = Be ^((~Bi)& Bo )@
+ @argA3 = Bi ^((~Bo)& Bu )@
+ @argA4 = Bo ^((~Bu)& Ba )@
+ @argA5 = Bu ^((~Ba)& Be )@
+
+ veor.64 \argA3, \argA3, d9
+ veor.64 \argA4, \argA4, d10
+ vshl.u64 d1, \argA3, #6
+ vldr.64 d6, [sp, #\argA1]
+ vshl.u64 d2, \argA4, #25
+ veor.64 \argA5, \argA5, d11
+ vsri.64 d1, \argA3, #64-6
+ veor.64 \argA2, \argA2, d8
+ vsri.64 d2, \argA4, #64-25
+ vext.8 d3, \argA5, \argA5, #7
+ veor.64 d6, d6, d7
+ vbic.64 \argA3, d2, d1
+ vadd.u64 d0, \argA2, \argA2
+ vbic.64 \argA4, d3, d2
+ vsri.64 d0, \argA2, #64-1
+ vshl.u64 d4, d6, #18
+ veor.64 \argA2, d1, \argA4
+ veor.64 \argA3, d0
+ vsri.64 d4, d6, #64-18
+ vstr.64 \argA3, [sp, #\argA1]
+ veor.64 d5, \argA3
+ vbic.64 \argA5, d1, d0
+ vbic.64 \argA3, d4, d3
+ vbic.64 \argA4, d0, d4
+ veor.64 \argA3, d2
+ veor.64 \argA4, d3
+ veor.64 \argA5, d4
+
+ .endm
+
+.macro KeccakThetaRhoPiChi3 argA1, argA2, argA3, argA4, argA5
+
+ @d1 = ROL64((argA1^Da), 36)@
+ @d2 = ROL64((argA2^De), 10)@
+ @d3 = ROL64((argA3^Di), 15)@
+ @d4 = ROL64((argA4^Do), 56)@
+ @d0 = ROL64((argA5^Du), 27)@
+ @argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@
+ @argA2 = Be ^((~Bi)& Bo )@
+ @argA3 = Bi ^((~Bo)& Bu )@
+ @argA4 = Bo ^((~Bu)& Ba )@
+ @argA5 = Bu ^((~Ba)& Be )@
+
+ veor.64 \argA2, \argA2, d8
+ veor.64 \argA3, \argA3, d9
+ vshl.u64 d2, \argA2, #10
+ vldr.64 d6, [sp, #\argA1]
+ vshl.u64 d3, \argA3, #15
+ veor.64 \argA4, \argA4, d10
+ vsri.64 d2, \argA2, #64-10
+ vsri.64 d3, \argA3, #64-15
+ veor.64 \argA5, \argA5, d11
+ vext.8 d4, \argA4, \argA4, #1
+ vbic.64 \argA2, d3, d2
+ vshl.u64 d0, \argA5, #27
+ veor.64 d6, d6, d7
+ vbic.64 \argA3, d4, d3
+ vsri.64 d0, \argA5, #64-27
+ vshl.u64 d1, d6, #36
+ veor.64 \argA3, d2
+ vbic.64 \argA4, d0, d4
+ vsri.64 d1, d6, #64-36
+
+ veor.64 \argA4, d3
+ vbic.64 d6, d2, d1
+ vbic.64 \argA5, d1, d0
+ veor.64 d6, d0
+ veor.64 \argA2, d1
+ vstr.64 d6, [sp, #\argA1]
+ veor.64 d5, d6
+ veor.64 \argA5, d4
+
+ .endm
+
+.macro KeccakThetaRhoPiChi4 argA1, argA2, argA3, argA4, argA5
+
+ @d3 = ROL64((argA1^Da), 41)@
+ @d4 = ROL64((argA2^De), 2)@
+ @d0 = ROL64((argA3^Di), 62)@
+ @d1 = ROL64((argA4^Do), 55)@
+ @d2 = ROL64((argA5^Du), 39)@
+ @argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@
+ @argA2 = Be ^((~Bi)& Bo )@
+ @argA3 = Bi ^((~Bo)& Bu )@
+ @argA4 = Bo ^((~Bu)& Ba )@
+ @argA5 = Bu ^((~Ba)& Be )@
+
+ veor.64 \argA2, \argA2, d8
+ veor.64 \argA3, \argA3, d9
+ vshl.u64 d4, \argA2, #2
+ veor.64 \argA5, \argA5, d11
+ vshl.u64 d0, \argA3, #62
+ vldr.64 d6, [sp, #\argA1]
+ vsri.64 d4, \argA2, #64-2
+ veor.64 \argA4, \argA4, d10
+ vsri.64 d0, \argA3, #64-62
+
+ vshl.u64 d1, \argA4, #55
+ veor.64 d6, d6, d7
+ vshl.u64 d2, \argA5, #39
+ vsri.64 d1, \argA4, #64-55
+ vbic.64 \argA4, d0, d4
+ vsri.64 d2, \argA5, #64-39
+ vbic.64 \argA2, d1, d0
+ vshl.u64 d3, d6, #41
+ veor.64 \argA5, d4, \argA2
+ vbic.64 \argA2, d2, d1
+ vsri.64 d3, d6, #64-41
+ veor.64 d6, d0, \argA2
+
+ vbic.64 \argA2, d3, d2
+ vbic.64 \argA3, d4, d3
+ veor.64 \argA2, d1
+ vstr.64 d6, [sp, #\argA1]
+ veor.64 d5, d6
+ veor.64 \argA3, d2
+ veor.64 \argA4, d3
+
+ .endm
+
+
+@// --- code
+
+@not callable from C!
+.p2align 3
+.type KeccakF_armv7a_neon_asm,%function;
+KeccakF_armv7a_neon_asm: @
+
+.LroundLoop:
+
+ KeccakThetaRhoPiChiIota Aba, d13, d19, d25, d31
+ KeccakThetaRhoPiChi1 Aka, d15, d21, d22, d28
+ KeccakThetaRhoPiChi2 Asa, d12, d18, d24, d30
+ KeccakThetaRhoPiChi3 Aga, d14, d20, d26, d27
+ KeccakThetaRhoPiChi4 Ama, d16, d17, d23, d29
+
+ KeccakThetaRhoPiChiIota Aba, d15, d18, d26, d29
+ KeccakThetaRhoPiChi1 Asa, d14, d17, d25, d28
+ KeccakThetaRhoPiChi2 Ama, d13, d21, d24, d27
+ KeccakThetaRhoPiChi3 Aka, d12, d20, d23, d31
+ KeccakThetaRhoPiChi4 Aga, d16, d19, d22, d30
+
+ KeccakThetaRhoPiChiIota Aba, d14, d21, d23, d30
+ KeccakThetaRhoPiChi1 Ama, d12, d19, d26, d28
+ KeccakThetaRhoPiChi2 Aga, d15, d17, d24, d31
+ KeccakThetaRhoPiChi3 Asa, d13, d20, d22, d29
+ KeccakThetaRhoPiChi4 Aka, d16, d18, d25, d27
+
+ KeccakThetaRhoPiChiIota Aba, d12, d17, d22, d27
+ KeccakThetaRhoPiChi1 Aga, d13, d18, d23, d28
+ KeccakThetaRhoPiChi2 Aka, d14, d19, d24, d29
+ ldr r0, [ip]
+ KeccakThetaRhoPiChi3 Ama, d15, d20, d25, d30
+ cmp r0, #0xFFFFFFFF
+ KeccakThetaRhoPiChi4 Asa, d16, d21, d26, d31
+
+ bne .LroundLoop
+ sub ip, #(8*24)
+ bx lr
+.p2align 2
+.ltorg
+.size KeccakF_armv7a_neon_asm,.-KeccakF_armv7a_neon_asm;
+
+
+@//unsigned _gcry_keccak_permute_armv7_neon(u64 *state) callable from C
+.p2align 3
+.global _gcry_keccak_permute_armv7_neon
+.type _gcry_keccak_permute_armv7_neon,%function;
+_gcry_keccak_permute_armv7_neon:
+
+ push {ip, lr}
+ vpush {q4-q7}
+ sub sp,sp, #5*8
+
+ vldr.64 d0, [r0, #0*8]
+ vldr.64 d12, [r0, #1*8]
+ vldr.64 d17, [r0, #2*8]
+ vldr.64 d22, [r0, #3*8]
+ vldr.64 d27, [r0, #4*8]
+
+ GET_DATA_POINTER(ip, _gcry_keccak_round_consts_64bit, lr);
+
+ vldr.64 d1, [r0, #5*8]
+ vldr.64 d13, [r0, #6*8]
+ vldr.64 d18, [r0, #7*8]
+ vldr.64 d23, [r0, #8*8]
+ vldr.64 d28, [r0, #9*8]
+
+ vldr.64 d2, [r0, #10*8]
+ vldr.64 d14, [r0, #11*8]
+ vldr.64 d19, [r0, #12*8]
+ vldr.64 d24, [r0, #13*8]
+ vldr.64 d29, [r0, #14*8]
+
+ vldr.64 d3, [r0, #15*8]
+ vldr.64 d15, [r0, #16*8]
+ vldr.64 d20, [r0, #17*8]
+ vldr.64 d25, [r0, #18*8]
+ vldr.64 d30, [r0, #19*8]
+
+ vldr.64 d4, [r0, #20*8]
+ vldr.64 d16, [r0, #21*8]
+ vldr.64 d21, [r0, #22*8]
+ vldr.64 d26, [r0, #23*8]
+ vldr.64 d31, [r0, #24*8]
+
+ vstr.64 d0, [sp, #Aba]
+ vstr.64 d1, [sp, #Aga]
+ veor.64 q0, q0, q1
+ vstr.64 d2, [sp, #Aka]
+ veor.64 d5, d0, d1
+ vstr.64 d3, [sp, #Ama]
+ mov r1, r0
+ vstr.64 d4, [sp, #Asa]
+ veor.64 d5, d5, d4
+
+ bl KeccakF_armv7a_neon_asm
+
+ vpop.64 { d0- d4 }
+
+ vstr.64 d0, [r1, #0*8]
+ vstr.64 d12, [r1, #1*8]
+ vstr.64 d17, [r1, #2*8]
+ vstr.64 d22, [r1, #3*8]
+ vstr.64 d27, [r1, #4*8]
+
+ vstr.64 d1, [r1, #5*8]
+ vstr.64 d13, [r1, #6*8]
+ vstr.64 d18, [r1, #7*8]
+ vstr.64 d23, [r1, #8*8]
+ vstr.64 d28, [r1, #9*8]
+
+ vstr.64 d2, [r1, #10*8]
+ vstr.64 d14, [r1, #11*8]
+ vstr.64 d19, [r1, #12*8]
+ vstr.64 d24, [r1, #13*8]
+ vstr.64 d29, [r1, #14*8]
+
+ vstr.64 d3, [r1, #15*8]
+ vstr.64 d15, [r1, #16*8]
+ vstr.64 d20, [r1, #17*8]
+ vstr.64 d25, [r1, #18*8]
+ vstr.64 d30, [r1, #19*8]
+
+ vstr.64 d4, [r1, #20*8]
+ vstr.64 d16, [r1, #21*8]
+ vstr.64 d21, [r1, #22*8]
+ vstr.64 d26, [r1, #23*8]
+ vstr.64 d31, [r1, #24*8]
+
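+	@ return the stack burn depth that the C wrapper passes to _gcry_burn_stack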
+ mov r0, #112
+ vpop {q4-q7}
+ pop {ip, pc}
+.p2align 2
+.ltorg
+.size _gcry_keccak_permute_armv7_neon,.-_gcry_keccak_permute_armv7_neon;
+
+@//unsigned _gcry_keccak_absorb_lanes64_armv7_neon(u64 *state, @r4
+@ int pos, @r1
+@ const byte *lanes, @r2
+@ unsigned int nlanes, @r3
+@ int blocklanes) @ r5 callable from C
+.p2align 3
+.global _gcry_keccak_absorb_lanes64_armv7_neon
+.type _gcry_keccak_absorb_lanes64_armv7_neon,%function;
+_gcry_keccak_absorb_lanes64_armv7_neon:
+
+ cmp r3, #0 @ nlanes == 0
+ itt eq
+ moveq r0, #0
+ bxeq lr
+
+ push {r4-r5, ip, lr}
+ beq .Lout
+ mov r4, r0
+ ldr r5, [sp, #(4*4)]
+ vpush {q4-q7}
+
+ @ load state
+ vldr.64 d0, [r4, #0*8]
+ vldr.64 d12, [r4, #1*8]
+ vldr.64 d17, [r4, #2*8]
+ vldr.64 d22, [r4, #3*8]
+ vldr.64 d27, [r4, #4*8]
+
+ GET_DATA_POINTER(ip, _gcry_keccak_round_consts_64bit, lr);
+
+ vldr.64 d1, [r4, #5*8]
+ vldr.64 d13, [r4, #6*8]
+ vldr.64 d18, [r4, #7*8]
+ vldr.64 d23, [r4, #8*8]
+ vldr.64 d28, [r4, #9*8]
+
+ vldr.64 d2, [r4, #10*8]
+ vldr.64 d14, [r4, #11*8]
+ vldr.64 d19, [r4, #12*8]
+ vldr.64 d24, [r4, #13*8]
+ vldr.64 d29, [r4, #14*8]
+
+ vldr.64 d3, [r4, #15*8]
+ vldr.64 d15, [r4, #16*8]
+ vldr.64 d20, [r4, #17*8]
+ vldr.64 d25, [r4, #18*8]
+ vldr.64 d30, [r4, #19*8]
+
+ vldr.64 d4, [r4, #20*8]
+ vldr.64 d16, [r4, #21*8]
+ vldr.64 d21, [r4, #22*8]
+ vldr.64 d26, [r4, #23*8]
+ vldr.64 d31, [r4, #24*8]
+
+.Lmain_loop:
+
+ @ detect absorb mode (full blocks vs lanes)
+
+ cmp r1, #0 @ pos != 0
+ bne .Llanes_loop
+
+.Lmain_loop_pos0:
+
+ @ full blocks mode
+
+ @ switch (blocksize)
+ cmp r5, #21
+ beq .Lfull_block_21
+ cmp r5, #18
+ beq .Lfull_block_18
+ cmp r5, #17
+ beq .Lfull_block_17
+ cmp r5, #13
+ beq .Lfull_block_13
+ cmp r5, #9
+ beq .Lfull_block_9
+
+ @ unknown blocksize
+ b .Llanes_loop
+
+.Lfull_block_21:
+
+ @ SHAKE128
+
+ cmp r3, #21 @ nlanes < blocklanes
+ blo .Llanes_loop
+
+ sub sp,sp, #5*8
+
+ vld1.64 {d5-d8}, [r2]!
+ veor d0, d5
+ vld1.64 {d9-d11}, [r2]!
+ veor d12, d6
+ veor d17, d7
+ veor d22, d8
+ vld1.64 {d5-d8}, [r2]!
+ veor d27, d9
+
+ veor d1, d10
+ veor d13, d11
+ vld1.64 {d9-d11}, [r2]!
+ veor d18, d5
+ veor d23, d6
+ veor d28, d7
+
+ veor d2, d8
+ vld1.64 {d5-d8}, [r2]!
+ veor d14, d9
+ veor d19, d10
+ veor d24, d11
+ vld1.64 {d9-d11}, [r2]!
+ veor d29, d5
+
+ veor d3, d6
+ veor d15, d7
+ veor d20, d8
+ veor d25, d9
+ veor d30, d10
+
+ veor d4, d11
+
+ vstr.64 d0, [sp, #Aba]
+ vstr.64 d1, [sp, #Aga]
+ veor.64 q0, q0, q1
+ vstr.64 d2, [sp, #Aka]
+ veor.64 d5, d0, d1
+ vstr.64 d3, [sp, #Ama]
+ vstr.64 d4, [sp, #Asa]
+ veor.64 d5, d5, d4
+
+ bl KeccakF_armv7a_neon_asm
+
+ subs r3, #21 @ nlanes -= 21
+ vpop.64 { d0-d4 }
+
+ beq .Ldone
+
+ b .Lfull_block_21
+
+.Lfull_block_18:
+
+ @ SHA3-224
+
+ cmp r3, #18 @ nlanes < blocklanes
+ blo .Llanes_loop
+
+ sub sp,sp, #5*8
+
+ vld1.64 {d5-d8}, [r2]!
+ veor d0, d5
+ vld1.64 {d9-d11}, [r2]!
+ veor d12, d6
+ veor d17, d7
+ veor d22, d8
+ vld1.64 {d5-d8}, [r2]!
+ veor d27, d9
+
+ veor d1, d10
+ veor d13, d11
+ vld1.64 {d9-d11}, [r2]!
+ veor d18, d5
+ veor d23, d6
+ veor d28, d7
+
+ veor d2, d8
+ vld1.64 {d5-d8}, [r2]!
+ veor d14, d9
+ veor d19, d10
+ veor d24, d11
+ veor d29, d5
+
+ veor d3, d6
+ veor d15, d7
+ veor d20, d8
+
+ vstr.64 d0, [sp, #Aba]
+ vstr.64 d1, [sp, #Aga]
+ veor.64 q0, q0, q1
+ vstr.64 d2, [sp, #Aka]
+ veor.64 d5, d0, d1
+ vstr.64 d3, [sp, #Ama]
+ vstr.64 d4, [sp, #Asa]
+ veor.64 d5, d5, d4
+
+ bl KeccakF_armv7a_neon_asm
+
+ subs r3, #18 @ nlanes -= 18
+ vpop.64 { d0-d4 }
+
+ beq .Ldone
+
+ b .Lfull_block_18
+
+.Lfull_block_17:
+
+ @ SHA3-256 & SHAKE256
+
+ cmp r3, #17 @ nlanes < blocklanes
+ blo .Llanes_loop
+
+ sub sp,sp, #5*8
+
+ vld1.64 {d5-d8}, [r2]!
+ veor d0, d5
+ vld1.64 {d9-d11}, [r2]!
+ veor d12, d6
+ veor d17, d7
+ veor d22, d8
+ vld1.64 {d5-d8}, [r2]!
+ veor d27, d9
+
+ veor d1, d10
+ veor d13, d11
+ vld1.64 {d9-d11}, [r2]!
+ veor d18, d5
+ veor d23, d6
+ veor d28, d7
+
+ veor d2, d8
+ vld1.64 {d5-d7}, [r2]!
+ veor d14, d9
+ veor d19, d10
+ veor d24, d11
+ veor d29, d5
+
+ veor d3, d6
+ veor d15, d7
+
+ vstr.64 d0, [sp, #Aba]
+ vstr.64 d1, [sp, #Aga]
+ veor.64 q0, q0, q1
+ vstr.64 d2, [sp, #Aka]
+ veor.64 d5, d0, d1
+ vstr.64 d3, [sp, #Ama]
+ vstr.64 d4, [sp, #Asa]
+ veor.64 d5, d5, d4
+
+ bl KeccakF_armv7a_neon_asm
+
+ subs r3, #17 @ nlanes -= 17
+ vpop.64 { d0-d4 }
+
+ beq .Ldone
+
+ b .Lfull_block_17
+
+.Lfull_block_13:
+
+ @ SHA3-384
+
+ cmp r3, #13 @ nlanes < blocklanes
+ blo .Llanes_loop
+
+ sub sp,sp, #5*8
+
+ vld1.64 {d5-d8}, [r2]!
+ veor d0, d5
+ vld1.64 {d9-d11}, [r2]!
+ veor d12, d6
+ veor d17, d7
+ veor d22, d8
+ vld1.64 {d5-d8}, [r2]!
+ veor d27, d9
+
+ veor d1, d10
+ veor d13, d11
+ vld1.64 {d9-d10}, [r2]!
+ veor d18, d5
+ veor d23, d6
+ veor d28, d7
+
+ veor d2, d8
+ veor d14, d9
+ veor d19, d10
+
+ vstr.64 d0, [sp, #Aba]
+ vstr.64 d1, [sp, #Aga]
+ veor.64 q0, q0, q1
+ vstr.64 d2, [sp, #Aka]
+ veor.64 d5, d0, d1
+ vstr.64 d3, [sp, #Ama]
+ vstr.64 d4, [sp, #Asa]
+ veor.64 d5, d5, d4
+
+ bl KeccakF_armv7a_neon_asm
+
+ subs r3, #13 @ nlanes -= 13
+ vpop.64 { d0-d4 }
+
+ beq .Ldone
+
+ b .Lfull_block_13
+
+.Lfull_block_9:
+
+ @ SHA3-512
+
+ cmp r3, #9 @ nlanes < blocklanes
+ blo .Llanes_loop
+
+ sub sp,sp, #5*8
+
+ vld1.64 {d5-d8}, [r2]!
+ veor d0, d5
+ vld1.64 {d9-d11}, [r2]!
+ veor d12, d6
+ veor d17, d7
+ veor d22, d8
+ vld1.64 {d5-d6}, [r2]!
+ veor d27, d9
+
+ veor d1, d10
+ veor d13, d11
+ veor d18, d5
+ veor d23, d6
+
+ vstr.64 d0, [sp, #Aba]
+ vstr.64 d1, [sp, #Aga]
+ veor.64 q0, q0, q1
+ vstr.64 d2, [sp, #Aka]
+ veor.64 d5, d0, d1
+ vstr.64 d3, [sp, #Ama]
+ vstr.64 d4, [sp, #Asa]
+ veor.64 d5, d5, d4
+
+ bl KeccakF_armv7a_neon_asm
+
+ subs r3, #9 @ nlanes -= 9
+ vpop.64 { d0-d4 }
+
+ beq .Ldone
+
+ b .Lfull_block_9
+
+.Llanes_loop:
+
+ @ per-lane mode
+
+ @ switch (pos)
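+	@ load the byte offset for lane 'pos' from the table below and add it,
+	@ scaled by 4, to pc to branch to the matching .LlaneN handler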
+ ldrb r0, [pc, r1]
+ add pc, pc, r0, lsl #2
+.Lswitch_table:
+ .byte (.Llane0-.Lswitch_table-4)/4
+ .byte (.Llane1-.Lswitch_table-4)/4
+ .byte (.Llane2-.Lswitch_table-4)/4
+ .byte (.Llane3-.Lswitch_table-4)/4
+ .byte (.Llane4-.Lswitch_table-4)/4
+ .byte (.Llane5-.Lswitch_table-4)/4
+ .byte (.Llane6-.Lswitch_table-4)/4
+ .byte (.Llane7-.Lswitch_table-4)/4
+ .byte (.Llane8-.Lswitch_table-4)/4
+ .byte (.Llane9-.Lswitch_table-4)/4
+ .byte (.Llane10-.Lswitch_table-4)/4
+ .byte (.Llane11-.Lswitch_table-4)/4
+ .byte (.Llane12-.Lswitch_table-4)/4
+ .byte (.Llane13-.Lswitch_table-4)/4
+ .byte (.Llane14-.Lswitch_table-4)/4
+ .byte (.Llane15-.Lswitch_table-4)/4
+ .byte (.Llane16-.Lswitch_table-4)/4
+ .byte (.Llane17-.Lswitch_table-4)/4
+ .byte (.Llane18-.Lswitch_table-4)/4
+ .byte (.Llane19-.Lswitch_table-4)/4
+ .byte (.Llane20-.Lswitch_table-4)/4
+ .byte (.Llane21-.Lswitch_table-4)/4
+ .byte (.Llane22-.Lswitch_table-4)/4
+ .byte (.Llane23-.Lswitch_table-4)/4
+ .byte (.Llane24-.Lswitch_table-4)/4
+.p2align 2
+
+#define ABSORB_LANE(label, vreg) \
+ label: \
+ add r1, #1; \
+ vld1.64 d5, [r2]!; \
+ cmp r1, r5; /* pos == blocklanes */ \
+ veor vreg, vreg, d5; \
+ beq .Llanes_permute; \
+ subs r3, #1; \
+ beq .Ldone;
+
+ ABSORB_LANE(.Llane0, d0)
+ ABSORB_LANE(.Llane1, d12)
+ ABSORB_LANE(.Llane2, d17)
+ ABSORB_LANE(.Llane3, d22)
+ ABSORB_LANE(.Llane4, d27)
+
+ ABSORB_LANE(.Llane5, d1)
+ ABSORB_LANE(.Llane6, d13)
+ ABSORB_LANE(.Llane7, d18)
+ ABSORB_LANE(.Llane8, d23)
+ ABSORB_LANE(.Llane9, d28)
+
+ ABSORB_LANE(.Llane10, d2)
+ ABSORB_LANE(.Llane11, d14)
+ ABSORB_LANE(.Llane12, d19)
+ ABSORB_LANE(.Llane13, d24)
+ ABSORB_LANE(.Llane14, d29)
+
+ ABSORB_LANE(.Llane15, d3)
+ ABSORB_LANE(.Llane16, d15)
+ ABSORB_LANE(.Llane17, d20)
+ ABSORB_LANE(.Llane18, d25)
+ ABSORB_LANE(.Llane19, d30)
+
+ ABSORB_LANE(.Llane20, d4)
+ ABSORB_LANE(.Llane21, d16)
+ ABSORB_LANE(.Llane22, d21)
+ ABSORB_LANE(.Llane23, d26)
+ ABSORB_LANE(.Llane24, d31)
+
+ b .Llanes_loop
+
+.Llanes_permute:
+
+ sub sp,sp, #5*8
+ vstr.64 d0, [sp, #Aba]
+ vstr.64 d1, [sp, #Aga]
+ veor.64 q0, q0, q1
+ vstr.64 d2, [sp, #Aka]
+ veor.64 d5, d0, d1
+ vstr.64 d3, [sp, #Ama]
+ vstr.64 d4, [sp, #Asa]
+ veor.64 d5, d5, d4
+
+ bl KeccakF_armv7a_neon_asm
+
+ mov r1, #0 @ pos <= 0
+ subs r3, #1
+
+ vpop.64 { d0-d4 }
+
+ beq .Ldone
+
+ b .Lmain_loop_pos0
+
+.Ldone:
+
+ @ save state
+ vstr.64 d0, [r4, #0*8]
+ vstr.64 d12, [r4, #1*8]
+ vstr.64 d17, [r4, #2*8]
+ vstr.64 d22, [r4, #3*8]
+ vstr.64 d27, [r4, #4*8]
+
+ vstr.64 d1, [r4, #5*8]
+ vstr.64 d13, [r4, #6*8]
+ vstr.64 d18, [r4, #7*8]
+ vstr.64 d23, [r4, #8*8]
+ vstr.64 d28, [r4, #9*8]
+
+ vstr.64 d2, [r4, #10*8]
+ vstr.64 d14, [r4, #11*8]
+ vstr.64 d19, [r4, #12*8]
+ vstr.64 d24, [r4, #13*8]
+ vstr.64 d29, [r4, #14*8]
+
+ vstr.64 d3, [r4, #15*8]
+ vstr.64 d15, [r4, #16*8]
+ vstr.64 d20, [r4, #17*8]
+ vstr.64 d25, [r4, #18*8]
+ vstr.64 d30, [r4, #19*8]
+
+ vstr.64 d4, [r4, #20*8]
+ vstr.64 d16, [r4, #21*8]
+ vstr.64 d21, [r4, #22*8]
+ vstr.64 d26, [r4, #23*8]
+ vstr.64 d31, [r4, #24*8]
+
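+	@ return the stack burn depth that the C wrapper passes to _gcry_burn_stack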
+ mov r0, #120
+ vpop {q4-q7}
+.Lout:
+ pop {r4-r5, ip, pc}
+.p2align 2
+.ltorg
+.size _gcry_keccak_absorb_lanes64_armv7_neon,.-_gcry_keccak_absorb_lanes64_armv7_neon;
+
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/keccak.c b/comm/third_party/libgcrypt/cipher/keccak.c
new file mode 100644
index 0000000000..795a02e5b9
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/keccak.c
@@ -0,0 +1,1577 @@
+/* keccak.c - SHA3 hash functions
+ * Copyright (C) 2015 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include <config.h>
+#include <string.h>
+#include "g10lib.h"
+#include "bithelp.h"
+#include "bufhelp.h"
+#include "cipher.h"
+#include "hash-common.h"
+
+
+
+/* USE_64BIT indicates whether to use 64-bit generic implementation.
+ * USE_32BIT indicates whether to use 32-bit generic implementation. */
+#undef USE_64BIT
+#if defined(__x86_64__) || SIZEOF_UNSIGNED_LONG == 8
+# define USE_64BIT 1
+#else
+# define USE_32BIT 1
+#endif
+
+
+/* USE_64BIT_BMI2 indicates whether to compile with 64-bit Intel BMI2 code. */
+#undef USE_64BIT_BMI2
+#if defined(USE_64BIT) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \
+ defined(HAVE_CPU_ARCH_X86)
+# define USE_64BIT_BMI2 1
+#endif
+
+
+/* USE_64BIT_SHLD indicates whether to compile with 64-bit Intel SHLD code. */
+#undef USE_64BIT_SHLD
+#if defined(USE_64BIT) && defined (__GNUC__) && defined(__x86_64__) && \
+ defined(HAVE_CPU_ARCH_X86)
+# define USE_64BIT_SHLD 1
+#endif
+
+
+/* USE_32BIT_BMI2 indicates whether to compile with 32-bit Intel BMI2 code. */
+#undef USE_32BIT_BMI2
+#if defined(USE_32BIT) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \
+ defined(HAVE_CPU_ARCH_X86)
+# define USE_32BIT_BMI2 1
+#endif
+
+
+/* USE_64BIT_ARM_NEON indicates whether to enable 64-bit ARM/NEON assembly
+ * code. */
+#undef USE_64BIT_ARM_NEON
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_NEON)
+# define USE_64BIT_ARM_NEON 1
+# endif
+#endif /*ENABLE_NEON_SUPPORT*/
+
+
+/* USE_S390X_CRYPTO indicates whether to enable zSeries code. */
+#undef USE_S390X_CRYPTO
+#if defined(HAVE_GCC_INLINE_ASM_S390X)
+# define USE_S390X_CRYPTO 1
+#endif /* USE_S390X_CRYPTO */
+
+
+#if defined(USE_64BIT) || defined(USE_64BIT_ARM_NEON)
+# define NEED_COMMON64 1
+#endif
+
+#ifdef USE_32BIT
+# define NEED_COMMON32BI 1
+#endif
+
+
+#define SHA3_DELIMITED_SUFFIX 0x06
+#define SHAKE_DELIMITED_SUFFIX 0x1F
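+
+/* The delimited suffix encodes the domain separation bits followed by the
+ * first '1' bit of the pad10*1 padding, LSB first: SHA-3 appends the bits
+ * "01" (0x06) and SHAKE appends "1111" (0x1F).  */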
+
+
+typedef struct
+{
+ union {
+#ifdef NEED_COMMON64
+ u64 state64[25];
+#endif
+#ifdef NEED_COMMON32BI
+ u32 state32bi[50];
+#endif
+ } u;
+} KECCAK_STATE;
+
+
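+/* Backend dispatch table: keccak_init() picks one of the implementations
+ * below based on the detected hardware features.  */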
+typedef struct
+{
+ unsigned int (*permute)(KECCAK_STATE *hd);
+ unsigned int (*absorb)(KECCAK_STATE *hd, int pos, const byte *lanes,
+ unsigned int nlanes, int blocklanes);
+ unsigned int (*extract) (KECCAK_STATE *hd, unsigned int pos, byte *outbuf,
+ unsigned int outlen);
+} keccak_ops_t;
+
+
+typedef struct KECCAK_CONTEXT_S
+{
+ KECCAK_STATE state;
+ unsigned int outlen;
+ unsigned int blocksize;
+ unsigned int count;
+ unsigned int suffix;
+ const keccak_ops_t *ops;
+#ifdef USE_S390X_CRYPTO
+ unsigned int kimd_func;
+ unsigned int buf_pos;
+ byte buf[1344 / 8]; /* SHAKE128 requires biggest buffer, 1344 bits. */
+#endif
+} KECCAK_CONTEXT;
+
+
+
+#ifdef NEED_COMMON64
+
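+/* 24 round constants plus a terminator entry; the ARMv7/NEON assembly uses
+ * the 0xFFFF... terminator to detect the final round.  */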
+const u64 _gcry_keccak_round_consts_64bit[24 + 1] =
+{
+ U64_C(0x0000000000000001), U64_C(0x0000000000008082),
+ U64_C(0x800000000000808A), U64_C(0x8000000080008000),
+ U64_C(0x000000000000808B), U64_C(0x0000000080000001),
+ U64_C(0x8000000080008081), U64_C(0x8000000000008009),
+ U64_C(0x000000000000008A), U64_C(0x0000000000000088),
+ U64_C(0x0000000080008009), U64_C(0x000000008000000A),
+ U64_C(0x000000008000808B), U64_C(0x800000000000008B),
+ U64_C(0x8000000000008089), U64_C(0x8000000000008003),
+ U64_C(0x8000000000008002), U64_C(0x8000000000000080),
+ U64_C(0x000000000000800A), U64_C(0x800000008000000A),
+ U64_C(0x8000000080008081), U64_C(0x8000000000008080),
+ U64_C(0x0000000080000001), U64_C(0x8000000080008008),
+ U64_C(0xFFFFFFFFFFFFFFFF)
+};
+
+static unsigned int
+keccak_extract64(KECCAK_STATE *hd, unsigned int pos, byte *outbuf,
+ unsigned int outlen)
+{
+ unsigned int i;
+
+ /* NOTE: when pos == 0, hd and outbuf may point to same memory (SHA-3). */
+
+ for (i = pos; i < pos + outlen / 8 + !!(outlen % 8); i++)
+ {
+ u64 tmp = hd->u.state64[i];
+ buf_put_le64(outbuf, tmp);
+ outbuf += 8;
+ }
+
+ return 0;
+}
+
+#endif /* NEED_COMMON64 */
+
+
+#ifdef NEED_COMMON32BI
+
+static const u32 round_consts_32bit[2 * 24] =
+{
+ 0x00000001UL, 0x00000000UL, 0x00000000UL, 0x00000089UL,
+ 0x00000000UL, 0x8000008bUL, 0x00000000UL, 0x80008080UL,
+ 0x00000001UL, 0x0000008bUL, 0x00000001UL, 0x00008000UL,
+ 0x00000001UL, 0x80008088UL, 0x00000001UL, 0x80000082UL,
+ 0x00000000UL, 0x0000000bUL, 0x00000000UL, 0x0000000aUL,
+ 0x00000001UL, 0x00008082UL, 0x00000000UL, 0x00008003UL,
+ 0x00000001UL, 0x0000808bUL, 0x00000001UL, 0x8000000bUL,
+ 0x00000001UL, 0x8000008aUL, 0x00000001UL, 0x80000081UL,
+ 0x00000000UL, 0x80000081UL, 0x00000000UL, 0x80000008UL,
+ 0x00000000UL, 0x00000083UL, 0x00000000UL, 0x80008003UL,
+ 0x00000001UL, 0x80008088UL, 0x00000000UL, 0x80000088UL,
+ 0x00000001UL, 0x00008000UL, 0x00000000UL, 0x80008082UL
+};
+
+static unsigned int
+keccak_extract32bi(KECCAK_STATE *hd, unsigned int pos, byte *outbuf,
+ unsigned int outlen)
+{
+ unsigned int i;
+ u32 x0;
+ u32 x1;
+ u32 t;
+
+ /* NOTE: when pos == 0, hd and outbuf may point to same memory (SHA-3). */
+
+ for (i = pos; i < pos + outlen / 8 + !!(outlen % 8); i++)
+ {
+ x0 = hd->u.state32bi[i * 2 + 0];
+ x1 = hd->u.state32bi[i * 2 + 1];
+
+ t = (x0 & 0x0000FFFFUL) + (x1 << 16);
+ x1 = (x0 >> 16) + (x1 & 0xFFFF0000UL);
+ x0 = t;
+ t = (x0 ^ (x0 >> 8)) & 0x0000FF00UL; x0 = x0 ^ t ^ (t << 8);
+ t = (x0 ^ (x0 >> 4)) & 0x00F000F0UL; x0 = x0 ^ t ^ (t << 4);
+ t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0CUL; x0 = x0 ^ t ^ (t << 2);
+ t = (x0 ^ (x0 >> 1)) & 0x22222222UL; x0 = x0 ^ t ^ (t << 1);
+ t = (x1 ^ (x1 >> 8)) & 0x0000FF00UL; x1 = x1 ^ t ^ (t << 8);
+ t = (x1 ^ (x1 >> 4)) & 0x00F000F0UL; x1 = x1 ^ t ^ (t << 4);
+ t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0CUL; x1 = x1 ^ t ^ (t << 2);
+ t = (x1 ^ (x1 >> 1)) & 0x22222222UL; x1 = x1 ^ t ^ (t << 1);
+
+ buf_put_le32(&outbuf[0], x0);
+ buf_put_le32(&outbuf[4], x1);
+ outbuf += 8;
+ }
+
+ return 0;
+}
+
+static inline void
+keccak_absorb_lane32bi(u32 *lane, u32 x0, u32 x1)
+{
+ u32 t;
+
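+  /* Gather the even-numbered bits of each 32-bit half into its low 16 bits
+   * and the odd-numbered bits into its high 16 bits, then XOR the even bits
+   * into lane[0] and the odd bits into lane[1] (bit-interleaved storage).  */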
+ t = (x0 ^ (x0 >> 1)) & 0x22222222UL; x0 = x0 ^ t ^ (t << 1);
+ t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0CUL; x0 = x0 ^ t ^ (t << 2);
+ t = (x0 ^ (x0 >> 4)) & 0x00F000F0UL; x0 = x0 ^ t ^ (t << 4);
+ t = (x0 ^ (x0 >> 8)) & 0x0000FF00UL; x0 = x0 ^ t ^ (t << 8);
+ t = (x1 ^ (x1 >> 1)) & 0x22222222UL; x1 = x1 ^ t ^ (t << 1);
+ t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0CUL; x1 = x1 ^ t ^ (t << 2);
+ t = (x1 ^ (x1 >> 4)) & 0x00F000F0UL; x1 = x1 ^ t ^ (t << 4);
+ t = (x1 ^ (x1 >> 8)) & 0x0000FF00UL; x1 = x1 ^ t ^ (t << 8);
+ lane[0] ^= (x0 & 0x0000FFFFUL) + (x1 << 16);
+ lane[1] ^= (x0 >> 16) + (x1 & 0xFFFF0000UL);
+}
+
+#endif /* NEED_COMMON32BI */
+
+
+/* Construct generic 64-bit implementation. */
+#ifdef USE_64BIT
+
+#if __GNUC__ >= 4 && defined(__x86_64__)
+
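+/* On x86-64, XOR input lanes into the state with unaligned SSE2 loads and
+ * stores, two 64-bit lanes per XMM register.  */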
+static inline void absorb_lanes64_8(u64 *dst, const byte *in)
+{
+ asm ("movdqu 0*16(%[dst]), %%xmm0\n\t"
+ "movdqu 0*16(%[in]), %%xmm4\n\t"
+ "movdqu 1*16(%[dst]), %%xmm1\n\t"
+ "movdqu 1*16(%[in]), %%xmm5\n\t"
+ "movdqu 2*16(%[dst]), %%xmm2\n\t"
+ "movdqu 3*16(%[dst]), %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu 2*16(%[in]), %%xmm4\n\t"
+ "movdqu 3*16(%[in]), %%xmm5\n\t"
+ "movdqu %%xmm0, 0*16(%[dst])\n\t"
+ "pxor %%xmm4, %%xmm2\n\t"
+ "movdqu %%xmm1, 1*16(%[dst])\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqu %%xmm2, 2*16(%[dst])\n\t"
+ "movdqu %%xmm3, 3*16(%[dst])\n\t"
+ :
+ : [dst] "r" (dst), [in] "r" (in)
+ : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
+}
+
+static inline void absorb_lanes64_4(u64 *dst, const byte *in)
+{
+ asm ("movdqu 0*16(%[dst]), %%xmm0\n\t"
+ "movdqu 0*16(%[in]), %%xmm4\n\t"
+ "movdqu 1*16(%[dst]), %%xmm1\n\t"
+ "movdqu 1*16(%[in]), %%xmm5\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu %%xmm0, 0*16(%[dst])\n\t"
+ "movdqu %%xmm1, 1*16(%[dst])\n\t"
+ :
+ : [dst] "r" (dst), [in] "r" (in)
+ : "xmm0", "xmm1", "xmm4", "xmm5", "memory");
+}
+
+static inline void absorb_lanes64_2(u64 *dst, const byte *in)
+{
+ asm ("movdqu 0*16(%[dst]), %%xmm0\n\t"
+ "movdqu 0*16(%[in]), %%xmm4\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+ "movdqu %%xmm0, 0*16(%[dst])\n\t"
+ :
+ : [dst] "r" (dst), [in] "r" (in)
+ : "xmm0", "xmm4", "memory");
+}
+
+#else /* __x86_64__ */
+
+static inline void absorb_lanes64_8(u64 *dst, const byte *in)
+{
+ dst[0] ^= buf_get_le64(in + 8 * 0);
+ dst[1] ^= buf_get_le64(in + 8 * 1);
+ dst[2] ^= buf_get_le64(in + 8 * 2);
+ dst[3] ^= buf_get_le64(in + 8 * 3);
+ dst[4] ^= buf_get_le64(in + 8 * 4);
+ dst[5] ^= buf_get_le64(in + 8 * 5);
+ dst[6] ^= buf_get_le64(in + 8 * 6);
+ dst[7] ^= buf_get_le64(in + 8 * 7);
+}
+
+static inline void absorb_lanes64_4(u64 *dst, const byte *in)
+{
+ dst[0] ^= buf_get_le64(in + 8 * 0);
+ dst[1] ^= buf_get_le64(in + 8 * 1);
+ dst[2] ^= buf_get_le64(in + 8 * 2);
+ dst[3] ^= buf_get_le64(in + 8 * 3);
+}
+
+static inline void absorb_lanes64_2(u64 *dst, const byte *in)
+{
+ dst[0] ^= buf_get_le64(in + 8 * 0);
+ dst[1] ^= buf_get_le64(in + 8 * 1);
+}
+
+#endif /* !__x86_64__ */
+
+static inline void absorb_lanes64_1(u64 *dst, const byte *in)
+{
+ dst[0] ^= buf_get_le64(in + 8 * 0);
+}
+
+
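+/* Portable helpers: ANDN64(x, y) computes ~x & y, and ROL64 rotates left
+ * with the shift counts masked so that a rotation by 0 is well defined.  */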
+# define ANDN64(x, y) (~(x) & (y))
+# define ROL64(x, n) (((x) << ((unsigned int)n & 63)) | \
+ ((x) >> ((64 - (unsigned int)(n)) & 63)))
+
+# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64
+# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64
+# include "keccak_permute_64.h"
+
+# undef ANDN64
+# undef ROL64
+# undef KECCAK_F1600_PERMUTE_FUNC_NAME
+# undef KECCAK_F1600_ABSORB_FUNC_NAME
+
+static const keccak_ops_t keccak_generic64_ops =
+{
+ .permute = keccak_f1600_state_permute64,
+ .absorb = keccak_absorb_lanes64,
+ .extract = keccak_extract64,
+};
+
+#endif /* USE_64BIT */
+
+
+/* Construct 64-bit Intel SHLD implementation. */
+#ifdef USE_64BIT_SHLD
+
+# define ANDN64(x, y) (~(x) & (y))
+# define ROL64(x, n) ({ \
+ u64 tmp = (x); \
+ asm ("shldq %1, %0, %0" \
+ : "+r" (tmp) \
+ : "J" ((n) & 63) \
+ : "cc"); \
+ tmp; })
+
+# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_shld
+# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64_shld
+# include "keccak_permute_64.h"
+
+# undef ANDN64
+# undef ROL64
+# undef KECCAK_F1600_PERMUTE_FUNC_NAME
+# undef KECCAK_F1600_ABSORB_FUNC_NAME
+
+static const keccak_ops_t keccak_shld_64_ops =
+{
+ .permute = keccak_f1600_state_permute64_shld,
+ .absorb = keccak_absorb_lanes64_shld,
+ .extract = keccak_extract64,
+};
+
+#endif /* USE_64BIT_SHLD */
+
+
+/* Construct 64-bit Intel BMI2 implementation. */
+#ifdef USE_64BIT_BMI2
+
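+/* BMI2 variants: ANDN64 maps to the 'andn' instruction and ROL64 uses
+ * 'rorx' with a rotate-right count of 64 - n.  */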
+# define ANDN64(x, y) ({ \
+ u64 tmp; \
+ asm ("andnq %2, %1, %0" \
+ : "=r" (tmp) \
+ : "r0" (x), "rm" (y)); \
+ tmp; })
+
+# define ROL64(x, n) ({ \
+ u64 tmp; \
+ asm ("rorxq %2, %1, %0" \
+ : "=r" (tmp) \
+ : "rm0" (x), "J" (64 - ((n) & 63))); \
+ tmp; })
+
+# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_bmi2
+# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64_bmi2
+# include "keccak_permute_64.h"
+
+# undef ANDN64
+# undef ROL64
+# undef KECCAK_F1600_PERMUTE_FUNC_NAME
+# undef KECCAK_F1600_ABSORB_FUNC_NAME
+
+static const keccak_ops_t keccak_bmi2_64_ops =
+{
+ .permute = keccak_f1600_state_permute64_bmi2,
+ .absorb = keccak_absorb_lanes64_bmi2,
+ .extract = keccak_extract64,
+};
+
+#endif /* USE_64BIT_BMI2 */
+
+
+/* 64-bit ARMv7/NEON implementation. */
+#ifdef USE_64BIT_ARM_NEON
+
+unsigned int _gcry_keccak_permute_armv7_neon(u64 *state);
+unsigned int _gcry_keccak_absorb_lanes64_armv7_neon(u64 *state, int pos,
+ const byte *lanes,
+ unsigned int nlanes,
+ int blocklanes);
+
+static unsigned int keccak_permute64_armv7_neon(KECCAK_STATE *hd)
+{
+ return _gcry_keccak_permute_armv7_neon(hd->u.state64);
+}
+
+static unsigned int
+keccak_absorb_lanes64_armv7_neon(KECCAK_STATE *hd, int pos, const byte *lanes,
+ unsigned int nlanes, int blocklanes)
+{
+ if (blocklanes < 0)
+ {
+ /* blocklanes == -1, permutationless absorb from keccak_final. */
+
+ while (nlanes)
+ {
+ hd->u.state64[pos] ^= buf_get_le64(lanes);
+ lanes += 8;
+ nlanes--;
+ }
+
+ return 0;
+ }
+ else
+ {
+ return _gcry_keccak_absorb_lanes64_armv7_neon(hd->u.state64, pos, lanes,
+ nlanes, blocklanes);
+ }
+}
+
+static const keccak_ops_t keccak_armv7_neon_64_ops =
+{
+ .permute = keccak_permute64_armv7_neon,
+ .absorb = keccak_absorb_lanes64_armv7_neon,
+ .extract = keccak_extract64,
+};
+
+#endif /* USE_64BIT_ARM_NEON */
+
+
+/* Construct generic 32-bit implementation. */
+#ifdef USE_32BIT
+
+# define ANDN32(x, y) (~(x) & (y))
+# define ROL32(x, n) (((x) << ((unsigned int)n & 31)) | \
+ ((x) >> ((32 - (unsigned int)(n)) & 31)))
+
+# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute32bi
+# include "keccak_permute_32.h"
+
+# undef ANDN32
+# undef ROL32
+# undef KECCAK_F1600_PERMUTE_FUNC_NAME
+
+static unsigned int
+keccak_absorb_lanes32bi(KECCAK_STATE *hd, int pos, const byte *lanes,
+ unsigned int nlanes, int blocklanes)
+{
+ unsigned int burn = 0;
+
+ while (nlanes)
+ {
+ keccak_absorb_lane32bi(&hd->u.state32bi[pos * 2],
+ buf_get_le32(lanes + 0),
+ buf_get_le32(lanes + 4));
+ lanes += 8;
+ nlanes--;
+
+ if (++pos == blocklanes)
+ {
+ burn = keccak_f1600_state_permute32bi(hd);
+ pos = 0;
+ }
+ }
+
+ return burn;
+}
+
+static const keccak_ops_t keccak_generic32bi_ops =
+{
+ .permute = keccak_f1600_state_permute32bi,
+ .absorb = keccak_absorb_lanes32bi,
+ .extract = keccak_extract32bi,
+};
+
+#endif /* USE_32BIT */
+
+
+/* Construct 32-bit Intel BMI2 implementation. */
+#ifdef USE_32BIT_BMI2
+
+# define ANDN32(x, y) ({ \
+ u32 tmp; \
+ asm ("andnl %2, %1, %0" \
+ : "=r" (tmp) \
+ : "r0" (x), "rm" (y)); \
+ tmp; })
+
+# define ROL32(x, n) ({ \
+ u32 tmp; \
+ asm ("rorxl %2, %1, %0" \
+ : "=r" (tmp) \
+ : "rm0" (x), "J" (32 - ((n) & 31))); \
+ tmp; })
+
+# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute32bi_bmi2
+# include "keccak_permute_32.h"
+
+# undef ANDN32
+# undef ROL32
+# undef KECCAK_F1600_PERMUTE_FUNC_NAME
+
+static inline u32 pext(u32 x, u32 mask)
+{
+ u32 tmp;
+ asm ("pextl %2, %1, %0" : "=r" (tmp) : "r0" (x), "rm" (mask));
+ return tmp;
+}
+
+static inline u32 pdep(u32 x, u32 mask)
+{
+ u32 tmp;
+ asm ("pdepl %2, %1, %0" : "=r" (tmp) : "r0" (x), "rm" (mask));
+ return tmp;
+}
+
+static inline void
+keccak_absorb_lane32bi_bmi2(u32 *lane, u32 x0, u32 x1)
+{
+ x0 = pdep(pext(x0, 0x55555555), 0x0000ffff) | (pext(x0, 0xaaaaaaaa) << 16);
+ x1 = pdep(pext(x1, 0x55555555), 0x0000ffff) | (pext(x1, 0xaaaaaaaa) << 16);
+
+ lane[0] ^= (x0 & 0x0000FFFFUL) + (x1 << 16);
+ lane[1] ^= (x0 >> 16) + (x1 & 0xFFFF0000UL);
+}
+
+static unsigned int
+keccak_absorb_lanes32bi_bmi2(KECCAK_STATE *hd, int pos, const byte *lanes,
+ unsigned int nlanes, int blocklanes)
+{
+ unsigned int burn = 0;
+
+ while (nlanes)
+ {
+ keccak_absorb_lane32bi_bmi2(&hd->u.state32bi[pos * 2],
+ buf_get_le32(lanes + 0),
+ buf_get_le32(lanes + 4));
+ lanes += 8;
+ nlanes--;
+
+ if (++pos == blocklanes)
+ {
+ burn = keccak_f1600_state_permute32bi_bmi2(hd);
+ pos = 0;
+ }
+ }
+
+ return burn;
+}
+
+static unsigned int
+keccak_extract32bi_bmi2(KECCAK_STATE *hd, unsigned int pos, byte *outbuf,
+ unsigned int outlen)
+{
+ unsigned int i;
+ u32 x0;
+ u32 x1;
+ u32 t;
+
+ /* NOTE: when pos == 0, hd and outbuf may point to same memory (SHA-3). */
+
+ for (i = pos; i < pos + outlen / 8 + !!(outlen % 8); i++)
+ {
+ x0 = hd->u.state32bi[i * 2 + 0];
+ x1 = hd->u.state32bi[i * 2 + 1];
+
+ t = (x0 & 0x0000FFFFUL) + (x1 << 16);
+ x1 = (x0 >> 16) + (x1 & 0xFFFF0000UL);
+ x0 = t;
+
+ x0 = pdep(pext(x0, 0xffff0001), 0xaaaaaaab) | pdep(x0 >> 1, 0x55555554);
+ x1 = pdep(pext(x1, 0xffff0001), 0xaaaaaaab) | pdep(x1 >> 1, 0x55555554);
+
+ buf_put_le32(&outbuf[0], x0);
+ buf_put_le32(&outbuf[4], x1);
+ outbuf += 8;
+ }
+
+ return 0;
+}
+
+static const keccak_ops_t keccak_bmi2_32bi_ops =
+{
+ .permute = keccak_f1600_state_permute32bi_bmi2,
+ .absorb = keccak_absorb_lanes32bi_bmi2,
+ .extract = keccak_extract32bi_bmi2,
+};
+
+#endif /* USE_32BIT_BMI2 */
+
+
+#ifdef USE_S390X_CRYPTO
+#include "asm-inline-s390x.h"
+
+static inline void
+keccak_bwrite_s390x (void *context, const byte *in, size_t inlen)
+{
+ KECCAK_CONTEXT *ctx = context;
+
+ /* Write full-blocks. */
+ kimd_execute (ctx->kimd_func, &ctx->state, in, inlen);
+ return;
+}
+
+static inline void
+keccak_final_s390x (void *context)
+{
+ KECCAK_CONTEXT *ctx = context;
+
+ if (ctx->suffix == SHA3_DELIMITED_SUFFIX)
+ {
+ klmd_execute (ctx->kimd_func, &ctx->state, ctx->buf, ctx->count);
+ }
+ else
+ {
+ klmd_shake_execute (ctx->kimd_func, &ctx->state, NULL, 0, ctx->buf,
+ ctx->count);
+ ctx->count = 0;
+ ctx->buf_pos = 0;
+ }
+
+ return;
+}
+
+static inline void
+keccak_bextract_s390x (void *context, byte *out, size_t outlen)
+{
+ KECCAK_CONTEXT *ctx = context;
+
+ /* Extract full-blocks. */
+ klmd_shake_execute (ctx->kimd_func | KLMD_PADDING_STATE, &ctx->state,
+ out, outlen, NULL, 0);
+ return;
+}
+
+static void
+keccak_write_s390x (void *context, const byte *inbuf, size_t inlen)
+{
+ KECCAK_CONTEXT *hd = context;
+ const size_t blocksize = hd->blocksize;
+ size_t inblocks;
+ size_t copylen;
+
+ while (hd->count)
+ {
+ if (hd->count == blocksize) /* Flush the buffer. */
+ {
+ keccak_bwrite_s390x (hd, hd->buf, blocksize);
+ hd->count = 0;
+ }
+ else
+ {
+ copylen = inlen;
+ if (copylen > blocksize - hd->count)
+ copylen = blocksize - hd->count;
+
+ if (copylen == 0)
+ break;
+
+ buf_cpy (&hd->buf[hd->count], inbuf, copylen);
+ hd->count += copylen;
+ inbuf += copylen;
+ inlen -= copylen;
+ }
+ }
+
+ if (inlen == 0)
+ return;
+
+ if (inlen >= blocksize)
+ {
+ inblocks = inlen / blocksize;
+ keccak_bwrite_s390x (hd, inbuf, inblocks * blocksize);
+ hd->count = 0;
+ inlen -= inblocks * blocksize;
+ inbuf += inblocks * blocksize;
+ }
+
+ if (inlen)
+ {
+ buf_cpy (hd->buf, inbuf, inlen);
+ hd->count = inlen;
+ }
+}
+
+static void
+keccak_extract_s390x (void *context, void *outbuf_arg, size_t outlen)
+{
+ KECCAK_CONTEXT *hd = context;
+ const size_t blocksize = hd->blocksize;
+ byte *outbuf = outbuf_arg;
+
+ while (outlen)
+ {
+ gcry_assert(hd->count == 0 || hd->buf_pos < hd->count);
+
+ if (hd->buf_pos < hd->count && outlen)
+ {
+ size_t copylen = hd->count - hd->buf_pos;
+
+ if (copylen > outlen)
+ copylen = outlen;
+
+ buf_cpy (outbuf, &hd->buf[hd->buf_pos], copylen);
+
+ outbuf += copylen;
+ outlen -= copylen;
+ hd->buf_pos += copylen;
+ }
+
+ if (hd->buf_pos == hd->count)
+ {
+ hd->buf_pos = 0;
+ hd->count = 0;
+ }
+
+ if (outlen == 0)
+ return;
+
+ if (outlen >= blocksize)
+ {
+ size_t outblocks = outlen / blocksize;
+
+ keccak_bextract_s390x (context, outbuf, outblocks * blocksize);
+
+ outlen -= outblocks * blocksize;
+ outbuf += outblocks * blocksize;
+
+ if (outlen == 0)
+ return;
+ }
+
+ keccak_bextract_s390x (context, hd->buf, blocksize);
+ hd->count = blocksize;
+ }
+}
+#endif /* USE_S390X_CRYPTO */
+
+
+static void
+keccak_write (void *context, const void *inbuf_arg, size_t inlen)
+{
+ KECCAK_CONTEXT *ctx = context;
+ const size_t bsize = ctx->blocksize;
+ const size_t blocklanes = bsize / 8;
+ const byte *inbuf = inbuf_arg;
+ unsigned int nburn, burn = 0;
+ unsigned int count, i;
+ unsigned int pos, nlanes;
+
+#ifdef USE_S390X_CRYPTO
+ if (ctx->kimd_func)
+ {
+ keccak_write_s390x (context, inbuf, inlen);
+ return;
+ }
+#endif
+
+ count = ctx->count;
+
+ if (inlen && (count % 8))
+ {
+ byte lane[8] = { 0, };
+
+ /* Complete absorbing partial input lane. */
+
+ pos = count / 8;
+
+ for (i = count % 8; inlen && i < 8; i++)
+ {
+ lane[i] = *inbuf++;
+ inlen--;
+ count++;
+ }
+
+ if (count == bsize)
+ count = 0;
+
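+      /* Passing -1 as blocklanes keeps absorb from running the permutation
+       * while this lane is still only partially filled.  */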
+ nburn = ctx->ops->absorb(&ctx->state, pos, lane, 1,
+ (count % 8) ? -1 : blocklanes);
+ burn = nburn > burn ? nburn : burn;
+ }
+
+ /* Absorb full input lanes. */
+
+ pos = count / 8;
+ nlanes = inlen / 8;
+ if (nlanes > 0)
+ {
+ nburn = ctx->ops->absorb(&ctx->state, pos, inbuf, nlanes, blocklanes);
+ burn = nburn > burn ? nburn : burn;
+ inlen -= nlanes * 8;
+ inbuf += nlanes * 8;
+ count += nlanes * 8;
+ count = count % bsize;
+ }
+
+ if (inlen)
+ {
+ byte lane[8] = { 0, };
+
+ /* Absorb remaining partial input lane. */
+
+ pos = count / 8;
+
+ for (i = count % 8; inlen && i < 8; i++)
+ {
+ lane[i] = *inbuf++;
+ inlen--;
+ count++;
+ }
+
+ nburn = ctx->ops->absorb(&ctx->state, pos, lane, 1, -1);
+ burn = nburn > burn ? nburn : burn;
+
+ gcry_assert(count < bsize);
+ }
+
+ ctx->count = count;
+
+ if (burn)
+ _gcry_burn_stack (burn);
+}
+
+
+static void
+keccak_init (int algo, void *context, unsigned int flags)
+{
+ KECCAK_CONTEXT *ctx = context;
+ KECCAK_STATE *hd = &ctx->state;
+ unsigned int features = _gcry_get_hw_features ();
+
+ (void)flags;
+ (void)features;
+
+ memset (hd, 0, sizeof *hd);
+
+ ctx->count = 0;
+
+ /* Select generic implementation. */
+#ifdef USE_64BIT
+ ctx->ops = &keccak_generic64_ops;
+#elif defined USE_32BIT
+ ctx->ops = &keccak_generic32bi_ops;
+#endif
+
+  /* Select optimized implementation based on hw features. */
+ if (0) {}
+#ifdef USE_64BIT_ARM_NEON
+ else if (features & HWF_ARM_NEON)
+ ctx->ops = &keccak_armv7_neon_64_ops;
+#endif
+#ifdef USE_64BIT_BMI2
+ else if (features & HWF_INTEL_BMI2)
+ ctx->ops = &keccak_bmi2_64_ops;
+#endif
+#ifdef USE_32BIT_BMI2
+ else if (features & HWF_INTEL_BMI2)
+ ctx->ops = &keccak_bmi2_32bi_ops;
+#endif
+#ifdef USE_64BIT_SHLD
+ else if (features & HWF_INTEL_FAST_SHLD)
+ ctx->ops = &keccak_shld_64_ops;
+#endif
+
+  /* Set the input block size; in Keccak terms this is called the 'rate'. */
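+  /* For the SHA-3 variants the rate is 1600 minus twice the digest size in
+   * bits; SHAKE128 and SHAKE256 use rates of 1344 and 1088 bits.  */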
+
+ switch (algo)
+ {
+ case GCRY_MD_SHA3_224:
+ ctx->suffix = SHA3_DELIMITED_SUFFIX;
+ ctx->blocksize = 1152 / 8;
+ ctx->outlen = 224 / 8;
+ break;
+ case GCRY_MD_SHA3_256:
+ ctx->suffix = SHA3_DELIMITED_SUFFIX;
+ ctx->blocksize = 1088 / 8;
+ ctx->outlen = 256 / 8;
+ break;
+ case GCRY_MD_SHA3_384:
+ ctx->suffix = SHA3_DELIMITED_SUFFIX;
+ ctx->blocksize = 832 / 8;
+ ctx->outlen = 384 / 8;
+ break;
+ case GCRY_MD_SHA3_512:
+ ctx->suffix = SHA3_DELIMITED_SUFFIX;
+ ctx->blocksize = 576 / 8;
+ ctx->outlen = 512 / 8;
+ break;
+ case GCRY_MD_SHAKE128:
+ ctx->suffix = SHAKE_DELIMITED_SUFFIX;
+ ctx->blocksize = 1344 / 8;
+ ctx->outlen = 0;
+ break;
+ case GCRY_MD_SHAKE256:
+ ctx->suffix = SHAKE_DELIMITED_SUFFIX;
+ ctx->blocksize = 1088 / 8;
+ ctx->outlen = 0;
+ break;
+ default:
+ BUG();
+ }
+
+#ifdef USE_S390X_CRYPTO
+ ctx->kimd_func = 0;
+ if ((features & HWF_S390X_MSA) != 0)
+ {
+ unsigned int kimd_func = 0;
+
+ switch (algo)
+ {
+ case GCRY_MD_SHA3_224:
+ kimd_func = KMID_FUNCTION_SHA3_224;
+ break;
+ case GCRY_MD_SHA3_256:
+ kimd_func = KMID_FUNCTION_SHA3_256;
+ break;
+ case GCRY_MD_SHA3_384:
+ kimd_func = KMID_FUNCTION_SHA3_384;
+ break;
+ case GCRY_MD_SHA3_512:
+ kimd_func = KMID_FUNCTION_SHA3_512;
+ break;
+ case GCRY_MD_SHAKE128:
+ kimd_func = KMID_FUNCTION_SHAKE128;
+ break;
+ case GCRY_MD_SHAKE256:
+ kimd_func = KMID_FUNCTION_SHAKE256;
+ break;
+ }
+
+ if ((kimd_query () & km_function_to_mask (kimd_func)) &&
+ (klmd_query () & km_function_to_mask (kimd_func)))
+ {
+ ctx->kimd_func = kimd_func;
+ }
+ }
+#endif
+}
+
+static void
+sha3_224_init (void *context, unsigned int flags)
+{
+ keccak_init (GCRY_MD_SHA3_224, context, flags);
+}
+
+static void
+sha3_256_init (void *context, unsigned int flags)
+{
+ keccak_init (GCRY_MD_SHA3_256, context, flags);
+}
+
+static void
+sha3_384_init (void *context, unsigned int flags)
+{
+ keccak_init (GCRY_MD_SHA3_384, context, flags);
+}
+
+static void
+sha3_512_init (void *context, unsigned int flags)
+{
+ keccak_init (GCRY_MD_SHA3_512, context, flags);
+}
+
+static void
+shake128_init (void *context, unsigned int flags)
+{
+ keccak_init (GCRY_MD_SHAKE128, context, flags);
+}
+
+static void
+shake256_init (void *context, unsigned int flags)
+{
+ keccak_init (GCRY_MD_SHAKE256, context, flags);
+}
+
+/* The routine final terminates the computation and
+ * returns the digest.
+ * The handle is prepared for a new cycle, but adding bytes to the
+ * handle will destroy the returned buffer.
+ * For the SHA-3 variants the digest ('outlen' bytes) can then be read
+ * with keccak_read(); for the SHAKE variants the output is squeezed out
+ * afterwards with keccak_extract().
+ */
+static void
+keccak_final (void *context)
+{
+ KECCAK_CONTEXT *ctx = context;
+ KECCAK_STATE *hd = &ctx->state;
+ const size_t bsize = ctx->blocksize;
+ const byte suffix = ctx->suffix;
+ unsigned int nburn, burn = 0;
+ unsigned int lastbytes;
+ byte lane[8];
+
+#ifdef USE_S390X_CRYPTO
+ if (ctx->kimd_func)
+ {
+ keccak_final_s390x (context);
+ return;
+ }
+#endif
+
+ lastbytes = ctx->count;
+
+ /* Do the padding and switch to the squeezing phase */
+
+ /* Absorb the last few bits and add the first bit of padding (which
+ coincides with the delimiter in delimited suffix) */
+ buf_put_le64(lane, (u64)suffix << ((lastbytes % 8) * 8));
+ nburn = ctx->ops->absorb(&ctx->state, lastbytes / 8, lane, 1, -1);
+ burn = nburn > burn ? nburn : burn;
+
+ /* Add the second bit of padding. */
+ buf_put_le64(lane, (u64)0x80 << (((bsize - 1) % 8) * 8));
+ nburn = ctx->ops->absorb(&ctx->state, (bsize - 1) / 8, lane, 1, -1);
+ burn = nburn > burn ? nburn : burn;
+
+ if (suffix == SHA3_DELIMITED_SUFFIX)
+ {
+ /* Switch to the squeezing phase. */
+ nburn = ctx->ops->permute(hd);
+ burn = nburn > burn ? nburn : burn;
+
+ /* Squeeze out the SHA3 digest. */
+ nburn = ctx->ops->extract(hd, 0, (void *)hd, ctx->outlen);
+ burn = nburn > burn ? nburn : burn;
+ }
+ else
+ {
+ /* Output for SHAKE can now be read with md_extract(). */
+
+ ctx->count = 0;
+ }
+
+ wipememory(lane, sizeof(lane));
+ if (burn)
+ _gcry_burn_stack (burn);
+}
+
+
+static byte *
+keccak_read (void *context)
+{
+ KECCAK_CONTEXT *ctx = (KECCAK_CONTEXT *) context;
+ KECCAK_STATE *hd = &ctx->state;
+ return (byte *)&hd->u;
+}
+
+
+static void
+keccak_extract (void *context, void *out, size_t outlen)
+{
+ KECCAK_CONTEXT *ctx = context;
+ KECCAK_STATE *hd = &ctx->state;
+ const size_t bsize = ctx->blocksize;
+ unsigned int nburn, burn = 0;
+ byte *outbuf = out;
+ unsigned int nlanes;
+ unsigned int nleft;
+ unsigned int count;
+ unsigned int i;
+ byte lane[8];
+
+#ifdef USE_S390X_CRYPTO
+ if (ctx->kimd_func)
+ {
+ keccak_extract_s390x (context, out, outlen);
+ return;
+ }
+#endif
+
+ count = ctx->count;
+
+ while (count && outlen && (outlen < 8 || count % 8))
+ {
+ /* Extract partial lane. */
+ nburn = ctx->ops->extract(hd, count / 8, lane, 8);
+ burn = nburn > burn ? nburn : burn;
+
+ for (i = count % 8; outlen && i < 8; i++)
+ {
+ *outbuf++ = lane[i];
+ outlen--;
+ count++;
+ }
+
+ gcry_assert(count <= bsize);
+
+ if (count == bsize)
+ count = 0;
+ }
+
+ if (outlen >= 8 && count)
+ {
+ /* Extract tail of partial block. */
+ nlanes = outlen / 8;
+ nleft = (bsize - count) / 8;
+ nlanes = nlanes < nleft ? nlanes : nleft;
+
+ nburn = ctx->ops->extract(hd, count / 8, outbuf, nlanes * 8);
+ burn = nburn > burn ? nburn : burn;
+ outlen -= nlanes * 8;
+ outbuf += nlanes * 8;
+ count += nlanes * 8;
+
+ gcry_assert(count <= bsize);
+
+ if (count == bsize)
+ count = 0;
+ }
+
+ while (outlen >= bsize)
+ {
+ gcry_assert(count == 0);
+
+ /* Squeeze more. */
+ nburn = ctx->ops->permute(hd);
+ burn = nburn > burn ? nburn : burn;
+
+ /* Extract full block. */
+ nburn = ctx->ops->extract(hd, 0, outbuf, bsize);
+ burn = nburn > burn ? nburn : burn;
+
+ outlen -= bsize;
+ outbuf += bsize;
+ }
+
+ if (outlen)
+ {
+ gcry_assert(outlen < bsize);
+
+ if (count == 0)
+ {
+ /* Squeeze more. */
+ nburn = ctx->ops->permute(hd);
+ burn = nburn > burn ? nburn : burn;
+ }
+
+ if (outlen >= 8)
+ {
+ /* Extract head of partial block. */
+ nlanes = outlen / 8;
+ nburn = ctx->ops->extract(hd, count / 8, outbuf, nlanes * 8);
+ burn = nburn > burn ? nburn : burn;
+ outlen -= nlanes * 8;
+ outbuf += nlanes * 8;
+ count += nlanes * 8;
+
+ gcry_assert(count < bsize);
+ }
+
+ if (outlen)
+ {
+ /* Extract head of partial lane. */
+ nburn = ctx->ops->extract(hd, count / 8, lane, 8);
+ burn = nburn > burn ? nburn : burn;
+
+ for (i = count % 8; outlen && i < 8; i++)
+ {
+ *outbuf++ = lane[i];
+ outlen--;
+ count++;
+ }
+
+ gcry_assert(count < bsize);
+ }
+ }
+
+ ctx->count = count;
+
+ if (burn)
+ _gcry_burn_stack (burn);
+}
+
+
+/* Shortcut function which puts the hash value of the supplied buffer
+ * into outbuf, which must have a size of 'spec->mdlen' bytes. */
+static void
+_gcry_sha3_hash_buffer (void *outbuf, const void *buffer, size_t length,
+ const gcry_md_spec_t *spec)
+{
+ KECCAK_CONTEXT hd;
+
+ spec->init (&hd, 0);
+ keccak_write (&hd, buffer, length);
+ keccak_final (&hd);
+ memcpy (outbuf, keccak_read (&hd), spec->mdlen);
+}
+
+
+/* Variant of the above shortcut function using multiple buffers. */
+static void
+_gcry_sha3_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt,
+ const gcry_md_spec_t *spec)
+{
+ KECCAK_CONTEXT hd;
+
+ spec->init (&hd, 0);
+ for (;iovcnt > 0; iov++, iovcnt--)
+ keccak_write (&hd, (const char*)iov[0].data + iov[0].off, iov[0].len);
+ keccak_final (&hd);
+ memcpy (outbuf, keccak_read (&hd), spec->mdlen);
+}
+
+
+static void
+_gcry_sha3_224_hash_buffer (void *outbuf, const void *buffer, size_t length)
+{
+ _gcry_sha3_hash_buffer (outbuf, buffer, length, &_gcry_digest_spec_sha3_224);
+}
+
+static void
+_gcry_sha3_256_hash_buffer (void *outbuf, const void *buffer, size_t length)
+{
+ _gcry_sha3_hash_buffer (outbuf, buffer, length, &_gcry_digest_spec_sha3_256);
+}
+
+static void
+_gcry_sha3_384_hash_buffer (void *outbuf, const void *buffer, size_t length)
+{
+ _gcry_sha3_hash_buffer (outbuf, buffer, length, &_gcry_digest_spec_sha3_384);
+}
+
+static void
+_gcry_sha3_512_hash_buffer (void *outbuf, const void *buffer, size_t length)
+{
+ _gcry_sha3_hash_buffer (outbuf, buffer, length, &_gcry_digest_spec_sha3_512);
+}
+
+static void
+_gcry_sha3_224_hash_buffers (void *outbuf, const gcry_buffer_t *iov,
+ int iovcnt)
+{
+ _gcry_sha3_hash_buffers (outbuf, iov, iovcnt, &_gcry_digest_spec_sha3_224);
+}
+
+static void
+_gcry_sha3_256_hash_buffers (void *outbuf, const gcry_buffer_t *iov,
+ int iovcnt)
+{
+ _gcry_sha3_hash_buffers (outbuf, iov, iovcnt, &_gcry_digest_spec_sha3_256);
+}
+
+static void
+_gcry_sha3_384_hash_buffers (void *outbuf, const gcry_buffer_t *iov,
+ int iovcnt)
+{
+ _gcry_sha3_hash_buffers (outbuf, iov, iovcnt, &_gcry_digest_spec_sha3_384);
+}
+
+static void
+_gcry_sha3_512_hash_buffers (void *outbuf, const gcry_buffer_t *iov,
+ int iovcnt)
+{
+ _gcry_sha3_hash_buffers (outbuf, iov, iovcnt, &_gcry_digest_spec_sha3_512);
+}
+
+
+/*
+ Self-test section.
+ */
+
+
+static gpg_err_code_t
+selftests_keccak (int algo, int extended, selftest_report_func_t report)
+{
+ const char *what;
+ const char *errtxt;
+ const char *short_hash;
+ const char *long_hash;
+ const char *one_million_a_hash;
+ int hash_len;
+
+ switch (algo)
+ {
+ default:
+ BUG();
+
+ case GCRY_MD_SHA3_224:
+ short_hash =
+ "\xe6\x42\x82\x4c\x3f\x8c\xf2\x4a\xd0\x92\x34\xee\x7d\x3c\x76\x6f"
+ "\xc9\xa3\xa5\x16\x8d\x0c\x94\xad\x73\xb4\x6f\xdf";
+ long_hash =
+ "\x54\x3e\x68\x68\xe1\x66\x6c\x1a\x64\x36\x30\xdf\x77\x36\x7a\xe5"
+ "\xa6\x2a\x85\x07\x0a\x51\xc1\x4c\xbf\x66\x5c\xbc";
+ one_million_a_hash =
+ "\xd6\x93\x35\xb9\x33\x25\x19\x2e\x51\x6a\x91\x2e\x6d\x19\xa1\x5c"
+ "\xb5\x1c\x6e\xd5\xc1\x52\x43\xe7\xa7\xfd\x65\x3c";
+ hash_len = 28;
+ break;
+
+ case GCRY_MD_SHA3_256:
+ short_hash =
+ "\x3a\x98\x5d\xa7\x4f\xe2\x25\xb2\x04\x5c\x17\x2d\x6b\xd3\x90\xbd"
+ "\x85\x5f\x08\x6e\x3e\x9d\x52\x5b\x46\xbf\xe2\x45\x11\x43\x15\x32";
+ long_hash =
+ "\x91\x6f\x60\x61\xfe\x87\x97\x41\xca\x64\x69\xb4\x39\x71\xdf\xdb"
+ "\x28\xb1\xa3\x2d\xc3\x6c\xb3\x25\x4e\x81\x2b\xe2\x7a\xad\x1d\x18";
+ one_million_a_hash =
+ "\x5c\x88\x75\xae\x47\x4a\x36\x34\xba\x4f\xd5\x5e\xc8\x5b\xff\xd6"
+ "\x61\xf3\x2a\xca\x75\xc6\xd6\x99\xd0\xcd\xcb\x6c\x11\x58\x91\xc1";
+ hash_len = 32;
+ break;
+
+ case GCRY_MD_SHA3_384:
+ short_hash =
+ "\xec\x01\x49\x82\x88\x51\x6f\xc9\x26\x45\x9f\x58\xe2\xc6\xad\x8d"
+ "\xf9\xb4\x73\xcb\x0f\xc0\x8c\x25\x96\xda\x7c\xf0\xe4\x9b\xe4\xb2"
+ "\x98\xd8\x8c\xea\x92\x7a\xc7\xf5\x39\xf1\xed\xf2\x28\x37\x6d\x25";
+ long_hash =
+ "\x79\x40\x7d\x3b\x59\x16\xb5\x9c\x3e\x30\xb0\x98\x22\x97\x47\x91"
+ "\xc3\x13\xfb\x9e\xcc\x84\x9e\x40\x6f\x23\x59\x2d\x04\xf6\x25\xdc"
+ "\x8c\x70\x9b\x98\xb4\x3b\x38\x52\xb3\x37\x21\x61\x79\xaa\x7f\xc7";
+ one_million_a_hash =
+ "\xee\xe9\xe2\x4d\x78\xc1\x85\x53\x37\x98\x34\x51\xdf\x97\xc8\xad"
+ "\x9e\xed\xf2\x56\xc6\x33\x4f\x8e\x94\x8d\x25\x2d\x5e\x0e\x76\x84"
+ "\x7a\xa0\x77\x4d\xdb\x90\xa8\x42\x19\x0d\x2c\x55\x8b\x4b\x83\x40";
+ hash_len = 48;
+ break;
+
+ case GCRY_MD_SHA3_512:
+ short_hash =
+ "\xb7\x51\x85\x0b\x1a\x57\x16\x8a\x56\x93\xcd\x92\x4b\x6b\x09\x6e"
+ "\x08\xf6\x21\x82\x74\x44\xf7\x0d\x88\x4f\x5d\x02\x40\xd2\x71\x2e"
+ "\x10\xe1\x16\xe9\x19\x2a\xf3\xc9\x1a\x7e\xc5\x76\x47\xe3\x93\x40"
+ "\x57\x34\x0b\x4c\xf4\x08\xd5\xa5\x65\x92\xf8\x27\x4e\xec\x53\xf0";
+ long_hash =
+ "\xaf\xeb\xb2\xef\x54\x2e\x65\x79\xc5\x0c\xad\x06\xd2\xe5\x78\xf9"
+ "\xf8\xdd\x68\x81\xd7\xdc\x82\x4d\x26\x36\x0f\xee\xbf\x18\xa4\xfa"
+ "\x73\xe3\x26\x11\x22\x94\x8e\xfc\xfd\x49\x2e\x74\xe8\x2e\x21\x89"
+ "\xed\x0f\xb4\x40\xd1\x87\xf3\x82\x27\x0c\xb4\x55\xf2\x1d\xd1\x85";
+ one_million_a_hash =
+ "\x3c\x3a\x87\x6d\xa1\x40\x34\xab\x60\x62\x7c\x07\x7b\xb9\x8f\x7e"
+ "\x12\x0a\x2a\x53\x70\x21\x2d\xff\xb3\x38\x5a\x18\xd4\xf3\x88\x59"
+ "\xed\x31\x1d\x0a\x9d\x51\x41\xce\x9c\xc5\xc6\x6e\xe6\x89\xb2\x66"
+ "\xa8\xaa\x18\xac\xe8\x28\x2a\x0e\x0d\xb5\x96\xc9\x0b\x0a\x7b\x87";
+ hash_len = 64;
+ break;
+
+ case GCRY_MD_SHAKE128:
+ short_hash =
+ "\x58\x81\x09\x2d\xd8\x18\xbf\x5c\xf8\xa3\xdd\xb7\x93\xfb\xcb\xa7"
+ "\x40\x97\xd5\xc5\x26\xa6\xd3\x5f\x97\xb8\x33\x51\x94\x0f\x2c\xc8";
+ long_hash =
+ "\x7b\x6d\xf6\xff\x18\x11\x73\xb6\xd7\x89\x8d\x7f\xf6\x3f\xb0\x7b"
+ "\x7c\x23\x7d\xaf\x47\x1a\x5a\xe5\x60\x2a\xdb\xcc\xef\x9c\xcf\x4b";
+ one_million_a_hash =
+ "\x9d\x22\x2c\x79\xc4\xff\x9d\x09\x2c\xf6\xca\x86\x14\x3a\xa4\x11"
+ "\xe3\x69\x97\x38\x08\xef\x97\x09\x32\x55\x82\x6c\x55\x72\xef\x58";
+ hash_len = 32;
+ break;
+
+ case GCRY_MD_SHAKE256:
+ short_hash =
+ "\x48\x33\x66\x60\x13\x60\xa8\x77\x1c\x68\x63\x08\x0c\xc4\x11\x4d"
+ "\x8d\xb4\x45\x30\xf8\xf1\xe1\xee\x4f\x94\xea\x37\xe7\x8b\x57\x39";
+ long_hash =
+ "\x98\xbe\x04\x51\x6c\x04\xcc\x73\x59\x3f\xef\x3e\xd0\x35\x2e\xa9"
+ "\xf6\x44\x39\x42\xd6\x95\x0e\x29\xa3\x72\xa6\x81\xc3\xde\xaf\x45";
+ one_million_a_hash =
+ "\x35\x78\xa7\xa4\xca\x91\x37\x56\x9c\xdf\x76\xed\x61\x7d\x31\xbb"
+ "\x99\x4f\xca\x9c\x1b\xbf\x8b\x18\x40\x13\xde\x82\x34\xdf\xd1\x3a";
+ hash_len = 32;
+ break;
+ }
+
+ what = "short string";
+ errtxt = _gcry_hash_selftest_check_one (algo, 0, "abc", 3, short_hash,
+ hash_len);
+ if (errtxt)
+ goto failed;
+
+ if (extended)
+ {
+ what = "long string";
+ errtxt = _gcry_hash_selftest_check_one
+ (algo, 0,
+ "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmn"
+ "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", 112,
+ long_hash, hash_len);
+ if (errtxt)
+ goto failed;
+
+ what = "one million \"a\"";
+ errtxt = _gcry_hash_selftest_check_one (algo, 1, NULL, 0,
+ one_million_a_hash, hash_len);
+ if (errtxt)
+ goto failed;
+ }
+
+ return 0; /* Succeeded. */
+
+failed:
+ if (report)
+ report ("digest", algo, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+/* Run a full self-test for ALGO and return 0 on success. */
+static gpg_err_code_t
+run_selftests (int algo, int extended, selftest_report_func_t report)
+{
+ gpg_err_code_t ec;
+
+ switch (algo)
+ {
+ case GCRY_MD_SHA3_224:
+ case GCRY_MD_SHA3_256:
+ case GCRY_MD_SHA3_384:
+ case GCRY_MD_SHA3_512:
+ case GCRY_MD_SHAKE128:
+ case GCRY_MD_SHAKE256:
+ ec = selftests_keccak (algo, extended, report);
+ break;
+ default:
+ ec = GPG_ERR_DIGEST_ALGO;
+ break;
+ }
+
+ return ec;
+}
+
+
+
+
+static byte sha3_224_asn[] = { 0x30 };
+static gcry_md_oid_spec_t oid_spec_sha3_224[] =
+ {
+ { "2.16.840.1.101.3.4.2.7" },
+ /* PKCS#1 sha3_224WithRSAEncryption */
+ { "?" },
+ { NULL }
+ };
+static byte sha3_256_asn[] = { 0x30 };
+static gcry_md_oid_spec_t oid_spec_sha3_256[] =
+ {
+ { "2.16.840.1.101.3.4.2.8" },
+ /* PKCS#1 sha3_256WithRSAEncryption */
+ { "?" },
+ { NULL }
+ };
+static byte sha3_384_asn[] = { 0x30 };
+static gcry_md_oid_spec_t oid_spec_sha3_384[] =
+ {
+ { "2.16.840.1.101.3.4.2.9" },
+ /* PKCS#1 sha3_384WithRSAEncryption */
+ { "?" },
+ { NULL }
+ };
+static byte sha3_512_asn[] = { 0x30 };
+static gcry_md_oid_spec_t oid_spec_sha3_512[] =
+ {
+ { "2.16.840.1.101.3.4.2.10" },
+ /* PKCS#1 sha3_512WithRSAEncryption */
+ { "?" },
+ { NULL }
+ };
+static byte shake128_asn[] = { 0x30 };
+static gcry_md_oid_spec_t oid_spec_shake128[] =
+ {
+ { "2.16.840.1.101.3.4.2.11" },
+ /* PKCS#1 shake128WithRSAEncryption */
+ { "?" },
+ { NULL }
+ };
+static byte shake256_asn[] = { 0x30 };
+static gcry_md_oid_spec_t oid_spec_shake256[] =
+ {
+ { "2.16.840.1.101.3.4.2.12" },
+ /* PKCS#1 shake256WithRSAEncryption */
+ { "?" },
+ { NULL }
+ };
+
+gcry_md_spec_t _gcry_digest_spec_sha3_224 =
+ {
+ GCRY_MD_SHA3_224, {0, 1},
+ "SHA3-224", sha3_224_asn, DIM (sha3_224_asn), oid_spec_sha3_224, 28,
+ sha3_224_init, keccak_write, keccak_final, keccak_read, NULL,
+ _gcry_sha3_224_hash_buffer, _gcry_sha3_224_hash_buffers,
+ sizeof (KECCAK_CONTEXT),
+ run_selftests
+ };
+gcry_md_spec_t _gcry_digest_spec_sha3_256 =
+ {
+ GCRY_MD_SHA3_256, {0, 1},
+ "SHA3-256", sha3_256_asn, DIM (sha3_256_asn), oid_spec_sha3_256, 32,
+ sha3_256_init, keccak_write, keccak_final, keccak_read, NULL,
+ _gcry_sha3_256_hash_buffer, _gcry_sha3_256_hash_buffers,
+ sizeof (KECCAK_CONTEXT),
+ run_selftests
+ };
+gcry_md_spec_t _gcry_digest_spec_sha3_384 =
+ {
+ GCRY_MD_SHA3_384, {0, 1},
+ "SHA3-384", sha3_384_asn, DIM (sha3_384_asn), oid_spec_sha3_384, 48,
+ sha3_384_init, keccak_write, keccak_final, keccak_read, NULL,
+ _gcry_sha3_384_hash_buffer, _gcry_sha3_384_hash_buffers,
+ sizeof (KECCAK_CONTEXT),
+ run_selftests
+ };
+gcry_md_spec_t _gcry_digest_spec_sha3_512 =
+ {
+ GCRY_MD_SHA3_512, {0, 1},
+ "SHA3-512", sha3_512_asn, DIM (sha3_512_asn), oid_spec_sha3_512, 64,
+ sha3_512_init, keccak_write, keccak_final, keccak_read, NULL,
+ _gcry_sha3_512_hash_buffer, _gcry_sha3_512_hash_buffers,
+ sizeof (KECCAK_CONTEXT),
+ run_selftests
+ };
+gcry_md_spec_t _gcry_digest_spec_shake128 =
+ {
+ GCRY_MD_SHAKE128, {0, 1},
+ "SHAKE128", shake128_asn, DIM (shake128_asn), oid_spec_shake128, 0,
+ shake128_init, keccak_write, keccak_final, NULL, keccak_extract,
+ NULL, NULL,
+ sizeof (KECCAK_CONTEXT),
+ run_selftests
+ };
+gcry_md_spec_t _gcry_digest_spec_shake256 =
+ {
+ GCRY_MD_SHAKE256, {0, 1},
+ "SHAKE256", shake256_asn, DIM (shake256_asn), oid_spec_shake256, 0,
+ shake256_init, keccak_write, keccak_final, NULL, keccak_extract,
+ NULL, NULL,
+ sizeof (KECCAK_CONTEXT),
+ run_selftests
+ };
diff --git a/comm/third_party/libgcrypt/cipher/keccak_permute_32.h b/comm/third_party/libgcrypt/cipher/keccak_permute_32.h
new file mode 100644
index 0000000000..1ce42a42fc
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/keccak_permute_32.h
@@ -0,0 +1,536 @@
+/* keccak_permute_32.h - Keccak permute function (simple 32bit bit-interleaved)
+ * Copyright (C) 2015 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* The code is based on public-domain/CC0 "keccakc1024/simple32bi/
+ * Keccak-simple32BI.c" implementation by Ronny Van Keer from SUPERCOP toolkit
+ * package.
+ */
+
+/* Function that computes the Keccak-f[1600] permutation on the given state. */
+static unsigned int
+KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
+{
+ const u32 *round_consts = round_consts_32bit;
+ const u32 *round_consts_end = round_consts_32bit + 2 * 24;
+ u32 Aba0, Abe0, Abi0, Abo0, Abu0;
+ u32 Aba1, Abe1, Abi1, Abo1, Abu1;
+ u32 Aga0, Age0, Agi0, Ago0, Agu0;
+ u32 Aga1, Age1, Agi1, Ago1, Agu1;
+ u32 Aka0, Ake0, Aki0, Ako0, Aku0;
+ u32 Aka1, Ake1, Aki1, Ako1, Aku1;
+ u32 Ama0, Ame0, Ami0, Amo0, Amu0;
+ u32 Ama1, Ame1, Ami1, Amo1, Amu1;
+ u32 Asa0, Ase0, Asi0, Aso0, Asu0;
+ u32 Asa1, Ase1, Asi1, Aso1, Asu1;
+ u32 BCa0, BCe0, BCi0, BCo0, BCu0;
+ u32 BCa1, BCe1, BCi1, BCo1, BCu1;
+ u32 Da0, De0, Di0, Do0, Du0;
+ u32 Da1, De1, Di1, Do1, Du1;
+ u32 Eba0, Ebe0, Ebi0, Ebo0, Ebu0;
+ u32 Eba1, Ebe1, Ebi1, Ebo1, Ebu1;
+ u32 Ega0, Ege0, Egi0, Ego0, Egu0;
+ u32 Ega1, Ege1, Egi1, Ego1, Egu1;
+ u32 Eka0, Eke0, Eki0, Eko0, Eku0;
+ u32 Eka1, Eke1, Eki1, Eko1, Eku1;
+ u32 Ema0, Eme0, Emi0, Emo0, Emu0;
+ u32 Ema1, Eme1, Emi1, Emo1, Emu1;
+ u32 Esa0, Ese0, Esi0, Eso0, Esu0;
+ u32 Esa1, Ese1, Esi1, Eso1, Esu1;
+ u32 *state = hd->u.state32bi;
+
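+  /* Bit-interleaved layout: state[2*i] holds the even-numbered bits and
+   * state[2*i + 1] the odd-numbered bits of 64-bit lane i, so each 64-bit
+   * rotation is carried out as two 32-bit rotations.  */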
+ Aba0 = state[0];
+ Aba1 = state[1];
+ Abe0 = state[2];
+ Abe1 = state[3];
+ Abi0 = state[4];
+ Abi1 = state[5];
+ Abo0 = state[6];
+ Abo1 = state[7];
+ Abu0 = state[8];
+ Abu1 = state[9];
+ Aga0 = state[10];
+ Aga1 = state[11];
+ Age0 = state[12];
+ Age1 = state[13];
+ Agi0 = state[14];
+ Agi1 = state[15];
+ Ago0 = state[16];
+ Ago1 = state[17];
+ Agu0 = state[18];
+ Agu1 = state[19];
+ Aka0 = state[20];
+ Aka1 = state[21];
+ Ake0 = state[22];
+ Ake1 = state[23];
+ Aki0 = state[24];
+ Aki1 = state[25];
+ Ako0 = state[26];
+ Ako1 = state[27];
+ Aku0 = state[28];
+ Aku1 = state[29];
+ Ama0 = state[30];
+ Ama1 = state[31];
+ Ame0 = state[32];
+ Ame1 = state[33];
+ Ami0 = state[34];
+ Ami1 = state[35];
+ Amo0 = state[36];
+ Amo1 = state[37];
+ Amu0 = state[38];
+ Amu1 = state[39];
+ Asa0 = state[40];
+ Asa1 = state[41];
+ Ase0 = state[42];
+ Ase1 = state[43];
+ Asi0 = state[44];
+ Asi1 = state[45];
+ Aso0 = state[46];
+ Aso1 = state[47];
+ Asu0 = state[48];
+ Asu1 = state[49];
+
+ do
+ {
+ /* prepareTheta */
+ BCa0 = Aba0 ^ Aga0 ^ Aka0 ^ Ama0 ^ Asa0;
+ BCa1 = Aba1 ^ Aga1 ^ Aka1 ^ Ama1 ^ Asa1;
+ BCe0 = Abe0 ^ Age0 ^ Ake0 ^ Ame0 ^ Ase0;
+ BCe1 = Abe1 ^ Age1 ^ Ake1 ^ Ame1 ^ Ase1;
+ BCi0 = Abi0 ^ Agi0 ^ Aki0 ^ Ami0 ^ Asi0;
+ BCi1 = Abi1 ^ Agi1 ^ Aki1 ^ Ami1 ^ Asi1;
+ BCo0 = Abo0 ^ Ago0 ^ Ako0 ^ Amo0 ^ Aso0;
+ BCo1 = Abo1 ^ Ago1 ^ Ako1 ^ Amo1 ^ Aso1;
+ BCu0 = Abu0 ^ Agu0 ^ Aku0 ^ Amu0 ^ Asu0;
+ BCu1 = Abu1 ^ Agu1 ^ Aku1 ^ Amu1 ^ Asu1;
+
+ /* thetaRhoPiChiIota(round , A, E) */
+ Da0 = BCu0 ^ ROL32(BCe1, 1);
+ Da1 = BCu1 ^ BCe0;
+ De0 = BCa0 ^ ROL32(BCi1, 1);
+ De1 = BCa1 ^ BCi0;
+ Di0 = BCe0 ^ ROL32(BCo1, 1);
+ Di1 = BCe1 ^ BCo0;
+ Do0 = BCi0 ^ ROL32(BCu1, 1);
+ Do1 = BCi1 ^ BCu0;
+ Du0 = BCo0 ^ ROL32(BCa1, 1);
+ Du1 = BCo1 ^ BCa0;
+
+ Aba0 ^= Da0;
+ BCa0 = Aba0;
+ Age0 ^= De0;
+ BCe0 = ROL32(Age0, 22);
+ Aki1 ^= Di1;
+ BCi0 = ROL32(Aki1, 22);
+ Amo1 ^= Do1;
+ BCo0 = ROL32(Amo1, 11);
+ Asu0 ^= Du0;
+ BCu0 = ROL32(Asu0, 7);
+ Eba0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Eba0 ^= *(round_consts++);
+ Ebe0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Ebi0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Ebo0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Ebu0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Aba1 ^= Da1;
+ BCa1 = Aba1;
+ Age1 ^= De1;
+ BCe1 = ROL32(Age1, 22);
+ Aki0 ^= Di0;
+ BCi1 = ROL32(Aki0, 21);
+ Amo0 ^= Do0;
+ BCo1 = ROL32(Amo0, 10);
+ Asu1 ^= Du1;
+ BCu1 = ROL32(Asu1, 7);
+ Eba1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Eba1 ^= *(round_consts++);
+ Ebe1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Ebi1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Ebo1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Ebu1 = BCu1 ^ ANDN32(BCa1, BCe1);
+
+ Abo0 ^= Do0;
+ BCa0 = ROL32(Abo0, 14);
+ Agu0 ^= Du0;
+ BCe0 = ROL32(Agu0, 10);
+ Aka1 ^= Da1;
+ BCi0 = ROL32(Aka1, 2);
+ Ame1 ^= De1;
+ BCo0 = ROL32(Ame1, 23);
+ Asi1 ^= Di1;
+ BCu0 = ROL32(Asi1, 31);
+ Ega0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Ege0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Egi0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Ego0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Egu0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Abo1 ^= Do1;
+ BCa1 = ROL32(Abo1, 14);
+ Agu1 ^= Du1;
+ BCe1 = ROL32(Agu1, 10);
+ Aka0 ^= Da0;
+ BCi1 = ROL32(Aka0, 1);
+ Ame0 ^= De0;
+ BCo1 = ROL32(Ame0, 22);
+ Asi0 ^= Di0;
+ BCu1 = ROL32(Asi0, 30);
+ Ega1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Ege1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Egi1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Ego1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Egu1 = BCu1 ^ ANDN32(BCa1, BCe1);
+
+ Abe1 ^= De1;
+ BCa0 = ROL32(Abe1, 1);
+ Agi0 ^= Di0;
+ BCe0 = ROL32(Agi0, 3);
+ Ako1 ^= Do1;
+ BCi0 = ROL32(Ako1, 13);
+ Amu0 ^= Du0;
+ BCo0 = ROL32(Amu0, 4);
+ Asa0 ^= Da0;
+ BCu0 = ROL32(Asa0, 9);
+ Eka0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Eke0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Eki0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Eko0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Eku0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Abe0 ^= De0;
+ BCa1 = Abe0;
+ Agi1 ^= Di1;
+ BCe1 = ROL32(Agi1, 3);
+ Ako0 ^= Do0;
+ BCi1 = ROL32(Ako0, 12);
+ Amu1 ^= Du1;
+ BCo1 = ROL32(Amu1, 4);
+ Asa1 ^= Da1;
+ BCu1 = ROL32(Asa1, 9);
+ Eka1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Eke1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Eki1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Eko1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Eku1 = BCu1 ^ ANDN32(BCa1, BCe1);
+
+ Abu1 ^= Du1;
+ BCa0 = ROL32(Abu1, 14);
+ Aga0 ^= Da0;
+ BCe0 = ROL32(Aga0, 18);
+ Ake0 ^= De0;
+ BCi0 = ROL32(Ake0, 5);
+ Ami1 ^= Di1;
+ BCo0 = ROL32(Ami1, 8);
+ Aso0 ^= Do0;
+ BCu0 = ROL32(Aso0, 28);
+ Ema0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Eme0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Emi0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Emo0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Emu0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Abu0 ^= Du0;
+ BCa1 = ROL32(Abu0, 13);
+ Aga1 ^= Da1;
+ BCe1 = ROL32(Aga1, 18);
+ Ake1 ^= De1;
+ BCi1 = ROL32(Ake1, 5);
+ Ami0 ^= Di0;
+ BCo1 = ROL32(Ami0, 7);
+ Aso1 ^= Do1;
+ BCu1 = ROL32(Aso1, 28);
+ Ema1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Eme1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Emi1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Emo1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Emu1 = BCu1 ^ ANDN32(BCa1, BCe1);
+
+ Abi0 ^= Di0;
+ BCa0 = ROL32(Abi0, 31);
+ Ago1 ^= Do1;
+ BCe0 = ROL32(Ago1, 28);
+ Aku1 ^= Du1;
+ BCi0 = ROL32(Aku1, 20);
+ Ama1 ^= Da1;
+ BCo0 = ROL32(Ama1, 21);
+ Ase0 ^= De0;
+ BCu0 = ROL32(Ase0, 1);
+ Esa0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Ese0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Esi0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Eso0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Esu0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Abi1 ^= Di1;
+ BCa1 = ROL32(Abi1, 31);
+ Ago0 ^= Do0;
+ BCe1 = ROL32(Ago0, 27);
+ Aku0 ^= Du0;
+ BCi1 = ROL32(Aku0, 19);
+ Ama0 ^= Da0;
+ BCo1 = ROL32(Ama0, 20);
+ Ase1 ^= De1;
+ BCu1 = ROL32(Ase1, 1);
+ Esa1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Ese1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Esi1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Eso1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Esu1 = BCu1 ^ ANDN32(BCa1, BCe1);
+
+ /* prepareTheta */
+ BCa0 = Eba0 ^ Ega0 ^ Eka0 ^ Ema0 ^ Esa0;
+ BCa1 = Eba1 ^ Ega1 ^ Eka1 ^ Ema1 ^ Esa1;
+ BCe0 = Ebe0 ^ Ege0 ^ Eke0 ^ Eme0 ^ Ese0;
+ BCe1 = Ebe1 ^ Ege1 ^ Eke1 ^ Eme1 ^ Ese1;
+ BCi0 = Ebi0 ^ Egi0 ^ Eki0 ^ Emi0 ^ Esi0;
+ BCi1 = Ebi1 ^ Egi1 ^ Eki1 ^ Emi1 ^ Esi1;
+ BCo0 = Ebo0 ^ Ego0 ^ Eko0 ^ Emo0 ^ Eso0;
+ BCo1 = Ebo1 ^ Ego1 ^ Eko1 ^ Emo1 ^ Eso1;
+ BCu0 = Ebu0 ^ Egu0 ^ Eku0 ^ Emu0 ^ Esu0;
+ BCu1 = Ebu1 ^ Egu1 ^ Eku1 ^ Emu1 ^ Esu1;
+
+ /* thetaRhoPiChiIota(round+1, E, A) */
+ Da0 = BCu0 ^ ROL32(BCe1, 1);
+ Da1 = BCu1 ^ BCe0;
+ De0 = BCa0 ^ ROL32(BCi1, 1);
+ De1 = BCa1 ^ BCi0;
+ Di0 = BCe0 ^ ROL32(BCo1, 1);
+ Di1 = BCe1 ^ BCo0;
+ Do0 = BCi0 ^ ROL32(BCu1, 1);
+ Do1 = BCi1 ^ BCu0;
+ Du0 = BCo0 ^ ROL32(BCa1, 1);
+ Du1 = BCo1 ^ BCa0;
+
+ Eba0 ^= Da0;
+ BCa0 = Eba0;
+ Ege0 ^= De0;
+ BCe0 = ROL32(Ege0, 22);
+ Eki1 ^= Di1;
+ BCi0 = ROL32(Eki1, 22);
+ Emo1 ^= Do1;
+ BCo0 = ROL32(Emo1, 11);
+ Esu0 ^= Du0;
+ BCu0 = ROL32(Esu0, 7);
+ Aba0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Aba0 ^= *(round_consts++);
+ Abe0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Abi0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Abo0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Abu0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Eba1 ^= Da1;
+ BCa1 = Eba1;
+ Ege1 ^= De1;
+ BCe1 = ROL32(Ege1, 22);
+ Eki0 ^= Di0;
+ BCi1 = ROL32(Eki0, 21);
+ Emo0 ^= Do0;
+ BCo1 = ROL32(Emo0, 10);
+ Esu1 ^= Du1;
+ BCu1 = ROL32(Esu1, 7);
+ Aba1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Aba1 ^= *(round_consts++);
+ Abe1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Abi1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Abo1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Abu1 = BCu1 ^ ANDN32(BCa1, BCe1);
+
+ Ebo0 ^= Do0;
+ BCa0 = ROL32(Ebo0, 14);
+ Egu0 ^= Du0;
+ BCe0 = ROL32(Egu0, 10);
+ Eka1 ^= Da1;
+ BCi0 = ROL32(Eka1, 2);
+ Eme1 ^= De1;
+ BCo0 = ROL32(Eme1, 23);
+ Esi1 ^= Di1;
+ BCu0 = ROL32(Esi1, 31);
+ Aga0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Age0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Agi0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Ago0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Agu0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Ebo1 ^= Do1;
+ BCa1 = ROL32(Ebo1, 14);
+ Egu1 ^= Du1;
+ BCe1 = ROL32(Egu1, 10);
+ Eka0 ^= Da0;
+ BCi1 = ROL32(Eka0, 1);
+ Eme0 ^= De0;
+ BCo1 = ROL32(Eme0, 22);
+ Esi0 ^= Di0;
+ BCu1 = ROL32(Esi0, 30);
+ Aga1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Age1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Agi1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Ago1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Agu1 = BCu1 ^ ANDN32(BCa1, BCe1);
+
+ Ebe1 ^= De1;
+ BCa0 = ROL32(Ebe1, 1);
+ Egi0 ^= Di0;
+ BCe0 = ROL32(Egi0, 3);
+ Eko1 ^= Do1;
+ BCi0 = ROL32(Eko1, 13);
+ Emu0 ^= Du0;
+ BCo0 = ROL32(Emu0, 4);
+ Esa0 ^= Da0;
+ BCu0 = ROL32(Esa0, 9);
+ Aka0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Ake0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Aki0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Ako0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Aku0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Ebe0 ^= De0;
+ BCa1 = Ebe0;
+ Egi1 ^= Di1;
+ BCe1 = ROL32(Egi1, 3);
+ Eko0 ^= Do0;
+ BCi1 = ROL32(Eko0, 12);
+ Emu1 ^= Du1;
+ BCo1 = ROL32(Emu1, 4);
+ Esa1 ^= Da1;
+ BCu1 = ROL32(Esa1, 9);
+ Aka1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Ake1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Aki1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Ako1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Aku1 = BCu1 ^ ANDN32(BCa1, BCe1);
+
+ Ebu1 ^= Du1;
+ BCa0 = ROL32(Ebu1, 14);
+ Ega0 ^= Da0;
+ BCe0 = ROL32(Ega0, 18);
+ Eke0 ^= De0;
+ BCi0 = ROL32(Eke0, 5);
+ Emi1 ^= Di1;
+ BCo0 = ROL32(Emi1, 8);
+ Eso0 ^= Do0;
+ BCu0 = ROL32(Eso0, 28);
+ Ama0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Ame0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Ami0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Amo0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Amu0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Ebu0 ^= Du0;
+ BCa1 = ROL32(Ebu0, 13);
+ Ega1 ^= Da1;
+ BCe1 = ROL32(Ega1, 18);
+ Eke1 ^= De1;
+ BCi1 = ROL32(Eke1, 5);
+ Emi0 ^= Di0;
+ BCo1 = ROL32(Emi0, 7);
+ Eso1 ^= Do1;
+ BCu1 = ROL32(Eso1, 28);
+ Ama1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Ame1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Ami1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Amo1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Amu1 = BCu1 ^ ANDN32(BCa1, BCe1);
+
+ Ebi0 ^= Di0;
+ BCa0 = ROL32(Ebi0, 31);
+ Ego1 ^= Do1;
+ BCe0 = ROL32(Ego1, 28);
+ Eku1 ^= Du1;
+ BCi0 = ROL32(Eku1, 20);
+ Ema1 ^= Da1;
+ BCo0 = ROL32(Ema1, 21);
+ Ese0 ^= De0;
+ BCu0 = ROL32(Ese0, 1);
+ Asa0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Ase0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Asi0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Aso0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Asu0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Ebi1 ^= Di1;
+ BCa1 = ROL32(Ebi1, 31);
+ Ego0 ^= Do0;
+ BCe1 = ROL32(Ego0, 27);
+ Eku0 ^= Du0;
+ BCi1 = ROL32(Eku0, 19);
+ Ema0 ^= Da0;
+ BCo1 = ROL32(Ema0, 20);
+ Ese1 ^= De1;
+ BCu1 = ROL32(Ese1, 1);
+ Asa1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Ase1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Asi1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Aso1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Asu1 = BCu1 ^ ANDN32(BCa1, BCe1);
+ }
+ while (round_consts < round_consts_end);
+
+ state[0] = Aba0;
+ state[1] = Aba1;
+ state[2] = Abe0;
+ state[3] = Abe1;
+ state[4] = Abi0;
+ state[5] = Abi1;
+ state[6] = Abo0;
+ state[7] = Abo1;
+ state[8] = Abu0;
+ state[9] = Abu1;
+ state[10] = Aga0;
+ state[11] = Aga1;
+ state[12] = Age0;
+ state[13] = Age1;
+ state[14] = Agi0;
+ state[15] = Agi1;
+ state[16] = Ago0;
+ state[17] = Ago1;
+ state[18] = Agu0;
+ state[19] = Agu1;
+ state[20] = Aka0;
+ state[21] = Aka1;
+ state[22] = Ake0;
+ state[23] = Ake1;
+ state[24] = Aki0;
+ state[25] = Aki1;
+ state[26] = Ako0;
+ state[27] = Ako1;
+ state[28] = Aku0;
+ state[29] = Aku1;
+ state[30] = Ama0;
+ state[31] = Ama1;
+ state[32] = Ame0;
+ state[33] = Ame1;
+ state[34] = Ami0;
+ state[35] = Ami1;
+ state[36] = Amo0;
+ state[37] = Amo1;
+ state[38] = Amu0;
+ state[39] = Amu1;
+ state[40] = Asa0;
+ state[41] = Asa1;
+ state[42] = Ase0;
+ state[43] = Ase1;
+ state[44] = Asi0;
+ state[45] = Asi1;
+ state[46] = Aso0;
+ state[47] = Aso1;
+ state[48] = Asu0;
+ state[49] = Asu1;
+
+ return sizeof(void *) * 4 + sizeof(u32) * 12 * 5 * 2;
+}
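The 32-bit permutation above keeps each 64-bit Keccak lane in bit-interleaved form: one u32 (the "0" half) holds the even-numbered bits and the other (the "1" half) the odd-numbered bits. Every 64-bit rotation of the reference permutation therefore splits into two 32-bit rotations, with the halves swapping when the rotation count is odd. A minimal sketch of that mapping, using illustrative names x0/x1 that are not part of the patch:

/* Interleaved lane: x0 = even bits, x1 = odd bits of the 64-bit lane x.
 * ROL64(x, n) then maps to:
 *   n = 2*m      ->  x0' = ROL32(x0, m),      x1' = ROL32(x1, m)
 *   n = 2*m + 1  ->  x0' = ROL32(x1, m + 1),  x1' = ROL32(x0, m)   (halves swap)
 * Example from the rho step above: ROL64(Aki, 43), with 43 = 2*21 + 1,
 * becomes BCi0 = ROL32(Aki1, 22) and BCi1 = ROL32(Aki0, 21). */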
diff --git a/comm/third_party/libgcrypt/cipher/keccak_permute_64.h b/comm/third_party/libgcrypt/cipher/keccak_permute_64.h
new file mode 100644
index 0000000000..b28c871ec1
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/keccak_permute_64.h
@@ -0,0 +1,385 @@
+/* keccak_permute_64.h - Keccak permute function (simple 64bit)
+ * Copyright (C) 2015 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* The code is based on public-domain/CC0 "keccakc1024/simple/Keccak-simple.c"
+ * implementation by Ronny Van Keer from the SUPERCOP toolkit package.
+ */
+
+/* Function that computes the Keccak-f[1600] permutation on the given state. */
+static unsigned int
+KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
+{
+ const u64 *round_consts = _gcry_keccak_round_consts_64bit;
+ const u64 *round_consts_end = _gcry_keccak_round_consts_64bit + 24;
+ u64 Aba, Abe, Abi, Abo, Abu;
+ u64 Aga, Age, Agi, Ago, Agu;
+ u64 Aka, Ake, Aki, Ako, Aku;
+ u64 Ama, Ame, Ami, Amo, Amu;
+ u64 Asa, Ase, Asi, Aso, Asu;
+ u64 BCa, BCe, BCi, BCo, BCu;
+ u64 Da, De, Di, Do, Du;
+ u64 Eba, Ebe, Ebi, Ebo, Ebu;
+ u64 Ega, Ege, Egi, Ego, Egu;
+ u64 Eka, Eke, Eki, Eko, Eku;
+ u64 Ema, Eme, Emi, Emo, Emu;
+ u64 Esa, Ese, Esi, Eso, Esu;
+ u64 *state = hd->u.state64;
+
+ Aba = state[0];
+ Abe = state[1];
+ Abi = state[2];
+ Abo = state[3];
+ Abu = state[4];
+ Aga = state[5];
+ Age = state[6];
+ Agi = state[7];
+ Ago = state[8];
+ Agu = state[9];
+ Aka = state[10];
+ Ake = state[11];
+ Aki = state[12];
+ Ako = state[13];
+ Aku = state[14];
+ Ama = state[15];
+ Ame = state[16];
+ Ami = state[17];
+ Amo = state[18];
+ Amu = state[19];
+ Asa = state[20];
+ Ase = state[21];
+ Asi = state[22];
+ Aso = state[23];
+ Asu = state[24];
+
+ do
+ {
+ /* prepareTheta */
+ BCa = Aba ^ Aga ^ Aka ^ Ama ^ Asa;
+ BCe = Abe ^ Age ^ Ake ^ Ame ^ Ase;
+ BCi = Abi ^ Agi ^ Aki ^ Ami ^ Asi;
+ BCo = Abo ^ Ago ^ Ako ^ Amo ^ Aso;
+ BCu = Abu ^ Agu ^ Aku ^ Amu ^ Asu;
+
+ /* thetaRhoPiChiIotaPrepareTheta(round , A, E) */
+ Da = BCu ^ ROL64(BCe, 1);
+ De = BCa ^ ROL64(BCi, 1);
+ Di = BCe ^ ROL64(BCo, 1);
+ Do = BCi ^ ROL64(BCu, 1);
+ Du = BCo ^ ROL64(BCa, 1);
+
+ Aba ^= Da;
+ BCa = Aba;
+ Age ^= De;
+ BCe = ROL64(Age, 44);
+ Aki ^= Di;
+ BCi = ROL64(Aki, 43);
+ Amo ^= Do;
+ BCo = ROL64(Amo, 21);
+ Asu ^= Du;
+ BCu = ROL64(Asu, 14);
+ Eba = BCa ^ ANDN64(BCe, BCi);
+ Eba ^= *(round_consts++);
+ Ebe = BCe ^ ANDN64(BCi, BCo);
+ Ebi = BCi ^ ANDN64(BCo, BCu);
+ Ebo = BCo ^ ANDN64(BCu, BCa);
+ Ebu = BCu ^ ANDN64(BCa, BCe);
+
+ Abo ^= Do;
+ BCa = ROL64(Abo, 28);
+ Agu ^= Du;
+ BCe = ROL64(Agu, 20);
+ Aka ^= Da;
+ BCi = ROL64(Aka, 3);
+ Ame ^= De;
+ BCo = ROL64(Ame, 45);
+ Asi ^= Di;
+ BCu = ROL64(Asi, 61);
+ Ega = BCa ^ ANDN64(BCe, BCi);
+ Ege = BCe ^ ANDN64(BCi, BCo);
+ Egi = BCi ^ ANDN64(BCo, BCu);
+ Ego = BCo ^ ANDN64(BCu, BCa);
+ Egu = BCu ^ ANDN64(BCa, BCe);
+
+ Abe ^= De;
+ BCa = ROL64(Abe, 1);
+ Agi ^= Di;
+ BCe = ROL64(Agi, 6);
+ Ako ^= Do;
+ BCi = ROL64(Ako, 25);
+ Amu ^= Du;
+ BCo = ROL64(Amu, 8);
+ Asa ^= Da;
+ BCu = ROL64(Asa, 18);
+ Eka = BCa ^ ANDN64(BCe, BCi);
+ Eke = BCe ^ ANDN64(BCi, BCo);
+ Eki = BCi ^ ANDN64(BCo, BCu);
+ Eko = BCo ^ ANDN64(BCu, BCa);
+ Eku = BCu ^ ANDN64(BCa, BCe);
+
+ Abu ^= Du;
+ BCa = ROL64(Abu, 27);
+ Aga ^= Da;
+ BCe = ROL64(Aga, 36);
+ Ake ^= De;
+ BCi = ROL64(Ake, 10);
+ Ami ^= Di;
+ BCo = ROL64(Ami, 15);
+ Aso ^= Do;
+ BCu = ROL64(Aso, 56);
+ Ema = BCa ^ ANDN64(BCe, BCi);
+ Eme = BCe ^ ANDN64(BCi, BCo);
+ Emi = BCi ^ ANDN64(BCo, BCu);
+ Emo = BCo ^ ANDN64(BCu, BCa);
+ Emu = BCu ^ ANDN64(BCa, BCe);
+
+ Abi ^= Di;
+ BCa = ROL64(Abi, 62);
+ Ago ^= Do;
+ BCe = ROL64(Ago, 55);
+ Aku ^= Du;
+ BCi = ROL64(Aku, 39);
+ Ama ^= Da;
+ BCo = ROL64(Ama, 41);
+ Ase ^= De;
+ BCu = ROL64(Ase, 2);
+ Esa = BCa ^ ANDN64(BCe, BCi);
+ Ese = BCe ^ ANDN64(BCi, BCo);
+ Esi = BCi ^ ANDN64(BCo, BCu);
+ Eso = BCo ^ ANDN64(BCu, BCa);
+ Esu = BCu ^ ANDN64(BCa, BCe);
+
+ /* prepareTheta */
+ BCa = Eba ^ Ega ^ Eka ^ Ema ^ Esa;
+ BCe = Ebe ^ Ege ^ Eke ^ Eme ^ Ese;
+ BCi = Ebi ^ Egi ^ Eki ^ Emi ^ Esi;
+ BCo = Ebo ^ Ego ^ Eko ^ Emo ^ Eso;
+ BCu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu;
+
+ /* thetaRhoPiChiIotaPrepareTheta(round+1, E, A) */
+ Da = BCu ^ ROL64(BCe, 1);
+ De = BCa ^ ROL64(BCi, 1);
+ Di = BCe ^ ROL64(BCo, 1);
+ Do = BCi ^ ROL64(BCu, 1);
+ Du = BCo ^ ROL64(BCa, 1);
+
+ Eba ^= Da;
+ BCa = Eba;
+ Ege ^= De;
+ BCe = ROL64(Ege, 44);
+ Eki ^= Di;
+ BCi = ROL64(Eki, 43);
+ Emo ^= Do;
+ BCo = ROL64(Emo, 21);
+ Esu ^= Du;
+ BCu = ROL64(Esu, 14);
+ Aba = BCa ^ ANDN64(BCe, BCi);
+ Aba ^= *(round_consts++);
+ Abe = BCe ^ ANDN64(BCi, BCo);
+ Abi = BCi ^ ANDN64(BCo, BCu);
+ Abo = BCo ^ ANDN64(BCu, BCa);
+ Abu = BCu ^ ANDN64(BCa, BCe);
+
+ Ebo ^= Do;
+ BCa = ROL64(Ebo, 28);
+ Egu ^= Du;
+ BCe = ROL64(Egu, 20);
+ Eka ^= Da;
+ BCi = ROL64(Eka, 3);
+ Eme ^= De;
+ BCo = ROL64(Eme, 45);
+ Esi ^= Di;
+ BCu = ROL64(Esi, 61);
+ Aga = BCa ^ ANDN64(BCe, BCi);
+ Age = BCe ^ ANDN64(BCi, BCo);
+ Agi = BCi ^ ANDN64(BCo, BCu);
+ Ago = BCo ^ ANDN64(BCu, BCa);
+ Agu = BCu ^ ANDN64(BCa, BCe);
+
+ Ebe ^= De;
+ BCa = ROL64(Ebe, 1);
+ Egi ^= Di;
+ BCe = ROL64(Egi, 6);
+ Eko ^= Do;
+ BCi = ROL64(Eko, 25);
+ Emu ^= Du;
+ BCo = ROL64(Emu, 8);
+ Esa ^= Da;
+ BCu = ROL64(Esa, 18);
+ Aka = BCa ^ ANDN64(BCe, BCi);
+ Ake = BCe ^ ANDN64(BCi, BCo);
+ Aki = BCi ^ ANDN64(BCo, BCu);
+ Ako = BCo ^ ANDN64(BCu, BCa);
+ Aku = BCu ^ ANDN64(BCa, BCe);
+
+ Ebu ^= Du;
+ BCa = ROL64(Ebu, 27);
+ Ega ^= Da;
+ BCe = ROL64(Ega, 36);
+ Eke ^= De;
+ BCi = ROL64(Eke, 10);
+ Emi ^= Di;
+ BCo = ROL64(Emi, 15);
+ Eso ^= Do;
+ BCu = ROL64(Eso, 56);
+ Ama = BCa ^ ANDN64(BCe, BCi);
+ Ame = BCe ^ ANDN64(BCi, BCo);
+ Ami = BCi ^ ANDN64(BCo, BCu);
+ Amo = BCo ^ ANDN64(BCu, BCa);
+ Amu = BCu ^ ANDN64(BCa, BCe);
+
+ Ebi ^= Di;
+ BCa = ROL64(Ebi, 62);
+ Ego ^= Do;
+ BCe = ROL64(Ego, 55);
+ Eku ^= Du;
+ BCi = ROL64(Eku, 39);
+ Ema ^= Da;
+ BCo = ROL64(Ema, 41);
+ Ese ^= De;
+ BCu = ROL64(Ese, 2);
+ Asa = BCa ^ ANDN64(BCe, BCi);
+ Ase = BCe ^ ANDN64(BCi, BCo);
+ Asi = BCi ^ ANDN64(BCo, BCu);
+ Aso = BCo ^ ANDN64(BCu, BCa);
+ Asu = BCu ^ ANDN64(BCa, BCe);
+ }
+ while (round_consts < round_consts_end);
+
+ state[0] = Aba;
+ state[1] = Abe;
+ state[2] = Abi;
+ state[3] = Abo;
+ state[4] = Abu;
+ state[5] = Aga;
+ state[6] = Age;
+ state[7] = Agi;
+ state[8] = Ago;
+ state[9] = Agu;
+ state[10] = Aka;
+ state[11] = Ake;
+ state[12] = Aki;
+ state[13] = Ako;
+ state[14] = Aku;
+ state[15] = Ama;
+ state[16] = Ame;
+ state[17] = Ami;
+ state[18] = Amo;
+ state[19] = Amu;
+ state[20] = Asa;
+ state[21] = Ase;
+ state[22] = Asi;
+ state[23] = Aso;
+ state[24] = Asu;
+
+ return sizeof(void *) * 4 + sizeof(u64) * 12 * 5;
+}
+
+static unsigned int
+KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes,
+ unsigned int nlanes, int blocklanes)
+{
+ unsigned int burn = 0;
+
+ while (nlanes)
+ {
+ switch (blocklanes)
+ {
+ case 21:
+ /* SHAKE128 */
+ while (pos == 0 && nlanes >= 21)
+ {
+ nlanes -= 21;
+ absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8;
+ absorb_lanes64_8(&hd->u.state64[8], lanes); lanes += 8 * 8;
+ absorb_lanes64_4(&hd->u.state64[16], lanes); lanes += 8 * 4;
+ absorb_lanes64_1(&hd->u.state64[20], lanes); lanes += 8 * 1;
+
+ burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+ }
+ break;
+
+ case 18:
+ /* SHA3-224 */
+ while (pos == 0 && nlanes >= 18)
+ {
+ nlanes -= 18;
+ absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8;
+ absorb_lanes64_8(&hd->u.state64[8], lanes); lanes += 8 * 8;
+ absorb_lanes64_2(&hd->u.state64[16], lanes); lanes += 8 * 2;
+
+ burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+ }
+ break;
+
+ case 17:
+ /* SHA3-256 & SHAKE256 */
+ while (pos == 0 && nlanes >= 17)
+ {
+ nlanes -= 17;
+ absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8;
+ absorb_lanes64_8(&hd->u.state64[8], lanes); lanes += 8 * 8;
+ absorb_lanes64_1(&hd->u.state64[16], lanes); lanes += 8 * 1;
+
+ burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+ }
+ break;
+
+ case 13:
+ /* SHA3-384 */
+ while (pos == 0 && nlanes >= 13)
+ {
+ nlanes -= 13;
+ absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8;
+ absorb_lanes64_4(&hd->u.state64[8], lanes); lanes += 8 * 4;
+ absorb_lanes64_1(&hd->u.state64[12], lanes); lanes += 8 * 1;
+
+ burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+ }
+ break;
+
+ case 9:
+ /* SHA3-512 */
+ while (pos == 0 && nlanes >= 9)
+ {
+ nlanes -= 9;
+ absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8;
+ absorb_lanes64_1(&hd->u.state64[8], lanes); lanes += 8 * 1;
+
+ burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+ }
+ break;
+ }
+
+ while (nlanes)
+ {
+ hd->u.state64[pos] ^= buf_get_le64(lanes);
+ lanes += 8;
+ nlanes--;
+
+ if (++pos == blocklanes)
+ {
+ burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+ pos = 0;
+ break;
+ }
+ }
+ }
+
+ return burn;
+}
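The blocklanes values handled above (21, 18, 17, 13, 9) are the sponge rates of the SHAKE/SHA-3 variants expressed in 64-bit lanes: the rate is 1600 bits minus the capacity, and for the fixed-output SHA-3 functions the capacity is twice the digest size. A small illustrative check of that arithmetic; the helper name keccak_rate_lanes is hypothetical and not part of the patch:

/* rate_in_lanes = (1600 - capacity_bits) / 64
 *   SHA3-224 : (1600 -  448) / 64 = 18      SHA3-384 : (1600 -  768) / 64 = 13
 *   SHA3-256 : (1600 -  512) / 64 = 17      SHA3-512 : (1600 - 1024) / 64 = 9
 *   SHAKE128 : (1600 -  256) / 64 = 21      SHAKE256 : (1600 -  512) / 64 = 17 */
static unsigned int
keccak_rate_lanes (unsigned int capacity_bits)
{
  return (1600 - capacity_bits) / 64;
}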
diff --git a/comm/third_party/libgcrypt/cipher/mac-cmac.c b/comm/third_party/libgcrypt/cipher/mac-cmac.c
new file mode 100644
index 0000000000..8d5d5ca304
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/mac-cmac.c
@@ -0,0 +1,524 @@
+/* mac-cmac.c - CMAC glue for MAC API
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "./mac-internal.h"
+
+
+static int
+map_mac_algo_to_cipher (int mac_algo)
+{
+ switch (mac_algo)
+ {
+ default:
+ return GCRY_CIPHER_NONE;
+ case GCRY_MAC_CMAC_AES:
+ return GCRY_CIPHER_AES;
+ case GCRY_MAC_CMAC_3DES:
+ return GCRY_CIPHER_3DES;
+ case GCRY_MAC_CMAC_CAMELLIA:
+ return GCRY_CIPHER_CAMELLIA128;
+ case GCRY_MAC_CMAC_IDEA:
+ return GCRY_CIPHER_IDEA;
+ case GCRY_MAC_CMAC_CAST5:
+ return GCRY_CIPHER_CAST5;
+ case GCRY_MAC_CMAC_BLOWFISH:
+ return GCRY_CIPHER_BLOWFISH;
+ case GCRY_MAC_CMAC_TWOFISH:
+ return GCRY_CIPHER_TWOFISH;
+ case GCRY_MAC_CMAC_SERPENT:
+ return GCRY_CIPHER_SERPENT128;
+ case GCRY_MAC_CMAC_SEED:
+ return GCRY_CIPHER_SEED;
+ case GCRY_MAC_CMAC_RFC2268:
+ return GCRY_CIPHER_RFC2268_128;
+ case GCRY_MAC_CMAC_GOST28147:
+ return GCRY_CIPHER_GOST28147;
+ case GCRY_MAC_CMAC_SM4:
+ return GCRY_CIPHER_SM4;
+ }
+}
+
+
+static gcry_err_code_t
+cmac_open (gcry_mac_hd_t h)
+{
+ gcry_err_code_t err;
+ gcry_cipher_hd_t hd;
+ int secure = (h->magic == CTX_MAC_MAGIC_SECURE);
+ int cipher_algo;
+ unsigned int flags;
+
+ cipher_algo = map_mac_algo_to_cipher (h->spec->algo);
+ flags = (secure ? GCRY_CIPHER_SECURE : 0);
+
+ err = _gcry_cipher_open_internal (&hd, cipher_algo, GCRY_CIPHER_MODE_CMAC,
+ flags);
+ if (err)
+ return err;
+
+ h->u.cmac.cipher_algo = cipher_algo;
+ h->u.cmac.ctx = hd;
+ h->u.cmac.blklen = _gcry_cipher_get_algo_blklen (cipher_algo);
+ return 0;
+}
+
+
+static void
+cmac_close (gcry_mac_hd_t h)
+{
+ _gcry_cipher_close (h->u.cmac.ctx);
+ h->u.cmac.ctx = NULL;
+}
+
+
+static gcry_err_code_t
+cmac_setkey (gcry_mac_hd_t h, const unsigned char *key, size_t keylen)
+{
+ return _gcry_cipher_setkey (h->u.cmac.ctx, key, keylen);
+}
+
+
+static gcry_err_code_t
+cmac_reset (gcry_mac_hd_t h)
+{
+ return _gcry_cipher_reset (h->u.cmac.ctx);
+}
+
+
+static gcry_err_code_t
+cmac_write (gcry_mac_hd_t h, const unsigned char *buf, size_t buflen)
+{
+ return _gcry_cipher_cmac_authenticate (h->u.cmac.ctx, buf, buflen);
+}
+
+
+static gcry_err_code_t
+cmac_read (gcry_mac_hd_t h, unsigned char *outbuf, size_t * outlen)
+{
+ if (*outlen > h->u.cmac.blklen)
+ *outlen = h->u.cmac.blklen;
+ return _gcry_cipher_cmac_get_tag (h->u.cmac.ctx, outbuf, *outlen);
+}
+
+
+static gcry_err_code_t
+cmac_verify (gcry_mac_hd_t h, const unsigned char *buf, size_t buflen)
+{
+ return _gcry_cipher_cmac_check_tag (h->u.cmac.ctx, buf, buflen);
+}
+
+
+static unsigned int
+cmac_get_maclen (int algo)
+{
+ return _gcry_cipher_get_algo_blklen (map_mac_algo_to_cipher (algo));
+}
+
+
+static unsigned int
+cmac_get_keylen (int algo)
+{
+ return _gcry_cipher_get_algo_keylen (map_mac_algo_to_cipher (algo));
+}
+
+
+/* Check one CMAC with MAC ALGO using the regular MAC
+ * API. (DATA,DATALEN) is the data to be MACed, (KEY,KEYLEN) the key
+ * and (EXPECT,EXPECTLEN) the expected result. Returns NULL on
+ * success or a string describing the failure. */
+static const char *
+check_one (int algo, const char *data, size_t datalen,
+ const char *key, size_t keylen,
+ const char *expect, size_t expectlen)
+{
+ gcry_mac_hd_t hd;
+ unsigned char mac[512]; /* hardcoded to avoid allocation */
+ unsigned int maclen;
+ size_t macoutlen;
+ int i;
+ gcry_error_t err = 0;
+
+ err = _gcry_mac_open (&hd, algo, 0, NULL);
+ if (err)
+ return "gcry_mac_open failed";
+
+ i = _gcry_mac_get_algo (hd);
+ if (i != algo)
+ return "gcry_mac_get_algo failed";
+
+ maclen = _gcry_mac_get_algo_maclen (algo);
+ if (maclen < 1 || maclen > 500)
+ return "gcry_mac_get_algo_maclen failed";
+
+ if (maclen != expectlen)
+ return "invalid tests data";
+
+ err = _gcry_mac_setkey (hd, key, keylen);
+ if (err)
+ {
+ _gcry_mac_close (hd);
+ return "gcry_mac_setkey failed";
+ }
+
+ err = _gcry_mac_write (hd, data, datalen);
+ if (err)
+ {
+ _gcry_mac_close (hd);
+ return "gcry_mac_write failed";
+ }
+
+ err = _gcry_mac_verify (hd, expect, maclen);
+ if (err)
+ {
+ _gcry_mac_close (hd);
+ return "gcry_mac_verify failed";
+ }
+
+ macoutlen = maclen;
+ err = _gcry_mac_read (hd, mac, &macoutlen);
+ _gcry_mac_close (hd);
+ if (err)
+ return "gcry_mac_read failed";
+
+ if (memcmp (mac, expect, maclen))
+ return "does not match";
+
+ return NULL;
+}
+
+
+/*
+ * CMAC AES and DES test vectors are from
+ * http://web.archive.org/web/20130930212819/http://csrc.nist.gov/publica \
+ * tions/nistpubs/800-38B/Updated_CMAC_Examples.pdf
+ */
+
+static gpg_err_code_t
+selftests_cmac_3des (int extended, selftest_report_func_t report)
+{
+ static const struct
+ {
+ const char *desc;
+ const char *data;
+ const char *key;
+ const char *expect;
+ } tv[] =
+ {
+ { "Basic 3DES",
+ "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+ "\xae\x2d\x8a\x57",
+ "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58"
+ "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5",
+ "\x74\x3d\xdb\xe0\xce\x2d\xc2\xed" },
+ { "Extended 3DES #1",
+ "",
+ "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58"
+ "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5",
+ "\xb7\xa6\x88\xe1\x22\xff\xaf\x95" },
+ { "Extended 3DES #2",
+ "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96",
+ "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58"
+ "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5",
+ "\x8e\x8f\x29\x31\x36\x28\x37\x97" },
+ { "Extended 3DES #3",
+ "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+ "\xae\x2d\x8a\x57\x1e\x03\xac\x9c\x9e\xb7\x6f\xac\x45\xaf\x8e\x51",
+ "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58"
+ "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5",
+ "\x33\xe6\xb1\x09\x24\x00\xea\xe5" },
+ { "Extended 3DES #4",
+ "",
+ "\x4c\xf1\x51\x34\xa2\x85\x0d\xd5\x8a\x3d\x10\xba\x80\x57\x0d\x38"
+ "\x4c\xf1\x51\x34\xa2\x85\x0d\xd5",
+ "\xbd\x2e\xbf\x9a\x3b\xa0\x03\x61" },
+ { "Extended 3DES #5",
+ "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96",
+ "\x4c\xf1\x51\x34\xa2\x85\x0d\xd5\x8a\x3d\x10\xba\x80\x57\x0d\x38"
+ "\x4c\xf1\x51\x34\xa2\x85\x0d\xd5",
+ "\x4f\xf2\xab\x81\x3c\x53\xce\x83" },
+ { "Extended 3DES #6",
+ "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+ "\xae\x2d\x8a\x57",
+ "\x4c\xf1\x51\x34\xa2\x85\x0d\xd5\x8a\x3d\x10\xba\x80\x57\x0d\x38"
+ "\x4c\xf1\x51\x34\xa2\x85\x0d\xd5",
+ "\x62\xdd\x1b\x47\x19\x02\xbd\x4e" },
+ { "Extended 3DES #7",
+ "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+ "\xae\x2d\x8a\x57\x1e\x03\xac\x9c\x9e\xb7\x6f\xac\x45\xaf\x8e\x51",
+ "\x4c\xf1\x51\x34\xa2\x85\x0d\xd5\x8a\x3d\x10\xba\x80\x57\x0d\x38"
+ "\x4c\xf1\x51\x34\xa2\x85\x0d\xd5",
+ "\x31\xb1\xe4\x31\xda\xbc\x4e\xb8" },
+ { NULL }
+ };
+ const char *what;
+ const char *errtxt;
+ int tvidx;
+
+ for (tvidx=0; tv[tvidx].desc; tvidx++)
+ {
+ what = tv[tvidx].desc;
+ errtxt = check_one (GCRY_MAC_CMAC_3DES,
+ tv[tvidx].data, strlen (tv[tvidx].data),
+ tv[tvidx].key, strlen (tv[tvidx].key),
+ tv[tvidx].expect, 8);
+ if (errtxt)
+ goto failed;
+ if (!extended)
+ break;
+ }
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("cmac", GCRY_MAC_CMAC_3DES, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+
+static gpg_err_code_t
+selftests_cmac_aes (int extended, selftest_report_func_t report)
+{
+ static const struct
+ {
+ const char *desc;
+ const char *data;
+ const char *key;
+ const char *expect;
+ } tv[] =
+ {
+ { "Basic AES128",
+ "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+ "\xae\x2d\x8a\x57\x1e\x03\xac\x9c\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+ "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11",
+ "\x2b\x7e\x15\x16\x28\xae\xd2\xa6\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+ "\xdf\xa6\x67\x47\xde\x9a\xe6\x30\x30\xca\x32\x61\x14\x97\xc8\x27" },
+ { "Basic AES192",
+ "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+ "\xae\x2d\x8a\x57\x1e\x03\xac\x9c\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+ "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11",
+ "\x8e\x73\xb0\xf7\xda\x0e\x64\x52\xc8\x10\xf3\x2b\x80\x90\x79\xe5"
+ "\x62\xf8\xea\xd2\x52\x2c\x6b\x7b",
+ "\x8a\x1d\xe5\xbe\x2e\xb3\x1a\xad\x08\x9a\x82\xe6\xee\x90\x8b\x0e" },
+ { "Basic AES256",
+ "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+ "\xae\x2d\x8a\x57\x1e\x03\xac\x9c\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+ "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11",
+ "\x60\x3d\xeb\x10\x15\xca\x71\xbe\x2b\x73\xae\xf0\x85\x7d\x77\x81"
+ "\x1f\x35\x2c\x07\x3b\x61\x08\xd7\x2d\x98\x10\xa3\x09\x14\xdf\xf4",
+ "\xaa\xf3\xd8\xf1\xde\x56\x40\xc2\x32\xf5\xb1\x69\xb9\xc9\x11\xe6" },
+ { "Extended AES #1",
+ "",
+ "\x2b\x7e\x15\x16\x28\xae\xd2\xa6\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+ "\xbb\x1d\x69\x29\xe9\x59\x37\x28\x7f\xa3\x7d\x12\x9b\x75\x67\x46" },
+ { "Extended AES #2",
+ "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96\xe9\x3d\x7e\x11\x73\x93\x17\x2a",
+ "\x8e\x73\xb0\xf7\xda\x0e\x64\x52\xc8\x10\xf3\x2b\x80\x90\x79\xe5"
+ "\x62\xf8\xea\xd2\x52\x2c\x6b\x7b",
+ "\x9e\x99\xa7\xbf\x31\xe7\x10\x90\x06\x62\xf6\x5e\x61\x7c\x51\x84" },
+ { "Extended AES #3",
+ "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+ "\xae\x2d\x8a\x57\x1e\x03\xac\x9c\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+ "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+ "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+ "\x60\x3d\xeb\x10\x15\xca\x71\xbe\x2b\x73\xae\xf0\x85\x7d\x77\x81"
+ "\x1f\x35\x2c\x07\x3b\x61\x08\xd7\x2d\x98\x10\xa3\x09\x14\xdf\xf4",
+ "\xe1\x99\x21\x90\x54\x9f\x6e\xd5\x69\x6a\x2c\x05\x6c\x31\x54\x10" },
+ { "Extended AES #4",
+ "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96\xe9\x3d\x7e\x11\x73\x93\x17\x2a",
+ "\x2b\x7e\x15\x16\x28\xae\xd2\xa6\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+ "\x07\x0a\x16\xb4\x6b\x4d\x41\x44\xf7\x9b\xdd\x9d\xd0\x4a\x28\x7c" },
+ { "Extended AES #5",
+ "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+ "\xae\x2d\x8a\x57\x1e\x03\xac\x9c\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+ "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+ "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+ "\x2b\x7e\x15\x16\x28\xae\xd2\xa6\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+ "\x51\xf0\xbe\xbf\x7e\x3b\x9d\x92\xfc\x49\x74\x17\x79\x36\x3c\xfe" },
+ { "Extended AES #6",
+ "",
+ "\x8e\x73\xb0\xf7\xda\x0e\x64\x52\xc8\x10\xf3\x2b\x80\x90\x79\xe5"
+ "\x62\xf8\xea\xd2\x52\x2c\x6b\x7b",
+ "\xd1\x7d\xdf\x46\xad\xaa\xcd\xe5\x31\xca\xc4\x83\xde\x7a\x93\x67" },
+ { "Extended AES #7",
+ "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+ "\xae\x2d\x8a\x57\x1e\x03\xac\x9c\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+ "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+ "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+ "\x8e\x73\xb0\xf7\xda\x0e\x64\x52\xc8\x10\xf3\x2b\x80\x90\x79\xe5"
+ "\x62\xf8\xea\xd2\x52\x2c\x6b\x7b",
+ "\xa1\xd5\xdf\x0e\xed\x79\x0f\x79\x4d\x77\x58\x96\x59\xf3\x9a\x11" },
+ { "Extended AES #8",
+ "",
+ "\x60\x3d\xeb\x10\x15\xca\x71\xbe\x2b\x73\xae\xf0\x85\x7d\x77\x81"
+ "\x1f\x35\x2c\x07\x3b\x61\x08\xd7\x2d\x98\x10\xa3\x09\x14\xdf\xf4",
+ "\x02\x89\x62\xf6\x1b\x7b\xf8\x9e\xfc\x6b\x55\x1f\x46\x67\xd9\x83" },
+ { "Extended AES #9",
+ "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96\xe9\x3d\x7e\x11\x73\x93\x17\x2a",
+ "\x60\x3d\xeb\x10\x15\xca\x71\xbe\x2b\x73\xae\xf0\x85\x7d\x77\x81"
+ "\x1f\x35\x2c\x07\x3b\x61\x08\xd7\x2d\x98\x10\xa3\x09\x14\xdf\xf4",
+ "\x28\xa7\x02\x3f\x45\x2e\x8f\x82\xbd\x4b\xf2\x8d\x8c\x37\xc3\x5c" },
+ { NULL }
+ };
+ const char *what;
+ const char *errtxt;
+ int tvidx;
+
+ for (tvidx=0; tv[tvidx].desc; tvidx++)
+ {
+ what = tv[tvidx].desc;
+ errtxt = check_one (GCRY_MAC_CMAC_AES,
+ tv[tvidx].data, strlen (tv[tvidx].data),
+ tv[tvidx].key, strlen (tv[tvidx].key),
+ tv[tvidx].expect, strlen (tv[tvidx].expect));
+ if (errtxt)
+ goto failed;
+ if (tvidx >= 2 && !extended)
+ break;
+ }
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("cmac", GCRY_MAC_CMAC_AES, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+static gpg_err_code_t
+cmac_selftest (int algo, int extended, selftest_report_func_t report)
+{
+ gpg_err_code_t ec;
+
+ switch (algo)
+ {
+ case GCRY_MAC_CMAC_3DES:
+ ec = selftests_cmac_3des (extended, report);
+ break;
+ case GCRY_MAC_CMAC_AES:
+ ec = selftests_cmac_aes (extended, report);
+ break;
+
+ default:
+ ec = GPG_ERR_MAC_ALGO;
+ break;
+ }
+
+ return ec;
+}
+
+
+static gcry_mac_spec_ops_t cmac_ops = {
+ cmac_open,
+ cmac_close,
+ cmac_setkey,
+ NULL,
+ cmac_reset,
+ cmac_write,
+ cmac_read,
+ cmac_verify,
+ cmac_get_maclen,
+ cmac_get_keylen,
+ NULL,
+ cmac_selftest
+};
+
+
+#if USE_BLOWFISH
+gcry_mac_spec_t _gcry_mac_type_spec_cmac_blowfish = {
+ GCRY_MAC_CMAC_BLOWFISH, {0, 0}, "CMAC_BLOWFISH",
+ &cmac_ops
+};
+#endif
+#if USE_DES
+gcry_mac_spec_t _gcry_mac_type_spec_cmac_tripledes = {
+ GCRY_MAC_CMAC_3DES, {0, 1}, "CMAC_3DES",
+ &cmac_ops
+};
+#endif
+#if USE_CAST5
+gcry_mac_spec_t _gcry_mac_type_spec_cmac_cast5 = {
+ GCRY_MAC_CMAC_CAST5, {0, 0}, "CMAC_CAST5",
+ &cmac_ops
+};
+#endif
+#if USE_AES
+gcry_mac_spec_t _gcry_mac_type_spec_cmac_aes = {
+ GCRY_MAC_CMAC_AES, {0, 1}, "CMAC_AES",
+ &cmac_ops
+};
+#endif
+#if USE_TWOFISH
+gcry_mac_spec_t _gcry_mac_type_spec_cmac_twofish = {
+ GCRY_MAC_CMAC_TWOFISH, {0, 0}, "CMAC_TWOFISH",
+ &cmac_ops
+};
+#endif
+#if USE_SERPENT
+gcry_mac_spec_t _gcry_mac_type_spec_cmac_serpent = {
+ GCRY_MAC_CMAC_SERPENT, {0, 0}, "CMAC_SERPENT",
+ &cmac_ops
+};
+#endif
+#if USE_RFC2268
+gcry_mac_spec_t _gcry_mac_type_spec_cmac_rfc2268 = {
+ GCRY_MAC_CMAC_RFC2268, {0, 0}, "CMAC_RFC2268",
+ &cmac_ops
+};
+#endif
+#if USE_SEED
+gcry_mac_spec_t _gcry_mac_type_spec_cmac_seed = {
+ GCRY_MAC_CMAC_SEED, {0, 0}, "CMAC_SEED",
+ &cmac_ops
+};
+#endif
+#if USE_CAMELLIA
+gcry_mac_spec_t _gcry_mac_type_spec_cmac_camellia = {
+ GCRY_MAC_CMAC_CAMELLIA, {0, 0}, "CMAC_CAMELLIA",
+ &cmac_ops
+};
+#endif
+#ifdef USE_IDEA
+gcry_mac_spec_t _gcry_mac_type_spec_cmac_idea = {
+ GCRY_MAC_CMAC_IDEA, {0, 0}, "CMAC_IDEA",
+ &cmac_ops
+};
+#endif
+#if USE_GOST28147
+gcry_mac_spec_t _gcry_mac_type_spec_cmac_gost28147 = {
+ GCRY_MAC_CMAC_GOST28147, {0, 0}, "CMAC_GOST28147",
+ &cmac_ops
+};
+#endif
+#if USE_SM4
+gcry_mac_spec_t _gcry_mac_type_spec_cmac_sm4 = {
+ GCRY_MAC_CMAC_SM4, {0, 0}, "CMAC_SM4",
+ &cmac_ops
+};
+#endif
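This glue sits behind libgcrypt's public gcry_mac API. A minimal usage sketch for CMAC-AES follows; the function name cmac_example is illustrative and error handling is abbreviated:

#include <gcrypt.h>

static void
cmac_example (const void *key, size_t keylen,
              const void *data, size_t datalen)
{
  gcry_mac_hd_t hd;
  unsigned char tag[16];              /* AES block size */
  size_t taglen = sizeof tag;

  gcry_mac_open (&hd, GCRY_MAC_CMAC_AES, 0, NULL);
  gcry_mac_setkey (hd, key, keylen);
  gcry_mac_write (hd, data, datalen);
  gcry_mac_read (hd, tag, &taglen);   /* cmac_read clamps taglen to the block length */
  gcry_mac_close (hd);
}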
diff --git a/comm/third_party/libgcrypt/cipher/mac-gmac.c b/comm/third_party/libgcrypt/cipher/mac-gmac.c
new file mode 100644
index 0000000000..e04c6d1ef0
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/mac-gmac.c
@@ -0,0 +1,187 @@
+/* mac-gmac.c - GMAC glue for MAC API
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "./mac-internal.h"
+
+
+static int
+map_mac_algo_to_cipher (int mac_algo)
+{
+ switch (mac_algo)
+ {
+ default:
+ return GCRY_CIPHER_NONE;
+ case GCRY_MAC_GMAC_AES:
+ return GCRY_CIPHER_AES;
+ case GCRY_MAC_GMAC_CAMELLIA:
+ return GCRY_CIPHER_CAMELLIA128;
+ case GCRY_MAC_GMAC_TWOFISH:
+ return GCRY_CIPHER_TWOFISH;
+ case GCRY_MAC_GMAC_SERPENT:
+ return GCRY_CIPHER_SERPENT128;
+ case GCRY_MAC_GMAC_SEED:
+ return GCRY_CIPHER_SEED;
+ }
+}
+
+
+static gcry_err_code_t
+gmac_open (gcry_mac_hd_t h)
+{
+ gcry_err_code_t err;
+ gcry_cipher_hd_t hd;
+ int secure = (h->magic == CTX_MAC_MAGIC_SECURE);
+ int cipher_algo;
+ unsigned int flags;
+
+ cipher_algo = map_mac_algo_to_cipher (h->spec->algo);
+ flags = (secure ? GCRY_CIPHER_SECURE : 0);
+
+ err = _gcry_cipher_open_internal (&hd, cipher_algo, GCRY_CIPHER_MODE_GCM,
+ flags);
+ if (err)
+ return err;
+
+ h->u.gmac.cipher_algo = cipher_algo;
+ h->u.gmac.ctx = hd;
+ return 0;
+}
+
+
+static void
+gmac_close (gcry_mac_hd_t h)
+{
+ _gcry_cipher_close (h->u.gmac.ctx);
+ h->u.gmac.ctx = NULL;
+}
+
+
+static gcry_err_code_t
+gmac_setkey (gcry_mac_hd_t h, const unsigned char *key, size_t keylen)
+{
+ return _gcry_cipher_setkey (h->u.gmac.ctx, key, keylen);
+}
+
+
+static gcry_err_code_t
+gmac_setiv (gcry_mac_hd_t h, const unsigned char *iv, size_t ivlen)
+{
+ return _gcry_cipher_setiv (h->u.gmac.ctx, iv, ivlen);
+}
+
+
+static gcry_err_code_t
+gmac_reset (gcry_mac_hd_t h)
+{
+ return _gcry_cipher_reset (h->u.gmac.ctx);
+}
+
+
+static gcry_err_code_t
+gmac_write (gcry_mac_hd_t h, const unsigned char *buf, size_t buflen)
+{
+ return _gcry_cipher_authenticate (h->u.gmac.ctx, buf, buflen);
+}
+
+
+static gcry_err_code_t
+gmac_read (gcry_mac_hd_t h, unsigned char *outbuf, size_t * outlen)
+{
+ if (*outlen > GCRY_GCM_BLOCK_LEN)
+ *outlen = GCRY_GCM_BLOCK_LEN;
+ return _gcry_cipher_gettag (h->u.gmac.ctx, outbuf, *outlen);
+}
+
+
+static gcry_err_code_t
+gmac_verify (gcry_mac_hd_t h, const unsigned char *buf, size_t buflen)
+{
+ return _gcry_cipher_checktag (h->u.gmac.ctx, buf, buflen);
+}
+
+
+static unsigned int
+gmac_get_maclen (int algo)
+{
+ (void)algo;
+ return GCRY_GCM_BLOCK_LEN;
+}
+
+
+static unsigned int
+gmac_get_keylen (int algo)
+{
+ return _gcry_cipher_get_algo_keylen (map_mac_algo_to_cipher (algo));
+}
+
+
+static gcry_mac_spec_ops_t gmac_ops = {
+ gmac_open,
+ gmac_close,
+ gmac_setkey,
+ gmac_setiv,
+ gmac_reset,
+ gmac_write,
+ gmac_read,
+ gmac_verify,
+ gmac_get_maclen,
+ gmac_get_keylen,
+ NULL,
+ NULL
+};
+
+
+#if USE_AES
+gcry_mac_spec_t _gcry_mac_type_spec_gmac_aes = {
+ GCRY_MAC_GMAC_AES, {0, 1}, "GMAC_AES",
+ &gmac_ops
+};
+#endif
+#if USE_TWOFISH
+gcry_mac_spec_t _gcry_mac_type_spec_gmac_twofish = {
+ GCRY_MAC_GMAC_TWOFISH, {0, 0}, "GMAC_TWOFISH",
+ &gmac_ops
+};
+#endif
+#if USE_SERPENT
+gcry_mac_spec_t _gcry_mac_type_spec_gmac_serpent = {
+ GCRY_MAC_GMAC_SERPENT, {0, 0}, "GMAC_SERPENT",
+ &gmac_ops
+};
+#endif
+#if USE_SEED
+gcry_mac_spec_t _gcry_mac_type_spec_gmac_seed = {
+ GCRY_MAC_GMAC_SEED, {0, 0}, "GMAC_SEED",
+ &gmac_ops
+};
+#endif
+#if USE_CAMELLIA
+gcry_mac_spec_t _gcry_mac_type_spec_gmac_camellia = {
+ GCRY_MAC_GMAC_CAMELLIA, {0, 0}, "GMAC_CAMELLIA",
+ &gmac_ops
+};
+#endif
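Unlike CMAC, GMAC additionally needs a nonce, supplied through the setiv operation wired up above before any data is written. A minimal usage sketch for GMAC-AES through the public API; the function name gmac_example is illustrative and error handling is abbreviated:

#include <gcrypt.h>

static void
gmac_example (const void *key, size_t keylen,
              const unsigned char nonce[12],
              const void *data, size_t datalen)
{
  gcry_mac_hd_t hd;
  unsigned char tag[GCRY_GCM_BLOCK_LEN];   /* 16-byte tag */
  size_t taglen = sizeof tag;

  gcry_mac_open (&hd, GCRY_MAC_GMAC_AES, 0, NULL);
  gcry_mac_setkey (hd, key, keylen);
  gcry_mac_setiv (hd, nonce, 12);          /* GCM nonce, typically 96 bits */
  gcry_mac_write (hd, data, datalen);
  gcry_mac_read (hd, tag, &taglen);
  gcry_mac_close (hd);
}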
diff --git a/comm/third_party/libgcrypt/cipher/mac-hmac.c b/comm/third_party/libgcrypt/cipher/mac-hmac.c
new file mode 100644
index 0000000000..4e10dd2c9e
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/mac-hmac.c
@@ -0,0 +1,1495 @@
+/* mac-hmac.c - HMAC glue for MAC API
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "./mac-internal.h"
+#include "bufhelp.h"
+#include "cipher.h"
+#include "hmac256.h"
+
+
+static int
+map_mac_algo_to_md (int mac_algo)
+{
+ switch (mac_algo)
+ {
+ default:
+ return GCRY_MD_NONE;
+ case GCRY_MAC_HMAC_MD2:
+ return GCRY_MD_MD2;
+ case GCRY_MAC_HMAC_MD4:
+ return GCRY_MD_MD4;
+ case GCRY_MAC_HMAC_MD5:
+ return GCRY_MD_MD5;
+ case GCRY_MAC_HMAC_SHA1:
+ return GCRY_MD_SHA1;
+ case GCRY_MAC_HMAC_SHA224:
+ return GCRY_MD_SHA224;
+ case GCRY_MAC_HMAC_SHA256:
+ return GCRY_MD_SHA256;
+ case GCRY_MAC_HMAC_SHA384:
+ return GCRY_MD_SHA384;
+ case GCRY_MAC_HMAC_SHA512:
+ return GCRY_MD_SHA512;
+ case GCRY_MAC_HMAC_SHA512_256:
+ return GCRY_MD_SHA512_256;
+ case GCRY_MAC_HMAC_SHA512_224:
+ return GCRY_MD_SHA512_224;
+ case GCRY_MAC_HMAC_SHA3_224:
+ return GCRY_MD_SHA3_224;
+ case GCRY_MAC_HMAC_SHA3_256:
+ return GCRY_MD_SHA3_256;
+ case GCRY_MAC_HMAC_SHA3_384:
+ return GCRY_MD_SHA3_384;
+ case GCRY_MAC_HMAC_SHA3_512:
+ return GCRY_MD_SHA3_512;
+ case GCRY_MAC_HMAC_RMD160:
+ return GCRY_MD_RMD160;
+ case GCRY_MAC_HMAC_TIGER1:
+ return GCRY_MD_TIGER1;
+ case GCRY_MAC_HMAC_WHIRLPOOL:
+ return GCRY_MD_WHIRLPOOL;
+ case GCRY_MAC_HMAC_GOSTR3411_94:
+ return GCRY_MD_GOSTR3411_94;
+ case GCRY_MAC_HMAC_GOSTR3411_CP:
+ return GCRY_MD_GOSTR3411_CP;
+ case GCRY_MAC_HMAC_STRIBOG256:
+ return GCRY_MD_STRIBOG256;
+ case GCRY_MAC_HMAC_STRIBOG512:
+ return GCRY_MD_STRIBOG512;
+ case GCRY_MAC_HMAC_BLAKE2B_512:
+ return GCRY_MD_BLAKE2B_512;
+ case GCRY_MAC_HMAC_BLAKE2B_384:
+ return GCRY_MD_BLAKE2B_384;
+ case GCRY_MAC_HMAC_BLAKE2B_256:
+ return GCRY_MD_BLAKE2B_256;
+ case GCRY_MAC_HMAC_BLAKE2B_160:
+ return GCRY_MD_BLAKE2B_160;
+ case GCRY_MAC_HMAC_BLAKE2S_256:
+ return GCRY_MD_BLAKE2S_256;
+ case GCRY_MAC_HMAC_BLAKE2S_224:
+ return GCRY_MD_BLAKE2S_224;
+ case GCRY_MAC_HMAC_BLAKE2S_160:
+ return GCRY_MD_BLAKE2S_160;
+ case GCRY_MAC_HMAC_BLAKE2S_128:
+ return GCRY_MD_BLAKE2S_128;
+ case GCRY_MAC_HMAC_SM3:
+ return GCRY_MD_SM3;
+ }
+}
+
+
+static gcry_err_code_t
+hmac_open (gcry_mac_hd_t h)
+{
+ gcry_err_code_t err;
+ gcry_md_hd_t hd;
+ int secure = (h->magic == CTX_MAC_MAGIC_SECURE);
+ unsigned int flags;
+ int md_algo;
+
+ md_algo = map_mac_algo_to_md (h->spec->algo);
+
+ flags = GCRY_MD_FLAG_HMAC;
+ flags |= (secure ? GCRY_MD_FLAG_SECURE : 0);
+
+ err = _gcry_md_open (&hd, md_algo, flags);
+ if (err)
+ return err;
+
+ h->u.hmac.md_algo = md_algo;
+ h->u.hmac.md_ctx = hd;
+ return 0;
+}
+
+
+static void
+hmac_close (gcry_mac_hd_t h)
+{
+ _gcry_md_close (h->u.hmac.md_ctx);
+ h->u.hmac.md_ctx = NULL;
+}
+
+
+static gcry_err_code_t
+hmac_setkey (gcry_mac_hd_t h, const unsigned char *key, size_t keylen)
+{
+ return _gcry_md_setkey (h->u.hmac.md_ctx, key, keylen);
+}
+
+
+static gcry_err_code_t
+hmac_reset (gcry_mac_hd_t h)
+{
+ _gcry_md_reset (h->u.hmac.md_ctx);
+ return 0;
+}
+
+
+static gcry_err_code_t
+hmac_write (gcry_mac_hd_t h, const unsigned char *buf, size_t buflen)
+{
+ _gcry_md_write (h->u.hmac.md_ctx, buf, buflen);
+ return 0;
+}
+
+
+static gcry_err_code_t
+hmac_read (gcry_mac_hd_t h, unsigned char *outbuf, size_t * outlen)
+{
+ unsigned int dlen;
+ const unsigned char *digest;
+
+ dlen = _gcry_md_get_algo_dlen (h->u.hmac.md_algo);
+ digest = _gcry_md_read (h->u.hmac.md_ctx, h->u.hmac.md_algo);
+
+ if (*outlen <= dlen)
+ buf_cpy (outbuf, digest, *outlen);
+ else
+ {
+ buf_cpy (outbuf, digest, dlen);
+ *outlen = dlen;
+ }
+
+ return 0;
+}
+
+
+static gcry_err_code_t
+hmac_verify (gcry_mac_hd_t h, const unsigned char *buf, size_t buflen)
+{
+ unsigned int dlen;
+ const unsigned char *digest;
+
+ dlen = _gcry_md_get_algo_dlen (h->u.hmac.md_algo);
+ digest = _gcry_md_read (h->u.hmac.md_ctx, h->u.hmac.md_algo);
+
+ if (buflen > dlen)
+ return GPG_ERR_INV_LENGTH;
+
+ return buf_eq_const (buf, digest, buflen) ? 0 : GPG_ERR_CHECKSUM;
+}
+
+
+static unsigned int
+hmac_get_maclen (int algo)
+{
+ return _gcry_md_get_algo_dlen (map_mac_algo_to_md (algo));
+}
+
+
+static unsigned int
+hmac_get_keylen (int algo)
+{
+ /* Return blocksize for default key length. */
+ switch (algo)
+ {
+ case GCRY_MD_SHA3_224:
+ return 1152 / 8;
+ case GCRY_MD_SHA3_256:
+ return 1088 / 8;
+ case GCRY_MD_SHA3_384:
+ return 832 / 8;
+ case GCRY_MD_SHA3_512:
+ return 576 / 8;
+ case GCRY_MAC_HMAC_SHA384:
+ case GCRY_MAC_HMAC_SHA512:
+ return 128;
+ case GCRY_MAC_HMAC_GOSTR3411_94:
+ return 32;
+ default:
+ return 64;
+ }
+}
+
+
+/* Check one HMAC with digest ALGO using the regular HMAC
+ * API. (DATA,DATALEN) is the data to be MACed, (KEY,KEYLEN) the key
+ * and (EXPECT,EXPECTLEN) the expected result. If TRUNC is set, the
+ * EXPECTLEN may be less than the digest length. Returns NULL on
+ * success or a string describing the failure. */
+static const char *
+check_one (int algo,
+ const void *data, size_t datalen,
+ const void *key, size_t keylen,
+ const void *expect, size_t expectlen, int trunc)
+{
+ gcry_md_hd_t hd;
+ const unsigned char *digest;
+
+/* printf ("HMAC algo %d\n", algo); */
+ if (trunc)
+ {
+ if (_gcry_md_get_algo_dlen (algo) < expectlen)
+ return "invalid tests data";
+ }
+ else
+ {
+ if (_gcry_md_get_algo_dlen (algo) != expectlen)
+ return "invalid tests data";
+ }
+ if (_gcry_md_open (&hd, algo, GCRY_MD_FLAG_HMAC))
+ return "gcry_md_open failed";
+ if (_gcry_md_setkey (hd, key, keylen))
+ {
+ _gcry_md_close (hd);
+ return "gcry_md_setkey failed";
+ }
+ _gcry_md_write (hd, data, datalen);
+ digest = _gcry_md_read (hd, algo);
+ if (!digest)
+ {
+ _gcry_md_close (hd);
+ return "gcry_md_read failed";
+ }
+ if (memcmp (digest, expect, expectlen))
+ {
+/* int i; */
+
+/* fputs (" {", stdout); */
+/* for (i=0; i < expectlen-1; i++) */
+/* { */
+/* if (i && !(i % 8)) */
+/* fputs ("\n ", stdout); */
+/* printf (" 0x%02x,", digest[i]); */
+/* } */
+/* printf (" 0x%02x } },\n", digest[i]); */
+
+ _gcry_md_close (hd);
+ return "does not match";
+ }
+ _gcry_md_close (hd);
+ return NULL;
+}
+
+
+static gpg_err_code_t
+selftests_sha1 (int extended, selftest_report_func_t report)
+{
+ const char *what;
+ const char *errtxt;
+ unsigned char key[128];
+ int i, j;
+
+ what = "FIPS-198a, A.1";
+ for (i=0; i < 64; i++)
+ key[i] = i;
+ errtxt = check_one (GCRY_MD_SHA1,
+ "Sample #1", 9,
+ key, 64,
+ "\x4f\x4c\xa3\xd5\xd6\x8b\xa7\xcc\x0a\x12"
+ "\x08\xc9\xc6\x1e\x9c\x5d\xa0\x40\x3c\x0a", 20, 0);
+ if (errtxt)
+ goto failed;
+
+ if (extended)
+ {
+ what = "FIPS-198a, A.2";
+ for (i=0, j=0x30; i < 20; i++)
+ key[i] = j++;
+ errtxt = check_one (GCRY_MD_SHA1,
+ "Sample #2", 9,
+ key, 20,
+ "\x09\x22\xd3\x40\x5f\xaa\x3d\x19\x4f\x82"
+ "\xa4\x58\x30\x73\x7d\x5c\xc6\xc7\x5d\x24", 20, 0);
+ if (errtxt)
+ goto failed;
+
+ what = "FIPS-198a, A.3";
+ for (i=0, j=0x50; i < 100; i++)
+ key[i] = j++;
+ errtxt = check_one (GCRY_MD_SHA1,
+ "Sample #3", 9,
+ key, 100,
+ "\xbc\xf4\x1e\xab\x8b\xb2\xd8\x02\xf3\xd0"
+ "\x5c\xaf\x7c\xb0\x92\xec\xf8\xd1\xa3\xaa", 20, 0);
+ if (errtxt)
+ goto failed;
+
+ what = "FIPS-198a, A.4";
+ for (i=0, j=0x70; i < 49; i++)
+ key[i] = j++;
+ errtxt = check_one (GCRY_MD_SHA1,
+ "Sample #4", 9,
+ key, 49,
+ "\x9e\xa8\x86\xef\xe2\x68\xdb\xec\xce\x42"
+ "\x0c\x75\x24\xdf\x32\xe0\x75\x1a\x2a\x26", 20, 0);
+ if (errtxt)
+ goto failed;
+ }
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("hmac", GCRY_MD_SHA1, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+
+static gpg_err_code_t
+selftests_sha224 (int extended, selftest_report_func_t report)
+{
+ static struct
+ {
+ const char * const desc;
+ const char * const data;
+ const char * const key;
+ const char expect[28];
+ } tv[] =
+ {
+ { "data-28 key-4",
+ "what do ya want for nothing?",
+ "Jefe",
+ { 0xa3, 0x0e, 0x01, 0x09, 0x8b, 0xc6, 0xdb, 0xbf,
+ 0x45, 0x69, 0x0f, 0x3a, 0x7e, 0x9e, 0x6d, 0x0f,
+ 0x8b, 0xbe, 0xa2, 0xa3, 0x9e, 0x61, 0x48, 0x00,
+ 0x8f, 0xd0, 0x5e, 0x44 } },
+
+ { "data-9 key-20",
+ "Hi There",
+ "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
+ "\x0b\x0b\x0b\x0b",
+ { 0x89, 0x6f, 0xb1, 0x12, 0x8a, 0xbb, 0xdf, 0x19,
+ 0x68, 0x32, 0x10, 0x7c, 0xd4, 0x9d, 0xf3, 0x3f,
+ 0x47, 0xb4, 0xb1, 0x16, 0x99, 0x12, 0xba, 0x4f,
+ 0x53, 0x68, 0x4b, 0x22 } },
+
+ { "data-50 key-20",
+ "\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd"
+ "\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd"
+ "\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd"
+ "\xdd\xdd",
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa",
+ { 0x7f, 0xb3, 0xcb, 0x35, 0x88, 0xc6, 0xc1, 0xf6,
+ 0xff, 0xa9, 0x69, 0x4d, 0x7d, 0x6a, 0xd2, 0x64,
+ 0x93, 0x65, 0xb0, 0xc1, 0xf6, 0x5d, 0x69, 0xd1,
+ 0xec, 0x83, 0x33, 0xea } },
+
+ { "data-50 key-26",
+ "\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd"
+ "\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd"
+ "\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd"
+ "\xcd\xcd",
+ "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10"
+ "\x11\x12\x13\x14\x15\x16\x17\x18\x19",
+ { 0x6c, 0x11, 0x50, 0x68, 0x74, 0x01, 0x3c, 0xac,
+ 0x6a, 0x2a, 0xbc, 0x1b, 0xb3, 0x82, 0x62, 0x7c,
+ 0xec, 0x6a, 0x90, 0xd8, 0x6e, 0xfc, 0x01, 0x2d,
+ 0xe7, 0xaf, 0xec, 0x5a } },
+
+ { "data-54 key-131",
+ "Test Using Larger Than Block-Size Key - Hash Key First",
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa",
+ { 0x95, 0xe9, 0xa0, 0xdb, 0x96, 0x20, 0x95, 0xad,
+ 0xae, 0xbe, 0x9b, 0x2d, 0x6f, 0x0d, 0xbc, 0xe2,
+ 0xd4, 0x99, 0xf1, 0x12, 0xf2, 0xd2, 0xb7, 0x27,
+ 0x3f, 0xa6, 0x87, 0x0e } },
+
+ { "data-152 key-131",
+ "This is a test using a larger than block-size key and a larger "
+ "than block-size data. The key needs to be hashed before being "
+ "used by the HMAC algorithm.",
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa",
+ { 0x3a, 0x85, 0x41, 0x66, 0xac, 0x5d, 0x9f, 0x02,
+ 0x3f, 0x54, 0xd5, 0x17, 0xd0, 0xb3, 0x9d, 0xbd,
+ 0x94, 0x67, 0x70, 0xdb, 0x9c, 0x2b, 0x95, 0xc9,
+ 0xf6, 0xf5, 0x65, 0xd1 } },
+
+ { NULL }
+ };
+ const char *what;
+ const char *errtxt;
+ int tvidx;
+
+ for (tvidx=0; tv[tvidx].desc; tvidx++)
+ {
+ what = tv[tvidx].desc;
+ errtxt = check_one (GCRY_MD_SHA224,
+ tv[tvidx].data, strlen (tv[tvidx].data),
+ tv[tvidx].key, strlen (tv[tvidx].key),
+ tv[tvidx].expect, DIM (tv[tvidx].expect), 0);
+ if (errtxt)
+ goto failed;
+ if (!extended)
+ break;
+ }
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("hmac", GCRY_MD_SHA224, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+static gpg_err_code_t
+selftests_sha256 (int extended, selftest_report_func_t report)
+{
+ static struct
+ {
+ const char * const desc;
+ const char * const data;
+ const char * const key;
+ const char expect[32];
+ } tv[] =
+ {
+ { "data-28 key-4",
+ "what do ya want for nothing?",
+ "Jefe",
+ { 0x5b, 0xdc, 0xc1, 0x46, 0xbf, 0x60, 0x75, 0x4e,
+ 0x6a, 0x04, 0x24, 0x26, 0x08, 0x95, 0x75, 0xc7,
+ 0x5a, 0x00, 0x3f, 0x08, 0x9d, 0x27, 0x39, 0x83,
+ 0x9d, 0xec, 0x58, 0xb9, 0x64, 0xec, 0x38, 0x43 } },
+
+ { "data-9 key-20",
+ "Hi There",
+ "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
+ "\x0b\x0b\x0b\x0b",
+ { 0xb0, 0x34, 0x4c, 0x61, 0xd8, 0xdb, 0x38, 0x53,
+ 0x5c, 0xa8, 0xaf, 0xce, 0xaf, 0x0b, 0xf1, 0x2b,
+ 0x88, 0x1d, 0xc2, 0x00, 0xc9, 0x83, 0x3d, 0xa7,
+ 0x26, 0xe9, 0x37, 0x6c, 0x2e, 0x32, 0xcf, 0xf7 } },
+
+ { "data-50 key-20",
+ "\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd"
+ "\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd"
+ "\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd"
+ "\xdd\xdd",
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa",
+ { 0x77, 0x3e, 0xa9, 0x1e, 0x36, 0x80, 0x0e, 0x46,
+ 0x85, 0x4d, 0xb8, 0xeb, 0xd0, 0x91, 0x81, 0xa7,
+ 0x29, 0x59, 0x09, 0x8b, 0x3e, 0xf8, 0xc1, 0x22,
+ 0xd9, 0x63, 0x55, 0x14, 0xce, 0xd5, 0x65, 0xfe } },
+
+ { "data-50 key-26",
+ "\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd"
+ "\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd"
+ "\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd"
+ "\xcd\xcd",
+ "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10"
+ "\x11\x12\x13\x14\x15\x16\x17\x18\x19",
+ { 0x82, 0x55, 0x8a, 0x38, 0x9a, 0x44, 0x3c, 0x0e,
+ 0xa4, 0xcc, 0x81, 0x98, 0x99, 0xf2, 0x08, 0x3a,
+ 0x85, 0xf0, 0xfa, 0xa3, 0xe5, 0x78, 0xf8, 0x07,
+ 0x7a, 0x2e, 0x3f, 0xf4, 0x67, 0x29, 0x66, 0x5b } },
+
+ { "data-54 key-131",
+ "Test Using Larger Than Block-Size Key - Hash Key First",
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa",
+ { 0x60, 0xe4, 0x31, 0x59, 0x1e, 0xe0, 0xb6, 0x7f,
+ 0x0d, 0x8a, 0x26, 0xaa, 0xcb, 0xf5, 0xb7, 0x7f,
+ 0x8e, 0x0b, 0xc6, 0x21, 0x37, 0x28, 0xc5, 0x14,
+ 0x05, 0x46, 0x04, 0x0f, 0x0e, 0xe3, 0x7f, 0x54 } },
+
+ { "data-152 key-131",
+ "This is a test using a larger than block-size key and a larger "
+ "than block-size data. The key needs to be hashed before being "
+ "used by the HMAC algorithm.",
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa",
+ { 0x9b, 0x09, 0xff, 0xa7, 0x1b, 0x94, 0x2f, 0xcb,
+ 0x27, 0x63, 0x5f, 0xbc, 0xd5, 0xb0, 0xe9, 0x44,
+ 0xbf, 0xdc, 0x63, 0x64, 0x4f, 0x07, 0x13, 0x93,
+ 0x8a, 0x7f, 0x51, 0x53, 0x5c, 0x3a, 0x35, 0xe2 } },
+
+ { NULL }
+ };
+ const char *what;
+ const char *errtxt;
+ int tvidx;
+
+ for (tvidx=0; tv[tvidx].desc; tvidx++)
+ {
+ hmac256_context_t hmachd;
+ const unsigned char *digest;
+ size_t dlen;
+
+ what = tv[tvidx].desc;
+ errtxt = check_one (GCRY_MD_SHA256,
+ tv[tvidx].data, strlen (tv[tvidx].data),
+ tv[tvidx].key, strlen (tv[tvidx].key),
+ tv[tvidx].expect, DIM (tv[tvidx].expect), 0);
+ if (errtxt)
+ goto failed;
+
+ hmachd = _gcry_hmac256_new (tv[tvidx].key, strlen (tv[tvidx].key));
+ if (!hmachd)
+ {
+ errtxt = "_gcry_hmac256_new failed";
+ goto failed;
+ }
+ _gcry_hmac256_update (hmachd, tv[tvidx].data, strlen (tv[tvidx].data));
+ digest = _gcry_hmac256_finalize (hmachd, &dlen);
+ if (!digest)
+ {
+ errtxt = "_gcry_hmac256_finalize failed";
+ _gcry_hmac256_release (hmachd);
+ goto failed;
+ }
+ if (dlen != DIM (tv[tvidx].expect)
+ || memcmp (digest, tv[tvidx].expect, DIM (tv[tvidx].expect)))
+ {
+ errtxt = "does not match in second implementation";
+ _gcry_hmac256_release (hmachd);
+ goto failed;
+ }
+ _gcry_hmac256_release (hmachd);
+
+ if (!extended)
+ break;
+ }
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("hmac", GCRY_MD_SHA256, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+static gpg_err_code_t
+selftests_sha384 (int extended, selftest_report_func_t report)
+{
+ static struct
+ {
+ const char * const desc;
+ const char * const data;
+ const char * const key;
+ const char expect[48];
+ } tv[] =
+ {
+ { "data-28 key-4",
+ "what do ya want for nothing?",
+ "Jefe",
+ { 0xaf, 0x45, 0xd2, 0xe3, 0x76, 0x48, 0x40, 0x31,
+ 0x61, 0x7f, 0x78, 0xd2, 0xb5, 0x8a, 0x6b, 0x1b,
+ 0x9c, 0x7e, 0xf4, 0x64, 0xf5, 0xa0, 0x1b, 0x47,
+ 0xe4, 0x2e, 0xc3, 0x73, 0x63, 0x22, 0x44, 0x5e,
+ 0x8e, 0x22, 0x40, 0xca, 0x5e, 0x69, 0xe2, 0xc7,
+ 0x8b, 0x32, 0x39, 0xec, 0xfa, 0xb2, 0x16, 0x49 } },
+
+ { "data-9 key-20",
+ "Hi There",
+ "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
+ "\x0b\x0b\x0b\x0b",
+ { 0xaf, 0xd0, 0x39, 0x44, 0xd8, 0x48, 0x95, 0x62,
+ 0x6b, 0x08, 0x25, 0xf4, 0xab, 0x46, 0x90, 0x7f,
+ 0x15, 0xf9, 0xda, 0xdb, 0xe4, 0x10, 0x1e, 0xc6,
+ 0x82, 0xaa, 0x03, 0x4c, 0x7c, 0xeb, 0xc5, 0x9c,
+ 0xfa, 0xea, 0x9e, 0xa9, 0x07, 0x6e, 0xde, 0x7f,
+ 0x4a, 0xf1, 0x52, 0xe8, 0xb2, 0xfa, 0x9c, 0xb6 } },
+
+ { "data-50 key-20",
+ "\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd"
+ "\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd"
+ "\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd"
+ "\xdd\xdd",
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa",
+ { 0x88, 0x06, 0x26, 0x08, 0xd3, 0xe6, 0xad, 0x8a,
+ 0x0a, 0xa2, 0xac, 0xe0, 0x14, 0xc8, 0xa8, 0x6f,
+ 0x0a, 0xa6, 0x35, 0xd9, 0x47, 0xac, 0x9f, 0xeb,
+ 0xe8, 0x3e, 0xf4, 0xe5, 0x59, 0x66, 0x14, 0x4b,
+ 0x2a, 0x5a, 0xb3, 0x9d, 0xc1, 0x38, 0x14, 0xb9,
+ 0x4e, 0x3a, 0xb6, 0xe1, 0x01, 0xa3, 0x4f, 0x27 } },
+
+ { "data-50 key-26",
+ "\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd"
+ "\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd"
+ "\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd"
+ "\xcd\xcd",
+ "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10"
+ "\x11\x12\x13\x14\x15\x16\x17\x18\x19",
+ { 0x3e, 0x8a, 0x69, 0xb7, 0x78, 0x3c, 0x25, 0x85,
+ 0x19, 0x33, 0xab, 0x62, 0x90, 0xaf, 0x6c, 0xa7,
+ 0x7a, 0x99, 0x81, 0x48, 0x08, 0x50, 0x00, 0x9c,
+ 0xc5, 0x57, 0x7c, 0x6e, 0x1f, 0x57, 0x3b, 0x4e,
+ 0x68, 0x01, 0xdd, 0x23, 0xc4, 0xa7, 0xd6, 0x79,
+ 0xcc, 0xf8, 0xa3, 0x86, 0xc6, 0x74, 0xcf, 0xfb } },
+
+ { "data-54 key-131",
+ "Test Using Larger Than Block-Size Key - Hash Key First",
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa",
+ { 0x4e, 0xce, 0x08, 0x44, 0x85, 0x81, 0x3e, 0x90,
+ 0x88, 0xd2, 0xc6, 0x3a, 0x04, 0x1b, 0xc5, 0xb4,
+ 0x4f, 0x9e, 0xf1, 0x01, 0x2a, 0x2b, 0x58, 0x8f,
+ 0x3c, 0xd1, 0x1f, 0x05, 0x03, 0x3a, 0xc4, 0xc6,
+ 0x0c, 0x2e, 0xf6, 0xab, 0x40, 0x30, 0xfe, 0x82,
+ 0x96, 0x24, 0x8d, 0xf1, 0x63, 0xf4, 0x49, 0x52 } },
+
+ { "data-152 key-131",
+ "This is a test using a larger than block-size key and a larger "
+ "than block-size data. The key needs to be hashed before being "
+ "used by the HMAC algorithm.",
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa",
+ { 0x66, 0x17, 0x17, 0x8e, 0x94, 0x1f, 0x02, 0x0d,
+ 0x35, 0x1e, 0x2f, 0x25, 0x4e, 0x8f, 0xd3, 0x2c,
+ 0x60, 0x24, 0x20, 0xfe, 0xb0, 0xb8, 0xfb, 0x9a,
+ 0xdc, 0xce, 0xbb, 0x82, 0x46, 0x1e, 0x99, 0xc5,
+ 0xa6, 0x78, 0xcc, 0x31, 0xe7, 0x99, 0x17, 0x6d,
+ 0x38, 0x60, 0xe6, 0x11, 0x0c, 0x46, 0x52, 0x3e } },
+
+ { NULL }
+ };
+ const char *what;
+ const char *errtxt;
+ int tvidx;
+
+ for (tvidx=0; tv[tvidx].desc; tvidx++)
+ {
+ what = tv[tvidx].desc;
+ errtxt = check_one (GCRY_MD_SHA384,
+ tv[tvidx].data, strlen (tv[tvidx].data),
+ tv[tvidx].key, strlen (tv[tvidx].key),
+ tv[tvidx].expect, DIM (tv[tvidx].expect), 0);
+ if (errtxt)
+ goto failed;
+ if (!extended)
+ break;
+ }
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("hmac", GCRY_MD_SHA384, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+static gpg_err_code_t
+selftests_sha512 (int extended, selftest_report_func_t report)
+{
+ static struct
+ {
+ const char * const desc;
+ const char * const data;
+ const char * const key;
+ const char expect[64];
+ } tv[] =
+ {
+ { "data-28 key-4",
+ "what do ya want for nothing?",
+ "Jefe",
+ { 0x16, 0x4b, 0x7a, 0x7b, 0xfc, 0xf8, 0x19, 0xe2,
+ 0xe3, 0x95, 0xfb, 0xe7, 0x3b, 0x56, 0xe0, 0xa3,
+ 0x87, 0xbd, 0x64, 0x22, 0x2e, 0x83, 0x1f, 0xd6,
+ 0x10, 0x27, 0x0c, 0xd7, 0xea, 0x25, 0x05, 0x54,
+ 0x97, 0x58, 0xbf, 0x75, 0xc0, 0x5a, 0x99, 0x4a,
+ 0x6d, 0x03, 0x4f, 0x65, 0xf8, 0xf0, 0xe6, 0xfd,
+ 0xca, 0xea, 0xb1, 0xa3, 0x4d, 0x4a, 0x6b, 0x4b,
+ 0x63, 0x6e, 0x07, 0x0a, 0x38, 0xbc, 0xe7, 0x37 } },
+
+ { "data-9 key-20",
+ "Hi There",
+ "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
+ "\x0b\x0b\x0b\x0b",
+ { 0x87, 0xaa, 0x7c, 0xde, 0xa5, 0xef, 0x61, 0x9d,
+ 0x4f, 0xf0, 0xb4, 0x24, 0x1a, 0x1d, 0x6c, 0xb0,
+ 0x23, 0x79, 0xf4, 0xe2, 0xce, 0x4e, 0xc2, 0x78,
+ 0x7a, 0xd0, 0xb3, 0x05, 0x45, 0xe1, 0x7c, 0xde,
+ 0xda, 0xa8, 0x33, 0xb7, 0xd6, 0xb8, 0xa7, 0x02,
+ 0x03, 0x8b, 0x27, 0x4e, 0xae, 0xa3, 0xf4, 0xe4,
+ 0xbe, 0x9d, 0x91, 0x4e, 0xeb, 0x61, 0xf1, 0x70,
+ 0x2e, 0x69, 0x6c, 0x20, 0x3a, 0x12, 0x68, 0x54 } },
+
+ { "data-50 key-20",
+ "\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd"
+ "\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd"
+ "\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd"
+ "\xdd\xdd",
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa",
+ { 0xfa, 0x73, 0xb0, 0x08, 0x9d, 0x56, 0xa2, 0x84,
+ 0xef, 0xb0, 0xf0, 0x75, 0x6c, 0x89, 0x0b, 0xe9,
+ 0xb1, 0xb5, 0xdb, 0xdd, 0x8e, 0xe8, 0x1a, 0x36,
+ 0x55, 0xf8, 0x3e, 0x33, 0xb2, 0x27, 0x9d, 0x39,
+ 0xbf, 0x3e, 0x84, 0x82, 0x79, 0xa7, 0x22, 0xc8,
+ 0x06, 0xb4, 0x85, 0xa4, 0x7e, 0x67, 0xc8, 0x07,
+ 0xb9, 0x46, 0xa3, 0x37, 0xbe, 0xe8, 0x94, 0x26,
+ 0x74, 0x27, 0x88, 0x59, 0xe1, 0x32, 0x92, 0xfb } },
+
+ { "data-50 key-26",
+ "\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd"
+ "\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd"
+ "\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd"
+ "\xcd\xcd",
+ "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10"
+ "\x11\x12\x13\x14\x15\x16\x17\x18\x19",
+ { 0xb0, 0xba, 0x46, 0x56, 0x37, 0x45, 0x8c, 0x69,
+ 0x90, 0xe5, 0xa8, 0xc5, 0xf6, 0x1d, 0x4a, 0xf7,
+ 0xe5, 0x76, 0xd9, 0x7f, 0xf9, 0x4b, 0x87, 0x2d,
+ 0xe7, 0x6f, 0x80, 0x50, 0x36, 0x1e, 0xe3, 0xdb,
+ 0xa9, 0x1c, 0xa5, 0xc1, 0x1a, 0xa2, 0x5e, 0xb4,
+ 0xd6, 0x79, 0x27, 0x5c, 0xc5, 0x78, 0x80, 0x63,
+ 0xa5, 0xf1, 0x97, 0x41, 0x12, 0x0c, 0x4f, 0x2d,
+ 0xe2, 0xad, 0xeb, 0xeb, 0x10, 0xa2, 0x98, 0xdd } },
+
+ { "data-54 key-131",
+ "Test Using Larger Than Block-Size Key - Hash Key First",
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa",
+ { 0x80, 0xb2, 0x42, 0x63, 0xc7, 0xc1, 0xa3, 0xeb,
+ 0xb7, 0x14, 0x93, 0xc1, 0xdd, 0x7b, 0xe8, 0xb4,
+ 0x9b, 0x46, 0xd1, 0xf4, 0x1b, 0x4a, 0xee, 0xc1,
+ 0x12, 0x1b, 0x01, 0x37, 0x83, 0xf8, 0xf3, 0x52,
+ 0x6b, 0x56, 0xd0, 0x37, 0xe0, 0x5f, 0x25, 0x98,
+ 0xbd, 0x0f, 0xd2, 0x21, 0x5d, 0x6a, 0x1e, 0x52,
+ 0x95, 0xe6, 0x4f, 0x73, 0xf6, 0x3f, 0x0a, 0xec,
+ 0x8b, 0x91, 0x5a, 0x98, 0x5d, 0x78, 0x65, 0x98 } },
+
+ { "data-152 key-131",
+ "This is a test using a larger than block-size key and a larger "
+ "than block-size data. The key needs to be hashed before being "
+ "used by the HMAC algorithm.",
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa",
+ { 0xe3, 0x7b, 0x6a, 0x77, 0x5d, 0xc8, 0x7d, 0xba,
+ 0xa4, 0xdf, 0xa9, 0xf9, 0x6e, 0x5e, 0x3f, 0xfd,
+ 0xde, 0xbd, 0x71, 0xf8, 0x86, 0x72, 0x89, 0x86,
+ 0x5d, 0xf5, 0xa3, 0x2d, 0x20, 0xcd, 0xc9, 0x44,
+ 0xb6, 0x02, 0x2c, 0xac, 0x3c, 0x49, 0x82, 0xb1,
+ 0x0d, 0x5e, 0xeb, 0x55, 0xc3, 0xe4, 0xde, 0x15,
+ 0x13, 0x46, 0x76, 0xfb, 0x6d, 0xe0, 0x44, 0x60,
+ 0x65, 0xc9, 0x74, 0x40, 0xfa, 0x8c, 0x6a, 0x58 } },
+
+ { NULL }
+ };
+ const char *what;
+ const char *errtxt;
+ int tvidx;
+
+ for (tvidx=0; tv[tvidx].desc; tvidx++)
+ {
+ what = tv[tvidx].desc;
+ errtxt = check_one (GCRY_MD_SHA512,
+ tv[tvidx].data, strlen (tv[tvidx].data),
+ tv[tvidx].key, strlen (tv[tvidx].key),
+ tv[tvidx].expect, DIM (tv[tvidx].expect), 0);
+ if (errtxt)
+ goto failed;
+ if (!extended)
+ break;
+ }
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("hmac", GCRY_MD_SHA512, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+
+/* Test for the SHA3 algorithms. Vectors taken on 2017-07-18 from
+ * http://www.wolfgang-ehrhardt.de/hmac-sha3-testvectors.html */
+static gpg_err_code_t
+selftests_sha3 (int hashalgo, int extended, selftest_report_func_t report)
+{
+ static struct
+ {
+ const char * const desc;
+ const char * const data;
+ const char * const key;
+ const char expect_224[28];
+ const char expect_256[32];
+ const char expect_384[48];
+ const char expect_512[64];
+ unsigned char trunc;
+ } tv[] =
+ {
+ { "data-9 key-20", /* Test 1 */
+ "Hi There",
+ "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
+ "\x0b\x0b\x0b\x0b",
+
+ { 0x3b, 0x16, 0x54, 0x6b, 0xbc, 0x7b, 0xe2, 0x70,
+ 0x6a, 0x03, 0x1d, 0xca, 0xfd, 0x56, 0x37, 0x3d,
+ 0x98, 0x84, 0x36, 0x76, 0x41, 0xd8, 0xc5, 0x9a,
+ 0xf3, 0xc8, 0x60, 0xf7 },
+ { 0xba, 0x85, 0x19, 0x23, 0x10, 0xdf, 0xfa, 0x96,
+ 0xe2, 0xa3, 0xa4, 0x0e, 0x69, 0x77, 0x43, 0x51,
+ 0x14, 0x0b, 0xb7, 0x18, 0x5e, 0x12, 0x02, 0xcd,
+ 0xcc, 0x91, 0x75, 0x89, 0xf9, 0x5e, 0x16, 0xbb },
+ { 0x68, 0xd2, 0xdc, 0xf7, 0xfd, 0x4d, 0xdd, 0x0a,
+ 0x22, 0x40, 0xc8, 0xa4, 0x37, 0x30, 0x5f, 0x61,
+ 0xfb, 0x73, 0x34, 0xcf, 0xb5, 0xd0, 0x22, 0x6e,
+ 0x1b, 0xc2, 0x7d, 0xc1, 0x0a, 0x2e, 0x72, 0x3a,
+ 0x20, 0xd3, 0x70, 0xb4, 0x77, 0x43, 0x13, 0x0e,
+ 0x26, 0xac, 0x7e, 0x3d, 0x53, 0x28, 0x86, 0xbd },
+ { 0xeb, 0x3f, 0xbd, 0x4b, 0x2e, 0xaa, 0xb8, 0xf5,
+ 0xc5, 0x04, 0xbd, 0x3a, 0x41, 0x46, 0x5a, 0xac,
+ 0xec, 0x15, 0x77, 0x0a, 0x7c, 0xab, 0xac, 0x53,
+ 0x1e, 0x48, 0x2f, 0x86, 0x0b, 0x5e, 0xc7, 0xba,
+ 0x47, 0xcc, 0xb2, 0xc6, 0xf2, 0xaf, 0xce, 0x8f,
+ 0x88, 0xd2, 0x2b, 0x6d, 0xc6, 0x13, 0x80, 0xf2,
+ 0x3a, 0x66, 0x8f, 0xd3, 0x88, 0x8b, 0xb8, 0x05,
+ 0x37, 0xc0, 0xa0, 0xb8, 0x64, 0x07, 0x68, 0x9e }
+ },
+
+ { "data-28 key-4", /* Test 2 */
+ /* Test with a key shorter than the length of the HMAC output. */
+ "what do ya want for nothing?",
+ "Jefe",
+
+ { 0x7f, 0xdb, 0x8d, 0xd8, 0x8b, 0xd2, 0xf6, 0x0d,
+ 0x1b, 0x79, 0x86, 0x34, 0xad, 0x38, 0x68, 0x11,
+ 0xc2, 0xcf, 0xc8, 0x5b, 0xfa, 0xf5, 0xd5, 0x2b,
+ 0xba, 0xce, 0x5e, 0x66 },
+ { 0xc7, 0xd4, 0x07, 0x2e, 0x78, 0x88, 0x77, 0xae,
+ 0x35, 0x96, 0xbb, 0xb0, 0xda, 0x73, 0xb8, 0x87,
+ 0xc9, 0x17, 0x1f, 0x93, 0x09, 0x5b, 0x29, 0x4a,
+ 0xe8, 0x57, 0xfb, 0xe2, 0x64, 0x5e, 0x1b, 0xa5 },
+ { 0xf1, 0x10, 0x1f, 0x8c, 0xbf, 0x97, 0x66, 0xfd,
+ 0x67, 0x64, 0xd2, 0xed, 0x61, 0x90, 0x3f, 0x21,
+ 0xca, 0x9b, 0x18, 0xf5, 0x7c, 0xf3, 0xe1, 0xa2,
+ 0x3c, 0xa1, 0x35, 0x08, 0xa9, 0x32, 0x43, 0xce,
+ 0x48, 0xc0, 0x45, 0xdc, 0x00, 0x7f, 0x26, 0xa2,
+ 0x1b, 0x3f, 0x5e, 0x0e, 0x9d, 0xf4, 0xc2, 0x0a },
+ { 0x5a, 0x4b, 0xfe, 0xab, 0x61, 0x66, 0x42, 0x7c,
+ 0x7a, 0x36, 0x47, 0xb7, 0x47, 0x29, 0x2b, 0x83,
+ 0x84, 0x53, 0x7c, 0xdb, 0x89, 0xaf, 0xb3, 0xbf,
+ 0x56, 0x65, 0xe4, 0xc5, 0xe7, 0x09, 0x35, 0x0b,
+ 0x28, 0x7b, 0xae, 0xc9, 0x21, 0xfd, 0x7c, 0xa0,
+ 0xee, 0x7a, 0x0c, 0x31, 0xd0, 0x22, 0xa9, 0x5e,
+ 0x1f, 0xc9, 0x2b, 0xa9, 0xd7, 0x7d, 0xf8, 0x83,
+ 0x96, 0x02, 0x75, 0xbe, 0xb4, 0xe6, 0x20, 0x24 }
+ },
+
+ { "data-50 key-20", /* Test 3 */
+ /* Test with a combined length of key and data that is larger
+ * than 64 bytes (= block-size of SHA-224 and SHA-256). */
+ "\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd"
+ "\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd"
+ "\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd\xdd"
+ "\xdd\xdd",
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa",
+
+ { 0x67, 0x6c, 0xfc, 0x7d, 0x16, 0x15, 0x36, 0x38,
+ 0x78, 0x03, 0x90, 0x69, 0x2b, 0xe1, 0x42, 0xd2,
+ 0xdf, 0x7c, 0xe9, 0x24, 0xb9, 0x09, 0xc0, 0xc0,
+ 0x8d, 0xbf, 0xdc, 0x1a },
+ { 0x84, 0xec, 0x79, 0x12, 0x4a, 0x27, 0x10, 0x78,
+ 0x65, 0xce, 0xdd, 0x8b, 0xd8, 0x2d, 0xa9, 0x96,
+ 0x5e, 0x5e, 0xd8, 0xc3, 0x7b, 0x0a, 0xc9, 0x80,
+ 0x05, 0xa7, 0xf3, 0x9e, 0xd5, 0x8a, 0x42, 0x07 },
+ { 0x27, 0x5c, 0xd0, 0xe6, 0x61, 0xbb, 0x8b, 0x15,
+ 0x1c, 0x64, 0xd2, 0x88, 0xf1, 0xf7, 0x82, 0xfb,
+ 0x91, 0xa8, 0xab, 0xd5, 0x68, 0x58, 0xd7, 0x2b,
+ 0xab, 0xb2, 0xd4, 0x76, 0xf0, 0x45, 0x83, 0x73,
+ 0xb4, 0x1b, 0x6a, 0xb5, 0xbf, 0x17, 0x4b, 0xec,
+ 0x42, 0x2e, 0x53, 0xfc, 0x31, 0x35, 0xac, 0x6e },
+ { 0x30, 0x9e, 0x99, 0xf9, 0xec, 0x07, 0x5e, 0xc6,
+ 0xc6, 0xd4, 0x75, 0xed, 0xa1, 0x18, 0x06, 0x87,
+ 0xfc, 0xf1, 0x53, 0x11, 0x95, 0x80, 0x2a, 0x99,
+ 0xb5, 0x67, 0x74, 0x49, 0xa8, 0x62, 0x51, 0x82,
+ 0x85, 0x1c, 0xb3, 0x32, 0xaf, 0xb6, 0xa8, 0x9c,
+ 0x41, 0x13, 0x25, 0xfb, 0xcb, 0xcd, 0x42, 0xaf,
+ 0xcb, 0x7b, 0x6e, 0x5a, 0xab, 0x7e, 0xa4, 0x2c,
+ 0x66, 0x0f, 0x97, 0xfd, 0x85, 0x84, 0xbf, 0x03 }
+ },
+
+ { "data-50 key-25", /* Test 4 */
+ /* Test with a combined length of key and data that is larger
+ * than 64 bytes (= block-size of SHA-224 and SHA-256). */
+ "\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd"
+ "\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd"
+ "\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd\xcd"
+ "\xcd\xcd",
+ "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10"
+ "\x11\x12\x13\x14\x15\x16\x17\x18\x19",
+
+ { 0xa9, 0xd7, 0x68, 0x5a, 0x19, 0xc4, 0xe0, 0xdb,
+ 0xd9, 0xdf, 0x25, 0x56, 0xcc, 0x8a, 0x7d, 0x2a,
+ 0x77, 0x33, 0xb6, 0x76, 0x25, 0xce, 0x59, 0x4c,
+ 0x78, 0x27, 0x0e, 0xeb },
+ { 0x57, 0x36, 0x6a, 0x45, 0xe2, 0x30, 0x53, 0x21,
+ 0xa4, 0xbc, 0x5a, 0xa5, 0xfe, 0x2e, 0xf8, 0xa9,
+ 0x21, 0xf6, 0xaf, 0x82, 0x73, 0xd7, 0xfe, 0x7b,
+ 0xe6, 0xcf, 0xed, 0xb3, 0xf0, 0xae, 0xa6, 0xd7 },
+ { 0x3a, 0x5d, 0x7a, 0x87, 0x97, 0x02, 0xc0, 0x86,
+ 0xbc, 0x96, 0xd1, 0xdd, 0x8a, 0xa1, 0x5d, 0x9c,
+ 0x46, 0x44, 0x6b, 0x95, 0x52, 0x13, 0x11, 0xc6,
+ 0x06, 0xfd, 0xc4, 0xe3, 0x08, 0xf4, 0xb9, 0x84,
+ 0xda, 0x2d, 0x0f, 0x94, 0x49, 0xb3, 0xba, 0x84,
+ 0x25, 0xec, 0x7f, 0xb8, 0xc3, 0x1b, 0xc1, 0x36 },
+ { 0xb2, 0x7e, 0xab, 0x1d, 0x6e, 0x8d, 0x87, 0x46,
+ 0x1c, 0x29, 0xf7, 0xf5, 0x73, 0x9d, 0xd5, 0x8e,
+ 0x98, 0xaa, 0x35, 0xf8, 0xe8, 0x23, 0xad, 0x38,
+ 0xc5, 0x49, 0x2a, 0x20, 0x88, 0xfa, 0x02, 0x81,
+ 0x99, 0x3b, 0xbf, 0xff, 0x9a, 0x0e, 0x9c, 0x6b,
+ 0xf1, 0x21, 0xae, 0x9e, 0xc9, 0xbb, 0x09, 0xd8,
+ 0x4a, 0x5e, 0xba, 0xc8, 0x17, 0x18, 0x2e, 0xa9,
+ 0x74, 0x67, 0x3f, 0xb1, 0x33, 0xca, 0x0d, 0x1d }
+ },
+
+ { "data-20 key-20 trunc", /* Test 5 */
+ /* Test with a truncation of output to 128 bits. */
+ "Test With Truncation",
+ "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c"
+ "\x0c\x0c\x0c\x0c",
+
+ { 0x49, 0xfd, 0xd3, 0xab, 0xd0, 0x05, 0xeb, 0xb8,
+ 0xae, 0x63, 0xfe, 0xa9, 0x46, 0xd1, 0x88, 0x3c },
+ { 0x6e, 0x02, 0xc6, 0x45, 0x37, 0xfb, 0x11, 0x80,
+ 0x57, 0xab, 0xb7, 0xfb, 0x66, 0xa2, 0x3b, 0x3c },
+ { 0x47, 0xc5, 0x1a, 0xce, 0x1f, 0xfa, 0xcf, 0xfd,
+ 0x74, 0x94, 0x72, 0x46, 0x82, 0x61, 0x57, 0x83 },
+ { 0x0f, 0xa7, 0x47, 0x59, 0x48, 0xf4, 0x3f, 0x48,
+ 0xca, 0x05, 0x16, 0x67, 0x1e, 0x18, 0x97, 0x8c },
+ 16
+ },
+
+ { "data-54 key-131", /* Test 6 */
+ /* Test with a key larger than 128 bytes (= block-size of
+ * SHA-384 and SHA-512). */
+ "Test Using Larger Than Block-Size Key - Hash Key First",
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa",
+
+ { 0xb4, 0xa1, 0xf0, 0x4c, 0x00, 0x28, 0x7a, 0x9b,
+ 0x7f, 0x60, 0x75, 0xb3, 0x13, 0xd2, 0x79, 0xb8,
+ 0x33, 0xbc, 0x8f, 0x75, 0x12, 0x43, 0x52, 0xd0,
+ 0x5f, 0xb9, 0x99, 0x5f },
+ { 0xed, 0x73, 0xa3, 0x74, 0xb9, 0x6c, 0x00, 0x52,
+ 0x35, 0xf9, 0x48, 0x03, 0x2f, 0x09, 0x67, 0x4a,
+ 0x58, 0xc0, 0xce, 0x55, 0x5c, 0xfc, 0x1f, 0x22,
+ 0x3b, 0x02, 0x35, 0x65, 0x60, 0x31, 0x2c, 0x3b },
+ { 0x0f, 0xc1, 0x95, 0x13, 0xbf, 0x6b, 0xd8, 0x78,
+ 0x03, 0x70, 0x16, 0x70, 0x6a, 0x0e, 0x57, 0xbc,
+ 0x52, 0x81, 0x39, 0x83, 0x6b, 0x9a, 0x42, 0xc3,
+ 0xd4, 0x19, 0xe4, 0x98, 0xe0, 0xe1, 0xfb, 0x96,
+ 0x16, 0xfd, 0x66, 0x91, 0x38, 0xd3, 0x3a, 0x11,
+ 0x05, 0xe0, 0x7c, 0x72, 0xb6, 0x95, 0x3b, 0xcc },
+ { 0x00, 0xf7, 0x51, 0xa9, 0xe5, 0x06, 0x95, 0xb0,
+ 0x90, 0xed, 0x69, 0x11, 0xa4, 0xb6, 0x55, 0x24,
+ 0x95, 0x1c, 0xdc, 0x15, 0xa7, 0x3a, 0x5d, 0x58,
+ 0xbb, 0x55, 0x21, 0x5e, 0xa2, 0xcd, 0x83, 0x9a,
+ 0xc7, 0x9d, 0x2b, 0x44, 0xa3, 0x9b, 0xaf, 0xab,
+ 0x27, 0xe8, 0x3f, 0xde, 0x9e, 0x11, 0xf6, 0x34,
+ 0x0b, 0x11, 0xd9, 0x91, 0xb1, 0xb9, 0x1b, 0xf2,
+ 0xee, 0xe7, 0xfc, 0x87, 0x24, 0x26, 0xc3, 0xa4 }
+ },
+
+ { "data-54 key-147", /* Test 6a */
+ /* Test with a key larger than 144 bytes (= block-size of
+ * SHA3-224). */
+ "Test Using Larger Than Block-Size Key - Hash Key First",
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa",
+
+ { 0xb9, 0x6d, 0x73, 0x0c, 0x14, 0x8c, 0x2d, 0xaa,
+ 0xd8, 0x64, 0x9d, 0x83, 0xde, 0xfa, 0xa3, 0x71,
+ 0x97, 0x38, 0xd3, 0x47, 0x75, 0x39, 0x7b, 0x75,
+ 0x71, 0xc3, 0x85, 0x15 },
+ { 0xa6, 0x07, 0x2f, 0x86, 0xde, 0x52, 0xb3, 0x8b,
+ 0xb3, 0x49, 0xfe, 0x84, 0xcd, 0x6d, 0x97, 0xfb,
+ 0x6a, 0x37, 0xc4, 0xc0, 0xf6, 0x2a, 0xae, 0x93,
+ 0x98, 0x11, 0x93, 0xa7, 0x22, 0x9d, 0x34, 0x67 },
+ { 0x71, 0x3d, 0xff, 0x03, 0x02, 0xc8, 0x50, 0x86,
+ 0xec, 0x5a, 0xd0, 0x76, 0x8d, 0xd6, 0x5a, 0x13,
+ 0xdd, 0xd7, 0x90, 0x68, 0xd8, 0xd4, 0xc6, 0x21,
+ 0x2b, 0x71, 0x2e, 0x41, 0x64, 0x94, 0x49, 0x11,
+ 0x14, 0x80, 0x23, 0x00, 0x44, 0x18, 0x5a, 0x99,
+ 0x10, 0x3e, 0xd8, 0x20, 0x04, 0xdd, 0xbf, 0xcc },
+ { 0xb1, 0x48, 0x35, 0xc8, 0x19, 0xa2, 0x90, 0xef,
+ 0xb0, 0x10, 0xac, 0xe6, 0xd8, 0x56, 0x8d, 0xc6,
+ 0xb8, 0x4d, 0xe6, 0x0b, 0xc4, 0x9b, 0x00, 0x4c,
+ 0x3b, 0x13, 0xed, 0xa7, 0x63, 0x58, 0x94, 0x51,
+ 0xe5, 0xdd, 0x74, 0x29, 0x28, 0x84, 0xd1, 0xbd,
+ 0xce, 0x64, 0xe6, 0xb9, 0x19, 0xdd, 0x61, 0xdc,
+ 0x9c, 0x56, 0xa2, 0x82, 0xa8, 0x1c, 0x0b, 0xd1,
+ 0x4f, 0x1f, 0x36, 0x5b, 0x49, 0xb8, 0x3a, 0x5b }
+ },
+
+ { "data-152 key-131", /* Test 7 */
+ /* Test with a key and data that is larger than 128 bytes (=
+ * block-size of SHA-384 and SHA-512). */
+ "This is a test using a larger than block-size key and a larger "
+ "than block-size data. The key needs to be hashed before being "
+ "used by the HMAC algorithm.",
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa",
+
+ { 0x05, 0xd8, 0xcd, 0x6d, 0x00, 0xfa, 0xea, 0x8d,
+ 0x1e, 0xb6, 0x8a, 0xde, 0x28, 0x73, 0x0b, 0xbd,
+ 0x3c, 0xba, 0xb6, 0x92, 0x9f, 0x0a, 0x08, 0x6b,
+ 0x29, 0xcd, 0x62, 0xa0 },
+ { 0x65, 0xc5, 0xb0, 0x6d, 0x4c, 0x3d, 0xe3, 0x2a,
+ 0x7a, 0xef, 0x87, 0x63, 0x26, 0x1e, 0x49, 0xad,
+ 0xb6, 0xe2, 0x29, 0x3e, 0xc8, 0xe7, 0xc6, 0x1e,
+ 0x8d, 0xe6, 0x17, 0x01, 0xfc, 0x63, 0xe1, 0x23 },
+ { 0x02, 0x6f, 0xdf, 0x6b, 0x50, 0x74, 0x1e, 0x37,
+ 0x38, 0x99, 0xc9, 0xf7, 0xd5, 0x40, 0x6d, 0x4e,
+ 0xb0, 0x9f, 0xc6, 0x66, 0x56, 0x36, 0xfc, 0x1a,
+ 0x53, 0x00, 0x29, 0xdd, 0xf5, 0xcf, 0x3c, 0xa5,
+ 0xa9, 0x00, 0xed, 0xce, 0x01, 0xf5, 0xf6, 0x1e,
+ 0x2f, 0x40, 0x8c, 0xdf, 0x2f, 0xd3, 0xe7, 0xe8 },
+ { 0x38, 0xa4, 0x56, 0xa0, 0x04, 0xbd, 0x10, 0xd3,
+ 0x2c, 0x9a, 0xb8, 0x33, 0x66, 0x84, 0x11, 0x28,
+ 0x62, 0xc3, 0xdb, 0x61, 0xad, 0xcc, 0xa3, 0x18,
+ 0x29, 0x35, 0x5e, 0xaf, 0x46, 0xfd, 0x5c, 0x73,
+ 0xd0, 0x6a, 0x1f, 0x0d, 0x13, 0xfe, 0xc9, 0xa6,
+ 0x52, 0xfb, 0x38, 0x11, 0xb5, 0x77, 0xb1, 0xb1,
+ 0xd1, 0xb9, 0x78, 0x9f, 0x97, 0xae, 0x5b, 0x83,
+ 0xc6, 0xf4, 0x4d, 0xfc, 0xf1, 0xd6, 0x7e, 0xba }
+ },
+
+ { "data-152 key-147", /* Test 7a */
+ /* Test with a key larger than 144 bytes (= block-size of
+ * SHA3-224). */
+ "This is a test using a larger than block-size key and a larger "
+ "than block-size data. The key needs to be hashed before being "
+ "used by the HMAC algorithm.",
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+ "\xaa\xaa\xaa",
+
+ { 0xc7, 0x9c, 0x9b, 0x09, 0x34, 0x24, 0xe5, 0x88,
+ 0xa9, 0x87, 0x8b, 0xbc, 0xb0, 0x89, 0xe0, 0x18,
+ 0x27, 0x00, 0x96, 0xe9, 0xb4, 0xb1, 0xa9, 0xe8,
+ 0x22, 0x0c, 0x86, 0x6a },
+ { 0xe6, 0xa3, 0x6d, 0x9b, 0x91, 0x5f, 0x86, 0xa0,
+ 0x93, 0xca, 0xc7, 0xd1, 0x10, 0xe9, 0xe0, 0x4c,
+ 0xf1, 0xd6, 0x10, 0x0d, 0x30, 0x47, 0x55, 0x09,
+ 0xc2, 0x47, 0x5f, 0x57, 0x1b, 0x75, 0x8b, 0x5a },
+ { 0xca, 0xd1, 0x8a, 0x8f, 0xf6, 0xc4, 0xcc, 0x3a,
+ 0xd4, 0x87, 0xb9, 0x5f, 0x97, 0x69, 0xe9, 0xb6,
+ 0x1c, 0x06, 0x2a, 0xef, 0xd6, 0x95, 0x25, 0x69,
+ 0xe6, 0xe6, 0x42, 0x18, 0x97, 0x05, 0x4c, 0xfc,
+ 0x70, 0xb5, 0xfd, 0xc6, 0x60, 0x5c, 0x18, 0x45,
+ 0x71, 0x12, 0xfc, 0x6a, 0xaa, 0xd4, 0x55, 0x85 },
+ { 0xdc, 0x03, 0x0e, 0xe7, 0x88, 0x70, 0x34, 0xf3,
+ 0x2c, 0xf4, 0x02, 0xdf, 0x34, 0x62, 0x2f, 0x31,
+ 0x1f, 0x3e, 0x6c, 0xf0, 0x48, 0x60, 0xc6, 0xbb,
+ 0xd7, 0xfa, 0x48, 0x86, 0x74, 0x78, 0x2b, 0x46,
+ 0x59, 0xfd, 0xbd, 0xf3, 0xfd, 0x87, 0x78, 0x52,
+ 0x88, 0x5c, 0xfe, 0x6e, 0x22, 0x18, 0x5f, 0xe7,
+ 0xb2, 0xee, 0x95, 0x20, 0x43, 0x62, 0x9b, 0xc9,
+ 0xd5, 0xf3, 0x29, 0x8a, 0x41, 0xd0, 0x2c, 0x66 }
+ }/*,*/
+
+      /* Our API does not allow specifying a bit count and thus we
+       * can't use the following test. */
+ /* { "data-5bit key-4", /\* Test 8 *\/ */
+ /* /\* Test with data bit size no multiple of 8, the data bits are */
+ /* * '11001' from the NIST example using SHA-3 order (= 5 bits */
+ /* * from LSB hex byte 13 or 5 bits from MSB hex byte c8). *\/ */
+ /* "\xc8", */
+ /* "Jefe", */
+
+ /* { 0x5f, 0x8c, 0x0e, 0xa7, 0xfa, 0xfe, 0xcd, 0x0c, */
+ /* 0x34, 0x63, 0xaa, 0xd0, 0x97, 0x42, 0xce, 0xce, */
+ /* 0xb1, 0x42, 0xfe, 0x0a, 0xb6, 0xf4, 0x53, 0x94, */
+ /* 0x38, 0xc5, 0x9d, 0xe8 }, */
+ /* { 0xec, 0x82, 0x22, 0x77, 0x3f, 0xac, 0x68, 0xb3, */
+ /* 0xd3, 0xdc, 0xb1, 0x82, 0xae, 0xc8, 0xb0, 0x50, */
+ /* 0x7a, 0xce, 0x44, 0x48, 0xd2, 0x0a, 0x11, 0x47, */
+ /* 0xe6, 0x82, 0x11, 0x8d, 0xa4, 0xe3, 0xf4, 0x4c }, */
+ /* { 0x21, 0xfb, 0xd3, 0xbf, 0x3e, 0xbb, 0xa3, 0xcf, */
+ /* 0xc9, 0xef, 0x64, 0xc0, 0x59, 0x1c, 0x92, 0xc5, */
+ /* 0xac, 0xb2, 0x65, 0xe9, 0x2d, 0x87, 0x61, 0xd1, */
+ /* 0xf9, 0x1a, 0x52, 0xa1, 0x03, 0xa6, 0xc7, 0x96, */
+ /* 0x94, 0xcf, 0xd6, 0x7a, 0x9a, 0x2a, 0xc1, 0x32, */
+ /* 0x4f, 0x02, 0xfe, 0xa6, 0x3b, 0x81, 0xef, 0xfc }, */
+ /* { 0x27, 0xf9, 0x38, 0x8c, 0x15, 0x67, 0xef, 0x4e, */
+ /* 0xf2, 0x00, 0x60, 0x2a, 0x6c, 0xf8, 0x71, 0xd6, */
+ /* 0x8a, 0x6f, 0xb0, 0x48, 0xd4, 0x73, 0x7a, 0xc4, */
+ /* 0x41, 0x8a, 0x2f, 0x02, 0x12, 0x89, 0xd1, 0x3d, */
+ /* 0x1f, 0xd1, 0x12, 0x0f, 0xec, 0xb9, 0xcf, 0x96, */
+ /* 0x4c, 0x5b, 0x11, 0x7a, 0xb5, 0xb1, 0x1c, 0x61, */
+ /* 0x4b, 0x2d, 0xa3, 0x9d, 0xad, 0xd5, 0x1f, 0x2f, */
+ /* 0x5e, 0x22, 0xaa, 0xcc, 0xec, 0x7d, 0x57, 0x6e } */
+ /* } */
+
+ };
+ const char *what;
+ const char *errtxt;
+ int tvidx;
+ const char *expect;
+ int nexpect;
+
+ for (tvidx=0; tvidx < DIM(tv); tvidx++)
+ {
+ what = tv[tvidx].desc;
+ if (hashalgo == GCRY_MD_SHA3_224)
+ {
+ expect = tv[tvidx].expect_224;
+ nexpect = DIM (tv[tvidx].expect_224);
+ }
+ else if (hashalgo == GCRY_MD_SHA3_256)
+ {
+ expect = tv[tvidx].expect_256;
+ nexpect = DIM (tv[tvidx].expect_256);
+ }
+ else if (hashalgo == GCRY_MD_SHA3_384)
+ {
+ expect = tv[tvidx].expect_384;
+ nexpect = DIM (tv[tvidx].expect_384);
+ }
+ else if (hashalgo == GCRY_MD_SHA3_512)
+ {
+ expect = tv[tvidx].expect_512;
+ nexpect = DIM (tv[tvidx].expect_512);
+ }
+ else
+ BUG();
+
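+      /* Honor the truncation length when the test vector specifies one. */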
+ if (tv[tvidx].trunc && tv[tvidx].trunc < nexpect)
+ nexpect = tv[tvidx].trunc;
+
+ errtxt = check_one (hashalgo,
+ tv[tvidx].data, strlen (tv[tvidx].data),
+ tv[tvidx].key, strlen (tv[tvidx].key),
+ expect, nexpect, !!tv[tvidx].trunc);
+ if (errtxt)
+ goto failed;
+ if (!extended)
+ break;
+ }
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("hmac", hashalgo, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+static gpg_err_code_t
+hmac_selftest (int algo, int extended, selftest_report_func_t report)
+{
+ gpg_err_code_t ec;
+
+ switch (algo)
+ {
+ case GCRY_MAC_HMAC_SHA1:
+ ec = selftests_sha1 (extended, report);
+ break;
+ case GCRY_MAC_HMAC_SHA224:
+ ec = selftests_sha224 (extended, report);
+ break;
+ case GCRY_MAC_HMAC_SHA256:
+ ec = selftests_sha256 (extended, report);
+ break;
+ case GCRY_MAC_HMAC_SHA384:
+ ec = selftests_sha384 (extended, report);
+ break;
+ case GCRY_MAC_HMAC_SHA512:
+ ec = selftests_sha512 (extended, report);
+ break;
+
+ case GCRY_MAC_HMAC_SHA3_224:
+ case GCRY_MAC_HMAC_SHA3_256:
+ case GCRY_MAC_HMAC_SHA3_384:
+ case GCRY_MAC_HMAC_SHA3_512:
+ {
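+        /* The SHA-3 based HMACs share one selftest routine which is
+         * parameterized by the underlying digest algorithm. */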
+ int md_algo = map_mac_algo_to_md (algo);
+ ec = selftests_sha3 (md_algo, extended, report);
+ }
+ break;
+
+ default:
+ ec = GPG_ERR_MAC_ALGO;
+ break;
+ }
+
+ return ec;
+}
+
+
+static const gcry_mac_spec_ops_t hmac_ops = {
+ hmac_open,
+ hmac_close,
+ hmac_setkey,
+ NULL,
+ hmac_reset,
+ hmac_write,
+ hmac_read,
+ hmac_verify,
+ hmac_get_maclen,
+ hmac_get_keylen,
+ NULL,
+ hmac_selftest
+};
+
+
+#if USE_SHA1
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha1 = {
+ GCRY_MAC_HMAC_SHA1, {0, 1}, "HMAC_SHA1",
+ &hmac_ops
+};
+#endif
+#if USE_SHA256
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha256 = {
+ GCRY_MAC_HMAC_SHA256, {0, 1}, "HMAC_SHA256",
+ &hmac_ops
+};
+
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha224 = {
+ GCRY_MAC_HMAC_SHA224, {0, 1}, "HMAC_SHA224",
+ &hmac_ops
+};
+#endif
+#if USE_SHA512
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha512 = {
+ GCRY_MAC_HMAC_SHA512, {0, 1}, "HMAC_SHA512",
+ &hmac_ops
+};
+
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha384 = {
+ GCRY_MAC_HMAC_SHA384, {0, 1}, "HMAC_SHA384",
+ &hmac_ops
+};
+
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha512_256 = {
+ GCRY_MAC_HMAC_SHA512_256, {0, 1}, "HMAC_SHA512_256",
+ &hmac_ops
+};
+
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha512_224 = {
+ GCRY_MAC_HMAC_SHA512_224, {0, 1}, "HMAC_SHA512_224",
+ &hmac_ops
+};
+
+#endif
+#if USE_SHA3
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha3_224 = {
+ GCRY_MAC_HMAC_SHA3_224, {0, 1}, "HMAC_SHA3_224",
+ &hmac_ops
+};
+
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha3_256 = {
+ GCRY_MAC_HMAC_SHA3_256, {0, 1}, "HMAC_SHA3_256",
+ &hmac_ops
+};
+
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha3_384 = {
+ GCRY_MAC_HMAC_SHA3_384, {0, 1}, "HMAC_SHA3_384",
+ &hmac_ops
+};
+
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha3_512 = {
+ GCRY_MAC_HMAC_SHA3_512, {0, 1}, "HMAC_SHA3_512",
+ &hmac_ops
+};
+#endif
+#ifdef USE_GOST_R_3411_94
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_gost3411_94 = {
+ GCRY_MAC_HMAC_GOSTR3411_94, {0, 0}, "HMAC_GOSTR3411_94",
+ &hmac_ops
+};
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_gost3411_cp = {
+ GCRY_MAC_HMAC_GOSTR3411_CP, {0, 0}, "HMAC_GOSTR3411_CP",
+ &hmac_ops
+};
+#endif
+#ifdef USE_GOST_R_3411_12
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_stribog256 = {
+ GCRY_MAC_HMAC_STRIBOG256, {0, 0}, "HMAC_STRIBOG256",
+ &hmac_ops
+};
+
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_stribog512 = {
+ GCRY_MAC_HMAC_STRIBOG512, {0, 0}, "HMAC_STRIBOG512",
+ &hmac_ops
+};
+#endif
+#if USE_WHIRLPOOL
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_whirlpool = {
+ GCRY_MAC_HMAC_WHIRLPOOL, {0, 0}, "HMAC_WHIRLPOOL",
+ &hmac_ops
+};
+#endif
+#if USE_RMD160
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_rmd160 = {
+ GCRY_MAC_HMAC_RMD160, {0, 0}, "HMAC_RIPEMD160",
+ &hmac_ops
+};
+#endif
+#if USE_TIGER
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_tiger1 = {
+ GCRY_MAC_HMAC_TIGER1, {0, 0}, "HMAC_TIGER",
+ &hmac_ops
+};
+#endif
+#if USE_MD5
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_md5 = {
+ GCRY_MAC_HMAC_MD5, {0, 0}, "HMAC_MD5",
+ &hmac_ops
+};
+#endif
+#if USE_MD4
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_md4 = {
+ GCRY_MAC_HMAC_MD4, {0, 0}, "HMAC_MD4",
+ &hmac_ops
+};
+#endif
+#if USE_MD2
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_md2 = {
+ GCRY_MAC_HMAC_MD2, {0, 0}, "HMAC_MD2",
+ &hmac_ops
+};
+#endif
+#if USE_BLAKE2
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_blake2b_512 = {
+ GCRY_MAC_HMAC_BLAKE2B_512, {0, 0}, "HMAC_BLAKE2B_512",
+ &hmac_ops
+};
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_blake2b_384 = {
+ GCRY_MAC_HMAC_BLAKE2B_384, {0, 0}, "HMAC_BLAKE2B_384",
+ &hmac_ops
+};
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_blake2b_256 = {
+ GCRY_MAC_HMAC_BLAKE2B_256, {0, 0}, "HMAC_BLAKE2B_256",
+ &hmac_ops
+};
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_blake2b_160 = {
+ GCRY_MAC_HMAC_BLAKE2B_160, {0, 0}, "HMAC_BLAKE2B_160",
+ &hmac_ops
+};
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_blake2s_256 = {
+ GCRY_MAC_HMAC_BLAKE2S_256, {0, 0}, "HMAC_BLAKE2S_256",
+ &hmac_ops
+};
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_blake2s_224 = {
+ GCRY_MAC_HMAC_BLAKE2S_224, {0, 0}, "HMAC_BLAKE2S_224",
+ &hmac_ops
+};
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_blake2s_160 = {
+ GCRY_MAC_HMAC_BLAKE2S_160, {0, 0}, "HMAC_BLAKE2S_160",
+ &hmac_ops
+};
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_blake2s_128 = {
+ GCRY_MAC_HMAC_BLAKE2S_128, {0, 0}, "HMAC_BLAKE2S_128",
+ &hmac_ops
+};
+#endif
+#if USE_SM3
+gcry_mac_spec_t _gcry_mac_type_spec_hmac_sm3 = {
+ GCRY_MAC_HMAC_SM3, {0, 0}, "HMAC_SM3",
+ &hmac_ops
+};
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/mac-internal.h b/comm/third_party/libgcrypt/cipher/mac-internal.h
new file mode 100644
index 0000000000..e49885beec
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/mac-internal.h
@@ -0,0 +1,275 @@
+/* mac-internal.h - Internal defs for mac.c
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#include "g10lib.h"
+#include "cipher-proto.h"
+#include "gost.h"
+
+
+/* The data object used to hold a handle to a MAC object. */
+struct gcry_mac_handle;
+
+/* The data object used to hold a Poly1305-MAC context. */
+struct poly1305mac_context_s;
+
+
+/*
+ *
+ * Message authentication code related definitions.
+ *
+ */
+
+
+/* Magic values for the context structure. */
+#define CTX_MAC_MAGIC_NORMAL 0x59d9b8af
+#define CTX_MAC_MAGIC_SECURE 0x12c27cd0
+
+
+/* MAC module functions. */
+typedef gcry_err_code_t (*gcry_mac_open_func_t)(gcry_mac_hd_t h);
+typedef void (*gcry_mac_close_func_t)(gcry_mac_hd_t h);
+typedef gcry_err_code_t (*gcry_mac_setkey_func_t)(gcry_mac_hd_t h,
+ const unsigned char *key,
+ size_t keylen);
+typedef gcry_err_code_t (*gcry_mac_setiv_func_t)(gcry_mac_hd_t h,
+ const unsigned char *iv,
+ size_t ivlen);
+typedef gcry_err_code_t (*gcry_mac_reset_func_t)(gcry_mac_hd_t h);
+typedef gcry_err_code_t (*gcry_mac_write_func_t)(gcry_mac_hd_t h,
+ const unsigned char *inbuf,
+ size_t inlen);
+typedef gcry_err_code_t (*gcry_mac_read_func_t)(gcry_mac_hd_t h,
+ unsigned char *outbuf,
+ size_t *outlen);
+typedef gcry_err_code_t (*gcry_mac_verify_func_t)(gcry_mac_hd_t h,
+ const unsigned char *inbuf,
+ size_t inlen);
+typedef unsigned int (*gcry_mac_get_maclen_func_t)(int algo);
+typedef unsigned int (*gcry_mac_get_keylen_func_t)(int algo);
+
+/* The type used to convey additional information to a MAC. */
+typedef gpg_err_code_t (*gcry_mac_set_extra_info_t)
+ (gcry_mac_hd_t h, int what, const void *buffer, size_t buflen);
+
+typedef struct gcry_mac_spec_ops
+{
+ gcry_mac_open_func_t open;
+ gcry_mac_close_func_t close;
+ gcry_mac_setkey_func_t setkey;
+ gcry_mac_setiv_func_t setiv;
+ gcry_mac_reset_func_t reset;
+ gcry_mac_write_func_t write;
+ gcry_mac_read_func_t read;
+ gcry_mac_verify_func_t verify;
+ gcry_mac_get_maclen_func_t get_maclen;
+ gcry_mac_get_keylen_func_t get_keylen;
+ gcry_mac_set_extra_info_t set_extra_info;
+ selftest_func_t selftest;
+} gcry_mac_spec_ops_t;
+
+
+/* Module specification structure for message authentication codes. */
+typedef struct gcry_mac_spec
+{
+ int algo;
+ struct {
+ unsigned int disabled:1;
+ unsigned int fips:1;
+ } flags;
+ const char *name;
+ const gcry_mac_spec_ops_t *ops;
+} gcry_mac_spec_t;
+
+/* The handle structure. */
+struct gcry_mac_handle
+{
+ int magic;
+ int algo;
+ const gcry_mac_spec_t *spec;
+ gcry_ctx_t gcry_ctx;
+ union {
+ struct {
+ gcry_md_hd_t md_ctx;
+ int md_algo;
+ } hmac;
+ struct {
+ gcry_cipher_hd_t ctx;
+ int cipher_algo;
+ unsigned int blklen;
+ } cmac;
+ struct {
+ gcry_cipher_hd_t ctx;
+ int cipher_algo;
+ } gmac;
+ struct {
+ struct poly1305mac_context_s *ctx;
+ } poly1305mac;
+ struct {
+ GOST28147_context ctx;
+ u32 n1, n2;
+ unsigned int unused;
+ unsigned int count;
+ unsigned char lastiv[8]; /* IMIT blocksize */
+ } imit;
+ } u;
+};
+
+
+/*
+ * The HMAC algorithm specifications (mac-hmac.c).
+ */
+#if USE_SHA1
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha1;
+#endif
+#if USE_SHA256
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha256;
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha224;
+#endif
+#if USE_SHA512
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha512;
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha384;
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha512_224;
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha512_256;
+#endif
+#if USE_SHA3
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha3_224;
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha3_256;
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha3_384;
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha3_512;
+#endif
+#ifdef USE_GOST_R_3411_94
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_gost3411_94;
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_gost3411_cp;
+#endif
+#ifdef USE_GOST_R_3411_12
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_stribog256;
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_stribog512;
+#endif
+#if USE_WHIRLPOOL
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_whirlpool;
+#endif
+#if USE_RMD160
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_rmd160;
+#endif
+#if USE_TIGER
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_tiger1;
+#endif
+#if USE_MD5
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_md5;
+#endif
+#if USE_MD4
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_md4;
+#endif
+#if USE_BLAKE2
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_blake2b_512;
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_blake2b_384;
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_blake2b_256;
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_blake2b_160;
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_blake2s_256;
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_blake2s_224;
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_blake2s_160;
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_blake2s_128;
+#endif
+#if USE_SM3
+extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sm3;
+#endif
+
+/*
+ * The CMAC algorithm specifications (mac-cmac.c).
+ */
+#if USE_BLOWFISH
+extern gcry_mac_spec_t _gcry_mac_type_spec_cmac_blowfish;
+#endif
+#if USE_DES
+extern gcry_mac_spec_t _gcry_mac_type_spec_cmac_tripledes;
+#endif
+#if USE_CAST5
+extern gcry_mac_spec_t _gcry_mac_type_spec_cmac_cast5;
+#endif
+#if USE_AES
+extern gcry_mac_spec_t _gcry_mac_type_spec_cmac_aes;
+#endif
+#if USE_TWOFISH
+extern gcry_mac_spec_t _gcry_mac_type_spec_cmac_twofish;
+#endif
+#if USE_SERPENT
+extern gcry_mac_spec_t _gcry_mac_type_spec_cmac_serpent;
+#endif
+#if USE_RFC2268
+extern gcry_mac_spec_t _gcry_mac_type_spec_cmac_rfc2268;
+#endif
+#if USE_SEED
+extern gcry_mac_spec_t _gcry_mac_type_spec_cmac_seed;
+#endif
+#if USE_CAMELLIA
+extern gcry_mac_spec_t _gcry_mac_type_spec_cmac_camellia;
+#endif
+#ifdef USE_IDEA
+extern gcry_mac_spec_t _gcry_mac_type_spec_cmac_idea;
+#endif
+#if USE_GOST28147
+extern gcry_mac_spec_t _gcry_mac_type_spec_cmac_gost28147;
+#endif
+#if USE_GOST28147
+extern gcry_mac_spec_t _gcry_mac_type_spec_gost28147_imit;
+#endif
+#if USE_SM4
+extern gcry_mac_spec_t _gcry_mac_type_spec_cmac_sm4;
+#endif
+
+/*
+ * The GMAC algorithm specifications (mac-gmac.c).
+ */
+#if USE_AES
+extern gcry_mac_spec_t _gcry_mac_type_spec_gmac_aes;
+#endif
+#if USE_TWOFISH
+extern gcry_mac_spec_t _gcry_mac_type_spec_gmac_twofish;
+#endif
+#if USE_SERPENT
+extern gcry_mac_spec_t _gcry_mac_type_spec_gmac_serpent;
+#endif
+#if USE_SEED
+extern gcry_mac_spec_t _gcry_mac_type_spec_gmac_seed;
+#endif
+#if USE_CAMELLIA
+extern gcry_mac_spec_t _gcry_mac_type_spec_gmac_camellia;
+#endif
+
+/*
+ * The Poly1305 MAC algorithm specifications (mac-poly1305.c).
+ */
+extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac;
+#if USE_AES
+extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_aes;
+#endif
+#if USE_CAMELLIA
+extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_camellia;
+#endif
+#if USE_TWOFISH
+extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_twofish;
+#endif
+#if USE_SERPENT
+extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_serpent;
+#endif
+#if USE_SEED
+extern gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_seed;
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/mac-poly1305.c b/comm/third_party/libgcrypt/cipher/mac-poly1305.c
new file mode 100644
index 0000000000..46ea735f89
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/mac-poly1305.c
@@ -0,0 +1,364 @@
+/* mac-poly1305.c - Poly1305 based MACs
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "mac-internal.h"
+#include "poly1305-internal.h"
+
+
+struct poly1305mac_context_s {
+ poly1305_context_t ctx;
+ gcry_cipher_hd_t hd;
+ struct {
+ unsigned int key_set:1;
+ unsigned int nonce_set:1;
+ unsigned int tag:1;
+ } marks;
+ byte tag[POLY1305_TAGLEN];
+ byte key[POLY1305_KEYLEN];
+};
+
+
+static gcry_err_code_t
+poly1305mac_open (gcry_mac_hd_t h)
+{
+ struct poly1305mac_context_s *mac_ctx;
+ int secure = (h->magic == CTX_MAC_MAGIC_SECURE);
+ unsigned int flags = (secure ? GCRY_CIPHER_SECURE : 0);
+ gcry_err_code_t err;
+ int cipher_algo;
+
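+  /* Allocate the context from secure memory when the handle was opened
+   * as a secure one. */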
+ if (secure)
+ mac_ctx = xtrycalloc_secure (1, sizeof(*mac_ctx));
+ else
+ mac_ctx = xtrycalloc (1, sizeof(*mac_ctx));
+
+ if (!mac_ctx)
+ return gpg_err_code_from_syserror ();
+
+ h->u.poly1305mac.ctx = mac_ctx;
+
+ switch (h->spec->algo)
+ {
+ default:
+ /* already checked. */
+ case GCRY_MAC_POLY1305:
+ /* plain Poly1305. */
+ cipher_algo = -1;
+ return 0;
+ case GCRY_MAC_POLY1305_AES:
+ cipher_algo = GCRY_CIPHER_AES;
+ break;
+ case GCRY_MAC_POLY1305_CAMELLIA:
+ cipher_algo = GCRY_CIPHER_CAMELLIA128;
+ break;
+ case GCRY_MAC_POLY1305_TWOFISH:
+ cipher_algo = GCRY_CIPHER_TWOFISH;
+ break;
+ case GCRY_MAC_POLY1305_SERPENT:
+ cipher_algo = GCRY_CIPHER_SERPENT128;
+ break;
+ case GCRY_MAC_POLY1305_SEED:
+ cipher_algo = GCRY_CIPHER_SEED;
+ break;
+ }
+
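+  /* The block cipher runs in ECB mode; it is only used to encrypt the
+   * nonce when deriving the second half of the Poly1305 key. */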
+ err = _gcry_cipher_open_internal (&mac_ctx->hd, cipher_algo,
+ GCRY_CIPHER_MODE_ECB, flags);
+ if (err)
+ goto err_free;
+
+ return 0;
+
+err_free:
+ xfree(h->u.poly1305mac.ctx);
+ return err;
+}
+
+
+static void
+poly1305mac_close (gcry_mac_hd_t h)
+{
+ struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx;
+
+ if (h->spec->algo != GCRY_MAC_POLY1305)
+ _gcry_cipher_close (mac_ctx->hd);
+
+ xfree(mac_ctx);
+}
+
+
+static gcry_err_code_t
+poly1305mac_prepare_key (gcry_mac_hd_t h, const unsigned char *key, size_t keylen)
+{
+ struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx;
+ size_t block_keylen = keylen - 16;
+
+ /* Need at least 16 + 1 byte key. */
+ if (keylen <= 16)
+ return GPG_ERR_INV_KEYLEN;
+
+  /* For Poly1305-AES and the other block-cipher variants, the last 16
+   * bytes of the supplied key form the first half of the Poly1305 key
+   * and are used as is. */
+  memcpy (mac_ctx->key, key + block_keylen, 16);
+
+  /* The leading part is used as the key for the block cipher. */
+  return _gcry_cipher_setkey (mac_ctx->hd, key, block_keylen);
+}
+
+
+static gcry_err_code_t
+poly1305mac_setkey (gcry_mac_hd_t h, const unsigned char *key, size_t keylen)
+{
+ struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx;
+ gcry_err_code_t err;
+
+ memset(&mac_ctx->ctx, 0, sizeof(mac_ctx->ctx));
+ memset(&mac_ctx->tag, 0, sizeof(mac_ctx->tag));
+ memset(&mac_ctx->key, 0, sizeof(mac_ctx->key));
+
+ mac_ctx->marks.key_set = 0;
+ mac_ctx->marks.nonce_set = 0;
+ mac_ctx->marks.tag = 0;
+
+ if (h->spec->algo != GCRY_MAC_POLY1305)
+ {
+ err = poly1305mac_prepare_key (h, key, keylen);
+ if (err)
+ return err;
+
+      /* Poly1305-AES and the other cipher variants also need a nonce. */
+ mac_ctx->marks.key_set = 1;
+ mac_ctx->marks.nonce_set = 0;
+ }
+ else
+ {
+      /* For plain Poly1305, the 32-byte key is used directly and no
+       * separate nonce is needed; setup is complete now. */
+
+ if (keylen != POLY1305_KEYLEN)
+ return GPG_ERR_INV_KEYLEN;
+
+ memcpy (mac_ctx->key, key, keylen);
+
+ err = _gcry_poly1305_init (&mac_ctx->ctx, mac_ctx->key, POLY1305_KEYLEN);
+ if (err)
+ {
+ memset(&mac_ctx->key, 0, sizeof(mac_ctx->key));
+ return err;
+ }
+
+ mac_ctx->marks.key_set = 1;
+ mac_ctx->marks.nonce_set = 1;
+ }
+
+ return 0;
+}
+
+
+static gcry_err_code_t
+poly1305mac_setiv (gcry_mac_hd_t h, const unsigned char *iv, size_t ivlen)
+{
+ struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx;
+ gcry_err_code_t err;
+
+ if (h->spec->algo == GCRY_MAC_POLY1305)
+ return GPG_ERR_INV_ARG;
+
+ if (ivlen != 16)
+ return GPG_ERR_INV_ARG;
+
+ if (!mac_ctx->marks.key_set)
+ return 0;
+
+ memset(&mac_ctx->ctx, 0, sizeof(mac_ctx->ctx));
+ memset(&mac_ctx->tag, 0, sizeof(mac_ctx->tag));
+ mac_ctx->marks.nonce_set = 0;
+ mac_ctx->marks.tag = 0;
+
+  /* Prepare the second part of the Poly1305 key by encrypting the nonce
+   * with the block cipher. */
+
+ err = _gcry_cipher_encrypt (mac_ctx->hd, mac_ctx->key + 16, 16, iv, 16);
+ if (err)
+ return err;
+
+ err = _gcry_poly1305_init (&mac_ctx->ctx, mac_ctx->key, POLY1305_KEYLEN);
+ if (err)
+ return err;
+
+ mac_ctx->marks.nonce_set = 1;
+ return 0;
+}
+
+
+static gcry_err_code_t
+poly1305mac_reset (gcry_mac_hd_t h)
+{
+ struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx;
+
+ if (!mac_ctx->marks.key_set || !mac_ctx->marks.nonce_set)
+ return GPG_ERR_INV_STATE;
+
+ memset(&mac_ctx->ctx, 0, sizeof(mac_ctx->ctx));
+ memset(&mac_ctx->tag, 0, sizeof(mac_ctx->tag));
+
+ mac_ctx->marks.key_set = 1;
+ mac_ctx->marks.nonce_set = 1;
+ mac_ctx->marks.tag = 0;
+
+ return _gcry_poly1305_init (&mac_ctx->ctx, mac_ctx->key, POLY1305_KEYLEN);
+}
+
+
+static gcry_err_code_t
+poly1305mac_write (gcry_mac_hd_t h, const unsigned char *buf, size_t buflen)
+{
+ struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx;
+
+ if (!mac_ctx->marks.key_set || !mac_ctx->marks.nonce_set ||
+ mac_ctx->marks.tag)
+ return GPG_ERR_INV_STATE;
+
+ _gcry_poly1305_update (&mac_ctx->ctx, buf, buflen);
+ return 0;
+}
+
+
+static gcry_err_code_t
+poly1305mac_read (gcry_mac_hd_t h, unsigned char *outbuf, size_t *outlen)
+{
+ struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx;
+
+ if (!mac_ctx->marks.key_set || !mac_ctx->marks.nonce_set)
+ return GPG_ERR_INV_STATE;
+
+ if (!mac_ctx->marks.tag)
+ {
+ _gcry_poly1305_finish(&mac_ctx->ctx, mac_ctx->tag);
+
+ memset(&mac_ctx->ctx, 0, sizeof(mac_ctx->ctx));
+ mac_ctx->marks.tag = 1;
+ }
+
+ if (*outlen == 0)
+ return 0;
+
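+  /* Allow the caller to request a truncated tag. */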
+ if (*outlen <= POLY1305_TAGLEN)
+ buf_cpy (outbuf, mac_ctx->tag, *outlen);
+ else
+ {
+ buf_cpy (outbuf, mac_ctx->tag, POLY1305_TAGLEN);
+ *outlen = POLY1305_TAGLEN;
+ }
+
+ return 0;
+}
+
+
+static gcry_err_code_t
+poly1305mac_verify (gcry_mac_hd_t h, const unsigned char *buf, size_t buflen)
+{
+ struct poly1305mac_context_s *mac_ctx = h->u.poly1305mac.ctx;
+ gcry_err_code_t err;
+ size_t outlen = 0;
+
+ /* Check and finalize tag. */
+ err = poly1305mac_read(h, NULL, &outlen);
+ if (err)
+ return err;
+
+ if (buflen > POLY1305_TAGLEN)
+ return GPG_ERR_INV_LENGTH;
+
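+  /* Compare against the computed tag in constant time. */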
+ return buf_eq_const (buf, mac_ctx->tag, buflen) ? 0 : GPG_ERR_CHECKSUM;
+}
+
+
+static unsigned int
+poly1305mac_get_maclen (int algo)
+{
+ (void)algo;
+
+ return POLY1305_TAGLEN;
+}
+
+
+static unsigned int
+poly1305mac_get_keylen (int algo)
+{
+ (void)algo;
+
+ return POLY1305_KEYLEN;
+}
+
+
+static gcry_mac_spec_ops_t poly1305mac_ops = {
+ poly1305mac_open,
+ poly1305mac_close,
+ poly1305mac_setkey,
+ poly1305mac_setiv,
+ poly1305mac_reset,
+ poly1305mac_write,
+ poly1305mac_read,
+ poly1305mac_verify,
+ poly1305mac_get_maclen,
+ poly1305mac_get_keylen,
+ NULL,
+ NULL,
+};
+
+
+gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac = {
+ GCRY_MAC_POLY1305, {0, 0}, "POLY1305",
+ &poly1305mac_ops
+};
+#if USE_AES
+gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_aes = {
+ GCRY_MAC_POLY1305_AES, {0, 0}, "POLY1305_AES",
+ &poly1305mac_ops
+};
+#endif
+#if USE_CAMELLIA
+gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_camellia = {
+ GCRY_MAC_POLY1305_CAMELLIA, {0, 0}, "POLY1305_CAMELLIA",
+ &poly1305mac_ops
+};
+#endif
+#if USE_TWOFISH
+gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_twofish = {
+ GCRY_MAC_POLY1305_TWOFISH, {0, 0}, "POLY1305_TWOFISH",
+ &poly1305mac_ops
+};
+#endif
+#if USE_SERPENT
+gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_serpent = {
+ GCRY_MAC_POLY1305_SERPENT, {0, 0}, "POLY1305_SERPENT",
+ &poly1305mac_ops
+};
+#endif
+#if USE_SEED
+gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_seed = {
+ GCRY_MAC_POLY1305_SEED, {0, 0}, "POLY1305_SEED",
+ &poly1305mac_ops
+};
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/mac.c b/comm/third_party/libgcrypt/cipher/mac.c
new file mode 100644
index 0000000000..babe99e3a8
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/mac.c
@@ -0,0 +1,808 @@
+/* mac.c - message authentication code dispatcher
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "mac-internal.h"
+
+
+/* This is the list of the MAC implementations included in
+   libgcrypt. */
+static gcry_mac_spec_t * const mac_list[] = {
+#if USE_SHA1
+ &_gcry_mac_type_spec_hmac_sha1,
+#endif
+#if USE_SHA256
+ &_gcry_mac_type_spec_hmac_sha256,
+ &_gcry_mac_type_spec_hmac_sha224,
+#endif
+#if USE_SHA512
+ &_gcry_mac_type_spec_hmac_sha512,
+ &_gcry_mac_type_spec_hmac_sha384,
+ &_gcry_mac_type_spec_hmac_sha512_256,
+ &_gcry_mac_type_spec_hmac_sha512_224,
+#endif
+#if USE_SHA3
+ &_gcry_mac_type_spec_hmac_sha3_224,
+ &_gcry_mac_type_spec_hmac_sha3_256,
+ &_gcry_mac_type_spec_hmac_sha3_384,
+ &_gcry_mac_type_spec_hmac_sha3_512,
+#endif
+#ifdef USE_GOST_R_3411_94
+ &_gcry_mac_type_spec_hmac_gost3411_94,
+ &_gcry_mac_type_spec_hmac_gost3411_cp,
+#endif
+#ifdef USE_GOST_R_3411_12
+ &_gcry_mac_type_spec_hmac_stribog256,
+ &_gcry_mac_type_spec_hmac_stribog512,
+#endif
+#if USE_WHIRLPOOL
+ &_gcry_mac_type_spec_hmac_whirlpool,
+#endif
+#if USE_RMD160
+ &_gcry_mac_type_spec_hmac_rmd160,
+#endif
+#if USE_TIGER
+ &_gcry_mac_type_spec_hmac_tiger1,
+#endif
+#if USE_MD5
+ &_gcry_mac_type_spec_hmac_md5,
+#endif
+#if USE_MD4
+ &_gcry_mac_type_spec_hmac_md4,
+#endif
+#if USE_BLAKE2
+ &_gcry_mac_type_spec_hmac_blake2b_512,
+ &_gcry_mac_type_spec_hmac_blake2b_384,
+ &_gcry_mac_type_spec_hmac_blake2b_256,
+ &_gcry_mac_type_spec_hmac_blake2b_160,
+ &_gcry_mac_type_spec_hmac_blake2s_256,
+ &_gcry_mac_type_spec_hmac_blake2s_224,
+ &_gcry_mac_type_spec_hmac_blake2s_160,
+ &_gcry_mac_type_spec_hmac_blake2s_128,
+#endif
+#if USE_SM3
+ &_gcry_mac_type_spec_hmac_sm3,
+#endif
+#if USE_BLOWFISH
+ &_gcry_mac_type_spec_cmac_blowfish,
+#endif
+#if USE_DES
+ &_gcry_mac_type_spec_cmac_tripledes,
+#endif
+#if USE_CAST5
+ &_gcry_mac_type_spec_cmac_cast5,
+#endif
+#if USE_AES
+ &_gcry_mac_type_spec_cmac_aes,
+ &_gcry_mac_type_spec_gmac_aes,
+ &_gcry_mac_type_spec_poly1305mac_aes,
+#endif
+#if USE_TWOFISH
+ &_gcry_mac_type_spec_cmac_twofish,
+ &_gcry_mac_type_spec_gmac_twofish,
+ &_gcry_mac_type_spec_poly1305mac_twofish,
+#endif
+#if USE_SERPENT
+ &_gcry_mac_type_spec_cmac_serpent,
+ &_gcry_mac_type_spec_gmac_serpent,
+ &_gcry_mac_type_spec_poly1305mac_serpent,
+#endif
+#if USE_RFC2268
+ &_gcry_mac_type_spec_cmac_rfc2268,
+#endif
+#if USE_SEED
+ &_gcry_mac_type_spec_cmac_seed,
+ &_gcry_mac_type_spec_gmac_seed,
+ &_gcry_mac_type_spec_poly1305mac_seed,
+#endif
+#if USE_CAMELLIA
+ &_gcry_mac_type_spec_cmac_camellia,
+ &_gcry_mac_type_spec_gmac_camellia,
+ &_gcry_mac_type_spec_poly1305mac_camellia,
+#endif
+#ifdef USE_IDEA
+ &_gcry_mac_type_spec_cmac_idea,
+#endif
+#if USE_GOST28147
+ &_gcry_mac_type_spec_cmac_gost28147,
+ &_gcry_mac_type_spec_gost28147_imit,
+#endif
+ &_gcry_mac_type_spec_poly1305mac,
+#if USE_SM4
+ &_gcry_mac_type_spec_cmac_sm4,
+#endif
+ NULL,
+};
+
+/* HMAC implementations start with index 101 (enum gcry_mac_algos) */
+static gcry_mac_spec_t * const mac_list_algo101[] =
+ {
+#if USE_SHA256
+ &_gcry_mac_type_spec_hmac_sha256,
+ &_gcry_mac_type_spec_hmac_sha224,
+#else
+ NULL,
+ NULL,
+#endif
+#if USE_SHA512
+ &_gcry_mac_type_spec_hmac_sha512,
+ &_gcry_mac_type_spec_hmac_sha384,
+#else
+ NULL,
+ NULL,
+#endif
+#if USE_SHA1
+ &_gcry_mac_type_spec_hmac_sha1,
+#else
+ NULL,
+#endif
+#if USE_MD5
+ &_gcry_mac_type_spec_hmac_md5,
+#else
+ NULL,
+#endif
+#if USE_MD4
+ &_gcry_mac_type_spec_hmac_md4,
+#else
+ NULL,
+#endif
+#if USE_RMD160
+ &_gcry_mac_type_spec_hmac_rmd160,
+#else
+ NULL,
+#endif
+#if USE_TIGER
+ &_gcry_mac_type_spec_hmac_tiger1,
+#else
+ NULL,
+#endif
+#if USE_WHIRLPOOL
+ &_gcry_mac_type_spec_hmac_whirlpool,
+#else
+ NULL,
+#endif
+#ifdef USE_GOST_R_3411_94
+ &_gcry_mac_type_spec_hmac_gost3411_94,
+#else
+ NULL,
+#endif
+#ifdef USE_GOST_R_3411_12
+ &_gcry_mac_type_spec_hmac_stribog256,
+ &_gcry_mac_type_spec_hmac_stribog512,
+#else
+ NULL,
+ NULL,
+#endif
+#if USE_MD2
+ &_gcry_mac_type_spec_hmac_md2,
+#else
+ NULL,
+#endif
+#if USE_SHA3
+ &_gcry_mac_type_spec_hmac_sha3_224,
+ &_gcry_mac_type_spec_hmac_sha3_256,
+ &_gcry_mac_type_spec_hmac_sha3_384,
+ &_gcry_mac_type_spec_hmac_sha3_512,
+#else
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+#endif
+#ifdef USE_GOST_R_3411_94
+ &_gcry_mac_type_spec_hmac_gost3411_cp,
+#else
+ NULL,
+#endif
+#if USE_BLAKE2
+ &_gcry_mac_type_spec_hmac_blake2b_512,
+ &_gcry_mac_type_spec_hmac_blake2b_384,
+ &_gcry_mac_type_spec_hmac_blake2b_256,
+ &_gcry_mac_type_spec_hmac_blake2b_160,
+ &_gcry_mac_type_spec_hmac_blake2s_256,
+ &_gcry_mac_type_spec_hmac_blake2s_224,
+ &_gcry_mac_type_spec_hmac_blake2s_160,
+ &_gcry_mac_type_spec_hmac_blake2s_128,
+#else
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+#endif
+#if USE_SM3
+ &_gcry_mac_type_spec_hmac_sm3,
+#else
+ NULL,
+#endif
+#if USE_SHA512
+ &_gcry_mac_type_spec_hmac_sha512_256,
+ &_gcry_mac_type_spec_hmac_sha512_224,
+#else
+ NULL,
+ NULL,
+#endif
+ };
+
+/* CMAC implementations start with index 201 (enum gcry_mac_algos) */
+static gcry_mac_spec_t * const mac_list_algo201[] =
+ {
+#if USE_AES
+ &_gcry_mac_type_spec_cmac_aes,
+#else
+ NULL,
+#endif
+#if USE_DES
+ &_gcry_mac_type_spec_cmac_tripledes,
+#else
+ NULL,
+#endif
+#if USE_CAMELLIA
+ &_gcry_mac_type_spec_cmac_camellia,
+#else
+ NULL,
+#endif
+#if USE_CAST5
+ &_gcry_mac_type_spec_cmac_cast5,
+#else
+ NULL,
+#endif
+#if USE_BLOWFISH
+ &_gcry_mac_type_spec_cmac_blowfish,
+#else
+ NULL,
+#endif
+#if USE_TWOFISH
+ &_gcry_mac_type_spec_cmac_twofish,
+#else
+ NULL,
+#endif
+#if USE_SERPENT
+ &_gcry_mac_type_spec_cmac_serpent,
+#else
+ NULL,
+#endif
+#if USE_SEED
+ &_gcry_mac_type_spec_cmac_seed,
+#else
+ NULL,
+#endif
+#if USE_RFC2268
+ &_gcry_mac_type_spec_cmac_rfc2268,
+#else
+ NULL,
+#endif
+#ifdef USE_IDEA
+ &_gcry_mac_type_spec_cmac_idea,
+#else
+ NULL,
+#endif
+#if USE_GOST28147
+ &_gcry_mac_type_spec_cmac_gost28147,
+#else
+ NULL,
+#endif
+#if USE_SM4
+ &_gcry_mac_type_spec_cmac_sm4
+#else
+ NULL
+#endif
+ };
+
+/* GMAC implementations start with index 401 (enum gcry_mac_algos) */
+static gcry_mac_spec_t * const mac_list_algo401[] =
+ {
+#if USE_AES
+ &_gcry_mac_type_spec_gmac_aes,
+#else
+ NULL,
+#endif
+#if USE_CAMELLIA
+ &_gcry_mac_type_spec_gmac_camellia,
+#else
+ NULL,
+#endif
+#if USE_TWOFISH
+ &_gcry_mac_type_spec_gmac_twofish,
+#else
+ NULL,
+#endif
+#if USE_SERPENT
+ &_gcry_mac_type_spec_gmac_serpent,
+#else
+ NULL,
+#endif
+#if USE_SEED
+ &_gcry_mac_type_spec_gmac_seed
+#else
+ NULL
+#endif
+ };
+
+/* Poly1305-MAC implementations start with index 501 (enum gcry_mac_algos) */
+static gcry_mac_spec_t * const mac_list_algo501[] =
+ {
+ &_gcry_mac_type_spec_poly1305mac,
+#if USE_AES
+ &_gcry_mac_type_spec_poly1305mac_aes,
+#else
+ NULL,
+#endif
+#if USE_CAMELLIA
+ &_gcry_mac_type_spec_poly1305mac_camellia,
+#else
+ NULL,
+#endif
+#if USE_TWOFISH
+ &_gcry_mac_type_spec_poly1305mac_twofish,
+#else
+ NULL,
+#endif
+#if USE_SERPENT
+ &_gcry_mac_type_spec_poly1305mac_serpent,
+#else
+ NULL,
+#endif
+#if USE_SEED
+ &_gcry_mac_type_spec_poly1305mac_seed
+#else
+ NULL
+#endif
+ };
+
+
+
+
+/* Explicitly initialize this module. */
+gcry_err_code_t
+_gcry_mac_init (void)
+{
+ if (fips_mode())
+ {
+      /* Disable algorithms that are not allowed in FIPS mode. */
+ int idx;
+ gcry_mac_spec_t *spec;
+
+ for (idx = 0; (spec = mac_list[idx]); idx++)
+ if (!spec->flags.fips)
+ spec->flags.disabled = 1;
+ }
+
+ return 0;
+}
+
+
+/* Return the spec structure for the MAC algorithm ALGO. For an
+ unknown algorithm NULL is returned. */
+static gcry_mac_spec_t *
+spec_from_algo (int algo)
+{
+ gcry_mac_spec_t *spec = NULL;
+
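+  /* Each MAC family occupies its own numeric range in enum gcry_mac_algos
+   * (HMAC 101..., CMAC 201..., GMAC 401..., Poly1305 501...). */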
+ if (algo >= 101 && algo < 101 + DIM(mac_list_algo101))
+ spec = mac_list_algo101[algo - 101];
+ else if (algo >= 201 && algo < 201 + DIM(mac_list_algo201))
+ spec = mac_list_algo201[algo - 201];
+ else if (algo >= 401 && algo < 401 + DIM(mac_list_algo401))
+ spec = mac_list_algo401[algo - 401];
+ else if (algo >= 501 && algo < 501 + DIM(mac_list_algo501))
+ spec = mac_list_algo501[algo - 501];
+#ifdef USE_GOST28147
+ else if (algo == GCRY_MAC_GOST28147_IMIT)
+ spec = &_gcry_mac_type_spec_gost28147_imit;
+#endif
+
+ if (spec)
+ gcry_assert (spec->algo == algo);
+
+ return spec;
+}
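
The ranged tables above turn this lookup into a constant-time index computation rather than a scan of mac_list. A minimal sketch of the same pattern, using a hypothetical table and function name that are illustration only, not part of the library:

    /* Illustration only: algorithm 201 maps to slot 0, 202 to slot 1, and
       so on; anything outside the range yields NULL, as in spec_from_algo.  */
    static const char *const cmac_names[] =
      { "CMAC-AES", "CMAC-3DES", "CMAC-CAMELLIA" };

    static const char *
    lookup_cmac_name (int algo)
    {
      if (algo >= 201
          && algo < 201 + (int) (sizeof cmac_names / sizeof cmac_names[0]))
        return cmac_names[algo - 201];
      return NULL;
    }
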
+
+
+/* Lookup a mac's spec by its name. */
+static gcry_mac_spec_t *
+spec_from_name (const char *name)
+{
+ gcry_mac_spec_t *spec;
+ int idx;
+
+ for (idx = 0; (spec = mac_list[idx]); idx++)
+ if (!stricmp (name, spec->name))
+ return spec;
+
+ return NULL;
+}
+
+
+/****************
+ * Map a string to the mac algo
+ */
+int
+_gcry_mac_map_name (const char *string)
+{
+ gcry_mac_spec_t *spec;
+
+ if (!string)
+ return 0;
+
+  /* Search for a matching MAC name. */
+ spec = spec_from_name (string);
+ if (spec)
+ return spec->algo;
+
+ return 0;
+}
+
+
+/****************
+ * This function simply returns the name of the algorithm or some constant
+ * string when there is no algo. It will never return NULL.
+ * Use the macro gcry_mac_test_algo() to check whether the algorithm
+ * is valid.
+ */
+const char *
+_gcry_mac_algo_name (int algorithm)
+{
+ gcry_mac_spec_t *spec;
+
+ spec = spec_from_algo (algorithm);
+ return spec ? spec->name : "?";
+}
+
+
+static gcry_err_code_t
+check_mac_algo (int algorithm)
+{
+ gcry_mac_spec_t *spec;
+
+ spec = spec_from_algo (algorithm);
+ if (spec && !spec->flags.disabled)
+ return 0;
+
+ return GPG_ERR_MAC_ALGO;
+}
+
+
+/****************
+ * Open a MAC handle for use with algorithm ALGO.
+ */
+static gcry_err_code_t
+mac_open (gcry_mac_hd_t * hd, int algo, int secure, gcry_ctx_t ctx)
+{
+ gcry_mac_spec_t *spec;
+ gcry_err_code_t err;
+ gcry_mac_hd_t h;
+
+ spec = spec_from_algo (algo);
+ if (!spec)
+ return GPG_ERR_MAC_ALGO;
+ else if (spec->flags.disabled)
+ return GPG_ERR_MAC_ALGO;
+ else if (!spec->ops)
+ return GPG_ERR_MAC_ALGO;
+ else if (!spec->ops->open || !spec->ops->write || !spec->ops->setkey ||
+ !spec->ops->read || !spec->ops->verify || !spec->ops->reset)
+ return GPG_ERR_MAC_ALGO;
+
+ if (secure)
+ h = xtrycalloc_secure (1, sizeof (*h));
+ else
+ h = xtrycalloc (1, sizeof (*h));
+
+ if (!h)
+ return gpg_err_code_from_syserror ();
+
+ h->magic = secure ? CTX_MAC_MAGIC_SECURE : CTX_MAC_MAGIC_NORMAL;
+ h->spec = spec;
+ h->algo = algo;
+ h->gcry_ctx = ctx;
+
+ err = h->spec->ops->open (h);
+ if (err)
+ xfree (h);
+ else
+ *hd = h;
+
+ return err;
+}
+
+
+static gcry_err_code_t
+mac_reset (gcry_mac_hd_t hd)
+{
+ if (hd->spec->ops->reset)
+ return hd->spec->ops->reset (hd);
+
+ return 0;
+}
+
+
+static void
+mac_close (gcry_mac_hd_t hd)
+{
+ if (hd->spec->ops->close)
+ hd->spec->ops->close (hd);
+
+ wipememory (hd, sizeof (*hd));
+
+ xfree (hd);
+}
+
+
+static gcry_err_code_t
+mac_setkey (gcry_mac_hd_t hd, const void *key, size_t keylen)
+{
+ if (!hd->spec->ops->setkey)
+ return GPG_ERR_INV_ARG;
+ if (keylen > 0 && !key)
+ return GPG_ERR_INV_ARG;
+
+ return hd->spec->ops->setkey (hd, key, keylen);
+}
+
+
+static gcry_err_code_t
+mac_setiv (gcry_mac_hd_t hd, const void *iv, size_t ivlen)
+{
+ if (!hd->spec->ops->setiv)
+ return GPG_ERR_INV_ARG;
+ if (ivlen > 0 && !iv)
+ return GPG_ERR_INV_ARG;
+
+ return hd->spec->ops->setiv (hd, iv, ivlen);
+}
+
+
+static gcry_err_code_t
+mac_write (gcry_mac_hd_t hd, const void *inbuf, size_t inlen)
+{
+ if (!hd->spec->ops->write)
+ return GPG_ERR_INV_ARG;
+ if (inlen > 0 && !inbuf)
+ return GPG_ERR_INV_ARG;
+
+ return hd->spec->ops->write (hd, inbuf, inlen);
+}
+
+
+static gcry_err_code_t
+mac_read (gcry_mac_hd_t hd, void *outbuf, size_t * outlen)
+{
+ if (!outbuf || !outlen || *outlen == 0 || !hd->spec->ops->read)
+ return GPG_ERR_INV_ARG;
+
+ return hd->spec->ops->read (hd, outbuf, outlen);
+}
+
+
+static gcry_err_code_t
+mac_verify (gcry_mac_hd_t hd, const void *buf, size_t buflen)
+{
+ if (!buf || buflen == 0 || !hd->spec->ops->verify)
+ return GPG_ERR_INV_ARG;
+
+ return hd->spec->ops->verify (hd, buf, buflen);
+}
+
+
+/* Create a MAC object for algorithm ALGO. FLAGS may be
+   given as a bitwise OR of the gcry_mac_flags values.
+ H is guaranteed to be a valid handle or NULL on error. */
+gpg_err_code_t
+_gcry_mac_open (gcry_mac_hd_t * h, int algo, unsigned int flags,
+ gcry_ctx_t ctx)
+{
+ gcry_err_code_t rc;
+ gcry_mac_hd_t hd = NULL;
+
+ if ((flags & ~GCRY_MAC_FLAG_SECURE))
+ rc = GPG_ERR_INV_ARG;
+ else
+ rc = mac_open (&hd, algo, !!(flags & GCRY_MAC_FLAG_SECURE), ctx);
+
+ *h = rc ? NULL : hd;
+ return rc;
+}
+
+
+void
+_gcry_mac_close (gcry_mac_hd_t hd)
+{
+ if (hd)
+ mac_close (hd);
+}
+
+
+gcry_err_code_t
+_gcry_mac_setkey (gcry_mac_hd_t hd, const void *key, size_t keylen)
+{
+ return mac_setkey (hd, key, keylen);
+}
+
+
+gcry_err_code_t
+_gcry_mac_setiv (gcry_mac_hd_t hd, const void *iv, size_t ivlen)
+{
+ return mac_setiv (hd, iv, ivlen);
+}
+
+
+gcry_err_code_t
+_gcry_mac_write (gcry_mac_hd_t hd, const void *inbuf, size_t inlen)
+{
+ return mac_write (hd, inbuf, inlen);
+}
+
+
+gcry_err_code_t
+_gcry_mac_read (gcry_mac_hd_t hd, void *outbuf, size_t * outlen)
+{
+ return mac_read (hd, outbuf, outlen);
+}
+
+
+gcry_err_code_t
+_gcry_mac_verify (gcry_mac_hd_t hd, const void *buf, size_t buflen)
+{
+ return mac_verify (hd, buf, buflen);
+}
+
+
+int
+_gcry_mac_get_algo (gcry_mac_hd_t hd)
+{
+ return hd->algo;
+}
+
+
+unsigned int
+_gcry_mac_get_algo_maclen (int algo)
+{
+ gcry_mac_spec_t *spec;
+
+ spec = spec_from_algo (algo);
+ if (!spec || !spec->ops || !spec->ops->get_maclen)
+ return 0;
+
+ return spec->ops->get_maclen (algo);
+}
+
+
+unsigned int
+_gcry_mac_get_algo_keylen (int algo)
+{
+ gcry_mac_spec_t *spec;
+
+ spec = spec_from_algo (algo);
+ if (!spec || !spec->ops || !spec->ops->get_keylen)
+ return 0;
+
+ return spec->ops->get_keylen (algo);
+}
+
+
+gcry_err_code_t
+_gcry_mac_ctl (gcry_mac_hd_t hd, int cmd, void *buffer, size_t buflen)
+{
+ gcry_err_code_t rc;
+
+  /* Not every command uses every argument; the casts silence warnings. */
+ (void) hd;
+ (void) buffer;
+ (void) buflen;
+
+ switch (cmd)
+ {
+ case GCRYCTL_RESET:
+ rc = mac_reset (hd);
+ break;
+ case GCRYCTL_SET_SBOX:
+ if (hd->spec->ops->set_extra_info)
+ rc = hd->spec->ops->set_extra_info
+ (hd, GCRYCTL_SET_SBOX, buffer, buflen);
+ else
+ rc = GPG_ERR_NOT_SUPPORTED;
+ break;
+ default:
+ rc = GPG_ERR_INV_OP;
+ }
+ return rc;
+}
+
+
+/* Return information about the given MAC algorithm ALGO.
+
+ GCRYCTL_TEST_ALGO:
+ Returns 0 if the specified algorithm ALGO is available for use.
+ BUFFER and NBYTES must be zero.
+
+ Note: Because this function is in most cases used to return an
+ integer value, we can make it easier for the caller to just look at
+ the return value. The caller will in all cases consult the value
+   and thereby detect whether an error occurred or not (e.g. while
+   checking the block size).
+ */
+gcry_err_code_t
+_gcry_mac_algo_info (int algo, int what, void *buffer, size_t * nbytes)
+{
+ gcry_err_code_t rc = 0;
+ unsigned int ui;
+
+ switch (what)
+ {
+ case GCRYCTL_GET_KEYLEN:
+ if (buffer || (!nbytes))
+ rc = GPG_ERR_INV_ARG;
+ else
+ {
+ ui = _gcry_mac_get_algo_keylen (algo);
+ if (ui > 0)
+ *nbytes = (size_t) ui;
+ else
+ /* The only reason for an error is an invalid algo. */
+ rc = GPG_ERR_MAC_ALGO;
+ }
+ break;
+ case GCRYCTL_TEST_ALGO:
+ if (buffer || nbytes)
+ rc = GPG_ERR_INV_ARG;
+ else
+ rc = check_mac_algo (algo);
+ break;
+
+ default:
+ rc = GPG_ERR_INV_OP;
+ }
+
+ return rc;
+}
+
+
+/* Run the self-tests for the MAC. */
+gpg_error_t
+_gcry_mac_selftest (int algo, int extended, selftest_report_func_t report)
+{
+ gcry_err_code_t ec;
+ gcry_mac_spec_t *spec;
+
+ spec = spec_from_algo (algo);
+ if (spec && !spec->flags.disabled && spec->ops && spec->ops->selftest)
+ ec = spec->ops->selftest (algo, extended, report);
+ else
+ {
+ ec = GPG_ERR_MAC_ALGO;
+ if (report)
+ report ("mac", algo, "module",
+ spec && !spec->flags.disabled?
+ "no selftest available" :
+ spec? "algorithm disabled" :
+ "algorithm not found");
+ }
+
+ return gpg_error (ec);
+}
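
Taken together, these dispatchers back the public gcry_mac_* interface. A hedged usage sketch (error checking omitted; the key and message below are placeholders):

    #include <gcrypt.h>

    void
    mac_example (void)
    {
      gcry_mac_hd_t hd;
      unsigned char tag[32];               /* HMAC-SHA-256 tag length */
      size_t taglen = sizeof tag;

      gcry_mac_open (&hd, GCRY_MAC_HMAC_SHA256, 0, NULL);
      gcry_mac_setkey (hd, "0123456789abcdef", 16);
      gcry_mac_write (hd, "hello", 5);
      gcry_mac_read (hd, tag, &taglen);    /* taglen becomes 32 */
      gcry_mac_close (hd);
    }
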
diff --git a/comm/third_party/libgcrypt/cipher/md.c b/comm/third_party/libgcrypt/cipher/md.c
new file mode 100644
index 0000000000..efb7376a1a
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/md.c
@@ -0,0 +1,1639 @@
+/* md.c - message digest dispatcher
+ * Copyright (C) 1998, 1999, 2002, 2003, 2006,
+ * 2008 Free Software Foundation, Inc.
+ * Copyright (C) 2013, 2014 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+
+
+/* This is the list of the digest implementations included in
+ libgcrypt. */
+static gcry_md_spec_t * const digest_list[] =
+ {
+#if USE_CRC
+ &_gcry_digest_spec_crc32,
+ &_gcry_digest_spec_crc32_rfc1510,
+ &_gcry_digest_spec_crc24_rfc2440,
+#endif
+#if USE_SHA1
+ &_gcry_digest_spec_sha1,
+#endif
+#if USE_SHA256
+ &_gcry_digest_spec_sha256,
+ &_gcry_digest_spec_sha224,
+#endif
+#if USE_SHA512
+ &_gcry_digest_spec_sha512,
+ &_gcry_digest_spec_sha384,
+ &_gcry_digest_spec_sha512_256,
+ &_gcry_digest_spec_sha512_224,
+#endif
+#if USE_SHA3
+ &_gcry_digest_spec_sha3_224,
+ &_gcry_digest_spec_sha3_256,
+ &_gcry_digest_spec_sha3_384,
+ &_gcry_digest_spec_sha3_512,
+ &_gcry_digest_spec_shake128,
+ &_gcry_digest_spec_shake256,
+#endif
+#if USE_GOST_R_3411_94
+ &_gcry_digest_spec_gost3411_94,
+ &_gcry_digest_spec_gost3411_cp,
+#endif
+#if USE_GOST_R_3411_12
+ &_gcry_digest_spec_stribog_256,
+ &_gcry_digest_spec_stribog_512,
+#endif
+#if USE_WHIRLPOOL
+ &_gcry_digest_spec_whirlpool,
+#endif
+#if USE_RMD160
+ &_gcry_digest_spec_rmd160,
+#endif
+#if USE_TIGER
+ &_gcry_digest_spec_tiger,
+ &_gcry_digest_spec_tiger1,
+ &_gcry_digest_spec_tiger2,
+#endif
+#if USE_MD5
+ &_gcry_digest_spec_md5,
+#endif
+#if USE_MD4
+ &_gcry_digest_spec_md4,
+#endif
+#if USE_MD2
+ &_gcry_digest_spec_md2,
+#endif
+#if USE_BLAKE2
+ &_gcry_digest_spec_blake2b_512,
+ &_gcry_digest_spec_blake2b_384,
+ &_gcry_digest_spec_blake2b_256,
+ &_gcry_digest_spec_blake2b_160,
+ &_gcry_digest_spec_blake2s_256,
+ &_gcry_digest_spec_blake2s_224,
+ &_gcry_digest_spec_blake2s_160,
+ &_gcry_digest_spec_blake2s_128,
+#endif
+#if USE_SM3
+ &_gcry_digest_spec_sm3,
+#endif
+ NULL
+ };
+
+/* Digest implementations starting with index 0 (enum gcry_md_algos) */
+static gcry_md_spec_t * const digest_list_algo0[] =
+ {
+ NULL, /* GCRY_MD_NONE */
+#if USE_MD5
+ &_gcry_digest_spec_md5,
+#else
+ NULL,
+#endif
+#if USE_SHA1
+ &_gcry_digest_spec_sha1,
+#else
+ NULL,
+#endif
+#if USE_RMD160
+ &_gcry_digest_spec_rmd160,
+#else
+ NULL,
+#endif
+ NULL, /* Unused index 4 */
+#if USE_MD2
+ &_gcry_digest_spec_md2,
+#else
+ NULL,
+#endif
+#if USE_TIGER
+ &_gcry_digest_spec_tiger,
+#else
+ NULL,
+#endif
+ NULL, /* GCRY_MD_HAVAL */
+#if USE_SHA256
+ &_gcry_digest_spec_sha256,
+#else
+ NULL,
+#endif
+#if USE_SHA512
+ &_gcry_digest_spec_sha384,
+ &_gcry_digest_spec_sha512,
+#else
+ NULL,
+ NULL,
+#endif
+#if USE_SHA256
+ &_gcry_digest_spec_sha224
+#else
+ NULL
+#endif
+ };
+
+/* Digest implementations starting with index 301 (enum gcry_md_algos) */
+static gcry_md_spec_t * const digest_list_algo301[] =
+ {
+#if USE_MD4
+ &_gcry_digest_spec_md4,
+#else
+ NULL,
+#endif
+#if USE_CRC
+ &_gcry_digest_spec_crc32,
+ &_gcry_digest_spec_crc32_rfc1510,
+ &_gcry_digest_spec_crc24_rfc2440,
+#else
+ NULL,
+ NULL,
+ NULL,
+#endif
+#if USE_WHIRLPOOL
+ &_gcry_digest_spec_whirlpool,
+#else
+ NULL,
+#endif
+#if USE_TIGER
+ &_gcry_digest_spec_tiger1,
+ &_gcry_digest_spec_tiger2,
+#else
+ NULL,
+ NULL,
+#endif
+#if USE_GOST_R_3411_94
+ &_gcry_digest_spec_gost3411_94,
+#else
+ NULL,
+#endif
+#if USE_GOST_R_3411_12
+ &_gcry_digest_spec_stribog_256,
+ &_gcry_digest_spec_stribog_512,
+#else
+ NULL,
+ NULL,
+#endif
+#if USE_GOST_R_3411_94
+ &_gcry_digest_spec_gost3411_cp,
+#else
+ NULL,
+#endif
+#if USE_SHA3
+ &_gcry_digest_spec_sha3_224,
+ &_gcry_digest_spec_sha3_256,
+ &_gcry_digest_spec_sha3_384,
+ &_gcry_digest_spec_sha3_512,
+ &_gcry_digest_spec_shake128,
+ &_gcry_digest_spec_shake256,
+#else
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+#endif
+#if USE_BLAKE2
+ &_gcry_digest_spec_blake2b_512,
+ &_gcry_digest_spec_blake2b_384,
+ &_gcry_digest_spec_blake2b_256,
+ &_gcry_digest_spec_blake2b_160,
+ &_gcry_digest_spec_blake2s_256,
+ &_gcry_digest_spec_blake2s_224,
+ &_gcry_digest_spec_blake2s_160,
+ &_gcry_digest_spec_blake2s_128,
+#else
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+#endif
+#if USE_SM3
+ &_gcry_digest_spec_sm3,
+#else
+ NULL,
+#endif
+#if USE_SHA512
+ &_gcry_digest_spec_sha512_256,
+ &_gcry_digest_spec_sha512_224,
+#else
+ NULL,
+ NULL,
+#endif
+ };
+
+
+typedef struct gcry_md_list
+{
+ gcry_md_spec_t *spec;
+ struct gcry_md_list *next;
+ size_t actual_struct_size; /* Allocated size of this structure. */
+ PROPERLY_ALIGNED_TYPE context[1];
+} GcryDigestEntry;
+
+/* This structure is put right after the gcry_md_hd_t buffer, so that
+ * only one memory block is needed. */
+struct gcry_md_context
+{
+ int magic;
+ size_t actual_handle_size; /* Allocated size of this handle. */
+ FILE *debug;
+ struct {
+ unsigned int secure:1;
+ unsigned int finalized:1;
+ unsigned int bugemu1:1;
+ unsigned int hmac:1;
+ } flags;
+ GcryDigestEntry *list;
+};
+
+
+#define CTX_MAGIC_NORMAL 0x11071961
+#define CTX_MAGIC_SECURE 0x16917011
+
+static gcry_err_code_t md_enable (gcry_md_hd_t hd, int algo);
+static void md_close (gcry_md_hd_t a);
+static void md_write (gcry_md_hd_t a, const void *inbuf, size_t inlen);
+static byte *md_read( gcry_md_hd_t a, int algo );
+static int md_get_algo( gcry_md_hd_t a );
+static int md_digest_length( int algo );
+static void md_start_debug ( gcry_md_hd_t a, const char *suffix );
+static void md_stop_debug ( gcry_md_hd_t a );
+
+
+
+static int
+map_algo (int algo)
+{
+ return algo;
+}
+
+
+/* Return the spec structure for the hash algorithm ALGO. For an
+ unknown algorithm NULL is returned. */
+static gcry_md_spec_t *
+spec_from_algo (int algo)
+{
+ gcry_md_spec_t *spec = NULL;
+
+ algo = map_algo (algo);
+
+ if (algo >= 0 && algo < DIM(digest_list_algo0))
+ spec = digest_list_algo0[algo];
+ else if (algo >= 301 && algo < 301 + DIM(digest_list_algo301))
+ spec = digest_list_algo301[algo - 301];
+
+ if (spec)
+ gcry_assert (spec->algo == algo);
+
+ return spec;
+}
+
+
+/* Lookup a hash's spec by its name. */
+static gcry_md_spec_t *
+spec_from_name (const char *name)
+{
+ gcry_md_spec_t *spec;
+ int idx;
+
+ for (idx=0; (spec = digest_list[idx]); idx++)
+ {
+ if (!stricmp (name, spec->name))
+ return spec;
+ }
+
+ return NULL;
+}
+
+
+/* Lookup a hash's spec by its OID. */
+static gcry_md_spec_t *
+spec_from_oid (const char *oid)
+{
+ gcry_md_spec_t *spec;
+ gcry_md_oid_spec_t *oid_specs;
+ int idx, j;
+
+ for (idx=0; (spec = digest_list[idx]); idx++)
+ {
+ oid_specs = spec->oids;
+ if (oid_specs)
+ {
+ for (j = 0; oid_specs[j].oidstring; j++)
+ if (!stricmp (oid, oid_specs[j].oidstring))
+ return spec;
+ }
+ }
+
+ return NULL;
+}
+
+
+static gcry_md_spec_t *
+search_oid (const char *oid, gcry_md_oid_spec_t *oid_spec)
+{
+ gcry_md_spec_t *spec;
+ int i;
+
+ if (!oid)
+ return NULL;
+
+ if (!strncmp (oid, "oid.", 4) || !strncmp (oid, "OID.", 4))
+ oid += 4;
+
+ spec = spec_from_oid (oid);
+ if (spec && spec->oids)
+ {
+ for (i = 0; spec->oids[i].oidstring; i++)
+ if (!stricmp (oid, spec->oids[i].oidstring))
+ {
+ if (oid_spec)
+ *oid_spec = spec->oids[i];
+ return spec;
+ }
+ }
+
+ return NULL;
+}
+
+
+/****************
+ * Map a string to the digest algo
+ */
+int
+_gcry_md_map_name (const char *string)
+{
+ gcry_md_spec_t *spec;
+
+ if (!string)
+ return 0;
+
+ /* If the string starts with a digit (optionally prefixed with
+ either "OID." or "oid."), we first look into our table of ASN.1
+ object identifiers to figure out the algorithm */
+ spec = search_oid (string, NULL);
+ if (spec)
+ return spec->algo;
+
+ /* Not found, search a matching digest name. */
+ spec = spec_from_name (string);
+ if (spec)
+ return spec->algo;
+
+ return 0;
+}
+
+
+/****************
+ * This function simply returns the name of the algorithm or some constant
+ * string when there is no algo. It will never return NULL.
+ * Use the macro gcry_md_test_algo() to check whether the algorithm
+ * is valid.
+ */
+const char *
+_gcry_md_algo_name (int algorithm)
+{
+ gcry_md_spec_t *spec;
+
+ spec = spec_from_algo (algorithm);
+ return spec ? spec->name : "?";
+}
+
+
+static gcry_err_code_t
+check_digest_algo (int algorithm)
+{
+ gcry_md_spec_t *spec;
+
+ spec = spec_from_algo (algorithm);
+ if (spec && !spec->flags.disabled)
+ return 0;
+
+ return GPG_ERR_DIGEST_ALGO;
+
+}
+
+
+/****************
+ * Open a message digest handle for use with algorithm ALGO.
+ * More algorithms may be added by md_enable(). The initial algorithm
+ * may be 0.
+ */
+static gcry_err_code_t
+md_open (gcry_md_hd_t *h, int algo, unsigned int flags)
+{
+ gcry_err_code_t err = 0;
+ int secure = !!(flags & GCRY_MD_FLAG_SECURE);
+ int hmac = !!(flags & GCRY_MD_FLAG_HMAC);
+ int bufsize = secure ? 512 : 1024;
+ struct gcry_md_context *ctx;
+ gcry_md_hd_t hd;
+ size_t n;
+
+  /* Allocate a memory area to hold the caller-visible buffer with its
+ * control information and the data required by this module. Set the
+ * context pointer at the beginning to this area.
+ * We have to use this strange scheme because we want to hide the
+ * internal data but have a variable sized buffer.
+ *
+ * +---+------+---........------+-------------+
+ * !ctx! bctl ! buffer ! private !
+ * +---+------+---........------+-------------+
+ * ! ^
+ * !---------------------------!
+ *
+ * We have to make sure that private is well aligned.
+ */
+ n = sizeof (struct gcry_md_handle) + bufsize;
+ n = ((n + sizeof (PROPERLY_ALIGNED_TYPE) - 1)
+ / sizeof (PROPERLY_ALIGNED_TYPE)) * sizeof (PROPERLY_ALIGNED_TYPE);
+
+ /* Allocate and set the Context pointer to the private data */
+ if (secure)
+ hd = xtrymalloc_secure (n + sizeof (struct gcry_md_context));
+ else
+ hd = xtrymalloc (n + sizeof (struct gcry_md_context));
+
+ if (! hd)
+ err = gpg_err_code_from_errno (errno);
+
+ if (! err)
+ {
+ hd->ctx = ctx = (void *) ((char *) hd + n);
+ /* Setup the globally visible data (bctl in the diagram).*/
+ hd->bufsize = n - sizeof (struct gcry_md_handle) + 1;
+ hd->bufpos = 0;
+
+ /* Initialize the private data. */
+ memset (hd->ctx, 0, sizeof *hd->ctx);
+ ctx->magic = secure ? CTX_MAGIC_SECURE : CTX_MAGIC_NORMAL;
+ ctx->actual_handle_size = n + sizeof (struct gcry_md_context);
+ ctx->flags.secure = secure;
+ ctx->flags.hmac = hmac;
+ ctx->flags.bugemu1 = !!(flags & GCRY_MD_FLAG_BUGEMU1);
+ }
+
+ if (! err)
+ {
+ /* Hmmm, should we really do that? - yes [-wk] */
+ _gcry_fast_random_poll ();
+
+ if (algo)
+ {
+ err = md_enable (hd, algo);
+ if (err)
+ md_close (hd);
+ }
+ }
+
+ if (! err)
+ *h = hd;
+
+ return err;
+}
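
The size computation above rounds n up to a multiple of sizeof (PROPERLY_ALIGNED_TYPE) so that the private context placed after the buffer stays aligned. The round-up idiom in isolation, with made-up numbers only:

    /* Round n up to the next multiple of align; e.g. n = 1034 with
       align = 8 becomes 1040.  */
    size_t
    round_up (size_t n, size_t align)
    {
      return ((n + align - 1) / align) * align;
    }
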
+
+/* Create a message digest object for algorithm ALGO. FLAGS may be
+   given as a bitwise OR of the gcry_md_flags values.  ALGO may be
+ given as 0 if the algorithms to be used are later set using
+ gcry_md_enable. H is guaranteed to be a valid handle or NULL on
+ error. */
+gcry_err_code_t
+_gcry_md_open (gcry_md_hd_t *h, int algo, unsigned int flags)
+{
+ gcry_err_code_t rc;
+ gcry_md_hd_t hd;
+
+ if ((flags & ~(GCRY_MD_FLAG_SECURE
+ | GCRY_MD_FLAG_HMAC
+ | GCRY_MD_FLAG_BUGEMU1)))
+ rc = GPG_ERR_INV_ARG;
+ else
+ rc = md_open (&hd, algo, flags);
+
+ *h = rc? NULL : hd;
+ return rc;
+}
+
+
+
+static gcry_err_code_t
+md_enable (gcry_md_hd_t hd, int algorithm)
+{
+ struct gcry_md_context *h = hd->ctx;
+ gcry_md_spec_t *spec;
+ GcryDigestEntry *entry;
+ gcry_err_code_t err = 0;
+
+ for (entry = h->list; entry; entry = entry->next)
+ if (entry->spec->algo == algorithm)
+ return 0; /* Already enabled */
+
+ spec = spec_from_algo (algorithm);
+ if (!spec)
+ {
+ log_debug ("md_enable: algorithm %d not available\n", algorithm);
+ err = GPG_ERR_DIGEST_ALGO;
+ }
+
+
+ if (!err && algorithm == GCRY_MD_MD5 && fips_mode ())
+ {
+ _gcry_inactivate_fips_mode ("MD5 used");
+ if (_gcry_enforced_fips_mode () )
+ {
+ /* We should never get to here because we do not register
+ MD5 in enforced fips mode. But better throw an error. */
+ err = GPG_ERR_DIGEST_ALGO;
+ }
+ }
+
+ if (!err && h->flags.hmac && spec->read == NULL)
+ {
+ /* Expandable output function cannot act as part of HMAC. */
+ err = GPG_ERR_DIGEST_ALGO;
+ }
+
+ if (!err)
+ {
+ size_t size = (sizeof (*entry)
+ + spec->contextsize * (h->flags.hmac? 3 : 1)
+ - sizeof (entry->context));
+
+ /* And allocate a new list entry. */
+ if (h->flags.secure)
+ entry = xtrymalloc_secure (size);
+ else
+ entry = xtrymalloc (size);
+
+ if (! entry)
+ err = gpg_err_code_from_errno (errno);
+ else
+ {
+ entry->spec = spec;
+ entry->next = h->list;
+ entry->actual_struct_size = size;
+ h->list = entry;
+
+ /* And init this instance. */
+ entry->spec->init (entry->context,
+ h->flags.bugemu1? GCRY_MD_FLAG_BUGEMU1:0);
+ }
+ }
+
+ return err;
+}
+
+
+gcry_err_code_t
+_gcry_md_enable (gcry_md_hd_t hd, int algorithm)
+{
+ return md_enable (hd, algorithm);
+}
+
+
+static gcry_err_code_t
+md_copy (gcry_md_hd_t ahd, gcry_md_hd_t *b_hd)
+{
+ gcry_err_code_t err = 0;
+ struct gcry_md_context *a = ahd->ctx;
+ struct gcry_md_context *b;
+ GcryDigestEntry *ar, *br;
+ gcry_md_hd_t bhd;
+ size_t n;
+
+ if (ahd->bufpos)
+ md_write (ahd, NULL, 0);
+
+ n = (char *) ahd->ctx - (char *) ahd;
+ if (a->flags.secure)
+ bhd = xtrymalloc_secure (n + sizeof (struct gcry_md_context));
+ else
+ bhd = xtrymalloc (n + sizeof (struct gcry_md_context));
+
+ if (!bhd)
+ {
+ err = gpg_err_code_from_syserror ();
+ goto leave;
+ }
+
+ bhd->ctx = b = (void *) ((char *) bhd + n);
+ /* No need to copy the buffer due to the write above. */
+ gcry_assert (ahd->bufsize == (n - sizeof (struct gcry_md_handle) + 1));
+ bhd->bufsize = ahd->bufsize;
+ bhd->bufpos = 0;
+ gcry_assert (! ahd->bufpos);
+ memcpy (b, a, sizeof *a);
+ b->list = NULL;
+ b->debug = NULL;
+
+ /* Copy the complete list of algorithms. The copied list is
+ reversed, but that doesn't matter. */
+ for (ar = a->list; ar; ar = ar->next)
+ {
+ if (a->flags.secure)
+ br = xtrymalloc_secure (ar->actual_struct_size);
+ else
+ br = xtrymalloc (ar->actual_struct_size);
+ if (!br)
+ {
+ err = gpg_err_code_from_syserror ();
+ md_close (bhd);
+ goto leave;
+ }
+
+ memcpy (br, ar, ar->actual_struct_size);
+ br->next = b->list;
+ b->list = br;
+ }
+
+ if (a->debug)
+ md_start_debug (bhd, "unknown");
+
+ *b_hd = bhd;
+
+ leave:
+ return err;
+}
+
+
+gcry_err_code_t
+_gcry_md_copy (gcry_md_hd_t *handle, gcry_md_hd_t hd)
+{
+ gcry_err_code_t rc;
+
+ rc = md_copy (hd, handle);
+ if (rc)
+ *handle = NULL;
+ return rc;
+}
+
+
+/*
+ * Reset all contexts and discard any buffered stuff. This may be used
+ * instead of calling md_close() followed by md_open().
+ */
+void
+_gcry_md_reset (gcry_md_hd_t a)
+{
+ GcryDigestEntry *r;
+
+ /* Note: We allow this even in fips non operational mode. */
+
+ a->bufpos = a->ctx->flags.finalized = 0;
+
+ if (a->ctx->flags.hmac)
+ for (r = a->ctx->list; r; r = r->next)
+ {
+ memcpy (r->context, (char *)r->context + r->spec->contextsize,
+ r->spec->contextsize);
+ }
+ else
+ for (r = a->ctx->list; r; r = r->next)
+ {
+ memset (r->context, 0, r->spec->contextsize);
+ (*r->spec->init) (r->context,
+ a->ctx->flags.bugemu1? GCRY_MD_FLAG_BUGEMU1:0);
+ }
+}
+
+
+static void
+md_close (gcry_md_hd_t a)
+{
+ GcryDigestEntry *r, *r2;
+
+ if (! a)
+ return;
+ if (a->ctx->debug)
+ md_stop_debug (a);
+ for (r = a->ctx->list; r; r = r2)
+ {
+ r2 = r->next;
+ wipememory (r, r->actual_struct_size);
+ xfree (r);
+ }
+
+ wipememory (a, a->ctx->actual_handle_size);
+ xfree(a);
+}
+
+
+void
+_gcry_md_close (gcry_md_hd_t hd)
+{
+ /* Note: We allow this even in fips non operational mode. */
+ md_close (hd);
+}
+
+
+static void
+md_write (gcry_md_hd_t a, const void *inbuf, size_t inlen)
+{
+ GcryDigestEntry *r;
+
+ if (a->ctx->debug)
+ {
+ if (a->bufpos && fwrite (a->buf, a->bufpos, 1, a->ctx->debug) != 1)
+ BUG();
+ if (inlen && fwrite (inbuf, inlen, 1, a->ctx->debug) != 1)
+ BUG();
+ }
+
+ for (r = a->ctx->list; r; r = r->next)
+ {
+ if (a->bufpos)
+ (*r->spec->write) (r->context, a->buf, a->bufpos);
+ (*r->spec->write) (r->context, inbuf, inlen);
+ }
+ a->bufpos = 0;
+}
+
+
+/* Note that this function may be used after finalize and read to keep
+   on writing to the transform function so as to mitigate timing
+ attacks. */
+void
+_gcry_md_write (gcry_md_hd_t hd, const void *inbuf, size_t inlen)
+{
+ md_write (hd, inbuf, inlen);
+}
+
+
+static void
+md_final (gcry_md_hd_t a)
+{
+ GcryDigestEntry *r;
+
+ if (a->ctx->flags.finalized)
+ return;
+
+ if (a->bufpos)
+ md_write (a, NULL, 0);
+
+ for (r = a->ctx->list; r; r = r->next)
+ (*r->spec->final) (r->context);
+
+ a->ctx->flags.finalized = 1;
+
+ if (!a->ctx->flags.hmac)
+ return;
+
+ for (r = a->ctx->list; r; r = r->next)
+ {
+ byte *p;
+ size_t dlen = r->spec->mdlen;
+ byte *hash;
+ gcry_err_code_t err;
+
+ if (r->spec->read == NULL)
+ continue;
+
+ p = r->spec->read (r->context);
+
+ if (a->ctx->flags.secure)
+ hash = xtrymalloc_secure (dlen);
+ else
+ hash = xtrymalloc (dlen);
+ if (!hash)
+ {
+ err = gpg_err_code_from_errno (errno);
+ _gcry_fatal_error (err, NULL);
+ }
+
+ memcpy (hash, p, dlen);
+ memcpy (r->context, (char *)r->context + r->spec->contextsize * 2,
+ r->spec->contextsize);
+ (*r->spec->write) (r->context, hash, dlen);
+ (*r->spec->final) (r->context);
+ xfree (hash);
+ }
+}
+
+
+static gcry_err_code_t
+md_setkey (gcry_md_hd_t h, const unsigned char *key, size_t keylen)
+{
+ gcry_err_code_t rc = 0;
+ GcryDigestEntry *r;
+ int algo_had_setkey = 0;
+
+ if (!h->ctx->list)
+ return GPG_ERR_DIGEST_ALGO; /* Might happen if no algo is enabled. */
+
+ if (h->ctx->flags.hmac)
+ return GPG_ERR_DIGEST_ALGO; /* Tried md_setkey for HMAC md. */
+
+ for (r = h->ctx->list; r; r = r->next)
+ {
+ switch (r->spec->algo)
+ {
+#if USE_BLAKE2
+ /* TODO? add spec->init_with_key? */
+ case GCRY_MD_BLAKE2B_512:
+ case GCRY_MD_BLAKE2B_384:
+ case GCRY_MD_BLAKE2B_256:
+ case GCRY_MD_BLAKE2B_160:
+ case GCRY_MD_BLAKE2S_256:
+ case GCRY_MD_BLAKE2S_224:
+ case GCRY_MD_BLAKE2S_160:
+ case GCRY_MD_BLAKE2S_128:
+ algo_had_setkey = 1;
+ memset (r->context, 0, r->spec->contextsize);
+ rc = _gcry_blake2_init_with_key (r->context,
+ h->ctx->flags.bugemu1
+ ? GCRY_MD_FLAG_BUGEMU1:0,
+ key, keylen, r->spec->algo);
+ break;
+#endif
+ default:
+ rc = GPG_ERR_DIGEST_ALGO;
+ break;
+ }
+
+ if (rc)
+ break;
+ }
+
+ if (rc && !algo_had_setkey)
+ {
+      /* None of the algorithms had a setkey implementation, so the
+       * contexts were not modified.  Just return the error. */
+ return rc;
+ }
+ else if (rc && algo_had_setkey)
+ {
+      /* Some of the contexts have been modified, but an error occurred.
+       * Reset all contexts. */
+ _gcry_md_reset (h);
+ return rc;
+ }
+
+ /* Successful md_setkey implies reset. */
+ h->bufpos = h->ctx->flags.finalized = 0;
+
+ return 0;
+}
+
+
+static gcry_err_code_t
+prepare_macpads (gcry_md_hd_t a, const unsigned char *key, size_t keylen)
+{
+ GcryDigestEntry *r;
+
+ if (!a->ctx->list)
+ return GPG_ERR_DIGEST_ALGO; /* Might happen if no algo is enabled. */
+
+ if (!a->ctx->flags.hmac)
+ return GPG_ERR_DIGEST_ALGO; /* Tried prepare_macpads for non-HMAC md. */
+
+ for (r = a->ctx->list; r; r = r->next)
+ {
+ const unsigned char *k;
+ size_t k_len;
+ unsigned char *key_allocated = NULL;
+ int macpad_Bsize;
+ int i;
+
+ switch (r->spec->algo)
+ {
+ /* TODO: add spec->blocksize */
+ case GCRY_MD_SHA3_224:
+ macpad_Bsize = 1152 / 8;
+ break;
+ case GCRY_MD_SHA3_256:
+ macpad_Bsize = 1088 / 8;
+ break;
+ case GCRY_MD_SHA3_384:
+ macpad_Bsize = 832 / 8;
+ break;
+ case GCRY_MD_SHA3_512:
+ macpad_Bsize = 576 / 8;
+ break;
+ case GCRY_MD_SHA384:
+ case GCRY_MD_SHA512:
+ case GCRY_MD_SHA512_256:
+ case GCRY_MD_SHA512_224:
+ case GCRY_MD_BLAKE2B_512:
+ case GCRY_MD_BLAKE2B_384:
+ case GCRY_MD_BLAKE2B_256:
+ case GCRY_MD_BLAKE2B_160:
+ macpad_Bsize = 128;
+ break;
+ case GCRY_MD_GOSTR3411_94:
+ case GCRY_MD_GOSTR3411_CP:
+ macpad_Bsize = 32;
+ break;
+ default:
+ macpad_Bsize = 64;
+ break;
+ }
+
+ if ( keylen > macpad_Bsize )
+ {
+ k = key_allocated = xtrymalloc_secure (r->spec->mdlen);
+ if (!k)
+ return gpg_err_code_from_errno (errno);
+ _gcry_md_hash_buffer (r->spec->algo, key_allocated, key, keylen);
+ k_len = r->spec->mdlen;
+ gcry_assert ( k_len <= macpad_Bsize );
+ }
+ else
+ {
+ k = key;
+ k_len = keylen;
+ }
+
+ (*r->spec->init) (r->context,
+ a->ctx->flags.bugemu1? GCRY_MD_FLAG_BUGEMU1:0);
+ a->bufpos = 0;
+ for (i=0; i < k_len; i++ )
+ _gcry_md_putc (a, k[i] ^ 0x36);
+ for (; i < macpad_Bsize; i++ )
+ _gcry_md_putc (a, 0x36);
+ (*r->spec->write) (r->context, a->buf, a->bufpos);
+ memcpy ((char *)r->context + r->spec->contextsize, r->context,
+ r->spec->contextsize);
+
+ (*r->spec->init) (r->context,
+ a->ctx->flags.bugemu1? GCRY_MD_FLAG_BUGEMU1:0);
+ a->bufpos = 0;
+ for (i=0; i < k_len; i++ )
+ _gcry_md_putc (a, k[i] ^ 0x5c);
+ for (; i < macpad_Bsize; i++ )
+ _gcry_md_putc (a, 0x5c);
+ (*r->spec->write) (r->context, a->buf, a->bufpos);
+ memcpy ((char *)r->context + r->spec->contextsize*2, r->context,
+ r->spec->contextsize);
+
+ xfree (key_allocated);
+ }
+
+ a->bufpos = 0;
+ return 0;
+}
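
This is the standard HMAC key schedule: a key longer than the block size is hashed first, the result is zero-padded to the block size, and the padded key K' is XORed with 0x36 (ipad) and 0x5c (opad) to seed the inner and outer states, giving HMAC(K, m) = H((K' ^ opad) || H((K' ^ ipad) || m)). A self-contained sketch of the same construction for SHA-256 (block size 64, digest length 32), written against the public API rather than the internal contexts used above:

    #include <string.h>
    #include <gcrypt.h>

    /* Sketch: HMAC-SHA-256 assembled by hand, mirroring what
       prepare_macpads precomputes.  Error checks omitted.  */
    static void
    hmac_sha256_sketch (const void *key, size_t keylen,
                        const void *msg, size_t msglen, unsigned char out[32])
    {
      unsigned char k[64] = { 0 }, pad[64], inner[32];
      gcry_md_hd_t hd;
      size_t i;

      if (keylen > 64)
        gcry_md_hash_buffer (GCRY_MD_SHA256, k, key, keylen); /* hash long keys */
      else
        memcpy (k, key, keylen);                              /* zero-pad short keys */

      /* inner = H((K' ^ ipad) || msg), ipad byte = 0x36 */
      gcry_md_open (&hd, GCRY_MD_SHA256, 0);
      for (i = 0; i < 64; i++)
        pad[i] = k[i] ^ 0x36;
      gcry_md_write (hd, pad, 64);
      gcry_md_write (hd, msg, msglen);
      memcpy (inner, gcry_md_read (hd, GCRY_MD_SHA256), 32);
      gcry_md_close (hd);

      /* out = H((K' ^ opad) || inner), opad byte = 0x5c */
      gcry_md_open (&hd, GCRY_MD_SHA256, 0);
      for (i = 0; i < 64; i++)
        pad[i] = k[i] ^ 0x5c;
      gcry_md_write (hd, pad, 64);
      gcry_md_write (hd, inner, 32);
      memcpy (out, gcry_md_read (hd, GCRY_MD_SHA256), 32);
      gcry_md_close (hd);
    }

Unlike this sketch, prepare_macpads hashes the two pads only once and caches the resulting partial states in the extra context copies, so later resets and finalizations do not repeat that work.
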
+
+
+gcry_err_code_t
+_gcry_md_ctl (gcry_md_hd_t hd, int cmd, void *buffer, size_t buflen)
+{
+ gcry_err_code_t rc = 0;
+
+ (void)buflen; /* Currently not used. */
+
+ switch (cmd)
+ {
+ case GCRYCTL_FINALIZE:
+ md_final (hd);
+ break;
+ case GCRYCTL_START_DUMP:
+ md_start_debug (hd, buffer);
+ break;
+ case GCRYCTL_STOP_DUMP:
+ md_stop_debug ( hd );
+ break;
+ default:
+ rc = GPG_ERR_INV_OP;
+ }
+ return rc;
+}
+
+
+gcry_err_code_t
+_gcry_md_setkey (gcry_md_hd_t hd, const void *key, size_t keylen)
+{
+ gcry_err_code_t rc;
+
+ if (hd->ctx->flags.hmac)
+ {
+ rc = prepare_macpads (hd, key, keylen);
+ if (!rc)
+ _gcry_md_reset (hd);
+ }
+ else
+ {
+ rc = md_setkey (hd, key, keylen);
+ }
+
+ return rc;
+}
+
+
+/* The new debug interface.  If SUFFIX is a string it creates a debug
+   file for the context HD.  If SUFFIX is NULL, the file is closed and
+ debugging is stopped. */
+void
+_gcry_md_debug (gcry_md_hd_t hd, const char *suffix)
+{
+ if (suffix)
+ md_start_debug (hd, suffix);
+ else
+ md_stop_debug (hd);
+}
+
+
+/****************
+ * If ALGO is null, return the digest of the enabled algorithm (there
+ * should be only one).
+ */
+static byte *
+md_read( gcry_md_hd_t a, int algo )
+{
+ GcryDigestEntry *r = a->ctx->list;
+
+ if (! algo)
+ {
+ /* Return the first algorithm */
+ if (r)
+ {
+ if (r->next)
+ log_debug ("more than one algorithm in md_read(0)\n");
+ if (r->spec->read)
+ return r->spec->read (r->context);
+ }
+ }
+ else
+ {
+ for (r = a->ctx->list; r; r = r->next)
+ if (r->spec->algo == algo)
+ {
+ if (r->spec->read)
+ return r->spec->read (r->context);
+ break;
+ }
+ }
+
+ if (r && !r->spec->read)
+ _gcry_fatal_error (GPG_ERR_DIGEST_ALGO,
+ "requested algo has no fixed digest length");
+ else
+ _gcry_fatal_error (GPG_ERR_DIGEST_ALGO, "requested algo not in md context");
+ return NULL;
+}
+
+
+/*
+ * Read out the complete digest; this function implicitly finalizes
+ * the hash.
+ */
+byte *
+_gcry_md_read (gcry_md_hd_t hd, int algo)
+{
+ /* This function is expected to always return a digest, thus we
+ can't return an error which we actually should do in
+ non-operational state. */
+ _gcry_md_ctl (hd, GCRYCTL_FINALIZE, NULL, 0);
+ return md_read (hd, algo);
+}
+
+
+/****************
+ * If ALGO is null, extract output from the enabled algorithm (there
+ * should be only one).
+ */
+static gcry_err_code_t
+md_extract(gcry_md_hd_t a, int algo, void *out, size_t outlen)
+{
+ GcryDigestEntry *r = a->ctx->list;
+
+ if (!algo)
+ {
+ /* Return the first algorithm */
+ if (r && r->spec->extract)
+ {
+ if (r->next)
+ log_debug ("more than one algorithm in md_extract(0)\n");
+ r->spec->extract (r->context, out, outlen);
+ return 0;
+ }
+ }
+ else
+ {
+ for (r = a->ctx->list; r; r = r->next)
+ if (r->spec->algo == algo && r->spec->extract)
+ {
+ r->spec->extract (r->context, out, outlen);
+ return 0;
+ }
+ }
+
+ return GPG_ERR_DIGEST_ALGO;
+}
+
+
+/*
+ * Expand the output of an XOF-class digest; this function implicitly finalizes
+ * the hash.
+ */
+gcry_err_code_t
+_gcry_md_extract (gcry_md_hd_t hd, int algo, void *out, size_t outlen)
+{
+ _gcry_md_ctl (hd, GCRYCTL_FINALIZE, NULL, 0);
+ return md_extract (hd, algo, out, outlen);
+}
+
+
+/*
+ * Read out an intermediate digest. Not yet functional.
+ */
+gcry_err_code_t
+_gcry_md_get (gcry_md_hd_t hd, int algo, byte *buffer, int buflen)
+{
+ (void)hd;
+ (void)algo;
+ (void)buffer;
+ (void)buflen;
+
+ /*md_digest ... */
+ fips_signal_error ("unimplemented function called");
+ return GPG_ERR_INTERNAL;
+}
+
+
+/*
+ * Shortcut function to hash a buffer with a given algo. The only
+ * guaranteed supported algorithms are RIPE-MD160 and SHA-1. The
+ * supplied digest buffer must be large enough to store the resulting
+ * hash.  No error is returned; the function will abort on an invalid
+ * algo. DISABLED_ALGOS are ignored here. */
+void
+_gcry_md_hash_buffer (int algo, void *digest,
+ const void *buffer, size_t length)
+{
+ gcry_md_spec_t *spec;
+
+ spec = spec_from_algo (algo);
+ if (!spec)
+ {
+ log_debug ("md_hash_buffer: algorithm %d not available\n", algo);
+ return;
+ }
+
+ if (algo == GCRY_MD_MD5 && fips_mode ())
+ {
+ _gcry_inactivate_fips_mode ("MD5 used");
+ if (_gcry_enforced_fips_mode () )
+ {
+ /* We should never get to here because we do not register
+ MD5 in enforced fips mode. */
+ _gcry_fips_noreturn ();
+ }
+ }
+
+ if (spec->hash_buffer != NULL)
+ {
+ spec->hash_buffer (digest, buffer, length);
+ }
+ else if (spec->hash_buffers != NULL)
+ {
+ gcry_buffer_t iov;
+
+ iov.size = 0;
+ iov.data = (void *)buffer;
+ iov.off = 0;
+ iov.len = length;
+
+ spec->hash_buffers (digest, &iov, 1);
+ }
+ else
+ {
+ /* For the others we do not have a fast function, so we use the
+ normal functions. */
+ gcry_md_hd_t h;
+ gpg_err_code_t err;
+
+ err = md_open (&h, algo, 0);
+ if (err)
+ log_bug ("gcry_md_open failed for algo %d: %s",
+ algo, gpg_strerror (gcry_error(err)));
+ md_write (h, (byte *) buffer, length);
+ md_final (h);
+ memcpy (digest, md_read (h, algo), md_digest_length (algo));
+ md_close (h);
+ }
+}
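
From the caller's side this is the one-shot convenience path. A minimal fragment (SHA-256 assumed, digest length 32, <gcrypt.h> included):

    unsigned char digest[32];

    gcry_md_hash_buffer (GCRY_MD_SHA256, digest, "hello", 5);
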
+
+
+/* Shortcut function to hash multiple buffers with a given algo. In
+ contrast to gcry_md_hash_buffer, this function returns an error on
+ invalid arguments or on other problems; disabled algorithms are
+ _not_ ignored but flagged as an error.
+
+ The data to sign is taken from the array IOV which has IOVCNT items.
+
+ The only supported flag in FLAGS is GCRY_MD_FLAG_HMAC which turns
+ this function into a HMAC function; the first item in IOV is then
+ used as the key.
+
+ On success 0 is returned and resulting hash or HMAC is stored at
+ DIGEST which must have been provided by the caller with an
+ appropriate length. */
+gpg_err_code_t
+_gcry_md_hash_buffers (int algo, unsigned int flags, void *digest,
+ const gcry_buffer_t *iov, int iovcnt)
+{
+ gcry_md_spec_t *spec;
+ int hmac;
+
+ if (!iov || iovcnt < 0)
+ return GPG_ERR_INV_ARG;
+ if (flags & ~(GCRY_MD_FLAG_HMAC))
+ return GPG_ERR_INV_ARG;
+
+ hmac = !!(flags & GCRY_MD_FLAG_HMAC);
+ if (hmac && iovcnt < 1)
+ return GPG_ERR_INV_ARG;
+
+ spec = spec_from_algo (algo);
+ if (!spec)
+ {
+ log_debug ("md_hash_buffers: algorithm %d not available\n", algo);
+ return GPG_ERR_DIGEST_ALGO;
+ }
+
+ if (algo == GCRY_MD_MD5 && fips_mode ())
+ {
+ _gcry_inactivate_fips_mode ("MD5 used");
+ if (_gcry_enforced_fips_mode () )
+ {
+ /* We should never get to here because we do not register
+ MD5 in enforced fips mode. */
+ _gcry_fips_noreturn ();
+ }
+ }
+
+ if (!hmac && spec->hash_buffers)
+ {
+ spec->hash_buffers (digest, iov, iovcnt);
+ }
+ else
+ {
+ /* For the others we do not have a fast function, so we use the
+ normal functions. */
+ gcry_md_hd_t h;
+ gpg_err_code_t rc;
+ int dlen;
+
+ /* Detect SHAKE128 like algorithms which we can't use because
+ * our API does not allow for a variable length digest. */
+ dlen = md_digest_length (algo);
+ if (!dlen)
+ return GPG_ERR_DIGEST_ALGO;
+
+ rc = md_open (&h, algo, (hmac? GCRY_MD_FLAG_HMAC:0));
+ if (rc)
+ return rc;
+
+ if (hmac)
+ {
+ rc = _gcry_md_setkey (h,
+ (const char*)iov[0].data + iov[0].off,
+ iov[0].len);
+ if (rc)
+ {
+ md_close (h);
+ return rc;
+ }
+ iov++; iovcnt--;
+ }
+ for (;iovcnt; iov++, iovcnt--)
+ md_write (h, (const char*)iov[0].data + iov[0].off, iov[0].len);
+ md_final (h);
+ memcpy (digest, md_read (h, algo), dlen);
+ md_close (h);
+ }
+
+ return 0;
+}
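
With GCRY_MD_FLAG_HMAC the first IOV entry supplies the key, as described in the comment above. A hedged fragment (SHA-256 assumed; <string.h> and <gcrypt.h> included):

    gcry_buffer_t iov[2];
    unsigned char mac[32];

    memset (iov, 0, sizeof iov);                   /* clears .size and .off */
    iov[0].data = "secret key";  iov[0].len = 10;  /* key comes first */
    iov[1].data = "hello";       iov[1].len = 5;   /* then the data   */

    gcry_md_hash_buffers (GCRY_MD_SHA256, GCRY_MD_FLAG_HMAC, mac, iov, 2);
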
+
+
+static int
+md_get_algo (gcry_md_hd_t a)
+{
+ GcryDigestEntry *r = a->ctx->list;
+
+ if (r && r->next)
+ {
+ fips_signal_error ("possible usage error");
+ log_error ("WARNING: more than one algorithm in md_get_algo()\n");
+ }
+ return r ? r->spec->algo : 0;
+}
+
+
+int
+_gcry_md_get_algo (gcry_md_hd_t hd)
+{
+ return md_get_algo (hd);
+}
+
+
+/****************
+ * Return the length of the digest
+ */
+static int
+md_digest_length (int algorithm)
+{
+ gcry_md_spec_t *spec;
+
+ spec = spec_from_algo (algorithm);
+ return spec? spec->mdlen : 0;
+}
+
+
+/****************
+ * Return the length of the digest in bytes.
+ * This function will return 0 in case of errors.
+ */
+unsigned int
+_gcry_md_get_algo_dlen (int algorithm)
+{
+ return md_digest_length (algorithm);
+}
+
+
+/* Hmmm: add a mode to enumerate the OIDs
+ * to make g10/sig-check.c more portable */
+static const byte *
+md_asn_oid (int algorithm, size_t *asnlen, size_t *mdlen)
+{
+ gcry_md_spec_t *spec;
+ const byte *asnoid = NULL;
+
+ spec = spec_from_algo (algorithm);
+ if (spec)
+ {
+ if (asnlen)
+ *asnlen = spec->asnlen;
+ if (mdlen)
+ *mdlen = spec->mdlen;
+ asnoid = spec->asnoid;
+ }
+ else
+ log_bug ("no ASN.1 OID for md algo %d\n", algorithm);
+
+ return asnoid;
+}
+
+
+/****************
+ * Return information about the given digest algorithm
+ * WHAT select the kind of information returned:
+ * GCRYCTL_TEST_ALGO:
+ * Returns 0 when the specified algorithm is available for use.
+ * buffer and nbytes must be zero.
+ * GCRYCTL_GET_ASNOID:
+ *     Return the ASNOID of the algorithm in buffer.  If buffer is NULL, only
+ * the required length is returned.
+ * GCRYCTL_SELFTEST
+ * Helper for the regression tests - shall not be used by applications.
+ *
+ * Note: Because this function is in most cases used to return an
+ * integer value, we can make it easier for the caller to just look at
+ * the return value. The caller will in all cases consult the value
+ * and thereby detect whether an error occurred or not (e.g. while checking
+ * the block size).
+ */
+gcry_err_code_t
+_gcry_md_algo_info (int algo, int what, void *buffer, size_t *nbytes)
+{
+ gcry_err_code_t rc;
+
+ switch (what)
+ {
+ case GCRYCTL_TEST_ALGO:
+ if (buffer || nbytes)
+ rc = GPG_ERR_INV_ARG;
+ else
+ rc = check_digest_algo (algo);
+ break;
+
+ case GCRYCTL_GET_ASNOID:
+ /* We need to check that the algo is available because
+ md_asn_oid would otherwise raise an assertion. */
+ rc = check_digest_algo (algo);
+ if (!rc)
+ {
+          const unsigned char *asn;
+ size_t asnlen;
+
+ asn = md_asn_oid (algo, &asnlen, NULL);
+ if (buffer && (*nbytes >= asnlen))
+ {
+ memcpy (buffer, asn, asnlen);
+ *nbytes = asnlen;
+ }
+ else if (!buffer && nbytes)
+ *nbytes = asnlen;
+ else
+ {
+ if (buffer)
+ rc = GPG_ERR_TOO_SHORT;
+ else
+ rc = GPG_ERR_INV_ARG;
+ }
+ }
+ break;
+
+ case GCRYCTL_SELFTEST:
+ /* Helper function for the regression tests. */
+ rc = gpg_err_code (_gcry_md_selftest (algo, nbytes? (int)*nbytes : 0,
+ NULL));
+ break;
+
+ default:
+ rc = GPG_ERR_INV_OP;
+ break;
+ }
+
+ return rc;
+}
+
+
+static void
+md_start_debug ( gcry_md_hd_t md, const char *suffix )
+{
+ static int idx=0;
+ char buf[50];
+
+ if (fips_mode ())
+ return;
+
+ if ( md->ctx->debug )
+ {
+ log_debug("Oops: md debug already started\n");
+ return;
+ }
+ idx++;
+ snprintf (buf, DIM(buf)-1, "dbgmd-%05d.%.10s", idx, suffix );
+ md->ctx->debug = fopen(buf, "w");
+ if ( !md->ctx->debug )
+ log_debug("md debug: can't open %s\n", buf );
+}
+
+
+static void
+md_stop_debug( gcry_md_hd_t md )
+{
+ if ( md->ctx->debug )
+ {
+ if ( md->bufpos )
+ md_write ( md, NULL, 0 );
+ fclose (md->ctx->debug);
+ md->ctx->debug = NULL;
+ }
+
+ { /* a kludge to pull in the __muldi3 for Solaris */
+ volatile u32 a = (u32)(uintptr_t)md;
+ volatile u64 b = 42;
+ volatile u64 c;
+ c = a * b;
+ (void)c;
+ }
+}
+
+
+
+/*
+ * Return information about the digest handle.
+ * GCRYCTL_IS_SECURE:
+ * Returns 1 when the handle works on secured memory
+ * otherwise 0 is returned. There is no error return.
+ * GCRYCTL_IS_ALGO_ENABLED:
+ * Returns 1 if the algo is enabled for that handle.
+ * The algo must be passed as the address of an int.
+ */
+gcry_err_code_t
+_gcry_md_info (gcry_md_hd_t h, int cmd, void *buffer, size_t *nbytes)
+{
+ gcry_err_code_t rc = 0;
+
+ switch (cmd)
+ {
+ case GCRYCTL_IS_SECURE:
+ *nbytes = h->ctx->flags.secure;
+ break;
+
+ case GCRYCTL_IS_ALGO_ENABLED:
+ {
+ GcryDigestEntry *r;
+ int algo;
+
+ if ( !buffer || !nbytes || *nbytes != sizeof (int))
+ rc = GPG_ERR_INV_ARG;
+ else
+ {
+ algo = *(int*)buffer;
+
+ *nbytes = 0;
+ for(r=h->ctx->list; r; r = r->next ) {
+ if (r->spec->algo == algo)
+ {
+ *nbytes = 1;
+ break;
+ }
+ }
+ }
+ break;
+ }
+
+ default:
+ rc = GPG_ERR_INV_OP;
+ }
+
+ return rc;
+}
+
+
+/* Explicitly initialize this module. */
+gcry_err_code_t
+_gcry_md_init (void)
+{
+ if (fips_mode())
+ {
+ /* disable algorithms that are disallowed in fips */
+ int idx;
+ gcry_md_spec_t *spec;
+
+ for (idx = 0; (spec = digest_list[idx]); idx++)
+ if (!spec->flags.fips)
+ spec->flags.disabled = 1;
+ }
+
+ return 0;
+}
+
+
+int
+_gcry_md_is_secure (gcry_md_hd_t a)
+{
+ size_t value;
+
+ if (_gcry_md_info (a, GCRYCTL_IS_SECURE, NULL, &value))
+ value = 1; /* It seems to be better to assume secure memory on
+ error. */
+ return value;
+}
+
+
+int
+_gcry_md_is_enabled (gcry_md_hd_t a, int algo)
+{
+ size_t value;
+
+ value = sizeof algo;
+ if (_gcry_md_info (a, GCRYCTL_IS_ALGO_ENABLED, &algo, &value))
+ value = 0;
+ return value;
+}
+
+
+/* Run the selftests for digest algorithm ALGO with optional reporting
+ function REPORT. */
+gpg_error_t
+_gcry_md_selftest (int algo, int extended, selftest_report_func_t report)
+{
+ gcry_err_code_t ec = 0;
+ gcry_md_spec_t *spec;
+
+ spec = spec_from_algo (algo);
+ if (spec && !spec->flags.disabled && spec->selftest)
+ ec = spec->selftest (algo, extended, report);
+ else
+ {
+ ec = (spec && spec->selftest) ? GPG_ERR_DIGEST_ALGO
+ /* */ : GPG_ERR_NOT_IMPLEMENTED;
+ if (report)
+ report ("digest", algo, "module",
+ (spec && !spec->flags.disabled)?
+ "no selftest available" :
+ spec? "algorithm disabled" : "algorithm not found");
+ }
+
+ return gpg_error (ec);
+}
diff --git a/comm/third_party/libgcrypt/cipher/md4.c b/comm/third_party/libgcrypt/cipher/md4.c
new file mode 100644
index 0000000000..b55443a8aa
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/md4.c
@@ -0,0 +1,296 @@
+/* md4.c - MD4 Message-Digest Algorithm
+ * Copyright (C) 2002, 2003 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ * Based on md5.c in libgcrypt, but rewritten to compute md4 checksums
+ * using a public domain md4 implementation with the following comments:
+ *
+ * Modified by Wei Dai from Andrew M. Kuchling's md4.c
+ * The original code and all modifications are in the public domain.
+ *
+ * This is the original introductory comment:
+ *
+ * md4.c : MD4 hash algorithm.
+ *
+ * Part of the Python Cryptography Toolkit, version 1.1
+ *
+ * Distribute and use freely; there are no restrictions on further
+ * dissemination and usage except those imposed by the laws of your
+ * country of residence.
+ *
+ */
+
+/* MD4 test suite:
+ * MD4 ("") = 31d6cfe0d16ae931b73c59d7e0c089c0
+ * MD4 ("a") = bde52cb31de33e46245e05fbdbd6fb24
+ * MD4 ("abc") = a448017aaf21d8525fc10ae87aa6729d
+ * MD4 ("message digest") = d9130a8164549fe818874806e1c7014b
+ * MD4 ("abcdefghijklmnopqrstuvwxyz") = d79e1c308aa5bbcdeea8ed63df412da9
+ * MD4 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789") =
+ * 043f8582f241db351ce627e153e7f0e4
+ * MD4 ("123456789012345678901234567890123456789012345678901234567890123456
+ * 78901234567890") = e33b4ddc9c38f2199c3e7b164fcc0536
+ */
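
The first vector above can be reproduced through the public one-shot call when MD4 is compiled in; a small check fragment (variable names are illustration only):

    unsigned char d[16];
    static const unsigned char expect[16] =
      { 0x31, 0xd6, 0xcf, 0xe0, 0xd1, 0x6a, 0xe9, 0x31,
        0xb7, 0x3c, 0x59, 0xd7, 0xe0, 0xc0, 0x89, 0xc0 };

    gcry_md_hash_buffer (GCRY_MD_MD4, d, "", 0);
    /* memcmp (d, expect, 16) is expected to be 0.  */
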
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+
+#include "bithelp.h"
+#include "bufhelp.h"
+#include "hash-common.h"
+
+
+typedef struct {
+ gcry_md_block_ctx_t bctx;
+ u32 A,B,C,D; /* chaining variables */
+} MD4_CONTEXT;
+
+static unsigned int
+transform ( void *c, const unsigned char *data, size_t nblks );
+
+static void
+md4_init (void *context, unsigned int flags)
+{
+ MD4_CONTEXT *ctx = context;
+
+ (void)flags;
+
+ ctx->A = 0x67452301;
+ ctx->B = 0xefcdab89;
+ ctx->C = 0x98badcfe;
+ ctx->D = 0x10325476;
+
+ ctx->bctx.nblocks = 0;
+ ctx->bctx.nblocks_high = 0;
+ ctx->bctx.count = 0;
+ ctx->bctx.blocksize_shift = _gcry_ctz(64);
+ ctx->bctx.bwrite = transform;
+}
+
+#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define G(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+
+
+/****************
+ * transform 64 bytes
+ */
+static unsigned int
+transform_blk ( void *c, const unsigned char *data )
+{
+ MD4_CONTEXT *ctx = c;
+ u32 in[16];
+ register u32 A = ctx->A;
+ register u32 B = ctx->B;
+ register u32 C = ctx->C;
+ register u32 D = ctx->D;
+ int i;
+
+ for ( i = 0; i < 16; i++ )
+ in[i] = buf_get_le32(data + i * 4);
+
+ /* Round 1. */
+#define function(a,b,c,d,k,s) a=rol(a+F(b,c,d)+in[k],s);
+ function(A,B,C,D, 0, 3);
+ function(D,A,B,C, 1, 7);
+ function(C,D,A,B, 2,11);
+ function(B,C,D,A, 3,19);
+ function(A,B,C,D, 4, 3);
+ function(D,A,B,C, 5, 7);
+ function(C,D,A,B, 6,11);
+ function(B,C,D,A, 7,19);
+ function(A,B,C,D, 8, 3);
+ function(D,A,B,C, 9, 7);
+ function(C,D,A,B,10,11);
+ function(B,C,D,A,11,19);
+ function(A,B,C,D,12, 3);
+ function(D,A,B,C,13, 7);
+ function(C,D,A,B,14,11);
+ function(B,C,D,A,15,19);
+
+#undef function
+
+ /* Round 2. */
+#define function(a,b,c,d,k,s) a=rol(a+G(b,c,d)+in[k]+0x5a827999,s);
+
+ function(A,B,C,D, 0, 3);
+ function(D,A,B,C, 4, 5);
+ function(C,D,A,B, 8, 9);
+ function(B,C,D,A,12,13);
+ function(A,B,C,D, 1, 3);
+ function(D,A,B,C, 5, 5);
+ function(C,D,A,B, 9, 9);
+ function(B,C,D,A,13,13);
+ function(A,B,C,D, 2, 3);
+ function(D,A,B,C, 6, 5);
+ function(C,D,A,B,10, 9);
+ function(B,C,D,A,14,13);
+ function(A,B,C,D, 3, 3);
+ function(D,A,B,C, 7, 5);
+ function(C,D,A,B,11, 9);
+ function(B,C,D,A,15,13);
+
+#undef function
+
+ /* Round 3. */
+#define function(a,b,c,d,k,s) a=rol(a+H(b,c,d)+in[k]+0x6ed9eba1,s);
+
+ function(A,B,C,D, 0, 3);
+ function(D,A,B,C, 8, 9);
+ function(C,D,A,B, 4,11);
+ function(B,C,D,A,12,15);
+ function(A,B,C,D, 2, 3);
+ function(D,A,B,C,10, 9);
+ function(C,D,A,B, 6,11);
+ function(B,C,D,A,14,15);
+ function(A,B,C,D, 1, 3);
+ function(D,A,B,C, 9, 9);
+ function(C,D,A,B, 5,11);
+ function(B,C,D,A,13,15);
+ function(A,B,C,D, 3, 3);
+ function(D,A,B,C,11, 9);
+ function(C,D,A,B, 7,11);
+ function(B,C,D,A,15,15);
+
+
+ /* Put checksum in context given as argument. */
+ ctx->A += A;
+ ctx->B += B;
+ ctx->C += C;
+ ctx->D += D;
+
+ return /*burn_stack*/ 80+6*sizeof(void*);
+}
+
+
+static unsigned int
+transform ( void *c, const unsigned char *data, size_t nblks )
+{
+ unsigned int burn;
+
+ do
+ {
+ burn = transform_blk (c, data);
+ data += 64;
+ }
+ while (--nblks);
+
+ return burn;
+}
+
+
+/* The routine final terminates the message-digest computation and
+ * ends with the desired message digest in mdContext->digest[0...15].
+ * The handle is prepared for a new MD4 cycle.
+ * Returns 16 bytes representing the digest.
+ */
+
+static void
+md4_final( void *context )
+{
+ MD4_CONTEXT *hd = context;
+ u32 t, th, msb, lsb;
+ byte *p;
+ unsigned int burn;
+
+ t = hd->bctx.nblocks;
+ if (sizeof t == sizeof hd->bctx.nblocks)
+ th = hd->bctx.nblocks_high;
+ else
+ th = hd->bctx.nblocks >> 32;
+
+ /* multiply by 64 to make a byte count */
+ lsb = t << 6;
+ msb = (th << 6) | (t >> 26);
+ /* add the count */
+ t = lsb;
+ if( (lsb += hd->bctx.count) < t )
+ msb++;
+ /* multiply by 8 to make a bit count */
+ t = lsb;
+ lsb <<= 3;
+ msb <<= 3;
+ msb |= t >> 29;
+
+ if (hd->bctx.count < 56) /* enough room */
+ {
+ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */
+ if (hd->bctx.count < 56)
+ memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count);
+
+ /* append the 64 bit count */
+ buf_put_le32(hd->bctx.buf + 56, lsb);
+ buf_put_le32(hd->bctx.buf + 60, msb);
+ burn = transform (hd, hd->bctx.buf, 1);
+ }
+ else /* need one extra block */
+ {
+ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */
+ /* fill pad and next block with zeroes */
+ memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56);
+
+ /* append the 64 bit count */
+ buf_put_le32(hd->bctx.buf + 64 + 56, lsb);
+ buf_put_le32(hd->bctx.buf + 64 + 60, msb);
+ burn = transform (hd, hd->bctx.buf, 2);
+ }
+
+ p = hd->bctx.buf;
+#define X(a) do { buf_put_le32(p, hd->a); p += 4; } while(0)
+ X(A);
+ X(B);
+ X(C);
+ X(D);
+#undef X
+
+ hd->bctx.count = 0;
+
+ _gcry_burn_stack (burn);
+}
+
+static byte *
+md4_read (void *context)
+{
+ MD4_CONTEXT *hd = context;
+ return hd->bctx.buf;
+}
+
+static byte asn[18] = /* Object ID is 1.2.840.113549.2.4 */
+ { 0x30, 0x20, 0x30, 0x0c, 0x06, 0x08, 0x2a, 0x86,0x48,
+ 0x86, 0xf7, 0x0d, 0x02, 0x04, 0x05, 0x00, 0x04, 0x10 };
+
+static gcry_md_oid_spec_t oid_spec_md4[] =
+ {
+ /* iso.member-body.us.rsadsi.digestAlgorithm.md4 */
+ { "1.2.840.113549.2.4" },
+ { NULL },
+ };
+
+gcry_md_spec_t _gcry_digest_spec_md4 =
+ {
+ GCRY_MD_MD4, {0, 0},
+ "MD4", asn, DIM (asn), oid_spec_md4,16,
+ md4_init, _gcry_md_block_write, md4_final, md4_read, NULL,
+ NULL, NULL,
+ sizeof (MD4_CONTEXT)
+ };
diff --git a/comm/third_party/libgcrypt/cipher/md5.c b/comm/third_party/libgcrypt/cipher/md5.c
new file mode 100644
index 0000000000..32cb535aaa
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/md5.c
@@ -0,0 +1,322 @@
+/* md5.c - MD5 Message-Digest Algorithm
+ * Copyright (C) 1995,1996,1998,1999,2001,2002,
+ * 2003 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ * According to the definition of MD5 in RFC 1321 from April 1992.
+ * NOTE: This is *not* the same file as the one from glibc.
+ * Written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
+ * heavily modified for GnuPG by Werner Koch <wk@gnupg.org>
+ */
+
+/* Test values:
+ * "" D4 1D 8C D9 8F 00 B2 04 E9 80 09 98 EC F8 42 7E
+ * "a" 0C C1 75 B9 C0 F1 B6 A8 31 C3 99 E2 69 77 26 61
+ * "abc 90 01 50 98 3C D2 4F B0 D6 96 3F 7D 28 E1 7F 72
+ * "message digest" F9 6B 69 7D 7C B7 93 8D 52 5A 2F 31 AA F1 61 D0
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+
+#include "bithelp.h"
+#include "bufhelp.h"
+#include "hash-common.h"
+
+
+typedef struct {
+ gcry_md_block_ctx_t bctx;
+ u32 A,B,C,D; /* chaining variables */
+} MD5_CONTEXT;
+
+static unsigned int
+transform ( void *ctx, const unsigned char *data, size_t datalen );
+
+static void
+md5_init( void *context, unsigned int flags)
+{
+ MD5_CONTEXT *ctx = context;
+
+ (void)flags;
+
+ ctx->A = 0x67452301;
+ ctx->B = 0xefcdab89;
+ ctx->C = 0x98badcfe;
+ ctx->D = 0x10325476;
+
+ ctx->bctx.nblocks = 0;
+ ctx->bctx.nblocks_high = 0;
+ ctx->bctx.count = 0;
+ ctx->bctx.blocksize_shift = _gcry_ctz(64);
+ ctx->bctx.bwrite = transform;
+}
+
+
+/* These are the four functions used in the four steps of the MD5 algorithm
+   and defined in RFC 1321.  The first function is a little bit optimized
+   (as found in Colin Plumb's public domain implementation). */
+/* #define FF(b, c, d) ((b & c) | (~b & d)) */
+#define FF(b, c, d) (d ^ (b & (c ^ d)))
+#define FG(b, c, d) FF (d, b, c)
+#define FH(b, c, d) (b ^ c ^ d)
+#define FI(b, c, d) (c ^ (b | ~d))
+
+
+/****************
+ * transform 64 bytes
+ */
+static unsigned int
+transform_blk ( void *c, const unsigned char *data )
+{
+ MD5_CONTEXT *ctx = c;
+ u32 correct_words[16];
+ register u32 A = ctx->A;
+ register u32 B = ctx->B;
+ register u32 C = ctx->C;
+ register u32 D = ctx->D;
+ u32 *cwp = correct_words;
+ int i;
+
+ for ( i = 0; i < 16; i++ )
+ correct_words[i] = buf_get_le32(data + i * 4);
+
+#define OP(a, b, c, d, s, T) \
+ do \
+ { \
+ a += FF (b, c, d) + (*cwp++) + T; \
+ a = rol(a, s); \
+ a += b; \
+ } \
+ while (0)
+
+ /* Before we start, one word about the strange constants.
+ They are defined in RFC 1321 as
+
+ T[i] = (int) (4294967296.0 * fabs (sin (i))), i=1..64
+ */
+
+ /* Round 1. */
+ OP (A, B, C, D, 7, 0xd76aa478);
+ OP (D, A, B, C, 12, 0xe8c7b756);
+ OP (C, D, A, B, 17, 0x242070db);
+ OP (B, C, D, A, 22, 0xc1bdceee);
+ OP (A, B, C, D, 7, 0xf57c0faf);
+ OP (D, A, B, C, 12, 0x4787c62a);
+ OP (C, D, A, B, 17, 0xa8304613);
+ OP (B, C, D, A, 22, 0xfd469501);
+ OP (A, B, C, D, 7, 0x698098d8);
+ OP (D, A, B, C, 12, 0x8b44f7af);
+ OP (C, D, A, B, 17, 0xffff5bb1);
+ OP (B, C, D, A, 22, 0x895cd7be);
+ OP (A, B, C, D, 7, 0x6b901122);
+ OP (D, A, B, C, 12, 0xfd987193);
+ OP (C, D, A, B, 17, 0xa679438e);
+ OP (B, C, D, A, 22, 0x49b40821);
+
+#undef OP
+#define OP(f, a, b, c, d, k, s, T) \
+ do \
+ { \
+ a += f (b, c, d) + correct_words[k] + T; \
+ a = rol(a, s); \
+ a += b; \
+ } \
+ while (0)
+
+ /* Round 2. */
+ OP (FG, A, B, C, D, 1, 5, 0xf61e2562);
+ OP (FG, D, A, B, C, 6, 9, 0xc040b340);
+ OP (FG, C, D, A, B, 11, 14, 0x265e5a51);
+ OP (FG, B, C, D, A, 0, 20, 0xe9b6c7aa);
+ OP (FG, A, B, C, D, 5, 5, 0xd62f105d);
+ OP (FG, D, A, B, C, 10, 9, 0x02441453);
+ OP (FG, C, D, A, B, 15, 14, 0xd8a1e681);
+ OP (FG, B, C, D, A, 4, 20, 0xe7d3fbc8);
+ OP (FG, A, B, C, D, 9, 5, 0x21e1cde6);
+ OP (FG, D, A, B, C, 14, 9, 0xc33707d6);
+ OP (FG, C, D, A, B, 3, 14, 0xf4d50d87);
+ OP (FG, B, C, D, A, 8, 20, 0x455a14ed);
+ OP (FG, A, B, C, D, 13, 5, 0xa9e3e905);
+ OP (FG, D, A, B, C, 2, 9, 0xfcefa3f8);
+ OP (FG, C, D, A, B, 7, 14, 0x676f02d9);
+ OP (FG, B, C, D, A, 12, 20, 0x8d2a4c8a);
+
+ /* Round 3. */
+ OP (FH, A, B, C, D, 5, 4, 0xfffa3942);
+ OP (FH, D, A, B, C, 8, 11, 0x8771f681);
+ OP (FH, C, D, A, B, 11, 16, 0x6d9d6122);
+ OP (FH, B, C, D, A, 14, 23, 0xfde5380c);
+ OP (FH, A, B, C, D, 1, 4, 0xa4beea44);
+ OP (FH, D, A, B, C, 4, 11, 0x4bdecfa9);
+ OP (FH, C, D, A, B, 7, 16, 0xf6bb4b60);
+ OP (FH, B, C, D, A, 10, 23, 0xbebfbc70);
+ OP (FH, A, B, C, D, 13, 4, 0x289b7ec6);
+ OP (FH, D, A, B, C, 0, 11, 0xeaa127fa);
+ OP (FH, C, D, A, B, 3, 16, 0xd4ef3085);
+ OP (FH, B, C, D, A, 6, 23, 0x04881d05);
+ OP (FH, A, B, C, D, 9, 4, 0xd9d4d039);
+ OP (FH, D, A, B, C, 12, 11, 0xe6db99e5);
+ OP (FH, C, D, A, B, 15, 16, 0x1fa27cf8);
+ OP (FH, B, C, D, A, 2, 23, 0xc4ac5665);
+
+ /* Round 4. */
+ OP (FI, A, B, C, D, 0, 6, 0xf4292244);
+ OP (FI, D, A, B, C, 7, 10, 0x432aff97);
+ OP (FI, C, D, A, B, 14, 15, 0xab9423a7);
+ OP (FI, B, C, D, A, 5, 21, 0xfc93a039);
+ OP (FI, A, B, C, D, 12, 6, 0x655b59c3);
+ OP (FI, D, A, B, C, 3, 10, 0x8f0ccc92);
+ OP (FI, C, D, A, B, 10, 15, 0xffeff47d);
+ OP (FI, B, C, D, A, 1, 21, 0x85845dd1);
+ OP (FI, A, B, C, D, 8, 6, 0x6fa87e4f);
+ OP (FI, D, A, B, C, 15, 10, 0xfe2ce6e0);
+ OP (FI, C, D, A, B, 6, 15, 0xa3014314);
+ OP (FI, B, C, D, A, 13, 21, 0x4e0811a1);
+ OP (FI, A, B, C, D, 4, 6, 0xf7537e82);
+ OP (FI, D, A, B, C, 11, 10, 0xbd3af235);
+ OP (FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
+ OP (FI, B, C, D, A, 9, 21, 0xeb86d391);
+
+ /* Put checksum in context given as argument. */
+ ctx->A += A;
+ ctx->B += B;
+ ctx->C += C;
+ ctx->D += D;
+
+ return /*burn_stack*/ 80+6*sizeof(void*);
+}
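
The comment inside transform_blk() gives the origin of the 64 round constants.  A tiny stand-alone program (editor's sketch, not part of the patch) reproduces the first four Round 1 values used in the OP() calls above:

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Editor's sketch (not part of the patch): reproduce the first MD5 round
 * constants from the RFC 1321 formula T[i] = floor(2^32 * |sin(i)|), with
 * i in radians.  Expected output: d76aa478 e8c7b756 242070db c1bdceee. */
int main (void)
{
  int i;

  for (i = 1; i <= 4; i++)
    printf ("%08x\n",
            (uint32_t) floor (4294967296.0 * fabs (sin ((double) i))));
  return 0;
}
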
+
+
+static unsigned int
+transform ( void *c, const unsigned char *data, size_t nblks )
+{
+ unsigned int burn;
+
+ do
+ {
+ burn = transform_blk (c, data);
+ data += 64;
+ }
+ while (--nblks);
+
+ return burn;
+}
+
+
+/* The routine final terminates the message-digest computation and
+ * ends with the desired message digest in mdContext->digest[0...15].
+ * The handle is prepared for a new MD5 cycle.
+ * Returns 16 bytes representing the digest.
+ */
+
+static void
+md5_final( void *context)
+{
+ MD5_CONTEXT *hd = context;
+ u32 t, th, msb, lsb;
+ byte *p;
+ unsigned int burn;
+
+ t = hd->bctx.nblocks;
+ if (sizeof t == sizeof hd->bctx.nblocks)
+ th = hd->bctx.nblocks_high;
+ else
+ th = hd->bctx.nblocks >> 32;
+
+ /* multiply by 64 to make a byte count */
+ lsb = t << 6;
+ msb = (th << 6) | (t >> 26);
+ /* add the count */
+ t = lsb;
+ if( (lsb += hd->bctx.count) < t )
+ msb++;
+ /* multiply by 8 to make a bit count */
+ t = lsb;
+ lsb <<= 3;
+ msb <<= 3;
+ msb |= t >> 29;
+
+ if (hd->bctx.count < 56) /* enough room */
+ {
+ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */
+ if (hd->bctx.count < 56)
+ memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count);
+
+ /* append the 64 bit count */
+ buf_put_le32(hd->bctx.buf + 56, lsb);
+ buf_put_le32(hd->bctx.buf + 60, msb);
+ burn = transform (hd, hd->bctx.buf, 1);
+ }
+ else /* need one extra block */
+ {
+ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */
+ /* fill pad and next block with zeroes */
+ memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56);
+
+ /* append the 64 bit count */
+ buf_put_le32(hd->bctx.buf + 64 + 56, lsb);
+ buf_put_le32(hd->bctx.buf + 64 + 60, msb);
+ burn = transform (hd, hd->bctx.buf, 2);
+ }
+
+ p = hd->bctx.buf;
+#define X(a) do { buf_put_le32(p, hd->a); p += 4; } while(0)
+ X(A);
+ X(B);
+ X(C);
+ X(D);
+#undef X
+
+ hd->bctx.count = 0;
+
+ _gcry_burn_stack (burn);
+}
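
The lsb/msb computation in md5_final() above is simply the total message length in bits carried in two 32-bit words: the block counter is scaled to bytes (<< 6), the buffered byte count is added, and the sum is scaled to bits (<< 3) with the spilled high bits collected in msb.  Condensed into 64-bit arithmetic (editor's sketch, not part of the patch; the counter values are hypothetical):

#include <stdint.h>
#include <stdio.h>

/* Editor's sketch (not part of the patch): the 64-bit length field that
 * md5_final() appends, for hypothetical running totals of `nblocks` full
 * 64-byte blocks plus `count` buffered bytes. */
int main (void)
{
  uint32_t nblocks = 3, count = 10;              /* hypothetical totals */
  uint64_t bits = ((uint64_t) nblocks * 64 + count) * 8;

  printf ("lsb=%08x msb=%08x\n", (uint32_t) bits, (uint32_t) (bits >> 32));
  return 0;
}
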
+
+static byte *
+md5_read( void *context )
+{
+ MD5_CONTEXT *hd = (MD5_CONTEXT *) context;
+ return hd->bctx.buf;
+}
+
+static byte asn[18] = /* Object ID is 1.2.840.113549.2.5 */
+ { 0x30, 0x20, 0x30, 0x0c, 0x06, 0x08, 0x2a, 0x86,0x48,
+ 0x86, 0xf7, 0x0d, 0x02, 0x05, 0x05, 0x00, 0x04, 0x10 };
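
These 18 bytes are the DER-encoded DigestInfo prefix placed in front of the 16-byte digest for PKCS#1 v1.5 signatures.  Decoded (editor's note):

  30 20                               SEQUENCE, 0x20 = 32 bytes follow (prefix + digest)
     30 0c                            SEQUENCE, 12 bytes
        06 08 2a 86 48 86 f7 0d 02 05    OID 1.2.840.113549.2.5 (md5)
        05 00                         NULL (no parameters)
     04 10                            OCTET STRING, 16 bytes: the MD5 digest follows
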
+
+static gcry_md_oid_spec_t oid_spec_md5[] =
+ {
+ /* iso.member-body.us.rsadsi.pkcs.pkcs-1.4 (md5WithRSAEncryption) */
+ { "1.2.840.113549.1.1.4" },
+ /* RSADSI digestAlgorithm MD5 */
+ { "1.2.840.113549.2.5" },
+ { NULL },
+ };
+
+gcry_md_spec_t _gcry_digest_spec_md5 =
+ {
+ GCRY_MD_MD5, {0, 0},
+ "MD5", asn, DIM (asn), oid_spec_md5, 16,
+ md5_init, _gcry_md_block_write, md5_final, md5_read, NULL,
+ NULL, NULL,
+ sizeof (MD5_CONTEXT)
+ };
diff --git a/comm/third_party/libgcrypt/cipher/poly1305-internal.h b/comm/third_party/libgcrypt/cipher/poly1305-internal.h
new file mode 100644
index 0000000000..19cee5f6f3
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/poly1305-internal.h
@@ -0,0 +1,64 @@
+/* poly1305-internal.h - Poly1305 internals
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef G10_POLY1305_INTERNAL_H
+#define G10_POLY1305_INTERNAL_H
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "types.h"
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+
+#define POLY1305_TAGLEN 16
+#define POLY1305_KEYLEN 32
+#define POLY1305_BLOCKSIZE 16
+
+
+typedef struct
+{
+ u32 k[4];
+ u32 r[4];
+ u32 h[5];
+} POLY1305_STATE;
+
+typedef struct poly1305_context_s
+{
+ POLY1305_STATE state;
+ byte buffer[POLY1305_BLOCKSIZE];
+ unsigned int leftover;
+} poly1305_context_t;
+
+
+gcry_err_code_t _gcry_poly1305_init (poly1305_context_t *ctx, const byte *key,
+ size_t keylen);
+
+void _gcry_poly1305_finish (poly1305_context_t *ctx,
+ byte mac[POLY1305_TAGLEN]);
+
+void _gcry_poly1305_update (poly1305_context_t *ctx, const byte *buf,
+ size_t buflen);
+
+unsigned int _gcry_poly1305_update_burn (poly1305_context_t *ctx,
+ const byte *m, size_t bytes);
+
+#endif /* G10_POLY1305_INTERNAL_H */
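
For orientation, the functions declared above are driven in the usual one-shot MAC pattern.  The sketch below mirrors the internal poly1305_auth() helper defined later in this patch in poly1305.c; it illustrates the internal API and is not a public libgcrypt interface (editor's sketch, not part of the patch):

/* Editor's sketch (not part of the patch): one-shot Poly1305 tag over a
 * message, using the internal API declared above. */
static void
compute_tag (byte mac[POLY1305_TAGLEN], const byte *msg, size_t msglen,
             const byte key[POLY1305_KEYLEN])
{
  poly1305_context_t ctx;

  memset (&ctx, 0, sizeof ctx);
  _gcry_poly1305_init (&ctx, key, POLY1305_KEYLEN);  /* clamp r, store s */
  _gcry_poly1305_update (&ctx, msg, msglen);         /* absorb the message */
  _gcry_poly1305_finish (&ctx, mac);                 /* write the 16-byte tag */
  wipememory (&ctx, sizeof ctx);                     /* clear key material */
}
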
diff --git a/comm/third_party/libgcrypt/cipher/poly1305-s390x.S b/comm/third_party/libgcrypt/cipher/poly1305-s390x.S
new file mode 100644
index 0000000000..844245f6ad
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/poly1305-s390x.S
@@ -0,0 +1,87 @@
+/* poly1305-s390x.S - zSeries implementation of Poly1305
+ *
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_S390X)
+
+#include "asm-poly1305-s390x.h"
+
+.text
+
+.balign 8
+.globl _gcry_poly1305_s390x_blocks1
+ELF(.type _gcry_poly1305_s390x_blocks1,@function;)
+
+_gcry_poly1305_s390x_blocks1:
+ /* input:
+ * %r2: poly1305-state
+ * %r3: src
+ * %r4: len
+ * %r5: high_pad
+ */
+ CFI_STARTPROC();
+
+ stmg %r6, %r14, 6 * 8(%r15);
+
+ lgr POLY_RSTATE, %r2;
+ lgr POLY_RSRC, %r3;
+ srlg %r0, %r4, 4;
+
+ cgije %r5, 0, .Lpoly_high0;
+
+ POLY1305_LOAD_STATE();
+
+.balign 4
+.Lpoly_loop_high1:
+ POLY1305_BLOCK_PART1(0 * 16);
+ INC_POLY1305_SRC(1 * 16);
+.Lpoly_block_part2:
+ POLY1305_BLOCK_PART2();
+ POLY1305_BLOCK_PART3();
+ POLY1305_BLOCK_PART4();
+ POLY1305_BLOCK_PART5();
+ POLY1305_BLOCK_PART6();
+ POLY1305_BLOCK_PART7();
+ POLY1305_BLOCK_PART8();
+
+ brctg %r0, .Lpoly_loop_high1;
+
+.balign 4
+.Lpoly_done:
+ POLY1305_STORE_STATE();
+
+ lmg %r6, %r14, 6 * 8(%r15);
+ xgr %r2, %r2;
+ br %r14;
+
+.balign 4
+.Lpoly_high0:
+ lghi %r0, 1;
+ POLY1305_LOAD_STATE();
+ POLY1305_BLOCK_PART1_HB(0 * 16, 0);
+ j .Lpoly_block_part2;
+
+ CFI_ENDPROC();
+ELF(.size _gcry_poly1305_s390x_blocks1,
+ .-_gcry_poly1305_s390x_blocks1;)
+
+#endif /*HAVE_GCC_INLINE_ASM_S390X*/
+#endif /*__s390x__*/
diff --git a/comm/third_party/libgcrypt/cipher/poly1305.c b/comm/third_party/libgcrypt/cipher/poly1305.c
new file mode 100644
index 0000000000..6cb4d2b72d
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/poly1305.c
@@ -0,0 +1,740 @@
+/* poly1305.c - Poly1305 internals and generic implementation
+ * Copyright (C) 2014,2017,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "types.h"
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "poly1305-internal.h"
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+
+static const char *selftest (void);
+
+
+#undef HAVE_ASM_POLY1305_BLOCKS
+
+
+#undef USE_MPI_64BIT
+#undef USE_MPI_32BIT
+#if BYTES_PER_MPI_LIMB == 8 && defined(HAVE_TYPE_U64)
+# define USE_MPI_64BIT 1
+#elif BYTES_PER_MPI_LIMB == 4
+# define USE_MPI_32BIT 1
+#else
+# error please implement for this limb size.
+#endif
+
+
+/* USE_S390X_ASM indicates whether to enable zSeries code. */
+#undef USE_S390X_ASM
+#if BYTES_PER_MPI_LIMB == 8
+# if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9
+# if defined(HAVE_GCC_INLINE_ASM_S390X)
+# define USE_S390X_ASM 1
+#  endif /* HAVE_GCC_INLINE_ASM_S390X */
+# endif
+#endif
+
+
+#ifdef USE_S390X_ASM
+
+#define HAVE_ASM_POLY1305_BLOCKS 1
+
+extern unsigned int _gcry_poly1305_s390x_blocks1(void *state,
+ const byte *buf, size_t len,
+ byte high_pad);
+
+static unsigned int
+poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
+ byte high_pad)
+{
+ return _gcry_poly1305_s390x_blocks1(&ctx->state, buf, len, high_pad);
+}
+
+#endif /* USE_S390X_ASM */
+
+
+static void poly1305_init (poly1305_context_t *ctx,
+ const byte key[POLY1305_KEYLEN])
+{
+ POLY1305_STATE *st = &ctx->state;
+
+ ctx->leftover = 0;
+
+ st->h[0] = 0;
+ st->h[1] = 0;
+ st->h[2] = 0;
+ st->h[3] = 0;
+ st->h[4] = 0;
+
+ st->r[0] = buf_get_le32(key + 0) & 0x0fffffff;
+ st->r[1] = buf_get_le32(key + 4) & 0x0ffffffc;
+ st->r[2] = buf_get_le32(key + 8) & 0x0ffffffc;
+ st->r[3] = buf_get_le32(key + 12) & 0x0ffffffc;
+
+ st->k[0] = buf_get_le32(key + 16);
+ st->k[1] = buf_get_le32(key + 20);
+ st->k[2] = buf_get_le32(key + 24);
+ st->k[3] = buf_get_le32(key + 28);
+}
+
+
+#ifdef USE_MPI_64BIT
+
+#if defined (__aarch64__) && __GNUC__ >= 4
+
+/* A += B (armv8/aarch64) */
+#define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
+ __asm__ ("adds %0, %3, %0\n" \
+ "adcs %1, %4, %1\n" \
+ "adc %2, %5, %2\n" \
+ : "+r" (A0), "+r" (A1), "+r" (A2) \
+ : "r" (B0), "r" (B1), "r" (B2) \
+ : "cc" )
+
+#endif /* __aarch64__ */
+
+#if defined (__x86_64__) && __GNUC__ >= 4
+
+/* A += B (x86-64) */
+#define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
+ __asm__ ("addq %3, %0\n" \
+ "adcq %4, %1\n" \
+ "adcq %5, %2\n" \
+ : "+r" (A0), "+r" (A1), "+r" (A2) \
+ : "g" (B0), "g" (B1), "g" (B2) \
+ : "cc" )
+
+#endif /* __x86_64__ */
+
+#if defined (__powerpc__) && __GNUC__ >= 4
+
+/* A += B (ppc64) */
+#define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
+ __asm__ ("addc %0, %3, %0\n" \
+ "adde %1, %4, %1\n" \
+ "adde %2, %5, %2\n" \
+ : "+r" (A0), "+r" (A1), "+r" (A2) \
+ : "r" (B0), "r" (B1), "r" (B2) \
+ : "cc" )
+
+#endif /* __powerpc__ */
+
+#ifndef ADD_1305_64
+/* A += B (generic, mpi) */
+# define ADD_1305_64(A2, A1, A0, B2, B1, B0) do { \
+ u64 carry; \
+ add_ssaaaa(carry, A0, 0, A0, 0, B0); \
+ add_ssaaaa(A2, A1, A2, A1, B2, B1); \
+ add_ssaaaa(A2, A1, A2, A1, 0, carry); \
+ } while (0)
+#endif
+
+/* H = H * R mod 2¹³⁰-5 */
+#define MUL_MOD_1305_64(H2, H1, H0, R1, R0, R1_MULT5) do { \
+ u64 x0_lo, x0_hi, x1_lo, x1_hi; \
+ u64 t0_lo, t0_hi, t1_lo, t1_hi; \
+ \
+ /* x = a * r (partial mod 2^130-5) */ \
+ umul_ppmm(x0_hi, x0_lo, H0, R0); /* h0 * r0 */ \
+ umul_ppmm(x1_hi, x1_lo, H0, R1); /* h0 * r1 */ \
+ \
+ umul_ppmm(t0_hi, t0_lo, H1, R1_MULT5); /* h1 * r1 mod 2^130-5 */ \
+ add_ssaaaa(x0_hi, x0_lo, x0_hi, x0_lo, t0_hi, t0_lo); \
+ umul_ppmm(t1_hi, t1_lo, H1, R0); /* h1 * r0 */ \
+ add_ssaaaa(x1_hi, x1_lo, x1_hi, x1_lo, t1_hi, t1_lo); \
+ \
+ t1_lo = H2 * R1_MULT5; /* h2 * r1 mod 2^130-5 */ \
+ t1_hi = H2 * R0; /* h2 * r0 */ \
+ add_ssaaaa(H0, H1, x1_hi, x1_lo, t1_hi, t1_lo); \
+ \
+ /* carry propagation */ \
+ H2 = H0 & 3; \
+ H0 = (H0 >> 2) * 5; /* msb mod 2^130-5 */ \
+ ADD_1305_64(H2, H1, H0, (u64)0, x0_hi, x0_lo); \
+ } while (0)
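
The R1_MULT5 parameter and the final (H0 >> 2) * 5 step both rest on the same identity: 2^130 == 5 (mod 2^130 - 5).  For the cross term the reduction works out as follows (editor's note; r1's low two bits are zero because of the clamping in poly1305_init):

  (h1 * 2^64) * (r1 * 2^64) = h1 * r1 * 2^128
                            = h1 * (r1 >> 2) * 2^130     (low two bits of r1 are 0)
                            == h1 * (r1 >> 2) * 5        (2^130 == 5 mod 2^130 - 5)
                            = h1 * ((r1 >> 2) + r1)      (5*x = 4*x + x, and 4*(r1 >> 2) = r1)
                            = h1 * R1_MULT5

The carry-propagation step folds bits at position 130 and above back in the same way: the two low bits of the top limb stay in H2, and everything above them is multiplied by 5 and added back at the bottom.
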
+
+#ifndef HAVE_ASM_POLY1305_BLOCKS
+
+static unsigned int
+poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
+ byte high_pad)
+{
+ POLY1305_STATE *st = &ctx->state;
+ u64 r0, r1, r1_mult5;
+ u64 h0, h1, h2;
+ u64 m0, m1, m2;
+
+ m2 = high_pad;
+
+ h0 = st->h[0] + ((u64)st->h[1] << 32);
+ h1 = st->h[2] + ((u64)st->h[3] << 32);
+ h2 = st->h[4];
+
+ r0 = st->r[0] + ((u64)st->r[1] << 32);
+ r1 = st->r[2] + ((u64)st->r[3] << 32);
+
+ r1_mult5 = (r1 >> 2) + r1;
+
+ m0 = buf_get_le64(buf + 0);
+ m1 = buf_get_le64(buf + 8);
+ buf += POLY1305_BLOCKSIZE;
+ len -= POLY1305_BLOCKSIZE;
+
+ while (len >= POLY1305_BLOCKSIZE)
+ {
+ /* a = h + m */
+ ADD_1305_64(h2, h1, h0, m2, m1, m0);
+
+ m0 = buf_get_le64(buf + 0);
+ m1 = buf_get_le64(buf + 8);
+
+ /* h = a * r (partial mod 2^130-5) */
+ MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5);
+
+ buf += POLY1305_BLOCKSIZE;
+ len -= POLY1305_BLOCKSIZE;
+ }
+
+ /* a = h + m */
+ ADD_1305_64(h2, h1, h0, m2, m1, m0);
+
+ /* h = a * r (partial mod 2^130-5) */
+ MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5);
+
+ st->h[0] = h0;
+ st->h[1] = h0 >> 32;
+ st->h[2] = h1;
+ st->h[3] = h1 >> 32;
+ st->h[4] = h2;
+
+ return 6 * sizeof (void *) + 18 * sizeof (u64);
+}
+
+#endif /* !HAVE_ASM_POLY1305_BLOCKS */
+
+static unsigned int poly1305_final (poly1305_context_t *ctx,
+ byte mac[POLY1305_TAGLEN])
+{
+ POLY1305_STATE *st = &ctx->state;
+ unsigned int burn = 0;
+ u64 u, carry;
+ u64 k0, k1;
+ u64 h0, h1;
+ u64 h2;
+
+ /* process the remaining block */
+ if (ctx->leftover)
+ {
+ ctx->buffer[ctx->leftover++] = 1;
+ if (ctx->leftover < POLY1305_BLOCKSIZE)
+ {
+ memset (&ctx->buffer[ctx->leftover], 0,
+ POLY1305_BLOCKSIZE - ctx->leftover);
+ ctx->leftover = POLY1305_BLOCKSIZE;
+ }
+ burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 0);
+ }
+
+ h0 = st->h[0] + ((u64)st->h[1] << 32);
+ h1 = st->h[2] + ((u64)st->h[3] << 32);
+ h2 = st->h[4];
+
+ k0 = st->k[0] + ((u64)st->k[1] << 32);
+ k1 = st->k[2] + ((u64)st->k[3] << 32);
+
+ /* check if h is more than 2^130-5, by adding 5. */
+ add_ssaaaa(carry, u, 0, h0, 0, 5);
+ add_ssaaaa(carry, u, 0, carry, 0, h1);
+ u = (carry + h2) >> 2; /* u == 0 or 1 */
+
+ /* minus 2^130-5 ... (+5) */
+ u = (-u) & 5;
+ add_ssaaaa(h1, h0, h1, h0, 0, u);
+
+ /* add high part of key + h */
+ add_ssaaaa(h1, h0, h1, h0, k1, k0);
+ buf_put_le64(mac + 0, h0);
+ buf_put_le64(mac + 8, h1);
+
+ /* burn_stack */
+ return 4 * sizeof (void *) + 7 * sizeof (u64) + burn;
+}
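
The (-u) & 5 step in poly1305_final() is the constant-time form of the conditional subtraction of the modulus (editor's note): u is 1 exactly when h >= 2^130 - 5, so (-u) & 5 is 5 in that case and 0 otherwise.  Adding 5 gives h + 5 = (h - (2^130 - 5)) + 2^130, and since only the low 128 bits of h + s are written to the tag, the extra 2^130 is discarded, i.e.

  u == 1:  tag = (h - (2^130 - 5) + s) mod 2^128
  u == 0:  tag = (h + s) mod 2^128
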
+
+#endif /* USE_MPI_64BIT */
+
+#ifdef USE_MPI_32BIT
+
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+/* HI:LO += A * B (arm) */
+#define UMUL_ADD_32(HI, LO, A, B) \
+ __asm__ ("umlal %1, %0, %4, %5" \
+ : "=r" (HI), "=r" (LO) \
+ : "0" (HI), "1" (LO), "r" (A), "r" (B) )
+
+/* A += B (arm) */
+#define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \
+ __asm__ ("adds %0, %0, %5\n" \
+ "adcs %1, %1, %6\n" \
+ "adcs %2, %2, %7\n" \
+ "adcs %3, %3, %8\n" \
+ "adc %4, %4, %9\n" \
+ : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), "+r" (A4) \
+ : "r" (B0), "r" (B1), "r" (B2), "r" (B3), "r" (B4) \
+ : "cc" )
+
+#endif /* HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS */
+
+#if defined (__i386__) && __GNUC__ >= 4
+
+/* A += B (i386) */
+#define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \
+ __asm__ ("addl %5, %0\n" \
+ "adcl %6, %1\n" \
+ "adcl %7, %2\n" \
+ "adcl %8, %3\n" \
+ "adcl %9, %4\n" \
+ : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), "+r" (A4) \
+ : "g" (B0), "g" (B1), "g" (B2), "g" (B3), "g" (B4) \
+ : "cc" )
+
+#endif /* __i386__ */
+
+#ifndef UMUL_ADD_32
+/* HI:LO += A * B (generic, mpi) */
+# define UMUL_ADD_32(HI, LO, A, B) do { \
+ u32 t_lo, t_hi; \
+ umul_ppmm(t_hi, t_lo, A, B); \
+ add_ssaaaa(HI, LO, HI, LO, t_hi, t_lo); \
+ } while (0)
+#endif
+
+#ifndef ADD_1305_32
+/* A += B (generic, mpi) */
+# define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \
+ u32 carry0, carry1, carry2; \
+ add_ssaaaa(carry0, A0, 0, A0, 0, B0); \
+ add_ssaaaa(carry1, A1, 0, A1, 0, B1); \
+ add_ssaaaa(carry1, A1, carry1, A1, 0, carry0); \
+ add_ssaaaa(carry2, A2, 0, A2, 0, B2); \
+ add_ssaaaa(carry2, A2, carry2, A2, 0, carry1); \
+ add_ssaaaa(A4, A3, A4, A3, B4, B3); \
+ add_ssaaaa(A4, A3, A4, A3, 0, carry2); \
+ } while (0)
+#endif
+
+/* H = H * R mod 2¹³⁰-5 */
+#define MUL_MOD_1305_32(H4, H3, H2, H1, H0, R3, R2, R1, R0, \
+ R3_MULT5, R2_MULT5, R1_MULT5) do { \
+ u32 x0_lo, x0_hi, x1_lo, x1_hi, x2_lo, x2_hi, x3_lo, x3_hi; \
+ u32 t0_lo, t0_hi; \
+ \
+ /* x = a * r (partial mod 2^130-5) */ \
+ umul_ppmm(x0_hi, x0_lo, H0, R0); /* h0 * r0 */ \
+ umul_ppmm(x1_hi, x1_lo, H0, R1); /* h0 * r1 */ \
+ umul_ppmm(x2_hi, x2_lo, H0, R2); /* h0 * r2 */ \
+ umul_ppmm(x3_hi, x3_lo, H0, R3); /* h0 * r3 */ \
+ \
+ UMUL_ADD_32(x0_hi, x0_lo, H1, R3_MULT5); /* h1 * r3 mod 2^130-5 */ \
+ UMUL_ADD_32(x1_hi, x1_lo, H1, R0); /* h1 * r0 */ \
+ UMUL_ADD_32(x2_hi, x2_lo, H1, R1); /* h1 * r1 */ \
+ UMUL_ADD_32(x3_hi, x3_lo, H1, R2); /* h1 * r2 */ \
+ \
+ UMUL_ADD_32(x0_hi, x0_lo, H2, R2_MULT5); /* h2 * r2 mod 2^130-5 */ \
+ UMUL_ADD_32(x1_hi, x1_lo, H2, R3_MULT5); /* h2 * r3 mod 2^130-5 */ \
+ UMUL_ADD_32(x2_hi, x2_lo, H2, R0); /* h2 * r0 */ \
+ UMUL_ADD_32(x3_hi, x3_lo, H2, R1); /* h2 * r1 */ \
+ \
+ UMUL_ADD_32(x0_hi, x0_lo, H3, R1_MULT5); /* h3 * r1 mod 2^130-5 */ \
+ H1 = x0_hi; \
+ UMUL_ADD_32(x1_hi, x1_lo, H3, R2_MULT5); /* h3 * r2 mod 2^130-5 */ \
+ UMUL_ADD_32(x2_hi, x2_lo, H3, R3_MULT5); /* h3 * r3 mod 2^130-5 */ \
+ UMUL_ADD_32(x3_hi, x3_lo, H3, R0); /* h3 * r0 */ \
+ \
+ t0_lo = H4 * R1_MULT5; /* h4 * r1 mod 2^130-5 */ \
+ t0_hi = H4 * R2_MULT5; /* h4 * r2 mod 2^130-5 */ \
+ add_ssaaaa(H2, x1_lo, x1_hi, x1_lo, 0, t0_lo); \
+ add_ssaaaa(H3, x2_lo, x2_hi, x2_lo, 0, t0_hi); \
+ t0_lo = H4 * R3_MULT5; /* h4 * r3 mod 2^130-5 */ \
+ t0_hi = H4 * R0; /* h4 * r0 */ \
+ add_ssaaaa(H4, x3_lo, x3_hi, x3_lo, t0_hi, t0_lo); \
+ \
+ /* carry propagation */ \
+ H0 = (H4 >> 2) * 5; /* msb mod 2^130-5 */ \
+ H4 = H4 & 3; \
+ ADD_1305_32(H4, H3, H2, H1, H0, 0, x3_lo, x2_lo, x1_lo, x0_lo); \
+ } while (0)
+
+#ifndef HAVE_ASM_POLY1305_BLOCKS
+
+static unsigned int
+poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
+ byte high_pad)
+{
+ POLY1305_STATE *st = &ctx->state;
+ u32 r1_mult5, r2_mult5, r3_mult5;
+ u32 h0, h1, h2, h3, h4;
+ u32 m0, m1, m2, m3, m4;
+
+ m4 = high_pad;
+
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+ h3 = st->h[3];
+ h4 = st->h[4];
+
+ r1_mult5 = (st->r[1] >> 2) + st->r[1];
+ r2_mult5 = (st->r[2] >> 2) + st->r[2];
+ r3_mult5 = (st->r[3] >> 2) + st->r[3];
+
+ while (len >= POLY1305_BLOCKSIZE)
+ {
+ m0 = buf_get_le32(buf + 0);
+ m1 = buf_get_le32(buf + 4);
+ m2 = buf_get_le32(buf + 8);
+ m3 = buf_get_le32(buf + 12);
+
+ /* a = h + m */
+ ADD_1305_32(h4, h3, h2, h1, h0, m4, m3, m2, m1, m0);
+
+ /* h = a * r (partial mod 2^130-5) */
+ MUL_MOD_1305_32(h4, h3, h2, h1, h0,
+ st->r[3], st->r[2], st->r[1], st->r[0],
+ r3_mult5, r2_mult5, r1_mult5);
+
+ buf += POLY1305_BLOCKSIZE;
+ len -= POLY1305_BLOCKSIZE;
+ }
+
+ st->h[0] = h0;
+ st->h[1] = h1;
+ st->h[2] = h2;
+ st->h[3] = h3;
+ st->h[4] = h4;
+
+ return 6 * sizeof (void *) + 28 * sizeof (u32);
+}
+
+#endif /* !HAVE_ASM_POLY1305_BLOCKS */
+
+static unsigned int poly1305_final (poly1305_context_t *ctx,
+ byte mac[POLY1305_TAGLEN])
+{
+ POLY1305_STATE *st = &ctx->state;
+ unsigned int burn = 0;
+ u32 carry, tmp0, tmp1, tmp2, u;
+ u32 h4, h3, h2, h1, h0;
+
+ /* process the remaining block */
+ if (ctx->leftover)
+ {
+ ctx->buffer[ctx->leftover++] = 1;
+ if (ctx->leftover < POLY1305_BLOCKSIZE)
+ {
+ memset (&ctx->buffer[ctx->leftover], 0,
+ POLY1305_BLOCKSIZE - ctx->leftover);
+ ctx->leftover = POLY1305_BLOCKSIZE;
+ }
+ burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 0);
+ }
+
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+ h3 = st->h[3];
+ h4 = st->h[4];
+
+ /* check if h is more than 2^130-5, by adding 5. */
+ add_ssaaaa(carry, tmp0, 0, h0, 0, 5);
+ add_ssaaaa(carry, tmp0, 0, carry, 0, h1);
+ add_ssaaaa(carry, tmp0, 0, carry, 0, h2);
+ add_ssaaaa(carry, tmp0, 0, carry, 0, h3);
+ u = (carry + h4) >> 2; /* u == 0 or 1 */
+
+ /* minus 2^130-5 ... (+5) */
+ u = (-u) & 5;
+ add_ssaaaa(carry, h0, 0, h0, 0, u);
+ add_ssaaaa(carry, h1, 0, h1, 0, carry);
+ add_ssaaaa(carry, h2, 0, h2, 0, carry);
+ add_ssaaaa(carry, h3, 0, h3, 0, carry);
+
+ /* add high part of key + h */
+ add_ssaaaa(tmp0, h0, 0, h0, 0, st->k[0]);
+ add_ssaaaa(tmp1, h1, 0, h1, 0, st->k[1]);
+ add_ssaaaa(tmp1, h1, tmp1, h1, 0, tmp0);
+ add_ssaaaa(tmp2, h2, 0, h2, 0, st->k[2]);
+ add_ssaaaa(tmp2, h2, tmp2, h2, 0, tmp1);
+ add_ssaaaa(carry, h3, 0, h3, 0, st->k[3]);
+ h3 += tmp2;
+
+ buf_put_le32(mac + 0, h0);
+ buf_put_le32(mac + 4, h1);
+ buf_put_le32(mac + 8, h2);
+ buf_put_le32(mac + 12, h3);
+
+ /* burn_stack */
+ return 4 * sizeof (void *) + 10 * sizeof (u32) + burn;
+}
+
+#endif /* USE_MPI_32BIT */
+
+
+unsigned int
+_gcry_poly1305_update_burn (poly1305_context_t *ctx, const byte *m,
+ size_t bytes)
+{
+ unsigned int burn = 0;
+
+ /* handle leftover */
+ if (ctx->leftover)
+ {
+ size_t want = (POLY1305_BLOCKSIZE - ctx->leftover);
+ if (want > bytes)
+ want = bytes;
+ buf_cpy (ctx->buffer + ctx->leftover, m, want);
+ bytes -= want;
+ m += want;
+ ctx->leftover += want;
+ if (ctx->leftover < POLY1305_BLOCKSIZE)
+ return 0;
+ burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 1);
+ ctx->leftover = 0;
+ }
+
+ /* process full blocks */
+ if (bytes >= POLY1305_BLOCKSIZE)
+ {
+ size_t nblks = bytes / POLY1305_BLOCKSIZE;
+ burn = poly1305_blocks (ctx, m, nblks * POLY1305_BLOCKSIZE, 1);
+ m += nblks * POLY1305_BLOCKSIZE;
+ bytes -= nblks * POLY1305_BLOCKSIZE;
+ }
+
+ /* store leftover */
+ if (bytes)
+ {
+ buf_cpy (ctx->buffer + ctx->leftover, m, bytes);
+ ctx->leftover += bytes;
+ }
+
+ return burn;
+}
+
+
+void
+_gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes)
+{
+ unsigned int burn;
+
+ burn = _gcry_poly1305_update_burn (ctx, m, bytes);
+
+ if (burn)
+ _gcry_burn_stack (burn);
+}
+
+
+void
+_gcry_poly1305_finish (poly1305_context_t *ctx, byte mac[POLY1305_TAGLEN])
+{
+ unsigned int burn;
+
+ burn = poly1305_final (ctx, mac);
+
+ _gcry_burn_stack (burn);
+}
+
+
+gcry_err_code_t
+_gcry_poly1305_init (poly1305_context_t * ctx, const byte * key,
+ size_t keylen)
+{
+ static int initialized;
+ static const char *selftest_failed;
+
+ if (!initialized)
+ {
+ initialized = 1;
+ selftest_failed = selftest ();
+ if (selftest_failed)
+ log_error ("Poly1305 selftest failed (%s)\n", selftest_failed);
+ }
+
+ if (keylen != POLY1305_KEYLEN)
+ return GPG_ERR_INV_KEYLEN;
+
+ if (selftest_failed)
+ return GPG_ERR_SELFTEST_FAILED;
+
+ poly1305_init (ctx, key);
+
+ return 0;
+}
+
+
+static void
+poly1305_auth (byte mac[POLY1305_TAGLEN], const byte * m, size_t bytes,
+ const byte * key)
+{
+ poly1305_context_t ctx;
+
+ memset (&ctx, 0, sizeof (ctx));
+
+ _gcry_poly1305_init (&ctx, key, POLY1305_KEYLEN);
+ _gcry_poly1305_update (&ctx, m, bytes);
+ _gcry_poly1305_finish (&ctx, mac);
+
+ wipememory (&ctx, sizeof (ctx));
+}
+
+
+static const char *
+selftest (void)
+{
+ /* example from nacl */
+ static const byte nacl_key[POLY1305_KEYLEN] = {
+ 0xee, 0xa6, 0xa7, 0x25, 0x1c, 0x1e, 0x72, 0x91,
+ 0x6d, 0x11, 0xc2, 0xcb, 0x21, 0x4d, 0x3c, 0x25,
+ 0x25, 0x39, 0x12, 0x1d, 0x8e, 0x23, 0x4e, 0x65,
+ 0x2d, 0x65, 0x1f, 0xa4, 0xc8, 0xcf, 0xf8, 0x80,
+ };
+
+ static const byte nacl_msg[131] = {
+ 0x8e, 0x99, 0x3b, 0x9f, 0x48, 0x68, 0x12, 0x73,
+ 0xc2, 0x96, 0x50, 0xba, 0x32, 0xfc, 0x76, 0xce,
+ 0x48, 0x33, 0x2e, 0xa7, 0x16, 0x4d, 0x96, 0xa4,
+ 0x47, 0x6f, 0xb8, 0xc5, 0x31, 0xa1, 0x18, 0x6a,
+ 0xc0, 0xdf, 0xc1, 0x7c, 0x98, 0xdc, 0xe8, 0x7b,
+ 0x4d, 0xa7, 0xf0, 0x11, 0xec, 0x48, 0xc9, 0x72,
+ 0x71, 0xd2, 0xc2, 0x0f, 0x9b, 0x92, 0x8f, 0xe2,
+ 0x27, 0x0d, 0x6f, 0xb8, 0x63, 0xd5, 0x17, 0x38,
+ 0xb4, 0x8e, 0xee, 0xe3, 0x14, 0xa7, 0xcc, 0x8a,
+ 0xb9, 0x32, 0x16, 0x45, 0x48, 0xe5, 0x26, 0xae,
+ 0x90, 0x22, 0x43, 0x68, 0x51, 0x7a, 0xcf, 0xea,
+ 0xbd, 0x6b, 0xb3, 0x73, 0x2b, 0xc0, 0xe9, 0xda,
+ 0x99, 0x83, 0x2b, 0x61, 0xca, 0x01, 0xb6, 0xde,
+ 0x56, 0x24, 0x4a, 0x9e, 0x88, 0xd5, 0xf9, 0xb3,
+ 0x79, 0x73, 0xf6, 0x22, 0xa4, 0x3d, 0x14, 0xa6,
+ 0x59, 0x9b, 0x1f, 0x65, 0x4c, 0xb4, 0x5a, 0x74,
+ 0xe3, 0x55, 0xa5
+ };
+
+ static const byte nacl_mac[16] = {
+ 0xf3, 0xff, 0xc7, 0x70, 0x3f, 0x94, 0x00, 0xe5,
+ 0x2a, 0x7d, 0xfb, 0x4b, 0x3d, 0x33, 0x05, 0xd9
+ };
+
+  /* generates a final accumulator value of (2^130 - 2), which reduces to 3 mod 2^130-5 */
+ static const byte wrap_key[POLY1305_KEYLEN] = {
+ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ };
+
+ static const byte wrap_msg[16] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ };
+
+ static const byte wrap_mac[16] = {
+ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ };
+
+ /* mac of the macs of messages of length 0 to 256, where the key and messages
+ * have all their values set to the length
+ */
+ static const byte total_key[POLY1305_KEYLEN] = {
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ };
+
+ static const byte total_mac[16] = {
+ 0x64, 0xaf, 0xe2, 0xe8, 0xd6, 0xad, 0x7b, 0xbd,
+ 0xd2, 0x87, 0xf9, 0x7c, 0x44, 0x62, 0x3d, 0x39
+ };
+
+ poly1305_context_t ctx;
+ poly1305_context_t total_ctx;
+ byte all_key[POLY1305_KEYLEN];
+ byte all_msg[256];
+ byte mac[16];
+ size_t i, j;
+
+ memset (&ctx, 0, sizeof (ctx));
+ memset (&total_ctx, 0, sizeof (total_ctx));
+
+ memset (mac, 0, sizeof (mac));
+ poly1305_auth (mac, nacl_msg, sizeof (nacl_msg), nacl_key);
+ if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0)
+ return "Poly1305 test 1 failed.";
+
+ /* SSE2/AVX have a 32 byte block size, but also support 64 byte blocks, so
+ * make sure everything still works varying between them */
+ memset (mac, 0, sizeof (mac));
+ _gcry_poly1305_init (&ctx, nacl_key, POLY1305_KEYLEN);
+ _gcry_poly1305_update (&ctx, nacl_msg + 0, 32);
+ _gcry_poly1305_update (&ctx, nacl_msg + 32, 64);
+ _gcry_poly1305_update (&ctx, nacl_msg + 96, 16);
+ _gcry_poly1305_update (&ctx, nacl_msg + 112, 8);
+ _gcry_poly1305_update (&ctx, nacl_msg + 120, 4);
+ _gcry_poly1305_update (&ctx, nacl_msg + 124, 2);
+ _gcry_poly1305_update (&ctx, nacl_msg + 126, 1);
+ _gcry_poly1305_update (&ctx, nacl_msg + 127, 1);
+ _gcry_poly1305_update (&ctx, nacl_msg + 128, 1);
+ _gcry_poly1305_update (&ctx, nacl_msg + 129, 1);
+ _gcry_poly1305_update (&ctx, nacl_msg + 130, 1);
+ _gcry_poly1305_finish (&ctx, mac);
+ if (memcmp (nacl_mac, mac, sizeof (nacl_mac)) != 0)
+ return "Poly1305 test 2 failed.";
+
+ memset (mac, 0, sizeof (mac));
+ poly1305_auth (mac, wrap_msg, sizeof (wrap_msg), wrap_key);
+ if (memcmp (wrap_mac, mac, sizeof (nacl_mac)) != 0)
+ return "Poly1305 test 3 failed.";
+
+ _gcry_poly1305_init (&total_ctx, total_key, POLY1305_KEYLEN);
+ for (i = 0; i < 256; i++)
+ {
+ /* set key and message to 'i,i,i..' */
+ for (j = 0; j < sizeof (all_key); j++)
+ all_key[j] = i;
+ for (j = 0; j < i; j++)
+ all_msg[j] = i;
+ poly1305_auth (mac, all_msg, i, all_key);
+ _gcry_poly1305_update (&total_ctx, mac, 16);
+ }
+ _gcry_poly1305_finish (&total_ctx, mac);
+ if (memcmp (total_mac, mac, sizeof (total_mac)) != 0)
+ return "Poly1305 test 4 failed.";
+
+ return NULL;
+}
diff --git a/comm/third_party/libgcrypt/cipher/primegen.c b/comm/third_party/libgcrypt/cipher/primegen.c
new file mode 100644
index 0000000000..e24de4dc7c
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/primegen.c
@@ -0,0 +1,1878 @@
+/* primegen.c - prime number generator
+ * Copyright (C) 1998, 2000, 2001, 2002, 2003
+ * 2004, 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "mpi.h"
+#include "cipher.h"
+
+static gcry_mpi_t gen_prime (unsigned int nbits, int secret, int randomlevel,
+ int (*extra_check)(void *, gcry_mpi_t),
+ void *extra_check_arg);
+static int check_prime( gcry_mpi_t prime, gcry_mpi_t val_2, int rm_rounds,
+ gcry_prime_check_func_t cb_func, void *cb_arg );
+static int is_prime (gcry_mpi_t n, int steps, unsigned int *count);
+static void m_out_of_n( char *array, int m, int n );
+
+static void (*progress_cb) (void *,const char*,int,int, int );
+static void *progress_cb_data;
+
+/* Note: 2 is not included because it can be tested more easily by
+ looking at bit 0. The last entry in this list is marked by a zero */
+static ushort small_prime_numbers[] = {
+ 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43,
+ 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101,
+ 103, 107, 109, 113, 127, 131, 137, 139, 149, 151,
+ 157, 163, 167, 173, 179, 181, 191, 193, 197, 199,
+ 211, 223, 227, 229, 233, 239, 241, 251, 257, 263,
+ 269, 271, 277, 281, 283, 293, 307, 311, 313, 317,
+ 331, 337, 347, 349, 353, 359, 367, 373, 379, 383,
+ 389, 397, 401, 409, 419, 421, 431, 433, 439, 443,
+ 449, 457, 461, 463, 467, 479, 487, 491, 499, 503,
+ 509, 521, 523, 541, 547, 557, 563, 569, 571, 577,
+ 587, 593, 599, 601, 607, 613, 617, 619, 631, 641,
+ 643, 647, 653, 659, 661, 673, 677, 683, 691, 701,
+ 709, 719, 727, 733, 739, 743, 751, 757, 761, 769,
+ 773, 787, 797, 809, 811, 821, 823, 827, 829, 839,
+ 853, 857, 859, 863, 877, 881, 883, 887, 907, 911,
+ 919, 929, 937, 941, 947, 953, 967, 971, 977, 983,
+ 991, 997, 1009, 1013, 1019, 1021, 1031, 1033,
+ 1039, 1049, 1051, 1061, 1063, 1069, 1087, 1091,
+ 1093, 1097, 1103, 1109, 1117, 1123, 1129, 1151,
+ 1153, 1163, 1171, 1181, 1187, 1193, 1201, 1213,
+ 1217, 1223, 1229, 1231, 1237, 1249, 1259, 1277,
+ 1279, 1283, 1289, 1291, 1297, 1301, 1303, 1307,
+ 1319, 1321, 1327, 1361, 1367, 1373, 1381, 1399,
+ 1409, 1423, 1427, 1429, 1433, 1439, 1447, 1451,
+ 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493,
+ 1499, 1511, 1523, 1531, 1543, 1549, 1553, 1559,
+ 1567, 1571, 1579, 1583, 1597, 1601, 1607, 1609,
+ 1613, 1619, 1621, 1627, 1637, 1657, 1663, 1667,
+ 1669, 1693, 1697, 1699, 1709, 1721, 1723, 1733,
+ 1741, 1747, 1753, 1759, 1777, 1783, 1787, 1789,
+ 1801, 1811, 1823, 1831, 1847, 1861, 1867, 1871,
+ 1873, 1877, 1879, 1889, 1901, 1907, 1913, 1931,
+ 1933, 1949, 1951, 1973, 1979, 1987, 1993, 1997,
+ 1999, 2003, 2011, 2017, 2027, 2029, 2039, 2053,
+ 2063, 2069, 2081, 2083, 2087, 2089, 2099, 2111,
+ 2113, 2129, 2131, 2137, 2141, 2143, 2153, 2161,
+ 2179, 2203, 2207, 2213, 2221, 2237, 2239, 2243,
+ 2251, 2267, 2269, 2273, 2281, 2287, 2293, 2297,
+ 2309, 2311, 2333, 2339, 2341, 2347, 2351, 2357,
+ 2371, 2377, 2381, 2383, 2389, 2393, 2399, 2411,
+ 2417, 2423, 2437, 2441, 2447, 2459, 2467, 2473,
+ 2477, 2503, 2521, 2531, 2539, 2543, 2549, 2551,
+ 2557, 2579, 2591, 2593, 2609, 2617, 2621, 2633,
+ 2647, 2657, 2659, 2663, 2671, 2677, 2683, 2687,
+ 2689, 2693, 2699, 2707, 2711, 2713, 2719, 2729,
+ 2731, 2741, 2749, 2753, 2767, 2777, 2789, 2791,
+ 2797, 2801, 2803, 2819, 2833, 2837, 2843, 2851,
+ 2857, 2861, 2879, 2887, 2897, 2903, 2909, 2917,
+ 2927, 2939, 2953, 2957, 2963, 2969, 2971, 2999,
+ 3001, 3011, 3019, 3023, 3037, 3041, 3049, 3061,
+ 3067, 3079, 3083, 3089, 3109, 3119, 3121, 3137,
+ 3163, 3167, 3169, 3181, 3187, 3191, 3203, 3209,
+ 3217, 3221, 3229, 3251, 3253, 3257, 3259, 3271,
+ 3299, 3301, 3307, 3313, 3319, 3323, 3329, 3331,
+ 3343, 3347, 3359, 3361, 3371, 3373, 3389, 3391,
+ 3407, 3413, 3433, 3449, 3457, 3461, 3463, 3467,
+ 3469, 3491, 3499, 3511, 3517, 3527, 3529, 3533,
+ 3539, 3541, 3547, 3557, 3559, 3571, 3581, 3583,
+ 3593, 3607, 3613, 3617, 3623, 3631, 3637, 3643,
+ 3659, 3671, 3673, 3677, 3691, 3697, 3701, 3709,
+ 3719, 3727, 3733, 3739, 3761, 3767, 3769, 3779,
+ 3793, 3797, 3803, 3821, 3823, 3833, 3847, 3851,
+ 3853, 3863, 3877, 3881, 3889, 3907, 3911, 3917,
+ 3919, 3923, 3929, 3931, 3943, 3947, 3967, 3989,
+ 4001, 4003, 4007, 4013, 4019, 4021, 4027, 4049,
+ 4051, 4057, 4073, 4079, 4091, 4093, 4099, 4111,
+ 4127, 4129, 4133, 4139, 4153, 4157, 4159, 4177,
+ 4201, 4211, 4217, 4219, 4229, 4231, 4241, 4243,
+ 4253, 4259, 4261, 4271, 4273, 4283, 4289, 4297,
+ 4327, 4337, 4339, 4349, 4357, 4363, 4373, 4391,
+ 4397, 4409, 4421, 4423, 4441, 4447, 4451, 4457,
+ 4463, 4481, 4483, 4493, 4507, 4513, 4517, 4519,
+ 4523, 4547, 4549, 4561, 4567, 4583, 4591, 4597,
+ 4603, 4621, 4637, 4639, 4643, 4649, 4651, 4657,
+ 4663, 4673, 4679, 4691, 4703, 4721, 4723, 4729,
+ 4733, 4751, 4759, 4783, 4787, 4789, 4793, 4799,
+ 4801, 4813, 4817, 4831, 4861, 4871, 4877, 4889,
+ 4903, 4909, 4919, 4931, 4933, 4937, 4943, 4951,
+ 4957, 4967, 4969, 4973, 4987, 4993, 4999,
+ 0
+};
+static int no_of_small_prime_numbers = DIM (small_prime_numbers) - 1;
+
+
+
+/* An object and a list to build up a global pool of primes. See
+ save_pool_prime and get_pool_prime. */
+struct primepool_s
+{
+ struct primepool_s *next;
+ gcry_mpi_t prime; /* If this is NULL the entry is not used. */
+ unsigned int nbits;
+ gcry_random_level_t randomlevel;
+};
+struct primepool_s *primepool;
+/* Mutex used to protect access to the primepool. */
+GPGRT_LOCK_DEFINE (primepool_lock);
+
+
+gcry_err_code_t
+_gcry_primegen_init (void)
+{
+  /* This function was formerly used to initialize the primepool
+     mutex.  This has been replaced by a static initialization.  */
+ return 0;
+}
+
+
+/* Save PRIME which has been generated at RANDOMLEVEL for later
+   use.  Needs to be called while primepool_lock is held.  Note
+ that PRIME should be considered released after calling this
+ function. */
+static void
+save_pool_prime (gcry_mpi_t prime, gcry_random_level_t randomlevel)
+{
+ struct primepool_s *item, *item2;
+ size_t n;
+
+ for (n=0, item = primepool; item; item = item->next, n++)
+ if (!item->prime)
+ break;
+ if (!item && n > 100)
+ {
+ /* Remove some of the entries. Our strategy is removing
+ the last third from the list. */
+ int i;
+
+ for (i=0, item2 = primepool; item2; item2 = item2->next)
+ {
+ if (i >= n/3*2)
+ {
+ _gcry_mpi_release (item2->prime);
+ item2->prime = NULL;
+ if (!item)
+ item = item2;
+ }
+ }
+ }
+ if (!item)
+ {
+ item = xtrycalloc (1, sizeof *item);
+ if (!item)
+ {
+ /* Out of memory. Silently giving up. */
+ _gcry_mpi_release (prime);
+ return;
+ }
+ item->next = primepool;
+ primepool = item;
+ }
+ item->prime = prime;
+ item->nbits = mpi_get_nbits (prime);
+ item->randomlevel = randomlevel;
+}
+
+
+/* Return a prime from the prime pool or NULL if none has been found.
+   The prime needs to match NBITS and RANDOMLEVEL.  This function needs
+   to be called with primepool_lock held.  */
+static gcry_mpi_t
+get_pool_prime (unsigned int nbits, gcry_random_level_t randomlevel)
+{
+ struct primepool_s *item;
+
+ for (item = primepool; item; item = item->next)
+ if (item->prime
+ && item->nbits == nbits && item->randomlevel == randomlevel)
+ {
+ gcry_mpi_t prime = item->prime;
+ item->prime = NULL;
+ gcry_assert (nbits == mpi_get_nbits (prime));
+ return prime;
+ }
+ return NULL;
+}
+
+
+
+
+
+
+void
+_gcry_register_primegen_progress ( void (*cb)(void *,const char*,int,int,int),
+ void *cb_data )
+{
+ progress_cb = cb;
+ progress_cb_data = cb_data;
+}
+
+
+static void
+progress( int c )
+{
+ if ( progress_cb )
+ progress_cb ( progress_cb_data, "primegen", c, 0, 0 );
+}
+
+
+/****************
+ * Generate a prime number (stored in secure memory)
+ */
+gcry_mpi_t
+_gcry_generate_secret_prime (unsigned int nbits,
+ gcry_random_level_t random_level,
+ int (*extra_check)(void*, gcry_mpi_t),
+ void *extra_check_arg)
+{
+ gcry_mpi_t prime;
+
+ prime = gen_prime (nbits, 1, random_level, extra_check, extra_check_arg);
+ progress('\n');
+ return prime;
+}
+
+
+/* Generate a prime number which may be public, i.e. not allocated in
+ secure memory. */
+gcry_mpi_t
+_gcry_generate_public_prime (unsigned int nbits,
+ gcry_random_level_t random_level,
+ int (*extra_check)(void*, gcry_mpi_t),
+ void *extra_check_arg)
+{
+ gcry_mpi_t prime;
+
+ prime = gen_prime (nbits, 0, random_level, extra_check, extra_check_arg);
+ progress('\n');
+ return prime;
+}
+
+
+/* Core prime generation function. The algorithm used to generate
+   practically safe primes is due to Lim and Lee as described in the
+ CRYPTO '97 proceedings (ISBN3540633847) page 260.
+
+ NEED_Q_FACTOR: If true make sure that at least one factor is of
+ size qbits. This is for example required for DSA.
+   PRIME_GENERATED: Address of a variable where the resulting prime
+ number will be stored.
+ PBITS: Requested size of the prime number. At least 48.
+ QBITS: One factor of the prime needs to be of this size. Maybe 0
+ if this is not required. See also MODE.
+ G: If not NULL an MPI which will receive a generator for the prime
+ for use with Elgamal.
+   RET_FACTORS: if not NULL, an array with all factors is stored at
+      that address.
+ ALL_FACTORS: If set to true all factors of prime-1 are returned.
+   RANDOMLEVEL:  How strong the random numbers should be.
+ FLAGS: Prime generation bit flags. Currently supported:
+ GCRY_PRIME_FLAG_SECRET - The prime needs to be kept secret.
+ CB_FUNC, CB_ARG: Callback to be used for extra checks.
+
+ */
+static gcry_err_code_t
+prime_generate_internal (int need_q_factor,
+ gcry_mpi_t *prime_generated, unsigned int pbits,
+ unsigned int qbits, gcry_mpi_t g,
+ gcry_mpi_t **ret_factors,
+ gcry_random_level_t randomlevel, unsigned int flags,
+ int all_factors,
+ gcry_prime_check_func_t cb_func, void *cb_arg)
+{
+ gcry_err_code_t err = 0;
+ gcry_mpi_t *factors_new = NULL; /* Factors to return to the
+ caller. */
+ gcry_mpi_t *factors = NULL; /* Current factors. */
+ gcry_random_level_t poolrandomlevel; /* Random level used for pool primes. */
+ gcry_mpi_t *pool = NULL; /* Pool of primes. */
+ int *pool_in_use = NULL; /* Array with currently used POOL elements. */
+ unsigned char *perms = NULL; /* Permutations of POOL. */
+ gcry_mpi_t q_factor = NULL; /* Used if QBITS is non-zero. */
+ unsigned int fbits = 0; /* Length of prime factors. */
+ unsigned int n = 0; /* Number of factors. */
+ unsigned int m = 0; /* Number of primes in pool. */
+ gcry_mpi_t q = NULL; /* First prime factor. */
+ gcry_mpi_t prime = NULL; /* Prime candidate. */
+ unsigned int nprime = 0; /* Bits of PRIME. */
+ unsigned int req_qbits; /* The original QBITS value. */
+ gcry_mpi_t val_2; /* For check_prime(). */
+ int is_locked = 0; /* Flag to help unlocking the primepool. */
+ unsigned int is_secret = (flags & GCRY_PRIME_FLAG_SECRET);
+ unsigned int count1 = 0, count2 = 0;
+ unsigned int i = 0, j = 0;
+
+ if (pbits < 48)
+ return GPG_ERR_INV_ARG;
+
+  /* We won't use an overly strong random level for the pooled subprimes. */
+ poolrandomlevel = (randomlevel > GCRY_STRONG_RANDOM?
+ GCRY_STRONG_RANDOM : randomlevel);
+
+
+ /* If QBITS is not given, assume a reasonable value. */
+ if (!qbits)
+ qbits = pbits / 3;
+
+ req_qbits = qbits;
+
+ /* Find number of needed prime factors N. */
+ for (n = 1; (pbits - qbits - 1) / n >= qbits; n++)
+ ;
+ n--;
+
+ val_2 = mpi_alloc_set_ui (2);
+
+ if ((! n) || ((need_q_factor) && (n < 2)))
+ {
+ err = GPG_ERR_INV_ARG;
+ goto leave;
+ }
+
+ if (need_q_factor)
+ {
+ n--; /* Need one factor less because we want a specific Q-FACTOR. */
+ fbits = (pbits - 2 * req_qbits -1) / n;
+ qbits = pbits - req_qbits - n * fbits;
+ }
+ else
+ {
+ fbits = (pbits - req_qbits -1) / n;
+ qbits = pbits - n * fbits;
+ }
+
+ if (DBG_CIPHER)
+ log_debug ("gen prime: pbits=%u qbits=%u fbits=%u/%u n=%d\n",
+ pbits, req_qbits, qbits, fbits, n);
+
+  /* Allocate an integer to hold the new prime. */
+ prime = mpi_new (pbits);
+
+ /* Generate first prime factor. */
+ q = gen_prime (qbits, is_secret, randomlevel, NULL, NULL);
+
+ /* Generate a specific Q-Factor if requested. */
+ if (need_q_factor)
+ q_factor = gen_prime (req_qbits, is_secret, randomlevel, NULL, NULL);
+
+ /* Allocate an array to hold all factors + 2 for later usage. */
+ factors = xtrycalloc (n + 2, sizeof (*factors));
+ if (!factors)
+ {
+ err = gpg_err_code_from_errno (errno);
+ goto leave;
+ }
+
+ /* Allocate an array to track pool usage. */
+ pool_in_use = xtrymalloc (n * sizeof *pool_in_use);
+ if (!pool_in_use)
+ {
+ err = gpg_err_code_from_errno (errno);
+ goto leave;
+ }
+ for (i=0; i < n; i++)
+ pool_in_use[i] = -1;
+
+ /* Make a pool of 3n+5 primes (this is an arbitrary value). We
+     require at least 30 primes for a useful selection process.
+
+ Fixme: We need to research the best formula for sizing the pool.
+ */
+ m = n * 3 + 5;
+ if (need_q_factor) /* Need some more in this case. */
+ m += 5;
+ if (m < 30)
+ m = 30;
+ pool = xtrycalloc (m , sizeof (*pool));
+ if (! pool)
+ {
+ err = gpg_err_code_from_errno (errno);
+ goto leave;
+ }
+
+  /* Permute over the pool of primes until we find a prime of the
+ requested length. */
+ do
+ {
+ next_try:
+ for (i=0; i < n; i++)
+ pool_in_use[i] = -1;
+
+ if (!perms)
+ {
+ /* Allocate new primes. This is done right at the beginning
+ of the loop and if we have later run out of primes. */
+ for (i = 0; i < m; i++)
+ {
+ mpi_free (pool[i]);
+ pool[i] = NULL;
+ }
+
+ /* Init m_out_of_n(). */
+ perms = xtrycalloc (1, m);
+ if (!perms)
+ {
+ err = gpg_err_code_from_errno (errno);
+ goto leave;
+ }
+
+ err = gpgrt_lock_lock (&primepool_lock);
+ if (err)
+ goto leave;
+ is_locked = 1;
+
+ for (i = 0; i < n; i++)
+ {
+ perms[i] = 1;
+ /* At a maximum we use strong random for the factors.
+ This saves us a lot of entropy. Given that Q and
+ possible Q-factor are also used in the final prime
+ this should be acceptable. We also don't allocate in
+                 secure memory to save on that scarce resource too.  If
+ Q has been allocated in secure memory, the final
+ prime will be saved there anyway. This is because
+ our MPI routines take care of that. GnuPG has worked
+ this way ever since. */
+ pool[i] = NULL;
+ if (is_locked)
+ {
+ pool[i] = get_pool_prime (fbits, poolrandomlevel);
+ if (!pool[i])
+ {
+ err = gpgrt_lock_unlock (&primepool_lock);
+ if (err)
+ goto leave;
+ is_locked = 0;
+ }
+ }
+ if (!pool[i])
+ pool[i] = gen_prime (fbits, 0, poolrandomlevel, NULL, NULL);
+ pool_in_use[i] = i;
+ factors[i] = pool[i];
+ }
+
+ if (is_locked && (err = gpgrt_lock_unlock (&primepool_lock)))
+ goto leave;
+ is_locked = 0;
+ }
+ else
+ {
+ /* Get next permutation. */
+ m_out_of_n ( (char*)perms, n, m);
+
+ if ((err = gpgrt_lock_lock (&primepool_lock)))
+ goto leave;
+ is_locked = 1;
+
+ for (i = j = 0; (i < m) && (j < n); i++)
+ if (perms[i])
+ {
+                /* If the subprime has not yet been generated, do it now. */
+ if (!pool[i] && is_locked)
+ {
+ pool[i] = get_pool_prime (fbits, poolrandomlevel);
+ if (!pool[i])
+ {
+ if ((err = gpgrt_lock_unlock (&primepool_lock)))
+ goto leave;
+ is_locked = 0;
+ }
+ }
+ if (!pool[i])
+ pool[i] = gen_prime (fbits, 0, poolrandomlevel, NULL, NULL);
+ pool_in_use[j] = i;
+ factors[j++] = pool[i];
+ }
+
+ if (is_locked && (err = gpgrt_lock_unlock (&primepool_lock)))
+ goto leave;
+ is_locked = 0;
+
+ if (i == n)
+ {
+ /* Ran out of permutations: Allocate new primes. */
+ xfree (perms);
+ perms = NULL;
+ progress ('!');
+ goto next_try;
+ }
+ }
+
+ /* Generate next prime candidate:
+ p = 2 * q [ * q_factor] * factor_0 * factor_1 * ... * factor_n + 1.
+ */
+ mpi_set (prime, q);
+ mpi_mul_ui (prime, prime, 2);
+ if (need_q_factor)
+ mpi_mul (prime, prime, q_factor);
+ for(i = 0; i < n; i++)
+ mpi_mul (prime, prime, factors[i]);
+ mpi_add_ui (prime, prime, 1);
+ nprime = mpi_get_nbits (prime);
+
+ if (nprime < pbits)
+ {
+ if (++count1 > 20)
+ {
+ count1 = 0;
+ qbits++;
+ progress('>');
+ mpi_free (q);
+ q = gen_prime (qbits, is_secret, randomlevel, NULL, NULL);
+ goto next_try;
+ }
+ }
+ else
+ count1 = 0;
+
+ if (nprime > pbits)
+ {
+ if (++count2 > 20)
+ {
+ count2 = 0;
+ qbits--;
+ progress('<');
+ mpi_free (q);
+ q = gen_prime (qbits, is_secret, randomlevel, NULL, NULL);
+ goto next_try;
+ }
+ }
+ else
+ count2 = 0;
+ }
+ while (! ((nprime == pbits) && check_prime (prime, val_2, 5,
+ cb_func, cb_arg)));
+
+ if (DBG_CIPHER)
+ {
+ progress ('\n');
+ log_mpidump ("prime ", prime);
+ log_mpidump ("factor q", q);
+ if (need_q_factor)
+ log_mpidump ("factor q0", q_factor);
+ for (i = 0; i < n; i++)
+ log_mpidump ("factor pi", factors[i]);
+ log_debug ("bit sizes: prime=%u, q=%u",
+ mpi_get_nbits (prime), mpi_get_nbits (q));
+ if (need_q_factor)
+ log_printf (", q0=%u", mpi_get_nbits (q_factor));
+ for (i = 0; i < n; i++)
+ log_printf (", p%d=%u", i, mpi_get_nbits (factors[i]));
+ log_printf ("\n");
+ }
+
+ if (ret_factors)
+ {
+ /* Caller wants the factors. */
+ factors_new = xtrycalloc (n + 4, sizeof (*factors_new));
+ if (! factors_new)
+ {
+ err = gpg_err_code_from_errno (errno);
+ goto leave;
+ }
+
+ if (all_factors)
+ {
+ i = 0;
+ factors_new[i++] = mpi_set_ui (NULL, 2);
+ factors_new[i++] = mpi_copy (q);
+ if (need_q_factor)
+ factors_new[i++] = mpi_copy (q_factor);
+ for(j=0; j < n; j++)
+ factors_new[i++] = mpi_copy (factors[j]);
+ }
+ else
+ {
+ i = 0;
+ if (need_q_factor)
+ {
+ factors_new[i++] = mpi_copy (q_factor);
+ for (; i <= n; i++)
+ factors_new[i] = mpi_copy (factors[i]);
+ }
+ else
+ for (; i < n; i++ )
+ factors_new[i] = mpi_copy (factors[i]);
+ }
+ }
+
+ if (g && need_q_factor)
+ err = GPG_ERR_NOT_IMPLEMENTED;
+ else if (g)
+ {
+ /* Create a generator (start with 3). */
+ gcry_mpi_t tmp = mpi_alloc (mpi_get_nlimbs (prime));
+ gcry_mpi_t b = mpi_alloc (mpi_get_nlimbs (prime));
+ gcry_mpi_t pmin1 = mpi_alloc (mpi_get_nlimbs (prime));
+
+ factors[n] = q;
+ factors[n + 1] = mpi_alloc_set_ui (2);
+ mpi_sub_ui (pmin1, prime, 1);
+ mpi_set_ui (g, 2);
+ do
+ {
+ mpi_add_ui (g, g, 1);
+ if (DBG_CIPHER)
+ log_printmpi ("checking g", g);
+ else
+ progress('^');
+ for (i = 0; i < n + 2; i++)
+ {
+ mpi_fdiv_q (tmp, pmin1, factors[i]);
+ /* No mpi_pow(), but it is okay to use this with mod
+ prime. */
+ mpi_powm (b, g, tmp, prime);
+ if (! mpi_cmp_ui (b, 1))
+ break;
+ }
+ if (DBG_CIPHER)
+ progress('\n');
+ }
+ while (i < n + 2);
+
+ mpi_free (factors[n+1]);
+ mpi_free (tmp);
+ mpi_free (b);
+ mpi_free (pmin1);
+ }
+
+ if (! DBG_CIPHER)
+ progress ('\n');
+
+
+ leave:
+ if (pool)
+ {
+ is_locked = !gpgrt_lock_lock (&primepool_lock);
+ for(i = 0; i < m; i++)
+ {
+ if (pool[i])
+ {
+ for (j=0; j < n; j++)
+ if (pool_in_use[j] == i)
+ break;
+ if (j == n && is_locked)
+ {
+ /* This pooled subprime has not been used. */
+ save_pool_prime (pool[i], poolrandomlevel);
+ }
+ else
+ mpi_free (pool[i]);
+ }
+ }
+ if (is_locked)
+ err = gpgrt_lock_unlock (&primepool_lock);
+ is_locked = 0;
+ xfree (pool);
+ }
+ xfree (pool_in_use);
+ if (factors)
+ xfree (factors); /* Factors are shallow copies. */
+ if (perms)
+ xfree (perms);
+
+ mpi_free (val_2);
+ mpi_free (q);
+ mpi_free (q_factor);
+
+ if (! err)
+ {
+ *prime_generated = prime;
+ if (ret_factors)
+ *ret_factors = factors_new;
+ }
+ else
+ {
+ if (factors_new)
+ {
+ for (i = 0; factors_new[i]; i++)
+ mpi_free (factors_new[i]);
+ xfree (factors_new);
+ }
+ mpi_free (prime);
+ }
+
+ return err;
+}
+
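
prime_generate_internal() above builds its candidates as p = 2 * q * f_1 * ... * f_n + 1 from one leading factor q and n pool primes of fbits bits each, following the Lim-Lee construction cited in its comment.  How the sizes come out for a concrete call (editor's sketch, not part of the patch; pbits = 1024 and QBITS = 0 are hypothetical inputs):

#include <stdio.h>

/* Editor's sketch (not part of the patch): the factor sizing performed at
 * the top of prime_generate_internal(), for pbits = 1024, no QBITS and no
 * Q-factor.  Prints "q=342 bits, 2 factors of 341 bits each". */
int main (void)
{
  unsigned int pbits = 1024, qbits = 0, req_qbits, fbits, n;

  if (!qbits)
    qbits = pbits / 3;                    /* 341 */
  req_qbits = qbits;
  for (n = 1; (pbits - qbits - 1) / n >= qbits; n++)
    ;
  n--;                                    /* n = 2 additional factors */
  fbits = (pbits - req_qbits - 1) / n;    /* 341 */
  qbits = pbits - n * fbits;              /* 342 */
  printf ("q=%u bits, %u factors of %u bits each\n", qbits, n, fbits);
  /* Candidate: p = 2 * q * f_1 * ... * f_n + 1, roughly pbits bits; the
   * main loop nudges qbits up or down until the exact length matches. */
  return 0;
}
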
+
+/* Generate a prime used for discrete logarithm algorithms; i.e. this
+ prime will be public and no strong random is required. On success
+ R_PRIME receives a new MPI with the prime. On error R_PRIME is set
+ to NULL and an error code is returned. If RET_FACTORS is not NULL
+ it is set to an allocated array of factors on success or to NULL on
+ error. */
+gcry_err_code_t
+_gcry_generate_elg_prime (int mode, unsigned pbits, unsigned qbits,
+ gcry_mpi_t g,
+ gcry_mpi_t *r_prime, gcry_mpi_t **ret_factors)
+{
+ *r_prime = NULL;
+ if (ret_factors)
+ *ret_factors = NULL;
+ return prime_generate_internal ((mode == 1), r_prime, pbits, qbits, g,
+ ret_factors, GCRY_WEAK_RANDOM, 0, 0,
+ NULL, NULL);
+}
+
+
+static gcry_mpi_t
+gen_prime (unsigned int nbits, int secret, int randomlevel,
+ int (*extra_check)(void *, gcry_mpi_t), void *extra_check_arg)
+{
+ gcry_mpi_t prime, ptest, pminus1, val_2, val_3, result;
+ int i;
+ unsigned int x, step;
+ unsigned int count1, count2;
+ int *mods;
+
+/* if ( DBG_CIPHER ) */
+/* log_debug ("generate a prime of %u bits ", nbits ); */
+
+ if (nbits < 16)
+ log_fatal ("can't generate a prime with less than %d bits\n", 16);
+
+ mods = (secret? xmalloc_secure (no_of_small_prime_numbers * sizeof *mods)
+ /* */ : xmalloc (no_of_small_prime_numbers * sizeof *mods));
+ /* Make nbits fit into gcry_mpi_t implementation. */
+ val_2 = mpi_alloc_set_ui( 2 );
+ val_3 = mpi_alloc_set_ui( 3);
+ prime = secret? mpi_snew (nbits): mpi_new (nbits);
+ result = mpi_alloc_like( prime );
+ pminus1= mpi_alloc_like( prime );
+ ptest = mpi_alloc_like( prime );
+ count1 = count2 = 0;
+ for (;;)
+    { /* try forever */
+ int dotcount=0;
+
+ /* generate a random number */
+ _gcry_mpi_randomize( prime, nbits, randomlevel );
+
+      /* Set the high order bit and the low order bit to 1.  If we are
+         generating a secret prime we are most probably doing that
+         for RSA; to make sure that the modulus has the
+         requested key size we set the 2 high order bits.  */
+ mpi_set_highbit (prime, nbits-1);
+ if (secret)
+ mpi_set_bit (prime, nbits-2);
+ mpi_set_bit(prime, 0);
+
+ /* Calculate all remainders. */
+ for (i=0; (x = small_prime_numbers[i]); i++ )
+ mods[i] = mpi_fdiv_r_ui(NULL, prime, x);
+
+ /* Now try some primes starting with prime. */
+ for(step=0; step < 20000; step += 2 )
+ {
+ /* Check against all the small primes we have in mods. */
+ count1++;
+ for (i=0; (x = small_prime_numbers[i]); i++ )
+ {
+ while ( mods[i] + step >= x )
+ mods[i] -= x;
+ if ( !(mods[i] + step) )
+ break;
+ }
+ if ( x )
+ continue; /* Found a multiple of an already known prime. */
+
+ mpi_add_ui( ptest, prime, step );
+
+ /* Do a fast Fermat test now. */
+ count2++;
+ mpi_sub_ui( pminus1, ptest, 1);
+ mpi_powm( result, val_2, pminus1, ptest );
+ if ( !mpi_cmp_ui( result, 1 ) )
+ {
+ /* Not composite, perform stronger tests */
+ if (is_prime(ptest, 5, &count2 ))
+ {
+ if (!mpi_test_bit( ptest, nbits-1-secret ))
+ {
+ progress('\n');
+ log_debug ("overflow in prime generation\n");
+ break; /* Stop loop, continue with a new prime. */
+ }
+
+ if (extra_check && extra_check (extra_check_arg, ptest))
+ {
+ /* The extra check told us that this prime is
+ not of the caller's taste. */
+ progress ('/');
+ }
+ else
+ {
+ /* Got it. */
+ mpi_free(val_2);
+ mpi_free(val_3);
+ mpi_free(result);
+ mpi_free(pminus1);
+ mpi_free(prime);
+ xfree(mods);
+ return ptest;
+ }
+ }
+ }
+ if (++dotcount == 10 )
+ {
+ progress('.');
+ dotcount = 0;
+ }
+ }
+ progress(':'); /* restart with a new random value */
+ }
+}
+
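
The stepping loop in gen_prime() avoids any big-number division while scanning candidates: once mods[i] = candidate mod x is known for each small prime x, candidate + step is divisible by x exactly when (mods[i] + step) mod x == 0 (the inner while merely keeps mods[i] + step below x).  A stand-alone illustration (editor's sketch, not part of the patch, with a hypothetical starting value):

#include <stdio.h>

/* Editor's sketch (not part of the patch): incremental trial division as
 * used by gen_prime(); prints "candidate + 22 is divisible by 17". */
int main (void)
{
  unsigned int candidate = 1000003;        /* hypothetical starting value */
  unsigned int x = 17;
  unsigned int mod = candidate % x;        /* computed once, like mods[i] */
  unsigned int step;

  for (step = 0; step < 40; step += 2)
    if ((mod + step) % x == 0)
      printf ("candidate + %u is divisible by %u\n", step, x);
  return 0;
}
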
+/****************
+ * Returns: true if this may be a prime
+ * RM_ROUNDS gives the number of Rabin-Miller tests to run.
+ */
+static int
+check_prime( gcry_mpi_t prime, gcry_mpi_t val_2, int rm_rounds,
+ gcry_prime_check_func_t cb_func, void *cb_arg)
+{
+ int i;
+ unsigned int x;
+ unsigned int count=0;
+
+ /* Check against small primes. */
+ for (i=0; (x = small_prime_numbers[i]); i++ )
+ {
+ if ( mpi_divisible_ui( prime, x ) )
+ return !mpi_cmp_ui (prime, x);
+ }
+
+ /* A quick Fermat test. */
+ {
+ gcry_mpi_t result = mpi_alloc_like( prime );
+ gcry_mpi_t pminus1 = mpi_alloc_like( prime );
+ mpi_sub_ui( pminus1, prime, 1);
+ mpi_powm( result, val_2, pminus1, prime );
+ mpi_free( pminus1 );
+ if ( mpi_cmp_ui( result, 1 ) )
+ {
+ /* Is composite. */
+ mpi_free( result );
+ progress('.');
+ return 0;
+ }
+ mpi_free( result );
+ }
+
+ if (!cb_func || cb_func (cb_arg, GCRY_PRIME_CHECK_AT_MAYBE_PRIME, prime))
+ {
+ /* Perform stronger tests. */
+ if ( is_prime( prime, rm_rounds, &count ) )
+ {
+ if (!cb_func
+ || cb_func (cb_arg, GCRY_PRIME_CHECK_AT_GOT_PRIME, prime))
+ return 1; /* Probably a prime. */
+ }
+ }
+ progress('.');
+ return 0;
+}
+
+
+/*
+ * Return true if n is probably a prime
+ */
+static int
+is_prime (gcry_mpi_t n, int steps, unsigned int *count)
+{
+ gcry_mpi_t x = mpi_alloc( mpi_get_nlimbs( n ) );
+ gcry_mpi_t y = mpi_alloc( mpi_get_nlimbs( n ) );
+ gcry_mpi_t z = mpi_alloc( mpi_get_nlimbs( n ) );
+ gcry_mpi_t nminus1 = mpi_alloc( mpi_get_nlimbs( n ) );
+ gcry_mpi_t a2 = mpi_alloc_set_ui( 2 );
+ gcry_mpi_t q;
+ unsigned i, j, k;
+ int rc = 0;
+ unsigned nbits = mpi_get_nbits( n );
+
+ if (steps < 5) /* Make sure that we do at least 5 rounds. */
+ steps = 5;
+
+ mpi_sub_ui( nminus1, n, 1 );
+
+ /* Find q and k, so that n = 1 + 2^k * q . */
+ q = mpi_copy ( nminus1 );
+ k = mpi_trailing_zeros ( q );
+ mpi_tdiv_q_2exp (q, q, k);
+
+ for (i=0 ; i < steps; i++ )
+ {
+ ++*count;
+ if( !i )
+ {
+ mpi_set_ui( x, 2 );
+ }
+ else
+ {
+ /* We need to loop to avoid an X with value 0 or 1. */
+ do
+ {
+ _gcry_mpi_randomize (x, nbits, GCRY_WEAK_RANDOM);
+
+ /* Make sure that the number is smaller than the prime
+ * and keep the randomness of the high bit. */
+ if (mpi_test_bit (x, nbits-2))
+ {
+ mpi_set_highbit (x, nbits-2); /* Clear all higher bits. */
+ }
+ else
+ {
+ mpi_set_highbit (x, nbits-2);
+ mpi_clear_bit (x, nbits-2);
+ }
+ }
+ while (mpi_cmp_ui (x, 1) <= 0);
+ gcry_assert (mpi_cmp (x, nminus1) < 0);
+ }
+ mpi_powm ( y, x, q, n);
+ if ( mpi_cmp_ui(y, 1) && mpi_cmp( y, nminus1 ) )
+ {
+ for ( j=1; j < k && mpi_cmp( y, nminus1 ); j++ )
+ {
+ mpi_powm(y, y, a2, n);
+ if( !mpi_cmp_ui( y, 1 ) )
+ goto leave; /* Not a prime. */
+ }
+ if (mpi_cmp( y, nminus1 ) )
+ goto leave; /* Not a prime. */
+ }
+ progress('+');
+ }
+ rc = 1; /* May be a prime. */
+
+ leave:
+ mpi_free( x );
+ mpi_free( y );
+ mpi_free( z );
+ mpi_free( nminus1 );
+ mpi_free( q );
+ mpi_free( a2 );
+
+ return rc;
+}
+
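
is_prime() above is the standard Miller-Rabin test on MPIs: write n - 1 = 2^k * q with q odd, compute x^q mod n, and accept the witness if the result is 1 or becomes n - 1 under repeated squaring.  The same round structure on plain 64-bit integers (editor's sketch, not part of the patch; assumes GCC/Clang's unsigned __int128 for the modular products):

#include <stdint.h>
#include <stdio.h>

/* Editor's sketch (not part of the patch): one Miller-Rabin round for a
 * single base, mirroring the structure of is_prime() without MPIs. */
static uint64_t powmod (uint64_t b, uint64_t e, uint64_t m)
{
  uint64_t r = 1;

  b %= m;
  while (e)
    {
      if (e & 1)
        r = (unsigned __int128) r * b % m;
      b = (unsigned __int128) b * b % m;
      e >>= 1;
    }
  return r;
}

static int mr_round (uint64_t n, uint64_t a)
{
  uint64_t q = n - 1;
  unsigned int k = 0, j;
  uint64_t y;

  while (!(q & 1))      /* n - 1 = 2^k * q with q odd */
    { q >>= 1; k++; }
  y = powmod (a, q, n);
  if (y == 1 || y == n - 1)
    return 1;
  for (j = 1; j < k; j++)
    {
      y = (unsigned __int128) y * y % n;
      if (y == n - 1)
        return 1;
    }
  return 0;             /* definitely composite for this base */
}

int main (void)
{
  printf ("561 with base 2: %d\n", mr_round (561, 2));   /* 0: composite */
  printf ("569 with base 2: %d\n", mr_round (569, 2));   /* 1: probable prime */
  return 0;
}
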
+
+/* Given ARRAY of size N with M elements set to true produce a
+   modified array with the next permutation of M elements.  Note that
+   ARRAY is used in a one-bit-per-byte approach.  To detect the last
+   permutation it is useful to initialize the array with the first M
+   elements set to true and use this test:
+ m_out_of_n (array, m, n);
+ for (i = j = 0; i < n && j < m; i++)
+ if (array[i])
+ j++;
+ if (j == m)
+ goto ready;
+
+ This code is based on the algorithm 452 from the "Collected
+ Algorithms From ACM, Volume II" by C. N. Liu and D. T. Tang.
+*/
+static void
+m_out_of_n ( char *array, int m, int n )
+{
+ int i=0, i1=0, j=0, jp=0, j1=0, k1=0, k2=0;
+
+ if( !m || m >= n )
+ return;
+
+ /* Need to handle this simple case separately. */
+ if( m == 1 )
+ {
+ for (i=0; i < n; i++ )
+ {
+ if ( array[i] )
+ {
+ array[i++] = 0;
+ if( i >= n )
+ i = 0;
+ array[i] = 1;
+ return;
+ }
+ }
+ BUG();
+ }
+
+
+ for (j=1; j < n; j++ )
+ {
+ if ( array[n-1] == array[n-j-1])
+ continue;
+ j1 = j;
+ break;
+ }
+
+ if ( (m & 1) )
+ {
+ /* M is odd. */
+ if( array[n-1] )
+ {
+ if( j1 & 1 )
+ {
+ k1 = n - j1;
+ k2 = k1+2;
+ if( k2 > n )
+ k2 = n;
+ goto leave;
+ }
+ goto scan;
+ }
+ k2 = n - j1 - 1;
+ if( k2 == 0 )
+ {
+ k1 = i;
+ k2 = n - j1;
+ }
+ else if( array[k2] && array[k2-1] )
+ k1 = n;
+ else
+ k1 = k2 + 1;
+ }
+ else
+ {
+ /* M is even. */
+ if( !array[n-1] )
+ {
+ k1 = n - j1;
+ k2 = k1 + 1;
+ goto leave;
+ }
+
+ if( !(j1 & 1) )
+ {
+ k1 = n - j1;
+ k2 = k1+2;
+ if( k2 > n )
+ k2 = n;
+ goto leave;
+ }
+ scan:
+ jp = n - j1 - 1;
+ for (i=1; i <= jp; i++ )
+ {
+ i1 = jp + 2 - i;
+ if( array[i1-1] )
+ {
+ if( array[i1-2] )
+ {
+ k1 = i1 - 1;
+ k2 = n - j1;
+ }
+ else
+ {
+ k1 = i1 - 1;
+ k2 = n + 1 - j1;
+ }
+ goto leave;
+ }
+ }
+ k1 = 1;
+ k2 = n + 1 - m;
+ }
+ leave:
+ /* Now complement the two selected bits. */
+ array[k1-1] = !array[k1-1];
+ array[k2-1] = !array[k2-1];
+}
+
+
+/* Generate a new prime number of PRIME_BITS bits and store it in
+ PRIME. If FACTOR_BITS is non-zero, one of the prime factors of
+ (prime - 1) / 2 must be FACTOR_BITS bits long. If FACTORS is
+ non-zero, allocate a new, NULL-terminated array holding the prime
+ factors and store it in FACTORS. FLAGS might be used to influence
+ the prime number generation process. */
+gcry_err_code_t
+_gcry_prime_generate (gcry_mpi_t *prime, unsigned int prime_bits,
+ unsigned int factor_bits, gcry_mpi_t **factors,
+ gcry_prime_check_func_t cb_func, void *cb_arg,
+ gcry_random_level_t random_level,
+ unsigned int flags)
+{
+ gcry_err_code_t rc = 0;
+ gcry_mpi_t *factors_generated = NULL;
+ gcry_mpi_t prime_generated = NULL;
+ unsigned int mode = 0;
+
+ if (!prime)
+ return GPG_ERR_INV_ARG;
+ *prime = NULL;
+
+ if (flags & GCRY_PRIME_FLAG_SPECIAL_FACTOR)
+ mode = 1;
+
+ /* Generate. */
+ rc = prime_generate_internal ((mode==1), &prime_generated, prime_bits,
+ factor_bits, NULL,
+ factors? &factors_generated : NULL,
+ random_level, flags, 1,
+ cb_func, cb_arg);
+
+ if (!rc && cb_func)
+ {
+ /* Additional check. */
+ if ( !cb_func (cb_arg, GCRY_PRIME_CHECK_AT_FINISH, prime_generated))
+ {
+ /* Failed, deallocate resources. */
+ unsigned int i;
+
+ mpi_free (prime_generated);
+ if (factors)
+ {
+ for (i = 0; factors_generated[i]; i++)
+ mpi_free (factors_generated[i]);
+ xfree (factors_generated);
+ }
+ rc = GPG_ERR_GENERAL;
+ }
+ }
+
+ if (!rc)
+ {
+ if (factors)
+ *factors = factors_generated;
+ *prime = prime_generated;
+ }
+
+ return rc;
+}
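+
+/* A minimal usage sketch for this function (the sizes and the use of
+   a special factor are illustrative only; _gcry_prime_group_generator
+   and _gcry_prime_release_factors are defined further below in this
+   file):
+
+     gcry_mpi_t prime, g;
+     gcry_mpi_t *factors;
+
+     if (!_gcry_prime_generate (&prime, 1024, 256, &factors,
+                                NULL, NULL, GCRY_STRONG_RANDOM,
+                                GCRY_PRIME_FLAG_SPECIAL_FACTOR))
+       {
+         if (!_gcry_prime_group_generator (&g, prime, factors, NULL))
+           {
+             ... use PRIME and G ...
+             _gcry_mpi_release (g);
+           }
+         _gcry_prime_release_factors (factors);
+         _gcry_mpi_release (prime);
+       }
+ */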
+
+/* Check whether the number X is prime. */
+gcry_err_code_t
+_gcry_prime_check (gcry_mpi_t x, unsigned int flags)
+{
+ (void)flags;
+
+ switch (mpi_cmp_ui (x, 2))
+ {
+ case 0: return 0; /* 2 is a prime */
+ case -1: return GPG_ERR_NO_PRIME; /* Only numbers > 1 are primes. */
+ }
+
+ /* We use 64 rounds because the prime we are going to test is not
+ guaranteed to be a random one. */
+ if (check_prime (x, mpi_const (MPI_C_TWO), 64, NULL, NULL))
+ return 0;
+
+ return GPG_ERR_NO_PRIME;
+}
+
+
+/* Check whether the number X is prime according to FIPS 186-4 table C.2. */
+gcry_err_code_t
+_gcry_fips186_4_prime_check (gcry_mpi_t x, unsigned int bits)
+{
+ gcry_err_code_t ec = GPG_ERR_NO_ERROR;
+
+ switch (mpi_cmp_ui (x, 2))
+ {
+ case 0: return ec; /* 2 is a prime */
+ case -1: return GPG_ERR_NO_PRIME; /* Only numbers > 1 are primes. */
+ }
+
+ /* We use 5 or 4 rounds as specified in table C.2 */
+ if (! check_prime (x, mpi_const (MPI_C_TWO), bits > 1024 ? 4 : 5, NULL, NULL))
+ ec = GPG_ERR_NO_PRIME;
+
+ return ec;
+}
+
+
+/* Find a generator for PRIME where the factorization of (prime-1) is
+ in the NULL terminated array FACTORS. Return the generator as a
+ newly allocated MPI in R_G. If START_G is not NULL, use this as a
+ start for the search. Returns 0 on success. */
+gcry_err_code_t
+_gcry_prime_group_generator (gcry_mpi_t *r_g,
+ gcry_mpi_t prime, gcry_mpi_t *factors,
+ gcry_mpi_t start_g)
+{
+ gcry_mpi_t tmp, b, pmin1, g;
+ int first, i, n;
+
+ if (!r_g)
+ return GPG_ERR_INV_ARG;
+ *r_g = NULL;
+ if (!factors || !prime)
+ return GPG_ERR_INV_ARG;
+
+ for (n=0; factors[n]; n++)
+ ;
+ if (n < 2)
+ return GPG_ERR_INV_ARG;
+
+ tmp = mpi_new (0);
+ b = mpi_new (0);
+ pmin1 = mpi_new (0);
+ g = start_g? mpi_copy (start_g) : mpi_set_ui (NULL, 3);
+
+ /* Extra sanity check - usually disabled. */
+/* mpi_set (tmp, factors[0]); */
+/* for(i = 1; i < n; i++) */
+/* mpi_mul (tmp, tmp, factors[i]); */
+/* mpi_add_ui (tmp, tmp, 1); */
+/* if (mpi_cmp (prime, tmp)) */
+/* return gpg_error (GPG_ERR_INV_ARG); */
+
+ mpi_sub_ui (pmin1, prime, 1);
+ first = 1;
+ do
+ {
+ if (first)
+ first = 0;
+ else
+ mpi_add_ui (g, g, 1);
+
+ if (DBG_CIPHER)
+ log_printmpi ("checking g", g);
+ else
+ progress('^');
+
+ for (i = 0; i < n; i++)
+ {
+ mpi_fdiv_q (tmp, pmin1, factors[i]);
+ mpi_powm (b, g, tmp, prime);
+ if (! mpi_cmp_ui (b, 1))
+ break;
+ }
+ if (DBG_CIPHER)
+ progress('\n');
+ }
+ while (i < n);
+
+ _gcry_mpi_release (tmp);
+ _gcry_mpi_release (b);
+ _gcry_mpi_release (pmin1);
+ *r_g = g;
+
+ return 0;
+}
+
+/* Convenience function to release the factors array. */
+void
+_gcry_prime_release_factors (gcry_mpi_t *factors)
+{
+ if (factors)
+ {
+ int i;
+
+ for (i=0; factors[i]; i++)
+ mpi_free (factors[i]);
+ xfree (factors);
+ }
+}
+
+
+
+/* Helper for _gcry_derive_x931_prime. */
+static gcry_mpi_t
+find_x931_prime (const gcry_mpi_t pfirst)
+{
+ gcry_mpi_t val_2 = mpi_alloc_set_ui (2);
+ gcry_mpi_t prime;
+
+ prime = mpi_copy (pfirst);
+ /* If P is even add 1. */
+ mpi_set_bit (prime, 0);
+
+ /* We use 64 Rabin-Miller rounds, which is stronger than required and
+ thus sufficient. We do not have a Lucas test implementation, so we
+ can't do it in the X9.31 preferred way of running a few Rabin-Miller
+ rounds followed by one Lucas test. */
+ while ( !check_prime (prime, val_2, 64, NULL, NULL) )
+ mpi_add_ui (prime, prime, 2);
+
+ mpi_free (val_2);
+
+ return prime;
+}
+
+
+/* Generate a prime using the algorithm from X9.31 appendix B.4.
+
+ This function requires that the provided public exponent E is odd.
+ XP, XP1 and XP2 are the seed values. All values are mandatory.
+
+ On success the prime is returned. If R_P1 or R_P2 are given the
+ internal values P1 and P2 are saved at these addresses. On error
+ NULL is returned. */
+gcry_mpi_t
+_gcry_derive_x931_prime (const gcry_mpi_t xp,
+ const gcry_mpi_t xp1, const gcry_mpi_t xp2,
+ const gcry_mpi_t e,
+ gcry_mpi_t *r_p1, gcry_mpi_t *r_p2)
+{
+ gcry_mpi_t p1, p2, p1p2, yp0;
+
+ if (!xp || !xp1 || !xp2)
+ return NULL;
+ if (!e || !mpi_test_bit (e, 0))
+ return NULL; /* We support only odd values for E. */
+
+ p1 = find_x931_prime (xp1);
+ p2 = find_x931_prime (xp2);
+ p1p2 = mpi_alloc_like (xp);
+ mpi_mul (p1p2, p1, p2);
+
+ {
+ gcry_mpi_t r1, tmp;
+
+ /* r1 = (p2^{-1} mod p1)p2 - (p1^{-1} mod p2) */
+ tmp = mpi_alloc_like (p1);
+ mpi_invm (tmp, p2, p1);
+ mpi_mul (tmp, tmp, p2);
+ r1 = tmp;
+
+ tmp = mpi_alloc_like (p2);
+ mpi_invm (tmp, p1, p2);
+ mpi_mul (tmp, tmp, p1);
+ mpi_sub (r1, r1, tmp);
+
+ /* Fixup a negative value. */
+ if (mpi_has_sign (r1))
+ mpi_add (r1, r1, p1p2);
+
+ /* yp0 = xp + (r1 - xp mod p1*p2) */
+ yp0 = tmp; tmp = NULL;
+ mpi_subm (yp0, r1, xp, p1p2);
+ mpi_add (yp0, yp0, xp);
+ mpi_free (r1);
+
+ /* Fixup a negative value. */
+ if (mpi_cmp (yp0, xp) < 0 )
+ mpi_add (yp0, yp0, p1p2);
+ }
+
+ /* yp0 is now the first integer greater than xp with p1 being a
+ large prime factor of yp0-1 and p2 a large prime factor of yp0+1. */
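+
+ /* Why this holds: by construction r1 == 1 (mod p1), since the first
+    term (p2^{-1} mod p1)*p2 is 1 modulo p1 and the second term is a
+    multiple of p1; and r1 == -1 (mod p2), since there the first term
+    vanishes and the subtracted term (p1^{-1} mod p2)*p1 is 1 modulo
+    p2.  Because yp0 == r1 (mod p1*p2), p1 divides yp0-1 and p2
+    divides yp0+1 as stated above. */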
+
+ /* Note that the first example from X9.31 (D.1.1) which uses
+ (Xq1 #1A5CF72EE770DE50CB09ACCEA9#)
+ (Xq2 #134E4CAA16D2350A21D775C404#)
+ (Xq #CC1092495D867E64065DEE3E7955F2EBC7D47A2D
+ 7C9953388F97DDDC3E1CA19C35CA659EDC2FC325
+ 6D29C2627479C086A699A49C4C9CEE7EF7BD1B34
+ 321DE34A#))))
+ returns an yp0 of
+ #CC1092495D867E64065DEE3E7955F2EBC7D47A2D
+ 7C9953388F97DDDC3E1CA19C35CA659EDC2FC4E3
+ BF20CB896EE37E098A906313271422162CB6C642
+ 75C1201F#
+ and not
+ #CC1092495D867E64065DEE3E7955F2EBC7D47A2D
+ 7C9953388F97DDDC3E1CA19C35CA659EDC2FC2E6
+ C88FE299D52D78BE405A97E01FD71DD7819ECB91
+ FA85A076#
+ as stated in the standard. This seems to be a bug in X9.31.
+ */
+
+ {
+ gcry_mpi_t val_2 = mpi_alloc_set_ui (2);
+ gcry_mpi_t gcdtmp = mpi_alloc_like (yp0);
+ int gcdres;
+
+ mpi_sub_ui (p1p2, p1p2, 1); /* Adjust for loop body. */
+ mpi_sub_ui (yp0, yp0, 1); /* Ditto. */
+ for (;;)
+ {
+ gcdres = mpi_gcd (gcdtmp, e, yp0);
+ mpi_add_ui (yp0, yp0, 1);
+ if (!gcdres)
+ progress ('/'); /* gcd (e, yp0-1) != 1 */
+ else if (check_prime (yp0, val_2, 64, NULL, NULL))
+ break; /* Found. */
+ /* We add p1p2-1 because yp0 is incremented after the gcd test. */
+ mpi_add (yp0, yp0, p1p2);
+ }
+ mpi_free (gcdtmp);
+ mpi_free (val_2);
+ }
+
+ mpi_free (p1p2);
+
+ progress('\n');
+ if (r_p1)
+ *r_p1 = p1;
+ else
+ mpi_free (p1);
+ if (r_p2)
+ *r_p2 = p2;
+ else
+ mpi_free (p2);
+ return yp0;
+}
+
+
+
+/* Generate the two primes used for DSA using the algorithm specified
+ in FIPS 186-2. PBITS is the desired length of the prime P and QBITS
+ the length of the prime Q. If SEED is not supplied and SEEDLEN is 0
+ the function generates an appropriate SEED. On success the generated
+ primes are stored at R_Q and R_P, the counter value is stored at
+ R_COUNTER and the seed actually used for generation is stored at
+ R_SEED and R_SEEDLEN. */
+gpg_err_code_t
+_gcry_generate_fips186_2_prime (unsigned int pbits, unsigned int qbits,
+ const void *seed, size_t seedlen,
+ gcry_mpi_t *r_q, gcry_mpi_t *r_p,
+ int *r_counter,
+ void **r_seed, size_t *r_seedlen)
+{
+ gpg_err_code_t ec;
+ unsigned char seed_help_buffer[160/8]; /* Used to hold a generated SEED. */
+ unsigned char *seed_plus; /* Malloced buffer to hold SEED+x. */
+ unsigned char digest[160/8]; /* Helper buffer for SHA-1 digest. */
+ gcry_mpi_t val_2 = NULL; /* Helper for the prime test. */
+ gcry_mpi_t tmpval = NULL; /* Helper variable. */
+ int i;
+
+ unsigned char value_u[160/8];
+ int value_n, value_b, value_k;
+ int counter;
+ gcry_mpi_t value_w = NULL;
+ gcry_mpi_t value_x = NULL;
+ gcry_mpi_t prime_q = NULL;
+ gcry_mpi_t prime_p = NULL;
+
+ /* FIPS 186-2 allows only for 1024/160 bit. */
+ if (pbits != 1024 || qbits != 160)
+ return GPG_ERR_INV_KEYLEN;
+
+ if (!seed && !seedlen)
+ ; /* No seed value given: We are asked to generate it. */
+ else if (!seed || seedlen < qbits/8)
+ return GPG_ERR_INV_ARG;
+
+ /* Allocate a buffer to later compute SEED+some_increment. */
+ seed_plus = xtrymalloc (seedlen < 20? 20:seedlen);
+ if (!seed_plus)
+ {
+ ec = gpg_err_code_from_syserror ();
+ goto leave;
+ }
+
+ val_2 = mpi_alloc_set_ui (2);
+ value_n = (pbits - 1) / qbits;
+ value_b = (pbits - 1) - value_n * qbits;
+ value_w = mpi_new (pbits);
+ value_x = mpi_new (pbits);
+
+ restart:
+ /* Generate Q. */
+ for (;;)
+ {
+ /* Step 1: Generate a (new) seed unless one has been supplied. */
+ if (!seed)
+ {
+ seedlen = sizeof seed_help_buffer;
+ _gcry_create_nonce (seed_help_buffer, seedlen);
+ seed = seed_help_buffer;
+ }
+
+ /* Step 2: U = sha1(seed) ^ sha1((seed+1) mod 2^{qbits}) */
+ memcpy (seed_plus, seed, seedlen);
+ for (i=seedlen-1; i >= 0; i--)
+ {
+ seed_plus[i]++;
+ if (seed_plus[i])
+ break;
+ }
+ _gcry_md_hash_buffer (GCRY_MD_SHA1, value_u, seed, seedlen);
+ _gcry_md_hash_buffer (GCRY_MD_SHA1, digest, seed_plus, seedlen);
+ for (i=0; i < sizeof value_u; i++)
+ value_u[i] ^= digest[i];
+
+ /* Step 3: Form q from U */
+ _gcry_mpi_release (prime_q); prime_q = NULL;
+ ec = _gcry_mpi_scan (&prime_q, GCRYMPI_FMT_USG,
+ value_u, sizeof value_u, NULL);
+ if (ec)
+ goto leave;
+ mpi_set_highbit (prime_q, qbits-1 );
+ mpi_set_bit (prime_q, 0);
+
+ /* Step 4: Test whether Q is prime using 64 rounds of Rabin-Miller. */
+ if (check_prime (prime_q, val_2, 64, NULL, NULL))
+ break; /* Yes, Q is prime. */
+
+ /* Step 5. */
+ seed = NULL; /* Force a new seed at Step 1. */
+ }
+
+ /* Step 6. Note that we do not use an explicit offset but increment
+ SEED_PLUS accordingly. SEED_PLUS is currently SEED+1. */
+ counter = 0;
+
+ /* Generate P. */
+ prime_p = mpi_new (pbits);
+ for (;;)
+ {
+ /* Step 7: For k = 0,...n let
+ V_k = sha1(seed+offset+k) mod 2^{qbits}
+ Step 8: W = V_0 + V_1*2^160 +
+ ...
+ + V_{n-1}*2^{(n-1)*160}
+ + (V_{n} mod 2^b)*2^{n*160}
+ */
+ mpi_set_ui (value_w, 0);
+ for (value_k=0; value_k <= value_n; value_k++)
+ {
+ /* There is no need to have an explicit offset variable: In the
+ first round we shall have an offset of 2; this is achieved by
+ using SEED_PLUS, which is already at SEED+1, so we just need to
+ increment it once again. The requirement for the next round is
+ to update the offset by N, which we implicitly did at the end
+ of this loop, and then to add one; this one is the same as in
+ the first round. */
+ for (i=seedlen-1; i >= 0; i--)
+ {
+ seed_plus[i]++;
+ if (seed_plus[i])
+ break;
+ }
+ _gcry_md_hash_buffer (GCRY_MD_SHA1, digest, seed_plus, seedlen);
+
+ _gcry_mpi_release (tmpval); tmpval = NULL;
+ ec = _gcry_mpi_scan (&tmpval, GCRYMPI_FMT_USG,
+ digest, sizeof digest, NULL);
+ if (ec)
+ goto leave;
+ if (value_k == value_n)
+ mpi_clear_highbit (tmpval, value_b); /* (V_n mod 2^b) */
+ mpi_lshift (tmpval, tmpval, value_k*qbits);
+ mpi_add (value_w, value_w, tmpval);
+ }
+
+ /* Step 8 continued: X = W + 2^{L-1} */
+ mpi_set_ui (value_x, 0);
+ mpi_set_highbit (value_x, pbits-1);
+ mpi_add (value_x, value_x, value_w);
+
+ /* Step 9: c = X mod 2q, p = X - (c - 1) */
+ mpi_mul_2exp (tmpval, prime_q, 1);
+ mpi_mod (tmpval, value_x, tmpval);
+ mpi_sub_ui (tmpval, tmpval, 1);
+ mpi_sub (prime_p, value_x, tmpval);
+
+ /* Step 10: If p < 2^{L-1} skip the primality test. */
+ /* Step 11 and 12: Primality test. */
+ if (mpi_get_nbits (prime_p) >= pbits-1
+ && check_prime (prime_p, val_2, 64, NULL, NULL) )
+ break; /* Yes, P is prime, continue with Step 15. */
+
+ /* Step 13: counter = counter + 1, offset = offset + n + 1. */
+ counter++;
+
+ /* Step 14: If counter >= 2^12 goto Step 1. */
+ if (counter >= 4096)
+ goto restart;
+ }
+
+ /* Step 15: Save p, q, counter and seed. */
+/* log_debug ("fips186-2 pbits p=%u q=%u counter=%d\n", */
+/* mpi_get_nbits (prime_p), mpi_get_nbits (prime_q), counter); */
+/* log_printhex("fips186-2 seed:", seed, seedlen); */
+/* log_mpidump ("fips186-2 prime p", prime_p); */
+/* log_mpidump ("fips186-2 prime q", prime_q); */
+ if (r_q)
+ {
+ *r_q = prime_q;
+ prime_q = NULL;
+ }
+ if (r_p)
+ {
+ *r_p = prime_p;
+ prime_p = NULL;
+ }
+ if (r_counter)
+ *r_counter = counter;
+ if (r_seed && r_seedlen)
+ {
+ memcpy (seed_plus, seed, seedlen);
+ *r_seed = seed_plus;
+ seed_plus = NULL;
+ *r_seedlen = seedlen;
+ }
+
+
+ leave:
+ _gcry_mpi_release (tmpval);
+ _gcry_mpi_release (value_x);
+ _gcry_mpi_release (value_w);
+ _gcry_mpi_release (prime_p);
+ _gcry_mpi_release (prime_q);
+ xfree (seed_plus);
+ _gcry_mpi_release (val_2);
+ return ec;
+}
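+
+/* A minimal usage sketch (the error handling is illustrative only;
+   1024/160 are the only sizes accepted by this function):
+
+     gcry_mpi_t p, q;
+     int counter;
+     void *seed;
+     size_t seedlen;
+
+     if (!_gcry_generate_fips186_2_prime (1024, 160, NULL, 0,
+                                          &q, &p, &counter,
+                                          &seed, &seedlen))
+       {
+         ... use P and Q; SEED and COUNTER allow the generation to be
+             verified later ...
+         _gcry_mpi_release (p);
+         _gcry_mpi_release (q);
+         xfree (seed);
+       }
+ */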
+
+
+
+/* WARNING: The code below has not yet been tested!
+ *
+ * Generate the two primes used for DSA using the algorithm specified
+ * in FIPS 186-3, A.1.1.2. PBITS is the desired length of the prime P
+ * and QBITS the length of the prime Q. If SEED is not supplied and
+ * SEEDLEN is 0 the function generates an appropriate SEED. On
+ * success the generated primes are stored at R_Q and R_P, the counter
+ * value is stored at R_COUNTER and the seed actually used for
+ * generation is stored at R_SEED and R_SEEDLEN. The hash algorithm
+ * used is stored at R_HASHALGO.
+ *
+ * Note that this function is very similar to the fips186_2 code. Due
+ * to the minor differences, the different buffer sizes and for
+ * documentation, we use a separate function.
+ */
+gpg_err_code_t
+_gcry_generate_fips186_3_prime (unsigned int pbits, unsigned int qbits,
+ const void *seed, size_t seedlen,
+ gcry_mpi_t *r_q, gcry_mpi_t *r_p,
+ int *r_counter,
+ void **r_seed, size_t *r_seedlen,
+ int *r_hashalgo)
+{
+ gpg_err_code_t ec;
+ unsigned char seed_help_buffer[256/8]; /* Used to hold a generated SEED. */
+ unsigned char *seed_plus; /* Malloced buffer to hold SEED+x. */
+ unsigned char digest[256/8]; /* Helper buffer for SHA-2 digest. */
+ gcry_mpi_t val_2 = NULL; /* Helper for the prime test. */
+ gcry_mpi_t tmpval = NULL; /* Helper variable. */
+ int hashalgo; /* The id of the Approved Hash Function. */
+ int i;
+
+ unsigned char value_u[256/8];
+ int value_n, value_b, value_j;
+ int counter;
+ gcry_mpi_t value_w = NULL;
+ gcry_mpi_t value_x = NULL;
+ gcry_mpi_t prime_q = NULL;
+ gcry_mpi_t prime_p = NULL;
+
+ gcry_assert (sizeof seed_help_buffer == sizeof digest
+ && sizeof seed_help_buffer == sizeof value_u);
+
+ /* Step 1: Check the requested prime lengths. */
+ /* Note that due to the size of our buffers QBITS is limited to 256. */
+ if (pbits == 2048 && qbits == 224)
+ hashalgo = GCRY_MD_SHA224;
+ else if (pbits == 2048 && qbits == 256)
+ hashalgo = GCRY_MD_SHA256;
+ else if (pbits == 3072 && qbits == 256)
+ hashalgo = GCRY_MD_SHA256;
+ else
+ return GPG_ERR_INV_KEYLEN;
+
+ /* Also check that the hash algorithm is available. */
+ ec = _gcry_md_test_algo (hashalgo);
+ if (ec)
+ return ec;
+ gcry_assert (qbits/8 <= sizeof digest);
+ gcry_assert (_gcry_md_get_algo_dlen (hashalgo) == qbits/8);
+
+
+ /* Step 2: Check seedlen. */
+ if (!seed && !seedlen)
+ ; /* No seed value given: We are asked to generate it. */
+ else if (!seed || seedlen < qbits/8)
+ return GPG_ERR_INV_ARG;
+
+ /* Allocate a buffer to later compute SEED+some_increment and a few
+ helper variables. */
+ seed_plus = xtrymalloc (seedlen < sizeof seed_help_buffer?
+ sizeof seed_help_buffer : seedlen);
+ if (!seed_plus)
+ {
+ ec = gpg_err_code_from_syserror ();
+ goto leave;
+ }
+ val_2 = mpi_alloc_set_ui (2);
+ value_w = mpi_new (pbits);
+ value_x = mpi_new (pbits);
+
+ /* Step 3: n = \lceil L / outlen \rceil - 1 */
+ value_n = (pbits + qbits - 1) / qbits - 1;
+ /* Step 4: b = L - 1 - (n * outlen) */
+ value_b = pbits - 1 - (value_n * qbits);
+
+ restart:
+ /* Generate Q. */
+ for (;;)
+ {
+ /* Step 5: Generate a (new) seed unless one has been supplied. */
+ if (!seed)
+ {
+ seedlen = qbits/8;
+ gcry_assert (seedlen <= sizeof seed_help_buffer);
+ _gcry_create_nonce (seed_help_buffer, seedlen);
+ seed = seed_help_buffer;
+ }
+
+ /* Step 6: U = hash(seed) */
+ _gcry_md_hash_buffer (hashalgo, value_u, seed, seedlen);
+
+ /* Step 7: q = 2^{N-1} + U + 1 - (U mod 2) */
+ if ( !(value_u[qbits/8-1] & 0x01) )
+ {
+ for (i=qbits/8-1; i >= 0; i--)
+ {
+ value_u[i]++;
+ if (value_u[i])
+ break;
+ }
+ }
+ _gcry_mpi_release (prime_q); prime_q = NULL;
+ ec = _gcry_mpi_scan (&prime_q, GCRYMPI_FMT_USG,
+ value_u, qbits/8, NULL);
+ if (ec)
+ goto leave;
+ mpi_set_highbit (prime_q, qbits-1 );
+
+ /* Step 8: Test whether Q is prime using 64 rounds of Rabin-Miller.
+ According to table C.1 this is sufficient for all
+ supported prime sizes (i.e. up to 3072/256). */
+ if (check_prime (prime_q, val_2, 64, NULL, NULL))
+ break; /* Yes, Q is prime. */
+
+ /* Step 9. */
+ seed = NULL; /* Force a new seed at Step 5. */
+ }
+
+ /* Step 11. Note that we do not use an explicit offset but increment
+ SEED_PLUS accordingly. */
+ memcpy (seed_plus, seed, seedlen);
+ counter = 0;
+
+ /* Generate P. */
+ prime_p = mpi_new (pbits);
+ for (;;)
+ {
+ /* Step 11.1: For j = 0,...n let
+ V_j = hash(seed+offset+j)
+ Step 11.2: W = V_0 + V_1*2^outlen +
+ ...
+ + V_{n-1}*2^{(n-1)*outlen}
+ + (V_{n} mod 2^b)*2^{n*outlen}
+ */
+ mpi_set_ui (value_w, 0);
+ for (value_j=0; value_j <= value_n; value_j++)
+ {
+ /* There is no need to have an explicit offset variable: In
+ the first round we shall have an offset of 1 and a j of
+ 0. This is achieved by incrementing SEED_PLUS here. For
+ the next round offset is implicitly updated by using
+ SEED_PLUS again. */
+ for (i=seedlen-1; i >= 0; i--)
+ {
+ seed_plus[i]++;
+ if (seed_plus[i])
+ break;
+ }
+ _gcry_md_hash_buffer (hashalgo, digest, seed_plus, seedlen);
+
+ _gcry_mpi_release (tmpval); tmpval = NULL;
+ ec = _gcry_mpi_scan (&tmpval, GCRYMPI_FMT_USG,
+ digest, qbits/8, NULL);
+ if (ec)
+ goto leave;
+ if (value_j == value_n)
+ mpi_clear_highbit (tmpval, value_b); /* (V_n mod 2^b) */
+ mpi_lshift (tmpval, tmpval, value_j*qbits);
+ mpi_add (value_w, value_w, tmpval);
+ }
+
+ /* Step 11.3: X = W + 2^{L-1} */
+ mpi_set_ui (value_x, 0);
+ mpi_set_highbit (value_x, pbits-1);
+ mpi_add (value_x, value_x, value_w);
+
+ /* Step 11.4: c = X mod 2q */
+ mpi_mul_2exp (tmpval, prime_q, 1);
+ mpi_mod (tmpval, value_x, tmpval);
+
+ /* Step 11.5: p = X - (c - 1) */
+ mpi_sub_ui (tmpval, tmpval, 1);
+ mpi_sub (prime_p, value_x, tmpval);
+
+ /* Step 11.6: If p < 2^{L-1} skip the primality test. */
+ /* Step 11.7 and 11.8: Primality test. */
+ if (mpi_get_nbits (prime_p) >= pbits-1
+ && check_prime (prime_p, val_2, 64, NULL, NULL) )
+ break; /* Yes, P is prime, continue with Step 15. */
+
+ /* Step 11.9: counter = counter + 1, offset = offset + n + 1.
+ If counter >= 4L goto Step 5. */
+ counter++;
+ if (counter >= 4*pbits)
+ goto restart;
+ }
+
+ /* Step 12: Save p, q, counter and seed. */
+ /* log_debug ("fips186-3 pbits p=%u q=%u counter=%d\n", */
+ /* mpi_get_nbits (prime_p), mpi_get_nbits (prime_q), counter); */
+ /* log_printhex ("fips186-3 seed", seed, seedlen); */
+ /* log_printmpi ("fips186-3 p", prime_p); */
+ /* log_printmpi ("fips186-3 q", prime_q); */
+
+ if (r_q)
+ {
+ *r_q = prime_q;
+ prime_q = NULL;
+ }
+ if (r_p)
+ {
+ *r_p = prime_p;
+ prime_p = NULL;
+ }
+ if (r_counter)
+ *r_counter = counter;
+ if (r_seed && r_seedlen)
+ {
+ memcpy (seed_plus, seed, seedlen);
+ *r_seed = seed_plus;
+ seed_plus = NULL;
+ *r_seedlen = seedlen;
+ }
+ if (r_hashalgo)
+ *r_hashalgo = hashalgo;
+
+ leave:
+ _gcry_mpi_release (tmpval);
+ _gcry_mpi_release (value_x);
+ _gcry_mpi_release (value_w);
+ _gcry_mpi_release (prime_p);
+ _gcry_mpi_release (prime_q);
+ xfree (seed_plus);
+ _gcry_mpi_release (val_2);
+ return ec;
+}
diff --git a/comm/third_party/libgcrypt/cipher/pubkey-internal.h b/comm/third_party/libgcrypt/cipher/pubkey-internal.h
new file mode 100644
index 0000000000..d31e26f392
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/pubkey-internal.h
@@ -0,0 +1,105 @@
+/* pubkey-internal.h - Internal defs for pubkey.c
+ * Copyright (C) 2013 g10 code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser general Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_PUBKEY_INTERNAL_H
+#define GCRY_PUBKEY_INTERNAL_H
+
+/*-- pubkey-util.c --*/
+gpg_err_code_t _gcry_pk_util_parse_flaglist (gcry_sexp_t list,
+ int *r_flags,
+ enum pk_encoding *r_encoding);
+gpg_err_code_t _gcry_pk_util_get_nbits (gcry_sexp_t list,
+ unsigned int *r_nbits);
+gpg_err_code_t _gcry_pk_util_get_rsa_use_e (gcry_sexp_t list,
+ unsigned long *r_e);
+gpg_err_code_t _gcry_pk_util_preparse_sigval (gcry_sexp_t s_sig,
+ const char **algo_names,
+ gcry_sexp_t *r_parms,
+ int *r_eccflags);
+gpg_err_code_t _gcry_pk_util_preparse_encval (gcry_sexp_t sexp,
+ const char **algo_names,
+ gcry_sexp_t *r_parms,
+ struct pk_encoding_ctx *ctx);
+void _gcry_pk_util_init_encoding_ctx (struct pk_encoding_ctx *ctx,
+ enum pk_operation op,
+ unsigned int nbits);
+void _gcry_pk_util_free_encoding_ctx (struct pk_encoding_ctx *ctx);
+gcry_err_code_t _gcry_pk_util_data_to_mpi (gcry_sexp_t input,
+ gcry_mpi_t *ret_mpi,
+ struct pk_encoding_ctx *ctx);
+
+
+
+/*-- rsa-common.c --*/
+gpg_err_code_t
+_gcry_rsa_pkcs1_encode_for_enc (gcry_mpi_t *r_result, unsigned int nbits,
+ const unsigned char *value, size_t valuelen,
+ const unsigned char *random_override,
+ size_t random_override_len);
+gpg_err_code_t
+_gcry_rsa_pkcs1_decode_for_enc (unsigned char **r_result, size_t *r_resultlen,
+ unsigned int nbits, gcry_mpi_t value);
+gpg_err_code_t
+_gcry_rsa_pkcs1_encode_raw_for_sig (gcry_mpi_t *r_result, unsigned int nbits,
+ const unsigned char *value, size_t valuelen);
+
+gpg_err_code_t
+_gcry_rsa_pkcs1_encode_for_sig (gcry_mpi_t *r_result, unsigned int nbits,
+ const unsigned char *value, size_t valuelen,
+ int algo);
+gpg_err_code_t
+_gcry_rsa_oaep_encode (gcry_mpi_t *r_result, unsigned int nbits, int algo,
+ const unsigned char *value, size_t valuelen,
+ const unsigned char *label, size_t labellen,
+ const void *random_override, size_t random_override_len);
+gpg_err_code_t
+_gcry_rsa_oaep_decode (unsigned char **r_result, size_t *r_resultlen,
+ unsigned int nbits, int algo,
+ gcry_mpi_t value,
+ const unsigned char *label, size_t labellen);
+gpg_err_code_t
+_gcry_rsa_pss_encode (gcry_mpi_t *r_result, unsigned int nbits, int algo,
+ const unsigned char *value, size_t valuelen, int saltlen,
+ const void *random_override, size_t random_override_len);
+gpg_err_code_t
+_gcry_rsa_pss_verify (gcry_mpi_t value, gcry_mpi_t encoded,
+ unsigned int nbits, int algo, size_t saltlen);
+
+
+
+/*-- dsa-common.c --*/
+void _gcry_dsa_modify_k (gcry_mpi_t k, gcry_mpi_t q, int qbits);
+gcry_mpi_t _gcry_dsa_gen_k (gcry_mpi_t q, int security_level);
+gpg_err_code_t _gcry_dsa_gen_rfc6979_k (gcry_mpi_t *r_k,
+ gcry_mpi_t dsa_q, gcry_mpi_t dsa_x,
+ const unsigned char *h1,
+ unsigned int h1len,
+ int halgo,
+ unsigned int extraloops);
+
+gpg_err_code_t _gcry_dsa_normalize_hash (gcry_mpi_t input,
+ gcry_mpi_t *out,
+ unsigned int qbits);
+
+/*-- ecc.c --*/
+gpg_err_code_t _gcry_pk_ecc_get_sexp (gcry_sexp_t *r_sexp, int mode,
+ mpi_ec_t ec);
+
+
+#endif /*GCRY_PUBKEY_INTERNAL_H*/
diff --git a/comm/third_party/libgcrypt/cipher/pubkey-util.c b/comm/third_party/libgcrypt/cipher/pubkey-util.c
new file mode 100644
index 0000000000..7ddef7dc31
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/pubkey-util.c
@@ -0,0 +1,1160 @@
+/* pubkey-util.c - Supporting functions for all pubkey modules.
+ * Copyright (C) 1998, 1999, 2000, 2002, 2003, 2005,
+ * 2007, 2008, 2011 Free Software Foundation, Inc.
+ * Copyright (C) 2013, 2015 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+#include "mpi.h"
+#include "cipher.h"
+#include "pubkey-internal.h"
+
+
+/* Callback for the pubkey algorithm code to verify PSS signatures.
+ OPAQUE is the data provided by the actual caller. The meaning of
+ TMP depends on the actual algorithm (but there is only RSA); now
+ for RSA it is the output of running the public key function on the
+ input. */
+static int
+pss_verify_cmp (void *opaque, gcry_mpi_t tmp)
+{
+ struct pk_encoding_ctx *ctx = opaque;
+ gcry_mpi_t hash = ctx->verify_arg;
+
+ return _gcry_rsa_pss_verify (hash, tmp, ctx->nbits - 1,
+ ctx->hash_algo, ctx->saltlen);
+}
+
+
+/* Parser for a flag list. On return the encoding is stored at
+ R_ENCODING and the flags are stored at R_FLAGS. If any of them is
+ not needed, NULL may be passed. The function returns 0 on success
+ or an error code. */
+gpg_err_code_t
+_gcry_pk_util_parse_flaglist (gcry_sexp_t list,
+ int *r_flags, enum pk_encoding *r_encoding)
+{
+ gpg_err_code_t rc = 0;
+ const char *s;
+ size_t n;
+ int i;
+ int encoding = PUBKEY_ENC_UNKNOWN;
+ int flags = 0;
+ int igninvflag = 0;
+
+ for (i = list ? sexp_length (list)-1 : 0; i > 0; i--)
+ {
+ s = sexp_nth_data (list, i, &n);
+ if (!s)
+ continue; /* Not a data element. */
+
+ switch (n)
+ {
+ case 3:
+ if (!memcmp (s, "pss", 3) && encoding == PUBKEY_ENC_UNKNOWN)
+ {
+ encoding = PUBKEY_ENC_PSS;
+ flags |= PUBKEY_FLAG_FIXEDLEN;
+ }
+ else if (!memcmp (s, "raw", 3) && encoding == PUBKEY_ENC_UNKNOWN)
+ {
+ encoding = PUBKEY_ENC_RAW;
+ flags |= PUBKEY_FLAG_RAW_FLAG; /* Explicitly given. */
+ }
+ else if (!memcmp (s, "sm2", 3))
+ {
+ encoding = PUBKEY_ENC_RAW;
+ flags |= PUBKEY_FLAG_SM2 | PUBKEY_FLAG_RAW_FLAG;
+ }
+ else if (!igninvflag)
+ rc = GPG_ERR_INV_FLAG;
+ break;
+
+ case 4:
+ if (!memcmp (s, "comp", 4))
+ flags |= PUBKEY_FLAG_COMP;
+ else if (!memcmp (s, "oaep", 4) && encoding == PUBKEY_ENC_UNKNOWN)
+ {
+ encoding = PUBKEY_ENC_OAEP;
+ flags |= PUBKEY_FLAG_FIXEDLEN;
+ }
+ else if (!memcmp (s, "gost", 4))
+ {
+ encoding = PUBKEY_ENC_RAW;
+ flags |= PUBKEY_FLAG_GOST;
+ }
+ else if (!igninvflag)
+ rc = GPG_ERR_INV_FLAG;
+ break;
+
+ case 5:
+ if (!memcmp (s, "eddsa", 5))
+ {
+ encoding = PUBKEY_ENC_RAW;
+ flags |= PUBKEY_FLAG_EDDSA;
+ flags |= PUBKEY_FLAG_DJB_TWEAK;
+ }
+ else if (!memcmp (s, "pkcs1", 5) && encoding == PUBKEY_ENC_UNKNOWN)
+ {
+ encoding = PUBKEY_ENC_PKCS1;
+ flags |= PUBKEY_FLAG_FIXEDLEN;
+ }
+ else if (!memcmp (s, "param", 5))
+ flags |= PUBKEY_FLAG_PARAM;
+ else if (!igninvflag)
+ rc = GPG_ERR_INV_FLAG;
+ break;
+
+ case 6:
+ if (!memcmp (s, "nocomp", 6))
+ flags |= PUBKEY_FLAG_NOCOMP;
+ else if (!igninvflag)
+ rc = GPG_ERR_INV_FLAG;
+ break;
+
+ case 7:
+ if (!memcmp (s, "rfc6979", 7))
+ flags |= PUBKEY_FLAG_RFC6979;
+ else if (!memcmp (s, "noparam", 7))
+ ; /* Ignore - it is the default. */
+ else if (!memcmp (s, "prehash", 7))
+ flags |= PUBKEY_FLAG_PREHASH;
+ else if (!igninvflag)
+ rc = GPG_ERR_INV_FLAG;
+ break;
+
+ case 8:
+ if (!memcmp (s, "use-x931", 8))
+ flags |= PUBKEY_FLAG_USE_X931;
+ else if (!igninvflag)
+ rc = GPG_ERR_INV_FLAG;
+ break;
+
+ case 9:
+ if (!memcmp (s, "pkcs1-raw", 9) && encoding == PUBKEY_ENC_UNKNOWN)
+ {
+ encoding = PUBKEY_ENC_PKCS1_RAW;
+ flags |= PUBKEY_FLAG_FIXEDLEN;
+ }
+ else if (!memcmp (s, "djb-tweak", 9))
+ {
+ encoding = PUBKEY_ENC_RAW;
+ flags |= PUBKEY_FLAG_DJB_TWEAK;
+ }
+ else if (!igninvflag)
+ rc = GPG_ERR_INV_FLAG;
+ break;
+
+ case 10:
+ if (!memcmp (s, "igninvflag", 10))
+ igninvflag = 1;
+ else if (!memcmp (s, "no-keytest", 10))
+ flags |= PUBKEY_FLAG_NO_KEYTEST;
+ else if (!igninvflag)
+ rc = GPG_ERR_INV_FLAG;
+ break;
+
+ case 11:
+ if (!memcmp (s, "no-blinding", 11))
+ flags |= PUBKEY_FLAG_NO_BLINDING;
+ else if (!memcmp (s, "use-fips186", 11))
+ flags |= PUBKEY_FLAG_USE_FIPS186;
+ else if (!igninvflag)
+ rc = GPG_ERR_INV_FLAG;
+ break;
+
+ case 13:
+ if (!memcmp (s, "use-fips186-2", 13))
+ flags |= PUBKEY_FLAG_USE_FIPS186_2;
+ else if (!memcmp (s, "transient-key", 13))
+ flags |= PUBKEY_FLAG_TRANSIENT_KEY;
+ else if (!igninvflag)
+ rc = GPG_ERR_INV_FLAG;
+ break;
+
+ default:
+ if (!igninvflag)
+ rc = GPG_ERR_INV_FLAG;
+ break;
+ }
+ }
+
+ if (r_flags)
+ *r_flags = flags;
+ if (r_encoding)
+ *r_encoding = encoding;
+
+ return rc;
+}
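+
+/* For example (purely illustrative), the flag list
+     (flags rfc6979 no-blinding)
+   yields *R_FLAGS == (PUBKEY_FLAG_RFC6979 | PUBKEY_FLAG_NO_BLINDING)
+   and leaves *R_ENCODING at PUBKEY_ENC_UNKNOWN, whereas
+     (flags pss)
+   sets *R_ENCODING to PUBKEY_ENC_PSS and adds PUBKEY_FLAG_FIXEDLEN. */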
+
+
+static int
+get_hash_algo (const char *s, size_t n)
+{
+ static const struct { const char *name; int algo; } hashnames[] = {
+ { "sha1", GCRY_MD_SHA1 },
+ { "md5", GCRY_MD_MD5 },
+ { "sha256", GCRY_MD_SHA256 },
+ { "ripemd160", GCRY_MD_RMD160 },
+ { "rmd160", GCRY_MD_RMD160 },
+ { "sha384", GCRY_MD_SHA384 },
+ { "sha512", GCRY_MD_SHA512 },
+ { "sha224", GCRY_MD_SHA224 },
+ { "md2", GCRY_MD_MD2 },
+ { "md4", GCRY_MD_MD4 },
+ { "tiger", GCRY_MD_TIGER },
+ { "haval", GCRY_MD_HAVAL },
+ { "sha3-224", GCRY_MD_SHA3_224 },
+ { "sha3-256", GCRY_MD_SHA3_256 },
+ { "sha3-384", GCRY_MD_SHA3_384 },
+ { "sha3-512", GCRY_MD_SHA3_512 },
+ { "sm3", GCRY_MD_SM3 },
+ { "shake128", GCRY_MD_SHAKE128 },
+ { "shake256", GCRY_MD_SHAKE256 },
+ { NULL, 0 }
+ };
+ int algo;
+ int i;
+
+ for (i=0; hashnames[i].name; i++)
+ {
+ if ( strlen (hashnames[i].name) == n
+ && !memcmp (hashnames[i].name, s, n))
+ break;
+ }
+ if (hashnames[i].name)
+ algo = hashnames[i].algo;
+ else
+ {
+ /* In case of a hash algorithm that is not listed or was
+ dynamically allocated we fall back to this somewhat slower
+ method. It also allows using OIDs as
+ algorithm names. */
+ char *tmpname;
+
+ tmpname = xtrymalloc (n+1);
+ if (!tmpname)
+ algo = 0; /* Out of core - silently give up. */
+ else
+ {
+ memcpy (tmpname, s, n);
+ tmpname[n] = 0;
+ algo = _gcry_md_map_name (tmpname);
+ xfree (tmpname);
+ }
+ }
+ return algo;
+}
+
+
+/* Get the "nbits" parameter from an s-expression of the format:
+ *
+ * (algo
+ * (parameter_name_1 ....)
+ * ....
+ * (parameter_name_n ....))
+ *
+ * Example:
+ *
+ * (rsa
+ * (nbits 4:2048))
+ *
+ * On success the value for nbits is stored at R_NBITS. If no nbits
+ * parameter is found, the function returns success and stores 0 at
+ * R_NBITS. For parsing errors the function returns an error code and
+ * stores 0 at R_NBITS.
+ */
+gpg_err_code_t
+_gcry_pk_util_get_nbits (gcry_sexp_t list, unsigned int *r_nbits)
+{
+ char buf[50];
+ const char *s;
+ size_t n;
+
+ *r_nbits = 0;
+
+ list = sexp_find_token (list, "nbits", 0);
+ if (!list)
+ return 0; /* No NBITS found. */
+
+ s = sexp_nth_data (list, 1, &n);
+ if (!s || n >= DIM (buf) - 1 )
+ {
+ /* NBITS given without a value or with a value too large. */
+ sexp_release (list);
+ return GPG_ERR_INV_OBJ;
+ }
+ memcpy (buf, s, n);
+ buf[n] = 0;
+ *r_nbits = (unsigned int)strtoul (buf, NULL, 0);
+ sexp_release (list);
+ return 0;
+}
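+
+/* A minimal usage sketch (the S-expression literal is illustrative
+   only and assumes the internal sexp_build helper used elsewhere in
+   the library):
+
+     gcry_sexp_t parms;
+     unsigned int nbits;
+
+     if (!sexp_build (&parms, NULL, "(rsa (nbits 4:2048))"))
+       {
+         if (!_gcry_pk_util_get_nbits (parms, &nbits))
+           {
+             ... nbits is now 2048 ...
+           }
+         sexp_release (parms);
+       }
+ */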
+
+
+/* Get the optional "rsa-use-e" parameter from an s-expression of the
+ * format:
+ *
+ * (algo
+ * (parameter_name_1 ....)
+ * ....
+ * (parameter_name_n ....))
+ *
+ * Example:
+ *
+ * (rsa
+ * (nbits 4:2048)
+ * (rsa-use-e 2:41))
+ *
+ * On success the value for rsa-use-e is stored at R_E. If no rsa-use-e
+ * parameter is found, the function returns success and stores 65537 at
+ * R_E. For parsing errors the function returns an error code and
+ * stores 0 at R_E.
+ */
+gpg_err_code_t
+_gcry_pk_util_get_rsa_use_e (gcry_sexp_t list, unsigned long *r_e)
+{
+ char buf[50];
+ const char *s;
+ size_t n;
+
+ *r_e = 0;
+
+ list = sexp_find_token (list, "rsa-use-e", 0);
+ if (!list)
+ {
+ *r_e = 65537; /* Not given, use the value generated by old versions. */
+ return 0;
+ }
+
+ s = sexp_nth_data (list, 1, &n);
+ if (!s || n >= DIM (buf) - 1 )
+ {
+ /* No value or value too large. */
+ sexp_release (list);
+ return GPG_ERR_INV_OBJ;
+ }
+ memcpy (buf, s, n);
+ buf[n] = 0;
+ *r_e = strtoul (buf, NULL, 0);
+ sexp_release (list);
+ return 0;
+}
+
+
+/* Parse a "sig-val" s-expression and store the inner parameter list at
+ R_PARMS. ALGO_NAMES is used to verify that the algorithm in
+ "sig-val" is valid. Returns 0 on success and stores a new list at
+ R_PARMS which must be freed by the caller. On error R_PARMS is set
+ to NULL and an error code returned. If R_ECCFLAGS is not NULL flag
+ values are set into it; as of now they are only used with ecc
+ algorithms. */
+gpg_err_code_t
+_gcry_pk_util_preparse_sigval (gcry_sexp_t s_sig, const char **algo_names,
+ gcry_sexp_t *r_parms, int *r_eccflags)
+{
+ gpg_err_code_t rc;
+ gcry_sexp_t l1 = NULL;
+ gcry_sexp_t l2 = NULL;
+ char *name = NULL;
+ int i;
+
+ *r_parms = NULL;
+ if (r_eccflags)
+ *r_eccflags = 0;
+
+ /* Extract the signature value. */
+ l1 = sexp_find_token (s_sig, "sig-val", 0);
+ if (!l1)
+ {
+ rc = GPG_ERR_INV_OBJ; /* Does not contain a signature value object. */
+ goto leave;
+ }
+
+ l2 = sexp_nth (l1, 1);
+ if (!l2)
+ {
+ rc = GPG_ERR_NO_OBJ; /* No cadr for the sig object. */
+ goto leave;
+ }
+ name = sexp_nth_string (l2, 0);
+ if (!name)
+ {
+ rc = GPG_ERR_INV_OBJ; /* Invalid structure of object. */
+ goto leave;
+ }
+ else if (!strcmp (name, "flags"))
+ {
+ /* Skip a "flags" parameter and look again for the algorithm
+ name. The flags are not used here, but for the sake of
+ consistent S-expressions we need to handle them. */
+ sexp_release (l2);
+ l2 = sexp_nth (l1, 2);
+ if (!l2)
+ {
+ rc = GPG_ERR_INV_OBJ;
+ goto leave;
+ }
+ xfree (name);
+ name = sexp_nth_string (l2, 0);
+ if (!name)
+ {
+ rc = GPG_ERR_INV_OBJ; /* Invalid structure of object. */
+ goto leave;
+ }
+ }
+
+ for (i=0; algo_names[i]; i++)
+ if (!stricmp (name, algo_names[i]))
+ break;
+ if (!algo_names[i])
+ {
+ rc = GPG_ERR_CONFLICT; /* "sig-val" uses an unexpected algo. */
+ goto leave;
+ }
+ if (r_eccflags)
+ {
+ if (!strcmp (name, "eddsa"))
+ *r_eccflags = PUBKEY_FLAG_EDDSA;
+ if (!strcmp (name, "gost"))
+ *r_eccflags = PUBKEY_FLAG_GOST;
+ if (!strcmp (name, "sm2"))
+ *r_eccflags = PUBKEY_FLAG_SM2;
+ }
+
+ *r_parms = l2;
+ l2 = NULL;
+ rc = 0;
+
+ leave:
+ xfree (name);
+ sexp_release (l2);
+ sexp_release (l1);
+ return rc;
+}
+
+
+/* Parse a "enc-val" s-expression and store the inner parameter list
+ at R_PARMS. ALGO_NAMES is used to verify that the algorithm in
+ "enc-val" is valid. Returns 0 on success and stores a new list at
+ R_PARMS which must be freed by the caller. On error R_PARMS is set
+ to NULL and an error code returned. If R_ECCFLAGS is not NULL flag
+ values are set into it; as of now they are only used with ecc
+ algorithms.
+
+ (enc-val
+ [(flags [raw, pkcs1, oaep, no-blinding])]
+ [(hash-algo <algo>)]
+ [(label <label>)]
+ (<algo>
+ (<param_name1> <mpi>)
+ ...
+ (<param_namen> <mpi>)))
+
+ HASH-ALGO and LABEL are specific to OAEP. CTX will be updated with
+ encoding information. */
+gpg_err_code_t
+_gcry_pk_util_preparse_encval (gcry_sexp_t sexp, const char **algo_names,
+ gcry_sexp_t *r_parms,
+ struct pk_encoding_ctx *ctx)
+{
+ gcry_err_code_t rc = 0;
+ gcry_sexp_t l1 = NULL;
+ gcry_sexp_t l2 = NULL;
+ char *name = NULL;
+ size_t n;
+ int parsed_flags = 0;
+ int i;
+
+ *r_parms = NULL;
+
+ /* Check that the first element is valid. */
+ l1 = sexp_find_token (sexp, "enc-val" , 0);
+ if (!l1)
+ {
+ rc = GPG_ERR_INV_OBJ; /* Does not contain an encrypted value object. */
+ goto leave;
+ }
+
+ l2 = sexp_nth (l1, 1);
+ if (!l2)
+ {
+ rc = GPG_ERR_NO_OBJ; /* No cadr for the data object. */
+ goto leave;
+ }
+
+ /* Extract identifier of sublist. */
+ name = sexp_nth_string (l2, 0);
+ if (!name)
+ {
+ rc = GPG_ERR_INV_OBJ; /* Invalid structure of object. */
+ goto leave;
+ }
+
+ if (!strcmp (name, "flags"))
+ {
+ const char *s;
+
+ /* There is a flags element - process it. */
+ rc = _gcry_pk_util_parse_flaglist (l2, &parsed_flags, &ctx->encoding);
+ if (rc)
+ goto leave;
+ if (ctx->encoding == PUBKEY_ENC_PSS)
+ {
+ rc = GPG_ERR_CONFLICT;
+ goto leave;
+ }
+
+ /* Get the OAEP parameters HASH-ALGO and LABEL, if any. */
+ if (ctx->encoding == PUBKEY_ENC_OAEP)
+ {
+ /* Get HASH-ALGO. */
+ sexp_release (l2);
+ l2 = sexp_find_token (l1, "hash-algo", 0);
+ if (l2)
+ {
+ s = sexp_nth_data (l2, 1, &n);
+ if (!s)
+ rc = GPG_ERR_NO_OBJ;
+ else
+ {
+ ctx->hash_algo = get_hash_algo (s, n);
+ if (!ctx->hash_algo)
+ rc = GPG_ERR_DIGEST_ALGO;
+ }
+ if (rc)
+ goto leave;
+ }
+
+ /* Get LABEL. */
+ sexp_release (l2);
+ l2 = sexp_find_token (l1, "label", 0);
+ if (l2)
+ {
+ s = sexp_nth_data (l2, 1, &n);
+ if (!s)
+ rc = GPG_ERR_NO_OBJ;
+ else if (n > 0)
+ {
+ ctx->label = xtrymalloc (n);
+ if (!ctx->label)
+ rc = gpg_err_code_from_syserror ();
+ else
+ {
+ memcpy (ctx->label, s, n);
+ ctx->labellen = n;
+ }
+ }
+ if (rc)
+ goto leave;
+ }
+ }
+
+ /* Get the next which has the actual data - skip HASH-ALGO and LABEL. */
+ for (i = 2; (sexp_release (l2), l2 = sexp_nth (l1, i)); i++)
+ {
+ s = sexp_nth_data (l2, 0, &n);
+ if (!(n == 9 && !memcmp (s, "hash-algo", 9))
+ && !(n == 5 && !memcmp (s, "label", 5))
+ && !(n == 15 && !memcmp (s, "random-override", 15)))
+ break;
+ }
+ if (!l2)
+ {
+ rc = GPG_ERR_NO_OBJ; /* No cadr for the data object. */
+ goto leave;
+ }
+
+ /* Extract sublist identifier. */
+ xfree (name);
+ name = sexp_nth_string (l2, 0);
+ if (!name)
+ {
+ rc = GPG_ERR_INV_OBJ; /* Invalid structure of object. */
+ goto leave;
+ }
+ }
+ else /* No flags - flag as legacy structure. */
+ parsed_flags |= PUBKEY_FLAG_LEGACYRESULT;
+
+ for (i=0; algo_names[i]; i++)
+ if (!stricmp (name, algo_names[i]))
+ break;
+ if (!algo_names[i])
+ {
+ rc = GPG_ERR_CONFLICT; /* "enc-val" uses an unexpected algo. */
+ goto leave;
+ }
+
+ *r_parms = l2;
+ l2 = NULL;
+ ctx->flags |= parsed_flags;
+ rc = 0;
+
+ leave:
+ xfree (name);
+ sexp_release (l2);
+ sexp_release (l1);
+ return rc;
+}
+
+
+/* Initialize an encoding context. */
+void
+_gcry_pk_util_init_encoding_ctx (struct pk_encoding_ctx *ctx,
+ enum pk_operation op,
+ unsigned int nbits)
+{
+ ctx->op = op;
+ ctx->nbits = nbits;
+ ctx->encoding = PUBKEY_ENC_UNKNOWN;
+ ctx->flags = 0;
+ if (fips_mode ())
+ {
+ ctx->hash_algo = GCRY_MD_SHA256;
+ }
+ else
+ {
+ ctx->hash_algo = GCRY_MD_SHA1;
+ }
+ ctx->label = NULL;
+ ctx->labellen = 0;
+ ctx->saltlen = 20;
+ ctx->verify_cmp = NULL;
+ ctx->verify_arg = NULL;
+}
+
+/* Free a context initialized by _gcry_pk_util_init_encoding_ctx. */
+void
+_gcry_pk_util_free_encoding_ctx (struct pk_encoding_ctx *ctx)
+{
+ xfree (ctx->label);
+}
+
+
+/* Take the hash value and convert it into an MPI, suitable for
+ passing to the low level functions. We currently support the
+ old style way of passing just an MPI and the modern interface which
+ allows passing flags so that we can choose between raw and pkcs1
+ padding - more padding options may be added later.
+
+ (<mpi>)
+ or
+ (data
+ [(flags [raw, direct, pkcs1, oaep, pss,
+ no-blinding, rfc6979, eddsa, prehash])]
+ [(hash <algo> <value>)]
+ [(value <text>)]
+ [(hash-algo <algo>)]
+ [(label <label>)]
+ [(salt-length <length>)]
+ [(random-override <data>)]
+ )
+
+ Either the VALUE or the HASH element must be present for use
+ with signatures. VALUE is used for encryption.
+
+ HASH-ALGO is specific to OAEP and EDDSA.
+
+ LABEL is specific to OAEP.
+
+ SALT-LENGTH is for PSS; it is limited to 16384 bytes.
+
+ RANDOM-OVERRIDE is used to replace random nonces for regression
+ testing. */
+gcry_err_code_t
+_gcry_pk_util_data_to_mpi (gcry_sexp_t input, gcry_mpi_t *ret_mpi,
+ struct pk_encoding_ctx *ctx)
+{
+ gcry_err_code_t rc = 0;
+ gcry_sexp_t ldata, lhash, lvalue;
+ size_t n;
+ const char *s;
+ int unknown_flag = 0;
+ int parsed_flags = 0;
+
+ *ret_mpi = NULL;
+ ldata = sexp_find_token (input, "data", 0);
+ if (!ldata)
+ { /* assume old style */
+ int mpifmt = (ctx->flags & PUBKEY_FLAG_RAW_FLAG) ?
+ GCRYMPI_FMT_OPAQUE : GCRYMPI_FMT_STD;
+
+ *ret_mpi = sexp_nth_mpi (input, 0, mpifmt);
+ return *ret_mpi ? GPG_ERR_NO_ERROR : GPG_ERR_INV_OBJ;
+ }
+
+ /* See whether there is a flags list. */
+ {
+ gcry_sexp_t lflags = sexp_find_token (ldata, "flags", 0);
+ if (lflags)
+ {
+ if (_gcry_pk_util_parse_flaglist (lflags,
+ &parsed_flags, &ctx->encoding))
+ unknown_flag = 1;
+ sexp_release (lflags);
+ }
+ }
+
+ if (ctx->encoding == PUBKEY_ENC_UNKNOWN)
+ ctx->encoding = PUBKEY_ENC_RAW; /* default to raw */
+
+ /* Get HASH or MPI */
+ lhash = sexp_find_token (ldata, "hash", 0);
+ lvalue = lhash? NULL : sexp_find_token (ldata, "value", 0);
+
+ if (!(!lhash ^ !lvalue))
+ rc = GPG_ERR_INV_OBJ; /* none or both given */
+ else if (unknown_flag)
+ rc = GPG_ERR_INV_FLAG;
+ else if (ctx->encoding == PUBKEY_ENC_RAW
+ && ((parsed_flags & PUBKEY_FLAG_EDDSA)
+ || (ctx->flags & PUBKEY_FLAG_EDDSA)))
+ {
+ /* Prepare for EdDSA. */
+ gcry_sexp_t list;
+ void *value;
+ size_t valuelen;
+
+ if (!lvalue)
+ {
+ rc = GPG_ERR_INV_OBJ;
+ goto leave;
+ }
+ /* Hash algo is determined by curve. No hash-algo is OK. */
+ /* Get HASH-ALGO. */
+ list = sexp_find_token (ldata, "hash-algo", 0);
+ if (list)
+ {
+ s = sexp_nth_data (list, 1, &n);
+ if (!s)
+ rc = GPG_ERR_NO_OBJ;
+ else
+ {
+ ctx->hash_algo = get_hash_algo (s, n);
+ if (!ctx->hash_algo)
+ rc = GPG_ERR_DIGEST_ALGO;
+ }
+ sexp_release (list);
+ }
+ if (rc)
+ goto leave;
+
+ /* Get LABEL. */
+ list = sexp_find_token (ldata, "label", 0);
+ if (list)
+ {
+ s = sexp_nth_data (list, 1, &n);
+ if (!s)
+ rc = GPG_ERR_NO_OBJ;
+ else if (n > 0)
+ {
+ ctx->label = xtrymalloc (n);
+ if (!ctx->label)
+ rc = gpg_err_code_from_syserror ();
+ else
+ {
+ memcpy (ctx->label, s, n);
+ ctx->labellen = n;
+ }
+ }
+ sexp_release (list);
+ if (rc)
+ goto leave;
+ }
+
+ /* Get VALUE. */
+ value = sexp_nth_buffer (lvalue, 1, &valuelen);
+ if (!value)
+ {
+ /* We assume that a zero length message is meant by
+ "(value)". This is commonly used by test vectors. Note
+ that S-expressions do not allow zero length items. */
+ valuelen = 0;
+ value = xtrymalloc (1);
+ if (!value)
+ rc = gpg_err_code_from_syserror ();
+ }
+ else if ((valuelen * 8) < valuelen)
+ {
+ xfree (value);
+ rc = GPG_ERR_TOO_LARGE;
+ }
+ if (rc)
+ goto leave;
+
+ /* Note that mpi_set_opaque takes ownership of VALUE. */
+ *ret_mpi = mpi_set_opaque (NULL, value, valuelen*8);
+ }
+ else if (ctx->encoding == PUBKEY_ENC_RAW && lhash
+ && ((parsed_flags & PUBKEY_FLAG_RAW_FLAG)
+ || (parsed_flags & PUBKEY_FLAG_RFC6979)))
+ {
+ /* Raw encoding along with a hash element. This is commonly
+ used for DSA. For better backward error compatibility we
+ allow this only if either the rfc6979 flag has been given or
+ the raw flag was explicitly given. */
+ if (sexp_length (lhash) != 3)
+ rc = GPG_ERR_INV_OBJ;
+ else if ( !(s=sexp_nth_data (lhash, 1, &n)) || !n )
+ rc = GPG_ERR_INV_OBJ;
+ else
+ {
+ void *value;
+ size_t valuelen;
+
+ ctx->hash_algo = get_hash_algo (s, n);
+ if (!ctx->hash_algo)
+ rc = GPG_ERR_DIGEST_ALGO;
+ else if (!(value=sexp_nth_buffer (lhash, 2, &valuelen)))
+ rc = GPG_ERR_INV_OBJ;
+ else if ((valuelen * 8) < valuelen)
+ {
+ xfree (value);
+ rc = GPG_ERR_TOO_LARGE;
+ }
+ else
+ *ret_mpi = mpi_set_opaque (NULL, value, valuelen*8);
+ }
+ }
+ else if (ctx->encoding == PUBKEY_ENC_RAW && lvalue)
+ {
+ /* RFC6979 may only be used with a hash value and not the
+ MPI based value. */
+ if (parsed_flags & PUBKEY_FLAG_RFC6979)
+ {
+ rc = GPG_ERR_CONFLICT;
+ goto leave;
+ }
+
+ /* Get the value */
+ *ret_mpi = sexp_nth_mpi (lvalue, 1, GCRYMPI_FMT_USG);
+ if (!*ret_mpi)
+ rc = GPG_ERR_INV_OBJ;
+ }
+ else if (ctx->encoding == PUBKEY_ENC_PKCS1 && lvalue
+ && ctx->op == PUBKEY_OP_ENCRYPT)
+ {
+ const void * value;
+ size_t valuelen;
+ gcry_sexp_t list;
+ void *random_override = NULL;
+ size_t random_override_len = 0;
+
+ if ( !(value=sexp_nth_data (lvalue, 1, &valuelen)) || !valuelen )
+ rc = GPG_ERR_INV_OBJ;
+ else
+ {
+ /* Get optional RANDOM-OVERRIDE. */
+ list = sexp_find_token (ldata, "random-override", 0);
+ if (list)
+ {
+ s = sexp_nth_data (list, 1, &n);
+ if (!s)
+ rc = GPG_ERR_NO_OBJ;
+ else if (n > 0)
+ {
+ random_override = xtrymalloc (n);
+ if (!random_override)
+ rc = gpg_err_code_from_syserror ();
+ else
+ {
+ memcpy (random_override, s, n);
+ random_override_len = n;
+ }
+ }
+ sexp_release (list);
+ if (rc)
+ goto leave;
+ }
+
+ rc = _gcry_rsa_pkcs1_encode_for_enc (ret_mpi, ctx->nbits,
+ value, valuelen,
+ random_override,
+ random_override_len);
+ xfree (random_override);
+ }
+ }
+ else if (ctx->encoding == PUBKEY_ENC_PKCS1 && lhash
+ && (ctx->op == PUBKEY_OP_SIGN || ctx->op == PUBKEY_OP_VERIFY))
+ {
+ if (sexp_length (lhash) != 3)
+ rc = GPG_ERR_INV_OBJ;
+ else if ( !(s=sexp_nth_data (lhash, 1, &n)) || !n )
+ rc = GPG_ERR_INV_OBJ;
+ else
+ {
+ const void * value;
+ size_t valuelen;
+
+ ctx->hash_algo = get_hash_algo (s, n);
+
+ if (!ctx->hash_algo)
+ rc = GPG_ERR_DIGEST_ALGO;
+ else if ( !(value=sexp_nth_data (lhash, 2, &valuelen))
+ || !valuelen )
+ rc = GPG_ERR_INV_OBJ;
+ else
+ rc = _gcry_rsa_pkcs1_encode_for_sig (ret_mpi, ctx->nbits,
+ value, valuelen,
+ ctx->hash_algo);
+ }
+ }
+ else if (ctx->encoding == PUBKEY_ENC_PKCS1_RAW && lvalue
+ && (ctx->op == PUBKEY_OP_SIGN || ctx->op == PUBKEY_OP_VERIFY))
+ {
+ const void * value;
+ size_t valuelen;
+
+ if (sexp_length (lvalue) != 2)
+ rc = GPG_ERR_INV_OBJ;
+ else if ( !(value=sexp_nth_data (lvalue, 1, &valuelen))
+ || !valuelen )
+ rc = GPG_ERR_INV_OBJ;
+ else
+ rc = _gcry_rsa_pkcs1_encode_raw_for_sig (ret_mpi, ctx->nbits,
+ value, valuelen);
+ }
+ else if (ctx->encoding == PUBKEY_ENC_OAEP && lvalue
+ && ctx->op == PUBKEY_OP_ENCRYPT)
+ {
+ const void * value;
+ size_t valuelen;
+
+ if ( !(value=sexp_nth_data (lvalue, 1, &valuelen)) || !valuelen )
+ rc = GPG_ERR_INV_OBJ;
+ else
+ {
+ gcry_sexp_t list;
+ void *random_override = NULL;
+ size_t random_override_len = 0;
+
+ /* Get HASH-ALGO. */
+ list = sexp_find_token (ldata, "hash-algo", 0);
+ if (list)
+ {
+ s = sexp_nth_data (list, 1, &n);
+ if (!s)
+ rc = GPG_ERR_NO_OBJ;
+ else
+ {
+ ctx->hash_algo = get_hash_algo (s, n);
+ if (!ctx->hash_algo)
+ rc = GPG_ERR_DIGEST_ALGO;
+ }
+ sexp_release (list);
+ if (rc)
+ goto leave;
+ }
+
+ /* Get LABEL. */
+ list = sexp_find_token (ldata, "label", 0);
+ if (list)
+ {
+ s = sexp_nth_data (list, 1, &n);
+ if (!s)
+ rc = GPG_ERR_NO_OBJ;
+ else if (n > 0)
+ {
+ ctx->label = xtrymalloc (n);
+ if (!ctx->label)
+ rc = gpg_err_code_from_syserror ();
+ else
+ {
+ memcpy (ctx->label, s, n);
+ ctx->labellen = n;
+ }
+ }
+ sexp_release (list);
+ if (rc)
+ goto leave;
+ }
+ /* Get optional RANDOM-OVERRIDE. */
+ list = sexp_find_token (ldata, "random-override", 0);
+ if (list)
+ {
+ s = sexp_nth_data (list, 1, &n);
+ if (!s)
+ rc = GPG_ERR_NO_OBJ;
+ else if (n > 0)
+ {
+ random_override = xtrymalloc (n);
+ if (!random_override)
+ rc = gpg_err_code_from_syserror ();
+ else
+ {
+ memcpy (random_override, s, n);
+ random_override_len = n;
+ }
+ }
+ sexp_release (list);
+ if (rc)
+ goto leave;
+ }
+
+ rc = _gcry_rsa_oaep_encode (ret_mpi, ctx->nbits, ctx->hash_algo,
+ value, valuelen,
+ ctx->label, ctx->labellen,
+ random_override, random_override_len);
+
+ xfree (random_override);
+ }
+ }
+ else if (ctx->encoding == PUBKEY_ENC_PSS && lhash
+ && ctx->op == PUBKEY_OP_SIGN)
+ {
+ if (sexp_length (lhash) != 3)
+ rc = GPG_ERR_INV_OBJ;
+ else if ( !(s=sexp_nth_data (lhash, 1, &n)) || !n )
+ rc = GPG_ERR_INV_OBJ;
+ else
+ {
+ const void * value;
+ size_t valuelen;
+ void *random_override = NULL;
+ size_t random_override_len = 0;
+
+ ctx->hash_algo = get_hash_algo (s, n);
+
+ if (!ctx->hash_algo)
+ rc = GPG_ERR_DIGEST_ALGO;
+ else if ( !(value=sexp_nth_data (lhash, 2, &valuelen))
+ || !valuelen )
+ rc = GPG_ERR_INV_OBJ;
+ else
+ {
+ gcry_sexp_t list;
+
+ /* Get SALT-LENGTH. */
+ list = sexp_find_token (ldata, "salt-length", 0);
+ if (list)
+ {
+ s = sexp_nth_data (list, 1, &n);
+ if (!s)
+ {
+ rc = GPG_ERR_NO_OBJ;
+ goto leave;
+ }
+ ctx->saltlen = (unsigned int)strtoul (s, NULL, 10);
+ sexp_release (list);
+ }
+
+ /* Get optional RANDOM-OVERRIDE. */
+ list = sexp_find_token (ldata, "random-override", 0);
+ if (list)
+ {
+ s = sexp_nth_data (list, 1, &n);
+ if (!s)
+ rc = GPG_ERR_NO_OBJ;
+ else if (n > 0)
+ {
+ random_override = xtrymalloc (n);
+ if (!random_override)
+ rc = gpg_err_code_from_syserror ();
+ else
+ {
+ memcpy (random_override, s, n);
+ random_override_len = n;
+ }
+ }
+ sexp_release (list);
+ if (rc)
+ goto leave;
+ }
+
+ /* Encode the data. (NBITS-1 is due to 8.1.1, step 1.) */
+ rc = _gcry_rsa_pss_encode (ret_mpi, ctx->nbits - 1,
+ ctx->hash_algo,
+ value, valuelen, ctx->saltlen,
+ random_override, random_override_len);
+
+ xfree (random_override);
+ }
+ }
+ }
+ else if (ctx->encoding == PUBKEY_ENC_PSS && lhash
+ && ctx->op == PUBKEY_OP_VERIFY)
+ {
+ if (sexp_length (lhash) != 3)
+ rc = GPG_ERR_INV_OBJ;
+ else if ( !(s=sexp_nth_data (lhash, 1, &n)) || !n )
+ rc = GPG_ERR_INV_OBJ;
+ else
+ {
+ ctx->hash_algo = get_hash_algo (s, n);
+
+ if (!ctx->hash_algo)
+ rc = GPG_ERR_DIGEST_ALGO;
+ else
+ {
+ gcry_sexp_t list;
+ /* Get SALT-LENGTH. */
+ list = sexp_find_token (ldata, "salt-length", 0);
+ if (list)
+ {
+ unsigned long ul;
+
+ s = sexp_nth_data (list, 1, &n);
+ if (!s)
+ {
+ rc = GPG_ERR_NO_OBJ;
+ sexp_release (list);
+ goto leave;
+ }
+ ul = strtoul (s, NULL, 10);
+ if (ul > 16384)
+ {
+ rc = GPG_ERR_TOO_LARGE;
+ sexp_release (list);
+ goto leave;
+ }
+ ctx->saltlen = ul;
+ sexp_release (list);
+ }
+
+ *ret_mpi = sexp_nth_mpi (lhash, 2, GCRYMPI_FMT_USG);
+ if (!*ret_mpi)
+ rc = GPG_ERR_INV_OBJ;
+ ctx->verify_cmp = pss_verify_cmp;
+ ctx->verify_arg = *ret_mpi;
+ }
+ }
+ }
+ else
+ rc = GPG_ERR_CONFLICT;
+
+ leave:
+ sexp_release (ldata);
+ sexp_release (lhash);
+ sexp_release (lvalue);
+
+ if (!rc)
+ ctx->flags |= parsed_flags;
+ else
+ {
+ xfree (ctx->label);
+ ctx->label = NULL;
+ }
+
+ return rc;
+}
diff --git a/comm/third_party/libgcrypt/cipher/pubkey.c b/comm/third_party/libgcrypt/cipher/pubkey.c
new file mode 100644
index 0000000000..4c07e33bfc
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/pubkey.c
@@ -0,0 +1,970 @@
+/* pubkey.c - pubkey dispatcher
+ * Copyright (C) 1998, 1999, 2000, 2002, 2003, 2005,
+ * 2007, 2008, 2011 Free Software Foundation, Inc.
+ * Copyright (C) 2013 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser general Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "mpi.h"
+#include "cipher.h"
+#include "context.h"
+#include "pubkey-internal.h"
+
+
+/* This is the list of the public-key algorithms included in
+ Libgcrypt. */
+static gcry_pk_spec_t * const pubkey_list[] =
+ {
+#if USE_ECC
+ &_gcry_pubkey_spec_ecc,
+#endif
+#if USE_RSA
+ &_gcry_pubkey_spec_rsa,
+#endif
+#if USE_DSA
+ &_gcry_pubkey_spec_dsa,
+#endif
+#if USE_ELGAMAL
+ &_gcry_pubkey_spec_elg,
+#endif
+ NULL
+ };
+
+
+static int
+map_algo (int algo)
+{
+ switch (algo)
+ {
+ case GCRY_PK_RSA_E: return GCRY_PK_RSA;
+ case GCRY_PK_RSA_S: return GCRY_PK_RSA;
+ case GCRY_PK_ELG_E: return GCRY_PK_ELG;
+ case GCRY_PK_ECDSA: return GCRY_PK_ECC;
+ case GCRY_PK_ECDH: return GCRY_PK_ECC;
+ default: return algo;
+ }
+}
+
+
+/* Return the spec structure for the public key algorithm ALGO. For
+ an unknown algorithm NULL is returned. */
+static gcry_pk_spec_t *
+spec_from_algo (int algo)
+{
+ int idx;
+ gcry_pk_spec_t *spec;
+
+ algo = map_algo (algo);
+
+ for (idx = 0; (spec = pubkey_list[idx]); idx++)
+ if (algo == spec->algo)
+ return spec;
+ return NULL;
+}
+
+
+/* Return the spec structure for the public key algorithm with NAME.
+ For an unknown name NULL is returned. */
+static gcry_pk_spec_t *
+spec_from_name (const char *name)
+{
+ gcry_pk_spec_t *spec;
+ int idx;
+ const char **aliases;
+
+ for (idx=0; (spec = pubkey_list[idx]); idx++)
+ {
+ if (!stricmp (name, spec->name))
+ return spec;
+ for (aliases = spec->aliases; *aliases; aliases++)
+ if (!stricmp (name, *aliases))
+ return spec;
+ }
+
+ return NULL;
+}
+
+
+
+/* Given the s-expression SEXP with the first element being either
+ * "private-key" or "public-key" return the spec structure for it. We
+ * look through the list to find a list beginning with "private-key"
+ * or "public-key" - the first one found is used. If WANT_PRIVATE is
+ * set the function will only succeed if a private key has been given.
+ * On success the spec is stored at R_SPEC. On error NULL is stored
+ * at R_SPEC and an error code returned. If R_PARMS is not NULL and
+ * the function returns success, the parameter list below
+ * "private-key" or "public-key" is stored there and the caller must
+ * call gcry_sexp_release on it.
+ */
+static gcry_err_code_t
+spec_from_sexp (gcry_sexp_t sexp, int want_private,
+ gcry_pk_spec_t **r_spec, gcry_sexp_t *r_parms)
+{
+ gcry_sexp_t list, l2;
+ char *name;
+ gcry_pk_spec_t *spec;
+
+ *r_spec = NULL;
+ if (r_parms)
+ *r_parms = NULL;
+
+ /* Check that the first element is valid. If we are looking for a
+ public key but a private key was supplied, we allow the use of
+ the private key anyway. The rationale for this is that the
+ private key is a superset of the public key. */
+ list = sexp_find_token (sexp, want_private? "private-key":"public-key", 0);
+ if (!list && !want_private)
+ list = sexp_find_token (sexp, "private-key", 0);
+ if (!list)
+ return GPG_ERR_INV_OBJ; /* Does not contain a key object. */
+
+ l2 = sexp_cadr (list);
+ sexp_release (list);
+ list = l2;
+ name = sexp_nth_string (list, 0);
+ if (!name)
+ {
+ sexp_release ( list );
+ return GPG_ERR_INV_OBJ; /* Invalid structure of object. */
+ }
+ spec = spec_from_name (name);
+ xfree (name);
+ if (!spec)
+ {
+ sexp_release (list);
+ return GPG_ERR_PUBKEY_ALGO; /* Unknown algorithm. */
+ }
+ *r_spec = spec;
+ if (r_parms)
+ *r_parms = list;
+ else
+ sexp_release (list);
+ return 0;
+}
+
+
+
+/* Disable the use of the algorithm ALGO. This is not thread safe and
+ should thus be called early. */
+static void
+disable_pubkey_algo (int algo)
+{
+ gcry_pk_spec_t *spec = spec_from_algo (algo);
+
+ if (spec)
+ spec->flags.disabled = 1;
+}
+
+
+
+/*
+ * Map a string to the pubkey algo
+ */
+int
+_gcry_pk_map_name (const char *string)
+{
+ gcry_pk_spec_t *spec;
+
+ if (!string)
+ return 0;
+ spec = spec_from_name (string);
+ if (!spec)
+ return 0;
+ if (spec->flags.disabled)
+ return 0;
+ return spec->algo;
+}
+
+
+/* Map the public key algorithm whose ID is contained in ALGORITHM to
+ a string representation of the algorithm name. For unknown
+   algorithm IDs this function returns "?". */
+const char *
+_gcry_pk_algo_name (int algo)
+{
+ gcry_pk_spec_t *spec;
+
+ spec = spec_from_algo (algo);
+ if (spec)
+ return spec->name;
+ return "?";
+}
+
+
+/****************
+ * A USE of 0 means: don't care.
+ */
+static gcry_err_code_t
+check_pubkey_algo (int algo, unsigned use)
+{
+ gcry_err_code_t err = 0;
+ gcry_pk_spec_t *spec;
+
+ spec = spec_from_algo (algo);
+ if (spec)
+ {
+ if (((use & GCRY_PK_USAGE_SIGN)
+ && (! (spec->use & GCRY_PK_USAGE_SIGN)))
+ || ((use & GCRY_PK_USAGE_ENCR)
+ && (! (spec->use & GCRY_PK_USAGE_ENCR))))
+ err = GPG_ERR_WRONG_PUBKEY_ALGO;
+ }
+ else
+ err = GPG_ERR_PUBKEY_ALGO;
+
+ return err;
+}
+
+
+/****************
+ * Return the number of public key material numbers
+ */
+static int
+pubkey_get_npkey (int algo)
+{
+ gcry_pk_spec_t *spec = spec_from_algo (algo);
+
+ return spec? strlen (spec->elements_pkey) : 0;
+}
+
+
+/****************
+ * Return the number of secret key material numbers
+ */
+static int
+pubkey_get_nskey (int algo)
+{
+ gcry_pk_spec_t *spec = spec_from_algo (algo);
+
+ return spec? strlen (spec->elements_skey) : 0;
+}
+
+
+/****************
+ * Return the number of signature material numbers
+ */
+static int
+pubkey_get_nsig (int algo)
+{
+ gcry_pk_spec_t *spec = spec_from_algo (algo);
+
+ return spec? strlen (spec->elements_sig) : 0;
+}
+
+/****************
+ * Return the number of encryption material numbers
+ */
+static int
+pubkey_get_nenc (int algo)
+{
+ gcry_pk_spec_t *spec = spec_from_algo (algo);
+
+ return spec? strlen (spec->elements_enc) : 0;
+}
+
+
+
+/*
+ Do a PK encrypt operation
+
+ Caller has to provide a public key as the SEXP pkey and data as a
+ SEXP with just one MPI in it. Alternatively S_DATA might be a
+ complex S-Expression, similar to the one used for signature
+  verification.  This provides a flag which allows handling of PKCS#1
+  block type 2 padding.  The function returns a sexp which may be
+  passed to pk_decrypt.
+
+  Returns: 0 or an error code.
+
+ s_data = See comment for _gcry_pk_util_data_to_mpi
+ s_pkey = <key-as-defined-in-sexp_to_key>
+ r_ciph = (enc-val
+ (<algo>
+ (<param_name1> <mpi>)
+ ...
+ (<param_namen> <mpi>)
+ ))
+
+*/
+gcry_err_code_t
+_gcry_pk_encrypt (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t s_pkey)
+{
+ gcry_err_code_t rc;
+ gcry_pk_spec_t *spec;
+ gcry_sexp_t keyparms;
+
+ *r_ciph = NULL;
+
+ rc = spec_from_sexp (s_pkey, 0, &spec, &keyparms);
+ if (rc)
+ goto leave;
+
+ if (spec->encrypt)
+ rc = spec->encrypt (r_ciph, s_data, keyparms);
+ else
+ rc = GPG_ERR_NOT_IMPLEMENTED;
+
+ leave:
+ sexp_release (keyparms);
+ return rc;
+}
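+
+/* A minimal usage sketch, assuming an application links against the
+   public API from <gcrypt.h>, that PK is a (public-key ...) sexp and K
+   an existing MPI; error handling is omitted.  The resulting CIPH is
+   the (enc-val ...) S-expression described above.
+
+     gcry_sexp_t data, ciph;
+     gcry_sexp_build (&data, NULL, "(data (flags raw) (value %m))", k);
+     gcry_pk_encrypt (&ciph, data, pk);
+     gcry_sexp_release (data);
+*/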
+
+
+/*
+ Do a PK decrypt operation
+
+ Caller has to provide a secret key as the SEXP skey and data in a
+ format as created by gcry_pk_encrypt. For historic reasons the
+ function returns simply an MPI as an S-expression part; this is
+ deprecated and the new method should be used which returns a real
+   S-expression; this is selected by adding at least an empty flags
+ list to S_DATA.
+
+   Returns: 0 or an error code.
+
+ s_data = (enc-val
+ [(flags [raw, pkcs1, oaep])]
+ (<algo>
+ (<param_name1> <mpi>)
+ ...
+ (<param_namen> <mpi>)
+ ))
+ s_skey = <key-as-defined-in-sexp_to_key>
+ r_plain= Either an incomplete S-expression without the parentheses
+ or if the flags list is used (even if empty) a real S-expression:
+ (value PLAIN). In raw mode (or no flags given) the returned value
+ is to be interpreted as a signed MPI, thus it may have an extra
+ leading zero octet even if not included in the original data.
+ With pkcs1 or oaep decoding enabled the returned value is a
+ verbatim octet string.
+ */
+gcry_err_code_t
+_gcry_pk_decrypt (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t s_skey)
+{
+ gcry_err_code_t rc;
+ gcry_pk_spec_t *spec;
+ gcry_sexp_t keyparms;
+
+ *r_plain = NULL;
+
+ rc = spec_from_sexp (s_skey, 1, &spec, &keyparms);
+ if (rc)
+ goto leave;
+
+ if (spec->decrypt)
+ rc = spec->decrypt (r_plain, s_data, keyparms);
+ else
+ rc = GPG_ERR_NOT_IMPLEMENTED;
+
+ leave:
+ sexp_release (keyparms);
+ return rc;
+}
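+
+/* A minimal sketch for the decrypt direction, assuming CIPH is the
+   (enc-val ...) sexp from the encrypt sketch above and SK the matching
+   private-key sexp; because the original data carried a flags list the
+   result comes back as a real (value PLAIN) S-expression:
+
+     gcry_sexp_t plain, l;
+     gcry_mpi_t k;
+     gcry_pk_decrypt (&plain, ciph, sk);
+     l = gcry_sexp_find_token (plain, "value", 0);
+     k = gcry_sexp_nth_mpi (l, 1, GCRYMPI_FMT_USG);
+     gcry_sexp_release (l);
+     gcry_sexp_release (plain);
+*/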
+
+
+
+/*
+ Create a signature.
+
+ Caller has to provide a secret key as the SEXP skey and data
+ expressed as a SEXP list hash with only one element which should
+  instantly be available as an MPI.  Alternatively the structure given
+  below may be used for S_HASH; it provides the ability to pass flags
+  to the operation; the flags currently defined are "pkcs1" which does
+  PKCS#1 block type 1 style padding and "pss" for PSS encoding.
+
+  Returns: 0 or an error code.
+ In case of 0 the function returns a new SEXP with the
+ signature value; the structure of this signature depends on the
+ other arguments but is always suitable to be passed to
+ gcry_pk_verify
+
+  s_hash = See comment for _gcry_pk_util_data_to_mpi
+
+ s_skey = <key-as-defined-in-sexp_to_key>
+ r_sig = (sig-val
+ (<algo>
+ (<param_name1> <mpi>)
+ ...
+ (<param_namen> <mpi>))
+ [(hash algo)])
+
+ Note that (hash algo) in R_SIG is not used.
+*/
+gcry_err_code_t
+_gcry_pk_sign (gcry_sexp_t *r_sig, gcry_sexp_t s_hash, gcry_sexp_t s_skey)
+{
+ gcry_err_code_t rc;
+ gcry_pk_spec_t *spec;
+ gcry_sexp_t keyparms;
+
+ *r_sig = NULL;
+
+ rc = spec_from_sexp (s_skey, 1, &spec, &keyparms);
+ if (rc)
+ goto leave;
+
+ if (spec->sign)
+ rc = spec->sign (r_sig, s_hash, keyparms);
+ else
+ rc = GPG_ERR_NOT_IMPLEMENTED;
+
+ leave:
+ sexp_release (keyparms);
+ return rc;
+}
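+
+/* A minimal signing sketch, assuming SK is an RSA private-key sexp and
+   MD holds a 32 byte SHA-256 digest; error handling omitted.  The
+   returned SIG has the (sig-val (rsa (s ...))) shape:
+
+     gcry_sexp_t hash, sig;
+     gcry_sexp_build (&hash, NULL,
+                      "(data (flags pkcs1) (hash sha256 %b))", 32, md);
+     gcry_pk_sign (&sig, hash, sk);
+*/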
+
+
+/*
+ Verify a signature.
+
+  Caller has to supply the public key pkey, the signature sig and the
+  hash value data.  The public key has to be a standard public key given
+  as an S-Exp, sig is an S-Exp as returned from gcry_pk_sign and data
+  must be an S-Exp like the one used in sign too. */
+gcry_err_code_t
+_gcry_pk_verify (gcry_sexp_t s_sig, gcry_sexp_t s_hash, gcry_sexp_t s_pkey)
+{
+ gcry_err_code_t rc;
+ gcry_pk_spec_t *spec;
+ gcry_sexp_t keyparms;
+
+ rc = spec_from_sexp (s_pkey, 0, &spec, &keyparms);
+ if (rc)
+ goto leave;
+
+ if (spec->verify)
+ rc = spec->verify (s_sig, s_hash, keyparms);
+ else
+ rc = GPG_ERR_NOT_IMPLEMENTED;
+
+ leave:
+ sexp_release (keyparms);
+ return rc;
+}
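+
+/* Continuing the sketch above, verification takes the same hash sexp
+   and the signature together with the public key PK; a return value of
+   0 means the signature is good:
+
+     if (gcry_pk_verify (sig, hash, pk))
+       fputs ("bad signature\n", stderr);
+*/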
+
+
+/*
+ Test a key.
+
+ This may be used either for a public or a secret key to see whether
+ the internal structure is okay.
+
+  Returns: 0 or an error code.
+
+ NOTE: We currently support only secret key checking. */
+gcry_err_code_t
+_gcry_pk_testkey (gcry_sexp_t s_key)
+{
+ gcry_err_code_t rc;
+ gcry_pk_spec_t *spec;
+ gcry_sexp_t keyparms;
+
+ rc = spec_from_sexp (s_key, 1, &spec, &keyparms);
+ if (rc)
+ goto leave;
+
+ if (spec->check_secret_key)
+ rc = spec->check_secret_key (keyparms);
+ else
+ rc = GPG_ERR_NOT_IMPLEMENTED;
+
+ leave:
+ sexp_release (keyparms);
+ return rc;
+}
+
+
+/*
+ Create a public key pair and return it in r_key.
+ How the key is created depends on s_parms:
+ (genkey
+ (algo
+ (parameter_name_1 ....)
+ ....
+ (parameter_name_n ....)
+ ))
+ The key is returned in a format depending on the
+  algorithm.  Both the public and the private key are returned
+  and optionally some additional information.
+ For elgamal we return this structure:
+ (key-data
+ (public-key
+ (elg
+ (p <mpi>)
+ (g <mpi>)
+ (y <mpi>)
+ )
+ )
+ (private-key
+ (elg
+ (p <mpi>)
+ (g <mpi>)
+ (y <mpi>)
+ (x <mpi>)
+ )
+ )
+ (misc-key-info
+ (pm1-factors n1 n2 ... nn)
+ ))
+ */
+gcry_err_code_t
+_gcry_pk_genkey (gcry_sexp_t *r_key, gcry_sexp_t s_parms)
+{
+ gcry_pk_spec_t *spec = NULL;
+ gcry_sexp_t list = NULL;
+ gcry_sexp_t l2 = NULL;
+ char *name = NULL;
+ gcry_err_code_t rc;
+
+ *r_key = NULL;
+
+ list = sexp_find_token (s_parms, "genkey", 0);
+ if (!list)
+ {
+ rc = GPG_ERR_INV_OBJ; /* Does not contain genkey data. */
+ goto leave;
+ }
+
+ l2 = sexp_cadr (list);
+ sexp_release (list);
+ list = l2;
+ l2 = NULL;
+ if (! list)
+ {
+ rc = GPG_ERR_NO_OBJ; /* No cdr for the genkey. */
+ goto leave;
+ }
+
+ name = _gcry_sexp_nth_string (list, 0);
+ if (!name)
+ {
+ rc = GPG_ERR_INV_OBJ; /* Algo string missing. */
+ goto leave;
+ }
+
+ spec = spec_from_name (name);
+ xfree (name);
+ name = NULL;
+ if (!spec)
+ {
+ rc = GPG_ERR_PUBKEY_ALGO; /* Unknown algorithm. */
+ goto leave;
+ }
+
+ if (spec->generate)
+ rc = spec->generate (list, r_key);
+ else
+ rc = GPG_ERR_NOT_IMPLEMENTED;
+
+ leave:
+ sexp_release (list);
+ xfree (name);
+ sexp_release (l2);
+
+ return rc;
+}
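+
+/* A minimal sketch of S_PARMS for a 2048 bit RSA key, built through the
+   public API; the returned R_KEY then has the (key-data ...) shape
+   shown above.  Error handling is omitted.
+
+     gcry_sexp_t parms, key;
+     gcry_sexp_build (&parms, NULL, "(genkey (rsa (nbits 4:2048)))");
+     gcry_pk_genkey (&key, parms);
+     gcry_sexp_release (parms);
+*/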
+
+
+/*
+  Get the number of bits from the public key.
+
+  Hmmm: Should we really have this function or is it better to have a
+  more general function to retrieve different properties of the key? */
+unsigned int
+_gcry_pk_get_nbits (gcry_sexp_t key)
+{
+ gcry_pk_spec_t *spec;
+ gcry_sexp_t parms;
+ unsigned int nbits;
+
+ /* Parsing KEY might be considered too much overhead. For example
+ for RSA we would only need to look at P and stop parsing right
+     away.  However, with ECC things are more complicated in that only
+ a curve name might be specified. Thus we need to tear the sexp
+ apart. */
+
+ if (spec_from_sexp (key, 0, &spec, &parms))
+ return 0; /* Error - 0 is a suitable indication for that. */
+
+ nbits = spec->get_nbits (parms);
+ sexp_release (parms);
+ return nbits;
+}
+
+
+/* Return the so called KEYGRIP which is the SHA-1 hash of the public
+ key parameters expressed in a way depending on the algorithm.
+
+   ARRAY must either be 20 bytes long or NULL; in the latter case a
+   newly allocated 20 byte array is returned.  On success ARRAY or the
+   allocated buffer is returned; NULL indicates an error, which is most
+   likely an unknown algorithm.  The function accepts public or secret
+   keys. */
+unsigned char *
+_gcry_pk_get_keygrip (gcry_sexp_t key, unsigned char *array)
+{
+ gcry_sexp_t list = NULL;
+ gcry_sexp_t l2 = NULL;
+ gcry_pk_spec_t *spec = NULL;
+ const char *s;
+ char *name = NULL;
+ int idx;
+ const char *elems;
+ gcry_md_hd_t md = NULL;
+ int okay = 0;
+
+ /* Check that the first element is valid. */
+ list = sexp_find_token (key, "public-key", 0);
+ if (! list)
+ list = sexp_find_token (key, "private-key", 0);
+ if (! list)
+ list = sexp_find_token (key, "protected-private-key", 0);
+ if (! list)
+ list = sexp_find_token (key, "shadowed-private-key", 0);
+ if (! list)
+ return NULL; /* No public- or private-key object. */
+
+ l2 = sexp_cadr (list);
+ sexp_release (list);
+ list = l2;
+ l2 = NULL;
+
+ name = _gcry_sexp_nth_string (list, 0);
+ if (!name)
+ goto fail; /* Invalid structure of object. */
+
+ spec = spec_from_name (name);
+ if (!spec)
+ goto fail; /* Unknown algorithm. */
+
+ elems = spec->elements_grip;
+ if (!elems)
+ goto fail; /* No grip parameter. */
+
+ if (_gcry_md_open (&md, GCRY_MD_SHA1, 0))
+ goto fail;
+
+ if (spec->comp_keygrip)
+ {
+ /* Module specific method to compute a keygrip. */
+ if (spec->comp_keygrip (md, list))
+ goto fail;
+ }
+ else
+ {
+ /* Generic method to compute a keygrip. */
+ for (idx = 0, s = elems; *s; s++, idx++)
+ {
+ const char *data;
+ size_t datalen;
+ char buf[30];
+
+ l2 = sexp_find_token (list, s, 1);
+ if (! l2)
+ goto fail;
+ data = sexp_nth_data (l2, 1, &datalen);
+ if (! data)
+ goto fail;
+
+ snprintf (buf, sizeof buf, "(1:%c%u:", *s, (unsigned int)datalen);
+ _gcry_md_write (md, buf, strlen (buf));
+ _gcry_md_write (md, data, datalen);
+ sexp_release (l2);
+ l2 = NULL;
+ _gcry_md_write (md, ")", 1);
+ }
+ }
+
+ if (!array)
+ {
+ array = xtrymalloc (20);
+ if (! array)
+ goto fail;
+ }
+
+ memcpy (array, _gcry_md_read (md, GCRY_MD_SHA1), 20);
+ okay = 1;
+
+ fail:
+ xfree (name);
+ sexp_release (l2);
+ _gcry_md_close (md);
+ sexp_release (list);
+ return okay? array : NULL;
+}
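+
+/* As a worked example of the generic method above: a grip element "n"
+   whose value is 256 octets long contributes the bytes
+
+     "(1:n256:" <256 value octets> ")"
+
+   to the SHA-1 computation, and the keygrip is the digest over all grip
+   elements encoded this way.  A caller obtains it through the public
+   wrapper, e.g.
+
+     unsigned char grip[20];
+     if (!gcry_pk_get_keygrip (key, grip))
+       fputs ("cannot compute keygrip\n", stderr);
+*/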
+
+
+
+const char *
+_gcry_pk_get_curve (gcry_sexp_t key, int iterator, unsigned int *r_nbits)
+{
+ const char *result = NULL;
+ gcry_pk_spec_t *spec;
+ gcry_sexp_t keyparms = NULL;
+
+ if (r_nbits)
+ *r_nbits = 0;
+
+ if (key)
+ {
+ iterator = 0;
+
+ if (spec_from_sexp (key, 0, &spec, &keyparms))
+ return NULL;
+ }
+ else
+ {
+ spec = spec_from_name ("ecc");
+ if (!spec)
+ return NULL;
+ }
+
+ if (spec->get_curve)
+ result = spec->get_curve (keyparms, iterator, r_nbits);
+
+ sexp_release (keyparms);
+ return result;
+}
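+
+/* A minimal sketch: with KEY set to NULL the iterator enumerates the
+   curves known to the ECC module, e.g. through the public wrapper:
+
+     unsigned int nbits;
+     const char *name;
+     int i;
+     for (i = 0; (name = gcry_pk_get_curve (NULL, i, &nbits)); i++)
+       printf ("%s %u\n", name, nbits);
+*/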
+
+
+
+gcry_sexp_t
+_gcry_pk_get_param (int algo, const char *name)
+{
+ gcry_sexp_t result = NULL;
+ gcry_pk_spec_t *spec = NULL;
+
+ algo = map_algo (algo);
+
+ if (algo != GCRY_PK_ECC)
+ return NULL;
+
+ spec = spec_from_name ("ecc");
+ if (spec)
+ {
+ if (spec && spec->get_curve_param)
+ result = spec->get_curve_param (name);
+ }
+ return result;
+}
+
+
+
+gcry_err_code_t
+_gcry_pk_ctl (int cmd, void *buffer, size_t buflen)
+{
+ gcry_err_code_t rc = 0;
+
+ switch (cmd)
+ {
+ case GCRYCTL_DISABLE_ALGO:
+ /* This one expects a buffer pointing to an integer with the
+ algo number. */
+ if ((! buffer) || (buflen != sizeof (int)))
+ rc = GPG_ERR_INV_ARG;
+ else
+ disable_pubkey_algo (*((int *) buffer));
+ break;
+
+ default:
+ rc = GPG_ERR_INV_OP;
+ }
+
+ return rc;
+}
+
+
+/* Return information about the given algorithm
+
+ WHAT selects the kind of information returned:
+
+ GCRYCTL_TEST_ALGO:
+ Returns 0 when the specified algorithm is available for use.
+ Buffer must be NULL, nbytes may have the address of a variable
+ with the required usage of the algorithm. It may be 0 for don't
+      care or a combination of the GCRY_PK_USAGE_xxx flags.
+
+ GCRYCTL_GET_ALGO_USAGE:
+ Return the usage flags for the given algo. An invalid algo
+ returns 0. Disabled algos are ignored here because we
+ only want to know whether the algo is at all capable of
+ the usage.
+
+ Note: Because this function is in most cases used to return an
+ integer value, we can make it easier for the caller to just look at
+ the return value. The caller will in all cases consult the value
+   and thereby detect whether an error occurred or not (e.g. while
+   checking the block size). */
+gcry_err_code_t
+_gcry_pk_algo_info (int algorithm, int what, void *buffer, size_t *nbytes)
+{
+ gcry_err_code_t rc = 0;
+
+ switch (what)
+ {
+ case GCRYCTL_TEST_ALGO:
+ {
+ int use = nbytes ? *nbytes : 0;
+ if (buffer)
+ rc = GPG_ERR_INV_ARG;
+ else if (check_pubkey_algo (algorithm, use))
+ rc = GPG_ERR_PUBKEY_ALGO;
+ break;
+ }
+
+ case GCRYCTL_GET_ALGO_USAGE:
+ {
+ gcry_pk_spec_t *spec;
+
+ spec = spec_from_algo (algorithm);
+ *nbytes = spec? spec->use : 0;
+ break;
+ }
+
+ case GCRYCTL_GET_ALGO_NPKEY:
+ {
+ /* FIXME? */
+ int npkey = pubkey_get_npkey (algorithm);
+ *nbytes = npkey;
+ break;
+ }
+ case GCRYCTL_GET_ALGO_NSKEY:
+ {
+ /* FIXME? */
+ int nskey = pubkey_get_nskey (algorithm);
+ *nbytes = nskey;
+ break;
+ }
+ case GCRYCTL_GET_ALGO_NSIGN:
+ {
+ /* FIXME? */
+ int nsign = pubkey_get_nsig (algorithm);
+ *nbytes = nsign;
+ break;
+ }
+ case GCRYCTL_GET_ALGO_NENCR:
+ {
+ /* FIXME? */
+ int nencr = pubkey_get_nenc (algorithm);
+ *nbytes = nencr;
+ break;
+ }
+
+ default:
+ rc = GPG_ERR_INV_OP;
+ }
+
+ return rc;
+}
+
+
+/* Return an S-expression representing the context CTX. Depending on
+ the state of that context, the S-expression may either be a public
+ key, a private key or any other object used with public key
+ operations. On success a new S-expression is stored at R_SEXP and
+   0 is returned; on error NULL is stored there and an error code is
+ returned. MODE is either 0 or one of the GCRY_PK_GET_xxx values.
+
+   As of now it only supports certain ECC operations because a context
+ object is right now only defined for ECC. Over time this function
+ will be extended to cover more algorithms. Note also that the name
+ of the function is gcry_pubkey_xxx and not gcry_pk_xxx. The idea
+ is that we will eventually provide variants of the existing
+ gcry_pk_xxx functions which will take a context parameter. */
+gcry_err_code_t
+_gcry_pubkey_get_sexp (gcry_sexp_t *r_sexp, int mode, gcry_ctx_t ctx)
+{
+ mpi_ec_t ec;
+
+ if (!r_sexp)
+ return GPG_ERR_INV_VALUE;
+ *r_sexp = NULL;
+ switch (mode)
+ {
+ case 0:
+ case GCRY_PK_GET_PUBKEY:
+ case GCRY_PK_GET_SECKEY:
+ break;
+ default:
+ return GPG_ERR_INV_VALUE;
+ }
+ if (!ctx)
+ return GPG_ERR_NO_CRYPT_CTX;
+
+ ec = _gcry_ctx_find_pointer (ctx, CONTEXT_TYPE_EC);
+ if (ec)
+ return _gcry_pk_ecc_get_sexp (r_sexp, mode, ec);
+
+ return GPG_ERR_WRONG_CRYPT_CTX;
+}
+
+
+
+/* Explicitly initialize this module. */
+gcry_err_code_t
+_gcry_pk_init (void)
+{
+ if (fips_mode())
+ {
+ /* disable algorithms that are disallowed in fips */
+ int idx;
+ gcry_pk_spec_t *spec;
+
+ for (idx = 0; (spec = pubkey_list[idx]); idx++)
+ if (!spec->flags.fips)
+ spec->flags.disabled = 1;
+ }
+
+ return 0;
+}
+
+
+/* Run the selftests for pubkey algorithm ALGO with optional reporting
+ function REPORT. */
+gpg_error_t
+_gcry_pk_selftest (int algo, int extended, selftest_report_func_t report)
+{
+ gcry_err_code_t ec;
+ gcry_pk_spec_t *spec;
+
+ algo = map_algo (algo);
+ spec = spec_from_algo (algo);
+ if (spec && !spec->flags.disabled && spec->selftest)
+ ec = spec->selftest (algo, extended, report);
+ else
+ {
+ ec = GPG_ERR_PUBKEY_ALGO;
+ /* Fixme: We need to change the report function to allow passing
+ of an encryption mode (e.g. pkcs1, ecdsa, or ecdh). */
+ if (report)
+ report ("pubkey", algo, "module",
+ spec && !spec->flags.disabled?
+ "no selftest available" :
+ spec? "algorithm disabled" :
+ "algorithm not found");
+ }
+
+ return gpg_error (ec);
+}
diff --git a/comm/third_party/libgcrypt/cipher/rfc2268.c b/comm/third_party/libgcrypt/cipher/rfc2268.c
new file mode 100644
index 0000000000..f018b64038
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rfc2268.c
@@ -0,0 +1,378 @@
+/* rfc2268.c - The cipher described in rfc2268; aka Ron's Cipher 2.
+ * Copyright (C) 2003 Nikos Mavroyanopoulos
+ * Copyright (C) 2004 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+/* This implementation was written by Nikos Mavroyanopoulos for GNUTLS
+ * as a Libgcrypt module (gnutls/lib/x509/rc2.c) and later adapted for
+ * direct use by Libgcrypt by Werner Koch. This implementation is
+ * only useful for pkcs#12 decryption.
+ *
+ * The implementation here is based on Peter Gutmann's RRC.2 paper.
+ */
+
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "g10lib.h"
+#include "types.h"
+#include "cipher.h"
+#include "cipher-internal.h"
+
+#define RFC2268_BLOCKSIZE 8
+
+typedef struct
+{
+ u16 S[64];
+} RFC2268_context;
+
+static const unsigned char rfc2268_sbox[] = {
+ 217, 120, 249, 196, 25, 221, 181, 237,
+ 40, 233, 253, 121, 74, 160, 216, 157,
+ 198, 126, 55, 131, 43, 118, 83, 142,
+ 98, 76, 100, 136, 68, 139, 251, 162,
+ 23, 154, 89, 245, 135, 179, 79, 19,
+ 97, 69, 109, 141, 9, 129, 125, 50,
+ 189, 143, 64, 235, 134, 183, 123, 11,
+ 240, 149, 33, 34, 92, 107, 78, 130,
+ 84, 214, 101, 147, 206, 96, 178, 28,
+ 115, 86, 192, 20, 167, 140, 241, 220,
+ 18, 117, 202, 31, 59, 190, 228, 209,
+ 66, 61, 212, 48, 163, 60, 182, 38,
+ 111, 191, 14, 218, 70, 105, 7, 87,
+ 39, 242, 29, 155, 188, 148, 67, 3,
+ 248, 17, 199, 246, 144, 239, 62, 231,
+ 6, 195, 213, 47, 200, 102, 30, 215,
+ 8, 232, 234, 222, 128, 82, 238, 247,
+ 132, 170, 114, 172, 53, 77, 106, 42,
+ 150, 26, 210, 113, 90, 21, 73, 116,
+ 75, 159, 208, 94, 4, 24, 164, 236,
+ 194, 224, 65, 110, 15, 81, 203, 204,
+ 36, 145, 175, 80, 161, 244, 112, 57,
+ 153, 124, 58, 133, 35, 184, 180, 122,
+ 252, 2, 54, 91, 37, 85, 151, 49,
+ 45, 93, 250, 152, 227, 138, 146, 174,
+ 5, 223, 41, 16, 103, 108, 186, 201,
+ 211, 0, 230, 207, 225, 158, 168, 44,
+ 99, 22, 1, 63, 88, 226, 137, 169,
+ 13, 56, 52, 27, 171, 51, 255, 176,
+ 187, 72, 12, 95, 185, 177, 205, 46,
+ 197, 243, 219, 71, 229, 165, 156, 119,
+ 10, 166, 32, 104, 254, 127, 193, 173
+};
+
+#define rotl16(x,n) (((x) << ((u16)(n))) | ((x) >> (16 - (u16)(n))))
+#define rotr16(x,n) (((x) >> ((u16)(n))) | ((x) << (16 - (u16)(n))))
+
+static const char *selftest (void);
+
+
+static void
+do_encrypt (void *context, unsigned char *outbuf, const unsigned char *inbuf)
+{
+ RFC2268_context *ctx = context;
+ register int i, j;
+ u16 word0 = 0, word1 = 0, word2 = 0, word3 = 0;
+
+ word0 = (word0 << 8) | inbuf[1];
+ word0 = (word0 << 8) | inbuf[0];
+ word1 = (word1 << 8) | inbuf[3];
+ word1 = (word1 << 8) | inbuf[2];
+ word2 = (word2 << 8) | inbuf[5];
+ word2 = (word2 << 8) | inbuf[4];
+ word3 = (word3 << 8) | inbuf[7];
+ word3 = (word3 << 8) | inbuf[6];
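+  /* The 16-bit words are assembled little endian; for example
+     inbuf[0] = 0x34 and inbuf[1] = 0x12 yield word0 = 0x1234.  */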
+
+ for (i = 0; i < 16; i++)
+ {
+ j = i * 4;
+ /* For some reason I cannot combine those steps. */
+ word0 += (word1 & ~word3) + (word2 & word3) + ctx->S[j];
+ word0 = rotl16(word0, 1);
+
+ word1 += (word2 & ~word0) + (word3 & word0) + ctx->S[j + 1];
+ word1 = rotl16(word1, 2);
+
+ word2 += (word3 & ~word1) + (word0 & word1) + ctx->S[j + 2];
+ word2 = rotl16(word2, 3);
+
+ word3 += (word0 & ~word2) + (word1 & word2) + ctx->S[j + 3];
+ word3 = rotl16(word3, 5);
+
+ if (i == 4 || i == 10)
+ {
+ word0 += ctx->S[word3 & 63];
+ word1 += ctx->S[word0 & 63];
+ word2 += ctx->S[word1 & 63];
+ word3 += ctx->S[word2 & 63];
+ }
+
+ }
+
+ outbuf[0] = word0 & 255;
+ outbuf[1] = word0 >> 8;
+ outbuf[2] = word1 & 255;
+ outbuf[3] = word1 >> 8;
+ outbuf[4] = word2 & 255;
+ outbuf[5] = word2 >> 8;
+ outbuf[6] = word3 & 255;
+ outbuf[7] = word3 >> 8;
+}
+
+static unsigned int
+encrypt_block (void *context, unsigned char *outbuf, const unsigned char *inbuf)
+{
+ do_encrypt (context, outbuf, inbuf);
+ return /*burn_stack*/ (4 * sizeof(void *) + sizeof(void *) + sizeof(u32) * 4);
+}
+
+static void
+do_decrypt (void *context, unsigned char *outbuf, const unsigned char *inbuf)
+{
+ RFC2268_context *ctx = context;
+ register int i, j;
+ u16 word0 = 0, word1 = 0, word2 = 0, word3 = 0;
+
+ word0 = (word0 << 8) | inbuf[1];
+ word0 = (word0 << 8) | inbuf[0];
+ word1 = (word1 << 8) | inbuf[3];
+ word1 = (word1 << 8) | inbuf[2];
+ word2 = (word2 << 8) | inbuf[5];
+ word2 = (word2 << 8) | inbuf[4];
+ word3 = (word3 << 8) | inbuf[7];
+ word3 = (word3 << 8) | inbuf[6];
+
+ for (i = 15; i >= 0; i--)
+ {
+ j = i * 4;
+
+ word3 = rotr16(word3, 5);
+ word3 -= (word0 & ~word2) + (word1 & word2) + ctx->S[j + 3];
+
+ word2 = rotr16(word2, 3);
+ word2 -= (word3 & ~word1) + (word0 & word1) + ctx->S[j + 2];
+
+ word1 = rotr16(word1, 2);
+ word1 -= (word2 & ~word0) + (word3 & word0) + ctx->S[j + 1];
+
+ word0 = rotr16(word0, 1);
+ word0 -= (word1 & ~word3) + (word2 & word3) + ctx->S[j];
+
+ if (i == 5 || i == 11)
+ {
+ word3 = word3 - ctx->S[word2 & 63];
+ word2 = word2 - ctx->S[word1 & 63];
+ word1 = word1 - ctx->S[word0 & 63];
+ word0 = word0 - ctx->S[word3 & 63];
+ }
+
+ }
+
+ outbuf[0] = word0 & 255;
+ outbuf[1] = word0 >> 8;
+ outbuf[2] = word1 & 255;
+ outbuf[3] = word1 >> 8;
+ outbuf[4] = word2 & 255;
+ outbuf[5] = word2 >> 8;
+ outbuf[6] = word3 & 255;
+ outbuf[7] = word3 >> 8;
+}
+
+static unsigned int
+decrypt_block (void *context, unsigned char *outbuf, const unsigned char *inbuf)
+{
+ do_decrypt (context, outbuf, inbuf);
+ return /*burn_stack*/ (4 * sizeof(void *) + sizeof(void *) + sizeof(u32) * 4);
+}
+
+
+static gpg_err_code_t
+setkey_core (void *context, const unsigned char *key, unsigned int keylen, int with_phase2)
+{
+ static int initialized;
+ static const char *selftest_failed;
+ RFC2268_context *ctx = context;
+ unsigned int i;
+ unsigned char *S, x;
+ int len;
+ int bits = keylen * 8;
+
+ if (!initialized)
+ {
+ initialized = 1;
+ selftest_failed = selftest ();
+ if (selftest_failed)
+ log_error ("RFC2268 selftest failed (%s).\n", selftest_failed);
+ }
+ if (selftest_failed)
+ return GPG_ERR_SELFTEST_FAILED;
+
+ if (keylen < 40 / 8) /* We want at least 40 bits. */
+ return GPG_ERR_INV_KEYLEN;
+
+ S = (unsigned char *) ctx->S;
+
+ for (i = 0; i < keylen; i++)
+ S[i] = key[i];
+
+ for (i = keylen; i < 128; i++)
+ S[i] = rfc2268_sbox[(S[i - keylen] + S[i - 1]) & 255];
+
+ S[0] = rfc2268_sbox[S[0]];
+
+ /* Phase 2 - reduce effective key size to "bits". This was not
+ * discussed in Gutmann's paper. I've copied that from the public
+ * domain code posted in sci.crypt. */
+ if (with_phase2)
+ {
+ len = (bits + 7) >> 3;
+ i = 128 - len;
+ x = rfc2268_sbox[S[i] & (255 >> (7 & -bits))];
+ S[i] = x;
+
+ while (i--)
+ {
+ x = rfc2268_sbox[x ^ S[i + len]];
+ S[i] = x;
+ }
+ }
+
+ /* Make the expanded key, endian independent. */
+ for (i = 0; i < 64; i++)
+ ctx->S[i] = ( (u16) S[i * 2] | (((u16) S[i * 2 + 1]) << 8));
+
+ return 0;
+}
+
+static gpg_err_code_t
+do_setkey (void *context, const unsigned char *key, unsigned int keylen,
+ cipher_bulk_ops_t *bulk_ops)
+{
+ (void)bulk_ops;
+ return setkey_core (context, key, keylen, 1);
+}
+
+static const char *
+selftest (void)
+{
+ RFC2268_context ctx;
+ unsigned char scratch[16];
+
+ /* Test vectors from Peter Gutmann's paper. */
+ static unsigned char key_1[] =
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ };
+ static unsigned char plaintext_1[] =
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+ static const unsigned char ciphertext_1[] =
+ { 0x1C, 0x19, 0x8A, 0x83, 0x8D, 0xF0, 0x28, 0xB7 };
+
+ static unsigned char key_2[] =
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
+ };
+ static unsigned char plaintext_2[] =
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+ static unsigned char ciphertext_2[] =
+ { 0x50, 0xDC, 0x01, 0x62, 0xBD, 0x75, 0x7F, 0x31 };
+
+ /* This one was checked against libmcrypt's RFC2268. */
+ static unsigned char key_3[] =
+ { 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ };
+ static unsigned char plaintext_3[] =
+ { 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+ static unsigned char ciphertext_3[] =
+ { 0x8f, 0xd1, 0x03, 0x89, 0x33, 0x6b, 0xf9, 0x5e };
+
+
+ /* First test. */
+ setkey_core (&ctx, key_1, sizeof(key_1), 0);
+ do_encrypt (&ctx, scratch, plaintext_1);
+
+ if (memcmp (scratch, ciphertext_1, sizeof(ciphertext_1)))
+ return "RFC2268 encryption test 1 failed.";
+
+ setkey_core (&ctx, key_1, sizeof(key_1), 0);
+ do_decrypt (&ctx, scratch, scratch);
+ if (memcmp (scratch, plaintext_1, sizeof(plaintext_1)))
+ return "RFC2268 decryption test 1 failed.";
+
+ /* Second test. */
+ setkey_core (&ctx, key_2, sizeof(key_2), 0);
+ do_encrypt (&ctx, scratch, plaintext_2);
+ if (memcmp (scratch, ciphertext_2, sizeof(ciphertext_2)))
+ return "RFC2268 encryption test 2 failed.";
+
+ setkey_core (&ctx, key_2, sizeof(key_2), 0);
+ do_decrypt (&ctx, scratch, scratch);
+ if (memcmp (scratch, plaintext_2, sizeof(plaintext_2)))
+ return "RFC2268 decryption test 2 failed.";
+
+ /* Third test. */
+ setkey_core(&ctx, key_3, sizeof(key_3), 0);
+ do_encrypt(&ctx, scratch, plaintext_3);
+
+ if (memcmp(scratch, ciphertext_3, sizeof(ciphertext_3)))
+ return "RFC2268 encryption test 3 failed.";
+
+ setkey_core (&ctx, key_3, sizeof(key_3), 0);
+ do_decrypt (&ctx, scratch, scratch);
+ if (memcmp(scratch, plaintext_3, sizeof(plaintext_3)))
+ return "RFC2268 decryption test 3 failed.";
+
+ return NULL;
+}
+
+
+
+static gcry_cipher_oid_spec_t oids_rfc2268_40[] =
+ {
+ /*{ "1.2.840.113549.3.2", GCRY_CIPHER_MODE_CBC },*/
+ /* pbeWithSHAAnd40BitRC2_CBC */
+ { "1.2.840.113549.1.12.1.6", GCRY_CIPHER_MODE_CBC },
+ { NULL }
+ };
+
+static gcry_cipher_oid_spec_t oids_rfc2268_128[] =
+ {
+ /* pbeWithSHAAnd128BitRC2_CBC */
+ { "1.2.840.113549.1.12.1.5", GCRY_CIPHER_MODE_CBC },
+ { NULL }
+ };
+
+gcry_cipher_spec_t _gcry_cipher_spec_rfc2268_40 =
+ {
+ GCRY_CIPHER_RFC2268_40, {0, 0},
+ "RFC2268_40", NULL, oids_rfc2268_40,
+ RFC2268_BLOCKSIZE, 40, sizeof(RFC2268_context),
+ do_setkey, encrypt_block, decrypt_block
+ };
+
+gcry_cipher_spec_t _gcry_cipher_spec_rfc2268_128 =
+ {
+ GCRY_CIPHER_RFC2268_128, {0, 0},
+ "RFC2268_128", NULL, oids_rfc2268_128,
+ RFC2268_BLOCKSIZE, 128, sizeof(RFC2268_context),
+ do_setkey, encrypt_block, decrypt_block
+ };
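+
+/* A minimal usage sketch, assuming KEY (5 octets) and IV (8 octets) have
+   already been derived from a PKCS#12 passphrase and BUF holds BUFLEN
+   bytes of ciphertext; this selects the 40 bit spec above through the
+   regular cipher API:
+
+     gcry_cipher_hd_t hd;
+     gcry_cipher_open (&hd, GCRY_CIPHER_RFC2268_40, GCRY_CIPHER_MODE_CBC, 0);
+     gcry_cipher_setkey (hd, key, 5);
+     gcry_cipher_setiv (hd, iv, 8);
+     gcry_cipher_decrypt (hd, buf, buflen, NULL, 0);
+     gcry_cipher_close (hd);
+*/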
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-aarch64.S b/comm/third_party/libgcrypt/cipher/rijndael-aarch64.S
new file mode 100644
index 0000000000..e77dd4e0b8
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael-aarch64.S
@@ -0,0 +1,514 @@
+/* rijndael-aarch64.S - ARMv8/AArch64 assembly implementation of AES cipher
+ *
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__)
+#ifdef HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS
+
+.text
+
+/* register macros */
+#define CTX x0
+#define RDST x1
+#define RSRC x2
+#define NROUNDS w3
+#define RTAB x4
+#define RMASK w5
+
+#define RA w8
+#define RB w9
+#define RC w10
+#define RD w11
+
+#define RNA w12
+#define RNB w13
+#define RNC w14
+#define RND w15
+
+#define RT0 w6
+#define RT1 w7
+#define RT2 w16
+#define xRT0 x6
+#define xRT1 x7
+#define xRT2 x16
+
+#define xw8 x8
+#define xw9 x9
+#define xw10 x10
+#define xw11 x11
+
+#define xw12 x12
+#define xw13 x13
+#define xw14 x14
+#define xw15 x15
+
+/***********************************************************************
+ * ARMv8/AArch64 assembly implementation of the AES cipher
+ ***********************************************************************/
+#define preload_first_key(round, ra) \
+ ldr ra, [CTX, #(((round) * 16) + 0 * 4)];
+
+#define dummy(round, ra) /* nothing */
+
+#define addroundkey(ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+ ldp rna, rnb, [CTX]; \
+ ldp rnc, rnd, [CTX, #8]; \
+ eor ra, ra, rna; \
+ eor rb, rb, rnb; \
+ eor rc, rc, rnc; \
+ preload_key(1, rna); \
+ eor rd, rd, rnd;
+
+#define do_encround(next_r, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+ ldr rnb, [CTX, #(((next_r) * 16) + 1 * 4)]; \
+ \
+ and RT0, RMASK, ra, lsl#2; \
+ ldr rnc, [CTX, #(((next_r) * 16) + 2 * 4)]; \
+ and RT1, RMASK, ra, lsr#(8 - 2); \
+ ldr rnd, [CTX, #(((next_r) * 16) + 3 * 4)]; \
+ and RT2, RMASK, ra, lsr#(16 - 2); \
+ ldr RT0, [RTAB, xRT0]; \
+ and ra, RMASK, ra, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, xRT1]; \
+ eor rna, rna, RT0; \
+ ldr RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rd, lsl#2; \
+ ldr ra, [RTAB, x##ra]; \
+ \
+ eor rnd, rnd, RT1, ror #24; \
+ and RT1, RMASK, rd, lsr#(8 - 2); \
+ eor rnc, rnc, RT2, ror #16; \
+ and RT2, RMASK, rd, lsr#(16 - 2); \
+ eor rnb, rnb, ra, ror #8; \
+ ldr RT0, [RTAB, xRT0]; \
+ and rd, RMASK, rd, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, xRT1]; \
+ eor rnd, rnd, RT0; \
+ ldr RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rc, lsl#2; \
+ ldr rd, [RTAB, x##rd]; \
+ \
+ eor rnc, rnc, RT1, ror #24; \
+ and RT1, RMASK, rc, lsr#(8 - 2); \
+ eor rnb, rnb, RT2, ror #16; \
+ and RT2, RMASK, rc, lsr#(16 - 2); \
+ eor rna, rna, rd, ror #8; \
+ ldr RT0, [RTAB, xRT0]; \
+ and rc, RMASK, rc, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, xRT1]; \
+ eor rnc, rnc, RT0; \
+ ldr RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rb, lsl#2; \
+ ldr rc, [RTAB, x##rc]; \
+ \
+ eor rnb, rnb, RT1, ror #24; \
+ and RT1, RMASK, rb, lsr#(8 - 2); \
+ eor rna, rna, RT2, ror #16; \
+ and RT2, RMASK, rb, lsr#(16 - 2); \
+ eor rnd, rnd, rc, ror #8; \
+ ldr RT0, [RTAB, xRT0]; \
+ and rb, RMASK, rb, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, xRT1]; \
+ eor rnb, rnb, RT0; \
+ ldr RT2, [RTAB, xRT2]; \
+ eor rna, rna, RT1, ror #24; \
+ ldr rb, [RTAB, x##rb]; \
+ \
+ eor rnd, rnd, RT2, ror #16; \
+ preload_key((next_r) + 1, ra); \
+ eor rnc, rnc, rb, ror #8;
+
+#define do_lastencround(ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ and RT0, RMASK, ra, lsl#2; \
+ and RT1, RMASK, ra, lsr#(8 - 2); \
+ and RT2, RMASK, ra, lsr#(16 - 2); \
+ ldrb rna, [RTAB, xRT0]; \
+ and ra, RMASK, ra, lsr#(24 - 2); \
+ ldrb rnd, [RTAB, xRT1]; \
+ and RT0, RMASK, rd, lsl#2; \
+ ldrb rnc, [RTAB, xRT2]; \
+ ror rnd, rnd, #24; \
+ ldrb rnb, [RTAB, x##ra]; \
+ and RT1, RMASK, rd, lsr#(8 - 2); \
+ ror rnc, rnc, #16; \
+ and RT2, RMASK, rd, lsr#(16 - 2); \
+ ror rnb, rnb, #8; \
+ ldrb RT0, [RTAB, xRT0]; \
+ and rd, RMASK, rd, lsr#(24 - 2); \
+ ldrb RT1, [RTAB, xRT1]; \
+ \
+ orr rnd, rnd, RT0; \
+ ldrb RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rc, lsl#2; \
+ ldrb rd, [RTAB, x##rd]; \
+ orr rnc, rnc, RT1, ror #24; \
+ and RT1, RMASK, rc, lsr#(8 - 2); \
+ orr rnb, rnb, RT2, ror #16; \
+ and RT2, RMASK, rc, lsr#(16 - 2); \
+ orr rna, rna, rd, ror #8; \
+ ldrb RT0, [RTAB, xRT0]; \
+ and rc, RMASK, rc, lsr#(24 - 2); \
+ ldrb RT1, [RTAB, xRT1]; \
+ \
+ orr rnc, rnc, RT0; \
+ ldrb RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rb, lsl#2; \
+ ldrb rc, [RTAB, x##rc]; \
+ orr rnb, rnb, RT1, ror #24; \
+ and RT1, RMASK, rb, lsr#(8 - 2); \
+ orr rna, rna, RT2, ror #16; \
+ ldrb RT0, [RTAB, xRT0]; \
+ and RT2, RMASK, rb, lsr#(16 - 2); \
+ ldrb RT1, [RTAB, xRT1]; \
+ orr rnd, rnd, rc, ror #8; \
+ ldrb RT2, [RTAB, xRT2]; \
+ and rb, RMASK, rb, lsr#(24 - 2); \
+ ldrb rb, [RTAB, x##rb]; \
+ \
+ orr rnb, rnb, RT0; \
+ orr rna, rna, RT1, ror #24; \
+ orr rnd, rnd, RT2, ror #16; \
+ orr rnc, rnc, rb, ror #8;
+
+#define firstencround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ addroundkey(ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key); \
+ do_encround((round) + 1, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key);
+
+#define encround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+ do_encround((round) + 1, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key);
+
+#define lastencround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ add CTX, CTX, #(((round) + 1) * 16); \
+ add RTAB, RTAB, #1; \
+ do_lastencround(ra, rb, rc, rd, rna, rnb, rnc, rnd); \
+ addroundkey(rna, rnb, rnc, rnd, ra, rb, rc, rd, dummy);
+
+.globl _gcry_aes_arm_encrypt_block
+ELF(.type _gcry_aes_arm_encrypt_block,%function;)
+
+_gcry_aes_arm_encrypt_block:
+ /* input:
+ * %x0: keysched, CTX
+ * %x1: dst
+ * %x2: src
+ * %w3: number of rounds.. 10, 12 or 14
+ * %x4: encryption table
+ */
+ CFI_STARTPROC();
+
+ /* read input block */
+
+ /* aligned load */
+ ldp RA, RB, [RSRC];
+ ldp RC, RD, [RSRC, #8];
+#ifndef __AARCH64EL__
+ rev RA, RA;
+ rev RB, RB;
+ rev RC, RC;
+ rev RD, RD;
+#endif
+
+ mov RMASK, #(0xff<<2);
+
+ firstencround(0, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+ encround(1, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ encround(2, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ encround(3, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ encround(4, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ encround(5, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ encround(6, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ encround(7, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+
+ cmp NROUNDS, #12;
+ bge .Lenc_not_128;
+
+ encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
+ lastencround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+.Lenc_done:
+
+ /* store output block */
+
+ /* aligned store */
+#ifndef __AARCH64EL__
+ rev RA, RA;
+ rev RB, RB;
+ rev RC, RC;
+ rev RD, RD;
+#endif
+ /* write output block */
+ stp RA, RB, [RDST];
+ stp RC, RD, [RDST, #8];
+
+ mov x0, #(0);
+ ret;
+
+.ltorg
+.Lenc_not_128:
+ beq .Lenc_192
+
+ encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ encround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ encround(10, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ encround(11, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ encround(12, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
+ lastencround(13, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+ b .Lenc_done;
+
+.ltorg
+.Lenc_192:
+ encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ encround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ encround(10, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
+ lastencround(11, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+ b .Lenc_done;
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_arm_encrypt_block,.-_gcry_aes_arm_encrypt_block;)
+
+#define addroundkey_dec(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ ldr rna, [CTX, #(((round) * 16) + 0 * 4)]; \
+ ldr rnb, [CTX, #(((round) * 16) + 1 * 4)]; \
+ eor ra, ra, rna; \
+ ldr rnc, [CTX, #(((round) * 16) + 2 * 4)]; \
+ eor rb, rb, rnb; \
+ ldr rnd, [CTX, #(((round) * 16) + 3 * 4)]; \
+ eor rc, rc, rnc; \
+ preload_first_key((round) - 1, rna); \
+ eor rd, rd, rnd;
+
+#define do_decround(next_r, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+ ldr rnb, [CTX, #(((next_r) * 16) + 1 * 4)]; \
+ \
+ and RT0, RMASK, ra, lsl#2; \
+ ldr rnc, [CTX, #(((next_r) * 16) + 2 * 4)]; \
+ and RT1, RMASK, ra, lsr#(8 - 2); \
+ ldr rnd, [CTX, #(((next_r) * 16) + 3 * 4)]; \
+ and RT2, RMASK, ra, lsr#(16 - 2); \
+ ldr RT0, [RTAB, xRT0]; \
+ and ra, RMASK, ra, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, xRT1]; \
+ eor rna, rna, RT0; \
+ ldr RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rb, lsl#2; \
+ ldr ra, [RTAB, x##ra]; \
+ \
+ eor rnb, rnb, RT1, ror #24; \
+ and RT1, RMASK, rb, lsr#(8 - 2); \
+ eor rnc, rnc, RT2, ror #16; \
+ and RT2, RMASK, rb, lsr#(16 - 2); \
+ eor rnd, rnd, ra, ror #8; \
+ ldr RT0, [RTAB, xRT0]; \
+ and rb, RMASK, rb, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, xRT1]; \
+ eor rnb, rnb, RT0; \
+ ldr RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rc, lsl#2; \
+ ldr rb, [RTAB, x##rb]; \
+ \
+ eor rnc, rnc, RT1, ror #24; \
+ and RT1, RMASK, rc, lsr#(8 - 2); \
+ eor rnd, rnd, RT2, ror #16; \
+ and RT2, RMASK, rc, lsr#(16 - 2); \
+ eor rna, rna, rb, ror #8; \
+ ldr RT0, [RTAB, xRT0]; \
+ and rc, RMASK, rc, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, xRT1]; \
+ eor rnc, rnc, RT0; \
+ ldr RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rd, lsl#2; \
+ ldr rc, [RTAB, x##rc]; \
+ \
+ eor rnd, rnd, RT1, ror #24; \
+ and RT1, RMASK, rd, lsr#(8 - 2); \
+ eor rna, rna, RT2, ror #16; \
+ and RT2, RMASK, rd, lsr#(16 - 2); \
+ eor rnb, rnb, rc, ror #8; \
+ ldr RT0, [RTAB, xRT0]; \
+ and rd, RMASK, rd, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, xRT1]; \
+ eor rnd, rnd, RT0; \
+ ldr RT2, [RTAB, xRT2]; \
+ eor rna, rna, RT1, ror #24; \
+ ldr rd, [RTAB, x##rd]; \
+ \
+ eor rnb, rnb, RT2, ror #16; \
+ preload_key((next_r) - 1, ra); \
+ eor rnc, rnc, rd, ror #8;
+
+#define do_lastdecround(ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ and RT0, RMASK, ra; \
+ and RT1, RMASK, ra, lsr#8; \
+ and RT2, RMASK, ra, lsr#16; \
+ ldrb rna, [RTAB, xRT0]; \
+ lsr ra, ra, #24; \
+ ldrb rnb, [RTAB, xRT1]; \
+ and RT0, RMASK, rb; \
+ ldrb rnc, [RTAB, xRT2]; \
+ ror rnb, rnb, #24; \
+ ldrb rnd, [RTAB, x##ra]; \
+ and RT1, RMASK, rb, lsr#8; \
+ ror rnc, rnc, #16; \
+ and RT2, RMASK, rb, lsr#16; \
+ ror rnd, rnd, #8; \
+ ldrb RT0, [RTAB, xRT0]; \
+ lsr rb, rb, #24; \
+ ldrb RT1, [RTAB, xRT1]; \
+ \
+ orr rnb, rnb, RT0; \
+ ldrb RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rc; \
+ ldrb rb, [RTAB, x##rb]; \
+ orr rnc, rnc, RT1, ror #24; \
+ and RT1, RMASK, rc, lsr#8; \
+ orr rnd, rnd, RT2, ror #16; \
+ and RT2, RMASK, rc, lsr#16; \
+ orr rna, rna, rb, ror #8; \
+ ldrb RT0, [RTAB, xRT0]; \
+ lsr rc, rc, #24; \
+ ldrb RT1, [RTAB, xRT1]; \
+ \
+ orr rnc, rnc, RT0; \
+ ldrb RT2, [RTAB, xRT2]; \
+ and RT0, RMASK, rd; \
+ ldrb rc, [RTAB, x##rc]; \
+ orr rnd, rnd, RT1, ror #24; \
+ and RT1, RMASK, rd, lsr#8; \
+ orr rna, rna, RT2, ror #16; \
+ ldrb RT0, [RTAB, xRT0]; \
+ and RT2, RMASK, rd, lsr#16; \
+ ldrb RT1, [RTAB, xRT1]; \
+ orr rnb, rnb, rc, ror #8; \
+ ldrb RT2, [RTAB, xRT2]; \
+ lsr rd, rd, #24; \
+ ldrb rd, [RTAB, x##rd]; \
+ \
+ orr rnd, rnd, RT0; \
+ orr rna, rna, RT1, ror #24; \
+ orr rnb, rnb, RT2, ror #16; \
+ orr rnc, rnc, rd, ror #8;
+
+#define firstdecround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ addroundkey_dec(((round) + 1), ra, rb, rc, rd, rna, rnb, rnc, rnd); \
+ do_decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key);
+
+#define decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+ do_decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key);
+
+#define set_last_round_rmask(_, __) \
+ mov RMASK, #0xff;
+
+#define lastdecround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ add RTAB, RTAB, #(4 * 256); \
+ do_lastdecround(ra, rb, rc, rd, rna, rnb, rnc, rnd); \
+ addroundkey(rna, rnb, rnc, rnd, ra, rb, rc, rd, dummy);
+
+.globl _gcry_aes_arm_decrypt_block
+ELF(.type _gcry_aes_arm_decrypt_block,%function;)
+
+_gcry_aes_arm_decrypt_block:
+ /* input:
+ * %x0: keysched, CTX
+ * %x1: dst
+ * %x2: src
+ * %w3: number of rounds.. 10, 12 or 14
+ * %x4: decryption table
+ */
+ CFI_STARTPROC();
+
+ /* read input block */
+
+ /* aligned load */
+ ldp RA, RB, [RSRC];
+ ldp RC, RD, [RSRC, #8];
+#ifndef __AARCH64EL__
+ rev RA, RA;
+ rev RB, RB;
+ rev RC, RC;
+ rev RD, RD;
+#endif
+
+ mov RMASK, #(0xff << 2);
+
+ cmp NROUNDS, #12;
+ bge .Ldec_256;
+
+ firstdecround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+.Ldec_tail:
+ decround(8, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(7, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ decround(6, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(5, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ decround(4, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(3, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ decround(2, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(1, RA, RB, RC, RD, RNA, RNB, RNC, RND, set_last_round_rmask);
+ lastdecround(0, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+ /* store output block */
+
+ /* aligned store */
+#ifndef __AARCH64EL__
+ rev RA, RA;
+ rev RB, RB;
+ rev RC, RC;
+ rev RD, RD;
+#endif
+ /* write output block */
+ stp RA, RB, [RDST];
+ stp RC, RD, [RDST, #8];
+
+ mov x0, #(0);
+ ret;
+
+.ltorg
+.Ldec_256:
+ beq .Ldec_192;
+
+ firstdecround(13, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+ decround(12, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(11, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ decround(10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+
+ b .Ldec_tail;
+
+.ltorg
+.Ldec_192:
+ firstdecround(11, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+ decround(10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+
+ b .Ldec_tail;
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_arm_decrypt_block,.-_gcry_aes_arm_decrypt_block;)
+
+#endif /*HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS*/
+#endif /*__AARCH64EL__ */
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-aesni.c b/comm/third_party/libgcrypt/cipher/rijndael-aesni.c
new file mode 100644
index 0000000000..95ec4c2bb7
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael-aesni.c
@@ -0,0 +1,3965 @@
+/* AES-NI accelerated AES for Libgcrypt
+ * Copyright (C) 2000, 2001, 2002, 2003, 2007,
+ * 2008, 2011, 2012 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* for memcmp() */
+
+#include "types.h" /* for byte and u32 typedefs */
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-selftest.h"
+#include "rijndael-internal.h"
+#include "./cipher-internal.h"
+
+
+#ifdef USE_AESNI
+
+
+#if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
+/* Prevent compiler from issuing SSE instructions between asm blocks. */
+# pragma GCC target("no-sse")
+#endif
+#if __clang__
+# pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
+#endif
+
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE
+#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
+
+
+typedef struct u128_s
+{
+ u32 a, b, c, d;
+} __attribute__((packed, aligned(1), may_alias)) u128_t;
+
+
+/* Copy of ocb_get_l needed here as GCC is unable to inline ocb_get_l
+ because of 'pragma target'. */
+static ASM_FUNC_ATTR_INLINE const unsigned char *
+aes_ocb_get_l (gcry_cipher_hd_t c, u64 n)
+{
+ unsigned long ntz;
+
+ /* Assumes that N != 0. */
+ asm ("rep;bsfl %k[low], %k[ntz]\n\t"
+ : [ntz] "=r" (ntz)
+ : [low] "r" ((unsigned long)n)
+ : "cc");
+
+ return c->u_mode.ocb.L[ntz];
+}
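+
+/* For example, block number n = 12 (binary 1100) has two trailing zero
+   bits, so the snippet above returns c->u_mode.ocb.L[2], as required by
+   the OCB offset schedule. */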
+
+
+/* Two macros to be called before and after the use of AESNI
+   instructions.  There should be no external function calls between
+   the use of these macros.  Their purpose is to make sure that the
+   SSE registers are cleared and won't reveal any information about
+ the key or the data. */
+#ifdef __WIN64__
+/* XMM6-XMM15 are callee-saved registers on WIN64. */
+# define aesni_prepare_2_7_variable char win64tmp[16 * 2]
+# define aesni_prepare_8_15_variable char win64tmp8_15[16 * 8]
+# define aesni_prepare() do { } while (0)
+# define aesni_prepare_2_7() \
+ do { asm volatile ("movdqu %%xmm6, %0\n\t" \
+ "movdqu %%xmm7, %1\n\t" \
+ : "=m" (*win64tmp), "=m" (*(win64tmp+16)) \
+ : \
+ : "memory"); \
+ } while (0)
+# define aesni_prepare_8_15() \
+ do { asm volatile ("movdqu %%xmm8, 0*16(%0)\n\t" \
+ "movdqu %%xmm9, 1*16(%0)\n\t" \
+ "movdqu %%xmm10, 2*16(%0)\n\t" \
+ "movdqu %%xmm11, 3*16(%0)\n\t" \
+ "movdqu %%xmm12, 4*16(%0)\n\t" \
+ "movdqu %%xmm13, 5*16(%0)\n\t" \
+ "movdqu %%xmm14, 6*16(%0)\n\t" \
+ "movdqu %%xmm15, 7*16(%0)\n\t" \
+ : \
+ : "r" (win64tmp8_15) \
+ : "memory"); \
+ } while (0)
+# define aesni_cleanup() \
+ do { asm volatile ("pxor %%xmm0, %%xmm0\n\t" \
+ "pxor %%xmm1, %%xmm1\n" :: ); \
+ } while (0)
+# define aesni_cleanup_2_7() \
+ do { asm volatile ("movdqu %0, %%xmm6\n\t" \
+ "movdqu %1, %%xmm7\n\t" \
+ "pxor %%xmm2, %%xmm2\n" \
+ "pxor %%xmm3, %%xmm3\n" \
+ "pxor %%xmm4, %%xmm4\n" \
+ "pxor %%xmm5, %%xmm5\n" \
+ : \
+ : "m" (*win64tmp), "m" (*(win64tmp+16)) \
+ : "memory"); \
+ } while (0)
+# define aesni_cleanup_8_15() \
+ do { asm volatile ("movdqu 0*16(%0), %%xmm8\n\t" \
+ "movdqu 1*16(%0), %%xmm9\n\t" \
+ "movdqu 2*16(%0), %%xmm10\n\t" \
+ "movdqu 3*16(%0), %%xmm11\n\t" \
+ "movdqu 4*16(%0), %%xmm12\n\t" \
+ "movdqu 5*16(%0), %%xmm13\n\t" \
+ "movdqu 6*16(%0), %%xmm14\n\t" \
+ "movdqu 7*16(%0), %%xmm15\n\t" \
+ : \
+ : "r" (win64tmp8_15) \
+ : "memory"); \
+ } while (0)
+#else
+# define aesni_prepare_2_7_variable
+# define aesni_prepare() do { } while (0)
+# define aesni_prepare_2_7() do { } while (0)
+# define aesni_cleanup() \
+ do { asm volatile ("pxor %%xmm0, %%xmm0\n\t" \
+ "pxor %%xmm1, %%xmm1\n" :: ); \
+ } while (0)
+# define aesni_cleanup_2_7() \
+ do { asm volatile ("pxor %%xmm7, %%xmm7\n\t" \
+ "pxor %%xmm2, %%xmm2\n\t" \
+ "pxor %%xmm3, %%xmm3\n" \
+ "pxor %%xmm4, %%xmm4\n" \
+ "pxor %%xmm5, %%xmm5\n" \
+ "pxor %%xmm6, %%xmm6\n":: ); \
+ } while (0)
+# ifdef __x86_64__
+# define aesni_prepare_8_15_variable
+# define aesni_prepare_8_15() do { } while (0)
+# define aesni_cleanup_8_15() \
+ do { asm volatile ("pxor %%xmm8, %%xmm8\n" \
+ "pxor %%xmm9, %%xmm9\n" \
+ "pxor %%xmm10, %%xmm10\n" \
+ "pxor %%xmm11, %%xmm11\n" \
+ "pxor %%xmm12, %%xmm12\n" \
+ "pxor %%xmm13, %%xmm13\n" \
+ "pxor %%xmm14, %%xmm14\n" \
+ "pxor %%xmm15, %%xmm15\n":: ); \
+ } while (0)
+# endif
+#endif
+
+void ASM_FUNC_ATTR
+_gcry_aes_aesni_do_setkey (RIJNDAEL_context *ctx, const byte *key)
+{
+ aesni_prepare_2_7_variable;
+
+ aesni_prepare();
+ aesni_prepare_2_7();
+
+ if (ctx->rounds < 12)
+ {
+ /* 128-bit key */
+#define AESKEYGENASSIST_xmm1_xmm2(imm8) \
+ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t"
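+/* The .byte sequence 0x66,0x0f,0x3a,0xdf,0xd1 encodes the instruction
+   "aeskeygenassist $imm8, %xmm1, %xmm2"; emitting raw bytes presumably
+   keeps the file building with assemblers that lack the AES-NI
+   mnemonics. */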
+#define AESKEY_EXPAND128 \
+ "pshufd $0xff, %%xmm2, %%xmm2\n\t" \
+ "movdqa %%xmm1, %%xmm3\n\t" \
+ "pslldq $4, %%xmm3\n\t" \
+ "pxor %%xmm3, %%xmm1\n\t" \
+ "pslldq $4, %%xmm3\n\t" \
+ "pxor %%xmm3, %%xmm1\n\t" \
+ "pslldq $4, %%xmm3\n\t" \
+ "pxor %%xmm3, %%xmm2\n\t" \
+ "pxor %%xmm2, %%xmm1\n\t"
+
+ asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key */
+ "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x01)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0x10(%[ksch])\n\t" /* ksch[1] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x02)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x04)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x08)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x10)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0x50(%[ksch])\n\t" /* ksch[5] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x20)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x40)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0x70(%[ksch])\n\t" /* ksch[7] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x80)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x1b)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x36)
+ AESKEY_EXPAND128
+ "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1 */
+ :
+ : [key] "r" (key), [ksch] "r" (ctx->keyschenc)
+ : "cc", "memory" );
+#undef AESKEYGENASSIST_xmm1_xmm2
+#undef AESKEY_EXPAND128
+ }
+ else if (ctx->rounds == 12)
+ {
+ /* 192-bit key */
+#define AESKEYGENASSIST_xmm3_xmm2(imm8) \
+ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t"
+#define AESKEY_EXPAND192 \
+ "pshufd $0x55, %%xmm2, %%xmm2\n\t" \
+ "movdqu %%xmm1, %%xmm4\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm1\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm1\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm1\n\t" \
+ "pxor %%xmm2, %%xmm1\n\t" \
+ "pshufd $0xff, %%xmm1, %%xmm2\n\t" \
+ "movdqu %%xmm3, %%xmm4\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm3\n\t" \
+ "pxor %%xmm2, %%xmm3\n\t"
+
+ asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key[0..15] */
+ "movq 16(%[key]), %%xmm3\n\t" /* xmm3 := key[16..23] */
+ "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */
+ "movdqa %%xmm3, %%xmm5\n\t"
+
+ AESKEYGENASSIST_xmm3_xmm2(0x01)
+ AESKEY_EXPAND192
+ "shufpd $0, %%xmm1, %%xmm5\n\t"
+ "movdqa %%xmm5, 0x10(%[ksch])\n\t" /* ksch[1] := xmm5 */
+ "movdqa %%xmm1, %%xmm6\n\t"
+ "shufpd $1, %%xmm3, %%xmm6\n\t"
+ "movdqa %%xmm6, 0x20(%[ksch])\n\t" /* ksch[2] := xmm6 */
+ AESKEYGENASSIST_xmm3_xmm2(0x02)
+ AESKEY_EXPAND192
+ "movdqa %%xmm1, 0x30(%[ksch])\n\t" /* ksch[3] := xmm1 */
+ "movdqa %%xmm3, %%xmm5\n\t"
+
+ AESKEYGENASSIST_xmm3_xmm2(0x04)
+ AESKEY_EXPAND192
+ "shufpd $0, %%xmm1, %%xmm5\n\t"
+ "movdqa %%xmm5, 0x40(%[ksch])\n\t" /* ksch[4] := xmm5 */
+ "movdqa %%xmm1, %%xmm6\n\t"
+ "shufpd $1, %%xmm3, %%xmm6\n\t"
+ "movdqa %%xmm6, 0x50(%[ksch])\n\t" /* ksch[5] := xmm6 */
+ AESKEYGENASSIST_xmm3_xmm2(0x08)
+ AESKEY_EXPAND192
+ "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */
+ "movdqa %%xmm3, %%xmm5\n\t"
+
+ AESKEYGENASSIST_xmm3_xmm2(0x10)
+ AESKEY_EXPAND192
+ "shufpd $0, %%xmm1, %%xmm5\n\t"
+ "movdqa %%xmm5, 0x70(%[ksch])\n\t" /* ksch[7] := xmm5 */
+ "movdqa %%xmm1, %%xmm6\n\t"
+ "shufpd $1, %%xmm3, %%xmm6\n\t"
+ "movdqa %%xmm6, 0x80(%[ksch])\n\t" /* ksch[8] := xmm6 */
+ AESKEYGENASSIST_xmm3_xmm2(0x20)
+ AESKEY_EXPAND192
+ "movdqa %%xmm1, 0x90(%[ksch])\n\t" /* ksch[9] := xmm1 */
+ "movdqa %%xmm3, %%xmm5\n\t"
+
+ AESKEYGENASSIST_xmm3_xmm2(0x40)
+ AESKEY_EXPAND192
+ "shufpd $0, %%xmm1, %%xmm5\n\t"
+ "movdqa %%xmm5, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm5 */
+ "movdqa %%xmm1, %%xmm6\n\t"
+ "shufpd $1, %%xmm3, %%xmm6\n\t"
+ "movdqa %%xmm6, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm6 */
+ AESKEYGENASSIST_xmm3_xmm2(0x80)
+ AESKEY_EXPAND192
+ "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1 */
+ :
+ : [key] "r" (key), [ksch] "r" (ctx->keyschenc)
+ : "cc", "memory" );
+#undef AESKEYGENASSIST_xmm3_xmm2
+#undef AESKEY_EXPAND192
+ }
+ else if (ctx->rounds > 12)
+ {
+ /* 256-bit key */
+#define AESKEYGENASSIST_xmm1_xmm2(imm8) \
+ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd1, " #imm8 " \n\t"
+#define AESKEYGENASSIST_xmm3_xmm2(imm8) \
+ ".byte 0x66, 0x0f, 0x3a, 0xdf, 0xd3, " #imm8 " \n\t"
+#define AESKEY_EXPAND256_A \
+ "pshufd $0xff, %%xmm2, %%xmm2\n\t" \
+ "movdqa %%xmm1, %%xmm4\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm1\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm1\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm1\n\t" \
+ "pxor %%xmm2, %%xmm1\n\t"
+#define AESKEY_EXPAND256_B \
+ "pshufd $0xaa, %%xmm2, %%xmm2\n\t" \
+ "movdqa %%xmm3, %%xmm4\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm3\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm3\n\t" \
+ "pslldq $4, %%xmm4\n\t" \
+ "pxor %%xmm4, %%xmm3\n\t" \
+ "pxor %%xmm2, %%xmm3\n\t"
+
+ asm volatile ("movdqu (%[key]), %%xmm1\n\t" /* xmm1 := key[0..15] */
+ "movdqu 16(%[key]), %%xmm3\n\t" /* xmm3 := key[16..31] */
+ "movdqa %%xmm1, (%[ksch])\n\t" /* ksch[0] := xmm1 */
+ "movdqa %%xmm3, 0x10(%[ksch])\n\t" /* ksch[1] := xmm3 */
+
+ AESKEYGENASSIST_xmm3_xmm2(0x01)
+ AESKEY_EXPAND256_A
+ "movdqa %%xmm1, 0x20(%[ksch])\n\t" /* ksch[2] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x00)
+ AESKEY_EXPAND256_B
+ "movdqa %%xmm3, 0x30(%[ksch])\n\t" /* ksch[3] := xmm3 */
+
+ AESKEYGENASSIST_xmm3_xmm2(0x02)
+ AESKEY_EXPAND256_A
+ "movdqa %%xmm1, 0x40(%[ksch])\n\t" /* ksch[4] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x00)
+ AESKEY_EXPAND256_B
+ "movdqa %%xmm3, 0x50(%[ksch])\n\t" /* ksch[5] := xmm3 */
+
+ AESKEYGENASSIST_xmm3_xmm2(0x04)
+ AESKEY_EXPAND256_A
+ "movdqa %%xmm1, 0x60(%[ksch])\n\t" /* ksch[6] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x00)
+ AESKEY_EXPAND256_B
+ "movdqa %%xmm3, 0x70(%[ksch])\n\t" /* ksch[7] := xmm3 */
+
+ AESKEYGENASSIST_xmm3_xmm2(0x08)
+ AESKEY_EXPAND256_A
+ "movdqa %%xmm1, 0x80(%[ksch])\n\t" /* ksch[8] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x00)
+ AESKEY_EXPAND256_B
+ "movdqa %%xmm3, 0x90(%[ksch])\n\t" /* ksch[9] := xmm3 */
+
+ AESKEYGENASSIST_xmm3_xmm2(0x10)
+ AESKEY_EXPAND256_A
+ "movdqa %%xmm1, 0xa0(%[ksch])\n\t" /* ksch[10] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x00)
+ AESKEY_EXPAND256_B
+ "movdqa %%xmm3, 0xb0(%[ksch])\n\t" /* ksch[11] := xmm3 */
+
+ AESKEYGENASSIST_xmm3_xmm2(0x20)
+ AESKEY_EXPAND256_A
+ "movdqa %%xmm1, 0xc0(%[ksch])\n\t" /* ksch[12] := xmm1 */
+ AESKEYGENASSIST_xmm1_xmm2(0x00)
+ AESKEY_EXPAND256_B
+ "movdqa %%xmm3, 0xd0(%[ksch])\n\t" /* ksch[13] := xmm3 */
+
+ AESKEYGENASSIST_xmm3_xmm2(0x40)
+ AESKEY_EXPAND256_A
+ "movdqa %%xmm1, 0xe0(%[ksch])\n\t" /* ksch[14] := xmm1 */
+
+ :
+ : [key] "r" (key), [ksch] "r" (ctx->keyschenc)
+ : "cc", "memory" );
+#undef AESKEYGENASSIST_xmm1_xmm2
+#undef AESKEYGENASSIST_xmm3_xmm2
+#undef AESKEY_EXPAND256_A
+#undef AESKEY_EXPAND256_B
+ }
+
+ aesni_cleanup();
+ aesni_cleanup_2_7();
+}
+
+
+/* Make a decryption key from an encryption key. */
+static ASM_FUNC_ATTR_INLINE void
+do_aesni_prepare_decryption (RIJNDAEL_context *ctx)
+{
+ /* The AES-NI decrypt instructions use the Equivalent Inverse
+     Cipher, thus we can't use the standard decrypt key
+ preparation. */
+ u128_t *ekey = (u128_t *)ctx->keyschenc;
+ u128_t *dkey = (u128_t *)ctx->keyschdec;
+ int rr;
+ int r;
+
+#define DO_AESNI_AESIMC() \
+ asm volatile ("movdqa %[ekey], %%xmm1\n\t" \
+ /*"aesimc %%xmm1, %%xmm1\n\t"*/ \
+ ".byte 0x66, 0x0f, 0x38, 0xdb, 0xc9\n\t" \
+ "movdqa %%xmm1, %[dkey]" \
+ : [dkey] "=m" (dkey[r]) \
+ : [ekey] "m" (ekey[rr]) \
+ : "memory")
+
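+  /* The Equivalent Inverse Cipher uses the encryption round keys in
+     reverse order, with every key except the first and the last passed
+     through aesimc (InvMixColumns); the unrolled sequence below builds
+     that schedule. */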
+ dkey[0] = ekey[ctx->rounds];
+ r=1;
+ rr=ctx->rounds-1;
+ DO_AESNI_AESIMC(); r++; rr--; /* round 1 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 2 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 3 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 4 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 5 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 6 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 7 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 8 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 9 */
+ if (ctx->rounds > 10)
+ {
+ DO_AESNI_AESIMC(); r++; rr--; /* round 10 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 11 */
+ if (ctx->rounds > 12)
+ {
+ DO_AESNI_AESIMC(); r++; rr--; /* round 12 */
+ DO_AESNI_AESIMC(); r++; rr--; /* round 13 */
+ }
+ }
+
+ dkey[r] = ekey[0];
+
+#undef DO_AESNI_AESIMC
+}
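+
+/* For illustration, the unrolled sequence above is equivalent to this
+   intrinsics sketch (not part of the build):
+
+     dkey[0] = ekey[rounds];
+     for (r = 1; r < rounds; r++)
+       dkey[r] = _mm_aesimc_si128 (ekey[rounds - r]);
+     dkey[rounds] = ekey[0];
+ */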
+
+void ASM_FUNC_ATTR
+_gcry_aes_aesni_prepare_decryption (RIJNDAEL_context *ctx)
+{
+ aesni_prepare();
+ do_aesni_prepare_decryption (ctx);
+ aesni_cleanup();
+}
+
+
+/* Encrypt one block using the Intel AES-NI instructions. Block is input
+ * and output through SSE register xmm0. */
+static ASM_FUNC_ATTR_INLINE void
+do_aesni_enc (const RIJNDAEL_context *ctx)
+{
+#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
+#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
+ asm volatile ("movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
+ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x20(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x30(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x40(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x50(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x60(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x70(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x80(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x90(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xa0(%[key]), %%xmm1\n\t"
+ "cmpl $10, %[rounds]\n\t"
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xb0(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xc0(%[key]), %%xmm1\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xd0(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xe0(%[key]), %%xmm1\n"
+
+ ".Lenclast%=:\n\t"
+ aesenclast_xmm1_xmm0
+ "\n"
+ :
+ : [key] "r" (ctx->keyschenc),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+#undef aesenc_xmm1_xmm0
+#undef aesenclast_xmm1_xmm0
+}
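+
+/* For illustration, the unrolled rounds above are equivalent to this
+   intrinsics sketch (not part of the build); the cmpl/jz pairs simply
+   skip the round keys that 128/192-bit keys do not use:
+
+     b = _mm_xor_si128 (block, rk[0]);          initial AddRoundKey
+     for (r = 1; r < rounds; r++)
+       b = _mm_aesenc_si128 (b, rk[r]);         full rounds
+     b = _mm_aesenclast_si128 (b, rk[rounds]);  final round
+ */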
+
+
+/* Decrypt one block using the Intel AES-NI instructions. Block is input
+ * and output through SSE register xmm0. */
+static ASM_FUNC_ATTR_INLINE void
+do_aesni_dec (const RIJNDAEL_context *ctx)
+{
+#define aesdec_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc1\n\t"
+#define aesdeclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc1\n\t"
+ asm volatile ("movdqa (%[key]), %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x20(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x30(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x40(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x50(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x60(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x70(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x80(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0x90(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0xa0(%[key]), %%xmm1\n\t"
+ "cmpl $10, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0xb0(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0xc0(%[key]), %%xmm1\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0xd0(%[key]), %%xmm1\n\t"
+ aesdec_xmm1_xmm0
+ "movdqa 0xe0(%[key]), %%xmm1\n"
+
+ ".Ldeclast%=:\n\t"
+ aesdeclast_xmm1_xmm0
+ "\n"
+ :
+ : [key] "r" (ctx->keyschdec),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+#undef aesdec_xmm1_xmm0
+#undef aesdeclast_xmm1_xmm0
+}
+
+
+/* Encrypt four blocks using the Intel AES-NI instructions. Blocks are input
+ * and output through SSE registers xmm1 to xmm4. */
+static ASM_FUNC_ATTR_INLINE void
+do_aesni_enc_vec4 (const RIJNDAEL_context *ctx)
+{
+#define aesenc_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc8\n\t"
+#define aesenc_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd0\n\t"
+#define aesenc_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd8\n\t"
+#define aesenc_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe0\n\t"
+#define aesenclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc8\n\t"
+#define aesenclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd0\n\t"
+#define aesenclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd8\n\t"
+#define aesenclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe0\n\t"
+ asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */
+ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */
+ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */
+ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x20(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x30(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x40(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x50(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x60(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x70(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x80(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x90(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xa0(%[key]), %%xmm0\n\t"
+ "cmpl $10, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xb0(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xc0(%[key]), %%xmm0\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xd0(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xe0(%[key]), %%xmm0\n"
+
+ ".Ldeclast%=:\n\t"
+ aesenclast_xmm0_xmm1
+ aesenclast_xmm0_xmm2
+ aesenclast_xmm0_xmm3
+ aesenclast_xmm0_xmm4
+ : /* no output */
+ : [key] "r" (ctx->keyschenc),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+#undef aesenc_xmm0_xmm1
+#undef aesenc_xmm0_xmm2
+#undef aesenc_xmm0_xmm3
+#undef aesenc_xmm0_xmm4
+#undef aesenclast_xmm0_xmm1
+#undef aesenclast_xmm0_xmm2
+#undef aesenclast_xmm0_xmm3
+#undef aesenclast_xmm0_xmm4
+}
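+
+/* The four blocks above are processed in lock-step so the latency of
+   one AESENC is hidden behind the other three; per round the pattern
+   is simply (illustrative sketch):
+
+     b1 = _mm_aesenc_si128 (b1, rk);
+     b2 = _mm_aesenc_si128 (b2, rk);
+     b3 = _mm_aesenc_si128 (b3, rk);
+     b4 = _mm_aesenc_si128 (b4, rk);
+ */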
+
+
+/* Decrypt four blocks using the Intel AES-NI instructions. Blocks are input
+ * and output through SSE registers xmm1 to xmm4. */
+static ASM_FUNC_ATTR_INLINE void
+do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
+{
+#define aesdec_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc8\n\t"
+#define aesdec_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd0\n\t"
+#define aesdec_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd8\n\t"
+#define aesdec_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xde, 0xe0\n\t"
+#define aesdeclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc8\n\t"
+#define aesdeclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd0\n\t"
+#define aesdeclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd8\n\t"
+#define aesdeclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xe0\n\t"
+ asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */
+ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */
+ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */
+ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x20(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x30(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x40(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x50(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x60(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x70(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x80(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x90(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xa0(%[key]), %%xmm0\n\t"
+ "cmpl $10, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xb0(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xc0(%[key]), %%xmm0\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xd0(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xe0(%[key]), %%xmm0\n"
+
+ ".Ldeclast%=:\n\t"
+ aesdeclast_xmm0_xmm1
+ aesdeclast_xmm0_xmm2
+ aesdeclast_xmm0_xmm3
+ aesdeclast_xmm0_xmm4
+ : /* no output */
+ : [key] "r" (ctx->keyschdec),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+#undef aesdec_xmm0_xmm1
+#undef aesdec_xmm0_xmm2
+#undef aesdec_xmm0_xmm3
+#undef aesdec_xmm0_xmm4
+#undef aesdeclast_xmm0_xmm1
+#undef aesdeclast_xmm0_xmm2
+#undef aesdeclast_xmm0_xmm3
+#undef aesdeclast_xmm0_xmm4
+}
+
+
+#ifdef __x86_64__
+
+/* Encrypt eight blocks using the Intel AES-NI instructions. Blocks are input
+ * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11. */
+static ASM_FUNC_ATTR_INLINE void
+do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
+{
+ asm volatile ("movdqa 0x10(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x20(%[key]), %%xmm0\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x30(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x40(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x50(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x60(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x70(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x80(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x90(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0xa0(%[key]), %%xmm0\n\t"
+ "jb .Ldeclast%=\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0xb0(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0xc0(%[key]), %%xmm0\n\t"
+ "je .Ldeclast%=\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0xd0(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0xe0(%[key]), %%xmm0\n"
+
+ ".Ldeclast%=:\n\t"
+ : /* no output */
+ : [key] "r" (ctx->keyschenc),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+}
+
+
+/* Decrypt eight blocks using the Intel AES-NI instructions. Blocks are input
+ * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11. */
+static ASM_FUNC_ATTR_INLINE void
+do_aesni_dec_vec8 (const RIJNDAEL_context *ctx)
+{
+ asm volatile ("movdqa 0x10(%[key]), %%xmm0\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x20(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x30(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x40(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x50(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x60(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x70(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x80(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x90(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0xa0(%[key]), %%xmm0\n\t"
+ "jb .Ldeclast%=\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0xb0(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0xc0(%[key]), %%xmm0\n\t"
+ "je .Ldeclast%=\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0xd0(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0xe0(%[key]), %%xmm0\n"
+
+ ".Ldeclast%=:\n\t"
+ : /* no output */
+ : [key] "r" (ctx->keyschdec),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+}
+
+#endif /* __x86_64__ */
+
+
+/* Perform a CTR encryption round using the counter CTR and the input
+ block A. Write the result to the output block B and update CTR.
+ CTR needs to be a 16-byte aligned big-endian value. */
+static ASM_FUNC_ATTR_INLINE void
+do_aesni_ctr (const RIJNDAEL_context *ctx,
+ unsigned char *ctr, unsigned char *b, const unsigned char *a)
+{
+#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
+#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
+
+ asm volatile ("movdqa %%xmm5, %%xmm0\n\t" /* xmm0 := CTR (xmm5) */
+ "pcmpeqd %%xmm1, %%xmm1\n\t"
+ "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */
+
+ "pshufb %%xmm6, %%xmm5\n\t"
+ "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ (big endian) */
+
+ /* detect if 64-bit carry handling is needed */
+ "cmpl $0xffffffff, 8(%[ctr])\n\t"
+ "jne .Lno_carry%=\n\t"
+ "cmpl $0xffffffff, 12(%[ctr])\n\t"
+ "jne .Lno_carry%=\n\t"
+
+ "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */
+ "psubq %%xmm1, %%xmm5\n\t" /* add carry to upper 64bits */
+
+ ".Lno_carry%=:\n\t"
+
+ "pshufb %%xmm6, %%xmm5\n\t"
+ "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). */
+
+ "pxor (%[key]), %%xmm0\n\t" /* xmm1 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x20(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x30(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x40(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x50(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x60(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x70(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x80(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0x90(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xa0(%[key]), %%xmm1\n\t"
+ "cmpl $10, %[rounds]\n\t"
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xb0(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xc0(%[key]), %%xmm1\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xd0(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ "movdqa 0xe0(%[key]), %%xmm1\n"
+
+ ".Lenclast%=:\n\t"
+ aesenclast_xmm1_xmm0
+ "movdqu %[src], %%xmm1\n\t" /* xmm1 := input */
+ "pxor %%xmm1, %%xmm0\n\t" /* EncCTR ^= input */
+ "movdqu %%xmm0, %[dst]" /* Store EncCTR. */
+
+ : [dst] "=m" (*b)
+ : [src] "m" (*a),
+ [ctr] "r" (ctr),
+ [key] "r" (ctx->keyschenc),
+ [rounds] "g" (ctx->rounds)
+ : "cc", "memory");
+#undef aesenc_xmm1_xmm0
+#undef aesenclast_xmm1_xmm0
+}
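+
+/* For illustration, one CTR step above amounts to (sketch only):
+
+     B = A ^ E_K(CTR);      encrypt the counter, XOR with the input
+     CTR = CTR + 1;         128-bit big-endian increment
+
+   The pshufb/psubq pair adds 1 in the low 64 bits; the rare carry out
+   of the low 64 bits is handled by the explicit compare/branch. */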
+
+
+/* Four blocks at a time variant of do_aesni_ctr. */
+static ASM_FUNC_ATTR_INLINE void
+do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
+ unsigned char *ctr, unsigned char *b, const unsigned char *a)
+{
+ static const byte bige_addb_const[4][16] __attribute__ ((aligned (16))) =
+ {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 }
+ };
+ const void *bige_addb = bige_addb_const;
+#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
+#define aesenc_xmm1_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd1\n\t"
+#define aesenc_xmm1_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd9\n\t"
+#define aesenc_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe1\n\t"
+#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
+#define aesenclast_xmm1_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd1\n\t"
+#define aesenclast_xmm1_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd9\n\t"
+#define aesenclast_xmm1_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe1\n\t"
+
+ /* Register usage:
+ [key] keyschedule
+ xmm0 CTR-0
+ xmm1 temp / round key
+ xmm2 CTR-1
+ xmm3 CTR-2
+ xmm4 CTR-3
+ xmm5 copy of *ctr
+ xmm6 endian swapping mask
+ */
+
+ asm volatile (/* detect if 8-bit carry handling is needed */
+ "addb $4, 15(%[ctr])\n\t"
+ "jc .Ladd32bit%=\n\t"
+
+ "movdqa %%xmm5, %%xmm0\n\t" /* xmm0 := CTR (xmm5) */
+ "movdqa 0*16(%[addb]), %%xmm2\n\t" /* xmm2 := be(1) */
+ "movdqa 1*16(%[addb]), %%xmm3\n\t" /* xmm3 := be(2) */
+ "movdqa 2*16(%[addb]), %%xmm4\n\t" /* xmm4 := be(3) */
+ "movdqa 3*16(%[addb]), %%xmm5\n\t" /* xmm5 := be(4) */
+ "paddb %%xmm0, %%xmm2\n\t" /* xmm2 := be(1) + CTR (xmm0) */
+ "paddb %%xmm0, %%xmm3\n\t" /* xmm3 := be(2) + CTR (xmm0) */
+ "paddb %%xmm0, %%xmm4\n\t" /* xmm4 := be(3) + CTR (xmm0) */
+ "paddb %%xmm0, %%xmm5\n\t" /* xmm5 := be(4) + CTR (xmm0) */
+ "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
+ "jmp .Ldone_ctr%=\n\t"
+
+ ".Ladd32bit%=:\n\t"
+ "movdqa %%xmm5, (%[ctr])\n\t" /* Restore CTR. */
+ "movdqa %%xmm5, %%xmm0\n\t" /* xmm0, xmm2 := CTR (xmm5) */
+ "movdqa %%xmm0, %%xmm2\n\t"
+ "pcmpeqd %%xmm1, %%xmm1\n\t"
+ "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */
+
+ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := le(xmm2) */
+ "psubq %%xmm1, %%xmm2\n\t" /* xmm2++ */
+ "movdqa %%xmm2, %%xmm3\n\t" /* xmm3 := xmm2 */
+ "psubq %%xmm1, %%xmm3\n\t" /* xmm3++ */
+ "movdqa %%xmm3, %%xmm4\n\t" /* xmm4 := xmm3 */
+ "psubq %%xmm1, %%xmm4\n\t" /* xmm4++ */
+ "movdqa %%xmm4, %%xmm5\n\t" /* xmm5 := xmm4 */
+ "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ */
+
+ /* detect if 64-bit carry handling is needed */
+ "cmpl $0xffffffff, 8(%[ctr])\n\t"
+ "jne .Lno_carry%=\n\t"
+ "movl 12(%[ctr]), %%esi\n\t"
+ "bswapl %%esi\n\t"
+ "cmpl $0xfffffffc, %%esi\n\t"
+ "jb .Lno_carry%=\n\t" /* no carry */
+
+ "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */
+ "je .Lcarry_xmm5%=\n\t" /* esi == 0xfffffffc */
+ "cmpl $0xfffffffe, %%esi\n\t"
+ "jb .Lcarry_xmm4%=\n\t" /* esi == 0xfffffffd */
+ "je .Lcarry_xmm3%=\n\t" /* esi == 0xfffffffe */
+ /* esi == 0xffffffff */
+
+ "psubq %%xmm1, %%xmm2\n\t"
+ ".Lcarry_xmm3%=:\n\t"
+ "psubq %%xmm1, %%xmm3\n\t"
+ ".Lcarry_xmm4%=:\n\t"
+ "psubq %%xmm1, %%xmm4\n\t"
+ ".Lcarry_xmm5%=:\n\t"
+ "psubq %%xmm1, %%xmm5\n\t"
+
+ ".Lno_carry%=:\n\t"
+ "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
+
+ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := be(xmm2) */
+ "pshufb %%xmm6, %%xmm3\n\t" /* xmm3 := be(xmm3) */
+ "pshufb %%xmm6, %%xmm4\n\t" /* xmm4 := be(xmm4) */
+ "pshufb %%xmm6, %%xmm5\n\t" /* xmm5 := be(xmm5) */
+
+ "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). */
+
+ ".Ldone_ctr%=:\n\t"
+ :
+ : [ctr] "r" (ctr),
+ [key] "r" (ctx->keyschenc),
+ [addb] "r" (bige_addb)
+ : "%esi", "cc", "memory");
+
+ asm volatile ("pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
+ "pxor %%xmm1, %%xmm2\n\t" /* xmm2 ^= key[0] */
+ "pxor %%xmm1, %%xmm3\n\t" /* xmm3 ^= key[0] */
+ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x20(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x30(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x40(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x50(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x60(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x70(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x80(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0x90(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0xa0(%[key]), %%xmm1\n\t"
+ "cmpl $10, %[rounds]\n\t"
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0xb0(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0xc0(%[key]), %%xmm1\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "jz .Lenclast%=\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0xd0(%[key]), %%xmm1\n\t"
+ aesenc_xmm1_xmm0
+ aesenc_xmm1_xmm2
+ aesenc_xmm1_xmm3
+ aesenc_xmm1_xmm4
+ "movdqa 0xe0(%[key]), %%xmm1\n"
+
+ ".Lenclast%=:\n\t"
+ aesenclast_xmm1_xmm0
+ aesenclast_xmm1_xmm2
+ aesenclast_xmm1_xmm3
+ aesenclast_xmm1_xmm4
+ :
+ : [key] "r" (ctx->keyschenc),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+
+ asm volatile ("movdqu (%[src]), %%xmm1\n\t" /* Get block 1. */
+ "pxor %%xmm1, %%xmm0\n\t" /* EncCTR-1 ^= input */
+ "movdqu %%xmm0, (%[dst])\n\t" /* Store block 1 */
+
+ "movdqu 16(%[src]), %%xmm1\n\t" /* Get block 2. */
+ "pxor %%xmm1, %%xmm2\n\t" /* EncCTR-2 ^= input */
+ "movdqu %%xmm2, 16(%[dst])\n\t" /* Store block 2. */
+
+ "movdqu 32(%[src]), %%xmm1\n\t" /* Get block 3. */
+ "pxor %%xmm1, %%xmm3\n\t" /* EncCTR-3 ^= input */
+ "movdqu %%xmm3, 32(%[dst])\n\t" /* Store block 3. */
+
+ "movdqu 48(%[src]), %%xmm1\n\t" /* Get block 4. */
+ "pxor %%xmm1, %%xmm4\n\t" /* EncCTR-4 ^= input */
+ "movdqu %%xmm4, 48(%[dst])" /* Store block 4. */
+ :
+ : [src] "r" (a),
+ [dst] "r" (b)
+ : "memory");
+#undef aesenc_xmm1_xmm0
+#undef aesenc_xmm1_xmm2
+#undef aesenc_xmm1_xmm3
+#undef aesenc_xmm1_xmm4
+#undef aesenclast_xmm1_xmm0
+#undef aesenclast_xmm1_xmm2
+#undef aesenclast_xmm1_xmm3
+#undef aesenclast_xmm1_xmm4
+}
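+
+/* For illustration, the fast path above works because adding 4 to the
+   last counter byte without overflow means none of the four counters
+   needs a byte carry, so they can be formed with plain byte adds:
+
+     ctr1 = ctr + be(1);  ctr2 = ctr + be(2);
+     ctr3 = ctr + be(3);  ctr4 = ctr + be(4);   (paddb)
+
+   and the in-memory counter is already updated by the addb.  Otherwise
+   the code falls back to byte-swapping and full 64-bit increments with
+   explicit carry propagation into the high half. */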
+
+
+#ifdef __x86_64__
+
+/* Eight blocks at a time variant of do_aesni_ctr. */
+static ASM_FUNC_ATTR_INLINE void
+do_aesni_ctr_8 (const RIJNDAEL_context *ctx,
+ unsigned char *ctr, unsigned char *b, const unsigned char *a)
+{
+ static const byte bige_addb_const[8][16] __attribute__ ((aligned (16))) =
+ {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 }
+ };
+ const void *bige_addb = bige_addb_const;
+
+ /* Register usage:
+ [key] keyschedule
+ xmm0 CTR-0
+ xmm1 temp / round key
+ xmm2 CTR-1
+ xmm3 CTR-2
+ xmm4 CTR-3
+ xmm5 copy of *ctr
+ xmm6 endian swapping mask
+ xmm8 CTR-4
+ xmm9 CTR-5
+ xmm10 CTR-6
+ xmm11 CTR-7
+ xmm12 temp
+ xmm13 temp
+ xmm14 temp
+ xmm15 temp
+ */
+
+ asm volatile (/* detect if 8-bit carry handling is needed */
+ "addb $8, 15(%[ctr])\n\t"
+ "jc .Ladd32bit%=\n\t"
+
+ "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
+ "movdqa 16(%[key]), %%xmm7\n\t" /* xmm7 := key[1] */
+
+ "movdqa %%xmm5, %%xmm0\n\t" /* xmm0 := CTR (xmm5) */
+ "movdqa %%xmm5, %%xmm2\n\t" /* xmm2 := CTR (xmm5) */
+ "movdqa %%xmm5, %%xmm3\n\t" /* xmm3 := CTR (xmm5) */
+ "movdqa %%xmm5, %%xmm4\n\t" /* xmm4 := CTR (xmm5) */
+ "paddb 0*16(%[addb]), %%xmm2\n\t" /* xmm2 := be(1) + CTR */
+ "paddb 1*16(%[addb]), %%xmm3\n\t" /* xmm3 := be(2) + CTR */
+ "paddb 2*16(%[addb]), %%xmm4\n\t" /* xmm4 := be(3) + CTR */
+ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
+ "pxor %%xmm1, %%xmm2\n\t" /* xmm2 ^= key[0] */
+ "pxor %%xmm1, %%xmm3\n\t" /* xmm3 ^= key[0] */
+ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 ^= key[0] */
+ "aesenc %%xmm7, %%xmm0\n\t"
+ "aesenc %%xmm7, %%xmm2\n\t"
+ "aesenc %%xmm7, %%xmm3\n\t"
+ "aesenc %%xmm7, %%xmm4\n\t"
+ "movdqa %%xmm5, %%xmm8\n\t" /* xmm8 := CTR (xmm5) */
+ "movdqa %%xmm5, %%xmm9\n\t" /* xmm9 := CTR (xmm5) */
+ "movdqa %%xmm5, %%xmm10\n\t" /* xmm10 := CTR (xmm5) */
+ "movdqa %%xmm5, %%xmm11\n\t" /* xmm11 := CTR (xmm5) */
+ "paddb 3*16(%[addb]), %%xmm8\n\t" /* xmm8 := be(4) + CTR */
+ "paddb 4*16(%[addb]), %%xmm9\n\t" /* xmm9 := be(5) + CTR */
+ "paddb 5*16(%[addb]), %%xmm10\n\t" /* xmm10 := be(6) + CTR */
+ "paddb 6*16(%[addb]), %%xmm11\n\t" /* xmm11 := be(7) + CTR */
+ "pxor %%xmm1, %%xmm8\n\t" /* xmm8 ^= key[0] */
+ "pxor %%xmm1, %%xmm9\n\t" /* xmm9 ^= key[0] */
+ "pxor %%xmm1, %%xmm10\n\t" /* xmm10 ^= key[0] */
+ "pxor %%xmm1, %%xmm11\n\t" /* xmm11 ^= key[0] */
+ "aesenc %%xmm7, %%xmm8\n\t"
+ "aesenc %%xmm7, %%xmm9\n\t"
+ "aesenc %%xmm7, %%xmm10\n\t"
+ "aesenc %%xmm7, %%xmm11\n\t"
+
+ "paddb 7*16(%[addb]), %%xmm5\n\t" /* xmm5 := be(8) + CTR */
+
+ "jmp .Ldone_ctr%=\n\t"
+
+ ".Ladd32bit%=:\n\t"
+ "movdqa %%xmm5, (%[ctr])\n\t" /* Restore CTR. */
+ "movdqa %%xmm5, %%xmm0\n\t" /* xmm0, xmm2 := CTR (xmm5) */
+ "movdqa %%xmm0, %%xmm2\n\t"
+ "pcmpeqd %%xmm1, %%xmm1\n\t"
+ "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */
+
+ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := le(xmm2) */
+ "psubq %%xmm1, %%xmm2\n\t" /* xmm2++ */
+ "movdqa %%xmm2, %%xmm3\n\t" /* xmm3 := xmm2 */
+ "psubq %%xmm1, %%xmm3\n\t" /* xmm3++ */
+ "movdqa %%xmm3, %%xmm4\n\t" /* xmm4 := xmm3 */
+ "psubq %%xmm1, %%xmm4\n\t" /* xmm4++ */
+ "movdqa %%xmm4, %%xmm8\n\t" /* xmm8 := xmm4 */
+ "psubq %%xmm1, %%xmm8\n\t" /* xmm8++ */
+ "movdqa %%xmm8, %%xmm9\n\t" /* xmm9 := xmm8 */
+ "psubq %%xmm1, %%xmm9\n\t" /* xmm9++ */
+ "movdqa %%xmm9, %%xmm10\n\t" /* xmm10 := xmm9 */
+ "psubq %%xmm1, %%xmm10\n\t" /* xmm10++ */
+ "movdqa %%xmm10, %%xmm11\n\t" /* xmm11 := xmm10 */
+ "psubq %%xmm1, %%xmm11\n\t" /* xmm11++ */
+ "movdqa %%xmm11, %%xmm5\n\t" /* xmm5 := xmm11 */
+ "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ */
+
+ /* detect if 64-bit carry handling is needed */
+ "cmpl $0xffffffff, 8(%[ctr])\n\t"
+ "jne .Lno_carry%=\n\t"
+ "movl 12(%[ctr]), %%esi\n\t"
+ "bswapl %%esi\n\t"
+ "cmpl $0xfffffff8, %%esi\n\t"
+ "jb .Lno_carry%=\n\t" /* no carry */
+
+ "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */
+ "je .Lcarry_xmm5%=\n\t" /* esi == 0xfffffff8 */
+ "cmpl $0xfffffffa, %%esi\n\t"
+ "jb .Lcarry_xmm11%=\n\t" /* esi == 0xfffffff9 */
+ "je .Lcarry_xmm10%=\n\t" /* esi == 0xfffffffa */
+ "cmpl $0xfffffffc, %%esi\n\t"
+ "jb .Lcarry_xmm9%=\n\t" /* esi == 0xfffffffb */
+ "je .Lcarry_xmm8%=\n\t" /* esi == 0xfffffffc */
+ "cmpl $0xfffffffe, %%esi\n\t"
+ "jb .Lcarry_xmm4%=\n\t" /* esi == 0xfffffffd */
+ "je .Lcarry_xmm3%=\n\t" /* esi == 0xfffffffe */
+ /* esi == 0xffffffff */
+
+ "psubq %%xmm1, %%xmm2\n\t"
+ ".Lcarry_xmm3%=:\n\t"
+ "psubq %%xmm1, %%xmm3\n\t"
+ ".Lcarry_xmm4%=:\n\t"
+ "psubq %%xmm1, %%xmm4\n\t"
+ ".Lcarry_xmm8%=:\n\t"
+ "psubq %%xmm1, %%xmm8\n\t"
+ ".Lcarry_xmm9%=:\n\t"
+ "psubq %%xmm1, %%xmm9\n\t"
+ ".Lcarry_xmm10%=:\n\t"
+ "psubq %%xmm1, %%xmm10\n\t"
+ ".Lcarry_xmm11%=:\n\t"
+ "psubq %%xmm1, %%xmm11\n\t"
+ ".Lcarry_xmm5%=:\n\t"
+ "psubq %%xmm1, %%xmm5\n\t"
+
+ ".Lno_carry%=:\n\t"
+ "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
+ "movdqa 16(%[key]), %%xmm7\n\t" /* xmm7 := key[1] */
+
+ "pshufb %%xmm6, %%xmm2\n\t" /* xmm2 := be(xmm2) */
+ "pshufb %%xmm6, %%xmm3\n\t" /* xmm3 := be(xmm3) */
+ "pshufb %%xmm6, %%xmm4\n\t" /* xmm4 := be(xmm4) */
+ "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
+ "pxor %%xmm1, %%xmm2\n\t" /* xmm2 ^= key[0] */
+ "pxor %%xmm1, %%xmm3\n\t" /* xmm3 ^= key[0] */
+ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 ^= key[0] */
+ "aesenc %%xmm7, %%xmm0\n\t"
+ "aesenc %%xmm7, %%xmm2\n\t"
+ "aesenc %%xmm7, %%xmm3\n\t"
+ "aesenc %%xmm7, %%xmm4\n\t"
+ "pshufb %%xmm6, %%xmm8\n\t" /* xmm8 := be(xmm8) */
+ "pshufb %%xmm6, %%xmm9\n\t" /* xmm9 := be(xmm9) */
+ "pshufb %%xmm6, %%xmm10\n\t" /* xmm10 := be(xmm10) */
+ "pshufb %%xmm6, %%xmm11\n\t" /* xmm11 := be(xmm11) */
+ "pxor %%xmm1, %%xmm8\n\t" /* xmm8 ^= key[0] */
+ "pxor %%xmm1, %%xmm9\n\t" /* xmm9 ^= key[0] */
+ "pxor %%xmm1, %%xmm10\n\t" /* xmm10 ^= key[0] */
+ "pxor %%xmm1, %%xmm11\n\t" /* xmm11 ^= key[0] */
+ "aesenc %%xmm7, %%xmm8\n\t"
+ "aesenc %%xmm7, %%xmm9\n\t"
+ "aesenc %%xmm7, %%xmm10\n\t"
+ "aesenc %%xmm7, %%xmm11\n\t"
+
+ "pshufb %%xmm6, %%xmm5\n\t" /* xmm5 := be(xmm5) */
+ "movdqa %%xmm5, (%[ctr])\n\t" /* Update CTR (mem). */
+
+ ".align 16\n\t"
+ ".Ldone_ctr%=:\n\t"
+ :
+ : [ctr] "r" (ctr),
+ [key] "r" (ctx->keyschenc),
+ [addb] "r" (bige_addb)
+ : "%esi", "cc", "memory");
+
+ asm volatile ("movdqa 0x20(%[key]), %%xmm1\n\t"
+ "movdqu 0*16(%[src]), %%xmm12\n\t" /* Get block 1. */
+ "movdqu 1*16(%[src]), %%xmm13\n\t" /* Get block 2. */
+ "movdqu 2*16(%[src]), %%xmm14\n\t" /* Get block 3. */
+ "movdqu 3*16(%[src]), %%xmm15\n\t" /* Get block 4. */
+ "movdqu 4*16(%[src]), %%xmm7\n\t" /* Get block 5. */
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "movdqa 0x30(%[key]), %%xmm1\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0x40(%[key]), %%xmm1\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0x50(%[key]), %%xmm1\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0x60(%[key]), %%xmm1\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0x70(%[key]), %%xmm1\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0x80(%[key]), %%xmm1\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0x90(%[key]), %%xmm1\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0xa0(%[key]), %%xmm1\n\t"
+ "jb .Lenclast%=\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0xb0(%[key]), %%xmm1\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0xc0(%[key]), %%xmm1\n\t"
+ "je .Lenclast%=\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0xd0(%[key]), %%xmm1\n\t"
+ "aesenc %%xmm1, %%xmm0\n\t"
+ "aesenc %%xmm1, %%xmm2\n\t"
+ "aesenc %%xmm1, %%xmm3\n\t"
+ "aesenc %%xmm1, %%xmm4\n\t"
+ "aesenc %%xmm1, %%xmm8\n\t"
+ "aesenc %%xmm1, %%xmm9\n\t"
+ "aesenc %%xmm1, %%xmm10\n\t"
+ "aesenc %%xmm1, %%xmm11\n\t"
+ "movdqa 0xe0(%[key]), %%xmm1\n"
+
+ ".Lenclast%=:\n\t"
+ :
+ : [key] "r" (ctx->keyschenc),
+ [rounds] "r" (ctx->rounds),
+ [src] "r" (a)
+ : "cc", "memory");
+
+ asm volatile ("pxor %%xmm1, %%xmm12\n\t" /* block1 ^= lastkey */
+ "pxor %%xmm1, %%xmm13\n\t" /* block2 ^= lastkey */
+ "pxor %%xmm1, %%xmm14\n\t" /* block3 ^= lastkey */
+ "pxor %%xmm1, %%xmm15\n\t" /* block4 ^= lastkey */
+ "aesenclast %%xmm12, %%xmm0\n\t"
+ "aesenclast %%xmm13, %%xmm2\n\t"
+ "aesenclast %%xmm14, %%xmm3\n\t"
+ "aesenclast %%xmm15, %%xmm4\n\t"
+ "movdqu 5*16(%[src]), %%xmm12\n\t" /* Get block 6. */
+ "movdqu 6*16(%[src]), %%xmm13\n\t" /* Get block 7. */
+ "movdqu 7*16(%[src]), %%xmm14\n\t" /* Get block 8. */
+ "movdqu %%xmm0, 0*16(%[dst])\n\t" /* Store block 1. */
+ "movdqu %%xmm2, 1*16(%[dst])\n\t" /* Store block 2. */
+ "movdqu %%xmm3, 2*16(%[dst])\n\t" /* Store block 3. */
+ "movdqu %%xmm4, 3*16(%[dst])\n\t" /* Store block 4. */
+ "pxor %%xmm1, %%xmm7\n\t" /* block5 ^= lastkey */
+ "pxor %%xmm1, %%xmm12\n\t" /* block6 ^= lastkey */
+ "pxor %%xmm1, %%xmm13\n\t" /* block7 ^= lastkey */
+ "pxor %%xmm1, %%xmm14\n\t" /* block8 ^= lastkey */
+ "aesenclast %%xmm7, %%xmm8\n\t"
+ "aesenclast %%xmm12, %%xmm9\n\t"
+ "aesenclast %%xmm13, %%xmm10\n\t"
+ "aesenclast %%xmm14, %%xmm11\n\t"
+ "movdqu %%xmm8, 4*16(%[dst])\n\t" /* Store block 8. */
+ "movdqu %%xmm9, 5*16(%[dst])\n\t" /* Store block 9. */
+ "movdqu %%xmm10, 6*16(%[dst])\n\t" /* Store block 10. */
+ "movdqu %%xmm11, 7*16(%[dst])\n\t" /* Store block 11. */
+ :
+ : [src] "r" (a),
+ [dst] "r" (b)
+ : "memory");
+}
+
+#endif /* __x86_64__ */
+
+
+unsigned int ASM_FUNC_ATTR
+_gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
+ const unsigned char *src)
+{
+ aesni_prepare ();
+ asm volatile ("movdqu %[src], %%xmm0\n\t"
+ :
+ : [src] "m" (*src)
+ : "memory" );
+ do_aesni_enc (ctx);
+ asm volatile ("movdqu %%xmm0, %[dst]\n\t"
+ : [dst] "=m" (*dst)
+ :
+ : "memory" );
+ aesni_cleanup ();
+ return 0;
+}
+
+
+void ASM_FUNC_ATTR
+_gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ aesni_prepare ();
+
+ asm volatile ("movdqu %[iv], %%xmm0\n\t"
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory" );
+
+ for ( ;nblocks; nblocks-- )
+ {
+ do_aesni_enc (ctx);
+
+ asm volatile ("movdqu %[inbuf], %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ : [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ asm volatile ("movdqu %%xmm0, %[iv]\n\t"
+ : [iv] "=m" (*iv)
+ :
+ : "memory" );
+
+ aesni_cleanup ();
+}
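+
+/* CFB encryption is inherently serial; for illustration the loop above
+   computes (sketch only):
+
+     for (i = 0; i < nblocks; i++)
+       iv = C[i] = E_K(iv) ^ P[i];
+ */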
+
+
+void ASM_FUNC_ATTR
+_gcry_aes_aesni_cbc_enc (RIJNDAEL_context *ctx, unsigned char *iv,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks, int cbc_mac)
+{
+ aesni_prepare_2_7_variable;
+
+ aesni_prepare ();
+ aesni_prepare_2_7();
+
+ asm volatile ("movdqu %[iv], %%xmm5\n\t"
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory" );
+
+ for ( ;nblocks; nblocks-- )
+ {
+ asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ : /* No output */
+ : [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_aesni_enc (ctx);
+
+ asm volatile ("movdqa %%xmm0, %%xmm5\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ inbuf += BLOCKSIZE;
+ if (!cbc_mac)
+ outbuf += BLOCKSIZE;
+ }
+
+ asm volatile ("movdqu %%xmm5, %[iv]\n\t"
+ : [iv] "=m" (*iv)
+ :
+ : "memory" );
+
+ aesni_cleanup ();
+ aesni_cleanup_2_7 ();
+}
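+
+/* For illustration, the loop above computes (sketch only):
+
+     for (i = 0; i < nblocks; i++)
+       iv = C[i] = E_K(P[i] ^ iv);
+
+   In CBC-MAC mode every block is still written, but outbuf is not
+   advanced, so only the final block (the MAC) survives. */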
+
+
+void ASM_FUNC_ATTR
+_gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *ctr,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+ aesni_prepare_2_7_variable;
+
+ aesni_prepare ();
+ aesni_prepare_2_7();
+
+ asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */
+ "movdqa %[ctr], %%xmm5\n\t" /* Preload CTR */
+ : /* No output */
+ : [mask] "m" (*be_mask),
+ [ctr] "m" (*ctr)
+ : "memory");
+
+#ifdef __x86_64__
+ if (nblocks >= 8)
+ {
+ aesni_prepare_8_15_variable;
+
+ aesni_prepare_8_15();
+
+ for ( ;nblocks >= 8 ; nblocks -= 8 )
+ {
+ do_aesni_ctr_8 (ctx, ctr, outbuf, inbuf);
+ outbuf += 8*BLOCKSIZE;
+ inbuf += 8*BLOCKSIZE;
+ }
+
+ aesni_cleanup_8_15();
+ }
+#endif
+
+ for ( ;nblocks >= 4 ; nblocks -= 4 )
+ {
+ do_aesni_ctr_4 (ctx, ctr, outbuf, inbuf);
+ outbuf += 4*BLOCKSIZE;
+ inbuf += 4*BLOCKSIZE;
+ }
+ for ( ;nblocks; nblocks-- )
+ {
+ do_aesni_ctr (ctx, ctr, outbuf, inbuf);
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+ aesni_cleanup ();
+ aesni_cleanup_2_7 ();
+}
+
+
+unsigned int ASM_FUNC_ATTR
+_gcry_aes_aesni_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
+ const unsigned char *src)
+{
+ aesni_prepare ();
+ asm volatile ("movdqu %[src], %%xmm0\n\t"
+ :
+ : [src] "m" (*src)
+ : "memory" );
+ do_aesni_dec (ctx);
+ asm volatile ("movdqu %%xmm0, %[dst]\n\t"
+ : [dst] "=m" (*dst)
+ :
+ : "memory" );
+ aesni_cleanup ();
+ return 0;
+}
+
+
+void ASM_FUNC_ATTR
+_gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ aesni_prepare_2_7_variable;
+
+ aesni_prepare ();
+ aesni_prepare_2_7();
+
+ asm volatile ("movdqu %[iv], %%xmm6\n\t"
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory" );
+
+ /* CFB decryption can be parallelized */
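+/* Decryption only feeds ciphertext into the block cipher:
+     P[i] = E_K(C[i-1]) ^ C[i]     (with C[-1] = IV)
+   and every C[i-1] is already known, so the 8-way and 4-way paths
+   below can run the E_K() computations in parallel. */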
+
+#ifdef __x86_64__
+ if (nblocks >= 8)
+ {
+ aesni_prepare_8_15_variable;
+
+ aesni_prepare_8_15();
+
+ for ( ;nblocks >= 8; nblocks -= 8)
+ {
+ asm volatile
+ ("movdqa (%[key]), %%xmm0\n\t"
+
+ "movdqu %%xmm6, %%xmm1\n\t" /* load input blocks */
+ "movdqu 0*16(%[inbuf]), %%xmm2\n\t"
+ "movdqu 1*16(%[inbuf]), %%xmm3\n\t"
+ "movdqu 2*16(%[inbuf]), %%xmm4\n\t"
+ "movdqu 3*16(%[inbuf]), %%xmm8\n\t"
+ "movdqu 4*16(%[inbuf]), %%xmm9\n\t"
+ "movdqu 5*16(%[inbuf]), %%xmm10\n\t"
+ "movdqu 6*16(%[inbuf]), %%xmm11\n\t"
+
+ "movdqu 7*16(%[inbuf]), %%xmm6\n\t" /* update IV */
+
+ "movdqa %%xmm2, %%xmm12\n\t"
+ "movdqa %%xmm3, %%xmm13\n\t"
+ "movdqa %%xmm4, %%xmm14\n\t"
+ "movdqa %%xmm8, %%xmm15\n\t"
+
+ "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */
+ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */
+ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */
+ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */
+ "pxor %%xmm0, %%xmm8\n\t" /* xmm8 ^= key[0] */
+ "pxor %%xmm0, %%xmm9\n\t" /* xmm9 ^= key[0] */
+ "pxor %%xmm0, %%xmm10\n\t" /* xmm10 ^= key[0] */
+ "pxor %%xmm0, %%xmm11\n\t" /* xmm11 ^= key[0] */
+ : /* No output */
+ : [inbuf] "r" (inbuf),
+ [key] "r" (ctx->keyschenc)
+ : "memory");
+
+ do_aesni_enc_vec8 (ctx);
+
+ asm volatile
+ (
+ "pxor %%xmm0, %%xmm12\n\t"
+ "pxor %%xmm0, %%xmm13\n\t"
+ "pxor %%xmm0, %%xmm14\n\t"
+ "pxor %%xmm0, %%xmm15\n\t"
+ "aesenclast %%xmm12, %%xmm1\n\t"
+ "aesenclast %%xmm13, %%xmm2\n\t"
+ "aesenclast %%xmm14, %%xmm3\n\t"
+ "aesenclast %%xmm15, %%xmm4\n\t"
+
+ "movdqu 4*16(%[inbuf]), %%xmm12\n\t"
+ "movdqu 5*16(%[inbuf]), %%xmm13\n\t"
+ "movdqu 6*16(%[inbuf]), %%xmm14\n\t"
+ "movdqu 7*16(%[inbuf]), %%xmm15\n\t"
+ "pxor %%xmm0, %%xmm12\n\t"
+ "pxor %%xmm0, %%xmm13\n\t"
+ "pxor %%xmm0, %%xmm14\n\t"
+ "pxor %%xmm0, %%xmm15\n\t"
+
+ "aesenclast %%xmm12, %%xmm8\n\t"
+ "aesenclast %%xmm13, %%xmm9\n\t"
+ "aesenclast %%xmm14, %%xmm10\n\t"
+ "aesenclast %%xmm15, %%xmm11\n\t"
+
+ "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+ "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+ "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+ "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+
+ "movdqu %%xmm8, 4*16(%[outbuf])\n\t"
+ "movdqu %%xmm9, 5*16(%[outbuf])\n\t"
+ "movdqu %%xmm10, 6*16(%[outbuf])\n\t"
+ "movdqu %%xmm11, 7*16(%[outbuf])\n\t"
+
+ : /* No output */
+ : [inbuf] "r" (inbuf),
+ [outbuf] "r" (outbuf)
+ : "memory");
+
+ outbuf += 8*BLOCKSIZE;
+ inbuf += 8*BLOCKSIZE;
+ }
+
+ aesni_cleanup_8_15();
+ }
+#endif
+
+ for ( ;nblocks >= 4; nblocks -= 4)
+ {
+ asm volatile
+ ("movdqu %%xmm6, %%xmm1\n\t" /* load input blocks */
+ "movdqu 0*16(%[inbuf]), %%xmm2\n\t"
+ "movdqu 1*16(%[inbuf]), %%xmm3\n\t"
+ "movdqu 2*16(%[inbuf]), %%xmm4\n\t"
+
+ "movdqu 3*16(%[inbuf]), %%xmm6\n\t" /* update IV */
+ : /* No output */
+ : [inbuf] "r" (inbuf)
+ : "memory");
+
+ do_aesni_enc_vec4 (ctx);
+
+ asm volatile
+ ("movdqu 0*16(%[inbuf]), %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+
+ "movdqu 1*16(%[inbuf]), %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+
+ "movdqu 2*16(%[inbuf]), %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+
+ "movdqu 3*16(%[inbuf]), %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+
+ : /* No output */
+ : [inbuf] "r" (inbuf),
+ [outbuf] "r" (outbuf)
+ : "memory");
+
+ outbuf += 4*BLOCKSIZE;
+ inbuf += 4*BLOCKSIZE;
+ }
+
+ asm volatile ("movdqu %%xmm6, %%xmm0\n\t" ::: "cc");
+
+ for ( ;nblocks; nblocks-- )
+ {
+ do_aesni_enc (ctx);
+
+ asm volatile ("movdqa %%xmm0, %%xmm6\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm6\n\t"
+ "movdqu %%xmm6, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ : [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ asm volatile ("movdqu %%xmm0, %[iv]\n\t"
+ : [iv] "=m" (*iv)
+ :
+ : "memory" );
+
+ aesni_cleanup ();
+ aesni_cleanup_2_7 ();
+}
+
+
+void ASM_FUNC_ATTR
+_gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ aesni_prepare_2_7_variable;
+
+ aesni_prepare ();
+ aesni_prepare_2_7();
+
+ if ( !ctx->decryption_prepared )
+ {
+ do_aesni_prepare_decryption ( ctx );
+ ctx->decryption_prepared = 1;
+ }
+
+ asm volatile
+ ("movdqu %[iv], %%xmm5\n\t" /* use xmm5 as fast IV storage */
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory");
+
+#ifdef __x86_64__
+ if (nblocks >= 8)
+ {
+ aesni_prepare_8_15_variable;
+
+ aesni_prepare_8_15();
+
+ for ( ;nblocks >= 8 ; nblocks -= 8 )
+ {
+ asm volatile
+ ("movdqa (%[key]), %%xmm0\n\t"
+
+ "movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */
+ "movdqu 1*16(%[inbuf]), %%xmm2\n\t"
+ "movdqu 2*16(%[inbuf]), %%xmm3\n\t"
+ "movdqu 3*16(%[inbuf]), %%xmm4\n\t"
+ "movdqu 4*16(%[inbuf]), %%xmm8\n\t"
+ "movdqu 5*16(%[inbuf]), %%xmm9\n\t"
+ "movdqu 6*16(%[inbuf]), %%xmm10\n\t"
+ "movdqu 7*16(%[inbuf]), %%xmm11\n\t"
+
+ "movdqa %%xmm1, %%xmm12\n\t"
+ "movdqa %%xmm2, %%xmm13\n\t"
+ "movdqa %%xmm3, %%xmm14\n\t"
+ "movdqa %%xmm4, %%xmm15\n\t"
+
+ "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */
+ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */
+ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */
+ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */
+ "pxor %%xmm0, %%xmm8\n\t" /* xmm8 ^= key[0] */
+ "pxor %%xmm0, %%xmm9\n\t" /* xmm9 ^= key[0] */
+ "pxor %%xmm0, %%xmm10\n\t" /* xmm10 ^= key[0] */
+ "pxor %%xmm0, %%xmm11\n\t" /* xmm11 ^= key[0] */
+
+ : /* No output */
+ : [inbuf] "r" (inbuf),
+ [key] "r" (ctx->keyschdec)
+ : "memory");
+
+ do_aesni_dec_vec8 (ctx);
+
+ asm volatile
+ (
+ "pxor %%xmm0, %%xmm5\n\t" /* xor IV with key */
+ "pxor %%xmm0, %%xmm12\n\t" /* xor IV with key */
+ "pxor %%xmm0, %%xmm13\n\t" /* xor IV with key */
+ "pxor %%xmm0, %%xmm14\n\t" /* xor IV with key */
+ "pxor %%xmm0, %%xmm15\n\t" /* xor IV with key */
+
+ "aesdeclast %%xmm5, %%xmm1\n\t"
+ "aesdeclast %%xmm12, %%xmm2\n\t"
+ "aesdeclast %%xmm13, %%xmm3\n\t"
+ "aesdeclast %%xmm14, %%xmm4\n\t"
+
+ "movdqu 4*16(%[inbuf]), %%xmm12\n\t"
+ "movdqu 5*16(%[inbuf]), %%xmm13\n\t"
+ "movdqu 6*16(%[inbuf]), %%xmm14\n\t"
+ "movdqu 7*16(%[inbuf]), %%xmm5\n\t"
+ "pxor %%xmm0, %%xmm12\n\t" /* xor IV with key */
+ "pxor %%xmm0, %%xmm13\n\t" /* xor IV with key */
+ "pxor %%xmm0, %%xmm14\n\t" /* xor IV with key */
+
+ "aesdeclast %%xmm15, %%xmm8\n\t"
+ "aesdeclast %%xmm12, %%xmm9\n\t"
+ "aesdeclast %%xmm13, %%xmm10\n\t"
+ "aesdeclast %%xmm14, %%xmm11\n\t"
+
+ "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+ "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+ "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+ "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+ "movdqu %%xmm8, 4*16(%[outbuf])\n\t"
+ "movdqu %%xmm9, 5*16(%[outbuf])\n\t"
+ "movdqu %%xmm10, 6*16(%[outbuf])\n\t"
+ "movdqu %%xmm11, 7*16(%[outbuf])\n\t"
+
+ : /* No output */
+ : [inbuf] "r" (inbuf),
+ [outbuf] "r" (outbuf)
+ : "memory");
+
+ outbuf += 8*BLOCKSIZE;
+ inbuf += 8*BLOCKSIZE;
+ }
+
+ aesni_cleanup_8_15();
+ }
+#endif
+
+ for ( ;nblocks >= 4 ; nblocks -= 4 )
+ {
+ asm volatile
+ ("movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */
+ "movdqu 1*16(%[inbuf]), %%xmm2\n\t"
+ "movdqu 2*16(%[inbuf]), %%xmm3\n\t"
+ "movdqu 3*16(%[inbuf]), %%xmm4\n\t"
+ : /* No output */
+ : [inbuf] "r" (inbuf)
+ : "memory");
+
+ do_aesni_dec_vec4 (ctx);
+
+ asm volatile
+ ("pxor %%xmm5, %%xmm1\n\t" /* xor IV with output */
+ "movdqu 0*16(%[inbuf]), %%xmm5\n\t" /* load new IV */
+ "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+
+ "pxor %%xmm5, %%xmm2\n\t" /* xor IV with output */
+ "movdqu 1*16(%[inbuf]), %%xmm5\n\t" /* load new IV */
+ "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+
+ "pxor %%xmm5, %%xmm3\n\t" /* xor IV with output */
+ "movdqu 2*16(%[inbuf]), %%xmm5\n\t" /* load new IV */
+ "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+
+ "pxor %%xmm5, %%xmm4\n\t" /* xor IV with output */
+ "movdqu 3*16(%[inbuf]), %%xmm5\n\t" /* load new IV */
+ "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+
+ : /* No output */
+ : [inbuf] "r" (inbuf),
+ [outbuf] "r" (outbuf)
+ : "memory");
+
+ outbuf += 4*BLOCKSIZE;
+ inbuf += 4*BLOCKSIZE;
+ }
+
+ for ( ;nblocks; nblocks-- )
+ {
+ asm volatile
+ ("movdqu %[inbuf], %%xmm0\n\t"
+ "movdqa %%xmm0, %%xmm2\n\t" /* use xmm2 as savebuf */
+ : /* No output */
+ : [inbuf] "m" (*inbuf)
+ : "memory");
+
+ /* uses only xmm0 and xmm1 */
+ do_aesni_dec (ctx);
+
+ asm volatile
+ ("pxor %%xmm5, %%xmm0\n\t" /* xor IV with output */
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ "movdqu %%xmm2, %%xmm5\n\t" /* store savebuf as new IV */
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory");
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ asm volatile
+ ("movdqu %%xmm5, %[iv]\n\t" /* store IV */
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory");
+
+ aesni_cleanup ();
+ aesni_cleanup_2_7 ();
+}
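+
+/* For illustration, the function above computes (sketch only):
+
+     for (i = 0; i < nblocks; i++)
+       {
+         P[i] = D_K(C[i]) ^ iv;   D_K uses the Equivalent Inverse Cipher keys
+         iv = C[i];
+       }
+
+   D_K() depends only on C[i], which is why the 8-way and 4-way paths
+   can decrypt blocks in parallel. */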
+
+
+static ASM_FUNC_ATTR_INLINE void
+aesni_ocb_checksum (gcry_cipher_hd_t c, const unsigned char *plaintext,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+
+ /* Calculate checksum */
+ asm volatile ("movdqu %[checksum], %%xmm6\n\t"
+ "pxor %%xmm1, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm2\n\t"
+ "pxor %%xmm3, %%xmm3\n\t"
+ :
+ :[checksum] "m" (*c->u_ctr.ctr)
+ : "memory" );
+
+ if (0) {}
+#if defined(HAVE_GCC_INLINE_ASM_AVX2)
+ else if (nblocks >= 16 && ctx->use_avx2)
+ {
+ /* Use wider 256-bit registers for fast xoring of plaintext. */
+ asm volatile ("vzeroupper\n\t"
+ "vpxor %%xmm0, %%xmm0, %%xmm0\n\t"
+ "vpxor %%xmm4, %%xmm4, %%xmm4\n\t"
+ "vpxor %%xmm5, %%xmm5, %%xmm5\n\t"
+ "vpxor %%xmm7, %%xmm7, %%xmm7\n\t"
+ :
+ :
+ : "memory");
+
+ for (;nblocks >= 16; nblocks -= 16)
+ {
+ asm volatile ("vpxor %[ptr0], %%ymm6, %%ymm6\n\t"
+ "vpxor %[ptr1], %%ymm1, %%ymm1\n\t"
+ "vpxor %[ptr2], %%ymm2, %%ymm2\n\t"
+ "vpxor %[ptr3], %%ymm3, %%ymm3\n\t"
+ :
+ : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE * 2)),
+ [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE * 2)),
+ [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE * 2)),
+ [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2))
+ : "memory" );
+ asm volatile ("vpxor %[ptr4], %%ymm0, %%ymm0\n\t"
+ "vpxor %[ptr5], %%ymm4, %%ymm4\n\t"
+ "vpxor %[ptr6], %%ymm5, %%ymm5\n\t"
+ "vpxor %[ptr7], %%ymm7, %%ymm7\n\t"
+ :
+ : [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)),
+ [ptr5] "m" (*(plaintext + 5 * BLOCKSIZE * 2)),
+ [ptr6] "m" (*(plaintext + 6 * BLOCKSIZE * 2)),
+ [ptr7] "m" (*(plaintext + 7 * BLOCKSIZE * 2))
+ : "memory" );
+ plaintext += BLOCKSIZE * 16;
+ }
+
+ asm volatile ("vpxor %%ymm0, %%ymm6, %%ymm6\n\t"
+ "vpxor %%ymm4, %%ymm1, %%ymm1\n\t"
+ "vpxor %%ymm5, %%ymm2, %%ymm2\n\t"
+ "vpxor %%ymm7, %%ymm3, %%ymm3\n\t"
+ "vextracti128 $1, %%ymm6, %%xmm0\n\t"
+ "vextracti128 $1, %%ymm1, %%xmm4\n\t"
+ "vextracti128 $1, %%ymm2, %%xmm5\n\t"
+ "vextracti128 $1, %%ymm3, %%xmm7\n\t"
+ "vpxor %%xmm0, %%xmm6, %%xmm6\n\t"
+ "vpxor %%xmm4, %%xmm1, %%xmm1\n\t"
+ "vpxor %%xmm5, %%xmm2, %%xmm2\n\t"
+ "vpxor %%xmm7, %%xmm3, %%xmm3\n\t"
+ "vzeroupper\n\t"
+ :
+ :
+ : "memory" );
+ }
+#endif
+#if defined(HAVE_GCC_INLINE_ASM_AVX)
+ else if (nblocks >= 16 && ctx->use_avx)
+ {
+ /* Same as AVX2, except using 256-bit floating point instructions. */
+ asm volatile ("vzeroupper\n\t"
+ "vxorpd %%xmm0, %%xmm0, %%xmm0\n\t"
+ "vxorpd %%xmm4, %%xmm4, %%xmm4\n\t"
+ "vxorpd %%xmm5, %%xmm5, %%xmm5\n\t"
+ "vxorpd %%xmm7, %%xmm7, %%xmm7\n\t"
+ :
+ :
+ : "memory");
+
+ for (;nblocks >= 16; nblocks -= 16)
+ {
+ asm volatile ("vxorpd %[ptr0], %%ymm6, %%ymm6\n\t"
+ "vxorpd %[ptr1], %%ymm1, %%ymm1\n\t"
+ "vxorpd %[ptr2], %%ymm2, %%ymm2\n\t"
+ "vxorpd %[ptr3], %%ymm3, %%ymm3\n\t"
+ :
+ : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE * 2)),
+ [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE * 2)),
+ [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE * 2)),
+ [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2))
+ : "memory" );
+ asm volatile ("vxorpd %[ptr4], %%ymm0, %%ymm0\n\t"
+ "vxorpd %[ptr5], %%ymm4, %%ymm4\n\t"
+ "vxorpd %[ptr6], %%ymm5, %%ymm5\n\t"
+ "vxorpd %[ptr7], %%ymm7, %%ymm7\n\t"
+ :
+ : [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)),
+ [ptr5] "m" (*(plaintext + 5 * BLOCKSIZE * 2)),
+ [ptr6] "m" (*(plaintext + 6 * BLOCKSIZE * 2)),
+ [ptr7] "m" (*(plaintext + 7 * BLOCKSIZE * 2))
+ : "memory" );
+ plaintext += BLOCKSIZE * 16;
+ }
+
+ asm volatile ("vxorpd %%ymm0, %%ymm6, %%ymm6\n\t"
+ "vxorpd %%ymm4, %%ymm1, %%ymm1\n\t"
+ "vxorpd %%ymm5, %%ymm2, %%ymm2\n\t"
+ "vxorpd %%ymm7, %%ymm3, %%ymm3\n\t"
+ "vextractf128 $1, %%ymm6, %%xmm0\n\t"
+ "vextractf128 $1, %%ymm1, %%xmm4\n\t"
+ "vextractf128 $1, %%ymm2, %%xmm5\n\t"
+ "vextractf128 $1, %%ymm3, %%xmm7\n\t"
+ "vxorpd %%xmm0, %%xmm6, %%xmm6\n\t"
+ "vxorpd %%xmm4, %%xmm1, %%xmm1\n\t"
+ "vxorpd %%xmm5, %%xmm2, %%xmm2\n\t"
+ "vxorpd %%xmm7, %%xmm3, %%xmm3\n\t"
+ "vzeroupper\n\t"
+ :
+ :
+ : "memory" );
+ }
+#endif
+
+ for (;nblocks >= 4; nblocks -= 4)
+ {
+ asm volatile ("movdqu %[ptr0], %%xmm0\n\t"
+ "movdqu %[ptr1], %%xmm4\n\t"
+ "movdqu %[ptr2], %%xmm5\n\t"
+ "movdqu %[ptr3], %%xmm7\n\t"
+ "pxor %%xmm0, %%xmm6\n\t"
+ "pxor %%xmm4, %%xmm1\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "pxor %%xmm7, %%xmm3\n\t"
+ :
+ : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE)),
+ [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE)),
+ [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE)),
+ [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE))
+ : "memory" );
+ plaintext += BLOCKSIZE * 4;
+ }
+
+ for (;nblocks >= 1; nblocks -= 1)
+ {
+ asm volatile ("movdqu %[ptr0], %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm6\n\t"
+ :
+ : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE))
+ : "memory" );
+ plaintext += BLOCKSIZE;
+ }
+
+ asm volatile ("pxor %%xmm1, %%xmm6\n\t"
+ "pxor %%xmm2, %%xmm6\n\t"
+ "pxor %%xmm3, %%xmm6\n\t"
+ "movdqu %%xmm6, %[checksum]\n\t"
+ : [checksum] "=m" (*c->u_ctr.ctr)
+ :
+ : "memory" );
+}
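+
+/* For illustration: the OCB checksum is simply the XOR of all
+   plaintext blocks,
+
+     Checksum = P[0] ^ P[1] ^ ... ^ P[n-1],
+
+   and the code above only spreads the XOR over several registers (and
+   over YMM lanes when AVX/AVX2 is available), folding them together at
+   the end. */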
+
+
+static unsigned int ASM_FUNC_ATTR_NOINLINE
+aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ u64 n = c->u_mode.ocb.data_nblocks;
+ const unsigned char *l;
+ byte tmpbuf_store[3 * 16 + 15];
+ byte *tmpbuf;
+ aesni_prepare_2_7_variable;
+
+ asm volatile ("" : "=r" (tmpbuf) : "0" (tmpbuf_store) : "memory");
+ tmpbuf = tmpbuf + (-(uintptr_t)tmpbuf & 15);
+
+ aesni_prepare ();
+ aesni_prepare_2_7 ();
+
+ /* Preload Offset */
+ asm volatile ("movdqu %[iv], %%xmm5\n\t"
+ "movdqu %[ctr], %%xmm7\n\t"
+ : /* No output */
+ : [iv] "m" (*c->u_iv.iv),
+ [ctr] "m" (*c->u_ctr.ctr)
+ : "memory" );
+
+ for ( ;nblocks && n % 4; nblocks-- )
+ {
+ l = aes_ocb_get_l(c, ++n);
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ asm volatile ("movdqu %[l], %%xmm1\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm5\n\t"
+ "pxor %%xmm0, %%xmm7\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ :
+ : [l] "m" (*l),
+ [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_aesni_enc (ctx);
+
+ asm volatile ("pxor %%xmm5, %%xmm0\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+#ifdef __x86_64__
+ if (nblocks >= 8)
+ {
+ unsigned char last_xor_first_key_store[16 + 15];
+ unsigned char *lxf_key;
+ aesni_prepare_8_15_variable;
+
+ asm volatile (""
+ : "=r" (lxf_key)
+ : "0" (last_xor_first_key_store)
+ : "memory");
+ lxf_key = lxf_key + (-(uintptr_t)lxf_key & 15);
+
+ aesni_prepare_8_15();
+
+ asm volatile ("movdqu %[l0], %%xmm6\n\t"
+ "movdqa %[last_key], %%xmm0\n\t"
+ "pxor %[first_key], %%xmm5\n\t"
+ "pxor %[first_key], %%xmm0\n\t"
+ "movdqa %%xmm0, %[lxfkey]\n\t"
+ : [lxfkey] "=m" (*lxf_key)
+ : [l0] "m" (*c->u_mode.ocb.L[0]),
+ [last_key] "m" (ctx->keyschenc[ctx->rounds][0][0]),
+ [first_key] "m" (ctx->keyschenc[0][0][0])
+ : "memory" );
+
+ for ( ;nblocks >= 8 ; nblocks -= 8 )
+ {
+ n += 4;
+ l = aes_ocb_get_l(c, n);
+
+ asm volatile ("movdqu %[l0l1], %%xmm10\n\t"
+ "movdqu %[l1], %%xmm11\n\t"
+ "movdqu %[l3], %%xmm15\n\t"
+ :
+ : [l0l1] "m" (*c->u_mode.ocb.L0L1),
+ [l1] "m" (*c->u_mode.ocb.L[1]),
+ [l3] "m" (*l)
+ : "memory" );
+
+ n += 4;
+ l = aes_ocb_get_l(c, n);
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+          /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+ asm volatile ("movdqu %[inbuf0], %%xmm1\n\t"
+ "movdqu %[inbuf1], %%xmm2\n\t"
+ "movdqu %[inbuf2], %%xmm3\n\t"
+ :
+ : [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)),
+ [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)),
+ [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[inbuf3], %%xmm4\n\t"
+ "movdqu %[inbuf4], %%xmm8\n\t"
+ "movdqu %[inbuf5], %%xmm9\n\t"
+ :
+ : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)),
+ [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE)),
+ [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqa %[lxfkey], %%xmm0\n\t"
+ "movdqa %%xmm6, %%xmm12\n\t"
+ "pxor %%xmm5, %%xmm12\n\t"
+ "pxor %%xmm1, %%xmm7\n\t"
+ "pxor %%xmm12, %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm12\n\t"
+
+ "movdqa %%xmm10, %%xmm13\n\t"
+ "pxor %%xmm5, %%xmm13\n\t"
+ "pxor %%xmm2, %%xmm7\n\t"
+ "pxor %%xmm13, %%xmm2\n\t"
+ "pxor %%xmm0, %%xmm13\n\t"
+
+ "movdqa %%xmm11, %%xmm14\n\t"
+ "pxor %%xmm5, %%xmm14\n\t"
+ "pxor %%xmm3, %%xmm7\n\t"
+ "pxor %%xmm14, %%xmm3\n\t"
+ "pxor %%xmm0, %%xmm14\n\t"
+
+ "pxor %%xmm11, %%xmm5\n\t"
+ "pxor %%xmm15, %%xmm5\n\t"
+ "pxor %%xmm4, %%xmm7\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqa %%xmm5, %%xmm15\n\t"
+ "pxor %%xmm0, %%xmm15\n\t"
+
+ "movdqa %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm8, %%xmm7\n\t"
+ "pxor %%xmm0, %%xmm8\n\t"
+ "pxor %[lxfkey], %%xmm0\n\t"
+ "movdqa %%xmm0, %[tmpbuf0]\n\t"
+
+ "movdqa %%xmm10, %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm9, %%xmm7\n\t"
+ "pxor %%xmm0, %%xmm9\n\t"
+ "pxor %[lxfkey], %%xmm0\n"
+ "movdqa %%xmm0, %[tmpbuf1]\n\t"
+ : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE)),
+ [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE))
+ : [lxfkey] "m" (*lxf_key)
+ : "memory" );
+ asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
+ "movdqa %%xmm11, %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm10, %%xmm7\n\t"
+ "pxor %%xmm0, %%xmm10\n\t"
+ "pxor %[lxfkey], %%xmm0\n\t"
+ "movdqa %%xmm0, %[tmpbuf2]\n\t"
+ : [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
+ : [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE)),
+ [lxfkey] "m" (*lxf_key)
+ : "memory" );
+ asm volatile ("movdqu %[l7], %%xmm0\n\t"
+ "pxor %%xmm11, %%xmm5\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "movdqa 0x10(%[key]), %%xmm0\n\t"
+ "movdqu %[inbuf7], %%xmm11\n\t"
+ "pxor %%xmm11, %%xmm7\n\t"
+ "pxor %%xmm5, %%xmm11\n\t"
+ :
+ : [l7] "m" (*l),
+ [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE)),
+ [key] "r" (ctx->keyschenc)
+ : "memory" );
+
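+          /* Rounds 1..9 run unconditionally; the comparison against 12 lets
+             the jb/je branches skip the extra rounds for 128-bit and 192-bit
+             keys.  The last round is applied afterwards with aesenclast and
+             the per-block offset values.  */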
+ asm volatile ("cmpl $12, %[rounds]\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x20(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x30(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x40(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x50(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x60(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x70(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x80(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0x90(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "jb .Ldeclast%=\n\t"
+ "movdqa 0xa0(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0xb0(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "je .Ldeclast%=\n\t"
+ "movdqa 0xc0(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+ "movdqa 0xd0(%[key]), %%xmm0\n\t"
+ "aesenc %%xmm0, %%xmm1\n\t"
+ "aesenc %%xmm0, %%xmm2\n\t"
+ "aesenc %%xmm0, %%xmm3\n\t"
+ "aesenc %%xmm0, %%xmm4\n\t"
+ "aesenc %%xmm0, %%xmm8\n\t"
+ "aesenc %%xmm0, %%xmm9\n\t"
+ "aesenc %%xmm0, %%xmm10\n\t"
+ "aesenc %%xmm0, %%xmm11\n\t"
+
+ ".Ldeclast%=:\n\t"
+ :
+ : [key] "r" (ctx->keyschenc),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+
+ asm volatile ("aesenclast %%xmm12, %%xmm1\n\t"
+ "aesenclast %%xmm13, %%xmm2\n\t"
+ "aesenclast %%xmm14, %%xmm3\n\t"
+ "aesenclast %%xmm15, %%xmm4\n\t"
+ "aesenclast %[tmpbuf0],%%xmm8\n\t"
+ "aesenclast %[tmpbuf1],%%xmm9\n\t"
+ "aesenclast %[tmpbuf2],%%xmm10\n\t"
+ :
+ : [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)),
+ [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE)),
+ [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE)),
+ [lxfkey] "m" (*lxf_key)
+ : "memory" );
+ asm volatile ("aesenclast %%xmm5, %%xmm11\n\t"
+ "pxor %[lxfkey], %%xmm11\n\t"
+ "movdqu %%xmm1, %[outbuf0]\n\t"
+ "movdqu %%xmm2, %[outbuf1]\n\t"
+ : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
+ [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+ : [lxfkey] "m" (*lxf_key)
+ : "memory" );
+ asm volatile ("movdqu %%xmm3, %[outbuf2]\n\t"
+ "movdqu %%xmm4, %[outbuf3]\n\t"
+ "movdqu %%xmm8, %[outbuf4]\n\t"
+ : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
+ [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)),
+ [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
+ :
+ : "memory" );
+ asm volatile ("movdqu %%xmm9, %[outbuf5]\n\t"
+ "movdqu %%xmm10, %[outbuf6]\n\t"
+ "movdqu %%xmm11, %[outbuf7]\n\t"
+ : [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE)),
+ [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE)),
+ [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE))
+ :
+ : "memory" );
+
+ outbuf += 8*BLOCKSIZE;
+ inbuf += 8*BLOCKSIZE;
+ }
+
+ asm volatile ("pxor %[first_key], %%xmm5\n\t"
+ "pxor %%xmm0, %%xmm0\n\t"
+ "movdqu %%xmm0, %[lxfkey]\n\t"
+ : [lxfkey] "=m" (*lxf_key)
+ : [first_key] "m" (ctx->keyschenc[0][0][0])
+ : "memory" );
+
+ aesni_cleanup_8_15();
+ }
+#endif
+
+ for ( ;nblocks >= 4 ; nblocks -= 4 )
+ {
+ n += 4;
+ l = aes_ocb_get_l(c, n);
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ asm volatile ("movdqu %[l0], %%xmm0\n\t"
+ "movdqu %[inbuf0], %%xmm1\n\t"
+ "movdqu %[l0l1], %%xmm3\n\t"
+ :
+ : [l0] "m" (*c->u_mode.ocb.L[0]),
+ [l0l1] "m" (*c->u_mode.ocb.L0L1),
+ [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l1], %%xmm4\n\t"
+ "movdqu %[l3], %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm7\n\t"
+ "pxor %%xmm0, %%xmm1\n\t"
+ "movdqa %%xmm0, %[tmpbuf0]\n\t"
+ : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE))
+ : [l1] "m" (*c->u_mode.ocb.L[1]),
+ [l3] "m" (*l)
+ : "memory" );
+ asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "pxor %%xmm2, %%xmm7\n\t"
+ "pxor %%xmm3, %%xmm2\n\t"
+ "movdqa %%xmm3, %[tmpbuf1]\n\t"
+ : [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE))
+ : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
+ "movdqu %[inbuf2], %%xmm3\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm3, %%xmm7\n\t"
+ "pxor %%xmm0, %%xmm3\n\t"
+ "movdqa %%xmm0, %[tmpbuf2]\n\t"
+ : [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
+ :
+ [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("pxor %%xmm6, %%xmm5\n\t"
+ "pxor %%xmm4, %%xmm5\n\t"
+ "movdqu %[inbuf3], %%xmm4\n\t"
+ "pxor %%xmm4, %%xmm7\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ :
+ : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+ : "memory" );
+
+ do_aesni_enc_vec4 (ctx);
+
+ asm volatile ("pxor %[tmpbuf0],%%xmm1\n\t"
+ "movdqu %%xmm1, %[outbuf0]\n\t"
+ "pxor %[tmpbuf1],%%xmm2\n\t"
+ "movdqu %%xmm2, %[outbuf1]\n\t"
+ : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
+ [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+ : [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)),
+ [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("pxor %[tmpbuf2],%%xmm3\n\t"
+ "movdqu %%xmm3, %[outbuf2]\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqu %%xmm4, %[outbuf3]\n\t"
+ : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
+ [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
+ : [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE))
+ : "memory" );
+
+ outbuf += 4*BLOCKSIZE;
+ inbuf += 4*BLOCKSIZE;
+ }
+
+ for ( ;nblocks; nblocks-- )
+ {
+ l = aes_ocb_get_l(c, ++n);
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ asm volatile ("movdqu %[l], %%xmm1\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm5\n\t"
+ "pxor %%xmm0, %%xmm7\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ :
+ : [l] "m" (*l),
+ [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_aesni_enc (ctx);
+
+ asm volatile ("pxor %%xmm5, %%xmm0\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ c->u_mode.ocb.data_nblocks = n;
+ asm volatile ("movdqu %%xmm5, %[iv]\n\t"
+ "movdqu %%xmm7, %[ctr]\n\t"
+ : [iv] "=m" (*c->u_iv.iv),
+ [ctr] "=m" (*c->u_ctr.ctr)
+ :
+ : "memory" );
+
+ asm volatile ("pxor %%xmm0, %%xmm0\n\t"
+ "movdqa %%xmm0, %[tmpbuf0]\n\t"
+ "movdqa %%xmm0, %[tmpbuf1]\n\t"
+ "movdqa %%xmm0, %[tmpbuf2]\n\t"
+ : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE)),
+ [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE)),
+ [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
+ :
+ : "memory" );
+
+ aesni_cleanup ();
+ aesni_cleanup_2_7 ();
+
+ return 0;
+}
+
+
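+/* Decrypt NBLOCKS blocks under OCB.  Mirrors aesni_ocb_enc but runs the
+ * inverse cipher with the decryption key schedule; the Checksum is not
+ * accumulated here, it is computed over the plaintext afterwards by
+ * aesni_ocb_checksum.  */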
+static unsigned int ASM_FUNC_ATTR_NOINLINE
+aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks_arg)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ u64 n = c->u_mode.ocb.data_nblocks;
+ const unsigned char *l;
+ size_t nblocks = nblocks_arg;
+ byte tmpbuf_store[3 * 16 + 15];
+ byte *tmpbuf;
+ aesni_prepare_2_7_variable;
+
+ asm volatile ("" : "=r" (tmpbuf) : "0" (tmpbuf_store) : "memory");
+ tmpbuf = tmpbuf + (-(uintptr_t)tmpbuf & 15);
+
+ aesni_prepare ();
+ aesni_prepare_2_7 ();
+
+ if ( !ctx->decryption_prepared )
+ {
+ do_aesni_prepare_decryption ( ctx );
+ ctx->decryption_prepared = 1;
+ }
+
+ /* Preload Offset */
+ asm volatile ("movdqu %[iv], %%xmm5\n\t"
+ : /* No output */
+ : [iv] "m" (*c->u_iv.iv)
+ : "memory" );
+
+ for ( ;nblocks && n % 4; nblocks-- )
+ {
+ l = aes_ocb_get_l(c, ++n);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+ asm volatile ("movdqu %[l], %%xmm1\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ :
+ : [l] "m" (*l),
+ [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_aesni_dec (ctx);
+
+ asm volatile ("pxor %%xmm5, %%xmm0\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+#ifdef __x86_64__
+ if (nblocks >= 8)
+ {
+ unsigned char last_xor_first_key_store[16 + 15];
+ unsigned char *lxf_key;
+ aesni_prepare_8_15_variable;
+
+ asm volatile (""
+ : "=r" (lxf_key)
+ : "0" (last_xor_first_key_store)
+ : "memory");
+ lxf_key = lxf_key + (-(uintptr_t)lxf_key & 15);
+
+ aesni_prepare_8_15();
+
+ asm volatile ("movdqu %[l0], %%xmm6\n\t"
+ "movdqa %[last_key], %%xmm0\n\t"
+ "pxor %[first_key], %%xmm5\n\t"
+ "pxor %[first_key], %%xmm0\n\t"
+ "movdqa %%xmm0, %[lxfkey]\n\t"
+ : [lxfkey] "=m" (*lxf_key)
+ : [l0] "m" (*c->u_mode.ocb.L[0]),
+ [last_key] "m" (ctx->keyschdec[ctx->rounds][0][0]),
+ [first_key] "m" (ctx->keyschdec[0][0][0])
+ : "memory" );
+
+ for ( ;nblocks >= 8 ; nblocks -= 8 )
+ {
+ n += 4;
+ l = aes_ocb_get_l(c, n);
+
+ asm volatile ("movdqu %[l0l1], %%xmm10\n\t"
+ "movdqu %[l1], %%xmm11\n\t"
+ "movdqu %[l3], %%xmm15\n\t"
+ :
+ : [l0l1] "m" (*c->u_mode.ocb.L0L1),
+ [l1] "m" (*c->u_mode.ocb.L[1]),
+ [l3] "m" (*l)
+ : "memory" );
+
+ n += 4;
+ l = aes_ocb_get_l(c, n);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+          /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+ asm volatile ("movdqu %[inbuf0], %%xmm1\n\t"
+ "movdqu %[inbuf1], %%xmm2\n\t"
+ "movdqu %[inbuf2], %%xmm3\n\t"
+ :
+ : [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)),
+ [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)),
+ [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[inbuf3], %%xmm4\n\t"
+ "movdqu %[inbuf4], %%xmm8\n\t"
+ "movdqu %[inbuf5], %%xmm9\n\t"
+ :
+ : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)),
+ [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE)),
+ [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqa %[lxfkey], %%xmm0\n\t"
+ "movdqa %%xmm6, %%xmm12\n\t"
+ "pxor %%xmm5, %%xmm12\n\t"
+ "pxor %%xmm12, %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm12\n\t"
+
+ "movdqa %%xmm10, %%xmm13\n\t"
+ "pxor %%xmm5, %%xmm13\n\t"
+ "pxor %%xmm13, %%xmm2\n\t"
+ "pxor %%xmm0, %%xmm13\n\t"
+
+ "movdqa %%xmm11, %%xmm14\n\t"
+ "pxor %%xmm5, %%xmm14\n\t"
+ "pxor %%xmm14, %%xmm3\n\t"
+ "pxor %%xmm0, %%xmm14\n\t"
+
+ "pxor %%xmm11, %%xmm5\n\t"
+ "pxor %%xmm15, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqa %%xmm5, %%xmm15\n\t"
+ "pxor %%xmm0, %%xmm15\n\t"
+
+ "movdqa %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm8\n\t"
+ "pxor %[lxfkey], %%xmm0\n\t"
+ "movdqa %%xmm0, %[tmpbuf0]\n\t"
+
+ "movdqa %%xmm10, %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm9\n\t"
+ "pxor %[lxfkey], %%xmm0\n"
+ "movdqa %%xmm0, %[tmpbuf1]\n\t"
+ : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE)),
+ [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE))
+ : [lxfkey] "m" (*lxf_key)
+ : "memory" );
+ asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
+ "movdqa %%xmm11, %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm10\n\t"
+ "pxor %[lxfkey], %%xmm0\n\t"
+ "movdqa %%xmm0, %[tmpbuf2]\n\t"
+ : [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
+ : [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE)),
+ [lxfkey] "m" (*lxf_key)
+ : "memory" );
+ asm volatile ("movdqu %[l7], %%xmm0\n\t"
+ "pxor %%xmm11, %%xmm5\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "movdqa 0x10(%[key]), %%xmm0\n\t"
+ "movdqu %[inbuf7], %%xmm11\n\t"
+ "pxor %%xmm5, %%xmm11\n\t"
+ :
+ : [l7] "m" (*l),
+ [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE)),
+ [key] "r" (ctx->keyschdec)
+ : "memory" );
+
+ asm volatile ("cmpl $12, %[rounds]\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x20(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x30(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x40(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x50(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x60(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x70(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x80(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0x90(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "jb .Ldeclast%=\n\t"
+ "movdqa 0xa0(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0xb0(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "je .Ldeclast%=\n\t"
+ "movdqa 0xc0(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+ "movdqa 0xd0(%[key]), %%xmm0\n\t"
+ "aesdec %%xmm0, %%xmm1\n\t"
+ "aesdec %%xmm0, %%xmm2\n\t"
+ "aesdec %%xmm0, %%xmm3\n\t"
+ "aesdec %%xmm0, %%xmm4\n\t"
+ "aesdec %%xmm0, %%xmm8\n\t"
+ "aesdec %%xmm0, %%xmm9\n\t"
+ "aesdec %%xmm0, %%xmm10\n\t"
+ "aesdec %%xmm0, %%xmm11\n\t"
+
+ ".Ldeclast%=:\n\t"
+ :
+ : [key] "r" (ctx->keyschdec),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+
+ asm volatile ("aesdeclast %%xmm12, %%xmm1\n\t"
+ "aesdeclast %%xmm13, %%xmm2\n\t"
+ "aesdeclast %%xmm14, %%xmm3\n\t"
+ "aesdeclast %%xmm15, %%xmm4\n\t"
+ "aesdeclast %[tmpbuf0],%%xmm8\n\t"
+ "aesdeclast %[tmpbuf1],%%xmm9\n\t"
+ "aesdeclast %[tmpbuf2],%%xmm10\n\t"
+ :
+ : [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)),
+ [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE)),
+ [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("aesdeclast %%xmm5, %%xmm11\n\t"
+ "pxor %[lxfkey], %%xmm11\n\t"
+ "movdqu %%xmm1, %[outbuf0]\n\t"
+ "movdqu %%xmm2, %[outbuf1]\n\t"
+ : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
+ [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+ : [lxfkey] "m" (*lxf_key)
+ : "memory" );
+ asm volatile ("movdqu %%xmm3, %[outbuf2]\n\t"
+ "movdqu %%xmm4, %[outbuf3]\n\t"
+ "movdqu %%xmm8, %[outbuf4]\n\t"
+ : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
+ [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)),
+ [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
+ :
+ : "memory" );
+ asm volatile ("movdqu %%xmm9, %[outbuf5]\n\t"
+ "movdqu %%xmm10, %[outbuf6]\n\t"
+ "movdqu %%xmm11, %[outbuf7]\n\t"
+ : [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE)),
+ [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE)),
+ [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE))
+ :
+ : "memory" );
+
+ outbuf += 8*BLOCKSIZE;
+ inbuf += 8*BLOCKSIZE;
+ }
+
+ asm volatile ("pxor %[first_key], %%xmm5\n\t"
+ "pxor %%xmm0, %%xmm0\n\t"
+ "movdqu %%xmm0, %[lxfkey]\n\t"
+ : [lxfkey] "=m" (*lxf_key)
+ : [first_key] "m" (ctx->keyschdec[0][0][0])
+ : "memory" );
+
+ aesni_cleanup_8_15();
+ }
+#endif
+
+ for ( ;nblocks >= 4 ; nblocks -= 4 )
+ {
+ n += 4;
+ l = aes_ocb_get_l(c, n);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+ asm volatile ("movdqu %[l0], %%xmm0\n\t"
+ "movdqu %[inbuf0], %%xmm1\n\t"
+ "movdqu %[l0l1], %%xmm3\n\t"
+ :
+ : [l0] "m" (*c->u_mode.ocb.L[0]),
+ [l0l1] "m" (*c->u_mode.ocb.L0L1),
+ [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l1], %%xmm4\n\t"
+ "movdqu %[l3], %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t"
+ "movdqa %%xmm0, %[tmpbuf0]\n\t"
+ : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE))
+ : [l1] "m" (*c->u_mode.ocb.L[1]),
+ [l3] "m" (*l)
+ : "memory" );
+ asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "pxor %%xmm3, %%xmm2\n\t"
+ "movdqa %%xmm3, %[tmpbuf1]\n\t"
+ : [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE))
+ : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
+ "movdqu %[inbuf2], %%xmm3\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm3\n\t"
+ "movdqa %%xmm0, %[tmpbuf2]\n\t"
+ : [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
+ :
+ [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("pxor %%xmm6, %%xmm5\n\t"
+ "pxor %%xmm4, %%xmm5\n\t"
+ "movdqu %[inbuf3], %%xmm4\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ :
+ : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+ : "memory" );
+
+ do_aesni_dec_vec4 (ctx);
+
+ asm volatile ("pxor %[tmpbuf0],%%xmm1\n\t"
+ "movdqu %%xmm1, %[outbuf0]\n\t"
+ "pxor %[tmpbuf1],%%xmm2\n\t"
+ "movdqu %%xmm2, %[outbuf1]\n\t"
+ : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
+ [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+ : [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)),
+ [tmpbuf1] "m" (*(tmpbuf + 1 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("pxor %[tmpbuf2],%%xmm3\n\t"
+ "movdqu %%xmm3, %[outbuf2]\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqu %%xmm4, %[outbuf3]\n\t"
+ : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
+ [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
+ : [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE))
+ : "memory" );
+
+ outbuf += 4*BLOCKSIZE;
+ inbuf += 4*BLOCKSIZE;
+ }
+
+ for ( ;nblocks; nblocks-- )
+ {
+ l = aes_ocb_get_l(c, ++n);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ asm volatile ("movdqu %[l], %%xmm1\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ :
+ : [l] "m" (*l),
+ [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_aesni_dec (ctx);
+
+ asm volatile ("pxor %%xmm5, %%xmm0\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ c->u_mode.ocb.data_nblocks = n;
+ asm volatile ("movdqu %%xmm5, %[iv]\n\t"
+ : [iv] "=m" (*c->u_iv.iv)
+ :
+ : "memory" );
+
+ asm volatile ("pxor %%xmm0, %%xmm0\n\t"
+ "movdqa %%xmm0, %[tmpbuf0]\n\t"
+ "movdqa %%xmm0, %[tmpbuf1]\n\t"
+ "movdqa %%xmm0, %[tmpbuf2]\n\t"
+ : [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE)),
+ [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE)),
+ [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
+ :
+ : "memory" );
+
+ aesni_ocb_checksum (c, outbuf_arg, nblocks_arg);
+
+ aesni_cleanup ();
+ aesni_cleanup_2_7 ();
+
+ return 0;
+}
+
+
+size_t ASM_FUNC_ATTR
+_gcry_aes_aesni_ocb_crypt(gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+ if (encrypt)
+ return aesni_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks);
+ else
+ return aesni_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks);
+}
+
+
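+/* Authenticate NBLOCKS blocks of additional data.  The AAD Offset is kept
+ * in xmm5 and the AAD Sum in xmm6; blocks are absorbed singly until the
+ * counter reaches a multiple of four, then eight and four at a time, and
+ * finally singly again.  */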
+size_t ASM_FUNC_ATTR
+_gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ const unsigned char *abuf = abuf_arg;
+ u64 n = c->u_mode.ocb.aad_nblocks;
+ const unsigned char *l;
+ aesni_prepare_2_7_variable;
+
+ aesni_prepare ();
+ aesni_prepare_2_7 ();
+
+ /* Preload Offset and Sum */
+ asm volatile ("movdqu %[iv], %%xmm5\n\t"
+ "movdqu %[ctr], %%xmm6\n\t"
+ : /* No output */
+ : [iv] "m" (*c->u_mode.ocb.aad_offset),
+ [ctr] "m" (*c->u_mode.ocb.aad_sum)
+ : "memory" );
+
+ for ( ;nblocks && n % 4; nblocks-- )
+ {
+ l = aes_ocb_get_l(c, ++n);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ asm volatile ("movdqu %[l], %%xmm1\n\t"
+ "movdqu %[abuf], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ :
+ : [l] "m" (*l),
+ [abuf] "m" (*abuf)
+ : "memory" );
+
+ do_aesni_enc (ctx);
+
+ asm volatile ("pxor %%xmm0, %%xmm6\n\t"
+ :
+ :
+ : "memory" );
+
+ abuf += BLOCKSIZE;
+ }
+
+#ifdef __x86_64__
+ if (nblocks >= 8)
+ {
+ aesni_prepare_8_15_variable;
+
+ aesni_prepare_8_15();
+
+ asm volatile ("movdqu %[l0], %%xmm7\n\t"
+ "movdqu %[l0l1], %%xmm12\n\t"
+ "movdqu %[l1], %%xmm13\n\t"
+ :
+ : [l0] "m" (*c->u_mode.ocb.L[0]),
+ [l0l1] "m" (*c->u_mode.ocb.L0L1),
+ [l1] "m" (*c->u_mode.ocb.L[1])
+ : "memory" );
+
+ for ( ;nblocks >= 8 ; nblocks -= 8 )
+ {
+ n += 4;
+ l = aes_ocb_get_l(c, n);
+
+ asm volatile ("movdqu %[l3], %%xmm0\n\t"
+ "pxor %%xmm13, %%xmm0\n\t"
+ :
+ : [l3] "m" (*l)
+ : "memory" );
+
+ n += 4;
+ l = aes_ocb_get_l(c, n);
+
+ asm volatile ("movdqu %[l7], %%xmm14\n\t"
+ "pxor %%xmm13, %%xmm14\n\t"
+ :
+ : [l7] "m" (*l)
+ : "memory" );
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ asm volatile ("movdqu %[abuf0], %%xmm1\n\t"
+ "movdqu %[abuf1], %%xmm2\n\t"
+ "movdqu %[abuf2], %%xmm3\n\t"
+ "movdqu %[abuf3], %%xmm4\n\t"
+ :
+ : [abuf0] "m" (*(abuf + 0 * BLOCKSIZE)),
+ [abuf1] "m" (*(abuf + 1 * BLOCKSIZE)),
+ [abuf2] "m" (*(abuf + 2 * BLOCKSIZE)),
+ [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[abuf4], %%xmm8\n\t"
+ "movdqu %[abuf5], %%xmm9\n\t"
+ "movdqu %[abuf6], %%xmm10\n\t"
+ "movdqu %[abuf7], %%xmm11\n\t"
+ :
+ : [abuf4] "m" (*(abuf + 4 * BLOCKSIZE)),
+ [abuf5] "m" (*(abuf + 5 * BLOCKSIZE)),
+ [abuf6] "m" (*(abuf + 6 * BLOCKSIZE)),
+ [abuf7] "m" (*(abuf + 7 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("pxor %%xmm7, %%xmm1\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+
+ "pxor %%xmm12, %%xmm2\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+
+ "pxor %%xmm13, %%xmm3\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+
+ "pxor %%xmm0, %%xmm5\n\t"
+ "movdqa (%[key]), %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+
+ "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */
+ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */
+ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */
+ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */
+
+ "pxor %%xmm7, %%xmm8\n\t"
+ "pxor %%xmm5, %%xmm8\n\t"
+
+ "pxor %%xmm12, %%xmm9\n\t"
+ "pxor %%xmm5, %%xmm9\n\t"
+
+ "pxor %%xmm13, %%xmm10\n\t"
+ "pxor %%xmm5, %%xmm10\n\t"
+
+ "pxor %%xmm14, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm11\n\t"
+
+ "pxor %%xmm0, %%xmm8\n\t" /* xmm8 ^= key[0] */
+ "pxor %%xmm0, %%xmm9\n\t" /* xmm9 ^= key[0] */
+ "pxor %%xmm0, %%xmm10\n\t" /* xmm10 ^= key[0] */
+ "pxor %%xmm0, %%xmm11\n\t" /* xmm11 ^= key[0] */
+ :
+ : [key] "r" (ctx->keyschenc)
+ : "memory" );
+
+ do_aesni_enc_vec8 (ctx);
+
+ asm volatile (
+ "aesenclast %%xmm0, %%xmm1\n\t"
+ "aesenclast %%xmm0, %%xmm2\n\t"
+ "aesenclast %%xmm0, %%xmm3\n\t"
+ "aesenclast %%xmm0, %%xmm4\n\t"
+ "aesenclast %%xmm0, %%xmm8\n\t"
+ "aesenclast %%xmm0, %%xmm9\n\t"
+ "aesenclast %%xmm0, %%xmm10\n\t"
+ "aesenclast %%xmm0, %%xmm11\n\t"
+ "pxor %%xmm2, %%xmm1\n\t"
+ "pxor %%xmm3, %%xmm1\n\t"
+ "pxor %%xmm4, %%xmm1\n\t"
+ "pxor %%xmm8, %%xmm1\n\t"
+ "pxor %%xmm9, %%xmm6\n\t"
+ "pxor %%xmm10, %%xmm6\n\t"
+ "pxor %%xmm11, %%xmm6\n\t"
+ "pxor %%xmm1, %%xmm6\n\t"
+ :
+ :
+ : "memory" );
+
+ abuf += 8*BLOCKSIZE;
+ }
+
+ aesni_cleanup_8_15();
+ }
+#endif
+
+ for ( ;nblocks >= 4 ; nblocks -= 4 )
+ {
+ n += 4;
+ l = aes_ocb_get_l(c, n);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ asm volatile ("movdqu %[l0], %%xmm0\n\t"
+ "movdqu %[abuf0], %%xmm1\n\t"
+ "movdqu %[l0l1], %%xmm3\n\t"
+ :
+ : [l0] "m" (*c->u_mode.ocb.L[0]),
+ [l0l1] "m" (*c->u_mode.ocb.L0L1),
+ [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l1], %%xmm4\n\t"
+ "movdqu %[l3], %%xmm7\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t"
+ :
+ : [l1] "m" (*c->u_mode.ocb.L[1]),
+ [l3] "m" (*l)
+ : "memory" );
+ asm volatile ("movdqu %[abuf1], %%xmm2\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "pxor %%xmm3, %%xmm2\n\t"
+ :
+ : [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
+ "movdqu %[abuf2], %%xmm3\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm3\n\t"
+ :
+ : [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("pxor %%xmm7, %%xmm5\n\t"
+ "pxor %%xmm4, %%xmm5\n\t"
+ "movdqu %[abuf3], %%xmm4\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ :
+ : [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
+ : "memory" );
+
+ do_aesni_enc_vec4 (ctx);
+
+ asm volatile ("pxor %%xmm1, %%xmm6\n\t"
+ "pxor %%xmm2, %%xmm6\n\t"
+ "pxor %%xmm3, %%xmm6\n\t"
+ "pxor %%xmm4, %%xmm6\n\t"
+ :
+ :
+ : "memory" );
+
+ abuf += 4*BLOCKSIZE;
+ }
+
+ for ( ;nblocks; nblocks-- )
+ {
+ l = aes_ocb_get_l(c, ++n);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ asm volatile ("movdqu %[l], %%xmm1\n\t"
+ "movdqu %[abuf], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ :
+ : [l] "m" (*l),
+ [abuf] "m" (*abuf)
+ : "memory" );
+
+ do_aesni_enc (ctx);
+
+ asm volatile ("pxor %%xmm0, %%xmm6\n\t"
+ :
+ :
+ : "memory" );
+
+ abuf += BLOCKSIZE;
+ }
+
+ c->u_mode.ocb.aad_nblocks = n;
+ asm volatile ("movdqu %%xmm5, %[iv]\n\t"
+ "movdqu %%xmm6, %[ctr]\n\t"
+ : [iv] "=m" (*c->u_mode.ocb.aad_offset),
+ [ctr] "=m" (*c->u_mode.ocb.aad_sum)
+ :
+ : "memory" );
+
+ aesni_cleanup ();
+ aesni_cleanup_2_7 ();
+
+ return 0;
+}
+
+
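+/* Masks used when doubling the XTS tweak.  The pshufd/psrad/pand/pxor
+ * sequences below compute, in effect,
+ *
+ *   carry  = tweak >> 127;
+ *   tweak  = tweak << 1;
+ *   tweak ^= carry ? 0x87 : 0;
+ *
+ * i.e. multiplication of the tweak by x in GF(2^128) with the reduction
+ * polynomial x^128 + x^7 + x^2 + x + 1.  The low quadword of the constant
+ * supplies the 0x87 reduction value, the high quadword the 0x01 that
+ * carries bit 63 into bit 64.  */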
+static const u64 xts_gfmul_const[16] __attribute__ ((aligned (16))) =
+ { 0x87, 0x01 };
+
+
+static void ASM_FUNC_ATTR
+_gcry_aes_aesni_xts_enc (RIJNDAEL_context *ctx, unsigned char *tweak,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ aesni_prepare_2_7_variable;
+
+ aesni_prepare ();
+ aesni_prepare_2_7 ();
+
+ /* Preload Tweak */
+ asm volatile ("movdqu %[tweak], %%xmm5\n\t"
+ "movdqa %[gfmul], %%xmm6\n\t"
+ :
+ : [tweak] "m" (*tweak),
+ [gfmul] "m" (*xts_gfmul_const)
+ : "memory" );
+
+ for ( ;nblocks >= 4; nblocks -= 4 )
+ {
+ asm volatile ("pshufd $0x13, %%xmm5, %%xmm4\n\t"
+ "movdqu %[inbuf0], %%xmm1\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu %%xmm5, %[outbuf0]\n\t"
+
+ "movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %%xmm4, %%xmm4\n\t"
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf0] "=m" (*(outbuf + 0 * 16))
+ : [inbuf0] "m" (*(inbuf + 0 * 16))
+ : "memory" );
+
+ asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "movdqu %%xmm5, %[outbuf1]\n\t"
+
+ "movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %%xmm4, %%xmm4\n\t"
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf1] "=m" (*(outbuf + 1 * 16))
+ : [inbuf1] "m" (*(inbuf + 1 * 16))
+ : "memory" );
+
+ asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqu %%xmm5, %[outbuf2]\n\t"
+
+ "movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %%xmm4, %%xmm4\n\t"
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf2] "=m" (*(outbuf + 2 * 16))
+ : [inbuf2] "m" (*(inbuf + 2 * 16))
+ : "memory" );
+
+ asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
+ "movdqu %[inbuf3], %%xmm4\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqu %%xmm5, %[outbuf3]\n\t"
+
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf3] "=m" (*(outbuf + 3 * 16))
+ : [inbuf3] "m" (*(inbuf + 3 * 16))
+ : "memory" );
+
+ do_aesni_enc_vec4 (ctx);
+
+ asm volatile ("movdqu %[outbuf0], %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t"
+ "movdqu %[outbuf1], %%xmm0\n\t"
+ "movdqu %%xmm1, %[outbuf0]\n\t"
+ "movdqu %[outbuf2], %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm2\n\t"
+ "movdqu %[outbuf3], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm3\n\t"
+ "pxor %%xmm0, %%xmm4\n\t"
+ "movdqu %%xmm2, %[outbuf1]\n\t"
+ "movdqu %%xmm3, %[outbuf2]\n\t"
+ "movdqu %%xmm4, %[outbuf3]\n\t"
+ : [outbuf0] "+m" (*(outbuf + 0 * 16)),
+ [outbuf1] "+m" (*(outbuf + 1 * 16)),
+ [outbuf2] "+m" (*(outbuf + 2 * 16)),
+ [outbuf3] "+m" (*(outbuf + 3 * 16))
+ :
+ : "memory" );
+
+ outbuf += BLOCKSIZE * 4;
+ inbuf += BLOCKSIZE * 4;
+ }
+
+ for ( ;nblocks; nblocks-- )
+ {
+ asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "movdqa %%xmm5, %%xmm4\n\t"
+
+ "pshufd $0x13, %%xmm5, %%xmm1\n\t"
+ "psrad $31, %%xmm1\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm5\n\t"
+ :
+ : [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_aesni_enc (ctx);
+
+ asm volatile ("pxor %%xmm4, %%xmm0\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ asm volatile ("movdqu %%xmm5, %[tweak]\n\t"
+ : [tweak] "=m" (*tweak)
+ :
+ : "memory" );
+
+ aesni_cleanup ();
+ aesni_cleanup_2_7 ();
+}
+
+
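+/* XTS decryption; uses the same tweak schedule as _gcry_aes_aesni_xts_enc
+ * but runs the inverse cipher, preparing the decryption key schedule on
+ * first use.  */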
+static void ASM_FUNC_ATTR
+_gcry_aes_aesni_xts_dec (RIJNDAEL_context *ctx, unsigned char *tweak,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ aesni_prepare_2_7_variable;
+
+ aesni_prepare ();
+ aesni_prepare_2_7 ();
+
+ if ( !ctx->decryption_prepared )
+ {
+ do_aesni_prepare_decryption ( ctx );
+ ctx->decryption_prepared = 1;
+ }
+
+ /* Preload Tweak */
+ asm volatile ("movdqu %[tweak], %%xmm5\n\t"
+ "movdqa %[gfmul], %%xmm6\n\t"
+ :
+ : [tweak] "m" (*tweak),
+ [gfmul] "m" (*xts_gfmul_const)
+ : "memory" );
+
+ for ( ;nblocks >= 4; nblocks -= 4 )
+ {
+ asm volatile ("pshufd $0x13, %%xmm5, %%xmm4\n\t"
+ "movdqu %[inbuf0], %%xmm1\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu %%xmm5, %[outbuf0]\n\t"
+
+ "movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %%xmm4, %%xmm4\n\t"
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf0] "=m" (*(outbuf + 0 * 16))
+ : [inbuf0] "m" (*(inbuf + 0 * 16))
+ : "memory" );
+
+ asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "movdqu %%xmm5, %[outbuf1]\n\t"
+
+ "movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %%xmm4, %%xmm4\n\t"
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf1] "=m" (*(outbuf + 1 * 16))
+ : [inbuf1] "m" (*(inbuf + 1 * 16))
+ : "memory" );
+
+ asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqu %%xmm5, %[outbuf2]\n\t"
+
+ "movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %%xmm4, %%xmm4\n\t"
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf2] "=m" (*(outbuf + 2 * 16))
+ : [inbuf2] "m" (*(inbuf + 2 * 16))
+ : "memory" );
+
+ asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
+ "movdqu %[inbuf3], %%xmm4\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqu %%xmm5, %[outbuf3]\n\t"
+
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf3] "=m" (*(outbuf + 3 * 16))
+ : [inbuf3] "m" (*(inbuf + 3 * 16))
+ : "memory" );
+
+ do_aesni_dec_vec4 (ctx);
+
+ asm volatile ("movdqu %[outbuf0], %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t"
+ "movdqu %[outbuf1], %%xmm0\n\t"
+ "movdqu %%xmm1, %[outbuf0]\n\t"
+ "movdqu %[outbuf2], %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm2\n\t"
+ "movdqu %[outbuf3], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm3\n\t"
+ "pxor %%xmm0, %%xmm4\n\t"
+ "movdqu %%xmm2, %[outbuf1]\n\t"
+ "movdqu %%xmm3, %[outbuf2]\n\t"
+ "movdqu %%xmm4, %[outbuf3]\n\t"
+ : [outbuf0] "+m" (*(outbuf + 0 * 16)),
+ [outbuf1] "+m" (*(outbuf + 1 * 16)),
+ [outbuf2] "+m" (*(outbuf + 2 * 16)),
+ [outbuf3] "+m" (*(outbuf + 3 * 16))
+ :
+ : "memory" );
+
+ outbuf += BLOCKSIZE * 4;
+ inbuf += BLOCKSIZE * 4;
+ }
+
+ for ( ;nblocks; nblocks-- )
+ {
+ asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "movdqa %%xmm5, %%xmm4\n\t"
+
+ "pshufd $0x13, %%xmm5, %%xmm1\n\t"
+ "psrad $31, %%xmm1\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm5\n\t"
+ :
+ : [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_aesni_dec (ctx);
+
+ asm volatile ("pxor %%xmm4, %%xmm0\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ asm volatile ("movdqu %%xmm5, %[tweak]\n\t"
+ : [tweak] "=m" (*tweak)
+ :
+ : "memory" );
+
+ aesni_cleanup ();
+ aesni_cleanup_2_7 ();
+}
+
+
+void ASM_FUNC_ATTR
+_gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks, int encrypt)
+{
+ if (encrypt)
+ _gcry_aes_aesni_xts_enc(ctx, tweak, outbuf, inbuf, nblocks);
+ else
+ _gcry_aes_aesni_xts_dec(ctx, tweak, outbuf, inbuf, nblocks);
+}
+
+#if __clang__
+# pragma clang attribute pop
+#endif
+
+#endif /* USE_AESNI */
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-amd64.S b/comm/third_party/libgcrypt/cipher/rijndael-amd64.S
new file mode 100644
index 0000000000..3dcaa856b7
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael-amd64.S
@@ -0,0 +1,477 @@
+/* rijndael-amd64.S - AMD64 assembly implementation of AES cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_AES)
+
+#include "asm-common-amd64.h"
+
+.text
+
+/* table macros */
+#define E0 (0)
+#define Es0 (1)
+#define Esize 4
+#define Essize 4
+
+#define D0 (0)
+#define Ds0 (4 * 256)
+#define Dsize 4
+#define Dssize 1
+
+/* register macros */
+#define CTX %rdi
+#define RTAB %r12
+
+#define RA %rax
+#define RB %rbx
+#define RC %rcx
+#define RD %rdx
+
+#define RAd %eax
+#define RBd %ebx
+#define RCd %ecx
+#define RDd %edx
+
+#define RAbl %al
+#define RBbl %bl
+#define RCbl %cl
+#define RDbl %dl
+
+#define RAbh %ah
+#define RBbh %bh
+#define RCbh %ch
+#define RDbh %dh
+
+#define RNA %r8
+#define RNB %r9
+#define RNC %r10
+#define RND %r11
+
+#define RNAd %r8d
+#define RNBd %r9d
+#define RNCd %r10d
+#define RNDd %r11d
+
+#define RT0 %rbp
+#define RT1 %rsi
+
+#define RT0d %ebp
+#define RT1d %esi
+
+/* helper macros */
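+/* The do16bit* macros look up two round-table entries indexed by the low
+ * byte and the second byte of `source' and merge them into dest1/dest2
+ * with `op'; the _shr variants also shift `source' right to expose the
+ * next byte pair, and the last_* variants do byte-wide lookups for the
+ * final round.  */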
+#define do16bit(op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \
+ movzbl source ## bl, t0 ## d; \
+ movzbl source ## bh, t1 ## d; \
+ op ## l table1(RTAB,t0,tablemul), dest1 ## d; \
+ op ## l table2(RTAB,t1,tablemul), dest2 ## d;
+
+#define do16bit_shr(shf, op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \
+ movzbl source ## bl, t0 ## d; \
+ movzbl source ## bh, t1 ## d; \
+ shrl $(shf), source ## d; \
+ op ## l table1(RTAB,t0,tablemul), dest1 ## d; \
+ op ## l table2(RTAB,t1,tablemul), dest2 ## d;
+
+#define last_do16bit(op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \
+ movzbl source ## bl, t0 ## d; \
+ movzbl source ## bh, t1 ## d; \
+ movzbl table1(RTAB,t0,tablemul), t0 ## d; \
+ movzbl table2(RTAB,t1,tablemul), t1 ## d; \
+ op ## l t0 ## d, dest1 ## d; \
+ op ## l t1 ## d, dest2 ## d;
+
+#define last_do16bit_shr(shf, op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \
+ movzbl source ## bl, t0 ## d; \
+ movzbl source ## bh, t1 ## d; \
+ shrl $(shf), source ## d; \
+ movzbl table1(RTAB,t0,tablemul), t0 ## d; \
+ movzbl table2(RTAB,t1,tablemul), t1 ## d; \
+ op ## l t0 ## d, dest1 ## d; \
+ op ## l t1 ## d, dest2 ## d;
+
+/***********************************************************************
+ * AMD64 assembly implementation of the AES cipher
+ ***********************************************************************/
+#define addroundkey(round, ra, rb, rc, rd) \
+ xorl (((round) * 16) + 0 * 4)(CTX), ra ## d; \
+ xorl (((round) * 16) + 1 * 4)(CTX), rb ## d; \
+ xorl (((round) * 16) + 2 * 4)(CTX), rc ## d; \
+ xorl (((round) * 16) + 3 * 4)(CTX), rd ## d;
+
+#define do_encround(next_r) \
+ do16bit_shr(16, mov, RA, Esize, E0, RNA, E0, RND, RT0, RT1); \
+ do16bit( mov, RA, Esize, E0, RNC, E0, RNB, RT0, RT1); \
+ movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \
+ roll $8, RNDd; \
+ xorl RNAd, RAd; \
+ roll $8, RNCd; \
+ roll $8, RNBd; \
+ roll $8, RAd; \
+ \
+ do16bit_shr(16, xor, RD, Esize, E0, RND, E0, RNC, RT0, RT1); \
+ do16bit( xor, RD, Esize, E0, RNB, E0, RA, RT0, RT1); \
+ movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \
+ roll $8, RNCd; \
+ xorl RNDd, RDd; \
+ roll $8, RNBd; \
+ roll $8, RAd; \
+ roll $8, RDd; \
+ \
+ do16bit_shr(16, xor, RC, Esize, E0, RNC, E0, RNB, RT0, RT1); \
+ do16bit( xor, RC, Esize, E0, RA, E0, RD, RT0, RT1); \
+ movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \
+ roll $8, RNBd; \
+ xorl RNCd, RCd; \
+ roll $8, RAd; \
+ roll $8, RDd; \
+ roll $8, RCd; \
+ \
+ do16bit_shr(16, xor, RB, Esize, E0, RNB, E0, RA, RT0, RT1); \
+ do16bit( xor, RB, Esize, E0, RD, E0, RC, RT0, RT1); \
+ movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \
+ roll $8, RAd; \
+ xorl RNBd, RBd; \
+ roll $16, RDd; \
+ roll $24, RCd;
+
+#define do_lastencround(next_r) \
+ do16bit_shr(16, movzb, RA, Essize, Es0, RNA, Es0, RND, RT0, RT1); \
+ do16bit( movzb, RA, Essize, Es0, RNC, Es0, RNB, RT0, RT1); \
+ movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \
+ roll $8, RNDd; \
+ xorl RNAd, RAd; \
+ roll $8, RNCd; \
+ roll $8, RNBd; \
+ roll $8, RAd; \
+ \
+ last_do16bit_shr(16, xor, RD, Essize, Es0, RND, Es0, RNC, RT0, RT1); \
+ last_do16bit( xor, RD, Essize, Es0, RNB, Es0, RA, RT0, RT1); \
+ movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \
+ roll $8, RNCd; \
+ xorl RNDd, RDd; \
+ roll $8, RNBd; \
+ roll $8, RAd; \
+ roll $8, RDd; \
+ \
+ last_do16bit_shr(16, xor, RC, Essize, Es0, RNC, Es0, RNB, RT0, RT1); \
+ last_do16bit( xor, RC, Essize, Es0, RA, Es0, RD, RT0, RT1); \
+ movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \
+ roll $8, RNBd; \
+ xorl RNCd, RCd; \
+ roll $8, RAd; \
+ roll $8, RDd; \
+ roll $8, RCd; \
+ \
+ last_do16bit_shr(16, xor, RB, Essize, Es0, RNB, Es0, RA, RT0, RT1); \
+ last_do16bit( xor, RB, Essize, Es0, RD, Es0, RC, RT0, RT1); \
+ movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \
+ roll $8, RAd; \
+ xorl RNBd, RBd; \
+ roll $16, RDd; \
+ roll $24, RCd;
+
+#define firstencround(round) \
+ addroundkey(round, RA, RB, RC, RD); \
+ do_encround((round) + 1);
+
+#define encround(round) \
+ do_encround((round) + 1);
+
+#define lastencround(round) \
+ do_lastencround((round) + 1);
+
+.align 8
+.globl _gcry_aes_amd64_encrypt_block
+ELF(.type _gcry_aes_amd64_encrypt_block,@function;)
+
+_gcry_aes_amd64_encrypt_block:
+ /* input:
+ * %rdi: keysched, CTX
+ * %rsi: dst
+ * %rdx: src
+	 *	%ecx: number of rounds (10, 12 or 14)
+ * %r8: encryption tables
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_5
+
+ subq $(5 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(5 * 8);
+ movq %rsi, (0 * 8)(%rsp);
+ movl %ecx, (1 * 8)(%rsp);
+ movq %rbp, (2 * 8)(%rsp);
+ movq %rbx, (3 * 8)(%rsp);
+ movq %r12, (4 * 8)(%rsp);
+ CFI_REL_OFFSET(%rbp, 2 * 8);
+ CFI_REL_OFFSET(%rbx, 3 * 8);
+ CFI_REL_OFFSET(%r12, 4 * 8);
+
+ leaq (%r8), RTAB;
+
+ /* read input block */
+ movl 0 * 4(%rdx), RAd;
+ movl 1 * 4(%rdx), RBd;
+ movl 2 * 4(%rdx), RCd;
+ movl 3 * 4(%rdx), RDd;
+
+ firstencround(0);
+ encround(1);
+ encround(2);
+ encround(3);
+ encround(4);
+ encround(5);
+ encround(6);
+ encround(7);
+ encround(8);
+ cmpl $12, (1 * 8)(%rsp);
+ jnb .Lenc_not_128;
+ lastencround(9);
+
+.align 4
+.Lenc_done:
+ /* write output block */
+ movq (0 * 8)(%rsp), %rsi;
+ movl RAd, 0 * 4(%rsi);
+ movl RBd, 1 * 4(%rsi);
+ movl RCd, 2 * 4(%rsi);
+ movl RDd, 3 * 4(%rsi);
+
+ CFI_REMEMBER_STATE();
+
+ movq (4 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %rbx;
+ movq (2 * 8)(%rsp), %rbp;
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%rbx);
+ CFI_RESTORE(%rbp);
+ addq $(5 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-5 * 8);
+
+ movl $(6 * 8), %eax;
+
+ EXIT_SYSV_FUNC
+ ret;
+
+ CFI_RESTORE_STATE();
+.align 4
+.Lenc_not_128:
+ je .Lenc_192
+
+ encround(9);
+ encround(10);
+ encround(11);
+ encround(12);
+ lastencround(13);
+
+ jmp .Lenc_done;
+
+.align 4
+.Lenc_192:
+ encround(9);
+ encround(10);
+ lastencround(11);
+
+ jmp .Lenc_done;
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_amd64_encrypt_block,.-_gcry_aes_amd64_encrypt_block;)
+
+#define do_decround(next_r) \
+ do16bit_shr(16, mov, RA, Dsize, D0, RNA, D0, RNB, RT0, RT1); \
+ do16bit( mov, RA, Dsize, D0, RNC, D0, RND, RT0, RT1); \
+ movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \
+ roll $8, RNBd; \
+ xorl RNAd, RAd; \
+ roll $8, RNCd; \
+ roll $8, RNDd; \
+ roll $8, RAd; \
+ \
+ do16bit_shr(16, xor, RB, Dsize, D0, RNB, D0, RNC, RT0, RT1); \
+ do16bit( xor, RB, Dsize, D0, RND, D0, RA, RT0, RT1); \
+ movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \
+ roll $8, RNCd; \
+ xorl RNBd, RBd; \
+ roll $8, RNDd; \
+ roll $8, RAd; \
+ roll $8, RBd; \
+ \
+ do16bit_shr(16, xor, RC, Dsize, D0, RNC, D0, RND, RT0, RT1); \
+ do16bit( xor, RC, Dsize, D0, RA, D0, RB, RT0, RT1); \
+ movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \
+ roll $8, RNDd; \
+ xorl RNCd, RCd; \
+ roll $8, RAd; \
+ roll $8, RBd; \
+ roll $8, RCd; \
+ \
+ do16bit_shr(16, xor, RD, Dsize, D0, RND, D0, RA, RT0, RT1); \
+ do16bit( xor, RD, Dsize, D0, RB, D0, RC, RT0, RT1); \
+ movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \
+ roll $8, RAd; \
+ xorl RNDd, RDd; \
+ roll $16, RBd; \
+ roll $24, RCd;
+
+#define do_lastdecround(next_r) \
+ do16bit_shr(16, movzb, RA, Dssize, Ds0, RNA, Ds0, RNB, RT0, RT1); \
+ do16bit( movzb, RA, Dssize, Ds0, RNC, Ds0, RND, RT0, RT1); \
+ movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \
+ roll $8, RNBd; \
+ xorl RNAd, RAd; \
+ roll $8, RNCd; \
+ roll $8, RNDd; \
+ roll $8, RAd; \
+ \
+ last_do16bit_shr(16, xor, RB, Dssize, Ds0, RNB, Ds0, RNC, RT0, RT1); \
+ last_do16bit( xor, RB, Dssize, Ds0, RND, Ds0, RA, RT0, RT1); \
+ movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \
+ roll $8, RNCd; \
+ xorl RNBd, RBd; \
+ roll $8, RNDd; \
+ roll $8, RAd; \
+ roll $8, RBd; \
+ \
+ last_do16bit_shr(16, xor, RC, Dssize, Ds0, RNC, Ds0, RND, RT0, RT1); \
+ last_do16bit( xor, RC, Dssize, Ds0, RA, Ds0, RB, RT0, RT1); \
+ movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \
+ roll $8, RNDd; \
+ xorl RNCd, RCd; \
+ roll $8, RAd; \
+ roll $8, RBd; \
+ roll $8, RCd; \
+ \
+ last_do16bit_shr(16, xor, RD, Dssize, Ds0, RND, Ds0, RA, RT0, RT1); \
+ last_do16bit( xor, RD, Dssize, Ds0, RB, Ds0, RC, RT0, RT1); \
+ movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \
+ roll $8, RAd; \
+ xorl RNDd, RDd; \
+ roll $16, RBd; \
+ roll $24, RCd;
+
+#define firstdecround(round) \
+ addroundkey((round + 1), RA, RB, RC, RD); \
+ do_decround(round);
+
+#define decround(round) \
+ do_decround(round);
+
+#define lastdecround(round) \
+ do_lastdecround(round);
+
+.align 8
+.globl _gcry_aes_amd64_decrypt_block
+ELF(.type _gcry_aes_amd64_decrypt_block,@function;)
+
+_gcry_aes_amd64_decrypt_block:
+ /* input:
+ * %rdi: keysched, CTX
+ * %rsi: dst
+ * %rdx: src
+	 *	%ecx: number of rounds (10, 12 or 14)
+ * %r8: decryption tables
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_5
+
+ subq $(5 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(5 * 8);
+ movq %rsi, (0 * 8)(%rsp);
+ movl %ecx, (1 * 8)(%rsp);
+ movq %rbp, (2 * 8)(%rsp);
+ movq %rbx, (3 * 8)(%rsp);
+ movq %r12, (4 * 8)(%rsp);
+ CFI_REL_OFFSET(%rbp, 2 * 8);
+ CFI_REL_OFFSET(%rbx, 3 * 8);
+ CFI_REL_OFFSET(%r12, 4 * 8);
+
+ leaq (%r8), RTAB;
+
+ /* read input block */
+ movl 0 * 4(%rdx), RAd;
+ movl 1 * 4(%rdx), RBd;
+ movl 2 * 4(%rdx), RCd;
+ movl 3 * 4(%rdx), RDd;
+
+ cmpl $12, (1 * 8)(%rsp);
+ jnb .Ldec_256;
+
+ firstdecround(9);
+.align 4
+.Ldec_tail:
+ decround(8);
+ decround(7);
+ decround(6);
+ decround(5);
+ decround(4);
+ decround(3);
+ decround(2);
+ decround(1);
+ lastdecround(0);
+
+ /* write output block */
+ movq (0 * 8)(%rsp), %rsi;
+ movl RAd, 0 * 4(%rsi);
+ movl RBd, 1 * 4(%rsi);
+ movl RCd, 2 * 4(%rsi);
+ movl RDd, 3 * 4(%rsi);
+
+ CFI_REMEMBER_STATE();
+
+ movq (4 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %rbx;
+ movq (2 * 8)(%rsp), %rbp;
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%rbx);
+ CFI_RESTORE(%rbp);
+ addq $(5 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-5 * 8);
+
+ movl $(6 * 8), %eax;
+
+ EXIT_SYSV_FUNC
+ ret;
+
+ CFI_RESTORE_STATE();
+.align 4
+.Ldec_256:
+ je .Ldec_192;
+
+ firstdecround(13);
+ decround(12);
+ decround(11);
+ decround(10);
+ decround(9);
+
+ jmp .Ldec_tail;
+
+.align 4
+.Ldec_192:
+ firstdecround(11);
+ decround(10);
+ decround(9);
+
+ jmp .Ldec_tail;
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_amd64_decrypt_block,.-_gcry_aes_amd64_decrypt_block;)
+
+#endif /*USE_AES*/
+#endif /*__x86_64*/
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-arm.S b/comm/third_party/libgcrypt/cipher/rijndael-arm.S
new file mode 100644
index 0000000000..e680c817b2
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael-arm.S
@@ -0,0 +1,581 @@
+/* rijndael-arm.S - ARM assembly implementation of AES cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+/* register macros */
+#define CTX %r0
+#define RTAB %lr
+#define RMASK %ip
+
+#define RA %r4
+#define RB %r5
+#define RC %r6
+#define RD %r7
+
+#define RNA %r8
+#define RNB %r9
+#define RNC %r10
+#define RND %r11
+
+#define RT0 %r1
+#define RT1 %r2
+#define RT2 %r3
+
+/* helper macros */
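+/* ldr_unaligned_le/str_unaligned_le assemble or split a 32-bit
+ * little-endian word one byte at a time, for source or destination
+ * addresses that are not 4-byte aligned.  */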
+#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
+ ldrb rout, [rsrc, #((offs) + 0)]; \
+ ldrb rtmp, [rsrc, #((offs) + 1)]; \
+ orr rout, rout, rtmp, lsl #8; \
+ ldrb rtmp, [rsrc, #((offs) + 2)]; \
+ orr rout, rout, rtmp, lsl #16; \
+ ldrb rtmp, [rsrc, #((offs) + 3)]; \
+ orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
+ mov rtmp0, rin, lsr #8; \
+ strb rin, [rdst, #((offs) + 0)]; \
+ mov rtmp1, rin, lsr #16; \
+ strb rtmp0, [rdst, #((offs) + 1)]; \
+ mov rtmp0, rin, lsr #24; \
+ strb rtmp1, [rdst, #((offs) + 2)]; \
+ strb rtmp0, [rdst, #((offs) + 3)];
+
+/***********************************************************************
+ * ARM assembly implementation of the AES cipher
+ ***********************************************************************/
+#define preload_first_key(round, ra) \
+ ldr ra, [CTX, #(((round) * 16) + 0 * 4)];
+
+#define dummy(round, ra) /* nothing */
+
+#define addroundkey(ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+ ldm CTX, {rna, rnb, rnc, rnd}; \
+ eor ra, rna; \
+ eor rb, rnb; \
+ eor rc, rnc; \
+ preload_key(1, rna); \
+ eor rd, rnd;
+
+#define do_encround(next_r, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+ ldr rnb, [CTX, #(((next_r) * 16) + 1 * 4)]; \
+ \
+ and RT0, RMASK, ra, lsl#2; \
+ ldr rnc, [CTX, #(((next_r) * 16) + 2 * 4)]; \
+ and RT1, RMASK, ra, lsr#(8 - 2); \
+ ldr rnd, [CTX, #(((next_r) * 16) + 3 * 4)]; \
+ and RT2, RMASK, ra, lsr#(16 - 2); \
+ ldr RT0, [RTAB, RT0]; \
+ and ra, RMASK, ra, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, RT1]; \
+ eor rna, rna, RT0; \
+ ldr RT2, [RTAB, RT2]; \
+ and RT0, RMASK, rd, lsl#2; \
+ ldr ra, [RTAB, ra]; \
+ \
+ eor rnd, rnd, RT1, ror #24; \
+ and RT1, RMASK, rd, lsr#(8 - 2); \
+ eor rnc, rnc, RT2, ror #16; \
+ and RT2, RMASK, rd, lsr#(16 - 2); \
+ eor rnb, rnb, ra, ror #8; \
+ ldr RT0, [RTAB, RT0]; \
+ and rd, RMASK, rd, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, RT1]; \
+ eor rnd, rnd, RT0; \
+ ldr RT2, [RTAB, RT2]; \
+ and RT0, RMASK, rc, lsl#2; \
+ ldr rd, [RTAB, rd]; \
+ \
+ eor rnc, rnc, RT1, ror #24; \
+ and RT1, RMASK, rc, lsr#(8 - 2); \
+ eor rnb, rnb, RT2, ror #16; \
+ and RT2, RMASK, rc, lsr#(16 - 2); \
+ eor rna, rna, rd, ror #8; \
+ ldr RT0, [RTAB, RT0]; \
+ and rc, RMASK, rc, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, RT1]; \
+ eor rnc, rnc, RT0; \
+ ldr RT2, [RTAB, RT2]; \
+ and RT0, RMASK, rb, lsl#2; \
+ ldr rc, [RTAB, rc]; \
+ \
+ eor rnb, rnb, RT1, ror #24; \
+ and RT1, RMASK, rb, lsr#(8 - 2); \
+ eor rna, rna, RT2, ror #16; \
+ and RT2, RMASK, rb, lsr#(16 - 2); \
+ eor rnd, rnd, rc, ror #8; \
+ ldr RT0, [RTAB, RT0]; \
+ and rb, RMASK, rb, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, RT1]; \
+ eor rnb, rnb, RT0; \
+ ldr RT2, [RTAB, RT2]; \
+ eor rna, rna, RT1, ror #24; \
+ ldr rb, [RTAB, rb]; \
+ \
+ eor rnd, rnd, RT2, ror #16; \
+ preload_key((next_r) + 1, ra); \
+ eor rnc, rnc, rb, ror #8;
+
+#define do_lastencround(ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ and RT0, RMASK, ra, lsl#2; \
+ and RT1, RMASK, ra, lsr#(8 - 2); \
+ and RT2, RMASK, ra, lsr#(16 - 2); \
+ ldrb rna, [RTAB, RT0]; \
+ and ra, RMASK, ra, lsr#(24 - 2); \
+ ldrb rnd, [RTAB, RT1]; \
+ and RT0, RMASK, rd, lsl#2; \
+ ldrb rnc, [RTAB, RT2]; \
+ mov rnd, rnd, ror #24; \
+ ldrb rnb, [RTAB, ra]; \
+ and RT1, RMASK, rd, lsr#(8 - 2); \
+ mov rnc, rnc, ror #16; \
+ and RT2, RMASK, rd, lsr#(16 - 2); \
+ mov rnb, rnb, ror #8; \
+ ldrb RT0, [RTAB, RT0]; \
+ and rd, RMASK, rd, lsr#(24 - 2); \
+ ldrb RT1, [RTAB, RT1]; \
+ \
+ orr rnd, rnd, RT0; \
+ ldrb RT2, [RTAB, RT2]; \
+ and RT0, RMASK, rc, lsl#2; \
+ ldrb rd, [RTAB, rd]; \
+ orr rnc, rnc, RT1, ror #24; \
+ and RT1, RMASK, rc, lsr#(8 - 2); \
+ orr rnb, rnb, RT2, ror #16; \
+ and RT2, RMASK, rc, lsr#(16 - 2); \
+ orr rna, rna, rd, ror #8; \
+ ldrb RT0, [RTAB, RT0]; \
+ and rc, RMASK, rc, lsr#(24 - 2); \
+ ldrb RT1, [RTAB, RT1]; \
+ \
+ orr rnc, rnc, RT0; \
+ ldrb RT2, [RTAB, RT2]; \
+ and RT0, RMASK, rb, lsl#2; \
+ ldrb rc, [RTAB, rc]; \
+ orr rnb, rnb, RT1, ror #24; \
+ and RT1, RMASK, rb, lsr#(8 - 2); \
+ orr rna, rna, RT2, ror #16; \
+ ldrb RT0, [RTAB, RT0]; \
+ and RT2, RMASK, rb, lsr#(16 - 2); \
+ ldrb RT1, [RTAB, RT1]; \
+ orr rnd, rnd, rc, ror #8; \
+ ldrb RT2, [RTAB, RT2]; \
+ and rb, RMASK, rb, lsr#(24 - 2); \
+ ldrb rb, [RTAB, rb]; \
+ \
+ orr rnb, rnb, RT0; \
+ orr rna, rna, RT1, ror #24; \
+ orr rnd, rnd, RT2, ror #16; \
+ orr rnc, rnc, rb, ror #8;
+
+#define firstencround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ addroundkey(ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key); \
+ do_encround((round) + 1, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key);
+
+#define encround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+ do_encround((round) + 1, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key);
+
+#define lastencround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ add CTX, #(((round) + 1) * 16); \
+ add RTAB, #1; \
+ do_lastencround(ra, rb, rc, rd, rna, rnb, rnc, rnd); \
+ addroundkey(rna, rnb, rnc, rnd, ra, rb, rc, rd, dummy);
+
+.align 3
+.globl _gcry_aes_arm_encrypt_block
+.type _gcry_aes_arm_encrypt_block,%function;
+
+_gcry_aes_arm_encrypt_block:
+ /* input:
+ * %r0: keysched, CTX
+ * %r1: dst
+ * %r2: src
+ * %r3: number of rounds.. 10, 12 or 14
+ * %st+0: encryption table
+ */
+ push {%r4-%r11, %ip, %lr};
+
+ /* read input block */
+
+ /* test if src is unaligned */
+ tst %r2, #3;
+ beq 1f;
+
+ /* unaligned load */
+ ldr_unaligned_le(RA, %r2, 0, RNA);
+ ldr_unaligned_le(RB, %r2, 4, RNB);
+ ldr_unaligned_le(RC, %r2, 8, RNA);
+ ldr_unaligned_le(RD, %r2, 12, RNB);
+ b 2f;
+.ltorg
+1:
+ /* aligned load */
+ ldm %r2, {RA, RB, RC, RD};
+#ifndef __ARMEL__
+ rev RA, RA;
+ rev RB, RB;
+ rev RC, RC;
+ rev RD, RD;
+#endif
+2:
+ ldr RTAB, [%sp, #40];
+ sub %sp, #16;
+
+ str %r1, [%sp, #4]; /* dst */
+ mov RMASK, #0xff;
+ str %r3, [%sp, #8]; /* nrounds */
+ mov RMASK, RMASK, lsl#2; /* byte mask */
+
+ firstencround(0, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+ encround(1, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ encround(2, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ encround(3, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ encround(4, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ encround(5, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ encround(6, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ encround(7, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+
+ ldr RT0, [%sp, #8]; /* nrounds */
+ cmp RT0, #12;
+ bge .Lenc_not_128;
+
+ encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
+ lastencround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+.Lenc_done:
+ ldr RT0, [%sp, #4]; /* dst */
+ add %sp, #16;
+
+ /* store output block */
+
+ /* test if dst is unaligned */
+ tst RT0, #3;
+ beq 1f;
+
+ /* unaligned store */
+ str_unaligned_le(RA, RT0, 0, RNA, RNB);
+ str_unaligned_le(RB, RT0, 4, RNA, RNB);
+ str_unaligned_le(RC, RT0, 8, RNA, RNB);
+ str_unaligned_le(RD, RT0, 12, RNA, RNB);
+ b 2f;
+.ltorg
+1:
+ /* aligned store */
+#ifndef __ARMEL__
+ rev RA, RA;
+ rev RB, RB;
+ rev RC, RC;
+ rev RD, RD;
+#endif
+ /* write output block */
+ stm RT0, {RA, RB, RC, RD};
+2:
+
+ mov r0, #(10 * 4);
+ pop {%r4-%r11, %ip, %pc};
+
+.ltorg
+.Lenc_not_128:
+ beq .Lenc_192
+
+ encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ encround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ encround(10, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ encround(11, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ encround(12, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
+ lastencround(13, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+ b .Lenc_done;
+
+.ltorg
+.Lenc_192:
+ encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ encround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ encround(10, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
+ lastencround(11, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+ b .Lenc_done;
+.size _gcry_aes_arm_encrypt_block,.-_gcry_aes_arm_encrypt_block;
+
+#define addroundkey_dec(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ ldr rna, [CTX, #(((round) * 16) + 0 * 4)]; \
+ ldr rnb, [CTX, #(((round) * 16) + 1 * 4)]; \
+ eor ra, rna; \
+ ldr rnc, [CTX, #(((round) * 16) + 2 * 4)]; \
+ eor rb, rnb; \
+ ldr rnd, [CTX, #(((round) * 16) + 3 * 4)]; \
+ eor rc, rnc; \
+ preload_first_key((round) - 1, rna); \
+ eor rd, rnd;
+
+#define do_decround(next_r, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+ ldr rnb, [CTX, #(((next_r) * 16) + 1 * 4)]; \
+ \
+ and RT0, RMASK, ra, lsl#2; \
+ ldr rnc, [CTX, #(((next_r) * 16) + 2 * 4)]; \
+ and RT1, RMASK, ra, lsr#(8 - 2); \
+ ldr rnd, [CTX, #(((next_r) * 16) + 3 * 4)]; \
+ and RT2, RMASK, ra, lsr#(16 - 2); \
+ ldr RT0, [RTAB, RT0]; \
+ and ra, RMASK, ra, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, RT1]; \
+ eor rna, rna, RT0; \
+ ldr RT2, [RTAB, RT2]; \
+ and RT0, RMASK, rb, lsl#2; \
+ ldr ra, [RTAB, ra]; \
+ \
+ eor rnb, rnb, RT1, ror #24; \
+ and RT1, RMASK, rb, lsr#(8 - 2); \
+ eor rnc, rnc, RT2, ror #16; \
+ and RT2, RMASK, rb, lsr#(16 - 2); \
+ eor rnd, rnd, ra, ror #8; \
+ ldr RT0, [RTAB, RT0]; \
+ and rb, RMASK, rb, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, RT1]; \
+ eor rnb, rnb, RT0; \
+ ldr RT2, [RTAB, RT2]; \
+ and RT0, RMASK, rc, lsl#2; \
+ ldr rb, [RTAB, rb]; \
+ \
+ eor rnc, rnc, RT1, ror #24; \
+ and RT1, RMASK, rc, lsr#(8 - 2); \
+ eor rnd, rnd, RT2, ror #16; \
+ and RT2, RMASK, rc, lsr#(16 - 2); \
+ eor rna, rna, rb, ror #8; \
+ ldr RT0, [RTAB, RT0]; \
+ and rc, RMASK, rc, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, RT1]; \
+ eor rnc, rnc, RT0; \
+ ldr RT2, [RTAB, RT2]; \
+ and RT0, RMASK, rd, lsl#2; \
+ ldr rc, [RTAB, rc]; \
+ \
+ eor rnd, rnd, RT1, ror #24; \
+ and RT1, RMASK, rd, lsr#(8 - 2); \
+ eor rna, rna, RT2, ror #16; \
+ and RT2, RMASK, rd, lsr#(16 - 2); \
+ eor rnb, rnb, rc, ror #8; \
+ ldr RT0, [RTAB, RT0]; \
+ and rd, RMASK, rd, lsr#(24 - 2); \
+ \
+ ldr RT1, [RTAB, RT1]; \
+ eor rnd, rnd, RT0; \
+ ldr RT2, [RTAB, RT2]; \
+ eor rna, rna, RT1, ror #24; \
+ ldr rd, [RTAB, rd]; \
+ \
+ eor rnb, rnb, RT2, ror #16; \
+ preload_key((next_r) - 1, ra); \
+ eor rnc, rnc, rd, ror #8;
+
+#define do_lastdecround(ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ and RT0, RMASK, ra; \
+ and RT1, RMASK, ra, lsr#8; \
+ and RT2, RMASK, ra, lsr#16; \
+ ldrb rna, [RTAB, RT0]; \
+ mov ra, ra, lsr#24; \
+ ldrb rnb, [RTAB, RT1]; \
+ and RT0, RMASK, rb; \
+ ldrb rnc, [RTAB, RT2]; \
+ mov rnb, rnb, ror #24; \
+ ldrb rnd, [RTAB, ra]; \
+ and RT1, RMASK, rb, lsr#8; \
+ mov rnc, rnc, ror #16; \
+ and RT2, RMASK, rb, lsr#16; \
+ mov rnd, rnd, ror #8; \
+ ldrb RT0, [RTAB, RT0]; \
+ mov rb, rb, lsr#24; \
+ ldrb RT1, [RTAB, RT1]; \
+ \
+ orr rnb, rnb, RT0; \
+ ldrb RT2, [RTAB, RT2]; \
+ and RT0, RMASK, rc; \
+ ldrb rb, [RTAB, rb]; \
+ orr rnc, rnc, RT1, ror #24; \
+ and RT1, RMASK, rc, lsr#8; \
+ orr rnd, rnd, RT2, ror #16; \
+ and RT2, RMASK, rc, lsr#16; \
+ orr rna, rna, rb, ror #8; \
+ ldrb RT0, [RTAB, RT0]; \
+ mov rc, rc, lsr#24; \
+ ldrb RT1, [RTAB, RT1]; \
+ \
+ orr rnc, rnc, RT0; \
+ ldrb RT2, [RTAB, RT2]; \
+ and RT0, RMASK, rd; \
+ ldrb rc, [RTAB, rc]; \
+ orr rnd, rnd, RT1, ror #24; \
+ and RT1, RMASK, rd, lsr#8; \
+ orr rna, rna, RT2, ror #16; \
+ ldrb RT0, [RTAB, RT0]; \
+ and RT2, RMASK, rd, lsr#16; \
+ ldrb RT1, [RTAB, RT1]; \
+ orr rnb, rnb, rc, ror #8; \
+ ldrb RT2, [RTAB, RT2]; \
+ mov rd, rd, lsr#24; \
+ ldrb rd, [RTAB, rd]; \
+ \
+ orr rnd, rnd, RT0; \
+ orr rna, rna, RT1, ror #24; \
+ orr rnb, rnb, RT2, ror #16; \
+ orr rnc, rnc, rd, ror #8;
+
+#define firstdecround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ addroundkey_dec(((round) + 1), ra, rb, rc, rd, rna, rnb, rnc, rnd); \
+ do_decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key);
+
+#define decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+ do_decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key);
+
+#define set_last_round_rmask(_, __) \
+ mov RMASK, #0xff;
+
+#define lastdecround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+ add RTAB, #(4 * 256); \
+ do_lastdecround(ra, rb, rc, rd, rna, rnb, rnc, rnd); \
+ addroundkey(rna, rnb, rnc, rnd, ra, rb, rc, rd, dummy);
+
+.align 3
+.globl _gcry_aes_arm_decrypt_block
+.type _gcry_aes_arm_decrypt_block,%function;
+
+_gcry_aes_arm_decrypt_block:
+ /* input:
+ * %r0: keysched, CTX
+ * %r1: dst
+ * %r2: src
+ * %r3: number of rounds.. 10, 12 or 14
+ * %st+0: decryption table
+ */
+ push {%r4-%r11, %ip, %lr};
+
+ /* read input block */
+
+ /* test if src is unaligned */
+ tst %r2, #3;
+ beq 1f;
+
+ /* unaligned load */
+ ldr_unaligned_le(RA, %r2, 0, RNA);
+ ldr_unaligned_le(RB, %r2, 4, RNB);
+ ldr_unaligned_le(RC, %r2, 8, RNA);
+ ldr_unaligned_le(RD, %r2, 12, RNB);
+ b 2f;
+.ltorg
+1:
+ /* aligned load */
+ ldm %r2, {RA, RB, RC, RD};
+#ifndef __ARMEL__
+ rev RA, RA;
+ rev RB, RB;
+ rev RC, RC;
+ rev RD, RD;
+#endif
+2:
+ ldr RTAB, [%sp, #40];
+ sub %sp, #16;
+
+ mov RMASK, #0xff;
+ str %r1, [%sp, #4]; /* dst */
+ mov RMASK, RMASK, lsl#2; /* byte mask */
+
+ cmp %r3, #12;
+ bge .Ldec_256;
+
+ firstdecround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+.Ldec_tail:
+ decround(8, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(7, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ decround(6, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(5, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ decround(4, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(3, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ decround(2, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(1, RA, RB, RC, RD, RNA, RNB, RNC, RND, set_last_round_rmask);
+ lastdecround(0, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+ ldr RT0, [%sp, #4]; /* dst */
+ add %sp, #16;
+
+ /* store output block */
+
+ /* test if dst is unaligned */
+ tst RT0, #3;
+ beq 1f;
+
+ /* unaligned store */
+ str_unaligned_le(RA, RT0, 0, RNA, RNB);
+ str_unaligned_le(RB, RT0, 4, RNA, RNB);
+ str_unaligned_le(RC, RT0, 8, RNA, RNB);
+ str_unaligned_le(RD, RT0, 12, RNA, RNB);
+ b 2f;
+.ltorg
+1:
+ /* aligned store */
+#ifndef __ARMEL__
+ rev RA, RA;
+ rev RB, RB;
+ rev RC, RC;
+ rev RD, RD;
+#endif
+ /* write output block */
+ stm RT0, {RA, RB, RC, RD};
+2:
+ mov r0, #(10 * 4);
+ pop {%r4-%r11, %ip, %pc};
+
+.ltorg
+.Ldec_256:
+ beq .Ldec_192;
+
+ firstdecround(13, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+ decround(12, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(11, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+ decround(10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+
+ b .Ldec_tail;
+
+.ltorg
+.Ldec_192:
+ firstdecround(11, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+ decround(10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+ decround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+
+ b .Ldec_tail;
+.size _gcry_aes_arm_decrypt_block,.-_gcry_aes_arm_decrypt_block;
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*__ARMEL__ */
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-armv8-aarch32-ce.S b/comm/third_party/libgcrypt/cipher/rijndael-armv8-aarch32-ce.S
new file mode 100644
index 0000000000..66440bd4eb
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael-armv8-aarch32-ce.S
@@ -0,0 +1,1867 @@
+/* rijndael-armv8-aarch32-ce.S - ARMv8/CE accelerated AES
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO)
+
+.syntax unified
+.arch armv8-a
+.fpu crypto-neon-fp-armv8
+.arm
+
+.text
+
+#ifdef __PIC__
+# define GET_DATA_POINTER(reg, name, rtmp) \
+ ldr reg, 1f; \
+ ldr rtmp, 2f; \
+ b 3f; \
+ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+ 2: .word name(GOT); \
+ 3: add reg, pc, reg; \
+ ldr reg, [reg, rtmp];
+#else
+# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+
+/* AES macros */
+
+#define aes_preload_keys(keysched, rekeysched) \
+ vldmia keysched!, {q5-q7}; \
+ mov rekeysched, keysched; \
+ vldmialo keysched!, {q8-q15}; /* 128-bit */ \
+ addeq keysched, #(2*16); \
+ vldmiaeq keysched!, {q10-q15}; /* 192-bit */ \
+ addhi keysched, #(4*16); \
+ vldmiahi keysched!, {q12-q15}; /* 256-bit */ \
+
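[Editor's note] The lo/eq/hi suffixes above are conditional on the caller's `cmp nrounds, #12`, so the macro loads 11, 13 or 15 round keys. As a size reminder (a hedged helper, not code from this file):

    #include <stddef.h>

    /* AES-128/192/256 use 10/12/14 rounds; the schedule that keysched
     * points at holds nrounds + 1 round keys of 16 bytes each. */
    static size_t
    keysched_size (unsigned int nrounds)
    {
      return ((size_t)nrounds + 1) * 16;   /* 176, 208 or 240 bytes */
    }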
+#define do_aes_one128(ed, mcimc, qo, qb) \
+ aes##ed.8 qb, q5; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q6; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q7; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q8; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q9; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q10; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q11; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q12; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q13; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q14; \
+ veor qo, qb, q15;
+
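[Editor's note] do_aes_one128 is the Crypto Extensions form of a full AES-128 block: AESE folds AddRoundKey, SubBytes and ShiftRows into one instruction, AESMC is MixColumns, and the final round drops MixColumns so the last round key is applied with a plain veor. A rough intrinsics equivalent, assuming the eleven round keys are already in an array (names are illustrative, not from libgcrypt):

    #include <arm_neon.h>   /* build with the ARMv8 crypto extension enabled */

    static uint8x16_t
    aes128_encrypt_block_ce (uint8x16_t block, const uint8x16_t rk[11])
    {
      int i;

      for (i = 0; i < 9; i++)
        {
          block = vaeseq_u8 (block, rk[i]);  /* AddRoundKey+SubBytes+ShiftRows */
          block = vaesmcq_u8 (block);        /* MixColumns */
        }
      block = vaeseq_u8 (block, rk[9]);      /* last round: no MixColumns */
      return veorq_u8 (block, rk[10]);       /* final AddRoundKey */
    }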
+#define do_aes_one128re(ed, mcimc, qo, qb, keysched, rekeysched) \
+ vldm rekeysched, {q8-q9}; \
+ do_aes_one128(ed, mcimc, qo, qb);
+
+#define do_aes_one192(ed, mcimc, qo, qb, keysched, rekeysched) \
+ vldm rekeysched!, {q8}; \
+ aes##ed.8 qb, q5; \
+ aes##mcimc.8 qb, qb; \
+ vldm rekeysched, {q9}; \
+ aes##ed.8 qb, q6; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q7; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q8; \
+ aes##mcimc.8 qb, qb; \
+ vldmia keysched!, {q8}; \
+ aes##ed.8 qb, q9; \
+ aes##mcimc.8 qb, qb; \
+ sub rekeysched, #(1*16); \
+ aes##ed.8 qb, q10; \
+ aes##mcimc.8 qb, qb; \
+ vldm keysched, {q9}; \
+ aes##ed.8 qb, q11; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q12; \
+ aes##mcimc.8 qb, qb; \
+ sub keysched, #16; \
+ aes##ed.8 qb, q13; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q14; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q15; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q8; \
+ veor qo, qb, q9; \
+
+#define do_aes_one256(ed, mcimc, qo, qb, keysched, rekeysched) \
+ vldmia rekeysched!, {q8}; \
+ aes##ed.8 qb, q5; \
+ aes##mcimc.8 qb, qb; \
+ vldmia rekeysched!, {q9}; \
+ aes##ed.8 qb, q6; \
+ aes##mcimc.8 qb, qb; \
+ vldmia rekeysched!, {q10}; \
+ aes##ed.8 qb, q7; \
+ aes##mcimc.8 qb, qb; \
+ vldm rekeysched, {q11}; \
+ aes##ed.8 qb, q8; \
+ aes##mcimc.8 qb, qb; \
+ vldmia keysched!, {q8}; \
+ aes##ed.8 qb, q9; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q10; \
+ aes##mcimc.8 qb, qb; \
+ vldmia keysched!, {q9}; \
+ aes##ed.8 qb, q11; \
+ aes##mcimc.8 qb, qb; \
+ sub rekeysched, #(3*16); \
+ aes##ed.8 qb, q12; \
+ aes##mcimc.8 qb, qb; \
+ vldmia keysched!, {q10}; \
+ aes##ed.8 qb, q13; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q14; \
+ aes##mcimc.8 qb, qb; \
+ vldm keysched, {q11}; \
+ aes##ed.8 qb, q15; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q8; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q9; \
+ aes##mcimc.8 qb, qb; \
+ aes##ed.8 qb, q10; \
+ veor qo, qb, q11; \
+ sub keysched, #(3*16); \
+
+#define aes_round_4(ed, mcimc, b0, b1, b2, b3, key) \
+ aes##ed.8 b0, key; \
+ aes##mcimc.8 b0, b0; \
+ aes##ed.8 b1, key; \
+ aes##mcimc.8 b1, b1; \
+ aes##ed.8 b2, key; \
+ aes##mcimc.8 b2, b2; \
+ aes##ed.8 b3, key; \
+ aes##mcimc.8 b3, b3;
+
+#define do_aes_4_128(ed, mcimc, b0, b1, b2, b3) \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q5); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q6); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q7); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q10); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q11); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q12); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q13); \
+ aes##ed.8 b0, q14; \
+ veor b0, b0, q15; \
+ aes##ed.8 b1, q14; \
+ veor b1, b1, q15; \
+ aes##ed.8 b2, q14; \
+ veor b2, b2, q15; \
+ aes##ed.8 b3, q14; \
+ veor b3, b3, q15;
+
+#define do_aes_4_128re(ed, mcimc, b0, b1, b2, b3, keysched, rekeysched) \
+ vldm rekeysched, {q8-q9}; \
+ do_aes_4_128(ed, mcimc, b0, b1, b2, b3);
+
+#define do_aes_4_192(ed, mcimc, b0, b1, b2, b3, keysched, rekeysched) \
+ vldm rekeysched!, {q8}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q5); \
+ vldm rekeysched, {q9}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q6); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q7); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \
+ vldmia keysched!, {q8}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \
+ sub rekeysched, #(1*16); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q10); \
+ vldm keysched, {q9}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q11); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q12); \
+ sub keysched, #16; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q13); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q14); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q15); \
+ aes##ed.8 b0, q8; \
+ veor b0, b0, q9; \
+ aes##ed.8 b1, q8; \
+ veor b1, b1, q9; \
+ aes##ed.8 b2, q8; \
+ veor b2, b2, q9; \
+ aes##ed.8 b3, q8; \
+ veor b3, b3, q9;
+
+#define do_aes_4_256(ed, mcimc, b0, b1, b2, b3, keysched, rekeysched) \
+ vldmia rekeysched!, {q8}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q5); \
+ vldmia rekeysched!, {q9}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q6); \
+ vldmia rekeysched!, {q10}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q7); \
+ vldm rekeysched, {q11}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \
+ vldmia keysched!, {q8}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q10); \
+ vldmia keysched!, {q9}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q11); \
+ sub rekeysched, #(3*16); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q12); \
+ vldmia keysched!, {q10}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q13); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q14); \
+ vldm keysched, {q11}; \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q15); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q8); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, q9); \
+ sub keysched, #(3*16); \
+ aes##ed.8 b0, q10; \
+ veor b0, b0, q11; \
+ aes##ed.8 b1, q10; \
+ veor b1, b1, q11; \
+ aes##ed.8 b2, q10; \
+ veor b2, b2, q11; \
+ aes##ed.8 b3, q10; \
+ veor b3, b3, q11;
+
+
+/* Other functional macros */
+
+#define CLEAR_REG(reg) veor reg, reg;
+
+
+/*
+ * unsigned int _gcry_aes_enc_armv8_ce(void *keysched, byte *dst,
+ * const byte *src,
+ * unsigned int nrounds);
+ */
+.align 3
+.globl _gcry_aes_enc_armv8_ce
+.type _gcry_aes_enc_armv8_ce,%function;
+_gcry_aes_enc_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: dst
+ * r2: src
+ * r3: nrounds
+ */
+
+ vldmia r0!, {q1-q3} /* load 3 round keys */
+
+ cmp r3, #12
+
+ vld1.8 {q0}, [r2]
+
+ bhi .Lenc1_256
+ beq .Lenc1_192
+
+.Lenc1_128:
+
+.Lenc1_tail:
+ vldmia r0, {q8-q15} /* load 8 round keys */
+
+ aese.8 q0, q1
+ aesmc.8 q0, q0
+ CLEAR_REG(q1)
+
+ aese.8 q0, q2
+ aesmc.8 q0, q0
+ CLEAR_REG(q2)
+
+ aese.8 q0, q3
+ aesmc.8 q0, q0
+ CLEAR_REG(q3)
+
+ aese.8 q0, q8
+ aesmc.8 q0, q0
+ CLEAR_REG(q8)
+
+ aese.8 q0, q9
+ aesmc.8 q0, q0
+ CLEAR_REG(q9)
+
+ aese.8 q0, q10
+ aesmc.8 q0, q0
+ CLEAR_REG(q10)
+
+ aese.8 q0, q11
+ aesmc.8 q0, q0
+ CLEAR_REG(q11)
+
+ aese.8 q0, q12
+ aesmc.8 q0, q0
+ CLEAR_REG(q12)
+
+ aese.8 q0, q13
+ aesmc.8 q0, q0
+ CLEAR_REG(q13)
+
+ aese.8 q0, q14
+ veor q0, q15
+ CLEAR_REG(q14)
+ CLEAR_REG(q15)
+
+ vst1.8 {q0}, [r1]
+ CLEAR_REG(q0)
+
+ mov r0, #0
+ bx lr
+
+.Lenc1_192:
+ aese.8 q0, q1
+ aesmc.8 q0, q0
+ vmov q1, q3
+
+ aese.8 q0, q2
+ aesmc.8 q0, q0
+ vldm r0!, {q2-q3} /* load 3 round keys */
+
+ b .Lenc1_tail
+
+.Lenc1_256:
+ vldm r0!, {q15} /* load 1 round key */
+ aese.8 q0, q1
+ aesmc.8 q0, q0
+
+ aese.8 q0, q2
+ aesmc.8 q0, q0
+
+ aese.8 q0, q3
+ aesmc.8 q0, q0
+ vldm r0!, {q1-q3} /* load 3 round keys */
+
+ aese.8 q0, q15
+ aesmc.8 q0, q0
+
+ b .Lenc1_tail
+.size _gcry_aes_enc_armv8_ce,.-_gcry_aes_enc_armv8_ce;
+
+
+/*
+ * unsigned int _gcry_aes_dec_armv8_ce(void *keysched, byte *dst,
+ * const byte *src,
+ * unsigned int nrounds);
+ */
+.align 3
+.globl _gcry_aes_dec_armv8_ce
+.type _gcry_aes_dec_armv8_ce,%function;
+_gcry_aes_dec_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: dst
+ * r2: src
+ * r3: nrounds
+ */
+
+ vldmia r0!, {q1-q3} /* load 3 round keys */
+
+ cmp r3, #12
+
+ vld1.8 {q0}, [r2]
+
+ bhi .Ldec1_256
+ beq .Ldec1_192
+
+.Ldec1_128:
+
+.Ldec1_tail:
+ vldmia r0, {q8-q15} /* load 8 round keys */
+
+ aesd.8 q0, q1
+ aesimc.8 q0, q0
+ CLEAR_REG(q1)
+
+ aesd.8 q0, q2
+ aesimc.8 q0, q0
+ CLEAR_REG(q2)
+
+ aesd.8 q0, q3
+ aesimc.8 q0, q0
+ CLEAR_REG(q3)
+
+ aesd.8 q0, q8
+ aesimc.8 q0, q0
+ CLEAR_REG(q8)
+
+ aesd.8 q0, q9
+ aesimc.8 q0, q0
+ CLEAR_REG(q9)
+
+ aesd.8 q0, q10
+ aesimc.8 q0, q0
+ CLEAR_REG(q10)
+
+ aesd.8 q0, q11
+ aesimc.8 q0, q0
+ CLEAR_REG(q11)
+
+ aesd.8 q0, q12
+ aesimc.8 q0, q0
+ CLEAR_REG(q12)
+
+ aesd.8 q0, q13
+ aesimc.8 q0, q0
+ CLEAR_REG(q13)
+
+ aesd.8 q0, q14
+ veor q0, q15
+ CLEAR_REG(q14)
+ CLEAR_REG(q15)
+
+ vst1.8 {q0}, [r1]
+ CLEAR_REG(q0)
+
+ mov r0, #0
+ bx lr
+
+.Ldec1_192:
+ aesd.8 q0, q1
+ aesimc.8 q0, q0
+ vmov q1, q3
+
+ aesd.8 q0, q2
+ aesimc.8 q0, q0
+ vldm r0!, {q2-q3} /* load 3 round keys */
+
+ b .Ldec1_tail
+
+.Ldec1_256:
+ vldm r0!, {q15} /* load 1 round key */
+ aesd.8 q0, q1
+ aesimc.8 q0, q0
+
+ aesd.8 q0, q2
+ aesimc.8 q0, q0
+
+ aesd.8 q0, q3
+ aesimc.8 q0, q0
+ vldm r0!, {q1-q3} /* load 3 round keys */
+
+ aesd.8 q0, q15
+ aesimc.8 q0, q0
+
+ b .Ldec1_tail
+.size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce;
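[Editor's note] The decryption path mirrors the encryption sketch above using AESD/AESIMC, i.e. the "equivalent inverse cipher": the middle round keys must have InvMixColumns applied to them during key setup, which appears to be what _gcry_aes_invmixcol_armv8_ce near the end of this file is there for. A hedged intrinsics sketch, assuming drk[] already holds those transformed keys in decryption order:

    #include <arm_neon.h>   /* build with the ARMv8 crypto extension enabled */

    static uint8x16_t
    aes128_decrypt_block_ce (uint8x16_t block, const uint8x16_t drk[11])
    {
      int i;

      for (i = 0; i < 9; i++)
        {
          block = vaesdq_u8 (block, drk[i]);  /* AddRoundKey+InvSubBytes+InvShiftRows */
          block = vaesimcq_u8 (block);        /* InvMixColumns */
        }
      block = vaesdq_u8 (block, drk[9]);
      return veorq_u8 (block, drk[10]);
    }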
+
+
+/*
+ * void _gcry_aes_cbc_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, size_t nblocks,
+ * int cbc_mac, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cbc_enc_armv8_ce
+.type _gcry_aes_cbc_enc_armv8_ce,%function;
+_gcry_aes_cbc_enc_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: iv
+ * %st+0: nblocks => r4
+ * %st+4: cbc_mac => r5
+ * %st+8: nrounds => r6
+ */
+
+ push {r4-r6,lr} /* 4*4 = 16b */
+ ldr r4, [sp, #(16+0)]
+ ldr r5, [sp, #(16+4)]
+ cmp r4, #0
+ ldr r6, [sp, #(16+8)]
+ beq .Lcbc_enc_skip
+ cmp r5, #0
+ vpush {q4-q7}
+ moveq r5, #16
+ movne r5, #0
+
+ cmp r6, #12
+ vld1.8 {q1}, [r3] /* load IV */
+
+ aes_preload_keys(r0, lr);
+
+ beq .Lcbc_enc_loop192
+ bhi .Lcbc_enc_loop256
+
+#define CBC_ENC(bits, ...) \
+ .Lcbc_enc_loop##bits: \
+ vld1.8 {q0}, [r2]!; /* load plaintext */ \
+ veor q1, q0, q1; \
+ subs r4, r4, #1; \
+ \
+ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \
+ \
+ vst1.8 {q1}, [r1], r5; /* store ciphertext */ \
+ \
+ bne .Lcbc_enc_loop##bits; \
+ b .Lcbc_enc_done;
+
+ CBC_ENC(128)
+ CBC_ENC(192, r0, lr)
+ CBC_ENC(256, r0, lr)
+
+#undef CBC_ENC
+
+.Lcbc_enc_done:
+ vst1.8 {q1}, [r3] /* store IV */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ vpop {q4-q7}
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+.Lcbc_enc_skip:
+ pop {r4-r6,pc}
+.size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce;
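[Editor's note] For reference, the loop generated by CBC_ENC has the following shape in C. The block cipher call is left abstract; when cbc_mac is non-zero the output stride is 0, so only the last ciphertext block (the CBC-MAC value) survives, matching the moveq/movne setup of r5 above. Names are illustrative.

    #include <stdint.h>
    #include <string.h>

    typedef void (*aes_enc_fn_t) (const void *keysched, uint8_t dst[16],
                                  const uint8_t src[16]);

    static void
    cbc_enc_ref (const void *keysched, aes_enc_fn_t enc_block,
                 uint8_t *outbuf, const uint8_t *inbuf,
                 uint8_t iv[16], size_t nblocks, int cbc_mac)
    {
      size_t outstep = cbc_mac ? 0 : 16;
      size_t i;
      int j;

      for (i = 0; i < nblocks; i++, inbuf += 16, outbuf += outstep)
        {
          for (j = 0; j < 16; j++)
            iv[j] ^= inbuf[j];             /* chaining value xor plaintext */
          enc_block (keysched, iv, iv);    /* ciphertext is next chaining value */
          memcpy (outbuf, iv, 16);
        }
    }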
+
+
+/*
+ * void _gcry_aes_cbc_dec_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cbc_dec_armv8_ce
+.type _gcry_aes_cbc_dec_armv8_ce,%function;
+_gcry_aes_cbc_dec_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: iv
+ * %st+0: nblocks => r4
+ * %st+4: nrounds => r5
+ */
+
+ push {r4-r6,lr} /* 4*4 = 16b */
+ ldr r4, [sp, #(16+0)]
+ ldr r5, [sp, #(16+4)]
+ cmp r4, #0
+ beq .Lcbc_dec_skip
+ vpush {q4-q7}
+
+ cmp r5, #12
+ vld1.8 {q0}, [r3] /* load IV */
+
+ aes_preload_keys(r0, r6);
+
+ beq .Lcbc_dec_entry_192
+ bhi .Lcbc_dec_entry_256
+
+#define CBC_DEC(bits, ...) \
+ .Lcbc_dec_entry_##bits: \
+ cmp r4, #4; \
+ blo .Lcbc_dec_loop_##bits; \
+ \
+ .Lcbc_dec_loop4_##bits: \
+ \
+ vld1.8 {q1-q2}, [r2]!; /* load ciphertext */ \
+ sub r4, r4, #4; \
+ vld1.8 {q3-q4}, [r2]; /* load ciphertext */ \
+ cmp r4, #4; \
+ sub r2, #32; \
+ \
+ do_aes_4_##bits(d, imc, q1, q2, q3, q4, ##__VA_ARGS__); \
+ \
+ veor q1, q1, q0; \
+ vld1.8 {q0}, [r2]!; /* load next IV */ \
+ veor q2, q2, q0; \
+ vld1.8 {q0}, [r2]!; /* load next IV */ \
+ vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \
+ veor q3, q3, q0; \
+ vld1.8 {q0}, [r2]!; /* load next IV */ \
+ veor q4, q4, q0; \
+ vld1.8 {q0}, [r2]!; /* load next IV */ \
+ vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \
+ \
+ bhs .Lcbc_dec_loop4_##bits; \
+ cmp r4, #0; \
+ beq .Lcbc_dec_done; \
+ \
+ .Lcbc_dec_loop_##bits: \
+ vld1.8 {q1}, [r2]!; /* load ciphertext */ \
+ subs r4, r4, #1; \
+ vmov q2, q1; \
+ \
+ do_aes_one##bits(d, imc, q1, q1, ##__VA_ARGS__); \
+ \
+ veor q1, q1, q0; \
+ vmov q0, q2; \
+ vst1.8 {q1}, [r1]!; /* store plaintext */ \
+ \
+ bne .Lcbc_dec_loop_##bits; \
+ b .Lcbc_dec_done;
+
+ CBC_DEC(128)
+ CBC_DEC(192, r0, r6)
+ CBC_DEC(256, r0, r6)
+
+#undef CBC_DEC
+
+.Lcbc_dec_done:
+ vst1.8 {q0}, [r3] /* store IV */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ vpop {q4-q7}
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+.Lcbc_dec_skip:
+ pop {r4-r6,pc}
+.size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce;
+
+
+/*
+ * void _gcry_aes_cfb_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cfb_enc_armv8_ce
+.type _gcry_aes_cfb_enc_armv8_ce,%function;
+_gcry_aes_cfb_enc_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: iv
+ * %st+0: nblocks => r4
+ * %st+4: nrounds => r5
+ */
+
+ push {r4-r6,lr} /* 4*4 = 16b */
+ ldr r4, [sp, #(16+0)]
+ ldr r5, [sp, #(16+4)]
+ cmp r4, #0
+ beq .Lcfb_enc_skip
+ vpush {q4-q7}
+
+ cmp r5, #12
+ vld1.8 {q0}, [r3] /* load IV */
+
+ aes_preload_keys(r0, r6);
+
+ beq .Lcfb_enc_entry_192
+ bhi .Lcfb_enc_entry_256
+
+#define CFB_ENC(bits, ...) \
+ .Lcfb_enc_entry_##bits: \
+ .Lcfb_enc_loop_##bits: \
+ vld1.8 {q1}, [r2]!; /* load plaintext */ \
+ subs r4, r4, #1; \
+ \
+ do_aes_one##bits(e, mc, q0, q0, ##__VA_ARGS__); \
+ \
+ veor q0, q1, q0; \
+ vst1.8 {q0}, [r1]!; /* store ciphertext */ \
+ \
+ bne .Lcfb_enc_loop_##bits; \
+ b .Lcfb_enc_done;
+
+ CFB_ENC(128)
+ CFB_ENC(192, r0, r6)
+ CFB_ENC(256, r0, r6)
+
+#undef CFB_ENC
+
+.Lcfb_enc_done:
+ vst1.8 {q0}, [r3] /* store IV */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ vpop {q4-q7}
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+.Lcfb_enc_skip:
+ pop {r4-r6,pc}
+.size _gcry_aes_cfb_enc_armv8_ce,.-_gcry_aes_cfb_enc_armv8_ce;
+
+
+/*
+ * void _gcry_aes_cfb_dec_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cfb_dec_armv8_ce
+.type _gcry_aes_cfb_dec_armv8_ce,%function;
+_gcry_aes_cfb_dec_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: iv
+ * %st+0: nblocks => r4
+ * %st+4: nrounds => r5
+ */
+
+ push {r4-r6,lr} /* 4*4 = 16b */
+ ldr r4, [sp, #(16+0)]
+ ldr r5, [sp, #(16+4)]
+ cmp r4, #0
+ beq .Lcfb_dec_skip
+ vpush {q4-q7}
+
+ cmp r5, #12
+ vld1.8 {q0}, [r3] /* load IV */
+
+ aes_preload_keys(r0, r6);
+
+ beq .Lcfb_dec_entry_192
+ bhi .Lcfb_dec_entry_256
+
+#define CFB_DEC(bits, ...) \
+ .Lcfb_dec_entry_##bits: \
+ cmp r4, #4; \
+ blo .Lcfb_dec_loop_##bits; \
+ \
+ .Lcfb_dec_loop4_##bits: \
+ \
+ vld1.8 {q2-q3}, [r2]!; /* load ciphertext */ \
+ vmov q1, q0; \
+ sub r4, r4, #4; \
+ vld1.8 {q4}, [r2]; /* load ciphertext */ \
+ sub r2, #32; \
+ cmp r4, #4; \
+ \
+ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \
+ \
+ vld1.8 {q0}, [r2]!; /* load ciphertext */ \
+ veor q1, q1, q0; \
+ vld1.8 {q0}, [r2]!; /* load ciphertext */ \
+ veor q2, q2, q0; \
+ vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \
+ vld1.8 {q0}, [r2]!; \
+ veor q3, q3, q0; \
+ vld1.8 {q0}, [r2]!; /* load next IV / ciphertext */ \
+ veor q4, q4, q0; \
+ vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \
+ \
+ bhs .Lcfb_dec_loop4_##bits; \
+ cmp r4, #0; \
+ beq .Lcfb_dec_done; \
+ \
+ .Lcfb_dec_loop_##bits: \
+ \
+ vld1.8 {q1}, [r2]!; /* load ciphertext */ \
+ \
+ subs r4, r4, #1; \
+ \
+ do_aes_one##bits(e, mc, q0, q0, ##__VA_ARGS__); \
+ \
+ veor q2, q1, q0; \
+ vmov q0, q1; \
+ vst1.8 {q2}, [r1]!; /* store plaintext */ \
+ \
+ bne .Lcfb_dec_loop_##bits; \
+ b .Lcfb_dec_done;
+
+ CFB_DEC(128)
+ CFB_DEC(192, r0, r6)
+ CFB_DEC(256, r0, r6)
+
+#undef CFB_DEC
+
+.Lcfb_dec_done:
+ vst1.8 {q0}, [r3] /* store IV */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ vpop {q4-q7}
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+.Lcfb_dec_skip:
+ pop {r4-r6,pc}
+.size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce;
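[Editor's note] The single-block tail of CFB_DEC amounts to the following: the cipher only ever runs in the forward direction, the previous ciphertext block is encrypted to make the keystream, and the current ciphertext becomes the next IV. A self-contained sketch with the same abstract block-cipher callback shape as the CBC sketch earlier:

    #include <stdint.h>
    #include <string.h>

    typedef void (*aes_enc_fn_t) (const void *keysched, uint8_t dst[16],
                                  const uint8_t src[16]);

    static void
    cfb_dec_ref (const void *keysched, aes_enc_fn_t enc_block,
                 uint8_t *outbuf, const uint8_t *inbuf,
                 uint8_t iv[16], size_t nblocks)
    {
      uint8_t keystream[16];
      size_t i;
      int j;

      for (i = 0; i < nblocks; i++, inbuf += 16, outbuf += 16)
        {
          enc_block (keysched, keystream, iv);  /* keystream = E_K(IV) */
          memcpy (iv, inbuf, 16);               /* next IV = this ciphertext */
          for (j = 0; j < 16; j++)
            outbuf[j] = inbuf[j] ^ keystream[j];
        }
    }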
+
+
+/*
+ * void _gcry_aes_ctr_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ctr_enc_armv8_ce
+.type _gcry_aes_ctr_enc_armv8_ce,%function;
+_gcry_aes_ctr_enc_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: iv
+ * %st+0: nblocks => r4
+ * %st+4: nrounds => r5
+ */
+
+ vpush {q4-q7}
+ push {r4-r12,lr} /* 4*16 + 4*10 = 104b */
+ ldr r4, [sp, #(104+0)]
+ ldr r5, [sp, #(104+4)]
+ cmp r4, #0
+ beq .Lctr_enc_skip
+
+ cmp r5, #12
+ ldm r3, {r7-r10}
+ vld1.8 {q0}, [r3] /* load IV */
+ rev r7, r7
+ rev r8, r8
+ rev r9, r9
+ rev r10, r10
+
+ aes_preload_keys(r0, r6);
+
+ beq .Lctr_enc_entry_192
+ bhi .Lctr_enc_entry_256
+
+#define CTR_ENC(bits, ...) \
+ .Lctr_enc_entry_##bits: \
+ cmp r4, #4; \
+ blo .Lctr_enc_loop_##bits; \
+ \
+ .Lctr_enc_loop4_##bits: \
+ cmp r10, #0xfffffffc; \
+ sub r4, r4, #4; \
+ blo .Lctr_enc_loop4_##bits##_nocarry; \
+ cmp r9, #0xffffffff; \
+ bne .Lctr_enc_loop4_##bits##_nocarry; \
+ \
+ adds r10, #1; \
+ vmov q1, q0; \
+ blcs .Lctr_overflow_one; \
+ rev r11, r10; \
+ vmov.32 d1[1], r11; \
+ \
+ adds r10, #1; \
+ vmov q2, q0; \
+ blcs .Lctr_overflow_one; \
+ rev r11, r10; \
+ vmov.32 d1[1], r11; \
+ \
+ adds r10, #1; \
+ vmov q3, q0; \
+ blcs .Lctr_overflow_one; \
+ rev r11, r10; \
+ vmov.32 d1[1], r11; \
+ \
+ adds r10, #1; \
+ vmov q4, q0; \
+ blcs .Lctr_overflow_one; \
+ rev r11, r10; \
+ vmov.32 d1[1], r11; \
+ \
+ b .Lctr_enc_loop4_##bits##_store_ctr; \
+ \
+ .Lctr_enc_loop4_##bits##_nocarry: \
+ \
+ veor q2, q2; \
+ vrev64.8 q1, q0; \
+ vceq.u32 d5, d5; \
+ vadd.u64 q3, q2, q2; \
+ vadd.u64 q4, q3, q2; \
+ vadd.u64 q0, q3, q3; \
+ vsub.u64 q2, q1, q2; \
+ vsub.u64 q3, q1, q3; \
+ vsub.u64 q4, q1, q4; \
+ vsub.u64 q0, q1, q0; \
+ vrev64.8 q1, q1; \
+ vrev64.8 q2, q2; \
+ vrev64.8 q3, q3; \
+ vrev64.8 q0, q0; \
+ vrev64.8 q4, q4; \
+ add r10, #4; \
+ \
+ .Lctr_enc_loop4_##bits##_store_ctr: \
+ \
+ vst1.8 {q0}, [r3]; \
+ cmp r4, #4; \
+ vld1.8 {q0}, [r2]!; /* load ciphertext */ \
+ \
+ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \
+ \
+ veor q1, q1, q0; \
+ vld1.8 {q0}, [r2]!; /* load ciphertext */ \
+ vst1.8 {q1}, [r1]!; /* store plaintext */ \
+ vld1.8 {q1}, [r2]!; /* load ciphertext */ \
+ veor q2, q2, q0; \
+ veor q3, q3, q1; \
+ vld1.8 {q0}, [r2]!; /* load ciphertext */ \
+ vst1.8 {q2}, [r1]!; /* store plaintext */ \
+ veor q4, q4, q0; \
+ vld1.8 {q0}, [r3]; /* reload IV */ \
+ vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \
+ \
+ bhs .Lctr_enc_loop4_##bits; \
+ cmp r4, #0; \
+ beq .Lctr_enc_done; \
+ \
+ .Lctr_enc_loop_##bits: \
+ \
+ adds r10, #1; \
+ vmov q1, q0; \
+ blcs .Lctr_overflow_one; \
+ rev r11, r10; \
+ subs r4, r4, #1; \
+ vld1.8 {q2}, [r2]!; /* load ciphertext */ \
+ vmov.32 d1[1], r11; \
+ \
+ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \
+ \
+ veor q1, q2, q1; \
+ vst1.8 {q1}, [r1]!; /* store plaintext */ \
+ \
+ bne .Lctr_enc_loop_##bits; \
+ b .Lctr_enc_done;
+
+ CTR_ENC(128)
+ CTR_ENC(192, r0, r6)
+ CTR_ENC(256, r0, r6)
+
+#undef CTR_ENC
+
+.Lctr_enc_done:
+ vst1.8 {q0}, [r3] /* store IV */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+.Lctr_enc_skip:
+ pop {r4-r12,lr}
+ vpop {q4-q7}
+ bx lr
+
+.Lctr_overflow_one:
+ adcs r9, #0
+ adcs r8, #0
+ adc r7, #0
+ rev r11, r9
+ rev r12, r8
+ vmov.32 d1[0], r11
+ rev r11, r7
+ vmov.32 d0[1], r12
+ vmov.32 d0[0], r11
+ bx lr
+.size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce;
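[Editor's note] The counter handled by .Lctr_overflow_one is the usual 128-bit big-endian CTR counter; r7-r10 hold it byte-swapped so the carry can ripple with adds/adcs. A portable equivalent of one increment:

    #include <stdint.h>

    /* Increment a 128-bit big-endian counter by one (the operation the
     * adds/adcs chain above performs word by word). */
    static void
    ctr128_inc_be (uint8_t ctr[16])
    {
      int i;

      for (i = 15; i >= 0; i--)
        if (++ctr[i] != 0)
          break;    /* stop as soon as a byte does not wrap around */
    }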
+
+
+/*
+ * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *offset,
+ * unsigned char *checksum,
+ * unsigned char *L_table,
+ * size_t nblocks,
+ * unsigned int nrounds,
+ * unsigned int blkn);
+ */
+
+.align 3
+.globl _gcry_aes_ocb_enc_armv8_ce
+.type _gcry_aes_ocb_enc_armv8_ce,%function;
+_gcry_aes_ocb_enc_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: offset
+ * %st+0: checksum => r4
+ * %st+4: Ls => r5
+ * %st+8: nblocks => r6 (0 < nblocks <= 32)
+ * %st+12: nrounds => r7
+ * %st+16: blkn => lr
+ */
+
+ vpush {q4-q7}
+ push {r4-r12,lr} /* 4*16 + 4*10 = 104b */
+ ldr r7, [sp, #(104+12)]
+ ldr r4, [sp, #(104+0)]
+ ldr r5, [sp, #(104+4)]
+ ldr r6, [sp, #(104+8)]
+ ldr lr, [sp, #(104+16)]
+
+ cmp r7, #12
+ vld1.8 {q0}, [r3] /* load offset */
+
+ aes_preload_keys(r0, r12);
+
+ beq .Locb_enc_entry_192
+ bhi .Locb_enc_entry_256
+
+#define OCB_ENC(bits, ...) \
+ .Locb_enc_entry_##bits: \
+ cmp r6, #4; \
+ add lr, #1; \
+ blo .Locb_enc_loop_##bits; \
+ \
+ .Locb_enc_loop4_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* Checksum_i = Checksum_{i-1} xor P_i */ \
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \
+ \
+ add r9, lr, #1; \
+ add r10, lr, #2; \
+ add r11, lr, #3; \
+ rbit r8, lr; \
+ add lr, lr, #4; \
+ rbit r9, r9; \
+ rbit r10, r10; \
+ rbit r11, r11; \
+ clz r8, r8; /* ntz(i+0) */ \
+ clz r9, r9; /* ntz(i+1) */ \
+ clz r10, r10; /* ntz(i+2) */ \
+ clz r11, r11; /* ntz(i+3) */ \
+ add r8, r5, r8, lsl #4; \
+ add r9, r5, r9, lsl #4; \
+ add r10, r5, r10, lsl #4; \
+ add r11, r5, r11, lsl #4; \
+ \
+ sub r6, #4; \
+ \
+ vld1.8 {q9}, [r8]; /* load L_{ntz(i+0)} */ \
+ vld1.8 {q1-q2}, [r2]!; /* load P_i+<0-1> */ \
+ vld1.8 {q8}, [r4]; /* load Checksum_{i-1} */ \
+ veor q0, q0, q9; /* Offset_i+0 */ \
+ vld1.8 {q9}, [r9]; /* load L_{ntz(i+1)} */ \
+ veor q8, q8, q1; /* Checksum_i+0 */ \
+ veor q1, q1, q0; /* P_i+0 xor Offset_i+0 */\
+ vld1.8 {q3-q4}, [r2]!; /* load P_i+<2-3> */ \
+ vst1.8 {q0}, [r1]!; /* store Offset_i+0 */\
+ veor q0, q0, q9; /* Offset_i+1 */ \
+ vld1.8 {q9}, [r10]; /* load L_{ntz(i+2)} */ \
+ veor q8, q8, q2; /* Checksum_i+1 */ \
+ veor q2, q2, q0; /* P_i+1 xor Offset_i+1 */\
+ vst1.8 {q0}, [r1]!; /* store Offset_i+1 */\
+ veor q0, q0, q9; /* Offset_i+2 */ \
+ vld1.8 {q9}, [r11]; /* load L_{ntz(i+3)} */ \
+ veor q8, q8, q3; /* Checksum_i+2 */ \
+ veor q3, q3, q0; /* P_i+2 xor Offset_i+2 */\
+ vst1.8 {q0}, [r1]!; /* store Offset_i+2 */\
+ veor q0, q0, q9; /* Offset_i+3 */ \
+ veor q8, q8, q4; /* Checksum_i+3 */ \
+ veor q4, q4, q0; /* P_i+3 xor Offset_i+3 */\
+ vst1.8 {q0}, [r1]; /* store Offset_i+3 */\
+ sub r1, #(3*16); \
+ vst1.8 {q8}, [r4]; /* store Checksum_i+3 */\
+ \
+ cmp r6, #4; \
+ \
+ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \
+ \
+ mov r8, r1; \
+ vld1.8 {q8-q9}, [r1]!; \
+ veor q1, q1, q8; \
+ veor q2, q2, q9; \
+ vld1.8 {q8-q9}, [r1]!; \
+ vst1.8 {q1-q2}, [r8]!; \
+ veor q3, q3, q8; \
+ veor q4, q4, q9; \
+ vst1.8 {q3-q4}, [r8]; \
+ \
+ bhs .Locb_enc_loop4_##bits; \
+ cmp r6, #0; \
+ beq .Locb_enc_done; \
+ \
+ .Locb_enc_loop_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* Checksum_i = Checksum_{i-1} xor P_i */ \
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \
+ \
+ rbit r8, lr; \
+ add lr, #1; \
+ clz r8, r8; /* ntz(i) */ \
+ add r8, r5, r8, lsl #4; \
+ \
+ vld1.8 {q1}, [r2]!; /* load plaintext */ \
+ vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \
+ vld1.8 {q3}, [r4]; /* load checksum */ \
+ subs r6, #1; \
+ veor q0, q0, q2; \
+ veor q3, q3, q1; \
+ veor q1, q1, q0; \
+ vst1.8 {q3}, [r4]; /* store checksum */ \
+ \
+ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \
+ \
+ veor q1, q1, q0; \
+ vst1.8 {q1}, [r1]!; /* store ciphertext */ \
+ \
+ bne .Locb_enc_loop_##bits; \
+ b .Locb_enc_done;
+
+ OCB_ENC(128re, r0, r12)
+ OCB_ENC(192, r0, r12)
+ OCB_ENC(256, r0, r12)
+
+#undef OCB_ENC
+
+.Locb_enc_done:
+ vst1.8 {q0}, [r3] /* store offset */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+ pop {r4-r12,lr}
+ vpop {q4-q7}
+ bx lr
+.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;
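[Editor's note] The rbit/clz pair in OCB_ENC computes ntz(i), the number of trailing zero bits of the (1-based) block number, which selects the precomputed L-table entry xored into the running offset, as in RFC 7253. Sketch with illustrative names:

    #include <stdint.h>

    /* ntz(i) for i > 0; rbit+clz in the assembly computes the same value. */
    static unsigned int
    ntz32 (uint32_t i)
    {
      unsigned int n = 0;

      while ((i & 1) == 0)
        {
          i >>= 1;
          n++;
        }
      return n;
    }

    /* Offset_i = Offset_{i-1} xor L_{ntz(i)}, with 16-byte L-table entries. */
    static void
    ocb_update_offset (uint8_t offset[16], const uint8_t *L_table, uint32_t blkn)
    {
      const uint8_t *L = L_table + 16 * ntz32 (blkn);
      int j;

      for (j = 0; j < 16; j++)
        offset[j] ^= L[j];
    }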
+
+
+/*
+ * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *offset,
+ * unsigned char *checksum,
+ * unsigned char *L_table,
+ * size_t nblocks,
+ * unsigned int nrounds,
+ * unsigned int blkn);
+ */
+
+.align 3
+.globl _gcry_aes_ocb_dec_armv8_ce
+.type _gcry_aes_ocb_dec_armv8_ce,%function;
+_gcry_aes_ocb_dec_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: offset
+ * %st+0: checksum => r4
+ * %st+4: Ls => r5
+ * %st+8: nblocks => r6 (0 < nblocks <= 32)
+ * %st+12: nrounds => r7
+ * %st+16: blkn => lr
+ */
+
+ vpush {q4-q7}
+ push {r4-r12,lr} /* 4*16 + 4*10 = 104b */
+ ldr r7, [sp, #(104+12)]
+ ldr r4, [sp, #(104+0)]
+ ldr r5, [sp, #(104+4)]
+ ldr r6, [sp, #(104+8)]
+ ldr lr, [sp, #(104+16)]
+
+ cmp r7, #12
+ vld1.8 {q0}, [r3] /* load offset */
+
+ aes_preload_keys(r0, r12);
+
+ beq .Locb_dec_entry_192
+ bhi .Locb_dec_entry_256
+
+#define OCB_DEC(bits, ...) \
+ .Locb_dec_entry_##bits: \
+ cmp r6, #4; \
+ add lr, #1; \
+ blo .Locb_dec_loop_##bits; \
+ \
+ .Locb_dec_loop4_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \
+ /* Checksum_i = Checksum_{i-1} xor P_i */ \
+ \
+ add r9, lr, #1; \
+ add r10, lr, #2; \
+ add r11, lr, #3; \
+ rbit r8, lr; \
+ add lr, lr, #4; \
+ rbit r9, r9; \
+ rbit r10, r10; \
+ rbit r11, r11; \
+ clz r8, r8; /* ntz(i+0) */ \
+ clz r9, r9; /* ntz(i+1) */ \
+ clz r10, r10; /* ntz(i+2) */ \
+ clz r11, r11; /* ntz(i+3) */ \
+ add r8, r5, r8, lsl #4; \
+ add r9, r5, r9, lsl #4; \
+ add r10, r5, r10, lsl #4; \
+ add r11, r5, r11, lsl #4; \
+ \
+ sub r6, #4; \
+ \
+ vld1.8 {q9}, [r8]; /* load L_{ntz(i+0)} */ \
+ vld1.8 {q1-q2}, [r2]!; /* load P_i+<0-1> */ \
+ veor q0, q0, q9; /* Offset_i+0 */ \
+ vld1.8 {q9}, [r9]; /* load L_{ntz(i+1)} */ \
+ veor q1, q1, q0; /* P_i+0 xor Offset_i+0 */\
+ vld1.8 {q3-q4}, [r2]!; /* load P_i+<2-3> */ \
+ vst1.8 {q0}, [r1]!; /* store Offset_i+0 */\
+ veor q0, q0, q9; /* Offset_i+1 */ \
+ vld1.8 {q9}, [r10]; /* load L_{ntz(i+2)} */ \
+ veor q2, q2, q0; /* P_i+1 xor Offset_i+1 */\
+ vst1.8 {q0}, [r1]!; /* store Offset_i+1 */\
+ veor q0, q0, q9; /* Offset_i+2 */ \
+ vld1.8 {q9}, [r11]; /* load L_{ntz(i+3)} */ \
+ veor q3, q3, q0; /* P_i+2 xor Offset_i+2 */\
+ vst1.8 {q0}, [r1]!; /* store Offset_i+2 */\
+ veor q0, q0, q9; /* Offset_i+3 */ \
+ veor q4, q4, q0; /* P_i+3 xor Offset_i+3 */\
+ vst1.8 {q0}, [r1]; /* store Offset_i+3 */\
+ sub r1, #(3*16); \
+ \
+ cmp r6, #4; \
+ \
+ do_aes_4_##bits(d, imc, q1, q2, q3, q4, ##__VA_ARGS__); \
+ \
+ mov r8, r1; \
+ vld1.8 {q8-q9}, [r1]!; \
+ veor q1, q1, q8; \
+ veor q2, q2, q9; \
+ vld1.8 {q8-q9}, [r1]!; \
+ vst1.8 {q1-q2}, [r8]!; \
+ veor q1, q1, q2; \
+ vld1.8 {q2}, [r4]; /* load Checksum_{i-1} */ \
+ veor q3, q3, q8; \
+ veor q1, q1, q3; \
+ veor q4, q4, q9; \
+ veor q1, q1, q4; \
+ vst1.8 {q3-q4}, [r8]; \
+ veor q2, q2, q1; \
+ vst1.8 {q2}, [r4]; /* store Checksum_i+3 */ \
+ \
+ bhs .Locb_dec_loop4_##bits; \
+ cmp r6, #0; \
+ beq .Locb_dec_done; \
+ \
+ .Locb_dec_loop_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \
+ /* Checksum_i = Checksum_{i-1} xor P_i */ \
+ \
+ rbit r8, lr; \
+ add lr, #1; \
+ clz r8, r8; /* ntz(i) */ \
+ add r8, r5, r8, lsl #4; \
+ \
+ vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \
+ vld1.8 {q1}, [r2]!; /* load ciphertext */ \
+ subs r6, #1; \
+ veor q0, q0, q2; \
+ veor q1, q1, q0; \
+ \
+ do_aes_one##bits(d, imc, q1, q1, ##__VA_ARGS__) \
+ \
+ vld1.8 {q2}, [r4]; /* load checksum */ \
+ veor q1, q1, q0; \
+ vst1.8 {q1}, [r1]!; /* store plaintext */ \
+ veor q2, q2, q1; \
+ vst1.8 {q2}, [r4]; /* store checksum */ \
+ \
+ bne .Locb_dec_loop_##bits; \
+ b .Locb_dec_done;
+
+ OCB_DEC(128re, r0, r12)
+ OCB_DEC(192, r0, r12)
+ OCB_DEC(256, r0, r12)
+
+#undef OCB_DEC
+
+.Locb_dec_done:
+ vst1.8 {q0}, [r3] /* store offset */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+ pop {r4-r12,lr}
+ vpop {q4-q7}
+ bx lr
+.size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;
+
+
+/*
+ * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
+ * const unsigned char *abuf,
+ * unsigned char *offset,
+ * unsigned char *checksum,
+ * unsigned char *L_table,
+ * size_t nblocks,
+ * unsigned int nrounds,
+ * unsigned int blkn);
+ */
+
+.align 3
+.globl _gcry_aes_ocb_auth_armv8_ce
+.type _gcry_aes_ocb_auth_armv8_ce,%function;
+_gcry_aes_ocb_auth_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: abuf
+ * r2: offset
+ * r3: checksum
+ * %st+0: Ls => r5
+ * %st+4: nblocks => r6 (0 < nblocks <= 32)
+ * %st+8: nrounds => r7
+ * %st+12: blkn => lr
+ */
+
+ vpush {q4-q7}
+ push {r4-r12,lr} /* 4*16 + 4*10 = 104b */
+ ldr r7, [sp, #(104+8)]
+ ldr r5, [sp, #(104+0)]
+ ldr r6, [sp, #(104+4)]
+ ldr lr, [sp, #(104+12)]
+
+ cmp r7, #12
+ vld1.8 {q0}, [r2] /* load offset */
+
+ aes_preload_keys(r0, r12);
+
+ beq .Locb_auth_entry_192
+ bhi .Locb_auth_entry_256
+
+#define OCB_AUTH(bits, ...) \
+ .Locb_auth_entry_##bits: \
+ cmp r6, #4; \
+ add lr, #1; \
+ blo .Locb_auth_loop_##bits; \
+ \
+ .Locb_auth_loop4_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \
+ \
+ add r9, lr, #1; \
+ add r10, lr, #2; \
+ add r11, lr, #3; \
+ rbit r8, lr; \
+ add lr, lr, #4; \
+ rbit r9, r9; \
+ rbit r10, r10; \
+ rbit r11, r11; \
+ clz r8, r8; /* ntz(i+0) */ \
+ clz r9, r9; /* ntz(i+1) */ \
+ clz r10, r10; /* ntz(i+2) */ \
+ clz r11, r11; /* ntz(i+3) */ \
+ add r8, r5, r8, lsl #4; \
+ add r9, r5, r9, lsl #4; \
+ add r10, r5, r10, lsl #4; \
+ add r11, r5, r11, lsl #4; \
+ \
+ sub r6, #4; \
+ \
+ vld1.8 {q9}, [r8]; /* load L_{ntz(i+0)} */ \
+ vld1.8 {q1-q2}, [r1]!; /* load A_i+<0-1> */ \
+ veor q0, q0, q9; /* Offset_i+0 */ \
+ vld1.8 {q9}, [r9]; /* load L_{ntz(i+1)} */ \
+ veor q1, q1, q0; /* A_i+0 xor Offset_i+0 */\
+ vld1.8 {q3-q4}, [r1]!; /* load A_i+<2-3> */ \
+ veor q0, q0, q9; /* Offset_i+1 */ \
+ vld1.8 {q9}, [r10]; /* load L_{ntz(i+2)} */ \
+ veor q2, q2, q0; /* A_i+1 xor Offset_i+1 */\
+ veor q0, q0, q9; /* Offset_i+2 */ \
+ vld1.8 {q9}, [r11]; /* load L_{ntz(i+3)} */ \
+ veor q3, q3, q0; /* A_i+2 xor Offset_i+2 */\
+ veor q0, q0, q9; /* Offset_i+3 */ \
+ veor q4, q4, q0; /* A_i+3 xor Offset_i+3 */\
+ \
+ cmp r6, #4; \
+ \
+ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \
+ \
+ veor q1, q1, q2; \
+ veor q3, q3, q4; \
+ vld1.8 {q2}, [r3]; \
+ veor q1, q1, q3; \
+ veor q2, q2, q1; \
+ vst1.8 {q2}, [r3]; \
+ \
+ bhs .Locb_auth_loop4_##bits; \
+ cmp r6, #0; \
+ beq .Locb_auth_done; \
+ \
+ .Locb_auth_loop_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \
+ \
+ rbit r8, lr; \
+ add lr, #1; \
+ clz r8, r8; /* ntz(i) */ \
+ add r8, r5, r8, lsl #4; \
+ \
+ vld1.8 {q2}, [r8]; /* load L_{ntz(i)} */ \
+ vld1.8 {q1}, [r1]!; /* load aadtext */ \
+ subs r6, #1; \
+ veor q0, q0, q2; \
+ vld1.8 {q2}, [r3]; /* load checksum */ \
+ veor q1, q1, q0; \
+ \
+ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__) \
+ \
+ veor q2, q2, q1; \
+ vst1.8 {q2}, [r3]; /* store checksum */ \
+ \
+ bne .Locb_auth_loop_##bits; \
+ b .Locb_auth_done;
+
+ OCB_AUTH(128re, r0, r12)
+ OCB_AUTH(192, r0, r12)
+ OCB_AUTH(256, r0, r12)
+
+#undef OCB_AUTH
+
+.Locb_auth_done:
+ vst1.8 {q0}, [r2] /* store offset */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+ pop {r4-r12,lr}
+ vpop {q4-q7}
+ bx lr
+.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;
+
+
+
+/*
+ * void _gcry_aes_xts_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_xts_enc_armv8_ce
+.type _gcry_aes_xts_enc_armv8_ce,%function;
+_gcry_aes_xts_enc_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: iv
+ * %st+0: nblocks => r4
+ * %st+4: nrounds => r5
+ */
+
+ vpush {q4-q7}
+ push {r4-r12,lr} /* 4*16 + 4*10 = 104b */
+ ldr r4, [sp, #(104+0)]
+ ldr r5, [sp, #(104+4)]
+ cmp r4, #0
+ beq .Lxts_enc_skip
+
+ cmp r5, #12
+
+ vld1.8 {q0}, [r3] /* load tweak */
+ mov r7, #0x87;
+
+ aes_preload_keys(r0, r6);
+
+ beq .Lxts_enc_entry_192
+ bhi .Lxts_enc_entry_256
+
+#define CTR_XTS(bits, ...) \
+ .Lxts_enc_entry_##bits: \
+ cmp r4, #4; \
+ blo .Lxts_enc_loop_##bits; \
+ \
+ .Lxts_enc_loop4_##bits: \
+ sub r4, r4, #4; \
+ veor q9, q9, q9; \
+ \
+ vld1.8 {q1-q2}, [r2]!; /* load plaintext */ \
+ veor q1, q1, q0; \
+ cmp r4, #4; \
+ vmov.u32 d18[0], r7; \
+ vst1.8 {q0}, [r1]!; /* store tweak0 to temp */ \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ \
+ vld1.8 {q3-q4}, [r2]!; /* load plaintext */ \
+ veor q2, q2, q0; \
+ vst1.8 {q0}, [r1]!; /* store tweak1 to temp */ \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ \
+ veor q3, q3, q0; \
+ vst1.8 {q0}, [r1]!; /* store tweak2 to temp */ \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ \
+ veor q4, q4, q0; \
+ vst1.8 {q0}, [r1]; /* store tweak3 to temp */ \
+ sub r1, r1, #48; \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ \
+ do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \
+ \
+ vld1.8 {q8-q9}, [r1]!; /* load tweak from temp */ \
+ veor q1, q1, q8; \
+ veor q2, q2, q9; \
+ vld1.8 {q8-q9}, [r1]; /* load tweak from temp */ \
+ sub r1, r1, #32; \
+ veor q3, q3, q8; \
+ veor q4, q4, q9; \
+ vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \
+ vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \
+ \
+ bhs .Lxts_enc_loop4_##bits; \
+ cmp r4, #0; \
+ beq .Lxts_enc_done; \
+ \
+ .Lxts_enc_loop_##bits: \
+ \
+ vld1.8 {q1}, [r2]!; /* load ciphertext */ \
+ \
+ veor q9, q9, q9; \
+ veor q1, q1, q0; \
+ vmov.u32 d18[0], r7; \
+ vmov q2, q0; \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ subs r4, r4, #1; \
+ \
+ do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \
+ \
+ veor q1, q1, q2; \
+ vst1.8 {q1}, [r1]!; /* store plaintext */ \
+ \
+ bne .Lxts_enc_loop_##bits; \
+ b .Lxts_enc_done;
+
+ CTR_XTS(128re, r0, r6)
+ CTR_XTS(192, r0, r6)
+ CTR_XTS(256, r0, r6)
+
+#undef CTR_XTS
+
+.Lxts_enc_done:
+ vst1.8 {q0}, [r3] /* store tweak */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+.Lxts_enc_skip:
+ pop {r4-r12,lr}
+ vpop {q4-q7}
+ bx lr
+.size _gcry_aes_xts_enc_armv8_ce,.-_gcry_aes_xts_enc_armv8_ce;
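[Editor's note] The recurring vshr/vadd/vand/veor block in CTR_XTS is the XTS tweak update: multiply the 128-bit tweak by x in GF(2^128), reducing with x^128 + x^7 + x^2 + x + 1 (the 0x87 constant loaded into r7). A byte-wise equivalent on the little-endian tweak:

    #include <stdint.h>

    static void
    xts_mult_x (uint8_t tweak[16])
    {
      unsigned int carry = 0;
      int i;

      for (i = 0; i < 16; i++)
        {
          unsigned int next_carry = tweak[i] >> 7;

          tweak[i] = (uint8_t)((tweak[i] << 1) | carry);
          carry = next_carry;
        }
      if (carry)
        tweak[0] ^= 0x87;   /* fold x^128 back in via the reduction polynomial */
    }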
+
+
+/*
+ * void _gcry_aes_xts_dec_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_xts_dec_armv8_ce
+.type _gcry_aes_xts_dec_armv8_ce,%function;
+_gcry_aes_xts_dec_armv8_ce:
+ /* input:
+ * r0: keysched
+ * r1: outbuf
+ * r2: inbuf
+ * r3: iv
+ * %st+0: nblocks => r4
+ * %st+4: nrounds => r5
+ */
+
+ vpush {q4-q7}
+ push {r4-r12,lr} /* 4*16 + 4*10 = 104b */
+ ldr r4, [sp, #(104+0)]
+ ldr r5, [sp, #(104+4)]
+ cmp r4, #0
+ beq .Lxts_dec_skip
+
+ cmp r5, #12
+
+ vld1.8 {q0}, [r3] /* load tweak */
+ mov r7, #0x87;
+
+ aes_preload_keys(r0, r6);
+
+ beq .Lxts_dec_entry_192
+ bhi .Lxts_dec_entry_256
+
+#define CTR_XTS(bits, ...) \
+ .Lxts_dec_entry_##bits: \
+ cmp r4, #4; \
+ blo .Lxts_dec_loop_##bits; \
+ \
+ .Lxts_dec_loop4_##bits: \
+ sub r4, r4, #4; \
+ veor q9, q9, q9; \
+ \
+ vld1.8 {q1-q2}, [r2]!; /* load plaintext */ \
+ veor q1, q1, q0; \
+ cmp r4, #4; \
+ vmov.u32 d18[0], r7; \
+ vst1.8 {q0}, [r1]!; /* store tweak0 to temp */ \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ \
+ vld1.8 {q3-q4}, [r2]!; /* load plaintext */ \
+ veor q2, q2, q0; \
+ vst1.8 {q0}, [r1]!; /* store tweak1 to temp */ \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ \
+ veor q3, q3, q0; \
+ vst1.8 {q0}, [r1]!; /* store tweak2 to temp */ \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ \
+ veor q4, q4, q0; \
+ vst1.8 {q0}, [r1]; /* store tweak3 to temp */ \
+ sub r1, r1, #48; \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ \
+ do_aes_4_##bits(d, imc, q1, q2, q3, q4, ##__VA_ARGS__); \
+ \
+ vld1.8 {q8-q9}, [r1]!; /* load tweak from temp */ \
+ veor q1, q1, q8; \
+ veor q2, q2, q9; \
+ vld1.8 {q8-q9}, [r1]; /* load tweak from temp */ \
+ sub r1, r1, #32; \
+ veor q3, q3, q8; \
+ veor q4, q4, q9; \
+ vst1.8 {q1-q2}, [r1]!; /* store plaintext */ \
+ vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \
+ \
+ bhs .Lxts_dec_loop4_##bits; \
+ cmp r4, #0; \
+ beq .Lxts_dec_done; \
+ \
+ .Lxts_dec_loop_##bits: \
+ \
+ vld1.8 {q1}, [r2]!; /* load ciphertext */ \
+ \
+ veor q9, q9, q9; \
+ veor q1, q1, q0; \
+ vmov.u32 d18[0], r7; \
+ vmov q2, q0; \
+ \
+ vshr.s64 d16, d1, #63; \
+ vshr.u64 d17, d0, #63; \
+ vadd.u64 q0, q0, q0; \
+ vand d16, d16, d18; \
+ veor q0, q0, q8; \
+ subs r4, r4, #1; \
+ \
+ do_aes_one##bits(d, imc, q1, q1, ##__VA_ARGS__); \
+ \
+ veor q1, q1, q2; \
+ vst1.8 {q1}, [r1]!; /* store plaintext */ \
+ \
+ bne .Lxts_dec_loop_##bits; \
+ b .Lxts_dec_done;
+
+ CTR_XTS(128re, r0, r6)
+ CTR_XTS(192, r0, r6)
+ CTR_XTS(256, r0, r6)
+
+#undef CTR_XTS
+
+.Lxts_dec_done:
+ vst1.8 {q0}, [r3] /* store tweak */
+
+ CLEAR_REG(q0)
+ CLEAR_REG(q1)
+ CLEAR_REG(q2)
+ CLEAR_REG(q3)
+ CLEAR_REG(q8)
+ CLEAR_REG(q9)
+ CLEAR_REG(q10)
+ CLEAR_REG(q11)
+ CLEAR_REG(q12)
+ CLEAR_REG(q13)
+ CLEAR_REG(q14)
+
+.Lxts_dec_skip:
+ pop {r4-r12,lr}
+ vpop {q4-q7}
+ bx lr
+.size _gcry_aes_xts_dec_armv8_ce,.-_gcry_aes_xts_dec_armv8_ce;
+
+
+/*
+ * u32 _gcry_aes_sbox4_armv8_ce(u32 in4b);
+ */
+.align 3
+.globl _gcry_aes_sbox4_armv8_ce
+.type _gcry_aes_sbox4_armv8_ce,%function;
+_gcry_aes_sbox4_armv8_ce:
+ /* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
+ * Cryptology — CT-RSA 2015" for details.
+ */
+ vmov.i8 q0, #0x52
+ vmov.i8 q1, #0
+ vmov s0, r0
+ aese.8 q0, q1
+ veor d0, d1
+ vpadd.i32 d0, d0, d1
+ vmov r0, s0
+ CLEAR_REG(q0)
+ bx lr
+.size _gcry_aes_sbox4_armv8_ce,.-_gcry_aes_sbox4_armv8_ce;
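[Editor's note] _gcry_aes_sbox4_armv8_ce relies on the fact that AESE with an all-zero round key reduces to SubBytes followed by ShiftRows; the unused lanes are pre-filled with 0x52 because S(0x52) = 0x00, so the veor/vpadd at the end can gather the four wanted bytes. The same idea applied to a full 16-byte vector, with ShiftRows undone explicitly (an illustrative sketch, not this function's exact lane bookkeeping):

    #include <arm_neon.h>   /* build with the ARMv8 crypto extension enabled */
    #include <stdint.h>

    /* Inverse of the ShiftRows byte permutation (column-major AES state). */
    static const uint8_t inv_shift_rows[16] =
      { 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3 };

    /* Apply the AES S-box to 16 bytes at once using AESE with a zero key. */
    static void
    aes_sbox16_ce (uint8_t out[16], const uint8_t in[16])
    {
      uint8_t tmp[16];
      uint8x16_t v = vaeseq_u8 (vld1q_u8 (in), vdupq_n_u8 (0));
      int i;

      vst1q_u8 (tmp, v);
      for (i = 0; i < 16; i++)
        out[i] = tmp[inv_shift_rows[i]];   /* undo ShiftRows */
    }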
+
+
+/*
+ * void _gcry_aes_invmixcol_armv8_ce(void *dst, const void *src);
+ */
+.align 3
+.globl _gcry_aes_invmixcol_armv8_ce
+.type _gcry_aes_invmixcol_armv8_ce,%function;
+_gcry_aes_invmixcol_armv8_ce:
+ vld1.8 {q0}, [r1]
+ aesimc.8 q0, q0
+ vst1.8 {q0}, [r0]
+ CLEAR_REG(q0)
+ bx lr
+.size _gcry_aes_invmixcol_armv8_ce,.-_gcry_aes_invmixcol_armv8_ce;
+
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-armv8-aarch64-ce.S b/comm/third_party/libgcrypt/cipher/rijndael-armv8-aarch64-ce.S
new file mode 100644
index 0000000000..3af29e0d0c
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael-armv8-aarch64-ce.S
@@ -0,0 +1,1613 @@
+/* rijndael-armv8-aarch64-ce.S - ARMv8/CE accelerated AES
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+
+.cpu generic+simd+crypto
+
+.text
+
+
+/* Register macros */
+
+#define vk0 v17
+#define vk1 v18
+#define vk2 v19
+#define vk3 v20
+#define vk4 v21
+#define vk5 v22
+#define vk6 v23
+#define vk7 v24
+#define vk8 v25
+#define vk9 v26
+#define vk10 v27
+#define vk11 v28
+#define vk12 v29
+#define vk13 v30
+#define vk14 v31
+
+
+/* AES macros */
+
+#define aes_preload_keys(keysched, nrounds) \
+ cmp nrounds, #12; \
+ ld1 {vk0.16b-vk3.16b}, [keysched], #64; \
+ ld1 {vk4.16b-vk7.16b}, [keysched], #64; \
+ ld1 {vk8.16b-vk10.16b}, [keysched], #48; \
+ b.lo 1f; \
+ ld1 {vk11.16b-vk12.16b}, [keysched], #32; \
+ b.eq 1f; \
+ ld1 {vk13.16b-vk14.16b}, [keysched]; \
+1: ;
+
+#define do_aes_one128(ed, mcimc, vo, vb) \
+ aes##ed vb.16b, vk0.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk1.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk2.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk3.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk4.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk5.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk6.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk7.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk8.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk9.16b; \
+ eor vo.16b, vb.16b, vk10.16b;
+
+#define do_aes_one192(ed, mcimc, vo, vb) \
+ aes##ed vb.16b, vk0.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk1.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk2.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk3.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk4.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk5.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk6.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk7.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk8.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk9.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk10.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk11.16b; \
+ eor vo.16b, vb.16b, vk12.16b;
+
+#define do_aes_one256(ed, mcimc, vo, vb) \
+ aes##ed vb.16b, vk0.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk1.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk2.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk3.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk4.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk5.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk6.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk7.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk8.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk9.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk10.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk11.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk12.16b; \
+ aes##mcimc vb.16b, vb.16b; \
+ aes##ed vb.16b, vk13.16b; \
+ eor vo.16b, vb.16b, vk14.16b;
+
+#define aes_round_4(ed, mcimc, b0, b1, b2, b3, key) \
+ aes##ed b0.16b, key.16b; \
+ aes##mcimc b0.16b, b0.16b; \
+ aes##ed b1.16b, key.16b; \
+ aes##mcimc b1.16b, b1.16b; \
+ aes##ed b2.16b, key.16b; \
+ aes##mcimc b2.16b, b2.16b; \
+ aes##ed b3.16b, key.16b; \
+ aes##mcimc b3.16b, b3.16b;
+
+#define aes_lastround_4(ed, b0, b1, b2, b3, key1, key2) \
+ aes##ed b0.16b, key1.16b; \
+ eor b0.16b, b0.16b, key2.16b; \
+ aes##ed b1.16b, key1.16b; \
+ eor b1.16b, b1.16b, key2.16b; \
+ aes##ed b2.16b, key1.16b; \
+ eor b2.16b, b2.16b, key2.16b; \
+ aes##ed b3.16b, key1.16b; \
+ eor b3.16b, b3.16b, key2.16b;
+
+#define do_aes_4_128(ed, mcimc, b0, b1, b2, b3) \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \
+ aes_lastround_4(ed, b0, b1, b2, b3, vk9, vk10);
+
+#define do_aes_4_192(ed, mcimc, b0, b1, b2, b3) \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk9); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk10); \
+ aes_lastround_4(ed, b0, b1, b2, b3, vk11, vk12);
+
+#define do_aes_4_256(ed, mcimc, b0, b1, b2, b3) \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk9); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk10); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk11); \
+ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk12); \
+ aes_lastround_4(ed, b0, b1, b2, b3, vk13, vk14);
+
+
+/* Other functional macros */
+
+#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b;
+
+#define aes_clear_keys(nrounds) \
+ cmp nrounds, #12; \
+ CLEAR_REG(vk0); \
+ CLEAR_REG(vk1); \
+ CLEAR_REG(vk2); \
+ CLEAR_REG(vk3); \
+ CLEAR_REG(vk4); \
+ CLEAR_REG(vk5); \
+ CLEAR_REG(vk6); \
+ CLEAR_REG(vk7); \
+ CLEAR_REG(vk9); \
+ CLEAR_REG(vk8); \
+ CLEAR_REG(vk10); \
+ b.lo 1f; \
+ CLEAR_REG(vk11); \
+ CLEAR_REG(vk12); \
+ b.eq 1f; \
+ CLEAR_REG(vk13); \
+ CLEAR_REG(vk14); \
+1: ;
+
+
+/*
+ * unsigned int _gcry_aes_enc_armv8_ce(void *keysched, byte *dst,
+ * const byte *src,
+ * unsigned int nrounds);
+ */
+.align 3
+.globl _gcry_aes_enc_armv8_ce
+ELF(.type _gcry_aes_enc_armv8_ce,%function;)
+_gcry_aes_enc_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: dst
+ * x2: src
+ * w3: nrounds
+ */
+ CFI_STARTPROC();
+
+ aes_preload_keys(x0, w3);
+
+ ld1 {v0.16b}, [x2]
+
+ b.hi .Lenc1_256
+ b.eq .Lenc1_192
+
+.Lenc1_128:
+ do_aes_one128(e, mc, v0, v0);
+
+.Lenc1_tail:
+ CLEAR_REG(vk0)
+ CLEAR_REG(vk1)
+ CLEAR_REG(vk2)
+ CLEAR_REG(vk3)
+ CLEAR_REG(vk4)
+ CLEAR_REG(vk5)
+ CLEAR_REG(vk6)
+ CLEAR_REG(vk7)
+ CLEAR_REG(vk8)
+ CLEAR_REG(vk9)
+ CLEAR_REG(vk10)
+ st1 {v0.16b}, [x1]
+ CLEAR_REG(v0)
+
+ mov x0, #0
+ ret
+
+.Lenc1_192:
+ do_aes_one192(e, mc, v0, v0);
+
+ CLEAR_REG(vk11)
+ CLEAR_REG(vk12)
+ b .Lenc1_tail
+
+.Lenc1_256:
+ do_aes_one256(e, mc, v0, v0);
+
+ CLEAR_REG(vk11)
+ CLEAR_REG(vk12)
+ CLEAR_REG(vk13)
+ CLEAR_REG(vk14)
+ b .Lenc1_tail
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_enc_armv8_ce,.-_gcry_aes_enc_armv8_ce;)
+
+
+/*
+ * unsigned int _gcry_aes_dec_armv8_ce(void *keysched, byte *dst,
+ * const byte *src,
+ * unsigned int nrounds);
+ */
+.align 3
+.globl _gcry_aes_dec_armv8_ce
+ELF(.type _gcry_aes_dec_armv8_ce,%function;)
+_gcry_aes_dec_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: dst
+ * x2: src
+ * w3: nrounds
+ */
+ CFI_STARTPROC();
+
+ aes_preload_keys(x0, w3);
+
+ ld1 {v0.16b}, [x2]
+
+ b.hi .Ldec1_256
+ b.eq .Ldec1_192
+
+.Ldec1_128:
+ do_aes_one128(d, imc, v0, v0);
+
+.Ldec1_tail:
+ CLEAR_REG(vk0)
+ CLEAR_REG(vk1)
+ CLEAR_REG(vk2)
+ CLEAR_REG(vk3)
+ CLEAR_REG(vk4)
+ CLEAR_REG(vk5)
+ CLEAR_REG(vk6)
+ CLEAR_REG(vk7)
+ CLEAR_REG(vk8)
+ CLEAR_REG(vk9)
+ CLEAR_REG(vk10)
+ st1 {v0.16b}, [x1]
+ CLEAR_REG(v0)
+
+ mov x0, #0
+ ret
+
+.Ldec1_192:
+ do_aes_one192(d, imc, v0, v0);
+
+ CLEAR_REG(vk11)
+ CLEAR_REG(vk12)
+ b .Ldec1_tail
+
+.Ldec1_256:
+ do_aes_one256(d, imc, v0, v0);
+
+ CLEAR_REG(vk11)
+ CLEAR_REG(vk12)
+ CLEAR_REG(vk13)
+ CLEAR_REG(vk14)
+ b .Ldec1_tail
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce;)
+
+
+/*
+ * void _gcry_aes_cbc_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, size_t nblocks,
+ * int cbc_mac, unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cbc_enc_armv8_ce
+ELF(.type _gcry_aes_cbc_enc_armv8_ce,%function;)
+_gcry_aes_cbc_enc_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: outbuf
+ * x2: inbuf
+ * x3: iv
+ * x4: nblocks
+ * w5: cbc_mac
+ * w6: nrounds
+ */
+ CFI_STARTPROC();
+
+ cbz x4, .Lcbc_enc_skip
+
+ cmp w5, #0
+ ld1 {v1.16b}, [x3] /* load IV */
+ cset x5, eq
+
+ aes_preload_keys(x0, w6);
+ lsl x5, x5, #4
+
+ b.eq .Lcbc_enc_loop192
+ b.hi .Lcbc_enc_loop256
+
+#define CBC_ENC(bits) \
+ .Lcbc_enc_loop##bits: \
+ ld1 {v0.16b}, [x2], #16; /* load plaintext */ \
+ eor v1.16b, v0.16b, v1.16b; \
+ sub x4, x4, #1; \
+ \
+ do_aes_one##bits(e, mc, v1, v1); \
+ \
+ st1 {v1.16b}, [x1], x5; /* store ciphertext */ \
+ \
+ cbnz x4, .Lcbc_enc_loop##bits; \
+ b .Lcbc_enc_done;
+
+ CBC_ENC(128)
+ CBC_ENC(192)
+ CBC_ENC(256)
+
+#undef CBC_ENC
+
+.Lcbc_enc_done:
+ aes_clear_keys(w6)
+
+ st1 {v1.16b}, [x3] /* store IV */
+
+ CLEAR_REG(v1)
+ CLEAR_REG(v0)
+
+.Lcbc_enc_skip:
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce;)
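The cmp/cset/lsl sequence above computes the output-pointer stride used by the post-indexed store "st1 {v1.16b}, [x1], x5": 16 bytes per block for plain CBC, 0 for CBC-MAC so every ciphertext block overwrites the previous one and only the final MAC block remains. A minimal C sketch of the same selection (the helper name is illustrative):

/* Sketch of the cset/lsl trick: x5 = (cbc_mac == 0) ? 16 : 0 */
static size_t
cbc_out_stride (int cbc_mac)
{
  return cbc_mac ? 0 : 16;  /* CBC-MAC keeps rewriting one output block */
}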
+
+/*
+ * void _gcry_aes_cbc_dec_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cbc_dec_armv8_ce
+ELF(.type _gcry_aes_cbc_dec_armv8_ce,%function;)
+_gcry_aes_cbc_dec_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: outbuf
+ * x2: inbuf
+ * x3: iv
+ * x4: nblocks
+ * w5: nrounds
+ */
+ CFI_STARTPROC();
+
+ cbz x4, .Lcbc_dec_skip
+
+ ld1 {v0.16b}, [x3] /* load IV */
+
+ aes_preload_keys(x0, w5);
+
+ b.eq .Lcbc_dec_entry_192
+ b.hi .Lcbc_dec_entry_256
+
+#define CBC_DEC(bits) \
+ .Lcbc_dec_entry_##bits: \
+ cmp x4, #4; \
+ b.lo .Lcbc_dec_loop_##bits; \
+ \
+ .Lcbc_dec_loop4_##bits: \
+ \
+ ld1 {v1.16b-v4.16b}, [x2], #64; /* load ciphertext */ \
+ sub x4, x4, #4; \
+ mov v5.16b, v1.16b; \
+ mov v6.16b, v2.16b; \
+ mov v7.16b, v3.16b; \
+ mov v16.16b, v4.16b; \
+ cmp x4, #4; \
+ \
+ do_aes_4_##bits(d, imc, v1, v2, v3, v4); \
+ \
+ eor v1.16b, v1.16b, v0.16b; \
+ eor v2.16b, v2.16b, v5.16b; \
+ st1 {v1.16b-v2.16b}, [x1], #32; /* store plaintext */ \
+ eor v3.16b, v3.16b, v6.16b; \
+ eor v4.16b, v4.16b, v7.16b; \
+ mov v0.16b, v16.16b; /* next IV */ \
+ st1 {v3.16b-v4.16b}, [x1], #32; /* store plaintext */ \
+ \
+ b.hs .Lcbc_dec_loop4_##bits; \
+ CLEAR_REG(v3); \
+ CLEAR_REG(v4); \
+ CLEAR_REG(v5); \
+ CLEAR_REG(v6); \
+ CLEAR_REG(v7); \
+ CLEAR_REG(v16); \
+ cbz x4, .Lcbc_dec_done; \
+ \
+ .Lcbc_dec_loop_##bits: \
+ ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \
+ sub x4, x4, #1; \
+ mov v2.16b, v1.16b; \
+ \
+ do_aes_one##bits(d, imc, v1, v1); \
+ \
+ eor v1.16b, v1.16b, v0.16b; \
+ mov v0.16b, v2.16b; \
+ st1 {v1.16b}, [x1], #16; /* store plaintext */ \
+ \
+ cbnz x4, .Lcbc_dec_loop_##bits; \
+ b .Lcbc_dec_done;
+
+ CBC_DEC(128)
+ CBC_DEC(192)
+ CBC_DEC(256)
+
+#undef CBC_DEC
+
+.Lcbc_dec_done:
+ aes_clear_keys(w5)
+
+ st1 {v0.16b}, [x3] /* store IV */
+
+ CLEAR_REG(v0)
+ CLEAR_REG(v1)
+ CLEAR_REG(v2)
+
+.Lcbc_dec_skip:
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce;)
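Unlike CBC encryption, CBC decryption has no serial dependency on the cipher output, which is why the loop above can keep four aesd pipelines busy: each plaintext block only needs the previous ciphertext block, P_i = Dec_K(C_i) xor C_{i-1}. A one-block C sketch of that data flow, reusing the _gcry_aes_dec_armv8_ce declaration from rijndael-armv8-ce.c below (the wrapper itself is illustrative, not part of the library):

#include <string.h>

static void
cbc_dec_one_block_sketch (const void *keysched, unsigned char *out,
                          const unsigned char *in, unsigned char *iv,
                          unsigned int nrounds)
{
  unsigned char c_i[16];
  unsigned int i;

  memcpy (c_i, in, 16);                        /* C_i becomes the next IV */
  _gcry_aes_dec_armv8_ce (keysched, out, in, nrounds);
  for (i = 0; i < 16; i++)
    out[i] ^= iv[i];                           /* P_i = Dec_K(C_i) xor C_{i-1} */
  memcpy (iv, c_i, 16);
}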
+
+
+/*
+ * void _gcry_aes_ctr_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ctr_enc_armv8_ce
+ELF(.type _gcry_aes_ctr_enc_armv8_ce,%function;)
+_gcry_aes_ctr_enc_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: outbuf
+ * x2: inbuf
+ * x3: iv
+ * x4: nblocks
+ * w5: nrounds
+ */
+ CFI_STARTPROC();
+
+ cbz x4, .Lctr_enc_skip
+
+ mov x6, #1
+ movi v16.16b, #0
+ mov v16.D[1], x6
+
+ /* load IV */
+ ldp x9, x10, [x3]
+ ld1 {v0.16b}, [x3]
+ rev x9, x9
+ rev x10, x10
+
+ aes_preload_keys(x0, w5);
+
+ b.eq .Lctr_enc_entry_192
+ b.hi .Lctr_enc_entry_256
+
+#define CTR_ENC(bits) \
+ .Lctr_enc_entry_##bits: \
+ cmp x4, #4; \
+ b.lo .Lctr_enc_loop_##bits; \
+ \
+ .Lctr_enc_loop4_##bits: \
+ cmp x10, #0xfffffffffffffffc; \
+ sub x4, x4, #4; \
+ b.lo .Lctr_enc_loop4_##bits##_nocarry; \
+ \
+ adds x10, x10, #1; \
+ mov v1.16b, v0.16b; \
+ adc x9, x9, xzr; \
+ mov v2.D[1], x10; \
+ mov v2.D[0], x9; \
+ \
+ adds x10, x10, #1; \
+ rev64 v2.16b, v2.16b; \
+ adc x9, x9, xzr; \
+ mov v3.D[1], x10; \
+ mov v3.D[0], x9; \
+ \
+ adds x10, x10, #1; \
+ rev64 v3.16b, v3.16b; \
+ adc x9, x9, xzr; \
+ mov v4.D[1], x10; \
+ mov v4.D[0], x9; \
+ \
+ adds x10, x10, #1; \
+ rev64 v4.16b, v4.16b; \
+ adc x9, x9, xzr; \
+ mov v0.D[1], x10; \
+ mov v0.D[0], x9; \
+ rev64 v0.16b, v0.16b; \
+ \
+ b .Lctr_enc_loop4_##bits##_store_ctr; \
+ \
+ .Lctr_enc_loop4_##bits##_nocarry: \
+ \
+ add v3.2d, v16.2d, v16.2d; /* 2 */ \
+ rev64 v6.16b, v0.16b; \
+ add x10, x10, #4; \
+ add v4.2d, v3.2d, v16.2d; /* 3 */ \
+ add v0.2d, v3.2d, v3.2d; /* 4 */ \
+ rev64 v1.16b, v6.16b; \
+ add v2.2d, v6.2d, v16.2d; \
+ add v3.2d, v6.2d, v3.2d; \
+ add v4.2d, v6.2d, v4.2d; \
+ add v0.2d, v6.2d, v0.2d; \
+ rev64 v2.16b, v2.16b; \
+ rev64 v3.16b, v3.16b; \
+ rev64 v0.16b, v0.16b; \
+ rev64 v4.16b, v4.16b; \
+ \
+ .Lctr_enc_loop4_##bits##_store_ctr: \
+ \
+ st1 {v0.16b}, [x3]; \
+ cmp x4, #4; \
+ ld1 {v5.16b-v7.16b}, [x2], #48; /* preload ciphertext */ \
+ \
+ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \
+ \
+ eor v1.16b, v1.16b, v5.16b; \
+ ld1 {v5.16b}, [x2], #16; /* load ciphertext */ \
+ eor v2.16b, v2.16b, v6.16b; \
+ eor v3.16b, v3.16b, v7.16b; \
+ eor v4.16b, v4.16b, v5.16b; \
+ st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \
+ \
+ b.hs .Lctr_enc_loop4_##bits; \
+ CLEAR_REG(v3); \
+ CLEAR_REG(v4); \
+ CLEAR_REG(v5); \
+ CLEAR_REG(v6); \
+ CLEAR_REG(v7); \
+ cbz x4, .Lctr_enc_done; \
+ \
+ .Lctr_enc_loop_##bits: \
+ \
+ adds x10, x10, #1; \
+ mov v1.16b, v0.16b; \
+ adc x9, x9, xzr; \
+ mov v0.D[1], x10; \
+ mov v0.D[0], x9; \
+ sub x4, x4, #1; \
+ ld1 {v2.16b}, [x2], #16; /* load ciphertext */ \
+ rev64 v0.16b, v0.16b; \
+ \
+ do_aes_one##bits(e, mc, v1, v1); \
+ \
+ eor v1.16b, v2.16b, v1.16b; \
+ st1 {v1.16b}, [x1], #16; /* store plaintext */ \
+ \
+ cbnz x4, .Lctr_enc_loop_##bits; \
+ b .Lctr_enc_done;
+
+ CTR_ENC(128)
+ CTR_ENC(192)
+ CTR_ENC(256)
+
+#undef CTR_ENC
+
+.Lctr_enc_done:
+ aes_clear_keys(w5)
+
+ st1 {v0.16b}, [x3] /* store IV */
+
+ CLEAR_REG(v0)
+ CLEAR_REG(v1)
+ CLEAR_REG(v2)
+
+.Lctr_enc_skip:
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce;)
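The ldp/rev pair above keeps the 128-bit counter in x9 (high half) and x10 (low half) as native integers so it can be bumped with adds/adc, and rev64 converts each new value back into the big-endian block layout the cipher consumes. A byte-wise C sketch of the per-block increment, without the register-level carry handling:

#include <stdint.h>

/* Sketch: increment a 16-byte big-endian counter block. */
static void
ctr_increment_be (uint8_t ctr[16])
{
  int i;

  for (i = 15; i >= 0; i--)
    if (++ctr[i])        /* stop as soon as a byte did not wrap to zero */
      break;
}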
+
+
+/*
+ * void _gcry_aes_cfb_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cfb_enc_armv8_ce
+ELF(.type _gcry_aes_cfb_enc_armv8_ce,%function;)
+_gcry_aes_cfb_enc_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: outbuf
+ * x2: inbuf
+ * x3: iv
+ * x4: nblocks
+ * w5: nrounds
+ */
+ CFI_STARTPROC();
+
+ cbz x4, .Lcfb_enc_skip
+
+ /* load IV */
+ ld1 {v0.16b}, [x3]
+
+ aes_preload_keys(x0, w5);
+
+ b.eq .Lcfb_enc_entry_192
+ b.hi .Lcfb_enc_entry_256
+
+#define CFB_ENC(bits) \
+ .Lcfb_enc_entry_##bits: \
+ .Lcfb_enc_loop_##bits: \
+ ld1 {v1.16b}, [x2], #16; /* load plaintext */ \
+ sub x4, x4, #1; \
+ \
+ do_aes_one##bits(e, mc, v0, v0); \
+ \
+ eor v0.16b, v1.16b, v0.16b; \
+ st1 {v0.16b}, [x1], #16; /* store ciphertext */ \
+ \
+ cbnz x4, .Lcfb_enc_loop_##bits; \
+ b .Lcfb_enc_done;
+
+ CFB_ENC(128)
+ CFB_ENC(192)
+ CFB_ENC(256)
+
+#undef CFB_ENC
+
+.Lcfb_enc_done:
+ aes_clear_keys(w5)
+
+ st1 {v0.16b}, [x3] /* store IV */
+
+ CLEAR_REG(v0)
+ CLEAR_REG(v1)
+
+.Lcfb_enc_skip:
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_cfb_enc_armv8_ce,.-_gcry_aes_cfb_enc_armv8_ce;)
+
+
+/*
+ * void _gcry_aes_cfb_dec_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *iv, size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_cfb_dec_armv8_ce
+ELF(.type _gcry_aes_cfb_dec_armv8_ce,%function;)
+_gcry_aes_cfb_dec_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: outbuf
+ * x2: inbuf
+ * x3: iv
+ * x4: nblocks
+ * w5: nrounds
+ */
+ CFI_STARTPROC();
+
+ cbz x4, .Lcfb_dec_skip
+
+ /* load IV */
+ ld1 {v0.16b}, [x3]
+
+ aes_preload_keys(x0, w5);
+
+ b.eq .Lcfb_dec_entry_192
+ b.hi .Lcfb_dec_entry_256
+
+#define CFB_DEC(bits) \
+ .Lcfb_dec_entry_##bits: \
+ cmp x4, #4; \
+ b.lo .Lcfb_dec_loop_##bits; \
+ \
+ .Lcfb_dec_loop4_##bits: \
+ \
+ ld1 {v2.16b-v4.16b}, [x2], #48; /* load ciphertext */ \
+ mov v1.16b, v0.16b; \
+ sub x4, x4, #4; \
+ cmp x4, #4; \
+ mov v5.16b, v2.16b; \
+ mov v6.16b, v3.16b; \
+ mov v7.16b, v4.16b; \
+ ld1 {v0.16b}, [x2], #16; /* load next IV / ciphertext */ \
+ \
+ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \
+ \
+ eor v1.16b, v1.16b, v5.16b; \
+ eor v2.16b, v2.16b, v6.16b; \
+ eor v3.16b, v3.16b, v7.16b; \
+ eor v4.16b, v4.16b, v0.16b; \
+ st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \
+ \
+ b.hs .Lcfb_dec_loop4_##bits; \
+ CLEAR_REG(v3); \
+ CLEAR_REG(v4); \
+ CLEAR_REG(v5); \
+ CLEAR_REG(v6); \
+ CLEAR_REG(v7); \
+ cbz x4, .Lcfb_dec_done; \
+ \
+ .Lcfb_dec_loop_##bits: \
+ \
+ ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \
+ \
+ sub x4, x4, #1; \
+ \
+ do_aes_one##bits(e, mc, v0, v0); \
+ \
+ eor v2.16b, v1.16b, v0.16b; \
+ mov v0.16b, v1.16b; \
+ st1 {v2.16b}, [x1], #16; /* store plaintext */ \
+ \
+ cbnz x4, .Lcfb_dec_loop_##bits; \
+ b .Lcfb_dec_done;
+
+ CFB_DEC(128)
+ CFB_DEC(192)
+ CFB_DEC(256)
+
+#undef CFB_DEC
+
+.Lcfb_dec_done:
+ aes_clear_keys(w5)
+
+ st1 {v0.16b}, [x3] /* store IV */
+
+ CLEAR_REG(v0)
+ CLEAR_REG(v1)
+ CLEAR_REG(v2)
+
+.Lcfb_dec_skip:
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce;)
+
+
+/*
+ * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *offset,
+ * unsigned char *checksum,
+ * unsigned char *L_table,
+ * size_t nblocks,
+ * unsigned int nrounds,
+ * unsigned int blkn);
+ */
+
+.align 3
+.globl _gcry_aes_ocb_enc_armv8_ce
+ELF(.type _gcry_aes_ocb_enc_armv8_ce,%function;)
+_gcry_aes_ocb_enc_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: outbuf
+ * x2: inbuf
+ * x3: offset
+ * x4: checksum
+ * x5: Ltable
+ * x6: nblocks (0 < nblocks <= 32)
+ * w7: nrounds
+ * %st+0: blkn => w12
+ */
+ CFI_STARTPROC();
+
+ ldr w12, [sp]
+ ld1 {v0.16b}, [x3] /* load offset */
+ ld1 {v16.16b}, [x4] /* load checksum */
+
+ aes_preload_keys(x0, w7);
+
+ b.eq .Locb_enc_entry_192
+ b.hi .Locb_enc_entry_256
+
+#define OCB_ENC(bits, ...) \
+ .Locb_enc_entry_##bits: \
+ cmp x6, #4; \
+ add x12, x12, #1; \
+ b.lo .Locb_enc_loop_##bits; \
+ \
+ .Locb_enc_loop4_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* Checksum_i = Checksum_{i-1} xor P_i */ \
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \
+ \
+ add w9, w12, #1; \
+ add w10, w12, #2; \
+ add w11, w12, #3; \
+ rbit w8, w12; \
+ add w12, w12, #4; \
+ rbit w9, w9; \
+ rbit w10, w10; \
+ rbit w11, w11; \
+ clz w8, w8; /* ntz(i+0) */ \
+ clz w9, w9; /* ntz(i+1) */ \
+ clz w10, w10; /* ntz(i+2) */ \
+ clz w11, w11; /* ntz(i+3) */ \
+ add x8, x5, x8, lsl #4; \
+ ld1 {v1.16b-v4.16b}, [x2], #64; /* load P_i+<0-3> */ \
+ add x9, x5, x9, lsl #4; \
+ add x10, x5, x10, lsl #4; \
+ add x11, x5, x11, lsl #4; \
+ \
+ sub x6, x6, #4; \
+ \
+ ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \
+ eor v16.16b, v16.16b, v1.16b; /* Checksum_i+0 */ \
+ ld1 {v6.16b}, [x9]; /* load L_{ntz(i+1)} */ \
+ eor v16.16b, v16.16b, v2.16b; /* Checksum_i+1 */ \
+ ld1 {v7.16b}, [x10]; /* load L_{ntz(i+2)} */ \
+ eor v16.16b, v16.16b, v3.16b; /* Checksum_i+2 */ \
+ eor v5.16b, v5.16b, v0.16b; /* Offset_i+0 */ \
+ ld1 {v0.16b}, [x11]; /* load L_{ntz(i+3)} */ \
+ eor v16.16b, v16.16b, v4.16b; /* Checksum_i+3 */ \
+ eor v6.16b, v6.16b, v5.16b; /* Offset_i+1 */ \
+ eor v1.16b, v1.16b, v5.16b; /* P_i+0 xor Offset_i+0 */ \
+ eor v7.16b, v7.16b, v6.16b; /* Offset_i+2 */ \
+ eor v2.16b, v2.16b, v6.16b; /* P_i+1 xor Offset_i+1 */ \
+ eor v0.16b, v0.16b, v7.16b; /* Offset_i+3 */ \
+ cmp x6, #4; \
+ eor v3.16b, v3.16b, v7.16b; /* P_i+2 xor Offset_i+2 */ \
+ eor v4.16b, v4.16b, v0.16b; /* P_i+3 xor Offset_i+3 */ \
+ \
+ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \
+ \
+ eor v1.16b, v1.16b, v5.16b; /* xor Offset_i+0 */ \
+ eor v2.16b, v2.16b, v6.16b; /* xor Offset_i+1 */ \
+ eor v3.16b, v3.16b, v7.16b; /* xor Offset_i+2 */ \
+ eor v4.16b, v4.16b, v0.16b; /* xor Offset_i+3 */ \
+ st1 {v1.16b-v4.16b}, [x1], #64; \
+ \
+ b.hs .Locb_enc_loop4_##bits; \
+ CLEAR_REG(v3); \
+ CLEAR_REG(v4); \
+ CLEAR_REG(v5); \
+ CLEAR_REG(v6); \
+ CLEAR_REG(v7); \
+ cbz x6, .Locb_enc_done; \
+ \
+ .Locb_enc_loop_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* Checksum_i = Checksum_{i-1} xor P_i */ \
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \
+ \
+ rbit x8, x12; \
+ add x12, x12, #1; \
+ clz x8, x8; /* ntz(i) */ \
+ add x8, x5, x8, lsl #4; \
+ \
+ ld1 {v1.16b}, [x2], #16; /* load plaintext */ \
+ ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \
+ sub x6, x6, #1; \
+ eor v0.16b, v0.16b, v2.16b; \
+ eor v16.16b, v16.16b, v1.16b; \
+ eor v1.16b, v1.16b, v0.16b; \
+ \
+ do_aes_one##bits(e, mc, v1, v1); \
+ \
+ eor v1.16b, v1.16b, v0.16b; \
+ st1 {v1.16b}, [x1], #16; /* store ciphertext */ \
+ \
+ cbnz x6, .Locb_enc_loop_##bits; \
+ b .Locb_enc_done;
+
+ OCB_ENC(128)
+ OCB_ENC(192)
+ OCB_ENC(256)
+
+#undef OCB_ENC
+
+.Locb_enc_done:
+ aes_clear_keys(w7)
+
+ st1 {v16.16b}, [x4] /* store checksum */
+ st1 {v0.16b}, [x3] /* store offset */
+
+ CLEAR_REG(v0)
+ CLEAR_REG(v1)
+ CLEAR_REG(v2)
+ CLEAR_REG(v16)
+
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;)
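The rbit/clz pairs above compute ntz(i), the number of trailing zero bits of the one-based block index, which picks the L-table entry for the OCB offset chain Offset_i = Offset_{i-1} xor L_{ntz(i)}. A C sketch of the same index math (GCC builtin assumed; clz of the bit-reversed value equals the trailing-zero count):

#include <stdint.h>
#include <stddef.h>

/* Sketch: L-table entry for block number blkn (blkn > 0), 16 bytes per entry,
 * matching the "add x8, x5, x8, lsl #4" addressing above. */
static const unsigned char *
ocb_L_entry (const unsigned char *L_table, uint32_t blkn)
{
  unsigned int ntz = (unsigned int)__builtin_ctz (blkn);
  return L_table + ((size_t)ntz << 4);
}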
+
+
+/*
+ * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *offset,
+ * unsigned char *checksum,
+ * unsigned char *L_table,
+ * size_t nblocks,
+ * unsigned int nrounds,
+ * unsigned int blkn);
+ */
+
+.align 3
+.globl _gcry_aes_ocb_dec_armv8_ce
+ELF(.type _gcry_aes_ocb_dec_armv8_ce,%function;)
+_gcry_aes_ocb_dec_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: outbuf
+ * x2: inbuf
+ * x3: offset
+ * x4: checksum
+ * x5: Ltable
+ * x6: nblocks (0 < nblocks <= 32)
+ * w7: nrounds
+ * %st+0: blkn => w12
+ */
+ CFI_STARTPROC();
+
+ ldr w12, [sp]
+ ld1 {v0.16b}, [x3] /* load offset */
+ ld1 {v16.16b}, [x4] /* load checksum */
+
+ aes_preload_keys(x0, w7);
+
+ b.eq .Locb_dec_entry_192
+ b.hi .Locb_dec_entry_256
+
+#define OCB_DEC(bits) \
+ .Locb_dec_entry_##bits: \
+ cmp x6, #4; \
+ add w12, w12, #1; \
+ b.lo .Locb_dec_loop_##bits; \
+ \
+ .Locb_dec_loop4_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \
+ /* Checksum_i = Checksum_{i-1} xor P_i */ \
+ \
+ add w9, w12, #1; \
+ add w10, w12, #2; \
+ add w11, w12, #3; \
+ rbit w8, w12; \
+ add w12, w12, #4; \
+ rbit w9, w9; \
+ rbit w10, w10; \
+ rbit w11, w11; \
+ clz w8, w8; /* ntz(i+0) */ \
+ clz w9, w9; /* ntz(i+1) */ \
+ clz w10, w10; /* ntz(i+2) */ \
+ clz w11, w11; /* ntz(i+3) */ \
+ add x8, x5, x8, lsl #4; \
+ ld1 {v1.16b-v4.16b}, [x2], #64; /* load C_i+<0-3> */ \
+ add x9, x5, x9, lsl #4; \
+ add x10, x5, x10, lsl #4; \
+ add x11, x5, x11, lsl #4; \
+ \
+ sub x6, x6, #4; \
+ \
+ ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \
+ ld1 {v6.16b}, [x9]; /* load L_{ntz(i+1)} */ \
+ ld1 {v7.16b}, [x10]; /* load L_{ntz(i+2)} */ \
+ eor v5.16b, v5.16b, v0.16b; /* Offset_i+0 */ \
+ ld1 {v0.16b}, [x11]; /* load L_{ntz(i+3)} */ \
+ eor v6.16b, v6.16b, v5.16b; /* Offset_i+1 */ \
+ eor v1.16b, v1.16b, v5.16b; /* C_i+0 xor Offset_i+0 */ \
+ eor v7.16b, v7.16b, v6.16b; /* Offset_i+2 */ \
+ eor v2.16b, v2.16b, v6.16b; /* C_i+1 xor Offset_i+1 */ \
+ eor v0.16b, v0.16b, v7.16b; /* Offset_i+3 */ \
+ cmp x6, #4; \
+ eor v3.16b, v3.16b, v7.16b; /* C_i+2 xor Offset_i+2 */ \
+ eor v4.16b, v4.16b, v0.16b; /* C_i+3 xor Offset_i+3 */ \
+ \
+ do_aes_4_##bits(d, imc, v1, v2, v3, v4); \
+ \
+ eor v1.16b, v1.16b, v5.16b; /* xor Offset_i+0 */ \
+ eor v2.16b, v2.16b, v6.16b; /* xor Offset_i+1 */ \
+ eor v16.16b, v16.16b, v1.16b; /* Checksum_i+0 */ \
+ eor v3.16b, v3.16b, v7.16b; /* xor Offset_i+2 */ \
+ eor v16.16b, v16.16b, v2.16b; /* Checksum_i+1 */ \
+ eor v4.16b, v4.16b, v0.16b; /* xor Offset_i+3 */ \
+ eor v16.16b, v16.16b, v3.16b; /* Checksum_i+2 */ \
+ eor v16.16b, v16.16b, v4.16b; /* Checksum_i+3 */ \
+ st1 {v1.16b-v4.16b}, [x1], #64; \
+ \
+ b.hs .Locb_dec_loop4_##bits; \
+ CLEAR_REG(v3); \
+ CLEAR_REG(v4); \
+ CLEAR_REG(v5); \
+ CLEAR_REG(v6); \
+ CLEAR_REG(v7); \
+ cbz x6, .Locb_dec_done; \
+ \
+ .Locb_dec_loop_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \
+ /* Checksum_i = Checksum_{i-1} xor P_i */ \
+ \
+ rbit w8, w12; \
+ add w12, w12, #1; \
+ clz w8, w8; /* ntz(i) */ \
+ add x8, x5, x8, lsl #4; \
+ \
+ ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \
+ ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \
+ sub x6, x6, #1; \
+ eor v0.16b, v0.16b, v2.16b; \
+ eor v1.16b, v1.16b, v0.16b; \
+ \
+ do_aes_one##bits(d, imc, v1, v1) \
+ \
+ eor v1.16b, v1.16b, v0.16b; \
+ st1 {v1.16b}, [x1], #16; /* store plaintext */ \
+ eor v16.16b, v16.16b, v1.16b; \
+ \
+ cbnz x6, .Locb_dec_loop_##bits; \
+ b .Locb_dec_done;
+
+ OCB_DEC(128)
+ OCB_DEC(192)
+ OCB_DEC(256)
+
+#undef OCB_DEC
+
+.Locb_dec_done:
+ aes_clear_keys(w7)
+
+ st1 {v16.16b}, [x4] /* store checksum */
+ st1 {v0.16b}, [x3] /* store offset */
+
+ CLEAR_REG(v0)
+ CLEAR_REG(v1)
+ CLEAR_REG(v2)
+ CLEAR_REG(v16)
+
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;)
+
+
+/*
+ * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
+ * const unsigned char *abuf,
+ * unsigned char *offset,
+ * unsigned char *checksum,
+ * unsigned char *L_table,
+ * size_t nblocks,
+ * unsigned int nrounds,
+ * unsigned int blkn);
+ */
+
+.align 3
+.globl _gcry_aes_ocb_auth_armv8_ce
+ELF(.type _gcry_aes_ocb_auth_armv8_ce,%function;)
+_gcry_aes_ocb_auth_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: abuf
+ * x2: offset => x3
+ * x3: checksum => x4
+ * x4: Ltable => x5
+ * x5: nblocks => x6 (0 < nblocks <= 32)
+ * w6: nrounds => w7
+ * w7: blkn => w12
+ */
+ CFI_STARTPROC();
+
+ mov w12, w7
+ mov w7, w6
+ mov x6, x5
+ mov x5, x4
+ mov x4, x3
+ mov x3, x2
+
+ aes_preload_keys(x0, w7);
+
+ ld1 {v0.16b}, [x3] /* load offset */
+ ld1 {v16.16b}, [x4] /* load checksum */
+
+ b.eq .Locb_auth_entry_192
+ b.hi .Locb_auth_entry_256
+
+#define OCB_AUTH(bits) \
+ .Locb_auth_entry_##bits: \
+ cmp x6, #4; \
+ add w12, w12, #1; \
+ b.lo .Locb_auth_loop_##bits; \
+ \
+ .Locb_auth_loop4_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \
+ \
+ add w9, w12, #1; \
+ add w10, w12, #2; \
+ add w11, w12, #3; \
+ rbit w8, w12; \
+ add w12, w12, #4; \
+ rbit w9, w9; \
+ rbit w10, w10; \
+ rbit w11, w11; \
+ clz w8, w8; /* ntz(i+0) */ \
+ clz w9, w9; /* ntz(i+1) */ \
+ clz w10, w10; /* ntz(i+2) */ \
+ clz w11, w11; /* ntz(i+3) */ \
+ add x8, x5, x8, lsl #4; \
+ ld1 {v1.16b-v4.16b}, [x1], #64; /* load A_i+<0-3> */ \
+ add x9, x5, x9, lsl #4; \
+ add x10, x5, x10, lsl #4; \
+ add x11, x5, x11, lsl #4; \
+ \
+ sub x6, x6, #4; \
+ \
+ ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \
+ ld1 {v6.16b}, [x9]; /* load L_{ntz(i+1)} */ \
+ ld1 {v7.16b}, [x10]; /* load L_{ntz(i+2)} */ \
+ eor v5.16b, v5.16b, v0.16b; /* Offset_i+0 */ \
+ ld1 {v0.16b}, [x11]; /* load L_{ntz(i+3)} */ \
+ eor v6.16b, v6.16b, v5.16b; /* Offset_i+1 */ \
+ eor v1.16b, v1.16b, v5.16b; /* A_i+0 xor Offset_i+0 */ \
+ eor v7.16b, v7.16b, v6.16b; /* Offset_i+2 */ \
+ eor v2.16b, v2.16b, v6.16b; /* A_i+1 xor Offset_i+1 */ \
+ eor v0.16b, v0.16b, v7.16b; /* Offset_i+3 */ \
+ cmp x6, #4; \
+ eor v3.16b, v3.16b, v7.16b; /* A_i+2 xor Offset_i+2 */ \
+ eor v4.16b, v4.16b, v0.16b; /* A_i+3 xor Offset_i+3 */ \
+ \
+ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \
+ \
+ eor v1.16b, v1.16b, v2.16b; \
+ eor v16.16b, v16.16b, v3.16b; \
+ eor v1.16b, v1.16b, v4.16b; \
+ eor v16.16b, v16.16b, v1.16b; \
+ \
+ b.hs .Locb_auth_loop4_##bits; \
+ CLEAR_REG(v3); \
+ CLEAR_REG(v4); \
+ CLEAR_REG(v5); \
+ CLEAR_REG(v6); \
+ CLEAR_REG(v7); \
+ cbz x6, .Locb_auth_done; \
+ \
+ .Locb_auth_loop_##bits: \
+ \
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ \
+ \
+ rbit w8, w12; \
+ add w12, w12, #1; \
+ clz w8, w8; /* ntz(i) */ \
+ add x8, x5, x8, lsl #4; \
+ \
+ ld1 {v1.16b}, [x1], #16; /* load aadtext */ \
+ ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \
+ sub x6, x6, #1; \
+ eor v0.16b, v0.16b, v2.16b; \
+ eor v1.16b, v1.16b, v0.16b; \
+ \
+ do_aes_one##bits(e, mc, v1, v1) \
+ \
+ eor v16.16b, v16.16b, v1.16b; \
+ \
+ cbnz x6, .Locb_auth_loop_##bits; \
+ b .Locb_auth_done;
+
+ OCB_AUTH(128)
+ OCB_AUTH(192)
+ OCB_AUTH(256)
+
+#undef OCB_AUTH
+
+.Locb_auth_done:
+ aes_clear_keys(w7)
+
+ st1 {v16.16b}, [x4] /* store checksum */
+ st1 {v0.16b}, [x3] /* store offset */
+
+ CLEAR_REG(v0)
+ CLEAR_REG(v1)
+ CLEAR_REG(v2)
+ CLEAR_REG(v16)
+
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;)
+
+
+/*
+ * void _gcry_aes_xts_enc_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *tweak,
+ * size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_xts_enc_armv8_ce
+ELF(.type _gcry_aes_xts_enc_armv8_ce,%function;)
+_gcry_aes_xts_enc_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: outbuf
+ * x2: inbuf
+ * x3: tweak
+ * x4: nblocks
+ * w5: nrounds
+ */
+ CFI_STARTPROC();
+
+ cbz x4, .Lxts_enc_skip
+
+ /* load tweak */
+ ld1 {v0.16b}, [x3]
+
+ /* load gfmul mask */
+ mov x6, #0x87
+ mov x7, #0x01
+ mov v16.D[0], x6
+ mov v16.D[1], x7
+
+ aes_preload_keys(x0, w5);
+
+ b.eq .Lxts_enc_entry_192
+ b.hi .Lxts_enc_entry_256
+
+#define XTS_ENC(bits) \
+ .Lxts_enc_entry_##bits: \
+ cmp x4, #4; \
+ b.lo .Lxts_enc_loop_##bits; \
+ \
+ .Lxts_enc_loop4_##bits: \
+ \
+ ext v4.16b, v0.16b, v0.16b, #8; \
+ \
+ sshr v2.2d, v4.2d, #63; \
+ add v5.2d, v0.2d, v0.2d; \
+ and v2.16b, v2.16b, v16.16b; \
+ add v4.2d, v4.2d, v4.2d; \
+ eor v5.16b, v5.16b, v2.16b; \
+ \
+ sshr v2.2d, v4.2d, #63; \
+ add v6.2d, v5.2d, v5.2d; \
+ and v2.16b, v2.16b, v16.16b; \
+ add v4.2d, v4.2d, v4.2d; \
+ eor v6.16b, v6.16b, v2.16b; \
+ \
+ sshr v2.2d, v4.2d, #63; \
+ add v7.2d, v6.2d, v6.2d; \
+ and v2.16b, v2.16b, v16.16b; \
+ add v4.2d, v4.2d, v4.2d; \
+ eor v7.16b, v7.16b, v2.16b; \
+ \
+ sshr v2.2d, v4.2d, #63; \
+ add v3.2d, v7.2d, v7.2d; \
+ and v2.16b, v2.16b, v16.16b; \
+ add v4.2d, v4.2d, v4.2d; \
+ eor v3.16b, v3.16b, v2.16b; \
+ ld1 {v1.16b-v2.16b}, [x2], #32; /* load plaintext */ \
+ st1 {v3.16b}, [x3]; \
+ sub x4, x4, #4; \
+ eor v1.16b, v1.16b, v0.16b; \
+ \
+ ld1 {v3.16b-v4.16b}, [x2], #32; /* load plaintext */ \
+ cmp x4, #4; \
+ eor v2.16b, v2.16b, v5.16b; \
+ eor v3.16b, v3.16b, v6.16b; \
+ eor v4.16b, v4.16b, v7.16b; \
+ \
+ do_aes_4_##bits(e, mc, v1, v2, v3, v4); \
+ \
+ eor v1.16b, v1.16b, v0.16b; \
+ ld1 {v0.16b}, [x3]; \
+ eor v2.16b, v2.16b, v5.16b; \
+ eor v3.16b, v3.16b, v6.16b; \
+ eor v4.16b, v4.16b, v7.16b; \
+ st1 {v1.16b-v4.16b}, [x1], #64; /* store ciphertext */ \
+ \
+ b.hs .Lxts_enc_loop4_##bits; \
+ CLEAR_REG(v3); \
+ CLEAR_REG(v4); \
+ CLEAR_REG(v5); \
+ CLEAR_REG(v6); \
+ CLEAR_REG(v7); \
+ cbz x4, .Lxts_enc_done; \
+ \
+ .Lxts_enc_loop_##bits: \
+ \
+ ld1 {v1.16b}, [x2], #16; /* load plaintext */ \
+ ext v3.16b, v0.16b, v0.16b, #8; \
+ mov v2.16b, v0.16b; \
+ sshr v3.2d, v3.2d, #63; \
+ add v0.2d, v0.2d, v0.2d; \
+ and v3.16b, v3.16b, v16.16b; \
+ eor v1.16b, v1.16b, v2.16b; \
+ eor v0.16b, v0.16b, v3.16b; \
+ sub x4, x4, #1; \
+ \
+ do_aes_one##bits(e, mc, v1, v1); \
+ \
+ eor v1.16b, v1.16b, v2.16b; \
+ st1 {v1.16b}, [x1], #16; /* store ciphertext */ \
+ \
+ cbnz x4, .Lxts_enc_loop_##bits; \
+ b .Lxts_enc_done;
+
+ XTS_ENC(128)
+ XTS_ENC(192)
+ XTS_ENC(256)
+
+#undef XTS_ENC
+
+.Lxts_enc_done:
+ aes_clear_keys(w5)
+
+ st1 {v0.16b}, [x3] /* store tweak */
+
+ CLEAR_REG(v0)
+ CLEAR_REG(v1)
+ CLEAR_REG(v2)
+
+.Lxts_enc_skip:
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_xts_enc_armv8_ce,.-_gcry_aes_xts_enc_armv8_ce;)
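Each sshr/and/add/eor group above multiplies the current tweak by x in GF(2^128): the 128-bit value is doubled lane-wise and the bit shifted out of the top is folded back in through the reduction constant 0x87 held in v16 (polynomial x^128 + x^7 + x^2 + x + 1). A C sketch of one doubling step on two little-endian 64-bit halves:

#include <stdint.h>

/* Sketch: tweak <- tweak * x in GF(2^128), XTS reduction polynomial. */
static void
xts_mul_x (uint64_t *lo, uint64_t *hi)
{
  uint64_t carry = *hi >> 63;           /* bit 127, shifted out by the doubling */

  *hi = (*hi << 1) | (*lo >> 63);
  *lo = (*lo << 1) ^ (carry * 0x87);    /* fold the carry back in */
}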
+
+
+/*
+ * void _gcry_aes_xts_dec_armv8_ce (const void *keysched,
+ * unsigned char *outbuf,
+ * const unsigned char *inbuf,
+ * unsigned char *tweak,
+ * size_t nblocks,
+ * unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_xts_dec_armv8_ce
+ELF(.type _gcry_aes_xts_dec_armv8_ce,%function;)
+_gcry_aes_xts_dec_armv8_ce:
+ /* input:
+ * x0: keysched
+ * x1: outbuf
+ * x2: inbuf
+ * x3: tweak
+ * x4: nblocks
+ * w5: nrounds
+ */
+ CFI_STARTPROC();
+
+ cbz x4, .Lxts_dec_skip
+
+ /* load tweak */
+ ld1 {v0.16b}, [x3]
+
+ /* load gfmul mask */
+ mov x6, #0x87
+ mov x7, #0x01
+ mov v16.D[0], x6
+ mov v16.D[1], x7
+
+ aes_preload_keys(x0, w5);
+
+ b.eq .Lxts_dec_entry_192
+ b.hi .Lxts_dec_entry_256
+
+#define XTS_DEC(bits) \
+ .Lxts_dec_entry_##bits: \
+ cmp x4, #4; \
+ b.lo .Lxts_dec_loop_##bits; \
+ \
+ .Lxts_dec_loop4_##bits: \
+ \
+ ext v4.16b, v0.16b, v0.16b, #8; \
+ \
+ sshr v2.2d, v4.2d, #63; \
+ add v5.2d, v0.2d, v0.2d; \
+ and v2.16b, v2.16b, v16.16b; \
+ add v4.2d, v4.2d, v4.2d; \
+ eor v5.16b, v5.16b, v2.16b; \
+ \
+ sshr v2.2d, v4.2d, #63; \
+ add v6.2d, v5.2d, v5.2d; \
+ and v2.16b, v2.16b, v16.16b; \
+ add v4.2d, v4.2d, v4.2d; \
+ eor v6.16b, v6.16b, v2.16b; \
+ \
+ sshr v2.2d, v4.2d, #63; \
+ add v7.2d, v6.2d, v6.2d; \
+ and v2.16b, v2.16b, v16.16b; \
+ add v4.2d, v4.2d, v4.2d; \
+ eor v7.16b, v7.16b, v2.16b; \
+ \
+ sshr v2.2d, v4.2d, #63; \
+ add v3.2d, v7.2d, v7.2d; \
+ and v2.16b, v2.16b, v16.16b; \
+ add v4.2d, v4.2d, v4.2d; \
+ eor v3.16b, v3.16b, v2.16b; \
+ ld1 {v1.16b-v2.16b}, [x2], #32; /* load ciphertext */ \
+ st1 {v3.16b}, [x3]; \
+ sub x4, x4, #4; \
+ eor v1.16b, v1.16b, v0.16b; \
+ \
+ ld1 {v3.16b-v4.16b}, [x2], #32; /* load ciphertext */ \
+ cmp x4, #4; \
+ eor v2.16b, v2.16b, v5.16b; \
+ eor v3.16b, v3.16b, v6.16b; \
+ eor v4.16b, v4.16b, v7.16b; \
+ \
+ do_aes_4_##bits(d, imc, v1, v2, v3, v4); \
+ \
+ eor v1.16b, v1.16b, v0.16b; \
+ ld1 {v0.16b}, [x3]; \
+ eor v2.16b, v2.16b, v5.16b; \
+ eor v3.16b, v3.16b, v6.16b; \
+ eor v4.16b, v4.16b, v7.16b; \
+ st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \
+ \
+ b.hs .Lxts_dec_loop4_##bits; \
+ CLEAR_REG(v3); \
+ CLEAR_REG(v4); \
+ CLEAR_REG(v5); \
+ CLEAR_REG(v6); \
+ CLEAR_REG(v7); \
+ cbz x4, .Lxts_dec_done; \
+ \
+ .Lxts_dec_loop_##bits: \
+ \
+ ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \
+ ext v3.16b, v0.16b, v0.16b, #8; \
+ mov v2.16b, v0.16b; \
+ sshr v3.2d, v3.2d, #63; \
+ add v0.2d, v0.2d, v0.2d; \
+ and v3.16b, v3.16b, v16.16b; \
+ eor v1.16b, v1.16b, v2.16b; \
+ eor v0.16b, v0.16b, v3.16b; \
+ sub x4, x4, #1; \
+ \
+ do_aes_one##bits(d, imc, v1, v1); \
+ \
+ eor v1.16b, v1.16b, v2.16b; \
+ st1 {v1.16b}, [x1], #16; /* store plaintext */ \
+ \
+ cbnz x4, .Lxts_dec_loop_##bits; \
+ b .Lxts_dec_done;
+
+ XTS_DEC(128)
+ XTS_DEC(192)
+ XTS_DEC(256)
+
+#undef XTS_DEC
+
+.Lxts_dec_done:
+ aes_clear_keys(w5)
+
+ st1 {v0.16b}, [x3] /* store tweak */
+
+ CLEAR_REG(v0)
+ CLEAR_REG(v1)
+ CLEAR_REG(v2)
+
+.Lxts_dec_skip:
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_xts_dec_armv8_ce,.-_gcry_aes_xts_dec_armv8_ce;)
+
+
+/*
+ * u32 _gcry_aes_sbox4_armv8_ce(u32 in4b);
+ */
+.align 3
+.globl _gcry_aes_sbox4_armv8_ce
+ELF(.type _gcry_aes_sbox4_armv8_ce,%function;)
+_gcry_aes_sbox4_armv8_ce:
+ /* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
+ * Cryptology — CT-RSA 2015" for details.
+ */
+ CFI_STARTPROC();
+ movi v0.16b, #0x52
+ movi v1.16b, #0
+ mov v0.S[0], w0
+ aese v0.16b, v1.16b
+ addv s0, v0.4s
+ mov w0, v0.S[0]
+ CLEAR_REG(v0)
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_sbox4_armv8_ce,.-_gcry_aes_sbox4_armv8_ce;)
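_gcry_aes_sbox4_armv8_ce applies the AES S-box to each byte of a 32-bit word (SubWord), which is all the key expansion in rijndael-armv8-ce.c needs. The trick: aese with an all-zero round key reduces to SubBytes followed by ShiftRows, the lanes that do not hold the input are pre-filled with 0x52 because the S-box maps 0x52 to 0x00, and the addv fold re-collects the four substituted bytes that ShiftRows scattered across the 32-bit lanes. A slow, table-free C reference for the same result (sketch only):

#include <stdint.h>

static uint8_t
gf256_mul (uint8_t a, uint8_t b)
{
  uint8_t r = 0;

  while (b)
    {
      if (b & 1)
        r ^= a;
      a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1b : 0)); /* reduce mod the AES polynomial */
      b >>= 1;
    }
  return r;
}

static uint8_t
aes_sbox (uint8_t x)
{
  uint8_t inv = 0, s;
  unsigned int i;

  /* Multiplicative inverse in GF(2^8) by brute force; inverse of 0 is 0. */
  for (i = 1; i < 256; i++)
    if (gf256_mul (x, (uint8_t)i) == 1)
      {
        inv = (uint8_t)i;
        break;
      }

  /* Affine transformation: XOR of inv rotated left by 0..4 bits, plus 0x63. */
  s = inv;
  for (i = 1; i <= 4; i++)
    s ^= (uint8_t)((inv << i) | (inv >> (8 - i)));
  return s ^ 0x63;
}

/* Reference for _gcry_aes_sbox4_armv8_ce: byte-wise S-box on a 32-bit word. */
static uint32_t
aes_sbox4_ref (uint32_t in4b)
{
  return (uint32_t)aes_sbox ((uint8_t)(in4b & 0xff))
         | ((uint32_t)aes_sbox ((uint8_t)((in4b >> 8) & 0xff)) << 8)
         | ((uint32_t)aes_sbox ((uint8_t)((in4b >> 16) & 0xff)) << 16)
         | ((uint32_t)aes_sbox ((uint8_t)((in4b >> 24) & 0xff)) << 24);
}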
+
+
+/*
+ * void _gcry_aes_invmixcol_armv8_ce(void *dst, const void *src);
+ */
+.align 3
+.globl _gcry_aes_invmixcol_armv8_ce
+ELF(.type _gcry_aes_invmixcol_armv8_ce,%function;)
+_gcry_aes_invmixcol_armv8_ce:
+ CFI_STARTPROC();
+ ld1 {v0.16b}, [x1]
+ aesimc v0.16b, v0.16b
+ st1 {v0.16b}, [x0]
+ CLEAR_REG(v0)
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_invmixcol_armv8_ce,.-_gcry_aes_invmixcol_armv8_ce;)
+
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-armv8-ce.c b/comm/third_party/libgcrypt/cipher/rijndael-armv8-ce.c
new file mode 100644
index 0000000000..6e46830ee4
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael-armv8-ce.c
@@ -0,0 +1,414 @@
+/* ARMv8 Crypto Extension AES for Libgcrypt
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* for memcmp() */
+
+#include "types.h" /* for byte and u32 typedefs */
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-selftest.h"
+#include "rijndael-internal.h"
+#include "./cipher-internal.h"
+
+
+#ifdef USE_ARM_CE
+
+
+typedef struct u128_s { u32 a, b, c, d; } u128_t;
+
+extern u32 _gcry_aes_sbox4_armv8_ce(u32 in4b);
+extern void _gcry_aes_invmixcol_armv8_ce(u128_t *dst, const u128_t *src);
+
+extern unsigned int _gcry_aes_enc_armv8_ce(const void *keysched, byte *dst,
+ const byte *src,
+ unsigned int nrounds);
+extern unsigned int _gcry_aes_dec_armv8_ce(const void *keysched, byte *dst,
+ const byte *src,
+ unsigned int nrounds);
+
+extern void _gcry_aes_cbc_enc_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks,
+ int cbc_mac, unsigned int nrounds);
+extern void _gcry_aes_cbc_dec_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks,
+ unsigned int nrounds);
+
+extern void _gcry_aes_cfb_enc_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks,
+ unsigned int nrounds);
+extern void _gcry_aes_cfb_dec_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks,
+ unsigned int nrounds);
+
+extern void _gcry_aes_ctr_enc_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks,
+ unsigned int nrounds);
+
+extern void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ unsigned char *L_table,
+ size_t nblocks,
+ unsigned int nrounds,
+ unsigned int blkn);
+extern void _gcry_aes_ocb_dec_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ unsigned char *L_table,
+ size_t nblocks,
+ unsigned int nrounds,
+ unsigned int blkn);
+extern void _gcry_aes_ocb_auth_armv8_ce (const void *keysched,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ unsigned char *L_table,
+ size_t nblocks,
+ unsigned int nrounds,
+ unsigned int blkn);
+extern void _gcry_aes_xts_enc_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *tweak,
+ size_t nblocks, unsigned int nrounds);
+extern void _gcry_aes_xts_dec_armv8_ce (const void *keysched,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *tweak,
+ size_t nblocks, unsigned int nrounds);
+
+typedef void (*ocb_crypt_fn_t) (const void *keysched, unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *offset, unsigned char *checksum,
+ unsigned char *L_table, size_t nblocks,
+ unsigned int nrounds, unsigned int blkn);
+
+typedef void (*xts_crypt_fn_t) (const void *keysched, unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *tweak, size_t nblocks,
+ unsigned int nrounds);
+
+void
+_gcry_aes_armv8_ce_setkey (RIJNDAEL_context *ctx, const byte *key)
+{
+ union
+ {
+ PROPERLY_ALIGNED_TYPE dummy;
+ byte data[MAXKC][4];
+ u32 data32[MAXKC];
+ } tkk[2];
+ unsigned int rounds = ctx->rounds;
+ int KC = rounds - 6;
+ unsigned int keylen = KC * 4;
+ unsigned int i, r, t;
+ byte rcon = 1;
+ int j;
+#define k tkk[0].data
+#define k_u32 tkk[0].data32
+#define tk tkk[1].data
+#define tk_u32 tkk[1].data32
+#define W (ctx->keyschenc)
+#define W_u32 (ctx->keyschenc32)
+
+ for (i = 0; i < keylen; i++)
+ {
+ k[i >> 2][i & 3] = key[i];
+ }
+
+ for (j = KC-1; j >= 0; j--)
+ {
+ tk_u32[j] = k_u32[j];
+ }
+ r = 0;
+ t = 0;
+ /* Copy values into round key array. */
+ for (j = 0; (j < KC) && (r < rounds + 1); )
+ {
+ for (; (j < KC) && (t < 4); j++, t++)
+ {
+ W_u32[r][t] = le_bswap32(tk_u32[j]);
+ }
+ if (t == 4)
+ {
+ r++;
+ t = 0;
+ }
+ }
+
+ while (r < rounds + 1)
+ {
+ tk_u32[0] ^= _gcry_aes_sbox4_armv8_ce(rol(tk_u32[KC - 1], 24)) ^ rcon;
+
+ if (KC != 8)
+ {
+ for (j = 1; j < KC; j++)
+ {
+ tk_u32[j] ^= tk_u32[j-1];
+ }
+ }
+ else
+ {
+ for (j = 1; j < KC/2; j++)
+ {
+ tk_u32[j] ^= tk_u32[j-1];
+ }
+
+ tk_u32[KC/2] ^= _gcry_aes_sbox4_armv8_ce(tk_u32[KC/2 - 1]);
+
+ for (j = KC/2 + 1; j < KC; j++)
+ {
+ tk_u32[j] ^= tk_u32[j-1];
+ }
+ }
+
+ /* Copy values into round key array. */
+ for (j = 0; (j < KC) && (r < rounds + 1); )
+ {
+ for (; (j < KC) && (t < 4); j++, t++)
+ {
+ W_u32[r][t] = le_bswap32(tk_u32[j]);
+ }
+ if (t == 4)
+ {
+ r++;
+ t = 0;
+ }
+ }
+
+ rcon = (rcon << 1) ^ ((rcon >> 7) * 0x1b);
+ }
+
+#undef W
+#undef tk
+#undef k
+#undef W_u32
+#undef tk_u32
+#undef k_u32
+ wipememory(&tkk, sizeof(tkk));
+}
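The closing statement of the expansion loop, rcon = (rcon << 1) ^ ((rcon >> 7) * 0x1b), is doubling in GF(2^8) with the AES reduction polynomial, so rcon steps through the round constants 01, 02, 04, ..., 80, 1b, 36. A standalone check of that claim (sketch):

#include <stdio.h>

int
main (void)
{
  unsigned char rcon = 1;
  int i;

  /* AES-128 consumes ten round constants: expect 01 02 04 08 10 20 40 80 1b 36 */
  for (i = 0; i < 10; i++)
    {
      printf ("%02x ", rcon);
      rcon = (unsigned char)((rcon << 1) ^ ((rcon >> 7) * 0x1b));
    }
  printf ("\n");
  return 0;
}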
+
+/* Make a decryption key from an encryption key. */
+void
+_gcry_aes_armv8_ce_prepare_decryption (RIJNDAEL_context *ctx)
+{
+ u128_t *ekey = (u128_t *)(void *)ctx->keyschenc;
+ u128_t *dkey = (u128_t *)(void *)ctx->keyschdec;
+ int rounds = ctx->rounds;
+ int rr;
+ int r;
+
+#define DO_AESIMC() _gcry_aes_invmixcol_armv8_ce(&dkey[r], &ekey[rr])
+
+ dkey[0] = ekey[rounds];
+ r = 1;
+ rr = rounds-1;
+ DO_AESIMC(); r++; rr--; /* round 1 */
+ DO_AESIMC(); r++; rr--; /* round 2 */
+ DO_AESIMC(); r++; rr--; /* round 3 */
+ DO_AESIMC(); r++; rr--; /* round 4 */
+ DO_AESIMC(); r++; rr--; /* round 5 */
+ DO_AESIMC(); r++; rr--; /* round 6 */
+ DO_AESIMC(); r++; rr--; /* round 7 */
+ DO_AESIMC(); r++; rr--; /* round 8 */
+ DO_AESIMC(); r++; rr--; /* round 9 */
+ if (rounds >= 12)
+ {
+ if (rounds > 12)
+ {
+ DO_AESIMC(); r++; rr--; /* round 10 */
+ DO_AESIMC(); r++; rr--; /* round 11 */
+ }
+
+ DO_AESIMC(); r++; rr--; /* round 12 / 10 */
+ DO_AESIMC(); r++; rr--; /* round 13 / 11 */
+ }
+
+ dkey[r] = ekey[0];
+
+#undef DO_AESIMC
+}
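For rounds = 10, 12 or 14 the unrolled DO_AESIMC sequence above is equivalent to the loop below: the decryption schedule is the encryption schedule in reverse order, with InvMixColumns applied to every round key except the first and the last. A sketch reusing the u128_t and _gcry_aes_invmixcol_armv8_ce declarations from this file (not the shipped code):

static void
prepare_decryption_sketch (u128_t *dkey, const u128_t *ekey, int rounds)
{
  int r;

  dkey[0] = ekey[rounds];
  for (r = 1; r < rounds; r++)
    _gcry_aes_invmixcol_armv8_ce (&dkey[r], &ekey[rounds - r]);
  dkey[rounds] = ekey[0];
}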
+
+unsigned int
+_gcry_aes_armv8_ce_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
+ const unsigned char *src)
+{
+ const void *keysched = ctx->keyschenc32;
+ unsigned int nrounds = ctx->rounds;
+
+ return _gcry_aes_enc_armv8_ce(keysched, dst, src, nrounds);
+}
+
+unsigned int
+_gcry_aes_armv8_ce_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
+ const unsigned char *src)
+{
+ const void *keysched = ctx->keyschdec32;
+ unsigned int nrounds = ctx->rounds;
+
+ return _gcry_aes_dec_armv8_ce(keysched, dst, src, nrounds);
+}
+
+void
+_gcry_aes_armv8_ce_cbc_enc (const RIJNDAEL_context *ctx, unsigned char *iv,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks, int cbc_mac)
+{
+ const void *keysched = ctx->keyschenc32;
+ unsigned int nrounds = ctx->rounds;
+
+ _gcry_aes_cbc_enc_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, cbc_mac,
+ nrounds);
+}
+
+void
+_gcry_aes_armv8_ce_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ const void *keysched = ctx->keyschdec32;
+ unsigned int nrounds = ctx->rounds;
+
+ if ( !ctx->decryption_prepared )
+ {
+ _gcry_aes_armv8_ce_prepare_decryption ( ctx );
+ ctx->decryption_prepared = 1;
+ }
+
+ _gcry_aes_cbc_dec_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds);
+}
+
+void
+_gcry_aes_armv8_ce_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ const void *keysched = ctx->keyschenc32;
+ unsigned int nrounds = ctx->rounds;
+
+ _gcry_aes_cfb_enc_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds);
+}
+
+void
+_gcry_aes_armv8_ce_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ const void *keysched = ctx->keyschenc32;
+ unsigned int nrounds = ctx->rounds;
+
+ _gcry_aes_cfb_dec_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds);
+}
+
+void
+_gcry_aes_armv8_ce_ctr_enc (RIJNDAEL_context *ctx, unsigned char *iv,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ const void *keysched = ctx->keyschenc32;
+ unsigned int nrounds = ctx->rounds;
+
+ _gcry_aes_ctr_enc_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds);
+}
+
+size_t
+_gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
+ ocb_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_ocb_enc_armv8_ce
+ : _gcry_aes_ocb_dec_armv8_ce;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned int nrounds = ctx->rounds;
+ u64 blkn = c->u_mode.ocb.data_nblocks;
+
+ if ( !encrypt && !ctx->decryption_prepared )
+ {
+ _gcry_aes_armv8_ce_prepare_decryption ( ctx );
+ ctx->decryption_prepared = 1;
+ }
+
+ c->u_mode.ocb.data_nblocks = blkn + nblocks;
+
+ crypt_fn(keysched, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr,
+ c->u_mode.ocb.L[0], nblocks, nrounds, (unsigned int)blkn);
+
+ return 0;
+}
+
+size_t
+_gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ const void *keysched = ctx->keyschenc32;
+ const unsigned char *abuf = abuf_arg;
+ unsigned int nrounds = ctx->rounds;
+ u64 blkn = c->u_mode.ocb.aad_nblocks;
+
+ c->u_mode.ocb.aad_nblocks = blkn + nblocks;
+
+ _gcry_aes_ocb_auth_armv8_ce(keysched, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, c->u_mode.ocb.L[0],
+ nblocks, nrounds, (unsigned int)blkn);
+
+ return 0;
+}
+
+void
+_gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks, int encrypt)
+{
+ const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
+ xts_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_xts_enc_armv8_ce
+ : _gcry_aes_xts_dec_armv8_ce;
+ unsigned int nrounds = ctx->rounds;
+
+ if ( !encrypt && !ctx->decryption_prepared )
+ {
+ _gcry_aes_armv8_ce_prepare_decryption ( ctx );
+ ctx->decryption_prepared = 1;
+ }
+
+ crypt_fn(keysched, outbuf, inbuf, tweak, nblocks, nrounds);
+}
+
+#endif /* USE_ARM_CE */
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-internal.h b/comm/third_party/libgcrypt/cipher/rijndael-internal.h
new file mode 100644
index 0000000000..7e01f6b057
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael-internal.h
@@ -0,0 +1,194 @@
+/* Rijndael (AES) for GnuPG
+ * Copyright (C) 2000, 2001, 2002, 2003, 2007,
+ * 2008, 2011, 2012 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef G10_RIJNDAEL_INTERNAL_H
+#define G10_RIJNDAEL_INTERNAL_H
+
+#include "types.h" /* for byte and u32 typedefs */
+
+
+#define MAXKC (256/32)
+#define MAXROUNDS 14
+#define BLOCKSIZE (128/8)
+
+
+/* Helper macro to force alignment to 16 or 64 bytes. */
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define ATTR_ALIGNED_16 __attribute__ ((aligned (16)))
+# define ATTR_ALIGNED_64 __attribute__ ((aligned (64)))
+#else
+# define ATTR_ALIGNED_16
+# define ATTR_ALIGNED_64
+#endif
+
+
+/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
+#undef USE_AMD64_ASM
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AMD64_ASM 1
+#endif
+
+/* USE_SSSE3 indicates whether to use SSSE3 code. */
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_SSSE3 1
+#endif
+
+/* USE_ARM_ASM indicates whether to use ARM assembly code. */
+#undef USE_ARM_ASM
+#if defined(__ARMEL__)
+# ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+# define USE_ARM_ASM 1
+# endif
+#endif
+#if defined(__AARCH64EL__)
+# ifdef HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS
+# define USE_ARM_ASM 1
+# endif
+#endif
+
+/* USE_PADLOCK indicates whether to compile the padlock specific
+ code. */
+#undef USE_PADLOCK
+#ifdef ENABLE_PADLOCK_SUPPORT
+# ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# if (defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__)
+# define USE_PADLOCK 1
+# endif
+# endif
+#endif /* ENABLE_PADLOCK_SUPPORT */
+
+/* USE_AESNI indicates whether to compile with Intel AES-NI code. We
+ need the vector-size attribute which seems to be available since
+ gcc 3. However, to be on the safe side we require at least gcc 4. */
+#undef USE_AESNI
+#ifdef ENABLE_AESNI_SUPPORT
+# if ((defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__))
+# if __GNUC__ >= 4
+# define USE_AESNI 1
+# endif
+# endif
+#endif /* ENABLE_AESNI_SUPPORT */
+
+/* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension assembly
+ * code. */
+#undef USE_ARM_CE
+#ifdef ENABLE_ARM_CRYPTO_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO)
+# define USE_ARM_CE 1
+# elif defined(__AARCH64EL__) \
+ && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+# define USE_ARM_CE 1
+# endif
+#endif /* ENABLE_ARM_CRYPTO_SUPPORT */
+
+/* USE_PPC_CRYPTO indicates whether to enable PowerPC vector crypto
+ * accelerated code. USE_PPC_CRYPTO_WITH_PPC9LE indicates whether to
+ * enable POWER9 optimized variant. */
+#undef USE_PPC_CRYPTO
+#undef USE_PPC_CRYPTO_WITH_PPC9LE
+#ifdef ENABLE_PPC_CRYPTO_SUPPORT
+# if defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+ defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC)
+# if __GNUC__ >= 4
+# define USE_PPC_CRYPTO 1
+# if !defined(WORDS_BIGENDIAN) && defined(HAVE_GCC_INLINE_ASM_PPC_ARCH_3_00)
+# define USE_PPC_CRYPTO_WITH_PPC9LE 1
+# endif
+# endif
+# endif
+#endif /* ENABLE_PPC_CRYPTO_SUPPORT */
+
+/* USE_S390X_CRYPTO indicates whether to enable zSeries code. */
+#undef USE_S390X_CRYPTO
+#if defined(HAVE_GCC_INLINE_ASM_S390X)
+# define USE_S390X_CRYPTO 1
+#endif /* USE_S390X_CRYPTO */
+
+struct RIJNDAEL_context_s;
+
+typedef unsigned int (*rijndael_cryptfn_t)(const struct RIJNDAEL_context_s *ctx,
+ unsigned char *bx,
+ const unsigned char *ax);
+typedef void (*rijndael_prefetchfn_t)(void);
+typedef void (*rijndael_prepare_decfn_t)(struct RIJNDAEL_context_s *ctx);
+
+/* Our context object. */
+typedef struct RIJNDAEL_context_s
+{
+ /* The first fields are the keyschedule arrays. This is so that
+ they are aligned on a 16 byte boundary if using gcc. This
+ alignment is required for the AES-NI code and a good idea in any
+ case. The alignment is guaranteed due to the way cipher.c
+ allocates the space for the context. The PROPERLY_ALIGNED_TYPE
+ hack is used to force a minimal alignment if not using gcc or if
+ the alignment requirement is higher than 16 bytes. */
+ union
+ {
+ PROPERLY_ALIGNED_TYPE dummy;
+ byte keyschedule[MAXROUNDS+1][4][4];
+ u32 keyschedule32[MAXROUNDS+1][4];
+#ifdef USE_PADLOCK
+ /* The key as passed to the padlock engine. It is only used if
+ the padlock engine is used (USE_PADLOCK, below). */
+ unsigned char padlock_key[16] __attribute__ ((aligned (16)));
+#endif /*USE_PADLOCK*/
+ } u1;
+ union
+ {
+ PROPERLY_ALIGNED_TYPE dummy;
+ byte keyschedule[MAXROUNDS+1][4][4];
+ u32 keyschedule32[MAXROUNDS+1][4];
+ } u2;
+ int rounds; /* Key-length-dependent number of rounds. */
+ unsigned int decryption_prepared:1; /* The decryption key schedule is available. */
+#ifdef USE_AESNI
+ unsigned int use_avx:1; /* AVX shall be used by AES-NI implementation. */
+ unsigned int use_avx2:1; /* AVX2 shall be used by AES-NI implementation. */
+#endif /*USE_AESNI*/
+#ifdef USE_S390X_CRYPTO
+ byte km_func;
+ byte km_func_xts;
+ byte kmc_func;
+ byte kmac_func;
+ byte kmf_func;
+ byte kmo_func;
+ byte kma_func;
+#endif /*USE_S390X_CRYPTO*/
+ rijndael_cryptfn_t encrypt_fn;
+ rijndael_cryptfn_t decrypt_fn;
+ rijndael_prefetchfn_t prefetch_enc_fn;
+ rijndael_prefetchfn_t prefetch_dec_fn;
+ rijndael_prepare_decfn_t prepare_decryption;
+} RIJNDAEL_context ATTR_ALIGNED_16;
+
+/* Macros defining alias for the keyschedules. */
+#define keyschenc u1.keyschedule
+#define keyschenc32 u1.keyschedule32
+#define keyschdec u2.keyschedule
+#define keyschdec32 u2.keyschedule32
+#define padlockkey u1.padlock_key
+
+#endif /* G10_RIJNDAEL_INTERNAL_H */
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-padlock.c b/comm/third_party/libgcrypt/cipher/rijndael-padlock.c
new file mode 100644
index 0000000000..3af214d74e
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael-padlock.c
@@ -0,0 +1,110 @@
+/* Padlock accelerated AES for Libgcrypt
+ * Copyright (C) 2000, 2001, 2002, 2003, 2007,
+ * 2008, 2011, 2012 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* for memcmp() */
+
+#include "types.h" /* for byte and u32 typedefs */
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-selftest.h"
+#include "rijndael-internal.h"
+
+#ifdef USE_PADLOCK
+
+/* Encrypt or decrypt one block using the padlock engine. A and B may
+ be the same. */
+static unsigned int
+do_padlock (const RIJNDAEL_context *ctx, unsigned char *bx,
+ const unsigned char *ax, int decrypt_flag)
+{
+ /* BX and AX are not necessarily correctly aligned. Thus we need to
+ copy them here. */
+ unsigned char a[16] __attribute__ ((aligned (16)));
+ unsigned char b[16] __attribute__ ((aligned (16)));
+ unsigned int cword[4] __attribute__ ((aligned (16)));
+ unsigned char *pa = a;
+ unsigned char *pb = b;
+ int blocks;
+
+ /* The control word fields are:
+ 127:12 11:10 9 8 7 6 5 4 3:0
+ RESERVED KSIZE CRYPT INTER KEYGN CIPHR ALIGN DGEST ROUND */
+ cword[0] = (ctx->rounds & 15); /* (The mask is just a safeguard.) */
+ cword[1] = 0;
+ cword[2] = 0;
+ cword[3] = 0;
+ if (decrypt_flag)
+ cword[0] |= 0x00000200;
+
+ memcpy (a, ax, 16);
+
+ blocks = 1; /* Init counter for just one block. */
+#ifdef __x86_64__
+ asm volatile
+ ("pushfq\n\t" /* Force key reload. */
+ "popfq\n\t"
+ ".byte 0xf3, 0x0f, 0xa7, 0xc8\n\t" /* REP XCRYPT ECB. */
+ : "+S" (pa), "+D" (pb), "+c" (blocks)
+ : "d" (cword), "b" (ctx->padlockkey)
+ : "cc", "memory"
+ );
+#else
+ asm volatile
+ ("pushfl\n\t" /* Force key reload. */
+ "popfl\n\t"
+ "xchg %4, %%ebx\n\t" /* Load key. */
+ ".byte 0xf3, 0x0f, 0xa7, 0xc8\n\t" /* REP XCRYPT ECB. */
+ "xchg %4, %%ebx\n" /* Restore GOT register. */
+ : "+S" (pa), "+D" (pb), "+c" (blocks)
+ : "d" (cword), "r" (ctx->padlockkey)
+ : "cc", "memory"
+ );
+#endif
+
+ memcpy (bx, b, 16);
+
+ return (48 + 15 /* possible padding for alignment */);
+}
+
+unsigned int
+_gcry_aes_padlock_encrypt (const RIJNDAEL_context *ctx,
+ unsigned char *bx, const unsigned char *ax)
+{
+ return do_padlock(ctx, bx, ax, 0);
+}
+
+unsigned int
+_gcry_aes_padlock_decrypt (const RIJNDAEL_context *ctx,
+ unsigned char *bx, const unsigned char *ax)
+{
+ return do_padlock(ctx, bx, ax, 1);
+}
+
+void
+_gcry_aes_padlock_prepare_decryption (RIJNDAEL_context *ctx)
+{
+ /* Padlock does not need decryption subkeys. */
+ (void)ctx;
+}
+#endif /* USE_PADLOCK */
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-ppc-common.h b/comm/third_party/libgcrypt/cipher/rijndael-ppc-common.h
new file mode 100644
index 0000000000..bbbeaac035
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael-ppc-common.h
@@ -0,0 +1,342 @@
+/* Rijndael (AES) for GnuPG - PowerPC Vector Crypto AES implementation
+ * Copyright (C) 2019 Shawn Landden <shawn@git.icu>
+ * Copyright (C) 2019-2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Alternatively, this code may be used in OpenSSL from The OpenSSL Project,
+ * and Cryptogams by Andy Polyakov, and if made part of a release of either
+ * or both projects, is thereafter dual-licensed under the license said project
+ * is released under.
+ */
+
+#ifndef G10_RIJNDAEL_PPC_COMMON_H
+#define G10_RIJNDAEL_PPC_COMMON_H
+
+#include <altivec.h>
+
+
+typedef vector unsigned char block;
+
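+/* Editor's note: the packed/aligned(1) attributes let 16-byte blocks be
+ * accessed through possibly unaligned caller buffers, and may_alias avoids
+ * strict-aliasing problems when byte pointers are cast to u128_t. */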
+typedef union
+{
+ u32 data32[4];
+} __attribute__((packed, aligned(1), may_alias)) u128_t;
+
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE
+#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
+
+
+#define ALIGNED_LOAD(in_ptr, offs) \
+ (asm_aligned_ld ((offs) * 16, (const void *)(in_ptr)))
+
+#define ALIGNED_STORE(out_ptr, offs, vec) \
+ (asm_aligned_st ((vec), (offs) * 16, (void *)(out_ptr)))
+
+#define VEC_BE_SWAP(vec, bige_const) (asm_be_swap ((vec), (bige_const)))
+
+#define VEC_LOAD_BE(in_ptr, offs, bige_const) \
+ (asm_be_swap (asm_load_be_noswap ((offs) * 16, (const void *)(in_ptr)), \
+ bige_const))
+
+#define VEC_LOAD_BE_NOSWAP(in_ptr, offs) \
+ (asm_load_be_noswap ((offs) * 16, (const unsigned char *)(in_ptr)))
+
+#define VEC_STORE_BE(out_ptr, offs, vec, bige_const) \
+ (asm_store_be_noswap (asm_be_swap ((vec), (bige_const)), (offs) * 16, \
+ (void *)(out_ptr)))
+
+#define VEC_STORE_BE_NOSWAP(out_ptr, offs, vec) \
+ (asm_store_be_noswap ((vec), (offs) * 16, (void *)(out_ptr)))
+
+
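+/* Only the first and last round keys are kept in variables here; the middle
+ * round keys are reloaded with ALIGNED_LOAD inside AES_ENCRYPT/AES_DECRYPT.
+ * The *_ALL variants further below preload every round key instead. */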
+#define ROUND_KEY_VARIABLES \
+ block rkey0, rkeylast
+
+#define PRELOAD_ROUND_KEYS(nrounds) \
+ do { \
+ rkey0 = ALIGNED_LOAD (rk, 0); \
+ rkeylast = ALIGNED_LOAD (rk, nrounds); \
+ } while (0)
+
+#define AES_ENCRYPT(blk, nrounds) \
+ do { \
+ blk ^= rkey0; \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 1)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 2)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 3)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 4)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 5)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 6)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 7)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 8)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 9)); \
+ if (nrounds >= 12) \
+ { \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 10)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 11)); \
+ if (nrounds > 12) \
+ { \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 12)); \
+ blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 13)); \
+ } \
+ } \
+ blk = asm_cipherlast_be (blk, rkeylast); \
+ } while (0)
+
+#define AES_DECRYPT(blk, nrounds) \
+ do { \
+ blk ^= rkey0; \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 1)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 2)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 3)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 4)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 5)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 6)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 7)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 8)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 9)); \
+ if (nrounds >= 12) \
+ { \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 10)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 11)); \
+ if (nrounds > 12) \
+ { \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 12)); \
+ blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 13)); \
+ } \
+ } \
+ blk = asm_ncipherlast_be (blk, rkeylast); \
+ } while (0)
+
+
+#define ROUND_KEY_VARIABLES_ALL \
+ block rkey0, rkey1, rkey2, rkey3, rkey4, rkey5, rkey6, rkey7, rkey8, \
+ rkey9, rkey10, rkey11, rkey12, rkey13, rkeylast
+
+#define PRELOAD_ROUND_KEYS_ALL(nrounds) \
+ do { \
+ rkey0 = ALIGNED_LOAD (rk, 0); \
+ rkey1 = ALIGNED_LOAD (rk, 1); \
+ rkey2 = ALIGNED_LOAD (rk, 2); \
+ rkey3 = ALIGNED_LOAD (rk, 3); \
+ rkey4 = ALIGNED_LOAD (rk, 4); \
+ rkey5 = ALIGNED_LOAD (rk, 5); \
+ rkey6 = ALIGNED_LOAD (rk, 6); \
+ rkey7 = ALIGNED_LOAD (rk, 7); \
+ rkey8 = ALIGNED_LOAD (rk, 8); \
+ rkey9 = ALIGNED_LOAD (rk, 9); \
+ if (nrounds >= 12) \
+ { \
+ rkey10 = ALIGNED_LOAD (rk, 10); \
+ rkey11 = ALIGNED_LOAD (rk, 11); \
+ if (nrounds > 12) \
+ { \
+ rkey12 = ALIGNED_LOAD (rk, 12); \
+ rkey13 = ALIGNED_LOAD (rk, 13); \
+ } \
+ } \
+ rkeylast = ALIGNED_LOAD (rk, nrounds); \
+ } while (0)
+
+#define AES_ENCRYPT_ALL(blk, nrounds) \
+ do { \
+ blk ^= rkey0; \
+ blk = asm_cipher_be (blk, rkey1); \
+ blk = asm_cipher_be (blk, rkey2); \
+ blk = asm_cipher_be (blk, rkey3); \
+ blk = asm_cipher_be (blk, rkey4); \
+ blk = asm_cipher_be (blk, rkey5); \
+ blk = asm_cipher_be (blk, rkey6); \
+ blk = asm_cipher_be (blk, rkey7); \
+ blk = asm_cipher_be (blk, rkey8); \
+ blk = asm_cipher_be (blk, rkey9); \
+ if (nrounds >= 12) \
+ { \
+ blk = asm_cipher_be (blk, rkey10); \
+ blk = asm_cipher_be (blk, rkey11); \
+ if (nrounds > 12) \
+ { \
+ blk = asm_cipher_be (blk, rkey12); \
+ blk = asm_cipher_be (blk, rkey13); \
+ } \
+ } \
+ blk = asm_cipherlast_be (blk, rkeylast); \
+ } while (0)
+
+
+static ASM_FUNC_ATTR_INLINE block
+asm_aligned_ld(unsigned long offset, const void *ptr)
+{
+ block vec;
+#if __GNUC__ >= 4
+ if (__builtin_constant_p (offset) && offset == 0)
+ __asm__ volatile ("lvx %0,0,%1\n\t"
+ : "=v" (vec)
+ : "r" ((uintptr_t)ptr)
+ : "memory");
+ else
+#endif
+ __asm__ volatile ("lvx %0,%1,%2\n\t"
+ : "=v" (vec)
+ : "r" (offset), "r" ((uintptr_t)ptr)
+ : "memory", "r0");
+ return vec;
+}
+
+static ASM_FUNC_ATTR_INLINE void
+asm_aligned_st(block vec, unsigned long offset, void *ptr)
+{
+#if __GNUC__ >= 4
+ if (__builtin_constant_p (offset) && offset == 0)
+ __asm__ volatile ("stvx %0,0,%1\n\t"
+ :
+ : "v" (vec), "r" ((uintptr_t)ptr)
+ : "memory");
+ else
+#endif
+ __asm__ volatile ("stvx %0,%1,%2\n\t"
+ :
+ : "v" (vec), "r" (offset), "r" ((uintptr_t)ptr)
+ : "memory", "r0");
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_vperm1(block vec, block mask)
+{
+ block o;
+ __asm__ volatile ("vperm %0,%1,%1,%2\n\t"
+ : "=v" (o)
+ : "v" (vec), "v" (mask));
+ return o;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_add_uint128(block a, block b)
+{
+ block res;
+ __asm__ volatile ("vadduqm %0,%1,%2\n\t"
+ : "=v" (res)
+ : "v" (a), "v" (b));
+ return res;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_add_uint64(block a, block b)
+{
+ block res;
+ __asm__ volatile ("vaddudm %0,%1,%2\n\t"
+ : "=v" (res)
+ : "v" (a), "v" (b));
+ return res;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_sra_int64(block a, block b)
+{
+ block res;
+ __asm__ volatile ("vsrad %0,%1,%2\n\t"
+ : "=v" (res)
+ : "v" (a), "v" (b));
+ return res;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_swap_uint64_halfs(block a)
+{
+ block res;
+ __asm__ volatile ("xxswapd %x0, %x1"
+ : "=wa" (res)
+ : "wa" (a));
+ return res;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_xor(block a, block b)
+{
+ block res;
+ __asm__ volatile ("vxor %0,%1,%2\n\t"
+ : "=v" (res)
+ : "v" (a), "v" (b));
+ return res;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_cipher_be(block b, block rk)
+{
+ block o;
+ __asm__ volatile ("vcipher %0, %1, %2\n\t"
+ : "=v" (o)
+ : "v" (b), "v" (rk));
+ return o;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_cipherlast_be(block b, block rk)
+{
+ block o;
+ __asm__ volatile ("vcipherlast %0, %1, %2\n\t"
+ : "=v" (o)
+ : "v" (b), "v" (rk));
+ return o;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_ncipher_be(block b, block rk)
+{
+ block o;
+ __asm__ volatile ("vncipher %0, %1, %2\n\t"
+ : "=v" (o)
+ : "v" (b), "v" (rk));
+ return o;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_ncipherlast_be(block b, block rk)
+{
+ block o;
+ __asm__ volatile ("vncipherlast %0, %1, %2\n\t"
+ : "=v" (o)
+ : "v" (b), "v" (rk));
+ return o;
+}
+
+
+/* Make a decryption key from an encryption key. */
+static ASM_FUNC_ATTR_INLINE void
+internal_aes_ppc_prepare_decryption (RIJNDAEL_context *ctx)
+{
+ u128_t *ekey = (u128_t *)(void *)ctx->keyschenc;
+ u128_t *dkey = (u128_t *)(void *)ctx->keyschdec;
+ int rounds = ctx->rounds;
+ int rr;
+ int r;
+
+ for (r = 0, rr = rounds; r <= rounds; r++, rr--)
+ {
+ ALIGNED_STORE (dkey, r, ALIGNED_LOAD (ekey, rr));
+ }
+}
+
+#endif /* G10_RIJNDAEL_PPC_COMMON_H */
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-ppc-functions.h b/comm/third_party/libgcrypt/cipher/rijndael-ppc-functions.h
new file mode 100644
index 0000000000..72f31852b4
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael-ppc-functions.h
@@ -0,0 +1,2020 @@
+/* Rijndael (AES) for GnuPG - PowerPC Vector Crypto AES implementation
+ * Copyright (C) 2019 Shawn Landden <shawn@git.icu>
+ * Copyright (C) 2019-2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Alternatively, this code may be used in OpenSSL from The OpenSSL Project,
+ * and Cryptogams by Andy Polyakov, and if made part of a release of either
+ * or both projects, is thereafter dual-licensed under the license said project
+ * is released under.
+ */
+
+unsigned int ENCRYPT_BLOCK_FUNC (const RIJNDAEL_context *ctx,
+ unsigned char *out,
+ const unsigned char *in)
+{
+ const block bige_const = asm_load_be_const();
+ const u128_t *rk = (u128_t *)&ctx->keyschenc;
+ int rounds = ctx->rounds;
+ ROUND_KEY_VARIABLES;
+ block b;
+
+ b = VEC_LOAD_BE (in, 0, bige_const);
+
+ PRELOAD_ROUND_KEYS (rounds);
+
+ AES_ENCRYPT (b, rounds);
+ VEC_STORE_BE (out, 0, b, bige_const);
+
+ return 0; /* does not use stack */
+}
+
+
+unsigned int DECRYPT_BLOCK_FUNC (const RIJNDAEL_context *ctx,
+ unsigned char *out,
+ const unsigned char *in)
+{
+ const block bige_const = asm_load_be_const();
+ const u128_t *rk = (u128_t *)&ctx->keyschdec;
+ int rounds = ctx->rounds;
+ ROUND_KEY_VARIABLES;
+ block b;
+
+ b = VEC_LOAD_BE (in, 0, bige_const);
+
+ PRELOAD_ROUND_KEYS (rounds);
+
+ AES_DECRYPT (b, rounds);
+ VEC_STORE_BE (out, 0, b, bige_const);
+
+ return 0; /* does not use stack */
+}
+
+
+void CFB_ENC_FUNC (void *context, unsigned char *iv_arg,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ const block bige_const = asm_load_be_const();
+ RIJNDAEL_context *ctx = context;
+ const u128_t *rk = (u128_t *)&ctx->keyschenc;
+ const u128_t *in = (const u128_t *)inbuf_arg;
+ u128_t *out = (u128_t *)outbuf_arg;
+ int rounds = ctx->rounds;
+ ROUND_KEY_VARIABLES_ALL;
+ block rkeylast_orig;
+ block iv;
+
+ iv = VEC_LOAD_BE (iv_arg, 0, bige_const);
+
+ PRELOAD_ROUND_KEYS_ALL (rounds);
+ rkeylast_orig = rkeylast;
+
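+ /* The plaintext block is folded into the last round key, so the final
+ * vcipherlast both finishes encrypting the IV and XORs in the plaintext,
+ * yielding the ciphertext (and next IV) in a single step. */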
+ for (; nblocks >= 2; nblocks -= 2)
+ {
+ block in2, iv1;
+
+ rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, 0, bige_const);
+ in2 = VEC_LOAD_BE (in + 1, 0, bige_const);
+ in += 2;
+
+ AES_ENCRYPT_ALL (iv, rounds);
+
+ iv1 = iv;
+ rkeylast = rkeylast_orig ^ in2;
+
+ AES_ENCRYPT_ALL (iv, rounds);
+
+ VEC_STORE_BE (out++, 0, iv1, bige_const);
+ VEC_STORE_BE (out++, 0, iv, bige_const);
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in++, 0, bige_const);
+
+ AES_ENCRYPT_ALL (iv, rounds);
+
+ VEC_STORE_BE (out++, 0, iv, bige_const);
+ }
+
+ VEC_STORE_BE (iv_arg, 0, iv, bige_const);
+}
+
+void CFB_DEC_FUNC (void *context, unsigned char *iv_arg,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ const block bige_const = asm_load_be_const();
+ RIJNDAEL_context *ctx = context;
+ const u128_t *rk = (u128_t *)&ctx->keyschenc;
+ const u128_t *in = (const u128_t *)inbuf_arg;
+ u128_t *out = (u128_t *)outbuf_arg;
+ int rounds = ctx->rounds;
+ ROUND_KEY_VARIABLES;
+ block rkeylast_orig;
+ block iv, b, bin;
+ block in0, in1, in2, in3, in4, in5, in6, in7;
+ block b0, b1, b2, b3, b4, b5, b6, b7;
+ block rkey;
+
+ iv = VEC_LOAD_BE (iv_arg, 0, bige_const);
+
+ PRELOAD_ROUND_KEYS (rounds);
+ rkeylast_orig = rkeylast;
+
+ for (; nblocks >= 8; nblocks -= 8)
+ {
+ in0 = iv;
+ in1 = VEC_LOAD_BE_NOSWAP (in, 0);
+ in2 = VEC_LOAD_BE_NOSWAP (in, 1);
+ in3 = VEC_LOAD_BE_NOSWAP (in, 2);
+ in4 = VEC_LOAD_BE_NOSWAP (in, 3);
+ in1 = VEC_BE_SWAP (in1, bige_const);
+ in2 = VEC_BE_SWAP (in2, bige_const);
+ in5 = VEC_LOAD_BE_NOSWAP (in, 4);
+ in6 = VEC_LOAD_BE_NOSWAP (in, 5);
+ in3 = VEC_BE_SWAP (in3, bige_const);
+ in4 = VEC_BE_SWAP (in4, bige_const);
+ in7 = VEC_LOAD_BE_NOSWAP (in, 6);
+ iv = VEC_LOAD_BE_NOSWAP (in, 7);
+ in += 8;
+ in5 = VEC_BE_SWAP (in5, bige_const);
+ in6 = VEC_BE_SWAP (in6, bige_const);
+ b0 = asm_xor (rkey0, in0);
+ b1 = asm_xor (rkey0, in1);
+ in7 = VEC_BE_SWAP (in7, bige_const);
+ iv = VEC_BE_SWAP (iv, bige_const);
+ b2 = asm_xor (rkey0, in2);
+ b3 = asm_xor (rkey0, in3);
+ b4 = asm_xor (rkey0, in4);
+ b5 = asm_xor (rkey0, in5);
+ b6 = asm_xor (rkey0, in6);
+ b7 = asm_xor (rkey0, in7);
+
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey); \
+ b4 = asm_cipher_be (b4, rkey); \
+ b5 = asm_cipher_be (b5, rkey); \
+ b6 = asm_cipher_be (b6, rkey); \
+ b7 = asm_cipher_be (b7, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+ DO_ROUND(8);
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ in1 = asm_xor (rkeylast, in1);
+ in2 = asm_xor (rkeylast, in2);
+ in3 = asm_xor (rkeylast, in3);
+ in4 = asm_xor (rkeylast, in4);
+ b0 = asm_cipherlast_be (b0, in1);
+ b1 = asm_cipherlast_be (b1, in2);
+ in5 = asm_xor (rkeylast, in5);
+ in6 = asm_xor (rkeylast, in6);
+ b2 = asm_cipherlast_be (b2, in3);
+ b3 = asm_cipherlast_be (b3, in4);
+ in7 = asm_xor (rkeylast, in7);
+ in0 = asm_xor (rkeylast, iv);
+ b0 = VEC_BE_SWAP (b0, bige_const);
+ b1 = VEC_BE_SWAP (b1, bige_const);
+ b4 = asm_cipherlast_be (b4, in5);
+ b5 = asm_cipherlast_be (b5, in6);
+ b2 = VEC_BE_SWAP (b2, bige_const);
+ b3 = VEC_BE_SWAP (b3, bige_const);
+ b6 = asm_cipherlast_be (b6, in7);
+ b7 = asm_cipherlast_be (b7, in0);
+ b4 = VEC_BE_SWAP (b4, bige_const);
+ b5 = VEC_BE_SWAP (b5, bige_const);
+ b6 = VEC_BE_SWAP (b6, bige_const);
+ b7 = VEC_BE_SWAP (b7, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 0, b0);
+ VEC_STORE_BE_NOSWAP (out, 1, b1);
+ VEC_STORE_BE_NOSWAP (out, 2, b2);
+ VEC_STORE_BE_NOSWAP (out, 3, b3);
+ VEC_STORE_BE_NOSWAP (out, 4, b4);
+ VEC_STORE_BE_NOSWAP (out, 5, b5);
+ VEC_STORE_BE_NOSWAP (out, 6, b6);
+ VEC_STORE_BE_NOSWAP (out, 7, b7);
+ out += 8;
+ }
+
+ if (nblocks >= 4)
+ {
+ in0 = iv;
+ in1 = VEC_LOAD_BE (in, 0, bige_const);
+ in2 = VEC_LOAD_BE (in, 1, bige_const);
+ in3 = VEC_LOAD_BE (in, 2, bige_const);
+ iv = VEC_LOAD_BE (in, 3, bige_const);
+
+ b0 = asm_xor (rkey0, in0);
+ b1 = asm_xor (rkey0, in1);
+ b2 = asm_xor (rkey0, in2);
+ b3 = asm_xor (rkey0, in3);
+
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+ DO_ROUND(8);
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ in1 = asm_xor (rkeylast, in1);
+ in2 = asm_xor (rkeylast, in2);
+ in3 = asm_xor (rkeylast, in3);
+ in0 = asm_xor (rkeylast, iv);
+ b0 = asm_cipherlast_be (b0, in1);
+ b1 = asm_cipherlast_be (b1, in2);
+ b2 = asm_cipherlast_be (b2, in3);
+ b3 = asm_cipherlast_be (b3, in0);
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);
+
+ in += 4;
+ out += 4;
+ nblocks -= 4;
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ bin = VEC_LOAD_BE (in, 0, bige_const);
+ rkeylast = rkeylast_orig ^ bin;
+ b = iv;
+ iv = bin;
+
+ AES_ENCRYPT (b, rounds);
+
+ VEC_STORE_BE (out, 0, b, bige_const);
+
+ out++;
+ in++;
+ }
+
+ VEC_STORE_BE (iv_arg, 0, iv, bige_const);
+}
+
+
+void CBC_ENC_FUNC (void *context, unsigned char *iv_arg,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int cbc_mac)
+{
+ const block bige_const = asm_load_be_const();
+ RIJNDAEL_context *ctx = context;
+ const u128_t *rk = (u128_t *)&ctx->keyschenc;
+ const u128_t *in = (const u128_t *)inbuf_arg;
+ byte *out = (byte *)outbuf_arg;
+ int rounds = ctx->rounds;
+ ROUND_KEY_VARIABLES_ALL;
+ block lastiv, b;
+ unsigned int outadd = -(!cbc_mac) & 16;
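+ /* outadd is 16 in normal CBC mode and 0 for CBC-MAC, where the output
+ * pointer is not advanced so only the final MAC block is written. */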
+
+ lastiv = VEC_LOAD_BE (iv_arg, 0, bige_const);
+
+ PRELOAD_ROUND_KEYS_ALL (rounds);
+
+ for (; nblocks >= 2; nblocks -= 2)
+ {
+ block in2, lastiv1;
+
+ b = lastiv ^ VEC_LOAD_BE (in, 0, bige_const);
+ in2 = VEC_LOAD_BE (in + 1, 0, bige_const);
+ in += 2;
+
+ AES_ENCRYPT_ALL (b, rounds);
+
+ lastiv1 = b;
+ b = lastiv1 ^ in2;
+
+ AES_ENCRYPT_ALL (b, rounds);
+
+ lastiv = b;
+ VEC_STORE_BE ((u128_t *)out, 0, lastiv1, bige_const);
+ out += outadd;
+ VEC_STORE_BE ((u128_t *)out, 0, lastiv, bige_const);
+ out += outadd;
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ b = lastiv ^ VEC_LOAD_BE (in++, 0, bige_const);
+
+ AES_ENCRYPT_ALL (b, rounds);
+
+ lastiv = b;
+ VEC_STORE_BE ((u128_t *)out, 0, b, bige_const);
+ out += outadd;
+ }
+
+ VEC_STORE_BE (iv_arg, 0, lastiv, bige_const);
+}
+
+void CBC_DEC_FUNC (void *context, unsigned char *iv_arg,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ const block bige_const = asm_load_be_const();
+ RIJNDAEL_context *ctx = context;
+ const u128_t *rk = (u128_t *)&ctx->keyschdec;
+ const u128_t *in = (const u128_t *)inbuf_arg;
+ u128_t *out = (u128_t *)outbuf_arg;
+ int rounds = ctx->rounds;
+ ROUND_KEY_VARIABLES;
+ block rkeylast_orig;
+ block in0, in1, in2, in3, in4, in5, in6, in7;
+ block b0, b1, b2, b3, b4, b5, b6, b7;
+ block rkey;
+ block iv, b;
+
+ if (!ctx->decryption_prepared)
+ {
+ internal_aes_ppc_prepare_decryption (ctx);
+ ctx->decryption_prepared = 1;
+ }
+
+ iv = VEC_LOAD_BE (iv_arg, 0, bige_const);
+
+ PRELOAD_ROUND_KEYS (rounds);
+ rkeylast_orig = rkeylast;
+
+ for (; nblocks >= 8; nblocks -= 8)
+ {
+ in0 = VEC_LOAD_BE_NOSWAP (in, 0);
+ in1 = VEC_LOAD_BE_NOSWAP (in, 1);
+ in2 = VEC_LOAD_BE_NOSWAP (in, 2);
+ in3 = VEC_LOAD_BE_NOSWAP (in, 3);
+ in0 = VEC_BE_SWAP (in0, bige_const);
+ in1 = VEC_BE_SWAP (in1, bige_const);
+ in4 = VEC_LOAD_BE_NOSWAP (in, 4);
+ in5 = VEC_LOAD_BE_NOSWAP (in, 5);
+ in2 = VEC_BE_SWAP (in2, bige_const);
+ in3 = VEC_BE_SWAP (in3, bige_const);
+ in6 = VEC_LOAD_BE_NOSWAP (in, 6);
+ in7 = VEC_LOAD_BE_NOSWAP (in, 7);
+ in += 8;
+ b0 = asm_xor (rkey0, in0);
+ b1 = asm_xor (rkey0, in1);
+ in4 = VEC_BE_SWAP (in4, bige_const);
+ in5 = VEC_BE_SWAP (in5, bige_const);
+ b2 = asm_xor (rkey0, in2);
+ b3 = asm_xor (rkey0, in3);
+ in6 = VEC_BE_SWAP (in6, bige_const);
+ in7 = VEC_BE_SWAP (in7, bige_const);
+ b4 = asm_xor (rkey0, in4);
+ b5 = asm_xor (rkey0, in5);
+ b6 = asm_xor (rkey0, in6);
+ b7 = asm_xor (rkey0, in7);
+
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_ncipher_be (b0, rkey); \
+ b1 = asm_ncipher_be (b1, rkey); \
+ b2 = asm_ncipher_be (b2, rkey); \
+ b3 = asm_ncipher_be (b3, rkey); \
+ b4 = asm_ncipher_be (b4, rkey); \
+ b5 = asm_ncipher_be (b5, rkey); \
+ b6 = asm_ncipher_be (b6, rkey); \
+ b7 = asm_ncipher_be (b7, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+ DO_ROUND(8);
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ iv = asm_xor (rkeylast, iv);
+ in0 = asm_xor (rkeylast, in0);
+ in1 = asm_xor (rkeylast, in1);
+ in2 = asm_xor (rkeylast, in2);
+ b0 = asm_ncipherlast_be (b0, iv);
+ iv = in7;
+ b1 = asm_ncipherlast_be (b1, in0);
+ in3 = asm_xor (rkeylast, in3);
+ in4 = asm_xor (rkeylast, in4);
+ b2 = asm_ncipherlast_be (b2, in1);
+ b3 = asm_ncipherlast_be (b3, in2);
+ in5 = asm_xor (rkeylast, in5);
+ in6 = asm_xor (rkeylast, in6);
+ b0 = VEC_BE_SWAP (b0, bige_const);
+ b1 = VEC_BE_SWAP (b1, bige_const);
+ b4 = asm_ncipherlast_be (b4, in3);
+ b5 = asm_ncipherlast_be (b5, in4);
+ b2 = VEC_BE_SWAP (b2, bige_const);
+ b3 = VEC_BE_SWAP (b3, bige_const);
+ b6 = asm_ncipherlast_be (b6, in5);
+ b7 = asm_ncipherlast_be (b7, in6);
+ b4 = VEC_BE_SWAP (b4, bige_const);
+ b5 = VEC_BE_SWAP (b5, bige_const);
+ b6 = VEC_BE_SWAP (b6, bige_const);
+ b7 = VEC_BE_SWAP (b7, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 0, b0);
+ VEC_STORE_BE_NOSWAP (out, 1, b1);
+ VEC_STORE_BE_NOSWAP (out, 2, b2);
+ VEC_STORE_BE_NOSWAP (out, 3, b3);
+ VEC_STORE_BE_NOSWAP (out, 4, b4);
+ VEC_STORE_BE_NOSWAP (out, 5, b5);
+ VEC_STORE_BE_NOSWAP (out, 6, b6);
+ VEC_STORE_BE_NOSWAP (out, 7, b7);
+ out += 8;
+ }
+
+ if (nblocks >= 4)
+ {
+ in0 = VEC_LOAD_BE (in, 0, bige_const);
+ in1 = VEC_LOAD_BE (in, 1, bige_const);
+ in2 = VEC_LOAD_BE (in, 2, bige_const);
+ in3 = VEC_LOAD_BE (in, 3, bige_const);
+
+ b0 = asm_xor (rkey0, in0);
+ b1 = asm_xor (rkey0, in1);
+ b2 = asm_xor (rkey0, in2);
+ b3 = asm_xor (rkey0, in3);
+
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_ncipher_be (b0, rkey); \
+ b1 = asm_ncipher_be (b1, rkey); \
+ b2 = asm_ncipher_be (b2, rkey); \
+ b3 = asm_ncipher_be (b3, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+ DO_ROUND(8);
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ iv = asm_xor (rkeylast, iv);
+ in0 = asm_xor (rkeylast, in0);
+ in1 = asm_xor (rkeylast, in1);
+ in2 = asm_xor (rkeylast, in2);
+
+ b0 = asm_ncipherlast_be (b0, iv);
+ iv = in3;
+ b1 = asm_ncipherlast_be (b1, in0);
+ b2 = asm_ncipherlast_be (b2, in1);
+ b3 = asm_ncipherlast_be (b3, in2);
+
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);
+
+ in += 4;
+ out += 4;
+ nblocks -= 4;
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ rkeylast = rkeylast_orig ^ iv;
+
+ iv = VEC_LOAD_BE (in, 0, bige_const);
+ b = iv;
+ AES_DECRYPT (b, rounds);
+
+ VEC_STORE_BE (out, 0, b, bige_const);
+
+ in++;
+ out++;
+ }
+
+ VEC_STORE_BE (iv_arg, 0, iv, bige_const);
+}
+
+
+void CTR_ENC_FUNC (void *context, unsigned char *ctr_arg,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ static const unsigned char vec_one_const[16] =
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 };
+ const block bige_const = asm_load_be_const();
+ RIJNDAEL_context *ctx = context;
+ const u128_t *rk = (u128_t *)&ctx->keyschenc;
+ const u128_t *in = (const u128_t *)inbuf_arg;
+ u128_t *out = (u128_t *)outbuf_arg;
+ int rounds = ctx->rounds;
+ ROUND_KEY_VARIABLES;
+ block rkeylast_orig;
+ block ctr, b, one;
+
+ ctr = VEC_LOAD_BE (ctr_arg, 0, bige_const);
+ one = VEC_LOAD_BE (&vec_one_const, 0, bige_const);
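+ /* asm_add_uint128 (vadduqm) increments the big-endian counter as a full
+ * 128-bit integer, so carries propagate across the whole block. */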
+
+ PRELOAD_ROUND_KEYS (rounds);
+ rkeylast_orig = rkeylast;
+
+ if (nblocks >= 4)
+ {
+ block in0, in1, in2, in3, in4, in5, in6, in7;
+ block b0, b1, b2, b3, b4, b5, b6, b7;
+ block two, three, four;
+ block rkey;
+
+ two = asm_add_uint128 (one, one);
+ three = asm_add_uint128 (two, one);
+ four = asm_add_uint128 (two, two);
+
+ for (; nblocks >= 8; nblocks -= 8)
+ {
+ b1 = asm_add_uint128 (ctr, one);
+ b2 = asm_add_uint128 (ctr, two);
+ b3 = asm_add_uint128 (ctr, three);
+ b4 = asm_add_uint128 (ctr, four);
+ b5 = asm_add_uint128 (b1, four);
+ b6 = asm_add_uint128 (b2, four);
+ b7 = asm_add_uint128 (b3, four);
+ b0 = asm_xor (rkey0, ctr);
+ rkey = ALIGNED_LOAD (rk, 1);
+ ctr = asm_add_uint128 (b4, four);
+ b1 = asm_xor (rkey0, b1);
+ b2 = asm_xor (rkey0, b2);
+ b3 = asm_xor (rkey0, b3);
+ b0 = asm_cipher_be (b0, rkey);
+ b1 = asm_cipher_be (b1, rkey);
+ b2 = asm_cipher_be (b2, rkey);
+ b3 = asm_cipher_be (b3, rkey);
+ b4 = asm_xor (rkey0, b4);
+ b5 = asm_xor (rkey0, b5);
+ b6 = asm_xor (rkey0, b6);
+ b7 = asm_xor (rkey0, b7);
+ b4 = asm_cipher_be (b4, rkey);
+ b5 = asm_cipher_be (b5, rkey);
+ b6 = asm_cipher_be (b6, rkey);
+ b7 = asm_cipher_be (b7, rkey);
+
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey); \
+ b4 = asm_cipher_be (b4, rkey); \
+ b5 = asm_cipher_be (b5, rkey); \
+ b6 = asm_cipher_be (b6, rkey); \
+ b7 = asm_cipher_be (b7, rkey);
+
+ in0 = VEC_LOAD_BE_NOSWAP (in, 0);
+ DO_ROUND(2);
+ in1 = VEC_LOAD_BE_NOSWAP (in, 1);
+ DO_ROUND(3);
+ in2 = VEC_LOAD_BE_NOSWAP (in, 2);
+ DO_ROUND(4);
+ in3 = VEC_LOAD_BE_NOSWAP (in, 3);
+ DO_ROUND(5);
+ in4 = VEC_LOAD_BE_NOSWAP (in, 4);
+ DO_ROUND(6);
+ in5 = VEC_LOAD_BE_NOSWAP (in, 5);
+ DO_ROUND(7);
+ in6 = VEC_LOAD_BE_NOSWAP (in, 6);
+ DO_ROUND(8);
+ in7 = VEC_LOAD_BE_NOSWAP (in, 7);
+ in += 8;
+ DO_ROUND(9);
+
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ in0 = VEC_BE_SWAP (in0, bige_const);
+ in1 = VEC_BE_SWAP (in1, bige_const);
+ in2 = VEC_BE_SWAP (in2, bige_const);
+ in3 = VEC_BE_SWAP (in3, bige_const);
+ in4 = VEC_BE_SWAP (in4, bige_const);
+ in5 = VEC_BE_SWAP (in5, bige_const);
+ in6 = VEC_BE_SWAP (in6, bige_const);
+ in7 = VEC_BE_SWAP (in7, bige_const);
+
+ in0 = asm_xor (rkeylast, in0);
+ in1 = asm_xor (rkeylast, in1);
+ in2 = asm_xor (rkeylast, in2);
+ in3 = asm_xor (rkeylast, in3);
+ b0 = asm_cipherlast_be (b0, in0);
+ b1 = asm_cipherlast_be (b1, in1);
+ in4 = asm_xor (rkeylast, in4);
+ in5 = asm_xor (rkeylast, in5);
+ b2 = asm_cipherlast_be (b2, in2);
+ b3 = asm_cipherlast_be (b3, in3);
+ in6 = asm_xor (rkeylast, in6);
+ in7 = asm_xor (rkeylast, in7);
+ b4 = asm_cipherlast_be (b4, in4);
+ b5 = asm_cipherlast_be (b5, in5);
+ b6 = asm_cipherlast_be (b6, in6);
+ b7 = asm_cipherlast_be (b7, in7);
+
+ b0 = VEC_BE_SWAP (b0, bige_const);
+ b1 = VEC_BE_SWAP (b1, bige_const);
+ b2 = VEC_BE_SWAP (b2, bige_const);
+ b3 = VEC_BE_SWAP (b3, bige_const);
+ b4 = VEC_BE_SWAP (b4, bige_const);
+ b5 = VEC_BE_SWAP (b5, bige_const);
+ b6 = VEC_BE_SWAP (b6, bige_const);
+ b7 = VEC_BE_SWAP (b7, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 0, b0);
+ VEC_STORE_BE_NOSWAP (out, 1, b1);
+ VEC_STORE_BE_NOSWAP (out, 2, b2);
+ VEC_STORE_BE_NOSWAP (out, 3, b3);
+ VEC_STORE_BE_NOSWAP (out, 4, b4);
+ VEC_STORE_BE_NOSWAP (out, 5, b5);
+ VEC_STORE_BE_NOSWAP (out, 6, b6);
+ VEC_STORE_BE_NOSWAP (out, 7, b7);
+ out += 8;
+ }
+
+ if (nblocks >= 4)
+ {
+ b1 = asm_add_uint128 (ctr, one);
+ b2 = asm_add_uint128 (ctr, two);
+ b3 = asm_add_uint128 (ctr, three);
+ b0 = asm_xor (rkey0, ctr);
+ ctr = asm_add_uint128 (ctr, four);
+ b1 = asm_xor (rkey0, b1);
+ b2 = asm_xor (rkey0, b2);
+ b3 = asm_xor (rkey0, b3);
+
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+ DO_ROUND(8);
+
+ in0 = VEC_LOAD_BE (in, 0, bige_const);
+ in1 = VEC_LOAD_BE (in, 1, bige_const);
+ in2 = VEC_LOAD_BE (in, 2, bige_const);
+ in3 = VEC_LOAD_BE (in, 3, bige_const);
+
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ in0 = asm_xor (rkeylast, in0);
+ in1 = asm_xor (rkeylast, in1);
+ in2 = asm_xor (rkeylast, in2);
+ in3 = asm_xor (rkeylast, in3);
+
+ b0 = asm_cipherlast_be (b0, in0);
+ b1 = asm_cipherlast_be (b1, in1);
+ b2 = asm_cipherlast_be (b2, in2);
+ b3 = asm_cipherlast_be (b3, in3);
+
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);
+
+ in += 4;
+ out += 4;
+ nblocks -= 4;
+ }
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ b = ctr;
+ ctr = asm_add_uint128 (ctr, one);
+ rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, 0, bige_const);
+
+ AES_ENCRYPT (b, rounds);
+
+ VEC_STORE_BE (out, 0, b, bige_const);
+
+ out++;
+ in++;
+ }
+
+ VEC_STORE_BE (ctr_arg, 0, ctr, bige_const);
+}
+
+
+size_t OCB_CRYPT_FUNC (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt)
+{
+ const block bige_const = asm_load_be_const();
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ const u128_t *in = (const u128_t *)inbuf_arg;
+ u128_t *out = (u128_t *)outbuf_arg;
+ int rounds = ctx->rounds;
+ u64 data_nblocks = c->u_mode.ocb.data_nblocks;
+ block l0, l1, l2, l;
+ block b0, b1, b2, b3, b4, b5, b6, b7, b;
+ block iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7;
+ block rkey, rkeylf;
+ block ctr, iv;
+ ROUND_KEY_VARIABLES;
+
+ iv = VEC_LOAD_BE (c->u_iv.iv, 0, bige_const);
+ ctr = VEC_LOAD_BE (c->u_ctr.ctr, 0, bige_const);
+
+ l0 = VEC_LOAD_BE (c->u_mode.ocb.L[0], 0, bige_const);
+ l1 = VEC_LOAD_BE (c->u_mode.ocb.L[1], 0, bige_const);
+ l2 = VEC_LOAD_BE (c->u_mode.ocb.L[2], 0, bige_const);
+
+ if (encrypt)
+ {
+ const u128_t *rk = (u128_t *)&ctx->keyschenc;
+
+ PRELOAD_ROUND_KEYS (rounds);
+
+ for (; nblocks >= 8 && data_nblocks % 8; nblocks--)
+ {
+ l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+ b = VEC_LOAD_BE (in, 0, bige_const);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ iv ^= l;
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ ctr ^= b;
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ b ^= iv;
+ AES_ENCRYPT (b, rounds);
+ b ^= iv;
+
+ VEC_STORE_BE (out, 0, b, bige_const);
+
+ in += 1;
+ out += 1;
+ }
+
+ for (; nblocks >= 8; nblocks -= 8)
+ {
+ b0 = VEC_LOAD_BE_NOSWAP (in, 0);
+ b1 = VEC_LOAD_BE_NOSWAP (in, 1);
+ b2 = VEC_LOAD_BE_NOSWAP (in, 2);
+ b3 = VEC_LOAD_BE_NOSWAP (in, 3);
+ b4 = VEC_LOAD_BE_NOSWAP (in, 4);
+ b5 = VEC_LOAD_BE_NOSWAP (in, 5);
+ b6 = VEC_LOAD_BE_NOSWAP (in, 6);
+ b7 = VEC_LOAD_BE_NOSWAP (in, 7);
+ in += 8;
+ l = VEC_LOAD_BE_NOSWAP (ocb_get_l (c, data_nblocks += 8), 0);
+ b0 = VEC_BE_SWAP(b0, bige_const);
+ b1 = VEC_BE_SWAP(b1, bige_const);
+ b2 = VEC_BE_SWAP(b2, bige_const);
+ b3 = VEC_BE_SWAP(b3, bige_const);
+ b4 = VEC_BE_SWAP(b4, bige_const);
+ b5 = VEC_BE_SWAP(b5, bige_const);
+ b6 = VEC_BE_SWAP(b6, bige_const);
+ b7 = VEC_BE_SWAP(b7, bige_const);
+ l = VEC_BE_SWAP(l, bige_const);
+
+ ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;
+
+ iv ^= rkey0;
+
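+ /* Offsets for 8 consecutive blocks as the running XOR of L_{ntz(i)};
+ * with i divisible by 8 the ntz sequence for i+1..i+8 is 0,1,0,2,0,1,0,>=3,
+ * the last term being the value loaded via ocb_get_l above. rkey0 was
+ * pre-folded into iv so the first AddRoundKey is absorbed here. */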
+ iv0 = iv ^ l0;
+ iv1 = iv ^ l0 ^ l1;
+ iv2 = iv ^ l1;
+ iv3 = iv ^ l1 ^ l2;
+ iv4 = iv ^ l1 ^ l2 ^ l0;
+ iv5 = iv ^ l2 ^ l0;
+ iv6 = iv ^ l2;
+ iv7 = iv ^ l2 ^ l;
+
+ b0 ^= iv0;
+ b1 ^= iv1;
+ b2 ^= iv2;
+ b3 ^= iv3;
+ b4 ^= iv4;
+ b5 ^= iv5;
+ b6 ^= iv6;
+ b7 ^= iv7;
+ iv = iv7 ^ rkey0;
+
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey); \
+ b4 = asm_cipher_be (b4, rkey); \
+ b5 = asm_cipher_be (b5, rkey); \
+ b6 = asm_cipher_be (b6, rkey); \
+ b7 = asm_cipher_be (b7, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+
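+ /* rkeylf cancels the rkey0 folded into the offsets and adds the last
+ * round key, so the final vcipherlast applies both the last AddRoundKey
+ * and the OCB post-whitening XOR with Offset_i. */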
+ rkeylf = asm_xor (rkeylast, rkey0);
+
+ DO_ROUND(8);
+
+ iv0 = asm_xor (rkeylf, iv0);
+ iv1 = asm_xor (rkeylf, iv1);
+ iv2 = asm_xor (rkeylf, iv2);
+ iv3 = asm_xor (rkeylf, iv3);
+ iv4 = asm_xor (rkeylf, iv4);
+ iv5 = asm_xor (rkeylf, iv5);
+ iv6 = asm_xor (rkeylf, iv6);
+ iv7 = asm_xor (rkeylf, iv7);
+
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ b0 = asm_cipherlast_be (b0, iv0);
+ b1 = asm_cipherlast_be (b1, iv1);
+ b2 = asm_cipherlast_be (b2, iv2);
+ b3 = asm_cipherlast_be (b3, iv3);
+ b4 = asm_cipherlast_be (b4, iv4);
+ b5 = asm_cipherlast_be (b5, iv5);
+ b6 = asm_cipherlast_be (b6, iv6);
+ b7 = asm_cipherlast_be (b7, iv7);
+
+ b0 = VEC_BE_SWAP (b0, bige_const);
+ b1 = VEC_BE_SWAP (b1, bige_const);
+ b2 = VEC_BE_SWAP (b2, bige_const);
+ b3 = VEC_BE_SWAP (b3, bige_const);
+ b4 = VEC_BE_SWAP (b4, bige_const);
+ b5 = VEC_BE_SWAP (b5, bige_const);
+ b6 = VEC_BE_SWAP (b6, bige_const);
+ b7 = VEC_BE_SWAP (b7, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 0, b0);
+ VEC_STORE_BE_NOSWAP (out, 1, b1);
+ VEC_STORE_BE_NOSWAP (out, 2, b2);
+ VEC_STORE_BE_NOSWAP (out, 3, b3);
+ VEC_STORE_BE_NOSWAP (out, 4, b4);
+ VEC_STORE_BE_NOSWAP (out, 5, b5);
+ VEC_STORE_BE_NOSWAP (out, 6, b6);
+ VEC_STORE_BE_NOSWAP (out, 7, b7);
+ out += 8;
+ }
+
+ if (nblocks >= 4 && (data_nblocks % 4) == 0)
+ {
+ b0 = VEC_LOAD_BE (in, 0, bige_const);
+ b1 = VEC_LOAD_BE (in, 1, bige_const);
+ b2 = VEC_LOAD_BE (in, 2, bige_const);
+ b3 = VEC_LOAD_BE (in, 3, bige_const);
+
+ l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), 0, bige_const);
+
+ ctr ^= b0 ^ b1 ^ b2 ^ b3;
+
+ iv ^= rkey0;
+
+ iv0 = iv ^ l0;
+ iv1 = iv ^ l0 ^ l1;
+ iv2 = iv ^ l1;
+ iv3 = iv ^ l1 ^ l;
+
+ b0 ^= iv0;
+ b1 ^= iv1;
+ b2 ^= iv2;
+ b3 ^= iv3;
+ iv = iv3 ^ rkey0;
+
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+ DO_ROUND(8);
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ rkey = rkeylast ^ rkey0;
+ b0 = asm_cipherlast_be (b0, rkey ^ iv0);
+ b1 = asm_cipherlast_be (b1, rkey ^ iv1);
+ b2 = asm_cipherlast_be (b2, rkey ^ iv2);
+ b3 = asm_cipherlast_be (b3, rkey ^ iv3);
+
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);
+
+ in += 4;
+ out += 4;
+ nblocks -= 4;
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+ b = VEC_LOAD_BE (in, 0, bige_const);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ iv ^= l;
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ ctr ^= b;
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ b ^= iv;
+ AES_ENCRYPT (b, rounds);
+ b ^= iv;
+
+ VEC_STORE_BE (out, 0, b, bige_const);
+
+ in += 1;
+ out += 1;
+ }
+ }
+ else
+ {
+ const u128_t *rk = (u128_t *)&ctx->keyschdec;
+
+ if (!ctx->decryption_prepared)
+ {
+ internal_aes_ppc_prepare_decryption (ctx);
+ ctx->decryption_prepared = 1;
+ }
+
+ PRELOAD_ROUND_KEYS (rounds);
+
+ for (; nblocks >= 8 && data_nblocks % 8; nblocks--)
+ {
+ l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+ b = VEC_LOAD_BE (in, 0, bige_const);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ iv ^= l;
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+ b ^= iv;
+ AES_DECRYPT (b, rounds);
+ b ^= iv;
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ ctr ^= b;
+
+ VEC_STORE_BE (out, 0, b, bige_const);
+
+ in += 1;
+ out += 1;
+ }
+
+ for (; nblocks >= 8; nblocks -= 8)
+ {
+ b0 = VEC_LOAD_BE_NOSWAP (in, 0);
+ b1 = VEC_LOAD_BE_NOSWAP (in, 1);
+ b2 = VEC_LOAD_BE_NOSWAP (in, 2);
+ b3 = VEC_LOAD_BE_NOSWAP (in, 3);
+ b4 = VEC_LOAD_BE_NOSWAP (in, 4);
+ b5 = VEC_LOAD_BE_NOSWAP (in, 5);
+ b6 = VEC_LOAD_BE_NOSWAP (in, 6);
+ b7 = VEC_LOAD_BE_NOSWAP (in, 7);
+ in += 8;
+ l = VEC_LOAD_BE_NOSWAP (ocb_get_l (c, data_nblocks += 8), 0);
+ b0 = VEC_BE_SWAP(b0, bige_const);
+ b1 = VEC_BE_SWAP(b1, bige_const);
+ b2 = VEC_BE_SWAP(b2, bige_const);
+ b3 = VEC_BE_SWAP(b3, bige_const);
+ b4 = VEC_BE_SWAP(b4, bige_const);
+ b5 = VEC_BE_SWAP(b5, bige_const);
+ b6 = VEC_BE_SWAP(b6, bige_const);
+ b7 = VEC_BE_SWAP(b7, bige_const);
+ l = VEC_BE_SWAP(l, bige_const);
+
+ iv ^= rkey0;
+
+ iv0 = iv ^ l0;
+ iv1 = iv ^ l0 ^ l1;
+ iv2 = iv ^ l1;
+ iv3 = iv ^ l1 ^ l2;
+ iv4 = iv ^ l1 ^ l2 ^ l0;
+ iv5 = iv ^ l2 ^ l0;
+ iv6 = iv ^ l2;
+ iv7 = iv ^ l2 ^ l;
+
+ b0 ^= iv0;
+ b1 ^= iv1;
+ b2 ^= iv2;
+ b3 ^= iv3;
+ b4 ^= iv4;
+ b5 ^= iv5;
+ b6 ^= iv6;
+ b7 ^= iv7;
+ iv = iv7 ^ rkey0;
+
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_ncipher_be (b0, rkey); \
+ b1 = asm_ncipher_be (b1, rkey); \
+ b2 = asm_ncipher_be (b2, rkey); \
+ b3 = asm_ncipher_be (b3, rkey); \
+ b4 = asm_ncipher_be (b4, rkey); \
+ b5 = asm_ncipher_be (b5, rkey); \
+ b6 = asm_ncipher_be (b6, rkey); \
+ b7 = asm_ncipher_be (b7, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+
+ rkeylf = asm_xor (rkeylast, rkey0);
+
+ DO_ROUND(8);
+
+ iv0 = asm_xor (rkeylf, iv0);
+ iv1 = asm_xor (rkeylf, iv1);
+ iv2 = asm_xor (rkeylf, iv2);
+ iv3 = asm_xor (rkeylf, iv3);
+ iv4 = asm_xor (rkeylf, iv4);
+ iv5 = asm_xor (rkeylf, iv5);
+ iv6 = asm_xor (rkeylf, iv6);
+ iv7 = asm_xor (rkeylf, iv7);
+
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ b0 = asm_ncipherlast_be (b0, iv0);
+ b1 = asm_ncipherlast_be (b1, iv1);
+ b2 = asm_ncipherlast_be (b2, iv2);
+ b3 = asm_ncipherlast_be (b3, iv3);
+ b4 = asm_ncipherlast_be (b4, iv4);
+ b5 = asm_ncipherlast_be (b5, iv5);
+ b6 = asm_ncipherlast_be (b6, iv6);
+ b7 = asm_ncipherlast_be (b7, iv7);
+
+ ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;
+
+ b0 = VEC_BE_SWAP (b0, bige_const);
+ b1 = VEC_BE_SWAP (b1, bige_const);
+ b2 = VEC_BE_SWAP (b2, bige_const);
+ b3 = VEC_BE_SWAP (b3, bige_const);
+ b4 = VEC_BE_SWAP (b4, bige_const);
+ b5 = VEC_BE_SWAP (b5, bige_const);
+ b6 = VEC_BE_SWAP (b6, bige_const);
+ b7 = VEC_BE_SWAP (b7, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 0, b0);
+ VEC_STORE_BE_NOSWAP (out, 1, b1);
+ VEC_STORE_BE_NOSWAP (out, 2, b2);
+ VEC_STORE_BE_NOSWAP (out, 3, b3);
+ VEC_STORE_BE_NOSWAP (out, 4, b4);
+ VEC_STORE_BE_NOSWAP (out, 5, b5);
+ VEC_STORE_BE_NOSWAP (out, 6, b6);
+ VEC_STORE_BE_NOSWAP (out, 7, b7);
+ out += 8;
+ }
+
+ if (nblocks >= 4 && (data_nblocks % 4) == 0)
+ {
+ b0 = VEC_LOAD_BE (in, 0, bige_const);
+ b1 = VEC_LOAD_BE (in, 1, bige_const);
+ b2 = VEC_LOAD_BE (in, 2, bige_const);
+ b3 = VEC_LOAD_BE (in, 3, bige_const);
+
+ l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), 0, bige_const);
+
+ iv ^= rkey0;
+
+ iv0 = iv ^ l0;
+ iv1 = iv ^ l0 ^ l1;
+ iv2 = iv ^ l1;
+ iv3 = iv ^ l1 ^ l;
+
+ b0 ^= iv0;
+ b1 ^= iv1;
+ b2 ^= iv2;
+ b3 ^= iv3;
+ iv = iv3 ^ rkey0;
+
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_ncipher_be (b0, rkey); \
+ b1 = asm_ncipher_be (b1, rkey); \
+ b2 = asm_ncipher_be (b2, rkey); \
+ b3 = asm_ncipher_be (b3, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+ DO_ROUND(8);
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ rkey = rkeylast ^ rkey0;
+ b0 = asm_ncipherlast_be (b0, rkey ^ iv0);
+ b1 = asm_ncipherlast_be (b1, rkey ^ iv1);
+ b2 = asm_ncipherlast_be (b2, rkey ^ iv2);
+ b3 = asm_ncipherlast_be (b3, rkey ^ iv3);
+
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);
+
+ ctr ^= b0 ^ b1 ^ b2 ^ b3;
+
+ in += 4;
+ out += 4;
+ nblocks -= 4;
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+ b = VEC_LOAD_BE (in, 0, bige_const);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ iv ^= l;
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+ b ^= iv;
+ AES_DECRYPT (b, rounds);
+ b ^= iv;
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ ctr ^= b;
+
+ VEC_STORE_BE (out, 0, b, bige_const);
+
+ in += 1;
+ out += 1;
+ }
+ }
+
+ VEC_STORE_BE (c->u_iv.iv, 0, iv, bige_const);
+ VEC_STORE_BE (c->u_ctr.ctr, 0, ctr, bige_const);
+ c->u_mode.ocb.data_nblocks = data_nblocks;
+
+ return 0;
+}
+
+size_t OCB_AUTH_FUNC (gcry_cipher_hd_t c, void *abuf_arg, size_t nblocks)
+{
+ const block bige_const = asm_load_be_const();
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ const u128_t *rk = (u128_t *)&ctx->keyschenc;
+ const u128_t *abuf = (const u128_t *)abuf_arg;
+ int rounds = ctx->rounds;
+ u64 data_nblocks = c->u_mode.ocb.aad_nblocks;
+ block l0, l1, l2, l;
+ block b0, b1, b2, b3, b4, b5, b6, b7, b;
+ block iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7;
+ block rkey, frkey;
+ block ctr, iv;
+ ROUND_KEY_VARIABLES;
+
+ iv = VEC_LOAD_BE (c->u_mode.ocb.aad_offset, 0, bige_const);
+ ctr = VEC_LOAD_BE (c->u_mode.ocb.aad_sum, 0, bige_const);
+
+ l0 = VEC_LOAD_BE (c->u_mode.ocb.L[0], 0, bige_const);
+ l1 = VEC_LOAD_BE (c->u_mode.ocb.L[1], 0, bige_const);
+ l2 = VEC_LOAD_BE (c->u_mode.ocb.L[2], 0, bige_const);
+
+ PRELOAD_ROUND_KEYS (rounds);
+
+ for (; nblocks >= 8 && data_nblocks % 8; nblocks--)
+ {
+ l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+ b = VEC_LOAD_BE (abuf, 0, bige_const);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ iv ^= l;
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ b ^= iv;
+ AES_ENCRYPT (b, rounds);
+ ctr ^= b;
+
+ abuf += 1;
+ }
+
+ for (; nblocks >= 8; nblocks -= 8)
+ {
+ b0 = VEC_LOAD_BE (abuf, 0, bige_const);
+ b1 = VEC_LOAD_BE (abuf, 1, bige_const);
+ b2 = VEC_LOAD_BE (abuf, 2, bige_const);
+ b3 = VEC_LOAD_BE (abuf, 3, bige_const);
+ b4 = VEC_LOAD_BE (abuf, 4, bige_const);
+ b5 = VEC_LOAD_BE (abuf, 5, bige_const);
+ b6 = VEC_LOAD_BE (abuf, 6, bige_const);
+ b7 = VEC_LOAD_BE (abuf, 7, bige_const);
+
+ l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 8), 0, bige_const);
+
+ frkey = rkey0;
+ iv ^= frkey;
+
+ iv0 = iv ^ l0;
+ iv1 = iv ^ l0 ^ l1;
+ iv2 = iv ^ l1;
+ iv3 = iv ^ l1 ^ l2;
+ iv4 = iv ^ l1 ^ l2 ^ l0;
+ iv5 = iv ^ l2 ^ l0;
+ iv6 = iv ^ l2;
+ iv7 = iv ^ l2 ^ l;
+
+ b0 ^= iv0;
+ b1 ^= iv1;
+ b2 ^= iv2;
+ b3 ^= iv3;
+ b4 ^= iv4;
+ b5 ^= iv5;
+ b6 ^= iv6;
+ b7 ^= iv7;
+ iv = iv7 ^ frkey;
+
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey); \
+ b4 = asm_cipher_be (b4, rkey); \
+ b5 = asm_cipher_be (b5, rkey); \
+ b6 = asm_cipher_be (b6, rkey); \
+ b7 = asm_cipher_be (b7, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+ DO_ROUND(8);
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ rkey = rkeylast;
+ b0 = asm_cipherlast_be (b0, rkey);
+ b1 = asm_cipherlast_be (b1, rkey);
+ b2 = asm_cipherlast_be (b2, rkey);
+ b3 = asm_cipherlast_be (b3, rkey);
+ b4 = asm_cipherlast_be (b4, rkey);
+ b5 = asm_cipherlast_be (b5, rkey);
+ b6 = asm_cipherlast_be (b6, rkey);
+ b7 = asm_cipherlast_be (b7, rkey);
+
+ ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;
+
+ abuf += 8;
+ }
+
+ if (nblocks >= 4 && (data_nblocks % 4) == 0)
+ {
+ b0 = VEC_LOAD_BE (abuf, 0, bige_const);
+ b1 = VEC_LOAD_BE (abuf, 1, bige_const);
+ b2 = VEC_LOAD_BE (abuf, 2, bige_const);
+ b3 = VEC_LOAD_BE (abuf, 3, bige_const);
+
+ l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), 0, bige_const);
+
+ frkey = rkey0;
+ iv ^= frkey;
+
+ iv0 = iv ^ l0;
+ iv1 = iv ^ l0 ^ l1;
+ iv2 = iv ^ l1;
+ iv3 = iv ^ l1 ^ l;
+
+ b0 ^= iv0;
+ b1 ^= iv1;
+ b2 ^= iv2;
+ b3 ^= iv3;
+ iv = iv3 ^ frkey;
+
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+ DO_ROUND(8);
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ rkey = rkeylast;
+ b0 = asm_cipherlast_be (b0, rkey);
+ b1 = asm_cipherlast_be (b1, rkey);
+ b2 = asm_cipherlast_be (b2, rkey);
+ b3 = asm_cipherlast_be (b3, rkey);
+
+ ctr ^= b0 ^ b1 ^ b2 ^ b3;
+
+ abuf += 4;
+ nblocks -= 4;
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+ b = VEC_LOAD_BE (abuf, 0, bige_const);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ iv ^= l;
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ b ^= iv;
+ AES_ENCRYPT (b, rounds);
+ ctr ^= b;
+
+ abuf += 1;
+ }
+
+ VEC_STORE_BE (c->u_mode.ocb.aad_offset, 0, iv, bige_const);
+ VEC_STORE_BE (c->u_mode.ocb.aad_sum, 0, ctr, bige_const);
+ c->u_mode.ocb.aad_nblocks = data_nblocks;
+
+ return 0;
+}
+
+
+void XTS_CRYPT_FUNC (void *context, unsigned char *tweak_arg,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt)
+{
+#ifdef WORDS_BIGENDIAN
+ static const block vec_bswap128_const =
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+#else
+ static const block vec_bswap128_const =
+ { ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8, ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0 };
+#endif
+ static const unsigned char vec_tweak_const[16] =
+ { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0x87 };
+ static const vector unsigned long long vec_shift63_const =
+ { 63, 63 };
+ const block bige_const = asm_load_be_const();
+ RIJNDAEL_context *ctx = context;
+ const u128_t *in = (const u128_t *)inbuf_arg;
+ u128_t *out = (u128_t *)outbuf_arg;
+ int rounds = ctx->rounds;
+ block tweak;
+ block b0, b1, b2, b3, b4, b5, b6, b7, b, rkey, rkeylf;
+ block tweak0, tweak1, tweak2, tweak3, tweak4, tweak5, tweak6, tweak7;
+ block tweak_const, bswap128_const, shift63_const;
+ ROUND_KEY_VARIABLES;
+
+ tweak_const = VEC_LOAD_BE (&vec_tweak_const, 0, bige_const);
+ bswap128_const = ALIGNED_LOAD (&vec_bswap128_const, 0);
+ shift63_const = ALIGNED_LOAD (&vec_shift63_const, 0);
+
+ tweak = VEC_LOAD_BE (tweak_arg, 0, bige_const);
+ tweak = asm_vperm1 (tweak, bswap128_const);
+
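+/* GEN_TWEAK doubles the tweak in GF(2^128) (multiply by x): each 64-bit half
+ * is doubled with vaddudm, the carry between halves is recovered from the
+ * swapped halves' sign bits, and 0x87 is XORed in when the top bit overflows,
+ * matching the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1. */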
+#define GEN_TWEAK(tout, tin) /* Generate next tweak. */ \
+ do { \
+ block tmp1, tmp2; \
+ tmp1 = asm_swap_uint64_halfs(tin); \
+ tmp2 = asm_add_uint64(tin, tin); \
+ tmp1 = asm_sra_int64(tmp1, shift63_const) & tweak_const; \
+ tout = asm_xor(tmp1, tmp2); \
+ } while (0)
+
+ if (encrypt)
+ {
+ const u128_t *rk = (u128_t *)&ctx->keyschenc;
+
+ PRELOAD_ROUND_KEYS (rounds);
+
+ for (; nblocks >= 8; nblocks -= 8)
+ {
+ b0 = VEC_LOAD_BE_NOSWAP (in, 0);
+ b1 = VEC_LOAD_BE_NOSWAP (in, 1);
+ b2 = VEC_LOAD_BE_NOSWAP (in, 2);
+ b3 = VEC_LOAD_BE_NOSWAP (in, 3);
+ tweak0 = tweak;
+ GEN_TWEAK (tweak1, tweak0);
+ tweak0 = asm_vperm1 (tweak0, bswap128_const);
+ b4 = VEC_LOAD_BE_NOSWAP (in, 4);
+ b5 = VEC_LOAD_BE_NOSWAP (in, 5);
+ GEN_TWEAK (tweak2, tweak1);
+ tweak1 = asm_vperm1 (tweak1, bswap128_const);
+ b6 = VEC_LOAD_BE_NOSWAP (in, 6);
+ b7 = VEC_LOAD_BE_NOSWAP (in, 7);
+ in += 8;
+
+ b0 = VEC_BE_SWAP(b0, bige_const);
+ b1 = VEC_BE_SWAP(b1, bige_const);
+ GEN_TWEAK (tweak3, tweak2);
+ tweak2 = asm_vperm1 (tweak2, bswap128_const);
+ GEN_TWEAK (tweak4, tweak3);
+ tweak3 = asm_vperm1 (tweak3, bswap128_const);
+ b2 = VEC_BE_SWAP(b2, bige_const);
+ b3 = VEC_BE_SWAP(b3, bige_const);
+ GEN_TWEAK (tweak5, tweak4);
+ tweak4 = asm_vperm1 (tweak4, bswap128_const);
+ GEN_TWEAK (tweak6, tweak5);
+ tweak5 = asm_vperm1 (tweak5, bswap128_const);
+ b4 = VEC_BE_SWAP(b4, bige_const);
+ b5 = VEC_BE_SWAP(b5, bige_const);
+ GEN_TWEAK (tweak7, tweak6);
+ tweak6 = asm_vperm1 (tweak6, bswap128_const);
+ GEN_TWEAK (tweak, tweak7);
+ tweak7 = asm_vperm1 (tweak7, bswap128_const);
+ b6 = VEC_BE_SWAP(b6, bige_const);
+ b7 = VEC_BE_SWAP(b7, bige_const);
+
+ tweak0 = asm_xor (tweak0, rkey0);
+ tweak1 = asm_xor (tweak1, rkey0);
+ tweak2 = asm_xor (tweak2, rkey0);
+ tweak3 = asm_xor (tweak3, rkey0);
+ tweak4 = asm_xor (tweak4, rkey0);
+ tweak5 = asm_xor (tweak5, rkey0);
+ tweak6 = asm_xor (tweak6, rkey0);
+ tweak7 = asm_xor (tweak7, rkey0);
+
+ b0 = asm_xor (b0, tweak0);
+ b1 = asm_xor (b1, tweak1);
+ b2 = asm_xor (b2, tweak2);
+ b3 = asm_xor (b3, tweak3);
+ b4 = asm_xor (b4, tweak4);
+ b5 = asm_xor (b5, tweak5);
+ b6 = asm_xor (b6, tweak6);
+ b7 = asm_xor (b7, tweak7);
+
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey); \
+ b4 = asm_cipher_be (b4, rkey); \
+ b5 = asm_cipher_be (b5, rkey); \
+ b6 = asm_cipher_be (b6, rkey); \
+ b7 = asm_cipher_be (b7, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+
+ rkeylf = asm_xor (rkeylast, rkey0);
+
+ DO_ROUND(8);
+
+ tweak0 = asm_xor (tweak0, rkeylf);
+ tweak1 = asm_xor (tweak1, rkeylf);
+ tweak2 = asm_xor (tweak2, rkeylf);
+ tweak3 = asm_xor (tweak3, rkeylf);
+ tweak4 = asm_xor (tweak4, rkeylf);
+ tweak5 = asm_xor (tweak5, rkeylf);
+ tweak6 = asm_xor (tweak6, rkeylf);
+ tweak7 = asm_xor (tweak7, rkeylf);
+
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ b0 = asm_cipherlast_be (b0, tweak0);
+ b1 = asm_cipherlast_be (b1, tweak1);
+ b2 = asm_cipherlast_be (b2, tweak2);
+ b3 = asm_cipherlast_be (b3, tweak3);
+ b0 = VEC_BE_SWAP (b0, bige_const);
+ b1 = VEC_BE_SWAP (b1, bige_const);
+ b4 = asm_cipherlast_be (b4, tweak4);
+ b5 = asm_cipherlast_be (b5, tweak5);
+ b2 = VEC_BE_SWAP (b2, bige_const);
+ b3 = VEC_BE_SWAP (b3, bige_const);
+ b6 = asm_cipherlast_be (b6, tweak6);
+ b7 = asm_cipherlast_be (b7, tweak7);
+ VEC_STORE_BE_NOSWAP (out, 0, b0);
+ VEC_STORE_BE_NOSWAP (out, 1, b1);
+ b4 = VEC_BE_SWAP (b4, bige_const);
+ b5 = VEC_BE_SWAP (b5, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 2, b2);
+ VEC_STORE_BE_NOSWAP (out, 3, b3);
+ b6 = VEC_BE_SWAP (b6, bige_const);
+ b7 = VEC_BE_SWAP (b7, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 4, b4);
+ VEC_STORE_BE_NOSWAP (out, 5, b5);
+ VEC_STORE_BE_NOSWAP (out, 6, b6);
+ VEC_STORE_BE_NOSWAP (out, 7, b7);
+ out += 8;
+ }
+
+ if (nblocks >= 4)
+ {
+ tweak0 = tweak;
+ GEN_TWEAK (tweak1, tweak0);
+ GEN_TWEAK (tweak2, tweak1);
+ GEN_TWEAK (tweak3, tweak2);
+ GEN_TWEAK (tweak, tweak3);
+
+ b0 = VEC_LOAD_BE (in, 0, bige_const);
+ b1 = VEC_LOAD_BE (in, 1, bige_const);
+ b2 = VEC_LOAD_BE (in, 2, bige_const);
+ b3 = VEC_LOAD_BE (in, 3, bige_const);
+
+ tweak0 = asm_vperm1 (tweak0, bswap128_const);
+ tweak1 = asm_vperm1 (tweak1, bswap128_const);
+ tweak2 = asm_vperm1 (tweak2, bswap128_const);
+ tweak3 = asm_vperm1 (tweak3, bswap128_const);
+
+ b0 ^= tweak0 ^ rkey0;
+ b1 ^= tweak1 ^ rkey0;
+ b2 ^= tweak2 ^ rkey0;
+ b3 ^= tweak3 ^ rkey0;
+
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+ DO_ROUND(8);
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ rkey = rkeylast;
+ b0 = asm_cipherlast_be (b0, rkey ^ tweak0);
+ b1 = asm_cipherlast_be (b1, rkey ^ tweak1);
+ b2 = asm_cipherlast_be (b2, rkey ^ tweak2);
+ b3 = asm_cipherlast_be (b3, rkey ^ tweak3);
+
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);
+
+ in += 4;
+ out += 4;
+ nblocks -= 4;
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ tweak0 = asm_vperm1 (tweak, bswap128_const);
+
+ /* Xor-Encrypt/Decrypt-Xor block. */
+ b = VEC_LOAD_BE (in, 0, bige_const) ^ tweak0;
+
+ /* Generate next tweak. */
+ GEN_TWEAK (tweak, tweak);
+
+ AES_ENCRYPT (b, rounds);
+
+ b ^= tweak0;
+ VEC_STORE_BE (out, 0, b, bige_const);
+
+ in++;
+ out++;
+ }
+ }
+ else
+ {
+ const u128_t *rk = (u128_t *)&ctx->keyschdec;
+
+ if (!ctx->decryption_prepared)
+ {
+ internal_aes_ppc_prepare_decryption (ctx);
+ ctx->decryption_prepared = 1;
+ }
+
+ PRELOAD_ROUND_KEYS (rounds);
+
+ for (; nblocks >= 8; nblocks -= 8)
+ {
+ b0 = VEC_LOAD_BE_NOSWAP (in, 0);
+ b1 = VEC_LOAD_BE_NOSWAP (in, 1);
+ b2 = VEC_LOAD_BE_NOSWAP (in, 2);
+ b3 = VEC_LOAD_BE_NOSWAP (in, 3);
+ tweak0 = tweak;
+ GEN_TWEAK (tweak1, tweak0);
+ tweak0 = asm_vperm1 (tweak0, bswap128_const);
+ b4 = VEC_LOAD_BE_NOSWAP (in, 4);
+ b5 = VEC_LOAD_BE_NOSWAP (in, 5);
+ GEN_TWEAK (tweak2, tweak1);
+ tweak1 = asm_vperm1 (tweak1, bswap128_const);
+ b6 = VEC_LOAD_BE_NOSWAP (in, 6);
+ b7 = VEC_LOAD_BE_NOSWAP (in, 7);
+ in += 8;
+
+ b0 = VEC_BE_SWAP(b0, bige_const);
+ b1 = VEC_BE_SWAP(b1, bige_const);
+ GEN_TWEAK (tweak3, tweak2);
+ tweak2 = asm_vperm1 (tweak2, bswap128_const);
+ GEN_TWEAK (tweak4, tweak3);
+ tweak3 = asm_vperm1 (tweak3, bswap128_const);
+ b2 = VEC_BE_SWAP(b2, bige_const);
+ b3 = VEC_BE_SWAP(b3, bige_const);
+ GEN_TWEAK (tweak5, tweak4);
+ tweak4 = asm_vperm1 (tweak4, bswap128_const);
+ GEN_TWEAK (tweak6, tweak5);
+ tweak5 = asm_vperm1 (tweak5, bswap128_const);
+ b4 = VEC_BE_SWAP(b4, bige_const);
+ b5 = VEC_BE_SWAP(b5, bige_const);
+ GEN_TWEAK (tweak7, tweak6);
+ tweak6 = asm_vperm1 (tweak6, bswap128_const);
+ GEN_TWEAK (tweak, tweak7);
+ tweak7 = asm_vperm1 (tweak7, bswap128_const);
+ b6 = VEC_BE_SWAP(b6, bige_const);
+ b7 = VEC_BE_SWAP(b7, bige_const);
+
+ tweak0 = asm_xor (tweak0, rkey0);
+ tweak1 = asm_xor (tweak1, rkey0);
+ tweak2 = asm_xor (tweak2, rkey0);
+ tweak3 = asm_xor (tweak3, rkey0);
+ tweak4 = asm_xor (tweak4, rkey0);
+ tweak5 = asm_xor (tweak5, rkey0);
+ tweak6 = asm_xor (tweak6, rkey0);
+ tweak7 = asm_xor (tweak7, rkey0);
+
+ b0 = asm_xor (b0, tweak0);
+ b1 = asm_xor (b1, tweak1);
+ b2 = asm_xor (b2, tweak2);
+ b3 = asm_xor (b3, tweak3);
+ b4 = asm_xor (b4, tweak4);
+ b5 = asm_xor (b5, tweak5);
+ b6 = asm_xor (b6, tweak6);
+ b7 = asm_xor (b7, tweak7);
+
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_ncipher_be (b0, rkey); \
+ b1 = asm_ncipher_be (b1, rkey); \
+ b2 = asm_ncipher_be (b2, rkey); \
+ b3 = asm_ncipher_be (b3, rkey); \
+ b4 = asm_ncipher_be (b4, rkey); \
+ b5 = asm_ncipher_be (b5, rkey); \
+ b6 = asm_ncipher_be (b6, rkey); \
+ b7 = asm_ncipher_be (b7, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+
+ rkeylf = asm_xor (rkeylast, rkey0);
+
+ DO_ROUND(8);
+
+ tweak0 = asm_xor (tweak0, rkeylf);
+ tweak1 = asm_xor (tweak1, rkeylf);
+ tweak2 = asm_xor (tweak2, rkeylf);
+ tweak3 = asm_xor (tweak3, rkeylf);
+ tweak4 = asm_xor (tweak4, rkeylf);
+ tweak5 = asm_xor (tweak5, rkeylf);
+ tweak6 = asm_xor (tweak6, rkeylf);
+ tweak7 = asm_xor (tweak7, rkeylf);
+
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ b0 = asm_ncipherlast_be (b0, tweak0);
+ b1 = asm_ncipherlast_be (b1, tweak1);
+ b2 = asm_ncipherlast_be (b2, tweak2);
+ b3 = asm_ncipherlast_be (b3, tweak3);
+ b0 = VEC_BE_SWAP (b0, bige_const);
+ b1 = VEC_BE_SWAP (b1, bige_const);
+ b4 = asm_ncipherlast_be (b4, tweak4);
+ b5 = asm_ncipherlast_be (b5, tweak5);
+ b2 = VEC_BE_SWAP (b2, bige_const);
+ b3 = VEC_BE_SWAP (b3, bige_const);
+ b6 = asm_ncipherlast_be (b6, tweak6);
+ b7 = asm_ncipherlast_be (b7, tweak7);
+ VEC_STORE_BE_NOSWAP (out, 0, b0);
+ VEC_STORE_BE_NOSWAP (out, 1, b1);
+ b4 = VEC_BE_SWAP (b4, bige_const);
+ b5 = VEC_BE_SWAP (b5, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 2, b2);
+ VEC_STORE_BE_NOSWAP (out, 3, b3);
+ b6 = VEC_BE_SWAP (b6, bige_const);
+ b7 = VEC_BE_SWAP (b7, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 4, b4);
+ VEC_STORE_BE_NOSWAP (out, 5, b5);
+ VEC_STORE_BE_NOSWAP (out, 6, b6);
+ VEC_STORE_BE_NOSWAP (out, 7, b7);
+ out += 8;
+ }
+
+ if (nblocks >= 4)
+ {
+ tweak0 = tweak;
+ GEN_TWEAK (tweak1, tweak0);
+ GEN_TWEAK (tweak2, tweak1);
+ GEN_TWEAK (tweak3, tweak2);
+ GEN_TWEAK (tweak, tweak3);
+
+ b0 = VEC_LOAD_BE (in, 0, bige_const);
+ b1 = VEC_LOAD_BE (in, 1, bige_const);
+ b2 = VEC_LOAD_BE (in, 2, bige_const);
+ b3 = VEC_LOAD_BE (in, 3, bige_const);
+
+ tweak0 = asm_vperm1 (tweak0, bswap128_const);
+ tweak1 = asm_vperm1 (tweak1, bswap128_const);
+ tweak2 = asm_vperm1 (tweak2, bswap128_const);
+ tweak3 = asm_vperm1 (tweak3, bswap128_const);
+
+ b0 ^= tweak0 ^ rkey0;
+ b1 ^= tweak1 ^ rkey0;
+ b2 ^= tweak2 ^ rkey0;
+ b3 ^= tweak3 ^ rkey0;
+
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_ncipher_be (b0, rkey); \
+ b1 = asm_ncipher_be (b1, rkey); \
+ b2 = asm_ncipher_be (b2, rkey); \
+ b3 = asm_ncipher_be (b3, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+ DO_ROUND(8);
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ rkey = rkeylast;
+ b0 = asm_ncipherlast_be (b0, rkey ^ tweak0);
+ b1 = asm_ncipherlast_be (b1, rkey ^ tweak1);
+ b2 = asm_ncipherlast_be (b2, rkey ^ tweak2);
+ b3 = asm_ncipherlast_be (b3, rkey ^ tweak3);
+
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);
+
+ in += 4;
+ out += 4;
+ nblocks -= 4;
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ tweak0 = asm_vperm1 (tweak, bswap128_const);
+
+ /* Xor-Encrypt/Decrypt-Xor block. */
+ b = VEC_LOAD_BE (in, 0, bige_const) ^ tweak0;
+
+ /* Generate next tweak. */
+ GEN_TWEAK (tweak, tweak);
+
+ AES_DECRYPT (b, rounds);
+
+ b ^= tweak0;
+ VEC_STORE_BE (out, 0, b, bige_const);
+
+ in++;
+ out++;
+ }
+ }
+
+ tweak = asm_vperm1 (tweak, bswap128_const);
+ VEC_STORE_BE (tweak_arg, 0, tweak, bige_const);
+
+#undef GEN_TWEAK
+}
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-ppc.c b/comm/third_party/libgcrypt/cipher/rijndael-ppc.c
new file mode 100644
index 0000000000..f5c3236111
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael-ppc.c
@@ -0,0 +1,259 @@
+/* Rijndael (AES) for GnuPG - PowerPC Vector Crypto AES implementation
+ * Copyright (C) 2019 Shawn Landden <shawn@git.icu>
+ * Copyright (C) 2019-2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Alternatively, this code may be used in OpenSSL from The OpenSSL Project,
+ * and Cryptogams by Andy Polyakov, and if made part of a release of either
+ * or both projects, is thereafter dual-licensed under the license said project
+ * is released under.
+ */
+
+#include <config.h>
+
+#include "rijndael-internal.h"
+#include "cipher-internal.h"
+#include "bufhelp.h"
+
+#ifdef USE_PPC_CRYPTO
+
+#include "rijndael-ppc-common.h"
+
+
+#ifdef WORDS_BIGENDIAN
+static const block vec_bswap32_const =
+ { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+#else
+static const block vec_bswap32_const_neg =
+ { ~3, ~2, ~1, ~0, ~7, ~6, ~5, ~4, ~11, ~10, ~9, ~8, ~15, ~14, ~13, ~12 };
+#endif
+
+
+static ASM_FUNC_ATTR_INLINE block
+asm_load_be_const(void)
+{
+#ifndef WORDS_BIGENDIAN
+ return ALIGNED_LOAD (&vec_bswap32_const_neg, 0);
+#else
+ static const block vec_dummy = { 0 };
+ return vec_dummy;
+#endif
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_be_swap(block vec, block be_bswap_const)
+{
+ (void)be_bswap_const;
+#ifndef WORDS_BIGENDIAN
+ return asm_vperm1 (vec, be_bswap_const);
+#else
+ return vec;
+#endif
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_load_be_noswap(unsigned long offset, const void *ptr)
+{
+ block vec;
+#if __GNUC__ >= 4
+ if (__builtin_constant_p (offset) && offset == 0)
+ __asm__ volatile ("lxvw4x %x0,0,%1\n\t"
+ : "=wa" (vec)
+ : "r" ((uintptr_t)ptr)
+ : "memory");
+ else
+#endif
+ __asm__ volatile ("lxvw4x %x0,%1,%2\n\t"
+ : "=wa" (vec)
+ : "r" (offset), "r" ((uintptr_t)ptr)
+ : "memory", "r0");
+ /* NOTE: vec needs to be be-swapped using 'asm_be_swap' by caller */
+ return vec;
+}
+
+static ASM_FUNC_ATTR_INLINE void
+asm_store_be_noswap(block vec, unsigned long offset, void *ptr)
+{
+  /* NOTE: vec must already be be-swapped by the caller using 'asm_be_swap'. */
+#if __GNUC__ >= 4
+ if (__builtin_constant_p (offset) && offset == 0)
+ __asm__ volatile ("stxvw4x %x0,0,%1\n\t"
+ :
+ : "wa" (vec), "r" ((uintptr_t)ptr)
+ : "memory");
+ else
+#endif
+ __asm__ volatile ("stxvw4x %x0,%1,%2\n\t"
+ :
+ : "wa" (vec), "r" (offset), "r" ((uintptr_t)ptr)
+ : "memory", "r0");
+}
+
+
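+/* Run the AES SubBytes transformation (vec_sbox_be/vsbox) over a vector
+ * and return the transformed first 32-bit word; used by the key schedule
+ * expansion below. */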
+static ASM_FUNC_ATTR_INLINE u32
+_gcry_aes_sbox4_ppc8(u32 fourbytes)
+{
+ union
+ {
+ PROPERLY_ALIGNED_TYPE dummy;
+ block data_vec;
+ u32 data32[4];
+ } u;
+
+ u.data32[0] = fourbytes;
+ u.data_vec = vec_sbox_be(u.data_vec);
+ return u.data32[0];
+}
+
+void
+_gcry_aes_ppc8_setkey (RIJNDAEL_context *ctx, const byte *key)
+{
+ const block bige_const = asm_load_be_const();
+ union
+ {
+ PROPERLY_ALIGNED_TYPE dummy;
+ byte data[MAXKC][4];
+ u32 data32[MAXKC];
+ } tkk[2];
+ unsigned int rounds = ctx->rounds;
+ int KC = rounds - 6;
+ unsigned int keylen = KC * 4;
+ u128_t *ekey = (u128_t *)(void *)ctx->keyschenc;
+ unsigned int i, r, t;
+ byte rcon = 1;
+ int j;
+#define k tkk[0].data
+#define k_u32 tkk[0].data32
+#define tk tkk[1].data
+#define tk_u32 tkk[1].data32
+#define W (ctx->keyschenc)
+#define W_u32 (ctx->keyschenc32)
+
+ for (i = 0; i < keylen; i++)
+ {
+ k[i >> 2][i & 3] = key[i];
+ }
+
+ for (j = KC-1; j >= 0; j--)
+ {
+ tk_u32[j] = k_u32[j];
+ }
+ r = 0;
+ t = 0;
+ /* Copy values into round key array. */
+ for (j = 0; (j < KC) && (r < rounds + 1); )
+ {
+ for (; (j < KC) && (t < 4); j++, t++)
+ {
+ W_u32[r][t] = le_bswap32(tk_u32[j]);
+ }
+ if (t == 4)
+ {
+ r++;
+ t = 0;
+ }
+ }
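+  /* Standard AES key expansion: each pass derives KC more key words
+   * (rotate/SubWord/rcon for the first word, plus the extra SubWord step
+   * for 256-bit keys) and copies completed 4-word round keys into W. */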
+ while (r < rounds + 1)
+ {
+ tk_u32[0] ^=
+ le_bswap32(
+ _gcry_aes_sbox4_ppc8(rol(le_bswap32(tk_u32[KC - 1]), 24)) ^ rcon);
+
+ if (KC != 8)
+ {
+ for (j = 1; j < KC; j++)
+ {
+ tk_u32[j] ^= tk_u32[j-1];
+ }
+ }
+ else
+ {
+ for (j = 1; j < KC/2; j++)
+ {
+ tk_u32[j] ^= tk_u32[j-1];
+ }
+
+ tk_u32[KC/2] ^=
+ le_bswap32(_gcry_aes_sbox4_ppc8(le_bswap32(tk_u32[KC/2 - 1])));
+
+ for (j = KC/2 + 1; j < KC; j++)
+ {
+ tk_u32[j] ^= tk_u32[j-1];
+ }
+ }
+
+ /* Copy values into round key array. */
+ for (j = 0; (j < KC) && (r < rounds + 1); )
+ {
+ for (; (j < KC) && (t < 4); j++, t++)
+ {
+ W_u32[r][t] = le_bswap32(tk_u32[j]);
+ }
+ if (t == 4)
+ {
+ r++;
+ t = 0;
+ }
+ }
+
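+      /* Advance rcon: multiply by x in GF(2^8) (xtime). */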
+ rcon = (rcon << 1) ^ (-(rcon >> 7) & 0x1b);
+ }
+
+ /* Store in big-endian order. */
+ for (r = 0; r <= rounds; r++)
+ {
+#ifndef WORDS_BIGENDIAN
+ VEC_STORE_BE(ekey, r, ALIGNED_LOAD (ekey, r), bige_const);
+#else
+ block rvec = ALIGNED_LOAD (ekey, r);
+ ALIGNED_STORE (ekey, r,
+ vec_perm(rvec, rvec, vec_bswap32_const));
+ (void)bige_const;
+#endif
+ }
+
+#undef W
+#undef tk
+#undef k
+#undef W_u32
+#undef tk_u32
+#undef k_u32
+ wipememory(&tkk, sizeof(tkk));
+}
+
+void
+_gcry_aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx)
+{
+ internal_aes_ppc_prepare_decryption (ctx);
+}
+
+
+#define GCRY_AES_PPC8 1
+#define ENCRYPT_BLOCK_FUNC _gcry_aes_ppc8_encrypt
+#define DECRYPT_BLOCK_FUNC _gcry_aes_ppc8_decrypt
+#define CFB_ENC_FUNC _gcry_aes_ppc8_cfb_enc
+#define CFB_DEC_FUNC _gcry_aes_ppc8_cfb_dec
+#define CBC_ENC_FUNC _gcry_aes_ppc8_cbc_enc
+#define CBC_DEC_FUNC _gcry_aes_ppc8_cbc_dec
+#define CTR_ENC_FUNC _gcry_aes_ppc8_ctr_enc
+#define OCB_CRYPT_FUNC _gcry_aes_ppc8_ocb_crypt
+#define OCB_AUTH_FUNC _gcry_aes_ppc8_ocb_auth
+#define XTS_CRYPT_FUNC _gcry_aes_ppc8_xts_crypt
+
+#include <rijndael-ppc-functions.h>
+
+#endif /* USE_PPC_CRYPTO */
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-ppc9le.c b/comm/third_party/libgcrypt/cipher/rijndael-ppc9le.c
new file mode 100644
index 0000000000..facdedd4f2
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael-ppc9le.c
@@ -0,0 +1,102 @@
+/* Rijndael (AES) for GnuPG - PowerPC Vector Crypto AES implementation
+ * Copyright (C) 2019 Shawn Landden <shawn@git.icu>
+ * Copyright (C) 2019-2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Alternatively, this code may be used in OpenSSL from The OpenSSL Project,
+ * and Cryptogams by Andy Polyakov, and if made part of a release of either
+ * or both projects, is thereafter dual-licensed under the license said project
+ * is released under.
+ */
+
+#include <config.h>
+
+#include "rijndael-internal.h"
+#include "cipher-internal.h"
+#include "bufhelp.h"
+
+#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
+
+#include "rijndael-ppc-common.h"
+
+
+static ASM_FUNC_ATTR_INLINE block
+asm_load_be_const(void)
+{
+ static const block vec_dummy = { 0 };
+ return vec_dummy;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_be_swap(block vec, block be_bswap_const)
+{
+ (void)be_bswap_const;
+ return vec;
+}
+
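+/* On POWER9 little-endian, lxvb16x/stxvb16x access the 16 bytes in
+ * big-endian element order directly, so no separate byte swap is needed
+ * (asm_be_swap above is therefore a no-op). */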
+static ASM_FUNC_ATTR_INLINE block
+asm_load_be_noswap(unsigned long offset, const void *ptr)
+{
+ block vec;
+#if __GNUC__ >= 4
+ if (__builtin_constant_p (offset) && offset == 0)
+ __asm__ volatile ("lxvb16x %x0,0,%1\n\t"
+ : "=wa" (vec)
+ : "r" ((uintptr_t)ptr)
+ : "memory");
+ else
+#endif
+ __asm__ volatile ("lxvb16x %x0,%1,%2\n\t"
+ : "=wa" (vec)
+ : "r" (offset), "r" ((uintptr_t)ptr)
+ : "memory", "r0");
+ return vec;
+}
+
+static ASM_FUNC_ATTR_INLINE void
+asm_store_be_noswap(block vec, unsigned long offset, void *ptr)
+{
+#if __GNUC__ >= 4
+ if (__builtin_constant_p (offset) && offset == 0)
+ __asm__ volatile ("stxvb16x %x0,0,%1\n\t"
+ :
+ : "wa" (vec), "r" ((uintptr_t)ptr)
+ : "memory");
+ else
+#endif
+ __asm__ volatile ("stxvb16x %x0,%1,%2\n\t"
+ :
+ : "wa" (vec), "r" (offset), "r" ((uintptr_t)ptr)
+ : "memory", "r0");
+}
+
+
+#define GCRY_AES_PPC9LE 1
+#define ENCRYPT_BLOCK_FUNC _gcry_aes_ppc9le_encrypt
+#define DECRYPT_BLOCK_FUNC _gcry_aes_ppc9le_decrypt
+#define CFB_ENC_FUNC _gcry_aes_ppc9le_cfb_enc
+#define CFB_DEC_FUNC _gcry_aes_ppc9le_cfb_dec
+#define CBC_ENC_FUNC _gcry_aes_ppc9le_cbc_enc
+#define CBC_DEC_FUNC _gcry_aes_ppc9le_cbc_dec
+#define CTR_ENC_FUNC _gcry_aes_ppc9le_ctr_enc
+#define OCB_CRYPT_FUNC _gcry_aes_ppc9le_ocb_crypt
+#define OCB_AUTH_FUNC _gcry_aes_ppc9le_ocb_auth
+#define XTS_CRYPT_FUNC _gcry_aes_ppc9le_xts_crypt
+
+#include <rijndael-ppc-functions.h>
+
+#endif /* USE_PPC_CRYPTO_WITH_PPC9LE */
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-s390x.c b/comm/third_party/libgcrypt/cipher/rijndael-s390x.c
new file mode 100644
index 0000000000..aea65c5a3d
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael-s390x.c
@@ -0,0 +1,1155 @@
+/* Rijndael (AES) for GnuPG - s390x/zSeries AES implementation
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#include "rijndael-internal.h"
+#include "cipher-internal.h"
+#include "bufhelp.h"
+
+#ifdef USE_S390X_CRYPTO
+
+#include "asm-inline-s390x.h"
+
+#define NO_INLINE __attribute__((noinline))
+
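+/* Parameter block for the KMA (cipher message with authentication)
+ * instruction; the key field is sized for the largest (AES-256) key. */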
+struct aes_s390x_gcm_params_s
+{
+ u32 reserved[3];
+ u32 counter_value;
+ u64 tag[2];
+ u64 hash_subkey[2];
+ u64 total_aad_length;
+ u64 total_cipher_length;
+ u32 initial_counter_value[4];
+ u64 key[4];
+};
+
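+/* Generate query/execute wrappers for the CPACF instructions (KM, KMC,
+ * KMAC, KMF, KMO).  The query form returns the bit mask of supported
+ * function codes; the execute form runs the instruction, retrying via
+ * 'brc 1,0b' while it ends with condition code 3 (partial completion). */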
+#define DECL_QUERY_FUNC(instruction, opcode) \
+ static u128_t instruction ##_query(void) \
+ { \
+ static u128_t function_codes = 0; \
+ static int initialized = 0; \
+ register unsigned long reg0 asm("0") = 0; \
+ register void *reg1 asm("1") = &function_codes; \
+ u128_t r1, r2; \
+ \
+ if (initialized) \
+ return function_codes; \
+ \
+ asm volatile ("0: .insn rre," #opcode " << 16, %[r1], %[r2]\n\t" \
+ " brc 1,0b\n\t" \
+ : [r1] "=a" (r1), [r2] "=a" (r2) \
+ : [reg0] "r" (reg0), [reg1] "r" (reg1) \
+ : "cc", "memory"); \
+ \
+ initialized = 1; \
+ return function_codes; \
+ }
+
+#define DECL_EXECUTE_FUNC(instruction, opcode, param_const) \
+ static ALWAYS_INLINE size_t \
+ instruction ##_execute(unsigned int func, param_const void *param_block, \
+ void *dst, const void *src, size_t src_len) \
+ { \
+ register unsigned long reg0 asm("0") = func; \
+ register param_const byte *reg1 asm("1") = param_block; \
+ u128_t r1 = ((u128_t)(uintptr_t)dst << 64); \
+ u128_t r2 = ((u128_t)(uintptr_t)src << 64) | (u64)src_len; \
+ \
+ asm volatile ("0: .insn rre," #opcode " << 16, %[r1], %[r2]\n\t" \
+ " brc 1,0b\n\t" \
+ : [r1] "+a" (r1), [r2] "+a" (r2) \
+ : [func] "r" (reg0), [param_ptr] "r" (reg1) \
+ : "cc", "memory"); \
+ \
+ return (u64)r2; \
+ }
+
+DECL_QUERY_FUNC(km, 0xb92e);
+DECL_QUERY_FUNC(kmc, 0xb92f);
+DECL_QUERY_FUNC(kmac, 0xb91e);
+DECL_QUERY_FUNC(kmf, 0xb92a);
+DECL_QUERY_FUNC(kmo, 0xb92b);
+
+DECL_EXECUTE_FUNC(km, 0xb92e, const);
+DECL_EXECUTE_FUNC(kmc, 0xb92f, );
+DECL_EXECUTE_FUNC(kmac, 0xb91e, );
+DECL_EXECUTE_FUNC(kmf, 0xb92a, );
+DECL_EXECUTE_FUNC(kmo, 0xb92b, );
+
+static u128_t kma_query(void)
+{
+ static u128_t function_codes = 0;
+ static int initialized = 0;
+ register unsigned long reg0 asm("0") = 0;
+ register void *reg1 asm("1") = &function_codes;
+ u128_t r1, r2, r3;
+
+ if (initialized)
+ return function_codes;
+
+ asm volatile ("0: .insn rrf,0xb929 << 16, %[r1], %[r2], %[r3], 0\n\t"
+ " brc 1,0b\n\t"
+ : [r1] "=a" (r1), [r2] "=a" (r2), [r3] "=a" (r3)
+ : [reg0] "r" (reg0), [reg1] "r" (reg1)
+ : "cc", "memory");
+
+ initialized = 1;
+ return function_codes;
+}
+
+static ALWAYS_INLINE void
+kma_execute(unsigned int func, void *param_block, byte *dst, const byte *src,
+ size_t src_len, const byte *aad, size_t aad_len)
+{
+ register unsigned long reg0 asm("0") = func;
+ register byte *reg1 asm("1") = param_block;
+ u128_t r1 = ((u128_t)(uintptr_t)dst << 64);
+ u128_t r2 = ((u128_t)(uintptr_t)src << 64) | (u64)src_len;
+ u128_t r3 = ((u128_t)(uintptr_t)aad << 64) | (u64)aad_len;
+
+ asm volatile ("0: .insn rrf,0xb929 << 16, %[r1], %[r2], %[r3], 0\n\t"
+ " brc 1,0b\n\t"
+ : [r1] "+a" (r1), [r2] "+a" (r2), [r3] "+a" (r3),
+ [func] "+r" (reg0)
+ : [param_ptr] "r" (reg1)
+ : "cc", "memory");
+}
+
+unsigned int _gcry_aes_s390x_encrypt(const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src)
+{
+ km_execute (ctx->km_func | KM_ENCRYPT, ctx->keyschenc, dst, src,
+ BLOCKSIZE);
+ return 0;
+}
+
+unsigned int _gcry_aes_s390x_decrypt(const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src)
+{
+ km_execute (ctx->km_func | KM_DECRYPT, ctx->keyschenc, dst, src,
+ BLOCKSIZE);
+ return 0;
+}
+
+static void aes_s390x_cbc_enc(void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int cbc_mac)
+{
+ RIJNDAEL_context *ctx = context;
+ byte *out = outbuf_arg;
+ const byte *in = inbuf_arg;
+ u128_t params[3];
+
+ /* Prepare parameter block. */
+ memcpy (&params[0], iv, BLOCKSIZE);
+ memcpy (&params[1], ctx->keyschenc, 32);
+
+ if (cbc_mac)
+ {
+ kmac_execute (ctx->kmac_func | KM_ENCRYPT, &params, NULL, in,
+ nblocks * BLOCKSIZE);
+ memcpy (out, &params[0], BLOCKSIZE);
+ }
+ else
+ {
+ kmc_execute (ctx->kmc_func | KM_ENCRYPT, &params, out, in,
+ nblocks * BLOCKSIZE);
+ }
+
+ /* Update IV with OCV. */
+ memcpy (iv, &params[0], BLOCKSIZE);
+
+ wipememory (&params, sizeof(params));
+}
+
+static void aes_s390x_cbc_dec(void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = context;
+ byte *out = outbuf_arg;
+ const byte *in = inbuf_arg;
+ u128_t params[3];
+
+ /* Prepare parameter block (ICV & key). */
+ memcpy (&params[0], iv, BLOCKSIZE);
+ memcpy (&params[1], ctx->keyschenc, 32);
+
+ kmc_execute (ctx->kmc_func | KM_DECRYPT, &params, out, in,
+ nblocks * BLOCKSIZE);
+
+ /* Update IV with OCV. */
+ memcpy (iv, &params[0], BLOCKSIZE);
+
+ wipememory (&params, sizeof(params));
+}
+
+static void aes_s390x_cfb128_enc(void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = context;
+ byte *out = outbuf_arg;
+ const byte *in = inbuf_arg;
+ unsigned int function;
+ u128_t params[3];
+
+ /* Prepare parameter block. */
+ memcpy (&params[0], iv, BLOCKSIZE);
+ memcpy (&params[1], ctx->keyschenc, 32);
+
+ function = ctx->kmf_func | KM_ENCRYPT | KMF_LCFB_16;
+ kmf_execute (function, &params, out, in, nblocks * BLOCKSIZE);
+
+ /* Update IV with OCV. */
+ memcpy (iv, &params[0], BLOCKSIZE);
+
+ wipememory (&params, sizeof(params));
+}
+
+static void aes_s390x_cfb128_dec(void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = context;
+ u128_t blocks[64];
+ byte *out = outbuf_arg;
+ const byte *in = inbuf_arg;
+ size_t max_blocks_used = 0;
+
+  /* AES128-CFB128 decryption speed using KMF was observed to be the same as
+   * KMF encryption, ~1.03 cpb. The expectation was to see performance similar
+   * to AES128-CBC decryption, as decryption in both modes should be
+   * parallelizable (CBC shows ~0.22 cpb). There is therefore quite a bit of
+   * room for improvement, and the implementation below using the KM
+   * instruction achieves ~0.70 cpb, a ~30% improvement over KMF.
+   */
+
+ while (nblocks >= 64)
+ {
+ /* Copy IV to encrypt buffer, copy (nblocks - 1) input blocks to
+ * encrypt buffer and update IV. */
+ asm volatile ("mvc 0(16, %[blocks]), 0(%[iv])\n\t"
+ "mvc 16(240, %[blocks]), 0(%[in])\n\t"
+ "mvc 256(256, %[blocks]), 240(%[in])\n\t"
+ "mvc 512(256, %[blocks]), 496(%[in])\n\t"
+ "mvc 768(256, %[blocks]), 752(%[in])\n\t"
+ "mvc 0(16, %[iv]), 1008(%[in])\n\t"
+ :
+ : [in] "a" (in), [out] "a" (out), [blocks] "a" (blocks),
+ [iv] "a" (iv)
+ : "memory");
+
+ /* Perform encryption of temporary buffer. */
+ km_execute (ctx->km_func | KM_ENCRYPT, ctx->keyschenc, blocks, blocks,
+ 64 * BLOCKSIZE);
+
+ /* Xor encrypt buffer with input blocks and store to output blocks. */
+ asm volatile ("xc 0(256, %[blocks]), 0(%[in])\n\t"
+ "xc 256(256, %[blocks]), 256(%[in])\n\t"
+ "xc 512(256, %[blocks]), 512(%[in])\n\t"
+ "xc 768(256, %[blocks]), 768(%[in])\n\t"
+ "mvc 0(256, %[out]), 0(%[blocks])\n\t"
+ "mvc 256(256, %[out]), 256(%[blocks])\n\t"
+ "mvc 512(256, %[out]), 512(%[blocks])\n\t"
+ "mvc 768(256, %[out]), 768(%[blocks])\n\t"
+ :
+ : [in] "a" (in), [out] "a" (out), [blocks] "a" (blocks)
+ : "memory");
+
+ max_blocks_used = 64;
+ in += 64 * BLOCKSIZE;
+ out += 64 * BLOCKSIZE;
+ nblocks -= 64;
+ }
+
+ if (nblocks)
+ {
+ unsigned int pos = 0;
+ size_t in_nblocks = nblocks;
+ size_t num_in = 0;
+
+ max_blocks_used = max_blocks_used < nblocks ? nblocks : max_blocks_used;
+
+ /* Copy IV to encrypt buffer. */
+ asm volatile ("mvc 0(16, %[blocks]), 0(%[iv])\n\t"
+ :
+ : [blocks] "a" (blocks), [iv] "a" (iv)
+ : "memory");
+ pos += 1;
+
+#define CFB_MOVE_BLOCKS(block_oper, move_nbytes) \
+ block_oper (in_nblocks - 1 >= move_nbytes / BLOCKSIZE) \
+ { \
+ unsigned int move_nblocks = move_nbytes / BLOCKSIZE; \
+ asm volatile ("mvc 0(" #move_nbytes ", %[blocks_x]), 0(%[in])\n\t" \
+ : \
+ : [blocks_x] "a" (&blocks[pos]), [in] "a" (in) \
+ : "memory"); \
+ num_in += move_nblocks; \
+ in += move_nblocks * BLOCKSIZE; \
+ pos += move_nblocks; \
+ in_nblocks -= move_nblocks; \
+ }
+
+ /* Copy (nblocks - 1) input blocks to encrypt buffer. */
+ CFB_MOVE_BLOCKS(while, 256);
+ CFB_MOVE_BLOCKS(if, 128);
+ CFB_MOVE_BLOCKS(if, 64);
+ CFB_MOVE_BLOCKS(if, 32);
+ CFB_MOVE_BLOCKS(if, 16);
+
+#undef CFB_MOVE_BLOCKS
+
+ /* Update IV. */
+ asm volatile ("mvc 0(16, %[iv]), 0(%[in])\n\t"
+ :
+ : [iv] "a" (iv), [in] "a" (in)
+ : "memory");
+ num_in += 1;
+ in += BLOCKSIZE;
+
+ /* Perform encryption of temporary buffer. */
+ km_execute (ctx->km_func | KM_ENCRYPT, ctx->keyschenc, blocks, blocks,
+ nblocks * BLOCKSIZE);
+
+ /* Xor encrypt buffer with input blocks and store to output blocks. */
+ pos = 0;
+ in -= nblocks * BLOCKSIZE;
+
+#define CFB_XOR_BLOCKS(block_oper, xor_nbytes) \
+ block_oper (nblocks >= xor_nbytes / BLOCKSIZE) \
+ { \
+ unsigned int xor_nblocks = xor_nbytes / BLOCKSIZE; \
+ asm volatile ("xc 0(" #xor_nbytes ", %[blocks_x]), 0(%[in])\n\t" \
+ "mvc 0(" #xor_nbytes ", %[out]), 0(%[blocks_x])\n\t" \
+ : \
+ : [blocks_x] "a" (&blocks[pos]), [out] "a" (out), \
+ [in] "a" (in) \
+ : "memory"); \
+ out += xor_nblocks * BLOCKSIZE; \
+ in += xor_nblocks * BLOCKSIZE; \
+ nblocks -= xor_nblocks; \
+ pos += xor_nblocks; \
+ }
+
+ CFB_XOR_BLOCKS(while, 256);
+ CFB_XOR_BLOCKS(if, 128);
+ CFB_XOR_BLOCKS(if, 64);
+ CFB_XOR_BLOCKS(if, 32);
+ CFB_XOR_BLOCKS(if, 16);
+
+#undef CFB_XOR_BLOCKS
+ }
+
+ if (max_blocks_used)
+ wipememory (&blocks, max_blocks_used * BLOCKSIZE);
+}
+
+static void aes_s390x_ofb_enc(void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = context;
+ byte *out = outbuf_arg;
+ const byte *in = inbuf_arg;
+ unsigned int function;
+ u128_t params[3];
+
+ /* Prepare parameter block. */
+ memcpy (&params[0], iv, BLOCKSIZE);
+ memcpy (&params[1], ctx->keyschenc, 32);
+
+ function = ctx->kmo_func | KM_ENCRYPT;
+ kmo_execute (function, &params, out, in, nblocks * BLOCKSIZE);
+
+ /* Update IV with OCV. */
+ memcpy (iv, &params[0], BLOCKSIZE);
+
+ wipememory (&params, sizeof(params));
+}
+
+static void aes_s390x_ctr128_enc(void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = context;
+ byte *out = outbuf_arg;
+ const byte *in = inbuf_arg;
+ unsigned int function;
+ struct aes_s390x_gcm_params_s params;
+
+ memset (&params.hash_subkey, 0, sizeof(params.hash_subkey));
+ memcpy (&params.key, ctx->keyschenc, 32);
+
+ function = ctx->kma_func | KM_DECRYPT | KMA_HS | KMA_LAAD;
+
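+  /* KMA only increments the low 32 bits of the counter, so process at
+   * most 'to_overflow' blocks per iteration and propagate the carry into
+   * the upper counter words with cipher_block_add. */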
+ while (nblocks)
+ {
+ u64 to_overflow = (u64)0xFFFFFFFFU + 1 - buf_get_be32 (ctr + 12);
+ u64 ncurr = nblocks > to_overflow ? to_overflow : nblocks;
+
+ /* Prepare parameter block. */
+ memset (&params.reserved, 0, sizeof(params.reserved));
+ buf_put_be32 (&params.counter_value, buf_get_be32(ctr + 12) - 1);
+ memcpy (&params.initial_counter_value, ctr, 16);
+ params.initial_counter_value[3] = params.counter_value;
+ memset (&params.tag, 0, sizeof(params.tag));
+ params.total_aad_length = 0;
+ params.total_cipher_length = 0;
+
+ /* Update counter. */
+ cipher_block_add (ctr, ncurr, BLOCKSIZE);
+ if (ncurr == (u64)0xFFFFFFFFU + 1)
+ cipher_block_add (ctr, 1, BLOCKSIZE);
+
+ /* Perform CTR using KMA-GCM. */
+ kma_execute (function, &params, out, in, ncurr * BLOCKSIZE, NULL, 0);
+
+ out += ncurr * BLOCKSIZE;
+ in += ncurr * BLOCKSIZE;
+ nblocks -= ncurr;
+ }
+
+ wipememory (&params, sizeof(params));
+}
+
+static size_t aes_s390x_gcm_crypt(gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ byte *out = outbuf_arg;
+ const byte *in = inbuf_arg;
+ byte *ctr = c->u_ctr.ctr;
+ unsigned int function;
+ struct aes_s390x_gcm_params_s params;
+
+ function = ctx->kma_func | (encrypt ? KM_ENCRYPT : KM_DECRYPT)
+ | KMA_HS | KMA_LAAD;
+
+ /* Prepare parameter block. */
+ memset (&params.reserved, 0, sizeof(params.reserved));
+ buf_put_be32 (&params.counter_value, buf_get_be32(ctr + 12) - 1);
+ memcpy (&params.tag, c->u_mode.gcm.u_tag.tag, 16);
+ memcpy (&params.hash_subkey, c->u_mode.gcm.u_ghash_key.key, 16);
+ params.total_aad_length = 0;
+ params.total_cipher_length = 0;
+ memcpy (&params.initial_counter_value, ctr, 12);
+ params.initial_counter_value[3] = params.counter_value;
+ memcpy (&params.key, ctx->keyschenc, 32);
+
+ /* Update counter (CTR32). */
+ buf_put_be32(ctr + 12, buf_get_be32(ctr + 12) + nblocks);
+
+ /* Perform KMA-GCM. */
+ kma_execute (function, &params, out, in, nblocks * BLOCKSIZE, NULL, 0);
+
+ /* Update tag. */
+ memcpy (c->u_mode.gcm.u_tag.tag, &params.tag, 16);
+
+ wipememory (&params, sizeof(params));
+
+ return 0;
+}
+
+static void aes_s390x_xts_crypt(void *context, unsigned char *tweak,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt)
+{
+ RIJNDAEL_context *ctx = context;
+ byte *out = outbuf_arg;
+ const byte *in = inbuf_arg;
+ unsigned int function;
+ u128_t params[3];
+ u128_t *params_tweak;
+
+ if (ctx->rounds < 12)
+ {
+ memcpy (&params[0], ctx->keyschenc, 16);
+ params_tweak = &params[1];
+ memcpy (params_tweak, tweak, BLOCKSIZE);
+ }
+ else if (ctx->rounds == 12)
+ {
+ BUG(); /* KM-XTS-AES-192 not defined. */
+ }
+ else
+ {
+ memcpy (&params[0], ctx->keyschenc, 32);
+ params_tweak = &params[2];
+ memcpy (params_tweak, tweak, BLOCKSIZE);
+ }
+
+ function = ctx->km_func_xts | (encrypt ? KM_ENCRYPT : KM_DECRYPT);
+ km_execute (function, &params, out, in, nblocks * BLOCKSIZE);
+
+ /* Update tweak with XTSP. */
+ memcpy (tweak, params_tweak, BLOCKSIZE);
+
+ wipememory (&params, sizeof(params));
+}
+
+static NO_INLINE void
+aes_s390x_ocb_prepare_Ls (gcry_cipher_hd_t c, u64 blkn, const void *Ls[64],
+ const void ***pl)
+{
+ unsigned int n = 64 - (blkn % 64);
+ int i;
+
+ /* Prepare L pointers. */
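+  /* The table holds the fixed ntz() pattern L0,L1,L0,L2,L0,L1,L0,...;
+   * the slot addressed by *pl is patched per 64-block chunk with the
+   * offset returned by ocb_get_l(). */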
+ *pl = &Ls[(63 + n) % 64];
+ for (i = 0; i < 64; i += 8, n = (n + 8) % 64)
+ {
+ static const int lastL[8] = { 3, 4, 3, 5, 3, 4, 3, 0 };
+
+ Ls[(0 + n) % 64] = c->u_mode.ocb.L[0];
+ Ls[(1 + n) % 64] = c->u_mode.ocb.L[1];
+ Ls[(2 + n) % 64] = c->u_mode.ocb.L[0];
+ Ls[(3 + n) % 64] = c->u_mode.ocb.L[2];
+ Ls[(4 + n) % 64] = c->u_mode.ocb.L[0];
+ Ls[(5 + n) % 64] = c->u_mode.ocb.L[1];
+ Ls[(6 + n) % 64] = c->u_mode.ocb.L[0];
+ Ls[(7 + n) % 64] = c->u_mode.ocb.L[lastL[i / 8]];
+ }
+}
+
+static NO_INLINE void
+aes_s390x_ocb_checksum (unsigned char *checksum, const void *plainbuf_arg,
+ size_t nblks)
+{
+ const char *plainbuf = plainbuf_arg;
+ u64 tmp0[2];
+ u64 tmp1[2] = { 0, 0 };
+ u64 tmp2[2] = { 0, 0 };
+ u64 tmp3[2] = { 0, 0 };
+
+ cipher_block_cpy (tmp0, checksum, BLOCKSIZE);
+
+ if (nblks >= 4)
+ {
+ while (nblks >= 4)
+ {
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ cipher_block_xor_1 (tmp0, plainbuf + 0 * BLOCKSIZE, BLOCKSIZE);
+ cipher_block_xor_1 (tmp1, plainbuf + 1 * BLOCKSIZE, BLOCKSIZE);
+ cipher_block_xor_1 (tmp2, plainbuf + 2 * BLOCKSIZE, BLOCKSIZE);
+ cipher_block_xor_1 (tmp3, plainbuf + 3 * BLOCKSIZE, BLOCKSIZE);
+
+ plainbuf += 4 * BLOCKSIZE;
+ nblks -= 4;
+ }
+
+ cipher_block_xor_1 (tmp0, tmp1, BLOCKSIZE);
+ cipher_block_xor_1 (tmp2, tmp3, BLOCKSIZE);
+ cipher_block_xor_1 (tmp0, tmp2, BLOCKSIZE);
+
+ wipememory (tmp1, sizeof(tmp1));
+ wipememory (tmp2, sizeof(tmp2));
+ wipememory (tmp3, sizeof(tmp3));
+ }
+
+ while (nblks > 0)
+ {
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ cipher_block_xor_1 (tmp0, plainbuf, BLOCKSIZE);
+
+ plainbuf += BLOCKSIZE;
+ nblks--;
+ }
+
+ cipher_block_cpy (checksum, tmp0, BLOCKSIZE);
+
+ wipememory (tmp0, sizeof(tmp0));
+}
+
+static NO_INLINE size_t
+aes_s390x_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks_arg)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ size_t nblocks = nblocks_arg;
+ u128_t blocks[64];
+ u128_t offset;
+ size_t max_blocks_used = 0;
+ u64 blkn = c->u_mode.ocb.data_nblocks;
+ unsigned int function = ctx->km_func | KM_ENCRYPT;
+ const void *Ls[64];
+ const void **pl;
+
+ aes_s390x_ocb_prepare_Ls (c, blkn, Ls, &pl);
+
+ /* Checksumming could be done inline in OCB_INPUT macros, but register
+ * pressure becomes too heavy and performance would end up being worse.
+ * For decryption, checksumming is part of OCB_OUTPUT macros as
+ * output handling is less demanding and can handle the additional
+ * computation. */
+ aes_s390x_ocb_checksum (c->u_ctr.ctr, inbuf_arg, nblocks_arg);
+
+ cipher_block_cpy (&offset, &c->u_iv.iv, BLOCKSIZE);
+
+#define OCB_INPUT(n) \
+ cipher_block_xor_2dst (&blocks[n], &offset, Ls[n], BLOCKSIZE); \
+ cipher_block_xor (outbuf + (n) * BLOCKSIZE, inbuf + (n) * BLOCKSIZE, \
+ &offset, BLOCKSIZE)
+
+#define OCB_INPUT_4(n) \
+ OCB_INPUT((n) + 0); OCB_INPUT((n) + 1); OCB_INPUT((n) + 2); \
+ OCB_INPUT((n) + 3)
+
+#define OCB_INPUT_16(n) \
+ OCB_INPUT_4((n) + 0); OCB_INPUT_4((n) + 4); OCB_INPUT_4((n) + 8); \
+ OCB_INPUT_4((n) + 12);
+
+#define OCB_OUTPUT(n) \
+ cipher_block_xor_1 (outbuf + (n) * BLOCKSIZE, &blocks[n], BLOCKSIZE)
+
+#define OCB_OUTPUT_4(n) \
+ OCB_OUTPUT((n) + 0); OCB_OUTPUT((n) + 1); OCB_OUTPUT((n) + 2); \
+ OCB_OUTPUT((n) + 3)
+
+#define OCB_OUTPUT_16(n) \
+ OCB_OUTPUT_4((n) + 0); OCB_OUTPUT_4((n) + 4); OCB_OUTPUT_4((n) + 8); \
+ OCB_OUTPUT_4((n) + 12);
+
+ while (nblocks >= 64)
+ {
+ blkn += 64;
+ *pl = ocb_get_l(c, blkn - blkn % 64);
+
+ OCB_INPUT_16(0);
+ OCB_INPUT_16(16);
+ OCB_INPUT_16(32);
+ OCB_INPUT_16(48);
+
+ km_execute (function, ctx->keyschenc, outbuf, outbuf, 64 * BLOCKSIZE);
+
+ asm volatile ("xc 0(256, %[out]), 0(%[blocks])\n\t"
+ "xc 256(256, %[out]), 256(%[blocks])\n\t"
+ "xc 512(256, %[out]), 512(%[blocks])\n\t"
+ "xc 768(256, %[out]), 768(%[blocks])\n\t"
+ :
+ : [out] "a" (outbuf), [blocks] "a" (blocks)
+ : "memory");
+
+ max_blocks_used = 64;
+ inbuf += 64 * BLOCKSIZE;
+ outbuf += 64 * BLOCKSIZE;
+ nblocks -= 64;
+ }
+
+ if (nblocks)
+ {
+ unsigned int pos = 0;
+
+ max_blocks_used = max_blocks_used < nblocks ? nblocks : max_blocks_used;
+
+ blkn += nblocks;
+ *pl = ocb_get_l(c, blkn - blkn % 64);
+
+ while (nblocks >= 16)
+ {
+ OCB_INPUT_16(pos + 0);
+ pos += 16;
+ nblocks -= 16;
+ }
+ while (nblocks >= 4)
+ {
+ OCB_INPUT_4(pos + 0);
+ pos += 4;
+ nblocks -= 4;
+ }
+ if (nblocks >= 2)
+ {
+ OCB_INPUT(pos + 0);
+ OCB_INPUT(pos + 1);
+ pos += 2;
+ nblocks -= 2;
+ }
+ if (nblocks >= 1)
+ {
+ OCB_INPUT(pos + 0);
+ pos += 1;
+ nblocks -= 1;
+ }
+
+ nblocks = pos;
+ pos = 0;
+ km_execute (function, ctx->keyschenc, outbuf, outbuf,
+ nblocks * BLOCKSIZE);
+
+ while (nblocks >= 16)
+ {
+ OCB_OUTPUT_16(pos + 0);
+ pos += 16;
+ nblocks -= 16;
+ }
+ while (nblocks >= 4)
+ {
+ OCB_OUTPUT_4(pos + 0);
+ pos += 4;
+ nblocks -= 4;
+ }
+ if (nblocks >= 2)
+ {
+ OCB_OUTPUT(pos + 0);
+ OCB_OUTPUT(pos + 1);
+ pos += 2;
+ nblocks -= 2;
+ }
+ if (nblocks >= 1)
+ {
+ OCB_OUTPUT(pos + 0);
+ pos += 1;
+ nblocks -= 1;
+ }
+ }
+
+#undef OCB_INPUT
+#undef OCB_INPUT_4
+#undef OCB_INPUT_16
+#undef OCB_OUTPUT
+#undef OCB_OUTPUT_4
+#undef OCB_OUTPUT_16
+
+ c->u_mode.ocb.data_nblocks = blkn;
+ cipher_block_cpy (&c->u_iv.iv, &offset, BLOCKSIZE);
+
+ if (max_blocks_used)
+ wipememory (&blocks, max_blocks_used * BLOCKSIZE);
+
+ return 0;
+}
+
+static NO_INLINE size_t
+aes_s390x_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks_arg)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ size_t nblocks = nblocks_arg;
+ u128_t blocks[64];
+ u128_t offset;
+ size_t max_blocks_used = 0;
+ u64 blkn = c->u_mode.ocb.data_nblocks;
+ unsigned int function = ctx->km_func | KM_DECRYPT;
+ const void *Ls[64];
+ const void **pl;
+
+ aes_s390x_ocb_prepare_Ls (c, blkn, Ls, &pl);
+
+ cipher_block_cpy (&offset, &c->u_iv.iv, BLOCKSIZE);
+
+#define OCB_INPUT(n) \
+ cipher_block_xor_2dst (&blocks[n], &offset, Ls[n], BLOCKSIZE); \
+ cipher_block_xor (outbuf + (n) * BLOCKSIZE, inbuf + (n) * BLOCKSIZE, \
+ &offset, BLOCKSIZE)
+
+#define OCB_INPUT_4(n) \
+ OCB_INPUT((n) + 0); OCB_INPUT((n) + 1); OCB_INPUT((n) + 2); \
+ OCB_INPUT((n) + 3)
+
+#define OCB_INPUT_16(n) \
+ OCB_INPUT_4((n) + 0); OCB_INPUT_4((n) + 4); OCB_INPUT_4((n) + 8); \
+ OCB_INPUT_4((n) + 12);
+
+#define OCB_OUTPUT(n) \
+ cipher_block_xor_1 (&blocks[n], outbuf + (n) * BLOCKSIZE, BLOCKSIZE); \
+ cipher_block_xor_1 (c->u_ctr.ctr, &blocks[n], BLOCKSIZE); \
+ cipher_block_cpy (outbuf + (n) * BLOCKSIZE, &blocks[n], BLOCKSIZE);
+
+#define OCB_OUTPUT_4(n) \
+ OCB_OUTPUT((n) + 0); OCB_OUTPUT((n) + 1); OCB_OUTPUT((n) + 2); \
+ OCB_OUTPUT((n) + 3)
+
+#define OCB_OUTPUT_16(n) \
+ OCB_OUTPUT_4((n) + 0); OCB_OUTPUT_4((n) + 4); OCB_OUTPUT_4((n) + 8); \
+ OCB_OUTPUT_4((n) + 12);
+
+ while (nblocks >= 64)
+ {
+ blkn += 64;
+ *pl = ocb_get_l(c, blkn - blkn % 64);
+
+ OCB_INPUT_16(0);
+ OCB_INPUT_16(16);
+ OCB_INPUT_16(32);
+ OCB_INPUT_16(48);
+
+ km_execute (function, ctx->keyschenc, outbuf, outbuf, 64 * BLOCKSIZE);
+
+ asm volatile ("xc 0(256, %[out]), 0(%[blocks])\n\t"
+ "xc 256(256, %[out]), 256(%[blocks])\n\t"
+ "xc 512(256, %[out]), 512(%[blocks])\n\t"
+ "xc 768(256, %[out]), 768(%[blocks])\n\t"
+ :
+ : [out] "a" (outbuf), [blocks] "a" (blocks)
+ : "memory");
+
+ max_blocks_used = 64;
+ inbuf += 64 * BLOCKSIZE;
+ outbuf += 64 * BLOCKSIZE;
+ nblocks -= 64;
+ }
+
+ if (nblocks)
+ {
+ unsigned int pos = 0;
+
+ max_blocks_used = max_blocks_used < nblocks ? nblocks : max_blocks_used;
+
+ blkn += nblocks;
+ *pl = ocb_get_l(c, blkn - blkn % 64);
+
+ while (nblocks >= 16)
+ {
+ OCB_INPUT_16(pos + 0);
+ pos += 16;
+ nblocks -= 16;
+ }
+ while (nblocks >= 4)
+ {
+ OCB_INPUT_4(pos + 0);
+ pos += 4;
+ nblocks -= 4;
+ }
+ if (nblocks >= 2)
+ {
+ OCB_INPUT(pos + 0);
+ OCB_INPUT(pos + 1);
+ pos += 2;
+ nblocks -= 2;
+ }
+ if (nblocks >= 1)
+ {
+ OCB_INPUT(pos + 0);
+ pos += 1;
+ nblocks -= 1;
+ }
+
+ nblocks = pos;
+ pos = 0;
+ km_execute (function, ctx->keyschenc, outbuf, outbuf,
+ nblocks * BLOCKSIZE);
+
+ while (nblocks >= 16)
+ {
+ OCB_OUTPUT_16(pos + 0);
+ pos += 16;
+ nblocks -= 16;
+ }
+ while (nblocks >= 4)
+ {
+ OCB_OUTPUT_4(pos + 0);
+ pos += 4;
+ nblocks -= 4;
+ }
+ if (nblocks >= 2)
+ {
+ OCB_OUTPUT(pos + 0);
+ OCB_OUTPUT(pos + 1);
+ pos += 2;
+ nblocks -= 2;
+ }
+ if (nblocks >= 1)
+ {
+ OCB_OUTPUT(pos + 0);
+ pos += 1;
+ nblocks -= 1;
+ }
+ }
+
+#undef OCB_INPUT
+#undef OCB_INPUT_4
+#undef OCB_INPUT_16
+#undef OCB_OUTPUT
+#undef OCB_OUTPUT_4
+#undef OCB_OUTPUT_16
+
+ c->u_mode.ocb.data_nblocks = blkn;
+ cipher_block_cpy (&c->u_iv.iv, &offset, BLOCKSIZE);
+
+ if (max_blocks_used)
+ wipememory (&blocks, max_blocks_used * BLOCKSIZE);
+
+ return 0;
+}
+
+static size_t
+aes_s390x_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks_arg, int encrypt)
+{
+ if (encrypt)
+ return aes_s390x_ocb_enc (c, outbuf_arg, inbuf_arg, nblocks_arg);
+ else
+ return aes_s390x_ocb_dec (c, outbuf_arg, inbuf_arg, nblocks_arg);
+}
+
+static size_t
+aes_s390x_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+ size_t nblocks_arg)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ const unsigned char *abuf = abuf_arg;
+ u128_t blocks[64];
+ u128_t offset;
+ size_t max_blocks_used = 0;
+ u64 blkn = c->u_mode.ocb.aad_nblocks;
+ unsigned int function = ctx->km_func | KM_ENCRYPT;
+ const void *Ls[64];
+ const void **pl;
+
+ aes_s390x_ocb_prepare_Ls (c, blkn, Ls, &pl);
+
+ cipher_block_cpy (&offset, c->u_mode.ocb.aad_offset, BLOCKSIZE);
+
+#define OCB_INPUT(n) \
+ cipher_block_xor_2dst (&blocks[n], &offset, Ls[n], BLOCKSIZE); \
+ cipher_block_xor_1 (&blocks[n], abuf + (n) * BLOCKSIZE, BLOCKSIZE)
+
+#define OCB_INPUT_4(n) \
+ OCB_INPUT((n) + 0); OCB_INPUT((n) + 1); OCB_INPUT((n) + 2); \
+ OCB_INPUT((n) + 3)
+
+#define OCB_INPUT_16(n) \
+ OCB_INPUT_4((n) + 0); OCB_INPUT_4((n) + 4); OCB_INPUT_4((n) + 8); \
+ OCB_INPUT_4((n) + 12);
+
+ while (nblocks_arg >= 64)
+ {
+ blkn += 64;
+ *pl = ocb_get_l(c, blkn - blkn % 64);
+
+ OCB_INPUT_16(0);
+ OCB_INPUT_16(16);
+ OCB_INPUT_16(32);
+ OCB_INPUT_16(48);
+
+ km_execute (function, ctx->keyschenc, blocks, blocks, 64 * BLOCKSIZE);
+
+ aes_s390x_ocb_checksum (c->u_mode.ocb.aad_sum, blocks, 64);
+
+ max_blocks_used = 64;
+ abuf += 64 * BLOCKSIZE;
+ nblocks_arg -= 64;
+ }
+
+ if (nblocks_arg > 0)
+ {
+ size_t nblocks = nblocks_arg;
+ unsigned int pos = 0;
+
+ max_blocks_used = max_blocks_used < nblocks ? nblocks : max_blocks_used;
+
+ blkn += nblocks;
+ *pl = ocb_get_l(c, blkn - blkn % 64);
+
+ while (nblocks >= 16)
+ {
+ OCB_INPUT_16(pos + 0);
+ pos += 16;
+ nblocks -= 16;
+ }
+ while (nblocks >= 4)
+ {
+ OCB_INPUT_4(pos + 0);
+ pos += 4;
+ nblocks -= 4;
+ }
+ if (nblocks >= 2)
+ {
+ OCB_INPUT(pos + 0);
+ OCB_INPUT(pos + 1);
+ pos += 2;
+ nblocks -= 2;
+ }
+ if (nblocks >= 1)
+ {
+ OCB_INPUT(pos + 0);
+ pos += 1;
+ nblocks -= 1;
+ }
+
+ nblocks = pos;
+ nblocks_arg -= pos;
+ pos = 0;
+ km_execute (function, ctx->keyschenc, blocks, blocks,
+ nblocks * BLOCKSIZE);
+
+ aes_s390x_ocb_checksum (c->u_mode.ocb.aad_sum, blocks, nblocks);
+ }
+
+#undef OCB_INPUT
+#undef OCB_INPUT_4
+#undef OCB_INPUT_16
+
+ c->u_mode.ocb.aad_nblocks = blkn;
+ cipher_block_cpy (c->u_mode.ocb.aad_offset, &offset, BLOCKSIZE);
+
+ if (max_blocks_used)
+ wipememory (&blocks, max_blocks_used * BLOCKSIZE);
+
+ return 0;
+}
+
+int _gcry_aes_s390x_setup_acceleration(RIJNDAEL_context *ctx,
+ unsigned int keylen,
+ unsigned int hwfeatures,
+ cipher_bulk_ops_t *bulk_ops)
+{
+ unsigned int func;
+ unsigned int func_xts;
+ u128_t func_mask;
+ u128_t func_xts_mask;
+
+ if (!(hwfeatures & HWF_S390X_MSA))
+ return 0;
+
+ switch (keylen)
+ {
+ default:
+ case 16:
+ func = KM_FUNCTION_AES_128;
+ func_xts = KM_FUNCTION_XTS_AES_128;
+ func_mask = km_function_to_mask(KM_FUNCTION_AES_128);
+ func_xts_mask = km_function_to_mask(KM_FUNCTION_XTS_AES_128);
+ break;
+ case 24:
+ func = KM_FUNCTION_AES_192;
+ func_xts = 0;
+ func_mask = km_function_to_mask(KM_FUNCTION_AES_192);
+ func_xts_mask = 0; /* XTS-AES192 not available. */
+ break;
+ case 32:
+ func = KM_FUNCTION_AES_256;
+ func_xts = KM_FUNCTION_XTS_AES_256;
+ func_mask = km_function_to_mask(KM_FUNCTION_AES_256);
+      func_xts_mask = km_function_to_mask(KM_FUNCTION_XTS_AES_256);
+ break;
+ }
+
+ /* Query KM for supported algorithms and check if acceleration for
+ * requested key-length is available. */
+ if (!(km_query () & func_mask))
+ return 0;
+
+ ctx->km_func = func;
+
+ /* Query KM for supported XTS algorithms. */
+ if (km_query () & func_xts_mask)
+ ctx->km_func_xts = func_xts;
+
+ /* Query KMC for supported algorithms. */
+ if (kmc_query () & func_mask)
+ ctx->kmc_func = func;
+
+ /* Query KMAC for supported algorithms. */
+ if (kmac_query () & func_mask)
+ ctx->kmac_func = func;
+
+ if (hwfeatures & HWF_S390X_MSA_4)
+ {
+ /* Query KMF for supported algorithms. */
+ if (kmf_query () & func_mask)
+ ctx->kmf_func = func;
+
+ /* Query KMO for supported algorithms. */
+ if (kmo_query () & func_mask)
+ ctx->kmo_func = func;
+ }
+
+ if (hwfeatures & HWF_S390X_MSA_8)
+ {
+ /* Query KMA for supported algorithms. */
+ if (kma_query () & func_mask)
+ ctx->kma_func = func;
+ }
+
+ /* Setup zSeries bulk encryption/decryption routines. */
+
+ if (ctx->km_func)
+ {
+ bulk_ops->ocb_crypt = aes_s390x_ocb_crypt;
+ bulk_ops->ocb_auth = aes_s390x_ocb_auth;
+
+ /* CFB128 decryption uses KM instruction, instead of KMF. */
+ bulk_ops->cfb_dec = aes_s390x_cfb128_dec;
+ }
+
+ if (ctx->km_func_xts)
+ {
+ bulk_ops->xts_crypt = aes_s390x_xts_crypt;
+ }
+
+ if (ctx->kmc_func)
+ {
+ if(ctx->kmac_func)
+ {
+ /* Either KMC or KMAC used depending on 'cbc_mac' parameter. */
+ bulk_ops->cbc_enc = aes_s390x_cbc_enc;
+ }
+
+ bulk_ops->cbc_dec = aes_s390x_cbc_dec;
+ }
+
+ if (ctx->kmf_func)
+ {
+ bulk_ops->cfb_enc = aes_s390x_cfb128_enc;
+ }
+
+ if (ctx->kmo_func)
+ {
+ bulk_ops->ofb_enc = aes_s390x_ofb_enc;
+ }
+
+ if (ctx->kma_func)
+ {
+ bulk_ops->ctr_enc = aes_s390x_ctr128_enc;
+
+ if (kimd_query () & km_function_to_mask (KMID_FUNCTION_GHASH))
+ {
+ /* KIMD based GHASH implementation is required with AES-GCM
+ * acceleration. */
+ bulk_ops->gcm_crypt = aes_s390x_gcm_crypt;
+ }
+ }
+
+ return 1;
+}
+
+void _gcry_aes_s390x_setkey(RIJNDAEL_context *ctx, const byte *key)
+{
+ unsigned int keylen = 16 + (ctx->rounds - 10) * 4;
+ memcpy (ctx->keyschenc, key, keylen);
+}
+
+void _gcry_aes_s390x_prepare_decryption(RIJNDAEL_context *ctx)
+{
+ /* Do nothing. */
+ (void)ctx;
+}
+
+#endif /* USE_S390X_CRYPTO */
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-ssse3-amd64-asm.S b/comm/third_party/libgcrypt/cipher/rijndael-ssse3-amd64-asm.S
new file mode 100644
index 0000000000..8124eb2198
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael-ssse3-amd64-asm.S
@@ -0,0 +1,874 @@
+/* SSSE3 vector permutation AES for Libgcrypt
+ * Copyright (C) 2014-2017 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * The code is based on the public domain library libvpaes version 0.5
+ * available at http://crypto.stanford.edu/vpaes/ and which carries
+ * this notice:
+ *
+ * libvpaes: constant-time SSSE3 AES encryption and decryption.
+ * version 0.5
+ *
+ * By Mike Hamburg, Stanford University, 2009. Public domain.
+ * I wrote essentially all of this code. I did not write the test
+ * vectors; they are the NIST known answer tests. I hereby release all
+ * the code and documentation here that I wrote into the public domain.
+ *
+ * This is an implementation of AES following my paper,
+ *  "Accelerating AES with Vector Permute Instructions",
+ * CHES 2009; http://shiftleft.org/papers/vector_aes/
+ */
+
+#if defined(__x86_64__)
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+.text
+
+##
+## _gcry_aes_ssse3_enc_preload
+##
+ELF(.type _gcry_aes_ssse3_enc_preload,@function)
+.globl _gcry_aes_ssse3_enc_preload
+_gcry_aes_ssse3_enc_preload:
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+ lea .Laes_consts(%rip), %rax
+ movdqa (%rax), %xmm9 # 0F
+ movdqa .Lk_inv (%rax), %xmm10 # inv
+ movdqa .Lk_inv+16(%rax), %xmm11 # inva
+ movdqa .Lk_sb1 (%rax), %xmm13 # sb1u
+ movdqa .Lk_sb1+16(%rax), %xmm12 # sb1t
+ movdqa .Lk_sb2 (%rax), %xmm15 # sb2u
+ movdqa .Lk_sb2+16(%rax), %xmm14 # sb2t
+ EXIT_SYSV_FUNC
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload)
+
+##
+## _gcry_aes_ssse3_dec_preload
+##
+ELF(.type _gcry_aes_ssse3_dec_preload,@function)
+.globl _gcry_aes_ssse3_dec_preload
+_gcry_aes_ssse3_dec_preload:
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+ lea .Laes_consts(%rip), %rax
+ movdqa (%rax), %xmm9 # 0F
+ movdqa .Lk_inv (%rax), %xmm10 # inv
+ movdqa .Lk_inv+16(%rax), %xmm11 # inva
+ movdqa .Lk_dsb9 (%rax), %xmm13 # sb9u
+ movdqa .Lk_dsb9+16(%rax), %xmm12 # sb9t
+ movdqa .Lk_dsbd (%rax), %xmm15 # sbdu
+ movdqa .Lk_dsbb (%rax), %xmm14 # sbbu
+ movdqa .Lk_dsbe (%rax), %xmm8 # sbeu
+ EXIT_SYSV_FUNC
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_ssse3_dec_preload,.-_gcry_aes_ssse3_dec_preload)
+
+##
+## Constant-time SSSE3 AES core implementation.
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+
+##
+## _aes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+##  %xmm9-%xmm15 as loaded by _gcry_aes_ssse3_enc_preload
+## (%rdi) = scheduled keys
+## %rsi = nrounds
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm4, %r9, %r11, %rax, %rcx, %rdx
+## Preserves %xmm6 - %xmm7 so you get some local vectors
+##
+##
+.align 16
+ELF(.type _gcry_aes_ssse3_encrypt_core,@function)
+.globl _gcry_aes_ssse3_encrypt_core
+_gcry_aes_ssse3_encrypt_core:
+_aes_encrypt_core:
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+ mov %rdi, %rdx
+ leaq -1(%rsi), %rax
+ lea .Laes_consts(%rip), %rcx
+ leaq .Lk_mc_backward(%rcx), %rdi
+ mov $16, %rsi
+ movdqa .Lk_ipt (%rcx), %xmm2 # iptlo
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld $4, %xmm1
+ pand %xmm9, %xmm0
+ pshufb %xmm0, %xmm2
+ movdqa .Lk_ipt+16(%rcx), %xmm0 # ipthi
+ pshufb %xmm1, %xmm0
+ pxor (%rdx),%xmm2
+ pxor %xmm2, %xmm0
+ add $16, %rdx
+ jmp .Laes_entry
+
+.align 8
+.Laes_loop:
+ # middle of middle round
+ movdqa %xmm13, %xmm4 # 4 : sb1u
+ pshufb %xmm2, %xmm4 # 4 = sb1u
+ pxor (%rdx), %xmm4 # 4 = sb1u + k
+ movdqa %xmm12, %xmm0 # 0 : sb1t
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ movdqa %xmm15, %xmm4 # 4 : sb2u
+ pshufb %xmm2, %xmm4 # 4 = sb2u
+ movdqa .Lk_mc_forward-.Lk_mc_backward(%rsi,%rdi), %xmm1
+ movdqa %xmm14, %xmm2 # 2 : sb2t
+ pshufb %xmm3, %xmm2 # 2 = sb2t
+ pxor %xmm4, %xmm2 # 2 = 2A
+ movdqa %xmm0, %xmm3 # 3 = A
+ pshufb %xmm1, %xmm0 # 0 = B
+ pxor %xmm2, %xmm0 # 0 = 2A+B
+ pshufb (%rsi,%rdi), %xmm3 # 3 = D
+ lea 16(%esi),%esi # next mc
+ pxor %xmm0, %xmm3 # 3 = 2A+B+D
+ lea 16(%rdx),%rdx # next key
+ pshufb %xmm1, %xmm0 # 0 = 2B+C
+ pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
+ and $48, %rsi # ... mod 4
+ dec %rax # nr--
+
+.Laes_entry:
+ # top of round
+ movdqa %xmm9, %xmm1 # 1 : i
+ pandn %xmm0, %xmm1 # 1 = i<<4
+ psrld $4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ jnz .Laes_loop
+
+ # middle of last round
+ movdqa .Lk_sbo(%rcx), %xmm4 # 3 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ pxor (%rdx), %xmm4 # 4 = sb1u + k
+ movdqa .Lk_sbo+16(%rcx), %xmm0 # 0 : sbot
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ pshufb .Lk_sr(%rsi,%rcx), %xmm0
+ EXIT_SYSV_FUNC
+ ret
+ CFI_ENDPROC();
+ELF(.size _aes_encrypt_core,.-_aes_encrypt_core)
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+.align 16
+.globl _gcry_aes_ssse3_decrypt_core
+ELF(.type _gcry_aes_ssse3_decrypt_core,@function)
+_gcry_aes_ssse3_decrypt_core:
+_aes_decrypt_core:
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+ mov %rdi, %rdx
+ lea .Laes_consts(%rip), %rcx
+ subl $1, %esi
+ movl %esi, %eax
+ shll $4, %esi
+ xorl $48, %esi
+ andl $48, %esi
+ movdqa .Lk_dipt (%rcx), %xmm2 # iptlo
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld $4, %xmm1
+ pand %xmm9, %xmm0
+ pshufb %xmm0, %xmm2
+ movdqa .Lk_dipt+16(%rcx), %xmm0 # ipthi
+ pshufb %xmm1, %xmm0
+ pxor (%rdx), %xmm2
+ pxor %xmm2, %xmm0
+ movdqa .Lk_mc_forward+48(%rcx), %xmm5
+ lea 16(%rdx), %rdx
+ neg %rax
+ jmp .Laes_dec_entry
+
+.align 16
+.Laes_dec_loop:
+##
+## Inverse mix columns
+##
+ movdqa %xmm13, %xmm4 # 4 : sb9u
+ pshufb %xmm2, %xmm4 # 4 = sb9u
+ pxor (%rdx), %xmm4
+ movdqa %xmm12, %xmm0 # 0 : sb9t
+ pshufb %xmm3, %xmm0 # 0 = sb9t
+ movdqa .Lk_dsbd+16(%rcx),%xmm1 # 1 : sbdt
+ pxor %xmm4, %xmm0 # 0 = ch
+ lea 16(%rdx), %rdx # next round key
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa %xmm15, %xmm4 # 4 : sbdu
+ pshufb %xmm2, %xmm4 # 4 = sbdu
+ pxor %xmm0, %xmm4 # 4 = ch
+ pshufb %xmm3, %xmm1 # 1 = sbdt
+ pxor %xmm4, %xmm1 # 1 = ch
+
+ pshufb %xmm5, %xmm1 # MC ch
+ movdqa %xmm14, %xmm4 # 4 : sbbu
+ pshufb %xmm2, %xmm4 # 4 = sbbu
+ inc %rax # nr--
+ pxor %xmm1, %xmm4 # 4 = ch
+ movdqa .Lk_dsbb+16(%rcx),%xmm0 # 0 : sbbt
+ pshufb %xmm3, %xmm0 # 0 = sbbt
+ pxor %xmm4, %xmm0 # 0 = ch
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa %xmm8, %xmm4 # 4 : sbeu
+ pshufb %xmm2, %xmm4 # 4 = sbeu
+ pshufd $0x93, %xmm5, %xmm5
+ pxor %xmm0, %xmm4 # 4 = ch
+ movdqa .Lk_dsbe+16(%rcx),%xmm0 # 0 : sbet
+ pshufb %xmm3, %xmm0 # 0 = sbet
+ pxor %xmm4, %xmm0 # 0 = ch
+
+.Laes_dec_entry:
+ # top of round
+ movdqa %xmm9, %xmm1 # 1 : i
+ pandn %xmm0, %xmm1 # 1 = i<<4
+ psrld $4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ jnz .Laes_dec_loop
+
+ # middle of last round
+ movdqa .Lk_dsbo(%rcx), %xmm4 # 3 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ pxor (%rdx), %xmm4 # 4 = sb1u + k
+ movdqa .Lk_dsbo+16(%rcx), %xmm0 # 0 : sbot
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ pshufb .Lk_sr(%rsi,%rcx), %xmm0
+ EXIT_SYSV_FUNC
+ ret
+ CFI_ENDPROC();
+ELF(.size _aes_decrypt_core,.-_aes_decrypt_core)
+
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+
+.align 16
+.globl _gcry_aes_ssse3_schedule_core
+ELF(.type _gcry_aes_ssse3_schedule_core,@function)
+_gcry_aes_ssse3_schedule_core:
+_aes_schedule_core:
+ # rdi = key
+ # rsi = size in bits
+ # rdx = buffer
+ # rcx = direction. 0=encrypt, 1=decrypt
+ # r8 = rotoffs
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_5
+
+ # load the tables
+ lea .Laes_consts(%rip), %r10
+ movdqa (%r10), %xmm9 # 0F
+ movdqa .Lk_inv (%r10), %xmm10 # inv
+ movdqa .Lk_inv+16(%r10), %xmm11 # inva
+ movdqa .Lk_sb1 (%r10), %xmm13 # sb1u
+ movdqa .Lk_sb1+16(%r10), %xmm12 # sb1t
+ movdqa .Lk_sb2 (%r10), %xmm15 # sb2u
+ movdqa .Lk_sb2+16(%r10), %xmm14 # sb2t
+
+ movdqa .Lk_rcon(%r10), %xmm8 # load rcon
+ movdqu (%rdi), %xmm0 # load key (unaligned)
+
+ # input transform
+ movdqu %xmm0, %xmm3
+ lea .Lk_ipt(%r10), %r11
+ call .Laes_schedule_transform
+ movdqu %xmm0, %xmm7
+
+ test %rcx, %rcx
+ jnz .Laes_schedule_am_decrypting
+
+ # encrypting, output zeroth round key after transform
+ movdqa %xmm0, (%rdx)
+ jmp .Laes_schedule_go
+
+.Laes_schedule_am_decrypting:
+ # decrypting, output zeroth round key after shiftrows
+ pshufb .Lk_sr(%r8,%r10),%xmm3
+ movdqa %xmm3, (%rdx)
+ xor $48, %r8
+
+.Laes_schedule_go:
+ cmp $192, %rsi
+ je .Laes_schedule_192
+ cmp $256, %rsi
+ je .Laes_schedule_256
+	# 128: fall through
+
+##
+## .Laes_schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+.Laes_schedule_128:
+ mov $10, %rsi
+
+.Laes_schedule_128_L:
+ call .Laes_schedule_round
+ dec %rsi
+ jz .Laes_schedule_mangle_last
+ call .Laes_schedule_mangle # write output
+ jmp .Laes_schedule_128_L
+
+##
+## .Laes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.Laes_schedule_192:
+ movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ call .Laes_schedule_transform # input transform
+ pshufd $0x0E, %xmm0, %xmm6
+ pslldq $8, %xmm6 # clobber low side with zeros
+ mov $4, %rsi
+
+.Laes_schedule_192_L:
+ call .Laes_schedule_round
+ palignr $8,%xmm6,%xmm0
+ call .Laes_schedule_mangle # save key n
+ call .Laes_schedule_192_smear
+ call .Laes_schedule_mangle # save key n+1
+ call .Laes_schedule_round
+ dec %rsi
+ jz .Laes_schedule_mangle_last
+ call .Laes_schedule_mangle # save key n+2
+ call .Laes_schedule_192_smear
+ jmp .Laes_schedule_192_L
+
+##
+## .Laes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+## %xmm13: 0
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+.Laes_schedule_192_smear:
+ pshufd $0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0
+ pxor %xmm0, %xmm6 # -> c+d c 0 0
+ pshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ pxor %xmm6, %xmm0 # -> b+c+d b+c b a
+ pshufd $0x0E, %xmm0, %xmm6
+ pslldq $8, %xmm6 # clobber low side with zeros
+ ret
+
+##
+## .Laes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional 'low side' in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
+.Laes_schedule_256:
+ movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ call .Laes_schedule_transform # input transform
+ mov $7, %rsi
+
+.Laes_schedule_256_L:
+ call .Laes_schedule_mangle # output low result
+ movdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ # high round
+ call .Laes_schedule_round
+ dec %rsi
+ jz .Laes_schedule_mangle_last
+ call .Laes_schedule_mangle
+
+ # low round. swap xmm7 and xmm6
+ pshufd $0xFF, %xmm0, %xmm0
+ movdqa %xmm7, %xmm5
+ movdqa %xmm6, %xmm7
+ call .Laes_schedule_low_round
+ movdqa %xmm5, %xmm7
+
+ jmp .Laes_schedule_256_L
+
+##
+## .Laes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm4, %r11.
+##
+.Laes_schedule_round:
+ # extract rcon from xmm8
+ pxor %xmm1, %xmm1
+ palignr $15, %xmm8, %xmm1
+ palignr $15, %xmm8, %xmm8
+ pxor %xmm1, %xmm7
+
+ # rotate
+ pshufd $0xFF, %xmm0, %xmm0
+ palignr $1, %xmm0, %xmm0
+
+ # fall through...
+
+ # low round: same as high round, but no rotation and no rcon.
+.Laes_schedule_low_round:
+ # smear xmm7
+ movdqa %xmm7, %xmm1
+ pslldq $4, %xmm7
+ pxor %xmm1, %xmm7
+ movdqa %xmm7, %xmm1
+ pslldq $8, %xmm7
+ pxor %xmm1, %xmm7
+ pxor .Lk_s63(%r10), %xmm7
+
+ # subbytes
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld $4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ movdqa .Lk_sb1(%r10), %xmm4 # 4 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ movdqa .Lk_sb1+16(%r10), %xmm0 # 0 : sbot
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = sbox output
+
+ # add in smeared stuff
+ pxor %xmm7, %xmm0
+ movdqa %xmm0, %xmm7
+ ret
+
+##
+## .Laes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%r11)
+##
+## Requires that %xmm9 = 0x0F0F... as in preheat
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
+.Laes_schedule_transform:
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld $4, %xmm1
+ pand %xmm9, %xmm0
+ movdqa (%r11), %xmm2 # lo
+ pshufb %xmm0, %xmm2
+ movdqa 16(%r11), %xmm0 # hi
+ pshufb %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ ret
+
+##
+## .Laes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by 'inverse mixcolumns' circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%rdx), and increments or decrements it
+## Keeps track of round number mod 4 in %r8
+## Preserves xmm0
+## Clobbers xmm1-xmm5
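+##
+## (The 'multiply by 0,1,1,1' on the encrypt path is realized by xoring
+## together three successive rotations of each column, i.e. the repeated
+## pshufb with mc_forward below.)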
+##
+.Laes_schedule_mangle:
+ movdqa %xmm0, %xmm4 # save xmm0 for later
+ movdqa .Lk_mc_forward(%r10),%xmm5
+ test %rcx, %rcx
+ jnz .Laes_schedule_mangle_dec
+
+ # encrypting
+ add $16, %rdx
+ pxor .Lk_s63(%r10),%xmm4
+ pshufb %xmm5, %xmm4
+ movdqa %xmm4, %xmm3
+ pshufb %xmm5, %xmm4
+ pxor %xmm4, %xmm3
+ pshufb %xmm5, %xmm4
+ pxor %xmm4, %xmm3
+
+ jmp .Laes_schedule_mangle_both
+
+.Laes_schedule_mangle_dec:
+ lea .Lk_dks_1(%r10), %r11 # first table: *9
+ call .Laes_schedule_transform
+ movdqa %xmm0, %xmm3
+ pshufb %xmm5, %xmm3
+
+ add $32, %r11 # next table: *B
+ call .Laes_schedule_transform
+ pxor %xmm0, %xmm3
+ pshufb %xmm5, %xmm3
+
+ add $32, %r11 # next table: *D
+ call .Laes_schedule_transform
+ pxor %xmm0, %xmm3
+ pshufb %xmm5, %xmm3
+
+ add $32, %r11 # next table: *E
+ call .Laes_schedule_transform
+ pxor %xmm0, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa %xmm4, %xmm0 # restore %xmm0
+ add $-16, %rdx
+
+.Laes_schedule_mangle_both:
+ pshufb .Lk_sr(%r8,%r10),%xmm3
+ add $-16, %r8
+ and $48, %r8
+ movdqa %xmm3, (%rdx)
+ ret
+
+##
+## .Laes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.Laes_schedule_mangle_last:
+ # schedule last round key from xmm0
+ lea .Lk_deskew(%r10),%r11 # prepare to deskew
+ test %rcx, %rcx
+ jnz .Laes_schedule_mangle_last_dec
+
+ # encrypting
+ pshufb .Lk_sr(%r8,%r10),%xmm0 # output permute
+ lea .Lk_opt(%r10), %r11 # prepare to output transform
+ add $32, %rdx
+
+.Laes_schedule_mangle_last_dec:
+ add $-16, %rdx
+ pxor .Lk_s63(%r10), %xmm0
+ call .Laes_schedule_transform # output transform
+ movdqa %xmm0, (%rdx) # save last key
+
+ #_aes_cleanup
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ pxor %xmm6, %xmm6
+ pxor %xmm7, %xmm7
+ pxor %xmm8, %xmm8
+ EXIT_SYSV_FUNC
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_aes_ssse3_schedule_core,.-_gcry_aes_ssse3_schedule_core)
+
+########################################################
+## ##
+## Constants ##
+## ##
+########################################################
+
+.align 16
+ELF(.type _aes_consts,@object)
+.Laes_consts:
+_aes_consts:
+ # s0F
+ .Lk_s0F = .-.Laes_consts
+ .quad 0x0F0F0F0F0F0F0F0F
+ .quad 0x0F0F0F0F0F0F0F0F
+
+ # input transform (lo, hi)
+ .Lk_ipt = .-.Laes_consts
+ .quad 0xC2B2E8985A2A7000
+ .quad 0xCABAE09052227808
+ .quad 0x4C01307D317C4D00
+ .quad 0xCD80B1FCB0FDCC81
+
+ # inv, inva
+ .Lk_inv = .-.Laes_consts
+ .quad 0x0E05060F0D080180
+ .quad 0x040703090A0B0C02
+ .quad 0x01040A060F0B0780
+ .quad 0x030D0E0C02050809
+
+ # sb1u, sb1t
+ .Lk_sb1 = .-.Laes_consts
+ .quad 0xB19BE18FCB503E00
+ .quad 0xA5DF7A6E142AF544
+ .quad 0x3618D415FAE22300
+ .quad 0x3BF7CCC10D2ED9EF
+
+
+ # sb2u, sb2t
+ .Lk_sb2 = .-.Laes_consts
+ .quad 0xE27A93C60B712400
+ .quad 0x5EB7E955BC982FCD
+ .quad 0x69EB88400AE12900
+ .quad 0xC2A163C8AB82234A
+
+ # sbou, sbot
+ .Lk_sbo = .-.Laes_consts
+ .quad 0xD0D26D176FBDC700
+ .quad 0x15AABF7AC502A878
+ .quad 0xCFE474A55FBB6A00
+ .quad 0x8E1E90D1412B35FA
+
+ # mc_forward
+ .Lk_mc_forward = .-.Laes_consts
+ .quad 0x0407060500030201
+ .quad 0x0C0F0E0D080B0A09
+ .quad 0x080B0A0904070605
+ .quad 0x000302010C0F0E0D
+ .quad 0x0C0F0E0D080B0A09
+ .quad 0x0407060500030201
+ .quad 0x000302010C0F0E0D
+ .quad 0x080B0A0904070605
+
+ # mc_backward
+ .Lk_mc_backward = .-.Laes_consts
+ .quad 0x0605040702010003
+ .quad 0x0E0D0C0F0A09080B
+ .quad 0x020100030E0D0C0F
+ .quad 0x0A09080B06050407
+ .quad 0x0E0D0C0F0A09080B
+ .quad 0x0605040702010003
+ .quad 0x0A09080B06050407
+ .quad 0x020100030E0D0C0F
+
+ # sr
+ .Lk_sr = .-.Laes_consts
+ .quad 0x0706050403020100
+ .quad 0x0F0E0D0C0B0A0908
+ .quad 0x030E09040F0A0500
+ .quad 0x0B06010C07020D08
+ .quad 0x0F060D040B020900
+ .quad 0x070E050C030A0108
+ .quad 0x0B0E0104070A0D00
+ .quad 0x0306090C0F020508
+
+ # rcon
+ .Lk_rcon = .-.Laes_consts
+ .quad 0x1F8391B9AF9DEEB6
+ .quad 0x702A98084D7C7D81
+
+ # s63: all equal to 0x63 transformed
+ .Lk_s63 = .-.Laes_consts
+ .quad 0x5B5B5B5B5B5B5B5B
+ .quad 0x5B5B5B5B5B5B5B5B
+
+ # output transform
+ .Lk_opt = .-.Laes_consts
+ .quad 0xFF9F4929D6B66000
+ .quad 0xF7974121DEBE6808
+ .quad 0x01EDBD5150BCEC00
+ .quad 0xE10D5DB1B05C0CE0
+
+ # deskew tables: inverts the sbox's 'skew'
+ .Lk_deskew = .-.Laes_consts
+ .quad 0x07E4A34047A4E300
+ .quad 0x1DFEB95A5DBEF91A
+ .quad 0x5F36B5DC83EA6900
+ .quad 0x2841C2ABF49D1E77
+
+##
+## Decryption stuff
+## Key schedule constants
+##
+ # decryption key schedule: x -> invskew x*9
+ .Lk_dks_1 = .-.Laes_consts
+ .quad 0xB6116FC87ED9A700
+ .quad 0x4AED933482255BFC
+ .quad 0x4576516227143300
+ .quad 0x8BB89FACE9DAFDCE
+
+ # decryption key schedule: invskew x*9 -> invskew x*D
+ .Lk_dks_2 = .-.Laes_consts
+ .quad 0x27438FEBCCA86400
+ .quad 0x4622EE8AADC90561
+ .quad 0x815C13CE4F92DD00
+ .quad 0x73AEE13CBD602FF2
+
+ # decryption key schedule: invskew x*D -> invskew x*B
+ .Lk_dks_3 = .-.Laes_consts
+ .quad 0x03C4C50201C6C700
+ .quad 0xF83F3EF9FA3D3CFB
+ .quad 0xEE1921D638CFF700
+ .quad 0xA5526A9D7384BC4B
+
+ # decryption key schedule: invskew x*B -> invskew x*E + 0x63
+ .Lk_dks_4 = .-.Laes_consts
+ .quad 0xE3C390B053732000
+ .quad 0xA080D3F310306343
+ .quad 0xA0CA214B036982E8
+ .quad 0x2F45AEC48CE60D67
+
+##
+## Decryption stuff
+## Round function constants
+##
+ # decryption input transform
+ .Lk_dipt = .-.Laes_consts
+ .quad 0x0F505B040B545F00
+ .quad 0x154A411E114E451A
+ .quad 0x86E383E660056500
+ .quad 0x12771772F491F194
+
+ # decryption sbox output *9*u, *9*t
+ .Lk_dsb9 = .-.Laes_consts
+ .quad 0x851C03539A86D600
+ .quad 0xCAD51F504F994CC9
+ .quad 0xC03B1789ECD74900
+ .quad 0x725E2C9EB2FBA565
+
+ # decryption sbox output *D*u, *D*t
+ .Lk_dsbd = .-.Laes_consts
+ .quad 0x7D57CCDFE6B1A200
+ .quad 0xF56E9B13882A4439
+ .quad 0x3CE2FAF724C6CB00
+ .quad 0x2931180D15DEEFD3
+
+ # decryption sbox output *B*u, *B*t
+ .Lk_dsbb = .-.Laes_consts
+ .quad 0xD022649296B44200
+ .quad 0x602646F6B0F2D404
+ .quad 0xC19498A6CD596700
+ .quad 0xF3FF0C3E3255AA6B
+
+ # decryption sbox output *E*u, *E*t
+ .Lk_dsbe = .-.Laes_consts
+ .quad 0x46F2929626D4D000
+ .quad 0x2242600464B4F6B0
+ .quad 0x0C55A6CDFFAAC100
+ .quad 0x9467F36B98593E32
+
+ # decryption sbox final output
+ .Lk_dsbo = .-.Laes_consts
+ .quad 0x1387EA537EF94000
+ .quad 0xC7AA6DB9D4943E2D
+ .quad 0x12D7560F93441D00
+ .quad 0xCA4B8159D8C58E9C
+ELF(.size _aes_consts,.-_aes_consts)
+
+#endif
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-ssse3-amd64.c b/comm/third_party/libgcrypt/cipher/rijndael-ssse3-amd64.c
new file mode 100644
index 0000000000..b07238531c
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael-ssse3-amd64.c
@@ -0,0 +1,743 @@
+/* SSSE3 vector permutation AES for Libgcrypt
+ * Copyright (C) 2014-2017 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * The code is based on the public domain library libvpaes version 0.5
+ * available at http://crypto.stanford.edu/vpaes/ and which carries
+ * this notice:
+ *
+ * libvpaes: constant-time SSSE3 AES encryption and decryption.
+ * version 0.5
+ *
+ * By Mike Hamburg, Stanford University, 2009. Public domain.
+ * I wrote essentially all of this code. I did not write the test
+ * vectors; they are the NIST known answer tests. I hereby release all
+ * the code and documentation here that I wrote into the public domain.
+ *
+ * This is an implementation of AES following my paper,
+ * "Accelerating AES with Vector Permute Instructions"
+ * CHES 2009; http://shiftleft.org/papers/vector_aes/
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* for memcmp() */
+
+#include "types.h" /* for byte and u32 typedefs */
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-selftest.h"
+#include "rijndael-internal.h"
+#include "./cipher-internal.h"
+
+
+#ifdef USE_SSSE3
+
+
+#if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
+/* Prevent compiler from issuing SSE instructions between asm blocks. */
+# pragma GCC target("no-sse")
+#endif
+#if __clang__
+# pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
+#endif
+
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE
+
+
+/* Copy of ocb_get_l needed here as GCC is unable to inline ocb_get_l
+ because of 'pragma target'. */
+static ASM_FUNC_ATTR_INLINE const unsigned char *
+aes_ocb_get_l (gcry_cipher_hd_t c, u64 n)
+{
+ unsigned long ntz;
+
+ /* Assumes that N != 0. */
+ asm ("rep;bsfl %k[low], %k[ntz]\n\t"
+ : [ntz] "=r" (ntz)
+ : [low] "r" ((unsigned long)n)
+ : "cc");
+
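+  /* bsf computed the number of trailing zero bits of n, so this returns
+     L_{ntz(i)} as used by OCB.  */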
+ return c->u_mode.ocb.L[ntz];
+}
+
+
+/* Assembly functions in rijndael-ssse3-amd64-asm.S.  Note that these
+   have a custom calling convention (additional XMM parameters).  */
+extern void _gcry_aes_ssse3_enc_preload(void);
+extern void _gcry_aes_ssse3_dec_preload(void);
+extern void _gcry_aes_ssse3_schedule_core(const void *key, u64 keybits,
+ void *buffer, u64 decrypt,
+ u64 rotoffs);
+extern void _gcry_aes_ssse3_encrypt_core(const void *key, u64 nrounds);
+extern void _gcry_aes_ssse3_decrypt_core(const void *key, u64 nrounds);
+
+
+
+/* Two macros to be called before and after the use of SSSE3
+   instructions.  There should be no external function calls between
+   the use of these macros.  Their purpose is to make sure that the
+   SSE registers are cleared and won't reveal any information about
+   the key or the data.  */
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define SSSE3_STATE_SIZE (16 * 10)
+/* XMM6-XMM15 are callee-saved registers on WIN64. */
+# define vpaes_ssse3_prepare() \
+ asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t" \
+ "movdqu %%xmm7, 1*16(%0)\n\t" \
+ "movdqu %%xmm8, 2*16(%0)\n\t" \
+ "movdqu %%xmm9, 3*16(%0)\n\t" \
+ "movdqu %%xmm10, 4*16(%0)\n\t" \
+ "movdqu %%xmm11, 5*16(%0)\n\t" \
+ "movdqu %%xmm12, 6*16(%0)\n\t" \
+ "movdqu %%xmm13, 7*16(%0)\n\t" \
+ "movdqu %%xmm14, 8*16(%0)\n\t" \
+ "movdqu %%xmm15, 9*16(%0)\n\t" \
+ : \
+ : "r" (ssse3_state) \
+ : "memory" )
+# define vpaes_ssse3_cleanup() \
+ asm volatile ("pxor %%xmm0, %%xmm0 \n\t" \
+ "pxor %%xmm1, %%xmm1 \n\t" \
+ "pxor %%xmm2, %%xmm2 \n\t" \
+ "pxor %%xmm3, %%xmm3 \n\t" \
+ "pxor %%xmm4, %%xmm4 \n\t" \
+ "pxor %%xmm5, %%xmm5 \n\t" \
+ "movdqu 0*16(%0), %%xmm6 \n\t" \
+ "movdqu 1*16(%0), %%xmm7 \n\t" \
+ "movdqu 2*16(%0), %%xmm8 \n\t" \
+ "movdqu 3*16(%0), %%xmm9 \n\t" \
+ "movdqu 4*16(%0), %%xmm10 \n\t" \
+ "movdqu 5*16(%0), %%xmm11 \n\t" \
+ "movdqu 6*16(%0), %%xmm12 \n\t" \
+ "movdqu 7*16(%0), %%xmm13 \n\t" \
+ "movdqu 8*16(%0), %%xmm14 \n\t" \
+ "movdqu 9*16(%0), %%xmm15 \n\t" \
+ : \
+ : "r" (ssse3_state) \
+ : "memory" )
+#else
+# define SSSE3_STATE_SIZE 1
+# define vpaes_ssse3_prepare() (void)ssse3_state
+# define vpaes_ssse3_cleanup() \
+ asm volatile ("pxor %%xmm0, %%xmm0 \n\t" \
+ "pxor %%xmm1, %%xmm1 \n\t" \
+ "pxor %%xmm2, %%xmm2 \n\t" \
+ "pxor %%xmm3, %%xmm3 \n\t" \
+ "pxor %%xmm4, %%xmm4 \n\t" \
+ "pxor %%xmm5, %%xmm5 \n\t" \
+ "pxor %%xmm6, %%xmm6 \n\t" \
+ "pxor %%xmm7, %%xmm7 \n\t" \
+ "pxor %%xmm8, %%xmm8 \n\t" \
+ ::: "memory" )
+#endif
+
+#define vpaes_ssse3_prepare_enc() \
+ vpaes_ssse3_prepare(); \
+ _gcry_aes_ssse3_enc_preload();
+
+#define vpaes_ssse3_prepare_dec() \
+ vpaes_ssse3_prepare(); \
+ _gcry_aes_ssse3_dec_preload();
+
+
+void ASM_FUNC_ATTR
+_gcry_aes_ssse3_do_setkey (RIJNDAEL_context *ctx, const byte *key)
+{
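+  /* ctx->rounds is 10, 12 or 14, so this recovers 128, 192 or 256 key bits.  */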
+ unsigned int keybits = (ctx->rounds - 10) * 32 + 128;
+ byte ssse3_state[SSSE3_STATE_SIZE];
+
+ vpaes_ssse3_prepare();
+
+ _gcry_aes_ssse3_schedule_core(key, keybits, &ctx->keyschenc32[0][0], 0, 48);
+
+ /* Save key for setting up decryption. */
+ if (keybits > 192)
+ asm volatile ("movdqu (%[src]), %%xmm0\n\t"
+ "movdqu 16(%[src]), %%xmm1\n\t"
+ "movdqu %%xmm0, (%[dst])\n\t"
+ "movdqu %%xmm1, 16(%[dst])\n\t"
+ : /* No output */
+ : [dst] "r" (&ctx->keyschdec32[0][0]), [src] "r" (key)
+ : "memory" );
+ else if (keybits == 192)
+ asm volatile ("movdqu (%[src]), %%xmm0\n\t"
+ "movq 16(%[src]), %%xmm1\n\t"
+ "movdqu %%xmm0, (%[dst])\n\t"
+ "movq %%xmm1, 16(%[dst])\n\t"
+ : /* No output */
+ : [dst] "r" (&ctx->keyschdec32[0][0]), [src] "r" (key)
+ : "memory" );
+ else
+ asm volatile ("movdqu (%[src]), %%xmm0\n\t"
+ "movdqu %%xmm0, (%[dst])\n\t"
+ : /* No output */
+ : [dst] "r" (&ctx->keyschdec32[0][0]), [src] "r" (key)
+ : "memory" );
+
+ vpaes_ssse3_cleanup();
+}
+
+
+/* Make a decryption key from an encryption key. */
+static ASM_FUNC_ATTR_INLINE void
+do_ssse3_prepare_decryption (RIJNDAEL_context *ctx,
+ byte ssse3_state[SSSE3_STATE_SIZE])
+{
+ unsigned int keybits = (ctx->rounds - 10) * 32 + 128;
+
+ vpaes_ssse3_prepare();
+
+ _gcry_aes_ssse3_schedule_core(&ctx->keyschdec32[0][0], keybits,
+ &ctx->keyschdec32[ctx->rounds][0], 1,
+ (keybits == 192) ? 0 : 32);
+
+ vpaes_ssse3_cleanup();
+}
+
+void ASM_FUNC_ATTR
+_gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx)
+{
+ byte ssse3_state[SSSE3_STATE_SIZE];
+
+ do_ssse3_prepare_decryption(ctx, ssse3_state);
+}
+
+
+/* Encrypt one block using the Intel SSSE3 instructions. Block is input
+* and output through SSE register xmm0. */
+static ASM_FUNC_ATTR_INLINE void
+do_vpaes_ssse3_enc (const RIJNDAEL_context *ctx, unsigned int nrounds)
+{
+ _gcry_aes_ssse3_encrypt_core(ctx->keyschenc32, nrounds);
+}
+
+
+/* Decrypt one block using the Intel SSSE3 instructions. Block is input
+* and output through SSE register xmm0. */
+static ASM_FUNC_ATTR_INLINE void
+do_vpaes_ssse3_dec (const RIJNDAEL_context *ctx, unsigned int nrounds)
+{
+ _gcry_aes_ssse3_decrypt_core(ctx->keyschdec32, nrounds);
+}
+
+
+unsigned int ASM_FUNC_ATTR
+_gcry_aes_ssse3_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
+ const unsigned char *src)
+{
+ unsigned int nrounds = ctx->rounds;
+ byte ssse3_state[SSSE3_STATE_SIZE];
+
+ vpaes_ssse3_prepare_enc ();
+ asm volatile ("movdqu %[src], %%xmm0\n\t"
+ :
+ : [src] "m" (*src)
+ : "memory" );
+ do_vpaes_ssse3_enc (ctx, nrounds);
+ asm volatile ("movdqu %%xmm0, %[dst]\n\t"
+ : [dst] "=m" (*dst)
+ :
+ : "memory" );
+ vpaes_ssse3_cleanup ();
+ return 0;
+}
+
+
+void ASM_FUNC_ATTR
+_gcry_aes_ssse3_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ unsigned int nrounds = ctx->rounds;
+ byte ssse3_state[SSSE3_STATE_SIZE];
+
+ vpaes_ssse3_prepare_enc ();
+
+ asm volatile ("movdqu %[iv], %%xmm0\n\t"
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory" );
+
+ for ( ;nblocks; nblocks-- )
+ {
+ do_vpaes_ssse3_enc (ctx, nrounds);
+
+ asm volatile ("movdqu %[inbuf], %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ : [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ asm volatile ("movdqu %%xmm0, %[iv]\n\t"
+ : [iv] "=m" (*iv)
+ :
+ : "memory" );
+
+ vpaes_ssse3_cleanup ();
+}
+
+
+void ASM_FUNC_ATTR
+_gcry_aes_ssse3_cbc_enc (RIJNDAEL_context *ctx, unsigned char *iv,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks, int cbc_mac)
+{
+ unsigned int nrounds = ctx->rounds;
+ byte ssse3_state[SSSE3_STATE_SIZE];
+
+ vpaes_ssse3_prepare_enc ();
+
+ asm volatile ("movdqu %[iv], %%xmm7\n\t"
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory" );
+
+ for ( ;nblocks; nblocks-- )
+ {
+ asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm7, %%xmm0\n\t"
+ : /* No output */
+ : [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_vpaes_ssse3_enc (ctx, nrounds);
+
+ asm volatile ("movdqa %%xmm0, %%xmm7\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ inbuf += BLOCKSIZE;
+ if (!cbc_mac)
+ outbuf += BLOCKSIZE;
+ }
+
+ asm volatile ("movdqu %%xmm7, %[iv]\n\t"
+ : [iv] "=m" (*iv)
+ :
+ : "memory" );
+
+ vpaes_ssse3_cleanup ();
+}
+
+
+void ASM_FUNC_ATTR
+_gcry_aes_ssse3_ctr_enc (RIJNDAEL_context *ctx, unsigned char *ctr,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
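+  /* pshufb with this mask byte-reverses the counter block, so the
+     big-endian counter can be incremented with 64-bit integer ops.  */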
+ unsigned int nrounds = ctx->rounds;
+ byte ssse3_state[SSSE3_STATE_SIZE];
+ u64 ctrlow;
+
+ vpaes_ssse3_prepare_enc ();
+
+ asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */
+ "movdqa (%[ctr]), %%xmm7\n\t" /* Preload CTR */
+ "movq 8(%[ctr]), %q[ctrlow]\n\t"
+ "bswapq %q[ctrlow]\n\t"
+ : [ctrlow] "=r" (ctrlow)
+ : [mask] "m" (*be_mask),
+ [ctr] "r" (ctr)
+ : "memory", "cc");
+
+ for ( ;nblocks; nblocks-- )
+ {
+ asm volatile ("movdqa %%xmm7, %%xmm0\n\t" /* xmm0 := CTR (xmm7) */
+ "pcmpeqd %%xmm1, %%xmm1\n\t"
+ "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */
+
+ "pshufb %%xmm6, %%xmm7\n\t"
+ "psubq %%xmm1, %%xmm7\n\t" /* xmm7++ (big endian) */
+
+ /* detect if 64-bit carry handling is needed */
+ "incq %q[ctrlow]\n\t"
+ "jnz .Lno_carry%=\n\t"
+
+ "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */
+ "psubq %%xmm1, %%xmm7\n\t" /* add carry to upper 64bits */
+
+ ".Lno_carry%=:\n\t"
+
+ "pshufb %%xmm6, %%xmm7\n\t"
+ : [ctrlow] "+r" (ctrlow)
+ :
+ : "cc", "memory");
+
+ do_vpaes_ssse3_enc (ctx, nrounds);
+
+ asm volatile ("movdqu %[src], %%xmm1\n\t" /* xmm1 := input */
+ "pxor %%xmm1, %%xmm0\n\t" /* EncCTR ^= input */
+ "movdqu %%xmm0, %[dst]" /* Store EncCTR. */
+ : [dst] "=m" (*outbuf)
+ : [src] "m" (*inbuf)
+ : "memory");
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ asm volatile ("movdqu %%xmm7, %[ctr]\n\t" /* Update CTR (mem). */
+ : [ctr] "=m" (*ctr)
+ :
+ : "memory" );
+
+ vpaes_ssse3_cleanup ();
+}
+
+
+unsigned int ASM_FUNC_ATTR
+_gcry_aes_ssse3_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
+ const unsigned char *src)
+{
+ unsigned int nrounds = ctx->rounds;
+ byte ssse3_state[SSSE3_STATE_SIZE];
+
+ vpaes_ssse3_prepare_dec ();
+ asm volatile ("movdqu %[src], %%xmm0\n\t"
+ :
+ : [src] "m" (*src)
+ : "memory" );
+ do_vpaes_ssse3_dec (ctx, nrounds);
+ asm volatile ("movdqu %%xmm0, %[dst]\n\t"
+ : [dst] "=m" (*dst)
+ :
+ : "memory" );
+ vpaes_ssse3_cleanup ();
+ return 0;
+}
+
+
+void ASM_FUNC_ATTR
+_gcry_aes_ssse3_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ unsigned int nrounds = ctx->rounds;
+ byte ssse3_state[SSSE3_STATE_SIZE];
+
+ vpaes_ssse3_prepare_enc ();
+
+ asm volatile ("movdqu %[iv], %%xmm0\n\t"
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory" );
+
+ for ( ;nblocks; nblocks-- )
+ {
+ do_vpaes_ssse3_enc (ctx, nrounds);
+
+ asm volatile ("movdqa %%xmm0, %%xmm6\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm6\n\t"
+ "movdqu %%xmm6, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ : [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ asm volatile ("movdqu %%xmm0, %[iv]\n\t"
+ : [iv] "=m" (*iv)
+ :
+ : "memory" );
+
+ vpaes_ssse3_cleanup ();
+}
+
+
+void ASM_FUNC_ATTR
+_gcry_aes_ssse3_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ unsigned int nrounds = ctx->rounds;
+ byte ssse3_state[SSSE3_STATE_SIZE];
+
+ if ( !ctx->decryption_prepared )
+ {
+ do_ssse3_prepare_decryption ( ctx, ssse3_state );
+ ctx->decryption_prepared = 1;
+ }
+
+ vpaes_ssse3_prepare_dec ();
+
+ asm volatile ("movdqu %[iv], %%xmm7\n\t" /* use xmm7 as fast IV storage */
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory");
+
+ for ( ;nblocks; nblocks-- )
+ {
+ asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
+ "movdqa %%xmm0, %%xmm6\n\t" /* use xmm6 as savebuf */
+ : /* No output */
+ : [inbuf] "m" (*inbuf)
+ : "memory");
+
+ do_vpaes_ssse3_dec (ctx, nrounds);
+
+ asm volatile ("pxor %%xmm7, %%xmm0\n\t" /* xor IV with output */
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ "movdqu %%xmm6, %%xmm7\n\t" /* store savebuf as new IV */
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory");
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ asm volatile ("movdqu %%xmm7, %[iv]\n\t" /* store IV */
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory");
+
+ vpaes_ssse3_cleanup ();
+}
+
+
+static void ASM_FUNC_ATTR
+ssse3_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ u64 n = c->u_mode.ocb.data_nblocks;
+ unsigned int nrounds = ctx->rounds;
+ byte ssse3_state[SSSE3_STATE_SIZE];
+
+ vpaes_ssse3_prepare_enc ();
+
+ /* Preload Offset and Checksum */
+ asm volatile ("movdqu %[iv], %%xmm7\n\t"
+ "movdqu %[ctr], %%xmm6\n\t"
+ : /* No output */
+ : [iv] "m" (*c->u_iv.iv),
+ [ctr] "m" (*c->u_ctr.ctr)
+ : "memory" );
+
+ for ( ;nblocks; nblocks-- )
+ {
+ const unsigned char *l;
+
+ l = aes_ocb_get_l(c, ++n);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ asm volatile ("movdqu %[l], %%xmm1\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm7\n\t"
+ "pxor %%xmm0, %%xmm6\n\t"
+ "pxor %%xmm7, %%xmm0\n\t"
+ :
+ : [l] "m" (*l),
+ [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_vpaes_ssse3_enc (ctx, nrounds);
+
+ asm volatile ("pxor %%xmm7, %%xmm0\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ c->u_mode.ocb.data_nblocks = n;
+ asm volatile ("movdqu %%xmm7, %[iv]\n\t"
+ "movdqu %%xmm6, %[ctr]\n\t"
+ : [iv] "=m" (*c->u_iv.iv),
+ [ctr] "=m" (*c->u_ctr.ctr)
+ :
+ : "memory" );
+
+ vpaes_ssse3_cleanup ();
+}
+
+static void ASM_FUNC_ATTR
+ssse3_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ u64 n = c->u_mode.ocb.data_nblocks;
+ unsigned int nrounds = ctx->rounds;
+ byte ssse3_state[SSSE3_STATE_SIZE];
+
+ if ( !ctx->decryption_prepared )
+ {
+ do_ssse3_prepare_decryption ( ctx, ssse3_state );
+ ctx->decryption_prepared = 1;
+ }
+
+ vpaes_ssse3_prepare_dec ();
+
+ /* Preload Offset and Checksum */
+ asm volatile ("movdqu %[iv], %%xmm7\n\t"
+ "movdqu %[ctr], %%xmm6\n\t"
+ : /* No output */
+ : [iv] "m" (*c->u_iv.iv),
+ [ctr] "m" (*c->u_ctr.ctr)
+ : "memory" );
+
+ for ( ;nblocks; nblocks-- )
+ {
+ const unsigned char *l;
+
+ l = aes_ocb_get_l(c, ++n);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ asm volatile ("movdqu %[l], %%xmm1\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm7\n\t"
+ "pxor %%xmm7, %%xmm0\n\t"
+ :
+ : [l] "m" (*l),
+ [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_vpaes_ssse3_dec (ctx, nrounds);
+
+ asm volatile ("pxor %%xmm7, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm6\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ c->u_mode.ocb.data_nblocks = n;
+ asm volatile ("movdqu %%xmm7, %[iv]\n\t"
+ "movdqu %%xmm6, %[ctr]\n\t"
+ : [iv] "=m" (*c->u_iv.iv),
+ [ctr] "=m" (*c->u_ctr.ctr)
+ :
+ : "memory" );
+
+ vpaes_ssse3_cleanup ();
+}
+
+
+size_t ASM_FUNC_ATTR
+_gcry_aes_ssse3_ocb_crypt(gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+ if (encrypt)
+ ssse3_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks);
+ else
+ ssse3_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks);
+
+ return 0;
+}
+
+
+size_t ASM_FUNC_ATTR
+_gcry_aes_ssse3_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ const unsigned char *abuf = abuf_arg;
+ u64 n = c->u_mode.ocb.aad_nblocks;
+ unsigned int nrounds = ctx->rounds;
+ byte ssse3_state[SSSE3_STATE_SIZE];
+
+ vpaes_ssse3_prepare_enc ();
+
+ /* Preload Offset and Sum */
+ asm volatile ("movdqu %[iv], %%xmm7\n\t"
+ "movdqu %[ctr], %%xmm6\n\t"
+ : /* No output */
+ : [iv] "m" (*c->u_mode.ocb.aad_offset),
+ [ctr] "m" (*c->u_mode.ocb.aad_sum)
+ : "memory" );
+
+ for ( ;nblocks; nblocks-- )
+ {
+ const unsigned char *l;
+
+ l = aes_ocb_get_l(c, ++n);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ asm volatile ("movdqu %[l], %%xmm1\n\t"
+ "movdqu %[abuf], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm7\n\t"
+ "pxor %%xmm7, %%xmm0\n\t"
+ :
+ : [l] "m" (*l),
+ [abuf] "m" (*abuf)
+ : "memory" );
+
+ do_vpaes_ssse3_enc (ctx, nrounds);
+
+ asm volatile ("pxor %%xmm0, %%xmm6\n\t"
+ :
+ :
+ : "memory" );
+
+ abuf += BLOCKSIZE;
+ }
+
+ c->u_mode.ocb.aad_nblocks = n;
+ asm volatile ("movdqu %%xmm7, %[iv]\n\t"
+ "movdqu %%xmm6, %[ctr]\n\t"
+ : [iv] "=m" (*c->u_mode.ocb.aad_offset),
+ [ctr] "=m" (*c->u_mode.ocb.aad_sum)
+ :
+ : "memory" );
+
+ vpaes_ssse3_cleanup ();
+
+ return 0;
+}
+
+#if __clang__
+# pragma clang attribute pop
+#endif
+
+#endif /* USE_SSSE3 */
diff --git a/comm/third_party/libgcrypt/cipher/rijndael-tables.h b/comm/third_party/libgcrypt/cipher/rijndael-tables.h
new file mode 100644
index 0000000000..b54d959393
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael-tables.h
@@ -0,0 +1,227 @@
+/* rijndael-tables.h - Rijndael (AES) for GnuPG,
+ * Copyright (C) 2000, 2001, 2002, 2003, 2007,
+ * 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* To keep the actual implementation at a readable size we use this
+ include file to define the tables. */
+
+static struct
+{
+ volatile u32 counter_head;
+ u32 cacheline_align[64 / 4 - 1];
+ u32 T[256];
+ volatile u32 counter_tail;
+} enc_tables ATTR_ALIGNED_64 =
+ {
+ 0,
+ { 0, },
+ {
+ 0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6,
+ 0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591,
+ 0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56,
+ 0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec,
+ 0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa,
+ 0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb,
+ 0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45,
+ 0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b,
+ 0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c,
+ 0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83,
+ 0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9,
+ 0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a,
+ 0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d,
+ 0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f,
+ 0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df,
+ 0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea,
+ 0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34,
+ 0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b,
+ 0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d,
+ 0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413,
+ 0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1,
+ 0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6,
+ 0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972,
+ 0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85,
+ 0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed,
+ 0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511,
+ 0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe,
+ 0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b,
+ 0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05,
+ 0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1,
+ 0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142,
+ 0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf,
+ 0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3,
+ 0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e,
+ 0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a,
+ 0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6,
+ 0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3,
+ 0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b,
+ 0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428,
+ 0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad,
+ 0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14,
+ 0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8,
+ 0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4,
+ 0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2,
+ 0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda,
+ 0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949,
+ 0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf,
+ 0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810,
+ 0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c,
+ 0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697,
+ 0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e,
+ 0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f,
+ 0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc,
+ 0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c,
+ 0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969,
+ 0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27,
+ 0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122,
+ 0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433,
+ 0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9,
+ 0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5,
+ 0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a,
+ 0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0,
+ 0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e,
+ 0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c
+ },
+ 0
+ };
+
+#define encT enc_tables.T
+
+static struct
+{
+ volatile u32 counter_head;
+ u32 cacheline_align[64 / 4 - 1];
+ u32 T[256];
+ byte inv_sbox[256];
+ volatile u32 counter_tail;
+} dec_tables ATTR_ALIGNED_64 =
+ {
+ 0,
+ { 0, },
+ {
+ 0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a,
+ 0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b,
+ 0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5,
+ 0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5,
+ 0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d,
+ 0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b,
+ 0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295,
+ 0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e,
+ 0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927,
+ 0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d,
+ 0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362,
+ 0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9,
+ 0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52,
+ 0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566,
+ 0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3,
+ 0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed,
+ 0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e,
+ 0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4,
+ 0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4,
+ 0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd,
+ 0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d,
+ 0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060,
+ 0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967,
+ 0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879,
+ 0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000,
+ 0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c,
+ 0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36,
+ 0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624,
+ 0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b,
+ 0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c,
+ 0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12,
+ 0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14,
+ 0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3,
+ 0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b,
+ 0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8,
+ 0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684,
+ 0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7,
+ 0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177,
+ 0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947,
+ 0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322,
+ 0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498,
+ 0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f,
+ 0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54,
+ 0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382,
+ 0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf,
+ 0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb,
+ 0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83,
+ 0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef,
+ 0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029,
+ 0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235,
+ 0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733,
+ 0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117,
+ 0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4,
+ 0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546,
+ 0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb,
+ 0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d,
+ 0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb,
+ 0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a,
+ 0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773,
+ 0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478,
+ 0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2,
+ 0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff,
+ 0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664,
+ 0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0
+ },
+ {
+ 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38,
+ 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb,
+ 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87,
+ 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb,
+ 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d,
+ 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e,
+ 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2,
+ 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25,
+ 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16,
+ 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92,
+ 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda,
+ 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84,
+ 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a,
+ 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06,
+ 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02,
+ 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b,
+ 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea,
+ 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73,
+ 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85,
+ 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e,
+ 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89,
+ 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b,
+ 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20,
+ 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4,
+ 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31,
+ 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f,
+ 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d,
+ 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef,
+ 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0,
+ 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61,
+ 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26,
+ 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+ },
+ 0
+ };
+
+#define decT dec_tables.T
+#define inv_sbox dec_tables.inv_sbox
+
+static const u32 rcon[30] =
+ {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c,
+ 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35,
+ 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91
+ };
diff --git a/comm/third_party/libgcrypt/cipher/rijndael.c b/comm/third_party/libgcrypt/cipher/rijndael.c
new file mode 100644
index 0000000000..fe137327e7
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rijndael.c
@@ -0,0 +1,2032 @@
+/* Rijndael (AES) for GnuPG
+ * Copyright (C) 2000, 2001, 2002, 2003, 2007,
+ * 2008, 2011, 2012 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *******************************************************************
+ * The code here is based on the optimized implementation taken from
+ * http://www.esat.kuleuven.ac.be/~rijmen/rijndael/ on Oct 2, 2000,
+ * which carries this notice:
+ *------------------------------------------
+ * rijndael-alg-fst.c v2.3 April '2000
+ *
+ * Optimised ANSI C code
+ *
+ * authors: v1.0: Antoon Bosselaers
+ * v2.0: Vincent Rijmen
+ * v2.3: Paulo Barreto
+ *
+ * This code is placed in the public domain.
+ *------------------------------------------
+ *
+ * The SP800-38a document is available at:
+ * http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf
+ *
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* for memcmp() */
+
+#include "types.h" /* for byte and u32 typedefs */
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-selftest.h"
+#include "rijndael-internal.h"
+#include "./cipher-internal.h"
+
+
+#ifdef USE_AMD64_ASM
+/* AMD64 assembly implementations of AES */
+extern unsigned int _gcry_aes_amd64_encrypt_block(const void *keysched_enc,
+ unsigned char *out,
+ const unsigned char *in,
+ int rounds,
+ const void *encT);
+
+extern unsigned int _gcry_aes_amd64_decrypt_block(const void *keysched_dec,
+ unsigned char *out,
+ const unsigned char *in,
+ int rounds,
+ const void *decT);
+#endif /*USE_AMD64_ASM*/
+
+#ifdef USE_AESNI
+/* AES-NI (AMD64 & i386) accelerated implementations of AES */
+extern void _gcry_aes_aesni_do_setkey(RIJNDAEL_context *ctx, const byte *key);
+extern void _gcry_aes_aesni_prepare_decryption(RIJNDAEL_context *ctx);
+
+extern unsigned int _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src);
+extern unsigned int _gcry_aes_aesni_decrypt (const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src);
+extern void _gcry_aes_aesni_cfb_enc (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_aesni_cbc_enc (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int cbc_mac);
+extern void _gcry_aes_aesni_ctr_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_aesni_cfb_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_aesni_cbc_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern size_t _gcry_aes_aesni_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
+extern size_t _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_aesni_xts_crypt (void *context, unsigned char *tweak,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt);
+#endif
+
+#ifdef USE_SSSE3
+/* SSSE3 (AMD64) vector permutation implementation of AES */
+extern void _gcry_aes_ssse3_do_setkey(RIJNDAEL_context *ctx, const byte *key);
+extern void _gcry_aes_ssse3_prepare_decryption(RIJNDAEL_context *ctx);
+
+extern unsigned int _gcry_aes_ssse3_encrypt (const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src);
+extern unsigned int _gcry_aes_ssse3_decrypt (const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src);
+extern void _gcry_aes_ssse3_cfb_enc (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_ssse3_cbc_enc (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks,
+ int cbc_mac);
+extern void _gcry_aes_ssse3_ctr_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_ssse3_cfb_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_ssse3_cbc_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern size_t _gcry_aes_ssse3_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
+extern size_t _gcry_aes_ssse3_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+ size_t nblocks);
+#endif
+
+#ifdef USE_PADLOCK
+extern unsigned int _gcry_aes_padlock_encrypt (const RIJNDAEL_context *ctx,
+ unsigned char *bx,
+ const unsigned char *ax);
+extern unsigned int _gcry_aes_padlock_decrypt (const RIJNDAEL_context *ctx,
+ unsigned char *bx,
+ const unsigned char *ax);
+extern void _gcry_aes_padlock_prepare_decryption (RIJNDAEL_context *ctx);
+#endif
+
+#ifdef USE_ARM_ASM
+/* ARM assembly implementations of AES */
+extern unsigned int _gcry_aes_arm_encrypt_block(const void *keysched_enc,
+ unsigned char *out,
+ const unsigned char *in,
+ int rounds,
+ const void *encT);
+
+extern unsigned int _gcry_aes_arm_decrypt_block(const void *keysched_dec,
+ unsigned char *out,
+ const unsigned char *in,
+ int rounds,
+ const void *decT);
+#endif /*USE_ARM_ASM*/
+
+#ifdef USE_ARM_CE
+/* ARMv8 Crypto Extension implementations of AES */
+extern void _gcry_aes_armv8_ce_setkey(RIJNDAEL_context *ctx, const byte *key);
+extern void _gcry_aes_armv8_ce_prepare_decryption(RIJNDAEL_context *ctx);
+
+extern unsigned int _gcry_aes_armv8_ce_encrypt(const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src);
+extern unsigned int _gcry_aes_armv8_ce_decrypt(const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src);
+
+extern void _gcry_aes_armv8_ce_cfb_enc (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_armv8_ce_cbc_enc (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks,
+ int cbc_mac);
+extern void _gcry_aes_armv8_ce_ctr_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_armv8_ce_cfb_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_armv8_ce_cbc_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern size_t _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
+extern size_t _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c,
+ const void *abuf_arg, size_t nblocks);
+extern void _gcry_aes_armv8_ce_xts_crypt (void *context, unsigned char *tweak,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks, int encrypt);
+#endif /*USE_ARM_CE*/
+
+#ifdef USE_PPC_CRYPTO
+/* PowerPC Crypto implementations of AES */
+extern void _gcry_aes_ppc8_setkey(RIJNDAEL_context *ctx, const byte *key);
+extern void _gcry_aes_ppc8_prepare_decryption(RIJNDAEL_context *ctx);
+
+extern unsigned int _gcry_aes_ppc8_encrypt(const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src);
+extern unsigned int _gcry_aes_ppc8_decrypt(const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src);
+
+extern void _gcry_aes_ppc8_cfb_enc (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_ppc8_cbc_enc (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int cbc_mac);
+extern void _gcry_aes_ppc8_ctr_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_ppc8_cfb_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_ppc8_cbc_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+
+extern size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
+extern size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c,
+ const void *abuf_arg, size_t nblocks);
+
+extern void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks, int encrypt);
+#endif /*USE_PPC_CRYPTO*/
+
+#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
+/* Power9 little-endian crypto implementations of AES */
+extern unsigned int _gcry_aes_ppc9le_encrypt(const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src);
+extern unsigned int _gcry_aes_ppc9le_decrypt(const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src);
+
+extern void _gcry_aes_ppc9le_cfb_enc (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_ppc9le_cbc_enc (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int cbc_mac);
+extern void _gcry_aes_ppc9le_ctr_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_ppc9le_cfb_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_ppc9le_cbc_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+
+extern size_t _gcry_aes_ppc9le_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
+extern size_t _gcry_aes_ppc9le_ocb_auth (gcry_cipher_hd_t c,
+ const void *abuf_arg, size_t nblocks);
+
+extern void _gcry_aes_ppc9le_xts_crypt (void *context, unsigned char *tweak,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks, int encrypt);
+#endif /*USE_PPC_CRYPTO_WITH_PPC9LE*/
+
+#ifdef USE_S390X_CRYPTO
+/* zSeries crypto implementations of AES */
+extern int _gcry_aes_s390x_setup_acceleration(RIJNDAEL_context *ctx,
+ unsigned int keylen,
+ unsigned int hwfeatures,
+ cipher_bulk_ops_t *bulk_ops);
+extern void _gcry_aes_s390x_setkey(RIJNDAEL_context *ctx, const byte *key);
+extern void _gcry_aes_s390x_prepare_decryption(RIJNDAEL_context *ctx);
+
+extern unsigned int _gcry_aes_s390x_encrypt(const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src);
+extern unsigned int _gcry_aes_s390x_decrypt(const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src);
+
+#endif /*USE_S390X_CRYPTO*/
+
+static unsigned int do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx,
+ const unsigned char *ax);
+static unsigned int do_decrypt (const RIJNDAEL_context *ctx, unsigned char *bx,
+ const unsigned char *ax);
+
+static void _gcry_aes_cfb_enc (void *context, unsigned char *iv,
+ void *outbuf, const void *inbuf,
+ size_t nblocks);
+static void _gcry_aes_cfb_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+static void _gcry_aes_cbc_enc (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int cbc_mac);
+static void _gcry_aes_cbc_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+static void _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+static size_t _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
+static size_t _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+ size_t nblocks);
+static void _gcry_aes_xts_crypt (void *context, unsigned char *tweak,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt);
+
+
+/* All the numbers. */
+#include "rijndael-tables.h"
+
+
+
+
+/* Function prototypes. */
+static const char *selftest(void);
+static void prepare_decryption(RIJNDAEL_context *ctx);
+
+
+
+/* Prefetching for encryption/decryption tables. */
+static inline void prefetch_table(const volatile byte *tab, size_t len)
+{
+ size_t i;
+
+ for (i = 0; len - i >= 8 * 32; i += 8 * 32)
+ {
+ (void)tab[i + 0 * 32];
+ (void)tab[i + 1 * 32];
+ (void)tab[i + 2 * 32];
+ (void)tab[i + 3 * 32];
+ (void)tab[i + 4 * 32];
+ (void)tab[i + 5 * 32];
+ (void)tab[i + 6 * 32];
+ (void)tab[i + 7 * 32];
+ }
+ for (; i < len; i += 32)
+ {
+ (void)tab[i];
+ }
+
+ (void)tab[len - 1];
+}
+
+static void prefetch_enc(void)
+{
+  /* Modify counters to trigger copy-on-write and unsharing if physical pages
+   * of the look-up table are shared between processes.  Modifying the counters
+   * also causes the page checksums to change, hinting to the same-page merging
+   * algorithm that these pages are frequently changing.  */
+ enc_tables.counter_head++;
+ enc_tables.counter_tail++;
+
+ /* Prefetch look-up tables to cache. */
+ prefetch_table((const void *)&enc_tables, sizeof(enc_tables));
+}
+
+static void prefetch_dec(void)
+{
+  /* Modify counters to trigger copy-on-write and unsharing if physical pages
+   * of the look-up table are shared between processes.  Modifying the counters
+   * also causes the page checksums to change, hinting to the same-page merging
+   * algorithm that these pages are frequently changing.  */
+ dec_tables.counter_head++;
+ dec_tables.counter_tail++;
+
+ /* Prefetch look-up tables to cache. */
+ prefetch_table((const void *)&dec_tables, sizeof(dec_tables));
+}
+
+
+
+/* Perform the key setup. */
+static gcry_err_code_t
+do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
+ cipher_bulk_ops_t *bulk_ops)
+{
+ static int initialized = 0;
+ static const char *selftest_failed = 0;
+ void (*hw_setkey)(RIJNDAEL_context *ctx, const byte *key) = NULL;
+ int rounds;
+ int i,j, r, t, rconpointer = 0;
+ int KC;
+ unsigned int hwfeatures;
+
+ /* The on-the-fly self tests are only run in non-fips mode. In fips
+ mode explicit self-tests are required. Actually the on-the-fly
+ self-tests are not fully thread-safe and it might happen that a
+ failed self-test won't get noticed in another thread.
+
+ FIXME: We might want to have a central registry of succeeded
+ self-tests. */
+ if (!fips_mode () && !initialized)
+ {
+ initialized = 1;
+ selftest_failed = selftest ();
+ if (selftest_failed)
+ log_error ("%s\n", selftest_failed );
+ }
+ if (selftest_failed)
+ return GPG_ERR_SELFTEST_FAILED;
+
+ if( keylen == 128/8 )
+ {
+ rounds = 10;
+ KC = 4;
+ }
+ else if ( keylen == 192/8 )
+ {
+ rounds = 12;
+ KC = 6;
+ }
+ else if ( keylen == 256/8 )
+ {
+ rounds = 14;
+ KC = 8;
+ }
+ else
+ return GPG_ERR_INV_KEYLEN;
+
+ ctx->rounds = rounds;
+ hwfeatures = _gcry_get_hw_features ();
+
+ ctx->decryption_prepared = 0;
+
+ /* Setup default bulk encryption routines. */
+ memset (bulk_ops, 0, sizeof(*bulk_ops));
+ bulk_ops->cfb_enc = _gcry_aes_cfb_enc;
+ bulk_ops->cfb_dec = _gcry_aes_cfb_dec;
+ bulk_ops->cbc_enc = _gcry_aes_cbc_enc;
+ bulk_ops->cbc_dec = _gcry_aes_cbc_dec;
+ bulk_ops->ctr_enc = _gcry_aes_ctr_enc;
+ bulk_ops->ocb_crypt = _gcry_aes_ocb_crypt;
+ bulk_ops->ocb_auth = _gcry_aes_ocb_auth;
+ bulk_ops->xts_crypt = _gcry_aes_xts_crypt;
+
+ (void)hwfeatures;
+
+ if (0)
+ {
+ ;
+ }
+#ifdef USE_AESNI
+ else if (hwfeatures & HWF_INTEL_AESNI)
+ {
+ hw_setkey = _gcry_aes_aesni_do_setkey;
+ ctx->encrypt_fn = _gcry_aes_aesni_encrypt;
+ ctx->decrypt_fn = _gcry_aes_aesni_decrypt;
+ ctx->prefetch_enc_fn = NULL;
+ ctx->prefetch_dec_fn = NULL;
+ ctx->prepare_decryption = _gcry_aes_aesni_prepare_decryption;
+ ctx->use_avx = !!(hwfeatures & HWF_INTEL_AVX);
+ ctx->use_avx2 = !!(hwfeatures & HWF_INTEL_AVX2);
+
+ /* Setup AES-NI bulk encryption routines. */
+ bulk_ops->cfb_enc = _gcry_aes_aesni_cfb_enc;
+ bulk_ops->cfb_dec = _gcry_aes_aesni_cfb_dec;
+ bulk_ops->cbc_enc = _gcry_aes_aesni_cbc_enc;
+ bulk_ops->cbc_dec = _gcry_aes_aesni_cbc_dec;
+ bulk_ops->ctr_enc = _gcry_aes_aesni_ctr_enc;
+ bulk_ops->ocb_crypt = _gcry_aes_aesni_ocb_crypt;
+ bulk_ops->ocb_auth = _gcry_aes_aesni_ocb_auth;
+ bulk_ops->xts_crypt = _gcry_aes_aesni_xts_crypt;
+ }
+#endif
+#ifdef USE_PADLOCK
+ else if (hwfeatures & HWF_PADLOCK_AES && keylen == 128/8)
+ {
+ ctx->encrypt_fn = _gcry_aes_padlock_encrypt;
+ ctx->decrypt_fn = _gcry_aes_padlock_decrypt;
+ ctx->prefetch_enc_fn = NULL;
+ ctx->prefetch_dec_fn = NULL;
+ ctx->prepare_decryption = _gcry_aes_padlock_prepare_decryption;
+ memcpy (ctx->padlockkey, key, keylen);
+ }
+#endif
+#ifdef USE_SSSE3
+ else if (hwfeatures & HWF_INTEL_SSSE3)
+ {
+ hw_setkey = _gcry_aes_ssse3_do_setkey;
+ ctx->encrypt_fn = _gcry_aes_ssse3_encrypt;
+ ctx->decrypt_fn = _gcry_aes_ssse3_decrypt;
+ ctx->prefetch_enc_fn = NULL;
+ ctx->prefetch_dec_fn = NULL;
+ ctx->prepare_decryption = _gcry_aes_ssse3_prepare_decryption;
+
+ /* Setup SSSE3 bulk encryption routines. */
+ bulk_ops->cfb_enc = _gcry_aes_ssse3_cfb_enc;
+ bulk_ops->cfb_dec = _gcry_aes_ssse3_cfb_dec;
+ bulk_ops->cbc_enc = _gcry_aes_ssse3_cbc_enc;
+ bulk_ops->cbc_dec = _gcry_aes_ssse3_cbc_dec;
+ bulk_ops->ctr_enc = _gcry_aes_ssse3_ctr_enc;
+ bulk_ops->ocb_crypt = _gcry_aes_ssse3_ocb_crypt;
+ bulk_ops->ocb_auth = _gcry_aes_ssse3_ocb_auth;
+ }
+#endif
+#ifdef USE_ARM_CE
+ else if (hwfeatures & HWF_ARM_AES)
+ {
+ hw_setkey = _gcry_aes_armv8_ce_setkey;
+ ctx->encrypt_fn = _gcry_aes_armv8_ce_encrypt;
+ ctx->decrypt_fn = _gcry_aes_armv8_ce_decrypt;
+ ctx->prefetch_enc_fn = NULL;
+ ctx->prefetch_dec_fn = NULL;
+ ctx->prepare_decryption = _gcry_aes_armv8_ce_prepare_decryption;
+
+ /* Setup ARM-CE bulk encryption routines. */
+ bulk_ops->cfb_enc = _gcry_aes_armv8_ce_cfb_enc;
+ bulk_ops->cfb_dec = _gcry_aes_armv8_ce_cfb_dec;
+ bulk_ops->cbc_enc = _gcry_aes_armv8_ce_cbc_enc;
+ bulk_ops->cbc_dec = _gcry_aes_armv8_ce_cbc_dec;
+ bulk_ops->ctr_enc = _gcry_aes_armv8_ce_ctr_enc;
+ bulk_ops->ocb_crypt = _gcry_aes_armv8_ce_ocb_crypt;
+ bulk_ops->ocb_auth = _gcry_aes_armv8_ce_ocb_auth;
+ bulk_ops->xts_crypt = _gcry_aes_armv8_ce_xts_crypt;
+ }
+#endif
+#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
+ else if ((hwfeatures & HWF_PPC_VCRYPTO) && (hwfeatures & HWF_PPC_ARCH_3_00))
+ {
+ hw_setkey = _gcry_aes_ppc8_setkey;
+ ctx->encrypt_fn = _gcry_aes_ppc9le_encrypt;
+ ctx->decrypt_fn = _gcry_aes_ppc9le_decrypt;
+ ctx->prefetch_enc_fn = NULL;
+ ctx->prefetch_dec_fn = NULL;
+ ctx->prepare_decryption = _gcry_aes_ppc8_prepare_decryption;
+
+ /* Setup PPC9LE bulk encryption routines. */
+ bulk_ops->cfb_enc = _gcry_aes_ppc9le_cfb_enc;
+ bulk_ops->cfb_dec = _gcry_aes_ppc9le_cfb_dec;
+ bulk_ops->cbc_enc = _gcry_aes_ppc9le_cbc_enc;
+ bulk_ops->cbc_dec = _gcry_aes_ppc9le_cbc_dec;
+ bulk_ops->ctr_enc = _gcry_aes_ppc9le_ctr_enc;
+ bulk_ops->ocb_crypt = _gcry_aes_ppc9le_ocb_crypt;
+ bulk_ops->ocb_auth = _gcry_aes_ppc9le_ocb_auth;
+ bulk_ops->xts_crypt = _gcry_aes_ppc9le_xts_crypt;
+ }
+#endif
+#ifdef USE_PPC_CRYPTO
+ else if (hwfeatures & HWF_PPC_VCRYPTO)
+ {
+ hw_setkey = _gcry_aes_ppc8_setkey;
+ ctx->encrypt_fn = _gcry_aes_ppc8_encrypt;
+ ctx->decrypt_fn = _gcry_aes_ppc8_decrypt;
+ ctx->prefetch_enc_fn = NULL;
+ ctx->prefetch_dec_fn = NULL;
+ ctx->prepare_decryption = _gcry_aes_ppc8_prepare_decryption;
+
+ /* Setup PPC8 bulk encryption routines. */
+ bulk_ops->cfb_enc = _gcry_aes_ppc8_cfb_enc;
+ bulk_ops->cfb_dec = _gcry_aes_ppc8_cfb_dec;
+ bulk_ops->cbc_enc = _gcry_aes_ppc8_cbc_enc;
+ bulk_ops->cbc_dec = _gcry_aes_ppc8_cbc_dec;
+ bulk_ops->ctr_enc = _gcry_aes_ppc8_ctr_enc;
+ bulk_ops->ocb_crypt = _gcry_aes_ppc8_ocb_crypt;
+ bulk_ops->ocb_auth = _gcry_aes_ppc8_ocb_auth;
+ bulk_ops->xts_crypt = _gcry_aes_ppc8_xts_crypt;
+ }
+#endif
+#ifdef USE_S390X_CRYPTO
+ else if (_gcry_aes_s390x_setup_acceleration (ctx, keylen, hwfeatures,
+ bulk_ops))
+ {
+ hw_setkey = _gcry_aes_s390x_setkey;
+ ctx->encrypt_fn = _gcry_aes_s390x_encrypt;
+ ctx->decrypt_fn = _gcry_aes_s390x_decrypt;
+ ctx->prefetch_enc_fn = NULL;
+ ctx->prefetch_dec_fn = NULL;
+ ctx->prepare_decryption = _gcry_aes_s390x_prepare_decryption;
+ }
+#endif
+ else
+ {
+ ctx->encrypt_fn = do_encrypt;
+ ctx->decrypt_fn = do_decrypt;
+ ctx->prefetch_enc_fn = prefetch_enc;
+ ctx->prefetch_dec_fn = prefetch_dec;
+ ctx->prepare_decryption = prepare_decryption;
+ }
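+  /* Note: when none of the accelerated paths above is selected, the
+     generic table-driven implementation (or the plain assembly variant)
+     is used; its prefetch functions touch the lookup tables up front,
+     presumably to reduce cache-timing variation of the table lookups.  */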
+
+ /* NB: We don't yet support Padlock hardware key generation. */
+
+ if (hw_setkey)
+ {
+ hw_setkey (ctx, key);
+ }
+ else
+ {
+ const byte *sbox = ((const byte *)encT) + 1;
+ union
+ {
+ PROPERLY_ALIGNED_TYPE dummy;
+ byte data[MAXKC][4];
+ u32 data32[MAXKC];
+ } tkk[2];
+#define k tkk[0].data
+#define k_u32 tkk[0].data32
+#define tk tkk[1].data
+#define tk_u32 tkk[1].data32
+#define W (ctx->keyschenc)
+#define W_u32 (ctx->keyschenc32)
+
+ prefetch_enc();
+
+ for (i = 0; i < keylen; i++)
+ {
+ k[i >> 2][i & 3] = key[i];
+ }
+
+ for (j = KC-1; j >= 0; j--)
+ {
+ tk_u32[j] = k_u32[j];
+ }
+ r = 0;
+ t = 0;
+ /* Copy values into round key array. */
+ for (j = 0; (j < KC) && (r < rounds + 1); )
+ {
+ for (; (j < KC) && (t < 4); j++, t++)
+ {
+ W_u32[r][t] = le_bswap32(tk_u32[j]);
+ }
+ if (t == 4)
+ {
+ r++;
+ t = 0;
+ }
+ }
+
+ while (r < rounds + 1)
+ {
+          /* While not enough round key material has been calculated,
+             calculate new values.  */
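+          /* This is the usual FIPS-197 key expansion core: the last key
+             word (tk[KC-1]) is rotated (RotWord) and passed through the
+             S-box (SubWord), then XORed with the round constant.  The
+             S-box bytes are read out of the combined encT table, which
+             is why the index is multiplied by 4; the KC == 8 case below
+             adds the extra SubWord step that AES-256 requires.  */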
+ tk[0][0] ^= sbox[tk[KC-1][1] * 4];
+ tk[0][1] ^= sbox[tk[KC-1][2] * 4];
+ tk[0][2] ^= sbox[tk[KC-1][3] * 4];
+ tk[0][3] ^= sbox[tk[KC-1][0] * 4];
+ tk[0][0] ^= rcon[rconpointer++];
+
+ if (KC != 8)
+ {
+ for (j = 1; j < KC; j++)
+ {
+ tk_u32[j] ^= tk_u32[j-1];
+ }
+ }
+ else
+ {
+ for (j = 1; j < KC/2; j++)
+ {
+ tk_u32[j] ^= tk_u32[j-1];
+ }
+ tk[KC/2][0] ^= sbox[tk[KC/2 - 1][0] * 4];
+ tk[KC/2][1] ^= sbox[tk[KC/2 - 1][1] * 4];
+ tk[KC/2][2] ^= sbox[tk[KC/2 - 1][2] * 4];
+ tk[KC/2][3] ^= sbox[tk[KC/2 - 1][3] * 4];
+ for (j = KC/2 + 1; j < KC; j++)
+ {
+ tk_u32[j] ^= tk_u32[j-1];
+ }
+ }
+
+ /* Copy values into round key array. */
+ for (j = 0; (j < KC) && (r < rounds + 1); )
+ {
+ for (; (j < KC) && (t < 4); j++, t++)
+ {
+ W_u32[r][t] = le_bswap32(tk_u32[j]);
+ }
+ if (t == 4)
+ {
+ r++;
+ t = 0;
+ }
+ }
+ }
+#undef W
+#undef tk
+#undef k
+#undef W_u32
+#undef tk_u32
+#undef k_u32
+ wipememory(&tkk, sizeof(tkk));
+ }
+
+ return 0;
+}
+
+
+static gcry_err_code_t
+rijndael_setkey (void *context, const byte *key, const unsigned keylen,
+ cipher_bulk_ops_t *bulk_ops)
+{
+ RIJNDAEL_context *ctx = context;
+ return do_setkey (ctx, key, keylen, bulk_ops);
+}
+
+
+/* Make a decryption key from an encryption key. */
+static void
+prepare_decryption( RIJNDAEL_context *ctx )
+{
+ const byte *sbox = ((const byte *)encT) + 1;
+ int r;
+
+ prefetch_enc();
+ prefetch_dec();
+
+ ctx->keyschdec32[0][0] = ctx->keyschenc32[0][0];
+ ctx->keyschdec32[0][1] = ctx->keyschenc32[0][1];
+ ctx->keyschdec32[0][2] = ctx->keyschenc32[0][2];
+ ctx->keyschdec32[0][3] = ctx->keyschenc32[0][3];
+
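+  /* The inner round keys are effectively run through InvMixColumns so
+     that decryption can reuse the same table-lookup round structure as
+     encryption (the FIPS-197 "equivalent inverse cipher").  decT
+     combines the inverse S-box with InvMixColumns, so indexing it with
+     sbox[x] cancels the inverse S-box and leaves just InvMixColumns
+     applied to each key byte.  */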
+ for (r = 1; r < ctx->rounds; r++)
+ {
+ u32 *wi = ctx->keyschenc32[r];
+ u32 *wo = ctx->keyschdec32[r];
+ u32 wt;
+
+ wt = wi[0];
+ wo[0] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0)
+ ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1)
+ ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2)
+ ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3);
+
+ wt = wi[1];
+ wo[1] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0)
+ ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1)
+ ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2)
+ ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3);
+
+ wt = wi[2];
+ wo[2] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0)
+ ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1)
+ ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2)
+ ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3);
+
+ wt = wi[3];
+ wo[3] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0)
+ ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1)
+ ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2)
+ ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3);
+ }
+
+ ctx->keyschdec32[r][0] = ctx->keyschenc32[r][0];
+ ctx->keyschdec32[r][1] = ctx->keyschenc32[r][1];
+ ctx->keyschdec32[r][2] = ctx->keyschenc32[r][2];
+ ctx->keyschdec32[r][3] = ctx->keyschenc32[r][3];
+}
+
+
+#if !defined(USE_ARM_ASM) && !defined(USE_AMD64_ASM)
+/* Encrypt one block. A and B may be the same. */
+static unsigned int
+do_encrypt_fn (const RIJNDAEL_context *ctx, unsigned char *b,
+ const unsigned char *a)
+{
+#define rk (ctx->keyschenc32)
+ const byte *sbox = ((const byte *)encT) + 1;
+ int rounds = ctx->rounds;
+ int r;
+ u32 sa[4];
+ u32 sb[4];
+
+ sb[0] = buf_get_le32(a + 0);
+ sb[1] = buf_get_le32(a + 4);
+ sb[2] = buf_get_le32(a + 8);
+ sb[3] = buf_get_le32(a + 12);
+
+ sa[0] = sb[0] ^ rk[0][0];
+ sa[1] = sb[1] ^ rk[0][1];
+ sa[2] = sb[2] ^ rk[0][2];
+ sa[3] = sb[3] ^ rk[0][3];
+
+ sb[0] = rol(encT[(byte)(sa[0] >> (0 * 8))], (0 * 8));
+ sb[3] = rol(encT[(byte)(sa[0] >> (1 * 8))], (1 * 8));
+ sb[2] = rol(encT[(byte)(sa[0] >> (2 * 8))], (2 * 8));
+ sb[1] = rol(encT[(byte)(sa[0] >> (3 * 8))], (3 * 8));
+ sa[0] = rk[1][0] ^ sb[0];
+
+ sb[1] ^= rol(encT[(byte)(sa[1] >> (0 * 8))], (0 * 8));
+ sa[0] ^= rol(encT[(byte)(sa[1] >> (1 * 8))], (1 * 8));
+ sb[3] ^= rol(encT[(byte)(sa[1] >> (2 * 8))], (2 * 8));
+ sb[2] ^= rol(encT[(byte)(sa[1] >> (3 * 8))], (3 * 8));
+ sa[1] = rk[1][1] ^ sb[1];
+
+ sb[2] ^= rol(encT[(byte)(sa[2] >> (0 * 8))], (0 * 8));
+ sa[1] ^= rol(encT[(byte)(sa[2] >> (1 * 8))], (1 * 8));
+ sa[0] ^= rol(encT[(byte)(sa[2] >> (2 * 8))], (2 * 8));
+ sb[3] ^= rol(encT[(byte)(sa[2] >> (3 * 8))], (3 * 8));
+ sa[2] = rk[1][2] ^ sb[2];
+
+ sb[3] ^= rol(encT[(byte)(sa[3] >> (0 * 8))], (0 * 8));
+ sa[2] ^= rol(encT[(byte)(sa[3] >> (1 * 8))], (1 * 8));
+ sa[1] ^= rol(encT[(byte)(sa[3] >> (2 * 8))], (2 * 8));
+ sa[0] ^= rol(encT[(byte)(sa[3] >> (3 * 8))], (3 * 8));
+ sa[3] = rk[1][3] ^ sb[3];
+
+ for (r = 2; r < rounds; r++)
+ {
+ sb[0] = rol(encT[(byte)(sa[0] >> (0 * 8))], (0 * 8));
+ sb[3] = rol(encT[(byte)(sa[0] >> (1 * 8))], (1 * 8));
+ sb[2] = rol(encT[(byte)(sa[0] >> (2 * 8))], (2 * 8));
+ sb[1] = rol(encT[(byte)(sa[0] >> (3 * 8))], (3 * 8));
+ sa[0] = rk[r][0] ^ sb[0];
+
+ sb[1] ^= rol(encT[(byte)(sa[1] >> (0 * 8))], (0 * 8));
+ sa[0] ^= rol(encT[(byte)(sa[1] >> (1 * 8))], (1 * 8));
+ sb[3] ^= rol(encT[(byte)(sa[1] >> (2 * 8))], (2 * 8));
+ sb[2] ^= rol(encT[(byte)(sa[1] >> (3 * 8))], (3 * 8));
+ sa[1] = rk[r][1] ^ sb[1];
+
+ sb[2] ^= rol(encT[(byte)(sa[2] >> (0 * 8))], (0 * 8));
+ sa[1] ^= rol(encT[(byte)(sa[2] >> (1 * 8))], (1 * 8));
+ sa[0] ^= rol(encT[(byte)(sa[2] >> (2 * 8))], (2 * 8));
+ sb[3] ^= rol(encT[(byte)(sa[2] >> (3 * 8))], (3 * 8));
+ sa[2] = rk[r][2] ^ sb[2];
+
+ sb[3] ^= rol(encT[(byte)(sa[3] >> (0 * 8))], (0 * 8));
+ sa[2] ^= rol(encT[(byte)(sa[3] >> (1 * 8))], (1 * 8));
+ sa[1] ^= rol(encT[(byte)(sa[3] >> (2 * 8))], (2 * 8));
+ sa[0] ^= rol(encT[(byte)(sa[3] >> (3 * 8))], (3 * 8));
+ sa[3] = rk[r][3] ^ sb[3];
+
+ r++;
+
+ sb[0] = rol(encT[(byte)(sa[0] >> (0 * 8))], (0 * 8));
+ sb[3] = rol(encT[(byte)(sa[0] >> (1 * 8))], (1 * 8));
+ sb[2] = rol(encT[(byte)(sa[0] >> (2 * 8))], (2 * 8));
+ sb[1] = rol(encT[(byte)(sa[0] >> (3 * 8))], (3 * 8));
+ sa[0] = rk[r][0] ^ sb[0];
+
+ sb[1] ^= rol(encT[(byte)(sa[1] >> (0 * 8))], (0 * 8));
+ sa[0] ^= rol(encT[(byte)(sa[1] >> (1 * 8))], (1 * 8));
+ sb[3] ^= rol(encT[(byte)(sa[1] >> (2 * 8))], (2 * 8));
+ sb[2] ^= rol(encT[(byte)(sa[1] >> (3 * 8))], (3 * 8));
+ sa[1] = rk[r][1] ^ sb[1];
+
+ sb[2] ^= rol(encT[(byte)(sa[2] >> (0 * 8))], (0 * 8));
+ sa[1] ^= rol(encT[(byte)(sa[2] >> (1 * 8))], (1 * 8));
+ sa[0] ^= rol(encT[(byte)(sa[2] >> (2 * 8))], (2 * 8));
+ sb[3] ^= rol(encT[(byte)(sa[2] >> (3 * 8))], (3 * 8));
+ sa[2] = rk[r][2] ^ sb[2];
+
+ sb[3] ^= rol(encT[(byte)(sa[3] >> (0 * 8))], (0 * 8));
+ sa[2] ^= rol(encT[(byte)(sa[3] >> (1 * 8))], (1 * 8));
+ sa[1] ^= rol(encT[(byte)(sa[3] >> (2 * 8))], (2 * 8));
+ sa[0] ^= rol(encT[(byte)(sa[3] >> (3 * 8))], (3 * 8));
+ sa[3] = rk[r][3] ^ sb[3];
+ }
+
+ /* Last round is special. */
+
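+  /* The final round omits MixColumns, so plain S-box bytes are
+     assembled and shifted into place here instead of using the
+     combined encT lookups.  */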
+ sb[0] = ((u32)sbox[(byte)(sa[0] >> (0 * 8)) * 4]) << (0 * 8);
+ sb[3] = ((u32)sbox[(byte)(sa[0] >> (1 * 8)) * 4]) << (1 * 8);
+ sb[2] = ((u32)sbox[(byte)(sa[0] >> (2 * 8)) * 4]) << (2 * 8);
+ sb[1] = ((u32)sbox[(byte)(sa[0] >> (3 * 8)) * 4]) << (3 * 8);
+ sa[0] = rk[r][0] ^ sb[0];
+
+ sb[1] ^= ((u32)sbox[(byte)(sa[1] >> (0 * 8)) * 4]) << (0 * 8);
+ sa[0] ^= ((u32)sbox[(byte)(sa[1] >> (1 * 8)) * 4]) << (1 * 8);
+ sb[3] ^= ((u32)sbox[(byte)(sa[1] >> (2 * 8)) * 4]) << (2 * 8);
+ sb[2] ^= ((u32)sbox[(byte)(sa[1] >> (3 * 8)) * 4]) << (3 * 8);
+ sa[1] = rk[r][1] ^ sb[1];
+
+ sb[2] ^= ((u32)sbox[(byte)(sa[2] >> (0 * 8)) * 4]) << (0 * 8);
+ sa[1] ^= ((u32)sbox[(byte)(sa[2] >> (1 * 8)) * 4]) << (1 * 8);
+ sa[0] ^= ((u32)sbox[(byte)(sa[2] >> (2 * 8)) * 4]) << (2 * 8);
+ sb[3] ^= ((u32)sbox[(byte)(sa[2] >> (3 * 8)) * 4]) << (3 * 8);
+ sa[2] = rk[r][2] ^ sb[2];
+
+ sb[3] ^= ((u32)sbox[(byte)(sa[3] >> (0 * 8)) * 4]) << (0 * 8);
+ sa[2] ^= ((u32)sbox[(byte)(sa[3] >> (1 * 8)) * 4]) << (1 * 8);
+ sa[1] ^= ((u32)sbox[(byte)(sa[3] >> (2 * 8)) * 4]) << (2 * 8);
+ sa[0] ^= ((u32)sbox[(byte)(sa[3] >> (3 * 8)) * 4]) << (3 * 8);
+ sa[3] = rk[r][3] ^ sb[3];
+
+ buf_put_le32(b + 0, sa[0]);
+ buf_put_le32(b + 4, sa[1]);
+ buf_put_le32(b + 8, sa[2]);
+ buf_put_le32(b + 12, sa[3]);
+#undef rk
+
+ return (56 + 2*sizeof(int));
+}
+#endif /*!USE_ARM_ASM && !USE_AMD64_ASM*/
+
+
+static unsigned int
+do_encrypt (const RIJNDAEL_context *ctx,
+ unsigned char *bx, const unsigned char *ax)
+{
+#ifdef USE_AMD64_ASM
+ return _gcry_aes_amd64_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds,
+ enc_tables.T);
+#elif defined(USE_ARM_ASM)
+ return _gcry_aes_arm_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds,
+ enc_tables.T);
+#else
+ return do_encrypt_fn (ctx, bx, ax);
+#endif /* !USE_ARM_ASM && !USE_AMD64_ASM*/
+}
+
+
+static unsigned int
+rijndael_encrypt (void *context, byte *b, const byte *a)
+{
+ RIJNDAEL_context *ctx = context;
+
+ if (ctx->prefetch_enc_fn)
+ ctx->prefetch_enc_fn();
+
+ return ctx->encrypt_fn (ctx, b, a);
+}
+
+
+/* Bulk encryption of complete blocks in CFB mode. Caller needs to
+ make sure that IV is aligned on an unsigned long boundary. This
+ function is only intended for the bulk encryption feature of
+ cipher.c. */
+static void
+_gcry_aes_cfb_enc (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned int burn_depth = 0;
+ rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn;
+
+ if (ctx->prefetch_enc_fn)
+ ctx->prefetch_enc_fn();
+
+ for ( ;nblocks; nblocks-- )
+ {
+ /* Encrypt the IV. */
+ burn_depth = encrypt_fn (ctx, iv, iv);
+ /* XOR the input with the IV and store input into IV. */
+ cipher_block_xor_2dst(outbuf, iv, inbuf, BLOCKSIZE);
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ if (burn_depth)
+ _gcry_burn_stack (burn_depth + 4 * sizeof(void *));
+}
+
+
+/* Bulk encryption of complete blocks in CBC mode. Caller needs to
+ make sure that IV is aligned on an unsigned long boundary. This
+ function is only intended for the bulk encryption feature of
+ cipher.c. */
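+/* When CBC_MAC is set, the output pointer is not advanced below, so each
+   ciphertext block overwrites the previous one and only the last block,
+   i.e. the CBC-MAC value, remains in the output buffer.  */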
+static void
+_gcry_aes_cbc_enc (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int cbc_mac)
+{
+ RIJNDAEL_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned char *last_iv;
+ unsigned int burn_depth = 0;
+ rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn;
+
+ if (ctx->prefetch_enc_fn)
+ ctx->prefetch_enc_fn();
+
+ last_iv = iv;
+
+ for ( ;nblocks; nblocks-- )
+ {
+ cipher_block_xor(outbuf, inbuf, last_iv, BLOCKSIZE);
+
+ burn_depth = encrypt_fn (ctx, outbuf, outbuf);
+
+ last_iv = outbuf;
+ inbuf += BLOCKSIZE;
+ if (!cbc_mac)
+ outbuf += BLOCKSIZE;
+ }
+
+ if (last_iv != iv)
+ cipher_block_cpy (iv, last_iv, BLOCKSIZE);
+
+ if (burn_depth)
+ _gcry_burn_stack (burn_depth + 4 * sizeof(void *));
+}
+
+
+/* Bulk encryption of complete blocks in CTR mode. Caller needs to
+   make sure that CTR is aligned on a 16 byte boundary when AES-NI is
+   used; otherwise the minimum alignment is that of an u32.  This
+   function is only intended
+ for the bulk encryption feature of cipher.c. CTR is expected to be
+ of size BLOCKSIZE. */
+static void
+_gcry_aes_ctr_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned int burn_depth = 0;
+ union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } tmp;
+ rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn;
+
+ if (ctx->prefetch_enc_fn)
+ ctx->prefetch_enc_fn();
+
+ for ( ;nblocks; nblocks-- )
+ {
+ /* Encrypt the counter. */
+ burn_depth = encrypt_fn (ctx, tmp.x1, ctr);
+ /* XOR the input with the encrypted counter and store in output. */
+ cipher_block_xor(outbuf, tmp.x1, inbuf, BLOCKSIZE);
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ /* Increment the counter. */
+ cipher_block_add(ctr, 1, BLOCKSIZE);
+ }
+
+ wipememory(&tmp, sizeof(tmp));
+
+ if (burn_depth)
+ _gcry_burn_stack (burn_depth + 4 * sizeof(void *));
+}
+
+
+
+#if !defined(USE_ARM_ASM) && !defined(USE_AMD64_ASM)
+/* Decrypt one block. A and B may be the same. */
+static unsigned int
+do_decrypt_fn (const RIJNDAEL_context *ctx, unsigned char *b,
+ const unsigned char *a)
+{
+#define rk (ctx->keyschdec32)
+ int rounds = ctx->rounds;
+ int r;
+ u32 sa[4];
+ u32 sb[4];
+
+ sb[0] = buf_get_le32(a + 0);
+ sb[1] = buf_get_le32(a + 4);
+ sb[2] = buf_get_le32(a + 8);
+ sb[3] = buf_get_le32(a + 12);
+
+ sa[0] = sb[0] ^ rk[rounds][0];
+ sa[1] = sb[1] ^ rk[rounds][1];
+ sa[2] = sb[2] ^ rk[rounds][2];
+ sa[3] = sb[3] ^ rk[rounds][3];
+
+ for (r = rounds - 1; r > 1; r--)
+ {
+ sb[0] = rol(decT[(byte)(sa[0] >> (0 * 8))], (0 * 8));
+ sb[1] = rol(decT[(byte)(sa[0] >> (1 * 8))], (1 * 8));
+ sb[2] = rol(decT[(byte)(sa[0] >> (2 * 8))], (2 * 8));
+ sb[3] = rol(decT[(byte)(sa[0] >> (3 * 8))], (3 * 8));
+ sa[0] = rk[r][0] ^ sb[0];
+
+ sb[1] ^= rol(decT[(byte)(sa[1] >> (0 * 8))], (0 * 8));
+ sb[2] ^= rol(decT[(byte)(sa[1] >> (1 * 8))], (1 * 8));
+ sb[3] ^= rol(decT[(byte)(sa[1] >> (2 * 8))], (2 * 8));
+ sa[0] ^= rol(decT[(byte)(sa[1] >> (3 * 8))], (3 * 8));
+ sa[1] = rk[r][1] ^ sb[1];
+
+ sb[2] ^= rol(decT[(byte)(sa[2] >> (0 * 8))], (0 * 8));
+ sb[3] ^= rol(decT[(byte)(sa[2] >> (1 * 8))], (1 * 8));
+ sa[0] ^= rol(decT[(byte)(sa[2] >> (2 * 8))], (2 * 8));
+ sa[1] ^= rol(decT[(byte)(sa[2] >> (3 * 8))], (3 * 8));
+ sa[2] = rk[r][2] ^ sb[2];
+
+ sb[3] ^= rol(decT[(byte)(sa[3] >> (0 * 8))], (0 * 8));
+ sa[0] ^= rol(decT[(byte)(sa[3] >> (1 * 8))], (1 * 8));
+ sa[1] ^= rol(decT[(byte)(sa[3] >> (2 * 8))], (2 * 8));
+ sa[2] ^= rol(decT[(byte)(sa[3] >> (3 * 8))], (3 * 8));
+ sa[3] = rk[r][3] ^ sb[3];
+
+ r--;
+
+ sb[0] = rol(decT[(byte)(sa[0] >> (0 * 8))], (0 * 8));
+ sb[1] = rol(decT[(byte)(sa[0] >> (1 * 8))], (1 * 8));
+ sb[2] = rol(decT[(byte)(sa[0] >> (2 * 8))], (2 * 8));
+ sb[3] = rol(decT[(byte)(sa[0] >> (3 * 8))], (3 * 8));
+ sa[0] = rk[r][0] ^ sb[0];
+
+ sb[1] ^= rol(decT[(byte)(sa[1] >> (0 * 8))], (0 * 8));
+ sb[2] ^= rol(decT[(byte)(sa[1] >> (1 * 8))], (1 * 8));
+ sb[3] ^= rol(decT[(byte)(sa[1] >> (2 * 8))], (2 * 8));
+ sa[0] ^= rol(decT[(byte)(sa[1] >> (3 * 8))], (3 * 8));
+ sa[1] = rk[r][1] ^ sb[1];
+
+ sb[2] ^= rol(decT[(byte)(sa[2] >> (0 * 8))], (0 * 8));
+ sb[3] ^= rol(decT[(byte)(sa[2] >> (1 * 8))], (1 * 8));
+ sa[0] ^= rol(decT[(byte)(sa[2] >> (2 * 8))], (2 * 8));
+ sa[1] ^= rol(decT[(byte)(sa[2] >> (3 * 8))], (3 * 8));
+ sa[2] = rk[r][2] ^ sb[2];
+
+ sb[3] ^= rol(decT[(byte)(sa[3] >> (0 * 8))], (0 * 8));
+ sa[0] ^= rol(decT[(byte)(sa[3] >> (1 * 8))], (1 * 8));
+ sa[1] ^= rol(decT[(byte)(sa[3] >> (2 * 8))], (2 * 8));
+ sa[2] ^= rol(decT[(byte)(sa[3] >> (3 * 8))], (3 * 8));
+ sa[3] = rk[r][3] ^ sb[3];
+ }
+
+ sb[0] = rol(decT[(byte)(sa[0] >> (0 * 8))], (0 * 8));
+ sb[1] = rol(decT[(byte)(sa[0] >> (1 * 8))], (1 * 8));
+ sb[2] = rol(decT[(byte)(sa[0] >> (2 * 8))], (2 * 8));
+ sb[3] = rol(decT[(byte)(sa[0] >> (3 * 8))], (3 * 8));
+ sa[0] = rk[1][0] ^ sb[0];
+
+ sb[1] ^= rol(decT[(byte)(sa[1] >> (0 * 8))], (0 * 8));
+ sb[2] ^= rol(decT[(byte)(sa[1] >> (1 * 8))], (1 * 8));
+ sb[3] ^= rol(decT[(byte)(sa[1] >> (2 * 8))], (2 * 8));
+ sa[0] ^= rol(decT[(byte)(sa[1] >> (3 * 8))], (3 * 8));
+ sa[1] = rk[1][1] ^ sb[1];
+
+ sb[2] ^= rol(decT[(byte)(sa[2] >> (0 * 8))], (0 * 8));
+ sb[3] ^= rol(decT[(byte)(sa[2] >> (1 * 8))], (1 * 8));
+ sa[0] ^= rol(decT[(byte)(sa[2] >> (2 * 8))], (2 * 8));
+ sa[1] ^= rol(decT[(byte)(sa[2] >> (3 * 8))], (3 * 8));
+ sa[2] = rk[1][2] ^ sb[2];
+
+ sb[3] ^= rol(decT[(byte)(sa[3] >> (0 * 8))], (0 * 8));
+ sa[0] ^= rol(decT[(byte)(sa[3] >> (1 * 8))], (1 * 8));
+ sa[1] ^= rol(decT[(byte)(sa[3] >> (2 * 8))], (2 * 8));
+ sa[2] ^= rol(decT[(byte)(sa[3] >> (3 * 8))], (3 * 8));
+ sa[3] = rk[1][3] ^ sb[3];
+
+ /* Last round is special. */
+ sb[0] = (u32)inv_sbox[(byte)(sa[0] >> (0 * 8))] << (0 * 8);
+ sb[1] = (u32)inv_sbox[(byte)(sa[0] >> (1 * 8))] << (1 * 8);
+ sb[2] = (u32)inv_sbox[(byte)(sa[0] >> (2 * 8))] << (2 * 8);
+ sb[3] = (u32)inv_sbox[(byte)(sa[0] >> (3 * 8))] << (3 * 8);
+ sa[0] = sb[0] ^ rk[0][0];
+
+ sb[1] ^= (u32)inv_sbox[(byte)(sa[1] >> (0 * 8))] << (0 * 8);
+ sb[2] ^= (u32)inv_sbox[(byte)(sa[1] >> (1 * 8))] << (1 * 8);
+ sb[3] ^= (u32)inv_sbox[(byte)(sa[1] >> (2 * 8))] << (2 * 8);
+ sa[0] ^= (u32)inv_sbox[(byte)(sa[1] >> (3 * 8))] << (3 * 8);
+ sa[1] = sb[1] ^ rk[0][1];
+
+ sb[2] ^= (u32)inv_sbox[(byte)(sa[2] >> (0 * 8))] << (0 * 8);
+ sb[3] ^= (u32)inv_sbox[(byte)(sa[2] >> (1 * 8))] << (1 * 8);
+ sa[0] ^= (u32)inv_sbox[(byte)(sa[2] >> (2 * 8))] << (2 * 8);
+ sa[1] ^= (u32)inv_sbox[(byte)(sa[2] >> (3 * 8))] << (3 * 8);
+ sa[2] = sb[2] ^ rk[0][2];
+
+ sb[3] ^= (u32)inv_sbox[(byte)(sa[3] >> (0 * 8))] << (0 * 8);
+ sa[0] ^= (u32)inv_sbox[(byte)(sa[3] >> (1 * 8))] << (1 * 8);
+ sa[1] ^= (u32)inv_sbox[(byte)(sa[3] >> (2 * 8))] << (2 * 8);
+ sa[2] ^= (u32)inv_sbox[(byte)(sa[3] >> (3 * 8))] << (3 * 8);
+ sa[3] = sb[3] ^ rk[0][3];
+
+ buf_put_le32(b + 0, sa[0]);
+ buf_put_le32(b + 4, sa[1]);
+ buf_put_le32(b + 8, sa[2]);
+ buf_put_le32(b + 12, sa[3]);
+#undef rk
+
+ return (56+2*sizeof(int));
+}
+#endif /*!USE_ARM_ASM && !USE_AMD64_ASM*/
+
+
+/* Decrypt one block. AX and BX may be the same. */
+static unsigned int
+do_decrypt (const RIJNDAEL_context *ctx, unsigned char *bx,
+ const unsigned char *ax)
+{
+#ifdef USE_AMD64_ASM
+ return _gcry_aes_amd64_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds,
+ dec_tables.T);
+#elif defined(USE_ARM_ASM)
+ return _gcry_aes_arm_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds,
+ dec_tables.T);
+#else
+ return do_decrypt_fn (ctx, bx, ax);
+#endif /*!USE_ARM_ASM && !USE_AMD64_ASM*/
+}
+
+
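+/* Prepare the decryption key schedule lazily on first use, presumably so
+   that key setup stays cheap for contexts that only ever encrypt.  */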
+static inline void
+check_decryption_preparation (RIJNDAEL_context *ctx)
+{
+ if ( !ctx->decryption_prepared )
+ {
+ ctx->prepare_decryption ( ctx );
+ ctx->decryption_prepared = 1;
+ }
+}
+
+
+static unsigned int
+rijndael_decrypt (void *context, byte *b, const byte *a)
+{
+ RIJNDAEL_context *ctx = context;
+
+ check_decryption_preparation (ctx);
+
+ if (ctx->prefetch_dec_fn)
+ ctx->prefetch_dec_fn();
+
+ return ctx->decrypt_fn (ctx, b, a);
+}
+
+
+/* Bulk decryption of complete blocks in CFB mode. Caller needs to
+ make sure that IV is aligned on an unsigned long boundary. This
+ function is only intended for the bulk encryption feature of
+ cipher.c. */
+static void
+_gcry_aes_cfb_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned int burn_depth = 0;
+ rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn;
+
+ if (ctx->prefetch_enc_fn)
+ ctx->prefetch_enc_fn();
+
+ for ( ;nblocks; nblocks-- )
+ {
+ burn_depth = encrypt_fn (ctx, iv, iv);
+ cipher_block_xor_n_copy(outbuf, iv, inbuf, BLOCKSIZE);
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ if (burn_depth)
+ _gcry_burn_stack (burn_depth + 4 * sizeof(void *));
+}
+
+
+/* Bulk decryption of complete blocks in CBC mode. Caller needs to
+ make sure that IV is aligned on an unsigned long boundary. This
+ function is only intended for the bulk encryption feature of
+ cipher.c. */
+static void
+_gcry_aes_cbc_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned int burn_depth = 0;
+ unsigned char savebuf[BLOCKSIZE] ATTR_ALIGNED_16;
+ rijndael_cryptfn_t decrypt_fn = ctx->decrypt_fn;
+
+ check_decryption_preparation (ctx);
+
+ if (ctx->prefetch_dec_fn)
+ ctx->prefetch_dec_fn();
+
+ for ( ;nblocks; nblocks-- )
+ {
+ /* INBUF is needed later and it may be identical to OUTBUF, so store
+ the intermediate result to SAVEBUF. */
+
+ burn_depth = decrypt_fn (ctx, savebuf, inbuf);
+
+ cipher_block_xor_n_copy_2(outbuf, savebuf, iv, inbuf, BLOCKSIZE);
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ wipememory(savebuf, sizeof(savebuf));
+
+ if (burn_depth)
+ _gcry_burn_stack (burn_depth + 4 * sizeof(void *));
+}
+
+
+
+/* Bulk encryption/decryption of complete blocks in OCB mode. */
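+/* Per block this follows the OCB definition (RFC 7253): the offset is
+   advanced by L_{ntz(i)}, the block cipher is applied between two XORs
+   with the current offset, and the plaintext is XORed into the running
+   checksum kept in u_ctr.ctr.  */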
+static size_t
+_gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned int burn_depth = 0;
+
+ if (encrypt)
+ {
+ union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
+ rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn;
+
+ if (ctx->prefetch_enc_fn)
+ ctx->prefetch_enc_fn();
+
+ for ( ;nblocks; nblocks-- )
+ {
+ u64 i = ++c->u_mode.ocb.data_nblocks;
+ const unsigned char *l = ocb_get_l(c, i);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ cipher_block_xor_1 (c->u_iv.iv, l, BLOCKSIZE);
+ cipher_block_cpy (l_tmp.x1, inbuf, BLOCKSIZE);
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ cipher_block_xor_1 (c->u_ctr.ctr, l_tmp.x1, BLOCKSIZE);
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ cipher_block_xor_1 (l_tmp.x1, c->u_iv.iv, BLOCKSIZE);
+ burn_depth = encrypt_fn (ctx, l_tmp.x1, l_tmp.x1);
+ cipher_block_xor_1 (l_tmp.x1, c->u_iv.iv, BLOCKSIZE);
+ cipher_block_cpy (outbuf, l_tmp.x1, BLOCKSIZE);
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+ }
+ else
+ {
+ union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
+ rijndael_cryptfn_t decrypt_fn = ctx->decrypt_fn;
+
+ check_decryption_preparation (ctx);
+
+ if (ctx->prefetch_dec_fn)
+ ctx->prefetch_dec_fn();
+
+ for ( ;nblocks; nblocks-- )
+ {
+ u64 i = ++c->u_mode.ocb.data_nblocks;
+ const unsigned char *l = ocb_get_l(c, i);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ cipher_block_xor_1 (c->u_iv.iv, l, BLOCKSIZE);
+ cipher_block_cpy (l_tmp.x1, inbuf, BLOCKSIZE);
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ cipher_block_xor_1 (l_tmp.x1, c->u_iv.iv, BLOCKSIZE);
+ burn_depth = decrypt_fn (ctx, l_tmp.x1, l_tmp.x1);
+ cipher_block_xor_1 (l_tmp.x1, c->u_iv.iv, BLOCKSIZE);
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ cipher_block_xor_1 (c->u_ctr.ctr, l_tmp.x1, BLOCKSIZE);
+ cipher_block_cpy (outbuf, l_tmp.x1, BLOCKSIZE);
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+ }
+
+ if (burn_depth)
+ _gcry_burn_stack (burn_depth + 4 * sizeof(void *));
+
+ return 0;
+}
+
+
+/* Bulk authentication of complete blocks in OCB mode. */
+static size_t
+_gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ const unsigned char *abuf = abuf_arg;
+ unsigned int burn_depth = 0;
+ union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
+ rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn;
+
+ if (ctx->prefetch_enc_fn)
+ ctx->prefetch_enc_fn();
+
+ for ( ;nblocks; nblocks-- )
+ {
+ u64 i = ++c->u_mode.ocb.aad_nblocks;
+ const unsigned char *l = ocb_get_l(c, i);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ cipher_block_xor_1 (c->u_mode.ocb.aad_offset, l, BLOCKSIZE);
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ cipher_block_xor (l_tmp.x1, c->u_mode.ocb.aad_offset, abuf,
+ BLOCKSIZE);
+ burn_depth = encrypt_fn (ctx, l_tmp.x1, l_tmp.x1);
+ cipher_block_xor_1 (c->u_mode.ocb.aad_sum, l_tmp.x1, BLOCKSIZE);
+
+ abuf += BLOCKSIZE;
+ }
+
+ wipememory(&l_tmp, sizeof(l_tmp));
+
+ if (burn_depth)
+ _gcry_burn_stack (burn_depth + 4 * sizeof(void *));
+
+ return 0;
+}
+
+
+/* Bulk encryption/decryption of complete blocks in XTS mode. */
+static void
+_gcry_aes_xts_crypt (void *context, unsigned char *tweak,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt)
+{
+ RIJNDAEL_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned int burn_depth = 0;
+ rijndael_cryptfn_t crypt_fn;
+ u64 tweak_lo, tweak_hi, tweak_next_lo, tweak_next_hi, tmp_lo, tmp_hi, carry;
+
+ if (encrypt)
+ {
+ if (ctx->prefetch_enc_fn)
+ ctx->prefetch_enc_fn();
+
+ crypt_fn = ctx->encrypt_fn;
+ }
+ else
+ {
+ check_decryption_preparation (ctx);
+
+ if (ctx->prefetch_dec_fn)
+ ctx->prefetch_dec_fn();
+
+ crypt_fn = ctx->decrypt_fn;
+ }
+
+ tweak_next_lo = buf_get_le64 (tweak + 0);
+ tweak_next_hi = buf_get_le64 (tweak + 8);
+
+ while (nblocks)
+ {
+ tweak_lo = tweak_next_lo;
+ tweak_hi = tweak_next_hi;
+
+ /* Xor-Encrypt/Decrypt-Xor block. */
+ tmp_lo = buf_get_le64 (inbuf + 0) ^ tweak_lo;
+ tmp_hi = buf_get_le64 (inbuf + 8) ^ tweak_hi;
+
+ buf_put_le64 (outbuf + 0, tmp_lo);
+ buf_put_le64 (outbuf + 8, tmp_hi);
+
+ /* Generate next tweak. */
+ carry = -(tweak_next_hi >> 63) & 0x87;
+ tweak_next_hi = (tweak_next_hi << 1) + (tweak_next_lo >> 63);
+ tweak_next_lo = (tweak_next_lo << 1) ^ carry;
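+      /* The three lines above multiply the 128-bit tweak by x in
+         GF(2^128): shift left by one bit across both 64-bit halves and,
+         if a bit falls out at the top, reduce it with the XTS polynomial
+         x^128 + x^7 + x^2 + x + 1 (hence the 0x87 constant).  */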
+
+ burn_depth = crypt_fn (ctx, outbuf, outbuf);
+
+ buf_put_le64 (outbuf + 0, buf_get_le64 (outbuf + 0) ^ tweak_lo);
+ buf_put_le64 (outbuf + 8, buf_get_le64 (outbuf + 8) ^ tweak_hi);
+
+ outbuf += GCRY_XTS_BLOCK_LEN;
+ inbuf += GCRY_XTS_BLOCK_LEN;
+ nblocks--;
+ }
+
+ buf_put_le64 (tweak + 0, tweak_next_lo);
+ buf_put_le64 (tweak + 8, tweak_next_hi);
+
+ if (burn_depth)
+ _gcry_burn_stack (burn_depth + 5 * sizeof(void *));
+}
+
+
+/* Run the self-tests for AES 128. Returns NULL on success. */
+static const char*
+selftest_basic_128 (void)
+{
+ RIJNDAEL_context *ctx;
+ unsigned char *ctxmem;
+ unsigned char scratch[16];
+ cipher_bulk_ops_t bulk_ops;
+
+ /* The test vectors are from the AES supplied ones; more or less
+ randomly taken from ecb_tbl.txt (I=42,81,14) */
+#if 1
+ static const unsigned char plaintext_128[16] =
+ {
+ 0x01,0x4B,0xAF,0x22,0x78,0xA6,0x9D,0x33,
+ 0x1D,0x51,0x80,0x10,0x36,0x43,0xE9,0x9A
+ };
+ static const unsigned char key_128[16] =
+ {
+ 0xE8,0xE9,0xEA,0xEB,0xED,0xEE,0xEF,0xF0,
+ 0xF2,0xF3,0xF4,0xF5,0xF7,0xF8,0xF9,0xFA
+ };
+ static const unsigned char ciphertext_128[16] =
+ {
+ 0x67,0x43,0xC3,0xD1,0x51,0x9A,0xB4,0xF2,
+ 0xCD,0x9A,0x78,0xAB,0x09,0xA5,0x11,0xBD
+ };
+#else
+ /* Test vectors from fips-197, appendix C. */
+# warning debug test vectors in use
+ static const unsigned char plaintext_128[16] =
+ {
+ 0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,
+ 0x88,0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff
+ };
+ static const unsigned char key_128[16] =
+ {
+ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
+ 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ /* 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, */
+ /* 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c */
+ };
+ static const unsigned char ciphertext_128[16] =
+ {
+ 0x69,0xc4,0xe0,0xd8,0x6a,0x7b,0x04,0x30,
+ 0xd8,0xcd,0xb7,0x80,0x70,0xb4,0xc5,0x5a
+ };
+#endif
+
+ /* Because gcc/ld can only align the CTX struct on 8 bytes on the
+ stack, we need to allocate that context on the heap. */
+ ctx = _gcry_cipher_selftest_alloc_ctx (sizeof *ctx, &ctxmem);
+ if (!ctx)
+ return "failed to allocate memory";
+
+ rijndael_setkey (ctx, key_128, sizeof (key_128), &bulk_ops);
+ rijndael_encrypt (ctx, scratch, plaintext_128);
+ if (memcmp (scratch, ciphertext_128, sizeof (ciphertext_128)))
+ {
+ xfree (ctxmem);
+ return "AES-128 test encryption failed.";
+ }
+ rijndael_decrypt (ctx, scratch, scratch);
+ xfree (ctxmem);
+ if (memcmp (scratch, plaintext_128, sizeof (plaintext_128)))
+ return "AES-128 test decryption failed.";
+
+ return NULL;
+}
+
+/* Run the self-tests for AES 192. Returns NULL on success. */
+static const char*
+selftest_basic_192 (void)
+{
+ RIJNDAEL_context *ctx;
+ unsigned char *ctxmem;
+ unsigned char scratch[16];
+ cipher_bulk_ops_t bulk_ops;
+
+ static unsigned char plaintext_192[16] =
+ {
+ 0x76,0x77,0x74,0x75,0xF1,0xF2,0xF3,0xF4,
+ 0xF8,0xF9,0xE6,0xE7,0x77,0x70,0x71,0x72
+ };
+ static unsigned char key_192[24] =
+ {
+ 0x04,0x05,0x06,0x07,0x09,0x0A,0x0B,0x0C,
+ 0x0E,0x0F,0x10,0x11,0x13,0x14,0x15,0x16,
+ 0x18,0x19,0x1A,0x1B,0x1D,0x1E,0x1F,0x20
+ };
+ static const unsigned char ciphertext_192[16] =
+ {
+ 0x5D,0x1E,0xF2,0x0D,0xCE,0xD6,0xBC,0xBC,
+ 0x12,0x13,0x1A,0xC7,0xC5,0x47,0x88,0xAA
+ };
+
+ ctx = _gcry_cipher_selftest_alloc_ctx (sizeof *ctx, &ctxmem);
+ if (!ctx)
+ return "failed to allocate memory";
+ rijndael_setkey (ctx, key_192, sizeof(key_192), &bulk_ops);
+ rijndael_encrypt (ctx, scratch, plaintext_192);
+ if (memcmp (scratch, ciphertext_192, sizeof (ciphertext_192)))
+ {
+ xfree (ctxmem);
+ return "AES-192 test encryption failed.";
+ }
+ rijndael_decrypt (ctx, scratch, scratch);
+ xfree (ctxmem);
+ if (memcmp (scratch, plaintext_192, sizeof (plaintext_192)))
+ return "AES-192 test decryption failed.";
+
+ return NULL;
+}
+
+
+/* Run the self-tests for AES 256. Returns NULL on success. */
+static const char*
+selftest_basic_256 (void)
+{
+ RIJNDAEL_context *ctx;
+ unsigned char *ctxmem;
+ unsigned char scratch[16];
+ cipher_bulk_ops_t bulk_ops;
+
+ static unsigned char plaintext_256[16] =
+ {
+ 0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+ 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21
+ };
+ static unsigned char key_256[32] =
+ {
+ 0x08,0x09,0x0A,0x0B,0x0D,0x0E,0x0F,0x10,
+ 0x12,0x13,0x14,0x15,0x17,0x18,0x19,0x1A,
+ 0x1C,0x1D,0x1E,0x1F,0x21,0x22,0x23,0x24,
+ 0x26,0x27,0x28,0x29,0x2B,0x2C,0x2D,0x2E
+ };
+ static const unsigned char ciphertext_256[16] =
+ {
+ 0x08,0x0E,0x95,0x17,0xEB,0x16,0x77,0x71,
+ 0x9A,0xCF,0x72,0x80,0x86,0x04,0x0A,0xE3
+ };
+
+ ctx = _gcry_cipher_selftest_alloc_ctx (sizeof *ctx, &ctxmem);
+ if (!ctx)
+ return "failed to allocate memory";
+ rijndael_setkey (ctx, key_256, sizeof(key_256), &bulk_ops);
+ rijndael_encrypt (ctx, scratch, plaintext_256);
+ if (memcmp (scratch, ciphertext_256, sizeof (ciphertext_256)))
+ {
+ xfree (ctxmem);
+ return "AES-256 test encryption failed.";
+ }
+ rijndael_decrypt (ctx, scratch, scratch);
+ xfree (ctxmem);
+ if (memcmp (scratch, plaintext_256, sizeof (plaintext_256)))
+ return "AES-256 test decryption failed.";
+
+ return NULL;
+}
+
+
+/* Run the self-tests for AES-CTR-128; this tests the IV increment of
+   bulk CTR encryption.  Returns NULL on success. */
+static const char*
+selftest_ctr_128 (void)
+{
+ const int nblocks = 8+1;
+ const int blocksize = BLOCKSIZE;
+ const int context_size = sizeof(RIJNDAEL_context);
+
+ return _gcry_selftest_helper_ctr("AES", &rijndael_setkey,
+ &rijndael_encrypt, nblocks, blocksize, context_size);
+}
+
+
+/* Run the self-tests for AES-CBC-128; this tests bulk CBC decryption.
+   Returns NULL on success. */
+static const char*
+selftest_cbc_128 (void)
+{
+ const int nblocks = 8+2;
+ const int blocksize = BLOCKSIZE;
+ const int context_size = sizeof(RIJNDAEL_context);
+
+ return _gcry_selftest_helper_cbc("AES", &rijndael_setkey,
+ &rijndael_encrypt, nblocks, blocksize, context_size);
+}
+
+
+/* Run the self-tests for AES-CFB-128; this tests bulk CFB decryption.
+   Returns NULL on success. */
+static const char*
+selftest_cfb_128 (void)
+{
+ const int nblocks = 8+2;
+ const int blocksize = BLOCKSIZE;
+ const int context_size = sizeof(RIJNDAEL_context);
+
+ return _gcry_selftest_helper_cfb("AES", &rijndael_setkey,
+ &rijndael_encrypt, nblocks, blocksize, context_size);
+}
+
+
+/* Run all the self-tests and return NULL on success. This function
+ is used for the on-the-fly self-tests. */
+static const char *
+selftest (void)
+{
+ const char *r;
+
+ if ( (r = selftest_basic_128 ())
+ || (r = selftest_basic_192 ())
+ || (r = selftest_basic_256 ()) )
+ return r;
+
+ if ( (r = selftest_ctr_128 ()) )
+ return r;
+
+ if ( (r = selftest_cbc_128 ()) )
+ return r;
+
+ if ( (r = selftest_cfb_128 ()) )
+ return r;
+
+ return r;
+}
+
+
+/* Self-tests for AES-128 based on the test vectors of NIST SP 800-38A. */
+static const char *
+selftest_fips_128_38a (int requested_mode)
+{
+ static const struct tv
+ {
+ int mode;
+ const unsigned char key[16];
+ const unsigned char iv[16];
+ struct
+ {
+ const unsigned char input[16];
+ const unsigned char output[16];
+ } data[4];
+ } tv[2] =
+ {
+ {
+ GCRY_CIPHER_MODE_CFB, /* F.3.13, CFB128-AES128 */
+ { 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6,
+ 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c },
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+ {
+ { { 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96,
+ 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a },
+ { 0x3b, 0x3f, 0xd9, 0x2e, 0xb7, 0x2d, 0xad, 0x20,
+ 0x33, 0x34, 0x49, 0xf8, 0xe8, 0x3c, 0xfb, 0x4a } },
+
+ { { 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c,
+ 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51 },
+ { 0xc8, 0xa6, 0x45, 0x37, 0xa0, 0xb3, 0xa9, 0x3f,
+ 0xcd, 0xe3, 0xcd, 0xad, 0x9f, 0x1c, 0xe5, 0x8b } },
+
+ { { 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11,
+ 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef },
+ { 0x26, 0x75, 0x1f, 0x67, 0xa3, 0xcb, 0xb1, 0x40,
+ 0xb1, 0x80, 0x8c, 0xf1, 0x87, 0xa4, 0xf4, 0xdf } },
+
+ { { 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17,
+ 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10 },
+ { 0xc0, 0x4b, 0x05, 0x35, 0x7c, 0x5d, 0x1c, 0x0e,
+ 0xea, 0xc4, 0xc6, 0x6f, 0x9f, 0xf7, 0xf2, 0xe6 } }
+ }
+ },
+ {
+ GCRY_CIPHER_MODE_OFB,
+ { 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6,
+ 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c },
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+ {
+ { { 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96,
+ 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a },
+ { 0x3b, 0x3f, 0xd9, 0x2e, 0xb7, 0x2d, 0xad, 0x20,
+ 0x33, 0x34, 0x49, 0xf8, 0xe8, 0x3c, 0xfb, 0x4a } },
+
+ { { 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c,
+ 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51 },
+ { 0x77, 0x89, 0x50, 0x8d, 0x16, 0x91, 0x8f, 0x03,
+ 0xf5, 0x3c, 0x52, 0xda, 0xc5, 0x4e, 0xd8, 0x25 } },
+
+ { { 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11,
+ 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef },
+ { 0x97, 0x40, 0x05, 0x1e, 0x9c, 0x5f, 0xec, 0xf6,
+ 0x43, 0x44, 0xf7, 0xa8, 0x22, 0x60, 0xed, 0xcc } },
+
+ { { 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17,
+ 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10 },
+ { 0x30, 0x4c, 0x65, 0x28, 0xf6, 0x59, 0xc7, 0x78,
+ 0x66, 0xa5, 0x10, 0xd9, 0xc1, 0xd6, 0xae, 0x5e } },
+ }
+ }
+ };
+ unsigned char scratch[16];
+ gpg_error_t err;
+ int tvi, idx;
+ gcry_cipher_hd_t hdenc = NULL;
+ gcry_cipher_hd_t hddec = NULL;
+
+#define Fail(a) do { \
+ _gcry_cipher_close (hdenc); \
+ _gcry_cipher_close (hddec); \
+ return a; \
+ } while (0)
+
+ gcry_assert (sizeof tv[0].data[0].input == sizeof scratch);
+ gcry_assert (sizeof tv[0].data[0].output == sizeof scratch);
+
+ for (tvi=0; tvi < DIM (tv); tvi++)
+ if (tv[tvi].mode == requested_mode)
+ break;
+ if (tvi == DIM (tv))
+ Fail ("no test data for this mode");
+
+ err = _gcry_cipher_open (&hdenc, GCRY_CIPHER_AES, tv[tvi].mode, 0);
+ if (err)
+ Fail ("open");
+ err = _gcry_cipher_open (&hddec, GCRY_CIPHER_AES, tv[tvi].mode, 0);
+ if (err)
+ Fail ("open");
+ err = _gcry_cipher_setkey (hdenc, tv[tvi].key, sizeof tv[tvi].key);
+ if (!err)
+ err = _gcry_cipher_setkey (hddec, tv[tvi].key, sizeof tv[tvi].key);
+ if (err)
+ Fail ("set key");
+ err = _gcry_cipher_setiv (hdenc, tv[tvi].iv, sizeof tv[tvi].iv);
+ if (!err)
+ err = _gcry_cipher_setiv (hddec, tv[tvi].iv, sizeof tv[tvi].iv);
+ if (err)
+ Fail ("set IV");
+ for (idx=0; idx < DIM (tv[tvi].data); idx++)
+ {
+ err = _gcry_cipher_encrypt (hdenc, scratch, sizeof scratch,
+ tv[tvi].data[idx].input,
+ sizeof tv[tvi].data[idx].input);
+ if (err)
+ Fail ("encrypt command");
+ if (memcmp (scratch, tv[tvi].data[idx].output, sizeof scratch))
+ Fail ("encrypt mismatch");
+ err = _gcry_cipher_decrypt (hddec, scratch, sizeof scratch,
+ tv[tvi].data[idx].output,
+ sizeof tv[tvi].data[idx].output);
+ if (err)
+ Fail ("decrypt command");
+ if (memcmp (scratch, tv[tvi].data[idx].input, sizeof scratch))
+ Fail ("decrypt mismatch");
+ }
+
+#undef Fail
+ _gcry_cipher_close (hdenc);
+ _gcry_cipher_close (hddec);
+ return NULL;
+}
+
+
+/* Complete selftest for AES-128 with all modes and driver code. */
+static gpg_err_code_t
+selftest_fips_128 (int extended, selftest_report_func_t report)
+{
+ const char *what;
+ const char *errtxt;
+
+ what = "low-level";
+ errtxt = selftest_basic_128 ();
+ if (errtxt)
+ goto failed;
+
+ if (extended)
+ {
+ what = "cfb";
+ errtxt = selftest_fips_128_38a (GCRY_CIPHER_MODE_CFB);
+ if (errtxt)
+ goto failed;
+
+ what = "ofb";
+ errtxt = selftest_fips_128_38a (GCRY_CIPHER_MODE_OFB);
+ if (errtxt)
+ goto failed;
+ }
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("cipher", GCRY_CIPHER_AES128, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+/* Complete selftest for AES-192. */
+static gpg_err_code_t
+selftest_fips_192 (int extended, selftest_report_func_t report)
+{
+ const char *what;
+ const char *errtxt;
+
+ (void)extended; /* No extended tests available. */
+
+ what = "low-level";
+ errtxt = selftest_basic_192 ();
+ if (errtxt)
+ goto failed;
+
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("cipher", GCRY_CIPHER_AES192, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+/* Complete selftest for AES-256. */
+static gpg_err_code_t
+selftest_fips_256 (int extended, selftest_report_func_t report)
+{
+ const char *what;
+ const char *errtxt;
+
+ (void)extended; /* No extended tests available. */
+
+ what = "low-level";
+ errtxt = selftest_basic_256 ();
+ if (errtxt)
+ goto failed;
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("cipher", GCRY_CIPHER_AES256, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+
+/* Run a full self-test for ALGO and return 0 on success. */
+static gpg_err_code_t
+run_selftests (int algo, int extended, selftest_report_func_t report)
+{
+ gpg_err_code_t ec;
+
+ switch (algo)
+ {
+ case GCRY_CIPHER_AES128:
+ ec = selftest_fips_128 (extended, report);
+ break;
+ case GCRY_CIPHER_AES192:
+ ec = selftest_fips_192 (extended, report);
+ break;
+ case GCRY_CIPHER_AES256:
+ ec = selftest_fips_256 (extended, report);
+ break;
+ default:
+ ec = GPG_ERR_CIPHER_ALGO;
+ break;
+
+ }
+ return ec;
+}
+
+
+
+
+static const char *rijndael_names[] =
+ {
+ "RIJNDAEL",
+ "AES128",
+ "AES-128",
+ NULL
+ };
+
+static gcry_cipher_oid_spec_t rijndael_oids[] =
+ {
+ { "2.16.840.1.101.3.4.1.1", GCRY_CIPHER_MODE_ECB },
+ { "2.16.840.1.101.3.4.1.2", GCRY_CIPHER_MODE_CBC },
+ { "2.16.840.1.101.3.4.1.3", GCRY_CIPHER_MODE_OFB },
+ { "2.16.840.1.101.3.4.1.4", GCRY_CIPHER_MODE_CFB },
+ { NULL }
+ };
+
+gcry_cipher_spec_t _gcry_cipher_spec_aes =
+ {
+ GCRY_CIPHER_AES, {0, 1},
+ "AES", rijndael_names, rijndael_oids, 16, 128,
+ sizeof (RIJNDAEL_context),
+ rijndael_setkey, rijndael_encrypt, rijndael_decrypt,
+ NULL, NULL,
+ run_selftests
+ };
+
+
+static const char *rijndael192_names[] =
+ {
+ "RIJNDAEL192",
+ "AES-192",
+ NULL
+ };
+
+static gcry_cipher_oid_spec_t rijndael192_oids[] =
+ {
+ { "2.16.840.1.101.3.4.1.21", GCRY_CIPHER_MODE_ECB },
+ { "2.16.840.1.101.3.4.1.22", GCRY_CIPHER_MODE_CBC },
+ { "2.16.840.1.101.3.4.1.23", GCRY_CIPHER_MODE_OFB },
+ { "2.16.840.1.101.3.4.1.24", GCRY_CIPHER_MODE_CFB },
+ { NULL }
+ };
+
+gcry_cipher_spec_t _gcry_cipher_spec_aes192 =
+ {
+ GCRY_CIPHER_AES192, {0, 1},
+ "AES192", rijndael192_names, rijndael192_oids, 16, 192,
+ sizeof (RIJNDAEL_context),
+ rijndael_setkey, rijndael_encrypt, rijndael_decrypt,
+ NULL, NULL,
+ run_selftests
+ };
+
+
+static const char *rijndael256_names[] =
+ {
+ "RIJNDAEL256",
+ "AES-256",
+ NULL
+ };
+
+static gcry_cipher_oid_spec_t rijndael256_oids[] =
+ {
+ { "2.16.840.1.101.3.4.1.41", GCRY_CIPHER_MODE_ECB },
+ { "2.16.840.1.101.3.4.1.42", GCRY_CIPHER_MODE_CBC },
+ { "2.16.840.1.101.3.4.1.43", GCRY_CIPHER_MODE_OFB },
+ { "2.16.840.1.101.3.4.1.44", GCRY_CIPHER_MODE_CFB },
+ { NULL }
+ };
+
+gcry_cipher_spec_t _gcry_cipher_spec_aes256 =
+ {
+ GCRY_CIPHER_AES256, {0, 1},
+ "AES256", rijndael256_names, rijndael256_oids, 16, 256,
+ sizeof (RIJNDAEL_context),
+ rijndael_setkey, rijndael_encrypt, rijndael_decrypt,
+ NULL, NULL,
+ run_selftests
+ };
diff --git a/comm/third_party/libgcrypt/cipher/rmd160.c b/comm/third_party/libgcrypt/cipher/rmd160.c
new file mode 100644
index 0000000000..e12ff0176f
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rmd160.c
@@ -0,0 +1,529 @@
+/* rmd160.c - RIPE-MD160
+ * Copyright (C) 1998, 2001, 2002, 2003 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+#include "hash-common.h"
+#include "cipher.h" /* Only used for the rmd160_hash_buffer() prototype. */
+
+#include "bithelp.h"
+#include "bufhelp.h"
+
+/*********************************
+ * RIPEMD-160 is not patented, see (as of 25.10.97)
+ * http://www.esat.kuleuven.ac.be/~bosselae/ripemd160.html
+ * Note that the code uses Little Endian byteorder, which is good for
+ * 386 etc, but we must add some conversion when used on a big endian box.
+ *
+ *
+ * Pseudo-code for RIPEMD-160
+ *
+ * RIPEMD-160 is an iterative hash function that operates on 32-bit words.
+ * The round function takes as input a 5-word chaining variable and a 16-word
+ * message block and maps this to a new chaining variable. All operations are
+ * defined on 32-bit words. Padding is identical to that of MD4.
+ *
+ *
+ * RIPEMD-160: definitions
+ *
+ *
+ * nonlinear functions at bit level: exor, mux, -, mux, -
+ *
+ * f(j, x, y, z) = x XOR y XOR z (0 <= j <= 15)
+ * f(j, x, y, z) = (x AND y) OR (NOT(x) AND z) (16 <= j <= 31)
+ * f(j, x, y, z) = (x OR NOT(y)) XOR z (32 <= j <= 47)
+ * f(j, x, y, z) = (x AND z) OR (y AND NOT(z)) (48 <= j <= 63)
+ * f(j, x, y, z) = x XOR (y OR NOT(z)) (64 <= j <= 79)
+ *
+ *
+ * added constants (hexadecimal)
+ *
+ * K(j) = 0x00000000 (0 <= j <= 15)
+ * K(j) = 0x5A827999 (16 <= j <= 31) int(2**30 x sqrt(2))
+ * K(j) = 0x6ED9EBA1 (32 <= j <= 47) int(2**30 x sqrt(3))
+ * K(j) = 0x8F1BBCDC (48 <= j <= 63) int(2**30 x sqrt(5))
+ * K(j) = 0xA953FD4E (64 <= j <= 79) int(2**30 x sqrt(7))
+ * K'(j) = 0x50A28BE6 (0 <= j <= 15) int(2**30 x cbrt(2))
+ * K'(j) = 0x5C4DD124 (16 <= j <= 31) int(2**30 x cbrt(3))
+ * K'(j) = 0x6D703EF3 (32 <= j <= 47) int(2**30 x cbrt(5))
+ * K'(j) = 0x7A6D76E9 (48 <= j <= 63) int(2**30 x cbrt(7))
+ * K'(j) = 0x00000000 (64 <= j <= 79)
+ *
+ *
+ * selection of message word
+ *
+ * r(j) = j (0 <= j <= 15)
+ * r(16..31) = 7, 4, 13, 1, 10, 6, 15, 3, 12, 0, 9, 5, 2, 14, 11, 8
+ * r(32..47) = 3, 10, 14, 4, 9, 15, 8, 1, 2, 7, 0, 6, 13, 11, 5, 12
+ * r(48..63) = 1, 9, 11, 10, 0, 8, 12, 4, 13, 3, 7, 15, 14, 5, 6, 2
+ * r(64..79) = 4, 0, 5, 9, 7, 12, 2, 10, 14, 1, 3, 8, 11, 6, 15, 13
+ * r'(0..15) = 5, 14, 7, 0, 9, 2, 11, 4, 13, 6, 15, 8, 1, 10, 3, 12
+ * r'(16..31)= 6, 11, 3, 7, 0, 13, 5, 10, 14, 15, 8, 12, 4, 9, 1, 2
+ * r'(32..47)= 15, 5, 1, 3, 7, 14, 6, 9, 11, 8, 12, 2, 10, 0, 4, 13
+ * r'(48..63)= 8, 6, 4, 1, 3, 11, 15, 0, 5, 12, 2, 13, 9, 7, 10, 14
+ * r'(64..79)= 12, 15, 10, 4, 1, 5, 8, 7, 6, 2, 13, 14, 0, 3, 9, 11
+ *
+ *
+ * amount for rotate left (rol)
+ *
+ * s(0..15) = 11, 14, 15, 12, 5, 8, 7, 9, 11, 13, 14, 15, 6, 7, 9, 8
+ * s(16..31) = 7, 6, 8, 13, 11, 9, 7, 15, 7, 12, 15, 9, 11, 7, 13, 12
+ * s(32..47) = 11, 13, 6, 7, 14, 9, 13, 15, 14, 8, 13, 6, 5, 12, 7, 5
+ * s(48..63) = 11, 12, 14, 15, 14, 15, 9, 8, 9, 14, 5, 6, 8, 6, 5, 12
+ * s(64..79) = 9, 15, 5, 11, 6, 8, 13, 12, 5, 12, 13, 14, 11, 8, 5, 6
+ * s'(0..15) = 8, 9, 9, 11, 13, 15, 15, 5, 7, 7, 8, 11, 14, 14, 12, 6
+ * s'(16..31)= 9, 13, 15, 7, 12, 8, 9, 11, 7, 7, 12, 7, 6, 15, 13, 11
+ * s'(32..47)= 9, 7, 15, 11, 8, 6, 6, 14, 12, 13, 5, 14, 13, 13, 7, 5
+ * s'(48..63)= 15, 5, 8, 11, 14, 14, 6, 14, 6, 9, 12, 9, 12, 5, 15, 8
+ * s'(64..79)= 8, 5, 12, 9, 12, 5, 14, 6, 8, 13, 6, 5, 15, 13, 11, 11
+ *
+ *
+ * initial value (hexadecimal)
+ *
+ * h0 = 0x67452301; h1 = 0xEFCDAB89; h2 = 0x98BADCFE; h3 = 0x10325476;
+ * h4 = 0xC3D2E1F0;
+ *
+ *
+ * RIPEMD-160: pseudo-code
+ *
+ * It is assumed that the message after padding consists of t 16-word blocks
+ * that will be denoted with X[i][j], with 0 <= i <= t-1 and 0 <= j <= 15.
+ * The symbol [+] denotes addition modulo 2**32 and rol_s denotes cyclic left
+ * shift (rotate) over s positions.
+ *
+ *
+ * for i := 0 to t-1 {
+ *    A := h0; B := h1; C := h2; D := h3; E := h4;
+ *    A' := h0; B' := h1; C' := h2; D' := h3; E' := h4;
+ * for j := 0 to 79 {
+ * T := rol_s(j)(A [+] f(j, B, C, D) [+] X[i][r(j)] [+] K(j)) [+] E;
+ * A := E; E := D; D := rol_10(C); C := B; B := T;
+ * T := rol_s'(j)(A' [+] f(79-j, B', C', D') [+] X[i][r'(j)]
+ [+] K'(j)) [+] E';
+ * A' := E'; E' := D'; D' := rol_10(C'); C' := B'; B' := T;
+ * }
+ * T := h1 [+] C [+] D'; h1 := h2 [+] D [+] E'; h2 := h3 [+] E [+] A';
+ * h3 := h4 [+] A [+] B'; h4 := h0 [+] B [+] C'; h0 := T;
+ * }
+ */
+
+/* Some examples:
+ * "" 9c1185a5c5e9fc54612808977ee8f548b2258d31
+ * "a" 0bdc9d2d256b3ee9daae347be6f4dc835a467ffe
+ * "abc" 8eb208f7e05d987a9b044a8e98c6b087f15a0bfc
+ * "message digest" 5d0689ef49d2fae572b881b123a85ffa21595f36
+ * "a...z" f71c27109c692c1b56bbdceb5b9d2865b3708dbc
+ * "abcdbcde...nopq" 12a053384a9c0c88e405a06c27dcf49ada62eb2b
+ * "A...Za...z0...9" b0e20b6e3116640286ed3a87a5713079b21f5189
+ * 8 times "1234567890" 9b752e45573d4b39f4dbd3323cab82bf63326bfb
+ * 1 million times "a" 52783243c1697bdbe16d37f97f68f08325dc1528
+ */
+
+typedef struct
+{
+ gcry_md_block_ctx_t bctx;
+ u32 h0,h1,h2,h3,h4;
+} RMD160_CONTEXT;
+
+
+static unsigned int
+transform ( void *ctx, const unsigned char *data, size_t nblks );
+
+static void
+rmd160_init (void *context, unsigned int flags)
+{
+ RMD160_CONTEXT *hd = context;
+
+ (void)flags;
+
+ hd->h0 = 0x67452301;
+ hd->h1 = 0xEFCDAB89;
+ hd->h2 = 0x98BADCFE;
+ hd->h3 = 0x10325476;
+ hd->h4 = 0xC3D2E1F0;
+
+ hd->bctx.nblocks = 0;
+ hd->bctx.nblocks_high = 0;
+ hd->bctx.count = 0;
+ hd->bctx.blocksize_shift = _gcry_ctz(64);
+ hd->bctx.bwrite = transform;
+}
+
+
+/****************
+ * Transform the message X, which consists of 16 32-bit words.
+ */
+static unsigned int
+transform_blk ( void *ctx, const unsigned char *data )
+{
+ RMD160_CONTEXT *hd = ctx;
+ register u32 al, ar, bl, br, cl, cr, dl, dr, el, er;
+ u32 x[16];
+ int i;
+
+ for ( i = 0; i < 16; i++ )
+ x[i] = buf_get_le32(data + i * 4);
+
+#define K0 0x00000000
+#define K1 0x5A827999
+#define K2 0x6ED9EBA1
+#define K3 0x8F1BBCDC
+#define K4 0xA953FD4E
+#define KK0 0x50A28BE6
+#define KK1 0x5C4DD124
+#define KK2 0x6D703EF3
+#define KK3 0x7A6D76E9
+#define KK4 0x00000000
+#define F0(x,y,z) ( (x) ^ (y) ^ (z) )
+#define F1(x,y,z) ( ((x) & (y)) | (~(x) & (z)) )
+#define F2(x,y,z) ( ((x) | ~(y)) ^ (z) )
+#define F3(x,y,z) ( ((x) & (z)) | ((y) & ~(z)) )
+#define F4(x,y,z) ( (x) ^ ((y) | ~(z)) )
+#define R(a,b,c,d,e,f,k,r,s) do { a += f(b,c,d) + k + x[r]; \
+ a = rol(a,s) + e; \
+ c = rol(c,10); \
+ } while(0)
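+/* R() performs one round step of the pseudo-code in the file header:
+   T := rol_s(a + f(b,c,d) + X[r] + K) + e  followed by  c := rol_10(c);
+   instead of shuffling the five state words, the calls below rotate the
+   argument order from one invocation to the next.  */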
+
+ /* left lane and right lanes interleaved */
+ al = ar = hd->h0;
+ bl = br = hd->h1;
+ cl = cr = hd->h2;
+ dl = dr = hd->h3;
+ el = er = hd->h4;
+ R( al, bl, cl, dl, el, F0, K0, 0, 11 );
+ R( ar, br, cr, dr, er, F4, KK0, 5, 8);
+ R( el, al, bl, cl, dl, F0, K0, 1, 14 );
+ R( er, ar, br, cr, dr, F4, KK0, 14, 9);
+ R( dl, el, al, bl, cl, F0, K0, 2, 15 );
+ R( dr, er, ar, br, cr, F4, KK0, 7, 9);
+ R( cl, dl, el, al, bl, F0, K0, 3, 12 );
+ R( cr, dr, er, ar, br, F4, KK0, 0, 11);
+ R( bl, cl, dl, el, al, F0, K0, 4, 5 );
+ R( br, cr, dr, er, ar, F4, KK0, 9, 13);
+ R( al, bl, cl, dl, el, F0, K0, 5, 8 );
+ R( ar, br, cr, dr, er, F4, KK0, 2, 15);
+ R( el, al, bl, cl, dl, F0, K0, 6, 7 );
+ R( er, ar, br, cr, dr, F4, KK0, 11, 15);
+ R( dl, el, al, bl, cl, F0, K0, 7, 9 );
+ R( dr, er, ar, br, cr, F4, KK0, 4, 5);
+ R( cl, dl, el, al, bl, F0, K0, 8, 11 );
+ R( cr, dr, er, ar, br, F4, KK0, 13, 7);
+ R( bl, cl, dl, el, al, F0, K0, 9, 13 );
+ R( br, cr, dr, er, ar, F4, KK0, 6, 7);
+ R( al, bl, cl, dl, el, F0, K0, 10, 14 );
+ R( ar, br, cr, dr, er, F4, KK0, 15, 8);
+ R( el, al, bl, cl, dl, F0, K0, 11, 15 );
+ R( er, ar, br, cr, dr, F4, KK0, 8, 11);
+ R( dl, el, al, bl, cl, F0, K0, 12, 6 );
+ R( dr, er, ar, br, cr, F4, KK0, 1, 14);
+ R( cl, dl, el, al, bl, F0, K0, 13, 7 );
+ R( cr, dr, er, ar, br, F4, KK0, 10, 14);
+ R( bl, cl, dl, el, al, F0, K0, 14, 9 );
+ R( br, cr, dr, er, ar, F4, KK0, 3, 12);
+ R( al, bl, cl, dl, el, F0, K0, 15, 8 );
+ R( ar, br, cr, dr, er, F4, KK0, 12, 6);
+ R( el, al, bl, cl, dl, F1, K1, 7, 7 );
+ R( er, ar, br, cr, dr, F3, KK1, 6, 9);
+ R( dl, el, al, bl, cl, F1, K1, 4, 6 );
+ R( dr, er, ar, br, cr, F3, KK1, 11, 13);
+ R( cl, dl, el, al, bl, F1, K1, 13, 8 );
+ R( cr, dr, er, ar, br, F3, KK1, 3, 15);
+ R( bl, cl, dl, el, al, F1, K1, 1, 13 );
+ R( br, cr, dr, er, ar, F3, KK1, 7, 7);
+ R( al, bl, cl, dl, el, F1, K1, 10, 11 );
+ R( ar, br, cr, dr, er, F3, KK1, 0, 12);
+ R( el, al, bl, cl, dl, F1, K1, 6, 9 );
+ R( er, ar, br, cr, dr, F3, KK1, 13, 8);
+ R( dl, el, al, bl, cl, F1, K1, 15, 7 );
+ R( dr, er, ar, br, cr, F3, KK1, 5, 9);
+ R( cl, dl, el, al, bl, F1, K1, 3, 15 );
+ R( cr, dr, er, ar, br, F3, KK1, 10, 11);
+ R( bl, cl, dl, el, al, F1, K1, 12, 7 );
+ R( br, cr, dr, er, ar, F3, KK1, 14, 7);
+ R( al, bl, cl, dl, el, F1, K1, 0, 12 );
+ R( ar, br, cr, dr, er, F3, KK1, 15, 7);
+ R( el, al, bl, cl, dl, F1, K1, 9, 15 );
+ R( er, ar, br, cr, dr, F3, KK1, 8, 12);
+ R( dl, el, al, bl, cl, F1, K1, 5, 9 );
+ R( dr, er, ar, br, cr, F3, KK1, 12, 7);
+ R( cl, dl, el, al, bl, F1, K1, 2, 11 );
+ R( cr, dr, er, ar, br, F3, KK1, 4, 6);
+ R( bl, cl, dl, el, al, F1, K1, 14, 7 );
+ R( br, cr, dr, er, ar, F3, KK1, 9, 15);
+ R( al, bl, cl, dl, el, F1, K1, 11, 13 );
+ R( ar, br, cr, dr, er, F3, KK1, 1, 13);
+ R( el, al, bl, cl, dl, F1, K1, 8, 12 );
+ R( er, ar, br, cr, dr, F3, KK1, 2, 11);
+ R( dl, el, al, bl, cl, F2, K2, 3, 11 );
+ R( dr, er, ar, br, cr, F2, KK2, 15, 9);
+ R( cl, dl, el, al, bl, F2, K2, 10, 13 );
+ R( cr, dr, er, ar, br, F2, KK2, 5, 7);
+ R( bl, cl, dl, el, al, F2, K2, 14, 6 );
+ R( br, cr, dr, er, ar, F2, KK2, 1, 15);
+ R( al, bl, cl, dl, el, F2, K2, 4, 7 );
+ R( ar, br, cr, dr, er, F2, KK2, 3, 11);
+ R( el, al, bl, cl, dl, F2, K2, 9, 14 );
+ R( er, ar, br, cr, dr, F2, KK2, 7, 8);
+ R( dl, el, al, bl, cl, F2, K2, 15, 9 );
+ R( dr, er, ar, br, cr, F2, KK2, 14, 6);
+ R( cl, dl, el, al, bl, F2, K2, 8, 13 );
+ R( cr, dr, er, ar, br, F2, KK2, 6, 6);
+ R( bl, cl, dl, el, al, F2, K2, 1, 15 );
+ R( br, cr, dr, er, ar, F2, KK2, 9, 14);
+ R( al, bl, cl, dl, el, F2, K2, 2, 14 );
+ R( ar, br, cr, dr, er, F2, KK2, 11, 12);
+ R( el, al, bl, cl, dl, F2, K2, 7, 8 );
+ R( er, ar, br, cr, dr, F2, KK2, 8, 13);
+ R( dl, el, al, bl, cl, F2, K2, 0, 13 );
+ R( dr, er, ar, br, cr, F2, KK2, 12, 5);
+ R( cl, dl, el, al, bl, F2, K2, 6, 6 );
+ R( cr, dr, er, ar, br, F2, KK2, 2, 14);
+ R( bl, cl, dl, el, al, F2, K2, 13, 5 );
+ R( br, cr, dr, er, ar, F2, KK2, 10, 13);
+ R( al, bl, cl, dl, el, F2, K2, 11, 12 );
+ R( ar, br, cr, dr, er, F2, KK2, 0, 13);
+ R( el, al, bl, cl, dl, F2, K2, 5, 7 );
+ R( er, ar, br, cr, dr, F2, KK2, 4, 7);
+ R( dl, el, al, bl, cl, F2, K2, 12, 5 );
+ R( dr, er, ar, br, cr, F2, KK2, 13, 5);
+ R( cl, dl, el, al, bl, F3, K3, 1, 11 );
+ R( cr, dr, er, ar, br, F1, KK3, 8, 15);
+ R( bl, cl, dl, el, al, F3, K3, 9, 12 );
+ R( br, cr, dr, er, ar, F1, KK3, 6, 5);
+ R( al, bl, cl, dl, el, F3, K3, 11, 14 );
+ R( ar, br, cr, dr, er, F1, KK3, 4, 8);
+ R( el, al, bl, cl, dl, F3, K3, 10, 15 );
+ R( er, ar, br, cr, dr, F1, KK3, 1, 11);
+ R( dl, el, al, bl, cl, F3, K3, 0, 14 );
+ R( dr, er, ar, br, cr, F1, KK3, 3, 14);
+ R( cl, dl, el, al, bl, F3, K3, 8, 15 );
+ R( cr, dr, er, ar, br, F1, KK3, 11, 14);
+ R( bl, cl, dl, el, al, F3, K3, 12, 9 );
+ R( br, cr, dr, er, ar, F1, KK3, 15, 6);
+ R( al, bl, cl, dl, el, F3, K3, 4, 8 );
+ R( ar, br, cr, dr, er, F1, KK3, 0, 14);
+ R( el, al, bl, cl, dl, F3, K3, 13, 9 );
+ R( er, ar, br, cr, dr, F1, KK3, 5, 6);
+ R( dl, el, al, bl, cl, F3, K3, 3, 14 );
+ R( dr, er, ar, br, cr, F1, KK3, 12, 9);
+ R( cl, dl, el, al, bl, F3, K3, 7, 5 );
+ R( cr, dr, er, ar, br, F1, KK3, 2, 12);
+ R( bl, cl, dl, el, al, F3, K3, 15, 6 );
+ R( br, cr, dr, er, ar, F1, KK3, 13, 9);
+ R( al, bl, cl, dl, el, F3, K3, 14, 8 );
+ R( ar, br, cr, dr, er, F1, KK3, 9, 12);
+ R( el, al, bl, cl, dl, F3, K3, 5, 6 );
+ R( er, ar, br, cr, dr, F1, KK3, 7, 5);
+ R( dl, el, al, bl, cl, F3, K3, 6, 5 );
+ R( dr, er, ar, br, cr, F1, KK3, 10, 15);
+ R( cl, dl, el, al, bl, F3, K3, 2, 12 );
+ R( cr, dr, er, ar, br, F1, KK3, 14, 8);
+ R( bl, cl, dl, el, al, F4, K4, 4, 9 );
+ R( br, cr, dr, er, ar, F0, KK4, 12, 8);
+ R( al, bl, cl, dl, el, F4, K4, 0, 15 );
+ R( ar, br, cr, dr, er, F0, KK4, 15, 5);
+ R( el, al, bl, cl, dl, F4, K4, 5, 5 );
+ R( er, ar, br, cr, dr, F0, KK4, 10, 12);
+ R( dl, el, al, bl, cl, F4, K4, 9, 11 );
+ R( dr, er, ar, br, cr, F0, KK4, 4, 9);
+ R( cl, dl, el, al, bl, F4, K4, 7, 6 );
+ R( cr, dr, er, ar, br, F0, KK4, 1, 12);
+ R( bl, cl, dl, el, al, F4, K4, 12, 8 );
+ R( br, cr, dr, er, ar, F0, KK4, 5, 5);
+ R( al, bl, cl, dl, el, F4, K4, 2, 13 );
+ R( ar, br, cr, dr, er, F0, KK4, 8, 14);
+ R( el, al, bl, cl, dl, F4, K4, 10, 12 );
+ R( er, ar, br, cr, dr, F0, KK4, 7, 6);
+ R( dl, el, al, bl, cl, F4, K4, 14, 5 );
+ R( dr, er, ar, br, cr, F0, KK4, 6, 8);
+ R( cl, dl, el, al, bl, F4, K4, 1, 12 );
+ R( cr, dr, er, ar, br, F0, KK4, 2, 13);
+ R( bl, cl, dl, el, al, F4, K4, 3, 13 );
+ R( br, cr, dr, er, ar, F0, KK4, 13, 6);
+ R( al, bl, cl, dl, el, F4, K4, 8, 14 );
+ R( ar, br, cr, dr, er, F0, KK4, 14, 5);
+ R( el, al, bl, cl, dl, F4, K4, 11, 11 );
+ R( er, ar, br, cr, dr, F0, KK4, 0, 15);
+ R( dl, el, al, bl, cl, F4, K4, 6, 8 );
+ R( dr, er, ar, br, cr, F0, KK4, 3, 13);
+ R( cl, dl, el, al, bl, F4, K4, 15, 5 );
+ R( cr, dr, er, ar, br, F0, KK4, 9, 11);
+ R( bl, cl, dl, el, al, F4, K4, 13, 6 );
+ R( br, cr, dr, er, ar, F0, KK4, 11, 11);
+
+ dr += cl + hd->h1;
+ hd->h1 = hd->h2 + dl + er;
+ hd->h2 = hd->h3 + el + ar;
+ hd->h3 = hd->h4 + al + br;
+ hd->h4 = hd->h0 + bl + cr;
+ hd->h0 = dr;
+
+ return /*burn_stack*/ 104+5*sizeof(void*);
+}
+
+
+static unsigned int
+transform ( void *c, const unsigned char *data, size_t nblks )
+{
+ unsigned int burn;
+
+ do
+ {
+ burn = transform_blk (c, data);
+ data += 64;
+ }
+ while (--nblks);
+
+ return burn;
+}
+
+
+/*
+ * The routine terminates the hash computation and produces the final digest.
+ */
+static void
+rmd160_final( void *context )
+{
+ RMD160_CONTEXT *hd = context;
+ u32 t, th, msb, lsb;
+ byte *p;
+ unsigned int burn;
+
+ t = hd->bctx.nblocks;
+ if (sizeof t == sizeof hd->bctx.nblocks)
+ th = hd->bctx.nblocks_high;
+ else
+ th = hd->bctx.nblocks >> 32;
+
+ /* multiply by 64 to make a byte count */
+ lsb = t << 6;
+ msb = (th << 6) | (t >> 26);
+ /* add the count */
+ t = lsb;
+ if( (lsb += hd->bctx.count) < t )
+ msb++;
+ /* multiply by 8 to make a bit count */
+ t = lsb;
+ lsb <<= 3;
+ msb <<= 3;
+ msb |= t >> 29;
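+  /* lsb/msb now hold the 64-bit message length in bits; like MD4/MD5,
+     RIPEMD-160 appends this length little-endian, low word first.  */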
+
+ if (hd->bctx.count < 56) /* enough room */
+ {
+ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */
+ if (hd->bctx.count < 56)
+ memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count);
+
+ /* append the 64 bit count */
+ buf_put_le32(hd->bctx.buf + 56, lsb);
+ buf_put_le32(hd->bctx.buf + 60, msb);
+ burn = transform (hd, hd->bctx.buf, 1);
+ }
+ else /* need one extra block */
+ {
+ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */
+ /* fill pad and next block with zeroes */
+ memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56);
+
+ /* append the 64 bit count */
+ buf_put_le32(hd->bctx.buf + 64 + 56, lsb);
+ buf_put_le32(hd->bctx.buf + 64 + 60, msb);
+ burn = transform (hd, hd->bctx.buf, 2);
+ }
+
+ p = hd->bctx.buf;
+#define X(a) do { buf_put_le32(p, hd->h##a); p += 4; } while(0)
+ X(0);
+ X(1);
+ X(2);
+ X(3);
+ X(4);
+#undef X
+
+ hd->bctx.count = 0;
+
+ _gcry_burn_stack (burn);
+}
+
+static byte *
+rmd160_read( void *context )
+{
+ RMD160_CONTEXT *hd = context;
+
+ return hd->bctx.buf;
+}
+
+
+
+/****************
+ * Shortcut function which puts the hash value of the supplied buffer
+ * into outbuf which must have a size of 20 bytes.
+ */
+void
+_gcry_rmd160_hash_buffer (void *outbuf, const void *buffer, size_t length )
+{
+ RMD160_CONTEXT hd;
+
+ rmd160_init (&hd, 0);
+ _gcry_md_block_write ( &hd, buffer, length );
+ rmd160_final ( &hd );
+ memcpy ( outbuf, hd.bctx.buf, 20 );
+}
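+
+/* For illustration, applications obtain the same digest through the
+   public libgcrypt API (application code, not part of this file):
+
+     #include <gcrypt.h>
+
+     unsigned char digest[20];
+     gcry_md_hash_buffer (GCRY_MD_RMD160, digest, "abc", 3);
+
+   For the input "abc" RIPEMD-160 yields
+   8eb208f7e05d987a9b044a8e98c6b087f15a0bfc.  */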
+
+/* Variant of the above shortcut function using multiple buffers. */
+static void
+_gcry_rmd160_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt)
+{
+ RMD160_CONTEXT hd;
+
+ rmd160_init (&hd, 0);
+ for (;iovcnt > 0; iov++, iovcnt--)
+ _gcry_md_block_write (&hd,
+ (const char*)iov[0].data + iov[0].off, iov[0].len);
+ rmd160_final ( &hd );
+ memcpy ( outbuf, hd.bctx.buf, 20 );
+}
+
+
+static byte asn[15] = /* Object ID is 1.3.36.3.2.1 */
+ { 0x30, 0x21, 0x30, 0x09, 0x06, 0x05, 0x2b, 0x24, 0x03,
+ 0x02, 0x01, 0x05, 0x00, 0x04, 0x14 };
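+
+/* The ASN above is the DER encoded DigestInfo prefix placed in front of
+   the 20 byte digest for PKCS#1 signatures:
+
+     30 21                     SEQUENCE (33 bytes)
+       30 09                   SEQUENCE (9 bytes)
+         06 05 2b 24 03 02 01  OID 1.3.36.3.2.1 (ripemd160)
+         05 00                 NULL
+       04 14                   OCTET STRING (20 bytes; the digest follows)
+*/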
+
+static gcry_md_oid_spec_t oid_spec_rmd160[] =
+ {
+ /* rsaSignatureWithripemd160 */
+ { "1.3.36.3.3.1.2" },
+ /* TeleTrust hash algorithm. */
+ { "1.3.36.3.2.1" },
+ { NULL }
+ };
+
+gcry_md_spec_t _gcry_digest_spec_rmd160 =
+ {
+ GCRY_MD_RMD160, {0, 0},
+ "RIPEMD160", asn, DIM (asn), oid_spec_rmd160, 20,
+ rmd160_init, _gcry_md_block_write, rmd160_final, rmd160_read, NULL,
+ _gcry_rmd160_hash_buffer, _gcry_rmd160_hash_buffers,
+ sizeof (RMD160_CONTEXT)
+ };
diff --git a/comm/third_party/libgcrypt/cipher/rsa-common.c b/comm/third_party/libgcrypt/cipher/rsa-common.c
new file mode 100644
index 0000000000..29b7bc8148
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rsa-common.c
@@ -0,0 +1,1038 @@
+/* rsa-common.c - Supporting functions for RSA
+ * Copyright (C) 2011 Free Software Foundation, Inc.
+ * Copyright (C) 2013 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+#include "mpi.h"
+#include "cipher.h"
+#include "pubkey-internal.h"
+
+
+/* Turn VALUE into an octet string and store it in an allocated buffer
+ at R_FRAME or - if R_FRAME is NULL - copy it into the caller
+ provided buffer SPACE; either SPACE or R_FRAME may be used. If
+ SPACE is not NULL, the caller must provide a buffer of at least
+ NBYTES. If the resulting octet string is shorter than NBYTES pad
+ it to the left with zeroes. If VALUE does not fit into NBYTES
+ return an error code. */
+static gpg_err_code_t
+octet_string_from_mpi (unsigned char **r_frame, void *space,
+ gcry_mpi_t value, size_t nbytes)
+{
+ return _gcry_mpi_to_octet_string (r_frame, space, value, nbytes);
+}
+
+
+
+/* Encode {VALUE,VALUELEN} for an NBITS key using the pkcs#1 block
+ type 2 padding. On success the result is stored as a new MPI at
+ R_RESULT. On error the value at R_RESULT is undefined.
+
+ If {RANDOM_OVERRIDE, RANDOM_OVERRIDE_LEN} is given it is used as
+ the seed instead of using a random string for it. This feature is
+ only useful for regression tests. Note that this value may not
+ contain zero bytes.
+
+ We encode the value in this way:
+
+ 0 2 RND(n bytes) 0 VALUE
+
+ 0 is a marker we unfortunately can't encode because we return an
+ MPI which strips all leading zeroes.
+ 2 is the block type.
+ RND are non-zero random bytes.
+
+ (Note that OpenPGP includes the cipher algorithm and a checksum in
+ VALUE; the caller needs to prepare the value accordingly.)
+ */
+gpg_err_code_t
+_gcry_rsa_pkcs1_encode_for_enc (gcry_mpi_t *r_result, unsigned int nbits,
+ const unsigned char *value, size_t valuelen,
+ const unsigned char *random_override,
+ size_t random_override_len)
+{
+ gcry_err_code_t rc = 0;
+ unsigned char *frame = NULL;
+ size_t nframe = (nbits+7) / 8;
+ int i;
+ size_t n;
+ unsigned char *p;
+
+ if (valuelen + 7 > nframe || !nframe)
+ {
+ /* Can't encode a VALUELEN byte value in an NFRAME byte frame. */
+ return GPG_ERR_TOO_SHORT; /* The key is too short. */
+ }
+
+ if ( !(frame = xtrymalloc_secure (nframe)))
+ return gpg_err_code_from_syserror ();
+
+ n = 0;
+ frame[n++] = 0;
+ frame[n++] = 2; /* block type */
+ i = nframe - 3 - valuelen;
+ gcry_assert (i > 0);
+
+ if (random_override)
+ {
+ int j;
+
+ if (random_override_len != i)
+ {
+ xfree (frame);
+ return GPG_ERR_INV_ARG;
+ }
+ /* Check that random does not include a zero byte. */
+ for (j=0; j < random_override_len; j++)
+ if (!random_override[j])
+ {
+ xfree (frame);
+ return GPG_ERR_INV_ARG;
+ }
+ memcpy (frame + n, random_override, random_override_len);
+ n += random_override_len;
+ }
+ else
+ {
+ p = _gcry_random_bytes_secure (i, GCRY_STRONG_RANDOM);
+ /* Replace zero bytes by new values. */
+ for (;;)
+ {
+ int j, k;
+ unsigned char *pp;
+
+ /* Count the zero bytes. */
+ for (j=k=0; j < i; j++)
+ {
+ if (!p[j])
+ k++;
+ }
+ if (!k)
+ break; /* Okay: no (more) zero bytes. */
+
+ k += k/128 + 3; /* Better get some more. */
+ pp = _gcry_random_bytes_secure (k, GCRY_STRONG_RANDOM);
+ for (j=0; j < i && k; )
+ {
+ if (!p[j])
+ p[j] = pp[--k];
+ if (p[j])
+ j++;
+ }
+ xfree (pp);
+ }
+ memcpy (frame+n, p, i);
+ n += i;
+ xfree (p);
+ }
+
+ frame[n++] = 0;
+ memcpy (frame+n, value, valuelen);
+ n += valuelen;
+ gcry_assert (n == nframe);
+
+ rc = _gcry_mpi_scan (r_result, GCRYMPI_FMT_USG, frame, n, &nframe);
+ if (!rc && DBG_CIPHER)
+ log_mpidump ("PKCS#1 block type 2 encoded data", *r_result);
+ xfree (frame);
+
+ return rc;
+}
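+
+/* For illustration (not a test vector): with a 2048 bit key NFRAME is
+   256, so a 32 byte VALUE is encoded as
+
+     00 02 || 221 non-zero random bytes || 00 || VALUE(32)
+
+   because the random padding gets NFRAME - 3 - VALUELEN = 221 octets.  */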
+
+
+/* Decode a plaintext in VALUE assuming pkcs#1 block type 2 padding.
+ NBITS is the size of the secret key. On success the result is
+ stored as a newly allocated buffer at R_RESULT and its valid length at
+ R_RESULTLEN. On error NULL is stored at R_RESULT. */
+gpg_err_code_t
+_gcry_rsa_pkcs1_decode_for_enc (unsigned char **r_result, size_t *r_resultlen,
+ unsigned int nbits, gcry_mpi_t value)
+{
+ gcry_error_t err;
+ unsigned char *frame = NULL;
+ size_t nframe = (nbits+7) / 8;
+ size_t n;
+
+ *r_result = NULL;
+
+ if ( !(frame = xtrymalloc_secure (nframe)))
+ return gpg_err_code_from_syserror ();
+
+ err = _gcry_mpi_print (GCRYMPI_FMT_USG, frame, nframe, &n, value);
+ if (err)
+ {
+ xfree (frame);
+ return gcry_err_code (err);
+ }
+
+ nframe = n; /* Set NFRAME to the actual length. */
+
+ /* FRAME = 0x00 || 0x02 || PS || 0x00 || M
+
+ pkcs#1 requires that the first byte is zero. Our MPIs usually
+ strip leading zero bytes; thus we are not able to detect them.
+ However due to the way gcry_mpi_print is implemented we may see
+ leading zero bytes nevertheless. We handle this by making the
+ first zero byte optional. */
+ if (nframe < 4)
+ {
+ xfree (frame);
+ return GPG_ERR_ENCODING_PROBLEM; /* Too short. */
+ }
+ n = 0;
+ if (!frame[0])
+ n++;
+ if (frame[n++] != 0x02)
+ {
+ xfree (frame);
+ return GPG_ERR_ENCODING_PROBLEM; /* Wrong block type. */
+ }
+
+ /* Skip the non-zero random bytes and the terminating zero byte. */
+ for (; n < nframe && frame[n] != 0x00; n++)
+ ;
+ if (n+1 >= nframe)
+ {
+ xfree (frame);
+ return GPG_ERR_ENCODING_PROBLEM; /* No zero byte. */
+ }
+ n++; /* Skip the zero byte. */
+
+ /* To avoid an extra allocation we reuse the frame buffer. The only
+ caller of this function will anyway free the result soon. */
+ memmove (frame, frame + n, nframe - n);
+ *r_result = frame;
+ *r_resultlen = nframe - n;
+
+ if (DBG_CIPHER)
+ log_printhex ("value extracted from PKCS#1 block type 2 encoded data",
+ *r_result, *r_resultlen);
+
+ return 0;
+}
+
+
+/* Encode {VALUE,VALUELEN} for an NBITS key and hash algorithm ALGO
+ using the pkcs#1 block type 1 padding. On success the result is
+ stored as a new MPI at R_RESULT. On error the value at R_RESULT is
+ undefined.
+
+ We encode the value in this way:
+
+ 0 1 PAD(n bytes) 0 ASN(asnlen bytes) VALUE(valuelen bytes)
+
+ 0 is a marker we unfortunately can't encode because we return an
+ MPI which strips all leading zeroes.
+ 1 is the block type.
+ PAD consists of 0xff bytes.
+ 0 marks the end of the padding.
+ ASN is the DER encoding of the hash algorithm; along with the VALUE
+ it yields a valid DER encoding.
+
+ (Note that PGP prior to version 2.3 encoded the message digest as:
+ 0 1 MD(16 bytes) 0 PAD(n bytes) 1
+ The MD is always 16 bytes here because it's always MD5. GnuPG
+ does not support pre-v2.3 signatures, but I'm including this
+ comment so the information is easily found if needed.)
+*/
+gpg_err_code_t
+_gcry_rsa_pkcs1_encode_for_sig (gcry_mpi_t *r_result, unsigned int nbits,
+ const unsigned char *value, size_t valuelen,
+ int algo)
+{
+ gcry_err_code_t rc = 0;
+ byte asn[100];
+ byte *frame = NULL;
+ size_t nframe = (nbits+7) / 8;
+ int i;
+ size_t n;
+ size_t asnlen, dlen;
+
+ asnlen = DIM(asn);
+ dlen = _gcry_md_get_algo_dlen (algo);
+
+ if (_gcry_md_algo_info (algo, GCRYCTL_GET_ASNOID, asn, &asnlen))
+ {
+ /* We do not yet have an ASN.1 template for this algorithm. */
+ return GPG_ERR_NOT_IMPLEMENTED;
+ }
+
+ if ( valuelen != dlen )
+ {
+ /* Hash value does not match the length of digest for
+ the given algorithm. */
+ return GPG_ERR_CONFLICT;
+ }
+
+ if ( !dlen || dlen + asnlen + 4 > nframe)
+ {
+ /* Can't encode a DLEN byte digest MD into an NFRAME byte
+ frame. */
+ return GPG_ERR_TOO_SHORT;
+ }
+
+ if ( !(frame = xtrymalloc (nframe)) )
+ return gpg_err_code_from_syserror ();
+
+ /* Assemble the pkcs#1 block type 1. */
+ n = 0;
+ frame[n++] = 0;
+ frame[n++] = 1; /* block type */
+ i = nframe - valuelen - asnlen - 3 ;
+ gcry_assert (i > 1);
+ memset (frame+n, 0xff, i );
+ n += i;
+ frame[n++] = 0;
+ memcpy (frame+n, asn, asnlen);
+ n += asnlen;
+ memcpy (frame+n, value, valuelen );
+ n += valuelen;
+ gcry_assert (n == nframe);
+
+ /* Convert it into an MPI. */
+ rc = _gcry_mpi_scan (r_result, GCRYMPI_FMT_USG, frame, n, &nframe);
+ if (!rc && DBG_CIPHER)
+ log_mpidump ("PKCS#1 block type 1 encoded data", *r_result);
+ xfree (frame);
+
+ return rc;
+}
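+
+/* For illustration (not a test vector): with a 2048 bit key and
+   GCRY_MD_RMD160 (ASNLEN 15, DLEN 20) NFRAME is 256 and the frame is
+
+     00 01 || FF x 218 || 00 || ASN(15) || VALUE(20)
+
+   since NFRAME - VALUELEN - ASNLEN - 3 = 218.  */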
+
+/* Encode {VALUE,VALUELEN} for an NBITS key using the pkcs#1 block
+ type 1 padding. On success the result is stored as a new MPI at
+ R_RESULT. On error the value at R_RESULT is undefined.
+
+ We encode the value in this way:
+
+ 0 1 PAD(n bytes) 0 VALUE(valuelen bytes)
+
+ 0 is a marker we unfortunately can't encode because we return an
+ MPI which strips all leading zeroes.
+ 1 is the block type.
+ PAD consists of 0xff bytes.
+ 0 marks the end of the padding.
+
+ (Note that PGP prior to version 2.3 encoded the message digest as:
+ 0 1 MD(16 bytes) 0 PAD(n bytes) 1
+ The MD is always 16 bytes here because it's always MD5. GnuPG
+ does not support pre-v2.3 signatures, but I'm including this
+ comment so the information is easily found if needed.)
+*/
+gpg_err_code_t
+_gcry_rsa_pkcs1_encode_raw_for_sig (gcry_mpi_t *r_result, unsigned int nbits,
+ const unsigned char *value, size_t valuelen)
+{
+ gcry_err_code_t rc = 0;
+ gcry_error_t err;
+ byte *frame = NULL;
+ size_t nframe = (nbits+7) / 8;
+ int i;
+ size_t n;
+
+ if ( !valuelen || valuelen + 4 > nframe)
+ {
+ /* Can't encode a VALUELEN byte value into an NFRAME byte
+ frame. */
+ return GPG_ERR_TOO_SHORT;
+ }
+
+ if ( !(frame = xtrymalloc (nframe)) )
+ return gpg_err_code_from_syserror ();
+
+ /* Assemble the pkcs#1 block type 1. */
+ n = 0;
+ frame[n++] = 0;
+ frame[n++] = 1; /* block type */
+ i = nframe - valuelen - 3 ;
+ gcry_assert (i > 1);
+ memset (frame+n, 0xff, i );
+ n += i;
+ frame[n++] = 0;
+ memcpy (frame+n, value, valuelen );
+ n += valuelen;
+ gcry_assert (n == nframe);
+
+ /* Convert it into an MPI. */
+ err = _gcry_mpi_scan (r_result, GCRYMPI_FMT_USG, frame, n, &nframe);
+ if (err)
+ rc = gcry_err_code (err);
+ else if (DBG_CIPHER)
+ log_mpidump ("PKCS#1 block type 1 encoded data", *r_result);
+ xfree (frame);
+
+ return rc;
+}
+
+
+/* Mask generation function for OAEP. See RFC-3447 B.2.1. */
+static gcry_err_code_t
+mgf1 (unsigned char *output, size_t outlen, unsigned char *seed, size_t seedlen,
+ int algo)
+{
+ size_t dlen, nbytes, n;
+ int idx;
+ gcry_md_hd_t hd;
+ gcry_err_code_t err;
+
+ err = _gcry_md_open (&hd, algo, 0);
+ if (err)
+ return err;
+
+ dlen = _gcry_md_get_algo_dlen (algo);
+
+ /* We skip step 1 which would be assert(OUTLEN <= 2^32). The loop
+ in step 3 is merged with step 4 by concatenating no more octets
+ than what would fit into OUTPUT. The ceiling for the counter IDX
+ is implemented indirectly. */
+ nbytes = 0; /* Step 2. */
+ idx = 0;
+ while ( nbytes < outlen )
+ {
+ unsigned char c[4], *digest;
+
+ if (idx)
+ _gcry_md_reset (hd);
+
+ c[0] = (idx >> 24) & 0xFF;
+ c[1] = (idx >> 16) & 0xFF;
+ c[2] = (idx >> 8) & 0xFF;
+ c[3] = idx & 0xFF;
+ idx++;
+
+ _gcry_md_write (hd, seed, seedlen);
+ _gcry_md_write (hd, c, 4);
+ digest = _gcry_md_read (hd, 0);
+
+ n = (outlen - nbytes < dlen)? (outlen - nbytes) : dlen;
+ memcpy (output+nbytes, digest, n);
+ nbytes += n;
+ }
+
+ _gcry_md_close (hd);
+ return GPG_ERR_NO_ERROR;
+}
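+
+/* For illustration: to produce OUTLEN = 100 octets from a 20 octet SEED
+   with SHA-1 (DLEN 20) the loop above computes
+
+     T = Hash(SEED || 00000000) || Hash(SEED || 00000001) || ...
+                                || Hash(SEED || 00000004)
+
+   i.e. ceil(100/20) = 5 iterations with the counter appended as a
+   4 byte big-endian suffix, and copies the first 100 octets of T to
+   OUTPUT.  */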
+
+
+/* RFC-3447 (pkcs#1 v2.1) OAEP encoding. NBITS is the length of the
+ key measured in bits. ALGO is the hash function; it must be a
+ valid and usable algorithm. {VALUE,VALUELEN} is the message to
+ encrypt. {LABEL,LABELLEN} is the optional label to be associated
+ with the message, if LABEL is NULL the default is to use the empty
+ string as label. On success the encoded ciphertext is returned at
+ R_RESULT.
+
+ If {RANDOM_OVERRIDE, RANDOM_OVERRIDE_LEN} is given it is used as
+ the seed instead of using a random string for it. This feature is
+ only useful for regression tests.
+
+ Here is figure 1 from the RFC depicting the process:
+
+ +----------+---------+-------+
+ DB = | lHash | PS | M |
+ +----------+---------+-------+
+ |
+ +----------+ V
+ | seed |--> MGF ---> xor
+ +----------+ |
+ | |
+ +--+ V |
+ |00| xor <----- MGF <-----|
+ +--+ | |
+ | | |
+ V V V
+ +--+----------+----------------------------+
+ EM = |00|maskedSeed| maskedDB |
+ +--+----------+----------------------------+
+ */
+gpg_err_code_t
+_gcry_rsa_oaep_encode (gcry_mpi_t *r_result, unsigned int nbits, int algo,
+ const unsigned char *value, size_t valuelen,
+ const unsigned char *label, size_t labellen,
+ const void *random_override, size_t random_override_len)
+{
+ gcry_err_code_t rc = 0;
+ unsigned char *frame = NULL;
+ size_t nframe = (nbits+7) / 8;
+ unsigned char *p;
+ size_t hlen;
+ size_t n;
+
+ *r_result = NULL;
+
+ /* Set defaults for LABEL. */
+ if (!label || !labellen)
+ {
+ label = (const unsigned char*)"";
+ labellen = 0;
+ }
+
+ hlen = _gcry_md_get_algo_dlen (algo);
+
+ /* We skip step 1a which would be to check that LABELLEN is not
+ greater than 2^61-1. See rfc-3447 7.1.1. */
+
+ /* Step 1b. Note that the obsolete rfc-2437 uses the check:
+ valuelen > nframe - 2 * hlen - 1 . */
+ if (valuelen > nframe - 2 * hlen - 2 || !nframe)
+ {
+ /* Can't encode a VALUELEN byte value in an NFRAME byte frame. */
+ return GPG_ERR_TOO_SHORT; /* The key is too short. */
+ }
+
+ /* Allocate the frame. */
+ frame = xtrycalloc_secure (1, nframe);
+ if (!frame)
+ return gpg_err_code_from_syserror ();
+
+ /* Step 2a: Compute the hash of the label. We store it in the frame
+ where later the maskedDB will commence. */
+ _gcry_md_hash_buffer (algo, frame + 1 + hlen, label, labellen);
+
+ /* Step 2b: Set octet string to zero. */
+ /* This has already been done while allocating FRAME. */
+
+ /* Step 2c: Create DB by concatenating lHash, PS, 0x01 and M. */
+ n = nframe - valuelen - 1;
+ frame[n] = 0x01;
+ memcpy (frame + n + 1, value, valuelen);
+
+ /* Step 2d: Generate seed. We store it where the maskedSeed will go
+ later. */
+ if (random_override)
+ {
+ if (random_override_len != hlen)
+ {
+ xfree (frame);
+ return GPG_ERR_INV_ARG;
+ }
+ memcpy (frame + 1, random_override, hlen);
+ }
+ else
+ _gcry_randomize (frame + 1, hlen, GCRY_STRONG_RANDOM);
+
+ /* Step 2e and 2f: Create maskedDB. */
+ {
+ unsigned char *dmask;
+
+ dmask = xtrymalloc_secure (nframe - hlen - 1);
+ if (!dmask)
+ {
+ rc = gpg_err_code_from_syserror ();
+ xfree (frame);
+ return rc;
+ }
+ rc = mgf1 (dmask, nframe - hlen - 1, frame+1, hlen, algo);
+ if (rc)
+ {
+ xfree (dmask);
+ xfree (frame);
+ return rc;
+ }
+ for (n = 1 + hlen, p = dmask; n < nframe; n++)
+ frame[n] ^= *p++;
+ xfree (dmask);
+ }
+
+ /* Step 2g and 2h: Create maskedSeed. */
+ {
+ unsigned char *smask;
+
+ smask = xtrymalloc_secure (hlen);
+ if (!smask)
+ {
+ rc = gpg_err_code_from_syserror ();
+ xfree (frame);
+ return rc;
+ }
+ rc = mgf1 (smask, hlen, frame + 1 + hlen, nframe - hlen - 1, algo);
+ if (rc)
+ {
+ xfree (smask);
+ xfree (frame);
+ return rc;
+ }
+ for (n = 1, p = smask; n < 1 + hlen; n++)
+ frame[n] ^= *p++;
+ xfree (smask);
+ }
+
+ /* Step 2i: Concatenate 0x00, maskedSeed and maskedDB. */
+ /* This has already been done by using in-place operations. */
+
+ /* Convert the stuff into an MPI as expected by the caller. */
+ rc = _gcry_mpi_scan (r_result, GCRYMPI_FMT_USG, frame, nframe, NULL);
+ if (!rc && DBG_CIPHER)
+ log_mpidump ("OAEP encoded data", *r_result);
+ xfree (frame);
+
+ return rc;
+}
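+
+/* For illustration of the sizes used above: with a 2048 bit key and
+   SHA-1 (HLEN 20) NFRAME is 256; the seed occupies frame[1..20] and
+   the DB part frame[21..255] (235 octets) with
+
+     DB = lHash(20) || PS (zero octets) || 01 || M
+
+   so the longest message that fits is NFRAME - 2*HLEN - 2 = 214
+   octets.  */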
+
+
+/* RFC-3447 (pkcs#1 v2.1) OAEP decoding. NBITS is the length of the
+ key measured in bits. ALGO is the hash function; it must be a
+ valid and usable algorithm. VALUE is the raw decrypted message
+ {LABEL,LABELLEN} is the optional label to be associated with the
+ message, if LABEL is NULL the default is to use the empty string as
+ label. On success the plaintext is returned as a newly allocated
+ buffer at R_RESULT; its valid length is stored at R_RESULTLEN. On
+ error NULL is stored at R_RESULT. */
+gpg_err_code_t
+_gcry_rsa_oaep_decode (unsigned char **r_result, size_t *r_resultlen,
+ unsigned int nbits, int algo,
+ gcry_mpi_t value,
+ const unsigned char *label, size_t labellen)
+{
+ gcry_err_code_t rc;
+ unsigned char *frame = NULL; /* Encoded messages (EM). */
+ unsigned char *masked_seed; /* Points into FRAME. */
+ unsigned char *masked_db; /* Points into FRAME. */
+ unsigned char *seed = NULL; /* Allocated space for the seed and DB. */
+ unsigned char *db; /* Points into SEED. */
+ unsigned char *lhash = NULL; /* Hash of the label. */
+ size_t nframe; /* Length of the ciphertext (EM). */
+ size_t hlen; /* Length of the hash digest. */
+ size_t db_len; /* Length of DB and masked_db. */
+ size_t nkey = (nbits+7)/8; /* Length of the key in bytes. */
+ int failed = 0; /* Error indicator. */
+ size_t n;
+
+ *r_result = NULL;
+
+ /* This code is implemented as described by rfc-3447 7.1.2. */
+
+ /* Set defaults for LABEL. */
+ if (!label || !labellen)
+ {
+ label = (const unsigned char*)"";
+ labellen = 0;
+ }
+
+ /* Get the length of the digest. */
+ hlen = _gcry_md_get_algo_dlen (algo);
+
+ /* Hash the label right away. */
+ lhash = xtrymalloc (hlen);
+ if (!lhash)
+ return gpg_err_code_from_syserror ();
+ _gcry_md_hash_buffer (algo, lhash, label, labellen);
+
+ /* Turn the MPI into an octet string. If the octet string is
+ shorter than the key we pad it to the left with zeroes. This may
+ happen due to the leading zero in OAEP frames and due to the
+ following random octets (seed^mask) which may have leading zero
+ bytes. This all is needed to cope with our leading zeroes
+ suppressing MPI implementation. The code implicitly implements
+ Step 1b (bail out if NFRAME != N). */
+ rc = octet_string_from_mpi (&frame, NULL, value, nkey);
+ if (rc)
+ {
+ xfree (lhash);
+ return GPG_ERR_ENCODING_PROBLEM;
+ }
+ nframe = nkey;
+
+ /* Step 1c: Check that the key is long enough. */
+ if ( nframe < 2 * hlen + 2 )
+ {
+ xfree (frame);
+ xfree (lhash);
+ return GPG_ERR_ENCODING_PROBLEM;
+ }
+
+ /* Step 2 has already been done by the caller and the
+ octet_string_from_mpi call above. */
+
+ /* Allocate space for SEED and DB. */
+ seed = xtrymalloc_secure (nframe - 1);
+ if (!seed)
+ {
+ rc = gpg_err_code_from_syserror ();
+ xfree (frame);
+ xfree (lhash);
+ return rc;
+ }
+ db = seed + hlen;
+
+ /* To avoid chosen ciphertext attacks from now on we make sure to
+ run all code even in the error case; this avoids possible timing
+ attacks as described by Manger. */
+
+ /* Step 3a: Hash the label. */
+ /* This has already been done. */
+
+ /* Step 3b: Separate the encoded message. */
+ masked_seed = frame + 1;
+ masked_db = frame + 1 + hlen;
+ db_len = nframe - 1 - hlen;
+
+ /* Step 3c and 3d: seed = maskedSeed ^ mgf(maskedDB, hlen). */
+ if (mgf1 (seed, hlen, masked_db, db_len, algo))
+ failed = 1;
+ for (n = 0; n < hlen; n++)
+ seed[n] ^= masked_seed[n];
+
+ /* Step 3e and 3f: db = maskedDB ^ mgf(seed, db_len). */
+ if (mgf1 (db, db_len, seed, hlen, algo))
+ failed = 1;
+ for (n = 0; n < db_len; n++)
+ db[n] ^= masked_db[n];
+
+ /* Step 3g: Check lhash, a possibly empty padding string terminated
+ by 0x01, and that the first byte of EM is 0. */
+ if (memcmp (lhash, db, hlen))
+ failed = 1;
+ for (n = hlen; n < db_len; n++)
+ if (db[n] == 0x01)
+ break;
+ if (n == db_len)
+ failed = 1;
+ if (frame[0])
+ failed = 1;
+
+ xfree (lhash);
+ xfree (frame);
+ if (failed)
+ {
+ xfree (seed);
+ return GPG_ERR_ENCODING_PROBLEM;
+ }
+
+ /* Step 4: Output M. */
+ /* To avoid an extra allocation we reuse the seed buffer. The only
+ caller of this function will anyway free the result soon. */
+ n++;
+ memmove (seed, db + n, db_len - n);
+ *r_result = seed;
+ *r_resultlen = db_len - n;
+ seed = NULL;
+
+ if (DBG_CIPHER)
+ log_printhex ("value extracted from OAEP encoded data",
+ *r_result, *r_resultlen);
+
+ return 0;
+}
+
+
+/* RFC-3447 (pkcs#1 v2.1) PSS encoding. Encode {VALUE,VALUELEN} for
+ an NBITS key. Note that VALUE is already the mHash from the
+ picture below. ALGO is a valid hash algorithm and SALTLEN is the
+ length of salt to be used. On success the result is stored as a
+ new MPI at R_RESULT. On error the value at R_RESULT is undefined.
+
+ If {RANDOM_OVERRIDE, RANDOM_OVERRIDE_LEN} is given it is used as
+ the salt instead of using a random string for the salt. This
+ feature is only useful for regression tests.
+
+ Here is figure 2 from the RFC (errata 595 applied) depicting the
+ process:
+
+ +-----------+
+ | M |
+ +-----------+
+ |
+ V
+ Hash
+ |
+ V
+ +--------+----------+----------+
+ M' = |Padding1| mHash | salt |
+ +--------+----------+----------+
+ |
+ +--------+----------+ V
+ DB = |Padding2| salt | Hash
+ +--------+----------+ |
+ | |
+ V | +----+
+ xor <--- MGF <---| |0xbc|
+ | | +----+
+ | | |
+ V V V
+ +-------------------+----------+----+
+ EM = | maskedDB | H |0xbc|
+ +-------------------+----------+----+
+
+ */
+gpg_err_code_t
+_gcry_rsa_pss_encode (gcry_mpi_t *r_result, unsigned int nbits, int algo,
+ const unsigned char *value, size_t valuelen, int saltlen,
+ const void *random_override, size_t random_override_len)
+{
+ gcry_err_code_t rc = 0;
+ size_t hlen; /* Length of the hash digest. */
+ unsigned char *em = NULL; /* Encoded message. */
+ size_t emlen = (nbits+7)/8; /* Length in bytes of EM. */
+ unsigned char *h; /* Points into EM. */
+ unsigned char *buf = NULL; /* Help buffer. */
+ size_t buflen; /* Length of BUF. */
+ unsigned char *mhash; /* Points into BUF. */
+ unsigned char *salt; /* Points into BUF. */
+ unsigned char *dbmask; /* Points into BUF. */
+ unsigned char *p;
+ size_t n;
+
+ /* This code is implemented as described by rfc-3447 9.1.1. */
+
+ /* Get the length of the digest. */
+ hlen = _gcry_md_get_algo_dlen (algo);
+ gcry_assert (hlen); /* We expect a valid ALGO here. */
+
+ /* Allocate a help buffer and setup some pointers. */
+ buflen = 8 + hlen + saltlen + (emlen - hlen - 1);
+ buf = xtrymalloc (buflen);
+ if (!buf)
+ {
+ rc = gpg_err_code_from_syserror ();
+ goto leave;
+ }
+ mhash = buf + 8;
+ salt = mhash + hlen;
+ dbmask= salt + saltlen;
+
+ /* Step 2: That would be: mHash = Hash(M) but our input is already
+ mHash, thus we only do a consistency check and copy it to MHASH. */
+ if (valuelen != hlen)
+ {
+ rc = GPG_ERR_INV_LENGTH;
+ goto leave;
+ }
+ memcpy (mhash, value, hlen);
+
+ /* Step 3: Check length constraints. */
+ if (emlen < hlen + saltlen + 2)
+ {
+ rc = GPG_ERR_TOO_SHORT;
+ goto leave;
+ }
+
+ /* Allocate space for EM. */
+ em = xtrymalloc (emlen);
+ if (!em)
+ {
+ rc = gpg_err_code_from_syserror ();
+ goto leave;
+ }
+ h = em + emlen - 1 - hlen;
+
+ /* Step 4: Create a salt. */
+ if (saltlen)
+ {
+ if (random_override)
+ {
+ if (random_override_len != saltlen)
+ {
+ rc = GPG_ERR_INV_ARG;
+ goto leave;
+ }
+ memcpy (salt, random_override, saltlen);
+ }
+ else
+ _gcry_randomize (salt, saltlen, GCRY_STRONG_RANDOM);
+ }
+
+ /* Step 5 and 6: M' = Hash(Padding1 || mHash || salt). */
+ memset (buf, 0, 8); /* Padding. */
+ _gcry_md_hash_buffer (algo, h, buf, 8 + hlen + saltlen);
+
+ /* Step 7 and 8: DB = PS || 0x01 || salt. */
+ /* Note that we use EM to store DB and later Xor in-place. */
+ p = em + emlen - 1 - hlen - saltlen - 1;
+ memset (em, 0, p - em);
+ *p++ = 0x01;
+ memcpy (p, salt, saltlen);
+
+ /* Step 9: dbmask = MGF(H, emlen - hlen - 1). */
+ mgf1 (dbmask, emlen - hlen - 1, h, hlen, algo);
+
+ /* Step 10: maskedDB = DB ^ dbMask */
+ for (n = 0, p = dbmask; n < emlen - hlen - 1; n++, p++)
+ em[n] ^= *p;
+
+ /* Step 11: Set the leftmost bits to zero. */
+ em[0] &= 0xFF >> (8 * emlen - nbits);
+
+ /* Step 12: EM = maskedDB || H || 0xbc. */
+ em[emlen-1] = 0xbc;
+
+ /* Convert EM into an MPI. */
+ rc = _gcry_mpi_scan (r_result, GCRYMPI_FMT_USG, em, emlen, NULL);
+ if (!rc && DBG_CIPHER)
+ log_mpidump ("PSS encoded data", *r_result);
+
+ leave:
+ if (em)
+ {
+ wipememory (em, emlen);
+ xfree (em);
+ }
+ if (buf)
+ {
+ wipememory (buf, buflen);
+ xfree (buf);
+ }
+ return rc;
+}
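+
+/* For illustration of the layout produced above: with a 2048 bit key,
+   SHA-1 (HLEN 20) and SALTLEN 20, EMLEN is 256 and
+
+     M' = 8 zero octets || mHash(20) || salt(20)         (48 octets)
+     DB = PS (214 zero octets) || 01 || salt(20)         (235 octets)
+     EM = maskedDB(235) || H(20) || bc                   (256 octets)
+
+   where H = Hash(M') and maskedDB = DB xor MGF1(H, 235).  */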
+
+
+/* Verify a signature assuming PSS padding. VALUE is the hash of the
+ message (mHash) encoded as an MPI; its length must match the digest
+ length of ALGO. ENCODED is the output of the RSA public key
+ function (EM). NBITS is the size of the public key. ALGO is the
+ hash algorithm and SALTLEN is the length of the used salt. The
+ function returns 0 on success or an error code. */
+gpg_err_code_t
+_gcry_rsa_pss_verify (gcry_mpi_t value, gcry_mpi_t encoded,
+ unsigned int nbits, int algo, size_t saltlen)
+{
+ gcry_err_code_t rc = 0;
+ size_t hlen; /* Length of the hash digest. */
+ unsigned char *em = NULL; /* Encoded message. */
+ size_t emlen = (nbits+7)/8; /* Length in bytes of EM. */
+ unsigned char *salt; /* Points into EM. */
+ unsigned char *h; /* Points into EM. */
+ unsigned char *buf = NULL; /* Help buffer. */
+ size_t buflen; /* Length of BUF. */
+ unsigned char *dbmask; /* Points into BUF. */
+ unsigned char *mhash; /* Points into BUF. */
+ unsigned char *p;
+ size_t n;
+
+ /* This code is implemented as described by rfc-3447 9.1.2. */
+
+ /* Get the length of the digest. */
+ hlen = _gcry_md_get_algo_dlen (algo);
+ gcry_assert (hlen); /* We expect a valid ALGO here. */
+
+ /* Allocate a help buffer and setup some pointers.
+ This buffer is used for two purposes:
+ +------------------------------+-------+
+ 1. | dbmask | mHash |
+ +------------------------------+-------+
+ emlen - hlen - 1 hlen
+
+ +----------+-------+---------+-+-------+
+ 2. | padding1 | mHash | salt | | mHash |
+ +----------+-------+---------+-+-------+
+ 8 hlen saltlen hlen
+ */
+ buflen = 8 + hlen + saltlen;
+ if (buflen < emlen - hlen - 1)
+ buflen = emlen - hlen - 1;
+ buflen += hlen;
+ buf = xtrymalloc (buflen);
+ if (!buf)
+ {
+ rc = gpg_err_code_from_syserror ();
+ goto leave;
+ }
+ dbmask = buf;
+ mhash = buf + buflen - hlen;
+
+ /* Step 2: That would be: mHash = Hash(M) but our input is already
+ mHash thus we only need to convert VALUE into MHASH. */
+ rc = octet_string_from_mpi (NULL, mhash, value, hlen);
+ if (rc)
+ goto leave;
+
+ /* Convert the signature into an octet string. */
+ rc = octet_string_from_mpi (&em, NULL, encoded, emlen);
+ if (rc)
+ goto leave;
+
+ /* Step 3: Check length of EM. Because we internally use MPI
+ functions we can't do this properly; EMLEN is always the length
+ of the key because octet_string_from_mpi needs to left pad the
+ result with zero to cope with the fact that our MPIs suppress all
+ leading zeroes. Thus what we test here is merely that the digest
+ and salt lengths fit into the key length. */
+ if (emlen < hlen + saltlen + 2)
+ {
+ rc = GPG_ERR_TOO_SHORT; /* For the hash and saltlen. */
+ goto leave;
+ }
+
+ /* Step 4: Check last octet. */
+ if (em[emlen - 1] != 0xbc)
+ {
+ rc = GPG_ERR_BAD_SIGNATURE;
+ goto leave;
+ }
+
+ /* Step 5: Split EM. */
+ h = em + emlen - 1 - hlen;
+
+ /* Step 6: Check the leftmost bits. */
+ if ((em[0] & ~(0xFF >> (8 * emlen - nbits))))
+ {
+ rc = GPG_ERR_BAD_SIGNATURE;
+ goto leave;
+ }
+
+ /* Step 7: dbmask = MGF(H, emlen - hlen - 1). */
+ mgf1 (dbmask, emlen - hlen - 1, h, hlen, algo);
+
+ /* Step 8: maskedDB = DB ^ dbMask. */
+ for (n = 0, p = dbmask; n < emlen - hlen - 1; n++, p++)
+ em[n] ^= *p;
+
+ /* Step 9: Set leftmost bits in DB to zero. */
+ em[0] &= 0xFF >> (8 * emlen - nbits);
+
+ /* Step 10: Check the padding of DB. */
+ for (n = 0; n < emlen - hlen - saltlen - 2 && !em[n]; n++)
+ ;
+ if (n != emlen - hlen - saltlen - 2 || em[n++] != 1)
+ {
+ rc = GPG_ERR_BAD_SIGNATURE;
+ goto leave;
+ }
+
+ /* Step 11: Extract salt from DB. */
+ salt = em + n;
+
+ /* Step 12: M' = (0x)00 00 00 00 00 00 00 00 || mHash || salt */
+ memset (buf, 0, 8);
+ memcpy (buf+8, mhash, hlen);
+ memcpy (buf+8+hlen, salt, saltlen);
+
+ /* Step 13: H' = Hash(M'). */
+ _gcry_md_hash_buffer (algo, buf, buf, 8 + hlen + saltlen);
+
+ /* Step 14: Check H == H'. */
+ rc = memcmp (h, buf, hlen) ? GPG_ERR_BAD_SIGNATURE : GPG_ERR_NO_ERROR;
+
+ leave:
+ if (em)
+ {
+ wipememory (em, emlen);
+ xfree (em);
+ }
+ if (buf)
+ {
+ wipememory (buf, buflen);
+ xfree (buf);
+ }
+ return rc;
+}
diff --git a/comm/third_party/libgcrypt/cipher/rsa.c b/comm/third_party/libgcrypt/cipher/rsa.c
new file mode 100644
index 0000000000..575ea94924
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/rsa.c
@@ -0,0 +1,2035 @@
+/* rsa.c - RSA implementation
+ * Copyright (C) 1997, 1998, 1999 by Werner Koch (dd9jn)
+ * Copyright (C) 2000, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* This code uses an algorithm protected by U.S. Patent #4,405,829
+ which expired on September 20, 2000. The patent holder placed that
+ patent into the public domain on Sep 6th, 2000.
+*/
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "mpi.h"
+#include "cipher.h"
+#include "pubkey-internal.h"
+
+
+typedef struct
+{
+ gcry_mpi_t n; /* modulus */
+ gcry_mpi_t e; /* exponent */
+} RSA_public_key;
+
+
+typedef struct
+{
+ gcry_mpi_t n; /* public modulus */
+ gcry_mpi_t e; /* public exponent */
+ gcry_mpi_t d; /* exponent */
+ gcry_mpi_t p; /* prime p. */
+ gcry_mpi_t q; /* prime q. */
+ gcry_mpi_t u; /* inverse of p mod q. */
+} RSA_secret_key;
+
+
+static const char *rsa_names[] =
+ {
+ "rsa",
+ "openpgp-rsa",
+ "oid.1.2.840.113549.1.1.1",
+ NULL,
+ };
+
+
+/* A sample 2048 bit RSA key used for the selftests. */
+static const char sample_secret_key[] =
+" (private-key"
+" (rsa"
+" (n #009F56231A3D82E3E7D613D59D53E9AB921BEF9F08A782AED0B6E46ADBC853EC"
+" 7C71C422435A3CD8FA0DB9EFD55CD3295BADC4E8E2E2B94E15AE82866AB8ADE8"
+" 7E469FAE76DC3577DE87F1F419C4EB41123DFAF8D16922D5EDBAD6E9076D5A1C"
+" 958106F0AE5E2E9193C6B49124C64C2A241C4075D4AF16299EB87A6585BAE917"
+" DEF27FCDD165764D069BC18D16527B29DAAB549F7BBED4A7C6A842D203ED6613"
+" 6E2411744E432CD26D940132F25874483DCAEECDFD95744819CBCF1EA810681C"
+" 42907EBCB1C7EAFBE75C87EC32C5413EA10476545D3FC7B2ADB1B66B7F200918"
+" 664B0E5261C2895AA28B0DE321E921B3F877172CCCAB81F43EF98002916156F6CB#)"
+" (e #010001#)"
+" (d #07EF82500C403899934FE993AC5A36F14FF2DF38CF1EF315F205EE4C83EDAA19"
+" 8890FC23DE9AA933CAFB37B6A8A8DBA675411958337287310D3FF2F1DDC0CB93"
+" 7E70F57F75F833C021852B631D2B9A520E4431A03C5C3FCB5742DCD841D9FB12"
+" 771AA1620DCEC3F1583426066ED9DC3F7028C5B59202C88FDF20396E2FA0EC4F"
+" 5A22D9008F3043673931BC14A5046D6327398327900867E39CC61B2D1AFE2F48"
+" EC8E1E3861C68D257D7425F4E6F99ABD77D61F10CA100EFC14389071831B33DD"
+" 69CC8EABEF860D1DC2AAA84ABEAE5DFC91BC124DAF0F4C8EF5BBEA436751DE84"
+" 3A8063E827A024466F44C28614F93B0732A100D4A0D86D532FE1E22C7725E401#)"
+" (p #00C29D438F115825779631CD665A5739367F3E128ADC29766483A46CA80897E0"
+" 79B32881860B8F9A6A04C2614A904F6F2578DAE13EA67CD60AE3D0AA00A1FF9B"
+" 441485E44B2DC3D0B60260FBFE073B5AC72FAF67964DE15C8212C389D20DB9CF"
+" 54AF6AEF5C4196EAA56495DD30CF709F499D5AB30CA35E086C2A1589D6283F1783#)"
+" (q #00D1984135231CB243FE959C0CBEF551EDD986AD7BEDF71EDF447BE3DA27AF46"
+" 79C974A6FA69E4D52FE796650623DE70622862713932AA2FD9F2EC856EAEAA77"
+" 88B4EA6084DC81C902F014829B18EA8B2666EC41586818E0589E18876065F97E"
+" 8D22CE2DA53A05951EC132DCEF41E70A9C35F4ACC268FFAC2ADF54FA1DA110B919#)"
+" (u #67CF0FD7635205DD80FA814EE9E9C267C17376BF3209FB5D1BC42890D2822A04"
+" 479DAF4D5B6ED69D0F8D1AF94164D07F8CD52ECEFE880641FA0F41DDAB1785E4"
+" A37A32F997A516480B4CD4F6482B9466A1765093ED95023CA32D5EDC1E34CEE9"
+" AF595BC51FE43C4BF810FA225AF697FB473B83815966188A4312C048B885E3F7#)))";
+
+/* A sample 2048 bit RSA key used for the selftests (public only). */
+static const char sample_public_key[] =
+" (public-key"
+" (rsa"
+" (n #009F56231A3D82E3E7D613D59D53E9AB921BEF9F08A782AED0B6E46ADBC853EC"
+" 7C71C422435A3CD8FA0DB9EFD55CD3295BADC4E8E2E2B94E15AE82866AB8ADE8"
+" 7E469FAE76DC3577DE87F1F419C4EB41123DFAF8D16922D5EDBAD6E9076D5A1C"
+" 958106F0AE5E2E9193C6B49124C64C2A241C4075D4AF16299EB87A6585BAE917"
+" DEF27FCDD165764D069BC18D16527B29DAAB549F7BBED4A7C6A842D203ED6613"
+" 6E2411744E432CD26D940132F25874483DCAEECDFD95744819CBCF1EA810681C"
+" 42907EBCB1C7EAFBE75C87EC32C5413EA10476545D3FC7B2ADB1B66B7F200918"
+" 664B0E5261C2895AA28B0DE321E921B3F877172CCCAB81F43EF98002916156F6CB#)"
+" (e #010001#)))";
+
+
+static int test_keys (RSA_secret_key *sk, unsigned nbits);
+static int check_secret_key (RSA_secret_key *sk);
+static void public (gcry_mpi_t output, gcry_mpi_t input, RSA_public_key *skey);
+static void secret (gcry_mpi_t output, gcry_mpi_t input, RSA_secret_key *skey);
+static unsigned int rsa_get_nbits (gcry_sexp_t parms);
+
+
+/* Check that a freshly generated key actually works. Returns 0 on success. */
+static int
+test_keys (RSA_secret_key *sk, unsigned int nbits)
+{
+ int result = -1; /* Default to failure. */
+ RSA_public_key pk;
+ gcry_mpi_t plaintext = mpi_new (nbits);
+ gcry_mpi_t ciphertext = mpi_new (nbits);
+ gcry_mpi_t decr_plaintext = mpi_new (nbits);
+ gcry_mpi_t signature = mpi_new (nbits);
+
+ /* Put the relevant parameters into a public key structure. */
+ pk.n = sk->n;
+ pk.e = sk->e;
+
+ /* Create a random plaintext. */
+ _gcry_mpi_randomize (plaintext, nbits, GCRY_WEAK_RANDOM);
+
+ /* Encrypt using the public key. */
+ public (ciphertext, plaintext, &pk);
+
+ /* Check that the cipher text does not match the plaintext. */
+ if (!mpi_cmp (ciphertext, plaintext))
+ goto leave; /* Ciphertext is identical to the plaintext. */
+
+ /* Decrypt using the secret key. */
+ secret (decr_plaintext, ciphertext, sk);
+
+ /* Check that the decrypted plaintext matches the original plaintext. */
+ if (mpi_cmp (decr_plaintext, plaintext))
+ goto leave; /* Plaintext does not match. */
+
+ /* Create another random plaintext as data for signature checking. */
+ _gcry_mpi_randomize (plaintext, nbits, GCRY_WEAK_RANDOM);
+
+ /* Use the RSA secret function to create a signature of the plaintext. */
+ secret (signature, plaintext, sk);
+
+ /* Use the RSA public function to verify this signature. */
+ public (decr_plaintext, signature, &pk);
+ if (mpi_cmp (decr_plaintext, plaintext))
+ goto leave; /* Signature does not match. */
+
+ /* Modify the signature and check that the signing fails. */
+ mpi_add_ui (signature, signature, 1);
+ public (decr_plaintext, signature, &pk);
+ if (!mpi_cmp (decr_plaintext, plaintext))
+ goto leave; /* Signature matches but should not. */
+
+ result = 0; /* All tests succeeded. */
+
+ leave:
+ _gcry_mpi_release (signature);
+ _gcry_mpi_release (decr_plaintext);
+ _gcry_mpi_release (ciphertext);
+ _gcry_mpi_release (plaintext);
+ return result;
+}
+
+
+/* Callback used by the prime generation to test whether the exponent
+ is suitable. Returns 0 if the test has been passed. */
+static int
+check_exponent (void *arg, gcry_mpi_t a)
+{
+ gcry_mpi_t e = arg;
+ gcry_mpi_t tmp;
+ int result;
+
+ mpi_sub_ui (a, a, 1);
+ tmp = _gcry_mpi_alloc_like (a);
+ result = !mpi_gcd(tmp, e, a); /* GCD is not 1. */
+ _gcry_mpi_release (tmp);
+ mpi_add_ui (a, a, 1);
+ return result;
+}
+
+/****************
+ * Generate a key pair with a key of size NBITS.
+ * USE_E = 0 let Libgcrypt decide what exponent to use.
+ *       = 1 request the use of a "secure" exponent; this is required by some
+ *           specification to be 65537.
+ *       > 2 use this public exponent; if the given exponent is not odd,
+ *           one is internally added to it.
+ * TRANSIENT_KEY: If true, generate the primes using the standard RNG.
+ * Returns: SK filled with all needed values.
+ */
+static gpg_err_code_t
+generate_std (RSA_secret_key *sk, unsigned int nbits, unsigned long use_e,
+ int transient_key)
+{
+ gcry_mpi_t p, q; /* the two primes */
+ gcry_mpi_t d; /* the private key */
+ gcry_mpi_t u;
+ gcry_mpi_t t1, t2;
+ gcry_mpi_t n; /* the public key */
+ gcry_mpi_t e; /* the exponent */
+ gcry_mpi_t phi; /* helper: (p-1)(q-1) */
+ gcry_mpi_t g;
+ gcry_mpi_t f;
+ gcry_random_level_t random_level;
+
+ if (fips_mode ())
+ {
+ if (nbits < 1024)
+ return GPG_ERR_INV_VALUE;
+ if (transient_key)
+ return GPG_ERR_INV_VALUE;
+ }
+
+ /* The random quality depends on the transient_key flag. */
+ random_level = transient_key ? GCRY_STRONG_RANDOM : GCRY_VERY_STRONG_RANDOM;
+
+ /* Make sure that nbits is even so that we generate p, q of equal size. */
+ if ( (nbits&1) )
+ nbits++;
+
+ if (use_e == 1) /* Alias for a secure value */
+ use_e = 65537; /* as demanded by Sphinx. */
+
+ /* Public exponent:
+ In general we use 41 as this is quite fast and more secure than the
+ commonly used 17. Benchmarking the RSA verify function
+ with a 1024 bit key yields (2001-11-08):
+ e=17 0.54 ms
+ e=41 0.75 ms
+ e=257 0.95 ms
+ e=65537 1.80 ms
+ */
+ e = mpi_alloc( (32+BITS_PER_MPI_LIMB-1)/BITS_PER_MPI_LIMB );
+ if (!use_e)
+ mpi_set_ui (e, 41); /* This is a reasonable secure and fast value */
+ else
+ {
+ use_e |= 1; /* make sure this is odd */
+ mpi_set_ui (e, use_e);
+ }
+
+ n = mpi_new (nbits);
+
+ p = q = NULL;
+ do
+ {
+ /* select two (very secret) primes */
+ if (p)
+ _gcry_mpi_release (p);
+ if (q)
+ _gcry_mpi_release (q);
+ if (use_e)
+ { /* Do an extra test to ensure that the given exponent is
+ suitable. */
+ p = _gcry_generate_secret_prime (nbits/2, random_level,
+ check_exponent, e);
+ q = _gcry_generate_secret_prime (nbits/2, random_level,
+ check_exponent, e);
+ }
+ else
+ { /* We check the exponent later. */
+ p = _gcry_generate_secret_prime (nbits/2, random_level, NULL, NULL);
+ q = _gcry_generate_secret_prime (nbits/2, random_level, NULL, NULL);
+ }
+ if (mpi_cmp (p, q) > 0 ) /* p shall be smaller than q (for calc of u)*/
+ mpi_swap(p,q);
+ /* calculate the modulus */
+ mpi_mul( n, p, q );
+ }
+ while ( mpi_get_nbits(n) != nbits );
+
+ /* calculate Euler totient: phi = (p-1)(q-1) */
+ t1 = mpi_alloc_secure( mpi_get_nlimbs(p) );
+ t2 = mpi_alloc_secure( mpi_get_nlimbs(p) );
+ phi = mpi_snew ( nbits );
+ g = mpi_snew ( nbits );
+ f = mpi_snew ( nbits );
+ mpi_sub_ui( t1, p, 1 );
+ mpi_sub_ui( t2, q, 1 );
+ mpi_mul( phi, t1, t2 );
+ mpi_gcd (g, t1, t2);
+ mpi_fdiv_q(f, phi, g);
+
+ while (!mpi_gcd(t1, e, phi)) /* (while gcd is not 1) */
+ {
+ if (use_e)
+ BUG (); /* The prime generator already made sure that we
+ never can get to here. */
+ mpi_add_ui (e, e, 2);
+ }
+
+ /* calculate the secret key d = e^-1 mod lcm(p-1,q-1) */
+ d = mpi_snew ( nbits );
+ mpi_invm (d, e, f );
+ /* calculate the inverse of p and q (used for chinese remainder theorem)*/
+ u = mpi_snew ( nbits );
+ mpi_invm(u, p, q );
+
+ if( DBG_CIPHER )
+ {
+ log_mpidump(" p= ", p );
+ log_mpidump(" q= ", q );
+ log_mpidump("phi= ", phi );
+ log_mpidump(" g= ", g );
+ log_mpidump(" f= ", f );
+ log_mpidump(" n= ", n );
+ log_mpidump(" e= ", e );
+ log_mpidump(" d= ", d );
+ log_mpidump(" u= ", u );
+ }
+
+ _gcry_mpi_release (t1);
+ _gcry_mpi_release (t2);
+ _gcry_mpi_release (phi);
+ _gcry_mpi_release (f);
+ _gcry_mpi_release (g);
+
+ sk->n = n;
+ sk->e = e;
+ sk->p = p;
+ sk->q = q;
+ sk->d = d;
+ sk->u = u;
+
+ /* Now we can test our keys. */
+ if (test_keys (sk, nbits - 64))
+ {
+ _gcry_mpi_release (sk->n); sk->n = NULL;
+ _gcry_mpi_release (sk->e); sk->e = NULL;
+ _gcry_mpi_release (sk->p); sk->p = NULL;
+ _gcry_mpi_release (sk->q); sk->q = NULL;
+ _gcry_mpi_release (sk->d); sk->d = NULL;
+ _gcry_mpi_release (sk->u); sk->u = NULL;
+ fips_signal_error ("self-test after key generation failed");
+ return GPG_ERR_SELFTEST_FAILED;
+ }
+
+ return 0;
+}
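+
+/* Toy example of the arithmetic above (far too small for real use):
+   with p = 53, q = 61 and e = 17 we get n = 3233, phi = 52*60 = 3120,
+   g = gcd(52,60) = 4, f = phi/g = 780 and d = e^-1 mod f = 413
+   (17*413 = 7021 = 9*780 + 1); u = p^-1 mod q = 38.  Encrypting
+   m = 65 gives c = 65^17 mod 3233 = 2790, and 2790^413 mod 3233 = 65
+   recovers the plaintext.  */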
+
+
+/****************
+ * Generate a key pair with a key of size NBITS.
+ * USE_E = 0 let Libgcrypt decide what exponent to use.
+ *       = 1 request the use of a "secure" exponent; this is required by some
+ *           specification to be 65537.
+ *       > 2 use this public exponent; if the given exponent is not odd,
+ *           one is internally added to it.
+ * TESTPARMS: If set, do not generate new primes but check whether the
+ *            given p and q are probably prime; the returned key is then
+ *            filled with zeroes so as not to break callers.
+ * TRANSIENT_KEY: If true, generate the primes using the standard RNG.
+ * Returns: SK filled with all needed values.
+ */
+static gpg_err_code_t
+generate_fips (RSA_secret_key *sk, unsigned int nbits, unsigned long use_e,
+ gcry_sexp_t testparms, int transient_key)
+{
+ gcry_mpi_t p, q; /* the two primes */
+ gcry_mpi_t d; /* the private key */
+ gcry_mpi_t u;
+ gcry_mpi_t p1, q1;
+ gcry_mpi_t n; /* the public key */
+ gcry_mpi_t e; /* the exponent */
+ gcry_mpi_t g;
+ gcry_mpi_t minp;
+ gcry_mpi_t diff, mindiff;
+ gcry_random_level_t random_level;
+ unsigned int pbits = nbits/2;
+ unsigned int i;
+ int pqswitch;
+ gpg_err_code_t ec = GPG_ERR_NO_PRIME;
+
+ if (nbits < 1024 || (nbits & 0x1FF))
+ return GPG_ERR_INV_VALUE;
+ if (_gcry_enforced_fips_mode() && nbits != 2048 && nbits != 3072)
+ return GPG_ERR_INV_VALUE;
+
+ /* The random quality depends on the transient_key flag. */
+ random_level = transient_key ? GCRY_STRONG_RANDOM : GCRY_VERY_STRONG_RANDOM;
+
+ if (testparms)
+ {
+ /* Parameters to derive the key are given. */
+ /* Note that we explicitly need to setup the values of tbl
+ because some compilers (e.g. OpenWatcom, IRIX) don't allow to
+ initialize a structure with automatic variables. */
+ struct { const char *name; gcry_mpi_t *value; } tbl[] = {
+ { "e" },
+ { "p" },
+ { "q" },
+ { NULL }
+ };
+ int idx;
+ gcry_sexp_t oneparm;
+
+ tbl[0].value = &e;
+ tbl[1].value = &p;
+ tbl[2].value = &q;
+
+ for (idx=0; tbl[idx].name; idx++)
+ {
+ oneparm = sexp_find_token (testparms, tbl[idx].name, 0);
+ if (oneparm)
+ {
+ *tbl[idx].value = sexp_nth_mpi (oneparm, 1, GCRYMPI_FMT_USG);
+ sexp_release (oneparm);
+ }
+ }
+ for (idx=0; tbl[idx].name; idx++)
+ if (!*tbl[idx].value)
+ break;
+ if (tbl[idx].name)
+ {
+ /* At least one parameter is missing. */
+ for (idx=0; tbl[idx].name; idx++)
+ _gcry_mpi_release (*tbl[idx].value);
+ return GPG_ERR_MISSING_VALUE;
+ }
+ }
+ else
+ {
+ if (use_e < 65537)
+ use_e = 65537; /* This is the smallest value allowed by FIPS */
+
+ e = mpi_alloc ((32+BITS_PER_MPI_LIMB-1)/BITS_PER_MPI_LIMB);
+
+ use_e |= 1; /* make sure this is odd */
+ mpi_set_ui (e, use_e);
+
+ p = mpi_snew (pbits);
+ q = mpi_snew (pbits);
+ }
+
+ n = mpi_new (nbits);
+ d = mpi_snew (nbits);
+ u = mpi_snew (nbits);
+
+ /* prepare approximate minimum p and q */
+ minp = mpi_new (pbits);
+ mpi_set_ui (minp, 0xB504F334);
+ mpi_lshift (minp, minp, pbits - 32);
+
+ /* prepare minimum p and q difference */
+ diff = mpi_new (pbits);
+ mindiff = mpi_new (pbits - 99);
+ mpi_set_ui (mindiff, 1);
+ mpi_lshift (mindiff, mindiff, pbits - 100);
+
+ p1 = mpi_snew (pbits);
+ q1 = mpi_snew (pbits);
+ g = mpi_snew (pbits);
+
+ retry:
+ /* generate p and q */
+ for (i = 0; i < 5 * pbits; i++)
+ {
+ ploop:
+ if (!testparms)
+ {
+ _gcry_mpi_randomize (p, pbits, random_level);
+ }
+ if (mpi_cmp (p, minp) < 0)
+ {
+ if (testparms)
+ goto err;
+ goto ploop;
+ }
+
+ mpi_sub_ui (p1, p, 1);
+ if (mpi_gcd (g, p1, e))
+ {
+ if (_gcry_fips186_4_prime_check (p, pbits) != GPG_ERR_NO_ERROR)
+ {
+ /* not a prime */
+ if (testparms)
+ goto err;
+ }
+ else
+ break;
+ }
+ else if (testparms)
+ goto err;
+ }
+ if (i >= 5 * pbits)
+ goto err;
+
+ for (i = 0; i < 5 * pbits; i++)
+ {
+ qloop:
+ if (!testparms)
+ {
+ _gcry_mpi_randomize (q, pbits, random_level);
+ }
+ if (mpi_cmp (q, minp) < 0)
+ {
+ if (testparms)
+ goto err;
+ goto qloop;
+ }
+ if (mpi_cmp (p, q) > 0)
+ {
+ pqswitch = 1;
+ mpi_sub (diff, p, q);
+ }
+ else
+ {
+ pqswitch = 0;
+ mpi_sub (diff, q, p);
+ }
+ if (mpi_cmp (diff, mindiff) < 0)
+ {
+ if (testparms)
+ goto err;
+ goto qloop;
+ }
+
+ mpi_sub_ui (q1, q, 1);
+ if (mpi_gcd (g, q1, e))
+ {
+ if (_gcry_fips186_4_prime_check (q, pbits) != GPG_ERR_NO_ERROR)
+ {
+ /* not a prime */
+ if (testparms)
+ goto err;
+ }
+ else
+ break;
+ }
+ else if (testparms)
+ goto err;
+ }
+ if (i >= 5 * pbits)
+ goto err;
+
+ if (testparms)
+ {
+ mpi_clear (p);
+ mpi_clear (q);
+ }
+ else
+ {
+ gcry_mpi_t f;
+
+ if (pqswitch)
+ {
+ gcry_mpi_t tmp;
+
+ tmp = p;
+ p = q;
+ q = tmp;
+ }
+
+ f = mpi_snew (nbits);
+
+ /* calculate the modulus */
+ mpi_mul (n, p, q);
+
+ /* calculate the secret key d = e^-1 mod lcm(p-1,q-1) */
+ mpi_gcd (g, p1, q1);
+ mpi_fdiv_q (f, p1, g);
+ mpi_mul (f, f, q1);
+
+ mpi_invm (d, e, f);
+
+ _gcry_mpi_release (f);
+
+ if (mpi_get_nbits (d) < pbits)
+ goto retry;
+
+ /* calculate the inverse of p and q (used for chinese remainder theorem)*/
+ mpi_invm (u, p, q );
+ }
+
+ ec = 0;
+
+ if (DBG_CIPHER)
+ {
+ log_mpidump(" p= ", p );
+ log_mpidump(" q= ", q );
+ log_mpidump(" n= ", n );
+ log_mpidump(" e= ", e );
+ log_mpidump(" d= ", d );
+ log_mpidump(" u= ", u );
+ }
+
+ err:
+
+ _gcry_mpi_release (p1);
+ _gcry_mpi_release (q1);
+ _gcry_mpi_release (g);
+ _gcry_mpi_release (minp);
+ _gcry_mpi_release (mindiff);
+ _gcry_mpi_release (diff);
+
+ sk->n = n;
+ sk->e = e;
+ sk->p = p;
+ sk->q = q;
+ sk->d = d;
+ sk->u = u;
+
+ /* Now we can test our keys. */
+ if (ec || (!testparms && test_keys (sk, nbits - 64)))
+ {
+ _gcry_mpi_release (sk->n); sk->n = NULL;
+ _gcry_mpi_release (sk->e); sk->e = NULL;
+ _gcry_mpi_release (sk->p); sk->p = NULL;
+ _gcry_mpi_release (sk->q); sk->q = NULL;
+ _gcry_mpi_release (sk->d); sk->d = NULL;
+ _gcry_mpi_release (sk->u); sk->u = NULL;
+ if (!ec)
+ {
+ fips_signal_error ("self-test after key generation failed");
+ return GPG_ERR_SELFTEST_FAILED;
+ }
+ }
+
+ return ec;
+}
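+
+/* Note on the constants used above: 0xB504F334 is sqrt(2) scaled to 32
+   bits and rounded up, so MINP approximates sqrt(2)*2^(pbits-1);
+   primes at least that large guarantee that p*q really has NBITS bits.
+   MINDIFF = 2^(pbits-100) enforces the FIPS 186-4 requirement
+   |p - q| > 2^(nbits/2 - 100).  */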
+
+
+/* Helper for generate_x931. */
+static gcry_mpi_t
+gen_x931_parm_xp (unsigned int nbits)
+{
+ gcry_mpi_t xp;
+
+ xp = mpi_snew (nbits);
+ _gcry_mpi_randomize (xp, nbits, GCRY_VERY_STRONG_RANDOM);
+
+ /* The requirement for Xp is:
+
+ sqrt{2}*2^{nbits-1} <= xp <= 2^{nbits} - 1
+
+ We set the two high order bits to 1 to satisfy the lower bound.
+ By using mpi_set_highbit we make sure that the upper bound is
+ satisfied as well. */
+ mpi_set_highbit (xp, nbits-1);
+ mpi_set_bit (xp, nbits-2);
+ gcry_assert ( mpi_get_nbits (xp) == nbits );
+
+ return xp;
+}
+
+
+/* Helper for generate_x931. */
+static gcry_mpi_t
+gen_x931_parm_xi (void)
+{
+ gcry_mpi_t xi;
+
+ xi = mpi_snew (101);
+ _gcry_mpi_randomize (xi, 101, GCRY_VERY_STRONG_RANDOM);
+ mpi_set_highbit (xi, 100);
+ gcry_assert ( mpi_get_nbits (xi) == 101 );
+
+ return xi;
+}
+
+
+
+/* Variant of the standard key generation code using the algorithm
+ from X9.31. Using this algorithm has the advantage that the
+ generation can be made deterministic which is required for CAVS
+ testing. */
+static gpg_err_code_t
+generate_x931 (RSA_secret_key *sk, unsigned int nbits, unsigned long e_value,
+ gcry_sexp_t deriveparms, int *swapped)
+{
+ gcry_mpi_t p, q; /* The two primes. */
+ gcry_mpi_t e; /* The public exponent. */
+ gcry_mpi_t n; /* The public key. */
+ gcry_mpi_t d; /* The private key */
+ gcry_mpi_t u; /* The inverse of p and q. */
+ gcry_mpi_t pm1; /* p - 1 */
+ gcry_mpi_t qm1; /* q - 1 */
+ gcry_mpi_t phi; /* Euler totient. */
+ gcry_mpi_t f, g; /* Helper. */
+
+ *swapped = 0;
+
+ if (e_value == 1) /* Alias for a secure value. */
+ e_value = 65537;
+
+ /* Point 1 of section 4.1: k = 1024 + 256*s with s >= 0 */
+ if (nbits < 1024 || (nbits % 256))
+ return GPG_ERR_INV_VALUE;
+
+ /* Point 2: 2 <= bitlength(e) < 2^{k-2}
+ Note that we do not need to check the upper bound because we use
+ an unsigned long for E and thus there is no way for E to reach
+ that limit. */
+ if (e_value < 3)
+ return GPG_ERR_INV_VALUE;
+
+ /* Our implementation requires E to be odd. */
+ if (!(e_value & 1))
+ return GPG_ERR_INV_VALUE;
+
+ /* Point 3: e > 0, or e = 0 if it is to be randomly generated.
+ We support only a fixed E and thus there is no need for an extra test. */
+
+
+ /* Compute or extract the derive parameters. */
+ {
+ gcry_mpi_t xp1 = NULL;
+ gcry_mpi_t xp2 = NULL;
+ gcry_mpi_t xp = NULL;
+ gcry_mpi_t xq1 = NULL;
+ gcry_mpi_t xq2 = NULL;
+ gcry_mpi_t xq = NULL;
+ gcry_mpi_t tmpval;
+
+ if (!deriveparms)
+ {
+ /* Not given: Generate them. */
+ xp = gen_x931_parm_xp (nbits/2);
+ /* Make sure that |xp - xq| > 2^{nbits - 100} holds. */
+ tmpval = mpi_snew (nbits/2);
+ do
+ {
+ _gcry_mpi_release (xq);
+ xq = gen_x931_parm_xp (nbits/2);
+ mpi_sub (tmpval, xp, xq);
+ }
+ while (mpi_get_nbits (tmpval) <= (nbits/2 - 100));
+ _gcry_mpi_release (tmpval);
+
+ xp1 = gen_x931_parm_xi ();
+ xp2 = gen_x931_parm_xi ();
+ xq1 = gen_x931_parm_xi ();
+ xq2 = gen_x931_parm_xi ();
+
+ }
+ else
+ {
+ /* Parameters to derive the key are given. */
+ /* Note that we explicitly need to setup the values of tbl
+ because some compilers (e.g. OpenWatcom, IRIX) don't allow
+ to initialize a structure with automatic variables. */
+ struct { const char *name; gcry_mpi_t *value; } tbl[] = {
+ { "Xp1" },
+ { "Xp2" },
+ { "Xp" },
+ { "Xq1" },
+ { "Xq2" },
+ { "Xq" },
+ { NULL }
+ };
+ int idx;
+ gcry_sexp_t oneparm;
+
+ tbl[0].value = &xp1;
+ tbl[1].value = &xp2;
+ tbl[2].value = &xp;
+ tbl[3].value = &xq1;
+ tbl[4].value = &xq2;
+ tbl[5].value = &xq;
+
+ for (idx=0; tbl[idx].name; idx++)
+ {
+ oneparm = sexp_find_token (deriveparms, tbl[idx].name, 0);
+ if (oneparm)
+ {
+ *tbl[idx].value = sexp_nth_mpi (oneparm, 1, GCRYMPI_FMT_USG);
+ sexp_release (oneparm);
+ }
+ }
+ for (idx=0; tbl[idx].name; idx++)
+ if (!*tbl[idx].value)
+ break;
+ if (tbl[idx].name)
+ {
+ /* At least one parameter is missing. */
+ for (idx=0; tbl[idx].name; idx++)
+ _gcry_mpi_release (*tbl[idx].value);
+ return GPG_ERR_MISSING_VALUE;
+ }
+ }
+
+ e = mpi_alloc_set_ui (e_value);
+
+ /* Find two prime numbers. */
+ p = _gcry_derive_x931_prime (xp, xp1, xp2, e, NULL, NULL);
+ q = _gcry_derive_x931_prime (xq, xq1, xq2, e, NULL, NULL);
+ _gcry_mpi_release (xp); xp = NULL;
+ _gcry_mpi_release (xp1); xp1 = NULL;
+ _gcry_mpi_release (xp2); xp2 = NULL;
+ _gcry_mpi_release (xq); xq = NULL;
+ _gcry_mpi_release (xq1); xq1 = NULL;
+ _gcry_mpi_release (xq2); xq2 = NULL;
+ if (!p || !q)
+ {
+ _gcry_mpi_release (p);
+ _gcry_mpi_release (q);
+ _gcry_mpi_release (e);
+ return GPG_ERR_NO_PRIME;
+ }
+ }
+
+
+ /* Compute the public modulus. We make sure that p is smaller than
+ q to allow the use of the CRT. */
+ if (mpi_cmp (p, q) > 0 )
+ {
+ mpi_swap (p, q);
+ *swapped = 1;
+ }
+ n = mpi_new (nbits);
+ mpi_mul (n, p, q);
+
+ /* Compute the Euler totient: phi = (p-1)(q-1) */
+ pm1 = mpi_snew (nbits/2);
+ qm1 = mpi_snew (nbits/2);
+ phi = mpi_snew (nbits);
+ mpi_sub_ui (pm1, p, 1);
+ mpi_sub_ui (qm1, q, 1);
+ mpi_mul (phi, pm1, qm1);
+
+ g = mpi_snew (nbits);
+ gcry_assert (mpi_gcd (g, e, phi));
+
+ /* Compute: f = lcm(p-1,q-1) = phi / gcd(p-1,q-1) */
+ mpi_gcd (g, pm1, qm1);
+ f = pm1; pm1 = NULL;
+ _gcry_mpi_release (qm1); qm1 = NULL;
+ mpi_fdiv_q (f, phi, g);
+ _gcry_mpi_release (phi); phi = NULL;
+ d = g; g = NULL;
+ /* Compute the secret key: d = e^{-1} mod lcm(p-1,q-1) */
+ mpi_invm (d, e, f);
+
+ /* Compute the inverse of p and q. */
+ u = f; f = NULL;
+ mpi_invm (u, p, q );
+
+ if( DBG_CIPHER )
+ {
+ if (*swapped)
+ log_debug ("p and q are swapped\n");
+ log_mpidump(" p", p );
+ log_mpidump(" q", q );
+ log_mpidump(" n", n );
+ log_mpidump(" e", e );
+ log_mpidump(" d", d );
+ log_mpidump(" u", u );
+ }
+
+
+ sk->n = n;
+ sk->e = e;
+ sk->p = p;
+ sk->q = q;
+ sk->d = d;
+ sk->u = u;
+
+ /* Now we can test our keys. */
+ if (test_keys (sk, nbits - 64))
+ {
+ _gcry_mpi_release (sk->n); sk->n = NULL;
+ _gcry_mpi_release (sk->e); sk->e = NULL;
+ _gcry_mpi_release (sk->p); sk->p = NULL;
+ _gcry_mpi_release (sk->q); sk->q = NULL;
+ _gcry_mpi_release (sk->d); sk->d = NULL;
+ _gcry_mpi_release (sk->u); sk->u = NULL;
+ fips_signal_error ("self-test after key generation failed");
+ return GPG_ERR_SELFTEST_FAILED;
+ }
+
+ return 0;
+}
+
+
+/****************
+ * Test whether the secret key is valid.
+ * Returns: true if this is a valid key.
+ */
+static int
+check_secret_key( RSA_secret_key *sk )
+{
+ int rc;
+ gcry_mpi_t temp = mpi_alloc( mpi_get_nlimbs(sk->p)*2 );
+
+ mpi_mul(temp, sk->p, sk->q );
+ rc = mpi_cmp( temp, sk->n );
+ mpi_free(temp);
+ return !rc;
+}
+
+
+
+/****************
+ * Public key operation. Encrypt INPUT with PKEY and put result into OUTPUT.
+ *
+ * c = m^e mod n
+ *
+ * Where c is OUTPUT, m is INPUT and e,n are elements of PKEY.
+ */
+static void
+public(gcry_mpi_t output, gcry_mpi_t input, RSA_public_key *pkey )
+{
+ if( output == input ) /* powm doesn't like output and input the same */
+ {
+ gcry_mpi_t x = mpi_alloc( mpi_get_nlimbs(input)*2 );
+ mpi_powm( x, input, pkey->e, pkey->n );
+ mpi_set(output, x);
+ mpi_free(x);
+ }
+ else
+ mpi_powm( output, input, pkey->e, pkey->n );
+}
+
+#if 0
+static void
+stronger_key_check ( RSA_secret_key *skey )
+{
+ gcry_mpi_t t = mpi_alloc_secure ( 0 );
+ gcry_mpi_t t1 = mpi_alloc_secure ( 0 );
+ gcry_mpi_t t2 = mpi_alloc_secure ( 0 );
+ gcry_mpi_t phi = mpi_alloc_secure ( 0 );
+
+ /* check that n == p * q */
+ mpi_mul( t, skey->p, skey->q);
+ if (mpi_cmp( t, skey->n) )
+ log_info ( "RSA Oops: n != p * q\n" );
+
+ /* check that p is less than q */
+ if( mpi_cmp( skey->p, skey->q ) > 0 )
+ {
+ log_info ("RSA Oops: p >= q - fixed\n");
+ _gcry_mpi_swap ( skey->p, skey->q);
+ }
+
+ /* check that e divides neither p-1 nor q-1 */
+ mpi_sub_ui(t, skey->p, 1 );
+ mpi_fdiv_r(t, t, skey->e );
+ if ( !mpi_cmp_ui( t, 0) )
+ log_info ( "RSA Oops: e divides p-1\n" );
+ mpi_sub_ui(t, skey->q, 1 );
+ mpi_fdiv_r(t, t, skey->e );
+ if ( !mpi_cmp_ui( t, 0) )
+ log_info ( "RSA Oops: e divides q-1\n" );
+
+ /* check that d is correct */
+ mpi_sub_ui( t1, skey->p, 1 );
+ mpi_sub_ui( t2, skey->q, 1 );
+ mpi_mul( phi, t1, t2 );
+ gcry_mpi_gcd(t, t1, t2);
+ mpi_fdiv_q(t, phi, t);
+ mpi_invm(t, skey->e, t );
+ if ( mpi_cmp(t, skey->d ) )
+ {
+ log_info ( "RSA Oops: d is wrong - fixed\n");
+ mpi_set (skey->d, t);
+ log_printmpi (" fixed d", skey->d);
+ }
+
+ /* check for correctness of u */
+ mpi_invm(t, skey->p, skey->q );
+ if ( mpi_cmp(t, skey->u ) )
+ {
+ log_info ( "RSA Oops: u is wrong - fixed\n");
+ mpi_set (skey->u, t);
+ log_printmpi (" fixed u", skey->u);
+ }
+
+ log_info ( "RSA secret key check finished\n");
+
+ mpi_free (t);
+ mpi_free (t1);
+ mpi_free (t2);
+ mpi_free (phi);
+}
+#endif
+
+
+
+/* Secret key operation - standard version.
+ *
+ * m = c^d mod n
+ */
+static void
+secret_core_std (gcry_mpi_t M, gcry_mpi_t C,
+ gcry_mpi_t D, gcry_mpi_t N)
+{
+ mpi_powm (M, C, D, N);
+}
+
+
+/* Secret key operation - using the CRT.
+ *
+ * m1 = c ^ (d mod (p-1)) mod p
+ * m2 = c ^ (d mod (q-1)) mod q
+ * h = u * (m2 - m1) mod q
+ * m = m1 + h * p
+ */
+static void
+secret_core_crt (gcry_mpi_t M, gcry_mpi_t C,
+ gcry_mpi_t D, unsigned int Nlimbs,
+ gcry_mpi_t P, gcry_mpi_t Q, gcry_mpi_t U)
+{
+ gcry_mpi_t m1 = mpi_alloc_secure ( Nlimbs + 1 );
+ gcry_mpi_t m2 = mpi_alloc_secure ( Nlimbs + 1 );
+ gcry_mpi_t h = mpi_alloc_secure ( Nlimbs + 1 );
+ gcry_mpi_t D_blind = mpi_alloc_secure ( Nlimbs + 1 );
+ gcry_mpi_t r;
+ unsigned int r_nbits;
+
+ r_nbits = mpi_get_nbits (P) / 4;
+ if (r_nbits < 96)
+ r_nbits = 96;
+ r = mpi_secure_new (r_nbits);
+
+ /* d_blind = (d mod (p-1)) + (p-1) * r */
+ /* m1 = c ^ d_blind mod p */
+ _gcry_mpi_randomize (r, r_nbits, GCRY_WEAK_RANDOM);
+ mpi_set_highbit (r, r_nbits - 1);
+ mpi_sub_ui ( h, P, 1 );
+ mpi_mul ( D_blind, h, r );
+ mpi_fdiv_r ( h, D, h );
+ mpi_add ( D_blind, D_blind, h );
+ mpi_powm ( m1, C, D_blind, P );
+
+ /* d_blind = (d mod (q-1)) + (q-1) * r */
+ /* m2 = c ^ d_blind mod q */
+ _gcry_mpi_randomize (r, r_nbits, GCRY_WEAK_RANDOM);
+ mpi_set_highbit (r, r_nbits - 1);
+ mpi_sub_ui ( h, Q, 1 );
+ mpi_mul ( D_blind, h, r );
+ mpi_fdiv_r ( h, D, h );
+ mpi_add ( D_blind, D_blind, h );
+ mpi_powm ( m2, C, D_blind, Q );
+
+ mpi_free ( r );
+ mpi_free ( D_blind );
+
+ /* h = u * ( m2 - m1 ) mod q */
+ mpi_sub ( h, m2, m1 );
+ if ( mpi_has_sign ( h ) )
+ mpi_add ( h, h, Q );
+ mpi_mulm ( h, U, h, Q );
+
+ /* m = m1 + h * p */
+ mpi_mul ( h, h, P );
+ mpi_add ( M, m1, h );
+
+ mpi_free ( h );
+ mpi_free ( m1 );
+ mpi_free ( m2 );
+}
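+
+/* Worked example (illustrative, not part of libgcrypt): with the toy key
+ * p=61, q=53, e=17, d=2753 and the ciphertext c=2790 (encryption of
+ * m=65), the CRT steps above compute
+ *
+ *   d mod (p-1) = 53  ->  m1 = c^53 mod 61 = 4     (= 65 mod 61)
+ *   d mod (q-1) = 49  ->  m2 = c^49 mod 53 = 12    (= 65 mod 53)
+ *   u = p^-1 mod q = 20,  h = u*(m2 - m1) mod q = 20*8 mod 53 = 1
+ *   m = m1 + h*p = 4 + 1*61 = 65.
+ *
+ * The exponent blinding above (adding a random multiple of p-1 resp. q-1
+ * to the exponent) leaves these results unchanged, because x^(p-1) is
+ * 1 modulo p for any x not divisible by p.  */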
+
+
+/* Secret key operation.
+ * Encrypt INPUT with SKEY and put result into
+ * OUTPUT. SKEY has the secret key parameters.
+ */
+static void
+secret (gcry_mpi_t output, gcry_mpi_t input, RSA_secret_key *skey )
+{
+ /* Remove superfluous leading zeroes from INPUT. */
+ mpi_normalize (input);
+
+ if (!skey->p || !skey->q || !skey->u)
+ {
+ secret_core_std (output, input, skey->d, skey->n);
+ }
+ else
+ {
+ secret_core_crt (output, input, skey->d, mpi_get_nlimbs (skey->n),
+ skey->p, skey->q, skey->u);
+ }
+}
+
+
+static void
+secret_blinded (gcry_mpi_t output, gcry_mpi_t input,
+ RSA_secret_key *sk, unsigned int nbits)
+{
+ gcry_mpi_t r; /* Random number needed for blinding. */
+ gcry_mpi_t ri; /* Modular multiplicative inverse of r. */
+ gcry_mpi_t bldata; /* Blinded data to decrypt. */
+
+ /* First, we need a random number r between 0 and n - 1, which is
+ * relatively prime to n (i.e. it is neither p nor q). The random
+ * number only needs to be unpredictable, thus we employ the
+ * gcry_create_nonce function by using GCRY_WEAK_RANDOM with
+ * gcry_mpi_randomize. */
+ r = mpi_snew (nbits);
+ ri = mpi_snew (nbits);
+ bldata = mpi_snew (nbits);
+
+ do
+ {
+ _gcry_mpi_randomize (r, nbits, GCRY_WEAK_RANDOM);
+ mpi_mod (r, r, sk->n);
+ }
+ while (!mpi_invm (ri, r, sk->n));
+
+ /* Do blinding. We calculate: y = (x * r^e) mod n, where r is the
+ * random number, e is the public exponent, x is the non-blinded
+ * input data and n is the RSA modulus. */
+ mpi_powm (bldata, r, sk->e, sk->n);
+ mpi_mulm (bldata, bldata, input, sk->n);
+
+ /* Perform decryption. */
+ secret (output, bldata, sk);
+ _gcry_mpi_release (bldata);
+
+ /* Undo blinding. Here we calculate: y = (x * r^-1) mod n, where x
+ * is the blinded decrypted data, ri is the modular multiplicative
+ * inverse of r and n is the RSA modulus. */
+ mpi_mulm (output, output, ri, sk->n);
+
+ _gcry_mpi_release (r);
+ _gcry_mpi_release (ri);
+}
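+
+/* Worked example (illustrative, not part of libgcrypt): base blinding
+ * with the toy key p=61, q=53 (n=3233, e=17, d=2753), the ciphertext
+ * c=2790 (encryption of m=65) and the blinding factor r=2:
+ *
+ *   r^e mod n       = 2^17 mod 3233        = 1752
+ *   blinded input   = c * 1752 mod 3233    = 3017
+ *   blinded result  = 3017^2753 mod 3233   = 130   (= m * r mod n)
+ *   final result    = 130 * r^-1 mod 3233  = 130 * 1617 mod 3233 = 65
+ *
+ * The secret exponentiation thus never operates directly on the
+ * attacker-supplied value, which is what defeats the timing attack
+ * mentioned below in rsa_decrypt.  */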
+
+
+/*********************************************
+ ************** interface ******************
+ *********************************************/
+
+static gcry_err_code_t
+rsa_generate (const gcry_sexp_t genparms, gcry_sexp_t *r_skey)
+{
+ gpg_err_code_t ec;
+ unsigned int nbits;
+ unsigned long evalue;
+ RSA_secret_key sk;
+ gcry_sexp_t deriveparms;
+ int flags = 0;
+ gcry_sexp_t l1;
+ gcry_sexp_t swap_info = NULL;
+
+ memset (&sk, 0, sizeof sk);
+
+ ec = _gcry_pk_util_get_nbits (genparms, &nbits);
+ if (ec)
+ return ec;
+
+ ec = _gcry_pk_util_get_rsa_use_e (genparms, &evalue);
+ if (ec)
+ return ec;
+
+ /* Parse the optional flags list. */
+ l1 = sexp_find_token (genparms, "flags", 0);
+ if (l1)
+ {
+ ec = _gcry_pk_util_parse_flaglist (l1, &flags, NULL);
+ sexp_release (l1);
+ if (ec)
+ return ec;
+ }
+
+ deriveparms = (genparms?
+ sexp_find_token (genparms, "derive-parms", 0) : NULL);
+ if (!deriveparms)
+ {
+ /* Parse the optional "use-x931" flag. */
+ l1 = sexp_find_token (genparms, "use-x931", 0);
+ if (l1)
+ {
+ flags |= PUBKEY_FLAG_USE_X931;
+ sexp_release (l1);
+ }
+ }
+
+ if (deriveparms || (flags & PUBKEY_FLAG_USE_X931))
+ {
+ int swapped;
+ ec = generate_x931 (&sk, nbits, evalue, deriveparms, &swapped);
+ sexp_release (deriveparms);
+ if (!ec && swapped)
+ ec = sexp_new (&swap_info, "(misc-key-info(p-q-swapped))", 0, 1);
+ }
+ else
+ {
+ /* Parse the optional "transient-key" flag. */
+ if (!(flags & PUBKEY_FLAG_TRANSIENT_KEY))
+ {
+ l1 = sexp_find_token (genparms, "transient-key", 0);
+ if (l1)
+ {
+ flags |= PUBKEY_FLAG_TRANSIENT_KEY;
+ sexp_release (l1);
+ }
+ }
+ deriveparms = (genparms? sexp_find_token (genparms, "test-parms", 0)
+ /**/ : NULL);
+
+ /* Generate. */
+ if (deriveparms || fips_mode())
+ {
+ ec = generate_fips (&sk, nbits, evalue, deriveparms,
+ !!(flags & PUBKEY_FLAG_TRANSIENT_KEY));
+ }
+ else
+ {
+ ec = generate_std (&sk, nbits, evalue,
+ !!(flags & PUBKEY_FLAG_TRANSIENT_KEY));
+ }
+ sexp_release (deriveparms);
+ }
+
+ if (!ec)
+ {
+ ec = sexp_build (r_skey, NULL,
+ "(key-data"
+ " (public-key"
+ " (rsa(n%m)(e%m)))"
+ " (private-key"
+ " (rsa(n%m)(e%m)(d%m)(p%m)(q%m)(u%m)))"
+ " %S)",
+ sk.n, sk.e,
+ sk.n, sk.e, sk.d, sk.p, sk.q, sk.u,
+ swap_info);
+ }
+
+ mpi_free (sk.n);
+ mpi_free (sk.e);
+ mpi_free (sk.p);
+ mpi_free (sk.q);
+ mpi_free (sk.d);
+ mpi_free (sk.u);
+ sexp_release (swap_info);
+
+ return ec;
+}
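+
+/* Caller-side sketch (illustrative, not part of libgcrypt): how the
+ * generation code above is typically reached through the public API.
+ * The genparms S-expression follows the libgcrypt manual; treat the
+ * exact token names as an assumption and check the documentation.  */
+#if 0
+#include <gcrypt.h>
+
+static gcry_sexp_t
+toy_generate_rsa_keypair (void)
+{
+  gcry_sexp_t parms = NULL;
+  gcry_sexp_t keypair = NULL;
+
+  /* 2048-bit key with the default public exponent.  */
+  if (!gcry_sexp_build (&parms, NULL, "(genkey (rsa (nbits 4:2048)))")
+      && !gcry_pk_genkey (&keypair, parms))
+    {
+      /* KEYPAIR now holds the "(key-data (public-key ...)
+       * (private-key ...))" structure built by rsa_generate above.  */
+    }
+  gcry_sexp_release (parms);
+  return keypair;
+}
+#endif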
+
+
+static gcry_err_code_t
+rsa_check_secret_key (gcry_sexp_t keyparms)
+{
+ gcry_err_code_t rc;
+ RSA_secret_key sk = {NULL, NULL, NULL, NULL, NULL, NULL};
+
+ /* To check the key we need the optional parameters. */
+ rc = sexp_extract_param (keyparms, NULL, "nedpqu",
+ &sk.n, &sk.e, &sk.d, &sk.p, &sk.q, &sk.u,
+ NULL);
+ if (rc)
+ goto leave;
+
+ if (!check_secret_key (&sk))
+ rc = GPG_ERR_BAD_SECKEY;
+
+ leave:
+ _gcry_mpi_release (sk.n);
+ _gcry_mpi_release (sk.e);
+ _gcry_mpi_release (sk.d);
+ _gcry_mpi_release (sk.p);
+ _gcry_mpi_release (sk.q);
+ _gcry_mpi_release (sk.u);
+ if (DBG_CIPHER)
+ log_debug ("rsa_testkey => %s\n", gpg_strerror (rc));
+ return rc;
+}
+
+
+static gcry_err_code_t
+rsa_encrypt (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
+{
+ gcry_err_code_t rc;
+ struct pk_encoding_ctx ctx;
+ gcry_mpi_t data = NULL;
+ RSA_public_key pk = {NULL, NULL};
+ gcry_mpi_t ciph = NULL;
+
+ _gcry_pk_util_init_encoding_ctx (&ctx, PUBKEY_OP_ENCRYPT,
+ rsa_get_nbits (keyparms));
+
+ /* Extract the data. */
+ rc = _gcry_pk_util_data_to_mpi (s_data, &data, &ctx);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ log_mpidump ("rsa_encrypt data", data);
+ if (!data || mpi_is_opaque (data))
+ {
+ rc = GPG_ERR_INV_DATA;
+ goto leave;
+ }
+
+ /* Extract the key. */
+ rc = sexp_extract_param (keyparms, NULL, "ne", &pk.n, &pk.e, NULL);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ {
+ log_mpidump ("rsa_encrypt n", pk.n);
+ log_mpidump ("rsa_encrypt e", pk.e);
+ }
+
+ /* Do RSA computation and build result. */
+ ciph = mpi_new (0);
+ public (ciph, data, &pk);
+ if (DBG_CIPHER)
+ log_mpidump ("rsa_encrypt res", ciph);
+ if ((ctx.flags & PUBKEY_FLAG_FIXEDLEN))
+ {
+ /* We need to make sure to return the correct length to avoid
+ problems with missing leading zeroes. */
+ unsigned char *em;
+ size_t emlen = (mpi_get_nbits (pk.n)+7)/8;
+
+ rc = _gcry_mpi_to_octet_string (&em, NULL, ciph, emlen);
+ if (!rc)
+ {
+ rc = sexp_build (r_ciph, NULL, "(enc-val(rsa(a%b)))", (int)emlen, em);
+ xfree (em);
+ }
+ }
+ else
+ rc = sexp_build (r_ciph, NULL, "(enc-val(rsa(a%m)))", ciph);
+
+ leave:
+ _gcry_mpi_release (ciph);
+ _gcry_mpi_release (pk.n);
+ _gcry_mpi_release (pk.e);
+ _gcry_mpi_release (data);
+ _gcry_pk_util_free_encoding_ctx (&ctx);
+ if (DBG_CIPHER)
+ log_debug ("rsa_encrypt => %s\n", gpg_strerror (rc));
+ return rc;
+}
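+
+/* Caller-side sketch (illustrative, not part of libgcrypt): encrypting
+ * through the public API ends up in rsa_encrypt above.  The data
+ * S-expression mirrors the "(data (flags raw) (value %s))" form used by
+ * the self-test further below; PUBLIC_KEY is assumed to be a
+ * "(public-key (rsa (n ...)(e ...)))" S-expression.  */
+#if 0
+#include <gcrypt.h>
+
+static gcry_error_t
+toy_rsa_encrypt (gcry_sexp_t *r_ciph, const char *msg, gcry_sexp_t public_key)
+{
+  gcry_sexp_t data = NULL;
+  gcry_error_t err;
+
+  err = gcry_sexp_build (&data, NULL, "(data (flags raw) (value %s))", msg);
+  if (!err)
+    {
+      /* On success *R_CIPH holds "(enc-val (rsa (a ...)))".  */
+      err = gcry_pk_encrypt (r_ciph, data, public_key);
+      gcry_sexp_release (data);
+    }
+  return err;
+}
+#endif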
+
+
+static gcry_err_code_t
+rsa_decrypt (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
+{
+ gpg_err_code_t rc;
+ struct pk_encoding_ctx ctx;
+ gcry_sexp_t l1 = NULL;
+ gcry_mpi_t data = NULL;
+ RSA_secret_key sk = {NULL, NULL, NULL, NULL, NULL, NULL};
+ gcry_mpi_t plain = NULL;
+ unsigned char *unpad = NULL;
+ size_t unpadlen = 0;
+
+ _gcry_pk_util_init_encoding_ctx (&ctx, PUBKEY_OP_DECRYPT,
+ rsa_get_nbits (keyparms));
+
+ /* Extract the data. */
+ rc = _gcry_pk_util_preparse_encval (s_data, rsa_names, &l1, &ctx);
+ if (rc)
+ goto leave;
+ rc = sexp_extract_param (l1, NULL, "a", &data, NULL);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ log_printmpi ("rsa_decrypt data", data);
+ if (mpi_is_opaque (data))
+ {
+ rc = GPG_ERR_INV_DATA;
+ goto leave;
+ }
+
+ /* Extract the key. */
+ rc = sexp_extract_param (keyparms, NULL, "nedp?q?u?",
+ &sk.n, &sk.e, &sk.d, &sk.p, &sk.q, &sk.u,
+ NULL);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ {
+ log_printmpi ("rsa_decrypt n", sk.n);
+ log_printmpi ("rsa_decrypt e", sk.e);
+ if (!fips_mode ())
+ {
+ log_printmpi ("rsa_decrypt d", sk.d);
+ log_printmpi ("rsa_decrypt p", sk.p);
+ log_printmpi ("rsa_decrypt q", sk.q);
+ log_printmpi ("rsa_decrypt u", sk.u);
+ }
+ }
+
+ /* Better make sure that there are no superfluous leading zeroes in
+ the input and it has not been "padded" using multiples of N.
+ This mitigates side-channel attacks (CVE-2013-4576). */
+ mpi_normalize (data);
+ mpi_fdiv_r (data, data, sk.n);
+
+ /* Allocate MPI for the plaintext. */
+ plain = mpi_snew (ctx.nbits);
+
+ /* We use blinding by default to mitigate timing attacks which can
+ be practically mounted over the network as shown by Brumley and
+ Boneh in 2003. */
+ if ((ctx.flags & PUBKEY_FLAG_NO_BLINDING))
+ secret (plain, data, &sk);
+ else
+ secret_blinded (plain, data, &sk, ctx.nbits);
+
+ if (DBG_CIPHER)
+ log_printmpi ("rsa_decrypt res", plain);
+
+ /* Reverse the encoding and build the s-expression. */
+ switch (ctx.encoding)
+ {
+ case PUBKEY_ENC_PKCS1:
+ rc = _gcry_rsa_pkcs1_decode_for_enc (&unpad, &unpadlen, ctx.nbits, plain);
+ mpi_free (plain);
+ plain = NULL;
+ if (!rc)
+ rc = sexp_build (r_plain, NULL, "(value %b)", (int)unpadlen, unpad);
+ break;
+
+ case PUBKEY_ENC_OAEP:
+ rc = _gcry_rsa_oaep_decode (&unpad, &unpadlen,
+ ctx.nbits, ctx.hash_algo,
+ plain, ctx.label, ctx.labellen);
+ mpi_free (plain);
+ plain = NULL;
+ if (!rc)
+ rc = sexp_build (r_plain, NULL, "(value %b)", (int)unpadlen, unpad);
+ break;
+
+ default:
+ /* Raw format. For backward compatibility we need to assume a
+ signed mpi by using the sexp format string "%m". */
+ rc = sexp_build (r_plain, NULL,
+ (ctx.flags & PUBKEY_FLAG_LEGACYRESULT)
+ ? "%m":"(value %m)", plain);
+ break;
+ }
+
+ leave:
+ xfree (unpad);
+ _gcry_mpi_release (plain);
+ _gcry_mpi_release (sk.n);
+ _gcry_mpi_release (sk.e);
+ _gcry_mpi_release (sk.d);
+ _gcry_mpi_release (sk.p);
+ _gcry_mpi_release (sk.q);
+ _gcry_mpi_release (sk.u);
+ _gcry_mpi_release (data);
+ sexp_release (l1);
+ _gcry_pk_util_free_encoding_ctx (&ctx);
+ if (DBG_CIPHER)
+ log_debug ("rsa_decrypt => %s\n", gpg_strerror (rc));
+ return rc;
+}
+
+
+static gcry_err_code_t
+rsa_sign (gcry_sexp_t *r_sig, gcry_sexp_t s_data, gcry_sexp_t keyparms)
+{
+ gpg_err_code_t rc;
+ struct pk_encoding_ctx ctx;
+ gcry_mpi_t data = NULL;
+ RSA_secret_key sk = {NULL, NULL, NULL, NULL, NULL, NULL};
+ RSA_public_key pk;
+ gcry_mpi_t sig = NULL;
+ gcry_mpi_t result = NULL;
+
+ _gcry_pk_util_init_encoding_ctx (&ctx, PUBKEY_OP_SIGN,
+ rsa_get_nbits (keyparms));
+
+ /* Extract the data. */
+ rc = _gcry_pk_util_data_to_mpi (s_data, &data, &ctx);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ log_printmpi ("rsa_sign data", data);
+ if (mpi_is_opaque (data))
+ {
+ rc = GPG_ERR_INV_DATA;
+ goto leave;
+ }
+
+ /* Extract the key. */
+ rc = sexp_extract_param (keyparms, NULL, "nedp?q?u?",
+ &sk.n, &sk.e, &sk.d, &sk.p, &sk.q, &sk.u,
+ NULL);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ {
+ log_printmpi ("rsa_sign n", sk.n);
+ log_printmpi ("rsa_sign e", sk.e);
+ if (!fips_mode ())
+ {
+ log_printmpi ("rsa_sign d", sk.d);
+ log_printmpi ("rsa_sign p", sk.p);
+ log_printmpi ("rsa_sign q", sk.q);
+ log_printmpi ("rsa_sign u", sk.u);
+ }
+ }
+
+ /* Do RSA computation. */
+ sig = mpi_new (0);
+ if ((ctx.flags & PUBKEY_FLAG_NO_BLINDING))
+ secret (sig, data, &sk);
+ else
+ secret_blinded (sig, data, &sk, ctx.nbits);
+ if (DBG_CIPHER)
+ log_printmpi ("rsa_sign res", sig);
+
+ /* Check that the created signature is good. This detects a CRT
+ failure, which would otherwise enable Lenstra's fault attack on
+ RSA's use of the CRT. */
+ result = mpi_new (0);
+ pk.n = sk.n;
+ pk.e = sk.e;
+ public (result, sig, &pk);
+ if (mpi_cmp (result, data))
+ {
+ rc = GPG_ERR_BAD_SIGNATURE;
+ goto leave;
+ }
+
+ /* Convert the result. */
+ if ((ctx.flags & PUBKEY_FLAG_FIXEDLEN))
+ {
+ /* We need to make sure to return the correct length to avoid
+ problems with missing leading zeroes. */
+ unsigned char *em;
+ size_t emlen = (mpi_get_nbits (sk.n)+7)/8;
+
+ rc = _gcry_mpi_to_octet_string (&em, NULL, sig, emlen);
+ if (!rc)
+ {
+ rc = sexp_build (r_sig, NULL, "(sig-val(rsa(s%b)))", (int)emlen, em);
+ xfree (em);
+ }
+ }
+ else
+ rc = sexp_build (r_sig, NULL, "(sig-val(rsa(s%M)))", sig);
+
+
+ leave:
+ _gcry_mpi_release (result);
+ _gcry_mpi_release (sig);
+ _gcry_mpi_release (sk.n);
+ _gcry_mpi_release (sk.e);
+ _gcry_mpi_release (sk.d);
+ _gcry_mpi_release (sk.p);
+ _gcry_mpi_release (sk.q);
+ _gcry_mpi_release (sk.u);
+ _gcry_mpi_release (data);
+ _gcry_pk_util_free_encoding_ctx (&ctx);
+ if (DBG_CIPHER)
+ log_debug ("rsa_sign => %s\n", gpg_strerror (rc));
+ return rc;
+}
+
+
+static gcry_err_code_t
+rsa_verify (gcry_sexp_t s_sig, gcry_sexp_t s_data, gcry_sexp_t keyparms)
+{
+ gcry_err_code_t rc;
+ struct pk_encoding_ctx ctx;
+ gcry_sexp_t l1 = NULL;
+ gcry_mpi_t sig = NULL;
+ gcry_mpi_t data = NULL;
+ RSA_public_key pk = { NULL, NULL };
+ gcry_mpi_t result = NULL;
+
+ _gcry_pk_util_init_encoding_ctx (&ctx, PUBKEY_OP_VERIFY,
+ rsa_get_nbits (keyparms));
+
+ /* Extract the data. */
+ rc = _gcry_pk_util_data_to_mpi (s_data, &data, &ctx);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ log_printmpi ("rsa_verify data", data);
+ if (mpi_is_opaque (data))
+ {
+ rc = GPG_ERR_INV_DATA;
+ goto leave;
+ }
+
+ /* Extract the signature value. */
+ rc = _gcry_pk_util_preparse_sigval (s_sig, rsa_names, &l1, NULL);
+ if (rc)
+ goto leave;
+ rc = sexp_extract_param (l1, NULL, "s", &sig, NULL);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ log_printmpi ("rsa_verify sig", sig);
+
+ /* Extract the key. */
+ rc = sexp_extract_param (keyparms, NULL, "ne", &pk.n, &pk.e, NULL);
+ if (rc)
+ goto leave;
+ if (DBG_CIPHER)
+ {
+ log_printmpi ("rsa_verify n", pk.n);
+ log_printmpi ("rsa_verify e", pk.e);
+ }
+
+ /* Do RSA computation and compare. */
+ result = mpi_new (0);
+ public (result, sig, &pk);
+ if (DBG_CIPHER)
+ log_printmpi ("rsa_verify cmp", result);
+ if (ctx.verify_cmp)
+ rc = ctx.verify_cmp (&ctx, result);
+ else
+ rc = mpi_cmp (result, data) ? GPG_ERR_BAD_SIGNATURE : 0;
+
+ leave:
+ _gcry_mpi_release (result);
+ _gcry_mpi_release (pk.n);
+ _gcry_mpi_release (pk.e);
+ _gcry_mpi_release (data);
+ _gcry_mpi_release (sig);
+ sexp_release (l1);
+ _gcry_pk_util_free_encoding_ctx (&ctx);
+ if (DBG_CIPHER)
+ log_debug ("rsa_verify => %s\n", rc?gpg_strerror (rc):"Good");
+ return rc;
+}
+
+
+
+/* Return the number of bits for the key described by PARMS. On error
+ * 0 is returned. The format of PARMS starts with the algorithm name;
+ * for example:
+ *
+ * (rsa
+ * (n <mpi>)
+ * (e <mpi>))
+ *
+ * More parameters may be given but we only need N here.
+ */
+static unsigned int
+rsa_get_nbits (gcry_sexp_t parms)
+{
+ gcry_sexp_t l1;
+ gcry_mpi_t n;
+ unsigned int nbits;
+
+ l1 = sexp_find_token (parms, "n", 1);
+ if (!l1)
+ return 0; /* Parameter N not found. */
+
+ n = sexp_nth_mpi (l1, 1, GCRYMPI_FMT_USG);
+ sexp_release (l1);
+ nbits = n? mpi_get_nbits (n) : 0;
+ _gcry_mpi_release (n);
+ return nbits;
+}
+
+
+/* Compute a keygrip. MD is the hash context which we are going to
+ update. KEYPARAM is an S-expression with the key parameters, this
+ is usually a public key but may also be a secret key. An example
+ of such an S-expression is:
+
+ (rsa
+ (n #00B...#)
+ (e #010001#))
+
+ PKCS-15 says that for RSA only the modulus should be hashed -
+ however, it is not clear whether this is meant to use the raw bytes
+ (assuming this is an unsigned integer) or whether the leading zero
+ byte required by DER should be prefixed. We hash the raw bytes. */
+static gpg_err_code_t
+compute_keygrip (gcry_md_hd_t md, gcry_sexp_t keyparam)
+{
+ gcry_sexp_t l1;
+ const char *data;
+ size_t datalen;
+
+ l1 = sexp_find_token (keyparam, "n", 1);
+ if (!l1)
+ return GPG_ERR_NO_OBJ;
+
+ data = sexp_nth_data (l1, 1, &datalen);
+ if (!data)
+ {
+ sexp_release (l1);
+ return GPG_ERR_NO_OBJ;
+ }
+
+ _gcry_md_write (md, data, datalen);
+ sexp_release (l1);
+
+ return 0;
+}
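+
+/* Caller-side sketch (illustrative, not part of libgcrypt): the keygrip
+ * computed above is normally obtained via gcry_pk_get_keygrip, which
+ * fills a 20-byte buffer (the documented keygrip length).  */
+#if 0
+#include <stdio.h>
+#include <gcrypt.h>
+
+static void
+toy_print_keygrip (gcry_sexp_t rsa_key)
+{
+  unsigned char grip[20];
+  int i;
+
+  if (gcry_pk_get_keygrip (rsa_key, grip))
+    {
+      for (i = 0; i < 20; i++)
+        printf ("%02x", grip[i]);
+      putchar ('\n');
+    }
+}
+#endif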
+
+
+
+
+/*
+ Self-test section.
+ */
+
+static const char *
+selftest_sign_2048 (gcry_sexp_t pkey, gcry_sexp_t skey)
+{
+ static const char sample_data[] =
+ "(data (flags pkcs1)"
+ " (hash sha256 #11223344556677889900aabbccddeeff"
+ /**/ "102030405060708090a0b0c0d0f01121#))";
+ static const char sample_data_bad[] =
+ "(data (flags pkcs1)"
+ " (hash sha256 #11223344556677889900aabbccddeeff"
+ /**/ "802030405060708090a0b0c0d0f01121#))";
+
+ const char *errtxt = NULL;
+ gcry_error_t err;
+ gcry_sexp_t data = NULL;
+ gcry_sexp_t data_bad = NULL;
+ gcry_sexp_t sig = NULL;
+ /* raw signature data reference */
+ const char ref_data[] =
+ "6252a19a11e1d5155ed9376036277193d644fa239397fff03e9b92d6f86415d6"
+ "d30da9273775f290e580d038295ff8ff89522becccfa6ae870bf76b76df402a8"
+ "54f69347e3db3de8e1e7d4dada281ec556810c7a8ecd0b5f51f9b1c0e7aa7557"
+ "61aa2b8ba5f811304acc6af0eca41fe49baf33bf34eddaf44e21e036ac7f0b68"
+ "03cdef1c60021fb7b5b97ebacdd88ab755ce29af568dbc5728cc6e6eff42618d"
+ "62a0386ca8beed46402bdeeef29b6a3feded906bace411a06a39192bf516ae10"
+ "67e4320fa8ea113968525f4574d022a3ceeaafdc41079efe1f22cc94bf59d8d3"
+ "328085da9674857db56de5978a62394aab48aa3b72e23a1b16260cfd9daafe65";
+ gcry_mpi_t ref_mpi = NULL;
+ gcry_mpi_t sig_mpi = NULL;
+
+ err = sexp_sscan (&data, NULL, sample_data, strlen (sample_data));
+ if (!err)
+ err = sexp_sscan (&data_bad, NULL,
+ sample_data_bad, strlen (sample_data_bad));
+ if (err)
+ {
+ errtxt = "converting data failed";
+ goto leave;
+ }
+
+ err = _gcry_pk_sign (&sig, data, skey);
+ if (err)
+ {
+ errtxt = "signing failed";
+ goto leave;
+ }
+
+ err = _gcry_mpi_scan(&ref_mpi, GCRYMPI_FMT_HEX, ref_data, 0, NULL);
+ if (err)
+ {
+ errtxt = "converting ref_data to mpi failed";
+ goto leave;
+ }
+
+ err = _gcry_sexp_extract_param(sig, "sig-val!rsa", "s", &sig_mpi, NULL);
+ if (err)
+ {
+ errtxt = "extracting signature data failed";
+ goto leave;
+ }
+
+ if (mpi_cmp (sig_mpi, ref_mpi))
+ {
+ errtxt = "signature does not match reference data";
+ goto leave;
+ }
+
+ err = _gcry_pk_verify (sig, data, pkey);
+ if (err)
+ {
+ errtxt = "verify failed";
+ goto leave;
+ }
+ err = _gcry_pk_verify (sig, data_bad, pkey);
+ if (gcry_err_code (err) != GPG_ERR_BAD_SIGNATURE)
+ {
+ errtxt = "bad signature not detected";
+ goto leave;
+ }
+
+
+ leave:
+ sexp_release (sig);
+ sexp_release (data_bad);
+ sexp_release (data);
+ _gcry_mpi_release (ref_mpi);
+ _gcry_mpi_release (sig_mpi);
+ return errtxt;
+}
+
+
+
+/* Given an S-expression ENCR_DATA of the form:
+
+ (enc-val
+ (rsa
+ (a a-value)))
+
+ as returned by gcry_pk_encrypt, return the A-VALUE. On error,
+ return NULL. */
+static gcry_mpi_t
+extract_a_from_sexp (gcry_sexp_t encr_data)
+{
+ gcry_sexp_t l1, l2, l3;
+ gcry_mpi_t a_value;
+
+ l1 = sexp_find_token (encr_data, "enc-val", 0);
+ if (!l1)
+ return NULL;
+ l2 = sexp_find_token (l1, "rsa", 0);
+ sexp_release (l1);
+ if (!l2)
+ return NULL;
+ l3 = sexp_find_token (l2, "a", 0);
+ sexp_release (l2);
+ if (!l3)
+ return NULL;
+ a_value = sexp_nth_mpi (l3, 1, 0);
+ sexp_release (l3);
+
+ return a_value;
+}
+
+
+static const char *
+selftest_encr_2048 (gcry_sexp_t pkey, gcry_sexp_t skey)
+{
+ const char *errtxt = NULL;
+ gcry_error_t err;
+ static const char plaintext[] =
+ "Jim quickly realized that the beautiful gowns are expensive.";
+ gcry_sexp_t plain = NULL;
+ gcry_sexp_t encr = NULL;
+ gcry_mpi_t ciphertext = NULL;
+ gcry_sexp_t decr = NULL;
+ char *decr_plaintext = NULL;
+ gcry_sexp_t tmplist = NULL;
+ /* expected result of encrypting the plaintext with sample_secret_key */
+ static const char ref_data[] =
+ "18022e2593a402a737caaa93b4c7e750e20ca265452980e1d6b7710fbd3e"
+ "7dce72be5c2110fb47691cb38f42170ee3b4a37f2498d4a51567d762585e"
+ "4cb81d04fbc7df4144f8e5eac2d4b8688521b64011f11d7ad53f4c874004"
+ "819856f2e2a6f83d1c9c4e73ac26089789c14482b0b8d44139133c88c4a5"
+ "2dba9dd6d6ffc622666b7d129168333d999706af30a2d7d272db7734e5ed"
+ "fb8c64ea3018af3ad20f4a013a5060cb0f5e72753967bebe294280a6ed0d"
+ "dbd3c4f11d0a8696e9d32a0dc03deb0b5e49b2cbd1503392642d4e1211f3"
+ "e8e2ee38abaa3671ccd57fcde8ca76e85fd2cb77c35706a970a213a27352"
+ "cec92a9604d543ddb5fc478ff50e0622";
+ gcry_mpi_t ref_mpi = NULL;
+
+ /* Put the plaintext into an S-expression. */
+ err = sexp_build (&plain, NULL, "(data (flags raw) (value %s))", plaintext);
+ if (err)
+ {
+ errtxt = "converting data failed";
+ goto leave;
+ }
+
+ /* Encrypt. */
+ err = _gcry_pk_encrypt (&encr, plain, pkey);
+ if (err)
+ {
+ errtxt = "encrypt failed";
+ goto leave;
+ }
+
+ err = _gcry_mpi_scan(&ref_mpi, GCRYMPI_FMT_HEX, ref_data, 0, NULL);
+ if (err)
+ {
+ errtxt = "converting encrydata to mpi failed";
+ goto leave;
+ }
+
+ /* Extract the ciphertext from the returned S-expression. */
+ /*sexp_dump (encr);*/
+ ciphertext = extract_a_from_sexp (encr);
+ if (!ciphertext)
+ {
+ errtxt = "gcry_pk_decrypt returned garbage";
+ goto leave;
+ }
+
+ /* Check that the ciphertext matches the reference data. */
+ /* _gcry_log_printmpi ("plaintext", plaintext); */
+ /* _gcry_log_printmpi ("ciphertxt", ciphertext); */
+ if (mpi_cmp (ref_mpi, ciphertext))
+ {
+ errtxt = "ciphertext doesn't match reference data";
+ goto leave;
+ }
+
+ /* Decrypt. */
+ err = _gcry_pk_decrypt (&decr, encr, skey);
+ if (err)
+ {
+ errtxt = "decrypt failed";
+ goto leave;
+ }
+
+ /* Extract the decrypted data from the S-expression. Note that the
+ output of gcry_pk_decrypt depends on whether a flags list occurs
+ in its input data. Because we passed the output of
+ gcry_pk_encrypt directly to gcry_pk_decrypt, such a flag value
+ won't be there as of today. To be prepared for future changes we
+ take care of it anyway. */
+ tmplist = sexp_find_token (decr, "value", 0);
+ if (tmplist)
+ decr_plaintext = sexp_nth_string (tmplist, 1);
+ else
+ decr_plaintext = sexp_nth_string (decr, 0);
+ if (!decr_plaintext)
+ {
+ errtxt = "decrypt returned no plaintext";
+ goto leave;
+ }
+
+ /* Check that the decrypted plaintext matches the original plaintext. */
+ if (strcmp (plaintext, decr_plaintext))
+ {
+ errtxt = "mismatch";
+ goto leave;
+ }
+
+ leave:
+ sexp_release (tmplist);
+ xfree (decr_plaintext);
+ sexp_release (decr);
+ _gcry_mpi_release (ciphertext);
+ _gcry_mpi_release (ref_mpi);
+ sexp_release (encr);
+ sexp_release (plain);
+ return errtxt;
+}
+
+
+static gpg_err_code_t
+selftests_rsa (selftest_report_func_t report)
+{
+ const char *what;
+ const char *errtxt;
+ gcry_error_t err;
+ gcry_sexp_t skey = NULL;
+ gcry_sexp_t pkey = NULL;
+
+ /* Convert the S-expressions into the internal representation. */
+ what = "convert";
+ err = sexp_sscan (&skey, NULL, sample_secret_key, strlen (sample_secret_key));
+ if (!err)
+ err = sexp_sscan (&pkey, NULL,
+ sample_public_key, strlen (sample_public_key));
+ if (err)
+ {
+ errtxt = _gcry_strerror (err);
+ goto failed;
+ }
+
+ what = "key consistency";
+ err = _gcry_pk_testkey (skey);
+ if (err)
+ {
+ errtxt = _gcry_strerror (err);
+ goto failed;
+ }
+
+ what = "sign";
+ errtxt = selftest_sign_2048 (pkey, skey);
+ if (errtxt)
+ goto failed;
+
+ what = "encrypt";
+ errtxt = selftest_encr_2048 (pkey, skey);
+ if (errtxt)
+ goto failed;
+
+ sexp_release (pkey);
+ sexp_release (skey);
+ return 0; /* Succeeded. */
+
+ failed:
+ sexp_release (pkey);
+ sexp_release (skey);
+ if (report)
+ report ("pubkey", GCRY_PK_RSA, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+/* Run a full self-test for ALGO and return 0 on success. */
+static gpg_err_code_t
+run_selftests (int algo, int extended, selftest_report_func_t report)
+{
+ gpg_err_code_t ec;
+
+ (void)extended;
+
+ switch (algo)
+ {
+ case GCRY_PK_RSA:
+ ec = selftests_rsa (report);
+ break;
+ default:
+ ec = GPG_ERR_PUBKEY_ALGO;
+ break;
+
+ }
+ return ec;
+}
+
+
+
+
+gcry_pk_spec_t _gcry_pubkey_spec_rsa =
+ {
+ GCRY_PK_RSA, { 0, 1 },
+ (GCRY_PK_USAGE_SIGN | GCRY_PK_USAGE_ENCR),
+ "RSA", rsa_names,
+ "ne", "nedpqu", "a", "s", "n",
+ rsa_generate,
+ rsa_check_secret_key,
+ rsa_encrypt,
+ rsa_decrypt,
+ rsa_sign,
+ rsa_verify,
+ rsa_get_nbits,
+ run_selftests,
+ compute_keygrip
+ };
diff --git a/comm/third_party/libgcrypt/cipher/salsa20-amd64.S b/comm/third_party/libgcrypt/cipher/salsa20-amd64.S
new file mode 100644
index 0000000000..ae8f27155a
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/salsa20-amd64.S
@@ -0,0 +1,940 @@
+/* salsa20-amd64.S - AMD64 implementation of Salsa20
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on public domain implementation by D. J. Bernstein at
+ * http://cr.yp.to/snuffle.html
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SALSA20)
+
+#include "asm-common-amd64.h"
+
+.text
+
+.align 8
+.globl _gcry_salsa20_amd64_keysetup
+ELF(.type _gcry_salsa20_amd64_keysetup,@function;)
+_gcry_salsa20_amd64_keysetup:
+ CFI_STARTPROC();
+ movl 0(%rsi),%r8d
+ movl 4(%rsi),%r9d
+ movl 8(%rsi),%eax
+ movl 12(%rsi),%r10d
+ movl %r8d,20(%rdi)
+ movl %r9d,40(%rdi)
+ movl %eax,60(%rdi)
+ movl %r10d,48(%rdi)
+ cmp $256,%rdx
+ jb .L_kbits128
+.L_kbits256:
+ movl 16(%rsi),%edx
+ movl 20(%rsi),%ecx
+ movl 24(%rsi),%r8d
+ movl 28(%rsi),%esi
+ movl %edx,28(%rdi)
+ movl %ecx,16(%rdi)
+ movl %r8d,36(%rdi)
+ movl %esi,56(%rdi)
+ mov $1634760805,%rsi
+ mov $857760878,%rdx
+ mov $2036477234,%rcx
+ mov $1797285236,%r8
+ movl %esi,0(%rdi)
+ movl %edx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %r8d,12(%rdi)
+ jmp .L_keysetupdone
+.L_kbits128:
+ movl 0(%rsi),%edx
+ movl 4(%rsi),%ecx
+ movl 8(%rsi),%r8d
+ movl 12(%rsi),%esi
+ movl %edx,28(%rdi)
+ movl %ecx,16(%rdi)
+ movl %r8d,36(%rdi)
+ movl %esi,56(%rdi)
+ mov $1634760805,%rsi
+ mov $824206446,%rdx
+ mov $2036477238,%rcx
+ mov $1797285236,%r8
+ movl %esi,0(%rdi)
+ movl %edx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %r8d,12(%rdi)
+.L_keysetupdone:
+ ret
+ CFI_ENDPROC();
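+
+/* Note (illustrative addition, not in the original source): the
+ * immediates loaded above are the Salsa20 constants read as
+ * little-endian 32-bit words: 1634760805, 857760878, 2036477234 and
+ * 1797285236 spell "expa", "nd 3", "2-by", "te k" (sigma, 256-bit
+ * keys), while 824206446 and 2036477238 replace the two middle words
+ * with "nd 1", "6-by" (tau, 128-bit keys). */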
+
+.align 8
+.globl _gcry_salsa20_amd64_ivsetup
+ELF(.type _gcry_salsa20_amd64_ivsetup,@function;)
+_gcry_salsa20_amd64_ivsetup:
+ CFI_STARTPROC();
+ movl 0(%rsi),%r8d
+ movl 4(%rsi),%esi
+ mov $0,%r9
+ mov $0,%rax
+ movl %r8d,24(%rdi)
+ movl %esi,44(%rdi)
+ movl %r9d,32(%rdi)
+ movl %eax,52(%rdi)
+ ret
+ CFI_ENDPROC();
+
+.align 8
+.globl _gcry_salsa20_amd64_encrypt_blocks
+ELF(.type _gcry_salsa20_amd64_encrypt_blocks,@function;)
+_gcry_salsa20_amd64_encrypt_blocks:
+ /*
+ * Modifications to original implementation:
+ * - Number of rounds is passed in register %r8 (for Salsa20/12).
+ * - Length is input as number of blocks, so don't handle tail bytes
+ * (this is done in salsa20.c).
+ */
+ CFI_STARTPROC();
+ push %rbx
+ CFI_PUSH(%rbx);
+ shlq $6, %rcx /* blocks to bytes */
+ mov %r8, %rbx
+ mov %rsp,%r11
+ CFI_DEF_CFA_REGISTER(%r11);
+ sub $384,%rsp
+ and $~31,%rsp
+ mov %rdi,%r8
+ mov %rsi,%rsi
+ mov %rdx,%rdi
+ mov %rcx,%rdx
+ cmp $0,%rdx
+ jbe .L_done
+.L_start:
+ cmp $256,%rdx
+ jb .L_bytes_are_64_128_or_192
+ movdqa 0(%r8),%xmm0
+ pshufd $0x55,%xmm0,%xmm1
+ pshufd $0xaa,%xmm0,%xmm2
+ pshufd $0xff,%xmm0,%xmm3
+ pshufd $0x00,%xmm0,%xmm0
+ movdqa %xmm1,0(%rsp)
+ movdqa %xmm2,16(%rsp)
+ movdqa %xmm3,32(%rsp)
+ movdqa %xmm0,48(%rsp)
+ movdqa 16(%r8),%xmm0
+ pshufd $0xaa,%xmm0,%xmm1
+ pshufd $0xff,%xmm0,%xmm2
+ pshufd $0x00,%xmm0,%xmm3
+ pshufd $0x55,%xmm0,%xmm0
+ movdqa %xmm1,64(%rsp)
+ movdqa %xmm2,80(%rsp)
+ movdqa %xmm3,96(%rsp)
+ movdqa %xmm0,112(%rsp)
+ movdqa 32(%r8),%xmm0
+ pshufd $0xff,%xmm0,%xmm1
+ pshufd $0x55,%xmm0,%xmm2
+ pshufd $0xaa,%xmm0,%xmm0
+ movdqa %xmm1,128(%rsp)
+ movdqa %xmm2,144(%rsp)
+ movdqa %xmm0,160(%rsp)
+ movdqa 48(%r8),%xmm0
+ pshufd $0x00,%xmm0,%xmm1
+ pshufd $0xaa,%xmm0,%xmm2
+ pshufd $0xff,%xmm0,%xmm0
+ movdqa %xmm1,176(%rsp)
+ movdqa %xmm2,192(%rsp)
+ movdqa %xmm0,208(%rsp)
+.L_bytesatleast256:
+ movl 32(%r8),%ecx
+ movl 52(%r8),%r9d
+ movl %ecx,224(%rsp)
+ movl %r9d,240(%rsp)
+ add $1,%ecx
+ adc $0,%r9d
+ movl %ecx,4+224(%rsp)
+ movl %r9d,4+240(%rsp)
+ add $1,%ecx
+ adc $0,%r9d
+ movl %ecx,8+224(%rsp)
+ movl %r9d,8+240(%rsp)
+ add $1,%ecx
+ adc $0,%r9d
+ movl %ecx,12+224(%rsp)
+ movl %r9d,12+240(%rsp)
+ add $1,%ecx
+ adc $0,%r9d
+ movl %ecx,32(%r8)
+ movl %r9d,52(%r8)
+ movq %rdx,288(%rsp)
+ mov %rbx,%rdx
+ movdqa 0(%rsp),%xmm0
+ movdqa 16(%rsp),%xmm1
+ movdqa 32(%rsp),%xmm2
+ movdqa 192(%rsp),%xmm3
+ movdqa 208(%rsp),%xmm4
+ movdqa 64(%rsp),%xmm5
+ movdqa 80(%rsp),%xmm6
+ movdqa 112(%rsp),%xmm7
+ movdqa 128(%rsp),%xmm8
+ movdqa 144(%rsp),%xmm9
+ movdqa 160(%rsp),%xmm10
+ movdqa 240(%rsp),%xmm11
+ movdqa 48(%rsp),%xmm12
+ movdqa 96(%rsp),%xmm13
+ movdqa 176(%rsp),%xmm14
+ movdqa 224(%rsp),%xmm15
+.L_mainloop1:
+ movdqa %xmm1,256(%rsp)
+ movdqa %xmm2,272(%rsp)
+ movdqa %xmm13,%xmm1
+ paddd %xmm12,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $7,%xmm1
+ pxor %xmm1,%xmm14
+ psrld $25,%xmm2
+ pxor %xmm2,%xmm14
+ movdqa %xmm7,%xmm1
+ paddd %xmm0,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $7,%xmm1
+ pxor %xmm1,%xmm11
+ psrld $25,%xmm2
+ pxor %xmm2,%xmm11
+ movdqa %xmm12,%xmm1
+ paddd %xmm14,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $9,%xmm1
+ pxor %xmm1,%xmm15
+ psrld $23,%xmm2
+ pxor %xmm2,%xmm15
+ movdqa %xmm0,%xmm1
+ paddd %xmm11,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $9,%xmm1
+ pxor %xmm1,%xmm9
+ psrld $23,%xmm2
+ pxor %xmm2,%xmm9
+ movdqa %xmm14,%xmm1
+ paddd %xmm15,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $13,%xmm1
+ pxor %xmm1,%xmm13
+ psrld $19,%xmm2
+ pxor %xmm2,%xmm13
+ movdqa %xmm11,%xmm1
+ paddd %xmm9,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $13,%xmm1
+ pxor %xmm1,%xmm7
+ psrld $19,%xmm2
+ pxor %xmm2,%xmm7
+ movdqa %xmm15,%xmm1
+ paddd %xmm13,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $18,%xmm1
+ pxor %xmm1,%xmm12
+ psrld $14,%xmm2
+ pxor %xmm2,%xmm12
+ movdqa 256(%rsp),%xmm1
+ movdqa %xmm12,256(%rsp)
+ movdqa %xmm9,%xmm2
+ paddd %xmm7,%xmm2
+ movdqa %xmm2,%xmm12
+ pslld $18,%xmm2
+ pxor %xmm2,%xmm0
+ psrld $14,%xmm12
+ pxor %xmm12,%xmm0
+ movdqa %xmm5,%xmm2
+ paddd %xmm1,%xmm2
+ movdqa %xmm2,%xmm12
+ pslld $7,%xmm2
+ pxor %xmm2,%xmm3
+ psrld $25,%xmm12
+ pxor %xmm12,%xmm3
+ movdqa 272(%rsp),%xmm2
+ movdqa %xmm0,272(%rsp)
+ movdqa %xmm6,%xmm0
+ paddd %xmm2,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $7,%xmm0
+ pxor %xmm0,%xmm4
+ psrld $25,%xmm12
+ pxor %xmm12,%xmm4
+ movdqa %xmm1,%xmm0
+ paddd %xmm3,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $9,%xmm0
+ pxor %xmm0,%xmm10
+ psrld $23,%xmm12
+ pxor %xmm12,%xmm10
+ movdqa %xmm2,%xmm0
+ paddd %xmm4,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $9,%xmm0
+ pxor %xmm0,%xmm8
+ psrld $23,%xmm12
+ pxor %xmm12,%xmm8
+ movdqa %xmm3,%xmm0
+ paddd %xmm10,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $13,%xmm0
+ pxor %xmm0,%xmm5
+ psrld $19,%xmm12
+ pxor %xmm12,%xmm5
+ movdqa %xmm4,%xmm0
+ paddd %xmm8,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $13,%xmm0
+ pxor %xmm0,%xmm6
+ psrld $19,%xmm12
+ pxor %xmm12,%xmm6
+ movdqa %xmm10,%xmm0
+ paddd %xmm5,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $18,%xmm0
+ pxor %xmm0,%xmm1
+ psrld $14,%xmm12
+ pxor %xmm12,%xmm1
+ movdqa 256(%rsp),%xmm0
+ movdqa %xmm1,256(%rsp)
+ movdqa %xmm4,%xmm1
+ paddd %xmm0,%xmm1
+ movdqa %xmm1,%xmm12
+ pslld $7,%xmm1
+ pxor %xmm1,%xmm7
+ psrld $25,%xmm12
+ pxor %xmm12,%xmm7
+ movdqa %xmm8,%xmm1
+ paddd %xmm6,%xmm1
+ movdqa %xmm1,%xmm12
+ pslld $18,%xmm1
+ pxor %xmm1,%xmm2
+ psrld $14,%xmm12
+ pxor %xmm12,%xmm2
+ movdqa 272(%rsp),%xmm12
+ movdqa %xmm2,272(%rsp)
+ movdqa %xmm14,%xmm1
+ paddd %xmm12,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $7,%xmm1
+ pxor %xmm1,%xmm5
+ psrld $25,%xmm2
+ pxor %xmm2,%xmm5
+ movdqa %xmm0,%xmm1
+ paddd %xmm7,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $9,%xmm1
+ pxor %xmm1,%xmm10
+ psrld $23,%xmm2
+ pxor %xmm2,%xmm10
+ movdqa %xmm12,%xmm1
+ paddd %xmm5,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $9,%xmm1
+ pxor %xmm1,%xmm8
+ psrld $23,%xmm2
+ pxor %xmm2,%xmm8
+ movdqa %xmm7,%xmm1
+ paddd %xmm10,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $13,%xmm1
+ pxor %xmm1,%xmm4
+ psrld $19,%xmm2
+ pxor %xmm2,%xmm4
+ movdqa %xmm5,%xmm1
+ paddd %xmm8,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $13,%xmm1
+ pxor %xmm1,%xmm14
+ psrld $19,%xmm2
+ pxor %xmm2,%xmm14
+ movdqa %xmm10,%xmm1
+ paddd %xmm4,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $18,%xmm1
+ pxor %xmm1,%xmm0
+ psrld $14,%xmm2
+ pxor %xmm2,%xmm0
+ movdqa 256(%rsp),%xmm1
+ movdqa %xmm0,256(%rsp)
+ movdqa %xmm8,%xmm0
+ paddd %xmm14,%xmm0
+ movdqa %xmm0,%xmm2
+ pslld $18,%xmm0
+ pxor %xmm0,%xmm12
+ psrld $14,%xmm2
+ pxor %xmm2,%xmm12
+ movdqa %xmm11,%xmm0
+ paddd %xmm1,%xmm0
+ movdqa %xmm0,%xmm2
+ pslld $7,%xmm0
+ pxor %xmm0,%xmm6
+ psrld $25,%xmm2
+ pxor %xmm2,%xmm6
+ movdqa 272(%rsp),%xmm2
+ movdqa %xmm12,272(%rsp)
+ movdqa %xmm3,%xmm0
+ paddd %xmm2,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $7,%xmm0
+ pxor %xmm0,%xmm13
+ psrld $25,%xmm12
+ pxor %xmm12,%xmm13
+ movdqa %xmm1,%xmm0
+ paddd %xmm6,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $9,%xmm0
+ pxor %xmm0,%xmm15
+ psrld $23,%xmm12
+ pxor %xmm12,%xmm15
+ movdqa %xmm2,%xmm0
+ paddd %xmm13,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $9,%xmm0
+ pxor %xmm0,%xmm9
+ psrld $23,%xmm12
+ pxor %xmm12,%xmm9
+ movdqa %xmm6,%xmm0
+ paddd %xmm15,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $13,%xmm0
+ pxor %xmm0,%xmm11
+ psrld $19,%xmm12
+ pxor %xmm12,%xmm11
+ movdqa %xmm13,%xmm0
+ paddd %xmm9,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $13,%xmm0
+ pxor %xmm0,%xmm3
+ psrld $19,%xmm12
+ pxor %xmm12,%xmm3
+ movdqa %xmm15,%xmm0
+ paddd %xmm11,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $18,%xmm0
+ pxor %xmm0,%xmm1
+ psrld $14,%xmm12
+ pxor %xmm12,%xmm1
+ movdqa %xmm9,%xmm0
+ paddd %xmm3,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $18,%xmm0
+ pxor %xmm0,%xmm2
+ psrld $14,%xmm12
+ pxor %xmm12,%xmm2
+ movdqa 256(%rsp),%xmm12
+ movdqa 272(%rsp),%xmm0
+ sub $2,%rdx
+ ja .L_mainloop1
+ paddd 48(%rsp),%xmm12
+ paddd 112(%rsp),%xmm7
+ paddd 160(%rsp),%xmm10
+ paddd 208(%rsp),%xmm4
+ movd %xmm12,%rdx
+ movd %xmm7,%rcx
+ movd %xmm10,%r9
+ movd %xmm4,%rax
+ pshufd $0x39,%xmm12,%xmm12
+ pshufd $0x39,%xmm7,%xmm7
+ pshufd $0x39,%xmm10,%xmm10
+ pshufd $0x39,%xmm4,%xmm4
+ xorl 0(%rsi),%edx
+ xorl 4(%rsi),%ecx
+ xorl 8(%rsi),%r9d
+ xorl 12(%rsi),%eax
+ movl %edx,0(%rdi)
+ movl %ecx,4(%rdi)
+ movl %r9d,8(%rdi)
+ movl %eax,12(%rdi)
+ movd %xmm12,%rdx
+ movd %xmm7,%rcx
+ movd %xmm10,%r9
+ movd %xmm4,%rax
+ pshufd $0x39,%xmm12,%xmm12
+ pshufd $0x39,%xmm7,%xmm7
+ pshufd $0x39,%xmm10,%xmm10
+ pshufd $0x39,%xmm4,%xmm4
+ xorl 64(%rsi),%edx
+ xorl 68(%rsi),%ecx
+ xorl 72(%rsi),%r9d
+ xorl 76(%rsi),%eax
+ movl %edx,64(%rdi)
+ movl %ecx,68(%rdi)
+ movl %r9d,72(%rdi)
+ movl %eax,76(%rdi)
+ movd %xmm12,%rdx
+ movd %xmm7,%rcx
+ movd %xmm10,%r9
+ movd %xmm4,%rax
+ pshufd $0x39,%xmm12,%xmm12
+ pshufd $0x39,%xmm7,%xmm7
+ pshufd $0x39,%xmm10,%xmm10
+ pshufd $0x39,%xmm4,%xmm4
+ xorl 128(%rsi),%edx
+ xorl 132(%rsi),%ecx
+ xorl 136(%rsi),%r9d
+ xorl 140(%rsi),%eax
+ movl %edx,128(%rdi)
+ movl %ecx,132(%rdi)
+ movl %r9d,136(%rdi)
+ movl %eax,140(%rdi)
+ movd %xmm12,%rdx
+ movd %xmm7,%rcx
+ movd %xmm10,%r9
+ movd %xmm4,%rax
+ xorl 192(%rsi),%edx
+ xorl 196(%rsi),%ecx
+ xorl 200(%rsi),%r9d
+ xorl 204(%rsi),%eax
+ movl %edx,192(%rdi)
+ movl %ecx,196(%rdi)
+ movl %r9d,200(%rdi)
+ movl %eax,204(%rdi)
+ paddd 176(%rsp),%xmm14
+ paddd 0(%rsp),%xmm0
+ paddd 64(%rsp),%xmm5
+ paddd 128(%rsp),%xmm8
+ movd %xmm14,%rdx
+ movd %xmm0,%rcx
+ movd %xmm5,%r9
+ movd %xmm8,%rax
+ pshufd $0x39,%xmm14,%xmm14
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x39,%xmm5,%xmm5
+ pshufd $0x39,%xmm8,%xmm8
+ xorl 16(%rsi),%edx
+ xorl 20(%rsi),%ecx
+ xorl 24(%rsi),%r9d
+ xorl 28(%rsi),%eax
+ movl %edx,16(%rdi)
+ movl %ecx,20(%rdi)
+ movl %r9d,24(%rdi)
+ movl %eax,28(%rdi)
+ movd %xmm14,%rdx
+ movd %xmm0,%rcx
+ movd %xmm5,%r9
+ movd %xmm8,%rax
+ pshufd $0x39,%xmm14,%xmm14
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x39,%xmm5,%xmm5
+ pshufd $0x39,%xmm8,%xmm8
+ xorl 80(%rsi),%edx
+ xorl 84(%rsi),%ecx
+ xorl 88(%rsi),%r9d
+ xorl 92(%rsi),%eax
+ movl %edx,80(%rdi)
+ movl %ecx,84(%rdi)
+ movl %r9d,88(%rdi)
+ movl %eax,92(%rdi)
+ movd %xmm14,%rdx
+ movd %xmm0,%rcx
+ movd %xmm5,%r9
+ movd %xmm8,%rax
+ pshufd $0x39,%xmm14,%xmm14
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x39,%xmm5,%xmm5
+ pshufd $0x39,%xmm8,%xmm8
+ xorl 144(%rsi),%edx
+ xorl 148(%rsi),%ecx
+ xorl 152(%rsi),%r9d
+ xorl 156(%rsi),%eax
+ movl %edx,144(%rdi)
+ movl %ecx,148(%rdi)
+ movl %r9d,152(%rdi)
+ movl %eax,156(%rdi)
+ movd %xmm14,%rdx
+ movd %xmm0,%rcx
+ movd %xmm5,%r9
+ movd %xmm8,%rax
+ xorl 208(%rsi),%edx
+ xorl 212(%rsi),%ecx
+ xorl 216(%rsi),%r9d
+ xorl 220(%rsi),%eax
+ movl %edx,208(%rdi)
+ movl %ecx,212(%rdi)
+ movl %r9d,216(%rdi)
+ movl %eax,220(%rdi)
+ paddd 224(%rsp),%xmm15
+ paddd 240(%rsp),%xmm11
+ paddd 16(%rsp),%xmm1
+ paddd 80(%rsp),%xmm6
+ movd %xmm15,%rdx
+ movd %xmm11,%rcx
+ movd %xmm1,%r9
+ movd %xmm6,%rax
+ pshufd $0x39,%xmm15,%xmm15
+ pshufd $0x39,%xmm11,%xmm11
+ pshufd $0x39,%xmm1,%xmm1
+ pshufd $0x39,%xmm6,%xmm6
+ xorl 32(%rsi),%edx
+ xorl 36(%rsi),%ecx
+ xorl 40(%rsi),%r9d
+ xorl 44(%rsi),%eax
+ movl %edx,32(%rdi)
+ movl %ecx,36(%rdi)
+ movl %r9d,40(%rdi)
+ movl %eax,44(%rdi)
+ movd %xmm15,%rdx
+ movd %xmm11,%rcx
+ movd %xmm1,%r9
+ movd %xmm6,%rax
+ pshufd $0x39,%xmm15,%xmm15
+ pshufd $0x39,%xmm11,%xmm11
+ pshufd $0x39,%xmm1,%xmm1
+ pshufd $0x39,%xmm6,%xmm6
+ xorl 96(%rsi),%edx
+ xorl 100(%rsi),%ecx
+ xorl 104(%rsi),%r9d
+ xorl 108(%rsi),%eax
+ movl %edx,96(%rdi)
+ movl %ecx,100(%rdi)
+ movl %r9d,104(%rdi)
+ movl %eax,108(%rdi)
+ movd %xmm15,%rdx
+ movd %xmm11,%rcx
+ movd %xmm1,%r9
+ movd %xmm6,%rax
+ pshufd $0x39,%xmm15,%xmm15
+ pshufd $0x39,%xmm11,%xmm11
+ pshufd $0x39,%xmm1,%xmm1
+ pshufd $0x39,%xmm6,%xmm6
+ xorl 160(%rsi),%edx
+ xorl 164(%rsi),%ecx
+ xorl 168(%rsi),%r9d
+ xorl 172(%rsi),%eax
+ movl %edx,160(%rdi)
+ movl %ecx,164(%rdi)
+ movl %r9d,168(%rdi)
+ movl %eax,172(%rdi)
+ movd %xmm15,%rdx
+ movd %xmm11,%rcx
+ movd %xmm1,%r9
+ movd %xmm6,%rax
+ xorl 224(%rsi),%edx
+ xorl 228(%rsi),%ecx
+ xorl 232(%rsi),%r9d
+ xorl 236(%rsi),%eax
+ movl %edx,224(%rdi)
+ movl %ecx,228(%rdi)
+ movl %r9d,232(%rdi)
+ movl %eax,236(%rdi)
+ paddd 96(%rsp),%xmm13
+ paddd 144(%rsp),%xmm9
+ paddd 192(%rsp),%xmm3
+ paddd 32(%rsp),%xmm2
+ movd %xmm13,%rdx
+ movd %xmm9,%rcx
+ movd %xmm3,%r9
+ movd %xmm2,%rax
+ pshufd $0x39,%xmm13,%xmm13
+ pshufd $0x39,%xmm9,%xmm9
+ pshufd $0x39,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ xorl 48(%rsi),%edx
+ xorl 52(%rsi),%ecx
+ xorl 56(%rsi),%r9d
+ xorl 60(%rsi),%eax
+ movl %edx,48(%rdi)
+ movl %ecx,52(%rdi)
+ movl %r9d,56(%rdi)
+ movl %eax,60(%rdi)
+ movd %xmm13,%rdx
+ movd %xmm9,%rcx
+ movd %xmm3,%r9
+ movd %xmm2,%rax
+ pshufd $0x39,%xmm13,%xmm13
+ pshufd $0x39,%xmm9,%xmm9
+ pshufd $0x39,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ xorl 112(%rsi),%edx
+ xorl 116(%rsi),%ecx
+ xorl 120(%rsi),%r9d
+ xorl 124(%rsi),%eax
+ movl %edx,112(%rdi)
+ movl %ecx,116(%rdi)
+ movl %r9d,120(%rdi)
+ movl %eax,124(%rdi)
+ movd %xmm13,%rdx
+ movd %xmm9,%rcx
+ movd %xmm3,%r9
+ movd %xmm2,%rax
+ pshufd $0x39,%xmm13,%xmm13
+ pshufd $0x39,%xmm9,%xmm9
+ pshufd $0x39,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ xorl 176(%rsi),%edx
+ xorl 180(%rsi),%ecx
+ xorl 184(%rsi),%r9d
+ xorl 188(%rsi),%eax
+ movl %edx,176(%rdi)
+ movl %ecx,180(%rdi)
+ movl %r9d,184(%rdi)
+ movl %eax,188(%rdi)
+ movd %xmm13,%rdx
+ movd %xmm9,%rcx
+ movd %xmm3,%r9
+ movd %xmm2,%rax
+ xorl 240(%rsi),%edx
+ xorl 244(%rsi),%ecx
+ xorl 248(%rsi),%r9d
+ xorl 252(%rsi),%eax
+ movl %edx,240(%rdi)
+ movl %ecx,244(%rdi)
+ movl %r9d,248(%rdi)
+ movl %eax,252(%rdi)
+ movq 288(%rsp),%rdx
+ sub $256,%rdx
+ add $256,%rsi
+ add $256,%rdi
+ cmp $256,%rdx
+ jae .L_bytesatleast256
+ cmp $0,%rdx
+ jbe .L_done
+.L_bytes_are_64_128_or_192:
+ movq %rdx,288(%rsp)
+ movdqa 0(%r8),%xmm0
+ movdqa 16(%r8),%xmm1
+ movdqa 32(%r8),%xmm2
+ movdqa 48(%r8),%xmm3
+ movdqa %xmm1,%xmm4
+ mov %rbx,%rdx
+.L_mainloop2:
+ paddd %xmm0,%xmm4
+ movdqa %xmm0,%xmm5
+ movdqa %xmm4,%xmm6
+ pslld $7,%xmm4
+ psrld $25,%xmm6
+ pxor %xmm4,%xmm3
+ pxor %xmm6,%xmm3
+ paddd %xmm3,%xmm5
+ movdqa %xmm3,%xmm4
+ movdqa %xmm5,%xmm6
+ pslld $9,%xmm5
+ psrld $23,%xmm6
+ pxor %xmm5,%xmm2
+ pshufd $0x93,%xmm3,%xmm3
+ pxor %xmm6,%xmm2
+ paddd %xmm2,%xmm4
+ movdqa %xmm2,%xmm5
+ movdqa %xmm4,%xmm6
+ pslld $13,%xmm4
+ psrld $19,%xmm6
+ pxor %xmm4,%xmm1
+ pshufd $0x4e,%xmm2,%xmm2
+ pxor %xmm6,%xmm1
+ paddd %xmm1,%xmm5
+ movdqa %xmm3,%xmm4
+ movdqa %xmm5,%xmm6
+ pslld $18,%xmm5
+ psrld $14,%xmm6
+ pxor %xmm5,%xmm0
+ pshufd $0x39,%xmm1,%xmm1
+ pxor %xmm6,%xmm0
+ paddd %xmm0,%xmm4
+ movdqa %xmm0,%xmm5
+ movdqa %xmm4,%xmm6
+ pslld $7,%xmm4
+ psrld $25,%xmm6
+ pxor %xmm4,%xmm1
+ pxor %xmm6,%xmm1
+ paddd %xmm1,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm5,%xmm6
+ pslld $9,%xmm5
+ psrld $23,%xmm6
+ pxor %xmm5,%xmm2
+ pshufd $0x93,%xmm1,%xmm1
+ pxor %xmm6,%xmm2
+ paddd %xmm2,%xmm4
+ movdqa %xmm2,%xmm5
+ movdqa %xmm4,%xmm6
+ pslld $13,%xmm4
+ psrld $19,%xmm6
+ pxor %xmm4,%xmm3
+ pshufd $0x4e,%xmm2,%xmm2
+ pxor %xmm6,%xmm3
+ paddd %xmm3,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm5,%xmm6
+ pslld $18,%xmm5
+ psrld $14,%xmm6
+ pxor %xmm5,%xmm0
+ pshufd $0x39,%xmm3,%xmm3
+ pxor %xmm6,%xmm0
+ paddd %xmm0,%xmm4
+ movdqa %xmm0,%xmm5
+ movdqa %xmm4,%xmm6
+ pslld $7,%xmm4
+ psrld $25,%xmm6
+ pxor %xmm4,%xmm3
+ pxor %xmm6,%xmm3
+ paddd %xmm3,%xmm5
+ movdqa %xmm3,%xmm4
+ movdqa %xmm5,%xmm6
+ pslld $9,%xmm5
+ psrld $23,%xmm6
+ pxor %xmm5,%xmm2
+ pshufd $0x93,%xmm3,%xmm3
+ pxor %xmm6,%xmm2
+ paddd %xmm2,%xmm4
+ movdqa %xmm2,%xmm5
+ movdqa %xmm4,%xmm6
+ pslld $13,%xmm4
+ psrld $19,%xmm6
+ pxor %xmm4,%xmm1
+ pshufd $0x4e,%xmm2,%xmm2
+ pxor %xmm6,%xmm1
+ paddd %xmm1,%xmm5
+ movdqa %xmm3,%xmm4
+ movdqa %xmm5,%xmm6
+ pslld $18,%xmm5
+ psrld $14,%xmm6
+ pxor %xmm5,%xmm0
+ pshufd $0x39,%xmm1,%xmm1
+ pxor %xmm6,%xmm0
+ paddd %xmm0,%xmm4
+ movdqa %xmm0,%xmm5
+ movdqa %xmm4,%xmm6
+ pslld $7,%xmm4
+ psrld $25,%xmm6
+ pxor %xmm4,%xmm1
+ pxor %xmm6,%xmm1
+ paddd %xmm1,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm5,%xmm6
+ pslld $9,%xmm5
+ psrld $23,%xmm6
+ pxor %xmm5,%xmm2
+ pshufd $0x93,%xmm1,%xmm1
+ pxor %xmm6,%xmm2
+ paddd %xmm2,%xmm4
+ movdqa %xmm2,%xmm5
+ movdqa %xmm4,%xmm6
+ pslld $13,%xmm4
+ psrld $19,%xmm6
+ pxor %xmm4,%xmm3
+ pshufd $0x4e,%xmm2,%xmm2
+ pxor %xmm6,%xmm3
+ sub $4,%rdx
+ paddd %xmm3,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm5,%xmm6
+ pslld $18,%xmm5
+ pxor %xmm7,%xmm7
+ psrld $14,%xmm6
+ pxor %xmm5,%xmm0
+ pshufd $0x39,%xmm3,%xmm3
+ pxor %xmm6,%xmm0
+ ja .L_mainloop2
+ paddd 0(%r8),%xmm0
+ paddd 16(%r8),%xmm1
+ paddd 32(%r8),%xmm2
+ paddd 48(%r8),%xmm3
+ movd %xmm0,%rdx
+ movd %xmm1,%rcx
+ movd %xmm2,%rax
+ movd %xmm3,%r10
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x39,%xmm1,%xmm1
+ pshufd $0x39,%xmm2,%xmm2
+ pshufd $0x39,%xmm3,%xmm3
+ xorl 0(%rsi),%edx
+ xorl 48(%rsi),%ecx
+ xorl 32(%rsi),%eax
+ xorl 16(%rsi),%r10d
+ movl %edx,0(%rdi)
+ movl %ecx,48(%rdi)
+ movl %eax,32(%rdi)
+ movl %r10d,16(%rdi)
+ movd %xmm0,%rdx
+ movd %xmm1,%rcx
+ movd %xmm2,%rax
+ movd %xmm3,%r10
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x39,%xmm1,%xmm1
+ pshufd $0x39,%xmm2,%xmm2
+ pshufd $0x39,%xmm3,%xmm3
+ xorl 20(%rsi),%edx
+ xorl 4(%rsi),%ecx
+ xorl 52(%rsi),%eax
+ xorl 36(%rsi),%r10d
+ movl %edx,20(%rdi)
+ movl %ecx,4(%rdi)
+ movl %eax,52(%rdi)
+ movl %r10d,36(%rdi)
+ movd %xmm0,%rdx
+ movd %xmm1,%rcx
+ movd %xmm2,%rax
+ movd %xmm3,%r10
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x39,%xmm1,%xmm1
+ pshufd $0x39,%xmm2,%xmm2
+ pshufd $0x39,%xmm3,%xmm3
+ xorl 40(%rsi),%edx
+ xorl 24(%rsi),%ecx
+ xorl 8(%rsi),%eax
+ xorl 56(%rsi),%r10d
+ movl %edx,40(%rdi)
+ movl %ecx,24(%rdi)
+ movl %eax,8(%rdi)
+ movl %r10d,56(%rdi)
+ movd %xmm0,%rdx
+ movd %xmm1,%rcx
+ movd %xmm2,%rax
+ movd %xmm3,%r10
+ xorl 60(%rsi),%edx
+ xorl 44(%rsi),%ecx
+ xorl 28(%rsi),%eax
+ xorl 12(%rsi),%r10d
+ movl %edx,60(%rdi)
+ movl %ecx,44(%rdi)
+ movl %eax,28(%rdi)
+ movl %r10d,12(%rdi)
+ movq 288(%rsp),%rdx
+ movl 32(%r8),%ecx
+ movl 52(%r8),%eax
+ add $1,%ecx
+ adc $0,%eax
+ movl %ecx,32(%r8)
+ movl %eax,52(%r8)
+ cmp $64,%rdx
+ ja .L_bytes_are_128_or_192
+.L_done:
+ CFI_REMEMBER_STATE();
+ mov %r11,%rax
+ sub %rsp,%rax
+ mov %r11,%rsp
+ CFI_REGISTER(%r11, %rsp)
+ CFI_DEF_CFA_REGISTER(%rsp)
+ pop %rbx
+ CFI_POP(%rbx)
+ ret
+ CFI_RESTORE_STATE();
+.L_bytes_are_128_or_192:
+ sub $64,%rdx
+ add $64,%rdi
+ add $64,%rsi
+ jmp .L_bytes_are_64_128_or_192
+ CFI_ENDPROC();
+ELF(.size _gcry_salsa20_amd64_encrypt_blocks,.-_gcry_salsa20_amd64_encrypt_blocks;)
+
+#endif /*defined(USE_SALSA20)*/
+#endif /*__x86_64*/
diff --git a/comm/third_party/libgcrypt/cipher/salsa20-armv7-neon.S b/comm/third_party/libgcrypt/cipher/salsa20-armv7-neon.S
new file mode 100644
index 0000000000..3686e3fa6f
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/salsa20-armv7-neon.S
@@ -0,0 +1,899 @@
+/* salsa20-armv7-neon.S - ARM NEON implementation of Salsa20 cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_NEON) && defined(USE_SALSA20)
+
+/*
+ * Based on public domain implementation from SUPERCOP benchmarking framework
+ * by Peter Schwabe and D. J. Bernstein. Paper about the implementation at:
+ * http://cryptojedi.org/papers/#neoncrypto
+ */
+
+.syntax unified
+.arm
+.fpu neon
+.text
+
+.align 2
+.globl _gcry_arm_neon_salsa20_encrypt
+.type _gcry_arm_neon_salsa20_encrypt,%function;
+_gcry_arm_neon_salsa20_encrypt:
+ /* Modifications:
+ * - arguments changed to (void *c, const void *m, unsigned int nblks,
+ * void *ctx, unsigned int rounds) from (void *c, const void *m,
+ * unsigned long long mlen, const void *n, const void *k)
+ * - nonce and key read from 'ctx' as well as sigma and counter.
+ * - read in counter from 'ctx' at the start.
+ * - update counter in 'ctx' at the end.
+ * - length is input as number of blocks, so don't handle tail bytes
+ * (this is done in salsa20.c).
+ */
+ lsl r2,r2,#6
+ vpush {q4,q5,q6,q7}
+ mov r12,sp
+ sub sp,sp,#352
+ and sp,sp,#0xffffffe0
+ strd r4,[sp,#0]
+ strd r6,[sp,#8]
+ strd r8,[sp,#16]
+ strd r10,[sp,#24]
+ str r14,[sp,#224]
+ str r12,[sp,#228]
+ str r0,[sp,#232]
+ str r1,[sp,#236]
+ str r2,[sp,#240]
+ ldr r4,[r12,#64]
+ str r4,[sp,#244]
+ mov r2,r3
+ add r3,r2,#48
+ vld1.8 {q3},[r2]
+ add r0,r2,#32
+ add r14,r2,#40
+ vmov.i64 q3,#0xff
+ str r14,[sp,#160]
+ ldrd r8,[r2,#4]
+ vld1.8 {d0},[r0]
+ ldrd r4,[r2,#20]
+ vld1.8 {d8-d9},[r2]!
+ ldrd r6,[r0,#0]
+ vmov d4,d9
+ ldr r0,[r14]
+ vrev64.i32 d0,d0
+ ldr r1,[r14,#4]
+ vld1.8 {d10-d11},[r2]
+ strd r6,[sp,#32]
+ sub r2,r2,#16
+ strd r0,[sp,#40]
+ vmov d5,d11
+ strd r8,[sp,#48]
+ vext.32 d1,d0,d10,#1
+ strd r4,[sp,#56]
+ ldr r1,[r2,#0]
+ vshr.u32 q3,q3,#7
+ ldr r4,[r2,#12]
+ vext.32 d3,d11,d9,#1
+ ldr r11,[r2,#16]
+ vext.32 d2,d8,d0,#1
+ ldr r8,[r2,#28]
+ vext.32 d0,d10,d8,#1
+ ldr r0,[r3,#0]
+ add r2,r2,#44
+ vmov q4,q3
+ vld1.8 {d6-d7},[r14]
+ vadd.i64 q3,q3,q4
+ ldr r5,[r3,#4]
+ add r12,sp,#256
+ vst1.8 {d4-d5},[r12,: 128]
+ ldr r10,[r3,#8]
+ add r14,sp,#272
+ vst1.8 {d2-d3},[r14,: 128]
+ ldr r9,[r3,#12]
+ vld1.8 {d2-d3},[r3]
+ strd r0,[sp,#64]
+ ldr r0,[sp,#240]
+ strd r4,[sp,#72]
+ strd r10,[sp,#80]
+ strd r8,[sp,#88]
+ nop
+ cmp r0,#192
+ blo .L_mlenlowbelow192
+.L_mlenatleast192:
+ ldrd r2,[sp,#48]
+ vext.32 d7,d6,d6,#1
+ vmov q8,q1
+ ldrd r6,[sp,#32]
+ vld1.8 {d18-d19},[r12,: 128]
+ vmov q10,q0
+ str r0,[sp,#240]
+ vext.32 d4,d7,d19,#1
+ vmov q11,q8
+ vext.32 d10,d18,d7,#1
+ vadd.i64 q3,q3,q4
+ ldrd r0,[sp,#64]
+ vld1.8 {d24-d25},[r14,: 128]
+ vmov d5,d24
+ add r8,sp,#288
+ ldrd r4,[sp,#72]
+ vmov d11,d25
+ add r9,sp,#304
+ ldrd r10,[sp,#80]
+ vst1.8 {d4-d5},[r8,: 128]
+ strd r2,[sp,#96]
+ vext.32 d7,d6,d6,#1
+ vmov q13,q10
+ strd r6,[sp,#104]
+ vmov d13,d24
+ vst1.8 {d10-d11},[r9,: 128]
+ add r2,sp,#320
+ vext.32 d12,d7,d19,#1
+ vmov d15,d25
+ add r6,sp,#336
+ ldr r12,[sp,#244]
+ vext.32 d14,d18,d7,#1
+ vadd.i64 q3,q3,q4
+ ldrd r8,[sp,#88]
+ vst1.8 {d12-d13},[r2,: 128]
+ ldrd r2,[sp,#56]
+ vst1.8 {d14-d15},[r6,: 128]
+ ldrd r6,[sp,#40]
+.L_mainloop2:
+ str r12,[sp,#248]
+ vadd.i32 q4,q10,q8
+ vadd.i32 q9,q13,q11
+ add r12,r0,r2
+ add r14,r5,r1
+ vshl.i32 q12,q4,#7
+ vshl.i32 q14,q9,#7
+ vshr.u32 q4,q4,#25
+ vshr.u32 q9,q9,#25
+ eor r4,r4,r12,ROR #25
+ eor r7,r7,r14,ROR #25
+ add r12,r4,r0
+ add r14,r7,r5
+ veor q5,q5,q12
+ veor q7,q7,q14
+ veor q4,q5,q4
+ veor q5,q7,q9
+ eor r6,r6,r12,ROR #23
+ eor r3,r3,r14,ROR #23
+ add r12,r6,r4
+ str r7,[sp,#116]
+ add r7,r3,r7
+ ldr r14,[sp,#108]
+ vadd.i32 q7,q8,q4
+ vadd.i32 q9,q11,q5
+ vshl.i32 q12,q7,#9
+ vshl.i32 q14,q9,#9
+ vshr.u32 q7,q7,#23
+ vshr.u32 q9,q9,#23
+ veor q2,q2,q12
+ veor q6,q6,q14
+ veor q2,q2,q7
+ veor q6,q6,q9
+ eor r2,r2,r12,ROR #19
+ str r2,[sp,#120]
+ eor r1,r1,r7,ROR #19
+ ldr r7,[sp,#96]
+ add r2,r2,r6
+ str r6,[sp,#112]
+ add r6,r1,r3
+ ldr r12,[sp,#104]
+ vadd.i32 q7,q4,q2
+ vext.32 q4,q4,q4,#3
+ vadd.i32 q9,q5,q6
+ vshl.i32 q12,q7,#13
+ vext.32 q5,q5,q5,#3
+ vshl.i32 q14,q9,#13
+ eor r0,r0,r2,ROR #14
+ eor r2,r5,r6,ROR #14
+ str r3,[sp,#124]
+ add r3,r10,r12
+ ldr r5,[sp,#100]
+ add r6,r9,r11
+ vshr.u32 q7,q7,#19
+ vshr.u32 q9,q9,#19
+ veor q10,q10,q12
+ veor q12,q13,q14
+ eor r8,r8,r3,ROR #25
+ eor r3,r5,r6,ROR #25
+ add r5,r8,r10
+ add r6,r3,r9
+ veor q7,q10,q7
+ veor q9,q12,q9
+ eor r5,r7,r5,ROR #23
+ eor r6,r14,r6,ROR #23
+ add r7,r5,r8
+ add r14,r6,r3
+ vadd.i32 q10,q2,q7
+ vswp d4,d5
+ vadd.i32 q12,q6,q9
+ vshl.i32 q13,q10,#18
+ vswp d12,d13
+ vshl.i32 q14,q12,#18
+ eor r7,r12,r7,ROR #19
+ eor r11,r11,r14,ROR #19
+ add r12,r7,r5
+ add r14,r11,r6
+ vshr.u32 q10,q10,#14
+ vext.32 q7,q7,q7,#1
+ vshr.u32 q12,q12,#14
+ veor q8,q8,q13
+ vext.32 q9,q9,q9,#1
+ veor q11,q11,q14
+ eor r10,r10,r12,ROR #14
+ eor r9,r9,r14,ROR #14
+ add r12,r0,r3
+ add r14,r2,r4
+ veor q8,q8,q10
+ veor q10,q11,q12
+ eor r1,r1,r12,ROR #25
+ eor r7,r7,r14,ROR #25
+ add r12,r1,r0
+ add r14,r7,r2
+ vadd.i32 q11,q4,q8
+ vadd.i32 q12,q5,q10
+ vshl.i32 q13,q11,#7
+ vshl.i32 q14,q12,#7
+ eor r5,r5,r12,ROR #23
+ eor r6,r6,r14,ROR #23
+ vshr.u32 q11,q11,#25
+ vshr.u32 q12,q12,#25
+ add r12,r5,r1
+ add r14,r6,r7
+ veor q7,q7,q13
+ veor q9,q9,q14
+ veor q7,q7,q11
+ veor q9,q9,q12
+ vadd.i32 q11,q8,q7
+ vadd.i32 q12,q10,q9
+ vshl.i32 q13,q11,#9
+ vshl.i32 q14,q12,#9
+ eor r3,r3,r12,ROR #19
+ str r7,[sp,#104]
+ eor r4,r4,r14,ROR #19
+ ldr r7,[sp,#112]
+ add r12,r3,r5
+ str r6,[sp,#108]
+ add r6,r4,r6
+ ldr r14,[sp,#116]
+ eor r0,r0,r12,ROR #14
+ str r5,[sp,#96]
+ eor r5,r2,r6,ROR #14
+ ldr r2,[sp,#120]
+ vshr.u32 q11,q11,#23
+ vshr.u32 q12,q12,#23
+ veor q2,q2,q13
+ veor q6,q6,q14
+ veor q2,q2,q11
+ veor q6,q6,q12
+ add r6,r10,r14
+ add r12,r9,r8
+ vadd.i32 q11,q7,q2
+ vext.32 q7,q7,q7,#3
+ vadd.i32 q12,q9,q6
+ vshl.i32 q13,q11,#13
+ vext.32 q9,q9,q9,#3
+ vshl.i32 q14,q12,#13
+ vshr.u32 q11,q11,#19
+ vshr.u32 q12,q12,#19
+ eor r11,r11,r6,ROR #25
+ eor r2,r2,r12,ROR #25
+ add r6,r11,r10
+ str r3,[sp,#100]
+ add r3,r2,r9
+ ldr r12,[sp,#124]
+ veor q4,q4,q13
+ veor q5,q5,q14
+ veor q4,q4,q11
+ veor q5,q5,q12
+ eor r6,r7,r6,ROR #23
+ eor r3,r12,r3,ROR #23
+ add r7,r6,r11
+ add r12,r3,r2
+ vadd.i32 q11,q2,q4
+ vswp d4,d5
+ vadd.i32 q12,q6,q5
+ vshl.i32 q13,q11,#18
+ vswp d12,d13
+ vshl.i32 q14,q12,#18
+ eor r7,r14,r7,ROR #19
+ eor r8,r8,r12,ROR #19
+ add r12,r7,r6
+ add r14,r8,r3
+ vshr.u32 q11,q11,#14
+ vext.32 q4,q4,q4,#1
+ vshr.u32 q12,q12,#14
+ veor q8,q8,q13
+ vext.32 q5,q5,q5,#1
+ veor q10,q10,q14
+ eor r10,r10,r12,ROR #14
+ veor q8,q8,q11
+ eor r9,r9,r14,ROR #14
+ veor q10,q10,q12
+ vadd.i32 q11,q7,q8
+ vadd.i32 q12,q9,q10
+ add r12,r0,r2
+ add r14,r5,r1
+ vshl.i32 q13,q11,#7
+ vshl.i32 q14,q12,#7
+ vshr.u32 q11,q11,#25
+ vshr.u32 q12,q12,#25
+ eor r4,r4,r12,ROR #25
+ eor r7,r7,r14,ROR #25
+ add r12,r4,r0
+ add r14,r7,r5
+ veor q4,q4,q13
+ veor q5,q5,q14
+ veor q4,q4,q11
+ veor q5,q5,q12
+ eor r6,r6,r12,ROR #23
+ eor r3,r3,r14,ROR #23
+ add r12,r6,r4
+ str r7,[sp,#116]
+ add r7,r3,r7
+ ldr r14,[sp,#108]
+ vadd.i32 q11,q8,q4
+ vadd.i32 q12,q10,q5
+ vshl.i32 q13,q11,#9
+ vshl.i32 q14,q12,#9
+ vshr.u32 q11,q11,#23
+ vshr.u32 q12,q12,#23
+ veor q2,q2,q13
+ veor q6,q6,q14
+ veor q2,q2,q11
+ veor q6,q6,q12
+ eor r2,r2,r12,ROR #19
+ str r2,[sp,#120]
+ eor r1,r1,r7,ROR #19
+ ldr r7,[sp,#96]
+ add r2,r2,r6
+ str r6,[sp,#112]
+ add r6,r1,r3
+ ldr r12,[sp,#104]
+ vadd.i32 q11,q4,q2
+ vext.32 q4,q4,q4,#3
+ vadd.i32 q12,q5,q6
+ vshl.i32 q13,q11,#13
+ vext.32 q5,q5,q5,#3
+ vshl.i32 q14,q12,#13
+ eor r0,r0,r2,ROR #14
+ eor r2,r5,r6,ROR #14
+ str r3,[sp,#124]
+ add r3,r10,r12
+ ldr r5,[sp,#100]
+ add r6,r9,r11
+ vshr.u32 q11,q11,#19
+ vshr.u32 q12,q12,#19
+ veor q7,q7,q13
+ veor q9,q9,q14
+ eor r8,r8,r3,ROR #25
+ eor r3,r5,r6,ROR #25
+ add r5,r8,r10
+ add r6,r3,r9
+ veor q7,q7,q11
+ veor q9,q9,q12
+ eor r5,r7,r5,ROR #23
+ eor r6,r14,r6,ROR #23
+ add r7,r5,r8
+ add r14,r6,r3
+ vadd.i32 q11,q2,q7
+ vswp d4,d5
+ vadd.i32 q12,q6,q9
+ vshl.i32 q13,q11,#18
+ vswp d12,d13
+ vshl.i32 q14,q12,#18
+ eor r7,r12,r7,ROR #19
+ eor r11,r11,r14,ROR #19
+ add r12,r7,r5
+ add r14,r11,r6
+ vshr.u32 q11,q11,#14
+ vext.32 q7,q7,q7,#1
+ vshr.u32 q12,q12,#14
+ veor q8,q8,q13
+ vext.32 q9,q9,q9,#1
+ veor q10,q10,q14
+ eor r10,r10,r12,ROR #14
+ eor r9,r9,r14,ROR #14
+ add r12,r0,r3
+ add r14,r2,r4
+ veor q8,q8,q11
+ veor q11,q10,q12
+ eor r1,r1,r12,ROR #25
+ eor r7,r7,r14,ROR #25
+ add r12,r1,r0
+ add r14,r7,r2
+ vadd.i32 q10,q4,q8
+ vadd.i32 q12,q5,q11
+ vshl.i32 q13,q10,#7
+ vshl.i32 q14,q12,#7
+ eor r5,r5,r12,ROR #23
+ eor r6,r6,r14,ROR #23
+ vshr.u32 q10,q10,#25
+ vshr.u32 q12,q12,#25
+ add r12,r5,r1
+ add r14,r6,r7
+ veor q7,q7,q13
+ veor q9,q9,q14
+ veor q7,q7,q10
+ veor q9,q9,q12
+ vadd.i32 q10,q8,q7
+ vadd.i32 q12,q11,q9
+ vshl.i32 q13,q10,#9
+ vshl.i32 q14,q12,#9
+ eor r3,r3,r12,ROR #19
+ str r7,[sp,#104]
+ eor r4,r4,r14,ROR #19
+ ldr r7,[sp,#112]
+ add r12,r3,r5
+ str r6,[sp,#108]
+ add r6,r4,r6
+ ldr r14,[sp,#116]
+ eor r0,r0,r12,ROR #14
+ str r5,[sp,#96]
+ eor r5,r2,r6,ROR #14
+ ldr r2,[sp,#120]
+ vshr.u32 q10,q10,#23
+ vshr.u32 q12,q12,#23
+ veor q2,q2,q13
+ veor q6,q6,q14
+ veor q2,q2,q10
+ veor q6,q6,q12
+ add r6,r10,r14
+ add r12,r9,r8
+ vadd.i32 q12,q7,q2
+ vext.32 q10,q7,q7,#3
+ vadd.i32 q7,q9,q6
+ vshl.i32 q14,q12,#13
+ vext.32 q13,q9,q9,#3
+ vshl.i32 q9,q7,#13
+ vshr.u32 q12,q12,#19
+ vshr.u32 q7,q7,#19
+ eor r11,r11,r6,ROR #25
+ eor r2,r2,r12,ROR #25
+ add r6,r11,r10
+ str r3,[sp,#100]
+ add r3,r2,r9
+ ldr r12,[sp,#124]
+ veor q4,q4,q14
+ veor q5,q5,q9
+ veor q4,q4,q12
+ veor q7,q5,q7
+ eor r6,r7,r6,ROR #23
+ eor r3,r12,r3,ROR #23
+ add r7,r6,r11
+ add r12,r3,r2
+ vadd.i32 q5,q2,q4
+ vswp d4,d5
+ vadd.i32 q9,q6,q7
+ vshl.i32 q12,q5,#18
+ vswp d12,d13
+ vshl.i32 q14,q9,#18
+ eor r7,r14,r7,ROR #19
+ eor r8,r8,r12,ROR #19
+ add r12,r7,r6
+ add r14,r8,r3
+ vshr.u32 q15,q5,#14
+ vext.32 q5,q4,q4,#1
+ vshr.u32 q4,q9,#14
+ veor q8,q8,q12
+ vext.32 q7,q7,q7,#1
+ veor q9,q11,q14
+ eor r10,r10,r12,ROR #14
+ ldr r12,[sp,#248]
+ veor q8,q8,q15
+ eor r9,r9,r14,ROR #14
+ veor q11,q9,q4
+ subs r12,r12,#4
+ bhi .L_mainloop2
+ strd r8,[sp,#112]
+ ldrd r8,[sp,#64]
+ strd r2,[sp,#120]
+ ldrd r2,[sp,#96]
+ add r0,r0,r8
+ strd r10,[sp,#96]
+ add r1,r1,r9
+ ldrd r10,[sp,#48]
+ ldrd r8,[sp,#72]
+ add r2,r2,r10
+ strd r6,[sp,#128]
+ add r3,r3,r11
+ ldrd r6,[sp,#104]
+ ldrd r10,[sp,#32]
+ ldr r12,[sp,#236]
+ add r4,r4,r8
+ add r5,r5,r9
+ add r6,r6,r10
+ add r7,r7,r11
+ cmp r12,#0
+ beq .L_nomessage1
+ ldr r8,[r12,#0]
+ ldr r9,[r12,#4]
+ ldr r10,[r12,#8]
+ ldr r11,[r12,#12]
+ eor r0,r0,r8
+ ldr r8,[r12,#16]
+ eor r1,r1,r9
+ ldr r9,[r12,#20]
+ eor r2,r2,r10
+ ldr r10,[r12,#24]
+ eor r3,r3,r11
+ ldr r11,[r12,#28]
+ eor r4,r4,r8
+ eor r5,r5,r9
+ eor r6,r6,r10
+ eor r7,r7,r11
+.L_nomessage1:
+ ldr r14,[sp,#232]
+ vadd.i32 q4,q8,q1
+ str r0,[r14,#0]
+ add r0,sp,#304
+ str r1,[r14,#4]
+ vld1.8 {d16-d17},[r0,: 128]
+ str r2,[r14,#8]
+ vadd.i32 q5,q8,q5
+ str r3,[r14,#12]
+ add r0,sp,#288
+ str r4,[r14,#16]
+ vld1.8 {d16-d17},[r0,: 128]
+ str r5,[r14,#20]
+ vadd.i32 q9,q10,q0
+ str r6,[r14,#24]
+ vadd.i32 q2,q8,q2
+ str r7,[r14,#28]
+ vmov.i64 q8,#0xffffffff
+ ldrd r6,[sp,#128]
+ vext.32 d20,d8,d10,#1
+ ldrd r0,[sp,#40]
+ vext.32 d25,d9,d11,#1
+ ldrd r2,[sp,#120]
+ vbif q4,q9,q8
+ ldrd r4,[sp,#56]
+ vext.32 d21,d5,d19,#1
+ add r6,r6,r0
+ vext.32 d24,d4,d18,#1
+ add r7,r7,r1
+ vbif q2,q5,q8
+ add r2,r2,r4
+ vrev64.i32 q5,q10
+ add r3,r3,r5
+ vrev64.i32 q9,q12
+ adds r0,r0,#3
+ vswp d5,d9
+ adc r1,r1,#0
+ strd r0,[sp,#40]
+ ldrd r8,[sp,#112]
+ ldrd r0,[sp,#88]
+ ldrd r10,[sp,#96]
+ ldrd r4,[sp,#80]
+ add r0,r8,r0
+ add r1,r9,r1
+ add r4,r10,r4
+ add r5,r11,r5
+ add r8,r14,#64
+ cmp r12,#0
+ beq .L_nomessage2
+ ldr r9,[r12,#32]
+ ldr r10,[r12,#36]
+ ldr r11,[r12,#40]
+ ldr r14,[r12,#44]
+ eor r6,r6,r9
+ ldr r9,[r12,#48]
+ eor r7,r7,r10
+ ldr r10,[r12,#52]
+ eor r4,r4,r11
+ ldr r11,[r12,#56]
+ eor r5,r5,r14
+ ldr r14,[r12,#60]
+ add r12,r12,#64
+ eor r2,r2,r9
+ vld1.8 {d20-d21},[r12]!
+ veor q4,q4,q10
+ eor r3,r3,r10
+ vld1.8 {d20-d21},[r12]!
+ veor q5,q5,q10
+ eor r0,r0,r11
+ vld1.8 {d20-d21},[r12]!
+ veor q2,q2,q10
+ eor r1,r1,r14
+ vld1.8 {d20-d21},[r12]!
+ veor q9,q9,q10
+.L_nomessage2:
+ vst1.8 {d8-d9},[r8]!
+ vst1.8 {d10-d11},[r8]!
+ vmov.i64 q4,#0xff
+ vst1.8 {d4-d5},[r8]!
+ vst1.8 {d18-d19},[r8]!
+ str r6,[r8,#-96]
+ add r6,sp,#336
+ str r7,[r8,#-92]
+ add r7,sp,#320
+ str r4,[r8,#-88]
+ vadd.i32 q2,q11,q1
+ vld1.8 {d10-d11},[r6,: 128]
+ vadd.i32 q5,q5,q7
+ vld1.8 {d14-d15},[r7,: 128]
+ vadd.i32 q9,q13,q0
+ vadd.i32 q6,q7,q6
+ str r5,[r8,#-84]
+ vext.32 d14,d4,d10,#1
+ str r2,[r8,#-80]
+ vext.32 d21,d5,d11,#1
+ str r3,[r8,#-76]
+ vbif q2,q9,q8
+ str r0,[r8,#-72]
+ vext.32 d15,d13,d19,#1
+ vshr.u32 q4,q4,#7
+ str r1,[r8,#-68]
+ vext.32 d20,d12,d18,#1
+ vbif q6,q5,q8
+ ldr r0,[sp,#240]
+ vrev64.i32 q5,q7
+ vrev64.i32 q7,q10
+ vswp d13,d5
+ vadd.i64 q3,q3,q4
+ sub r0,r0,#192
+ cmp r12,#0
+ beq .L_nomessage21
+ vld1.8 {d16-d17},[r12]!
+ veor q2,q2,q8
+ vld1.8 {d16-d17},[r12]!
+ veor q5,q5,q8
+ vld1.8 {d16-d17},[r12]!
+ veor q6,q6,q8
+ vld1.8 {d16-d17},[r12]!
+ veor q7,q7,q8
+.L_nomessage21:
+ vst1.8 {d4-d5},[r8]!
+ vst1.8 {d10-d11},[r8]!
+ vst1.8 {d12-d13},[r8]!
+ vst1.8 {d14-d15},[r8]!
+ str r12,[sp,#236]
+ add r14,sp,#272
+ add r12,sp,#256
+ str r8,[sp,#232]
+ cmp r0,#192
+ bhs .L_mlenatleast192
+.L_mlenlowbelow192:
+ cmp r0,#0
+ beq .L_done
+ b .L_mlenatleast1
+.L_nextblock:
+ sub r0,r0,#64
+.L_mlenatleast1:
+.L_handleblock:
+ str r0,[sp,#248]
+ ldrd r2,[sp,#48]
+ ldrd r6,[sp,#32]
+ ldrd r0,[sp,#64]
+ ldrd r4,[sp,#72]
+ ldrd r10,[sp,#80]
+ ldrd r8,[sp,#88]
+ strd r2,[sp,#96]
+ strd r6,[sp,#104]
+ ldrd r2,[sp,#56]
+ ldrd r6,[sp,#40]
+ ldr r12,[sp,#244]
+.L_mainloop1:
+ str r12,[sp,#252]
+ add r12,r0,r2
+ add r14,r5,r1
+ eor r4,r4,r12,ROR #25
+ eor r7,r7,r14,ROR #25
+ add r12,r4,r0
+ add r14,r7,r5
+ eor r6,r6,r12,ROR #23
+ eor r3,r3,r14,ROR #23
+ add r12,r6,r4
+ str r7,[sp,#132]
+ add r7,r3,r7
+ ldr r14,[sp,#104]
+ eor r2,r2,r12,ROR #19
+ str r6,[sp,#128]
+ eor r1,r1,r7,ROR #19
+ ldr r7,[sp,#100]
+ add r6,r2,r6
+ str r2,[sp,#120]
+ add r2,r1,r3
+ ldr r12,[sp,#96]
+ eor r0,r0,r6,ROR #14
+ str r3,[sp,#124]
+ eor r2,r5,r2,ROR #14
+ ldr r3,[sp,#108]
+ add r5,r10,r14
+ add r6,r9,r11
+ eor r8,r8,r5,ROR #25
+ eor r5,r7,r6,ROR #25
+ add r6,r8,r10
+ add r7,r5,r9
+ eor r6,r12,r6,ROR #23
+ eor r3,r3,r7,ROR #23
+ add r7,r6,r8
+ add r12,r3,r5
+ eor r7,r14,r7,ROR #19
+ eor r11,r11,r12,ROR #19
+ add r12,r7,r6
+ add r14,r11,r3
+ eor r10,r10,r12,ROR #14
+ eor r9,r9,r14,ROR #14
+ add r12,r0,r5
+ add r14,r2,r4
+ eor r1,r1,r12,ROR #25
+ eor r7,r7,r14,ROR #25
+ add r12,r1,r0
+ add r14,r7,r2
+ eor r6,r6,r12,ROR #23
+ eor r3,r3,r14,ROR #23
+ add r12,r6,r1
+ str r7,[sp,#104]
+ add r7,r3,r7
+ ldr r14,[sp,#128]
+ eor r5,r5,r12,ROR #19
+ str r3,[sp,#108]
+ eor r4,r4,r7,ROR #19
+ ldr r7,[sp,#132]
+ add r12,r5,r6
+ str r6,[sp,#96]
+ add r3,r4,r3
+ ldr r6,[sp,#120]
+ eor r0,r0,r12,ROR #14
+ str r5,[sp,#100]
+ eor r5,r2,r3,ROR #14
+ ldr r3,[sp,#124]
+ add r2,r10,r7
+ add r12,r9,r8
+ eor r11,r11,r2,ROR #25
+ eor r2,r6,r12,ROR #25
+ add r6,r11,r10
+ add r12,r2,r9
+ eor r6,r14,r6,ROR #23
+ eor r3,r3,r12,ROR #23
+ add r12,r6,r11
+ add r14,r3,r2
+ eor r7,r7,r12,ROR #19
+ eor r8,r8,r14,ROR #19
+ add r12,r7,r6
+ add r14,r8,r3
+ eor r10,r10,r12,ROR #14
+ eor r9,r9,r14,ROR #14
+ ldr r12,[sp,#252]
+ subs r12,r12,#2
+ bhi .L_mainloop1
+ strd r6,[sp,#128]
+ strd r2,[sp,#120]
+ strd r10,[sp,#112]
+ strd r8,[sp,#136]
+ ldrd r2,[sp,#96]
+ ldrd r6,[sp,#104]
+ ldrd r8,[sp,#64]
+ ldrd r10,[sp,#48]
+ add r0,r0,r8
+ add r1,r1,r9
+ add r2,r2,r10
+ add r3,r3,r11
+ ldrd r8,[sp,#72]
+ ldrd r10,[sp,#32]
+ add r4,r4,r8
+ add r5,r5,r9
+ add r6,r6,r10
+ add r7,r7,r11
+ ldr r12,[sp,#236]
+ cmp r12,#0
+ beq .L_nomessage10
+ ldr r8,[r12,#0]
+ ldr r9,[r12,#4]
+ ldr r10,[r12,#8]
+ ldr r11,[r12,#12]
+ eor r0,r0,r8
+ ldr r8,[r12,#16]
+ eor r1,r1,r9
+ ldr r9,[r12,#20]
+ eor r2,r2,r10
+ ldr r10,[r12,#24]
+ eor r3,r3,r11
+ ldr r11,[r12,#28]
+ eor r4,r4,r8
+ eor r5,r5,r9
+ eor r6,r6,r10
+ eor r7,r7,r11
+.L_nomessage10:
+ ldr r14,[sp,#232]
+ str r0,[r14,#0]
+ str r1,[r14,#4]
+ str r2,[r14,#8]
+ str r3,[r14,#12]
+ str r4,[r14,#16]
+ str r5,[r14,#20]
+ str r6,[r14,#24]
+ str r7,[r14,#28]
+ ldrd r6,[sp,#128]
+ ldrd r10,[sp,#112]
+ ldrd r0,[sp,#40]
+ ldrd r4,[sp,#80]
+ add r6,r6,r0
+ add r7,r7,r1
+ add r10,r10,r4
+ add r11,r11,r5
+ adds r0,r0,#1
+ adc r1,r1,#0
+ strd r0,[sp,#40]
+ ldrd r2,[sp,#120]
+ ldrd r8,[sp,#136]
+ ldrd r4,[sp,#56]
+ ldrd r0,[sp,#88]
+ add r2,r2,r4
+ add r3,r3,r5
+ add r0,r8,r0
+ add r1,r9,r1
+ cmp r12,#0
+ beq .L_nomessage11
+ ldr r4,[r12,#32]
+ ldr r5,[r12,#36]
+ ldr r8,[r12,#40]
+ ldr r9,[r12,#44]
+ eor r6,r6,r4
+ ldr r4,[r12,#48]
+ eor r7,r7,r5
+ ldr r5,[r12,#52]
+ eor r10,r10,r8
+ ldr r8,[r12,#56]
+ eor r11,r11,r9
+ ldr r9,[r12,#60]
+ eor r2,r2,r4
+ eor r3,r3,r5
+ eor r0,r0,r8
+ eor r1,r1,r9
+ add r4,r12,#64
+ str r4,[sp,#236]
+.L_nomessage11:
+ str r6,[r14,#32]
+ str r7,[r14,#36]
+ str r10,[r14,#40]
+ str r11,[r14,#44]
+ str r2,[r14,#48]
+ str r3,[r14,#52]
+ str r0,[r14,#56]
+ str r1,[r14,#60]
+ add r0,r14,#64
+ str r0,[sp,#232]
+ ldr r0,[sp,#248]
+ cmp r0,#64
+ bhi .L_nextblock
+.L_done:
+ ldr r2,[sp,#160]
+ ldrd r4,[sp,#0]
+ ldrd r6,[sp,#8]
+ ldrd r8,[sp,#16]
+ ldrd r10,[sp,#24]
+ ldr r12,[sp,#228]
+ ldr r14,[sp,#224]
+ ldrd r0,[sp,#40]
+ strd r0,[r2]
+ sub r0,r12,sp
+ mov sp,r12
+ vpop {q4,q5,q6,q7}
+ add r0,r0,#64
+ bx lr
+.size _gcry_arm_neon_salsa20_encrypt,.-_gcry_arm_neon_salsa20_encrypt;
+
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/salsa20.c b/comm/third_party/libgcrypt/cipher/salsa20.c
new file mode 100644
index 0000000000..d8c5c81f30
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/salsa20.c
@@ -0,0 +1,600 @@
+/* salsa20.c - Bernstein's Salsa20 cipher
+ * Copyright (C) 2012 Simon Josefsson, Niels Möller
+ * Copyright (C) 2013 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * For a description of the algorithm, see:
+ * http://cr.yp.to/snuffle/spec.pdf
+ * http://cr.yp.to/snuffle/design.pdf
+ */
+
+/* The code is based on the code in Nettle
+ (git commit id 9d2d8ddaee35b91a4e1a32ae77cba04bea3480e7)
+ which in turn is based on
+ salsa20-ref.c version 20051118
+ D. J. Bernstein
+ Public domain.
+*/
+
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "types.h"
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-internal.h"
+
+
+/* USE_AMD64 indicates whether to compile with AMD64 code. */
+#undef USE_AMD64
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AMD64 1
+#endif
+
+/* USE_ARM_NEON_ASM indicates whether to enable ARM NEON assembly code. */
+#undef USE_ARM_NEON_ASM
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_NEON)
+# define USE_ARM_NEON_ASM 1
+# endif
+#endif /*ENABLE_NEON_SUPPORT*/
+
+
+#define SALSA20_MIN_KEY_SIZE 16 /* Bytes. */
+#define SALSA20_MAX_KEY_SIZE 32 /* Bytes. */
+#define SALSA20_BLOCK_SIZE 64 /* Bytes. */
+#define SALSA20_IV_SIZE 8 /* Bytes. */
+#define SALSA20_INPUT_LENGTH 16 /* 32-bit words. */
+
+/* Number of rounds. The standard uses 20 rounds. In any case the
+ number of rounds must be even. */
+#define SALSA20_ROUNDS 20
+#define SALSA20R12_ROUNDS 12
+
+
+struct SALSA20_context_s;
+
+typedef unsigned int (*salsa20_core_t) (u32 *dst, struct SALSA20_context_s *ctx,
+ unsigned int rounds);
+typedef void (* salsa20_keysetup_t)(struct SALSA20_context_s *ctx,
+ const byte *key, int keylen);
+typedef void (* salsa20_ivsetup_t)(struct SALSA20_context_s *ctx,
+ const byte *iv);
+
+typedef struct SALSA20_context_s
+{
+ /* Indices 1-4 and 11-14 hold the key (two identical copies for the
+ shorter key size), indices 0, 5, 10, 15 are constant, indices 6, 7
+ are the IV, and indices 8, 9 are the block counter:
+
+ C K K K
+ K C I I
+ B B C K
+ K K K C
+ */
+ u32 input[SALSA20_INPUT_LENGTH];
+ u32 pad[SALSA20_INPUT_LENGTH];
+ unsigned int unused; /* bytes in the pad. */
+#ifdef USE_ARM_NEON_ASM
+ int use_neon;
+#endif
+ salsa20_keysetup_t keysetup;
+ salsa20_ivsetup_t ivsetup;
+ salsa20_core_t core;
+} SALSA20_context_t;
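+
+/* For illustration (names SIGMA0..SIGMA3, K0..K7, IV0/IV1, CTR0/CTR1 are
+ only used in this comment): the generic salsa20_keysetup and
+ salsa20_ivsetup below fill the state for a 256-bit key as
+
+ input[ 0.. 3] = SIGMA0 K0 K1 K2
+ input[ 4.. 7] = K3 SIGMA1 IV0 IV1
+ input[ 8..11] = CTR0 CTR1 SIGMA2 K4
+ input[12..15] = K5 K6 K7 SIGMA3
+
+ matching the C/K/I/B diagram above. The NEON code uses its own layout
+ (see salsa20_keysetup_neon). */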
+
+
+/* The masking of the right shift count is needed to allow n == 0 (using
+ just 32 - n results in undefined behaviour). Most uses of this
+ macro use a constant and non-zero rotation count. */
+#define ROTL32(n,x) (((x)<<(n)) | ((x)>>((-(n)&31))))
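+/* Example: for n == 0 the unmasked form would evaluate (x) >> 32, which
+ is undefined for a 32-bit operand, whereas (-(0) & 31) == 0 gives the
+ well-defined (x) >> 0. */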
+
+
+#define LE_SWAP32(v) le_bswap32(v)
+
+#define LE_READ_UINT32(p) buf_get_le32(p)
+
+
+static void salsa20_setiv (void *context, const byte *iv, size_t ivlen);
+static const char *selftest (void);
+
+
+#ifdef USE_AMD64
+
+/* The assembly implementations use the SysV ABI; on Win64 an ABI conversion
+ * and additional stack space to store XMM6-XMM15 are needed. */
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# define ASM_EXTRA_STACK (10 * 16)
+#else
+# define ASM_FUNC_ABI
+# define ASM_EXTRA_STACK 0
+#endif
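+
+/* ASM_EXTRA_STACK accounts for the 10 XMM registers (XMM6..XMM15) of 16
+ bytes each that the Win64 ABI wrapper has to spill; it is added to the
+ burn-stack estimates returned by the assembly routines. */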
+
+/* AMD64 assembly implementations of Salsa20. */
+void _gcry_salsa20_amd64_keysetup(u32 *ctxinput, const void *key, int keybits)
+ ASM_FUNC_ABI;
+void _gcry_salsa20_amd64_ivsetup(u32 *ctxinput, const void *iv)
+ ASM_FUNC_ABI;
+unsigned int
+_gcry_salsa20_amd64_encrypt_blocks(u32 *ctxinput, const void *src, void *dst,
+ size_t len, int rounds) ASM_FUNC_ABI;
+
+static void
+salsa20_keysetup(SALSA20_context_t *ctx, const byte *key, int keylen)
+{
+ _gcry_salsa20_amd64_keysetup(ctx->input, key, keylen * 8);
+}
+
+static void
+salsa20_ivsetup(SALSA20_context_t *ctx, const byte *iv)
+{
+ _gcry_salsa20_amd64_ivsetup(ctx->input, iv);
+}
+
+static unsigned int
+salsa20_core (u32 *dst, SALSA20_context_t *ctx, unsigned int rounds)
+{
+ memset(dst, 0, SALSA20_BLOCK_SIZE);
+ return _gcry_salsa20_amd64_encrypt_blocks(ctx->input, dst, dst, 1, rounds)
+ + ASM_EXTRA_STACK;
+}
+
+#else /* USE_AMD64 */
+
+
+
+#if 0
+# define SALSA20_CORE_DEBUG(i) do { \
+ unsigned debug_j; \
+ for (debug_j = 0; debug_j < 16; debug_j++) \
+ { \
+ if (debug_j == 0) \
+ fprintf(stderr, "%2d:", (i)); \
+ else if (debug_j % 4 == 0) \
+ fprintf(stderr, "\n "); \
+ fprintf(stderr, " %8x", pad[debug_j]); \
+ } \
+ fprintf(stderr, "\n"); \
+ } while (0)
+#else
+# define SALSA20_CORE_DEBUG(i)
+#endif
+
+#define QROUND(x0, x1, x2, x3) \
+ do { \
+ x1 ^= ROTL32 ( 7, x0 + x3); \
+ x2 ^= ROTL32 ( 9, x1 + x0); \
+ x3 ^= ROTL32 (13, x2 + x1); \
+ x0 ^= ROTL32 (18, x3 + x2); \
+ } while(0)
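+
+/* In salsa20_core below, the first four QROUNDs of each loop iteration
+ operate on the columns of the 4x4 state and the next four on its rows,
+ so one iteration is a Salsa20 double round (hence the step of 2). */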
+
+static unsigned int
+salsa20_core (u32 *dst, SALSA20_context_t *ctx, unsigned rounds)
+{
+ u32 pad[SALSA20_INPUT_LENGTH], *src = ctx->input;
+ unsigned int i;
+
+ memcpy (pad, src, sizeof(pad));
+ for (i = 0; i < rounds; i += 2)
+ {
+ SALSA20_CORE_DEBUG (i);
+ QROUND (pad[0], pad[4], pad[8], pad[12]);
+ QROUND (pad[5], pad[9], pad[13], pad[1] );
+ QROUND (pad[10], pad[14], pad[2], pad[6] );
+ QROUND (pad[15], pad[3], pad[7], pad[11]);
+
+ SALSA20_CORE_DEBUG (i+1);
+ QROUND (pad[0], pad[1], pad[2], pad[3] );
+ QROUND (pad[5], pad[6], pad[7], pad[4] );
+ QROUND (pad[10], pad[11], pad[8], pad[9] );
+ QROUND (pad[15], pad[12], pad[13], pad[14]);
+ }
+ SALSA20_CORE_DEBUG (i);
+
+ for (i = 0; i < SALSA20_INPUT_LENGTH; i++)
+ {
+ u32 t = pad[i] + src[i];
+ dst[i] = LE_SWAP32 (t);
+ }
+
+ /* Update counter. */
+ if (!++src[8])
+ src[9]++;
+
+ /* burn_stack */
+ return ( 3*sizeof (void*) \
+ + 2*sizeof (void*) \
+ + 64 \
+ + sizeof (unsigned int) \
+ + sizeof (u32) );
+}
+#undef QROUND
+#undef SALSA20_CORE_DEBUG
+
+static void
+salsa20_keysetup(SALSA20_context_t *ctx, const byte *key, int keylen)
+{
+ /* These constants are the little endian encoding of the string
+ "expand 32-byte k". For the 128 bit variant, the "32" in that
+ string will be fixed up to "16". */
+ ctx->input[0] = 0x61707865; /* "apxe" */
+ ctx->input[5] = 0x3320646e; /* "3 dn" */
+ ctx->input[10] = 0x79622d32; /* "yb-2" */
+ ctx->input[15] = 0x6b206574; /* "k et" */
+
+ ctx->input[1] = LE_READ_UINT32(key + 0);
+ ctx->input[2] = LE_READ_UINT32(key + 4);
+ ctx->input[3] = LE_READ_UINT32(key + 8);
+ ctx->input[4] = LE_READ_UINT32(key + 12);
+ if (keylen == SALSA20_MAX_KEY_SIZE) /* 256 bits */
+ {
+ ctx->input[11] = LE_READ_UINT32(key + 16);
+ ctx->input[12] = LE_READ_UINT32(key + 20);
+ ctx->input[13] = LE_READ_UINT32(key + 24);
+ ctx->input[14] = LE_READ_UINT32(key + 28);
+ }
+ else /* 128 bits */
+ {
+ ctx->input[11] = ctx->input[1];
+ ctx->input[12] = ctx->input[2];
+ ctx->input[13] = ctx->input[3];
+ ctx->input[14] = ctx->input[4];
+
+ ctx->input[5] -= 0x02000000; /* Change to "1 dn". */
+ ctx->input[10] += 0x00000004; /* Change to "yb-6". */
+ }
+}
+
+static void salsa20_ivsetup(SALSA20_context_t *ctx, const byte *iv)
+{
+ ctx->input[6] = LE_READ_UINT32(iv + 0);
+ ctx->input[7] = LE_READ_UINT32(iv + 4);
+ /* Reset the block counter. */
+ ctx->input[8] = 0;
+ ctx->input[9] = 0;
+}
+
+#endif /*!USE_AMD64*/
+
+#ifdef USE_ARM_NEON_ASM
+
+/* ARM NEON implementation of Salsa20. */
+unsigned int
+_gcry_arm_neon_salsa20_encrypt(void *c, const void *m, unsigned int nblks,
+ void *k, unsigned int rounds);
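+
+/* The NEON code expects a different state layout than the one described
+ in the SALSA20_context_s comment above: key words in input[0..7], IV in
+ input[8..9], block counter in input[10..11] and the sigma constants in
+ input[12..15]; hence the dedicated keysetup/ivsetup below. */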
+
+static unsigned int
+salsa20_core_neon (u32 *dst, SALSA20_context_t *ctx, unsigned int rounds)
+{
+ return _gcry_arm_neon_salsa20_encrypt(dst, NULL, 1, ctx->input, rounds);
+}
+
+static void salsa20_ivsetup_neon(SALSA20_context_t *ctx, const byte *iv)
+{
+ memcpy(ctx->input + 8, iv, 8);
+ /* Reset the block counter. */
+ memset(ctx->input + 10, 0, 8);
+}
+
+static void
+salsa20_keysetup_neon(SALSA20_context_t *ctx, const byte *key, int klen)
+{
+ static const unsigned char sigma32[16] = "expand 32-byte k";
+ static const unsigned char sigma16[16] = "expand 16-byte k";
+
+ if (klen == 16)
+ {
+ memcpy (ctx->input, key, 16);
+ memcpy (ctx->input + 4, key, 16); /* Duplicate 128-bit key. */
+ memcpy (ctx->input + 12, sigma16, 16);
+ }
+ else
+ {
+ /* 32-byte key */
+ memcpy (ctx->input, key, 32);
+ memcpy (ctx->input + 12, sigma32, 16);
+ }
+}
+
+#endif /*USE_ARM_NEON_ASM*/
+
+
+static gcry_err_code_t
+salsa20_do_setkey (SALSA20_context_t *ctx,
+ const byte *key, unsigned int keylen)
+{
+ static int initialized;
+ static const char *selftest_failed;
+
+ if (!initialized )
+ {
+ initialized = 1;
+ selftest_failed = selftest ();
+ if (selftest_failed)
+ log_error ("SALSA20 selftest failed (%s)\n", selftest_failed );
+ }
+ if (selftest_failed)
+ return GPG_ERR_SELFTEST_FAILED;
+
+ if (keylen != SALSA20_MIN_KEY_SIZE
+ && keylen != SALSA20_MAX_KEY_SIZE)
+ return GPG_ERR_INV_KEYLEN;
+
+ /* Default ops. */
+ ctx->keysetup = salsa20_keysetup;
+ ctx->ivsetup = salsa20_ivsetup;
+ ctx->core = salsa20_core;
+
+#ifdef USE_ARM_NEON_ASM
+ ctx->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0;
+ if (ctx->use_neon)
+ {
+ /* Use ARM NEON ops instead. */
+ ctx->keysetup = salsa20_keysetup_neon;
+ ctx->ivsetup = salsa20_ivsetup_neon;
+ ctx->core = salsa20_core_neon;
+ }
+#endif
+
+ ctx->keysetup (ctx, key, keylen);
+
+ /* We default to a zero nonce. */
+ salsa20_setiv (ctx, NULL, 0);
+
+ return 0;
+}
+
+
+static gcry_err_code_t
+salsa20_setkey (void *context, const byte *key, unsigned int keylen,
+ cipher_bulk_ops_t *bulk_ops)
+{
+ SALSA20_context_t *ctx = (SALSA20_context_t *)context;
+ gcry_err_code_t rc = salsa20_do_setkey (ctx, key, keylen);
+ (void)bulk_ops;
+ _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *));
+ return rc;
+}
+
+
+static void
+salsa20_setiv (void *context, const byte *iv, size_t ivlen)
+{
+ SALSA20_context_t *ctx = (SALSA20_context_t *)context;
+ byte tmp[SALSA20_IV_SIZE];
+
+ if (iv && ivlen != SALSA20_IV_SIZE)
+ log_info ("WARNING: salsa20_setiv: bad ivlen=%u\n", (u32)ivlen);
+
+ if (!iv || ivlen != SALSA20_IV_SIZE)
+ memset (tmp, 0, sizeof(tmp));
+ else
+ memcpy (tmp, iv, SALSA20_IV_SIZE);
+
+ ctx->ivsetup (ctx, tmp);
+
+ /* Reset the unused pad bytes counter. */
+ ctx->unused = 0;
+
+ wipememory (tmp, sizeof(tmp));
+}
+
+
+
+/* Note: This function requires LENGTH > 0. */
+static void
+salsa20_do_encrypt_stream (SALSA20_context_t *ctx,
+ byte *outbuf, const byte *inbuf,
+ size_t length, unsigned rounds)
+{
+ unsigned int nburn, burn = 0;
+
+ if (ctx->unused)
+ {
+ unsigned char *p = (void*)ctx->pad;
+ size_t n;
+
+ gcry_assert (ctx->unused < SALSA20_BLOCK_SIZE);
+
+ n = ctx->unused;
+ if (n > length)
+ n = length;
+ buf_xor (outbuf, inbuf, p + SALSA20_BLOCK_SIZE - ctx->unused, n);
+ length -= n;
+ outbuf += n;
+ inbuf += n;
+ ctx->unused -= n;
+ if (!length)
+ return;
+ gcry_assert (!ctx->unused);
+ }
+
+#ifdef USE_AMD64
+ if (length >= SALSA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / SALSA20_BLOCK_SIZE;
+ burn = _gcry_salsa20_amd64_encrypt_blocks(ctx->input, inbuf, outbuf,
+ nblocks, rounds);
+ burn += ASM_EXTRA_STACK;
+ length -= SALSA20_BLOCK_SIZE * nblocks;
+ outbuf += SALSA20_BLOCK_SIZE * nblocks;
+ inbuf += SALSA20_BLOCK_SIZE * nblocks;
+ }
+#endif
+
+#ifdef USE_ARM_NEON_ASM
+ if (ctx->use_neon && length >= SALSA20_BLOCK_SIZE)
+ {
+ unsigned int nblocks = length / SALSA20_BLOCK_SIZE;
+ _gcry_arm_neon_salsa20_encrypt (outbuf, inbuf, nblocks, ctx->input,
+ rounds);
+ length -= SALSA20_BLOCK_SIZE * nblocks;
+ outbuf += SALSA20_BLOCK_SIZE * nblocks;
+ inbuf += SALSA20_BLOCK_SIZE * nblocks;
+ }
+#endif
+
+ while (length > 0)
+ {
+ /* Create the next pad and bump the block counter. Note that it
+ is the user's duty to change to another nonce not later than
+ after 2^70 processed bytes (the 64-bit block counter times the
+ 64-byte block size). */
+ nburn = ctx->core (ctx->pad, ctx, rounds);
+ burn = nburn > burn ? nburn : burn;
+
+ if (length <= SALSA20_BLOCK_SIZE)
+ {
+ buf_xor (outbuf, inbuf, ctx->pad, length);
+ ctx->unused = SALSA20_BLOCK_SIZE - length;
+ break;
+ }
+ buf_xor (outbuf, inbuf, ctx->pad, SALSA20_BLOCK_SIZE);
+ length -= SALSA20_BLOCK_SIZE;
+ outbuf += SALSA20_BLOCK_SIZE;
+ inbuf += SALSA20_BLOCK_SIZE;
+ }
+
+ _gcry_burn_stack (burn);
+}
+
+
+static void
+salsa20_encrypt_stream (void *context,
+ byte *outbuf, const byte *inbuf, size_t length)
+{
+ SALSA20_context_t *ctx = (SALSA20_context_t *)context;
+
+ if (length)
+ salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20_ROUNDS);
+}
+
+
+static void
+salsa20r12_encrypt_stream (void *context,
+ byte *outbuf, const byte *inbuf, size_t length)
+{
+ SALSA20_context_t *ctx = (SALSA20_context_t *)context;
+
+ if (length)
+ salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20R12_ROUNDS);
+}
+
+
+static const char*
+selftest (void)
+{
+ byte ctxbuf[sizeof(SALSA20_context_t) + 15];
+ SALSA20_context_t *ctx;
+ byte scratch[8+1];
+ byte buf[256+64+4];
+ int i;
+
+ static byte key_1[] =
+ { 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+ static const byte nonce_1[] =
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+ static const byte plaintext_1[] =
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+ static const byte ciphertext_1[] =
+ { 0xE3, 0xBE, 0x8F, 0xDD, 0x8B, 0xEC, 0xA2, 0xE3};
+
+ /* 16-byte alignment required for amd64 implementation. */
+ ctx = (SALSA20_context_t *)((uintptr_t)(ctxbuf + 15) & ~(uintptr_t)15);
+
+ salsa20_setkey (ctx, key_1, sizeof key_1, NULL);
+ salsa20_setiv (ctx, nonce_1, sizeof nonce_1);
+ scratch[8] = 0;
+ salsa20_encrypt_stream (ctx, scratch, plaintext_1, sizeof plaintext_1);
+ if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1))
+ return "Salsa20 encryption test 1 failed.";
+ if (scratch[8])
+ return "Salsa20 wrote too much.";
+ salsa20_setkey (ctx, key_1, sizeof key_1, NULL);
+ salsa20_setiv (ctx, nonce_1, sizeof nonce_1);
+ salsa20_encrypt_stream (ctx, scratch, scratch, sizeof plaintext_1);
+ if (memcmp (scratch, plaintext_1, sizeof plaintext_1))
+ return "Salsa20 decryption test 1 failed.";
+
+ for (i = 0; i < sizeof buf; i++)
+ buf[i] = i;
+ salsa20_setkey (ctx, key_1, sizeof key_1, NULL);
+ salsa20_setiv (ctx, nonce_1, sizeof nonce_1);
+ /*encrypt*/
+ salsa20_encrypt_stream (ctx, buf, buf, sizeof buf);
+ /*decrypt*/
+ salsa20_setkey (ctx, key_1, sizeof key_1, NULL);
+ salsa20_setiv (ctx, nonce_1, sizeof nonce_1);
+ salsa20_encrypt_stream (ctx, buf, buf, 1);
+ salsa20_encrypt_stream (ctx, buf+1, buf+1, (sizeof buf)-1-1);
+ salsa20_encrypt_stream (ctx, buf+(sizeof buf)-1, buf+(sizeof buf)-1, 1);
+ for (i = 0; i < sizeof buf; i++)
+ if (buf[i] != (byte)i)
+ return "Salsa20 encryption test 2 failed.";
+
+ return NULL;
+}
+
+
+gcry_cipher_spec_t _gcry_cipher_spec_salsa20 =
+ {
+ GCRY_CIPHER_SALSA20,
+ {0, 0}, /* flags */
+ "SALSA20", /* name */
+ NULL, /* aliases */
+ NULL, /* oids */
+ 1, /* blocksize in bytes. */
+ SALSA20_MAX_KEY_SIZE*8, /* standard key length in bits. */
+ sizeof (SALSA20_context_t),
+ salsa20_setkey,
+ NULL,
+ NULL,
+ salsa20_encrypt_stream,
+ salsa20_encrypt_stream,
+ NULL,
+ NULL,
+ salsa20_setiv
+ };
+
+gcry_cipher_spec_t _gcry_cipher_spec_salsa20r12 =
+ {
+ GCRY_CIPHER_SALSA20R12,
+ {0, 0}, /* flags */
+ "SALSA20R12", /* name */
+ NULL, /* aliases */
+ NULL, /* oids */
+ 1, /* blocksize in bytes. */
+ SALSA20_MAX_KEY_SIZE*8, /* standard key length in bits. */
+ sizeof (SALSA20_context_t),
+ salsa20_setkey,
+ NULL,
+ NULL,
+ salsa20r12_encrypt_stream,
+ salsa20r12_encrypt_stream,
+ NULL,
+ NULL,
+ salsa20_setiv
+ };
diff --git a/comm/third_party/libgcrypt/cipher/scrypt.c b/comm/third_party/libgcrypt/cipher/scrypt.c
new file mode 100644
index 0000000000..13fd1cf06c
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/scrypt.c
@@ -0,0 +1,322 @@
+/* scrypt.c - Scrypt password-based key derivation function.
+ * Copyright (C) 2012 Simon Josefsson
+ * Copyright (C) 2013 Christian Grothoff
+ * Copyright (C) 2013 g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* Adapted from Nettle, the low-level cryptographic library, for
+ * libgcrypt by Christian Grothoff; original license:
+ *
+ * Copyright (C) 2012 Simon Josefsson
+ *
+ * The nettle library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * The nettle library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with the nettle library; see the file COPYING.LIB. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02111-1301, USA.
+ */
+
+#include <config.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+#include "kdf-internal.h"
+#include "bufhelp.h"
+
+/* We really need a 64 bit type for this code. */
+#define SALSA20_INPUT_LENGTH 16
+
+#define ROTL32(n,x) (((x)<<(n)) | ((x)>>(32-(n))))
+
+
+/* Read a 64-bit integer in network (big-endian) byte order. */
+#define READ_UINT64(p) buf_get_be64(p)
+
+
+/* And the same in little-endian byte order. */
+#define LE_READ_UINT64(p) buf_get_le64(p)
+
+#define LE_SWAP32(v) le_bswap32(v)
+
+
+#define QROUND(x0, x1, x2, x3) do { \
+ x1 ^= ROTL32(7, x0 + x3); \
+ x2 ^= ROTL32(9, x1 + x0); \
+ x3 ^= ROTL32(13, x2 + x1); \
+ x0 ^= ROTL32(18, x3 + x2); \
+ } while(0)
+
+
+static void
+salsa20_core (u32 *dst, const u32 *src, unsigned int rounds)
+{
+ u32 x[SALSA20_INPUT_LENGTH];
+ unsigned i;
+
+ assert ( (rounds & 1) == 0);
+
+ for (i = 0; i < SALSA20_INPUT_LENGTH; i++)
+ x[i] = LE_SWAP32(src[i]);
+
+ for (i = 0; i < rounds;i += 2)
+ {
+ QROUND(x[0], x[4], x[8], x[12]);
+ QROUND(x[5], x[9], x[13], x[1]);
+ QROUND(x[10], x[14], x[2], x[6]);
+ QROUND(x[15], x[3], x[7], x[11]);
+
+ QROUND(x[0], x[1], x[2], x[3]);
+ QROUND(x[5], x[6], x[7], x[4]);
+ QROUND(x[10], x[11], x[8], x[9]);
+ QROUND(x[15], x[12], x[13], x[14]);
+ }
+
+ for (i = 0; i < SALSA20_INPUT_LENGTH; i++)
+ {
+ u32 t = x[i] + LE_SWAP32(src[i]);
+ dst[i] = LE_SWAP32(t);
+ }
+}
+
+
+static void
+scrypt_block_mix (u32 r, unsigned char *B, unsigned char *tmp2)
+{
+ u64 i;
+ unsigned char *X = tmp2;
+ unsigned char *Y = tmp2 + 64;
+
+#if 0
+ if (r == 1)
+ {
+ for (i = 0; i < 2 * r; i++)
+ {
+ size_t j;
+ printf ("B[%d] = ", (int)i);
+ for (j = 0; j < 64; j++)
+ {
+ if (j && !(j % 16))
+ printf ("\n ");
+ printf (" %02x", B[i * 64 + j]);
+ }
+ putchar ('\n');
+ }
+ }
+#endif
+
+ /* X = B[2 * r - 1] */
+ memcpy (X, &B[(2 * r - 1) * 64], 64);
+
+ /* for i = 0 to 2 * r - 1 do */
+ for (i = 0; i <= 2 * r - 1; i++)
+ {
+ /* T = X xor B[i] */
+ buf_xor(X, X, &B[i * 64], 64);
+
+ /* X = Salsa (T) */
+ salsa20_core ((u32*)(void*)X, (u32*)(void*)X, 8);
+
+ /* Y[i] = X */
+ memcpy (&Y[i * 64], X, 64);
+ }
+
+ for (i = 0; i < r; i++)
+ {
+ memcpy (&B[i * 64], &Y[2 * i * 64], 64);
+ memcpy (&B[(r + i) * 64], &Y[(2 * i + 1) * 64], 64);
+ }
+
+#if 0
+ if (r==1)
+ {
+ for (i = 0; i < 2 * r; i++)
+ {
+ size_t j;
+ printf ("B'[%d] =", (int)i);
+ for (j = 0; j < 64; j++)
+ {
+ if (j && !(j % 16))
+ printf ("\n ");
+ printf (" %02x", B[i * 64 + j]);
+ }
+ putchar ('\n');
+ }
+ }
+#endif
+}
+
+
+static void
+scrypt_ro_mix (u32 r, unsigned char *B, u64 N,
+ unsigned char *tmp1, unsigned char *tmp2)
+{
+ unsigned char *X = B, *T = B;
+ u64 i;
+
+#if 0
+ if (r == 1)
+ {
+ printf ("B = ");
+ for (i = 0; i < 128 * r; i++)
+ {
+ if (i && !(i % 16))
+ printf ("\n ");
+ printf (" %02x", B[i]);
+ }
+ putchar ('\n');
+ }
+#endif
+
+ /* for i = 0 to N - 1 do */
+ for (i = 0; i <= N - 1; i++)
+ {
+ /* V[i] = X */
+ memcpy (&tmp1[i * 128 * r], X, 128 * r);
+
+ /* X = ScryptBlockMix (X) */
+ scrypt_block_mix (r, X, tmp2);
+ }
+
+ /* for i = 0 to N - 1 do */
+ for (i = 0; i <= N - 1; i++)
+ {
+ u64 j;
+
+ /* j = Integerify (X) mod N */
+ j = LE_READ_UINT64 (&X[128 * r - 64]) % N;
+
+ /* T = X xor V[j] */
+ buf_xor (T, T, &tmp1[j * 128 * r], 128 * r);
+
+ /* X = scryptBlockMix (T) */
+ scrypt_block_mix (r, T, tmp2);
+ }
+
+#if 0
+ if (r == 1)
+ {
+ printf ("B' =");
+ for (i = 0; i < 128 * r; i++)
+ {
+ if (i && !(i % 16))
+ printf ("\n ");
+ printf (" %02x", B[i]);
+ }
+ putchar ('\n');
+ }
+#endif
+}
+
+
+/*
+ * Derive DKLEN bytes into DK from PASSWD and SALT using scrypt.  SUBALGO
+ * is used as the CPU/memory cost parameter N and ITERATIONS as the
+ * parallelization parameter p; the block size parameter r is fixed below.
+ */
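+/* Rough usage sketch (example values only): an application normally
+ reaches this function through the public KDF API, along the lines of
+
+ unsigned char out[32];
+ gcry_kdf_derive ("passphrase", 10, GCRY_KDF_SCRYPT,
+ 1024, "salt", 4, 1, sizeof out, out);
+
+ i.e. N = 1024 is passed as SUBALGO and p = 1 as ITERATIONS. */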
+gcry_err_code_t
+_gcry_kdf_scrypt (const unsigned char *passwd, size_t passwdlen,
+ int algo, int subalgo,
+ const unsigned char *salt, size_t saltlen,
+ unsigned long iterations,
+ size_t dkLen, unsigned char *DK)
+{
+ u64 N = subalgo; /* CPU/memory cost parameter. */
+ u32 r; /* Block size. */
+ u32 p = iterations; /* Parallelization parameter. */
+
+ gpg_err_code_t ec;
+ u32 i;
+ unsigned char *B = NULL;
+ unsigned char *tmp1 = NULL;
+ unsigned char *tmp2 = NULL;
+ size_t r128;
+ size_t nbytes;
+
+ if (subalgo < 1 || !iterations)
+ return GPG_ERR_INV_VALUE;
+
+ if (algo == GCRY_KDF_SCRYPT)
+ r = 8;
+ else if (algo == 41) /* Hack to allow the use of all test vectors. */
+ r = 1;
+ else
+ return GPG_ERR_UNKNOWN_ALGORITHM;
+
+ r128 = r * 128;
+ if (r128 / 128 != r)
+ return GPG_ERR_ENOMEM;
+
+ nbytes = p * r128;
+ if (r128 && nbytes / r128 != p)
+ return GPG_ERR_ENOMEM;
+
+ nbytes = N * r128;
+ if (r128 && nbytes / r128 != N)
+ return GPG_ERR_ENOMEM;
+
+ nbytes = 64 + r128;
+ if (nbytes < r128)
+ return GPG_ERR_ENOMEM;
+
+ B = xtrymalloc (p * r128);
+ if (!B)
+ {
+ ec = gpg_err_code_from_syserror ();
+ goto leave;
+ }
+
+ tmp1 = xtrymalloc (N * r128);
+ if (!tmp1)
+ {
+ ec = gpg_err_code_from_syserror ();
+ goto leave;
+ }
+
+ tmp2 = xtrymalloc (64 + r128);
+ if (!tmp2)
+ {
+ ec = gpg_err_code_from_syserror ();
+ goto leave;
+ }
+
+ ec = _gcry_kdf_pkdf2 (passwd, passwdlen, GCRY_MD_SHA256, salt, saltlen,
+ 1 /* iterations */, p * r128, B);
+
+ for (i = 0; !ec && i < p; i++)
+ scrypt_ro_mix (r, &B[i * r128], N, tmp1, tmp2);
+
+ for (i = 0; !ec && i < p; i++)
+ ec = _gcry_kdf_pkdf2 (passwd, passwdlen, GCRY_MD_SHA256, B, p * r128,
+ 1 /* iterations */, dkLen, DK);
+
+ leave:
+ xfree (tmp2);
+ xfree (tmp1);
+ xfree (B);
+
+ return ec;
+}
diff --git a/comm/third_party/libgcrypt/cipher/seed.c b/comm/third_party/libgcrypt/cipher/seed.c
new file mode 100644
index 0000000000..2c8958fa82
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/seed.c
@@ -0,0 +1,478 @@
+/* SEED for libgcrypt
+ * Copyright (C) 2006 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ * --
+ * This implementation was provided for libgcrypt in the public domain
+ * by Hye-Shik Chang <perky@FreeBSD.org>, July 2006.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "types.h" /* for byte and u32 typedefs */
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-internal.h"
+
+#define NUMKC 16
+
+#define GETU32(pt) buf_get_be32(pt)
+#define PUTU32(ct, st) buf_put_be32(ct, st)
+
+union wordbuf
+{
+ u32 w;
+ byte b[4];
+};
+
+#ifdef WORDS_BIGENDIAN
+#define b0 b[3]
+#define b1 b[2]
+#define b2 b[1]
+#define b3 b[0]
+#else
+#define b0 b[0]
+#define b1 b[1]
+#define b2 b[2]
+#define b3 b[3]
+#endif
+
+static const char *selftest(void);
+
+typedef struct
+{
+ u32 keyschedule[32];
+} SEED_context;
+
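+/* SS0..SS3 are SEED's precomputed "extended S-box" tables: in effect each
+ 32-bit entry combines an S-box output with the corresponding masking
+ constants, so the G function used by the OP macro below reduces to four
+ table lookups XORed together. */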
+static const u32 SS0[256] = {
+ 0x2989a1a8, 0x05858184, 0x16c6d2d4, 0x13c3d3d0, 0x14445054, 0x1d0d111c,
+ 0x2c8ca0ac, 0x25052124, 0x1d4d515c, 0x03434340, 0x18081018, 0x1e0e121c,
+ 0x11415150, 0x3cccf0fc, 0x0acac2c8, 0x23436360, 0x28082028, 0x04444044,
+ 0x20002020, 0x1d8d919c, 0x20c0e0e0, 0x22c2e2e0, 0x08c8c0c8, 0x17071314,
+ 0x2585a1a4, 0x0f8f838c, 0x03030300, 0x3b4b7378, 0x3b8bb3b8, 0x13031310,
+ 0x12c2d2d0, 0x2ecee2ec, 0x30407070, 0x0c8c808c, 0x3f0f333c, 0x2888a0a8,
+ 0x32023230, 0x1dcdd1dc, 0x36c6f2f4, 0x34447074, 0x2ccce0ec, 0x15859194,
+ 0x0b0b0308, 0x17475354, 0x1c4c505c, 0x1b4b5358, 0x3d8db1bc, 0x01010100,
+ 0x24042024, 0x1c0c101c, 0x33437370, 0x18889098, 0x10001010, 0x0cccc0cc,
+ 0x32c2f2f0, 0x19c9d1d8, 0x2c0c202c, 0x27c7e3e4, 0x32427270, 0x03838380,
+ 0x1b8b9398, 0x11c1d1d0, 0x06868284, 0x09c9c1c8, 0x20406060, 0x10405050,
+ 0x2383a3a0, 0x2bcbe3e8, 0x0d0d010c, 0x3686b2b4, 0x1e8e929c, 0x0f4f434c,
+ 0x3787b3b4, 0x1a4a5258, 0x06c6c2c4, 0x38487078, 0x2686a2a4, 0x12021210,
+ 0x2f8fa3ac, 0x15c5d1d4, 0x21416160, 0x03c3c3c0, 0x3484b0b4, 0x01414140,
+ 0x12425250, 0x3d4d717c, 0x0d8d818c, 0x08080008, 0x1f0f131c, 0x19899198,
+ 0x00000000, 0x19091118, 0x04040004, 0x13435350, 0x37c7f3f4, 0x21c1e1e0,
+ 0x3dcdf1fc, 0x36467274, 0x2f0f232c, 0x27072324, 0x3080b0b0, 0x0b8b8388,
+ 0x0e0e020c, 0x2b8ba3a8, 0x2282a2a0, 0x2e4e626c, 0x13839390, 0x0d4d414c,
+ 0x29496168, 0x3c4c707c, 0x09090108, 0x0a0a0208, 0x3f8fb3bc, 0x2fcfe3ec,
+ 0x33c3f3f0, 0x05c5c1c4, 0x07878384, 0x14041014, 0x3ecef2fc, 0x24446064,
+ 0x1eced2dc, 0x2e0e222c, 0x0b4b4348, 0x1a0a1218, 0x06060204, 0x21012120,
+ 0x2b4b6368, 0x26466264, 0x02020200, 0x35c5f1f4, 0x12829290, 0x0a8a8288,
+ 0x0c0c000c, 0x3383b3b0, 0x3e4e727c, 0x10c0d0d0, 0x3a4a7278, 0x07474344,
+ 0x16869294, 0x25c5e1e4, 0x26062224, 0x00808080, 0x2d8da1ac, 0x1fcfd3dc,
+ 0x2181a1a0, 0x30003030, 0x37073334, 0x2e8ea2ac, 0x36063234, 0x15051114,
+ 0x22022220, 0x38083038, 0x34c4f0f4, 0x2787a3a4, 0x05454144, 0x0c4c404c,
+ 0x01818180, 0x29c9e1e8, 0x04848084, 0x17879394, 0x35053134, 0x0bcbc3c8,
+ 0x0ecec2cc, 0x3c0c303c, 0x31417170, 0x11011110, 0x07c7c3c4, 0x09898188,
+ 0x35457174, 0x3bcbf3f8, 0x1acad2d8, 0x38c8f0f8, 0x14849094, 0x19495158,
+ 0x02828280, 0x04c4c0c4, 0x3fcff3fc, 0x09494148, 0x39093138, 0x27476364,
+ 0x00c0c0c0, 0x0fcfc3cc, 0x17c7d3d4, 0x3888b0b8, 0x0f0f030c, 0x0e8e828c,
+ 0x02424240, 0x23032320, 0x11819190, 0x2c4c606c, 0x1bcbd3d8, 0x2484a0a4,
+ 0x34043034, 0x31c1f1f0, 0x08484048, 0x02c2c2c0, 0x2f4f636c, 0x3d0d313c,
+ 0x2d0d212c, 0x00404040, 0x3e8eb2bc, 0x3e0e323c, 0x3c8cb0bc, 0x01c1c1c0,
+ 0x2a8aa2a8, 0x3a8ab2b8, 0x0e4e424c, 0x15455154, 0x3b0b3338, 0x1cccd0dc,
+ 0x28486068, 0x3f4f737c, 0x1c8c909c, 0x18c8d0d8, 0x0a4a4248, 0x16465254,
+ 0x37477374, 0x2080a0a0, 0x2dcde1ec, 0x06464244, 0x3585b1b4, 0x2b0b2328,
+ 0x25456164, 0x3acaf2f8, 0x23c3e3e0, 0x3989b1b8, 0x3181b1b0, 0x1f8f939c,
+ 0x1e4e525c, 0x39c9f1f8, 0x26c6e2e4, 0x3282b2b0, 0x31013130, 0x2acae2e8,
+ 0x2d4d616c, 0x1f4f535c, 0x24c4e0e4, 0x30c0f0f0, 0x0dcdc1cc, 0x08888088,
+ 0x16061214, 0x3a0a3238, 0x18485058, 0x14c4d0d4, 0x22426260, 0x29092128,
+ 0x07070304, 0x33033330, 0x28c8e0e8, 0x1b0b1318, 0x05050104, 0x39497178,
+ 0x10809090, 0x2a4a6268, 0x2a0a2228, 0x1a8a9298,
+};
+
+static const u32 SS1[256] = {
+ 0x38380830, 0xe828c8e0, 0x2c2d0d21, 0xa42686a2, 0xcc0fcfc3, 0xdc1eced2,
+ 0xb03383b3, 0xb83888b0, 0xac2f8fa3, 0x60204060, 0x54154551, 0xc407c7c3,
+ 0x44044440, 0x6c2f4f63, 0x682b4b63, 0x581b4b53, 0xc003c3c3, 0x60224262,
+ 0x30330333, 0xb43585b1, 0x28290921, 0xa02080a0, 0xe022c2e2, 0xa42787a3,
+ 0xd013c3d3, 0x90118191, 0x10110111, 0x04060602, 0x1c1c0c10, 0xbc3c8cb0,
+ 0x34360632, 0x480b4b43, 0xec2fcfe3, 0x88088880, 0x6c2c4c60, 0xa82888a0,
+ 0x14170713, 0xc404c4c0, 0x14160612, 0xf434c4f0, 0xc002c2c2, 0x44054541,
+ 0xe021c1e1, 0xd416c6d2, 0x3c3f0f33, 0x3c3d0d31, 0x8c0e8e82, 0x98188890,
+ 0x28280820, 0x4c0e4e42, 0xf436c6f2, 0x3c3e0e32, 0xa42585a1, 0xf839c9f1,
+ 0x0c0d0d01, 0xdc1fcfd3, 0xd818c8d0, 0x282b0b23, 0x64264662, 0x783a4a72,
+ 0x24270723, 0x2c2f0f23, 0xf031c1f1, 0x70324272, 0x40024242, 0xd414c4d0,
+ 0x40014141, 0xc000c0c0, 0x70334373, 0x64274763, 0xac2c8ca0, 0x880b8b83,
+ 0xf437c7f3, 0xac2d8da1, 0x80008080, 0x1c1f0f13, 0xc80acac2, 0x2c2c0c20,
+ 0xa82a8aa2, 0x34340430, 0xd012c2d2, 0x080b0b03, 0xec2ecee2, 0xe829c9e1,
+ 0x5c1d4d51, 0x94148490, 0x18180810, 0xf838c8f0, 0x54174753, 0xac2e8ea2,
+ 0x08080800, 0xc405c5c1, 0x10130313, 0xcc0dcdc1, 0x84068682, 0xb83989b1,
+ 0xfc3fcff3, 0x7c3d4d71, 0xc001c1c1, 0x30310131, 0xf435c5f1, 0x880a8a82,
+ 0x682a4a62, 0xb03181b1, 0xd011c1d1, 0x20200020, 0xd417c7d3, 0x00020202,
+ 0x20220222, 0x04040400, 0x68284860, 0x70314171, 0x04070703, 0xd81bcbd3,
+ 0x9c1d8d91, 0x98198991, 0x60214161, 0xbc3e8eb2, 0xe426c6e2, 0x58194951,
+ 0xdc1dcdd1, 0x50114151, 0x90108090, 0xdc1cccd0, 0x981a8a92, 0xa02383a3,
+ 0xa82b8ba3, 0xd010c0d0, 0x80018181, 0x0c0f0f03, 0x44074743, 0x181a0a12,
+ 0xe023c3e3, 0xec2ccce0, 0x8c0d8d81, 0xbc3f8fb3, 0x94168692, 0x783b4b73,
+ 0x5c1c4c50, 0xa02282a2, 0xa02181a1, 0x60234363, 0x20230323, 0x4c0d4d41,
+ 0xc808c8c0, 0x9c1e8e92, 0x9c1c8c90, 0x383a0a32, 0x0c0c0c00, 0x2c2e0e22,
+ 0xb83a8ab2, 0x6c2e4e62, 0x9c1f8f93, 0x581a4a52, 0xf032c2f2, 0x90128292,
+ 0xf033c3f3, 0x48094941, 0x78384870, 0xcc0cccc0, 0x14150511, 0xf83bcbf3,
+ 0x70304070, 0x74354571, 0x7c3f4f73, 0x34350531, 0x10100010, 0x00030303,
+ 0x64244460, 0x6c2d4d61, 0xc406c6c2, 0x74344470, 0xd415c5d1, 0xb43484b0,
+ 0xe82acae2, 0x08090901, 0x74364672, 0x18190911, 0xfc3ecef2, 0x40004040,
+ 0x10120212, 0xe020c0e0, 0xbc3d8db1, 0x04050501, 0xf83acaf2, 0x00010101,
+ 0xf030c0f0, 0x282a0a22, 0x5c1e4e52, 0xa82989a1, 0x54164652, 0x40034343,
+ 0x84058581, 0x14140410, 0x88098981, 0x981b8b93, 0xb03080b0, 0xe425c5e1,
+ 0x48084840, 0x78394971, 0x94178793, 0xfc3cccf0, 0x1c1e0e12, 0x80028282,
+ 0x20210121, 0x8c0c8c80, 0x181b0b13, 0x5c1f4f53, 0x74374773, 0x54144450,
+ 0xb03282b2, 0x1c1d0d11, 0x24250521, 0x4c0f4f43, 0x00000000, 0x44064642,
+ 0xec2dcde1, 0x58184850, 0x50124252, 0xe82bcbe3, 0x7c3e4e72, 0xd81acad2,
+ 0xc809c9c1, 0xfc3dcdf1, 0x30300030, 0x94158591, 0x64254561, 0x3c3c0c30,
+ 0xb43686b2, 0xe424c4e0, 0xb83b8bb3, 0x7c3c4c70, 0x0c0e0e02, 0x50104050,
+ 0x38390931, 0x24260622, 0x30320232, 0x84048480, 0x68294961, 0x90138393,
+ 0x34370733, 0xe427c7e3, 0x24240420, 0xa42484a0, 0xc80bcbc3, 0x50134353,
+ 0x080a0a02, 0x84078783, 0xd819c9d1, 0x4c0c4c40, 0x80038383, 0x8c0f8f83,
+ 0xcc0ecec2, 0x383b0b33, 0x480a4a42, 0xb43787b3,
+};
+
+static const u32 SS2[256] = {
+ 0xa1a82989, 0x81840585, 0xd2d416c6, 0xd3d013c3, 0x50541444, 0x111c1d0d,
+ 0xa0ac2c8c, 0x21242505, 0x515c1d4d, 0x43400343, 0x10181808, 0x121c1e0e,
+ 0x51501141, 0xf0fc3ccc, 0xc2c80aca, 0x63602343, 0x20282808, 0x40440444,
+ 0x20202000, 0x919c1d8d, 0xe0e020c0, 0xe2e022c2, 0xc0c808c8, 0x13141707,
+ 0xa1a42585, 0x838c0f8f, 0x03000303, 0x73783b4b, 0xb3b83b8b, 0x13101303,
+ 0xd2d012c2, 0xe2ec2ece, 0x70703040, 0x808c0c8c, 0x333c3f0f, 0xa0a82888,
+ 0x32303202, 0xd1dc1dcd, 0xf2f436c6, 0x70743444, 0xe0ec2ccc, 0x91941585,
+ 0x03080b0b, 0x53541747, 0x505c1c4c, 0x53581b4b, 0xb1bc3d8d, 0x01000101,
+ 0x20242404, 0x101c1c0c, 0x73703343, 0x90981888, 0x10101000, 0xc0cc0ccc,
+ 0xf2f032c2, 0xd1d819c9, 0x202c2c0c, 0xe3e427c7, 0x72703242, 0x83800383,
+ 0x93981b8b, 0xd1d011c1, 0x82840686, 0xc1c809c9, 0x60602040, 0x50501040,
+ 0xa3a02383, 0xe3e82bcb, 0x010c0d0d, 0xb2b43686, 0x929c1e8e, 0x434c0f4f,
+ 0xb3b43787, 0x52581a4a, 0xc2c406c6, 0x70783848, 0xa2a42686, 0x12101202,
+ 0xa3ac2f8f, 0xd1d415c5, 0x61602141, 0xc3c003c3, 0xb0b43484, 0x41400141,
+ 0x52501242, 0x717c3d4d, 0x818c0d8d, 0x00080808, 0x131c1f0f, 0x91981989,
+ 0x00000000, 0x11181909, 0x00040404, 0x53501343, 0xf3f437c7, 0xe1e021c1,
+ 0xf1fc3dcd, 0x72743646, 0x232c2f0f, 0x23242707, 0xb0b03080, 0x83880b8b,
+ 0x020c0e0e, 0xa3a82b8b, 0xa2a02282, 0x626c2e4e, 0x93901383, 0x414c0d4d,
+ 0x61682949, 0x707c3c4c, 0x01080909, 0x02080a0a, 0xb3bc3f8f, 0xe3ec2fcf,
+ 0xf3f033c3, 0xc1c405c5, 0x83840787, 0x10141404, 0xf2fc3ece, 0x60642444,
+ 0xd2dc1ece, 0x222c2e0e, 0x43480b4b, 0x12181a0a, 0x02040606, 0x21202101,
+ 0x63682b4b, 0x62642646, 0x02000202, 0xf1f435c5, 0x92901282, 0x82880a8a,
+ 0x000c0c0c, 0xb3b03383, 0x727c3e4e, 0xd0d010c0, 0x72783a4a, 0x43440747,
+ 0x92941686, 0xe1e425c5, 0x22242606, 0x80800080, 0xa1ac2d8d, 0xd3dc1fcf,
+ 0xa1a02181, 0x30303000, 0x33343707, 0xa2ac2e8e, 0x32343606, 0x11141505,
+ 0x22202202, 0x30383808, 0xf0f434c4, 0xa3a42787, 0x41440545, 0x404c0c4c,
+ 0x81800181, 0xe1e829c9, 0x80840484, 0x93941787, 0x31343505, 0xc3c80bcb,
+ 0xc2cc0ece, 0x303c3c0c, 0x71703141, 0x11101101, 0xc3c407c7, 0x81880989,
+ 0x71743545, 0xf3f83bcb, 0xd2d81aca, 0xf0f838c8, 0x90941484, 0x51581949,
+ 0x82800282, 0xc0c404c4, 0xf3fc3fcf, 0x41480949, 0x31383909, 0x63642747,
+ 0xc0c000c0, 0xc3cc0fcf, 0xd3d417c7, 0xb0b83888, 0x030c0f0f, 0x828c0e8e,
+ 0x42400242, 0x23202303, 0x91901181, 0x606c2c4c, 0xd3d81bcb, 0xa0a42484,
+ 0x30343404, 0xf1f031c1, 0x40480848, 0xc2c002c2, 0x636c2f4f, 0x313c3d0d,
+ 0x212c2d0d, 0x40400040, 0xb2bc3e8e, 0x323c3e0e, 0xb0bc3c8c, 0xc1c001c1,
+ 0xa2a82a8a, 0xb2b83a8a, 0x424c0e4e, 0x51541545, 0x33383b0b, 0xd0dc1ccc,
+ 0x60682848, 0x737c3f4f, 0x909c1c8c, 0xd0d818c8, 0x42480a4a, 0x52541646,
+ 0x73743747, 0xa0a02080, 0xe1ec2dcd, 0x42440646, 0xb1b43585, 0x23282b0b,
+ 0x61642545, 0xf2f83aca, 0xe3e023c3, 0xb1b83989, 0xb1b03181, 0x939c1f8f,
+ 0x525c1e4e, 0xf1f839c9, 0xe2e426c6, 0xb2b03282, 0x31303101, 0xe2e82aca,
+ 0x616c2d4d, 0x535c1f4f, 0xe0e424c4, 0xf0f030c0, 0xc1cc0dcd, 0x80880888,
+ 0x12141606, 0x32383a0a, 0x50581848, 0xd0d414c4, 0x62602242, 0x21282909,
+ 0x03040707, 0x33303303, 0xe0e828c8, 0x13181b0b, 0x01040505, 0x71783949,
+ 0x90901080, 0x62682a4a, 0x22282a0a, 0x92981a8a,
+};
+
+static const u32 SS3[256] = {
+ 0x08303838, 0xc8e0e828, 0x0d212c2d, 0x86a2a426, 0xcfc3cc0f, 0xced2dc1e,
+ 0x83b3b033, 0x88b0b838, 0x8fa3ac2f, 0x40606020, 0x45515415, 0xc7c3c407,
+ 0x44404404, 0x4f636c2f, 0x4b63682b, 0x4b53581b, 0xc3c3c003, 0x42626022,
+ 0x03333033, 0x85b1b435, 0x09212829, 0x80a0a020, 0xc2e2e022, 0x87a3a427,
+ 0xc3d3d013, 0x81919011, 0x01111011, 0x06020406, 0x0c101c1c, 0x8cb0bc3c,
+ 0x06323436, 0x4b43480b, 0xcfe3ec2f, 0x88808808, 0x4c606c2c, 0x88a0a828,
+ 0x07131417, 0xc4c0c404, 0x06121416, 0xc4f0f434, 0xc2c2c002, 0x45414405,
+ 0xc1e1e021, 0xc6d2d416, 0x0f333c3f, 0x0d313c3d, 0x8e828c0e, 0x88909818,
+ 0x08202828, 0x4e424c0e, 0xc6f2f436, 0x0e323c3e, 0x85a1a425, 0xc9f1f839,
+ 0x0d010c0d, 0xcfd3dc1f, 0xc8d0d818, 0x0b23282b, 0x46626426, 0x4a72783a,
+ 0x07232427, 0x0f232c2f, 0xc1f1f031, 0x42727032, 0x42424002, 0xc4d0d414,
+ 0x41414001, 0xc0c0c000, 0x43737033, 0x47636427, 0x8ca0ac2c, 0x8b83880b,
+ 0xc7f3f437, 0x8da1ac2d, 0x80808000, 0x0f131c1f, 0xcac2c80a, 0x0c202c2c,
+ 0x8aa2a82a, 0x04303434, 0xc2d2d012, 0x0b03080b, 0xcee2ec2e, 0xc9e1e829,
+ 0x4d515c1d, 0x84909414, 0x08101818, 0xc8f0f838, 0x47535417, 0x8ea2ac2e,
+ 0x08000808, 0xc5c1c405, 0x03131013, 0xcdc1cc0d, 0x86828406, 0x89b1b839,
+ 0xcff3fc3f, 0x4d717c3d, 0xc1c1c001, 0x01313031, 0xc5f1f435, 0x8a82880a,
+ 0x4a62682a, 0x81b1b031, 0xc1d1d011, 0x00202020, 0xc7d3d417, 0x02020002,
+ 0x02222022, 0x04000404, 0x48606828, 0x41717031, 0x07030407, 0xcbd3d81b,
+ 0x8d919c1d, 0x89919819, 0x41616021, 0x8eb2bc3e, 0xc6e2e426, 0x49515819,
+ 0xcdd1dc1d, 0x41515011, 0x80909010, 0xccd0dc1c, 0x8a92981a, 0x83a3a023,
+ 0x8ba3a82b, 0xc0d0d010, 0x81818001, 0x0f030c0f, 0x47434407, 0x0a12181a,
+ 0xc3e3e023, 0xcce0ec2c, 0x8d818c0d, 0x8fb3bc3f, 0x86929416, 0x4b73783b,
+ 0x4c505c1c, 0x82a2a022, 0x81a1a021, 0x43636023, 0x03232023, 0x4d414c0d,
+ 0xc8c0c808, 0x8e929c1e, 0x8c909c1c, 0x0a32383a, 0x0c000c0c, 0x0e222c2e,
+ 0x8ab2b83a, 0x4e626c2e, 0x8f939c1f, 0x4a52581a, 0xc2f2f032, 0x82929012,
+ 0xc3f3f033, 0x49414809, 0x48707838, 0xccc0cc0c, 0x05111415, 0xcbf3f83b,
+ 0x40707030, 0x45717435, 0x4f737c3f, 0x05313435, 0x00101010, 0x03030003,
+ 0x44606424, 0x4d616c2d, 0xc6c2c406, 0x44707434, 0xc5d1d415, 0x84b0b434,
+ 0xcae2e82a, 0x09010809, 0x46727436, 0x09111819, 0xcef2fc3e, 0x40404000,
+ 0x02121012, 0xc0e0e020, 0x8db1bc3d, 0x05010405, 0xcaf2f83a, 0x01010001,
+ 0xc0f0f030, 0x0a22282a, 0x4e525c1e, 0x89a1a829, 0x46525416, 0x43434003,
+ 0x85818405, 0x04101414, 0x89818809, 0x8b93981b, 0x80b0b030, 0xc5e1e425,
+ 0x48404808, 0x49717839, 0x87939417, 0xccf0fc3c, 0x0e121c1e, 0x82828002,
+ 0x01212021, 0x8c808c0c, 0x0b13181b, 0x4f535c1f, 0x47737437, 0x44505414,
+ 0x82b2b032, 0x0d111c1d, 0x05212425, 0x4f434c0f, 0x00000000, 0x46424406,
+ 0xcde1ec2d, 0x48505818, 0x42525012, 0xcbe3e82b, 0x4e727c3e, 0xcad2d81a,
+ 0xc9c1c809, 0xcdf1fc3d, 0x00303030, 0x85919415, 0x45616425, 0x0c303c3c,
+ 0x86b2b436, 0xc4e0e424, 0x8bb3b83b, 0x4c707c3c, 0x0e020c0e, 0x40505010,
+ 0x09313839, 0x06222426, 0x02323032, 0x84808404, 0x49616829, 0x83939013,
+ 0x07333437, 0xc7e3e427, 0x04202424, 0x84a0a424, 0xcbc3c80b, 0x43535013,
+ 0x0a02080a, 0x87838407, 0xc9d1d819, 0x4c404c0c, 0x83838003, 0x8f838c0f,
+ 0xcec2cc0e, 0x0b33383b, 0x4a42480a, 0x87b3b437,
+};
+
+static const u32 KC[NUMKC] = {
+ 0x9e3779b9, 0x3c6ef373, 0x78dde6e6, 0xf1bbcdcc,
+ 0xe3779b99, 0xc6ef3733, 0x8dde6e67, 0x1bbcdccf,
+ 0x3779b99e, 0x6ef3733c, 0xdde6e678, 0xbbcdccf1,
+ 0x779b99e3, 0xef3733c6, 0xde6e678d, 0xbcdccf1b,
+};
+
+
+
+/* Perform the key setup.
+ */
+static gcry_err_code_t
+do_setkey (SEED_context *ctx, const byte *key, const unsigned keylen)
+{
+ static int initialized = 0;
+ static const char *selftest_failed=0;
+ u32 x1, x2, x3, x4;
+ union wordbuf t0, t1;
+ u32 *keyout = ctx->keyschedule;
+ int i;
+
+ if (!initialized)
+ {
+ initialized = 1;
+ selftest_failed = selftest ();
+ if( selftest_failed )
+ log_error ("%s\n", selftest_failed );
+ }
+ if (selftest_failed)
+ return GPG_ERR_SELFTEST_FAILED;
+
+ if (keylen != 16)
+ return GPG_ERR_INV_KEYLEN;
+
+ x1 = GETU32 (key);
+ x2 = GETU32 (key+4);
+ x3 = GETU32 (key+8);
+ x4 = GETU32 (key+12);
+
+ for (i = 0; i < NUMKC; i++)
+ {
+ t0.w = x1 + x3 - KC[i];
+ t1.w = x2 + KC[i] - x4;
+ *(keyout++) = SS0[t0.b0] ^ SS1[t0.b1] ^ SS2[t0.b2] ^ SS3[t0.b3];
+ *(keyout++) = SS0[t1.b0] ^ SS1[t1.b1] ^ SS2[t1.b2] ^ SS3[t1.b3];
+
+ if (i % 2 == 0)
+ {
+ t0.w = x1;
+ x1 = (x1>>8) ^ (x2<<24);
+ x2 = (x2>>8) ^ (t0.w<<24);
+ }
+ else
+ {
+ t0.w = x3;
+ x3 = (x3<<8) ^ (x4>>24);
+ x4 = (x4<<8) ^ (t0.w>>24);
+ }
+ }
+
+ return 0;
+}
+
+static gcry_err_code_t
+seed_setkey (void *context, const byte *key, const unsigned keylen,
+ cipher_bulk_ops_t *bulk_ops)
+{
+ SEED_context *ctx = context;
+ int rc = do_setkey (ctx, key, keylen);
+ (void)bulk_ops;
+ _gcry_burn_stack (4*6 + sizeof(void*)*2 + sizeof(int)*2);
+ return rc;
+}
+
+
+
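+/* One Feistel round of SEED: the two key schedule words at RBASE are
+ mixed into (X3, X4), the G function (the SS0..SS3 lookups) is applied
+ three times with 32-bit additions in between, and the result is XORed
+ into (X1, X2). */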
+#define OP(X1, X2, X3, X4, rbase) \
+ t0.w = X3 ^ ctx->keyschedule[rbase]; \
+ t1.w = X4 ^ ctx->keyschedule[rbase+1]; \
+ t1.w ^= t0.w; \
+ t1.w = SS0[t1.b0] ^ SS1[t1.b1] ^ SS2[t1.b2] ^ SS3[t1.b3]; \
+ t0.w += t1.w; \
+ t0.w = SS0[t0.b0] ^ SS1[t0.b1] ^ SS2[t0.b2] ^ SS3[t0.b3]; \
+ t1.w += t0.w; \
+ t1.w = SS0[t1.b0] ^ SS1[t1.b1] ^ SS2[t1.b2] ^ SS3[t1.b3]; \
+ t0.w += t1.w; \
+ X1 ^= t0.w; \
+ X2 ^= t1.w;
+
+/* Encrypt one block. inbuf and outbuf may be the same. */
+static void
+do_encrypt (const SEED_context *ctx, byte *outbuf, const byte *inbuf)
+{
+ u32 x1, x2, x3, x4;
+ union wordbuf t0, t1;
+
+ x1 = GETU32 (inbuf);
+ x2 = GETU32 (inbuf+4);
+ x3 = GETU32 (inbuf+8);
+ x4 = GETU32 (inbuf+12);
+
+ OP (x1, x2, x3, x4, 0);
+ OP (x3, x4, x1, x2, 2);
+ OP (x1, x2, x3, x4, 4);
+ OP (x3, x4, x1, x2, 6);
+ OP (x1, x2, x3, x4, 8);
+ OP (x3, x4, x1, x2, 10);
+ OP (x1, x2, x3, x4, 12);
+ OP (x3, x4, x1, x2, 14);
+ OP (x1, x2, x3, x4, 16);
+ OP (x3, x4, x1, x2, 18);
+ OP (x1, x2, x3, x4, 20);
+ OP (x3, x4, x1, x2, 22);
+ OP (x1, x2, x3, x4, 24);
+ OP (x3, x4, x1, x2, 26);
+ OP (x1, x2, x3, x4, 28);
+ OP (x3, x4, x1, x2, 30);
+
+ PUTU32 (outbuf, x3);
+ PUTU32 (outbuf+4, x4);
+ PUTU32 (outbuf+8, x1);
+ PUTU32 (outbuf+12, x2);
+}
+
+static unsigned int
+seed_encrypt (void *context, byte *outbuf, const byte *inbuf)
+{
+ SEED_context *ctx = context;
+
+ do_encrypt (ctx, outbuf, inbuf);
+ return /*burn_stack*/ (4*6);
+}
+
+
+
+/* Decrypt one block. inbuf and outbuf may be the same. */
+static void
+do_decrypt (SEED_context *ctx, byte *outbuf, const byte *inbuf)
+{
+ u32 x1, x2, x3, x4;
+ union wordbuf t0, t1;
+
+ x1 = GETU32 (inbuf);
+ x2 = GETU32 (inbuf+4);
+ x3 = GETU32 (inbuf+8);
+ x4 = GETU32 (inbuf+12);
+
+ OP (x1, x2, x3, x4, 30);
+ OP (x3, x4, x1, x2, 28);
+ OP (x1, x2, x3, x4, 26);
+ OP (x3, x4, x1, x2, 24);
+ OP (x1, x2, x3, x4, 22);
+ OP (x3, x4, x1, x2, 20);
+ OP (x1, x2, x3, x4, 18);
+ OP (x3, x4, x1, x2, 16);
+ OP (x1, x2, x3, x4, 14);
+ OP (x3, x4, x1, x2, 12);
+ OP (x1, x2, x3, x4, 10);
+ OP (x3, x4, x1, x2, 8);
+ OP (x1, x2, x3, x4, 6);
+ OP (x3, x4, x1, x2, 4);
+ OP (x1, x2, x3, x4, 2);
+ OP (x3, x4, x1, x2, 0);
+
+ PUTU32 (outbuf, x3);
+ PUTU32 (outbuf+4, x4);
+ PUTU32 (outbuf+8, x1);
+ PUTU32 (outbuf+12, x2);
+}
+
+static unsigned int
+seed_decrypt (void *context, byte *outbuf, const byte *inbuf)
+{
+ SEED_context *ctx = context;
+
+ do_decrypt (ctx, outbuf, inbuf);
+ return /*burn_stack*/ (4*6);
+}
+
+
+/* Test a single encryption and decryption with each key size. */
+static const char*
+selftest (void)
+{
+ SEED_context ctx;
+ byte scratch[16];
+
+ /* The test vector is taken from Appendix B.3 of RFC 4269.
+ */
+ static const byte plaintext[16] = {
+ 0x83, 0xA2, 0xF8, 0xA2, 0x88, 0x64, 0x1F, 0xB9,
+ 0xA4, 0xE9, 0xA5, 0xCC, 0x2F, 0x13, 0x1C, 0x7D
+ };
+ static const byte key[16] = {
+ 0x47, 0x06, 0x48, 0x08, 0x51, 0xE6, 0x1B, 0xE8,
+ 0x5D, 0x74, 0xBF, 0xB3, 0xFD, 0x95, 0x61, 0x85
+ };
+ static const byte ciphertext[16] = {
+ 0xEE, 0x54, 0xD1, 0x3E, 0xBC, 0xAE, 0x70, 0x6D,
+ 0x22, 0x6B, 0xC3, 0x14, 0x2C, 0xD4, 0x0D, 0x4A,
+ };
+
+ seed_setkey (&ctx, key, sizeof(key), NULL);
+ seed_encrypt (&ctx, scratch, plaintext);
+ if (memcmp (scratch, ciphertext, sizeof (ciphertext)))
+ return "SEED test encryption failed.";
+ seed_decrypt (&ctx, scratch, scratch);
+ if (memcmp (scratch, plaintext, sizeof (plaintext)))
+ return "SEED test decryption failed.";
+
+ return NULL;
+}
+
+
+
+static gcry_cipher_oid_spec_t seed_oids[] =
+ {
+ { "1.2.410.200004.1.3", GCRY_CIPHER_MODE_ECB },
+ { "1.2.410.200004.1.4", GCRY_CIPHER_MODE_CBC },
+ { "1.2.410.200004.1.5", GCRY_CIPHER_MODE_CFB },
+ { "1.2.410.200004.1.6", GCRY_CIPHER_MODE_OFB },
+ { NULL }
+ };
+
+gcry_cipher_spec_t _gcry_cipher_spec_seed =
+ {
+ GCRY_CIPHER_SEED, {0, 0},
+ "SEED", NULL, seed_oids, 16, 128, sizeof (SEED_context),
+ seed_setkey, seed_encrypt, seed_decrypt,
+ };
diff --git a/comm/third_party/libgcrypt/cipher/serpent-armv7-neon.S b/comm/third_party/libgcrypt/cipher/serpent-armv7-neon.S
new file mode 100644
index 0000000000..adff639463
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/serpent-armv7-neon.S
@@ -0,0 +1,1124 @@
+/* serpent-armv7-neon.S - ARM/NEON assembly implementation of Serpent cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_NEON)
+
+.text
+
+.syntax unified
+.fpu neon
+.arm
+
+/* ARM registers */
+#define RROUND r0
+
+/* NEON vector registers */
+#define RA0 q0
+#define RA1 q1
+#define RA2 q2
+#define RA3 q3
+#define RA4 q4
+#define RB0 q5
+#define RB1 q6
+#define RB2 q7
+#define RB3 q8
+#define RB4 q9
+
+#define RT0 q10
+#define RT1 q11
+#define RT2 q12
+#define RT3 q13
+
+#define RA0d0 d0
+#define RA0d1 d1
+#define RA1d0 d2
+#define RA1d1 d3
+#define RA2d0 d4
+#define RA2d1 d5
+#define RA3d0 d6
+#define RA3d1 d7
+#define RA4d0 d8
+#define RA4d1 d9
+#define RB0d0 d10
+#define RB0d1 d11
+#define RB1d0 d12
+#define RB1d1 d13
+#define RB2d0 d14
+#define RB2d1 d15
+#define RB3d0 d16
+#define RB3d1 d17
+#define RB4d0 d18
+#define RB4d1 d19
+#define RT0d0 d20
+#define RT0d1 d21
+#define RT1d0 d22
+#define RT1d1 d23
+#define RT2d0 d24
+#define RT2d1 d25
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+
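+/* Transpose a 4x4 matrix of 32-bit words held across four q registers:
+ the vtrn.32 instructions transpose each 2x2 sub-block of lanes and the
+ vswp of the d halves then exchanges the off-diagonal sub-blocks. */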
+#define transpose_4x4(_q0, _q1, _q2, _q3) \
+ vtrn.32 _q0, _q1; \
+ vtrn.32 _q2, _q3; \
+ vswp _q0##d1, _q2##d0; \
+ vswp _q1##d1, _q3##d0;
+
+/**********************************************************************
+ 8-way serpent
+ **********************************************************************/
+
+/*
+ * These are the S-boxes of Serpent, taken from the following research paper.
+ *
+ * D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference,
+ * (New York, New York, USA), pp. 317–329, National Institute of Standards and
+ * Technology, 2000.
+ *
+ * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf
+ *
+ */
+#define SBOX0(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ veor a3, a3, a0; veor b3, b3, b0; vmov a4, a1; vmov b4, b1; \
+ vand a1, a1, a3; vand b1, b1, b3; veor a4, a4, a2; veor b4, b4, b2; \
+ veor a1, a1, a0; veor b1, b1, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
+ veor a0, a0, a4; veor b0, b0, b4; veor a4, a4, a3; veor b4, b4, b3; \
+ veor a3, a3, a2; veor b3, b3, b2; vorr a2, a2, a1; vorr b2, b2, b1; \
+ veor a2, a2, a4; veor b2, b2, b4; vmvn a4, a4; vmvn b4, b4; \
+ vorr a4, a4, a1; vorr b4, b4, b1; veor a1, a1, a3; veor b1, b1, b3; \
+ veor a1, a1, a4; veor b1, b1, b4; vorr a3, a3, a0; vorr b3, b3, b0; \
+ veor a1, a1, a3; veor b1, b1, b3; veor a4, a3; veor b4, b3;
+
+#define SBOX0_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmvn a2, a2; vmvn b2, b2; vmov a4, a1; vmov b4, b1; \
+ vorr a1, a1, a0; vorr b1, b1, b0; vmvn a4, a4; vmvn b4, b4; \
+ veor a1, a1, a2; veor b1, b1, b2; vorr a2, a2, a4; vorr b2, b2, b4; \
+ veor a1, a1, a3; veor b1, b1, b3; veor a0, a0, a4; veor b0, b0, b4; \
+ veor a2, a2, a0; veor b2, b2, b0; vand a0, a0, a3; vand b0, b0, b3; \
+ veor a4, a4, a0; veor b4, b4, b0; vorr a0, a0, a1; vorr b0, b0, b1; \
+ veor a0, a0, a2; veor b0, b0, b2; veor a3, a3, a4; veor b3, b3, b4; \
+ veor a2, a2, a1; veor b2, b2, b1; veor a3, a3, a0; veor b3, b3, b0; \
+ veor a3, a3, a1; veor b3, b3, b1;\
+ vand a2, a2, a3; vand b2, b2, b3;\
+ veor a4, a2; veor b4, b2;
+
+#define SBOX1(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmvn a0, a0; vmvn b0, b0; vmvn a2, a2; vmvn b2, b2; \
+ vmov a4, a0; vmov b4, b0; vand a0, a0, a1; vand b0, b0, b1; \
+ veor a2, a2, a0; veor b2, b2, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
+ veor a3, a3, a2; veor b3, b3, b2; veor a1, a1, a0; veor b1, b1, b0; \
+ veor a0, a0, a4; veor b0, b0, b4; vorr a4, a4, a1; vorr b4, b4, b1; \
+ veor a1, a1, a3; veor b1, b1, b3; vorr a2, a2, a0; vorr b2, b2, b0; \
+ vand a2, a2, a4; vand b2, b2, b4; veor a0, a0, a1; veor b0, b0, b1; \
+ vand a1, a1, a2; vand b1, b1, b2;\
+ veor a1, a1, a0; veor b1, b1, b0; vand a0, a0, a2; vand b0, b0, b2; \
+ veor a0, a4; veor b0, b4;
+
+#define SBOX1_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmov a4, a1; vmov b4, b1; veor a1, a1, a3; veor b1, b1, b3; \
+ vand a3, a3, a1; vand b3, b3, b1; veor a4, a4, a2; veor b4, b4, b2; \
+ veor a3, a3, a0; veor b3, b3, b0; vorr a0, a0, a1; vorr b0, b0, b1; \
+ veor a2, a2, a3; veor b2, b2, b3; veor a0, a0, a4; veor b0, b0, b4; \
+ vorr a0, a0, a2; vorr b0, b0, b2; veor a1, a1, a3; veor b1, b1, b3; \
+ veor a0, a0, a1; veor b0, b0, b1; vorr a1, a1, a3; vorr b1, b1, b3; \
+ veor a1, a1, a0; veor b1, b1, b0; vmvn a4, a4; vmvn b4, b4; \
+ veor a4, a4, a1; veor b4, b4, b1; vorr a1, a1, a0; vorr b1, b1, b0; \
+ veor a1, a1, a0; veor b1, b1, b0;\
+ vorr a1, a1, a4; vorr b1, b1, b4;\
+ veor a3, a1; veor b3, b1;
+
+#define SBOX2(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmov a4, a0; vmov b4, b0; vand a0, a0, a2; vand b0, b0, b2; \
+ veor a0, a0, a3; veor b0, b0, b3; veor a2, a2, a1; veor b2, b2, b1; \
+ veor a2, a2, a0; veor b2, b2, b0; vorr a3, a3, a4; vorr b3, b3, b4; \
+ veor a3, a3, a1; veor b3, b3, b1; veor a4, a4, a2; veor b4, b4, b2; \
+ vmov a1, a3; vmov b1, b3; vorr a3, a3, a4; vorr b3, b3, b4; \
+ veor a3, a3, a0; veor b3, b3, b0; vand a0, a0, a1; vand b0, b0, b1; \
+ veor a4, a4, a0; veor b4, b4, b0; veor a1, a1, a3; veor b1, b1, b3; \
+ veor a1, a1, a4; veor b1, b1, b4; vmvn a4, a4; vmvn b4, b4;
+
+#define SBOX2_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ veor a2, a2, a3; veor b2, b2, b3; veor a3, a3, a0; veor b3, b3, b0; \
+ vmov a4, a3; vmov b4, b3; vand a3, a3, a2; vand b3, b3, b2; \
+ veor a3, a3, a1; veor b3, b3, b1; vorr a1, a1, a2; vorr b1, b1, b2; \
+ veor a1, a1, a4; veor b1, b1, b4; vand a4, a4, a3; vand b4, b4, b3; \
+ veor a2, a2, a3; veor b2, b2, b3; vand a4, a4, a0; vand b4, b4, b0; \
+ veor a4, a4, a2; veor b4, b4, b2; vand a2, a2, a1; vand b2, b2, b1; \
+ vorr a2, a2, a0; vorr b2, b2, b0; vmvn a3, a3; vmvn b3, b3; \
+ veor a2, a2, a3; veor b2, b2, b3; veor a0, a0, a3; veor b0, b0, b3; \
+ vand a0, a0, a1; vand b0, b0, b1; veor a3, a3, a4; veor b3, b3, b4; \
+ veor a3, a0; veor b3, b0;
+
+#define SBOX3(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmov a4, a0; vmov b4, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
+ veor a3, a3, a1; veor b3, b3, b1; vand a1, a1, a4; vand b1, b1, b4; \
+ veor a4, a4, a2; veor b4, b4, b2; veor a2, a2, a3; veor b2, b2, b3; \
+ vand a3, a3, a0; vand b3, b3, b0; vorr a4, a4, a1; vorr b4, b4, b1; \
+ veor a3, a3, a4; veor b3, b3, b4; veor a0, a0, a1; veor b0, b0, b1; \
+ vand a4, a4, a0; vand b4, b4, b0; veor a1, a1, a3; veor b1, b1, b3; \
+ veor a4, a4, a2; veor b4, b4, b2; vorr a1, a1, a0; vorr b1, b1, b0; \
+ veor a1, a1, a2; veor b1, b1, b2; veor a0, a0, a3; veor b0, b0, b3; \
+ vmov a2, a1; vmov b2, b1; vorr a1, a1, a3; vorr b1, b1, b3; \
+ veor a1, a0; veor b1, b0;
+
+#define SBOX3_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmov a4, a2; vmov b4, b2; veor a2, a2, a1; veor b2, b2, b1; \
+ veor a0, a0, a2; veor b0, b0, b2; vand a4, a4, a2; vand b4, b4, b2; \
+ veor a4, a4, a0; veor b4, b4, b0; vand a0, a0, a1; vand b0, b0, b1; \
+ veor a1, a1, a3; veor b1, b1, b3; vorr a3, a3, a4; vorr b3, b3, b4; \
+ veor a2, a2, a3; veor b2, b2, b3; veor a0, a0, a3; veor b0, b0, b3; \
+ veor a1, a1, a4; veor b1, b1, b4; vand a3, a3, a2; vand b3, b3, b2; \
+ veor a3, a3, a1; veor b3, b3, b1; veor a1, a1, a0; veor b1, b1, b0; \
+ vorr a1, a1, a2; vorr b1, b1, b2; veor a0, a0, a3; veor b0, b0, b3; \
+ veor a1, a1, a4; veor b1, b1, b4;\
+ veor a0, a1; veor b0, b1;
+
+#define SBOX4(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ veor a1, a1, a3; veor b1, b1, b3; vmvn a3, a3; vmvn b3, b3; \
+ veor a2, a2, a3; veor b2, b2, b3; veor a3, a3, a0; veor b3, b3, b0; \
+ vmov a4, a1; vmov b4, b1; vand a1, a1, a3; vand b1, b1, b3; \
+ veor a1, a1, a2; veor b1, b1, b2; veor a4, a4, a3; veor b4, b4, b3; \
+ veor a0, a0, a4; veor b0, b0, b4; vand a2, a2, a4; vand b2, b2, b4; \
+ veor a2, a2, a0; veor b2, b2, b0; vand a0, a0, a1; vand b0, b0, b1; \
+ veor a3, a3, a0; veor b3, b3, b0; vorr a4, a4, a1; vorr b4, b4, b1; \
+ veor a4, a4, a0; veor b4, b4, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
+ veor a0, a0, a2; veor b0, b0, b2; vand a2, a2, a3; vand b2, b2, b3; \
+ vmvn a0, a0; vmvn b0, b0; veor a4, a2; veor b4, b2;
+
+#define SBOX4_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmov a4, a2; vmov b4, b2; vand a2, a2, a3; vand b2, b2, b3; \
+ veor a2, a2, a1; veor b2, b2, b1; vorr a1, a1, a3; vorr b1, b1, b3; \
+ vand a1, a1, a0; vand b1, b1, b0; veor a4, a4, a2; veor b4, b4, b2; \
+ veor a4, a4, a1; veor b4, b4, b1; vand a1, a1, a2; vand b1, b1, b2; \
+ vmvn a0, a0; vmvn b0, b0; veor a3, a3, a4; veor b3, b3, b4; \
+ veor a1, a1, a3; veor b1, b1, b3; vand a3, a3, a0; vand b3, b3, b0; \
+ veor a3, a3, a2; veor b3, b3, b2; veor a0, a0, a1; veor b0, b0, b1; \
+ vand a2, a2, a0; vand b2, b2, b0; veor a3, a3, a0; veor b3, b3, b0; \
+ veor a2, a2, a4; veor b2, b2, b4;\
+ vorr a2, a2, a3; vorr b2, b2, b3; veor a3, a3, a0; veor b3, b3, b0; \
+ veor a2, a1; veor b2, b1;
+
+#define SBOX5(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ veor a0, a0, a1; veor b0, b0, b1; veor a1, a1, a3; veor b1, b1, b3; \
+ vmvn a3, a3; vmvn b3, b3; vmov a4, a1; vmov b4, b1; \
+ vand a1, a1, a0; vand b1, b1, b0; veor a2, a2, a3; veor b2, b2, b3; \
+ veor a1, a1, a2; veor b1, b1, b2; vorr a2, a2, a4; vorr b2, b2, b4; \
+ veor a4, a4, a3; veor b4, b4, b3; vand a3, a3, a1; vand b3, b3, b1; \
+ veor a3, a3, a0; veor b3, b3, b0; veor a4, a4, a1; veor b4, b4, b1; \
+ veor a4, a4, a2; veor b4, b4, b2; veor a2, a2, a0; veor b2, b2, b0; \
+ vand a0, a0, a3; vand b0, b0, b3; vmvn a2, a2; vmvn b2, b2; \
+ veor a0, a0, a4; veor b0, b0, b4; vorr a4, a4, a3; vorr b4, b4, b3; \
+ veor a2, a4; veor b2, b4;
+
+#define SBOX5_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmvn a1, a1; vmvn b1, b1; vmov a4, a3; vmov b4, b3; \
+ veor a2, a2, a1; veor b2, b2, b1; vorr a3, a3, a0; vorr b3, b3, b0; \
+ veor a3, a3, a2; veor b3, b3, b2; vorr a2, a2, a1; vorr b2, b2, b1; \
+ vand a2, a2, a0; vand b2, b2, b0; veor a4, a4, a3; veor b4, b4, b3; \
+ veor a2, a2, a4; veor b2, b2, b4; vorr a4, a4, a0; vorr b4, b4, b0; \
+ veor a4, a4, a1; veor b4, b4, b1; vand a1, a1, a2; vand b1, b1, b2; \
+ veor a1, a1, a3; veor b1, b1, b3; veor a4, a4, a2; veor b4, b4, b2; \
+ vand a3, a3, a4; vand b3, b3, b4; veor a4, a4, a1; veor b4, b4, b1; \
+ veor a3, a3, a4; veor b3, b3, b4; vmvn a4, a4; vmvn b4, b4; \
+ veor a3, a0; veor b3, b0;
+
+#define SBOX6(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmvn a2, a2; vmvn b2, b2; vmov a4, a3; vmov b4, b3; \
+ vand a3, a3, a0; vand b3, b3, b0; veor a0, a0, a4; veor b0, b0, b4; \
+ veor a3, a3, a2; veor b3, b3, b2; vorr a2, a2, a4; vorr b2, b2, b4; \
+ veor a1, a1, a3; veor b1, b1, b3; veor a2, a2, a0; veor b2, b2, b0; \
+ vorr a0, a0, a1; vorr b0, b0, b1; veor a2, a2, a1; veor b2, b2, b1; \
+ veor a4, a4, a0; veor b4, b4, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
+ veor a0, a0, a2; veor b0, b0, b2; veor a4, a4, a3; veor b4, b4, b3; \
+ veor a4, a4, a0; veor b4, b4, b0; vmvn a3, a3; vmvn b3, b3; \
+ vand a2, a2, a4; vand b2, b2, b4;\
+ veor a2, a3; veor b2, b3;
+
+#define SBOX6_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ veor a0, a0, a2; veor b0, b0, b2; vmov a4, a2; vmov b4, b2; \
+ vand a2, a2, a0; vand b2, b2, b0; veor a4, a4, a3; veor b4, b4, b3; \
+ vmvn a2, a2; vmvn b2, b2; veor a3, a3, a1; veor b3, b3, b1; \
+ veor a2, a2, a3; veor b2, b2, b3; vorr a4, a4, a0; vorr b4, b4, b0; \
+ veor a0, a0, a2; veor b0, b0, b2; veor a3, a3, a4; veor b3, b3, b4; \
+ veor a4, a4, a1; veor b4, b4, b1; vand a1, a1, a3; vand b1, b1, b3; \
+ veor a1, a1, a0; veor b1, b1, b0; veor a0, a0, a3; veor b0, b0, b3; \
+ vorr a0, a0, a2; vorr b0, b0, b2; veor a3, a3, a1; veor b3, b3, b1; \
+ veor a4, a0; veor b4, b0;
+
+#define SBOX7(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmov a4, a1; vmov b4, b1; vorr a1, a1, a2; vorr b1, b1, b2; \
+ veor a1, a1, a3; veor b1, b1, b3; veor a4, a4, a2; veor b4, b4, b2; \
+ veor a2, a2, a1; veor b2, b2, b1; vorr a3, a3, a4; vorr b3, b3, b4; \
+ vand a3, a3, a0; vand b3, b3, b0; veor a4, a4, a2; veor b4, b4, b2; \
+ veor a3, a3, a1; veor b3, b3, b1; vorr a1, a1, a4; vorr b1, b1, b4; \
+ veor a1, a1, a0; veor b1, b1, b0; vorr a0, a0, a4; vorr b0, b0, b4; \
+ veor a0, a0, a2; veor b0, b0, b2; veor a1, a1, a4; veor b1, b1, b4; \
+ veor a2, a2, a1; veor b2, b2, b1; vand a1, a1, a0; vand b1, b1, b0; \
+ veor a1, a1, a4; veor b1, b1, b4; vmvn a2, a2; vmvn b2, b2; \
+ vorr a2, a2, a0; vorr b2, b2, b0;\
+ veor a4, a2; veor b4, b2;
+
+#define SBOX7_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vmov a4, a2; vmov b4, b2; veor a2, a2, a0; veor b2, b2, b0; \
+ vand a0, a0, a3; vand b0, b0, b3; vorr a4, a4, a3; vorr b4, b4, b3; \
+ vmvn a2, a2; vmvn b2, b2; veor a3, a3, a1; veor b3, b3, b1; \
+ vorr a1, a1, a0; vorr b1, b1, b0; veor a0, a0, a2; veor b0, b0, b2; \
+ vand a2, a2, a4; vand b2, b2, b4; vand a3, a3, a4; vand b3, b3, b4; \
+ veor a1, a1, a2; veor b1, b1, b2; veor a2, a2, a0; veor b2, b2, b0; \
+ vorr a0, a0, a2; vorr b0, b0, b2; veor a4, a4, a1; veor b4, b4, b1; \
+ veor a0, a0, a3; veor b0, b0, b3; veor a3, a3, a4; veor b3, b3, b4; \
+ vorr a4, a4, a0; vorr b4, b4, b0; veor a3, a3, a2; veor b3, b3, b2; \
+ veor a4, a2; veor b4, b2;
+
+/* Apply SBOX number WHICH to the block. */
+#define SBOX(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ SBOX##which (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4)
+
+/* Apply inverse SBOX number WHICH to the block. */
+#define SBOX_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ SBOX##which##_INVERSE (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4)
+
+/* XOR round key into block state in a0,a1,a2,a3. a4 used as temporary. */
+#define BLOCK_XOR_KEY(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vdup.32 RT3, RT0d0[0]; \
+ vdup.32 RT1, RT0d0[1]; \
+ vdup.32 RT2, RT0d1[0]; \
+ vdup.32 RT0, RT0d1[1]; \
+ veor a0, a0, RT3; veor b0, b0, RT3; \
+ veor a1, a1, RT1; veor b1, b1, RT1; \
+ veor a2, a2, RT2; veor b2, b2, RT2; \
+ veor a3, a3, RT0; veor b3, b3, RT0;
+
+#define BLOCK_LOAD_KEY_ENC() \
+ vld1.8 {RT0d0, RT0d1}, [RROUND]!;
+
+#define BLOCK_LOAD_KEY_DEC() \
+ vld1.8 {RT0d0, RT0d1}, [RROUND]; \
+ sub RROUND, RROUND, #16
+
+/* Apply the linear transformation to BLOCK. */
+#define LINEAR_TRANSFORMATION(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vshl.u32 a4, a0, #13; vshl.u32 b4, b0, #13; \
+ vshr.u32 a0, a0, #(32-13); vshr.u32 b0, b0, #(32-13); \
+ veor a0, a0, a4; veor b0, b0, b4; \
+ vshl.u32 a4, a2, #3; vshl.u32 b4, b2, #3; \
+ vshr.u32 a2, a2, #(32-3); vshr.u32 b2, b2, #(32-3); \
+ veor a2, a2, a4; veor b2, b2, b4; \
+ veor a1, a0, a1; veor b1, b0, b1; \
+ veor a1, a2, a1; veor b1, b2, b1; \
+ vshl.u32 a4, a0, #3; vshl.u32 b4, b0, #3; \
+ veor a3, a2, a3; veor b3, b2, b3; \
+ veor a3, a4, a3; veor b3, b4, b3; \
+ vshl.u32 a4, a1, #1; vshl.u32 b4, b1, #1; \
+ vshr.u32 a1, a1, #(32-1); vshr.u32 b1, b1, #(32-1); \
+ veor a1, a1, a4; veor b1, b1, b4; \
+ vshl.u32 a4, a3, #7; vshl.u32 b4, b3, #7; \
+ vshr.u32 a3, a3, #(32-7); vshr.u32 b3, b3, #(32-7); \
+ veor a3, a3, a4; veor b3, b3, b4; \
+ veor a0, a1, a0; veor b0, b1, b0; \
+ veor a0, a3, a0; veor b0, b3, b0; \
+ vshl.u32 a4, a1, #7; vshl.u32 b4, b1, #7; \
+ veor a2, a3, a2; veor b2, b3, b2; \
+ veor a2, a4, a2; veor b2, b4, b2; \
+ vshl.u32 a4, a0, #5; vshl.u32 b4, b0, #5; \
+ vshr.u32 a0, a0, #(32-5); vshr.u32 b0, b0, #(32-5); \
+ veor a0, a0, a4; veor b0, b0, b4; \
+ vshl.u32 a4, a2, #22; vshl.u32 b4, b2, #22; \
+ vshr.u32 a2, a2, #(32-22); vshr.u32 b2, b2, #(32-22); \
+ veor a2, a2, a4; veor b2, b2, b4;
+
+/* Apply the inverse linear transformation to BLOCK. */
+#define LINEAR_TRANSFORMATION_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+ vshr.u32 a4, a2, #22; vshr.u32 b4, b2, #22; \
+ vshl.u32 a2, a2, #(32-22); vshl.u32 b2, b2, #(32-22); \
+ veor a2, a2, a4; veor b2, b2, b4; \
+ vshr.u32 a4, a0, #5; vshr.u32 b4, b0, #5; \
+ vshl.u32 a0, a0, #(32-5); vshl.u32 b0, b0, #(32-5); \
+ veor a0, a0, a4; veor b0, b0, b4; \
+ vshl.u32 a4, a1, #7; vshl.u32 b4, b1, #7; \
+ veor a2, a3, a2; veor b2, b3, b2; \
+ veor a2, a4, a2; veor b2, b4, b2; \
+ veor a0, a1, a0; veor b0, b1, b0; \
+ veor a0, a3, a0; veor b0, b3, b0; \
+ vshr.u32 a4, a3, #7; vshr.u32 b4, b3, #7; \
+ vshl.u32 a3, a3, #(32-7); vshl.u32 b3, b3, #(32-7); \
+ veor a3, a3, a4; veor b3, b3, b4; \
+ vshr.u32 a4, a1, #1; vshr.u32 b4, b1, #1; \
+ vshl.u32 a1, a1, #(32-1); vshl.u32 b1, b1, #(32-1); \
+ veor a1, a1, a4; veor b1, b1, b4; \
+ vshl.u32 a4, a0, #3; vshl.u32 b4, b0, #3; \
+ veor a3, a2, a3; veor b3, b2, b3; \
+ veor a3, a4, a3; veor b3, b4, b3; \
+ veor a1, a0, a1; veor b1, b0, b1; \
+ veor a1, a2, a1; veor b1, b2, b1; \
+ vshr.u32 a4, a2, #3; vshr.u32 b4, b2, #3; \
+ vshl.u32 a2, a2, #(32-3); vshl.u32 b2, b2, #(32-3); \
+ veor a2, a2, a4; veor b2, b2, b4; \
+ vshr.u32 a4, a0, #13; vshr.u32 b4, b0, #13; \
+ vshl.u32 a0, a0, #(32-13); vshl.u32 b0, b0, #(32-13); \
+ veor a0, a0, a4; veor b0, b0, b4;
+
+/* Apply a Serpent round to eight parallel blocks. The `round' argument is
+   informational only; round keys are loaded in sequence through RROUND. */
+#define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
+ BLOCK_LOAD_KEY_ENC (); \
+ SBOX (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
+ LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4);
+
+/* Apply the last Serpent round to eight parallel blocks. The `round' argument
+   is informational only; round keys are loaded in sequence through RROUND. */
+#define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
+ BLOCK_LOAD_KEY_ENC (); \
+ SBOX (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4);
+
+/* Apply an inverse Serpent round to eight parallel blocks. The `round'
+   argument is informational only; round keys are loaded through RROUND. */
+#define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \
+ na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, \
+ nb0, nb1, nb2, nb3, nb4) \
+ LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
+ SBOX_INVERSE (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4); \
+ BLOCK_LOAD_KEY_DEC ();
+
+/* Apply the first inverse Serpent round to eight parallel blocks. The `round'
+   argument is informational only; round keys are loaded through RROUND. */
+#define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \
+ na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, \
+ nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
+ BLOCK_LOAD_KEY_DEC (); \
+ SBOX_INVERSE (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4); \
+ BLOCK_LOAD_KEY_DEC ();
+
+.align 3
+.type __serpent_enc_blk8,%function;
+__serpent_enc_blk8:
+ /* input:
+ * r0: round key pointer
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext
+ * blocks
+ * output:
+ * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: eight parallel
+ * ciphertext blocks
+ */
+
+ transpose_4x4(RA0, RA1, RA2, RA3);
+ BLOCK_LOAD_KEY_ENC ();
+ transpose_4x4(RB0, RB1, RB2, RB3);
+
+ ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+ ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+ ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+ ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+ ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+ ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+ ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+ ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+ ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0,
+ RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0);
+ ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0,
+ RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0);
+ ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2,
+ RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2);
+ ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4,
+ RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4);
+ ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0,
+ RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0);
+ ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0,
+ RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0);
+ ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3,
+ RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3);
+ ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0,
+ RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0);
+ ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4,
+ RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4);
+ ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4,
+ RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4);
+ ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2,
+ RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2);
+ ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3,
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3);
+ ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4,
+ RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4);
+ ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4,
+ RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4);
+ ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0,
+ RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0);
+ ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4,
+ RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4);
+ ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+ ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+ ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+ ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+ ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+ ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+ ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+ ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+
+ transpose_4x4(RA4, RA1, RA2, RA0);
+ transpose_4x4(RB4, RB1, RB2, RB0);
+
+ bx lr;
+.size __serpent_enc_blk8,.-__serpent_enc_blk8;
+
+.align 3
+.type __serpent_dec_blk8,%function;
+__serpent_dec_blk8:
+ /* input:
+ * r0: round key pointer
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
+ * ciphertext blocks
+ * output:
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext
+ * blocks
+ */
+
+ add RROUND, RROUND, #(32*16);
+
+ transpose_4x4(RA0, RA1, RA2, RA3);
+ BLOCK_LOAD_KEY_DEC ();
+ transpose_4x4(RB0, RB1, RB2, RB3);
+
+ ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4,
+ RA3, RA0, RA1, RA4, RA2,
+ RB0, RB1, RB2, RB3, RB4,
+ RB3, RB0, RB1, RB4, RB2);
+ ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3,
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3);
+ ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0,
+ RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0);
+ ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3,
+ RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3);
+ ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3,
+ RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3);
+ ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4,
+ RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4);
+ ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3,
+ RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3);
+ ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1,
+ RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1);
+ ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2,
+ RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2);
+ ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0,
+ RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0);
+ ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4,
+ RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4);
+ ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0,
+ RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0);
+ ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0,
+ RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0);
+ ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1,
+ RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1);
+ ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0,
+ RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0);
+ ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3,
+ RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3);
+ ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2,
+ RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2);
+ ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4,
+ RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4);
+ ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1,
+ RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1);
+ ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4);
+ ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4,
+ RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4);
+ ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3,
+ RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3);
+ ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4,
+ RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4);
+ ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0,
+ RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0);
+ ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2,
+ RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2);
+ ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1,
+ RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1);
+ ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3,
+ RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3);
+ ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1,
+ RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1);
+ ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1,
+ RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1);
+ ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0,
+ RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0);
+ ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1,
+ RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1);
+ ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4,
+ RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4);
+
+ transpose_4x4(RA0, RA1, RA2, RA3);
+ transpose_4x4(RB0, RB1, RB2, RB3);
+
+ bx lr;
+.size __serpent_dec_blk8,.-__serpent_dec_blk8;
+
+.align 3
+.globl _gcry_serpent_neon_ctr_enc
+.type _gcry_serpent_neon_ctr_enc,%function;
+_gcry_serpent_neon_ctr_enc:
+ /* input:
+ * r0: ctx, CTX
+ * r1: dst (8 blocks)
+ * r2: src (8 blocks)
+ * r3: iv
+ */
+
+ vmov.u8 RT1d0, #0xff; /* u64: -1 */
+ push {r4,lr};
+ vadd.u64 RT2d0, RT1d0, RT1d0; /* u64: -2 */
+ vpush {RA4-RB2};
+
+ /* load IV and byteswap */
+ vld1.8 {RA0}, [r3];
+ vrev64.u8 RT0, RA0; /* be => le */
+ ldr r4, [r3, #8];
+
+ /* construct IVs */
+ vsub.u64 RA2d1, RT0d1, RT2d0; /* +2 */
+ vsub.u64 RA1d1, RT0d1, RT1d0; /* +1 */
+ cmp r4, #-1;
+
+ vsub.u64 RB0d1, RA2d1, RT2d0; /* +4 */
+ vsub.u64 RA3d1, RA2d1, RT1d0; /* +3 */
+ ldr r4, [r3, #12];
+
+ vsub.u64 RB2d1, RB0d1, RT2d0; /* +6 */
+ vsub.u64 RB1d1, RB0d1, RT1d0; /* +5 */
+
+ vsub.u64 RT2d1, RB2d1, RT2d0; /* +8 */
+ vsub.u64 RB3d1, RB2d1, RT1d0; /* +7 */
+
+ vmov RA1d0, RT0d0;
+ vmov RA2d0, RT0d0;
+ vmov RA3d0, RT0d0;
+ vmov RB0d0, RT0d0;
+ rev r4, r4;
+ vmov RB1d0, RT0d0;
+ vmov RB2d0, RT0d0;
+ vmov RB3d0, RT0d0;
+ vmov RT2d0, RT0d0;
+
+ /* check need for handling 64-bit overflow and carry */
+ beq .Ldo_ctr_carry;
+
+.Lctr_carry_done:
+ /* le => be */
+ vrev64.u8 RA1, RA1;
+ vrev64.u8 RA2, RA2;
+ vrev64.u8 RA3, RA3;
+ vrev64.u8 RB0, RB0;
+ vrev64.u8 RT2, RT2;
+ vrev64.u8 RB1, RB1;
+ vrev64.u8 RB2, RB2;
+ vrev64.u8 RB3, RB3;
+ /* store new IV */
+ vst1.8 {RT2}, [r3];
+
+ bl __serpent_enc_blk8;
+
+ vld1.8 {RT0, RT1}, [r2]!;
+ vld1.8 {RT2, RT3}, [r2]!;
+ veor RA4, RA4, RT0;
+ veor RA1, RA1, RT1;
+ vld1.8 {RT0, RT1}, [r2]!;
+ veor RA2, RA2, RT2;
+ veor RA0, RA0, RT3;
+ vld1.8 {RT2, RT3}, [r2]!;
+ veor RB4, RB4, RT0;
+ veor RT0, RT0;
+ veor RB1, RB1, RT1;
+ veor RT1, RT1;
+ veor RB2, RB2, RT2;
+ veor RT2, RT2;
+ veor RB0, RB0, RT3;
+ veor RT3, RT3;
+
+ vst1.8 {RA4}, [r1]!;
+ vst1.8 {RA1}, [r1]!;
+ veor RA1, RA1;
+ vst1.8 {RA2}, [r1]!;
+ veor RA2, RA2;
+ vst1.8 {RA0}, [r1]!;
+ veor RA0, RA0;
+ vst1.8 {RB4}, [r1]!;
+ veor RB4, RB4;
+ vst1.8 {RB1}, [r1]!;
+ vst1.8 {RB2}, [r1]!;
+ vst1.8 {RB0}, [r1]!;
+
+ vpop {RA4-RB2};
+
+ /* clear the used registers */
+ veor RA3, RA3;
+ veor RB3, RB3;
+
+ pop {r4,pc};
+
+.Ldo_ctr_carry:
+ cmp r4, #-8;
+ blo .Lctr_carry_done;
+ beq .Lcarry_RT2;
+
+ cmp r4, #-6;
+ blo .Lcarry_RB3;
+ beq .Lcarry_RB2;
+
+ cmp r4, #-4;
+ blo .Lcarry_RB1;
+ beq .Lcarry_RB0;
+
+ cmp r4, #-2;
+ blo .Lcarry_RA3;
+ beq .Lcarry_RA2;
+
+ vsub.u64 RA1d0, RT1d0;
+.Lcarry_RA2:
+ vsub.u64 RA2d0, RT1d0;
+.Lcarry_RA3:
+ vsub.u64 RA3d0, RT1d0;
+.Lcarry_RB0:
+ vsub.u64 RB0d0, RT1d0;
+.Lcarry_RB1:
+ vsub.u64 RB1d0, RT1d0;
+.Lcarry_RB2:
+ vsub.u64 RB2d0, RT1d0;
+.Lcarry_RB3:
+ vsub.u64 RB3d0, RT1d0;
+.Lcarry_RT2:
+ vsub.u64 RT2d0, RT1d0;
+
+ b .Lctr_carry_done;
+.size _gcry_serpent_neon_ctr_enc,.-_gcry_serpent_neon_ctr_enc;
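
The CTR path above converts the big-endian IV to machine order, derives eight consecutive counter values (with explicit handling of the 64-bit carry into the high half), and writes the incremented counter back. A minimal scalar C sketch of that counter construction, purely illustrative and not part of this file (the helper name is hypothetical):

    #include <stdint.h>

    /* Hypothetical helper: derive eight consecutive 128-bit big-endian counter
       blocks from IV and store IV+8 back, mirroring the 8-way CTR code above. */
    static void ctr_make_blocks(uint8_t iv[16], uint8_t blocks[8][16])
    {
        uint64_t hi = 0, lo = 0;
        int i, j;

        for (i = 0; i < 8; i++) {             /* big-endian load */
            hi = (hi << 8) | iv[i];
            lo = (lo << 8) | iv[8 + i];
        }

        for (i = 0; i < 8; i++) {
            uint64_t bhi = hi, blo = lo;
            for (j = 7; j >= 0; j--) {         /* big-endian store */
                blocks[i][j]     = (uint8_t)bhi; bhi >>= 8;
                blocks[i][8 + j] = (uint8_t)blo; blo >>= 8;
            }
            if (++lo == 0)                     /* 64-bit carry into high half */
                hi++;
        }

        for (j = 7; j >= 0; j--) {             /* store new IV (counter + 8) */
            iv[j]     = (uint8_t)hi; hi >>= 8;
            iv[8 + j] = (uint8_t)lo; lo >>= 8;
        }
    }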
+
+.align 3
+.globl _gcry_serpent_neon_cfb_dec
+.type _gcry_serpent_neon_cfb_dec,%function;
+_gcry_serpent_neon_cfb_dec:
+ /* input:
+ * r0: ctx, CTX
+ * r1: dst (8 blocks)
+ * r2: src (8 blocks)
+ * r3: iv
+ */
+
+ push {lr};
+ vpush {RA4-RB2};
+
+ /* Load input */
+ vld1.8 {RA0}, [r3];
+ vld1.8 {RA1, RA2}, [r2]!;
+ vld1.8 {RA3}, [r2]!;
+ vld1.8 {RB0}, [r2]!;
+ vld1.8 {RB1, RB2}, [r2]!;
+ vld1.8 {RB3}, [r2]!;
+
+ /* Update IV */
+ vld1.8 {RT0}, [r2]!;
+ vst1.8 {RT0}, [r3];
+ mov r3, lr;
+ sub r2, r2, #(8*16);
+
+ bl __serpent_enc_blk8;
+
+ vld1.8 {RT0, RT1}, [r2]!;
+ vld1.8 {RT2, RT3}, [r2]!;
+ veor RA4, RA4, RT0;
+ veor RA1, RA1, RT1;
+ vld1.8 {RT0, RT1}, [r2]!;
+ veor RA2, RA2, RT2;
+ veor RA0, RA0, RT3;
+ vld1.8 {RT2, RT3}, [r2]!;
+ veor RB4, RB4, RT0;
+ veor RT0, RT0;
+ veor RB1, RB1, RT1;
+ veor RT1, RT1;
+ veor RB2, RB2, RT2;
+ veor RT2, RT2;
+ veor RB0, RB0, RT3;
+ veor RT3, RT3;
+
+ vst1.8 {RA4}, [r1]!;
+ vst1.8 {RA1}, [r1]!;
+ veor RA1, RA1;
+ vst1.8 {RA2}, [r1]!;
+ veor RA2, RA2;
+ vst1.8 {RA0}, [r1]!;
+ veor RA0, RA0;
+ vst1.8 {RB4}, [r1]!;
+ veor RB4, RB4;
+ vst1.8 {RB1}, [r1]!;
+ vst1.8 {RB2}, [r1]!;
+ vst1.8 {RB0}, [r1]!;
+
+ vpop {RA4-RB2};
+
+ /* clear the used registers */
+ veor RA3, RA3;
+ veor RB3, RB3;
+
+ pop {pc};
+.size _gcry_serpent_neon_cfb_dec,.-_gcry_serpent_neon_cfb_dec;
+
+.align 3
+.globl _gcry_serpent_neon_cbc_dec
+.type _gcry_serpent_neon_cbc_dec,%function;
+_gcry_serpent_neon_cbc_dec:
+ /* input:
+ * r0: ctx, CTX
+ * r1: dst (8 blocks)
+ * r2: src (8 blocks)
+ * r3: iv
+ */
+
+ push {lr};
+ vpush {RA4-RB2};
+
+ vld1.8 {RA0, RA1}, [r2]!;
+ vld1.8 {RA2, RA3}, [r2]!;
+ vld1.8 {RB0, RB1}, [r2]!;
+ vld1.8 {RB2, RB3}, [r2]!;
+ sub r2, r2, #(8*16);
+
+ bl __serpent_dec_blk8;
+
+ vld1.8 {RB4}, [r3];
+ vld1.8 {RT0, RT1}, [r2]!;
+ vld1.8 {RT2, RT3}, [r2]!;
+ veor RA0, RA0, RB4;
+ veor RA1, RA1, RT0;
+ veor RA2, RA2, RT1;
+ vld1.8 {RT0, RT1}, [r2]!;
+ veor RA3, RA3, RT2;
+ veor RB0, RB0, RT3;
+ vld1.8 {RT2, RT3}, [r2]!;
+ veor RB1, RB1, RT0;
+ veor RT0, RT0;
+ veor RB2, RB2, RT1;
+ veor RT1, RT1;
+ veor RB3, RB3, RT2;
+ veor RT2, RT2;
+ vst1.8 {RT3}, [r3]; /* store new IV */
+ veor RT3, RT3;
+
+ vst1.8 {RA0, RA1}, [r1]!;
+ veor RA0, RA0;
+ veor RA1, RA1;
+ vst1.8 {RA2, RA3}, [r1]!;
+ veor RA2, RA2;
+ vst1.8 {RB0, RB1}, [r1]!;
+ veor RA3, RA3;
+ vst1.8 {RB2, RB3}, [r1]!;
+ veor RB3, RB3;
+
+ vpop {RA4-RB2};
+
+ /* clear the used registers */
+ veor RB4, RB4;
+
+ pop {pc};
+.size _gcry_serpent_neon_cbc_dec,.-_gcry_serpent_neon_cbc_dec;
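
The CBC-decryption routine above implements the usual chaining rule P_i = D_K(C_i) xor C_{i-1}, with the IV standing in for C_0 and the last ciphertext block becoming the new IV. In scalar C terms this is roughly the following sketch (block_decrypt is a placeholder, not the libgcrypt interface):

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical single-block decryption primitive. */
    void block_decrypt(const void *ctx, uint8_t out[16], const uint8_t in[16]);

    static void cbc_dec_blocks(const void *ctx, uint8_t *dst, const uint8_t *src,
                               size_t nblocks, uint8_t iv[16])
    {
        uint8_t prev[16], tmp[16];
        size_t n, i;

        memcpy(prev, iv, 16);
        for (n = 0; n < nblocks; n++) {
            const uint8_t *c = src + n * 16;

            block_decrypt(ctx, tmp, c);
            for (i = 0; i < 16; i++)
                dst[n * 16 + i] = tmp[i] ^ prev[i];
            memcpy(prev, c, 16);        /* next chaining value */
        }
        memcpy(iv, prev, 16);           /* store new IV */
    }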
+
+.align 3
+.globl _gcry_serpent_neon_ocb_enc
+.type _gcry_serpent_neon_ocb_enc,%function;
+_gcry_serpent_neon_ocb_enc:
+ /* input:
+ * r0 : ctx, CTX
+ * r1 : dst (8 blocks)
+ * r2 : src (8 blocks)
+ * r3 : offset
+ * sp+0: checksum
+ * sp+4: L pointers (void *L[8])
+ */
+
+ push {r4-r11, ip, lr};
+ add ip, sp, #(10*4);
+
+ vpush {RA4-RB2};
+
+ ldm ip, {r4, lr};
+
+ vld1.8 {RT0}, [r3];
+ vld1.8 {RT1}, [r4];
+
+ /* Load L pointers */
+ ldm lr!, {r5, r6, r7, r8};
+ ldm lr, {r9, r10, r11, ip};
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
+ vld1.8 {RA0, RA1}, [r2]!;
+ vld1.8 {RA2, RA3}, [r2]!;
+ vld1.8 {RB0, RB1}, [r2]!;
+ vld1.8 {RB2, RB3}, [r2];
+
+#define OCB_INPUT(lreg, vreg) \
+ vld1.8 {RT3}, [lreg]; \
+ veor RT0, RT3; \
+ veor RT1, vreg; \
+ veor vreg, RT0; \
+ vst1.8 {RT0}, [r1]!;
+
+ OCB_INPUT(r5, RA0);
+ OCB_INPUT(r6, RA1);
+ OCB_INPUT(r7, RA2);
+ OCB_INPUT(r8, RA3);
+ OCB_INPUT(r9, RB0);
+ OCB_INPUT(r10, RB1);
+ OCB_INPUT(r11, RB2);
+ OCB_INPUT(ip, RB3);
+#undef OCB_INPUT
+
+ sub r1, r1, #(8*16);
+ vst1.8 {RT0}, [r3];
+ vst1.8 {RT1}, [r4];
+ mov r2, r1;
+
+ bl __serpent_enc_blk8;
+
+ vld1.8 {RT0, RT1}, [r1]!;
+ veor RT0, RA4, RT0;
+ veor RT1, RA1, RT1;
+ vld1.8 {RT2, RT3}, [r1]!;
+ vst1.8 {RT0, RT1}, [r2]!;
+ veor RT2, RA2, RT2;
+ veor RT3, RA0, RT3;
+ vld1.8 {RT0, RT1}, [r1]!;
+ vst1.8 {RT2, RT3}, [r2]!;
+ veor RT0, RB4, RT0;
+ veor RT1, RB1, RT1;
+ vld1.8 {RT2, RT3}, [r1]!;
+ vst1.8 {RT0, RT1}, [r2]!;
+ veor RT2, RB2, RT2;
+ veor RT3, RB0, RT3;
+ vst1.8 {RT2, RT3}, [r2]!;
+
+ vpop {RA4-RB2};
+
+ /* clear the used registers */
+ veor RA3, RA3;
+ veor RB3, RB3;
+
+ pop {r4-r11, ip, pc};
+.size _gcry_serpent_neon_ocb_enc,.-_gcry_serpent_neon_ocb_enc;
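
The OCB encryption routine above applies the per-block recurrences given in its comments: advance the offset by L_{ntz(i)}, accumulate the plaintext into the checksum, and encrypt the offset-masked block (the assembly masks all eight inputs first and then runs one 8-way encryption). A rough single-block C sketch, where block_encrypt and the L-value argument are placeholders rather than the actual interface:

    #include <stdint.h>

    /* Hypothetical single-block encryption primitive. */
    void block_encrypt(const void *ctx, uint8_t out[16], const uint8_t in[16]);

    static void xor16(uint8_t *d, const uint8_t *s)
    {
        int i;
        for (i = 0; i < 16; i++)
            d[i] ^= s[i];
    }

    /* One OCB block: Offset_i = Offset_{i-1} ^ L_ntz(i),
       Checksum_i = Checksum_{i-1} ^ P_i,
       C_i = Offset_i ^ E_K(P_i ^ Offset_i). */
    static void ocb_enc_block(const void *ctx, uint8_t c[16], const uint8_t p[16],
                              uint8_t offset[16], uint8_t checksum[16],
                              const uint8_t l_ntz_i[16])
    {
        uint8_t tmp[16];
        int i;

        xor16(offset, l_ntz_i);            /* advance the offset */
        xor16(checksum, p);                /* fold plaintext into checksum */

        for (i = 0; i < 16; i++)
            tmp[i] = p[i] ^ offset[i];     /* mask input */
        block_encrypt(ctx, c, tmp);
        xor16(c, offset);                  /* unmask output */
    }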
+
+.align 3
+.globl _gcry_serpent_neon_ocb_dec
+.type _gcry_serpent_neon_ocb_dec,%function;
+_gcry_serpent_neon_ocb_dec:
+ /* input:
+ * r0 : ctx, CTX
+ * r1 : dst (8 blocks)
+ * r2 : src (8 blocks)
+ * r3 : offset
+ * sp+0: checksum
+ * sp+4: L pointers (void *L[8])
+ */
+
+ push {r4-r11, ip, lr};
+ add ip, sp, #(10*4);
+
+ vpush {RA4-RB2};
+
+ ldm ip, {r4, lr};
+
+ vld1.8 {RT0}, [r3];
+
+ /* Load L pointers */
+ ldm lr!, {r5, r6, r7, r8};
+ ldm lr, {r9, r10, r11, ip};
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+
+ vld1.8 {RA0, RA1}, [r2]!;
+ vld1.8 {RA2, RA3}, [r2]!;
+ vld1.8 {RB0, RB1}, [r2]!;
+ vld1.8 {RB2, RB3}, [r2];
+
+#define OCB_INPUT(lreg, vreg) \
+ vld1.8 {RT3}, [lreg]; \
+ veor RT0, RT3; \
+ veor vreg, RT0; \
+ vst1.8 {RT0}, [r1]!;
+
+ OCB_INPUT(r5, RA0);
+ OCB_INPUT(r6, RA1);
+ OCB_INPUT(r7, RA2);
+ OCB_INPUT(r8, RA3);
+ OCB_INPUT(r9, RB0);
+ OCB_INPUT(r10, RB1);
+ OCB_INPUT(r11, RB2);
+ OCB_INPUT(ip, RB3);
+#undef OCB_INPUT
+
+ sub r1, r1, #(8*16);
+ vst1.8 {RT0}, [r3];
+ mov r2, r1;
+
+ bl __serpent_dec_blk8;
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ vld1.8 {RA4}, [r4];
+
+ vld1.8 {RT0, RT1}, [r1]!;
+ veor RA0, RA0, RT0;
+ veor RA1, RA1, RT1;
+ vld1.8 {RT2, RT3}, [r1]!;
+ veor RA4, RA4, RA0;
+ vst1.8 {RA0, RA1}, [r2]!;
+ veor RA4, RA4, RA1;
+ veor RA2, RA2, RT2;
+ veor RA3, RA3, RT3;
+ vld1.8 {RT0, RT1}, [r1]!;
+ veor RA4, RA4, RA2;
+ vst1.8 {RA2, RA3}, [r2]!;
+ veor RA4, RA4, RA3;
+ veor RB0, RB0, RT0;
+ veor RB1, RB1, RT1;
+ vld1.8 {RT2, RT3}, [r1]!;
+ veor RA4, RA4, RB0;
+ vst1.8 {RB0, RB1}, [r2]!;
+ veor RA4, RA4, RB1;
+ veor RB2, RB2, RT2;
+ veor RB3, RB3, RT3;
+ veor RA4, RA4, RB2;
+ vst1.8 {RB2, RB3}, [r2]!;
+
+ veor RA4, RA4, RB3;
+ vst1.8 {RA4}, [r4];
+
+ vpop {RA4-RB2};
+
+ /* clear the used registers */
+ veor RB4, RB4;
+
+ pop {r4-r11, ip, pc};
+.size _gcry_serpent_neon_ocb_dec,.-_gcry_serpent_neon_ocb_dec;
+
+.align 3
+.globl _gcry_serpent_neon_ocb_auth
+.type _gcry_serpent_neon_ocb_auth,%function;
+_gcry_serpent_neon_ocb_auth:
+ /* input:
+ * r0 : ctx, CTX
+ * r1 : abuf (8 blocks)
+ * r2 : offset
+ * r3 : checksum
+ * sp+0: L pointers (void *L[8])
+ */
+
+ push {r5-r11, ip, lr};
+ ldr lr, [sp, #(9*4)];
+
+ vpush {RA4-RB2};
+
+ vld1.8 {RT0}, [r2];
+
+ /* Load L pointers */
+ ldm lr!, {r5, r6, r7, r8};
+ ldm lr, {r9, r10, r11, ip};
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+
+ vld1.8 {RA0, RA1}, [r1]!;
+ vld1.8 {RA2, RA3}, [r1]!;
+ vld1.8 {RB0, RB1}, [r1]!;
+ vld1.8 {RB2, RB3}, [r1];
+
+#define OCB_INPUT(lreg, vreg) \
+ vld1.8 {RT3}, [lreg]; \
+ veor RT0, RT3; \
+ veor vreg, RT0;
+
+ OCB_INPUT(r5, RA0);
+ OCB_INPUT(r6, RA1);
+ OCB_INPUT(r7, RA2);
+ OCB_INPUT(r8, RA3);
+ OCB_INPUT(r9, RB0);
+ OCB_INPUT(r10, RB1);
+ OCB_INPUT(r11, RB2);
+ OCB_INPUT(ip, RB3);
+#undef OCB_INPUT
+
+ vst1.8 {RT0}, [r2];
+
+ bl __serpent_enc_blk8;
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ vld1.8 {RT0}, [r3];
+
+ veor RA4, RB4;
+ veor RA1, RB1;
+ veor RA2, RB2;
+ veor RA0, RB0;
+
+ veor RA2, RT0;
+ veor RA1, RA4;
+ veor RA0, RA2;
+
+ veor RA0, RA1;
+
+ vst1.8 {RA0}, [r3];
+
+ vpop {RA4-RB2};
+
+ /* clear the used registers */
+ veor RA3, RA3;
+ veor RB3, RB3;
+
+ pop {r5-r11, ip, pc};
+.size _gcry_serpent_neon_ocb_auth,.-_gcry_serpent_neon_ocb_auth;
+
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/serpent-avx2-amd64.S b/comm/third_party/libgcrypt/cipher/serpent-avx2-amd64.S
new file mode 100644
index 0000000000..dcee9b62a5
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/serpent-avx2-amd64.S
@@ -0,0 +1,1160 @@
+/* serpent-avx2-amd64.S - AVX2 implementation of Serpent cipher
+ *
+ * Copyright (C) 2013-2015 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SERPENT) && \
+ defined(ENABLE_AVX2_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+/* struct serpent_context: */
+#define ctx_keys 0
+
+/* register macros */
+#define CTX %rdi
+
+/* vector registers */
+#define RA0 %ymm0
+#define RA1 %ymm1
+#define RA2 %ymm2
+#define RA3 %ymm3
+#define RA4 %ymm4
+
+#define RB0 %ymm5
+#define RB1 %ymm6
+#define RB2 %ymm7
+#define RB3 %ymm8
+#define RB4 %ymm9
+
+#define RNOT %ymm10
+#define RTMP0 %ymm11
+#define RTMP1 %ymm12
+#define RTMP2 %ymm13
+#define RTMP3 %ymm14
+#define RTMP4 %ymm15
+
+#define RNOTx %xmm10
+#define RTMP0x %xmm11
+#define RTMP1x %xmm12
+#define RTMP2x %xmm13
+#define RTMP3x %xmm14
+#define RTMP4x %xmm15
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+
+/* vector 32-bit rotation to left */
+#define vec_rol(reg, nleft, tmp) \
+ vpslld $(nleft), reg, tmp; \
+ vpsrld $(32 - (nleft)), reg, reg; \
+ vpor tmp, reg, reg;
+
+/* vector 32-bit rotation to right */
+#define vec_ror(reg, nright, tmp) \
+ vec_rol(reg, 32 - nright, tmp)
+
+/* 4x4 32-bit integer matrix transpose */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+/**********************************************************************
+ 16-way serpent
+ **********************************************************************/
+
+/*
+ * These are the S-Boxes of Serpent from following research paper.
+ *
+ * D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference,
+ * (New York, New York, USA), p. 317–329, National Institute of Standards and
+ * Technology, 2000.
+ *
+ * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf
+ *
+ */
+#define SBOX0(r0, r1, r2, r3, r4) \
+ vpxor r0, r3, r3; vmovdqa r1, r4; \
+ vpand r3, r1, r1; vpxor r2, r4, r4; \
+ vpxor r0, r1, r1; vpor r3, r0, r0; \
+ vpxor r4, r0, r0; vpxor r3, r4, r4; \
+ vpxor r2, r3, r3; vpor r1, r2, r2; \
+ vpxor r4, r2, r2; vpxor RNOT, r4, r4; \
+ vpor r1, r4, r4; vpxor r3, r1, r1; \
+ vpxor r4, r1, r1; vpor r0, r3, r3; \
+ vpxor r3, r1, r1; vpxor r3, r4, r4;
+
+#define SBOX0_INVERSE(r0, r1, r2, r3, r4) \
+ vpxor RNOT, r2, r2; vmovdqa r1, r4; \
+ vpor r0, r1, r1; vpxor RNOT, r4, r4; \
+ vpxor r2, r1, r1; vpor r4, r2, r2; \
+ vpxor r3, r1, r1; vpxor r4, r0, r0; \
+ vpxor r0, r2, r2; vpand r3, r0, r0; \
+ vpxor r0, r4, r4; vpor r1, r0, r0; \
+ vpxor r2, r0, r0; vpxor r4, r3, r3; \
+ vpxor r1, r2, r2; vpxor r0, r3, r3; \
+ vpxor r1, r3, r3; \
+ vpand r3, r2, r2; \
+ vpxor r2, r4, r4;
+
+#define SBOX1(r0, r1, r2, r3, r4) \
+ vpxor RNOT, r0, r0; vpxor RNOT, r2, r2; \
+ vmovdqa r0, r4; vpand r1, r0, r0; \
+ vpxor r0, r2, r2; vpor r3, r0, r0; \
+ vpxor r2, r3, r3; vpxor r0, r1, r1; \
+ vpxor r4, r0, r0; vpor r1, r4, r4; \
+ vpxor r3, r1, r1; vpor r0, r2, r2; \
+ vpand r4, r2, r2; vpxor r1, r0, r0; \
+ vpand r2, r1, r1; \
+ vpxor r0, r1, r1; vpand r2, r0, r0; \
+ vpxor r4, r0, r0;
+
+#define SBOX1_INVERSE(r0, r1, r2, r3, r4) \
+ vmovdqa r1, r4; vpxor r3, r1, r1; \
+ vpand r1, r3, r3; vpxor r2, r4, r4; \
+ vpxor r0, r3, r3; vpor r1, r0, r0; \
+ vpxor r3, r2, r2; vpxor r4, r0, r0; \
+ vpor r2, r0, r0; vpxor r3, r1, r1; \
+ vpxor r1, r0, r0; vpor r3, r1, r1; \
+ vpxor r0, r1, r1; vpxor RNOT, r4, r4; \
+ vpxor r1, r4, r4; vpor r0, r1, r1; \
+ vpxor r0, r1, r1; \
+ vpor r4, r1, r1; \
+ vpxor r1, r3, r3;
+
+#define SBOX2(r0, r1, r2, r3, r4) \
+ vmovdqa r0, r4; vpand r2, r0, r0; \
+ vpxor r3, r0, r0; vpxor r1, r2, r2; \
+ vpxor r0, r2, r2; vpor r4, r3, r3; \
+ vpxor r1, r3, r3; vpxor r2, r4, r4; \
+ vmovdqa r3, r1; vpor r4, r3, r3; \
+ vpxor r0, r3, r3; vpand r1, r0, r0; \
+ vpxor r0, r4, r4; vpxor r3, r1, r1; \
+ vpxor r4, r1, r1; vpxor RNOT, r4, r4;
+
+#define SBOX2_INVERSE(r0, r1, r2, r3, r4) \
+ vpxor r3, r2, r2; vpxor r0, r3, r3; \
+ vmovdqa r3, r4; vpand r2, r3, r3; \
+ vpxor r1, r3, r3; vpor r2, r1, r1; \
+ vpxor r4, r1, r1; vpand r3, r4, r4; \
+ vpxor r3, r2, r2; vpand r0, r4, r4; \
+ vpxor r2, r4, r4; vpand r1, r2, r2; \
+ vpor r0, r2, r2; vpxor RNOT, r3, r3; \
+ vpxor r3, r2, r2; vpxor r3, r0, r0; \
+ vpand r1, r0, r0; vpxor r4, r3, r3; \
+ vpxor r0, r3, r3;
+
+#define SBOX3(r0, r1, r2, r3, r4) \
+ vmovdqa r0, r4; vpor r3, r0, r0; \
+ vpxor r1, r3, r3; vpand r4, r1, r1; \
+ vpxor r2, r4, r4; vpxor r3, r2, r2; \
+ vpand r0, r3, r3; vpor r1, r4, r4; \
+ vpxor r4, r3, r3; vpxor r1, r0, r0; \
+ vpand r0, r4, r4; vpxor r3, r1, r1; \
+ vpxor r2, r4, r4; vpor r0, r1, r1; \
+ vpxor r2, r1, r1; vpxor r3, r0, r0; \
+ vmovdqa r1, r2; vpor r3, r1, r1; \
+ vpxor r0, r1, r1;
+
+#define SBOX3_INVERSE(r0, r1, r2, r3, r4) \
+ vmovdqa r2, r4; vpxor r1, r2, r2; \
+ vpxor r2, r0, r0; vpand r2, r4, r4; \
+ vpxor r0, r4, r4; vpand r1, r0, r0; \
+ vpxor r3, r1, r1; vpor r4, r3, r3; \
+ vpxor r3, r2, r2; vpxor r3, r0, r0; \
+ vpxor r4, r1, r1; vpand r2, r3, r3; \
+ vpxor r1, r3, r3; vpxor r0, r1, r1; \
+ vpor r2, r1, r1; vpxor r3, r0, r0; \
+ vpxor r4, r1, r1; \
+ vpxor r1, r0, r0;
+
+#define SBOX4(r0, r1, r2, r3, r4) \
+ vpxor r3, r1, r1; vpxor RNOT, r3, r3; \
+ vpxor r3, r2, r2; vpxor r0, r3, r3; \
+ vmovdqa r1, r4; vpand r3, r1, r1; \
+ vpxor r2, r1, r1; vpxor r3, r4, r4; \
+ vpxor r4, r0, r0; vpand r4, r2, r2; \
+ vpxor r0, r2, r2; vpand r1, r0, r0; \
+ vpxor r0, r3, r3; vpor r1, r4, r4; \
+ vpxor r0, r4, r4; vpor r3, r0, r0; \
+ vpxor r2, r0, r0; vpand r3, r2, r2; \
+ vpxor RNOT, r0, r0; vpxor r2, r4, r4;
+
+#define SBOX4_INVERSE(r0, r1, r2, r3, r4) \
+ vmovdqa r2, r4; vpand r3, r2, r2; \
+ vpxor r1, r2, r2; vpor r3, r1, r1; \
+ vpand r0, r1, r1; vpxor r2, r4, r4; \
+ vpxor r1, r4, r4; vpand r2, r1, r1; \
+ vpxor RNOT, r0, r0; vpxor r4, r3, r3; \
+ vpxor r3, r1, r1; vpand r0, r3, r3; \
+ vpxor r2, r3, r3; vpxor r1, r0, r0; \
+ vpand r0, r2, r2; vpxor r0, r3, r3; \
+ vpxor r4, r2, r2; \
+ vpor r3, r2, r2; vpxor r0, r3, r3; \
+ vpxor r1, r2, r2;
+
+#define SBOX5(r0, r1, r2, r3, r4) \
+ vpxor r1, r0, r0; vpxor r3, r1, r1; \
+ vpxor RNOT, r3, r3; vmovdqa r1, r4; \
+ vpand r0, r1, r1; vpxor r3, r2, r2; \
+ vpxor r2, r1, r1; vpor r4, r2, r2; \
+ vpxor r3, r4, r4; vpand r1, r3, r3; \
+ vpxor r0, r3, r3; vpxor r1, r4, r4; \
+ vpxor r2, r4, r4; vpxor r0, r2, r2; \
+ vpand r3, r0, r0; vpxor RNOT, r2, r2; \
+ vpxor r4, r0, r0; vpor r3, r4, r4; \
+ vpxor r4, r2, r2;
+
+#define SBOX5_INVERSE(r0, r1, r2, r3, r4) \
+ vpxor RNOT, r1, r1; vmovdqa r3, r4; \
+ vpxor r1, r2, r2; vpor r0, r3, r3; \
+ vpxor r2, r3, r3; vpor r1, r2, r2; \
+ vpand r0, r2, r2; vpxor r3, r4, r4; \
+ vpxor r4, r2, r2; vpor r0, r4, r4; \
+ vpxor r1, r4, r4; vpand r2, r1, r1; \
+ vpxor r3, r1, r1; vpxor r2, r4, r4; \
+ vpand r4, r3, r3; vpxor r1, r4, r4; \
+ vpxor r4, r3, r3; vpxor RNOT, r4, r4; \
+ vpxor r0, r3, r3;
+
+#define SBOX6(r0, r1, r2, r3, r4) \
+ vpxor RNOT, r2, r2; vmovdqa r3, r4; \
+ vpand r0, r3, r3; vpxor r4, r0, r0; \
+ vpxor r2, r3, r3; vpor r4, r2, r2; \
+ vpxor r3, r1, r1; vpxor r0, r2, r2; \
+ vpor r1, r0, r0; vpxor r1, r2, r2; \
+ vpxor r0, r4, r4; vpor r3, r0, r0; \
+ vpxor r2, r0, r0; vpxor r3, r4, r4; \
+ vpxor r0, r4, r4; vpxor RNOT, r3, r3; \
+ vpand r4, r2, r2; \
+ vpxor r3, r2, r2;
+
+#define SBOX6_INVERSE(r0, r1, r2, r3, r4) \
+ vpxor r2, r0, r0; vmovdqa r2, r4; \
+ vpand r0, r2, r2; vpxor r3, r4, r4; \
+ vpxor RNOT, r2, r2; vpxor r1, r3, r3; \
+ vpxor r3, r2, r2; vpor r0, r4, r4; \
+ vpxor r2, r0, r0; vpxor r4, r3, r3; \
+ vpxor r1, r4, r4; vpand r3, r1, r1; \
+ vpxor r0, r1, r1; vpxor r3, r0, r0; \
+ vpor r2, r0, r0; vpxor r1, r3, r3; \
+ vpxor r0, r4, r4;
+
+#define SBOX7(r0, r1, r2, r3, r4) \
+ vmovdqa r1, r4; vpor r2, r1, r1; \
+ vpxor r3, r1, r1; vpxor r2, r4, r4; \
+ vpxor r1, r2, r2; vpor r4, r3, r3; \
+ vpand r0, r3, r3; vpxor r2, r4, r4; \
+ vpxor r1, r3, r3; vpor r4, r1, r1; \
+ vpxor r0, r1, r1; vpor r4, r0, r0; \
+ vpxor r2, r0, r0; vpxor r4, r1, r1; \
+ vpxor r1, r2, r2; vpand r0, r1, r1; \
+ vpxor r4, r1, r1; vpxor RNOT, r2, r2; \
+ vpor r0, r2, r2; \
+ vpxor r2, r4, r4;
+
+#define SBOX7_INVERSE(r0, r1, r2, r3, r4) \
+ vmovdqa r2, r4; vpxor r0, r2, r2; \
+ vpand r3, r0, r0; vpor r3, r4, r4; \
+ vpxor RNOT, r2, r2; vpxor r1, r3, r3; \
+ vpor r0, r1, r1; vpxor r2, r0, r0; \
+ vpand r4, r2, r2; vpand r4, r3, r3; \
+ vpxor r2, r1, r1; vpxor r0, r2, r2; \
+ vpor r2, r0, r0; vpxor r1, r4, r4; \
+ vpxor r3, r0, r0; vpxor r4, r3, r3; \
+ vpor r0, r4, r4; vpxor r2, r3, r3; \
+ vpxor r2, r4, r4;
+
+/* Apply SBOX number WHICH to the block. */
+#define SBOX(which, r0, r1, r2, r3, r4) \
+ SBOX##which (r0, r1, r2, r3, r4)
+
+/* Apply inverse SBOX number WHICH to the block. */
+#define SBOX_INVERSE(which, r0, r1, r2, r3, r4) \
+ SBOX##which##_INVERSE (r0, r1, r2, r3, r4)
+
+/* XOR round key into block state in r0,r1,r2,r3. r4 used as temporary. */
+#define BLOCK_XOR_KEY(r0, r1, r2, r3, r4, round) \
+ vpbroadcastd (ctx_keys + (round) * 16 + 0 * 4)(CTX), r4; \
+ vpxor r4, r0, r0; \
+ vpbroadcastd (ctx_keys + (round) * 16 + 1 * 4)(CTX), r4; \
+ vpxor r4, r1, r1; \
+ vpbroadcastd (ctx_keys + (round) * 16 + 2 * 4)(CTX), r4; \
+ vpxor r4, r2, r2; \
+ vpbroadcastd (ctx_keys + (round) * 16 + 3 * 4)(CTX), r4; \
+ vpxor r4, r3, r3;
+
+/* Apply the linear transformation to BLOCK. */
+#define LINEAR_TRANSFORMATION(r0, r1, r2, r3, r4) \
+ vec_rol(r0, 13, r4); \
+ vec_rol(r2, 3, r4); \
+ vpxor r0, r1, r1; \
+ vpxor r2, r1, r1; \
+ vpslld $3, r0, r4; \
+ vpxor r2, r3, r3; \
+ vpxor r4, r3, r3; \
+ vec_rol(r1, 1, r4); \
+ vec_rol(r3, 7, r4); \
+ vpxor r1, r0, r0; \
+ vpxor r3, r0, r0; \
+ vpslld $7, r1, r4; \
+ vpxor r3, r2, r2; \
+ vpxor r4, r2, r2; \
+ vec_rol(r0, 5, r4); \
+ vec_rol(r2, 22, r4);
+
+/* Apply the inverse linear transformation to BLOCK. */
+#define LINEAR_TRANSFORMATION_INVERSE(r0, r1, r2, r3, r4) \
+ vec_ror(r2, 22, r4); \
+ vec_ror(r0, 5, r4); \
+ vpslld $7, r1, r4; \
+ vpxor r3, r2, r2; \
+ vpxor r4, r2, r2; \
+ vpxor r1, r0, r0; \
+ vpxor r3, r0, r0; \
+ vec_ror(r3, 7, r4); \
+ vec_ror(r1, 1, r4); \
+ vpslld $3, r0, r4; \
+ vpxor r2, r3, r3; \
+ vpxor r4, r3, r3; \
+ vpxor r0, r1, r1; \
+ vpxor r2, r1, r1; \
+ vec_ror(r2, 3, r4); \
+ vec_ror(r0, 13, r4);
+
+/* Apply a Serpent round to sixteen parallel blocks. The `round' argument
+   selects the round key; the macro does not modify it. */
+#define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ SBOX (which, a0, a1, a2, a3, a4); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
+ SBOX (which, b0, b1, b2, b3, b4); \
+ LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4); \
+ LINEAR_TRANSFORMATION (nb0, nb1, nb2, nb3, nb4);
+
+/* Apply the last Serpent round to sixteen parallel blocks. The `round'
+   argument selects the round keys; both `round' and `round' + 1 are used. */
+#define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ SBOX (which, a0, a1, a2, a3, a4); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
+ SBOX (which, b0, b1, b2, b3, b4); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, ((round) + 1)); \
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, ((round) + 1));
+
+/* Apply an inverse Serpent round to sixteen parallel blocks. The `round'
+   argument selects the round key; the macro does not modify it. */
+#define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \
+ na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, \
+ nb0, nb1, nb2, nb3, nb4) \
+ LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4); \
+ LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4); \
+ SBOX_INVERSE (which, a0, a1, a2, a3, a4); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \
+ SBOX_INVERSE (which, b0, b1, b2, b3, b4); \
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);
+
+/* Apply the first inverse Serpent round to sixteen parallel blocks. The
+   `round' argument selects the round keys for `round' + 1 and `round'. */
+#define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \
+ na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, \
+ nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, ((round) + 1)); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, ((round) + 1)); \
+ SBOX_INVERSE (which, a0, a1, a2, a3, a4); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \
+ SBOX_INVERSE (which, b0, b1, b2, b3, b4); \
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);
+
+.text
+
+.align 8
+ELF(.type __serpent_enc_blk16,@function;)
+__serpent_enc_blk16:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+ * plaintext blocks
+ * output:
+ * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: sixteen parallel
+ * ciphertext blocks
+ */
+ CFI_STARTPROC();
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
+
+ ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+ ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+ ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+ ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+ ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+ ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+ ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+ ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+ ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0,
+ RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0);
+ ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0,
+ RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0);
+ ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2,
+ RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2);
+ ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4,
+ RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4);
+ ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0,
+ RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0);
+ ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0,
+ RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0);
+ ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3,
+ RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3);
+ ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0,
+ RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0);
+ ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4,
+ RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4);
+ ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4,
+ RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4);
+ ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2,
+ RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2);
+ ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3,
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3);
+ ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4,
+ RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4);
+ ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4,
+ RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4);
+ ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0,
+ RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0);
+ ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4,
+ RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4);
+ ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+ ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+ ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+ ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+ ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+ ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+ ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+ ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+
+ transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1);
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size __serpent_enc_blk16,.-__serpent_enc_blk16;)
+
+.align 8
+ELF(.type __serpent_dec_blk16,@function;)
+__serpent_dec_blk16:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+ * ciphertext blocks
+ * output:
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+ * plaintext blocks
+ */
+ CFI_STARTPROC();
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
+
+ ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4,
+ RA3, RA0, RA1, RA4, RA2,
+ RB0, RB1, RB2, RB3, RB4,
+ RB3, RB0, RB1, RB4, RB2);
+ ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3,
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3);
+ ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0,
+ RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0);
+ ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3,
+ RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3);
+ ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3,
+ RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3);
+ ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4,
+ RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4);
+ ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3,
+ RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3);
+ ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1,
+ RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1);
+ ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2,
+ RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2);
+ ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0,
+ RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0);
+ ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4,
+ RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4);
+ ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0,
+ RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0);
+ ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0,
+ RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0);
+ ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1,
+ RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1);
+ ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0,
+ RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0);
+ ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3,
+ RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3);
+ ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2,
+ RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2);
+ ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4,
+ RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4);
+ ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1,
+ RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1);
+ ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4);
+ ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4,
+ RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4);
+ ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3,
+ RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3);
+ ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4,
+ RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4);
+ ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0,
+ RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0);
+ ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2,
+ RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2);
+ ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1,
+ RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1);
+ ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3,
+ RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3);
+ ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1,
+ RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1);
+ ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1,
+ RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1);
+ ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0,
+ RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0);
+ ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1,
+ RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1);
+ ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4,
+ RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4);
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size __serpent_dec_blk16,.-__serpent_dec_blk16;)
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+.align 8
+.globl _gcry_serpent_avx2_ctr_enc
+ELF(.type _gcry_serpent_avx2_ctr_enc,@function;)
+_gcry_serpent_avx2_ctr_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (big endian, 128bit)
+ */
+ CFI_STARTPROC();
+
+ movq 8(%rcx), %rax;
+ bswapq %rax;
+
+ vzeroupper;
+
+ vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
+ vpcmpeqd RNOT, RNOT, RNOT;
+ vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */
+ vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */
+
+ /* load IV and byteswap */
+ vmovdqu (%rcx), RTMP4x;
+ vpshufb RTMP3x, RTMP4x, RTMP4x;
+ vmovdqa RTMP4x, RTMP0x;
+ inc_le128(RTMP4x, RNOTx, RTMP1x);
+ vinserti128 $1, RTMP4x, RTMP0, RTMP0;
+ vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */
+
+ /* check need for handling 64-bit overflow and carry */
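+	/* (%rax holds the low 64 bits of the big-endian counter; if they cannot
+	 * wrap within the next 16 blocks, the fast path below builds the IVs
+	 * with packed 64-bit arithmetic on the low halves only.) */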
+ cmpq $(0xffffffffffffffff - 16), %rax;
+ ja .Lhandle_ctr_carry;
+
+ /* construct IVs */
+ vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */
+ vpshufb RTMP3, RTMP0, RA1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */
+ vpshufb RTMP3, RTMP0, RA2;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */
+ vpshufb RTMP3, RTMP0, RA3;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */
+ vpshufb RTMP3, RTMP0, RB0;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */
+ vpshufb RTMP3, RTMP0, RB1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */
+ vpshufb RTMP3, RTMP0, RB2;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */
+ vpshufb RTMP3, RTMP0, RB3;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
+ vpshufb RTMP3x, RTMP0x, RTMP0x;
+
+ jmp .Lctr_carry_done;
+
+.Lhandle_ctr_carry:
+ /* construct IVs */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vextracti128 $1, RTMP0, RTMP0x;
+ vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */
+
+.align 4
+.Lctr_carry_done:
+ /* store new IV */
+ vmovdqu RTMP0x, (%rcx);
+
+ call __serpent_enc_blk16;
+
+ vpxor (0 * 32)(%rdx), RA4, RA4;
+ vpxor (1 * 32)(%rdx), RA1, RA1;
+ vpxor (2 * 32)(%rdx), RA2, RA2;
+ vpxor (3 * 32)(%rdx), RA0, RA0;
+ vpxor (4 * 32)(%rdx), RB4, RB4;
+ vpxor (5 * 32)(%rdx), RB1, RB1;
+ vpxor (6 * 32)(%rdx), RB2, RB2;
+ vpxor (7 * 32)(%rdx), RB0, RB0;
+
+ vmovdqu RA4, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA0, (3 * 32)(%rsi);
+ vmovdqu RB4, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB0, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_serpent_avx2_ctr_enc,.-_gcry_serpent_avx2_ctr_enc;)
+
+.align 8
+.globl _gcry_serpent_avx2_cbc_dec
+ELF(.type _gcry_serpent_avx2_cbc_dec,@function;)
+_gcry_serpent_avx2_cbc_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ vmovdqu (0 * 32)(%rdx), RA0;
+ vmovdqu (1 * 32)(%rdx), RA1;
+ vmovdqu (2 * 32)(%rdx), RA2;
+ vmovdqu (3 * 32)(%rdx), RA3;
+ vmovdqu (4 * 32)(%rdx), RB0;
+ vmovdqu (5 * 32)(%rdx), RB1;
+ vmovdqu (6 * 32)(%rdx), RB2;
+ vmovdqu (7 * 32)(%rdx), RB3;
+
+ call __serpent_dec_blk16;
+
+ vmovdqu (%rcx), RNOTx;
+ vinserti128 $1, (%rdx), RNOT, RNOT;
+ vpxor RNOT, RA0, RA0;
+ vpxor (0 * 32 + 16)(%rdx), RA1, RA1;
+ vpxor (1 * 32 + 16)(%rdx), RA2, RA2;
+ vpxor (2 * 32 + 16)(%rdx), RA3, RA3;
+ vpxor (3 * 32 + 16)(%rdx), RB0, RB0;
+ vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
+ vpxor (5 * 32 + 16)(%rdx), RB2, RB2;
+ vpxor (6 * 32 + 16)(%rdx), RB3, RB3;
+ vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+ vmovdqu RNOTx, (%rcx); /* store new IV */
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_serpent_avx2_cbc_dec,.-_gcry_serpent_avx2_cbc_dec;)
+
+.align 8
+.globl _gcry_serpent_avx2_cfb_dec
+ELF(.type _gcry_serpent_avx2_cfb_dec,@function;)
+_gcry_serpent_avx2_cfb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ /* Load input */
+ vmovdqu (%rcx), RNOTx;
+ vinserti128 $1, (%rdx), RNOT, RA0;
+ vmovdqu (0 * 32 + 16)(%rdx), RA1;
+ vmovdqu (1 * 32 + 16)(%rdx), RA2;
+ vmovdqu (2 * 32 + 16)(%rdx), RA3;
+ vmovdqu (3 * 32 + 16)(%rdx), RB0;
+ vmovdqu (4 * 32 + 16)(%rdx), RB1;
+ vmovdqu (5 * 32 + 16)(%rdx), RB2;
+ vmovdqu (6 * 32 + 16)(%rdx), RB3;
+
+ /* Update IV */
+ vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+ vmovdqu RNOTx, (%rcx);
+
+ call __serpent_enc_blk16;
+
+ vpxor (0 * 32)(%rdx), RA4, RA4;
+ vpxor (1 * 32)(%rdx), RA1, RA1;
+ vpxor (2 * 32)(%rdx), RA2, RA2;
+ vpxor (3 * 32)(%rdx), RA0, RA0;
+ vpxor (4 * 32)(%rdx), RB4, RB4;
+ vpxor (5 * 32)(%rdx), RB1, RB1;
+ vpxor (6 * 32)(%rdx), RB2, RB2;
+ vpxor (7 * 32)(%rdx), RB0, RB0;
+
+ vmovdqu RA4, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA0, (3 * 32)(%rsi);
+ vmovdqu RB4, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB0, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_serpent_avx2_cfb_dec,.-_gcry_serpent_avx2_cfb_dec;)
+
+.align 8
+.globl _gcry_serpent_avx2_ocb_enc
+ELF(.type _gcry_serpent_avx2_ocb_enc,@function;)
+
+_gcry_serpent_avx2_ocb_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[16])
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ subq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ vmovdqu (%rcx), RTMP0x;
+ vmovdqu (%r8), RTMP1x;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
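+/* Each OCB_INPUT call processes two blocks: it chains the two new offsets
+ * through RTMP0x, accumulates the plaintext into the checksum in RTMP1,
+ * whitens the plaintext in yreg with the offsets, and parks the offsets in
+ * the dst buffer so they can be XORed back in after __serpent_enc_blk16. */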
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rdx), yreg; \
+ vpxor (l0reg), RTMP0x, RNOTx; \
+ vpxor (l1reg), RNOTx, RTMP0x; \
+ vinserti128 $1, RTMP0x, RNOT, RNOT; \
+ vpxor yreg, RTMP1, RTMP1; \
+ vpxor yreg, RNOT, yreg; \
+ vmovdqu RNOT, (n * 32)(%rsi);
+
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %r11, RA0);
+ OCB_INPUT(1, %r12, %r13, RA1);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(2, %r10, %r11, RA2);
+ OCB_INPUT(3, %r12, %r13, RA3);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %r11, RB0);
+ OCB_INPUT(5, %r12, %r13, RB1);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(6, %r10, %r11, RB2);
+ OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+ vextracti128 $1, RTMP1, RNOTx;
+ vmovdqu RTMP0x, (%rcx);
+ vpxor RNOTx, RTMP1x, RTMP1x;
+ vmovdqu RTMP1x, (%r8);
+
+ movq (0 * 8)(%rsp), %r10;
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ call __serpent_enc_blk16;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ vpxor (0 * 32)(%rsi), RA4, RA4;
+ vpxor (1 * 32)(%rsi), RA1, RA1;
+ vpxor (2 * 32)(%rsi), RA2, RA2;
+ vpxor (3 * 32)(%rsi), RA0, RA0;
+ vpxor (4 * 32)(%rsi), RB4, RB4;
+ vpxor (5 * 32)(%rsi), RB1, RB1;
+ vpxor (6 * 32)(%rsi), RB2, RB2;
+ vpxor (7 * 32)(%rsi), RB0, RB0;
+
+ vmovdqu RA4, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA0, (3 * 32)(%rsi);
+ vmovdqu RB4, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB0, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_serpent_avx2_ocb_enc,.-_gcry_serpent_avx2_ocb_enc;)
+
+.align 8
+.globl _gcry_serpent_avx2_ocb_dec
+ELF(.type _gcry_serpent_avx2_ocb_dec,@function;)
+
+_gcry_serpent_avx2_ocb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[16])
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ subq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ vmovdqu (%rcx), RTMP0x;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rdx), yreg; \
+ vpxor (l0reg), RTMP0x, RNOTx; \
+ vpxor (l1reg), RNOTx, RTMP0x; \
+ vinserti128 $1, RTMP0x, RNOT, RNOT; \
+ vpxor yreg, RNOT, yreg; \
+ vmovdqu RNOT, (n * 32)(%rsi);
+
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %r11, RA0);
+ OCB_INPUT(1, %r12, %r13, RA1);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(2, %r10, %r11, RA2);
+ OCB_INPUT(3, %r12, %r13, RA3);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %r11, RB0);
+ OCB_INPUT(5, %r12, %r13, RB1);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(6, %r10, %r11, RB2);
+ OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+ vmovdqu RTMP0x, (%rcx);
+
+ movq (0 * 8)(%rsp), %r10;
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ call __serpent_dec_blk16;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ vmovdqu (%r8), RTMP1x;
+
+ vpxor (0 * 32)(%rsi), RA0, RA0;
+ vpxor (1 * 32)(%rsi), RA1, RA1;
+ vpxor (2 * 32)(%rsi), RA2, RA2;
+ vpxor (3 * 32)(%rsi), RA3, RA3;
+ vpxor (4 * 32)(%rsi), RB0, RB0;
+ vpxor (5 * 32)(%rsi), RB1, RB1;
+ vpxor (6 * 32)(%rsi), RB2, RB2;
+ vpxor (7 * 32)(%rsi), RB3, RB3;
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vpxor RA0, RTMP1, RTMP1;
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vpxor RA1, RTMP1, RTMP1;
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vpxor RA2, RTMP1, RTMP1;
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vpxor RA3, RTMP1, RTMP1;
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vpxor RB0, RTMP1, RTMP1;
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vpxor RB1, RTMP1, RTMP1;
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vpxor RB2, RTMP1, RTMP1;
+ vmovdqu RB3, (7 * 32)(%rsi);
+ vpxor RB3, RTMP1, RTMP1;
+
+ vextracti128 $1, RTMP1, RNOTx;
+ vpxor RNOTx, RTMP1x, RTMP1x;
+ vmovdqu RTMP1x, (%r8);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_serpent_avx2_ocb_dec,.-_gcry_serpent_avx2_ocb_dec;)
+
+.align 8
+.globl _gcry_serpent_avx2_ocb_auth
+ELF(.type _gcry_serpent_avx2_ocb_auth,@function;)
+
+_gcry_serpent_avx2_ocb_auth:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: abuf (16 blocks)
+ * %rdx: offset
+ * %rcx: checksum
+ * %r8 : L pointers (void *L[16])
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ subq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ vmovdqu (%rdx), RTMP0x;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rsi), yreg; \
+ vpxor (l0reg), RTMP0x, RNOTx; \
+ vpxor (l1reg), RNOTx, RTMP0x; \
+ vinserti128 $1, RTMP0x, RNOT, RNOT; \
+ vpxor yreg, RNOT, yreg;
+
+ movq (0 * 8)(%r8), %r10;
+ movq (1 * 8)(%r8), %r11;
+ movq (2 * 8)(%r8), %r12;
+ movq (3 * 8)(%r8), %r13;
+ OCB_INPUT(0, %r10, %r11, RA0);
+ OCB_INPUT(1, %r12, %r13, RA1);
+ movq (4 * 8)(%r8), %r10;
+ movq (5 * 8)(%r8), %r11;
+ movq (6 * 8)(%r8), %r12;
+ movq (7 * 8)(%r8), %r13;
+ OCB_INPUT(2, %r10, %r11, RA2);
+ OCB_INPUT(3, %r12, %r13, RA3);
+ movq (8 * 8)(%r8), %r10;
+ movq (9 * 8)(%r8), %r11;
+ movq (10 * 8)(%r8), %r12;
+ movq (11 * 8)(%r8), %r13;
+ OCB_INPUT(4, %r10, %r11, RB0);
+ OCB_INPUT(5, %r12, %r13, RB1);
+ movq (12 * 8)(%r8), %r10;
+ movq (13 * 8)(%r8), %r11;
+ movq (14 * 8)(%r8), %r12;
+ movq (15 * 8)(%r8), %r13;
+ OCB_INPUT(6, %r10, %r11, RB2);
+ OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+ vmovdqu RTMP0x, (%rdx);
+
+ movq (0 * 8)(%rsp), %r10;
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ call __serpent_enc_blk16;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ vpxor RA4, RB4, RA4;
+ vpxor RA1, RB1, RA1;
+ vpxor RA2, RB2, RA2;
+ vpxor RA0, RB0, RA0;
+
+ vpxor RA4, RA1, RA1;
+ vpxor RA2, RA0, RA0;
+
+ vpxor RA1, RA0, RTMP1;
+
+ vextracti128 $1, RTMP1, RNOTx;
+ vpxor (%rcx), RTMP1x, RTMP1x;
+ vpxor RNOTx, RTMP1x, RTMP1x;
+ vmovdqu RTMP1x, (%rcx);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_serpent_avx2_ocb_auth,.-_gcry_serpent_avx2_ocb_auth;)
+
+.align 16
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+#endif /*defined(USE_SERPENT) && defined(ENABLE_AVX2_SUPPORT)*/
+#endif /*__x86_64*/
diff --git a/comm/third_party/libgcrypt/cipher/serpent-sse2-amd64.S b/comm/third_party/libgcrypt/cipher/serpent-sse2-amd64.S
new file mode 100644
index 0000000000..39cba00297
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/serpent-sse2-amd64.S
@@ -0,0 +1,1211 @@
+/* serpent-sse2-amd64.S - SSE2 implementation of Serpent cipher
+ *
+ * Copyright (C) 2013-2015 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SERPENT)
+
+#include "asm-common-amd64.h"
+
+/* struct serpent_context: */
+#define ctx_keys 0
+
+/* register macros */
+#define CTX %rdi
+
+/* vector registers */
+#define RA0 %xmm0
+#define RA1 %xmm1
+#define RA2 %xmm2
+#define RA3 %xmm3
+#define RA4 %xmm4
+
+#define RB0 %xmm5
+#define RB1 %xmm6
+#define RB2 %xmm7
+#define RB3 %xmm8
+#define RB4 %xmm9
+
+#define RNOT %xmm10
+#define RTMP0 %xmm11
+#define RTMP1 %xmm12
+#define RTMP2 %xmm13
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+
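+/* SSE2 has no vector rotate instruction, so the rotations below are built
+   from a left shift, a right shift by the complementary count and an OR. */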
+/* vector 32-bit rotation to left */
+#define vec_rol(reg, nleft, tmp) \
+ movdqa reg, tmp; \
+ pslld $(nleft), tmp; \
+ psrld $(32 - (nleft)), reg; \
+ por tmp, reg;
+
+/* vector 32-bit rotation to right */
+#define vec_ror(reg, nright, tmp) \
+ vec_rol(reg, 32 - nright, tmp)
+
+/* 4x4 32-bit integer matrix transpose */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
+ movdqa x0, t2; \
+ punpckhdq x1, t2; \
+ punpckldq x1, x0; \
+ \
+ movdqa x2, t1; \
+ punpckldq x3, t1; \
+ punpckhdq x3, x2; \
+ \
+ movdqa x0, x1; \
+ punpckhqdq t1, x1; \
+ punpcklqdq t1, x0; \
+ \
+ movdqa t2, x3; \
+ punpckhqdq x2, x3; \
+ punpcklqdq x2, t2; \
+ movdqa t2, x2;
+
+/* fill xmm register with 32-bit value from memory */
+#define pbroadcastd(mem32, xreg) \
+ movd mem32, xreg; \
+ pshufd $0, xreg, xreg;
+
+/* xor with unaligned memory operand */
+#define pxor_u(umem128, xreg, t) \
+ movdqu umem128, t; \
+ pxor t, xreg;
+
+/* 128-bit wide byte swap */
+#define pbswap(xreg, t0) \
+ /* reorder 32-bit words, [a,b,c,d] => [d,c,b,a] */ \
+ pshufd $0x1b, xreg, xreg; \
+ /* reorder high&low 16-bit words, [d0,d1,c0,c1] => [d1,d0,c1,c0] */ \
+ pshuflw $0xb1, xreg, xreg; \
+ pshufhw $0xb1, xreg, xreg; \
+ /* reorder bytes in 16-bit words */ \
+ movdqa xreg, t0; \
+ psrlw $8, t0; \
+ psllw $8, xreg; \
+ por t0, xreg;
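+/* (A single pshufb could do this byte swap, but that instruction requires
+   SSSE3 while this implementation is restricted to plain SSE2.) */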
+
+/**********************************************************************
+ 8-way serpent
+ **********************************************************************/
+
+/*
+ * These are the S-Boxes of Serpent from the following research paper.
+ *
+ * D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference,
+ * (New York, New York, USA), p. 317–329, National Institute of Standards and
+ * Technology, 2000.
+ *
+ * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf
+ *
+ */
+#define SBOX0(r0, r1, r2, r3, r4) \
+ pxor r0, r3; movdqa r1, r4; \
+ pand r3, r1; pxor r2, r4; \
+ pxor r0, r1; por r3, r0; \
+ pxor r4, r0; pxor r3, r4; \
+ pxor r2, r3; por r1, r2; \
+ pxor r4, r2; pxor RNOT, r4; \
+ por r1, r4; pxor r3, r1; \
+ pxor r4, r1; por r0, r3; \
+ pxor r3, r1; pxor r3, r4;
+
+#define SBOX0_INVERSE(r0, r1, r2, r3, r4) \
+ pxor RNOT, r2; movdqa r1, r4; \
+ por r0, r1; pxor RNOT, r4; \
+ pxor r2, r1; por r4, r2; \
+ pxor r3, r1; pxor r4, r0; \
+ pxor r0, r2; pand r3, r0; \
+ pxor r0, r4; por r1, r0; \
+ pxor r2, r0; pxor r4, r3; \
+ pxor r1, r2; pxor r0, r3; \
+ pxor r1, r3; \
+ pand r3, r2; \
+ pxor r2, r4;
+
+#define SBOX1(r0, r1, r2, r3, r4) \
+ pxor RNOT, r0; pxor RNOT, r2; \
+ movdqa r0, r4; pand r1, r0; \
+ pxor r0, r2; por r3, r0; \
+ pxor r2, r3; pxor r0, r1; \
+ pxor r4, r0; por r1, r4; \
+ pxor r3, r1; por r0, r2; \
+ pand r4, r2; pxor r1, r0; \
+ pand r2, r1; \
+ pxor r0, r1; pand r2, r0; \
+ pxor r4, r0;
+
+#define SBOX1_INVERSE(r0, r1, r2, r3, r4) \
+ movdqa r1, r4; pxor r3, r1; \
+ pand r1, r3; pxor r2, r4; \
+ pxor r0, r3; por r1, r0; \
+ pxor r3, r2; pxor r4, r0; \
+ por r2, r0; pxor r3, r1; \
+ pxor r1, r0; por r3, r1; \
+ pxor r0, r1; pxor RNOT, r4; \
+ pxor r1, r4; por r0, r1; \
+ pxor r0, r1; \
+ por r4, r1; \
+ pxor r1, r3;
+
+#define SBOX2(r0, r1, r2, r3, r4) \
+ movdqa r0, r4; pand r2, r0; \
+ pxor r3, r0; pxor r1, r2; \
+ pxor r0, r2; por r4, r3; \
+ pxor r1, r3; pxor r2, r4; \
+ movdqa r3, r1; por r4, r3; \
+ pxor r0, r3; pand r1, r0; \
+ pxor r0, r4; pxor r3, r1; \
+ pxor r4, r1; pxor RNOT, r4;
+
+#define SBOX2_INVERSE(r0, r1, r2, r3, r4) \
+ pxor r3, r2; pxor r0, r3; \
+ movdqa r3, r4; pand r2, r3; \
+ pxor r1, r3; por r2, r1; \
+ pxor r4, r1; pand r3, r4; \
+ pxor r3, r2; pand r0, r4; \
+ pxor r2, r4; pand r1, r2; \
+ por r0, r2; pxor RNOT, r3; \
+ pxor r3, r2; pxor r3, r0; \
+ pand r1, r0; pxor r4, r3; \
+ pxor r0, r3;
+
+#define SBOX3(r0, r1, r2, r3, r4) \
+ movdqa r0, r4; por r3, r0; \
+ pxor r1, r3; pand r4, r1; \
+ pxor r2, r4; pxor r3, r2; \
+ pand r0, r3; por r1, r4; \
+ pxor r4, r3; pxor r1, r0; \
+ pand r0, r4; pxor r3, r1; \
+ pxor r2, r4; por r0, r1; \
+ pxor r2, r1; pxor r3, r0; \
+ movdqa r1, r2; por r3, r1; \
+ pxor r0, r1;
+
+#define SBOX3_INVERSE(r0, r1, r2, r3, r4) \
+ movdqa r2, r4; pxor r1, r2; \
+ pxor r2, r0; pand r2, r4; \
+ pxor r0, r4; pand r1, r0; \
+ pxor r3, r1; por r4, r3; \
+ pxor r3, r2; pxor r3, r0; \
+ pxor r4, r1; pand r2, r3; \
+ pxor r1, r3; pxor r0, r1; \
+ por r2, r1; pxor r3, r0; \
+ pxor r4, r1; \
+ pxor r1, r0;
+
+#define SBOX4(r0, r1, r2, r3, r4) \
+ pxor r3, r1; pxor RNOT, r3; \
+ pxor r3, r2; pxor r0, r3; \
+ movdqa r1, r4; pand r3, r1; \
+ pxor r2, r1; pxor r3, r4; \
+ pxor r4, r0; pand r4, r2; \
+ pxor r0, r2; pand r1, r0; \
+ pxor r0, r3; por r1, r4; \
+ pxor r0, r4; por r3, r0; \
+ pxor r2, r0; pand r3, r2; \
+ pxor RNOT, r0; pxor r2, r4;
+
+#define SBOX4_INVERSE(r0, r1, r2, r3, r4) \
+ movdqa r2, r4; pand r3, r2; \
+ pxor r1, r2; por r3, r1; \
+ pand r0, r1; pxor r2, r4; \
+ pxor r1, r4; pand r2, r1; \
+ pxor RNOT, r0; pxor r4, r3; \
+ pxor r3, r1; pand r0, r3; \
+ pxor r2, r3; pxor r1, r0; \
+ pand r0, r2; pxor r0, r3; \
+ pxor r4, r2; \
+ por r3, r2; pxor r0, r3; \
+ pxor r1, r2;
+
+#define SBOX5(r0, r1, r2, r3, r4) \
+ pxor r1, r0; pxor r3, r1; \
+ pxor RNOT, r3; movdqa r1, r4; \
+ pand r0, r1; pxor r3, r2; \
+ pxor r2, r1; por r4, r2; \
+ pxor r3, r4; pand r1, r3; \
+ pxor r0, r3; pxor r1, r4; \
+ pxor r2, r4; pxor r0, r2; \
+ pand r3, r0; pxor RNOT, r2; \
+ pxor r4, r0; por r3, r4; \
+ pxor r4, r2;
+
+#define SBOX5_INVERSE(r0, r1, r2, r3, r4) \
+ pxor RNOT, r1; movdqa r3, r4; \
+ pxor r1, r2; por r0, r3; \
+ pxor r2, r3; por r1, r2; \
+ pand r0, r2; pxor r3, r4; \
+ pxor r4, r2; por r0, r4; \
+ pxor r1, r4; pand r2, r1; \
+ pxor r3, r1; pxor r2, r4; \
+ pand r4, r3; pxor r1, r4; \
+ pxor r4, r3; pxor RNOT, r4; \
+ pxor r0, r3;
+
+#define SBOX6(r0, r1, r2, r3, r4) \
+ pxor RNOT, r2; movdqa r3, r4; \
+ pand r0, r3; pxor r4, r0; \
+ pxor r2, r3; por r4, r2; \
+ pxor r3, r1; pxor r0, r2; \
+ por r1, r0; pxor r1, r2; \
+ pxor r0, r4; por r3, r0; \
+ pxor r2, r0; pxor r3, r4; \
+ pxor r0, r4; pxor RNOT, r3; \
+ pand r4, r2; \
+ pxor r3, r2;
+
+#define SBOX6_INVERSE(r0, r1, r2, r3, r4) \
+ pxor r2, r0; movdqa r2, r4; \
+ pand r0, r2; pxor r3, r4; \
+ pxor RNOT, r2; pxor r1, r3; \
+ pxor r3, r2; por r0, r4; \
+ pxor r2, r0; pxor r4, r3; \
+ pxor r1, r4; pand r3, r1; \
+ pxor r0, r1; pxor r3, r0; \
+ por r2, r0; pxor r1, r3; \
+ pxor r0, r4;
+
+#define SBOX7(r0, r1, r2, r3, r4) \
+ movdqa r1, r4; por r2, r1; \
+ pxor r3, r1; pxor r2, r4; \
+ pxor r1, r2; por r4, r3; \
+ pand r0, r3; pxor r2, r4; \
+ pxor r1, r3; por r4, r1; \
+ pxor r0, r1; por r4, r0; \
+ pxor r2, r0; pxor r4, r1; \
+ pxor r1, r2; pand r0, r1; \
+ pxor r4, r1; pxor RNOT, r2; \
+ por r0, r2; \
+ pxor r2, r4;
+
+#define SBOX7_INVERSE(r0, r1, r2, r3, r4) \
+ movdqa r2, r4; pxor r0, r2; \
+ pand r3, r0; por r3, r4; \
+ pxor RNOT, r2; pxor r1, r3; \
+ por r0, r1; pxor r2, r0; \
+ pand r4, r2; pand r4, r3; \
+ pxor r2, r1; pxor r0, r2; \
+ por r2, r0; pxor r1, r4; \
+ pxor r3, r0; pxor r4, r3; \
+ por r0, r4; pxor r2, r3; \
+ pxor r2, r4;
+
+/* Apply SBOX number WHICH to the block. */
+#define SBOX(which, r0, r1, r2, r3, r4) \
+ SBOX##which (r0, r1, r2, r3, r4)
+
+/* Apply inverse SBOX number WHICH to the block. */
+#define SBOX_INVERSE(which, r0, r1, r2, r3, r4) \
+ SBOX##which##_INVERSE (r0, r1, r2, r3, r4)
+
+/* XOR round key into block state in r0,r1,r2,r3. r4 used as temporary. */
+#define BLOCK_XOR_KEY(r0, r1, r2, r3, r4, round) \
+ pbroadcastd ((ctx_keys + (round) * 16 + 0 * 4)(CTX), r4); \
+ pxor r4, r0; \
+ pbroadcastd ((ctx_keys + (round) * 16 + 1 * 4)(CTX), r4); \
+ pxor r4, r1; \
+ pbroadcastd ((ctx_keys + (round) * 16 + 2 * 4)(CTX), r4); \
+ pxor r4, r2; \
+ pbroadcastd ((ctx_keys + (round) * 16 + 3 * 4)(CTX), r4); \
+ pxor r4, r3;
+
+/* Apply the linear transformation to BLOCK. */
+#define LINEAR_TRANSFORMATION(r0, r1, r2, r3, r4) \
+ vec_rol(r0, 13, r4); \
+ vec_rol(r2, 3, r4); \
+ pxor r0, r1; \
+ pxor r2, r1; \
+ movdqa r0, r4; \
+ pslld $3, r4; \
+ pxor r2, r3; \
+ pxor r4, r3; \
+ vec_rol(r1, 1, r4); \
+ vec_rol(r3, 7, r4); \
+ pxor r1, r0; \
+ pxor r3, r0; \
+ movdqa r1, r4; \
+ pslld $7, r4; \
+ pxor r3, r2; \
+ pxor r4, r2; \
+ vec_rol(r0, 5, r4); \
+ vec_rol(r2, 22, r4);
+
+/* Apply the inverse linear transformation to BLOCK. */
+#define LINEAR_TRANSFORMATION_INVERSE(r0, r1, r2, r3, r4) \
+ vec_ror(r2, 22, r4); \
+ vec_ror(r0, 5, r4); \
+ movdqa r1, r4; \
+ pslld $7, r4; \
+ pxor r3, r2; \
+ pxor r4, r2; \
+ pxor r1, r0; \
+ pxor r3, r0; \
+ vec_ror(r3, 7, r4); \
+ vec_ror(r1, 1, r4); \
+ movdqa r0, r4; \
+ pslld $3, r4; \
+ pxor r2, r3; \
+ pxor r4, r3; \
+ pxor r0, r1; \
+ pxor r2, r1; \
+ vec_ror(r2, 3, r4); \
+ vec_ror(r0, 13, r4);
+
+/* Apply a Serpent round to eight parallel blocks.  The round number is
+   given by `round'. */
+#define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ SBOX (which, a0, a1, a2, a3, a4); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
+ SBOX (which, b0, b1, b2, b3, b4); \
+ LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4); \
+ LINEAR_TRANSFORMATION (nb0, nb1, nb2, nb3, nb4);
+
+/* Apply the last Serpent round to eight parallel blocks.  The round number
+   is given by `round'; the closing key XOR uses subkey `round' + 1. */
+#define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ SBOX (which, a0, a1, a2, a3, a4); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
+ SBOX (which, b0, b1, b2, b3, b4); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, ((round) + 1)); \
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, ((round) + 1));
+
+/* Apply an inverse Serpent round to eight parallel blocks.  The round
+   number is given by `round'. */
+#define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \
+ na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, \
+ nb0, nb1, nb2, nb3, nb4) \
+ LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4); \
+ LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4); \
+ SBOX_INVERSE (which, a0, a1, a2, a3, a4); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \
+ SBOX_INVERSE (which, b0, b1, b2, b3, b4); \
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);
+
+/* Apply the first inverse Serpent round to eight parallel blocks.  The round
+   number is given by `round'; the opening key XOR uses subkey `round' + 1. */
+#define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \
+ na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, \
+ nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, ((round) + 1)); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, ((round) + 1)); \
+ SBOX_INVERSE (which, a0, a1, a2, a3, a4); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \
+ SBOX_INVERSE (which, b0, b1, b2, b3, b4); \
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);
+
+.text
+
+.align 8
+ELF(.type __serpent_enc_blk8,@function;)
+__serpent_enc_blk8:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext
+ * blocks
+ * output:
+ * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: eight parallel
+ * ciphertext blocks
+ */
+ CFI_STARTPROC();
+
+ pcmpeqd RNOT, RNOT;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
+
+ ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+ ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+ ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+ ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+ ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+ ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+ ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+ ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+ ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0,
+ RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0);
+ ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0,
+ RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0);
+ ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2,
+ RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2);
+ ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4,
+ RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4);
+ ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0,
+ RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0);
+ ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0,
+ RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0);
+ ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3,
+ RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3);
+ ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0,
+ RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0);
+ ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4,
+ RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4);
+ ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4,
+ RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4);
+ ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2,
+ RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2);
+ ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3,
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3);
+ ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4,
+ RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4);
+ ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4,
+ RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4);
+ ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0,
+ RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0);
+ ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4,
+ RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4);
+ ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+ ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+ ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+ ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+ ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+ ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+ ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+ ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+
+ transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1);
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size __serpent_enc_blk8,.-__serpent_enc_blk8;)
+
+.align 8
+ELF(.type __serpent_dec_blk8,@function;)
+__serpent_dec_blk8:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
+ * ciphertext blocks
+ * output:
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext
+ * blocks
+ */
+ CFI_STARTPROC();
+
+ pcmpeqd RNOT, RNOT;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
+
+ ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4,
+ RA3, RA0, RA1, RA4, RA2,
+ RB0, RB1, RB2, RB3, RB4,
+ RB3, RB0, RB1, RB4, RB2);
+ ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3,
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3);
+ ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0,
+ RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0);
+ ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3,
+ RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3);
+ ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3,
+ RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3);
+ ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4,
+ RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4);
+ ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3,
+ RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3);
+ ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1,
+ RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1);
+ ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2,
+ RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2);
+ ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0,
+ RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0);
+ ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4,
+ RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4);
+ ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0,
+ RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0);
+ ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0,
+ RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0);
+ ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1,
+ RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1);
+ ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0,
+ RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0);
+ ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3,
+ RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3);
+ ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2,
+ RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2);
+ ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4,
+ RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4);
+ ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1,
+ RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1);
+ ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4);
+ ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4,
+ RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4);
+ ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3,
+ RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3);
+ ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4,
+ RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4);
+ ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0,
+ RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0);
+ ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2,
+ RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2);
+ ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1,
+ RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1);
+ ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3,
+ RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3);
+ ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1,
+ RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1);
+ ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1,
+ RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1);
+ ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0,
+ RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0);
+ ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1,
+ RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1);
+ ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4,
+ RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4);
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size __serpent_dec_blk8,.-__serpent_dec_blk8;)
+
+.align 8
+.globl _gcry_serpent_sse2_ctr_enc
+ELF(.type _gcry_serpent_sse2_ctr_enc,@function;)
+_gcry_serpent_sse2_ctr_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: iv (big endian, 128bit)
+ */
+ CFI_STARTPROC();
+
+ /* load IV and byteswap */
+ movdqu (%rcx), RA0;
+ movdqa RA0, RTMP0;
+ pbswap(RTMP0, RTMP1); /* be => le */
+
+ pcmpeqd RNOT, RNOT;
+ psrldq $8, RNOT; /* low: -1, high: 0 */
+ movdqa RNOT, RTMP2;
+ paddq RTMP2, RTMP2; /* low: -2, high: 0 */
+
+ /* construct IVs */
+ movdqa RTMP0, RTMP1;
+ psubq RNOT, RTMP0; /* +1 */
+ movdqa RTMP0, RA1;
+ psubq RTMP2, RTMP1; /* +2 */
+ movdqa RTMP1, RA2;
+ psubq RTMP2, RTMP0; /* +3 */
+ movdqa RTMP0, RA3;
+ psubq RTMP2, RTMP1; /* +4 */
+ movdqa RTMP1, RB0;
+ psubq RTMP2, RTMP0; /* +5 */
+ movdqa RTMP0, RB1;
+ psubq RTMP2, RTMP1; /* +6 */
+ movdqa RTMP1, RB2;
+ psubq RTMP2, RTMP0; /* +7 */
+ movdqa RTMP0, RB3;
+ psubq RTMP2, RTMP1; /* +8 */
+
+ /* check need for handling 64-bit overflow and carry */
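+	/* (The IV is big endian, so bytes 8..15 hold its low 64-bit half.  A
+	 * carry into the high half within the next 8 blocks is only possible
+	 * when the upper 32 bits of that half are all ones and its lower 32
+	 * bits are within 8 of wrapping.) */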
+ cmpl $0xffffffff, 8(%rcx);
+ jne .Lno_ctr_carry;
+
+ movl 12(%rcx), %eax;
+ bswapl %eax;
+ cmpl $-8, %eax;
+ jb .Lno_ctr_carry;
+ pslldq $8, RNOT; /* low: 0, high: -1 */
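+	/* (%eax determines which counters wrap; each .Lcarry_* entry below adds
+	 * the carry, psubq of -1 in the high qword, to its counter and falls
+	 * through so every later counter receives the carry as well.) */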
+ je .Lcarry_RTMP0;
+
+ cmpl $-6, %eax;
+ jb .Lcarry_RB3;
+ je .Lcarry_RB2;
+
+ cmpl $-4, %eax;
+ jb .Lcarry_RB1;
+ je .Lcarry_RB0;
+
+ cmpl $-2, %eax;
+ jb .Lcarry_RA3;
+ je .Lcarry_RA2;
+
+ psubq RNOT, RA1;
+.Lcarry_RA2:
+ psubq RNOT, RA2;
+.Lcarry_RA3:
+ psubq RNOT, RA3;
+.Lcarry_RB0:
+ psubq RNOT, RB0;
+.Lcarry_RB1:
+ psubq RNOT, RB1;
+.Lcarry_RB2:
+ psubq RNOT, RB2;
+.Lcarry_RB3:
+ psubq RNOT, RB3;
+.Lcarry_RTMP0:
+ psubq RNOT, RTMP1;
+
+.Lno_ctr_carry:
+ /* le => be */
+ pbswap(RA1, RTMP0);
+ pbswap(RA2, RTMP0);
+ pbswap(RA3, RTMP0);
+ pbswap(RB0, RTMP0);
+ pbswap(RB1, RTMP0);
+ pbswap(RB2, RTMP0);
+ pbswap(RB3, RTMP0);
+ pbswap(RTMP1, RTMP0);
+ /* store new IV */
+ movdqu RTMP1, (%rcx);
+
+ call __serpent_enc_blk8;
+
+ pxor_u((0 * 16)(%rdx), RA4, RTMP0);
+ pxor_u((1 * 16)(%rdx), RA1, RTMP0);
+ pxor_u((2 * 16)(%rdx), RA2, RTMP0);
+ pxor_u((3 * 16)(%rdx), RA0, RTMP0);
+ pxor_u((4 * 16)(%rdx), RB4, RTMP0);
+ pxor_u((5 * 16)(%rdx), RB1, RTMP0);
+ pxor_u((6 * 16)(%rdx), RB2, RTMP0);
+ pxor_u((7 * 16)(%rdx), RB0, RTMP0);
+
+ movdqu RA4, (0 * 16)(%rsi);
+ movdqu RA1, (1 * 16)(%rsi);
+ movdqu RA2, (2 * 16)(%rsi);
+ movdqu RA0, (3 * 16)(%rsi);
+ movdqu RB4, (4 * 16)(%rsi);
+ movdqu RB1, (5 * 16)(%rsi);
+ movdqu RB2, (6 * 16)(%rsi);
+ movdqu RB0, (7 * 16)(%rsi);
+
+ /* clear the used registers */
+ pxor RA0, RA0;
+ pxor RA1, RA1;
+ pxor RA2, RA2;
+ pxor RA3, RA3;
+ pxor RA4, RA4;
+ pxor RB0, RB0;
+ pxor RB1, RB1;
+ pxor RB2, RB2;
+ pxor RB3, RB3;
+ pxor RB4, RB4;
+ pxor RTMP0, RTMP0;
+ pxor RTMP1, RTMP1;
+ pxor RTMP2, RTMP2;
+ pxor RNOT, RNOT;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_serpent_sse2_ctr_enc,.-_gcry_serpent_sse2_ctr_enc;)
+
+.align 8
+.globl _gcry_serpent_sse2_cbc_dec
+ELF(.type _gcry_serpent_sse2_cbc_dec,@function;)
+_gcry_serpent_sse2_cbc_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: iv
+ */
+ CFI_STARTPROC();
+
+ movdqu (0 * 16)(%rdx), RA0;
+ movdqu (1 * 16)(%rdx), RA1;
+ movdqu (2 * 16)(%rdx), RA2;
+ movdqu (3 * 16)(%rdx), RA3;
+ movdqu (4 * 16)(%rdx), RB0;
+ movdqu (5 * 16)(%rdx), RB1;
+ movdqu (6 * 16)(%rdx), RB2;
+ movdqu (7 * 16)(%rdx), RB3;
+
+ call __serpent_dec_blk8;
+
+ movdqu (7 * 16)(%rdx), RNOT;
+ pxor_u((%rcx), RA0, RTMP0);
+ pxor_u((0 * 16)(%rdx), RA1, RTMP0);
+ pxor_u((1 * 16)(%rdx), RA2, RTMP0);
+ pxor_u((2 * 16)(%rdx), RA3, RTMP0);
+ pxor_u((3 * 16)(%rdx), RB0, RTMP0);
+ pxor_u((4 * 16)(%rdx), RB1, RTMP0);
+ pxor_u((5 * 16)(%rdx), RB2, RTMP0);
+ pxor_u((6 * 16)(%rdx), RB3, RTMP0);
+ movdqu RNOT, (%rcx); /* store new IV */
+
+ movdqu RA0, (0 * 16)(%rsi);
+ movdqu RA1, (1 * 16)(%rsi);
+ movdqu RA2, (2 * 16)(%rsi);
+ movdqu RA3, (3 * 16)(%rsi);
+ movdqu RB0, (4 * 16)(%rsi);
+ movdqu RB1, (5 * 16)(%rsi);
+ movdqu RB2, (6 * 16)(%rsi);
+ movdqu RB3, (7 * 16)(%rsi);
+
+ /* clear the used registers */
+ pxor RA0, RA0;
+ pxor RA1, RA1;
+ pxor RA2, RA2;
+ pxor RA3, RA3;
+ pxor RA4, RA4;
+ pxor RB0, RB0;
+ pxor RB1, RB1;
+ pxor RB2, RB2;
+ pxor RB3, RB3;
+ pxor RB4, RB4;
+ pxor RTMP0, RTMP0;
+ pxor RTMP1, RTMP1;
+ pxor RTMP2, RTMP2;
+ pxor RNOT, RNOT;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;)
+
+.align 8
+.globl _gcry_serpent_sse2_cfb_dec
+ELF(.type _gcry_serpent_sse2_cfb_dec,@function;)
+_gcry_serpent_sse2_cfb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: iv
+ */
+ CFI_STARTPROC();
+
+ /* Load input */
+ movdqu (%rcx), RA0;
+ movdqu 0 * 16(%rdx), RA1;
+ movdqu 1 * 16(%rdx), RA2;
+ movdqu 2 * 16(%rdx), RA3;
+ movdqu 3 * 16(%rdx), RB0;
+ movdqu 4 * 16(%rdx), RB1;
+ movdqu 5 * 16(%rdx), RB2;
+ movdqu 6 * 16(%rdx), RB3;
+
+ /* Update IV */
+ movdqu 7 * 16(%rdx), RNOT;
+ movdqu RNOT, (%rcx);
+
+ call __serpent_enc_blk8;
+
+ pxor_u((0 * 16)(%rdx), RA4, RTMP0);
+ pxor_u((1 * 16)(%rdx), RA1, RTMP0);
+ pxor_u((2 * 16)(%rdx), RA2, RTMP0);
+ pxor_u((3 * 16)(%rdx), RA0, RTMP0);
+ pxor_u((4 * 16)(%rdx), RB4, RTMP0);
+ pxor_u((5 * 16)(%rdx), RB1, RTMP0);
+ pxor_u((6 * 16)(%rdx), RB2, RTMP0);
+ pxor_u((7 * 16)(%rdx), RB0, RTMP0);
+
+ movdqu RA4, (0 * 16)(%rsi);
+ movdqu RA1, (1 * 16)(%rsi);
+ movdqu RA2, (2 * 16)(%rsi);
+ movdqu RA0, (3 * 16)(%rsi);
+ movdqu RB4, (4 * 16)(%rsi);
+ movdqu RB1, (5 * 16)(%rsi);
+ movdqu RB2, (6 * 16)(%rsi);
+ movdqu RB0, (7 * 16)(%rsi);
+
+ /* clear the used registers */
+ pxor RA0, RA0;
+ pxor RA1, RA1;
+ pxor RA2, RA2;
+ pxor RA3, RA3;
+ pxor RA4, RA4;
+ pxor RB0, RB0;
+ pxor RB1, RB1;
+ pxor RB2, RB2;
+ pxor RB3, RB3;
+ pxor RB4, RB4;
+ pxor RTMP0, RTMP0;
+ pxor RTMP1, RTMP1;
+ pxor RTMP2, RTMP2;
+ pxor RNOT, RNOT;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;)
+
+.align 8
+.globl _gcry_serpent_sse2_ocb_enc
+ELF(.type _gcry_serpent_sse2_ocb_enc,@function;)
+
+_gcry_serpent_sse2_ocb_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[8])
+ */
+ CFI_STARTPROC();
+
+ subq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ movdqu (%rcx), RTMP0;
+ movdqu (%r8), RTMP1;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
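+/* Per block: advance the offset in RTMP0 by L, accumulate the plaintext into
+ * the checksum in RTMP1, whiten the plaintext in xreg, and park the offset in
+ * the dst buffer so it can be XORed back in after __serpent_enc_blk8. */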
+#define OCB_INPUT(n, lreg, xreg) \
+ movdqu (n * 16)(%rdx), xreg; \
+ movdqu (lreg), RNOT; \
+ pxor RNOT, RTMP0; \
+ pxor xreg, RTMP1; \
+ pxor RTMP0, xreg; \
+ movdqu RTMP0, (n * 16)(%rsi);
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, RA0);
+ OCB_INPUT(1, %r11, RA1);
+ OCB_INPUT(2, %r12, RA2);
+ OCB_INPUT(3, %r13, RA3);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, RB0);
+ OCB_INPUT(5, %r11, RB1);
+ OCB_INPUT(6, %r12, RB2);
+ OCB_INPUT(7, %r13, RB3);
+#undef OCB_INPUT
+
+ movdqu RTMP0, (%rcx);
+ movdqu RTMP1, (%r8);
+
+ movq (0 * 8)(%rsp), %r10;
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ call __serpent_enc_blk8;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ pxor_u((0 * 16)(%rsi), RA4, RTMP0);
+ pxor_u((1 * 16)(%rsi), RA1, RTMP0);
+ pxor_u((2 * 16)(%rsi), RA2, RTMP0);
+ pxor_u((3 * 16)(%rsi), RA0, RTMP0);
+ pxor_u((4 * 16)(%rsi), RB4, RTMP0);
+ pxor_u((5 * 16)(%rsi), RB1, RTMP0);
+ pxor_u((6 * 16)(%rsi), RB2, RTMP0);
+ pxor_u((7 * 16)(%rsi), RB0, RTMP0);
+
+ movdqu RA4, (0 * 16)(%rsi);
+ movdqu RA1, (1 * 16)(%rsi);
+ movdqu RA2, (2 * 16)(%rsi);
+ movdqu RA0, (3 * 16)(%rsi);
+ movdqu RB4, (4 * 16)(%rsi);
+ movdqu RB1, (5 * 16)(%rsi);
+ movdqu RB2, (6 * 16)(%rsi);
+ movdqu RB0, (7 * 16)(%rsi);
+
+ /* clear the used registers */
+ pxor RA0, RA0;
+ pxor RA1, RA1;
+ pxor RA2, RA2;
+ pxor RA3, RA3;
+ pxor RA4, RA4;
+ pxor RB0, RB0;
+ pxor RB1, RB1;
+ pxor RB2, RB2;
+ pxor RB3, RB3;
+ pxor RB4, RB4;
+ pxor RTMP0, RTMP0;
+ pxor RTMP1, RTMP1;
+ pxor RTMP2, RTMP2;
+ pxor RNOT, RNOT;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_serpent_sse2_ocb_enc,.-_gcry_serpent_sse2_ocb_enc;)
+
+.align 8
+.globl _gcry_serpent_sse2_ocb_dec
+ELF(.type _gcry_serpent_sse2_ocb_dec,@function;)
+
+_gcry_serpent_sse2_ocb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[8])
+ */
+ CFI_STARTPROC();
+
+ subq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ movdqu (%rcx), RTMP0;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+
+#define OCB_INPUT(n, lreg, xreg) \
+ movdqu (n * 16)(%rdx), xreg; \
+ movdqu (lreg), RNOT; \
+ pxor RNOT, RTMP0; \
+ pxor RTMP0, xreg; \
+ movdqu RTMP0, (n * 16)(%rsi);
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, RA0);
+ OCB_INPUT(1, %r11, RA1);
+ OCB_INPUT(2, %r12, RA2);
+ OCB_INPUT(3, %r13, RA3);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, RB0);
+ OCB_INPUT(5, %r11, RB1);
+ OCB_INPUT(6, %r12, RB2);
+ OCB_INPUT(7, %r13, RB3);
+#undef OCB_INPUT
+
+ movdqu RTMP0, (%rcx);
+
+ movq (0 * 8)(%rsp), %r10;
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ call __serpent_dec_blk8;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ movdqu (%r8), RTMP0;
+
+ pxor_u((0 * 16)(%rsi), RA0, RTMP1);
+ pxor_u((1 * 16)(%rsi), RA1, RTMP1);
+ pxor_u((2 * 16)(%rsi), RA2, RTMP1);
+ pxor_u((3 * 16)(%rsi), RA3, RTMP1);
+ pxor_u((4 * 16)(%rsi), RB0, RTMP1);
+ pxor_u((5 * 16)(%rsi), RB1, RTMP1);
+ pxor_u((6 * 16)(%rsi), RB2, RTMP1);
+ pxor_u((7 * 16)(%rsi), RB3, RTMP1);
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+
+ movdqu RA0, (0 * 16)(%rsi);
+ pxor RA0, RTMP0;
+ movdqu RA1, (1 * 16)(%rsi);
+ pxor RA1, RTMP0;
+ movdqu RA2, (2 * 16)(%rsi);
+ pxor RA2, RTMP0;
+ movdqu RA3, (3 * 16)(%rsi);
+ pxor RA3, RTMP0;
+ movdqu RB0, (4 * 16)(%rsi);
+ pxor RB0, RTMP0;
+ movdqu RB1, (5 * 16)(%rsi);
+ pxor RB1, RTMP0;
+ movdqu RB2, (6 * 16)(%rsi);
+ pxor RB2, RTMP0;
+ movdqu RB3, (7 * 16)(%rsi);
+ pxor RB3, RTMP0;
+
+ movdqu RTMP0, (%r8);
+
+ /* clear the used registers */
+ pxor RA0, RA0;
+ pxor RA1, RA1;
+ pxor RA2, RA2;
+ pxor RA3, RA3;
+ pxor RA4, RA4;
+ pxor RB0, RB0;
+ pxor RB1, RB1;
+ pxor RB2, RB2;
+ pxor RB3, RB3;
+ pxor RB4, RB4;
+ pxor RTMP0, RTMP0;
+ pxor RTMP1, RTMP1;
+ pxor RTMP2, RTMP2;
+ pxor RNOT, RNOT;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_serpent_sse2_ocb_dec,.-_gcry_serpent_sse2_ocb_dec;)
+
+.align 8
+.globl _gcry_serpent_sse2_ocb_auth
+ELF(.type _gcry_serpent_sse2_ocb_auth,@function;)
+
+_gcry_serpent_sse2_ocb_auth:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: abuf (8 blocks)
+ * %rdx: offset
+ * %rcx: checksum
+ * %r8 : L pointers (void *L[8])
+ */
+ CFI_STARTPROC();
+
+ subq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ movdqu (%rdx), RTMP0;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+
+#define OCB_INPUT(n, lreg, xreg) \
+ movdqu (n * 16)(%rsi), xreg; \
+ movdqu (lreg), RNOT; \
+ pxor RNOT, RTMP0; \
+ pxor RTMP0, xreg;
+ movq (0 * 8)(%r8), %r10;
+ movq (1 * 8)(%r8), %r11;
+ movq (2 * 8)(%r8), %r12;
+ movq (3 * 8)(%r8), %r13;
+ OCB_INPUT(0, %r10, RA0);
+ OCB_INPUT(1, %r11, RA1);
+ OCB_INPUT(2, %r12, RA2);
+ OCB_INPUT(3, %r13, RA3);
+ movq (4 * 8)(%r8), %r10;
+ movq (5 * 8)(%r8), %r11;
+ movq (6 * 8)(%r8), %r12;
+ movq (7 * 8)(%r8), %r13;
+ OCB_INPUT(4, %r10, RB0);
+ OCB_INPUT(5, %r11, RB1);
+ OCB_INPUT(6, %r12, RB2);
+ OCB_INPUT(7, %r13, RB3);
+#undef OCB_INPUT
+
+ movdqu RTMP0, (%rdx);
+
+ movq (0 * 8)(%rsp), %r10;
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ call __serpent_enc_blk8;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ movdqu (%rcx), RTMP0;
+ pxor RB4, RA4;
+ pxor RB1, RA1;
+ pxor RB2, RA2;
+ pxor RB0, RA0;
+
+ pxor RTMP0, RA2;
+ pxor RA4, RA1;
+ pxor RA2, RA0;
+
+ pxor RA1, RA0;
+ movdqu RA0, (%rcx);
+
+ /* clear the used registers */
+ pxor RA0, RA0;
+ pxor RA1, RA1;
+ pxor RA2, RA2;
+ pxor RA3, RA3;
+ pxor RA4, RA4;
+ pxor RB0, RB0;
+ pxor RB1, RB1;
+ pxor RB2, RB2;
+ pxor RB3, RB3;
+ pxor RB4, RB4;
+ pxor RTMP0, RTMP0;
+ pxor RTMP1, RTMP1;
+ pxor RTMP2, RTMP2;
+ pxor RNOT, RNOT;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_serpent_sse2_ocb_auth,.-_gcry_serpent_sse2_ocb_auth;)
+
+#endif /*defined(USE_SERPENT)*/
+#endif /*__x86_64*/
diff --git a/comm/third_party/libgcrypt/cipher/serpent.c b/comm/third_party/libgcrypt/cipher/serpent.c
new file mode 100644
index 0000000000..3c5eed2c03
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/serpent.c
@@ -0,0 +1,1807 @@
+/* serpent.c - Implementation of the Serpent encryption algorithm.
+ * Copyright (C) 2003, 2004, 2005 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <config.h>
+
+#include <string.h>
+#include <stdio.h>
+
+#include "types.h"
+#include "g10lib.h"
+#include "cipher.h"
+#include "bithelp.h"
+#include "bufhelp.h"
+#include "cipher-internal.h"
+#include "cipher-selftest.h"
+
+
+/* USE_SSE2 indicates whether to compile with AMD64 SSE2 code. */
+#undef USE_SSE2
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_SSE2 1
+#endif
+
+/* USE_AVX2 indicates whether to compile with AMD64 AVX2 code. */
+#undef USE_AVX2
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# if defined(ENABLE_AVX2_SUPPORT)
+# define USE_AVX2 1
+# endif
+#endif
+
+/* USE_NEON indicates whether to enable ARM NEON assembly code. */
+#undef USE_NEON
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_NEON)
+# define USE_NEON 1
+# endif
+#endif /*ENABLE_NEON_SUPPORT*/
+
+/* Number of rounds per Serpent encrypt/decrypt operation. */
+#define ROUNDS 32
+
+/* Magic number, used during generation of the subkeys. */
+#define PHI 0x9E3779B9
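+/* (PHI is the fractional part of the golden ratio scaled to 32 bits; the key
+   schedule derives each prekey word as w_i = (w_{i-8} ^ w_{i-5} ^ w_{i-3}
+   ^ w_{i-1} ^ PHI ^ i) rotated left by 11 bits.) */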
+
+/* Serpent works on 128-bit blocks. */
+typedef u32 serpent_block_t[4];
+
+/* Serpent key, provided by the user. If the original key is shorter
+ than 256 bits, it is padded. */
+typedef u32 serpent_key_t[8];
+
+/* The key schedule consists of 33 128-bit subkeys. */
+typedef u32 serpent_subkeys_t[ROUNDS + 1][4];
+
+/* A Serpent context. */
+typedef struct serpent_context
+{
+ serpent_subkeys_t keys; /* Generated subkeys. */
+
+#ifdef USE_AVX2
+ int use_avx2;
+#endif
+#ifdef USE_NEON
+ int use_neon;
+#endif
+} serpent_context_t;
+
+
+/* The assembly implementations use the SystemV ABI; on Win64 an ABI
+ * conversion and additional stack space to store XMM6-XMM15 are needed. */
+#undef ASM_FUNC_ABI
+#if defined(USE_SSE2) || defined(USE_AVX2)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# else
+# define ASM_FUNC_ABI
+# endif
+#endif
+
+
+#ifdef USE_SSE2
+/* Assembler implementations of Serpent using SSE2.  Process 8 blocks in
+ parallel.
+ */
+extern void _gcry_serpent_sse2_ctr_enc(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_sse2_cbc_dec(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_sse2_cfb_dec(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_sse2_ocb_enc(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[8]) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_sse2_ocb_dec(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[8]) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_sse2_ocb_auth(serpent_context_t *ctx,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[8]) ASM_FUNC_ABI;
+#endif
+
+#ifdef USE_AVX2
+/* Assembler implementations of Serpent using AVX2.  Process 16 blocks in
+ parallel.
+ */
+extern void _gcry_serpent_avx2_ctr_enc(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_avx2_cbc_dec(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_avx2_cfb_dec(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_avx2_ocb_enc(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_avx2_ocb_dec(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_avx2_ocb_auth(serpent_context_t *ctx,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+#endif
+
+#ifdef USE_NEON
+/* Assembler implementations of Serpent using ARM NEON.  Process 8 blocks in
+ parallel.
+ */
+extern void _gcry_serpent_neon_ctr_enc(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *ctr);
+
+extern void _gcry_serpent_neon_cbc_dec(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv);
+
+extern void _gcry_serpent_neon_cfb_dec(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv);
+
+extern void _gcry_serpent_neon_ocb_enc(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const void *Ls[8]);
+
+extern void _gcry_serpent_neon_ocb_dec(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const void *Ls[8]);
+
+extern void _gcry_serpent_neon_ocb_auth(serpent_context_t *ctx,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const void *Ls[8]);
+#endif
+
+
+/* Prototypes. */
+static const char *serpent_test (void);
+
+static void _gcry_serpent_ctr_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+static void _gcry_serpent_cbc_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+static void _gcry_serpent_cfb_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+static size_t _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
+static size_t _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+ size_t nblocks);
+
+
+/*
+ * These are the S-Boxes of Serpent from the following research paper.
+ *
+ * D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference,
+ * (New York, New York, USA), p. 317–329, National Institute of Standards and
+ * Technology, 2000.
+ *
+ * The paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf
+ *
+ */
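+/* Each SBOX macro below is a branch-free Boolean-circuit formulation of one
+ 4-bit Serpent S-box. The four u32 inputs r0..r3 are bit-sliced: bit j of
+ (r0, r1, r2, r3) forms the j-th 4-bit S-box input, so a single invocation
+ applies the S-box to all 32 nibble positions in parallel. */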
+
+#define SBOX0(r0, r1, r2, r3, w, x, y, z) \
+ { \
+ u32 r4; \
+ \
+ r3 ^= r0; r4 = r1; \
+ r1 &= r3; r4 ^= r2; \
+ r1 ^= r0; r0 |= r3; \
+ r0 ^= r4; r4 ^= r3; \
+ r3 ^= r2; r2 |= r1; \
+ r2 ^= r4; r4 = ~r4; \
+ r4 |= r1; r1 ^= r3; \
+ r1 ^= r4; r3 |= r0; \
+ r1 ^= r3; r4 ^= r3; \
+ \
+ w = r1; x = r4; y = r2; z = r0; \
+ }
+
+#define SBOX0_INVERSE(r0, r1, r2, r3, w, x, y, z) \
+ { \
+ u32 r4; \
+ \
+ r2 = ~r2; r4 = r1; \
+ r1 |= r0; r4 = ~r4; \
+ r1 ^= r2; r2 |= r4; \
+ r1 ^= r3; r0 ^= r4; \
+ r2 ^= r0; r0 &= r3; \
+ r4 ^= r0; r0 |= r1; \
+ r0 ^= r2; r3 ^= r4; \
+ r2 ^= r1; r3 ^= r0; \
+ r3 ^= r1; \
+ r2 &= r3; \
+ r4 ^= r2; \
+ \
+ w = r0; x = r4; y = r1; z = r3; \
+ }
+
+#define SBOX1(r0, r1, r2, r3, w, x, y, z) \
+ { \
+ u32 r4; \
+ \
+ r0 = ~r0; r2 = ~r2; \
+ r4 = r0; r0 &= r1; \
+ r2 ^= r0; r0 |= r3; \
+ r3 ^= r2; r1 ^= r0; \
+ r0 ^= r4; r4 |= r1; \
+ r1 ^= r3; r2 |= r0; \
+ r2 &= r4; r0 ^= r1; \
+ r1 &= r2; \
+ r1 ^= r0; r0 &= r2; \
+ r0 ^= r4; \
+ \
+ w = r2; x = r0; y = r3; z = r1; \
+ }
+
+#define SBOX1_INVERSE(r0, r1, r2, r3, w, x, y, z) \
+ { \
+ u32 r4; \
+ \
+ r4 = r1; r1 ^= r3; \
+ r3 &= r1; r4 ^= r2; \
+ r3 ^= r0; r0 |= r1; \
+ r2 ^= r3; r0 ^= r4; \
+ r0 |= r2; r1 ^= r3; \
+ r0 ^= r1; r1 |= r3; \
+ r1 ^= r0; r4 = ~r4; \
+ r4 ^= r1; r1 |= r0; \
+ r1 ^= r0; \
+ r1 |= r4; \
+ r3 ^= r1; \
+ \
+ w = r4; x = r0; y = r3; z = r2; \
+ }
+
+#define SBOX2(r0, r1, r2, r3, w, x, y, z) \
+ { \
+ u32 r4; \
+ \
+ r4 = r0; r0 &= r2; \
+ r0 ^= r3; r2 ^= r1; \
+ r2 ^= r0; r3 |= r4; \
+ r3 ^= r1; r4 ^= r2; \
+ r1 = r3; r3 |= r4; \
+ r3 ^= r0; r0 &= r1; \
+ r4 ^= r0; r1 ^= r3; \
+ r1 ^= r4; r4 = ~r4; \
+ \
+ w = r2; x = r3; y = r1; z = r4; \
+ }
+
+#define SBOX2_INVERSE(r0, r1, r2, r3, w, x, y, z) \
+ { \
+ u32 r4; \
+ \
+ r2 ^= r3; r3 ^= r0; \
+ r4 = r3; r3 &= r2; \
+ r3 ^= r1; r1 |= r2; \
+ r1 ^= r4; r4 &= r3; \
+ r2 ^= r3; r4 &= r0; \
+ r4 ^= r2; r2 &= r1; \
+ r2 |= r0; r3 = ~r3; \
+ r2 ^= r3; r0 ^= r3; \
+ r0 &= r1; r3 ^= r4; \
+ r3 ^= r0; \
+ \
+ w = r1; x = r4; y = r2; z = r3; \
+ }
+
+#define SBOX3(r0, r1, r2, r3, w, x, y, z) \
+ { \
+ u32 r4; \
+ \
+ r4 = r0; r0 |= r3; \
+ r3 ^= r1; r1 &= r4; \
+ r4 ^= r2; r2 ^= r3; \
+ r3 &= r0; r4 |= r1; \
+ r3 ^= r4; r0 ^= r1; \
+ r4 &= r0; r1 ^= r3; \
+ r4 ^= r2; r1 |= r0; \
+ r1 ^= r2; r0 ^= r3; \
+ r2 = r1; r1 |= r3; \
+ r1 ^= r0; \
+ \
+ w = r1; x = r2; y = r3; z = r4; \
+ }
+
+#define SBOX3_INVERSE(r0, r1, r2, r3, w, x, y, z) \
+ { \
+ u32 r4; \
+ \
+ r4 = r2; r2 ^= r1; \
+ r0 ^= r2; r4 &= r2; \
+ r4 ^= r0; r0 &= r1; \
+ r1 ^= r3; r3 |= r4; \
+ r2 ^= r3; r0 ^= r3; \
+ r1 ^= r4; r3 &= r2; \
+ r3 ^= r1; r1 ^= r0; \
+ r1 |= r2; r0 ^= r3; \
+ r1 ^= r4; \
+ r0 ^= r1; \
+ \
+ w = r2; x = r1; y = r3; z = r0; \
+ }
+
+#define SBOX4(r0, r1, r2, r3, w, x, y, z) \
+ { \
+ u32 r4; \
+ \
+ r1 ^= r3; r3 = ~r3; \
+ r2 ^= r3; r3 ^= r0; \
+ r4 = r1; r1 &= r3; \
+ r1 ^= r2; r4 ^= r3; \
+ r0 ^= r4; r2 &= r4; \
+ r2 ^= r0; r0 &= r1; \
+ r3 ^= r0; r4 |= r1; \
+ r4 ^= r0; r0 |= r3; \
+ r0 ^= r2; r2 &= r3; \
+ r0 = ~r0; r4 ^= r2; \
+ \
+ w = r1; x = r4; y = r0; z = r3; \
+ }
+
+#define SBOX4_INVERSE(r0, r1, r2, r3, w, x, y, z) \
+ { \
+ u32 r4; \
+ \
+ r4 = r2; r2 &= r3; \
+ r2 ^= r1; r1 |= r3; \
+ r1 &= r0; r4 ^= r2; \
+ r4 ^= r1; r1 &= r2; \
+ r0 = ~r0; r3 ^= r4; \
+ r1 ^= r3; r3 &= r0; \
+ r3 ^= r2; r0 ^= r1; \
+ r2 &= r0; r3 ^= r0; \
+ r2 ^= r4; \
+ r2 |= r3; r3 ^= r0; \
+ r2 ^= r1; \
+ \
+ w = r0; x = r3; y = r2; z = r4; \
+ }
+
+#define SBOX5(r0, r1, r2, r3, w, x, y, z) \
+ { \
+ u32 r4; \
+ \
+ r0 ^= r1; r1 ^= r3; \
+ r3 = ~r3; r4 = r1; \
+ r1 &= r0; r2 ^= r3; \
+ r1 ^= r2; r2 |= r4; \
+ r4 ^= r3; r3 &= r1; \
+ r3 ^= r0; r4 ^= r1; \
+ r4 ^= r2; r2 ^= r0; \
+ r0 &= r3; r2 = ~r2; \
+ r0 ^= r4; r4 |= r3; \
+ r2 ^= r4; \
+ \
+ w = r1; x = r3; y = r0; z = r2; \
+ }
+
+#define SBOX5_INVERSE(r0, r1, r2, r3, w, x, y, z) \
+ { \
+ u32 r4; \
+ \
+ r1 = ~r1; r4 = r3; \
+ r2 ^= r1; r3 |= r0; \
+ r3 ^= r2; r2 |= r1; \
+ r2 &= r0; r4 ^= r3; \
+ r2 ^= r4; r4 |= r0; \
+ r4 ^= r1; r1 &= r2; \
+ r1 ^= r3; r4 ^= r2; \
+ r3 &= r4; r4 ^= r1; \
+ r3 ^= r4; r4 = ~r4; \
+ r3 ^= r0; \
+ \
+ w = r1; x = r4; y = r3; z = r2; \
+ }
+
+#define SBOX6(r0, r1, r2, r3, w, x, y, z) \
+ { \
+ u32 r4; \
+ \
+ r2 = ~r2; r4 = r3; \
+ r3 &= r0; r0 ^= r4; \
+ r3 ^= r2; r2 |= r4; \
+ r1 ^= r3; r2 ^= r0; \
+ r0 |= r1; r2 ^= r1; \
+ r4 ^= r0; r0 |= r3; \
+ r0 ^= r2; r4 ^= r3; \
+ r4 ^= r0; r3 = ~r3; \
+ r2 &= r4; \
+ r2 ^= r3; \
+ \
+ w = r0; x = r1; y = r4; z = r2; \
+ }
+
+#define SBOX6_INVERSE(r0, r1, r2, r3, w, x, y, z) \
+ { \
+ u32 r4; \
+ \
+ r0 ^= r2; r4 = r2; \
+ r2 &= r0; r4 ^= r3; \
+ r2 = ~r2; r3 ^= r1; \
+ r2 ^= r3; r4 |= r0; \
+ r0 ^= r2; r3 ^= r4; \
+ r4 ^= r1; r1 &= r3; \
+ r1 ^= r0; r0 ^= r3; \
+ r0 |= r2; r3 ^= r1; \
+ r4 ^= r0; \
+ \
+ w = r1; x = r2; y = r4; z = r3; \
+ }
+
+#define SBOX7(r0, r1, r2, r3, w, x, y, z) \
+ { \
+ u32 r4; \
+ \
+ r4 = r1; r1 |= r2; \
+ r1 ^= r3; r4 ^= r2; \
+ r2 ^= r1; r3 |= r4; \
+ r3 &= r0; r4 ^= r2; \
+ r3 ^= r1; r1 |= r4; \
+ r1 ^= r0; r0 |= r4; \
+ r0 ^= r2; r1 ^= r4; \
+ r2 ^= r1; r1 &= r0; \
+ r1 ^= r4; r2 = ~r2; \
+ r2 |= r0; \
+ r4 ^= r2; \
+ \
+ w = r4; x = r3; y = r1; z = r0; \
+ }
+
+#define SBOX7_INVERSE(r0, r1, r2, r3, w, x, y, z) \
+ { \
+ u32 r4; \
+ \
+ r4 = r2; r2 ^= r0; \
+ r0 &= r3; r4 |= r3; \
+ r2 = ~r2; r3 ^= r1; \
+ r1 |= r0; r0 ^= r2; \
+ r2 &= r4; r3 &= r4; \
+ r1 ^= r2; r2 ^= r0; \
+ r0 |= r2; r4 ^= r1; \
+ r0 ^= r3; r3 ^= r4; \
+ r4 |= r0; r3 ^= r2; \
+ r4 ^= r2; \
+ \
+ w = r3; x = r0; y = r1; z = r4; \
+ }
+
+/* XOR BLOCK1 into BLOCK0. */
+#define BLOCK_XOR(block0, block1) \
+ { \
+ block0[0] ^= block1[0]; \
+ block0[1] ^= block1[1]; \
+ block0[2] ^= block1[2]; \
+ block0[3] ^= block1[3]; \
+ }
+
+/* Copy BLOCK_SRC to BLOCK_DST. */
+#define BLOCK_COPY(block_dst, block_src) \
+ { \
+ block_dst[0] = block_src[0]; \
+ block_dst[1] = block_src[1]; \
+ block_dst[2] = block_src[2]; \
+ block_dst[3] = block_src[3]; \
+ }
+
+/* Apply SBOX number WHICH to the block found in ARRAY0, writing
+ the output to the block found in ARRAY1. */
+#define SBOX(which, array0, array1) \
+ SBOX##which (array0[0], array0[1], array0[2], array0[3], \
+ array1[0], array1[1], array1[2], array1[3]);
+
+/* Apply inverse SBOX number WHICH to the block found in ARRAY0, writing
+ the output to the block found in ARRAY1. */
+#define SBOX_INVERSE(which, array0, array1) \
+ SBOX##which##_INVERSE (array0[0], array0[1], array0[2], array0[3], \
+ array1[0], array1[1], array1[2], array1[3]);
+
+/* Apply the linear transformation to BLOCK. */
+#define LINEAR_TRANSFORMATION(block) \
+ { \
+ block[0] = rol (block[0], 13); \
+ block[2] = rol (block[2], 3); \
+ block[1] = block[1] ^ block[0] ^ block[2]; \
+ block[3] = block[3] ^ block[2] ^ (block[0] << 3); \
+ block[1] = rol (block[1], 1); \
+ block[3] = rol (block[3], 7); \
+ block[0] = block[0] ^ block[1] ^ block[3]; \
+ block[2] = block[2] ^ block[3] ^ (block[1] << 7); \
+ block[0] = rol (block[0], 5); \
+ block[2] = rol (block[2], 22); \
+ }
+
+/* Apply the inverse linear transformation to BLOCK. */
+#define LINEAR_TRANSFORMATION_INVERSE(block) \
+ { \
+ block[2] = ror (block[2], 22); \
+ block[0] = ror (block[0], 5); \
+ block[2] = block[2] ^ block[3] ^ (block[1] << 7); \
+ block[0] = block[0] ^ block[1] ^ block[3]; \
+ block[3] = ror (block[3], 7); \
+ block[1] = ror (block[1], 1); \
+ block[3] = block[3] ^ block[2] ^ (block[0] << 3); \
+ block[1] = block[1] ^ block[0] ^ block[2]; \
+ block[2] = ror (block[2], 3); \
+ block[0] = ror (block[0], 13); \
+ }
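+/* LINEAR_TRANSFORMATION_INVERSE undoes the steps of LINEAR_TRANSFORMATION in
+ reverse order, so applying the two macros back to back leaves a block
+ unchanged. A minimal sanity-check sketch, using only helpers defined in
+ this file:
+
+ u32 blk[4] = { 1, 2, 3, 4 };
+ u32 ref[4];
+ BLOCK_COPY (ref, blk);
+ LINEAR_TRANSFORMATION (blk);
+ LINEAR_TRANSFORMATION_INVERSE (blk);
+ -- here blk[i] == ref[i] for i = 0..3
+ */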
+
+/* Apply a Serpent round to BLOCK, using the SBOX number WHICH and the
+ subkeys contained in SUBKEYS. Use BLOCK_TMP as temporary storage.
+ This macro increments `round'. */
+#define ROUND(which, subkeys, block, block_tmp) \
+ { \
+ BLOCK_XOR (block, subkeys[round]); \
+ round++; \
+ SBOX (which, block, block_tmp); \
+ LINEAR_TRANSFORMATION (block_tmp); \
+ BLOCK_COPY (block, block_tmp); \
+ }
+
+/* Apply the last Serpent round to BLOCK, using the SBOX number WHICH
+ and the subkeys contained in SUBKEYS. Use BLOCK_TMP as temporary
+ storage. The result will be stored in BLOCK_TMP. This macro
+ increments `round'. */
+#define ROUND_LAST(which, subkeys, block, block_tmp) \
+ { \
+ BLOCK_XOR (block, subkeys[round]); \
+ round++; \
+ SBOX (which, block, block_tmp); \
+ BLOCK_XOR (block_tmp, subkeys[round]); \
+ round++; \
+ }
+
+/* Apply an inverse Serpent round to BLOCK, using the SBOX number
+ WHICH and the subkeys contained in SUBKEYS. Use BLOCK_TMP as
+ temporary storage. This macro decrements `round'. */
+#define ROUND_INVERSE(which, subkey, block, block_tmp) \
+ { \
+ LINEAR_TRANSFORMATION_INVERSE (block); \
+ SBOX_INVERSE (which, block, block_tmp); \
+ BLOCK_XOR (block_tmp, subkey[round]); \
+ round--; \
+ BLOCK_COPY (block, block_tmp); \
+ }
+
+/* Apply the first inverse Serpent round to BLOCK, using the SBOX number
+ WHICH and the subkeys contained in SUBKEYS. Use BLOCK_TMP as temporary
+ storage. The result will be stored in BLOCK_TMP. This macro
+ decrements `round'. */
+#define ROUND_FIRST_INVERSE(which, subkeys, block, block_tmp) \
+ { \
+ BLOCK_XOR (block, subkeys[round]); \
+ round--; \
+ SBOX_INVERSE (which, block, block_tmp); \
+ BLOCK_XOR (block_tmp, subkeys[round]); \
+ round--; \
+ }
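+/* Taken together, encryption runs 31 ROUND invocations (cycling through
+ S-boxes 0..7) followed by one ROUND_LAST, consuming subkeys 0..32;
+ decryption starts with ROUND_FIRST_INVERSE on subkeys 32 and 31 and then
+ walks the inverse rounds back down to subkey 0. */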
+
+/* Convert the user provided key KEY of KEY_LENGTH bytes into the
+ internally used format. */
+static void
+serpent_key_prepare (const byte *key, unsigned int key_length,
+ serpent_key_t key_prepared)
+{
+ int i;
+
+ /* Copy key. */
+ key_length /= 4;
+ for (i = 0; i < key_length; i++)
+ key_prepared[i] = buf_get_le32 (key + i * 4);
+
+ if (i < 8)
+ {
+ /* Key must be padded according to the Serpent
+ specification. */
+ key_prepared[i] = 0x00000001;
+
+ for (i++; i < 8; i++)
+ key_prepared[i] = 0;
+ }
+}
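+/* For example, a 16-byte (128-bit) key k is expanded to the padded 256-bit
+ form { le32(k[0..3]), le32(k[4..7]), le32(k[8..11]), le32(k[12..15]),
+ 0x00000001, 0, 0, 0 }, as required by the Serpent specification. */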
+
+/* Derive the 33 subkeys from KEY and store them in SUBKEYS. */
+static void
+serpent_subkeys_generate (serpent_key_t key, serpent_subkeys_t subkeys)
+{
+ u32 w[8]; /* The `prekey'. */
+ u32 ws[4];
+ u32 wt[4];
+
+ /* Initialize with key values. */
+ w[0] = key[0];
+ w[1] = key[1];
+ w[2] = key[2];
+ w[3] = key[3];
+ w[4] = key[4];
+ w[5] = key[5];
+ w[6] = key[6];
+ w[7] = key[7];
+
+ /* Expand to intermediate key using the affine recurrence. */
+#define EXPAND_KEY4(wo, r) \
+ wo[0] = w[(r+0)%8] = \
+ rol (w[(r+0)%8] ^ w[(r+3)%8] ^ w[(r+5)%8] ^ w[(r+7)%8] ^ PHI ^ (r+0), 11); \
+ wo[1] = w[(r+1)%8] = \
+ rol (w[(r+1)%8] ^ w[(r+4)%8] ^ w[(r+6)%8] ^ w[(r+0)%8] ^ PHI ^ (r+1), 11); \
+ wo[2] = w[(r+2)%8] = \
+ rol (w[(r+2)%8] ^ w[(r+5)%8] ^ w[(r+7)%8] ^ w[(r+1)%8] ^ PHI ^ (r+2), 11); \
+ wo[3] = w[(r+3)%8] = \
+ rol (w[(r+3)%8] ^ w[(r+6)%8] ^ w[(r+0)%8] ^ w[(r+2)%8] ^ PHI ^ (r+3), 11);
+
+#define EXPAND_KEY(r) \
+ EXPAND_KEY4(ws, (r)); \
+ EXPAND_KEY4(wt, (r + 4));
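+/* EXPAND_KEY4 implements the Serpent affine recurrence
+ w_i = (w_{i-8} ^ w_{i-5} ^ w_{i-3} ^ w_{i-1} ^ PHI ^ i) <<< 11
+ on the circular 8-word prekey buffer W, producing four new prekey words
+ per invocation. */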
+
+ /* Calculate subkeys via S-Boxes, in bitslice mode. */
+ EXPAND_KEY (0); SBOX (3, ws, subkeys[0]); SBOX (2, wt, subkeys[1]);
+ EXPAND_KEY (8); SBOX (1, ws, subkeys[2]); SBOX (0, wt, subkeys[3]);
+ EXPAND_KEY (16); SBOX (7, ws, subkeys[4]); SBOX (6, wt, subkeys[5]);
+ EXPAND_KEY (24); SBOX (5, ws, subkeys[6]); SBOX (4, wt, subkeys[7]);
+ EXPAND_KEY (32); SBOX (3, ws, subkeys[8]); SBOX (2, wt, subkeys[9]);
+ EXPAND_KEY (40); SBOX (1, ws, subkeys[10]); SBOX (0, wt, subkeys[11]);
+ EXPAND_KEY (48); SBOX (7, ws, subkeys[12]); SBOX (6, wt, subkeys[13]);
+ EXPAND_KEY (56); SBOX (5, ws, subkeys[14]); SBOX (4, wt, subkeys[15]);
+ EXPAND_KEY (64); SBOX (3, ws, subkeys[16]); SBOX (2, wt, subkeys[17]);
+ EXPAND_KEY (72); SBOX (1, ws, subkeys[18]); SBOX (0, wt, subkeys[19]);
+ EXPAND_KEY (80); SBOX (7, ws, subkeys[20]); SBOX (6, wt, subkeys[21]);
+ EXPAND_KEY (88); SBOX (5, ws, subkeys[22]); SBOX (4, wt, subkeys[23]);
+ EXPAND_KEY (96); SBOX (3, ws, subkeys[24]); SBOX (2, wt, subkeys[25]);
+ EXPAND_KEY (104); SBOX (1, ws, subkeys[26]); SBOX (0, wt, subkeys[27]);
+ EXPAND_KEY (112); SBOX (7, ws, subkeys[28]); SBOX (6, wt, subkeys[29]);
+ EXPAND_KEY (120); SBOX (5, ws, subkeys[30]); SBOX (4, wt, subkeys[31]);
+ EXPAND_KEY4 (ws, 128); SBOX (3, ws, subkeys[32]);
+
+ wipememory (ws, sizeof (ws));
+ wipememory (wt, sizeof (wt));
+ wipememory (w, sizeof (w));
+}
+
+/* Initialize CONTEXT with the key KEY of KEY_LENGTH bytes. */
+static void
+serpent_setkey_internal (serpent_context_t *context,
+ const byte *key, unsigned int key_length)
+{
+ serpent_key_t key_prepared;
+
+ serpent_key_prepare (key, key_length, key_prepared);
+ serpent_subkeys_generate (key_prepared, context->keys);
+
+#ifdef USE_AVX2
+ context->use_avx2 = 0;
+ if ((_gcry_get_hw_features () & HWF_INTEL_AVX2))
+ {
+ context->use_avx2 = 1;
+ }
+#endif
+
+#ifdef USE_NEON
+ context->use_neon = 0;
+ if ((_gcry_get_hw_features () & HWF_ARM_NEON))
+ {
+ context->use_neon = 1;
+ }
+#endif
+
+ wipememory (key_prepared, sizeof(key_prepared));
+}
+
+/* Initialize CTX with the key KEY of KEY_LENGTH bytes. */
+static gcry_err_code_t
+serpent_setkey (void *ctx,
+ const byte *key, unsigned int key_length,
+ cipher_bulk_ops_t *bulk_ops)
+{
+ serpent_context_t *context = ctx;
+ static const char *serpent_test_ret;
+ static int serpent_init_done;
+ gcry_err_code_t ret = GPG_ERR_NO_ERROR;
+
+ if (! serpent_init_done)
+ {
+ /* Execute a self-test the first time Serpent is used. */
+ serpent_init_done = 1;
+ serpent_test_ret = serpent_test ();
+ if (serpent_test_ret)
+ log_error ("Serpent test failure: %s\n", serpent_test_ret);
+ }
+
+ /* Setup bulk encryption routines. */
+ memset (bulk_ops, 0, sizeof(*bulk_ops));
+ bulk_ops->cbc_dec = _gcry_serpent_cbc_dec;
+ bulk_ops->cfb_dec = _gcry_serpent_cfb_dec;
+ bulk_ops->ctr_enc = _gcry_serpent_ctr_enc;
+ bulk_ops->ocb_crypt = _gcry_serpent_ocb_crypt;
+ bulk_ops->ocb_auth = _gcry_serpent_ocb_auth;
+
+ if (serpent_test_ret)
+ ret = GPG_ERR_SELFTEST_FAILED;
+ else
+ serpent_setkey_internal (context, key, key_length);
+
+ return ret;
+}
+
+static void
+serpent_encrypt_internal (serpent_context_t *context,
+ const byte *input, byte *output)
+{
+ serpent_block_t b, b_next;
+ int round = 0;
+
+ b[0] = buf_get_le32 (input + 0);
+ b[1] = buf_get_le32 (input + 4);
+ b[2] = buf_get_le32 (input + 8);
+ b[3] = buf_get_le32 (input + 12);
+
+ ROUND (0, context->keys, b, b_next);
+ ROUND (1, context->keys, b, b_next);
+ ROUND (2, context->keys, b, b_next);
+ ROUND (3, context->keys, b, b_next);
+ ROUND (4, context->keys, b, b_next);
+ ROUND (5, context->keys, b, b_next);
+ ROUND (6, context->keys, b, b_next);
+ ROUND (7, context->keys, b, b_next);
+ ROUND (0, context->keys, b, b_next);
+ ROUND (1, context->keys, b, b_next);
+ ROUND (2, context->keys, b, b_next);
+ ROUND (3, context->keys, b, b_next);
+ ROUND (4, context->keys, b, b_next);
+ ROUND (5, context->keys, b, b_next);
+ ROUND (6, context->keys, b, b_next);
+ ROUND (7, context->keys, b, b_next);
+ ROUND (0, context->keys, b, b_next);
+ ROUND (1, context->keys, b, b_next);
+ ROUND (2, context->keys, b, b_next);
+ ROUND (3, context->keys, b, b_next);
+ ROUND (4, context->keys, b, b_next);
+ ROUND (5, context->keys, b, b_next);
+ ROUND (6, context->keys, b, b_next);
+ ROUND (7, context->keys, b, b_next);
+ ROUND (0, context->keys, b, b_next);
+ ROUND (1, context->keys, b, b_next);
+ ROUND (2, context->keys, b, b_next);
+ ROUND (3, context->keys, b, b_next);
+ ROUND (4, context->keys, b, b_next);
+ ROUND (5, context->keys, b, b_next);
+ ROUND (6, context->keys, b, b_next);
+
+ ROUND_LAST (7, context->keys, b, b_next);
+
+ buf_put_le32 (output + 0, b_next[0]);
+ buf_put_le32 (output + 4, b_next[1]);
+ buf_put_le32 (output + 8, b_next[2]);
+ buf_put_le32 (output + 12, b_next[3]);
+}
+
+static void
+serpent_decrypt_internal (serpent_context_t *context,
+ const byte *input, byte *output)
+{
+ serpent_block_t b, b_next;
+ int round = ROUNDS;
+
+ b_next[0] = buf_get_le32 (input + 0);
+ b_next[1] = buf_get_le32 (input + 4);
+ b_next[2] = buf_get_le32 (input + 8);
+ b_next[3] = buf_get_le32 (input + 12);
+
+ ROUND_FIRST_INVERSE (7, context->keys, b_next, b);
+
+ ROUND_INVERSE (6, context->keys, b, b_next);
+ ROUND_INVERSE (5, context->keys, b, b_next);
+ ROUND_INVERSE (4, context->keys, b, b_next);
+ ROUND_INVERSE (3, context->keys, b, b_next);
+ ROUND_INVERSE (2, context->keys, b, b_next);
+ ROUND_INVERSE (1, context->keys, b, b_next);
+ ROUND_INVERSE (0, context->keys, b, b_next);
+ ROUND_INVERSE (7, context->keys, b, b_next);
+ ROUND_INVERSE (6, context->keys, b, b_next);
+ ROUND_INVERSE (5, context->keys, b, b_next);
+ ROUND_INVERSE (4, context->keys, b, b_next);
+ ROUND_INVERSE (3, context->keys, b, b_next);
+ ROUND_INVERSE (2, context->keys, b, b_next);
+ ROUND_INVERSE (1, context->keys, b, b_next);
+ ROUND_INVERSE (0, context->keys, b, b_next);
+ ROUND_INVERSE (7, context->keys, b, b_next);
+ ROUND_INVERSE (6, context->keys, b, b_next);
+ ROUND_INVERSE (5, context->keys, b, b_next);
+ ROUND_INVERSE (4, context->keys, b, b_next);
+ ROUND_INVERSE (3, context->keys, b, b_next);
+ ROUND_INVERSE (2, context->keys, b, b_next);
+ ROUND_INVERSE (1, context->keys, b, b_next);
+ ROUND_INVERSE (0, context->keys, b, b_next);
+ ROUND_INVERSE (7, context->keys, b, b_next);
+ ROUND_INVERSE (6, context->keys, b, b_next);
+ ROUND_INVERSE (5, context->keys, b, b_next);
+ ROUND_INVERSE (4, context->keys, b, b_next);
+ ROUND_INVERSE (3, context->keys, b, b_next);
+ ROUND_INVERSE (2, context->keys, b, b_next);
+ ROUND_INVERSE (1, context->keys, b, b_next);
+ ROUND_INVERSE (0, context->keys, b, b_next);
+
+ buf_put_le32 (output + 0, b_next[0]);
+ buf_put_le32 (output + 4, b_next[1]);
+ buf_put_le32 (output + 8, b_next[2]);
+ buf_put_le32 (output + 12, b_next[3]);
+}
+
+static unsigned int
+serpent_encrypt (void *ctx, byte *buffer_out, const byte *buffer_in)
+{
+ serpent_context_t *context = ctx;
+
+ serpent_encrypt_internal (context, buffer_in, buffer_out);
+ return /*burn_stack*/ (2 * sizeof (serpent_block_t));
+}
+
+static unsigned int
+serpent_decrypt (void *ctx, byte *buffer_out, const byte *buffer_in)
+{
+ serpent_context_t *context = ctx;
+
+ serpent_decrypt_internal (context, buffer_in, buffer_out);
+ return /*burn_stack*/ (2 * sizeof (serpent_block_t));
+}
+
+
+
+/* Bulk encryption of complete blocks in CTR mode. This function is only
+ intended for the bulk encryption feature of cipher.c. CTR is expected to be
+ of size sizeof(serpent_block_t). */
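+/* The bulk helpers below share one pattern: first drain as many blocks as
+ possible with the widest enabled implementation (AVX2, 16 blocks per call),
+ then with the 8-block SSE2/NEON code, and finally fall back to the generic
+ one-block-at-a-time loop for whatever remains. */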
+static void
+_gcry_serpent_ctr_enc(void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ serpent_context_t *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned char tmpbuf[sizeof(serpent_block_t)];
+ int burn_stack_depth = 2 * sizeof (serpent_block_t);
+
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_serpent_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+ nblocks -= 16;
+ outbuf += 16 * sizeof(serpent_block_t);
+ inbuf += 16 * sizeof(serpent_block_t);
+ did_use_avx2 = 1;
+ }
+
+ if (did_use_avx2)
+ {
+ /* serpent-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic/sse2 code to handle smaller chunks... */
+ /* TODO: use caching instead? */
+ }
+#endif
+
+#ifdef USE_SSE2
+ {
+ int did_use_sse2 = 0;
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ _gcry_serpent_sse2_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+ nblocks -= 8;
+ outbuf += 8 * sizeof(serpent_block_t);
+ inbuf += 8 * sizeof(serpent_block_t);
+ did_use_sse2 = 1;
+ }
+
+ if (did_use_sse2)
+ {
+ /* serpent-sse2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ /* TODO: use caching instead? */
+ }
+#endif
+
+#ifdef USE_NEON
+ if (ctx->use_neon)
+ {
+ int did_use_neon = 0;
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ _gcry_serpent_neon_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+ nblocks -= 8;
+ outbuf += 8 * sizeof(serpent_block_t);
+ inbuf += 8 * sizeof(serpent_block_t);
+ did_use_neon = 1;
+ }
+
+ if (did_use_neon)
+ {
+ /* serpent-neon assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ /* TODO: use caching instead? */
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ /* Encrypt the counter. */
+ serpent_encrypt_internal(ctx, ctr, tmpbuf);
+ /* XOR the input with the encrypted counter and store in output. */
+ cipher_block_xor(outbuf, tmpbuf, inbuf, sizeof(serpent_block_t));
+ outbuf += sizeof(serpent_block_t);
+ inbuf += sizeof(serpent_block_t);
+ /* Increment the counter. */
+ cipher_block_add(ctr, 1, sizeof(serpent_block_t));
+ }
+
+ wipememory(tmpbuf, sizeof(tmpbuf));
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk decryption of complete blocks in CBC mode. This function is only
+ intended for the bulk encryption feature of cipher.c. */
+static void
+_gcry_serpent_cbc_dec(void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ serpent_context_t *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned char savebuf[sizeof(serpent_block_t)];
+ int burn_stack_depth = 2 * sizeof (serpent_block_t);
+
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_serpent_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * sizeof(serpent_block_t);
+ inbuf += 16 * sizeof(serpent_block_t);
+ did_use_avx2 = 1;
+ }
+
+ if (did_use_avx2)
+ {
+ /* serpent-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic/sse2 code to handle smaller chunks... */
+ }
+#endif
+
+#ifdef USE_SSE2
+ {
+ int did_use_sse2 = 0;
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ _gcry_serpent_sse2_cbc_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 8;
+ outbuf += 8 * sizeof(serpent_block_t);
+ inbuf += 8 * sizeof(serpent_block_t);
+ did_use_sse2 = 1;
+ }
+
+ if (did_use_sse2)
+ {
+ /* serpent-sse2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#ifdef USE_NEON
+ if (ctx->use_neon)
+ {
+ int did_use_neon = 0;
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ _gcry_serpent_neon_cbc_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 8;
+ outbuf += 8 * sizeof(serpent_block_t);
+ inbuf += 8 * sizeof(serpent_block_t);
+ did_use_neon = 1;
+ }
+
+ if (did_use_neon)
+ {
+ /* serpent-neon assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ /* INBUF is needed later and it may be identical to OUTBUF, so store
+ the intermediate result to SAVEBUF. */
+ serpent_decrypt_internal (ctx, inbuf, savebuf);
+
+ cipher_block_xor_n_copy_2(outbuf, savebuf, iv, inbuf,
+ sizeof(serpent_block_t));
+ inbuf += sizeof(serpent_block_t);
+ outbuf += sizeof(serpent_block_t);
+ }
+
+ wipememory(savebuf, sizeof(savebuf));
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk decryption of complete blocks in CFB mode. This function is only
+ intended for the bulk encryption feature of cipher.c. */
+static void
+_gcry_serpent_cfb_dec(void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ serpent_context_t *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = 2 * sizeof (serpent_block_t);
+
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_serpent_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * sizeof(serpent_block_t);
+ inbuf += 16 * sizeof(serpent_block_t);
+ did_use_avx2 = 1;
+ }
+
+ if (did_use_avx2)
+ {
+ /* serpent-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic/sse2 code to handle smaller chunks... */
+ }
+#endif
+
+#ifdef USE_SSE2
+ {
+ int did_use_sse2 = 0;
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ _gcry_serpent_sse2_cfb_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 8;
+ outbuf += 8 * sizeof(serpent_block_t);
+ inbuf += 8 * sizeof(serpent_block_t);
+ did_use_sse2 = 1;
+ }
+
+ if (did_use_sse2)
+ {
+ /* serpent-sse2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#ifdef USE_NEON
+ if (ctx->use_neon)
+ {
+ int did_use_neon = 0;
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ _gcry_serpent_neon_cfb_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 8;
+ outbuf += 8 * sizeof(serpent_block_t);
+ inbuf += 8 * sizeof(serpent_block_t);
+ did_use_neon = 1;
+ }
+
+ if (did_use_neon)
+ {
+ /* serpent-neon assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ serpent_encrypt_internal(ctx, iv, iv);
+ cipher_block_xor_n_copy(outbuf, iv, inbuf, sizeof(serpent_block_t));
+ outbuf += sizeof(serpent_block_t);
+ inbuf += sizeof(serpent_block_t);
+ }
+
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk encryption/decryption of complete blocks in OCB mode. */
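+/* For the OCB paths the Ls[] table is pre-filled with the offsets
+ L[0], L[1], L[0], L[2], ... following the number-of-trailing-zeros pattern
+ of the block counter; only the final slot of each 8- or 16-block chunk
+ depends on the running block number and is refreshed via ocb_get_l(). */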
+static size_t
+_gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+#if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON)
+ serpent_context_t *ctx = (void *)&c->context.c;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = 2 * sizeof (serpent_block_t);
+ u64 blkn = c->u_mode.ocb.data_nblocks;
+#else
+ (void)c;
+ (void)outbuf_arg;
+ (void)inbuf_arg;
+ (void)encrypt;
+#endif
+
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+ u64 Ls[16];
+ unsigned int n = 16 - (blkn % 16);
+ u64 *l;
+ int i;
+
+ if (nblocks >= 16)
+ {
+ for (i = 0; i < 16; i += 8)
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ }
+
+ Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ l = &Ls[(15 + n) % 16];
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ blkn += 16;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+ if (encrypt)
+ _gcry_serpent_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+ else
+ _gcry_serpent_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+
+ nblocks -= 16;
+ outbuf += 16 * sizeof(serpent_block_t);
+ inbuf += 16 * sizeof(serpent_block_t);
+ did_use_avx2 = 1;
+ }
+ }
+
+ if (did_use_avx2)
+ {
+ /* serpent-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#ifdef USE_SSE2
+ {
+ int did_use_sse2 = 0;
+ u64 Ls[8];
+ unsigned int n = 8 - (blkn % 8);
+ u64 *l;
+
+ if (nblocks >= 8)
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ l = &Ls[(7 + n) % 8];
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ blkn += 8;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8);
+
+ if (encrypt)
+ _gcry_serpent_sse2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+ else
+ _gcry_serpent_sse2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+
+ nblocks -= 8;
+ outbuf += 8 * sizeof(serpent_block_t);
+ inbuf += 8 * sizeof(serpent_block_t);
+ did_use_sse2 = 1;
+ }
+ }
+
+ if (did_use_sse2)
+ {
+ /* serpent-sse2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#ifdef USE_NEON
+ if (ctx->use_neon)
+ {
+ int did_use_neon = 0;
+ const void *Ls[8];
+ unsigned int n = 8 - (blkn % 8);
+ const void **l;
+
+ if (nblocks >= 8)
+ {
+ Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
+ Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
+ Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
+ Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
+ Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
+ Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
+ Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
+ l = &Ls[(7 + n) % 8];
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ blkn += 8;
+ *l = ocb_get_l(c, blkn - blkn % 8);
+
+ if (encrypt)
+ _gcry_serpent_neon_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+ else
+ _gcry_serpent_neon_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+
+ nblocks -= 8;
+ outbuf += 8 * sizeof(serpent_block_t);
+ inbuf += 8 * sizeof(serpent_block_t);
+ did_use_neon = 1;
+ }
+ }
+
+ if (did_use_neon)
+ {
+ /* serpent-neon assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON)
+ c->u_mode.ocb.data_nblocks = blkn;
+
+ if (burn_stack_depth)
+ _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+#endif
+
+ return nblocks;
+}
+
+/* Bulk authentication of complete blocks in OCB mode. */
+static size_t
+_gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+ size_t nblocks)
+{
+#if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON)
+ serpent_context_t *ctx = (void *)&c->context.c;
+ const unsigned char *abuf = abuf_arg;
+ int burn_stack_depth = 2 * sizeof(serpent_block_t);
+ u64 blkn = c->u_mode.ocb.aad_nblocks;
+#else
+ (void)c;
+ (void)abuf_arg;
+#endif
+
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+ u64 Ls[16];
+ unsigned int n = 16 - (blkn % 16);
+ u64 *l;
+ int i;
+
+ if (nblocks >= 16)
+ {
+ for (i = 0; i < 16; i += 8)
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ }
+
+ Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ l = &Ls[(15 + n) % 16];
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ blkn += 16;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+ _gcry_serpent_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
+
+ nblocks -= 16;
+ abuf += 16 * sizeof(serpent_block_t);
+ did_use_avx2 = 1;
+ }
+ }
+
+ if (did_use_avx2)
+ {
+ /* serpent-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#ifdef USE_SSE2
+ {
+ int did_use_sse2 = 0;
+ u64 Ls[8];
+ unsigned int n = 8 - (blkn % 8);
+ u64 *l;
+
+ if (nblocks >= 8)
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ l = &Ls[(7 + n) % 8];
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ blkn += 8;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8);
+
+ _gcry_serpent_sse2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
+
+ nblocks -= 8;
+ abuf += 8 * sizeof(serpent_block_t);
+ did_use_sse2 = 1;
+ }
+ }
+
+ if (did_use_sse2)
+ {
+ /* serpent-sse2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#ifdef USE_NEON
+ if (ctx->use_neon)
+ {
+ int did_use_neon = 0;
+ const void *Ls[8];
+ unsigned int n = 8 - (blkn % 8);
+ const void **l;
+
+ if (nblocks >= 8)
+ {
+ Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
+ Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
+ Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
+ Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
+ Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
+ Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
+ Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
+ l = &Ls[(7 + n) % 8];
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ blkn += 8;
+ *l = ocb_get_l(c, blkn - blkn % 8);
+
+ _gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
+
+ nblocks -= 8;
+ abuf += 8 * sizeof(serpent_block_t);
+ did_use_neon = 1;
+ }
+ }
+
+ if (did_use_neon)
+ {
+ /* serpent-neon assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON)
+ c->u_mode.ocb.aad_nblocks = blkn;
+
+ if (burn_stack_depth)
+ _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+#endif
+
+ return nblocks;
+}
+
+
+
+/* Run the self-tests for SERPENT-CTR-128, tests IV increment of bulk CTR
+ encryption. Returns NULL on success. */
+static const char*
+selftest_ctr_128 (void)
+{
+ const int nblocks = 16+8+1;
+ const int blocksize = sizeof(serpent_block_t);
+ const int context_size = sizeof(serpent_context_t);
+
+ return _gcry_selftest_helper_ctr("SERPENT", &serpent_setkey,
+ &serpent_encrypt, nblocks, blocksize, context_size);
+}
+
+
+/* Run the self-tests for SERPENT-CBC-128, tests bulk CBC decryption.
+ Returns NULL on success. */
+static const char*
+selftest_cbc_128 (void)
+{
+ const int nblocks = 16+8+2;
+ const int blocksize = sizeof(serpent_block_t);
+ const int context_size = sizeof(serpent_context_t);
+
+ return _gcry_selftest_helper_cbc("SERPENT", &serpent_setkey,
+ &serpent_encrypt, nblocks, blocksize, context_size);
+}
+
+
+/* Run the self-tests for SERPENT-CFB-128, tests bulk CFB decryption.
+ Returns NULL on success. */
+static const char*
+selftest_cfb_128 (void)
+{
+ const int nblocks = 16+8+2;
+ const int blocksize = sizeof(serpent_block_t);
+ const int context_size = sizeof(serpent_context_t);
+
+ return _gcry_selftest_helper_cfb("SERPENT", &serpent_setkey,
+ &serpent_encrypt, nblocks, blocksize, context_size);
+}
+
+
+/* Serpent test. */
+
+static const char *
+serpent_test (void)
+{
+ serpent_context_t context;
+ unsigned char scratch[16];
+ unsigned int i;
+ const char *r;
+
+ static struct test
+ {
+ int key_length;
+ unsigned char key[32];
+ unsigned char text_plain[16];
+ unsigned char text_cipher[16];
+ } test_data[] =
+ {
+ {
+ 16,
+ "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+ "\xD2\x9D\x57\x6F\xCE\xA3\xA3\xA7\xED\x90\x99\xF2\x92\x73\xD7\x8E",
+ "\xB2\x28\x8B\x96\x8A\xE8\xB0\x86\x48\xD1\xCE\x96\x06\xFD\x99\x2D"
+ },
+ {
+ 24,
+ "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00",
+ "\xD2\x9D\x57\x6F\xCE\xAB\xA3\xA7\xED\x98\x99\xF2\x92\x7B\xD7\x8E",
+ "\x13\x0E\x35\x3E\x10\x37\xC2\x24\x05\xE8\xFA\xEF\xB2\xC3\xC3\xE9"
+ },
+ {
+ 32,
+ "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+ "\xD0\x95\x57\x6F\xCE\xA3\xE3\xA7\xED\x98\xD9\xF2\x90\x73\xD7\x8E",
+ "\xB9\x0E\xE5\x86\x2D\xE6\x91\x68\xF2\xBD\xD5\x12\x5B\x45\x47\x2B"
+ },
+ {
+ 32,
+ "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+ "\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00",
+ "\x20\x61\xA4\x27\x82\xBD\x52\xEC\x69\x1E\xC3\x83\xB0\x3B\xA7\x7C"
+ },
+ {
+ 0
+ },
+ };
+
+ for (i = 0; test_data[i].key_length; i++)
+ {
+ serpent_setkey_internal (&context, test_data[i].key,
+ test_data[i].key_length);
+ serpent_encrypt_internal (&context, test_data[i].text_plain, scratch);
+
+ if (memcmp (scratch, test_data[i].text_cipher, sizeof (serpent_block_t)))
+ switch (test_data[i].key_length)
+ {
+ case 16:
+ return "Serpent-128 test encryption failed.";
+ case 24:
+ return "Serpent-192 test encryption failed.";
+ case 32:
+ return "Serpent-256 test encryption failed.";
+ }
+
+ serpent_decrypt_internal (&context, test_data[i].text_cipher, scratch);
+ if (memcmp (scratch, test_data[i].text_plain, sizeof (serpent_block_t)))
+ switch (test_data[i].key_length)
+ {
+ case 16:
+ return "Serpent-128 test decryption failed.";
+ case 24:
+ return "Serpent-192 test decryption failed.";
+ case 32:
+ return "Serpent-256 test decryption failed.";
+ }
+ }
+
+ if ( (r = selftest_ctr_128 ()) )
+ return r;
+
+ if ( (r = selftest_cbc_128 ()) )
+ return r;
+
+ if ( (r = selftest_cfb_128 ()) )
+ return r;
+
+ return NULL;
+}
+
+
+static gcry_cipher_oid_spec_t serpent128_oids[] =
+ {
+ {"1.3.6.1.4.1.11591.13.2.1", GCRY_CIPHER_MODE_ECB },
+ {"1.3.6.1.4.1.11591.13.2.2", GCRY_CIPHER_MODE_CBC },
+ {"1.3.6.1.4.1.11591.13.2.3", GCRY_CIPHER_MODE_OFB },
+ {"1.3.6.1.4.1.11591.13.2.4", GCRY_CIPHER_MODE_CFB },
+ { NULL }
+ };
+
+static gcry_cipher_oid_spec_t serpent192_oids[] =
+ {
+ {"1.3.6.1.4.1.11591.13.2.21", GCRY_CIPHER_MODE_ECB },
+ {"1.3.6.1.4.1.11591.13.2.22", GCRY_CIPHER_MODE_CBC },
+ {"1.3.6.1.4.1.11591.13.2.23", GCRY_CIPHER_MODE_OFB },
+ {"1.3.6.1.4.1.11591.13.2.24", GCRY_CIPHER_MODE_CFB },
+ { NULL }
+ };
+
+static gcry_cipher_oid_spec_t serpent256_oids[] =
+ {
+ {"1.3.6.1.4.1.11591.13.2.41", GCRY_CIPHER_MODE_ECB },
+ {"1.3.6.1.4.1.11591.13.2.42", GCRY_CIPHER_MODE_CBC },
+ {"1.3.6.1.4.1.11591.13.2.43", GCRY_CIPHER_MODE_OFB },
+ {"1.3.6.1.4.1.11591.13.2.44", GCRY_CIPHER_MODE_CFB },
+ { NULL }
+ };
+
+static const char *serpent128_aliases[] =
+ {
+ "SERPENT",
+ "SERPENT-128",
+ NULL
+ };
+static const char *serpent192_aliases[] =
+ {
+ "SERPENT-192",
+ NULL
+ };
+static const char *serpent256_aliases[] =
+ {
+ "SERPENT-256",
+ NULL
+ };
+
+gcry_cipher_spec_t _gcry_cipher_spec_serpent128 =
+ {
+ GCRY_CIPHER_SERPENT128, {0, 0},
+ "SERPENT128", serpent128_aliases, serpent128_oids, 16, 128,
+ sizeof (serpent_context_t),
+ serpent_setkey, serpent_encrypt, serpent_decrypt
+ };
+
+gcry_cipher_spec_t _gcry_cipher_spec_serpent192 =
+ {
+ GCRY_CIPHER_SERPENT192, {0, 0},
+ "SERPENT192", serpent192_aliases, serpent192_oids, 16, 192,
+ sizeof (serpent_context_t),
+ serpent_setkey, serpent_encrypt, serpent_decrypt
+ };
+
+gcry_cipher_spec_t _gcry_cipher_spec_serpent256 =
+ {
+ GCRY_CIPHER_SERPENT256, {0, 0},
+ "SERPENT256", serpent256_aliases, serpent256_oids, 16, 256,
+ sizeof (serpent_context_t),
+ serpent_setkey, serpent_encrypt, serpent_decrypt
+ };
diff --git a/comm/third_party/libgcrypt/cipher/sha1-armv7-neon.S b/comm/third_party/libgcrypt/cipher/sha1-armv7-neon.S
new file mode 100644
index 0000000000..61cc541c68
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha1-armv7-neon.S
@@ -0,0 +1,526 @@
+/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
+ * Copyright (C) 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * Based on sha1.c:
+ * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_NEON) && defined(USE_SHA1)
+
+.syntax unified
+.fpu neon
+.arm
+
+.text
+
+#ifdef __PIC__
+# define GET_DATA_POINTER(reg, name, rtmp) \
+ ldr reg, 1f; \
+ ldr rtmp, 2f; \
+ b 3f; \
+ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+ 2: .word name(GOT); \
+ 3: add reg, pc, reg; \
+ ldr reg, [reg, rtmp];
+#else
+# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+
+
+/* Constants */
+
+#define K1 0x5A827999
+#define K2 0x6ED9EBA1
+#define K3 0x8F1BBCDC
+#define K4 0xCA62C1D6
+.align 4
+gcry_sha1_armv7_neon_K_VEC:
+.LK_VEC:
+.LK1: .long K1, K1, K1, K1
+.LK2: .long K2, K2, K2, K2
+.LK3: .long K3, K3, K3, K3
+.LK4: .long K4, K4, K4, K4
+
+
+/* Register macros */
+
+#define RSTATE r0
+#define RDATA r1
+#define RNBLKS r2
+#define ROLDSTACK r3
+#define RWK lr
+
+#define _a r4
+#define _b r5
+#define _c r6
+#define _d r7
+#define _e r8
+
+#define RT0 r9
+#define RT1 r10
+#define RT2 r11
+#define RT3 r12
+
+#define W0 q0
+#define W1 q1
+#define W2 q2
+#define W3 q3
+#define W4 q4
+#define W5 q5
+#define W6 q6
+#define W7 q7
+
+#define tmp0 q8
+#define tmp1 q9
+#define tmp2 q10
+#define tmp3 q11
+
+#define qK1 q12
+#define qK2 q13
+#define qK3 q14
+#define qK4 q15
+
+
+/* Round function macros. */
+
+#define WK_offs(i) (((i) & 15) * 4)
+
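+/* F1, F2 and F3 are the standard SHA-1 round functions: F1 is the choice
+ function (b & c) | (~b & d) for rounds 0-19 (computed here as two disjoint
+ terms that are added), F2 is the parity b ^ c ^ d for rounds 20-39 and
+ 60-79 (F4 aliases it), and F3 is the majority function for rounds 40-59. */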
+#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ldr RT3, [sp, WK_offs(i)]; \
+ pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ bic RT0, d, b; \
+ add e, e, a, ror #(32 - 5); \
+ and RT1, c, b; \
+ pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add RT0, RT0, RT3; \
+ add e, e, RT1; \
+ ror b, #(32 - 30); \
+ pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT0;
+
+#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ldr RT3, [sp, WK_offs(i)]; \
+ pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ eor RT0, d, b; \
+ add e, e, a, ror #(32 - 5); \
+ eor RT0, RT0, c; \
+ pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT3; \
+ ror b, #(32 - 30); \
+ pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT0; \
+
+#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ldr RT3, [sp, WK_offs(i)]; \
+ pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ eor RT0, b, c; \
+ and RT1, b, c; \
+ add e, e, a, ror #(32 - 5); \
+ pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ and RT0, RT0, d; \
+ add RT1, RT1, RT3; \
+ add e, e, RT0; \
+ ror b, #(32 - 30); \
+ pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT1;
+
+#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define R(a,b,c,d,e,f,i) \
+ _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define dummy(...)
+
+
+/* Input expansion macros. */
+
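+/* The expansion macros vectorize the SHA-1 message schedule four words at a
+ time. Rounds 16-31 use the usual recurrence
+ W[t] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1),
+ with an extra fix-up step for the in-group dependency on W[t-3]; rounds
+ 32-79 use the equivalent form
+ W[t] = rol(W[t-6] ^ W[t-16] ^ W[t-28] ^ W[t-32], 2),
+ which has no dependency inside a 4-word group. */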
+/********* Precalc macros for rounds 0-15 *************************************/
+
+#define W_PRECALC_00_15() \
+ add RWK, sp, #(WK_offs(0)); \
+ \
+ vld1.32 {tmp0, tmp1}, [RDATA]!; \
+ vrev32.8 W0, tmp0; /* big => little */ \
+ vld1.32 {tmp2, tmp3}, [RDATA]!; \
+ vadd.u32 tmp0, W0, curK; \
+ vrev32.8 W7, tmp1; /* big => little */ \
+ vrev32.8 W6, tmp2; /* big => little */ \
+ vadd.u32 tmp1, W7, curK; \
+ vrev32.8 W5, tmp3; /* big => little */ \
+ vadd.u32 tmp2, W6, curK; \
+ vst1.32 {tmp0, tmp1}, [RWK]!; \
+ vadd.u32 tmp3, W5, curK; \
+ vst1.32 {tmp2, tmp3}, [RWK]; \
+
+#define WPRECALC_00_15_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vld1.32 {tmp0, tmp1}, [RDATA]!; \
+
+#define WPRECALC_00_15_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ add RWK, sp, #(WK_offs(0)); \
+
+#define WPRECALC_00_15_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vrev32.8 W0, tmp0; /* big => little */ \
+
+#define WPRECALC_00_15_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vld1.32 {tmp2, tmp3}, [RDATA]!; \
+
+#define WPRECALC_00_15_4(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vadd.u32 tmp0, W0, curK; \
+
+#define WPRECALC_00_15_5(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vrev32.8 W7, tmp1; /* big => little */ \
+
+#define WPRECALC_00_15_6(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vrev32.8 W6, tmp2; /* big => little */ \
+
+#define WPRECALC_00_15_7(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vadd.u32 tmp1, W7, curK; \
+
+#define WPRECALC_00_15_8(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vrev32.8 W5, tmp3; /* big => little */ \
+
+#define WPRECALC_00_15_9(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vadd.u32 tmp2, W6, curK; \
+
+#define WPRECALC_00_15_10(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vst1.32 {tmp0, tmp1}, [RWK]!; \
+
+#define WPRECALC_00_15_11(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vadd.u32 tmp3, W5, curK; \
+
+#define WPRECALC_00_15_12(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vst1.32 {tmp2, tmp3}, [RWK]; \
+
+
+/********* Precalc macros for rounds 16-31 ************************************/
+
+#define WPRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ veor tmp0, tmp0; \
+ vext.8 W, W_m16, W_m12, #8; \
+
+#define WPRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ add RWK, sp, #(WK_offs(i)); \
+ vext.8 tmp0, W_m04, tmp0, #4; \
+
+#define WPRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ veor tmp0, tmp0, W_m16; \
+ veor.32 W, W, W_m08; \
+
+#define WPRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ veor tmp1, tmp1; \
+ veor W, W, tmp0; \
+
+#define WPRECALC_16_31_4(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vshl.u32 tmp0, W, #1; \
+
+#define WPRECALC_16_31_5(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vext.8 tmp1, tmp1, W, #(16-12); \
+ vshr.u32 W, W, #31; \
+
+#define WPRECALC_16_31_6(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vorr tmp0, tmp0, W; \
+ vshr.u32 W, tmp1, #30; \
+
+#define WPRECALC_16_31_7(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vshl.u32 tmp1, tmp1, #2; \
+
+#define WPRECALC_16_31_8(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ veor tmp0, tmp0, W; \
+
+#define WPRECALC_16_31_9(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ veor W, tmp0, tmp1; \
+
+#define WPRECALC_16_31_10(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vadd.u32 tmp0, W, curK; \
+
+#define WPRECALC_16_31_11(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vst1.32 {tmp0}, [RWK];
+
+
+/********* Precalc macros for rounds 32-79 ************************************/
+
+#define WPRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ veor W, W_m28; \
+
+#define WPRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vext.8 tmp0, W_m08, W_m04, #8; \
+
+#define WPRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ veor W, W_m16; \
+
+#define WPRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ veor W, tmp0; \
+
+#define WPRECALC_32_79_4(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ add RWK, sp, #(WK_offs(i&~3)); \
+
+#define WPRECALC_32_79_5(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vshl.u32 tmp1, W, #2; \
+
+#define WPRECALC_32_79_6(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vshr.u32 tmp0, W, #30; \
+
+#define WPRECALC_32_79_7(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vorr W, tmp0, tmp1; \
+
+#define WPRECALC_32_79_8(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vadd.u32 tmp0, W, curK; \
+
+#define WPRECALC_32_79_9(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \
+ vst1.32 {tmp0}, [RWK];
+
+
+/* Other functional macros */
+
+#define CLEAR_REG(reg) veor reg, reg;
+
+
+/*
+ * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
+ *
+ * unsigned int
+ * _gcry_sha1_transform_armv7_neon (void *ctx, const unsigned char *data,
+ * size_t nblks)
+ */
+.align 3
+.globl _gcry_sha1_transform_armv7_neon
+.type _gcry_sha1_transform_armv7_neon,%function;
+_gcry_sha1_transform_armv7_neon:
+ /* input:
+ * r0: ctx, CTX
+ * r1: data (64*nblks bytes)
+ * r2: nblks
+ */
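+ /* A 16-word ring buffer of W[t]+K values lives on the re-aligned stack;
+ WK_offs() indexes it modulo 16, so the precalc macros can write the
+ schedule for upcoming rounds while the round macros consume earlier
+ entries through sp/RWK. */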
+
+ cmp RNBLKS, #0;
+ beq .Ldo_nothing;
+
+ push {r4-r12, lr};
+
+ GET_DATA_POINTER(RT3, .LK_VEC, _a);
+ vpush {q4-q7};
+
+ mov ROLDSTACK, sp;
+
+ /* Align stack. */
+ sub sp, #(16*4);
+ and sp, #(~(16-1));
+
+ vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */
+
+ /* Get the values of the chaining variables. */
+ ldm RSTATE, {_a-_e};
+
+ vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */
+
+#undef curK
+#define curK qK1
+ /* Precalc 0-15. */
+ W_PRECALC_00_15();
+
+ b .Loop;
+
+.ltorg
+.Loop:
+ /* Transform 0-15 + Precalc 16-31. */
+ _R( _a, _b, _c, _d, _e, F1, 0, WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16, W4, W5, W6, W7, W0, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F1, 1, WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16, W4, W5, W6, W7, W0, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F1, 2, WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16, W4, W5, W6, W7, W0, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F1, 3, WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16, W4, W5, W6, W7, W0, _, _, _ );
+
+#undef curK
+#define curK qK2
+ _R( _b, _c, _d, _e, _a, F1, 4, WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20, W3, W4, W5, W6, W7, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F1, 5, WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20, W3, W4, W5, W6, W7, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F1, 6, WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20, W3, W4, W5, W6, W7, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F1, 7, WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20, W3, W4, W5, W6, W7, _, _, _ );
+
+ _R( _c, _d, _e, _a, _b, F1, 8, WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24, W2, W3, W4, W5, W6, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F1, 9, WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24, W2, W3, W4, W5, W6, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F1, 10, WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24, W2, W3, W4, W5, W6, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F1, 11, WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24, W2, W3, W4, W5, W6, _, _, _ );
+
+ _R( _d, _e, _a, _b, _c, F1, 12, WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28, W1, W2, W3, W4, W5, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F1, 13, WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28, W1, W2, W3, W4, W5, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F1, 14, WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28, W1, W2, W3, W4, W5, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F1, 15, WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28, W1, W2, W3, W4, W5, _, _, _ );
+
+ /* Transform 16-63 + Precalc 32-79. */
+ _R( _e, _a, _b, _c, _d, F1, 16, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32, W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _d, _e, _a, _b, _c, F1, 17, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32, W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _c, _d, _e, _a, _b, F1, 18, WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32, W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _b, _c, _d, _e, _a, F1, 19, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32, W0, W1, W2, W3, W4, W5, W6, W7);
+
+ _R( _a, _b, _c, _d, _e, F2, 20, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36, W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _e, _a, _b, _c, _d, F2, 21, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36, W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _d, _e, _a, _b, _c, F2, 22, WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36, W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _c, _d, _e, _a, _b, F2, 23, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36, W7, W0, W1, W2, W3, W4, W5, W6);
+
+#undef curK
+#define curK qK3
+ _R( _b, _c, _d, _e, _a, F2, 24, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40, W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _a, _b, _c, _d, _e, F2, 25, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40, W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _e, _a, _b, _c, _d, F2, 26, WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40, W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _d, _e, _a, _b, _c, F2, 27, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40, W6, W7, W0, W1, W2, W3, W4, W5);
+
+ _R( _c, _d, _e, _a, _b, F2, 28, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44, W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _b, _c, _d, _e, _a, F2, 29, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44, W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _a, _b, _c, _d, _e, F2, 30, WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44, W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _e, _a, _b, _c, _d, F2, 31, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44, W5, W6, W7, W0, W1, W2, W3, W4);
+
+ _R( _d, _e, _a, _b, _c, F2, 32, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48, W4, W5, W6, W7, W0, W1, W2, W3);
+ _R( _c, _d, _e, _a, _b, F2, 33, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48, W4, W5, W6, W7, W0, W1, W2, W3);
+ _R( _b, _c, _d, _e, _a, F2, 34, WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48, W4, W5, W6, W7, W0, W1, W2, W3);
+ _R( _a, _b, _c, _d, _e, F2, 35, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48, W4, W5, W6, W7, W0, W1, W2, W3);
+
+ _R( _e, _a, _b, _c, _d, F2, 36, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52, W3, W4, W5, W6, W7, W0, W1, W2);
+ _R( _d, _e, _a, _b, _c, F2, 37, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52, W3, W4, W5, W6, W7, W0, W1, W2);
+ _R( _c, _d, _e, _a, _b, F2, 38, WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52, W3, W4, W5, W6, W7, W0, W1, W2);
+ _R( _b, _c, _d, _e, _a, F2, 39, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52, W3, W4, W5, W6, W7, W0, W1, W2);
+
+ _R( _a, _b, _c, _d, _e, F3, 40, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56, W2, W3, W4, W5, W6, W7, W0, W1);
+ _R( _e, _a, _b, _c, _d, F3, 41, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56, W2, W3, W4, W5, W6, W7, W0, W1);
+ _R( _d, _e, _a, _b, _c, F3, 42, WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56, W2, W3, W4, W5, W6, W7, W0, W1);
+ _R( _c, _d, _e, _a, _b, F3, 43, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56, W2, W3, W4, W5, W6, W7, W0, W1);
+
+#undef curK
+#define curK qK4
+ _R( _b, _c, _d, _e, _a, F3, 44, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60, W1, W2, W3, W4, W5, W6, W7, W0);
+ _R( _a, _b, _c, _d, _e, F3, 45, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60, W1, W2, W3, W4, W5, W6, W7, W0);
+ _R( _e, _a, _b, _c, _d, F3, 46, WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60, W1, W2, W3, W4, W5, W6, W7, W0);
+ _R( _d, _e, _a, _b, _c, F3, 47, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60, W1, W2, W3, W4, W5, W6, W7, W0);
+
+ _R( _c, _d, _e, _a, _b, F3, 48, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64, W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _b, _c, _d, _e, _a, F3, 49, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64, W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _a, _b, _c, _d, _e, F3, 50, WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64, W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _e, _a, _b, _c, _d, F3, 51, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64, W0, W1, W2, W3, W4, W5, W6, W7);
+
+ _R( _d, _e, _a, _b, _c, F3, 52, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68, W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _c, _d, _e, _a, _b, F3, 53, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68, W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _b, _c, _d, _e, _a, F3, 54, WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68, W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _a, _b, _c, _d, _e, F3, 55, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68, W7, W0, W1, W2, W3, W4, W5, W6);
+
+ _R( _e, _a, _b, _c, _d, F3, 56, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72, W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _d, _e, _a, _b, _c, F3, 57, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72, W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _c, _d, _e, _a, _b, F3, 58, WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72, W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _b, _c, _d, _e, _a, F3, 59, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72, W6, W7, W0, W1, W2, W3, W4, W5);
+
+ subs RNBLKS, #1;
+
+ _R( _a, _b, _c, _d, _e, F4, 60, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76, W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _e, _a, _b, _c, _d, F4, 61, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76, W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _d, _e, _a, _b, _c, F4, 62, WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76, W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _c, _d, _e, _a, _b, F4, 63, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76, W5, W6, W7, W0, W1, W2, W3, W4);
+
+ beq .Lend;
+
+ /* Transform 64-79 + Precalc 0-15 of next block. */
+#undef curK
+#define curK qK1
+ _R( _b, _c, _d, _e, _a, F4, 64, WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F4, 65, WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F4, 66, WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F4, 67, WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+ _R( _c, _d, _e, _a, _b, F4, 68, dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F4, 69, dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F4, 70, WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F4, 71, WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+ _R( _d, _e, _a, _b, _c, F4, 72, dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F4, 73, dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F4, 74, WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F4, 75, WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+ _R( _e, _a, _b, _c, _d, F4, 76, WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F4, 77, WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F4, 78, WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F4, 79, WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );
+
+ /* Update the chaining variables. */
+ ldm RSTATE, {RT0-RT3};
+ add _a, RT0;
+ ldr RT0, [RSTATE, #state_h4];
+ add _b, RT1;
+ add _c, RT2;
+ add _d, RT3;
+ add _e, RT0;
+ stm RSTATE, {_a-_e};
+
+ b .Loop;
+
+.ltorg
+.Lend:
+	/* Transform 64-79 + Clear NEON registers. */
+ R( _b, _c, _d, _e, _a, F4, 64 );
+ R( _a, _b, _c, _d, _e, F4, 65 ); CLEAR_REG(tmp0);
+ R( _e, _a, _b, _c, _d, F4, 66 ); CLEAR_REG(tmp1);
+ R( _d, _e, _a, _b, _c, F4, 67 ); CLEAR_REG(W0);
+ R( _c, _d, _e, _a, _b, F4, 68 ); CLEAR_REG(W1);
+ R( _b, _c, _d, _e, _a, F4, 69 ); CLEAR_REG(W2);
+ R( _a, _b, _c, _d, _e, F4, 70 ); CLEAR_REG(W3);
+ R( _e, _a, _b, _c, _d, F4, 71 ); CLEAR_REG(W4);
+ R( _d, _e, _a, _b, _c, F4, 72 ); CLEAR_REG(W5);
+ R( _c, _d, _e, _a, _b, F4, 73 ); CLEAR_REG(W6);
+ R( _b, _c, _d, _e, _a, F4, 74 ); CLEAR_REG(W7);
+ R( _a, _b, _c, _d, _e, F4, 75 );
+ R( _e, _a, _b, _c, _d, F4, 76 );
+ R( _d, _e, _a, _b, _c, F4, 77 );
+ R( _c, _d, _e, _a, _b, F4, 78 );
+ R( _b, _c, _d, _e, _a, F4, 79 );
+
+ mov sp, ROLDSTACK;
+
+ /* Update the chaining variables. */
+ ldm RSTATE, {RT0-RT3};
+ add _a, RT0;
+ ldr RT0, [RSTATE, #state_h4];
+ add _b, RT1;
+ add _c, RT2;
+ add _d, RT3;
+ vpop {q4-q7};
+ add _e, RT0;
+ stm RSTATE, {_a-_e};
+
+ /* burn_stack */
+ mov r0, #(16*4 + 16*4 + 15);
+
+ pop {r4-r12, pc};
+
+.Ldo_nothing:
+ mov r0, #0;
+ bx lr
+.size _gcry_sha1_transform_armv7_neon,.-_gcry_sha1_transform_armv7_neon;
+
+#endif
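
For reference, the scalar SHA-1 round logic that the _R/R round macros above interleave with the NEON message-schedule precalculation can be sketched in plain C as follows; this is only an illustrative sketch (names are not taken from libgcrypt) and it assumes the 80-word schedule w[] has already been expanded:

#include <stdint.h>

static inline uint32_t rol32 (uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

/* One 64-byte block.  F1/K1 cover rounds 0-19, F2/K2 rounds 20-39,
   F3/K3 rounds 40-59 and F4/K4 rounds 60-79, matching the F1..F4 and
   qK1..qK4 groups used by the macros above. */
static void
sha1_block_sketch (uint32_t h[5], const uint32_t w[80])
{
  uint32_t a = h[0], b = h[1], c = h[2], d = h[3], e = h[4];
  int i;

  for (i = 0; i < 80; i++)
    {
      uint32_t f, k, t;
      if (i < 20)      { f = d ^ (b & (c ^ d));       k = 0x5A827999; } /* F1 */
      else if (i < 40) { f = b ^ c ^ d;               k = 0x6ED9EBA1; } /* F2 */
      else if (i < 60) { f = (b & c) | (d & (b ^ c)); k = 0x8F1BBCDC; } /* F3 */
      else             { f = b ^ c ^ d;               k = 0xCA62C1D6; } /* F4 */
      t = rol32 (a, 5) + f + e + k + w[i];
      e = d; d = c; c = rol32 (b, 30); b = a; a = t;
    }

  /* "Update the chaining variables": the same effect as the ldm/add/stm
     sequence at the end of the assembly loop above. */
  h[0] += a; h[1] += b; h[2] += c; h[3] += d; h[4] += e;
}
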
diff --git a/comm/third_party/libgcrypt/cipher/sha1-armv8-aarch32-ce.S b/comm/third_party/libgcrypt/cipher/sha1-armv8-aarch32-ce.S
new file mode 100644
index 0000000000..bf2b233b01
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha1-armv8-aarch32-ce.S
@@ -0,0 +1,220 @@
+/* sha1-armv8-aarch32-ce.S - ARM/CE accelerated SHA-1 transform function
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO) && defined(USE_SHA1)
+
+.syntax unified
+.arch armv8-a
+.fpu crypto-neon-fp-armv8
+.arm
+
+.text
+
+#ifdef __PIC__
+# define GET_DATA_POINTER(reg, name, rtmp) \
+ ldr reg, 1f; \
+ ldr rtmp, 2f; \
+ b 3f; \
+ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+ 2: .word name(GOT); \
+ 3: add reg, pc, reg; \
+ ldr reg, [reg, rtmp];
+#else
+# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+
+/* Constants */
+
+#define K1 0x5A827999
+#define K2 0x6ED9EBA1
+#define K3 0x8F1BBCDC
+#define K4 0xCA62C1D6
+.align 4
+gcry_sha1_aarch32_ce_K_VEC:
+.LK_VEC:
+.LK1: .long K1, K1, K1, K1
+.LK2: .long K2, K2, K2, K2
+.LK3: .long K3, K3, K3, K3
+.LK4: .long K4, K4, K4, K4
+
+
+/* Register macros */
+
+#define qH4 q0
+#define sH4 s0
+#define qH0123 q1
+
+#define qABCD q2
+#define qE0 q3
+#define qE1 q4
+
+#define qT0 q5
+#define qT1 q6
+
+#define qW0 q8
+#define qW1 q9
+#define qW2 q10
+#define qW3 q11
+
+#define qK1 q12
+#define qK2 q13
+#define qK3 q14
+#define qK4 q15
+
+
+/* Round macros */
+
+#define _(...) /*_*/
+#define do_add(dst, src0, src1) vadd.u32 dst, src0, src1;
+#define do_sha1su0(w0,w1,w2) sha1su0.32 w0,w1,w2;
+#define do_sha1su1(w0,w3) sha1su1.32 w0,w3;
+
+#define do_rounds(f, e0, e1, t, k, w0, w1, w2, w3, add_fn, sha1su0_fn, sha1su1_fn) \
+ sha1su1_fn( w3, w2 ); \
+ sha1h.32 e0, qABCD; \
+ sha1##f.32 qABCD, e1, t; \
+ add_fn( t, w2, k ); \
+ sha1su0_fn( w0, w1, w2 );
+
+
+/* Other functional macros */
+
+#define CLEAR_REG(reg) veor reg, reg;
+
+
+/*
+ * unsigned int
+ * _gcry_sha1_transform_armv8_ce (void *ctx, const unsigned char *data,
+ * size_t nblks)
+ */
+.align 3
+.globl _gcry_sha1_transform_armv8_ce
+.type _gcry_sha1_transform_armv8_ce,%function;
+_gcry_sha1_transform_armv8_ce:
+ /* input:
+ * r0: ctx, CTX
+ * r1: data (64*nblks bytes)
+ * r2: nblks
+ */
+
+ cmp r2, #0;
+ push {r4,lr};
+ beq .Ldo_nothing;
+
+ vpush {q4-q7};
+
+ GET_DATA_POINTER(r4, .LK_VEC, lr);
+
+ veor qH4, qH4
+ vld1.32 {qH0123}, [r0] /* load h0,h1,h2,h3 */
+
+ vld1.32 {qK1-qK2}, [r4]! /* load K1,K2 */
+ vldr sH4, [r0, #16] /* load h4 */
+ vld1.32 {qK3-qK4}, [r4] /* load K3,K4 */
+
+ vld1.8 {qW0-qW1}, [r1]!
+ vmov qABCD, qH0123
+ vld1.8 {qW2-qW3}, [r1]!
+
+ vrev32.8 qW0, qW0
+ vrev32.8 qW1, qW1
+ vrev32.8 qW2, qW2
+ do_add(qT0, qW0, qK1)
+ vrev32.8 qW3, qW3
+ do_add(qT1, qW1, qK1)
+
+.Loop:
+ do_rounds(c, qE1, qH4, qT0, qK1, qW0, qW1, qW2, qW3, do_add, do_sha1su0, _)
+ subs r2, r2, #1
+ do_rounds(c, qE0, qE1, qT1, qK1, qW1, qW2, qW3, qW0, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(c, qE1, qE0, qT0, qK1, qW2, qW3, qW0, qW1, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(c, qE0, qE1, qT1, qK2, qW3, qW0, qW1, qW2, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(c, qE1, qE0, qT0, qK2, qW0, qW1, qW2, qW3, do_add, do_sha1su0, do_sha1su1)
+
+ do_rounds(p, qE0, qE1, qT1, qK2, qW1, qW2, qW3, qW0, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(p, qE1, qE0, qT0, qK2, qW2, qW3, qW0, qW1, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(p, qE0, qE1, qT1, qK2, qW3, qW0, qW1, qW2, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(p, qE1, qE0, qT0, qK3, qW0, qW1, qW2, qW3, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(p, qE0, qE1, qT1, qK3, qW1, qW2, qW3, qW0, do_add, do_sha1su0, do_sha1su1)
+
+ do_rounds(m, qE1, qE0, qT0, qK3, qW2, qW3, qW0, qW1, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(m, qE0, qE1, qT1, qK3, qW3, qW0, qW1, qW2, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(m, qE1, qE0, qT0, qK3, qW0, qW1, qW2, qW3, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(m, qE0, qE1, qT1, qK4, qW1, qW2, qW3, qW0, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(m, qE1, qE0, qT0, qK4, qW2, qW3, qW0, qW1, do_add, do_sha1su0, do_sha1su1)
+
+ do_rounds(p, qE0, qE1, qT1, qK4, qW3, qW0, qW1, qW2, do_add, do_sha1su0, do_sha1su1)
+ beq .Lend
+
+ vld1.8 {qW0-qW1}, [r1]! /* preload */
+ do_rounds(p, qE1, qE0, qT0, qK4, _ , _ , qW2, qW3, do_add, _, do_sha1su1)
+ vrev32.8 qW0, qW0
+ vld1.8 {qW2}, [r1]!
+ vrev32.8 qW1, qW1
+ do_rounds(p, qE0, qE1, qT1, qK4, _ , _ , qW3, _ , do_add, _, _)
+ vld1.8 {qW3}, [r1]!
+ vrev32.8 qW2, qW2
+ do_rounds(p, qE1, qE0, qT0, _, _, _, _, _, _, _, _)
+ vrev32.8 qW3, qW3
+ do_rounds(p, qE0, qE1, qT1, _, _, _, _, _, _, _, _)
+
+ do_add(qT0, qW0, qK1)
+ vadd.u32 qH4, qE0
+ vadd.u32 qABCD, qH0123
+ do_add(qT1, qW1, qK1)
+
+ vmov qH0123, qABCD
+
+ b .Loop
+
+.Lend:
+ do_rounds(p, qE1, qE0, qT0, qK4, _ , _ , qW2, qW3, do_add, _, do_sha1su1)
+ do_rounds(p, qE0, qE1, qT1, qK4, _ , _ , qW3, _ , do_add, _, _)
+ do_rounds(p, qE1, qE0, qT0, _, _, _, _, _, _, _, _)
+ do_rounds(p, qE0, qE1, qT1, _, _, _, _, _, _, _, _)
+
+ vadd.u32 qH4, qE0
+ vadd.u32 qH0123, qABCD
+
+ CLEAR_REG(qW0)
+ CLEAR_REG(qW1)
+ CLEAR_REG(qW2)
+ CLEAR_REG(qW3)
+ CLEAR_REG(qABCD)
+ CLEAR_REG(qE1)
+ CLEAR_REG(qE0)
+
+ vstr sH4, [r0, #16] /* store h4 */
+ vst1.32 {qH0123}, [r0] /* store h0,h1,h2,h3 */
+
+ CLEAR_REG(qH0123)
+ CLEAR_REG(qH4)
+ vpop {q4-q7}
+
+.Ldo_nothing:
+ mov r0, #0
+ pop {r4,pc}
+.size _gcry_sha1_transform_armv8_ce,.-_gcry_sha1_transform_armv8_ce;
+
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/sha1-armv8-aarch64-ce.S b/comm/third_party/libgcrypt/cipher/sha1-armv8-aarch64-ce.S
new file mode 100644
index 0000000000..223268cad2
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha1-armv8-aarch64-ce.S
@@ -0,0 +1,201 @@
+/* sha1-armv8-aarch64-ce.S - ARM/CE accelerated SHA-1 transform function
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && defined(USE_SHA1)
+
+.cpu generic+simd+crypto
+
+.text
+
+
+/* Constants */
+
+#define K1 0x5A827999
+#define K2 0x6ED9EBA1
+#define K3 0x8F1BBCDC
+#define K4 0xCA62C1D6
+.align 4
+gcry_sha1_aarch64_ce_K_VEC:
+.LK_VEC:
+.LK1: .long K1, K1, K1, K1
+.LK2: .long K2, K2, K2, K2
+.LK3: .long K3, K3, K3, K3
+.LK4: .long K4, K4, K4, K4
+
+
+/* Register macros */
+
+#define sH4 s0
+#define vH4 v0
+#define vH0123 v1
+
+#define qABCD q2
+#define sABCD s2
+#define vABCD v2
+#define sE0 s3
+#define vE0 v3
+#define sE1 s4
+#define vE1 v4
+
+#define vT0 v5
+#define vT1 v6
+
+#define vW0 v16
+#define vW1 v17
+#define vW2 v18
+#define vW3 v19
+
+#define vK1 v20
+#define vK2 v21
+#define vK3 v22
+#define vK4 v23
+
+
+/* Round macros */
+
+#define _(...) /*_*/
+#define do_add(dst, src0, src1) add dst.4s, src0.4s, src1.4s;
+#define do_sha1su0(w0,w1,w2) sha1su0 w0.4s,w1.4s,w2.4s;
+#define do_sha1su1(w0,w3) sha1su1 w0.4s,w3.4s;
+
+#define do_rounds(f, e0, e1, t, k, w0, w1, w2, w3, add_fn, sha1su0_fn, sha1su1_fn) \
+ sha1su1_fn( v##w3, v##w2 ); \
+ sha1h e0, sABCD; \
+ sha1##f qABCD, e1, v##t.4s; \
+ add_fn( v##t, v##w2, v##k ); \
+ sha1su0_fn( v##w0, v##w1, v##w2 );
+
+
+/* Other functional macros */
+
+#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b;
+
+
+/*
+ * unsigned int
+ * _gcry_sha1_transform_armv8_ce (void *ctx, const unsigned char *data,
+ * size_t nblks)
+ */
+.align 3
+.globl _gcry_sha1_transform_armv8_ce
+ELF(.type _gcry_sha1_transform_armv8_ce,%function;)
+_gcry_sha1_transform_armv8_ce:
+ /* input:
+ * x0: ctx, CTX
+ * x1: data (64*nblks bytes)
+ * x2: nblks
+ */
+ CFI_STARTPROC();
+
+ cbz x2, .Ldo_nothing;
+
+ GET_DATA_POINTER(x4, .LK_VEC);
+
+ ld1 {vH0123.4s}, [x0] /* load h0,h1,h2,h3 */
+ ld1 {vK1.4s-vK4.4s}, [x4] /* load K1,K2,K3,K4 */
+ ldr sH4, [x0, #16] /* load h4 */
+
+ ld1 {vW0.16b-vW3.16b}, [x1], #64
+ mov vABCD.16b, vH0123.16b
+
+ rev32 vW0.16b, vW0.16b
+ rev32 vW1.16b, vW1.16b
+ rev32 vW2.16b, vW2.16b
+ do_add(vT0, vW0, vK1)
+ rev32 vW3.16b, vW3.16b
+ do_add(vT1, vW1, vK1)
+
+.Loop:
+ do_rounds(c, sE1, sH4, T0, K1, W0, W1, W2, W3, do_add, do_sha1su0, _)
+ sub x2, x2, #1
+ do_rounds(c, sE0, sE1, T1, K1, W1, W2, W3, W0, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(c, sE1, sE0, T0, K1, W2, W3, W0, W1, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(c, sE0, sE1, T1, K2, W3, W0, W1, W2, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(c, sE1, sE0, T0, K2, W0, W1, W2, W3, do_add, do_sha1su0, do_sha1su1)
+
+ do_rounds(p, sE0, sE1, T1, K2, W1, W2, W3, W0, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(p, sE1, sE0, T0, K2, W2, W3, W0, W1, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(p, sE0, sE1, T1, K2, W3, W0, W1, W2, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(p, sE1, sE0, T0, K3, W0, W1, W2, W3, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(p, sE0, sE1, T1, K3, W1, W2, W3, W0, do_add, do_sha1su0, do_sha1su1)
+
+ do_rounds(m, sE1, sE0, T0, K3, W2, W3, W0, W1, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(m, sE0, sE1, T1, K3, W3, W0, W1, W2, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(m, sE1, sE0, T0, K3, W0, W1, W2, W3, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(m, sE0, sE1, T1, K4, W1, W2, W3, W0, do_add, do_sha1su0, do_sha1su1)
+ do_rounds(m, sE1, sE0, T0, K4, W2, W3, W0, W1, do_add, do_sha1su0, do_sha1su1)
+
+ do_rounds(p, sE0, sE1, T1, K4, W3, W0, W1, W2, do_add, do_sha1su0, do_sha1su1)
+ cbz x2, .Lend
+
+ ld1 {vW0.16b-vW1.16b}, [x1], #32 /* preload */
+ do_rounds(p, sE1, sE0, T0, K4, _ , _ , W2, W3, do_add, _, do_sha1su1)
+ rev32 vW0.16b, vW0.16b
+ ld1 {vW2.16b}, [x1], #16
+ rev32 vW1.16b, vW1.16b
+ do_rounds(p, sE0, sE1, T1, K4, _ , _ , W3, _ , do_add, _, _)
+ ld1 {vW3.16b}, [x1], #16
+ rev32 vW2.16b, vW2.16b
+ do_rounds(p, sE1, sE0, T0, _, _, _, _, _, _, _, _)
+ rev32 vW3.16b, vW3.16b
+ do_rounds(p, sE0, sE1, T1, _, _, _, _, _, _, _, _)
+
+ do_add(vT0, vW0, vK1)
+ add vH4.2s, vH4.2s, vE0.2s
+ add vABCD.4s, vABCD.4s, vH0123.4s
+ do_add(vT1, vW1, vK1)
+
+ mov vH0123.16b, vABCD.16b
+
+ b .Loop
+
+.Lend:
+ do_rounds(p, sE1, sE0, T0, K4, _ , _ , W2, W3, do_add, _, do_sha1su1)
+ do_rounds(p, sE0, sE1, T1, K4, _ , _ , W3, _ , do_add, _, _)
+ do_rounds(p, sE1, sE0, T0, _, _, _, _, _, _, _, _)
+ do_rounds(p, sE0, sE1, T1, _, _, _, _, _, _, _, _)
+
+ add vH4.2s, vH4.2s, vE0.2s
+ add vH0123.4s, vH0123.4s, vABCD.4s
+
+ CLEAR_REG(vW0)
+ CLEAR_REG(vW1)
+ CLEAR_REG(vW2)
+ CLEAR_REG(vW3)
+ CLEAR_REG(vABCD)
+ CLEAR_REG(vE1)
+ CLEAR_REG(vE0)
+
+ str sH4, [x0, #16] /* store h4 */
+ st1 {vH0123.4s}, [x0] /* store h0,h1,h2,h3 */
+
+ CLEAR_REG(vH0123)
+ CLEAR_REG(vH4)
+
+.Ldo_nothing:
+ mov x0, #0
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_sha1_transform_armv8_ce,.-_gcry_sha1_transform_armv8_ce;)
+
+#endif
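
The do_rounds macro used by both Crypto Extensions files maps directly onto the ACLE SHA-1 intrinsics from <arm_neon.h>. A hedged sketch of a single invocation (four rounds plus one schedule update and one W+K precompute), with illustrative names and assuming a target built with the ARMv8 SHA-1 crypto extension:

#include <arm_neon.h>  /* assumes a target with the ARMv8 SHA-1 extension */

/* Roughly one do_rounds(c, ...) step.  For the 'p' and 'm' round groups
   the vsha1cq_u32 call is replaced by vsha1pq_u32 or vsha1mq_u32, just as
   the macro substitutes sha1p/sha1m for sha1c. */
static void
sha1_ce_step_sketch (uint32x4_t *abcd, uint32_t *e0, uint32_t e1,
                     uint32x4_t *w0, uint32x4_t w1, uint32x4_t w2,
                     uint32x4_t *w3, uint32x4_t *wk, uint32x4_t k)
{
  *w3 = vsha1su1q_u32 (*w3, w2);                 /* sha1su1_fn          */
  *e0 = vsha1h_u32 (vgetq_lane_u32 (*abcd, 0));  /* sha1h  e0, sABCD    */
  *abcd = vsha1cq_u32 (*abcd, e1, *wk);          /* sha1c  qABCD, e1, t */
  *wk = vaddq_u32 (w2, k);                       /* add_fn: next W+K    */
  *w0 = vsha1su0q_u32 (*w0, w1, w2);             /* sha1su0_fn          */
}
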
diff --git a/comm/third_party/libgcrypt/cipher/sha1-avx-amd64.S b/comm/third_party/libgcrypt/cipher/sha1-avx-amd64.S
new file mode 100644
index 0000000000..85876ad418
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha1-avx-amd64.S
@@ -0,0 +1,429 @@
+/* sha1-avx-amd64.S - Intel AVX accelerated SHA-1 transform function
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * Based on sha1.c:
+ * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Intel AVX accelerated SHA-1 implementation, based on the SSSE3 white paper:
+ * "Improving the Performance of the Secure Hash Algorithm (SHA-1)"
+ * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
+ */
+
+#ifdef __x86_64__
+#include <config.h>
+
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA1)
+
+#include "asm-common-amd64.h"
+
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+
+
+/* Constants */
+
+.text
+#define K1 0x5A827999
+#define K2 0x6ED9EBA1
+#define K3 0x8F1BBCDC
+#define K4 0xCA62C1D6
+.align 16
+.LK_XMM:
+.LK1: .long K1, K1, K1, K1
+.LK2: .long K2, K2, K2, K2
+.LK3: .long K3, K3, K3, K3
+.LK4: .long K4, K4, K4, K4
+
+.Lbswap_shufb_ctl:
+ .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
+
+
+/* Register macros */
+
+#define RSTATE %r8
+#define RDATA %r9
+#define ROLDSTACK %r10
+#define RNBLKS %r11
+
+#define a %eax
+#define b %ebx
+#define c %ecx
+#define d %edx
+#define e %edi
+
+#define RT0 %esi
+#define RT1 %ebp
+
+#define Wtmp0 %xmm0
+#define Wtmp1 %xmm1
+
+#define W0 %xmm2
+#define W1 %xmm3
+#define W2 %xmm4
+#define W3 %xmm5
+#define W4 %xmm6
+#define W5 %xmm7
+#define W6 %xmm8
+#define W7 %xmm9
+
+#define BSWAP_REG %xmm10
+
+
+/* Round function macros. */
+
+#define WK(i) (((i) & 15) * 4)(%rsp)
+
+#define R_F1(a,b,c,d,e,i) \
+ movl c, RT0; \
+ addl WK(i), e; \
+ xorl d, RT0; \
+ movl a, RT1; \
+ andl b, RT0; \
+ shldl $30, b, b; \
+ xorl d, RT0; \
+ leal (RT0,e), e; \
+ shldl $5, RT1, RT1; \
+ addl RT1, e;
+
+#define R_F2(a,b,c,d,e,i) \
+ movl c, RT0; \
+ addl WK(i), e; \
+ xorl b, RT0; \
+ shldl $30, b, b; \
+ xorl d, RT0; \
+ movl a, RT1; \
+ leal (RT0,e), e; \
+ shldl $5, RT1, RT1; \
+ addl RT1, e;
+
+#define R_F3(a,b,c,d,e,i) \
+ movl c, RT0; \
+ movl b, RT1; \
+ xorl b, RT0; \
+ andl c, RT1; \
+ andl d, RT0; \
+ addl RT1, e; \
+ addl WK(i), e; \
+ shldl $30, b, b; \
+ movl a, RT1; \
+ leal (RT0,e), e; \
+ shldl $5, RT1, RT1; \
+ addl RT1, e;
+
+#define R_F4(a,b,c,d,e,i) R_F2(a,b,c,d,e,i)
+
+#define R(a,b,c,d,e,f,i) \
+ R_##f(a,b,c,d,e,i)
+
+
+/* Input expansion macros. */
+
+#define W_PRECALC_00_15_0(i, W, tmp0) \
+ vmovdqu (4*(i))(RDATA), tmp0;
+
+#define W_PRECALC_00_15_1(i, W, tmp0) \
+ vpshufb BSWAP_REG, tmp0, W;
+
+#define W_PRECALC_00_15_2(i, W, tmp0) \
+ vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0;
+
+#define W_PRECALC_00_15_3(i, W, tmp0) \
+ vmovdqa tmp0, WK(i&~3);
+
+#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ vpalignr $8, W_m16, W_m12, W; \
+ vpsrldq $4, W_m04, tmp0; \
+ vpxor W_m08, W, W;
+
+#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ vpxor W_m16, tmp0, tmp0; \
+ vpxor tmp0, W, W; \
+ vpslld $1, W, tmp0; \
+ vpslldq $12, W, tmp1; \
+ vpsrld $31, W, W;
+
+#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ vpor W, tmp0, tmp0; \
+ vpsrld $30, tmp1, W; \
+ vpslld $2, tmp1, tmp1;
+
+#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ vpxor W, tmp0, tmp0; \
+ vpxor tmp1, tmp0, W; \
+ vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0; \
+ vmovdqa tmp0, WK((i)&~3);
+
+#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ vpxor W_m28, W, W; \
+ vpalignr $8, W_m08, W_m04, tmp0;
+
+#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ vpxor W_m16, W, W; \
+ vpxor tmp0, W, W;
+
+#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ vpsrld $30, W, tmp0; \
+ vpslld $2, W, W;
+
+#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ vpor W, tmp0, W; \
+ vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0; \
+ vmovdqa tmp0, WK((i)&~3);
+
+
+/*
+ * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
+ *
+ * unsigned int
+ * _gcry_sha1_transform_amd64_avx (void *ctx, const unsigned char *data,
+ * size_t nblks)
+ */
+.globl _gcry_sha1_transform_amd64_avx
+ELF(.type _gcry_sha1_transform_amd64_avx,@function)
+.align 16
+_gcry_sha1_transform_amd64_avx:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: data (64*nblks bytes)
+ * %rdx: nblks
+ */
+ CFI_STARTPROC();
+
+ xorl %eax, %eax;
+ cmpq $0, %rdx;
+ jz .Lret;
+
+ vzeroupper;
+
+ movq %rdx, RNBLKS;
+ movq %rdi, RSTATE;
+ movq %rsi, RDATA;
+ pushq %rbx;
+ CFI_PUSH(%rbx);
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+
+ movq %rsp, ROLDSTACK;
+ CFI_DEF_CFA_REGISTER(ROLDSTACK);
+
+ subq $(16*4), %rsp;
+ andq $(~31), %rsp;
+
+ /* Get the values of the chaining variables. */
+ movl state_h0(RSTATE), a;
+ movl state_h1(RSTATE), b;
+ movl state_h2(RSTATE), c;
+ movl state_h3(RSTATE), d;
+ movl state_h4(RSTATE), e;
+
+ vmovdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG;
+
+ /* Precalc 0-15. */
+ W_PRECALC_00_15_0(0, W0, Wtmp0);
+ W_PRECALC_00_15_1(1, W0, Wtmp0);
+ W_PRECALC_00_15_2(2, W0, Wtmp0);
+ W_PRECALC_00_15_3(3, W0, Wtmp0);
+ W_PRECALC_00_15_0(4, W7, Wtmp0);
+ W_PRECALC_00_15_1(5, W7, Wtmp0);
+ W_PRECALC_00_15_2(6, W7, Wtmp0);
+ W_PRECALC_00_15_3(7, W7, Wtmp0);
+ W_PRECALC_00_15_0(8, W6, Wtmp0);
+ W_PRECALC_00_15_1(9, W6, Wtmp0);
+ W_PRECALC_00_15_2(10, W6, Wtmp0);
+ W_PRECALC_00_15_3(11, W6, Wtmp0);
+ W_PRECALC_00_15_0(12, W5, Wtmp0);
+ W_PRECALC_00_15_1(13, W5, Wtmp0);
+ W_PRECALC_00_15_2(14, W5, Wtmp0);
+ W_PRECALC_00_15_3(15, W5, Wtmp0);
+
+.align 8
+.Loop:
+ addq $64, RDATA;
+
+ /* Transform 0-15 + Precalc 16-31. */
+ R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( d, e, a, b, c, F1, 2 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( b, c, d, e, a, F1, 4 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F1, 5 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F1, 6 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( c, d, e, a, b, F1, 8 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( b, c, d, e, a, F1, 9 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F1, 10 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( d, e, a, b, c, F1, 12 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( c, d, e, a, b, F1, 13 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( b, c, d, e, a, F1, 14 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+
+ /* Transform 16-63 + Precalc 32-79. */
+ R( e, a, b, c, d, F1, 16 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( d, e, a, b, c, F1, 17 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( c, d, e, a, b, F1, 18 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( a, b, c, d, e, F2, 20 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( e, a, b, c, d, F2, 21 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( d, e, a, b, c, F2, 22 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( b, c, d, e, a, F2, 24 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( a, b, c, d, e, F2, 25 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( e, a, b, c, d, F2, 26 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( c, d, e, a, b, F2, 28 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( b, c, d, e, a, F2, 29 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( a, b, c, d, e, F2, 30 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( d, e, a, b, c, F2, 32 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( c, d, e, a, b, F2, 33 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( b, c, d, e, a, F2, 34 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( e, a, b, c, d, F2, 36 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( d, e, a, b, c, F2, 37 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( c, d, e, a, b, F2, 38 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( a, b, c, d, e, F3, 40 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( e, a, b, c, d, F3, 41 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( d, e, a, b, c, F3, 42 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( b, c, d, e, a, F3, 44 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( a, b, c, d, e, F3, 45 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( e, a, b, c, d, F3, 46 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( c, d, e, a, b, F3, 48 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( b, c, d, e, a, F3, 49 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( a, b, c, d, e, F3, 50 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( d, e, a, b, c, F3, 52 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( c, d, e, a, b, F3, 53 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( b, c, d, e, a, F3, 54 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( a, b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( e, a, b, c, d, F3, 56 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( d, e, a, b, c, F3, 57 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( c, d, e, a, b, F3, 58 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( a, b, c, d, e, F4, 60 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( e, a, b, c, d, F4, 61 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+
+ decq RNBLKS;
+ jz .Lend;
+
+ /* Transform 64-79 + Precalc 0-15 of next block. */
+ R( b, c, d, e, a, F4, 64 ); W_PRECALC_00_15_0(0, W0, Wtmp0);
+ R( a, b, c, d, e, F4, 65 ); W_PRECALC_00_15_1(1, W0, Wtmp0);
+ R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0);
+ R( d, e, a, b, c, F4, 67 ); W_PRECALC_00_15_3(3, W0, Wtmp0);
+ R( c, d, e, a, b, F4, 68 ); W_PRECALC_00_15_0(4, W7, Wtmp0);
+ R( b, c, d, e, a, F4, 69 ); W_PRECALC_00_15_1(5, W7, Wtmp0);
+ R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0);
+ R( e, a, b, c, d, F4, 71 ); W_PRECALC_00_15_3(7, W7, Wtmp0);
+ R( d, e, a, b, c, F4, 72 ); W_PRECALC_00_15_0(8, W6, Wtmp0);
+ R( c, d, e, a, b, F4, 73 ); W_PRECALC_00_15_1(9, W6, Wtmp0);
+ R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0);
+ R( a, b, c, d, e, F4, 75 ); W_PRECALC_00_15_3(11, W6, Wtmp0);
+ R( e, a, b, c, d, F4, 76 ); W_PRECALC_00_15_0(12, W5, Wtmp0);
+ R( d, e, a, b, c, F4, 77 ); W_PRECALC_00_15_1(13, W5, Wtmp0);
+ R( c, d, e, a, b, F4, 78 );
+ addl state_h0(RSTATE), a; W_PRECALC_00_15_2(14, W5, Wtmp0);
+ R( b, c, d, e, a, F4, 79 ); W_PRECALC_00_15_3(15, W5, Wtmp0);
+
+ /* Update the chaining variables. */
+ addl state_h3(RSTATE), d;
+ addl state_h2(RSTATE), c;
+ addl state_h1(RSTATE), b;
+ addl state_h4(RSTATE), e;
+
+ movl d, state_h3(RSTATE);
+ movl c, state_h2(RSTATE);
+ movl b, state_h1(RSTATE);
+ movl a, state_h0(RSTATE);
+ movl e, state_h4(RSTATE);
+
+ jmp .Loop;
+
+.align 16
+.Lend:
+ vzeroall;
+
+ /* Transform 64-79 + burn stack */
+ R( b, c, d, e, a, F4, 64 );
+ R( a, b, c, d, e, F4, 65 );
+ R( e, a, b, c, d, F4, 66 );
+ R( d, e, a, b, c, F4, 67 );
+ R( c, d, e, a, b, F4, 68 );
+ R( b, c, d, e, a, F4, 69 );
+ R( a, b, c, d, e, F4, 70 );
+ R( e, a, b, c, d, F4, 71 );
+ R( d, e, a, b, c, F4, 72 );
+ R( c, d, e, a, b, F4, 73 );
+ R( b, c, d, e, a, F4, 74 );
+ R( a, b, c, d, e, F4, 75 );
+ R( e, a, b, c, d, F4, 76 ); vmovdqa %xmm0, (0*16)(%rsp);
+ R( d, e, a, b, c, F4, 77 ); vmovdqa %xmm0, (1*16)(%rsp);
+ R( c, d, e, a, b, F4, 78 ); vmovdqa %xmm0, (2*16)(%rsp);
+ addl state_h0(RSTATE), a;
+ R( b, c, d, e, a, F4, 79 );
+
+ /* 16*4/16-1 = 3 */
+ vmovdqa %xmm0, (3*16)(%rsp);
+
+ /* Update the chaining variables. */
+ addl state_h3(RSTATE), d;
+ addl state_h2(RSTATE), c;
+ addl state_h1(RSTATE), b;
+ addl state_h4(RSTATE), e;
+
+ movl d, state_h3(RSTATE);
+ movl c, state_h2(RSTATE);
+ movl b, state_h1(RSTATE);
+ movl a, state_h0(RSTATE);
+ movl e, state_h4(RSTATE);
+
+ movq ROLDSTACK, %rsp;
+ CFI_REGISTER(ROLDSTACK, %rsp);
+ CFI_DEF_CFA_REGISTER(%rsp);
+
+ popq %rbp;
+ CFI_POP(%rbp);
+ popq %rbx;
+ CFI_POP(%rbx);
+
+ /* stack already burned */
+ xorl %eax, %eax;
+
+.Lret:
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_sha1_transform_amd64_avx,
+ .-_gcry_sha1_transform_amd64_avx;)
+
+#endif
+#endif
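
The W_PRECALC_16_31_* and W_PRECALC_32_79_* macros above compute the SHA-1 message expansion four words at a time and park W+K on the stack in the 16-entry WK(i) ring. A scalar reference sketch of what they compute (illustrative names, plain C):

#include <stdint.h>

static inline uint32_t rol32 (uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

/* For i >= 32 the macros use the equivalent form
   w[i] = rol32 (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2)
   so that four consecutive words can be produced by one vector step
   without an intra-vector dependency; the result is the same. */
static void
sha1_expand_sketch (uint32_t w[80], uint32_t wk[80])
{
  static const uint32_t k[4] =
    { 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6 };
  int i;

  for (i = 16; i < 80; i++)
    {
      w[i] = rol32 (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
      wk[i] = w[i] + k[i / 20];
    }
}
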
diff --git a/comm/third_party/libgcrypt/cipher/sha1-avx-bmi2-amd64.S b/comm/third_party/libgcrypt/cipher/sha1-avx-bmi2-amd64.S
new file mode 100644
index 0000000000..5dfcdca979
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha1-avx-bmi2-amd64.S
@@ -0,0 +1,441 @@
+/* sha1-avx-bmi2-amd64.S - Intel AVX/BMI2 accelerated SHA-1 transform function
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * Based on sha1.c:
+ * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Intel AVX/BMI2 accelerated SHA-1 implementation, based on the SSSE3 white paper:
+ * "Improving the Performance of the Secure Hash Algorithm (SHA-1)"
+ * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
+ */
+
+#ifdef __x86_64__
+#include <config.h>
+
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(HAVE_GCC_INLINE_ASM_BMI2) && \
+ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA1)
+
+#include "asm-common-amd64.h"
+
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+
+
+/* Constants */
+
+.text
+.align 16
+.Lbswap_shufb_ctl:
+ .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
+
+.LK1: .long 0x5A827999
+.LK2: .long 0x6ED9EBA1
+.LK3: .long 0x8F1BBCDC
+.LK4: .long 0xCA62C1D6
+
+
+/* Register macros */
+
+#define RSTATE %r8
+#define RDATA %r9
+#define ROLDSTACK %r10
+#define RNBLKS %r11
+
+#define a %esi
+#define b %edi
+#define c %ebp
+#define d %edx
+#define e %ecx
+#define ne %ebx
+
+#define RT0 %eax
+#define RT1 %r12d
+
+#define Wtmp0 %xmm0
+#define Wtmp1 %xmm1
+
+#define W0 %xmm2
+#define W1 %xmm3
+#define W2 %xmm4
+#define W3 %xmm5
+#define W4 %xmm6
+#define W5 %xmm7
+#define W6 %xmm8
+#define W7 %xmm9
+
+#define BSWAP_REG %xmm10
+
+#define K1 %xmm11
+#define K2 %xmm12
+#define K3 %xmm13
+#define K4 %xmm14
+
+
+/* Round function macros. */
+
+#define WK(i) (((i) & 15) * 4)(%rsp)
+
+#define R_F1(a,b,c,d,e,i) \
+ movl c, RT0; \
+ andn d, b, RT1; \
+ addl WK(i), e; \
+ andl b, RT0; \
+ rorxl $2, b, b; \
+ addl RT1, e; \
+ addl ne, a; \
+ leal (RT0,e), ne; \
+ rorxl $27, a, e;
+
+#define R_F2(a,b,c,d,e,i) \
+ movl c, RT0; \
+ addl WK(i), e; \
+ xorl b, RT0; \
+ rorxl $2, b, b; \
+ xorl d, RT0; \
+ addl ne, a; \
+ leal (RT0,e), ne; \
+ rorxl $27, a, e;
+
+#define R_F3(a,b,c,d,e,i) \
+ movl c, RT0; \
+ movl b, RT1; \
+ addl WK(i), e; \
+ xorl b, RT0; \
+ andl c, RT1; \
+ andl d, RT0; \
+ addl RT1, e; \
+ rorxl $2, b, b; \
+ addl ne, a; \
+ leal (RT0,e), ne; \
+ rorxl $27, a, e;
+
+#define R_F4(a,b,c,d,e,i) R_F2(a,b,c,d,e,i)
+
+#define R(a,b,c,d,e,f,i) \
+ R_##f(a,b,c,d,e,i)
+
+
+/* Input expansion macros. */
+
+#define W_PRECALC_00_15_0(i, W, tmp0) \
+ vmovdqu (4*(i))(RDATA), tmp0;
+
+#define W_PRECALC_00_15_1(i, W, tmp0) \
+ vpshufb BSWAP_REG, tmp0, W;
+
+#define W_PRECALC_00_15_2(i, W, tmp0, K) \
+ vpaddd K, W, tmp0;
+
+#define W_PRECALC_00_15_3(i, W, tmp0) \
+ vmovdqa tmp0, WK(i&~3);
+
+#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ vpalignr $8, W_m16, W_m12, W; \
+ vpsrldq $4, W_m04, tmp0; \
+ vpxor W_m08, W, W;
+
+#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ vpxor W_m16, tmp0, tmp0; \
+ vpxor tmp0, W, W; \
+ vpslld $1, W, tmp0; \
+ vpslldq $12, W, tmp1; \
+ vpsrld $31, W, W;
+
+#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ vpor W, tmp0, tmp0; \
+ vpsrld $30, tmp1, W; \
+ vpslld $2, tmp1, tmp1;
+
+#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1, K) \
+ vpxor W, tmp0, tmp0; \
+ vpxor tmp1, tmp0, W; \
+ vpaddd K, W, tmp0; \
+ vmovdqa tmp0, WK((i)&~3);
+
+#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ vpxor W_m28, W, W; \
+ vpalignr $8, W_m08, W_m04, tmp0;
+
+#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ vpxor W_m16, W, W; \
+ vpxor tmp0, W, W;
+
+#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ vpsrld $30, W, tmp0; \
+ vpslld $2, W, W;
+
+#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0, K) \
+ vpor W, tmp0, W; \
+ vpaddd K, W, tmp0; \
+ vmovdqa tmp0, WK((i)&~3);
+
+
+/*
+ * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
+ *
+ * unsigned int
+ * _gcry_sha1_transform_amd64_avx_bmi2 (void *ctx, const unsigned char *data,
+ * size_t nblks)
+ */
+.globl _gcry_sha1_transform_amd64_avx_bmi2
+ELF(.type _gcry_sha1_transform_amd64_avx_bmi2,@function)
+.align 16
+_gcry_sha1_transform_amd64_avx_bmi2:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: data (64*nblks bytes)
+ * %rdx: nblks
+ */
+ CFI_STARTPROC();
+
+ xorl %eax, %eax;
+ cmpq $0, %rdx;
+ jz .Lret;
+
+ vzeroupper;
+
+ movq %rdx, RNBLKS;
+ movq %rdi, RSTATE;
+ movq %rsi, RDATA;
+ pushq %rbx;
+ CFI_PUSH(%rbx);
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ pushq %r12;
+ CFI_PUSH(%r12);
+
+ movq %rsp, ROLDSTACK;
+ CFI_DEF_CFA_REGISTER(ROLDSTACK);
+
+ subq $(16*4), %rsp;
+ andq $(~31), %rsp;
+
+ /* Get the values of the chaining variables. */
+ movl state_h0(RSTATE), a;
+ movl state_h1(RSTATE), b;
+ movl state_h2(RSTATE), c;
+ movl state_h3(RSTATE), d;
+ movl state_h4(RSTATE), e;
+ xorl ne, ne;
+
+ vmovdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG;
+ vpbroadcastd .LK1 rRIP, K1;
+ vpbroadcastd .LK2 rRIP, K2;
+ vpbroadcastd .LK3 rRIP, K3;
+ vpbroadcastd .LK4 rRIP, K4;
+
+ /* Precalc 0-15. */
+ W_PRECALC_00_15_0(0, W0, Wtmp0);
+ W_PRECALC_00_15_1(1, W0, Wtmp0);
+ W_PRECALC_00_15_2(2, W0, Wtmp0, K1);
+ W_PRECALC_00_15_3(3, W0, Wtmp0);
+ W_PRECALC_00_15_0(4, W7, Wtmp0);
+ W_PRECALC_00_15_1(5, W7, Wtmp0);
+ W_PRECALC_00_15_2(6, W7, Wtmp0, K1);
+ W_PRECALC_00_15_3(7, W7, Wtmp0);
+ W_PRECALC_00_15_0(8, W6, Wtmp0);
+ W_PRECALC_00_15_1(9, W6, Wtmp0);
+ W_PRECALC_00_15_2(10, W6, Wtmp0, K1);
+ W_PRECALC_00_15_3(11, W6, Wtmp0);
+ W_PRECALC_00_15_0(12, W5, Wtmp0);
+ W_PRECALC_00_15_1(13, W5, Wtmp0);
+ W_PRECALC_00_15_2(14, W5, Wtmp0, K1);
+ W_PRECALC_00_15_3(15, W5, Wtmp0);
+
+.align 8
+.Loop:
+ addq $64, RDATA;
+
+ /* Transform 0-15 + Precalc 16-31. */
+ R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( d, e, a, b, c, F1, 2 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1, K1);
+ R( b, c, d, e, a, F1, 4 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F1, 5 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F1, 6 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1, K2);
+ R( c, d, e, a, b, F1, 8 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( b, c, d, e, a, F1, 9 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F1, 10 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1, K2);
+ R( d, e, a, b, c, F1, 12 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( c, d, e, a, b, F1, 13 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( b, c, d, e, a, F1, 14 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1, K2);
+
+ /* Transform 16-63 + Precalc 32-79. */
+ R( e, a, b, c, d, F1, 16 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( d, e, a, b, c, F1, 17 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( c, d, e, a, b, F1, 18 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K2);
+ R( a, b, c, d, e, F2, 20 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( e, a, b, c, d, F2, 21 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( d, e, a, b, c, F2, 22 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K2);
+ R( b, c, d, e, a, F2, 24 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( a, b, c, d, e, F2, 25 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( e, a, b, c, d, F2, 26 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K3);
+ R( c, d, e, a, b, F2, 28 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( b, c, d, e, a, F2, 29 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( a, b, c, d, e, F2, 30 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K3);
+ R( d, e, a, b, c, F2, 32 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( c, d, e, a, b, F2, 33 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( b, c, d, e, a, F2, 34 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0, K3);
+ R( e, a, b, c, d, F2, 36 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( d, e, a, b, c, F2, 37 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( c, d, e, a, b, F2, 38 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0, K3);
+ R( a, b, c, d, e, F3, 40 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( e, a, b, c, d, F3, 41 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( d, e, a, b, c, F3, 42 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0, K3);
+ R( b, c, d, e, a, F3, 44 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( a, b, c, d, e, F3, 45 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( e, a, b, c, d, F3, 46 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0, K4);
+ R( c, d, e, a, b, F3, 48 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( b, c, d, e, a, F3, 49 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( a, b, c, d, e, F3, 50 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K4);
+ R( d, e, a, b, c, F3, 52 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( c, d, e, a, b, F3, 53 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( b, c, d, e, a, F3, 54 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( a, b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K4);
+ R( e, a, b, c, d, F3, 56 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( d, e, a, b, c, F3, 57 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( c, d, e, a, b, F3, 58 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K4);
+ R( a, b, c, d, e, F4, 60 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( e, a, b, c, d, F4, 61 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K4);
+
+ decq RNBLKS;
+ jz .Lend;
+
+ /* Transform 64-79 + Precalc 0-15 of next block. */
+ R( b, c, d, e, a, F4, 64 ); W_PRECALC_00_15_0(0, W0, Wtmp0);
+ R( a, b, c, d, e, F4, 65 ); W_PRECALC_00_15_1(1, W0, Wtmp0);
+ R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0, K1);
+ R( d, e, a, b, c, F4, 67 ); W_PRECALC_00_15_3(3, W0, Wtmp0);
+ R( c, d, e, a, b, F4, 68 ); W_PRECALC_00_15_0(4, W7, Wtmp0);
+ R( b, c, d, e, a, F4, 69 ); W_PRECALC_00_15_1(5, W7, Wtmp0);
+ R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0, K1);
+ R( e, a, b, c, d, F4, 71 ); W_PRECALC_00_15_3(7, W7, Wtmp0);
+ R( d, e, a, b, c, F4, 72 ); W_PRECALC_00_15_0(8, W6, Wtmp0);
+ R( c, d, e, a, b, F4, 73 ); W_PRECALC_00_15_1(9, W6, Wtmp0);
+ R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0, K1);
+ R( a, b, c, d, e, F4, 75 ); W_PRECALC_00_15_3(11, W6, Wtmp0);
+ R( e, a, b, c, d, F4, 76 ); W_PRECALC_00_15_0(12, W5, Wtmp0);
+ R( d, e, a, b, c, F4, 77 ); W_PRECALC_00_15_1(13, W5, Wtmp0);
+ R( c, d, e, a, b, F4, 78 );
+ addl state_h0(RSTATE), a; W_PRECALC_00_15_2(14, W5, Wtmp0, K1);
+ R( b, c, d, e, a, F4, 79 ); W_PRECALC_00_15_3(15, W5, Wtmp0);
+ addl ne, a;
+ xorl ne, ne;
+
+ /* Update the chaining variables. */
+ addl state_h3(RSTATE), d;
+ addl state_h2(RSTATE), c;
+ addl state_h1(RSTATE), b;
+ addl state_h4(RSTATE), e;
+
+ movl d, state_h3(RSTATE);
+ movl c, state_h2(RSTATE);
+ movl b, state_h1(RSTATE);
+ movl a, state_h0(RSTATE);
+ movl e, state_h4(RSTATE);
+
+ jmp .Loop;
+
+.align 16
+.Lend:
+ vzeroall;
+
+ /* Transform 64-79 + burn stack */
+ R( b, c, d, e, a, F4, 64 );
+ R( a, b, c, d, e, F4, 65 );
+ R( e, a, b, c, d, F4, 66 );
+ R( d, e, a, b, c, F4, 67 );
+ R( c, d, e, a, b, F4, 68 );
+ R( b, c, d, e, a, F4, 69 );
+ R( a, b, c, d, e, F4, 70 );
+ R( e, a, b, c, d, F4, 71 );
+ R( d, e, a, b, c, F4, 72 );
+ R( c, d, e, a, b, F4, 73 );
+ R( b, c, d, e, a, F4, 74 );
+ R( a, b, c, d, e, F4, 75 );
+ R( e, a, b, c, d, F4, 76 ); vmovdqa %xmm0, (0*16)(%rsp);
+ R( d, e, a, b, c, F4, 77 ); vmovdqa %xmm0, (1*16)(%rsp);
+ R( c, d, e, a, b, F4, 78 ); vmovdqa %xmm0, (2*16)(%rsp);
+ addl state_h0(RSTATE), a;
+ R( b, c, d, e, a, F4, 79 );
+ addl ne, a;
+ xorl ne, ne;
+
+ /* 16*4/16-1 = 3 */
+ vmovdqa %xmm0, (3*16)(%rsp);
+
+ /* Update the chaining variables. */
+ addl state_h3(RSTATE), d;
+ addl state_h2(RSTATE), c;
+ addl state_h1(RSTATE), b;
+ addl state_h4(RSTATE), e;
+
+ movl d, state_h3(RSTATE);
+ movl c, state_h2(RSTATE);
+ movl b, state_h1(RSTATE);
+ movl a, state_h0(RSTATE);
+ movl e, state_h4(RSTATE);
+
+ movq ROLDSTACK, %rsp;
+ CFI_REGISTER(ROLDSTACK, %rsp);
+ CFI_DEF_CFA_REGISTER(%rsp);
+
+ popq %r12;
+ CFI_POP(%r12);
+ popq %rbp;
+ CFI_POP(%rbp);
+ popq %rbx;
+ CFI_POP(%rbx);
+
+ /* stack already burned */
+ xorl %eax, %eax;
+
+.Lret:
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_sha1_transform_amd64_avx_bmi2,
+ .-_gcry_sha1_transform_amd64_avx_bmi2;)
+
+#endif
+#endif
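
Besides rorx, which rotates into a separate destination without touching the flags, the BMI2 variants rewrite F1 with andn: the select function F1(b,c,d) = (b & c) | (~b & d) is split into two independent ANDs whose results can simply be added, because the two terms can never have a bit set in common. A minimal sketch of that rewrite (illustrative names only):

#include <stdint.h>

static uint32_t
sha1_f1_add_form (uint32_t b, uint32_t c, uint32_t d)
{
  uint32_t bc  = b & c;    /* movl c, RT0; andl b, RT0 */
  uint32_t nbd = ~b & d;   /* andn d, b, RT1           */
  return bc + nbd;         /* bit-disjoint, so '+' equals '|' */
}

This lets the F1 term be folded into the chain of round additions together with e, W+K and the deferred rol(a,5) that is carried between rounds in the spare 'ne' register.
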
diff --git a/comm/third_party/libgcrypt/cipher/sha1-avx2-bmi2-amd64.S b/comm/third_party/libgcrypt/cipher/sha1-avx2-bmi2-amd64.S
new file mode 100644
index 0000000000..938632305a
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha1-avx2-bmi2-amd64.S
@@ -0,0 +1,573 @@
+/* sha1-avx2-bmi2-amd64.S - Intel AVX2/BMI2 accelerated SHA-1 transform function
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * Based on sha1.c:
+ * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Intel AVX2/BMI2 accelerated SHA-1 implementation, based on the SSSE3 white paper:
+ * "Improving the Performance of the Secure Hash Algorithm (SHA-1)"
+ * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
+ */
+
+#ifdef __x86_64__
+#include <config.h>
+
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(HAVE_GCC_INLINE_ASM_BMI2) && defined(HAVE_GCC_INLINE_ASM_AVX) && \
+ defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(USE_SHA1)
+
+#include "asm-common-amd64.h"
+
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+
+
+/* Constants */
+
+#define WK_STACK_WORDS (80 * 2)
+
+.text
+.align 16
+.Lbswap_shufb_ctl:
+ .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
+
+.LK1: .long 0x5A827999
+.LK2: .long 0x6ED9EBA1
+.LK3: .long 0x8F1BBCDC
+.LK4: .long 0xCA62C1D6
+
+
+/* Register macros */
+
+#define RSTATE %r8
+#define RDATA %r9
+#define ROLDSTACK %r10
+#define RNBLKS %r11
+
+#define a %eax
+#define b %ebx
+#define c %ecx
+#define d %edx
+#define e %edi
+#define ne %r12d
+
+#define RT0 %esi
+#define RT1 %ebp
+
+#define Wtmp0 %ymm0
+#define Wtmp1 %ymm1
+#define Wtmp0x %xmm0
+#define Wtmp1x %xmm1
+
+#define W0 %ymm2
+#define W1 %ymm3
+#define W2 %ymm4
+#define W3 %ymm5
+#define W4 %ymm6
+#define W5 %ymm7
+#define W6 %ymm8
+#define W7 %ymm9
+
+#define BSWAP_REG %ymm10
+
+#define K1 %ymm11
+#define K2 %ymm12
+#define K3 %ymm13
+#define K4 %ymm14
+
+
+/* Round function macros. */
+
+#define WK(i,block) ((block) * 16 + ((i) / 4) * 32 + ((i) % 4) * 4)(%rsp)
+#define PRE_WK(i) ((i) * 4 * 2)(%rsp)
+
+#define R_F1(a,b,c,d,e,i,block) \
+ movl c, RT0; \
+ andn d, b, RT1; \
+ addl WK(i,block), e; \
+ andl b, RT0; \
+ leal (a,ne), a; \
+ rorxl $2, b, b; \
+ addl RT1, e; \
+ rorxl $27, a, ne; \
+ addl RT0, e;
+
+#define R_F2(a,b,c,d,e,i,block) \
+ addl WK(i,block), e; \
+ movl c, RT0; \
+ xorl b, RT0; \
+ leal (a,ne), a; \
+ rorxl $2, b, b; \
+ xorl d, RT0; \
+ addl RT0, e; \
+ rorxl $27, a, ne;
+
+#define R_F3(a,b,c,d,e,i,block) \
+ movl c, RT0; \
+ addl WK(i,block), e; \
+ movl b, RT1; \
+ xorl b, RT0; \
+ leal (a,ne), a; \
+ rorxl $2, b, b; \
+ andl c, RT1; \
+ addl RT1, e; \
+ andl d, RT0; \
+ rorxl $27, a, ne; \
+ addl RT0, e;
+
+#define R_F4(a,b,c,d,e,i,block) R_F2(a,b,c,d,e,i,block)
+
+#define R(a,b,c,d,e,f,i,block) \
+ R_##f(a,b,c,d,e,i,block)
+
+
+/* Input expansion macros. */
+
+#define W_PRECALC_00_15_0(i, W, tmp0) \
+ vmovdqu (4*(i))(RDATA), tmp0##x; \
+ vinserti128 $1, (4*(i) + 64)(RDATA), tmp0, tmp0;
+
+#define W_PRECALC_00_15_1(i, W, tmp0) \
+ vpshufb BSWAP_REG, tmp0, W;
+
+#define W_PRECALC_00_15_2(i, W, tmp0, K) \
+ vpaddd K, W, tmp0;
+
+#define W_PRECALC_00_15_3(i, W, tmp0) \
+ vmovdqa tmp0, PRE_WK((i)&~3);
+
+#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ vpalignr $8, W_m16, W_m12, W; \
+ vpsrldq $4, W_m04, tmp0; \
+ vpxor W_m08, W, W;
+
+#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ vpxor W_m16, tmp0, tmp0; \
+ vpxor tmp0, W, W; \
+ vpslld $1, W, tmp0; \
+ vpslldq $12, W, tmp1; \
+ vpsrld $31, W, W;
+
+#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ vpor W, tmp0, tmp0; \
+ vpsrld $30, tmp1, W; \
+ vpslld $2, tmp1, tmp1;
+
+#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1, K) \
+ vpxor W, tmp0, tmp0; \
+ vpxor tmp1, tmp0, W; \
+ vpaddd K, W, tmp0; \
+ vmovdqa tmp0, PRE_WK((i)&~3);
+
+#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ vpxor W_m28, W, W; \
+ vpalignr $8, W_m08, W_m04, tmp0;
+
+#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ vpxor W_m16, W, W; \
+ vpxor tmp0, W, W;
+
+#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ vpsrld $30, W, tmp0; \
+ vpslld $2, W, W;
+
+#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0, K) \
+ vpor W, tmp0, W; \
+ vpaddd K, W, tmp0; \
+ vmovdqa tmp0, PRE_WK((i)&~3);
+
+
+/*
+ * Transform 2*nblks*64 bytes (2*nblks*16 32-bit words) at DATA.
+ *
+ * unsigned int
+ * _gcry_sha1_transform_amd64_avx2_bmi2 (void *ctx, const unsigned char *data,
+ * size_t nblks)
+ */
+.globl _gcry_sha1_transform_amd64_avx2_bmi2
+ELF(.type _gcry_sha1_transform_amd64_avx2_bmi2,@function)
+.align 16
+_gcry_sha1_transform_amd64_avx2_bmi2:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: data (64*nblks bytes)
+ * %rdx: nblks (multiple of 2, larger than 0)
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ movq %rdx, RNBLKS;
+ movq %rdi, RSTATE;
+ movq %rsi, RDATA;
+ pushq %rbx;
+ CFI_PUSH(%rbx);
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ pushq %r12;
+ CFI_PUSH(%r12);
+
+ movq %rsp, ROLDSTACK;
+ CFI_DEF_CFA_REGISTER(ROLDSTACK);
+
+ subq $(WK_STACK_WORDS*4), %rsp;
+ andq $(~63), %rsp;
+
+ /* Get the values of the chaining variables. */
+ movl state_h0(RSTATE), a;
+ movl state_h1(RSTATE), b;
+ movl state_h2(RSTATE), c;
+ movl state_h3(RSTATE), d;
+ movl state_h4(RSTATE), e;
+ xorl ne, ne;
+
+ vbroadcasti128 .Lbswap_shufb_ctl rRIP, BSWAP_REG;
+ vpbroadcastd .LK1 rRIP, K1;
+ vpbroadcastd .LK2 rRIP, K2;
+ vpbroadcastd .LK3 rRIP, K3;
+ vpbroadcastd .LK4 rRIP, K4;
+
+ /* Precalc 0-31 for block 1 & 2. */
+ W_PRECALC_00_15_0(0, W0, Wtmp0);
+ W_PRECALC_00_15_1(1, W0, Wtmp0);
+ W_PRECALC_00_15_2(2, W0, Wtmp0, K1);
+ W_PRECALC_00_15_3(3, W0, Wtmp0);
+ W_PRECALC_00_15_0(4, W7, Wtmp0);
+ W_PRECALC_00_15_1(5, W7, Wtmp0);
+ W_PRECALC_00_15_2(6, W7, Wtmp0, K1);
+ W_PRECALC_00_15_3(7, W7, Wtmp0);
+ W_PRECALC_00_15_0(8, W6, Wtmp0);
+ W_PRECALC_00_15_1(9, W6, Wtmp0);
+ W_PRECALC_00_15_2(10, W6, Wtmp0, K1);
+ W_PRECALC_00_15_3(11, W6, Wtmp0);
+ W_PRECALC_00_15_0(12, W5, Wtmp0);
+ W_PRECALC_00_15_1(13, W5, Wtmp0);
+ W_PRECALC_00_15_2(14, W5, Wtmp0, K1);
+ W_PRECALC_00_15_3(15, W5, Wtmp0);
+ W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1, K1);
+ W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1, K2);
+ W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1, K2);
+ W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1, K2);
+
+.align 8
+.Loop:
+ addq $(2 * 64), RDATA;
+
+ /* Transform 0-15 for block 1 + Precalc 32-47 for block 1 & 2. */
+ R( a, b, c, d, e, F1, 0, 0 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( e, a, b, c, d, F1, 1, 0 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( d, e, a, b, c, F1, 2, 0 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( c, d, e, a, b, F1, 3, 0 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K2);
+ R( b, c, d, e, a, F1, 4, 0 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( a, b, c, d, e, F1, 5, 0 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( e, a, b, c, d, F1, 6, 0 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( d, e, a, b, c, F1, 7, 0 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K2);
+ R( c, d, e, a, b, F1, 8, 0 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( b, c, d, e, a, F1, 9, 0 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( a, b, c, d, e, F1, 10, 0 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( e, a, b, c, d, F1, 11, 0 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K3);
+ R( d, e, a, b, c, F1, 12, 0 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( c, d, e, a, b, F1, 13, 0 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( b, c, d, e, a, F1, 14, 0 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( a, b, c, d, e, F1, 15, 0 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K3);
+
+ /* Transform 16-47 for block 1 + Precalc 48-79 for block 1 & 2. */
+ R( e, a, b, c, d, F1, 16, 0 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( d, e, a, b, c, F1, 17, 0 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( c, d, e, a, b, F1, 18, 0 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( b, c, d, e, a, F1, 19, 0 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0, K3);
+ R( a, b, c, d, e, F2, 20, 0 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( e, a, b, c, d, F2, 21, 0 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( d, e, a, b, c, F2, 22, 0 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( c, d, e, a, b, F2, 23, 0 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0, K3);
+ R( b, c, d, e, a, F2, 24, 0 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( a, b, c, d, e, F2, 25, 0 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( e, a, b, c, d, F2, 26, 0 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( d, e, a, b, c, F2, 27, 0 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0, K3);
+ R( c, d, e, a, b, F2, 28, 0 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( b, c, d, e, a, F2, 29, 0 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( a, b, c, d, e, F2, 30, 0 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( e, a, b, c, d, F2, 31, 0 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0, K4);
+ R( d, e, a, b, c, F2, 32, 0 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( c, d, e, a, b, F2, 33, 0 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( b, c, d, e, a, F2, 34, 0 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( a, b, c, d, e, F2, 35, 0 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K4);
+ R( e, a, b, c, d, F2, 36, 0 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( d, e, a, b, c, F2, 37, 0 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( c, d, e, a, b, F2, 38, 0 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( b, c, d, e, a, F2, 39, 0 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K4);
+ R( a, b, c, d, e, F3, 40, 0 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( e, a, b, c, d, F3, 41, 0 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( d, e, a, b, c, F3, 42, 0 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( c, d, e, a, b, F3, 43, 0 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K4);
+ R( b, c, d, e, a, F3, 44, 0 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( a, b, c, d, e, F3, 45, 0 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( e, a, b, c, d, F3, 46, 0 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( d, e, a, b, c, F3, 47, 0 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K4);
+
+ /* Transform 48-79 for block 1. */
+ R( c, d, e, a, b, F3, 48, 0 );
+ R( b, c, d, e, a, F3, 49, 0 );
+ R( a, b, c, d, e, F3, 50, 0 );
+ R( e, a, b, c, d, F3, 51, 0 );
+ R( d, e, a, b, c, F3, 52, 0 );
+ R( c, d, e, a, b, F3, 53, 0 );
+ R( b, c, d, e, a, F3, 54, 0 );
+ R( a, b, c, d, e, F3, 55, 0 );
+ R( e, a, b, c, d, F3, 56, 0 );
+ R( d, e, a, b, c, F3, 57, 0 );
+ R( c, d, e, a, b, F3, 58, 0 );
+ R( b, c, d, e, a, F3, 59, 0 );
+ R( a, b, c, d, e, F4, 60, 0 );
+ R( e, a, b, c, d, F4, 61, 0 );
+ R( d, e, a, b, c, F4, 62, 0 );
+ R( c, d, e, a, b, F4, 63, 0 );
+ R( b, c, d, e, a, F4, 64, 0 );
+ R( a, b, c, d, e, F4, 65, 0 );
+ R( e, a, b, c, d, F4, 66, 0 );
+ R( d, e, a, b, c, F4, 67, 0 );
+ R( c, d, e, a, b, F4, 68, 0 );
+ R( b, c, d, e, a, F4, 69, 0 );
+ R( a, b, c, d, e, F4, 70, 0 );
+ R( e, a, b, c, d, F4, 71, 0 );
+ R( d, e, a, b, c, F4, 72, 0 );
+ R( c, d, e, a, b, F4, 73, 0 );
+ R( b, c, d, e, a, F4, 74, 0 );
+ R( a, b, c, d, e, F4, 75, 0 );
+ R( e, a, b, c, d, F4, 76, 0 );
+ R( d, e, a, b, c, F4, 77, 0 );
+ R( c, d, e, a, b, F4, 78, 0 );
+ addl state_h0(RSTATE), a;
+ R( b, c, d, e, a, F4, 79, 0 );
+ addl ne, a;
+ xorl ne, ne;
+
+ /* Update the chaining variables. */
+ addl state_h3(RSTATE), d;
+ addl state_h2(RSTATE), c;
+ addl state_h1(RSTATE), b;
+ addl state_h4(RSTATE), e;
+
+ movl d, state_h3(RSTATE);
+ movl c, state_h2(RSTATE);
+ movl b, state_h1(RSTATE);
+ movl a, state_h0(RSTATE);
+ movl e, state_h4(RSTATE);
+
+ /* Transform 0-47 for block 2. */
+ R( a, b, c, d, e, F1, 0, 1 );
+ R( e, a, b, c, d, F1, 1, 1 );
+ R( d, e, a, b, c, F1, 2, 1 );
+ R( c, d, e, a, b, F1, 3, 1 );
+ R( b, c, d, e, a, F1, 4, 1 );
+ R( a, b, c, d, e, F1, 5, 1 );
+ R( e, a, b, c, d, F1, 6, 1 );
+ R( d, e, a, b, c, F1, 7, 1 );
+ R( c, d, e, a, b, F1, 8, 1 );
+ R( b, c, d, e, a, F1, 9, 1 );
+ R( a, b, c, d, e, F1, 10, 1 );
+ R( e, a, b, c, d, F1, 11, 1 );
+ R( d, e, a, b, c, F1, 12, 1 );
+ R( c, d, e, a, b, F1, 13, 1 );
+ R( b, c, d, e, a, F1, 14, 1 );
+ R( a, b, c, d, e, F1, 15, 1 );
+ R( e, a, b, c, d, F1, 16, 1 );
+ R( d, e, a, b, c, F1, 17, 1 );
+ R( c, d, e, a, b, F1, 18, 1 );
+ R( b, c, d, e, a, F1, 19, 1 );
+ R( a, b, c, d, e, F2, 20, 1 );
+ R( e, a, b, c, d, F2, 21, 1 );
+ R( d, e, a, b, c, F2, 22, 1 );
+ R( c, d, e, a, b, F2, 23, 1 );
+ R( b, c, d, e, a, F2, 24, 1 );
+ R( a, b, c, d, e, F2, 25, 1 );
+ R( e, a, b, c, d, F2, 26, 1 );
+ R( d, e, a, b, c, F2, 27, 1 );
+ R( c, d, e, a, b, F2, 28, 1 );
+ R( b, c, d, e, a, F2, 29, 1 );
+ R( a, b, c, d, e, F2, 30, 1 );
+ R( e, a, b, c, d, F2, 31, 1 );
+ R( d, e, a, b, c, F2, 32, 1 );
+ R( c, d, e, a, b, F2, 33, 1 );
+ R( b, c, d, e, a, F2, 34, 1 );
+ R( a, b, c, d, e, F2, 35, 1 );
+ R( e, a, b, c, d, F2, 36, 1 );
+ R( d, e, a, b, c, F2, 37, 1 );
+ R( c, d, e, a, b, F2, 38, 1 );
+ R( b, c, d, e, a, F2, 39, 1 );
+ R( a, b, c, d, e, F3, 40, 1 );
+ R( e, a, b, c, d, F3, 41, 1 );
+ R( d, e, a, b, c, F3, 42, 1 );
+ R( c, d, e, a, b, F3, 43, 1 );
+ R( b, c, d, e, a, F3, 44, 1 );
+ R( a, b, c, d, e, F3, 45, 1 );
+ R( e, a, b, c, d, F3, 46, 1 );
+ R( d, e, a, b, c, F3, 47, 1 );
+
+ addq $-2, RNBLKS;
+ jz .Lend;
+
+ /* Transform 48-79 for block 2 + Precalc 0-31 for next two blocks. */
+ R( c, d, e, a, b, F3, 48, 1 ); W_PRECALC_00_15_0(0, W0, Wtmp0);
+ R( b, c, d, e, a, F3, 49, 1 ); W_PRECALC_00_15_1(1, W0, Wtmp0);
+ R( a, b, c, d, e, F3, 50, 1 ); W_PRECALC_00_15_2(2, W0, Wtmp0, K1);
+ R( e, a, b, c, d, F3, 51, 1 ); W_PRECALC_00_15_3(3, W0, Wtmp0);
+ R( d, e, a, b, c, F3, 52, 1 ); W_PRECALC_00_15_0(4, W7, Wtmp0);
+ R( c, d, e, a, b, F3, 53, 1 ); W_PRECALC_00_15_1(5, W7, Wtmp0);
+ R( b, c, d, e, a, F3, 54, 1 ); W_PRECALC_00_15_2(6, W7, Wtmp0, K1);
+ R( a, b, c, d, e, F3, 55, 1 ); W_PRECALC_00_15_3(7, W7, Wtmp0);
+ R( e, a, b, c, d, F3, 56, 1 ); W_PRECALC_00_15_0(8, W6, Wtmp0);
+ R( d, e, a, b, c, F3, 57, 1 ); W_PRECALC_00_15_1(9, W6, Wtmp0);
+ R( c, d, e, a, b, F3, 58, 1 ); W_PRECALC_00_15_2(10, W6, Wtmp0, K1);
+ R( b, c, d, e, a, F3, 59, 1 ); W_PRECALC_00_15_3(11, W6, Wtmp0);
+ R( a, b, c, d, e, F4, 60, 1 ); W_PRECALC_00_15_0(12, W5, Wtmp0);
+ R( e, a, b, c, d, F4, 61, 1 ); W_PRECALC_00_15_1(13, W5, Wtmp0);
+ R( d, e, a, b, c, F4, 62, 1 ); W_PRECALC_00_15_2(14, W5, Wtmp0, K1);
+ R( c, d, e, a, b, F4, 63, 1 ); W_PRECALC_00_15_3(15, W5, Wtmp0);
+ R( b, c, d, e, a, F4, 64, 1 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F4, 65, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F4, 66, 1 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( d, e, a, b, c, F4, 67, 1 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1, K1);
+ R( c, d, e, a, b, F4, 68, 1 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( b, c, d, e, a, F4, 69, 1 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F4, 70, 1 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F4, 71, 1 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1, K2);
+ R( d, e, a, b, c, F4, 72, 1 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( c, d, e, a, b, F4, 73, 1 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( b, c, d, e, a, F4, 74, 1 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F4, 75, 1 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1, K2);
+ R( e, a, b, c, d, F4, 76, 1 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( d, e, a, b, c, F4, 77, 1 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( c, d, e, a, b, F4, 78, 1 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ addl state_h0(RSTATE), a; W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1, K2);
+ R( b, c, d, e, a, F4, 79, 1 );
+ addl ne, a;
+ xorl ne, ne;
+
+ /* Update the chaining variables. */
+ addl state_h3(RSTATE), d;
+ addl state_h2(RSTATE), c;
+ addl state_h1(RSTATE), b;
+ addl state_h4(RSTATE), e;
+
+ movl d, state_h3(RSTATE);
+ movl c, state_h2(RSTATE);
+ movl b, state_h1(RSTATE);
+ movl a, state_h0(RSTATE);
+ movl e, state_h4(RSTATE);
+
+ jmp .Loop;
+
+.align 16
+.Lend:
+ vzeroall;
+
+ /* Transform 48-79 for block 2 + burn stack */
+ R( c, d, e, a, b, F3, 48, 1 );
+ R( b, c, d, e, a, F3, 49, 1 );
+ R( a, b, c, d, e, F3, 50, 1 );
+ R( e, a, b, c, d, F3, 51, 1 );
+ R( d, e, a, b, c, F3, 52, 1 );
+ R( c, d, e, a, b, F3, 53, 1 );
+ R( b, c, d, e, a, F3, 54, 1 );
+ R( a, b, c, d, e, F3, 55, 1 );
+ R( e, a, b, c, d, F3, 56, 1 );
+ R( d, e, a, b, c, F3, 57, 1 );
+ R( c, d, e, a, b, F3, 58, 1 );
+ R( b, c, d, e, a, F3, 59, 1 );
+ R( a, b, c, d, e, F4, 60, 1 ); vmovdqa %ymm0, (0*32)(%rsp);
+ R( e, a, b, c, d, F4, 61, 1 ); vmovdqa %ymm0, (1*32)(%rsp);
+ R( d, e, a, b, c, F4, 62, 1 ); vmovdqa %ymm0, (2*32)(%rsp);
+ R( c, d, e, a, b, F4, 63, 1 ); vmovdqa %ymm0, (3*32)(%rsp);
+ R( b, c, d, e, a, F4, 64, 1 ); vmovdqa %ymm0, (4*32)(%rsp);
+ R( a, b, c, d, e, F4, 65, 1 ); vmovdqa %ymm0, (5*32)(%rsp);
+ R( e, a, b, c, d, F4, 66, 1 ); vmovdqa %ymm0, (6*32)(%rsp);
+ R( d, e, a, b, c, F4, 67, 1 ); vmovdqa %ymm0, (7*32)(%rsp);
+ R( c, d, e, a, b, F4, 68, 1 ); vmovdqa %ymm0, (8*32)(%rsp);
+ R( b, c, d, e, a, F4, 69, 1 ); vmovdqa %ymm0, (9*32)(%rsp);
+ R( a, b, c, d, e, F4, 70, 1 ); vmovdqa %ymm0, (10*32)(%rsp);
+ R( e, a, b, c, d, F4, 71, 1 ); vmovdqa %ymm0, (11*32)(%rsp);
+ R( d, e, a, b, c, F4, 72, 1 ); vmovdqa %ymm0, (12*32)(%rsp);
+ R( c, d, e, a, b, F4, 73, 1 ); vmovdqa %ymm0, (13*32)(%rsp);
+ R( b, c, d, e, a, F4, 74, 1 ); vmovdqa %ymm0, (14*32)(%rsp);
+ R( a, b, c, d, e, F4, 75, 1 ); vmovdqa %ymm0, (15*32)(%rsp);
+ R( e, a, b, c, d, F4, 76, 1 ); vmovdqa %ymm0, (16*32)(%rsp);
+ R( d, e, a, b, c, F4, 77, 1 ); vmovdqa %ymm0, (17*32)(%rsp);
+ R( c, d, e, a, b, F4, 78, 1 ); vmovdqa %ymm0, (18*32)(%rsp);
+ addl state_h0(RSTATE), a;
+ R( b, c, d, e, a, F4, 79, 1 );
+ addl ne, a;
+ xorl ne, ne;
+
+ /* WK_STACK_WORDS*4/32-1 = 19 */
+ vmovdqa %ymm0, (19*32)(%rsp);
+
+ /* Update the chaining variables. */
+ addl state_h3(RSTATE), d;
+ addl state_h2(RSTATE), c;
+ addl state_h1(RSTATE), b;
+ addl state_h4(RSTATE), e;
+
+ movl d, state_h3(RSTATE);
+ movl c, state_h2(RSTATE);
+ movl b, state_h1(RSTATE);
+ movl a, state_h0(RSTATE);
+ movl e, state_h4(RSTATE);
+
+ movq ROLDSTACK, %rsp;
+ CFI_REGISTER(ROLDSTACK, %rsp);
+ CFI_DEF_CFA_REGISTER(%rsp);
+
+ popq %r12;
+ CFI_POP(%r12);
+ popq %rbp;
+ CFI_POP(%rbp);
+ popq %rbx;
+ CFI_POP(%rbx);
+
+ /* stack already burned */
+ xorl %eax, %eax;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_sha1_transform_amd64_avx2_bmi2,
+ .-_gcry_sha1_transform_amd64_avx2_bmi2;)
+
+#endif
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/sha1-intel-shaext.c b/comm/third_party/libgcrypt/cipher/sha1-intel-shaext.c
new file mode 100644
index 0000000000..ddf2be2aa1
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha1-intel-shaext.c
@@ -0,0 +1,292 @@
+/* sha1-intel-shaext.S - SHAEXT accelerated SHA-1 transform function
+ * Copyright (C) 2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#include "types.h"
+
+#if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \
+ defined(HAVE_GCC_INLINE_ASM_SSE41) && defined(USE_SHA1) && \
+ defined(ENABLE_SHAEXT_SUPPORT)
+
+#if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
+/* Prevent compiler from issuing SSE instructions between asm blocks. */
+# pragma GCC target("no-sse")
+#endif
+#if __clang__
+# pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
+#endif
+
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
+
+/* Two macros to be called prior to and after the use of SHA-EXT
+   instructions.  There should be no external function calls between
+   the use of these macros.  Their purpose is to make sure that the
+   SSE registers are cleared and won't reveal any information about
+   the key or the data.  */
+#ifdef __WIN64__
+/* XMM6-XMM15 are callee-saved registers on WIN64. */
+# define shaext_prepare_variable char win64tmp[2*16]
+# define shaext_prepare_variable_size sizeof(win64tmp)
+# define shaext_prepare() \
+ do { asm volatile ("movdqu %%xmm6, (%0)\n" \
+ "movdqu %%xmm7, (%1)\n" \
+ : \
+ : "r" (&win64tmp[0]), "r" (&win64tmp[16]) \
+ : "memory"); \
+ } while (0)
+# define shaext_cleanup(tmp0,tmp1) \
+ do { asm volatile ("movdqu (%0), %%xmm6\n" \
+ "movdqu (%1), %%xmm7\n" \
+ "pxor %%xmm0, %%xmm0\n" \
+ "pxor %%xmm1, %%xmm1\n" \
+ "pxor %%xmm2, %%xmm2\n" \
+ "pxor %%xmm3, %%xmm3\n" \
+ "pxor %%xmm4, %%xmm4\n" \
+ "pxor %%xmm5, %%xmm5\n" \
+ "movdqa %%xmm0, (%2)\n\t" \
+ "movdqa %%xmm0, (%3)\n\t" \
+ : \
+ : "r" (&win64tmp[0]), "r" (&win64tmp[16]), \
+ "r" (tmp0), "r" (tmp1) \
+ : "memory"); \
+ } while (0)
+#else
+# define shaext_prepare_variable
+# define shaext_prepare_variable_size 0
+# define shaext_prepare() do { } while (0)
+# define shaext_cleanup(tmp0,tmp1) \
+ do { asm volatile ("pxor %%xmm0, %%xmm0\n" \
+ "pxor %%xmm1, %%xmm1\n" \
+ "pxor %%xmm2, %%xmm2\n" \
+ "pxor %%xmm3, %%xmm3\n" \
+ "pxor %%xmm4, %%xmm4\n" \
+ "pxor %%xmm5, %%xmm5\n" \
+ "pxor %%xmm6, %%xmm6\n" \
+ "pxor %%xmm7, %%xmm7\n" \
+ "movdqa %%xmm0, (%0)\n\t" \
+ "movdqa %%xmm0, (%1)\n\t" \
+ : \
+ : "r" (tmp0), "r" (tmp1) \
+ : "memory"); \
+ } while (0)
+#endif
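+
+/* Intended usage pattern (a sketch mirroring the transform function below):
+ *
+ *   shaext_prepare_variable;       declares the WIN64 spill area, if any
+ *   shaext_prepare ();             saves XMM6/XMM7 on WIN64
+ *   ... inline asm using the SHA-EXT instructions ...
+ *   shaext_cleanup (tmp0, tmp1);   restores XMM6/XMM7 on WIN64, clears the
+ *                                  remaining XMM registers (all of
+ *                                  XMM0..XMM7 elsewhere) and wipes the two
+ *                                  16-byte buffers at tmp0/tmp1
+ */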
+
+/*
+ * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
+ */
+unsigned int ASM_FUNC_ATTR
+_gcry_sha1_transform_intel_shaext(void *state, const unsigned char *data,
+ size_t nblks)
+{
+ static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+ char save_buf[2 * 16 + 15];
+ char *abcd_save;
+ char *e_save;
+ shaext_prepare_variable;
+
+ if (nblks == 0)
+ return 0;
+
+ shaext_prepare ();
+
+ asm volatile ("" : "=r" (abcd_save) : "0" (save_buf) : "memory");
+ abcd_save = abcd_save + (-(uintptr_t)abcd_save & 15);
+ e_save = abcd_save + 16;
+
+ /* byteswap mask => XMM7 */
+ asm volatile ("movdqa %[mask], %%xmm7\n\t" /* Preload mask */
+ :
+ : [mask] "m" (*be_mask)
+ : "memory");
+
+ /* Load state.. ABCD => XMM4, E => XMM5 */
+ asm volatile ("movd 16(%[state]), %%xmm5\n\t"
+ "movdqu (%[state]), %%xmm4\n\t"
+ "pslldq $12, %%xmm5\n\t"
+ "pshufd $0x1b, %%xmm4, %%xmm4\n\t"
+ "movdqa %%xmm5, (%[e_save])\n\t"
+ "movdqa %%xmm4, (%[abcd_save])\n\t"
+ :
+ : [state] "r" (state), [abcd_save] "r" (abcd_save),
+ [e_save] "r" (e_save)
+ : "memory" );
+
+  /* DATA => XMM[0..3] */
+ asm volatile ("movdqu 0(%[data]), %%xmm0\n\t"
+ "movdqu 16(%[data]), %%xmm1\n\t"
+ "movdqu 32(%[data]), %%xmm2\n\t"
+ "movdqu 48(%[data]), %%xmm3\n\t"
+ "pshufb %%xmm7, %%xmm0\n\t"
+ "pshufb %%xmm7, %%xmm1\n\t"
+ "pshufb %%xmm7, %%xmm2\n\t"
+ "pshufb %%xmm7, %%xmm3\n\t"
+ :
+ : [data] "r" (data)
+ : "memory" );
+ data += 64;
+
+ while (1)
+ {
+ /* Round 0..3 */
+ asm volatile ("paddd %%xmm0, %%xmm5\n\t"
+ "movdqa %%xmm4, %%xmm6\n\t" /* ABCD => E1 */
+ "sha1rnds4 $0, %%xmm5, %%xmm4\n\t"
+ ::: "memory" );
+
+ /* Round 4..7 */
+ asm volatile ("sha1nexte %%xmm1, %%xmm6\n\t"
+ "movdqa %%xmm4, %%xmm5\n\t"
+ "sha1rnds4 $0, %%xmm6, %%xmm4\n\t"
+ "sha1msg1 %%xmm1, %%xmm0\n\t"
+ ::: "memory" );
+
+ /* Round 8..11 */
+ asm volatile ("sha1nexte %%xmm2, %%xmm5\n\t"
+ "movdqa %%xmm4, %%xmm6\n\t"
+ "sha1rnds4 $0, %%xmm5, %%xmm4\n\t"
+ "sha1msg1 %%xmm2, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm0\n\t"
+ ::: "memory" );
+
+#define ROUND(imm, E0, E1, MSG0, MSG1, MSG2, MSG3) \
+ asm volatile ("sha1nexte %%"MSG0", %%"E0"\n\t" \
+ "movdqa %%xmm4, %%"E1"\n\t" \
+ "sha1msg2 %%"MSG0", %%"MSG1"\n\t" \
+ "sha1rnds4 $"imm", %%"E0", %%xmm4\n\t" \
+ "sha1msg1 %%"MSG0", %%"MSG3"\n\t" \
+ "pxor %%"MSG0", %%"MSG2"\n\t" \
+ ::: "memory" )
+
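+      /* Each ROUND invocation below covers four SHA-1 rounds: sha1nexte
+       * folds the running E value into the next four message words,
+       * sha1rnds4 performs the rounds (its immediate selects the round
+       * constant: 0 => 0x5A827999, 1 => 0x6ED9EBA1, 2 => 0x8F1BBCDC,
+       * 3 => 0xCA62C1D6) and sha1msg1/sha1msg2/pxor advance the message
+       * schedule. */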
+ /* Rounds 12..15 to 64..67 */
+ ROUND("0", "xmm6", "xmm5", "xmm3", "xmm0", "xmm1", "xmm2");
+ ROUND("0", "xmm5", "xmm6", "xmm0", "xmm1", "xmm2", "xmm3");
+ ROUND("1", "xmm6", "xmm5", "xmm1", "xmm2", "xmm3", "xmm0");
+ ROUND("1", "xmm5", "xmm6", "xmm2", "xmm3", "xmm0", "xmm1");
+ ROUND("1", "xmm6", "xmm5", "xmm3", "xmm0", "xmm1", "xmm2");
+ ROUND("1", "xmm5", "xmm6", "xmm0", "xmm1", "xmm2", "xmm3");
+ ROUND("1", "xmm6", "xmm5", "xmm1", "xmm2", "xmm3", "xmm0");
+ ROUND("2", "xmm5", "xmm6", "xmm2", "xmm3", "xmm0", "xmm1");
+ ROUND("2", "xmm6", "xmm5", "xmm3", "xmm0", "xmm1", "xmm2");
+ ROUND("2", "xmm5", "xmm6", "xmm0", "xmm1", "xmm2", "xmm3");
+ ROUND("2", "xmm6", "xmm5", "xmm1", "xmm2", "xmm3", "xmm0");
+ ROUND("2", "xmm5", "xmm6", "xmm2", "xmm3", "xmm0", "xmm1");
+ ROUND("3", "xmm6", "xmm5", "xmm3", "xmm0", "xmm1", "xmm2");
+ ROUND("3", "xmm5", "xmm6", "xmm0", "xmm1", "xmm2", "xmm3");
+
+ if (--nblks == 0)
+ break;
+
+ /* Round 68..71 */
+ asm volatile ("movdqu 0(%[data]), %%xmm0\n\t"
+ "sha1nexte %%xmm1, %%xmm6\n\t"
+ "movdqa %%xmm4, %%xmm5\n\t"
+ "sha1msg2 %%xmm1, %%xmm2\n\t"
+ "sha1rnds4 $3, %%xmm6, %%xmm4\n\t"
+ "pxor %%xmm1, %%xmm3\n\t"
+ "pshufb %%xmm7, %%xmm0\n\t"
+ :
+ : [data] "r" (data)
+ : "memory" );
+
+ /* Round 72..75 */
+ asm volatile ("movdqu 16(%[data]), %%xmm1\n\t"
+ "sha1nexte %%xmm2, %%xmm5\n\t"
+ "movdqa %%xmm4, %%xmm6\n\t"
+ "sha1msg2 %%xmm2, %%xmm3\n\t"
+ "sha1rnds4 $3, %%xmm5, %%xmm4\n\t"
+ "pshufb %%xmm7, %%xmm1\n\t"
+ :
+ : [data] "r" (data)
+ : "memory" );
+
+ /* Round 76..79 */
+ asm volatile ("movdqu 32(%[data]), %%xmm2\n\t"
+ "sha1nexte %%xmm3, %%xmm6\n\t"
+ "movdqa %%xmm4, %%xmm5\n\t"
+ "sha1rnds4 $3, %%xmm6, %%xmm4\n\t"
+ "pshufb %%xmm7, %%xmm2\n\t"
+ :
+ : [data] "r" (data)
+ : "memory" );
+
+ /* Merge states, store current. */
+ asm volatile ("movdqu 48(%[data]), %%xmm3\n\t"
+ "sha1nexte (%[e_save]), %%xmm5\n\t"
+ "paddd (%[abcd_save]), %%xmm4\n\t"
+ "pshufb %%xmm7, %%xmm3\n\t"
+ "movdqa %%xmm5, (%[e_save])\n\t"
+ "movdqa %%xmm4, (%[abcd_save])\n\t"
+ :
+ : [abcd_save] "r" (abcd_save), [e_save] "r" (e_save),
+ [data] "r" (data)
+ : "memory" );
+
+ data += 64;
+ }
+
+ /* Round 68..71 */
+ asm volatile ("sha1nexte %%xmm1, %%xmm6\n\t"
+ "movdqa %%xmm4, %%xmm5\n\t"
+ "sha1msg2 %%xmm1, %%xmm2\n\t"
+ "sha1rnds4 $3, %%xmm6, %%xmm4\n\t"
+ "pxor %%xmm1, %%xmm3\n\t"
+ ::: "memory" );
+
+ /* Round 72..75 */
+ asm volatile ("sha1nexte %%xmm2, %%xmm5\n\t"
+ "movdqa %%xmm4, %%xmm6\n\t"
+ "sha1msg2 %%xmm2, %%xmm3\n\t"
+ "sha1rnds4 $3, %%xmm5, %%xmm4\n\t"
+ ::: "memory" );
+
+ /* Round 76..79 */
+ asm volatile ("sha1nexte %%xmm3, %%xmm6\n\t"
+ "movdqa %%xmm4, %%xmm5\n\t"
+ "sha1rnds4 $3, %%xmm6, %%xmm4\n\t"
+ ::: "memory" );
+
+ /* Merge states. */
+ asm volatile ("sha1nexte (%[e_save]), %%xmm5\n\t"
+ "paddd (%[abcd_save]), %%xmm4\n\t"
+ :
+ : [abcd_save] "r" (abcd_save), [e_save] "r" (e_save)
+ : "memory" );
+
+ /* Save state */
+ asm volatile ("pshufd $0x1b, %%xmm4, %%xmm4\n\t"
+ "psrldq $12, %%xmm5\n\t"
+ "movdqu %%xmm4, (%[state])\n\t"
+ "movd %%xmm5, 16(%[state])\n\t"
+ :
+ : [state] "r" (state)
+ : "memory" );
+
+ shaext_cleanup (abcd_save, e_save);
+ return 0;
+}
+
+#if __clang__
+# pragma clang attribute pop
+#endif
+
+#endif /* HAVE_GCC_INLINE_ASM_SHAEXT */
diff --git a/comm/third_party/libgcrypt/cipher/sha1-ssse3-amd64.S b/comm/third_party/libgcrypt/cipher/sha1-ssse3-amd64.S
new file mode 100644
index 0000000000..db62928ad3
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha1-ssse3-amd64.S
@@ -0,0 +1,437 @@
+/* sha1-ssse3-amd64.S - Intel SSSE3 accelerated SHA-1 transform function
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * Based on sha1.c:
+ * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Intel SSSE3 accelerated SHA-1 implementation based on white paper:
+ * "Improving the Performance of the Secure Hash Algorithm (SHA-1)"
+ * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
+ */
+
+#ifdef __x86_64__
+#include <config.h>
+
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA1)
+
+#include "asm-common-amd64.h"
+
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+
+
+/* Constants */
+
+.text
+#define K1 0x5A827999
+#define K2 0x6ED9EBA1
+#define K3 0x8F1BBCDC
+#define K4 0xCA62C1D6
+.align 16
+.LK_XMM:
+.LK1: .long K1, K1, K1, K1
+.LK2: .long K2, K2, K2, K2
+.LK3: .long K3, K3, K3, K3
+.LK4: .long K4, K4, K4, K4
+
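+/* pshufb control mask that byte-swaps each 32-bit lane, used to convert
+ * the big-endian message words to host byte order on load. */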
+.Lbswap_shufb_ctl:
+ .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
+
+
+/* Register macros */
+
+#define RSTATE %r8
+#define RDATA %r9
+#define ROLDSTACK %r10
+#define RNBLKS %r11
+
+#define a %eax
+#define b %ebx
+#define c %ecx
+#define d %edx
+#define e %edi
+
+#define RT0 %esi
+#define RT1 %ebp
+
+#define Wtmp0 %xmm0
+#define Wtmp1 %xmm1
+
+#define W0 %xmm2
+#define W1 %xmm3
+#define W2 %xmm4
+#define W3 %xmm5
+#define W4 %xmm6
+#define W5 %xmm7
+#define W6 %xmm8
+#define W7 %xmm9
+
+#define BSWAP_REG %xmm10
+
+
+/* Round function macros. */
+
+#define WK(i) (((i) & 15) * 4)(%rsp)
+
+#define R_F1(a,b,c,d,e,i) \
+ movl c, RT0; \
+ addl WK(i), e; \
+ xorl d, RT0; \
+ movl a, RT1; \
+ andl b, RT0; \
+ roll $30, b; \
+ xorl d, RT0; \
+ leal (RT0,e), e; \
+ roll $5, RT1; \
+ addl RT1, e;
+
+#define R_F2(a,b,c,d,e,i) \
+ movl c, RT0; \
+ addl WK(i), e; \
+ xorl b, RT0; \
+ roll $30, b; \
+ xorl d, RT0; \
+ movl a, RT1; \
+ leal (RT0,e), e; \
+ roll $5, RT1; \
+ addl RT1, e;
+
+#define R_F3(a,b,c,d,e,i) \
+ movl c, RT0; \
+ movl b, RT1; \
+ xorl b, RT0; \
+ andl c, RT1; \
+ andl d, RT0; \
+ addl RT1, e; \
+ addl WK(i), e; \
+ roll $30, b; \
+ movl a, RT1; \
+ leal (RT0,e), e; \
+ roll $5, RT1; \
+ addl RT1, e;
+
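+/* SHA-1 uses the same parity function (x ^ y ^ z) for rounds 20..39 and
+ * 60..79, so the F4 round body simply reuses F2. */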
+#define R_F4(a,b,c,d,e,i) R_F2(a,b,c,d,e,i)
+
+#define R(a,b,c,d,e,f,i) \
+ R_##f(a,b,c,d,e,i)
+
+
+/* Input expansion macros. */
+
+#define W_PRECALC_00_15_0(i, W, tmp0) \
+ movdqu (4*(i))(RDATA), tmp0;
+
+#define W_PRECALC_00_15_1(i, W, tmp0) \
+ pshufb BSWAP_REG, tmp0; \
+ movdqa tmp0, W;
+
+#define W_PRECALC_00_15_2(i, W, tmp0) \
+ paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0;
+
+#define W_PRECALC_00_15_3(i, W, tmp0) \
+ movdqa tmp0, WK(i&~3);
+
+#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ movdqa W_m12, W; \
+ palignr $8, W_m16, W; \
+ movdqa W_m04, tmp0; \
+ psrldq $4, tmp0; \
+ pxor W_m08, W;
+
+#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ pxor W_m16, tmp0; \
+ pxor tmp0, W; \
+ movdqa W, tmp1; \
+ movdqa W, tmp0; \
+ pslldq $12, tmp1;
+
+#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ psrld $31, W; \
+ pslld $1, tmp0; \
+ por W, tmp0; \
+ movdqa tmp1, W; \
+ psrld $30, tmp1; \
+ pslld $2, W;
+
+#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ pxor W, tmp0; \
+ pxor tmp1, tmp0; \
+ movdqa tmp0, W; \
+ paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0; \
+ movdqa tmp0, WK((i)&~3);
+
+#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ movdqa W_m04, tmp0; \
+ pxor W_m28, W; \
+ palignr $8, W_m08, tmp0;
+
+#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ pxor W_m16, W; \
+ pxor tmp0, W; \
+ movdqa W, tmp0;
+
+#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ psrld $30, W; \
+ pslld $2, tmp0; \
+ por W, tmp0;
+
+#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ movdqa tmp0, W; \
+ paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0; \
+ movdqa tmp0, WK((i)&~3);
+
+#define CLEAR_REG(reg) pxor reg, reg;
+
+
+/*
+ * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
+ *
+ * unsigned int
+ * _gcry_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data,
+ * size_t nblks)
+ */
+.globl _gcry_sha1_transform_amd64_ssse3
+ELF(.type _gcry_sha1_transform_amd64_ssse3,@function)
+.align 16
+_gcry_sha1_transform_amd64_ssse3:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: data (64*nblks bytes)
+ * %rdx: nblks
+ */
+ CFI_STARTPROC();
+
+ xorl %eax, %eax;
+ cmpq $0, %rdx;
+ jz .Lret;
+
+ movq %rdx, RNBLKS;
+ movq %rdi, RSTATE;
+ movq %rsi, RDATA;
+ pushq %rbx;
+ CFI_PUSH(%rbx);
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+
+ movq %rsp, ROLDSTACK;
+ CFI_DEF_CFA_REGISTER(ROLDSTACK);
+
+ subq $(16*4), %rsp;
+ andq $(~31), %rsp;
+
+ /* Get the values of the chaining variables. */
+ movl state_h0(RSTATE), a;
+ movl state_h1(RSTATE), b;
+ movl state_h2(RSTATE), c;
+ movl state_h3(RSTATE), d;
+ movl state_h4(RSTATE), e;
+
+ movdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG;
+
+ /* Precalc 0-15. */
+ W_PRECALC_00_15_0(0, W0, Wtmp0);
+ W_PRECALC_00_15_1(1, W0, Wtmp0);
+ W_PRECALC_00_15_2(2, W0, Wtmp0);
+ W_PRECALC_00_15_3(3, W0, Wtmp0);
+ W_PRECALC_00_15_0(4, W7, Wtmp0);
+ W_PRECALC_00_15_1(5, W7, Wtmp0);
+ W_PRECALC_00_15_2(6, W7, Wtmp0);
+ W_PRECALC_00_15_3(7, W7, Wtmp0);
+ W_PRECALC_00_15_0(8, W6, Wtmp0);
+ W_PRECALC_00_15_1(9, W6, Wtmp0);
+ W_PRECALC_00_15_2(10, W6, Wtmp0);
+ W_PRECALC_00_15_3(11, W6, Wtmp0);
+ W_PRECALC_00_15_0(12, W5, Wtmp0);
+ W_PRECALC_00_15_1(13, W5, Wtmp0);
+ W_PRECALC_00_15_2(14, W5, Wtmp0);
+ W_PRECALC_00_15_3(15, W5, Wtmp0);
+
+.align 8
+.Loop:
+ addq $64, RDATA;
+
+ /* Transform 0-15 + Precalc 16-31. */
+ R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( d, e, a, b, c, F1, 2 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( b, c, d, e, a, F1, 4 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F1, 5 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F1, 6 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( c, d, e, a, b, F1, 8 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( b, c, d, e, a, F1, 9 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F1, 10 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( d, e, a, b, c, F1, 12 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( c, d, e, a, b, F1, 13 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( b, c, d, e, a, F1, 14 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+
+ /* Transform 16-63 + Precalc 32-79. */
+ R( e, a, b, c, d, F1, 16 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( d, e, a, b, c, F1, 17 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( c, d, e, a, b, F1, 18 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( a, b, c, d, e, F2, 20 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( e, a, b, c, d, F2, 21 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( d, e, a, b, c, F2, 22 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( b, c, d, e, a, F2, 24 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( a, b, c, d, e, F2, 25 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( e, a, b, c, d, F2, 26 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( c, d, e, a, b, F2, 28 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( b, c, d, e, a, F2, 29 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( a, b, c, d, e, F2, 30 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( d, e, a, b, c, F2, 32 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( c, d, e, a, b, F2, 33 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( b, c, d, e, a, F2, 34 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( e, a, b, c, d, F2, 36 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( d, e, a, b, c, F2, 37 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( c, d, e, a, b, F2, 38 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( a, b, c, d, e, F3, 40 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( e, a, b, c, d, F3, 41 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( d, e, a, b, c, F3, 42 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( b, c, d, e, a, F3, 44 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( a, b, c, d, e, F3, 45 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( e, a, b, c, d, F3, 46 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( c, d, e, a, b, F3, 48 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( b, c, d, e, a, F3, 49 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( a, b, c, d, e, F3, 50 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( d, e, a, b, c, F3, 52 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( c, d, e, a, b, F3, 53 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( b, c, d, e, a, F3, 54 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( a, b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( e, a, b, c, d, F3, 56 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( d, e, a, b, c, F3, 57 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( c, d, e, a, b, F3, 58 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( a, b, c, d, e, F4, 60 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( e, a, b, c, d, F4, 61 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+
+ decq RNBLKS;
+ jz .Lend;
+
+ /* Transform 64-79 + Precalc 0-15 of next block. */
+ R( b, c, d, e, a, F4, 64 ); W_PRECALC_00_15_0(0, W0, Wtmp0);
+ R( a, b, c, d, e, F4, 65 ); W_PRECALC_00_15_1(1, W0, Wtmp0);
+ R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0);
+ R( d, e, a, b, c, F4, 67 ); W_PRECALC_00_15_3(3, W0, Wtmp0);
+ R( c, d, e, a, b, F4, 68 ); W_PRECALC_00_15_0(4, W7, Wtmp0);
+ R( b, c, d, e, a, F4, 69 ); W_PRECALC_00_15_1(5, W7, Wtmp0);
+ R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0);
+ R( e, a, b, c, d, F4, 71 ); W_PRECALC_00_15_3(7, W7, Wtmp0);
+ R( d, e, a, b, c, F4, 72 ); W_PRECALC_00_15_0(8, W6, Wtmp0);
+ R( c, d, e, a, b, F4, 73 ); W_PRECALC_00_15_1(9, W6, Wtmp0);
+ R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0);
+ R( a, b, c, d, e, F4, 75 ); W_PRECALC_00_15_3(11, W6, Wtmp0);
+ R( e, a, b, c, d, F4, 76 ); W_PRECALC_00_15_0(12, W5, Wtmp0);
+ R( d, e, a, b, c, F4, 77 ); W_PRECALC_00_15_1(13, W5, Wtmp0);
+ R( c, d, e, a, b, F4, 78 );
+ addl state_h0(RSTATE), a; W_PRECALC_00_15_2(14, W5, Wtmp0);
+ R( b, c, d, e, a, F4, 79 ); W_PRECALC_00_15_3(15, W5, Wtmp0);
+
+ /* Update the chaining variables. */
+ addl state_h3(RSTATE), d;
+ addl state_h2(RSTATE), c;
+ addl state_h1(RSTATE), b;
+ addl state_h4(RSTATE), e;
+
+ movl d, state_h3(RSTATE);
+ movl c, state_h2(RSTATE);
+ movl b, state_h1(RSTATE);
+ movl a, state_h0(RSTATE);
+ movl e, state_h4(RSTATE);
+
+ jmp .Loop;
+
+.align 16
+.Lend:
+ /* Transform 64-79 + Clear XMM registers + Burn stack. */
+ R( b, c, d, e, a, F4, 64 ); CLEAR_REG(BSWAP_REG);
+ R( a, b, c, d, e, F4, 65 ); CLEAR_REG(Wtmp0);
+ R( e, a, b, c, d, F4, 66 ); CLEAR_REG(Wtmp1);
+ R( d, e, a, b, c, F4, 67 ); CLEAR_REG(W0);
+ R( c, d, e, a, b, F4, 68 ); CLEAR_REG(W1);
+ R( b, c, d, e, a, F4, 69 ); CLEAR_REG(W2);
+ R( a, b, c, d, e, F4, 70 ); CLEAR_REG(W3);
+ R( e, a, b, c, d, F4, 71 ); CLEAR_REG(W4);
+ R( d, e, a, b, c, F4, 72 ); CLEAR_REG(W5);
+ R( c, d, e, a, b, F4, 73 ); CLEAR_REG(W6);
+ R( b, c, d, e, a, F4, 74 ); CLEAR_REG(W7);
+ R( a, b, c, d, e, F4, 75 );
+ R( e, a, b, c, d, F4, 76 ); movdqa Wtmp0, (0*16)(%rsp);
+ R( d, e, a, b, c, F4, 77 ); movdqa Wtmp0, (1*16)(%rsp);
+ R( c, d, e, a, b, F4, 78 ); movdqa Wtmp0, (2*16)(%rsp);
+ addl state_h0(RSTATE), a;
+ R( b, c, d, e, a, F4, 79 );
+
+ /* 16*4/16-1 = 3 */
+ movdqa Wtmp0, (3*16)(%rsp);
+
+ /* Update the chaining variables. */
+ addl state_h3(RSTATE), d;
+ addl state_h2(RSTATE), c;
+ addl state_h1(RSTATE), b;
+ addl state_h4(RSTATE), e;
+
+ movl d, state_h3(RSTATE);
+ movl c, state_h2(RSTATE);
+ movl b, state_h1(RSTATE);
+ movl a, state_h0(RSTATE);
+ movl e, state_h4(RSTATE);
+
+ movq ROLDSTACK, %rsp;
+ CFI_REGISTER(ROLDSTACK, %rsp);
+ CFI_DEF_CFA_REGISTER(%rsp);
+
+ popq %rbp;
+ CFI_POP(%rbp);
+ popq %rbx;
+ CFI_POP(%rbx);
+
+ /* stack already burned */
+ xorl %eax, %eax;
+
+.Lret:
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_sha1_transform_amd64_ssse3,
+ .-_gcry_sha1_transform_amd64_ssse3;)
+
+#endif
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/sha1.c b/comm/third_party/libgcrypt/cipher/sha1.c
new file mode 100644
index 0000000000..35f7376c19
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha1.c
@@ -0,0 +1,765 @@
+/* sha1.c - SHA1 hash function
+ * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+/* Test vectors:
+ *
+ * "abc"
+ * A999 3E36 4706 816A BA3E 2571 7850 C26C 9CD0 D89D
+ *
+ * "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
+ * 8498 3E44 1C3B D26E BAAE 4AA1 F951 29E5 E546 70F1
+ */
+
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef HAVE_STDINT_H
+# include <stdint.h>
+#endif
+
+#include "g10lib.h"
+#include "bithelp.h"
+#include "bufhelp.h"
+#include "cipher.h"
+#include "sha1.h"
+
+
+/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
+#undef USE_SSSE3
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_SSSE3 1
+#endif
+
+/* USE_AVX indicates whether to compile with Intel AVX code. */
+#undef USE_AVX
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX 1
+#endif
+
+/* USE_BMI2 indicates whether to compile with Intel AVX/BMI2 code. */
+#undef USE_BMI2
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \
+ defined(HAVE_GCC_INLINE_ASM_BMI2) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_BMI2 1
+#endif
+
+/* USE_AVX2 indicates whether to compile with Intel AVX2/BMI2 code. */
+#undef USE_AVX2
+#if defined(USE_BMI2) && defined(HAVE_GCC_INLINE_ASM_AVX2)
+# define USE_AVX2 1
+#endif
+
+/* USE_SHAEXT indicates whether to compile with Intel SHA Extension code. */
+#undef USE_SHAEXT
+#if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \
+ defined(HAVE_GCC_INLINE_ASM_SSE41) && \
+ defined(ENABLE_SHAEXT_SUPPORT)
+# define USE_SHAEXT 1
+#endif
+
+/* USE_NEON indicates whether to enable ARM NEON assembly code. */
+#undef USE_NEON
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_NEON)
+# define USE_NEON 1
+# endif
+#endif
+
+/* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension assembly
+ * code. */
+#undef USE_ARM_CE
+#ifdef ENABLE_ARM_CRYPTO_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO)
+# define USE_ARM_CE 1
+# elif defined(__AARCH64EL__) \
+ && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+# define USE_ARM_CE 1
+# endif
+#endif
+
+
+/* A macro to test whether P is properly aligned for a u32 type.
+ Note that config.h provides a suitable replacement for uintptr_t if
+ it does not exist in stdint.h. */
+/* #if __GNUC__ >= 2 */
+/* # define U32_ALIGNED_P(p) (!(((uintptr_t)p) % __alignof__ (u32))) */
+/* #else */
+/* # define U32_ALIGNED_P(p) (!(((uintptr_t)p) % sizeof (u32))) */
+/* #endif */
+
+
+
+/* Assembly implementations use SystemV ABI, ABI conversion and additional
+ * stack to store XMM6-XMM15 needed on Win64. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_BMI2) || \
+ defined(USE_SHAEXT)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# define ASM_EXTRA_STACK (10 * 16 + sizeof(void *) * 4)
+# else
+# define ASM_FUNC_ABI
+# define ASM_EXTRA_STACK 0
+# endif
+#endif
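+
+/* The wrapper functions below add ASM_EXTRA_STACK to the burn-depth value
+ * returned by the assembly implementations, to account for the extra
+ * ABI-conversion stack used on Win64. */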
+
+
+#ifdef USE_SSSE3
+unsigned int
+_gcry_sha1_transform_amd64_ssse3 (void *state, const unsigned char *data,
+ size_t nblks) ASM_FUNC_ABI;
+
+static unsigned int
+do_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA1_CONTEXT *hd = ctx;
+ return _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data, nblks)
+ + ASM_EXTRA_STACK;
+}
+#endif
+
+#ifdef USE_AVX
+unsigned int
+_gcry_sha1_transform_amd64_avx (void *state, const unsigned char *data,
+ size_t nblks) ASM_FUNC_ABI;
+
+static unsigned int
+do_sha1_transform_amd64_avx (void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA1_CONTEXT *hd = ctx;
+ return _gcry_sha1_transform_amd64_avx (&hd->h0, data, nblks)
+ + ASM_EXTRA_STACK;
+}
+#endif
+
+#ifdef USE_BMI2
+unsigned int
+_gcry_sha1_transform_amd64_avx_bmi2 (void *state, const unsigned char *data,
+ size_t nblks) ASM_FUNC_ABI;
+
+static unsigned int
+do_sha1_transform_amd64_avx_bmi2 (void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA1_CONTEXT *hd = ctx;
+ return _gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, nblks)
+ + ASM_EXTRA_STACK;
+}
+
+#ifdef USE_AVX2
+unsigned int
+_gcry_sha1_transform_amd64_avx2_bmi2 (void *state, const unsigned char *data,
+ size_t nblks) ASM_FUNC_ABI;
+
+static unsigned int
+do_sha1_transform_amd64_avx2_bmi2 (void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA1_CONTEXT *hd = ctx;
+
+  /* The AVX2/BMI2 function only handles pairs of blocks, so nblks needs to
+   * be a multiple of 2, and it does not handle zero nblks.  Use the
+   * AVX/BMI2 code to handle these cases. */
+
+ if (nblks <= 1)
+ return do_sha1_transform_amd64_avx_bmi2 (ctx, data, nblks);
+
+ if (nblks & 1)
+ {
+ (void)_gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, 1);
+ nblks--;
+ data += 64;
+ }
+
+ return _gcry_sha1_transform_amd64_avx2_bmi2 (&hd->h0, data, nblks)
+ + ASM_EXTRA_STACK;
+}
+#endif /* USE_AVX2 */
+#endif /* USE_BMI2 */
+
+#ifdef USE_SHAEXT
+/* Does not need ASM_FUNC_ABI */
+unsigned int
+_gcry_sha1_transform_intel_shaext (void *state, const unsigned char *data,
+ size_t nblks);
+
+static unsigned int
+do_sha1_transform_intel_shaext (void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA1_CONTEXT *hd = ctx;
+ return _gcry_sha1_transform_intel_shaext (&hd->h0, data, nblks);
+}
+#endif
+
+#ifdef USE_NEON
+unsigned int
+_gcry_sha1_transform_armv7_neon (void *state, const unsigned char *data,
+ size_t nblks);
+
+static unsigned int
+do_sha1_transform_armv7_neon (void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA1_CONTEXT *hd = ctx;
+ return _gcry_sha1_transform_armv7_neon (&hd->h0, data, nblks);
+}
+#endif
+
+#ifdef USE_ARM_CE
+unsigned int
+_gcry_sha1_transform_armv8_ce (void *state, const unsigned char *data,
+ size_t nblks);
+
+static unsigned int
+do_sha1_transform_armv8_ce (void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA1_CONTEXT *hd = ctx;
+ return _gcry_sha1_transform_armv8_ce (&hd->h0, data, nblks);
+}
+#endif
+
+#ifdef SHA1_USE_S390X_CRYPTO
+#include "asm-inline-s390x.h"
+
+static unsigned int
+do_sha1_transform_s390x (void *ctx, const unsigned char *data, size_t nblks)
+{
+ SHA1_CONTEXT *hd = ctx;
+
+ kimd_execute (KMID_FUNCTION_SHA1, &hd->h0, data, nblks * 64);
+ return 0;
+}
+
+static unsigned int
+do_sha1_final_s390x (void *ctx, const unsigned char *data, size_t datalen,
+ u32 len_msb, u32 len_lsb)
+{
+ SHA1_CONTEXT *hd = ctx;
+
+  /* Make sure that 'final_len' is positioned at the correct offset relative
+   * to 'h0'.  This is because we are passing the 'h0' pointer as the start
+   * of the parameter block to the 'klmd' instruction. */
+
+ gcry_assert (offsetof (SHA1_CONTEXT, final_len_msb)
+ - offsetof (SHA1_CONTEXT, h0) == 5 * sizeof(u32));
+ gcry_assert (offsetof (SHA1_CONTEXT, final_len_lsb)
+ - offsetof (SHA1_CONTEXT, final_len_msb) == 1 * sizeof(u32));
+
+ hd->final_len_msb = len_msb;
+ hd->final_len_lsb = len_lsb;
+
+ klmd_execute (KMID_FUNCTION_SHA1, &hd->h0, data, datalen);
+ return 0;
+}
+#endif
+
+
+static unsigned int
+do_transform_generic (void *c, const unsigned char *data, size_t nblks);
+
+
+static void
+sha1_init (void *context, unsigned int flags)
+{
+ SHA1_CONTEXT *hd = context;
+ unsigned int features = _gcry_get_hw_features ();
+
+ (void)flags;
+
+ hd->h0 = 0x67452301;
+ hd->h1 = 0xefcdab89;
+ hd->h2 = 0x98badcfe;
+ hd->h3 = 0x10325476;
+ hd->h4 = 0xc3d2e1f0;
+
+ hd->bctx.nblocks = 0;
+ hd->bctx.nblocks_high = 0;
+ hd->bctx.count = 0;
+ hd->bctx.blocksize_shift = _gcry_ctz(64);
+
+ /* Order of feature checks is important here; last match will be
+ * selected. Keep slower implementations at the top and faster at
+ * the bottom. */
+ hd->bctx.bwrite = do_transform_generic;
+#ifdef USE_SSSE3
+ if ((features & HWF_INTEL_SSSE3) != 0)
+ hd->bctx.bwrite = do_sha1_transform_amd64_ssse3;
+#endif
+#ifdef USE_AVX
+ /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs.
+ * Therefore use this implementation on Intel CPUs only. */
+ if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD))
+ hd->bctx.bwrite = do_sha1_transform_amd64_avx;
+#endif
+#ifdef USE_BMI2
+ if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_BMI2))
+ hd->bctx.bwrite = do_sha1_transform_amd64_avx_bmi2;
+#endif
+#ifdef USE_AVX2
+ if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_AVX) &&
+ (features & HWF_INTEL_BMI2))
+ hd->bctx.bwrite = do_sha1_transform_amd64_avx2_bmi2;
+#endif
+#ifdef USE_SHAEXT
+ if ((features & HWF_INTEL_SHAEXT) && (features & HWF_INTEL_SSE4_1))
+ hd->bctx.bwrite = do_sha1_transform_intel_shaext;
+#endif
+#ifdef USE_NEON
+ if ((features & HWF_ARM_NEON) != 0)
+ hd->bctx.bwrite = do_sha1_transform_armv7_neon;
+#endif
+#ifdef USE_ARM_CE
+ if ((features & HWF_ARM_SHA1) != 0)
+ hd->bctx.bwrite = do_sha1_transform_armv8_ce;
+#endif
+#ifdef SHA1_USE_S390X_CRYPTO
+ hd->use_s390x_crypto = 0;
+ if ((features & HWF_S390X_MSA) != 0)
+ {
+ if ((kimd_query () & km_function_to_mask (KMID_FUNCTION_SHA1)) &&
+ (klmd_query () & km_function_to_mask (KMID_FUNCTION_SHA1)))
+ {
+ hd->bctx.bwrite = do_sha1_transform_s390x;
+ hd->use_s390x_crypto = 1;
+ }
+ }
+#endif
+
+ (void)features;
+}
+
+/*
+ * Initialize the context HD. This is used to prepare the use of
+ * _gcry_sha1_mixblock. WARNING: This is a special purpose function
+ * for exclusive use by random-csprng.c.
+ */
+void
+_gcry_sha1_mixblock_init (SHA1_CONTEXT *hd)
+{
+ sha1_init (hd, 0);
+}
+
+
+/* Round function macros. */
+#define K1 0x5A827999L
+#define K2 0x6ED9EBA1L
+#define K3 0x8F1BBCDCL
+#define K4 0xCA62C1D6L
+#define F1(x,y,z) ( z ^ ( x & ( y ^ z ) ) )
+#define F2(x,y,z) ( x ^ y ^ z )
+#define F3(x,y,z) ( ( x & y ) | ( z & ( x | y ) ) )
+#define F4(x,y,z) ( x ^ y ^ z )
+#define M(i) ( tm = x[ i &0x0f] \
+ ^ x[(i-14)&0x0f] \
+ ^ x[(i-8) &0x0f] \
+ ^ x[(i-3) &0x0f], \
+ (x[i&0x0f] = rol(tm, 1)))
+#define R(a,b,c,d,e,f,k,m) do { e += rol( a, 5 ) \
+ + f( b, c, d ) \
+ + k \
+ + m; \
+ b = rol( b, 30 ); \
+ } while(0)
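+
+/* These macros follow the SHA-1 specification (FIPS 180-4): F1 is the
+ * "Ch" function, F2 and F4 the parity function, F3 the "Maj" function;
+ * M(i) computes the message schedule word
+ *   W[i] = rol(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1)
+ * in a 16-word circular buffer, and R performs one round:
+ *   e += rol(a,5) + f(b,c,d) + k + w;  b = rol(b,30);
+ * with the roles of a..e rotating between rounds. */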
+
+/*
+ * Transform NBLOCKS of each 64 bytes (16 32-bit words) at DATA.
+ */
+static unsigned int
+do_transform_generic (void *ctx, const unsigned char *data, size_t nblks)
+{
+ SHA1_CONTEXT *hd = ctx;
+
+ do
+ {
+ const u32 *idata = (const void *)data;
+ u32 a, b, c, d, e; /* Local copies of the chaining variables. */
+ u32 tm; /* Helper. */
+ u32 x[16]; /* The array we work on. */
+
+#define I(i) (x[i] = buf_get_be32(idata + i))
+
+ /* Get the values of the chaining variables. */
+ a = hd->h0;
+ b = hd->h1;
+ c = hd->h2;
+ d = hd->h3;
+ e = hd->h4;
+
+ /* Transform. */
+ R( a, b, c, d, e, F1, K1, I( 0) );
+ R( e, a, b, c, d, F1, K1, I( 1) );
+ R( d, e, a, b, c, F1, K1, I( 2) );
+ R( c, d, e, a, b, F1, K1, I( 3) );
+ R( b, c, d, e, a, F1, K1, I( 4) );
+ R( a, b, c, d, e, F1, K1, I( 5) );
+ R( e, a, b, c, d, F1, K1, I( 6) );
+ R( d, e, a, b, c, F1, K1, I( 7) );
+ R( c, d, e, a, b, F1, K1, I( 8) );
+ R( b, c, d, e, a, F1, K1, I( 9) );
+ R( a, b, c, d, e, F1, K1, I(10) );
+ R( e, a, b, c, d, F1, K1, I(11) );
+ R( d, e, a, b, c, F1, K1, I(12) );
+ R( c, d, e, a, b, F1, K1, I(13) );
+ R( b, c, d, e, a, F1, K1, I(14) );
+ R( a, b, c, d, e, F1, K1, I(15) );
+ R( e, a, b, c, d, F1, K1, M(16) );
+ R( d, e, a, b, c, F1, K1, M(17) );
+ R( c, d, e, a, b, F1, K1, M(18) );
+ R( b, c, d, e, a, F1, K1, M(19) );
+ R( a, b, c, d, e, F2, K2, M(20) );
+ R( e, a, b, c, d, F2, K2, M(21) );
+ R( d, e, a, b, c, F2, K2, M(22) );
+ R( c, d, e, a, b, F2, K2, M(23) );
+ R( b, c, d, e, a, F2, K2, M(24) );
+ R( a, b, c, d, e, F2, K2, M(25) );
+ R( e, a, b, c, d, F2, K2, M(26) );
+ R( d, e, a, b, c, F2, K2, M(27) );
+ R( c, d, e, a, b, F2, K2, M(28) );
+ R( b, c, d, e, a, F2, K2, M(29) );
+ R( a, b, c, d, e, F2, K2, M(30) );
+ R( e, a, b, c, d, F2, K2, M(31) );
+ R( d, e, a, b, c, F2, K2, M(32) );
+ R( c, d, e, a, b, F2, K2, M(33) );
+ R( b, c, d, e, a, F2, K2, M(34) );
+ R( a, b, c, d, e, F2, K2, M(35) );
+ R( e, a, b, c, d, F2, K2, M(36) );
+ R( d, e, a, b, c, F2, K2, M(37) );
+ R( c, d, e, a, b, F2, K2, M(38) );
+ R( b, c, d, e, a, F2, K2, M(39) );
+ R( a, b, c, d, e, F3, K3, M(40) );
+ R( e, a, b, c, d, F3, K3, M(41) );
+ R( d, e, a, b, c, F3, K3, M(42) );
+ R( c, d, e, a, b, F3, K3, M(43) );
+ R( b, c, d, e, a, F3, K3, M(44) );
+ R( a, b, c, d, e, F3, K3, M(45) );
+ R( e, a, b, c, d, F3, K3, M(46) );
+ R( d, e, a, b, c, F3, K3, M(47) );
+ R( c, d, e, a, b, F3, K3, M(48) );
+ R( b, c, d, e, a, F3, K3, M(49) );
+ R( a, b, c, d, e, F3, K3, M(50) );
+ R( e, a, b, c, d, F3, K3, M(51) );
+ R( d, e, a, b, c, F3, K3, M(52) );
+ R( c, d, e, a, b, F3, K3, M(53) );
+ R( b, c, d, e, a, F3, K3, M(54) );
+ R( a, b, c, d, e, F3, K3, M(55) );
+ R( e, a, b, c, d, F3, K3, M(56) );
+ R( d, e, a, b, c, F3, K3, M(57) );
+ R( c, d, e, a, b, F3, K3, M(58) );
+ R( b, c, d, e, a, F3, K3, M(59) );
+ R( a, b, c, d, e, F4, K4, M(60) );
+ R( e, a, b, c, d, F4, K4, M(61) );
+ R( d, e, a, b, c, F4, K4, M(62) );
+ R( c, d, e, a, b, F4, K4, M(63) );
+ R( b, c, d, e, a, F4, K4, M(64) );
+ R( a, b, c, d, e, F4, K4, M(65) );
+ R( e, a, b, c, d, F4, K4, M(66) );
+ R( d, e, a, b, c, F4, K4, M(67) );
+ R( c, d, e, a, b, F4, K4, M(68) );
+ R( b, c, d, e, a, F4, K4, M(69) );
+ R( a, b, c, d, e, F4, K4, M(70) );
+ R( e, a, b, c, d, F4, K4, M(71) );
+ R( d, e, a, b, c, F4, K4, M(72) );
+ R( c, d, e, a, b, F4, K4, M(73) );
+ R( b, c, d, e, a, F4, K4, M(74) );
+ R( a, b, c, d, e, F4, K4, M(75) );
+ R( e, a, b, c, d, F4, K4, M(76) );
+ R( d, e, a, b, c, F4, K4, M(77) );
+ R( c, d, e, a, b, F4, K4, M(78) );
+ R( b, c, d, e, a, F4, K4, M(79) );
+
+ /* Update the chaining variables. */
+ hd->h0 += a;
+ hd->h1 += b;
+ hd->h2 += c;
+ hd->h3 += d;
+ hd->h4 += e;
+
+ data += 64;
+ }
+ while (--nblks);
+
+ return 88+4*sizeof(void*);
+}
+
+
+/*
+ * Apply the SHA-1 transform function on the buffer BLOCKOF64BYTE
+ * which must have a length of 64 bytes.  BLOCKOF64BYTE must be 32-bit
+ * aligned. Updates the 20 bytes in BLOCKOF64BYTE with its mixed
+ * content. Returns the number of bytes which should be burned on the
+ * stack. You need to use _gcry_sha1_mixblock_init to initialize the
+ * context.
+ * WARNING: This is a special purpose function for exclusive use by
+ * random-csprng.c.
+ */
+unsigned int
+_gcry_sha1_mixblock (SHA1_CONTEXT *hd, void *blockof64byte)
+{
+ u32 *p = blockof64byte;
+ unsigned int nburn;
+
+ nburn = (*hd->bctx.bwrite) (hd, blockof64byte, 1);
+ p[0] = hd->h0;
+ p[1] = hd->h1;
+ p[2] = hd->h2;
+ p[3] = hd->h3;
+ p[4] = hd->h4;
+
+ return nburn;
+}
+
+
+/* The routine 'final' terminates the computation and
+ * returns the digest.
+ * The handle is prepared for a new cycle, but adding bytes to the
+ * handle will destroy the returned buffer.
+ * Returns: 20 bytes representing the digest.
+ */
+
+static void
+sha1_final(void *context)
+{
+ SHA1_CONTEXT *hd = context;
+ u32 t, th, msb, lsb;
+ unsigned char *p;
+ unsigned int burn;
+
+ t = hd->bctx.nblocks;
+ if (sizeof t == sizeof hd->bctx.nblocks)
+ th = hd->bctx.nblocks_high;
+ else
+ th = hd->bctx.nblocks >> 32;
+
+ /* multiply by 64 to make a byte count */
+ lsb = t << 6;
+ msb = (th << 6) | (t >> 26);
+ /* add the count */
+ t = lsb;
+ if( (lsb += hd->bctx.count) < t )
+ msb++;
+ /* multiply by 8 to make a bit count */
+ t = lsb;
+ lsb <<= 3;
+ msb <<= 3;
+ msb |= t >> 29;
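+  /* For illustration: one full block plus 3 buffered bytes (nblocks = 1,
+   * count = 3) is 67 message bytes, so the code above yields
+   * lsb = 67 * 8 = 536 bits and msb = 0. */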
+
+ if (0)
+ { }
+#ifdef SHA1_USE_S390X_CRYPTO
+ else if (hd->use_s390x_crypto)
+ {
+ burn = do_sha1_final_s390x (hd, hd->bctx.buf, hd->bctx.count, msb, lsb);
+ }
+#endif
+ else if (hd->bctx.count < 56) /* enough room */
+ {
+ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */
+ if (hd->bctx.count < 56)
+ memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count);
+
+ /* append the 64 bit count */
+ buf_put_be32(hd->bctx.buf + 56, msb);
+ buf_put_be32(hd->bctx.buf + 60, lsb);
+ burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 1 );
+ }
+ else /* need one extra block */
+ {
+ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */
+ /* fill pad and next block with zeroes */
+ memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56);
+
+ /* append the 64 bit count */
+ buf_put_be32(hd->bctx.buf + 64 + 56, msb);
+ buf_put_be32(hd->bctx.buf + 64 + 60, lsb);
+ burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 2 );
+ }
+
+ p = hd->bctx.buf;
+#define X(a) do { buf_put_be32(p, hd->h##a); p += 4; } while(0)
+ X(0);
+ X(1);
+ X(2);
+ X(3);
+ X(4);
+#undef X
+
+ hd->bctx.count = 0;
+
+ _gcry_burn_stack (burn);
+}
+
+static unsigned char *
+sha1_read( void *context )
+{
+ SHA1_CONTEXT *hd = context;
+
+ return hd->bctx.buf;
+}
+
+/****************
+ * Shortcut function which puts the hash value of the supplied buffer
+ * into outbuf, which must have a size of 20 bytes.
+ */
+void
+_gcry_sha1_hash_buffer (void *outbuf, const void *buffer, size_t length)
+{
+ SHA1_CONTEXT hd;
+
+ sha1_init (&hd, 0);
+ _gcry_md_block_write (&hd, buffer, length);
+ sha1_final (&hd);
+ memcpy (outbuf, hd.bctx.buf, 20);
+}
+
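+/* For example (illustrative only), hashing the "abc" test vector from the
+ * top of this file:
+ *
+ *   unsigned char digest[20];
+ *   _gcry_sha1_hash_buffer (digest, "abc", 3);
+ *
+ * leaves A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D in digest. */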
+
+/* Variant of the above shortcut function using multiple buffers. */
+void
+_gcry_sha1_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt)
+{
+ SHA1_CONTEXT hd;
+
+ sha1_init (&hd, 0);
+ for (;iovcnt > 0; iov++, iovcnt--)
+ _gcry_md_block_write (&hd,
+ (const char*)iov[0].data + iov[0].off, iov[0].len);
+ sha1_final (&hd);
+ memcpy (outbuf, hd.bctx.buf, 20);
+}
+
+
+
+/*
+ Self-test section.
+ */
+
+
+static gpg_err_code_t
+selftests_sha1 (int extended, selftest_report_func_t report)
+{
+ const char *what;
+ const char *errtxt;
+
+ what = "short string";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA1, 0,
+ "abc", 3,
+ "\xA9\x99\x3E\x36\x47\x06\x81\x6A\xBA\x3E"
+ "\x25\x71\x78\x50\xC2\x6C\x9C\xD0\xD8\x9D", 20);
+ if (errtxt)
+ goto failed;
+
+ if (extended)
+ {
+ what = "long string";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA1, 0,
+ "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 56,
+ "\x84\x98\x3E\x44\x1C\x3B\xD2\x6E\xBA\xAE"
+ "\x4A\xA1\xF9\x51\x29\xE5\xE5\x46\x70\xF1", 20);
+ if (errtxt)
+ goto failed;
+
+ what = "one million \"a\"";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA1, 1,
+ NULL, 0,
+ "\x34\xAA\x97\x3C\xD4\xC4\xDA\xA4\xF6\x1E"
+ "\xEB\x2B\xDB\xAD\x27\x31\x65\x34\x01\x6F", 20);
+ if (errtxt)
+ goto failed;
+ }
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("digest", GCRY_MD_SHA1, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+/* Run a full self-test for ALGO and return 0 on success. */
+static gpg_err_code_t
+run_selftests (int algo, int extended, selftest_report_func_t report)
+{
+ gpg_err_code_t ec;
+
+ switch (algo)
+ {
+ case GCRY_MD_SHA1:
+ ec = selftests_sha1 (extended, report);
+ break;
+ default:
+ ec = GPG_ERR_DIGEST_ALGO;
+ break;
+
+ }
+ return ec;
+}
+
+
+
+
+static unsigned char asn[15] = /* Object ID is 1.3.14.3.2.26 */
+ { 0x30, 0x21, 0x30, 0x09, 0x06, 0x05, 0x2b, 0x0e, 0x03,
+ 0x02, 0x1a, 0x05, 0x00, 0x04, 0x14 };
+
+static gcry_md_oid_spec_t oid_spec_sha1[] =
+ {
+ /* iso.member-body.us.rsadsi.pkcs.pkcs-1.5 (sha1WithRSAEncryption) */
+ { "1.2.840.113549.1.1.5" },
+ /* iso.member-body.us.x9-57.x9cm.3 (dsaWithSha1)*/
+ { "1.2.840.10040.4.3" },
+ /* from NIST's OIW (sha1) */
+ { "1.3.14.3.2.26" },
+ /* from NIST OIW (sha-1WithRSAEncryption) */
+ { "1.3.14.3.2.29" },
+ /* iso.member-body.us.ansi-x9-62.signatures.ecdsa-with-sha1 */
+ { "1.2.840.10045.4.1" },
+ { NULL },
+ };
+
+gcry_md_spec_t _gcry_digest_spec_sha1 =
+ {
+ GCRY_MD_SHA1, {0, 1},
+ "SHA1", asn, DIM (asn), oid_spec_sha1, 20,
+ sha1_init, _gcry_md_block_write, sha1_final, sha1_read, NULL,
+ _gcry_sha1_hash_buffer, _gcry_sha1_hash_buffers,
+ sizeof (SHA1_CONTEXT),
+ run_selftests
+ };
diff --git a/comm/third_party/libgcrypt/cipher/sha1.h b/comm/third_party/libgcrypt/cipher/sha1.h
new file mode 100644
index 0000000000..a359765847
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha1.h
@@ -0,0 +1,47 @@
+/* sha1.h - SHA-1 context definition
+ * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef GCRY_SHA1_H
+#define GCRY_SHA1_H
+
+#include "hash-common.h"
+
+
+/* SHA1_USE_S390X_CRYPTO indicates whether to enable zSeries code. */
+#undef SHA1_USE_S390X_CRYPTO
+#if defined(HAVE_GCC_INLINE_ASM_S390X)
+# define SHA1_USE_S390X_CRYPTO 1
+#endif /* SHA1_USE_S390X_CRYPTO */
+
+
+/* We need this here for direct use by random-csprng.c. */
+typedef struct
+{
+ gcry_md_block_ctx_t bctx;
+ u32 h0,h1,h2,h3,h4;
+#ifdef SHA1_USE_S390X_CRYPTO
+ u32 final_len_msb, final_len_lsb; /* needs to be right after h4. */
+ int use_s390x_crypto;
+#endif
+} SHA1_CONTEXT;
+
+
+void _gcry_sha1_mixblock_init (SHA1_CONTEXT *hd);
+unsigned int _gcry_sha1_mixblock (SHA1_CONTEXT *hd, void *blockof64byte);
+
+#endif /*GCRY_SHA1_H*/
diff --git a/comm/third_party/libgcrypt/cipher/sha256-armv8-aarch32-ce.S b/comm/third_party/libgcrypt/cipher/sha256-armv8-aarch32-ce.S
new file mode 100644
index 0000000000..2b17ab1b17
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha256-armv8-aarch32-ce.S
@@ -0,0 +1,231 @@
+/* sha256-armv8-aarch32-ce.S - ARM/CE accelerated SHA-256 transform function
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO) && defined(USE_SHA256)
+
+.syntax unified
+.arch armv8-a
+.fpu crypto-neon-fp-armv8
+.arm
+
+.text
+
+#ifdef __PIC__
+# define GET_DATA_POINTER(reg, name, rtmp) \
+ ldr reg, 1f; \
+ ldr rtmp, 2f; \
+ b 3f; \
+ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+ 2: .word name(GOT); \
+ 3: add reg, pc, reg; \
+ ldr reg, [reg, rtmp];
+#else
+# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+
+/* Constants */
+
+.align 4
+gcry_sha256_aarch32_ce_K:
+.LK:
+ .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+
+/* Register macros */
+
+#define qH0123 q0
+#define qH4567 q1
+
+#define qABCD0 q2
+#define qABCD1 q3
+#define qEFGH q4
+
+#define qT0 q5
+#define qT1 q6
+
+#define qW0 q8
+#define qW1 q9
+#define qW2 q10
+#define qW3 q11
+
+#define qK0 q12
+#define qK1 q13
+#define qK2 q14
+#define qK3 q15
+
+
+/* Round macros */
+
+#define _(...) /*_*/
+
+#define do_loadk(nk0, nk1) vld1.32 {nk0-nk1},[lr]!;
+#define do_add(a, b) vadd.u32 a, a, b;
+#define do_sha256su0(w0, w1) sha256su0.32 w0, w1;
+#define do_sha256su1(w0, w2, w3) sha256su1.32 w0, w2, w3;
+
+#define do_rounds(k, nk0, nk1, w0, w1, w2, w3, loadk_fn, add_fn, su0_fn, su1_fn) \
+ loadk_fn( nk0, nk1 ); \
+ su0_fn( w0, w1 ); \
+ vmov qABCD1, qABCD0; \
+ sha256h.32 qABCD0, qEFGH, k; \
+ sha256h2.32 qEFGH, qABCD1, k; \
+ add_fn( nk0, w2 ); \
+ su1_fn( w0, w2, w3 );
+
+
+/* Other functional macros */
+
+#define CLEAR_REG(reg) veor reg, reg;
+
+
+/*
+ * unsigned int
+ * _gcry_sha256_transform_armv8_ce (u32 state[8], const void *input_data,
+ * size_t num_blks)
+ */
+.align 3
+.globl _gcry_sha256_transform_armv8_ce
+.type _gcry_sha256_transform_armv8_ce,%function;
+_gcry_sha256_transform_armv8_ce:
+ /* input:
+ * r0: ctx, CTX
+ * r1: data (64*nblks bytes)
+ * r2: nblks
+ */
+
+ cmp r2, #0;
+ push {r4,lr};
+ beq .Ldo_nothing;
+
+ vpush {q4-q7};
+
+ GET_DATA_POINTER(r4, .LK, lr);
+ mov lr, r4
+
+ vld1.32 {qH0123-qH4567}, [r0] /* load state */
+
+ vld1.8 {qW0-qW1}, [r1]!
+ do_loadk(qK0, qK1)
+ vld1.8 {qW2-qW3}, [r1]!
+ vmov qABCD0, qH0123
+ vmov qEFGH, qH4567
+
+ vrev32.8 qW0, qW0
+ vrev32.8 qW1, qW1
+ vrev32.8 qW2, qW2
+ do_add(qK0, qW0)
+ vrev32.8 qW3, qW3
+ do_add(qK1, qW1)
+
+.Loop:
+ do_rounds(qK0, qK2, qK3, qW0, qW1, qW2, qW3, do_loadk, do_add, do_sha256su0, do_sha256su1)
+ subs r2,r2,#1
+ do_rounds(qK1, qK3, _ , qW1, qW2, qW3, qW0, _ , do_add, do_sha256su0, do_sha256su1)
+ do_rounds(qK2, qK0, qK1, qW2, qW3, qW0, qW1, do_loadk, do_add, do_sha256su0, do_sha256su1)
+ do_rounds(qK3, qK1, _ , qW3, qW0, qW1, qW2, _ , do_add, do_sha256su0, do_sha256su1)
+
+ do_rounds(qK0, qK2, qK3, qW0, qW1, qW2, qW3, do_loadk, do_add, do_sha256su0, do_sha256su1)
+ do_rounds(qK1, qK3, _ , qW1, qW2, qW3, qW0, _ , do_add, do_sha256su0, do_sha256su1)
+ do_rounds(qK2, qK0, qK1, qW2, qW3, qW0, qW1, do_loadk, do_add, do_sha256su0, do_sha256su1)
+ do_rounds(qK3, qK1, _ , qW3, qW0, qW1, qW2, _ , do_add, do_sha256su0, do_sha256su1)
+
+ do_rounds(qK0, qK2, qK3, qW0, qW1, qW2, qW3, do_loadk, do_add, do_sha256su0, do_sha256su1)
+ do_rounds(qK1, qK3, _ , qW1, qW2, qW3, qW0, _ , do_add, do_sha256su0, do_sha256su1)
+ do_rounds(qK2, qK0, qK1, qW2, qW3, qW0, qW1, do_loadk, do_add, do_sha256su0, do_sha256su1)
+ do_rounds(qK3, qK1, _ , qW3, qW0, qW1, qW2, _ , do_add, do_sha256su0, do_sha256su1)
+
+ beq .Lend
+
+ do_rounds(qK0, qK2, qK3, qW0, _ , qW2, qW3, do_loadk, do_add, _, _)
+ vld1.8 {qW0}, [r1]!
+ mov lr, r4
+ do_rounds(qK1, qK3, _ , qW1, _ , qW3, _ , _ , do_add, _, _)
+ vld1.8 {qW1}, [r1]!
+ vrev32.8 qW0, qW0
+ do_rounds(qK2, qK0, qK1, qW2, _ , qW0, _ , do_loadk, do_add, _, _)
+ vrev32.8 qW1, qW1
+ vld1.8 {qW2}, [r1]!
+ do_rounds(qK3, qK1, _ , qW3, _ , qW1, _ , _ , do_add, _, _)
+ vld1.8 {qW3}, [r1]!
+
+ vadd.u32 qH0123, qABCD0
+ vadd.u32 qH4567, qEFGH
+
+ vrev32.8 qW2, qW2
+ vmov qABCD0, qH0123
+ vrev32.8 qW3, qW3
+ vmov qEFGH, qH4567
+
+ b .Loop
+
+.Lend:
+
+ do_rounds(qK0, qK2, qK3, qW0, _ , qW2, qW3, do_loadk, do_add, _, _)
+ do_rounds(qK1, qK3, _ , qW1, _ , qW3, _ , _ , do_add, _, _)
+ do_rounds(qK2, _ , _ , qW2, _ , _ , _ , _ , _, _, _)
+ do_rounds(qK3, _ , _ , qW3, _ , _ , _ , _ , _, _, _)
+
+ CLEAR_REG(qW0)
+ CLEAR_REG(qW1)
+ CLEAR_REG(qW2)
+ CLEAR_REG(qW3)
+ CLEAR_REG(qK0)
+ CLEAR_REG(qK1)
+ CLEAR_REG(qK2)
+ CLEAR_REG(qK3)
+
+ vadd.u32 qH0123, qABCD0
+ vadd.u32 qH4567, qEFGH
+
+ CLEAR_REG(qABCD0)
+ CLEAR_REG(qABCD1)
+ CLEAR_REG(qEFGH)
+
+ vst1.32 {qH0123-qH4567}, [r0] /* store state */
+
+ CLEAR_REG(qH0123)
+ CLEAR_REG(qH4567)
+ vpop {q4-q7}
+
+.Ldo_nothing:
+ mov r0, #0
+ pop {r4,pc}
+.size _gcry_sha256_transform_armv8_ce,.-_gcry_sha256_transform_armv8_ce;
+
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/sha256-armv8-aarch64-ce.S b/comm/third_party/libgcrypt/cipher/sha256-armv8-aarch64-ce.S
new file mode 100644
index 0000000000..f57cae290b
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha256-armv8-aarch64-ce.S
@@ -0,0 +1,215 @@
+/* sha256-armv8-aarch64-ce.S - ARM/CE accelerated SHA-256 transform function
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && defined(USE_SHA256)
+
+.cpu generic+simd+crypto
+
+.text
+
+
+/* Constants */
+
+.align 4
+gcry_sha256_aarch64_ce_K:
+.LK:
+ .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+
+/* Register macros */
+
+#define vH0123 v0
+#define vH4567 v1
+
+#define vABCD0 v2
+#define qABCD0 q2
+#define vABCD1 v3
+#define qABCD1 q3
+#define vEFGH v4
+#define qEFGH q4
+
+#define vT0 v5
+#define vT1 v6
+
+#define vW0 v16
+#define vW1 v17
+#define vW2 v18
+#define vW3 v19
+
+#define vK0 v20
+#define vK1 v21
+#define vK2 v22
+#define vK3 v23
+
+
+/* Round macros */
+
+#define _(...) /*_*/
+
+#define do_loadk(nk0, nk1) ld1 {nk0.16b-nk1.16b},[x3],#32;
+#define do_add(a, b) add a.4s, a.4s, b.4s;
+#define do_sha256su0(w0, w1) sha256su0 w0.4s, w1.4s;
+#define do_sha256su1(w0, w2, w3) sha256su1 w0.4s, w2.4s, w3.4s;
+
+#define do_rounds(k, nk0, nk1, w0, w1, w2, w3, loadk_fn, add_fn, su0_fn, su1_fn) \
+ loadk_fn( v##nk0, v##nk1 ); \
+ su0_fn( v##w0, v##w1 ); \
+ mov vABCD1.16b, vABCD0.16b; \
+ sha256h qABCD0, qEFGH, v##k.4s; \
+ sha256h2 qEFGH, qABCD1, v##k.4s; \
+ add_fn( v##nk0, v##w2 ); \
+ su1_fn( v##w0, v##w2, v##w3 );
+
+
+/* Other functional macros */
+
+#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b;
+
+
+/*
+ * unsigned int
+ * _gcry_sha256_transform_armv8_ce (u32 state[8], const void *input_data,
+ * size_t num_blks)
+ */
+.align 3
+.globl _gcry_sha256_transform_armv8_ce
+ELF(.type _gcry_sha256_transform_armv8_ce,%function;)
+_gcry_sha256_transform_armv8_ce:
+ /* input:
+ * r0: ctx, CTX
+ * r1: data (64*nblks bytes)
+ * r2: nblks
+ */
+ CFI_STARTPROC();
+
+ cbz x2, .Ldo_nothing;
+
+ GET_DATA_POINTER(x3, .LK);
+ mov x4, x3
+
+ ld1 {vH0123.4s-vH4567.4s}, [x0] /* load state */
+
+ ld1 {vW0.16b-vW1.16b}, [x1], #32
+ do_loadk(vK0, vK1)
+ ld1 {vW2.16b-vW3.16b}, [x1], #32
+ mov vABCD0.16b, vH0123.16b
+ mov vEFGH.16b, vH4567.16b
+
+ rev32 vW0.16b, vW0.16b
+ rev32 vW1.16b, vW1.16b
+ rev32 vW2.16b, vW2.16b
+ do_add(vK0, vW0)
+ rev32 vW3.16b, vW3.16b
+ do_add(vK1, vW1)
+
+.Loop:
+ do_rounds(K0, K2, K3, W0, W1, W2, W3, do_loadk, do_add, do_sha256su0, do_sha256su1)
+ sub x2,x2,#1
+ do_rounds(K1, K3, _ , W1, W2, W3, W0, _ , do_add, do_sha256su0, do_sha256su1)
+ do_rounds(K2, K0, K1, W2, W3, W0, W1, do_loadk, do_add, do_sha256su0, do_sha256su1)
+ do_rounds(K3, K1, _ , W3, W0, W1, W2, _ , do_add, do_sha256su0, do_sha256su1)
+
+ do_rounds(K0, K2, K3, W0, W1, W2, W3, do_loadk, do_add, do_sha256su0, do_sha256su1)
+ do_rounds(K1, K3, _ , W1, W2, W3, W0, _ , do_add, do_sha256su0, do_sha256su1)
+ do_rounds(K2, K0, K1, W2, W3, W0, W1, do_loadk, do_add, do_sha256su0, do_sha256su1)
+ do_rounds(K3, K1, _ , W3, W0, W1, W2, _ , do_add, do_sha256su0, do_sha256su1)
+
+ do_rounds(K0, K2, K3, W0, W1, W2, W3, do_loadk, do_add, do_sha256su0, do_sha256su1)
+ do_rounds(K1, K3, _ , W1, W2, W3, W0, _ , do_add, do_sha256su0, do_sha256su1)
+ do_rounds(K2, K0, K1, W2, W3, W0, W1, do_loadk, do_add, do_sha256su0, do_sha256su1)
+ do_rounds(K3, K1, _ , W3, W0, W1, W2, _ , do_add, do_sha256su0, do_sha256su1)
+
+ cbz x2, .Lend
+
+ do_rounds(K0, K2, K3, W0, _ , W2, W3, do_loadk, do_add, _, _)
+ ld1 {vW0.16b}, [x1], #16
+ mov x3, x4
+ do_rounds(K1, K3, _ , W1, _ , W3, _ , _ , do_add, _, _)
+ ld1 {vW1.16b}, [x1], #16
+ rev32 vW0.16b, vW0.16b
+ do_rounds(K2, K0, K1, W2, _ , W0, _ , do_loadk, do_add, _, _)
+ rev32 vW1.16b, vW1.16b
+ ld1 {vW2.16b}, [x1], #16
+ do_rounds(K3, K1, _ , W3, _ , W1, _ , _ , do_add, _, _)
+ ld1 {vW3.16b}, [x1], #16
+
+ do_add(vH0123, vABCD0)
+ do_add(vH4567, vEFGH)
+
+ rev32 vW2.16b, vW2.16b
+ mov vABCD0.16b, vH0123.16b
+ rev32 vW3.16b, vW3.16b
+ mov vEFGH.16b, vH4567.16b
+
+ b .Loop
+
+.Lend:
+
+ do_rounds(K0, K2, K3, W0, _ , W2, W3, do_loadk, do_add, _, _)
+ do_rounds(K1, K3, _ , W1, _ , W3, _ , _ , do_add, _, _)
+ do_rounds(K2, _ , _ , W2, _ , _ , _ , _ , _, _, _)
+ do_rounds(K3, _ , _ , W3, _ , _ , _ , _ , _, _, _)
+
+ CLEAR_REG(vW0)
+ CLEAR_REG(vW1)
+ CLEAR_REG(vW2)
+ CLEAR_REG(vW3)
+ CLEAR_REG(vK0)
+ CLEAR_REG(vK1)
+ CLEAR_REG(vK2)
+ CLEAR_REG(vK3)
+
+ do_add(vH0123, vABCD0)
+ do_add(vH4567, vEFGH)
+
+ CLEAR_REG(vABCD0)
+ CLEAR_REG(vABCD1)
+ CLEAR_REG(vEFGH)
+
+ st1 {vH0123.4s-vH4567.4s}, [x0] /* store state */
+
+ CLEAR_REG(vH0123)
+ CLEAR_REG(vH4567)
+
+.Ldo_nothing:
+ mov x0, #0
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_sha256_transform_armv8_ce,.-_gcry_sha256_transform_armv8_ce;)
+
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/sha256-avx-amd64.S b/comm/third_party/libgcrypt/cipher/sha256-avx-amd64.S
new file mode 100644
index 0000000000..ec945f8473
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha256-avx-amd64.S
@@ -0,0 +1,506 @@
+/*
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright (c) 2012, Intel Corporation
+;
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are
+; met:
+;
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the
+; distribution.
+;
+; * Neither the name of the Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived from
+; this software without specific prior written permission.
+;
+;
+; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
+; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; This code is described in an Intel White-Paper:
+; "Fast SHA-256 Implementations on Intel Architecture Processors"
+;
+; To find it, surf to http://www.intel.com/p/en_US/embedded
+; and search for that title.
+; The paper is expected to be released roughly at the end of April, 2012
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; This code schedules 1 blocks at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+*/
+/*
+ * Conversion to GAS assembly and integration to libgcrypt
+ * by Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * Note: Based on the SSSE3 implementation.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA256)
+
+#include "asm-common-amd64.h"
+
+.intel_syntax noprefix
+
+#define VMOVDQ vmovdqu /* assume buffers not aligned */
+
+#define ROR(p1, p2) \
+ /* shld is faster than ror on Intel Sandybridge */ \
+ shld p1, p1, (32 - p2);
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros*/
+
+/* addm [mem], reg
+ * Add reg to mem using reg-mem add and store */
+#define addm(p1, p2) \
+ add p2, p1; \
+ mov p1, p2;
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+/* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+ * Load xmm with mem and byte swap each dword */
+#define COPY_XMM_AND_BSWAP(p1, p2, p3) \
+ VMOVDQ p1, p2; \
+ vpshufb p1, p1, p3;
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+#define X0 xmm4
+#define X1 xmm5
+#define X2 xmm6
+#define X3 xmm7
+
+#define XTMP0 xmm0
+#define XTMP1 xmm1
+#define XTMP2 xmm2
+#define XTMP3 xmm3
+#define XTMP4 xmm8
+#define XFER xmm9
+
+#define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */
+#define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */
+#define BYTE_FLIP_MASK xmm12
+
+#define NUM_BLKS rdx /* 3rd arg */
+#define CTX rsi /* 2nd arg */
+#define INP rdi /* 1st arg */
+
+#define SRND rdi /* clobbers INP */
+#define c ecx
+#define d r8d
+#define e edx
+
+#define TBL rbp
+#define a eax
+#define b ebx
+
+#define f r9d
+#define g r10d
+#define h r11d
+
+#define y0 r13d
+#define y1 r14d
+#define y2 r15d
+
+
+
+#define _INP_END_SIZE 8
+#define _INP_SIZE 8
+#define _XFER_SIZE 8
+#define _XMM_SAVE_SIZE 0
+/* STACK_SIZE plus pushes must be an odd multiple of 8 */
+#define _ALIGN_SIZE 8
+
+#define _INP_END 0
+#define _INP (_INP_END + _INP_END_SIZE)
+#define _XFER (_INP + _INP_SIZE)
+#define _XMM_SAVE (_XFER + _XFER_SIZE + _ALIGN_SIZE)
+#define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE)
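+
+/* Editorial note (not part of libgcrypt): with the sizes above, STACK_SIZE
+ * (32 bytes) plus the five register pushes in the prologue (40 bytes) is 72,
+ * an odd multiple of 8; adding the 8-byte return address gives a multiple of
+ * 16, so rsp is 16-byte aligned after "sub rsp, STACK_SIZE" and the aligned
+ * vmovdqa stores to [rsp + _XFER] work as intended. */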
+
+
+#define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ /* compute s0 four at a time and s1 two at a time */; \
+ /* compute W[-16] + W[-7] 4 at a time */; \
+ mov y0, e /* y0 = e */; \
+ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */; \
+ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ xor y2, g /* y2 = f^g */; \
+ vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ /* compute s0 */; \
+ vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ vpslld XTMP2, XTMP1, (32-7); \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ vpsrld XTMP3, XTMP1, 7; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ vpor XTMP3, XTMP3, XTMP2 /* XTMP1 = W[-15] ror 7 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
+ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
+
+#define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ mov y0, e /* y0 = e */; \
+ mov y1, a /* y1 = a */; \
+ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
+ vpslld XTMP2, XTMP1, (32-18); \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ xor y2, g /* y2 = f^g */; \
+ vpsrld XTMP4, XTMP1, 18; \
+ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ vpxor XTMP4, XTMP4, XTMP3; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ vpsrld XTMP1, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */; \
+ ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ vpxor XTMP1, XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ vpxor XTMP1, XTMP1, XTMP4 /* XTMP1 = s0 */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ /* compute low s1 */; \
+ vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
+ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
+
+#define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ mov y0, e /* y0 = e */; \
+ mov y1, a /* y1 = a */; \
+ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
+ mov y2, f /* y2 = f */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */; \
+ xor y2, g /* y2 = f^g */; \
+ vpsrlq XTMP4, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ vpsrld XTMP2, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */; \
+ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ vpxor XTMP2, XTMP2, XTMP3; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */; \
+ vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ /* compute high s1 */; \
+ vpshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
+ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
+
+#define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ mov y0, e /* y0 = e */; \
+ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ xor y2, g /* y2 = f^g */; \
+ vpsrlq X0, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ vpsrld XTMP2, XTMP2, 10 /* X0 = W[-2] >> 10 {DDCC} */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ vpxor XTMP2, XTMP2, XTMP3; \
+ ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */; \
+ vpxor X0, X0, XTMP2 /* X0 = s1 {xDxC} */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ vpshufb X0, X0, SHUF_DC00 /* X0 = s1 {DC00} */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ vpaddd X0, X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
+ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
+
+#define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \
+ FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \
+ FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \
+ FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e);
+
+/* input is [rsp + _XFER + %1 * 4] */
+#define DO_ROUND(i1, a, b, c, d, e, f, g, h) \
+ mov y0, e /* y0 = e */; \
+ ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
+ mov y2, f /* y2 = f */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ xor y2, g /* y2 = f^g */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ ROR( y0, 6) /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, [rsp + _XFER + i1 * 4] /* y2 = k + w + S1 + CH */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
+ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
+
+/*
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
+;; arg 1 : pointer to input data
+;; arg 2 : pointer to digest
+;; arg 3 : Num blocks
+*/
+.text
+.globl _gcry_sha256_transform_amd64_avx
+ELF(.type _gcry_sha256_transform_amd64_avx,@function;)
+.align 16
+_gcry_sha256_transform_amd64_avx:
+ CFI_STARTPROC()
+ vzeroupper
+
+ push rbx
+ CFI_PUSH(rbx)
+ push rbp
+ CFI_PUSH(rbp)
+ push r13
+ CFI_PUSH(r13)
+ push r14
+ CFI_PUSH(r14)
+ push r15
+ CFI_PUSH(r15)
+
+ sub rsp, STACK_SIZE
+ CFI_ADJUST_CFA_OFFSET(STACK_SIZE);
+
+ shl NUM_BLKS, 6 /* convert to bytes */
+ jz .Ldone_hash
+ add NUM_BLKS, INP /* pointer to end of data */
+ mov [rsp + _INP_END], NUM_BLKS
+
+ /* load initial digest */
+ mov a,[4*0 + CTX]
+ mov b,[4*1 + CTX]
+ mov c,[4*2 + CTX]
+ mov d,[4*3 + CTX]
+ mov e,[4*4 + CTX]
+ mov f,[4*5 + CTX]
+ mov g,[4*6 + CTX]
+ mov h,[4*7 + CTX]
+
+ vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP]
+ vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP]
+ vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP]
+
+.Loop0:
+ lea TBL, [.LK256 ADD_RIP]
+
+ /* byte swap first 16 dwords */
+ COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK)
+
+ mov [rsp + _INP], INP
+
+ /* schedule 48 input dwords, by doing 3 rounds of 16 each */
+ mov SRND, 3
+.align 16
+.Loop1:
+ vpaddd XFER, X0, [TBL + 0*16]
+ vmovdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h)
+
+ vpaddd XFER, X1, [TBL + 1*16]
+ vmovdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d)
+
+ vpaddd XFER, X2, [TBL + 2*16]
+ vmovdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h)
+
+ vpaddd XFER, X3, [TBL + 3*16]
+ vmovdqa [rsp + _XFER], XFER
+ add TBL, 4*16
+ FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d)
+
+ sub SRND, 1
+ jne .Loop1
+
+ mov SRND, 2
+.Loop2:
+ vpaddd X0, X0, [TBL + 0*16]
+ vmovdqa [rsp + _XFER], X0
+ DO_ROUND(0, a, b, c, d, e, f, g, h)
+ DO_ROUND(1, h, a, b, c, d, e, f, g)
+ DO_ROUND(2, g, h, a, b, c, d, e, f)
+ DO_ROUND(3, f, g, h, a, b, c, d, e)
+ vpaddd X1, X1, [TBL + 1*16]
+ vmovdqa [rsp + _XFER], X1
+ add TBL, 2*16
+ DO_ROUND(0, e, f, g, h, a, b, c, d)
+ DO_ROUND(1, d, e, f, g, h, a, b, c)
+ DO_ROUND(2, c, d, e, f, g, h, a, b)
+ DO_ROUND(3, b, c, d, e, f, g, h, a)
+
+ vmovdqa X0, X2
+ vmovdqa X1, X3
+
+ sub SRND, 1
+ jne .Loop2
+
+ addm([4*0 + CTX],a)
+ addm([4*1 + CTX],b)
+ addm([4*2 + CTX],c)
+ addm([4*3 + CTX],d)
+ addm([4*4 + CTX],e)
+ addm([4*5 + CTX],f)
+ addm([4*6 + CTX],g)
+ addm([4*7 + CTX],h)
+
+ mov INP, [rsp + _INP]
+ add INP, 64
+ cmp INP, [rsp + _INP_END]
+ jne .Loop0
+
+.Ldone_hash:
+ vzeroall
+
+ vmovdqa [rsp + _XFER], XFER
+ xor eax, eax
+
+ add rsp, STACK_SIZE
+ CFI_ADJUST_CFA_OFFSET(-STACK_SIZE);
+
+ pop r15
+ CFI_POP(r15)
+ pop r14
+ CFI_POP(r14)
+ pop r13
+ CFI_POP(r13)
+ pop rbp
+ CFI_POP(rbp)
+ pop rbx
+ CFI_POP(rbx)
+
+ ret
+ CFI_ENDPROC()
+
+
+.align 16
+.LK256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203
+
+/* shuffle xBxA -> 00BA */
+.L_SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+/* shuffle xDxC -> DC00 */
+.L_SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
+
+#endif
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/sha256-avx2-bmi2-amd64.S b/comm/third_party/libgcrypt/cipher/sha256-avx2-bmi2-amd64.S
new file mode 100644
index 0000000000..d130dd4a61
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha256-avx2-bmi2-amd64.S
@@ -0,0 +1,527 @@
+/*
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright (c) 2012, Intel Corporation
+;
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are
+; met:
+;
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the
+; distribution.
+;
+; * Neither the name of the Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived from
+; this software without specific prior written permission.
+;
+;
+; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
+; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; This code is described in an Intel White-Paper:
+; "Fast SHA-256 Implementations on Intel Architecture Processors"
+;
+; To find it, surf to http://www.intel.com/p/en_US/embedded
+; and search for that title.
+; The paper is expected to be released roughly at the end of April, 2012
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; This code schedules 2 blocks at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+*/
+/*
+ * Conversion to GAS assembly and integration to libgcrypt
+ * by Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \
+ defined(USE_SHA256)
+
+#include "asm-common-amd64.h"
+
+.intel_syntax noprefix
+
+#define VMOVDQ vmovdqu /* ; assume buffers not aligned */
+
+/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros */
+
+/* addm [mem], reg */
+/* Add reg to mem using reg-mem add and store */
+#define addm(p1, p2) \
+ add p2, p1; \
+ mov p1, p2;
+
+/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
+
+#define X0 ymm4
+#define X1 ymm5
+#define X2 ymm6
+#define X3 ymm7
+
+/* XMM versions of above */
+#define XWORD0 xmm4
+#define XWORD1 xmm5
+#define XWORD2 xmm6
+#define XWORD3 xmm7
+
+#define XTMP0 ymm0
+#define XTMP1 ymm1
+#define XTMP2 ymm2
+#define XTMP3 ymm3
+#define XTMP4 ymm8
+#define XFER ymm9
+#define XTMP5 ymm11
+
+#define SHUF_00BA ymm10 /* shuffle xBxA -> 00BA */
+#define SHUF_DC00 ymm12 /* shuffle xDxC -> DC00 */
+#define BYTE_FLIP_MASK ymm13
+
+#define X_BYTE_FLIP_MASK xmm13 /* XMM version of BYTE_FLIP_MASK */
+
+#define NUM_BLKS rdx /* 3rd arg */
+#define CTX rsi /* 2nd arg */
+#define INP rdi /* 1st arg */
+#define c ecx
+#define d r8d
+#define e edx /* clobbers NUM_BLKS */
+#define y3 edi /* clobbers INP */
+
+#define TBL rbp
+#define SRND CTX /* SRND is same register as CTX */
+
+#define a eax
+#define b ebx
+#define f r9d
+#define g r10d
+#define h r11d
+#define old_h r11d
+
+#define T1 r12d
+#define y0 r13d
+#define y1 r14d
+#define y2 r15d
+
+
+#define _XFER_SIZE 2*64*4 /* 2 blocks, 64 rounds, 4 bytes/round */
+#define _XMM_SAVE_SIZE 0
+#define _INP_END_SIZE 8
+#define _INP_SIZE 8
+#define _CTX_SIZE 8
+#define _RSP_SIZE 8
+
+#define _XFER 0
+#define _XMM_SAVE _XFER + _XFER_SIZE
+#define _INP_END _XMM_SAVE + _XMM_SAVE_SIZE
+#define _INP _INP_END + _INP_END_SIZE
+#define _CTX _INP + _INP_SIZE
+#define _RSP _CTX + _CTX_SIZE
+#define STACK_SIZE _RSP + _RSP_SIZE
+
+#define ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h) \
+ /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); */ \
+ /* d += h; */ \
+ /* h += Sum0 (a) + Maj (a, b, c); */ \
+ \
+ /* Ch(x, y, z) => ((x & y) + (~x & z)) */ \
+ /* Maj(x, y, z) => ((x & y) + (z & (x ^ y))) */ \
+ \
+ mov y3, e; \
+ add h, [XFERIN]; \
+ and y3, f; \
+ rorx y0, e, 25; \
+ rorx y1, e, 11; \
+ lea h, [h + y3]; \
+ andn y3, e, g; \
+ rorx T1, a, 13; \
+ xor y0, y1; \
+ lea h, [h + y3]
+
+#define ONE_ROUND_PART2(a, b, c, d, e, f, g, h) \
+ rorx y2, a, 22; \
+ rorx y1, e, 6; \
+ mov y3, a; \
+ xor T1, y2; \
+ xor y0, y1; \
+ xor y3, b; \
+ lea h, [h + y0]; \
+ mov y0, a; \
+ rorx y2, a, 2; \
+ add d, h; \
+ and y3, c; \
+ xor T1, y2; \
+ lea h, [h + y3]; \
+ lea h, [h + T1]; \
+ and y0, b; \
+ lea h, [h + y0]
+
+#define ONE_ROUND(XFER, a, b, c, d, e, f, g, h) \
+ ONE_ROUND_PART1(XFER, a, b, c, d, e, f, g, h); \
+ ONE_ROUND_PART2(a, b, c, d, e, f, g, h)
+
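+/* Editorial illustration (not part of libgcrypt): ONE_ROUND_PART1/2 above
+ * interleave one scalar SHA-256 round with the vector message schedule.
+ * Written as plain C, the round they implement (per the comments in
+ * ONE_ROUND_PART1, with k[t]+w[t] preloaded into the _XFER slot) is:
+ *
+ *   #define ROR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+ *
+ *   u32 sum1 = ROR32(e, 6) ^ ROR32(e, 11) ^ ROR32(e, 25);
+ *   u32 sum0 = ROR32(a, 2) ^ ROR32(a, 13) ^ ROR32(a, 22);
+ *   u32 ch   = (e & f) + (~e & g);       // andn computes ~e & g
+ *   u32 maj  = (a & b) + (c & (a ^ b));
+ *
+ *   h += sum1 + ch + k_plus_w;
+ *   d += h;
+ *   h += sum0 + maj;
+ */
+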
+#define FOUR_ROUNDS_AND_SCHED(XFERIN, XFEROUT, X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */; \
+ vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16]; y1 = (e >> 6); S1 */; \
+ vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */; \
+ vpsrld XTMP2, XTMP1, 7; \
+ vpslld XTMP3, XTMP1, (32-7); \
+ vpor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 */; \
+ vpsrld XTMP2, XTMP1,18; \
+ \
+ ONE_ROUND(0*4+XFERIN, a, b, c, d, e, f, g, h); \
+ \
+ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ vpsrld XTMP4, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */; \
+ vpslld XTMP1, XTMP1, (32-18); \
+ vpxor XTMP3, XTMP3, XTMP1; \
+ vpxor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 */; \
+ vpxor XTMP1, XTMP3, XTMP4 /* XTMP1 = s0 */; \
+ vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */; \
+ vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */; \
+ vpsrld XTMP4, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */; \
+ \
+ ONE_ROUND(1*4+XFERIN, h, a, b, c, d, e, f, g); \
+ \
+ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */; \
+ vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */; \
+ vpxor XTMP2, XTMP2, XTMP3; \
+ vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */; \
+ vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */; \
+ vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */; \
+ vpshufd XTMP2, XTMP0, 0b1010000 /* XTMP2 = W[-2] {DDCC} */; \
+ \
+ ONE_ROUND(2*4+XFERIN, g, h, a, b, c, d, e, f); \
+ \
+ /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ vpsrld XTMP5, XTMP2, 10 /* XTMP5 = W[-2] >> 10 {DDCC} */; \
+ vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */; \
+ vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */; \
+ vpxor XTMP2, XTMP2, XTMP3; \
+ vpxor XTMP5, XTMP5, XTMP2 /* XTMP5 = s1 {xDxC} */; \
+ vpshufb XTMP5, XTMP5, SHUF_DC00 /* XTMP5 = s1 {DC00} */; \
+ vpaddd X0, XTMP5, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */; \
+ vpaddd XFER, X0, [TBL + XFEROUT]; \
+ \
+ ONE_ROUND_PART1(3*4+XFERIN, f, g, h, a, b, c, d, e); \
+ vmovdqa [rsp + _XFER + XFEROUT], XFER; \
+ ONE_ROUND_PART2(f, g, h, a, b, c, d, e);
+
+#define DO_4ROUNDS(XFERIN, a, b, c, d, e, f, g, h) \
+ ONE_ROUND(0*4+XFERIN, a, b, c, d, e, f, g, h); \
+ ONE_ROUND(1*4+XFERIN, h, a, b, c, d, e, f, g); \
+ ONE_ROUND(2*4+XFERIN, g, h, a, b, c, d, e, f); \
+ ONE_ROUND(3*4+XFERIN, f, g, h, a, b, c, d, e)
+
+/*
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha256_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
+;; arg 1 : pointer to input data
+;; arg 2 : pointer to digest
+;; arg 3 : Num blocks
+*/
+.text
+.globl _gcry_sha256_transform_amd64_avx2
+ELF(.type _gcry_sha256_transform_amd64_avx2,@function)
+.align 32
+_gcry_sha256_transform_amd64_avx2:
+ CFI_STARTPROC()
+ xor eax, eax
+
+ cmp rdx, 0
+ je .Lnowork
+
+ push rbx
+ CFI_PUSH(rbx)
+ push rbp
+ CFI_PUSH(rbp)
+ push r12
+ CFI_PUSH(r12)
+ push r13
+ CFI_PUSH(r13)
+ push r14
+ CFI_PUSH(r14)
+ push r15
+ CFI_PUSH(r15)
+
+ vzeroupper
+
+ vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP]
+ vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP]
+ vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP]
+
+ mov rax, rsp
+ CFI_DEF_CFA_REGISTER(rax);
+ sub rsp, STACK_SIZE
+ and rsp, ~63
+ mov [rsp + _RSP], rax
+ CFI_CFA_ON_STACK(_RSP, 6 * 8)
+
+ shl NUM_BLKS, 6 /* convert to bytes */
+ lea NUM_BLKS, [NUM_BLKS + INP - 64] /* pointer to last block */
+ mov [rsp + _INP_END], NUM_BLKS
+
+ /* Check if only one block of input. Note: Loading initial digest
+ * only uses 'mov' instruction and does not change condition
+ * flags. */
+ cmp NUM_BLKS, INP
+
+ /* ; load initial digest */
+ mov a,[4*0 + CTX]
+ mov b,[4*1 + CTX]
+ mov c,[4*2 + CTX]
+ mov d,[4*3 + CTX]
+ mov e,[4*4 + CTX]
+ mov f,[4*5 + CTX]
+ mov g,[4*6 + CTX]
+ mov h,[4*7 + CTX]
+
+ mov [rsp + _CTX], CTX
+
+ je .Ldo_last_block
+
+.Loop0:
+ lea TBL, [.LK256 ADD_RIP]
+
+ /* ; Load first 16 dwords from two blocks */
+ VMOVDQ XTMP0, [INP + 0*32]
+ VMOVDQ XTMP1, [INP + 1*32]
+ VMOVDQ XTMP2, [INP + 2*32]
+ VMOVDQ XTMP3, [INP + 3*32]
+
+ /* ; byte swap data */
+ vpshufb XTMP0, XTMP0, BYTE_FLIP_MASK
+ vpshufb XTMP1, XTMP1, BYTE_FLIP_MASK
+ vpshufb XTMP2, XTMP2, BYTE_FLIP_MASK
+ vpshufb XTMP3, XTMP3, BYTE_FLIP_MASK
+
+ /* ; transpose data into high/low halves */
+ vperm2i128 X0, XTMP0, XTMP2, 0x20
+ vperm2i128 X1, XTMP0, XTMP2, 0x31
+ vperm2i128 X2, XTMP1, XTMP3, 0x20
+ vperm2i128 X3, XTMP1, XTMP3, 0x31
+
+.Last_block_enter:
+ add INP, 64
+ mov [rsp + _INP], INP
+
+ /* ; schedule 48 input dwords, by doing 3 rounds of 12 each */
+ xor SRND, SRND
+
+ vpaddd XFER, X0, [TBL + 0*32]
+ vmovdqa [rsp + _XFER + 0*32], XFER
+ vpaddd XFER, X1, [TBL + 1*32]
+ vmovdqa [rsp + _XFER + 1*32], XFER
+ vpaddd XFER, X2, [TBL + 2*32]
+ vmovdqa [rsp + _XFER + 2*32], XFER
+ vpaddd XFER, X3, [TBL + 3*32]
+ vmovdqa [rsp + _XFER + 3*32], XFER
+
+.align 16
+.Loop1:
+ FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 0*32, SRND + 4*32, X0, X1, X2, X3, a, b, c, d, e, f, g, h)
+ FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 1*32, SRND + 5*32, X1, X2, X3, X0, e, f, g, h, a, b, c, d)
+ FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 2*32, SRND + 6*32, X2, X3, X0, X1, a, b, c, d, e, f, g, h)
+ FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 3*32, SRND + 7*32, X3, X0, X1, X2, e, f, g, h, a, b, c, d)
+
+ add SRND, 4*32
+ cmp SRND, 3 * 4*32
+ jb .Loop1
+
+ /* ; Do last 16 rounds with no scheduling */
+ DO_4ROUNDS(rsp + _XFER + (3*4*32 + 0*32), a, b, c, d, e, f, g, h)
+ DO_4ROUNDS(rsp + _XFER + (3*4*32 + 1*32), e, f, g, h, a, b, c, d)
+ DO_4ROUNDS(rsp + _XFER + (3*4*32 + 2*32), a, b, c, d, e, f, g, h)
+ DO_4ROUNDS(rsp + _XFER + (3*4*32 + 3*32), e, f, g, h, a, b, c, d)
+
+ mov CTX, [rsp + _CTX]
+ mov INP, [rsp + _INP]
+
+ addm([4*0 + CTX],a)
+ addm([4*1 + CTX],b)
+ addm([4*2 + CTX],c)
+ addm([4*3 + CTX],d)
+ addm([4*4 + CTX],e)
+ addm([4*5 + CTX],f)
+ addm([4*6 + CTX],g)
+ addm([4*7 + CTX],h)
+
+ cmp INP, [rsp + _INP_END]
+ ja .Ldone_hash
+
+ /* ;;; Do second block using previously scheduled results */
+ xor SRND, SRND
+.align 16
+.Loop3:
+ DO_4ROUNDS(rsp + _XFER + SRND + 0*32 + 16, a, b, c, d, e, f, g, h)
+ DO_4ROUNDS(rsp + _XFER + SRND + 1*32 + 16, e, f, g, h, a, b, c, d)
+ add SRND, 2*32
+ cmp SRND, 4 * 4*32
+ jb .Loop3
+
+ mov CTX, [rsp + _CTX]
+ mov INP, [rsp + _INP]
+ add INP, 64
+
+ addm([4*0 + CTX],a)
+ addm([4*1 + CTX],b)
+ addm([4*2 + CTX],c)
+ addm([4*3 + CTX],d)
+ addm([4*4 + CTX],e)
+ addm([4*5 + CTX],f)
+ addm([4*6 + CTX],g)
+ addm([4*7 + CTX],h)
+
+ cmp INP, [rsp + _INP_END]
+ jb .Loop0
+ ja .Ldone_hash
+
+.Ldo_last_block:
+ /* ;;; do last block */
+ lea TBL, [.LK256 ADD_RIP]
+
+ VMOVDQ XWORD0, [INP + 0*16]
+ VMOVDQ XWORD1, [INP + 1*16]
+ VMOVDQ XWORD2, [INP + 2*16]
+ VMOVDQ XWORD3, [INP + 3*16]
+
+ vpshufb XWORD0, XWORD0, X_BYTE_FLIP_MASK
+ vpshufb XWORD1, XWORD1, X_BYTE_FLIP_MASK
+ vpshufb XWORD2, XWORD2, X_BYTE_FLIP_MASK
+ vpshufb XWORD3, XWORD3, X_BYTE_FLIP_MASK
+
+ jmp .Last_block_enter
+
+.Lonly_one_block:
+
+ /* ; load initial digest */
+ mov a,[4*0 + CTX]
+ mov b,[4*1 + CTX]
+ mov c,[4*2 + CTX]
+ mov d,[4*3 + CTX]
+ mov e,[4*4 + CTX]
+ mov f,[4*5 + CTX]
+ mov g,[4*6 + CTX]
+ mov h,[4*7 + CTX]
+
+ vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP]
+ vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP]
+ vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP]
+
+ mov [rsp + _CTX], CTX
+ jmp .Ldo_last_block
+
+.Ldone_hash:
+ vzeroall
+
+ /* burn stack */
+ vmovdqa [rsp + _XFER + 0 * 32], ymm0
+ vmovdqa [rsp + _XFER + 1 * 32], ymm0
+ vmovdqa [rsp + _XFER + 2 * 32], ymm0
+ vmovdqa [rsp + _XFER + 3 * 32], ymm0
+ vmovdqa [rsp + _XFER + 4 * 32], ymm0
+ vmovdqa [rsp + _XFER + 5 * 32], ymm0
+ vmovdqa [rsp + _XFER + 6 * 32], ymm0
+ vmovdqa [rsp + _XFER + 7 * 32], ymm0
+ vmovdqa [rsp + _XFER + 8 * 32], ymm0
+ vmovdqa [rsp + _XFER + 9 * 32], ymm0
+ vmovdqa [rsp + _XFER + 10 * 32], ymm0
+ vmovdqa [rsp + _XFER + 11 * 32], ymm0
+ vmovdqa [rsp + _XFER + 12 * 32], ymm0
+ vmovdqa [rsp + _XFER + 13 * 32], ymm0
+ vmovdqa [rsp + _XFER + 14 * 32], ymm0
+ vmovdqa [rsp + _XFER + 15 * 32], ymm0
+ xor eax, eax
+
+ mov rsp, [rsp + _RSP]
+ CFI_DEF_CFA_REGISTER(rsp)
+
+ pop r15
+ CFI_POP(r15)
+ pop r14
+ CFI_POP(r14)
+ pop r13
+ CFI_POP(r13)
+ pop r12
+ CFI_POP(r12)
+ pop rbp
+ CFI_POP(rbp)
+ pop rbx
+ CFI_POP(rbx)
+
+.Lnowork:
+ ret
+ CFI_ENDPROC()
+
+.align 64
+.LK256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.LPSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
+
+/* shuffle xBxA -> 00BA */
+.L_SHUF_00BA:
+ .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+/* shuffle xDxC -> DC00 */
+.L_SHUF_DC00:
+ .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
+
+#endif
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/sha256-intel-shaext.c b/comm/third_party/libgcrypt/cipher/sha256-intel-shaext.c
new file mode 100644
index 0000000000..48c09eefe1
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha256-intel-shaext.c
@@ -0,0 +1,363 @@
+/* sha256-intel-shaext.c - SHAEXT accelerated SHA-256 transform function
+ * Copyright (C) 2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#include "types.h"
+
+#if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \
+ defined(HAVE_GCC_INLINE_ASM_SSE41) && defined(USE_SHA256) && \
+ defined(ENABLE_SHAEXT_SUPPORT)
+
+#if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
+/* Prevent compiler from issuing SSE instructions between asm blocks. */
+# pragma GCC target("no-sse")
+#endif
+#if __clang__
+# pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
+#endif
+
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
+
+/* Two macros to be called prior to and after the use of SHA-EXT
+   instructions.  There should be no external function calls between
+   the use of these macros.  Their purpose is to make sure that the
+   SSE registers are cleared and won't reveal any information about
+   the key or the data.  */
+#ifdef __WIN64__
+/* XMM6-XMM15 are callee-saved registers on WIN64. */
+# define shaext_prepare_variable char win64tmp[2*16]
+# define shaext_prepare_variable_size sizeof(win64tmp)
+# define shaext_prepare() \
+ do { asm volatile ("movdqu %%xmm6, (%0)\n" \
+ "movdqu %%xmm7, (%1)\n" \
+ : \
+ : "r" (&win64tmp[0]), "r" (&win64tmp[16]) \
+ : "memory"); \
+ } while (0)
+# define shaext_cleanup(tmp0,tmp1) \
+ do { asm volatile ("movdqu (%0), %%xmm6\n" \
+ "movdqu (%1), %%xmm7\n" \
+ "pxor %%xmm0, %%xmm0\n" \
+ "pxor %%xmm1, %%xmm1\n" \
+ "pxor %%xmm2, %%xmm2\n" \
+ "pxor %%xmm3, %%xmm3\n" \
+ "pxor %%xmm4, %%xmm4\n" \
+ "pxor %%xmm5, %%xmm5\n" \
+ "movdqa %%xmm0, (%2)\n\t" \
+ "movdqa %%xmm0, (%3)\n\t" \
+ : \
+ : "r" (&win64tmp[0]), "r" (&win64tmp[16]), \
+ "r" (tmp0), "r" (tmp1) \
+ : "memory"); \
+ } while (0)
+#else
+# define shaext_prepare_variable
+# define shaext_prepare_variable_size 0
+# define shaext_prepare() do { } while (0)
+# define shaext_cleanup(tmp0,tmp1) \
+ do { asm volatile ("pxor %%xmm0, %%xmm0\n" \
+ "pxor %%xmm1, %%xmm1\n" \
+ "pxor %%xmm2, %%xmm2\n" \
+ "pxor %%xmm3, %%xmm3\n" \
+ "pxor %%xmm4, %%xmm4\n" \
+ "pxor %%xmm5, %%xmm5\n" \
+ "pxor %%xmm6, %%xmm6\n" \
+ "pxor %%xmm7, %%xmm7\n" \
+ "movdqa %%xmm0, (%0)\n\t" \
+ "movdqa %%xmm0, (%1)\n\t" \
+ : \
+ : "r" (tmp0), "r" (tmp1) \
+ : "memory"); \
+ } while (0)
+#endif
+
+typedef struct u128_s
+{
+ u32 a, b, c, d;
+} u128_t;
+
+/*
+ * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
+ */
+unsigned int ASM_FUNC_ATTR
+_gcry_sha256_transform_intel_shaext(u32 state[8], const unsigned char *data,
+ size_t nblks)
+{
+ static const unsigned char bshuf_mask[16] __attribute__ ((aligned (16))) =
+ { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+ static const u128_t K[16] __attribute__ ((aligned (16))) =
+ {
+ { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 },
+ { 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 },
+ { 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 },
+ { 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 },
+ { 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc },
+ { 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da },
+ { 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 },
+ { 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 },
+ { 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 },
+ { 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 },
+ { 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 },
+ { 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 },
+ { 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 },
+ { 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 },
+ { 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 },
+ { 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 }
+ };
+ char save_buf[2 * 16 + 15];
+ char *abef_save;
+ char *cdgh_save;
+ shaext_prepare_variable;
+
+ if (nblks == 0)
+ return 0;
+
+ shaext_prepare ();
+
+ asm volatile ("" : "=r" (abef_save) : "0" (save_buf) : "memory");
+ abef_save = abef_save + (-(uintptr_t)abef_save & 15);
+ cdgh_save = abef_save + 16;
+
+ /* byteswap mask => XMM7 */
+ asm volatile ("movdqa %[mask], %%xmm7\n\t" /* Preload mask */
+ :
+ : [mask] "m" (*bshuf_mask)
+ : "memory");
+
+ /* Load state.. ABEF_SAVE => STATE0 XMM1, CDGH_STATE => STATE1 XMM2 */
+ asm volatile ("movups 16(%[state]), %%xmm1\n\t" /* HGFE (xmm=EFGH) */
+ "movups 0(%[state]), %%xmm0\n\t" /* DCBA (xmm=ABCD) */
+ "movaps %%xmm1, %%xmm2\n\t"
+ "shufps $0x11, %%xmm0, %%xmm1\n\t" /* ABEF (xmm=FEBA) */
+ "shufps $0xbb, %%xmm0, %%xmm2\n\t" /* CDGH (xmm=HGDC) */
+ :
+ : [state] "r" (state)
+ : "memory" );
+
+ /* Load message */
+ asm volatile ("movdqu 0*16(%[data]), %%xmm3\n\t"
+ "movdqu 1*16(%[data]), %%xmm4\n\t"
+ "movdqu 2*16(%[data]), %%xmm5\n\t"
+ "movdqu 3*16(%[data]), %%xmm6\n\t"
+ "pshufb %%xmm7, %%xmm3\n\t"
+ "pshufb %%xmm7, %%xmm4\n\t"
+ "pshufb %%xmm7, %%xmm5\n\t"
+ "pshufb %%xmm7, %%xmm6\n\t"
+ :
+ : [data] "r" (data)
+ : "memory" );
+ data += 64;
+
+ do
+ {
+ /* Save state */
+ asm volatile ("movdqa %%xmm1, (%[abef_save])\n\t"
+ "movdqa %%xmm2, (%[cdgh_save])\n\t"
+ :
+ : [abef_save] "r" (abef_save), [cdgh_save] "r" (cdgh_save)
+ : "memory" );
+
+ /* Round 0..3 */
+ asm volatile ("movdqa %%xmm3, %%xmm0\n\t"
+ "paddd %[constants], %%xmm0\n\t"
+ "sha256rnds2 %%xmm1, %%xmm2\n\t"
+ "psrldq $8, %%xmm0\n\t"
+ "sha256rnds2 %%xmm2, %%xmm1\n\t"
+ :
+ : [constants] "m" (K[0].a)
+ : "memory" );
+
+ /* Round 4..7 */
+ asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %[constants], %%xmm0\n\t"
+ "sha256rnds2 %%xmm1, %%xmm2\n\t"
+ "psrldq $8, %%xmm0\n\t"
+ "sha256rnds2 %%xmm2, %%xmm1\n\t"
+ "sha256msg1 %%xmm4, %%xmm3\n\t"
+ :
+ : [constants] "m" (K[1].a)
+ : "memory" );
+
+ /* Round 8..11 */
+ asm volatile ("movdqa %%xmm5, %%xmm0\n\t"
+ "paddd %[constants], %%xmm0\n\t"
+ "sha256rnds2 %%xmm1, %%xmm2\n\t"
+ "psrldq $8, %%xmm0\n\t"
+ "sha256rnds2 %%xmm2, %%xmm1\n\t"
+ "sha256msg1 %%xmm5, %%xmm4\n\t"
+ :
+ : [constants] "m" (K[2].a)
+ : "memory" );
+
+#define ROUND(k, MSG0, MSG1, MSG2, MSG3) \
+ asm volatile ("movdqa %%"MSG0", %%xmm0\n\t" \
+ "paddd %[constants], %%xmm0\n\t" \
+ "sha256rnds2 %%xmm1, %%xmm2\n\t" \
+ "movdqa %%"MSG0", %%xmm7\n\t" \
+ "palignr $4, %%"MSG3", %%xmm7\n\t" \
+ "paddd %%xmm7, %%"MSG1"\n\t" \
+ "sha256msg2 %%"MSG0", %%"MSG1"\n\t" \
+ "psrldq $8, %%xmm0\n\t" \
+ "sha256rnds2 %%xmm2, %%xmm1\n\t" \
+ "sha256msg1 %%"MSG0", %%"MSG3"\n\t" \
+ : \
+ : [constants] "m" (K[k].a) \
+ : "memory" )
+
+ /* Rounds 12..15 to 48..51 */
+ ROUND(3, "xmm6", "xmm3", "xmm4", "xmm5");
+ ROUND(4, "xmm3", "xmm4", "xmm5", "xmm6");
+ ROUND(5, "xmm4", "xmm5", "xmm6", "xmm3");
+ ROUND(6, "xmm5", "xmm6", "xmm3", "xmm4");
+ ROUND(7, "xmm6", "xmm3", "xmm4", "xmm5");
+ ROUND(8, "xmm3", "xmm4", "xmm5", "xmm6");
+ ROUND(9, "xmm4", "xmm5", "xmm6", "xmm3");
+ ROUND(10, "xmm5", "xmm6", "xmm3", "xmm4");
+ ROUND(11, "xmm6", "xmm3", "xmm4", "xmm5");
+ ROUND(12, "xmm3", "xmm4", "xmm5", "xmm6");
+
+ if (--nblks == 0)
+ break;
+
+ /* Round 52..55 */
+ asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %[constants], %%xmm0\n\t"
+ "sha256rnds2 %%xmm1, %%xmm2\n\t"
+ "movdqa %%xmm4, %%xmm7\n\t"
+ "palignr $4, %%xmm3, %%xmm7\n\t"
+ "movdqu 0*16(%[data]), %%xmm3\n\t"
+ "paddd %%xmm7, %%xmm5\n\t"
+ "sha256msg2 %%xmm4, %%xmm5\n\t"
+ "psrldq $8, %%xmm0\n\t"
+ "sha256rnds2 %%xmm2, %%xmm1\n\t"
+ :
+ : [constants] "m" (K[13].a), [data] "r" (data)
+ : "memory" );
+
+ /* Round 56..59 */
+ asm volatile ("movdqa %%xmm5, %%xmm0\n\t"
+ "paddd %[constants], %%xmm0\n\t"
+ "sha256rnds2 %%xmm1, %%xmm2\n\t"
+ "movdqa %%xmm5, %%xmm7\n\t"
+ "palignr $4, %%xmm4, %%xmm7\n\t"
+ "movdqu 1*16(%[data]), %%xmm4\n\t"
+ "paddd %%xmm7, %%xmm6\n\t"
+ "movdqa %[mask], %%xmm7\n\t" /* Reload mask */
+ "sha256msg2 %%xmm5, %%xmm6\n\t"
+ "movdqu 2*16(%[data]), %%xmm5\n\t"
+ "psrldq $8, %%xmm0\n\t"
+ "sha256rnds2 %%xmm2, %%xmm1\n\t"
+ :
+ : [constants] "m" (K[14].a), [mask] "m" (*bshuf_mask),
+ [data] "r" (data)
+ : "memory" );
+
+ /* Round 60..63 */
+ asm volatile ("movdqa %%xmm6, %%xmm0\n\t"
+ "pshufb %%xmm7, %%xmm3\n\t"
+ "movdqu 3*16(%[data]), %%xmm6\n\t"
+ "paddd %[constants], %%xmm0\n\t"
+ "pshufb %%xmm7, %%xmm4\n\t"
+ "sha256rnds2 %%xmm1, %%xmm2\n\t"
+ "psrldq $8, %%xmm0\n\t"
+ "pshufb %%xmm7, %%xmm5\n\t"
+ "sha256rnds2 %%xmm2, %%xmm1\n\t"
+ :
+ : [constants] "m" (K[15].a), [data] "r" (data)
+ : "memory" );
+ data += 64;
+
+ /* Merge states */
+ asm volatile ("paddd (%[abef_save]), %%xmm1\n\t"
+ "paddd (%[cdgh_save]), %%xmm2\n\t"
+ "pshufb %%xmm7, %%xmm6\n\t"
+ :
+ : [abef_save] "r" (abef_save), [cdgh_save] "r" (cdgh_save)
+ : "memory" );
+ }
+ while (1);
+
+ /* Round 52..55 */
+ asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %[constants], %%xmm0\n\t"
+ "sha256rnds2 %%xmm1, %%xmm2\n\t"
+ "movdqa %%xmm4, %%xmm7\n\t"
+ "palignr $4, %%xmm3, %%xmm7\n\t"
+ "paddd %%xmm7, %%xmm5\n\t"
+ "sha256msg2 %%xmm4, %%xmm5\n\t"
+ "psrldq $8, %%xmm0\n\t"
+ "sha256rnds2 %%xmm2, %%xmm1\n\t"
+ :
+ : [constants] "m" (K[13].a)
+ : "memory" );
+
+ /* Round 56..59 */
+ asm volatile ("movdqa %%xmm5, %%xmm0\n\t"
+ "paddd %[constants], %%xmm0\n\t"
+ "sha256rnds2 %%xmm1, %%xmm2\n\t"
+ "movdqa %%xmm5, %%xmm7\n\t"
+ "palignr $4, %%xmm4, %%xmm7\n\t"
+ "paddd %%xmm7, %%xmm6\n\t"
+ "movdqa %[mask], %%xmm7\n\t" /* Reload mask */
+ "sha256msg2 %%xmm5, %%xmm6\n\t"
+ "psrldq $8, %%xmm0\n\t"
+ "sha256rnds2 %%xmm2, %%xmm1\n\t"
+ :
+ : [constants] "m" (K[14].a), [mask] "m" (*bshuf_mask)
+ : "memory" );
+
+ /* Round 60..63 */
+ asm volatile ("movdqa %%xmm6, %%xmm0\n\t"
+ "paddd %[constants], %%xmm0\n\t"
+ "sha256rnds2 %%xmm1, %%xmm2\n\t"
+ "psrldq $8, %%xmm0\n\t"
+ "sha256rnds2 %%xmm2, %%xmm1\n\t"
+ :
+ : [constants] "m" (K[15].a)
+ : "memory" );
+
+ /* Merge states */
+ asm volatile ("paddd (%[abef_save]), %%xmm1\n\t"
+ "paddd (%[cdgh_save]), %%xmm2\n\t"
+ :
+ : [abef_save] "r" (abef_save), [cdgh_save] "r" (cdgh_save)
+ : "memory" );
+
+ /* Save state (XMM1=FEBA, XMM2=HGDC) */
+ asm volatile ("movaps %%xmm1, %%xmm0\n\t"
+ "shufps $0x11, %%xmm2, %%xmm1\n\t" /* xmm=ABCD */
+ "shufps $0xbb, %%xmm2, %%xmm0\n\t" /* xmm=EFGH */
+ "movups %%xmm1, 16(%[state])\n\t"
+ "movups %%xmm0, 0(%[state])\n\t"
+ :
+ : [state] "r" (state)
+ : "memory" );
+
+ shaext_cleanup (abef_save, cdgh_save);
+ return 0;
+}
+
+#if __clang__
+# pragma clang attribute pop
+#endif
+
+#endif /* HAVE_GCC_INLINE_ASM_SHA_EXT */
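For reference, a scalar sketch of the ABEF/CDGH packing that the shufps sequences above produce and undo; the pack_abef_cdgh helper and lane128 type are illustrative only and not part of this file:

#include <stdint.h>

/* Illustrative sketch: pack the eight SHA-256 state words (A..H in
 * state[0..7]) into the two lanes consumed by sha256rnds2.  Element 0
 * is the lowest dword of the XMM register, so the lanes read FEBA and
 * HGDC from low to high, i.e. ABEF and CDGH from high to low. */
typedef struct { uint32_t w[4]; } lane128;

static void
pack_abef_cdgh (const uint32_t state[8], lane128 *abef, lane128 *cdgh)
{
  abef->w[0] = state[5];   /* F */
  abef->w[1] = state[4];   /* E */
  abef->w[2] = state[1];   /* B */
  abef->w[3] = state[0];   /* A */
  cdgh->w[0] = state[7];   /* H */
  cdgh->w[1] = state[6];   /* G */
  cdgh->w[2] = state[3];   /* D */
  cdgh->w[3] = state[2];   /* C */
}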
diff --git a/comm/third_party/libgcrypt/cipher/sha256-ppc.c b/comm/third_party/libgcrypt/cipher/sha256-ppc.c
new file mode 100644
index 0000000000..a9b59714d2
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha256-ppc.c
@@ -0,0 +1,795 @@
+/* sha256-ppc.c - PowerPC vcrypto implementation of SHA-256 transform
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(ENABLE_PPC_CRYPTO_SUPPORT) && \
+ defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+ defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \
+ defined(USE_SHA256) && \
+ __GNUC__ >= 4
+
+#include <altivec.h>
+#include "bufhelp.h"
+
+
+typedef vector unsigned char vector16x_u8;
+typedef vector unsigned int vector4x_u32;
+typedef vector unsigned long long vector2x_u64;
+
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE
+#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
+
+
+static const u32 K[64] =
+ {
+#define TBL(v) v
+ TBL(0x428a2f98), TBL(0x71374491), TBL(0xb5c0fbcf), TBL(0xe9b5dba5),
+ TBL(0x3956c25b), TBL(0x59f111f1), TBL(0x923f82a4), TBL(0xab1c5ed5),
+ TBL(0xd807aa98), TBL(0x12835b01), TBL(0x243185be), TBL(0x550c7dc3),
+ TBL(0x72be5d74), TBL(0x80deb1fe), TBL(0x9bdc06a7), TBL(0xc19bf174),
+ TBL(0xe49b69c1), TBL(0xefbe4786), TBL(0x0fc19dc6), TBL(0x240ca1cc),
+ TBL(0x2de92c6f), TBL(0x4a7484aa), TBL(0x5cb0a9dc), TBL(0x76f988da),
+ TBL(0x983e5152), TBL(0xa831c66d), TBL(0xb00327c8), TBL(0xbf597fc7),
+ TBL(0xc6e00bf3), TBL(0xd5a79147), TBL(0x06ca6351), TBL(0x14292967),
+ TBL(0x27b70a85), TBL(0x2e1b2138), TBL(0x4d2c6dfc), TBL(0x53380d13),
+ TBL(0x650a7354), TBL(0x766a0abb), TBL(0x81c2c92e), TBL(0x92722c85),
+ TBL(0xa2bfe8a1), TBL(0xa81a664b), TBL(0xc24b8b70), TBL(0xc76c51a3),
+ TBL(0xd192e819), TBL(0xd6990624), TBL(0xf40e3585), TBL(0x106aa070),
+ TBL(0x19a4c116), TBL(0x1e376c08), TBL(0x2748774c), TBL(0x34b0bcb5),
+ TBL(0x391c0cb3), TBL(0x4ed8aa4a), TBL(0x5b9cca4f), TBL(0x682e6ff3),
+ TBL(0x748f82ee), TBL(0x78a5636f), TBL(0x84c87814), TBL(0x8cc70208),
+ TBL(0x90befffa), TBL(0xa4506ceb), TBL(0xbef9a3f7), TBL(0xc67178f2)
+#undef TBL
+ };
+
+
+static ASM_FUNC_ATTR_INLINE vector4x_u32
+vec_rol_elems(vector4x_u32 v, unsigned int idx)
+{
+#ifndef WORDS_BIGENDIAN
+ return vec_sld (v, v, (16 - (4 * idx)) & 15);
+#else
+ return vec_sld (v, v, (4 * idx) & 15);
+#endif
+}
+
+
+static ASM_FUNC_ATTR_INLINE vector4x_u32
+vec_merge_idx0_elems(vector4x_u32 v0, vector4x_u32 v1,
+ vector4x_u32 v2, vector4x_u32 v3)
+{
+ return (vector4x_u32)vec_mergeh ((vector2x_u64) vec_mergeh(v0, v1),
+ (vector2x_u64) vec_mergeh(v2, v3));
+}
+
+
+static ASM_FUNC_ATTR_INLINE vector4x_u32
+vec_ror_u32(vector4x_u32 v, unsigned int shift)
+{
+ return (v >> (shift & 31)) ^ (v << ((32 - shift) & 31));
+}
+
+
+static ASM_FUNC_ATTR_INLINE vector4x_u32
+vec_vshasigma_u32(vector4x_u32 v, unsigned int a, unsigned int b)
+{
+ asm ("vshasigmaw %0,%1,%2,%3"
+ : "=v" (v)
+ : "v" (v), "g" (a), "g" (b)
+ : "memory");
+ return v;
+}
+
+
+/* SHA2 round in vector registers */
+#define R(a,b,c,d,e,f,g,h,k,w) do \
+ { \
+ t1 = (h); \
+ t1 += ((k) + (w)); \
+ t1 += Cho((e),(f),(g)); \
+ t1 += Sum1((e)); \
+ t2 = Sum0((a)); \
+ t2 += Maj((a),(b),(c)); \
+ d += t1; \
+ h = t1 + t2; \
+ } while (0)
+
+#define Cho(b, c, d) (vec_sel(d, c, b))
+
+#define Maj(c, d, b) (vec_sel(c, b, c ^ d))
+
+#define Sum0(x) (vec_vshasigma_u32(x, 1, 0))
+
+#define Sum1(x) (vec_vshasigma_u32(x, 1, 15))
+
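The Cho and Maj macros above are expressed through vec_sel so that each maps to a single vector instruction. As a plain-C cross-check (illustrative only; it assumes vec_sel(a, b, m) returns bits of b where m is set and bits of a elsewhere), both reduce to the usual FIPS 180-4 word functions:

#include <stdint.h>

/* Cho((e),(f),(g)) == vec_sel(g, f, e): take f where e is 1, else g. */
static inline uint32_t
cho_u32 (uint32_t e, uint32_t f, uint32_t g)
{
  return (e & f) | (~e & g);
}

/* Maj((a),(b),(c)) picks, per bit, the value that occurs at least twice;
 * vec_sel(a, c, a ^ b) gives the same result: where a and b agree the
 * answer is a, where they differ the answer is c. */
static inline uint32_t
maj_u32 (uint32_t a, uint32_t b, uint32_t c)
{
  return (a & b) ^ (a & c) ^ (b & c);
}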
+
+/* Message expansion on general purpose registers */
+#define S0(x) (ror ((x), 7) ^ ror ((x), 18) ^ ((x) >> 3))
+#define S1(x) (ror ((x), 17) ^ ror ((x), 19) ^ ((x) >> 10))
+
+#define I(i) ( w[i] = buf_get_be32(data + i * 4) )
+#define W(i) ({ w[i&0x0f] += w[(i-7) &0x0f]; \
+ w[i&0x0f] += S0(w[(i-15)&0x0f]); \
+ w[i&0x0f] += S1(w[(i-2) &0x0f]); \
+ w[i&0x0f]; })
+
+#define I2(i) ( w2[i] = buf_get_be32(64 + data + i * 4), I(i) )
+#define W2(i) ({ w2[i] = w2[i-7]; \
+ w2[i] += S1(w2[i-2]); \
+ w2[i] += S0(w2[i-15]); \
+ w2[i] += w2[i-16]; \
+ W(i); })
+#define R2(i) ( w2[i] )
+
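The I/W/I2/W2 macros above implement the standard message schedule, once with a 16-word rolling window (w[]) and once fully expanded (w2[]) so the second block of an interleaved pair can be replayed with R2. A minimal stand-alone sketch of the same recurrence, written with a full 64-word array for clarity (illustrative only):

#include <stdint.h>

static inline uint32_t
ror32 (uint32_t x, unsigned int n)
{
  return (x >> n) | (x << (32 - n));
}

/* w[0..15] must already hold the big-endian-loaded message words. */
static void
sha256_expand_schedule (uint32_t w[64])
{
  int t;

  for (t = 16; t < 64; t++)
    {
      uint32_t s0 = ror32 (w[t-15], 7) ^ ror32 (w[t-15], 18) ^ (w[t-15] >> 3);
      uint32_t s1 = ror32 (w[t-2], 17) ^ ror32 (w[t-2], 19) ^ (w[t-2] >> 10);

      w[t] = w[t-16] + s0 + w[t-7] + s1;
    }
}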
+
+unsigned int ASM_FUNC_ATTR
+_gcry_sha256_transform_ppc8(u32 state[8], const unsigned char *data,
+ size_t nblks)
+{
+  /* GPRs are used for message expansion because the vector-intrinsics-based
+   * variant generates slower code. */
+ vector4x_u32 h0, h1, h2, h3, h4, h5, h6, h7;
+ vector4x_u32 h0_h3, h4_h7;
+ vector4x_u32 a, b, c, d, e, f, g, h, t1, t2;
+ u32 w[16];
+ u32 w2[64];
+
+ h0_h3 = vec_vsx_ld (4 * 0, state);
+ h4_h7 = vec_vsx_ld (4 * 4, state);
+
+ h0 = h0_h3;
+ h1 = vec_rol_elems (h0_h3, 1);
+ h2 = vec_rol_elems (h0_h3, 2);
+ h3 = vec_rol_elems (h0_h3, 3);
+ h4 = h4_h7;
+ h5 = vec_rol_elems (h4_h7, 1);
+ h6 = vec_rol_elems (h4_h7, 2);
+ h7 = vec_rol_elems (h4_h7, 3);
+
+ while (nblks >= 2)
+ {
+ a = h0;
+ b = h1;
+ c = h2;
+ d = h3;
+ e = h4;
+ f = h5;
+ g = h6;
+ h = h7;
+
+ R(a, b, c, d, e, f, g, h, K[0], I2(0));
+ R(h, a, b, c, d, e, f, g, K[1], I2(1));
+ R(g, h, a, b, c, d, e, f, K[2], I2(2));
+ R(f, g, h, a, b, c, d, e, K[3], I2(3));
+ R(e, f, g, h, a, b, c, d, K[4], I2(4));
+ R(d, e, f, g, h, a, b, c, K[5], I2(5));
+ R(c, d, e, f, g, h, a, b, K[6], I2(6));
+ R(b, c, d, e, f, g, h, a, K[7], I2(7));
+ R(a, b, c, d, e, f, g, h, K[8], I2(8));
+ R(h, a, b, c, d, e, f, g, K[9], I2(9));
+ R(g, h, a, b, c, d, e, f, K[10], I2(10));
+ R(f, g, h, a, b, c, d, e, K[11], I2(11));
+ R(e, f, g, h, a, b, c, d, K[12], I2(12));
+ R(d, e, f, g, h, a, b, c, K[13], I2(13));
+ R(c, d, e, f, g, h, a, b, K[14], I2(14));
+ R(b, c, d, e, f, g, h, a, K[15], I2(15));
+ data += 64 * 2;
+
+ R(a, b, c, d, e, f, g, h, K[16], W2(16));
+ R(h, a, b, c, d, e, f, g, K[17], W2(17));
+ R(g, h, a, b, c, d, e, f, K[18], W2(18));
+ R(f, g, h, a, b, c, d, e, K[19], W2(19));
+ R(e, f, g, h, a, b, c, d, K[20], W2(20));
+ R(d, e, f, g, h, a, b, c, K[21], W2(21));
+ R(c, d, e, f, g, h, a, b, K[22], W2(22));
+ R(b, c, d, e, f, g, h, a, K[23], W2(23));
+ R(a, b, c, d, e, f, g, h, K[24], W2(24));
+ R(h, a, b, c, d, e, f, g, K[25], W2(25));
+ R(g, h, a, b, c, d, e, f, K[26], W2(26));
+ R(f, g, h, a, b, c, d, e, K[27], W2(27));
+ R(e, f, g, h, a, b, c, d, K[28], W2(28));
+ R(d, e, f, g, h, a, b, c, K[29], W2(29));
+ R(c, d, e, f, g, h, a, b, K[30], W2(30));
+ R(b, c, d, e, f, g, h, a, K[31], W2(31));
+
+ R(a, b, c, d, e, f, g, h, K[32], W2(32));
+ R(h, a, b, c, d, e, f, g, K[33], W2(33));
+ R(g, h, a, b, c, d, e, f, K[34], W2(34));
+ R(f, g, h, a, b, c, d, e, K[35], W2(35));
+ R(e, f, g, h, a, b, c, d, K[36], W2(36));
+ R(d, e, f, g, h, a, b, c, K[37], W2(37));
+ R(c, d, e, f, g, h, a, b, K[38], W2(38));
+ R(b, c, d, e, f, g, h, a, K[39], W2(39));
+ R(a, b, c, d, e, f, g, h, K[40], W2(40));
+ R(h, a, b, c, d, e, f, g, K[41], W2(41));
+ R(g, h, a, b, c, d, e, f, K[42], W2(42));
+ R(f, g, h, a, b, c, d, e, K[43], W2(43));
+ R(e, f, g, h, a, b, c, d, K[44], W2(44));
+ R(d, e, f, g, h, a, b, c, K[45], W2(45));
+ R(c, d, e, f, g, h, a, b, K[46], W2(46));
+ R(b, c, d, e, f, g, h, a, K[47], W2(47));
+
+ R(a, b, c, d, e, f, g, h, K[48], W2(48));
+ R(h, a, b, c, d, e, f, g, K[49], W2(49));
+ R(g, h, a, b, c, d, e, f, K[50], W2(50));
+ R(f, g, h, a, b, c, d, e, K[51], W2(51));
+ R(e, f, g, h, a, b, c, d, K[52], W2(52));
+ R(d, e, f, g, h, a, b, c, K[53], W2(53));
+ R(c, d, e, f, g, h, a, b, K[54], W2(54));
+ R(b, c, d, e, f, g, h, a, K[55], W2(55));
+ R(a, b, c, d, e, f, g, h, K[56], W2(56));
+ R(h, a, b, c, d, e, f, g, K[57], W2(57));
+ R(g, h, a, b, c, d, e, f, K[58], W2(58));
+ R(f, g, h, a, b, c, d, e, K[59], W2(59));
+ R(e, f, g, h, a, b, c, d, K[60], W2(60));
+ R(d, e, f, g, h, a, b, c, K[61], W2(61));
+ R(c, d, e, f, g, h, a, b, K[62], W2(62));
+ R(b, c, d, e, f, g, h, a, K[63], W2(63));
+
+ h0 += a;
+ h1 += b;
+ h2 += c;
+ h3 += d;
+ h4 += e;
+ h5 += f;
+ h6 += g;
+ h7 += h;
+
+ a = h0;
+ b = h1;
+ c = h2;
+ d = h3;
+ e = h4;
+ f = h5;
+ g = h6;
+ h = h7;
+
+ R(a, b, c, d, e, f, g, h, K[0], R2(0));
+ R(h, a, b, c, d, e, f, g, K[1], R2(1));
+ R(g, h, a, b, c, d, e, f, K[2], R2(2));
+ R(f, g, h, a, b, c, d, e, K[3], R2(3));
+ R(e, f, g, h, a, b, c, d, K[4], R2(4));
+ R(d, e, f, g, h, a, b, c, K[5], R2(5));
+ R(c, d, e, f, g, h, a, b, K[6], R2(6));
+ R(b, c, d, e, f, g, h, a, K[7], R2(7));
+ R(a, b, c, d, e, f, g, h, K[8], R2(8));
+ R(h, a, b, c, d, e, f, g, K[9], R2(9));
+ R(g, h, a, b, c, d, e, f, K[10], R2(10));
+ R(f, g, h, a, b, c, d, e, K[11], R2(11));
+ R(e, f, g, h, a, b, c, d, K[12], R2(12));
+ R(d, e, f, g, h, a, b, c, K[13], R2(13));
+ R(c, d, e, f, g, h, a, b, K[14], R2(14));
+ R(b, c, d, e, f, g, h, a, K[15], R2(15));
+
+ R(a, b, c, d, e, f, g, h, K[16], R2(16));
+ R(h, a, b, c, d, e, f, g, K[17], R2(17));
+ R(g, h, a, b, c, d, e, f, K[18], R2(18));
+ R(f, g, h, a, b, c, d, e, K[19], R2(19));
+ R(e, f, g, h, a, b, c, d, K[20], R2(20));
+ R(d, e, f, g, h, a, b, c, K[21], R2(21));
+ R(c, d, e, f, g, h, a, b, K[22], R2(22));
+ R(b, c, d, e, f, g, h, a, K[23], R2(23));
+ R(a, b, c, d, e, f, g, h, K[24], R2(24));
+ R(h, a, b, c, d, e, f, g, K[25], R2(25));
+ R(g, h, a, b, c, d, e, f, K[26], R2(26));
+ R(f, g, h, a, b, c, d, e, K[27], R2(27));
+ R(e, f, g, h, a, b, c, d, K[28], R2(28));
+ R(d, e, f, g, h, a, b, c, K[29], R2(29));
+ R(c, d, e, f, g, h, a, b, K[30], R2(30));
+ R(b, c, d, e, f, g, h, a, K[31], R2(31));
+
+ R(a, b, c, d, e, f, g, h, K[32], R2(32));
+ R(h, a, b, c, d, e, f, g, K[33], R2(33));
+ R(g, h, a, b, c, d, e, f, K[34], R2(34));
+ R(f, g, h, a, b, c, d, e, K[35], R2(35));
+ R(e, f, g, h, a, b, c, d, K[36], R2(36));
+ R(d, e, f, g, h, a, b, c, K[37], R2(37));
+ R(c, d, e, f, g, h, a, b, K[38], R2(38));
+ R(b, c, d, e, f, g, h, a, K[39], R2(39));
+ R(a, b, c, d, e, f, g, h, K[40], R2(40));
+ R(h, a, b, c, d, e, f, g, K[41], R2(41));
+ R(g, h, a, b, c, d, e, f, K[42], R2(42));
+ R(f, g, h, a, b, c, d, e, K[43], R2(43));
+ R(e, f, g, h, a, b, c, d, K[44], R2(44));
+ R(d, e, f, g, h, a, b, c, K[45], R2(45));
+ R(c, d, e, f, g, h, a, b, K[46], R2(46));
+ R(b, c, d, e, f, g, h, a, K[47], R2(47));
+
+ R(a, b, c, d, e, f, g, h, K[48], R2(48));
+ R(h, a, b, c, d, e, f, g, K[49], R2(49));
+ R(g, h, a, b, c, d, e, f, K[50], R2(50));
+ R(f, g, h, a, b, c, d, e, K[51], R2(51));
+ R(e, f, g, h, a, b, c, d, K[52], R2(52));
+ R(d, e, f, g, h, a, b, c, K[53], R2(53));
+ R(c, d, e, f, g, h, a, b, K[54], R2(54));
+ R(b, c, d, e, f, g, h, a, K[55], R2(55));
+ R(a, b, c, d, e, f, g, h, K[56], R2(56));
+ R(h, a, b, c, d, e, f, g, K[57], R2(57));
+ R(g, h, a, b, c, d, e, f, K[58], R2(58));
+ R(f, g, h, a, b, c, d, e, K[59], R2(59));
+ R(e, f, g, h, a, b, c, d, K[60], R2(60));
+ R(d, e, f, g, h, a, b, c, K[61], R2(61));
+ R(c, d, e, f, g, h, a, b, K[62], R2(62));
+ R(b, c, d, e, f, g, h, a, K[63], R2(63));
+
+ h0 += a;
+ h1 += b;
+ h2 += c;
+ h3 += d;
+ h4 += e;
+ h5 += f;
+ h6 += g;
+ h7 += h;
+
+ nblks -= 2;
+ }
+
+ while (nblks)
+ {
+ a = h0;
+ b = h1;
+ c = h2;
+ d = h3;
+ e = h4;
+ f = h5;
+ g = h6;
+ h = h7;
+
+ R(a, b, c, d, e, f, g, h, K[0], I(0));
+ R(h, a, b, c, d, e, f, g, K[1], I(1));
+ R(g, h, a, b, c, d, e, f, K[2], I(2));
+ R(f, g, h, a, b, c, d, e, K[3], I(3));
+ R(e, f, g, h, a, b, c, d, K[4], I(4));
+ R(d, e, f, g, h, a, b, c, K[5], I(5));
+ R(c, d, e, f, g, h, a, b, K[6], I(6));
+ R(b, c, d, e, f, g, h, a, K[7], I(7));
+ R(a, b, c, d, e, f, g, h, K[8], I(8));
+ R(h, a, b, c, d, e, f, g, K[9], I(9));
+ R(g, h, a, b, c, d, e, f, K[10], I(10));
+ R(f, g, h, a, b, c, d, e, K[11], I(11));
+ R(e, f, g, h, a, b, c, d, K[12], I(12));
+ R(d, e, f, g, h, a, b, c, K[13], I(13));
+ R(c, d, e, f, g, h, a, b, K[14], I(14));
+ R(b, c, d, e, f, g, h, a, K[15], I(15));
+ data += 64;
+
+ R(a, b, c, d, e, f, g, h, K[16], W(16));
+ R(h, a, b, c, d, e, f, g, K[17], W(17));
+ R(g, h, a, b, c, d, e, f, K[18], W(18));
+ R(f, g, h, a, b, c, d, e, K[19], W(19));
+ R(e, f, g, h, a, b, c, d, K[20], W(20));
+ R(d, e, f, g, h, a, b, c, K[21], W(21));
+ R(c, d, e, f, g, h, a, b, K[22], W(22));
+ R(b, c, d, e, f, g, h, a, K[23], W(23));
+ R(a, b, c, d, e, f, g, h, K[24], W(24));
+ R(h, a, b, c, d, e, f, g, K[25], W(25));
+ R(g, h, a, b, c, d, e, f, K[26], W(26));
+ R(f, g, h, a, b, c, d, e, K[27], W(27));
+ R(e, f, g, h, a, b, c, d, K[28], W(28));
+ R(d, e, f, g, h, a, b, c, K[29], W(29));
+ R(c, d, e, f, g, h, a, b, K[30], W(30));
+ R(b, c, d, e, f, g, h, a, K[31], W(31));
+
+ R(a, b, c, d, e, f, g, h, K[32], W(32));
+ R(h, a, b, c, d, e, f, g, K[33], W(33));
+ R(g, h, a, b, c, d, e, f, K[34], W(34));
+ R(f, g, h, a, b, c, d, e, K[35], W(35));
+ R(e, f, g, h, a, b, c, d, K[36], W(36));
+ R(d, e, f, g, h, a, b, c, K[37], W(37));
+ R(c, d, e, f, g, h, a, b, K[38], W(38));
+ R(b, c, d, e, f, g, h, a, K[39], W(39));
+ R(a, b, c, d, e, f, g, h, K[40], W(40));
+ R(h, a, b, c, d, e, f, g, K[41], W(41));
+ R(g, h, a, b, c, d, e, f, K[42], W(42));
+ R(f, g, h, a, b, c, d, e, K[43], W(43));
+ R(e, f, g, h, a, b, c, d, K[44], W(44));
+ R(d, e, f, g, h, a, b, c, K[45], W(45));
+ R(c, d, e, f, g, h, a, b, K[46], W(46));
+ R(b, c, d, e, f, g, h, a, K[47], W(47));
+
+ R(a, b, c, d, e, f, g, h, K[48], W(48));
+ R(h, a, b, c, d, e, f, g, K[49], W(49));
+ R(g, h, a, b, c, d, e, f, K[50], W(50));
+ R(f, g, h, a, b, c, d, e, K[51], W(51));
+ R(e, f, g, h, a, b, c, d, K[52], W(52));
+ R(d, e, f, g, h, a, b, c, K[53], W(53));
+ R(c, d, e, f, g, h, a, b, K[54], W(54));
+ R(b, c, d, e, f, g, h, a, K[55], W(55));
+ R(a, b, c, d, e, f, g, h, K[56], W(56));
+ R(h, a, b, c, d, e, f, g, K[57], W(57));
+ R(g, h, a, b, c, d, e, f, K[58], W(58));
+ R(f, g, h, a, b, c, d, e, K[59], W(59));
+ R(e, f, g, h, a, b, c, d, K[60], W(60));
+ R(d, e, f, g, h, a, b, c, K[61], W(61));
+ R(c, d, e, f, g, h, a, b, K[62], W(62));
+ R(b, c, d, e, f, g, h, a, K[63], W(63));
+
+ h0 += a;
+ h1 += b;
+ h2 += c;
+ h3 += d;
+ h4 += e;
+ h5 += f;
+ h6 += g;
+ h7 += h;
+
+ nblks--;
+ }
+
+ h0_h3 = vec_merge_idx0_elems (h0, h1, h2, h3);
+ h4_h7 = vec_merge_idx0_elems (h4, h5, h6, h7);
+ vec_vsx_st (h0_h3, 4 * 0, state);
+ vec_vsx_st (h4_h7, 4 * 4, state);
+
+ return sizeof(w2) + sizeof(w);
+}
+#undef R
+#undef Cho
+#undef Maj
+#undef Sum0
+#undef Sum1
+#undef S0
+#undef S1
+#undef I
+#undef W
+#undef I2
+#undef W2
+#undef R2
+
+
+/* SHA2 round in general purpose registers */
+#define R(a,b,c,d,e,f,g,h,k,w) do \
+ { \
+ t1 = (h) + Sum1((e)) + Cho((e),(f),(g)) + ((k) + (w));\
+ t2 = Sum0((a)) + Maj((a),(b),(c)); \
+ d += t1; \
+ h = t1 + t2; \
+ } while (0)
+
+#define Cho(x, y, z) ((x & y) + (~x & z))
+
+#define Maj(z, x, y) ((x & y) + (z & (x ^ y)))
+
+#define Sum0(x) (ror (x, 2) ^ ror (x ^ ror (x, 22-13), 13))
+
+#define Sum1(x) (ror (x, 6) ^ ror (x, 11) ^ ror (x, 25))
+
+
+/* Message expansion on general purpose registers */
+#define S0(x) (ror ((x), 7) ^ ror ((x), 18) ^ ((x) >> 3))
+#define S1(x) (ror ((x), 17) ^ ror ((x), 19) ^ ((x) >> 10))
+
+#define I(i) ( w[i] = buf_get_be32(data + i * 4) )
+#define WN(i) ({ w[i&0x0f] += w[(i-7) &0x0f]; \
+ w[i&0x0f] += S0(w[(i-15)&0x0f]); \
+ w[i&0x0f] += S1(w[(i-2) &0x0f]); \
+ w[i&0x0f]; })
+#define W(i) ({ u32 r = w[i&0x0f]; WN(i); r; })
+#define L(i) w[i&0x0f]
+
+
+unsigned int ASM_FUNC_ATTR
+_gcry_sha256_transform_ppc9(u32 state[8], const unsigned char *data,
+ size_t nblks)
+{
+  /* GPRs are used for the round function and message expansion because the
+   * vector-intrinsics-based variant generates slower code on POWER9. */
+ u32 a, b, c, d, e, f, g, h, t1, t2;
+ u32 w[16];
+
+ a = state[0];
+ b = state[1];
+ c = state[2];
+ d = state[3];
+ e = state[4];
+ f = state[5];
+ g = state[6];
+ h = state[7];
+
+ while (nblks >= 2)
+ {
+ I(0); I(1); I(2); I(3);
+ I(4); I(5); I(6); I(7);
+ I(8); I(9); I(10); I(11);
+ I(12); I(13); I(14); I(15);
+ data += 64;
+ R(a, b, c, d, e, f, g, h, K[0], W(0));
+ R(h, a, b, c, d, e, f, g, K[1], W(1));
+ R(g, h, a, b, c, d, e, f, K[2], W(2));
+ R(f, g, h, a, b, c, d, e, K[3], W(3));
+ R(e, f, g, h, a, b, c, d, K[4], W(4));
+ R(d, e, f, g, h, a, b, c, K[5], W(5));
+ R(c, d, e, f, g, h, a, b, K[6], W(6));
+ R(b, c, d, e, f, g, h, a, K[7], W(7));
+ R(a, b, c, d, e, f, g, h, K[8], W(8));
+ R(h, a, b, c, d, e, f, g, K[9], W(9));
+ R(g, h, a, b, c, d, e, f, K[10], W(10));
+ R(f, g, h, a, b, c, d, e, K[11], W(11));
+ R(e, f, g, h, a, b, c, d, K[12], W(12));
+ R(d, e, f, g, h, a, b, c, K[13], W(13));
+ R(c, d, e, f, g, h, a, b, K[14], W(14));
+ R(b, c, d, e, f, g, h, a, K[15], W(15));
+
+ R(a, b, c, d, e, f, g, h, K[16], W(16));
+ R(h, a, b, c, d, e, f, g, K[17], W(17));
+ R(g, h, a, b, c, d, e, f, K[18], W(18));
+ R(f, g, h, a, b, c, d, e, K[19], W(19));
+ R(e, f, g, h, a, b, c, d, K[20], W(20));
+ R(d, e, f, g, h, a, b, c, K[21], W(21));
+ R(c, d, e, f, g, h, a, b, K[22], W(22));
+ R(b, c, d, e, f, g, h, a, K[23], W(23));
+ R(a, b, c, d, e, f, g, h, K[24], W(24));
+ R(h, a, b, c, d, e, f, g, K[25], W(25));
+ R(g, h, a, b, c, d, e, f, K[26], W(26));
+ R(f, g, h, a, b, c, d, e, K[27], W(27));
+ R(e, f, g, h, a, b, c, d, K[28], W(28));
+ R(d, e, f, g, h, a, b, c, K[29], W(29));
+ R(c, d, e, f, g, h, a, b, K[30], W(30));
+ R(b, c, d, e, f, g, h, a, K[31], W(31));
+
+ R(a, b, c, d, e, f, g, h, K[32], W(32));
+ R(h, a, b, c, d, e, f, g, K[33], W(33));
+ R(g, h, a, b, c, d, e, f, K[34], W(34));
+ R(f, g, h, a, b, c, d, e, K[35], W(35));
+ R(e, f, g, h, a, b, c, d, K[36], W(36));
+ R(d, e, f, g, h, a, b, c, K[37], W(37));
+ R(c, d, e, f, g, h, a, b, K[38], W(38));
+ R(b, c, d, e, f, g, h, a, K[39], W(39));
+ R(a, b, c, d, e, f, g, h, K[40], W(40));
+ R(h, a, b, c, d, e, f, g, K[41], W(41));
+ R(g, h, a, b, c, d, e, f, K[42], W(42));
+ R(f, g, h, a, b, c, d, e, K[43], W(43));
+ R(e, f, g, h, a, b, c, d, K[44], W(44));
+ R(d, e, f, g, h, a, b, c, K[45], W(45));
+ R(c, d, e, f, g, h, a, b, K[46], W(46));
+ R(b, c, d, e, f, g, h, a, K[47], W(47));
+
+ R(a, b, c, d, e, f, g, h, K[48], L(48));
+ R(h, a, b, c, d, e, f, g, K[49], L(49));
+ R(g, h, a, b, c, d, e, f, K[50], L(50));
+ R(f, g, h, a, b, c, d, e, K[51], L(51));
+ I(0); I(1); I(2); I(3);
+ R(e, f, g, h, a, b, c, d, K[52], L(52));
+ R(d, e, f, g, h, a, b, c, K[53], L(53));
+ R(c, d, e, f, g, h, a, b, K[54], L(54));
+ R(b, c, d, e, f, g, h, a, K[55], L(55));
+ I(4); I(5); I(6); I(7);
+ R(a, b, c, d, e, f, g, h, K[56], L(56));
+ R(h, a, b, c, d, e, f, g, K[57], L(57));
+ R(g, h, a, b, c, d, e, f, K[58], L(58));
+ R(f, g, h, a, b, c, d, e, K[59], L(59));
+ I(8); I(9); I(10); I(11);
+ R(e, f, g, h, a, b, c, d, K[60], L(60));
+ R(d, e, f, g, h, a, b, c, K[61], L(61));
+ R(c, d, e, f, g, h, a, b, K[62], L(62));
+ R(b, c, d, e, f, g, h, a, K[63], L(63));
+ I(12); I(13); I(14); I(15);
+ data += 64;
+
+ a += state[0];
+ b += state[1];
+ c += state[2];
+ d += state[3];
+ e += state[4];
+ f += state[5];
+ g += state[6];
+ h += state[7];
+ state[0] = a;
+ state[1] = b;
+ state[2] = c;
+ state[3] = d;
+ state[4] = e;
+ state[5] = f;
+ state[6] = g;
+ state[7] = h;
+
+ R(a, b, c, d, e, f, g, h, K[0], W(0));
+ R(h, a, b, c, d, e, f, g, K[1], W(1));
+ R(g, h, a, b, c, d, e, f, K[2], W(2));
+ R(f, g, h, a, b, c, d, e, K[3], W(3));
+ R(e, f, g, h, a, b, c, d, K[4], W(4));
+ R(d, e, f, g, h, a, b, c, K[5], W(5));
+ R(c, d, e, f, g, h, a, b, K[6], W(6));
+ R(b, c, d, e, f, g, h, a, K[7], W(7));
+ R(a, b, c, d, e, f, g, h, K[8], W(8));
+ R(h, a, b, c, d, e, f, g, K[9], W(9));
+ R(g, h, a, b, c, d, e, f, K[10], W(10));
+ R(f, g, h, a, b, c, d, e, K[11], W(11));
+ R(e, f, g, h, a, b, c, d, K[12], W(12));
+ R(d, e, f, g, h, a, b, c, K[13], W(13));
+ R(c, d, e, f, g, h, a, b, K[14], W(14));
+ R(b, c, d, e, f, g, h, a, K[15], W(15));
+
+ R(a, b, c, d, e, f, g, h, K[16], W(16));
+ R(h, a, b, c, d, e, f, g, K[17], W(17));
+ R(g, h, a, b, c, d, e, f, K[18], W(18));
+ R(f, g, h, a, b, c, d, e, K[19], W(19));
+ R(e, f, g, h, a, b, c, d, K[20], W(20));
+ R(d, e, f, g, h, a, b, c, K[21], W(21));
+ R(c, d, e, f, g, h, a, b, K[22], W(22));
+ R(b, c, d, e, f, g, h, a, K[23], W(23));
+ R(a, b, c, d, e, f, g, h, K[24], W(24));
+ R(h, a, b, c, d, e, f, g, K[25], W(25));
+ R(g, h, a, b, c, d, e, f, K[26], W(26));
+ R(f, g, h, a, b, c, d, e, K[27], W(27));
+ R(e, f, g, h, a, b, c, d, K[28], W(28));
+ R(d, e, f, g, h, a, b, c, K[29], W(29));
+ R(c, d, e, f, g, h, a, b, K[30], W(30));
+ R(b, c, d, e, f, g, h, a, K[31], W(31));
+
+ R(a, b, c, d, e, f, g, h, K[32], W(32));
+ R(h, a, b, c, d, e, f, g, K[33], W(33));
+ R(g, h, a, b, c, d, e, f, K[34], W(34));
+ R(f, g, h, a, b, c, d, e, K[35], W(35));
+ R(e, f, g, h, a, b, c, d, K[36], W(36));
+ R(d, e, f, g, h, a, b, c, K[37], W(37));
+ R(c, d, e, f, g, h, a, b, K[38], W(38));
+ R(b, c, d, e, f, g, h, a, K[39], W(39));
+ R(a, b, c, d, e, f, g, h, K[40], W(40));
+ R(h, a, b, c, d, e, f, g, K[41], W(41));
+ R(g, h, a, b, c, d, e, f, K[42], W(42));
+ R(f, g, h, a, b, c, d, e, K[43], W(43));
+ R(e, f, g, h, a, b, c, d, K[44], W(44));
+ R(d, e, f, g, h, a, b, c, K[45], W(45));
+ R(c, d, e, f, g, h, a, b, K[46], W(46));
+ R(b, c, d, e, f, g, h, a, K[47], W(47));
+
+ R(a, b, c, d, e, f, g, h, K[48], L(48));
+ R(h, a, b, c, d, e, f, g, K[49], L(49));
+ R(g, h, a, b, c, d, e, f, K[50], L(50));
+ R(f, g, h, a, b, c, d, e, K[51], L(51));
+ R(e, f, g, h, a, b, c, d, K[52], L(52));
+ R(d, e, f, g, h, a, b, c, K[53], L(53));
+ R(c, d, e, f, g, h, a, b, K[54], L(54));
+ R(b, c, d, e, f, g, h, a, K[55], L(55));
+ R(a, b, c, d, e, f, g, h, K[56], L(56));
+ R(h, a, b, c, d, e, f, g, K[57], L(57));
+ R(g, h, a, b, c, d, e, f, K[58], L(58));
+ R(f, g, h, a, b, c, d, e, K[59], L(59));
+ R(e, f, g, h, a, b, c, d, K[60], L(60));
+ R(d, e, f, g, h, a, b, c, K[61], L(61));
+ R(c, d, e, f, g, h, a, b, K[62], L(62));
+ R(b, c, d, e, f, g, h, a, K[63], L(63));
+
+ a += state[0];
+ b += state[1];
+ c += state[2];
+ d += state[3];
+ e += state[4];
+ f += state[5];
+ g += state[6];
+ h += state[7];
+ state[0] = a;
+ state[1] = b;
+ state[2] = c;
+ state[3] = d;
+ state[4] = e;
+ state[5] = f;
+ state[6] = g;
+ state[7] = h;
+
+ nblks -= 2;
+ }
+
+ while (nblks)
+ {
+ I(0); I(1); I(2); I(3);
+ I(4); I(5); I(6); I(7);
+ I(8); I(9); I(10); I(11);
+ I(12); I(13); I(14); I(15);
+ data += 64;
+ R(a, b, c, d, e, f, g, h, K[0], W(0));
+ R(h, a, b, c, d, e, f, g, K[1], W(1));
+ R(g, h, a, b, c, d, e, f, K[2], W(2));
+ R(f, g, h, a, b, c, d, e, K[3], W(3));
+ R(e, f, g, h, a, b, c, d, K[4], W(4));
+ R(d, e, f, g, h, a, b, c, K[5], W(5));
+ R(c, d, e, f, g, h, a, b, K[6], W(6));
+ R(b, c, d, e, f, g, h, a, K[7], W(7));
+ R(a, b, c, d, e, f, g, h, K[8], W(8));
+ R(h, a, b, c, d, e, f, g, K[9], W(9));
+ R(g, h, a, b, c, d, e, f, K[10], W(10));
+ R(f, g, h, a, b, c, d, e, K[11], W(11));
+ R(e, f, g, h, a, b, c, d, K[12], W(12));
+ R(d, e, f, g, h, a, b, c, K[13], W(13));
+ R(c, d, e, f, g, h, a, b, K[14], W(14));
+ R(b, c, d, e, f, g, h, a, K[15], W(15));
+
+ R(a, b, c, d, e, f, g, h, K[16], W(16));
+ R(h, a, b, c, d, e, f, g, K[17], W(17));
+ R(g, h, a, b, c, d, e, f, K[18], W(18));
+ R(f, g, h, a, b, c, d, e, K[19], W(19));
+ R(e, f, g, h, a, b, c, d, K[20], W(20));
+ R(d, e, f, g, h, a, b, c, K[21], W(21));
+ R(c, d, e, f, g, h, a, b, K[22], W(22));
+ R(b, c, d, e, f, g, h, a, K[23], W(23));
+ R(a, b, c, d, e, f, g, h, K[24], W(24));
+ R(h, a, b, c, d, e, f, g, K[25], W(25));
+ R(g, h, a, b, c, d, e, f, K[26], W(26));
+ R(f, g, h, a, b, c, d, e, K[27], W(27));
+ R(e, f, g, h, a, b, c, d, K[28], W(28));
+ R(d, e, f, g, h, a, b, c, K[29], W(29));
+ R(c, d, e, f, g, h, a, b, K[30], W(30));
+ R(b, c, d, e, f, g, h, a, K[31], W(31));
+
+ R(a, b, c, d, e, f, g, h, K[32], W(32));
+ R(h, a, b, c, d, e, f, g, K[33], W(33));
+ R(g, h, a, b, c, d, e, f, K[34], W(34));
+ R(f, g, h, a, b, c, d, e, K[35], W(35));
+ R(e, f, g, h, a, b, c, d, K[36], W(36));
+ R(d, e, f, g, h, a, b, c, K[37], W(37));
+ R(c, d, e, f, g, h, a, b, K[38], W(38));
+ R(b, c, d, e, f, g, h, a, K[39], W(39));
+ R(a, b, c, d, e, f, g, h, K[40], W(40));
+ R(h, a, b, c, d, e, f, g, K[41], W(41));
+ R(g, h, a, b, c, d, e, f, K[42], W(42));
+ R(f, g, h, a, b, c, d, e, K[43], W(43));
+ R(e, f, g, h, a, b, c, d, K[44], W(44));
+ R(d, e, f, g, h, a, b, c, K[45], W(45));
+ R(c, d, e, f, g, h, a, b, K[46], W(46));
+ R(b, c, d, e, f, g, h, a, K[47], W(47));
+
+ R(a, b, c, d, e, f, g, h, K[48], L(48));
+ R(h, a, b, c, d, e, f, g, K[49], L(49));
+ R(g, h, a, b, c, d, e, f, K[50], L(50));
+ R(f, g, h, a, b, c, d, e, K[51], L(51));
+ R(e, f, g, h, a, b, c, d, K[52], L(52));
+ R(d, e, f, g, h, a, b, c, K[53], L(53));
+ R(c, d, e, f, g, h, a, b, K[54], L(54));
+ R(b, c, d, e, f, g, h, a, K[55], L(55));
+ R(a, b, c, d, e, f, g, h, K[56], L(56));
+ R(h, a, b, c, d, e, f, g, K[57], L(57));
+ R(g, h, a, b, c, d, e, f, K[58], L(58));
+ R(f, g, h, a, b, c, d, e, K[59], L(59));
+ R(e, f, g, h, a, b, c, d, K[60], L(60));
+ R(d, e, f, g, h, a, b, c, K[61], L(61));
+ R(c, d, e, f, g, h, a, b, K[62], L(62));
+ R(b, c, d, e, f, g, h, a, K[63], L(63));
+
+ a += state[0];
+ b += state[1];
+ c += state[2];
+ d += state[3];
+ e += state[4];
+ f += state[5];
+ g += state[6];
+ h += state[7];
+ state[0] = a;
+ state[1] = b;
+ state[2] = c;
+ state[3] = d;
+ state[4] = e;
+ state[5] = f;
+ state[6] = g;
+ state[7] = h;
+
+ nblks--;
+ }
+
+ return sizeof(w);
+}
+
+#endif /* ENABLE_PPC_CRYPTO_SUPPORT */
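In _gcry_sha256_transform_ppc8 each of the eight working vectors h0..h7 carries its state word in element 0; vec_rol_elems sets that up after the load and vec_merge_idx0_elems gathers the words back for the final store. A rough scalar model of that bookkeeping (illustrative only; the element indices follow the big-endian view, and the shift adjustments inside vec_rol_elems are what compensate on little-endian targets):

#include <stdint.h>

typedef struct { uint32_t e[4]; } v4u32;

/* Model of vec_rol_elems: rotate the four elements left by idx. */
static v4u32
rol_elems (v4u32 v, unsigned int idx)
{
  v4u32 r;
  unsigned int i;

  for (i = 0; i < 4; i++)
    r.e[i] = v.e[(i + idx) % 4];
  return r;
}

/* Model of vec_merge_idx0_elems: gather element 0 of four vectors. */
static v4u32
merge_idx0 (v4u32 v0, v4u32 v1, v4u32 v2, v4u32 v3)
{
  v4u32 r;

  r.e[0] = v0.e[0];
  r.e[1] = v1.e[0];
  r.e[2] = v2.e[0];
  r.e[3] = v3.e[0];
  return r;
}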
diff --git a/comm/third_party/libgcrypt/cipher/sha256-ssse3-amd64.S b/comm/third_party/libgcrypt/cipher/sha256-ssse3-amd64.S
new file mode 100644
index 0000000000..098b0eb641
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha256-ssse3-amd64.S
@@ -0,0 +1,528 @@
+/*
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright (c) 2012, Intel Corporation
+;
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are
+; met:
+;
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the
+; distribution.
+;
+; * Neither the name of the Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived from
+; this software without specific prior written permission.
+;
+;
+; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
+; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; This code is described in an Intel White-Paper:
+; "Fast SHA-256 Implementations on Intel Architecture Processors"
+;
+; To find it, surf to http://www.intel.com/p/en_US/embedded
+; and search for that title.
+; The paper is expected to be released roughly at the end of April, 2012
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; This code schedules 1 block at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+*/
+/*
+ * Conversion to GAS assembly and integration to libgcrypt
+ * by Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * Note: the original implementation was named SHA256-SSE4; however, only SSSE3
+ * is required.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA256)
+
+#include "asm-common-amd64.h"
+
+.intel_syntax noprefix
+
+#define MOVDQ movdqu /* assume buffers not aligned */
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros*/
+
+/* addm [mem], reg
+ * Add reg to mem using reg-mem add and store */
+#define addm(p1, p2) \
+ add p2, p1; \
+ mov p1, p2;
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+/* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+ * Load xmm with mem and byte swap each dword */
+#define COPY_XMM_AND_BSWAP(p1, p2, p3) \
+ MOVDQ p1, p2; \
+ pshufb p1, p3;
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+#define X0 xmm4
+#define X1 xmm5
+#define X2 xmm6
+#define X3 xmm7
+
+#define XTMP0 xmm0
+#define XTMP1 xmm1
+#define XTMP2 xmm2
+#define XTMP3 xmm3
+#define XTMP4 xmm8
+#define XFER xmm9
+
+#define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */
+#define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */
+#define BYTE_FLIP_MASK xmm12
+
+#define NUM_BLKS rdx /* 3rd arg */
+#define CTX rsi /* 2nd arg */
+#define INP rdi /* 1st arg */
+
+#define SRND rdi /* clobbers INP */
+#define c ecx
+#define d r8d
+#define e edx
+
+#define TBL rbp
+#define a eax
+#define b ebx
+
+#define f r9d
+#define g r10d
+#define h r11d
+
+#define y0 r13d
+#define y1 r14d
+#define y2 r15d
+
+
+
+#define _INP_END_SIZE 8
+#define _INP_SIZE 8
+#define _XFER_SIZE 8
+#define _XMM_SAVE_SIZE 0
+/* STACK_SIZE plus pushes must be an odd multiple of 8 */
+#define _ALIGN_SIZE 8
+
+#define _INP_END 0
+#define _INP (_INP_END + _INP_END_SIZE)
+#define _XFER (_INP + _INP_SIZE)
+#define _XMM_SAVE (_XFER + _XFER_SIZE + _ALIGN_SIZE)
+#define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE)
+
+
+#define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ /* compute s0 four at a time and s1 two at a time */; \
+ /* compute W[-16] + W[-7] 4 at a time */; \
+ movdqa XTMP0, X3; \
+ mov y0, e /* y0 = e */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ palignr XTMP0, X2, 4 /* XTMP0 = W[-7] */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ movdqa XTMP1, X1; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ xor y2, g /* y2 = f^g */; \
+ paddd XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ /* compute s0 */; \
+ palignr XTMP1, X0, 4 /* XTMP1 = W[-15] */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	ror	y0, 6		/* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ movdqa XTMP2, XTMP1 /* XTMP2 = W[-15] */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */; \
+ movdqa XTMP3, XTMP1 /* XTMP3 = W[-15] */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pslld XTMP1, (32-7); \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ psrld XTMP2, 7; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ por XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
+ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
+
+#define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ movdqa XTMP2, XTMP3 /* XTMP2 = W[-15] */; \
+ mov y0, e /* y0 = e */; \
+ mov y1, a /* y1 = a */; \
+ movdqa XTMP4, XTMP3 /* XTMP4 = W[-15] */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ pslld XTMP3, (32-18); \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ xor y2, g /* y2 = f^g */; \
+ psrld XTMP2, 18; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+	ror	y0, 6		/* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
+ pxor XTMP1, XTMP3; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ psrld XTMP4, 3 /* XTMP4 = W[-15] >> 3 */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ pxor XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pxor XTMP1, XTMP4 /* XTMP1 = s0 */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ /* compute low s1 */; \
+ pshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ paddd XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
+ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
+
+#define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {BBAA} */; \
+ mov y0, e /* y0 = e */; \
+ mov y1, a /* y1 = a */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ movdqa XTMP4, XTMP2 /* XTMP4 = W[-2] {BBAA} */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ mov y2, f /* y2 = f */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */; \
+ xor y2, g /* y2 = f^g */; \
+ psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ psrld XTMP4, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+	ror	y0, 6		/* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
+ pxor XTMP2, XTMP3; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */; \
+ pxor XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pshufb XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ paddd XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ /* compute high s1 */; \
+ pshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
+ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
+
+#define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {DDCC} */; \
+ mov y0, e /* y0 = e */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ movdqa X0, XTMP2 /* X0 = W[-2] {DDCC} */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ mov y2, f /* y2 = f */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ xor y2, g /* y2 = f^g */; \
+ psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ psrld X0, 10 /* X0 = W[-2] >> 10 {DDCC} */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	ror	y0, 6		/* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ pxor XTMP2, XTMP3; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */; \
+ pxor X0, XTMP2 /* X0 = s1 {xDxC} */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ pshufb X0, SHUF_DC00 /* X0 = s1 {DC00} */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ paddd X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
+ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
+
+#define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+ FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \
+ FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \
+ FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \
+ FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e);
+
+/* input is [rsp + _XFER + %1 * 4] */
+#define DO_ROUND(i1, a, b, c, d, e, f, g, h) \
+ mov y0, e /* y0 = e */; \
+ ror y0, (25-11) /* y0 = e >> (25-11) */; \
+ mov y1, a /* y1 = a */; \
+ xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
+ ror y1, (22-13) /* y1 = a >> (22-13) */; \
+ mov y2, f /* y2 = f */; \
+ xor y1, a /* y1 = a ^ (a >> (22-13) */; \
+ ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+ xor y2, g /* y2 = f^g */; \
+ xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+ ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+ and y2, e /* y2 = (f^g)&e */; \
+ xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	ror	y0, 6		/* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
+ xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
+ add y2, y0 /* y2 = S1 + CH */; \
+ ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+ add y2, [rsp + _XFER + i1 * 4] /* y2 = k + w + S1 + CH */; \
+ mov y0, a /* y0 = a */; \
+ add h, y2 /* h = h + S1 + CH + k + w */; \
+ mov y2, a /* y2 = a */; \
+ or y0, c /* y0 = a|c */; \
+ add d, h /* d = d + h + S1 + CH + k + w */; \
+ and y2, c /* y2 = a&c */; \
+ and y0, b /* y0 = (a|c)&b */; \
+ add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
+ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */; \
+ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
+
+/*
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
+;; arg 1 : pointer to input data
+;; arg 2 : pointer to digest
+;; arg 3 : Num blocks
+*/
+.text
+.globl _gcry_sha256_transform_amd64_ssse3
+ELF(.type _gcry_sha256_transform_amd64_ssse3,@function;)
+.align 16
+_gcry_sha256_transform_amd64_ssse3:
+ CFI_STARTPROC()
+ push rbx
+ CFI_PUSH(rbx)
+ push rbp
+ CFI_PUSH(rbp)
+ push r13
+ CFI_PUSH(r13)
+ push r14
+ CFI_PUSH(r14)
+ push r15
+ CFI_PUSH(r15)
+
+ sub rsp, STACK_SIZE
+ CFI_ADJUST_CFA_OFFSET(STACK_SIZE);
+
+ shl NUM_BLKS, 6 /* convert to bytes */
+ jz .Ldone_hash
+ add NUM_BLKS, INP /* pointer to end of data */
+ mov [rsp + _INP_END], NUM_BLKS
+
+ /* load initial digest */
+ mov a,[4*0 + CTX]
+ mov b,[4*1 + CTX]
+ mov c,[4*2 + CTX]
+ mov d,[4*3 + CTX]
+ mov e,[4*4 + CTX]
+ mov f,[4*5 + CTX]
+ mov g,[4*6 + CTX]
+ mov h,[4*7 + CTX]
+
+ movdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP]
+ movdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP]
+ movdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP]
+
+.Loop0:
+ lea TBL, [.LK256 ADD_RIP]
+
+ /* byte swap first 16 dwords */
+ COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK)
+ COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK)
+
+ mov [rsp + _INP], INP
+
+ /* schedule 48 input dwords, by doing 3 rounds of 16 each */
+ mov SRND, 3
+.align 16
+.Loop1:
+ movdqa XFER, [TBL + 0*16]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h)
+
+ movdqa XFER, [TBL + 1*16]
+ paddd XFER, X1
+ movdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d)
+
+ movdqa XFER, [TBL + 2*16]
+ paddd XFER, X2
+ movdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h)
+
+ movdqa XFER, [TBL + 3*16]
+ paddd XFER, X3
+ movdqa [rsp + _XFER], XFER
+ add TBL, 4*16
+ FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d)
+
+ sub SRND, 1
+ jne .Loop1
+
+ mov SRND, 2
+.Loop2:
+ paddd X0, [TBL + 0*16]
+ movdqa [rsp + _XFER], X0
+ DO_ROUND(0, a, b, c, d, e, f, g, h)
+ DO_ROUND(1, h, a, b, c, d, e, f, g)
+ DO_ROUND(2, g, h, a, b, c, d, e, f)
+ DO_ROUND(3, f, g, h, a, b, c, d, e)
+ paddd X1, [TBL + 1*16]
+ movdqa [rsp + _XFER], X1
+ add TBL, 2*16
+ DO_ROUND(0, e, f, g, h, a, b, c, d)
+ DO_ROUND(1, d, e, f, g, h, a, b, c)
+ DO_ROUND(2, c, d, e, f, g, h, a, b)
+ DO_ROUND(3, b, c, d, e, f, g, h, a)
+
+ movdqa X0, X2
+ movdqa X1, X3
+
+ sub SRND, 1
+ jne .Loop2
+
+ addm([4*0 + CTX],a)
+ addm([4*1 + CTX],b)
+ addm([4*2 + CTX],c)
+ addm([4*3 + CTX],d)
+ addm([4*4 + CTX],e)
+ addm([4*5 + CTX],f)
+ addm([4*6 + CTX],g)
+ addm([4*7 + CTX],h)
+
+ mov INP, [rsp + _INP]
+ add INP, 64
+ cmp INP, [rsp + _INP_END]
+ jne .Loop0
+
+ pxor xmm0, xmm0
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+ pxor xmm3, xmm3
+ pxor xmm4, xmm4
+ pxor xmm5, xmm5
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+ pxor xmm8, xmm8
+ pxor xmm9, xmm9
+ pxor xmm10, xmm10
+ pxor xmm11, xmm11
+ pxor xmm12, xmm12
+
+.Ldone_hash:
+ pxor XFER, XFER
+ movdqa [rsp + _XFER], XFER
+ xor eax, eax
+
+ add rsp, STACK_SIZE
+ CFI_ADJUST_CFA_OFFSET(-STACK_SIZE);
+
+ pop r15
+ CFI_POP(r15)
+ pop r14
+ CFI_POP(r14)
+ pop r13
+ CFI_POP(r13)
+ pop rbp
+ CFI_POP(rbp)
+ pop rbx
+ CFI_POP(rbx)
+
+ ret
+ CFI_ENDPROC()
+
+
+.align 16
+.LK256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203
+
+/* shuffle xBxA -> 00BA */
+.L_SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+/* shuffle xDxC -> DC00 */
+.L_SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
+
+#endif
+#endif
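The scalar halves of DO_ROUND and FOUR_ROUNDS_AND_SCHED above compute MAJ as ((a|c)&b)|(a&c) so it interleaves well with the message-schedule instructions; a short C sketch (illustrative, not part of the assembly) confirming that this form agrees bit-for-bit with the textbook majority function:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t
maj_ref (uint32_t a, uint32_t b, uint32_t c)
{
  return (a & b) ^ (a & c) ^ (b & c);
}

static uint32_t
maj_asm_form (uint32_t a, uint32_t b, uint32_t c)
{
  return ((a | c) & b) | (a & c);   /* form used by the round macros above */
}

int
main (void)
{
  uint32_t v[] = { 0x00000000u, 0xffffffffu, 0x12345678u, 0x9abcdef0u };
  unsigned int i, j, k;

  /* Per bit there are only 8 input cases; all of them occur here because
   * both the all-zero and the all-one word are included. */
  for (i = 0; i < 4; i++)
    for (j = 0; j < 4; j++)
      for (k = 0; k < 4; k++)
        assert (maj_ref (v[i], v[j], v[k]) == maj_asm_form (v[i], v[j], v[k]));

  puts ("MAJ forms agree");
  return 0;
}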
diff --git a/comm/third_party/libgcrypt/cipher/sha256.c b/comm/third_party/libgcrypt/cipher/sha256.c
new file mode 100644
index 0000000000..9350589110
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha256.c
@@ -0,0 +1,857 @@
+/* sha256.c - SHA256 hash function
+ * Copyright (C) 2003, 2006, 2008, 2009 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+/* Test vectors:
+
+ "abc"
+ SHA224: 23097d22 3405d822 8642a477 bda255b3 2aadbce4 bda0b3f7 e36c9da7
+ SHA256: ba7816bf 8f01cfea 414140de 5dae2223 b00361a3 96177a9c b410ff61 f20015ad
+
+ "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
+ SHA224: 75388b16 512776cc 5dba5da1 fd890150 b0c6455c b4f58b19 52522525
+ SHA256: 248d6a61 d20638b8 e5c02693 0c3e6039 a33ce459 64ff2167 f6ecedd4 19db06c1
+
+ "a" one million times
+ SHA224: 20794655 980c91d8 bbb4c1ea 97618a4b f03f4258 1948b2ee 4ee7ad67
+ SHA256: cdc76e5c 9914fb92 81a1c7e2 84d73e67 f1809a48 a497200e 046d39cc c7112cd0
+
+ */
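These vectors can be checked through the public API; a hedged usage sketch (assumes a normal libgcrypt installation; compile and link flags, for example those reported by libgcrypt-config --cflags --libs, depend on the system):

#include <stdio.h>
#include <gcrypt.h>

int
main (void)
{
  unsigned char digest[32];
  int i;

  /* One-shot hashing; should print the SHA256 "abc" value listed above. */
  gcry_md_hash_buffer (GCRY_MD_SHA256, digest, "abc", 3);

  for (i = 0; i < 32; i++)
    printf ("%02x", digest[i]);
  putchar ('\n');
  return 0;
}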
+
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+#include "bithelp.h"
+#include "bufhelp.h"
+#include "cipher.h"
+#include "hash-common.h"
+
+
+/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
+#undef USE_SSSE3
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_SSSE3 1
+#endif
+
+/* USE_AVX indicates whether to compile with Intel AVX code. */
+#undef USE_AVX
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX 1
+#endif
+
+/* USE_AVX2 indicates whether to compile with Intel AVX2/BMI2 code. */
+#undef USE_AVX2
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \
+ defined(HAVE_GCC_INLINE_ASM_BMI2) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX2 1
+#endif
+
+/* USE_SHAEXT indicates whether to compile with Intel SHA Extension code. */
+#undef USE_SHAEXT
+#if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \
+ defined(HAVE_GCC_INLINE_ASM_SSE41) && \
+ defined(ENABLE_SHAEXT_SUPPORT)
+# define USE_SHAEXT 1
+#endif
+
+/* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension assembly
+ * code. */
+#undef USE_ARM_CE
+#ifdef ENABLE_ARM_CRYPTO_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO)
+# define USE_ARM_CE 1
+# elif defined(__AARCH64EL__) \
+ && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+# define USE_ARM_CE 1
+# endif
+#endif
+
+/* USE_PPC_CRYPTO indicates whether to enable PowerPC vector crypto
+ * accelerated code. */
+#undef USE_PPC_CRYPTO
+#ifdef ENABLE_PPC_CRYPTO_SUPPORT
+# if defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+ defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC)
+# if __GNUC__ >= 4
+# define USE_PPC_CRYPTO 1
+# endif
+# endif
+#endif
+
+/* USE_S390X_CRYPTO indicates whether to enable zSeries code. */
+#undef USE_S390X_CRYPTO
+#if defined(HAVE_GCC_INLINE_ASM_S390X)
+# define USE_S390X_CRYPTO 1
+#endif /* USE_S390X_CRYPTO */
+
+
+typedef struct {
+ gcry_md_block_ctx_t bctx;
+ u32 h0,h1,h2,h3,h4,h5,h6,h7;
+#ifdef USE_S390X_CRYPTO
+ u32 final_len_msb, final_len_lsb; /* needs to be right after h7. */
+ int use_s390x_crypto;
+#endif
+} SHA256_CONTEXT;
+
+
+/* Assembly implementations use the SystemV ABI; ABI conversion and additional
+ * stack space to store XMM6-XMM15 are needed on Win64. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_AVX2) || \
+ defined(USE_SHAEXT)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# define ASM_EXTRA_STACK (10 * 16 + sizeof(void *) * 4)
+# else
+# define ASM_FUNC_ABI
+# define ASM_EXTRA_STACK 0
+# endif
+#endif
+
+
+#ifdef USE_SSSE3
+unsigned int _gcry_sha256_transform_amd64_ssse3(const void *input_data,
+ u32 state[8],
+ size_t num_blks) ASM_FUNC_ABI;
+
+static unsigned int
+do_sha256_transform_amd64_ssse3(void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA256_CONTEXT *hd = ctx;
+ return _gcry_sha256_transform_amd64_ssse3 (data, &hd->h0, nblks)
+ + ASM_EXTRA_STACK;
+}
+#endif
+
+#ifdef USE_AVX
+unsigned int _gcry_sha256_transform_amd64_avx(const void *input_data,
+ u32 state[8],
+ size_t num_blks) ASM_FUNC_ABI;
+
+static unsigned int
+do_sha256_transform_amd64_avx(void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA256_CONTEXT *hd = ctx;
+ return _gcry_sha256_transform_amd64_avx (data, &hd->h0, nblks)
+ + ASM_EXTRA_STACK;
+}
+#endif
+
+#ifdef USE_AVX2
+unsigned int _gcry_sha256_transform_amd64_avx2(const void *input_data,
+ u32 state[8],
+ size_t num_blks) ASM_FUNC_ABI;
+
+static unsigned int
+do_sha256_transform_amd64_avx2(void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA256_CONTEXT *hd = ctx;
+ return _gcry_sha256_transform_amd64_avx2 (data, &hd->h0, nblks)
+ + ASM_EXTRA_STACK;
+}
+#endif
+
+#ifdef USE_SHAEXT
+/* Does not need ASM_FUNC_ABI */
+unsigned int
+_gcry_sha256_transform_intel_shaext(u32 state[8],
+ const unsigned char *input_data,
+ size_t num_blks);
+
+static unsigned int
+do_sha256_transform_intel_shaext(void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA256_CONTEXT *hd = ctx;
+ return _gcry_sha256_transform_intel_shaext (&hd->h0, data, nblks);
+}
+#endif
+
+#ifdef USE_ARM_CE
+unsigned int _gcry_sha256_transform_armv8_ce(u32 state[8],
+ const void *input_data,
+ size_t num_blks);
+
+static unsigned int
+do_sha256_transform_armv8_ce(void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA256_CONTEXT *hd = ctx;
+ return _gcry_sha256_transform_armv8_ce (&hd->h0, data, nblks);
+}
+#endif
+
+#ifdef USE_PPC_CRYPTO
+unsigned int _gcry_sha256_transform_ppc8(u32 state[8],
+ const unsigned char *input_data,
+ size_t num_blks);
+
+unsigned int _gcry_sha256_transform_ppc9(u32 state[8],
+ const unsigned char *input_data,
+ size_t num_blks);
+
+static unsigned int
+do_sha256_transform_ppc8(void *ctx, const unsigned char *data, size_t nblks)
+{
+ SHA256_CONTEXT *hd = ctx;
+ return _gcry_sha256_transform_ppc8 (&hd->h0, data, nblks);
+}
+
+static unsigned int
+do_sha256_transform_ppc9(void *ctx, const unsigned char *data, size_t nblks)
+{
+ SHA256_CONTEXT *hd = ctx;
+ return _gcry_sha256_transform_ppc9 (&hd->h0, data, nblks);
+}
+#endif
+
+#ifdef USE_S390X_CRYPTO
+#include "asm-inline-s390x.h"
+
+static unsigned int
+do_sha256_transform_s390x (void *ctx, const unsigned char *data, size_t nblks)
+{
+ SHA256_CONTEXT *hd = ctx;
+
+ kimd_execute (KMID_FUNCTION_SHA256, &hd->h0, data, nblks * 64);
+ return 0;
+}
+
+static unsigned int
+do_sha256_final_s390x (void *ctx, const unsigned char *data, size_t datalen,
+ u32 len_msb, u32 len_lsb)
+{
+ SHA256_CONTEXT *hd = ctx;
+
+  /* Make sure that 'final_len' is positioned at the correct offset relative
+   * to 'h0', because the 'h0' pointer is passed as the start of the
+   * parameter block to the 'klmd' instruction. */
+
+ gcry_assert (offsetof (SHA256_CONTEXT, final_len_msb)
+ - offsetof (SHA256_CONTEXT, h0) == 8 * sizeof(u32));
+ gcry_assert (offsetof (SHA256_CONTEXT, final_len_lsb)
+ - offsetof (SHA256_CONTEXT, final_len_msb) == 1 * sizeof(u32));
+
+ hd->final_len_msb = len_msb;
+ hd->final_len_lsb = len_lsb;
+
+ klmd_execute (KMID_FUNCTION_SHA256, &hd->h0, data, datalen);
+ return 0;
+}
+#endif
+
+
+static unsigned int
+do_transform_generic (void *ctx, const unsigned char *data, size_t nblks);
+
+
+static void
+sha256_common_init (SHA256_CONTEXT *hd)
+{
+ unsigned int features = _gcry_get_hw_features ();
+
+ hd->bctx.nblocks = 0;
+ hd->bctx.nblocks_high = 0;
+ hd->bctx.count = 0;
+ hd->bctx.blocksize_shift = _gcry_ctz(64);
+
+ /* Order of feature checks is important here; last match will be
+ * selected. Keep slower implementations at the top and faster at
+ * the bottom. */
+ hd->bctx.bwrite = do_transform_generic;
+#ifdef USE_SSSE3
+ if ((features & HWF_INTEL_SSSE3) != 0)
+ hd->bctx.bwrite = do_sha256_transform_amd64_ssse3;
+#endif
+#ifdef USE_AVX
+ /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs.
+ * Therefore use this implementation on Intel CPUs only. */
+ if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD))
+ hd->bctx.bwrite = do_sha256_transform_amd64_avx;
+#endif
+#ifdef USE_AVX2
+ if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2))
+ hd->bctx.bwrite = do_sha256_transform_amd64_avx2;
+#endif
+#ifdef USE_SHAEXT
+ if ((features & HWF_INTEL_SHAEXT) && (features & HWF_INTEL_SSE4_1))
+ hd->bctx.bwrite = do_sha256_transform_intel_shaext;
+#endif
+#ifdef USE_ARM_CE
+ if ((features & HWF_ARM_SHA2) != 0)
+ hd->bctx.bwrite = do_sha256_transform_armv8_ce;
+#endif
+#ifdef USE_PPC_CRYPTO
+ if ((features & HWF_PPC_VCRYPTO) != 0)
+ hd->bctx.bwrite = do_sha256_transform_ppc8;
+ if ((features & HWF_PPC_VCRYPTO) != 0 && (features & HWF_PPC_ARCH_3_00) != 0)
+ hd->bctx.bwrite = do_sha256_transform_ppc9;
+#endif
+#ifdef USE_S390X_CRYPTO
+ hd->use_s390x_crypto = 0;
+ if ((features & HWF_S390X_MSA) != 0)
+ {
+ if ((kimd_query () & km_function_to_mask (KMID_FUNCTION_SHA256)) &&
+ (klmd_query () & km_function_to_mask (KMID_FUNCTION_SHA256)))
+ {
+ hd->bctx.bwrite = do_sha256_transform_s390x;
+ hd->use_s390x_crypto = 1;
+ }
+ }
+#endif
+ (void)features;
+}
+
+
+static void
+sha256_init (void *context, unsigned int flags)
+{
+ SHA256_CONTEXT *hd = context;
+
+ (void)flags;
+
+ hd->h0 = 0x6a09e667;
+ hd->h1 = 0xbb67ae85;
+ hd->h2 = 0x3c6ef372;
+ hd->h3 = 0xa54ff53a;
+ hd->h4 = 0x510e527f;
+ hd->h5 = 0x9b05688c;
+ hd->h6 = 0x1f83d9ab;
+ hd->h7 = 0x5be0cd19;
+
+ sha256_common_init (hd);
+}
+
+
+static void
+sha224_init (void *context, unsigned int flags)
+{
+ SHA256_CONTEXT *hd = context;
+
+ (void)flags;
+
+ hd->h0 = 0xc1059ed8;
+ hd->h1 = 0x367cd507;
+ hd->h2 = 0x3070dd17;
+ hd->h3 = 0xf70e5939;
+ hd->h4 = 0xffc00b31;
+ hd->h5 = 0x68581511;
+ hd->h6 = 0x64f98fa7;
+ hd->h7 = 0xbefa4fa4;
+
+ sha256_common_init (hd);
+}
+
+
+/*
+  Transform the message X, which consists of sixteen 32-bit words.  See
+  FIPS 180-2 for details.  */
+#define R(a,b,c,d,e,f,g,h,k,w) do \
+ { \
+ t1 = (h) + Sum1((e)) + Cho((e),(f),(g)) + (k) + (w); \
+ t2 = Sum0((a)) + Maj((a),(b),(c)); \
+ d += t1; \
+ h = t1 + t2; \
+ } while (0)
+
+/* (4.2) same as SHA-1's F1. */
+#define Cho(x, y, z) (z ^ (x & (y ^ z)))
+
+/* (4.3) same as SHA-1's F3 */
+#define Maj(x, y, z) ((x & y) + (z & (x ^ y)))
+
+/* (4.4) */
+#define Sum0(x) (ror (x, 2) ^ ror (x, 13) ^ ror (x, 22))
+
+/* (4.5) */
+#define Sum1(x) (ror (x, 6) ^ ror (x, 11) ^ ror (x, 25))
+
+/* Message expansion */
+#define S0(x) (ror ((x), 7) ^ ror ((x), 18) ^ ((x) >> 3)) /* (4.6) */
+#define S1(x) (ror ((x), 17) ^ ror ((x), 19) ^ ((x) >> 10)) /* (4.7) */
+#define I(i) ( w[i] = buf_get_be32(data + i * 4) )
+#define W(i) ( w[i&0x0f] = S1(w[(i-2) &0x0f]) \
+ + w[(i-7) &0x0f] \
+ + S0(w[(i-15)&0x0f]) \
+ + w[(i-16)&0x0f] )
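+
+/* Note: the message schedule is kept in a rolling 16-word window (index
+ * "i & 0x0f"), so the full 64-entry schedule is never materialized and only
+ * 64 bytes of 'w' are live at any point of the generic transform below. */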
+
+static unsigned int
+do_transform_generic (void *ctx, const unsigned char *data, size_t nblks)
+{
+ SHA256_CONTEXT *hd = ctx;
+ static const u32 K[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+ };
+
+ do
+ {
+
+ u32 a,b,c,d,e,f,g,h,t1,t2;
+ u32 w[16];
+
+ a = hd->h0;
+ b = hd->h1;
+ c = hd->h2;
+ d = hd->h3;
+ e = hd->h4;
+ f = hd->h5;
+ g = hd->h6;
+ h = hd->h7;
+
+ R(a, b, c, d, e, f, g, h, K[0], I(0));
+ R(h, a, b, c, d, e, f, g, K[1], I(1));
+ R(g, h, a, b, c, d, e, f, K[2], I(2));
+ R(f, g, h, a, b, c, d, e, K[3], I(3));
+ R(e, f, g, h, a, b, c, d, K[4], I(4));
+ R(d, e, f, g, h, a, b, c, K[5], I(5));
+ R(c, d, e, f, g, h, a, b, K[6], I(6));
+ R(b, c, d, e, f, g, h, a, K[7], I(7));
+ R(a, b, c, d, e, f, g, h, K[8], I(8));
+ R(h, a, b, c, d, e, f, g, K[9], I(9));
+ R(g, h, a, b, c, d, e, f, K[10], I(10));
+ R(f, g, h, a, b, c, d, e, K[11], I(11));
+ R(e, f, g, h, a, b, c, d, K[12], I(12));
+ R(d, e, f, g, h, a, b, c, K[13], I(13));
+ R(c, d, e, f, g, h, a, b, K[14], I(14));
+ R(b, c, d, e, f, g, h, a, K[15], I(15));
+
+ R(a, b, c, d, e, f, g, h, K[16], W(16));
+ R(h, a, b, c, d, e, f, g, K[17], W(17));
+ R(g, h, a, b, c, d, e, f, K[18], W(18));
+ R(f, g, h, a, b, c, d, e, K[19], W(19));
+ R(e, f, g, h, a, b, c, d, K[20], W(20));
+ R(d, e, f, g, h, a, b, c, K[21], W(21));
+ R(c, d, e, f, g, h, a, b, K[22], W(22));
+ R(b, c, d, e, f, g, h, a, K[23], W(23));
+ R(a, b, c, d, e, f, g, h, K[24], W(24));
+ R(h, a, b, c, d, e, f, g, K[25], W(25));
+ R(g, h, a, b, c, d, e, f, K[26], W(26));
+ R(f, g, h, a, b, c, d, e, K[27], W(27));
+ R(e, f, g, h, a, b, c, d, K[28], W(28));
+ R(d, e, f, g, h, a, b, c, K[29], W(29));
+ R(c, d, e, f, g, h, a, b, K[30], W(30));
+ R(b, c, d, e, f, g, h, a, K[31], W(31));
+
+ R(a, b, c, d, e, f, g, h, K[32], W(32));
+ R(h, a, b, c, d, e, f, g, K[33], W(33));
+ R(g, h, a, b, c, d, e, f, K[34], W(34));
+ R(f, g, h, a, b, c, d, e, K[35], W(35));
+ R(e, f, g, h, a, b, c, d, K[36], W(36));
+ R(d, e, f, g, h, a, b, c, K[37], W(37));
+ R(c, d, e, f, g, h, a, b, K[38], W(38));
+ R(b, c, d, e, f, g, h, a, K[39], W(39));
+ R(a, b, c, d, e, f, g, h, K[40], W(40));
+ R(h, a, b, c, d, e, f, g, K[41], W(41));
+ R(g, h, a, b, c, d, e, f, K[42], W(42));
+ R(f, g, h, a, b, c, d, e, K[43], W(43));
+ R(e, f, g, h, a, b, c, d, K[44], W(44));
+ R(d, e, f, g, h, a, b, c, K[45], W(45));
+ R(c, d, e, f, g, h, a, b, K[46], W(46));
+ R(b, c, d, e, f, g, h, a, K[47], W(47));
+
+ R(a, b, c, d, e, f, g, h, K[48], W(48));
+ R(h, a, b, c, d, e, f, g, K[49], W(49));
+ R(g, h, a, b, c, d, e, f, K[50], W(50));
+ R(f, g, h, a, b, c, d, e, K[51], W(51));
+ R(e, f, g, h, a, b, c, d, K[52], W(52));
+ R(d, e, f, g, h, a, b, c, K[53], W(53));
+ R(c, d, e, f, g, h, a, b, K[54], W(54));
+ R(b, c, d, e, f, g, h, a, K[55], W(55));
+ R(a, b, c, d, e, f, g, h, K[56], W(56));
+ R(h, a, b, c, d, e, f, g, K[57], W(57));
+ R(g, h, a, b, c, d, e, f, K[58], W(58));
+ R(f, g, h, a, b, c, d, e, K[59], W(59));
+ R(e, f, g, h, a, b, c, d, K[60], W(60));
+ R(d, e, f, g, h, a, b, c, K[61], W(61));
+ R(c, d, e, f, g, h, a, b, K[62], W(62));
+ R(b, c, d, e, f, g, h, a, K[63], W(63));
+
+ hd->h0 += a;
+ hd->h1 += b;
+ hd->h2 += c;
+ hd->h3 += d;
+ hd->h4 += e;
+ hd->h5 += f;
+ hd->h6 += g;
+ hd->h7 += h;
+
+ data += 64;
+ }
+ while (--nblks);
+
+ return 26*4 + 32 + 3 * sizeof(void*);
+}
+
+#undef S0
+#undef S1
+#undef R
+
+
+/*
+   The routine finally terminates the computation and returns the
+   digest.  The handle is prepared for a new cycle, but adding bytes
+   to the handle will destroy the returned buffer.
+   Returns: 32 bytes with the message digest.  */
+static void
+sha256_final(void *context)
+{
+ SHA256_CONTEXT *hd = context;
+ u32 t, th, msb, lsb;
+ byte *p;
+ unsigned int burn;
+
+ t = hd->bctx.nblocks;
+ if (sizeof t == sizeof hd->bctx.nblocks)
+ th = hd->bctx.nblocks_high;
+ else
+ th = hd->bctx.nblocks >> 32;
+
+ /* multiply by 64 to make a byte count */
+ lsb = t << 6;
+ msb = (th << 6) | (t >> 26);
+ /* add the count */
+ t = lsb;
+ if ((lsb += hd->bctx.count) < t)
+ msb++;
+ /* multiply by 8 to make a bit count */
+ t = lsb;
+ lsb <<= 3;
+ msb <<= 3;
+ msb |= t >> 29;
+
+ if (0)
+ { }
+#ifdef USE_S390X_CRYPTO
+ else if (hd->use_s390x_crypto)
+ {
+ burn = do_sha256_final_s390x (hd, hd->bctx.buf, hd->bctx.count, msb, lsb);
+ }
+#endif
+ else if (hd->bctx.count < 56) /* enough room */
+ {
+ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */
+ if (hd->bctx.count < 56)
+ memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count);
+
+ /* append the 64 bit count */
+ buf_put_be32(hd->bctx.buf + 56, msb);
+ buf_put_be32(hd->bctx.buf + 60, lsb);
+ burn = (*hd->bctx.bwrite) (hd, hd->bctx.buf, 1);
+ }
+ else /* need one extra block */
+ {
+ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */
+ /* fill pad and next block with zeroes */
+ memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56);
+
+ /* append the 64 bit count */
+ buf_put_be32(hd->bctx.buf + 64 + 56, msb);
+ buf_put_be32(hd->bctx.buf + 64 + 60, lsb);
+ burn = (*hd->bctx.bwrite) (hd, hd->bctx.buf, 2);
+ }
+
+ p = hd->bctx.buf;
+#define X(a) do { buf_put_be32(p, hd->h##a); p += 4; } while(0)
+ X(0);
+ X(1);
+ X(2);
+ X(3);
+ X(4);
+ X(5);
+ X(6);
+ X(7);
+#undef X
+
+ hd->bctx.count = 0;
+
+ _gcry_burn_stack (burn);
+}
+
+static byte *
+sha256_read (void *context)
+{
+ SHA256_CONTEXT *hd = context;
+
+ return hd->bctx.buf;
+}
+
+
+/* Shortcut function which puts the hash value of the supplied buffer
+ * into outbuf, which must have a size of 32 bytes. */
+void
+_gcry_sha256_hash_buffer (void *outbuf, const void *buffer, size_t length)
+{
+ SHA256_CONTEXT hd;
+
+ sha256_init (&hd, 0);
+ _gcry_md_block_write (&hd, buffer, length);
+ sha256_final (&hd);
+ memcpy (outbuf, hd.bctx.buf, 32);
+}
+
+
+/* Variant of the above shortcut function using multiple buffers. */
+void
+_gcry_sha256_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt)
+{
+ SHA256_CONTEXT hd;
+
+ sha256_init (&hd, 0);
+ for (;iovcnt > 0; iov++, iovcnt--)
+ _gcry_md_block_write (&hd,
+ (const char*)iov[0].data + iov[0].off, iov[0].len);
+ sha256_final (&hd);
+ memcpy (outbuf, hd.bctx.buf, 32);
+}
+
+
+/* Shortcut function which puts the hash value of the supplied buffer
+ * into outbuf, which must have a size of 28 bytes. */
+static void
+_gcry_sha224_hash_buffer (void *outbuf, const void *buffer, size_t length)
+{
+ SHA256_CONTEXT hd;
+
+ sha224_init (&hd, 0);
+ _gcry_md_block_write (&hd, buffer, length);
+ sha256_final (&hd);
+ memcpy (outbuf, hd.bctx.buf, 28);
+}
+
+
+/* Variant of the above shortcut function using multiple buffers. */
+static void
+_gcry_sha224_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt)
+{
+ SHA256_CONTEXT hd;
+
+ sha224_init (&hd, 0);
+ for (;iovcnt > 0; iov++, iovcnt--)
+ _gcry_md_block_write (&hd,
+ (const char*)iov[0].data + iov[0].off, iov[0].len);
+ sha256_final (&hd);
+ memcpy (outbuf, hd.bctx.buf, 28);
+}
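+
+/* For illustration only (not part of this file): applications normally reach
+ * these shortcut functions through the public libgcrypt API instead of
+ * calling them directly.  A minimal caller-side sketch, assuming libgcrypt
+ * has already been initialized:
+ *
+ *   #include <gcrypt.h>
+ *
+ *   unsigned char digest[32];
+ *   gcry_md_hash_buffer (GCRY_MD_SHA256, digest, "abc", 3);
+ *
+ * Such calls are expected to end up in the SHA-256 code of this file via the
+ * digest specs registered at the end of the file. */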
+
+
+
+/*
+ Self-test section.
+ */
+
+
+static gpg_err_code_t
+selftests_sha224 (int extended, selftest_report_func_t report)
+{
+ const char *what;
+ const char *errtxt;
+
+ what = "short string";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA224, 0,
+ "abc", 3,
+ "\x23\x09\x7d\x22\x34\x05\xd8\x22\x86\x42\xa4\x77\xbd\xa2\x55\xb3"
+ "\x2a\xad\xbc\xe4\xbd\xa0\xb3\xf7\xe3\x6c\x9d\xa7", 28);
+ if (errtxt)
+ goto failed;
+
+ if (extended)
+ {
+ what = "long string";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA224, 0,
+ "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 56,
+ "\x75\x38\x8b\x16\x51\x27\x76\xcc\x5d\xba\x5d\xa1\xfd\x89\x01\x50"
+ "\xb0\xc6\x45\x5c\xb4\xf5\x8b\x19\x52\x52\x25\x25", 28);
+ if (errtxt)
+ goto failed;
+
+ what = "one million \"a\"";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA224, 1,
+ NULL, 0,
+ "\x20\x79\x46\x55\x98\x0c\x91\xd8\xbb\xb4\xc1\xea\x97\x61\x8a\x4b"
+ "\xf0\x3f\x42\x58\x19\x48\xb2\xee\x4e\xe7\xad\x67", 28);
+ if (errtxt)
+ goto failed;
+ }
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("digest", GCRY_MD_SHA224, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+static gpg_err_code_t
+selftests_sha256 (int extended, selftest_report_func_t report)
+{
+ const char *what;
+ const char *errtxt;
+
+ what = "short string";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA256, 0,
+ "abc", 3,
+ "\xba\x78\x16\xbf\x8f\x01\xcf\xea\x41\x41\x40\xde\x5d\xae\x22\x23"
+ "\xb0\x03\x61\xa3\x96\x17\x7a\x9c\xb4\x10\xff\x61\xf2\x00\x15\xad", 32);
+ if (errtxt)
+ goto failed;
+
+ if (extended)
+ {
+ what = "long string";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA256, 0,
+ "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 56,
+ "\x24\x8d\x6a\x61\xd2\x06\x38\xb8\xe5\xc0\x26\x93\x0c\x3e\x60\x39"
+ "\xa3\x3c\xe4\x59\x64\xff\x21\x67\xf6\xec\xed\xd4\x19\xdb\x06\xc1",
+ 32);
+ if (errtxt)
+ goto failed;
+
+ what = "one million \"a\"";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA256, 1,
+ NULL, 0,
+ "\xcd\xc7\x6e\x5c\x99\x14\xfb\x92\x81\xa1\xc7\xe2\x84\xd7\x3e\x67"
+ "\xf1\x80\x9a\x48\xa4\x97\x20\x0e\x04\x6d\x39\xcc\xc7\x11\x2c\xd0",
+ 32);
+ if (errtxt)
+ goto failed;
+ }
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("digest", GCRY_MD_SHA256, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+/* Run a full self-test for ALGO and return 0 on success. */
+static gpg_err_code_t
+run_selftests (int algo, int extended, selftest_report_func_t report)
+{
+ gpg_err_code_t ec;
+
+ switch (algo)
+ {
+ case GCRY_MD_SHA224:
+ ec = selftests_sha224 (extended, report);
+ break;
+ case GCRY_MD_SHA256:
+ ec = selftests_sha256 (extended, report);
+ break;
+ default:
+ ec = GPG_ERR_DIGEST_ALGO;
+ break;
+
+ }
+ return ec;
+}
+
+
+
+
+static byte asn224[19] = /* Object ID is 2.16.840.1.101.3.4.2.4 */
+ { 0x30, 0x2D, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48,
+ 0x01, 0x65, 0x03, 0x04, 0x02, 0x04, 0x05, 0x00, 0x04,
+ 0x1C
+ };
+
+static gcry_md_oid_spec_t oid_spec_sha224[] =
+ {
+ /* From RFC3874, Section 4 */
+ { "2.16.840.1.101.3.4.2.4" },
+ { NULL },
+ };
+
+static byte asn256[19] = /* Object ID is 2.16.840.1.101.3.4.2.1 */
+ { 0x30, 0x31, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86,
+ 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01, 0x05,
+ 0x00, 0x04, 0x20 };
+
+static gcry_md_oid_spec_t oid_spec_sha256[] =
+ {
+ /* According to the OpenPGP draft rfc2440-bis06 */
+ { "2.16.840.1.101.3.4.2.1" },
+ /* PKCS#1 sha256WithRSAEncryption */
+ { "1.2.840.113549.1.1.11" },
+
+ { NULL },
+ };
+
+gcry_md_spec_t _gcry_digest_spec_sha224 =
+ {
+ GCRY_MD_SHA224, {0, 1},
+ "SHA224", asn224, DIM (asn224), oid_spec_sha224, 28,
+ sha224_init, _gcry_md_block_write, sha256_final, sha256_read, NULL,
+ _gcry_sha224_hash_buffer, _gcry_sha224_hash_buffers,
+ sizeof (SHA256_CONTEXT),
+ run_selftests
+ };
+
+gcry_md_spec_t _gcry_digest_spec_sha256 =
+ {
+ GCRY_MD_SHA256, {0, 1},
+ "SHA256", asn256, DIM (asn256), oid_spec_sha256, 32,
+ sha256_init, _gcry_md_block_write, sha256_final, sha256_read, NULL,
+ _gcry_sha256_hash_buffer, _gcry_sha256_hash_buffers,
+ sizeof (SHA256_CONTEXT),
+ run_selftests
+ };
diff --git a/comm/third_party/libgcrypt/cipher/sha512-arm.S b/comm/third_party/libgcrypt/cipher/sha512-arm.S
new file mode 100644
index 0000000000..94ec0141e7
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha512-arm.S
@@ -0,0 +1,464 @@
+/* sha512-arm.S - ARM assembly implementation of SHA-512 transform
+ *
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include <config.h>
+
+#if defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+/* structure of SHA512_CONTEXT */
+#define hd_a 0
+#define hd_b ((hd_a) + 8)
+#define hd_c ((hd_b) + 8)
+#define hd_d ((hd_c) + 8)
+#define hd_e ((hd_d) + 8)
+#define hd_f ((hd_e) + 8)
+#define hd_g ((hd_f) + 8)
+#define hd_h ((hd_g) + 8)
+
+/* register macros */
+#define RK %r2
+
+#define RElo %r0
+#define REhi %r1
+
+#define RT1lo %r3
+#define RT1hi %r4
+#define RT2lo %r5
+#define RT2hi %r6
+#define RWlo %r7
+#define RWhi %r8
+#define RT3lo %r9
+#define RT3hi %r10
+#define RT4lo %r11
+#define RT4hi %ip
+
+#define RRND %lr
+
+/* variable offsets in stack */
+#define ctx (0)
+#define data ((ctx) + 4)
+#define nblks ((data) + 4)
+#define _a ((nblks) + 4)
+#define _b ((_a) + 8)
+#define _c ((_b) + 8)
+#define _d ((_c) + 8)
+#define _e ((_d) + 8)
+#define _f ((_e) + 8)
+#define _g ((_f) + 8)
+#define _h ((_g) + 8)
+
+#define w(i) ((_h) + 8 + ((i) % 16) * 8)
+
+#define STACK_MAX (w(15) + 8)
+
+/* helper macros */
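+/* ldr_unaligned_be loads a big-endian 32-bit word one byte at a time, so the
+ * source pointer needs no particular alignment. */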
+#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
+ ldrb rout, [rsrc, #((offs) + 3)]; \
+ ldrb rtmp, [rsrc, #((offs) + 2)]; \
+ orr rout, rout, rtmp, lsl #8; \
+ ldrb rtmp, [rsrc, #((offs) + 1)]; \
+ orr rout, rout, rtmp, lsl #16; \
+ ldrb rtmp, [rsrc, #((offs) + 0)]; \
+ orr rout, rout, rtmp, lsl #24;
+
+#ifdef __ARMEL__
+ /* bswap on little-endian */
+#ifdef HAVE_ARM_ARCH_V6
+ #define be_to_host(reg, rtmp) \
+ rev reg, reg;
+#else
+ #define be_to_host(reg, rtmp) \
+ eor rtmp, reg, reg, ror #16; \
+ mov rtmp, rtmp, lsr #8; \
+ bic rtmp, rtmp, #65280; \
+ eor reg, rtmp, reg, ror #8;
+#endif
+#else
+ /* nop on big-endian */
+ #define be_to_host(reg, rtmp) /*_*/
+#endif
+
+#define host_to_host(x, y) /*_*/
+
+#define read_u64_aligned_4(rin, offs, lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3, convert, rtmp) \
+ ldr lo0, [rin, #((offs) + 0 * 8 + 4)]; \
+ ldr hi0, [rin, #((offs) + 0 * 8 + 0)]; \
+ ldr lo1, [rin, #((offs) + 1 * 8 + 4)]; \
+ ldr hi1, [rin, #((offs) + 1 * 8 + 0)]; \
+ ldr lo2, [rin, #((offs) + 2 * 8 + 4)]; \
+ convert(lo0, rtmp); \
+ ldr hi2, [rin, #((offs) + 2 * 8 + 0)]; \
+ convert(hi0, rtmp); \
+ ldr lo3, [rin, #((offs) + 3 * 8 + 4)]; \
+ convert(lo1, rtmp); \
+ ldr hi3, [rin, #((offs) + 3 * 8 + 0)]; \
+ convert(hi1, rtmp); \
+ convert(lo2, rtmp); \
+ convert(hi2, rtmp); \
+ convert(lo3, rtmp); \
+ convert(hi3, rtmp);
+
+#define read_be64_aligned_4(rin, offs, lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3, rtmp0) \
+ read_u64_aligned_4(rin, offs, lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3, be_to_host, rtmp0)
+
+/* need to handle unaligned reads by byte reads */
+#define read_be64_unaligned_4(rin, offs, lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3, rtmp0) \
+ ldr_unaligned_be(lo0, rin, (offs) + 0 * 8 + 4, rtmp0); \
+ ldr_unaligned_be(hi0, rin, (offs) + 0 * 8 + 0, rtmp0); \
+ ldr_unaligned_be(lo1, rin, (offs) + 1 * 8 + 4, rtmp0); \
+ ldr_unaligned_be(hi1, rin, (offs) + 1 * 8 + 0, rtmp0); \
+ ldr_unaligned_be(lo2, rin, (offs) + 2 * 8 + 4, rtmp0); \
+ ldr_unaligned_be(hi2, rin, (offs) + 2 * 8 + 0, rtmp0); \
+ ldr_unaligned_be(lo3, rin, (offs) + 3 * 8 + 4, rtmp0); \
+ ldr_unaligned_be(hi3, rin, (offs) + 3 * 8 + 0, rtmp0);
+
+/***********************************************************************
+ * ARM assembly implementation of sha512 transform
+ ***********************************************************************/
+
+/* Round function */
+
+#define R(_a,_b,_c,_d,_e,_f,_g,_h,W,wi) \
+ /* Message expansion, t1 = _h + w[i] */ \
+ W(_a,_h,wi); \
+ \
+ /* w = Sum1(_e) */ \
+ mov RWlo, RElo, lsr#14; \
+ ldm RK!, {RT2lo-RT2hi}; \
+ mov RWhi, REhi, lsr#14; \
+ eor RWlo, RWlo, RElo, lsr#18; \
+ eor RWhi, RWhi, REhi, lsr#18; \
+ ldr RT3lo, [%sp, #(_f)]; \
+ adds RT1lo, RT2lo; /* t1 += K */ \
+ ldr RT3hi, [%sp, #(_f) + 4]; \
+ adc RT1hi, RT2hi; \
+ ldr RT4lo, [%sp, #(_g)]; \
+ eor RWlo, RWlo, RElo, lsl#23; \
+ ldr RT4hi, [%sp, #(_g) + 4]; \
+ eor RWhi, RWhi, REhi, lsl#23; \
+ eor RWlo, RWlo, REhi, lsl#18; \
+ eor RWhi, RWhi, RElo, lsl#18; \
+ eor RWlo, RWlo, REhi, lsl#14; \
+ eor RWhi, RWhi, RElo, lsl#14; \
+ eor RWlo, RWlo, REhi, lsr#9; \
+ eor RWhi, RWhi, RElo, lsr#9; \
+ \
+ /* Cho(_e,_f,_g) => (_e & _f) ^ (~_e & _g) */ \
+ adds RT1lo, RWlo; /* t1 += Sum1(_e) */ \
+ and RT3lo, RT3lo, RElo; \
+ adc RT1hi, RWhi; \
+ and RT3hi, RT3hi, REhi; \
+ bic RT4lo, RT4lo, RElo; \
+ bic RT4hi, RT4hi, REhi; \
+ eor RT3lo, RT3lo, RT4lo; \
+ eor RT3hi, RT3hi, RT4hi; \
+ \
+ /* Load D */ \
+ /* t1 += Cho(_e,_f,_g) */ \
+ ldr RElo, [%sp, #(_d)]; \
+ adds RT1lo, RT3lo; \
+ ldr REhi, [%sp, #(_d) + 4]; \
+ adc RT1hi, RT3hi; \
+ \
+ /* Load A */ \
+ ldr RT3lo, [%sp, #(_a)]; \
+ \
+ /* _d += t1 */ \
+ adds RElo, RT1lo; \
+ ldr RT3hi, [%sp, #(_a) + 4]; \
+ adc REhi, RT1hi; \
+ \
+ /* Store D */ \
+ str RElo, [%sp, #(_d)]; \
+ \
+ /* t2 = Sum0(_a) */ \
+ mov RT2lo, RT3lo, lsr#28; \
+ str REhi, [%sp, #(_d) + 4]; \
+ mov RT2hi, RT3hi, lsr#28; \
+ ldr RWlo, [%sp, #(_b)]; \
+ eor RT2lo, RT2lo, RT3lo, lsl#30; \
+ ldr RWhi, [%sp, #(_b) + 4]; \
+ eor RT2hi, RT2hi, RT3hi, lsl#30; \
+ eor RT2lo, RT2lo, RT3lo, lsl#25; \
+ eor RT2hi, RT2hi, RT3hi, lsl#25; \
+ eor RT2lo, RT2lo, RT3hi, lsl#4; \
+ eor RT2hi, RT2hi, RT3lo, lsl#4; \
+ eor RT2lo, RT2lo, RT3hi, lsr#2; \
+ eor RT2hi, RT2hi, RT3lo, lsr#2; \
+ eor RT2lo, RT2lo, RT3hi, lsr#7; \
+ eor RT2hi, RT2hi, RT3lo, lsr#7; \
+ \
+ /* t2 += t1 */ \
+ adds RT2lo, RT1lo; \
+ ldr RT1lo, [%sp, #(_c)]; \
+ adc RT2hi, RT1hi; \
+ \
+ /* Maj(_a,_b,_c) => ((_a & _b) ^ (_c & (_a ^ _b))) */ \
+ ldr RT1hi, [%sp, #(_c) + 4]; \
+ and RT4lo, RWlo, RT3lo; \
+ and RT4hi, RWhi, RT3hi; \
+ eor RWlo, RWlo, RT3lo; \
+ eor RWhi, RWhi, RT3hi; \
+ and RWlo, RWlo, RT1lo; \
+ and RWhi, RWhi, RT1hi; \
+ eor RWlo, RWlo, RT4lo; \
+ eor RWhi, RWhi, RT4hi; \
+
+/* Message expansion */
+
+#define W_0_63(_a,_h,i) \
+ ldr RT3lo, [%sp, #(w(i-2))]; \
+ adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */ \
+ ldr RT3hi, [%sp, #(w(i-2)) + 4]; \
+ adc RT2hi, RWhi; \
+ /* nw = S1(w[i-2]) */ \
+ ldr RT1lo, [%sp, #(_h)]; /* Load H */ \
+ mov RWlo, RT3lo, lsr#19; \
+ str RT2lo, [%sp, #(_a)]; \
+ eor RWlo, RWlo, RT3lo, lsl#3; \
+ ldr RT1hi, [%sp, #(_h) + 4]; \
+ mov RWhi, RT3hi, lsr#19; \
+ ldr RT2lo, [%sp, #(w(i-7))]; \
+ eor RWhi, RWhi, RT3hi, lsl#3; \
+ str RT2hi, [%sp, #(_a) + 4]; \
+ eor RWlo, RWlo, RT3lo, lsr#6; \
+ ldr RT2hi, [%sp, #(w(i-7)) + 4]; \
+ eor RWhi, RWhi, RT3hi, lsr#6; \
+ eor RWlo, RWlo, RT3hi, lsl#13; \
+ eor RWhi, RWhi, RT3lo, lsl#13; \
+ eor RWlo, RWlo, RT3hi, lsr#29; \
+ eor RWhi, RWhi, RT3lo, lsr#29; \
+ ldr RT3lo, [%sp, #(w(i-15))]; \
+ eor RWlo, RWlo, RT3hi, lsl#26; \
+ ldr RT3hi, [%sp, #(w(i-15)) + 4]; \
+ \
+ adds RT2lo, RWlo; /* nw += w[i-7] */ \
+ ldr RWlo, [%sp, #(w(i-16))]; \
+ adc RT2hi, RWhi; \
+ mov RT4lo, RT3lo, lsr#1; /* S0(w[i-15]) */ \
+ ldr RWhi, [%sp, #(w(i-16)) + 4]; \
+ mov RT4hi, RT3hi, lsr#1; \
+ adds RT2lo, RWlo; /* nw += w[i-16] */ \
+ eor RT4lo, RT4lo, RT3lo, lsr#8; \
+ eor RT4hi, RT4hi, RT3hi, lsr#8; \
+ eor RT4lo, RT4lo, RT3lo, lsr#7; \
+ eor RT4hi, RT4hi, RT3hi, lsr#7; \
+ eor RT4lo, RT4lo, RT3hi, lsl#31; \
+ eor RT4hi, RT4hi, RT3lo, lsl#31; \
+ eor RT4lo, RT4lo, RT3hi, lsl#24; \
+ eor RT4hi, RT4hi, RT3lo, lsl#24; \
+ eor RT4lo, RT4lo, RT3hi, lsl#25; \
+ adc RT2hi, RWhi; \
+ \
+ /* nw += S0(w[i-15]) */ \
+ adds RT2lo, RT4lo; \
+ adc RT2hi, RT4hi; \
+ \
+ /* w[0] = nw */ \
+ str RT2lo, [%sp, #(w(i))]; \
+ adds RT1lo, RWlo; \
+ str RT2hi, [%sp, #(w(i)) + 4]; \
+ adc RT1hi, RWhi;
+
+#define W_64_79(_a,_h,i) \
+ adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */ \
+ ldr RWlo, [%sp, #(w(i-16))]; \
+ adc RT2hi, RWhi; \
+ ldr RWhi, [%sp, #(w(i-16)) + 4]; \
+ ldr RT1lo, [%sp, #(_h)]; /* Load H */ \
+ ldr RT1hi, [%sp, #(_h) + 4]; \
+ str RT2lo, [%sp, #(_a)]; \
+ str RT2hi, [%sp, #(_a) + 4]; \
+ adds RT1lo, RWlo; \
+ adc RT1hi, RWhi;
+
+.align 3
+.globl _gcry_sha512_transform_arm
+.type _gcry_sha512_transform_arm,%function;
+
+_gcry_sha512_transform_arm:
+ /* Input:
+ * %r0: SHA512_CONTEXT
+ * %r1: data
+ * %r2: u64 k[] constants
+ * %r3: nblks
+ */
+ push {%r4-%r11, %ip, %lr};
+ sub %sp, %sp, #STACK_MAX;
+ movs RWlo, %r3;
+ str %r0, [%sp, #(ctx)];
+
+ beq .Ldone;
+
+.Loop_blocks:
+ str RWlo, [%sp, #nblks];
+
+ /* Load context to stack */
+ add RWhi, %sp, #(_a);
+ ldm %r0!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+ stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+ ldm %r0, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+ stm RWhi, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+ /* Load input to w[16] */
+
+ /* test if data is unaligned */
+ tst %r1, #3;
+ beq 1f;
+
+ /* unaligned load */
+ add RWhi, %sp, #(w(0));
+ read_be64_unaligned_4(%r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+ stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+ read_be64_unaligned_4(%r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+ stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+ read_be64_unaligned_4(%r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+ stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+ read_be64_unaligned_4(%r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+ b 2f;
+1:
+ /* aligned load */
+ add RWhi, %sp, #(w(0));
+ read_be64_aligned_4(%r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+ stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+ read_be64_aligned_4(%r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+ stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+ read_be64_aligned_4(%r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+ stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+ read_be64_aligned_4(%r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+2:
+ add %r1, #(16 * 8);
+ stm RWhi, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+ str %r1, [%sp, #(data)];
+
+ /* preload E & A */
+ ldr RElo, [%sp, #(_e)];
+ ldr REhi, [%sp, #(_e) + 4];
+ mov RWlo, #0;
+ ldr RT2lo, [%sp, #(_a)];
+ mov RRND, #(80-16);
+ ldr RT2hi, [%sp, #(_a) + 4];
+ mov RWhi, #0;
+
+.Loop_rounds:
+ R(_a, _b, _c, _d, _e, _f, _g, _h, W_0_63, 16);
+ R(_h, _a, _b, _c, _d, _e, _f, _g, W_0_63, 17);
+ R(_g, _h, _a, _b, _c, _d, _e, _f, W_0_63, 18);
+ R(_f, _g, _h, _a, _b, _c, _d, _e, W_0_63, 19);
+ R(_e, _f, _g, _h, _a, _b, _c, _d, W_0_63, 20);
+ R(_d, _e, _f, _g, _h, _a, _b, _c, W_0_63, 21);
+ R(_c, _d, _e, _f, _g, _h, _a, _b, W_0_63, 22);
+ R(_b, _c, _d, _e, _f, _g, _h, _a, W_0_63, 23);
+ R(_a, _b, _c, _d, _e, _f, _g, _h, W_0_63, 24);
+ R(_h, _a, _b, _c, _d, _e, _f, _g, W_0_63, 25);
+ R(_g, _h, _a, _b, _c, _d, _e, _f, W_0_63, 26);
+ R(_f, _g, _h, _a, _b, _c, _d, _e, W_0_63, 27);
+ R(_e, _f, _g, _h, _a, _b, _c, _d, W_0_63, 28);
+ R(_d, _e, _f, _g, _h, _a, _b, _c, W_0_63, 29);
+ R(_c, _d, _e, _f, _g, _h, _a, _b, W_0_63, 30);
+ R(_b, _c, _d, _e, _f, _g, _h, _a, W_0_63, 31);
+
+ subs RRND, #16;
+ bne .Loop_rounds;
+
+ R(_a, _b, _c, _d, _e, _f, _g, _h, W_64_79, 16);
+ R(_h, _a, _b, _c, _d, _e, _f, _g, W_64_79, 17);
+ R(_g, _h, _a, _b, _c, _d, _e, _f, W_64_79, 18);
+ R(_f, _g, _h, _a, _b, _c, _d, _e, W_64_79, 19);
+ R(_e, _f, _g, _h, _a, _b, _c, _d, W_64_79, 20);
+ R(_d, _e, _f, _g, _h, _a, _b, _c, W_64_79, 21);
+ R(_c, _d, _e, _f, _g, _h, _a, _b, W_64_79, 22);
+ R(_b, _c, _d, _e, _f, _g, _h, _a, W_64_79, 23);
+ R(_a, _b, _c, _d, _e, _f, _g, _h, W_64_79, 24);
+ R(_h, _a, _b, _c, _d, _e, _f, _g, W_64_79, 25);
+ R(_g, _h, _a, _b, _c, _d, _e, _f, W_64_79, 26);
+ R(_f, _g, _h, _a, _b, _c, _d, _e, W_64_79, 27);
+ R(_e, _f, _g, _h, _a, _b, _c, _d, W_64_79, 28);
+ R(_d, _e, _f, _g, _h, _a, _b, _c, W_64_79, 29);
+ R(_c, _d, _e, _f, _g, _h, _a, _b, W_64_79, 30);
+ R(_b, _c, _d, _e, _f, _g, _h, _a, W_64_79, 31);
+
+ ldr %r0, [%sp, #(ctx)];
+ adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */
+ ldr %r1, [%sp, #(data)];
+ adc RT2hi, RWhi;
+
+ ldm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
+ adds RT1lo, RT2lo;
+ ldr RT2lo, [%sp, #(_b + 0)];
+ adc RT1hi, RT2hi;
+ ldr RT2hi, [%sp, #(_b + 4)];
+ adds RWlo, RT2lo;
+ ldr RT2lo, [%sp, #(_c + 0)];
+ adc RWhi, RT2hi;
+ ldr RT2hi, [%sp, #(_c + 4)];
+ adds RT3lo, RT2lo;
+ ldr RT2lo, [%sp, #(_d + 0)];
+ adc RT3hi, RT2hi;
+ ldr RT2hi, [%sp, #(_d + 4)];
+ adds RT4lo, RT2lo;
+ ldr RT2lo, [%sp, #(_e + 0)];
+ adc RT4hi, RT2hi;
+ stm %r0!, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+ ldr RT2hi, [%sp, #(_e + 4)];
+ ldm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
+ adds RT1lo, RT2lo;
+ ldr RT2lo, [%sp, #(_f + 0)];
+ adc RT1hi, RT2hi;
+ ldr RT2hi, [%sp, #(_f + 4)];
+ adds RWlo, RT2lo;
+ ldr RT2lo, [%sp, #(_g + 0)];
+ adc RWhi, RT2hi;
+ ldr RT2hi, [%sp, #(_g + 4)];
+ adds RT3lo, RT2lo;
+ ldr RT2lo, [%sp, #(_h + 0)];
+ adc RT3hi, RT2hi;
+ ldr RT2hi, [%sp, #(_h + 4)];
+ adds RT4lo, RT2lo;
+ adc RT4hi, RT2hi;
+ stm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
+ sub %r0, %r0, #(4 * 8);
+ ldr RWlo, [%sp, #nblks];
+
+ sub RK, #(80 * 8);
+ subs RWlo, #1;
+ bne .Loop_blocks;
+
+.Ldone:
+ mov %r0, #STACK_MAX;
+__out:
+ add %sp, %sp, #STACK_MAX;
+ pop {%r4-%r11, %ip, %pc};
+.size _gcry_sha512_transform_arm,.-_gcry_sha512_transform_arm;
+
+#endif
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/sha512-armv7-neon.S b/comm/third_party/libgcrypt/cipher/sha512-armv7-neon.S
new file mode 100644
index 0000000000..6596f2cdb2
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha512-armv7-neon.S
@@ -0,0 +1,450 @@
+/* sha512-armv7-neon.S - ARM/NEON assembly implementation of SHA-512 transform
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_NEON)
+
+.text
+
+.syntax unified
+.fpu neon
+.arm
+
+/* structure of SHA512_CONTEXT */
+#define hd_a 0
+#define hd_b ((hd_a) + 8)
+#define hd_c ((hd_b) + 8)
+#define hd_d ((hd_c) + 8)
+#define hd_e ((hd_d) + 8)
+#define hd_f ((hd_e) + 8)
+#define hd_g ((hd_f) + 8)
+
+/* register macros */
+#define RK %r2
+
+#define RA d0
+#define RB d1
+#define RC d2
+#define RD d3
+#define RE d4
+#define RF d5
+#define RG d6
+#define RH d7
+
+#define RT0 d8
+#define RT1 d9
+#define RT2 d10
+#define RT3 d11
+#define RT4 d12
+#define RT5 d13
+#define RT6 d14
+#define RT7 d15
+
+#define RT01q q4
+#define RT23q q5
+#define RT45q q6
+#define RT67q q7
+
+#define RW0 d16
+#define RW1 d17
+#define RW2 d18
+#define RW3 d19
+#define RW4 d20
+#define RW5 d21
+#define RW6 d22
+#define RW7 d23
+#define RW8 d24
+#define RW9 d25
+#define RW10 d26
+#define RW11 d27
+#define RW12 d28
+#define RW13 d29
+#define RW14 d30
+#define RW15 d31
+
+#define RW01q q8
+#define RW23q q9
+#define RW45q q10
+#define RW67q q11
+#define RW89q q12
+#define RW1011q q13
+#define RW1213q q14
+#define RW1415q q15
+
+/***********************************************************************
+ * ARM assembly implementation of sha512 transform
+ ***********************************************************************/
+#define rounds2_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, rw01q, rw2, rw23q, rw1415q, rw9, rw10, interleave_op, arg1) \
+ /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
+ vshr.u64 RT2, re, #14; \
+ vshl.u64 RT3, re, #64 - 14; \
+ interleave_op(arg1); \
+ vshr.u64 RT4, re, #18; \
+ vshl.u64 RT5, re, #64 - 18; \
+ vld1.64 {RT0}, [RK]!; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, re, #41; \
+ vshl.u64 RT5, re, #64 - 41; \
+ vadd.u64 RT0, RT0, rw0; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vmov.64 RT7, re; \
+ veor.64 RT1, RT2, RT3; \
+ vbsl.64 RT7, rf, rg; \
+ \
+ vadd.u64 RT1, RT1, rh; \
+ vshr.u64 RT2, ra, #28; \
+ vshl.u64 RT3, ra, #64 - 28; \
+ vadd.u64 RT1, RT1, RT0; \
+ vshr.u64 RT4, ra, #34; \
+ vshl.u64 RT5, ra, #64 - 34; \
+ vadd.u64 RT1, RT1, RT7; \
+ \
+ /* h = Sum0 (a) + Maj (a, b, c); */ \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, ra, #39; \
+ vshl.u64 RT5, ra, #64 - 39; \
+ veor.64 RT0, ra, rb; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vbsl.64 RT0, rc, rb; \
+ vadd.u64 rd, rd, RT1; /* d+=t1; */ \
+ veor.64 rh, RT2, RT3; \
+ \
+ /* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
+ vshr.u64 RT2, rd, #14; \
+ vshl.u64 RT3, rd, #64 - 14; \
+ vadd.u64 rh, rh, RT0; \
+ vshr.u64 RT4, rd, #18; \
+ vshl.u64 RT5, rd, #64 - 18; \
+ vadd.u64 rh, rh, RT1; /* h+=t1; */ \
+ vld1.64 {RT0}, [RK]!; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, rd, #41; \
+ vshl.u64 RT5, rd, #64 - 41; \
+ vadd.u64 RT0, RT0, rw1; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vmov.64 RT7, rd; \
+ veor.64 RT1, RT2, RT3; \
+ vbsl.64 RT7, re, rf; \
+ \
+ vadd.u64 RT1, RT1, rg; \
+ vshr.u64 RT2, rh, #28; \
+ vshl.u64 RT3, rh, #64 - 28; \
+ vadd.u64 RT1, RT1, RT0; \
+ vshr.u64 RT4, rh, #34; \
+ vshl.u64 RT5, rh, #64 - 34; \
+ vadd.u64 RT1, RT1, RT7; \
+ \
+ /* g = Sum0 (h) + Maj (h, a, b); */ \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, rh, #39; \
+ vshl.u64 RT5, rh, #64 - 39; \
+ veor.64 RT0, rh, ra; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vbsl.64 RT0, rb, ra; \
+ vadd.u64 rc, rc, RT1; /* c+=t1; */ \
+ veor.64 rg, RT2, RT3; \
+ \
+ /* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \
+ /* w[1] += S1 (w[15]) + w[10] + S0 (w[2]); */ \
+ \
+ /**** S0(w[1:2]) */ \
+ \
+ /* w[0:1] += w[9:10] */ \
+ /* RT23q = rw1:rw2 */ \
+ vext.u64 RT23q, rw01q, rw23q, #1; \
+ vadd.u64 rw0, rw9; \
+ vadd.u64 rg, rg, RT0; \
+ vadd.u64 rw1, rw10;\
+ vadd.u64 rg, rg, RT1; /* g+=t1; */ \
+ \
+ vshr.u64 RT45q, RT23q, #1; \
+ vshl.u64 RT67q, RT23q, #64 - 1; \
+ vshr.u64 RT01q, RT23q, #8; \
+ veor.u64 RT45q, RT45q, RT67q; \
+ vshl.u64 RT67q, RT23q, #64 - 8; \
+ veor.u64 RT45q, RT45q, RT01q; \
+ vshr.u64 RT01q, RT23q, #7; \
+ veor.u64 RT45q, RT45q, RT67q; \
+ \
+ /**** S1(w[14:15]) */ \
+ vshr.u64 RT23q, rw1415q, #6; \
+ veor.u64 RT01q, RT01q, RT45q; \
+ vshr.u64 RT45q, rw1415q, #19; \
+ vshl.u64 RT67q, rw1415q, #64 - 19; \
+ veor.u64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT45q, rw1415q, #61; \
+ veor.u64 RT23q, RT23q, RT67q; \
+ vshl.u64 RT67q, rw1415q, #64 - 61; \
+ veor.u64 RT23q, RT23q, RT45q; \
+ vadd.u64 rw01q, RT01q; /* w[0:1] += S(w[1:2]) */ \
+ veor.u64 RT01q, RT23q, RT67q;
+#define vadd_RT01q(rw01q) \
+ /* w[0:1] += S(w[14:15]) */ \
+ vadd.u64 rw01q, RT01q;
+
+#define dummy(_) /*_*/
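+
+/* The interleave_op/arg parameters let a caller fold a pending operation into
+ * the start of the next macro expansion, typically the deferred
+ * "w[0:1] += S1(w[14:15])" update from the previous rounds2_0_63 invocation
+ * (see vadd_RT01q above); "dummy" is passed when nothing needs interleaving. */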
+
+#define rounds2_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, interleave_op1, arg1, interleave_op2, arg2) \
+ /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
+ vshr.u64 RT2, re, #14; \
+ vshl.u64 RT3, re, #64 - 14; \
+ interleave_op1(arg1); \
+ vshr.u64 RT4, re, #18; \
+ vshl.u64 RT5, re, #64 - 18; \
+ interleave_op2(arg2); \
+ vld1.64 {RT0}, [RK]!; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, re, #41; \
+ vshl.u64 RT5, re, #64 - 41; \
+ vadd.u64 RT0, RT0, rw0; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vmov.64 RT7, re; \
+ veor.64 RT1, RT2, RT3; \
+ vbsl.64 RT7, rf, rg; \
+ \
+ vadd.u64 RT1, RT1, rh; \
+ vshr.u64 RT2, ra, #28; \
+ vshl.u64 RT3, ra, #64 - 28; \
+ vadd.u64 RT1, RT1, RT0; \
+ vshr.u64 RT4, ra, #34; \
+ vshl.u64 RT5, ra, #64 - 34; \
+ vadd.u64 RT1, RT1, RT7; \
+ \
+ /* h = Sum0 (a) + Maj (a, b, c); */ \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, ra, #39; \
+ vshl.u64 RT5, ra, #64 - 39; \
+ veor.64 RT0, ra, rb; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vbsl.64 RT0, rc, rb; \
+ vadd.u64 rd, rd, RT1; /* d+=t1; */ \
+ veor.64 rh, RT2, RT3; \
+ \
+ /* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
+ vshr.u64 RT2, rd, #14; \
+ vshl.u64 RT3, rd, #64 - 14; \
+ vadd.u64 rh, rh, RT0; \
+ vshr.u64 RT4, rd, #18; \
+ vshl.u64 RT5, rd, #64 - 18; \
+ vadd.u64 rh, rh, RT1; /* h+=t1; */ \
+ vld1.64 {RT0}, [RK]!; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, rd, #41; \
+ vshl.u64 RT5, rd, #64 - 41; \
+ vadd.u64 RT0, RT0, rw1; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vmov.64 RT7, rd; \
+ veor.64 RT1, RT2, RT3; \
+ vbsl.64 RT7, re, rf; \
+ \
+ vadd.u64 RT1, RT1, rg; \
+ vshr.u64 RT2, rh, #28; \
+ vshl.u64 RT3, rh, #64 - 28; \
+ vadd.u64 RT1, RT1, RT0; \
+ vshr.u64 RT4, rh, #34; \
+ vshl.u64 RT5, rh, #64 - 34; \
+ vadd.u64 RT1, RT1, RT7; \
+ \
+ /* g = Sum0 (h) + Maj (h, a, b); */ \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, rh, #39; \
+ vshl.u64 RT5, rh, #64 - 39; \
+ veor.64 RT0, rh, ra; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vbsl.64 RT0, rb, ra; \
+ vadd.u64 rc, rc, RT1; /* c+=t1; */ \
+ veor.64 rg, RT2, RT3;
+#define vadd_rg_RT0(rg) \
+ vadd.u64 rg, rg, RT0;
+#define vadd_rg_RT1(rg) \
+ vadd.u64 rg, rg, RT1; /* g+=t1; */
+
+.align 3
+.globl _gcry_sha512_transform_armv7_neon
+.type _gcry_sha512_transform_armv7_neon,%function;
+
+_gcry_sha512_transform_armv7_neon:
+ /* Input:
+ * %r0: SHA512_CONTEXT
+ * %r1: data
+ * %r2: u64 k[] constants
+ * %r3: nblks
+ */
+ push {%lr};
+
+ mov %lr, #0;
+
+ /* Load context to d0-d7 */
+ vld1.64 {RA-RD}, [%r0]!;
+ vld1.64 {RE-RH}, [%r0];
+ sub %r0, #(4*8);
+
+ /* Load input to w[16], d16-d31 */
+ /* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. */
+ vld1.64 {RW0-RW3}, [%r1]!;
+ vld1.64 {RW4-RW7}, [%r1]!;
+ vld1.64 {RW8-RW11}, [%r1]!;
+ vld1.64 {RW12-RW15}, [%r1]!;
+#ifdef __ARMEL__
+ /* byteswap */
+ vrev64.8 RW01q, RW01q;
+ vrev64.8 RW23q, RW23q;
+ vrev64.8 RW45q, RW45q;
+ vrev64.8 RW67q, RW67q;
+ vrev64.8 RW89q, RW89q;
+ vrev64.8 RW1011q, RW1011q;
+ vrev64.8 RW1213q, RW1213q;
+ vrev64.8 RW1415q, RW1415q;
+#endif
+
+ /* EABI says that d8-d15 must be preserved by callee. */
+ vpush {RT0-RT7};
+
+.Loop:
+ rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2, RW23q, RW1415q, RW9, RW10, dummy, _);
+ b .Lenter_rounds;
+
+.Loop_rounds:
+ rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2, RW23q, RW1415q, RW9, RW10, vadd_RT01q, RW1415q);
+.Lenter_rounds:
+ rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, RW23q, RW4, RW45q, RW01q, RW11, RW12, vadd_RT01q, RW01q);
+ rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, RW45q, RW6, RW67q, RW23q, RW13, RW14, vadd_RT01q, RW23q);
+ rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, RW67q, RW8, RW89q, RW45q, RW15, RW0, vadd_RT01q, RW45q);
+ rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, RW89q, RW10, RW1011q, RW67q, RW1, RW2, vadd_RT01q, RW67q);
+ rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, RW1011q, RW12, RW1213q, RW89q, RW3, RW4, vadd_RT01q, RW89q);
+ add %lr, #16;
+ rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, RW1213q, RW14, RW1415q, RW1011q, RW5, RW6, vadd_RT01q, RW1011q);
+ cmp %lr, #64;
+ rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, RW1415q, RW0, RW01q, RW1213q, RW7, RW8, vadd_RT01q, RW1213q);
+ bne .Loop_rounds;
+
+ subs %r3, #1;
+
+ rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, vadd_RT01q, RW1415q, dummy, _);
+ rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, vadd_rg_RT0, RG, vadd_rg_RT1, RG);
+ beq .Lhandle_tail;
+ vld1.64 {RW0-RW3}, [%r1]!;
+ rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, vadd_rg_RT0, RE, vadd_rg_RT1, RE);
+ rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, vadd_rg_RT0, RC, vadd_rg_RT1, RC);
+#ifdef __ARMEL__
+ vrev64.8 RW01q, RW01q;
+ vrev64.8 RW23q, RW23q;
+#endif
+ vld1.64 {RW4-RW7}, [%r1]!;
+ rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, vadd_rg_RT0, RA, vadd_rg_RT1, RA);
+ rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, vadd_rg_RT0, RG, vadd_rg_RT1, RG);
+#ifdef __ARMEL__
+ vrev64.8 RW45q, RW45q;
+ vrev64.8 RW67q, RW67q;
+#endif
+ vld1.64 {RW8-RW11}, [%r1]!;
+ rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, vadd_rg_RT0, RE, vadd_rg_RT1, RE);
+ rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC);
+#ifdef __ARMEL__
+ vrev64.8 RW89q, RW89q;
+ vrev64.8 RW1011q, RW1011q;
+#endif
+ vld1.64 {RW12-RW15}, [%r1]!;
+ vadd_rg_RT0(RA);
+ vadd_rg_RT1(RA);
+
+ /* Load context */
+ vld1.64 {RT0-RT3}, [%r0]!;
+ vld1.64 {RT4-RT7}, [%r0];
+ sub %r0, #(4*8);
+
+#ifdef __ARMEL__
+ vrev64.8 RW1213q, RW1213q;
+ vrev64.8 RW1415q, RW1415q;
+#endif
+
+ vadd.u64 RA, RT0;
+ vadd.u64 RB, RT1;
+ vadd.u64 RC, RT2;
+ vadd.u64 RD, RT3;
+ vadd.u64 RE, RT4;
+ vadd.u64 RF, RT5;
+ vadd.u64 RG, RT6;
+ vadd.u64 RH, RT7;
+
+ /* Store the first half of context */
+ vst1.64 {RA-RD}, [%r0]!;
+ sub RK, $(8*80);
+ vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
+ mov %lr, #0;
+ sub %r0, #(4*8);
+
+ b .Loop;
+.ltorg
+
+.Lhandle_tail:
+ rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, vadd_rg_RT0, RE, vadd_rg_RT1, RE);
+ rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, vadd_rg_RT0, RC, vadd_rg_RT1, RC);
+ rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, vadd_rg_RT0, RA, vadd_rg_RT1, RA);
+ rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, vadd_rg_RT0, RG, vadd_rg_RT1, RG);
+ rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, vadd_rg_RT0, RE, vadd_rg_RT1, RE);
+ rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC);
+
+ /* Load context to d16-d23 */
+ vld1.64 {RW0-RW3}, [%r0]!;
+ vadd_rg_RT0(RA);
+ vld1.64 {RW4-RW7}, [%r0];
+ vadd_rg_RT1(RA);
+ sub %r0, #(4*8);
+
+ vadd.u64 RA, RW0;
+ vadd.u64 RB, RW1;
+ vadd.u64 RC, RW2;
+ vadd.u64 RD, RW3;
+ vadd.u64 RE, RW4;
+ vadd.u64 RF, RW5;
+ vadd.u64 RG, RW6;
+ vadd.u64 RH, RW7;
+
+ /* Store the first half of context */
+ vst1.64 {RA-RD}, [%r0]!;
+
+ /* Clear used registers */
+ /* d16-d31 */
+ veor.u64 RW01q, RW01q;
+ veor.u64 RW23q, RW23q;
+ veor.u64 RW45q, RW45q;
+ veor.u64 RW67q, RW67q;
+ vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
+ veor.u64 RW89q, RW89q;
+ veor.u64 RW1011q, RW1011q;
+ veor.u64 RW1213q, RW1213q;
+ veor.u64 RW1415q, RW1415q;
+ /* d8-d15 */
+ vpop {RT0-RT7};
+ /* d0-d7 (q0-q3) */
+ veor.u64 %q0, %q0;
+ veor.u64 %q1, %q1;
+ veor.u64 %q2, %q2;
+ veor.u64 %q3, %q3;
+
+ eor %r0, %r0;
+ pop {%pc};
+.size _gcry_sha512_transform_armv7_neon,.-_gcry_sha512_transform_armv7_neon;
+
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/sha512-avx-amd64.S b/comm/third_party/libgcrypt/cipher/sha512-avx-amd64.S
new file mode 100644
index 0000000000..75f7b07059
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha512-avx-amd64.S
@@ -0,0 +1,461 @@
+/*
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright (c) 2012, Intel Corporation
+;
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are
+; met:
+;
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the
+; distribution.
+;
+; * Neither the name of the Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived from
+; this software without specific prior written permission.
+;
+;
+; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
+; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+*/
+/*
+ * Conversion to GAS assembly and integration to libgcrypt
+ * by Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA512)
+
+#include "asm-common-amd64.h"
+
+.intel_syntax noprefix
+
+.text
+
+/* Virtual Registers */
+#define msg rdi /* ARG1 */
+#define digest rsi /* ARG2 */
+#define msglen rdx /* ARG3 */
+#define T1 rcx
+#define T2 r8
+#define a_64 r9
+#define b_64 r10
+#define c_64 r11
+#define d_64 r12
+#define e_64 r13
+#define f_64 r14
+#define g_64 r15
+#define h_64 rbx
+#define tmp0 rax
+
+/*
+; Local variables (stack frame)
+; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
+*/
+#define frame_W 0 /* Message Schedule */
+#define frame_W_size (80 * 8)
+#define frame_WK ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */
+#define frame_WK_size (2 * 8)
+#define frame_GPRSAVE ((frame_WK) + (frame_WK_size))
+#define frame_GPRSAVE_size (5 * 8)
+#define frame_size ((frame_GPRSAVE) + (frame_GPRSAVE_size))
+
+
+/* Useful QWORD "arrays" for simpler memory references */
+#define MSG(i) msg + 8*(i) /* Input message (arg1) */
+#define DIGEST(i) digest + 8*(i) /* Output Digest (arg2) */
+#define K_t(i) .LK512 + 8*(i) ADD_RIP /* SHA Constants (static mem) */
+#define W_t(i) rsp + frame_W + 8*(i) /* Message Schedule (stack frame) */
+#define WK_2(i) rsp + frame_WK + 8*((i) % 2) /* W[t]+K[t] (stack frame) */
+/* MSG, DIGEST, K_t, W_t are arrays */
+/* WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even */
+
+#define RORQ(p1, p2) \
+ /* shld is faster than ror on Intel Sandybridge */ \
+ shld p1, p1, (64 - p2)
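+
+/* With both operands the same register, "shld x, x, (64 - n)" is a left
+ * rotate by (64 - n) bits and thus gives the same result as "ror x, n". */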
+
+#define SHA512_Round(t, a, b, c, d, e, f, g, h) \
+ /* Compute Round %%t */; \
+ mov T1, f /* T1 = f */; \
+ mov tmp0, e /* tmp = e */; \
+ xor T1, g /* T1 = f ^ g */; \
+ RORQ( tmp0, 23) /* 41 ; tmp = e ror 23 */; \
+ and T1, e /* T1 = (f ^ g) & e */; \
+ xor tmp0, e /* tmp = (e ror 23) ^ e */; \
+ xor T1, g /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */; \
+ add T1, [WK_2(t)] /* W[t] + K[t] from message scheduler */; \
+ RORQ( tmp0, 4) /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */; \
+ xor tmp0, e /* tmp = (((e ror 23) ^ e) ror 4) ^ e */; \
+ mov T2, a /* T2 = a */; \
+ add T1, h /* T1 = CH(e,f,g) + W[t] + K[t] + h */; \
+ RORQ( tmp0, 14) /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */; \
+ add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */; \
+ mov tmp0, a /* tmp = a */; \
+ xor T2, c /* T2 = a ^ c */; \
+ and tmp0, c /* tmp = a & c */; \
+ and T2, b /* T2 = (a ^ c) & b */; \
+ xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */; \
+ mov tmp0, a /* tmp = a */; \
+ RORQ( tmp0, 5) /* 39 ; tmp = a ror 5 */; \
+ xor tmp0, a /* tmp = (a ror 5) ^ a */; \
+ add d, T1 /* e(next_state) = d + T1 */; \
+ RORQ( tmp0, 6) /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */; \
+ xor tmp0, a /* tmp = (((a ror 5) ^ a) ror 6) ^ a */; \
+ lea h, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */; \
+ RORQ( tmp0, 28) /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */; \
+	add	h, tmp0    /* a(next_state) = T1 + Maj(a,b,c) + S0(a) */
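+
+/* The bracketed rotate counts in the round macros (41, 18, 14 for S1(e) and
+ * 39, 34, 28 for S0(a)) record the total rotation each intermediate value
+ * receives once the remaining RORQ steps are applied, i.e. the rotation
+ * constants of the SHA-512 Sigma functions. */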
+
+#define SHA512_2Sched_2Round_avx_PART1(t, a, b, c, d, e, f, g, h) \
+ /* \
+ ; Compute rounds %%t-2 and %%t-1 \
+ ; Compute message schedule QWORDS %%t and %%t+1 \
+ ; \
+ ; Two rounds are computed based on the values for K[t-2]+W[t-2] and \
+ ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message \
+ ; scheduler. \
+ ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. \
+ ; They are then added to their respective SHA512 constants at \
+ ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] \
+; For brevity, the comments following vectored instructions only refer to \
+ ; the first of a pair of QWORDS. \
+; E.g. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} \
+ ; The computation of the message schedule and the rounds are tightly \
+ ; stitched to take advantage of instruction-level parallelism. \
+ ; For clarity, integer instructions (for the rounds calculation) are indented \
+ ; by one tab. Vectored instructions (for the message scheduler) are indented \
+ ; by two tabs. \
+ */ \
+ \
+ vmovdqa xmm4, [W_t(t-2)] /* XMM4 = W[t-2] */; \
+ vmovdqu xmm5, [W_t(t-15)] /* XMM5 = W[t-15] */; \
+ mov T1, f; \
+ vpsrlq xmm0, xmm4, 61 /* XMM0 = W[t-2]>>61 */; \
+ mov tmp0, e; \
+ vpsrlq xmm6, xmm5, 1 /* XMM6 = W[t-15]>>1 */; \
+ xor T1, g; \
+ RORQ( tmp0, 23) /* 41 */; \
+ vpsrlq xmm1, xmm4, 19 /* XMM1 = W[t-2]>>19 */; \
+ and T1, e; \
+ xor tmp0, e; \
+ vpxor xmm0, xmm0, xmm1 /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 */; \
+ xor T1, g; \
+ add T1, [WK_2(t)]; \
+ vpsrlq xmm7, xmm5, 8 /* XMM7 = W[t-15]>>8 */; \
+ RORQ( tmp0, 4) /* 18 */; \
+ vpsrlq xmm2, xmm4, 6 /* XMM2 = W[t-2]>>6 */; \
+ xor tmp0, e; \
+ mov T2, a; \
+ add T1, h; \
+ vpxor xmm6, xmm6, xmm7 /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 */; \
+ RORQ( tmp0, 14) /* 14 */; \
+ add T1, tmp0; \
+ vpsrlq xmm8, xmm5, 7 /* XMM8 = W[t-15]>>7 */; \
+ mov tmp0, a; \
+ xor T2, c; \
+ vpsllq xmm3, xmm4, (64-61) /* XMM3 = W[t-2]<<3 */; \
+ and tmp0, c; \
+ and T2, b; \
+ vpxor xmm2, xmm2, xmm3 /* XMM2 = W[t-2]>>6 ^ W[t-2]<<3 */; \
+ xor T2, tmp0; \
+ mov tmp0, a; \
+ vpsllq xmm9, xmm5, (64-1) /* XMM9 = W[t-15]<<63 */; \
+ RORQ( tmp0, 5) /* 39 */; \
+ vpxor xmm8, xmm8, xmm9 /* XMM8 = W[t-15]>>7 ^ W[t-15]<<63 */; \
+ xor tmp0, a; \
+ add d, T1; \
+ RORQ( tmp0, 6) /* 34 */; \
+ xor tmp0, a; \
+ vpxor xmm6, xmm6, xmm8 /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63 */; \
+ lea h, [T1 + T2]; \
+ RORQ( tmp0, 28) /* 28 */; \
+ vpsllq xmm4, xmm4, (64-19) /* XMM4 = W[t-2]<<25 */; \
+ add h, tmp0
+
+#define SHA512_2Sched_2Round_avx_PART2(t, a, b, c, d, e, f, g, h) \
+ vpxor xmm0, xmm0, xmm4 /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<25 */; \
+ mov T1, f; \
+ vpxor xmm0, xmm0, xmm2 /* XMM0 = s1(W[t-2]) */; \
+ mov tmp0, e; \
+ xor T1, g; \
+ vpaddq xmm0, xmm0, [W_t(t-16)] /* XMM0 = s1(W[t-2]) + W[t-16] */; \
+ vmovdqu xmm1, [W_t(t- 7)] /* XMM1 = W[t-7] */; \
+ RORQ( tmp0, 23) /* 41 */; \
+ and T1, e; \
+ xor tmp0, e; \
+ xor T1, g; \
+ vpsllq xmm5, xmm5, (64-8) /* XMM5 = W[t-15]<<56 */; \
+ add T1, [WK_2(t+1)]; \
+ vpxor xmm6, xmm6, xmm5 /* XMM6 = s0(W[t-15]) */; \
+ RORQ( tmp0, 4) /* 18 */; \
+ vpaddq xmm0, xmm0, xmm6 /* XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) */; \
+ xor tmp0, e; \
+ vpaddq xmm0, xmm0, xmm1 /* XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */; \
+ mov T2, a; \
+ add T1, h; \
+ RORQ( tmp0, 14) /* 14 */; \
+ add T1, tmp0; \
+ vmovdqa [W_t(t)], xmm0 /* Store W[t] */; \
+ vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \
+ vmovdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */; \
+ mov tmp0, a; \
+ xor T2, c; \
+ and tmp0, c; \
+ and T2, b; \
+ xor T2, tmp0; \
+ mov tmp0, a; \
+ RORQ( tmp0, 5) /* 39 */; \
+ xor tmp0, a; \
+ add d, T1; \
+ RORQ( tmp0, 6) /* 34 */; \
+ xor tmp0, a; \
+ lea h, [T1 + T2]; \
+ RORQ( tmp0, 28) /* 28 */; \
+ add h, tmp0
+
+#define SHA512_2Sched_2Round_avx(t, a, b, c, d, e, f, g, h) \
+ SHA512_2Sched_2Round_avx_PART1(t, a, b, c, d, e, f, g, h); \
+ SHA512_2Sched_2Round_avx_PART2(t, h, a, b, c, d, e, f, g)
+
+/*
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; void sha512_avx(const void* M, void* D, uint64_t L);
+; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+; The size of the message pointed to by M must be an integer multiple of SHA512
+; message blocks.
+; L is the message length in SHA512 blocks
+*/
+.globl _gcry_sha512_transform_amd64_avx
+ELF(.type _gcry_sha512_transform_amd64_avx,@function;)
+.align 16
+_gcry_sha512_transform_amd64_avx:
+ CFI_STARTPROC()
+ xor eax, eax
+
+ cmp msglen, 0
+ je .Lnowork
+
+ vzeroupper
+
+ /* Allocate Stack Space */
+ sub rsp, frame_size
+ CFI_ADJUST_CFA_OFFSET(frame_size);
+
+ /* Save GPRs */
+ mov [rsp + frame_GPRSAVE + 8 * 0], rbx
+ mov [rsp + frame_GPRSAVE + 8 * 1], r12
+ mov [rsp + frame_GPRSAVE + 8 * 2], r13
+ mov [rsp + frame_GPRSAVE + 8 * 3], r14
+ mov [rsp + frame_GPRSAVE + 8 * 4], r15
+ CFI_REL_OFFSET(rbx, frame_GPRSAVE + 8 * 0);
+ CFI_REL_OFFSET(r12, frame_GPRSAVE + 8 * 1);
+ CFI_REL_OFFSET(r13, frame_GPRSAVE + 8 * 2);
+ CFI_REL_OFFSET(r14, frame_GPRSAVE + 8 * 3);
+ CFI_REL_OFFSET(r15, frame_GPRSAVE + 8 * 4);
+
+.Lupdateblock:
+
+ /* Load state variables */
+ mov a_64, [DIGEST(0)]
+ mov b_64, [DIGEST(1)]
+ mov c_64, [DIGEST(2)]
+ mov d_64, [DIGEST(3)]
+ mov e_64, [DIGEST(4)]
+ mov f_64, [DIGEST(5)]
+ mov g_64, [DIGEST(6)]
+ mov h_64, [DIGEST(7)]
+
+ /* BSWAP 2 QWORDS */
+ vmovdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP]
+ vmovdqu xmm0, [MSG(0)]
+ vpshufb xmm0, xmm0, xmm1 /* BSWAP */
+ vmovdqa [W_t(0)], xmm0 /* Store Scheduled Pair */
+ vpaddq xmm0, xmm0, [K_t(0)] /* Compute W[t]+K[t] */
+ vmovdqa [WK_2(0)], xmm0 /* Store into WK for rounds */
+
+ #define T_2_14(t, a, b, c, d, e, f, g, h) \
+ /* BSWAP 2 QWORDS, Compute 2 Rounds */; \
+ vmovdqu xmm0, [MSG(t)]; \
+ vpshufb xmm0, xmm0, xmm1 /* BSWAP */; \
+ SHA512_Round(((t) - 2), a##_64, b##_64, c##_64, d##_64, \
+ e##_64, f##_64, g##_64, h##_64); \
+ vmovdqa [W_t(t)], xmm0 /* Store Scheduled Pair */; \
+ vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \
+ SHA512_Round(((t) - 1), h##_64, a##_64, b##_64, c##_64, \
+ d##_64, e##_64, f##_64, g##_64); \
+ vmovdqa [WK_2(t)], xmm0 /* W[t]+K[t] into WK */
+
+ #define T_16_78(t, a, b, c, d, e, f, g, h) \
+ SHA512_2Sched_2Round_avx((t), a##_64, b##_64, c##_64, d##_64, \
+ e##_64, f##_64, g##_64, h##_64)
+
+ #define T_80(t, a, b, c, d, e, f, g, h) \
+ /* Compute 2 Rounds */; \
+ SHA512_Round((t - 2), a##_64, b##_64, c##_64, d##_64, \
+ e##_64, f##_64, g##_64, h##_64); \
+ SHA512_Round((t - 1), h##_64, a##_64, b##_64, c##_64, \
+ d##_64, e##_64, f##_64, g##_64)
+
+ T_2_14(2, a, b, c, d, e, f, g, h)
+ T_2_14(4, g, h, a, b, c, d, e, f)
+ T_2_14(6, e, f, g, h, a, b, c, d)
+ T_2_14(8, c, d, e, f, g, h, a, b)
+ T_2_14(10, a, b, c, d, e, f, g, h)
+ T_2_14(12, g, h, a, b, c, d, e, f)
+ T_2_14(14, e, f, g, h, a, b, c, d)
+ T_16_78(16, c, d, e, f, g, h, a, b)
+ T_16_78(18, a, b, c, d, e, f, g, h)
+ T_16_78(20, g, h, a, b, c, d, e, f)
+ T_16_78(22, e, f, g, h, a, b, c, d)
+ T_16_78(24, c, d, e, f, g, h, a, b)
+ T_16_78(26, a, b, c, d, e, f, g, h)
+ T_16_78(28, g, h, a, b, c, d, e, f)
+ T_16_78(30, e, f, g, h, a, b, c, d)
+ T_16_78(32, c, d, e, f, g, h, a, b)
+ T_16_78(34, a, b, c, d, e, f, g, h)
+ T_16_78(36, g, h, a, b, c, d, e, f)
+ T_16_78(38, e, f, g, h, a, b, c, d)
+ T_16_78(40, c, d, e, f, g, h, a, b)
+ T_16_78(42, a, b, c, d, e, f, g, h)
+ T_16_78(44, g, h, a, b, c, d, e, f)
+ T_16_78(46, e, f, g, h, a, b, c, d)
+ T_16_78(48, c, d, e, f, g, h, a, b)
+ T_16_78(50, a, b, c, d, e, f, g, h)
+ T_16_78(52, g, h, a, b, c, d, e, f)
+ T_16_78(54, e, f, g, h, a, b, c, d)
+ T_16_78(56, c, d, e, f, g, h, a, b)
+ T_16_78(58, a, b, c, d, e, f, g, h)
+ T_16_78(60, g, h, a, b, c, d, e, f)
+ T_16_78(62, e, f, g, h, a, b, c, d)
+ T_16_78(64, c, d, e, f, g, h, a, b)
+ T_16_78(66, a, b, c, d, e, f, g, h)
+ T_16_78(68, g, h, a, b, c, d, e, f)
+ T_16_78(70, e, f, g, h, a, b, c, d)
+ T_16_78(72, c, d, e, f, g, h, a, b)
+ T_16_78(74, a, b, c, d, e, f, g, h)
+ T_16_78(76, g, h, a, b, c, d, e, f)
+ T_16_78(78, e, f, g, h, a, b, c, d)
+ T_80(80, c, d, e, f, g, h, a, b)
+
+ /* Update digest */
+ add [DIGEST(0)], a_64
+ add [DIGEST(1)], b_64
+ add [DIGEST(2)], c_64
+ add [DIGEST(3)], d_64
+ add [DIGEST(4)], e_64
+ add [DIGEST(5)], f_64
+ add [DIGEST(6)], g_64
+ add [DIGEST(7)], h_64
+
+ /* Advance to next message block */
+ add msg, 16*8
+ dec msglen
+ jnz .Lupdateblock
+
+ /* Restore GPRs */
+ mov rbx, [rsp + frame_GPRSAVE + 8 * 0]
+ mov r12, [rsp + frame_GPRSAVE + 8 * 1]
+ mov r13, [rsp + frame_GPRSAVE + 8 * 2]
+ mov r14, [rsp + frame_GPRSAVE + 8 * 3]
+ mov r15, [rsp + frame_GPRSAVE + 8 * 4]
+ CFI_RESTORE(rbx)
+ CFI_RESTORE(r12)
+ CFI_RESTORE(r13)
+ CFI_RESTORE(r14)
+ CFI_RESTORE(r15)
+
+ vzeroall
+
+ /* Burn stack */
+ mov eax, 0
+.Lerase_stack:
+ vmovdqu [rsp + rax], ymm0
+ add eax, 32
+ cmp eax, frame_W_size
+ jne .Lerase_stack
+ vmovdqu [rsp + frame_WK], xmm0
+ xor eax, eax
+
+ /* Restore Stack Pointer */
+ add rsp, frame_size
+ CFI_ADJUST_CFA_OFFSET(-frame_size);
+
+.Lnowork:
+ ret
+ CFI_ENDPROC()
+
+/*
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Binary Data
+*/
+
+.align 16
+
+/* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */
+.LXMM_QWORD_BSWAP:
+ .octa 0x08090a0b0c0d0e0f0001020304050607
+
+/* K[t] used in SHA512 hashing */
+.LK512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+#endif
+#endif
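
For reference, the two-rounds-per-iteration structure driven by the T_2_14/T_16_78/T_80 macros above computes the standard FIPS 180-4 SHA-512 round; instead of rotating the working state, the assembly renames the a..h registers, which is why successive macro invocations pass them in shifted order. A minimal scalar C sketch of a single round (illustrative only; the function and helper names below are not part of the patch):

#include <stdint.h>

/* Rotate right by n bits, 0 < n < 64 (illustrative helper). */
static inline uint64_t ror64_sketch(uint64_t x, unsigned n)
{
  return (x >> n) | (x << (64 - n));
}

/* One SHA-512 round: s[0..7] = {a,b,c,d,e,f,g,h}, kt = K[t], wt = W[t]. */
static void sha512_round_sketch(uint64_t s[8], uint64_t kt, uint64_t wt)
{
  uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
  uint64_t e = s[4], f = s[5], g = s[6], h = s[7];
  uint64_t sum1 = ror64_sketch(e, 14) ^ ror64_sketch(e, 18) ^ ror64_sketch(e, 41);
  uint64_t sum0 = ror64_sketch(a, 28) ^ ror64_sketch(a, 34) ^ ror64_sketch(a, 39);
  uint64_t ch   = (e & f) ^ (~e & g);
  uint64_t maj  = (a & b) ^ (a & c) ^ (b & c);
  uint64_t t1   = h + sum1 + ch + kt + wt;
  uint64_t t2   = sum0 + maj;
  s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
  s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}

The rotate amounts (14, 18, 41 and 28, 34, 39) match the composed ror sequences in SHA512_Round, and the precomputed W[t]+K[t] values in the WK_2 slots correspond to the kt + wt term here.
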
diff --git a/comm/third_party/libgcrypt/cipher/sha512-avx2-bmi2-amd64.S b/comm/third_party/libgcrypt/cipher/sha512-avx2-bmi2-amd64.S
new file mode 100644
index 0000000000..7f119e6c10
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha512-avx2-bmi2-amd64.S
@@ -0,0 +1,502 @@
+/*
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright (c) 2012, Intel Corporation
+;
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are
+; met:
+;
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the
+; distribution.
+;
+; * Neither the name of the Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived from
+; this software without specific prior written permission.
+;
+;
+; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
+; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; This code schedules 1 block at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+*/
+/*
+ * Conversion to GAS assembly and integration to libgcrypt
+ * by Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \
+ defined(USE_SHA512)
+
+#include "asm-common-amd64.h"
+
+.intel_syntax noprefix
+
+.text
+
+/* Virtual Registers */
+#define Y_0 ymm4
+#define Y_1 ymm5
+#define Y_2 ymm6
+#define Y_3 ymm7
+
+#define YTMP0 ymm0
+#define YTMP1 ymm1
+#define YTMP2 ymm2
+#define YTMP3 ymm3
+#define YTMP4 ymm8
+#define XFER YTMP0
+
+#define BYTE_FLIP_MASK ymm9
+#define MASK_YMM_LO ymm10
+#define MASK_YMM_LOx xmm10
+
+#define INP rdi /* 1st arg */
+#define CTX rsi /* 2nd arg */
+#define NUM_BLKS rdx /* 3rd arg */
+#define c rcx
+#define d r8
+#define e rdx
+#define y3 rdi
+
+#define TBL rbp
+
+#define a rax
+#define b rbx
+
+#define f r9
+#define g r10
+#define h r11
+
+#define T1 r12
+#define y0 r13
+#define y1 r14
+#define y2 r15
+
+#define y4 r12
+
+/* Local variables (stack frame) */
+#define frame_XFER 0
+#define frame_XFER_size (4*4*8)
+#define frame_SRND (frame_XFER + frame_XFER_size)
+#define frame_SRND_size (1*8)
+#define frame_INP (frame_SRND + frame_SRND_size)
+#define frame_INP_size (1*8)
+#define frame_NBLKS (frame_INP + frame_INP_size)
+#define frame_NBLKS_size (1*8)
+#define frame_RSPSAVE (frame_NBLKS + frame_NBLKS_size)
+#define frame_RSPSAVE_size (1*8)
+#define frame_GPRSAVE (frame_RSPSAVE + frame_RSPSAVE_size)
+#define frame_GPRSAVE_size (6*8)
+#define frame_size (frame_GPRSAVE + frame_GPRSAVE_size)
+
+#define VMOVDQ vmovdqu /*; assume buffers not aligned */
+
+/* addm [mem], reg */
+/* Add reg to mem using reg-mem add and store */
+#define addm(p1, p2) \
+ add p2, p1; \
+ mov p1, p2;
+
+
+/* COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask */
+/* Load ymm with mem and byte swap each qword */
+#define COPY_YMM_AND_BSWAP(p1, p2, p3) \
+ VMOVDQ p1, p2; \
+ vpshufb p1, p1, p3
+
+/* %macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL */
+/* YDST = {YSRC1, YSRC2} >> RVAL*8 */
+#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
+ vperm2i128 YDST, YSRC1, YSRC2, 0x3 /* YDST = {YS1_LO, YS2_HI} */; \
+ vpalignr YDST, YDST, YSRC2, RVAL /* YDST = {YDS1, YS2} >> RVAL*8 */
+
+#define ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h) \
+ /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); \
+ * d += h; \
+ * h += Sum0 (a) + Maj (a, b, c); \
+ * \
+ * Ch(x, y, z) => ((x & y) + (~x & z)) \
+ * Maj(x, y, z) => ((x & y) + (z & (x ^ y))) \
+ */ \
+ \
+ mov y3, e; \
+ add h, [XFERIN]; \
+ and y3, f; \
+ rorx y0, e, 41; \
+ rorx y1, e, 18; \
+ lea h, [h + y3]; \
+ andn y3, e, g; \
+ rorx T1, a, 34; \
+ xor y0, y1; \
+ lea h, [h + y3]
+
+#define ONE_ROUND_PART2(a, b, c, d, e, f, g, h) \
+ rorx y2, a, 39; \
+ rorx y1, e, 14; \
+ mov y3, a; \
+ xor T1, y2; \
+ xor y0, y1; \
+ xor y3, b; \
+ lea h, [h + y0]; \
+ mov y0, a; \
+ rorx y2, a, 28; \
+ add d, h; \
+ and y3, c; \
+ xor T1, y2; \
+ lea h, [h + y3]; \
+ lea h, [h + T1]; \
+ and y0, b; \
+ lea h, [h + y0]
+
+#define ONE_ROUND(XFERIN, a, b, c, d, e, f, g, h) \
+ ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h); \
+ ONE_ROUND_PART2(a, b, c, d, e, f, g, h)
+
+#define FOUR_ROUNDS_AND_SCHED(X, Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h) \
+ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ /* Extract w[t-7] */; \
+ MY_VPALIGNR( YTMP0, Y_3, Y_2, 8) /* YTMP0 = W[-7] */; \
+ /* Calculate w[t-16] + w[t-7] */; \
+ vpaddq YTMP0, YTMP0, Y_0 /* YTMP0 = W[-7] + W[-16] */; \
+ /* Extract w[t-15] */; \
+ MY_VPALIGNR( YTMP1, Y_1, Y_0, 8) /* YTMP1 = W[-15] */; \
+ \
+ /* Calculate sigma0 */; \
+ \
+ /* Calculate w[t-15] ror 1 */; \
+ vpsrlq YTMP2, YTMP1, 1; \
+ vpsllq YTMP3, YTMP1, (64-1); \
+ vpor YTMP3, YTMP3, YTMP2 /* YTMP3 = W[-15] ror 1 */; \
+ /* Calculate w[t-15] shr 7 */; \
+ vpsrlq YTMP4, YTMP1, 7 /* YTMP4 = W[-15] >> 7 */; \
+ \
+ ONE_ROUND(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \
+ \
+ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ /* Calculate w[t-15] ror 8 */; \
+ vpsrlq YTMP2, YTMP1, 8; \
+ vpsllq YTMP1, YTMP1, (64-8); \
+ vpor YTMP1, YTMP1, YTMP2 /* YTMP1 = W[-15] ror 8 */; \
+ /* XOR the three components */; \
+ vpxor YTMP3, YTMP3, YTMP4 /* YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 */; \
+ vpxor YTMP1, YTMP3, YTMP1 /* YTMP1 = s0 */; \
+ \
+ /* Add three components, w[t-16], w[t-7] and sigma0 */; \
+ vpaddq YTMP0, YTMP0, YTMP1 /* YTMP0 = W[-16] + W[-7] + s0 */; \
+ /* Move to appropriate lanes for calculating w[16] and w[17] */; \
+ vperm2i128 Y_0, YTMP0, YTMP0, 0x0 /* Y_0 = W[-16] + W[-7] + s0 {BABA} */; \
+ /* Move to appropriate lanes for calculating w[18] and w[19] */; \
+ vpand YTMP0, YTMP0, MASK_YMM_LO /* YTMP0 = W[-16] + W[-7] + s0 {DC00} */; \
+ \
+ /* Calculate w[16] and w[17] in both 128 bit lanes */; \
+ \
+ /* Calculate sigma1 for w[16] and w[17] on both 128 bit lanes */; \
+ vperm2i128 YTMP2, Y_3, Y_3, 0x11 /* YTMP2 = W[-2] {BABA} */; \
+ vpsrlq YTMP4, YTMP2, 6 /* YTMP4 = W[-2] >> 6 {BABA} */; \
+ \
+ ONE_ROUND(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \
+ \
+ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ vpsrlq YTMP3, YTMP2, 19 /* YTMP3 = W[-2] >> 19 {BABA} */; \
+ vpsllq YTMP1, YTMP2, (64-19) /* YTMP1 = W[-2] << 19 {BABA} */; \
+ vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {BABA} */; \
+ vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} */; \
+ vpsrlq YTMP3, YTMP2, 61 /* YTMP3 = W[-2] >> 61 {BABA} */; \
+ vpsllq YTMP1, YTMP2, (64-61) /* YTMP1 = W[-2] << 61 {BABA} */; \
+ vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {BABA} */; \
+ vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} */; \
+ \
+	/* Add sigma1 to the other components to get w[16] and w[17] */; \
+ vpaddq Y_0, Y_0, YTMP4 /* Y_0 = {W[1], W[0], W[1], W[0]} */; \
+ \
+ /* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */; \
+ vpsrlq YTMP4, Y_0, 6 /* YTMP4 = W[-2] >> 6 {DC--} */; \
+ \
+ ONE_ROUND(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \
+ \
+ /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+ vpsrlq YTMP3, Y_0, 19 /* YTMP3 = W[-2] >> 19 {DC--} */; \
+ vpsllq YTMP1, Y_0, (64-19) /* YTMP1 = W[-2] << 19 {DC--} */; \
+ vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {DC--} */; \
+ vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} */; \
+ vpsrlq YTMP3, Y_0, 61 /* YTMP3 = W[-2] >> 61 {DC--} */; \
+ vpsllq YTMP1, Y_0, (64-61) /* YTMP1 = W[-2] << 61 {DC--} */; \
+ vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {DC--} */; \
+ vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} */; \
+ \
+ /* Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] */; \
+ vpaddq YTMP2, YTMP0, YTMP4 /* YTMP2 = {W[3], W[2], --, --} */; \
+ \
+	/* Form w[19], w[18], w[17], w[16] */; \
+ vpblendd Y_0, Y_0, YTMP2, 0xF0 /* Y_0 = {W[3], W[2], W[1], W[0]} */; \
+ \
+ ONE_ROUND_PART1(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e); \
+ vpaddq XFER, Y_0, [TBL + (4+X)*32]; \
+ vmovdqa [rsp + frame_XFER + X*32], XFER; \
+ ONE_ROUND_PART2(f, g, h, a, b, c, d, e)
+
+#define DO_4ROUNDS(X, a, b, c, d, e, f, g, h) \
+ ONE_ROUND(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \
+ ONE_ROUND(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \
+ ONE_ROUND(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \
+ ONE_ROUND(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e)
+
+/*
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; void sha512_rorx(const void* M, void* D, uint64_t L);
+; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+; The size of the message pointed to by M must be an integer multiple of the
+; SHA512 message block size (128 bytes).
+; L is the message length in SHA512 blocks.
+*/
+.globl _gcry_sha512_transform_amd64_avx2
+ELF(.type _gcry_sha512_transform_amd64_avx2,@function;)
+.align 16
+_gcry_sha512_transform_amd64_avx2:
+ CFI_STARTPROC()
+ xor eax, eax
+
+ cmp rdx, 0
+ je .Lnowork
+
+ vzeroupper
+
+ /* Allocate Stack Space */
+ mov rax, rsp
+ CFI_DEF_CFA_REGISTER(rax);
+ sub rsp, frame_size
+ and rsp, ~(0x40 - 1)
+ mov [rsp + frame_RSPSAVE], rax
+ CFI_CFA_ON_STACK(frame_RSPSAVE, 0)
+
+ /* Save GPRs */
+ mov [rsp + frame_GPRSAVE + 8 * 0], rbp
+ mov [rsp + frame_GPRSAVE + 8 * 1], rbx
+ mov [rsp + frame_GPRSAVE + 8 * 2], r12
+ mov [rsp + frame_GPRSAVE + 8 * 3], r13
+ mov [rsp + frame_GPRSAVE + 8 * 4], r14
+ mov [rsp + frame_GPRSAVE + 8 * 5], r15
+ CFI_REG_ON_STACK(rbp, frame_GPRSAVE + 8 * 0)
+ CFI_REG_ON_STACK(rbx, frame_GPRSAVE + 8 * 1)
+ CFI_REG_ON_STACK(r12, frame_GPRSAVE + 8 * 2)
+ CFI_REG_ON_STACK(r13, frame_GPRSAVE + 8 * 3)
+ CFI_REG_ON_STACK(r14, frame_GPRSAVE + 8 * 4)
+ CFI_REG_ON_STACK(r15, frame_GPRSAVE + 8 * 5)
+
+ mov [rsp + frame_NBLKS], NUM_BLKS
+
+ /*; load initial digest */
+ mov a,[8*0 + CTX]
+ mov b,[8*1 + CTX]
+ mov c,[8*2 + CTX]
+ mov d,[8*3 + CTX]
+ mov e,[8*4 + CTX]
+ mov f,[8*5 + CTX]
+ mov g,[8*6 + CTX]
+ mov h,[8*7 + CTX]
+
+ vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP]
+ vmovdqa MASK_YMM_LO, [.LMASK_YMM_LO ADD_RIP]
+
+ lea TBL,[.LK512 ADD_RIP]
+
+	/*; byte swap first 16 qwords */
+ COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK)
+ COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK)
+ COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK)
+ COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK)
+
+ add INP, 128
+ mov [rsp + frame_INP], INP
+
+ vpaddq XFER, Y_0, [TBL + 0*32]
+ vmovdqa [rsp + frame_XFER + 0*32], XFER
+ vpaddq XFER, Y_1, [TBL + 1*32]
+ vmovdqa [rsp + frame_XFER + 1*32], XFER
+ vpaddq XFER, Y_2, [TBL + 2*32]
+ vmovdqa [rsp + frame_XFER + 2*32], XFER
+ vpaddq XFER, Y_3, [TBL + 3*32]
+ vmovdqa [rsp + frame_XFER + 3*32], XFER
+
+	/*; schedule 64 input qwords, by doing 4 iterations of 16 rounds each */
+ mov qword ptr [rsp + frame_SRND], 4
+
+.align 16
+.Loop0:
+ FOUR_ROUNDS_AND_SCHED(0, Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h)
+ FOUR_ROUNDS_AND_SCHED(1, Y_1, Y_2, Y_3, Y_0, e, f, g, h, a, b, c, d)
+ FOUR_ROUNDS_AND_SCHED(2, Y_2, Y_3, Y_0, Y_1, a, b, c, d, e, f, g, h)
+ FOUR_ROUNDS_AND_SCHED(3, Y_3, Y_0, Y_1, Y_2, e, f, g, h, a, b, c, d)
+ add TBL, 4*32
+
+ sub qword ptr [rsp + frame_SRND], 1
+ jne .Loop0
+
+ sub qword ptr [rsp + frame_NBLKS], 1
+ je .Ldone_hash
+
+ mov INP, [rsp + frame_INP]
+
+ lea TBL,[.LK512 ADD_RIP]
+
+ /* load next block and byte swap */
+ COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK)
+ COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK)
+ COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK)
+ COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK)
+
+ add INP, 128
+ mov [rsp + frame_INP], INP
+
+ DO_4ROUNDS(0, a, b, c, d, e, f, g, h)
+ vpaddq XFER, Y_0, [TBL + 0*32]
+ vmovdqa [rsp + frame_XFER + 0*32], XFER
+ DO_4ROUNDS(1, e, f, g, h, a, b, c, d)
+ vpaddq XFER, Y_1, [TBL + 1*32]
+ vmovdqa [rsp + frame_XFER + 1*32], XFER
+ DO_4ROUNDS(2, a, b, c, d, e, f, g, h)
+ vpaddq XFER, Y_2, [TBL + 2*32]
+ vmovdqa [rsp + frame_XFER + 2*32], XFER
+ DO_4ROUNDS(3, e, f, g, h, a, b, c, d)
+ vpaddq XFER, Y_3, [TBL + 3*32]
+ vmovdqa [rsp + frame_XFER + 3*32], XFER
+
+ addm([8*0 + CTX],a)
+ addm([8*1 + CTX],b)
+ addm([8*2 + CTX],c)
+ addm([8*3 + CTX],d)
+ addm([8*4 + CTX],e)
+ addm([8*5 + CTX],f)
+ addm([8*6 + CTX],g)
+ addm([8*7 + CTX],h)
+
+	/*; schedule 64 input qwords, by doing 4 iterations of 16 rounds each */
+ mov qword ptr [rsp + frame_SRND],4
+
+ jmp .Loop0
+
+.Ldone_hash:
+ vzeroall
+
+ DO_4ROUNDS(0, a, b, c, d, e, f, g, h)
+ vmovdqa [rsp + frame_XFER + 0*32], ymm0 /* burn stack */
+ DO_4ROUNDS(1, e, f, g, h, a, b, c, d)
+ vmovdqa [rsp + frame_XFER + 1*32], ymm0 /* burn stack */
+ DO_4ROUNDS(2, a, b, c, d, e, f, g, h)
+ vmovdqa [rsp + frame_XFER + 2*32], ymm0 /* burn stack */
+ DO_4ROUNDS(3, e, f, g, h, a, b, c, d)
+ vmovdqa [rsp + frame_XFER + 3*32], ymm0 /* burn stack */
+
+ addm([8*0 + CTX],a)
+ xor eax, eax /* burn stack */
+ addm([8*1 + CTX],b)
+ addm([8*2 + CTX],c)
+ addm([8*3 + CTX],d)
+ addm([8*4 + CTX],e)
+ addm([8*5 + CTX],f)
+ addm([8*6 + CTX],g)
+ addm([8*7 + CTX],h)
+
+ /* Restore GPRs */
+ mov rbp, [rsp + frame_GPRSAVE + 8 * 0]
+ mov rbx, [rsp + frame_GPRSAVE + 8 * 1]
+ mov r12, [rsp + frame_GPRSAVE + 8 * 2]
+ mov r13, [rsp + frame_GPRSAVE + 8 * 3]
+ mov r14, [rsp + frame_GPRSAVE + 8 * 4]
+ mov r15, [rsp + frame_GPRSAVE + 8 * 5]
+ CFI_RESTORE(rbp)
+ CFI_RESTORE(rbx)
+ CFI_RESTORE(r12)
+ CFI_RESTORE(r13)
+ CFI_RESTORE(r14)
+ CFI_RESTORE(r15)
+
+ /* Restore Stack Pointer */
+ mov rsp, [rsp + frame_RSPSAVE]
+ CFI_DEF_CFA_REGISTER(rsp)
+
+.Lnowork:
+ ret
+ CFI_ENDPROC()
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
+/*;; Binary Data */
+
+.align 64
+/* K[t] used in SHA512 hashing */
+.LK512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.align 32
+
+/* Mask for byte-swapping the qwords in a YMM register using vpshufb. */
+.LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607
+ .octa 0x18191a1b1c1d1e1f1011121314151617
+
+.LMASK_YMM_LO: .octa 0x00000000000000000000000000000000
+ .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
+
+#endif
+#endif
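
The vpsrlq/vpsllq/vpor/vpxor sequences inside FOUR_ROUNDS_AND_SCHED above evaluate the SHA-512 message-schedule recurrence W[t] = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2]), with s0(x) = (x ror 1) ^ (x ror 8) ^ (x >> 7) and s1(x) = (x ror 19) ^ (x ror 61) ^ (x >> 6), four schedule words at a time across the YMM lanes. A scalar C sketch of the same expansion (illustrative only; names are not from the patch):

#include <stdint.h>

static inline uint64_t ror64_sketch(uint64_t x, unsigned n)
{
  return (x >> n) | (x << (64 - n));
}

/* Expand 16 message words (already byte-swapped from big-endian) to 80. */
static void sha512_expand_sketch(const uint64_t m[16], uint64_t w[80])
{
  int t;
  for (t = 0; t < 16; t++)
    w[t] = m[t];
  for (t = 16; t < 80; t++)
    {
      uint64_t s0 = ror64_sketch(w[t - 15], 1) ^ ror64_sketch(w[t - 15], 8)
                    ^ (w[t - 15] >> 7);
      uint64_t s1 = ror64_sketch(w[t - 2], 19) ^ ror64_sketch(w[t - 2], 61)
                    ^ (w[t - 2] >> 6);
      w[t] = w[t - 16] + s0 + w[t - 7] + s1;
    }
}

Adding K[t] to the freshly scheduled words and spilling the sums to the frame_XFER slots (the vpaddq/vmovdqa pair at the end of the macro) lets each integer round consume a single memory operand, which is what ONE_ROUND_PART1 reads via [XFERIN].
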
diff --git a/comm/third_party/libgcrypt/cipher/sha512-ppc.c b/comm/third_party/libgcrypt/cipher/sha512-ppc.c
new file mode 100644
index 0000000000..31ea25bf9a
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha512-ppc.c
@@ -0,0 +1,969 @@
+/* sha512-ppc.c - PowerPC vcrypto implementation of SHA-512 transform
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(ENABLE_PPC_CRYPTO_SUPPORT) && \
+ defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+ defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \
+ defined(USE_SHA512) && \
+ __GNUC__ >= 4
+
+#include <altivec.h>
+#include "bufhelp.h"
+
+
+typedef vector unsigned char vector16x_u8;
+typedef vector unsigned long long vector2x_u64;
+
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE
+#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
+
+
+static const u64 K[80] =
+ {
+ U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd),
+ U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc),
+ U64_C(0x3956c25bf348b538), U64_C(0x59f111f1b605d019),
+ U64_C(0x923f82a4af194f9b), U64_C(0xab1c5ed5da6d8118),
+ U64_C(0xd807aa98a3030242), U64_C(0x12835b0145706fbe),
+ U64_C(0x243185be4ee4b28c), U64_C(0x550c7dc3d5ffb4e2),
+ U64_C(0x72be5d74f27b896f), U64_C(0x80deb1fe3b1696b1),
+ U64_C(0x9bdc06a725c71235), U64_C(0xc19bf174cf692694),
+ U64_C(0xe49b69c19ef14ad2), U64_C(0xefbe4786384f25e3),
+ U64_C(0x0fc19dc68b8cd5b5), U64_C(0x240ca1cc77ac9c65),
+ U64_C(0x2de92c6f592b0275), U64_C(0x4a7484aa6ea6e483),
+ U64_C(0x5cb0a9dcbd41fbd4), U64_C(0x76f988da831153b5),
+ U64_C(0x983e5152ee66dfab), U64_C(0xa831c66d2db43210),
+ U64_C(0xb00327c898fb213f), U64_C(0xbf597fc7beef0ee4),
+ U64_C(0xc6e00bf33da88fc2), U64_C(0xd5a79147930aa725),
+ U64_C(0x06ca6351e003826f), U64_C(0x142929670a0e6e70),
+ U64_C(0x27b70a8546d22ffc), U64_C(0x2e1b21385c26c926),
+ U64_C(0x4d2c6dfc5ac42aed), U64_C(0x53380d139d95b3df),
+ U64_C(0x650a73548baf63de), U64_C(0x766a0abb3c77b2a8),
+ U64_C(0x81c2c92e47edaee6), U64_C(0x92722c851482353b),
+ U64_C(0xa2bfe8a14cf10364), U64_C(0xa81a664bbc423001),
+ U64_C(0xc24b8b70d0f89791), U64_C(0xc76c51a30654be30),
+ U64_C(0xd192e819d6ef5218), U64_C(0xd69906245565a910),
+ U64_C(0xf40e35855771202a), U64_C(0x106aa07032bbd1b8),
+ U64_C(0x19a4c116b8d2d0c8), U64_C(0x1e376c085141ab53),
+ U64_C(0x2748774cdf8eeb99), U64_C(0x34b0bcb5e19b48a8),
+ U64_C(0x391c0cb3c5c95a63), U64_C(0x4ed8aa4ae3418acb),
+ U64_C(0x5b9cca4f7763e373), U64_C(0x682e6ff3d6b2b8a3),
+ U64_C(0x748f82ee5defb2fc), U64_C(0x78a5636f43172f60),
+ U64_C(0x84c87814a1f0ab72), U64_C(0x8cc702081a6439ec),
+ U64_C(0x90befffa23631e28), U64_C(0xa4506cebde82bde9),
+ U64_C(0xbef9a3f7b2c67915), U64_C(0xc67178f2e372532b),
+ U64_C(0xca273eceea26619c), U64_C(0xd186b8c721c0c207),
+ U64_C(0xeada7dd6cde0eb1e), U64_C(0xf57d4f7fee6ed178),
+ U64_C(0x06f067aa72176fba), U64_C(0x0a637dc5a2c898a6),
+ U64_C(0x113f9804bef90dae), U64_C(0x1b710b35131c471b),
+ U64_C(0x28db77f523047d84), U64_C(0x32caab7b40c72493),
+ U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c),
+ U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a),
+ U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817)
+ };
+
+
+static ASM_FUNC_ATTR_INLINE u64
+ror64 (u64 v, u64 shift)
+{
+ return (v >> (shift & 63)) ^ (v << ((64 - shift) & 63));
+}
+
+
+static ASM_FUNC_ATTR_INLINE vector2x_u64
+vec_rol_elems(vector2x_u64 v, unsigned int idx)
+{
+#ifndef WORDS_BIGENDIAN
+ return vec_sld (v, v, (16 - (8 * idx)) & 15);
+#else
+ return vec_sld (v, v, (8 * idx) & 15);
+#endif
+}
+
+
+static ASM_FUNC_ATTR_INLINE vector2x_u64
+vec_merge_idx0_elems(vector2x_u64 v0, vector2x_u64 v1)
+{
+ return vec_mergeh (v0, v1);
+}
+
+
+static ASM_FUNC_ATTR_INLINE vector2x_u64
+vec_vshasigma_u64(vector2x_u64 v, unsigned int a, unsigned int b)
+{
+ __asm__ ("vshasigmad %0,%1,%2,%3"
+ : "=v" (v)
+ : "v" (v), "g" (a), "g" (b)
+ : "memory");
+ return v;
+}
+
+
+static ASM_FUNC_ATTR_INLINE vector2x_u64
+vec_u64_load(unsigned long offset, const void *ptr)
+{
+ vector2x_u64 vecu64;
+#if __GNUC__ >= 4
+ if (__builtin_constant_p (offset) && offset == 0)
+ __asm__ ("lxvd2x %x0,0,%1\n\t"
+ : "=wa" (vecu64)
+ : "r" ((uintptr_t)ptr)
+ : "memory");
+ else
+#endif
+ __asm__ ("lxvd2x %x0,%1,%2\n\t"
+ : "=wa" (vecu64)
+ : "r" (offset), "r" ((uintptr_t)ptr)
+ : "memory", "r0");
+#ifndef WORDS_BIGENDIAN
+ __asm__ ("xxswapd %x0, %x1"
+ : "=wa" (vecu64)
+ : "wa" (vecu64));
+#endif
+ return vecu64;
+}
+
+
+static ASM_FUNC_ATTR_INLINE void
+vec_u64_store(vector2x_u64 vecu64, unsigned long offset, void *ptr)
+{
+#ifndef WORDS_BIGENDIAN
+ __asm__ ("xxswapd %x0, %x1"
+ : "=wa" (vecu64)
+ : "wa" (vecu64));
+#endif
+#if __GNUC__ >= 4
+ if (__builtin_constant_p (offset) && offset == 0)
+ __asm__ ("stxvd2x %x0,0,%1\n\t"
+ :
+ : "wa" (vecu64), "r" ((uintptr_t)ptr)
+ : "memory");
+ else
+#endif
+ __asm__ ("stxvd2x %x0,%1,%2\n\t"
+ :
+ : "wa" (vecu64), "r" (offset), "r" ((uintptr_t)ptr)
+ : "memory", "r0");
+}
+
+
+/* SHA2 round in vector registers */
+#define R(a,b,c,d,e,f,g,h,k,w) do \
+ { \
+ t1 = (h); \
+ t1 += ((k) + (w)); \
+ t1 += Cho((e),(f),(g)); \
+ t1 += Sum1((e)); \
+ t2 = Sum0((a)); \
+ t2 += Maj((a),(b),(c)); \
+ d += t1; \
+ h = t1 + t2; \
+ } while (0)
+
+#define Cho(b, c, d) (vec_sel(d, c, b))
+
+#define Maj(c, d, b) (vec_sel(c, b, c ^ d))
+
+#define Sum0(x) (vec_vshasigma_u64(x, 1, 0))
+
+#define Sum1(x) (vec_vshasigma_u64(x, 1, 15))
+
+
+/* Message expansion on general purpose registers */
+#define S0(x) (ror64 ((x), 1) ^ ror64 ((x), 8) ^ ((x) >> 7))
+#define S1(x) (ror64 ((x), 19) ^ ror64 ((x), 61) ^ ((x) >> 6))
+
+#define I(i) ( w[i] = buf_get_be64(data + i * 8) )
+#define WN(i) ({ w[i&0x0f] += w[(i-7) &0x0f]; \
+ w[i&0x0f] += S0(w[(i-15)&0x0f]); \
+ w[i&0x0f] += S1(w[(i-2) &0x0f]); \
+ w[i&0x0f]; })
+#define W(i) ({ u64 r = w[i&0x0f]; WN(i); r; })
+#define L(i) w[i&0x0f]
+
+
+unsigned int ASM_FUNC_ATTR
+_gcry_sha512_transform_ppc8(u64 state[8],
+ const unsigned char *data, size_t nblks)
+{
+  /* GPRs are used for message expansion, as vector-intrinsics-based
+   * expansion generates slower code. */
+ vector2x_u64 h0, h1, h2, h3, h4, h5, h6, h7;
+ vector2x_u64 a, b, c, d, e, f, g, h, t1, t2;
+ u64 w[16];
+
+ h0 = vec_u64_load (8 * 0, (unsigned long long *)state);
+ h1 = vec_rol_elems (h0, 1);
+ h2 = vec_u64_load (8 * 2, (unsigned long long *)state);
+ h3 = vec_rol_elems (h2, 1);
+ h4 = vec_u64_load (8 * 4, (unsigned long long *)state);
+ h5 = vec_rol_elems (h4, 1);
+ h6 = vec_u64_load (8 * 6, (unsigned long long *)state);
+ h7 = vec_rol_elems (h6, 1);
+
+ while (nblks >= 2)
+ {
+ a = h0;
+ b = h1;
+ c = h2;
+ d = h3;
+ e = h4;
+ f = h5;
+ g = h6;
+ h = h7;
+
+ I(0); I(1); I(2); I(3);
+ I(4); I(5); I(6); I(7);
+ I(8); I(9); I(10); I(11);
+ I(12); I(13); I(14); I(15);
+ data += 128;
+ R(a, b, c, d, e, f, g, h, K[0], W(0));
+ R(h, a, b, c, d, e, f, g, K[1], W(1));
+ R(g, h, a, b, c, d, e, f, K[2], W(2));
+ R(f, g, h, a, b, c, d, e, K[3], W(3));
+ R(e, f, g, h, a, b, c, d, K[4], W(4));
+ R(d, e, f, g, h, a, b, c, K[5], W(5));
+ R(c, d, e, f, g, h, a, b, K[6], W(6));
+ R(b, c, d, e, f, g, h, a, K[7], W(7));
+ R(a, b, c, d, e, f, g, h, K[8], W(8));
+ R(h, a, b, c, d, e, f, g, K[9], W(9));
+ R(g, h, a, b, c, d, e, f, K[10], W(10));
+ R(f, g, h, a, b, c, d, e, K[11], W(11));
+ R(e, f, g, h, a, b, c, d, K[12], W(12));
+ R(d, e, f, g, h, a, b, c, K[13], W(13));
+ R(c, d, e, f, g, h, a, b, K[14], W(14));
+ R(b, c, d, e, f, g, h, a, K[15], W(15));
+
+ R(a, b, c, d, e, f, g, h, K[16], W(16));
+ R(h, a, b, c, d, e, f, g, K[17], W(17));
+ R(g, h, a, b, c, d, e, f, K[18], W(18));
+ R(f, g, h, a, b, c, d, e, K[19], W(19));
+ R(e, f, g, h, a, b, c, d, K[20], W(20));
+ R(d, e, f, g, h, a, b, c, K[21], W(21));
+ R(c, d, e, f, g, h, a, b, K[22], W(22));
+ R(b, c, d, e, f, g, h, a, K[23], W(23));
+ R(a, b, c, d, e, f, g, h, K[24], W(24));
+ R(h, a, b, c, d, e, f, g, K[25], W(25));
+ R(g, h, a, b, c, d, e, f, K[26], W(26));
+ R(f, g, h, a, b, c, d, e, K[27], W(27));
+ R(e, f, g, h, a, b, c, d, K[28], W(28));
+ R(d, e, f, g, h, a, b, c, K[29], W(29));
+ R(c, d, e, f, g, h, a, b, K[30], W(30));
+ R(b, c, d, e, f, g, h, a, K[31], W(31));
+
+ R(a, b, c, d, e, f, g, h, K[32], W(32));
+ R(h, a, b, c, d, e, f, g, K[33], W(33));
+ R(g, h, a, b, c, d, e, f, K[34], W(34));
+ R(f, g, h, a, b, c, d, e, K[35], W(35));
+ R(e, f, g, h, a, b, c, d, K[36], W(36));
+ R(d, e, f, g, h, a, b, c, K[37], W(37));
+ R(c, d, e, f, g, h, a, b, K[38], W(38));
+ R(b, c, d, e, f, g, h, a, K[39], W(39));
+ R(a, b, c, d, e, f, g, h, K[40], W(40));
+ R(h, a, b, c, d, e, f, g, K[41], W(41));
+ R(g, h, a, b, c, d, e, f, K[42], W(42));
+ R(f, g, h, a, b, c, d, e, K[43], W(43));
+ R(e, f, g, h, a, b, c, d, K[44], W(44));
+ R(d, e, f, g, h, a, b, c, K[45], W(45));
+ R(c, d, e, f, g, h, a, b, K[46], W(46));
+ R(b, c, d, e, f, g, h, a, K[47], W(47));
+
+ R(a, b, c, d, e, f, g, h, K[48], W(48));
+ R(h, a, b, c, d, e, f, g, K[49], W(49));
+ R(g, h, a, b, c, d, e, f, K[50], W(50));
+ R(f, g, h, a, b, c, d, e, K[51], W(51));
+ R(e, f, g, h, a, b, c, d, K[52], W(52));
+ R(d, e, f, g, h, a, b, c, K[53], W(53));
+ R(c, d, e, f, g, h, a, b, K[54], W(54));
+ R(b, c, d, e, f, g, h, a, K[55], W(55));
+ R(a, b, c, d, e, f, g, h, K[56], W(56));
+ R(h, a, b, c, d, e, f, g, K[57], W(57));
+ R(g, h, a, b, c, d, e, f, K[58], W(58));
+ R(f, g, h, a, b, c, d, e, K[59], W(59));
+ R(e, f, g, h, a, b, c, d, K[60], W(60));
+ R(d, e, f, g, h, a, b, c, K[61], W(61));
+ R(c, d, e, f, g, h, a, b, K[62], W(62));
+ R(b, c, d, e, f, g, h, a, K[63], W(63));
+
+ R(a, b, c, d, e, f, g, h, K[64], L(64));
+ R(h, a, b, c, d, e, f, g, K[65], L(65));
+ R(g, h, a, b, c, d, e, f, K[66], L(66));
+ R(f, g, h, a, b, c, d, e, K[67], L(67));
+ I(0); I(1); I(2); I(3);
+ R(e, f, g, h, a, b, c, d, K[68], L(68));
+ R(d, e, f, g, h, a, b, c, K[69], L(69));
+ R(c, d, e, f, g, h, a, b, K[70], L(70));
+ R(b, c, d, e, f, g, h, a, K[71], L(71));
+ I(4); I(5); I(6); I(7);
+ R(a, b, c, d, e, f, g, h, K[72], L(72));
+ R(h, a, b, c, d, e, f, g, K[73], L(73));
+ R(g, h, a, b, c, d, e, f, K[74], L(74));
+ R(f, g, h, a, b, c, d, e, K[75], L(75));
+ I(8); I(9); I(10); I(11);
+ R(e, f, g, h, a, b, c, d, K[76], L(76));
+ R(d, e, f, g, h, a, b, c, K[77], L(77));
+ R(c, d, e, f, g, h, a, b, K[78], L(78));
+ R(b, c, d, e, f, g, h, a, K[79], L(79));
+ I(12); I(13); I(14); I(15);
+ data += 128;
+
+ h0 += a;
+ h1 += b;
+ h2 += c;
+ h3 += d;
+ h4 += e;
+ h5 += f;
+ h6 += g;
+ h7 += h;
+ a = h0;
+ b = h1;
+ c = h2;
+ d = h3;
+ e = h4;
+ f = h5;
+ g = h6;
+ h = h7;
+
+ R(a, b, c, d, e, f, g, h, K[0], W(0));
+ R(h, a, b, c, d, e, f, g, K[1], W(1));
+ R(g, h, a, b, c, d, e, f, K[2], W(2));
+ R(f, g, h, a, b, c, d, e, K[3], W(3));
+ R(e, f, g, h, a, b, c, d, K[4], W(4));
+ R(d, e, f, g, h, a, b, c, K[5], W(5));
+ R(c, d, e, f, g, h, a, b, K[6], W(6));
+ R(b, c, d, e, f, g, h, a, K[7], W(7));
+ R(a, b, c, d, e, f, g, h, K[8], W(8));
+ R(h, a, b, c, d, e, f, g, K[9], W(9));
+ R(g, h, a, b, c, d, e, f, K[10], W(10));
+ R(f, g, h, a, b, c, d, e, K[11], W(11));
+ R(e, f, g, h, a, b, c, d, K[12], W(12));
+ R(d, e, f, g, h, a, b, c, K[13], W(13));
+ R(c, d, e, f, g, h, a, b, K[14], W(14));
+ R(b, c, d, e, f, g, h, a, K[15], W(15));
+
+ R(a, b, c, d, e, f, g, h, K[16], W(16));
+ R(h, a, b, c, d, e, f, g, K[17], W(17));
+ R(g, h, a, b, c, d, e, f, K[18], W(18));
+ R(f, g, h, a, b, c, d, e, K[19], W(19));
+ R(e, f, g, h, a, b, c, d, K[20], W(20));
+ R(d, e, f, g, h, a, b, c, K[21], W(21));
+ R(c, d, e, f, g, h, a, b, K[22], W(22));
+ R(b, c, d, e, f, g, h, a, K[23], W(23));
+ R(a, b, c, d, e, f, g, h, K[24], W(24));
+ R(h, a, b, c, d, e, f, g, K[25], W(25));
+ R(g, h, a, b, c, d, e, f, K[26], W(26));
+ R(f, g, h, a, b, c, d, e, K[27], W(27));
+ R(e, f, g, h, a, b, c, d, K[28], W(28));
+ R(d, e, f, g, h, a, b, c, K[29], W(29));
+ R(c, d, e, f, g, h, a, b, K[30], W(30));
+ R(b, c, d, e, f, g, h, a, K[31], W(31));
+
+ R(a, b, c, d, e, f, g, h, K[32], W(32));
+ R(h, a, b, c, d, e, f, g, K[33], W(33));
+ R(g, h, a, b, c, d, e, f, K[34], W(34));
+ R(f, g, h, a, b, c, d, e, K[35], W(35));
+ R(e, f, g, h, a, b, c, d, K[36], W(36));
+ R(d, e, f, g, h, a, b, c, K[37], W(37));
+ R(c, d, e, f, g, h, a, b, K[38], W(38));
+ R(b, c, d, e, f, g, h, a, K[39], W(39));
+ R(a, b, c, d, e, f, g, h, K[40], W(40));
+ R(h, a, b, c, d, e, f, g, K[41], W(41));
+ R(g, h, a, b, c, d, e, f, K[42], W(42));
+ R(f, g, h, a, b, c, d, e, K[43], W(43));
+ R(e, f, g, h, a, b, c, d, K[44], W(44));
+ R(d, e, f, g, h, a, b, c, K[45], W(45));
+ R(c, d, e, f, g, h, a, b, K[46], W(46));
+ R(b, c, d, e, f, g, h, a, K[47], W(47));
+
+ R(a, b, c, d, e, f, g, h, K[48], W(48));
+ R(h, a, b, c, d, e, f, g, K[49], W(49));
+ R(g, h, a, b, c, d, e, f, K[50], W(50));
+ R(f, g, h, a, b, c, d, e, K[51], W(51));
+ R(e, f, g, h, a, b, c, d, K[52], W(52));
+ R(d, e, f, g, h, a, b, c, K[53], W(53));
+ R(c, d, e, f, g, h, a, b, K[54], W(54));
+ R(b, c, d, e, f, g, h, a, K[55], W(55));
+ R(a, b, c, d, e, f, g, h, K[56], W(56));
+ R(h, a, b, c, d, e, f, g, K[57], W(57));
+ R(g, h, a, b, c, d, e, f, K[58], W(58));
+ R(f, g, h, a, b, c, d, e, K[59], W(59));
+ R(e, f, g, h, a, b, c, d, K[60], W(60));
+ R(d, e, f, g, h, a, b, c, K[61], W(61));
+ R(c, d, e, f, g, h, a, b, K[62], W(62));
+ R(b, c, d, e, f, g, h, a, K[63], W(63));
+
+ R(a, b, c, d, e, f, g, h, K[64], L(64));
+ R(h, a, b, c, d, e, f, g, K[65], L(65));
+ R(g, h, a, b, c, d, e, f, K[66], L(66));
+ R(f, g, h, a, b, c, d, e, K[67], L(67));
+ R(e, f, g, h, a, b, c, d, K[68], L(68));
+ R(d, e, f, g, h, a, b, c, K[69], L(69));
+ R(c, d, e, f, g, h, a, b, K[70], L(70));
+ R(b, c, d, e, f, g, h, a, K[71], L(71));
+ R(a, b, c, d, e, f, g, h, K[72], L(72));
+ R(h, a, b, c, d, e, f, g, K[73], L(73));
+ R(g, h, a, b, c, d, e, f, K[74], L(74));
+ R(f, g, h, a, b, c, d, e, K[75], L(75));
+ R(e, f, g, h, a, b, c, d, K[76], L(76));
+ R(d, e, f, g, h, a, b, c, K[77], L(77));
+ R(c, d, e, f, g, h, a, b, K[78], L(78));
+ R(b, c, d, e, f, g, h, a, K[79], L(79));
+
+ h0 += a;
+ h1 += b;
+ h2 += c;
+ h3 += d;
+ h4 += e;
+ h5 += f;
+ h6 += g;
+ h7 += h;
+
+ nblks -= 2;
+ }
+
+ while (nblks)
+ {
+ a = h0;
+ b = h1;
+ c = h2;
+ d = h3;
+ e = h4;
+ f = h5;
+ g = h6;
+ h = h7;
+
+ I(0); I(1); I(2); I(3);
+ I(4); I(5); I(6); I(7);
+ I(8); I(9); I(10); I(11);
+ I(12); I(13); I(14); I(15);
+ data += 128;
+ R(a, b, c, d, e, f, g, h, K[0], W(0));
+ R(h, a, b, c, d, e, f, g, K[1], W(1));
+ R(g, h, a, b, c, d, e, f, K[2], W(2));
+ R(f, g, h, a, b, c, d, e, K[3], W(3));
+ R(e, f, g, h, a, b, c, d, K[4], W(4));
+ R(d, e, f, g, h, a, b, c, K[5], W(5));
+ R(c, d, e, f, g, h, a, b, K[6], W(6));
+ R(b, c, d, e, f, g, h, a, K[7], W(7));
+ R(a, b, c, d, e, f, g, h, K[8], W(8));
+ R(h, a, b, c, d, e, f, g, K[9], W(9));
+ R(g, h, a, b, c, d, e, f, K[10], W(10));
+ R(f, g, h, a, b, c, d, e, K[11], W(11));
+ R(e, f, g, h, a, b, c, d, K[12], W(12));
+ R(d, e, f, g, h, a, b, c, K[13], W(13));
+ R(c, d, e, f, g, h, a, b, K[14], W(14));
+ R(b, c, d, e, f, g, h, a, K[15], W(15));
+
+ R(a, b, c, d, e, f, g, h, K[16], W(16));
+ R(h, a, b, c, d, e, f, g, K[17], W(17));
+ R(g, h, a, b, c, d, e, f, K[18], W(18));
+ R(f, g, h, a, b, c, d, e, K[19], W(19));
+ R(e, f, g, h, a, b, c, d, K[20], W(20));
+ R(d, e, f, g, h, a, b, c, K[21], W(21));
+ R(c, d, e, f, g, h, a, b, K[22], W(22));
+ R(b, c, d, e, f, g, h, a, K[23], W(23));
+ R(a, b, c, d, e, f, g, h, K[24], W(24));
+ R(h, a, b, c, d, e, f, g, K[25], W(25));
+ R(g, h, a, b, c, d, e, f, K[26], W(26));
+ R(f, g, h, a, b, c, d, e, K[27], W(27));
+ R(e, f, g, h, a, b, c, d, K[28], W(28));
+ R(d, e, f, g, h, a, b, c, K[29], W(29));
+ R(c, d, e, f, g, h, a, b, K[30], W(30));
+ R(b, c, d, e, f, g, h, a, K[31], W(31));
+
+ R(a, b, c, d, e, f, g, h, K[32], W(32));
+ R(h, a, b, c, d, e, f, g, K[33], W(33));
+ R(g, h, a, b, c, d, e, f, K[34], W(34));
+ R(f, g, h, a, b, c, d, e, K[35], W(35));
+ R(e, f, g, h, a, b, c, d, K[36], W(36));
+ R(d, e, f, g, h, a, b, c, K[37], W(37));
+ R(c, d, e, f, g, h, a, b, K[38], W(38));
+ R(b, c, d, e, f, g, h, a, K[39], W(39));
+ R(a, b, c, d, e, f, g, h, K[40], W(40));
+ R(h, a, b, c, d, e, f, g, K[41], W(41));
+ R(g, h, a, b, c, d, e, f, K[42], W(42));
+ R(f, g, h, a, b, c, d, e, K[43], W(43));
+ R(e, f, g, h, a, b, c, d, K[44], W(44));
+ R(d, e, f, g, h, a, b, c, K[45], W(45));
+ R(c, d, e, f, g, h, a, b, K[46], W(46));
+ R(b, c, d, e, f, g, h, a, K[47], W(47));
+
+ R(a, b, c, d, e, f, g, h, K[48], W(48));
+ R(h, a, b, c, d, e, f, g, K[49], W(49));
+ R(g, h, a, b, c, d, e, f, K[50], W(50));
+ R(f, g, h, a, b, c, d, e, K[51], W(51));
+ R(e, f, g, h, a, b, c, d, K[52], W(52));
+ R(d, e, f, g, h, a, b, c, K[53], W(53));
+ R(c, d, e, f, g, h, a, b, K[54], W(54));
+ R(b, c, d, e, f, g, h, a, K[55], W(55));
+ R(a, b, c, d, e, f, g, h, K[56], W(56));
+ R(h, a, b, c, d, e, f, g, K[57], W(57));
+ R(g, h, a, b, c, d, e, f, K[58], W(58));
+ R(f, g, h, a, b, c, d, e, K[59], W(59));
+ R(e, f, g, h, a, b, c, d, K[60], W(60));
+ R(d, e, f, g, h, a, b, c, K[61], W(61));
+ R(c, d, e, f, g, h, a, b, K[62], W(62));
+ R(b, c, d, e, f, g, h, a, K[63], W(63));
+
+ R(a, b, c, d, e, f, g, h, K[64], L(64));
+ R(h, a, b, c, d, e, f, g, K[65], L(65));
+ R(g, h, a, b, c, d, e, f, K[66], L(66));
+ R(f, g, h, a, b, c, d, e, K[67], L(67));
+ R(e, f, g, h, a, b, c, d, K[68], L(68));
+ R(d, e, f, g, h, a, b, c, K[69], L(69));
+ R(c, d, e, f, g, h, a, b, K[70], L(70));
+ R(b, c, d, e, f, g, h, a, K[71], L(71));
+ R(a, b, c, d, e, f, g, h, K[72], L(72));
+ R(h, a, b, c, d, e, f, g, K[73], L(73));
+ R(g, h, a, b, c, d, e, f, K[74], L(74));
+ R(f, g, h, a, b, c, d, e, K[75], L(75));
+ R(e, f, g, h, a, b, c, d, K[76], L(76));
+ R(d, e, f, g, h, a, b, c, K[77], L(77));
+ R(c, d, e, f, g, h, a, b, K[78], L(78));
+ R(b, c, d, e, f, g, h, a, K[79], L(79));
+
+ h0 += a;
+ h1 += b;
+ h2 += c;
+ h3 += d;
+ h4 += e;
+ h5 += f;
+ h6 += g;
+ h7 += h;
+
+ nblks--;
+ }
+
+ h0 = vec_merge_idx0_elems (h0, h1);
+ h2 = vec_merge_idx0_elems (h2, h3);
+ h4 = vec_merge_idx0_elems (h4, h5);
+ h6 = vec_merge_idx0_elems (h6, h7);
+ vec_u64_store (h0, 8 * 0, (unsigned long long *)state);
+ vec_u64_store (h2, 8 * 2, (unsigned long long *)state);
+ vec_u64_store (h4, 8 * 4, (unsigned long long *)state);
+ vec_u64_store (h6, 8 * 6, (unsigned long long *)state);
+
+ return sizeof(w);
+}
+#undef R
+#undef Cho
+#undef Maj
+#undef Sum0
+#undef Sum1
+#undef S0
+#undef S1
+#undef I
+#undef W
+#undef I2
+#undef W2
+#undef R2
+
+
+/* SHA2 round in general purpose registers */
+#define R(a,b,c,d,e,f,g,h,k,w) do \
+ { \
+ t1 = (h) + Sum1((e)) + Cho((e),(f),(g)) + ((k) + (w));\
+ t2 = Sum0((a)) + Maj((a),(b),(c)); \
+ d += t1; \
+ h = t1 + t2; \
+ } while (0)
+
+#define Cho(x, y, z) ((x & y) + (~x & z))
+
+#define Maj(z, x, y) ((x & y) + (z & (x ^ y)))
+
+#define Sum0(x) (ror64(x, 28) ^ ror64(x ^ ror64(x, 39-34), 34))
+
+#define Sum1(x) (ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41))
+
+
+/* Message expansion on general purpose registers */
+#define S0(x) (ror64 ((x), 1) ^ ror64 ((x), 8) ^ ((x) >> 7))
+#define S1(x) (ror64 ((x), 19) ^ ror64 ((x), 61) ^ ((x) >> 6))
+
+#define I(i) ( w[i] = buf_get_be64(data + i * 8) )
+#define WN(i) ({ w[i&0x0f] += w[(i-7) &0x0f]; \
+ w[i&0x0f] += S0(w[(i-15)&0x0f]); \
+ w[i&0x0f] += S1(w[(i-2) &0x0f]); \
+ w[i&0x0f]; })
+#define W(i) ({ u64 r = w[i&0x0f]; WN(i); r; })
+#define L(i) w[i&0x0f]
+
+
+unsigned int ASM_FUNC_ATTR
+_gcry_sha512_transform_ppc9(u64 state[8], const unsigned char *data,
+ size_t nblks)
+{
+  /* GPRs are used for the round function and message expansion, as
+   * vector-intrinsics-based code generates slower code for POWER9. */
+ u64 a, b, c, d, e, f, g, h, t1, t2;
+ u64 w[16];
+
+ a = state[0];
+ b = state[1];
+ c = state[2];
+ d = state[3];
+ e = state[4];
+ f = state[5];
+ g = state[6];
+ h = state[7];
+
+ while (nblks >= 2)
+ {
+ I(0); I(1); I(2); I(3);
+ I(4); I(5); I(6); I(7);
+ I(8); I(9); I(10); I(11);
+ I(12); I(13); I(14); I(15);
+ data += 128;
+ R(a, b, c, d, e, f, g, h, K[0], W(0));
+ R(h, a, b, c, d, e, f, g, K[1], W(1));
+ R(g, h, a, b, c, d, e, f, K[2], W(2));
+ R(f, g, h, a, b, c, d, e, K[3], W(3));
+ R(e, f, g, h, a, b, c, d, K[4], W(4));
+ R(d, e, f, g, h, a, b, c, K[5], W(5));
+ R(c, d, e, f, g, h, a, b, K[6], W(6));
+ R(b, c, d, e, f, g, h, a, K[7], W(7));
+ R(a, b, c, d, e, f, g, h, K[8], W(8));
+ R(h, a, b, c, d, e, f, g, K[9], W(9));
+ R(g, h, a, b, c, d, e, f, K[10], W(10));
+ R(f, g, h, a, b, c, d, e, K[11], W(11));
+ R(e, f, g, h, a, b, c, d, K[12], W(12));
+ R(d, e, f, g, h, a, b, c, K[13], W(13));
+ R(c, d, e, f, g, h, a, b, K[14], W(14));
+ R(b, c, d, e, f, g, h, a, K[15], W(15));
+
+ R(a, b, c, d, e, f, g, h, K[16], W(16));
+ R(h, a, b, c, d, e, f, g, K[17], W(17));
+ R(g, h, a, b, c, d, e, f, K[18], W(18));
+ R(f, g, h, a, b, c, d, e, K[19], W(19));
+ R(e, f, g, h, a, b, c, d, K[20], W(20));
+ R(d, e, f, g, h, a, b, c, K[21], W(21));
+ R(c, d, e, f, g, h, a, b, K[22], W(22));
+ R(b, c, d, e, f, g, h, a, K[23], W(23));
+ R(a, b, c, d, e, f, g, h, K[24], W(24));
+ R(h, a, b, c, d, e, f, g, K[25], W(25));
+ R(g, h, a, b, c, d, e, f, K[26], W(26));
+ R(f, g, h, a, b, c, d, e, K[27], W(27));
+ R(e, f, g, h, a, b, c, d, K[28], W(28));
+ R(d, e, f, g, h, a, b, c, K[29], W(29));
+ R(c, d, e, f, g, h, a, b, K[30], W(30));
+ R(b, c, d, e, f, g, h, a, K[31], W(31));
+
+ R(a, b, c, d, e, f, g, h, K[32], W(32));
+ R(h, a, b, c, d, e, f, g, K[33], W(33));
+ R(g, h, a, b, c, d, e, f, K[34], W(34));
+ R(f, g, h, a, b, c, d, e, K[35], W(35));
+ R(e, f, g, h, a, b, c, d, K[36], W(36));
+ R(d, e, f, g, h, a, b, c, K[37], W(37));
+ R(c, d, e, f, g, h, a, b, K[38], W(38));
+ R(b, c, d, e, f, g, h, a, K[39], W(39));
+ R(a, b, c, d, e, f, g, h, K[40], W(40));
+ R(h, a, b, c, d, e, f, g, K[41], W(41));
+ R(g, h, a, b, c, d, e, f, K[42], W(42));
+ R(f, g, h, a, b, c, d, e, K[43], W(43));
+ R(e, f, g, h, a, b, c, d, K[44], W(44));
+ R(d, e, f, g, h, a, b, c, K[45], W(45));
+ R(c, d, e, f, g, h, a, b, K[46], W(46));
+ R(b, c, d, e, f, g, h, a, K[47], W(47));
+
+ R(a, b, c, d, e, f, g, h, K[48], W(48));
+ R(h, a, b, c, d, e, f, g, K[49], W(49));
+ R(g, h, a, b, c, d, e, f, K[50], W(50));
+ R(f, g, h, a, b, c, d, e, K[51], W(51));
+ R(e, f, g, h, a, b, c, d, K[52], W(52));
+ R(d, e, f, g, h, a, b, c, K[53], W(53));
+ R(c, d, e, f, g, h, a, b, K[54], W(54));
+ R(b, c, d, e, f, g, h, a, K[55], W(55));
+ R(a, b, c, d, e, f, g, h, K[56], W(56));
+ R(h, a, b, c, d, e, f, g, K[57], W(57));
+ R(g, h, a, b, c, d, e, f, K[58], W(58));
+ R(f, g, h, a, b, c, d, e, K[59], W(59));
+ R(e, f, g, h, a, b, c, d, K[60], W(60));
+ R(d, e, f, g, h, a, b, c, K[61], W(61));
+ R(c, d, e, f, g, h, a, b, K[62], W(62));
+ R(b, c, d, e, f, g, h, a, K[63], W(63));
+
+ R(a, b, c, d, e, f, g, h, K[64], L(64));
+ R(h, a, b, c, d, e, f, g, K[65], L(65));
+ R(g, h, a, b, c, d, e, f, K[66], L(66));
+ R(f, g, h, a, b, c, d, e, K[67], L(67));
+ I(0); I(1); I(2); I(3);
+ R(e, f, g, h, a, b, c, d, K[68], L(68));
+ R(d, e, f, g, h, a, b, c, K[69], L(69));
+ R(c, d, e, f, g, h, a, b, K[70], L(70));
+ R(b, c, d, e, f, g, h, a, K[71], L(71));
+ I(4); I(5); I(6); I(7);
+ R(a, b, c, d, e, f, g, h, K[72], L(72));
+ R(h, a, b, c, d, e, f, g, K[73], L(73));
+ R(g, h, a, b, c, d, e, f, K[74], L(74));
+ R(f, g, h, a, b, c, d, e, K[75], L(75));
+ I(8); I(9); I(10); I(11);
+ R(e, f, g, h, a, b, c, d, K[76], L(76));
+ R(d, e, f, g, h, a, b, c, K[77], L(77));
+ R(c, d, e, f, g, h, a, b, K[78], L(78));
+ R(b, c, d, e, f, g, h, a, K[79], L(79));
+ I(12); I(13); I(14); I(15);
+ data += 128;
+
+ a += state[0];
+ b += state[1];
+ c += state[2];
+ d += state[3];
+ e += state[4];
+ f += state[5];
+ g += state[6];
+ h += state[7];
+ state[0] = a;
+ state[1] = b;
+ state[2] = c;
+ state[3] = d;
+ state[4] = e;
+ state[5] = f;
+ state[6] = g;
+ state[7] = h;
+
+ R(a, b, c, d, e, f, g, h, K[0], W(0));
+ R(h, a, b, c, d, e, f, g, K[1], W(1));
+ R(g, h, a, b, c, d, e, f, K[2], W(2));
+ R(f, g, h, a, b, c, d, e, K[3], W(3));
+ R(e, f, g, h, a, b, c, d, K[4], W(4));
+ R(d, e, f, g, h, a, b, c, K[5], W(5));
+ R(c, d, e, f, g, h, a, b, K[6], W(6));
+ R(b, c, d, e, f, g, h, a, K[7], W(7));
+ R(a, b, c, d, e, f, g, h, K[8], W(8));
+ R(h, a, b, c, d, e, f, g, K[9], W(9));
+ R(g, h, a, b, c, d, e, f, K[10], W(10));
+ R(f, g, h, a, b, c, d, e, K[11], W(11));
+ R(e, f, g, h, a, b, c, d, K[12], W(12));
+ R(d, e, f, g, h, a, b, c, K[13], W(13));
+ R(c, d, e, f, g, h, a, b, K[14], W(14));
+ R(b, c, d, e, f, g, h, a, K[15], W(15));
+
+ R(a, b, c, d, e, f, g, h, K[16], W(16));
+ R(h, a, b, c, d, e, f, g, K[17], W(17));
+ R(g, h, a, b, c, d, e, f, K[18], W(18));
+ R(f, g, h, a, b, c, d, e, K[19], W(19));
+ R(e, f, g, h, a, b, c, d, K[20], W(20));
+ R(d, e, f, g, h, a, b, c, K[21], W(21));
+ R(c, d, e, f, g, h, a, b, K[22], W(22));
+ R(b, c, d, e, f, g, h, a, K[23], W(23));
+ R(a, b, c, d, e, f, g, h, K[24], W(24));
+ R(h, a, b, c, d, e, f, g, K[25], W(25));
+ R(g, h, a, b, c, d, e, f, K[26], W(26));
+ R(f, g, h, a, b, c, d, e, K[27], W(27));
+ R(e, f, g, h, a, b, c, d, K[28], W(28));
+ R(d, e, f, g, h, a, b, c, K[29], W(29));
+ R(c, d, e, f, g, h, a, b, K[30], W(30));
+ R(b, c, d, e, f, g, h, a, K[31], W(31));
+
+ R(a, b, c, d, e, f, g, h, K[32], W(32));
+ R(h, a, b, c, d, e, f, g, K[33], W(33));
+ R(g, h, a, b, c, d, e, f, K[34], W(34));
+ R(f, g, h, a, b, c, d, e, K[35], W(35));
+ R(e, f, g, h, a, b, c, d, K[36], W(36));
+ R(d, e, f, g, h, a, b, c, K[37], W(37));
+ R(c, d, e, f, g, h, a, b, K[38], W(38));
+ R(b, c, d, e, f, g, h, a, K[39], W(39));
+ R(a, b, c, d, e, f, g, h, K[40], W(40));
+ R(h, a, b, c, d, e, f, g, K[41], W(41));
+ R(g, h, a, b, c, d, e, f, K[42], W(42));
+ R(f, g, h, a, b, c, d, e, K[43], W(43));
+ R(e, f, g, h, a, b, c, d, K[44], W(44));
+ R(d, e, f, g, h, a, b, c, K[45], W(45));
+ R(c, d, e, f, g, h, a, b, K[46], W(46));
+ R(b, c, d, e, f, g, h, a, K[47], W(47));
+
+ R(a, b, c, d, e, f, g, h, K[48], W(48));
+ R(h, a, b, c, d, e, f, g, K[49], W(49));
+ R(g, h, a, b, c, d, e, f, K[50], W(50));
+ R(f, g, h, a, b, c, d, e, K[51], W(51));
+ R(e, f, g, h, a, b, c, d, K[52], W(52));
+ R(d, e, f, g, h, a, b, c, K[53], W(53));
+ R(c, d, e, f, g, h, a, b, K[54], W(54));
+ R(b, c, d, e, f, g, h, a, K[55], W(55));
+ R(a, b, c, d, e, f, g, h, K[56], W(56));
+ R(h, a, b, c, d, e, f, g, K[57], W(57));
+ R(g, h, a, b, c, d, e, f, K[58], W(58));
+ R(f, g, h, a, b, c, d, e, K[59], W(59));
+ R(e, f, g, h, a, b, c, d, K[60], W(60));
+ R(d, e, f, g, h, a, b, c, K[61], W(61));
+ R(c, d, e, f, g, h, a, b, K[62], W(62));
+ R(b, c, d, e, f, g, h, a, K[63], W(63));
+
+ R(a, b, c, d, e, f, g, h, K[64], L(64));
+ R(h, a, b, c, d, e, f, g, K[65], L(65));
+ R(g, h, a, b, c, d, e, f, K[66], L(66));
+ R(f, g, h, a, b, c, d, e, K[67], L(67));
+ R(e, f, g, h, a, b, c, d, K[68], L(68));
+ R(d, e, f, g, h, a, b, c, K[69], L(69));
+ R(c, d, e, f, g, h, a, b, K[70], L(70));
+ R(b, c, d, e, f, g, h, a, K[71], L(71));
+ R(a, b, c, d, e, f, g, h, K[72], L(72));
+ R(h, a, b, c, d, e, f, g, K[73], L(73));
+ R(g, h, a, b, c, d, e, f, K[74], L(74));
+ R(f, g, h, a, b, c, d, e, K[75], L(75));
+ R(e, f, g, h, a, b, c, d, K[76], L(76));
+ R(d, e, f, g, h, a, b, c, K[77], L(77));
+ R(c, d, e, f, g, h, a, b, K[78], L(78));
+ R(b, c, d, e, f, g, h, a, K[79], L(79));
+
+ a += state[0];
+ b += state[1];
+ c += state[2];
+ d += state[3];
+ e += state[4];
+ f += state[5];
+ g += state[6];
+ h += state[7];
+ state[0] = a;
+ state[1] = b;
+ state[2] = c;
+ state[3] = d;
+ state[4] = e;
+ state[5] = f;
+ state[6] = g;
+ state[7] = h;
+
+ nblks -= 2;
+ }
+
+ while (nblks)
+ {
+ I(0); I(1); I(2); I(3);
+ I(4); I(5); I(6); I(7);
+ I(8); I(9); I(10); I(11);
+ I(12); I(13); I(14); I(15);
+ data += 128;
+ R(a, b, c, d, e, f, g, h, K[0], W(0));
+ R(h, a, b, c, d, e, f, g, K[1], W(1));
+ R(g, h, a, b, c, d, e, f, K[2], W(2));
+ R(f, g, h, a, b, c, d, e, K[3], W(3));
+ R(e, f, g, h, a, b, c, d, K[4], W(4));
+ R(d, e, f, g, h, a, b, c, K[5], W(5));
+ R(c, d, e, f, g, h, a, b, K[6], W(6));
+ R(b, c, d, e, f, g, h, a, K[7], W(7));
+ R(a, b, c, d, e, f, g, h, K[8], W(8));
+ R(h, a, b, c, d, e, f, g, K[9], W(9));
+ R(g, h, a, b, c, d, e, f, K[10], W(10));
+ R(f, g, h, a, b, c, d, e, K[11], W(11));
+ R(e, f, g, h, a, b, c, d, K[12], W(12));
+ R(d, e, f, g, h, a, b, c, K[13], W(13));
+ R(c, d, e, f, g, h, a, b, K[14], W(14));
+ R(b, c, d, e, f, g, h, a, K[15], W(15));
+
+ R(a, b, c, d, e, f, g, h, K[16], W(16));
+ R(h, a, b, c, d, e, f, g, K[17], W(17));
+ R(g, h, a, b, c, d, e, f, K[18], W(18));
+ R(f, g, h, a, b, c, d, e, K[19], W(19));
+ R(e, f, g, h, a, b, c, d, K[20], W(20));
+ R(d, e, f, g, h, a, b, c, K[21], W(21));
+ R(c, d, e, f, g, h, a, b, K[22], W(22));
+ R(b, c, d, e, f, g, h, a, K[23], W(23));
+ R(a, b, c, d, e, f, g, h, K[24], W(24));
+ R(h, a, b, c, d, e, f, g, K[25], W(25));
+ R(g, h, a, b, c, d, e, f, K[26], W(26));
+ R(f, g, h, a, b, c, d, e, K[27], W(27));
+ R(e, f, g, h, a, b, c, d, K[28], W(28));
+ R(d, e, f, g, h, a, b, c, K[29], W(29));
+ R(c, d, e, f, g, h, a, b, K[30], W(30));
+ R(b, c, d, e, f, g, h, a, K[31], W(31));
+
+ R(a, b, c, d, e, f, g, h, K[32], W(32));
+ R(h, a, b, c, d, e, f, g, K[33], W(33));
+ R(g, h, a, b, c, d, e, f, K[34], W(34));
+ R(f, g, h, a, b, c, d, e, K[35], W(35));
+ R(e, f, g, h, a, b, c, d, K[36], W(36));
+ R(d, e, f, g, h, a, b, c, K[37], W(37));
+ R(c, d, e, f, g, h, a, b, K[38], W(38));
+ R(b, c, d, e, f, g, h, a, K[39], W(39));
+ R(a, b, c, d, e, f, g, h, K[40], W(40));
+ R(h, a, b, c, d, e, f, g, K[41], W(41));
+ R(g, h, a, b, c, d, e, f, K[42], W(42));
+ R(f, g, h, a, b, c, d, e, K[43], W(43));
+ R(e, f, g, h, a, b, c, d, K[44], W(44));
+ R(d, e, f, g, h, a, b, c, K[45], W(45));
+ R(c, d, e, f, g, h, a, b, K[46], W(46));
+ R(b, c, d, e, f, g, h, a, K[47], W(47));
+
+ R(a, b, c, d, e, f, g, h, K[48], W(48));
+ R(h, a, b, c, d, e, f, g, K[49], W(49));
+ R(g, h, a, b, c, d, e, f, K[50], W(50));
+ R(f, g, h, a, b, c, d, e, K[51], W(51));
+ R(e, f, g, h, a, b, c, d, K[52], W(52));
+ R(d, e, f, g, h, a, b, c, K[53], W(53));
+ R(c, d, e, f, g, h, a, b, K[54], W(54));
+ R(b, c, d, e, f, g, h, a, K[55], W(55));
+ R(a, b, c, d, e, f, g, h, K[56], W(56));
+ R(h, a, b, c, d, e, f, g, K[57], W(57));
+ R(g, h, a, b, c, d, e, f, K[58], W(58));
+ R(f, g, h, a, b, c, d, e, K[59], W(59));
+ R(e, f, g, h, a, b, c, d, K[60], W(60));
+ R(d, e, f, g, h, a, b, c, K[61], W(61));
+ R(c, d, e, f, g, h, a, b, K[62], W(62));
+ R(b, c, d, e, f, g, h, a, K[63], W(63));
+
+ R(a, b, c, d, e, f, g, h, K[64], L(64));
+ R(h, a, b, c, d, e, f, g, K[65], L(65));
+ R(g, h, a, b, c, d, e, f, K[66], L(66));
+ R(f, g, h, a, b, c, d, e, K[67], L(67));
+ R(e, f, g, h, a, b, c, d, K[68], L(68));
+ R(d, e, f, g, h, a, b, c, K[69], L(69));
+ R(c, d, e, f, g, h, a, b, K[70], L(70));
+ R(b, c, d, e, f, g, h, a, K[71], L(71));
+ R(a, b, c, d, e, f, g, h, K[72], L(72));
+ R(h, a, b, c, d, e, f, g, K[73], L(73));
+ R(g, h, a, b, c, d, e, f, K[74], L(74));
+ R(f, g, h, a, b, c, d, e, K[75], L(75));
+ R(e, f, g, h, a, b, c, d, K[76], L(76));
+ R(d, e, f, g, h, a, b, c, K[77], L(77));
+ R(c, d, e, f, g, h, a, b, K[78], L(78));
+ R(b, c, d, e, f, g, h, a, K[79], L(79));
+
+ a += state[0];
+ b += state[1];
+ c += state[2];
+ d += state[3];
+ e += state[4];
+ f += state[5];
+ g += state[6];
+ h += state[7];
+ state[0] = a;
+ state[1] = b;
+ state[2] = c;
+ state[3] = d;
+ state[4] = e;
+ state[5] = f;
+ state[6] = g;
+ state[7] = h;
+
+ nblks--;
+ }
+
+ return sizeof(w);
+}
+
+#endif /* ENABLE_PPC_CRYPTO_SUPPORT */
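
In both PPC transform functions above, the I/WN/W/L macros keep only a 16-entry rolling window of the message schedule: W(t) returns the word consumed by round t and immediately overwrites its slot with the word needed 16 rounds later, while L(t) only reads a slot, which is why rounds 64..79 use L() and can be interleaved with the I() loads for the next block. A small C model of one window step (illustrative only; names are not from the patch):

#include <stdint.h>

static inline uint64_t ror64_sketch(uint64_t x, unsigned n)
{
  return (x >> n) | (x << (64 - n));
}

/* Return W[t] from the 16-slot window and replace its slot with W[t+16]. */
static uint64_t schedule_word_sketch(uint64_t w[16], unsigned t)
{
  uint64_t r  = w[t & 15];                     /* word consumed by round t */
  uint64_t s0 = ror64_sketch(w[(t - 15) & 15], 1)
                ^ ror64_sketch(w[(t - 15) & 15], 8)
                ^ (w[(t - 15) & 15] >> 7);     /* sigma0 term */
  uint64_t s1 = ror64_sketch(w[(t - 2) & 15], 19)
                ^ ror64_sketch(w[(t - 2) & 15], 61)
                ^ (w[(t - 2) & 15] >> 6);      /* sigma1 term */
  w[t & 15] = r + w[(t - 7) & 15] + s0 + s1;   /* becomes W[t+16] */
  return r;
}

The POWER8 path keeps the working state in VSX registers and maps Sum0/Sum1 onto the vshasigmad instruction through vec_vshasigma_u64, so only this schedule window lives in GPRs; the POWER9 path keeps everything in GPRs, as its comment notes.
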
diff --git a/comm/third_party/libgcrypt/cipher/sha512-ssse3-amd64.S b/comm/third_party/libgcrypt/cipher/sha512-ssse3-amd64.S
new file mode 100644
index 0000000000..6a1328a690
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha512-ssse3-amd64.S
@@ -0,0 +1,467 @@
+/*
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright (c) 2012, Intel Corporation
+;
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are
+; met:
+;
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the
+; distribution.
+;
+; * Neither the name of the Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived from
+; this software without specific prior written permission.
+;
+;
+; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
+; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+*/
+/*
+ * Conversion to GAS assembly and integration to libgcrypt
+ * by Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * Note: the original implementation was named SHA512-SSE4. However, only
+ * SSSE3 is required.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA512)
+
+#include "asm-common-amd64.h"
+
+.intel_syntax noprefix
+
+.text
+
+/* Virtual Registers */
+#define msg rdi /* ARG1 */
+#define digest rsi /* ARG2 */
+#define msglen rdx /* ARG3 */
+#define T1 rcx
+#define T2 r8
+#define a_64 r9
+#define b_64 r10
+#define c_64 r11
+#define d_64 r12
+#define e_64 r13
+#define f_64 r14
+#define g_64 r15
+#define h_64 rbx
+#define tmp0 rax
+
+/*
+; Local variables (stack frame)
+; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
+*/
+#define frame_W 0 /* Message Schedule */
+#define frame_W_size (80 * 8)
+#define frame_WK ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */
+#define frame_WK_size (2 * 8)
+#define frame_GPRSAVE ((frame_WK) + (frame_WK_size))
+#define frame_GPRSAVE_size (5 * 8)
+#define frame_size ((frame_GPRSAVE) + (frame_GPRSAVE_size))
+
+
+/* Useful QWORD "arrays" for simpler memory references */
+#define MSG(i) msg + 8*(i) /* Input message (arg1) */
+#define DIGEST(i) digest + 8*(i) /* Output Digest (arg2) */
+#define K_t(i) .LK512 + 8*(i) ADD_RIP /* SHA Constants (static mem) */
+#define W_t(i) rsp + frame_W + 8*(i) /* Message Schedule (stack frame) */
+#define WK_2(i) rsp + frame_WK + 8*((i) % 2) /* W[t]+K[t] (stack frame) */
+/* MSG, DIGEST, K_t, W_t are arrays */
+/* WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even */
+
+#define SHA512_Round(t, a, b, c, d, e, f, g, h) \
+ /* Compute Round %%t */; \
+ mov T1, f /* T1 = f */; \
+ mov tmp0, e /* tmp = e */; \
+ xor T1, g /* T1 = f ^ g */; \
+ ror tmp0, 23 /* 41 ; tmp = e ror 23 */; \
+ and T1, e /* T1 = (f ^ g) & e */; \
+ xor tmp0, e /* tmp = (e ror 23) ^ e */; \
+ xor T1, g /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */; \
+ add T1, [WK_2(t)] /* W[t] + K[t] from message scheduler */; \
+ ror tmp0, 4 /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */; \
+ xor tmp0, e /* tmp = (((e ror 23) ^ e) ror 4) ^ e */; \
+ mov T2, a /* T2 = a */; \
+ add T1, h /* T1 = CH(e,f,g) + W[t] + K[t] + h */; \
+ ror tmp0, 14 /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */; \
+ add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */; \
+ mov tmp0, a /* tmp = a */; \
+ xor T2, c /* T2 = a ^ c */; \
+ and tmp0, c /* tmp = a & c */; \
+ and T2, b /* T2 = (a ^ c) & b */; \
+ xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */; \
+ mov tmp0, a /* tmp = a */; \
+ ror tmp0, 5 /* 39 ; tmp = a ror 5 */; \
+ xor tmp0, a /* tmp = (a ror 5) ^ a */; \
+ add d, T1 /* e(next_state) = d + T1 */; \
+ ror tmp0, 6 /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */; \
+ xor tmp0, a /* tmp = (((a ror 5) ^ a) ror 6) ^ a */; \
+ lea h, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */; \
+ ror tmp0, 28 /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */; \
+	add	h, tmp0	/* a(next_state) = T1 + Maj(a,b,c) + S0(a) */
+
+#define SHA512_2Sched_2Round_sse_PART1(t, a, b, c, d, e, f, g, h) \
+ /* \
+ ; Compute rounds %%t-2 and %%t-1 \
+ ; Compute message schedule QWORDS %%t and %%t+1 \
+ ; \
+ ; Two rounds are computed based on the values for K[t-2]+W[t-2] and \
+ ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message \
+ ; scheduler. \
+ ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. \
+ ; They are then added to their respective SHA512 constants at \
+ ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] \
+	; For brevity, the comments following vectored instructions only refer to \
+	; the first of a pair of QWORDS. \
+	; E.g. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} \
+ ; The computation of the message schedule and the rounds are tightly \
+ ; stitched to take advantage of instruction-level parallelism. \
+ ; For clarity, integer instructions (for the rounds calculation) are indented \
+ ; by one tab. Vectored instructions (for the message scheduler) are indented \
+ ; by two tabs. \
+ */ \
+ \
+ mov T1, f; \
+ movdqa xmm2, [W_t(t-2)] /* XMM2 = W[t-2] */; \
+ xor T1, g; \
+ and T1, e; \
+ movdqa xmm0, xmm2 /* XMM0 = W[t-2] */; \
+ xor T1, g; \
+ add T1, [WK_2(t)]; \
+ movdqu xmm5, [W_t(t-15)] /* XMM5 = W[t-15] */; \
+ mov tmp0, e; \
+ ror tmp0, 23 /* 41 */; \
+ movdqa xmm3, xmm5 /* XMM3 = W[t-15] */; \
+ xor tmp0, e; \
+ ror tmp0, 4 /* 18 */; \
+ psrlq xmm0, 61 - 19 /* XMM0 = W[t-2] >> 42 */; \
+ xor tmp0, e; \
+ ror tmp0, 14 /* 14 */; \
+ psrlq xmm3, (8 - 7) /* XMM3 = W[t-15] >> 1 */; \
+ add T1, tmp0; \
+ add T1, h; \
+ pxor xmm0, xmm2 /* XMM0 = (W[t-2] >> 42) ^ W[t-2] */; \
+ mov T2, a; \
+ xor T2, c; \
+ pxor xmm3, xmm5 /* XMM3 = (W[t-15] >> 1) ^ W[t-15] */; \
+ and T2, b; \
+ mov tmp0, a; \
+ psrlq xmm0, 19 - 6 /* XMM0 = ((W[t-2]>>42)^W[t-2])>>13 */; \
+ and tmp0, c; \
+ xor T2, tmp0; \
+ psrlq xmm3, (7 - 1) /* XMM3 = ((W[t-15]>>1)^W[t-15])>>6 */; \
+ mov tmp0, a; \
+ ror tmp0, 5 /* 39 */; \
+ pxor xmm0, xmm2 /* XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] */; \
+ xor tmp0, a; \
+ ror tmp0, 6 /* 34 */; \
+ pxor xmm3, xmm5 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] */; \
+ xor tmp0, a; \
+ ror tmp0, 28 /* 28 */; \
+ psrlq xmm0, 6 /* XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 */; \
+ add T2, tmp0; \
+ add d, T1; \
+	psrlq	xmm3, 1                 /* XMM3 = ((((W[t-15]>>1)^W[t-15])>>6)^W[t-15])>>1 */; \
+ lea h, [T1 + T2]
+
+#define SHA512_2Sched_2Round_sse_PART2(t, a, b, c, d, e, f, g, h) \
+ movdqa xmm1, xmm2 /* XMM1 = W[t-2] */; \
+ mov T1, f; \
+ xor T1, g; \
+ movdqa xmm4, xmm5 /* XMM4 = W[t-15] */; \
+ and T1, e; \
+ xor T1, g; \
+ psllq xmm1, (64 - 19) - (64 - 61) /* XMM1 = W[t-2] << 42 */; \
+ add T1, [WK_2(t+1)]; \
+ mov tmp0, e; \
+ psllq xmm4, (64 - 1) - (64 - 8) /* XMM4 = W[t-15] << 7 */; \
+ ror tmp0, 23 /* 41 */; \
+ xor tmp0, e; \
+ pxor xmm1, xmm2 /* XMM1 = (W[t-2] << 42)^W[t-2] */; \
+ ror tmp0, 4 /* 18 */; \
+ xor tmp0, e; \
+ pxor xmm4, xmm5 /* XMM4 = (W[t-15]<<7)^W[t-15] */; \
+ ror tmp0, 14 /* 14 */; \
+ add T1, tmp0; \
+ psllq xmm1, (64 - 61) /* XMM1 = ((W[t-2] << 42)^W[t-2])<<3 */; \
+ add T1, h; \
+ mov T2, a; \
+ psllq xmm4, (64 - 8) /* XMM4 = ((W[t-15]<<7)^W[t-15])<<56 */; \
+ xor T2, c; \
+ and T2, b; \
+ pxor xmm0, xmm1 /* XMM0 = s1(W[t-2]) */; \
+ mov tmp0, a; \
+ and tmp0, c; \
+ movdqu xmm1, [W_t(t- 7)] /* XMM1 = W[t-7] */; \
+ xor T2, tmp0; \
+ pxor xmm3, xmm4 /* XMM3 = s0(W[t-15]) */; \
+ mov tmp0, a; \
+ paddq xmm0, xmm3 /* XMM0 = s1(W[t-2]) + s0(W[t-15]) */; \
+ ror tmp0, 5 /* 39 */; \
+ paddq xmm0, [W_t(t-16)] /* XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] */; \
+ xor tmp0, a; \
+ paddq xmm0, xmm1 /* XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */; \
+ ror tmp0, 6 /* 34 */; \
+ movdqa [W_t(t)], xmm0 /* Store scheduled qwords */; \
+ xor tmp0, a; \
+ paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \
+ ror tmp0, 28 /* 28 */; \
+ movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */; \
+ add T2, tmp0; \
+ add d, T1; \
+ lea h, [T1 + T2]
+
+#define SHA512_2Sched_2Round_sse(t, a, b, c, d, e, f, g, h) \
+ SHA512_2Sched_2Round_sse_PART1(t, a, b, c, d, e, f, g, h); \
+ SHA512_2Sched_2Round_sse_PART2(t, h, a, b, c, d, e, f, g)
+
+/*
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; void sha512_sse4(const void* M, void* D, uint64_t L);
+; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+; The size of the message pointed to by M must be an integer multiple of the
+; SHA512 message block size (128 bytes).
+; L is the message length in SHA512 blocks.
+*/
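+/*
+; libgcrypt calls the entry point below from C (see sha512.c) as
+;   unsigned int _gcry_sha512_transform_amd64_ssse3(const void *input_data,
+;                                                   void *state,
+;                                                   size_t num_blks);
+; and uses the returned value as its stack-burn estimate.
+*/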
+.globl _gcry_sha512_transform_amd64_ssse3
+ELF(.type _gcry_sha512_transform_amd64_ssse3,@function;)
+.align 16
+_gcry_sha512_transform_amd64_ssse3:
+ CFI_STARTPROC()
+ xor eax, eax
+
+ cmp msglen, 0
+ je .Lnowork
+
+ /* Allocate Stack Space */
+ sub rsp, frame_size
+ CFI_ADJUST_CFA_OFFSET(frame_size);
+
+ /* Save GPRs */
+ mov [rsp + frame_GPRSAVE + 8 * 0], rbx
+ mov [rsp + frame_GPRSAVE + 8 * 1], r12
+ mov [rsp + frame_GPRSAVE + 8 * 2], r13
+ mov [rsp + frame_GPRSAVE + 8 * 3], r14
+ mov [rsp + frame_GPRSAVE + 8 * 4], r15
+ CFI_REL_OFFSET(rbx, frame_GPRSAVE + 8 * 0);
+ CFI_REL_OFFSET(r12, frame_GPRSAVE + 8 * 1);
+ CFI_REL_OFFSET(r13, frame_GPRSAVE + 8 * 2);
+ CFI_REL_OFFSET(r14, frame_GPRSAVE + 8 * 3);
+ CFI_REL_OFFSET(r15, frame_GPRSAVE + 8 * 4);
+
+.Lupdateblock:
+
+ /* Load state variables */
+ mov a_64, [DIGEST(0)]
+ mov b_64, [DIGEST(1)]
+ mov c_64, [DIGEST(2)]
+ mov d_64, [DIGEST(3)]
+ mov e_64, [DIGEST(4)]
+ mov f_64, [DIGEST(5)]
+ mov g_64, [DIGEST(6)]
+ mov h_64, [DIGEST(7)]
+
+ /* BSWAP 2 QWORDS */
+ movdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP]
+ movdqu xmm0, [MSG(0)]
+ pshufb xmm0, xmm1 /* BSWAP */
+ movdqa [W_t(0)], xmm0 /* Store Scheduled Pair */
+ paddq xmm0, [K_t(0)] /* Compute W[t]+K[t] */
+ movdqa [WK_2(0)], xmm0 /* Store into WK for rounds */
+
+ #define T_2_14(t, a, b, c, d, e, f, g, h) \
+ /* BSWAP 2 QWORDS; Compute 2 Rounds */; \
+ movdqu xmm0, [MSG(t)]; \
+ pshufb xmm0, xmm1 /* BSWAP */; \
+ SHA512_Round(((t) - 2), a##_64, b##_64, c##_64, d##_64, \
+ e##_64, f##_64, g##_64, h##_64); \
+ movdqa [W_t(t)], xmm0 /* Store Scheduled Pair */; \
+ paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \
+ SHA512_Round(((t) - 1), h##_64, a##_64, b##_64, c##_64, \
+ d##_64, e##_64, f##_64, g##_64); \
+ movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] into WK */
+
+ #define T_16_78(t, a, b, c, d, e, f, g, h) \
+ SHA512_2Sched_2Round_sse((t), a##_64, b##_64, c##_64, d##_64, \
+ e##_64, f##_64, g##_64, h##_64)
+
+ #define T_80(t, a, b, c, d, e, f, g, h) \
+ /* Compute 2 Rounds */; \
+ SHA512_Round((t - 2), a##_64, b##_64, c##_64, d##_64, \
+ e##_64, f##_64, g##_64, h##_64); \
+ SHA512_Round((t - 1), h##_64, a##_64, b##_64, c##_64, \
+ d##_64, e##_64, f##_64, g##_64)
+
+ T_2_14(2, a, b, c, d, e, f, g, h)
+ T_2_14(4, g, h, a, b, c, d, e, f)
+ T_2_14(6, e, f, g, h, a, b, c, d)
+ T_2_14(8, c, d, e, f, g, h, a, b)
+ T_2_14(10, a, b, c, d, e, f, g, h)
+ T_2_14(12, g, h, a, b, c, d, e, f)
+ T_2_14(14, e, f, g, h, a, b, c, d)
+ T_16_78(16, c, d, e, f, g, h, a, b)
+ T_16_78(18, a, b, c, d, e, f, g, h)
+ T_16_78(20, g, h, a, b, c, d, e, f)
+ T_16_78(22, e, f, g, h, a, b, c, d)
+ T_16_78(24, c, d, e, f, g, h, a, b)
+ T_16_78(26, a, b, c, d, e, f, g, h)
+ T_16_78(28, g, h, a, b, c, d, e, f)
+ T_16_78(30, e, f, g, h, a, b, c, d)
+ T_16_78(32, c, d, e, f, g, h, a, b)
+ T_16_78(34, a, b, c, d, e, f, g, h)
+ T_16_78(36, g, h, a, b, c, d, e, f)
+ T_16_78(38, e, f, g, h, a, b, c, d)
+ T_16_78(40, c, d, e, f, g, h, a, b)
+ T_16_78(42, a, b, c, d, e, f, g, h)
+ T_16_78(44, g, h, a, b, c, d, e, f)
+ T_16_78(46, e, f, g, h, a, b, c, d)
+ T_16_78(48, c, d, e, f, g, h, a, b)
+ T_16_78(50, a, b, c, d, e, f, g, h)
+ T_16_78(52, g, h, a, b, c, d, e, f)
+ T_16_78(54, e, f, g, h, a, b, c, d)
+ T_16_78(56, c, d, e, f, g, h, a, b)
+ T_16_78(58, a, b, c, d, e, f, g, h)
+ T_16_78(60, g, h, a, b, c, d, e, f)
+ T_16_78(62, e, f, g, h, a, b, c, d)
+ T_16_78(64, c, d, e, f, g, h, a, b)
+ T_16_78(66, a, b, c, d, e, f, g, h)
+ T_16_78(68, g, h, a, b, c, d, e, f)
+ T_16_78(70, e, f, g, h, a, b, c, d)
+ T_16_78(72, c, d, e, f, g, h, a, b)
+ T_16_78(74, a, b, c, d, e, f, g, h)
+ T_16_78(76, g, h, a, b, c, d, e, f)
+ T_16_78(78, e, f, g, h, a, b, c, d)
+ T_80(80, c, d, e, f, g, h, a, b)
+
+ /* Update digest */
+ add [DIGEST(0)], a_64
+ add [DIGEST(1)], b_64
+ add [DIGEST(2)], c_64
+ add [DIGEST(3)], d_64
+ add [DIGEST(4)], e_64
+ add [DIGEST(5)], f_64
+ add [DIGEST(6)], g_64
+ add [DIGEST(7)], h_64
+
+ /* Advance to next message block */
+ add msg, 16*8
+ dec msglen
+ jnz .Lupdateblock
+
+ /* Restore GPRs */
+ mov rbx, [rsp + frame_GPRSAVE + 8 * 0]
+ mov r12, [rsp + frame_GPRSAVE + 8 * 1]
+ mov r13, [rsp + frame_GPRSAVE + 8 * 2]
+ mov r14, [rsp + frame_GPRSAVE + 8 * 3]
+ mov r15, [rsp + frame_GPRSAVE + 8 * 4]
+ CFI_RESTORE(rbx)
+ CFI_RESTORE(r12)
+ CFI_RESTORE(r13)
+ CFI_RESTORE(r14)
+ CFI_RESTORE(r15)
+
+ pxor xmm0, xmm0
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+ pxor xmm3, xmm3
+ pxor xmm4, xmm4
+ pxor xmm5, xmm5
+
+ /* Burn stack */
+ mov eax, 0
+.Lerase_stack:
+ movdqu [rsp + rax], xmm0
+ add eax, 16
+ cmp eax, frame_W_size
+ jne .Lerase_stack
+ movdqu [rsp + frame_WK], xmm0
+ xor eax, eax
+
+ /* Restore Stack Pointer */
+ add rsp, frame_size
+ CFI_ADJUST_CFA_OFFSET(-frame_size);
+
+.Lnowork:
+ ret
+ CFI_ENDPROC()
+
+/*
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Binary Data
+*/
+
+.align 16
+
+/* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */
+.LXMM_QWORD_BSWAP:
+ .octa 0x08090a0b0c0d0e0f0001020304050607
+
+/* K[t] used in SHA512 hashing */
+.LK512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+#endif
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/sha512-ssse3-i386.c b/comm/third_party/libgcrypt/cipher/sha512-ssse3-i386.c
new file mode 100644
index 0000000000..0fc98d8ed2
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha512-ssse3-i386.c
@@ -0,0 +1,404 @@
+/* sha512-ssse3-i386.c - i386/SSSE3 implementation of SHA-512 transform
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * SHA512 Message Expansion (I2 and W2 macros) based on implementation
+ * from file "sha512-ssse3-amd64.s":
+ ************************************************************************
+ * Copyright (c) 2012, Intel Corporation
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * * Neither the name of the Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ************************************************************************
+ */
+
+#include <config.h>
+
+#if defined(__i386__) && SIZEOF_UNSIGNED_LONG == 4 && __GNUC__ >= 4 && \
+ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA512)
+
+#include "bufhelp.h"
+
+
+#if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
+/* Prevent compiler from issuing SSE/MMX instructions between asm blocks. */
+# pragma GCC target("no-sse")
+# pragma GCC target("no-mmx")
+#endif
+#if __clang__
+# pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
+# pragma clang attribute push (__attribute__((target("no-mmx"))), apply_to = function)
+#endif
+
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE
+#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
+
+
+static const u64 K[80] __attribute__ ((aligned (16))) =
+ {
+ U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd),
+ U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc),
+ U64_C(0x3956c25bf348b538), U64_C(0x59f111f1b605d019),
+ U64_C(0x923f82a4af194f9b), U64_C(0xab1c5ed5da6d8118),
+ U64_C(0xd807aa98a3030242), U64_C(0x12835b0145706fbe),
+ U64_C(0x243185be4ee4b28c), U64_C(0x550c7dc3d5ffb4e2),
+ U64_C(0x72be5d74f27b896f), U64_C(0x80deb1fe3b1696b1),
+ U64_C(0x9bdc06a725c71235), U64_C(0xc19bf174cf692694),
+ U64_C(0xe49b69c19ef14ad2), U64_C(0xefbe4786384f25e3),
+ U64_C(0x0fc19dc68b8cd5b5), U64_C(0x240ca1cc77ac9c65),
+ U64_C(0x2de92c6f592b0275), U64_C(0x4a7484aa6ea6e483),
+ U64_C(0x5cb0a9dcbd41fbd4), U64_C(0x76f988da831153b5),
+ U64_C(0x983e5152ee66dfab), U64_C(0xa831c66d2db43210),
+ U64_C(0xb00327c898fb213f), U64_C(0xbf597fc7beef0ee4),
+ U64_C(0xc6e00bf33da88fc2), U64_C(0xd5a79147930aa725),
+ U64_C(0x06ca6351e003826f), U64_C(0x142929670a0e6e70),
+ U64_C(0x27b70a8546d22ffc), U64_C(0x2e1b21385c26c926),
+ U64_C(0x4d2c6dfc5ac42aed), U64_C(0x53380d139d95b3df),
+ U64_C(0x650a73548baf63de), U64_C(0x766a0abb3c77b2a8),
+ U64_C(0x81c2c92e47edaee6), U64_C(0x92722c851482353b),
+ U64_C(0xa2bfe8a14cf10364), U64_C(0xa81a664bbc423001),
+ U64_C(0xc24b8b70d0f89791), U64_C(0xc76c51a30654be30),
+ U64_C(0xd192e819d6ef5218), U64_C(0xd69906245565a910),
+ U64_C(0xf40e35855771202a), U64_C(0x106aa07032bbd1b8),
+ U64_C(0x19a4c116b8d2d0c8), U64_C(0x1e376c085141ab53),
+ U64_C(0x2748774cdf8eeb99), U64_C(0x34b0bcb5e19b48a8),
+ U64_C(0x391c0cb3c5c95a63), U64_C(0x4ed8aa4ae3418acb),
+ U64_C(0x5b9cca4f7763e373), U64_C(0x682e6ff3d6b2b8a3),
+ U64_C(0x748f82ee5defb2fc), U64_C(0x78a5636f43172f60),
+ U64_C(0x84c87814a1f0ab72), U64_C(0x8cc702081a6439ec),
+ U64_C(0x90befffa23631e28), U64_C(0xa4506cebde82bde9),
+ U64_C(0xbef9a3f7b2c67915), U64_C(0xc67178f2e372532b),
+ U64_C(0xca273eceea26619c), U64_C(0xd186b8c721c0c207),
+ U64_C(0xeada7dd6cde0eb1e), U64_C(0xf57d4f7fee6ed178),
+ U64_C(0x06f067aa72176fba), U64_C(0x0a637dc5a2c898a6),
+ U64_C(0x113f9804bef90dae), U64_C(0x1b710b35131c471b),
+ U64_C(0x28db77f523047d84), U64_C(0x32caab7b40c72493),
+ U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c),
+ U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a),
+ U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817)
+ };
+
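+/* pshufb mask that byte-swaps each of the two qwords of an XMM register,
+ * converting the big-endian message words to host byte order. */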
+static const unsigned char bshuf_mask[16] __attribute__ ((aligned (16))) =
+ { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+
+
+/* SHA2 round */
+#define RA "%%mm0"
+#define RB "%%mm1"
+#define RC "%%mm2"
+#define RD "%%mm3"
+#define RE "%%mm4"
+#define RF "%%mm5"
+#define RG "%%mm6"
+#define RH "%%mm7"
+
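+/* One SHA-512 round.  The eight state words live in the MMX registers
+ * RA..RH; XMM2..XMM4 temporarily hold the state words that the round
+ * overwrites, and "wk" (WK0 or WK1) supplies W[t]+K[t] from XMM0. */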
+#define Rx(a,b,c,d,e,f,g,h,wk) \
+  asm volatile (/* Ch + Sum1 */ \
+ "movq2dq "a", %%xmm2;\n\t" \
+ "movq "e", "a";\n\t" \
+ "movq2dq "c", %%xmm3;\n\t" \
+ "movq "e", "c";\n\t" \
+ "movq2dq "b", %%xmm4;\n\t" \
+ "movq "e", "b";\n\t" \
+ "psrlq $(41-18), "c";\n\t" \
+ "pandn "g", "a";\n\t" \
+ "pxor "e", "c";\n\t" \
+ "pand "f", "b";\n\t" \
+ "psrlq $(18-14), "c";\n\t" \
+ "paddq "a", "h";\n\t" \
+ wk(a) \
+ "pxor "e", "c";\n\t" \
+ "paddq "b", "h";\n\t" \
+ "psrlq $(14), "c";\n\t" \
+ "movq "e", "b";\n\t" \
+ "psllq $(50-46), "b";\n\t" \
+ "paddq "a", "h";\n\t" \
+ "movdq2q %%xmm2, "a";\n\t" \
+ "pxor "e", "b";\n\t" \
+ "psllq $(46-23), "b";\n\t" \
+ "pxor "e", "b";\n\t" \
+ "psllq $(23), "b";\n\t" \
+ "pxor "b", "c";\n\t" \
+ "movdq2q %%xmm4, "b";\n\t" \
+ "paddq "c", "h";\n\t" \
+ "movdq2q %%xmm3, "c";\n\t" \
+ \
+ /* Maj + Sum0 */ \
+ "movq2dq "e", %%xmm2;\n\t" \
+ "movq "a", "e";\n\t" \
+ "movq2dq "g", %%xmm3;\n\t" \
+ "movq "a", "g";\n\t" \
+ "movq2dq "f", %%xmm4;\n\t" \
+ "movq "a", "f";\n\t" \
+ "psrlq $(39-34), "g";\n\t" \
+ "pxor "b", "e";\n\t" \
+ "pxor "a", "g";\n\t" \
+ "pand "b", "f";\n\t" \
+ "psrlq $(34-28), "g";\n\t" \
+ "pand "c", "e";\n\t" \
+ "pxor "a", "g";\n\t" \
+ "paddq "h", "d";\n\t" \
+ "paddq "f", "h";\n\t" \
+ "movdq2q %%xmm4, "f";\n\t" \
+ "psrlq $28, "g";\n\t" \
+ "paddq "e", "h";\n\t" \
+ "movq "a", "e";\n\t" \
+ "psllq $(36-30), "e";\n\t" \
+ "pxor "a", "e";\n\t" \
+ "psllq $(30-25), "e";\n\t" \
+ "pxor "a", "e";\n\t" \
+ "psllq $(25), "e";\n\t" \
+ "pxor "e", "g";\n\t" \
+ "movdq2q %%xmm2, "e";\n\t" \
+ "paddq "g", "h";\n\t" \
+ "movdq2q %%xmm3, "g";\n\t" \
+ \
+ : \
+ : \
+ : "memory" )
+
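+/* WK0 extracts the low qword of XMM0 (W[t]+K[t]) into an MMX register and
+ * shuffles the high qword down so that the following WK1 round can fetch
+ * W[t+1]+K[t+1] from the same XMM register. */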
+#define WK0(tmp) "movdq2q %%xmm0, "tmp";\n\t" \
+ "pshufd $0xee, %%xmm0, %%xmm0;\n\t"
+
+#define WK1(tmp) "movdq2q %%xmm0, "tmp";\n\t"
+
+/* Message expansion */
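+/* I2 loads and byte-swaps two input qwords (rounds t < 16); W2 derives two
+ * new schedule qwords (rounds t >= 16).  Both store W[t] into w[] and leave
+ * W[t]+K[t] in XMM0 for the two following Rx rounds (WK0, then WK1). */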
+#define I2(i) \
+ asm volatile ("movdqu %[inbuf], %%xmm0;\n\t" \
+ "pshufb %%xmm6, %%xmm0;\n\t" \
+ "movdqu %%xmm0, %[w];\n\t" \
+ "paddq %[k], %%xmm0;\n\t" \
+ : \
+ : [k] "m" (K[i]), \
+ [w] "m" (w[i]), \
+ [inbuf] "m" (data[(i)*8]) \
+ : "memory" )
+
+#define W2(i) \
+ asm volatile ("movdqu %[w_t_m_2], %%xmm2;\n\t" \
+ "movdqa %%xmm2, %%xmm0;\n\t" \
+ "movdqu %[w_t_m_15], %%xmm5;\n\t" \
+ : \
+ : [w_t_m_2] "m" (w[(i)-2]), \
+ [w_t_m_15] "m" (w[(i)-15]) \
+ : "memory" ); \
+ asm volatile ("movdqa %%xmm5, %%xmm3;\n\t" \
+ "psrlq $(61-19), %%xmm0;\n\t" \
+ "psrlq $(8-7), %%xmm3;\n\t" \
+ "pxor %%xmm2, %%xmm0;\n\t" \
+ "pxor %%xmm5, %%xmm3;\n\t" \
+ "psrlq $(19-6), %%xmm0;\n\t" \
+ "psrlq $(7-1), %%xmm3;\n\t" \
+ "pxor %%xmm2, %%xmm0;\n\t" \
+ "pxor %%xmm5, %%xmm3;\n\t" \
+ "psrlq $6, %%xmm0;\n\t" \
+ "psrlq $1, %%xmm3;\n\t" \
+ "movdqa %%xmm2, %%xmm1;\n\t" \
+ "movdqa %%xmm5, %%xmm4;\n\t" \
+ "psllq $(61-19), %%xmm1;\n\t" \
+ "psllq $(8-1), %%xmm4;\n\t" \
+ "pxor %%xmm2, %%xmm1;\n\t" \
+ "pxor %%xmm5, %%xmm4;\n\t" \
+ "psllq $(64-61), %%xmm1;\n\t" \
+ "psllq $(64-8), %%xmm4;\n\t" \
+ "pxor %%xmm1, %%xmm0;\n\t" \
+ "movdqu %[w_t_m_16], %%xmm2;\n\t" \
+ "pxor %%xmm4, %%xmm3;\n\t" \
+ "movdqu %[w_t_m_7], %%xmm1;\n\t" \
+ : \
+ : [w_t_m_7] "m" (w[(i)-7]), \
+ [w_t_m_16] "m" (w[(i)-16]) \
+ : "memory" ); \
+ asm volatile ("paddq %%xmm3, %%xmm0;\n\t" \
+ "paddq %%xmm2, %%xmm0;\n\t" \
+ "paddq %%xmm1, %%xmm0;\n\t" \
+ "movdqu %%xmm0, %[w_t_m_0];\n\t" \
+ "paddq %[k], %%xmm0;\n\t" \
+ : [w_t_m_0] "=m" (w[(i)-0]) \
+ : [k] "m" (K[i]) \
+ : "memory" )
+
+unsigned int ASM_FUNC_ATTR
+_gcry_sha512_transform_i386_ssse3(u64 state[8], const unsigned char *data,
+ size_t nblks)
+{
+ unsigned int t;
+ u64 w[80];
+
+ /* Load state to MMX registers. */
+ asm volatile ("movq 8*0(%[state]), "RA";\n\t"
+ "movq 8*1(%[state]), "RB";\n\t"
+ "movq 8*2(%[state]), "RC";\n\t"
+ "movq 8*3(%[state]), "RD";\n\t"
+ "movq 8*4(%[state]), "RE";\n\t"
+ "movq 8*5(%[state]), "RF";\n\t"
+ "movq 8*6(%[state]), "RG";\n\t"
+ "movq 8*7(%[state]), "RH";\n\t"
+ :
+ : [state] "r" (state)
+ : "memory" );
+
+ asm volatile ("movdqa %[bshuf_mask], %%xmm6;\n\t"
+ :
+ : [bshuf_mask] "m" (*bshuf_mask)
+ : "memory" );
+
+ while (nblks)
+ {
+ I2(0);
+ Rx(RA, RB, RC, RD, RE, RF, RG, RH, WK0);
+ Rx(RH, RA, RB, RC, RD, RE, RF, RG, WK1);
+ I2(2);
+ Rx(RG, RH, RA, RB, RC, RD, RE, RF, WK0);
+ Rx(RF, RG, RH, RA, RB, RC, RD, RE, WK1);
+ I2(4);
+ Rx(RE, RF, RG, RH, RA, RB, RC, RD, WK0);
+ Rx(RD, RE, RF, RG, RH, RA, RB, RC, WK1);
+ I2(6);
+ Rx(RC, RD, RE, RF, RG, RH, RA, RB, WK0);
+ Rx(RB, RC, RD, RE, RF, RG, RH, RA, WK1);
+ I2(8);
+ Rx(RA, RB, RC, RD, RE, RF, RG, RH, WK0);
+ Rx(RH, RA, RB, RC, RD, RE, RF, RG, WK1);
+ I2(10);
+ Rx(RG, RH, RA, RB, RC, RD, RE, RF, WK0);
+ Rx(RF, RG, RH, RA, RB, RC, RD, RE, WK1);
+ I2(12);
+ Rx(RE, RF, RG, RH, RA, RB, RC, RD, WK0);
+ Rx(RD, RE, RF, RG, RH, RA, RB, RC, WK1);
+ I2(14);
+ Rx(RC, RD, RE, RF, RG, RH, RA, RB, WK0);
+ Rx(RB, RC, RD, RE, RF, RG, RH, RA, WK1);
+ data += 128;
+
+ for (t = 16; t < 80; t += 16)
+ {
+ W2(t + 0);
+ Rx(RA, RB, RC, RD, RE, RF, RG, RH, WK0);
+ Rx(RH, RA, RB, RC, RD, RE, RF, RG, WK1);
+ W2(t + 2);
+ Rx(RG, RH, RA, RB, RC, RD, RE, RF, WK0);
+ Rx(RF, RG, RH, RA, RB, RC, RD, RE, WK1);
+ W2(t + 4);
+ Rx(RE, RF, RG, RH, RA, RB, RC, RD, WK0);
+ Rx(RD, RE, RF, RG, RH, RA, RB, RC, WK1);
+ W2(t + 6);
+ Rx(RC, RD, RE, RF, RG, RH, RA, RB, WK0);
+ Rx(RB, RC, RD, RE, RF, RG, RH, RA, WK1);
+ W2(t + 8);
+ Rx(RA, RB, RC, RD, RE, RF, RG, RH, WK0);
+ Rx(RH, RA, RB, RC, RD, RE, RF, RG, WK1);
+ W2(t + 10);
+ Rx(RG, RH, RA, RB, RC, RD, RE, RF, WK0);
+ Rx(RF, RG, RH, RA, RB, RC, RD, RE, WK1);
+ W2(t + 12);
+ Rx(RE, RF, RG, RH, RA, RB, RC, RD, WK0);
+ Rx(RD, RE, RF, RG, RH, RA, RB, RC, WK1);
+ W2(t + 14);
+ Rx(RC, RD, RE, RF, RG, RH, RA, RB, WK0);
+ Rx(RB, RC, RD, RE, RF, RG, RH, RA, WK1);
+ }
+
+ asm volatile ("paddq 8*0(%[state]), "RA";\n\t"
+ "paddq 8*1(%[state]), "RB";\n\t"
+ "paddq 8*2(%[state]), "RC";\n\t"
+ "paddq 8*3(%[state]), "RD";\n\t"
+ "paddq 8*4(%[state]), "RE";\n\t"
+ "paddq 8*5(%[state]), "RF";\n\t"
+ "paddq 8*6(%[state]), "RG";\n\t"
+ "paddq 8*7(%[state]), "RH";\n\t"
+ "movq "RA", 8*0(%[state]);\n\t"
+ "movq "RB", 8*1(%[state]);\n\t"
+ "movq "RC", 8*2(%[state]);\n\t"
+ "movq "RD", 8*3(%[state]);\n\t"
+ "movq "RE", 8*4(%[state]);\n\t"
+ "movq "RF", 8*5(%[state]);\n\t"
+ "movq "RG", 8*6(%[state]);\n\t"
+ "movq "RH", 8*7(%[state]);\n\t"
+ :
+ : [state] "r" (state)
+ : "memory" );
+
+ nblks--;
+ }
+
+ /* Clear registers */
+ asm volatile ("pxor %%xmm0, %%xmm0;\n\t"
+ "pxor %%xmm1, %%xmm1;\n\t"
+ "pxor %%xmm2, %%xmm2;\n\t"
+ "pxor %%xmm3, %%xmm3;\n\t"
+ "pxor %%xmm4, %%xmm4;\n\t"
+ "pxor %%xmm5, %%xmm5;\n\t"
+ "pxor %%xmm6, %%xmm6;\n\t"
+ "pxor %%mm0, %%mm0;\n\t"
+ "pxor %%mm1, %%mm1;\n\t"
+ "pxor %%mm2, %%mm2;\n\t"
+ "pxor %%mm3, %%mm3;\n\t"
+ "pxor %%mm4, %%mm4;\n\t"
+ "pxor %%mm5, %%mm5;\n\t"
+ "pxor %%mm6, %%mm6;\n\t"
+ "pxor %%mm7, %%mm7;\n\t"
+ "emms;\n\t"
+ :
+ :
+ : "memory" );
+
+ return sizeof(w);
+}
+
+#if __clang__
+# pragma clang attribute pop
+# pragma clang attribute pop
+#endif
+
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/sha512.c b/comm/third_party/libgcrypt/cipher/sha512.c
new file mode 100644
index 0000000000..bc4657a8b9
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sha512.c
@@ -0,0 +1,1316 @@
+/* sha512.c - SHA384 and SHA512 hash functions
+ * Copyright (C) 2003, 2008, 2009 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+/* Test vectors from FIPS-180-2:
+ *
+ * "abc"
+ * 384:
+ * CB00753F 45A35E8B B5A03D69 9AC65007 272C32AB 0EDED163
+ * 1A8B605A 43FF5BED 8086072B A1E7CC23 58BAECA1 34C825A7
+ * 512:
+ * DDAF35A1 93617ABA CC417349 AE204131 12E6FA4E 89A97EA2 0A9EEEE6 4B55D39A
+ * 2192992A 274FC1A8 36BA3C23 A3FEEBBD 454D4423 643CE80E 2A9AC94F A54CA49F
+ *
+ * "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu"
+ * 384:
+ * 09330C33 F71147E8 3D192FC7 82CD1B47 53111B17 3B3B05D2
+ * 2FA08086 E3B0F712 FCC7C71A 557E2DB9 66C3E9FA 91746039
+ * 512:
+ * 8E959B75 DAE313DA 8CF4F728 14FC143F 8F7779C6 EB9F7FA1 7299AEAD B6889018
+ * 501D289E 4900F7E4 331B99DE C4B5433A C7D329EE B6DD2654 5E96E55B 874BE909
+ *
+ * "a" x 1000000
+ * 384:
+ * 9D0E1809 716474CB 086E834E 310A4A1C ED149E9C 00F24852
+ * 7972CEC5 704C2A5B 07B8B3DC 38ECC4EB AE97DDD8 7F3D8985
+ * 512:
+ * E718483D 0CE76964 4E2E42C7 BC15B463 8E1F98B1 3B204428 5632A803 AFA973EB
+ * DE0FF244 877EA60A 4CB0432C E577C31B EB009C5C 2C49AA2E 4EADB217 AD8CC09B
+ */
+
+
+#include <config.h>
+#include <string.h>
+#include "g10lib.h"
+#include "bithelp.h"
+#include "bufhelp.h"
+#include "cipher.h"
+#include "hash-common.h"
+
+
+/* USE_ARM_NEON_ASM indicates whether to enable ARM NEON assembly code. */
+#undef USE_ARM_NEON_ASM
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_NEON)
+# define USE_ARM_NEON_ASM 1
+# endif
+#endif /*ENABLE_NEON_SUPPORT*/
+
+
+/* USE_ARM_ASM indicates whether to enable ARM assembly code. */
+#undef USE_ARM_ASM
+#if defined(__ARMEL__) && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS)
+# define USE_ARM_ASM 1
+#endif
+
+
+/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
+#undef USE_SSSE3
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_SSSE3 1
+#endif
+
+
+/* USE_AVX indicates whether to compile with Intel AVX code. */
+#undef USE_AVX
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX 1
+#endif
+
+
+/* USE_AVX2 indicates whether to compile with Intel AVX2/rorx code. */
+#undef USE_AVX2
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \
+ defined(HAVE_GCC_INLINE_ASM_BMI2) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX2 1
+#endif
+
+
+/* USE_SSSE3_I386 indicates whether to compile with Intel SSSE3/i386 code. */
+#undef USE_SSSE3_I386
+#if defined(__i386__) && SIZEOF_UNSIGNED_LONG == 4 && __GNUC__ >= 4 && \
+ defined(HAVE_GCC_INLINE_ASM_SSSE3)
+# define USE_SSSE3_I386 1
+#endif
+
+
+/* USE_PPC_CRYPTO indicates whether to enable PowerPC vector crypto
+ * accelerated code. */
+#undef USE_PPC_CRYPTO
+#ifdef ENABLE_PPC_CRYPTO_SUPPORT
+# if defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+ defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC)
+# if __GNUC__ >= 4
+# define USE_PPC_CRYPTO 1
+# endif
+# endif
+#endif
+
+
+/* USE_S390X_CRYPTO indicates whether to enable zSeries code. */
+#undef USE_S390X_CRYPTO
+#if defined(HAVE_GCC_INLINE_ASM_S390X)
+# define USE_S390X_CRYPTO 1
+#endif /* USE_S390X_CRYPTO */
+
+
+typedef struct
+{
+ u64 h0, h1, h2, h3, h4, h5, h6, h7;
+} SHA512_STATE;
+
+typedef struct
+{
+ gcry_md_block_ctx_t bctx;
+ SHA512_STATE state;
+#ifdef USE_S390X_CRYPTO
+ u64 final_len_msb, final_len_lsb; /* needs to be right after state.h7. */
+ int use_s390x_crypto;
+#endif
+} SHA512_CONTEXT;
+
+
+static const u64 k[] =
+ {
+ U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd),
+ U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc),
+ U64_C(0x3956c25bf348b538), U64_C(0x59f111f1b605d019),
+ U64_C(0x923f82a4af194f9b), U64_C(0xab1c5ed5da6d8118),
+ U64_C(0xd807aa98a3030242), U64_C(0x12835b0145706fbe),
+ U64_C(0x243185be4ee4b28c), U64_C(0x550c7dc3d5ffb4e2),
+ U64_C(0x72be5d74f27b896f), U64_C(0x80deb1fe3b1696b1),
+ U64_C(0x9bdc06a725c71235), U64_C(0xc19bf174cf692694),
+ U64_C(0xe49b69c19ef14ad2), U64_C(0xefbe4786384f25e3),
+ U64_C(0x0fc19dc68b8cd5b5), U64_C(0x240ca1cc77ac9c65),
+ U64_C(0x2de92c6f592b0275), U64_C(0x4a7484aa6ea6e483),
+ U64_C(0x5cb0a9dcbd41fbd4), U64_C(0x76f988da831153b5),
+ U64_C(0x983e5152ee66dfab), U64_C(0xa831c66d2db43210),
+ U64_C(0xb00327c898fb213f), U64_C(0xbf597fc7beef0ee4),
+ U64_C(0xc6e00bf33da88fc2), U64_C(0xd5a79147930aa725),
+ U64_C(0x06ca6351e003826f), U64_C(0x142929670a0e6e70),
+ U64_C(0x27b70a8546d22ffc), U64_C(0x2e1b21385c26c926),
+ U64_C(0x4d2c6dfc5ac42aed), U64_C(0x53380d139d95b3df),
+ U64_C(0x650a73548baf63de), U64_C(0x766a0abb3c77b2a8),
+ U64_C(0x81c2c92e47edaee6), U64_C(0x92722c851482353b),
+ U64_C(0xa2bfe8a14cf10364), U64_C(0xa81a664bbc423001),
+ U64_C(0xc24b8b70d0f89791), U64_C(0xc76c51a30654be30),
+ U64_C(0xd192e819d6ef5218), U64_C(0xd69906245565a910),
+ U64_C(0xf40e35855771202a), U64_C(0x106aa07032bbd1b8),
+ U64_C(0x19a4c116b8d2d0c8), U64_C(0x1e376c085141ab53),
+ U64_C(0x2748774cdf8eeb99), U64_C(0x34b0bcb5e19b48a8),
+ U64_C(0x391c0cb3c5c95a63), U64_C(0x4ed8aa4ae3418acb),
+ U64_C(0x5b9cca4f7763e373), U64_C(0x682e6ff3d6b2b8a3),
+ U64_C(0x748f82ee5defb2fc), U64_C(0x78a5636f43172f60),
+ U64_C(0x84c87814a1f0ab72), U64_C(0x8cc702081a6439ec),
+ U64_C(0x90befffa23631e28), U64_C(0xa4506cebde82bde9),
+ U64_C(0xbef9a3f7b2c67915), U64_C(0xc67178f2e372532b),
+ U64_C(0xca273eceea26619c), U64_C(0xd186b8c721c0c207),
+ U64_C(0xeada7dd6cde0eb1e), U64_C(0xf57d4f7fee6ed178),
+ U64_C(0x06f067aa72176fba), U64_C(0x0a637dc5a2c898a6),
+ U64_C(0x113f9804bef90dae), U64_C(0x1b710b35131c471b),
+ U64_C(0x28db77f523047d84), U64_C(0x32caab7b40c72493),
+ U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c),
+ U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a),
+ U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817)
+ };
+
+
+/* AMD64 assembly implementations use the SystemV ABI; on Win64 an ABI
+ * conversion and additional stack space to store XMM6-XMM15 are needed. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_AVX2)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# define ASM_EXTRA_STACK (10 * 16 + 4 * sizeof(void *))
+# else
+# define ASM_FUNC_ABI
+# define ASM_EXTRA_STACK 0
+# endif
+#endif
+
+
+#ifdef USE_ARM_NEON_ASM
+unsigned int _gcry_sha512_transform_armv7_neon (SHA512_STATE *hd,
+ const unsigned char *data,
+ const u64 k[], size_t num_blks);
+
+static unsigned int
+do_sha512_transform_armv7_neon(void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA512_CONTEXT *hd = ctx;
+ return _gcry_sha512_transform_armv7_neon (&hd->state, data, k, nblks);
+}
+#endif
+
+#ifdef USE_SSSE3
+unsigned int _gcry_sha512_transform_amd64_ssse3(const void *input_data,
+ void *state,
+ size_t num_blks) ASM_FUNC_ABI;
+
+static unsigned int
+do_sha512_transform_amd64_ssse3(void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA512_CONTEXT *hd = ctx;
+ return _gcry_sha512_transform_amd64_ssse3 (data, &hd->state, nblks)
+ + ASM_EXTRA_STACK;
+}
+#endif
+
+#ifdef USE_AVX
+unsigned int _gcry_sha512_transform_amd64_avx(const void *input_data,
+ void *state,
+ size_t num_blks) ASM_FUNC_ABI;
+
+static unsigned int
+do_sha512_transform_amd64_avx(void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA512_CONTEXT *hd = ctx;
+ return _gcry_sha512_transform_amd64_avx (data, &hd->state, nblks)
+ + ASM_EXTRA_STACK;
+}
+#endif
+
+#ifdef USE_AVX2
+unsigned int _gcry_sha512_transform_amd64_avx2(const void *input_data,
+ void *state,
+ size_t num_blks) ASM_FUNC_ABI;
+
+static unsigned int
+do_sha512_transform_amd64_avx2(void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA512_CONTEXT *hd = ctx;
+ return _gcry_sha512_transform_amd64_avx2 (data, &hd->state, nblks)
+ + ASM_EXTRA_STACK;
+}
+#endif
+
+#ifdef USE_SSSE3_I386
+unsigned int _gcry_sha512_transform_i386_ssse3(u64 state[8],
+ const unsigned char *input_data,
+ size_t num_blks);
+
+static unsigned int
+do_sha512_transform_i386_ssse3(void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA512_CONTEXT *hd = ctx;
+ return _gcry_sha512_transform_i386_ssse3 (&hd->state.h0, data, nblks);
+}
+#endif
+
+
+#ifdef USE_ARM_ASM
+unsigned int _gcry_sha512_transform_arm (SHA512_STATE *hd,
+ const unsigned char *data,
+ const u64 k[], size_t num_blks);
+
+static unsigned int
+do_transform_generic (void *context, const unsigned char *data, size_t nblks)
+{
+ SHA512_CONTEXT *hd = context;
+ return _gcry_sha512_transform_arm (&hd->state, data, k, nblks);
+}
+#else
+static unsigned int
+do_transform_generic (void *context, const unsigned char *data, size_t nblks);
+#endif
+
+
+#ifdef USE_PPC_CRYPTO
+unsigned int _gcry_sha512_transform_ppc8(u64 state[8],
+ const unsigned char *input_data,
+ size_t num_blks);
+
+unsigned int _gcry_sha512_transform_ppc9(u64 state[8],
+ const unsigned char *input_data,
+ size_t num_blks);
+
+static unsigned int
+do_sha512_transform_ppc8(void *ctx, const unsigned char *data, size_t nblks)
+{
+ SHA512_CONTEXT *hd = ctx;
+ return _gcry_sha512_transform_ppc8 (&hd->state.h0, data, nblks);
+}
+
+static unsigned int
+do_sha512_transform_ppc9(void *ctx, const unsigned char *data, size_t nblks)
+{
+ SHA512_CONTEXT *hd = ctx;
+ return _gcry_sha512_transform_ppc9 (&hd->state.h0, data, nblks);
+}
+#endif
+
+
+#ifdef USE_S390X_CRYPTO
+#include "asm-inline-s390x.h"
+
+static unsigned int
+do_sha512_transform_s390x (void *ctx, const unsigned char *data, size_t nblks)
+{
+ SHA512_CONTEXT *hd = ctx;
+
+ kimd_execute (KMID_FUNCTION_SHA512, &hd->state.h0, data, nblks * 128);
+ return 0;
+}
+
+static unsigned int
+do_sha512_final_s390x (void *ctx, const unsigned char *data, size_t datalen,
+ u64 len_msb, u64 len_lsb)
+{
+ SHA512_CONTEXT *hd = ctx;
+
+  /* Make sure that 'final_len' is positioned at the correct offset relative
+   * to 'state.h0', because the 'state.h0' pointer is passed as the start of
+   * the parameter block to the 'klmd' instruction. */
+
+ gcry_assert (offsetof (SHA512_CONTEXT, final_len_msb)
+ - offsetof (SHA512_CONTEXT, state.h0) == 8 * sizeof(u64));
+ gcry_assert (offsetof (SHA512_CONTEXT, final_len_lsb)
+ - offsetof (SHA512_CONTEXT, final_len_msb) == 1 * sizeof(u64));
+
+ hd->final_len_msb = len_msb;
+ hd->final_len_lsb = len_lsb;
+
+ klmd_execute (KMID_FUNCTION_SHA512, &hd->state.h0, data, datalen);
+ return 0;
+}
+#endif
+
+
+static void
+sha512_init_common (SHA512_CONTEXT *ctx, unsigned int flags)
+{
+ unsigned int features = _gcry_get_hw_features ();
+
+ (void)flags;
+ (void)k;
+
+ ctx->bctx.nblocks = 0;
+ ctx->bctx.nblocks_high = 0;
+ ctx->bctx.count = 0;
+ ctx->bctx.blocksize_shift = _gcry_ctz(128);
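+  /* _gcry_ctz(128) == 7; the SHA-512 block size is 128 bytes. */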
+
+ /* Order of feature checks is important here; last match will be
+ * selected. Keep slower implementations at the top and faster at
+ * the bottom. */
+ ctx->bctx.bwrite = do_transform_generic;
+#ifdef USE_ARM_NEON_ASM
+ if ((features & HWF_ARM_NEON) != 0)
+ ctx->bctx.bwrite = do_sha512_transform_armv7_neon;
+#endif
+#ifdef USE_SSSE3
+ if ((features & HWF_INTEL_SSSE3) != 0)
+ ctx->bctx.bwrite = do_sha512_transform_amd64_ssse3;
+#endif
+#ifdef USE_AVX
+ if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD))
+ ctx->bctx.bwrite = do_sha512_transform_amd64_avx;
+#endif
+#ifdef USE_AVX2
+ if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2))
+ ctx->bctx.bwrite = do_sha512_transform_amd64_avx2;
+#endif
+#ifdef USE_PPC_CRYPTO
+ if ((features & HWF_PPC_VCRYPTO) != 0)
+ ctx->bctx.bwrite = do_sha512_transform_ppc8;
+ if ((features & HWF_PPC_VCRYPTO) != 0 && (features & HWF_PPC_ARCH_3_00) != 0)
+ ctx->bctx.bwrite = do_sha512_transform_ppc9;
+#endif
+#ifdef USE_SSSE3_I386
+ if ((features & HWF_INTEL_SSSE3) != 0)
+ ctx->bctx.bwrite = do_sha512_transform_i386_ssse3;
+#endif
+#ifdef USE_S390X_CRYPTO
+ ctx->use_s390x_crypto = 0;
+ if ((features & HWF_S390X_MSA) != 0)
+ {
+ if ((kimd_query () & km_function_to_mask (KMID_FUNCTION_SHA512)) &&
+ (klmd_query () & km_function_to_mask (KMID_FUNCTION_SHA512)))
+ {
+ ctx->bctx.bwrite = do_sha512_transform_s390x;
+ ctx->use_s390x_crypto = 1;
+ }
+ }
+#endif
+ (void)features;
+}
+
+
+static void
+sha512_init (void *context, unsigned int flags)
+{
+ SHA512_CONTEXT *ctx = context;
+ SHA512_STATE *hd = &ctx->state;
+
+ hd->h0 = U64_C(0x6a09e667f3bcc908);
+ hd->h1 = U64_C(0xbb67ae8584caa73b);
+ hd->h2 = U64_C(0x3c6ef372fe94f82b);
+ hd->h3 = U64_C(0xa54ff53a5f1d36f1);
+ hd->h4 = U64_C(0x510e527fade682d1);
+ hd->h5 = U64_C(0x9b05688c2b3e6c1f);
+ hd->h6 = U64_C(0x1f83d9abfb41bd6b);
+ hd->h7 = U64_C(0x5be0cd19137e2179);
+
+ sha512_init_common (ctx, flags);
+}
+
+static void
+sha384_init (void *context, unsigned int flags)
+{
+ SHA512_CONTEXT *ctx = context;
+ SHA512_STATE *hd = &ctx->state;
+
+ hd->h0 = U64_C(0xcbbb9d5dc1059ed8);
+ hd->h1 = U64_C(0x629a292a367cd507);
+ hd->h2 = U64_C(0x9159015a3070dd17);
+ hd->h3 = U64_C(0x152fecd8f70e5939);
+ hd->h4 = U64_C(0x67332667ffc00b31);
+ hd->h5 = U64_C(0x8eb44a8768581511);
+ hd->h6 = U64_C(0xdb0c2e0d64f98fa7);
+ hd->h7 = U64_C(0x47b5481dbefa4fa4);
+
+ sha512_init_common (ctx, flags);
+}
+
+
+static void
+sha512_256_init (void *context, unsigned int flags)
+{
+ SHA512_CONTEXT *ctx = context;
+ SHA512_STATE *hd = &ctx->state;
+
+ hd->h0 = U64_C(0x22312194fc2bf72c);
+ hd->h1 = U64_C(0x9f555fa3c84c64c2);
+ hd->h2 = U64_C(0x2393b86b6f53b151);
+ hd->h3 = U64_C(0x963877195940eabd);
+ hd->h4 = U64_C(0x96283ee2a88effe3);
+ hd->h5 = U64_C(0xbe5e1e2553863992);
+ hd->h6 = U64_C(0x2b0199fc2c85b8aa);
+ hd->h7 = U64_C(0x0eb72ddc81c52ca2);
+
+ sha512_init_common (ctx, flags);
+}
+
+
+static void
+sha512_224_init (void *context, unsigned int flags)
+{
+ SHA512_CONTEXT *ctx = context;
+ SHA512_STATE *hd = &ctx->state;
+
+ hd->h0 = U64_C(0x8c3d37c819544da2);
+ hd->h1 = U64_C(0x73e1996689dcd4d6);
+ hd->h2 = U64_C(0x1dfab7ae32ff9c82);
+ hd->h3 = U64_C(0x679dd514582f9fcf);
+ hd->h4 = U64_C(0x0f6d2b697bd44da8);
+ hd->h5 = U64_C(0x77e36f7304c48942);
+ hd->h6 = U64_C(0x3f9d85a86a1d36c8);
+ hd->h7 = U64_C(0x1112e6ad91d692a1);
+
+ sha512_init_common (ctx, flags);
+}
+
+
+
+#ifndef USE_ARM_ASM
+
+static inline u64
+ROTR (u64 x, u64 n)
+{
+ return ((x >> n) | (x << (64 - n)));
+}
+
+static inline u64
+Ch (u64 x, u64 y, u64 z)
+{
+ return ((x & y) ^ ( ~x & z));
+}
+
+static inline u64
+Maj (u64 x, u64 y, u64 z)
+{
+ return ((x & y) ^ (x & z) ^ (y & z));
+}
+
+static inline u64
+Sum0 (u64 x)
+{
+ return (ROTR (x, 28) ^ ROTR (x, 34) ^ ROTR (x, 39));
+}
+
+static inline u64
+Sum1 (u64 x)
+{
+ return (ROTR (x, 14) ^ ROTR (x, 18) ^ ROTR (x, 41));
+}
+
+/****************
+ * Transform the message data, which consists of blocks of 16 64-bit words.
+ */
+static unsigned int
+do_transform_generic (void *context, const unsigned char *data, size_t nblks)
+{
+ SHA512_CONTEXT *ctx = context;
+ SHA512_STATE *hd = &ctx->state;
+
+ do
+ {
+ u64 a, b, c, d, e, f, g, h;
+ u64 w[16];
+ int t;
+
+ /* get values from the chaining vars */
+ a = hd->h0;
+ b = hd->h1;
+ c = hd->h2;
+ d = hd->h3;
+ e = hd->h4;
+ f = hd->h5;
+ g = hd->h6;
+ h = hd->h7;
+
+ for ( t = 0; t < 16; t++ )
+ w[t] = buf_get_be64(data + t * 8);
+
+#define S0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
+#define S1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+
+ for (t = 0; t < 80 - 16; )
+ {
+ u64 t1, t2;
+
+ t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0];
+ w[0] += S1 (w[14]) + w[9] + S0 (w[1]);
+ t2 = Sum0 (a) + Maj (a, b, c);
+ d += t1;
+ h = t1 + t2;
+
+ t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1];
+ w[1] += S1 (w[15]) + w[10] + S0 (w[2]);
+ t2 = Sum0 (h) + Maj (h, a, b);
+ c += t1;
+ g = t1 + t2;
+
+ t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2];
+ w[2] += S1 (w[0]) + w[11] + S0 (w[3]);
+ t2 = Sum0 (g) + Maj (g, h, a);
+ b += t1;
+ f = t1 + t2;
+
+ t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3];
+ w[3] += S1 (w[1]) + w[12] + S0 (w[4]);
+ t2 = Sum0 (f) + Maj (f, g, h);
+ a += t1;
+ e = t1 + t2;
+
+ t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4];
+ w[4] += S1 (w[2]) + w[13] + S0 (w[5]);
+ t2 = Sum0 (e) + Maj (e, f, g);
+ h += t1;
+ d = t1 + t2;
+
+ t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5];
+ w[5] += S1 (w[3]) + w[14] + S0 (w[6]);
+ t2 = Sum0 (d) + Maj (d, e, f);
+ g += t1;
+ c = t1 + t2;
+
+ t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6];
+ w[6] += S1 (w[4]) + w[15] + S0 (w[7]);
+ t2 = Sum0 (c) + Maj (c, d, e);
+ f += t1;
+ b = t1 + t2;
+
+ t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[7];
+ w[7] += S1 (w[5]) + w[0] + S0 (w[8]);
+ t2 = Sum0 (b) + Maj (b, c, d);
+ e += t1;
+ a = t1 + t2;
+
+ t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8];
+ w[8] += S1 (w[6]) + w[1] + S0 (w[9]);
+ t2 = Sum0 (a) + Maj (a, b, c);
+ d += t1;
+ h = t1 + t2;
+
+ t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9];
+ w[9] += S1 (w[7]) + w[2] + S0 (w[10]);
+ t2 = Sum0 (h) + Maj (h, a, b);
+ c += t1;
+ g = t1 + t2;
+
+ t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10];
+ w[10] += S1 (w[8]) + w[3] + S0 (w[11]);
+ t2 = Sum0 (g) + Maj (g, h, a);
+ b += t1;
+ f = t1 + t2;
+
+ t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11];
+ w[11] += S1 (w[9]) + w[4] + S0 (w[12]);
+ t2 = Sum0 (f) + Maj (f, g, h);
+ a += t1;
+ e = t1 + t2;
+
+ t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12];
+ w[12] += S1 (w[10]) + w[5] + S0 (w[13]);
+ t2 = Sum0 (e) + Maj (e, f, g);
+ h += t1;
+ d = t1 + t2;
+
+ t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13];
+ w[13] += S1 (w[11]) + w[6] + S0 (w[14]);
+ t2 = Sum0 (d) + Maj (d, e, f);
+ g += t1;
+ c = t1 + t2;
+
+ t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14];
+ w[14] += S1 (w[12]) + w[7] + S0 (w[15]);
+ t2 = Sum0 (c) + Maj (c, d, e);
+ f += t1;
+ b = t1 + t2;
+
+ t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15];
+ w[15] += S1 (w[13]) + w[8] + S0 (w[0]);
+ t2 = Sum0 (b) + Maj (b, c, d);
+ e += t1;
+ a = t1 + t2;
+
+ t += 16;
+ }
+
+ for (; t < 80; )
+ {
+ u64 t1, t2;
+
+ t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0];
+ t2 = Sum0 (a) + Maj (a, b, c);
+ d += t1;
+ h = t1 + t2;
+
+ t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1];
+ t2 = Sum0 (h) + Maj (h, a, b);
+ c += t1;
+ g = t1 + t2;
+
+ t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2];
+ t2 = Sum0 (g) + Maj (g, h, a);
+ b += t1;
+ f = t1 + t2;
+
+ t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3];
+ t2 = Sum0 (f) + Maj (f, g, h);
+ a += t1;
+ e = t1 + t2;
+
+ t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4];
+ t2 = Sum0 (e) + Maj (e, f, g);
+ h += t1;
+ d = t1 + t2;
+
+ t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5];
+ t2 = Sum0 (d) + Maj (d, e, f);
+ g += t1;
+ c = t1 + t2;
+
+ t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6];
+ t2 = Sum0 (c) + Maj (c, d, e);
+ f += t1;
+ b = t1 + t2;
+
+ t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[7];
+ t2 = Sum0 (b) + Maj (b, c, d);
+ e += t1;
+ a = t1 + t2;
+
+ t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8];
+ t2 = Sum0 (a) + Maj (a, b, c);
+ d += t1;
+ h = t1 + t2;
+
+ t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9];
+ t2 = Sum0 (h) + Maj (h, a, b);
+ c += t1;
+ g = t1 + t2;
+
+ t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10];
+ t2 = Sum0 (g) + Maj (g, h, a);
+ b += t1;
+ f = t1 + t2;
+
+ t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11];
+ t2 = Sum0 (f) + Maj (f, g, h);
+ a += t1;
+ e = t1 + t2;
+
+ t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12];
+ t2 = Sum0 (e) + Maj (e, f, g);
+ h += t1;
+ d = t1 + t2;
+
+ t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13];
+ t2 = Sum0 (d) + Maj (d, e, f);
+ g += t1;
+ c = t1 + t2;
+
+ t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14];
+ t2 = Sum0 (c) + Maj (c, d, e);
+ f += t1;
+ b = t1 + t2;
+
+ t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15];
+ t2 = Sum0 (b) + Maj (b, c, d);
+ e += t1;
+ a = t1 + t2;
+
+ t += 16;
+ }
+
+ /* Update chaining vars. */
+ hd->h0 += a;
+ hd->h1 += b;
+ hd->h2 += c;
+ hd->h3 += d;
+ hd->h4 += e;
+ hd->h5 += f;
+ hd->h6 += g;
+ hd->h7 += h;
+
+ data += 128;
+ }
+ while (--nblks);
+
+ return (8 + 16) * sizeof(u64) + sizeof(u32) + 3 * sizeof(void*);
+}
+#endif /*!USE_ARM_ASM*/
+
+
+/* The final routine terminates the computation and
+ * returns the digest.
+ * The handle is prepared for a new cycle, but adding bytes to the
+ * handle will destroy the returned buffer.
+ * Returns: 64 bytes representing the digest. When used for sha384,
+ * we take the leftmost 48 of those bytes.
+ */
+
+static void
+sha512_final (void *context)
+{
+ SHA512_CONTEXT *hd = context;
+ unsigned int burn;
+ u64 t, th, msb, lsb;
+ byte *p;
+
+ t = hd->bctx.nblocks;
+ /* if (sizeof t == sizeof hd->bctx.nblocks) */
+ th = hd->bctx.nblocks_high;
+ /* else */
+ /* th = hd->bctx.nblocks >> 64; In case we ever use u128 */
+
+ /* multiply by 128 to make a byte count */
+ lsb = t << 7;
+ msb = (th << 7) | (t >> 57);
+ /* add the count */
+ t = lsb;
+ if ((lsb += hd->bctx.count) < t)
+ msb++;
+ /* multiply by 8 to make a bit count */
+ t = lsb;
+ lsb <<= 3;
+ msb <<= 3;
+ msb |= t >> 61;
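+  /* Example: nblocks = 2 and count = 5 gives lsb = 2*128 + 5 = 261 bytes,
+   * i.e. msb:lsb = 0:2088 bits after the shift by 3. */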
+
+ if (0)
+ { }
+#ifdef USE_S390X_CRYPTO
+ else if (hd->use_s390x_crypto)
+ {
+ burn = do_sha512_final_s390x (hd, hd->bctx.buf, hd->bctx.count, msb, lsb);
+ }
+#endif
+ else
+ {
+ if (hd->bctx.count < 112)
+ {
+ /* enough room */
+ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */
+ if (hd->bctx.count < 112)
+ memset (&hd->bctx.buf[hd->bctx.count], 0, 112 - hd->bctx.count);
+ }
+ else
+ {
+ /* need one extra block */
+ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */
+ if (hd->bctx.count < 128)
+ memset (&hd->bctx.buf[hd->bctx.count], 0, 128 - hd->bctx.count);
+ hd->bctx.count = 128;
+ _gcry_md_block_write (context, NULL, 0); /* flush */
+ memset (hd->bctx.buf, 0, 112); /* fill next block with zeroes */
+ }
+ /* append the 128 bit count */
+ buf_put_be64(hd->bctx.buf + 112, msb);
+ buf_put_be64(hd->bctx.buf + 120, lsb);
+ burn = (*hd->bctx.bwrite) (hd, hd->bctx.buf, 1);
+ }
+
+ p = hd->bctx.buf;
+#define X(a) do { buf_put_be64(p, hd->state.h##a); p += 8; } while (0)
+ X (0);
+ X (1);
+ X (2);
+ X (3);
+ X (4);
+ X (5);
+ /* Note that these last two chunks are included even for SHA384.
+ We just ignore them. */
+ X (6);
+ X (7);
+#undef X
+
+ hd->bctx.count = 0;
+
+ _gcry_burn_stack (burn);
+}
+
+static byte *
+sha512_read (void *context)
+{
+ SHA512_CONTEXT *hd = (SHA512_CONTEXT *) context;
+ return hd->bctx.buf;
+}
+
+
+/* Shortcut function which puts the hash value of the supplied buffer
+ * into outbuf, which must have a size of 64 bytes. */
+void
+_gcry_sha512_hash_buffer (void *outbuf, const void *buffer, size_t length)
+{
+ SHA512_CONTEXT hd;
+
+ sha512_init (&hd, 0);
+ _gcry_md_block_write (&hd, buffer, length);
+ sha512_final (&hd);
+ memcpy (outbuf, hd.bctx.buf, 64);
+}
+
+
+/* Variant of the above shortcut function using multiple buffers. */
+void
+_gcry_sha512_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt)
+{
+ SHA512_CONTEXT hd;
+
+ sha512_init (&hd, 0);
+ for (;iovcnt > 0; iov++, iovcnt--)
+ _gcry_md_block_write (&hd,
+ (const char*)iov[0].data + iov[0].off, iov[0].len);
+ sha512_final (&hd);
+ memcpy (outbuf, hd.bctx.buf, 64);
+}
+
+
+
+/* Shortcut function which puts the hash value of the supplied buffer
+ * into outbuf, which must have a size of 48 bytes. */
+static void
+_gcry_sha384_hash_buffer (void *outbuf, const void *buffer, size_t length)
+{
+ SHA512_CONTEXT hd;
+
+ sha384_init (&hd, 0);
+ _gcry_md_block_write (&hd, buffer, length);
+ sha512_final (&hd);
+ memcpy (outbuf, hd.bctx.buf, 48);
+}
+
+
+/* Variant of the above shortcut function using multiple buffers. */
+static void
+_gcry_sha384_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt)
+{
+ SHA512_CONTEXT hd;
+
+ sha384_init (&hd, 0);
+ for (;iovcnt > 0; iov++, iovcnt--)
+ _gcry_md_block_write (&hd,
+ (const char*)iov[0].data + iov[0].off, iov[0].len);
+ sha512_final (&hd);
+ memcpy (outbuf, hd.bctx.buf, 48);
+}
+
+
+
+/* Shortcut function which puts the hash value of the supplied buffer
+ * into outbuf, which must have a size of 32 bytes. */
+static void
+_gcry_sha512_256_hash_buffer (void *outbuf, const void *buffer, size_t length)
+{
+ SHA512_CONTEXT hd;
+
+ sha512_256_init (&hd, 0);
+ _gcry_md_block_write (&hd, buffer, length);
+ sha512_final (&hd);
+ memcpy (outbuf, hd.bctx.buf, 32);
+}
+
+
+/* Variant of the above shortcut function using multiple buffers. */
+static void
+_gcry_sha512_256_hash_buffers (void *outbuf, const gcry_buffer_t *iov,
+ int iovcnt)
+{
+ SHA512_CONTEXT hd;
+
+ sha512_256_init (&hd, 0);
+ for (;iovcnt > 0; iov++, iovcnt--)
+ _gcry_md_block_write (&hd,
+ (const char*)iov[0].data + iov[0].off, iov[0].len);
+ sha512_final (&hd);
+ memcpy (outbuf, hd.bctx.buf, 32);
+}
+
+
+
+/* Shortcut function which puts the hash value of the supplied buffer
+ * into outbuf, which must have a size of 28 bytes. */
+static void
+_gcry_sha512_224_hash_buffer (void *outbuf, const void *buffer, size_t length)
+{
+ SHA512_CONTEXT hd;
+
+ sha512_224_init (&hd, 0);
+ _gcry_md_block_write (&hd, buffer, length);
+ sha512_final (&hd);
+ memcpy (outbuf, hd.bctx.buf, 28);
+}
+
+
+/* Variant of the above shortcut function using multiple buffers. */
+static void
+_gcry_sha512_224_hash_buffers (void *outbuf, const gcry_buffer_t *iov,
+ int iovcnt)
+{
+ SHA512_CONTEXT hd;
+
+ sha512_224_init (&hd, 0);
+ for (;iovcnt > 0; iov++, iovcnt--)
+ _gcry_md_block_write (&hd,
+ (const char*)iov[0].data + iov[0].off, iov[0].len);
+ sha512_final (&hd);
+ memcpy (outbuf, hd.bctx.buf, 28);
+}
+
+
+
+/*
+ Self-test section.
+ */
+
+
+static gpg_err_code_t
+selftests_sha384 (int extended, selftest_report_func_t report)
+{
+ const char *what;
+ const char *errtxt;
+
+ what = "short string";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA384, 0,
+ "abc", 3,
+ "\xcb\x00\x75\x3f\x45\xa3\x5e\x8b\xb5\xa0\x3d\x69\x9a\xc6\x50\x07"
+ "\x27\x2c\x32\xab\x0e\xde\xd1\x63\x1a\x8b\x60\x5a\x43\xff\x5b\xed"
+ "\x80\x86\x07\x2b\xa1\xe7\xcc\x23\x58\xba\xec\xa1\x34\xc8\x25\xa7", 48);
+ if (errtxt)
+ goto failed;
+
+ if (extended)
+ {
+ what = "long string";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA384, 0,
+ "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmn"
+ "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", 112,
+ "\x09\x33\x0C\x33\xF7\x11\x47\xE8\x3D\x19\x2F\xC7\x82\xCD\x1B\x47"
+ "\x53\x11\x1B\x17\x3B\x3B\x05\xD2\x2F\xA0\x80\x86\xE3\xB0\xF7\x12"
+ "\xFC\xC7\xC7\x1A\x55\x7E\x2D\xB9\x66\xC3\xE9\xFA\x91\x74\x60\x39",
+ 48);
+ if (errtxt)
+ goto failed;
+
+ what = "one million \"a\"";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA384, 1,
+ NULL, 0,
+ "\x9D\x0E\x18\x09\x71\x64\x74\xCB\x08\x6E\x83\x4E\x31\x0A\x4A\x1C"
+ "\xED\x14\x9E\x9C\x00\xF2\x48\x52\x79\x72\xCE\xC5\x70\x4C\x2A\x5B"
+ "\x07\xB8\xB3\xDC\x38\xEC\xC4\xEB\xAE\x97\xDD\xD8\x7F\x3D\x89\x85",
+ 48);
+ if (errtxt)
+ goto failed;
+ }
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("digest", GCRY_MD_SHA384, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+static gpg_err_code_t
+selftests_sha512 (int extended, selftest_report_func_t report)
+{
+ const char *what;
+ const char *errtxt;
+
+ what = "short string";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA512, 0,
+ "abc", 3,
+ "\xDD\xAF\x35\xA1\x93\x61\x7A\xBA\xCC\x41\x73\x49\xAE\x20\x41\x31"
+ "\x12\xE6\xFA\x4E\x89\xA9\x7E\xA2\x0A\x9E\xEE\xE6\x4B\x55\xD3\x9A"
+ "\x21\x92\x99\x2A\x27\x4F\xC1\xA8\x36\xBA\x3C\x23\xA3\xFE\xEB\xBD"
+ "\x45\x4D\x44\x23\x64\x3C\xE8\x0E\x2A\x9A\xC9\x4F\xA5\x4C\xA4\x9F", 64);
+ if (errtxt)
+ goto failed;
+
+ if (extended)
+ {
+ what = "long string";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA512, 0,
+ "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmn"
+ "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", 112,
+ "\x8E\x95\x9B\x75\xDA\xE3\x13\xDA\x8C\xF4\xF7\x28\x14\xFC\x14\x3F"
+ "\x8F\x77\x79\xC6\xEB\x9F\x7F\xA1\x72\x99\xAE\xAD\xB6\x88\x90\x18"
+ "\x50\x1D\x28\x9E\x49\x00\xF7\xE4\x33\x1B\x99\xDE\xC4\xB5\x43\x3A"
+ "\xC7\xD3\x29\xEE\xB6\xDD\x26\x54\x5E\x96\xE5\x5B\x87\x4B\xE9\x09",
+ 64);
+ if (errtxt)
+ goto failed;
+
+ what = "one million \"a\"";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA512, 1,
+ NULL, 0,
+ "\xE7\x18\x48\x3D\x0C\xE7\x69\x64\x4E\x2E\x42\xC7\xBC\x15\xB4\x63"
+ "\x8E\x1F\x98\xB1\x3B\x20\x44\x28\x56\x32\xA8\x03\xAF\xA9\x73\xEB"
+ "\xDE\x0F\xF2\x44\x87\x7E\xA6\x0A\x4C\xB0\x43\x2C\xE5\x77\xC3\x1B"
+ "\xEB\x00\x9C\x5C\x2C\x49\xAA\x2E\x4E\xAD\xB2\x17\xAD\x8C\xC0\x9B",
+ 64);
+ if (errtxt)
+ goto failed;
+ }
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("digest", GCRY_MD_SHA512, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+static gpg_err_code_t
+selftests_sha512_224 (int extended, selftest_report_func_t report)
+{
+ const char *what;
+ const char *errtxt;
+
+ what = "short string";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA512_224, 0,
+ "abc", 3,
+ "\x46\x34\x27\x0F\x70\x7B\x6A\x54\xDA\xAE\x75\x30\x46\x08\x42\xE2"
+ "\x0E\x37\xED\x26\x5C\xEE\xE9\xA4\x3E\x89\x24\xAA",
+ 28);
+ if (errtxt)
+ goto failed;
+
+ if (extended)
+ {
+ what = "long string";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA512_224, 0,
+ "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmn"
+ "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", 112,
+ "\x23\xFE\xC5\xBB\x94\xD6\x0B\x23\x30\x81\x92\x64\x0B\x0C\x45\x33"
+ "\x35\xD6\x64\x73\x4F\xE4\x0E\x72\x68\x67\x4A\xF9",
+ 28);
+ if (errtxt)
+ goto failed;
+
+ what = "one million \"a\"";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA512_224, 1,
+ NULL, 0,
+ "\x37\xab\x33\x1d\x76\xf0\xd3\x6d\xe4\x22\xbd\x0e\xde\xb2\x2a\x28"
+ "\xac\xcd\x48\x7b\x7a\x84\x53\xae\x96\x5d\xd2\x87",
+ 28);
+ if (errtxt)
+ goto failed;
+ }
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("digest", GCRY_MD_SHA512_224, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+static gpg_err_code_t
+selftests_sha512_256 (int extended, selftest_report_func_t report)
+{
+ const char *what;
+ const char *errtxt;
+
+ what = "short string";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA512_256, 0,
+ "abc", 3,
+ "\x53\x04\x8E\x26\x81\x94\x1E\xF9\x9B\x2E\x29\xB7\x6B\x4C\x7D\xAB"
+ "\xE4\xC2\xD0\xC6\x34\xFC\x6D\x46\xE0\xE2\xF1\x31\x07\xE7\xAF\x23",
+ 32);
+ if (errtxt)
+ goto failed;
+
+ if (extended)
+ {
+ what = "long string";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA512_256, 0,
+ "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmn"
+ "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", 112,
+ "\x39\x28\xE1\x84\xFB\x86\x90\xF8\x40\xDA\x39\x88\x12\x1D\x31\xBE"
+ "\x65\xCB\x9D\x3E\xF8\x3E\xE6\x14\x6F\xEA\xC8\x61\xE1\x9B\x56\x3A",
+ 32);
+ if (errtxt)
+ goto failed;
+
+ what = "one million \"a\"";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SHA512_256, 1,
+ NULL, 0,
+ "\x9a\x59\xa0\x52\x93\x01\x87\xa9\x70\x38\xca\xe6\x92\xf3\x07\x08"
+ "\xaa\x64\x91\x92\x3e\xf5\x19\x43\x94\xdc\x68\xd5\x6c\x74\xfb\x21",
+ 32);
+ if (errtxt)
+ goto failed;
+ }
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("digest", GCRY_MD_SHA512_256, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+/* Run a full self-test for ALGO and return 0 on success. */
+static gpg_err_code_t
+run_selftests (int algo, int extended, selftest_report_func_t report)
+{
+ gpg_err_code_t ec;
+
+ switch (algo)
+ {
+ case GCRY_MD_SHA384:
+ ec = selftests_sha384 (extended, report);
+ break;
+ case GCRY_MD_SHA512:
+ ec = selftests_sha512 (extended, report);
+ break;
+ case GCRY_MD_SHA512_224:
+ ec = selftests_sha512_224 (extended, report);
+ break;
+ case GCRY_MD_SHA512_256:
+ ec = selftests_sha512_256 (extended, report);
+ break;
+ default:
+ ec = GPG_ERR_DIGEST_ALGO;
+ break;
+
+ }
+ return ec;
+}
+
+
+
+
+static byte sha512_asn[] = /* Object ID is 2.16.840.1.101.3.4.2.3 */
+ {
+ 0x30, 0x51, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86,
+ 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x03, 0x05,
+ 0x00, 0x04, 0x40
+ };
+
+static gcry_md_oid_spec_t oid_spec_sha512[] =
+ {
+ { "2.16.840.1.101.3.4.2.3" },
+
+ /* PKCS#1 sha512WithRSAEncryption */
+ { "1.2.840.113549.1.1.13" },
+
+ { NULL }
+ };
+
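+/* The positional initializers in the digest specs below follow the
+ * gcry_md_spec_t layout (roughly): algorithm id, flags {disabled, fips},
+ * name, ASN.1 DER prefix and its length, OID list, digest length in
+ * bytes, then the init/write/final/read hooks, the extract hook (unused
+ * here), the one-shot hash_buffer and hash_buffers helpers, the context
+ * size, and the selftest callback. */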
+gcry_md_spec_t _gcry_digest_spec_sha512 =
+ {
+ GCRY_MD_SHA512, {0, 1},
+ "SHA512", sha512_asn, DIM (sha512_asn), oid_spec_sha512, 64,
+ sha512_init, _gcry_md_block_write, sha512_final, sha512_read, NULL,
+ _gcry_sha512_hash_buffer, _gcry_sha512_hash_buffers,
+ sizeof (SHA512_CONTEXT),
+ run_selftests
+ };
+
+static byte sha384_asn[] = /* Object ID is 2.16.840.1.101.3.4.2.2 */
+ {
+ 0x30, 0x41, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86,
+ 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x02, 0x05,
+ 0x00, 0x04, 0x30
+ };
+
+static gcry_md_oid_spec_t oid_spec_sha384[] =
+ {
+ { "2.16.840.1.101.3.4.2.2" },
+
+ /* PKCS#1 sha384WithRSAEncryption */
+ { "1.2.840.113549.1.1.12" },
+
+ /* SHA384WithECDSA: RFC 7427 (A.3.3.) */
+ { "1.2.840.10045.4.3.3" },
+
+ { NULL },
+ };
+
+gcry_md_spec_t _gcry_digest_spec_sha384 =
+ {
+ GCRY_MD_SHA384, {0, 1},
+ "SHA384", sha384_asn, DIM (sha384_asn), oid_spec_sha384, 48,
+ sha384_init, _gcry_md_block_write, sha512_final, sha512_read, NULL,
+ _gcry_sha384_hash_buffer, _gcry_sha384_hash_buffers,
+ sizeof (SHA512_CONTEXT),
+ run_selftests
+ };
+
+static byte sha512_256_asn[] = { 0x30 };
+
+static gcry_md_oid_spec_t oid_spec_sha512_256[] =
+ {
+ { "2.16.840.1.101.3.4.2.6" },
+
+ { NULL },
+ };
+
+gcry_md_spec_t _gcry_digest_spec_sha512_256 =
+ {
+ GCRY_MD_SHA512_256, {0, 1},
+ "SHA512_256", sha512_256_asn, DIM (sha512_256_asn), oid_spec_sha512_256, 32,
+ sha512_256_init, _gcry_md_block_write, sha512_final, sha512_read, NULL,
+ _gcry_sha512_256_hash_buffer, _gcry_sha512_256_hash_buffers,
+ sizeof (SHA512_CONTEXT),
+ run_selftests
+ };
+
+static byte sha512_224_asn[] = { 0x30 };
+
+static gcry_md_oid_spec_t oid_spec_sha512_224[] =
+ {
+ { "2.16.840.1.101.3.4.2.5" },
+
+ { NULL },
+ };
+
+gcry_md_spec_t _gcry_digest_spec_sha512_224 =
+ {
+ GCRY_MD_SHA512_224, {0, 1},
+ "SHA512_224", sha512_224_asn, DIM (sha512_224_asn), oid_spec_sha512_224, 28,
+ sha512_224_init, _gcry_md_block_write, sha512_final, sha512_read, NULL,
+ _gcry_sha512_224_hash_buffer, _gcry_sha512_224_hash_buffers,
+ sizeof (SHA512_CONTEXT),
+ run_selftests
+ };
diff --git a/comm/third_party/libgcrypt/cipher/sm3.c b/comm/third_party/libgcrypt/cipher/sm3.c
new file mode 100644
index 0000000000..0f9bae3bf5
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sm3.c
@@ -0,0 +1,473 @@
+/* sm3.c - SM3 hash function
+ * Copyright (C) 2017 Jia Zhang
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+/* Test vectors:
+
+ "abc"
+ SM3: 66c7f0f4 62eeedd9 d1f2d46b dc10e4e2 4167c487 5cf2f7a2 297da02b 8f4ba8e0
+
+ "abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd"
+ SM3: debe9ff9 2275b8a1 38604889 c18e5a4d 6fdb70e5 387e5765 293dcba3 9c0c5732
+
+ "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
+ SM3: 639b6cc5 e64d9e37 a390b192 df4fa1ea 0720ab74 7ff692b9 f38c4e66 ad7b8c05
+
+ "a" one million times
+ SM3: c8aaf894 29554029 e231941a 2acc0ad6 1ff2a5ac d8fadd25 847a3a73 2b3b02c3
+
+ */
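+
+/* For a quick cross-check, the "abc" vector above can be reproduced
+   through the public API (illustrative sketch only):
+
+     unsigned char digest[32];
+     gcry_md_hash_buffer (GCRY_MD_SM3, digest, "abc", 3);
+
+   which should yield 66c7f0f4 62eeedd9 ... 8f4ba8e0.  */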
+
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+#include "bithelp.h"
+#include "bufhelp.h"
+#include "cipher.h"
+#include "hash-common.h"
+
+
+typedef struct {
+ gcry_md_block_ctx_t bctx;
+ u32 h0,h1,h2,h3,h4,h5,h6,h7;
+} SM3_CONTEXT;
+
+
+static unsigned int
+transform (void *c, const unsigned char *data, size_t nblks);
+
+
+static void
+sm3_init (void *context, unsigned int flags)
+{
+ SM3_CONTEXT *hd = context;
+ unsigned int features = _gcry_get_hw_features ();
+
+ (void)flags;
+
+ hd->h0 = 0x7380166f;
+ hd->h1 = 0x4914b2b9;
+ hd->h2 = 0x172442d7;
+ hd->h3 = 0xda8a0600;
+ hd->h4 = 0xa96f30bc;
+ hd->h5 = 0x163138aa;
+ hd->h6 = 0xe38dee4d;
+ hd->h7 = 0xb0fb0e4e;
+
+ hd->bctx.nblocks = 0;
+ hd->bctx.nblocks_high = 0;
+ hd->bctx.count = 0;
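+  /* The 64-byte block size is stored as a shift count (_gcry_ctz(64) == 6). */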
+ hd->bctx.blocksize_shift = _gcry_ctz(64);
+ hd->bctx.bwrite = transform;
+
+ (void)features;
+}
+
+
+/*
+  Transform the message X which consists of 16 32-bit words.  See
+  GM/T 0004-2012 for details.  */
+#define R(i,a,b,c,d,e,f,g,h,t,w1,w2) do \
+ { \
+ ss1 = rol ((rol ((a), 12) + (e) + (t)), 7); \
+ ss2 = ss1 ^ rol ((a), 12); \
+ d += FF##i(a,b,c) + ss2 + ((w1) ^ (w2)); \
+ h += GG##i(e,f,g) + ss1 + (w1); \
+ b = rol ((b), 9); \
+ f = rol ((f), 19); \
+ h = P0 ((h)); \
+ } while (0)
+
+#define R1(a,b,c,d,e,f,g,h,t,w1,w2) R(1,a,b,c,d,e,f,g,h,t,w1,w2)
+#define R2(a,b,c,d,e,f,g,h,t,w1,w2) R(2,a,b,c,d,e,f,g,h,t,w1,w2)
+
+#define FF1(x, y, z) (x ^ y ^ z)
+
+#define FF2(x, y, z) ((x & y) | (x & z) | (y & z))
+
+#define GG1(x, y, z) (x ^ y ^ z)
+
+#define GG2(x, y, z) ((x & y) | ( ~x & z))
+
+/* Permutations and message expansion */
+#define P0(x) ((x) ^ rol ((x), 9) ^ rol ((x), 17))
+#define P1(x) ((x) ^ rol ((x), 15) ^ rol ((x), 23))
+#define I(i) ( w[i] = buf_get_be32(data + i * 4) )
+#define W1(i) ( w[i&0x0f] )
+#define W2(i) ( w[i&0x0f] = P1(w[i &0x0f] \
+ ^ w[(i-9)&0x0f] \
+ ^ rol (w[(i-3)&0x0f], 15)) \
+ ^ rol (w[(i-13)&0x0f], 7) \
+ ^ w[(i-6)&0x0f] )
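+
+/* W1(i) reads an already expanded word from the 16-entry ring buffer,
+   while W2(i) expands a new one in place, implementing the schedule
+   W[j] = P1(W[j-16] ^ W[j-9] ^ rol(W[j-3], 15)) ^ rol(W[j-13], 7) ^ W[j-6]
+   from GM/T 0004-2012 with all indices reduced mod 16.  The (w1) ^ (w2)
+   term in R() forms W'[j] = W[j] ^ W[j+4] for the compression step.  */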
+
+static unsigned int
+transform_blk (void *ctx, const unsigned char *data)
+{
+ SM3_CONTEXT *hd = ctx;
+ static const u32 K[64] = {
+ 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb,
+ 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc,
+ 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce,
+ 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6,
+ 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c,
+ 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce,
+ 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec,
+ 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5,
+ 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53,
+ 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d,
+ 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4,
+ 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43,
+ 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c,
+ 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce,
+ 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec,
+ 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
+ };
+
+ u32 a,b,c,d,e,f,g,h,ss1,ss2;
+ u32 w[16];
+
+ a = hd->h0;
+ b = hd->h1;
+ c = hd->h2;
+ d = hd->h3;
+ e = hd->h4;
+ f = hd->h5;
+ g = hd->h6;
+ h = hd->h7;
+
+ R1(a, b, c, d, e, f, g, h, K[0], I(0), I(4));
+ R1(d, a, b, c, h, e, f, g, K[1], I(1), I(5));
+ R1(c, d, a, b, g, h, e, f, K[2], I(2), I(6));
+ R1(b, c, d, a, f, g, h, e, K[3], I(3), I(7));
+ R1(a, b, c, d, e, f, g, h, K[4], W1(4), I(8));
+ R1(d, a, b, c, h, e, f, g, K[5], W1(5), I(9));
+ R1(c, d, a, b, g, h, e, f, K[6], W1(6), I(10));
+ R1(b, c, d, a, f, g, h, e, K[7], W1(7), I(11));
+ R1(a, b, c, d, e, f, g, h, K[8], W1(8), I(12));
+ R1(d, a, b, c, h, e, f, g, K[9], W1(9), I(13));
+ R1(c, d, a, b, g, h, e, f, K[10], W1(10), I(14));
+ R1(b, c, d, a, f, g, h, e, K[11], W1(11), I(15));
+ R1(a, b, c, d, e, f, g, h, K[12], W1(12), W2(16));
+ R1(d, a, b, c, h, e, f, g, K[13], W1(13), W2(17));
+ R1(c, d, a, b, g, h, e, f, K[14], W1(14), W2(18));
+ R1(b, c, d, a, f, g, h, e, K[15], W1(15), W2(19));
+
+ R2(a, b, c, d, e, f, g, h, K[16], W1(16), W2(20));
+ R2(d, a, b, c, h, e, f, g, K[17], W1(17), W2(21));
+ R2(c, d, a, b, g, h, e, f, K[18], W1(18), W2(22));
+ R2(b, c, d, a, f, g, h, e, K[19], W1(19), W2(23));
+ R2(a, b, c, d, e, f, g, h, K[20], W1(20), W2(24));
+ R2(d, a, b, c, h, e, f, g, K[21], W1(21), W2(25));
+ R2(c, d, a, b, g, h, e, f, K[22], W1(22), W2(26));
+ R2(b, c, d, a, f, g, h, e, K[23], W1(23), W2(27));
+ R2(a, b, c, d, e, f, g, h, K[24], W1(24), W2(28));
+ R2(d, a, b, c, h, e, f, g, K[25], W1(25), W2(29));
+ R2(c, d, a, b, g, h, e, f, K[26], W1(26), W2(30));
+ R2(b, c, d, a, f, g, h, e, K[27], W1(27), W2(31));
+ R2(a, b, c, d, e, f, g, h, K[28], W1(28), W2(32));
+ R2(d, a, b, c, h, e, f, g, K[29], W1(29), W2(33));
+ R2(c, d, a, b, g, h, e, f, K[30], W1(30), W2(34));
+ R2(b, c, d, a, f, g, h, e, K[31], W1(31), W2(35));
+
+ R2(a, b, c, d, e, f, g, h, K[32], W1(32), W2(36));
+ R2(d, a, b, c, h, e, f, g, K[33], W1(33), W2(37));
+ R2(c, d, a, b, g, h, e, f, K[34], W1(34), W2(38));
+ R2(b, c, d, a, f, g, h, e, K[35], W1(35), W2(39));
+ R2(a, b, c, d, e, f, g, h, K[36], W1(36), W2(40));
+ R2(d, a, b, c, h, e, f, g, K[37], W1(37), W2(41));
+ R2(c, d, a, b, g, h, e, f, K[38], W1(38), W2(42));
+ R2(b, c, d, a, f, g, h, e, K[39], W1(39), W2(43));
+ R2(a, b, c, d, e, f, g, h, K[40], W1(40), W2(44));
+ R2(d, a, b, c, h, e, f, g, K[41], W1(41), W2(45));
+ R2(c, d, a, b, g, h, e, f, K[42], W1(42), W2(46));
+ R2(b, c, d, a, f, g, h, e, K[43], W1(43), W2(47));
+ R2(a, b, c, d, e, f, g, h, K[44], W1(44), W2(48));
+ R2(d, a, b, c, h, e, f, g, K[45], W1(45), W2(49));
+ R2(c, d, a, b, g, h, e, f, K[46], W1(46), W2(50));
+ R2(b, c, d, a, f, g, h, e, K[47], W1(47), W2(51));
+
+ R2(a, b, c, d, e, f, g, h, K[48], W1(48), W2(52));
+ R2(d, a, b, c, h, e, f, g, K[49], W1(49), W2(53));
+ R2(c, d, a, b, g, h, e, f, K[50], W1(50), W2(54));
+ R2(b, c, d, a, f, g, h, e, K[51], W1(51), W2(55));
+ R2(a, b, c, d, e, f, g, h, K[52], W1(52), W2(56));
+ R2(d, a, b, c, h, e, f, g, K[53], W1(53), W2(57));
+ R2(c, d, a, b, g, h, e, f, K[54], W1(54), W2(58));
+ R2(b, c, d, a, f, g, h, e, K[55], W1(55), W2(59));
+ R2(a, b, c, d, e, f, g, h, K[56], W1(56), W2(60));
+ R2(d, a, b, c, h, e, f, g, K[57], W1(57), W2(61));
+ R2(c, d, a, b, g, h, e, f, K[58], W1(58), W2(62));
+ R2(b, c, d, a, f, g, h, e, K[59], W1(59), W2(63));
+ R2(a, b, c, d, e, f, g, h, K[60], W1(60), W2(64));
+ R2(d, a, b, c, h, e, f, g, K[61], W1(61), W2(65));
+ R2(c, d, a, b, g, h, e, f, K[62], W1(62), W2(66));
+ R2(b, c, d, a, f, g, h, e, K[63], W1(63), W2(67));
+
+ hd->h0 ^= a;
+ hd->h1 ^= b;
+ hd->h2 ^= c;
+ hd->h3 ^= d;
+ hd->h4 ^= e;
+ hd->h5 ^= f;
+ hd->h6 ^= g;
+ hd->h7 ^= h;
+
+ return /*burn_stack*/ 26*4+32;
+}
+#undef P0
+#undef P1
+#undef R
+#undef R1
+#undef R2
+
+static unsigned int
+transform (void *ctx, const unsigned char *data, size_t nblks)
+{
+ SM3_CONTEXT *hd = ctx;
+ unsigned int burn;
+
+ do
+ {
+ burn = transform_blk (hd, data);
+ data += 64;
+ }
+ while (--nblks);
+
+ return burn;
+}
+
+
+/*
+   The routine finally terminates the computation and returns the
+   digest.  The handle is prepared for a new cycle, but adding bytes
+   to the handle will destroy the returned buffer.  Returns: 32
+   bytes with the message digest.  */
+static void
+sm3_final(void *context)
+{
+ SM3_CONTEXT *hd = context;
+ u32 t, th, msb, lsb;
+ byte *p;
+ unsigned int burn;
+
+ t = hd->bctx.nblocks;
+ if (sizeof t == sizeof hd->bctx.nblocks)
+ th = hd->bctx.nblocks_high;
+ else
+ th = hd->bctx.nblocks >> 32;
+
+ /* multiply by 64 to make a byte count */
+ lsb = t << 6;
+ msb = (th << 6) | (t >> 26);
+ /* add the count */
+ t = lsb;
+ if ((lsb += hd->bctx.count) < t)
+ msb++;
+ /* multiply by 8 to make a bit count */
+ t = lsb;
+ lsb <<= 3;
+ msb <<= 3;
+ msb |= t >> 29;
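+  /* Example: after one full 64-byte block with 3 bytes still buffered,
+     t = 1 gives lsb = 64, then lsb += 3 -> 67, and the shifts above
+     leave the 536-bit message length in msb:lsb. */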
+
+ if (hd->bctx.count < 56) /* enough room */
+ {
+ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */
+ if (hd->bctx.count < 56)
+ memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count);
+
+ /* append the 64 bit count */
+ buf_put_be32(hd->bctx.buf + 56, msb);
+ buf_put_be32(hd->bctx.buf + 60, lsb);
+ burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 1 );
+ }
+ else /* need one extra block */
+ {
+ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */
+ /* fill pad and next block with zeroes */
+ memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56);
+
+ /* append the 64 bit count */
+ buf_put_be32(hd->bctx.buf + 64 + 56, msb);
+ buf_put_be32(hd->bctx.buf + 64 + 60, lsb);
+ burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 2 );
+ }
+
+ p = hd->bctx.buf;
+#define X(a) do { buf_put_be32(p, hd->h##a); p += 4; } while(0)
+ X(0);
+ X(1);
+ X(2);
+ X(3);
+ X(4);
+ X(5);
+ X(6);
+ X(7);
+#undef X
+
+ hd->bctx.count = 0;
+
+ _gcry_burn_stack (burn);
+}
+
+static byte *
+sm3_read (void *context)
+{
+ SM3_CONTEXT *hd = context;
+
+ return hd->bctx.buf;
+}
+
+
+/* Shortcut function which puts the hash value of the supplied buffer
+ * into outbuf which must have a size of 32 bytes. */
+void
+_gcry_sm3_hash_buffer (void *outbuf, const void *buffer, size_t length)
+{
+ SM3_CONTEXT hd;
+
+ sm3_init (&hd, 0);
+ _gcry_md_block_write (&hd, buffer, length);
+ sm3_final (&hd);
+ memcpy (outbuf, hd.bctx.buf, 32);
+}
+
+
+/* Variant of the above shortcut function using multiple buffers. */
+void
+_gcry_sm3_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt)
+{
+ SM3_CONTEXT hd;
+
+ sm3_init (&hd, 0);
+ for (;iovcnt > 0; iov++, iovcnt--)
+ _gcry_md_block_write (&hd,
+ (const char*)iov[0].data + iov[0].off, iov[0].len);
+ sm3_final (&hd);
+ memcpy (outbuf, hd.bctx.buf, 32);
+}
+
+
+
+/*
+ Self-test section.
+ */
+
+
+static gpg_err_code_t
+selftests_sm3 (int extended, selftest_report_func_t report)
+{
+ const char *what;
+ const char *errtxt;
+
+ what = "short string (spec example 1)";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SM3, 0,
+ "abc", 3,
+ "\x66\xc7\xf0\xf4\x62\xee\xed\xd9\xd1\xf2\xd4\x6b\xdc\x10\xe4\xe2"
+ "\x41\x67\xc4\x87\x5c\xf2\xf7\xa2\x29\x7d\xa0\x2b\x8f\x4b\xa8\xe0", 32);
+ if (errtxt)
+ goto failed;
+
+ if (extended)
+ {
+ what = "long string (spec example 2)";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SM3, 0,
+ "abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd", 64,
+ "\xde\xbe\x9f\xf9\x22\x75\xb8\xa1\x38\x60\x48\x89\xc1\x8e\x5a\x4d"
+ "\x6f\xdb\x70\xe5\x38\x7e\x57\x65\x29\x3d\xcb\xa3\x9c\x0c\x57\x32",
+ 32);
+ if (errtxt)
+ goto failed;
+
+ what = "long string";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SM3, 0,
+ "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 56,
+ "\x63\x9b\x6c\xc5\xe6\x4d\x9e\x37\xa3\x90\xb1\x92\xdf\x4f\xa1\xea"
+ "\x07\x20\xab\x74\x7f\xf6\x92\xb9\xf3\x8c\x4e\x66\xad\x7b\x8c\x05",
+ 32);
+ if (errtxt)
+ goto failed;
+
+ what = "one million \"a\"";
+ errtxt = _gcry_hash_selftest_check_one
+ (GCRY_MD_SM3, 1,
+ NULL, 0,
+ "\xc8\xaa\xf8\x94\x29\x55\x40\x29\xe2\x31\x94\x1a\x2a\xcc\x0a\xd6"
+ "\x1f\xf2\xa5\xac\xd8\xfa\xdd\x25\x84\x7a\x3a\x73\x2b\x3b\x02\xc3",
+ 32);
+ if (errtxt)
+ goto failed;
+ }
+
+ return 0; /* Succeeded. */
+
+ failed:
+ if (report)
+ report ("digest", GCRY_MD_SM3, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+/* Run a full self-test for ALGO and return 0 on success. */
+static gpg_err_code_t
+run_selftests (int algo, int extended, selftest_report_func_t report)
+{
+ gpg_err_code_t ec;
+
+ switch (algo)
+ {
+ case GCRY_MD_SM3:
+ ec = selftests_sm3 (extended, report);
+ break;
+ default:
+ ec = GPG_ERR_DIGEST_ALGO;
+ break;
+
+ }
+ return ec;
+}
+
+static byte asn_sm3[] = /* Object ID is 1.2.156.10197.401 */
+ { 0x30, 0x2F, 0x30, 0x0B, 0x06, 0x07, 0x2A, 0x81,
+ 0x1C, 0xCF, 0x55, 0x83, 0x11, 0x05, 0x00, 0x04,
+ 0x20 };
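+
+/* The DER prefix above decodes as: SEQUENCE (47 bytes) { SEQUENCE
+   (11 bytes) { OID 1.2.156.10197.401, NULL }, OCTET STRING (32 bytes
+   follow) }, i.e. a DigestInfo header for a 32-byte SM3 value.  */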
+
+static gcry_md_oid_spec_t oid_spec_sm3[] =
+  {
+    /* China Electronics Standardization Institute,
+       OID White paper (2015), Table 6 */
+    { "1.2.156.10197.401" },
+ { NULL },
+ };
+
+gcry_md_spec_t _gcry_digest_spec_sm3 =
+ {
+ GCRY_MD_SM3, {0, 0},
+ "SM3", asn_sm3, DIM (asn_sm3), oid_spec_sm3, 32,
+ sm3_init, _gcry_md_block_write, sm3_final, sm3_read, NULL,
+ _gcry_sm3_hash_buffer, _gcry_sm3_hash_buffers,
+ sizeof (SM3_CONTEXT),
+ run_selftests
+ };
diff --git a/comm/third_party/libgcrypt/cipher/sm4-aesni-avx-amd64.S b/comm/third_party/libgcrypt/cipher/sm4-aesni-avx-amd64.S
new file mode 100644
index 0000000000..3610b98c67
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sm4-aesni-avx-amd64.S
@@ -0,0 +1,987 @@
+/* sm4-aesni-avx-amd64.S - AES-NI/AVX implementation of SM4 cipher
+ *
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* Based on SM4 AES-NI work by Markku-Juhani O. Saarinen at:
+ * https://github.com/mjosaarinen/sm4ni
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+/* vector registers */
+#define RX0 %xmm0
+#define RX1 %xmm1
+#define MASK_4BIT %xmm2
+#define RTMP0 %xmm3
+#define RTMP1 %xmm4
+#define RTMP2 %xmm5
+#define RTMP3 %xmm6
+#define RTMP4 %xmm7
+
+#define RA0 %xmm8
+#define RA1 %xmm9
+#define RA2 %xmm10
+#define RA3 %xmm11
+
+#define RB0 %xmm12
+#define RB1 %xmm13
+#define RB2 %xmm14
+#define RB3 %xmm15
+
+#define RNOT %xmm0
+#define RBSWAP %xmm1
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+
+/* Transpose four 32-bit words between 128-bit vectors. */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+/* pre-SubByte transform. */
+#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
+ * the 'vaesenclast' instruction. */
+#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpandn mask4bit, x, tmp0; \
+ vpsrld $4, x, x; \
+ vpand x, mask4bit, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
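+
+/* Taken together, transform_pre, AESENCLAST and transform_post expose
+ * the SM4 S-box through AES-NI: the pre-affine step maps SM4 field
+ * elements into the AES field, AESENCLAST performs SubBytes (its
+ * ShiftRows effect is undone later by the .Linv_shift_row* shuffles
+ * and its round-key XOR with MASK_4BIT is compensated in
+ * transform_post), and the post-affine step maps the result back.
+ * See the sm4ni reference above. */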
+
+/**********************************************************************
+ 4-way && 8-way SM4 with AES-NI and AVX
+ **********************************************************************/
+
+.text
+.align 16
+
+/*
+ * The following four affine transform look-up tables are from work by
+ * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
+ *
+ * These allow exposing SM4 S-Box from AES SubByte.
+ */
+
+/* pre-SubByte affine transform, from SM4 field to AES field. */
+.Lpre_tf_lo_s:
+ .quad 0x9197E2E474720701, 0xC7C1B4B222245157
+.Lpre_tf_hi_s:
+ .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012
+
+/* post-SubByte affine transform, from AES field to SM4 field. */
+.Lpost_tf_lo_s:
+ .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
+.Lpost_tf_hi_s:
+ .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_8:
+ .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
+ .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06
+
+/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_16:
+ .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
+ .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09
+
+/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_24:
+ .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
+ .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For input word byte-swap */
+.Lbswap32_mask:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+.align 8
+.globl _gcry_sm4_aesni_avx_expand_key
+ELF(.type _gcry_sm4_aesni_avx_expand_key,@function;)
+_gcry_sm4_aesni_avx_expand_key:
+ /* input:
+ * %rdi: 128-bit key
+ * %rsi: rkey_enc
+ * %rdx: rkey_dec
+ * %rcx: fk array
+ * %r8: ck array
+ */
+ CFI_STARTPROC();
+
+ vmovd 0*4(%rdi), RA0;
+ vmovd 1*4(%rdi), RA1;
+ vmovd 2*4(%rdi), RA2;
+ vmovd 3*4(%rdi), RA3;
+
+ vmovdqa .Lbswap32_mask rRIP, RTMP2;
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+
+ vmovd 0*4(%rcx), RB0;
+ vmovd 1*4(%rcx), RB1;
+ vmovd 2*4(%rcx), RB2;
+ vmovd 3*4(%rcx), RB3;
+ vpxor RB0, RA0, RA0;
+ vpxor RB1, RA1, RA1;
+ vpxor RB2, RA2, RA2;
+ vpxor RB3, RA3, RA3;
+
+ vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
+ vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
+ vmovdqa .Lpre_tf_hi_s rRIP, RB0;
+ vmovdqa .Lpost_tf_lo_s rRIP, RB1;
+ vmovdqa .Lpost_tf_hi_s rRIP, RB2;
+ vmovdqa .Linv_shift_row rRIP, RB3;
+
+#define ROUND(round, s0, s1, s2, s3) \
+ vbroadcastss (4*(round))(%r8), RX0; \
+ vpxor s1, RX0, RX0; \
+ vpxor s2, RX0, RX0; \
+ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0); \
+ vaesenclast MASK_4BIT, RX0, RX0; \
+ transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0); \
+ \
+ /* linear part */ \
+ vpshufb RB3, RX0, RX0; \
+ vpxor RX0, s0, s0; /* s0 ^ x */ \
+ vpslld $13, RX0, RTMP0; \
+ vpsrld $19, RX0, RTMP1; \
+ vpslld $23, RX0, RTMP2; \
+ vpsrld $9, RX0, RTMP3; \
+ vpxor RTMP0, RTMP1, RTMP1; \
+ vpxor RTMP2, RTMP3, RTMP3; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,13) */ \
+ vpxor RTMP3, s0, s0; /* s0 ^ x ^ rol(x,13) ^ rol(x,23) */
+
+ leaq (32*4)(%r8), %rax;
+ leaq (32*4)(%rdx), %rdx;
+.align 16
+.Lroundloop_expand_key:
+ leaq (-4*4)(%rdx), %rdx;
+ ROUND(0, RA0, RA1, RA2, RA3);
+ ROUND(1, RA1, RA2, RA3, RA0);
+ ROUND(2, RA2, RA3, RA0, RA1);
+ ROUND(3, RA3, RA0, RA1, RA2);
+ leaq (4*4)(%r8), %r8;
+ vmovd RA0, (0*4)(%rsi);
+ vmovd RA1, (1*4)(%rsi);
+ vmovd RA2, (2*4)(%rsi);
+ vmovd RA3, (3*4)(%rsi);
+ vmovd RA0, (3*4)(%rdx);
+ vmovd RA1, (2*4)(%rdx);
+ vmovd RA2, (1*4)(%rdx);
+ vmovd RA3, (0*4)(%rdx);
+ leaq (4*4)(%rsi), %rsi;
+ cmpq %rax, %r8;
+ jne .Lroundloop_expand_key;
+
+#undef ROUND
+
+ vzeroall;
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aesni_avx_expand_key,.-_gcry_sm4_aesni_avx_expand_key;)
+
+.align 8
+ELF(.type sm4_aesni_avx_crypt_blk1_4,@function;)
+sm4_aesni_avx_crypt_blk1_4:
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (1..4 blocks)
+ * %rdx: src (1..4 blocks)
+ * %rcx: num blocks (1..4)
+ */
+ CFI_STARTPROC();
+
+ vmovdqu 0*16(%rdx), RA0;
+ vmovdqa RA0, RA1;
+ vmovdqa RA0, RA2;
+ vmovdqa RA0, RA3;
+ cmpq $2, %rcx;
+ jb .Lblk4_load_input_done;
+ vmovdqu 1*16(%rdx), RA1;
+ je .Lblk4_load_input_done;
+ vmovdqu 2*16(%rdx), RA2;
+ cmpq $3, %rcx;
+ je .Lblk4_load_input_done;
+ vmovdqu 3*16(%rdx), RA3;
+
+.Lblk4_load_input_done:
+
+ vmovdqa .Lbswap32_mask rRIP, RTMP2;
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+
+ vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
+ vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
+ vmovdqa .Lpre_tf_hi_s rRIP, RB0;
+ vmovdqa .Lpost_tf_lo_s rRIP, RB1;
+ vmovdqa .Lpost_tf_hi_s rRIP, RB2;
+ vmovdqa .Linv_shift_row rRIP, RB3;
+ vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2;
+ vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3;
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+
+#define ROUND(round, s0, s1, s2, s3) \
+ vbroadcastss (4*(round))(%rdi), RX0; \
+ vpxor s1, RX0, RX0; \
+ vpxor s2, RX0, RX0; \
+ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0); \
+ vaesenclast MASK_4BIT, RX0, RX0; \
+ transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0); \
+ \
+ /* linear part */ \
+ vpshufb RB3, RX0, RTMP0; \
+ vpxor RTMP0, s0, s0; /* s0 ^ x */ \
+ vpshufb RTMP2, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
+ vpshufb RTMP3, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP0, RTMP1; \
+ vpsrld $30, RTMP0, RTMP0; \
+ vpxor RTMP0, s0, s0; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */
+
+ leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk4:
+ ROUND(0, RA0, RA1, RA2, RA3);
+ ROUND(1, RA1, RA2, RA3, RA0);
+ ROUND(2, RA2, RA3, RA0, RA1);
+ ROUND(3, RA3, RA0, RA1, RA2);
+ leaq (4*4)(%rdi), %rdi;
+ cmpq %rax, %rdi;
+ jne .Lroundloop_blk4;
+
+#undef ROUND
+
+ vmovdqa .Lbswap128_mask rRIP, RTMP2;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+
+ vmovdqu RA0, 0*16(%rsi);
+ cmpq $2, %rcx;
+ jb .Lblk4_store_output_done;
+ vmovdqu RA1, 1*16(%rsi);
+ je .Lblk4_store_output_done;
+ vmovdqu RA2, 2*16(%rsi);
+ cmpq $3, %rcx;
+ je .Lblk4_store_output_done;
+ vmovdqu RA3, 3*16(%rsi);
+
+.Lblk4_store_output_done:
+ vzeroall;
+ xorl %eax, %eax;
+ ret;
+ CFI_ENDPROC();
+ELF(.size sm4_aesni_avx_crypt_blk1_4,.-sm4_aesni_avx_crypt_blk1_4;)
+
+.align 8
+ELF(.type __sm4_crypt_blk8,@function;)
+__sm4_crypt_blk8:
+ /* input:
+ * %rdi: round key array, CTX
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
+ * ciphertext blocks
+ * output:
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext
+ * blocks
+ */
+ CFI_STARTPROC();
+
+ vmovdqa .Lbswap32_mask rRIP, RTMP2;
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+ vpshufb RTMP2, RB0, RB0;
+ vpshufb RTMP2, RB1, RB1;
+ vpshufb RTMP2, RB2, RB2;
+ vpshufb RTMP2, RB3, RB3;
+
+ vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+
+#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \
+ vbroadcastss (4*(round))(%rdi), RX0; \
+ vmovdqa .Lpre_tf_lo_s rRIP, RTMP4; \
+ vmovdqa .Lpre_tf_hi_s rRIP, RTMP1; \
+ vmovdqa RX0, RX1; \
+ vpxor s1, RX0, RX0; \
+ vpxor s2, RX0, RX0; \
+ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+ vmovdqa .Lpost_tf_lo_s rRIP, RTMP2; \
+ vmovdqa .Lpost_tf_hi_s rRIP, RTMP3; \
+ vpxor r1, RX1, RX1; \
+ vpxor r2, RX1, RX1; \
+ vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
+ transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
+ vmovdqa .Linv_shift_row rRIP, RTMP4; \
+ vaesenclast MASK_4BIT, RX0, RX0; \
+ vaesenclast MASK_4BIT, RX1, RX1; \
+ transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
+ transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
+ \
+ /* linear part */ \
+ vpshufb RTMP4, RX0, RTMP0; \
+ vpxor RTMP0, s0, s0; /* s0 ^ x */ \
+ vpshufb RTMP4, RX1, RTMP2; \
+ vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP4; \
+ vpxor RTMP2, r0, r0; /* r0 ^ x */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP4; \
+ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vmovdqa .Linv_shift_row_rol_24 rRIP, RTMP4; \
+ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP0, RTMP1; \
+ vpsrld $30, RTMP0, RTMP0; \
+ vpxor RTMP0, s0, s0; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP2, RTMP3; \
+ vpsrld $30, RTMP2, RTMP2; \
+ vpxor RTMP2, r0, r0; \
+ vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */
+
+ leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk8:
+ ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
+ ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
+ ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
+ ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
+ leaq (4*4)(%rdi), %rdi;
+ cmpq %rax, %rdi;
+ jne .Lroundloop_blk8;
+
+#undef ROUND
+
+ vmovdqa .Lbswap128_mask rRIP, RTMP2;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+ vpshufb RTMP2, RB0, RB0;
+ vpshufb RTMP2, RB1, RB1;
+ vpshufb RTMP2, RB2, RB2;
+ vpshufb RTMP2, RB3, RB3;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size __sm4_crypt_blk8,.-__sm4_crypt_blk8;)
+
+.align 8
+.globl _gcry_sm4_aesni_avx_crypt_blk1_8
+ELF(.type _gcry_sm4_aesni_avx_crypt_blk1_8,@function;)
+_gcry_sm4_aesni_avx_crypt_blk1_8:
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (1..8 blocks)
+ * %rdx: src (1..8 blocks)
+ * %rcx: num blocks (1..8)
+ */
+ CFI_STARTPROC();
+
+ cmpq $5, %rcx;
+ jb sm4_aesni_avx_crypt_blk1_4;
+ vmovdqu (0 * 16)(%rdx), RA0;
+ vmovdqu (1 * 16)(%rdx), RA1;
+ vmovdqu (2 * 16)(%rdx), RA2;
+ vmovdqu (3 * 16)(%rdx), RA3;
+ vmovdqu (4 * 16)(%rdx), RB0;
+ vmovdqa RB0, RB1;
+ vmovdqa RB0, RB2;
+ vmovdqa RB0, RB3;
+ je .Lblk8_load_input_done;
+ vmovdqu (5 * 16)(%rdx), RB1;
+ cmpq $7, %rcx;
+ jb .Lblk8_load_input_done;
+ vmovdqu (6 * 16)(%rdx), RB2;
+ je .Lblk8_load_input_done;
+ vmovdqu (7 * 16)(%rdx), RB3;
+
+.Lblk8_load_input_done:
+ call __sm4_crypt_blk8;
+
+ cmpq $6, %rcx;
+ vmovdqu RA0, (0 * 16)(%rsi);
+ vmovdqu RA1, (1 * 16)(%rsi);
+ vmovdqu RA2, (2 * 16)(%rsi);
+ vmovdqu RA3, (3 * 16)(%rsi);
+ vmovdqu RB0, (4 * 16)(%rsi);
+ jb .Lblk8_store_output_done;
+ vmovdqu RB1, (5 * 16)(%rsi);
+ je .Lblk8_store_output_done;
+ vmovdqu RB2, (6 * 16)(%rsi);
+ cmpq $7, %rcx;
+ je .Lblk8_store_output_done;
+ vmovdqu RB3, (7 * 16)(%rsi);
+
+.Lblk8_store_output_done:
+ vzeroall;
+ xorl %eax, %eax;
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aesni_avx_crypt_blk1_8,.-_gcry_sm4_aesni_avx_crypt_blk1_8;)
+
+.align 8
+.globl _gcry_sm4_aesni_avx_ctr_enc
+ELF(.type _gcry_sm4_aesni_avx_ctr_enc,@function;)
+_gcry_sm4_aesni_avx_ctr_enc:
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: iv (big endian, 128bit)
+ */
+ CFI_STARTPROC();
+
+ /* load IV and byteswap */
+ vmovdqu (%rcx), RA0;
+
+ vmovdqa .Lbswap128_mask rRIP, RBSWAP;
+ vpshufb RBSWAP, RA0, RTMP0; /* be => le */
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+ vpsrldq $8, RNOT, RNOT; /* low: -1, high: 0 */
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
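+
+/* inc_le128 adds one to a little-endian 128-bit counter: vpcmpeqq
+ * flags a low quadword that is about to wrap, vpsubq with minus_one
+ * increments the low quadword, and the vpslldq/vpsubq pair carries
+ * the overflow into the high quadword. */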
+
+ /* construct IVs */
+ inc_le128(RTMP0, RNOT, RTMP2); /* +1 */
+ vpshufb RBSWAP, RTMP0, RA1;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +2 */
+ vpshufb RBSWAP, RTMP0, RA2;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +3 */
+ vpshufb RBSWAP, RTMP0, RA3;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +4 */
+ vpshufb RBSWAP, RTMP0, RB0;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +5 */
+ vpshufb RBSWAP, RTMP0, RB1;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +6 */
+ vpshufb RBSWAP, RTMP0, RB2;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +7 */
+ vpshufb RBSWAP, RTMP0, RB3;
+ inc_le128(RTMP0, RNOT, RTMP2); /* +8 */
+ vpshufb RBSWAP, RTMP0, RTMP1;
+
+ /* store new IV */
+ vmovdqu RTMP1, (%rcx);
+
+ call __sm4_crypt_blk8;
+
+ vpxor (0 * 16)(%rdx), RA0, RA0;
+ vpxor (1 * 16)(%rdx), RA1, RA1;
+ vpxor (2 * 16)(%rdx), RA2, RA2;
+ vpxor (3 * 16)(%rdx), RA3, RA3;
+ vpxor (4 * 16)(%rdx), RB0, RB0;
+ vpxor (5 * 16)(%rdx), RB1, RB1;
+ vpxor (6 * 16)(%rdx), RB2, RB2;
+ vpxor (7 * 16)(%rdx), RB3, RB3;
+
+ vmovdqu RA0, (0 * 16)(%rsi);
+ vmovdqu RA1, (1 * 16)(%rsi);
+ vmovdqu RA2, (2 * 16)(%rsi);
+ vmovdqu RA3, (3 * 16)(%rsi);
+ vmovdqu RB0, (4 * 16)(%rsi);
+ vmovdqu RB1, (5 * 16)(%rsi);
+ vmovdqu RB2, (6 * 16)(%rsi);
+ vmovdqu RB3, (7 * 16)(%rsi);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aesni_avx_ctr_enc,.-_gcry_sm4_aesni_avx_ctr_enc;)
+
+.align 8
+.globl _gcry_sm4_aesni_avx_cbc_dec
+ELF(.type _gcry_sm4_aesni_avx_cbc_dec,@function;)
+_gcry_sm4_aesni_avx_cbc_dec:
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: iv
+ */
+ CFI_STARTPROC();
+
+ vmovdqu (0 * 16)(%rdx), RA0;
+ vmovdqu (1 * 16)(%rdx), RA1;
+ vmovdqu (2 * 16)(%rdx), RA2;
+ vmovdqu (3 * 16)(%rdx), RA3;
+ vmovdqu (4 * 16)(%rdx), RB0;
+ vmovdqu (5 * 16)(%rdx), RB1;
+ vmovdqu (6 * 16)(%rdx), RB2;
+ vmovdqu (7 * 16)(%rdx), RB3;
+
+ call __sm4_crypt_blk8;
+
+ vmovdqu (7 * 16)(%rdx), RNOT;
+ vpxor (%rcx), RA0, RA0;
+ vpxor (0 * 16)(%rdx), RA1, RA1;
+ vpxor (1 * 16)(%rdx), RA2, RA2;
+ vpxor (2 * 16)(%rdx), RA3, RA3;
+ vpxor (3 * 16)(%rdx), RB0, RB0;
+ vpxor (4 * 16)(%rdx), RB1, RB1;
+ vpxor (5 * 16)(%rdx), RB2, RB2;
+ vpxor (6 * 16)(%rdx), RB3, RB3;
+ vmovdqu RNOT, (%rcx); /* store new IV */
+
+ vmovdqu RA0, (0 * 16)(%rsi);
+ vmovdqu RA1, (1 * 16)(%rsi);
+ vmovdqu RA2, (2 * 16)(%rsi);
+ vmovdqu RA3, (3 * 16)(%rsi);
+ vmovdqu RB0, (4 * 16)(%rsi);
+ vmovdqu RB1, (5 * 16)(%rsi);
+ vmovdqu RB2, (6 * 16)(%rsi);
+ vmovdqu RB3, (7 * 16)(%rsi);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aesni_avx_cbc_dec,.-_gcry_sm4_aesni_avx_cbc_dec;)
+
+.align 8
+.globl _gcry_sm4_aesni_avx_cfb_dec
+ELF(.type _gcry_sm4_aesni_avx_cfb_dec,@function;)
+_gcry_sm4_aesni_avx_cfb_dec:
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: iv
+ */
+ CFI_STARTPROC();
+
+ /* Load input */
+ vmovdqu (%rcx), RA0;
+ vmovdqu 0 * 16(%rdx), RA1;
+ vmovdqu 1 * 16(%rdx), RA2;
+ vmovdqu 2 * 16(%rdx), RA3;
+ vmovdqu 3 * 16(%rdx), RB0;
+ vmovdqu 4 * 16(%rdx), RB1;
+ vmovdqu 5 * 16(%rdx), RB2;
+ vmovdqu 6 * 16(%rdx), RB3;
+
+ /* Update IV */
+ vmovdqu 7 * 16(%rdx), RNOT;
+ vmovdqu RNOT, (%rcx);
+
+ call __sm4_crypt_blk8;
+
+ vpxor (0 * 16)(%rdx), RA0, RA0;
+ vpxor (1 * 16)(%rdx), RA1, RA1;
+ vpxor (2 * 16)(%rdx), RA2, RA2;
+ vpxor (3 * 16)(%rdx), RA3, RA3;
+ vpxor (4 * 16)(%rdx), RB0, RB0;
+ vpxor (5 * 16)(%rdx), RB1, RB1;
+ vpxor (6 * 16)(%rdx), RB2, RB2;
+ vpxor (7 * 16)(%rdx), RB3, RB3;
+
+ vmovdqu RA0, (0 * 16)(%rsi);
+ vmovdqu RA1, (1 * 16)(%rsi);
+ vmovdqu RA2, (2 * 16)(%rsi);
+ vmovdqu RA3, (3 * 16)(%rsi);
+ vmovdqu RB0, (4 * 16)(%rsi);
+ vmovdqu RB1, (5 * 16)(%rsi);
+ vmovdqu RB2, (6 * 16)(%rsi);
+ vmovdqu RB3, (7 * 16)(%rsi);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aesni_avx_cfb_dec,.-_gcry_sm4_aesni_avx_cfb_dec;)
+
+.align 8
+.globl _gcry_sm4_aesni_avx_ocb_enc
+ELF(.type _gcry_sm4_aesni_avx_ocb_enc,@function;)
+
+_gcry_sm4_aesni_avx_ocb_enc:
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[8])
+ */
+ CFI_STARTPROC();
+
+ subq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ vmovdqu (%rcx), RTMP0;
+ vmovdqu (%r8), RTMP1;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
+#define OCB_INPUT(n, lreg, xreg) \
+ vmovdqu (n * 16)(%rdx), xreg; \
+ vpxor (lreg), RTMP0, RTMP0; \
+ vpxor xreg, RTMP1, RTMP1; \
+ vpxor RTMP0, xreg, xreg; \
+ vmovdqu RTMP0, (n * 16)(%rsi);
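+/* Each OCB_INPUT step advances the running offset by L_{ntz(i)}, folds
+ * the plaintext block into the checksum, whitens the block with the new
+ * offset and parks that offset in the destination buffer, where it is
+ * XORed back onto the cipher output after __sm4_crypt_blk8 returns. */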
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, RA0);
+ OCB_INPUT(1, %r11, RA1);
+ OCB_INPUT(2, %r12, RA2);
+ OCB_INPUT(3, %r13, RA3);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, RB0);
+ OCB_INPUT(5, %r11, RB1);
+ OCB_INPUT(6, %r12, RB2);
+ OCB_INPUT(7, %r13, RB3);
+#undef OCB_INPUT
+
+ vmovdqu RTMP0, (%rcx);
+ vmovdqu RTMP1, (%r8);
+
+ movq (0 * 8)(%rsp), %r10;
+ CFI_RESTORE(%r10);
+ movq (1 * 8)(%rsp), %r11;
+ CFI_RESTORE(%r11);
+ movq (2 * 8)(%rsp), %r12;
+ CFI_RESTORE(%r12);
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r13);
+
+ call __sm4_crypt_blk8;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ vpxor (0 * 16)(%rsi), RA0, RA0;
+ vpxor (1 * 16)(%rsi), RA1, RA1;
+ vpxor (2 * 16)(%rsi), RA2, RA2;
+ vpxor (3 * 16)(%rsi), RA3, RA3;
+ vpxor (4 * 16)(%rsi), RB0, RB0;
+ vpxor (5 * 16)(%rsi), RB1, RB1;
+ vpxor (6 * 16)(%rsi), RB2, RB2;
+ vpxor (7 * 16)(%rsi), RB3, RB3;
+
+ vmovdqu RA0, (0 * 16)(%rsi);
+ vmovdqu RA1, (1 * 16)(%rsi);
+ vmovdqu RA2, (2 * 16)(%rsi);
+ vmovdqu RA3, (3 * 16)(%rsi);
+ vmovdqu RB0, (4 * 16)(%rsi);
+ vmovdqu RB1, (5 * 16)(%rsi);
+ vmovdqu RB2, (6 * 16)(%rsi);
+ vmovdqu RB3, (7 * 16)(%rsi);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aesni_avx_ocb_enc,.-_gcry_sm4_aesni_avx_ocb_enc;)
+
+.align 8
+.globl _gcry_sm4_aesni_avx_ocb_dec
+ELF(.type _gcry_sm4_aesni_avx_ocb_dec,@function;)
+
+_gcry_sm4_aesni_avx_ocb_dec:
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[8])
+ */
+ CFI_STARTPROC();
+
+ subq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+	vmovdqu (%rcx), RTMP0;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+
+#define OCB_INPUT(n, lreg, xreg) \
+ vmovdqu (n * 16)(%rdx), xreg; \
+ vpxor (lreg), RTMP0, RTMP0; \
+ vpxor RTMP0, xreg, xreg; \
+ vmovdqu RTMP0, (n * 16)(%rsi);
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, RA0);
+ OCB_INPUT(1, %r11, RA1);
+ OCB_INPUT(2, %r12, RA2);
+ OCB_INPUT(3, %r13, RA3);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, RB0);
+ OCB_INPUT(5, %r11, RB1);
+ OCB_INPUT(6, %r12, RB2);
+ OCB_INPUT(7, %r13, RB3);
+#undef OCB_INPUT
+
+ vmovdqu RTMP0, (%rcx);
+
+ movq (0 * 8)(%rsp), %r10;
+ CFI_RESTORE(%r10);
+ movq (1 * 8)(%rsp), %r11;
+ CFI_RESTORE(%r11);
+ movq (2 * 8)(%rsp), %r12;
+ CFI_RESTORE(%r12);
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r13);
+
+ call __sm4_crypt_blk8;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ vmovdqu (%r8), RTMP0;
+
+ vpxor (0 * 16)(%rsi), RA0, RA0;
+ vpxor (1 * 16)(%rsi), RA1, RA1;
+ vpxor (2 * 16)(%rsi), RA2, RA2;
+ vpxor (3 * 16)(%rsi), RA3, RA3;
+ vpxor (4 * 16)(%rsi), RB0, RB0;
+ vpxor (5 * 16)(%rsi), RB1, RB1;
+ vpxor (6 * 16)(%rsi), RB2, RB2;
+ vpxor (7 * 16)(%rsi), RB3, RB3;
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+
+ vmovdqu RA0, (0 * 16)(%rsi);
+ vpxor RA0, RTMP0, RTMP0;
+ vmovdqu RA1, (1 * 16)(%rsi);
+ vpxor RA1, RTMP0, RTMP0;
+ vmovdqu RA2, (2 * 16)(%rsi);
+ vpxor RA2, RTMP0, RTMP0;
+ vmovdqu RA3, (3 * 16)(%rsi);
+ vpxor RA3, RTMP0, RTMP0;
+ vmovdqu RB0, (4 * 16)(%rsi);
+ vpxor RB0, RTMP0, RTMP0;
+ vmovdqu RB1, (5 * 16)(%rsi);
+ vpxor RB1, RTMP0, RTMP0;
+ vmovdqu RB2, (6 * 16)(%rsi);
+ vpxor RB2, RTMP0, RTMP0;
+ vmovdqu RB3, (7 * 16)(%rsi);
+ vpxor RB3, RTMP0, RTMP0;
+
+ vmovdqu RTMP0, (%r8);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aesni_avx_ocb_dec,.-_gcry_sm4_aesni_avx_ocb_dec;)
+
+.align 8
+.globl _gcry_sm4_aesni_avx_ocb_auth
+ELF(.type _gcry_sm4_aesni_avx_ocb_auth,@function;)
+
+_gcry_sm4_aesni_avx_ocb_auth:
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: abuf (8 blocks)
+ * %rdx: offset
+ * %rcx: checksum
+ * %r8 : L pointers (void *L[8])
+ */
+ CFI_STARTPROC();
+
+ subq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ vmovdqu (%rdx), RTMP0;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+
+#define OCB_INPUT(n, lreg, xreg) \
+ vmovdqu (n * 16)(%rsi), xreg; \
+ vpxor (lreg), RTMP0, RTMP0; \
+ vpxor RTMP0, xreg, xreg;
+ movq (0 * 8)(%r8), %r10;
+ movq (1 * 8)(%r8), %r11;
+ movq (2 * 8)(%r8), %r12;
+ movq (3 * 8)(%r8), %r13;
+ OCB_INPUT(0, %r10, RA0);
+ OCB_INPUT(1, %r11, RA1);
+ OCB_INPUT(2, %r12, RA2);
+ OCB_INPUT(3, %r13, RA3);
+ movq (4 * 8)(%r8), %r10;
+ movq (5 * 8)(%r8), %r11;
+ movq (6 * 8)(%r8), %r12;
+ movq (7 * 8)(%r8), %r13;
+ OCB_INPUT(4, %r10, RB0);
+ OCB_INPUT(5, %r11, RB1);
+ OCB_INPUT(6, %r12, RB2);
+ OCB_INPUT(7, %r13, RB3);
+#undef OCB_INPUT
+
+ vmovdqu RTMP0, (%rdx);
+
+ movq (0 * 8)(%rsp), %r10;
+ CFI_RESTORE(%r10);
+ movq (1 * 8)(%rsp), %r11;
+ CFI_RESTORE(%r11);
+ movq (2 * 8)(%rsp), %r12;
+ CFI_RESTORE(%r12);
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r13);
+
+ call __sm4_crypt_blk8;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ vmovdqu (%rcx), RTMP0;
+ vpxor RB0, RA0, RA0;
+ vpxor RB1, RA1, RA1;
+ vpxor RB2, RA2, RA2;
+ vpxor RB3, RA3, RA3;
+
+ vpxor RTMP0, RA3, RA3;
+ vpxor RA2, RA0, RA0;
+ vpxor RA3, RA1, RA1;
+
+ vpxor RA1, RA0, RA0;
+ vmovdqu RA0, (%rcx);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aesni_avx_ocb_auth,.-_gcry_sm4_aesni_avx_ocb_auth;)
+
+#endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/
+#endif /*__x86_64*/
diff --git a/comm/third_party/libgcrypt/cipher/sm4-aesni-avx2-amd64.S b/comm/third_party/libgcrypt/cipher/sm4-aesni-avx2-amd64.S
new file mode 100644
index 0000000000..6e46c0dca8
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sm4-aesni-avx2-amd64.S
@@ -0,0 +1,851 @@
+/* sm4-aesni-avx2-amd64.S - AES-NI/AVX2 implementation of SM4 cipher
+ *
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* Based on SM4 AES-NI work by Markku-Juhani O. Saarinen at:
+ * https://github.com/mjosaarinen/sm4ni
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+/* vector registers */
+#define RX0 %ymm0
+#define RX1 %ymm1
+#define MASK_4BIT %ymm2
+#define RTMP0 %ymm3
+#define RTMP1 %ymm4
+#define RTMP2 %ymm5
+#define RTMP3 %ymm6
+#define RTMP4 %ymm7
+
+#define RA0 %ymm8
+#define RA1 %ymm9
+#define RA2 %ymm10
+#define RA3 %ymm11
+
+#define RB0 %ymm12
+#define RB1 %ymm13
+#define RB2 %ymm14
+#define RB3 %ymm15
+
+#define RNOT %ymm0
+#define RBSWAP %ymm1
+
+#define RX0x %xmm0
+#define RX1x %xmm1
+#define MASK_4BITx %xmm2
+
+#define RNOTx %xmm0
+#define RBSWAPx %xmm1
+
+#define RTMP0x %xmm3
+#define RTMP1x %xmm4
+#define RTMP2x %xmm5
+#define RTMP3x %xmm6
+#define RTMP4x %xmm7
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+
+/* Transpose four 32-bit words between 128-bit vector lanes. */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+/* pre-SubByte transform. */
+#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
+ * the 'vaesenclast' instruction. */
+#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpandn mask4bit, x, tmp0; \
+ vpsrld $4, x, x; \
+ vpand x, mask4bit, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+/**********************************************************************
+  16-way SM4 with AES-NI and AVX2
+ **********************************************************************/
+
+.text
+.align 16
+
+/*
+ * The following four affine transform look-up tables are from work by
+ * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
+ *
+ * These allow exposing SM4 S-Box from AES SubByte.
+ */
+
+/* pre-SubByte affine transform, from SM4 field to AES field. */
+.Lpre_tf_lo_s:
+ .quad 0x9197E2E474720701, 0xC7C1B4B222245157
+.Lpre_tf_hi_s:
+ .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012
+
+/* post-SubByte affine transform, from AES field to SM4 field. */
+.Lpost_tf_lo_s:
+ .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
+.Lpost_tf_hi_s:
+ .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_8:
+ .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
+ .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06
+
+/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_16:
+ .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
+ .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09
+
+/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_24:
+ .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
+ .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For input word byte-swap */
+.Lbswap32_mask:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+.align 8
+ELF(.type __sm4_crypt_blk16,@function;)
+__sm4_crypt_blk16:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+ * plaintext blocks
+ * output:
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+ * ciphertext blocks
+ */
+ CFI_STARTPROC();
+
+ vbroadcasti128 .Lbswap32_mask rRIP, RTMP2;
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+ vpshufb RTMP2, RB0, RB0;
+ vpshufb RTMP2, RB1, RB1;
+ vpshufb RTMP2, RB2, RB2;
+ vpshufb RTMP2, RB3, RB3;
+
+ vpbroadcastd .L0f0f0f0f rRIP, MASK_4BIT;
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+
+#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \
+ vpbroadcastd (4*(round))(%rdi), RX0; \
+ vbroadcasti128 .Lpre_tf_lo_s rRIP, RTMP4; \
+ vbroadcasti128 .Lpre_tf_hi_s rRIP, RTMP1; \
+ vmovdqa RX0, RX1; \
+ vpxor s1, RX0, RX0; \
+ vpxor s2, RX0, RX0; \
+ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+ vbroadcasti128 .Lpost_tf_lo_s rRIP, RTMP2; \
+ vbroadcasti128 .Lpost_tf_hi_s rRIP, RTMP3; \
+ vpxor r1, RX1, RX1; \
+ vpxor r2, RX1, RX1; \
+ vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
+ transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
+ vextracti128 $1, RX0, RTMP4x; \
+ vextracti128 $1, RX1, RTMP0x; \
+ vaesenclast MASK_4BITx, RX0x, RX0x; \
+ vaesenclast MASK_4BITx, RTMP4x, RTMP4x; \
+ vaesenclast MASK_4BITx, RX1x, RX1x; \
+ vaesenclast MASK_4BITx, RTMP0x, RTMP0x; \
+ vinserti128 $1, RTMP4x, RX0, RX0; \
+ vbroadcasti128 .Linv_shift_row rRIP, RTMP4; \
+ vinserti128 $1, RTMP0x, RX1, RX1; \
+ transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
+ transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
+ \
+ /* linear part */ \
+ vpshufb RTMP4, RX0, RTMP0; \
+ vpxor RTMP0, s0, s0; /* s0 ^ x */ \
+ vpshufb RTMP4, RX1, RTMP2; \
+ vbroadcasti128 .Linv_shift_row_rol_8 rRIP, RTMP4; \
+ vpxor RTMP2, r0, r0; /* r0 ^ x */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vbroadcasti128 .Linv_shift_row_rol_16 rRIP, RTMP4; \
+ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vbroadcasti128 .Linv_shift_row_rol_24 rRIP, RTMP4; \
+ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP0, RTMP1; \
+ vpsrld $30, RTMP0, RTMP0; \
+ vpxor RTMP0, s0, s0; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP2, RTMP3; \
+ vpsrld $30, RTMP2, RTMP2; \
+ vpxor RTMP2, r0, r0; \
+ vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */
+
+ leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk8:
+ ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
+ ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
+ ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
+ ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
+ leaq (4*4)(%rdi), %rdi;
+ cmpq %rax, %rdi;
+ jne .Lroundloop_blk8;
+
+#undef ROUND
+
+ vbroadcasti128 .Lbswap128_mask rRIP, RTMP2;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+ vpshufb RTMP2, RB0, RB0;
+ vpshufb RTMP2, RB1, RB1;
+ vpshufb RTMP2, RB2, RB2;
+ vpshufb RTMP2, RB3, RB3;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size __sm4_crypt_blk16,.-__sm4_crypt_blk16;)
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+.align 8
+.globl _gcry_sm4_aesni_avx2_ctr_enc
+ELF(.type _gcry_sm4_aesni_avx2_ctr_enc,@function;)
+_gcry_sm4_aesni_avx2_ctr_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (big endian, 128bit)
+ */
+ CFI_STARTPROC();
+
+ movq 8(%rcx), %rax;
+ bswapq %rax;
+
+ vzeroupper;
+
+ vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
+ vpcmpeqd RNOT, RNOT, RNOT;
+ vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */
+ vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */
+
+ /* load IV and byteswap */
+ vmovdqu (%rcx), RTMP4x;
+ vpshufb RTMP3x, RTMP4x, RTMP4x;
+ vmovdqa RTMP4x, RTMP0x;
+ inc_le128(RTMP4x, RNOTx, RTMP1x);
+ vinserti128 $1, RTMP4x, RTMP0, RTMP0;
+ vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */
+
+ /* check need for handling 64-bit overflow and carry */
+ cmpq $(0xffffffffffffffff - 16), %rax;
+ ja .Lhandle_ctr_carry;
+
+ /* construct IVs */
+ vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */
+ vpshufb RTMP3, RTMP0, RA1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */
+ vpshufb RTMP3, RTMP0, RA2;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */
+ vpshufb RTMP3, RTMP0, RA3;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */
+ vpshufb RTMP3, RTMP0, RB0;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */
+ vpshufb RTMP3, RTMP0, RB1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */
+ vpshufb RTMP3, RTMP0, RB2;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */
+ vpshufb RTMP3, RTMP0, RB3;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
+ vpshufb RTMP3x, RTMP0x, RTMP0x;
+
+ jmp .Lctr_carry_done;
+
+.Lhandle_ctr_carry:
+ /* construct IVs */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vextracti128 $1, RTMP0, RTMP0x;
+ vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */
+
+.align 4
+.Lctr_carry_done:
+ /* store new IV */
+ vmovdqu RTMP0x, (%rcx);
+
+ call __sm4_crypt_blk16;
+
+ vpxor (0 * 32)(%rdx), RA0, RA0;
+ vpxor (1 * 32)(%rdx), RA1, RA1;
+ vpxor (2 * 32)(%rdx), RA2, RA2;
+ vpxor (3 * 32)(%rdx), RA3, RA3;
+ vpxor (4 * 32)(%rdx), RB0, RB0;
+ vpxor (5 * 32)(%rdx), RB1, RB1;
+ vpxor (6 * 32)(%rdx), RB2, RB2;
+ vpxor (7 * 32)(%rdx), RB3, RB3;
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aesni_avx2_ctr_enc,.-_gcry_sm4_aesni_avx2_ctr_enc;)
+
+.align 8
+.globl _gcry_sm4_aesni_avx2_cbc_dec
+ELF(.type _gcry_sm4_aesni_avx2_cbc_dec,@function;)
+_gcry_sm4_aesni_avx2_cbc_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ vmovdqu (0 * 32)(%rdx), RA0;
+ vmovdqu (1 * 32)(%rdx), RA1;
+ vmovdqu (2 * 32)(%rdx), RA2;
+ vmovdqu (3 * 32)(%rdx), RA3;
+ vmovdqu (4 * 32)(%rdx), RB0;
+ vmovdqu (5 * 32)(%rdx), RB1;
+ vmovdqu (6 * 32)(%rdx), RB2;
+ vmovdqu (7 * 32)(%rdx), RB3;
+
+ call __sm4_crypt_blk16;
+
+ vmovdqu (%rcx), RNOTx;
+ vinserti128 $1, (%rdx), RNOT, RNOT;
+ vpxor RNOT, RA0, RA0;
+ vpxor (0 * 32 + 16)(%rdx), RA1, RA1;
+ vpxor (1 * 32 + 16)(%rdx), RA2, RA2;
+ vpxor (2 * 32 + 16)(%rdx), RA3, RA3;
+ vpxor (3 * 32 + 16)(%rdx), RB0, RB0;
+ vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
+ vpxor (5 * 32 + 16)(%rdx), RB2, RB2;
+ vpxor (6 * 32 + 16)(%rdx), RB3, RB3;
+ vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+ vmovdqu RNOTx, (%rcx); /* store new IV */
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aesni_avx2_cbc_dec,.-_gcry_sm4_aesni_avx2_cbc_dec;)
+
+.align 8
+.globl _gcry_sm4_aesni_avx2_cfb_dec
+ELF(.type _gcry_sm4_aesni_avx2_cfb_dec,@function;)
+_gcry_sm4_aesni_avx2_cfb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ /* Load input */
+ vmovdqu (%rcx), RNOTx;
+ vinserti128 $1, (%rdx), RNOT, RA0;
+ vmovdqu (0 * 32 + 16)(%rdx), RA1;
+ vmovdqu (1 * 32 + 16)(%rdx), RA2;
+ vmovdqu (2 * 32 + 16)(%rdx), RA3;
+ vmovdqu (3 * 32 + 16)(%rdx), RB0;
+ vmovdqu (4 * 32 + 16)(%rdx), RB1;
+ vmovdqu (5 * 32 + 16)(%rdx), RB2;
+ vmovdqu (6 * 32 + 16)(%rdx), RB3;
+
+ /* Update IV */
+ vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+ vmovdqu RNOTx, (%rcx);
+
+ call __sm4_crypt_blk16;
+
+ vpxor (0 * 32)(%rdx), RA0, RA0;
+ vpxor (1 * 32)(%rdx), RA1, RA1;
+ vpxor (2 * 32)(%rdx), RA2, RA2;
+ vpxor (3 * 32)(%rdx), RA3, RA3;
+ vpxor (4 * 32)(%rdx), RB0, RB0;
+ vpxor (5 * 32)(%rdx), RB1, RB1;
+ vpxor (6 * 32)(%rdx), RB2, RB2;
+ vpxor (7 * 32)(%rdx), RB3, RB3;
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aesni_avx2_cfb_dec,.-_gcry_sm4_aesni_avx2_cfb_dec;)
+
+.align 8
+.globl _gcry_sm4_aesni_avx2_ocb_enc
+ELF(.type _gcry_sm4_aesni_avx2_ocb_enc,@function;)
+
+_gcry_sm4_aesni_avx2_ocb_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[16])
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ subq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ vmovdqu (%rcx), RTMP0x;
+ vmovdqu (%r8), RTMP1x;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rdx), yreg; \
+ vpxor (l0reg), RTMP0x, RNOTx; \
+ vpxor (l1reg), RNOTx, RTMP0x; \
+ vinserti128 $1, RTMP0x, RNOT, RNOT; \
+ vpxor yreg, RTMP1, RTMP1; \
+ vpxor yreg, RNOT, yreg; \
+ vmovdqu RNOT, (n * 32)(%rsi);
+
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %r11, RA0);
+ OCB_INPUT(1, %r12, %r13, RA1);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(2, %r10, %r11, RA2);
+ OCB_INPUT(3, %r12, %r13, RA3);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %r11, RB0);
+ OCB_INPUT(5, %r12, %r13, RB1);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(6, %r10, %r11, RB2);
+ OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+ vextracti128 $1, RTMP1, RNOTx;
+ vmovdqu RTMP0x, (%rcx);
+ vpxor RNOTx, RTMP1x, RTMP1x;
+ vmovdqu RTMP1x, (%r8);
+
+ movq (0 * 8)(%rsp), %r10;
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ call __sm4_crypt_blk16;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ vpxor (0 * 32)(%rsi), RA0, RA0;
+ vpxor (1 * 32)(%rsi), RA1, RA1;
+ vpxor (2 * 32)(%rsi), RA2, RA2;
+ vpxor (3 * 32)(%rsi), RA3, RA3;
+ vpxor (4 * 32)(%rsi), RB0, RB0;
+ vpxor (5 * 32)(%rsi), RB1, RB1;
+ vpxor (6 * 32)(%rsi), RB2, RB2;
+ vpxor (7 * 32)(%rsi), RB3, RB3;
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aesni_avx2_ocb_enc,.-_gcry_sm4_aesni_avx2_ocb_enc;)
+
+.align 8
+.globl _gcry_sm4_aesni_avx2_ocb_dec
+ELF(.type _gcry_sm4_aesni_avx2_ocb_dec,@function;)
+
+_gcry_sm4_aesni_avx2_ocb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[16])
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ subq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ vmovdqu (%rcx), RTMP0x;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rdx), yreg; \
+ vpxor (l0reg), RTMP0x, RNOTx; \
+ vpxor (l1reg), RNOTx, RTMP0x; \
+ vinserti128 $1, RTMP0x, RNOT, RNOT; \
+ vpxor yreg, RNOT, yreg; \
+ vmovdqu RNOT, (n * 32)(%rsi);
+
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %r11, RA0);
+ OCB_INPUT(1, %r12, %r13, RA1);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(2, %r10, %r11, RA2);
+ OCB_INPUT(3, %r12, %r13, RA3);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %r11, RB0);
+ OCB_INPUT(5, %r12, %r13, RB1);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(6, %r10, %r11, RB2);
+ OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+ vmovdqu RTMP0x, (%rcx);
+
+ movq (0 * 8)(%rsp), %r10;
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ call __sm4_crypt_blk16;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ vmovdqu (%r8), RTMP1x;
+
+ vpxor (0 * 32)(%rsi), RA0, RA0;
+ vpxor (1 * 32)(%rsi), RA1, RA1;
+ vpxor (2 * 32)(%rsi), RA2, RA2;
+ vpxor (3 * 32)(%rsi), RA3, RA3;
+ vpxor (4 * 32)(%rsi), RB0, RB0;
+ vpxor (5 * 32)(%rsi), RB1, RB1;
+ vpxor (6 * 32)(%rsi), RB2, RB2;
+ vpxor (7 * 32)(%rsi), RB3, RB3;
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vpxor RA0, RTMP1, RTMP1;
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vpxor RA1, RTMP1, RTMP1;
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vpxor RA2, RTMP1, RTMP1;
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vpxor RA3, RTMP1, RTMP1;
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vpxor RB0, RTMP1, RTMP1;
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vpxor RB1, RTMP1, RTMP1;
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vpxor RB2, RTMP1, RTMP1;
+ vmovdqu RB3, (7 * 32)(%rsi);
+ vpxor RB3, RTMP1, RTMP1;
+
+ vextracti128 $1, RTMP1, RNOTx;
+ vpxor RNOTx, RTMP1x, RTMP1x;
+ vmovdqu RTMP1x, (%r8);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aesni_avx2_ocb_dec,.-_gcry_sm4_aesni_avx2_ocb_dec;)
+
+.align 8
+.globl _gcry_sm4_aesni_avx2_ocb_auth
+ELF(.type _gcry_sm4_aesni_avx2_ocb_auth,@function;)
+
+_gcry_sm4_aesni_avx2_ocb_auth:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: abuf (16 blocks)
+ * %rdx: offset
+ * %rcx: checksum
+ * %r8 : L pointers (void *L[16])
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ subq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ vmovdqu (%rdx), RTMP0x;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rsi), yreg; \
+ vpxor (l0reg), RTMP0x, RNOTx; \
+ vpxor (l1reg), RNOTx, RTMP0x; \
+ vinserti128 $1, RTMP0x, RNOT, RNOT; \
+ vpxor yreg, RNOT, yreg;
+
+ movq (0 * 8)(%r8), %r10;
+ movq (1 * 8)(%r8), %r11;
+ movq (2 * 8)(%r8), %r12;
+ movq (3 * 8)(%r8), %r13;
+ OCB_INPUT(0, %r10, %r11, RA0);
+ OCB_INPUT(1, %r12, %r13, RA1);
+ movq (4 * 8)(%r8), %r10;
+ movq (5 * 8)(%r8), %r11;
+ movq (6 * 8)(%r8), %r12;
+ movq (7 * 8)(%r8), %r13;
+ OCB_INPUT(2, %r10, %r11, RA2);
+ OCB_INPUT(3, %r12, %r13, RA3);
+ movq (8 * 8)(%r8), %r10;
+ movq (9 * 8)(%r8), %r11;
+ movq (10 * 8)(%r8), %r12;
+ movq (11 * 8)(%r8), %r13;
+ OCB_INPUT(4, %r10, %r11, RB0);
+ OCB_INPUT(5, %r12, %r13, RB1);
+ movq (12 * 8)(%r8), %r10;
+ movq (13 * 8)(%r8), %r11;
+ movq (14 * 8)(%r8), %r12;
+ movq (15 * 8)(%r8), %r13;
+ OCB_INPUT(6, %r10, %r11, RB2);
+ OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+ vmovdqu RTMP0x, (%rdx);
+
+ movq (0 * 8)(%rsp), %r10;
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ call __sm4_crypt_blk16;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ vpxor RA0, RB0, RA0;
+ vpxor RA1, RB1, RA1;
+ vpxor RA2, RB2, RA2;
+ vpxor RA3, RB3, RA3;
+
+ vpxor RA1, RA0, RA0;
+ vpxor RA3, RA2, RA2;
+
+ vpxor RA2, RA0, RTMP1;
+
+ vextracti128 $1, RTMP1, RNOTx;
+ vpxor (%rcx), RTMP1x, RTMP1x;
+ vpxor RNOTx, RTMP1x, RTMP1x;
+ vmovdqu RTMP1x, (%rcx);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aesni_avx2_ocb_auth,.-_gcry_sm4_aesni_avx2_ocb_auth;)
+
+#endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/
+#endif /*__x86_64*/
diff --git a/comm/third_party/libgcrypt/cipher/sm4.c b/comm/third_party/libgcrypt/cipher/sm4.c
new file mode 100644
index 0000000000..c8dd0406e1
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/sm4.c
@@ -0,0 +1,1251 @@
+/* sm4.c - SM4 Cipher Algorithm
+ * Copyright (C) 2020 Alibaba Group.
+ * Copyright (C) 2020 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "types.h" /* for byte and u32 typedefs */
+#include "bithelp.h"
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-internal.h"
+#include "cipher-selftest.h"
+
+/* Helper macro to force alignment to 64 bytes. */
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define ATTR_ALIGNED_64 __attribute__ ((aligned (64)))
+#else
+# define ATTR_ALIGNED_64
+#endif
+
+/* USE_AESNI_AVX indicates whether to compile with Intel AES-NI/AVX code. */
+#undef USE_AESNI_AVX
+#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AESNI_AVX 1
+# endif
+#endif
+
+/* USE_AESNI_AVX2 indicates whether to compile with Intel AES-NI/AVX2 code. */
+#undef USE_AESNI_AVX2
+#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AESNI_AVX2 1
+# endif
+#endif
+
+/* Assembly implementations use the SysV ABI; on Win64 an ABI conversion and
+ * additional stack space to store XMM6-XMM15 are needed. */
+#undef ASM_FUNC_ABI
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# else
+# define ASM_FUNC_ABI
+# endif
+#endif
+
+static const char *sm4_selftest (void);
+
+static void _gcry_sm4_ctr_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+static void _gcry_sm4_cbc_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+static void _gcry_sm4_cfb_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+static size_t _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
+static size_t _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+ size_t nblocks);
+
+typedef struct
+{
+ u32 rkey_enc[32];
+ u32 rkey_dec[32];
+#ifdef USE_AESNI_AVX
+ unsigned int use_aesni_avx:1;
+#endif
+#ifdef USE_AESNI_AVX2
+ unsigned int use_aesni_avx2:1;
+#endif
+} SM4_context;
+
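+/* The fixed system parameter FK from the SM4 specification; it is XORed into
+ * the key words before the key schedule starts. */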
+static const u32 fk[4] =
+{
+ 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc
+};
+
+static struct
+{
+ volatile u32 counter_head;
+ u32 cacheline_align[64 / 4 - 1];
+ byte S[256];
+ volatile u32 counter_tail;
+} sbox_table ATTR_ALIGNED_64 =
+ {
+ 0,
+ { 0, },
+ {
+ 0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7,
+ 0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05,
+ 0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3,
+ 0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99,
+ 0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, 0x98, 0x7a,
+ 0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62,
+ 0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95,
+ 0x80, 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6,
+ 0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba,
+ 0x83, 0x59, 0x3c, 0x19, 0xe6, 0x85, 0x4f, 0xa8,
+ 0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b,
+ 0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, 0x35,
+ 0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2,
+ 0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87,
+ 0xd4, 0x00, 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52,
+ 0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e,
+ 0xea, 0xbf, 0x8a, 0xd2, 0x40, 0xc7, 0x38, 0xb5,
+ 0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1,
+ 0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55,
+ 0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3,
+ 0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60,
+ 0xc0, 0x29, 0x23, 0xab, 0x0d, 0x53, 0x4e, 0x6f,
+ 0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f,
+ 0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, 0x5b, 0x51,
+ 0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f,
+ 0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 0xd8,
+ 0x0a, 0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd,
+ 0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0,
+ 0x89, 0x69, 0x97, 0x4a, 0x0c, 0x96, 0x77, 0x7e,
+ 0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84,
+ 0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20,
+ 0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48
+ },
+ 0
+ };
+
+static const u32 ck[] =
+{
+ 0x00070e15, 0x1c232a31, 0x383f464d, 0x545b6269,
+ 0x70777e85, 0x8c939aa1, 0xa8afb6bd, 0xc4cbd2d9,
+ 0xe0e7eef5, 0xfc030a11, 0x181f262d, 0x343b4249,
+ 0x50575e65, 0x6c737a81, 0x888f969d, 0xa4abb2b9,
+ 0xc0c7ced5, 0xdce3eaf1, 0xf8ff060d, 0x141b2229,
+ 0x30373e45, 0x4c535a61, 0x686f767d, 0x848b9299,
+ 0xa0a7aeb5, 0xbcc3cad1, 0xd8dfe6ed, 0xf4fb0209,
+ 0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279
+};
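+/* Each CK constant above is derived, not arbitrary: byte j of word i equals
+ * (4*i + j) * 7 mod 256, packed big-endian.  A hypothetical generator, shown
+ * only as a sketch and not used at runtime:
+ *
+ *   static void gen_ck (u32 ck_out[32])
+ *   {
+ *     int i;
+ *     for (i = 0; i < 32; i++)
+ *       ck_out[i] = (u32)(((4*i + 0) * 7 % 256) << 24
+ *                       | ((4*i + 1) * 7 % 256) << 16
+ *                       | ((4*i + 2) * 7 % 256) << 8
+ *                       | ((4*i + 3) * 7 % 256));
+ *   }
+ */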
+
+#ifdef USE_AESNI_AVX
+extern void _gcry_sm4_aesni_avx_expand_key(const byte *key, u32 *rk_enc,
+ u32 *rk_dec, const u32 *fk,
+ const u32 *ck) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_aesni_avx_ctr_enc(const u32 *rk_enc, byte *out,
+ const byte *in, byte *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_aesni_avx_cbc_dec(const u32 *rk_dec, byte *out,
+ const byte *in, byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_aesni_avx_cfb_dec(const u32 *rk_enc, byte *out,
+ const byte *in, byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_aesni_avx_ocb_enc(const u32 *rk_enc,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[8]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_aesni_avx_ocb_dec(const u32 *rk_dec,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[8]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_aesni_avx_ocb_auth(const u32 *rk_enc,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[8]) ASM_FUNC_ABI;
+
+extern unsigned int
+_gcry_sm4_aesni_avx_crypt_blk1_8(const u32 *rk, byte *out, const byte *in,
+ unsigned int num_blks) ASM_FUNC_ABI;
+
+static inline unsigned int
+sm4_aesni_avx_crypt_blk1_8(const u32 *rk, byte *out, const byte *in,
+ unsigned int num_blks)
+{
+ return _gcry_sm4_aesni_avx_crypt_blk1_8(rk, out, in, num_blks);
+}
+
+#endif /* USE_AESNI_AVX */
+
+#ifdef USE_AESNI_AVX2
+extern void _gcry_sm4_aesni_avx2_ctr_enc(const u32 *rk_enc, byte *out,
+ const byte *in,
+ byte *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_aesni_avx2_cbc_dec(const u32 *rk_dec, byte *out,
+ const byte *in,
+ byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_aesni_avx2_cfb_dec(const u32 *rk_enc, byte *out,
+ const byte *in,
+ byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_aesni_avx2_ocb_enc(const u32 *rk_enc,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_aesni_avx2_ocb_dec(const u32 *rk_dec,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_aesni_avx2_ocb_auth(const u32 *rk_enc,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+#endif /* USE_AESNI_AVX2 */
+
+static inline void prefetch_sbox_table(void)
+{
+ const volatile byte *vtab = (void *)&sbox_table;
+
+  /* Modify the counters to trigger copy-on-write and unsharing if physical
+   * pages of the look-up table are shared between processes.  Modifying the
+   * counters also changes the page checksums, hinting to the same-page
+   * merging algorithm that these pages change frequently. */
+ sbox_table.counter_head++;
+ sbox_table.counter_tail++;
+
+ /* Prefetch look-up table to cache. */
+ (void)vtab[0 * 32];
+ (void)vtab[1 * 32];
+ (void)vtab[2 * 32];
+ (void)vtab[3 * 32];
+ (void)vtab[4 * 32];
+ (void)vtab[5 * 32];
+ (void)vtab[6 * 32];
+ (void)vtab[7 * 32];
+ (void)vtab[8 * 32 - 1];
+}
+
+static inline u32 sm4_t_non_lin_sub(u32 x)
+{
+ u32 out;
+
+ out = (u32)sbox_table.S[(x >> 0) & 0xff] << 0;
+ out |= (u32)sbox_table.S[(x >> 8) & 0xff] << 8;
+ out |= (u32)sbox_table.S[(x >> 16) & 0xff] << 16;
+ out |= (u32)sbox_table.S[(x >> 24) & 0xff] << 24;
+
+ return out;
+}
+
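+/* Key-schedule linear transform L'(B) = B ^ (B <<< 13) ^ (B <<< 23). */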
+static inline u32 sm4_key_lin_sub(u32 x)
+{
+ return x ^ rol(x, 13) ^ rol(x, 23);
+}
+
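+/* Encryption linear transform
+ * L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24). */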
+static inline u32 sm4_enc_lin_sub(u32 x)
+{
+ u32 xrol2 = rol(x, 2);
+ return x ^ xrol2 ^ rol(xrol2, 8) ^ rol(xrol2, 16) ^ rol(x, 24);
+}
+
+static inline u32 sm4_key_sub(u32 x)
+{
+ return sm4_key_lin_sub(sm4_t_non_lin_sub(x));
+}
+
+static inline u32 sm4_enc_sub(u32 x)
+{
+ return sm4_enc_lin_sub(sm4_t_non_lin_sub(x));
+}
+
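+/* One SM4 round: X_{i+4} = X_i ^ T(X_{i+1} ^ X_{i+2} ^ X_{i+3} ^ rk_i),
+ * where T is the non-linear byte substitution followed by L. */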
+static inline u32
+sm4_round(const u32 x0, const u32 x1, const u32 x2, const u32 x3, const u32 rk)
+{
+ return x0 ^ sm4_enc_sub(x1 ^ x2 ^ x3 ^ rk);
+}
+
+static void
+sm4_expand_key (SM4_context *ctx, const byte *key)
+{
+ u32 rk[4];
+ int i;
+
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx)
+ {
+ _gcry_sm4_aesni_avx_expand_key (key, ctx->rkey_enc, ctx->rkey_dec,
+ fk, ck);
+ return;
+ }
+#endif
+
+ rk[0] = buf_get_be32(key + 4 * 0) ^ fk[0];
+ rk[1] = buf_get_be32(key + 4 * 1) ^ fk[1];
+ rk[2] = buf_get_be32(key + 4 * 2) ^ fk[2];
+ rk[3] = buf_get_be32(key + 4 * 3) ^ fk[3];
+
+ for (i = 0; i < 32; i += 4)
+ {
+ rk[0] = rk[0] ^ sm4_key_sub(rk[1] ^ rk[2] ^ rk[3] ^ ck[i + 0]);
+ rk[1] = rk[1] ^ sm4_key_sub(rk[2] ^ rk[3] ^ rk[0] ^ ck[i + 1]);
+ rk[2] = rk[2] ^ sm4_key_sub(rk[3] ^ rk[0] ^ rk[1] ^ ck[i + 2]);
+ rk[3] = rk[3] ^ sm4_key_sub(rk[0] ^ rk[1] ^ rk[2] ^ ck[i + 3]);
+ ctx->rkey_enc[i + 0] = rk[0];
+ ctx->rkey_enc[i + 1] = rk[1];
+ ctx->rkey_enc[i + 2] = rk[2];
+ ctx->rkey_enc[i + 3] = rk[3];
+ ctx->rkey_dec[31 - i - 0] = rk[0];
+ ctx->rkey_dec[31 - i - 1] = rk[1];
+ ctx->rkey_dec[31 - i - 2] = rk[2];
+ ctx->rkey_dec[31 - i - 3] = rk[3];
+ }
+
+ wipememory (rk, sizeof(rk));
+}
+
+static gcry_err_code_t
+sm4_setkey (void *context, const byte *key, const unsigned keylen,
+ cipher_bulk_ops_t *bulk_ops)
+{
+ SM4_context *ctx = context;
+ static int init = 0;
+ static const char *selftest_failed = NULL;
+ unsigned int hwf = _gcry_get_hw_features ();
+
+ (void)hwf;
+
+ if (!init)
+ {
+ init = 1;
+ selftest_failed = sm4_selftest();
+ if (selftest_failed)
+ log_error("%s\n", selftest_failed);
+ }
+ if (selftest_failed)
+ return GPG_ERR_SELFTEST_FAILED;
+
+ if (keylen != 16)
+ return GPG_ERR_INV_KEYLEN;
+
+#ifdef USE_AESNI_AVX
+ ctx->use_aesni_avx = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX);
+#endif
+#ifdef USE_AESNI_AVX2
+ ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2);
+#endif
+
+ /* Setup bulk encryption routines. */
+ memset (bulk_ops, 0, sizeof(*bulk_ops));
+ bulk_ops->cbc_dec = _gcry_sm4_cbc_dec;
+ bulk_ops->cfb_dec = _gcry_sm4_cfb_dec;
+ bulk_ops->ctr_enc = _gcry_sm4_ctr_enc;
+ bulk_ops->ocb_crypt = _gcry_sm4_ocb_crypt;
+ bulk_ops->ocb_auth = _gcry_sm4_ocb_auth;
+
+ sm4_expand_key (ctx, key);
+ return 0;
+}
+
+static unsigned int
+sm4_do_crypt (const u32 *rk, byte *out, const byte *in)
+{
+ u32 x[4];
+ int i;
+
+ x[0] = buf_get_be32(in + 0 * 4);
+ x[1] = buf_get_be32(in + 1 * 4);
+ x[2] = buf_get_be32(in + 2 * 4);
+ x[3] = buf_get_be32(in + 3 * 4);
+
+ for (i = 0; i < 32; i += 4)
+ {
+ x[0] = sm4_round(x[0], x[1], x[2], x[3], rk[i + 0]);
+ x[1] = sm4_round(x[1], x[2], x[3], x[0], rk[i + 1]);
+ x[2] = sm4_round(x[2], x[3], x[0], x[1], rk[i + 2]);
+ x[3] = sm4_round(x[3], x[0], x[1], x[2], rk[i + 3]);
+ }
+
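+  /* Reverse transform R: emit the last four state words in reverse order. */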
+ buf_put_be32(out + 0 * 4, x[3 - 0]);
+ buf_put_be32(out + 1 * 4, x[3 - 1]);
+ buf_put_be32(out + 2 * 4, x[3 - 2]);
+ buf_put_be32(out + 3 * 4, x[3 - 3]);
+
+ return /*burn_stack*/ 4*6+sizeof(void*)*4;
+}
+
+static unsigned int
+sm4_encrypt (void *context, byte *outbuf, const byte *inbuf)
+{
+ SM4_context *ctx = context;
+
+ prefetch_sbox_table ();
+
+ return sm4_do_crypt (ctx->rkey_enc, outbuf, inbuf);
+}
+
+static unsigned int
+sm4_decrypt (void *context, byte *outbuf, const byte *inbuf)
+{
+ SM4_context *ctx = context;
+
+ prefetch_sbox_table ();
+
+ return sm4_do_crypt (ctx->rkey_dec, outbuf, inbuf);
+}
+
+static unsigned int
+sm4_do_crypt_blks2 (const u32 *rk, byte *out, const byte *in)
+{
+ u32 x[4];
+ u32 y[4];
+ u32 k;
+ int i;
+
+  /* Encrypts/Decrypts two blocks for higher instruction-level
+   * parallelism. */
+
+ x[0] = buf_get_be32(in + 0 * 4);
+ x[1] = buf_get_be32(in + 1 * 4);
+ x[2] = buf_get_be32(in + 2 * 4);
+ x[3] = buf_get_be32(in + 3 * 4);
+ y[0] = buf_get_be32(in + 4 * 4);
+ y[1] = buf_get_be32(in + 5 * 4);
+ y[2] = buf_get_be32(in + 6 * 4);
+ y[3] = buf_get_be32(in + 7 * 4);
+
+ for (i = 0; i < 32; i += 4)
+ {
+ k = rk[i + 0];
+ x[0] = sm4_round(x[0], x[1], x[2], x[3], k);
+ y[0] = sm4_round(y[0], y[1], y[2], y[3], k);
+ k = rk[i + 1];
+ x[1] = sm4_round(x[1], x[2], x[3], x[0], k);
+ y[1] = sm4_round(y[1], y[2], y[3], y[0], k);
+ k = rk[i + 2];
+ x[2] = sm4_round(x[2], x[3], x[0], x[1], k);
+ y[2] = sm4_round(y[2], y[3], y[0], y[1], k);
+ k = rk[i + 3];
+ x[3] = sm4_round(x[3], x[0], x[1], x[2], k);
+ y[3] = sm4_round(y[3], y[0], y[1], y[2], k);
+ }
+
+ buf_put_be32(out + 0 * 4, x[3 - 0]);
+ buf_put_be32(out + 1 * 4, x[3 - 1]);
+ buf_put_be32(out + 2 * 4, x[3 - 2]);
+ buf_put_be32(out + 3 * 4, x[3 - 3]);
+ buf_put_be32(out + 4 * 4, y[3 - 0]);
+ buf_put_be32(out + 5 * 4, y[3 - 1]);
+ buf_put_be32(out + 6 * 4, y[3 - 2]);
+ buf_put_be32(out + 7 * 4, y[3 - 3]);
+
+ return /*burn_stack*/ 4*10+sizeof(void*)*4;
+}
+
+static unsigned int
+sm4_crypt_blocks (const u32 *rk, byte *out, const byte *in,
+ unsigned int num_blks)
+{
+ unsigned int burn_depth = 0;
+ unsigned int nburn;
+
+ while (num_blks >= 2)
+ {
+ nburn = sm4_do_crypt_blks2 (rk, out, in);
+ burn_depth = nburn > burn_depth ? nburn : burn_depth;
+ out += 2 * 16;
+ in += 2 * 16;
+ num_blks -= 2;
+ }
+
+ while (num_blks)
+ {
+ nburn = sm4_do_crypt (rk, out, in);
+ burn_depth = nburn > burn_depth ? nburn : burn_depth;
+ out += 16;
+ in += 16;
+ num_blks--;
+ }
+
+ if (burn_depth)
+ burn_depth += sizeof(void *) * 5;
+ return burn_depth;
+}
+
+/* Bulk encryption of complete blocks in CTR mode. This function is only
+ intended for the bulk encryption feature of cipher.c. CTR is expected to be
+ of size 16. */
+static void
+_gcry_sm4_ctr_enc(void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ SM4_context *ctx = context;
+ byte *outbuf = outbuf_arg;
+ const byte *inbuf = inbuf_arg;
+ int burn_stack_depth = 0;
+
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2)
+ {
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_sm4_aesni_avx2_ctr_enc(ctx->rkey_enc, outbuf, inbuf, ctr);
+
+ nblocks -= 16;
+ outbuf += 16 * 16;
+ inbuf += 16 * 16;
+ }
+ }
+#endif
+
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx)
+ {
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ _gcry_sm4_aesni_avx_ctr_enc(ctx->rkey_enc, outbuf, inbuf, ctr);
+
+ nblocks -= 8;
+ outbuf += 8 * 16;
+ inbuf += 8 * 16;
+ }
+ }
+#endif
+
+ /* Process remaining blocks. */
+ if (nblocks)
+ {
+ unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
+ unsigned int num_blks);
+ byte tmpbuf[16 * 8];
+ unsigned int tmp_used = 16;
+
+ if (0)
+ ;
+#ifdef USE_AESNI_AVX
+ else if (ctx->use_aesni_avx)
+ {
+ crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
+ }
+#endif
+ else
+ {
+ prefetch_sbox_table ();
+ crypt_blk1_8 = sm4_crypt_blocks;
+ }
+
+ /* Process remaining blocks. */
+ while (nblocks)
+ {
+ size_t curr_blks = nblocks > 8 ? 8 : nblocks;
+ size_t i;
+
+ if (curr_blks * 16 > tmp_used)
+ tmp_used = curr_blks * 16;
+
+ cipher_block_cpy (tmpbuf + 0 * 16, ctr, 16);
+ for (i = 1; i < curr_blks; i++)
+ {
+ cipher_block_cpy (&tmpbuf[i * 16], ctr, 16);
+ cipher_block_add (&tmpbuf[i * 16], i, 16);
+ }
+ cipher_block_add (ctr, curr_blks, 16);
+
+ burn_stack_depth = crypt_blk1_8 (ctx->rkey_enc, tmpbuf, tmpbuf,
+ curr_blks);
+
+ for (i = 0; i < curr_blks; i++)
+ {
+ cipher_block_xor (outbuf, &tmpbuf[i * 16], inbuf, 16);
+ outbuf += 16;
+ inbuf += 16;
+ }
+
+ nblocks -= curr_blks;
+ }
+
+ wipememory(tmpbuf, tmp_used);
+ }
+
+ if (burn_stack_depth)
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk decryption of complete blocks in CBC mode. This function is only
+ intended for the bulk encryption feature of cipher.c. */
+static void
+_gcry_sm4_cbc_dec(void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ SM4_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = 0;
+
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2)
+ {
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_sm4_aesni_avx2_cbc_dec(ctx->rkey_dec, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * 16;
+ inbuf += 16 * 16;
+ }
+ }
+#endif
+
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx)
+ {
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ _gcry_sm4_aesni_avx_cbc_dec(ctx->rkey_dec, outbuf, inbuf, iv);
+
+ nblocks -= 8;
+ outbuf += 8 * 16;
+ inbuf += 8 * 16;
+ }
+ }
+#endif
+
+ /* Process remaining blocks. */
+ if (nblocks)
+ {
+ unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
+ unsigned int num_blks);
+ unsigned char savebuf[16 * 8];
+ unsigned int tmp_used = 16;
+
+ if (0)
+ ;
+#ifdef USE_AESNI_AVX
+ else if (ctx->use_aesni_avx)
+ {
+ crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
+ }
+#endif
+ else
+ {
+ prefetch_sbox_table ();
+ crypt_blk1_8 = sm4_crypt_blocks;
+ }
+
+ /* Process remaining blocks. */
+ while (nblocks)
+ {
+ size_t curr_blks = nblocks > 8 ? 8 : nblocks;
+ size_t i;
+
+ if (curr_blks * 16 > tmp_used)
+ tmp_used = curr_blks * 16;
+
+ burn_stack_depth = crypt_blk1_8 (ctx->rkey_dec, savebuf, inbuf,
+ curr_blks);
+
+ for (i = 0; i < curr_blks; i++)
+ {
+ cipher_block_xor_n_copy_2(outbuf, &savebuf[i * 16], iv, inbuf,
+ 16);
+ outbuf += 16;
+ inbuf += 16;
+ }
+
+ nblocks -= curr_blks;
+ }
+
+ wipememory(savebuf, tmp_used);
+ }
+
+ if (burn_stack_depth)
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk decryption of complete blocks in CFB mode. This function is only
+ intended for the bulk encryption feature of cipher.c. */
+static void
+_gcry_sm4_cfb_dec(void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ SM4_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = 0;
+
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2)
+ {
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_sm4_aesni_avx2_cfb_dec(ctx->rkey_enc, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * 16;
+ inbuf += 16 * 16;
+ }
+ }
+#endif
+
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx)
+ {
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ _gcry_sm4_aesni_avx_cfb_dec(ctx->rkey_enc, outbuf, inbuf, iv);
+
+ nblocks -= 8;
+ outbuf += 8 * 16;
+ inbuf += 8 * 16;
+ }
+ }
+#endif
+
+ /* Process remaining blocks. */
+ if (nblocks)
+ {
+ unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
+ unsigned int num_blks);
+ unsigned char ivbuf[16 * 8];
+ unsigned int tmp_used = 16;
+
+ if (0)
+ ;
+#ifdef USE_AESNI_AVX
+ else if (ctx->use_aesni_avx)
+ {
+ crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
+ }
+#endif
+ else
+ {
+ prefetch_sbox_table ();
+ crypt_blk1_8 = sm4_crypt_blocks;
+ }
+
+ /* Process remaining blocks. */
+ while (nblocks)
+ {
+ size_t curr_blks = nblocks > 8 ? 8 : nblocks;
+ size_t i;
+
+ if (curr_blks * 16 > tmp_used)
+ tmp_used = curr_blks * 16;
+
+ cipher_block_cpy (&ivbuf[0 * 16], iv, 16);
+ for (i = 1; i < curr_blks; i++)
+ cipher_block_cpy (&ivbuf[i * 16], &inbuf[(i - 1) * 16], 16);
+ cipher_block_cpy (iv, &inbuf[(i - 1) * 16], 16);
+
+ burn_stack_depth = crypt_blk1_8 (ctx->rkey_enc, ivbuf, ivbuf,
+ curr_blks);
+
+ for (i = 0; i < curr_blks; i++)
+ {
+ cipher_block_xor (outbuf, inbuf, &ivbuf[i * 16], 16);
+ outbuf += 16;
+ inbuf += 16;
+ }
+
+ nblocks -= curr_blks;
+ }
+
+ wipememory(ivbuf, tmp_used);
+ }
+
+ if (burn_stack_depth)
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk encryption/decryption of complete blocks in OCB mode. */
+static size_t
+_gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+ SM4_context *ctx = (void *)&c->context.c;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ u64 blkn = c->u_mode.ocb.data_nblocks;
+ int burn_stack_depth = 0;
+
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2)
+ {
+ u64 Ls[16];
+ unsigned int n = 16 - (blkn % 16);
+ u64 *l;
+ int i;
+
+ if (nblocks >= 16)
+ {
+ for (i = 0; i < 16; i += 8)
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ }
+
+ Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ l = &Ls[(15 + n) % 16];
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ blkn += 16;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+ if (encrypt)
+ _gcry_sm4_aesni_avx2_ocb_enc(ctx->rkey_enc, outbuf, inbuf,
+ c->u_iv.iv, c->u_ctr.ctr, Ls);
+ else
+ _gcry_sm4_aesni_avx2_ocb_dec(ctx->rkey_dec, outbuf, inbuf,
+ c->u_iv.iv, c->u_ctr.ctr, Ls);
+
+ nblocks -= 16;
+ outbuf += 16 * 16;
+ inbuf += 16 * 16;
+ }
+ }
+ }
+#endif
+
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx)
+ {
+ u64 Ls[8];
+ unsigned int n = 8 - (blkn % 8);
+ u64 *l;
+
+ if (nblocks >= 8)
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(7 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ l = &Ls[(7 + n) % 8];
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ blkn += 8;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8);
+
+ if (encrypt)
+ _gcry_sm4_aesni_avx_ocb_enc(ctx->rkey_enc, outbuf, inbuf,
+ c->u_iv.iv, c->u_ctr.ctr, Ls);
+ else
+ _gcry_sm4_aesni_avx_ocb_dec(ctx->rkey_dec, outbuf, inbuf,
+ c->u_iv.iv, c->u_ctr.ctr, Ls);
+
+ nblocks -= 8;
+ outbuf += 8 * 16;
+ inbuf += 8 * 16;
+ }
+ }
+ }
+#endif
+
+ if (nblocks)
+ {
+ unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
+ unsigned int num_blks);
+ const u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec;
+ unsigned char tmpbuf[16 * 8];
+ unsigned int tmp_used = 16;
+
+ if (0)
+ ;
+#ifdef USE_AESNI_AVX
+ else if (ctx->use_aesni_avx)
+ {
+ crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
+ }
+#endif
+ else
+ {
+ prefetch_sbox_table ();
+ crypt_blk1_8 = sm4_crypt_blocks;
+ }
+
+ while (nblocks)
+ {
+ size_t curr_blks = nblocks > 8 ? 8 : nblocks;
+ size_t i;
+
+ if (curr_blks * 16 > tmp_used)
+ tmp_used = curr_blks * 16;
+
+ for (i = 0; i < curr_blks; i++)
+ {
+ const unsigned char *l = ocb_get_l(c, ++blkn);
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ if (encrypt)
+ cipher_block_xor_1(c->u_ctr.ctr, &inbuf[i * 16], 16);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ cipher_block_xor_2dst (&tmpbuf[i * 16], c->u_iv.iv, l, 16);
+ cipher_block_xor (&outbuf[i * 16], &inbuf[i * 16],
+ c->u_iv.iv, 16);
+ }
+
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ crypt_blk1_8 (rk, outbuf, outbuf, curr_blks);
+
+ for (i = 0; i < curr_blks; i++)
+ {
+ cipher_block_xor_1 (&outbuf[i * 16], &tmpbuf[i * 16], 16);
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ if (!encrypt)
+ cipher_block_xor_1(c->u_ctr.ctr, &outbuf[i * 16], 16);
+ }
+
+ outbuf += curr_blks * 16;
+ inbuf += curr_blks * 16;
+ nblocks -= curr_blks;
+ }
+
+ wipememory(tmpbuf, tmp_used);
+ }
+
+ c->u_mode.ocb.data_nblocks = blkn;
+
+ if (burn_stack_depth)
+ _gcry_burn_stack(burn_stack_depth);
+
+ return 0;
+}
+
+/* Bulk authentication of complete blocks in OCB mode. */
+static size_t
+_gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
+{
+ SM4_context *ctx = (void *)&c->context.c;
+ const unsigned char *abuf = abuf_arg;
+ u64 blkn = c->u_mode.ocb.aad_nblocks;
+
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2)
+ {
+ u64 Ls[16];
+ unsigned int n = 16 - (blkn % 16);
+ u64 *l;
+ int i;
+
+ if (nblocks >= 16)
+ {
+ for (i = 0; i < 16; i += 8)
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ }
+
+ Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ l = &Ls[(15 + n) % 16];
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ blkn += 16;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+ _gcry_sm4_aesni_avx2_ocb_auth(ctx->rkey_enc, abuf,
+ c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
+
+ nblocks -= 16;
+ abuf += 16 * 16;
+ }
+ }
+ }
+#endif
+
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx)
+ {
+ u64 Ls[8];
+ unsigned int n = 8 - (blkn % 8);
+ u64 *l;
+
+ if (nblocks >= 8)
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(7 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ l = &Ls[(7 + n) % 8];
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ blkn += 8;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8);
+
+ _gcry_sm4_aesni_avx_ocb_auth(ctx->rkey_enc, abuf,
+ c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
+
+ nblocks -= 8;
+ abuf += 8 * 16;
+ }
+ }
+ }
+#endif
+
+ if (nblocks)
+ {
+ unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
+ unsigned int num_blks);
+ unsigned char tmpbuf[16 * 8];
+ unsigned int tmp_used = 16;
+
+ if (0)
+ ;
+#ifdef USE_AESNI_AVX
+ else if (ctx->use_aesni_avx)
+ {
+ crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
+ }
+#endif
+ else
+ {
+ prefetch_sbox_table ();
+ crypt_blk1_8 = sm4_crypt_blocks;
+ }
+
+ while (nblocks)
+ {
+ size_t curr_blks = nblocks > 8 ? 8 : nblocks;
+ size_t i;
+
+ if (curr_blks * 16 > tmp_used)
+ tmp_used = curr_blks * 16;
+
+ for (i = 0; i < curr_blks; i++)
+ {
+ const unsigned char *l = ocb_get_l(c, ++blkn);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ cipher_block_xor_2dst (&tmpbuf[i * 16],
+ c->u_mode.ocb.aad_offset, l, 16);
+ cipher_block_xor_1 (&tmpbuf[i * 16], &abuf[i * 16], 16);
+ }
+
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ crypt_blk1_8 (ctx->rkey_enc, tmpbuf, tmpbuf, curr_blks);
+
+ for (i = 0; i < curr_blks; i++)
+ {
+ cipher_block_xor_1 (c->u_mode.ocb.aad_sum, &tmpbuf[i * 16], 16);
+ }
+
+ abuf += curr_blks * 16;
+ nblocks -= curr_blks;
+ }
+
+ wipememory(tmpbuf, tmp_used);
+ }
+
+ c->u_mode.ocb.aad_nblocks = blkn;
+
+ return 0;
+}
+
+/* Run the self-tests for SM4-CTR; tests the IV increment of bulk CTR
+   encryption.  Returns NULL on success. */
+static const char*
+selftest_ctr_128 (void)
+{
+ const int nblocks = 16 - 1;
+ const int blocksize = 16;
+ const int context_size = sizeof(SM4_context);
+
+ return _gcry_selftest_helper_ctr("SM4", &sm4_setkey,
+ &sm4_encrypt, nblocks, blocksize, context_size);
+}
+
+/* Run the self-tests for SM4-CBC; tests bulk CBC decryption.
+   Returns NULL on success. */
+static const char*
+selftest_cbc_128 (void)
+{
+ const int nblocks = 16 - 1;
+ const int blocksize = 16;
+ const int context_size = sizeof(SM4_context);
+
+ return _gcry_selftest_helper_cbc("SM4", &sm4_setkey,
+ &sm4_encrypt, nblocks, blocksize, context_size);
+}
+
+/* Run the self-tests for SM4-CFB; tests bulk CFB decryption.
+   Returns NULL on success. */
+static const char*
+selftest_cfb_128 (void)
+{
+ const int nblocks = 16 - 1;
+ const int blocksize = 16;
+ const int context_size = sizeof(SM4_context);
+
+ return _gcry_selftest_helper_cfb("SM4", &sm4_setkey,
+ &sm4_encrypt, nblocks, blocksize, context_size);
+}
+
+static const char *
+sm4_selftest (void)
+{
+ SM4_context ctx;
+ byte scratch[16];
+ const char *r;
+
+ static const byte plaintext[16] = {
+ 0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF,
+ 0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10,
+ };
+ static const byte key[16] = {
+ 0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF,
+ 0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10,
+ };
+ static const byte ciphertext[16] = {
+ 0x68, 0x1E, 0xDF, 0x34, 0xD2, 0x06, 0x96, 0x5E,
+ 0x86, 0xB3, 0xE9, 0x4F, 0x53, 0x6E, 0x42, 0x46
+ };
+
+ memset (&ctx, 0, sizeof(ctx));
+
+ sm4_expand_key (&ctx, key);
+ sm4_encrypt (&ctx, scratch, plaintext);
+ if (memcmp (scratch, ciphertext, sizeof (ciphertext)))
+ return "SM4 test encryption failed.";
+ sm4_decrypt (&ctx, scratch, scratch);
+ if (memcmp (scratch, plaintext, sizeof (plaintext)))
+ return "SM4 test decryption failed.";
+
+ if ( (r = selftest_ctr_128 ()) )
+ return r;
+
+ if ( (r = selftest_cbc_128 ()) )
+ return r;
+
+ if ( (r = selftest_cfb_128 ()) )
+ return r;
+
+ return NULL;
+}
+
+static gpg_err_code_t
+run_selftests (int algo, int extended, selftest_report_func_t report)
+{
+ const char *what;
+ const char *errtxt;
+
+ (void)extended;
+
+ if (algo != GCRY_CIPHER_SM4)
+ return GPG_ERR_CIPHER_ALGO;
+
+ what = "selftest";
+ errtxt = sm4_selftest ();
+ if (errtxt)
+ goto failed;
+
+ return 0;
+
+ failed:
+ if (report)
+ report ("cipher", GCRY_CIPHER_SM4, what, errtxt);
+ return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+static gcry_cipher_oid_spec_t sm4_oids[] =
+ {
+ { "1.2.156.10197.1.104.1", GCRY_CIPHER_MODE_ECB },
+ { "1.2.156.10197.1.104.2", GCRY_CIPHER_MODE_CBC },
+ { "1.2.156.10197.1.104.3", GCRY_CIPHER_MODE_OFB },
+ { "1.2.156.10197.1.104.4", GCRY_CIPHER_MODE_CFB },
+ { "1.2.156.10197.1.104.7", GCRY_CIPHER_MODE_CTR },
+ { NULL }
+ };
+
+gcry_cipher_spec_t _gcry_cipher_spec_sm4 =
+ {
+ GCRY_CIPHER_SM4, {0, 0},
+ "SM4", NULL, sm4_oids, 16, 128,
+ sizeof (SM4_context),
+ sm4_setkey, sm4_encrypt, sm4_decrypt,
+ NULL, NULL,
+ run_selftests
+ };
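+
+/* A minimal usage sketch of this cipher through the public libgcrypt API,
+ * assuming SM4 support (GCRY_CIPHER_SM4) is enabled in the build.  This is
+ * only an illustration, not part of this file's internal interface; key, in,
+ * out and len stand for the caller's own buffers and length:
+ *
+ *   gcry_cipher_hd_t hd;
+ *   unsigned char ctr[16] = { 0 };
+ *   gcry_cipher_open (&hd, GCRY_CIPHER_SM4, GCRY_CIPHER_MODE_CTR, 0);
+ *   gcry_cipher_setkey (hd, key, 16);           // SM4 uses a 128-bit key
+ *   gcry_cipher_setctr (hd, ctr, 16);
+ *   gcry_cipher_encrypt (hd, out, len, in, len);
+ *   gcry_cipher_close (hd);
+ */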
diff --git a/comm/third_party/libgcrypt/cipher/stribog.c b/comm/third_party/libgcrypt/cipher/stribog.c
new file mode 100644
index 0000000000..f8776a3e8f
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/stribog.c
@@ -0,0 +1,1362 @@
+/* stribog.c - GOST R 34.11-2012 (Stribog) hash function
+ * Copyright (C) 2013 Dmitry Eremin-Solenikov
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+#include "bithelp.h"
+#include "bufhelp.h"
+#include "cipher.h"
+#include "hash-common.h"
+
+
+typedef struct
+{
+ gcry_md_block_ctx_t bctx;
+ union
+ {
+ u64 h[8];
+ unsigned char result[64];
+ };
+ u64 N[8];
+ u64 Sigma[8];
+} STRIBOG_CONTEXT;
+
+
+/* Pre-computed results of multiplying bytes by the matrix A, reordered with
+   the Pi[] substitution. */
+static const u64 stribog_table[8][256] =
+{
+ /* 0 */
+ { U64_C(0xd01f715b5c7ef8e6), U64_C(0x16fa240980778325),
+ U64_C(0xa8a42e857ee049c8), U64_C(0x6ac1068fa186465b),
+ U64_C(0x6e417bd7a2e9320b), U64_C(0x665c8167a437daab),
+ U64_C(0x7666681aa89617f6), U64_C(0x4b959163700bdcf5),
+ U64_C(0xf14be6b78df36248), U64_C(0xc585bd689a625cff),
+ U64_C(0x9557d7fca67d82cb), U64_C(0x89f0b969af6dd366),
+ U64_C(0xb0833d48749f6c35), U64_C(0xa1998c23b1ecbc7c),
+ U64_C(0x8d70c431ac02a736), U64_C(0xd6dfbc2fd0a8b69e),
+ U64_C(0x37aeb3e551fa198b), U64_C(0x0b7d128a40b5cf9c),
+ U64_C(0x5a8f2008b5780cbc), U64_C(0xedec882284e333e5),
+ U64_C(0xd25fc177d3c7c2ce), U64_C(0x5e0f5d50b61778ec),
+ U64_C(0x1d873683c0c24cb9), U64_C(0xad040bcbb45d208c),
+ U64_C(0x2f89a0285b853c76), U64_C(0x5732fff6791b8d58),
+ U64_C(0x3e9311439ef6ec3f), U64_C(0xc9183a809fd3c00f),
+ U64_C(0x83adf3f5260a01ee), U64_C(0xa6791941f4e8ef10),
+ U64_C(0x103ae97d0ca1cd5d), U64_C(0x2ce948121dee1b4a),
+ U64_C(0x39738421dbf2bf53), U64_C(0x093da2a6cf0cf5b4),
+ U64_C(0xcd9847d89cbcb45f), U64_C(0xf9561c078b2d8ae8),
+ U64_C(0x9c6a755a6971777f), U64_C(0xbc1ebaa0712ef0c5),
+ U64_C(0x72e61542abf963a6), U64_C(0x78bb5fde229eb12e),
+ U64_C(0x14ba94250fceb90d), U64_C(0x844d6697630e5282),
+ U64_C(0x98ea08026a1e032f), U64_C(0xf06bbea144217f5c),
+ U64_C(0xdb6263d11ccb377a), U64_C(0x641c314b2b8ee083),
+ U64_C(0x320e96ab9b4770cf), U64_C(0x1ee7deb986a96b85),
+ U64_C(0xe96cf57a878c47b5), U64_C(0xfdd6615f8842feb8),
+ U64_C(0xc83862965601dd1b), U64_C(0x2ea9f83e92572162),
+ U64_C(0xf876441142ff97fc), U64_C(0xeb2c455608357d9d),
+ U64_C(0x5612a7e0b0c9904c), U64_C(0x6c01cbfb2d500823),
+ U64_C(0x4548a6a7fa037a2d), U64_C(0xabc4c6bf388b6ef4),
+ U64_C(0xbade77d4fdf8bebd), U64_C(0x799b07c8eb4cac3a),
+ U64_C(0x0c9d87e805b19cf0), U64_C(0xcb588aac106afa27),
+ U64_C(0xea0c1d40c1e76089), U64_C(0x2869354a1e816f1a),
+ U64_C(0xff96d17307fbc490), U64_C(0x9f0a9d602f1a5043),
+ U64_C(0x96373fc6e016a5f7), U64_C(0x5292dab8b3a6e41c),
+ U64_C(0x9b8ae0382c752413), U64_C(0x4f15ec3b7364a8a5),
+ U64_C(0x3fb349555724f12b), U64_C(0xc7c50d4415db66d7),
+ U64_C(0x92b7429ee379d1a7), U64_C(0xd37f99611a15dfda),
+ U64_C(0x231427c05e34a086), U64_C(0xa439a96d7b51d538),
+ U64_C(0xb403401077f01865), U64_C(0xdda2aea5901d7902),
+ U64_C(0x0a5d4a9c8967d288), U64_C(0xc265280adf660f93),
+ U64_C(0x8bb0094520d4e94e), U64_C(0x2a29856691385532),
+ U64_C(0x42a833c5bf072941), U64_C(0x73c64d54622b7eb2),
+ U64_C(0x07e095624504536c), U64_C(0x8a905153e906f45a),
+ U64_C(0x6f6123c16b3b2f1f), U64_C(0xc6e55552dc097bc3),
+ U64_C(0x4468feb133d16739), U64_C(0xe211e7f0c7398829),
+ U64_C(0xa2f96419f7879b40), U64_C(0x19074bdbc3ad38e9),
+ U64_C(0xf4ebc3f9474e0b0c), U64_C(0x43886bd376d53455),
+ U64_C(0xd8028beb5aa01046), U64_C(0x51f23282f5cdc320),
+ U64_C(0xe7b1c2be0d84e16d), U64_C(0x081dfab006dee8a0),
+ U64_C(0x3b33340d544b857b), U64_C(0x7f5bcabc679ae242),
+ U64_C(0x0edd37c48a08a6d8), U64_C(0x81ed43d9a9b33bc6),
+ U64_C(0xb1a3655ebd4d7121), U64_C(0x69a1eeb5e7ed6167),
+ U64_C(0xf6ab73d5c8f73124), U64_C(0x1a67a3e185c61fd5),
+ U64_C(0x2dc91004d43c065e), U64_C(0x0240b02c8fb93a28),
+ U64_C(0x90f7f2b26cc0eb8f), U64_C(0x3cd3a16f114fd617),
+ U64_C(0xaae49ea9f15973e0), U64_C(0x06c0cd748cd64e78),
+ U64_C(0xda423bc7d5192a6e), U64_C(0xc345701c16b41287),
+ U64_C(0x6d2193ede4821537), U64_C(0xfcf639494190e3ac),
+ U64_C(0x7c3b228621f1c57e), U64_C(0xfb16ac2b0494b0c0),
+ U64_C(0xbf7e529a3745d7f9), U64_C(0x6881b6a32e3f7c73),
+ U64_C(0xca78d2bad9b8e733), U64_C(0xbbfe2fc2342aa3a9),
+ U64_C(0x0dbddffecc6381e4), U64_C(0x70a6a56e2440598e),
+ U64_C(0xe4d12a844befc651), U64_C(0x8c509c2765d0ba22),
+ U64_C(0xee8c6018c28814d9), U64_C(0x17da7c1f49a59e31),
+ U64_C(0x609c4c1328e194d3), U64_C(0xb3e3d57232f44b09),
+ U64_C(0x91d7aaa4a512f69b), U64_C(0x0ffd6fd243dabbcc),
+ U64_C(0x50d26a943c1fde34), U64_C(0x6be15e9968545b4f),
+ U64_C(0x94778fea6faf9fdf), U64_C(0x2b09dd7058ea4826),
+ U64_C(0x677cd9716de5c7bf), U64_C(0x49d5214fffb2e6dd),
+ U64_C(0x0360e83a466b273c), U64_C(0x1fc786af4f7b7691),
+ U64_C(0xa0b9d435783ea168), U64_C(0xd49f0c035f118cb6),
+ U64_C(0x01205816c9d21d14), U64_C(0xac2453dd7d8f3d98),
+ U64_C(0x545217cc3f70aa64), U64_C(0x26b4028e9489c9c2),
+ U64_C(0xdec2469fd6765e3e), U64_C(0x04807d58036f7450),
+ U64_C(0xe5f17292823ddb45), U64_C(0xf30b569b024a5860),
+ U64_C(0x62dcfc3fa758aefb), U64_C(0xe84cad6c4e5e5aa1),
+ U64_C(0xccb81fce556ea94b), U64_C(0x53b282ae7a74f908),
+ U64_C(0x1b47fbf74c1402c1), U64_C(0x368eebf39828049f),
+ U64_C(0x7afbeff2ad278b06), U64_C(0xbe5e0a8cfe97caed),
+ U64_C(0xcfd8f7f413058e77), U64_C(0xf78b2bc301252c30),
+ U64_C(0x4d555c17fcdd928d), U64_C(0x5f2f05467fc565f8),
+ U64_C(0x24f4b2a21b30f3ea), U64_C(0x860dd6bbecb768aa),
+ U64_C(0x4c750401350f8f99), U64_C(0x0000000000000000),
+ U64_C(0xecccd0344d312ef1), U64_C(0xb5231806be220571),
+ U64_C(0xc105c030990d28af), U64_C(0x653c695de25cfd97),
+ U64_C(0x159acc33c61ca419), U64_C(0xb89ec7f872418495),
+ U64_C(0xa9847693b73254dc), U64_C(0x58cf90243ac13694),
+ U64_C(0x59efc832f3132b80), U64_C(0x5c4fed7c39ae42c4),
+ U64_C(0x828dabe3efd81cfa), U64_C(0xd13f294d95ace5f2),
+ U64_C(0x7d1b7a90e823d86a), U64_C(0xb643f03cf849224d),
+ U64_C(0x3df3f979d89dcb03), U64_C(0x7426d836272f2dde),
+ U64_C(0xdfe21e891fa4432a), U64_C(0x3a136c1b9d99986f),
+ U64_C(0xfa36f43dcd46add4), U64_C(0xc025982650df35bb),
+ U64_C(0x856d3e81aadc4f96), U64_C(0xc4a5e57e53b041eb),
+ U64_C(0x4708168b75ba4005), U64_C(0xaf44bbe73be41aa4),
+ U64_C(0x971767d029c4b8e3), U64_C(0xb9be9feebb939981),
+ U64_C(0x215497ecd18d9aae), U64_C(0x316e7e91dd2c57f3),
+ U64_C(0xcef8afe2dad79363), U64_C(0x3853dc371220a247),
+ U64_C(0x35ee03c9de4323a3), U64_C(0xe6919aa8c456fc79),
+ U64_C(0xe05157dc4880b201), U64_C(0x7bdbb7e464f59612),
+ U64_C(0x127a59518318f775), U64_C(0x332ecebd52956ddb),
+ U64_C(0x8f30741d23bb9d1e), U64_C(0xd922d3fd93720d52),
+ U64_C(0x7746300c61440ae2), U64_C(0x25d4eab4d2e2eefe),
+ U64_C(0x75068020eefd30ca), U64_C(0x135a01474acaea61),
+ U64_C(0x304e268714fe4ae7), U64_C(0xa519f17bb283c82c),
+ U64_C(0xdc82f6b359cf6416), U64_C(0x5baf781e7caa11a8),
+ U64_C(0xb2c38d64fb26561d), U64_C(0x34ce5bdf17913eb7),
+ U64_C(0x5d6fb56af07c5fd0), U64_C(0x182713cd0a7f25fd),
+ U64_C(0x9e2ac576e6c84d57), U64_C(0x9aaab82ee5a73907),
+ U64_C(0xa3d93c0f3e558654), U64_C(0x7e7b92aaae48ff56),
+ U64_C(0x872d8ead256575be), U64_C(0x41c8dbfff96c0e7d),
+ U64_C(0x99ca5014a3cc1e3b), U64_C(0x40e883e930be1369),
+ U64_C(0x1ca76e95091051ad), U64_C(0x4e35b42dbab6b5b1),
+ U64_C(0x05a0254ecabd6944), U64_C(0xe1710fca8152af15),
+ U64_C(0xf22b0e8dcb984574), U64_C(0xb763a82a319b3f59),
+ U64_C(0x63fca4296e8ab3ef), U64_C(0x9d4a2d4ca0a36a6b),
+ U64_C(0xe331bfe60eeb953d), U64_C(0xd5bf541596c391a2),
+ U64_C(0xf5cb9bef8e9c1618), U64_C(0x46284e9dbc685d11),
+ U64_C(0x2074cffa185f87ba), U64_C(0xbd3ee2b6b8fcedd1),
+ U64_C(0xae64e3f1f23607b0), U64_C(0xfeb68965ce29d984),
+ U64_C(0x55724fdaf6a2b770), U64_C(0x29496d5cd753720e),
+ U64_C(0xa75941573d3af204), U64_C(0x8e102c0bea69800a),
+ U64_C(0x111ab16bc573d049), U64_C(0xd7ffe439197aab8a),
+ U64_C(0xefac380e0b5a09cd), U64_C(0x48f579593660fbc9),
+ U64_C(0x22347fd697e6bd92), U64_C(0x61bc1405e13389c7),
+ U64_C(0x4ab5c975b9d9c1e1), U64_C(0x80cd1bcf606126d2),
+ U64_C(0x7186fd78ed92449a), U64_C(0x93971a882aabccb3),
+ U64_C(0x88d0e17f66bfce72), U64_C(0x27945a985d5bd4d6) },
+ /* 1 */
+ { U64_C(0xde553f8c05a811c8), U64_C(0x1906b59631b4f565),
+ U64_C(0x436e70d6b1964ff7), U64_C(0x36d343cb8b1e9d85),
+ U64_C(0x843dfacc858aab5a), U64_C(0xfdfc95c299bfc7f9),
+ U64_C(0x0f634bdea1d51fa2), U64_C(0x6d458b3b76efb3cd),
+ U64_C(0x85c3f77cf8593f80), U64_C(0x3c91315fbe737cb2),
+ U64_C(0x2148b03366ace398), U64_C(0x18f8b8264c6761bf),
+ U64_C(0xc830c1c495c9fb0f), U64_C(0x981a76102086a0aa),
+ U64_C(0xaa16012142f35760), U64_C(0x35cc54060c763cf6),
+ U64_C(0x42907d66cc45db2d), U64_C(0x8203d44b965af4bc),
+ U64_C(0x3d6f3cefc3a0e868), U64_C(0xbc73ff69d292bda7),
+ U64_C(0x8722ed0102e20a29), U64_C(0x8f8185e8cd34deb7),
+ U64_C(0x9b0561dda7ee01d9), U64_C(0x5335a0193227fad6),
+ U64_C(0xc9cecc74e81a6fd5), U64_C(0x54f5832e5c2431ea),
+ U64_C(0x99e47ba05d553470), U64_C(0xf7bee756acd226ce),
+ U64_C(0x384e05a5571816fd), U64_C(0xd1367452a47d0e6a),
+ U64_C(0xf29fde1c386ad85b), U64_C(0x320c77316275f7ca),
+ U64_C(0xd0c879e2d9ae9ab0), U64_C(0xdb7406c69110ef5d),
+ U64_C(0x45505e51a2461011), U64_C(0xfc029872e46c5323),
+ U64_C(0xfa3cb6f5f7bc0cc5), U64_C(0x031f17cd8768a173),
+ U64_C(0xbd8df2d9af41297d), U64_C(0x9d3b4f5ab43e5e3f),
+ U64_C(0x4071671b36feee84), U64_C(0x716207e7d3e3b83d),
+ U64_C(0x48d20ff2f9283a1a), U64_C(0x27769eb4757cbc7e),
+ U64_C(0x5c56ebc793f2e574), U64_C(0xa48b474f9ef5dc18),
+ U64_C(0x52cbada94ff46e0c), U64_C(0x60c7da982d8199c6),
+ U64_C(0x0e9d466edc068b78), U64_C(0x4eec2175eaf865fc),
+ U64_C(0x550b8e9e21f7a530), U64_C(0x6b7ba5bc653fec2b),
+ U64_C(0x5eb7f1ba6949d0dd), U64_C(0x57ea94e3db4c9099),
+ U64_C(0xf640eae6d101b214), U64_C(0xdd4a284182c0b0bb),
+ U64_C(0xff1d8fbf6304f250), U64_C(0xb8accb933bf9d7e8),
+ U64_C(0xe8867c478eb68c4d), U64_C(0x3f8e2692391bddc1),
+ U64_C(0xcb2fd60912a15a7c), U64_C(0xaec935dbab983d2f),
+ U64_C(0xf55ffd2b56691367), U64_C(0x80e2ce366ce1c115),
+ U64_C(0x179bf3f8edb27e1d), U64_C(0x01fe0db07dd394da),
+ U64_C(0xda8a0b76ecc37b87), U64_C(0x44ae53e1df9584cb),
+ U64_C(0xb310b4b77347a205), U64_C(0xdfab323c787b8512),
+ U64_C(0x3b511268d070b78e), U64_C(0x65e6e3d2b9396753),
+ U64_C(0x6864b271e2574d58), U64_C(0x259784c98fc789d7),
+ U64_C(0x02e11a7dfabb35a9), U64_C(0x8841a6dfa337158b),
+ U64_C(0x7ade78c39b5dcdd0), U64_C(0xb7cf804d9a2cc84a),
+ U64_C(0x20b6bd831b7f7742), U64_C(0x75bd331d3a88d272),
+ U64_C(0x418f6aab4b2d7a5e), U64_C(0xd9951cbb6babdaf4),
+ U64_C(0xb6318dfde7ff5c90), U64_C(0x1f389b112264aa83),
+ U64_C(0x492c024284fbaec0), U64_C(0xe33a0363c608f9a0),
+ U64_C(0x2688930408af28a4), U64_C(0xc7538a1a341ce4ad),
+ U64_C(0x5da8e677ee2171ae), U64_C(0x8c9e92254a5c7fc4),
+ U64_C(0x63d8cd55aae938b5), U64_C(0x29ebd8daa97a3706),
+ U64_C(0x959827b37be88aa1), U64_C(0x1484e4356adadf6e),
+ U64_C(0xa7945082199d7d6b), U64_C(0xbf6ce8a455fa1cd4),
+ U64_C(0x9cc542eac9edcae5), U64_C(0x79c16f0e1c356ca3),
+ U64_C(0x89bfab6fdee48151), U64_C(0xd4174d1830c5f0ff),
+ U64_C(0x9258048415eb419d), U64_C(0x6139d72850520d1c),
+ U64_C(0x6a85a80c18ec78f1), U64_C(0xcd11f88e0171059a),
+ U64_C(0xcceff53e7ca29140), U64_C(0xd229639f2315af19),
+ U64_C(0x90b91ef9ef507434), U64_C(0x5977d28d074a1be1),
+ U64_C(0x311360fce51d56b9), U64_C(0xc093a92d5a1f2f91),
+ U64_C(0x1a19a25bb6dc5416), U64_C(0xeb996b8a09de2d3e),
+ U64_C(0xfee3820f1ed7668a), U64_C(0xd7085ad5b7ad518c),
+ U64_C(0x7fff41890fe53345), U64_C(0xec5948bd67dde602),
+ U64_C(0x2fd5f65dbaaa68e0), U64_C(0xa5754affe32648c2),
+ U64_C(0xf8ddac880d07396c), U64_C(0x6fa491468c548664),
+ U64_C(0x0c7c5c1326bdbed1), U64_C(0x4a33158f03930fb3),
+ U64_C(0x699abfc19f84d982), U64_C(0xe4fa2054a80b329c),
+ U64_C(0x6707f9af438252fa), U64_C(0x08a368e9cfd6d49e),
+ U64_C(0x47b1442c58fd25b8), U64_C(0xbbb3dc5ebc91769b),
+ U64_C(0x1665fe489061eac7), U64_C(0x33f27a811fa66310),
+ U64_C(0x93a609346838d547), U64_C(0x30ed6d4c98cec263),
+ U64_C(0x1dd9816cd8df9f2a), U64_C(0x94662a03063b1e7b),
+ U64_C(0x83fdd9fbeb896066), U64_C(0x7b207573e68e590a),
+ U64_C(0x5f49fc0a149a4407), U64_C(0x343259b671a5a82c),
+ U64_C(0xfbc2bb458a6f981f), U64_C(0xc272b350a0a41a38),
+ U64_C(0x3aaf1fd8ada32354), U64_C(0x6cbb868b0b3c2717),
+ U64_C(0xa2b569c88d2583fe), U64_C(0xf180c9d1bf027928),
+ U64_C(0xaf37386bd64ba9f5), U64_C(0x12bacab2790a8088),
+ U64_C(0x4c0d3b0810435055), U64_C(0xb2eeb9070e9436df),
+ U64_C(0xc5b29067cea7d104), U64_C(0xdcb425f1ff132461),
+ U64_C(0x4f122cc5972bf126), U64_C(0xac282fa651230886),
+ U64_C(0xe7e537992f6393ef), U64_C(0xe61b3a2952b00735),
+ U64_C(0x709c0a57ae302ce7), U64_C(0xe02514ae416058d3),
+ U64_C(0xc44c9dd7b37445de), U64_C(0x5a68c5408022ba92),
+ U64_C(0x1c278cdca50c0bf0), U64_C(0x6e5a9cf6f18712be),
+ U64_C(0x86dce0b17f319ef3), U64_C(0x2d34ec2040115d49),
+ U64_C(0x4bcd183f7e409b69), U64_C(0x2815d56ad4a9a3dc),
+ U64_C(0x24698979f2141d0d), U64_C(0x0000000000000000),
+ U64_C(0x1ec696a15fb73e59), U64_C(0xd86b110b16784e2e),
+ U64_C(0x8e7f8858b0e74a6d), U64_C(0x063e2e8713d05fe6),
+ U64_C(0xe2c40ed3bbdb6d7a), U64_C(0xb1f1aeca89fc97ac),
+ U64_C(0xe1db191e3cb3cc09), U64_C(0x6418ee62c4eaf389),
+ U64_C(0xc6ad87aa49cf7077), U64_C(0xd6f65765ca7ec556),
+ U64_C(0x9afb6c6dda3d9503), U64_C(0x7ce05644888d9236),
+ U64_C(0x8d609f95378feb1e), U64_C(0x23a9aa4e9c17d631),
+ U64_C(0x6226c0e5d73aac6f), U64_C(0x56149953a69f0443),
+ U64_C(0xeeb852c09d66d3ab), U64_C(0x2b0ac2a753c102af),
+ U64_C(0x07c023376e03cb3c), U64_C(0x2ccae1903dc2c993),
+ U64_C(0xd3d76e2f5ec63bc3), U64_C(0x9e2458973356ff4c),
+ U64_C(0xa66a5d32644ee9b1), U64_C(0x0a427294356de137),
+ U64_C(0x783f62be61e6f879), U64_C(0x1344c70204d91452),
+ U64_C(0x5b96c8f0fdf12e48), U64_C(0xa90916ecc59bf613),
+ U64_C(0xbe92e5142829880e), U64_C(0x727d102a548b194e),
+ U64_C(0x1be7afebcb0fc0cc), U64_C(0x3e702b2244c8491b),
+ U64_C(0xd5e940a84d166425), U64_C(0x66f9f41f3e51c620),
+ U64_C(0xabe80c913f20c3ba), U64_C(0xf07ec461c2d1edf2),
+ U64_C(0xf361d3ac45b94c81), U64_C(0x0521394a94b8fe95),
+ U64_C(0xadd622162cf09c5c), U64_C(0xe97871f7f3651897),
+ U64_C(0xf4a1f09b2bba87bd), U64_C(0x095d6559b2054044),
+ U64_C(0x0bbc7f2448be75ed), U64_C(0x2af4cf172e129675),
+ U64_C(0x157ae98517094bb4), U64_C(0x9fda55274e856b96),
+ U64_C(0x914713499283e0ee), U64_C(0xb952c623462a4332),
+ U64_C(0x74433ead475b46a8), U64_C(0x8b5eb112245fb4f8),
+ U64_C(0xa34b6478f0f61724), U64_C(0x11a5dd7ffe6221fb),
+ U64_C(0xc16da49d27ccbb4b), U64_C(0x76a224d0bde07301),
+ U64_C(0x8aa0bca2598c2022), U64_C(0x4df336b86d90c48f),
+ U64_C(0xea67663a740db9e4), U64_C(0xef465f70e0b54771),
+ U64_C(0x39b008152acb8227), U64_C(0x7d1e5bf4f55e06ec),
+ U64_C(0x105bd0cf83b1b521), U64_C(0x775c2960c033e7db),
+ U64_C(0x7e014c397236a79f), U64_C(0x811cc386113255cf),
+ U64_C(0xeda7450d1a0e72d8), U64_C(0x5889df3d7a998f3b),
+ U64_C(0x2e2bfbedc779fc3a), U64_C(0xce0eef438619a4e9),
+ U64_C(0x372d4e7bf6cd095f), U64_C(0x04df34fae96b6a4f),
+ U64_C(0xf923a13870d4adb6), U64_C(0xa1aa7e050a4d228d),
+ U64_C(0xa8f71b5cb84862c9), U64_C(0xb52e9a306097fde3),
+ U64_C(0x0d8251a35b6e2a0b), U64_C(0x2257a7fee1c442eb),
+ U64_C(0x73831d9a29588d94), U64_C(0x51d4ba64c89ccf7f),
+ U64_C(0x502ab7d4b54f5ba5), U64_C(0x97793dce8153bf08),
+ U64_C(0xe5042de4d5d8a646), U64_C(0x9687307efc802bd2),
+ U64_C(0xa05473b5779eb657), U64_C(0xb4d097801d446939),
+ U64_C(0xcff0e2f3fbca3033), U64_C(0xc38cbee0dd778ee2),
+ U64_C(0x464f499c252eb162), U64_C(0xcad1dbb96f72cea6),
+ U64_C(0xba4dd1eec142e241), U64_C(0xb00fa37af42f0376) },
+ /* 2 */
+ { U64_C(0xcce4cd3aa968b245), U64_C(0x089d5484e80b7faf),
+ U64_C(0x638246c1b3548304), U64_C(0xd2fe0ec8c2355492),
+ U64_C(0xa7fbdf7ff2374eee), U64_C(0x4df1600c92337a16),
+ U64_C(0x84e503ea523b12fb), U64_C(0x0790bbfd53ab0c4a),
+ U64_C(0x198a780f38f6ea9d), U64_C(0x2ab30c8f55ec48cb),
+ U64_C(0xe0f7fed6b2c49db5), U64_C(0xb6ecf3f422cadbdc),
+ U64_C(0x409c9a541358df11), U64_C(0xd3ce8a56dfde3fe3),
+ U64_C(0xc3e9224312c8c1a0), U64_C(0x0d6dfa58816ba507),
+ U64_C(0xddf3e1b179952777), U64_C(0x04c02a42748bb1d9),
+ U64_C(0x94c2abff9f2decb8), U64_C(0x4f91752da8f8acf4),
+ U64_C(0x78682befb169bf7b), U64_C(0xe1c77a48af2ff6c4),
+ U64_C(0x0c5d7ec69c80ce76), U64_C(0x4cc1e4928fd81167),
+ U64_C(0xfeed3d24d9997b62), U64_C(0x518bb6dfc3a54a23),
+ U64_C(0x6dbf2d26151f9b90), U64_C(0xb5bc624b05ea664f),
+ U64_C(0xe86aaa525acfe21a), U64_C(0x4801ced0fb53a0be),
+ U64_C(0xc91463e6c00868ed), U64_C(0x1027a815cd16fe43),
+ U64_C(0xf67069a0319204cd), U64_C(0xb04ccc976c8abce7),
+ U64_C(0xc0b9b3fc35e87c33), U64_C(0xf380c77c58f2de65),
+ U64_C(0x50bb3241de4e2152), U64_C(0xdf93f490435ef195),
+ U64_C(0xf1e0d25d62390887), U64_C(0xaf668bfb1a3c3141),
+ U64_C(0xbc11b251f00a7291), U64_C(0x73a5eed47e427d47),
+ U64_C(0x25bee3f6ee4c3b2e), U64_C(0x43cc0beb34786282),
+ U64_C(0xc824e778dde3039c), U64_C(0xf97d86d98a327728),
+ U64_C(0xf2b043e24519b514), U64_C(0xe297ebf7880f4b57),
+ U64_C(0x3a94a49a98fab688), U64_C(0x868516cb68f0c419),
+ U64_C(0xeffa11af0964ee50), U64_C(0xa4ab4ec0d517f37d),
+ U64_C(0xa9c6b498547c567a), U64_C(0x8e18424f80fbbbb6),
+ U64_C(0x0bcdc53bcf2bc23c), U64_C(0x137739aaea3643d0),
+ U64_C(0x2c1333ec1bac2ff0), U64_C(0x8d48d3f0a7db0625),
+ U64_C(0x1e1ac3f26b5de6d7), U64_C(0xf520f81f16b2b95e),
+ U64_C(0x9f0f6ec450062e84), U64_C(0x0130849e1deb6b71),
+ U64_C(0xd45e31ab8c7533a9), U64_C(0x652279a2fd14e43f),
+ U64_C(0x3209f01e70f1c927), U64_C(0xbe71a770cac1a473),
+ U64_C(0x0e3d6be7a64b1894), U64_C(0x7ec8148cff29d840),
+ U64_C(0xcb7476c7fac3be0f), U64_C(0x72956a4a63a91636),
+ U64_C(0x37f95ec21991138f), U64_C(0x9e3fea5a4ded45f5),
+ U64_C(0x7b38ba50964902e8), U64_C(0x222e580bbde73764),
+ U64_C(0x61e253e0899f55e6), U64_C(0xfc8d2805e352ad80),
+ U64_C(0x35994be3235ac56d), U64_C(0x09add01af5e014de),
+ U64_C(0x5e8659a6780539c6), U64_C(0xb17c48097161d796),
+ U64_C(0x026015213acbd6e2), U64_C(0xd1ae9f77e515e901),
+ U64_C(0xb7dc776a3f21b0ad), U64_C(0xaba6a1b96eb78098),
+ U64_C(0x9bcf4486248d9f5d), U64_C(0x582666c536455efd),
+ U64_C(0xfdbdac9bfeb9c6f1), U64_C(0xc47999be4163cdea),
+ U64_C(0x765540081722a7ef), U64_C(0x3e548ed8ec710751),
+ U64_C(0x3d041f67cb51bac2), U64_C(0x7958af71ac82d40a),
+ U64_C(0x36c9da5c047a78fe), U64_C(0xed9a048e33af38b2),
+ U64_C(0x26ee7249c96c86bd), U64_C(0x900281bdeba65d61),
+ U64_C(0x11172c8bd0fd9532), U64_C(0xea0abf73600434f8),
+ U64_C(0x42fc8f75299309f3), U64_C(0x34a9cf7d3eb1ae1c),
+ U64_C(0x2b838811480723ba), U64_C(0x5ce64c8742ceef24),
+ U64_C(0x1adae9b01fd6570e), U64_C(0x3c349bf9d6bad1b3),
+ U64_C(0x82453c891c7b75c0), U64_C(0x97923a40b80d512b),
+ U64_C(0x4a61dbf1c198765c), U64_C(0xb48ce6d518010d3e),
+ U64_C(0xcfb45c858e480fd6), U64_C(0xd933cbf30d1e96ae),
+ U64_C(0xd70ea014ab558e3a), U64_C(0xc189376228031742),
+ U64_C(0x9262949cd16d8b83), U64_C(0xeb3a3bed7def5f89),
+ U64_C(0x49314a4ee6b8cbcf), U64_C(0xdcc3652f647e4c06),
+ U64_C(0xda635a4c2a3e2b3d), U64_C(0x470c21a940f3d35b),
+ U64_C(0x315961a157d174b4), U64_C(0x6672e81dda3459ac),
+ U64_C(0x5b76f77a1165e36e), U64_C(0x445cb01667d36ec8),
+ U64_C(0xc5491d205c88a69b), U64_C(0x456c34887a3805b9),
+ U64_C(0xffddb9bac4721013), U64_C(0x99af51a71e4649bf),
+ U64_C(0xa15be01cbc7729d5), U64_C(0x52db2760e485f7b0),
+ U64_C(0x8c78576eba306d54), U64_C(0xae560f6507d75a30),
+ U64_C(0x95f22f6182c687c9), U64_C(0x71c5fbf54489aba5),
+ U64_C(0xca44f259e728d57e), U64_C(0x88b87d2ccebbdc8d),
+ U64_C(0xbab18d32be4a15aa), U64_C(0x8be8ec93e99b611e),
+ U64_C(0x17b713e89ebdf209), U64_C(0xb31c5d284baa0174),
+ U64_C(0xeeca9531148f8521), U64_C(0xb8d198138481c348),
+ U64_C(0x8988f9b2d350b7fc), U64_C(0xb9e11c8d996aa839),
+ U64_C(0x5a4673e40c8e881f), U64_C(0x1687977683569978),
+ U64_C(0xbf4123eed72acf02), U64_C(0x4ea1f1b3b513c785),
+ U64_C(0xe767452be16f91ff), U64_C(0x7505d1b730021a7c),
+ U64_C(0xa59bca5ec8fc980c), U64_C(0xad069eda20f7e7a3),
+ U64_C(0x38f4b1bba231606a), U64_C(0x60d2d77e94743e97),
+ U64_C(0x9affc0183966f42c), U64_C(0x248e6768f3a7505f),
+ U64_C(0xcdd449a4b483d934), U64_C(0x87b59255751baf68),
+ U64_C(0x1bea6d2e023d3c7f), U64_C(0x6b1f12455b5ffcab),
+ U64_C(0x743555292de9710d), U64_C(0xd8034f6d10f5fddf),
+ U64_C(0xc6198c9f7ba81b08), U64_C(0xbb8109aca3a17edb),
+ U64_C(0xfa2d1766ad12cabb), U64_C(0xc729080166437079),
+ U64_C(0x9c5fff7b77269317), U64_C(0x0000000000000000),
+ U64_C(0x15d706c9a47624eb), U64_C(0x6fdf38072fd44d72),
+ U64_C(0x5fb6dd3865ee52b7), U64_C(0xa33bf53d86bcff37),
+ U64_C(0xe657c1b5fc84fa8e), U64_C(0xaa962527735cebe9),
+ U64_C(0x39c43525bfda0b1b), U64_C(0x204e4d2a872ce186),
+ U64_C(0x7a083ece8ba26999), U64_C(0x554b9c9db72efbfa),
+ U64_C(0xb22cd9b656416a05), U64_C(0x96a2bedea5e63a5a),
+ U64_C(0x802529a826b0a322), U64_C(0x8115ad363b5bc853),
+ U64_C(0x8375b81701901eb1), U64_C(0x3069e53f4a3a1fc5),
+ U64_C(0xbd2136cfede119e0), U64_C(0x18bafc91251d81ec),
+ U64_C(0x1d4a524d4c7d5b44), U64_C(0x05f0aedc6960daa8),
+ U64_C(0x29e39d3072ccf558), U64_C(0x70f57f6b5962c0d4),
+ U64_C(0x989fd53903ad22ce), U64_C(0xf84d024797d91c59),
+ U64_C(0x547b1803aac5908b), U64_C(0xf0d056c37fd263f6),
+ U64_C(0xd56eb535919e58d8), U64_C(0x1c7ad6d351963035),
+ U64_C(0x2e7326cd2167f912), U64_C(0xac361a443d1c8cd2),
+ U64_C(0x697f076461942a49), U64_C(0x4b515f6fdc731d2d),
+ U64_C(0x8ad8680df4700a6f), U64_C(0x41ac1eca0eb3b460),
+ U64_C(0x7d988533d80965d3), U64_C(0xa8f6300649973d0b),
+ U64_C(0x7765c4960ac9cc9e), U64_C(0x7ca801adc5e20ea2),
+ U64_C(0xdea3700e5eb59ae4), U64_C(0xa06b6482a19c42a4),
+ U64_C(0x6a2f96db46b497da), U64_C(0x27def6d7d487edcc),
+ U64_C(0x463ca5375d18b82a), U64_C(0xa6cb5be1efdc259f),
+ U64_C(0x53eba3fef96e9cc1), U64_C(0xce84d81b93a364a7),
+ U64_C(0xf4107c810b59d22f), U64_C(0x333974806d1aa256),
+ U64_C(0x0f0def79bba073e5), U64_C(0x231edc95a00c5c15),
+ U64_C(0xe437d494c64f2c6c), U64_C(0x91320523f64d3610),
+ U64_C(0x67426c83c7df32dd), U64_C(0x6eefbc99323f2603),
+ U64_C(0x9d6f7be56acdf866), U64_C(0x5916e25b2bae358c),
+ U64_C(0x7ff89012e2c2b331), U64_C(0x035091bf2720bd93),
+ U64_C(0x561b0d22900e4669), U64_C(0x28d319ae6f279e29),
+ U64_C(0x2f43a2533c8c9263), U64_C(0xd09e1be9f8fe8270),
+ U64_C(0xf740ed3e2c796fbc), U64_C(0xdb53ded237d5404c),
+ U64_C(0x62b2c25faebfe875), U64_C(0x0afd41a5d2c0a94d),
+ U64_C(0x6412fd3ce0ff8f4e), U64_C(0xe3a76f6995e42026),
+ U64_C(0x6c8fa9b808f4f0e1), U64_C(0xc2d9a6dd0f23aad1),
+ U64_C(0x8f28c6d19d10d0c7), U64_C(0x85d587744fd0798a),
+ U64_C(0xa20b71a39b579446), U64_C(0x684f83fa7c7f4138),
+ U64_C(0xe507500adba4471d), U64_C(0x3f640a46f19a6c20),
+ U64_C(0x1247bd34f7dd28a1), U64_C(0x2d23b77206474481),
+ U64_C(0x93521002cc86e0f2), U64_C(0x572b89bc8de52d18),
+ U64_C(0xfb1d93f8b0f9a1ca), U64_C(0xe95a2ecc4724896b),
+ U64_C(0x3ba420048511ddf9), U64_C(0xd63e248ab6bee54b),
+ U64_C(0x5dd6c8195f258455), U64_C(0x06a03f634e40673b),
+ U64_C(0x1f2a476c76b68da6), U64_C(0x217ec9b49ac78af7),
+ U64_C(0xecaa80102e4453c3), U64_C(0x14e78257b99d4f9a) },
+ /* 3 */
+ { U64_C(0x20329b2cc87bba05), U64_C(0x4f5eb6f86546a531),
+ U64_C(0xd4f44775f751b6b1), U64_C(0x8266a47b850dfa8b),
+ U64_C(0xbb986aa15a6ca985), U64_C(0xc979eb08f9ae0f99),
+ U64_C(0x2da6f447a2375ea1), U64_C(0x1e74275dcd7d8576),
+ U64_C(0xbc20180a800bc5f8), U64_C(0xb4a2f701b2dc65be),
+ U64_C(0xe726946f981b6d66), U64_C(0x48e6c453bf21c94c),
+ U64_C(0x42cad9930f0a4195), U64_C(0xefa47b64aacccd20),
+ U64_C(0x71180a8960409a42), U64_C(0x8bb3329bf6a44e0c),
+ U64_C(0xd34c35de2d36dacc), U64_C(0xa92f5b7cbc23dc96),
+ U64_C(0xb31a85aa68bb09c3), U64_C(0x13e04836a73161d2),
+ U64_C(0xb24dfc4129c51d02), U64_C(0x8ae44b70b7da5acd),
+ U64_C(0xe671ed84d96579a7), U64_C(0xa4bb3417d66f3832),
+ U64_C(0x4572ab38d56d2de8), U64_C(0xb1b47761ea47215c),
+ U64_C(0xe81c09cf70aba15d), U64_C(0xffbdb872ce7f90ac),
+ U64_C(0xa8782297fd5dc857), U64_C(0x0d946f6b6a4ce4a4),
+ U64_C(0xe4df1f4f5b995138), U64_C(0x9ebc71edca8c5762),
+ U64_C(0x0a2c1dc0b02b88d9), U64_C(0x3b503c115d9d7b91),
+ U64_C(0xc64376a8111ec3a2), U64_C(0xcec199a323c963e4),
+ U64_C(0xdc76a87ec58616f7), U64_C(0x09d596e073a9b487),
+ U64_C(0x14583a9d7d560daf), U64_C(0xf4c6dc593f2a0cb4),
+ U64_C(0xdd21d19584f80236), U64_C(0x4a4836983ddde1d3),
+ U64_C(0xe58866a41ae745f9), U64_C(0xf591a5b27e541875),
+ U64_C(0x891dc05074586693), U64_C(0x5b068c651810a89e),
+ U64_C(0xa30346bc0c08544f), U64_C(0x3dbf3751c684032d),
+ U64_C(0x2a1e86ec785032dc), U64_C(0xf73f5779fca830ea),
+ U64_C(0xb60c05ca30204d21), U64_C(0x0cc316802b32f065),
+ U64_C(0x8770241bdd96be69), U64_C(0xb861e18199ee95db),
+ U64_C(0xf805cad91418fcd1), U64_C(0x29e70dccbbd20e82),
+ U64_C(0xc7140f435060d763), U64_C(0x0f3a9da0e8b0cc3b),
+ U64_C(0xa2543f574d76408e), U64_C(0xbd7761e1c175d139),
+ U64_C(0x4b1f4f737ca3f512), U64_C(0x6dc2df1f2fc137ab),
+ U64_C(0xf1d05c3967b14856), U64_C(0xa742bf3715ed046c),
+ U64_C(0x654030141d1697ed), U64_C(0x07b872abda676c7d),
+ U64_C(0x3ce84eba87fa17ec), U64_C(0xc1fb0403cb79afdf),
+ U64_C(0x3e46bc7105063f73), U64_C(0x278ae987121cd678),
+ U64_C(0xa1adb4778ef47cd0), U64_C(0x26dd906c5362c2b9),
+ U64_C(0x05168060589b44e2), U64_C(0xfbfc41f9d79ac08f),
+ U64_C(0x0e6de44ba9ced8fa), U64_C(0x9feb08068bf243a3),
+ U64_C(0x7b341749d06b129b), U64_C(0x229c69e74a87929a),
+ U64_C(0xe09ee6c4427c011b), U64_C(0x5692e30e725c4c3a),
+ U64_C(0xda99a33e5e9f6e4b), U64_C(0x353dd85af453a36b),
+ U64_C(0x25241b4c90e0fee7), U64_C(0x5de987258309d022),
+ U64_C(0xe230140fc0802984), U64_C(0x93281e86a0c0b3c6),
+ U64_C(0xf229d719a4337408), U64_C(0x6f6c2dd4ad3d1f34),
+ U64_C(0x8ea5b2fbae3f0aee), U64_C(0x8331dd90c473ee4a),
+ U64_C(0x346aa1b1b52db7aa), U64_C(0xdf8f235e06042aa9),
+ U64_C(0xcc6f6b68a1354b7b), U64_C(0x6c95a6f46ebf236a),
+ U64_C(0x52d31a856bb91c19), U64_C(0x1a35ded6d498d555),
+ U64_C(0xf37eaef2e54d60c9), U64_C(0x72e181a9a3c2a61c),
+ U64_C(0x98537aad51952fde), U64_C(0x16f6c856ffaa2530),
+ U64_C(0xd960281e9d1d5215), U64_C(0x3a0745fa1ce36f50),
+ U64_C(0x0b7b642bf1559c18), U64_C(0x59a87eae9aec8001),
+ U64_C(0x5e100c05408bec7c), U64_C(0x0441f98b19e55023),
+ U64_C(0xd70dcc5534d38aef), U64_C(0x927f676de1bea707),
+ U64_C(0x9769e70db925e3e5), U64_C(0x7a636ea29115065a),
+ U64_C(0x468b201816ef11b6), U64_C(0xab81a9b73edff409),
+ U64_C(0xc0ac7de88a07bb1e), U64_C(0x1f235eb68c0391b7),
+ U64_C(0x6056b074458dd30f), U64_C(0xbe8eeac102f7ed67),
+ U64_C(0xcd381283e04b5fba), U64_C(0x5cbefecec277c4e3),
+ U64_C(0xd21b4c356c48ce0d), U64_C(0x1019c31664b35d8c),
+ U64_C(0x247362a7d19eea26), U64_C(0xebe582efb3299d03),
+ U64_C(0x02aef2cb82fc289f), U64_C(0x86275df09ce8aaa8),
+ U64_C(0x28b07427faac1a43), U64_C(0x38a9b7319e1f47cf),
+ U64_C(0xc82e92e3b8d01b58), U64_C(0x06ef0b409b1978bc),
+ U64_C(0x62f842bfc771fb90), U64_C(0x9904034610eb3b1f),
+ U64_C(0xded85ab5477a3e68), U64_C(0x90d195a663428f98),
+ U64_C(0x5384636e2ac708d8), U64_C(0xcbd719c37b522706),
+ U64_C(0xae9729d76644b0eb), U64_C(0x7c8c65e20a0c7ee6),
+ U64_C(0x80c856b007f1d214), U64_C(0x8c0b40302cc32271),
+ U64_C(0xdbcedad51fe17a8a), U64_C(0x740e8ae938dbdea0),
+ U64_C(0xa615c6dc549310ad), U64_C(0x19cc55f6171ae90b),
+ U64_C(0x49b1bdb8fe5fdd8d), U64_C(0xed0a89af2830e5bf),
+ U64_C(0x6a7aadb4f5a65bd6), U64_C(0x7e22972988f05679),
+ U64_C(0xf952b3325566e810), U64_C(0x39fecedadf61530e),
+ U64_C(0x6101c99f04f3c7ce), U64_C(0x2e5f7f6761b562ff),
+ U64_C(0xf08725d226cf5c97), U64_C(0x63af3b54860fef51),
+ U64_C(0x8ff2cb10ef411e2f), U64_C(0x884ab9bb35267252),
+ U64_C(0x4df04433e7ba8dae), U64_C(0x9afd8866d3690741),
+ U64_C(0x66b9bb34de94abb3), U64_C(0x9baaf18d92171380),
+ U64_C(0x543c11c5f0a064a5), U64_C(0x17a1b1bdbed431f1),
+ U64_C(0xb5f58eeaf3a2717f), U64_C(0xc355f6c849858740),
+ U64_C(0xec5df044694ef17e), U64_C(0xd83751f5dc6346d4),
+ U64_C(0xfc4433520dfdacf2), U64_C(0x0000000000000000),
+ U64_C(0x5a51f58e596ebc5f), U64_C(0x3285aaf12e34cf16),
+ U64_C(0x8d5c39db6dbd36b0), U64_C(0x12b731dde64f7513),
+ U64_C(0x94906c2d7aa7dfbb), U64_C(0x302b583aacc8e789),
+ U64_C(0x9d45facd090e6b3c), U64_C(0x2165e2c78905aec4),
+ U64_C(0x68d45f7f775a7349), U64_C(0x189b2c1d5664fdca),
+ U64_C(0xe1c99f2f030215da), U64_C(0x6983269436246788),
+ U64_C(0x8489af3b1e148237), U64_C(0xe94b702431d5b59c),
+ U64_C(0x33d2d31a6f4adbd7), U64_C(0xbfd9932a4389f9a6),
+ U64_C(0xb0e30e8aab39359d), U64_C(0xd1e2c715afcaf253),
+ U64_C(0x150f43763c28196e), U64_C(0xc4ed846393e2eb3d),
+ U64_C(0x03f98b20c3823c5e), U64_C(0xfd134ab94c83b833),
+ U64_C(0x556b682eb1de7064), U64_C(0x36c4537a37d19f35),
+ U64_C(0x7559f30279a5ca61), U64_C(0x799ae58252973a04),
+ U64_C(0x9c12832648707ffd), U64_C(0x78cd9c6913e92ec5),
+ U64_C(0x1d8dac7d0effb928), U64_C(0x439da0784e745554),
+ U64_C(0x413352b3cc887dcb), U64_C(0xbacf134a1b12bd44),
+ U64_C(0x114ebafd25cd494d), U64_C(0x2f08068c20cb763e),
+ U64_C(0x76a07822ba27f63f), U64_C(0xeab2fb04f25789c2),
+ U64_C(0xe3676de481fe3d45), U64_C(0x1b62a73d95e6c194),
+ U64_C(0x641749ff5c68832c), U64_C(0xa5ec4dfc97112cf3),
+ U64_C(0xf6682e92bdd6242b), U64_C(0x3f11c59a44782bb2),
+ U64_C(0x317c21d1edb6f348), U64_C(0xd65ab5be75ad9e2e),
+ U64_C(0x6b2dd45fb4d84f17), U64_C(0xfaab381296e4d44e),
+ U64_C(0xd0b5befeeeb4e692), U64_C(0x0882ef0b32d7a046),
+ U64_C(0x512a91a5a83b2047), U64_C(0x963e9ee6f85bf724),
+ U64_C(0x4e09cf132438b1f0), U64_C(0x77f701c9fb59e2fe),
+ U64_C(0x7ddb1c094b726a27), U64_C(0x5f4775ee01f5f8bd),
+ U64_C(0x9186ec4d223c9b59), U64_C(0xfeeac1998f01846d),
+ U64_C(0xac39db1ce4b89874), U64_C(0xb75b7c21715e59e0),
+ U64_C(0xafc0503c273aa42a), U64_C(0x6e3b543fec430bf5),
+ U64_C(0x704f7362213e8e83), U64_C(0x58ff0745db9294c0),
+ U64_C(0x67eec2df9feabf72), U64_C(0xa0facd9ccf8a6811),
+ U64_C(0xb936986ad890811a), U64_C(0x95c715c63bd9cb7a),
+ U64_C(0xca8060283a2c33c7), U64_C(0x507de84ee9453486),
+ U64_C(0x85ded6d05f6a96f6), U64_C(0x1cdad5964f81ade9),
+ U64_C(0xd5a33e9eb62fa270), U64_C(0x40642b588df6690a),
+ U64_C(0x7f75eec2c98e42b8), U64_C(0x2cf18dace3494a60),
+ U64_C(0x23cb100c0bf9865b), U64_C(0xeef3028febb2d9e1),
+ U64_C(0x4425d2d394133929), U64_C(0xaad6d05c7fa1e0c8),
+ U64_C(0xad6ea2f7a5c68cb5), U64_C(0xc2028f2308fb9381),
+ U64_C(0x819f2f5b468fc6d5), U64_C(0xc5bafd88d29cfffc),
+ U64_C(0x47dc59f357910577), U64_C(0x2b49ff07392e261d),
+ U64_C(0x57c59ae5332258fb), U64_C(0x73b6f842e2bcb2dd),
+ U64_C(0xcf96e04862b77725), U64_C(0x4ca73dd8a6c4996f),
+ U64_C(0x015779eb417e14c1), U64_C(0x37932a9176af8bf4) },
+ /* 4 */
+ { U64_C(0x190a2c9b249df23e), U64_C(0x2f62f8b62263e1e9),
+ U64_C(0x7a7f754740993655), U64_C(0x330b7ba4d5564d9f),
+ U64_C(0x4c17a16a46672582), U64_C(0xb22f08eb7d05f5b8),
+ U64_C(0x535f47f40bc148cc), U64_C(0x3aec5d27d4883037),
+ U64_C(0x10ed0a1825438f96), U64_C(0x516101f72c233d17),
+ U64_C(0x13cc6f949fd04eae), U64_C(0x739853c441474bfd),
+ U64_C(0x653793d90d3f5b1b), U64_C(0x5240647b96b0fc2f),
+ U64_C(0x0c84890ad27623e0), U64_C(0xd7189b32703aaea3),
+ U64_C(0x2685de3523bd9c41), U64_C(0x99317c5b11bffefa),
+ U64_C(0x0d9baa854f079703), U64_C(0x70b93648fbd48ac5),
+ U64_C(0xa80441fce30bc6be), U64_C(0x7287704bdc36ff1e),
+ U64_C(0xb65384ed33dc1f13), U64_C(0xd36417343ee34408),
+ U64_C(0x39cd38ab6e1bf10f), U64_C(0x5ab861770a1f3564),
+ U64_C(0x0ebacf09f594563b), U64_C(0xd04572b884708530),
+ U64_C(0x3cae9722bdb3af47), U64_C(0x4a556b6f2f5cbaf2),
+ U64_C(0xe1704f1f76c4bd74), U64_C(0x5ec4ed7144c6dfcf),
+ U64_C(0x16afc01d4c7810e6), U64_C(0x283f113cd629ca7a),
+ U64_C(0xaf59a8761741ed2d), U64_C(0xeed5a3991e215fac),
+ U64_C(0x3bf37ea849f984d4), U64_C(0xe413e096a56ce33c),
+ U64_C(0x2c439d3a98f020d1), U64_C(0x637559dc6404c46b),
+ U64_C(0x9e6c95d1e5f5d569), U64_C(0x24bb9836045fe99a),
+ U64_C(0x44efa466dac8ecc9), U64_C(0xc6eab2a5c80895d6),
+ U64_C(0x803b50c035220cc4), U64_C(0x0321658cba93c138),
+ U64_C(0x8f9ebc465dc7ee1c), U64_C(0xd15a5137190131d3),
+ U64_C(0x0fa5ec8668e5e2d8), U64_C(0x91c979578d1037b1),
+ U64_C(0x0642ca05693b9f70), U64_C(0xefca80168350eb4f),
+ U64_C(0x38d21b24f36a45ec), U64_C(0xbeab81e1af73d658),
+ U64_C(0x8cbfd9cae7542f24), U64_C(0xfd19cc0d81f11102),
+ U64_C(0x0ac6430fbb4dbc90), U64_C(0x1d76a09d6a441895),
+ U64_C(0x2a01573ff1cbbfa1), U64_C(0xb572e161894fde2b),
+ U64_C(0x8124734fa853b827), U64_C(0x614b1fdf43e6b1b0),
+ U64_C(0x68ac395c4238cc18), U64_C(0x21d837bfd7f7b7d2),
+ U64_C(0x20c714304a860331), U64_C(0x5cfaab726324aa14),
+ U64_C(0x74c5ba4eb50d606e), U64_C(0xf3a3030474654739),
+ U64_C(0x23e671bcf015c209), U64_C(0x45f087e947b9582a),
+ U64_C(0xd8bd77b418df4c7b), U64_C(0xe06f6c90ebb50997),
+ U64_C(0x0bd96080263c0873), U64_C(0x7e03f9410e40dcfe),
+ U64_C(0xb8e94be4c6484928), U64_C(0xfb5b0608e8ca8e72),
+ U64_C(0x1a2b49179e0e3306), U64_C(0x4e29e76961855059),
+ U64_C(0x4f36c4e6fcf4e4ba), U64_C(0x49740ee395cf7bca),
+ U64_C(0xc2963ea386d17f7d), U64_C(0x90d65ad810618352),
+ U64_C(0x12d34c1b02a1fa4d), U64_C(0xfa44258775bb3a91),
+ U64_C(0x18150f14b9ec46dd), U64_C(0x1491861e6b9a653d),
+ U64_C(0x9a1019d7ab2c3fc2), U64_C(0x3668d42d06fe13d7),
+ U64_C(0xdcc1fbb25606a6d0), U64_C(0x969490dd795a1c22),
+ U64_C(0x3549b1a1bc6dd2ef), U64_C(0xc94f5e23a0ed770e),
+ U64_C(0xb9f6686b5b39fdcb), U64_C(0xc4d4f4a6efeae00d),
+ U64_C(0xe732851a1fff2204), U64_C(0x94aad6de5eb869f9),
+ U64_C(0x3f8ff2ae07206e7f), U64_C(0xfe38a9813b62d03a),
+ U64_C(0xa7a1ad7a8bee2466), U64_C(0x7b6056c8dde882b6),
+ U64_C(0x302a1e286fc58ca7), U64_C(0x8da0fa457a259bc7),
+ U64_C(0xb3302b64e074415b), U64_C(0x5402ae7eff8b635f),
+ U64_C(0x08f8050c9cafc94b), U64_C(0xae468bf98a3059ce),
+ U64_C(0x88c355cca98dc58f), U64_C(0xb10e6d67c7963480),
+ U64_C(0xbad70de7e1aa3cf3), U64_C(0xbfb4a26e320262bb),
+ U64_C(0xcb711820870f02d5), U64_C(0xce12b7a954a75c9d),
+ U64_C(0x563ce87dd8691684), U64_C(0x9f73b65e7884618a),
+ U64_C(0x2b1e74b06cba0b42), U64_C(0x47cec1ea605b2df1),
+ U64_C(0x1c698312f735ac76), U64_C(0x5fdbcefed9b76b2c),
+ U64_C(0x831a354c8fb1cdfc), U64_C(0x820516c312c0791f),
+ U64_C(0xb74ca762aeadabf0), U64_C(0xfc06ef821c80a5e1),
+ U64_C(0x5723cbf24518a267), U64_C(0x9d4df05d5f661451),
+ U64_C(0x588627742dfd40bf), U64_C(0xda8331b73f3d39a0),
+ U64_C(0x17b0e392d109a405), U64_C(0xf965400bcf28fba9),
+ U64_C(0x7c3dbf4229a2a925), U64_C(0x023e460327e275db),
+ U64_C(0x6cd0b55a0ce126b3), U64_C(0xe62da695828e96e7),
+ U64_C(0x42ad6e63b3f373b9), U64_C(0xe50cc319381d57df),
+ U64_C(0xc5cbd729729b54ee), U64_C(0x46d1e265fd2a9912),
+ U64_C(0x6428b056904eeff8), U64_C(0x8be23040131e04b7),
+ U64_C(0x6709d5da2add2ec0), U64_C(0x075de98af44a2b93),
+ U64_C(0x8447dcc67bfbe66f), U64_C(0x6616f655b7ac9a23),
+ U64_C(0xd607b8bded4b1a40), U64_C(0x0563af89d3a85e48),
+ U64_C(0x3db1b4ad20c21ba4), U64_C(0x11f22997b8323b75),
+ U64_C(0x292032b34b587e99), U64_C(0x7f1cdace9331681d),
+ U64_C(0x8e819fc9c0b65aff), U64_C(0xa1e3677fe2d5bb16),
+ U64_C(0xcd33d225ee349da5), U64_C(0xd9a2543b85aef898),
+ U64_C(0x795e10cbfa0af76d), U64_C(0x25a4bbb9992e5d79),
+ U64_C(0x78413344677b438e), U64_C(0xf0826688cef68601),
+ U64_C(0xd27b34bba392f0eb), U64_C(0x551d8df162fad7bc),
+ U64_C(0x1e57c511d0d7d9ad), U64_C(0xdeffbdb171e4d30b),
+ U64_C(0xf4feea8e802f6caa), U64_C(0xa480c8f6317de55e),
+ U64_C(0xa0fc44f07fa40ff5), U64_C(0x95b5f551c3c9dd1a),
+ U64_C(0x22f952336d6476ea), U64_C(0x0000000000000000),
+ U64_C(0xa6be8ef5169f9085), U64_C(0xcc2cf1aa73452946),
+ U64_C(0x2e7ddb39bf12550a), U64_C(0xd526dd3157d8db78),
+ U64_C(0x486b2d6c08becf29), U64_C(0x9b0f3a58365d8b21),
+ U64_C(0xac78cdfaadd22c15), U64_C(0xbc95c7e28891a383),
+ U64_C(0x6a927f5f65dab9c3), U64_C(0xc3891d2c1ba0cb9e),
+ U64_C(0xeaa92f9f50f8b507), U64_C(0xcf0d9426c9d6e87e),
+ U64_C(0xca6e3baf1a7eb636), U64_C(0xab25247059980786),
+ U64_C(0x69b31ad3df4978fb), U64_C(0xe2512a93cc577c4c),
+ U64_C(0xff278a0ea61364d9), U64_C(0x71a615c766a53e26),
+ U64_C(0x89dc764334fc716c), U64_C(0xf87a638452594f4a),
+ U64_C(0xf2bc208be914f3da), U64_C(0x8766b94ac1682757),
+ U64_C(0xbbc82e687cdb8810), U64_C(0x626a7a53f9757088),
+ U64_C(0xa2c202f358467a2e), U64_C(0x4d0882e5db169161),
+ U64_C(0x09e7268301de7da8), U64_C(0xe897699c771ac0dc),
+ U64_C(0xc8507dac3d9cc3ed), U64_C(0xc0a878a0a1330aa6),
+ U64_C(0x978bb352e42ba8c1), U64_C(0xe9884a13ea6b743f),
+ U64_C(0x279afdbabecc28a2), U64_C(0x047c8c064ed9eaab),
+ U64_C(0x507e2278b15289f4), U64_C(0x599904fbb08cf45c),
+ U64_C(0xbd8ae46d15e01760), U64_C(0x31353da7f2b43844),
+ U64_C(0x8558ff49e68a528c), U64_C(0x76fbfc4d92ef15b5),
+ U64_C(0x3456922e211c660c), U64_C(0x86799ac55c1993b4),
+ U64_C(0x3e90d1219a51da9c), U64_C(0x2d5cbeb505819432),
+ U64_C(0x982e5fd48cce4a19), U64_C(0xdb9c1238a24c8d43),
+ U64_C(0xd439febecaa96f9b), U64_C(0x418c0bef0960b281),
+ U64_C(0x158ea591f6ebd1de), U64_C(0x1f48e69e4da66d4e),
+ U64_C(0x8afd13cf8e6fb054), U64_C(0xf5e1c9011d5ed849),
+ U64_C(0xe34e091c5126c8af), U64_C(0xad67ee7530a398f6),
+ U64_C(0x43b24dec2e82c75a), U64_C(0x75da99c1287cd48d),
+ U64_C(0x92e81cdb3783f689), U64_C(0xa3dd217cc537cecd),
+ U64_C(0x60543c50de970553), U64_C(0x93f73f54aaf2426a),
+ U64_C(0xa91b62737e7a725d), U64_C(0xf19d4507538732e2),
+ U64_C(0x77e4dfc20f9ea156), U64_C(0x7d229ccdb4d31dc6),
+ U64_C(0x1b346a98037f87e5), U64_C(0xedf4c615a4b29e94),
+ U64_C(0x4093286094110662), U64_C(0xb0114ee85ae78063),
+ U64_C(0x6ff1d0d6b672e78b), U64_C(0x6dcf96d591909250),
+ U64_C(0xdfe09e3eec9567e8), U64_C(0x3214582b4827f97c),
+ U64_C(0xb46dc2ee143e6ac8), U64_C(0xf6c0ac8da7cd1971),
+ U64_C(0xebb60c10cd8901e4), U64_C(0xf7df8f023abcad92),
+ U64_C(0x9c52d3d2c217a0b2), U64_C(0x6b8d5cd0f8ab0d20),
+ U64_C(0x3777f7a29b8fa734), U64_C(0x011f238f9d71b4e3),
+ U64_C(0xc1b75b2f3c42be45), U64_C(0x5de588fdfe551ef7),
+ U64_C(0x6eeef3592b035368), U64_C(0xaa3a07ffc4e9b365),
+ U64_C(0xecebe59a39c32a77), U64_C(0x5ba742f8976e8187),
+ U64_C(0x4b4a48e0b22d0e11), U64_C(0xddded83dcb771233),
+ U64_C(0xa59feb79ac0c51bd), U64_C(0xc7f5912a55792135) },
+ /* 5 */
+ { U64_C(0x6d6ae04668a9b08a), U64_C(0x3ab3f04b0be8c743),
+ U64_C(0xe51e166b54b3c908), U64_C(0xbe90a9eb35c2f139),
+ U64_C(0xb2c7066637f2bec1), U64_C(0xaa6945613392202c),
+ U64_C(0x9a28c36f3b5201eb), U64_C(0xddce5a93ab536994),
+ U64_C(0x0e34133ef6382827), U64_C(0x52a02ba1ec55048b),
+ U64_C(0xa2f88f97c4b2a177), U64_C(0x8640e513ca2251a5),
+ U64_C(0xcdf1d36258137622), U64_C(0xfe6cb708dedf8ddb),
+ U64_C(0x8a174a9ec8121e5d), U64_C(0x679896036b81560e),
+ U64_C(0x59ed033395795fee), U64_C(0x1dd778ab8b74edaf),
+ U64_C(0xee533ef92d9f926d), U64_C(0x2a8c79baf8a8d8f5),
+ U64_C(0x6bcf398e69b119f6), U64_C(0xe20491742fafdd95),
+ U64_C(0x276488e0809c2aec), U64_C(0xea955b82d88f5cce),
+ U64_C(0x7102c63a99d9e0c4), U64_C(0xf9763017a5c39946),
+ U64_C(0x429fa2501f151b3d), U64_C(0x4659c72bea05d59e),
+ U64_C(0x984b7fdccf5a6634), U64_C(0xf742232953fbb161),
+ U64_C(0x3041860e08c021c7), U64_C(0x747bfd9616cd9386),
+ U64_C(0x4bb1367192312787), U64_C(0x1b72a1638a6c44d3),
+ U64_C(0x4a0e68a6e8359a66), U64_C(0x169a5039f258b6ca),
+ U64_C(0xb98a2ef44edee5a4), U64_C(0xd9083fe85e43a737),
+ U64_C(0x967f6ce239624e13), U64_C(0x8874f62d3c1a7982),
+ U64_C(0x3c1629830af06e3f), U64_C(0x9165ebfd427e5a8e),
+ U64_C(0xb5dd81794ceeaa5c), U64_C(0x0de8f15a7834f219),
+ U64_C(0x70bd98ede3dd5d25), U64_C(0xaccc9ca9328a8950),
+ U64_C(0x56664eda1945ca28), U64_C(0x221db34c0f8859ae),
+ U64_C(0x26dbd637fa98970d), U64_C(0x1acdffb4f068f932),
+ U64_C(0x4585254f64090fa0), U64_C(0x72de245e17d53afa),
+ U64_C(0x1546b25d7c546cf4), U64_C(0x207e0ffffb803e71),
+ U64_C(0xfaaad2732bcf4378), U64_C(0xb462dfae36ea17bd),
+ U64_C(0xcf926fd1ac1b11fd), U64_C(0xe0672dc7dba7ba4a),
+ U64_C(0xd3fa49ad5d6b41b3), U64_C(0x8ba81449b216a3bc),
+ U64_C(0x14f9ec8a0650d115), U64_C(0x40fc1ee3eb1d7ce2),
+ U64_C(0x23a2ed9b758ce44f), U64_C(0x782c521b14fddc7e),
+ U64_C(0x1c68267cf170504e), U64_C(0xbcf31558c1ca96e6),
+ U64_C(0xa781b43b4ba6d235), U64_C(0xf6fd7dfe29ff0c80),
+ U64_C(0xb0a4bad5c3fad91e), U64_C(0xd199f51ea963266c),
+ U64_C(0x414340349119c103), U64_C(0x5405f269ed4dadf7),
+ U64_C(0xabd61bb649969dcd), U64_C(0x6813dbeae7bdc3c8),
+ U64_C(0x65fb2ab09f8931d1), U64_C(0xf1e7fae152e3181d),
+ U64_C(0xc1a67cef5a2339da), U64_C(0x7a4feea8e0f5bba1),
+ U64_C(0x1e0b9acf05783791), U64_C(0x5b8ebf8061713831),
+ U64_C(0x80e53cdbcb3af8d9), U64_C(0x7e898bd315e57502),
+ U64_C(0xc6bcfbf0213f2d47), U64_C(0x95a38e86b76e942d),
+ U64_C(0x092e94218d243cba), U64_C(0x8339debf453622e7),
+ U64_C(0xb11be402b9fe64ff), U64_C(0x57d9100d634177c9),
+ U64_C(0xcc4e8db52217cbc3), U64_C(0x3b0cae9c71ec7aa2),
+ U64_C(0xfb158ca451cbfe99), U64_C(0x2b33276d82ac6514),
+ U64_C(0x01bf5ed77a04bde1), U64_C(0xc5601994af33f779),
+ U64_C(0x75c4a3416cc92e67), U64_C(0xf3844652a6eb7fc2),
+ U64_C(0x3487e375fdd0ef64), U64_C(0x18ae430704609eed),
+ U64_C(0x4d14efb993298efb), U64_C(0x815a620cb13e4538),
+ U64_C(0x125c354207487869), U64_C(0x9eeea614ce42cf48),
+ U64_C(0xce2d3106d61fac1c), U64_C(0xbbe99247bad6827b),
+ U64_C(0x071a871f7b1c149d), U64_C(0x2e4a1cc10db81656),
+ U64_C(0x77a71ff298c149b8), U64_C(0x06a5d9c80118a97c),
+ U64_C(0xad73c27e488e34b1), U64_C(0x443a7b981e0db241),
+ U64_C(0xe3bbcfa355ab6074), U64_C(0x0af276450328e684),
+ U64_C(0x73617a896dd1871b), U64_C(0x58525de4ef7de20f),
+ U64_C(0xb7be3dcab8e6cd83), U64_C(0x19111dd07e64230c),
+ U64_C(0x842359a03e2a367a), U64_C(0x103f89f1f3401fb6),
+ U64_C(0xdc710444d157d475), U64_C(0xb835702334da5845),
+ U64_C(0x4320fc876511a6dc), U64_C(0xd026abc9d3679b8d),
+ U64_C(0x17250eee885c0b2b), U64_C(0x90dab52a387ae76f),
+ U64_C(0x31fed8d972c49c26), U64_C(0x89cba8fa461ec463),
+ U64_C(0x2ff5421677bcabb7), U64_C(0x396f122f85e41d7d),
+ U64_C(0xa09b332430bac6a8), U64_C(0xc888e8ced7070560),
+ U64_C(0xaeaf201ac682ee8f), U64_C(0x1180d7268944a257),
+ U64_C(0xf058a43628e7a5fc), U64_C(0xbd4c4b8fbbce2b07),
+ U64_C(0xa1246df34abe7b49), U64_C(0x7d5569b79be9af3c),
+ U64_C(0xa9b5a705bd9efa12), U64_C(0xdb6b835baa4bc0e8),
+ U64_C(0x05793bac8f147342), U64_C(0x21c1512881848390),
+ U64_C(0xfdb0556c50d357e5), U64_C(0x613d4fcb6a99ff72),
+ U64_C(0x03dce2648e0cda3e), U64_C(0xe949b9e6568386f0),
+ U64_C(0xfc0f0bbb2ad7ea04), U64_C(0x6a70675913b5a417),
+ U64_C(0x7f36d5046fe1c8e3), U64_C(0x0c57af8d02304ff8),
+ U64_C(0x32223abdfcc84618), U64_C(0x0891caf6f720815b),
+ U64_C(0xa63eeaec31a26fd4), U64_C(0x2507345374944d33),
+ U64_C(0x49d28ac266394058), U64_C(0xf5219f9aa7f3d6be),
+ U64_C(0x2d96fea583b4cc68), U64_C(0x5a31e1571b7585d0),
+ U64_C(0x8ed12fe53d02d0fe), U64_C(0xdfade6205f5b0e4b),
+ U64_C(0x4cabb16ee92d331a), U64_C(0x04c6657bf510cea3),
+ U64_C(0xd73c2cd6a87b8f10), U64_C(0xe1d87310a1a307ab),
+ U64_C(0x6cd5be9112ad0d6b), U64_C(0x97c032354366f3f2),
+ U64_C(0xd4e0ceb22677552e), U64_C(0x0000000000000000),
+ U64_C(0x29509bde76a402cb), U64_C(0xc27a9e8bd42fe3e4),
+ U64_C(0x5ef7842cee654b73), U64_C(0xaf107ecdbc86536e),
+ U64_C(0x3fcacbe784fcb401), U64_C(0xd55f90655c73e8cf),
+ U64_C(0xe6c2f40fdabf1336), U64_C(0xe8f6e7312c873b11),
+ U64_C(0xeb2a0555a28be12f), U64_C(0xe4a148bc2eb774e9),
+ U64_C(0x9b979db84156bc0a), U64_C(0x6eb60222e6a56ab4),
+ U64_C(0x87ffbbc4b026ec44), U64_C(0xc703a5275b3b90a6),
+ U64_C(0x47e699fc9001687f), U64_C(0x9c8d1aa73a4aa897),
+ U64_C(0x7cea3760e1ed12dd), U64_C(0x4ec80ddd1d2554c5),
+ U64_C(0x13e36b957d4cc588), U64_C(0x5d2b66486069914d),
+ U64_C(0x92b90999cc7280b0), U64_C(0x517cc9c56259deb5),
+ U64_C(0xc937b619ad03b881), U64_C(0xec30824ad997f5b2),
+ U64_C(0xa45d565fc5aa080b), U64_C(0xd6837201d27f32f1),
+ U64_C(0x635ef3789e9198ad), U64_C(0x531f75769651b96a),
+ U64_C(0x4f77530a6721e924), U64_C(0x486dd4151c3dfdb9),
+ U64_C(0x5f48dafb9461f692), U64_C(0x375b011173dc355a),
+ U64_C(0x3da9775470f4d3de), U64_C(0x8d0dcd81b30e0ac0),
+ U64_C(0x36e45fc609d888bb), U64_C(0x55baacbe97491016),
+ U64_C(0x8cb29356c90ab721), U64_C(0x76184125e2c5f459),
+ U64_C(0x99f4210bb55edbd5), U64_C(0x6f095cf59ca1d755),
+ U64_C(0x9f51f8c3b44672a9), U64_C(0x3538bda287d45285),
+ U64_C(0x50c39712185d6354), U64_C(0xf23b1885dcefc223),
+ U64_C(0x79930ccc6ef9619f), U64_C(0xed8fdc9da3934853),
+ U64_C(0xcb540aaa590bdf5e), U64_C(0x5c94389f1a6d2cac),
+ U64_C(0xe77daad8a0bbaed7), U64_C(0x28efc5090ca0bf2a),
+ U64_C(0xbf2ff73c4fc64cd8), U64_C(0xb37858b14df60320),
+ U64_C(0xf8c96ec0dfc724a7), U64_C(0x828680683f329f06),
+ U64_C(0x941cd051cd6a29cc), U64_C(0xc3c5c05cae2b5e05),
+ U64_C(0xb601631dc2e27062), U64_C(0xc01922382027843b),
+ U64_C(0x24b86a840e90f0d2), U64_C(0xd245177a276ffc52),
+ U64_C(0x0f8b4de98c3c95c6), U64_C(0x3e759530fef809e0),
+ U64_C(0x0b4d2892792c5b65), U64_C(0xc4df4743d5374a98),
+ U64_C(0xa5e20888bfaeb5ea), U64_C(0xba56cc90c0d23f9a),
+ U64_C(0x38d04cf8ffe0a09c), U64_C(0x62e1adafe495254c),
+ U64_C(0x0263bcb3f40867df), U64_C(0xcaeb547d230f62bf),
+ U64_C(0x6082111c109d4293), U64_C(0xdad4dd8cd04f7d09),
+ U64_C(0xefec602e579b2f8c), U64_C(0x1fb4c4187f7c8a70),
+ U64_C(0xffd3e9dfa4db303a), U64_C(0x7bf0b07f9af10640),
+ U64_C(0xf49ec14dddf76b5f), U64_C(0x8f6e713247066d1f),
+ U64_C(0x339d646a86ccfbf9), U64_C(0x64447467e58d8c30),
+ U64_C(0x2c29a072f9b07189), U64_C(0xd8b7613f24471ad6),
+ U64_C(0x6627c8d41185ebef), U64_C(0xa347d140beb61c96),
+ U64_C(0xde12b8f7255fb3aa), U64_C(0x9d324470404e1576),
+ U64_C(0x9306574eb6763d51), U64_C(0xa80af9d2c79a47f3),
+ U64_C(0x859c0777442e8b9b), U64_C(0x69ac853d9db97e29) },
+ /* 6 */
+ { U64_C(0xc3407dfc2de6377e), U64_C(0x5b9e93eea4256f77),
+ U64_C(0xadb58fdd50c845e0), U64_C(0x5219ff11a75bed86),
+ U64_C(0x356b61cfd90b1de9), U64_C(0xfb8f406e25abe037),
+ U64_C(0x7a5a0231c0f60796), U64_C(0x9d3cd216e1f5020b),
+ U64_C(0x0c6550fb6b48d8f3), U64_C(0xf57508c427ff1c62),
+ U64_C(0x4ad35ffa71cb407d), U64_C(0x6290a2da1666aa6d),
+ U64_C(0xe284ec2349355f9f), U64_C(0xb3c307c53d7c84ec),
+ U64_C(0x05e23c0468365a02), U64_C(0x190bac4d6c9ebfa8),
+ U64_C(0x94bbbee9e28b80fa), U64_C(0xa34fc777529cb9b5),
+ U64_C(0xcc7b39f095bcd978), U64_C(0x2426addb0ce532e3),
+ U64_C(0x7e79329312ce4fc7), U64_C(0xab09a72eebec2917),
+ U64_C(0xf8d15499f6b9d6c2), U64_C(0x1a55b8babf8c895d),
+ U64_C(0xdb8add17fb769a85), U64_C(0xb57f2f368658e81b),
+ U64_C(0x8acd36f18f3f41f6), U64_C(0x5ce3b7bba50f11d3),
+ U64_C(0x114dcc14d5ee2f0a), U64_C(0xb91a7fcded1030e8),
+ U64_C(0x81d5425fe55de7a1), U64_C(0xb6213bc1554adeee),
+ U64_C(0x80144ef95f53f5f2), U64_C(0x1e7688186db4c10c),
+ U64_C(0x3b912965db5fe1bc), U64_C(0xc281715a97e8252d),
+ U64_C(0x54a5d7e21c7f8171), U64_C(0x4b12535ccbc5522e),
+ U64_C(0x1d289cefbea6f7f9), U64_C(0x6ef5f2217d2e729e),
+ U64_C(0xe6a7dc819b0d17ce), U64_C(0x1b94b41c05829b0e),
+ U64_C(0x33d7493c622f711e), U64_C(0xdcf7f942fa5ce421),
+ U64_C(0x600fba8b7f7a8ecb), U64_C(0x46b60f011a83988e),
+ U64_C(0x235b898e0dcf4c47), U64_C(0x957ab24f588592a9),
+ U64_C(0x4354330572b5c28c), U64_C(0xa5f3ef84e9b8d542),
+ U64_C(0x8c711e02341b2d01), U64_C(0x0b1874ae6a62a657),
+ U64_C(0x1213d8e306fc19ff), U64_C(0xfe6d7c6a4d9dba35),
+ U64_C(0x65ed868f174cd4c9), U64_C(0x88522ea0e6236550),
+ U64_C(0x899322065c2d7703), U64_C(0xc01e690bfef4018b),
+ U64_C(0x915982ed8abddaf8), U64_C(0xbe675b98ec3a4e4c),
+ U64_C(0xa996bf7f82f00db1), U64_C(0xe1daf8d49a27696a),
+ U64_C(0x2effd5d3dc8986e7), U64_C(0xd153a51f2b1a2e81),
+ U64_C(0x18caa0ebd690adfb), U64_C(0x390e3134b243c51a),
+ U64_C(0x2778b92cdff70416), U64_C(0x029f1851691c24a6),
+ U64_C(0x5e7cafeacc133575), U64_C(0xfa4e4cc89fa5f264),
+ U64_C(0x5a5f9f481e2b7d24), U64_C(0x484c47ab18d764db),
+ U64_C(0x400a27f2a1a7f479), U64_C(0xaeeb9b2a83da7315),
+ U64_C(0x721c626879869734), U64_C(0x042330a2d2384851),
+ U64_C(0x85f672fd3765aff0), U64_C(0xba446b3a3e02061d),
+ U64_C(0x73dd6ecec3888567), U64_C(0xffac70ccf793a866),
+ U64_C(0xdfa9edb5294ed2d4), U64_C(0x6c6aea7014325638),
+ U64_C(0x834a5a0e8c41c307), U64_C(0xcdba35562fb2cb2b),
+ U64_C(0x0ad97808d06cb404), U64_C(0x0f3b440cb85aee06),
+ U64_C(0xe5f9c876481f213b), U64_C(0x98deee1289c35809),
+ U64_C(0x59018bbfcd394bd1), U64_C(0xe01bf47220297b39),
+ U64_C(0xde68e1139340c087), U64_C(0x9fa3ca4788e926ad),
+ U64_C(0xbb85679c840c144e), U64_C(0x53d8f3b71d55ffd5),
+ U64_C(0x0da45c5dd146caa0), U64_C(0x6f34fe87c72060cd),
+ U64_C(0x57fbc315cf6db784), U64_C(0xcee421a1fca0fdde),
+ U64_C(0x3d2d0196607b8d4b), U64_C(0x642c8a29ad42c69a),
+ U64_C(0x14aff010bdd87508), U64_C(0xac74837beac657b3),
+ U64_C(0x3216459ad821634d), U64_C(0x3fb219c70967a9ed),
+ U64_C(0x06bc28f3bb246cf7), U64_C(0xf2082c9126d562c6),
+ U64_C(0x66b39278c45ee23c), U64_C(0xbd394f6f3f2878b9),
+ U64_C(0xfd33689d9e8f8cc0), U64_C(0x37f4799eb017394f),
+ U64_C(0x108cc0b26fe03d59), U64_C(0xda4bd1b1417888d6),
+ U64_C(0xb09d1332ee6eb219), U64_C(0x2f3ed975668794b4),
+ U64_C(0x58c0871977375982), U64_C(0x7561463d78ace990),
+ U64_C(0x09876cff037e82f1), U64_C(0x7fb83e35a8c05d94),
+ U64_C(0x26b9b58a65f91645), U64_C(0xef20b07e9873953f),
+ U64_C(0x3148516d0b3355b8), U64_C(0x41cb2b541ba9e62a),
+ U64_C(0x790416c613e43163), U64_C(0xa011d380818e8f40),
+ U64_C(0x3a5025c36151f3ef), U64_C(0xd57095bdf92266d0),
+ U64_C(0x498d4b0da2d97688), U64_C(0x8b0c3a57353153a5),
+ U64_C(0x21c491df64d368e1), U64_C(0x8f2f0af5e7091bf4),
+ U64_C(0x2da1c1240f9bb012), U64_C(0xc43d59a92ccc49da),
+ U64_C(0xbfa6573e56345c1f), U64_C(0x828b56a8364fd154),
+ U64_C(0x9a41f643e0df7caf), U64_C(0xbcf843c985266aea),
+ U64_C(0x2b1de9d7b4bfdce5), U64_C(0x20059d79dedd7ab2),
+ U64_C(0x6dabe6d6ae3c446b), U64_C(0x45e81bf6c991ae7b),
+ U64_C(0x6351ae7cac68b83e), U64_C(0xa432e32253b6c711),
+ U64_C(0xd092a9b991143cd2), U64_C(0xcac711032e98b58f),
+ U64_C(0xd8d4c9e02864ac70), U64_C(0xc5fc550f96c25b89),
+ U64_C(0xd7ef8dec903e4276), U64_C(0x67729ede7e50f06f),
+ U64_C(0xeac28c7af045cf3d), U64_C(0xb15c1f945460a04a),
+ U64_C(0x9cfddeb05bfb1058), U64_C(0x93c69abce3a1fe5e),
+ U64_C(0xeb0380dc4a4bdd6e), U64_C(0xd20db1e8f8081874),
+ U64_C(0x229a8528b7c15e14), U64_C(0x44291750739fbc28),
+ U64_C(0xd3ccbd4e42060a27), U64_C(0xf62b1c33f4ed2a97),
+ U64_C(0x86a8660ae4779905), U64_C(0xd62e814a2a305025),
+ U64_C(0x477703a7a08d8add), U64_C(0x7b9b0e977af815c5),
+ U64_C(0x78c51a60a9ea2330), U64_C(0xa6adfb733aaae3b7),
+ U64_C(0x97e5aa1e3199b60f), U64_C(0x0000000000000000),
+ U64_C(0xf4b404629df10e31), U64_C(0x5564db44a6719322),
+ U64_C(0x9207961a59afec0d), U64_C(0x9624a6b88b97a45c),
+ U64_C(0x363575380a192b1c), U64_C(0x2c60cd82b595a241),
+ U64_C(0x7d272664c1dc7932), U64_C(0x7142769faa94a1c1),
+ U64_C(0xa1d0df263b809d13), U64_C(0x1630e841d4c451ae),
+ U64_C(0xc1df65ad44fa13d8), U64_C(0x13d2d445bcf20bac),
+ U64_C(0xd915c546926abe23), U64_C(0x38cf3d92084dd749),
+ U64_C(0xe766d0272103059d), U64_C(0xc7634d5effde7f2f),
+ U64_C(0x077d2455012a7ea4), U64_C(0xedbfa82ff16fb199),
+ U64_C(0xaf2a978c39d46146), U64_C(0x42953fa3c8bbd0df),
+ U64_C(0xcb061da59496a7dc), U64_C(0x25e7a17db6eb20b0),
+ U64_C(0x34aa6d6963050fba), U64_C(0xa76cf7d580a4f1e4),
+ U64_C(0xf7ea10954ee338c4), U64_C(0xfcf2643b24819e93),
+ U64_C(0xcf252d0746aeef8d), U64_C(0x4ef06f58a3f3082c),
+ U64_C(0x563acfb37563a5d7), U64_C(0x5086e740ce47c920),
+ U64_C(0x2982f186dda3f843), U64_C(0x87696aac5e798b56),
+ U64_C(0x5d22bb1d1f010380), U64_C(0x035e14f7d31236f5),
+ U64_C(0x3cec0d30da759f18), U64_C(0xf3c920379cdb7095),
+ U64_C(0xb8db736b571e22bb), U64_C(0xdd36f5e44052f672),
+ U64_C(0xaac8ab8851e23b44), U64_C(0xa857b3d938fe1fe2),
+ U64_C(0x17f1e4e76eca43fd), U64_C(0xec7ea4894b61a3ca),
+ U64_C(0x9e62c6e132e734fe), U64_C(0xd4b1991b432c7483),
+ U64_C(0x6ad6c283af163acf), U64_C(0x1ce9904904a8e5aa),
+ U64_C(0x5fbda34c761d2726), U64_C(0xf910583f4cb7c491),
+ U64_C(0xc6a241f845d06d7c), U64_C(0x4f3163fe19fd1a7f),
+ U64_C(0xe99c988d2357f9c8), U64_C(0x8eee06535d0709a7),
+ U64_C(0x0efa48aa0254fc55), U64_C(0xb4be23903c56fa48),
+ U64_C(0x763f52caabbedf65), U64_C(0xeee1bcd8227d876c),
+ U64_C(0xe345e085f33b4dcc), U64_C(0x3e731561b369bbbe),
+ U64_C(0x2843fd2067adea10), U64_C(0x2adce5710eb1ceb6),
+ U64_C(0xb7e03767ef44ccbd), U64_C(0x8db012a48e153f52),
+ U64_C(0x61ceb62dc5749c98), U64_C(0xe85d942b9959eb9b),
+ U64_C(0x4c6f7709caef2c8a), U64_C(0x84377e5b8d6bbda3),
+ U64_C(0x30895dcbb13d47eb), U64_C(0x74a04a9bc2a2fbc3),
+ U64_C(0x6b17ce251518289c), U64_C(0xe438c4d0f2113368),
+ U64_C(0x1fb784bed7bad35f), U64_C(0x9b80fae55ad16efc),
+ U64_C(0x77fe5e6c11b0cd36), U64_C(0xc858095247849129),
+ U64_C(0x08466059b97090a2), U64_C(0x01c10ca6ba0e1253),
+ U64_C(0x6988d6747c040c3a), U64_C(0x6849dad2c60a1e69),
+ U64_C(0x5147ebe67449db73), U64_C(0xc99905f4fd8a837a),
+ U64_C(0x991fe2b433cd4a5a), U64_C(0xf09734c04fc94660),
+ U64_C(0xa28ecbd1e892abe6), U64_C(0xf1563866f5c75433),
+ U64_C(0x4dae7baf70e13ed9), U64_C(0x7ce62ac27bd26b61),
+ U64_C(0x70837a39109ab392), U64_C(0x90988e4b30b3c8ab),
+ U64_C(0xb2020b63877296bf), U64_C(0x156efcb607d6675b) },
+ /* 7 */
+ { U64_C(0xe63f55ce97c331d0), U64_C(0x25b506b0015bba16),
+ U64_C(0xc8706e29e6ad9ba8), U64_C(0x5b43d3775d521f6a),
+ U64_C(0x0bfa3d577035106e), U64_C(0xab95fc172afb0e66),
+ U64_C(0xf64b63979e7a3276), U64_C(0xf58b4562649dad4b),
+ U64_C(0x48f7c3dbae0c83f1), U64_C(0xff31916642f5c8c5),
+ U64_C(0xcbb048dc1c4a0495), U64_C(0x66b8f83cdf622989),
+ U64_C(0x35c130e908e2b9b0), U64_C(0x7c761a61f0b34fa1),
+ U64_C(0x3601161cf205268d), U64_C(0x9e54ccfe2219b7d6),
+ U64_C(0x8b7d90a538940837), U64_C(0x9cd403588ea35d0b),
+ U64_C(0xbc3c6fea9ccc5b5a), U64_C(0xe5ff733b6d24aeed),
+ U64_C(0xceed22de0f7eb8d2), U64_C(0xec8581cab1ab545e),
+ U64_C(0xb96105e88ff8e71d), U64_C(0x8ca03501871a5ead),
+ U64_C(0x76ccce65d6db2a2f), U64_C(0x5883f582a7b58057),
+ U64_C(0x3f7be4ed2e8adc3e), U64_C(0x0fe7be06355cd9c9),
+ U64_C(0xee054e6c1d11be83), U64_C(0x1074365909b903a6),
+ U64_C(0x5dde9f80b4813c10), U64_C(0x4a770c7d02b6692c),
+ U64_C(0x5379c8d5d7809039), U64_C(0xb4067448161ed409),
+ U64_C(0x5f5e5026183bd6cd), U64_C(0xe898029bf4c29df9),
+ U64_C(0x7fb63c940a54d09c), U64_C(0xc5171f897f4ba8bc),
+ U64_C(0xa6f28db7b31d3d72), U64_C(0x2e4f3be7716eaa78),
+ U64_C(0x0d6771a099e63314), U64_C(0x82076254e41bf284),
+ U64_C(0x2f0fd2b42733df98), U64_C(0x5c9e76d3e2dc49f0),
+ U64_C(0x7aeb569619606cdb), U64_C(0x83478b07b2468764),
+ U64_C(0xcfadcb8d5923cd32), U64_C(0x85dac7f05b95a41e),
+ U64_C(0xb5469d1b4043a1e9), U64_C(0xb821ecbbd9a592fd),
+ U64_C(0x1b8e0b0e798c13c8), U64_C(0x62a57b6d9a0be02e),
+ U64_C(0xfcf1b793b81257f8), U64_C(0x9d94ea0bd8fe28eb),
+ U64_C(0x4cea408aeb654a56), U64_C(0x23284a47e888996c),
+ U64_C(0x2d8f1d128b893545), U64_C(0xf4cbac3132c0d8ab),
+ U64_C(0xbd7c86b9ca912eba), U64_C(0x3a268eef3dbe6079),
+ U64_C(0xf0d62f6077a9110c), U64_C(0x2735c916ade150cb),
+ U64_C(0x89fd5f03942ee2ea), U64_C(0x1acee25d2fd16628),
+ U64_C(0x90f39bab41181bff), U64_C(0x430dfe8cde39939f),
+ U64_C(0xf70b8ac4c8274796), U64_C(0x1c53aeaac6024552),
+ U64_C(0x13b410acf35e9c9b), U64_C(0xa532ab4249faa24f),
+ U64_C(0x2b1251e5625a163f), U64_C(0xd7e3e676da4841c7),
+ U64_C(0xa7b264e4e5404892), U64_C(0xda8497d643ae72d3),
+ U64_C(0x861ae105a1723b23), U64_C(0x38a6414991048aa4),
+ U64_C(0x6578dec92585b6b4), U64_C(0x0280cfa6acbaeadd),
+ U64_C(0x88bdb650c273970a), U64_C(0x9333bd5ebbff84c2),
+ U64_C(0x4e6a8f2c47dfa08b), U64_C(0x321c954db76cef2a),
+ U64_C(0x418d312a72837942), U64_C(0xb29b38bfffcdf773),
+ U64_C(0x6c022c38f90a4c07), U64_C(0x5a033a240b0f6a8a),
+ U64_C(0x1f93885f3ce5da6f), U64_C(0xc38a537e96988bc6),
+ U64_C(0x39e6a81ac759ff44), U64_C(0x29929e43cee0fce2),
+ U64_C(0x40cdd87924de0ca2), U64_C(0xe9d8ebc8a29fe819),
+ U64_C(0x0c2798f3cfbb46f4), U64_C(0x55e484223e53b343),
+ U64_C(0x4650948ecd0d2fd8), U64_C(0x20e86cb2126f0651),
+ U64_C(0x6d42c56baf5739e7), U64_C(0xa06fc1405ace1e08),
+ U64_C(0x7babbfc54f3d193b), U64_C(0x424d17df8864e67f),
+ U64_C(0xd8045870ef14980e), U64_C(0xc6d7397c85ac3781),
+ U64_C(0x21a885e1443273b1), U64_C(0x67f8116f893f5c69),
+ U64_C(0x24f5efe35706cff6), U64_C(0xd56329d076f2ab1a),
+ U64_C(0x5e1eb9754e66a32d), U64_C(0x28d2771098bd8902),
+ U64_C(0x8f6013f47dfdc190), U64_C(0x17a993fdb637553c),
+ U64_C(0xe0a219397e1012aa), U64_C(0x786b9930b5da8606),
+ U64_C(0x6e82e39e55b0a6da), U64_C(0x875a0856f72f4ec3),
+ U64_C(0x3741ff4fa458536d), U64_C(0xac4859b3957558fc),
+ U64_C(0x7ef6d5c75c09a57c), U64_C(0xc04a758b6c7f14fb),
+ U64_C(0xf9acdd91ab26ebbf), U64_C(0x7391a467c5ef9668),
+ U64_C(0x335c7c1ee1319aca), U64_C(0xa91533b18641e4bb),
+ U64_C(0xe4bf9a683b79db0d), U64_C(0x8e20faa72ba0b470),
+ U64_C(0x51f907737b3a7ae4), U64_C(0x2268a314bed5ec8c),
+ U64_C(0xd944b123b949edee), U64_C(0x31dcb3b84d8b7017),
+ U64_C(0xd3fe65279f218860), U64_C(0x097af2f1dc8ffab3),
+ U64_C(0x9b09a6fc312d0b91), U64_C(0xcc6ded78a3c4520f),
+ U64_C(0x3481d9ba5ebfcc50), U64_C(0x4f2a667f1182d56b),
+ U64_C(0xdfd9fdd4509ace94), U64_C(0x26752045fbbc252b),
+ U64_C(0xbffc491f662bc467), U64_C(0xdd593272fc202449),
+ U64_C(0x3cbbc218d46d4303), U64_C(0x91b372f817456e1f),
+ U64_C(0x681faf69bc6385a0), U64_C(0xb686bbeebaa43ed4),
+ U64_C(0x1469b5084cd0ca01), U64_C(0x98c98009cbca94ac),
+ U64_C(0x6438379a73d8c354), U64_C(0xc2caba2dc0c5fe26),
+ U64_C(0x3e3b0dbe78d7a9de), U64_C(0x50b9ee202d670f04),
+ U64_C(0x4590b27b37eab0e5), U64_C(0x6025b4cb36b10af3),
+ U64_C(0xfb2c1237079c0162), U64_C(0xa12f28130c936be8),
+ U64_C(0x4b37e52e54eb1ccc), U64_C(0x083a1ba28ad28f53),
+ U64_C(0xc10a9cd83a22611b), U64_C(0x9f1425ad7444c236),
+ U64_C(0x069d4cf7e9d3237a), U64_C(0xedc56899e7f621be),
+ U64_C(0x778c273680865fcf), U64_C(0x309c5aeb1bd605f7),
+ U64_C(0x8de0dc52d1472b4d), U64_C(0xf8ec34c2fd7b9e5f),
+ U64_C(0xea18cd3d58787724), U64_C(0xaad515447ca67b86),
+ U64_C(0x9989695a9d97e14c), U64_C(0x0000000000000000),
+ U64_C(0xf196c63321f464ec), U64_C(0x71116bc169557cb5),
+ U64_C(0xaf887f466f92c7c1), U64_C(0x972e3e0ffe964d65),
+ U64_C(0x190ec4a8d536f915), U64_C(0x95aef1a9522ca7b8),
+ U64_C(0xdc19db21aa7d51a9), U64_C(0x94ee18fa0471d258),
+ U64_C(0x8087adf248a11859), U64_C(0xc457f6da2916dd5c),
+ U64_C(0xfa6cfb6451c17482), U64_C(0xf256e0c6db13fbd1),
+ U64_C(0x6a9f60cf10d96f7d), U64_C(0x4daaa9d9bd383fb6),
+ U64_C(0x03c026f5fae79f3d), U64_C(0xde99148706c7bb74),
+ U64_C(0x2a52b8b6340763df), U64_C(0x6fc20acd03edd33a),
+ U64_C(0xd423c08320afdefa), U64_C(0xbbe1ca4e23420dc0),
+ U64_C(0x966ed75ca8cb3885), U64_C(0xeb58246e0e2502c4),
+ U64_C(0x055d6a021334bc47), U64_C(0xa47242111fa7d7af),
+ U64_C(0xe3623fcc84f78d97), U64_C(0x81c744a11efc6db9),
+ U64_C(0xaec8961539cfb221), U64_C(0xf31609958d4e8e31),
+ U64_C(0x63e5923ecc5695ce), U64_C(0x47107ddd9b505a38),
+ U64_C(0xa3afe7b5a0298135), U64_C(0x792b7063e387f3e6),
+ U64_C(0x0140e953565d75e0), U64_C(0x12f4f9ffa503e97b),
+ U64_C(0x750ce8902c3cb512), U64_C(0xdbc47e8515f30733),
+ U64_C(0x1ed3610c6ab8af8f), U64_C(0x5239218681dde5d9),
+ U64_C(0xe222d69fd2aaf877), U64_C(0xfe71783514a8bd25),
+ U64_C(0xcaf0a18f4a177175), U64_C(0x61655d9860ec7f13),
+ U64_C(0xe77fbc9dc19e4430), U64_C(0x2ccff441ddd440a5),
+ U64_C(0x16e97aaee06a20dc), U64_C(0xa855dae2d01c915b),
+ U64_C(0x1d1347f9905f30b2), U64_C(0xb7c652bdecf94b34),
+ U64_C(0xd03e43d265c6175d), U64_C(0xfdb15ec0ee4f2218),
+ U64_C(0x57644b8492e9599e), U64_C(0x07dda5a4bf8e569a),
+ U64_C(0x54a46d71680ec6a3), U64_C(0x5624a2d7c4b42c7e),
+ U64_C(0xbebca04c3076b187), U64_C(0x7d36f332a6ee3a41),
+ U64_C(0x3b6667bc6be31599), U64_C(0x695f463aea3ef040),
+ U64_C(0xad08b0e0c3282d1c), U64_C(0xb15b1e4a052a684e),
+ U64_C(0x44d05b2861b7c505), U64_C(0x15295c5b1a8dbfe1),
+ U64_C(0x744c01c37a61c0f2), U64_C(0x59c31cd1f1e8f5b7),
+ U64_C(0xef45a73f4b4ccb63), U64_C(0x6bdf899c46841a9d),
+ U64_C(0x3dfb2b4b823036e3), U64_C(0xa2ef0ee6f674f4d5),
+ U64_C(0x184e2dfb836b8cf5), U64_C(0x1134df0a5fe47646),
+ U64_C(0xbaa1231d751f7820), U64_C(0xd17eaa81339b62bd),
+ U64_C(0xb01bf71953771dae), U64_C(0x849a2ea30dc8d1fe),
+ U64_C(0x705182923f080955), U64_C(0x0ea757556301ac29),
+ U64_C(0x041d83514569c9a7), U64_C(0x0abad4042668658e),
+ U64_C(0x49b72a88f851f611), U64_C(0x8a3d79f66ec97dd7),
+ U64_C(0xcd2d042bf59927ef), U64_C(0xc930877ab0f0ee48),
+ U64_C(0x9273540deda2f122), U64_C(0xc797d02fd3f14261),
+ U64_C(0xe1e2f06a284d674a), U64_C(0xd2be8c74c97cfd80),
+ U64_C(0x9a494faf67707e71), U64_C(0xb3dbd1eca9908293),
+ U64_C(0x72d14d3493b2e388), U64_C(0xd6a30f258c153427) },
+};
+
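+/* The twelve iteration (round) constants C_1 .. C_12 of GOST R 34.11-2012
+ * (Streebog), each stored as eight 64-bit words.  They are XORed into the
+ * round-key schedule of the compression function below. */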
+static const u64 C16[12][8] =
+{
+ { U64_C(0xdd806559f2a64507), U64_C(0x05767436cc744d23),
+ U64_C(0xa2422a08a460d315), U64_C(0x4b7ce09192676901),
+ U64_C(0x714eb88d7585c4fc), U64_C(0x2f6a76432e45d016),
+ U64_C(0xebcb2f81c0657c1f), U64_C(0xb1085bda1ecadae9) },
+ { U64_C(0xe679047021b19bb7), U64_C(0x55dda21bd7cbcd56),
+ U64_C(0x5cb561c2db0aa7ca), U64_C(0x9ab5176b12d69958),
+ U64_C(0x61d55e0f16b50131), U64_C(0xf3feea720a232b98),
+ U64_C(0x4fe39d460f70b5d7), U64_C(0x6fa3b58aa99d2f1a) },
+ { U64_C(0x991e96f50aba0ab2), U64_C(0xc2b6f443867adb31),
+ U64_C(0xc1c93a376062db09), U64_C(0xd3e20fe490359eb1),
+ U64_C(0xf2ea7514b1297b7b), U64_C(0x06f15e5f529c1f8b),
+ U64_C(0x0a39fc286a3d8435), U64_C(0xf574dcac2bce2fc7) },
+ { U64_C(0x220cbebc84e3d12e), U64_C(0x3453eaa193e837f1),
+ U64_C(0xd8b71333935203be), U64_C(0xa9d72c82ed03d675),
+ U64_C(0x9d721cad685e353f), U64_C(0x488e857e335c3c7d),
+ U64_C(0xf948e1a05d71e4dd), U64_C(0xef1fdfb3e81566d2) },
+ { U64_C(0x601758fd7c6cfe57), U64_C(0x7a56a27ea9ea63f5),
+ U64_C(0xdfff00b723271a16), U64_C(0xbfcd1747253af5a3),
+ U64_C(0x359e35d7800fffbd), U64_C(0x7f151c1f1686104a),
+ U64_C(0x9a3f410c6ca92363), U64_C(0x4bea6bacad474799) },
+ { U64_C(0xfa68407a46647d6e), U64_C(0xbf71c57236904f35),
+ U64_C(0x0af21f66c2bec6b6), U64_C(0xcffaa6b71c9ab7b4),
+ U64_C(0x187f9ab49af08ec6), U64_C(0x2d66c4f95142a46c),
+ U64_C(0x6fa4c33b7a3039c0), U64_C(0xae4faeae1d3ad3d9) },
+ { U64_C(0x8886564d3a14d493), U64_C(0x3517454ca23c4af3),
+ U64_C(0x06476983284a0504), U64_C(0x0992abc52d822c37),
+ U64_C(0xd3473e33197a93c9), U64_C(0x399ec6c7e6bf87c9),
+ U64_C(0x51ac86febf240954), U64_C(0xf4c70e16eeaac5ec) },
+ { U64_C(0xa47f0dd4bf02e71e), U64_C(0x36acc2355951a8d9),
+ U64_C(0x69d18d2bd1a5c42f), U64_C(0xf4892bcb929b0690),
+ U64_C(0x89b4443b4ddbc49a), U64_C(0x4eb7f8719c36de1e),
+ U64_C(0x03e7aa020c6e4141), U64_C(0x9b1f5b424d93c9a7) },
+ { U64_C(0x7261445183235adb), U64_C(0x0e38dc92cb1f2a60),
+ U64_C(0x7b2b8a9aa6079c54), U64_C(0x800a440bdbb2ceb1),
+ U64_C(0x3cd955b7e00d0984), U64_C(0x3a7d3a1b25894224),
+ U64_C(0x944c9ad8ec165fde), U64_C(0x378f5a541631229b) },
+ { U64_C(0x74b4c7fb98459ced), U64_C(0x3698fad1153bb6c3),
+ U64_C(0x7a1e6c303b7652f4), U64_C(0x9fe76702af69334b),
+ U64_C(0x1fffe18a1b336103), U64_C(0x8941e71cff8a78db),
+ U64_C(0x382ae548b2e4f3f3), U64_C(0xabbedea680056f52) },
+ { U64_C(0x6bcaa4cd81f32d1b), U64_C(0xdea2594ac06fd85d),
+ U64_C(0xefbacd1d7d476e98), U64_C(0x8a1d71efea48b9ca),
+ U64_C(0x2001802114846679), U64_C(0xd8fa6bbbebab0761),
+ U64_C(0x3002c6cd635afe94), U64_C(0x7bcd9ed0efc889fb) },
+ { U64_C(0x48bc924af11bd720), U64_C(0xfaf417d5d9b21b99),
+ U64_C(0xe71da4aa88e12852), U64_C(0x5d80ef9d1891cc86),
+ U64_C(0xf82012d430219f9b), U64_C(0xcda43c32bcdf1d77),
+ U64_C(0xd21380b00449b17a), U64_C(0x378ee767f11631ba) },
+};
+
+
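+/* Compute output word I of the combined S (substitution), P (permutation)
+ * and L (linear) step: byte I of each of the eight input words selects an
+ * entry in the corresponding precomputed table, and the eight entries are
+ * XORed together. */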
+#define strido(out, temp, i) do { \
+ u64 t; \
+ t = stribog_table[0][(temp[0] >> (i * 8)) & 0xff]; \
+ t ^= stribog_table[1][(temp[1] >> (i * 8)) & 0xff]; \
+ t ^= stribog_table[2][(temp[2] >> (i * 8)) & 0xff]; \
+ t ^= stribog_table[3][(temp[3] >> (i * 8)) & 0xff]; \
+ t ^= stribog_table[4][(temp[4] >> (i * 8)) & 0xff]; \
+ t ^= stribog_table[5][(temp[5] >> (i * 8)) & 0xff]; \
+ t ^= stribog_table[6][(temp[6] >> (i * 8)) & 0xff]; \
+ t ^= stribog_table[7][(temp[7] >> (i * 8)) & 0xff]; \
+ out[i] = t; } while(0)
+
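+/* X step followed by the table-driven L o P o S transform: XOR the two
+ * 512-bit inputs and run the result through the lookup tables above. */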
+static void LPSX (u64 *out, const u64 *a, const u64 *b)
+{
+ u64 temp[8];
+ temp[0] = a[0] ^ b[0];
+ temp[1] = a[1] ^ b[1];
+ temp[2] = a[2] ^ b[2];
+ temp[3] = a[3] ^ b[3];
+ temp[4] = a[4] ^ b[4];
+ temp[5] = a[5] ^ b[5];
+ temp[6] = a[6] ^ b[6];
+ temp[7] = a[7] ^ b[7];
+ strido (out, temp, 0);
+ strido (out, temp, 1);
+ strido (out, temp, 2);
+ strido (out, temp, 3);
+ strido (out, temp, 4);
+ strido (out, temp, 5);
+ strido (out, temp, 6);
+ strido (out, temp, 7);
+}
+
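+/* Compression function g_N(h, m): derive the initial round key from h and
+ * the counter N, run twelve LPSX rounds over the message block, and fold
+ * the result back into the chaining value together with m
+ * (Miyaguchi-Preneel style construction). */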
+static inline void g (u64 *h, u64 *m, u64 *N)
+{
+ u64 K[8];
+ u64 T[8];
+ int i;
+
+ LPSX (K, h, N);
+
+ LPSX (T, K, m);
+ LPSX (K, K, C16[0]);
+ for (i = 1; i < 12; i++)
+ {
+ LPSX (T, K, T);
+ LPSX (K, K, C16[i]);
+ }
+
+ h[0] ^= T[0] ^ K[0] ^ m[0];
+ h[1] ^= T[1] ^ K[1] ^ m[1];
+ h[2] ^= T[2] ^ K[2] ^ m[2];
+ h[3] ^= T[3] ^ K[3] ^ m[3];
+ h[4] ^= T[4] ^ K[4] ^ m[4];
+ h[5] ^= T[5] ^ K[5] ^ m[5];
+ h[6] ^= T[6] ^ K[6] ^ m[6];
+ h[7] ^= T[7] ^ K[7] ^ m[7];
+}
+
+
+static unsigned int
+transform (void *context, const unsigned char *inbuf_arg, size_t datalen);
+
+
+static void
+stribog_init_512 (void *context, unsigned int flags)
+{
+ STRIBOG_CONTEXT *hd = context;
+
+ (void)flags;
+
+ memset (hd, 0, sizeof (*hd));
+
+ hd->bctx.blocksize_shift = _gcry_ctz(64);
+ hd->bctx.bwrite = transform;
+}
+
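+/* The 256-bit variant uses the same machinery as the 512-bit one; the only
+ * difference at init time is the IV, where every byte of the initial state
+ * is 0x01 instead of 0x00. */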
+static void
+stribog_init_256 (void *context, unsigned int flags)
+{
+ STRIBOG_CONTEXT *hd = context;
+
+ stribog_init_512 (context, flags);
+ memset (hd->h, 1, 64);
+}
+
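+/* Process one 64-byte block: mix it into the state via g, add COUNT bits to
+ * the 512-bit message length counter N, and add the block into the 512-bit
+ * checksum Sigma, propagating carries by hand across the eight words. */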
+static void
+transform_bits (STRIBOG_CONTEXT *hd, const unsigned char *data, unsigned count)
+{
+ u64 M[8];
+ u64 l, cf;
+ int i;
+
+ for (i = 0; i < 8; i++)
+ M[i] = buf_get_le64(data + i * 8);
+
+ g (hd->h, M, hd->N);
+ l = hd->N[0];
+ hd->N[0] += count;
+ if (hd->N[0] < l)
+ { /* overflow */
+ for (i = 1; i < 8; i++)
+ {
+ hd->N[i]++;
+ if (hd->N[i] != 0)
+ break;
+ }
+ }
+
+ hd->Sigma[0] += M[0];
+ cf = 0;
+ for (i = 1; i < 8; i++)
+ {
+ if (hd->Sigma[i-1] != M[i-1])
+ cf = (hd->Sigma[i-1] < M[i-1]);
+ hd->Sigma[i] += M[i] + cf;
+ }
+}
+
+static unsigned int
+transform_blk (void *context, const unsigned char *inbuf_arg)
+{
+ STRIBOG_CONTEXT *hd = context;
+
+ transform_bits (hd, inbuf_arg, 64 * 8);
+
+ return /* burn_stack */ 768;
+}
+
+static unsigned int
+transform ( void *c, const unsigned char *data, size_t nblks )
+{
+ unsigned int burn;
+
+ do
+ {
+ burn = transform_blk (c, data);
+ data += 64;
+ }
+ while (--nblks);
+
+ return burn;
+}
+
+/*
+   The routine finally terminates the computation and returns the
+   digest.  The handle is prepared for a new cycle, but adding bytes
+   to the handle will destroy the returned buffer.  Returns: 64
+   bytes with the message digest; the Stribog-256 variant uses the
+   last 32 of them.  */
+static void
+stribog_final (void *context)
+{
+ STRIBOG_CONTEXT *hd = context;
+ u64 Z[8] = {};
+ int i;
+
+ /* PAD. It does not count towards message length */
+ i = hd->bctx.count;
+ /* After flush we have at least one byte free. */
+ hd->bctx.buf[i++] = 1;
+ if (i < 64)
+ memset (&hd->bctx.buf[i], 0, 64 - i);
+ i = 64;
+ transform_bits (hd, hd->bctx.buf, hd->bctx.count * 8);
+
+ g (hd->h, hd->N, Z);
+ g (hd->h, hd->Sigma, Z);
+
+ for (i = 0; i < 8; i++)
+ hd->h[i] = le_bswap64(hd->h[i]);
+
+ hd->bctx.count = 0;
+
+ _gcry_burn_stack (768);
+}
+
+static byte *
+stribog_read_512 (void *context)
+{
+ STRIBOG_CONTEXT *hd = context;
+
+ return hd->result;
+}
+
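+/* The 256-bit digest is taken from the second half of the 64-byte state
+ * buffer. */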
+static byte *
+stribog_read_256 (void *context)
+{
+ STRIBOG_CONTEXT *hd = context;
+
+ return hd->result + 32;
+}
+
+static gcry_md_oid_spec_t oid_spec_stribog256[] =
+ {
+ /* id-tc26-signwithdigest-gost3410-12-256 */
+ { "1.2.643.7.1.1.3.2" },
+ /* id-tc26-gost3411-12-256 */
+ { "1.2.643.7.1.1.2.2" },
+ { NULL },
+ };
+
+static gcry_md_oid_spec_t oid_spec_stribog512[] =
+ {
+ /* id-tc26-signwithdigest-gost3410-12-512 */
+ { "1.2.643.7.1.1.3.3" },
+ /* id-tc26-gost3411-12-512 */
+ { "1.2.643.7.1.1.2.3" },
+ { NULL },
+ };
+
+gcry_md_spec_t _gcry_digest_spec_stribog_256 =
+ {
+ GCRY_MD_STRIBOG256, {0, 0},
+ "STRIBOG256", NULL, 0, oid_spec_stribog256, 32,
+ stribog_init_256, _gcry_md_block_write, stribog_final, stribog_read_256,
+ NULL, NULL, NULL,
+ sizeof (STRIBOG_CONTEXT)
+ };
+
+gcry_md_spec_t _gcry_digest_spec_stribog_512 =
+ {
+ GCRY_MD_STRIBOG512, {0, 0},
+ "STRIBOG512", NULL, 0, oid_spec_stribog512, 64,
+ stribog_init_512, _gcry_md_block_write, stribog_final, stribog_read_512,
+ NULL, NULL, NULL,
+ sizeof (STRIBOG_CONTEXT)
+ };
diff --git a/comm/third_party/libgcrypt/cipher/tiger.c b/comm/third_party/libgcrypt/cipher/tiger.c
new file mode 100644
index 0000000000..4039b22b1c
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/tiger.c
@@ -0,0 +1,860 @@
+/* tiger.c - The TIGER hash function
+ * Copyright (C) 1998, 2001, 2002, 2003, 2010 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+/* See http://www.cs.technion.ac.il/~biham/Reports/Tiger/ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "hash-common.h"
+#include "bithelp.h"
+#include "bufhelp.h"
+
+typedef struct
+{
+ gcry_md_block_ctx_t bctx;
+ u64 a, b, c;
+ int variant; /* 0 = old code, 1 = fixed code, 2 = TIGER2. */
+} TIGER_CONTEXT;
+
+
+/*********************************
+ * Okay, okay, this is not the fastest code - improvements are welcome.
+ *
+ */
+
+/* Some test vectors:
+ * "" 24F0130C63AC9332 16166E76B1BB925F F373DE2D49584E7A
+ * "abc" F258C1E88414AB2A 527AB541FFC5B8BF 935F7B951C132951
+ * "Tiger" 9F00F599072300DD 276ABB38C8EB6DEC 37790C116F9D2BDF
+ * "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-"
+ * 87FB2A9083851CF7 470D2CF810E6DF9E B586445034A5A386
+ * "ABCDEFGHIJKLMNOPQRSTUVWXYZ=abcdefghijklmnopqrstuvwxyz+0123456789"
+ * 467DB80863EBCE48 8DF1CD1261655DE9 57896565975F9197
+ * "Tiger - A Fast New Hash Function, by Ross Anderson and Eli Biham"
+ * 0C410A042968868A 1671DA5A3FD29A72 5EC1E457D3CDB303
+ * "Tiger - A Fast New Hash Function, by Ross Anderson and Eli Biham, proc"
+ * "eedings of Fast Software Encryption 3, Cambridge."
+ * EBF591D5AFA655CE 7F22894FF87F54AC 89C811B6B0DA3193
+ * "Tiger - A Fast New Hash Function, by Ross Anderson and Eli Biham, proc"
+ * "eedings of Fast Software Encryption 3, Cambridge, 1996."
+ * 3D9AEB03D1BD1A63 57B2774DFD6D5B24 DD68151D503974FC
+ * "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-ABCDEF"
+ * "GHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-"
+ * 00B83EB4E53440C5 76AC6AAEE0A74858 25FD15E70A59FFE4
+ */
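+
+/* For a quick check against the vectors above, a digest can be computed
+ * through the generic libgcrypt API.  A minimal sketch (assuming an
+ * initialized libgcrypt; GCRY_MD_TIGER1 selects the fixed variant and
+ * yields a 24-byte digest):
+ *
+ *   unsigned char digest[24];
+ *   gcry_md_hash_buffer (GCRY_MD_TIGER1, digest, "abc", 3);
+ *
+ * Note that the vectors above list the three 64-bit state words a, b, c;
+ * the byte order of the returned digest differs between the old TIGER
+ * variant and TIGER1/TIGER2.
+ */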
+
+static u64 sbox1[256] = {
+ U64_C(0x02aab17cf7e90c5e) /* 0 */, U64_C(0xac424b03e243a8ec) /* 1 */,
+ U64_C(0x72cd5be30dd5fcd3) /* 2 */, U64_C(0x6d019b93f6f97f3a) /* 3 */,
+ U64_C(0xcd9978ffd21f9193) /* 4 */, U64_C(0x7573a1c9708029e2) /* 5 */,
+ U64_C(0xb164326b922a83c3) /* 6 */, U64_C(0x46883eee04915870) /* 7 */,
+ U64_C(0xeaace3057103ece6) /* 8 */, U64_C(0xc54169b808a3535c) /* 9 */,
+ U64_C(0x4ce754918ddec47c) /* 10 */, U64_C(0x0aa2f4dfdc0df40c) /* 11 */,
+ U64_C(0x10b76f18a74dbefa) /* 12 */, U64_C(0xc6ccb6235ad1ab6a) /* 13 */,
+ U64_C(0x13726121572fe2ff) /* 14 */, U64_C(0x1a488c6f199d921e) /* 15 */,
+ U64_C(0x4bc9f9f4da0007ca) /* 16 */, U64_C(0x26f5e6f6e85241c7) /* 17 */,
+ U64_C(0x859079dbea5947b6) /* 18 */, U64_C(0x4f1885c5c99e8c92) /* 19 */,
+ U64_C(0xd78e761ea96f864b) /* 20 */, U64_C(0x8e36428c52b5c17d) /* 21 */,
+ U64_C(0x69cf6827373063c1) /* 22 */, U64_C(0xb607c93d9bb4c56e) /* 23 */,
+ U64_C(0x7d820e760e76b5ea) /* 24 */, U64_C(0x645c9cc6f07fdc42) /* 25 */,
+ U64_C(0xbf38a078243342e0) /* 26 */, U64_C(0x5f6b343c9d2e7d04) /* 27 */,
+ U64_C(0xf2c28aeb600b0ec6) /* 28 */, U64_C(0x6c0ed85f7254bcac) /* 29 */,
+ U64_C(0x71592281a4db4fe5) /* 30 */, U64_C(0x1967fa69ce0fed9f) /* 31 */,
+ U64_C(0xfd5293f8b96545db) /* 32 */, U64_C(0xc879e9d7f2a7600b) /* 33 */,
+ U64_C(0x860248920193194e) /* 34 */, U64_C(0xa4f9533b2d9cc0b3) /* 35 */,
+ U64_C(0x9053836c15957613) /* 36 */, U64_C(0xdb6dcf8afc357bf1) /* 37 */,
+ U64_C(0x18beea7a7a370f57) /* 38 */, U64_C(0x037117ca50b99066) /* 39 */,
+ U64_C(0x6ab30a9774424a35) /* 40 */, U64_C(0xf4e92f02e325249b) /* 41 */,
+ U64_C(0x7739db07061ccae1) /* 42 */, U64_C(0xd8f3b49ceca42a05) /* 43 */,
+ U64_C(0xbd56be3f51382f73) /* 44 */, U64_C(0x45faed5843b0bb28) /* 45 */,
+ U64_C(0x1c813d5c11bf1f83) /* 46 */, U64_C(0x8af0e4b6d75fa169) /* 47 */,
+ U64_C(0x33ee18a487ad9999) /* 48 */, U64_C(0x3c26e8eab1c94410) /* 49 */,
+ U64_C(0xb510102bc0a822f9) /* 50 */, U64_C(0x141eef310ce6123b) /* 51 */,
+ U64_C(0xfc65b90059ddb154) /* 52 */, U64_C(0xe0158640c5e0e607) /* 53 */,
+ U64_C(0x884e079826c3a3cf) /* 54 */, U64_C(0x930d0d9523c535fd) /* 55 */,
+ U64_C(0x35638d754e9a2b00) /* 56 */, U64_C(0x4085fccf40469dd5) /* 57 */,
+ U64_C(0xc4b17ad28be23a4c) /* 58 */, U64_C(0xcab2f0fc6a3e6a2e) /* 59 */,
+ U64_C(0x2860971a6b943fcd) /* 60 */, U64_C(0x3dde6ee212e30446) /* 61 */,
+ U64_C(0x6222f32ae01765ae) /* 62 */, U64_C(0x5d550bb5478308fe) /* 63 */,
+ U64_C(0xa9efa98da0eda22a) /* 64 */, U64_C(0xc351a71686c40da7) /* 65 */,
+ U64_C(0x1105586d9c867c84) /* 66 */, U64_C(0xdcffee85fda22853) /* 67 */,
+ U64_C(0xccfbd0262c5eef76) /* 68 */, U64_C(0xbaf294cb8990d201) /* 69 */,
+ U64_C(0xe69464f52afad975) /* 70 */, U64_C(0x94b013afdf133e14) /* 71 */,
+ U64_C(0x06a7d1a32823c958) /* 72 */, U64_C(0x6f95fe5130f61119) /* 73 */,
+ U64_C(0xd92ab34e462c06c0) /* 74 */, U64_C(0xed7bde33887c71d2) /* 75 */,
+ U64_C(0x79746d6e6518393e) /* 76 */, U64_C(0x5ba419385d713329) /* 77 */,
+ U64_C(0x7c1ba6b948a97564) /* 78 */, U64_C(0x31987c197bfdac67) /* 79 */,
+ U64_C(0xde6c23c44b053d02) /* 80 */, U64_C(0x581c49fed002d64d) /* 81 */,
+ U64_C(0xdd474d6338261571) /* 82 */, U64_C(0xaa4546c3e473d062) /* 83 */,
+ U64_C(0x928fce349455f860) /* 84 */, U64_C(0x48161bbacaab94d9) /* 85 */,
+ U64_C(0x63912430770e6f68) /* 86 */, U64_C(0x6ec8a5e602c6641c) /* 87 */,
+ U64_C(0x87282515337ddd2b) /* 88 */, U64_C(0x2cda6b42034b701b) /* 89 */,
+ U64_C(0xb03d37c181cb096d) /* 90 */, U64_C(0xe108438266c71c6f) /* 91 */,
+ U64_C(0x2b3180c7eb51b255) /* 92 */, U64_C(0xdf92b82f96c08bbc) /* 93 */,
+ U64_C(0x5c68c8c0a632f3ba) /* 94 */, U64_C(0x5504cc861c3d0556) /* 95 */,
+ U64_C(0xabbfa4e55fb26b8f) /* 96 */, U64_C(0x41848b0ab3baceb4) /* 97 */,
+ U64_C(0xb334a273aa445d32) /* 98 */, U64_C(0xbca696f0a85ad881) /* 99 */,
+ U64_C(0x24f6ec65b528d56c) /* 100 */, U64_C(0x0ce1512e90f4524a) /* 101 */,
+ U64_C(0x4e9dd79d5506d35a) /* 102 */, U64_C(0x258905fac6ce9779) /* 103 */,
+ U64_C(0x2019295b3e109b33) /* 104 */, U64_C(0xf8a9478b73a054cc) /* 105 */,
+ U64_C(0x2924f2f934417eb0) /* 106 */, U64_C(0x3993357d536d1bc4) /* 107 */,
+ U64_C(0x38a81ac21db6ff8b) /* 108 */, U64_C(0x47c4fbf17d6016bf) /* 109 */,
+ U64_C(0x1e0faadd7667e3f5) /* 110 */, U64_C(0x7abcff62938beb96) /* 111 */,
+ U64_C(0xa78dad948fc179c9) /* 112 */, U64_C(0x8f1f98b72911e50d) /* 113 */,
+ U64_C(0x61e48eae27121a91) /* 114 */, U64_C(0x4d62f7ad31859808) /* 115 */,
+ U64_C(0xeceba345ef5ceaeb) /* 116 */, U64_C(0xf5ceb25ebc9684ce) /* 117 */,
+ U64_C(0xf633e20cb7f76221) /* 118 */, U64_C(0xa32cdf06ab8293e4) /* 119 */,
+ U64_C(0x985a202ca5ee2ca4) /* 120 */, U64_C(0xcf0b8447cc8a8fb1) /* 121 */,
+ U64_C(0x9f765244979859a3) /* 122 */, U64_C(0xa8d516b1a1240017) /* 123 */,
+ U64_C(0x0bd7ba3ebb5dc726) /* 124 */, U64_C(0xe54bca55b86adb39) /* 125 */,
+ U64_C(0x1d7a3afd6c478063) /* 126 */, U64_C(0x519ec608e7669edd) /* 127 */,
+ U64_C(0x0e5715a2d149aa23) /* 128 */, U64_C(0x177d4571848ff194) /* 129 */,
+ U64_C(0xeeb55f3241014c22) /* 130 */, U64_C(0x0f5e5ca13a6e2ec2) /* 131 */,
+ U64_C(0x8029927b75f5c361) /* 132 */, U64_C(0xad139fabc3d6e436) /* 133 */,
+ U64_C(0x0d5df1a94ccf402f) /* 134 */, U64_C(0x3e8bd948bea5dfc8) /* 135 */,
+ U64_C(0xa5a0d357bd3ff77e) /* 136 */, U64_C(0xa2d12e251f74f645) /* 137 */,
+ U64_C(0x66fd9e525e81a082) /* 138 */, U64_C(0x2e0c90ce7f687a49) /* 139 */,
+ U64_C(0xc2e8bcbeba973bc5) /* 140 */, U64_C(0x000001bce509745f) /* 141 */,
+ U64_C(0x423777bbe6dab3d6) /* 142 */, U64_C(0xd1661c7eaef06eb5) /* 143 */,
+ U64_C(0xa1781f354daacfd8) /* 144 */, U64_C(0x2d11284a2b16affc) /* 145 */,
+ U64_C(0xf1fc4f67fa891d1f) /* 146 */, U64_C(0x73ecc25dcb920ada) /* 147 */,
+ U64_C(0xae610c22c2a12651) /* 148 */, U64_C(0x96e0a810d356b78a) /* 149 */,
+ U64_C(0x5a9a381f2fe7870f) /* 150 */, U64_C(0xd5ad62ede94e5530) /* 151 */,
+ U64_C(0xd225e5e8368d1427) /* 152 */, U64_C(0x65977b70c7af4631) /* 153 */,
+ U64_C(0x99f889b2de39d74f) /* 154 */, U64_C(0x233f30bf54e1d143) /* 155 */,
+ U64_C(0x9a9675d3d9a63c97) /* 156 */, U64_C(0x5470554ff334f9a8) /* 157 */,
+ U64_C(0x166acb744a4f5688) /* 158 */, U64_C(0x70c74caab2e4aead) /* 159 */,
+ U64_C(0xf0d091646f294d12) /* 160 */, U64_C(0x57b82a89684031d1) /* 161 */,
+ U64_C(0xefd95a5a61be0b6b) /* 162 */, U64_C(0x2fbd12e969f2f29a) /* 163 */,
+ U64_C(0x9bd37013feff9fe8) /* 164 */, U64_C(0x3f9b0404d6085a06) /* 165 */,
+ U64_C(0x4940c1f3166cfe15) /* 166 */, U64_C(0x09542c4dcdf3defb) /* 167 */,
+ U64_C(0xb4c5218385cd5ce3) /* 168 */, U64_C(0xc935b7dc4462a641) /* 169 */,
+ U64_C(0x3417f8a68ed3b63f) /* 170 */, U64_C(0xb80959295b215b40) /* 171 */,
+ U64_C(0xf99cdaef3b8c8572) /* 172 */, U64_C(0x018c0614f8fcb95d) /* 173 */,
+ U64_C(0x1b14accd1a3acdf3) /* 174 */, U64_C(0x84d471f200bb732d) /* 175 */,
+ U64_C(0xc1a3110e95e8da16) /* 176 */, U64_C(0x430a7220bf1a82b8) /* 177 */,
+ U64_C(0xb77e090d39df210e) /* 178 */, U64_C(0x5ef4bd9f3cd05e9d) /* 179 */,
+ U64_C(0x9d4ff6da7e57a444) /* 180 */, U64_C(0xda1d60e183d4a5f8) /* 181 */,
+ U64_C(0xb287c38417998e47) /* 182 */, U64_C(0xfe3edc121bb31886) /* 183 */,
+ U64_C(0xc7fe3ccc980ccbef) /* 184 */, U64_C(0xe46fb590189bfd03) /* 185 */,
+ U64_C(0x3732fd469a4c57dc) /* 186 */, U64_C(0x7ef700a07cf1ad65) /* 187 */,
+ U64_C(0x59c64468a31d8859) /* 188 */, U64_C(0x762fb0b4d45b61f6) /* 189 */,
+ U64_C(0x155baed099047718) /* 190 */, U64_C(0x68755e4c3d50baa6) /* 191 */,
+ U64_C(0xe9214e7f22d8b4df) /* 192 */, U64_C(0x2addbf532eac95f4) /* 193 */,
+ U64_C(0x32ae3909b4bd0109) /* 194 */, U64_C(0x834df537b08e3450) /* 195 */,
+ U64_C(0xfa209da84220728d) /* 196 */, U64_C(0x9e691d9b9efe23f7) /* 197 */,
+ U64_C(0x0446d288c4ae8d7f) /* 198 */, U64_C(0x7b4cc524e169785b) /* 199 */,
+ U64_C(0x21d87f0135ca1385) /* 200 */, U64_C(0xcebb400f137b8aa5) /* 201 */,
+ U64_C(0x272e2b66580796be) /* 202 */, U64_C(0x3612264125c2b0de) /* 203 */,
+ U64_C(0x057702bdad1efbb2) /* 204 */, U64_C(0xd4babb8eacf84be9) /* 205 */,
+ U64_C(0x91583139641bc67b) /* 206 */, U64_C(0x8bdc2de08036e024) /* 207 */,
+ U64_C(0x603c8156f49f68ed) /* 208 */, U64_C(0xf7d236f7dbef5111) /* 209 */,
+ U64_C(0x9727c4598ad21e80) /* 210 */, U64_C(0xa08a0896670a5fd7) /* 211 */,
+ U64_C(0xcb4a8f4309eba9cb) /* 212 */, U64_C(0x81af564b0f7036a1) /* 213 */,
+ U64_C(0xc0b99aa778199abd) /* 214 */, U64_C(0x959f1ec83fc8e952) /* 215 */,
+ U64_C(0x8c505077794a81b9) /* 216 */, U64_C(0x3acaaf8f056338f0) /* 217 */,
+ U64_C(0x07b43f50627a6778) /* 218 */, U64_C(0x4a44ab49f5eccc77) /* 219 */,
+ U64_C(0x3bc3d6e4b679ee98) /* 220 */, U64_C(0x9cc0d4d1cf14108c) /* 221 */,
+ U64_C(0x4406c00b206bc8a0) /* 222 */, U64_C(0x82a18854c8d72d89) /* 223 */,
+ U64_C(0x67e366b35c3c432c) /* 224 */, U64_C(0xb923dd61102b37f2) /* 225 */,
+ U64_C(0x56ab2779d884271d) /* 226 */, U64_C(0xbe83e1b0ff1525af) /* 227 */,
+ U64_C(0xfb7c65d4217e49a9) /* 228 */, U64_C(0x6bdbe0e76d48e7d4) /* 229 */,
+ U64_C(0x08df828745d9179e) /* 230 */, U64_C(0x22ea6a9add53bd34) /* 231 */,
+ U64_C(0xe36e141c5622200a) /* 232 */, U64_C(0x7f805d1b8cb750ee) /* 233 */,
+ U64_C(0xafe5c7a59f58e837) /* 234 */, U64_C(0xe27f996a4fb1c23c) /* 235 */,
+ U64_C(0xd3867dfb0775f0d0) /* 236 */, U64_C(0xd0e673de6e88891a) /* 237 */,
+ U64_C(0x123aeb9eafb86c25) /* 238 */, U64_C(0x30f1d5d5c145b895) /* 239 */,
+ U64_C(0xbb434a2dee7269e7) /* 240 */, U64_C(0x78cb67ecf931fa38) /* 241 */,
+ U64_C(0xf33b0372323bbf9c) /* 242 */, U64_C(0x52d66336fb279c74) /* 243 */,
+ U64_C(0x505f33ac0afb4eaa) /* 244 */, U64_C(0xe8a5cd99a2cce187) /* 245 */,
+ U64_C(0x534974801e2d30bb) /* 246 */, U64_C(0x8d2d5711d5876d90) /* 247 */,
+ U64_C(0x1f1a412891bc038e) /* 248 */, U64_C(0xd6e2e71d82e56648) /* 249 */,
+ U64_C(0x74036c3a497732b7) /* 250 */, U64_C(0x89b67ed96361f5ab) /* 251 */,
+ U64_C(0xffed95d8f1ea02a2) /* 252 */, U64_C(0xe72b3bd61464d43d) /* 253 */,
+ U64_C(0xa6300f170bdc4820) /* 254 */, U64_C(0xebc18760ed78a77a) /* 255 */
+};
+static u64 sbox2[256] = {
+ U64_C(0xe6a6be5a05a12138) /* 256 */, U64_C(0xb5a122a5b4f87c98) /* 257 */,
+ U64_C(0x563c6089140b6990) /* 258 */, U64_C(0x4c46cb2e391f5dd5) /* 259 */,
+ U64_C(0xd932addbc9b79434) /* 260 */, U64_C(0x08ea70e42015aff5) /* 261 */,
+ U64_C(0xd765a6673e478cf1) /* 262 */, U64_C(0xc4fb757eab278d99) /* 263 */,
+ U64_C(0xdf11c6862d6e0692) /* 264 */, U64_C(0xddeb84f10d7f3b16) /* 265 */,
+ U64_C(0x6f2ef604a665ea04) /* 266 */, U64_C(0x4a8e0f0ff0e0dfb3) /* 267 */,
+ U64_C(0xa5edeef83dbcba51) /* 268 */, U64_C(0xfc4f0a2a0ea4371e) /* 269 */,
+ U64_C(0xe83e1da85cb38429) /* 270 */, U64_C(0xdc8ff882ba1b1ce2) /* 271 */,
+ U64_C(0xcd45505e8353e80d) /* 272 */, U64_C(0x18d19a00d4db0717) /* 273 */,
+ U64_C(0x34a0cfeda5f38101) /* 274 */, U64_C(0x0be77e518887caf2) /* 275 */,
+ U64_C(0x1e341438b3c45136) /* 276 */, U64_C(0xe05797f49089ccf9) /* 277 */,
+ U64_C(0xffd23f9df2591d14) /* 278 */, U64_C(0x543dda228595c5cd) /* 279 */,
+ U64_C(0x661f81fd99052a33) /* 280 */, U64_C(0x8736e641db0f7b76) /* 281 */,
+ U64_C(0x15227725418e5307) /* 282 */, U64_C(0xe25f7f46162eb2fa) /* 283 */,
+ U64_C(0x48a8b2126c13d9fe) /* 284 */, U64_C(0xafdc541792e76eea) /* 285 */,
+ U64_C(0x03d912bfc6d1898f) /* 286 */, U64_C(0x31b1aafa1b83f51b) /* 287 */,
+ U64_C(0xf1ac2796e42ab7d9) /* 288 */, U64_C(0x40a3a7d7fcd2ebac) /* 289 */,
+ U64_C(0x1056136d0afbbcc5) /* 290 */, U64_C(0x7889e1dd9a6d0c85) /* 291 */,
+ U64_C(0xd33525782a7974aa) /* 292 */, U64_C(0xa7e25d09078ac09b) /* 293 */,
+ U64_C(0xbd4138b3eac6edd0) /* 294 */, U64_C(0x920abfbe71eb9e70) /* 295 */,
+ U64_C(0xa2a5d0f54fc2625c) /* 296 */, U64_C(0xc054e36b0b1290a3) /* 297 */,
+ U64_C(0xf6dd59ff62fe932b) /* 298 */, U64_C(0x3537354511a8ac7d) /* 299 */,
+ U64_C(0xca845e9172fadcd4) /* 300 */, U64_C(0x84f82b60329d20dc) /* 301 */,
+ U64_C(0x79c62ce1cd672f18) /* 302 */, U64_C(0x8b09a2add124642c) /* 303 */,
+ U64_C(0xd0c1e96a19d9e726) /* 304 */, U64_C(0x5a786a9b4ba9500c) /* 305 */,
+ U64_C(0x0e020336634c43f3) /* 306 */, U64_C(0xc17b474aeb66d822) /* 307 */,
+ U64_C(0x6a731ae3ec9baac2) /* 308 */, U64_C(0x8226667ae0840258) /* 309 */,
+ U64_C(0x67d4567691caeca5) /* 310 */, U64_C(0x1d94155c4875adb5) /* 311 */,
+ U64_C(0x6d00fd985b813fdf) /* 312 */, U64_C(0x51286efcb774cd06) /* 313 */,
+ U64_C(0x5e8834471fa744af) /* 314 */, U64_C(0xf72ca0aee761ae2e) /* 315 */,
+ U64_C(0xbe40e4cdaee8e09a) /* 316 */, U64_C(0xe9970bbb5118f665) /* 317 */,
+ U64_C(0x726e4beb33df1964) /* 318 */, U64_C(0x703b000729199762) /* 319 */,
+ U64_C(0x4631d816f5ef30a7) /* 320 */, U64_C(0xb880b5b51504a6be) /* 321 */,
+ U64_C(0x641793c37ed84b6c) /* 322 */, U64_C(0x7b21ed77f6e97d96) /* 323 */,
+ U64_C(0x776306312ef96b73) /* 324 */, U64_C(0xae528948e86ff3f4) /* 325 */,
+ U64_C(0x53dbd7f286a3f8f8) /* 326 */, U64_C(0x16cadce74cfc1063) /* 327 */,
+ U64_C(0x005c19bdfa52c6dd) /* 328 */, U64_C(0x68868f5d64d46ad3) /* 329 */,
+ U64_C(0x3a9d512ccf1e186a) /* 330 */, U64_C(0x367e62c2385660ae) /* 331 */,
+ U64_C(0xe359e7ea77dcb1d7) /* 332 */, U64_C(0x526c0773749abe6e) /* 333 */,
+ U64_C(0x735ae5f9d09f734b) /* 334 */, U64_C(0x493fc7cc8a558ba8) /* 335 */,
+ U64_C(0xb0b9c1533041ab45) /* 336 */, U64_C(0x321958ba470a59bd) /* 337 */,
+ U64_C(0x852db00b5f46c393) /* 338 */, U64_C(0x91209b2bd336b0e5) /* 339 */,
+ U64_C(0x6e604f7d659ef19f) /* 340 */, U64_C(0xb99a8ae2782ccb24) /* 341 */,
+ U64_C(0xccf52ab6c814c4c7) /* 342 */, U64_C(0x4727d9afbe11727b) /* 343 */,
+ U64_C(0x7e950d0c0121b34d) /* 344 */, U64_C(0x756f435670ad471f) /* 345 */,
+ U64_C(0xf5add442615a6849) /* 346 */, U64_C(0x4e87e09980b9957a) /* 347 */,
+ U64_C(0x2acfa1df50aee355) /* 348 */, U64_C(0xd898263afd2fd556) /* 349 */,
+ U64_C(0xc8f4924dd80c8fd6) /* 350 */, U64_C(0xcf99ca3d754a173a) /* 351 */,
+ U64_C(0xfe477bacaf91bf3c) /* 352 */, U64_C(0xed5371f6d690c12d) /* 353 */,
+ U64_C(0x831a5c285e687094) /* 354 */, U64_C(0xc5d3c90a3708a0a4) /* 355 */,
+ U64_C(0x0f7f903717d06580) /* 356 */, U64_C(0x19f9bb13b8fdf27f) /* 357 */,
+ U64_C(0xb1bd6f1b4d502843) /* 358 */, U64_C(0x1c761ba38fff4012) /* 359 */,
+ U64_C(0x0d1530c4e2e21f3b) /* 360 */, U64_C(0x8943ce69a7372c8a) /* 361 */,
+ U64_C(0xe5184e11feb5ce66) /* 362 */, U64_C(0x618bdb80bd736621) /* 363 */,
+ U64_C(0x7d29bad68b574d0b) /* 364 */, U64_C(0x81bb613e25e6fe5b) /* 365 */,
+ U64_C(0x071c9c10bc07913f) /* 366 */, U64_C(0xc7beeb7909ac2d97) /* 367 */,
+ U64_C(0xc3e58d353bc5d757) /* 368 */, U64_C(0xeb017892f38f61e8) /* 369 */,
+ U64_C(0xd4effb9c9b1cc21a) /* 370 */, U64_C(0x99727d26f494f7ab) /* 371 */,
+ U64_C(0xa3e063a2956b3e03) /* 372 */, U64_C(0x9d4a8b9a4aa09c30) /* 373 */,
+ U64_C(0x3f6ab7d500090fb4) /* 374 */, U64_C(0x9cc0f2a057268ac0) /* 375 */,
+ U64_C(0x3dee9d2dedbf42d1) /* 376 */, U64_C(0x330f49c87960a972) /* 377 */,
+ U64_C(0xc6b2720287421b41) /* 378 */, U64_C(0x0ac59ec07c00369c) /* 379 */,
+ U64_C(0xef4eac49cb353425) /* 380 */, U64_C(0xf450244eef0129d8) /* 381 */,
+ U64_C(0x8acc46e5caf4deb6) /* 382 */, U64_C(0x2ffeab63989263f7) /* 383 */,
+ U64_C(0x8f7cb9fe5d7a4578) /* 384 */, U64_C(0x5bd8f7644e634635) /* 385 */,
+ U64_C(0x427a7315bf2dc900) /* 386 */, U64_C(0x17d0c4aa2125261c) /* 387 */,
+ U64_C(0x3992486c93518e50) /* 388 */, U64_C(0xb4cbfee0a2d7d4c3) /* 389 */,
+ U64_C(0x7c75d6202c5ddd8d) /* 390 */, U64_C(0xdbc295d8e35b6c61) /* 391 */,
+ U64_C(0x60b369d302032b19) /* 392 */, U64_C(0xce42685fdce44132) /* 393 */,
+ U64_C(0x06f3ddb9ddf65610) /* 394 */, U64_C(0x8ea4d21db5e148f0) /* 395 */,
+ U64_C(0x20b0fce62fcd496f) /* 396 */, U64_C(0x2c1b912358b0ee31) /* 397 */,
+ U64_C(0xb28317b818f5a308) /* 398 */, U64_C(0xa89c1e189ca6d2cf) /* 399 */,
+ U64_C(0x0c6b18576aaadbc8) /* 400 */, U64_C(0xb65deaa91299fae3) /* 401 */,
+ U64_C(0xfb2b794b7f1027e7) /* 402 */, U64_C(0x04e4317f443b5beb) /* 403 */,
+ U64_C(0x4b852d325939d0a6) /* 404 */, U64_C(0xd5ae6beefb207ffc) /* 405 */,
+ U64_C(0x309682b281c7d374) /* 406 */, U64_C(0xbae309a194c3b475) /* 407 */,
+ U64_C(0x8cc3f97b13b49f05) /* 408 */, U64_C(0x98a9422ff8293967) /* 409 */,
+ U64_C(0x244b16b01076ff7c) /* 410 */, U64_C(0xf8bf571c663d67ee) /* 411 */,
+ U64_C(0x1f0d6758eee30da1) /* 412 */, U64_C(0xc9b611d97adeb9b7) /* 413 */,
+ U64_C(0xb7afd5887b6c57a2) /* 414 */, U64_C(0x6290ae846b984fe1) /* 415 */,
+ U64_C(0x94df4cdeacc1a5fd) /* 416 */, U64_C(0x058a5bd1c5483aff) /* 417 */,
+ U64_C(0x63166cc142ba3c37) /* 418 */, U64_C(0x8db8526eb2f76f40) /* 419 */,
+ U64_C(0xe10880036f0d6d4e) /* 420 */, U64_C(0x9e0523c9971d311d) /* 421 */,
+ U64_C(0x45ec2824cc7cd691) /* 422 */, U64_C(0x575b8359e62382c9) /* 423 */,
+ U64_C(0xfa9e400dc4889995) /* 424 */, U64_C(0xd1823ecb45721568) /* 425 */,
+ U64_C(0xdafd983b8206082f) /* 426 */, U64_C(0xaa7d29082386a8cb) /* 427 */,
+ U64_C(0x269fcd4403b87588) /* 428 */, U64_C(0x1b91f5f728bdd1e0) /* 429 */,
+ U64_C(0xe4669f39040201f6) /* 430 */, U64_C(0x7a1d7c218cf04ade) /* 431 */,
+ U64_C(0x65623c29d79ce5ce) /* 432 */, U64_C(0x2368449096c00bb1) /* 433 */,
+ U64_C(0xab9bf1879da503ba) /* 434 */, U64_C(0xbc23ecb1a458058e) /* 435 */,
+ U64_C(0x9a58df01bb401ecc) /* 436 */, U64_C(0xa070e868a85f143d) /* 437 */,
+ U64_C(0x4ff188307df2239e) /* 438 */, U64_C(0x14d565b41a641183) /* 439 */,
+ U64_C(0xee13337452701602) /* 440 */, U64_C(0x950e3dcf3f285e09) /* 441 */,
+ U64_C(0x59930254b9c80953) /* 442 */, U64_C(0x3bf299408930da6d) /* 443 */,
+ U64_C(0xa955943f53691387) /* 444 */, U64_C(0xa15edecaa9cb8784) /* 445 */,
+ U64_C(0x29142127352be9a0) /* 446 */, U64_C(0x76f0371fff4e7afb) /* 447 */,
+ U64_C(0x0239f450274f2228) /* 448 */, U64_C(0xbb073af01d5e868b) /* 449 */,
+ U64_C(0xbfc80571c10e96c1) /* 450 */, U64_C(0xd267088568222e23) /* 451 */,
+ U64_C(0x9671a3d48e80b5b0) /* 452 */, U64_C(0x55b5d38ae193bb81) /* 453 */,
+ U64_C(0x693ae2d0a18b04b8) /* 454 */, U64_C(0x5c48b4ecadd5335f) /* 455 */,
+ U64_C(0xfd743b194916a1ca) /* 456 */, U64_C(0x2577018134be98c4) /* 457 */,
+ U64_C(0xe77987e83c54a4ad) /* 458 */, U64_C(0x28e11014da33e1b9) /* 459 */,
+ U64_C(0x270cc59e226aa213) /* 460 */, U64_C(0x71495f756d1a5f60) /* 461 */,
+ U64_C(0x9be853fb60afef77) /* 462 */, U64_C(0xadc786a7f7443dbf) /* 463 */,
+ U64_C(0x0904456173b29a82) /* 464 */, U64_C(0x58bc7a66c232bd5e) /* 465 */,
+ U64_C(0xf306558c673ac8b2) /* 466 */, U64_C(0x41f639c6b6c9772a) /* 467 */,
+ U64_C(0x216defe99fda35da) /* 468 */, U64_C(0x11640cc71c7be615) /* 469 */,
+ U64_C(0x93c43694565c5527) /* 470 */, U64_C(0xea038e6246777839) /* 471 */,
+ U64_C(0xf9abf3ce5a3e2469) /* 472 */, U64_C(0x741e768d0fd312d2) /* 473 */,
+ U64_C(0x0144b883ced652c6) /* 474 */, U64_C(0xc20b5a5ba33f8552) /* 475 */,
+ U64_C(0x1ae69633c3435a9d) /* 476 */, U64_C(0x97a28ca4088cfdec) /* 477 */,
+ U64_C(0x8824a43c1e96f420) /* 478 */, U64_C(0x37612fa66eeea746) /* 479 */,
+ U64_C(0x6b4cb165f9cf0e5a) /* 480 */, U64_C(0x43aa1c06a0abfb4a) /* 481 */,
+ U64_C(0x7f4dc26ff162796b) /* 482 */, U64_C(0x6cbacc8e54ed9b0f) /* 483 */,
+ U64_C(0xa6b7ffefd2bb253e) /* 484 */, U64_C(0x2e25bc95b0a29d4f) /* 485 */,
+ U64_C(0x86d6a58bdef1388c) /* 486 */, U64_C(0xded74ac576b6f054) /* 487 */,
+ U64_C(0x8030bdbc2b45805d) /* 488 */, U64_C(0x3c81af70e94d9289) /* 489 */,
+ U64_C(0x3eff6dda9e3100db) /* 490 */, U64_C(0xb38dc39fdfcc8847) /* 491 */,
+ U64_C(0x123885528d17b87e) /* 492 */, U64_C(0xf2da0ed240b1b642) /* 493 */,
+ U64_C(0x44cefadcd54bf9a9) /* 494 */, U64_C(0x1312200e433c7ee6) /* 495 */,
+ U64_C(0x9ffcc84f3a78c748) /* 496 */, U64_C(0xf0cd1f72248576bb) /* 497 */,
+ U64_C(0xec6974053638cfe4) /* 498 */, U64_C(0x2ba7b67c0cec4e4c) /* 499 */,
+ U64_C(0xac2f4df3e5ce32ed) /* 500 */, U64_C(0xcb33d14326ea4c11) /* 501 */,
+ U64_C(0xa4e9044cc77e58bc) /* 502 */, U64_C(0x5f513293d934fcef) /* 503 */,
+ U64_C(0x5dc9645506e55444) /* 504 */, U64_C(0x50de418f317de40a) /* 505 */,
+ U64_C(0x388cb31a69dde259) /* 506 */, U64_C(0x2db4a83455820a86) /* 507 */,
+ U64_C(0x9010a91e84711ae9) /* 508 */, U64_C(0x4df7f0b7b1498371) /* 509 */,
+ U64_C(0xd62a2eabc0977179) /* 510 */, U64_C(0x22fac097aa8d5c0e) /* 511 */
+};
+static u64 sbox3[256] = {
+ U64_C(0xf49fcc2ff1daf39b) /* 512 */, U64_C(0x487fd5c66ff29281) /* 513 */,
+ U64_C(0xe8a30667fcdca83f) /* 514 */, U64_C(0x2c9b4be3d2fcce63) /* 515 */,
+ U64_C(0xda3ff74b93fbbbc2) /* 516 */, U64_C(0x2fa165d2fe70ba66) /* 517 */,
+ U64_C(0xa103e279970e93d4) /* 518 */, U64_C(0xbecdec77b0e45e71) /* 519 */,
+ U64_C(0xcfb41e723985e497) /* 520 */, U64_C(0xb70aaa025ef75017) /* 521 */,
+ U64_C(0xd42309f03840b8e0) /* 522 */, U64_C(0x8efc1ad035898579) /* 523 */,
+ U64_C(0x96c6920be2b2abc5) /* 524 */, U64_C(0x66af4163375a9172) /* 525 */,
+ U64_C(0x2174abdcca7127fb) /* 526 */, U64_C(0xb33ccea64a72ff41) /* 527 */,
+ U64_C(0xf04a4933083066a5) /* 528 */, U64_C(0x8d970acdd7289af5) /* 529 */,
+ U64_C(0x8f96e8e031c8c25e) /* 530 */, U64_C(0xf3fec02276875d47) /* 531 */,
+ U64_C(0xec7bf310056190dd) /* 532 */, U64_C(0xf5adb0aebb0f1491) /* 533 */,
+ U64_C(0x9b50f8850fd58892) /* 534 */, U64_C(0x4975488358b74de8) /* 535 */,
+ U64_C(0xa3354ff691531c61) /* 536 */, U64_C(0x0702bbe481d2c6ee) /* 537 */,
+ U64_C(0x89fb24057deded98) /* 538 */, U64_C(0xac3075138596e902) /* 539 */,
+ U64_C(0x1d2d3580172772ed) /* 540 */, U64_C(0xeb738fc28e6bc30d) /* 541 */,
+ U64_C(0x5854ef8f63044326) /* 542 */, U64_C(0x9e5c52325add3bbe) /* 543 */,
+ U64_C(0x90aa53cf325c4623) /* 544 */, U64_C(0xc1d24d51349dd067) /* 545 */,
+ U64_C(0x2051cfeea69ea624) /* 546 */, U64_C(0x13220f0a862e7e4f) /* 547 */,
+ U64_C(0xce39399404e04864) /* 548 */, U64_C(0xd9c42ca47086fcb7) /* 549 */,
+ U64_C(0x685ad2238a03e7cc) /* 550 */, U64_C(0x066484b2ab2ff1db) /* 551 */,
+ U64_C(0xfe9d5d70efbf79ec) /* 552 */, U64_C(0x5b13b9dd9c481854) /* 553 */,
+ U64_C(0x15f0d475ed1509ad) /* 554 */, U64_C(0x0bebcd060ec79851) /* 555 */,
+ U64_C(0xd58c6791183ab7f8) /* 556 */, U64_C(0xd1187c5052f3eee4) /* 557 */,
+ U64_C(0xc95d1192e54e82ff) /* 558 */, U64_C(0x86eea14cb9ac6ca2) /* 559 */,
+ U64_C(0x3485beb153677d5d) /* 560 */, U64_C(0xdd191d781f8c492a) /* 561 */,
+ U64_C(0xf60866baa784ebf9) /* 562 */, U64_C(0x518f643ba2d08c74) /* 563 */,
+ U64_C(0x8852e956e1087c22) /* 564 */, U64_C(0xa768cb8dc410ae8d) /* 565 */,
+ U64_C(0x38047726bfec8e1a) /* 566 */, U64_C(0xa67738b4cd3b45aa) /* 567 */,
+ U64_C(0xad16691cec0dde19) /* 568 */, U64_C(0xc6d4319380462e07) /* 569 */,
+ U64_C(0xc5a5876d0ba61938) /* 570 */, U64_C(0x16b9fa1fa58fd840) /* 571 */,
+ U64_C(0x188ab1173ca74f18) /* 572 */, U64_C(0xabda2f98c99c021f) /* 573 */,
+ U64_C(0x3e0580ab134ae816) /* 574 */, U64_C(0x5f3b05b773645abb) /* 575 */,
+ U64_C(0x2501a2be5575f2f6) /* 576 */, U64_C(0x1b2f74004e7e8ba9) /* 577 */,
+ U64_C(0x1cd7580371e8d953) /* 578 */, U64_C(0x7f6ed89562764e30) /* 579 */,
+ U64_C(0xb15926ff596f003d) /* 580 */, U64_C(0x9f65293da8c5d6b9) /* 581 */,
+ U64_C(0x6ecef04dd690f84c) /* 582 */, U64_C(0x4782275fff33af88) /* 583 */,
+ U64_C(0xe41433083f820801) /* 584 */, U64_C(0xfd0dfe409a1af9b5) /* 585 */,
+ U64_C(0x4325a3342cdb396b) /* 586 */, U64_C(0x8ae77e62b301b252) /* 587 */,
+ U64_C(0xc36f9e9f6655615a) /* 588 */, U64_C(0x85455a2d92d32c09) /* 589 */,
+ U64_C(0xf2c7dea949477485) /* 590 */, U64_C(0x63cfb4c133a39eba) /* 591 */,
+ U64_C(0x83b040cc6ebc5462) /* 592 */, U64_C(0x3b9454c8fdb326b0) /* 593 */,
+ U64_C(0x56f56a9e87ffd78c) /* 594 */, U64_C(0x2dc2940d99f42bc6) /* 595 */,
+ U64_C(0x98f7df096b096e2d) /* 596 */, U64_C(0x19a6e01e3ad852bf) /* 597 */,
+ U64_C(0x42a99ccbdbd4b40b) /* 598 */, U64_C(0xa59998af45e9c559) /* 599 */,
+ U64_C(0x366295e807d93186) /* 600 */, U64_C(0x6b48181bfaa1f773) /* 601 */,
+ U64_C(0x1fec57e2157a0a1d) /* 602 */, U64_C(0x4667446af6201ad5) /* 603 */,
+ U64_C(0xe615ebcacfb0f075) /* 604 */, U64_C(0xb8f31f4f68290778) /* 605 */,
+ U64_C(0x22713ed6ce22d11e) /* 606 */, U64_C(0x3057c1a72ec3c93b) /* 607 */,
+ U64_C(0xcb46acc37c3f1f2f) /* 608 */, U64_C(0xdbb893fd02aaf50e) /* 609 */,
+ U64_C(0x331fd92e600b9fcf) /* 610 */, U64_C(0xa498f96148ea3ad6) /* 611 */,
+ U64_C(0xa8d8426e8b6a83ea) /* 612 */, U64_C(0xa089b274b7735cdc) /* 613 */,
+ U64_C(0x87f6b3731e524a11) /* 614 */, U64_C(0x118808e5cbc96749) /* 615 */,
+ U64_C(0x9906e4c7b19bd394) /* 616 */, U64_C(0xafed7f7e9b24a20c) /* 617 */,
+ U64_C(0x6509eadeeb3644a7) /* 618 */, U64_C(0x6c1ef1d3e8ef0ede) /* 619 */,
+ U64_C(0xb9c97d43e9798fb4) /* 620 */, U64_C(0xa2f2d784740c28a3) /* 621 */,
+ U64_C(0x7b8496476197566f) /* 622 */, U64_C(0x7a5be3e6b65f069d) /* 623 */,
+ U64_C(0xf96330ed78be6f10) /* 624 */, U64_C(0xeee60de77a076a15) /* 625 */,
+ U64_C(0x2b4bee4aa08b9bd0) /* 626 */, U64_C(0x6a56a63ec7b8894e) /* 627 */,
+ U64_C(0x02121359ba34fef4) /* 628 */, U64_C(0x4cbf99f8283703fc) /* 629 */,
+ U64_C(0x398071350caf30c8) /* 630 */, U64_C(0xd0a77a89f017687a) /* 631 */,
+ U64_C(0xf1c1a9eb9e423569) /* 632 */, U64_C(0x8c7976282dee8199) /* 633 */,
+ U64_C(0x5d1737a5dd1f7abd) /* 634 */, U64_C(0x4f53433c09a9fa80) /* 635 */,
+ U64_C(0xfa8b0c53df7ca1d9) /* 636 */, U64_C(0x3fd9dcbc886ccb77) /* 637 */,
+ U64_C(0xc040917ca91b4720) /* 638 */, U64_C(0x7dd00142f9d1dcdf) /* 639 */,
+ U64_C(0x8476fc1d4f387b58) /* 640 */, U64_C(0x23f8e7c5f3316503) /* 641 */,
+ U64_C(0x032a2244e7e37339) /* 642 */, U64_C(0x5c87a5d750f5a74b) /* 643 */,
+ U64_C(0x082b4cc43698992e) /* 644 */, U64_C(0xdf917becb858f63c) /* 645 */,
+ U64_C(0x3270b8fc5bf86dda) /* 646 */, U64_C(0x10ae72bb29b5dd76) /* 647 */,
+ U64_C(0x576ac94e7700362b) /* 648 */, U64_C(0x1ad112dac61efb8f) /* 649 */,
+ U64_C(0x691bc30ec5faa427) /* 650 */, U64_C(0xff246311cc327143) /* 651 */,
+ U64_C(0x3142368e30e53206) /* 652 */, U64_C(0x71380e31e02ca396) /* 653 */,
+ U64_C(0x958d5c960aad76f1) /* 654 */, U64_C(0xf8d6f430c16da536) /* 655 */,
+ U64_C(0xc8ffd13f1be7e1d2) /* 656 */, U64_C(0x7578ae66004ddbe1) /* 657 */,
+ U64_C(0x05833f01067be646) /* 658 */, U64_C(0xbb34b5ad3bfe586d) /* 659 */,
+ U64_C(0x095f34c9a12b97f0) /* 660 */, U64_C(0x247ab64525d60ca8) /* 661 */,
+ U64_C(0xdcdbc6f3017477d1) /* 662 */, U64_C(0x4a2e14d4decad24d) /* 663 */,
+ U64_C(0xbdb5e6d9be0a1eeb) /* 664 */, U64_C(0x2a7e70f7794301ab) /* 665 */,
+ U64_C(0xdef42d8a270540fd) /* 666 */, U64_C(0x01078ec0a34c22c1) /* 667 */,
+ U64_C(0xe5de511af4c16387) /* 668 */, U64_C(0x7ebb3a52bd9a330a) /* 669 */,
+ U64_C(0x77697857aa7d6435) /* 670 */, U64_C(0x004e831603ae4c32) /* 671 */,
+ U64_C(0xe7a21020ad78e312) /* 672 */, U64_C(0x9d41a70c6ab420f2) /* 673 */,
+ U64_C(0x28e06c18ea1141e6) /* 674 */, U64_C(0xd2b28cbd984f6b28) /* 675 */,
+ U64_C(0x26b75f6c446e9d83) /* 676 */, U64_C(0xba47568c4d418d7f) /* 677 */,
+ U64_C(0xd80badbfe6183d8e) /* 678 */, U64_C(0x0e206d7f5f166044) /* 679 */,
+ U64_C(0xe258a43911cbca3e) /* 680 */, U64_C(0x723a1746b21dc0bc) /* 681 */,
+ U64_C(0xc7caa854f5d7cdd3) /* 682 */, U64_C(0x7cac32883d261d9c) /* 683 */,
+ U64_C(0x7690c26423ba942c) /* 684 */, U64_C(0x17e55524478042b8) /* 685 */,
+ U64_C(0xe0be477656a2389f) /* 686 */, U64_C(0x4d289b5e67ab2da0) /* 687 */,
+ U64_C(0x44862b9c8fbbfd31) /* 688 */, U64_C(0xb47cc8049d141365) /* 689 */,
+ U64_C(0x822c1b362b91c793) /* 690 */, U64_C(0x4eb14655fb13dfd8) /* 691 */,
+ U64_C(0x1ecbba0714e2a97b) /* 692 */, U64_C(0x6143459d5cde5f14) /* 693 */,
+ U64_C(0x53a8fbf1d5f0ac89) /* 694 */, U64_C(0x97ea04d81c5e5b00) /* 695 */,
+ U64_C(0x622181a8d4fdb3f3) /* 696 */, U64_C(0xe9bcd341572a1208) /* 697 */,
+ U64_C(0x1411258643cce58a) /* 698 */, U64_C(0x9144c5fea4c6e0a4) /* 699 */,
+ U64_C(0x0d33d06565cf620f) /* 700 */, U64_C(0x54a48d489f219ca1) /* 701 */,
+ U64_C(0xc43e5eac6d63c821) /* 702 */, U64_C(0xa9728b3a72770daf) /* 703 */,
+ U64_C(0xd7934e7b20df87ef) /* 704 */, U64_C(0xe35503b61a3e86e5) /* 705 */,
+ U64_C(0xcae321fbc819d504) /* 706 */, U64_C(0x129a50b3ac60bfa6) /* 707 */,
+ U64_C(0xcd5e68ea7e9fb6c3) /* 708 */, U64_C(0xb01c90199483b1c7) /* 709 */,
+ U64_C(0x3de93cd5c295376c) /* 710 */, U64_C(0xaed52edf2ab9ad13) /* 711 */,
+ U64_C(0x2e60f512c0a07884) /* 712 */, U64_C(0xbc3d86a3e36210c9) /* 713 */,
+ U64_C(0x35269d9b163951ce) /* 714 */, U64_C(0x0c7d6e2ad0cdb5fa) /* 715 */,
+ U64_C(0x59e86297d87f5733) /* 716 */, U64_C(0x298ef221898db0e7) /* 717 */,
+ U64_C(0x55000029d1a5aa7e) /* 718 */, U64_C(0x8bc08ae1b5061b45) /* 719 */,
+ U64_C(0xc2c31c2b6c92703a) /* 720 */, U64_C(0x94cc596baf25ef42) /* 721 */,
+ U64_C(0x0a1d73db22540456) /* 722 */, U64_C(0x04b6a0f9d9c4179a) /* 723 */,
+ U64_C(0xeffdafa2ae3d3c60) /* 724 */, U64_C(0xf7c8075bb49496c4) /* 725 */,
+ U64_C(0x9cc5c7141d1cd4e3) /* 726 */, U64_C(0x78bd1638218e5534) /* 727 */,
+ U64_C(0xb2f11568f850246a) /* 728 */, U64_C(0xedfabcfa9502bc29) /* 729 */,
+ U64_C(0x796ce5f2da23051b) /* 730 */, U64_C(0xaae128b0dc93537c) /* 731 */,
+ U64_C(0x3a493da0ee4b29ae) /* 732 */, U64_C(0xb5df6b2c416895d7) /* 733 */,
+ U64_C(0xfcabbd25122d7f37) /* 734 */, U64_C(0x70810b58105dc4b1) /* 735 */,
+ U64_C(0xe10fdd37f7882a90) /* 736 */, U64_C(0x524dcab5518a3f5c) /* 737 */,
+ U64_C(0x3c9e85878451255b) /* 738 */, U64_C(0x4029828119bd34e2) /* 739 */,
+ U64_C(0x74a05b6f5d3ceccb) /* 740 */, U64_C(0xb610021542e13eca) /* 741 */,
+ U64_C(0x0ff979d12f59e2ac) /* 742 */, U64_C(0x6037da27e4f9cc50) /* 743 */,
+ U64_C(0x5e92975a0df1847d) /* 744 */, U64_C(0xd66de190d3e623fe) /* 745 */,
+ U64_C(0x5032d6b87b568048) /* 746 */, U64_C(0x9a36b7ce8235216e) /* 747 */,
+ U64_C(0x80272a7a24f64b4a) /* 748 */, U64_C(0x93efed8b8c6916f7) /* 749 */,
+ U64_C(0x37ddbff44cce1555) /* 750 */, U64_C(0x4b95db5d4b99bd25) /* 751 */,
+ U64_C(0x92d3fda169812fc0) /* 752 */, U64_C(0xfb1a4a9a90660bb6) /* 753 */,
+ U64_C(0x730c196946a4b9b2) /* 754 */, U64_C(0x81e289aa7f49da68) /* 755 */,
+ U64_C(0x64669a0f83b1a05f) /* 756 */, U64_C(0x27b3ff7d9644f48b) /* 757 */,
+ U64_C(0xcc6b615c8db675b3) /* 758 */, U64_C(0x674f20b9bcebbe95) /* 759 */,
+ U64_C(0x6f31238275655982) /* 760 */, U64_C(0x5ae488713e45cf05) /* 761 */,
+ U64_C(0xbf619f9954c21157) /* 762 */, U64_C(0xeabac46040a8eae9) /* 763 */,
+ U64_C(0x454c6fe9f2c0c1cd) /* 764 */, U64_C(0x419cf6496412691c) /* 765 */,
+ U64_C(0xd3dc3bef265b0f70) /* 766 */, U64_C(0x6d0e60f5c3578a9e) /* 767 */
+};
+static u64 sbox4[256] = {
+ U64_C(0x5b0e608526323c55) /* 768 */, U64_C(0x1a46c1a9fa1b59f5) /* 769 */,
+ U64_C(0xa9e245a17c4c8ffa) /* 770 */, U64_C(0x65ca5159db2955d7) /* 771 */,
+ U64_C(0x05db0a76ce35afc2) /* 772 */, U64_C(0x81eac77ea9113d45) /* 773 */,
+ U64_C(0x528ef88ab6ac0a0d) /* 774 */, U64_C(0xa09ea253597be3ff) /* 775 */,
+ U64_C(0x430ddfb3ac48cd56) /* 776 */, U64_C(0xc4b3a67af45ce46f) /* 777 */,
+ U64_C(0x4ececfd8fbe2d05e) /* 778 */, U64_C(0x3ef56f10b39935f0) /* 779 */,
+ U64_C(0x0b22d6829cd619c6) /* 780 */, U64_C(0x17fd460a74df2069) /* 781 */,
+ U64_C(0x6cf8cc8e8510ed40) /* 782 */, U64_C(0xd6c824bf3a6ecaa7) /* 783 */,
+ U64_C(0x61243d581a817049) /* 784 */, U64_C(0x048bacb6bbc163a2) /* 785 */,
+ U64_C(0xd9a38ac27d44cc32) /* 786 */, U64_C(0x7fddff5baaf410ab) /* 787 */,
+ U64_C(0xad6d495aa804824b) /* 788 */, U64_C(0xe1a6a74f2d8c9f94) /* 789 */,
+ U64_C(0xd4f7851235dee8e3) /* 790 */, U64_C(0xfd4b7f886540d893) /* 791 */,
+ U64_C(0x247c20042aa4bfda) /* 792 */, U64_C(0x096ea1c517d1327c) /* 793 */,
+ U64_C(0xd56966b4361a6685) /* 794 */, U64_C(0x277da5c31221057d) /* 795 */,
+ U64_C(0x94d59893a43acff7) /* 796 */, U64_C(0x64f0c51ccdc02281) /* 797 */,
+ U64_C(0x3d33bcc4ff6189db) /* 798 */, U64_C(0xe005cb184ce66af1) /* 799 */,
+ U64_C(0xff5ccd1d1db99bea) /* 800 */, U64_C(0xb0b854a7fe42980f) /* 801 */,
+ U64_C(0x7bd46a6a718d4b9f) /* 802 */, U64_C(0xd10fa8cc22a5fd8c) /* 803 */,
+ U64_C(0xd31484952be4bd31) /* 804 */, U64_C(0xc7fa975fcb243847) /* 805 */,
+ U64_C(0x4886ed1e5846c407) /* 806 */, U64_C(0x28cddb791eb70b04) /* 807 */,
+ U64_C(0xc2b00be2f573417f) /* 808 */, U64_C(0x5c9590452180f877) /* 809 */,
+ U64_C(0x7a6bddfff370eb00) /* 810 */, U64_C(0xce509e38d6d9d6a4) /* 811 */,
+ U64_C(0xebeb0f00647fa702) /* 812 */, U64_C(0x1dcc06cf76606f06) /* 813 */,
+ U64_C(0xe4d9f28ba286ff0a) /* 814 */, U64_C(0xd85a305dc918c262) /* 815 */,
+ U64_C(0x475b1d8732225f54) /* 816 */, U64_C(0x2d4fb51668ccb5fe) /* 817 */,
+ U64_C(0xa679b9d9d72bba20) /* 818 */, U64_C(0x53841c0d912d43a5) /* 819 */,
+ U64_C(0x3b7eaa48bf12a4e8) /* 820 */, U64_C(0x781e0e47f22f1ddf) /* 821 */,
+ U64_C(0xeff20ce60ab50973) /* 822 */, U64_C(0x20d261d19dffb742) /* 823 */,
+ U64_C(0x16a12b03062a2e39) /* 824 */, U64_C(0x1960eb2239650495) /* 825 */,
+ U64_C(0x251c16fed50eb8b8) /* 826 */, U64_C(0x9ac0c330f826016e) /* 827 */,
+ U64_C(0xed152665953e7671) /* 828 */, U64_C(0x02d63194a6369570) /* 829 */,
+ U64_C(0x5074f08394b1c987) /* 830 */, U64_C(0x70ba598c90b25ce1) /* 831 */,
+ U64_C(0x794a15810b9742f6) /* 832 */, U64_C(0x0d5925e9fcaf8c6c) /* 833 */,
+ U64_C(0x3067716cd868744e) /* 834 */, U64_C(0x910ab077e8d7731b) /* 835 */,
+ U64_C(0x6a61bbdb5ac42f61) /* 836 */, U64_C(0x93513efbf0851567) /* 837 */,
+ U64_C(0xf494724b9e83e9d5) /* 838 */, U64_C(0xe887e1985c09648d) /* 839 */,
+ U64_C(0x34b1d3c675370cfd) /* 840 */, U64_C(0xdc35e433bc0d255d) /* 841 */,
+ U64_C(0xd0aab84234131be0) /* 842 */, U64_C(0x08042a50b48b7eaf) /* 843 */,
+ U64_C(0x9997c4ee44a3ab35) /* 844 */, U64_C(0x829a7b49201799d0) /* 845 */,
+ U64_C(0x263b8307b7c54441) /* 846 */, U64_C(0x752f95f4fd6a6ca6) /* 847 */,
+ U64_C(0x927217402c08c6e5) /* 848 */, U64_C(0x2a8ab754a795d9ee) /* 849 */,
+ U64_C(0xa442f7552f72943d) /* 850 */, U64_C(0x2c31334e19781208) /* 851 */,
+ U64_C(0x4fa98d7ceaee6291) /* 852 */, U64_C(0x55c3862f665db309) /* 853 */,
+ U64_C(0xbd0610175d53b1f3) /* 854 */, U64_C(0x46fe6cb840413f27) /* 855 */,
+ U64_C(0x3fe03792df0cfa59) /* 856 */, U64_C(0xcfe700372eb85e8f) /* 857 */,
+ U64_C(0xa7be29e7adbce118) /* 858 */, U64_C(0xe544ee5cde8431dd) /* 859 */,
+ U64_C(0x8a781b1b41f1873e) /* 860 */, U64_C(0xa5c94c78a0d2f0e7) /* 861 */,
+ U64_C(0x39412e2877b60728) /* 862 */, U64_C(0xa1265ef3afc9a62c) /* 863 */,
+ U64_C(0xbcc2770c6a2506c5) /* 864 */, U64_C(0x3ab66dd5dce1ce12) /* 865 */,
+ U64_C(0xe65499d04a675b37) /* 866 */, U64_C(0x7d8f523481bfd216) /* 867 */,
+ U64_C(0x0f6f64fcec15f389) /* 868 */, U64_C(0x74efbe618b5b13c8) /* 869 */,
+ U64_C(0xacdc82b714273e1d) /* 870 */, U64_C(0xdd40bfe003199d17) /* 871 */,
+ U64_C(0x37e99257e7e061f8) /* 872 */, U64_C(0xfa52626904775aaa) /* 873 */,
+ U64_C(0x8bbbf63a463d56f9) /* 874 */, U64_C(0xf0013f1543a26e64) /* 875 */,
+ U64_C(0xa8307e9f879ec898) /* 876 */, U64_C(0xcc4c27a4150177cc) /* 877 */,
+ U64_C(0x1b432f2cca1d3348) /* 878 */, U64_C(0xde1d1f8f9f6fa013) /* 879 */,
+ U64_C(0x606602a047a7ddd6) /* 880 */, U64_C(0xd237ab64cc1cb2c7) /* 881 */,
+ U64_C(0x9b938e7225fcd1d3) /* 882 */, U64_C(0xec4e03708e0ff476) /* 883 */,
+ U64_C(0xfeb2fbda3d03c12d) /* 884 */, U64_C(0xae0bced2ee43889a) /* 885 */,
+ U64_C(0x22cb8923ebfb4f43) /* 886 */, U64_C(0x69360d013cf7396d) /* 887 */,
+ U64_C(0x855e3602d2d4e022) /* 888 */, U64_C(0x073805bad01f784c) /* 889 */,
+ U64_C(0x33e17a133852f546) /* 890 */, U64_C(0xdf4874058ac7b638) /* 891 */,
+ U64_C(0xba92b29c678aa14a) /* 892 */, U64_C(0x0ce89fc76cfaadcd) /* 893 */,
+ U64_C(0x5f9d4e0908339e34) /* 894 */, U64_C(0xf1afe9291f5923b9) /* 895 */,
+ U64_C(0x6e3480f60f4a265f) /* 896 */, U64_C(0xeebf3a2ab29b841c) /* 897 */,
+ U64_C(0xe21938a88f91b4ad) /* 898 */, U64_C(0x57dfeff845c6d3c3) /* 899 */,
+ U64_C(0x2f006b0bf62caaf2) /* 900 */, U64_C(0x62f479ef6f75ee78) /* 901 */,
+ U64_C(0x11a55ad41c8916a9) /* 902 */, U64_C(0xf229d29084fed453) /* 903 */,
+ U64_C(0x42f1c27b16b000e6) /* 904 */, U64_C(0x2b1f76749823c074) /* 905 */,
+ U64_C(0x4b76eca3c2745360) /* 906 */, U64_C(0x8c98f463b91691bd) /* 907 */,
+ U64_C(0x14bcc93cf1ade66a) /* 908 */, U64_C(0x8885213e6d458397) /* 909 */,
+ U64_C(0x8e177df0274d4711) /* 910 */, U64_C(0xb49b73b5503f2951) /* 911 */,
+ U64_C(0x10168168c3f96b6b) /* 912 */, U64_C(0x0e3d963b63cab0ae) /* 913 */,
+ U64_C(0x8dfc4b5655a1db14) /* 914 */, U64_C(0xf789f1356e14de5c) /* 915 */,
+ U64_C(0x683e68af4e51dac1) /* 916 */, U64_C(0xc9a84f9d8d4b0fd9) /* 917 */,
+ U64_C(0x3691e03f52a0f9d1) /* 918 */, U64_C(0x5ed86e46e1878e80) /* 919 */,
+ U64_C(0x3c711a0e99d07150) /* 920 */, U64_C(0x5a0865b20c4e9310) /* 921 */,
+ U64_C(0x56fbfc1fe4f0682e) /* 922 */, U64_C(0xea8d5de3105edf9b) /* 923 */,
+ U64_C(0x71abfdb12379187a) /* 924 */, U64_C(0x2eb99de1bee77b9c) /* 925 */,
+ U64_C(0x21ecc0ea33cf4523) /* 926 */, U64_C(0x59a4d7521805c7a1) /* 927 */,
+ U64_C(0x3896f5eb56ae7c72) /* 928 */, U64_C(0xaa638f3db18f75dc) /* 929 */,
+ U64_C(0x9f39358dabe9808e) /* 930 */, U64_C(0xb7defa91c00b72ac) /* 931 */,
+ U64_C(0x6b5541fd62492d92) /* 932 */, U64_C(0x6dc6dee8f92e4d5b) /* 933 */,
+ U64_C(0x353f57abc4beea7e) /* 934 */, U64_C(0x735769d6da5690ce) /* 935 */,
+ U64_C(0x0a234aa642391484) /* 936 */, U64_C(0xf6f9508028f80d9d) /* 937 */,
+ U64_C(0xb8e319a27ab3f215) /* 938 */, U64_C(0x31ad9c1151341a4d) /* 939 */,
+ U64_C(0x773c22a57bef5805) /* 940 */, U64_C(0x45c7561a07968633) /* 941 */,
+ U64_C(0xf913da9e249dbe36) /* 942 */, U64_C(0xda652d9b78a64c68) /* 943 */,
+ U64_C(0x4c27a97f3bc334ef) /* 944 */, U64_C(0x76621220e66b17f4) /* 945 */,
+ U64_C(0x967743899acd7d0b) /* 946 */, U64_C(0xf3ee5bcae0ed6782) /* 947 */,
+ U64_C(0x409f753600c879fc) /* 948 */, U64_C(0x06d09a39b5926db6) /* 949 */,
+ U64_C(0x6f83aeb0317ac588) /* 950 */, U64_C(0x01e6ca4a86381f21) /* 951 */,
+ U64_C(0x66ff3462d19f3025) /* 952 */, U64_C(0x72207c24ddfd3bfb) /* 953 */,
+ U64_C(0x4af6b6d3e2ece2eb) /* 954 */, U64_C(0x9c994dbec7ea08de) /* 955 */,
+ U64_C(0x49ace597b09a8bc4) /* 956 */, U64_C(0xb38c4766cf0797ba) /* 957 */,
+ U64_C(0x131b9373c57c2a75) /* 958 */, U64_C(0xb1822cce61931e58) /* 959 */,
+ U64_C(0x9d7555b909ba1c0c) /* 960 */, U64_C(0x127fafdd937d11d2) /* 961 */,
+ U64_C(0x29da3badc66d92e4) /* 962 */, U64_C(0xa2c1d57154c2ecbc) /* 963 */,
+ U64_C(0x58c5134d82f6fe24) /* 964 */, U64_C(0x1c3ae3515b62274f) /* 965 */,
+ U64_C(0xe907c82e01cb8126) /* 966 */, U64_C(0xf8ed091913e37fcb) /* 967 */,
+ U64_C(0x3249d8f9c80046c9) /* 968 */, U64_C(0x80cf9bede388fb63) /* 969 */,
+ U64_C(0x1881539a116cf19e) /* 970 */, U64_C(0x5103f3f76bd52457) /* 971 */,
+ U64_C(0x15b7e6f5ae47f7a8) /* 972 */, U64_C(0xdbd7c6ded47e9ccf) /* 973 */,
+ U64_C(0x44e55c410228bb1a) /* 974 */, U64_C(0xb647d4255edb4e99) /* 975 */,
+ U64_C(0x5d11882bb8aafc30) /* 976 */, U64_C(0xf5098bbb29d3212a) /* 977 */,
+ U64_C(0x8fb5ea14e90296b3) /* 978 */, U64_C(0x677b942157dd025a) /* 979 */,
+ U64_C(0xfb58e7c0a390acb5) /* 980 */, U64_C(0x89d3674c83bd4a01) /* 981 */,
+ U64_C(0x9e2da4df4bf3b93b) /* 982 */, U64_C(0xfcc41e328cab4829) /* 983 */,
+ U64_C(0x03f38c96ba582c52) /* 984 */, U64_C(0xcad1bdbd7fd85db2) /* 985 */,
+ U64_C(0xbbb442c16082ae83) /* 986 */, U64_C(0xb95fe86ba5da9ab0) /* 987 */,
+ U64_C(0xb22e04673771a93f) /* 988 */, U64_C(0x845358c9493152d8) /* 989 */,
+ U64_C(0xbe2a488697b4541e) /* 990 */, U64_C(0x95a2dc2dd38e6966) /* 991 */,
+ U64_C(0xc02c11ac923c852b) /* 992 */, U64_C(0x2388b1990df2a87b) /* 993 */,
+ U64_C(0x7c8008fa1b4f37be) /* 994 */, U64_C(0x1f70d0c84d54e503) /* 995 */,
+ U64_C(0x5490adec7ece57d4) /* 996 */, U64_C(0x002b3c27d9063a3a) /* 997 */,
+ U64_C(0x7eaea3848030a2bf) /* 998 */, U64_C(0xc602326ded2003c0) /* 999 */,
+ U64_C(0x83a7287d69a94086) /* 1000 */, U64_C(0xc57a5fcb30f57a8a) /* 1001 */,
+ U64_C(0xb56844e479ebe779) /* 1002 */, U64_C(0xa373b40f05dcbce9) /* 1003 */,
+ U64_C(0xd71a786e88570ee2) /* 1004 */, U64_C(0x879cbacdbde8f6a0) /* 1005 */,
+ U64_C(0x976ad1bcc164a32f) /* 1006 */, U64_C(0xab21e25e9666d78b) /* 1007 */,
+ U64_C(0x901063aae5e5c33c) /* 1008 */, U64_C(0x9818b34448698d90) /* 1009 */,
+ U64_C(0xe36487ae3e1e8abb) /* 1010 */, U64_C(0xafbdf931893bdcb4) /* 1011 */,
+ U64_C(0x6345a0dc5fbbd519) /* 1012 */, U64_C(0x8628fe269b9465ca) /* 1013 */,
+ U64_C(0x1e5d01603f9c51ec) /* 1014 */, U64_C(0x4de44006a15049b7) /* 1015 */,
+ U64_C(0xbf6c70e5f776cbb1) /* 1016 */, U64_C(0x411218f2ef552bed) /* 1017 */,
+ U64_C(0xcb0c0708705a36a3) /* 1018 */, U64_C(0xe74d14754f986044) /* 1019 */,
+ U64_C(0xcd56d9430ea8280e) /* 1020 */, U64_C(0xc12591d7535f5065) /* 1021 */,
+ U64_C(0xc83223f1720aef96) /* 1022 */, U64_C(0xc3a0396f7363a51f) /* 1023 */
+};
+
+static unsigned int
+transform ( void *ctx, const unsigned char *data, size_t nblks );
+
+static void
+do_init (void *context, int variant)
+{
+ TIGER_CONTEXT *hd = context;
+
+ hd->a = 0x0123456789abcdefLL;
+ hd->b = 0xfedcba9876543210LL;
+ hd->c = 0xf096a5b4c3b2e187LL;
+
+ hd->bctx.nblocks = 0;
+ hd->bctx.nblocks_high = 0;
+ hd->bctx.count = 0;
+ hd->bctx.blocksize_shift = _gcry_ctz(64);
+ hd->bctx.bwrite = transform;
+ hd->variant = variant;
+}
+
+static void
+tiger_init (void *context, unsigned int flags)
+{
+ (void)flags;
+
+ do_init (context, 0);
+}
+
+static void
+tiger1_init (void *context, unsigned int flags)
+{
+ (void)flags;
+
+ do_init (context, 1);
+}
+
+static void
+tiger2_init (void *context, unsigned int flags)
+{
+ (void)flags;
+
+ do_init (context, 2);
+}
+
+
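+/* One Tiger round: XOR the message word into c, subtract from a the
+ * S-box lookups indexed by the even bytes of c, add to b the lookups
+ * indexed by the odd bytes, and finally multiply b by the pass
+ * constant (5, 7 or 9). */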
+#define tiger_round(xa, xb, xc, xx, xmul) { \
+ xc ^= xx; \
+ xa -= ( sbox1[ (xc) & 0xff ] ^ sbox2[ ((xc) >> 16) & 0xff ] \
+ ^ sbox3[ ((xc) >> 32) & 0xff ] ^ sbox4[ ((xc) >> 48) & 0xff ]); \
+ xb += ( sbox4[ ((xc) >> 8) & 0xff ] ^ sbox3[ ((xc) >> 24) & 0xff ] \
+ ^ sbox2[ ((xc) >> 40) & 0xff ] ^ sbox1[ ((xc) >> 56) & 0xff ]); \
+ xb *= xmul; }
+
+
+#define pass(ya, yb, yc, yx, ymul) { \
+ tiger_round( ya, yb, yc, yx[0], ymul ); \
+ tiger_round( yb, yc, ya, yx[1], ymul ); \
+ tiger_round( yc, ya, yb, yx[2], ymul ); \
+ tiger_round( ya, yb, yc, yx[3], ymul ); \
+ tiger_round( yb, yc, ya, yx[4], ymul ); \
+ tiger_round( yc, ya, yb, yx[5], ymul ); \
+ tiger_round( ya, yb, yc, yx[6], ymul ); \
+ tiger_round( yb, yc, ya, yx[7], ymul ); }
+
+
+#define key_schedule(x) { \
+ x[0] -= x[7] ^ 0xa5a5a5a5a5a5a5a5LL; \
+ x[1] ^= x[0]; \
+ x[2] += x[1]; \
+ x[3] -= x[2] ^ ((~x[1]) << 19 ); \
+ x[4] ^= x[3]; \
+ x[5] += x[4]; \
+ x[6] -= x[5] ^ ((~x[4]) >> 23 ); \
+ x[7] ^= x[6]; \
+ x[0] += x[7]; \
+ x[1] -= x[0] ^ ((~x[7]) << 19 ); \
+ x[2] ^= x[1]; \
+ x[3] += x[2]; \
+ x[4] -= x[3] ^ ((~x[2]) >> 23 ); \
+ x[5] ^= x[4]; \
+ x[6] += x[5]; \
+ x[7] -= x[6] ^ 0x0123456789abcdefLL; }
+
+
+/****************
+ * Transform the 64-byte message block DATA (eight 64-bit words).
+ */
+static unsigned int
+transform_blk ( void *ctx, const unsigned char *data )
+{
+ TIGER_CONTEXT *hd = ctx;
+ u64 a,b,c,aa,bb,cc;
+ u64 x[8];
+ int i;
+
+ for ( i = 0; i < 8; i++ )
+ x[i] = buf_get_le64(data + i * 8);
+
+ /* save */
+ a = aa = hd->a;
+ b = bb = hd->b;
+ c = cc = hd->c;
+
+ pass( a, b, c, x, 5);
+ key_schedule( x );
+ pass( c, a, b, x, 7);
+ key_schedule( x );
+ pass( b, c, a, x, 9);
+
+ /* feedforward */
+ a ^= aa;
+ b -= bb;
+ c += cc;
+ /* store */
+ hd->a = a;
+ hd->b = b;
+ hd->c = c;
+
+ return /*burn_stack*/ 21*8+11*sizeof(void*);
+}
+
+
+static unsigned int
+transform ( void *c, const unsigned char *data, size_t nblks )
+{
+ unsigned int burn;
+
+ do
+ {
+ burn = transform_blk (c, data);
+ data += 64;
+ }
+ while (--nblks);
+
+ return burn;
+}
+
+
+
+/* This routine finalizes the computation: it pads the message, appends
+ * the bit count and runs the last transform.
+ */
+static void
+tiger_final( void *context )
+{
+ TIGER_CONTEXT *hd = context;
+ u32 t, th, msb, lsb;
+ byte *p;
+ unsigned int burn;
+ byte pad = hd->variant == 2? 0x80 : 0x01;
+
+ t = hd->bctx.nblocks;
+ if (sizeof t == sizeof hd->bctx.nblocks)
+ th = hd->bctx.nblocks_high;
+ else
+ th = hd->bctx.nblocks >> 32;
+
+ /* multiply by 64 to make a byte count */
+ lsb = t << 6;
+ msb = (th << 6) | (t >> 26);
+ /* add the count */
+ t = lsb;
+ if( (lsb += hd->bctx.count) < t )
+ msb++;
+ /* multiply by 8 to make a bit count */
+ t = lsb;
+ lsb <<= 3;
+ msb <<= 3;
+ msb |= t >> 29;
+
+ if( hd->bctx.count < 56 ) /* enough room */
+ {
+ hd->bctx.buf[hd->bctx.count++] = pad;
+ if (hd->bctx.count < 56)
+ memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count);
+
+ /* append the 64 bit count */
+ buf_put_le32(hd->bctx.buf + 56, lsb);
+ buf_put_le32(hd->bctx.buf + 60, msb);
+ burn = transform( hd, hd->bctx.buf, 1 );
+ }
+ else /* need one extra block */
+ {
+ hd->bctx.buf[hd->bctx.count++] = pad; /* pad character */
+ /* fill pad and next block with zeroes */
+ memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56);
+
+ /* append the 64 bit count */
+ buf_put_le32(hd->bctx.buf + 64 + 56, lsb);
+ buf_put_le32(hd->bctx.buf + 64 + 60, msb);
+ burn = transform( hd, hd->bctx.buf, 2 );
+ }
+
+ p = hd->bctx.buf;
+#define X(a) do { buf_put_be64(p, hd->a); p += 8; } while(0)
+#define Y(a) do { buf_put_le64(p, hd->a); p += 8; } while(0)
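+ /* The old variant stores the state words big-endian (as used by GnuPG
+    up to 1.3.2); TIGER1 and TIGER2 store them little-endian. */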
+ if (hd->variant == 0)
+ {
+ X(a);
+ X(b);
+ X(c);
+ }
+ else
+ {
+ Y(a);
+ Y(b);
+ Y(c);
+ }
+#undef X
+#undef Y
+
+ hd->bctx.count = 0;
+
+ _gcry_burn_stack (burn);
+}
+
+static byte *
+tiger_read( void *context )
+{
+ TIGER_CONTEXT *hd = context;
+
+ return hd->bctx.buf;
+}
+
+
+
+/* This is the old TIGER variant based on the unfixed reference
+ implementation. It was used in GnuPG up to 1.3.2. We don't provide
+ an OID anymore because that would not be correct. */
+gcry_md_spec_t _gcry_digest_spec_tiger =
+ {
+ GCRY_MD_TIGER, {0, 0},
+ "TIGER192", NULL, 0, NULL, 24,
+ tiger_init, _gcry_md_block_write, tiger_final, tiger_read, NULL,
+ NULL, NULL,
+ sizeof (TIGER_CONTEXT)
+ };
+
+
+
+/* This is the fixed TIGER implementation. */
+static byte asn1[19] = /* Object ID is 1.3.6.1.4.1.11591.12.2 */
+ { 0x30, 0x29, 0x30, 0x0d, 0x06, 0x09, 0x2b, 0x06,
+ 0x01, 0x04, 0x01, 0xda, 0x47, 0x0c, 0x02,
+ 0x05, 0x00, 0x04, 0x18 };
+
+static gcry_md_oid_spec_t oid_spec_tiger1[] =
+ {
+ /* GNU.digestAlgorithm TIGER */
+ { "1.3.6.1.4.1.11591.12.2" },
+ { NULL }
+ };
+
+gcry_md_spec_t _gcry_digest_spec_tiger1 =
+ {
+ GCRY_MD_TIGER1, {0, 0},
+ "TIGER", asn1, DIM (asn1), oid_spec_tiger1, 24,
+ tiger1_init, _gcry_md_block_write, tiger_final, tiger_read, NULL,
+ NULL, NULL,
+ sizeof (TIGER_CONTEXT)
+ };
+
+
+
+/* This is TIGER2, which uses a different padding algorithm. */
+gcry_md_spec_t _gcry_digest_spec_tiger2 =
+ {
+ GCRY_MD_TIGER2, {0, 0},
+ "TIGER2", NULL, 0, NULL, 24,
+ tiger2_init, _gcry_md_block_write, tiger_final, tiger_read, NULL,
+ NULL, NULL,
+ sizeof (TIGER_CONTEXT)
+ };
diff --git a/comm/third_party/libgcrypt/cipher/twofish-aarch64.S b/comm/third_party/libgcrypt/cipher/twofish-aarch64.S
new file mode 100644
index 0000000000..9f35b5cdeb
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/twofish-aarch64.S
@@ -0,0 +1,321 @@
+/* twofish-aarch64.S - ARMv8/AArch64 assembly implementation of Twofish cipher
+ *
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__)
+#ifdef HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS
+
+.text
+
+/* structure of TWOFISH_context: */
+#define s0 0
+#define s1 ((s0) + 4 * 256)
+#define s2 ((s1) + 4 * 256)
+#define s3 ((s2) + 4 * 256)
+#define w ((s3) + 4 * 256)
+#define k ((w) + 4 * 8)
+
+/* register macros */
+#define CTX x0
+#define RDST x1
+#define RSRC x2
+#define CTXs0 CTX
+#define CTXs1 x3
+#define CTXs2 x4
+#define CTXs3 x5
+#define CTXw x17
+
+#define RA w6
+#define RB w7
+#define RC w8
+#define RD w9
+
+#define RX w10
+#define RY w11
+
+#define xRX x10
+#define xRY x11
+
+#define RMASK w12
+
+#define RT0 w13
+#define RT1 w14
+#define RT2 w15
+#define RT3 w16
+
+#define xRT0 x13
+#define xRT1 x14
+#define xRT2 x15
+#define xRT3 x16
+
+/* helper macros */
+#ifndef __AARCH64EL__
+ /* bswap on big-endian */
+ #define host_to_le(reg) \
+ rev reg, reg;
+ #define le_to_host(reg) \
+ rev reg, reg;
+#else
+ /* nop on little-endian */
+ #define host_to_le(reg) /*_*/
+ #define le_to_host(reg) /*_*/
+#endif
+
+#define ldr_input_aligned_le(rin, a, b, c, d) \
+ ldr a, [rin, #0]; \
+ ldr b, [rin, #4]; \
+ le_to_host(a); \
+ ldr c, [rin, #8]; \
+ le_to_host(b); \
+ ldr d, [rin, #12]; \
+ le_to_host(c); \
+ le_to_host(d);
+
+#define str_output_aligned_le(rout, a, b, c, d) \
+ le_to_host(a); \
+ le_to_host(b); \
+ str a, [rout, #0]; \
+ le_to_host(c); \
+ str b, [rout, #4]; \
+ le_to_host(d); \
+ str c, [rout, #8]; \
+ str d, [rout, #12];
+
+/* unaligned word reads/writes allowed */
+#define ldr_input_le(rin, ra, rb, rc, rd, rtmp) \
+ ldr_input_aligned_le(rin, ra, rb, rc, rd)
+
+#define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
+ str_output_aligned_le(rout, ra, rb, rc, rd)
+
+/**********************************************************************
+ 1-way twofish
+ **********************************************************************/
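+/* encrypt_round computes the two g() S-box lookups of the round,
+ * combines them with the pseudo-Hadamard transform (X + Y and X + 2Y),
+ * adds the two round subkeys from k, and mixes the results into the
+ * other half of the state with the 1-bit rotations of the Twofish
+ * round function. */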
+#define encrypt_round(a, b, rc, rd, n, ror_a, adj_a) \
+ and RT0, RMASK, b, lsr#(8 - 2); \
+ and RY, RMASK, b, lsr#(16 - 2); \
+ and RT1, RMASK, b, lsr#(24 - 2); \
+ ldr RY, [CTXs3, xRY]; \
+ and RT2, RMASK, b, lsl#(2); \
+ ldr RT0, [CTXs2, xRT0]; \
+ and RT3, RMASK, a, lsr#(16 - 2 + (adj_a)); \
+ ldr RT1, [CTXs0, xRT1]; \
+ and RX, RMASK, a, lsr#(8 - 2 + (adj_a)); \
+ ldr RT2, [CTXs1, xRT2]; \
+ ldr RX, [CTXs1, xRX]; \
+ ror_a(a); \
+ \
+ eor RY, RY, RT0; \
+ ldr RT3, [CTXs2, xRT3]; \
+ and RT0, RMASK, a, lsl#(2); \
+ eor RY, RY, RT1; \
+ and RT1, RMASK, a, lsr#(24 - 2); \
+ eor RY, RY, RT2; \
+ ldr RT0, [CTXs0, xRT0]; \
+ eor RX, RX, RT3; \
+ ldr RT1, [CTXs3, xRT1]; \
+ eor RX, RX, RT0; \
+ \
+ ldr RT3, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \
+ eor RX, RX, RT1; \
+ ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \
+ \
+ add RT0, RX, RY, lsl #1; \
+ add RX, RX, RY; \
+ add RT0, RT0, RT3; \
+ add RX, RX, RT2; \
+ eor rd, RT0, rd, ror #31; \
+ eor rc, rc, RX;
+
+#define dummy(x) /*_*/
+
+#define ror1(r) \
+ ror r, r, #1;
+
+#define decrypt_round(a, b, rc, rd, n, ror_b, adj_b) \
+ and RT3, RMASK, b, lsl#(2 - (adj_b)); \
+ and RT1, RMASK, b, lsr#(8 - 2 + (adj_b)); \
+ ror_b(b); \
+ and RT2, RMASK, a, lsl#(2); \
+ and RT0, RMASK, a, lsr#(8 - 2); \
+ \
+ ldr RY, [CTXs1, xRT3]; \
+ ldr RX, [CTXs0, xRT2]; \
+ and RT3, RMASK, b, lsr#(16 - 2); \
+ ldr RT1, [CTXs2, xRT1]; \
+ and RT2, RMASK, a, lsr#(16 - 2); \
+ ldr RT0, [CTXs1, xRT0]; \
+ \
+ ldr RT3, [CTXs3, xRT3]; \
+ eor RY, RY, RT1; \
+ \
+ and RT1, RMASK, b, lsr#(24 - 2); \
+ eor RX, RX, RT0; \
+ ldr RT2, [CTXs2, xRT2]; \
+ and RT0, RMASK, a, lsr#(24 - 2); \
+ \
+ ldr RT1, [CTXs0, xRT1]; \
+ \
+ eor RY, RY, RT3; \
+ ldr RT0, [CTXs3, xRT0]; \
+ eor RX, RX, RT2; \
+ eor RY, RY, RT1; \
+ \
+ ldr RT1, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \
+ eor RX, RX, RT0; \
+ ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \
+ \
+ add RT0, RX, RY, lsl #1; \
+ add RX, RX, RY; \
+ add RT0, RT0, RT1; \
+ add RX, RX, RT2; \
+ eor rd, rd, RT0; \
+ eor rc, RX, rc, ror #31;
+
+#define first_encrypt_cycle(nc) \
+ encrypt_round(RA, RB, RC, RD, (nc) * 2, dummy, 0); \
+ encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);
+
+#define encrypt_cycle(nc) \
+ encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+ encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);
+
+#define last_encrypt_cycle(nc) \
+ encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+ encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+ ror1(RA);
+
+#define first_decrypt_cycle(nc) \
+ decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, dummy, 0); \
+ decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);
+
+#define decrypt_cycle(nc) \
+ decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+ decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);
+
+#define last_decrypt_cycle(nc) \
+ decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+ decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+ ror1(RD);
+
+.globl _gcry_twofish_arm_encrypt_block
+ELF(.type _gcry_twofish_arm_encrypt_block,%function;)
+
+_gcry_twofish_arm_encrypt_block:
+ /* input:
+ * x0: ctx
+ * x1: dst
+ * x2: src
+ */
+ CFI_STARTPROC();
+
+ add CTXw, CTX, #(w);
+
+ ldr_input_le(RSRC, RA, RB, RC, RD, RT0);
+
+ /* Input whitening */
+ ldp RT0, RT1, [CTXw, #(0*8)];
+ ldp RT2, RT3, [CTXw, #(1*8)];
+ add CTXs3, CTX, #(s3);
+ add CTXs2, CTX, #(s2);
+ add CTXs1, CTX, #(s1);
+ mov RMASK, #(0xff << 2);
+ eor RA, RA, RT0;
+ eor RB, RB, RT1;
+ eor RC, RC, RT2;
+ eor RD, RD, RT3;
+
+ first_encrypt_cycle(0);
+ encrypt_cycle(1);
+ encrypt_cycle(2);
+ encrypt_cycle(3);
+ encrypt_cycle(4);
+ encrypt_cycle(5);
+ encrypt_cycle(6);
+ last_encrypt_cycle(7);
+
+ /* Output whitening */
+ ldp RT0, RT1, [CTXw, #(2*8)];
+ ldp RT2, RT3, [CTXw, #(3*8)];
+ eor RC, RC, RT0;
+ eor RD, RD, RT1;
+ eor RA, RA, RT2;
+ eor RB, RB, RT3;
+
+ str_output_le(RDST, RC, RD, RA, RB, RT0, RT1);
+
+ ret;
+ CFI_ENDPROC();
+.ltorg
+ELF(.size _gcry_twofish_arm_encrypt_block,.-_gcry_twofish_arm_encrypt_block;)
+
+.globl _gcry_twofish_arm_decrypt_block
+ELF(.type _gcry_twofish_arm_decrypt_block,%function;)
+
+_gcry_twofish_arm_decrypt_block:
+ /* input:
+ * x0: ctx
+ * x1: dst
+ * x2: src
+ */
+ CFI_STARTPROC();
+
+ add CTXw, CTX, #(w);
+
+ ldr_input_le(RSRC, RC, RD, RA, RB, RT0);
+
+ /* Input whitening */
+ ldp RT0, RT1, [CTXw, #(2*8)];
+ ldp RT2, RT3, [CTXw, #(3*8)];
+ add CTXs3, CTX, #(s3);
+ add CTXs2, CTX, #(s2);
+ add CTXs1, CTX, #(s1);
+ mov RMASK, #(0xff << 2);
+ eor RC, RC, RT0;
+ eor RD, RD, RT1;
+ eor RA, RA, RT2;
+ eor RB, RB, RT3;
+
+ first_decrypt_cycle(7);
+ decrypt_cycle(6);
+ decrypt_cycle(5);
+ decrypt_cycle(4);
+ decrypt_cycle(3);
+ decrypt_cycle(2);
+ decrypt_cycle(1);
+ last_decrypt_cycle(0);
+
+ /* Output whitening */
+ ldp RT0, RT1, [CTXw, #(0*8)];
+ ldp RT2, RT3, [CTXw, #(1*8)];
+ eor RA, RA, RT0;
+ eor RB, RB, RT1;
+ eor RC, RC, RT2;
+ eor RD, RD, RT3;
+
+ str_output_le(RDST, RA, RB, RC, RD, RT0, RT1);
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_twofish_arm_decrypt_block,.-_gcry_twofish_arm_decrypt_block;)
+
+#endif /*HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS*/
+#endif /*__AARCH64EL__*/
diff --git a/comm/third_party/libgcrypt/cipher/twofish-amd64.S b/comm/third_party/libgcrypt/cipher/twofish-amd64.S
new file mode 100644
index 0000000000..3cb734317d
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/twofish-amd64.S
@@ -0,0 +1,1184 @@
+/* twofish-amd64.S - AMD64 assembly implementation of Twofish cipher
+ *
+ * Copyright (C) 2013-2015 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_TWOFISH)
+
+#include "asm-common-amd64.h"
+
+.text
+
+/* structure of TWOFISH_context: */
+#define s0 0
+#define s1 ((s0) + 4 * 256)
+#define s2 ((s1) + 4 * 256)
+#define s3 ((s2) + 4 * 256)
+#define w ((s3) + 4 * 256)
+#define k ((w) + 4 * 8)
+
+/* register macros */
+#define CTX %rdi
+
+#define RA %rax
+#define RB %rbx
+#define RC %rcx
+#define RD %rdx
+
+#define RAd %eax
+#define RBd %ebx
+#define RCd %ecx
+#define RDd %edx
+
+#define RAbl %al
+#define RBbl %bl
+#define RCbl %cl
+#define RDbl %dl
+
+#define RAbh %ah
+#define RBbh %bh
+#define RCbh %ch
+#define RDbh %dh
+
+#define RX %r8
+#define RY %r9
+
+#define RXd %r8d
+#define RYd %r9d
+
+#define RT0 %rsi
+#define RT1 %rbp
+#define RT2 %r10
+#define RT3 %r11
+
+#define RT0d %esi
+#define RT1d %ebp
+#define RT2d %r10d
+#define RT3d %r11d
+
+/***********************************************************************
+ * AMD64 assembly implementation of the Twofish cipher
+ ***********************************************************************/
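+/* enc_g1_2/dec_g1_2 perform the g() S-box lookups for both state words
+ * of a round via byte extraction and 16-bit rotates; encrypt_round and
+ * decrypt_round then apply the pseudo-Hadamard transform, add the round
+ * subkeys from k, and mix the results into the other two words with
+ * 1-bit rotates. */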
+#define enc_g1_2(a, b, x, y) \
+ movzbl b ## bl, RT3d; \
+ movzbl b ## bh, RT1d; \
+ movzbl a ## bl, RT2d; \
+ movzbl a ## bh, RT0d; \
+ rorl $16, b ## d; \
+ rorl $16, a ## d; \
+ movl s1(CTX, RT3, 4), RYd; \
+ movzbl b ## bl, RT3d; \
+ movl s0(CTX, RT2, 4), RXd; \
+ movzbl a ## bl, RT2d; \
+ xorl s2(CTX, RT1, 4), RYd; \
+ movzbl b ## bh, RT1d; \
+ xorl s1(CTX, RT0, 4), RXd; \
+ movzbl a ## bh, RT0d; \
+ rorl $16, b ## d; \
+ rorl $16, a ## d; \
+ xorl s3(CTX, RT3, 4), RYd; \
+ xorl s2(CTX, RT2, 4), RXd; \
+ xorl s0(CTX, RT1, 4), RYd; \
+ xorl s3(CTX, RT0, 4), RXd;
+
+#define dec_g1_2(a, b, x, y) \
+ movzbl a ## bl, RT2d; \
+ movzbl a ## bh, RT0d; \
+ movzbl b ## bl, RT3d; \
+ movzbl b ## bh, RT1d; \
+ rorl $16, a ## d; \
+ rorl $16, b ## d; \
+ movl s0(CTX, RT2, 4), RXd; \
+ movzbl a ## bl, RT2d; \
+ movl s1(CTX, RT3, 4), RYd; \
+ movzbl b ## bl, RT3d; \
+ xorl s1(CTX, RT0, 4), RXd; \
+ movzbl a ## bh, RT0d; \
+ xorl s2(CTX, RT1, 4), RYd; \
+ movzbl b ## bh, RT1d; \
+ rorl $16, a ## d; \
+ rorl $16, b ## d; \
+ xorl s2(CTX, RT2, 4), RXd; \
+ xorl s3(CTX, RT3, 4), RYd; \
+ xorl s3(CTX, RT0, 4), RXd; \
+ xorl s0(CTX, RT1, 4), RYd;
+
+#define encrypt_round(ra, rb, rc, rd, n) \
+ enc_g1_2(##ra, ##rb, RX, RY); \
+ \
+ leal (RXd, RYd, 2), RT0d; \
+ addl RYd, RXd; \
+ addl (k + 8 * (n) + 4)(CTX), RT0d; \
+ roll $1, rd ## d; \
+ addl (k + 8 * (n))(CTX), RXd; \
+ xorl RT0d, rd ## d; \
+ xorl RXd, rc ## d; \
+ rorl $1, rc ## d;
+
+#define decrypt_round(ra, rb, rc, rd, n) \
+ dec_g1_2(##ra, ##rb, RX, RY); \
+ \
+ leal (RXd, RYd, 2), RT0d; \
+ addl RYd, RXd; \
+ addl (k + 8 * (n) + 4)(CTX), RT0d; \
+ roll $1, rc ## d; \
+ addl (k + 8 * (n))(CTX), RXd; \
+ xorl RXd, rc ## d; \
+ xorl RT0d, rd ## d; \
+ rorl $1, rd ## d;
+
+#define encrypt_cycle(a, b, c, d, nc) \
+ encrypt_round(##a, ##b, ##c, ##d, (nc) * 2); \
+ encrypt_round(##c, ##d, ##a, ##b, (nc) * 2 + 1);
+
+#define decrypt_cycle(a, b, c, d, nc) \
+ decrypt_round(##c, ##d, ##a, ##b, (nc) * 2 + 1); \
+ decrypt_round(##a, ##b, ##c, ##d, (nc) * 2);
+
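+/* inpack/outunpack load or store one 32-bit word of the block and XOR it
+ * with the corresponding whitening subkey from w. */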
+#define inpack(in, n, x, m) \
+ movl (4 * (n))(in), x; \
+ xorl (w + 4 * (m))(CTX), x;
+
+#define outunpack(out, n, x, m) \
+ xorl (w + 4 * (m))(CTX), x; \
+ movl x, (4 * (n))(out);
+
+.align 8
+.globl _gcry_twofish_amd64_encrypt_block
+ELF(.type _gcry_twofish_amd64_encrypt_block,@function;)
+
+_gcry_twofish_amd64_encrypt_block:
+ /* input:
+ * %rdi: context, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ subq $(3 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(3 * 8);
+ movq %rsi, (0 * 8)(%rsp);
+ movq %rbp, (1 * 8)(%rsp);
+ movq %rbx, (2 * 8)(%rsp);
+ CFI_REL_OFFSET(%rbp, 1 * 8);
+ CFI_REL_OFFSET(%rbx, 2 * 8);
+
+ movq %rdx, RX;
+ inpack(RX, 0, RAd, 0);
+ inpack(RX, 1, RBd, 1);
+ inpack(RX, 2, RCd, 2);
+ inpack(RX, 3, RDd, 3);
+
+ encrypt_cycle(RA, RB, RC, RD, 0);
+ encrypt_cycle(RA, RB, RC, RD, 1);
+ encrypt_cycle(RA, RB, RC, RD, 2);
+ encrypt_cycle(RA, RB, RC, RD, 3);
+ encrypt_cycle(RA, RB, RC, RD, 4);
+ encrypt_cycle(RA, RB, RC, RD, 5);
+ encrypt_cycle(RA, RB, RC, RD, 6);
+ encrypt_cycle(RA, RB, RC, RD, 7);
+
+ movq (0 * 8)(%rsp), RX; /*dst*/
+ outunpack(RX, 0, RCd, 4);
+ outunpack(RX, 1, RDd, 5);
+ outunpack(RX, 2, RAd, 6);
+ outunpack(RX, 3, RBd, 7);
+
+ movq (2 * 8)(%rsp), %rbx;
+ movq (1 * 8)(%rsp), %rbp;
+ CFI_RESTORE(%rbx);
+ CFI_RESTORE(%rbp);
+ addq $(3 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-3 * 8);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;)
+
+.align 8
+.globl _gcry_twofish_amd64_decrypt_block
+ELF(.type _gcry_twofish_amd64_decrypt_block,@function;)
+
+_gcry_twofish_amd64_decrypt_block:
+ /* input:
+ * %rdi: context, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ subq $(3 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(3 * 8);
+ movq %rsi, (0 * 8)(%rsp);
+ movq %rbp, (1 * 8)(%rsp);
+ movq %rbx, (2 * 8)(%rsp);
+ CFI_REL_OFFSET(%rbp, 1 * 8);
+ CFI_REL_OFFSET(%rbx, 2 * 8);
+
+ movq %rdx, RX;
+ inpack(RX, 0, RCd, 4);
+ inpack(RX, 1, RDd, 5);
+ inpack(RX, 2, RAd, 6);
+ inpack(RX, 3, RBd, 7);
+
+ decrypt_cycle(RA, RB, RC, RD, 7);
+ decrypt_cycle(RA, RB, RC, RD, 6);
+ decrypt_cycle(RA, RB, RC, RD, 5);
+ decrypt_cycle(RA, RB, RC, RD, 4);
+ decrypt_cycle(RA, RB, RC, RD, 3);
+ decrypt_cycle(RA, RB, RC, RD, 2);
+ decrypt_cycle(RA, RB, RC, RD, 1);
+ decrypt_cycle(RA, RB, RC, RD, 0);
+
+ movq (0 * 8)(%rsp), RX; /*dst*/
+ outunpack(RX, 0, RAd, 0);
+ outunpack(RX, 1, RBd, 1);
+ outunpack(RX, 2, RCd, 2);
+ outunpack(RX, 3, RDd, 3);
+
+ movq (2 * 8)(%rsp), %rbx;
+ movq (1 * 8)(%rsp), %rbp;
+ CFI_RESTORE(%rbx);
+ CFI_RESTORE(%rbp);
+ addq $(3 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-3 * 8);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_twofish_amd64_decrypt_block,.-_gcry_twofish_amd64_decrypt_block;)
+
+#undef CTX
+
+#undef RA
+#undef RB
+#undef RC
+#undef RD
+
+#undef RAd
+#undef RBd
+#undef RCd
+#undef RDd
+
+#undef RAbl
+#undef RBbl
+#undef RCbl
+#undef RDbl
+
+#undef RAbh
+#undef RBbh
+#undef RCbh
+#undef RDbh
+
+#undef RX
+#undef RY
+
+#undef RXd
+#undef RYd
+
+#undef RT0
+#undef RT1
+#undef RT2
+#undef RT3
+
+#undef RT0d
+#undef RT1d
+#undef RT2d
+#undef RT3d
+
+/***********************************************************************
+ * AMD64 assembly implementation of the Twofish cipher, 3-way parallel
+ ***********************************************************************/
+#define CTX %rdi
+#define RIO %rdx
+
+#define RAB0 %rax
+#define RAB1 %rbx
+#define RAB2 %rcx
+
+#define RAB0d %eax
+#define RAB1d %ebx
+#define RAB2d %ecx
+
+#define RAB0bh %ah
+#define RAB1bh %bh
+#define RAB2bh %ch
+
+#define RAB0bl %al
+#define RAB1bl %bl
+#define RAB2bl %cl
+
+#define RCD0 %r8
+#define RCD1 %r9
+#define RCD2 %r10
+
+#define RCD0d %r8d
+#define RCD1d %r9d
+#define RCD2d %r10d
+
+#define RX0 %rbp
+#define RX1 %r11
+#define RX2 %r12
+
+#define RX0d %ebp
+#define RX1d %r11d
+#define RX2d %r12d
+
+#define RY0 %r13
+#define RY1 %r14
+#define RY2 %r15
+
+#define RY0d %r13d
+#define RY1d %r14d
+#define RY2d %r15d
+
+#define RT0 %rdx
+#define RT1 %rsi
+
+#define RT0d %edx
+#define RT1d %esi
+
+#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
+ movzbl ab ## bl, tmp2 ## d; \
+ movzbl ab ## bh, tmp1 ## d; \
+ rorq $(rot), ab; \
+ op1##l T0(CTX, tmp2, 4), dst ## d; \
+ op2##l T1(CTX, tmp1, 4), dst ## d;
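+
+/*
+ * do16bit_ror: look up two s-box entries for the low two bytes of 'ab'
+ * (op1/op2 select mov vs. xor into 'dst'), then rotate 'ab' right by 'rot'
+ * bits so that the next invocation sees the following byte pair.
+ */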
+
+/*
+ * Combined G1 & G2 function. Reordered with the help of rotates so that
+ * the moves come at the beginning.
+ */
+#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
+ /* G1,1 && G2,1 */ \
+ do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
+ do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
+ \
+ do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
+ do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
+ \
+ do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
+ do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
+ \
+ /* G1,2 && G2,2 */ \
+ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
+ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
+ movq ab ## 0, RT0; \
+ movq cd ## 0, ab ## 0; \
+ movq RT0, cd ## 0; \
+ \
+ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
+ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
+ movq ab ## 1, RT0; \
+ movq cd ## 1, ab ## 1; \
+ movq RT0, cd ## 1; \
+ \
+ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
+ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
+ movq ab ## 2, RT0; \
+ movq cd ## 2, ab ## 2; \
+ movq RT0, cd ## 2;
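+
+/*
+ * The trailing movq triplets swap the ab and cd registers of each block:
+ * the old c/d words end up in ab, where the *_round_end macros modify
+ * them, and the old a/b words move to cd for the next round.
+ */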
+
+#define enc_round_end(ab, x, y, n) \
+ addl y ## d, x ## d; \
+ addl x ## d, y ## d; \
+ addl k+4*(2*(n))(CTX), x ## d; \
+ xorl ab ## d, x ## d; \
+ addl k+4*(2*(n)+1)(CTX), y ## d; \
+ shrq $32, ab; \
+ roll $1, ab ## d; \
+ xorl y ## d, ab ## d; \
+ shlq $32, ab; \
+ rorl $1, x ## d; \
+ orq x, ab;
+
+#define dec_round_end(ba, x, y, n) \
+ addl y ## d, x ## d; \
+ addl x ## d, y ## d; \
+ addl k+4*(2*(n))(CTX), x ## d; \
+ addl k+4*(2*(n)+1)(CTX), y ## d; \
+ xorl ba ## d, y ## d; \
+ shrq $32, ba; \
+ roll $1, ba ## d; \
+ xorl x ## d, ba ## d; \
+ shlq $32, ba; \
+ rorl $1, y ## d; \
+ orq y, ba;
+
+#define encrypt_round3(ab, cd, n) \
+ g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
+ \
+ enc_round_end(ab ## 0, RX0, RY0, n); \
+ enc_round_end(ab ## 1, RX1, RY1, n); \
+ enc_round_end(ab ## 2, RX2, RY2, n);
+
+#define decrypt_round3(ba, dc, n) \
+ g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
+ \
+ dec_round_end(ba ## 0, RX0, RY0, n); \
+ dec_round_end(ba ## 1, RX1, RY1, n); \
+ dec_round_end(ba ## 2, RX2, RY2, n);
+
+#define encrypt_cycle3(ab, cd, n) \
+ encrypt_round3(ab, cd, n*2); \
+ encrypt_round3(ab, cd, (n*2)+1);
+
+#define decrypt_cycle3(ba, dc, n) \
+ decrypt_round3(ba, dc, (n*2)+1); \
+ decrypt_round3(ba, dc, (n*2));
+
+#define inpack3(xy, m) \
+ xorq w+4*m(CTX), xy ## 0; \
+ xorq w+4*m(CTX), xy ## 1; \
+ xorq w+4*m(CTX), xy ## 2;
+
+#define outunpack3(xy, m) \
+ xorq w+4*m(CTX), xy ## 0; \
+ xorq w+4*m(CTX), xy ## 1; \
+ xorq w+4*m(CTX), xy ## 2;
+
+#define inpack_enc3() \
+ inpack3(RAB, 0); \
+ inpack3(RCD, 2);
+
+#define outunpack_enc3() \
+ outunpack3(RAB, 6); \
+ outunpack3(RCD, 4);
+
+#define inpack_dec3() \
+ inpack3(RAB, 4); \
+ rorq $32, RAB0; \
+ rorq $32, RAB1; \
+ rorq $32, RAB2; \
+ inpack3(RCD, 6); \
+ rorq $32, RCD0; \
+ rorq $32, RCD1; \
+ rorq $32, RCD2;
+
+#define outunpack_dec3() \
+ rorq $32, RCD0; \
+ rorq $32, RCD1; \
+ rorq $32, RCD2; \
+ outunpack3(RCD, 0); \
+ rorq $32, RAB0; \
+ rorq $32, RAB1; \
+ rorq $32, RAB2; \
+ outunpack3(RAB, 2);
+
+.align 8
+ELF(.type __twofish_enc_blk3,@function;)
+
+__twofish_enc_blk3:
+ /* input:
+ * %rdi: ctx, CTX
+ * RAB0,RCD0,RAB1,RCD1,RAB2,RCD2: three plaintext blocks
+ * output:
+ * RCD0,RAB0,RCD1,RAB1,RCD2,RAB2: three ciphertext blocks
+ */
+ CFI_STARTPROC();
+
+ inpack_enc3();
+
+ encrypt_cycle3(RAB, RCD, 0);
+ encrypt_cycle3(RAB, RCD, 1);
+ encrypt_cycle3(RAB, RCD, 2);
+ encrypt_cycle3(RAB, RCD, 3);
+ encrypt_cycle3(RAB, RCD, 4);
+ encrypt_cycle3(RAB, RCD, 5);
+ encrypt_cycle3(RAB, RCD, 6);
+ encrypt_cycle3(RAB, RCD, 7);
+
+ outunpack_enc3();
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size __twofish_enc_blk3,.-__twofish_enc_blk3;)
+
+.align 8
+ELF(.type __twofish_dec_blk3,@function;)
+
+__twofish_dec_blk3:
+ /* input:
+ * %rdi: ctx, CTX
+ * RAB0,RCD0,RAB1,RCD1,RAB2,RCD2: three ciphertext blocks
+ * output:
+ * RCD0,RAB0,RCD1,RAB1,RCD2,RAB2: three plaintext blocks
+ */
+ CFI_STARTPROC();
+
+ inpack_dec3();
+
+ decrypt_cycle3(RAB, RCD, 7);
+ decrypt_cycle3(RAB, RCD, 6);
+ decrypt_cycle3(RAB, RCD, 5);
+ decrypt_cycle3(RAB, RCD, 4);
+ decrypt_cycle3(RAB, RCD, 3);
+ decrypt_cycle3(RAB, RCD, 2);
+ decrypt_cycle3(RAB, RCD, 1);
+ decrypt_cycle3(RAB, RCD, 0);
+
+ outunpack_dec3();
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size __twofish_dec_blk3,.-__twofish_dec_blk3;)
+
+.align 8
+.globl _gcry_twofish_amd64_ctr_enc
+ELF(.type _gcry_twofish_amd64_ctr_enc,@function;)
+_gcry_twofish_amd64_ctr_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (3 blocks)
+ * %rdx: src (3 blocks)
+ * %rcx: iv (big endian, 128bit)
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ subq $(8 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(8 * 8);
+ movq %rbp, (0 * 8)(%rsp);
+ movq %rbx, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ movq %r14, (4 * 8)(%rsp);
+ movq %r15, (5 * 8)(%rsp);
+ CFI_REL_OFFSET(%rbp, 0 * 8);
+ CFI_REL_OFFSET(%rbx, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+ CFI_REL_OFFSET(%r14, 4 * 8);
+ CFI_REL_OFFSET(%r15, 5 * 8);
+
+ movq %rsi, (6 * 8)(%rsp);
+ movq %rdx, (7 * 8)(%rsp);
+ movq %rcx, RX0;
+
+ /* load IV and byteswap */
+ movq 8(RX0), RT0;
+ movq 0(RX0), RT1;
+ movq RT0, RCD0;
+ movq RT1, RAB0;
+ bswapq RT0;
+ bswapq RT1;
+
+ /* construct IVs */
+ movq RT0, RCD1;
+ movq RT1, RAB1;
+ movq RT0, RCD2;
+ movq RT1, RAB2;
+ addq $1, RCD1;
+ adcq $0, RAB1;
+ bswapq RCD1;
+ bswapq RAB1;
+ addq $2, RCD2;
+ adcq $0, RAB2;
+ bswapq RCD2;
+ bswapq RAB2;
+ addq $3, RT0;
+ adcq $0, RT1;
+ bswapq RT0;
+ bswapq RT1;
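+
+	/* RAB1:RCD1 and RAB2:RCD2 now hold counter+1 and counter+2 (back in
+	 * big-endian byte order); RT1:RT0 is counter+3 and is written out as
+	 * the new IV below. */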
+
+ /* store new IV */
+ movq RT0, 8(RX0);
+ movq RT1, 0(RX0);
+
+ call __twofish_enc_blk3;
+
+ movq (7 * 8)(%rsp), RX0; /*src*/
+ movq (6 * 8)(%rsp), RX1; /*dst*/
+
+ /* XOR key-stream with plaintext */
+ xorq (0 * 8)(RX0), RCD0;
+ xorq (1 * 8)(RX0), RAB0;
+ xorq (2 * 8)(RX0), RCD1;
+ xorq (3 * 8)(RX0), RAB1;
+ xorq (4 * 8)(RX0), RCD2;
+ xorq (5 * 8)(RX0), RAB2;
+ movq RCD0, (0 * 8)(RX1);
+ movq RAB0, (1 * 8)(RX1);
+ movq RCD1, (2 * 8)(RX1);
+ movq RAB1, (3 * 8)(RX1);
+ movq RCD2, (4 * 8)(RX1);
+ movq RAB2, (5 * 8)(RX1);
+
+ movq (0 * 8)(%rsp), %rbp;
+ movq (1 * 8)(%rsp), %rbx;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ movq (4 * 8)(%rsp), %r14;
+ movq (5 * 8)(%rsp), %r15;
+ CFI_RESTORE(%rbp);
+ CFI_RESTORE(%rbx);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+ CFI_RESTORE(%r14);
+ CFI_RESTORE(%r15);
+ addq $(8 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-8 * 8);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_twofish_amd64_ctr_enc,.-_gcry_twofish_amd64_ctr_enc;)
+
+.align 8
+.globl _gcry_twofish_amd64_cbc_dec
+ELF(.type _gcry_twofish_amd64_cbc_dec,@function;)
+_gcry_twofish_amd64_cbc_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (3 blocks)
+ * %rdx: src (3 blocks)
+ * %rcx: iv (128bit)
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ subq $(9 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(9 * 8);
+ movq %rbp, (0 * 8)(%rsp);
+ movq %rbx, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ movq %r14, (4 * 8)(%rsp);
+ movq %r15, (5 * 8)(%rsp);
+ CFI_REL_OFFSET(%rbp, 0 * 8);
+ CFI_REL_OFFSET(%rbx, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+ CFI_REL_OFFSET(%r14, 4 * 8);
+ CFI_REL_OFFSET(%r15, 5 * 8);
+
+ movq %rsi, (6 * 8)(%rsp);
+ movq %rdx, (7 * 8)(%rsp);
+ movq %rcx, (8 * 8)(%rsp);
+ movq %rdx, RX0;
+
+ /* load input */
+ movq (0 * 8)(RX0), RAB0;
+ movq (1 * 8)(RX0), RCD0;
+ movq (2 * 8)(RX0), RAB1;
+ movq (3 * 8)(RX0), RCD1;
+ movq (4 * 8)(RX0), RAB2;
+ movq (5 * 8)(RX0), RCD2;
+
+ call __twofish_dec_blk3;
+
+ movq (8 * 8)(%rsp), RT0; /*iv*/
+ movq (7 * 8)(%rsp), RX0; /*src*/
+ movq (6 * 8)(%rsp), RX1; /*dst*/
+
+ movq (4 * 8)(RX0), RY0;
+ movq (5 * 8)(RX0), RY1;
+ xorq (0 * 8)(RT0), RCD0;
+ xorq (1 * 8)(RT0), RAB0;
+ xorq (0 * 8)(RX0), RCD1;
+ xorq (1 * 8)(RX0), RAB1;
+ xorq (2 * 8)(RX0), RCD2;
+ xorq (3 * 8)(RX0), RAB2;
+ movq RY0, (0 * 8)(RT0);
+ movq RY1, (1 * 8)(RT0);
+
+ movq RCD0, (0 * 8)(RX1);
+ movq RAB0, (1 * 8)(RX1);
+ movq RCD1, (2 * 8)(RX1);
+ movq RAB1, (3 * 8)(RX1);
+ movq RCD2, (4 * 8)(RX1);
+ movq RAB2, (5 * 8)(RX1);
+
+ movq (0 * 8)(%rsp), %rbp;
+ movq (1 * 8)(%rsp), %rbx;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ movq (4 * 8)(%rsp), %r14;
+ movq (5 * 8)(%rsp), %r15;
+ CFI_RESTORE(%rbp);
+ CFI_RESTORE(%rbx);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+ CFI_RESTORE(%r14);
+ CFI_RESTORE(%r15);
+ addq $(9 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-9 * 8);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_twofish_amd64_cbc_dec,.-_gcry_twofish_amd64_cbc_dec;)
+
+.align 8
+.globl _gcry_twofish_amd64_cfb_dec
+ELF(.type _gcry_twofish_amd64_cfb_dec,@function;)
+_gcry_twofish_amd64_cfb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (3 blocks)
+ * %rdx: src (3 blocks)
+ * %rcx: iv (128bit)
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+
+ subq $(8 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(8 * 8);
+ movq %rbp, (0 * 8)(%rsp);
+ movq %rbx, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ movq %r14, (4 * 8)(%rsp);
+ movq %r15, (5 * 8)(%rsp);
+ CFI_REL_OFFSET(%rbp, 0 * 8);
+ CFI_REL_OFFSET(%rbx, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+ CFI_REL_OFFSET(%r14, 4 * 8);
+ CFI_REL_OFFSET(%r15, 5 * 8);
+
+ movq %rsi, (6 * 8)(%rsp);
+ movq %rdx, (7 * 8)(%rsp);
+ movq %rdx, RX0;
+ movq %rcx, RX1;
+
+ /* load input */
+ movq (0 * 8)(RX1), RAB0;
+ movq (1 * 8)(RX1), RCD0;
+ movq (0 * 8)(RX0), RAB1;
+ movq (1 * 8)(RX0), RCD1;
+ movq (2 * 8)(RX0), RAB2;
+ movq (3 * 8)(RX0), RCD2;
+
+ /* Update IV */
+ movq (4 * 8)(RX0), RY0;
+ movq (5 * 8)(RX0), RY1;
+ movq RY0, (0 * 8)(RX1);
+ movq RY1, (1 * 8)(RX1);
+
+ call __twofish_enc_blk3;
+
+ movq (7 * 8)(%rsp), RX0; /*src*/
+ movq (6 * 8)(%rsp), RX1; /*dst*/
+
+ xorq (0 * 8)(RX0), RCD0;
+ xorq (1 * 8)(RX0), RAB0;
+ xorq (2 * 8)(RX0), RCD1;
+ xorq (3 * 8)(RX0), RAB1;
+ xorq (4 * 8)(RX0), RCD2;
+ xorq (5 * 8)(RX0), RAB2;
+ movq RCD0, (0 * 8)(RX1);
+ movq RAB0, (1 * 8)(RX1);
+ movq RCD1, (2 * 8)(RX1);
+ movq RAB1, (3 * 8)(RX1);
+ movq RCD2, (4 * 8)(RX1);
+ movq RAB2, (5 * 8)(RX1);
+
+ movq (0 * 8)(%rsp), %rbp;
+ movq (1 * 8)(%rsp), %rbx;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ movq (4 * 8)(%rsp), %r14;
+ movq (5 * 8)(%rsp), %r15;
+ CFI_RESTORE(%rbp);
+ CFI_RESTORE(%rbx);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+ CFI_RESTORE(%r14);
+ CFI_RESTORE(%r15);
+ addq $(8 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-8 * 8);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_twofish_amd64_cfb_dec,.-_gcry_twofish_amd64_cfb_dec;)
+
+.align 8
+.globl _gcry_twofish_amd64_ocb_enc
+ELF(.type _gcry_twofish_amd64_ocb_enc,@function;)
+_gcry_twofish_amd64_ocb_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (3 blocks)
+ * %rdx: src (3 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[3])
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_6
+
+ subq $(8 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(8 * 8);
+ movq %rbp, (0 * 8)(%rsp);
+ movq %rbx, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ movq %r14, (4 * 8)(%rsp);
+ movq %r15, (5 * 8)(%rsp);
+ CFI_REL_OFFSET(%rbp, 0 * 8);
+ CFI_REL_OFFSET(%rbx, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+ CFI_REL_OFFSET(%r14, 4 * 8);
+ CFI_REL_OFFSET(%r15, 5 * 8);
+
+ movq %rsi, (6 * 8)(%rsp);
+ movq %rdx, RX0;
+ movq %rcx, RX1;
+ movq %r8, RX2;
+ movq %r9, RY0;
+ movq %rsi, RY1;
+
+ /* Load offset */
+ movq (0 * 8)(RX1), RT0;
+ movq (1 * 8)(RX1), RT1;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ movq (RY0), RY2;
+ xorq (0 * 8)(RY2), RT0;
+ xorq (1 * 8)(RY2), RT1;
+ movq (0 * 8)(RX0), RAB0;
+ movq (1 * 8)(RX0), RCD0;
+ /* Store Offset_i */
+ movq RT0, (0 * 8)(RY1);
+ movq RT1, (1 * 8)(RY1);
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ xor RAB0, (0 * 8)(RX2);
+ xor RCD0, (1 * 8)(RX2);
+ /* PX_i = P_i xor Offset_i */
+ xorq RT0, RAB0;
+ xorq RT1, RCD0;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ movq 8(RY0), RY2;
+ xorq (0 * 8)(RY2), RT0;
+ xorq (1 * 8)(RY2), RT1;
+ movq (2 * 8)(RX0), RAB1;
+ movq (3 * 8)(RX0), RCD1;
+ /* Store Offset_i */
+ movq RT0, (2 * 8)(RY1);
+ movq RT1, (3 * 8)(RY1);
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ xor RAB1, (0 * 8)(RX2);
+ xor RCD1, (1 * 8)(RX2);
+ /* PX_i = P_i xor Offset_i */
+ xorq RT0, RAB1;
+ xorq RT1, RCD1;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ movq 16(RY0), RY2;
+ xorq (0 * 8)(RY2), RT0;
+ xorq (1 * 8)(RY2), RT1;
+ movq (4 * 8)(RX0), RAB2;
+ movq (5 * 8)(RX0), RCD2;
+ /* Store Offset_i */
+ movq RT0, (4 * 8)(RY1);
+ movq RT1, (5 * 8)(RY1);
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ xor RAB2, (0 * 8)(RX2);
+ xor RCD2, (1 * 8)(RX2);
+ /* PX_i = P_i xor Offset_i */
+ xorq RT0, RAB2;
+ xorq RT1, RCD2;
+
+ /* Store offset */
+ movq RT0, (0 * 8)(RX1);
+ movq RT1, (1 * 8)(RX1);
+
+ /* CX_i = ENCIPHER(K, PX_i) */
+ call __twofish_enc_blk3;
+
+ movq (6 * 8)(%rsp), RX1; /*dst*/
+
+ /* C_i = CX_i xor Offset_i */
+ xorq RCD0, (0 * 8)(RX1);
+ xorq RAB0, (1 * 8)(RX1);
+ xorq RCD1, (2 * 8)(RX1);
+ xorq RAB1, (3 * 8)(RX1);
+ xorq RCD2, (4 * 8)(RX1);
+ xorq RAB2, (5 * 8)(RX1);
+
+ movq (0 * 8)(%rsp), %rbp;
+ movq (1 * 8)(%rsp), %rbx;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ movq (4 * 8)(%rsp), %r14;
+ movq (5 * 8)(%rsp), %r15;
+ CFI_RESTORE(%rbp);
+ CFI_RESTORE(%rbx);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+ CFI_RESTORE(%r14);
+ CFI_RESTORE(%r15);
+ addq $(8 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-8 * 8);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_twofish_amd64_ocb_enc,.-_gcry_twofish_amd64_ocb_enc;)
+
+.align 8
+.globl _gcry_twofish_amd64_ocb_dec
+ELF(.type _gcry_twofish_amd64_ocb_dec,@function;)
+_gcry_twofish_amd64_ocb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (3 blocks)
+ * %rdx: src (3 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[3])
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_6
+
+ subq $(8 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(8 * 8);
+ movq %rbp, (0 * 8)(%rsp);
+ movq %rbx, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ movq %r14, (4 * 8)(%rsp);
+ movq %r15, (5 * 8)(%rsp);
+ CFI_REL_OFFSET(%rbp, 0 * 8);
+ CFI_REL_OFFSET(%rbx, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+ CFI_REL_OFFSET(%r14, 4 * 8);
+ CFI_REL_OFFSET(%r15, 5 * 8);
+
+ movq %rsi, (6 * 8)(%rsp);
+ movq %r8, (7 * 8)(%rsp);
+ movq %rdx, RX0;
+ movq %rcx, RX1;
+ movq %r9, RY0;
+ movq %rsi, RY1;
+
+ /* Load offset */
+ movq (0 * 8)(RX1), RT0;
+ movq (1 * 8)(RX1), RT1;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ movq (RY0), RY2;
+ xorq (0 * 8)(RY2), RT0;
+ xorq (1 * 8)(RY2), RT1;
+ movq (0 * 8)(RX0), RAB0;
+ movq (1 * 8)(RX0), RCD0;
+ /* Store Offset_i */
+ movq RT0, (0 * 8)(RY1);
+ movq RT1, (1 * 8)(RY1);
+ /* CX_i = C_i xor Offset_i */
+ xorq RT0, RAB0;
+ xorq RT1, RCD0;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ movq 8(RY0), RY2;
+ xorq (0 * 8)(RY2), RT0;
+ xorq (1 * 8)(RY2), RT1;
+ movq (2 * 8)(RX0), RAB1;
+ movq (3 * 8)(RX0), RCD1;
+ /* Store Offset_i */
+ movq RT0, (2 * 8)(RY1);
+ movq RT1, (3 * 8)(RY1);
+	/* CX_i = C_i xor Offset_i */
+ xorq RT0, RAB1;
+ xorq RT1, RCD1;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ movq 16(RY0), RY2;
+ xorq (0 * 8)(RY2), RT0;
+ xorq (1 * 8)(RY2), RT1;
+ movq (4 * 8)(RX0), RAB2;
+ movq (5 * 8)(RX0), RCD2;
+ /* Store Offset_i */
+ movq RT0, (4 * 8)(RY1);
+ movq RT1, (5 * 8)(RY1);
+	/* CX_i = C_i xor Offset_i */
+ xorq RT0, RAB2;
+ xorq RT1, RCD2;
+
+ /* Store offset */
+ movq RT0, (0 * 8)(RX1);
+ movq RT1, (1 * 8)(RX1);
+
+ /* PX_i = DECIPHER(K, CX_i) */
+ call __twofish_dec_blk3;
+
+ movq (7 * 8)(%rsp), RX2; /*checksum*/
+ movq (6 * 8)(%rsp), RX1; /*dst*/
+
+ /* Load checksum */
+ movq (0 * 8)(RX2), RT0;
+ movq (1 * 8)(RX2), RT1;
+
+ /* P_i = PX_i xor Offset_i */
+ xorq RCD0, (0 * 8)(RX1);
+ xorq RAB0, (1 * 8)(RX1);
+ xorq RCD1, (2 * 8)(RX1);
+ xorq RAB1, (3 * 8)(RX1);
+ xorq RCD2, (4 * 8)(RX1);
+ xorq RAB2, (5 * 8)(RX1);
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ xorq (0 * 8)(RX1), RT0;
+ xorq (1 * 8)(RX1), RT1;
+ xorq (2 * 8)(RX1), RT0;
+ xorq (3 * 8)(RX1), RT1;
+ xorq (4 * 8)(RX1), RT0;
+ xorq (5 * 8)(RX1), RT1;
+
+ /* Store checksum */
+ movq RT0, (0 * 8)(RX2);
+ movq RT1, (1 * 8)(RX2);
+
+ movq (0 * 8)(%rsp), %rbp;
+ movq (1 * 8)(%rsp), %rbx;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ movq (4 * 8)(%rsp), %r14;
+ movq (5 * 8)(%rsp), %r15;
+ CFI_RESTORE(%rbp);
+ CFI_RESTORE(%rbx);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+ CFI_RESTORE(%r14);
+ CFI_RESTORE(%r15);
+ addq $(8 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-8 * 8);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_twofish_amd64_ocb_dec,.-_gcry_twofish_amd64_ocb_dec;)
+
+.align 8
+.globl _gcry_twofish_amd64_ocb_auth
+ELF(.type _gcry_twofish_amd64_ocb_auth,@function;)
+_gcry_twofish_amd64_ocb_auth:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: abuf (3 blocks)
+ * %rdx: offset
+ * %rcx: checksum
+ * %r8 : L pointers (void *L[3])
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_5
+
+ subq $(8 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(8 * 8);
+ movq %rbp, (0 * 8)(%rsp);
+ movq %rbx, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ movq %r14, (4 * 8)(%rsp);
+ movq %r15, (5 * 8)(%rsp);
+ CFI_REL_OFFSET(%rbp, 0 * 8);
+ CFI_REL_OFFSET(%rbx, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+ CFI_REL_OFFSET(%r14, 4 * 8);
+ CFI_REL_OFFSET(%r15, 5 * 8);
+
+ movq %rcx, (6 * 8)(%rsp);
+ movq %rsi, RX0;
+ movq %rdx, RX1;
+ movq %r8, RY0;
+
+ /* Load offset */
+ movq (0 * 8)(RX1), RT0;
+ movq (1 * 8)(RX1), RT1;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ movq (RY0), RY2;
+ xorq (0 * 8)(RY2), RT0;
+ xorq (1 * 8)(RY2), RT1;
+ movq (0 * 8)(RX0), RAB0;
+ movq (1 * 8)(RX0), RCD0;
+ /* PX_i = P_i xor Offset_i */
+ xorq RT0, RAB0;
+ xorq RT1, RCD0;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ movq 8(RY0), RY2;
+ xorq (0 * 8)(RY2), RT0;
+ xorq (1 * 8)(RY2), RT1;
+ movq (2 * 8)(RX0), RAB1;
+ movq (3 * 8)(RX0), RCD1;
+ /* PX_i = P_i xor Offset_i */
+ xorq RT0, RAB1;
+ xorq RT1, RCD1;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ movq 16(RY0), RY2;
+ xorq (0 * 8)(RY2), RT0;
+ xorq (1 * 8)(RY2), RT1;
+ movq (4 * 8)(RX0), RAB2;
+ movq (5 * 8)(RX0), RCD2;
+ /* PX_i = P_i xor Offset_i */
+ xorq RT0, RAB2;
+ xorq RT1, RCD2;
+
+ /* Store offset */
+ movq RT0, (0 * 8)(RX1);
+ movq RT1, (1 * 8)(RX1);
+
+ /* C_i = ENCIPHER(K, PX_i) */
+ call __twofish_enc_blk3;
+
+ movq (6 * 8)(%rsp), RX1; /*checksum*/
+
+ /* Checksum_i = C_i xor Checksum_i */
+ xorq RCD0, RCD1;
+ xorq RAB0, RAB1;
+ xorq RCD1, RCD2;
+ xorq RAB1, RAB2;
+ xorq RCD2, (0 * 8)(RX1);
+ xorq RAB2, (1 * 8)(RX1);
+
+ movq (0 * 8)(%rsp), %rbp;
+ movq (1 * 8)(%rsp), %rbx;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ movq (4 * 8)(%rsp), %r14;
+ movq (5 * 8)(%rsp), %r15;
+ CFI_RESTORE(%rbp);
+ CFI_RESTORE(%rbx);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+ CFI_RESTORE(%r14);
+ CFI_RESTORE(%r15);
+ addq $(8 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-8 * 8);
+
+ EXIT_SYSV_FUNC
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_twofish_amd64_ocb_auth,.-_gcry_twofish_amd64_ocb_auth;)
+
+#endif /*USE_TWOFISH*/
+#endif /*__x86_64*/
diff --git a/comm/third_party/libgcrypt/cipher/twofish-arm.S b/comm/third_party/libgcrypt/cipher/twofish-arm.S
new file mode 100644
index 0000000000..2e1da6cd15
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/twofish-arm.S
@@ -0,0 +1,363 @@
+/* twofish-arm.S - ARM assembly implementation of Twofish cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+/* structure of TWOFISH_context: */
+#define s0 0
+#define s1 ((s0) + 4 * 256)
+#define s2 ((s1) + 4 * 256)
+#define s3 ((s2) + 4 * 256)
+#define w ((s3) + 4 * 256)
+#define k ((w) + 4 * 8)
+
+/* register macros */
+#define CTX %r0
+#define CTXs0 %r0
+#define CTXs1 %r1
+#define CTXs3 %r7
+
+#define RA %r3
+#define RB %r4
+#define RC %r5
+#define RD %r6
+
+#define RX %r2
+#define RY %ip
+
+#define RMASK %lr
+
+#define RT0 %r8
+#define RT1 %r9
+#define RT2 %r10
+#define RT3 %r11
+
+/* helper macros */
+#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
+ ldrb rout, [rsrc, #((offs) + 0)]; \
+ ldrb rtmp, [rsrc, #((offs) + 1)]; \
+ orr rout, rout, rtmp, lsl #8; \
+ ldrb rtmp, [rsrc, #((offs) + 2)]; \
+ orr rout, rout, rtmp, lsl #16; \
+ ldrb rtmp, [rsrc, #((offs) + 3)]; \
+ orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
+ mov rtmp0, rin, lsr #8; \
+ strb rin, [rdst, #((offs) + 0)]; \
+ mov rtmp1, rin, lsr #16; \
+ strb rtmp0, [rdst, #((offs) + 1)]; \
+ mov rtmp0, rin, lsr #24; \
+ strb rtmp1, [rdst, #((offs) + 2)]; \
+ strb rtmp0, [rdst, #((offs) + 3)];
+
+#ifndef __ARMEL__
+ /* bswap on big-endian */
+ #define host_to_le(reg) \
+ rev reg, reg;
+ #define le_to_host(reg) \
+ rev reg, reg;
+#else
+ /* nop on little-endian */
+ #define host_to_le(reg) /*_*/
+ #define le_to_host(reg) /*_*/
+#endif
+
+#define ldr_input_aligned_le(rin, a, b, c, d) \
+ ldr a, [rin, #0]; \
+ ldr b, [rin, #4]; \
+ le_to_host(a); \
+ ldr c, [rin, #8]; \
+ le_to_host(b); \
+ ldr d, [rin, #12]; \
+ le_to_host(c); \
+ le_to_host(d);
+
+#define str_output_aligned_le(rout, a, b, c, d) \
+ le_to_host(a); \
+ le_to_host(b); \
+ str a, [rout, #0]; \
+ le_to_host(c); \
+ str b, [rout, #4]; \
+ le_to_host(d); \
+ str c, [rout, #8]; \
+ str d, [rout, #12];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+ /* unaligned word reads/writes allowed */
+ #define ldr_input_le(rin, ra, rb, rc, rd, rtmp) \
+ ldr_input_aligned_le(rin, ra, rb, rc, rd)
+
+ #define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
+ str_output_aligned_le(rout, ra, rb, rc, rd)
+#else
+ /* need to handle unaligned reads/writes by byte reads */
+ #define ldr_input_le(rin, ra, rb, rc, rd, rtmp0) \
+ tst rin, #3; \
+ beq 1f; \
+ ldr_unaligned_le(ra, rin, 0, rtmp0); \
+ ldr_unaligned_le(rb, rin, 4, rtmp0); \
+ ldr_unaligned_le(rc, rin, 8, rtmp0); \
+ ldr_unaligned_le(rd, rin, 12, rtmp0); \
+ b 2f; \
+ 1:;\
+ ldr_input_aligned_le(rin, ra, rb, rc, rd); \
+ 2:;
+
+ #define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
+ tst rout, #3; \
+ beq 1f; \
+ str_unaligned_le(ra, rout, 0, rtmp0, rtmp1); \
+ str_unaligned_le(rb, rout, 4, rtmp0, rtmp1); \
+ str_unaligned_le(rc, rout, 8, rtmp0, rtmp1); \
+ str_unaligned_le(rd, rout, 12, rtmp0, rtmp1); \
+ b 2f; \
+ 1:;\
+ str_output_aligned_le(rout, ra, rb, rc, rd); \
+ 2:;
+#endif
+
+/**********************************************************************
+ 1-way twofish
+ **********************************************************************/
+#define encrypt_round(a, b, rc, rd, n, ror_a, adj_a) \
+ and RT0, RMASK, b, lsr#(8 - 2); \
+ and RY, RMASK, b, lsr#(16 - 2); \
+ add RT0, RT0, #(s2 - s1); \
+ and RT1, RMASK, b, lsr#(24 - 2); \
+ ldr RY, [CTXs3, RY]; \
+ and RT2, RMASK, b, lsl#(2); \
+ ldr RT0, [CTXs1, RT0]; \
+ and RT3, RMASK, a, lsr#(16 - 2 + (adj_a)); \
+ ldr RT1, [CTXs0, RT1]; \
+ and RX, RMASK, a, lsr#(8 - 2 + (adj_a)); \
+ ldr RT2, [CTXs1, RT2]; \
+ add RT3, RT3, #(s2 - s1); \
+ ldr RX, [CTXs1, RX]; \
+ ror_a(a); \
+ \
+ eor RY, RY, RT0; \
+ ldr RT3, [CTXs1, RT3]; \
+ and RT0, RMASK, a, lsl#(2); \
+ eor RY, RY, RT1; \
+ and RT1, RMASK, a, lsr#(24 - 2); \
+ eor RY, RY, RT2; \
+ ldr RT0, [CTXs0, RT0]; \
+ eor RX, RX, RT3; \
+ ldr RT1, [CTXs3, RT1]; \
+ eor RX, RX, RT0; \
+ \
+ ldr RT3, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \
+ eor RX, RX, RT1; \
+ ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \
+ \
+ add RT0, RX, RY, lsl #1; \
+ add RX, RX, RY; \
+ add RT0, RT0, RT3; \
+ add RX, RX, RT2; \
+ eor rd, RT0, rd, ror #31; \
+ eor rc, rc, RX;
+
+#define dummy(x) /*_*/
+
+#define ror1(r) \
+ ror r, r, #1;
+
+#define decrypt_round(a, b, rc, rd, n, ror_b, adj_b) \
+ and RT3, RMASK, b, lsl#(2 - (adj_b)); \
+ and RT1, RMASK, b, lsr#(8 - 2 + (adj_b)); \
+ ror_b(b); \
+ and RT2, RMASK, a, lsl#(2); \
+ and RT0, RMASK, a, lsr#(8 - 2); \
+ \
+ ldr RY, [CTXs1, RT3]; \
+ add RT1, RT1, #(s2 - s1); \
+ ldr RX, [CTXs0, RT2]; \
+ and RT3, RMASK, b, lsr#(16 - 2); \
+ ldr RT1, [CTXs1, RT1]; \
+ and RT2, RMASK, a, lsr#(16 - 2); \
+ ldr RT0, [CTXs1, RT0]; \
+ \
+ add RT2, RT2, #(s2 - s1); \
+ ldr RT3, [CTXs3, RT3]; \
+ eor RY, RY, RT1; \
+ \
+ and RT1, RMASK, b, lsr#(24 - 2); \
+ eor RX, RX, RT0; \
+ ldr RT2, [CTXs1, RT2]; \
+ and RT0, RMASK, a, lsr#(24 - 2); \
+ \
+ ldr RT1, [CTXs0, RT1]; \
+ \
+ eor RY, RY, RT3; \
+ ldr RT0, [CTXs3, RT0]; \
+ eor RX, RX, RT2; \
+ eor RY, RY, RT1; \
+ \
+ ldr RT1, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \
+ eor RX, RX, RT0; \
+ ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \
+ \
+ add RT0, RX, RY, lsl #1; \
+ add RX, RX, RY; \
+ add RT0, RT0, RT1; \
+ add RX, RX, RT2; \
+ eor rd, rd, RT0; \
+ eor rc, RX, rc, ror #31;
+
+#define first_encrypt_cycle(nc) \
+ encrypt_round(RA, RB, RC, RD, (nc) * 2, dummy, 0); \
+ encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);
+
+#define encrypt_cycle(nc) \
+ encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+ encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);
+
+#define last_encrypt_cycle(nc) \
+ encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+ encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+ ror1(RA);
+
+#define first_decrypt_cycle(nc) \
+ decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, dummy, 0); \
+ decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);
+
+#define decrypt_cycle(nc) \
+ decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+ decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);
+
+#define last_decrypt_cycle(nc) \
+ decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+ decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+ ror1(RD);
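+
+/*
+ * The 1-bit rotate that Twofish applies to one of the output words of each
+ * round is mostly deferred here: the following round compensates through
+ * the adj_a/adj_b shift corrections and executes the pending rotate with
+ * ror1(), while the first_*/last_* cycle variants handle the boundaries.
+ */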
+
+.align 3
+.globl _gcry_twofish_arm_encrypt_block
+.type _gcry_twofish_arm_encrypt_block,%function;
+
+_gcry_twofish_arm_encrypt_block:
+ /* input:
+ * %r0: ctx
+ * %r1: dst
+ * %r2: src
+ */
+ push {%r1, %r4-%r11, %ip, %lr};
+
+ add RY, CTXs0, #w;
+
+ ldr_input_le(%r2, RA, RB, RC, RD, RT0);
+
+ /* Input whitening */
+ ldm RY, {RT0, RT1, RT2, RT3};
+ add CTXs3, CTXs0, #(s3 - s0);
+ add CTXs1, CTXs0, #(s1 - s0);
+ mov RMASK, #(0xff << 2);
+ eor RA, RA, RT0;
+ eor RB, RB, RT1;
+ eor RC, RC, RT2;
+ eor RD, RD, RT3;
+
+ first_encrypt_cycle(0);
+ encrypt_cycle(1);
+ encrypt_cycle(2);
+ encrypt_cycle(3);
+ encrypt_cycle(4);
+ encrypt_cycle(5);
+ encrypt_cycle(6);
+ last_encrypt_cycle(7);
+
+ add RY, CTXs3, #(w + 4*4 - s3);
+ pop {%r1}; /* dst */
+
+ /* Output whitening */
+ ldm RY, {RT0, RT1, RT2, RT3};
+ eor RC, RC, RT0;
+ eor RD, RD, RT1;
+ eor RA, RA, RT2;
+ eor RB, RB, RT3;
+
+ str_output_le(%r1, RC, RD, RA, RB, RT0, RT1);
+
+ pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_twofish_arm_encrypt_block,.-_gcry_twofish_arm_encrypt_block;
+
+.align 3
+.globl _gcry_twofish_arm_decrypt_block
+.type _gcry_twofish_arm_decrypt_block,%function;
+
+_gcry_twofish_arm_decrypt_block:
+ /* input:
+ * %r0: ctx
+ * %r1: dst
+ * %r2: src
+ */
+ push {%r1, %r4-%r11, %ip, %lr};
+
+ add CTXs3, CTXs0, #(s3 - s0);
+
+ ldr_input_le(%r2, RC, RD, RA, RB, RT0);
+
+ add RY, CTXs3, #(w + 4*4 - s3);
+ add CTXs3, CTXs0, #(s3 - s0);
+
+ /* Input whitening */
+ ldm RY, {RT0, RT1, RT2, RT3};
+ add CTXs1, CTXs0, #(s1 - s0);
+ mov RMASK, #(0xff << 2);
+ eor RC, RC, RT0;
+ eor RD, RD, RT1;
+ eor RA, RA, RT2;
+ eor RB, RB, RT3;
+
+ first_decrypt_cycle(7);
+ decrypt_cycle(6);
+ decrypt_cycle(5);
+ decrypt_cycle(4);
+ decrypt_cycle(3);
+ decrypt_cycle(2);
+ decrypt_cycle(1);
+ last_decrypt_cycle(0);
+
+ add RY, CTXs0, #w;
+ pop {%r1}; /* dst */
+
+ /* Output whitening */
+ ldm RY, {RT0, RT1, RT2, RT3};
+ eor RA, RA, RT0;
+ eor RB, RB, RT1;
+ eor RC, RC, RT2;
+ eor RD, RD, RT3;
+
+ str_output_le(%r1, RA, RB, RC, RD, RT0, RT1);
+
+ pop {%r4-%r11, %ip, %pc};
+.size _gcry_twofish_arm_decrypt_block,.-_gcry_twofish_arm_decrypt_block;
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*__ARMEL__*/
diff --git a/comm/third_party/libgcrypt/cipher/twofish-avx2-amd64.S b/comm/third_party/libgcrypt/cipher/twofish-avx2-amd64.S
new file mode 100644
index 0000000000..74cad35589
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/twofish-avx2-amd64.S
@@ -0,0 +1,1048 @@
+/* twofish-avx2-amd64.S - AMD64/AVX2 assembly implementation of Twofish cipher
+ *
+ * Copyright (C) 2013-2017 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_TWOFISH) && \
+ defined(ENABLE_AVX2_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+.text
+
+/* structure of TWOFISH_context: */
+#define s0 0
+#define s1 ((s0) + 4 * 256)
+#define s2 ((s1) + 4 * 256)
+#define s3 ((s2) + 4 * 256)
+#define w ((s3) + 4 * 256)
+#define k ((w) + 4 * 8)
+
+/* register macros */
+#define CTX %rdi
+
+#define RROUND %rbp
+#define RROUNDd %ebp
+#define RS0 CTX
+#define RS1 %r8
+#define RS2 %r9
+#define RS3 %r10
+#define RK %r11
+#define RW %rax
+
+#define RA0 %ymm8
+#define RB0 %ymm9
+#define RC0 %ymm10
+#define RD0 %ymm11
+#define RA1 %ymm12
+#define RB1 %ymm13
+#define RC1 %ymm14
+#define RD1 %ymm15
+
+/* temp regs */
+#define RX0 %ymm0
+#define RY0 %ymm1
+#define RX1 %ymm2
+#define RY1 %ymm3
+#define RT0 %ymm4
+#define RIDX %ymm5
+
+#define RX0x %xmm0
+#define RY0x %xmm1
+#define RX1x %xmm2
+#define RY1x %xmm3
+#define RT0x %xmm4
+#define RIDXx %xmm5
+
+#define RTMP0 RX0
+#define RTMP0x RX0x
+#define RTMP1 RX1
+#define RTMP1x RX1x
+#define RTMP2 RY0
+#define RTMP2x RY0x
+#define RTMP3 RY1
+#define RTMP3x RY1x
+#define RTMP4 RIDX
+#define RTMP4x RIDXx
+
+/* vpgatherdd mask and '-1' */
+#define RNOT %ymm6
+#define RNOTx %xmm6
+
+/* byte mask, (-1 >> 24) */
+#define RBYTE %ymm7
+
+/**********************************************************************
+ 16-way AVX2 twofish
+ **********************************************************************/
+#define init_round_constants() \
+ vpcmpeqd RNOT, RNOT, RNOT; \
+ leaq k(CTX), RK; \
+ leaq w(CTX), RW; \
+ vpsrld $24, RNOT, RBYTE; \
+ leaq s1(CTX), RS1; \
+ leaq s2(CTX), RS2; \
+ leaq s3(CTX), RS3; \
+
+#define g16(ab, rs0, rs1, rs2, rs3, xy) \
+ vpand RBYTE, ab ## 0, RIDX; \
+ vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \
+ vpcmpeqd RNOT, RNOT, RNOT; \
+ \
+ vpand RBYTE, ab ## 1, RIDX; \
+ vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \
+ vpcmpeqd RNOT, RNOT, RNOT; \
+ \
+ vpsrld $8, ab ## 0, RIDX; \
+ vpand RBYTE, RIDX, RIDX; \
+ vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
+ vpcmpeqd RNOT, RNOT, RNOT; \
+ vpxor RT0, xy ## 0, xy ## 0; \
+ \
+ vpsrld $8, ab ## 1, RIDX; \
+ vpand RBYTE, RIDX, RIDX; \
+ vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
+ vpcmpeqd RNOT, RNOT, RNOT; \
+ vpxor RT0, xy ## 1, xy ## 1; \
+ \
+ vpsrld $16, ab ## 0, RIDX; \
+ vpand RBYTE, RIDX, RIDX; \
+ vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
+ vpcmpeqd RNOT, RNOT, RNOT; \
+ vpxor RT0, xy ## 0, xy ## 0; \
+ \
+ vpsrld $16, ab ## 1, RIDX; \
+ vpand RBYTE, RIDX, RIDX; \
+ vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
+ vpcmpeqd RNOT, RNOT, RNOT; \
+ vpxor RT0, xy ## 1, xy ## 1; \
+ \
+ vpsrld $24, ab ## 0, RIDX; \
+ vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
+ vpcmpeqd RNOT, RNOT, RNOT; \
+ vpxor RT0, xy ## 0, xy ## 0; \
+ \
+ vpsrld $24, ab ## 1, RIDX; \
+ vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
+ vpcmpeqd RNOT, RNOT, RNOT; \
+ vpxor RT0, xy ## 1, xy ## 1;
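+
+/*
+ * Note: vpgatherdd clears its mask register, so RNOT has to be reset to
+ * all-ones with vpcmpeqd after every gather above before it can serve as
+ * the mask for the next lookup.
+ */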
+
+#define g1_16(a, x) \
+ g16(a, RS0, RS1, RS2, RS3, x);
+
+#define g2_16(b, y) \
+ g16(b, RS1, RS2, RS3, RS0, y);
+
+#define encrypt_round_end16(a, b, c, d, nk, r) \
+ vpaddd RY0, RX0, RX0; \
+ vpaddd RX0, RY0, RY0; \
+ vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+ vpaddd RT0, RX0, RX0; \
+ vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+ vpaddd RT0, RY0, RY0; \
+ \
+ vpxor RY0, d ## 0, d ## 0; \
+ \
+ vpxor RX0, c ## 0, c ## 0; \
+ vpsrld $1, c ## 0, RT0; \
+ vpslld $31, c ## 0, c ## 0; \
+ vpor RT0, c ## 0, c ## 0; \
+ \
+ vpaddd RY1, RX1, RX1; \
+ vpaddd RX1, RY1, RY1; \
+ vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+ vpaddd RT0, RX1, RX1; \
+ vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+ vpaddd RT0, RY1, RY1; \
+ \
+ vpxor RY1, d ## 1, d ## 1; \
+ \
+ vpxor RX1, c ## 1, c ## 1; \
+ vpsrld $1, c ## 1, RT0; \
+ vpslld $31, c ## 1, c ## 1; \
+ vpor RT0, c ## 1, c ## 1; \
+
+#define encrypt_round16(a, b, c, d, nk, r) \
+ g2_16(b, RY); \
+ \
+ vpslld $1, b ## 0, RT0; \
+ vpsrld $31, b ## 0, b ## 0; \
+ vpor RT0, b ## 0, b ## 0; \
+ \
+ vpslld $1, b ## 1, RT0; \
+ vpsrld $31, b ## 1, b ## 1; \
+ vpor RT0, b ## 1, b ## 1; \
+ \
+ g1_16(a, RX); \
+ \
+ encrypt_round_end16(a, b, c, d, nk, r);
+
+#define encrypt_round_first16(a, b, c, d, nk, r) \
+ vpslld $1, d ## 0, RT0; \
+ vpsrld $31, d ## 0, d ## 0; \
+ vpor RT0, d ## 0, d ## 0; \
+ \
+ vpslld $1, d ## 1, RT0; \
+ vpsrld $31, d ## 1, d ## 1; \
+ vpor RT0, d ## 1, d ## 1; \
+ \
+ encrypt_round16(a, b, c, d, nk, r);
+
+#define encrypt_round_last16(a, b, c, d, nk, r) \
+ g2_16(b, RY); \
+ \
+ g1_16(a, RX); \
+ \
+ encrypt_round_end16(a, b, c, d, nk, r);
+
+#define decrypt_round_end16(a, b, c, d, nk, r) \
+ vpaddd RY0, RX0, RX0; \
+ vpaddd RX0, RY0, RY0; \
+ vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+ vpaddd RT0, RX0, RX0; \
+ vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+ vpaddd RT0, RY0, RY0; \
+ \
+ vpxor RX0, c ## 0, c ## 0; \
+ \
+ vpxor RY0, d ## 0, d ## 0; \
+ vpsrld $1, d ## 0, RT0; \
+ vpslld $31, d ## 0, d ## 0; \
+ vpor RT0, d ## 0, d ## 0; \
+ \
+ vpaddd RY1, RX1, RX1; \
+ vpaddd RX1, RY1, RY1; \
+ vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+ vpaddd RT0, RX1, RX1; \
+ vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+ vpaddd RT0, RY1, RY1; \
+ \
+ vpxor RX1, c ## 1, c ## 1; \
+ \
+ vpxor RY1, d ## 1, d ## 1; \
+ vpsrld $1, d ## 1, RT0; \
+ vpslld $31, d ## 1, d ## 1; \
+ vpor RT0, d ## 1, d ## 1;
+
+#define decrypt_round16(a, b, c, d, nk, r) \
+ g1_16(a, RX); \
+ \
+ vpslld $1, a ## 0, RT0; \
+ vpsrld $31, a ## 0, a ## 0; \
+ vpor RT0, a ## 0, a ## 0; \
+ \
+ vpslld $1, a ## 1, RT0; \
+ vpsrld $31, a ## 1, a ## 1; \
+ vpor RT0, a ## 1, a ## 1; \
+ \
+ g2_16(b, RY); \
+ \
+ decrypt_round_end16(a, b, c, d, nk, r);
+
+#define decrypt_round_first16(a, b, c, d, nk, r) \
+ vpslld $1, c ## 0, RT0; \
+ vpsrld $31, c ## 0, c ## 0; \
+ vpor RT0, c ## 0, c ## 0; \
+ \
+ vpslld $1, c ## 1, RT0; \
+ vpsrld $31, c ## 1, c ## 1; \
+ vpor RT0, c ## 1, c ## 1; \
+ \
+ decrypt_round16(a, b, c, d, nk, r)
+
+#define decrypt_round_last16(a, b, c, d, nk, r) \
+ g1_16(a, RX); \
+ \
+ g2_16(b, RY); \
+ \
+ decrypt_round_end16(a, b, c, d, nk, r);
+
+#define encrypt_cycle16(r) \
+ encrypt_round16(RA, RB, RC, RD, 0, r); \
+ encrypt_round16(RC, RD, RA, RB, 8, r);
+
+#define encrypt_cycle_first16(r) \
+ encrypt_round_first16(RA, RB, RC, RD, 0, r); \
+ encrypt_round16(RC, RD, RA, RB, 8, r);
+
+#define encrypt_cycle_last16(r) \
+ encrypt_round16(RA, RB, RC, RD, 0, r); \
+ encrypt_round_last16(RC, RD, RA, RB, 8, r);
+
+#define decrypt_cycle16(r) \
+ decrypt_round16(RC, RD, RA, RB, 8, r); \
+ decrypt_round16(RA, RB, RC, RD, 0, r);
+
+#define decrypt_cycle_first16(r) \
+ decrypt_round_first16(RC, RD, RA, RB, 8, r); \
+ decrypt_round16(RA, RB, RC, RD, 0, r);
+
+#define decrypt_cycle_last16(r) \
+ decrypt_round16(RC, RD, RA, RB, 8, r); \
+ decrypt_round_last16(RA, RB, RC, RD, 0, r);
+
+#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
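+
+/*
+ * transpose_4x4 transposes a 4x4 matrix of 32-bit words held across
+ * x0..x3 (per 128-bit lane), so that afterwards each register holds the
+ * same word position from four different blocks.
+ */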
+
+#define read_blocks8(offs,a,b,c,d) \
+ vmovdqu 16*offs(RIO), a; \
+ vmovdqu 16*offs+32(RIO), b; \
+ vmovdqu 16*offs+64(RIO), c; \
+ vmovdqu 16*offs+96(RIO), d; \
+ \
+ transpose_4x4(a, b, c, d, RX0, RY0);
+
+#define write_blocks8(offs,a,b,c,d) \
+ transpose_4x4(a, b, c, d, RX0, RY0); \
+ \
+ vmovdqu a, 16*offs(RIO); \
+ vmovdqu b, 16*offs+32(RIO); \
+ vmovdqu c, 16*offs+64(RIO); \
+ vmovdqu d, 16*offs+96(RIO);
+
+#define inpack_enc8(a,b,c,d) \
+ vpbroadcastd 4*0(RW), RT0; \
+ vpxor RT0, a, a; \
+ \
+ vpbroadcastd 4*1(RW), RT0; \
+ vpxor RT0, b, b; \
+ \
+ vpbroadcastd 4*2(RW), RT0; \
+ vpxor RT0, c, c; \
+ \
+ vpbroadcastd 4*3(RW), RT0; \
+ vpxor RT0, d, d;
+
+#define outunpack_enc8(a,b,c,d) \
+ vpbroadcastd 4*4(RW), RX0; \
+ vpbroadcastd 4*5(RW), RY0; \
+ vpxor RX0, c, RX0; \
+ vpxor RY0, d, RY0; \
+ \
+ vpbroadcastd 4*6(RW), RT0; \
+ vpxor RT0, a, c; \
+ vpbroadcastd 4*7(RW), RT0; \
+ vpxor RT0, b, d; \
+ \
+ vmovdqa RX0, a; \
+ vmovdqa RY0, b;
+
+#define inpack_dec8(a,b,c,d) \
+ vpbroadcastd 4*4(RW), RX0; \
+ vpbroadcastd 4*5(RW), RY0; \
+ vpxor RX0, a, RX0; \
+ vpxor RY0, b, RY0; \
+ \
+ vpbroadcastd 4*6(RW), RT0; \
+ vpxor RT0, c, a; \
+ vpbroadcastd 4*7(RW), RT0; \
+ vpxor RT0, d, b; \
+ \
+ vmovdqa RX0, c; \
+ vmovdqa RY0, d;
+
+#define outunpack_dec8(a,b,c,d) \
+ vpbroadcastd 4*0(RW), RT0; \
+ vpxor RT0, a, a; \
+ \
+ vpbroadcastd 4*1(RW), RT0; \
+ vpxor RT0, b, b; \
+ \
+ vpbroadcastd 4*2(RW), RT0; \
+ vpxor RT0, c, c; \
+ \
+ vpbroadcastd 4*3(RW), RT0; \
+ vpxor RT0, d, d;
+
+#define transpose4x4_16(a,b,c,d) \
+ transpose_4x4(a ## 0, b ## 0, c ## 0, d ## 0, RX0, RY0); \
+ transpose_4x4(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0);
+
+#define inpack_enc16(a,b,c,d) \
+ inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
+ inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define outunpack_enc16(a,b,c,d) \
+ outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
+ outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define inpack_dec16(a,b,c,d) \
+ inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
+ inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define outunpack_dec16(a,b,c,d) \
+ outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
+ outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+.align 8
+ELF(.type __twofish_enc_blk16,@function;)
+__twofish_enc_blk16:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel
+ * plaintext blocks
+ * output:
+ * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel
+ * ciphertext blocks
+ */
+ CFI_STARTPROC();
+ init_round_constants();
+
+ transpose4x4_16(RA, RB, RC, RD);
+ inpack_enc16(RA, RB, RC, RD);
+
+ encrypt_cycle_first16(0);
+ encrypt_cycle16(2);
+ encrypt_cycle16(4);
+ encrypt_cycle16(6);
+ encrypt_cycle16(8);
+ encrypt_cycle16(10);
+ encrypt_cycle16(12);
+ encrypt_cycle_last16(14);
+
+ outunpack_enc16(RA, RB, RC, RD);
+ transpose4x4_16(RA, RB, RC, RD);
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size __twofish_enc_blk16,.-__twofish_enc_blk16;)
+
+.align 8
+ELF(.type __twofish_dec_blk16,@function;)
+__twofish_dec_blk16:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel
+	 *						ciphertext blocks
+ * output:
+ * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel
+	 *						plaintext blocks
+ */
+ CFI_STARTPROC();
+ init_round_constants();
+
+ transpose4x4_16(RA, RB, RC, RD);
+ inpack_dec16(RA, RB, RC, RD);
+
+ decrypt_cycle_first16(14);
+ decrypt_cycle16(12);
+ decrypt_cycle16(10);
+ decrypt_cycle16(8);
+ decrypt_cycle16(6);
+ decrypt_cycle16(4);
+ decrypt_cycle16(2);
+ decrypt_cycle_last16(0);
+
+ outunpack_dec16(RA, RB, RC, RD);
+ transpose4x4_16(RA, RB, RC, RD);
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size __twofish_dec_blk16,.-__twofish_dec_blk16;)
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
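+
+/*
+ * inc_le128 expects 'minus_one' to hold -1 only in the low qword of each
+ * 128-bit lane (as set up in the callers below): the vpsubq then adds 1 to
+ * the low qword, and the vpcmpeqq/vpslldq pair propagates a carry into the
+ * high qword when the low qword wraps from all-ones to zero.
+ */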
+
+.align 8
+.globl _gcry_twofish_avx2_ctr_enc
+ELF(.type _gcry_twofish_avx2_ctr_enc,@function;)
+_gcry_twofish_avx2_ctr_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (big endian, 128bit)
+ */
+ CFI_STARTPROC();
+
+ movq 8(%rcx), %rax;
+ bswapq %rax;
+
+ vzeroupper;
+
+ vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
+ vpcmpeqd RNOT, RNOT, RNOT;
+ vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */
+ vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */
+
+ /* load IV and byteswap */
+ vmovdqu (%rcx), RTMP4x;
+ vpshufb RTMP3x, RTMP4x, RTMP4x;
+ vmovdqa RTMP4x, RTMP0x;
+ inc_le128(RTMP4x, RNOTx, RTMP1x);
+ vinserti128 $1, RTMP4x, RTMP0, RTMP0;
+ vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */
+
+ /* check need for handling 64-bit overflow and carry */
+ cmpq $(0xffffffffffffffff - 16), %rax;
+ ja .Lhandle_ctr_carry;
+
+ /* construct IVs */
+ vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */
+ vpshufb RTMP3, RTMP0, RB0;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */
+ vpshufb RTMP3, RTMP0, RC0;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */
+ vpshufb RTMP3, RTMP0, RD0;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */
+ vpshufb RTMP3, RTMP0, RA1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */
+ vpshufb RTMP3, RTMP0, RB1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */
+ vpshufb RTMP3, RTMP0, RC1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */
+ vpshufb RTMP3, RTMP0, RD1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
+ vpshufb RTMP3x, RTMP0x, RTMP0x;
+
+ jmp .Lctr_carry_done;
+
+.Lhandle_ctr_carry:
+ /* construct IVs */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB0; /* +3 ; +2 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RC0; /* +5 ; +4 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RD0; /* +7 ; +6 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA1; /* +9 ; +8 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RC1; /* +13 ; +12 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RD1; /* +15 ; +14 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vextracti128 $1, RTMP0, RTMP0x;
+ vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */
+
+.align 4
+.Lctr_carry_done:
+ /* store new IV */
+ vmovdqu RTMP0x, (%rcx);
+
+ call __twofish_enc_blk16;
+
+ vpxor (0 * 32)(%rdx), RA0, RA0;
+ vpxor (1 * 32)(%rdx), RB0, RB0;
+ vpxor (2 * 32)(%rdx), RC0, RC0;
+ vpxor (3 * 32)(%rdx), RD0, RD0;
+ vpxor (4 * 32)(%rdx), RA1, RA1;
+ vpxor (5 * 32)(%rdx), RB1, RB1;
+ vpxor (6 * 32)(%rdx), RC1, RC1;
+ vpxor (7 * 32)(%rdx), RD1, RD1;
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RB0, (1 * 32)(%rsi);
+ vmovdqu RC0, (2 * 32)(%rsi);
+ vmovdqu RD0, (3 * 32)(%rsi);
+ vmovdqu RA1, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RC1, (6 * 32)(%rsi);
+ vmovdqu RD1, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_twofish_avx2_ctr_enc,.-_gcry_twofish_avx2_ctr_enc;)
+
+.align 8
+.globl _gcry_twofish_avx2_cbc_dec
+ELF(.type _gcry_twofish_avx2_cbc_dec,@function;)
+_gcry_twofish_avx2_cbc_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ vmovdqu (0 * 32)(%rdx), RA0;
+ vmovdqu (1 * 32)(%rdx), RB0;
+ vmovdqu (2 * 32)(%rdx), RC0;
+ vmovdqu (3 * 32)(%rdx), RD0;
+ vmovdqu (4 * 32)(%rdx), RA1;
+ vmovdqu (5 * 32)(%rdx), RB1;
+ vmovdqu (6 * 32)(%rdx), RC1;
+ vmovdqu (7 * 32)(%rdx), RD1;
+
+ call __twofish_dec_blk16;
+
+ vmovdqu (%rcx), RNOTx;
+ vinserti128 $1, (%rdx), RNOT, RNOT;
+ vpxor RNOT, RA0, RA0;
+ vpxor (0 * 32 + 16)(%rdx), RB0, RB0;
+ vpxor (1 * 32 + 16)(%rdx), RC0, RC0;
+ vpxor (2 * 32 + 16)(%rdx), RD0, RD0;
+ vpxor (3 * 32 + 16)(%rdx), RA1, RA1;
+ vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
+ vpxor (5 * 32 + 16)(%rdx), RC1, RC1;
+ vpxor (6 * 32 + 16)(%rdx), RD1, RD1;
+ vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+ vmovdqu RNOTx, (%rcx); /* store new IV */
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RB0, (1 * 32)(%rsi);
+ vmovdqu RC0, (2 * 32)(%rsi);
+ vmovdqu RD0, (3 * 32)(%rsi);
+ vmovdqu RA1, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RC1, (6 * 32)(%rsi);
+ vmovdqu RD1, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_twofish_avx2_cbc_dec,.-_gcry_twofish_avx2_cbc_dec;)
+
+.align 8
+.globl _gcry_twofish_avx2_cfb_dec
+ELF(.type _gcry_twofish_avx2_cfb_dec,@function;)
+_gcry_twofish_avx2_cfb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ /* Load input */
+ vmovdqu (%rcx), RNOTx;
+ vinserti128 $1, (%rdx), RNOT, RA0;
+ vmovdqu (0 * 32 + 16)(%rdx), RB0;
+ vmovdqu (1 * 32 + 16)(%rdx), RC0;
+ vmovdqu (2 * 32 + 16)(%rdx), RD0;
+ vmovdqu (3 * 32 + 16)(%rdx), RA1;
+ vmovdqu (4 * 32 + 16)(%rdx), RB1;
+ vmovdqu (5 * 32 + 16)(%rdx), RC1;
+ vmovdqu (6 * 32 + 16)(%rdx), RD1;
+
+ /* Update IV */
+ vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+ vmovdqu RNOTx, (%rcx);
+
+ call __twofish_enc_blk16;
+
+ vpxor (0 * 32)(%rdx), RA0, RA0;
+ vpxor (1 * 32)(%rdx), RB0, RB0;
+ vpxor (2 * 32)(%rdx), RC0, RC0;
+ vpxor (3 * 32)(%rdx), RD0, RD0;
+ vpxor (4 * 32)(%rdx), RA1, RA1;
+ vpxor (5 * 32)(%rdx), RB1, RB1;
+ vpxor (6 * 32)(%rdx), RC1, RC1;
+ vpxor (7 * 32)(%rdx), RD1, RD1;
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RB0, (1 * 32)(%rsi);
+ vmovdqu RC0, (2 * 32)(%rsi);
+ vmovdqu RD0, (3 * 32)(%rsi);
+ vmovdqu RA1, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RC1, (6 * 32)(%rsi);
+ vmovdqu RD1, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_twofish_avx2_cfb_dec,.-_gcry_twofish_avx2_cfb_dec;)
+
+.align 8
+.globl _gcry_twofish_avx2_ocb_enc
+ELF(.type _gcry_twofish_avx2_ocb_enc,@function;)
+
+_gcry_twofish_avx2_ocb_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[16])
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ subq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ vmovdqu (%rcx), RTMP0x;
+ vmovdqu (%r8), RTMP1x;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rdx), yreg; \
+ vpxor (l0reg), RTMP0x, RNOTx; \
+ vpxor (l1reg), RNOTx, RTMP0x; \
+ vinserti128 $1, RTMP0x, RNOT, RNOT; \
+ vpxor yreg, RTMP1, RTMP1; \
+ vpxor yreg, RNOT, yreg; \
+ vmovdqu RNOT, (n * 32)(%rsi);
+
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %r11, RA0);
+ OCB_INPUT(1, %r12, %r13, RB0);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(2, %r10, %r11, RC0);
+ OCB_INPUT(3, %r12, %r13, RD0);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %r11, RA1);
+ OCB_INPUT(5, %r12, %r13, RB1);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(6, %r10, %r11, RC1);
+ OCB_INPUT(7, %r12, %r13, RD1);
+#undef OCB_INPUT
+
+ vextracti128 $1, RTMP1, RNOTx;
+ vmovdqu RTMP0x, (%rcx);
+ vpxor RNOTx, RTMP1x, RTMP1x;
+ vmovdqu RTMP1x, (%r8);
+
+ movq (0 * 8)(%rsp), %r10;
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ call __twofish_enc_blk16;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ vpxor (0 * 32)(%rsi), RA0, RA0;
+ vpxor (1 * 32)(%rsi), RB0, RB0;
+ vpxor (2 * 32)(%rsi), RC0, RC0;
+ vpxor (3 * 32)(%rsi), RD0, RD0;
+ vpxor (4 * 32)(%rsi), RA1, RA1;
+ vpxor (5 * 32)(%rsi), RB1, RB1;
+ vpxor (6 * 32)(%rsi), RC1, RC1;
+ vpxor (7 * 32)(%rsi), RD1, RD1;
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RB0, (1 * 32)(%rsi);
+ vmovdqu RC0, (2 * 32)(%rsi);
+ vmovdqu RD0, (3 * 32)(%rsi);
+ vmovdqu RA1, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RC1, (6 * 32)(%rsi);
+ vmovdqu RD1, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_twofish_avx2_ocb_enc,.-_gcry_twofish_avx2_ocb_enc;)
+
+.align 8
+.globl _gcry_twofish_avx2_ocb_dec
+ELF(.type _gcry_twofish_avx2_ocb_dec,@function;)
+
+_gcry_twofish_avx2_ocb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[16])
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ subq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ vmovdqu (%rcx), RTMP0x;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rdx), yreg; \
+ vpxor (l0reg), RTMP0x, RNOTx; \
+ vpxor (l1reg), RNOTx, RTMP0x; \
+ vinserti128 $1, RTMP0x, RNOT, RNOT; \
+ vpxor yreg, RNOT, yreg; \
+ vmovdqu RNOT, (n * 32)(%rsi);
+
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %r11, RA0);
+ OCB_INPUT(1, %r12, %r13, RB0);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(2, %r10, %r11, RC0);
+ OCB_INPUT(3, %r12, %r13, RD0);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %r11, RA1);
+ OCB_INPUT(5, %r12, %r13, RB1);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(6, %r10, %r11, RC1);
+ OCB_INPUT(7, %r12, %r13, RD1);
+#undef OCB_INPUT
+
+ vmovdqu RTMP0x, (%rcx);
+	movq %r8, %rcx;
+
+ movq (0 * 8)(%rsp), %r10;
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ call __twofish_dec_blk16;
+
+ vmovdqu (%rcx), RTMP1x;
+
+ vpxor (0 * 32)(%rsi), RA0, RA0;
+ vpxor (1 * 32)(%rsi), RB0, RB0;
+ vpxor (2 * 32)(%rsi), RC0, RC0;
+ vpxor (3 * 32)(%rsi), RD0, RD0;
+ vpxor (4 * 32)(%rsi), RA1, RA1;
+ vpxor (5 * 32)(%rsi), RB1, RB1;
+ vpxor (6 * 32)(%rsi), RC1, RC1;
+ vpxor (7 * 32)(%rsi), RD1, RD1;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vpxor RA0, RTMP1, RTMP1;
+ vmovdqu RB0, (1 * 32)(%rsi);
+ vpxor RB0, RTMP1, RTMP1;
+ vmovdqu RC0, (2 * 32)(%rsi);
+ vpxor RC0, RTMP1, RTMP1;
+ vmovdqu RD0, (3 * 32)(%rsi);
+ vpxor RD0, RTMP1, RTMP1;
+ vmovdqu RA1, (4 * 32)(%rsi);
+ vpxor RA1, RTMP1, RTMP1;
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vpxor RB1, RTMP1, RTMP1;
+ vmovdqu RC1, (6 * 32)(%rsi);
+ vpxor RC1, RTMP1, RTMP1;
+ vmovdqu RD1, (7 * 32)(%rsi);
+ vpxor RD1, RTMP1, RTMP1;
+
+ vextracti128 $1, RTMP1, RNOTx;
+ vpxor RNOTx, RTMP1x, RTMP1x;
+ vmovdqu RTMP1x, (%rcx);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_twofish_avx2_ocb_dec,.-_gcry_twofish_avx2_ocb_dec;)
+
+.align 8
+.globl _gcry_twofish_avx2_ocb_auth
+ELF(.type _gcry_twofish_avx2_ocb_auth,@function;)
+
+_gcry_twofish_avx2_ocb_auth:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: abuf (16 blocks)
+ * %rdx: offset
+ * %rcx: checksum
+ * %r8 : L pointers (void *L[16])
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+
+ subq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ vmovdqu (%rdx), RTMP0x;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rsi), yreg; \
+ vpxor (l0reg), RTMP0x, RNOTx; \
+ vpxor (l1reg), RNOTx, RTMP0x; \
+ vinserti128 $1, RTMP0x, RNOT, RNOT; \
+ vpxor yreg, RNOT, yreg;
+
+ movq (0 * 8)(%r8), %r10;
+ movq (1 * 8)(%r8), %r11;
+ movq (2 * 8)(%r8), %r12;
+ movq (3 * 8)(%r8), %r13;
+ OCB_INPUT(0, %r10, %r11, RA0);
+ OCB_INPUT(1, %r12, %r13, RB0);
+ movq (4 * 8)(%r8), %r10;
+ movq (5 * 8)(%r8), %r11;
+ movq (6 * 8)(%r8), %r12;
+ movq (7 * 8)(%r8), %r13;
+ OCB_INPUT(2, %r10, %r11, RC0);
+ OCB_INPUT(3, %r12, %r13, RD0);
+ movq (8 * 8)(%r8), %r10;
+ movq (9 * 8)(%r8), %r11;
+ movq (10 * 8)(%r8), %r12;
+ movq (11 * 8)(%r8), %r13;
+ OCB_INPUT(4, %r10, %r11, RA1);
+ OCB_INPUT(5, %r12, %r13, RB1);
+ movq (12 * 8)(%r8), %r10;
+ movq (13 * 8)(%r8), %r11;
+ movq (14 * 8)(%r8), %r12;
+ movq (15 * 8)(%r8), %r13;
+ OCB_INPUT(6, %r10, %r11, RC1);
+ OCB_INPUT(7, %r12, %r13, RD1);
+#undef OCB_INPUT
+
+ vmovdqu RTMP0x, (%rdx);
+
+ movq (0 * 8)(%rsp), %r10;
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ call __twofish_enc_blk16;
+
+ vpxor RA0, RB0, RA0;
+ vpxor RC0, RD0, RC0;
+ vpxor RA1, RB1, RA1;
+ vpxor RC1, RD1, RC1;
+
+ vpxor RA0, RC0, RA0;
+ vpxor RA1, RC1, RA1;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ vpxor RA1, RA0, RTMP1;
+
+ vextracti128 $1, RTMP1, RNOTx;
+ vpxor (%rcx), RTMP1x, RTMP1x;
+ vpxor RNOTx, RTMP1x, RTMP1x;
+ vmovdqu RTMP1x, (%rcx);
+
+ vzeroall;
+
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_twofish_avx2_ocb_auth,.-_gcry_twofish_avx2_ocb_auth;)
+
+.align 16
+
+/* For CTR-mode IV byteswap */
+ _gcry_twofish_bswap128_mask:
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+ELF(.size _gcry_twofish_bswap128_mask,.-_gcry_twofish_bswap128_mask;)
+
+#endif /*defined(USE_TWOFISH) && defined(ENABLE_AVX2_SUPPORT)*/
+#endif /*__x86_64*/
diff --git a/comm/third_party/libgcrypt/cipher/twofish.c b/comm/third_party/libgcrypt/cipher/twofish.c
new file mode 100644
index 0000000000..d19e079046
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/twofish.c
@@ -0,0 +1,1793 @@
+/* Twofish for GPG
+ * Copyright (C) 1998, 2002, 2003 Free Software Foundation, Inc.
+ * Written by Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
+ * 256-bit key length added March 20, 1999
+ * Some modifications to reduce the text size by Werner Koch, April, 1998
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ ********************************************************************
+ *
+ * This code is a "clean room" implementation, written from the paper
+ * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
+ * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
+ * through http://www.counterpane.com/twofish.html
+ *
+ * For background information on multiplication in finite fields, used for
+ * the matrix operations in the key schedule, see the book _Contemporary
+ * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
+ * Third Edition.
+ *
+ * Only the 128- and 256-bit key sizes are supported. This code is intended
+ * for GNU C on a 32-bit system, but it should work almost anywhere. Loops
+ * are unrolled, precomputation tables are used, etc., for maximum speed at
+ * some cost in memory consumption. */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* for memcmp() */
+
+#include "types.h" /* for byte and u32 typedefs */
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-internal.h"
+#include "cipher-selftest.h"
+
+
+#define TWOFISH_BLOCKSIZE 16
+
+
+/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
+#undef USE_AMD64_ASM
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AMD64_ASM 1
+#endif
+
+/* USE_ARM_ASM indicates whether to use ARM assembly code. */
+#undef USE_ARM_ASM
+#if defined(__ARMEL__)
+# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS)
+# define USE_ARM_ASM 1
+# endif
+#endif
+# if defined(__AARCH64EL__)
+# ifdef HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS
+# define USE_ARM_ASM 1
+# endif
+# endif
+
+/* USE_AVX2 indicates whether to compile with AMD64 AVX2 code. */
+#undef USE_AVX2
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# if defined(ENABLE_AVX2_SUPPORT)
+# define USE_AVX2 1
+# endif
+#endif
+
+
+/* Prototype for the self-test function. */
+static const char *selftest(void);
+
+
+/* Prototypes for the bulk functions. */
+static void _gcry_twofish_ctr_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+static void _gcry_twofish_cbc_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+static void _gcry_twofish_cfb_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+static size_t _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
+static size_t _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+ size_t nblocks);
+
+
+/* Structure for an expanded Twofish key. s contains the key-dependent
+ * S-boxes composed with the MDS matrix; w contains the eight "whitening"
+ * subkeys, K[0] through K[7]. k holds the remaining, "round" subkeys. Note
+ * that k[i] corresponds to what the Twofish paper calls K[i+8]. */
+typedef struct {
+ u32 s[4][256], w[8], k[32];
+
+#ifdef USE_AVX2
+ int use_avx2;
+#endif
+} TWOFISH_context;
+
+
+/* Assembly implementations use the SysV ABI.  On Win64, an ABI conversion
+ * and additional stack space to store XMM6-XMM15 are needed. */
+#undef ASM_FUNC_ABI
+#if defined(USE_AVX2)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# else
+# define ASM_FUNC_ABI
+# endif
+#endif
+
+
+/* These two tables are the q0 and q1 permutations, exactly as described in
+ * the Twofish paper. */
+
+static const byte q0[256] = {
+ 0xA9, 0x67, 0xB3, 0xE8, 0x04, 0xFD, 0xA3, 0x76, 0x9A, 0x92, 0x80, 0x78,
+ 0xE4, 0xDD, 0xD1, 0x38, 0x0D, 0xC6, 0x35, 0x98, 0x18, 0xF7, 0xEC, 0x6C,
+ 0x43, 0x75, 0x37, 0x26, 0xFA, 0x13, 0x94, 0x48, 0xF2, 0xD0, 0x8B, 0x30,
+ 0x84, 0x54, 0xDF, 0x23, 0x19, 0x5B, 0x3D, 0x59, 0xF3, 0xAE, 0xA2, 0x82,
+ 0x63, 0x01, 0x83, 0x2E, 0xD9, 0x51, 0x9B, 0x7C, 0xA6, 0xEB, 0xA5, 0xBE,
+ 0x16, 0x0C, 0xE3, 0x61, 0xC0, 0x8C, 0x3A, 0xF5, 0x73, 0x2C, 0x25, 0x0B,
+ 0xBB, 0x4E, 0x89, 0x6B, 0x53, 0x6A, 0xB4, 0xF1, 0xE1, 0xE6, 0xBD, 0x45,
+ 0xE2, 0xF4, 0xB6, 0x66, 0xCC, 0x95, 0x03, 0x56, 0xD4, 0x1C, 0x1E, 0xD7,
+ 0xFB, 0xC3, 0x8E, 0xB5, 0xE9, 0xCF, 0xBF, 0xBA, 0xEA, 0x77, 0x39, 0xAF,
+ 0x33, 0xC9, 0x62, 0x71, 0x81, 0x79, 0x09, 0xAD, 0x24, 0xCD, 0xF9, 0xD8,
+ 0xE5, 0xC5, 0xB9, 0x4D, 0x44, 0x08, 0x86, 0xE7, 0xA1, 0x1D, 0xAA, 0xED,
+ 0x06, 0x70, 0xB2, 0xD2, 0x41, 0x7B, 0xA0, 0x11, 0x31, 0xC2, 0x27, 0x90,
+ 0x20, 0xF6, 0x60, 0xFF, 0x96, 0x5C, 0xB1, 0xAB, 0x9E, 0x9C, 0x52, 0x1B,
+ 0x5F, 0x93, 0x0A, 0xEF, 0x91, 0x85, 0x49, 0xEE, 0x2D, 0x4F, 0x8F, 0x3B,
+ 0x47, 0x87, 0x6D, 0x46, 0xD6, 0x3E, 0x69, 0x64, 0x2A, 0xCE, 0xCB, 0x2F,
+ 0xFC, 0x97, 0x05, 0x7A, 0xAC, 0x7F, 0xD5, 0x1A, 0x4B, 0x0E, 0xA7, 0x5A,
+ 0x28, 0x14, 0x3F, 0x29, 0x88, 0x3C, 0x4C, 0x02, 0xB8, 0xDA, 0xB0, 0x17,
+ 0x55, 0x1F, 0x8A, 0x7D, 0x57, 0xC7, 0x8D, 0x74, 0xB7, 0xC4, 0x9F, 0x72,
+ 0x7E, 0x15, 0x22, 0x12, 0x58, 0x07, 0x99, 0x34, 0x6E, 0x50, 0xDE, 0x68,
+ 0x65, 0xBC, 0xDB, 0xF8, 0xC8, 0xA8, 0x2B, 0x40, 0xDC, 0xFE, 0x32, 0xA4,
+ 0xCA, 0x10, 0x21, 0xF0, 0xD3, 0x5D, 0x0F, 0x00, 0x6F, 0x9D, 0x36, 0x42,
+ 0x4A, 0x5E, 0xC1, 0xE0
+};
+
+static const byte q1[256] = {
+ 0x75, 0xF3, 0xC6, 0xF4, 0xDB, 0x7B, 0xFB, 0xC8, 0x4A, 0xD3, 0xE6, 0x6B,
+ 0x45, 0x7D, 0xE8, 0x4B, 0xD6, 0x32, 0xD8, 0xFD, 0x37, 0x71, 0xF1, 0xE1,
+ 0x30, 0x0F, 0xF8, 0x1B, 0x87, 0xFA, 0x06, 0x3F, 0x5E, 0xBA, 0xAE, 0x5B,
+ 0x8A, 0x00, 0xBC, 0x9D, 0x6D, 0xC1, 0xB1, 0x0E, 0x80, 0x5D, 0xD2, 0xD5,
+ 0xA0, 0x84, 0x07, 0x14, 0xB5, 0x90, 0x2C, 0xA3, 0xB2, 0x73, 0x4C, 0x54,
+ 0x92, 0x74, 0x36, 0x51, 0x38, 0xB0, 0xBD, 0x5A, 0xFC, 0x60, 0x62, 0x96,
+ 0x6C, 0x42, 0xF7, 0x10, 0x7C, 0x28, 0x27, 0x8C, 0x13, 0x95, 0x9C, 0xC7,
+ 0x24, 0x46, 0x3B, 0x70, 0xCA, 0xE3, 0x85, 0xCB, 0x11, 0xD0, 0x93, 0xB8,
+ 0xA6, 0x83, 0x20, 0xFF, 0x9F, 0x77, 0xC3, 0xCC, 0x03, 0x6F, 0x08, 0xBF,
+ 0x40, 0xE7, 0x2B, 0xE2, 0x79, 0x0C, 0xAA, 0x82, 0x41, 0x3A, 0xEA, 0xB9,
+ 0xE4, 0x9A, 0xA4, 0x97, 0x7E, 0xDA, 0x7A, 0x17, 0x66, 0x94, 0xA1, 0x1D,
+ 0x3D, 0xF0, 0xDE, 0xB3, 0x0B, 0x72, 0xA7, 0x1C, 0xEF, 0xD1, 0x53, 0x3E,
+ 0x8F, 0x33, 0x26, 0x5F, 0xEC, 0x76, 0x2A, 0x49, 0x81, 0x88, 0xEE, 0x21,
+ 0xC4, 0x1A, 0xEB, 0xD9, 0xC5, 0x39, 0x99, 0xCD, 0xAD, 0x31, 0x8B, 0x01,
+ 0x18, 0x23, 0xDD, 0x1F, 0x4E, 0x2D, 0xF9, 0x48, 0x4F, 0xF2, 0x65, 0x8E,
+ 0x78, 0x5C, 0x58, 0x19, 0x8D, 0xE5, 0x98, 0x57, 0x67, 0x7F, 0x05, 0x64,
+ 0xAF, 0x63, 0xB6, 0xFE, 0xF5, 0xB7, 0x3C, 0xA5, 0xCE, 0xE9, 0x68, 0x44,
+ 0xE0, 0x4D, 0x43, 0x69, 0x29, 0x2E, 0xAC, 0x15, 0x59, 0xA8, 0x0A, 0x9E,
+ 0x6E, 0x47, 0xDF, 0x34, 0x35, 0x6A, 0xCF, 0xDC, 0x22, 0xC9, 0xC0, 0x9B,
+ 0x89, 0xD4, 0xED, 0xAB, 0x12, 0xA2, 0x0D, 0x52, 0xBB, 0x02, 0x2F, 0xA9,
+ 0xD7, 0x61, 0x1E, 0xB4, 0x50, 0x04, 0xF6, 0xC2, 0x16, 0x25, 0x86, 0x56,
+ 0x55, 0x09, 0xBE, 0x91
+};
+
+/* These MDS tables are actually tables of MDS composed with q0 and q1,
+ * because it is only ever used that way and we can save some time by
+ * precomputing. Of course the main saving comes from precomputing the
+ * GF(2^8) multiplication involved in the MDS matrix multiply; by looking
+ * things up in these tables we reduce the matrix multiply to four lookups
+ * and three XORs. Semi-formally, the definition of these tables is:
+ * mds[0][i] = MDS (q1[i] 0 0 0)^T mds[1][i] = MDS (0 q0[i] 0 0)^T
+ * mds[2][i] = MDS (0 0 q1[i] 0)^T mds[3][i] = MDS (0 0 0 q0[i])^T
+ * where ^T means "transpose", the matrix multiply is performed in GF(2^8)
+ * represented as GF(2)[x]/v(x) where v(x)=x^8+x^6+x^5+x^3+1 as described
+ * by Schneier et al, and I'm casually glossing over the byte/word
+ * conversion issues. */
+
+static const u32 mds[4][256] = {
+ {0xBCBC3275, 0xECEC21F3, 0x202043C6, 0xB3B3C9F4, 0xDADA03DB, 0x02028B7B,
+ 0xE2E22BFB, 0x9E9EFAC8, 0xC9C9EC4A, 0xD4D409D3, 0x18186BE6, 0x1E1E9F6B,
+ 0x98980E45, 0xB2B2387D, 0xA6A6D2E8, 0x2626B74B, 0x3C3C57D6, 0x93938A32,
+ 0x8282EED8, 0x525298FD, 0x7B7BD437, 0xBBBB3771, 0x5B5B97F1, 0x474783E1,
+ 0x24243C30, 0x5151E20F, 0xBABAC6F8, 0x4A4AF31B, 0xBFBF4887, 0x0D0D70FA,
+ 0xB0B0B306, 0x7575DE3F, 0xD2D2FD5E, 0x7D7D20BA, 0x666631AE, 0x3A3AA35B,
+ 0x59591C8A, 0x00000000, 0xCDCD93BC, 0x1A1AE09D, 0xAEAE2C6D, 0x7F7FABC1,
+ 0x2B2BC7B1, 0xBEBEB90E, 0xE0E0A080, 0x8A8A105D, 0x3B3B52D2, 0x6464BAD5,
+ 0xD8D888A0, 0xE7E7A584, 0x5F5FE807, 0x1B1B1114, 0x2C2CC2B5, 0xFCFCB490,
+ 0x3131272C, 0x808065A3, 0x73732AB2, 0x0C0C8173, 0x79795F4C, 0x6B6B4154,
+ 0x4B4B0292, 0x53536974, 0x94948F36, 0x83831F51, 0x2A2A3638, 0xC4C49CB0,
+ 0x2222C8BD, 0xD5D5F85A, 0xBDBDC3FC, 0x48487860, 0xFFFFCE62, 0x4C4C0796,
+ 0x4141776C, 0xC7C7E642, 0xEBEB24F7, 0x1C1C1410, 0x5D5D637C, 0x36362228,
+ 0x6767C027, 0xE9E9AF8C, 0x4444F913, 0x1414EA95, 0xF5F5BB9C, 0xCFCF18C7,
+ 0x3F3F2D24, 0xC0C0E346, 0x7272DB3B, 0x54546C70, 0x29294CCA, 0xF0F035E3,
+ 0x0808FE85, 0xC6C617CB, 0xF3F34F11, 0x8C8CE4D0, 0xA4A45993, 0xCACA96B8,
+ 0x68683BA6, 0xB8B84D83, 0x38382820, 0xE5E52EFF, 0xADAD569F, 0x0B0B8477,
+ 0xC8C81DC3, 0x9999FFCC, 0x5858ED03, 0x19199A6F, 0x0E0E0A08, 0x95957EBF,
+ 0x70705040, 0xF7F730E7, 0x6E6ECF2B, 0x1F1F6EE2, 0xB5B53D79, 0x09090F0C,
+ 0x616134AA, 0x57571682, 0x9F9F0B41, 0x9D9D803A, 0x111164EA, 0x2525CDB9,
+ 0xAFAFDDE4, 0x4545089A, 0xDFDF8DA4, 0xA3A35C97, 0xEAEAD57E, 0x353558DA,
+ 0xEDEDD07A, 0x4343FC17, 0xF8F8CB66, 0xFBFBB194, 0x3737D3A1, 0xFAFA401D,
+ 0xC2C2683D, 0xB4B4CCF0, 0x32325DDE, 0x9C9C71B3, 0x5656E70B, 0xE3E3DA72,
+ 0x878760A7, 0x15151B1C, 0xF9F93AEF, 0x6363BFD1, 0x3434A953, 0x9A9A853E,
+ 0xB1B1428F, 0x7C7CD133, 0x88889B26, 0x3D3DA65F, 0xA1A1D7EC, 0xE4E4DF76,
+ 0x8181942A, 0x91910149, 0x0F0FFB81, 0xEEEEAA88, 0x161661EE, 0xD7D77321,
+ 0x9797F5C4, 0xA5A5A81A, 0xFEFE3FEB, 0x6D6DB5D9, 0x7878AEC5, 0xC5C56D39,
+ 0x1D1DE599, 0x7676A4CD, 0x3E3EDCAD, 0xCBCB6731, 0xB6B6478B, 0xEFEF5B01,
+ 0x12121E18, 0x6060C523, 0x6A6AB0DD, 0x4D4DF61F, 0xCECEE94E, 0xDEDE7C2D,
+ 0x55559DF9, 0x7E7E5A48, 0x2121B24F, 0x03037AF2, 0xA0A02665, 0x5E5E198E,
+ 0x5A5A6678, 0x65654B5C, 0x62624E58, 0xFDFD4519, 0x0606F48D, 0x404086E5,
+ 0xF2F2BE98, 0x3333AC57, 0x17179067, 0x05058E7F, 0xE8E85E05, 0x4F4F7D64,
+ 0x89896AAF, 0x10109563, 0x74742FB6, 0x0A0A75FE, 0x5C5C92F5, 0x9B9B74B7,
+ 0x2D2D333C, 0x3030D6A5, 0x2E2E49CE, 0x494989E9, 0x46467268, 0x77775544,
+ 0xA8A8D8E0, 0x9696044D, 0x2828BD43, 0xA9A92969, 0xD9D97929, 0x8686912E,
+ 0xD1D187AC, 0xF4F44A15, 0x8D8D1559, 0xD6D682A8, 0xB9B9BC0A, 0x42420D9E,
+ 0xF6F6C16E, 0x2F2FB847, 0xDDDD06DF, 0x23233934, 0xCCCC6235, 0xF1F1C46A,
+ 0xC1C112CF, 0x8585EBDC, 0x8F8F9E22, 0x7171A1C9, 0x9090F0C0, 0xAAAA539B,
+ 0x0101F189, 0x8B8BE1D4, 0x4E4E8CED, 0x8E8E6FAB, 0xABABA212, 0x6F6F3EA2,
+ 0xE6E6540D, 0xDBDBF252, 0x92927BBB, 0xB7B7B602, 0x6969CA2F, 0x3939D9A9,
+ 0xD3D30CD7, 0xA7A72361, 0xA2A2AD1E, 0xC3C399B4, 0x6C6C4450, 0x07070504,
+ 0x04047FF6, 0x272746C2, 0xACACA716, 0xD0D07625, 0x50501386, 0xDCDCF756,
+ 0x84841A55, 0xE1E15109, 0x7A7A25BE, 0x1313EF91},
+
+ {0xA9D93939, 0x67901717, 0xB3719C9C, 0xE8D2A6A6, 0x04050707, 0xFD985252,
+ 0xA3658080, 0x76DFE4E4, 0x9A084545, 0x92024B4B, 0x80A0E0E0, 0x78665A5A,
+ 0xE4DDAFAF, 0xDDB06A6A, 0xD1BF6363, 0x38362A2A, 0x0D54E6E6, 0xC6432020,
+ 0x3562CCCC, 0x98BEF2F2, 0x181E1212, 0xF724EBEB, 0xECD7A1A1, 0x6C774141,
+ 0x43BD2828, 0x7532BCBC, 0x37D47B7B, 0x269B8888, 0xFA700D0D, 0x13F94444,
+ 0x94B1FBFB, 0x485A7E7E, 0xF27A0303, 0xD0E48C8C, 0x8B47B6B6, 0x303C2424,
+ 0x84A5E7E7, 0x54416B6B, 0xDF06DDDD, 0x23C56060, 0x1945FDFD, 0x5BA33A3A,
+ 0x3D68C2C2, 0x59158D8D, 0xF321ECEC, 0xAE316666, 0xA23E6F6F, 0x82165757,
+ 0x63951010, 0x015BEFEF, 0x834DB8B8, 0x2E918686, 0xD9B56D6D, 0x511F8383,
+ 0x9B53AAAA, 0x7C635D5D, 0xA63B6868, 0xEB3FFEFE, 0xA5D63030, 0xBE257A7A,
+ 0x16A7ACAC, 0x0C0F0909, 0xE335F0F0, 0x6123A7A7, 0xC0F09090, 0x8CAFE9E9,
+ 0x3A809D9D, 0xF5925C5C, 0x73810C0C, 0x2C273131, 0x2576D0D0, 0x0BE75656,
+ 0xBB7B9292, 0x4EE9CECE, 0x89F10101, 0x6B9F1E1E, 0x53A93434, 0x6AC4F1F1,
+ 0xB499C3C3, 0xF1975B5B, 0xE1834747, 0xE66B1818, 0xBDC82222, 0x450E9898,
+ 0xE26E1F1F, 0xF4C9B3B3, 0xB62F7474, 0x66CBF8F8, 0xCCFF9999, 0x95EA1414,
+ 0x03ED5858, 0x56F7DCDC, 0xD4E18B8B, 0x1C1B1515, 0x1EADA2A2, 0xD70CD3D3,
+ 0xFB2BE2E2, 0xC31DC8C8, 0x8E195E5E, 0xB5C22C2C, 0xE9894949, 0xCF12C1C1,
+ 0xBF7E9595, 0xBA207D7D, 0xEA641111, 0x77840B0B, 0x396DC5C5, 0xAF6A8989,
+ 0x33D17C7C, 0xC9A17171, 0x62CEFFFF, 0x7137BBBB, 0x81FB0F0F, 0x793DB5B5,
+ 0x0951E1E1, 0xADDC3E3E, 0x242D3F3F, 0xCDA47676, 0xF99D5555, 0xD8EE8282,
+ 0xE5864040, 0xC5AE7878, 0xB9CD2525, 0x4D049696, 0x44557777, 0x080A0E0E,
+ 0x86135050, 0xE730F7F7, 0xA1D33737, 0x1D40FAFA, 0xAA346161, 0xED8C4E4E,
+ 0x06B3B0B0, 0x706C5454, 0xB22A7373, 0xD2523B3B, 0x410B9F9F, 0x7B8B0202,
+ 0xA088D8D8, 0x114FF3F3, 0x3167CBCB, 0xC2462727, 0x27C06767, 0x90B4FCFC,
+ 0x20283838, 0xF67F0404, 0x60784848, 0xFF2EE5E5, 0x96074C4C, 0x5C4B6565,
+ 0xB1C72B2B, 0xAB6F8E8E, 0x9E0D4242, 0x9CBBF5F5, 0x52F2DBDB, 0x1BF34A4A,
+ 0x5FA63D3D, 0x9359A4A4, 0x0ABCB9B9, 0xEF3AF9F9, 0x91EF1313, 0x85FE0808,
+ 0x49019191, 0xEE611616, 0x2D7CDEDE, 0x4FB22121, 0x8F42B1B1, 0x3BDB7272,
+ 0x47B82F2F, 0x8748BFBF, 0x6D2CAEAE, 0x46E3C0C0, 0xD6573C3C, 0x3E859A9A,
+ 0x6929A9A9, 0x647D4F4F, 0x2A948181, 0xCE492E2E, 0xCB17C6C6, 0x2FCA6969,
+ 0xFCC3BDBD, 0x975CA3A3, 0x055EE8E8, 0x7AD0EDED, 0xAC87D1D1, 0x7F8E0505,
+ 0xD5BA6464, 0x1AA8A5A5, 0x4BB72626, 0x0EB9BEBE, 0xA7608787, 0x5AF8D5D5,
+ 0x28223636, 0x14111B1B, 0x3FDE7575, 0x2979D9D9, 0x88AAEEEE, 0x3C332D2D,
+ 0x4C5F7979, 0x02B6B7B7, 0xB896CACA, 0xDA583535, 0xB09CC4C4, 0x17FC4343,
+ 0x551A8484, 0x1FF64D4D, 0x8A1C5959, 0x7D38B2B2, 0x57AC3333, 0xC718CFCF,
+ 0x8DF40606, 0x74695353, 0xB7749B9B, 0xC4F59797, 0x9F56ADAD, 0x72DAE3E3,
+ 0x7ED5EAEA, 0x154AF4F4, 0x229E8F8F, 0x12A2ABAB, 0x584E6262, 0x07E85F5F,
+ 0x99E51D1D, 0x34392323, 0x6EC1F6F6, 0x50446C6C, 0xDE5D3232, 0x68724646,
+ 0x6526A0A0, 0xBC93CDCD, 0xDB03DADA, 0xF8C6BABA, 0xC8FA9E9E, 0xA882D6D6,
+ 0x2BCF6E6E, 0x40507070, 0xDCEB8585, 0xFE750A0A, 0x328A9393, 0xA48DDFDF,
+ 0xCA4C2929, 0x10141C1C, 0x2173D7D7, 0xF0CCB4B4, 0xD309D4D4, 0x5D108A8A,
+ 0x0FE25151, 0x00000000, 0x6F9A1919, 0x9DE01A1A, 0x368F9494, 0x42E6C7C7,
+ 0x4AECC9C9, 0x5EFDD2D2, 0xC1AB7F7F, 0xE0D8A8A8},
+
+ {0xBC75BC32, 0xECF3EC21, 0x20C62043, 0xB3F4B3C9, 0xDADBDA03, 0x027B028B,
+ 0xE2FBE22B, 0x9EC89EFA, 0xC94AC9EC, 0xD4D3D409, 0x18E6186B, 0x1E6B1E9F,
+ 0x9845980E, 0xB27DB238, 0xA6E8A6D2, 0x264B26B7, 0x3CD63C57, 0x9332938A,
+ 0x82D882EE, 0x52FD5298, 0x7B377BD4, 0xBB71BB37, 0x5BF15B97, 0x47E14783,
+ 0x2430243C, 0x510F51E2, 0xBAF8BAC6, 0x4A1B4AF3, 0xBF87BF48, 0x0DFA0D70,
+ 0xB006B0B3, 0x753F75DE, 0xD25ED2FD, 0x7DBA7D20, 0x66AE6631, 0x3A5B3AA3,
+ 0x598A591C, 0x00000000, 0xCDBCCD93, 0x1A9D1AE0, 0xAE6DAE2C, 0x7FC17FAB,
+ 0x2BB12BC7, 0xBE0EBEB9, 0xE080E0A0, 0x8A5D8A10, 0x3BD23B52, 0x64D564BA,
+ 0xD8A0D888, 0xE784E7A5, 0x5F075FE8, 0x1B141B11, 0x2CB52CC2, 0xFC90FCB4,
+ 0x312C3127, 0x80A38065, 0x73B2732A, 0x0C730C81, 0x794C795F, 0x6B546B41,
+ 0x4B924B02, 0x53745369, 0x9436948F, 0x8351831F, 0x2A382A36, 0xC4B0C49C,
+ 0x22BD22C8, 0xD55AD5F8, 0xBDFCBDC3, 0x48604878, 0xFF62FFCE, 0x4C964C07,
+ 0x416C4177, 0xC742C7E6, 0xEBF7EB24, 0x1C101C14, 0x5D7C5D63, 0x36283622,
+ 0x672767C0, 0xE98CE9AF, 0x441344F9, 0x149514EA, 0xF59CF5BB, 0xCFC7CF18,
+ 0x3F243F2D, 0xC046C0E3, 0x723B72DB, 0x5470546C, 0x29CA294C, 0xF0E3F035,
+ 0x088508FE, 0xC6CBC617, 0xF311F34F, 0x8CD08CE4, 0xA493A459, 0xCAB8CA96,
+ 0x68A6683B, 0xB883B84D, 0x38203828, 0xE5FFE52E, 0xAD9FAD56, 0x0B770B84,
+ 0xC8C3C81D, 0x99CC99FF, 0x580358ED, 0x196F199A, 0x0E080E0A, 0x95BF957E,
+ 0x70407050, 0xF7E7F730, 0x6E2B6ECF, 0x1FE21F6E, 0xB579B53D, 0x090C090F,
+ 0x61AA6134, 0x57825716, 0x9F419F0B, 0x9D3A9D80, 0x11EA1164, 0x25B925CD,
+ 0xAFE4AFDD, 0x459A4508, 0xDFA4DF8D, 0xA397A35C, 0xEA7EEAD5, 0x35DA3558,
+ 0xED7AEDD0, 0x431743FC, 0xF866F8CB, 0xFB94FBB1, 0x37A137D3, 0xFA1DFA40,
+ 0xC23DC268, 0xB4F0B4CC, 0x32DE325D, 0x9CB39C71, 0x560B56E7, 0xE372E3DA,
+ 0x87A78760, 0x151C151B, 0xF9EFF93A, 0x63D163BF, 0x345334A9, 0x9A3E9A85,
+ 0xB18FB142, 0x7C337CD1, 0x8826889B, 0x3D5F3DA6, 0xA1ECA1D7, 0xE476E4DF,
+ 0x812A8194, 0x91499101, 0x0F810FFB, 0xEE88EEAA, 0x16EE1661, 0xD721D773,
+ 0x97C497F5, 0xA51AA5A8, 0xFEEBFE3F, 0x6DD96DB5, 0x78C578AE, 0xC539C56D,
+ 0x1D991DE5, 0x76CD76A4, 0x3EAD3EDC, 0xCB31CB67, 0xB68BB647, 0xEF01EF5B,
+ 0x1218121E, 0x602360C5, 0x6ADD6AB0, 0x4D1F4DF6, 0xCE4ECEE9, 0xDE2DDE7C,
+ 0x55F9559D, 0x7E487E5A, 0x214F21B2, 0x03F2037A, 0xA065A026, 0x5E8E5E19,
+ 0x5A785A66, 0x655C654B, 0x6258624E, 0xFD19FD45, 0x068D06F4, 0x40E54086,
+ 0xF298F2BE, 0x335733AC, 0x17671790, 0x057F058E, 0xE805E85E, 0x4F644F7D,
+ 0x89AF896A, 0x10631095, 0x74B6742F, 0x0AFE0A75, 0x5CF55C92, 0x9BB79B74,
+ 0x2D3C2D33, 0x30A530D6, 0x2ECE2E49, 0x49E94989, 0x46684672, 0x77447755,
+ 0xA8E0A8D8, 0x964D9604, 0x284328BD, 0xA969A929, 0xD929D979, 0x862E8691,
+ 0xD1ACD187, 0xF415F44A, 0x8D598D15, 0xD6A8D682, 0xB90AB9BC, 0x429E420D,
+ 0xF66EF6C1, 0x2F472FB8, 0xDDDFDD06, 0x23342339, 0xCC35CC62, 0xF16AF1C4,
+ 0xC1CFC112, 0x85DC85EB, 0x8F228F9E, 0x71C971A1, 0x90C090F0, 0xAA9BAA53,
+ 0x018901F1, 0x8BD48BE1, 0x4EED4E8C, 0x8EAB8E6F, 0xAB12ABA2, 0x6FA26F3E,
+ 0xE60DE654, 0xDB52DBF2, 0x92BB927B, 0xB702B7B6, 0x692F69CA, 0x39A939D9,
+ 0xD3D7D30C, 0xA761A723, 0xA21EA2AD, 0xC3B4C399, 0x6C506C44, 0x07040705,
+ 0x04F6047F, 0x27C22746, 0xAC16ACA7, 0xD025D076, 0x50865013, 0xDC56DCF7,
+ 0x8455841A, 0xE109E151, 0x7ABE7A25, 0x139113EF},
+
+ {0xD939A9D9, 0x90176790, 0x719CB371, 0xD2A6E8D2, 0x05070405, 0x9852FD98,
+ 0x6580A365, 0xDFE476DF, 0x08459A08, 0x024B9202, 0xA0E080A0, 0x665A7866,
+ 0xDDAFE4DD, 0xB06ADDB0, 0xBF63D1BF, 0x362A3836, 0x54E60D54, 0x4320C643,
+ 0x62CC3562, 0xBEF298BE, 0x1E12181E, 0x24EBF724, 0xD7A1ECD7, 0x77416C77,
+ 0xBD2843BD, 0x32BC7532, 0xD47B37D4, 0x9B88269B, 0x700DFA70, 0xF94413F9,
+ 0xB1FB94B1, 0x5A7E485A, 0x7A03F27A, 0xE48CD0E4, 0x47B68B47, 0x3C24303C,
+ 0xA5E784A5, 0x416B5441, 0x06DDDF06, 0xC56023C5, 0x45FD1945, 0xA33A5BA3,
+ 0x68C23D68, 0x158D5915, 0x21ECF321, 0x3166AE31, 0x3E6FA23E, 0x16578216,
+ 0x95106395, 0x5BEF015B, 0x4DB8834D, 0x91862E91, 0xB56DD9B5, 0x1F83511F,
+ 0x53AA9B53, 0x635D7C63, 0x3B68A63B, 0x3FFEEB3F, 0xD630A5D6, 0x257ABE25,
+ 0xA7AC16A7, 0x0F090C0F, 0x35F0E335, 0x23A76123, 0xF090C0F0, 0xAFE98CAF,
+ 0x809D3A80, 0x925CF592, 0x810C7381, 0x27312C27, 0x76D02576, 0xE7560BE7,
+ 0x7B92BB7B, 0xE9CE4EE9, 0xF10189F1, 0x9F1E6B9F, 0xA93453A9, 0xC4F16AC4,
+ 0x99C3B499, 0x975BF197, 0x8347E183, 0x6B18E66B, 0xC822BDC8, 0x0E98450E,
+ 0x6E1FE26E, 0xC9B3F4C9, 0x2F74B62F, 0xCBF866CB, 0xFF99CCFF, 0xEA1495EA,
+ 0xED5803ED, 0xF7DC56F7, 0xE18BD4E1, 0x1B151C1B, 0xADA21EAD, 0x0CD3D70C,
+ 0x2BE2FB2B, 0x1DC8C31D, 0x195E8E19, 0xC22CB5C2, 0x8949E989, 0x12C1CF12,
+ 0x7E95BF7E, 0x207DBA20, 0x6411EA64, 0x840B7784, 0x6DC5396D, 0x6A89AF6A,
+ 0xD17C33D1, 0xA171C9A1, 0xCEFF62CE, 0x37BB7137, 0xFB0F81FB, 0x3DB5793D,
+ 0x51E10951, 0xDC3EADDC, 0x2D3F242D, 0xA476CDA4, 0x9D55F99D, 0xEE82D8EE,
+ 0x8640E586, 0xAE78C5AE, 0xCD25B9CD, 0x04964D04, 0x55774455, 0x0A0E080A,
+ 0x13508613, 0x30F7E730, 0xD337A1D3, 0x40FA1D40, 0x3461AA34, 0x8C4EED8C,
+ 0xB3B006B3, 0x6C54706C, 0x2A73B22A, 0x523BD252, 0x0B9F410B, 0x8B027B8B,
+ 0x88D8A088, 0x4FF3114F, 0x67CB3167, 0x4627C246, 0xC06727C0, 0xB4FC90B4,
+ 0x28382028, 0x7F04F67F, 0x78486078, 0x2EE5FF2E, 0x074C9607, 0x4B655C4B,
+ 0xC72BB1C7, 0x6F8EAB6F, 0x0D429E0D, 0xBBF59CBB, 0xF2DB52F2, 0xF34A1BF3,
+ 0xA63D5FA6, 0x59A49359, 0xBCB90ABC, 0x3AF9EF3A, 0xEF1391EF, 0xFE0885FE,
+ 0x01914901, 0x6116EE61, 0x7CDE2D7C, 0xB2214FB2, 0x42B18F42, 0xDB723BDB,
+ 0xB82F47B8, 0x48BF8748, 0x2CAE6D2C, 0xE3C046E3, 0x573CD657, 0x859A3E85,
+ 0x29A96929, 0x7D4F647D, 0x94812A94, 0x492ECE49, 0x17C6CB17, 0xCA692FCA,
+ 0xC3BDFCC3, 0x5CA3975C, 0x5EE8055E, 0xD0ED7AD0, 0x87D1AC87, 0x8E057F8E,
+ 0xBA64D5BA, 0xA8A51AA8, 0xB7264BB7, 0xB9BE0EB9, 0x6087A760, 0xF8D55AF8,
+ 0x22362822, 0x111B1411, 0xDE753FDE, 0x79D92979, 0xAAEE88AA, 0x332D3C33,
+ 0x5F794C5F, 0xB6B702B6, 0x96CAB896, 0x5835DA58, 0x9CC4B09C, 0xFC4317FC,
+ 0x1A84551A, 0xF64D1FF6, 0x1C598A1C, 0x38B27D38, 0xAC3357AC, 0x18CFC718,
+ 0xF4068DF4, 0x69537469, 0x749BB774, 0xF597C4F5, 0x56AD9F56, 0xDAE372DA,
+ 0xD5EA7ED5, 0x4AF4154A, 0x9E8F229E, 0xA2AB12A2, 0x4E62584E, 0xE85F07E8,
+ 0xE51D99E5, 0x39233439, 0xC1F66EC1, 0x446C5044, 0x5D32DE5D, 0x72466872,
+ 0x26A06526, 0x93CDBC93, 0x03DADB03, 0xC6BAF8C6, 0xFA9EC8FA, 0x82D6A882,
+ 0xCF6E2BCF, 0x50704050, 0xEB85DCEB, 0x750AFE75, 0x8A93328A, 0x8DDFA48D,
+ 0x4C29CA4C, 0x141C1014, 0x73D72173, 0xCCB4F0CC, 0x09D4D309, 0x108A5D10,
+ 0xE2510FE2, 0x00000000, 0x9A196F9A, 0xE01A9DE0, 0x8F94368F, 0xE6C742E6,
+ 0xECC94AEC, 0xFDD25EFD, 0xAB7FC1AB, 0xD8A8E0D8}
+};
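+
+/* Put another way: with the composed tables above, the whole matrix product
+ * MDS * (q1[y0] q0[y1] q1[y2] q0[y3])^T collapses to
+ *
+ * mds[0][y0] ^ mds[1][y1] ^ mds[2][y2] ^ mds[3][y3]
+ *
+ * i.e. four table lookups and three XORs, which is exactly the shape of the
+ * CALC_SB_2 and CALC_K_2 macros further below. */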
+
+/* The exp_to_poly and poly_to_exp tables are used to perform efficient
+ * operations in GF(2^8) represented as GF(2)[x]/w(x) where
+ * w(x)=x^8+x^6+x^3+x^2+1. We care about doing that because it's part of the
+ * definition of the RS matrix in the key schedule. Elements of that field
+ * are polynomials of degree not greater than 7 and all coefficients 0 or 1,
+ * which can be represented naturally by bytes (just substitute x=2). In that
+ * form, GF(2^8) addition is the same as bitwise XOR, but GF(2^8)
+ * multiplication is inefficient without hardware support. To multiply
+ * faster, I make use of the fact x is a generator for the nonzero elements,
+ * so that every element p of GF(2)[x]/w(x) is either 0 or equal to (x)^n for
+ * some n in 0..254. Note that that caret is exponentiation in GF(2^8),
+ * *not* polynomial notation. So if I want to compute pq where p and q are
+ * in GF(2^8), I can just say:
+ * 1. if p=0 or q=0 then pq=0
+ * 2. otherwise, find m and n such that p=x^m and q=x^n
+ * 3. pq=(x^m)(x^n)=x^(m+n), so add m and n and find pq
+ * The translations in steps 2 and 3 are looked up in the tables
+ * poly_to_exp (for step 2) and exp_to_poly (for step 3). To see this
+ * in action, look at the CALC_S macro. As additional wrinkles, note that
+ * one of my operands is always a constant, so the poly_to_exp lookup on it
+ * is done in advance; I included the original values in the comments so
+ * readers can have some chance of recognizing that this *is* the RS matrix
+ * from the Twofish paper. I've only included the table entries I actually
+ * need; I never do a lookup on a variable input of zero and the biggest
+ * exponents I'll ever see are 254 (variable) and 237 (constant), so they'll
+ * never sum to more than 491. I'm repeating part of the exp_to_poly table
+ * so that I don't have to do mod-255 reduction in the exponent arithmetic.
+ * Since I know my constant operands are never zero, I only have to worry
+ * about zero values in the variable operand, and I do it with a simple
+ * conditional branch. I know conditionals are expensive, but I couldn't
+ * see a non-horrible way of avoiding them, and I did manage to group the
+ * statements so that each if covers four group multiplications. */
+
+static const u16 poly_to_exp[256] = {
+ 492,
+ 0x00, 0x01, 0x17, 0x02, 0x2E, 0x18, 0x53, 0x03, 0x6A, 0x2F, 0x93, 0x19,
+ 0x34, 0x54, 0x45, 0x04, 0x5C, 0x6B, 0xB6, 0x30, 0xA6, 0x94, 0x4B, 0x1A,
+ 0x8C, 0x35, 0x81, 0x55, 0xAA, 0x46, 0x0D, 0x05, 0x24, 0x5D, 0x87, 0x6C,
+ 0x9B, 0xB7, 0xC1, 0x31, 0x2B, 0xA7, 0xA3, 0x95, 0x98, 0x4C, 0xCA, 0x1B,
+ 0xE6, 0x8D, 0x73, 0x36, 0xCD, 0x82, 0x12, 0x56, 0x62, 0xAB, 0xF0, 0x47,
+ 0x4F, 0x0E, 0xBD, 0x06, 0xD4, 0x25, 0xD2, 0x5E, 0x27, 0x88, 0x66, 0x6D,
+ 0xD6, 0x9C, 0x79, 0xB8, 0x08, 0xC2, 0xDF, 0x32, 0x68, 0x2C, 0xFD, 0xA8,
+ 0x8A, 0xA4, 0x5A, 0x96, 0x29, 0x99, 0x22, 0x4D, 0x60, 0xCB, 0xE4, 0x1C,
+ 0x7B, 0xE7, 0x3B, 0x8E, 0x9E, 0x74, 0xF4, 0x37, 0xD8, 0xCE, 0xF9, 0x83,
+ 0x6F, 0x13, 0xB2, 0x57, 0xE1, 0x63, 0xDC, 0xAC, 0xC4, 0xF1, 0xAF, 0x48,
+ 0x0A, 0x50, 0x42, 0x0F, 0xBA, 0xBE, 0xC7, 0x07, 0xDE, 0xD5, 0x78, 0x26,
+ 0x65, 0xD3, 0xD1, 0x5F, 0xE3, 0x28, 0x21, 0x89, 0x59, 0x67, 0xFC, 0x6E,
+ 0xB1, 0xD7, 0xF8, 0x9D, 0xF3, 0x7A, 0x3A, 0xB9, 0xC6, 0x09, 0x41, 0xC3,
+ 0xAE, 0xE0, 0xDB, 0x33, 0x44, 0x69, 0x92, 0x2D, 0x52, 0xFE, 0x16, 0xA9,
+ 0x0C, 0x8B, 0x80, 0xA5, 0x4A, 0x5B, 0xB5, 0x97, 0xC9, 0x2A, 0xA2, 0x9A,
+ 0xC0, 0x23, 0x86, 0x4E, 0xBC, 0x61, 0xEF, 0xCC, 0x11, 0xE5, 0x72, 0x1D,
+ 0x3D, 0x7C, 0xEB, 0xE8, 0xE9, 0x3C, 0xEA, 0x8F, 0x7D, 0x9F, 0xEC, 0x75,
+ 0x1E, 0xF5, 0x3E, 0x38, 0xF6, 0xD9, 0x3F, 0xCF, 0x76, 0xFA, 0x1F, 0x84,
+ 0xA0, 0x70, 0xED, 0x14, 0x90, 0xB3, 0x7E, 0x58, 0xFB, 0xE2, 0x20, 0x64,
+ 0xD0, 0xDD, 0x77, 0xAD, 0xDA, 0xC5, 0x40, 0xF2, 0x39, 0xB0, 0xF7, 0x49,
+ 0xB4, 0x0B, 0x7F, 0x51, 0x15, 0x43, 0x91, 0x10, 0x71, 0xBB, 0xEE, 0xBF,
+ 0x85, 0xC8, 0xA1
+};
+
+static const byte exp_to_poly[492 + 256] = {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x4D, 0x9A, 0x79, 0xF2,
+ 0xA9, 0x1F, 0x3E, 0x7C, 0xF8, 0xBD, 0x37, 0x6E, 0xDC, 0xF5, 0xA7, 0x03,
+ 0x06, 0x0C, 0x18, 0x30, 0x60, 0xC0, 0xCD, 0xD7, 0xE3, 0x8B, 0x5B, 0xB6,
+ 0x21, 0x42, 0x84, 0x45, 0x8A, 0x59, 0xB2, 0x29, 0x52, 0xA4, 0x05, 0x0A,
+ 0x14, 0x28, 0x50, 0xA0, 0x0D, 0x1A, 0x34, 0x68, 0xD0, 0xED, 0x97, 0x63,
+ 0xC6, 0xC1, 0xCF, 0xD3, 0xEB, 0x9B, 0x7B, 0xF6, 0xA1, 0x0F, 0x1E, 0x3C,
+ 0x78, 0xF0, 0xAD, 0x17, 0x2E, 0x5C, 0xB8, 0x3D, 0x7A, 0xF4, 0xA5, 0x07,
+ 0x0E, 0x1C, 0x38, 0x70, 0xE0, 0x8D, 0x57, 0xAE, 0x11, 0x22, 0x44, 0x88,
+ 0x5D, 0xBA, 0x39, 0x72, 0xE4, 0x85, 0x47, 0x8E, 0x51, 0xA2, 0x09, 0x12,
+ 0x24, 0x48, 0x90, 0x6D, 0xDA, 0xF9, 0xBF, 0x33, 0x66, 0xCC, 0xD5, 0xE7,
+ 0x83, 0x4B, 0x96, 0x61, 0xC2, 0xC9, 0xDF, 0xF3, 0xAB, 0x1B, 0x36, 0x6C,
+ 0xD8, 0xFD, 0xB7, 0x23, 0x46, 0x8C, 0x55, 0xAA, 0x19, 0x32, 0x64, 0xC8,
+ 0xDD, 0xF7, 0xA3, 0x0B, 0x16, 0x2C, 0x58, 0xB0, 0x2D, 0x5A, 0xB4, 0x25,
+ 0x4A, 0x94, 0x65, 0xCA, 0xD9, 0xFF, 0xB3, 0x2B, 0x56, 0xAC, 0x15, 0x2A,
+ 0x54, 0xA8, 0x1D, 0x3A, 0x74, 0xE8, 0x9D, 0x77, 0xEE, 0x91, 0x6F, 0xDE,
+ 0xF1, 0xAF, 0x13, 0x26, 0x4C, 0x98, 0x7D, 0xFA, 0xB9, 0x3F, 0x7E, 0xFC,
+ 0xB5, 0x27, 0x4E, 0x9C, 0x75, 0xEA, 0x99, 0x7F, 0xFE, 0xB1, 0x2F, 0x5E,
+ 0xBC, 0x35, 0x6A, 0xD4, 0xE5, 0x87, 0x43, 0x86, 0x41, 0x82, 0x49, 0x92,
+ 0x69, 0xD2, 0xE9, 0x9F, 0x73, 0xE6, 0x81, 0x4F, 0x9E, 0x71, 0xE2, 0x89,
+ 0x5F, 0xBE, 0x31, 0x62, 0xC4, 0xC5, 0xC7, 0xC3, 0xCB, 0xDB, 0xFB, 0xBB,
+ 0x3B, 0x76, 0xEC, 0x95, 0x67, 0xCE, 0xD1, 0xEF, 0x93, 0x6B, 0xD6, 0xE1,
+ 0x8F, 0x53, 0xA6, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x4D,
+ 0x9A, 0x79, 0xF2, 0xA9, 0x1F, 0x3E, 0x7C, 0xF8, 0xBD, 0x37, 0x6E, 0xDC,
+ 0xF5, 0xA7, 0x03, 0x06, 0x0C, 0x18, 0x30, 0x60, 0xC0, 0xCD, 0xD7, 0xE3,
+ 0x8B, 0x5B, 0xB6, 0x21, 0x42, 0x84, 0x45, 0x8A, 0x59, 0xB2, 0x29, 0x52,
+ 0xA4, 0x05, 0x0A, 0x14, 0x28, 0x50, 0xA0, 0x0D, 0x1A, 0x34, 0x68, 0xD0,
+ 0xED, 0x97, 0x63, 0xC6, 0xC1, 0xCF, 0xD3, 0xEB, 0x9B, 0x7B, 0xF6, 0xA1,
+ 0x0F, 0x1E, 0x3C, 0x78, 0xF0, 0xAD, 0x17, 0x2E, 0x5C, 0xB8, 0x3D, 0x7A,
+ 0xF4, 0xA5, 0x07, 0x0E, 0x1C, 0x38, 0x70, 0xE0, 0x8D, 0x57, 0xAE, 0x11,
+ 0x22, 0x44, 0x88, 0x5D, 0xBA, 0x39, 0x72, 0xE4, 0x85, 0x47, 0x8E, 0x51,
+ 0xA2, 0x09, 0x12, 0x24, 0x48, 0x90, 0x6D, 0xDA, 0xF9, 0xBF, 0x33, 0x66,
+ 0xCC, 0xD5, 0xE7, 0x83, 0x4B, 0x96, 0x61, 0xC2, 0xC9, 0xDF, 0xF3, 0xAB,
+ 0x1B, 0x36, 0x6C, 0xD8, 0xFD, 0xB7, 0x23, 0x46, 0x8C, 0x55, 0xAA, 0x19,
+ 0x32, 0x64, 0xC8, 0xDD, 0xF7, 0xA3, 0x0B, 0x16, 0x2C, 0x58, 0xB0, 0x2D,
+ 0x5A, 0xB4, 0x25, 0x4A, 0x94, 0x65, 0xCA, 0xD9, 0xFF, 0xB3, 0x2B, 0x56,
+ 0xAC, 0x15, 0x2A, 0x54, 0xA8, 0x1D, 0x3A, 0x74, 0xE8, 0x9D, 0x77, 0xEE,
+ 0x91, 0x6F, 0xDE, 0xF1, 0xAF, 0x13, 0x26, 0x4C, 0x98, 0x7D, 0xFA, 0xB9,
+ 0x3F, 0x7E, 0xFC, 0xB5, 0x27, 0x4E, 0x9C, 0x75, 0xEA, 0x99, 0x7F, 0xFE,
+ 0xB1, 0x2F, 0x5E, 0xBC, 0x35, 0x6A, 0xD4, 0xE5, 0x87, 0x43, 0x86, 0x41,
+ 0x82, 0x49, 0x92, 0x69, 0xD2, 0xE9, 0x9F, 0x73, 0xE6, 0x81, 0x4F, 0x9E,
+ 0x71, 0xE2, 0x89, 0x5F, 0xBE, 0x31, 0x62, 0xC4, 0xC5, 0xC7, 0xC3, 0xCB,
+};
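+
+/* Illustrative sketch, not part of the upstream implementation: a
+ * general-purpose multiply in this representation, spelled out with the two
+ * tables above.  The repeated portion of exp_to_poly only covers the
+ * exponent sums that CALC_S can produce, so a generic helper has to reduce
+ * the sum mod 255 itself, and zero operands (which have no discrete log)
+ * must be handled explicitly. */
+static inline byte
+gf256_mul_example (byte p, byte q)
+{
+ unsigned int e;
+
+ if (p == 0 || q == 0)
+ return 0;
+
+ /* Add the discrete logs and reduce the exponent mod 255. */
+ e = poly_to_exp[p] + poly_to_exp[q];
+ return exp_to_poly[e < 255 ? e : e - 255];
+}
+
+/* Worked example: 0x02 * 0x03 = 0x06, since poly_to_exp[0x02] = 1,
+ * poly_to_exp[0x03] = 23 and exp_to_poly[24] = 0x06, i.e. x*(x+1) = x^2+x.
+ * Note also how CALC_S below gets by without the conditional branch the
+ * comment above mentions: poly_to_exp[0x00] is the dummy exponent 492 and
+ * every exp_to_poly entry from index 492 onwards is zero, so a zero key
+ * byte simply XORs zeros into the S vector. */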
+
+
+/* The table constants are indices of S-box entries, preprocessed through
+ * q0 and q1: entry 2*i is q0[i] and entry 2*i+1 is q1[i]. */
+static byte calc_sb_tbl[512] = {
+ 0xA9, 0x75, 0x67, 0xF3, 0xB3, 0xC6, 0xE8, 0xF4,
+ 0x04, 0xDB, 0xFD, 0x7B, 0xA3, 0xFB, 0x76, 0xC8,
+ 0x9A, 0x4A, 0x92, 0xD3, 0x80, 0xE6, 0x78, 0x6B,
+ 0xE4, 0x45, 0xDD, 0x7D, 0xD1, 0xE8, 0x38, 0x4B,
+ 0x0D, 0xD6, 0xC6, 0x32, 0x35, 0xD8, 0x98, 0xFD,
+ 0x18, 0x37, 0xF7, 0x71, 0xEC, 0xF1, 0x6C, 0xE1,
+ 0x43, 0x30, 0x75, 0x0F, 0x37, 0xF8, 0x26, 0x1B,
+ 0xFA, 0x87, 0x13, 0xFA, 0x94, 0x06, 0x48, 0x3F,
+ 0xF2, 0x5E, 0xD0, 0xBA, 0x8B, 0xAE, 0x30, 0x5B,
+ 0x84, 0x8A, 0x54, 0x00, 0xDF, 0xBC, 0x23, 0x9D,
+ 0x19, 0x6D, 0x5B, 0xC1, 0x3D, 0xB1, 0x59, 0x0E,
+ 0xF3, 0x80, 0xAE, 0x5D, 0xA2, 0xD2, 0x82, 0xD5,
+ 0x63, 0xA0, 0x01, 0x84, 0x83, 0x07, 0x2E, 0x14,
+ 0xD9, 0xB5, 0x51, 0x90, 0x9B, 0x2C, 0x7C, 0xA3,
+ 0xA6, 0xB2, 0xEB, 0x73, 0xA5, 0x4C, 0xBE, 0x54,
+ 0x16, 0x92, 0x0C, 0x74, 0xE3, 0x36, 0x61, 0x51,
+ 0xC0, 0x38, 0x8C, 0xB0, 0x3A, 0xBD, 0xF5, 0x5A,
+ 0x73, 0xFC, 0x2C, 0x60, 0x25, 0x62, 0x0B, 0x96,
+ 0xBB, 0x6C, 0x4E, 0x42, 0x89, 0xF7, 0x6B, 0x10,
+ 0x53, 0x7C, 0x6A, 0x28, 0xB4, 0x27, 0xF1, 0x8C,
+ 0xE1, 0x13, 0xE6, 0x95, 0xBD, 0x9C, 0x45, 0xC7,
+ 0xE2, 0x24, 0xF4, 0x46, 0xB6, 0x3B, 0x66, 0x70,
+ 0xCC, 0xCA, 0x95, 0xE3, 0x03, 0x85, 0x56, 0xCB,
+ 0xD4, 0x11, 0x1C, 0xD0, 0x1E, 0x93, 0xD7, 0xB8,
+ 0xFB, 0xA6, 0xC3, 0x83, 0x8E, 0x20, 0xB5, 0xFF,
+ 0xE9, 0x9F, 0xCF, 0x77, 0xBF, 0xC3, 0xBA, 0xCC,
+ 0xEA, 0x03, 0x77, 0x6F, 0x39, 0x08, 0xAF, 0xBF,
+ 0x33, 0x40, 0xC9, 0xE7, 0x62, 0x2B, 0x71, 0xE2,
+ 0x81, 0x79, 0x79, 0x0C, 0x09, 0xAA, 0xAD, 0x82,
+ 0x24, 0x41, 0xCD, 0x3A, 0xF9, 0xEA, 0xD8, 0xB9,
+ 0xE5, 0xE4, 0xC5, 0x9A, 0xB9, 0xA4, 0x4D, 0x97,
+ 0x44, 0x7E, 0x08, 0xDA, 0x86, 0x7A, 0xE7, 0x17,
+ 0xA1, 0x66, 0x1D, 0x94, 0xAA, 0xA1, 0xED, 0x1D,
+ 0x06, 0x3D, 0x70, 0xF0, 0xB2, 0xDE, 0xD2, 0xB3,
+ 0x41, 0x0B, 0x7B, 0x72, 0xA0, 0xA7, 0x11, 0x1C,
+ 0x31, 0xEF, 0xC2, 0xD1, 0x27, 0x53, 0x90, 0x3E,
+ 0x20, 0x8F, 0xF6, 0x33, 0x60, 0x26, 0xFF, 0x5F,
+ 0x96, 0xEC, 0x5C, 0x76, 0xB1, 0x2A, 0xAB, 0x49,
+ 0x9E, 0x81, 0x9C, 0x88, 0x52, 0xEE, 0x1B, 0x21,
+ 0x5F, 0xC4, 0x93, 0x1A, 0x0A, 0xEB, 0xEF, 0xD9,
+ 0x91, 0xC5, 0x85, 0x39, 0x49, 0x99, 0xEE, 0xCD,
+ 0x2D, 0xAD, 0x4F, 0x31, 0x8F, 0x8B, 0x3B, 0x01,
+ 0x47, 0x18, 0x87, 0x23, 0x6D, 0xDD, 0x46, 0x1F,
+ 0xD6, 0x4E, 0x3E, 0x2D, 0x69, 0xF9, 0x64, 0x48,
+ 0x2A, 0x4F, 0xCE, 0xF2, 0xCB, 0x65, 0x2F, 0x8E,
+ 0xFC, 0x78, 0x97, 0x5C, 0x05, 0x58, 0x7A, 0x19,
+ 0xAC, 0x8D, 0x7F, 0xE5, 0xD5, 0x98, 0x1A, 0x57,
+ 0x4B, 0x67, 0x0E, 0x7F, 0xA7, 0x05, 0x5A, 0x64,
+ 0x28, 0xAF, 0x14, 0x63, 0x3F, 0xB6, 0x29, 0xFE,
+ 0x88, 0xF5, 0x3C, 0xB7, 0x4C, 0x3C, 0x02, 0xA5,
+ 0xB8, 0xCE, 0xDA, 0xE9, 0xB0, 0x68, 0x17, 0x44,
+ 0x55, 0xE0, 0x1F, 0x4D, 0x8A, 0x43, 0x7D, 0x69,
+ 0x57, 0x29, 0xC7, 0x2E, 0x8D, 0xAC, 0x74, 0x15,
+ 0xB7, 0x59, 0xC4, 0xA8, 0x9F, 0x0A, 0x72, 0x9E,
+ 0x7E, 0x6E, 0x15, 0x47, 0x22, 0xDF, 0x12, 0x34,
+ 0x58, 0x35, 0x07, 0x6A, 0x99, 0xCF, 0x34, 0xDC,
+ 0x6E, 0x22, 0x50, 0xC9, 0xDE, 0xC0, 0x68, 0x9B,
+ 0x65, 0x89, 0xBC, 0xD4, 0xDB, 0xED, 0xF8, 0xAB,
+ 0xC8, 0x12, 0xA8, 0xA2, 0x2B, 0x0D, 0x40, 0x52,
+ 0xDC, 0xBB, 0xFE, 0x02, 0x32, 0x2F, 0xA4, 0xA9,
+ 0xCA, 0xD7, 0x10, 0x61, 0x21, 0x1E, 0xF0, 0xB4,
+ 0xD3, 0x50, 0x5D, 0x04, 0x0F, 0xF6, 0x00, 0xC2,
+ 0x6F, 0x16, 0x9D, 0x25, 0x36, 0x86, 0x42, 0x56,
+ 0x4A, 0x55, 0x5E, 0x09, 0xC1, 0xBE, 0xE0, 0x91
+};
+
+/* Macro to perform one column of the RS matrix multiplication. The
+ * parameters a, b, c, and d are the four bytes of output; i is the index
+ * of the key bytes, and w, x, y, and z, are the column of constants from
+ * the RS matrix, preprocessed through the poly_to_exp table. */
+
+#define CALC_S(a, b, c, d, i, w, x, y, z) \
+ { \
+ tmp = poly_to_exp[key[i]]; \
+ (a) ^= exp_to_poly[tmp + (w)]; \
+ (b) ^= exp_to_poly[tmp + (x)]; \
+ (c) ^= exp_to_poly[tmp + (y)]; \
+ (d) ^= exp_to_poly[tmp + (z)]; \
+ }
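+
+/* Example (for illustration): the first CALC_S call in do_twofish_setkey
+ * below passes the constants 0x00, 0x2D, 0x01, 0x2D, which are
+ * poly_to_exp[0x01], poly_to_exp[0xA4], poly_to_exp[0x02] and
+ * poly_to_exp[0xA4], i.e. the exponent form of the RS matrix entries
+ * 01 A4 02 A4 quoted in that line's trailing comment. */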
+
+/* Macros to calculate the key-dependent S-boxes for a 128-bit key using
+ * the S vector from CALC_S. CALC_SB_2 computes a single entry in all
+ * four S-boxes, where i is the index of the entry to compute, and a and b
+ * are the index numbers preprocessed through the q0 and q1 tables
+ * respectively. CALC_SB is simply a convenience to make the code shorter;
+ * it calls CALC_SB_2 four times with consecutive indices from i to i+3,
+ * using the remaining parameters two by two. */
+
+#define CALC_SB_2(i, a, b) \
+ ctx->s[0][i] = mds[0][q0[(a) ^ sa] ^ se]; \
+ ctx->s[1][i] = mds[1][q0[(b) ^ sb] ^ sf]; \
+ ctx->s[2][i] = mds[2][q1[(a) ^ sc] ^ sg]; \
+ ctx->s[3][i] = mds[3][q1[(b) ^ sd] ^ sh]
+
+#define CALC_SB(i, a, b, c, d, e, f, g, h) \
+ CALC_SB_2 (i, a, b); CALC_SB_2 ((i)+1, c, d); \
+ CALC_SB_2 ((i)+2, e, f); CALC_SB_2 ((i)+3, g, h)
+
+/* Macros exactly like CALC_SB and CALC_SB_2, but for 256-bit keys. */
+
+#define CALC_SB256_2(i, a, b) \
+ ctx->s[0][i] = mds[0][q0[q0[q1[(b) ^ sa] ^ se] ^ si] ^ sm]; \
+ ctx->s[1][i] = mds[1][q0[q1[q1[(a) ^ sb] ^ sf] ^ sj] ^ sn]; \
+ ctx->s[2][i] = mds[2][q1[q0[q0[(a) ^ sc] ^ sg] ^ sk] ^ so]; \
+ ctx->s[3][i] = mds[3][q1[q1[q0[(b) ^ sd] ^ sh] ^ sl] ^ sp];
+
+#define CALC_SB256(i, a, b, c, d, e, f, g, h) \
+ CALC_SB256_2 (i, a, b); CALC_SB256_2 ((i)+1, c, d); \
+ CALC_SB256_2 ((i)+2, e, f); CALC_SB256_2 ((i)+3, g, h)
+
+/* Macros to calculate the whitening and round subkeys. CALC_K_2 computes the
+ * last two stages of the h() function for a given index (either 2i or 2i+1).
+ * a, b, c, and d are the four bytes going into the last two stages. For
+ * 128-bit keys, this is the entire h() function and a and c are the index
+ * preprocessed through q0 and q1 respectively; for longer keys they are the
+ * output of previous stages. j is the index of the first key byte to use.
+ * CALC_K computes a pair of subkeys for 128-bit Twofish, by calling CALC_K_2
+ * twice, doing the Pseudo-Hadamard Transform, and doing the necessary
+ * rotations. Its parameters are: a, the array to write the results into,
+ * j, the index of the first output entry, k and l, the preprocessed indices
+ * for index 2i, and m and n, the preprocessed indices for index 2i+1.
+ * CALC_K256_2 expands CALC_K_2 to handle 256-bit keys, by doing two
+ * additional lookup-and-XOR stages. The parameters a and b are the index
+ * preprocessed through q0 and q1 respectively; j is the index of the first
+ * key byte to use. CALC_K256 is identical to CALC_K but for using the
+ * CALC_K256_2 macro instead of CALC_K_2. */
+
+#define CALC_K_2(a, b, c, d, j) \
+ mds[0][q0[a ^ key[(j) + 8]] ^ key[j]] \
+ ^ mds[1][q0[b ^ key[(j) + 9]] ^ key[(j) + 1]] \
+ ^ mds[2][q1[c ^ key[(j) + 10]] ^ key[(j) + 2]] \
+ ^ mds[3][q1[d ^ key[(j) + 11]] ^ key[(j) + 3]]
+
+#define CALC_K(a, j, k, l, m, n) \
+ x = CALC_K_2 (k, l, k, l, 0); \
+ y = CALC_K_2 (m, n, m, n, 4); \
+ y = (y << 8) + (y >> 24); \
+ x += y; y += x; ctx->a[j] = x; \
+ ctx->a[(j) + 1] = (y << 9) + (y >> 23)
+
+#define CALC_K256_2(a, b, j) \
+ CALC_K_2 (q0[q1[b ^ key[(j) + 24]] ^ key[(j) + 16]], \
+ q1[q1[a ^ key[(j) + 25]] ^ key[(j) + 17]], \
+ q0[q0[a ^ key[(j) + 26]] ^ key[(j) + 18]], \
+ q1[q0[b ^ key[(j) + 27]] ^ key[(j) + 19]], j)
+
+#define CALC_K256(a, j, k, l, m, n) \
+ x = CALC_K256_2 (k, l, 0); \
+ y = CALC_K256_2 (m, n, 4); \
+ y = (y << 8) + (y >> 24); \
+ x += y; y += x; ctx->a[j] = x; \
+ ctx->a[(j) + 1] = (y << 9) + (y >> 23)
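+
+/* In CALC_K and CALC_K256, "y = (y << 8) + (y >> 24)" rotates the odd h()
+ * output left by 8 bits, the pair "x += y; y += x" is the pseudo-Hadamard
+ * transform (A + B, A + 2B mod 2^32), and "(y << 9) + (y >> 23)" rotates
+ * the odd-indexed subkey left by 9 bits, matching the Twofish key schedule. */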
+
+
+
+/* Perform the key setup. Note that this works only with 128- and 256-bit
+ * keys, despite the API that looks like it might support other sizes. */
+
+static gcry_err_code_t
+do_twofish_setkey (TWOFISH_context *ctx, const byte *key, const unsigned keylen)
+{
+ int i, j, k;
+
+ /* Temporaries for CALC_K. */
+ u32 x, y;
+
+ /* The S vector used to key the S-boxes, split up into individual bytes.
+ * 128-bit keys use only sa through sh; 256-bit use all of them. */
+ byte sa = 0, sb = 0, sc = 0, sd = 0, se = 0, sf = 0, sg = 0, sh = 0;
+ byte si = 0, sj = 0, sk = 0, sl = 0, sm = 0, sn = 0, so = 0, sp = 0;
+
+ /* Temporary for CALC_S. */
+ unsigned int tmp;
+
+ /* Flags for self-test. */
+ static int initialized = 0;
+ static const char *selftest_failed=0;
+
+ /* Check key length; ((keylen - 16) | 16) == 16 holds only for the
+ * supported 16-byte (128-bit) and 32-byte (256-bit) keys. */
+ if( ( ( keylen - 16 ) | 16 ) != 16 )
+ return GPG_ERR_INV_KEYLEN;
+
+ /* Do self-test if necessary. */
+ if (!initialized)
+ {
+ initialized = 1;
+ selftest_failed = selftest ();
+ if( selftest_failed )
+ log_error("%s\n", selftest_failed );
+ }
+ if( selftest_failed )
+ return GPG_ERR_SELFTEST_FAILED;
+
+ /* Compute the first two words of the S vector. The magic numbers are
+ * the entries of the RS matrix, preprocessed through poly_to_exp. The
+ * numbers in the comments are the original (polynomial form) matrix
+ * entries. */
+ CALC_S (sa, sb, sc, sd, 0, 0x00, 0x2D, 0x01, 0x2D); /* 01 A4 02 A4 */
+ CALC_S (sa, sb, sc, sd, 1, 0x2D, 0xA4, 0x44, 0x8A); /* A4 56 A1 55 */
+ CALC_S (sa, sb, sc, sd, 2, 0x8A, 0xD5, 0xBF, 0xD1); /* 55 82 FC 87 */
+ CALC_S (sa, sb, sc, sd, 3, 0xD1, 0x7F, 0x3D, 0x99); /* 87 F3 C1 5A */
+ CALC_S (sa, sb, sc, sd, 4, 0x99, 0x46, 0x66, 0x96); /* 5A 1E 47 58 */
+ CALC_S (sa, sb, sc, sd, 5, 0x96, 0x3C, 0x5B, 0xED); /* 58 C6 AE DB */
+ CALC_S (sa, sb, sc, sd, 6, 0xED, 0x37, 0x4F, 0xE0); /* DB 68 3D 9E */
+ CALC_S (sa, sb, sc, sd, 7, 0xE0, 0xD0, 0x8C, 0x17); /* 9E E5 19 03 */
+ CALC_S (se, sf, sg, sh, 8, 0x00, 0x2D, 0x01, 0x2D); /* 01 A4 02 A4 */
+ CALC_S (se, sf, sg, sh, 9, 0x2D, 0xA4, 0x44, 0x8A); /* A4 56 A1 55 */
+ CALC_S (se, sf, sg, sh, 10, 0x8A, 0xD5, 0xBF, 0xD1); /* 55 82 FC 87 */
+ CALC_S (se, sf, sg, sh, 11, 0xD1, 0x7F, 0x3D, 0x99); /* 87 F3 C1 5A */
+ CALC_S (se, sf, sg, sh, 12, 0x99, 0x46, 0x66, 0x96); /* 5A 1E 47 58 */
+ CALC_S (se, sf, sg, sh, 13, 0x96, 0x3C, 0x5B, 0xED); /* 58 C6 AE DB */
+ CALC_S (se, sf, sg, sh, 14, 0xED, 0x37, 0x4F, 0xE0); /* DB 68 3D 9E */
+ CALC_S (se, sf, sg, sh, 15, 0xE0, 0xD0, 0x8C, 0x17); /* 9E E5 19 03 */
+
+ if (keylen == 32) /* 256-bit key */
+ {
+ /* Calculate the remaining two words of the S vector */
+ CALC_S (si, sj, sk, sl, 16, 0x00, 0x2D, 0x01, 0x2D); /* 01 A4 02 A4 */
+ CALC_S (si, sj, sk, sl, 17, 0x2D, 0xA4, 0x44, 0x8A); /* A4 56 A1 55 */
+ CALC_S (si, sj, sk, sl, 18, 0x8A, 0xD5, 0xBF, 0xD1); /* 55 82 FC 87 */
+ CALC_S (si, sj, sk, sl, 19, 0xD1, 0x7F, 0x3D, 0x99); /* 87 F3 C1 5A */
+ CALC_S (si, sj, sk, sl, 20, 0x99, 0x46, 0x66, 0x96); /* 5A 1E 47 58 */
+ CALC_S (si, sj, sk, sl, 21, 0x96, 0x3C, 0x5B, 0xED); /* 58 C6 AE DB */
+ CALC_S (si, sj, sk, sl, 22, 0xED, 0x37, 0x4F, 0xE0); /* DB 68 3D 9E */
+ CALC_S (si, sj, sk, sl, 23, 0xE0, 0xD0, 0x8C, 0x17); /* 9E E5 19 03 */
+ CALC_S (sm, sn, so, sp, 24, 0x00, 0x2D, 0x01, 0x2D); /* 01 A4 02 A4 */
+ CALC_S (sm, sn, so, sp, 25, 0x2D, 0xA4, 0x44, 0x8A); /* A4 56 A1 55 */
+ CALC_S (sm, sn, so, sp, 26, 0x8A, 0xD5, 0xBF, 0xD1); /* 55 82 FC 87 */
+ CALC_S (sm, sn, so, sp, 27, 0xD1, 0x7F, 0x3D, 0x99); /* 87 F3 C1 5A */
+ CALC_S (sm, sn, so, sp, 28, 0x99, 0x46, 0x66, 0x96); /* 5A 1E 47 58 */
+ CALC_S (sm, sn, so, sp, 29, 0x96, 0x3C, 0x5B, 0xED); /* 58 C6 AE DB */
+ CALC_S (sm, sn, so, sp, 30, 0xED, 0x37, 0x4F, 0xE0); /* DB 68 3D 9E */
+ CALC_S (sm, sn, so, sp, 31, 0xE0, 0xD0, 0x8C, 0x17); /* 9E E5 19 03 */
+
+ /* Compute the S-boxes. */
+ for(i=j=0,k=1; i < 256; i++, j += 2, k += 2 )
+ {
+ CALC_SB256_2( i, calc_sb_tbl[j], calc_sb_tbl[k] );
+ }
+
+ /* Calculate whitening and round subkeys. */
+ for (i = 0; i < 8; i += 2)
+ {
+ CALC_K256 ( w, i, q0[i], q1[i], q0[i + 1], q1[i + 1] );
+ }
+ for (j = 0; j < 32; j += 2, i += 2)
+ {
+ CALC_K256 ( k, j, q0[i], q1[i], q0[i + 1], q1[i + 1] );
+ }
+ }
+ else
+ {
+ /* Compute the S-boxes. */
+ for(i=j=0,k=1; i < 256; i++, j += 2, k += 2 )
+ {
+ CALC_SB_2( i, calc_sb_tbl[j], calc_sb_tbl[k] );
+ }
+
+ /* Calculate whitening and round subkeys. */
+ for (i = 0; i < 8; i += 2)
+ {
+ CALC_K ( w, i, q0[i], q1[i], q0[i + 1], q1[i + 1] );
+ }
+ for (j = 0; j < 32; j += 2, i += 2)
+ {
+ CALC_K ( k, j, q0[i], q1[i], q0[i + 1], q1[i + 1] );
+ }
+ }
+
+ return 0;
+}
+
+static gcry_err_code_t
+twofish_setkey (void *context, const byte *key, unsigned int keylen,
+ cipher_bulk_ops_t *bulk_ops)
+{
+ TWOFISH_context *ctx = context;
+ unsigned int hwfeatures = _gcry_get_hw_features ();
+ int rc;
+
+ rc = do_twofish_setkey (ctx, key, keylen);
+
+#ifdef USE_AVX2
+ ctx->use_avx2 = 0;
+ if ((hwfeatures & HWF_INTEL_AVX2) && (hwfeatures & HWF_INTEL_FAST_VPGATHER))
+ {
+ ctx->use_avx2 = 1;
+ }
+#endif
+
+ /* Setup bulk encryption routines. */
+ memset (bulk_ops, 0, sizeof(*bulk_ops));
+ bulk_ops->cbc_dec = _gcry_twofish_cbc_dec;
+ bulk_ops->cfb_dec = _gcry_twofish_cfb_dec;
+ bulk_ops->ctr_enc = _gcry_twofish_ctr_enc;
+ bulk_ops->ocb_crypt = _gcry_twofish_ocb_crypt;
+ bulk_ops->ocb_auth = _gcry_twofish_ocb_auth;
+
+ (void)hwfeatures;
+
+ _gcry_burn_stack (23+6*sizeof(void*));
+ return rc;
+}
+
+
+#ifdef USE_AVX2
+/* Assembler implementations of Twofish using AVX2. Process 16 blocks in
+ * parallel. */
+extern void _gcry_twofish_avx2_ctr_enc(const TWOFISH_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_cbc_dec(const TWOFISH_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_cfb_dec(const TWOFISH_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_ocb_enc(const TWOFISH_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_ocb_dec(const TWOFISH_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_ocb_auth(const TWOFISH_context *ctx,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+#endif
+
+
+#ifdef USE_AMD64_ASM
+
+/* Assembly implementations of Twofish. */
+extern void _gcry_twofish_amd64_encrypt_block(const TWOFISH_context *c,
+ byte *out, const byte *in);
+
+extern void _gcry_twofish_amd64_decrypt_block(const TWOFISH_context *c,
+ byte *out, const byte *in);
+
+/* These assembly implementations process three blocks in parallel. */
+extern void _gcry_twofish_amd64_ctr_enc(const TWOFISH_context *c, byte *out,
+ const byte *in, byte *ctr);
+
+extern void _gcry_twofish_amd64_cbc_dec(const TWOFISH_context *c, byte *out,
+ const byte *in, byte *iv);
+
+extern void _gcry_twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out,
+ const byte *in, byte *iv);
+
+extern void _gcry_twofish_amd64_ocb_enc(const TWOFISH_context *ctx, byte *out,
+ const byte *in, byte *offset,
+ byte *checksum, const u64 Ls[3]);
+
+extern void _gcry_twofish_amd64_ocb_dec(const TWOFISH_context *ctx, byte *out,
+ const byte *in, byte *offset,
+ byte *checksum, const u64 Ls[3]);
+
+extern void _gcry_twofish_amd64_ocb_auth(const TWOFISH_context *ctx,
+ const byte *abuf, byte *offset,
+ byte *checksum, const u64 Ls[3]);
+
+static inline void
+twofish_amd64_encrypt_block(const TWOFISH_context *c, byte *out, const byte *in)
+{
+ _gcry_twofish_amd64_encrypt_block(c, out, in);
+}
+
+static inline void
+twofish_amd64_decrypt_block(const TWOFISH_context *c, byte *out, const byte *in)
+{
+ _gcry_twofish_amd64_decrypt_block(c, out, in);
+}
+
+static inline void
+twofish_amd64_ctr_enc(const TWOFISH_context *c, byte *out, const byte *in,
+ byte *ctr)
+{
+ _gcry_twofish_amd64_ctr_enc(c, out, in, ctr);
+}
+
+static inline void
+twofish_amd64_cbc_dec(const TWOFISH_context *c, byte *out, const byte *in,
+ byte *iv)
+{
+ _gcry_twofish_amd64_cbc_dec(c, out, in, iv);
+}
+
+static inline void
+twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out, const byte *in,
+ byte *iv)
+{
+ _gcry_twofish_amd64_cfb_dec(c, out, in, iv);
+}
+
+static inline void
+twofish_amd64_ocb_enc(const TWOFISH_context *ctx, byte *out, const byte *in,
+ byte *offset, byte *checksum, const u64 Ls[3])
+{
+ _gcry_twofish_amd64_ocb_enc(ctx, out, in, offset, checksum, Ls);
+}
+
+static inline void
+twofish_amd64_ocb_dec(const TWOFISH_context *ctx, byte *out, const byte *in,
+ byte *offset, byte *checksum, const u64 Ls[3])
+{
+ _gcry_twofish_amd64_ocb_dec(ctx, out, in, offset, checksum, Ls);
+}
+
+static inline void
+twofish_amd64_ocb_auth(const TWOFISH_context *ctx, const byte *abuf,
+ byte *offset, byte *checksum, const u64 Ls[3])
+{
+ _gcry_twofish_amd64_ocb_auth(ctx, abuf, offset, checksum, Ls);
+}
+
+#elif defined(USE_ARM_ASM)
+
+/* Assembly implementations of Twofish. */
+extern void _gcry_twofish_arm_encrypt_block(const TWOFISH_context *c,
+ byte *out, const byte *in);
+
+extern void _gcry_twofish_arm_decrypt_block(const TWOFISH_context *c,
+ byte *out, const byte *in);
+
+#else /*!USE_AMD64_ASM && !USE_ARM_ASM*/
+
+/* Macros to compute the g() function in the encryption and decryption
+ * rounds. G1 is the straight g() function; G2 includes the 8-bit
+ * rotation for the high 32-bit word. */
+
+#define G1(a) \
+ (ctx->s[0][(a) & 0xFF]) ^ (ctx->s[1][((a) >> 8) & 0xFF]) \
+ ^ (ctx->s[2][((a) >> 16) & 0xFF]) ^ (ctx->s[3][(a) >> 24])
+
+#define G2(b) \
+ (ctx->s[1][(b) & 0xFF]) ^ (ctx->s[2][((b) >> 8) & 0xFF]) \
+ ^ (ctx->s[3][((b) >> 16) & 0xFF]) ^ (ctx->s[0][(b) >> 24])
+
+/* Encryption and decryption Feistel rounds. Each one calls the two g()
+ * macros, does the PHT, and performs the XOR and the appropriate bit
+ * rotations. The parameters are the round number (used to select subkeys),
+ * and the four 32-bit chunks of the text. */
+
+#define ENCROUND(n, a, b, c, d) \
+ x = G1 (a); y = G2 (b); \
+ x += y; y += x + ctx->k[2 * (n) + 1]; \
+ (c) ^= x + ctx->k[2 * (n)]; \
+ (c) = ((c) >> 1) + ((c) << 31); \
+ (d) = (((d) << 1)+((d) >> 31)) ^ y
+
+#define DECROUND(n, a, b, c, d) \
+ x = G1 (a); y = G2 (b); \
+ x += y; y += x; \
+ (d) ^= y + ctx->k[2 * (n) + 1]; \
+ (d) = ((d) >> 1) + ((d) << 31); \
+ (c) = (((c) << 1)+((c) >> 31)); \
+ (c) ^= (x + ctx->k[2 * (n)])
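+
+/* The shift-and-add pairs above are 32-bit rotations:
+ * ((c) >> 1) + ((c) << 31) rotates c right by one bit, and
+ * ((d) << 1) + ((d) >> 31) rotates d left by one bit. */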
+
+/* Encryption and decryption cycles; each one is simply two Feistel rounds
+ * with the 32-bit chunks re-ordered to simulate the "swap" */
+
+#define ENCCYCLE(n) \
+ ENCROUND (2 * (n), a, b, c, d); \
+ ENCROUND (2 * (n) + 1, c, d, a, b)
+
+#define DECCYCLE(n) \
+ DECROUND (2 * (n) + 1, c, d, a, b); \
+ DECROUND (2 * (n), a, b, c, d)
+
+/* Macros to convert the input and output bytes into 32-bit words,
+ * and simultaneously perform the whitening step. INPACK packs word
+ * number n into the variable named by x, using whitening subkey number m.
+ * OUTUNPACK unpacks word number n from the variable named by x, using
+ * whitening subkey number m. */
+
+#define INPACK(n, x, m) \
+ x = buf_get_le32(in + (n) * 4); \
+ x ^= ctx->w[m]
+
+#define OUTUNPACK(n, x, m) \
+ x ^= ctx->w[m]; \
+ buf_put_le32(out + (n) * 4, x)
+
+#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
+
+
+/* Encrypt one block. in and out may be the same. */
+
+#ifdef USE_AMD64_ASM
+
+static unsigned int
+twofish_encrypt (void *context, byte *out, const byte *in)
+{
+ TWOFISH_context *ctx = context;
+ twofish_amd64_encrypt_block(ctx, out, in);
+ return /*burn_stack*/ (4*sizeof (void*));
+}
+
+#elif defined(USE_ARM_ASM)
+
+static unsigned int
+twofish_encrypt (void *context, byte *out, const byte *in)
+{
+ TWOFISH_context *ctx = context;
+ _gcry_twofish_arm_encrypt_block(ctx, out, in);
+ return /*burn_stack*/ (4*sizeof (void*));
+}
+
+#else /*!USE_AMD64_ASM && !USE_ARM_ASM*/
+
+static void
+do_twofish_encrypt (const TWOFISH_context *ctx, byte *out, const byte *in)
+{
+ /* The four 32-bit chunks of the text. */
+ u32 a, b, c, d;
+
+ /* Temporaries used by the round function. */
+ u32 x, y;
+
+ /* Input whitening and packing. */
+ INPACK (0, a, 0);
+ INPACK (1, b, 1);
+ INPACK (2, c, 2);
+ INPACK (3, d, 3);
+
+ /* Encryption Feistel cycles. */
+ ENCCYCLE (0);
+ ENCCYCLE (1);
+ ENCCYCLE (2);
+ ENCCYCLE (3);
+ ENCCYCLE (4);
+ ENCCYCLE (5);
+ ENCCYCLE (6);
+ ENCCYCLE (7);
+
+ /* Output whitening and unpacking. */
+ OUTUNPACK (0, c, 4);
+ OUTUNPACK (1, d, 5);
+ OUTUNPACK (2, a, 6);
+ OUTUNPACK (3, b, 7);
+}
+
+static unsigned int
+twofish_encrypt (void *context, byte *out, const byte *in)
+{
+ TWOFISH_context *ctx = context;
+ do_twofish_encrypt (ctx, out, in);
+ return /*burn_stack*/ (24+3*sizeof (void*));
+}
+
+#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
+
+
+/* Decrypt one block. in and out may be the same. */
+
+#ifdef USE_AMD64_ASM
+
+static unsigned int
+twofish_decrypt (void *context, byte *out, const byte *in)
+{
+ TWOFISH_context *ctx = context;
+ twofish_amd64_decrypt_block(ctx, out, in);
+ return /*burn_stack*/ (4*sizeof (void*));
+}
+
+#elif defined(USE_ARM_ASM)
+
+static unsigned int
+twofish_decrypt (void *context, byte *out, const byte *in)
+{
+ TWOFISH_context *ctx = context;
+ _gcry_twofish_arm_decrypt_block(ctx, out, in);
+ return /*burn_stack*/ (4*sizeof (void*));
+}
+
+#else /*!USE_AMD64_ASM && !USE_ARM_ASM*/
+
+static void
+do_twofish_decrypt (const TWOFISH_context *ctx, byte *out, const byte *in)
+{
+ /* The four 32-bit chunks of the text. */
+ u32 a, b, c, d;
+
+ /* Temporaries used by the round function. */
+ u32 x, y;
+
+ /* Input whitening and packing. */
+ INPACK (0, c, 4);
+ INPACK (1, d, 5);
+ INPACK (2, a, 6);
+ INPACK (3, b, 7);
+
+ /* Decryption Feistel cycles. */
+ DECCYCLE (7);
+ DECCYCLE (6);
+ DECCYCLE (5);
+ DECCYCLE (4);
+ DECCYCLE (3);
+ DECCYCLE (2);
+ DECCYCLE (1);
+ DECCYCLE (0);
+
+ /* Output whitening and unpacking. */
+ OUTUNPACK (0, a, 0);
+ OUTUNPACK (1, b, 1);
+ OUTUNPACK (2, c, 2);
+ OUTUNPACK (3, d, 3);
+}
+
+static unsigned int
+twofish_decrypt (void *context, byte *out, const byte *in)
+{
+ TWOFISH_context *ctx = context;
+
+ do_twofish_decrypt (ctx, out, in);
+ return /*burn_stack*/ (24+3*sizeof (void*));
+}
+
+#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
+
+
+
+/* Bulk encryption of complete blocks in CTR mode. This function is only
+ intended for the bulk encryption feature of cipher.c. CTR is expected to be
+ of size TWOFISH_BLOCKSIZE. */
+static void
+_gcry_twofish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ TWOFISH_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned char tmpbuf[TWOFISH_BLOCKSIZE];
+ unsigned int burn, burn_stack_depth = 0;
+
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_twofish_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+ nblocks -= 16;
+ outbuf += 16 * TWOFISH_BLOCKSIZE;
+ inbuf += 16 * TWOFISH_BLOCKSIZE;
+ did_use_avx2 = 1;
+ }
+
+ if (did_use_avx2)
+ {
+ /* twofish-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+ }
+#endif
+
+#ifdef USE_AMD64_ASM
+ {
+ /* Process data in 3 block chunks. */
+ while (nblocks >= 3)
+ {
+ twofish_amd64_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+ nblocks -= 3;
+ outbuf += 3 * TWOFISH_BLOCKSIZE;
+ inbuf += 3 * TWOFISH_BLOCKSIZE;
+
+ burn = 8 * sizeof(void*);
+ if (burn > burn_stack_depth)
+ burn_stack_depth = burn;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ /* TODO: use caching instead? */
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ /* Encrypt the counter. */
+ burn = twofish_encrypt(ctx, tmpbuf, ctr);
+ if (burn > burn_stack_depth)
+ burn_stack_depth = burn;
+
+ /* XOR the input with the encrypted counter and store in output. */
+ cipher_block_xor(outbuf, tmpbuf, inbuf, TWOFISH_BLOCKSIZE);
+ outbuf += TWOFISH_BLOCKSIZE;
+ inbuf += TWOFISH_BLOCKSIZE;
+ /* Increment the counter. */
+ cipher_block_add(ctr, 1, TWOFISH_BLOCKSIZE);
+ }
+
+ wipememory(tmpbuf, sizeof(tmpbuf));
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Bulk decryption of complete blocks in CBC mode. This function is only
+ intended for the bulk encryption feature of cipher.c. */
+static void
+_gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ TWOFISH_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned char savebuf[TWOFISH_BLOCKSIZE];
+ unsigned int burn, burn_stack_depth = 0;
+
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_twofish_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * TWOFISH_BLOCKSIZE;
+ inbuf += 16 * TWOFISH_BLOCKSIZE;
+ did_use_avx2 = 1;
+ }
+
+ if (did_use_avx2)
+ {
+ /* twofish-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+ }
+#endif
+
+#ifdef USE_AMD64_ASM
+ {
+ /* Process data in 3 block chunks. */
+ while (nblocks >= 3)
+ {
+ twofish_amd64_cbc_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 3;
+ outbuf += 3 * TWOFISH_BLOCKSIZE;
+ inbuf += 3 * TWOFISH_BLOCKSIZE;
+
+ burn = 9 * sizeof(void*);
+ if (burn > burn_stack_depth)
+ burn_stack_depth = burn;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ /* INBUF is needed later and it may be identical to OUTBUF, so store
+ the intermediate result to SAVEBUF. */
+ burn = twofish_decrypt (ctx, savebuf, inbuf);
+ if (burn > burn_stack_depth)
+ burn_stack_depth = burn;
+
+ cipher_block_xor_n_copy_2(outbuf, savebuf, iv, inbuf, TWOFISH_BLOCKSIZE);
+ inbuf += TWOFISH_BLOCKSIZE;
+ outbuf += TWOFISH_BLOCKSIZE;
+ }
+
+ wipememory(savebuf, sizeof(savebuf));
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Bulk decryption of complete blocks in CFB mode. This function is only
+ intended for the bulk encryption feature of cipher.c. */
+static void
+_gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ TWOFISH_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned int burn, burn_stack_depth = 0;
+
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_twofish_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * TWOFISH_BLOCKSIZE;
+ inbuf += 16 * TWOFISH_BLOCKSIZE;
+ did_use_avx2 = 1;
+ }
+
+ if (did_use_avx2)
+ {
+ /* twofish-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+ }
+#endif
+
+#ifdef USE_AMD64_ASM
+ {
+ /* Process data in 3 block chunks. */
+ while (nblocks >= 3)
+ {
+ twofish_amd64_cfb_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 3;
+ outbuf += 3 * TWOFISH_BLOCKSIZE;
+ inbuf += 3 * TWOFISH_BLOCKSIZE;
+
+ burn = 8 * sizeof(void*);
+ if (burn > burn_stack_depth)
+ burn_stack_depth = burn;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
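+      /* Encrypt the IV, XOR it with the ciphertext to recover the plaintext,
+         and let this ciphertext block become the new IV.  */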
+ burn = twofish_encrypt(ctx, iv, iv);
+ if (burn > burn_stack_depth)
+ burn_stack_depth = burn;
+
+ cipher_block_xor_n_copy(outbuf, iv, inbuf, TWOFISH_BLOCKSIZE);
+ outbuf += TWOFISH_BLOCKSIZE;
+ inbuf += TWOFISH_BLOCKSIZE;
+ }
+
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk encryption/decryption of complete blocks in OCB mode. */
+static size_t
+_gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+#ifdef USE_AMD64_ASM
+ TWOFISH_context *ctx = (void *)&c->context.c;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned int burn, burn_stack_depth = 0;
+ u64 blkn = c->u_mode.ocb.data_nblocks;
+
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+ u64 Ls[16];
+ unsigned int n = 16 - (blkn % 16);
+ u64 *l;
+ int i;
+
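+      /* Precompute the OCB offset table: block blkn+i uses L[ntz(blkn+i)],
+       * so every slot follows the fixed ntz pattern except the one for the
+       * 16-block boundary, which is refreshed per chunk with ocb_get_l().  */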
+ if (nblocks >= 16)
+ {
+ for (i = 0; i < 16; i += 8)
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ }
+
+ Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ l = &Ls[(15 + n) % 16];
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ blkn += 16;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+ if (encrypt)
+ _gcry_twofish_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+ else
+ _gcry_twofish_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+
+ nblocks -= 16;
+ outbuf += 16 * TWOFISH_BLOCKSIZE;
+ inbuf += 16 * TWOFISH_BLOCKSIZE;
+ did_use_avx2 = 1;
+ }
+ }
+
+ if (did_use_avx2)
+ {
+ /* twofish-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+ }
+#endif
+
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ u64 Ls[3];
+
+ /* Process data in 3 block chunks. */
+ while (nblocks >= 3)
+ {
+ Ls[0] = (uintptr_t)(const void *)ocb_get_l(c, blkn + 1);
+ Ls[1] = (uintptr_t)(const void *)ocb_get_l(c, blkn + 2);
+ Ls[2] = (uintptr_t)(const void *)ocb_get_l(c, blkn + 3);
+ blkn += 3;
+
+ if (encrypt)
+ twofish_amd64_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr,
+ Ls);
+ else
+ twofish_amd64_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr,
+ Ls);
+
+ nblocks -= 3;
+ outbuf += 3 * TWOFISH_BLOCKSIZE;
+ inbuf += 3 * TWOFISH_BLOCKSIZE;
+
+ burn = 8 * sizeof(void*);
+ if (burn > burn_stack_depth)
+ burn_stack_depth = burn;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+
+ c->u_mode.ocb.data_nblocks = blkn;
+
+ if (burn_stack_depth)
+ _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+#else
+ (void)c;
+ (void)outbuf_arg;
+ (void)inbuf_arg;
+ (void)encrypt;
+#endif
+
+ return nblocks;
+}
+
+/* Bulk authentication of complete blocks in OCB mode. */
+static size_t
+_gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+ size_t nblocks)
+{
+#ifdef USE_AMD64_ASM
+ TWOFISH_context *ctx = (void *)&c->context.c;
+ const unsigned char *abuf = abuf_arg;
+ unsigned int burn, burn_stack_depth = 0;
+ u64 blkn = c->u_mode.ocb.aad_nblocks;
+
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+ u64 Ls[16];
+ unsigned int n = 16 - (blkn % 16);
+ u64 *l;
+ int i;
+
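+      /* Offset table construction matches _gcry_twofish_ocb_crypt above.  */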
+ if (nblocks >= 16)
+ {
+ for (i = 0; i < 16; i += 8)
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ }
+
+ Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ l = &Ls[(15 + n) % 16];
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ blkn += 16;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+ _gcry_twofish_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
+
+ nblocks -= 16;
+ abuf += 16 * TWOFISH_BLOCKSIZE;
+ did_use_avx2 = 1;
+ }
+ }
+
+ if (did_use_avx2)
+ {
+ /* twofish-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ u64 Ls[3];
+
+ /* Process data in 3 block chunks. */
+ while (nblocks >= 3)
+ {
+ Ls[0] = (uintptr_t)(const void *)ocb_get_l(c, blkn + 1);
+ Ls[1] = (uintptr_t)(const void *)ocb_get_l(c, blkn + 2);
+ Ls[2] = (uintptr_t)(const void *)ocb_get_l(c, blkn + 3);
+ blkn += 3;
+
+ twofish_amd64_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
+
+ nblocks -= 3;
+ abuf += 3 * TWOFISH_BLOCKSIZE;
+
+ burn = 8 * sizeof(void*);
+ if (burn > burn_stack_depth)
+ burn_stack_depth = burn;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+
+ c->u_mode.ocb.aad_nblocks = blkn;
+
+ if (burn_stack_depth)
+ _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+#else
+ (void)c;
+ (void)abuf_arg;
+#endif
+
+ return nblocks;
+}
+
+
+
+/* Run the self-tests for TWOFISH-CTR, tests IV increment of bulk CTR
+ encryption. Returns NULL on success. */
+static const char *
+selftest_ctr (void)
+{
+ const int nblocks = 16+1;
+ const int blocksize = TWOFISH_BLOCKSIZE;
+ const int context_size = sizeof(TWOFISH_context);
+
+ return _gcry_selftest_helper_ctr("TWOFISH", &twofish_setkey,
+ &twofish_encrypt, nblocks, blocksize, context_size);
+}
+
+/* Run the self-tests for TWOFISH-CBC, tests bulk CBC decryption.
+ Returns NULL on success. */
+static const char *
+selftest_cbc (void)
+{
+ const int nblocks = 16+2;
+ const int blocksize = TWOFISH_BLOCKSIZE;
+ const int context_size = sizeof(TWOFISH_context);
+
+ return _gcry_selftest_helper_cbc("TWOFISH", &twofish_setkey,
+ &twofish_encrypt, nblocks, blocksize, context_size);
+}
+
+/* Run the self-tests for TWOFISH-CFB, tests bulk CFB decryption.
+ Returns NULL on success. */
+static const char *
+selftest_cfb (void)
+{
+ const int nblocks = 16+2;
+ const int blocksize = TWOFISH_BLOCKSIZE;
+ const int context_size = sizeof(TWOFISH_context);
+
+ return _gcry_selftest_helper_cfb("TWOFISH", &twofish_setkey,
+ &twofish_encrypt, nblocks, blocksize, context_size);
+}
+
+
+/* Test a single encryption and decryption with each key size. */
+
+static const char*
+selftest (void)
+{
+ TWOFISH_context ctx; /* Expanded key. */
+ byte scratch[16]; /* Encryption/decryption result buffer. */
+ cipher_bulk_ops_t bulk_ops;
+ const char *r;
+
+ /* Test vectors for single encryption/decryption. Note that I am using
+ * the vectors from the Twofish paper's "known answer test", I=3 for
+ * 128-bit and I=4 for 256-bit, instead of the all-0 vectors from the
+ * "intermediate value test", because an all-0 key would trigger all the
+ * special cases in the RS matrix multiply, leaving the math untested. */
+ static byte plaintext[16] = {
+ 0xD4, 0x91, 0xDB, 0x16, 0xE7, 0xB1, 0xC3, 0x9E,
+ 0x86, 0xCB, 0x08, 0x6B, 0x78, 0x9F, 0x54, 0x19
+ };
+ static byte key[16] = {
+ 0x9F, 0x58, 0x9F, 0x5C, 0xF6, 0x12, 0x2C, 0x32,
+ 0xB6, 0xBF, 0xEC, 0x2F, 0x2A, 0xE8, 0xC3, 0x5A
+ };
+ static const byte ciphertext[16] = {
+ 0x01, 0x9F, 0x98, 0x09, 0xDE, 0x17, 0x11, 0x85,
+ 0x8F, 0xAA, 0xC3, 0xA3, 0xBA, 0x20, 0xFB, 0xC3
+ };
+ static byte plaintext_256[16] = {
+ 0x90, 0xAF, 0xE9, 0x1B, 0xB2, 0x88, 0x54, 0x4F,
+ 0x2C, 0x32, 0xDC, 0x23, 0x9B, 0x26, 0x35, 0xE6
+ };
+ static byte key_256[32] = {
+ 0xD4, 0x3B, 0xB7, 0x55, 0x6E, 0xA3, 0x2E, 0x46,
+ 0xF2, 0xA2, 0x82, 0xB7, 0xD4, 0x5B, 0x4E, 0x0D,
+ 0x57, 0xFF, 0x73, 0x9D, 0x4D, 0xC9, 0x2C, 0x1B,
+ 0xD7, 0xFC, 0x01, 0x70, 0x0C, 0xC8, 0x21, 0x6F
+ };
+ static const byte ciphertext_256[16] = {
+ 0x6C, 0xB4, 0x56, 0x1C, 0x40, 0xBF, 0x0A, 0x97,
+ 0x05, 0x93, 0x1C, 0xB6, 0xD4, 0x08, 0xE7, 0xFA
+ };
+
+ twofish_setkey (&ctx, key, sizeof(key), &bulk_ops);
+ twofish_encrypt (&ctx, scratch, plaintext);
+ if (memcmp (scratch, ciphertext, sizeof (ciphertext)))
+ return "Twofish-128 test encryption failed.";
+ twofish_decrypt (&ctx, scratch, scratch);
+ if (memcmp (scratch, plaintext, sizeof (plaintext)))
+ return "Twofish-128 test decryption failed.";
+
+ twofish_setkey (&ctx, key_256, sizeof(key_256), &bulk_ops);
+ twofish_encrypt (&ctx, scratch, plaintext_256);
+ if (memcmp (scratch, ciphertext_256, sizeof (ciphertext_256)))
+ return "Twofish-256 test encryption failed.";
+ twofish_decrypt (&ctx, scratch, scratch);
+ if (memcmp (scratch, plaintext_256, sizeof (plaintext_256)))
+ return "Twofish-256 test decryption failed.";
+
+ if ((r = selftest_ctr()) != NULL)
+ return r;
+ if ((r = selftest_cbc()) != NULL)
+ return r;
+ if ((r = selftest_cfb()) != NULL)
+ return r;
+
+ return NULL;
+}
+
+/* More complete test program. This does 1000 encryptions and decryptions
+ * with each of 250 128-bit keys and 2000 encryptions and decryptions with
+ * each of 125 256-bit keys, using a feedback scheme similar to a Feistel
+ * cipher, so as to be sure of testing all the table entries pretty
+ * thoroughly. We keep changing the keys so as to get a more meaningful
+ * performance number, since the key setup is non-trivial for Twofish. */
+
+#ifdef TEST
+
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+int
+main()
+{
+ TWOFISH_context ctx; /* Expanded key. */
+ int i, j; /* Loop counters. */
+ cipher_bulk_ops_t bulk_ops;
+
+ const char *encrypt_msg; /* Message to print regarding encryption test;
+ * the printf is done outside the loop to avoid
+ * stuffing up the timing. */
+ clock_t timer; /* For computing elapsed time. */
+
+ /* Test buffer. */
+ byte buffer[4][16] = {
+ {0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
+ 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF},
+ {0x0F, 0x1E, 0x2D, 0x3C, 0x4B, 0x5A, 0x69, 0x78,
+ 0x87, 0x96, 0xA5, 0xB4, 0xC3, 0xD2 ,0xE1, 0xF0},
+ {0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF,
+ 0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54 ,0x32, 0x10},
+ {0x01, 0x23, 0x45, 0x67, 0x76, 0x54 ,0x32, 0x10,
+ 0x89, 0xAB, 0xCD, 0xEF, 0xFE, 0xDC, 0xBA, 0x98}
+ };
+
+ /* Expected outputs for the million-operation test */
+ static const byte test_encrypt[4][16] = {
+ {0xC8, 0x23, 0xB8, 0xB7, 0x6B, 0xFE, 0x91, 0x13,
+ 0x2F, 0xA7, 0x5E, 0xE6, 0x94, 0x77, 0x6F, 0x6B},
+ {0x90, 0x36, 0xD8, 0x29, 0xD5, 0x96, 0xC2, 0x8E,
+ 0xE4, 0xFF, 0x76, 0xBC, 0xE5, 0x77, 0x88, 0x27},
+ {0xB8, 0x78, 0x69, 0xAF, 0x42, 0x8B, 0x48, 0x64,
+ 0xF7, 0xE9, 0xF3, 0x9C, 0x42, 0x18, 0x7B, 0x73},
+ {0x7A, 0x88, 0xFB, 0xEB, 0x90, 0xA4, 0xB4, 0xA8,
+ 0x43, 0xA3, 0x1D, 0xF1, 0x26, 0xC4, 0x53, 0x57}
+ };
+ static const byte test_decrypt[4][16] = {
+ {0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
+ 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF},
+ {0x0F, 0x1E, 0x2D, 0x3C, 0x4B, 0x5A, 0x69, 0x78,
+ 0x87, 0x96, 0xA5, 0xB4, 0xC3, 0xD2 ,0xE1, 0xF0},
+ {0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF,
+ 0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54 ,0x32, 0x10},
+ {0x01, 0x23, 0x45, 0x67, 0x76, 0x54 ,0x32, 0x10,
+ 0x89, 0xAB, 0xCD, 0xEF, 0xFE, 0xDC, 0xBA, 0x98}
+ };
+
+ /* Start the timer ticking. */
+ timer = clock ();
+
+ /* Encryption test. */
+ for (i = 0; i < 125; i++)
+ {
+ twofish_setkey (&ctx, buffer[0], sizeof (buffer[0]), &bulk_ops);
+ for (j = 0; j < 1000; j++)
+ twofish_encrypt (&ctx, buffer[2], buffer[2]);
+ twofish_setkey (&ctx, buffer[1], sizeof (buffer[1]), &bulk_ops);
+ for (j = 0; j < 1000; j++)
+ twofish_encrypt (&ctx, buffer[3], buffer[3]);
+ twofish_setkey (&ctx, buffer[2], sizeof (buffer[2])*2, &bulk_ops);
+ for (j = 0; j < 1000; j++) {
+ twofish_encrypt (&ctx, buffer[0], buffer[0]);
+ twofish_encrypt (&ctx, buffer[1], buffer[1]);
+ }
+ }
+ encrypt_msg = memcmp (buffer, test_encrypt, sizeof (test_encrypt)) ?
+ "encryption failure!\n" : "encryption OK!\n";
+
+ /* Decryption test. */
+ for (i = 0; i < 125; i++)
+ {
+ twofish_setkey (&ctx, buffer[2], sizeof (buffer[2])*2, &bulk_ops);
+ for (j = 0; j < 1000; j++) {
+ twofish_decrypt (&ctx, buffer[0], buffer[0]);
+ twofish_decrypt (&ctx, buffer[1], buffer[1]);
+ }
+ twofish_setkey (&ctx, buffer[1], sizeof (buffer[1]), &bulk_ops);
+ for (j = 0; j < 1000; j++)
+ twofish_decrypt (&ctx, buffer[3], buffer[3]);
+ twofish_setkey (&ctx, buffer[0], sizeof (buffer[0]), &bulk_ops);
+ for (j = 0; j < 1000; j++)
+ twofish_decrypt (&ctx, buffer[2], buffer[2]);
+ }
+
+ /* Stop the timer, and print results. */
+ timer = clock () - timer;
+ printf (encrypt_msg);
+ printf (memcmp (buffer, test_decrypt, sizeof (test_decrypt)) ?
+ "decryption failure!\n" : "decryption OK!\n");
+ printf ("elapsed time: %.1f s.\n", (float) timer / CLOCKS_PER_SEC);
+
+ return 0;
+}
+
+#endif /* TEST */
+
+
+
+gcry_cipher_spec_t _gcry_cipher_spec_twofish =
+ {
+ GCRY_CIPHER_TWOFISH, {0, 0},
+ "TWOFISH", NULL, NULL, 16, 256, sizeof (TWOFISH_context),
+ twofish_setkey, twofish_encrypt, twofish_decrypt
+ };
+
+gcry_cipher_spec_t _gcry_cipher_spec_twofish128 =
+ {
+ GCRY_CIPHER_TWOFISH128, {0, 0},
+ "TWOFISH128", NULL, NULL, 16, 128, sizeof (TWOFISH_context),
+ twofish_setkey, twofish_encrypt, twofish_decrypt
+ };
diff --git a/comm/third_party/libgcrypt/cipher/whirlpool-sse2-amd64.S b/comm/third_party/libgcrypt/cipher/whirlpool-sse2-amd64.S
new file mode 100644
index 0000000000..5631dc567a
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/whirlpool-sse2-amd64.S
@@ -0,0 +1,348 @@
+/* whirlpool-sse2-amd64.S - AMD64 assembly implementation of Whirlpool
+ *
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_WHIRLPOOL)
+
+#include "asm-common-amd64.h"
+
+.text
+
+/* look-up table offsets on RTAB */
+#define RC (0)
+#define C0 (RC + (8 * 10))
+#define C1 (C0 + (8 * 256))
+#define C2 (C1 + (8 * 256))
+#define C3 (C2 + (8 * 256))
+#define C4 (C3 + (8 * 256))
+#define C5 (C4 + (8 * 256))
+#define C6 (C5 + (8 * 256))
+#define C7 (C6 + (8 * 256))
+
+/* stack variables */
+#define STACK_DATAP (0)
+#define STACK_STATEP (STACK_DATAP + 8)
+#define STACK_ROUNDS (STACK_STATEP + 8)
+#define STACK_NBLKS (STACK_ROUNDS + 8)
+#define STACK_RBP (STACK_NBLKS + 8)
+#define STACK_RBX (STACK_RBP + 8)
+#define STACK_R12 (STACK_RBX + 8)
+#define STACK_R13 (STACK_R12 + 8)
+#define STACK_R14 (STACK_R13 + 8)
+#define STACK_R15 (STACK_R14 + 8)
+#define STACK_MAX (STACK_R15 + 8)
+
+/* register macros */
+#define RTAB %rbp
+
+#define RI1 %rax
+#define RI2 %rbx
+#define RI3 %rcx
+#define RI4 %rdx
+
+#define RI1d %eax
+#define RI2d %ebx
+#define RI3d %ecx
+#define RI4d %edx
+
+#define RI1bl %al
+#define RI2bl %bl
+#define RI3bl %cl
+#define RI4bl %dl
+
+#define RI1bh %ah
+#define RI2bh %bh
+#define RI3bh %ch
+#define RI4bh %dh
+
+#define RB0 %r8
+#define RB1 %r9
+#define RB2 %r10
+#define RB3 %r11
+#define RB4 %r12
+#define RB5 %r13
+#define RB6 %r14
+#define RB7 %r15
+
+#define RT0 %rsi
+#define RT1 %rdi
+
+#define RT0d %esi
+#define RT1d %edi
+
+#define XKEY0 %xmm0
+#define XKEY1 %xmm1
+#define XKEY2 %xmm2
+#define XKEY3 %xmm3
+#define XKEY4 %xmm4
+#define XKEY5 %xmm5
+#define XKEY6 %xmm6
+#define XKEY7 %xmm7
+
+#define XSTATE0 %xmm8
+#define XSTATE1 %xmm9
+#define XSTATE2 %xmm10
+#define XSTATE3 %xmm11
+#define XSTATE4 %xmm12
+#define XSTATE5 %xmm13
+#define XSTATE6 %xmm14
+#define XSTATE7 %xmm15
+
+/***********************************************************************
+ * AMD64 assembly implementation of Whirlpool.
+ * - Using table-lookups
+ * - Store state in XMM registers
+ ***********************************************************************/
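+
+/* __do_whirl consumes one 64-bit word held in register RI: each of its
+ * eight bytes indexes one of the lookup tables C0..C7 and the selected
+ * entries are combined via OP (mov or xor) into the accumulator registers
+ * b0..b7, while load_ri() fetches the next input word from load_arg.  */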
+#define __do_whirl(op, ri, \
+ b0, b1, b2, b3, b4, b5, b6, b7, \
+ load_ri, load_arg) \
+ movzbl ri ## bl, RT0d; \
+ movzbl ri ## bh, RT1d; \
+ shrq $16, ri; \
+ op ## q C7(RTAB,RT0,8), b7; \
+ op ## q C6(RTAB,RT1,8), b6; \
+ movzbl ri ## bl, RT0d; \
+ movzbl ri ## bh, RT1d; \
+ shrq $16, ri; \
+ op ## q C5(RTAB,RT0,8), b5; \
+ op ## q C4(RTAB,RT1,8), b4; \
+ movzbl ri ## bl, RT0d; \
+ movzbl ri ## bh, RT1d; \
+ shrl $16, ri ## d; \
+ op ## q C3(RTAB,RT0,8), b3; \
+ op ## q C2(RTAB,RT1,8), b2; \
+ movzbl ri ## bl, RT0d; \
+ movzbl ri ## bh, RT1d; \
+ load_ri( load_arg, ri); \
+ op ## q C1(RTAB,RT0,8), b1; \
+ op ## q C0(RTAB,RT1,8), b0;
+
+#define do_whirl(op, ri, rb_add, load_ri, load_arg) \
+ __do_whirl(op, ##ri, rb_add, load_ri, load_arg)
+
+#define dummy(...) /*_*/
+
+#define do_movq(src, dst) movq src, dst;
+
+#define RB_ADD0 RB0, RB1, RB2, RB3, RB4, RB5, RB6, RB7
+#define RB_ADD1 RB1, RB2, RB3, RB4, RB5, RB6, RB7, RB0
+#define RB_ADD2 RB2, RB3, RB4, RB5, RB6, RB7, RB0, RB1
+#define RB_ADD3 RB3, RB4, RB5, RB6, RB7, RB0, RB1, RB2
+#define RB_ADD4 RB4, RB5, RB6, RB7, RB0, RB1, RB2, RB3
+#define RB_ADD5 RB5, RB6, RB7, RB0, RB1, RB2, RB3, RB4
+#define RB_ADD6 RB6, RB7, RB0, RB1, RB2, RB3, RB4, RB5
+#define RB_ADD7 RB7, RB0, RB1, RB2, RB3, RB4, RB5, RB6
+
+.align 8
+.globl _gcry_whirlpool_transform_amd64
+ELF(.type _gcry_whirlpool_transform_amd64,@function;)
+
+_gcry_whirlpool_transform_amd64:
+ /* input:
+ * %rdi: state
+ * %rsi: inblk
+ * %rdx: nblks
+ * %rcx: look-up tables
+ */
+ CFI_STARTPROC();
+ cmp $0, %rdx;
+ je .Lskip;
+
+ subq $STACK_MAX, %rsp;
+ CFI_ADJUST_CFA_OFFSET(STACK_MAX);
+ movq %rbp, STACK_RBP(%rsp);
+ movq %rbx, STACK_RBX(%rsp);
+ movq %r12, STACK_R12(%rsp);
+ movq %r13, STACK_R13(%rsp);
+ movq %r14, STACK_R14(%rsp);
+ movq %r15, STACK_R15(%rsp);
+ CFI_REL_OFFSET(%rbp, STACK_RBP);
+ CFI_REL_OFFSET(%rbx, STACK_RBX);
+ CFI_REL_OFFSET(%r12, STACK_R12);
+ CFI_REL_OFFSET(%r13, STACK_R13);
+ CFI_REL_OFFSET(%r14, STACK_R14);
+ CFI_REL_OFFSET(%r15, STACK_R15);
+
+ movq %rdx, STACK_NBLKS(%rsp);
+ movq %rdi, STACK_STATEP(%rsp);
+ movq %rsi, STACK_DATAP(%rsp);
+
+ movq %rcx, RTAB;
+
+ jmp .Lfirst_block;
+
+.align 8
+.Lblock_loop:
+ movq STACK_DATAP(%rsp), %rsi;
+ movq RI1, %rdi;
+
+.Lfirst_block:
+ /* load data_block */
+ movq 0*8(%rsi), RB0;
+ movq 1*8(%rsi), RB1;
+ bswapq RB0;
+ movq 2*8(%rsi), RB2;
+ bswapq RB1;
+ movq 3*8(%rsi), RB3;
+ bswapq RB2;
+ movq 4*8(%rsi), RB4;
+ bswapq RB3;
+ movq 5*8(%rsi), RB5;
+ bswapq RB4;
+ movq RB0, XSTATE0;
+ movq 6*8(%rsi), RB6;
+ bswapq RB5;
+ movq RB1, XSTATE1;
+ movq 7*8(%rsi), RB7;
+ bswapq RB6;
+ movq RB2, XSTATE2;
+ bswapq RB7;
+ movq RB3, XSTATE3;
+ movq RB4, XSTATE4;
+ movq RB5, XSTATE5;
+ movq RB6, XSTATE6;
+ movq RB7, XSTATE7;
+
+ /* load key */
+ movq 0*8(%rdi), XKEY0;
+ movq 1*8(%rdi), XKEY1;
+ movq 2*8(%rdi), XKEY2;
+ movq 3*8(%rdi), XKEY3;
+ movq 4*8(%rdi), XKEY4;
+ movq 5*8(%rdi), XKEY5;
+ movq 6*8(%rdi), XKEY6;
+ movq 7*8(%rdi), XKEY7;
+
+ movq XKEY0, RI1;
+ movq XKEY1, RI2;
+ movq XKEY2, RI3;
+ movq XKEY3, RI4;
+
+ /* prepare and store state */
+ pxor XKEY0, XSTATE0;
+ pxor XKEY1, XSTATE1;
+ pxor XKEY2, XSTATE2;
+ pxor XKEY3, XSTATE3;
+ pxor XKEY4, XSTATE4;
+ pxor XKEY5, XSTATE5;
+ pxor XKEY6, XSTATE6;
+ pxor XKEY7, XSTATE7;
+
+ movq XSTATE0, 0*8(%rdi);
+ movq XSTATE1, 1*8(%rdi);
+ movq XSTATE2, 2*8(%rdi);
+ movq XSTATE3, 3*8(%rdi);
+ movq XSTATE4, 4*8(%rdi);
+ movq XSTATE5, 5*8(%rdi);
+ movq XSTATE6, 6*8(%rdi);
+ movq XSTATE7, 7*8(%rdi);
+
+ addq $64, STACK_DATAP(%rsp);
+ movl $(0), STACK_ROUNDS(%rsp);
+.align 8
+.Lround_loop:
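+	/* The first eight do_whirl calls derive the next round key from the
+	 * current one (the round constant is added below); the following
+	 * eight apply the round function to the state, xoring in that key.  */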
+ do_whirl(mov, RI1 /*XKEY0*/, RB_ADD0, do_movq, XKEY4);
+ do_whirl(xor, RI2 /*XKEY1*/, RB_ADD1, do_movq, XKEY5);
+ do_whirl(xor, RI3 /*XKEY2*/, RB_ADD2, do_movq, XKEY6);
+ do_whirl(xor, RI4 /*XKEY3*/, RB_ADD3, do_movq, XKEY7);
+ do_whirl(xor, RI1 /*XKEY0*/, RB_ADD4, do_movq, XSTATE0);
+ do_whirl(xor, RI2 /*XKEY1*/, RB_ADD5, do_movq, XSTATE1);
+ do_whirl(xor, RI3 /*XKEY2*/, RB_ADD6, do_movq, XSTATE2);
+ do_whirl(xor, RI4 /*XKEY3*/, RB_ADD7, do_movq, XSTATE3);
+
+ movl STACK_ROUNDS(%rsp), RT0d;
+ movq RB1, XKEY1;
+ addl $1, STACK_ROUNDS(%rsp);
+ movq RB2, XKEY2;
+ movq RB3, XKEY3;
+ xorq RC(RTAB,RT0,8), RB0; /* Add round constant */
+ movq RB4, XKEY4;
+ movq RB5, XKEY5;
+ movq RB0, XKEY0;
+ movq RB6, XKEY6;
+ movq RB7, XKEY7;
+
+ do_whirl(xor, RI1 /*XSTATE0*/, RB_ADD0, do_movq, XSTATE4);
+ do_whirl(xor, RI2 /*XSTATE1*/, RB_ADD1, do_movq, XSTATE5);
+ do_whirl(xor, RI3 /*XSTATE2*/, RB_ADD2, do_movq, XSTATE6);
+ do_whirl(xor, RI4 /*XSTATE3*/, RB_ADD3, do_movq, XSTATE7);
+
+ cmpl $10, STACK_ROUNDS(%rsp);
+ je .Lis_last_round;
+
+ do_whirl(xor, RI1 /*XSTATE4*/, RB_ADD4, do_movq, XKEY0);
+ do_whirl(xor, RI2 /*XSTATE5*/, RB_ADD5, do_movq, XKEY1);
+ do_whirl(xor, RI3 /*XSTATE6*/, RB_ADD6, do_movq, XKEY2);
+ do_whirl(xor, RI4 /*XSTATE7*/, RB_ADD7, do_movq, XKEY3);
+ movq RB0, XSTATE0;
+ movq RB1, XSTATE1;
+ movq RB2, XSTATE2;
+ movq RB3, XSTATE3;
+ movq RB4, XSTATE4;
+ movq RB5, XSTATE5;
+ movq RB6, XSTATE6;
+ movq RB7, XSTATE7;
+
+ jmp .Lround_loop;
+.align 8
+.Lis_last_round:
+ do_whirl(xor, RI1 /*XSTATE4*/, RB_ADD4, dummy, _);
+ movq STACK_STATEP(%rsp), RI1;
+ do_whirl(xor, RI2 /*XSTATE5*/, RB_ADD5, dummy, _);
+ do_whirl(xor, RI3 /*XSTATE6*/, RB_ADD6, dummy, _);
+ do_whirl(xor, RI4 /*XSTATE7*/, RB_ADD7, dummy, _);
+
+ /* store state */
+ xorq RB0, 0*8(RI1);
+ xorq RB1, 1*8(RI1);
+ xorq RB2, 2*8(RI1);
+ xorq RB3, 3*8(RI1);
+ xorq RB4, 4*8(RI1);
+ xorq RB5, 5*8(RI1);
+ xorq RB6, 6*8(RI1);
+ xorq RB7, 7*8(RI1);
+
+ subq $1, STACK_NBLKS(%rsp);
+ jnz .Lblock_loop;
+
+ movq STACK_RBP(%rsp), %rbp;
+ movq STACK_RBX(%rsp), %rbx;
+ movq STACK_R12(%rsp), %r12;
+ movq STACK_R13(%rsp), %r13;
+ movq STACK_R14(%rsp), %r14;
+ movq STACK_R15(%rsp), %r15;
+ CFI_RESTORE(%rbp);
+ CFI_RESTORE(%rbx);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+ CFI_RESTORE(%r14);
+ CFI_RESTORE(%r15);
+ addq $STACK_MAX, %rsp;
+ CFI_ADJUST_CFA_OFFSET(-STACK_MAX);
+.Lskip:
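+	/* Return stack depth used (for burn_stack). */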
+ movl $(STACK_MAX + 8), %eax;
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_whirlpool_transform_amd64,.-_gcry_whirlpool_transform_amd64;)
+
+#endif
+#endif
diff --git a/comm/third_party/libgcrypt/cipher/whirlpool.c b/comm/third_party/libgcrypt/cipher/whirlpool.c
new file mode 100644
index 0000000000..79b2026b57
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/whirlpool.c
@@ -0,0 +1,1535 @@
+/* whirlpool.c - Whirlpool hashing algorithm
+ * Copyright (C) 2005 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* This is an implementation of the Whirlpool hashing algorithm, which
+   has been developed by Vincent Rijmen and Paulo S. L. M. Barreto;
+   its homepage is located at:
+   http://www.larc.usp.br/~pbarreto/WhirlpoolPage.html
+
+   The S-Boxes and the structure of the main transformation function,
+   which implements an optimized version of the algorithm, are taken
+   from the reference implementation available from
+   http://www.larc.usp.br/~pbarreto/whirlpool.zip
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "types.h"
+#include "g10lib.h"
+#include "cipher.h"
+
+#include "bufhelp.h"
+#include "hash-common.h"
+
+/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
+#undef USE_AMD64_ASM
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AMD64_ASM 1
+#endif
+
+
+
+/* Size of a whirlpool block (in bytes). */
+#define BLOCK_SIZE 64
+
+/* Number of rounds. */
+#define R 10
+
+
+
+/* Types. */
+typedef u64 whirlpool_block_t[BLOCK_SIZE / 8];
+
+typedef struct {
+ gcry_md_block_ctx_t bctx;
+ whirlpool_block_t hash_state;
+ int use_bugemu;
+ struct {
+ size_t count;
+ unsigned char length[32];
+ } bugemu;
+} whirlpool_context_t;
+
+
+
+/* Macros. */
+
+/* Convert the buffer BUFFER into a block BLOCK, using I as
+ counter. */
+#define buffer_to_block(buffer, block, i) \
+ for (i = 0; i < 8; i++) \
+ (block)[i] = buf_get_be64((buffer) + i * 8);
+
+/* Convert the block BLOCK into a buffer BUFFER, using I as
+ counter. */
+#define block_to_buffer(buffer, block, i) \
+ for (i = 0; i < 8; i++) \
+ buf_put_be64((buffer) + i * 8, (block)[i]);
+
+/* Copy the block BLOCK_SRC to BLOCK_DST, using I as counter. */
+#define block_copy(block_dst, block_src, i) \
+ for (i = 0; i < 8; i++) \
+ block_dst[i] = block_src[i];
+
+/* XOR the block BLOCK_SRC into BLOCK_DST, using I as counter. */
+#define block_xor(block_dst, block_src, i) \
+ for (i = 0; i < 8; i++) \
+ block_dst[i] ^= block_src[i];
+
+
+
+
+struct whirlpool_tables_s {
+ u64 RC[R];
+ u64 C[8][256];
+};
+
+static const struct whirlpool_tables_s tab =
+{
+/* Round constants. */
+ {
+ U64_C (0x1823c6e887b8014f),
+ U64_C (0x36a6d2f5796f9152),
+ U64_C (0x60bc9b8ea30c7b35),
+ U64_C (0x1de0d7c22e4bfe57),
+ U64_C (0x157737e59ff04ada),
+ U64_C (0x58c9290ab1a06b85),
+ U64_C (0xbd5d10f4cb3e0567),
+ U64_C (0xe427418ba77d95d8),
+ U64_C (0xfbee7c66dd17479e),
+ U64_C (0xca2dbf07ad5a8333),
+ },
+/* Main lookup boxes. */
+ { {
+ U64_C (0x18186018c07830d8), U64_C (0x23238c2305af4626),
+ U64_C (0xc6c63fc67ef991b8), U64_C (0xe8e887e8136fcdfb),
+ U64_C (0x878726874ca113cb), U64_C (0xb8b8dab8a9626d11),
+ U64_C (0x0101040108050209), U64_C (0x4f4f214f426e9e0d),
+ U64_C (0x3636d836adee6c9b), U64_C (0xa6a6a2a6590451ff),
+ U64_C (0xd2d26fd2debdb90c), U64_C (0xf5f5f3f5fb06f70e),
+ U64_C (0x7979f979ef80f296), U64_C (0x6f6fa16f5fcede30),
+ U64_C (0x91917e91fcef3f6d), U64_C (0x52525552aa07a4f8),
+ U64_C (0x60609d6027fdc047), U64_C (0xbcbccabc89766535),
+ U64_C (0x9b9b569baccd2b37), U64_C (0x8e8e028e048c018a),
+ U64_C (0xa3a3b6a371155bd2), U64_C (0x0c0c300c603c186c),
+ U64_C (0x7b7bf17bff8af684), U64_C (0x3535d435b5e16a80),
+ U64_C (0x1d1d741de8693af5), U64_C (0xe0e0a7e05347ddb3),
+ U64_C (0xd7d77bd7f6acb321), U64_C (0xc2c22fc25eed999c),
+ U64_C (0x2e2eb82e6d965c43), U64_C (0x4b4b314b627a9629),
+ U64_C (0xfefedffea321e15d), U64_C (0x575741578216aed5),
+ U64_C (0x15155415a8412abd), U64_C (0x7777c1779fb6eee8),
+ U64_C (0x3737dc37a5eb6e92), U64_C (0xe5e5b3e57b56d79e),
+ U64_C (0x9f9f469f8cd92313), U64_C (0xf0f0e7f0d317fd23),
+ U64_C (0x4a4a354a6a7f9420), U64_C (0xdada4fda9e95a944),
+ U64_C (0x58587d58fa25b0a2), U64_C (0xc9c903c906ca8fcf),
+ U64_C (0x2929a429558d527c), U64_C (0x0a0a280a5022145a),
+ U64_C (0xb1b1feb1e14f7f50), U64_C (0xa0a0baa0691a5dc9),
+ U64_C (0x6b6bb16b7fdad614), U64_C (0x85852e855cab17d9),
+ U64_C (0xbdbdcebd8173673c), U64_C (0x5d5d695dd234ba8f),
+ U64_C (0x1010401080502090), U64_C (0xf4f4f7f4f303f507),
+ U64_C (0xcbcb0bcb16c08bdd), U64_C (0x3e3ef83eedc67cd3),
+ U64_C (0x0505140528110a2d), U64_C (0x676781671fe6ce78),
+ U64_C (0xe4e4b7e47353d597), U64_C (0x27279c2725bb4e02),
+ U64_C (0x4141194132588273), U64_C (0x8b8b168b2c9d0ba7),
+ U64_C (0xa7a7a6a7510153f6), U64_C (0x7d7de97dcf94fab2),
+ U64_C (0x95956e95dcfb3749), U64_C (0xd8d847d88e9fad56),
+ U64_C (0xfbfbcbfb8b30eb70), U64_C (0xeeee9fee2371c1cd),
+ U64_C (0x7c7ced7cc791f8bb), U64_C (0x6666856617e3cc71),
+ U64_C (0xdddd53dda68ea77b), U64_C (0x17175c17b84b2eaf),
+ U64_C (0x4747014702468e45), U64_C (0x9e9e429e84dc211a),
+ U64_C (0xcaca0fca1ec589d4), U64_C (0x2d2db42d75995a58),
+ U64_C (0xbfbfc6bf9179632e), U64_C (0x07071c07381b0e3f),
+ U64_C (0xadad8ead012347ac), U64_C (0x5a5a755aea2fb4b0),
+ U64_C (0x838336836cb51bef), U64_C (0x3333cc3385ff66b6),
+ U64_C (0x636391633ff2c65c), U64_C (0x02020802100a0412),
+ U64_C (0xaaaa92aa39384993), U64_C (0x7171d971afa8e2de),
+ U64_C (0xc8c807c80ecf8dc6), U64_C (0x19196419c87d32d1),
+ U64_C (0x494939497270923b), U64_C (0xd9d943d9869aaf5f),
+ U64_C (0xf2f2eff2c31df931), U64_C (0xe3e3abe34b48dba8),
+ U64_C (0x5b5b715be22ab6b9), U64_C (0x88881a8834920dbc),
+ U64_C (0x9a9a529aa4c8293e), U64_C (0x262698262dbe4c0b),
+ U64_C (0x3232c8328dfa64bf), U64_C (0xb0b0fab0e94a7d59),
+ U64_C (0xe9e983e91b6acff2), U64_C (0x0f0f3c0f78331e77),
+ U64_C (0xd5d573d5e6a6b733), U64_C (0x80803a8074ba1df4),
+ U64_C (0xbebec2be997c6127), U64_C (0xcdcd13cd26de87eb),
+ U64_C (0x3434d034bde46889), U64_C (0x48483d487a759032),
+ U64_C (0xffffdbffab24e354), U64_C (0x7a7af57af78ff48d),
+ U64_C (0x90907a90f4ea3d64), U64_C (0x5f5f615fc23ebe9d),
+ U64_C (0x202080201da0403d), U64_C (0x6868bd6867d5d00f),
+ U64_C (0x1a1a681ad07234ca), U64_C (0xaeae82ae192c41b7),
+ U64_C (0xb4b4eab4c95e757d), U64_C (0x54544d549a19a8ce),
+ U64_C (0x93937693ece53b7f), U64_C (0x222288220daa442f),
+ U64_C (0x64648d6407e9c863), U64_C (0xf1f1e3f1db12ff2a),
+ U64_C (0x7373d173bfa2e6cc), U64_C (0x12124812905a2482),
+ U64_C (0x40401d403a5d807a), U64_C (0x0808200840281048),
+ U64_C (0xc3c32bc356e89b95), U64_C (0xecec97ec337bc5df),
+ U64_C (0xdbdb4bdb9690ab4d), U64_C (0xa1a1bea1611f5fc0),
+ U64_C (0x8d8d0e8d1c830791), U64_C (0x3d3df43df5c97ac8),
+ U64_C (0x97976697ccf1335b), U64_C (0x0000000000000000),
+ U64_C (0xcfcf1bcf36d483f9), U64_C (0x2b2bac2b4587566e),
+ U64_C (0x7676c57697b3ece1), U64_C (0x8282328264b019e6),
+ U64_C (0xd6d67fd6fea9b128), U64_C (0x1b1b6c1bd87736c3),
+ U64_C (0xb5b5eeb5c15b7774), U64_C (0xafaf86af112943be),
+ U64_C (0x6a6ab56a77dfd41d), U64_C (0x50505d50ba0da0ea),
+ U64_C (0x45450945124c8a57), U64_C (0xf3f3ebf3cb18fb38),
+ U64_C (0x3030c0309df060ad), U64_C (0xefef9bef2b74c3c4),
+ U64_C (0x3f3ffc3fe5c37eda), U64_C (0x55554955921caac7),
+ U64_C (0xa2a2b2a2791059db), U64_C (0xeaea8fea0365c9e9),
+ U64_C (0x656589650fecca6a), U64_C (0xbabad2bab9686903),
+ U64_C (0x2f2fbc2f65935e4a), U64_C (0xc0c027c04ee79d8e),
+ U64_C (0xdede5fdebe81a160), U64_C (0x1c1c701ce06c38fc),
+ U64_C (0xfdfdd3fdbb2ee746), U64_C (0x4d4d294d52649a1f),
+ U64_C (0x92927292e4e03976), U64_C (0x7575c9758fbceafa),
+ U64_C (0x06061806301e0c36), U64_C (0x8a8a128a249809ae),
+ U64_C (0xb2b2f2b2f940794b), U64_C (0xe6e6bfe66359d185),
+ U64_C (0x0e0e380e70361c7e), U64_C (0x1f1f7c1ff8633ee7),
+ U64_C (0x6262956237f7c455), U64_C (0xd4d477d4eea3b53a),
+ U64_C (0xa8a89aa829324d81), U64_C (0x96966296c4f43152),
+ U64_C (0xf9f9c3f99b3aef62), U64_C (0xc5c533c566f697a3),
+ U64_C (0x2525942535b14a10), U64_C (0x59597959f220b2ab),
+ U64_C (0x84842a8454ae15d0), U64_C (0x7272d572b7a7e4c5),
+ U64_C (0x3939e439d5dd72ec), U64_C (0x4c4c2d4c5a619816),
+ U64_C (0x5e5e655eca3bbc94), U64_C (0x7878fd78e785f09f),
+ U64_C (0x3838e038ddd870e5), U64_C (0x8c8c0a8c14860598),
+ U64_C (0xd1d163d1c6b2bf17), U64_C (0xa5a5aea5410b57e4),
+ U64_C (0xe2e2afe2434dd9a1), U64_C (0x616199612ff8c24e),
+ U64_C (0xb3b3f6b3f1457b42), U64_C (0x2121842115a54234),
+ U64_C (0x9c9c4a9c94d62508), U64_C (0x1e1e781ef0663cee),
+ U64_C (0x4343114322528661), U64_C (0xc7c73bc776fc93b1),
+ U64_C (0xfcfcd7fcb32be54f), U64_C (0x0404100420140824),
+ U64_C (0x51515951b208a2e3), U64_C (0x99995e99bcc72f25),
+ U64_C (0x6d6da96d4fc4da22), U64_C (0x0d0d340d68391a65),
+ U64_C (0xfafacffa8335e979), U64_C (0xdfdf5bdfb684a369),
+ U64_C (0x7e7ee57ed79bfca9), U64_C (0x242490243db44819),
+ U64_C (0x3b3bec3bc5d776fe), U64_C (0xabab96ab313d4b9a),
+ U64_C (0xcece1fce3ed181f0), U64_C (0x1111441188552299),
+ U64_C (0x8f8f068f0c890383), U64_C (0x4e4e254e4a6b9c04),
+ U64_C (0xb7b7e6b7d1517366), U64_C (0xebeb8beb0b60cbe0),
+ U64_C (0x3c3cf03cfdcc78c1), U64_C (0x81813e817cbf1ffd),
+ U64_C (0x94946a94d4fe3540), U64_C (0xf7f7fbf7eb0cf31c),
+ U64_C (0xb9b9deb9a1676f18), U64_C (0x13134c13985f268b),
+ U64_C (0x2c2cb02c7d9c5851), U64_C (0xd3d36bd3d6b8bb05),
+ U64_C (0xe7e7bbe76b5cd38c), U64_C (0x6e6ea56e57cbdc39),
+ U64_C (0xc4c437c46ef395aa), U64_C (0x03030c03180f061b),
+ U64_C (0x565645568a13acdc), U64_C (0x44440d441a49885e),
+ U64_C (0x7f7fe17fdf9efea0), U64_C (0xa9a99ea921374f88),
+ U64_C (0x2a2aa82a4d825467), U64_C (0xbbbbd6bbb16d6b0a),
+ U64_C (0xc1c123c146e29f87), U64_C (0x53535153a202a6f1),
+ U64_C (0xdcdc57dcae8ba572), U64_C (0x0b0b2c0b58271653),
+ U64_C (0x9d9d4e9d9cd32701), U64_C (0x6c6cad6c47c1d82b),
+ U64_C (0x3131c43195f562a4), U64_C (0x7474cd7487b9e8f3),
+ U64_C (0xf6f6fff6e309f115), U64_C (0x464605460a438c4c),
+ U64_C (0xacac8aac092645a5), U64_C (0x89891e893c970fb5),
+ U64_C (0x14145014a04428b4), U64_C (0xe1e1a3e15b42dfba),
+ U64_C (0x16165816b04e2ca6), U64_C (0x3a3ae83acdd274f7),
+ U64_C (0x6969b9696fd0d206), U64_C (0x09092409482d1241),
+ U64_C (0x7070dd70a7ade0d7), U64_C (0xb6b6e2b6d954716f),
+ U64_C (0xd0d067d0ceb7bd1e), U64_C (0xeded93ed3b7ec7d6),
+ U64_C (0xcccc17cc2edb85e2), U64_C (0x424215422a578468),
+ U64_C (0x98985a98b4c22d2c), U64_C (0xa4a4aaa4490e55ed),
+ U64_C (0x2828a0285d885075), U64_C (0x5c5c6d5cda31b886),
+ U64_C (0xf8f8c7f8933fed6b), U64_C (0x8686228644a411c2),
+ }, {
+ U64_C (0xd818186018c07830), U64_C (0x2623238c2305af46),
+ U64_C (0xb8c6c63fc67ef991), U64_C (0xfbe8e887e8136fcd),
+ U64_C (0xcb878726874ca113), U64_C (0x11b8b8dab8a9626d),
+ U64_C (0x0901010401080502), U64_C (0x0d4f4f214f426e9e),
+ U64_C (0x9b3636d836adee6c), U64_C (0xffa6a6a2a6590451),
+ U64_C (0x0cd2d26fd2debdb9), U64_C (0x0ef5f5f3f5fb06f7),
+ U64_C (0x967979f979ef80f2), U64_C (0x306f6fa16f5fcede),
+ U64_C (0x6d91917e91fcef3f), U64_C (0xf852525552aa07a4),
+ U64_C (0x4760609d6027fdc0), U64_C (0x35bcbccabc897665),
+ U64_C (0x379b9b569baccd2b), U64_C (0x8a8e8e028e048c01),
+ U64_C (0xd2a3a3b6a371155b), U64_C (0x6c0c0c300c603c18),
+ U64_C (0x847b7bf17bff8af6), U64_C (0x803535d435b5e16a),
+ U64_C (0xf51d1d741de8693a), U64_C (0xb3e0e0a7e05347dd),
+ U64_C (0x21d7d77bd7f6acb3), U64_C (0x9cc2c22fc25eed99),
+ U64_C (0x432e2eb82e6d965c), U64_C (0x294b4b314b627a96),
+ U64_C (0x5dfefedffea321e1), U64_C (0xd5575741578216ae),
+ U64_C (0xbd15155415a8412a), U64_C (0xe87777c1779fb6ee),
+ U64_C (0x923737dc37a5eb6e), U64_C (0x9ee5e5b3e57b56d7),
+ U64_C (0x139f9f469f8cd923), U64_C (0x23f0f0e7f0d317fd),
+ U64_C (0x204a4a354a6a7f94), U64_C (0x44dada4fda9e95a9),
+ U64_C (0xa258587d58fa25b0), U64_C (0xcfc9c903c906ca8f),
+ U64_C (0x7c2929a429558d52), U64_C (0x5a0a0a280a502214),
+ U64_C (0x50b1b1feb1e14f7f), U64_C (0xc9a0a0baa0691a5d),
+ U64_C (0x146b6bb16b7fdad6), U64_C (0xd985852e855cab17),
+ U64_C (0x3cbdbdcebd817367), U64_C (0x8f5d5d695dd234ba),
+ U64_C (0x9010104010805020), U64_C (0x07f4f4f7f4f303f5),
+ U64_C (0xddcbcb0bcb16c08b), U64_C (0xd33e3ef83eedc67c),
+ U64_C (0x2d0505140528110a), U64_C (0x78676781671fe6ce),
+ U64_C (0x97e4e4b7e47353d5), U64_C (0x0227279c2725bb4e),
+ U64_C (0x7341411941325882), U64_C (0xa78b8b168b2c9d0b),
+ U64_C (0xf6a7a7a6a7510153), U64_C (0xb27d7de97dcf94fa),
+ U64_C (0x4995956e95dcfb37), U64_C (0x56d8d847d88e9fad),
+ U64_C (0x70fbfbcbfb8b30eb), U64_C (0xcdeeee9fee2371c1),
+ U64_C (0xbb7c7ced7cc791f8), U64_C (0x716666856617e3cc),
+ U64_C (0x7bdddd53dda68ea7), U64_C (0xaf17175c17b84b2e),
+ U64_C (0x454747014702468e), U64_C (0x1a9e9e429e84dc21),
+ U64_C (0xd4caca0fca1ec589), U64_C (0x582d2db42d75995a),
+ U64_C (0x2ebfbfc6bf917963), U64_C (0x3f07071c07381b0e),
+ U64_C (0xacadad8ead012347), U64_C (0xb05a5a755aea2fb4),
+ U64_C (0xef838336836cb51b), U64_C (0xb63333cc3385ff66),
+ U64_C (0x5c636391633ff2c6), U64_C (0x1202020802100a04),
+ U64_C (0x93aaaa92aa393849), U64_C (0xde7171d971afa8e2),
+ U64_C (0xc6c8c807c80ecf8d), U64_C (0xd119196419c87d32),
+ U64_C (0x3b49493949727092), U64_C (0x5fd9d943d9869aaf),
+ U64_C (0x31f2f2eff2c31df9), U64_C (0xa8e3e3abe34b48db),
+ U64_C (0xb95b5b715be22ab6), U64_C (0xbc88881a8834920d),
+ U64_C (0x3e9a9a529aa4c829), U64_C (0x0b262698262dbe4c),
+ U64_C (0xbf3232c8328dfa64), U64_C (0x59b0b0fab0e94a7d),
+ U64_C (0xf2e9e983e91b6acf), U64_C (0x770f0f3c0f78331e),
+ U64_C (0x33d5d573d5e6a6b7), U64_C (0xf480803a8074ba1d),
+ U64_C (0x27bebec2be997c61), U64_C (0xebcdcd13cd26de87),
+ U64_C (0x893434d034bde468), U64_C (0x3248483d487a7590),
+ U64_C (0x54ffffdbffab24e3), U64_C (0x8d7a7af57af78ff4),
+ U64_C (0x6490907a90f4ea3d), U64_C (0x9d5f5f615fc23ebe),
+ U64_C (0x3d202080201da040), U64_C (0x0f6868bd6867d5d0),
+ U64_C (0xca1a1a681ad07234), U64_C (0xb7aeae82ae192c41),
+ U64_C (0x7db4b4eab4c95e75), U64_C (0xce54544d549a19a8),
+ U64_C (0x7f93937693ece53b), U64_C (0x2f222288220daa44),
+ U64_C (0x6364648d6407e9c8), U64_C (0x2af1f1e3f1db12ff),
+ U64_C (0xcc7373d173bfa2e6), U64_C (0x8212124812905a24),
+ U64_C (0x7a40401d403a5d80), U64_C (0x4808082008402810),
+ U64_C (0x95c3c32bc356e89b), U64_C (0xdfecec97ec337bc5),
+ U64_C (0x4ddbdb4bdb9690ab), U64_C (0xc0a1a1bea1611f5f),
+ U64_C (0x918d8d0e8d1c8307), U64_C (0xc83d3df43df5c97a),
+ U64_C (0x5b97976697ccf133), U64_C (0x0000000000000000),
+ U64_C (0xf9cfcf1bcf36d483), U64_C (0x6e2b2bac2b458756),
+ U64_C (0xe17676c57697b3ec), U64_C (0xe68282328264b019),
+ U64_C (0x28d6d67fd6fea9b1), U64_C (0xc31b1b6c1bd87736),
+ U64_C (0x74b5b5eeb5c15b77), U64_C (0xbeafaf86af112943),
+ U64_C (0x1d6a6ab56a77dfd4), U64_C (0xea50505d50ba0da0),
+ U64_C (0x5745450945124c8a), U64_C (0x38f3f3ebf3cb18fb),
+ U64_C (0xad3030c0309df060), U64_C (0xc4efef9bef2b74c3),
+ U64_C (0xda3f3ffc3fe5c37e), U64_C (0xc755554955921caa),
+ U64_C (0xdba2a2b2a2791059), U64_C (0xe9eaea8fea0365c9),
+ U64_C (0x6a656589650fecca), U64_C (0x03babad2bab96869),
+ U64_C (0x4a2f2fbc2f65935e), U64_C (0x8ec0c027c04ee79d),
+ U64_C (0x60dede5fdebe81a1), U64_C (0xfc1c1c701ce06c38),
+ U64_C (0x46fdfdd3fdbb2ee7), U64_C (0x1f4d4d294d52649a),
+ U64_C (0x7692927292e4e039), U64_C (0xfa7575c9758fbcea),
+ U64_C (0x3606061806301e0c), U64_C (0xae8a8a128a249809),
+ U64_C (0x4bb2b2f2b2f94079), U64_C (0x85e6e6bfe66359d1),
+ U64_C (0x7e0e0e380e70361c), U64_C (0xe71f1f7c1ff8633e),
+ U64_C (0x556262956237f7c4), U64_C (0x3ad4d477d4eea3b5),
+ U64_C (0x81a8a89aa829324d), U64_C (0x5296966296c4f431),
+ U64_C (0x62f9f9c3f99b3aef), U64_C (0xa3c5c533c566f697),
+ U64_C (0x102525942535b14a), U64_C (0xab59597959f220b2),
+ U64_C (0xd084842a8454ae15), U64_C (0xc57272d572b7a7e4),
+ U64_C (0xec3939e439d5dd72), U64_C (0x164c4c2d4c5a6198),
+ U64_C (0x945e5e655eca3bbc), U64_C (0x9f7878fd78e785f0),
+ U64_C (0xe53838e038ddd870), U64_C (0x988c8c0a8c148605),
+ U64_C (0x17d1d163d1c6b2bf), U64_C (0xe4a5a5aea5410b57),
+ U64_C (0xa1e2e2afe2434dd9), U64_C (0x4e616199612ff8c2),
+ U64_C (0x42b3b3f6b3f1457b), U64_C (0x342121842115a542),
+ U64_C (0x089c9c4a9c94d625), U64_C (0xee1e1e781ef0663c),
+ U64_C (0x6143431143225286), U64_C (0xb1c7c73bc776fc93),
+ U64_C (0x4ffcfcd7fcb32be5), U64_C (0x2404041004201408),
+ U64_C (0xe351515951b208a2), U64_C (0x2599995e99bcc72f),
+ U64_C (0x226d6da96d4fc4da), U64_C (0x650d0d340d68391a),
+ U64_C (0x79fafacffa8335e9), U64_C (0x69dfdf5bdfb684a3),
+ U64_C (0xa97e7ee57ed79bfc), U64_C (0x19242490243db448),
+ U64_C (0xfe3b3bec3bc5d776), U64_C (0x9aabab96ab313d4b),
+ U64_C (0xf0cece1fce3ed181), U64_C (0x9911114411885522),
+ U64_C (0x838f8f068f0c8903), U64_C (0x044e4e254e4a6b9c),
+ U64_C (0x66b7b7e6b7d15173), U64_C (0xe0ebeb8beb0b60cb),
+ U64_C (0xc13c3cf03cfdcc78), U64_C (0xfd81813e817cbf1f),
+ U64_C (0x4094946a94d4fe35), U64_C (0x1cf7f7fbf7eb0cf3),
+ U64_C (0x18b9b9deb9a1676f), U64_C (0x8b13134c13985f26),
+ U64_C (0x512c2cb02c7d9c58), U64_C (0x05d3d36bd3d6b8bb),
+ U64_C (0x8ce7e7bbe76b5cd3), U64_C (0x396e6ea56e57cbdc),
+ U64_C (0xaac4c437c46ef395), U64_C (0x1b03030c03180f06),
+ U64_C (0xdc565645568a13ac), U64_C (0x5e44440d441a4988),
+ U64_C (0xa07f7fe17fdf9efe), U64_C (0x88a9a99ea921374f),
+ U64_C (0x672a2aa82a4d8254), U64_C (0x0abbbbd6bbb16d6b),
+ U64_C (0x87c1c123c146e29f), U64_C (0xf153535153a202a6),
+ U64_C (0x72dcdc57dcae8ba5), U64_C (0x530b0b2c0b582716),
+ U64_C (0x019d9d4e9d9cd327), U64_C (0x2b6c6cad6c47c1d8),
+ U64_C (0xa43131c43195f562), U64_C (0xf37474cd7487b9e8),
+ U64_C (0x15f6f6fff6e309f1), U64_C (0x4c464605460a438c),
+ U64_C (0xa5acac8aac092645), U64_C (0xb589891e893c970f),
+ U64_C (0xb414145014a04428), U64_C (0xbae1e1a3e15b42df),
+ U64_C (0xa616165816b04e2c), U64_C (0xf73a3ae83acdd274),
+ U64_C (0x066969b9696fd0d2), U64_C (0x4109092409482d12),
+ U64_C (0xd77070dd70a7ade0), U64_C (0x6fb6b6e2b6d95471),
+ U64_C (0x1ed0d067d0ceb7bd), U64_C (0xd6eded93ed3b7ec7),
+ U64_C (0xe2cccc17cc2edb85), U64_C (0x68424215422a5784),
+ U64_C (0x2c98985a98b4c22d), U64_C (0xeda4a4aaa4490e55),
+ U64_C (0x752828a0285d8850), U64_C (0x865c5c6d5cda31b8),
+ U64_C (0x6bf8f8c7f8933fed), U64_C (0xc28686228644a411),
+ }, {
+ U64_C (0x30d818186018c078), U64_C (0x462623238c2305af),
+ U64_C (0x91b8c6c63fc67ef9), U64_C (0xcdfbe8e887e8136f),
+ U64_C (0x13cb878726874ca1), U64_C (0x6d11b8b8dab8a962),
+ U64_C (0x0209010104010805), U64_C (0x9e0d4f4f214f426e),
+ U64_C (0x6c9b3636d836adee), U64_C (0x51ffa6a6a2a65904),
+ U64_C (0xb90cd2d26fd2debd), U64_C (0xf70ef5f5f3f5fb06),
+ U64_C (0xf2967979f979ef80), U64_C (0xde306f6fa16f5fce),
+ U64_C (0x3f6d91917e91fcef), U64_C (0xa4f852525552aa07),
+ U64_C (0xc04760609d6027fd), U64_C (0x6535bcbccabc8976),
+ U64_C (0x2b379b9b569baccd), U64_C (0x018a8e8e028e048c),
+ U64_C (0x5bd2a3a3b6a37115), U64_C (0x186c0c0c300c603c),
+ U64_C (0xf6847b7bf17bff8a), U64_C (0x6a803535d435b5e1),
+ U64_C (0x3af51d1d741de869), U64_C (0xddb3e0e0a7e05347),
+ U64_C (0xb321d7d77bd7f6ac), U64_C (0x999cc2c22fc25eed),
+ U64_C (0x5c432e2eb82e6d96), U64_C (0x96294b4b314b627a),
+ U64_C (0xe15dfefedffea321), U64_C (0xaed5575741578216),
+ U64_C (0x2abd15155415a841), U64_C (0xeee87777c1779fb6),
+ U64_C (0x6e923737dc37a5eb), U64_C (0xd79ee5e5b3e57b56),
+ U64_C (0x23139f9f469f8cd9), U64_C (0xfd23f0f0e7f0d317),
+ U64_C (0x94204a4a354a6a7f), U64_C (0xa944dada4fda9e95),
+ U64_C (0xb0a258587d58fa25), U64_C (0x8fcfc9c903c906ca),
+ U64_C (0x527c2929a429558d), U64_C (0x145a0a0a280a5022),
+ U64_C (0x7f50b1b1feb1e14f), U64_C (0x5dc9a0a0baa0691a),
+ U64_C (0xd6146b6bb16b7fda), U64_C (0x17d985852e855cab),
+ U64_C (0x673cbdbdcebd8173), U64_C (0xba8f5d5d695dd234),
+ U64_C (0x2090101040108050), U64_C (0xf507f4f4f7f4f303),
+ U64_C (0x8bddcbcb0bcb16c0), U64_C (0x7cd33e3ef83eedc6),
+ U64_C (0x0a2d050514052811), U64_C (0xce78676781671fe6),
+ U64_C (0xd597e4e4b7e47353), U64_C (0x4e0227279c2725bb),
+ U64_C (0x8273414119413258), U64_C (0x0ba78b8b168b2c9d),
+ U64_C (0x53f6a7a7a6a75101), U64_C (0xfab27d7de97dcf94),
+ U64_C (0x374995956e95dcfb), U64_C (0xad56d8d847d88e9f),
+ U64_C (0xeb70fbfbcbfb8b30), U64_C (0xc1cdeeee9fee2371),
+ U64_C (0xf8bb7c7ced7cc791), U64_C (0xcc716666856617e3),
+ U64_C (0xa77bdddd53dda68e), U64_C (0x2eaf17175c17b84b),
+ U64_C (0x8e45474701470246), U64_C (0x211a9e9e429e84dc),
+ U64_C (0x89d4caca0fca1ec5), U64_C (0x5a582d2db42d7599),
+ U64_C (0x632ebfbfc6bf9179), U64_C (0x0e3f07071c07381b),
+ U64_C (0x47acadad8ead0123), U64_C (0xb4b05a5a755aea2f),
+ U64_C (0x1bef838336836cb5), U64_C (0x66b63333cc3385ff),
+ U64_C (0xc65c636391633ff2), U64_C (0x041202020802100a),
+ U64_C (0x4993aaaa92aa3938), U64_C (0xe2de7171d971afa8),
+ U64_C (0x8dc6c8c807c80ecf), U64_C (0x32d119196419c87d),
+ U64_C (0x923b494939497270), U64_C (0xaf5fd9d943d9869a),
+ U64_C (0xf931f2f2eff2c31d), U64_C (0xdba8e3e3abe34b48),
+ U64_C (0xb6b95b5b715be22a), U64_C (0x0dbc88881a883492),
+ U64_C (0x293e9a9a529aa4c8), U64_C (0x4c0b262698262dbe),
+ U64_C (0x64bf3232c8328dfa), U64_C (0x7d59b0b0fab0e94a),
+ U64_C (0xcff2e9e983e91b6a), U64_C (0x1e770f0f3c0f7833),
+ U64_C (0xb733d5d573d5e6a6), U64_C (0x1df480803a8074ba),
+ U64_C (0x6127bebec2be997c), U64_C (0x87ebcdcd13cd26de),
+ U64_C (0x68893434d034bde4), U64_C (0x903248483d487a75),
+ U64_C (0xe354ffffdbffab24), U64_C (0xf48d7a7af57af78f),
+ U64_C (0x3d6490907a90f4ea), U64_C (0xbe9d5f5f615fc23e),
+ U64_C (0x403d202080201da0), U64_C (0xd00f6868bd6867d5),
+ U64_C (0x34ca1a1a681ad072), U64_C (0x41b7aeae82ae192c),
+ U64_C (0x757db4b4eab4c95e), U64_C (0xa8ce54544d549a19),
+ U64_C (0x3b7f93937693ece5), U64_C (0x442f222288220daa),
+ U64_C (0xc86364648d6407e9), U64_C (0xff2af1f1e3f1db12),
+ U64_C (0xe6cc7373d173bfa2), U64_C (0x248212124812905a),
+ U64_C (0x807a40401d403a5d), U64_C (0x1048080820084028),
+ U64_C (0x9b95c3c32bc356e8), U64_C (0xc5dfecec97ec337b),
+ U64_C (0xab4ddbdb4bdb9690), U64_C (0x5fc0a1a1bea1611f),
+ U64_C (0x07918d8d0e8d1c83), U64_C (0x7ac83d3df43df5c9),
+ U64_C (0x335b97976697ccf1), U64_C (0x0000000000000000),
+ U64_C (0x83f9cfcf1bcf36d4), U64_C (0x566e2b2bac2b4587),
+ U64_C (0xece17676c57697b3), U64_C (0x19e68282328264b0),
+ U64_C (0xb128d6d67fd6fea9), U64_C (0x36c31b1b6c1bd877),
+ U64_C (0x7774b5b5eeb5c15b), U64_C (0x43beafaf86af1129),
+ U64_C (0xd41d6a6ab56a77df), U64_C (0xa0ea50505d50ba0d),
+ U64_C (0x8a5745450945124c), U64_C (0xfb38f3f3ebf3cb18),
+ U64_C (0x60ad3030c0309df0), U64_C (0xc3c4efef9bef2b74),
+ U64_C (0x7eda3f3ffc3fe5c3), U64_C (0xaac755554955921c),
+ U64_C (0x59dba2a2b2a27910), U64_C (0xc9e9eaea8fea0365),
+ U64_C (0xca6a656589650fec), U64_C (0x6903babad2bab968),
+ U64_C (0x5e4a2f2fbc2f6593), U64_C (0x9d8ec0c027c04ee7),
+ U64_C (0xa160dede5fdebe81), U64_C (0x38fc1c1c701ce06c),
+ U64_C (0xe746fdfdd3fdbb2e), U64_C (0x9a1f4d4d294d5264),
+ U64_C (0x397692927292e4e0), U64_C (0xeafa7575c9758fbc),
+ U64_C (0x0c3606061806301e), U64_C (0x09ae8a8a128a2498),
+ U64_C (0x794bb2b2f2b2f940), U64_C (0xd185e6e6bfe66359),
+ U64_C (0x1c7e0e0e380e7036), U64_C (0x3ee71f1f7c1ff863),
+ U64_C (0xc4556262956237f7), U64_C (0xb53ad4d477d4eea3),
+ U64_C (0x4d81a8a89aa82932), U64_C (0x315296966296c4f4),
+ U64_C (0xef62f9f9c3f99b3a), U64_C (0x97a3c5c533c566f6),
+ U64_C (0x4a102525942535b1), U64_C (0xb2ab59597959f220),
+ U64_C (0x15d084842a8454ae), U64_C (0xe4c57272d572b7a7),
+ U64_C (0x72ec3939e439d5dd), U64_C (0x98164c4c2d4c5a61),
+ U64_C (0xbc945e5e655eca3b), U64_C (0xf09f7878fd78e785),
+ U64_C (0x70e53838e038ddd8), U64_C (0x05988c8c0a8c1486),
+ U64_C (0xbf17d1d163d1c6b2), U64_C (0x57e4a5a5aea5410b),
+ U64_C (0xd9a1e2e2afe2434d), U64_C (0xc24e616199612ff8),
+ U64_C (0x7b42b3b3f6b3f145), U64_C (0x42342121842115a5),
+ U64_C (0x25089c9c4a9c94d6), U64_C (0x3cee1e1e781ef066),
+ U64_C (0x8661434311432252), U64_C (0x93b1c7c73bc776fc),
+ U64_C (0xe54ffcfcd7fcb32b), U64_C (0x0824040410042014),
+ U64_C (0xa2e351515951b208), U64_C (0x2f2599995e99bcc7),
+ U64_C (0xda226d6da96d4fc4), U64_C (0x1a650d0d340d6839),
+ U64_C (0xe979fafacffa8335), U64_C (0xa369dfdf5bdfb684),
+ U64_C (0xfca97e7ee57ed79b), U64_C (0x4819242490243db4),
+ U64_C (0x76fe3b3bec3bc5d7), U64_C (0x4b9aabab96ab313d),
+ U64_C (0x81f0cece1fce3ed1), U64_C (0x2299111144118855),
+ U64_C (0x03838f8f068f0c89), U64_C (0x9c044e4e254e4a6b),
+ U64_C (0x7366b7b7e6b7d151), U64_C (0xcbe0ebeb8beb0b60),
+ U64_C (0x78c13c3cf03cfdcc), U64_C (0x1ffd81813e817cbf),
+ U64_C (0x354094946a94d4fe), U64_C (0xf31cf7f7fbf7eb0c),
+ U64_C (0x6f18b9b9deb9a167), U64_C (0x268b13134c13985f),
+ U64_C (0x58512c2cb02c7d9c), U64_C (0xbb05d3d36bd3d6b8),
+ U64_C (0xd38ce7e7bbe76b5c), U64_C (0xdc396e6ea56e57cb),
+ U64_C (0x95aac4c437c46ef3), U64_C (0x061b03030c03180f),
+ U64_C (0xacdc565645568a13), U64_C (0x885e44440d441a49),
+ U64_C (0xfea07f7fe17fdf9e), U64_C (0x4f88a9a99ea92137),
+ U64_C (0x54672a2aa82a4d82), U64_C (0x6b0abbbbd6bbb16d),
+ U64_C (0x9f87c1c123c146e2), U64_C (0xa6f153535153a202),
+ U64_C (0xa572dcdc57dcae8b), U64_C (0x16530b0b2c0b5827),
+ U64_C (0x27019d9d4e9d9cd3), U64_C (0xd82b6c6cad6c47c1),
+ U64_C (0x62a43131c43195f5), U64_C (0xe8f37474cd7487b9),
+ U64_C (0xf115f6f6fff6e309), U64_C (0x8c4c464605460a43),
+ U64_C (0x45a5acac8aac0926), U64_C (0x0fb589891e893c97),
+ U64_C (0x28b414145014a044), U64_C (0xdfbae1e1a3e15b42),
+ U64_C (0x2ca616165816b04e), U64_C (0x74f73a3ae83acdd2),
+ U64_C (0xd2066969b9696fd0), U64_C (0x124109092409482d),
+ U64_C (0xe0d77070dd70a7ad), U64_C (0x716fb6b6e2b6d954),
+ U64_C (0xbd1ed0d067d0ceb7), U64_C (0xc7d6eded93ed3b7e),
+ U64_C (0x85e2cccc17cc2edb), U64_C (0x8468424215422a57),
+ U64_C (0x2d2c98985a98b4c2), U64_C (0x55eda4a4aaa4490e),
+ U64_C (0x50752828a0285d88), U64_C (0xb8865c5c6d5cda31),
+ U64_C (0xed6bf8f8c7f8933f), U64_C (0x11c28686228644a4),
+ }, {
+ U64_C (0x7830d818186018c0), U64_C (0xaf462623238c2305),
+ U64_C (0xf991b8c6c63fc67e), U64_C (0x6fcdfbe8e887e813),
+ U64_C (0xa113cb878726874c), U64_C (0x626d11b8b8dab8a9),
+ U64_C (0x0502090101040108), U64_C (0x6e9e0d4f4f214f42),
+ U64_C (0xee6c9b3636d836ad), U64_C (0x0451ffa6a6a2a659),
+ U64_C (0xbdb90cd2d26fd2de), U64_C (0x06f70ef5f5f3f5fb),
+ U64_C (0x80f2967979f979ef), U64_C (0xcede306f6fa16f5f),
+ U64_C (0xef3f6d91917e91fc), U64_C (0x07a4f852525552aa),
+ U64_C (0xfdc04760609d6027), U64_C (0x766535bcbccabc89),
+ U64_C (0xcd2b379b9b569bac), U64_C (0x8c018a8e8e028e04),
+ U64_C (0x155bd2a3a3b6a371), U64_C (0x3c186c0c0c300c60),
+ U64_C (0x8af6847b7bf17bff), U64_C (0xe16a803535d435b5),
+ U64_C (0x693af51d1d741de8), U64_C (0x47ddb3e0e0a7e053),
+ U64_C (0xacb321d7d77bd7f6), U64_C (0xed999cc2c22fc25e),
+ U64_C (0x965c432e2eb82e6d), U64_C (0x7a96294b4b314b62),
+ U64_C (0x21e15dfefedffea3), U64_C (0x16aed55757415782),
+ U64_C (0x412abd15155415a8), U64_C (0xb6eee87777c1779f),
+ U64_C (0xeb6e923737dc37a5), U64_C (0x56d79ee5e5b3e57b),
+ U64_C (0xd923139f9f469f8c), U64_C (0x17fd23f0f0e7f0d3),
+ U64_C (0x7f94204a4a354a6a), U64_C (0x95a944dada4fda9e),
+ U64_C (0x25b0a258587d58fa), U64_C (0xca8fcfc9c903c906),
+ U64_C (0x8d527c2929a42955), U64_C (0x22145a0a0a280a50),
+ U64_C (0x4f7f50b1b1feb1e1), U64_C (0x1a5dc9a0a0baa069),
+ U64_C (0xdad6146b6bb16b7f), U64_C (0xab17d985852e855c),
+ U64_C (0x73673cbdbdcebd81), U64_C (0x34ba8f5d5d695dd2),
+ U64_C (0x5020901010401080), U64_C (0x03f507f4f4f7f4f3),
+ U64_C (0xc08bddcbcb0bcb16), U64_C (0xc67cd33e3ef83eed),
+ U64_C (0x110a2d0505140528), U64_C (0xe6ce78676781671f),
+ U64_C (0x53d597e4e4b7e473), U64_C (0xbb4e0227279c2725),
+ U64_C (0x5882734141194132), U64_C (0x9d0ba78b8b168b2c),
+ U64_C (0x0153f6a7a7a6a751), U64_C (0x94fab27d7de97dcf),
+ U64_C (0xfb374995956e95dc), U64_C (0x9fad56d8d847d88e),
+ U64_C (0x30eb70fbfbcbfb8b), U64_C (0x71c1cdeeee9fee23),
+ U64_C (0x91f8bb7c7ced7cc7), U64_C (0xe3cc716666856617),
+ U64_C (0x8ea77bdddd53dda6), U64_C (0x4b2eaf17175c17b8),
+ U64_C (0x468e454747014702), U64_C (0xdc211a9e9e429e84),
+ U64_C (0xc589d4caca0fca1e), U64_C (0x995a582d2db42d75),
+ U64_C (0x79632ebfbfc6bf91), U64_C (0x1b0e3f07071c0738),
+ U64_C (0x2347acadad8ead01), U64_C (0x2fb4b05a5a755aea),
+ U64_C (0xb51bef838336836c), U64_C (0xff66b63333cc3385),
+ U64_C (0xf2c65c636391633f), U64_C (0x0a04120202080210),
+ U64_C (0x384993aaaa92aa39), U64_C (0xa8e2de7171d971af),
+ U64_C (0xcf8dc6c8c807c80e), U64_C (0x7d32d119196419c8),
+ U64_C (0x70923b4949394972), U64_C (0x9aaf5fd9d943d986),
+ U64_C (0x1df931f2f2eff2c3), U64_C (0x48dba8e3e3abe34b),
+ U64_C (0x2ab6b95b5b715be2), U64_C (0x920dbc88881a8834),
+ U64_C (0xc8293e9a9a529aa4), U64_C (0xbe4c0b262698262d),
+ U64_C (0xfa64bf3232c8328d), U64_C (0x4a7d59b0b0fab0e9),
+ U64_C (0x6acff2e9e983e91b), U64_C (0x331e770f0f3c0f78),
+ U64_C (0xa6b733d5d573d5e6), U64_C (0xba1df480803a8074),
+ U64_C (0x7c6127bebec2be99), U64_C (0xde87ebcdcd13cd26),
+ U64_C (0xe468893434d034bd), U64_C (0x75903248483d487a),
+ U64_C (0x24e354ffffdbffab), U64_C (0x8ff48d7a7af57af7),
+ U64_C (0xea3d6490907a90f4), U64_C (0x3ebe9d5f5f615fc2),
+ U64_C (0xa0403d202080201d), U64_C (0xd5d00f6868bd6867),
+ U64_C (0x7234ca1a1a681ad0), U64_C (0x2c41b7aeae82ae19),
+ U64_C (0x5e757db4b4eab4c9), U64_C (0x19a8ce54544d549a),
+ U64_C (0xe53b7f93937693ec), U64_C (0xaa442f222288220d),
+ U64_C (0xe9c86364648d6407), U64_C (0x12ff2af1f1e3f1db),
+ U64_C (0xa2e6cc7373d173bf), U64_C (0x5a24821212481290),
+ U64_C (0x5d807a40401d403a), U64_C (0x2810480808200840),
+ U64_C (0xe89b95c3c32bc356), U64_C (0x7bc5dfecec97ec33),
+ U64_C (0x90ab4ddbdb4bdb96), U64_C (0x1f5fc0a1a1bea161),
+ U64_C (0x8307918d8d0e8d1c), U64_C (0xc97ac83d3df43df5),
+ U64_C (0xf1335b97976697cc), U64_C (0x0000000000000000),
+ U64_C (0xd483f9cfcf1bcf36), U64_C (0x87566e2b2bac2b45),
+ U64_C (0xb3ece17676c57697), U64_C (0xb019e68282328264),
+ U64_C (0xa9b128d6d67fd6fe), U64_C (0x7736c31b1b6c1bd8),
+ U64_C (0x5b7774b5b5eeb5c1), U64_C (0x2943beafaf86af11),
+ U64_C (0xdfd41d6a6ab56a77), U64_C (0x0da0ea50505d50ba),
+ U64_C (0x4c8a574545094512), U64_C (0x18fb38f3f3ebf3cb),
+ U64_C (0xf060ad3030c0309d), U64_C (0x74c3c4efef9bef2b),
+ U64_C (0xc37eda3f3ffc3fe5), U64_C (0x1caac75555495592),
+ U64_C (0x1059dba2a2b2a279), U64_C (0x65c9e9eaea8fea03),
+ U64_C (0xecca6a656589650f), U64_C (0x686903babad2bab9),
+ U64_C (0x935e4a2f2fbc2f65), U64_C (0xe79d8ec0c027c04e),
+ U64_C (0x81a160dede5fdebe), U64_C (0x6c38fc1c1c701ce0),
+ U64_C (0x2ee746fdfdd3fdbb), U64_C (0x649a1f4d4d294d52),
+ U64_C (0xe0397692927292e4), U64_C (0xbceafa7575c9758f),
+ U64_C (0x1e0c360606180630), U64_C (0x9809ae8a8a128a24),
+ U64_C (0x40794bb2b2f2b2f9), U64_C (0x59d185e6e6bfe663),
+ U64_C (0x361c7e0e0e380e70), U64_C (0x633ee71f1f7c1ff8),
+ U64_C (0xf7c4556262956237), U64_C (0xa3b53ad4d477d4ee),
+ U64_C (0x324d81a8a89aa829), U64_C (0xf4315296966296c4),
+ U64_C (0x3aef62f9f9c3f99b), U64_C (0xf697a3c5c533c566),
+ U64_C (0xb14a102525942535), U64_C (0x20b2ab59597959f2),
+ U64_C (0xae15d084842a8454), U64_C (0xa7e4c57272d572b7),
+ U64_C (0xdd72ec3939e439d5), U64_C (0x6198164c4c2d4c5a),
+ U64_C (0x3bbc945e5e655eca), U64_C (0x85f09f7878fd78e7),
+ U64_C (0xd870e53838e038dd), U64_C (0x8605988c8c0a8c14),
+ U64_C (0xb2bf17d1d163d1c6), U64_C (0x0b57e4a5a5aea541),
+ U64_C (0x4dd9a1e2e2afe243), U64_C (0xf8c24e616199612f),
+ U64_C (0x457b42b3b3f6b3f1), U64_C (0xa542342121842115),
+ U64_C (0xd625089c9c4a9c94), U64_C (0x663cee1e1e781ef0),
+ U64_C (0x5286614343114322), U64_C (0xfc93b1c7c73bc776),
+ U64_C (0x2be54ffcfcd7fcb3), U64_C (0x1408240404100420),
+ U64_C (0x08a2e351515951b2), U64_C (0xc72f2599995e99bc),
+ U64_C (0xc4da226d6da96d4f), U64_C (0x391a650d0d340d68),
+ U64_C (0x35e979fafacffa83), U64_C (0x84a369dfdf5bdfb6),
+ U64_C (0x9bfca97e7ee57ed7), U64_C (0xb44819242490243d),
+ U64_C (0xd776fe3b3bec3bc5), U64_C (0x3d4b9aabab96ab31),
+ U64_C (0xd181f0cece1fce3e), U64_C (0x5522991111441188),
+ U64_C (0x8903838f8f068f0c), U64_C (0x6b9c044e4e254e4a),
+ U64_C (0x517366b7b7e6b7d1), U64_C (0x60cbe0ebeb8beb0b),
+ U64_C (0xcc78c13c3cf03cfd), U64_C (0xbf1ffd81813e817c),
+ U64_C (0xfe354094946a94d4), U64_C (0x0cf31cf7f7fbf7eb),
+ U64_C (0x676f18b9b9deb9a1), U64_C (0x5f268b13134c1398),
+ U64_C (0x9c58512c2cb02c7d), U64_C (0xb8bb05d3d36bd3d6),
+ U64_C (0x5cd38ce7e7bbe76b), U64_C (0xcbdc396e6ea56e57),
+ U64_C (0xf395aac4c437c46e), U64_C (0x0f061b03030c0318),
+ U64_C (0x13acdc565645568a), U64_C (0x49885e44440d441a),
+ U64_C (0x9efea07f7fe17fdf), U64_C (0x374f88a9a99ea921),
+ U64_C (0x8254672a2aa82a4d), U64_C (0x6d6b0abbbbd6bbb1),
+ U64_C (0xe29f87c1c123c146), U64_C (0x02a6f153535153a2),
+ U64_C (0x8ba572dcdc57dcae), U64_C (0x2716530b0b2c0b58),
+ U64_C (0xd327019d9d4e9d9c), U64_C (0xc1d82b6c6cad6c47),
+ U64_C (0xf562a43131c43195), U64_C (0xb9e8f37474cd7487),
+ U64_C (0x09f115f6f6fff6e3), U64_C (0x438c4c464605460a),
+ U64_C (0x2645a5acac8aac09), U64_C (0x970fb589891e893c),
+ U64_C (0x4428b414145014a0), U64_C (0x42dfbae1e1a3e15b),
+ U64_C (0x4e2ca616165816b0), U64_C (0xd274f73a3ae83acd),
+ U64_C (0xd0d2066969b9696f), U64_C (0x2d12410909240948),
+ U64_C (0xade0d77070dd70a7), U64_C (0x54716fb6b6e2b6d9),
+ U64_C (0xb7bd1ed0d067d0ce), U64_C (0x7ec7d6eded93ed3b),
+ U64_C (0xdb85e2cccc17cc2e), U64_C (0x578468424215422a),
+ U64_C (0xc22d2c98985a98b4), U64_C (0x0e55eda4a4aaa449),
+ U64_C (0x8850752828a0285d), U64_C (0x31b8865c5c6d5cda),
+ U64_C (0x3fed6bf8f8c7f893), U64_C (0xa411c28686228644),
+ }, {
+ U64_C (0xc07830d818186018), U64_C (0x05af462623238c23),
+ U64_C (0x7ef991b8c6c63fc6), U64_C (0x136fcdfbe8e887e8),
+ U64_C (0x4ca113cb87872687), U64_C (0xa9626d11b8b8dab8),
+ U64_C (0x0805020901010401), U64_C (0x426e9e0d4f4f214f),
+ U64_C (0xadee6c9b3636d836), U64_C (0x590451ffa6a6a2a6),
+ U64_C (0xdebdb90cd2d26fd2), U64_C (0xfb06f70ef5f5f3f5),
+ U64_C (0xef80f2967979f979), U64_C (0x5fcede306f6fa16f),
+ U64_C (0xfcef3f6d91917e91), U64_C (0xaa07a4f852525552),
+ U64_C (0x27fdc04760609d60), U64_C (0x89766535bcbccabc),
+ U64_C (0xaccd2b379b9b569b), U64_C (0x048c018a8e8e028e),
+ U64_C (0x71155bd2a3a3b6a3), U64_C (0x603c186c0c0c300c),
+ U64_C (0xff8af6847b7bf17b), U64_C (0xb5e16a803535d435),
+ U64_C (0xe8693af51d1d741d), U64_C (0x5347ddb3e0e0a7e0),
+ U64_C (0xf6acb321d7d77bd7), U64_C (0x5eed999cc2c22fc2),
+ U64_C (0x6d965c432e2eb82e), U64_C (0x627a96294b4b314b),
+ U64_C (0xa321e15dfefedffe), U64_C (0x8216aed557574157),
+ U64_C (0xa8412abd15155415), U64_C (0x9fb6eee87777c177),
+ U64_C (0xa5eb6e923737dc37), U64_C (0x7b56d79ee5e5b3e5),
+ U64_C (0x8cd923139f9f469f), U64_C (0xd317fd23f0f0e7f0),
+ U64_C (0x6a7f94204a4a354a), U64_C (0x9e95a944dada4fda),
+ U64_C (0xfa25b0a258587d58), U64_C (0x06ca8fcfc9c903c9),
+ U64_C (0x558d527c2929a429), U64_C (0x5022145a0a0a280a),
+ U64_C (0xe14f7f50b1b1feb1), U64_C (0x691a5dc9a0a0baa0),
+ U64_C (0x7fdad6146b6bb16b), U64_C (0x5cab17d985852e85),
+ U64_C (0x8173673cbdbdcebd), U64_C (0xd234ba8f5d5d695d),
+ U64_C (0x8050209010104010), U64_C (0xf303f507f4f4f7f4),
+ U64_C (0x16c08bddcbcb0bcb), U64_C (0xedc67cd33e3ef83e),
+ U64_C (0x28110a2d05051405), U64_C (0x1fe6ce7867678167),
+ U64_C (0x7353d597e4e4b7e4), U64_C (0x25bb4e0227279c27),
+ U64_C (0x3258827341411941), U64_C (0x2c9d0ba78b8b168b),
+ U64_C (0x510153f6a7a7a6a7), U64_C (0xcf94fab27d7de97d),
+ U64_C (0xdcfb374995956e95), U64_C (0x8e9fad56d8d847d8),
+ U64_C (0x8b30eb70fbfbcbfb), U64_C (0x2371c1cdeeee9fee),
+ U64_C (0xc791f8bb7c7ced7c), U64_C (0x17e3cc7166668566),
+ U64_C (0xa68ea77bdddd53dd), U64_C (0xb84b2eaf17175c17),
+ U64_C (0x02468e4547470147), U64_C (0x84dc211a9e9e429e),
+ U64_C (0x1ec589d4caca0fca), U64_C (0x75995a582d2db42d),
+ U64_C (0x9179632ebfbfc6bf), U64_C (0x381b0e3f07071c07),
+ U64_C (0x012347acadad8ead), U64_C (0xea2fb4b05a5a755a),
+ U64_C (0x6cb51bef83833683), U64_C (0x85ff66b63333cc33),
+ U64_C (0x3ff2c65c63639163), U64_C (0x100a041202020802),
+ U64_C (0x39384993aaaa92aa), U64_C (0xafa8e2de7171d971),
+ U64_C (0x0ecf8dc6c8c807c8), U64_C (0xc87d32d119196419),
+ U64_C (0x7270923b49493949), U64_C (0x869aaf5fd9d943d9),
+ U64_C (0xc31df931f2f2eff2), U64_C (0x4b48dba8e3e3abe3),
+ U64_C (0xe22ab6b95b5b715b), U64_C (0x34920dbc88881a88),
+ U64_C (0xa4c8293e9a9a529a), U64_C (0x2dbe4c0b26269826),
+ U64_C (0x8dfa64bf3232c832), U64_C (0xe94a7d59b0b0fab0),
+ U64_C (0x1b6acff2e9e983e9), U64_C (0x78331e770f0f3c0f),
+ U64_C (0xe6a6b733d5d573d5), U64_C (0x74ba1df480803a80),
+ U64_C (0x997c6127bebec2be), U64_C (0x26de87ebcdcd13cd),
+ U64_C (0xbde468893434d034), U64_C (0x7a75903248483d48),
+ U64_C (0xab24e354ffffdbff), U64_C (0xf78ff48d7a7af57a),
+ U64_C (0xf4ea3d6490907a90), U64_C (0xc23ebe9d5f5f615f),
+ U64_C (0x1da0403d20208020), U64_C (0x67d5d00f6868bd68),
+ U64_C (0xd07234ca1a1a681a), U64_C (0x192c41b7aeae82ae),
+ U64_C (0xc95e757db4b4eab4), U64_C (0x9a19a8ce54544d54),
+ U64_C (0xece53b7f93937693), U64_C (0x0daa442f22228822),
+ U64_C (0x07e9c86364648d64), U64_C (0xdb12ff2af1f1e3f1),
+ U64_C (0xbfa2e6cc7373d173), U64_C (0x905a248212124812),
+ U64_C (0x3a5d807a40401d40), U64_C (0x4028104808082008),
+ U64_C (0x56e89b95c3c32bc3), U64_C (0x337bc5dfecec97ec),
+ U64_C (0x9690ab4ddbdb4bdb), U64_C (0x611f5fc0a1a1bea1),
+ U64_C (0x1c8307918d8d0e8d), U64_C (0xf5c97ac83d3df43d),
+ U64_C (0xccf1335b97976697), U64_C (0x0000000000000000),
+ U64_C (0x36d483f9cfcf1bcf), U64_C (0x4587566e2b2bac2b),
+ U64_C (0x97b3ece17676c576), U64_C (0x64b019e682823282),
+ U64_C (0xfea9b128d6d67fd6), U64_C (0xd87736c31b1b6c1b),
+ U64_C (0xc15b7774b5b5eeb5), U64_C (0x112943beafaf86af),
+ U64_C (0x77dfd41d6a6ab56a), U64_C (0xba0da0ea50505d50),
+ U64_C (0x124c8a5745450945), U64_C (0xcb18fb38f3f3ebf3),
+ U64_C (0x9df060ad3030c030), U64_C (0x2b74c3c4efef9bef),
+ U64_C (0xe5c37eda3f3ffc3f), U64_C (0x921caac755554955),
+ U64_C (0x791059dba2a2b2a2), U64_C (0x0365c9e9eaea8fea),
+ U64_C (0x0fecca6a65658965), U64_C (0xb9686903babad2ba),
+ U64_C (0x65935e4a2f2fbc2f), U64_C (0x4ee79d8ec0c027c0),
+ U64_C (0xbe81a160dede5fde), U64_C (0xe06c38fc1c1c701c),
+ U64_C (0xbb2ee746fdfdd3fd), U64_C (0x52649a1f4d4d294d),
+ U64_C (0xe4e0397692927292), U64_C (0x8fbceafa7575c975),
+ U64_C (0x301e0c3606061806), U64_C (0x249809ae8a8a128a),
+ U64_C (0xf940794bb2b2f2b2), U64_C (0x6359d185e6e6bfe6),
+ U64_C (0x70361c7e0e0e380e), U64_C (0xf8633ee71f1f7c1f),
+ U64_C (0x37f7c45562629562), U64_C (0xeea3b53ad4d477d4),
+ U64_C (0x29324d81a8a89aa8), U64_C (0xc4f4315296966296),
+ U64_C (0x9b3aef62f9f9c3f9), U64_C (0x66f697a3c5c533c5),
+ U64_C (0x35b14a1025259425), U64_C (0xf220b2ab59597959),
+ U64_C (0x54ae15d084842a84), U64_C (0xb7a7e4c57272d572),
+ U64_C (0xd5dd72ec3939e439), U64_C (0x5a6198164c4c2d4c),
+ U64_C (0xca3bbc945e5e655e), U64_C (0xe785f09f7878fd78),
+ U64_C (0xddd870e53838e038), U64_C (0x148605988c8c0a8c),
+ U64_C (0xc6b2bf17d1d163d1), U64_C (0x410b57e4a5a5aea5),
+ U64_C (0x434dd9a1e2e2afe2), U64_C (0x2ff8c24e61619961),
+ U64_C (0xf1457b42b3b3f6b3), U64_C (0x15a5423421218421),
+ U64_C (0x94d625089c9c4a9c), U64_C (0xf0663cee1e1e781e),
+ U64_C (0x2252866143431143), U64_C (0x76fc93b1c7c73bc7),
+ U64_C (0xb32be54ffcfcd7fc), U64_C (0x2014082404041004),
+ U64_C (0xb208a2e351515951), U64_C (0xbcc72f2599995e99),
+ U64_C (0x4fc4da226d6da96d), U64_C (0x68391a650d0d340d),
+ U64_C (0x8335e979fafacffa), U64_C (0xb684a369dfdf5bdf),
+ U64_C (0xd79bfca97e7ee57e), U64_C (0x3db4481924249024),
+ U64_C (0xc5d776fe3b3bec3b), U64_C (0x313d4b9aabab96ab),
+ U64_C (0x3ed181f0cece1fce), U64_C (0x8855229911114411),
+ U64_C (0x0c8903838f8f068f), U64_C (0x4a6b9c044e4e254e),
+ U64_C (0xd1517366b7b7e6b7), U64_C (0x0b60cbe0ebeb8beb),
+ U64_C (0xfdcc78c13c3cf03c), U64_C (0x7cbf1ffd81813e81),
+ U64_C (0xd4fe354094946a94), U64_C (0xeb0cf31cf7f7fbf7),
+ U64_C (0xa1676f18b9b9deb9), U64_C (0x985f268b13134c13),
+ U64_C (0x7d9c58512c2cb02c), U64_C (0xd6b8bb05d3d36bd3),
+ U64_C (0x6b5cd38ce7e7bbe7), U64_C (0x57cbdc396e6ea56e),
+ U64_C (0x6ef395aac4c437c4), U64_C (0x180f061b03030c03),
+ U64_C (0x8a13acdc56564556), U64_C (0x1a49885e44440d44),
+ U64_C (0xdf9efea07f7fe17f), U64_C (0x21374f88a9a99ea9),
+ U64_C (0x4d8254672a2aa82a), U64_C (0xb16d6b0abbbbd6bb),
+ U64_C (0x46e29f87c1c123c1), U64_C (0xa202a6f153535153),
+ U64_C (0xae8ba572dcdc57dc), U64_C (0x582716530b0b2c0b),
+ U64_C (0x9cd327019d9d4e9d), U64_C (0x47c1d82b6c6cad6c),
+ U64_C (0x95f562a43131c431), U64_C (0x87b9e8f37474cd74),
+ U64_C (0xe309f115f6f6fff6), U64_C (0x0a438c4c46460546),
+ U64_C (0x092645a5acac8aac), U64_C (0x3c970fb589891e89),
+ U64_C (0xa04428b414145014), U64_C (0x5b42dfbae1e1a3e1),
+ U64_C (0xb04e2ca616165816), U64_C (0xcdd274f73a3ae83a),
+ U64_C (0x6fd0d2066969b969), U64_C (0x482d124109092409),
+ U64_C (0xa7ade0d77070dd70), U64_C (0xd954716fb6b6e2b6),
+ U64_C (0xceb7bd1ed0d067d0), U64_C (0x3b7ec7d6eded93ed),
+ U64_C (0x2edb85e2cccc17cc), U64_C (0x2a57846842421542),
+ U64_C (0xb4c22d2c98985a98), U64_C (0x490e55eda4a4aaa4),
+ U64_C (0x5d8850752828a028), U64_C (0xda31b8865c5c6d5c),
+ U64_C (0x933fed6bf8f8c7f8), U64_C (0x44a411c286862286),
+ }, {
+ U64_C (0x18c07830d8181860), U64_C (0x2305af462623238c),
+ U64_C (0xc67ef991b8c6c63f), U64_C (0xe8136fcdfbe8e887),
+ U64_C (0x874ca113cb878726), U64_C (0xb8a9626d11b8b8da),
+ U64_C (0x0108050209010104), U64_C (0x4f426e9e0d4f4f21),
+ U64_C (0x36adee6c9b3636d8), U64_C (0xa6590451ffa6a6a2),
+ U64_C (0xd2debdb90cd2d26f), U64_C (0xf5fb06f70ef5f5f3),
+ U64_C (0x79ef80f2967979f9), U64_C (0x6f5fcede306f6fa1),
+ U64_C (0x91fcef3f6d91917e), U64_C (0x52aa07a4f8525255),
+ U64_C (0x6027fdc04760609d), U64_C (0xbc89766535bcbcca),
+ U64_C (0x9baccd2b379b9b56), U64_C (0x8e048c018a8e8e02),
+ U64_C (0xa371155bd2a3a3b6), U64_C (0x0c603c186c0c0c30),
+ U64_C (0x7bff8af6847b7bf1), U64_C (0x35b5e16a803535d4),
+ U64_C (0x1de8693af51d1d74), U64_C (0xe05347ddb3e0e0a7),
+ U64_C (0xd7f6acb321d7d77b), U64_C (0xc25eed999cc2c22f),
+ U64_C (0x2e6d965c432e2eb8), U64_C (0x4b627a96294b4b31),
+ U64_C (0xfea321e15dfefedf), U64_C (0x578216aed5575741),
+ U64_C (0x15a8412abd151554), U64_C (0x779fb6eee87777c1),
+ U64_C (0x37a5eb6e923737dc), U64_C (0xe57b56d79ee5e5b3),
+ U64_C (0x9f8cd923139f9f46), U64_C (0xf0d317fd23f0f0e7),
+ U64_C (0x4a6a7f94204a4a35), U64_C (0xda9e95a944dada4f),
+ U64_C (0x58fa25b0a258587d), U64_C (0xc906ca8fcfc9c903),
+ U64_C (0x29558d527c2929a4), U64_C (0x0a5022145a0a0a28),
+ U64_C (0xb1e14f7f50b1b1fe), U64_C (0xa0691a5dc9a0a0ba),
+ U64_C (0x6b7fdad6146b6bb1), U64_C (0x855cab17d985852e),
+ U64_C (0xbd8173673cbdbdce), U64_C (0x5dd234ba8f5d5d69),
+ U64_C (0x1080502090101040), U64_C (0xf4f303f507f4f4f7),
+ U64_C (0xcb16c08bddcbcb0b), U64_C (0x3eedc67cd33e3ef8),
+ U64_C (0x0528110a2d050514), U64_C (0x671fe6ce78676781),
+ U64_C (0xe47353d597e4e4b7), U64_C (0x2725bb4e0227279c),
+ U64_C (0x4132588273414119), U64_C (0x8b2c9d0ba78b8b16),
+ U64_C (0xa7510153f6a7a7a6), U64_C (0x7dcf94fab27d7de9),
+ U64_C (0x95dcfb374995956e), U64_C (0xd88e9fad56d8d847),
+ U64_C (0xfb8b30eb70fbfbcb), U64_C (0xee2371c1cdeeee9f),
+ U64_C (0x7cc791f8bb7c7ced), U64_C (0x6617e3cc71666685),
+ U64_C (0xdda68ea77bdddd53), U64_C (0x17b84b2eaf17175c),
+ U64_C (0x4702468e45474701), U64_C (0x9e84dc211a9e9e42),
+ U64_C (0xca1ec589d4caca0f), U64_C (0x2d75995a582d2db4),
+ U64_C (0xbf9179632ebfbfc6), U64_C (0x07381b0e3f07071c),
+ U64_C (0xad012347acadad8e), U64_C (0x5aea2fb4b05a5a75),
+ U64_C (0x836cb51bef838336), U64_C (0x3385ff66b63333cc),
+ U64_C (0x633ff2c65c636391), U64_C (0x02100a0412020208),
+ U64_C (0xaa39384993aaaa92), U64_C (0x71afa8e2de7171d9),
+ U64_C (0xc80ecf8dc6c8c807), U64_C (0x19c87d32d1191964),
+ U64_C (0x497270923b494939), U64_C (0xd9869aaf5fd9d943),
+ U64_C (0xf2c31df931f2f2ef), U64_C (0xe34b48dba8e3e3ab),
+ U64_C (0x5be22ab6b95b5b71), U64_C (0x8834920dbc88881a),
+ U64_C (0x9aa4c8293e9a9a52), U64_C (0x262dbe4c0b262698),
+ U64_C (0x328dfa64bf3232c8), U64_C (0xb0e94a7d59b0b0fa),
+ U64_C (0xe91b6acff2e9e983), U64_C (0x0f78331e770f0f3c),
+ U64_C (0xd5e6a6b733d5d573), U64_C (0x8074ba1df480803a),
+ U64_C (0xbe997c6127bebec2), U64_C (0xcd26de87ebcdcd13),
+ U64_C (0x34bde468893434d0), U64_C (0x487a75903248483d),
+ U64_C (0xffab24e354ffffdb), U64_C (0x7af78ff48d7a7af5),
+ U64_C (0x90f4ea3d6490907a), U64_C (0x5fc23ebe9d5f5f61),
+ U64_C (0x201da0403d202080), U64_C (0x6867d5d00f6868bd),
+ U64_C (0x1ad07234ca1a1a68), U64_C (0xae192c41b7aeae82),
+ U64_C (0xb4c95e757db4b4ea), U64_C (0x549a19a8ce54544d),
+ U64_C (0x93ece53b7f939376), U64_C (0x220daa442f222288),
+ U64_C (0x6407e9c86364648d), U64_C (0xf1db12ff2af1f1e3),
+ U64_C (0x73bfa2e6cc7373d1), U64_C (0x12905a2482121248),
+ U64_C (0x403a5d807a40401d), U64_C (0x0840281048080820),
+ U64_C (0xc356e89b95c3c32b), U64_C (0xec337bc5dfecec97),
+ U64_C (0xdb9690ab4ddbdb4b), U64_C (0xa1611f5fc0a1a1be),
+ U64_C (0x8d1c8307918d8d0e), U64_C (0x3df5c97ac83d3df4),
+ U64_C (0x97ccf1335b979766), U64_C (0x0000000000000000),
+ U64_C (0xcf36d483f9cfcf1b), U64_C (0x2b4587566e2b2bac),
+ U64_C (0x7697b3ece17676c5), U64_C (0x8264b019e6828232),
+ U64_C (0xd6fea9b128d6d67f), U64_C (0x1bd87736c31b1b6c),
+ U64_C (0xb5c15b7774b5b5ee), U64_C (0xaf112943beafaf86),
+ U64_C (0x6a77dfd41d6a6ab5), U64_C (0x50ba0da0ea50505d),
+ U64_C (0x45124c8a57454509), U64_C (0xf3cb18fb38f3f3eb),
+ U64_C (0x309df060ad3030c0), U64_C (0xef2b74c3c4efef9b),
+ U64_C (0x3fe5c37eda3f3ffc), U64_C (0x55921caac7555549),
+ U64_C (0xa2791059dba2a2b2), U64_C (0xea0365c9e9eaea8f),
+ U64_C (0x650fecca6a656589), U64_C (0xbab9686903babad2),
+ U64_C (0x2f65935e4a2f2fbc), U64_C (0xc04ee79d8ec0c027),
+ U64_C (0xdebe81a160dede5f), U64_C (0x1ce06c38fc1c1c70),
+ U64_C (0xfdbb2ee746fdfdd3), U64_C (0x4d52649a1f4d4d29),
+ U64_C (0x92e4e03976929272), U64_C (0x758fbceafa7575c9),
+ U64_C (0x06301e0c36060618), U64_C (0x8a249809ae8a8a12),
+ U64_C (0xb2f940794bb2b2f2), U64_C (0xe66359d185e6e6bf),
+ U64_C (0x0e70361c7e0e0e38), U64_C (0x1ff8633ee71f1f7c),
+ U64_C (0x6237f7c455626295), U64_C (0xd4eea3b53ad4d477),
+ U64_C (0xa829324d81a8a89a), U64_C (0x96c4f43152969662),
+ U64_C (0xf99b3aef62f9f9c3), U64_C (0xc566f697a3c5c533),
+ U64_C (0x2535b14a10252594), U64_C (0x59f220b2ab595979),
+ U64_C (0x8454ae15d084842a), U64_C (0x72b7a7e4c57272d5),
+ U64_C (0x39d5dd72ec3939e4), U64_C (0x4c5a6198164c4c2d),
+ U64_C (0x5eca3bbc945e5e65), U64_C (0x78e785f09f7878fd),
+ U64_C (0x38ddd870e53838e0), U64_C (0x8c148605988c8c0a),
+ U64_C (0xd1c6b2bf17d1d163), U64_C (0xa5410b57e4a5a5ae),
+ U64_C (0xe2434dd9a1e2e2af), U64_C (0x612ff8c24e616199),
+ U64_C (0xb3f1457b42b3b3f6), U64_C (0x2115a54234212184),
+ U64_C (0x9c94d625089c9c4a), U64_C (0x1ef0663cee1e1e78),
+ U64_C (0x4322528661434311), U64_C (0xc776fc93b1c7c73b),
+ U64_C (0xfcb32be54ffcfcd7), U64_C (0x0420140824040410),
+ U64_C (0x51b208a2e3515159), U64_C (0x99bcc72f2599995e),
+ U64_C (0x6d4fc4da226d6da9), U64_C (0x0d68391a650d0d34),
+ U64_C (0xfa8335e979fafacf), U64_C (0xdfb684a369dfdf5b),
+ U64_C (0x7ed79bfca97e7ee5), U64_C (0x243db44819242490),
+ U64_C (0x3bc5d776fe3b3bec), U64_C (0xab313d4b9aabab96),
+ U64_C (0xce3ed181f0cece1f), U64_C (0x1188552299111144),
+ U64_C (0x8f0c8903838f8f06), U64_C (0x4e4a6b9c044e4e25),
+ U64_C (0xb7d1517366b7b7e6), U64_C (0xeb0b60cbe0ebeb8b),
+ U64_C (0x3cfdcc78c13c3cf0), U64_C (0x817cbf1ffd81813e),
+ U64_C (0x94d4fe354094946a), U64_C (0xf7eb0cf31cf7f7fb),
+ U64_C (0xb9a1676f18b9b9de), U64_C (0x13985f268b13134c),
+ U64_C (0x2c7d9c58512c2cb0), U64_C (0xd3d6b8bb05d3d36b),
+ U64_C (0xe76b5cd38ce7e7bb), U64_C (0x6e57cbdc396e6ea5),
+ U64_C (0xc46ef395aac4c437), U64_C (0x03180f061b03030c),
+ U64_C (0x568a13acdc565645), U64_C (0x441a49885e44440d),
+ U64_C (0x7fdf9efea07f7fe1), U64_C (0xa921374f88a9a99e),
+ U64_C (0x2a4d8254672a2aa8), U64_C (0xbbb16d6b0abbbbd6),
+ U64_C (0xc146e29f87c1c123), U64_C (0x53a202a6f1535351),
+ U64_C (0xdcae8ba572dcdc57), U64_C (0x0b582716530b0b2c),
+ U64_C (0x9d9cd327019d9d4e), U64_C (0x6c47c1d82b6c6cad),
+ U64_C (0x3195f562a43131c4), U64_C (0x7487b9e8f37474cd),
+ U64_C (0xf6e309f115f6f6ff), U64_C (0x460a438c4c464605),
+ U64_C (0xac092645a5acac8a), U64_C (0x893c970fb589891e),
+ U64_C (0x14a04428b4141450), U64_C (0xe15b42dfbae1e1a3),
+ U64_C (0x16b04e2ca6161658), U64_C (0x3acdd274f73a3ae8),
+ U64_C (0x696fd0d2066969b9), U64_C (0x09482d1241090924),
+ U64_C (0x70a7ade0d77070dd), U64_C (0xb6d954716fb6b6e2),
+ U64_C (0xd0ceb7bd1ed0d067), U64_C (0xed3b7ec7d6eded93),
+ U64_C (0xcc2edb85e2cccc17), U64_C (0x422a578468424215),
+ U64_C (0x98b4c22d2c98985a), U64_C (0xa4490e55eda4a4aa),
+ U64_C (0x285d8850752828a0), U64_C (0x5cda31b8865c5c6d),
+ U64_C (0xf8933fed6bf8f8c7), U64_C (0x8644a411c2868622),
+ }, {
+ U64_C (0x6018c07830d81818), U64_C (0x8c2305af46262323),
+ U64_C (0x3fc67ef991b8c6c6), U64_C (0x87e8136fcdfbe8e8),
+ U64_C (0x26874ca113cb8787), U64_C (0xdab8a9626d11b8b8),
+ U64_C (0x0401080502090101), U64_C (0x214f426e9e0d4f4f),
+ U64_C (0xd836adee6c9b3636), U64_C (0xa2a6590451ffa6a6),
+ U64_C (0x6fd2debdb90cd2d2), U64_C (0xf3f5fb06f70ef5f5),
+ U64_C (0xf979ef80f2967979), U64_C (0xa16f5fcede306f6f),
+ U64_C (0x7e91fcef3f6d9191), U64_C (0x5552aa07a4f85252),
+ U64_C (0x9d6027fdc0476060), U64_C (0xcabc89766535bcbc),
+ U64_C (0x569baccd2b379b9b), U64_C (0x028e048c018a8e8e),
+ U64_C (0xb6a371155bd2a3a3), U64_C (0x300c603c186c0c0c),
+ U64_C (0xf17bff8af6847b7b), U64_C (0xd435b5e16a803535),
+ U64_C (0x741de8693af51d1d), U64_C (0xa7e05347ddb3e0e0),
+ U64_C (0x7bd7f6acb321d7d7), U64_C (0x2fc25eed999cc2c2),
+ U64_C (0xb82e6d965c432e2e), U64_C (0x314b627a96294b4b),
+ U64_C (0xdffea321e15dfefe), U64_C (0x41578216aed55757),
+ U64_C (0x5415a8412abd1515), U64_C (0xc1779fb6eee87777),
+ U64_C (0xdc37a5eb6e923737), U64_C (0xb3e57b56d79ee5e5),
+ U64_C (0x469f8cd923139f9f), U64_C (0xe7f0d317fd23f0f0),
+ U64_C (0x354a6a7f94204a4a), U64_C (0x4fda9e95a944dada),
+ U64_C (0x7d58fa25b0a25858), U64_C (0x03c906ca8fcfc9c9),
+ U64_C (0xa429558d527c2929), U64_C (0x280a5022145a0a0a),
+ U64_C (0xfeb1e14f7f50b1b1), U64_C (0xbaa0691a5dc9a0a0),
+ U64_C (0xb16b7fdad6146b6b), U64_C (0x2e855cab17d98585),
+ U64_C (0xcebd8173673cbdbd), U64_C (0x695dd234ba8f5d5d),
+ U64_C (0x4010805020901010), U64_C (0xf7f4f303f507f4f4),
+ U64_C (0x0bcb16c08bddcbcb), U64_C (0xf83eedc67cd33e3e),
+ U64_C (0x140528110a2d0505), U64_C (0x81671fe6ce786767),
+ U64_C (0xb7e47353d597e4e4), U64_C (0x9c2725bb4e022727),
+ U64_C (0x1941325882734141), U64_C (0x168b2c9d0ba78b8b),
+ U64_C (0xa6a7510153f6a7a7), U64_C (0xe97dcf94fab27d7d),
+ U64_C (0x6e95dcfb37499595), U64_C (0x47d88e9fad56d8d8),
+ U64_C (0xcbfb8b30eb70fbfb), U64_C (0x9fee2371c1cdeeee),
+ U64_C (0xed7cc791f8bb7c7c), U64_C (0x856617e3cc716666),
+ U64_C (0x53dda68ea77bdddd), U64_C (0x5c17b84b2eaf1717),
+ U64_C (0x014702468e454747), U64_C (0x429e84dc211a9e9e),
+ U64_C (0x0fca1ec589d4caca), U64_C (0xb42d75995a582d2d),
+ U64_C (0xc6bf9179632ebfbf), U64_C (0x1c07381b0e3f0707),
+ U64_C (0x8ead012347acadad), U64_C (0x755aea2fb4b05a5a),
+ U64_C (0x36836cb51bef8383), U64_C (0xcc3385ff66b63333),
+ U64_C (0x91633ff2c65c6363), U64_C (0x0802100a04120202),
+ U64_C (0x92aa39384993aaaa), U64_C (0xd971afa8e2de7171),
+ U64_C (0x07c80ecf8dc6c8c8), U64_C (0x6419c87d32d11919),
+ U64_C (0x39497270923b4949), U64_C (0x43d9869aaf5fd9d9),
+ U64_C (0xeff2c31df931f2f2), U64_C (0xabe34b48dba8e3e3),
+ U64_C (0x715be22ab6b95b5b), U64_C (0x1a8834920dbc8888),
+ U64_C (0x529aa4c8293e9a9a), U64_C (0x98262dbe4c0b2626),
+ U64_C (0xc8328dfa64bf3232), U64_C (0xfab0e94a7d59b0b0),
+ U64_C (0x83e91b6acff2e9e9), U64_C (0x3c0f78331e770f0f),
+ U64_C (0x73d5e6a6b733d5d5), U64_C (0x3a8074ba1df48080),
+ U64_C (0xc2be997c6127bebe), U64_C (0x13cd26de87ebcdcd),
+ U64_C (0xd034bde468893434), U64_C (0x3d487a7590324848),
+ U64_C (0xdbffab24e354ffff), U64_C (0xf57af78ff48d7a7a),
+ U64_C (0x7a90f4ea3d649090), U64_C (0x615fc23ebe9d5f5f),
+ U64_C (0x80201da0403d2020), U64_C (0xbd6867d5d00f6868),
+ U64_C (0x681ad07234ca1a1a), U64_C (0x82ae192c41b7aeae),
+ U64_C (0xeab4c95e757db4b4), U64_C (0x4d549a19a8ce5454),
+ U64_C (0x7693ece53b7f9393), U64_C (0x88220daa442f2222),
+ U64_C (0x8d6407e9c8636464), U64_C (0xe3f1db12ff2af1f1),
+ U64_C (0xd173bfa2e6cc7373), U64_C (0x4812905a24821212),
+ U64_C (0x1d403a5d807a4040), U64_C (0x2008402810480808),
+ U64_C (0x2bc356e89b95c3c3), U64_C (0x97ec337bc5dfecec),
+ U64_C (0x4bdb9690ab4ddbdb), U64_C (0xbea1611f5fc0a1a1),
+ U64_C (0x0e8d1c8307918d8d), U64_C (0xf43df5c97ac83d3d),
+ U64_C (0x6697ccf1335b9797), U64_C (0x0000000000000000),
+ U64_C (0x1bcf36d483f9cfcf), U64_C (0xac2b4587566e2b2b),
+ U64_C (0xc57697b3ece17676), U64_C (0x328264b019e68282),
+ U64_C (0x7fd6fea9b128d6d6), U64_C (0x6c1bd87736c31b1b),
+ U64_C (0xeeb5c15b7774b5b5), U64_C (0x86af112943beafaf),
+ U64_C (0xb56a77dfd41d6a6a), U64_C (0x5d50ba0da0ea5050),
+ U64_C (0x0945124c8a574545), U64_C (0xebf3cb18fb38f3f3),
+ U64_C (0xc0309df060ad3030), U64_C (0x9bef2b74c3c4efef),
+ U64_C (0xfc3fe5c37eda3f3f), U64_C (0x4955921caac75555),
+ U64_C (0xb2a2791059dba2a2), U64_C (0x8fea0365c9e9eaea),
+ U64_C (0x89650fecca6a6565), U64_C (0xd2bab9686903baba),
+ U64_C (0xbc2f65935e4a2f2f), U64_C (0x27c04ee79d8ec0c0),
+ U64_C (0x5fdebe81a160dede), U64_C (0x701ce06c38fc1c1c),
+ U64_C (0xd3fdbb2ee746fdfd), U64_C (0x294d52649a1f4d4d),
+ U64_C (0x7292e4e039769292), U64_C (0xc9758fbceafa7575),
+ U64_C (0x1806301e0c360606), U64_C (0x128a249809ae8a8a),
+ U64_C (0xf2b2f940794bb2b2), U64_C (0xbfe66359d185e6e6),
+ U64_C (0x380e70361c7e0e0e), U64_C (0x7c1ff8633ee71f1f),
+ U64_C (0x956237f7c4556262), U64_C (0x77d4eea3b53ad4d4),
+ U64_C (0x9aa829324d81a8a8), U64_C (0x6296c4f431529696),
+ U64_C (0xc3f99b3aef62f9f9), U64_C (0x33c566f697a3c5c5),
+ U64_C (0x942535b14a102525), U64_C (0x7959f220b2ab5959),
+ U64_C (0x2a8454ae15d08484), U64_C (0xd572b7a7e4c57272),
+ U64_C (0xe439d5dd72ec3939), U64_C (0x2d4c5a6198164c4c),
+ U64_C (0x655eca3bbc945e5e), U64_C (0xfd78e785f09f7878),
+ U64_C (0xe038ddd870e53838), U64_C (0x0a8c148605988c8c),
+ U64_C (0x63d1c6b2bf17d1d1), U64_C (0xaea5410b57e4a5a5),
+ U64_C (0xafe2434dd9a1e2e2), U64_C (0x99612ff8c24e6161),
+ U64_C (0xf6b3f1457b42b3b3), U64_C (0x842115a542342121),
+ U64_C (0x4a9c94d625089c9c), U64_C (0x781ef0663cee1e1e),
+ U64_C (0x1143225286614343), U64_C (0x3bc776fc93b1c7c7),
+ U64_C (0xd7fcb32be54ffcfc), U64_C (0x1004201408240404),
+ U64_C (0x5951b208a2e35151), U64_C (0x5e99bcc72f259999),
+ U64_C (0xa96d4fc4da226d6d), U64_C (0x340d68391a650d0d),
+ U64_C (0xcffa8335e979fafa), U64_C (0x5bdfb684a369dfdf),
+ U64_C (0xe57ed79bfca97e7e), U64_C (0x90243db448192424),
+ U64_C (0xec3bc5d776fe3b3b), U64_C (0x96ab313d4b9aabab),
+ U64_C (0x1fce3ed181f0cece), U64_C (0x4411885522991111),
+ U64_C (0x068f0c8903838f8f), U64_C (0x254e4a6b9c044e4e),
+ U64_C (0xe6b7d1517366b7b7), U64_C (0x8beb0b60cbe0ebeb),
+ U64_C (0xf03cfdcc78c13c3c), U64_C (0x3e817cbf1ffd8181),
+ U64_C (0x6a94d4fe35409494), U64_C (0xfbf7eb0cf31cf7f7),
+ U64_C (0xdeb9a1676f18b9b9), U64_C (0x4c13985f268b1313),
+ U64_C (0xb02c7d9c58512c2c), U64_C (0x6bd3d6b8bb05d3d3),
+ U64_C (0xbbe76b5cd38ce7e7), U64_C (0xa56e57cbdc396e6e),
+ U64_C (0x37c46ef395aac4c4), U64_C (0x0c03180f061b0303),
+ U64_C (0x45568a13acdc5656), U64_C (0x0d441a49885e4444),
+ U64_C (0xe17fdf9efea07f7f), U64_C (0x9ea921374f88a9a9),
+ U64_C (0xa82a4d8254672a2a), U64_C (0xd6bbb16d6b0abbbb),
+ U64_C (0x23c146e29f87c1c1), U64_C (0x5153a202a6f15353),
+ U64_C (0x57dcae8ba572dcdc), U64_C (0x2c0b582716530b0b),
+ U64_C (0x4e9d9cd327019d9d), U64_C (0xad6c47c1d82b6c6c),
+ U64_C (0xc43195f562a43131), U64_C (0xcd7487b9e8f37474),
+ U64_C (0xfff6e309f115f6f6), U64_C (0x05460a438c4c4646),
+ U64_C (0x8aac092645a5acac), U64_C (0x1e893c970fb58989),
+ U64_C (0x5014a04428b41414), U64_C (0xa3e15b42dfbae1e1),
+ U64_C (0x5816b04e2ca61616), U64_C (0xe83acdd274f73a3a),
+ U64_C (0xb9696fd0d2066969), U64_C (0x2409482d12410909),
+ U64_C (0xdd70a7ade0d77070), U64_C (0xe2b6d954716fb6b6),
+ U64_C (0x67d0ceb7bd1ed0d0), U64_C (0x93ed3b7ec7d6eded),
+ U64_C (0x17cc2edb85e2cccc), U64_C (0x15422a5784684242),
+ U64_C (0x5a98b4c22d2c9898), U64_C (0xaaa4490e55eda4a4),
+ U64_C (0xa0285d8850752828), U64_C (0x6d5cda31b8865c5c),
+ U64_C (0xc7f8933fed6bf8f8), U64_C (0x228644a411c28686),
+ }, {
+ U64_C (0x186018c07830d818), U64_C (0x238c2305af462623),
+ U64_C (0xc63fc67ef991b8c6), U64_C (0xe887e8136fcdfbe8),
+ U64_C (0x8726874ca113cb87), U64_C (0xb8dab8a9626d11b8),
+ U64_C (0x0104010805020901), U64_C (0x4f214f426e9e0d4f),
+ U64_C (0x36d836adee6c9b36), U64_C (0xa6a2a6590451ffa6),
+ U64_C (0xd26fd2debdb90cd2), U64_C (0xf5f3f5fb06f70ef5),
+ U64_C (0x79f979ef80f29679), U64_C (0x6fa16f5fcede306f),
+ U64_C (0x917e91fcef3f6d91), U64_C (0x525552aa07a4f852),
+ U64_C (0x609d6027fdc04760), U64_C (0xbccabc89766535bc),
+ U64_C (0x9b569baccd2b379b), U64_C (0x8e028e048c018a8e),
+ U64_C (0xa3b6a371155bd2a3), U64_C (0x0c300c603c186c0c),
+ U64_C (0x7bf17bff8af6847b), U64_C (0x35d435b5e16a8035),
+ U64_C (0x1d741de8693af51d), U64_C (0xe0a7e05347ddb3e0),
+ U64_C (0xd77bd7f6acb321d7), U64_C (0xc22fc25eed999cc2),
+ U64_C (0x2eb82e6d965c432e), U64_C (0x4b314b627a96294b),
+ U64_C (0xfedffea321e15dfe), U64_C (0x5741578216aed557),
+ U64_C (0x155415a8412abd15), U64_C (0x77c1779fb6eee877),
+ U64_C (0x37dc37a5eb6e9237), U64_C (0xe5b3e57b56d79ee5),
+ U64_C (0x9f469f8cd923139f), U64_C (0xf0e7f0d317fd23f0),
+ U64_C (0x4a354a6a7f94204a), U64_C (0xda4fda9e95a944da),
+ U64_C (0x587d58fa25b0a258), U64_C (0xc903c906ca8fcfc9),
+ U64_C (0x29a429558d527c29), U64_C (0x0a280a5022145a0a),
+ U64_C (0xb1feb1e14f7f50b1), U64_C (0xa0baa0691a5dc9a0),
+ U64_C (0x6bb16b7fdad6146b), U64_C (0x852e855cab17d985),
+ U64_C (0xbdcebd8173673cbd), U64_C (0x5d695dd234ba8f5d),
+ U64_C (0x1040108050209010), U64_C (0xf4f7f4f303f507f4),
+ U64_C (0xcb0bcb16c08bddcb), U64_C (0x3ef83eedc67cd33e),
+ U64_C (0x05140528110a2d05), U64_C (0x6781671fe6ce7867),
+ U64_C (0xe4b7e47353d597e4), U64_C (0x279c2725bb4e0227),
+ U64_C (0x4119413258827341), U64_C (0x8b168b2c9d0ba78b),
+ U64_C (0xa7a6a7510153f6a7), U64_C (0x7de97dcf94fab27d),
+ U64_C (0x956e95dcfb374995), U64_C (0xd847d88e9fad56d8),
+ U64_C (0xfbcbfb8b30eb70fb), U64_C (0xee9fee2371c1cdee),
+ U64_C (0x7ced7cc791f8bb7c), U64_C (0x66856617e3cc7166),
+ U64_C (0xdd53dda68ea77bdd), U64_C (0x175c17b84b2eaf17),
+ U64_C (0x47014702468e4547), U64_C (0x9e429e84dc211a9e),
+ U64_C (0xca0fca1ec589d4ca), U64_C (0x2db42d75995a582d),
+ U64_C (0xbfc6bf9179632ebf), U64_C (0x071c07381b0e3f07),
+ U64_C (0xad8ead012347acad), U64_C (0x5a755aea2fb4b05a),
+ U64_C (0x8336836cb51bef83), U64_C (0x33cc3385ff66b633),
+ U64_C (0x6391633ff2c65c63), U64_C (0x020802100a041202),
+ U64_C (0xaa92aa39384993aa), U64_C (0x71d971afa8e2de71),
+ U64_C (0xc807c80ecf8dc6c8), U64_C (0x196419c87d32d119),
+ U64_C (0x4939497270923b49), U64_C (0xd943d9869aaf5fd9),
+ U64_C (0xf2eff2c31df931f2), U64_C (0xe3abe34b48dba8e3),
+ U64_C (0x5b715be22ab6b95b), U64_C (0x881a8834920dbc88),
+ U64_C (0x9a529aa4c8293e9a), U64_C (0x2698262dbe4c0b26),
+ U64_C (0x32c8328dfa64bf32), U64_C (0xb0fab0e94a7d59b0),
+ U64_C (0xe983e91b6acff2e9), U64_C (0x0f3c0f78331e770f),
+ U64_C (0xd573d5e6a6b733d5), U64_C (0x803a8074ba1df480),
+ U64_C (0xbec2be997c6127be), U64_C (0xcd13cd26de87ebcd),
+ U64_C (0x34d034bde4688934), U64_C (0x483d487a75903248),
+ U64_C (0xffdbffab24e354ff), U64_C (0x7af57af78ff48d7a),
+ U64_C (0x907a90f4ea3d6490), U64_C (0x5f615fc23ebe9d5f),
+ U64_C (0x2080201da0403d20), U64_C (0x68bd6867d5d00f68),
+ U64_C (0x1a681ad07234ca1a), U64_C (0xae82ae192c41b7ae),
+ U64_C (0xb4eab4c95e757db4), U64_C (0x544d549a19a8ce54),
+ U64_C (0x937693ece53b7f93), U64_C (0x2288220daa442f22),
+ U64_C (0x648d6407e9c86364), U64_C (0xf1e3f1db12ff2af1),
+ U64_C (0x73d173bfa2e6cc73), U64_C (0x124812905a248212),
+ U64_C (0x401d403a5d807a40), U64_C (0x0820084028104808),
+ U64_C (0xc32bc356e89b95c3), U64_C (0xec97ec337bc5dfec),
+ U64_C (0xdb4bdb9690ab4ddb), U64_C (0xa1bea1611f5fc0a1),
+ U64_C (0x8d0e8d1c8307918d), U64_C (0x3df43df5c97ac83d),
+ U64_C (0x976697ccf1335b97), U64_C (0x0000000000000000),
+ U64_C (0xcf1bcf36d483f9cf), U64_C (0x2bac2b4587566e2b),
+ U64_C (0x76c57697b3ece176), U64_C (0x82328264b019e682),
+ U64_C (0xd67fd6fea9b128d6), U64_C (0x1b6c1bd87736c31b),
+ U64_C (0xb5eeb5c15b7774b5), U64_C (0xaf86af112943beaf),
+ U64_C (0x6ab56a77dfd41d6a), U64_C (0x505d50ba0da0ea50),
+ U64_C (0x450945124c8a5745), U64_C (0xf3ebf3cb18fb38f3),
+ U64_C (0x30c0309df060ad30), U64_C (0xef9bef2b74c3c4ef),
+ U64_C (0x3ffc3fe5c37eda3f), U64_C (0x554955921caac755),
+ U64_C (0xa2b2a2791059dba2), U64_C (0xea8fea0365c9e9ea),
+ U64_C (0x6589650fecca6a65), U64_C (0xbad2bab9686903ba),
+ U64_C (0x2fbc2f65935e4a2f), U64_C (0xc027c04ee79d8ec0),
+ U64_C (0xde5fdebe81a160de), U64_C (0x1c701ce06c38fc1c),
+ U64_C (0xfdd3fdbb2ee746fd), U64_C (0x4d294d52649a1f4d),
+ U64_C (0x927292e4e0397692), U64_C (0x75c9758fbceafa75),
+ U64_C (0x061806301e0c3606), U64_C (0x8a128a249809ae8a),
+ U64_C (0xb2f2b2f940794bb2), U64_C (0xe6bfe66359d185e6),
+ U64_C (0x0e380e70361c7e0e), U64_C (0x1f7c1ff8633ee71f),
+ U64_C (0x62956237f7c45562), U64_C (0xd477d4eea3b53ad4),
+ U64_C (0xa89aa829324d81a8), U64_C (0x966296c4f4315296),
+ U64_C (0xf9c3f99b3aef62f9), U64_C (0xc533c566f697a3c5),
+ U64_C (0x25942535b14a1025), U64_C (0x597959f220b2ab59),
+ U64_C (0x842a8454ae15d084), U64_C (0x72d572b7a7e4c572),
+ U64_C (0x39e439d5dd72ec39), U64_C (0x4c2d4c5a6198164c),
+ U64_C (0x5e655eca3bbc945e), U64_C (0x78fd78e785f09f78),
+ U64_C (0x38e038ddd870e538), U64_C (0x8c0a8c148605988c),
+ U64_C (0xd163d1c6b2bf17d1), U64_C (0xa5aea5410b57e4a5),
+ U64_C (0xe2afe2434dd9a1e2), U64_C (0x6199612ff8c24e61),
+ U64_C (0xb3f6b3f1457b42b3), U64_C (0x21842115a5423421),
+ U64_C (0x9c4a9c94d625089c), U64_C (0x1e781ef0663cee1e),
+ U64_C (0x4311432252866143), U64_C (0xc73bc776fc93b1c7),
+ U64_C (0xfcd7fcb32be54ffc), U64_C (0x0410042014082404),
+ U64_C (0x515951b208a2e351), U64_C (0x995e99bcc72f2599),
+ U64_C (0x6da96d4fc4da226d), U64_C (0x0d340d68391a650d),
+ U64_C (0xfacffa8335e979fa), U64_C (0xdf5bdfb684a369df),
+ U64_C (0x7ee57ed79bfca97e), U64_C (0x2490243db4481924),
+ U64_C (0x3bec3bc5d776fe3b), U64_C (0xab96ab313d4b9aab),
+ U64_C (0xce1fce3ed181f0ce), U64_C (0x1144118855229911),
+ U64_C (0x8f068f0c8903838f), U64_C (0x4e254e4a6b9c044e),
+ U64_C (0xb7e6b7d1517366b7), U64_C (0xeb8beb0b60cbe0eb),
+ U64_C (0x3cf03cfdcc78c13c), U64_C (0x813e817cbf1ffd81),
+ U64_C (0x946a94d4fe354094), U64_C (0xf7fbf7eb0cf31cf7),
+ U64_C (0xb9deb9a1676f18b9), U64_C (0x134c13985f268b13),
+ U64_C (0x2cb02c7d9c58512c), U64_C (0xd36bd3d6b8bb05d3),
+ U64_C (0xe7bbe76b5cd38ce7), U64_C (0x6ea56e57cbdc396e),
+ U64_C (0xc437c46ef395aac4), U64_C (0x030c03180f061b03),
+ U64_C (0x5645568a13acdc56), U64_C (0x440d441a49885e44),
+ U64_C (0x7fe17fdf9efea07f), U64_C (0xa99ea921374f88a9),
+ U64_C (0x2aa82a4d8254672a), U64_C (0xbbd6bbb16d6b0abb),
+ U64_C (0xc123c146e29f87c1), U64_C (0x535153a202a6f153),
+ U64_C (0xdc57dcae8ba572dc), U64_C (0x0b2c0b582716530b),
+ U64_C (0x9d4e9d9cd327019d), U64_C (0x6cad6c47c1d82b6c),
+ U64_C (0x31c43195f562a431), U64_C (0x74cd7487b9e8f374),
+ U64_C (0xf6fff6e309f115f6), U64_C (0x4605460a438c4c46),
+ U64_C (0xac8aac092645a5ac), U64_C (0x891e893c970fb589),
+ U64_C (0x145014a04428b414), U64_C (0xe1a3e15b42dfbae1),
+ U64_C (0x165816b04e2ca616), U64_C (0x3ae83acdd274f73a),
+ U64_C (0x69b9696fd0d20669), U64_C (0x092409482d124109),
+ U64_C (0x70dd70a7ade0d770), U64_C (0xb6e2b6d954716fb6),
+ U64_C (0xd067d0ceb7bd1ed0), U64_C (0xed93ed3b7ec7d6ed),
+ U64_C (0xcc17cc2edb85e2cc), U64_C (0x4215422a57846842),
+ U64_C (0x985a98b4c22d2c98), U64_C (0xa4aaa4490e55eda4),
+ U64_C (0x28a0285d88507528), U64_C (0x5c6d5cda31b8865c),
+ U64_C (0xf8c7f8933fed6bf8), U64_C (0x86228644a411c286),
+ } }
+};
+#define C tab.C
+#define C0 C[0]
+#define C1 C[1]
+#define C2 C[2]
+#define C3 C[3]
+#define C4 C[4]
+#define C5 C[5]
+#define C6 C[6]
+#define C7 C[7]
+#define rc tab.RC
+
+
+
+static unsigned int
+whirlpool_transform (void *ctx, const unsigned char *data, size_t nblks);
+
+
+
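+/* Initialize the hash context.  When the GCRY_MD_FLAG_BUGEMU1 flag is
+   given, the historic bug-compatible message length handling is
+   enabled (see whirlpool_add_bugemu below).  */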
+static void
+whirlpool_init (void *ctx, unsigned int flags)
+{
+ whirlpool_context_t *context = ctx;
+
+ memset (context, 0, sizeof (*context));
+
+ context->bctx.blocksize_shift = _gcry_ctz(BLOCK_SIZE);
+ context->bctx.bwrite = whirlpool_transform;
+ if ((flags & GCRY_MD_FLAG_BUGEMU1))
+ {
+ memset (&context->bugemu, 0, sizeof context->bugemu);
+ context->use_bugemu = 1;
+ }
+ else
+ context->use_bugemu = 0;
+}
+
+
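+/* On x86-64 builds the block transform is provided by an assembly
+   implementation; otherwise the generic C transform below is used.  */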
+#ifdef USE_AMD64_ASM
+
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# define ASM_EXTRA_STACK (10 * 16)
+#else
+# define ASM_FUNC_ABI
+# define ASM_EXTRA_STACK 0
+#endif
+
+extern unsigned int
+_gcry_whirlpool_transform_amd64(u64 *state, const unsigned char *data,
+ size_t nblks, const struct whirlpool_tables_s *tables) ASM_FUNC_ABI;
+
+static unsigned int
+whirlpool_transform (void *ctx, const unsigned char *data, size_t nblks)
+{
+ whirlpool_context_t *context = ctx;
+
+ return _gcry_whirlpool_transform_amd64(
+ context->hash_state, data, nblks, &tab) + ASM_EXTRA_STACK;
+}
+
+#else /* USE_AMD64_ASM */
+
+/*
+ * Transform block.
+ */
+static unsigned int
+whirlpool_transform_blk (void *ctx, const unsigned char *data)
+{
+ whirlpool_context_t *context = ctx;
+ whirlpool_block_t data_block;
+ whirlpool_block_t key;
+ whirlpool_block_t state;
+ whirlpool_block_t block;
+ unsigned int r;
+ unsigned int i;
+
+ buffer_to_block (data, data_block, i);
+ block_copy (key, context->hash_state, i);
+ block_copy (state, context->hash_state, i);
+ block_xor (state, data_block, i);
+
+ for (r = 0; r < R; r++)
+ {
+ /* Compute round key K^r. */
+
+ block[0] = (C0[(key[0] >> 56) & 0xFF] ^ C1[(key[7] >> 48) & 0xFF] ^
+ C2[(key[6] >> 40) & 0xFF] ^ C3[(key[5] >> 32) & 0xFF] ^
+ C4[(key[4] >> 24) & 0xFF] ^ C5[(key[3] >> 16) & 0xFF] ^
+ C6[(key[2] >> 8) & 0xFF] ^ C7[(key[1] >> 0) & 0xFF] ^ rc[r]);
+ block[1] = (C0[(key[1] >> 56) & 0xFF] ^ C1[(key[0] >> 48) & 0xFF] ^
+ C2[(key[7] >> 40) & 0xFF] ^ C3[(key[6] >> 32) & 0xFF] ^
+ C4[(key[5] >> 24) & 0xFF] ^ C5[(key[4] >> 16) & 0xFF] ^
+ C6[(key[3] >> 8) & 0xFF] ^ C7[(key[2] >> 0) & 0xFF]);
+ block[2] = (C0[(key[2] >> 56) & 0xFF] ^ C1[(key[1] >> 48) & 0xFF] ^
+ C2[(key[0] >> 40) & 0xFF] ^ C3[(key[7] >> 32) & 0xFF] ^
+ C4[(key[6] >> 24) & 0xFF] ^ C5[(key[5] >> 16) & 0xFF] ^
+ C6[(key[4] >> 8) & 0xFF] ^ C7[(key[3] >> 0) & 0xFF]);
+ block[3] = (C0[(key[3] >> 56) & 0xFF] ^ C1[(key[2] >> 48) & 0xFF] ^
+ C2[(key[1] >> 40) & 0xFF] ^ C3[(key[0] >> 32) & 0xFF] ^
+ C4[(key[7] >> 24) & 0xFF] ^ C5[(key[6] >> 16) & 0xFF] ^
+ C6[(key[5] >> 8) & 0xFF] ^ C7[(key[4] >> 0) & 0xFF]);
+ block[4] = (C0[(key[4] >> 56) & 0xFF] ^ C1[(key[3] >> 48) & 0xFF] ^
+ C2[(key[2] >> 40) & 0xFF] ^ C3[(key[1] >> 32) & 0xFF] ^
+ C4[(key[0] >> 24) & 0xFF] ^ C5[(key[7] >> 16) & 0xFF] ^
+ C6[(key[6] >> 8) & 0xFF] ^ C7[(key[5] >> 0) & 0xFF]);
+ block[5] = (C0[(key[5] >> 56) & 0xFF] ^ C1[(key[4] >> 48) & 0xFF] ^
+ C2[(key[3] >> 40) & 0xFF] ^ C3[(key[2] >> 32) & 0xFF] ^
+ C4[(key[1] >> 24) & 0xFF] ^ C5[(key[0] >> 16) & 0xFF] ^
+ C6[(key[7] >> 8) & 0xFF] ^ C7[(key[6] >> 0) & 0xFF]);
+ block[6] = (C0[(key[6] >> 56) & 0xFF] ^ C1[(key[5] >> 48) & 0xFF] ^
+ C2[(key[4] >> 40) & 0xFF] ^ C3[(key[3] >> 32) & 0xFF] ^
+ C4[(key[2] >> 24) & 0xFF] ^ C5[(key[1] >> 16) & 0xFF] ^
+ C6[(key[0] >> 8) & 0xFF] ^ C7[(key[7] >> 0) & 0xFF]);
+ block[7] = (C0[(key[7] >> 56) & 0xFF] ^ C1[(key[6] >> 48) & 0xFF] ^
+ C2[(key[5] >> 40) & 0xFF] ^ C3[(key[4] >> 32) & 0xFF] ^
+ C4[(key[3] >> 24) & 0xFF] ^ C5[(key[2] >> 16) & 0xFF] ^
+ C6[(key[1] >> 8) & 0xFF] ^ C7[(key[0] >> 0) & 0xFF]);
+ block_copy (key, block, i);
+
+ /* Apply r-th round transformation. */
+
+ block[0] = (C0[(state[0] >> 56) & 0xFF] ^ C1[(state[7] >> 48) & 0xFF] ^
+ C2[(state[6] >> 40) & 0xFF] ^ C3[(state[5] >> 32) & 0xFF] ^
+ C4[(state[4] >> 24) & 0xFF] ^ C5[(state[3] >> 16) & 0xFF] ^
+ C6[(state[2] >> 8) & 0xFF] ^ C7[(state[1] >> 0) & 0xFF] ^ key[0]);
+ block[1] = (C0[(state[1] >> 56) & 0xFF] ^ C1[(state[0] >> 48) & 0xFF] ^
+ C2[(state[7] >> 40) & 0xFF] ^ C3[(state[6] >> 32) & 0xFF] ^
+ C4[(state[5] >> 24) & 0xFF] ^ C5[(state[4] >> 16) & 0xFF] ^
+ C6[(state[3] >> 8) & 0xFF] ^ C7[(state[2] >> 0) & 0xFF] ^ key[1]);
+ block[2] = (C0[(state[2] >> 56) & 0xFF] ^ C1[(state[1] >> 48) & 0xFF] ^
+ C2[(state[0] >> 40) & 0xFF] ^ C3[(state[7] >> 32) & 0xFF] ^
+ C4[(state[6] >> 24) & 0xFF] ^ C5[(state[5] >> 16) & 0xFF] ^
+ C6[(state[4] >> 8) & 0xFF] ^ C7[(state[3] >> 0) & 0xFF] ^ key[2]);
+ block[3] = (C0[(state[3] >> 56) & 0xFF] ^ C1[(state[2] >> 48) & 0xFF] ^
+ C2[(state[1] >> 40) & 0xFF] ^ C3[(state[0] >> 32) & 0xFF] ^
+ C4[(state[7] >> 24) & 0xFF] ^ C5[(state[6] >> 16) & 0xFF] ^
+ C6[(state[5] >> 8) & 0xFF] ^ C7[(state[4] >> 0) & 0xFF] ^ key[3]);
+ block[4] = (C0[(state[4] >> 56) & 0xFF] ^ C1[(state[3] >> 48) & 0xFF] ^
+ C2[(state[2] >> 40) & 0xFF] ^ C3[(state[1] >> 32) & 0xFF] ^
+ C4[(state[0] >> 24) & 0xFF] ^ C5[(state[7] >> 16) & 0xFF] ^
+ C6[(state[6] >> 8) & 0xFF] ^ C7[(state[5] >> 0) & 0xFF] ^ key[4]);
+ block[5] = (C0[(state[5] >> 56) & 0xFF] ^ C1[(state[4] >> 48) & 0xFF] ^
+ C2[(state[3] >> 40) & 0xFF] ^ C3[(state[2] >> 32) & 0xFF] ^
+ C4[(state[1] >> 24) & 0xFF] ^ C5[(state[0] >> 16) & 0xFF] ^
+ C6[(state[7] >> 8) & 0xFF] ^ C7[(state[6] >> 0) & 0xFF] ^ key[5]);
+ block[6] = (C0[(state[6] >> 56) & 0xFF] ^ C1[(state[5] >> 48) & 0xFF] ^
+ C2[(state[4] >> 40) & 0xFF] ^ C3[(state[3] >> 32) & 0xFF] ^
+ C4[(state[2] >> 24) & 0xFF] ^ C5[(state[1] >> 16) & 0xFF] ^
+ C6[(state[0] >> 8) & 0xFF] ^ C7[(state[7] >> 0) & 0xFF] ^ key[6]);
+ block[7] = (C0[(state[7] >> 56) & 0xFF] ^ C1[(state[6] >> 48) & 0xFF] ^
+ C2[(state[5] >> 40) & 0xFF] ^ C3[(state[4] >> 32) & 0xFF] ^
+ C4[(state[3] >> 24) & 0xFF] ^ C5[(state[2] >> 16) & 0xFF] ^
+ C6[(state[1] >> 8) & 0xFF] ^ C7[(state[0] >> 0) & 0xFF] ^ key[7]);
+ block_copy (state, block, i);
+ }
+
+ /* Compression. */
+
+ block_xor (context->hash_state, data_block, i);
+ block_xor (context->hash_state, state, i);
+
+ return /*burn_stack*/ 4 * sizeof(whirlpool_block_t) + 2 * sizeof(int) +
+ 4 * sizeof(void*);
+}
+
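+/* Process NBLKS consecutive input blocks with the generic C transform
+   and return the stack burn depth of the last block.  */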
+static unsigned int
+whirlpool_transform ( void *c, const unsigned char *data, size_t nblks )
+{
+ unsigned int burn;
+
+ do
+ {
+ burn = whirlpool_transform_blk (c, data);
+ data += BLOCK_SIZE;
+ }
+ while (--nblks);
+
+ return burn;
+}
+
+#endif /* !USE_AMD64_ASM */
+
+
+/* Bug-compatible Whirlpool update function. */
+static void
+whirlpool_add_bugemu (whirlpool_context_t *context,
+ const void *buffer_arg, size_t buffer_n)
+{
+ const unsigned char *buffer = buffer_arg;
+ u64 buffer_size;
+ unsigned int carry;
+ unsigned int i;
+
+ buffer_size = buffer_n;
+
+ if (context->bugemu.count == BLOCK_SIZE)
+ {
+ /* Flush the buffer. */
+ whirlpool_transform (context, context->bctx.buf, 1);
+ context->bugemu.count = 0;
+ }
+ if (! buffer)
+ return; /* Nothing to add. */
+
+ if (context->bugemu.count)
+ {
+ while (buffer_n && (context->bugemu.count < BLOCK_SIZE))
+ {
+ context->bctx.buf[context->bugemu.count++] = *buffer++;
+ buffer_n--;
+ }
+ whirlpool_add_bugemu (context, NULL, 0);
+ if (!buffer_n)
+ return; /* Done. This is the bug we emulate. */
+ }
+
+ while (buffer_n >= BLOCK_SIZE)
+ {
+ whirlpool_transform (context, buffer, 1);
+ context->bugemu.count = 0;
+ buffer_n -= BLOCK_SIZE;
+ buffer += BLOCK_SIZE;
+ }
+ while (buffer_n && (context->bugemu.count < BLOCK_SIZE))
+ {
+ context->bctx.buf[context->bugemu.count++] = *buffer++;
+ buffer_n--;
+ }
+
+ /* Update bit counter. */
+ carry = 0;
+ buffer_size <<= 3;
+ for (i = 1; i <= 32; i++)
+ {
+ if (! (buffer_size || carry))
+ break;
+
+ carry += context->bugemu.length[32 - i] + (buffer_size & 0xFF);
+ context->bugemu.length[32 - i] = carry;
+ buffer_size >>= 8;
+ carry >>= 8;
+ }
+ gcry_assert (! (buffer_size || carry));
+}
+
+
+/* Bug-compatible Whirlpool finalization function. */
+static void
+whirlpool_final_bugemu (void *ctx)
+{
+ whirlpool_context_t *context = ctx;
+ unsigned int i;
+
+ /* Flush. */
+ whirlpool_add_bugemu (context, NULL, 0);
+
+ /* Pad. */
+ context->bctx.buf[context->bugemu.count++] = 0x80;
+
+ if (context->bugemu.count > 32)
+ {
+ /* An extra block is necessary. */
+ while (context->bugemu.count < 64)
+ context->bctx.buf[context->bugemu.count++] = 0;
+ whirlpool_add_bugemu (context, NULL, 0);
+ }
+ while (context->bugemu.count < 32)
+ context->bctx.buf[context->bugemu.count++] = 0;
+
+ /* Add length of message. */
+ memcpy (context->bctx.buf + context->bugemu.count,
+ context->bugemu.length, 32);
+ context->bugemu.count += 32;
+ whirlpool_add_bugemu (context, NULL, 0);
+
+ block_to_buffer (context->bctx.buf, context->hash_state, i);
+}
+
+
+static void
+whirlpool_write (void *ctx, const void *buffer, size_t buffer_n)
+{
+ whirlpool_context_t *context = ctx;
+
+ if (context->use_bugemu)
+ {
+ whirlpool_add_bugemu (context, buffer, buffer_n);
+ }
+ else
+ {
+ u64 old_nblocks = context->bctx.nblocks;
+
+ _gcry_md_block_write (context, buffer, buffer_n);
+
+ gcry_assert (old_nblocks <= context->bctx.nblocks);
+ }
+}
+
+static void
+whirlpool_final (void *ctx)
+{
+ whirlpool_context_t *context = ctx;
+ unsigned int i;
+ u64 t, th, lsb, msb;
+ unsigned char *length;
+
+ if (context->use_bugemu)
+ {
+ whirlpool_final_bugemu (ctx);
+ return;
+ }
+
+ t = context->bctx.nblocks;
+ /* if (sizeof t == sizeof context->bctx.nblocks) */
+ th = context->bctx.nblocks_high;
+ /* else */
+ /* th = context->bctx.nblocks >> 64; In case we ever use u128 */
+
+ /* multiply by 64 to make a byte count */
+ lsb = t << 6;
+ msb = (th << 6) | (t >> 58);
+ /* add the count */
+ t = lsb;
+ if ((lsb += context->bctx.count) < t)
+ msb++;
+ /* multiply by 8 to make a bit count */
+ t = lsb;
+ lsb <<= 3;
+ msb <<= 3;
+ msb |= t >> 61;
+
+ /* Flush. */
+ whirlpool_write (context, NULL, 0);
+
+ /* Pad. */
+ context->bctx.buf[context->bctx.count++] = 0x80;
+
+ if (context->bctx.count > 32)
+ {
+ /* An extra block is necessary. */
+ if (context->bctx.count < 64)
+ memset (&context->bctx.buf[context->bctx.count], 0,
+ 64 - context->bctx.count);
+ context->bctx.count = 64;
+ whirlpool_write (context, NULL, 0);
+ }
+ if (context->bctx.count < 32)
+ memset (&context->bctx.buf[context->bctx.count], 0,
+ 32 - context->bctx.count);
+ context->bctx.count = 32;
+
+ /* Add length of message. */
+ length = context->bctx.buf + context->bctx.count;
+ buf_put_be64(&length[0 * 8], 0);
+ buf_put_be64(&length[1 * 8], 0);
+ buf_put_be64(&length[2 * 8], msb);
+ buf_put_be64(&length[3 * 8], lsb);
+ context->bctx.count += 32;
+ whirlpool_write (context, NULL, 0);
+
+ block_to_buffer (context->bctx.buf, context->hash_state, i);
+}
+
+static byte *
+whirlpool_read (void *ctx)
+{
+ whirlpool_context_t *context = ctx;
+
+ return context->bctx.buf;
+}
+
+gcry_md_spec_t _gcry_digest_spec_whirlpool =
+ {
+ GCRY_MD_WHIRLPOOL, {0, 0},
+ "WHIRLPOOL", NULL, 0, NULL, 64,
+ whirlpool_init, whirlpool_write, whirlpool_final, whirlpool_read, NULL,
+ NULL, NULL,
+ sizeof (whirlpool_context_t)
+ };