diff --git a/Documentation/crypto/api.rst b/Documentation/crypto/api.rst
index 2e519193ab4a44401c571037c89e5c1fd4ed6640..b91b31736df8b778d98a9cfb49a712e9111341b9 100644
--- a/Documentation/crypto/api.rst
+++ b/Documentation/crypto/api.rst
@@ -1,15 +1,6 @@
 Programming Interface
 =====================
 
-Please note that the kernel crypto API contains the AEAD givcrypt API
-(crypto_aead_giv\* and aead_givcrypt\* function calls in
-include/crypto/aead.h). This API is obsolete and will be removed in the
-future. To obtain the functionality of an AEAD cipher with internal IV
-generation, use the IV generator as a regular cipher. For example,
-rfc4106(gcm(aes)) is the AEAD cipher with external IV generation and
-seqniv(rfc4106(gcm(aes))) implies that the kernel crypto API generates
-the IV. Different IV generators are available.
-
 .. class:: toc-title
 
 	   Table of contents
diff --git a/Documentation/crypto/architecture.rst b/Documentation/crypto/architecture.rst
index ca2d09b991f5de143d2b791603841cf2cebc6673..ee8ff0762d7fafd77a066b1d83700adc831ac435 100644
--- a/Documentation/crypto/architecture.rst
+++ b/Documentation/crypto/architecture.rst
@@ -157,10 +157,6 @@ applicable to a cipher, it is not displayed:
 
    -  rng for random number generator
 
-   -  givcipher for cipher with associated IV generator (see the geniv
-      entry below for the specification of the IV generator type used by
-      the cipher implementation)
-
    -  kpp for a Key-agreement Protocol Primitive (KPP) cipher such as
       an ECDH or DH implementation
 
@@ -174,16 +170,7 @@ applicable to a cipher, it is not displayed:
 
 -  digestsize: output size of the message digest
 
--  geniv: IV generation type:
-
-   -  eseqiv for encrypted sequence number based IV generation
-
-   -  seqiv for sequence number based IV generation
-
-   -  chainiv for chain iv generation
-
-   -  <builtin> is a marker that the cipher implements IV generation and
-      handling as it is specific to the given cipher
+-  geniv: IV generator (obsolete)
 
 Key Sizes
 ---------
@@ -218,10 +205,6 @@ the aforementioned cipher types:
 
 -  CRYPTO_ALG_TYPE_ABLKCIPHER Asynchronous multi-block cipher
 
--  CRYPTO_ALG_TYPE_GIVCIPHER Asynchronous multi-block cipher packed
-   together with an IV generator (see geniv field in the /proc/crypto
-   listing for the known IV generators)
-
 -  CRYPTO_ALG_TYPE_KPP Key-agreement Protocol Primitive (KPP) such as
    an ECDH or DH implementation
 
@@ -338,18 +321,14 @@ uses the API applicable to the cipher type specified for the block.
 
 The following call sequence is applicable when the IPSEC layer triggers
 an encryption operation with the esp_output function. During
-configuration, the administrator set up the use of rfc4106(gcm(aes)) as
-the cipher for ESP. The following call sequence is now depicted in the
-ASCII art above:
+configuration, the administrator set up the use of seqiv(rfc4106(gcm(aes)))
+as the cipher for ESP. The following call sequence is now depicted in
+the ASCII art above:
 
 1. esp_output() invokes crypto_aead_encrypt() to trigger an
    encryption operation of the AEAD cipher with IV generator.
 
-   In case of GCM, the SEQIV implementation is registered as GIVCIPHER
-   in crypto_rfc4106_alloc().
-
-   The SEQIV performs its operation to generate an IV where the core
-   function is seqiv_geniv().
+   The SEQIV generates the IV.
 
 2. Now, SEQIV uses the AEAD API function calls to invoke the associated
    AEAD cipher. In our case, during the instantiation of SEQIV, the
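
(Reader's note, not part of the patch: a minimal sketch of driving such an
AEAD with an internal IV generator through the regular AEAD API, matching the
call sequence described above.  The algorithm name follows the text; the key
layout, buffer layout and flat scatterlist are simplified assumptions, and
error handling and asynchronous completion are omitted.)

#include <crypto/aead.h>
#include <linux/err.h>
#include <linux/scatterlist.h>

static int demo_seqiv_rfc4106(const u8 *key, unsigned int keylen,
			      u8 *buf, unsigned int assoclen,
			      unsigned int datalen)
{
	struct crypto_aead *tfm;
	struct aead_request *req;
	struct scatterlist sg;
	u8 iv[8] = { 0 };		/* rfc4106 ivsize is 8 bytes */
	int err;

	tfm = crypto_alloc_aead("seqiv(rfc4106(gcm(aes)))", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_aead_setkey(tfm, key, keylen); /* AES key + 4-byte salt */
	if (!err)
		err = crypto_aead_setauthsize(tfm, 16);
	if (err)
		goto out_free_tfm;

	req = aead_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	/* associated data, then plaintext, then room for the 16-byte tag */
	sg_init_one(&sg, buf, assoclen + datalen + 16);
	aead_request_set_callback(req, 0, NULL, NULL);
	aead_request_set_ad(req, assoclen);
	aead_request_set_crypt(req, &sg, &sg, datalen, iv);

	err = crypto_aead_encrypt(req);	/* step 1 of the sequence above */

	aead_request_free(req);
out_free_tfm:
	crypto_free_aead(tfm);
	return err;
}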
diff --git a/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt b/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt
index 999fb2a810f67717bac503006f10cde1ce878262..6130e6eb4af89135280b44ea5a1904a2c6e6c7b8 100644
--- a/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt
+++ b/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt
@@ -1,8 +1,12 @@
 Arm TrustZone CryptoCell cryptographic engine
 
 Required properties:
-- compatible: Should be one of: "arm,cryptocell-712-ree",
-  "arm,cryptocell-710-ree" or "arm,cryptocell-630p-ree".
+- compatible: Should be one of:
+   "arm,cryptocell-713-ree"
+   "arm,cryptocell-703-ree"
+   "arm,cryptocell-712-ree"
+   "arm,cryptocell-710-ree"
+   "arm,cryptocell-630p-ree"
 - reg: Base physical address of the engine and length of memory mapped region.
 - interrupts: Interrupt number for the device.
 
diff --git a/Documentation/devicetree/bindings/crypto/fsl-dcp.txt b/Documentation/devicetree/bindings/crypto/fsl-dcp.txt
index 76a0b4e80e83edd46510783274d834748eaff7a9..4e4d387e38a5ccd93b9545889ae24e97998a8ee5 100644
--- a/Documentation/devicetree/bindings/crypto/fsl-dcp.txt
+++ b/Documentation/devicetree/bindings/crypto/fsl-dcp.txt
@@ -6,6 +6,8 @@ Required properties:
 - interrupts : Should contain MXS DCP interrupt numbers, VMI IRQ and DCP IRQ
                must be supplied, optionally Secure IRQ can be present, but
 	       is currently not implemented and not used.
+- clocks : Clock reference (only required on some SoCs: 6ull and 6sll).
+- clock-names : Must be "dcp".
 
 Example:
 
diff --git a/MAINTAINERS b/MAINTAINERS
index db3f690eb590d4f1192ab1cea971e94aee8cd97d..7a9804a891fdc68cb5f8c75663786d5d6577d9b9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3484,6 +3484,7 @@ F:	include/linux/spi/cc2520.h
 F:	Documentation/devicetree/bindings/net/ieee802154/cc2520.txt
 
 CCREE ARM TRUSTZONE CRYPTOCELL REE DRIVER
+M:	Yael Chemla <yael.chemla@foss.arm.com>
 M:	Gilad Ben-Yossef <gilad@benyossef.com>
 L:	linux-crypto@vger.kernel.org
 S:	Supported
@@ -7147,7 +7148,9 @@ F:	crypto/842.c
 F:	lib/842/
 
 IBM Power in-Nest Crypto Acceleration
-M:	Paulo Flabiano Smorigo <pfsmorigo@linux.ibm.com>
+M:	Breno Leitão <leitao@debian.org>
+M:	Nayna Jain <nayna@linux.ibm.com>
+M:	Paulo Flabiano Smorigo <pfsmorigo@gmail.com>
 L:	linux-crypto@vger.kernel.org
 S:	Supported
 F:	drivers/crypto/nx/Makefile
@@ -7211,7 +7214,9 @@ S:	Supported
 F:	drivers/scsi/ibmvscsi_tgt/
 
 IBM Power VMX Cryptographic instructions
-M:	Paulo Flabiano Smorigo <pfsmorigo@linux.ibm.com>
+M:	Breno Leitão <leitao@debian.org>
+M:	Nayna Jain <nayna@linux.ibm.com>
+M:	Paulo Flabiano Smorigo <pfsmorigo@gmail.com>
 L:	linux-crypto@vger.kernel.org
 S:	Supported
 F:	drivers/crypto/vmx/Makefile
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index ef0c7feea6e298bb2b02501dadbf6a71993f2e0c..a95322b59799f9162f366c141566e1893971561f 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -69,6 +69,15 @@ config CRYPTO_AES_ARM
 	help
 	  Use optimized AES assembler routines for ARM platforms.
 
+	  On ARM processors without the Crypto Extensions, this is the
+	  fastest AES implementation for single blocks.  For multiple
+	  blocks, the NEON bit-sliced implementation is usually faster.
+
+	  This implementation may be vulnerable to cache timing attacks,
+	  since it uses lookup tables.  However, as countermeasures it
+	  disables IRQs and preloads the tables; it is hoped this makes
+	  such attacks very difficult.
+
 config CRYPTO_AES_ARM_BS
 	tristate "Bit sliced AES using NEON instructions"
 	depends on KERNEL_MODE_NEON
@@ -117,9 +126,14 @@ config CRYPTO_CRC32_ARM_CE
 	select CRYPTO_HASH
 
 config CRYPTO_CHACHA20_NEON
-	tristate "NEON accelerated ChaCha20 symmetric cipher"
+	tristate "NEON accelerated ChaCha stream cipher algorithms"
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_CHACHA20
 
+config CRYPTO_NHPOLY1305_NEON
+	tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_NHPOLY1305
+
 endif
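
(Reader's note, not part of the patch: a conceptual C analogue of the
countermeasure described in the CRYPTO_AES_ARM help text above.  It assumes
the 256-entry u32 table crypto_ft_tab[0] as the lookup table; the real
implementation of this hardening is the assembly in aes-cipher-core.S further
down in this patch.)

#include <crypto/aes.h>
#include <linux/cache.h>
#include <linux/irqflags.h>

static u32 aes_prefetch_tables_example(void)
{
	unsigned long flags;
	u32 sink = 0;
	int i;

	/* No interruption between here and the end of the rounds ... */
	local_irq_save(flags);

	/*
	 * ... and touch one word of every cache line of the 1024-byte table
	 * so all of it is L1-resident before any key- or data-dependent
	 * lookup is made.
	 */
	for (i = 0; i < 256; i += L1_CACHE_BYTES / sizeof(u32))
		sink ^= READ_ONCE(crypto_ft_tab[0][i]);

	/* ... the table-driven AES rounds would run here ... */

	local_irq_restore(flags);
	return sink;	/* keep the loads from being optimized away */
}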
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index bd5bceef0605f1c22d158bdca6b875a4b58311c0..b65d6bfab8e6b7e483d75b7129e72eb44c9a5668 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -9,7 +9,8 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
 
 ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -52,7 +53,8 @@ aes-arm-ce-y	:= aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y	:= ghash-ce-core.o ghash-ce-glue.o
 crct10dif-arm-ce-y	:= crct10dif-ce-core.o crct10dif-ce-glue.o
 crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
-chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
+chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
+nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
 
 ifdef REGENERATE_ARM_CRYPTO
 quiet_cmd_perl = PERL    $@
diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c
index d0a9cec73707b7be885b509e5befe393e90b902f..5affb8482379282f23982cd39117a32869c0c968 100644
--- a/arch/arm/crypto/aes-ce-glue.c
+++ b/arch/arm/crypto/aes-ce-glue.c
@@ -10,7 +10,6 @@
 
 #include <asm/hwcap.h>
 #include <asm/neon.h>
-#include <asm/hwcap.h>
 #include <crypto/aes.h>
 #include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S
index 184d6c2d15d5e7b4a01803f6ba2b99ee7b8fbfc2..f2d67c095e596edc67cacae01e2d9329125cb629 100644
--- a/arch/arm/crypto/aes-cipher-core.S
+++ b/arch/arm/crypto/aes-cipher-core.S
@@ -10,6 +10,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 #include <asm/cache.h>
 
 	.text
@@ -41,7 +42,7 @@
 	.endif
 	.endm
 
-	.macro		__hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op
+	.macro		__hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op, oldcpsr
 	__select	\out0, \in0, 0
 	__select	t0, \in1, 1
 	__load		\out0, \out0, 0, \sz, \op
@@ -73,6 +74,14 @@
 	__load		t0, t0, 3, \sz, \op
 	__load		\t4, \t4, 3, \sz, \op
 
+	.ifnb		\oldcpsr
+	/*
+	 * This is the final round and we're done with all data-dependent table
+	 * lookups, so we can safely re-enable interrupts.
+	 */
+	restore_irqs	\oldcpsr
+	.endif
+
 	eor		\out1, \out1, t1, ror #24
 	eor		\out0, \out0, t2, ror #16
 	ldm		rk!, {t1, t2}
@@ -83,14 +92,14 @@
 	eor		\out1, \out1, t2
 	.endm
 
-	.macro		fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+	.macro		fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr
 	__hround	\out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op
-	__hround	\out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op
+	__hround	\out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op, \oldcpsr
 	.endm
 
-	.macro		iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+	.macro		iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr
 	__hround	\out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op
-	__hround	\out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op
+	__hround	\out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op, \oldcpsr
 	.endm
 
 	.macro		__rev, out, in
@@ -118,13 +127,14 @@
 	.macro		do_crypt, round, ttab, ltab, bsz
 	push		{r3-r11, lr}
 
+	// Load keys first, to reduce latency in case they're not cached yet.
+	ldm		rk!, {r8-r11}
+
 	ldr		r4, [in]
 	ldr		r5, [in, #4]
 	ldr		r6, [in, #8]
 	ldr		r7, [in, #12]
 
-	ldm		rk!, {r8-r11}
-
 #ifdef CONFIG_CPU_BIG_ENDIAN
 	__rev		r4, r4
 	__rev		r5, r5
@@ -138,6 +148,25 @@
 	eor		r7, r7, r11
 
 	__adrl		ttab, \ttab
+	/*
+	 * Disable interrupts and prefetch the 1024-byte 'ft' or 'it' table into
+	 * L1 cache, assuming cacheline size >= 32.  This is a hardening measure
+	 * intended to make cache-timing attacks more difficult.  They may not
+	 * be fully prevented, however; see the paper
+	 * https://cr.yp.to/antiforgery/cachetiming-20050414.pdf
+	 * ("Cache-timing attacks on AES") for a discussion of the many
+	 * difficulties involved in writing truly constant-time AES software.
+	 */
+	 save_and_disable_irqs	t0
+	.set		i, 0
+	.rept		1024 / 128
+	ldr		r8, [ttab, #i + 0]
+	ldr		r9, [ttab, #i + 32]
+	ldr		r10, [ttab, #i + 64]
+	ldr		r11, [ttab, #i + 96]
+	.set		i, i + 128
+	.endr
+	push		{t0}		// oldcpsr
 
 	tst		rounds, #2
 	bne		1f
@@ -151,8 +180,21 @@
 	\round		r4, r5, r6, r7, r8, r9, r10, r11
 	b		0b
 
-2:	__adrl		ttab, \ltab
-	\round		r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b
+2:	.ifb		\ltab
+	add		ttab, ttab, #1
+	.else
+	__adrl		ttab, \ltab
+	// Prefetch inverse S-box for final round; see explanation above
+	.set		i, 0
+	.rept		256 / 64
+	ldr		t0, [ttab, #i + 0]
+	ldr		t1, [ttab, #i + 32]
+	.set		i, i + 64
+	.endr
+	.endif
+
+	pop		{rounds}	// oldcpsr
+	\round		r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b, rounds
 
 #ifdef CONFIG_CPU_BIG_ENDIAN
 	__rev		r4, r4
@@ -175,7 +217,7 @@
 	.endm
 
 ENTRY(__aes_arm_encrypt)
-	do_crypt	fround, crypto_ft_tab, crypto_ft_tab + 1, 2
+	do_crypt	fround, crypto_ft_tab,, 2
 ENDPROC(__aes_arm_encrypt)
 
 	.align		5
diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha-neon-core.S
similarity index 90%
rename from arch/arm/crypto/chacha20-neon-core.S
rename to arch/arm/crypto/chacha-neon-core.S
index 50e7b98968189bc302f95d7ced8d58c834969a9d..eb22926d49127e894a060afc003871d743dfe36d 100644
--- a/arch/arm/crypto/chacha20-neon-core.S
+++ b/arch/arm/crypto/chacha-neon-core.S
@@ -1,5 +1,5 @@
 /*
- * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
+ * ChaCha/XChaCha NEON helper functions
  *
  * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
  *
@@ -27,9 +27,9 @@
   * (d)  vtbl.8 + vtbl.8		(multiple of 8 bits rotations only,
   *					 needs index vector)
   *
-  * ChaCha20 has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit
-  * rotations, the only choices are (a) and (b).  We use (a) since it takes
-  * two-thirds the cycles of (b) on both Cortex-A7 and Cortex-A53.
+  * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
+  * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
+  * cycles of (b) on both Cortex-A7 and Cortex-A53.
   *
   * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
   * and doesn't need a temporary register.
@@ -52,30 +52,20 @@
 	.fpu		neon
 	.align		5
 
-ENTRY(chacha20_block_xor_neon)
-	// r0: Input state matrix, s
-	// r1: 1 data block output, o
-	// r2: 1 data block input, i
-
-	//
-	// This function encrypts one ChaCha20 block by loading the state matrix
-	// in four NEON registers. It performs matrix operation on four words in
-	// parallel, but requireds shuffling to rearrange the words after each
-	// round.
-	//
-
-	// x0..3 = s0..3
-	add		ip, r0, #0x20
-	vld1.32		{q0-q1}, [r0]
-	vld1.32		{q2-q3}, [ip]
-
-	vmov		q8, q0
-	vmov		q9, q1
-	vmov		q10, q2
-	vmov		q11, q3
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers q0-q3.  It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * The round count is given in r3.
+ *
+ * Clobbers: r3, ip, q4-q5
+ */
+chacha_permute:
 
 	adr		ip, .Lrol8_table
-	mov		r3, #10
 	vld1.8		{d10}, [ip, :64]
 
 .Ldoubleround:
@@ -139,9 +129,31 @@ ENTRY(chacha20_block_xor_neon)
 	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
 	vext.8		q3, q3, q3, #4
 
-	subs		r3, r3, #1
+	subs		r3, r3, #2
 	bne		.Ldoubleround
 
+	bx		lr
+ENDPROC(chacha_permute)
+
+ENTRY(chacha_block_xor_neon)
+	// r0: Input state matrix, s
+	// r1: 1 data block output, o
+	// r2: 1 data block input, i
+	// r3: nrounds
+	push		{lr}
+
+	// x0..3 = s0..3
+	add		ip, r0, #0x20
+	vld1.32		{q0-q1}, [r0]
+	vld1.32		{q2-q3}, [ip]
+
+	vmov		q8, q0
+	vmov		q9, q1
+	vmov		q10, q2
+	vmov		q11, q3
+
+	bl		chacha_permute
+
 	add		ip, r2, #0x20
 	vld1.8		{q4-q5}, [r2]
 	vld1.8		{q6-q7}, [ip]
@@ -166,15 +178,33 @@ ENTRY(chacha20_block_xor_neon)
 	vst1.8		{q0-q1}, [r1]
 	vst1.8		{q2-q3}, [ip]
 
-	bx		lr
-ENDPROC(chacha20_block_xor_neon)
+	pop		{pc}
+ENDPROC(chacha_block_xor_neon)
+
+ENTRY(hchacha_block_neon)
+	// r0: Input state matrix, s
+	// r1: output (8 32-bit words)
+	// r2: nrounds
+	push		{lr}
+
+	vld1.32		{q0-q1}, [r0]!
+	vld1.32		{q2-q3}, [r0]
+
+	mov		r3, r2
+	bl		chacha_permute
+
+	vst1.32		{q0}, [r1]!
+	vst1.32		{q3}, [r1]
+
+	pop		{pc}
+ENDPROC(hchacha_block_neon)
 
 	.align		4
 .Lctrinc:	.word	0, 1, 2, 3
 .Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
 
 	.align		5
-ENTRY(chacha20_4block_xor_neon)
+ENTRY(chacha_4block_xor_neon)
 	push		{r4-r5}
 	mov		r4, sp			// preserve the stack pointer
 	sub		ip, sp, #0x20		// allocate a 32 byte buffer
@@ -184,9 +214,10 @@ ENTRY(chacha20_4block_xor_neon)
 	// r0: Input state matrix, s
 	// r1: 4 data blocks output, o
 	// r2: 4 data blocks input, i
+	// r3: nrounds
 
 	//
-	// This function encrypts four consecutive ChaCha20 blocks by loading
+	// This function encrypts four consecutive ChaCha blocks by loading
 	// the state matrix in NEON registers four times. The algorithm performs
 	// each operation on the corresponding word of each state matrix, hence
 	// requires no word shuffling. The words are re-interleaved before the
@@ -219,7 +250,6 @@ ENTRY(chacha20_4block_xor_neon)
 	vdup.32		q0, d0[0]
 
 	adr		ip, .Lrol8_table
-	mov		r3, #10
 	b		1f
 
 .Ldoubleround4:
@@ -417,7 +447,7 @@ ENTRY(chacha20_4block_xor_neon)
 	vsri.u32	q5, q8, #25
 	vsri.u32	q6, q9, #25
 
-	subs		r3, r3, #1
+	subs		r3, r3, #2
 	bne		.Ldoubleround4
 
 	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
@@ -527,4 +557,4 @@ ENTRY(chacha20_4block_xor_neon)
 
 	pop		{r4-r5}
 	bx		lr
-ENDPROC(chacha20_4block_xor_neon)
+ENDPROC(chacha_4block_xor_neon)
diff --git a/arch/arm/crypto/chacha-neon-glue.c b/arch/arm/crypto/chacha-neon-glue.c
new file mode 100644
index 0000000000000000000000000000000000000000..9d6fda81986da6b4e3470ebcb2ded59f7a6966e8
--- /dev/null
+++ b/arch/arm/crypto/chacha-neon-glue.c
@@ -0,0 +1,201 @@
+/*
+ * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+				      int nrounds);
+asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+				       int nrounds);
+asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+
+static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
+			  unsigned int bytes, int nrounds)
+{
+	u8 buf[CHACHA_BLOCK_SIZE];
+
+	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+		chacha_4block_xor_neon(state, dst, src, nrounds);
+		bytes -= CHACHA_BLOCK_SIZE * 4;
+		src += CHACHA_BLOCK_SIZE * 4;
+		dst += CHACHA_BLOCK_SIZE * 4;
+		state[12] += 4;
+	}
+	while (bytes >= CHACHA_BLOCK_SIZE) {
+		chacha_block_xor_neon(state, dst, src, nrounds);
+		bytes -= CHACHA_BLOCK_SIZE;
+		src += CHACHA_BLOCK_SIZE;
+		dst += CHACHA_BLOCK_SIZE;
+		state[12]++;
+	}
+	if (bytes) {
+		memcpy(buf, src, bytes);
+		chacha_block_xor_neon(state, buf, buf, nrounds);
+		memcpy(dst, buf, bytes);
+	}
+}
+
+static int chacha_neon_stream_xor(struct skcipher_request *req,
+				  struct chacha_ctx *ctx, u8 *iv)
+{
+	struct skcipher_walk walk;
+	u32 state[16];
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, false);
+
+	crypto_chacha_init(state, ctx, iv);
+
+	while (walk.nbytes > 0) {
+		unsigned int nbytes = walk.nbytes;
+
+		if (nbytes < walk.total)
+			nbytes = round_down(nbytes, walk.stride);
+
+		kernel_neon_begin();
+		chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
+			      nbytes, ctx->nrounds);
+		kernel_neon_end();
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+	}
+
+	return err;
+}
+
+static int chacha_neon(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
+		return crypto_chacha_crypt(req);
+
+	return chacha_neon_stream_xor(req, ctx, req->iv);
+}
+
+static int xchacha_neon(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct chacha_ctx subctx;
+	u32 state[16];
+	u8 real_iv[16];
+
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
+		return crypto_xchacha_crypt(req);
+
+	crypto_chacha_init(state, ctx, req->iv);
+
+	kernel_neon_begin();
+	hchacha_block_neon(state, subctx.key, ctx->nrounds);
+	kernel_neon_end();
+	subctx.nrounds = ctx->nrounds;
+
+	memcpy(&real_iv[0], req->iv + 24, 8);
+	memcpy(&real_iv[8], req->iv + 16, 8);
+	return chacha_neon_stream_xor(req, &subctx, real_iv);
+}
+
+static struct skcipher_alg algs[] = {
+	{
+		.base.cra_name		= "chacha20",
+		.base.cra_driver_name	= "chacha20-neon",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= CHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.walksize		= 4 * CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= chacha_neon,
+		.decrypt		= chacha_neon,
+	}, {
+		.base.cra_name		= "xchacha20",
+		.base.cra_driver_name	= "xchacha20-neon",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.walksize		= 4 * CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= xchacha_neon,
+		.decrypt		= xchacha_neon,
+	}, {
+		.base.cra_name		= "xchacha12",
+		.base.cra_driver_name	= "xchacha12-neon",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.walksize		= 4 * CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha12_setkey,
+		.encrypt		= xchacha_neon,
+		.decrypt		= xchacha_neon,
+	}
+};
+
+static int __init chacha_simd_mod_init(void)
+{
+	if (!(elf_hwcap & HWCAP_NEON))
+		return -ENODEV;
+
+	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit chacha_simd_mod_fini(void)
+{
+	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
+
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-neon");
diff --git a/arch/arm/crypto/chacha20-neon-glue.c b/arch/arm/crypto/chacha20-neon-glue.c
deleted file mode 100644
index 59a7be08e80ceb5ba1b7c5897a5c2ac407f3f205..0000000000000000000000000000000000000000
--- a/arch/arm/crypto/chacha20-neon-glue.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
- *
- * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
-asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
-
-static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
-			    unsigned int bytes)
-{
-	u8 buf[CHACHA20_BLOCK_SIZE];
-
-	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
-		chacha20_4block_xor_neon(state, dst, src);
-		bytes -= CHACHA20_BLOCK_SIZE * 4;
-		src += CHACHA20_BLOCK_SIZE * 4;
-		dst += CHACHA20_BLOCK_SIZE * 4;
-		state[12] += 4;
-	}
-	while (bytes >= CHACHA20_BLOCK_SIZE) {
-		chacha20_block_xor_neon(state, dst, src);
-		bytes -= CHACHA20_BLOCK_SIZE;
-		src += CHACHA20_BLOCK_SIZE;
-		dst += CHACHA20_BLOCK_SIZE;
-		state[12]++;
-	}
-	if (bytes) {
-		memcpy(buf, src, bytes);
-		chacha20_block_xor_neon(state, buf, buf);
-		memcpy(dst, buf, bytes);
-	}
-}
-
-static int chacha20_neon(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	u32 state[16];
-	int err;
-
-	if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd())
-		return crypto_chacha20_crypt(req);
-
-	err = skcipher_walk_virt(&walk, req, true);
-
-	crypto_chacha20_init(state, ctx, walk.iv);
-
-	kernel_neon_begin();
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
-
-		if (nbytes < walk.total)
-			nbytes = round_down(nbytes, walk.stride);
-
-		chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
-				nbytes);
-		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
-	}
-	kernel_neon_end();
-
-	return err;
-}
-
-static struct skcipher_alg alg = {
-	.base.cra_name		= "chacha20",
-	.base.cra_driver_name	= "chacha20-neon",
-	.base.cra_priority	= 300,
-	.base.cra_blocksize	= 1,
-	.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
-	.base.cra_module	= THIS_MODULE,
-
-	.min_keysize		= CHACHA20_KEY_SIZE,
-	.max_keysize		= CHACHA20_KEY_SIZE,
-	.ivsize			= CHACHA20_IV_SIZE,
-	.chunksize		= CHACHA20_BLOCK_SIZE,
-	.walksize		= 4 * CHACHA20_BLOCK_SIZE,
-	.setkey			= crypto_chacha20_setkey,
-	.encrypt		= chacha20_neon,
-	.decrypt		= chacha20_neon,
-};
-
-static int __init chacha20_simd_mod_init(void)
-{
-	if (!(elf_hwcap & HWCAP_NEON))
-		return -ENODEV;
-
-	return crypto_register_skcipher(&alg);
-}
-
-static void __exit chacha20_simd_mod_fini(void)
-{
-	crypto_unregister_skcipher(&alg);
-}
-
-module_init(chacha20_simd_mod_init);
-module_exit(chacha20_simd_mod_fini);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("chacha20");
diff --git a/arch/arm/crypto/nh-neon-core.S b/arch/arm/crypto/nh-neon-core.S
new file mode 100644
index 0000000000000000000000000000000000000000..434d80ab531c2a600fbcffc89c21e6a8ad5ef284
--- /dev/null
+++ b/arch/arm/crypto/nh-neon-core.S
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, NEON accelerated version
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+	.text
+	.fpu		neon
+
+	KEY		.req	r0
+	MESSAGE		.req	r1
+	MESSAGE_LEN	.req	r2
+	HASH		.req	r3
+
+	PASS0_SUMS	.req	q0
+	PASS0_SUM_A	.req	d0
+	PASS0_SUM_B	.req	d1
+	PASS1_SUMS	.req	q1
+	PASS1_SUM_A	.req	d2
+	PASS1_SUM_B	.req	d3
+	PASS2_SUMS	.req	q2
+	PASS2_SUM_A	.req	d4
+	PASS2_SUM_B	.req	d5
+	PASS3_SUMS	.req	q3
+	PASS3_SUM_A	.req	d6
+	PASS3_SUM_B	.req	d7
+	K0		.req	q4
+	K1		.req	q5
+	K2		.req	q6
+	K3		.req	q7
+	T0		.req	q8
+	T0_L		.req	d16
+	T0_H		.req	d17
+	T1		.req	q9
+	T1_L		.req	d18
+	T1_H		.req	d19
+	T2		.req	q10
+	T2_L		.req	d20
+	T2_H		.req	d21
+	T3		.req	q11
+	T3_L		.req	d22
+	T3_H		.req	d23
+
+.macro _nh_stride	k0, k1, k2, k3
+
+	// Load next message stride
+	vld1.8		{T3}, [MESSAGE]!
+
+	// Load next key stride
+	vld1.32		{\k3}, [KEY]!
+
+	// Add message words to key words
+	vadd.u32	T0, T3, \k0
+	vadd.u32	T1, T3, \k1
+	vadd.u32	T2, T3, \k2
+	vadd.u32	T3, T3, \k3
+
+	// Multiply 32x32 => 64 and accumulate
+	vmlal.u32	PASS0_SUMS, T0_L, T0_H
+	vmlal.u32	PASS1_SUMS, T1_L, T1_H
+	vmlal.u32	PASS2_SUMS, T2_L, T2_H
+	vmlal.u32	PASS3_SUMS, T3_L, T3_H
+.endm
+
+/*
+ * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
+ *		u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+ENTRY(nh_neon)
+
+	vld1.32		{K0,K1}, [KEY]!
+	  vmov.u64	PASS0_SUMS, #0
+	  vmov.u64	PASS1_SUMS, #0
+	vld1.32		{K2}, [KEY]!
+	  vmov.u64	PASS2_SUMS, #0
+	  vmov.u64	PASS3_SUMS, #0
+
+	subs		MESSAGE_LEN, MESSAGE_LEN, #64
+	blt		.Lloop4_done
+.Lloop4:
+	_nh_stride	K0, K1, K2, K3
+	_nh_stride	K1, K2, K3, K0
+	_nh_stride	K2, K3, K0, K1
+	_nh_stride	K3, K0, K1, K2
+	subs		MESSAGE_LEN, MESSAGE_LEN, #64
+	bge		.Lloop4
+
+.Lloop4_done:
+	ands		MESSAGE_LEN, MESSAGE_LEN, #63
+	beq		.Ldone
+	_nh_stride	K0, K1, K2, K3
+
+	subs		MESSAGE_LEN, MESSAGE_LEN, #16
+	beq		.Ldone
+	_nh_stride	K1, K2, K3, K0
+
+	subs		MESSAGE_LEN, MESSAGE_LEN, #16
+	beq		.Ldone
+	_nh_stride	K2, K3, K0, K1
+
+.Ldone:
+	// Sum the accumulators for each pass, then store the sums to 'hash'
+	vadd.u64	T0_L, PASS0_SUM_A, PASS0_SUM_B
+	vadd.u64	T0_H, PASS1_SUM_A, PASS1_SUM_B
+	vadd.u64	T1_L, PASS2_SUM_A, PASS2_SUM_B
+	vadd.u64	T1_H, PASS3_SUM_A, PASS3_SUM_B
+	vst1.8		{T0-T1}, [HASH]
+	bx		lr
+ENDPROC(nh_neon)
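
(Reader's note, not part of the patch: a plain C model of what nh_neon()
computes, mirroring the generic NH used by NHPoly1305.  Each iteration
consumes one 16-byte message stride; the four passes use keys offset by four
32-bit words each, with 32-bit adds, 32x32->64 multiplies and 64-bit
accumulation.  The function name is illustrative.)

#include <linux/types.h>
#include <asm/byteorder.h>
#include <asm/unaligned.h>

static void nh_model(const u32 *key, const u8 *message, size_t message_len,
		     __le64 hash[4])
{
	u64 sums[4] = { 0, 0, 0, 0 };
	int i;

	/* message_len is guaranteed to be a multiple of 16 */
	while (message_len) {
		u32 m0 = get_unaligned_le32(message + 0);
		u32 m1 = get_unaligned_le32(message + 4);
		u32 m2 = get_unaligned_le32(message + 8);
		u32 m3 = get_unaligned_le32(message + 12);

		for (i = 0; i < 4; i++) {
			sums[i] += (u64)(u32)(m0 + key[4 * i + 0]) *
				   (u32)(m2 + key[4 * i + 2]);
			sums[i] += (u64)(u32)(m1 + key[4 * i + 1]) *
				   (u32)(m3 + key[4 * i + 3]);
		}
		key += 4;	/* the passes' keys overlap, shifted by one stride */
		message += 16;
		message_len -= 16;
	}

	for (i = 0; i < 4; i++)
		hash[i] = cpu_to_le64(sums[i]);
}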
diff --git a/arch/arm/crypto/nhpoly1305-neon-glue.c b/arch/arm/crypto/nhpoly1305-neon-glue.c
new file mode 100644
index 0000000000000000000000000000000000000000..49aae87cb2bcc096ba4d01498e4458e7412db54b
--- /dev/null
+++ b/arch/arm/crypto/nhpoly1305-neon-glue.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ * (NEON accelerated version)
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/hash.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/module.h>
+
+asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len,
+			u8 hash[NH_HASH_BYTES]);
+
+/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
+static void _nh_neon(const u32 *key, const u8 *message, size_t message_len,
+		     __le64 hash[NH_NUM_PASSES])
+{
+	nh_neon(key, message, message_len, (u8 *)hash);
+}
+
+static int nhpoly1305_neon_update(struct shash_desc *desc,
+				  const u8 *src, unsigned int srclen)
+{
+	if (srclen < 64 || !may_use_simd())
+		return crypto_nhpoly1305_update(desc, src, srclen);
+
+	do {
+		unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+
+		kernel_neon_begin();
+		crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon);
+		kernel_neon_end();
+		src += n;
+		srclen -= n;
+	} while (srclen);
+	return 0;
+}
+
+static struct shash_alg nhpoly1305_alg = {
+	.base.cra_name		= "nhpoly1305",
+	.base.cra_driver_name	= "nhpoly1305-neon",
+	.base.cra_priority	= 200,
+	.base.cra_ctxsize	= sizeof(struct nhpoly1305_key),
+	.base.cra_module	= THIS_MODULE,
+	.digestsize		= POLY1305_DIGEST_SIZE,
+	.init			= crypto_nhpoly1305_init,
+	.update			= nhpoly1305_neon_update,
+	.final			= crypto_nhpoly1305_final,
+	.setkey			= crypto_nhpoly1305_setkey,
+	.descsize		= sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+	if (!(elf_hwcap & HWCAP_NEON))
+		return -ENODEV;
+
+	return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+	crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (NEON-accelerated)");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-neon");
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index a5606823ed4da3ccbe72ee141859ef506a9e9526..d9a523ecdd836d54e5742135b6ff9e2fb4c7d520 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -101,11 +101,16 @@ config CRYPTO_AES_ARM64_NEON_BLK
 	select CRYPTO_SIMD
 
 config CRYPTO_CHACHA20_NEON
-	tristate "NEON accelerated ChaCha20 symmetric cipher"
+	tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions"
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_CHACHA20
 
+config CRYPTO_NHPOLY1305_NEON
+	tristate "NHPoly1305 hash function using NEON instructions (for Adiantum)"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_NHPOLY1305
+
 config CRYPTO_AES_ARM64_BS
 	tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
 	depends on KERNEL_MODE_NEON
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index f476fede09ba489463b9d1fb5e19b7345d100e0e..a4ffd9fe32650c0310cc12ac735fb087a8eede6b 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -50,8 +50,11 @@ sha256-arm64-y := sha256-glue.o sha256-core.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
 sha512-arm64-y := sha512-glue.o sha512-core.o
 
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
-chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
+
+obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
+nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
 
 obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
 aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
diff --git a/arch/arm64/crypto/chacha20-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S
similarity index 52%
rename from arch/arm64/crypto/chacha20-neon-core.S
rename to arch/arm64/crypto/chacha-neon-core.S
index 13c85e272c2a1bbeb19502e10ecf9e481974c303..021bb9e9784b271cb670f8c973312e9e2ec14ba4 100644
--- a/arch/arm64/crypto/chacha20-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -1,13 +1,13 @@
 /*
- * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
+ * ChaCha/XChaCha NEON helper functions
  *
- * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  *
- * Based on:
+ * Originally based on:
  * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
  *
  * Copyright (C) 2015 Martin Willi
@@ -19,29 +19,27 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/cache.h>
 
 	.text
 	.align		6
 
-ENTRY(chacha20_block_xor_neon)
-	// x0: Input state matrix, s
-	// x1: 1 data block output, o
-	// x2: 1 data block input, i
-
-	//
-	// This function encrypts one ChaCha20 block by loading the state matrix
-	// in four NEON registers. It performs matrix operation on four words in
-	// parallel, but requires shuffling to rearrange the words after each
-	// round.
-	//
-
-	// x0..3 = s0..3
-	adr		x3, ROT8
-	ld1		{v0.4s-v3.4s}, [x0]
-	ld1		{v8.4s-v11.4s}, [x0]
-	ld1		{v12.4s}, [x3]
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers v0-v3.  It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * The round count is given in w3.
+ *
+ * Clobbers: w3, x10, v4, v12
+ */
+chacha_permute:
 
-	mov		x3, #10
+	adr_l		x10, ROT8
+	ld1		{v12.4s}, [x10]
 
 .Ldoubleround:
 	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
@@ -102,9 +100,27 @@ ENTRY(chacha20_block_xor_neon)
 	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
 	ext		v3.16b, v3.16b, v3.16b, #4
 
-	subs		x3, x3, #1
+	subs		w3, w3, #2
 	b.ne		.Ldoubleround
 
+	ret
+ENDPROC(chacha_permute)
+
+ENTRY(chacha_block_xor_neon)
+	// x0: Input state matrix, s
+	// x1: 1 data block output, o
+	// x2: 1 data block input, i
+	// w3: nrounds
+
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+
+	// x0..3 = s0..3
+	ld1		{v0.4s-v3.4s}, [x0]
+	ld1		{v8.4s-v11.4s}, [x0]
+
+	bl		chacha_permute
+
 	ld1		{v4.16b-v7.16b}, [x2]
 
 	// o0 = i0 ^ (x0 + s0)
@@ -125,71 +141,156 @@ ENTRY(chacha20_block_xor_neon)
 
 	st1		{v0.16b-v3.16b}, [x1]
 
+	ldp		x29, x30, [sp], #16
 	ret
-ENDPROC(chacha20_block_xor_neon)
+ENDPROC(chacha_block_xor_neon)
+
+ENTRY(hchacha_block_neon)
+	// x0: Input state matrix, s
+	// x1: output (8 32-bit words)
+	// w2: nrounds
+
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+
+	ld1		{v0.4s-v3.4s}, [x0]
+
+	mov		w3, w2
+	bl		chacha_permute
+
+	st1		{v0.16b}, [x1], #16
+	st1		{v3.16b}, [x1]
+
+	ldp		x29, x30, [sp], #16
+	ret
+ENDPROC(hchacha_block_neon)
+
+	a0		.req	w12
+	a1		.req	w13
+	a2		.req	w14
+	a3		.req	w15
+	a4		.req	w16
+	a5		.req	w17
+	a6		.req	w19
+	a7		.req	w20
+	a8		.req	w21
+	a9		.req	w22
+	a10		.req	w23
+	a11		.req	w24
+	a12		.req	w25
+	a13		.req	w26
+	a14		.req	w27
+	a15		.req	w28
 
 	.align		6
-ENTRY(chacha20_4block_xor_neon)
+ENTRY(chacha_4block_xor_neon)
+	frame_push	10
+
 	// x0: Input state matrix, s
 	// x1: 4 data blocks output, o
 	// x2: 4 data blocks input, i
+	// w3: nrounds
+	// x4: byte count
+
+	adr_l		x10, .Lpermute
+	and		x5, x4, #63
+	add		x10, x10, x5
+	add		x11, x10, #64
 
 	//
-	// This function encrypts four consecutive ChaCha20 blocks by loading
+	// This function encrypts four consecutive ChaCha blocks by loading
 	// the state matrix in NEON registers four times. The algorithm performs
 	// each operation on the corresponding word of each state matrix, hence
 	// requires no word shuffling. For final XORing step we transpose the
 	// matrix by interleaving 32- and then 64-bit words, which allows us to
 	// do XOR in NEON registers.
 	//
-	adr		x3, CTRINC		// ... and ROT8
-	ld1		{v30.4s-v31.4s}, [x3]
+	// At the same time, a fifth block is encrypted in parallel using
+	// scalar registers
+	//
+	adr_l		x9, CTRINC		// ... and ROT8
+	ld1		{v30.4s-v31.4s}, [x9]
 
 	// x0..15[0-3] = s0..3[0..3]
-	mov		x4, x0
-	ld4r		{ v0.4s- v3.4s}, [x4], #16
-	ld4r		{ v4.4s- v7.4s}, [x4], #16
-	ld4r		{ v8.4s-v11.4s}, [x4], #16
-	ld4r		{v12.4s-v15.4s}, [x4]
-
-	// x12 += counter values 0-3
+	add		x8, x0, #16
+	ld4r		{ v0.4s- v3.4s}, [x0]
+	ld4r		{ v4.4s- v7.4s}, [x8], #16
+	ld4r		{ v8.4s-v11.4s}, [x8], #16
+	ld4r		{v12.4s-v15.4s}, [x8]
+
+	mov		a0, v0.s[0]
+	mov		a1, v1.s[0]
+	mov		a2, v2.s[0]
+	mov		a3, v3.s[0]
+	mov		a4, v4.s[0]
+	mov		a5, v5.s[0]
+	mov		a6, v6.s[0]
+	mov		a7, v7.s[0]
+	mov		a8, v8.s[0]
+	mov		a9, v9.s[0]
+	mov		a10, v10.s[0]
+	mov		a11, v11.s[0]
+	mov		a12, v12.s[0]
+	mov		a13, v13.s[0]
+	mov		a14, v14.s[0]
+	mov		a15, v15.s[0]
+
+	// x12 += counter values 1-4
 	add		v12.4s, v12.4s, v30.4s
 
-	mov		x3, #10
-
 .Ldoubleround4:
 	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
 	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
 	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
 	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
 	add		v0.4s, v0.4s, v4.4s
+	  add		a0, a0, a4
 	add		v1.4s, v1.4s, v5.4s
+	  add		a1, a1, a5
 	add		v2.4s, v2.4s, v6.4s
+	  add		a2, a2, a6
 	add		v3.4s, v3.4s, v7.4s
+	  add		a3, a3, a7
 
 	eor		v12.16b, v12.16b, v0.16b
+	  eor		a12, a12, a0
 	eor		v13.16b, v13.16b, v1.16b
+	  eor		a13, a13, a1
 	eor		v14.16b, v14.16b, v2.16b
+	  eor		a14, a14, a2
 	eor		v15.16b, v15.16b, v3.16b
+	  eor		a15, a15, a3
 
 	rev32		v12.8h, v12.8h
+	  ror		a12, a12, #16
 	rev32		v13.8h, v13.8h
+	  ror		a13, a13, #16
 	rev32		v14.8h, v14.8h
+	  ror		a14, a14, #16
 	rev32		v15.8h, v15.8h
+	  ror		a15, a15, #16
 
 	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
 	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
 	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
 	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
 	add		v8.4s, v8.4s, v12.4s
+	  add		a8, a8, a12
 	add		v9.4s, v9.4s, v13.4s
+	  add		a9, a9, a13
 	add		v10.4s, v10.4s, v14.4s
+	  add		a10, a10, a14
 	add		v11.4s, v11.4s, v15.4s
+	  add		a11, a11, a15
 
 	eor		v16.16b, v4.16b, v8.16b
+	  eor		a4, a4, a8
 	eor		v17.16b, v5.16b, v9.16b
+	  eor		a5, a5, a9
 	eor		v18.16b, v6.16b, v10.16b
+	  eor		a6, a6, a10
 	eor		v19.16b, v7.16b, v11.16b
+	  eor		a7, a7, a11
 
 	shl		v4.4s, v16.4s, #12
 	shl		v5.4s, v17.4s, #12
@@ -197,42 +298,66 @@ ENTRY(chacha20_4block_xor_neon)
 	shl		v7.4s, v19.4s, #12
 
 	sri		v4.4s, v16.4s, #20
+	  ror		a4, a4, #20
 	sri		v5.4s, v17.4s, #20
+	  ror		a5, a5, #20
 	sri		v6.4s, v18.4s, #20
+	  ror		a6, a6, #20
 	sri		v7.4s, v19.4s, #20
+	  ror		a7, a7, #20
 
 	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
 	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
 	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
 	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
 	add		v0.4s, v0.4s, v4.4s
+	  add		a0, a0, a4
 	add		v1.4s, v1.4s, v5.4s
+	  add		a1, a1, a5
 	add		v2.4s, v2.4s, v6.4s
+	  add		a2, a2, a6
 	add		v3.4s, v3.4s, v7.4s
+	  add		a3, a3, a7
 
 	eor		v12.16b, v12.16b, v0.16b
+	  eor		a12, a12, a0
 	eor		v13.16b, v13.16b, v1.16b
+	  eor		a13, a13, a1
 	eor		v14.16b, v14.16b, v2.16b
+	  eor		a14, a14, a2
 	eor		v15.16b, v15.16b, v3.16b
+	  eor		a15, a15, a3
 
 	tbl		v12.16b, {v12.16b}, v31.16b
+	  ror		a12, a12, #24
 	tbl		v13.16b, {v13.16b}, v31.16b
+	  ror		a13, a13, #24
 	tbl		v14.16b, {v14.16b}, v31.16b
+	  ror		a14, a14, #24
 	tbl		v15.16b, {v15.16b}, v31.16b
+	  ror		a15, a15, #24
 
 	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
 	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
 	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
 	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
 	add		v8.4s, v8.4s, v12.4s
+	  add		a8, a8, a12
 	add		v9.4s, v9.4s, v13.4s
+	  add		a9, a9, a13
 	add		v10.4s, v10.4s, v14.4s
+	  add		a10, a10, a14
 	add		v11.4s, v11.4s, v15.4s
+	  add		a11, a11, a15
 
 	eor		v16.16b, v4.16b, v8.16b
+	  eor		a4, a4, a8
 	eor		v17.16b, v5.16b, v9.16b
+	  eor		a5, a5, a9
 	eor		v18.16b, v6.16b, v10.16b
+	  eor		a6, a6, a10
 	eor		v19.16b, v7.16b, v11.16b
+	  eor		a7, a7, a11
 
 	shl		v4.4s, v16.4s, #7
 	shl		v5.4s, v17.4s, #7
@@ -240,42 +365,66 @@ ENTRY(chacha20_4block_xor_neon)
 	shl		v7.4s, v19.4s, #7
 
 	sri		v4.4s, v16.4s, #25
+	  ror		a4, a4, #25
 	sri		v5.4s, v17.4s, #25
+	  ror		a5, a5, #25
 	sri		v6.4s, v18.4s, #25
+	 ror		a6, a6, #25
 	sri		v7.4s, v19.4s, #25
+	  ror		a7, a7, #25
 
 	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
 	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
 	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
 	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
 	add		v0.4s, v0.4s, v5.4s
+	  add		a0, a0, a5
 	add		v1.4s, v1.4s, v6.4s
+	  add		a1, a1, a6
 	add		v2.4s, v2.4s, v7.4s
+	  add		a2, a2, a7
 	add		v3.4s, v3.4s, v4.4s
+	  add		a3, a3, a4
 
 	eor		v15.16b, v15.16b, v0.16b
+	  eor		a15, a15, a0
 	eor		v12.16b, v12.16b, v1.16b
+	  eor		a12, a12, a1
 	eor		v13.16b, v13.16b, v2.16b
+	  eor		a13, a13, a2
 	eor		v14.16b, v14.16b, v3.16b
+	  eor		a14, a14, a3
 
 	rev32		v15.8h, v15.8h
+	  ror		a15, a15, #16
 	rev32		v12.8h, v12.8h
+	  ror		a12, a12, #16
 	rev32		v13.8h, v13.8h
+	  ror		a13, a13, #16
 	rev32		v14.8h, v14.8h
+	  ror		a14, a14, #16
 
 	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
 	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
 	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
 	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
 	add		v10.4s, v10.4s, v15.4s
+	  add		a10, a10, a15
 	add		v11.4s, v11.4s, v12.4s
+	  add		a11, a11, a12
 	add		v8.4s, v8.4s, v13.4s
+	  add		a8, a8, a13
 	add		v9.4s, v9.4s, v14.4s
+	  add		a9, a9, a14
 
 	eor		v16.16b, v5.16b, v10.16b
+	  eor		a5, a5, a10
 	eor		v17.16b, v6.16b, v11.16b
+	  eor		a6, a6, a11
 	eor		v18.16b, v7.16b, v8.16b
+	  eor		a7, a7, a8
 	eor		v19.16b, v4.16b, v9.16b
+	  eor		a4, a4, a9
 
 	shl		v5.4s, v16.4s, #12
 	shl		v6.4s, v17.4s, #12
@@ -283,42 +432,66 @@ ENTRY(chacha20_4block_xor_neon)
 	shl		v4.4s, v19.4s, #12
 
 	sri		v5.4s, v16.4s, #20
+	  ror		a5, a5, #20
 	sri		v6.4s, v17.4s, #20
+	  ror		a6, a6, #20
 	sri		v7.4s, v18.4s, #20
+	  ror		a7, a7, #20
 	sri		v4.4s, v19.4s, #20
+	  ror		a4, a4, #20
 
 	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
 	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
 	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
 	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
 	add		v0.4s, v0.4s, v5.4s
+	  add		a0, a0, a5
 	add		v1.4s, v1.4s, v6.4s
+	  add		a1, a1, a6
 	add		v2.4s, v2.4s, v7.4s
+	  add		a2, a2, a7
 	add		v3.4s, v3.4s, v4.4s
+	  add		a3, a3, a4
 
 	eor		v15.16b, v15.16b, v0.16b
+	  eor		a15, a15, a0
 	eor		v12.16b, v12.16b, v1.16b
+	  eor		a12, a12, a1
 	eor		v13.16b, v13.16b, v2.16b
+	  eor		a13, a13, a2
 	eor		v14.16b, v14.16b, v3.16b
+	  eor		a14, a14, a3
 
 	tbl		v15.16b, {v15.16b}, v31.16b
+	  ror		a15, a15, #24
 	tbl		v12.16b, {v12.16b}, v31.16b
+	  ror		a12, a12, #24
 	tbl		v13.16b, {v13.16b}, v31.16b
+	  ror		a13, a13, #24
 	tbl		v14.16b, {v14.16b}, v31.16b
+	  ror		a14, a14, #24
 
 	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
 	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
 	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
 	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
 	add		v10.4s, v10.4s, v15.4s
+	  add		a10, a10, a15
 	add		v11.4s, v11.4s, v12.4s
+	  add		a11, a11, a12
 	add		v8.4s, v8.4s, v13.4s
+	  add		a8, a8, a13
 	add		v9.4s, v9.4s, v14.4s
+	  add		a9, a9, a14
 
 	eor		v16.16b, v5.16b, v10.16b
+	  eor		a5, a5, a10
 	eor		v17.16b, v6.16b, v11.16b
+	  eor		a6, a6, a11
 	eor		v18.16b, v7.16b, v8.16b
+	  eor		a7, a7, a8
 	eor		v19.16b, v4.16b, v9.16b
+	  eor		a4, a4, a9
 
 	shl		v5.4s, v16.4s, #7
 	shl		v6.4s, v17.4s, #7
@@ -326,11 +499,15 @@ ENTRY(chacha20_4block_xor_neon)
 	shl		v4.4s, v19.4s, #7
 
 	sri		v5.4s, v16.4s, #25
+	  ror		a5, a5, #25
 	sri		v6.4s, v17.4s, #25
+	  ror		a6, a6, #25
 	sri		v7.4s, v18.4s, #25
+	  ror		a7, a7, #25
 	sri		v4.4s, v19.4s, #25
+	  ror		a4, a4, #25
 
-	subs		x3, x3, #1
+	subs		w3, w3, #2
 	b.ne		.Ldoubleround4
 
 	ld4r		{v16.4s-v19.4s}, [x0], #16
@@ -344,9 +521,17 @@ ENTRY(chacha20_4block_xor_neon)
 	// x2[0-3] += s0[2]
 	// x3[0-3] += s0[3]
 	add		v0.4s, v0.4s, v16.4s
+	  mov		w6, v16.s[0]
+	  mov		w7, v17.s[0]
 	add		v1.4s, v1.4s, v17.4s
+	  mov		w8, v18.s[0]
+	  mov		w9, v19.s[0]
 	add		v2.4s, v2.4s, v18.4s
+	  add		a0, a0, w6
+	  add		a1, a1, w7
 	add		v3.4s, v3.4s, v19.4s
+	  add		a2, a2, w8
+	  add		a3, a3, w9
 
 	ld4r		{v24.4s-v27.4s}, [x0], #16
 	ld4r		{v28.4s-v31.4s}, [x0]
@@ -356,95 +541,304 @@ ENTRY(chacha20_4block_xor_neon)
 	// x6[0-3] += s1[2]
 	// x7[0-3] += s1[3]
 	add		v4.4s, v4.4s, v20.4s
+	  mov		w6, v20.s[0]
+	  mov		w7, v21.s[0]
 	add		v5.4s, v5.4s, v21.4s
+	  mov		w8, v22.s[0]
+	  mov		w9, v23.s[0]
 	add		v6.4s, v6.4s, v22.4s
+	  add		a4, a4, w6
+	  add		a5, a5, w7
 	add		v7.4s, v7.4s, v23.4s
+	  add		a6, a6, w8
+	  add		a7, a7, w9
 
 	// x8[0-3] += s2[0]
 	// x9[0-3] += s2[1]
 	// x10[0-3] += s2[2]
 	// x11[0-3] += s2[3]
 	add		v8.4s, v8.4s, v24.4s
+	  mov		w6, v24.s[0]
+	  mov		w7, v25.s[0]
 	add		v9.4s, v9.4s, v25.4s
+	  mov		w8, v26.s[0]
+	  mov		w9, v27.s[0]
 	add		v10.4s, v10.4s, v26.4s
+	  add		a8, a8, w6
+	  add		a9, a9, w7
 	add		v11.4s, v11.4s, v27.4s
+	  add		a10, a10, w8
+	  add		a11, a11, w9
 
 	// x12[0-3] += s3[0]
 	// x13[0-3] += s3[1]
 	// x14[0-3] += s3[2]
 	// x15[0-3] += s3[3]
 	add		v12.4s, v12.4s, v28.4s
+	  mov		w6, v28.s[0]
+	  mov		w7, v29.s[0]
 	add		v13.4s, v13.4s, v29.4s
+	  mov		w8, v30.s[0]
+	  mov		w9, v31.s[0]
 	add		v14.4s, v14.4s, v30.4s
+	  add		a12, a12, w6
+	  add		a13, a13, w7
 	add		v15.4s, v15.4s, v31.4s
+	  add		a14, a14, w8
+	  add		a15, a15, w9
 
 	// interleave 32-bit words in state n, n+1
+	  ldp		w6, w7, [x2], #64
 	zip1		v16.4s, v0.4s, v1.4s
+	  ldp		w8, w9, [x2, #-56]
+	  eor		a0, a0, w6
 	zip2		v17.4s, v0.4s, v1.4s
+	  eor		a1, a1, w7
 	zip1		v18.4s, v2.4s, v3.4s
+	  eor		a2, a2, w8
 	zip2		v19.4s, v2.4s, v3.4s
+	  eor		a3, a3, w9
+	  ldp		w6, w7, [x2, #-48]
 	zip1		v20.4s, v4.4s, v5.4s
+	  ldp		w8, w9, [x2, #-40]
+	  eor		a4, a4, w6
 	zip2		v21.4s, v4.4s, v5.4s
+	  eor		a5, a5, w7
 	zip1		v22.4s, v6.4s, v7.4s
+	  eor		a6, a6, w8
 	zip2		v23.4s, v6.4s, v7.4s
+	  eor		a7, a7, w9
+	  ldp		w6, w7, [x2, #-32]
 	zip1		v24.4s, v8.4s, v9.4s
+	  ldp		w8, w9, [x2, #-24]
+	  eor		a8, a8, w6
 	zip2		v25.4s, v8.4s, v9.4s
+	  eor		a9, a9, w7
 	zip1		v26.4s, v10.4s, v11.4s
+	  eor		a10, a10, w8
 	zip2		v27.4s, v10.4s, v11.4s
+	  eor		a11, a11, w9
+	  ldp		w6, w7, [x2, #-16]
 	zip1		v28.4s, v12.4s, v13.4s
+	  ldp		w8, w9, [x2, #-8]
+	  eor		a12, a12, w6
 	zip2		v29.4s, v12.4s, v13.4s
+	  eor		a13, a13, w7
 	zip1		v30.4s, v14.4s, v15.4s
+	  eor		a14, a14, w8
 	zip2		v31.4s, v14.4s, v15.4s
+	  eor		a15, a15, w9
+
+	mov		x3, #64
+	subs		x5, x4, #128
+	add		x6, x5, x2
+	csel		x3, x3, xzr, ge
+	csel		x2, x2, x6, ge
 
 	// interleave 64-bit words in state n, n+2
 	zip1		v0.2d, v16.2d, v18.2d
 	zip2		v4.2d, v16.2d, v18.2d
+	  stp		a0, a1, [x1], #64
 	zip1		v8.2d, v17.2d, v19.2d
 	zip2		v12.2d, v17.2d, v19.2d
-	ld1		{v16.16b-v19.16b}, [x2], #64
+	  stp		a2, a3, [x1, #-56]
+	ld1		{v16.16b-v19.16b}, [x2], x3
+
+	subs		x6, x4, #192
+	ccmp		x3, xzr, #4, lt
+	add		x7, x6, x2
+	csel		x3, x3, xzr, eq
+	csel		x2, x2, x7, eq
 
 	zip1		v1.2d, v20.2d, v22.2d
 	zip2		v5.2d, v20.2d, v22.2d
+	  stp		a4, a5, [x1, #-48]
 	zip1		v9.2d, v21.2d, v23.2d
 	zip2		v13.2d, v21.2d, v23.2d
-	ld1		{v20.16b-v23.16b}, [x2], #64
+	  stp		a6, a7, [x1, #-40]
+	ld1		{v20.16b-v23.16b}, [x2], x3
+
+	subs		x7, x4, #256
+	ccmp		x3, xzr, #4, lt
+	add		x8, x7, x2
+	csel		x3, x3, xzr, eq
+	csel		x2, x2, x8, eq
 
 	zip1		v2.2d, v24.2d, v26.2d
 	zip2		v6.2d, v24.2d, v26.2d
+	  stp		a8, a9, [x1, #-32]
 	zip1		v10.2d, v25.2d, v27.2d
 	zip2		v14.2d, v25.2d, v27.2d
-	ld1		{v24.16b-v27.16b}, [x2], #64
+	  stp		a10, a11, [x1, #-24]
+	ld1		{v24.16b-v27.16b}, [x2], x3
+
+	subs		x8, x4, #320
+	ccmp		x3, xzr, #4, lt
+	add		x9, x8, x2
+	csel		x2, x2, x9, eq
 
 	zip1		v3.2d, v28.2d, v30.2d
 	zip2		v7.2d, v28.2d, v30.2d
+	  stp		a12, a13, [x1, #-16]
 	zip1		v11.2d, v29.2d, v31.2d
 	zip2		v15.2d, v29.2d, v31.2d
+	  stp		a14, a15, [x1, #-8]
 	ld1		{v28.16b-v31.16b}, [x2]
 
 	// xor with corresponding input, write to output
+	tbnz		x5, #63, 0f
 	eor		v16.16b, v16.16b, v0.16b
 	eor		v17.16b, v17.16b, v1.16b
 	eor		v18.16b, v18.16b, v2.16b
 	eor		v19.16b, v19.16b, v3.16b
+	st1		{v16.16b-v19.16b}, [x1], #64
+	cbz		x5, .Lout
+
+	tbnz		x6, #63, 1f
 	eor		v20.16b, v20.16b, v4.16b
 	eor		v21.16b, v21.16b, v5.16b
-	st1		{v16.16b-v19.16b}, [x1], #64
 	eor		v22.16b, v22.16b, v6.16b
 	eor		v23.16b, v23.16b, v7.16b
+	st1		{v20.16b-v23.16b}, [x1], #64
+	cbz		x6, .Lout
+
+	tbnz		x7, #63, 2f
 	eor		v24.16b, v24.16b, v8.16b
 	eor		v25.16b, v25.16b, v9.16b
-	st1		{v20.16b-v23.16b}, [x1], #64
 	eor		v26.16b, v26.16b, v10.16b
 	eor		v27.16b, v27.16b, v11.16b
-	eor		v28.16b, v28.16b, v12.16b
 	st1		{v24.16b-v27.16b}, [x1], #64
+	cbz		x7, .Lout
+
+	tbnz		x8, #63, 3f
+	eor		v28.16b, v28.16b, v12.16b
 	eor		v29.16b, v29.16b, v13.16b
 	eor		v30.16b, v30.16b, v14.16b
 	eor		v31.16b, v31.16b, v15.16b
 	st1		{v28.16b-v31.16b}, [x1]
 
+.Lout:	frame_pop
 	ret
-ENDPROC(chacha20_4block_xor_neon)
 
-CTRINC:	.word		0, 1, 2, 3
+	// fewer than 128 bytes of in/output
+0:	ld1		{v8.16b}, [x10]
+	ld1		{v9.16b}, [x11]
+	movi		v10.16b, #16
+	sub		x2, x1, #64
+	add		x1, x1, x5
+	ld1		{v16.16b-v19.16b}, [x2]
+	tbl		v4.16b, {v0.16b-v3.16b}, v8.16b
+	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
+	add		v8.16b, v8.16b, v10.16b
+	add		v9.16b, v9.16b, v10.16b
+	tbl		v5.16b, {v0.16b-v3.16b}, v8.16b
+	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
+	add		v8.16b, v8.16b, v10.16b
+	add		v9.16b, v9.16b, v10.16b
+	tbl		v6.16b, {v0.16b-v3.16b}, v8.16b
+	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
+	add		v8.16b, v8.16b, v10.16b
+	add		v9.16b, v9.16b, v10.16b
+	tbl		v7.16b, {v0.16b-v3.16b}, v8.16b
+	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b
+
+	eor		v20.16b, v20.16b, v4.16b
+	eor		v21.16b, v21.16b, v5.16b
+	eor		v22.16b, v22.16b, v6.16b
+	eor		v23.16b, v23.16b, v7.16b
+	st1		{v20.16b-v23.16b}, [x1]
+	b		.Lout
+
+	// fewer than 192 bytes of in/output
+1:	ld1		{v8.16b}, [x10]
+	ld1		{v9.16b}, [x11]
+	movi		v10.16b, #16
+	add		x1, x1, x6
+	tbl		v0.16b, {v4.16b-v7.16b}, v8.16b
+	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
+	add		v8.16b, v8.16b, v10.16b
+	add		v9.16b, v9.16b, v10.16b
+	tbl		v1.16b, {v4.16b-v7.16b}, v8.16b
+	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
+	add		v8.16b, v8.16b, v10.16b
+	add		v9.16b, v9.16b, v10.16b
+	tbl		v2.16b, {v4.16b-v7.16b}, v8.16b
+	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
+	add		v8.16b, v8.16b, v10.16b
+	add		v9.16b, v9.16b, v10.16b
+	tbl		v3.16b, {v4.16b-v7.16b}, v8.16b
+	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b
+
+	eor		v20.16b, v20.16b, v0.16b
+	eor		v21.16b, v21.16b, v1.16b
+	eor		v22.16b, v22.16b, v2.16b
+	eor		v23.16b, v23.16b, v3.16b
+	st1		{v20.16b-v23.16b}, [x1]
+	b		.Lout
+
+	// fewer than 256 bytes of in/output
+2:	ld1		{v4.16b}, [x10]
+	ld1		{v5.16b}, [x11]
+	movi		v6.16b, #16
+	add		x1, x1, x7
+	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
+	tbx		v24.16b, {v20.16b-v23.16b}, v5.16b
+	add		v4.16b, v4.16b, v6.16b
+	add		v5.16b, v5.16b, v6.16b
+	tbl		v1.16b, {v8.16b-v11.16b}, v4.16b
+	tbx		v25.16b, {v20.16b-v23.16b}, v5.16b
+	add		v4.16b, v4.16b, v6.16b
+	add		v5.16b, v5.16b, v6.16b
+	tbl		v2.16b, {v8.16b-v11.16b}, v4.16b
+	tbx		v26.16b, {v20.16b-v23.16b}, v5.16b
+	add		v4.16b, v4.16b, v6.16b
+	add		v5.16b, v5.16b, v6.16b
+	tbl		v3.16b, {v8.16b-v11.16b}, v4.16b
+	tbx		v27.16b, {v20.16b-v23.16b}, v5.16b
+
+	eor		v24.16b, v24.16b, v0.16b
+	eor		v25.16b, v25.16b, v1.16b
+	eor		v26.16b, v26.16b, v2.16b
+	eor		v27.16b, v27.16b, v3.16b
+	st1		{v24.16b-v27.16b}, [x1]
+	b		.Lout
+
+	// fewer than 320 bytes of in/output
+3:	ld1		{v4.16b}, [x10]
+	ld1		{v5.16b}, [x11]
+	movi		v6.16b, #16
+	add		x1, x1, x8
+	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
+	tbx		v28.16b, {v24.16b-v27.16b}, v5.16b
+	add		v4.16b, v4.16b, v6.16b
+	add		v5.16b, v5.16b, v6.16b
+	tbl		v1.16b, {v12.16b-v15.16b}, v4.16b
+	tbx		v29.16b, {v24.16b-v27.16b}, v5.16b
+	add		v4.16b, v4.16b, v6.16b
+	add		v5.16b, v5.16b, v6.16b
+	tbl		v2.16b, {v12.16b-v15.16b}, v4.16b
+	tbx		v30.16b, {v24.16b-v27.16b}, v5.16b
+	add		v4.16b, v4.16b, v6.16b
+	add		v5.16b, v5.16b, v6.16b
+	tbl		v3.16b, {v12.16b-v15.16b}, v4.16b
+	tbx		v31.16b, {v24.16b-v27.16b}, v5.16b
+
+	eor		v28.16b, v28.16b, v0.16b
+	eor		v29.16b, v29.16b, v1.16b
+	eor		v30.16b, v30.16b, v2.16b
+	eor		v31.16b, v31.16b, v3.16b
+	st1		{v28.16b-v31.16b}, [x1]
+	b		.Lout
+ENDPROC(chacha_4block_xor_neon)
+
+	.section	".rodata", "a", %progbits
+	.align		L1_CACHE_SHIFT
+.Lpermute:
+	.set		.Li, 0
+	.rept		192
+	.byte		(.Li - 64)
+	.set		.Li, .Li + 1
+	.endr
+
+CTRINC:	.word		1, 2, 3, 4
 ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
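
(Reader's note, not part of the patch: CTRINC changes from {0, 1, 2, 3} to
{1, 2, 3, 4} because the rewritten routine processes five blocks per call.
With state[12] = c on entry, the block computed in the scalar registers uses
counter c, the four NEON blocks use counters c + 1 through c + 4, and the
glue code then advances state[12] by 5.)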
diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c
new file mode 100644
index 0000000000000000000000000000000000000000..bece1d85bd816ad3103fd99375e24b2ec21e4c19
--- /dev/null
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -0,0 +1,198 @@
+/*
+ * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
+				      int nrounds);
+asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
+				       int nrounds, int bytes);
+asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+
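+/*
+ * Process 'bytes' bytes of data: full batches go to chacha_4block_xor_neon(),
+ * which emits up to five blocks per call (hence the 5 * CHACHA_BLOCK_SIZE
+ * stride), while a final chunk of at most one block is bounced through a
+ * stack buffer so short tails never read or write past the ends of src/dst.
+ */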
+static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
+			  int bytes, int nrounds)
+{
+	while (bytes > 0) {
+		int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
+
+		if (l <= CHACHA_BLOCK_SIZE) {
+			u8 buf[CHACHA_BLOCK_SIZE];
+
+			memcpy(buf, src, l);
+			chacha_block_xor_neon(state, buf, buf, nrounds);
+			memcpy(dst, buf, l);
+			state[12] += 1;
+			break;
+		}
+		chacha_4block_xor_neon(state, dst, src, nrounds, l);
+		bytes -= CHACHA_BLOCK_SIZE * 5;
+		src += CHACHA_BLOCK_SIZE * 5;
+		dst += CHACHA_BLOCK_SIZE * 5;
+		state[12] += 5;
+	}
+}
+
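+/*
+ * Common request walker for both ChaCha and XChaCha: the state is set up
+ * from the key and IV, and every walk step except the last is rounded down
+ * to the walk stride so the NEON code only ever sees whole batches.
+ */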
+static int chacha_neon_stream_xor(struct skcipher_request *req,
+				  struct chacha_ctx *ctx, u8 *iv)
+{
+	struct skcipher_walk walk;
+	u32 state[16];
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, false);
+
+	crypto_chacha_init(state, ctx, iv);
+
+	while (walk.nbytes > 0) {
+		unsigned int nbytes = walk.nbytes;
+
+		if (nbytes < walk.total)
+			nbytes = rounddown(nbytes, walk.stride);
+
+		kernel_neon_begin();
+		chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
+			      nbytes, ctx->nrounds);
+		kernel_neon_end();
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+	}
+
+	return err;
+}
+
+static int chacha_neon(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
+		return crypto_chacha_crypt(req);
+
+	return chacha_neon_stream_xor(req, ctx, req->iv);
+}
+
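+/*
+ * XChaCha: HChaCha is run over the first 128 bits of the extended nonce to
+ * derive a one-time subkey, then a regular ChaCha IV is assembled from the
+ * rest of req->iv (bytes 24-31 give the stream position, bytes 16-23 the
+ * remaining 64 nonce bits) before falling through to the stream-xor path.
+ */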
+static int xchacha_neon(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct chacha_ctx subctx;
+	u32 state[16];
+	u8 real_iv[16];
+
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
+		return crypto_xchacha_crypt(req);
+
+	crypto_chacha_init(state, ctx, req->iv);
+
+	kernel_neon_begin();
+	hchacha_block_neon(state, subctx.key, ctx->nrounds);
+	kernel_neon_end();
+	subctx.nrounds = ctx->nrounds;
+
+	memcpy(&real_iv[0], req->iv + 24, 8);
+	memcpy(&real_iv[8], req->iv + 16, 8);
+	return chacha_neon_stream_xor(req, &subctx, real_iv);
+}
+
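+/*
+ * chunksize stays at one ChaCha block (the algorithm's real granularity),
+ * while walksize asks the skcipher walk for five-block spans so that
+ * chacha_doneon() can hand full batches to the NEON code whenever enough
+ * data is available.
+ */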
+static struct skcipher_alg algs[] = {
+	{
+		.base.cra_name		= "chacha20",
+		.base.cra_driver_name	= "chacha20-neon",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= CHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.walksize		= 5 * CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= chacha_neon,
+		.decrypt		= chacha_neon,
+	}, {
+		.base.cra_name		= "xchacha20",
+		.base.cra_driver_name	= "xchacha20-neon",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.walksize		= 5 * CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= xchacha_neon,
+		.decrypt		= xchacha_neon,
+	}, {
+		.base.cra_name		= "xchacha12",
+		.base.cra_driver_name	= "xchacha12-neon",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.walksize		= 5 * CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha12_setkey,
+		.encrypt		= xchacha_neon,
+		.decrypt		= xchacha_neon,
+	}
+};
+
+static int __init chacha_simd_mod_init(void)
+{
+	if (!(elf_hwcap & HWCAP_ASIMD))
+		return -ENODEV;
+
+	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit chacha_simd_mod_fini(void)
+{
+	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
+
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-neon");
diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c
deleted file mode 100644
index 727579c93dedbdce1584818a8021b66a4a5cb4f8..0000000000000000000000000000000000000000
--- a/arch/arm64/crypto/chacha20-neon-glue.c
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
- *
- * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
-asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
-
-static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
-			    unsigned int bytes)
-{
-	u8 buf[CHACHA20_BLOCK_SIZE];
-
-	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
-		kernel_neon_begin();
-		chacha20_4block_xor_neon(state, dst, src);
-		kernel_neon_end();
-		bytes -= CHACHA20_BLOCK_SIZE * 4;
-		src += CHACHA20_BLOCK_SIZE * 4;
-		dst += CHACHA20_BLOCK_SIZE * 4;
-		state[12] += 4;
-	}
-
-	if (!bytes)
-		return;
-
-	kernel_neon_begin();
-	while (bytes >= CHACHA20_BLOCK_SIZE) {
-		chacha20_block_xor_neon(state, dst, src);
-		bytes -= CHACHA20_BLOCK_SIZE;
-		src += CHACHA20_BLOCK_SIZE;
-		dst += CHACHA20_BLOCK_SIZE;
-		state[12]++;
-	}
-	if (bytes) {
-		memcpy(buf, src, bytes);
-		chacha20_block_xor_neon(state, buf, buf);
-		memcpy(dst, buf, bytes);
-	}
-	kernel_neon_end();
-}
-
-static int chacha20_neon(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	u32 state[16];
-	int err;
-
-	if (!may_use_simd() || req->cryptlen <= CHACHA20_BLOCK_SIZE)
-		return crypto_chacha20_crypt(req);
-
-	err = skcipher_walk_virt(&walk, req, false);
-
-	crypto_chacha20_init(state, ctx, walk.iv);
-
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
-
-		if (nbytes < walk.total)
-			nbytes = round_down(nbytes, walk.stride);
-
-		chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
-				nbytes);
-		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
-	}
-
-	return err;
-}
-
-static struct skcipher_alg alg = {
-	.base.cra_name		= "chacha20",
-	.base.cra_driver_name	= "chacha20-neon",
-	.base.cra_priority	= 300,
-	.base.cra_blocksize	= 1,
-	.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
-	.base.cra_module	= THIS_MODULE,
-
-	.min_keysize		= CHACHA20_KEY_SIZE,
-	.max_keysize		= CHACHA20_KEY_SIZE,
-	.ivsize			= CHACHA20_IV_SIZE,
-	.chunksize		= CHACHA20_BLOCK_SIZE,
-	.walksize		= 4 * CHACHA20_BLOCK_SIZE,
-	.setkey			= crypto_chacha20_setkey,
-	.encrypt		= chacha20_neon,
-	.decrypt		= chacha20_neon,
-};
-
-static int __init chacha20_simd_mod_init(void)
-{
-	if (!(elf_hwcap & HWCAP_ASIMD))
-		return -ENODEV;
-
-	return crypto_register_skcipher(&alg);
-}
-
-static void __exit chacha20_simd_mod_fini(void)
-{
-	crypto_unregister_skcipher(&alg);
-}
-
-module_init(chacha20_simd_mod_init);
-module_exit(chacha20_simd_mod_fini);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("chacha20");
diff --git a/arch/arm64/crypto/nh-neon-core.S b/arch/arm64/crypto/nh-neon-core.S
new file mode 100644
index 0000000000000000000000000000000000000000..e05570c38de7621658313ef401fe46e708847594
--- /dev/null
+++ b/arch/arm64/crypto/nh-neon-core.S
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, ARM64 NEON accelerated version
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+	KEY		.req	x0
+	MESSAGE		.req	x1
+	MESSAGE_LEN	.req	x2
+	HASH		.req	x3
+
+	PASS0_SUMS	.req	v0
+	PASS1_SUMS	.req	v1
+	PASS2_SUMS	.req	v2
+	PASS3_SUMS	.req	v3
+	K0		.req	v4
+	K1		.req	v5
+	K2		.req	v6
+	K3		.req	v7
+	T0		.req	v8
+	T1		.req	v9
+	T2		.req	v10
+	T3		.req	v11
+	T4		.req	v12
+	T5		.req	v13
+	T6		.req	v14
+	T7		.req	v15
+
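+// One 16-byte NH stride: the four 32-bit message words are added (mod 2^32)
+// to a different key stride for each of the four passes, and each pass
+// accumulates two widening products into its 2x64-bit sum register:
+//
+//   sums_p[0] += (u64)(m0 + kp0) * (m2 + kp2)
+//   sums_p[1] += (u64)(m1 + kp1) * (m3 + kp3)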
+.macro _nh_stride	k0, k1, k2, k3
+
+	// Load next message stride
+	ld1		{T3.16b}, [MESSAGE], #16
+
+	// Load next key stride
+	ld1		{\k3\().4s}, [KEY], #16
+
+	// Add message words to key words
+	add		T0.4s, T3.4s, \k0\().4s
+	add		T1.4s, T3.4s, \k1\().4s
+	add		T2.4s, T3.4s, \k2\().4s
+	add		T3.4s, T3.4s, \k3\().4s
+
+	// Multiply 32x32 => 64 and accumulate
+	mov		T4.d[0], T0.d[1]
+	mov		T5.d[0], T1.d[1]
+	mov		T6.d[0], T2.d[1]
+	mov		T7.d[0], T3.d[1]
+	umlal		PASS0_SUMS.2d, T0.2s, T4.2s
+	umlal		PASS1_SUMS.2d, T1.2s, T5.2s
+	umlal		PASS2_SUMS.2d, T2.2s, T6.2s
+	umlal		PASS3_SUMS.2d, T3.2s, T7.2s
+.endm
+
+/*
+ * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
+ *		u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
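+//
+// The main loop consumes 64 bytes (four strides) per iteration, rotating the
+// key registers so each _nh_stride only has to load one new key vector; up
+// to three leftover 16-byte strides are then handled one at a time before
+// the per-pass sums are folded into the 32-byte hash.
+//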
+ENTRY(nh_neon)
+
+	ld1		{K0.4s,K1.4s}, [KEY], #32
+	  movi		PASS0_SUMS.2d, #0
+	  movi		PASS1_SUMS.2d, #0
+	ld1		{K2.4s}, [KEY], #16
+	  movi		PASS2_SUMS.2d, #0
+	  movi		PASS3_SUMS.2d, #0
+
+	subs		MESSAGE_LEN, MESSAGE_LEN, #64
+	blt		.Lloop4_done
+.Lloop4:
+	_nh_stride	K0, K1, K2, K3
+	_nh_stride	K1, K2, K3, K0
+	_nh_stride	K2, K3, K0, K1
+	_nh_stride	K3, K0, K1, K2
+	subs		MESSAGE_LEN, MESSAGE_LEN, #64
+	bge		.Lloop4
+
+.Lloop4_done:
+	ands		MESSAGE_LEN, MESSAGE_LEN, #63
+	beq		.Ldone
+	_nh_stride	K0, K1, K2, K3
+
+	subs		MESSAGE_LEN, MESSAGE_LEN, #16
+	beq		.Ldone
+	_nh_stride	K1, K2, K3, K0
+
+	subs		MESSAGE_LEN, MESSAGE_LEN, #16
+	beq		.Ldone
+	_nh_stride	K2, K3, K0, K1
+
+.Ldone:
+	// Sum the accumulators for each pass, then store the sums to 'hash'
+	addp		T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
+	addp		T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
+	st1		{T0.16b,T1.16b}, [HASH]
+	ret
+ENDPROC(nh_neon)
diff --git a/arch/arm64/crypto/nhpoly1305-neon-glue.c b/arch/arm64/crypto/nhpoly1305-neon-glue.c
new file mode 100644
index 0000000000000000000000000000000000000000..22cc32ac9448dba73febc4ad447d5a8144307a4c
--- /dev/null
+++ b/arch/arm64/crypto/nhpoly1305-neon-glue.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ * (ARM64 NEON accelerated version)
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/hash.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/module.h>
+
+asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len,
+			u8 hash[NH_HASH_BYTES]);
+
+/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
+static void _nh_neon(const u32 *key, const u8 *message, size_t message_len,
+		     __le64 hash[NH_NUM_PASSES])
+{
+	nh_neon(key, message, message_len, (u8 *)hash);
+}
+
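+/*
+ * Updates shorter than 64 bytes (or made in a context where NEON is not
+ * usable) fall back to the generic C implementation; larger updates are fed
+ * to the NEON helper in chunks of at most PAGE_SIZE, bounding the amount of
+ * work done inside each kernel_neon_begin()/kernel_neon_end() section.
+ */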
+static int nhpoly1305_neon_update(struct shash_desc *desc,
+				  const u8 *src, unsigned int srclen)
+{
+	if (srclen < 64 || !may_use_simd())
+		return crypto_nhpoly1305_update(desc, src, srclen);
+
+	do {
+		unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+
+		kernel_neon_begin();
+		crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon);
+		kernel_neon_end();
+		src += n;
+		srclen -= n;
+	} while (srclen);
+	return 0;
+}
+
+static struct shash_alg nhpoly1305_alg = {
+	.base.cra_name		= "nhpoly1305",
+	.base.cra_driver_name	= "nhpoly1305-neon",
+	.base.cra_priority	= 200,
+	.base.cra_ctxsize	= sizeof(struct nhpoly1305_key),
+	.base.cra_module	= THIS_MODULE,
+	.digestsize		= POLY1305_DIGEST_SIZE,
+	.init			= crypto_nhpoly1305_init,
+	.update			= nhpoly1305_neon_update,
+	.final			= crypto_nhpoly1305_final,
+	.setkey			= crypto_nhpoly1305_setkey,
+	.descsize		= sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+	if (!(elf_hwcap & HWCAP_ASIMD))
+		return -ENODEV;
+
+	return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+	crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (NEON-accelerated)");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-neon");
diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c
index 812d9498d97be5776aeaee0f0c5831cadf40d693..dd456725189f23be3b02e330d7494ab58ba26fe2 100644
--- a/arch/s390/crypto/aes_s390.c
+++ b/arch/s390/crypto/aes_s390.c
@@ -137,7 +137,7 @@ static int fallback_init_cip(struct crypto_tfm *tfm)
 	struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);
 
 	sctx->fallback.cip = crypto_alloc_cipher(name, 0,
-			CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK);
+						 CRYPTO_ALG_NEED_FALLBACK);
 
 	if (IS_ERR(sctx->fallback.cip)) {
 		pr_err("Allocating AES fallback algorithm %s failed\n",
diff --git a/arch/sparc/crypto/aes_glue.c b/arch/sparc/crypto/aes_glue.c
index 3cd4f6b198b65665046101184f579b51138a688d..a9b8b0b94a8d4201cbc44a4c271733e739288d78 100644
--- a/arch/sparc/crypto/aes_glue.c
+++ b/arch/sparc/crypto/aes_glue.c
@@ -476,11 +476,6 @@ static bool __init sparc64_has_aes_opcode(void)
 
 static int __init aes_sparc64_mod_init(void)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(algs); i++)
-		INIT_LIST_HEAD(&algs[i].cra_list);
-
 	if (sparc64_has_aes_opcode()) {
 		pr_info("Using sparc64 aes opcodes optimized AES implementation\n");
 		return crypto_register_algs(algs, ARRAY_SIZE(algs));
diff --git a/arch/sparc/crypto/camellia_glue.c b/arch/sparc/crypto/camellia_glue.c
index 561a84d93cf682a400a7555862f065f1fb04c84c..900d5c617e83b53de0892b35ee942c4e3cb7a8bf 100644
--- a/arch/sparc/crypto/camellia_glue.c
+++ b/arch/sparc/crypto/camellia_glue.c
@@ -299,11 +299,6 @@ static bool __init sparc64_has_camellia_opcode(void)
 
 static int __init camellia_sparc64_mod_init(void)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(algs); i++)
-		INIT_LIST_HEAD(&algs[i].cra_list);
-
 	if (sparc64_has_camellia_opcode()) {
 		pr_info("Using sparc64 camellia opcodes optimized CAMELLIA implementation\n");
 		return crypto_register_algs(algs, ARRAY_SIZE(algs));
diff --git a/arch/sparc/crypto/des_glue.c b/arch/sparc/crypto/des_glue.c
index 61af794aa2d31d5df27d0a318ac8b8f9d605637b..56499ea39fd36d5358981fcd1afafdb6d03b9471 100644
--- a/arch/sparc/crypto/des_glue.c
+++ b/arch/sparc/crypto/des_glue.c
@@ -510,11 +510,6 @@ static bool __init sparc64_has_des_opcode(void)
 
 static int __init des_sparc64_mod_init(void)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(algs); i++)
-		INIT_LIST_HEAD(&algs[i].cra_list);
-
 	if (sparc64_has_des_opcode()) {
 		pr_info("Using sparc64 des opcodes optimized DES implementation\n");
 		return crypto_register_algs(algs, ARRAY_SIZE(algs));
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index a4b0007a54e17c8db9b1710873612575d7c6e107..45734e1cf96720c07eefef5c99e36bfa161cb54a 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -8,6 +8,7 @@ OBJECT_FILES_NON_STANDARD := y
 avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
 avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
 				$(comma)4)$(comma)%ymm2,yes,no)
+avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
 sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
 sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
 
@@ -23,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
-obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
+obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
@@ -46,6 +47,9 @@ obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o
 obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
 obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
 
+obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
+obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
+
 # These modules require assembler to support AVX.
 ifeq ($(avx_supported),yes)
 	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
@@ -74,7 +78,7 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
-chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
+chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o
 serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
 
 aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
@@ -84,6 +88,8 @@ aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o
 morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
 morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o
 
+nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
+
 ifeq ($(avx_supported),yes)
 	camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
 					camellia_aesni_avx_glue.o
@@ -97,10 +103,16 @@ endif
 
 ifeq ($(avx2_supported),yes)
 	camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
-	chacha20-x86_64-y += chacha20-avx2-x86_64.o
+	chacha-x86_64-y += chacha-avx2-x86_64.o
 	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
 
 	morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
+
+	nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o
+endif
+
+ifeq ($(avx512_supported),yes)
+	chacha-x86_64-y += chacha-avx512vl-x86_64.o
 endif
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 1985ea0b551bf9bd825b05ea64be0319e4b02366..91c039ab56999d914f15d1796b70ddf94e2d6fe5 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -182,43 +182,30 @@ aad_shift_arr:
 .text
 
 
-##define the fields of the gcm aes context
-#{
-#        u8 expanded_keys[16*11] store expanded keys
-#        u8 shifted_hkey_1[16]   store HashKey <<1 mod poly here
-#        u8 shifted_hkey_2[16]   store HashKey^2 <<1 mod poly here
-#        u8 shifted_hkey_3[16]   store HashKey^3 <<1 mod poly here
-#        u8 shifted_hkey_4[16]   store HashKey^4 <<1 mod poly here
-#        u8 shifted_hkey_5[16]   store HashKey^5 <<1 mod poly here
-#        u8 shifted_hkey_6[16]   store HashKey^6 <<1 mod poly here
-#        u8 shifted_hkey_7[16]   store HashKey^7 <<1 mod poly here
-#        u8 shifted_hkey_8[16]   store HashKey^8 <<1 mod poly here
-#        u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
-#        u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
-#        u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
-#        u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
-#        u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
-#        u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
-#        u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
-#        u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
-#} gcm_ctx#
-
-HashKey        = 16*11   # store HashKey <<1 mod poly here
-HashKey_2      = 16*12   # store HashKey^2 <<1 mod poly here
-HashKey_3      = 16*13   # store HashKey^3 <<1 mod poly here
-HashKey_4      = 16*14   # store HashKey^4 <<1 mod poly here
-HashKey_5      = 16*15   # store HashKey^5 <<1 mod poly here
-HashKey_6      = 16*16   # store HashKey^6 <<1 mod poly here
-HashKey_7      = 16*17   # store HashKey^7 <<1 mod poly here
-HashKey_8      = 16*18   # store HashKey^8 <<1 mod poly here
-HashKey_k      = 16*19   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
-HashKey_2_k    = 16*20   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
-HashKey_3_k    = 16*21   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
-HashKey_4_k    = 16*22   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
-HashKey_5_k    = 16*23   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
-HashKey_6_k    = 16*24   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
-HashKey_7_k    = 16*25   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
-HashKey_8_k    = 16*26   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+#define AadHash 16*0
+#define AadLen 16*1
+#define InLen (16*1)+8
+#define PBlockEncKey 16*2
+#define OrigIV 16*3
+#define CurCount 16*4
+#define PBlockLen 16*5
+
+HashKey        = 16*6   # store HashKey <<1 mod poly here
+HashKey_2      = 16*7   # store HashKey^2 <<1 mod poly here
+HashKey_3      = 16*8   # store HashKey^3 <<1 mod poly here
+HashKey_4      = 16*9   # store HashKey^4 <<1 mod poly here
+HashKey_5      = 16*10   # store HashKey^5 <<1 mod poly here
+HashKey_6      = 16*11   # store HashKey^6 <<1 mod poly here
+HashKey_7      = 16*12   # store HashKey^7 <<1 mod poly here
+HashKey_8      = 16*13   # store HashKey^8 <<1 mod poly here
+HashKey_k      = 16*14   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
+HashKey_2_k    = 16*15   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+HashKey_3_k    = 16*16   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+HashKey_4_k    = 16*17   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+HashKey_5_k    = 16*18   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+HashKey_6_k    = 16*19   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+HashKey_7_k    = 16*20   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+HashKey_8_k    = 16*21   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
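+
+# All of the offsets above index into the gcm context structure passed in
+# arg2 (see the AadHash(arg2)/HashKey(arg2) references below); the HashKey_*
+# slots cache powers of the hash key, and their Karatsuba terms, for GHASH.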
 
 #define arg1 %rdi
 #define arg2 %rsi
@@ -229,6 +216,8 @@ HashKey_8_k    = 16*26   # store XOR of HashKey^8 <<1 mod poly here (for Karatsu
 #define arg7 STACK_OFFSET+8*1(%r14)
 #define arg8 STACK_OFFSET+8*2(%r14)
 #define arg9 STACK_OFFSET+8*3(%r14)
+#define arg10 STACK_OFFSET+8*4(%r14)
+#define keysize 2*15*16(arg1)
 
 i = 0
 j = 0
@@ -267,1356 +256,1619 @@ VARIABLE_OFFSET = 16*8
 # Utility Macros
 ################################
 
+.macro FUNC_SAVE
+        # the number of pushes must equal STACK_OFFSET
+        push    %r12
+        push    %r13
+        push    %r14
+        push    %r15
+
+        mov     %rsp, %r14
+
+
+
+        sub     $VARIABLE_OFFSET, %rsp
+        and     $~63, %rsp                    # align rsp to 64 bytes
+.endm
+
+.macro FUNC_RESTORE
+        mov     %r14, %rsp
+
+        pop     %r15
+        pop     %r14
+        pop     %r13
+        pop     %r12
+.endm
+
 # Encryption of a single block
-.macro ENCRYPT_SINGLE_BLOCK XMM0
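+# \REP is the number of middle rounds (9/11/13 for AES-128/192/256); the
+# final round key is taken from 16*(\REP+1)(arg1).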
+.macro ENCRYPT_SINGLE_BLOCK REP XMM0
                 vpxor    (arg1), \XMM0, \XMM0
-		i = 1
-		setreg
-.rep 9
+               i = 1
+               setreg
+.rep \REP
                 vaesenc  16*i(arg1), \XMM0, \XMM0
-		i = (i+1)
-		setreg
+               i = (i+1)
+               setreg
 .endr
-                vaesenclast 16*10(arg1), \XMM0, \XMM0
+                vaesenclast 16*i(arg1), \XMM0, \XMM0
 .endm
 
-#ifdef CONFIG_AS_AVX
-###############################################################################
-# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
-# Input: A and B (128-bits each, bit-reflected)
-# Output: C = A*B*x mod poly, (i.e. >>1 )
-# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
-# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
-###############################################################################
-.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
+# combined for GCM encrypt and decrypt functions
+# clobbering all xmm registers
+# clobbering r10, r11, r12, r13, r14, r15
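+# The INITIAL_BLOCKS/GHASH_8_ENCRYPT_8_PARALLEL/GHASH_LAST_8/GHASH_MUL
+# helpers are passed in by name so this one skeleton can be instantiated for
+# the different AVX implementations; \REP selects the AES round count.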
+.macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
+        vmovdqu AadHash(arg2), %xmm8
+        vmovdqu  HashKey(arg2), %xmm13      # xmm13 = HashKey
+        add arg5, InLen(arg2)
 
-        vpshufd         $0b01001110, \GH, \T2
-        vpshufd         $0b01001110, \HK, \T3
-        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
-        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
+        # initialize the data pointer offset to zero
+        xor     %r11d, %r11d
 
-        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
-        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
-        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
-        vpxor           \GH, \T2,\T2
-        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
+        PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
+        sub %r11, arg5
 
-        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
-        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
-        vpxor           \T3, \GH, \GH
-        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
+        mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
+        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
 
-        #first phase of the reduction
-        vpslld  $31, \GH, \T2                   # packed right shifting << 31
-        vpslld  $30, \GH, \T3                   # packed right shifting shift << 30
-        vpslld  $25, \GH, \T4                   # packed right shifting shift << 25
+        mov     %r13, %r12
+        shr     $4, %r12
+        and     $7, %r12
+        jz      _initial_num_blocks_is_0\@
 
-        vpxor   \T3, \T2, \T2                   # xor the shifted versions
-        vpxor   \T4, \T2, \T2
+        cmp     $7, %r12
+        je      _initial_num_blocks_is_7\@
+        cmp     $6, %r12
+        je      _initial_num_blocks_is_6\@
+        cmp     $5, %r12
+        je      _initial_num_blocks_is_5\@
+        cmp     $4, %r12
+        je      _initial_num_blocks_is_4\@
+        cmp     $3, %r12
+        je      _initial_num_blocks_is_3\@
+        cmp     $2, %r12
+        je      _initial_num_blocks_is_2\@
 
-        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
+        jmp     _initial_num_blocks_is_1\@
 
-        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
-        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
+_initial_num_blocks_is_7\@:
+        \INITIAL_BLOCKS  \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*7, %r13
+        jmp     _initial_blocks_encrypted\@
 
-        #second phase of the reduction
+_initial_num_blocks_is_6\@:
+        \INITIAL_BLOCKS  \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*6, %r13
+        jmp     _initial_blocks_encrypted\@
 
-        vpsrld  $1,\GH, \T2                     # packed left shifting >> 1
-        vpsrld  $2,\GH, \T3                     # packed left shifting >> 2
-        vpsrld  $7,\GH, \T4                     # packed left shifting >> 7
-        vpxor   \T3, \T2, \T2                   # xor the shifted versions
-        vpxor   \T4, \T2, \T2
+_initial_num_blocks_is_5\@:
+        \INITIAL_BLOCKS  \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*5, %r13
+        jmp     _initial_blocks_encrypted\@
 
-        vpxor   \T5, \T2, \T2
-        vpxor   \T2, \GH, \GH
-        vpxor   \T1, \GH, \GH                   # the result is in GH
+_initial_num_blocks_is_4\@:
+        \INITIAL_BLOCKS  \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*4, %r13
+        jmp     _initial_blocks_encrypted\@
 
+_initial_num_blocks_is_3\@:
+        \INITIAL_BLOCKS  \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*3, %r13
+        jmp     _initial_blocks_encrypted\@
 
-.endm
+_initial_num_blocks_is_2\@:
+        \INITIAL_BLOCKS  \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*2, %r13
+        jmp     _initial_blocks_encrypted\@
 
-.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
+_initial_num_blocks_is_1\@:
+        \INITIAL_BLOCKS  \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+        sub     $16*1, %r13
+        jmp     _initial_blocks_encrypted\@
 
-        # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
-        vmovdqa  \HK, \T5
+_initial_num_blocks_is_0\@:
+        \INITIAL_BLOCKS  \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
 
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_k(arg1)
 
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
-        vmovdqa  \T5, HashKey_2(arg1)                    #  [HashKey_2] = HashKey^2<<1 mod poly
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_2_k(arg1)
+_initial_blocks_encrypted\@:
+        cmp     $0, %r13
+        je      _zero_cipher_left\@
 
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
-        vmovdqa  \T5, HashKey_3(arg1)
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_3_k(arg1)
+        sub     $128, %r13
+        je      _eight_cipher_left\@
 
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
-        vmovdqa  \T5, HashKey_4(arg1)
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_4_k(arg1)
 
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
-        vmovdqa  \T5, HashKey_5(arg1)
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_5_k(arg1)
 
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
-        vmovdqa  \T5, HashKey_6(arg1)
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_6_k(arg1)
 
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
-        vmovdqa  \T5, HashKey_7(arg1)
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_7_k(arg1)
+        vmovd   %xmm9, %r15d
+        and     $255, %r15d
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
 
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
-        vmovdqa  \T5, HashKey_8(arg1)
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqa  \T1, HashKey_8_k(arg1)
 
-.endm
+_encrypt_by_8_new\@:
+        cmp     $(255-8), %r15d
+        jg      _encrypt_by_8\@
 
-## if a = number of total plaintext bytes
-## b = floor(a/16)
-## num_initial_blocks = b mod 4#
-## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
-## r10, r11, r12, rax are clobbered
-## arg1, arg2, arg3, r14 are used as a pointer only, not modified
 
-.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
-	i = (8-\num_initial_blocks)
-	j = 0
-	setreg
 
-	mov     arg6, %r10                      # r10 = AAD
-	mov     arg7, %r12                      # r12 = aadLen
+        add     $8, %r15b
+        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
+        add     $128, %r11
+        sub     $128, %r13
+        jne     _encrypt_by_8_new\@
 
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        jmp     _eight_cipher_left\@
 
-	mov     %r12, %r11
+_encrypt_by_8\@:
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        add     $8, %r15b
+        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        add     $128, %r11
+        sub     $128, %r13
+        jne     _encrypt_by_8_new\@
 
-	vpxor   reg_j, reg_j, reg_j
-	vpxor   reg_i, reg_i, reg_i
-	cmp     $16, %r11
-	jl      _get_AAD_rest8\@
-_get_AAD_blocks\@:
-	vmovdqu (%r10), reg_i
-	vpshufb SHUF_MASK(%rip), reg_i, reg_i
-	vpxor   reg_i, reg_j, reg_j
-	GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6
-	add     $16, %r10
-	sub     $16, %r12
-	sub     $16, %r11
-	cmp     $16, %r11
-	jge     _get_AAD_blocks\@
-	vmovdqu reg_j, reg_i
-	cmp     $0, %r11
-	je      _get_AAD_done\@
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
 
-	vpxor   reg_i, reg_i, reg_i
 
-	/* read the last <16B of AAD. since we have at least 4B of
-	data right after the AAD (the ICV, and maybe some CT), we can
-	read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\@:
-	cmp     $4, %r11
-	jle     _get_AAD_rest4\@
-	movq    (%r10), \T1
-	add     $8, %r10
-	sub     $8, %r11
-	vpslldq $8, \T1, \T1
-	vpsrldq $8, reg_i, reg_i
-	vpxor   \T1, reg_i, reg_i
-	jmp     _get_AAD_rest8\@
-_get_AAD_rest4\@:
-	cmp     $0, %r11
-	jle      _get_AAD_rest0\@
-	mov     (%r10), %eax
-	movq    %rax, \T1
-	add     $4, %r10
-	sub     $4, %r11
-	vpslldq $12, \T1, \T1
-	vpsrldq $4, reg_i, reg_i
-	vpxor   \T1, reg_i, reg_i
-_get_AAD_rest0\@:
-	/* finalize: shift out the extra bytes we read, and align
-	left. since pslldq can only shift by an immediate, we use
-	vpshufb and an array of shuffle masks */
-	movq    %r12, %r11
-	salq    $4, %r11
-	movdqu  aad_shift_arr(%r11), \T1
-	vpshufb \T1, reg_i, reg_i
-_get_AAD_rest_final\@:
-	vpshufb SHUF_MASK(%rip), reg_i, reg_i
-	vpxor   reg_j, reg_i, reg_i
-	GHASH_MUL_AVX       reg_i, \T2, \T1, \T3, \T4, \T5, \T6
 
-_get_AAD_done\@:
-	# initialize the data pointer offset as zero
-	xor     %r11d, %r11d
 
-	# start AES for num_initial_blocks blocks
-	mov     arg5, %rax                     # rax = *Y0
-	vmovdqu (%rax), \CTR                   # CTR = Y0
-	vpshufb SHUF_MASK(%rip), \CTR, \CTR
+_eight_cipher_left\@:
+        \GHASH_LAST_8    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
 
 
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-                vpaddd  ONE(%rip), \CTR, \CTR		# INCR Y0
-                vmovdqa \CTR, reg_i
-                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
-	i = (i+1)
-	setreg
-.endr
+_zero_cipher_left\@:
+        vmovdqu %xmm14, AadHash(arg2)
+        vmovdqu %xmm9, CurCount(arg2)
 
-	vmovdqa  (arg1), \T_key
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-                vpxor   \T_key, reg_i, reg_i
-	i = (i+1)
-	setreg
-.endr
+        # check whether a partial (<16 byte) block remains
+        mov     arg5, %r13
+        and     $15, %r13                            # r13 = (arg5 mod 16)
 
-	j = 1
-	setreg
-.rep 9
-	vmovdqa  16*j(arg1), \T_key
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-        vaesenc \T_key, reg_i, reg_i
-	i = (i+1)
-	setreg
-.endr
+        je      _multiple_of_16_bytes\@
 
-	j = (j+1)
-	setreg
-.endr
+        # handle the last <16 Byte block separately
 
+        mov %r13, PBlockLen(arg2)
 
-	vmovdqa  16*10(arg1), \T_key
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-        vaesenclast      \T_key, reg_i, reg_i
-	i = (i+1)
-	setreg
-.endr
+        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
+        vmovdqu %xmm9, CurCount(arg2)
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
 
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-                vmovdqu (arg3, %r11), \T1
-                vpxor   \T1, reg_i, reg_i
-                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for num_initial_blocks blocks
-                add     $16, %r11
-.if  \ENC_DEC == DEC
-                vmovdqa \T1, reg_i
-.endif
-                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
-	i = (i+1)
-	setreg
-.endr
+        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
+        vmovdqu %xmm9, PBlockEncKey(arg2)
 
+        cmp $16, arg5
+        jge _large_enough_update\@
 
-	i = (8-\num_initial_blocks)
-	j = (9-\num_initial_blocks)
-	setreg
+        lea (arg4,%r11,1), %r10
+        mov %r13, %r12
 
-.rep \num_initial_blocks
-        vpxor    reg_i, reg_j, reg_j
-        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
-	i = (i+1)
-	j = (j+1)
-	setreg
-.endr
-        # XMM8 has the combined result here
+        READ_PARTIAL_BLOCK %r10 %r12 %xmm1
 
-        vmovdqa  \XMM8, TMP1(%rsp)
-        vmovdqa  \XMM8, \T3
+        lea     SHIFT_MASK+16(%rip), %r12
+        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
+                                                     # able to shift 16-r13 bytes (r13 is the
+                                                     # number of bytes in plaintext mod 16)
 
-        cmp     $128, %r13
-        jl      _initial_blocks_done\@                  # no need for precomputed constants
+        jmp _final_ghash_mul\@
 
-###############################################################################
-# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM1
-                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
+_large_enough_update\@:
+        sub $16, %r11
+        add %r13, %r11
 
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM2
-                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
+        # receive the last <16 Byte block
+        vmovdqu	(arg4, %r11, 1), %xmm1
 
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM3
-                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
+        sub	%r13, %r11
+        add	$16, %r11
 
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM4
-                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
+        lea	SHIFT_MASK+16(%rip), %r12
+        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+        # (r13 is the number of bytes in plaintext mod 16)
+        sub	%r13, %r12
+        # get the appropriate shuffle mask
+        vmovdqu	(%r12), %xmm2
+        # shift right 16-r13 bytes
+        vpshufb  %xmm2, %xmm1, %xmm1
 
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM5
-                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
+_final_ghash_mul\@:
+        .if  \ENC_DEC ==  DEC
+        vmovdqa %xmm1, %xmm2
+        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
+        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
+						     # mask out top 16-r13 bytes of xmm9
+        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
+        vpand   %xmm1, %xmm2, %xmm2
+        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
+        vpxor   %xmm2, %xmm14, %xmm14
 
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM6
-                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
+        vmovdqu %xmm14, AadHash(arg2)
+        .else
+        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
+        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
+						     # mask out top 16-r13 bytes of xmm9
+        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        vpxor   %xmm9, %xmm14, %xmm14
 
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM7
-                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
+        vmovdqu %xmm14, AadHash(arg2)
+        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
+        .endif
 
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM8
-                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
 
-                vmovdqa  (arg1), \T_key
-                vpxor    \T_key, \XMM1, \XMM1
-                vpxor    \T_key, \XMM2, \XMM2
-                vpxor    \T_key, \XMM3, \XMM3
-                vpxor    \T_key, \XMM4, \XMM4
-                vpxor    \T_key, \XMM5, \XMM5
-                vpxor    \T_key, \XMM6, \XMM6
-                vpxor    \T_key, \XMM7, \XMM7
-                vpxor    \T_key, \XMM8, \XMM8
+        #############################
+        # output r13 Bytes
+        vmovq   %xmm9, %rax
+        cmp     $8, %r13
+        jle     _less_than_8_bytes_left\@
 
-		i = 1
-		setreg
-.rep    9       # do 9 rounds
-                vmovdqa  16*i(arg1), \T_key
-                vaesenc  \T_key, \XMM1, \XMM1
-                vaesenc  \T_key, \XMM2, \XMM2
-                vaesenc  \T_key, \XMM3, \XMM3
-                vaesenc  \T_key, \XMM4, \XMM4
-                vaesenc  \T_key, \XMM5, \XMM5
-                vaesenc  \T_key, \XMM6, \XMM6
-                vaesenc  \T_key, \XMM7, \XMM7
-                vaesenc  \T_key, \XMM8, \XMM8
-		i = (i+1)
-		setreg
-.endr
+        mov     %rax, (arg3 , %r11)
+        add     $8, %r11
+        vpsrldq $8, %xmm9, %xmm9
+        vmovq   %xmm9, %rax
+        sub     $8, %r13
 
+_less_than_8_bytes_left\@:
+        movb    %al, (arg3 , %r11)
+        add     $1, %r11
+        shr     $8, %rax
+        sub     $1, %r13
+        jne     _less_than_8_bytes_left\@
+        #############################
 
-                vmovdqa  16*i(arg1), \T_key
-                vaesenclast  \T_key, \XMM1, \XMM1
-                vaesenclast  \T_key, \XMM2, \XMM2
-                vaesenclast  \T_key, \XMM3, \XMM3
-                vaesenclast  \T_key, \XMM4, \XMM4
-                vaesenclast  \T_key, \XMM5, \XMM5
-                vaesenclast  \T_key, \XMM6, \XMM6
-                vaesenclast  \T_key, \XMM7, \XMM7
-                vaesenclast  \T_key, \XMM8, \XMM8
+_multiple_of_16_bytes\@:
+.endm
 
-                vmovdqu  (arg3, %r11), \T1
-                vpxor    \T1, \XMM1, \XMM1
-                vmovdqu  \XMM1, (arg2 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM1
-                .endif
 
-                vmovdqu  16*1(arg3, %r11), \T1
-                vpxor    \T1, \XMM2, \XMM2
-                vmovdqu  \XMM2, 16*1(arg2 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM2
-                .endif
+# GCM_COMPLETE finishes the GHASH of any last partial block and computes the tag
+# Output: Authentication Tag (AUTH_TAG)
+# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
+.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
+        vmovdqu AadHash(arg2), %xmm14
+        vmovdqu HashKey(arg2), %xmm13
 
-                vmovdqu  16*2(arg3, %r11), \T1
-                vpxor    \T1, \XMM3, \XMM3
-                vmovdqu  \XMM3, 16*2(arg2 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM3
-                .endif
+        mov PBlockLen(arg2), %r12
+        cmp $0, %r12
+        je _partial_done\@
 
-                vmovdqu  16*3(arg3, %r11), \T1
-                vpxor    \T1, \XMM4, \XMM4
-                vmovdqu  \XMM4, 16*3(arg2 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM4
-                .endif
+        # GHASH computation for the last <16 Byte block
+        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 
-                vmovdqu  16*4(arg3, %r11), \T1
-                vpxor    \T1, \XMM5, \XMM5
-                vmovdqu  \XMM5, 16*4(arg2 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM5
-                .endif
+_partial_done\@:
+        mov AadLen(arg2), %r12                          # r12 = aadLen (number of bytes)
+        shl     $3, %r12                             # convert into number of bits
+        vmovd   %r12d, %xmm15                        # len(A) in xmm15
 
-                vmovdqu  16*5(arg3, %r11), \T1
-                vpxor    \T1, \XMM6, \XMM6
-                vmovdqu  \XMM6, 16*5(arg2 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM6
-                .endif
+        mov InLen(arg2), %r12
+        shl     $3, %r12                        # len(C) in bits (*8)
+        vmovq   %r12, %xmm1
+        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
+        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
 
-                vmovdqu  16*6(arg3, %r11), \T1
-                vpxor    \T1, \XMM7, \XMM7
-                vmovdqu  \XMM7, 16*6(arg2 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM7
-                .endif
+        vpxor   %xmm15, %xmm14, %xmm14
+        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
+        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
 
-                vmovdqu  16*7(arg3, %r11), \T1
-                vpxor    \T1, \XMM8, \XMM8
-                vmovdqu  \XMM8, 16*7(arg2 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM8
-                .endif
+        vmovdqu OrigIV(arg2), %xmm9
 
-                add     $128, %r11
+        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Y0)
 
-                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
-                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
-                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
+        vpxor   %xmm14, %xmm9, %xmm9
 
-###############################################################################
 
-_initial_blocks_done\@:
 
-.endm
+_return_T\@:
+        mov     \AUTH_TAG, %r10              # r10 = authTag
+        mov     \AUTH_TAG_LEN, %r11              # r11 = auth_tag_len
 
-# encrypt 8 blocks at a time
-# ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg2, arg3 are used as pointers only, not modified
-# r11 is the data offset value
-.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
+        cmp     $16, %r11
+        je      _T_16\@
 
-        vmovdqa \XMM1, \T2
-        vmovdqa \XMM2, TMP2(%rsp)
-        vmovdqa \XMM3, TMP3(%rsp)
-        vmovdqa \XMM4, TMP4(%rsp)
-        vmovdqa \XMM5, TMP5(%rsp)
-        vmovdqa \XMM6, TMP6(%rsp)
-        vmovdqa \XMM7, TMP7(%rsp)
-        vmovdqa \XMM8, TMP8(%rsp)
+        cmp     $8, %r11
+        jl      _T_4\@
 
-.if \loop_idx == in_order
-                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
-                vpaddd  ONE(%rip), \XMM1, \XMM2
-                vpaddd  ONE(%rip), \XMM2, \XMM3
-                vpaddd  ONE(%rip), \XMM3, \XMM4
-                vpaddd  ONE(%rip), \XMM4, \XMM5
-                vpaddd  ONE(%rip), \XMM5, \XMM6
-                vpaddd  ONE(%rip), \XMM6, \XMM7
-                vpaddd  ONE(%rip), \XMM7, \XMM8
-                vmovdqa \XMM8, \CTR
+_T_8\@:
+        vmovq   %xmm9, %rax
+        mov     %rax, (%r10)
+        add     $8, %r10
+        sub     $8, %r11
+        vpsrldq $8, %xmm9, %xmm9
+        cmp     $0, %r11
+        je     _return_T_done\@
+_T_4\@:
+        vmovd   %xmm9, %eax
+        mov     %eax, (%r10)
+        add     $4, %r10
+        sub     $4, %r11
+        vpsrldq     $4, %xmm9, %xmm9
+        cmp     $0, %r11
+        je     _return_T_done\@
+_T_123\@:
+        vmovd     %xmm9, %eax
+        cmp     $2, %r11
+        jl     _T_1\@
+        mov     %ax, (%r10)
+        cmp     $2, %r11
+        je     _return_T_done\@
+        add     $2, %r10
+        sar     $16, %eax
+_T_1\@:
+        mov     %al, (%r10)
+        jmp     _return_T_done\@
 
-                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
-.else
-                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
-                vpaddd  ONEf(%rip), \XMM1, \XMM2
-                vpaddd  ONEf(%rip), \XMM2, \XMM3
-                vpaddd  ONEf(%rip), \XMM3, \XMM4
-                vpaddd  ONEf(%rip), \XMM4, \XMM5
-                vpaddd  ONEf(%rip), \XMM5, \XMM6
-                vpaddd  ONEf(%rip), \XMM6, \XMM7
-                vpaddd  ONEf(%rip), \XMM7, \XMM8
-                vmovdqa \XMM8, \CTR
-.endif
+_T_16\@:
+        vmovdqu %xmm9, (%r10)
 
+_return_T_done\@:
+.endm
 
-        #######################################################################
-
-                vmovdqu (arg1), \T1
-                vpxor   \T1, \XMM1, \XMM1
-                vpxor   \T1, \XMM2, \XMM2
-                vpxor   \T1, \XMM3, \XMM3
-                vpxor   \T1, \XMM4, \XMM4
-                vpxor   \T1, \XMM5, \XMM5
-                vpxor   \T1, \XMM6, \XMM6
-                vpxor   \T1, \XMM7, \XMM7
-                vpxor   \T1, \XMM8, \XMM8
-
-        #######################################################################
-
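+# CALC_AAD_HASH: GHASH the additional authenticated data. Whole 16-byte
+# blocks are absorbed directly; a trailing partial block is assembled from
+# 8/4-byte loads, aligned via aad_shift_arr and hashed, and the running
+# hash is stored in AadHash(arg2).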
+.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
 
+	mov     \AAD, %r10                      # r10 = AAD
+	mov     \AADLEN, %r12                      # r12 = aadLen
 
 
+	mov     %r12, %r11
 
-                vmovdqu 16*1(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
+	vpxor   \T8, \T8, \T8
+	vpxor   \T7, \T7, \T7
+	cmp     $16, %r11
+	jl      _get_AAD_rest8\@
+_get_AAD_blocks\@:
+	vmovdqu (%r10), \T7
+	vpshufb SHUF_MASK(%rip), \T7, \T7
+	vpxor   \T7, \T8, \T8
+	\GHASH_MUL       \T8, \T2, \T1, \T3, \T4, \T5, \T6
+	add     $16, %r10
+	sub     $16, %r12
+	sub     $16, %r11
+	cmp     $16, %r11
+	jge     _get_AAD_blocks\@
+	vmovdqu \T8, \T7
+	cmp     $0, %r11
+	je      _get_AAD_done\@
 
-                vmovdqu 16*2(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
+	vpxor   \T7, \T7, \T7
 
+	/* read the last <16B of AAD. since we have at least 4B of
+	data right after the AAD (the ICV, and maybe some CT), we can
+	read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\@:
+	cmp     $4, %r11
+	jle     _get_AAD_rest4\@
+	movq    (%r10), \T1
+	add     $8, %r10
+	sub     $8, %r11
+	vpslldq $8, \T1, \T1
+	vpsrldq $8, \T7, \T7
+	vpxor   \T1, \T7, \T7
+	jmp     _get_AAD_rest8\@
+_get_AAD_rest4\@:
+	cmp     $0, %r11
+	jle      _get_AAD_rest0\@
+	mov     (%r10), %eax
+	movq    %rax, \T1
+	add     $4, %r10
+	sub     $4, %r11
+	vpslldq $12, \T1, \T1
+	vpsrldq $4, \T7, \T7
+	vpxor   \T1, \T7, \T7
+_get_AAD_rest0\@:
+	/* finalize: shift out the extra bytes we read, and align
+	left. since pslldq can only shift by an immediate, we use
+	vpshufb and an array of shuffle masks */
+	movq    %r12, %r11
+	salq    $4, %r11
+	vmovdqu  aad_shift_arr(%r11), \T1
+	vpshufb \T1, \T7, \T7
+_get_AAD_rest_final\@:
+	vpshufb SHUF_MASK(%rip), \T7, \T7
+	vpxor   \T8, \T7, \T7
+	\GHASH_MUL       \T7, \T2, \T1, \T3, \T4, \T5, \T6
 
-        #######################################################################
+_get_AAD_done\@:
+        vmovdqu \T7, AadHash(arg2)
+.endm
 
-        vmovdqa         HashKey_8(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
-        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
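+# INIT: set up the context in arg2 (AAD length, running input length, partial
+# block state, original IV and byte-swapped counter block) and derive
+# HashKey<<1 mod poly from the raw hash key in arg4 for use by GHASH.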
+.macro INIT GHASH_MUL PRECOMPUTE
+        mov arg6, %r11
+        mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
+        xor %r11d, %r11d
+        mov %r11, InLen(arg2) # ctx_data.in_length = 0
 
-        vpshufd         $0b01001110, \T2, \T6
-        vpxor           \T2, \T6, \T6
+        mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
+        mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
+        mov arg3, %rax
+        movdqu (%rax), %xmm0
+        movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
 
-        vmovdqa         HashKey_8_k(arg1), \T5
-        vpclmulqdq      $0x00, \T5, \T6, \T6
+        vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
+        movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
 
-                vmovdqu 16*3(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-        vmovdqa         TMP2(%rsp), \T1
-        vmovdqa         HashKey_7(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_7_k(arg1), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
-
-                vmovdqu 16*4(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
+        vmovdqu  (arg4), %xmm6              # xmm6 = HashKey
 
+        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
+        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
+        vmovdqa  %xmm6, %xmm2
+        vpsllq   $1, %xmm6, %xmm6
+        vpsrlq   $63, %xmm2, %xmm2
+        vmovdqa  %xmm2, %xmm1
+        vpslldq  $8, %xmm2, %xmm2
+        vpsrldq  $8, %xmm1, %xmm1
+        vpor     %xmm2, %xmm6, %xmm6
+        #reduction
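+        # the compare with TWOONE produces an all-ones mask only when the
+        # shifted-out bit was set; ANDing it with POLY yields POLY or zero,
+        # i.e. a branchless conditional xor of the reduction polynomial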
+        vpshufd  $0b00100100, %xmm1, %xmm2
+        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
+        vpand    POLY(%rip), %xmm2, %xmm2
+        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
         #######################################################################
+        vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly
 
-        vmovdqa         TMP3(%rsp), \T1
-        vmovdqa         HashKey_6(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_6_k(arg1), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
-
-                vmovdqu 16*5(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-        vmovdqa         TMP4(%rsp), \T1
-        vmovdqa         HashKey_5(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_5_k(arg1), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
-
-                vmovdqu 16*6(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-
-        vmovdqa         TMP5(%rsp), \T1
-        vmovdqa         HashKey_4(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_4_k(arg1), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
+        CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
 
-                vmovdqu 16*7(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-        vmovdqa         TMP6(%rsp), \T1
-        vmovdqa         HashKey_3(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_3_k(arg1), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
+        \PRECOMPUTE  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
+.endm
 
 
-                vmovdqu 16*8(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
+# Reads DLEN bytes starting at DPTR and stores in XMMDst
+# where 0 < DLEN < 16
+# Clobbers %rax, DLEN
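+# e.g. DLEN = 11: one 8-byte read fills the low qword, the remaining 3 bytes
+# are gathered one at a time into the high qword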
+.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
+        vpxor \XMMDst, \XMMDst, \XMMDst
+
+        cmp $8, \DLEN
+        jl _read_lt8_\@
+        mov (\DPTR), %rax
+        vpinsrq $0, %rax, \XMMDst, \XMMDst
+        sub $8, \DLEN
+        jz _done_read_partial_block_\@
+        xor %eax, %eax
+_read_next_byte_\@:
+        shl $8, %rax
+        mov 7(\DPTR, \DLEN, 1), %al
+        dec \DLEN
+        jnz _read_next_byte_\@
+        vpinsrq $1, %rax, \XMMDst, \XMMDst
+        jmp _done_read_partial_block_\@
+_read_lt8_\@:
+        xor %eax, %eax
+_read_next_byte_lt8_\@:
+        shl $8, %rax
+        mov -1(\DPTR, \DLEN, 1), %al
+        dec \DLEN
+        jnz _read_next_byte_lt8_\@
+        vpinsrq $0, %rax, \XMMDst, \XMMDst
+_done_read_partial_block_\@:
+.endm
 
-        vmovdqa         TMP7(%rsp), \T1
-        vmovdqa         HashKey_2(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
+# PARTIAL_BLOCK: Handles encryption/decryption and the tag update for partial
+# blocks carried over between update calls.
+# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
+# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
+# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
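+# A partial block only exists when a previous update call ended in the middle
+# of a 16-byte block; PBlockLen(arg2) holds the number of bytes of that block
+# already processed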
+.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
+        AAD_HASH ENC_DEC
+        mov 	PBlockLen(arg2), %r13
+        cmp	$0, %r13
+        je	_partial_block_done_\@	# Leave Macro if no partial blocks
+        # Read in input data without over reading
+        cmp	$16, \PLAIN_CYPH_LEN
+        jl	_fewer_than_16_bytes_\@
+        vmovdqu	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
+        jmp	_data_read_\@
+
+_fewer_than_16_bytes_\@:
+        lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
+        mov	\PLAIN_CYPH_LEN, %r12
+        READ_PARTIAL_BLOCK %r10 %r12 %xmm1
+
+        mov PBlockLen(arg2), %r13
+
+_data_read_\@:				# Finished reading in data
+
+        vmovdqu	PBlockEncKey(arg2), %xmm9
+        vmovdqu	HashKey(arg2), %xmm13
+
+        lea	SHIFT_MASK(%rip), %r12
+
+        # adjust the shuffle mask pointer to be able to shift r13 bytes
+	# (16-r13 is the number of bytes in plaintext mod 16)
+        add	%r13, %r12
+        vmovdqu	(%r12), %xmm2		# get the appropriate shuffle mask
+        vpshufb %xmm2, %xmm9, %xmm9		# shift right r13 bytes
+
+.if  \ENC_DEC ==  DEC
+        vmovdqa	%xmm1, %xmm3
+        pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
+
+        mov	\PLAIN_CYPH_LEN, %r10
+        add	%r13, %r10
+        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
+        sub	$16, %r10
+        # Determine if the partial block is not being filled and
+        # shift the mask accordingly
+        jge	_no_extra_mask_1_\@
+        sub	%r10, %r12
+_no_extra_mask_1_\@:
+
+        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
+        # get the appropriate mask to mask out bottom r13 bytes of xmm9
+        vpand	%xmm1, %xmm9, %xmm9		# mask out bottom r13 bytes of xmm9
+
+        vpand	%xmm1, %xmm3, %xmm3
+        vmovdqa	SHUF_MASK(%rip), %xmm10
+        vpshufb	%xmm10, %xmm3, %xmm3
+        vpshufb	%xmm2, %xmm3, %xmm3
+        vpxor	%xmm3, \AAD_HASH, \AAD_HASH
+
+        cmp	$0, %r10
+        jl	_partial_incomplete_1_\@
+
+        # GHASH computation for the last <16 Byte block
+        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+        xor	%eax,%eax
+
+        mov	%rax, PBlockLen(arg2)
+        jmp	_dec_done_\@
+_partial_incomplete_1_\@:
+        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
+_dec_done_\@:
+        vmovdqu	\AAD_HASH, AadHash(arg2)
+.else
+        vpxor	%xmm1, %xmm9, %xmm9			# Plaintext XOR E(K, Yn)
+
+        mov	\PLAIN_CYPH_LEN, %r10
+        add	%r13, %r10
+        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
+        sub	$16, %r10
+        # Determine if the partial block is not being filled and
+        # shift the mask accordingly
+        jge	_no_extra_mask_2_\@
+        sub	%r10, %r12
+_no_extra_mask_2_\@:
+
+        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
+        # get the appropriate mask to mask out bottom r13 bytes of xmm9
+        vpand	%xmm1, %xmm9, %xmm9
+
+        vmovdqa	SHUF_MASK(%rip), %xmm1
+        vpshufb %xmm1, %xmm9, %xmm9
+        vpshufb %xmm2, %xmm9, %xmm9
+        vpxor	%xmm9, \AAD_HASH, \AAD_HASH
+
+        cmp	$0, %r10
+        jl	_partial_incomplete_2_\@
+
+        # GHASH computation for the last <16 Byte block
+        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+        xor	%eax,%eax
+
+        mov	%rax, PBlockLen(arg2)
+        jmp	_encode_done_\@
+_partial_incomplete_2_\@:
+        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
+_encode_done_\@:
+        vmovdqu	\AAD_HASH, AadHash(arg2)
+
+        vmovdqa	SHUF_MASK(%rip), %xmm10
+        # shuffle xmm9 back to output as ciphertext
+        vpshufb	%xmm10, %xmm9, %xmm9
+        vpshufb	%xmm2, %xmm9, %xmm9
+.endif
+        # output encrypted Bytes
+        cmp	$0, %r10
+        jl	_partial_fill_\@
+        mov	%r13, %r12
+        mov	$16, %r13
+        # Set r13 to be the number of bytes to write out
+        sub	%r12, %r13
+        jmp	_count_set_\@
+_partial_fill_\@:
+        mov	\PLAIN_CYPH_LEN, %r13
+_count_set_\@:
+        vmovdqa	%xmm9, %xmm0
+        vmovq	%xmm0, %rax
+        cmp	$8, %r13
+        jle	_less_than_8_bytes_left_\@
+
+        mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
+        add	$8, \DATA_OFFSET
+        psrldq	$8, %xmm0
+        vmovq	%xmm0, %rax
+        sub	$8, %r13
+_less_than_8_bytes_left_\@:
+        movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
+        add	$1, \DATA_OFFSET
+        shr	$8, %rax
+        sub	$1, %r13
+        jne	_less_than_8_bytes_left_\@
+_partial_block_done_\@:
+.endm # PARTIAL_BLOCK
 
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_2_k(arg1), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
+#ifdef CONFIG_AS_AVX
+###############################################################################
+# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+# Input: A and B (128-bits each, bit-reflected)
+# Output: C = A*B*x mod poly, (i.e. >>1 )
+# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+###############################################################################
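+# The multiply uses the Karatsuba method: a1*b1, a0*b0 and (a1+a0)*(b1+b0)
+# are computed with three VPCLMULQDQ instructions and the 256-bit product is
+# then reduced modulo the GCM polynomial in two phases.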
+.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
 
-        #######################################################################
+        vpshufd         $0b01001110, \GH, \T2
+        vpshufd         $0b01001110, \HK, \T3
+        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
+        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
 
-                vmovdqu 16*9(arg1), \T5
-                vaesenc \T5, \XMM1, \XMM1
-                vaesenc \T5, \XMM2, \XMM2
-                vaesenc \T5, \XMM3, \XMM3
-                vaesenc \T5, \XMM4, \XMM4
-                vaesenc \T5, \XMM5, \XMM5
-                vaesenc \T5, \XMM6, \XMM6
-                vaesenc \T5, \XMM7, \XMM7
-                vaesenc \T5, \XMM8, \XMM8
+        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
+        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
+        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
+        vpxor           \GH, \T2,\T2
+        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
 
-        vmovdqa         TMP8(%rsp), \T1
-        vmovdqa         HashKey(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
+        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
+        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
+        vpxor           \T3, \GH, \GH
+        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
 
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqa         HashKey_k(arg1), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
+        #first phase of the reduction
+        vpslld  $31, \GH, \T2                   # packed left shifting << 31
+        vpslld  $30, \GH, \T3                   # packed left shifting << 30
+        vpslld  $25, \GH, \T4                   # packed left shifting << 25
 
-        vpxor           \T4, \T6, \T6
-        vpxor           \T7, \T6, \T6
+        vpxor   \T3, \T2, \T2                   # xor the shifted versions
+        vpxor   \T4, \T2, \T2
 
-                vmovdqu 16*10(arg1), \T5
+        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
 
-	i = 0
-	j = 1
-	setreg
-.rep 8
-		vpxor	16*i(arg3, %r11), \T5, \T2
-                .if \ENC_DEC == ENC
-                vaesenclast     \T2, reg_j, reg_j
-                .else
-                vaesenclast     \T2, reg_j, \T3
-                vmovdqu 16*i(arg3, %r11), reg_j
-                vmovdqu \T3, 16*i(arg2, %r11)
-                .endif
-	i = (i+1)
-	j = (j+1)
-	setreg
-.endr
-	#######################################################################
+        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
+        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
 
+        #second phase of the reduction
 
-	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
-	vpsrldq	$8, \T6, \T6				# shift-R T2 2 DWs
-	vpxor	\T3, \T7, \T7
-	vpxor	\T4, \T6, \T6				# accumulate the results in T6:T7
+        vpsrld  $1,\GH, \T2                     # packed right shifting >> 1
+        vpsrld  $2,\GH, \T3                     # packed right shifting >> 2
+        vpsrld  $7,\GH, \T4                     # packed right shifting >> 7
+        vpxor   \T3, \T2, \T2                   # xor the shifted versions
+        vpxor   \T4, \T2, \T2
 
+        vpxor   \T5, \T2, \T2
+        vpxor   \T2, \GH, \GH
+        vpxor   \T1, \GH, \GH                   # the result is in GH
 
 
-	#######################################################################
-	#first phase of the reduction
-	#######################################################################
-        vpslld  $31, \T7, \T2                           # packed right shifting << 31
-        vpslld  $30, \T7, \T3                           # packed right shifting shift << 30
-        vpslld  $25, \T7, \T4                           # packed right shifting shift << 25
+.endm
 
-        vpxor   \T3, \T2, \T2                           # xor the shifted versions
-        vpxor   \T4, \T2, \T2
+.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
 
-        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
+        # HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
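+        # precompute HashKey^2 .. HashKey^8 (each <<1 mod poly) and their
+        # Karatsuba halves so the 8-block loop can hash eight blocks with
+        # independent multiplies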
+        vmovdqa  \HK, \T5
 
-        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
-        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
-	#######################################################################
-                .if \ENC_DEC == ENC
-		vmovdqu	 \XMM1,	16*0(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM2,	16*1(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM3,	16*2(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM4,	16*3(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM5,	16*4(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM6,	16*5(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM7,	16*6(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM8,	16*7(arg2,%r11)		# Write to the Ciphertext buffer
-                .endif
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqu  \T1, HashKey_k(arg2)
 
-	#######################################################################
-	#second phase of the reduction
-        vpsrld  $1, \T7, \T2                            # packed left shifting >> 1
-        vpsrld  $2, \T7, \T3                            # packed left shifting >> 2
-        vpsrld  $7, \T7, \T4                            # packed left shifting >> 7
-        vpxor   \T3, \T2, \T2                           # xor the shifted versions
-        vpxor   \T4, \T2, \T2
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
+        vmovdqu  \T5, HashKey_2(arg2)                    #  [HashKey_2] = HashKey^2<<1 mod poly
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqu  \T1, HashKey_2_k(arg2)
 
-        vpxor   \T1, \T2, \T2
-        vpxor   \T2, \T7, \T7
-        vpxor   \T7, \T6, \T6                           # the result is in T6
-	#######################################################################
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
+        vmovdqu  \T5, HashKey_3(arg2)
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqu  \T1, HashKey_3_k(arg2)
 
-		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
+        vmovdqu  \T5, HashKey_4(arg2)
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqu  \T1, HashKey_4_k(arg2)
 
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
+        vmovdqu  \T5, HashKey_5(arg2)
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqu  \T1, HashKey_5_k(arg2)
 
-	vpxor	\T6, \XMM1, \XMM1
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
+        vmovdqu  \T5, HashKey_6(arg2)
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqu  \T1, HashKey_6_k(arg2)
 
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
+        vmovdqu  \T5, HashKey_7(arg2)
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqu  \T1, HashKey_7_k(arg2)
 
+        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
+        vmovdqu  \T5, HashKey_8(arg2)
+        vpshufd  $0b01001110, \T5, \T1
+        vpxor    \T5, \T1, \T1
+        vmovdqu  \T1, HashKey_8_k(arg2)
 
 .endm
 
+## if a = number of total plaintext bytes
+## b = floor(a/16)
+## num_initial_blocks = b mod 8
+## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
+## r10, r11, r12, rax are clobbered
+## arg1, arg3, arg4, r14 are used as pointers only, not modified
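+## num_initial_blocks counter blocks are prepared, encrypted and written out
+## here; their (byte-swapped) ciphertext is folded into the running GHASH
+## before the main 8-block loop starts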
 
-# GHASH the last 4 ciphertext blocks.
-.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
+.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
+	i = (8-\num_initial_blocks)
+	setreg
+        vmovdqu AadHash(arg2), reg_i
 
-        ## Karatsuba Method
+	# start AES for num_initial_blocks blocks
+	vmovdqu CurCount(arg2), \CTR
 
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+                vpaddd  ONE(%rip), \CTR, \CTR		# INCR Y0
+                vmovdqa \CTR, reg_i
+                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
+	i = (i+1)
+	setreg
+.endr
 
-        vpshufd         $0b01001110, \XMM1, \T2
-        vpxor           \XMM1, \T2, \T2
-        vmovdqa         HashKey_8(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \XMM1, \T6
-        vpclmulqdq      $0x00, \T5, \XMM1, \T7
+	vmovdqa  (arg1), \T_key
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+                vpxor   \T_key, reg_i, reg_i
+	i = (i+1)
+	setreg
+.endr
 
-        vmovdqa         HashKey_8_k(arg1), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \XMM1
+       j = 1
+       setreg
+.rep \REP
+       vmovdqa  16*j(arg1), \T_key
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+        vaesenc \T_key, reg_i, reg_i
+	i = (i+1)
+	setreg
+.endr
 
-        ######################
+       j = (j+1)
+       setreg
+.endr
 
-        vpshufd         $0b01001110, \XMM2, \T2
-        vpxor           \XMM2, \T2, \T2
-        vmovdqa         HashKey_7(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \XMM2, \T4
-        vpxor           \T4, \T6, \T6
+	vmovdqa  16*j(arg1), \T_key
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+        vaesenclast      \T_key, reg_i, reg_i
+	i = (i+1)
+	setreg
+.endr
 
-        vpclmulqdq      $0x00, \T5, \XMM2, \T4
-        vpxor           \T4, \T7, \T7
+	i = (9-\num_initial_blocks)
+	setreg
+.rep \num_initial_blocks
+                vmovdqu (arg4, %r11), \T1
+                vpxor   \T1, reg_i, reg_i
+                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for num_initial_blocks blocks
+                add     $16, %r11
+.if  \ENC_DEC == DEC
+                vmovdqa \T1, reg_i
+.endif
+                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
+	i = (i+1)
+	setreg
+.endr
 
-        vmovdqa         HashKey_7_k(arg1), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-        vpxor           \T2, \XMM1, \XMM1
 
-        ######################
+	i = (8-\num_initial_blocks)
+	j = (9-\num_initial_blocks)
+	setreg
 
-        vpshufd         $0b01001110, \XMM3, \T2
-        vpxor           \XMM3, \T2, \T2
-        vmovdqa         HashKey_6(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \XMM3, \T4
-        vpxor           \T4, \T6, \T6
+.rep \num_initial_blocks
+        vpxor    reg_i, reg_j, reg_j
+        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
+	i = (i+1)
+	j = (j+1)
+	setreg
+.endr
+        # XMM8 has the combined result here
 
-        vpclmulqdq      $0x00, \T5, \XMM3, \T4
-        vpxor           \T4, \T7, \T7
+        vmovdqa  \XMM8, TMP1(%rsp)
+        vmovdqa  \XMM8, \T3
 
-        vmovdqa         HashKey_6_k(arg1), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-        vpxor           \T2, \XMM1, \XMM1
+        cmp     $128, %r13
+        jl      _initial_blocks_done\@                  # no need for precomputed constants
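+        # at least 128 bytes remain: set up and encrypt 8 more counter blocks to
+        # prime the 8-block pipeline; the accumulated hash saved in TMP1 is later
+        # xored into the first of the resulting ciphertext blocks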
 
-        ######################
+###############################################################################
+# HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM1
+                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
 
-        vpshufd         $0b01001110, \XMM4, \T2
-        vpxor           \XMM4, \T2, \T2
-        vmovdqa         HashKey_5(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \XMM4, \T4
-        vpxor           \T4, \T6, \T6
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM2
+                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
 
-        vpclmulqdq      $0x00, \T5, \XMM4, \T4
-        vpxor           \T4, \T7, \T7
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM3
+                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
 
-        vmovdqa         HashKey_5_k(arg1), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-        vpxor           \T2, \XMM1, \XMM1
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM4
+                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
 
-        ######################
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM5
+                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
 
-        vpshufd         $0b01001110, \XMM5, \T2
-        vpxor           \XMM5, \T2, \T2
-        vmovdqa         HashKey_4(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \XMM5, \T4
-        vpxor           \T4, \T6, \T6
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM6
+                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
 
-        vpclmulqdq      $0x00, \T5, \XMM5, \T4
-        vpxor           \T4, \T7, \T7
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM7
+                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
 
-        vmovdqa         HashKey_4_k(arg1), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-        vpxor           \T2, \XMM1, \XMM1
+                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
+                vmovdqa  \CTR, \XMM8
+                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
 
-        ######################
+                vmovdqa  (arg1), \T_key
+                vpxor    \T_key, \XMM1, \XMM1
+                vpxor    \T_key, \XMM2, \XMM2
+                vpxor    \T_key, \XMM3, \XMM3
+                vpxor    \T_key, \XMM4, \XMM4
+                vpxor    \T_key, \XMM5, \XMM5
+                vpxor    \T_key, \XMM6, \XMM6
+                vpxor    \T_key, \XMM7, \XMM7
+                vpxor    \T_key, \XMM8, \XMM8
 
-        vpshufd         $0b01001110, \XMM6, \T2
-        vpxor           \XMM6, \T2, \T2
-        vmovdqa         HashKey_3(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \XMM6, \T4
-        vpxor           \T4, \T6, \T6
+               i = 1
+               setreg
+.rep    \REP       # do REP rounds
+                vmovdqa  16*i(arg1), \T_key
+                vaesenc  \T_key, \XMM1, \XMM1
+                vaesenc  \T_key, \XMM2, \XMM2
+                vaesenc  \T_key, \XMM3, \XMM3
+                vaesenc  \T_key, \XMM4, \XMM4
+                vaesenc  \T_key, \XMM5, \XMM5
+                vaesenc  \T_key, \XMM6, \XMM6
+                vaesenc  \T_key, \XMM7, \XMM7
+                vaesenc  \T_key, \XMM8, \XMM8
+               i = (i+1)
+               setreg
+.endr
 
-        vpclmulqdq      $0x00, \T5, \XMM6, \T4
-        vpxor           \T4, \T7, \T7
+                vmovdqa  16*i(arg1), \T_key
+                vaesenclast  \T_key, \XMM1, \XMM1
+                vaesenclast  \T_key, \XMM2, \XMM2
+                vaesenclast  \T_key, \XMM3, \XMM3
+                vaesenclast  \T_key, \XMM4, \XMM4
+                vaesenclast  \T_key, \XMM5, \XMM5
+                vaesenclast  \T_key, \XMM6, \XMM6
+                vaesenclast  \T_key, \XMM7, \XMM7
+                vaesenclast  \T_key, \XMM8, \XMM8
 
-        vmovdqa         HashKey_3_k(arg1), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-        vpxor           \T2, \XMM1, \XMM1
+                vmovdqu  (arg4, %r11), \T1
+                vpxor    \T1, \XMM1, \XMM1
+                vmovdqu  \XMM1, (arg3 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM1
+                .endif
 
-        ######################
+                vmovdqu  16*1(arg4, %r11), \T1
+                vpxor    \T1, \XMM2, \XMM2
+                vmovdqu  \XMM2, 16*1(arg3 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM2
+                .endif
 
-        vpshufd         $0b01001110, \XMM7, \T2
-        vpxor           \XMM7, \T2, \T2
-        vmovdqa         HashKey_2(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \XMM7, \T4
-        vpxor           \T4, \T6, \T6
+                vmovdqu  16*2(arg4, %r11), \T1
+                vpxor    \T1, \XMM3, \XMM3
+                vmovdqu  \XMM3, 16*2(arg3 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM3
+                .endif
 
-        vpclmulqdq      $0x00, \T5, \XMM7, \T4
-        vpxor           \T4, \T7, \T7
+                vmovdqu  16*3(arg4, %r11), \T1
+                vpxor    \T1, \XMM4, \XMM4
+                vmovdqu  \XMM4, 16*3(arg3 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM4
+                .endif
 
-        vmovdqa         HashKey_2_k(arg1), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-        vpxor           \T2, \XMM1, \XMM1
+                vmovdqu  16*4(arg4, %r11), \T1
+                vpxor    \T1, \XMM5, \XMM5
+                vmovdqu  \XMM5, 16*4(arg3 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM5
+                .endif
 
-        ######################
+                vmovdqu  16*5(arg4, %r11), \T1
+                vpxor    \T1, \XMM6, \XMM6
+                vmovdqu  \XMM6, 16*5(arg3 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM6
+                .endif
 
-        vpshufd         $0b01001110, \XMM8, \T2
-        vpxor           \XMM8, \T2, \T2
-        vmovdqa         HashKey(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \XMM8, \T4
-        vpxor           \T4, \T6, \T6
+                vmovdqu  16*6(arg4, %r11), \T1
+                vpxor    \T1, \XMM7, \XMM7
+                vmovdqu  \XMM7, 16*6(arg3 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM7
+                .endif
+
+                vmovdqu  16*7(arg4, %r11), \T1
+                vpxor    \T1, \XMM8, \XMM8
+                vmovdqu  \XMM8, 16*7(arg3 , %r11)
+                .if   \ENC_DEC == DEC
+                vmovdqa  \T1, \XMM8
+                .endif
+
+                add     $128, %r11
+
+                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
+                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
+                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
+                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
 
-        vpclmulqdq      $0x00, \T5, \XMM8, \T4
-        vpxor           \T4, \T7, \T7
+###############################################################################
 
-        vmovdqa         HashKey_k(arg1), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
+_initial_blocks_done\@:
 
-        vpxor           \T2, \XMM1, \XMM1
-        vpxor           \T6, \XMM1, \XMM1
-        vpxor           \T7, \XMM1, \T2
+.endm
 
+# encrypt 8 blocks at a time
+# ghash the 8 previously encrypted ciphertext blocks
+# arg1, arg3, arg4 are used as pointers only, not modified
+# r11 is the data offset value
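+# The AES rounds for the 8 new counter blocks are interleaved with the GHASH
+# (carry-less multiply) work on the 8 previously produced ciphertext blocks to
+# hide instruction latencies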
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
 
+        vmovdqa \XMM1, \T2
+        vmovdqa \XMM2, TMP2(%rsp)
+        vmovdqa \XMM3, TMP3(%rsp)
+        vmovdqa \XMM4, TMP4(%rsp)
+        vmovdqa \XMM5, TMP5(%rsp)
+        vmovdqa \XMM6, TMP6(%rsp)
+        vmovdqa \XMM7, TMP7(%rsp)
+        vmovdqa \XMM8, TMP8(%rsp)
 
+.if \loop_idx == in_order
+                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
+                vpaddd  ONE(%rip), \XMM1, \XMM2
+                vpaddd  ONE(%rip), \XMM2, \XMM3
+                vpaddd  ONE(%rip), \XMM3, \XMM4
+                vpaddd  ONE(%rip), \XMM4, \XMM5
+                vpaddd  ONE(%rip), \XMM5, \XMM6
+                vpaddd  ONE(%rip), \XMM6, \XMM7
+                vpaddd  ONE(%rip), \XMM7, \XMM8
+                vmovdqa \XMM8, \CTR
 
-        vpslldq $8, \T2, \T4
-        vpsrldq $8, \T2, \T2
+                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
+                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
+.else
+                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
+                vpaddd  ONEf(%rip), \XMM1, \XMM2
+                vpaddd  ONEf(%rip), \XMM2, \XMM3
+                vpaddd  ONEf(%rip), \XMM3, \XMM4
+                vpaddd  ONEf(%rip), \XMM4, \XMM5
+                vpaddd  ONEf(%rip), \XMM5, \XMM6
+                vpaddd  ONEf(%rip), \XMM6, \XMM7
+                vpaddd  ONEf(%rip), \XMM7, \XMM8
+                vmovdqa \XMM8, \CTR
+.endif
 
-        vpxor   \T4, \T7, \T7
-        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
-				# the accumulated carry-less multiplications
 
         #######################################################################
-        #first phase of the reduction
-        vpslld  $31, \T7, \T2   # packed right shifting << 31
-        vpslld  $30, \T7, \T3   # packed right shifting shift << 30
-        vpslld  $25, \T7, \T4   # packed right shifting shift << 25
 
-        vpxor   \T3, \T2, \T2   # xor the shifted versions
-        vpxor   \T4, \T2, \T2
-
-        vpsrldq $4, \T2, \T1    # shift-R T1 1 DW
+                vmovdqu (arg1), \T1
+                vpxor   \T1, \XMM1, \XMM1
+                vpxor   \T1, \XMM2, \XMM2
+                vpxor   \T1, \XMM3, \XMM3
+                vpxor   \T1, \XMM4, \XMM4
+                vpxor   \T1, \XMM5, \XMM5
+                vpxor   \T1, \XMM6, \XMM6
+                vpxor   \T1, \XMM7, \XMM7
+                vpxor   \T1, \XMM8, \XMM8
 
-        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
-        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
         #######################################################################
 
 
-        #second phase of the reduction
-        vpsrld  $1, \T7, \T2    # packed left shifting >> 1
-        vpsrld  $2, \T7, \T3    # packed left shifting >> 2
-        vpsrld  $7, \T7, \T4    # packed left shifting >> 7
-        vpxor   \T3, \T2, \T2   # xor the shifted versions
-        vpxor   \T4, \T2, \T2
 
-        vpxor   \T1, \T2, \T2
-        vpxor   \T2, \T7, \T7
-        vpxor   \T7, \T6, \T6   # the result is in T6
 
-.endm
 
+                vmovdqu 16*1(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
-# combined for GCM encrypt and decrypt functions
-# clobbering all xmm registers
-# clobbering r10, r11, r12, r13, r14, r15
-.macro  GCM_ENC_DEC_AVX     ENC_DEC
+                vmovdqu 16*2(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
-        #the number of pushes must equal STACK_OFFSET
-        push    %r12
-        push    %r13
-        push    %r14
-        push    %r15
 
-        mov     %rsp, %r14
+        #######################################################################
 
+        vmovdqu         HashKey_8(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
+        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
 
+        vpshufd         $0b01001110, \T2, \T6
+        vpxor           \T2, \T6, \T6
 
+        vmovdqu         HashKey_8_k(arg2), \T5
+        vpclmulqdq      $0x00, \T5, \T6, \T6
 
-        sub     $VARIABLE_OFFSET, %rsp
-        and     $~63, %rsp                  # align rsp to 64 bytes
+                vmovdqu 16*3(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
+        vmovdqa         TMP2(%rsp), \T1
+        vmovdqu         HashKey_7(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
 
-        vmovdqu  HashKey(arg1), %xmm13      # xmm13 = HashKey
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqu         HashKey_7_k(arg2), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
 
-        mov     arg4, %r13                  # save the number of bytes of plaintext/ciphertext
-        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
+                vmovdqu 16*4(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
-        mov     %r13, %r12
-        shr     $4, %r12
-        and     $7, %r12
-        jz      _initial_num_blocks_is_0\@
+        #######################################################################
 
-        cmp     $7, %r12
-        je      _initial_num_blocks_is_7\@
-        cmp     $6, %r12
-        je      _initial_num_blocks_is_6\@
-        cmp     $5, %r12
-        je      _initial_num_blocks_is_5\@
-        cmp     $4, %r12
-        je      _initial_num_blocks_is_4\@
-        cmp     $3, %r12
-        je      _initial_num_blocks_is_3\@
-        cmp     $2, %r12
-        je      _initial_num_blocks_is_2\@
+        vmovdqa         TMP3(%rsp), \T1
+        vmovdqu         HashKey_6(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
 
-        jmp     _initial_num_blocks_is_1\@
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqu         HashKey_6_k(arg2), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
 
-_initial_num_blocks_is_7\@:
-        INITIAL_BLOCKS_AVX  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*7, %r13
-        jmp     _initial_blocks_encrypted\@
+                vmovdqu 16*5(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
-_initial_num_blocks_is_6\@:
-        INITIAL_BLOCKS_AVX  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*6, %r13
-        jmp     _initial_blocks_encrypted\@
+        vmovdqa         TMP4(%rsp), \T1
+        vmovdqu         HashKey_5(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
 
-_initial_num_blocks_is_5\@:
-        INITIAL_BLOCKS_AVX  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*5, %r13
-        jmp     _initial_blocks_encrypted\@
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqu         HashKey_5_k(arg2), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
 
-_initial_num_blocks_is_4\@:
-        INITIAL_BLOCKS_AVX  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*4, %r13
-        jmp     _initial_blocks_encrypted\@
+                vmovdqu 16*6(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
-_initial_num_blocks_is_3\@:
-        INITIAL_BLOCKS_AVX  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*3, %r13
-        jmp     _initial_blocks_encrypted\@
 
-_initial_num_blocks_is_2\@:
-        INITIAL_BLOCKS_AVX  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*2, %r13
-        jmp     _initial_blocks_encrypted\@
+        vmovdqa         TMP5(%rsp), \T1
+        vmovdqu         HashKey_4(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
 
-_initial_num_blocks_is_1\@:
-        INITIAL_BLOCKS_AVX  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*1, %r13
-        jmp     _initial_blocks_encrypted\@
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqu         HashKey_4_k(arg2), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
 
-_initial_num_blocks_is_0\@:
-        INITIAL_BLOCKS_AVX  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+                vmovdqu 16*7(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
+        vmovdqa         TMP6(%rsp), \T1
+        vmovdqu         HashKey_3(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
 
-_initial_blocks_encrypted\@:
-        cmp     $0, %r13
-        je      _zero_cipher_left\@
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqu         HashKey_3_k(arg2), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
 
-        sub     $128, %r13
-        je      _eight_cipher_left\@
 
+                vmovdqu 16*8(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
+        vmovdqa         TMP7(%rsp), \T1
+        vmovdqu         HashKey_2(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
 
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqu         HashKey_2_k(arg2), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
 
-        vmovd   %xmm9, %r15d
-        and     $255, %r15d
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        #######################################################################
 
+                vmovdqu 16*9(arg1), \T5
+                vaesenc \T5, \XMM1, \XMM1
+                vaesenc \T5, \XMM2, \XMM2
+                vaesenc \T5, \XMM3, \XMM3
+                vaesenc \T5, \XMM4, \XMM4
+                vaesenc \T5, \XMM5, \XMM5
+                vaesenc \T5, \XMM6, \XMM6
+                vaesenc \T5, \XMM7, \XMM7
+                vaesenc \T5, \XMM8, \XMM8
 
-_encrypt_by_8_new\@:
-        cmp     $(255-8), %r15d
-        jg      _encrypt_by_8\@
+        vmovdqa         TMP8(%rsp), \T1
+        vmovdqu         HashKey(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
 
+        vpshufd         $0b01001110, \T1, \T3
+        vpxor           \T1, \T3, \T3
+        vmovdqu         HashKey_k(arg2), \T5
+        vpclmulqdq      $0x10, \T5, \T3, \T3
+        vpxor           \T3, \T6, \T6
 
+        vpxor           \T4, \T6, \T6
+        vpxor           \T7, \T6, \T6
 
-        add     $8, %r15b
-        GHASH_8_ENCRYPT_8_PARALLEL_AVX      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
-        add     $128, %r11
-        sub     $128, %r13
-        jne     _encrypt_by_8_new\@
+                vmovdqu 16*10(arg1), \T5
 
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        jmp     _eight_cipher_left\@
+        i = 11
+        setreg
+.rep (\REP-9)
+
+        vaesenc \T5, \XMM1, \XMM1
+        vaesenc \T5, \XMM2, \XMM2
+        vaesenc \T5, \XMM3, \XMM3
+        vaesenc \T5, \XMM4, \XMM4
+        vaesenc \T5, \XMM5, \XMM5
+        vaesenc \T5, \XMM6, \XMM6
+        vaesenc \T5, \XMM7, \XMM7
+        vaesenc \T5, \XMM8, \XMM8
+
+        vmovdqu 16*i(arg1), \T5
+        i = i + 1
+        setreg
+.endr
 
-_encrypt_by_8\@:
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        add     $8, %r15b
-        GHASH_8_ENCRYPT_8_PARALLEL_AVX      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        add     $128, %r11
-        sub     $128, %r13
-        jne     _encrypt_by_8_new\@
+	i = 0
+	j = 1
+	setreg
+.rep 8
+		vpxor	16*i(arg4, %r11), \T5, \T2
+                .if \ENC_DEC == ENC
+                vaesenclast     \T2, reg_j, reg_j
+                .else
+                vaesenclast     \T2, reg_j, \T3
+                vmovdqu 16*i(arg4, %r11), reg_j
+                vmovdqu \T3, 16*i(arg3, %r11)
+                .endif
+	i = (i+1)
+	j = (j+1)
+	setreg
+.endr
+	#######################################################################
 
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
 
+	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
+	vpsrldq	$8, \T6, \T6				# shift-R T6 2 DWs
+	vpxor	\T3, \T7, \T7
+	vpxor	\T4, \T6, \T6				# accumulate the results in T6:T7
 
 
 
-_eight_cipher_left\@:
-        GHASH_LAST_8_AVX    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
+	#######################################################################
+	#first phase of the reduction
+	#######################################################################
+        vpslld  $31, \T7, \T2                           # packed left shifting << 31
+        vpslld  $30, \T7, \T3                           # packed left shifting << 30
+        vpslld  $25, \T7, \T4                           # packed left shifting << 25
 
+        vpxor   \T3, \T2, \T2                           # xor the shifted versions
+        vpxor   \T4, \T2, \T2
 
-_zero_cipher_left\@:
-        cmp     $16, arg4
-        jl      _only_less_than_16\@
+        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
 
-        mov     arg4, %r13
-        and     $15, %r13                            # r13 = (arg4 mod 16)
+        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
+        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
+	#######################################################################
+                .if \ENC_DEC == ENC
+		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
+                .endif
 
-        je      _multiple_of_16_bytes\@
+	#######################################################################
+	#second phase of the reduction
+        vpsrld  $1, \T7, \T2                            # packed right shifting >> 1
+        vpsrld  $2, \T7, \T3                            # packed right shifting >> 2
+        vpsrld  $7, \T7, \T4                            # packed right shifting >> 7
+        vpxor   \T3, \T2, \T2                           # xor the shifted versions
+        vpxor   \T4, \T2, \T2
 
-        # handle the last <16 Byte block seperately
+        vpxor   \T1, \T2, \T2
+        vpxor   \T2, \T7, \T7
+        vpxor   \T7, \T6, \T6                           # the result is in T6
+	#######################################################################
 
+		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
 
-        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
 
-        sub     $16, %r11
-        add     %r13, %r11
-        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
+	vpxor	\T6, \XMM1, \XMM1
 
-        lea     SHIFT_MASK+16(%rip), %r12
-        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
-						     # able to shift 16-r13 bytes (r13 is the
-						     # number of bytes in plaintext mod 16)
-        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
-        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
-        jmp     _final_ghash_mul\@
 
-_only_less_than_16\@:
-        # check for 0 length
-        mov     arg4, %r13
-        and     $15, %r13                            # r13 = (arg4 mod 16)
 
-        je      _multiple_of_16_bytes\@
+.endm
 
-        # handle the last <16 Byte block seperately
 
+# GHASH the last 8 ciphertext blocks.
+.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
 
-        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
+        ## Karatsuba Method
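+        ## each block is multiplied by the matching power of the hash key
+        ## (HashKey_8 for the oldest block down to HashKey for the newest) and the
+        ## partial products are accumulated before a single reduction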
 
 
-        lea     SHIFT_MASK+16(%rip), %r12
-        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
-						     # able to shift 16-r13 bytes (r13 is the
-						     # number of bytes in plaintext mod 16)
+        vpshufd         $0b01001110, \XMM1, \T2
+        vpxor           \XMM1, \T2, \T2
+        vmovdqu         HashKey_8(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \XMM1, \T6
+        vpclmulqdq      $0x00, \T5, \XMM1, \T7
 
-_get_last_16_byte_loop\@:
-        movb    (arg3, %r11),  %al
-        movb    %al,  TMP1 (%rsp , %r11)
-        add     $1, %r11
-        cmp     %r13,  %r11
-        jne     _get_last_16_byte_loop\@
+        vmovdqu         HashKey_8_k(arg2), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \XMM1
 
-        vmovdqu  TMP1(%rsp), %xmm1
+        ######################
 
-        sub     $16, %r11
+        vpshufd         $0b01001110, \XMM2, \T2
+        vpxor           \XMM2, \T2, \T2
+        vmovdqu         HashKey_7(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \XMM2, \T4
+        vpxor           \T4, \T6, \T6
 
-_final_ghash_mul\@:
-        .if  \ENC_DEC ==  DEC
-        vmovdqa %xmm1, %xmm2
-        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
-        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
-						     # mask out top 16-r13 bytes of xmm9
-        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
-        vpand   %xmm1, %xmm2, %xmm2
-        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
-        vpxor   %xmm2, %xmm14, %xmm14
-	#GHASH computation for the last <16 Byte block
-        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-        sub     %r13, %r11
-        add     $16, %r11
-        .else
-        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
-        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
-						     # mask out top 16-r13 bytes of xmm9
-        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        vpxor   %xmm9, %xmm14, %xmm14
-	#GHASH computation for the last <16 Byte block
-        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-        sub     %r13, %r11
-        add     $16, %r11
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
-        .endif
+        vpclmulqdq      $0x00, \T5, \XMM2, \T4
+        vpxor           \T4, \T7, \T7
 
+        vmovdqu         HashKey_7_k(arg2), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpxor           \T2, \XMM1, \XMM1
 
-        #############################
-        # output r13 Bytes
-        vmovq   %xmm9, %rax
-        cmp     $8, %r13
-        jle     _less_than_8_bytes_left\@
+        ######################
 
-        mov     %rax, (arg2 , %r11)
-        add     $8, %r11
-        vpsrldq $8, %xmm9, %xmm9
-        vmovq   %xmm9, %rax
-        sub     $8, %r13
+        vpshufd         $0b01001110, \XMM3, \T2
+        vpxor           \XMM3, \T2, \T2
+        vmovdqu         HashKey_6(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \XMM3, \T4
+        vpxor           \T4, \T6, \T6
 
-_less_than_8_bytes_left\@:
-        movb    %al, (arg2 , %r11)
-        add     $1, %r11
-        shr     $8, %rax
-        sub     $1, %r13
-        jne     _less_than_8_bytes_left\@
-        #############################
+        vpclmulqdq      $0x00, \T5, \XMM3, \T4
+        vpxor           \T4, \T7, \T7
 
-_multiple_of_16_bytes\@:
-        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
-        shl     $3, %r12                             # convert into number of bits
-        vmovd   %r12d, %xmm15                        # len(A) in xmm15
+        vmovdqu         HashKey_6_k(arg2), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpxor           \T2, \XMM1, \XMM1
 
-        shl     $3, arg4                             # len(C) in bits  (*128)
-        vmovq   arg4, %xmm1
-        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
-        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
+        ######################
 
-        vpxor   %xmm15, %xmm14, %xmm14
-        GHASH_MUL_AVX       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
-        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
+        vpshufd         $0b01001110, \XMM4, \T2
+        vpxor           \XMM4, \T2, \T2
+        vmovdqu         HashKey_5(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \XMM4, \T4
+        vpxor           \T4, \T6, \T6
 
-        mov     arg5, %rax                           # rax = *Y0
-        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
+        vpclmulqdq      $0x00, \T5, \XMM4, \T4
+        vpxor           \T4, \T7, \T7
 
-        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
+        vmovdqu         HashKey_5_k(arg2), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpxor           \T2, \XMM1, \XMM1
 
-        vpxor   %xmm14, %xmm9, %xmm9
+        ######################
 
+        vpshufd         $0b01001110, \XMM5, \T2
+        vpxor           \XMM5, \T2, \T2
+        vmovdqu         HashKey_4(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \XMM5, \T4
+        vpxor           \T4, \T6, \T6
 
+        vpclmulqdq      $0x00, \T5, \XMM5, \T4
+        vpxor           \T4, \T7, \T7
 
-_return_T\@:
-        mov     arg8, %r10              # r10 = authTag
-        mov     arg9, %r11              # r11 = auth_tag_len
+        vmovdqu         HashKey_4_k(arg2), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpxor           \T2, \XMM1, \XMM1
 
-        cmp     $16, %r11
-        je      _T_16\@
+        ######################
 
-        cmp     $8, %r11
-        jl      _T_4\@
+        vpshufd         $0b01001110, \XMM6, \T2
+        vpxor           \XMM6, \T2, \T2
+        vmovdqu         HashKey_3(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \XMM6, \T4
+        vpxor           \T4, \T6, \T6
 
-_T_8\@:
-        vmovq   %xmm9, %rax
-        mov     %rax, (%r10)
-        add     $8, %r10
-        sub     $8, %r11
-        vpsrldq $8, %xmm9, %xmm9
-        cmp     $0, %r11
-        je     _return_T_done\@
-_T_4\@:
-        vmovd   %xmm9, %eax
-        mov     %eax, (%r10)
-        add     $4, %r10
-        sub     $4, %r11
-        vpsrldq     $4, %xmm9, %xmm9
-        cmp     $0, %r11
-        je     _return_T_done\@
-_T_123\@:
-        vmovd     %xmm9, %eax
-        cmp     $2, %r11
-        jl     _T_1\@
-        mov     %ax, (%r10)
-        cmp     $2, %r11
-        je     _return_T_done\@
-        add     $2, %r10
-        sar     $16, %eax
-_T_1\@:
-        mov     %al, (%r10)
-        jmp     _return_T_done\@
+        vpclmulqdq      $0x00, \T5, \XMM6, \T4
+        vpxor           \T4, \T7, \T7
 
-_T_16\@:
-        vmovdqu %xmm9, (%r10)
+        vmovdqu         HashKey_3_k(arg2), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpxor           \T2, \XMM1, \XMM1
 
-_return_T_done\@:
-        mov     %r14, %rsp
+        ######################
 
-        pop     %r15
-        pop     %r14
-        pop     %r13
-        pop     %r12
-.endm
+        vpshufd         $0b01001110, \XMM7, \T2
+        vpxor           \XMM7, \T2, \T2
+        vmovdqu         HashKey_2(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \XMM7, \T4
+        vpxor           \T4, \T6, \T6
 
+        vpclmulqdq      $0x00, \T5, \XMM7, \T4
+        vpxor           \T4, \T7, \T7
 
-#############################################################
-#void   aesni_gcm_precomp_avx_gen2
-#        (gcm_data     *my_ctx_data,
-#        u8     *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
-#############################################################
-ENTRY(aesni_gcm_precomp_avx_gen2)
-        #the number of pushes must equal STACK_OFFSET
-        push    %r12
-        push    %r13
-        push    %r14
-        push    %r15
+        vmovdqu         HashKey_2_k(arg2), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpxor           \T2, \XMM1, \XMM1
 
-        mov     %rsp, %r14
+        ######################
+
+        vpshufd         $0b01001110, \XMM8, \T2
+        vpxor           \XMM8, \T2, \T2
+        vmovdqu         HashKey(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \XMM8, \T4
+        vpxor           \T4, \T6, \T6
 
+        vpclmulqdq      $0x00, \T5, \XMM8, \T4
+        vpxor           \T4, \T7, \T7
 
+        vmovdqu         HashKey_k(arg2), \T3
+        vpclmulqdq      $0x00, \T3, \T2, \T2
 
-        sub     $VARIABLE_OFFSET, %rsp
-        and     $~63, %rsp                  # align rsp to 64 bytes
+        vpxor           \T2, \XMM1, \XMM1
+        vpxor           \T6, \XMM1, \XMM1
+        vpxor           \T7, \XMM1, \T2
 
-        vmovdqu  (arg2), %xmm6              # xmm6 = HashKey
 
-        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
-        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
-        vmovdqa  %xmm6, %xmm2
-        vpsllq   $1, %xmm6, %xmm6
-        vpsrlq   $63, %xmm2, %xmm2
-        vmovdqa  %xmm2, %xmm1
-        vpslldq  $8, %xmm2, %xmm2
-        vpsrldq  $8, %xmm1, %xmm1
-        vpor     %xmm2, %xmm6, %xmm6
-        #reduction
-        vpshufd  $0b00100100, %xmm1, %xmm2
-        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
-        vpand    POLY(%rip), %xmm2, %xmm2
-        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
+
+
+        vpslldq $8, \T2, \T4
+        vpsrldq $8, \T2, \T2
+
+        vpxor   \T4, \T7, \T7
+        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
+				# the accumulated carry-less multiplications
+
         #######################################################################
-        vmovdqa  %xmm6, HashKey(arg1)       # store HashKey<<1 mod poly
+        #first phase of the reduction
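+        # <T6:T7> is a 256-bit carry-less product; reduce it modulo the GHASH
+        # polynomial x^128 + x^7 + x^2 + x + 1 (in bit-reflected form), which
+        # the shift/XOR sequences below do in two phases.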
+        vpslld  $31, \T7, \T2   # packed left shifting << 31
+        vpslld  $30, \T7, \T3   # packed left shifting << 30
+        vpslld  $25, \T7, \T4   # packed left shifting << 25
 
+        vpxor   \T3, \T2, \T2   # xor the shifted versions
+        vpxor   \T4, \T2, \T2
 
-        PRECOMPUTE_AVX  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
+        vpsrldq $4, \T2, \T1    # shift-R T2 by 1 DW into T1
 
-        mov     %r14, %rsp
+        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
+        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
+        #######################################################################
 
-        pop     %r15
-        pop     %r14
-        pop     %r13
-        pop     %r12
-        ret
-ENDPROC(aesni_gcm_precomp_avx_gen2)
 
-###############################################################################
-#void   aesni_gcm_enc_avx_gen2(
-#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
-#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
-#        const   u8 *in, /* Plaintext input */
-#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
+        #second phase of the reduction
+        vpsrld  $1, \T7, \T2    # packed right shifting >> 1
+        vpsrld  $2, \T7, \T3    # packed right shifting >> 2
+        vpsrld  $7, \T7, \T4    # packed right shifting >> 7
+        vpxor   \T3, \T2, \T2   # xor the shifted versions
+        vpxor   \T4, \T2, \T2
+
+        vpxor   \T1, \T2, \T2
+        vpxor   \T2, \T7, \T7
+        vpxor   \T7, \T6, \T6   # the result is in T6
+
+.endm
+
+#############################################################
+#void   aesni_gcm_init_avx_gen2
+#        (gcm_data     *my_ctx_data,
+#         gcm_context_data *data,
+#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
 #        u8      *iv, /* Pre-counter block j0: 4 byte salt
 #			(from Security Association) concatenated with 8 byte
 #			Initialisation Vector (from IPSec ESP Payload)
 #			concatenated with 0x00000001. 16-byte aligned pointer. */
 #        const   u8 *aad, /* Additional Authentication Data (AAD)*/
-#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
-#        u8      *auth_tag, /* Authenticated Tag output. */
-#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
-#				Valid values are 16 (most likely), 12 or 8. */
+#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+#############################################################
+ENTRY(aesni_gcm_init_avx_gen2)
+        FUNC_SAVE
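+        # INIT consumes the IV and AAD and fills in the per-request
+        # gcm_context_data (arg2): the AAD hash, the initial counter block and
+        # the precomputed HashKey powers used by the update/finalize entry
+        # points below.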
+        INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
+        FUNC_RESTORE
+        ret
+ENDPROC(aesni_gcm_init_avx_gen2)
+
 ###############################################################################
-ENTRY(aesni_gcm_enc_avx_gen2)
-        GCM_ENC_DEC_AVX     ENC
-	ret
-ENDPROC(aesni_gcm_enc_avx_gen2)
+#void   aesni_gcm_enc_update_avx_gen2(
+#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
+#        gcm_context_data *data,
+#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
+#        const   u8 *in, /* Plaintext input */
+#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
+###############################################################################
+ENTRY(aesni_gcm_enc_update_avx_gen2)
+        FUNC_SAVE
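+        # Pick the AES round count from the key size: 16/24/32-byte keys run
+        # GCM_ENC_DEC with 9/11/13 rounds respectively.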
+        mov     keysize, %eax
+        cmp     $32, %eax
+        je      key_256_enc_update
+        cmp     $16, %eax
+        je      key_128_enc_update
+        # must be 192
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
+        FUNC_RESTORE
+        ret
+key_128_enc_update:
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
+        FUNC_RESTORE
+        ret
+key_256_enc_update:
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
+        FUNC_RESTORE
+        ret
+ENDPROC(aesni_gcm_enc_update_avx_gen2)
 
 ###############################################################################
-#void   aesni_gcm_dec_avx_gen2(
+#void   aesni_gcm_dec_update_avx_gen2(
 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
+#        gcm_context_data *data,
 #        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
 #        const   u8 *in, /* Ciphertext input */
-#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
-#        u8      *iv, /* Pre-counter block j0: 4 byte salt
-#			(from Security Association) concatenated with 8 byte
-#			Initialisation Vector (from IPSec ESP Payload)
-#			concatenated with 0x00000001. 16-byte aligned pointer. */
-#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
-#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
+###############################################################################
+ENTRY(aesni_gcm_dec_update_avx_gen2)
+        FUNC_SAVE
+        mov     keysize,%eax
+        cmp     $32, %eax
+        je      key_256_dec_update
+        cmp     $16, %eax
+        je      key_128_dec_update
+        # must be 192
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
+        FUNC_RESTORE
+        ret
+key_128_dec_update:
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
+        FUNC_RESTORE
+        ret
+key_256_dec_update:
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
+        FUNC_RESTORE
+        ret
+ENDPROC(aesni_gcm_dec_update_avx_gen2)
+
+###############################################################################
+#void   aesni_gcm_finalize_avx_gen2(
+#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
+#        gcm_context_data *data,
 #        u8      *auth_tag, /* Authenticated Tag output. */
 #        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
 #				Valid values are 16 (most likely), 12 or 8. */
 ###############################################################################
-ENTRY(aesni_gcm_dec_avx_gen2)
-        GCM_ENC_DEC_AVX     DEC
-	ret
-ENDPROC(aesni_gcm_dec_avx_gen2)
+ENTRY(aesni_gcm_finalize_avx_gen2)
+        FUNC_SAVE
+        mov	keysize,%eax
+        cmp     $32, %eax
+        je      key_256_finalize
+        cmp     $16, %eax
+        je      key_128_finalize
+        # must be 192
+        GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
+        FUNC_RESTORE
+        ret
+key_128_finalize:
+        GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
+        FUNC_RESTORE
+        ret
+key_256_finalize:
+        GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
+        FUNC_RESTORE
+        ret
+ENDPROC(aesni_gcm_finalize_avx_gen2)
+
 #endif /* CONFIG_AS_AVX */
 
 #ifdef CONFIG_AS_AVX2
@@ -1670,113 +1922,42 @@ ENDPROC(aesni_gcm_dec_avx_gen2)
         # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
         vmovdqa  \HK, \T5
         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
-        vmovdqa  \T5, HashKey_2(arg1)                       #  [HashKey_2] = HashKey^2<<1 mod poly
+        vmovdqu  \T5, HashKey_2(arg2)                       #  [HashKey_2] = HashKey^2<<1 mod poly
 
         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
-        vmovdqa  \T5, HashKey_3(arg1)
+        vmovdqu  \T5, HashKey_3(arg2)
 
         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
-        vmovdqa  \T5, HashKey_4(arg1)
+        vmovdqu  \T5, HashKey_4(arg2)
 
         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
-        vmovdqa  \T5, HashKey_5(arg1)
+        vmovdqu  \T5, HashKey_5(arg2)
 
         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
-        vmovdqa  \T5, HashKey_6(arg1)
+        vmovdqu  \T5, HashKey_6(arg2)
 
         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
-        vmovdqa  \T5, HashKey_7(arg1)
+        vmovdqu  \T5, HashKey_7(arg2)
 
         GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
-        vmovdqa  \T5, HashKey_8(arg1)
+        vmovdqu  \T5, HashKey_8(arg2)
 
 .endm
 
-
 ## if a = number of total plaintext bytes
 ## b = floor(a/16)
 ## num_initial_blocks = b mod 4
 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
 ## r10, r11, r12, rax are clobbered
-## arg1, arg2, arg3, r14 are used as a pointer only, not modified
+## arg1, arg3, arg4, r14 are used as pointers only, not modified
 
-.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
+.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
 	i = (8-\num_initial_blocks)
-	j = 0
 	setreg
-
-	mov     arg6, %r10                       # r10 = AAD
-	mov     arg7, %r12                       # r12 = aadLen
-
-
-	mov     %r12, %r11
-
-	vpxor   reg_j, reg_j, reg_j
-	vpxor   reg_i, reg_i, reg_i
-
-	cmp     $16, %r11
-	jl      _get_AAD_rest8\@
-_get_AAD_blocks\@:
-	vmovdqu (%r10), reg_i
-	vpshufb SHUF_MASK(%rip), reg_i, reg_i
-	vpxor   reg_i, reg_j, reg_j
-	GHASH_MUL_AVX2      reg_j, \T2, \T1, \T3, \T4, \T5, \T6
-	add     $16, %r10
-	sub     $16, %r12
-	sub     $16, %r11
-	cmp     $16, %r11
-	jge     _get_AAD_blocks\@
-	vmovdqu reg_j, reg_i
-	cmp     $0, %r11
-	je      _get_AAD_done\@
-
-	vpxor   reg_i, reg_i, reg_i
-
-	/* read the last <16B of AAD. since we have at least 4B of
-	data right after the AAD (the ICV, and maybe some CT), we can
-	read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\@:
-	cmp     $4, %r11
-	jle     _get_AAD_rest4\@
-	movq    (%r10), \T1
-	add     $8, %r10
-	sub     $8, %r11
-	vpslldq $8, \T1, \T1
-	vpsrldq $8, reg_i, reg_i
-	vpxor   \T1, reg_i, reg_i
-	jmp     _get_AAD_rest8\@
-_get_AAD_rest4\@:
-	cmp     $0, %r11
-	jle     _get_AAD_rest0\@
-	mov     (%r10), %eax
-	movq    %rax, \T1
-	add     $4, %r10
-	sub     $4, %r11
-	vpslldq $12, \T1, \T1
-	vpsrldq $4, reg_i, reg_i
-	vpxor   \T1, reg_i, reg_i
-_get_AAD_rest0\@:
-	/* finalize: shift out the extra bytes we read, and align
-	left. since pslldq can only shift by an immediate, we use
-	vpshufb and an array of shuffle masks */
-	movq    %r12, %r11
-	salq    $4, %r11
-	movdqu  aad_shift_arr(%r11), \T1
-	vpshufb \T1, reg_i, reg_i
-_get_AAD_rest_final\@:
-	vpshufb SHUF_MASK(%rip), reg_i, reg_i
-	vpxor   reg_j, reg_i, reg_i
-	GHASH_MUL_AVX2      reg_i, \T2, \T1, \T3, \T4, \T5, \T6
-
-_get_AAD_done\@:
-	# initialize the data pointer offset as zero
-	xor     %r11d, %r11d
+	vmovdqu AadHash(arg2), reg_i
 
 	# start AES for num_initial_blocks blocks
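+	# Both the running GHASH of the AAD (AadHash, loaded above) and the
+	# counter block (CurCount, loaded below) are carried in the per-request
+	# gcm_context_data pointed to by arg2.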
-	mov     arg5, %rax                     # rax = *Y0
-	vmovdqu (%rax), \CTR                   # CTR = Y0
-	vpshufb SHUF_MASK(%rip), \CTR, \CTR
-
+	vmovdqu CurCount(arg2), \CTR
 
 	i = (9-\num_initial_blocks)
 	setreg
@@ -1799,7 +1980,7 @@ _get_AAD_done\@:
 
 	j = 1
 	setreg
-.rep 9
+.rep \REP
 	vmovdqa  16*j(arg1), \T_key
 	i = (9-\num_initial_blocks)
 	setreg
@@ -1814,7 +1995,7 @@ _get_AAD_done\@:
 .endr
 
 
-	vmovdqa  16*10(arg1), \T_key
+	vmovdqa  16*j(arg1), \T_key
 	i = (9-\num_initial_blocks)
 	setreg
 .rep \num_initial_blocks
@@ -1826,9 +2007,9 @@ _get_AAD_done\@:
 	i = (9-\num_initial_blocks)
 	setreg
 .rep \num_initial_blocks
-                vmovdqu (arg3, %r11), \T1
+                vmovdqu (arg4, %r11), \T1
                 vpxor   \T1, reg_i, reg_i
-                vmovdqu reg_i, (arg2 , %r11)           # write back ciphertext for
+                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for
 						       # num_initial_blocks blocks
                 add     $16, %r11
 .if  \ENC_DEC == DEC
@@ -1905,7 +2086,7 @@ _get_AAD_done\@:
 
 		i = 1
 		setreg
-.rep    9       # do 9 rounds
+.rep    \REP       # do REP rounds
                 vmovdqa  16*i(arg1), \T_key
                 vaesenc  \T_key, \XMM1, \XMM1
                 vaesenc  \T_key, \XMM2, \XMM2
@@ -1930,58 +2111,58 @@ _get_AAD_done\@:
                 vaesenclast  \T_key, \XMM7, \XMM7
                 vaesenclast  \T_key, \XMM8, \XMM8
 
-                vmovdqu  (arg3, %r11), \T1
+                vmovdqu  (arg4, %r11), \T1
                 vpxor    \T1, \XMM1, \XMM1
-                vmovdqu  \XMM1, (arg2 , %r11)
+                vmovdqu  \XMM1, (arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM1
                 .endif
 
-                vmovdqu  16*1(arg3, %r11), \T1
+                vmovdqu  16*1(arg4, %r11), \T1
                 vpxor    \T1, \XMM2, \XMM2
-                vmovdqu  \XMM2, 16*1(arg2 , %r11)
+                vmovdqu  \XMM2, 16*1(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM2
                 .endif
 
-                vmovdqu  16*2(arg3, %r11), \T1
+                vmovdqu  16*2(arg4, %r11), \T1
                 vpxor    \T1, \XMM3, \XMM3
-                vmovdqu  \XMM3, 16*2(arg2 , %r11)
+                vmovdqu  \XMM3, 16*2(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM3
                 .endif
 
-                vmovdqu  16*3(arg3, %r11), \T1
+                vmovdqu  16*3(arg4, %r11), \T1
                 vpxor    \T1, \XMM4, \XMM4
-                vmovdqu  \XMM4, 16*3(arg2 , %r11)
+                vmovdqu  \XMM4, 16*3(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM4
                 .endif
 
-                vmovdqu  16*4(arg3, %r11), \T1
+                vmovdqu  16*4(arg4, %r11), \T1
                 vpxor    \T1, \XMM5, \XMM5
-                vmovdqu  \XMM5, 16*4(arg2 , %r11)
+                vmovdqu  \XMM5, 16*4(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM5
                 .endif
 
-                vmovdqu  16*5(arg3, %r11), \T1
+                vmovdqu  16*5(arg4, %r11), \T1
                 vpxor    \T1, \XMM6, \XMM6
-                vmovdqu  \XMM6, 16*5(arg2 , %r11)
+                vmovdqu  \XMM6, 16*5(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM6
                 .endif
 
-                vmovdqu  16*6(arg3, %r11), \T1
+                vmovdqu  16*6(arg4, %r11), \T1
                 vpxor    \T1, \XMM7, \XMM7
-                vmovdqu  \XMM7, 16*6(arg2 , %r11)
+                vmovdqu  \XMM7, 16*6(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM7
                 .endif
 
-                vmovdqu  16*7(arg3, %r11), \T1
+                vmovdqu  16*7(arg4, %r11), \T1
                 vpxor    \T1, \XMM8, \XMM8
-                vmovdqu  \XMM8, 16*7(arg2 , %r11)
+                vmovdqu  \XMM8, 16*7(arg3 , %r11)
                 .if   \ENC_DEC == DEC
                 vmovdqa  \T1, \XMM8
                 .endif
@@ -2010,9 +2191,9 @@ _initial_blocks_done\@:
 
 # encrypt 8 blocks at a time
 # ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg2, arg3 are used as pointers only, not modified
+# arg1, arg3, arg4 are used as pointers only, not modified
 # r11 is the data offset value
-.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
 
         vmovdqa \XMM1, \T2
         vmovdqa \XMM2, TMP2(%rsp)
@@ -2073,87 +2254,7 @@ _initial_blocks_done\@:
 
 
 
-                vmovdqu 16*1(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-                vmovdqu 16*2(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-
-        #######################################################################
-
-        vmovdqa         HashKey_8(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
-        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
-        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
-        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
-        vpxor           \T5, \T6, \T6
-
-                vmovdqu 16*3(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-        vmovdqa         TMP2(%rsp), \T1
-        vmovdqa         HashKey_7(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpclmulqdq      $0x01, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-        vpclmulqdq      $0x10, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-                vmovdqu 16*4(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-        #######################################################################
-
-        vmovdqa         TMP3(%rsp), \T1
-        vmovdqa         HashKey_6(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpclmulqdq      $0x01, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-        vpclmulqdq      $0x10, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-                vmovdqu 16*5(arg1), \T1
+                vmovdqu 16*1(arg1), \T1
                 vaesenc \T1, \XMM1, \XMM1
                 vaesenc \T1, \XMM2, \XMM2
                 vaesenc \T1, \XMM3, \XMM3
@@ -2163,21 +2264,7 @@ _initial_blocks_done\@:
                 vaesenc \T1, \XMM7, \XMM7
                 vaesenc \T1, \XMM8, \XMM8
 
-        vmovdqa         TMP4(%rsp), \T1
-        vmovdqa         HashKey_5(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpclmulqdq      $0x01, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-        vpclmulqdq      $0x10, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-                vmovdqu 16*6(arg1), \T1
+                vmovdqu 16*2(arg1), \T1
                 vaesenc \T1, \XMM1, \XMM1
                 vaesenc \T1, \XMM2, \XMM2
                 vaesenc \T1, \XMM3, \XMM3
@@ -2188,21 +2275,16 @@ _initial_blocks_done\@:
                 vaesenc \T1, \XMM8, \XMM8
 
 
-        vmovdqa         TMP5(%rsp), \T1
-        vmovdqa         HashKey_4(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpclmulqdq      $0x01, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
+        #######################################################################
 
-        vpclmulqdq      $0x10, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
+        vmovdqu         HashKey_8(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
+        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
+        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
+        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
+        vpxor           \T5, \T6, \T6
 
-                vmovdqu 16*7(arg1), \T1
+                vmovdqu 16*3(arg1), \T1
                 vaesenc \T1, \XMM1, \XMM1
                 vaesenc \T1, \XMM2, \XMM2
                 vaesenc \T1, \XMM3, \XMM3
@@ -2212,8 +2294,8 @@ _initial_blocks_done\@:
                 vaesenc \T1, \XMM7, \XMM7
                 vaesenc \T1, \XMM8, \XMM8
 
-        vmovdqa         TMP6(%rsp), \T1
-        vmovdqa         HashKey_3(arg1), \T5
+        vmovdqa         TMP2(%rsp), \T1
+        vmovdqu         HashKey_7(arg2), \T5
         vpclmulqdq      $0x11, \T5, \T1, \T3
         vpxor           \T3, \T4, \T4
 
@@ -2226,7 +2308,7 @@ _initial_blocks_done\@:
         vpclmulqdq      $0x10, \T5, \T1, \T3
         vpxor           \T3, \T6, \T6
 
-                vmovdqu 16*8(arg1), \T1
+                vmovdqu 16*4(arg1), \T1
                 vaesenc \T1, \XMM1, \XMM1
                 vaesenc \T1, \XMM2, \XMM2
                 vaesenc \T1, \XMM3, \XMM3
@@ -2236,35 +2318,12 @@ _initial_blocks_done\@:
                 vaesenc \T1, \XMM7, \XMM7
                 vaesenc \T1, \XMM8, \XMM8
 
-        vmovdqa         TMP7(%rsp), \T1
-        vmovdqa         HashKey_2(arg1), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpclmulqdq      $0x01, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-        vpclmulqdq      $0x10, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-
         #######################################################################
 
-                vmovdqu 16*9(arg1), \T5
-                vaesenc \T5, \XMM1, \XMM1
-                vaesenc \T5, \XMM2, \XMM2
-                vaesenc \T5, \XMM3, \XMM3
-                vaesenc \T5, \XMM4, \XMM4
-                vaesenc \T5, \XMM5, \XMM5
-                vaesenc \T5, \XMM6, \XMM6
-                vaesenc \T5, \XMM7, \XMM7
-                vaesenc \T5, \XMM8, \XMM8
-
-        vmovdqa         TMP8(%rsp), \T1
-        vmovdqa         HashKey(arg1), \T5
+        vmovdqa         TMP3(%rsp), \T1
+        vmovdqu         HashKey_6(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
 
         vpclmulqdq      $0x00, \T5, \T1, \T3
         vpxor           \T3, \T7, \T7
@@ -2275,672 +2334,510 @@ _initial_blocks_done\@:
         vpclmulqdq      $0x10, \T5, \T1, \T3
         vpxor           \T3, \T6, \T6
 
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T1
-
-
-                vmovdqu 16*10(arg1), \T5
-
-	i = 0
-	j = 1
-	setreg
-.rep 8
-		vpxor	16*i(arg3, %r11), \T5, \T2
-                .if \ENC_DEC == ENC
-                vaesenclast     \T2, reg_j, reg_j
-                .else
-                vaesenclast     \T2, reg_j, \T3
-                vmovdqu 16*i(arg3, %r11), reg_j
-                vmovdqu \T3, 16*i(arg2, %r11)
-                .endif
-	i = (i+1)
-	j = (j+1)
-	setreg
-.endr
-	#######################################################################
-
-
-	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
-	vpsrldq	$8, \T6, \T6				# shift-R T2 2 DWs
-	vpxor	\T3, \T7, \T7
-	vpxor	\T6, \T1, \T1				# accumulate the results in T1:T7
-
-
-
-	#######################################################################
-	#first phase of the reduction
-	vmovdqa         POLY2(%rip), \T3
-
-	vpclmulqdq	$0x01, \T7, \T3, \T2
-	vpslldq		$8, \T2, \T2			# shift-L xmm2 2 DWs
-
-	vpxor		\T2, \T7, \T7			# first phase of the reduction complete
-	#######################################################################
-                .if \ENC_DEC == ENC
-		vmovdqu	 \XMM1,	16*0(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM2,	16*1(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM3,	16*2(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM4,	16*3(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM5,	16*4(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM6,	16*5(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM7,	16*6(arg2,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM8,	16*7(arg2,%r11)		# Write to the Ciphertext buffer
-                .endif
-
-	#######################################################################
-	#second phase of the reduction
-	vpclmulqdq	$0x00, \T7, \T3, \T2
-	vpsrldq		$4, \T2, \T2			# shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
-
-	vpclmulqdq	$0x10, \T7, \T3, \T4
-	vpslldq		$4, \T4, \T4			# shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
-
-	vpxor		\T2, \T4, \T4			# second phase of the reduction complete
-	#######################################################################
-	vpxor		\T4, \T1, \T1			# the result is in T1
-
-		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
-
-
-	vpxor	\T1, \XMM1, \XMM1
-
-
-
-.endm
-
-
-# GHASH the last 4 ciphertext blocks.
-.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
-
-        ## Karatsuba Method
-
-        vmovdqa         HashKey_8(arg1), \T5
-
-        vpshufd         $0b01001110, \XMM1, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM1, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM1, \T6
-        vpclmulqdq      $0x00, \T5, \XMM1, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \XMM1
-
-        ######################
-
-        vmovdqa         HashKey_7(arg1), \T5
-        vpshufd         $0b01001110, \XMM2, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM2, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM2, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM2, \T4
-        vpxor           \T4, \T7, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vmovdqa         HashKey_6(arg1), \T5
-        vpshufd         $0b01001110, \XMM3, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM3, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM3, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM3, \T4
-        vpxor           \T4, \T7, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vmovdqa         HashKey_5(arg1), \T5
-        vpshufd         $0b01001110, \XMM4, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM4, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM4, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM4, \T4
-        vpxor           \T4, \T7, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vmovdqa         HashKey_4(arg1), \T5
-        vpshufd         $0b01001110, \XMM5, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM5, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM5, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM5, \T4
-        vpxor           \T4, \T7, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vmovdqa         HashKey_3(arg1), \T5
-        vpshufd         $0b01001110, \XMM6, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM6, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM6, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM6, \T4
-        vpxor           \T4, \T7, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vmovdqa         HashKey_2(arg1), \T5
-        vpshufd         $0b01001110, \XMM7, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM7, \T2, \T2
-        vpxor           \T5, \T3, \T3
+                vmovdqu 16*5(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
-        vpclmulqdq      $0x11, \T5, \XMM7, \T4
-        vpxor           \T4, \T6, \T6
+        vmovdqa         TMP4(%rsp), \T1
+        vmovdqu         HashKey_5(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
 
-        vpclmulqdq      $0x00, \T5, \XMM7, \T4
-        vpxor           \T4, \T7, \T7
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
 
-        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpclmulqdq      $0x01, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
 
-        vpxor           \T2, \XMM1, \XMM1
+        vpclmulqdq      $0x10, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
 
-        ######################
+                vmovdqu 16*6(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
-        vmovdqa         HashKey(arg1), \T5
-        vpshufd         $0b01001110, \XMM8, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM8, \T2, \T2
-        vpxor           \T5, \T3, \T3
 
-        vpclmulqdq      $0x11, \T5, \XMM8, \T4
-        vpxor           \T4, \T6, \T6
+        vmovdqa         TMP5(%rsp), \T1
+        vmovdqu         HashKey_4(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
 
-        vpclmulqdq      $0x00, \T5, \XMM8, \T4
-        vpxor           \T4, \T7, \T7
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
 
-        vpclmulqdq      $0x00, \T3, \T2, \T2
+        vpclmulqdq      $0x01, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
 
-        vpxor           \T2, \XMM1, \XMM1
-        vpxor           \T6, \XMM1, \XMM1
-        vpxor           \T7, \XMM1, \T2
+        vpclmulqdq      $0x10, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
 
+                vmovdqu 16*7(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
+        vmovdqa         TMP6(%rsp), \T1
+        vmovdqu         HashKey_3(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
 
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
 
-        vpslldq $8, \T2, \T4
-        vpsrldq $8, \T2, \T2
+        vpclmulqdq      $0x01, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
 
-        vpxor   \T4, \T7, \T7
-        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
-						   # accumulated carry-less multiplications
+        vpclmulqdq      $0x10, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
 
-        #######################################################################
-        #first phase of the reduction
-        vmovdqa         POLY2(%rip), \T3
+                vmovdqu 16*8(arg1), \T1
+                vaesenc \T1, \XMM1, \XMM1
+                vaesenc \T1, \XMM2, \XMM2
+                vaesenc \T1, \XMM3, \XMM3
+                vaesenc \T1, \XMM4, \XMM4
+                vaesenc \T1, \XMM5, \XMM5
+                vaesenc \T1, \XMM6, \XMM6
+                vaesenc \T1, \XMM7, \XMM7
+                vaesenc \T1, \XMM8, \XMM8
 
-        vpclmulqdq      $0x01, \T7, \T3, \T2
-        vpslldq         $8, \T2, \T2               # shift-L xmm2 2 DWs
+        vmovdqa         TMP7(%rsp), \T1
+        vmovdqu         HashKey_2(arg2), \T5
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T4
 
-        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
-        #######################################################################
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
 
+        vpclmulqdq      $0x01, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
 
-        #second phase of the reduction
-        vpclmulqdq      $0x00, \T7, \T3, \T2
-        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+        vpclmulqdq      $0x10, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
 
-        vpclmulqdq      $0x10, \T7, \T3, \T4
-        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
 
-        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
         #######################################################################
-        vpxor           \T4, \T6, \T6              # the result is in T6
-.endm
-
-
 
-# combined for GCM encrypt and decrypt functions
-# clobbering all xmm registers
-# clobbering r10, r11, r12, r13, r14, r15
-.macro  GCM_ENC_DEC_AVX2     ENC_DEC
+                vmovdqu 16*9(arg1), \T5
+                vaesenc \T5, \XMM1, \XMM1
+                vaesenc \T5, \XMM2, \XMM2
+                vaesenc \T5, \XMM3, \XMM3
+                vaesenc \T5, \XMM4, \XMM4
+                vaesenc \T5, \XMM5, \XMM5
+                vaesenc \T5, \XMM6, \XMM6
+                vaesenc \T5, \XMM7, \XMM7
+                vaesenc \T5, \XMM8, \XMM8
 
-        #the number of pushes must equal STACK_OFFSET
-        push    %r12
-        push    %r13
-        push    %r14
-        push    %r15
+        vmovdqa         TMP8(%rsp), \T1
+        vmovdqu         HashKey(arg2), \T5
 
-        mov     %rsp, %r14
+        vpclmulqdq      $0x00, \T5, \T1, \T3
+        vpxor           \T3, \T7, \T7
 
+        vpclmulqdq      $0x01, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
 
+        vpclmulqdq      $0x10, \T5, \T1, \T3
+        vpxor           \T3, \T6, \T6
 
+        vpclmulqdq      $0x11, \T5, \T1, \T3
+        vpxor           \T3, \T4, \T1
 
-        sub     $VARIABLE_OFFSET, %rsp
-        and     $~63, %rsp                         # align rsp to 64 bytes
 
+                vmovdqu 16*10(arg1), \T5
 
-        vmovdqu  HashKey(arg1), %xmm13             # xmm13 = HashKey
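+        # Remaining AES rounds for 192/256-bit keys: REP is 9/11/13, so the
+        # loop below runs 0, 2 or 4 extra times and leaves the last round key
+        # in T5 for the vaesenclast step.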
+        i = 11
+        setreg
+.rep (\REP-9)
+        vaesenc \T5, \XMM1, \XMM1
+        vaesenc \T5, \XMM2, \XMM2
+        vaesenc \T5, \XMM3, \XMM3
+        vaesenc \T5, \XMM4, \XMM4
+        vaesenc \T5, \XMM5, \XMM5
+        vaesenc \T5, \XMM6, \XMM6
+        vaesenc \T5, \XMM7, \XMM7
+        vaesenc \T5, \XMM8, \XMM8
+
+        vmovdqu 16*i(arg1), \T5
+        i = i + 1
+        setreg
+.endr
 
-        mov     arg4, %r13                         # save the number of bytes of plaintext/ciphertext
-        and     $-16, %r13                         # r13 = r13 - (r13 mod 16)
+	i = 0
+	j = 1
+	setreg
+.rep 8
+		vpxor	16*i(arg4, %r11), \T5, \T2
+                .if \ENC_DEC == ENC
+                vaesenclast     \T2, reg_j, reg_j
+                .else
+                vaesenclast     \T2, reg_j, \T3
+                vmovdqu 16*i(arg4, %r11), reg_j
+                vmovdqu \T3, 16*i(arg3, %r11)
+                .endif
+	i = (i+1)
+	j = (j+1)
+	setreg
+.endr
+	#######################################################################
 
-        mov     %r13, %r12
-        shr     $4, %r12
-        and     $7, %r12
-        jz      _initial_num_blocks_is_0\@
 
-        cmp     $7, %r12
-        je      _initial_num_blocks_is_7\@
-        cmp     $6, %r12
-        je      _initial_num_blocks_is_6\@
-        cmp     $5, %r12
-        je      _initial_num_blocks_is_5\@
-        cmp     $4, %r12
-        je      _initial_num_blocks_is_4\@
-        cmp     $3, %r12
-        je      _initial_num_blocks_is_3\@
-        cmp     $2, %r12
-        je      _initial_num_blocks_is_2\@
+	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
+	vpsrldq	$8, \T6, \T6				# shift-R T2 2 DWs
+	vpxor	\T3, \T7, \T7
+	vpxor	\T6, \T1, \T1				# accumulate the results in T1:T7
 
-        jmp     _initial_num_blocks_is_1\@
 
-_initial_num_blocks_is_7\@:
-        INITIAL_BLOCKS_AVX2  7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*7, %r13
-        jmp     _initial_blocks_encrypted\@
 
-_initial_num_blocks_is_6\@:
-        INITIAL_BLOCKS_AVX2  6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*6, %r13
-        jmp     _initial_blocks_encrypted\@
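+	# Reduce the 256-bit GHASH product accumulated in T1:T7 modulo the GCM
+	# polynomial; this path does it with two carry-less multiplies by the
+	# precomputed POLY2 constant rather than the shift/XOR sequence used in
+	# GHASH_LAST_8_AVX above.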
+	#######################################################################
+	#first phase of the reduction
+	vmovdqa         POLY2(%rip), \T3
 
-_initial_num_blocks_is_5\@:
-        INITIAL_BLOCKS_AVX2  5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*5, %r13
-        jmp     _initial_blocks_encrypted\@
+	vpclmulqdq	$0x01, \T7, \T3, \T2
+	vpslldq		$8, \T2, \T2			# shift-L xmm2 2 DWs
 
-_initial_num_blocks_is_4\@:
-        INITIAL_BLOCKS_AVX2  4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*4, %r13
-        jmp     _initial_blocks_encrypted\@
+	vpxor		\T2, \T7, \T7			# first phase of the reduction complete
+	#######################################################################
+                .if \ENC_DEC == ENC
+		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
+		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
+                .endif
 
-_initial_num_blocks_is_3\@:
-        INITIAL_BLOCKS_AVX2  3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*3, %r13
-        jmp     _initial_blocks_encrypted\@
+	#######################################################################
+	#second phase of the reduction
+	vpclmulqdq	$0x00, \T7, \T3, \T2
+	vpsrldq		$4, \T2, \T2			# shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
 
-_initial_num_blocks_is_2\@:
-        INITIAL_BLOCKS_AVX2  2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*2, %r13
-        jmp     _initial_blocks_encrypted\@
+	vpclmulqdq	$0x10, \T7, \T3, \T4
+	vpslldq		$4, \T4, \T4			# shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
 
-_initial_num_blocks_is_1\@:
-        INITIAL_BLOCKS_AVX2  1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*1, %r13
-        jmp     _initial_blocks_encrypted\@
+	vpxor		\T2, \T4, \T4			# second phase of the reduction complete
+	#######################################################################
+	vpxor		\T4, \T1, \T1			# the result is in T1
 
-_initial_num_blocks_is_0\@:
-        INITIAL_BLOCKS_AVX2  0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
+		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
 
 
-_initial_blocks_encrypted\@:
-        cmp     $0, %r13
-        je      _zero_cipher_left\@
+	vpxor	\T1, \XMM1, \XMM1
 
-        sub     $128, %r13
-        je      _eight_cipher_left\@
 
 
+.endm
 
 
-        vmovd   %xmm9, %r15d
-        and     $255, %r15d
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+# GHASH the last 8 ciphertext blocks.
+.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
 
+        ## Karatsuba Method
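+        # Same Karatsuba accumulation as GHASH_LAST_8_AVX, except the XORed
+        # halves of each HashKey power are computed on the fly with
+        # vpshufd/vpxor instead of being read from the HashKey_i_k table.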
 
-_encrypt_by_8_new\@:
-        cmp     $(255-8), %r15d
-        jg      _encrypt_by_8\@
+        vmovdqu         HashKey_8(arg2), \T5
 
+        vpshufd         $0b01001110, \XMM1, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM1, \T2, \T2
+        vpxor           \T5, \T3, \T3
 
+        vpclmulqdq      $0x11, \T5, \XMM1, \T6
+        vpclmulqdq      $0x00, \T5, \XMM1, \T7
 
-        add     $8, %r15b
-        GHASH_8_ENCRYPT_8_PARALLEL_AVX2      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
-        add     $128, %r11
-        sub     $128, %r13
-        jne     _encrypt_by_8_new\@
+        vpclmulqdq      $0x00, \T3, \T2, \XMM1
 
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        jmp     _eight_cipher_left\@
+        ######################
 
-_encrypt_by_8\@:
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        add     $8, %r15b
-        GHASH_8_ENCRYPT_8_PARALLEL_AVX2      %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        add     $128, %r11
-        sub     $128, %r13
-        jne     _encrypt_by_8_new\@
+        vmovdqu         HashKey_7(arg2), \T5
+        vpshufd         $0b01001110, \XMM2, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM2, \T2, \T2
+        vpxor           \T5, \T3, \T3
 
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+        vpclmulqdq      $0x11, \T5, \XMM2, \T4
+        vpxor           \T4, \T6, \T6
 
+        vpclmulqdq      $0x00, \T5, \XMM2, \T4
+        vpxor           \T4, \T7, \T7
 
+        vpclmulqdq      $0x00, \T3, \T2, \T2
 
+        vpxor           \T2, \XMM1, \XMM1
 
-_eight_cipher_left\@:
-        GHASH_LAST_8_AVX2    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
+        ######################
 
+        vmovdqu         HashKey_6(arg2), \T5
+        vpshufd         $0b01001110, \XMM3, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM3, \T2, \T2
+        vpxor           \T5, \T3, \T3
 
-_zero_cipher_left\@:
-        cmp     $16, arg4
-        jl      _only_less_than_16\@
+        vpclmulqdq      $0x11, \T5, \XMM3, \T4
+        vpxor           \T4, \T6, \T6
 
-        mov     arg4, %r13
-        and     $15, %r13                            # r13 = (arg4 mod 16)
+        vpclmulqdq      $0x00, \T5, \XMM3, \T4
+        vpxor           \T4, \T7, \T7
 
-        je      _multiple_of_16_bytes\@
+        vpclmulqdq      $0x00, \T3, \T2, \T2
 
-        # handle the last <16 Byte block seperately
+        vpxor           \T2, \XMM1, \XMM1
 
+        ######################
 
-        vpaddd   ONE(%rip), %xmm9, %xmm9             # INCR CNT to get Yn
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
+        vmovdqu         HashKey_5(arg2), \T5
+        vpshufd         $0b01001110, \XMM4, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM4, \T2, \T2
+        vpxor           \T5, \T3, \T3
 
-        sub     $16, %r11
-        add     %r13, %r11
-        vmovdqu (arg3, %r11), %xmm1                  # receive the last <16 Byte block
+        vpclmulqdq      $0x11, \T5, \XMM4, \T4
+        vpxor           \T4, \T6, \T6
 
-        lea     SHIFT_MASK+16(%rip), %r12
-        sub     %r13, %r12                           # adjust the shuffle mask pointer
-						     # to be able to shift 16-r13 bytes
-						     # (r13 is the number of bytes in plaintext mod 16)
-        vmovdqu (%r12), %xmm2                        # get the appropriate shuffle mask
-        vpshufb %xmm2, %xmm1, %xmm1                  # shift right 16-r13 bytes
-        jmp     _final_ghash_mul\@
-
-_only_less_than_16\@:
-        # check for 0 length
-        mov     arg4, %r13
-        and     $15, %r13                            # r13 = (arg4 mod 16)
+        vpclmulqdq      $0x00, \T5, \XMM4, \T4
+        vpxor           \T4, \T7, \T7
 
-        je      _multiple_of_16_bytes\@
+        vpclmulqdq      $0x00, \T3, \T2, \T2
 
-        # handle the last <16 Byte block seperately
+        vpxor           \T2, \XMM1, \XMM1
 
+        ######################
 
-        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Yn)
+        vmovdqu         HashKey_4(arg2), \T5
+        vpshufd         $0b01001110, \XMM5, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM5, \T2, \T2
+        vpxor           \T5, \T3, \T3
 
+        vpclmulqdq      $0x11, \T5, \XMM5, \T4
+        vpxor           \T4, \T6, \T6
 
-        lea     SHIFT_MASK+16(%rip), %r12
-        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
-						     # able to shift 16-r13 bytes (r13 is the
-						     # number of bytes in plaintext mod 16)
+        vpclmulqdq      $0x00, \T5, \XMM5, \T4
+        vpxor           \T4, \T7, \T7
 
-_get_last_16_byte_loop\@:
-        movb    (arg3, %r11),  %al
-        movb    %al,  TMP1 (%rsp , %r11)
-        add     $1, %r11
-        cmp     %r13,  %r11
-        jne     _get_last_16_byte_loop\@
+        vpclmulqdq      $0x00, \T3, \T2, \T2
 
-        vmovdqu  TMP1(%rsp), %xmm1
+        vpxor           \T2, \XMM1, \XMM1
 
-        sub     $16, %r11
+        ######################
 
-_final_ghash_mul\@:
-        .if  \ENC_DEC ==  DEC
-        vmovdqa %xmm1, %xmm2
-        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
-        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to mask out top 16-r13 bytes of xmm9
-        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
-        vpand   %xmm1, %xmm2, %xmm2
-        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
-        vpxor   %xmm2, %xmm14, %xmm14
-	#GHASH computation for the last <16 Byte block
-        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-        sub     %r13, %r11
-        add     $16, %r11
-        .else
-        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
-        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to mask out top 16-r13 bytes of xmm9
-        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        vpxor   %xmm9, %xmm14, %xmm14
-	#GHASH computation for the last <16 Byte block
-        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-        sub     %r13, %r11
-        add     $16, %r11
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
-        .endif
+        vmovdqu         HashKey_3(arg2), \T5
+        vpshufd         $0b01001110, \XMM6, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM6, \T2, \T2
+        vpxor           \T5, \T3, \T3
 
+        vpclmulqdq      $0x11, \T5, \XMM6, \T4
+        vpxor           \T4, \T6, \T6
 
-        #############################
-        # output r13 Bytes
-        vmovq   %xmm9, %rax
-        cmp     $8, %r13
-        jle     _less_than_8_bytes_left\@
+        vpclmulqdq      $0x00, \T5, \XMM6, \T4
+        vpxor           \T4, \T7, \T7
 
-        mov     %rax, (arg2 , %r11)
-        add     $8, %r11
-        vpsrldq $8, %xmm9, %xmm9
-        vmovq   %xmm9, %rax
-        sub     $8, %r13
+        vpclmulqdq      $0x00, \T3, \T2, \T2
 
-_less_than_8_bytes_left\@:
-        movb    %al, (arg2 , %r11)
-        add     $1, %r11
-        shr     $8, %rax
-        sub     $1, %r13
-        jne     _less_than_8_bytes_left\@
-        #############################
+        vpxor           \T2, \XMM1, \XMM1
 
-_multiple_of_16_bytes\@:
-        mov     arg7, %r12                           # r12 = aadLen (number of bytes)
-        shl     $3, %r12                             # convert into number of bits
-        vmovd   %r12d, %xmm15                        # len(A) in xmm15
+        ######################
 
-        shl     $3, arg4                             # len(C) in bits  (*128)
-        vmovq   arg4, %xmm1
-        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
-        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
+        vmovdqu         HashKey_2(arg2), \T5
+        vpshufd         $0b01001110, \XMM7, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM7, \T2, \T2
+        vpxor           \T5, \T3, \T3
 
-        vpxor   %xmm15, %xmm14, %xmm14
-        GHASH_MUL_AVX2       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
-        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14              # perform a 16Byte swap
+        vpclmulqdq      $0x11, \T5, \XMM7, \T4
+        vpxor           \T4, \T6, \T6
 
-        mov     arg5, %rax                           # rax = *Y0
-        vmovdqu (%rax), %xmm9                        # xmm9 = Y0
+        vpclmulqdq      $0x00, \T5, \XMM7, \T4
+        vpxor           \T4, \T7, \T7
 
-        ENCRYPT_SINGLE_BLOCK    %xmm9                # E(K, Y0)
+        vpclmulqdq      $0x00, \T3, \T2, \T2
 
-        vpxor   %xmm14, %xmm9, %xmm9
+        vpxor           \T2, \XMM1, \XMM1
 
+        ######################
 
+        vmovdqu         HashKey(arg2), \T5
+        vpshufd         $0b01001110, \XMM8, \T2
+        vpshufd         $0b01001110, \T5, \T3
+        vpxor           \XMM8, \T2, \T2
+        vpxor           \T5, \T3, \T3
 
-_return_T\@:
-        mov     arg8, %r10              # r10 = authTag
-        mov     arg9, %r11              # r11 = auth_tag_len
+        vpclmulqdq      $0x11, \T5, \XMM8, \T4
+        vpxor           \T4, \T6, \T6
 
-        cmp     $16, %r11
-        je      _T_16\@
+        vpclmulqdq      $0x00, \T5, \XMM8, \T4
+        vpxor           \T4, \T7, \T7
 
-        cmp     $8, %r11
-        jl      _T_4\@
+        vpclmulqdq      $0x00, \T3, \T2, \T2
 
-_T_8\@:
-        vmovq   %xmm9, %rax
-        mov     %rax, (%r10)
-        add     $8, %r10
-        sub     $8, %r11
-        vpsrldq $8, %xmm9, %xmm9
-        cmp     $0, %r11
-        je     _return_T_done\@
-_T_4\@:
-        vmovd   %xmm9, %eax
-        mov     %eax, (%r10)
-        add     $4, %r10
-        sub     $4, %r11
-        vpsrldq     $4, %xmm9, %xmm9
-        cmp     $0, %r11
-        je     _return_T_done\@
-_T_123\@:
-        vmovd     %xmm9, %eax
-        cmp     $2, %r11
-        jl     _T_1\@
-        mov     %ax, (%r10)
-        cmp     $2, %r11
-        je     _return_T_done\@
-        add     $2, %r10
-        sar     $16, %eax
-_T_1\@:
-        mov     %al, (%r10)
-        jmp     _return_T_done\@
+        vpxor           \T2, \XMM1, \XMM1
+        vpxor           \T6, \XMM1, \XMM1
+        vpxor           \T7, \XMM1, \T2
 
-_T_16\@:
-        vmovdqu %xmm9, (%r10)
 
-_return_T_done\@:
-        mov     %r14, %rsp
 
-        pop     %r15
-        pop     %r14
-        pop     %r13
-        pop     %r12
-.endm
 
+        vpslldq $8, \T2, \T4
+        vpsrldq $8, \T2, \T2
 
-#############################################################
-#void   aesni_gcm_precomp_avx_gen4
-#        (gcm_data     *my_ctx_data,
-#        u8     *hash_subkey)# /* H, the Hash sub key input.
-#				Data starts on a 16-byte boundary. */
-#############################################################
-ENTRY(aesni_gcm_precomp_avx_gen4)
-        #the number of pushes must equal STACK_OFFSET
-        push    %r12
-        push    %r13
-        push    %r14
-        push    %r15
+        vpxor   \T4, \T7, \T7
+        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
+						   # accumulated carry-less multiplications
 
-        mov     %rsp, %r14
+        #######################################################################
+        #first phase of the reduction
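+        # <T6:T7> holds the accumulated 256-bit carry-less product of the
+        # blocks with their hash-key powers. Fold its upper 128 bits back into
+        # the lower 128 bits modulo the GHASH field polynomial, using the
+        # POLY2 folding constant, in two phases.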
+        vmovdqa         POLY2(%rip), \T3
 
+        vpclmulqdq      $0x01, \T7, \T3, \T2
+        vpslldq         $8, \T2, \T2               # shift-L T2 2 DWs
+
+        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
+        #######################################################################
 
 
-        sub     $VARIABLE_OFFSET, %rsp
-        and     $~63, %rsp                    # align rsp to 64 bytes
+        #second phase of the reduction
+        vpclmulqdq      $0x00, \T7, \T3, \T2
+        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
 
-        vmovdqu  (arg2), %xmm6                # xmm6 = HashKey
+        vpclmulqdq      $0x10, \T7, \T3, \T4
+        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
 
-        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
-        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
-        vmovdqa  %xmm6, %xmm2
-        vpsllq   $1, %xmm6, %xmm6
-        vpsrlq   $63, %xmm2, %xmm2
-        vmovdqa  %xmm2, %xmm1
-        vpslldq  $8, %xmm2, %xmm2
-        vpsrldq  $8, %xmm1, %xmm1
-        vpor     %xmm2, %xmm6, %xmm6
-        #reduction
-        vpshufd  $0b00100100, %xmm1, %xmm2
-        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
-        vpand    POLY(%rip), %xmm2, %xmm2
-        vpxor    %xmm2, %xmm6, %xmm6          # xmm6 holds the HashKey<<1 mod poly
+        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
         #######################################################################
-        vmovdqa  %xmm6, HashKey(arg1)         # store HashKey<<1 mod poly
-
+        vpxor           \T4, \T6, \T6              # the result is in T6
+.endm
 
-        PRECOMPUTE_AVX2  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
 
-        mov     %r14, %rsp
 
-        pop     %r15
-        pop     %r14
-        pop     %r13
-        pop     %r12
+#############################################################
+#void   aesni_gcm_init_avx_gen4
+#        (gcm_data     *my_ctx_data,
+#         gcm_context_data *data,
+#        u8      *iv, /* Pre-counter block j0: 4 byte salt
+#			(from Security Association) concatenated with 8 byte
+#			Initialisation Vector (from IPSec ESP Payload)
+#			concatenated with 0x00000001. 16-byte aligned pointer. */
+#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
+#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
+#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+#############################################################
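+# Typical usage (see the glue code changes below): call this once per request,
+# then aesni_gcm_enc_update/dec_update_avx_gen4 for the data, and
+# aesni_gcm_finalize_avx_gen4 to produce the authentication tag.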
+ENTRY(aesni_gcm_init_avx_gen4)
+        FUNC_SAVE
+        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
+        FUNC_RESTORE
         ret
-ENDPROC(aesni_gcm_precomp_avx_gen4)
-
+ENDPROC(aesni_gcm_init_avx_gen4)
 
 ###############################################################################
 #void   aesni_gcm_enc_avx_gen4(
 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
+#        gcm_context_data *data,
 #        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
 #        const   u8 *in, /* Plaintext input */
-#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
-#        u8      *iv, /* Pre-counter block j0: 4 byte salt
-#			(from Security Association) concatenated with 8 byte
-#			 Initialisation Vector (from IPSec ESP Payload)
-#			 concatenated with 0x00000001. 16-byte aligned pointer. */
-#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
-#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
-#        u8      *auth_tag, /* Authenticated Tag output. */
-#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
-#				Valid values are 16 (most likely), 12 or 8. */
+#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
 ###############################################################################
-ENTRY(aesni_gcm_enc_avx_gen4)
-        GCM_ENC_DEC_AVX2     ENC
+ENTRY(aesni_gcm_enc_update_avx_gen4)
+        FUNC_SAVE
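+        # The trailing GCM_ENC_DEC argument is the AES round count minus the
+        # final round: 9 for 128-bit, 11 for 192-bit and 13 for 256-bit keys.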
+        mov     keysize,%eax
+        cmp     $32, %eax
+        je      key_256_enc_update4
+        cmp     $16, %eax
+        je      key_128_enc_update4
+        # must be 192
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
+        FUNC_RESTORE
+	ret
+key_128_enc_update4:
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
+        FUNC_RESTORE
 	ret
-ENDPROC(aesni_gcm_enc_avx_gen4)
+key_256_enc_update4:
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
+        FUNC_RESTORE
+	ret
+ENDPROC(aesni_gcm_enc_update_avx_gen4)
 
 ###############################################################################
-#void   aesni_gcm_dec_avx_gen4(
+#void   aesni_gcm_dec_update_avx_gen4(
 #        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
+#        gcm_context_data *data,
 #        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
 #        const   u8 *in, /* Ciphertext input */
-#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
-#        u8      *iv, /* Pre-counter block j0: 4 byte salt
-#			(from Security Association) concatenated with 8 byte
-#			Initialisation Vector (from IPSec ESP Payload)
-#			concatenated with 0x00000001. 16-byte aligned pointer. */
-#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
-#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
+###############################################################################
+ENTRY(aesni_gcm_dec_update_avx_gen4)
+        FUNC_SAVE
+        mov     keysize,%eax
+        cmp     $32, %eax
+        je      key_256_dec_update4
+        cmp     $16, %eax
+        je      key_128_dec_update4
+        # must be 192
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
+        FUNC_RESTORE
+        ret
+key_128_dec_update4:
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
+        FUNC_RESTORE
+        ret
+key_256_dec_update4:
+        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
+        FUNC_RESTORE
+        ret
+ENDPROC(aesni_gcm_dec_update_avx_gen4)
+
+###############################################################################
+#void   aesni_gcm_finalize_avx_gen4(
+#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
+#        gcm_context_data *data,
 #        u8      *auth_tag, /* Authenticated Tag output. */
 #        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
-#				Valid values are 16 (most likely), 12 or 8. */
+#                              Valid values are 16 (most likely), 12 or 8. */
 ###############################################################################
-ENTRY(aesni_gcm_dec_avx_gen4)
-        GCM_ENC_DEC_AVX2     DEC
-	ret
-ENDPROC(aesni_gcm_dec_avx_gen4)
+ENTRY(aesni_gcm_finalize_avx_gen4)
+        FUNC_SAVE
+        mov	keysize,%eax
+        cmp     $32, %eax
+        je      key_256_finalize4
+        cmp     $16, %eax
+        je      key_128_finalize4
+        # must be 192
+        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
+        FUNC_RESTORE
+        ret
+key_128_finalize4:
+        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
+        FUNC_RESTORE
+        ret
+key_256_finalize4:
+        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
+        FUNC_RESTORE
+        ret
+ENDPROC(aesni_gcm_finalize_avx_gen4)
 
 #endif /* CONFIG_AS_AVX2 */
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 661f7daf43da947acccfffe5b820ef7cebe7009c..1321700d6647f8be1f7894e00405c8ad40e51c38 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -84,7 +84,7 @@ struct gcm_context_data {
 	u8 current_counter[GCM_BLOCK_LEN];
 	u64 partial_block_len;
 	u64 unused;
-	u8 hash_keys[GCM_BLOCK_LEN * 8];
+	u8 hash_keys[GCM_BLOCK_LEN * 16];
 };
 
 asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
@@ -175,6 +175,32 @@ asmlinkage void aesni_gcm_finalize(void *ctx,
 				   struct gcm_context_data *gdata,
 				   u8 *auth_tag, unsigned long auth_tag_len);
 
+static struct aesni_gcm_tfm_s {
+	void (*init)(void *ctx, struct gcm_context_data *gdata, u8 *iv,
+		     u8 *hash_subkey, const u8 *aad, unsigned long aad_len);
+	void (*enc_update)(void *ctx, struct gcm_context_data *gdata, u8 *out,
+			   const u8 *in, unsigned long plaintext_len);
+	void (*dec_update)(void *ctx, struct gcm_context_data *gdata, u8 *out,
+			   const u8 *in, unsigned long ciphertext_len);
+	void (*finalize)(void *ctx, struct gcm_context_data *gdata,
+			 u8 *auth_tag, unsigned long auth_tag_len);
+} *aesni_gcm_tfm;
+
+struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = {
+	.init = &aesni_gcm_init,
+	.enc_update = &aesni_gcm_enc_update,
+	.dec_update = &aesni_gcm_dec_update,
+	.finalize = &aesni_gcm_finalize,
+};
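+
+/*
+ * aesni_gcm_tfm is set in aesni_init() to the fastest variant the CPU
+ * supports (AVX2, AVX or SSE); gcmaes_crypt_by_sg() drives the GCM
+ * operation through these init/enc_update/dec_update/finalize hooks.
+ */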
+
 #ifdef CONFIG_AS_AVX
 asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
 		void *keys, u8 *out, unsigned int num_bytes);
@@ -183,136 +209,94 @@ asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
 asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
 		void *keys, u8 *out, unsigned int num_bytes);
 /*
- * asmlinkage void aesni_gcm_precomp_avx_gen2()
+ * asmlinkage void aesni_gcm_init_avx_gen2()
  * gcm_data *my_ctx_data, context data
  * u8 *hash_subkey,  the Hash sub key input. Data starts on a 16-byte boundary.
  */
-asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey);
+asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data,
+					struct gcm_context_data *gdata,
+					u8 *iv,
+					u8 *hash_subkey,
+					const u8 *aad,
+					unsigned long aad_len);
+
+asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx,
+				     struct gcm_context_data *gdata, u8 *out,
+				     const u8 *in, unsigned long plaintext_len);
+asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx,
+				     struct gcm_context_data *gdata, u8 *out,
+				     const u8 *in,
+				     unsigned long ciphertext_len);
+asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx,
+				   struct gcm_context_data *gdata,
+				   u8 *auth_tag, unsigned long auth_tag_len);
 
-asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out,
+asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx,
+				struct gcm_context_data *gdata, u8 *out,
 			const u8 *in, unsigned long plaintext_len, u8 *iv,
 			const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len);
 
-asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out,
+asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx,
+				struct gcm_context_data *gdata, u8 *out,
 			const u8 *in, unsigned long ciphertext_len, u8 *iv,
 			const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len);
 
-static void aesni_gcm_enc_avx(void *ctx,
-			struct gcm_context_data *data, u8 *out,
-			const u8 *in, unsigned long plaintext_len, u8 *iv,
-			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
-			u8 *auth_tag, unsigned long auth_tag_len)
-{
-        struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
-	if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){
-		aesni_gcm_enc(ctx, data, out, in,
-			plaintext_len, iv, hash_subkey, aad,
-			aad_len, auth_tag, auth_tag_len);
-	} else {
-		aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
-		aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
-					aad_len, auth_tag, auth_tag_len);
-	}
-}
+struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = {
+	.init = &aesni_gcm_init_avx_gen2,
+	.enc_update = &aesni_gcm_enc_update_avx_gen2,
+	.dec_update = &aesni_gcm_dec_update_avx_gen2,
+	.finalize = &aesni_gcm_finalize_avx_gen2,
+};
 
-static void aesni_gcm_dec_avx(void *ctx,
-			struct gcm_context_data *data, u8 *out,
-			const u8 *in, unsigned long ciphertext_len, u8 *iv,
-			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
-			u8 *auth_tag, unsigned long auth_tag_len)
-{
-        struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
-	if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
-		aesni_gcm_dec(ctx, data, out, in,
-			ciphertext_len, iv, hash_subkey, aad,
-			aad_len, auth_tag, auth_tag_len);
-	} else {
-		aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
-		aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
-					aad_len, auth_tag, auth_tag_len);
-	}
-}
 #endif
 
 #ifdef CONFIG_AS_AVX2
 /*
- * asmlinkage void aesni_gcm_precomp_avx_gen4()
+ * asmlinkage void aesni_gcm_init_avx_gen4()
  * gcm_data *my_ctx_data, context data
  * u8 *hash_subkey,  the Hash sub key input. Data starts on a 16-byte boundary.
  */
-asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey);
+asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data,
+					struct gcm_context_data *gdata,
+					u8 *iv,
+					u8 *hash_subkey,
+					const u8 *aad,
+					unsigned long aad_len);
+
+asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx,
+				     struct gcm_context_data *gdata, u8 *out,
+				     const u8 *in, unsigned long plaintext_len);
+asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx,
+				     struct gcm_context_data *gdata, u8 *out,
+				     const u8 *in,
+				     unsigned long ciphertext_len);
+asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx,
+				   struct gcm_context_data *gdata,
+				   u8 *auth_tag, unsigned long auth_tag_len);
 
-asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out,
+asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx,
+				struct gcm_context_data *gdata, u8 *out,
 			const u8 *in, unsigned long plaintext_len, u8 *iv,
 			const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len);
 
-asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out,
+asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx,
+				struct gcm_context_data *gdata, u8 *out,
 			const u8 *in, unsigned long ciphertext_len, u8 *iv,
 			const u8 *aad, unsigned long aad_len,
 			u8 *auth_tag, unsigned long auth_tag_len);
 
-static void aesni_gcm_enc_avx2(void *ctx,
-			struct gcm_context_data *data, u8 *out,
-			const u8 *in, unsigned long plaintext_len, u8 *iv,
-			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
-			u8 *auth_tag, unsigned long auth_tag_len)
-{
-       struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
-	if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
-		aesni_gcm_enc(ctx, data, out, in,
-			      plaintext_len, iv, hash_subkey, aad,
-			      aad_len, auth_tag, auth_tag_len);
-	} else if (plaintext_len < AVX_GEN4_OPTSIZE) {
-		aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
-		aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
-					aad_len, auth_tag, auth_tag_len);
-	} else {
-		aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
-		aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad,
-					aad_len, auth_tag, auth_tag_len);
-	}
-}
+struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = {
+	.init = &aesni_gcm_init_avx_gen4,
+	.enc_update = &aesni_gcm_enc_update_avx_gen4,
+	.dec_update = &aesni_gcm_dec_update_avx_gen4,
+	.finalize = &aesni_gcm_finalize_avx_gen4,
+};
 
-static void aesni_gcm_dec_avx2(void *ctx,
-	struct gcm_context_data *data, u8 *out,
-			const u8 *in, unsigned long ciphertext_len, u8 *iv,
-			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
-			u8 *auth_tag, unsigned long auth_tag_len)
-{
-       struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
-	if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
-		aesni_gcm_dec(ctx, data, out, in,
-			      ciphertext_len, iv, hash_subkey,
-			      aad, aad_len, auth_tag, auth_tag_len);
-	} else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
-		aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
-		aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
-					aad_len, auth_tag, auth_tag_len);
-	} else {
-		aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
-		aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad,
-					aad_len, auth_tag, auth_tag_len);
-	}
-}
 #endif
 
-static void (*aesni_gcm_enc_tfm)(void *ctx,
-				 struct gcm_context_data *data, u8 *out,
-				 const u8 *in, unsigned long plaintext_len,
-				 u8 *iv, u8 *hash_subkey, const u8 *aad,
-				 unsigned long aad_len, u8 *auth_tag,
-				 unsigned long auth_tag_len);
-
-static void (*aesni_gcm_dec_tfm)(void *ctx,
-				 struct gcm_context_data *data, u8 *out,
-				 const u8 *in, unsigned long ciphertext_len,
-				 u8 *iv, u8 *hash_subkey, const u8 *aad,
-				 unsigned long aad_len, u8 *auth_tag,
-				 unsigned long auth_tag_len);
-
 static inline struct
 aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
 {
@@ -794,6 +778,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
 {
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+	struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm;
 	struct gcm_context_data data AESNI_ALIGN_ATTR;
 	struct scatter_walk dst_sg_walk = {};
 	unsigned long left = req->cryptlen;
@@ -811,6 +796,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
 	if (!enc)
 		left -= auth_tag_len;
 
+#ifdef CONFIG_AS_AVX2
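+	/* The AVX2 path only pays off beyond AVX_GEN4_OPTSIZE bytes; fall
+	 * back to the gen2 (AVX) implementation for shorter requests. */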
+	if (left < AVX_GEN4_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen4)
+		gcm_tfm = &aesni_gcm_tfm_avx_gen2;
+#endif
+#ifdef CONFIG_AS_AVX
+	if (left < AVX_GEN2_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen2)
+		gcm_tfm = &aesni_gcm_tfm_sse;
+#endif
+
 	/* Linearize assoc, if not already linear */
 	if (req->src->length >= assoclen && req->src->length &&
 		(!PageHighMem(sg_page(req->src)) ||
@@ -835,7 +829,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
 	}
 
 	kernel_fpu_begin();
-	aesni_gcm_init(aes_ctx, &data, iv,
+	gcm_tfm->init(aes_ctx, &data, iv,
 		hash_subkey, assoc, assoclen);
 	if (req->src != req->dst) {
 		while (left) {
@@ -846,10 +840,10 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
 			len = min(srclen, dstlen);
 			if (len) {
 				if (enc)
-					aesni_gcm_enc_update(aes_ctx, &data,
+					gcm_tfm->enc_update(aes_ctx, &data,
 							     dst, src, len);
 				else
-					aesni_gcm_dec_update(aes_ctx, &data,
+					gcm_tfm->dec_update(aes_ctx, &data,
 							     dst, src, len);
 			}
 			left -= len;
@@ -867,10 +861,10 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
 			len = scatterwalk_clamp(&src_sg_walk, left);
 			if (len) {
 				if (enc)
-					aesni_gcm_enc_update(aes_ctx, &data,
+					gcm_tfm->enc_update(aes_ctx, &data,
 							     src, src, len);
 				else
-					aesni_gcm_dec_update(aes_ctx, &data,
+					gcm_tfm->dec_update(aes_ctx, &data,
 							     src, src, len);
 			}
 			left -= len;
@@ -879,7 +873,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
 			scatterwalk_done(&src_sg_walk, 1, left);
 		}
 	}
-	aesni_gcm_finalize(aes_ctx, &data, authTag, auth_tag_len);
+	gcm_tfm->finalize(aes_ctx, &data, authTag, auth_tag_len);
 	kernel_fpu_end();
 
 	if (!assocmem)
@@ -912,147 +906,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
 static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
 			  u8 *hash_subkey, u8 *iv, void *aes_ctx)
 {
-	u8 one_entry_in_sg = 0;
-	u8 *src, *dst, *assoc;
-	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
-	struct scatter_walk src_sg_walk;
-	struct scatter_walk dst_sg_walk = {};
-	struct gcm_context_data data AESNI_ALIGN_ATTR;
-
-	if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 ||
-		aesni_gcm_enc_tfm == aesni_gcm_enc ||
-		req->cryptlen < AVX_GEN2_OPTSIZE) {
-		return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv,
-					  aes_ctx);
-	}
-	if (sg_is_last(req->src) &&
-	    (!PageHighMem(sg_page(req->src)) ||
-	    req->src->offset + req->src->length <= PAGE_SIZE) &&
-	    sg_is_last(req->dst) &&
-	    (!PageHighMem(sg_page(req->dst)) ||
-	    req->dst->offset + req->dst->length <= PAGE_SIZE)) {
-		one_entry_in_sg = 1;
-		scatterwalk_start(&src_sg_walk, req->src);
-		assoc = scatterwalk_map(&src_sg_walk);
-		src = assoc + req->assoclen;
-		dst = src;
-		if (unlikely(req->src != req->dst)) {
-			scatterwalk_start(&dst_sg_walk, req->dst);
-			dst = scatterwalk_map(&dst_sg_walk) + req->assoclen;
-		}
-	} else {
-		/* Allocate memory for src, dst, assoc */
-		assoc = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
-			GFP_ATOMIC);
-		if (unlikely(!assoc))
-			return -ENOMEM;
-		scatterwalk_map_and_copy(assoc, req->src, 0,
-					 req->assoclen + req->cryptlen, 0);
-		src = assoc + req->assoclen;
-		dst = src;
-	}
-
-	kernel_fpu_begin();
-	aesni_gcm_enc_tfm(aes_ctx, &data, dst, src, req->cryptlen, iv,
-			  hash_subkey, assoc, assoclen,
-			  dst + req->cryptlen, auth_tag_len);
-	kernel_fpu_end();
-
-	/* The authTag (aka the Integrity Check Value) needs to be written
-	 * back to the packet. */
-	if (one_entry_in_sg) {
-		if (unlikely(req->src != req->dst)) {
-			scatterwalk_unmap(dst - req->assoclen);
-			scatterwalk_advance(&dst_sg_walk, req->dst->length);
-			scatterwalk_done(&dst_sg_walk, 1, 0);
-		}
-		scatterwalk_unmap(assoc);
-		scatterwalk_advance(&src_sg_walk, req->src->length);
-		scatterwalk_done(&src_sg_walk, req->src == req->dst, 0);
-	} else {
-		scatterwalk_map_and_copy(dst, req->dst, req->assoclen,
-					 req->cryptlen + auth_tag_len, 1);
-		kfree(assoc);
-	}
-	return 0;
+	return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv,
+				aes_ctx);
 }
 
 static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
 			  u8 *hash_subkey, u8 *iv, void *aes_ctx)
 {
-	u8 one_entry_in_sg = 0;
-	u8 *src, *dst, *assoc;
-	unsigned long tempCipherLen = 0;
-	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
-	u8 authTag[16];
-	struct scatter_walk src_sg_walk;
-	struct scatter_walk dst_sg_walk = {};
-	struct gcm_context_data data AESNI_ALIGN_ATTR;
-	int retval = 0;
-
-	if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 ||
-		aesni_gcm_enc_tfm == aesni_gcm_enc ||
-		req->cryptlen < AVX_GEN2_OPTSIZE) {
-		return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv,
-					  aes_ctx);
-	}
-	tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
-
-	if (sg_is_last(req->src) &&
-	    (!PageHighMem(sg_page(req->src)) ||
-	    req->src->offset + req->src->length <= PAGE_SIZE) &&
-	    sg_is_last(req->dst) && req->dst->length &&
-	    (!PageHighMem(sg_page(req->dst)) ||
-	    req->dst->offset + req->dst->length <= PAGE_SIZE)) {
-		one_entry_in_sg = 1;
-		scatterwalk_start(&src_sg_walk, req->src);
-		assoc = scatterwalk_map(&src_sg_walk);
-		src = assoc + req->assoclen;
-		dst = src;
-		if (unlikely(req->src != req->dst)) {
-			scatterwalk_start(&dst_sg_walk, req->dst);
-			dst = scatterwalk_map(&dst_sg_walk) + req->assoclen;
-		}
-	} else {
-		/* Allocate memory for src, dst, assoc */
-		assoc = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
-		if (!assoc)
-			return -ENOMEM;
-		scatterwalk_map_and_copy(assoc, req->src, 0,
-					 req->assoclen + req->cryptlen, 0);
-		src = assoc + req->assoclen;
-		dst = src;
-	}
-
-
-	kernel_fpu_begin();
-	aesni_gcm_dec_tfm(aes_ctx, &data, dst, src, tempCipherLen, iv,
-			  hash_subkey, assoc, assoclen,
-			  authTag, auth_tag_len);
-	kernel_fpu_end();
-
-	/* Compare generated tag with passed in tag. */
-	retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ?
-		-EBADMSG : 0;
-
-	if (one_entry_in_sg) {
-		if (unlikely(req->src != req->dst)) {
-			scatterwalk_unmap(dst - req->assoclen);
-			scatterwalk_advance(&dst_sg_walk, req->dst->length);
-			scatterwalk_done(&dst_sg_walk, 1, 0);
-		}
-		scatterwalk_unmap(assoc);
-		scatterwalk_advance(&src_sg_walk, req->src->length);
-		scatterwalk_done(&src_sg_walk, req->src == req->dst, 0);
-	} else {
-		scatterwalk_map_and_copy(dst, req->dst, req->assoclen,
-					 tempCipherLen, 1);
-		kfree(assoc);
-	}
-	return retval;
-
+	return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv,
+				aes_ctx);
 }
 
 static int helper_rfc4106_encrypt(struct aead_request *req)
@@ -1420,21 +1282,18 @@ static int __init aesni_init(void)
 #ifdef CONFIG_AS_AVX2
 	if (boot_cpu_has(X86_FEATURE_AVX2)) {
 		pr_info("AVX2 version of gcm_enc/dec engaged.\n");
-		aesni_gcm_enc_tfm = aesni_gcm_enc_avx2;
-		aesni_gcm_dec_tfm = aesni_gcm_dec_avx2;
+		aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen4;
 	} else
 #endif
 #ifdef CONFIG_AS_AVX
 	if (boot_cpu_has(X86_FEATURE_AVX)) {
 		pr_info("AVX version of gcm_enc/dec engaged.\n");
-		aesni_gcm_enc_tfm = aesni_gcm_enc_avx;
-		aesni_gcm_dec_tfm = aesni_gcm_dec_avx;
+		aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen2;
 	} else
 #endif
 	{
 		pr_info("SSE version of gcm_enc/dec engaged.\n");
-		aesni_gcm_enc_tfm = aesni_gcm_enc;
-		aesni_gcm_dec_tfm = aesni_gcm_dec;
+		aesni_gcm_tfm = &aesni_gcm_tfm_sse;
 	}
 	aesni_ctr_enc_tfm = aesni_ctr_enc;
 #ifdef CONFIG_AS_AVX
diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S
new file mode 100644
index 0000000000000000000000000000000000000000..32903fd450af28f26c96c8bbe27786443bcdd7cb
--- /dev/null
+++ b/arch/x86/crypto/chacha-avx2-x86_64.S
@@ -0,0 +1,1025 @@
+/*
+ * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.section	.rodata.cst32.ROT8, "aM", @progbits, 32
+.align 32
+ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
+	.octa 0x0e0d0c0f0a09080b0605040702010003
+
+.section	.rodata.cst32.ROT16, "aM", @progbits, 32
+.align 32
+ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
+	.octa 0x0d0c0f0e09080b0a0504070601000302
+
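+# Counter-increment constants: CTRINC adds block offsets 0..7 for the 8-block
+# function, while CTR2BL and CTR4BL add offsets (0,1) and (2,3) to the two
+# 128-bit lanes used by the 2- and 4-block functions.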
+.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
+.align 32
+CTRINC:	.octa 0x00000003000000020000000100000000
+	.octa 0x00000007000000060000000500000004
+
+.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL:	.octa 0x00000000000000000000000000000000
+	.octa 0x00000000000000000000000000000001
+
+.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL:	.octa 0x00000000000000000000000000000002
+	.octa 0x00000000000000000000000000000003
+
+.text
+
+ENTRY(chacha_2block_xor_avx2)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 2 data blocks output, o
+	# %rdx: up to 2 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts two ChaCha blocks by loading the state
+	# matrix twice across four AVX registers. It performs matrix operations
+	# on four words in each matrix in parallel, but requires shuffling to
+	# rearrange the words after each round.
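+	#
+	# Rotations by 16 and 8 bits are done as byte shuffles (vpshufb with
+	# the ROT16/ROT8 masks); rotations by 12 and 7 bits use the usual
+	# shift/shift/OR sequence.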
+
+	vzeroupper
+
+	# x0..3[0-1] = s0..3
+	vbroadcasti128	0x00(%rdi),%ymm0
+	vbroadcasti128	0x10(%rdi),%ymm1
+	vbroadcasti128	0x20(%rdi),%ymm2
+	vbroadcasti128	0x30(%rdi),%ymm3
+
+	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
+
+	vmovdqa		%ymm0,%ymm8
+	vmovdqa		%ymm1,%ymm9
+	vmovdqa		%ymm2,%ymm10
+	vmovdqa		%ymm3,%ymm11
+
+	vmovdqa		ROT8(%rip),%ymm4
+	vmovdqa		ROT16(%rip),%ymm5
+
+	mov		%rcx,%rax
+
+.Ldoubleround:
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm5,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm6
+	vpslld		$12,%ymm6,%ymm6
+	vpsrld		$20,%ymm1,%ymm1
+	vpor		%ymm6,%ymm1,%ymm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm4,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm7
+	vpslld		$7,%ymm7,%ymm7
+	vpsrld		$25,%ymm1,%ymm1
+	vpor		%ymm7,%ymm1,%ymm1
+
+	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm1,%ymm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm3,%ymm3
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm5,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm6
+	vpslld		$12,%ymm6,%ymm6
+	vpsrld		$20,%ymm1,%ymm1
+	vpor		%ymm6,%ymm1,%ymm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm4,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm7
+	vpslld		$7,%ymm7,%ymm7
+	vpsrld		$25,%ymm1,%ymm1
+	vpor		%ymm7,%ymm1,%ymm1
+
+	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm1,%ymm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm3,%ymm3
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround
+
+	# o0 = i0 ^ (x0 + s0)
+	vpaddd		%ymm8,%ymm0,%ymm7
+	cmp		$0x10,%rax
+	jl		.Lxorpart2
+	vpxor		0x00(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x00(%rsi)
+	vextracti128	$1,%ymm7,%xmm0
+	# o1 = i1 ^ (x1 + s1)
+	vpaddd		%ymm9,%ymm1,%ymm7
+	cmp		$0x20,%rax
+	jl		.Lxorpart2
+	vpxor		0x10(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x10(%rsi)
+	vextracti128	$1,%ymm7,%xmm1
+	# o2 = i2 ^ (x2 + s2)
+	vpaddd		%ymm10,%ymm2,%ymm7
+	cmp		$0x30,%rax
+	jl		.Lxorpart2
+	vpxor		0x20(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x20(%rsi)
+	vextracti128	$1,%ymm7,%xmm2
+	# o3 = i3 ^ (x3 + s3)
+	vpaddd		%ymm11,%ymm3,%ymm7
+	cmp		$0x40,%rax
+	jl		.Lxorpart2
+	vpxor		0x30(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x30(%rsi)
+	vextracti128	$1,%ymm7,%xmm3
+
+	# xor and write second block
+	vmovdqa		%xmm0,%xmm7
+	cmp		$0x50,%rax
+	jl		.Lxorpart2
+	vpxor		0x40(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x40(%rsi)
+
+	vmovdqa		%xmm1,%xmm7
+	cmp		$0x60,%rax
+	jl		.Lxorpart2
+	vpxor		0x50(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x50(%rsi)
+
+	vmovdqa		%xmm2,%xmm7
+	cmp		$0x70,%rax
+	jl		.Lxorpart2
+	vpxor		0x60(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x60(%rsi)
+
+	vmovdqa		%xmm3,%xmm7
+	cmp		$0x80,%rax
+	jl		.Lxorpart2
+	vpxor		0x70(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x70(%rsi)
+
+.Ldone2:
+	vzeroupper
+	ret
+
+.Lxorpart2:
+	# xor remaining bytes from partial register into output
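+	#
+	# %r9 = remaining length mod 16; the keystream for this tail is in
+	# %xmm7. Copy the input tail to an aligned stack slot, XOR it with the
+	# keystream there, then copy the result back to the output.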
+	mov		%rax,%r9
+	and		$0x0f,%r9
+	jz		.Ldone2
+	and		$~0x0f,%rax
+
+	mov		%rsi,%r11
+
+	lea		8(%rsp),%r10
+	sub		$0x10,%rsp
+	and		$~31,%rsp
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	vpxor		0x00(%rsp),%xmm7,%xmm7
+	vmovdqa		%xmm7,0x00(%rsp)
+
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	lea		-8(%r10),%rsp
+	jmp		.Ldone2
+
+ENDPROC(chacha_2block_xor_avx2)
+
+ENTRY(chacha_4block_xor_avx2)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 4 data blocks output, o
+	# %rdx: up to 4 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts four ChaCha blocks by loading the state
+	# matrix four times across eight AVX registers. It performs matrix
+	# operations on four words in two matrices in parallel, followed by
+	# the same operations on the other two matrices. Because the required
+	# word shuffling has a rather high latency, the arithmetic on two
+	# matrix pairs can be interleaved without much slowdown.
+
+	vzeroupper
+
+	# x0..3[0-3] = s0..3
+	vbroadcasti128	0x00(%rdi),%ymm0
+	vbroadcasti128	0x10(%rdi),%ymm1
+	vbroadcasti128	0x20(%rdi),%ymm2
+	vbroadcasti128	0x30(%rdi),%ymm3
+
+	vmovdqa		%ymm0,%ymm4
+	vmovdqa		%ymm1,%ymm5
+	vmovdqa		%ymm2,%ymm6
+	vmovdqa		%ymm3,%ymm7
+
+	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
+	vpaddd		CTR4BL(%rip),%ymm7,%ymm7
+
+	vmovdqa		%ymm0,%ymm11
+	vmovdqa		%ymm1,%ymm12
+	vmovdqa		%ymm2,%ymm13
+	vmovdqa		%ymm3,%ymm14
+	vmovdqa		%ymm7,%ymm15
+
+	vmovdqa		ROT8(%rip),%ymm8
+	vmovdqa		ROT16(%rip),%ymm9
+
+	mov		%rcx,%rax
+
+.Ldoubleround4:
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm9,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxor		%ymm4,%ymm7,%ymm7
+	vpshufb		%ymm9,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm10
+	vpslld		$12,%ymm10,%ymm10
+	vpsrld		$20,%ymm1,%ymm1
+	vpor		%ymm10,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxor		%ymm6,%ymm5,%ymm5
+	vmovdqa		%ymm5,%ymm10
+	vpslld		$12,%ymm10,%ymm10
+	vpsrld		$20,%ymm5,%ymm5
+	vpor		%ymm10,%ymm5,%ymm5
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm8,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxor		%ymm4,%ymm7,%ymm7
+	vpshufb		%ymm8,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm10
+	vpslld		$7,%ymm10,%ymm10
+	vpsrld		$25,%ymm1,%ymm1
+	vpor		%ymm10,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxor		%ymm6,%ymm5,%ymm5
+	vmovdqa		%ymm5,%ymm10
+	vpslld		$7,%ymm10,%ymm10
+	vpsrld		$25,%ymm5,%ymm5
+	vpor		%ymm10,%ymm5,%ymm5
+
+	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm1,%ymm1
+	vpshufd		$0x39,%ymm5,%ymm5
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	vpshufd		$0x4e,%ymm6,%ymm6
+	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm3,%ymm3
+	vpshufd		$0x93,%ymm7,%ymm7
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm9,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxor		%ymm4,%ymm7,%ymm7
+	vpshufb		%ymm9,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm10
+	vpslld		$12,%ymm10,%ymm10
+	vpsrld		$20,%ymm1,%ymm1
+	vpor		%ymm10,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxor		%ymm6,%ymm5,%ymm5
+	vmovdqa		%ymm5,%ymm10
+	vpslld		$12,%ymm10,%ymm10
+	vpsrld		$20,%ymm5,%ymm5
+	vpor		%ymm10,%ymm5,%ymm5
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm8,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxor		%ymm4,%ymm7,%ymm7
+	vpshufb		%ymm8,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm10
+	vpslld		$7,%ymm10,%ymm10
+	vpsrld		$25,%ymm1,%ymm1
+	vpor		%ymm10,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxor		%ymm6,%ymm5,%ymm5
+	vmovdqa		%ymm5,%ymm10
+	vpslld		$7,%ymm10,%ymm10
+	vpsrld		$25,%ymm5,%ymm5
+	vpor		%ymm10,%ymm5,%ymm5
+
+	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm1,%ymm1
+	vpshufd		$0x93,%ymm5,%ymm5
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	vpshufd		$0x4e,%ymm6,%ymm6
+	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm3,%ymm3
+	vpshufd		$0x39,%ymm7,%ymm7
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround4
+
+	# o0 = i0 ^ (x0 + s0), first block
+	vpaddd		%ymm11,%ymm0,%ymm10
+	cmp		$0x10,%rax
+	jl		.Lxorpart4
+	vpxor		0x00(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x00(%rsi)
+	vextracti128	$1,%ymm10,%xmm0
+	# o1 = i1 ^ (x1 + s1), first block
+	vpaddd		%ymm12,%ymm1,%ymm10
+	cmp		$0x20,%rax
+	jl		.Lxorpart4
+	vpxor		0x10(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x10(%rsi)
+	vextracti128	$1,%ymm10,%xmm1
+	# o2 = i2 ^ (x2 + s2), first block
+	vpaddd		%ymm13,%ymm2,%ymm10
+	cmp		$0x30,%rax
+	jl		.Lxorpart4
+	vpxor		0x20(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x20(%rsi)
+	vextracti128	$1,%ymm10,%xmm2
+	# o3 = i3 ^ (x3 + s3), first block
+	vpaddd		%ymm14,%ymm3,%ymm10
+	cmp		$0x40,%rax
+	jl		.Lxorpart4
+	vpxor		0x30(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x30(%rsi)
+	vextracti128	$1,%ymm10,%xmm3
+
+	# xor and write second block
+	vmovdqa		%xmm0,%xmm10
+	cmp		$0x50,%rax
+	jl		.Lxorpart4
+	vpxor		0x40(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x40(%rsi)
+
+	vmovdqa		%xmm1,%xmm10
+	cmp		$0x60,%rax
+	jl		.Lxorpart4
+	vpxor		0x50(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x50(%rsi)
+
+	vmovdqa		%xmm2,%xmm10
+	cmp		$0x70,%rax
+	jl		.Lxorpart4
+	vpxor		0x60(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x60(%rsi)
+
+	vmovdqa		%xmm3,%xmm10
+	cmp		$0x80,%rax
+	jl		.Lxorpart4
+	vpxor		0x70(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x70(%rsi)
+
+	# o0 = i0 ^ (x0 + s0), third block
+	vpaddd		%ymm11,%ymm4,%ymm10
+	cmp		$0x90,%rax
+	jl		.Lxorpart4
+	vpxor		0x80(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x80(%rsi)
+	vextracti128	$1,%ymm10,%xmm4
+	# o1 = i1 ^ (x1 + s1), third block
+	vpaddd		%ymm12,%ymm5,%ymm10
+	cmp		$0xa0,%rax
+	jl		.Lxorpart4
+	vpxor		0x90(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x90(%rsi)
+	vextracti128	$1,%ymm10,%xmm5
+	# o2 = i2 ^ (x2 + s2), third block
+	vpaddd		%ymm13,%ymm6,%ymm10
+	cmp		$0xb0,%rax
+	jl		.Lxorpart4
+	vpxor		0xa0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xa0(%rsi)
+	vextracti128	$1,%ymm10,%xmm6
+	# o3 = i3 ^ (x3 + s3), third block
+	vpaddd		%ymm15,%ymm7,%ymm10
+	cmp		$0xc0,%rax
+	jl		.Lxorpart4
+	vpxor		0xb0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xb0(%rsi)
+	vextracti128	$1,%ymm10,%xmm7
+
+	# xor and write fourth block
+	vmovdqa		%xmm4,%xmm10
+	cmp		$0xd0,%rax
+	jl		.Lxorpart4
+	vpxor		0xc0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xc0(%rsi)
+
+	vmovdqa		%xmm5,%xmm10
+	cmp		$0xe0,%rax
+	jl		.Lxorpart4
+	vpxor		0xd0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xd0(%rsi)
+
+	vmovdqa		%xmm6,%xmm10
+	cmp		$0xf0,%rax
+	jl		.Lxorpart4
+	vpxor		0xe0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xe0(%rsi)
+
+	vmovdqa		%xmm7,%xmm10
+	cmp		$0x100,%rax
+	jl		.Lxorpart4
+	vpxor		0xf0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xf0(%rsi)
+
+.Ldone4:
+	vzeroupper
+	ret
+
+.Lxorpart4:
+	# xor remaining bytes from partial register into output
+	mov		%rax,%r9
+	and		$0x0f,%r9
+	jz		.Ldone4
+	and		$~0x0f,%rax
+
+	mov		%rsi,%r11
+
+	lea		8(%rsp),%r10
+	sub		$0x10,%rsp
+	and		$~31,%rsp
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	vpxor		0x00(%rsp),%xmm10,%xmm10
+	vmovdqa		%xmm10,0x00(%rsp)
+
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	lea		-8(%r10),%rsp
+	jmp		.Ldone4
+
+ENDPROC(chacha_4block_xor_avx2)
+
+ENTRY(chacha_8block_xor_avx2)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 8 data blocks output, o
+	# %rdx: up to 8 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts eight consecutive ChaCha blocks by loading
+	# the state matrix in AVX registers eight times. As we need some
+	# scratch registers, we save the first four registers on the stack. The
+	# algorithm performs each operation on the corresponding word of each
+	# state matrix, hence requires no word shuffling. For the final XORing step
+	# we transpose the matrix by interleaving 32-, 64- and then 128-bit
+	# words, which allows us to do XOR in AVX registers. 8/16-bit word
+	# rotation is done with the slightly better performing byte shuffling,
+	# 7/12-bit word rotation uses traditional shift+OR.
+
+	vzeroupper
+	# 4 * 32 byte stack, 32-byte aligned
+	lea		8(%rsp),%r10
+	and		$~31, %rsp
+	sub		$0x80, %rsp
+	mov		%rcx,%rax
+
+	# x0..15[0-7] = s[0..15]
+	vpbroadcastd	0x00(%rdi),%ymm0
+	vpbroadcastd	0x04(%rdi),%ymm1
+	vpbroadcastd	0x08(%rdi),%ymm2
+	vpbroadcastd	0x0c(%rdi),%ymm3
+	vpbroadcastd	0x10(%rdi),%ymm4
+	vpbroadcastd	0x14(%rdi),%ymm5
+	vpbroadcastd	0x18(%rdi),%ymm6
+	vpbroadcastd	0x1c(%rdi),%ymm7
+	vpbroadcastd	0x20(%rdi),%ymm8
+	vpbroadcastd	0x24(%rdi),%ymm9
+	vpbroadcastd	0x28(%rdi),%ymm10
+	vpbroadcastd	0x2c(%rdi),%ymm11
+	vpbroadcastd	0x30(%rdi),%ymm12
+	vpbroadcastd	0x34(%rdi),%ymm13
+	vpbroadcastd	0x38(%rdi),%ymm14
+	vpbroadcastd	0x3c(%rdi),%ymm15
+	# x0..3 on stack
+	vmovdqa		%ymm0,0x00(%rsp)
+	vmovdqa		%ymm1,0x20(%rsp)
+	vmovdqa		%ymm2,0x40(%rsp)
+	vmovdqa		%ymm3,0x60(%rsp)
+
+	vmovdqa		CTRINC(%rip),%ymm1
+	vmovdqa		ROT8(%rip),%ymm2
+	vmovdqa		ROT16(%rip),%ymm3
+
+	# x12 += counter values 0-7
+	vpaddd		%ymm1,%ymm12,%ymm12
+
+.Ldoubleround8:
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	vpaddd		0x00(%rsp),%ymm4,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpxor		%ymm0,%ymm12,%ymm12
+	vpshufb		%ymm3,%ymm12,%ymm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	vpaddd		0x20(%rsp),%ymm5,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpxor		%ymm0,%ymm13,%ymm13
+	vpshufb		%ymm3,%ymm13,%ymm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	vpaddd		0x40(%rsp),%ymm6,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpxor		%ymm0,%ymm14,%ymm14
+	vpshufb		%ymm3,%ymm14,%ymm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	vpaddd		0x60(%rsp),%ymm7,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpxor		%ymm0,%ymm15,%ymm15
+	vpshufb		%ymm3,%ymm15,%ymm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	vpaddd		%ymm12,%ymm8,%ymm8
+	vpxor		%ymm8,%ymm4,%ymm4
+	vpslld		$12,%ymm4,%ymm0
+	vpsrld		$20,%ymm4,%ymm4
+	vpor		%ymm0,%ymm4,%ymm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	vpaddd		%ymm13,%ymm9,%ymm9
+	vpxor		%ymm9,%ymm5,%ymm5
+	vpslld		$12,%ymm5,%ymm0
+	vpsrld		$20,%ymm5,%ymm5
+	vpor		%ymm0,%ymm5,%ymm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	vpaddd		%ymm14,%ymm10,%ymm10
+	vpxor		%ymm10,%ymm6,%ymm6
+	vpslld		$12,%ymm6,%ymm0
+	vpsrld		$20,%ymm6,%ymm6
+	vpor		%ymm0,%ymm6,%ymm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	vpaddd		%ymm15,%ymm11,%ymm11
+	vpxor		%ymm11,%ymm7,%ymm7
+	vpslld		$12,%ymm7,%ymm0
+	vpsrld		$20,%ymm7,%ymm7
+	vpor		%ymm0,%ymm7,%ymm7
+
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	vpaddd		0x00(%rsp),%ymm4,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpxor		%ymm0,%ymm12,%ymm12
+	vpshufb		%ymm2,%ymm12,%ymm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	vpaddd		0x20(%rsp),%ymm5,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpxor		%ymm0,%ymm13,%ymm13
+	vpshufb		%ymm2,%ymm13,%ymm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	vpaddd		0x40(%rsp),%ymm6,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpxor		%ymm0,%ymm14,%ymm14
+	vpshufb		%ymm2,%ymm14,%ymm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	vpaddd		0x60(%rsp),%ymm7,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpxor		%ymm0,%ymm15,%ymm15
+	vpshufb		%ymm2,%ymm15,%ymm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	vpaddd		%ymm12,%ymm8,%ymm8
+	vpxor		%ymm8,%ymm4,%ymm4
+	vpslld		$7,%ymm4,%ymm0
+	vpsrld		$25,%ymm4,%ymm4
+	vpor		%ymm0,%ymm4,%ymm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	vpaddd		%ymm13,%ymm9,%ymm9
+	vpxor		%ymm9,%ymm5,%ymm5
+	vpslld		$7,%ymm5,%ymm0
+	vpsrld		$25,%ymm5,%ymm5
+	vpor		%ymm0,%ymm5,%ymm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	vpaddd		%ymm14,%ymm10,%ymm10
+	vpxor		%ymm10,%ymm6,%ymm6
+	vpslld		$7,%ymm6,%ymm0
+	vpsrld		$25,%ymm6,%ymm6
+	vpor		%ymm0,%ymm6,%ymm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	vpaddd		%ymm15,%ymm11,%ymm11
+	vpxor		%ymm11,%ymm7,%ymm7
+	vpslld		$7,%ymm7,%ymm0
+	vpsrld		$25,%ymm7,%ymm7
+	vpor		%ymm0,%ymm7,%ymm7
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	vpaddd		0x00(%rsp),%ymm5,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpxor		%ymm0,%ymm15,%ymm15
+	vpshufb		%ymm3,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	vpaddd		0x20(%rsp),%ymm6,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpxor		%ymm0,%ymm12,%ymm12
+	vpshufb		%ymm3,%ymm12,%ymm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	vpaddd		0x40(%rsp),%ymm7,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpxor		%ymm0,%ymm13,%ymm13
+	vpshufb		%ymm3,%ymm13,%ymm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	vpaddd		0x60(%rsp),%ymm4,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpxor		%ymm0,%ymm14,%ymm14
+	vpshufb		%ymm3,%ymm14,%ymm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	vpaddd		%ymm15,%ymm10,%ymm10
+	vpxor		%ymm10,%ymm5,%ymm5
+	vpslld		$12,%ymm5,%ymm0
+	vpsrld		$20,%ymm5,%ymm5
+	vpor		%ymm0,%ymm5,%ymm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	vpaddd		%ymm12,%ymm11,%ymm11
+	vpxor		%ymm11,%ymm6,%ymm6
+	vpslld		$12,%ymm6,%ymm0
+	vpsrld		$20,%ymm6,%ymm6
+	vpor		%ymm0,%ymm6,%ymm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	vpaddd		%ymm13,%ymm8,%ymm8
+	vpxor		%ymm8,%ymm7,%ymm7
+	vpslld		$12,%ymm7,%ymm0
+	vpsrld		$20,%ymm7,%ymm7
+	vpor		%ymm0,%ymm7,%ymm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	vpaddd		%ymm14,%ymm9,%ymm9
+	vpxor		%ymm9,%ymm4,%ymm4
+	vpslld		$12,%ymm4,%ymm0
+	vpsrld		$20,%ymm4,%ymm4
+	vpor		%ymm0,%ymm4,%ymm4
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	vpaddd		0x00(%rsp),%ymm5,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpxor		%ymm0,%ymm15,%ymm15
+	vpshufb		%ymm2,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	vpaddd		0x20(%rsp),%ymm6,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpxor		%ymm0,%ymm12,%ymm12
+	vpshufb		%ymm2,%ymm12,%ymm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	vpaddd		0x40(%rsp),%ymm7,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpxor		%ymm0,%ymm13,%ymm13
+	vpshufb		%ymm2,%ymm13,%ymm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	vpaddd		0x60(%rsp),%ymm4,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpxor		%ymm0,%ymm14,%ymm14
+	vpshufb		%ymm2,%ymm14,%ymm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	vpaddd		%ymm15,%ymm10,%ymm10
+	vpxor		%ymm10,%ymm5,%ymm5
+	vpslld		$7,%ymm5,%ymm0
+	vpsrld		$25,%ymm5,%ymm5
+	vpor		%ymm0,%ymm5,%ymm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	vpaddd		%ymm12,%ymm11,%ymm11
+	vpxor		%ymm11,%ymm6,%ymm6
+	vpslld		$7,%ymm6,%ymm0
+	vpsrld		$25,%ymm6,%ymm6
+	vpor		%ymm0,%ymm6,%ymm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	vpaddd		%ymm13,%ymm8,%ymm8
+	vpxor		%ymm8,%ymm7,%ymm7
+	vpslld		$7,%ymm7,%ymm0
+	vpsrld		$25,%ymm7,%ymm7
+	vpor		%ymm0,%ymm7,%ymm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	vpaddd		%ymm14,%ymm9,%ymm9
+	vpxor		%ymm9,%ymm4,%ymm4
+	vpslld		$7,%ymm4,%ymm0
+	vpsrld		$25,%ymm4,%ymm4
+	vpor		%ymm0,%ymm4,%ymm4
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround8
+
+	# x0..15[0-7] += s[0..15]
+	vpbroadcastd	0x00(%rdi),%ymm0
+	vpaddd		0x00(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpbroadcastd	0x04(%rdi),%ymm0
+	vpaddd		0x20(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpbroadcastd	0x08(%rdi),%ymm0
+	vpaddd		0x40(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpbroadcastd	0x0c(%rdi),%ymm0
+	vpaddd		0x60(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpbroadcastd	0x10(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm4,%ymm4
+	vpbroadcastd	0x14(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm5,%ymm5
+	vpbroadcastd	0x18(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm6,%ymm6
+	vpbroadcastd	0x1c(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm7,%ymm7
+	vpbroadcastd	0x20(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm8,%ymm8
+	vpbroadcastd	0x24(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm9,%ymm9
+	vpbroadcastd	0x28(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm10,%ymm10
+	vpbroadcastd	0x2c(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm11,%ymm11
+	vpbroadcastd	0x30(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm12,%ymm12
+	vpbroadcastd	0x34(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm13,%ymm13
+	vpbroadcastd	0x38(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm14,%ymm14
+	vpbroadcastd	0x3c(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm15,%ymm15
+
+	# x12 += counter values 0-7
+	vpaddd		%ymm1,%ymm12,%ymm12
+
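+	# Transpose the state: after the 32-, 64- and 128-bit interleave passes
+	# below, each ymm register holds 32 contiguous keystream bytes of a
+	# single block, ready to be XORed against the input.
+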
+	# interleave 32-bit words in state n, n+1
+	vmovdqa		0x00(%rsp),%ymm0
+	vmovdqa		0x20(%rsp),%ymm1
+	vpunpckldq	%ymm1,%ymm0,%ymm2
+	vpunpckhdq	%ymm1,%ymm0,%ymm1
+	vmovdqa		%ymm2,0x00(%rsp)
+	vmovdqa		%ymm1,0x20(%rsp)
+	vmovdqa		0x40(%rsp),%ymm0
+	vmovdqa		0x60(%rsp),%ymm1
+	vpunpckldq	%ymm1,%ymm0,%ymm2
+	vpunpckhdq	%ymm1,%ymm0,%ymm1
+	vmovdqa		%ymm2,0x40(%rsp)
+	vmovdqa		%ymm1,0x60(%rsp)
+	vmovdqa		%ymm4,%ymm0
+	vpunpckldq	%ymm5,%ymm0,%ymm4
+	vpunpckhdq	%ymm5,%ymm0,%ymm5
+	vmovdqa		%ymm6,%ymm0
+	vpunpckldq	%ymm7,%ymm0,%ymm6
+	vpunpckhdq	%ymm7,%ymm0,%ymm7
+	vmovdqa		%ymm8,%ymm0
+	vpunpckldq	%ymm9,%ymm0,%ymm8
+	vpunpckhdq	%ymm9,%ymm0,%ymm9
+	vmovdqa		%ymm10,%ymm0
+	vpunpckldq	%ymm11,%ymm0,%ymm10
+	vpunpckhdq	%ymm11,%ymm0,%ymm11
+	vmovdqa		%ymm12,%ymm0
+	vpunpckldq	%ymm13,%ymm0,%ymm12
+	vpunpckhdq	%ymm13,%ymm0,%ymm13
+	vmovdqa		%ymm14,%ymm0
+	vpunpckldq	%ymm15,%ymm0,%ymm14
+	vpunpckhdq	%ymm15,%ymm0,%ymm15
+
+	# interleave 64-bit words in state n, n+2
+	vmovdqa		0x00(%rsp),%ymm0
+	vmovdqa		0x40(%rsp),%ymm2
+	vpunpcklqdq	%ymm2,%ymm0,%ymm1
+	vpunpckhqdq	%ymm2,%ymm0,%ymm2
+	vmovdqa		%ymm1,0x00(%rsp)
+	vmovdqa		%ymm2,0x40(%rsp)
+	vmovdqa		0x20(%rsp),%ymm0
+	vmovdqa		0x60(%rsp),%ymm2
+	vpunpcklqdq	%ymm2,%ymm0,%ymm1
+	vpunpckhqdq	%ymm2,%ymm0,%ymm2
+	vmovdqa		%ymm1,0x20(%rsp)
+	vmovdqa		%ymm2,0x60(%rsp)
+	vmovdqa		%ymm4,%ymm0
+	vpunpcklqdq	%ymm6,%ymm0,%ymm4
+	vpunpckhqdq	%ymm6,%ymm0,%ymm6
+	vmovdqa		%ymm5,%ymm0
+	vpunpcklqdq	%ymm7,%ymm0,%ymm5
+	vpunpckhqdq	%ymm7,%ymm0,%ymm7
+	vmovdqa		%ymm8,%ymm0
+	vpunpcklqdq	%ymm10,%ymm0,%ymm8
+	vpunpckhqdq	%ymm10,%ymm0,%ymm10
+	vmovdqa		%ymm9,%ymm0
+	vpunpcklqdq	%ymm11,%ymm0,%ymm9
+	vpunpckhqdq	%ymm11,%ymm0,%ymm11
+	vmovdqa		%ymm12,%ymm0
+	vpunpcklqdq	%ymm14,%ymm0,%ymm12
+	vpunpckhqdq	%ymm14,%ymm0,%ymm14
+	vmovdqa		%ymm13,%ymm0
+	vpunpcklqdq	%ymm15,%ymm0,%ymm13
+	vpunpckhqdq	%ymm15,%ymm0,%ymm15
+
+	# interleave 128-bit words in state n, n+4
+	# xor/write first four blocks
+	vmovdqa		0x00(%rsp),%ymm1
+	vperm2i128	$0x20,%ymm4,%ymm1,%ymm0
+	cmp		$0x0020,%rax
+	jl		.Lxorpart8
+	vpxor		0x0000(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0000(%rsi)
+	vperm2i128	$0x31,%ymm4,%ymm1,%ymm4
+
+	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
+	cmp		$0x0040,%rax
+	jl		.Lxorpart8
+	vpxor		0x0020(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0020(%rsi)
+	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
+
+	vmovdqa		0x40(%rsp),%ymm1
+	vperm2i128	$0x20,%ymm6,%ymm1,%ymm0
+	cmp		$0x0060,%rax
+	jl		.Lxorpart8
+	vpxor		0x0040(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0040(%rsi)
+	vperm2i128	$0x31,%ymm6,%ymm1,%ymm6
+
+	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
+	cmp		$0x0080,%rax
+	jl		.Lxorpart8
+	vpxor		0x0060(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0060(%rsi)
+	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14
+
+	vmovdqa		0x20(%rsp),%ymm1
+	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
+	cmp		$0x00a0,%rax
+	jl		.Lxorpart8
+	vpxor		0x0080(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0080(%rsi)
+	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5
+
+	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
+	cmp		$0x00c0,%rax
+	jl		.Lxorpart8
+	vpxor		0x00a0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x00a0(%rsi)
+	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
+
+	vmovdqa		0x60(%rsp),%ymm1
+	vperm2i128	$0x20,%ymm7,%ymm1,%ymm0
+	cmp		$0x00e0,%rax
+	jl		.Lxorpart8
+	vpxor		0x00c0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x00c0(%rsi)
+	vperm2i128	$0x31,%ymm7,%ymm1,%ymm7
+
+	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
+	cmp		$0x0100,%rax
+	jl		.Lxorpart8
+	vpxor		0x00e0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x00e0(%rsi)
+	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
+
+	# xor remaining blocks, write to output
+	vmovdqa		%ymm4,%ymm0
+	cmp		$0x0120,%rax
+	jl		.Lxorpart8
+	vpxor		0x0100(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0100(%rsi)
+
+	vmovdqa		%ymm12,%ymm0
+	cmp		$0x0140,%rax
+	jl		.Lxorpart8
+	vpxor		0x0120(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0120(%rsi)
+
+	vmovdqa		%ymm6,%ymm0
+	cmp		$0x0160,%rax
+	jl		.Lxorpart8
+	vpxor		0x0140(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0140(%rsi)
+
+	vmovdqa		%ymm14,%ymm0
+	cmp		$0x0180,%rax
+	jl		.Lxorpart8
+	vpxor		0x0160(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0160(%rsi)
+
+	vmovdqa		%ymm5,%ymm0
+	cmp		$0x01a0,%rax
+	jl		.Lxorpart8
+	vpxor		0x0180(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0180(%rsi)
+
+	vmovdqa		%ymm13,%ymm0
+	cmp		$0x01c0,%rax
+	jl		.Lxorpart8
+	vpxor		0x01a0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x01a0(%rsi)
+
+	vmovdqa		%ymm7,%ymm0
+	cmp		$0x01e0,%rax
+	jl		.Lxorpart8
+	vpxor		0x01c0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x01c0(%rsi)
+
+	vmovdqa		%ymm15,%ymm0
+	cmp		$0x0200,%rax
+	jl		.Lxorpart8
+	vpxor		0x01e0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x01e0(%rsi)
+
+.Ldone8:
+	vzeroupper
+	lea		-8(%r10),%rsp
+	ret
+
+.Lxorpart8:
+	# xor remaining bytes from partial register into output
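+	# Bounce the trailing (len mod 32) bytes through the aligned scratch
+	# area at %rsp: copy them in, xor them with the keystream left in
+	# %ymm0, then copy the result back to the output.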
+	mov		%rax,%r9
+	and		$0x1f,%r9
+	jz		.Ldone8
+	and		$~0x1f,%rax
+
+	mov		%rsi,%r11
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	vpxor		0x00(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	jmp		.Ldone8
+
+ENDPROC(chacha_8block_xor_avx2)
diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S
new file mode 100644
index 0000000000000000000000000000000000000000..848f9c75fd4f5fd4ff2a280006e237bf65088b04
--- /dev/null
+++ b/arch/x86/crypto/chacha-avx512vl-x86_64.S
@@ -0,0 +1,836 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
+ *
+ * Copyright (C) 2018 Martin Willi
+ */
+
+#include <linux/linkage.h>
+
+.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL:	.octa 0x00000000000000000000000000000000
+	.octa 0x00000000000000000000000000000001
+
+.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL:	.octa 0x00000000000000000000000000000002
+	.octa 0x00000000000000000000000000000003
+
+.section	.rodata.cst32.CTR8BL, "aM", @progbits, 32
+.align 32
+CTR8BL:	.octa 0x00000003000000020000000100000000
+	.octa 0x00000007000000060000000500000004
+
+.text
+
+ENTRY(chacha_2block_xor_avx512vl)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 2 data blocks output, o
+	# %rdx: up to 2 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts two ChaCha blocks by loading the state
+	# matrix twice across four AVX registers. It performs matrix operations
+	# on four words in each matrix in parallel, but requires shuffling to
+	# rearrange the words after each round.
+
+	vzeroupper
+
+	# x0..3[0-1] = s0..3
+	vbroadcasti128	0x00(%rdi),%ymm0
+	vbroadcasti128	0x10(%rdi),%ymm1
+	vbroadcasti128	0x20(%rdi),%ymm2
+	vbroadcasti128	0x30(%rdi),%ymm3
+
+	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
+
+	vmovdqa		%ymm0,%ymm8
+	vmovdqa		%ymm1,%ymm9
+	vmovdqa		%ymm2,%ymm10
+	vmovdqa		%ymm3,%ymm11
+
+.Ldoubleround:
+
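+	# Each pass through this loop is one ChaCha double round: a column
+	# round, a shuffle of x1-x3 so the diagonals line up as columns, the
+	# diagonal round, and the inverse shuffle restoring the row layout.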
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$16,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$12,%ymm1,%ymm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$8,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$7,%ymm1,%ymm1
+
+	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm1,%ymm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm3,%ymm3
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$16,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$12,%ymm1,%ymm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$8,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$7,%ymm1,%ymm1
+
+	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm1,%ymm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm3,%ymm3
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround
+
+	# o0 = i0 ^ (x0 + s0)
+	vpaddd		%ymm8,%ymm0,%ymm7
+	cmp		$0x10,%rcx
+	jl		.Lxorpart2
+	vpxord		0x00(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x00(%rsi)
+	vextracti128	$1,%ymm7,%xmm0
+	# o1 = i1 ^ (x1 + s1)
+	vpaddd		%ymm9,%ymm1,%ymm7
+	cmp		$0x20,%rcx
+	jl		.Lxorpart2
+	vpxord		0x10(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x10(%rsi)
+	vextracti128	$1,%ymm7,%xmm1
+	# o2 = i2 ^ (x2 + s2)
+	vpaddd		%ymm10,%ymm2,%ymm7
+	cmp		$0x30,%rcx
+	jl		.Lxorpart2
+	vpxord		0x20(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x20(%rsi)
+	vextracti128	$1,%ymm7,%xmm2
+	# o3 = i3 ^ (x3 + s3)
+	vpaddd		%ymm11,%ymm3,%ymm7
+	cmp		$0x40,%rcx
+	jl		.Lxorpart2
+	vpxord		0x30(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x30(%rsi)
+	vextracti128	$1,%ymm7,%xmm3
+
+	# xor and write second block
+	vmovdqa		%xmm0,%xmm7
+	cmp		$0x50,%rcx
+	jl		.Lxorpart2
+	vpxord		0x40(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x40(%rsi)
+
+	vmovdqa		%xmm1,%xmm7
+	cmp		$0x60,%rcx
+	jl		.Lxorpart2
+	vpxord		0x50(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x50(%rsi)
+
+	vmovdqa		%xmm2,%xmm7
+	cmp		$0x70,%rcx
+	jl		.Lxorpart2
+	vpxord		0x60(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x60(%rsi)
+
+	vmovdqa		%xmm3,%xmm7
+	cmp		$0x80,%rcx
+	jl		.Lxorpart2
+	vpxord		0x70(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x70(%rsi)
+
+.Ldone2:
+	vzeroupper
+	ret
+
+.Lxorpart2:
+	# xor remaining bytes from partial register into output
+	mov		%rcx,%rax
+	and		$0xf,%rcx
+	jz		.Ldone8
+	mov		%rax,%r9
+	and		$~0xf,%r9
+
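+	# build a byte mask in %k1 with the low (len mod 16) bits set, so the
+	# masked vmovdqu8 below loads and stores only the trailing bytes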
+	mov		$1,%rax
+	shld		%cl,%rax,%rax
+	sub		$1,%rax
+	kmovq		%rax,%k1
+
+	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
+	vpxord		%xmm7,%xmm1,%xmm1
+	vmovdqu8	%xmm1,(%rsi,%r9){%k1}
+
+	jmp		.Ldone2
+
+ENDPROC(chacha_2block_xor_avx512vl)
+
+ENTRY(chacha_4block_xor_avx512vl)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 4 data blocks output, o
+	# %rdx: up to 4 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts four ChaCha blocks by loading the state
+	# matrix four times across eight AVX registers. It performs matrix
+	# operations on four words in two matrices in parallel, followed by
+	# the same operations on the four words of the other two matrices.
+	# As the required word shuffling has a rather high latency, we can
+	# do the arithmetic on two matrix pairs without much slowdown.
+
+	vzeroupper
+
+	# x0..3[0-3] = s0..3
+	vbroadcasti128	0x00(%rdi),%ymm0
+	vbroadcasti128	0x10(%rdi),%ymm1
+	vbroadcasti128	0x20(%rdi),%ymm2
+	vbroadcasti128	0x30(%rdi),%ymm3
+
+	vmovdqa		%ymm0,%ymm4
+	vmovdqa		%ymm1,%ymm5
+	vmovdqa		%ymm2,%ymm6
+	vmovdqa		%ymm3,%ymm7
+
+	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
+	vpaddd		CTR4BL(%rip),%ymm7,%ymm7
+
+	vmovdqa		%ymm0,%ymm11
+	vmovdqa		%ymm1,%ymm12
+	vmovdqa		%ymm2,%ymm13
+	vmovdqa		%ymm3,%ymm14
+	vmovdqa		%ymm7,%ymm15
+
+.Ldoubleround4:
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$16,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxord		%ymm4,%ymm7,%ymm7
+	vprold		$16,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$12,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxord		%ymm6,%ymm5,%ymm5
+	vprold		$12,%ymm5,%ymm5
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$8,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxord		%ymm4,%ymm7,%ymm7
+	vprold		$8,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$7,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxord		%ymm6,%ymm5,%ymm5
+	vprold		$7,%ymm5,%ymm5
+
+	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm1,%ymm1
+	vpshufd		$0x39,%ymm5,%ymm5
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	vpshufd		$0x4e,%ymm6,%ymm6
+	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm3,%ymm3
+	vpshufd		$0x93,%ymm7,%ymm7
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$16,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxord		%ymm4,%ymm7,%ymm7
+	vprold		$16,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$12,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxord		%ymm6,%ymm5,%ymm5
+	vprold		$12,%ymm5,%ymm5
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$8,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxord		%ymm4,%ymm7,%ymm7
+	vprold		$8,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$7,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxord		%ymm6,%ymm5,%ymm5
+	vprold		$7,%ymm5,%ymm5
+
+	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm1,%ymm1
+	vpshufd		$0x93,%ymm5,%ymm5
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	vpshufd		$0x4e,%ymm6,%ymm6
+	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm3,%ymm3
+	vpshufd		$0x39,%ymm7,%ymm7
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround4
+
+	# o0 = i0 ^ (x0 + s0), first block
+	vpaddd		%ymm11,%ymm0,%ymm10
+	cmp		$0x10,%rcx
+	jl		.Lxorpart4
+	vpxord		0x00(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x00(%rsi)
+	vextracti128	$1,%ymm10,%xmm0
+	# o1 = i1 ^ (x1 + s1), first block
+	vpaddd		%ymm12,%ymm1,%ymm10
+	cmp		$0x20,%rcx
+	jl		.Lxorpart4
+	vpxord		0x10(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x10(%rsi)
+	vextracti128	$1,%ymm10,%xmm1
+	# o2 = i2 ^ (x2 + s2), first block
+	vpaddd		%ymm13,%ymm2,%ymm10
+	cmp		$0x30,%rcx
+	jl		.Lxorpart4
+	vpxord		0x20(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x20(%rsi)
+	vextracti128	$1,%ymm10,%xmm2
+	# o3 = i3 ^ (x3 + s3), first block
+	vpaddd		%ymm14,%ymm3,%ymm10
+	cmp		$0x40,%rcx
+	jl		.Lxorpart4
+	vpxord		0x30(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x30(%rsi)
+	vextracti128	$1,%ymm10,%xmm3
+
+	# xor and write second block
+	vmovdqa		%xmm0,%xmm10
+	cmp		$0x50,%rcx
+	jl		.Lxorpart4
+	vpxord		0x40(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x40(%rsi)
+
+	vmovdqa		%xmm1,%xmm10
+	cmp		$0x60,%rcx
+	jl		.Lxorpart4
+	vpxord		0x50(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x50(%rsi)
+
+	vmovdqa		%xmm2,%xmm10
+	cmp		$0x70,%rcx
+	jl		.Lxorpart4
+	vpxord		0x60(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x60(%rsi)
+
+	vmovdqa		%xmm3,%xmm10
+	cmp		$0x80,%rcx
+	jl		.Lxorpart4
+	vpxord		0x70(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x70(%rsi)
+
+	# o0 = i0 ^ (x0 + s0), third block
+	vpaddd		%ymm11,%ymm4,%ymm10
+	cmp		$0x90,%rcx
+	jl		.Lxorpart4
+	vpxord		0x80(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x80(%rsi)
+	vextracti128	$1,%ymm10,%xmm4
+	# o1 = i1 ^ (x1 + s1), third block
+	vpaddd		%ymm12,%ymm5,%ymm10
+	cmp		$0xa0,%rcx
+	jl		.Lxorpart4
+	vpxord		0x90(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x90(%rsi)
+	vextracti128	$1,%ymm10,%xmm5
+	# o2 = i2 ^ (x2 + s2), third block
+	vpaddd		%ymm13,%ymm6,%ymm10
+	cmp		$0xb0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xa0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xa0(%rsi)
+	vextracti128	$1,%ymm10,%xmm6
+	# o3 = i3 ^ (x3 + s3), third block
+	vpaddd		%ymm15,%ymm7,%ymm10
+	cmp		$0xc0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xb0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xb0(%rsi)
+	vextracti128	$1,%ymm10,%xmm7
+
+	# xor and write fourth block
+	vmovdqa		%xmm4,%xmm10
+	cmp		$0xd0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xc0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xc0(%rsi)
+
+	vmovdqa		%xmm5,%xmm10
+	cmp		$0xe0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xd0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xd0(%rsi)
+
+	vmovdqa		%xmm6,%xmm10
+	cmp		$0xf0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xe0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xe0(%rsi)
+
+	vmovdqa		%xmm7,%xmm10
+	cmp		$0x100,%rcx
+	jl		.Lxorpart4
+	vpxord		0xf0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xf0(%rsi)
+
+.Ldone4:
+	vzeroupper
+	ret
+
+.Lxorpart4:
+	# xor remaining bytes from partial register into output
+	mov		%rcx,%rax
+	and		$0xf,%rcx
+	jz		.Ldone8
+	mov		%rax,%r9
+	and		$~0xf,%r9
+
+	mov		$1,%rax
+	shld		%cl,%rax,%rax
+	sub		$1,%rax
+	kmovq		%rax,%k1
+
+	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
+	vpxord		%xmm10,%xmm1,%xmm1
+	vmovdqu8	%xmm1,(%rsi,%r9){%k1}
+
+	jmp		.Ldone4
+
+ENDPROC(chacha_4block_xor_avx512vl)
+
+ENTRY(chacha_8block_xor_avx512vl)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 8 data blocks output, o
+	# %rdx: up to 8 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts eight consecutive ChaCha blocks by loading
+	# the state matrix in AVX registers eight times. Compared to AVX2, this
+	# mostly benefits from the new rotate instructions in VL and the
+	# additional registers.
+
+	vzeroupper
+
+	# x0..15[0-7] = s[0..15]
+	vpbroadcastd	0x00(%rdi),%ymm0
+	vpbroadcastd	0x04(%rdi),%ymm1
+	vpbroadcastd	0x08(%rdi),%ymm2
+	vpbroadcastd	0x0c(%rdi),%ymm3
+	vpbroadcastd	0x10(%rdi),%ymm4
+	vpbroadcastd	0x14(%rdi),%ymm5
+	vpbroadcastd	0x18(%rdi),%ymm6
+	vpbroadcastd	0x1c(%rdi),%ymm7
+	vpbroadcastd	0x20(%rdi),%ymm8
+	vpbroadcastd	0x24(%rdi),%ymm9
+	vpbroadcastd	0x28(%rdi),%ymm10
+	vpbroadcastd	0x2c(%rdi),%ymm11
+	vpbroadcastd	0x30(%rdi),%ymm12
+	vpbroadcastd	0x34(%rdi),%ymm13
+	vpbroadcastd	0x38(%rdi),%ymm14
+	vpbroadcastd	0x3c(%rdi),%ymm15
+
+	# x12 += counter values 0-7
+	vpaddd		CTR8BL(%rip),%ymm12,%ymm12
+
+	vmovdqa64	%ymm0,%ymm16
+	vmovdqa64	%ymm1,%ymm17
+	vmovdqa64	%ymm2,%ymm18
+	vmovdqa64	%ymm3,%ymm19
+	vmovdqa64	%ymm4,%ymm20
+	vmovdqa64	%ymm5,%ymm21
+	vmovdqa64	%ymm6,%ymm22
+	vmovdqa64	%ymm7,%ymm23
+	vmovdqa64	%ymm8,%ymm24
+	vmovdqa64	%ymm9,%ymm25
+	vmovdqa64	%ymm10,%ymm26
+	vmovdqa64	%ymm11,%ymm27
+	vmovdqa64	%ymm12,%ymm28
+	vmovdqa64	%ymm13,%ymm29
+	vmovdqa64	%ymm14,%ymm30
+	vmovdqa64	%ymm15,%ymm31
+
+.Ldoubleround8:
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	vpaddd		%ymm0,%ymm4,%ymm0
+	vpxord		%ymm0,%ymm12,%ymm12
+	vprold		$16,%ymm12,%ymm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	vpaddd		%ymm1,%ymm5,%ymm1
+	vpxord		%ymm1,%ymm13,%ymm13
+	vprold		$16,%ymm13,%ymm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	vpaddd		%ymm2,%ymm6,%ymm2
+	vpxord		%ymm2,%ymm14,%ymm14
+	vprold		$16,%ymm14,%ymm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	vpaddd		%ymm3,%ymm7,%ymm3
+	vpxord		%ymm3,%ymm15,%ymm15
+	vprold		$16,%ymm15,%ymm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	vpaddd		%ymm12,%ymm8,%ymm8
+	vpxord		%ymm8,%ymm4,%ymm4
+	vprold		$12,%ymm4,%ymm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	vpaddd		%ymm13,%ymm9,%ymm9
+	vpxord		%ymm9,%ymm5,%ymm5
+	vprold		$12,%ymm5,%ymm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	vpaddd		%ymm14,%ymm10,%ymm10
+	vpxord		%ymm10,%ymm6,%ymm6
+	vprold		$12,%ymm6,%ymm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	vpaddd		%ymm15,%ymm11,%ymm11
+	vpxord		%ymm11,%ymm7,%ymm7
+	vprold		$12,%ymm7,%ymm7
+
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	vpaddd		%ymm0,%ymm4,%ymm0
+	vpxord		%ymm0,%ymm12,%ymm12
+	vprold		$8,%ymm12,%ymm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	vpaddd		%ymm1,%ymm5,%ymm1
+	vpxord		%ymm1,%ymm13,%ymm13
+	vprold		$8,%ymm13,%ymm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	vpaddd		%ymm2,%ymm6,%ymm2
+	vpxord		%ymm2,%ymm14,%ymm14
+	vprold		$8,%ymm14,%ymm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	vpaddd		%ymm3,%ymm7,%ymm3
+	vpxord		%ymm3,%ymm15,%ymm15
+	vprold		$8,%ymm15,%ymm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	vpaddd		%ymm12,%ymm8,%ymm8
+	vpxord		%ymm8,%ymm4,%ymm4
+	vprold		$7,%ymm4,%ymm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	vpaddd		%ymm13,%ymm9,%ymm9
+	vpxord		%ymm9,%ymm5,%ymm5
+	vprold		$7,%ymm5,%ymm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	vpaddd		%ymm14,%ymm10,%ymm10
+	vpxord		%ymm10,%ymm6,%ymm6
+	vprold		$7,%ymm6,%ymm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	vpaddd		%ymm15,%ymm11,%ymm11
+	vpxord		%ymm11,%ymm7,%ymm7
+	vprold		$7,%ymm7,%ymm7
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	vpaddd		%ymm0,%ymm5,%ymm0
+	vpxord		%ymm0,%ymm15,%ymm15
+	vprold		$16,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	vpaddd		%ymm1,%ymm6,%ymm1
+	vpxord		%ymm1,%ymm12,%ymm12
+	vprold		$16,%ymm12,%ymm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	vpaddd		%ymm2,%ymm7,%ymm2
+	vpxord		%ymm2,%ymm13,%ymm13
+	vprold		$16,%ymm13,%ymm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	vpaddd		%ymm3,%ymm4,%ymm3
+	vpxord		%ymm3,%ymm14,%ymm14
+	vprold		$16,%ymm14,%ymm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	vpaddd		%ymm15,%ymm10,%ymm10
+	vpxord		%ymm10,%ymm5,%ymm5
+	vprold		$12,%ymm5,%ymm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	vpaddd		%ymm12,%ymm11,%ymm11
+	vpxord		%ymm11,%ymm6,%ymm6
+	vprold		$12,%ymm6,%ymm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	vpaddd		%ymm13,%ymm8,%ymm8
+	vpxord		%ymm8,%ymm7,%ymm7
+	vprold		$12,%ymm7,%ymm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	vpaddd		%ymm14,%ymm9,%ymm9
+	vpxord		%ymm9,%ymm4,%ymm4
+	vprold		$12,%ymm4,%ymm4
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	vpaddd		%ymm0,%ymm5,%ymm0
+	vpxord		%ymm0,%ymm15,%ymm15
+	vprold		$8,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	vpaddd		%ymm1,%ymm6,%ymm1
+	vpxord		%ymm1,%ymm12,%ymm12
+	vprold		$8,%ymm12,%ymm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	vpaddd		%ymm2,%ymm7,%ymm2
+	vpxord		%ymm2,%ymm13,%ymm13
+	vprold		$8,%ymm13,%ymm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	vpaddd		%ymm3,%ymm4,%ymm3
+	vpxord		%ymm3,%ymm14,%ymm14
+	vprold		$8,%ymm14,%ymm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	vpaddd		%ymm15,%ymm10,%ymm10
+	vpxord		%ymm10,%ymm5,%ymm5
+	vprold		$7,%ymm5,%ymm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	vpaddd		%ymm12,%ymm11,%ymm11
+	vpxord		%ymm11,%ymm6,%ymm6
+	vprold		$7,%ymm6,%ymm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	vpaddd		%ymm13,%ymm8,%ymm8
+	vpxord		%ymm8,%ymm7,%ymm7
+	vprold		$7,%ymm7,%ymm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	vpaddd		%ymm14,%ymm9,%ymm9
+	vpxord		%ymm9,%ymm4,%ymm4
+	vprold		$7,%ymm4,%ymm4
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround8
+
+	# x0..15[0-7] += s[0..15]
+	vpaddd		%ymm16,%ymm0,%ymm0
+	vpaddd		%ymm17,%ymm1,%ymm1
+	vpaddd		%ymm18,%ymm2,%ymm2
+	vpaddd		%ymm19,%ymm3,%ymm3
+	vpaddd		%ymm20,%ymm4,%ymm4
+	vpaddd		%ymm21,%ymm5,%ymm5
+	vpaddd		%ymm22,%ymm6,%ymm6
+	vpaddd		%ymm23,%ymm7,%ymm7
+	vpaddd		%ymm24,%ymm8,%ymm8
+	vpaddd		%ymm25,%ymm9,%ymm9
+	vpaddd		%ymm26,%ymm10,%ymm10
+	vpaddd		%ymm27,%ymm11,%ymm11
+	vpaddd		%ymm28,%ymm12,%ymm12
+	vpaddd		%ymm29,%ymm13,%ymm13
+	vpaddd		%ymm30,%ymm14,%ymm14
+	vpaddd		%ymm31,%ymm15,%ymm15
+
+	# interleave 32-bit words in state n, n+1
+	vpunpckldq	%ymm1,%ymm0,%ymm16
+	vpunpckhdq	%ymm1,%ymm0,%ymm17
+	vpunpckldq	%ymm3,%ymm2,%ymm18
+	vpunpckhdq	%ymm3,%ymm2,%ymm19
+	vpunpckldq	%ymm5,%ymm4,%ymm20
+	vpunpckhdq	%ymm5,%ymm4,%ymm21
+	vpunpckldq	%ymm7,%ymm6,%ymm22
+	vpunpckhdq	%ymm7,%ymm6,%ymm23
+	vpunpckldq	%ymm9,%ymm8,%ymm24
+	vpunpckhdq	%ymm9,%ymm8,%ymm25
+	vpunpckldq	%ymm11,%ymm10,%ymm26
+	vpunpckhdq	%ymm11,%ymm10,%ymm27
+	vpunpckldq	%ymm13,%ymm12,%ymm28
+	vpunpckhdq	%ymm13,%ymm12,%ymm29
+	vpunpckldq	%ymm15,%ymm14,%ymm30
+	vpunpckhdq	%ymm15,%ymm14,%ymm31
+
+	# interleave 64-bit words in state n, n+2
+	vpunpcklqdq	%ymm18,%ymm16,%ymm0
+	vpunpcklqdq	%ymm19,%ymm17,%ymm1
+	vpunpckhqdq	%ymm18,%ymm16,%ymm2
+	vpunpckhqdq	%ymm19,%ymm17,%ymm3
+	vpunpcklqdq	%ymm22,%ymm20,%ymm4
+	vpunpcklqdq	%ymm23,%ymm21,%ymm5
+	vpunpckhqdq	%ymm22,%ymm20,%ymm6
+	vpunpckhqdq	%ymm23,%ymm21,%ymm7
+	vpunpcklqdq	%ymm26,%ymm24,%ymm8
+	vpunpcklqdq	%ymm27,%ymm25,%ymm9
+	vpunpckhqdq	%ymm26,%ymm24,%ymm10
+	vpunpckhqdq	%ymm27,%ymm25,%ymm11
+	vpunpcklqdq	%ymm30,%ymm28,%ymm12
+	vpunpcklqdq	%ymm31,%ymm29,%ymm13
+	vpunpckhqdq	%ymm30,%ymm28,%ymm14
+	vpunpckhqdq	%ymm31,%ymm29,%ymm15
+
+	# interleave 128-bit words in state n, n+4
+	# xor/write first four blocks
+	vmovdqa64	%ymm0,%ymm16
+	vperm2i128	$0x20,%ymm4,%ymm0,%ymm0
+	cmp		$0x0020,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0000(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0000(%rsi)
+	vmovdqa64	%ymm16,%ymm0
+	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4
+
+	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
+	cmp		$0x0040,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0020(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0020(%rsi)
+	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
+
+	vperm2i128	$0x20,%ymm6,%ymm2,%ymm0
+	cmp		$0x0060,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0040(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0040(%rsi)
+	vperm2i128	$0x31,%ymm6,%ymm2,%ymm6
+
+	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
+	cmp		$0x0080,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0060(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0060(%rsi)
+	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14
+
+	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
+	cmp		$0x00a0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0080(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0080(%rsi)
+	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5
+
+	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
+	cmp		$0x00c0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x00a0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x00a0(%rsi)
+	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
+
+	vperm2i128	$0x20,%ymm7,%ymm3,%ymm0
+	cmp		$0x00e0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x00c0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x00c0(%rsi)
+	vperm2i128	$0x31,%ymm7,%ymm3,%ymm7
+
+	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
+	cmp		$0x0100,%rcx
+	jl		.Lxorpart8
+	vpxord		0x00e0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x00e0(%rsi)
+	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
+
+	# xor remaining blocks, write to output
+	vmovdqa64	%ymm4,%ymm0
+	cmp		$0x0120,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0100(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0100(%rsi)
+
+	vmovdqa64	%ymm12,%ymm0
+	cmp		$0x0140,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0120(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0120(%rsi)
+
+	vmovdqa64	%ymm6,%ymm0
+	cmp		$0x0160,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0140(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0140(%rsi)
+
+	vmovdqa64	%ymm14,%ymm0
+	cmp		$0x0180,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0160(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0160(%rsi)
+
+	vmovdqa64	%ymm5,%ymm0
+	cmp		$0x01a0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0180(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0180(%rsi)
+
+	vmovdqa64	%ymm13,%ymm0
+	cmp		$0x01c0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x01a0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x01a0(%rsi)
+
+	vmovdqa64	%ymm7,%ymm0
+	cmp		$0x01e0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x01c0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x01c0(%rsi)
+
+	vmovdqa64	%ymm15,%ymm0
+	cmp		$0x0200,%rcx
+	jl		.Lxorpart8
+	vpxord		0x01e0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x01e0(%rsi)
+
+.Ldone8:
+	vzeroupper
+	ret
+
+.Lxorpart8:
+	# xor remaining bytes from partial register into output
+	mov		%rcx,%rax
+	and		$0x1f,%rcx
+	jz		.Ldone8
+	mov		%rax,%r9
+	and		$~0x1f,%r9
+
+	mov		$1,%rax
+	shld		%cl,%rax,%rax
+	sub		$1,%rax
+	kmovq		%rax,%k1
+
+	vmovdqu8	(%rdx,%r9),%ymm1{%k1}{z}
+	vpxord		%ymm0,%ymm1,%ymm1
+	vmovdqu8	%ymm1,(%rsi,%r9){%k1}
+
+	jmp		.Ldone8
+
+ENDPROC(chacha_8block_xor_avx512vl)
diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S
similarity index 76%
rename from arch/x86/crypto/chacha20-ssse3-x86_64.S
rename to arch/x86/crypto/chacha-ssse3-x86_64.S
index 512a2b500fd1813d1ffc4d74053ec137ecf865c5..c05a7a963dc39ab4b73089e04a71e64c3895adca 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha-ssse3-x86_64.S
@@ -1,5 +1,5 @@
 /*
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
+ * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
  *
  * Copyright (C) 2015 Martin Willi
  *
@@ -10,6 +10,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/frame.h>
 
 .section	.rodata.cst16.ROT8, "aM", @progbits, 16
 .align 16
@@ -23,35 +24,25 @@ CTRINC:	.octa 0x00000003000000020000000100000000
 
 .text
 
-ENTRY(chacha20_block_xor_ssse3)
-	# %rdi: Input state matrix, s
-	# %rsi: 1 data block output, o
-	# %rdx: 1 data block input, i
-
-	# This function encrypts one ChaCha20 block by loading the state matrix
-	# in four SSE registers. It performs matrix operation on four words in
-	# parallel, but requireds shuffling to rearrange the words after each
-	# round. 8/16-bit word rotation is done with the slightly better
-	# performing SSSE3 byte shuffling, 7/12-bit word rotation uses
-	# traditional shift+OR.
-
-	# x0..3 = s0..3
-	movdqa		0x00(%rdi),%xmm0
-	movdqa		0x10(%rdi),%xmm1
-	movdqa		0x20(%rdi),%xmm2
-	movdqa		0x30(%rdi),%xmm3
-	movdqa		%xmm0,%xmm8
-	movdqa		%xmm1,%xmm9
-	movdqa		%xmm2,%xmm10
-	movdqa		%xmm3,%xmm11
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3.  This
+ * function performs matrix operations on four words in parallel, but requires
+ * shuffling to rearrange the words after each round.  8/16-bit word rotation is
+ * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
+ * rotation uses traditional shift+OR.
+ *
+ * The round count is given in %r8d.
+ *
+ * Clobbers: %r8d, %xmm4-%xmm7
+ */
+chacha_permute:
 
 	movdqa		ROT8(%rip),%xmm4
 	movdqa		ROT16(%rip),%xmm5
 
-	mov	$10,%ecx
-
 .Ldoubleround:
-
 	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
 	paddd		%xmm1,%xmm0
 	pxor		%xmm0,%xmm3
@@ -118,39 +109,129 @@ ENTRY(chacha20_block_xor_ssse3)
 	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
 	pshufd		$0x39,%xmm3,%xmm3
 
-	dec		%ecx
+	sub		$2,%r8d
 	jnz		.Ldoubleround
 
+	ret
+ENDPROC(chacha_permute)
+
+ENTRY(chacha_block_xor_ssse3)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 1 data block output, o
+	# %rdx: up to 1 data block input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+	FRAME_BEGIN
+
+	# x0..3 = s0..3
+	movdqa		0x00(%rdi),%xmm0
+	movdqa		0x10(%rdi),%xmm1
+	movdqa		0x20(%rdi),%xmm2
+	movdqa		0x30(%rdi),%xmm3
+	movdqa		%xmm0,%xmm8
+	movdqa		%xmm1,%xmm9
+	movdqa		%xmm2,%xmm10
+	movdqa		%xmm3,%xmm11
+
+	mov		%rcx,%rax
+	call		chacha_permute
+
 	# o0 = i0 ^ (x0 + s0)
-	movdqu		0x00(%rdx),%xmm4
 	paddd		%xmm8,%xmm0
+	cmp		$0x10,%rax
+	jl		.Lxorpart
+	movdqu		0x00(%rdx),%xmm4
 	pxor		%xmm4,%xmm0
 	movdqu		%xmm0,0x00(%rsi)
 	# o1 = i1 ^ (x1 + s1)
-	movdqu		0x10(%rdx),%xmm5
 	paddd		%xmm9,%xmm1
-	pxor		%xmm5,%xmm1
-	movdqu		%xmm1,0x10(%rsi)
+	movdqa		%xmm1,%xmm0
+	cmp		$0x20,%rax
+	jl		.Lxorpart
+	movdqu		0x10(%rdx),%xmm0
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x10(%rsi)
 	# o2 = i2 ^ (x2 + s2)
-	movdqu		0x20(%rdx),%xmm6
 	paddd		%xmm10,%xmm2
-	pxor		%xmm6,%xmm2
-	movdqu		%xmm2,0x20(%rsi)
+	movdqa		%xmm2,%xmm0
+	cmp		$0x30,%rax
+	jl		.Lxorpart
+	movdqu		0x20(%rdx),%xmm0
+	pxor		%xmm2,%xmm0
+	movdqu		%xmm0,0x20(%rsi)
 	# o3 = i3 ^ (x3 + s3)
-	movdqu		0x30(%rdx),%xmm7
 	paddd		%xmm11,%xmm3
-	pxor		%xmm7,%xmm3
-	movdqu		%xmm3,0x30(%rsi)
+	movdqa		%xmm3,%xmm0
+	cmp		$0x40,%rax
+	jl		.Lxorpart
+	movdqu		0x30(%rdx),%xmm0
+	pxor		%xmm3,%xmm0
+	movdqu		%xmm0,0x30(%rsi)
+
+.Ldone:
+	FRAME_END
+	ret
+
+.Lxorpart:
+	# xor remaining bytes from partial register into output
+	mov		%rax,%r9
+	and		$0x0f,%r9
+	jz		.Ldone
+	and		$~0x0f,%rax
+
+	mov		%rsi,%r11
+
+	lea		8(%rsp),%r10
+	sub		$0x10,%rsp
+	and		$~31,%rsp
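+	# 32-byte aligned scratch space on the stack for the partial block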
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	pxor		0x00(%rsp),%xmm0
+	movdqa		%xmm0,0x00(%rsp)
 
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	lea		-8(%r10),%rsp
+	jmp		.Ldone
+
+ENDPROC(chacha_block_xor_ssse3)
+
+ENTRY(hchacha_block_ssse3)
+	# %rdi: Input state matrix, s
+	# %rsi: output (8 32-bit words)
+	# %edx: nrounds
+	FRAME_BEGIN
+
+	movdqa		0x00(%rdi),%xmm0
+	movdqa		0x10(%rdi),%xmm1
+	movdqa		0x20(%rdi),%xmm2
+	movdqa		0x30(%rdi),%xmm3
+
+	mov		%edx,%r8d
+	call		chacha_permute
+
+	movdqu		%xmm0,0x00(%rsi)
+	movdqu		%xmm3,0x10(%rsi)
+
+	FRAME_END
 	ret
-ENDPROC(chacha20_block_xor_ssse3)
+ENDPROC(hchacha_block_ssse3)
 
-ENTRY(chacha20_4block_xor_ssse3)
+ENTRY(chacha_4block_xor_ssse3)
 	# %rdi: Input state matrix, s
-	# %rsi: 4 data blocks output, o
-	# %rdx: 4 data blocks input, i
+	# %rsi: up to 4 data blocks output, o
+	# %rdx: up to 4 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
 
-	# This function encrypts four consecutive ChaCha20 blocks by loading the
+	# This function encrypts four consecutive ChaCha blocks by loading the
 	# the state matrix in SSE registers four times. As we need some scratch
 	# registers, we save the first four registers on the stack. The
 	# algorithm performs each operation on the corresponding word of each
@@ -163,6 +244,7 @@ ENTRY(chacha20_4block_xor_ssse3)
 	lea		8(%rsp),%r10
 	sub		$0x80,%rsp
 	and		$~63,%rsp
+	mov		%rcx,%rax
 
 	# x0..15[0-3] = s0..3[0..3]
 	movq		0x00(%rdi),%xmm1
@@ -202,8 +284,6 @@ ENTRY(chacha20_4block_xor_ssse3)
 	# x12 += counter values 0-3
 	paddd		%xmm1,%xmm12
 
-	mov		$10,%ecx
-
 .Ldoubleround4:
 	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
 	movdqa		0x00(%rsp),%xmm0
@@ -421,7 +501,7 @@ ENTRY(chacha20_4block_xor_ssse3)
 	psrld		$25,%xmm4
 	por		%xmm0,%xmm4
 
-	dec		%ecx
+	sub		$2,%r8d
 	jnz		.Ldoubleround4
 
 	# x0[0-3] += s0[0]
@@ -573,58 +653,143 @@ ENTRY(chacha20_4block_xor_ssse3)
 
 	# xor with corresponding input, write to output
 	movdqa		0x00(%rsp),%xmm0
+	cmp		$0x10,%rax
+	jl		.Lxorpart4
 	movdqu		0x00(%rdx),%xmm1
 	pxor		%xmm1,%xmm0
 	movdqu		%xmm0,0x00(%rsi)
-	movdqa		0x10(%rsp),%xmm0
-	movdqu		0x80(%rdx),%xmm1
+
+	movdqu		%xmm4,%xmm0
+	cmp		$0x20,%rax
+	jl		.Lxorpart4
+	movdqu		0x10(%rdx),%xmm1
 	pxor		%xmm1,%xmm0
-	movdqu		%xmm0,0x80(%rsi)
+	movdqu		%xmm0,0x10(%rsi)
+
+	movdqu		%xmm8,%xmm0
+	cmp		$0x30,%rax
+	jl		.Lxorpart4
+	movdqu		0x20(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x20(%rsi)
+
+	movdqu		%xmm12,%xmm0
+	cmp		$0x40,%rax
+	jl		.Lxorpart4
+	movdqu		0x30(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x30(%rsi)
+
 	movdqa		0x20(%rsp),%xmm0
+	cmp		$0x50,%rax
+	jl		.Lxorpart4
 	movdqu		0x40(%rdx),%xmm1
 	pxor		%xmm1,%xmm0
 	movdqu		%xmm0,0x40(%rsi)
+
+	movdqu		%xmm6,%xmm0
+	cmp		$0x60,%rax
+	jl		.Lxorpart4
+	movdqu		0x50(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x50(%rsi)
+
+	movdqu		%xmm10,%xmm0
+	cmp		$0x70,%rax
+	jl		.Lxorpart4
+	movdqu		0x60(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x60(%rsi)
+
+	movdqu		%xmm14,%xmm0
+	cmp		$0x80,%rax
+	jl		.Lxorpart4
+	movdqu		0x70(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x70(%rsi)
+
+	movdqa		0x10(%rsp),%xmm0
+	cmp		$0x90,%rax
+	jl		.Lxorpart4
+	movdqu		0x80(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x80(%rsi)
+
+	movdqu		%xmm5,%xmm0
+	cmp		$0xa0,%rax
+	jl		.Lxorpart4
+	movdqu		0x90(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x90(%rsi)
+
+	movdqu		%xmm9,%xmm0
+	cmp		$0xb0,%rax
+	jl		.Lxorpart4
+	movdqu		0xa0(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xa0(%rsi)
+
+	movdqu		%xmm13,%xmm0
+	cmp		$0xc0,%rax
+	jl		.Lxorpart4
+	movdqu		0xb0(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xb0(%rsi)
+
 	movdqa		0x30(%rsp),%xmm0
+	cmp		$0xd0,%rax
+	jl		.Lxorpart4
 	movdqu		0xc0(%rdx),%xmm1
 	pxor		%xmm1,%xmm0
 	movdqu		%xmm0,0xc0(%rsi)
-	movdqu		0x10(%rdx),%xmm1
-	pxor		%xmm1,%xmm4
-	movdqu		%xmm4,0x10(%rsi)
-	movdqu		0x90(%rdx),%xmm1
-	pxor		%xmm1,%xmm5
-	movdqu		%xmm5,0x90(%rsi)
-	movdqu		0x50(%rdx),%xmm1
-	pxor		%xmm1,%xmm6
-	movdqu		%xmm6,0x50(%rsi)
+
+	movdqu		%xmm7,%xmm0
+	cmp		$0xe0,%rax
+	jl		.Lxorpart4
 	movdqu		0xd0(%rdx),%xmm1
-	pxor		%xmm1,%xmm7
-	movdqu		%xmm7,0xd0(%rsi)
-	movdqu		0x20(%rdx),%xmm1
-	pxor		%xmm1,%xmm8
-	movdqu		%xmm8,0x20(%rsi)
-	movdqu		0xa0(%rdx),%xmm1
-	pxor		%xmm1,%xmm9
-	movdqu		%xmm9,0xa0(%rsi)
-	movdqu		0x60(%rdx),%xmm1
-	pxor		%xmm1,%xmm10
-	movdqu		%xmm10,0x60(%rsi)
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xd0(%rsi)
+
+	movdqu		%xmm11,%xmm0
+	cmp		$0xf0,%rax
+	jl		.Lxorpart4
 	movdqu		0xe0(%rdx),%xmm1
-	pxor		%xmm1,%xmm11
-	movdqu		%xmm11,0xe0(%rsi)
-	movdqu		0x30(%rdx),%xmm1
-	pxor		%xmm1,%xmm12
-	movdqu		%xmm12,0x30(%rsi)
-	movdqu		0xb0(%rdx),%xmm1
-	pxor		%xmm1,%xmm13
-	movdqu		%xmm13,0xb0(%rsi)
-	movdqu		0x70(%rdx),%xmm1
-	pxor		%xmm1,%xmm14
-	movdqu		%xmm14,0x70(%rsi)
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xe0(%rsi)
+
+	movdqu		%xmm15,%xmm0
+	cmp		$0x100,%rax
+	jl		.Lxorpart4
 	movdqu		0xf0(%rdx),%xmm1
-	pxor		%xmm1,%xmm15
-	movdqu		%xmm15,0xf0(%rsi)
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xf0(%rsi)
 
+.Ldone4:
 	lea		-8(%r10),%rsp
 	ret
-ENDPROC(chacha20_4block_xor_ssse3)
+
+.Lxorpart4:
+	# xor remaining bytes from partial register into output
+	mov		%rax,%r9
+	and		$0x0f,%r9
+	jz		.Ldone4
+	and		$~0x0f,%rax
+
+	mov		%rsi,%r11
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	pxor		0x00(%rsp),%xmm0
+	movdqa		%xmm0,0x00(%rsp)
+
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	jmp		.Ldone4
+
+ENDPROC(chacha_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S
deleted file mode 100644
index f3cd26f4833254cb0966f31a1afae8c5c9fc95a4..0000000000000000000000000000000000000000
--- a/arch/x86/crypto/chacha20-avx2-x86_64.S
+++ /dev/null
@@ -1,448 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/linkage.h>
-
-.section	.rodata.cst32.ROT8, "aM", @progbits, 32
-.align 32
-ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
-	.octa 0x0e0d0c0f0a09080b0605040702010003
-
-.section	.rodata.cst32.ROT16, "aM", @progbits, 32
-.align 32
-ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
-	.octa 0x0d0c0f0e09080b0a0504070601000302
-
-.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
-.align 32
-CTRINC:	.octa 0x00000003000000020000000100000000
-	.octa 0x00000007000000060000000500000004
-
-.text
-
-ENTRY(chacha20_8block_xor_avx2)
-	# %rdi: Input state matrix, s
-	# %rsi: 8 data blocks output, o
-	# %rdx: 8 data blocks input, i
-
-	# This function encrypts eight consecutive ChaCha20 blocks by loading
-	# the state matrix in AVX registers eight times. As we need some
-	# scratch registers, we save the first four registers on the stack. The
-	# algorithm performs each operation on the corresponding word of each
-	# state matrix, hence requires no word shuffling. For final XORing step
-	# we transpose the matrix by interleaving 32-, 64- and then 128-bit
-	# words, which allows us to do XOR in AVX registers. 8/16-bit word
-	# rotation is done with the slightly better performing byte shuffling,
-	# 7/12-bit word rotation uses traditional shift+OR.
-
-	vzeroupper
-	# 4 * 32 byte stack, 32-byte aligned
-	lea		8(%rsp),%r10
-	and		$~31, %rsp
-	sub		$0x80, %rsp
-
-	# x0..15[0-7] = s[0..15]
-	vpbroadcastd	0x00(%rdi),%ymm0
-	vpbroadcastd	0x04(%rdi),%ymm1
-	vpbroadcastd	0x08(%rdi),%ymm2
-	vpbroadcastd	0x0c(%rdi),%ymm3
-	vpbroadcastd	0x10(%rdi),%ymm4
-	vpbroadcastd	0x14(%rdi),%ymm5
-	vpbroadcastd	0x18(%rdi),%ymm6
-	vpbroadcastd	0x1c(%rdi),%ymm7
-	vpbroadcastd	0x20(%rdi),%ymm8
-	vpbroadcastd	0x24(%rdi),%ymm9
-	vpbroadcastd	0x28(%rdi),%ymm10
-	vpbroadcastd	0x2c(%rdi),%ymm11
-	vpbroadcastd	0x30(%rdi),%ymm12
-	vpbroadcastd	0x34(%rdi),%ymm13
-	vpbroadcastd	0x38(%rdi),%ymm14
-	vpbroadcastd	0x3c(%rdi),%ymm15
-	# x0..3 on stack
-	vmovdqa		%ymm0,0x00(%rsp)
-	vmovdqa		%ymm1,0x20(%rsp)
-	vmovdqa		%ymm2,0x40(%rsp)
-	vmovdqa		%ymm3,0x60(%rsp)
-
-	vmovdqa		CTRINC(%rip),%ymm1
-	vmovdqa		ROT8(%rip),%ymm2
-	vmovdqa		ROT16(%rip),%ymm3
-
-	# x12 += counter values 0-3
-	vpaddd		%ymm1,%ymm12,%ymm12
-
-	mov		$10,%ecx
-
-.Ldoubleround8:
-	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
-	vpaddd		0x00(%rsp),%ymm4,%ymm0
-	vmovdqa		%ymm0,0x00(%rsp)
-	vpxor		%ymm0,%ymm12,%ymm12
-	vpshufb		%ymm3,%ymm12,%ymm12
-	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
-	vpaddd		0x20(%rsp),%ymm5,%ymm0
-	vmovdqa		%ymm0,0x20(%rsp)
-	vpxor		%ymm0,%ymm13,%ymm13
-	vpshufb		%ymm3,%ymm13,%ymm13
-	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
-	vpaddd		0x40(%rsp),%ymm6,%ymm0
-	vmovdqa		%ymm0,0x40(%rsp)
-	vpxor		%ymm0,%ymm14,%ymm14
-	vpshufb		%ymm3,%ymm14,%ymm14
-	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
-	vpaddd		0x60(%rsp),%ymm7,%ymm0
-	vmovdqa		%ymm0,0x60(%rsp)
-	vpxor		%ymm0,%ymm15,%ymm15
-	vpshufb		%ymm3,%ymm15,%ymm15
-
-	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
-	vpaddd		%ymm12,%ymm8,%ymm8
-	vpxor		%ymm8,%ymm4,%ymm4
-	vpslld		$12,%ymm4,%ymm0
-	vpsrld		$20,%ymm4,%ymm4
-	vpor		%ymm0,%ymm4,%ymm4
-	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
-	vpaddd		%ymm13,%ymm9,%ymm9
-	vpxor		%ymm9,%ymm5,%ymm5
-	vpslld		$12,%ymm5,%ymm0
-	vpsrld		$20,%ymm5,%ymm5
-	vpor		%ymm0,%ymm5,%ymm5
-	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
-	vpaddd		%ymm14,%ymm10,%ymm10
-	vpxor		%ymm10,%ymm6,%ymm6
-	vpslld		$12,%ymm6,%ymm0
-	vpsrld		$20,%ymm6,%ymm6
-	vpor		%ymm0,%ymm6,%ymm6
-	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
-	vpaddd		%ymm15,%ymm11,%ymm11
-	vpxor		%ymm11,%ymm7,%ymm7
-	vpslld		$12,%ymm7,%ymm0
-	vpsrld		$20,%ymm7,%ymm7
-	vpor		%ymm0,%ymm7,%ymm7
-
-	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
-	vpaddd		0x00(%rsp),%ymm4,%ymm0
-	vmovdqa		%ymm0,0x00(%rsp)
-	vpxor		%ymm0,%ymm12,%ymm12
-	vpshufb		%ymm2,%ymm12,%ymm12
-	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
-	vpaddd		0x20(%rsp),%ymm5,%ymm0
-	vmovdqa		%ymm0,0x20(%rsp)
-	vpxor		%ymm0,%ymm13,%ymm13
-	vpshufb		%ymm2,%ymm13,%ymm13
-	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
-	vpaddd		0x40(%rsp),%ymm6,%ymm0
-	vmovdqa		%ymm0,0x40(%rsp)
-	vpxor		%ymm0,%ymm14,%ymm14
-	vpshufb		%ymm2,%ymm14,%ymm14
-	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
-	vpaddd		0x60(%rsp),%ymm7,%ymm0
-	vmovdqa		%ymm0,0x60(%rsp)
-	vpxor		%ymm0,%ymm15,%ymm15
-	vpshufb		%ymm2,%ymm15,%ymm15
-
-	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
-	vpaddd		%ymm12,%ymm8,%ymm8
-	vpxor		%ymm8,%ymm4,%ymm4
-	vpslld		$7,%ymm4,%ymm0
-	vpsrld		$25,%ymm4,%ymm4
-	vpor		%ymm0,%ymm4,%ymm4
-	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
-	vpaddd		%ymm13,%ymm9,%ymm9
-	vpxor		%ymm9,%ymm5,%ymm5
-	vpslld		$7,%ymm5,%ymm0
-	vpsrld		$25,%ymm5,%ymm5
-	vpor		%ymm0,%ymm5,%ymm5
-	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
-	vpaddd		%ymm14,%ymm10,%ymm10
-	vpxor		%ymm10,%ymm6,%ymm6
-	vpslld		$7,%ymm6,%ymm0
-	vpsrld		$25,%ymm6,%ymm6
-	vpor		%ymm0,%ymm6,%ymm6
-	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
-	vpaddd		%ymm15,%ymm11,%ymm11
-	vpxor		%ymm11,%ymm7,%ymm7
-	vpslld		$7,%ymm7,%ymm0
-	vpsrld		$25,%ymm7,%ymm7
-	vpor		%ymm0,%ymm7,%ymm7
-
-	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
-	vpaddd		0x00(%rsp),%ymm5,%ymm0
-	vmovdqa		%ymm0,0x00(%rsp)
-	vpxor		%ymm0,%ymm15,%ymm15
-	vpshufb		%ymm3,%ymm15,%ymm15
-	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
-	vpaddd		0x20(%rsp),%ymm6,%ymm0
-	vmovdqa		%ymm0,0x20(%rsp)
-	vpxor		%ymm0,%ymm12,%ymm12
-	vpshufb		%ymm3,%ymm12,%ymm12
-	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
-	vpaddd		0x40(%rsp),%ymm7,%ymm0
-	vmovdqa		%ymm0,0x40(%rsp)
-	vpxor		%ymm0,%ymm13,%ymm13
-	vpshufb		%ymm3,%ymm13,%ymm13
-	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
-	vpaddd		0x60(%rsp),%ymm4,%ymm0
-	vmovdqa		%ymm0,0x60(%rsp)
-	vpxor		%ymm0,%ymm14,%ymm14
-	vpshufb		%ymm3,%ymm14,%ymm14
-
-	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
-	vpaddd		%ymm15,%ymm10,%ymm10
-	vpxor		%ymm10,%ymm5,%ymm5
-	vpslld		$12,%ymm5,%ymm0
-	vpsrld		$20,%ymm5,%ymm5
-	vpor		%ymm0,%ymm5,%ymm5
-	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
-	vpaddd		%ymm12,%ymm11,%ymm11
-	vpxor		%ymm11,%ymm6,%ymm6
-	vpslld		$12,%ymm6,%ymm0
-	vpsrld		$20,%ymm6,%ymm6
-	vpor		%ymm0,%ymm6,%ymm6
-	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
-	vpaddd		%ymm13,%ymm8,%ymm8
-	vpxor		%ymm8,%ymm7,%ymm7
-	vpslld		$12,%ymm7,%ymm0
-	vpsrld		$20,%ymm7,%ymm7
-	vpor		%ymm0,%ymm7,%ymm7
-	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
-	vpaddd		%ymm14,%ymm9,%ymm9
-	vpxor		%ymm9,%ymm4,%ymm4
-	vpslld		$12,%ymm4,%ymm0
-	vpsrld		$20,%ymm4,%ymm4
-	vpor		%ymm0,%ymm4,%ymm4
-
-	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
-	vpaddd		0x00(%rsp),%ymm5,%ymm0
-	vmovdqa		%ymm0,0x00(%rsp)
-	vpxor		%ymm0,%ymm15,%ymm15
-	vpshufb		%ymm2,%ymm15,%ymm15
-	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
-	vpaddd		0x20(%rsp),%ymm6,%ymm0
-	vmovdqa		%ymm0,0x20(%rsp)
-	vpxor		%ymm0,%ymm12,%ymm12
-	vpshufb		%ymm2,%ymm12,%ymm12
-	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
-	vpaddd		0x40(%rsp),%ymm7,%ymm0
-	vmovdqa		%ymm0,0x40(%rsp)
-	vpxor		%ymm0,%ymm13,%ymm13
-	vpshufb		%ymm2,%ymm13,%ymm13
-	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
-	vpaddd		0x60(%rsp),%ymm4,%ymm0
-	vmovdqa		%ymm0,0x60(%rsp)
-	vpxor		%ymm0,%ymm14,%ymm14
-	vpshufb		%ymm2,%ymm14,%ymm14
-
-	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
-	vpaddd		%ymm15,%ymm10,%ymm10
-	vpxor		%ymm10,%ymm5,%ymm5
-	vpslld		$7,%ymm5,%ymm0
-	vpsrld		$25,%ymm5,%ymm5
-	vpor		%ymm0,%ymm5,%ymm5
-	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
-	vpaddd		%ymm12,%ymm11,%ymm11
-	vpxor		%ymm11,%ymm6,%ymm6
-	vpslld		$7,%ymm6,%ymm0
-	vpsrld		$25,%ymm6,%ymm6
-	vpor		%ymm0,%ymm6,%ymm6
-	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
-	vpaddd		%ymm13,%ymm8,%ymm8
-	vpxor		%ymm8,%ymm7,%ymm7
-	vpslld		$7,%ymm7,%ymm0
-	vpsrld		$25,%ymm7,%ymm7
-	vpor		%ymm0,%ymm7,%ymm7
-	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
-	vpaddd		%ymm14,%ymm9,%ymm9
-	vpxor		%ymm9,%ymm4,%ymm4
-	vpslld		$7,%ymm4,%ymm0
-	vpsrld		$25,%ymm4,%ymm4
-	vpor		%ymm0,%ymm4,%ymm4
-
-	dec		%ecx
-	jnz		.Ldoubleround8
-
-	# x0..15[0-3] += s[0..15]
-	vpbroadcastd	0x00(%rdi),%ymm0
-	vpaddd		0x00(%rsp),%ymm0,%ymm0
-	vmovdqa		%ymm0,0x00(%rsp)
-	vpbroadcastd	0x04(%rdi),%ymm0
-	vpaddd		0x20(%rsp),%ymm0,%ymm0
-	vmovdqa		%ymm0,0x20(%rsp)
-	vpbroadcastd	0x08(%rdi),%ymm0
-	vpaddd		0x40(%rsp),%ymm0,%ymm0
-	vmovdqa		%ymm0,0x40(%rsp)
-	vpbroadcastd	0x0c(%rdi),%ymm0
-	vpaddd		0x60(%rsp),%ymm0,%ymm0
-	vmovdqa		%ymm0,0x60(%rsp)
-	vpbroadcastd	0x10(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm4,%ymm4
-	vpbroadcastd	0x14(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm5,%ymm5
-	vpbroadcastd	0x18(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm6,%ymm6
-	vpbroadcastd	0x1c(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm7,%ymm7
-	vpbroadcastd	0x20(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm8,%ymm8
-	vpbroadcastd	0x24(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm9,%ymm9
-	vpbroadcastd	0x28(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm10,%ymm10
-	vpbroadcastd	0x2c(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm11,%ymm11
-	vpbroadcastd	0x30(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm12,%ymm12
-	vpbroadcastd	0x34(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm13,%ymm13
-	vpbroadcastd	0x38(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm14,%ymm14
-	vpbroadcastd	0x3c(%rdi),%ymm0
-	vpaddd		%ymm0,%ymm15,%ymm15
-
-	# x12 += counter values 0-3
-	vpaddd		%ymm1,%ymm12,%ymm12
-
-	# interleave 32-bit words in state n, n+1
-	vmovdqa		0x00(%rsp),%ymm0
-	vmovdqa		0x20(%rsp),%ymm1
-	vpunpckldq	%ymm1,%ymm0,%ymm2
-	vpunpckhdq	%ymm1,%ymm0,%ymm1
-	vmovdqa		%ymm2,0x00(%rsp)
-	vmovdqa		%ymm1,0x20(%rsp)
-	vmovdqa		0x40(%rsp),%ymm0
-	vmovdqa		0x60(%rsp),%ymm1
-	vpunpckldq	%ymm1,%ymm0,%ymm2
-	vpunpckhdq	%ymm1,%ymm0,%ymm1
-	vmovdqa		%ymm2,0x40(%rsp)
-	vmovdqa		%ymm1,0x60(%rsp)
-	vmovdqa		%ymm4,%ymm0
-	vpunpckldq	%ymm5,%ymm0,%ymm4
-	vpunpckhdq	%ymm5,%ymm0,%ymm5
-	vmovdqa		%ymm6,%ymm0
-	vpunpckldq	%ymm7,%ymm0,%ymm6
-	vpunpckhdq	%ymm7,%ymm0,%ymm7
-	vmovdqa		%ymm8,%ymm0
-	vpunpckldq	%ymm9,%ymm0,%ymm8
-	vpunpckhdq	%ymm9,%ymm0,%ymm9
-	vmovdqa		%ymm10,%ymm0
-	vpunpckldq	%ymm11,%ymm0,%ymm10
-	vpunpckhdq	%ymm11,%ymm0,%ymm11
-	vmovdqa		%ymm12,%ymm0
-	vpunpckldq	%ymm13,%ymm0,%ymm12
-	vpunpckhdq	%ymm13,%ymm0,%ymm13
-	vmovdqa		%ymm14,%ymm0
-	vpunpckldq	%ymm15,%ymm0,%ymm14
-	vpunpckhdq	%ymm15,%ymm0,%ymm15
-
-	# interleave 64-bit words in state n, n+2
-	vmovdqa		0x00(%rsp),%ymm0
-	vmovdqa		0x40(%rsp),%ymm2
-	vpunpcklqdq	%ymm2,%ymm0,%ymm1
-	vpunpckhqdq	%ymm2,%ymm0,%ymm2
-	vmovdqa		%ymm1,0x00(%rsp)
-	vmovdqa		%ymm2,0x40(%rsp)
-	vmovdqa		0x20(%rsp),%ymm0
-	vmovdqa		0x60(%rsp),%ymm2
-	vpunpcklqdq	%ymm2,%ymm0,%ymm1
-	vpunpckhqdq	%ymm2,%ymm0,%ymm2
-	vmovdqa		%ymm1,0x20(%rsp)
-	vmovdqa		%ymm2,0x60(%rsp)
-	vmovdqa		%ymm4,%ymm0
-	vpunpcklqdq	%ymm6,%ymm0,%ymm4
-	vpunpckhqdq	%ymm6,%ymm0,%ymm6
-	vmovdqa		%ymm5,%ymm0
-	vpunpcklqdq	%ymm7,%ymm0,%ymm5
-	vpunpckhqdq	%ymm7,%ymm0,%ymm7
-	vmovdqa		%ymm8,%ymm0
-	vpunpcklqdq	%ymm10,%ymm0,%ymm8
-	vpunpckhqdq	%ymm10,%ymm0,%ymm10
-	vmovdqa		%ymm9,%ymm0
-	vpunpcklqdq	%ymm11,%ymm0,%ymm9
-	vpunpckhqdq	%ymm11,%ymm0,%ymm11
-	vmovdqa		%ymm12,%ymm0
-	vpunpcklqdq	%ymm14,%ymm0,%ymm12
-	vpunpckhqdq	%ymm14,%ymm0,%ymm14
-	vmovdqa		%ymm13,%ymm0
-	vpunpcklqdq	%ymm15,%ymm0,%ymm13
-	vpunpckhqdq	%ymm15,%ymm0,%ymm15
-
-	# interleave 128-bit words in state n, n+4
-	vmovdqa		0x00(%rsp),%ymm0
-	vperm2i128	$0x20,%ymm4,%ymm0,%ymm1
-	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4
-	vmovdqa		%ymm1,0x00(%rsp)
-	vmovdqa		0x20(%rsp),%ymm0
-	vperm2i128	$0x20,%ymm5,%ymm0,%ymm1
-	vperm2i128	$0x31,%ymm5,%ymm0,%ymm5
-	vmovdqa		%ymm1,0x20(%rsp)
-	vmovdqa		0x40(%rsp),%ymm0
-	vperm2i128	$0x20,%ymm6,%ymm0,%ymm1
-	vperm2i128	$0x31,%ymm6,%ymm0,%ymm6
-	vmovdqa		%ymm1,0x40(%rsp)
-	vmovdqa		0x60(%rsp),%ymm0
-	vperm2i128	$0x20,%ymm7,%ymm0,%ymm1
-	vperm2i128	$0x31,%ymm7,%ymm0,%ymm7
-	vmovdqa		%ymm1,0x60(%rsp)
-	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
-	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
-	vmovdqa		%ymm0,%ymm8
-	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
-	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
-	vmovdqa		%ymm0,%ymm9
-	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
-	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14
-	vmovdqa		%ymm0,%ymm10
-	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
-	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
-	vmovdqa		%ymm0,%ymm11
-
-	# xor with corresponding input, write to output
-	vmovdqa		0x00(%rsp),%ymm0
-	vpxor		0x0000(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x0000(%rsi)
-	vmovdqa		0x20(%rsp),%ymm0
-	vpxor		0x0080(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x0080(%rsi)
-	vmovdqa		0x40(%rsp),%ymm0
-	vpxor		0x0040(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x0040(%rsi)
-	vmovdqa		0x60(%rsp),%ymm0
-	vpxor		0x00c0(%rdx),%ymm0,%ymm0
-	vmovdqu		%ymm0,0x00c0(%rsi)
-	vpxor		0x0100(%rdx),%ymm4,%ymm4
-	vmovdqu		%ymm4,0x0100(%rsi)
-	vpxor		0x0180(%rdx),%ymm5,%ymm5
-	vmovdqu		%ymm5,0x00180(%rsi)
-	vpxor		0x0140(%rdx),%ymm6,%ymm6
-	vmovdqu		%ymm6,0x0140(%rsi)
-	vpxor		0x01c0(%rdx),%ymm7,%ymm7
-	vmovdqu		%ymm7,0x01c0(%rsi)
-	vpxor		0x0020(%rdx),%ymm8,%ymm8
-	vmovdqu		%ymm8,0x0020(%rsi)
-	vpxor		0x00a0(%rdx),%ymm9,%ymm9
-	vmovdqu		%ymm9,0x00a0(%rsi)
-	vpxor		0x0060(%rdx),%ymm10,%ymm10
-	vmovdqu		%ymm10,0x0060(%rsi)
-	vpxor		0x00e0(%rdx),%ymm11,%ymm11
-	vmovdqu		%ymm11,0x00e0(%rsi)
-	vpxor		0x0120(%rdx),%ymm12,%ymm12
-	vmovdqu		%ymm12,0x0120(%rsi)
-	vpxor		0x01a0(%rdx),%ymm13,%ymm13
-	vmovdqu		%ymm13,0x01a0(%rsi)
-	vpxor		0x0160(%rdx),%ymm14,%ymm14
-	vmovdqu		%ymm14,0x0160(%rsi)
-	vpxor		0x01e0(%rdx),%ymm15,%ymm15
-	vmovdqu		%ymm15,0x01e0(%rsi)
-
-	vzeroupper
-	lea		-8(%r10),%rsp
-	ret
-ENDPROC(chacha20_8block_xor_avx2)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
deleted file mode 100644
index dce7c5d39c2f26f8e1cf6d31c69e1ef627f6cb10..0000000000000000000000000000000000000000
--- a/arch/x86/crypto/chacha20_glue.c
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <asm/fpu/api.h>
-#include <asm/simd.h>
-
-#define CHACHA20_STATE_ALIGN 16
-
-asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
-asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
-#ifdef CONFIG_AS_AVX2
-asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
-static bool chacha20_use_avx2;
-#endif
-
-static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
-			    unsigned int bytes)
-{
-	u8 buf[CHACHA20_BLOCK_SIZE];
-
-#ifdef CONFIG_AS_AVX2
-	if (chacha20_use_avx2) {
-		while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
-			chacha20_8block_xor_avx2(state, dst, src);
-			bytes -= CHACHA20_BLOCK_SIZE * 8;
-			src += CHACHA20_BLOCK_SIZE * 8;
-			dst += CHACHA20_BLOCK_SIZE * 8;
-			state[12] += 8;
-		}
-	}
-#endif
-	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
-		chacha20_4block_xor_ssse3(state, dst, src);
-		bytes -= CHACHA20_BLOCK_SIZE * 4;
-		src += CHACHA20_BLOCK_SIZE * 4;
-		dst += CHACHA20_BLOCK_SIZE * 4;
-		state[12] += 4;
-	}
-	while (bytes >= CHACHA20_BLOCK_SIZE) {
-		chacha20_block_xor_ssse3(state, dst, src);
-		bytes -= CHACHA20_BLOCK_SIZE;
-		src += CHACHA20_BLOCK_SIZE;
-		dst += CHACHA20_BLOCK_SIZE;
-		state[12]++;
-	}
-	if (bytes) {
-		memcpy(buf, src, bytes);
-		chacha20_block_xor_ssse3(state, buf, buf);
-		memcpy(dst, buf, bytes);
-	}
-}
-
-static int chacha20_simd(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
-	u32 *state, state_buf[16 + 2] __aligned(8);
-	struct skcipher_walk walk;
-	int err;
-
-	BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
-	state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
-
-	if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd())
-		return crypto_chacha20_crypt(req);
-
-	err = skcipher_walk_virt(&walk, req, true);
-
-	crypto_chacha20_init(state, ctx, walk.iv);
-
-	kernel_fpu_begin();
-
-	while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
-		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
-				rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
-		err = skcipher_walk_done(&walk,
-					 walk.nbytes % CHACHA20_BLOCK_SIZE);
-	}
-
-	if (walk.nbytes) {
-		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
-				walk.nbytes);
-		err = skcipher_walk_done(&walk, 0);
-	}
-
-	kernel_fpu_end();
-
-	return err;
-}
-
-static struct skcipher_alg alg = {
-	.base.cra_name		= "chacha20",
-	.base.cra_driver_name	= "chacha20-simd",
-	.base.cra_priority	= 300,
-	.base.cra_blocksize	= 1,
-	.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
-	.base.cra_module	= THIS_MODULE,
-
-	.min_keysize		= CHACHA20_KEY_SIZE,
-	.max_keysize		= CHACHA20_KEY_SIZE,
-	.ivsize			= CHACHA20_IV_SIZE,
-	.chunksize		= CHACHA20_BLOCK_SIZE,
-	.setkey			= crypto_chacha20_setkey,
-	.encrypt		= chacha20_simd,
-	.decrypt		= chacha20_simd,
-};
-
-static int __init chacha20_simd_mod_init(void)
-{
-	if (!boot_cpu_has(X86_FEATURE_SSSE3))
-		return -ENODEV;
-
-#ifdef CONFIG_AS_AVX2
-	chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
-			    boot_cpu_has(X86_FEATURE_AVX2) &&
-			    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
-#endif
-	return crypto_register_skcipher(&alg);
-}
-
-static void __exit chacha20_simd_mod_fini(void)
-{
-	crypto_unregister_skcipher(&alg);
-}
-
-module_init(chacha20_simd_mod_init);
-module_exit(chacha20_simd_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
-MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-simd");
diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
new file mode 100644
index 0000000000000000000000000000000000000000..45c1c41431766dc397a9f8591f6785d0ca013cb5
--- /dev/null
+++ b/arch/x86/crypto/chacha_glue.c
@@ -0,0 +1,304 @@
+/*
+ * x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/simd.h>
+
+#define CHACHA_STATE_ALIGN 16
+
+asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+					unsigned int len, int nrounds);
+asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
+#ifdef CONFIG_AS_AVX2
+asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+static bool chacha_use_avx2;
+#ifdef CONFIG_AS_AVX512
+asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+					   unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+					   unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+					   unsigned int len, int nrounds);
+static bool chacha_use_avx512vl;
+#endif
+#endif
+
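+/*
+ * Number of ChaCha blocks the block counter must be advanced by after an
+ * asm routine handling up to @maxblocks blocks has consumed @len bytes;
+ * a trailing partial block still advances the counter by one.  For
+ * example, len == 96 (1.5 blocks) with maxblocks == 4 yields 2.
+ */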
+static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
+{
+	len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
+	return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
+}
+
+static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
+			  unsigned int bytes, int nrounds)
+{
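+	/*
+	 * Use the widest routine the CPU supports for as long as full
+	 * 8-block (or 4-block) chunks remain, then hand the tail, including
+	 * any final partial block, to the smallest routine that covers it.
+	 */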
+#ifdef CONFIG_AS_AVX2
+#ifdef CONFIG_AS_AVX512
+	if (chacha_use_avx512vl) {
+		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+			chacha_8block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			bytes -= CHACHA_BLOCK_SIZE * 8;
+			src += CHACHA_BLOCK_SIZE * 8;
+			dst += CHACHA_BLOCK_SIZE * 8;
+			state[12] += 8;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 4) {
+			chacha_8block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			state[12] += chacha_advance(bytes, 8);
+			return;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 2) {
+			chacha_4block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			state[12] += chacha_advance(bytes, 4);
+			return;
+		}
+		if (bytes) {
+			chacha_2block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			state[12] += chacha_advance(bytes, 2);
+			return;
+		}
+	}
+#endif
+	if (chacha_use_avx2) {
+		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+			bytes -= CHACHA_BLOCK_SIZE * 8;
+			src += CHACHA_BLOCK_SIZE * 8;
+			dst += CHACHA_BLOCK_SIZE * 8;
+			state[12] += 8;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 4) {
+			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+			state[12] += chacha_advance(bytes, 8);
+			return;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 2) {
+			chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
+			state[12] += chacha_advance(bytes, 4);
+			return;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE) {
+			chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
+			state[12] += chacha_advance(bytes, 2);
+			return;
+		}
+	}
+#endif
+	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+		bytes -= CHACHA_BLOCK_SIZE * 4;
+		src += CHACHA_BLOCK_SIZE * 4;
+		dst += CHACHA_BLOCK_SIZE * 4;
+		state[12] += 4;
+	}
+	if (bytes > CHACHA_BLOCK_SIZE) {
+		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+		state[12] += chacha_advance(bytes, 4);
+		return;
+	}
+	if (bytes) {
+		chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
+		state[12]++;
+	}
+}
+
+static int chacha_simd_stream_xor(struct skcipher_walk *walk,
+				  struct chacha_ctx *ctx, u8 *iv)
+{
+	u32 *state, state_buf[16 + 2] __aligned(8);
+	int next_yield = 4096; /* bytes until next FPU yield */
+	int err = 0;
+
+	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+
+	crypto_chacha_init(state, ctx, iv);
+
+	while (walk->nbytes > 0) {
+		unsigned int nbytes = walk->nbytes;
+
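+		/* Process only whole blocks unless this is the final chunk
+		 * of the request. */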
+		if (nbytes < walk->total) {
+			nbytes = round_down(nbytes, walk->stride);
+			next_yield -= nbytes;
+		}
+
+		chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr,
+			      nbytes, ctx->nrounds);
+
+		if (next_yield <= 0) {
+			/* temporarily allow preemption */
+			kernel_fpu_end();
+			kernel_fpu_begin();
+			next_yield = 4096;
+		}
+
+		err = skcipher_walk_done(walk, walk->nbytes - nbytes);
+	}
+
+	return err;
+}
+
+static int chacha_simd(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err;
+
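+	/* Fall back to the generic implementation for short requests or when
+	 * the FPU cannot be used in this context. */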
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
+		return crypto_chacha_crypt(req);
+
+	err = skcipher_walk_virt(&walk, req, true);
+	if (err)
+		return err;
+
+	kernel_fpu_begin();
+	err = chacha_simd_stream_xor(&walk, ctx, req->iv);
+	kernel_fpu_end();
+	return err;
+}
+
+static int xchacha_simd(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	struct chacha_ctx subctx;
+	u32 *state, state_buf[16 + 2] __aligned(8);
+	u8 real_iv[16];
+	int err;
+
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
+		return crypto_xchacha_crypt(req);
+
+	err = skcipher_walk_virt(&walk, req, true);
+	if (err)
+		return err;
+
+	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+	crypto_chacha_init(state, ctx, req->iv);
+
+	kernel_fpu_begin();
+
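+	/* Derive the XChaCha subkey from the key and the first 128 bits of
+	 * the IV. */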
+	hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
+	subctx.nrounds = ctx->nrounds;
+
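+	/*
+	 * Build the inner ChaCha IV: bytes 24..31 of the XChaCha IV are the
+	 * stream position, bytes 16..23 the remaining 64 bits of the nonce.
+	 */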
+	memcpy(&real_iv[0], req->iv + 24, 8);
+	memcpy(&real_iv[8], req->iv + 16, 8);
+	err = chacha_simd_stream_xor(&walk, &subctx, real_iv);
+
+	kernel_fpu_end();
+
+	return err;
+}
+
+static struct skcipher_alg algs[] = {
+	{
+		.base.cra_name		= "chacha20",
+		.base.cra_driver_name	= "chacha20-simd",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= CHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= chacha_simd,
+		.decrypt		= chacha_simd,
+	}, {
+		.base.cra_name		= "xchacha20",
+		.base.cra_driver_name	= "xchacha20-simd",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= xchacha_simd,
+		.decrypt		= xchacha_simd,
+	}, {
+		.base.cra_name		= "xchacha12",
+		.base.cra_driver_name	= "xchacha12-simd",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha12_setkey,
+		.encrypt		= xchacha_simd,
+		.decrypt		= xchacha_simd,
+	},
+};
+
+static int __init chacha_simd_mod_init(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_SSSE3))
+		return -ENODEV;
+
+#ifdef CONFIG_AS_AVX2
+	chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
+			  boot_cpu_has(X86_FEATURE_AVX2) &&
+			  cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+#ifdef CONFIG_AS_AVX512
+	chacha_use_avx512vl = chacha_use_avx2 &&
+			      boot_cpu_has(X86_FEATURE_AVX512VL) &&
+			      boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
+#endif
+#endif
+	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit chacha_simd_mod_fini(void)
+{
+	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-simd");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-simd");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-simd");
diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S
new file mode 100644
index 0000000000000000000000000000000000000000..f7946ea1b70467da4ac4b989aead1256c7040373
--- /dev/null
+++ b/arch/x86/crypto/nh-avx2-x86_64.S
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, x86_64 AVX2 accelerated
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+#define		PASS0_SUMS	%ymm0
+#define		PASS1_SUMS	%ymm1
+#define		PASS2_SUMS	%ymm2
+#define		PASS3_SUMS	%ymm3
+#define		K0		%ymm4
+#define		K0_XMM		%xmm4
+#define		K1		%ymm5
+#define		K1_XMM		%xmm5
+#define		K2		%ymm6
+#define		K2_XMM		%xmm6
+#define		K3		%ymm7
+#define		K3_XMM		%xmm7
+#define		T0		%ymm8
+#define		T1		%ymm9
+#define		T2		%ymm10
+#define		T2_XMM		%xmm10
+#define		T3		%ymm11
+#define		T3_XMM		%xmm11
+#define		T4		%ymm12
+#define		T5		%ymm13
+#define		T6		%ymm14
+#define		T7		%ymm15
+#define		KEY		%rdi
+#define		MESSAGE		%rsi
+#define		MESSAGE_LEN	%rdx
+#define		HASH		%rcx
+
+.macro _nh_2xstride	k0, k1, k2, k3
+
+	// Add message words to key words
+	vpaddd		\k0, T3, T0
+	vpaddd		\k1, T3, T1
+	vpaddd		\k2, T3, T2
+	vpaddd		\k3, T3, T3
+
+	// Multiply 32x32 => 64 and accumulate
+	vpshufd		$0x10, T0, T4
+	vpshufd		$0x32, T0, T0
+	vpshufd		$0x10, T1, T5
+	vpshufd		$0x32, T1, T1
+	vpshufd		$0x10, T2, T6
+	vpshufd		$0x32, T2, T2
+	vpshufd		$0x10, T3, T7
+	vpshufd		$0x32, T3, T3
+	vpmuludq	T4, T0, T0
+	vpmuludq	T5, T1, T1
+	vpmuludq	T6, T2, T2
+	vpmuludq	T7, T3, T3
+	vpaddq		T0, PASS0_SUMS, PASS0_SUMS
+	vpaddq		T1, PASS1_SUMS, PASS1_SUMS
+	vpaddq		T2, PASS2_SUMS, PASS2_SUMS
+	vpaddq		T3, PASS3_SUMS, PASS3_SUMS
+.endm
+
+/*
+ * void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
+ *		u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+ENTRY(nh_avx2)
+
+	vmovdqu		0x00(KEY), K0
+	vmovdqu		0x10(KEY), K1
+	add		$0x20, KEY
+	vpxor		PASS0_SUMS, PASS0_SUMS, PASS0_SUMS
+	vpxor		PASS1_SUMS, PASS1_SUMS, PASS1_SUMS
+	vpxor		PASS2_SUMS, PASS2_SUMS, PASS2_SUMS
+	vpxor		PASS3_SUMS, PASS3_SUMS, PASS3_SUMS
+
+	sub		$0x40, MESSAGE_LEN
+	jl		.Lloop4_done
+.Lloop4:
+	vmovdqu		(MESSAGE), T3
+	vmovdqu		0x00(KEY), K2
+	vmovdqu		0x10(KEY), K3
+	_nh_2xstride	K0, K1, K2, K3
+
+	vmovdqu		0x20(MESSAGE), T3
+	vmovdqu		0x20(KEY), K0
+	vmovdqu		0x30(KEY), K1
+	_nh_2xstride	K2, K3, K0, K1
+
+	add		$0x40, MESSAGE
+	add		$0x40, KEY
+	sub		$0x40, MESSAGE_LEN
+	jge		.Lloop4
+
+.Lloop4_done:
+	and		$0x3f, MESSAGE_LEN
+	jz		.Ldone
+
+	cmp		$0x20, MESSAGE_LEN
+	jl		.Llast
+
+	// 2 or 3 strides remain; do 2 more.
+	vmovdqu		(MESSAGE), T3
+	vmovdqu		0x00(KEY), K2
+	vmovdqu		0x10(KEY), K3
+	_nh_2xstride	K0, K1, K2, K3
+	add		$0x20, MESSAGE
+	add		$0x20, KEY
+	sub		$0x20, MESSAGE_LEN
+	jz		.Ldone
+	vmovdqa		K2, K0
+	vmovdqa		K3, K1
+.Llast:
+	// Last stride.  Zero the high 128 bits of the message and keys so they
+	// don't affect the result when processing them like 2 strides.
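+	// (The VEX-encoded 128-bit moves below zero bits 255:128 of the
+	// destination YMM registers.)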
+	vmovdqu		(MESSAGE), T3_XMM
+	vmovdqa		K0_XMM, K0_XMM
+	vmovdqa		K1_XMM, K1_XMM
+	vmovdqu		0x00(KEY), K2_XMM
+	vmovdqu		0x10(KEY), K3_XMM
+	_nh_2xstride	K0, K1, K2, K3
+
+.Ldone:
+	// Sum the accumulators for each pass, then store the sums to 'hash'
+
+	// PASS0_SUMS is (0A 0B 0C 0D)
+	// PASS1_SUMS is (1A 1B 1C 1D)
+	// PASS2_SUMS is (2A 2B 2C 2D)
+	// PASS3_SUMS is (3A 3B 3C 3D)
+	// We need the horizontal sums:
+	//     (0A + 0B + 0C + 0D,
+	//	1A + 1B + 1C + 1D,
+	//	2A + 2B + 2C + 2D,
+	//	3A + 3B + 3C + 3D)
+	//
+
+	vpunpcklqdq	PASS1_SUMS, PASS0_SUMS, T0	// T0 = (0A 1A 0C 1C)
+	vpunpckhqdq	PASS1_SUMS, PASS0_SUMS, T1	// T1 = (0B 1B 0D 1D)
+	vpunpcklqdq	PASS3_SUMS, PASS2_SUMS, T2	// T2 = (2A 3A 2C 3C)
+	vpunpckhqdq	PASS3_SUMS, PASS2_SUMS, T3	// T3 = (2B 3B 2D 3D)
+
+	vinserti128	$0x1, T2_XMM, T0, T4		// T4 = (0A 1A 2A 3A)
+	vinserti128	$0x1, T3_XMM, T1, T5		// T5 = (0B 1B 2B 3B)
+	vperm2i128	$0x31, T2, T0, T0		// T0 = (0C 1C 2C 3C)
+	vperm2i128	$0x31, T3, T1, T1		// T1 = (0D 1D 2D 3D)
+
+	vpaddq		T5, T4, T4
+	vpaddq		T1, T0, T0
+	vpaddq		T4, T0, T0
+	vmovdqu		T0, (HASH)
+	ret
+ENDPROC(nh_avx2)
diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S
new file mode 100644
index 0000000000000000000000000000000000000000..51f52d4ab4bbc75dde98199f3330c483a631b006
--- /dev/null
+++ b/arch/x86/crypto/nh-sse2-x86_64.S
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+#define		PASS0_SUMS	%xmm0
+#define		PASS1_SUMS	%xmm1
+#define		PASS2_SUMS	%xmm2
+#define		PASS3_SUMS	%xmm3
+#define		K0		%xmm4
+#define		K1		%xmm5
+#define		K2		%xmm6
+#define		K3		%xmm7
+#define		T0		%xmm8
+#define		T1		%xmm9
+#define		T2		%xmm10
+#define		T3		%xmm11
+#define		T4		%xmm12
+#define		T5		%xmm13
+#define		T6		%xmm14
+#define		T7		%xmm15
+#define		KEY		%rdi
+#define		MESSAGE		%rsi
+#define		MESSAGE_LEN	%rdx
+#define		HASH		%rcx
+
+.macro _nh_stride	k0, k1, k2, k3, offset
+
+	// Load next message stride
+	movdqu		\offset(MESSAGE), T1
+
+	// Load next key stride
+	movdqu		\offset(KEY), \k3
+
+	// Add message words to key words
+	movdqa		T1, T2
+	movdqa		T1, T3
+	paddd		T1, \k0    // reuse k0 to avoid a move
+	paddd		\k1, T1
+	paddd		\k2, T2
+	paddd		\k3, T3
+
+	// Multiply 32x32 => 64 and accumulate
+	pshufd		$0x10, \k0, T4
+	pshufd		$0x32, \k0, \k0
+	pshufd		$0x10, T1, T5
+	pshufd		$0x32, T1, T1
+	pshufd		$0x10, T2, T6
+	pshufd		$0x32, T2, T2
+	pshufd		$0x10, T3, T7
+	pshufd		$0x32, T3, T3
+	pmuludq		T4, \k0
+	pmuludq		T5, T1
+	pmuludq		T6, T2
+	pmuludq		T7, T3
+	paddq		\k0, PASS0_SUMS
+	paddq		T1, PASS1_SUMS
+	paddq		T2, PASS2_SUMS
+	paddq		T3, PASS3_SUMS
+.endm
+
+/*
+ * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
+ *		u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+ENTRY(nh_sse2)
+
+	movdqu		0x00(KEY), K0
+	movdqu		0x10(KEY), K1
+	movdqu		0x20(KEY), K2
+	add		$0x30, KEY
+	pxor		PASS0_SUMS, PASS0_SUMS
+	pxor		PASS1_SUMS, PASS1_SUMS
+	pxor		PASS2_SUMS, PASS2_SUMS
+	pxor		PASS3_SUMS, PASS3_SUMS
+
+	sub		$0x40, MESSAGE_LEN
+	jl		.Lloop4_done
+.Lloop4:
+	_nh_stride	K0, K1, K2, K3, 0x00
+	_nh_stride	K1, K2, K3, K0, 0x10
+	_nh_stride	K2, K3, K0, K1, 0x20
+	_nh_stride	K3, K0, K1, K2, 0x30
+	add		$0x40, KEY
+	add		$0x40, MESSAGE
+	sub		$0x40, MESSAGE_LEN
+	jge		.Lloop4
+
+.Lloop4_done:
+	and		$0x3f, MESSAGE_LEN
+	jz		.Ldone
+	_nh_stride	K0, K1, K2, K3, 0x00
+
+	sub		$0x10, MESSAGE_LEN
+	jz		.Ldone
+	_nh_stride	K1, K2, K3, K0, 0x10
+
+	sub		$0x10, MESSAGE_LEN
+	jz		.Ldone
+	_nh_stride	K2, K3, K0, K1, 0x20
+
+.Ldone:
+	// Sum the accumulators for each pass, then store the sums to 'hash'
+	movdqa		PASS0_SUMS, T0
+	movdqa		PASS2_SUMS, T1
+	punpcklqdq	PASS1_SUMS, T0		// => (PASS0_SUM_A PASS1_SUM_A)
+	punpcklqdq	PASS3_SUMS, T1		// => (PASS2_SUM_A PASS3_SUM_A)
+	punpckhqdq	PASS1_SUMS, PASS0_SUMS	// => (PASS0_SUM_B PASS1_SUM_B)
+	punpckhqdq	PASS3_SUMS, PASS2_SUMS	// => (PASS2_SUM_B PASS3_SUM_B)
+	paddq		PASS0_SUMS, T0
+	paddq		PASS2_SUMS, T1
+	movdqu		T0, 0x00(HASH)
+	movdqu		T1, 0x10(HASH)
+	ret
+ENDPROC(nh_sse2)
diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c
new file mode 100644
index 0000000000000000000000000000000000000000..20d815ea4b6a98fbd3ccd30f8f50f052ae9f58fc
--- /dev/null
+++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ * (AVX2 accelerated version)
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <crypto/internal/hash.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+
+asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
+			u8 hash[NH_HASH_BYTES]);
+
+/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
+static void _nh_avx2(const u32 *key, const u8 *message, size_t message_len,
+		     __le64 hash[NH_NUM_PASSES])
+{
+	nh_avx2(key, message, message_len, (u8 *)hash);
+}
+
+static int nhpoly1305_avx2_update(struct shash_desc *desc,
+				  const u8 *src, unsigned int srclen)
+{
+	if (srclen < 64 || !irq_fpu_usable())
+		return crypto_nhpoly1305_update(desc, src, srclen);
+
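+	/*
+	 * Process at most one page per kernel_fpu_begin()/kernel_fpu_end()
+	 * section, so the FPU isn't held (and preemption disabled) for too
+	 * long at a time.
+	 */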
+	do {
+		unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+
+		kernel_fpu_begin();
+		crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2);
+		kernel_fpu_end();
+		src += n;
+		srclen -= n;
+	} while (srclen);
+	return 0;
+}
+
+static struct shash_alg nhpoly1305_alg = {
+	.base.cra_name		= "nhpoly1305",
+	.base.cra_driver_name	= "nhpoly1305-avx2",
+	.base.cra_priority	= 300,
+	.base.cra_ctxsize	= sizeof(struct nhpoly1305_key),
+	.base.cra_module	= THIS_MODULE,
+	.digestsize		= POLY1305_DIGEST_SIZE,
+	.init			= crypto_nhpoly1305_init,
+	.update			= nhpoly1305_avx2_update,
+	.final			= crypto_nhpoly1305_final,
+	.setkey			= crypto_nhpoly1305_setkey,
+	.descsize		= sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+	    !boot_cpu_has(X86_FEATURE_OSXSAVE))
+		return -ENODEV;
+
+	return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+	crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (AVX2-accelerated)");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-avx2");
diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c
new file mode 100644
index 0000000000000000000000000000000000000000..ed68d164ce1402ee971737c96575a2d0ea4dc42f
--- /dev/null
+++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ * (SSE2 accelerated version)
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <crypto/internal/hash.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+
+asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
+			u8 hash[NH_HASH_BYTES]);
+
+/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
+static void _nh_sse2(const u32 *key, const u8 *message, size_t message_len,
+		     __le64 hash[NH_NUM_PASSES])
+{
+	nh_sse2(key, message, message_len, (u8 *)hash);
+}
+
+static int nhpoly1305_sse2_update(struct shash_desc *desc,
+				  const u8 *src, unsigned int srclen)
+{
+	if (srclen < 64 || !irq_fpu_usable())
+		return crypto_nhpoly1305_update(desc, src, srclen);
+
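+	/*
+	 * Process at most one page per kernel_fpu_begin()/kernel_fpu_end()
+	 * section, so the FPU isn't held (and preemption disabled) for too
+	 * long at a time.
+	 */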
+	do {
+		unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+
+		kernel_fpu_begin();
+		crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2);
+		kernel_fpu_end();
+		src += n;
+		srclen -= n;
+	} while (srclen);
+	return 0;
+}
+
+static struct shash_alg nhpoly1305_alg = {
+	.base.cra_name		= "nhpoly1305",
+	.base.cra_driver_name	= "nhpoly1305-sse2",
+	.base.cra_priority	= 200,
+	.base.cra_ctxsize	= sizeof(struct nhpoly1305_key),
+	.base.cra_module	= THIS_MODULE,
+	.digestsize		= POLY1305_DIGEST_SIZE,
+	.init			= crypto_nhpoly1305_init,
+	.update			= nhpoly1305_sse2_update,
+	.final			= crypto_nhpoly1305_final,
+	.setkey			= crypto_nhpoly1305_setkey,
+	.descsize		= sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_XMM2))
+		return -ENODEV;
+
+	return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+	crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (SSE2-accelerated)");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-sse2");
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index f012b7e28ad1d9d0914699462586b136b0411a16..88cc01506c84a64bc4731e8974ed76487be0f168 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -83,35 +83,37 @@ static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
 	if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) {
 		if (unlikely(!sctx->wset)) {
 			if (!sctx->uset) {
-				memcpy(sctx->u, dctx->r, sizeof(sctx->u));
-				poly1305_simd_mult(sctx->u, dctx->r);
+				memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
+				poly1305_simd_mult(sctx->u, dctx->r.r);
 				sctx->uset = true;
 			}
 			memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u));
-			poly1305_simd_mult(sctx->u + 5, dctx->r);
+			poly1305_simd_mult(sctx->u + 5, dctx->r.r);
 			memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u));
-			poly1305_simd_mult(sctx->u + 10, dctx->r);
+			poly1305_simd_mult(sctx->u + 10, dctx->r.r);
 			sctx->wset = true;
 		}
 		blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
-		poly1305_4block_avx2(dctx->h, src, dctx->r, blocks, sctx->u);
+		poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks,
+				     sctx->u);
 		src += POLY1305_BLOCK_SIZE * 4 * blocks;
 		srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
 	}
 #endif
 	if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
 		if (unlikely(!sctx->uset)) {
-			memcpy(sctx->u, dctx->r, sizeof(sctx->u));
-			poly1305_simd_mult(sctx->u, dctx->r);
+			memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
+			poly1305_simd_mult(sctx->u, dctx->r.r);
 			sctx->uset = true;
 		}
 		blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
-		poly1305_2block_sse2(dctx->h, src, dctx->r, blocks, sctx->u);
+		poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks,
+				     sctx->u);
 		src += POLY1305_BLOCK_SIZE * 2 * blocks;
 		srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
 	}
 	if (srclen >= POLY1305_BLOCK_SIZE) {
-		poly1305_block_sse2(dctx->h, src, dctx->r, 1);
+		poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1);
 		srclen -= POLY1305_BLOCK_SIZE;
 	}
 	return srclen;
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 05c91eb10ca1fd97fde1dd0593d975b99b798e1c..045af6eeb7e2ef01bec8bb6784f5ad82356a540e 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -430,11 +430,14 @@ config CRYPTO_CTS
 	help
 	  CTS: Cipher Text Stealing
 	  This is the Cipher Text Stealing mode as described by
-	  Section 8 of rfc2040 and referenced by rfc3962.
-	  (rfc3962 includes errata information in its Appendix A)
+	  Section 8 of rfc2040 and referenced by rfc3962
+	  (rfc3962 includes errata information in its Appendix A), or
+	  CBC-CS3 as defined by NIST in the SP800-38A addendum from Oct 2010.
 	  This mode is required for Kerberos gss mechanism support
 	  for AES encryption.
 
+	  See: https://csrc.nist.gov/publications/detail/sp/800-38a/addendum/final
+
 config CRYPTO_ECB
 	tristate "ECB support"
 	select CRYPTO_BLKCIPHER
@@ -493,6 +496,50 @@ config CRYPTO_KEYWRAP
 	  Support for key wrapping (NIST SP800-38F / RFC3394) without
 	  padding.
 
+config CRYPTO_NHPOLY1305
+	tristate
+	select CRYPTO_HASH
+	select CRYPTO_POLY1305
+
+config CRYPTO_NHPOLY1305_SSE2
+	tristate "NHPoly1305 hash function (x86_64 SSE2 implementation)"
+	depends on X86 && 64BIT
+	select CRYPTO_NHPOLY1305
+	help
+	  SSE2 optimized implementation of the hash function used by the
+	  Adiantum encryption mode.
+
+config CRYPTO_NHPOLY1305_AVX2
+	tristate "NHPoly1305 hash function (x86_64 AVX2 implementation)"
+	depends on X86 && 64BIT
+	select CRYPTO_NHPOLY1305
+	help
+	  AVX2 optimized implementation of the hash function used by the
+	  Adiantum encryption mode.
+
+config CRYPTO_ADIANTUM
+	tristate "Adiantum support"
+	select CRYPTO_CHACHA20
+	select CRYPTO_POLY1305
+	select CRYPTO_NHPOLY1305
+	help
+	  Adiantum is a tweakable, length-preserving encryption mode
+	  designed for fast and secure disk encryption, especially on
+	  CPUs without dedicated crypto instructions.  It encrypts
+	  each sector using the XChaCha12 stream cipher, two passes of
+	  an ε-almost-∆-universal hash function, and an invocation of
+	  the AES-256 block cipher on a single 16-byte block.  On CPUs
+	  without AES instructions, Adiantum is much faster than
+	  AES-XTS.
+
+	  Adiantum's security is provably reducible to that of its
+	  underlying stream and block ciphers, subject to a security
+	  bound.  Unlike XTS, Adiantum is a true wide-block encryption
+	  mode, so it actually provides an even stronger notion of
+	  security than XTS, subject to the security bound.
+
+	  If unsure, say N.
+
 comment "Hash modes"
 
 config CRYPTO_CMAC
@@ -936,6 +983,18 @@ config CRYPTO_SM3
 	  http://www.oscca.gov.cn/UpFile/20101222141857786.pdf
 	  https://datatracker.ietf.org/doc/html/draft-shen-sm3-hash
 
+config CRYPTO_STREEBOG
+	tristate "Streebog Hash Function"
+	select CRYPTO_HASH
+	help
+	  Streebog Hash Function (GOST R 34.11-2012, RFC 6986) is one of the Russian
+	  cryptographic standard algorithms (called GOST algorithms).
+	  This setting enables two hash algorithms with 256-bit and 512-bit
+	  output.
+
+	  References:
+	  https://tc26.ru/upload/iblock/fed/feddbb4d26b685903faa2ba11aea43f6.pdf
+	  https://tools.ietf.org/html/rfc6986
+
 config CRYPTO_TGR192
 	tristate "Tiger digest algorithms"
 	select CRYPTO_HASH
@@ -1006,7 +1065,8 @@ config CRYPTO_AES_TI
 	  8 for decryption), this implementation only uses just two S-boxes of
 	  256 bytes each, and attempts to eliminate data dependent latencies by
 	  prefetching the entire table into the cache at the start of each
-	  block.
+	  block. Interrupts are also disabled to avoid races where cachelines
+	  are evicted when the CPU is interrupted to do something else.
 
 config CRYPTO_AES_586
 	tristate "AES cipher algorithms (i586)"
@@ -1387,32 +1447,34 @@ config CRYPTO_SALSA20
 	  Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
 
 config CRYPTO_CHACHA20
-	tristate "ChaCha20 cipher algorithm"
+	tristate "ChaCha stream cipher algorithms"
 	select CRYPTO_BLKCIPHER
 	help
-	  ChaCha20 cipher algorithm, RFC7539.
+	  The ChaCha20, XChaCha20, and XChaCha12 stream cipher algorithms.
 
 	  ChaCha20 is a 256-bit high-speed stream cipher designed by Daniel J.
 	  Bernstein and further specified in RFC7539 for use in IETF protocols.
-	  This is the portable C implementation of ChaCha20.
-
-	  See also:
+	  This is the portable C implementation of ChaCha20.  See also:
 	  <http://cr.yp.to/chacha/chacha-20080128.pdf>
 
+	  XChaCha20 is the application of the XSalsa20 construction to ChaCha20
+	  rather than to Salsa20.  XChaCha20 extends ChaCha20's nonce length
+	  from 64 bits (or 96 bits using the RFC7539 convention) to 192 bits,
+	  while provably retaining ChaCha20's security.  See also:
+	  <https://cr.yp.to/snuffle/xsalsa-20081128.pdf>
+
+	  XChaCha12 is XChaCha20 reduced to 12 rounds, with correspondingly
+	  reduced security margin but increased performance.  It can be useful
+	  in performance-sensitive scenarios.
+
 config CRYPTO_CHACHA20_X86_64
-	tristate "ChaCha20 cipher algorithm (x86_64/SSSE3/AVX2)"
+	tristate "ChaCha stream cipher algorithms (x86_64/SSSE3/AVX2/AVX-512VL)"
 	depends on X86 && 64BIT
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_CHACHA20
 	help
-	  ChaCha20 cipher algorithm, RFC7539.
-
-	  ChaCha20 is a 256-bit high-speed stream cipher designed by Daniel J.
-	  Bernstein and further specified in RFC7539 for use in IETF protocols.
-	  This is the x86_64 assembler implementation using SIMD instructions.
-
-	  See also:
-	  <http://cr.yp.to/chacha/chacha-20080128.pdf>
+	  SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20,
+	  XChaCha20, and XChaCha12 stream ciphers.
 
 config CRYPTO_SEED
 	tristate "SEED cipher algorithm"
@@ -1812,7 +1874,8 @@ config CRYPTO_USER_API_AEAD
 	  cipher algorithms.
 
 config CRYPTO_STATS
-	bool
+	bool "Crypto usage statistics for User-space"
+	depends on CRYPTO_USER
 	help
 	  This option enables the gathering of crypto stats.
 	  This will collect:
diff --git a/crypto/Makefile b/crypto/Makefile
index 5c207c76abf7e8b40382d8abec3aa58106c641c9..799ed5e946069ee090ddc0db007f402874ef10f0 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -54,7 +54,8 @@ cryptomgr-y := algboss.o testmgr.o
 
 obj-$(CONFIG_CRYPTO_MANAGER2) += cryptomgr.o
 obj-$(CONFIG_CRYPTO_USER) += crypto_user.o
-crypto_user-y := crypto_user_base.o crypto_user_stat.o
+crypto_user-y := crypto_user_base.o
+crypto_user-$(CONFIG_CRYPTO_STATS) += crypto_user_stat.o
 obj-$(CONFIG_CRYPTO_CMAC) += cmac.o
 obj-$(CONFIG_CRYPTO_HMAC) += hmac.o
 obj-$(CONFIG_CRYPTO_VMAC) += vmac.o
@@ -71,6 +72,7 @@ obj-$(CONFIG_CRYPTO_SHA256) += sha256_generic.o
 obj-$(CONFIG_CRYPTO_SHA512) += sha512_generic.o
 obj-$(CONFIG_CRYPTO_SHA3) += sha3_generic.o
 obj-$(CONFIG_CRYPTO_SM3) += sm3_generic.o
+obj-$(CONFIG_CRYPTO_STREEBOG) += streebog_generic.o
 obj-$(CONFIG_CRYPTO_WP512) += wp512.o
 CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns)  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
 obj-$(CONFIG_CRYPTO_TGR192) += tgr192.o
@@ -84,6 +86,8 @@ obj-$(CONFIG_CRYPTO_LRW) += lrw.o
 obj-$(CONFIG_CRYPTO_XTS) += xts.o
 obj-$(CONFIG_CRYPTO_CTR) += ctr.o
 obj-$(CONFIG_CRYPTO_KEYWRAP) += keywrap.o
+obj-$(CONFIG_CRYPTO_ADIANTUM) += adiantum.o
+obj-$(CONFIG_CRYPTO_NHPOLY1305) += nhpoly1305.o
 obj-$(CONFIG_CRYPTO_GCM) += gcm.o
 obj-$(CONFIG_CRYPTO_CCM) += ccm.o
 obj-$(CONFIG_CRYPTO_CHACHA20POLY1305) += chacha20poly1305.o
@@ -116,7 +120,7 @@ obj-$(CONFIG_CRYPTO_KHAZAD) += khazad.o
 obj-$(CONFIG_CRYPTO_ANUBIS) += anubis.o
 obj-$(CONFIG_CRYPTO_SEED) += seed.o
 obj-$(CONFIG_CRYPTO_SALSA20) += salsa20_generic.o
-obj-$(CONFIG_CRYPTO_CHACHA20) += chacha20_generic.o
+obj-$(CONFIG_CRYPTO_CHACHA20) += chacha_generic.o
 obj-$(CONFIG_CRYPTO_POLY1305) += poly1305_generic.o
 obj-$(CONFIG_CRYPTO_DEFLATE) += deflate.o
 obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o
diff --git a/crypto/ablkcipher.c b/crypto/ablkcipher.c
index 8882e90e868eade5b77fc310fb5238dd322e9f03..b339587073c3e96ff5469369f0bbd9c24bea394f 100644
--- a/crypto/ablkcipher.c
+++ b/crypto/ablkcipher.c
@@ -365,23 +365,18 @@ static int crypto_ablkcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_blkcipher rblkcipher;
 
-	strncpy(rblkcipher.type, "ablkcipher", sizeof(rblkcipher.type));
-	strncpy(rblkcipher.geniv, alg->cra_ablkcipher.geniv ?: "<default>",
-		sizeof(rblkcipher.geniv));
-	rblkcipher.geniv[sizeof(rblkcipher.geniv) - 1] = '\0';
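+	/*
+	 * Zero the whole report so no uninitialized stack memory (e.g. struct
+	 * padding) is copied to userspace via netlink.
+	 */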
+	memset(&rblkcipher, 0, sizeof(rblkcipher));
+
+	strscpy(rblkcipher.type, "ablkcipher", sizeof(rblkcipher.type));
+	strscpy(rblkcipher.geniv, "<default>", sizeof(rblkcipher.geniv));
 
 	rblkcipher.blocksize = alg->cra_blocksize;
 	rblkcipher.min_keysize = alg->cra_ablkcipher.min_keysize;
 	rblkcipher.max_keysize = alg->cra_ablkcipher.max_keysize;
 	rblkcipher.ivsize = alg->cra_ablkcipher.ivsize;
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER,
-		    sizeof(struct crypto_report_blkcipher), &rblkcipher))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER,
+		       sizeof(rblkcipher), &rblkcipher);
 }
 #else
 static int crypto_ablkcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
@@ -403,7 +398,7 @@ static void crypto_ablkcipher_show(struct seq_file *m, struct crypto_alg *alg)
 	seq_printf(m, "min keysize  : %u\n", ablkcipher->min_keysize);
 	seq_printf(m, "max keysize  : %u\n", ablkcipher->max_keysize);
 	seq_printf(m, "ivsize       : %u\n", ablkcipher->ivsize);
-	seq_printf(m, "geniv        : %s\n", ablkcipher->geniv ?: "<default>");
+	seq_printf(m, "geniv        : <default>\n");
 }
 
 const struct crypto_type crypto_ablkcipher_type = {
@@ -415,78 +410,3 @@ const struct crypto_type crypto_ablkcipher_type = {
 	.report = crypto_ablkcipher_report,
 };
 EXPORT_SYMBOL_GPL(crypto_ablkcipher_type);
-
-static int crypto_init_givcipher_ops(struct crypto_tfm *tfm, u32 type,
-				      u32 mask)
-{
-	struct ablkcipher_alg *alg = &tfm->__crt_alg->cra_ablkcipher;
-	struct ablkcipher_tfm *crt = &tfm->crt_ablkcipher;
-
-	if (alg->ivsize > PAGE_SIZE / 8)
-		return -EINVAL;
-
-	crt->setkey = tfm->__crt_alg->cra_flags & CRYPTO_ALG_GENIV ?
-		      alg->setkey : setkey;
-	crt->encrypt = alg->encrypt;
-	crt->decrypt = alg->decrypt;
-	crt->base = __crypto_ablkcipher_cast(tfm);
-	crt->ivsize = alg->ivsize;
-
-	return 0;
-}
-
-#ifdef CONFIG_NET
-static int crypto_givcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_report_blkcipher rblkcipher;
-
-	strncpy(rblkcipher.type, "givcipher", sizeof(rblkcipher.type));
-	strncpy(rblkcipher.geniv, alg->cra_ablkcipher.geniv ?: "<built-in>",
-		sizeof(rblkcipher.geniv));
-	rblkcipher.geniv[sizeof(rblkcipher.geniv) - 1] = '\0';
-
-	rblkcipher.blocksize = alg->cra_blocksize;
-	rblkcipher.min_keysize = alg->cra_ablkcipher.min_keysize;
-	rblkcipher.max_keysize = alg->cra_ablkcipher.max_keysize;
-	rblkcipher.ivsize = alg->cra_ablkcipher.ivsize;
-
-	if (nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER,
-		    sizeof(struct crypto_report_blkcipher), &rblkcipher))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
-}
-#else
-static int crypto_givcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	return -ENOSYS;
-}
-#endif
-
-static void crypto_givcipher_show(struct seq_file *m, struct crypto_alg *alg)
-	__maybe_unused;
-static void crypto_givcipher_show(struct seq_file *m, struct crypto_alg *alg)
-{
-	struct ablkcipher_alg *ablkcipher = &alg->cra_ablkcipher;
-
-	seq_printf(m, "type         : givcipher\n");
-	seq_printf(m, "async        : %s\n", alg->cra_flags & CRYPTO_ALG_ASYNC ?
-					     "yes" : "no");
-	seq_printf(m, "blocksize    : %u\n", alg->cra_blocksize);
-	seq_printf(m, "min keysize  : %u\n", ablkcipher->min_keysize);
-	seq_printf(m, "max keysize  : %u\n", ablkcipher->max_keysize);
-	seq_printf(m, "ivsize       : %u\n", ablkcipher->ivsize);
-	seq_printf(m, "geniv        : %s\n", ablkcipher->geniv ?: "<built-in>");
-}
-
-const struct crypto_type crypto_givcipher_type = {
-	.ctxsize = crypto_ablkcipher_ctxsize,
-	.init = crypto_init_givcipher_ops,
-#ifdef CONFIG_PROC_FS
-	.show = crypto_givcipher_show,
-#endif
-	.report = crypto_givcipher_report,
-};
-EXPORT_SYMBOL_GPL(crypto_givcipher_type);
diff --git a/crypto/acompress.c b/crypto/acompress.c
index 1544b7c057fb5660e593c64d2033542fa9d1ea47..0c5bedd06e705525ab2cbc49a2c951713fa3d633 100644
--- a/crypto/acompress.c
+++ b/crypto/acompress.c
@@ -33,15 +33,11 @@ static int crypto_acomp_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_acomp racomp;
 
-	strncpy(racomp.type, "acomp", sizeof(racomp.type));
+	memset(&racomp, 0, sizeof(racomp));
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_ACOMP,
-		    sizeof(struct crypto_report_acomp), &racomp))
-		goto nla_put_failure;
-	return 0;
+	strscpy(racomp.type, "acomp", sizeof(racomp.type));
 
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_ACOMP, sizeof(racomp), &racomp);
 }
 #else
 static int crypto_acomp_report(struct sk_buff *skb, struct crypto_alg *alg)
diff --git a/crypto/adiantum.c b/crypto/adiantum.c
new file mode 100644
index 0000000000000000000000000000000000000000..6651e713c45d6b2e3af4955cdecde6c671a2c874
--- /dev/null
+++ b/crypto/adiantum.c
@@ -0,0 +1,664 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Adiantum length-preserving encryption mode
+ *
+ * Copyright 2018 Google LLC
+ */
+
+/*
+ * Adiantum is a tweakable, length-preserving encryption mode designed for fast
+ * and secure disk encryption, especially on CPUs without dedicated crypto
+ * instructions.  Adiantum encrypts each sector using the XChaCha12 stream
+ * cipher, two passes of an ε-almost-∆-universal (ε-∆U) hash function based on
+ * NH and Poly1305, and an invocation of the AES-256 block cipher on a single
+ * 16-byte block.  See the paper for details:
+ *
+ *	Adiantum: length-preserving encryption for entry-level processors
+ *	(https://eprint.iacr.org/2018/720.pdf)
+ *
+ * For flexibility, this implementation also allows other ciphers:
+ *
+ *	- Stream cipher: XChaCha12 or XChaCha20
+ *	- Block cipher: any with a 128-bit block size and 256-bit key
+ *
+ * This implementation doesn't currently allow other ε-∆U hash functions, i.e.
+ * HPolyC is not supported.  This is because Adiantum is ~20% faster than HPolyC
+ * but still provably as secure, and also the ε-∆U hash function of HBSH is
+ * formally defined to take two inputs (tweak, message) which makes it difficult
+ * to wrap with the crypto_shash API.  Rather, some details need to be handled
+ * here.  Nevertheless, if needed in the future, support for other ε-∆U hash
+ * functions could be added here.
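+ *
+ * An Adiantum instance is requested by template name, e.g.
+ * "adiantum(xchacha12,aes)"; a specific NHPoly1305 implementation may be
+ * given as an optional third parameter.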
+ */
+
+#include <crypto/b128ops.h>
+#include <crypto/chacha.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/nhpoly1305.h>
+#include <crypto/scatterwalk.h>
+#include <linux/module.h>
+
+#include "internal.h"
+
+/*
+ * Size of right-hand part of input data, in bytes; also the size of the block
+ * cipher's block size and the hash function's output.
+ */
+#define BLOCKCIPHER_BLOCK_SIZE		16
+
+/* Size of the block cipher key (K_E) in bytes */
+#define BLOCKCIPHER_KEY_SIZE		32
+
+/* Size of the hash key (K_H) in bytes */
+#define HASH_KEY_SIZE		(POLY1305_BLOCK_SIZE + NHPOLY1305_KEY_SIZE)
+
+/*
+ * The specification allows variable-length tweaks, but Linux's crypto API
+ * currently only allows algorithms to support a single length.  The "natural"
+ * tweak length for Adiantum is 16, since that fits into one Poly1305 block for
+ * the best performance.  But longer tweaks are useful for fscrypt, to avoid
+ * needing to derive per-file keys.  So instead we use two blocks, or 32 bytes.
+ */
+#define TWEAK_SIZE		32
+
+struct adiantum_instance_ctx {
+	struct crypto_skcipher_spawn streamcipher_spawn;
+	struct crypto_spawn blockcipher_spawn;
+	struct crypto_shash_spawn hash_spawn;
+};
+
+struct adiantum_tfm_ctx {
+	struct crypto_skcipher *streamcipher;
+	struct crypto_cipher *blockcipher;
+	struct crypto_shash *hash;
+	struct poly1305_key header_hash_key;
+};
+
+struct adiantum_request_ctx {
+
+	/*
+	 * Buffer for right-hand part of data, i.e.
+	 *
+	 *    P_R => P_M => C_M => C_R when encrypting, or
+	 *    C_R => C_M => P_M => P_R when decrypting.
+	 *
+	 * Also used to build the IV for the stream cipher.
+	 */
+	union {
+		u8 bytes[XCHACHA_IV_SIZE];
+		__le32 words[XCHACHA_IV_SIZE / sizeof(__le32)];
+		le128 bignum;	/* interpret as element of Z/(2^{128}Z) */
+	} rbuf;
+
+	bool enc; /* true if encrypting, false if decrypting */
+
+	/*
+	 * The result of the Poly1305 ε-∆U hash function applied to
+	 * (bulk length, tweak)
+	 */
+	le128 header_hash;
+
+	/* Sub-requests, must be last */
+	union {
+		struct shash_desc hash_desc;
+		struct skcipher_request streamcipher_req;
+	} u;
+};
+
+/*
+ * Given the XChaCha stream key K_S, derive the block cipher key K_E and the
+ * hash key K_H as follows:
+ *
+ *     K_E || K_H || ... = XChaCha(key=K_S, nonce=1||0^191)
+ *
+ * Note that this denotes using bits from the XChaCha keystream, which here we
+ * get indirectly by encrypting a buffer containing all 0's.
+ */
+static int adiantum_setkey(struct crypto_skcipher *tfm, const u8 *key,
+			   unsigned int keylen)
+{
+	struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+	struct {
+		u8 iv[XCHACHA_IV_SIZE];
+		u8 derived_keys[BLOCKCIPHER_KEY_SIZE + HASH_KEY_SIZE];
+		struct scatterlist sg;
+		struct crypto_wait wait;
+		struct skcipher_request req; /* must be last */
+	} *data;
+	u8 *keyp;
+	int err;
+
+	/* Set the stream cipher key (K_S) */
+	crypto_skcipher_clear_flags(tctx->streamcipher, CRYPTO_TFM_REQ_MASK);
+	crypto_skcipher_set_flags(tctx->streamcipher,
+				  crypto_skcipher_get_flags(tfm) &
+				  CRYPTO_TFM_REQ_MASK);
+	err = crypto_skcipher_setkey(tctx->streamcipher, key, keylen);
+	crypto_skcipher_set_flags(tfm,
+				crypto_skcipher_get_flags(tctx->streamcipher) &
+				CRYPTO_TFM_RES_MASK);
+	if (err)
+		return err;
+
+	/* Derive the subkeys */
+	data = kzalloc(sizeof(*data) +
+		       crypto_skcipher_reqsize(tctx->streamcipher), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
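+	/* nonce = 1 || 0^191: only byte 0 is set; kzalloc() zeroed the rest */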
+	data->iv[0] = 1;
+	sg_init_one(&data->sg, data->derived_keys, sizeof(data->derived_keys));
+	crypto_init_wait(&data->wait);
+	skcipher_request_set_tfm(&data->req, tctx->streamcipher);
+	skcipher_request_set_callback(&data->req, CRYPTO_TFM_REQ_MAY_SLEEP |
+						  CRYPTO_TFM_REQ_MAY_BACKLOG,
+				      crypto_req_done, &data->wait);
+	skcipher_request_set_crypt(&data->req, &data->sg, &data->sg,
+				   sizeof(data->derived_keys), data->iv);
+	err = crypto_wait_req(crypto_skcipher_encrypt(&data->req), &data->wait);
+	if (err)
+		goto out;
+	keyp = data->derived_keys;
+
+	/* Set the block cipher key (K_E) */
+	crypto_cipher_clear_flags(tctx->blockcipher, CRYPTO_TFM_REQ_MASK);
+	crypto_cipher_set_flags(tctx->blockcipher,
+				crypto_skcipher_get_flags(tfm) &
+				CRYPTO_TFM_REQ_MASK);
+	err = crypto_cipher_setkey(tctx->blockcipher, keyp,
+				   BLOCKCIPHER_KEY_SIZE);
+	crypto_skcipher_set_flags(tfm,
+				  crypto_cipher_get_flags(tctx->blockcipher) &
+				  CRYPTO_TFM_RES_MASK);
+	if (err)
+		goto out;
+	keyp += BLOCKCIPHER_KEY_SIZE;
+
+	/* Set the hash key (K_H) */
+	poly1305_core_setkey(&tctx->header_hash_key, keyp);
+	keyp += POLY1305_BLOCK_SIZE;
+
+	crypto_shash_clear_flags(tctx->hash, CRYPTO_TFM_REQ_MASK);
+	crypto_shash_set_flags(tctx->hash, crypto_skcipher_get_flags(tfm) &
+					   CRYPTO_TFM_REQ_MASK);
+	err = crypto_shash_setkey(tctx->hash, keyp, NHPOLY1305_KEY_SIZE);
+	crypto_skcipher_set_flags(tfm, crypto_shash_get_flags(tctx->hash) &
+				       CRYPTO_TFM_RES_MASK);
+	keyp += NHPOLY1305_KEY_SIZE;
+	WARN_ON(keyp != &data->derived_keys[ARRAY_SIZE(data->derived_keys)]);
+out:
+	kzfree(data);
+	return err;
+}
+
+/* Addition in Z/(2^{128}Z) */
+static inline void le128_add(le128 *r, const le128 *v1, const le128 *v2)
+{
+	u64 x = le64_to_cpu(v1->b);
+	u64 y = le64_to_cpu(v2->b);
+
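+	/* (x + y < x) is the carry out of the low word's addition */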
+	r->b = cpu_to_le64(x + y);
+	r->a = cpu_to_le64(le64_to_cpu(v1->a) + le64_to_cpu(v2->a) +
+			   (x + y < x));
+}
+
+/* Subtraction in Z/(2^{128}Z) */
+static inline void le128_sub(le128 *r, const le128 *v1, const le128 *v2)
+{
+	u64 x = le64_to_cpu(v1->b);
+	u64 y = le64_to_cpu(v2->b);
+
+	r->b = cpu_to_le64(x - y);
+	r->a = cpu_to_le64(le64_to_cpu(v1->a) - le64_to_cpu(v2->a) -
+			   (x - y > x));
+}
+
+/*
+ * Apply the Poly1305 ε-∆U hash function to (bulk length, tweak) and save the
+ * result to rctx->header_hash.  This is the calculation
+ *
+ *	H_T ← Poly1305_{K_T}(bin_{128}(|L|) || T)
+ *
+ * from the procedure in section 6.4 of the Adiantum paper.  The resulting value
+ * is reused in both the first and second hash steps.  Specifically, it's added
+ * to the result of an independently keyed ε-∆U hash function (for equal length
+ * inputs only) taken over the left-hand part (the "bulk") of the message, to
+ * give the overall Adiantum hash of the (tweak, left-hand part) pair.
+ */
+static void adiantum_hash_header(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	const struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+	struct adiantum_request_ctx *rctx = skcipher_request_ctx(req);
+	const unsigned int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE;
+	struct {
+		__le64 message_bits;
+		__le64 padding;
+	} header = {
+		.message_bits = cpu_to_le64((u64)bulk_len * 8)
+	};
+	struct poly1305_state state;
+
+	poly1305_core_init(&state);
+
+	BUILD_BUG_ON(sizeof(header) % POLY1305_BLOCK_SIZE != 0);
+	poly1305_core_blocks(&state, &tctx->header_hash_key,
+			     &header, sizeof(header) / POLY1305_BLOCK_SIZE);
+
+	BUILD_BUG_ON(TWEAK_SIZE % POLY1305_BLOCK_SIZE != 0);
+	poly1305_core_blocks(&state, &tctx->header_hash_key, req->iv,
+			     TWEAK_SIZE / POLY1305_BLOCK_SIZE);
+
+	poly1305_core_emit(&state, &rctx->header_hash);
+}
+
+/* Hash the left-hand part (the "bulk") of the message using NHPoly1305 */
+static int adiantum_hash_message(struct skcipher_request *req,
+				 struct scatterlist *sgl, le128 *digest)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	const struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+	struct adiantum_request_ctx *rctx = skcipher_request_ctx(req);
+	const unsigned int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE;
+	struct shash_desc *hash_desc = &rctx->u.hash_desc;
+	struct sg_mapping_iter miter;
+	unsigned int i, n;
+	int err;
+
+	hash_desc->tfm = tctx->hash;
+	hash_desc->flags = 0;
+
+	err = crypto_shash_init(hash_desc);
+	if (err)
+		return err;
+
+	sg_miter_start(&miter, sgl, sg_nents(sgl),
+		       SG_MITER_FROM_SG | SG_MITER_ATOMIC);
+	for (i = 0; i < bulk_len; i += n) {
+		sg_miter_next(&miter);
+		n = min_t(unsigned int, miter.length, bulk_len - i);
+		err = crypto_shash_update(hash_desc, miter.addr, n);
+		if (err)
+			break;
+	}
+	sg_miter_stop(&miter);
+	if (err)
+		return err;
+
+	return crypto_shash_final(hash_desc, (u8 *)digest);
+}
+
+/* Continue Adiantum encryption/decryption after the stream cipher step */
+static int adiantum_finish(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	const struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+	struct adiantum_request_ctx *rctx = skcipher_request_ctx(req);
+	const unsigned int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE;
+	le128 digest;
+	int err;
+
+	/* If decrypting, decrypt C_M with the block cipher to get P_M */
+	if (!rctx->enc)
+		crypto_cipher_decrypt_one(tctx->blockcipher, rctx->rbuf.bytes,
+					  rctx->rbuf.bytes);
+
+	/*
+	 * Second hash step
+	 *	enc: C_R = C_M - H_{K_H}(T, C_L)
+	 *	dec: P_R = P_M - H_{K_H}(T, P_L)
+	 */
+	err = adiantum_hash_message(req, req->dst, &digest);
+	if (err)
+		return err;
+	le128_add(&digest, &digest, &rctx->header_hash);
+	le128_sub(&rctx->rbuf.bignum, &rctx->rbuf.bignum, &digest);
+	scatterwalk_map_and_copy(&rctx->rbuf.bignum, req->dst,
+				 bulk_len, BLOCKCIPHER_BLOCK_SIZE, 1);
+	return 0;
+}
+
+static void adiantum_streamcipher_done(struct crypto_async_request *areq,
+				       int err)
+{
+	struct skcipher_request *req = areq->data;
+
+	if (!err)
+		err = adiantum_finish(req);
+
+	skcipher_request_complete(req, err);
+}
+
+static int adiantum_crypt(struct skcipher_request *req, bool enc)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	const struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+	struct adiantum_request_ctx *rctx = skcipher_request_ctx(req);
+	const unsigned int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE;
+	unsigned int stream_len;
+	le128 digest;
+	int err;
+
+	if (req->cryptlen < BLOCKCIPHER_BLOCK_SIZE)
+		return -EINVAL;
+
+	rctx->enc = enc;
+
+	/*
+	 * First hash step
+	 *	enc: P_M = P_R + H_{K_H}(T, P_L)
+	 *	dec: C_M = C_R + H_{K_H}(T, C_L)
+	 */
+	adiantum_hash_header(req);
+	err = adiantum_hash_message(req, req->src, &digest);
+	if (err)
+		return err;
+	le128_add(&digest, &digest, &rctx->header_hash);
+	scatterwalk_map_and_copy(&rctx->rbuf.bignum, req->src,
+				 bulk_len, BLOCKCIPHER_BLOCK_SIZE, 0);
+	le128_add(&rctx->rbuf.bignum, &rctx->rbuf.bignum, &digest);
+
+	/* If encrypting, encrypt P_M with the block cipher to get C_M */
+	if (enc)
+		crypto_cipher_encrypt_one(tctx->blockcipher, rctx->rbuf.bytes,
+					  rctx->rbuf.bytes);
+
+	/* Initialize the rest of the XChaCha IV (first part is C_M) */
+	BUILD_BUG_ON(BLOCKCIPHER_BLOCK_SIZE != 16);
+	BUILD_BUG_ON(XCHACHA_IV_SIZE != 32);	/* nonce || stream position */
+	rctx->rbuf.words[4] = cpu_to_le32(1);
+	rctx->rbuf.words[5] = 0;
+	rctx->rbuf.words[6] = 0;
+	rctx->rbuf.words[7] = 0;
+
+	/*
+	 * XChaCha needs to be done on all the data except the last 16 bytes;
+	 * for disk encryption that usually means 4080 or 496 bytes.  But ChaCha
+	 * implementations tend to be most efficient when passed a whole number
+	 * of 64-byte ChaCha blocks, or sometimes even a multiple of 256 bytes.
+	 * And here it doesn't matter whether the last 16 bytes are written to,
+	 * as the second hash step will overwrite them.  Thus, round the XChaCha
+	 * length up to the next 64-byte boundary if possible.
+	 */
+	stream_len = bulk_len;
+	if (round_up(stream_len, CHACHA_BLOCK_SIZE) <= req->cryptlen)
+		stream_len = round_up(stream_len, CHACHA_BLOCK_SIZE);
+
+	skcipher_request_set_tfm(&rctx->u.streamcipher_req, tctx->streamcipher);
+	skcipher_request_set_crypt(&rctx->u.streamcipher_req, req->src,
+				   req->dst, stream_len, &rctx->rbuf);
+	skcipher_request_set_callback(&rctx->u.streamcipher_req,
+				      req->base.flags,
+				      adiantum_streamcipher_done, req);
+	return crypto_skcipher_encrypt(&rctx->u.streamcipher_req) ?:
+		adiantum_finish(req);
+}
+
+static int adiantum_encrypt(struct skcipher_request *req)
+{
+	return adiantum_crypt(req, true);
+}
+
+static int adiantum_decrypt(struct skcipher_request *req)
+{
+	return adiantum_crypt(req, false);
+}
+
+static int adiantum_init_tfm(struct crypto_skcipher *tfm)
+{
+	struct skcipher_instance *inst = skcipher_alg_instance(tfm);
+	struct adiantum_instance_ctx *ictx = skcipher_instance_ctx(inst);
+	struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+	struct crypto_skcipher *streamcipher;
+	struct crypto_cipher *blockcipher;
+	struct crypto_shash *hash;
+	unsigned int subreq_size;
+	int err;
+
+	streamcipher = crypto_spawn_skcipher(&ictx->streamcipher_spawn);
+	if (IS_ERR(streamcipher))
+		return PTR_ERR(streamcipher);
+
+	blockcipher = crypto_spawn_cipher(&ictx->blockcipher_spawn);
+	if (IS_ERR(blockcipher)) {
+		err = PTR_ERR(blockcipher);
+		goto err_free_streamcipher;
+	}
+
+	hash = crypto_spawn_shash(&ictx->hash_spawn);
+	if (IS_ERR(hash)) {
+		err = PTR_ERR(hash);
+		goto err_free_blockcipher;
+	}
+
+	tctx->streamcipher = streamcipher;
+	tctx->blockcipher = blockcipher;
+	tctx->hash = hash;
+
+	BUILD_BUG_ON(offsetofend(struct adiantum_request_ctx, u) !=
+		     sizeof(struct adiantum_request_ctx));
+	subreq_size = max(FIELD_SIZEOF(struct adiantum_request_ctx,
+				       u.hash_desc) +
+			  crypto_shash_descsize(hash),
+			  FIELD_SIZEOF(struct adiantum_request_ctx,
+				       u.streamcipher_req) +
+			  crypto_skcipher_reqsize(streamcipher));
+
+	crypto_skcipher_set_reqsize(tfm,
+				    offsetof(struct adiantum_request_ctx, u) +
+				    subreq_size);
+	return 0;
+
+err_free_blockcipher:
+	crypto_free_cipher(blockcipher);
+err_free_streamcipher:
+	crypto_free_skcipher(streamcipher);
+	return err;
+}
+
+static void adiantum_exit_tfm(struct crypto_skcipher *tfm)
+{
+	struct adiantum_tfm_ctx *tctx = crypto_skcipher_ctx(tfm);
+
+	crypto_free_skcipher(tctx->streamcipher);
+	crypto_free_cipher(tctx->blockcipher);
+	crypto_free_shash(tctx->hash);
+}
+
+static void adiantum_free_instance(struct skcipher_instance *inst)
+{
+	struct adiantum_instance_ctx *ictx = skcipher_instance_ctx(inst);
+
+	crypto_drop_skcipher(&ictx->streamcipher_spawn);
+	crypto_drop_spawn(&ictx->blockcipher_spawn);
+	crypto_drop_shash(&ictx->hash_spawn);
+	kfree(inst);
+}
+
+/*
+ * Check for a supported set of inner algorithms.
+ * See the comment at the beginning of this file.
+ */
+static bool adiantum_supported_algorithms(struct skcipher_alg *streamcipher_alg,
+					  struct crypto_alg *blockcipher_alg,
+					  struct shash_alg *hash_alg)
+{
+	if (strcmp(streamcipher_alg->base.cra_name, "xchacha12") != 0 &&
+	    strcmp(streamcipher_alg->base.cra_name, "xchacha20") != 0)
+		return false;
+
+	if (blockcipher_alg->cra_cipher.cia_min_keysize > BLOCKCIPHER_KEY_SIZE ||
+	    blockcipher_alg->cra_cipher.cia_max_keysize < BLOCKCIPHER_KEY_SIZE)
+		return false;
+	if (blockcipher_alg->cra_blocksize != BLOCKCIPHER_BLOCK_SIZE)
+		return false;
+
+	if (strcmp(hash_alg->base.cra_name, "nhpoly1305") != 0)
+		return false;
+
+	return true;
+}
+
+static int adiantum_create(struct crypto_template *tmpl, struct rtattr **tb)
+{
+	struct crypto_attr_type *algt;
+	const char *streamcipher_name;
+	const char *blockcipher_name;
+	const char *nhpoly1305_name;
+	struct skcipher_instance *inst;
+	struct adiantum_instance_ctx *ictx;
+	struct skcipher_alg *streamcipher_alg;
+	struct crypto_alg *blockcipher_alg;
+	struct crypto_alg *_hash_alg;
+	struct shash_alg *hash_alg;
+	int err;
+
+	algt = crypto_get_attr_type(tb);
+	if (IS_ERR(algt))
+		return PTR_ERR(algt);
+
+	if ((algt->type ^ CRYPTO_ALG_TYPE_SKCIPHER) & algt->mask)
+		return -EINVAL;
+
+	streamcipher_name = crypto_attr_alg_name(tb[1]);
+	if (IS_ERR(streamcipher_name))
+		return PTR_ERR(streamcipher_name);
+
+	blockcipher_name = crypto_attr_alg_name(tb[2]);
+	if (IS_ERR(blockcipher_name))
+		return PTR_ERR(blockcipher_name);
+
+	nhpoly1305_name = crypto_attr_alg_name(tb[3]);
+	if (nhpoly1305_name == ERR_PTR(-ENOENT))
+		nhpoly1305_name = "nhpoly1305";
+	if (IS_ERR(nhpoly1305_name))
+		return PTR_ERR(nhpoly1305_name);
+
+	inst = kzalloc(sizeof(*inst) + sizeof(*ictx), GFP_KERNEL);
+	if (!inst)
+		return -ENOMEM;
+	ictx = skcipher_instance_ctx(inst);
+
+	/* Stream cipher, e.g. "xchacha12" */
+	err = crypto_grab_skcipher(&ictx->streamcipher_spawn, streamcipher_name,
+				   0, crypto_requires_sync(algt->type,
+							   algt->mask));
+	if (err)
+		goto out_free_inst;
+	streamcipher_alg = crypto_spawn_skcipher_alg(&ictx->streamcipher_spawn);
+
+	/* Block cipher, e.g. "aes" */
+	err = crypto_grab_spawn(&ictx->blockcipher_spawn, blockcipher_name,
+				CRYPTO_ALG_TYPE_CIPHER, CRYPTO_ALG_TYPE_MASK);
+	if (err)
+		goto out_drop_streamcipher;
+	blockcipher_alg = ictx->blockcipher_spawn.alg;
+
+	/* NHPoly1305 ε-∆U hash function */
+	_hash_alg = crypto_alg_mod_lookup(nhpoly1305_name,
+					  CRYPTO_ALG_TYPE_SHASH,
+					  CRYPTO_ALG_TYPE_MASK);
+	if (IS_ERR(_hash_alg)) {
+		err = PTR_ERR(_hash_alg);
+		goto out_drop_blockcipher;
+	}
+	hash_alg = __crypto_shash_alg(_hash_alg);
+	err = crypto_init_shash_spawn(&ictx->hash_spawn, hash_alg,
+				      skcipher_crypto_instance(inst));
+	if (err)
+		goto out_put_hash;
+
+	/* Check the set of algorithms */
+	if (!adiantum_supported_algorithms(streamcipher_alg, blockcipher_alg,
+					   hash_alg)) {
+		pr_warn("Unsupported Adiantum instantiation: (%s,%s,%s)\n",
+			streamcipher_alg->base.cra_name,
+			blockcipher_alg->cra_name, hash_alg->base.cra_name);
+		err = -EINVAL;
+		goto out_drop_hash;
+	}
+
+	/* Instance fields */
+
+	err = -ENAMETOOLONG;
+	if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
+		     "adiantum(%s,%s)", streamcipher_alg->base.cra_name,
+		     blockcipher_alg->cra_name) >= CRYPTO_MAX_ALG_NAME)
+		goto out_drop_hash;
+	if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME,
+		     "adiantum(%s,%s,%s)",
+		     streamcipher_alg->base.cra_driver_name,
+		     blockcipher_alg->cra_driver_name,
+		     hash_alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
+		goto out_drop_hash;
+
+	inst->alg.base.cra_flags = streamcipher_alg->base.cra_flags &
+				   CRYPTO_ALG_ASYNC;
+	inst->alg.base.cra_blocksize = BLOCKCIPHER_BLOCK_SIZE;
+	inst->alg.base.cra_ctxsize = sizeof(struct adiantum_tfm_ctx);
+	inst->alg.base.cra_alignmask = streamcipher_alg->base.cra_alignmask |
+				       hash_alg->base.cra_alignmask;
+	/*
+	 * The block cipher is only invoked once per message, so for long
+	 * messages (e.g. sectors for disk encryption) its performance doesn't
+	 * matter as much as that of the stream cipher and hash function.  Thus,
+	 * weigh the block cipher's ->cra_priority less.
+	 */
+	inst->alg.base.cra_priority = (4 * streamcipher_alg->base.cra_priority +
+				       2 * hash_alg->base.cra_priority +
+				       blockcipher_alg->cra_priority) / 7;
+
+	inst->alg.setkey = adiantum_setkey;
+	inst->alg.encrypt = adiantum_encrypt;
+	inst->alg.decrypt = adiantum_decrypt;
+	inst->alg.init = adiantum_init_tfm;
+	inst->alg.exit = adiantum_exit_tfm;
+	inst->alg.min_keysize = crypto_skcipher_alg_min_keysize(streamcipher_alg);
+	inst->alg.max_keysize = crypto_skcipher_alg_max_keysize(streamcipher_alg);
+	inst->alg.ivsize = TWEAK_SIZE;
+
+	inst->free = adiantum_free_instance;
+
+	err = skcipher_register_instance(tmpl, inst);
+	if (err)
+		goto out_drop_hash;
+
+	crypto_mod_put(_hash_alg);
+	return 0;
+
+out_drop_hash:
+	crypto_drop_shash(&ictx->hash_spawn);
+out_put_hash:
+	crypto_mod_put(_hash_alg);
+out_drop_blockcipher:
+	crypto_drop_spawn(&ictx->blockcipher_spawn);
+out_drop_streamcipher:
+	crypto_drop_skcipher(&ictx->streamcipher_spawn);
+out_free_inst:
+	kfree(inst);
+	return err;
+}
+
+/* adiantum(streamcipher_name, blockcipher_name [, nhpoly1305_name]) */
+static struct crypto_template adiantum_tmpl = {
+	.name = "adiantum",
+	.create = adiantum_create,
+	.module = THIS_MODULE,
+};
+
+static int __init adiantum_module_init(void)
+{
+	return crypto_register_template(&adiantum_tmpl);
+}
+
+static void __exit adiantum_module_exit(void)
+{
+	crypto_unregister_template(&adiantum_tmpl);
+}
+
+module_init(adiantum_module_init);
+module_exit(adiantum_module_exit);
+
+MODULE_DESCRIPTION("Adiantum length-preserving encryption mode");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("adiantum");
diff --git a/crypto/aead.c b/crypto/aead.c
index 60b3bbe973e752a879f39ac5ca5af712560ad888..189c52d1f63abd72093fffc6b70be665d43d61aa 100644
--- a/crypto/aead.c
+++ b/crypto/aead.c
@@ -119,20 +119,16 @@ static int crypto_aead_report(struct sk_buff *skb, struct crypto_alg *alg)
 	struct crypto_report_aead raead;
 	struct aead_alg *aead = container_of(alg, struct aead_alg, base);
 
-	strncpy(raead.type, "aead", sizeof(raead.type));
-	strncpy(raead.geniv, "<none>", sizeof(raead.geniv));
+	memset(&raead, 0, sizeof(raead));
+
+	strscpy(raead.type, "aead", sizeof(raead.type));
+	strscpy(raead.geniv, "<none>", sizeof(raead.geniv));
 
 	raead.blocksize = alg->cra_blocksize;
 	raead.maxauthsize = aead->maxauthsize;
 	raead.ivsize = aead->ivsize;
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_AEAD,
-		    sizeof(struct crypto_report_aead), &raead))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_AEAD, sizeof(raead), &raead);
 }
 #else
 static int crypto_aead_report(struct sk_buff *skb, struct crypto_alg *alg)
diff --git a/crypto/aes_generic.c b/crypto/aes_generic.c
index ca554d57d01e9739f42b2d06d95bf8ccfd5572c3..13df33aca46310e7c6381ad63d558e7a51c3c55f 100644
--- a/crypto/aes_generic.c
+++ b/crypto/aes_generic.c
@@ -63,7 +63,8 @@ static inline u8 byte(const u32 x, const unsigned n)
 
 static const u32 rco_tab[10] = { 1, 2, 4, 8, 16, 32, 64, 128, 27, 54 };
 
-__visible const u32 crypto_ft_tab[4][256] = {
+/* cacheline-aligned to facilitate prefetching into cache */
+__visible const u32 crypto_ft_tab[4][256] __cacheline_aligned = {
 	{
 		0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6,
 		0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591,
@@ -327,7 +328,7 @@ __visible const u32 crypto_ft_tab[4][256] = {
 	}
 };
 
-__visible const u32 crypto_fl_tab[4][256] = {
+__visible const u32 crypto_fl_tab[4][256] __cacheline_aligned = {
 	{
 		0x00000063, 0x0000007c, 0x00000077, 0x0000007b,
 		0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5,
@@ -591,7 +592,7 @@ __visible const u32 crypto_fl_tab[4][256] = {
 	}
 };
 
-__visible const u32 crypto_it_tab[4][256] = {
+__visible const u32 crypto_it_tab[4][256] __cacheline_aligned = {
 	{
 		0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a,
 		0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b,
@@ -855,7 +856,7 @@ __visible const u32 crypto_it_tab[4][256] = {
 	}
 };
 
-__visible const u32 crypto_il_tab[4][256] = {
+__visible const u32 crypto_il_tab[4][256] __cacheline_aligned = {
 	{
 		0x00000052, 0x00000009, 0x0000006a, 0x000000d5,
 		0x00000030, 0x00000036, 0x000000a5, 0x00000038,
diff --git a/crypto/aes_ti.c b/crypto/aes_ti.c
index 03023b2290e8ec6e790679a30ddf4763e0476923..1ff9785b30f5568e121c374770d2eaee58f4e3d9 100644
--- a/crypto/aes_ti.c
+++ b/crypto/aes_ti.c
@@ -269,6 +269,7 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	const u32 *rkp = ctx->key_enc + 4;
 	int rounds = 6 + ctx->key_length / 4;
 	u32 st0[4], st1[4];
+	unsigned long flags;
 	int round;
 
 	st0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in);
@@ -276,6 +277,12 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	st0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8);
 	st0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12);
 
+	/*
+	 * Temporarily disable interrupts to avoid races where cachelines are
+	 * evicted when the CPU is interrupted to do something else.
+	 */
+	local_irq_save(flags);
+
 	st0[0] ^= __aesti_sbox[ 0] ^ __aesti_sbox[128];
 	st0[1] ^= __aesti_sbox[32] ^ __aesti_sbox[160];
 	st0[2] ^= __aesti_sbox[64] ^ __aesti_sbox[192];
@@ -300,6 +307,8 @@ static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	put_unaligned_le32(subshift(st1, 1) ^ rkp[5], out + 4);
 	put_unaligned_le32(subshift(st1, 2) ^ rkp[6], out + 8);
 	put_unaligned_le32(subshift(st1, 3) ^ rkp[7], out + 12);
+
+	local_irq_restore(flags);
 }
 
 static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
@@ -308,6 +317,7 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	const u32 *rkp = ctx->key_dec + 4;
 	int rounds = 6 + ctx->key_length / 4;
 	u32 st0[4], st1[4];
+	unsigned long flags;
 	int round;
 
 	st0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in);
@@ -315,6 +325,12 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	st0[2] = ctx->key_dec[2] ^ get_unaligned_le32(in + 8);
 	st0[3] = ctx->key_dec[3] ^ get_unaligned_le32(in + 12);
 
+	/*
+	 * Temporarily disable interrupts to avoid races where cachelines are
+	 * evicted when the CPU is interrupted to do something else.
+	 */
+	local_irq_save(flags);
+
 	st0[0] ^= __aesti_inv_sbox[ 0] ^ __aesti_inv_sbox[128];
 	st0[1] ^= __aesti_inv_sbox[32] ^ __aesti_inv_sbox[160];
 	st0[2] ^= __aesti_inv_sbox[64] ^ __aesti_inv_sbox[192];
@@ -339,6 +355,8 @@ static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	put_unaligned_le32(inv_subshift(st1, 1) ^ rkp[5], out + 4);
 	put_unaligned_le32(inv_subshift(st1, 2) ^ rkp[6], out + 8);
 	put_unaligned_le32(inv_subshift(st1, 3) ^ rkp[7], out + 12);
+
+	local_irq_restore(flags);
 }
 
 static struct crypto_alg aes_alg = {
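
The aes_ti.c hunks above disable interrupts around each block operation: the S-box lookups are data dependent, and an interrupt arriving mid-block can evict the table cachelines, making the key-dependent access pattern easier to observe through timing. A minimal sketch of the same pattern, assuming kernel context (<linux/irqflags.h>, <linux/types.h>); example_sbox and example_transform_block() are hypothetical stand-ins, not kernel APIs:

	/* Hedged sketch: process one block without being preempted by interrupts. */
	static const u8 example_sbox[256];		/* hypothetical lookup table */

	static void example_transform_block(const u8 *in, u8 *out)
	{
		unsigned long flags;
		int i;

		local_irq_save(flags);			/* keep table cachelines resident */
		for (i = 0; i < 16; i++)
			out[i] = example_sbox[in[i]];	/* data-dependent lookups */
		local_irq_restore(flags);
	}
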
diff --git a/crypto/ahash.c b/crypto/ahash.c
index e21667b4e10ac50ca9873adebeea72cc79412f7c..5d320a811f75023271b1cbf8525f086c0f1df687 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -364,20 +364,28 @@ static int crypto_ahash_op(struct ahash_request *req,
 
 int crypto_ahash_final(struct ahash_request *req)
 {
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct crypto_alg *alg = tfm->base.__crt_alg;
+	unsigned int nbytes = req->nbytes;
 	int ret;
 
+	crypto_stats_get(alg);
 	ret = crypto_ahash_op(req, crypto_ahash_reqtfm(req)->final);
-	crypto_stat_ahash_final(req, ret);
+	crypto_stats_ahash_final(nbytes, ret, alg);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(crypto_ahash_final);
 
 int crypto_ahash_finup(struct ahash_request *req)
 {
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct crypto_alg *alg = tfm->base.__crt_alg;
+	unsigned int nbytes = req->nbytes;
 	int ret;
 
+	crypto_stats_get(alg);
 	ret = crypto_ahash_op(req, crypto_ahash_reqtfm(req)->finup);
-	crypto_stat_ahash_final(req, ret);
+	crypto_stats_ahash_final(nbytes, ret, alg);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(crypto_ahash_finup);
@@ -385,13 +393,16 @@ EXPORT_SYMBOL_GPL(crypto_ahash_finup);
 int crypto_ahash_digest(struct ahash_request *req)
 {
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct crypto_alg *alg = tfm->base.__crt_alg;
+	unsigned int nbytes = req->nbytes;
 	int ret;
 
+	crypto_stats_get(alg);
 	if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
 		ret = -ENOKEY;
 	else
 		ret = crypto_ahash_op(req, tfm->digest);
-	crypto_stat_ahash_final(req, ret);
+	crypto_stats_ahash_final(nbytes, ret, alg);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(crypto_ahash_digest);
@@ -498,18 +509,14 @@ static int crypto_ahash_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_hash rhash;
 
-	strncpy(rhash.type, "ahash", sizeof(rhash.type));
+	memset(&rhash, 0, sizeof(rhash));
+
+	strscpy(rhash.type, "ahash", sizeof(rhash.type));
 
 	rhash.blocksize = alg->cra_blocksize;
 	rhash.digestsize = __crypto_hash_alg_common(alg)->digestsize;
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_HASH,
-		    sizeof(struct crypto_report_hash), &rhash))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(rhash), &rhash);
 }
 #else
 static int crypto_ahash_report(struct sk_buff *skb, struct crypto_alg *alg)
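
In the crypto_ahash_*() hooks above, req->nbytes is snapshotted and a statistics reference is taken before the operation runs, since an asynchronous request may complete (and become unsafe to dereference) before the result is accounted; crypto_stats_ahash_final() then records the outcome and drops the reference taken by crypto_stats_get(). The shape of the pattern, restated as a hedged sketch in the context of ahash.c:

	static int example_hash_final(struct ahash_request *req)
	{
		struct crypto_alg *alg = crypto_ahash_reqtfm(req)->base.__crt_alg;
		unsigned int nbytes = req->nbytes;	/* snapshot before the op */
		int ret;

		crypto_stats_get(alg);			/* pin alg for the stats update */
		ret = crypto_ahash_op(req, crypto_ahash_reqtfm(req)->final);
		crypto_stats_ahash_final(nbytes, ret, alg);
		return ret;
	}
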
diff --git a/crypto/akcipher.c b/crypto/akcipher.c
index cfbdb06d8ca864b30d67685208f14ecdba2ab764..0cbeae137e0ae8f9e45f67ef0c6e3475031ed747 100644
--- a/crypto/akcipher.c
+++ b/crypto/akcipher.c
@@ -30,15 +30,12 @@ static int crypto_akcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_akcipher rakcipher;
 
-	strncpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
+	memset(&rakcipher, 0, sizeof(rakcipher));
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_AKCIPHER,
-		    sizeof(struct crypto_report_akcipher), &rakcipher))
-		goto nla_put_failure;
-	return 0;
+	strscpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
 
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_AKCIPHER,
+		       sizeof(rakcipher), &rakcipher);
 }
 #else
 static int crypto_akcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
diff --git a/crypto/algapi.c b/crypto/algapi.c
index 2545c5f89c4ca873572b5be8bdc7d4fa5c9525dd..8b65ada33e5d33a5ede556201b85ec2c6448a53c 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -258,13 +258,7 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg)
 	list_add(&alg->cra_list, &crypto_alg_list);
 	list_add(&larval->alg.cra_list, &crypto_alg_list);
 
-	atomic_set(&alg->encrypt_cnt, 0);
-	atomic_set(&alg->decrypt_cnt, 0);
-	atomic64_set(&alg->encrypt_tlen, 0);
-	atomic64_set(&alg->decrypt_tlen, 0);
-	atomic_set(&alg->verify_cnt, 0);
-	atomic_set(&alg->cipher_err_cnt, 0);
-	atomic_set(&alg->sign_cnt, 0);
+	crypto_stats_init(alg);
 
 out:
 	return larval;
@@ -1076,6 +1070,245 @@ int crypto_type_has_alg(const char *name, const struct crypto_type *frontend,
 }
 EXPORT_SYMBOL_GPL(crypto_type_has_alg);
 
+#ifdef CONFIG_CRYPTO_STATS
+void crypto_stats_init(struct crypto_alg *alg)
+{
+	memset(&alg->stats, 0, sizeof(alg->stats));
+}
+EXPORT_SYMBOL_GPL(crypto_stats_init);
+
+void crypto_stats_get(struct crypto_alg *alg)
+{
+	crypto_alg_get(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_get);
+
+void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret,
+				     struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->stats.cipher.err_cnt);
+	} else {
+		atomic64_inc(&alg->stats.cipher.encrypt_cnt);
+		atomic64_add(nbytes, &alg->stats.cipher.encrypt_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_ablkcipher_encrypt);
+
+void crypto_stats_ablkcipher_decrypt(unsigned int nbytes, int ret,
+				     struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->stats.cipher.err_cnt);
+	} else {
+		atomic64_inc(&alg->stats.cipher.decrypt_cnt);
+		atomic64_add(nbytes, &alg->stats.cipher.decrypt_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_ablkcipher_decrypt);
+
+void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg,
+			       int ret)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->stats.aead.err_cnt);
+	} else {
+		atomic64_inc(&alg->stats.aead.encrypt_cnt);
+		atomic64_add(cryptlen, &alg->stats.aead.encrypt_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_aead_encrypt);
+
+void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg,
+			       int ret)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->stats.aead.err_cnt);
+	} else {
+		atomic64_inc(&alg->stats.aead.decrypt_cnt);
+		atomic64_add(cryptlen, &alg->stats.aead.decrypt_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_aead_decrypt);
+
+void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret,
+				   struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->stats.akcipher.err_cnt);
+	} else {
+		atomic64_inc(&alg->stats.akcipher.encrypt_cnt);
+		atomic64_add(src_len, &alg->stats.akcipher.encrypt_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_akcipher_encrypt);
+
+void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret,
+				   struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->stats.akcipher.err_cnt);
+	} else {
+		atomic64_inc(&alg->stats.akcipher.decrypt_cnt);
+		atomic64_add(src_len, &alg->stats.akcipher.decrypt_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_akcipher_decrypt);
+
+void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
+		atomic64_inc(&alg->stats.akcipher.err_cnt);
+	else
+		atomic64_inc(&alg->stats.akcipher.sign_cnt);
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_akcipher_sign);
+
+void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
+		atomic64_inc(&alg->stats.akcipher.err_cnt);
+	else
+		atomic64_inc(&alg->stats.akcipher.verify_cnt);
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_akcipher_verify);
+
+void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->stats.compress.err_cnt);
+	} else {
+		atomic64_inc(&alg->stats.compress.compress_cnt);
+		atomic64_add(slen, &alg->stats.compress.compress_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_compress);
+
+void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->stats.compress.err_cnt);
+	} else {
+		atomic64_inc(&alg->stats.compress.decompress_cnt);
+		atomic64_add(slen, &alg->stats.compress.decompress_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_decompress);
+
+void crypto_stats_ahash_update(unsigned int nbytes, int ret,
+			       struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
+		atomic64_inc(&alg->stats.hash.err_cnt);
+	else
+		atomic64_add(nbytes, &alg->stats.hash.hash_tlen);
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_ahash_update);
+
+void crypto_stats_ahash_final(unsigned int nbytes, int ret,
+			      struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->stats.hash.err_cnt);
+	} else {
+		atomic64_inc(&alg->stats.hash.hash_cnt);
+		atomic64_add(nbytes, &alg->stats.hash.hash_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_ahash_final);
+
+void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret)
+{
+	if (ret)
+		atomic64_inc(&alg->stats.kpp.err_cnt);
+	else
+		atomic64_inc(&alg->stats.kpp.setsecret_cnt);
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_kpp_set_secret);
+
+void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret)
+{
+	if (ret)
+		atomic64_inc(&alg->stats.kpp.err_cnt);
+	else
+		atomic64_inc(&alg->stats.kpp.generate_public_key_cnt);
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_kpp_generate_public_key);
+
+void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret)
+{
+	if (ret)
+		atomic64_inc(&alg->stats.kpp.err_cnt);
+	else
+		atomic64_inc(&alg->stats.kpp.compute_shared_secret_cnt);
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_kpp_compute_shared_secret);
+
+void crypto_stats_rng_seed(struct crypto_alg *alg, int ret)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
+		atomic64_inc(&alg->stats.rng.err_cnt);
+	else
+		atomic64_inc(&alg->stats.rng.seed_cnt);
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_rng_seed);
+
+void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen,
+			       int ret)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->stats.rng.err_cnt);
+	} else {
+		atomic64_inc(&alg->stats.rng.generate_cnt);
+		atomic64_add(dlen, &alg->stats.rng.generate_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_rng_generate);
+
+void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret,
+				   struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->stats.cipher.err_cnt);
+	} else {
+		atomic64_inc(&alg->stats.cipher.encrypt_cnt);
+		atomic64_add(cryptlen, &alg->stats.cipher.encrypt_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_skcipher_encrypt);
+
+void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret,
+				   struct crypto_alg *alg)
+{
+	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
+		atomic64_inc(&alg->stats.cipher.err_cnt);
+	} else {
+		atomic64_inc(&alg->stats.cipher.decrypt_cnt);
+		atomic64_add(cryptlen, &alg->stats.cipher.decrypt_tlen);
+	}
+	crypto_alg_put(alg);
+}
+EXPORT_SYMBOL_GPL(crypto_stats_skcipher_decrypt);
+#endif
+
 static int __init crypto_algapi_init(void)
 {
 	crypto_init_proc();
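
All of the crypto_stats_*() helpers added above follow one convention: a return value of -EINPROGRESS or -EBUSY means the request was queued for asynchronous processing, not that it failed, so only other non-zero results bump the err_cnt counters; each helper also drops the reference taken earlier by crypto_stats_get(). A hypothetical predicate (not part of the patch) expressing that shared test:

	static inline bool crypto_stats_ret_is_error(int ret)
	{
		return ret && ret != -EINPROGRESS && ret != -EBUSY;
	}
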
diff --git a/crypto/blkcipher.c b/crypto/blkcipher.c
index f93abf13b5d425174fb1a897c6255bd3110dcb44..c5398bd54942c1502d67cd65c0c1103efa5b483b 100644
--- a/crypto/blkcipher.c
+++ b/crypto/blkcipher.c
@@ -507,23 +507,18 @@ static int crypto_blkcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_blkcipher rblkcipher;
 
-	strncpy(rblkcipher.type, "blkcipher", sizeof(rblkcipher.type));
-	strncpy(rblkcipher.geniv, alg->cra_blkcipher.geniv ?: "<default>",
-		sizeof(rblkcipher.geniv));
-	rblkcipher.geniv[sizeof(rblkcipher.geniv) - 1] = '\0';
+	memset(&rblkcipher, 0, sizeof(rblkcipher));
+
+	strscpy(rblkcipher.type, "blkcipher", sizeof(rblkcipher.type));
+	strscpy(rblkcipher.geniv, "<default>", sizeof(rblkcipher.geniv));
 
 	rblkcipher.blocksize = alg->cra_blocksize;
 	rblkcipher.min_keysize = alg->cra_blkcipher.min_keysize;
 	rblkcipher.max_keysize = alg->cra_blkcipher.max_keysize;
 	rblkcipher.ivsize = alg->cra_blkcipher.ivsize;
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER,
-		    sizeof(struct crypto_report_blkcipher), &rblkcipher))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER,
+		       sizeof(rblkcipher), &rblkcipher);
 }
 #else
 static int crypto_blkcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
@@ -541,8 +536,7 @@ static void crypto_blkcipher_show(struct seq_file *m, struct crypto_alg *alg)
 	seq_printf(m, "min keysize  : %u\n", alg->cra_blkcipher.min_keysize);
 	seq_printf(m, "max keysize  : %u\n", alg->cra_blkcipher.max_keysize);
 	seq_printf(m, "ivsize       : %u\n", alg->cra_blkcipher.ivsize);
-	seq_printf(m, "geniv        : %s\n", alg->cra_blkcipher.geniv ?:
-					     "<default>");
+	seq_printf(m, "geniv        : <default>\n");
 }
 
 const struct crypto_type crypto_blkcipher_type = {
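
The report callbacks in this file and the ones that follow are reworked along the same lines: the structure handed to nla_put() is zeroed first so that no uninitialized stack bytes (padding or the unused tail of the string fields) reach userspace, strscpy() replaces strncpy() to guarantee NUL termination, and nla_put()'s return value is passed straight through instead of the goto-based -EMSGSIZE dance. A minimal sketch of the resulting shape, mirroring the hunk above:

	static int example_report(struct sk_buff *skb, struct crypto_alg *alg)
	{
		struct crypto_report_blkcipher r;

		memset(&r, 0, sizeof(r));	/* no uninitialized bytes to userspace */
		strscpy(r.type, "blkcipher", sizeof(r.type));
		r.blocksize = alg->cra_blocksize;
		return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER, sizeof(r), &r);
	}
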
diff --git a/crypto/cfb.c b/crypto/cfb.c
index 20987d0e09d89ceafbd51d3b61b321dc58a56862..e81e456734985ad1772ef15b32b62991e0412f98 100644
--- a/crypto/cfb.c
+++ b/crypto/cfb.c
@@ -144,7 +144,7 @@ static int crypto_cfb_decrypt_segment(struct skcipher_walk *walk,
 
 	do {
 		crypto_cfb_encrypt_one(tfm, iv, dst);
-		crypto_xor(dst, iv, bsize);
+		crypto_xor(dst, src, bsize);
 		iv = src;
 
 		src += bsize;
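
The cfb.c change is a decryption bug fix. CFB encryption computes C_i = E_K(C_{i-1}) xor P_i, so decryption must compute P_i = E_K(C_{i-1}) xor C_i. In this out-of-place segment routine, crypto_cfb_encrypt_one() already writes E_K(C_{i-1}) into dst, so the XOR has to use the ciphertext block at src; the old code XORed with the iv buffer and produced garbage. Annotated restatement of the fixed loop body (surrounding context abbreviated):

	crypto_cfb_encrypt_one(tfm, iv, dst);	/* dst = E_K(C_{i-1})            */
	crypto_xor(dst, src, bsize);		/* dst ^= C_i, recovering P_i     */
	iv = src;				/* C_i becomes the next C_{i-1}   */
	src += bsize;
	dst += bsize;
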
diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c
deleted file mode 100644
index 3ae96587caf9a4e18216c8a0f1308eb0812eb912..0000000000000000000000000000000000000000
--- a/crypto/chacha20_generic.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <asm/unaligned.h>
-#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/module.h>
-
-static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src,
-			     unsigned int bytes)
-{
-	/* aligned to potentially speed up crypto_xor() */
-	u8 stream[CHACHA20_BLOCK_SIZE] __aligned(sizeof(long));
-
-	if (dst != src)
-		memcpy(dst, src, bytes);
-
-	while (bytes >= CHACHA20_BLOCK_SIZE) {
-		chacha20_block(state, stream);
-		crypto_xor(dst, stream, CHACHA20_BLOCK_SIZE);
-		bytes -= CHACHA20_BLOCK_SIZE;
-		dst += CHACHA20_BLOCK_SIZE;
-	}
-	if (bytes) {
-		chacha20_block(state, stream);
-		crypto_xor(dst, stream, bytes);
-	}
-}
-
-void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv)
-{
-	state[0]  = 0x61707865; /* "expa" */
-	state[1]  = 0x3320646e; /* "nd 3" */
-	state[2]  = 0x79622d32; /* "2-by" */
-	state[3]  = 0x6b206574; /* "te k" */
-	state[4]  = ctx->key[0];
-	state[5]  = ctx->key[1];
-	state[6]  = ctx->key[2];
-	state[7]  = ctx->key[3];
-	state[8]  = ctx->key[4];
-	state[9]  = ctx->key[5];
-	state[10] = ctx->key[6];
-	state[11] = ctx->key[7];
-	state[12] = get_unaligned_le32(iv +  0);
-	state[13] = get_unaligned_le32(iv +  4);
-	state[14] = get_unaligned_le32(iv +  8);
-	state[15] = get_unaligned_le32(iv + 12);
-}
-EXPORT_SYMBOL_GPL(crypto_chacha20_init);
-
-int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
-			   unsigned int keysize)
-{
-	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
-	int i;
-
-	if (keysize != CHACHA20_KEY_SIZE)
-		return -EINVAL;
-
-	for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
-		ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32));
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(crypto_chacha20_setkey);
-
-int crypto_chacha20_crypt(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	u32 state[16];
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, true);
-
-	crypto_chacha20_init(state, ctx, walk.iv);
-
-	while (walk.nbytes > 0) {
-		unsigned int nbytes = walk.nbytes;
-
-		if (nbytes < walk.total)
-			nbytes = round_down(nbytes, walk.stride);
-
-		chacha20_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
-				 nbytes);
-		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
-	}
-
-	return err;
-}
-EXPORT_SYMBOL_GPL(crypto_chacha20_crypt);
-
-static struct skcipher_alg alg = {
-	.base.cra_name		= "chacha20",
-	.base.cra_driver_name	= "chacha20-generic",
-	.base.cra_priority	= 100,
-	.base.cra_blocksize	= 1,
-	.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
-	.base.cra_module	= THIS_MODULE,
-
-	.min_keysize		= CHACHA20_KEY_SIZE,
-	.max_keysize		= CHACHA20_KEY_SIZE,
-	.ivsize			= CHACHA20_IV_SIZE,
-	.chunksize		= CHACHA20_BLOCK_SIZE,
-	.setkey			= crypto_chacha20_setkey,
-	.encrypt		= crypto_chacha20_crypt,
-	.decrypt		= crypto_chacha20_crypt,
-};
-
-static int __init chacha20_generic_mod_init(void)
-{
-	return crypto_register_skcipher(&alg);
-}
-
-static void __exit chacha20_generic_mod_fini(void)
-{
-	crypto_unregister_skcipher(&alg);
-}
-
-module_init(chacha20_generic_mod_init);
-module_exit(chacha20_generic_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
-MODULE_DESCRIPTION("chacha20 cipher algorithm");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-generic");
diff --git a/crypto/chacha20poly1305.c b/crypto/chacha20poly1305.c
index 600afa99941fe07470b74f3c142b59716656df92..fef11446ab1b9b9b9cd0ce9f93c0785e71977e42 100644
--- a/crypto/chacha20poly1305.c
+++ b/crypto/chacha20poly1305.c
@@ -13,7 +13,7 @@
 #include <crypto/internal/hash.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
 #include <crypto/poly1305.h>
 #include <linux/err.h>
 #include <linux/init.h>
@@ -22,8 +22,6 @@
 
 #include "internal.h"
 
-#define CHACHAPOLY_IV_SIZE	12
-
 struct chachapoly_instance_ctx {
 	struct crypto_skcipher_spawn chacha;
 	struct crypto_ahash_spawn poly;
@@ -51,7 +49,7 @@ struct poly_req {
 };
 
 struct chacha_req {
-	u8 iv[CHACHA20_IV_SIZE];
+	u8 iv[CHACHA_IV_SIZE];
 	struct scatterlist src[1];
 	struct skcipher_request req; /* must be last member */
 };
@@ -91,7 +89,7 @@ static void chacha_iv(u8 *iv, struct aead_request *req, u32 icb)
 	memcpy(iv, &leicb, sizeof(leicb));
 	memcpy(iv + sizeof(leicb), ctx->salt, ctx->saltlen);
 	memcpy(iv + sizeof(leicb) + ctx->saltlen, req->iv,
-	       CHACHA20_IV_SIZE - sizeof(leicb) - ctx->saltlen);
+	       CHACHA_IV_SIZE - sizeof(leicb) - ctx->saltlen);
 }
 
 static int poly_verify_tag(struct aead_request *req)
@@ -494,7 +492,7 @@ static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key,
 	struct chachapoly_ctx *ctx = crypto_aead_ctx(aead);
 	int err;
 
-	if (keylen != ctx->saltlen + CHACHA20_KEY_SIZE)
+	if (keylen != ctx->saltlen + CHACHA_KEY_SIZE)
 		return -EINVAL;
 
 	keylen -= ctx->saltlen;
@@ -639,7 +637,7 @@ static int chachapoly_create(struct crypto_template *tmpl, struct rtattr **tb,
 
 	err = -EINVAL;
 	/* Need 16-byte IV size, including Initial Block Counter value */
-	if (crypto_skcipher_alg_ivsize(chacha) != CHACHA20_IV_SIZE)
+	if (crypto_skcipher_alg_ivsize(chacha) != CHACHA_IV_SIZE)
 		goto out_drop_chacha;
 	/* Not a stream cipher? */
 	if (chacha->base.cra_blocksize != 1)
diff --git a/crypto/chacha_generic.c b/crypto/chacha_generic.c
new file mode 100644
index 0000000000000000000000000000000000000000..35b583101f4f3dc5422968a4938194164bc6ad5e
--- /dev/null
+++ b/crypto/chacha_generic.c
@@ -0,0 +1,217 @@
+/*
+ * ChaCha and XChaCha stream ciphers, including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2018 Google LLC
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <asm/unaligned.h>
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/module.h>
+
+static void chacha_docrypt(u32 *state, u8 *dst, const u8 *src,
+			   unsigned int bytes, int nrounds)
+{
+	/* aligned to potentially speed up crypto_xor() */
+	u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));
+
+	if (dst != src)
+		memcpy(dst, src, bytes);
+
+	while (bytes >= CHACHA_BLOCK_SIZE) {
+		chacha_block(state, stream, nrounds);
+		crypto_xor(dst, stream, CHACHA_BLOCK_SIZE);
+		bytes -= CHACHA_BLOCK_SIZE;
+		dst += CHACHA_BLOCK_SIZE;
+	}
+	if (bytes) {
+		chacha_block(state, stream, nrounds);
+		crypto_xor(dst, stream, bytes);
+	}
+}
+
+static int chacha_stream_xor(struct skcipher_request *req,
+			     struct chacha_ctx *ctx, u8 *iv)
+{
+	struct skcipher_walk walk;
+	u32 state[16];
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, false);
+
+	crypto_chacha_init(state, ctx, iv);
+
+	while (walk.nbytes > 0) {
+		unsigned int nbytes = walk.nbytes;
+
+		if (nbytes < walk.total)
+			nbytes = round_down(nbytes, walk.stride);
+
+		chacha_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
+			       nbytes, ctx->nrounds);
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+	}
+
+	return err;
+}
+
+void crypto_chacha_init(u32 *state, struct chacha_ctx *ctx, u8 *iv)
+{
+	state[0]  = 0x61707865; /* "expa" */
+	state[1]  = 0x3320646e; /* "nd 3" */
+	state[2]  = 0x79622d32; /* "2-by" */
+	state[3]  = 0x6b206574; /* "te k" */
+	state[4]  = ctx->key[0];
+	state[5]  = ctx->key[1];
+	state[6]  = ctx->key[2];
+	state[7]  = ctx->key[3];
+	state[8]  = ctx->key[4];
+	state[9]  = ctx->key[5];
+	state[10] = ctx->key[6];
+	state[11] = ctx->key[7];
+	state[12] = get_unaligned_le32(iv +  0);
+	state[13] = get_unaligned_le32(iv +  4);
+	state[14] = get_unaligned_le32(iv +  8);
+	state[15] = get_unaligned_le32(iv + 12);
+}
+EXPORT_SYMBOL_GPL(crypto_chacha_init);
+
+static int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key,
+			 unsigned int keysize, int nrounds)
+{
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	int i;
+
+	if (keysize != CHACHA_KEY_SIZE)
+		return -EINVAL;
+
+	for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
+		ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32));
+
+	ctx->nrounds = nrounds;
+	return 0;
+}
+
+int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
+			   unsigned int keysize)
+{
+	return chacha_setkey(tfm, key, keysize, 20);
+}
+EXPORT_SYMBOL_GPL(crypto_chacha20_setkey);
+
+int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
+			   unsigned int keysize)
+{
+	return chacha_setkey(tfm, key, keysize, 12);
+}
+EXPORT_SYMBOL_GPL(crypto_chacha12_setkey);
+
+int crypto_chacha_crypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+	return chacha_stream_xor(req, ctx, req->iv);
+}
+EXPORT_SYMBOL_GPL(crypto_chacha_crypt);
+
+int crypto_xchacha_crypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct chacha_ctx subctx;
+	u32 state[16];
+	u8 real_iv[16];
+
+	/* Compute the subkey given the original key and first 128 nonce bits */
+	crypto_chacha_init(state, ctx, req->iv);
+	hchacha_block(state, subctx.key, ctx->nrounds);
+	subctx.nrounds = ctx->nrounds;
+
+	/* Build the real IV */
+	memcpy(&real_iv[0], req->iv + 24, 8); /* stream position */
+	memcpy(&real_iv[8], req->iv + 16, 8); /* remaining 64 nonce bits */
+
+	/* Generate the stream and XOR it with the data */
+	return chacha_stream_xor(req, &subctx, real_iv);
+}
+EXPORT_SYMBOL_GPL(crypto_xchacha_crypt);
+
+static struct skcipher_alg algs[] = {
+	{
+		.base.cra_name		= "chacha20",
+		.base.cra_driver_name	= "chacha20-generic",
+		.base.cra_priority	= 100,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= CHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= crypto_chacha_crypt,
+		.decrypt		= crypto_chacha_crypt,
+	}, {
+		.base.cra_name		= "xchacha20",
+		.base.cra_driver_name	= "xchacha20-generic",
+		.base.cra_priority	= 100,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= crypto_xchacha_crypt,
+		.decrypt		= crypto_xchacha_crypt,
+	}, {
+		.base.cra_name		= "xchacha12",
+		.base.cra_driver_name	= "xchacha12-generic",
+		.base.cra_priority	= 100,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha12_setkey,
+		.encrypt		= crypto_xchacha_crypt,
+		.decrypt		= crypto_xchacha_crypt,
+	}
+};
+
+static int __init chacha_generic_mod_init(void)
+{
+	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit chacha_generic_mod_fini(void)
+{
+	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+module_init(chacha_generic_mod_init);
+module_exit(chacha_generic_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (generic)");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-generic");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-generic");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-generic");
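
chacha_generic.c supersedes chacha20_generic.c and additionally registers xchacha20 and xchacha12. For the XChaCha variants, crypto_xchacha_crypt() runs hchacha_block() over the key and the first 128 bits of the 192-bit nonce to derive a per-nonce subkey; the remaining 64 nonce bits plus the 64-bit stream position form the IV of the inner ChaCha invocation. A hedged sketch of how a kernel user might pick up one of the new algorithms (error handling abbreviated, assumes <crypto/skcipher.h> and <crypto/chacha.h>):

	static int example_alloc_xchacha12(const u8 key[CHACHA_KEY_SIZE])
	{
		struct crypto_skcipher *tfm;
		int err;

		tfm = crypto_alloc_skcipher("xchacha12", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		err = crypto_skcipher_setkey(tfm, key, CHACHA_KEY_SIZE);
		/* requests then carry a 32-byte IV: 192-bit nonce, then 64-bit stream position */
		crypto_free_skcipher(tfm);
		return err;
	}
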
diff --git a/crypto/cryptd.c b/crypto/cryptd.c
index 7118fb5efbaadb3c81bafda5610bb87d995c4d76..5640e5db7bdbf7c90dd8705aeaa0293d5a778ee7 100644
--- a/crypto/cryptd.c
+++ b/crypto/cryptd.c
@@ -422,8 +422,6 @@ static int cryptd_create_blkcipher(struct crypto_template *tmpl,
 	inst->alg.cra_ablkcipher.min_keysize = alg->cra_blkcipher.min_keysize;
 	inst->alg.cra_ablkcipher.max_keysize = alg->cra_blkcipher.max_keysize;
 
-	inst->alg.cra_ablkcipher.geniv = alg->cra_blkcipher.geniv;
-
 	inst->alg.cra_ctxsize = sizeof(struct cryptd_blkcipher_ctx);
 
 	inst->alg.cra_init = cryptd_blkcipher_init_tfm;
@@ -1174,7 +1172,7 @@ struct cryptd_ablkcipher *cryptd_alloc_ablkcipher(const char *alg_name,
 		return ERR_PTR(-EINVAL);
 	type = crypto_skcipher_type(type);
 	mask &= ~CRYPTO_ALG_TYPE_MASK;
-	mask |= (CRYPTO_ALG_GENIV | CRYPTO_ALG_TYPE_BLKCIPHER_MASK);
+	mask |= CRYPTO_ALG_TYPE_BLKCIPHER_MASK;
 	tfm = crypto_alloc_base(cryptd_alg_name, type, mask);
 	if (IS_ERR(tfm))
 		return ERR_CAST(tfm);
diff --git a/crypto/crypto_user_base.c b/crypto/crypto_user_base.c
index 784748dbb19f0c58482ad18c761c7de121d41928..f25d3f32c9c26901bb5e46ed21b9f35afbd2568a 100644
--- a/crypto/crypto_user_base.c
+++ b/crypto/crypto_user_base.c
@@ -84,87 +84,38 @@ static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_cipher rcipher;
 
-	strncpy(rcipher.type, "cipher", sizeof(rcipher.type));
+	memset(&rcipher, 0, sizeof(rcipher));
+
+	strscpy(rcipher.type, "cipher", sizeof(rcipher.type));
 
 	rcipher.blocksize = alg->cra_blocksize;
 	rcipher.min_keysize = alg->cra_cipher.cia_min_keysize;
 	rcipher.max_keysize = alg->cra_cipher.cia_max_keysize;
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_CIPHER,
-		    sizeof(struct crypto_report_cipher), &rcipher))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_CIPHER,
+		       sizeof(rcipher), &rcipher);
 }
 
 static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_comp rcomp;
 
-	strncpy(rcomp.type, "compression", sizeof(rcomp.type));
-	if (nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS,
-		    sizeof(struct crypto_report_comp), &rcomp))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
-}
-
-static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_report_acomp racomp;
+	memset(&rcomp, 0, sizeof(rcomp));
 
-	strncpy(racomp.type, "acomp", sizeof(racomp.type));
+	strscpy(rcomp.type, "compression", sizeof(rcomp.type));
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_ACOMP,
-		    sizeof(struct crypto_report_acomp), &racomp))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
-}
-
-static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_report_akcipher rakcipher;
-
-	strncpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
-
-	if (nla_put(skb, CRYPTOCFGA_REPORT_AKCIPHER,
-		    sizeof(struct crypto_report_akcipher), &rakcipher))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
-}
-
-static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_report_kpp rkpp;
-
-	strncpy(rkpp.type, "kpp", sizeof(rkpp.type));
-
-	if (nla_put(skb, CRYPTOCFGA_REPORT_KPP,
-		    sizeof(struct crypto_report_kpp), &rkpp))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS, sizeof(rcomp), &rcomp);
 }
 
 static int crypto_report_one(struct crypto_alg *alg,
 			     struct crypto_user_alg *ualg, struct sk_buff *skb)
 {
-	strncpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name));
-	strncpy(ualg->cru_driver_name, alg->cra_driver_name,
+	memset(ualg, 0, sizeof(*ualg));
+
+	strscpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name));
+	strscpy(ualg->cru_driver_name, alg->cra_driver_name,
 		sizeof(ualg->cru_driver_name));
-	strncpy(ualg->cru_module_name, module_name(alg->cra_module),
+	strscpy(ualg->cru_module_name, module_name(alg->cra_module),
 		sizeof(ualg->cru_module_name));
 
 	ualg->cru_type = 0;
@@ -177,9 +128,9 @@ static int crypto_report_one(struct crypto_alg *alg,
 	if (alg->cra_flags & CRYPTO_ALG_LARVAL) {
 		struct crypto_report_larval rl;
 
-		strncpy(rl.type, "larval", sizeof(rl.type));
-		if (nla_put(skb, CRYPTOCFGA_REPORT_LARVAL,
-			    sizeof(struct crypto_report_larval), &rl))
+		memset(&rl, 0, sizeof(rl));
+		strscpy(rl.type, "larval", sizeof(rl.type));
+		if (nla_put(skb, CRYPTOCFGA_REPORT_LARVAL, sizeof(rl), &rl))
 			goto nla_put_failure;
 		goto out;
 	}
@@ -202,20 +153,6 @@ static int crypto_report_one(struct crypto_alg *alg,
 			goto nla_put_failure;
 
 		break;
-	case CRYPTO_ALG_TYPE_ACOMPRESS:
-		if (crypto_report_acomp(skb, alg))
-			goto nla_put_failure;
-
-		break;
-	case CRYPTO_ALG_TYPE_AKCIPHER:
-		if (crypto_report_akcipher(skb, alg))
-			goto nla_put_failure;
-
-		break;
-	case CRYPTO_ALG_TYPE_KPP:
-		if (crypto_report_kpp(skb, alg))
-			goto nla_put_failure;
-		break;
 	}
 
 out:
@@ -294,30 +231,33 @@ static int crypto_report(struct sk_buff *in_skb, struct nlmsghdr *in_nlh,
 
 static int crypto_dump_report(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct crypto_alg *alg;
+	const size_t start_pos = cb->args[0];
+	size_t pos = 0;
 	struct crypto_dump_info info;
-	int err;
-
-	if (cb->args[0])
-		goto out;
-
-	cb->args[0] = 1;
+	struct crypto_alg *alg;
+	int res;
 
 	info.in_skb = cb->skb;
 	info.out_skb = skb;
 	info.nlmsg_seq = cb->nlh->nlmsg_seq;
 	info.nlmsg_flags = NLM_F_MULTI;
 
+	down_read(&crypto_alg_sem);
 	list_for_each_entry(alg, &crypto_alg_list, cra_list) {
-		err = crypto_report_alg(alg, &info);
-		if (err)
-			goto out_err;
+		if (pos >= start_pos) {
+			res = crypto_report_alg(alg, &info);
+			if (res == -EMSGSIZE)
+				break;
+			if (res)
+				goto out;
+		}
+		pos++;
 	}
-
+	cb->args[0] = pos;
+	res = skb->len;
 out:
-	return skb->len;
-out_err:
-	return err;
+	up_read(&crypto_alg_sem);
+	return res;
 }
 
 static int crypto_dump_report_done(struct netlink_callback *cb)
@@ -483,9 +423,7 @@ static const struct crypto_link {
 						       .dump = crypto_dump_report,
 						       .done = crypto_dump_report_done},
 	[CRYPTO_MSG_DELRNG	- CRYPTO_MSG_BASE] = { .doit = crypto_del_rng },
-	[CRYPTO_MSG_GETSTAT	- CRYPTO_MSG_BASE] = { .doit = crypto_reportstat,
-						       .dump = crypto_dump_reportstat,
-						       .done = crypto_dump_reportstat_done},
+	[CRYPTO_MSG_GETSTAT	- CRYPTO_MSG_BASE] = { .doit = crypto_reportstat},
 };
 
 static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -505,7 +443,7 @@ static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if ((type == (CRYPTO_MSG_GETALG - CRYPTO_MSG_BASE) &&
 	    (nlh->nlmsg_flags & NLM_F_DUMP))) {
 		struct crypto_alg *alg;
-		u16 dump_alloc = 0;
+		unsigned long dump_alloc = 0;
 
 		if (link->dump == NULL)
 			return -EINVAL;
@@ -513,16 +451,16 @@ static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 		down_read(&crypto_alg_sem);
 		list_for_each_entry(alg, &crypto_alg_list, cra_list)
 			dump_alloc += CRYPTO_REPORT_MAXSIZE;
+		up_read(&crypto_alg_sem);
 
 		{
 			struct netlink_dump_control c = {
 				.dump = link->dump,
 				.done = link->done,
-				.min_dump_alloc = dump_alloc,
+				.min_dump_alloc = min(dump_alloc, 65535UL),
 			};
 			err = netlink_dump_start(crypto_nlsk, skb, nlh, &c);
 		}
-		up_read(&crypto_alg_sem);
 
 		return err;
 	}
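
crypto_dump_report() is reworked so a GETALG dump can span multiple netlink callbacks: cb->args[0] records how many algorithms have already been emitted, the list walk now runs under crypto_alg_sem, and an -EMSGSIZE from crypto_report_alg() just ends the current pass so the next callback resumes from the same entry. The min_dump_alloc computation is also moved out from under the semaphore and clamped to 65535, the largest value the netlink field can carry. A compressed sketch of the resume idiom, with item, item_list and emit_one() as hypothetical names and hard-error propagation omitted:

	static int example_dump(struct sk_buff *skb, struct netlink_callback *cb)
	{
		const size_t start = cb->args[0];
		size_t pos = 0;
		struct item *it;

		list_for_each_entry(it, &item_list, list) {
			if (pos >= start && emit_one(skb, it))
				break;		/* skb full: resume from this entry */
			pos++;
		}
		cb->args[0] = pos;
		return skb->len;
	}
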
diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
index 1dfaa0ccd555b5bd3246822365114fa69c7e5ae1..3e9a53233d80439498a251024266c5eefead8a9a 100644
--- a/crypto/crypto_user_stat.c
+++ b/crypto/crypto_user_stat.c
@@ -33,260 +33,149 @@ struct crypto_dump_info {
 
 static int crypto_report_aead(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat raead;
-	u64 v64;
-	u32 v32;
+	struct crypto_stat_aead raead;
 
 	memset(&raead, 0, sizeof(raead));
 
-	strncpy(raead.type, "aead", sizeof(raead.type));
-
-	v32 = atomic_read(&alg->encrypt_cnt);
-	raead.stat_encrypt_cnt = v32;
-	v64 = atomic64_read(&alg->encrypt_tlen);
-	raead.stat_encrypt_tlen = v64;
-	v32 = atomic_read(&alg->decrypt_cnt);
-	raead.stat_decrypt_cnt = v32;
-	v64 = atomic64_read(&alg->decrypt_tlen);
-	raead.stat_decrypt_tlen = v64;
-	v32 = atomic_read(&alg->aead_err_cnt);
-	raead.stat_aead_err_cnt = v32;
-
-	if (nla_put(skb, CRYPTOCFGA_STAT_AEAD,
-		    sizeof(struct crypto_stat), &raead))
-		goto nla_put_failure;
-	return 0;
+	strscpy(raead.type, "aead", sizeof(raead.type));
 
-nla_put_failure:
-	return -EMSGSIZE;
+	raead.stat_encrypt_cnt = atomic64_read(&alg->stats.aead.encrypt_cnt);
+	raead.stat_encrypt_tlen = atomic64_read(&alg->stats.aead.encrypt_tlen);
+	raead.stat_decrypt_cnt = atomic64_read(&alg->stats.aead.decrypt_cnt);
+	raead.stat_decrypt_tlen = atomic64_read(&alg->stats.aead.decrypt_tlen);
+	raead.stat_err_cnt = atomic64_read(&alg->stats.aead.err_cnt);
+
+	return nla_put(skb, CRYPTOCFGA_STAT_AEAD, sizeof(raead), &raead);
 }
 
 static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rcipher;
-	u64 v64;
-	u32 v32;
+	struct crypto_stat_cipher rcipher;
 
 	memset(&rcipher, 0, sizeof(rcipher));
 
-	strlcpy(rcipher.type, "cipher", sizeof(rcipher.type));
-
-	v32 = atomic_read(&alg->encrypt_cnt);
-	rcipher.stat_encrypt_cnt = v32;
-	v64 = atomic64_read(&alg->encrypt_tlen);
-	rcipher.stat_encrypt_tlen = v64;
-	v32 = atomic_read(&alg->decrypt_cnt);
-	rcipher.stat_decrypt_cnt = v32;
-	v64 = atomic64_read(&alg->decrypt_tlen);
-	rcipher.stat_decrypt_tlen = v64;
-	v32 = atomic_read(&alg->cipher_err_cnt);
-	rcipher.stat_cipher_err_cnt = v32;
-
-	if (nla_put(skb, CRYPTOCFGA_STAT_CIPHER,
-		    sizeof(struct crypto_stat), &rcipher))
-		goto nla_put_failure;
-	return 0;
+	strscpy(rcipher.type, "cipher", sizeof(rcipher.type));
 
-nla_put_failure:
-	return -EMSGSIZE;
+	rcipher.stat_encrypt_cnt = atomic64_read(&alg->stats.cipher.encrypt_cnt);
+	rcipher.stat_encrypt_tlen = atomic64_read(&alg->stats.cipher.encrypt_tlen);
+	rcipher.stat_decrypt_cnt =  atomic64_read(&alg->stats.cipher.decrypt_cnt);
+	rcipher.stat_decrypt_tlen = atomic64_read(&alg->stats.cipher.decrypt_tlen);
+	rcipher.stat_err_cnt =  atomic64_read(&alg->stats.cipher.err_cnt);
+
+	return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher);
 }
 
 static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rcomp;
-	u64 v64;
-	u32 v32;
+	struct crypto_stat_compress rcomp;
 
 	memset(&rcomp, 0, sizeof(rcomp));
 
-	strlcpy(rcomp.type, "compression", sizeof(rcomp.type));
-	v32 = atomic_read(&alg->compress_cnt);
-	rcomp.stat_compress_cnt = v32;
-	v64 = atomic64_read(&alg->compress_tlen);
-	rcomp.stat_compress_tlen = v64;
-	v32 = atomic_read(&alg->decompress_cnt);
-	rcomp.stat_decompress_cnt = v32;
-	v64 = atomic64_read(&alg->decompress_tlen);
-	rcomp.stat_decompress_tlen = v64;
-	v32 = atomic_read(&alg->cipher_err_cnt);
-	rcomp.stat_compress_err_cnt = v32;
-
-	if (nla_put(skb, CRYPTOCFGA_STAT_COMPRESS,
-		    sizeof(struct crypto_stat), &rcomp))
-		goto nla_put_failure;
-	return 0;
+	strscpy(rcomp.type, "compression", sizeof(rcomp.type));
+	rcomp.stat_compress_cnt = atomic64_read(&alg->stats.compress.compress_cnt);
+	rcomp.stat_compress_tlen = atomic64_read(&alg->stats.compress.compress_tlen);
+	rcomp.stat_decompress_cnt = atomic64_read(&alg->stats.compress.decompress_cnt);
+	rcomp.stat_decompress_tlen = atomic64_read(&alg->stats.compress.decompress_tlen);
+	rcomp.stat_err_cnt = atomic64_read(&alg->stats.compress.err_cnt);
 
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, sizeof(rcomp), &rcomp);
 }
 
 static int crypto_report_acomp(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat racomp;
-	u64 v64;
-	u32 v32;
+	struct crypto_stat_compress racomp;
 
 	memset(&racomp, 0, sizeof(racomp));
 
-	strlcpy(racomp.type, "acomp", sizeof(racomp.type));
-	v32 = atomic_read(&alg->compress_cnt);
-	racomp.stat_compress_cnt = v32;
-	v64 = atomic64_read(&alg->compress_tlen);
-	racomp.stat_compress_tlen = v64;
-	v32 = atomic_read(&alg->decompress_cnt);
-	racomp.stat_decompress_cnt = v32;
-	v64 = atomic64_read(&alg->decompress_tlen);
-	racomp.stat_decompress_tlen = v64;
-	v32 = atomic_read(&alg->cipher_err_cnt);
-	racomp.stat_compress_err_cnt = v32;
-
-	if (nla_put(skb, CRYPTOCFGA_STAT_ACOMP,
-		    sizeof(struct crypto_stat), &racomp))
-		goto nla_put_failure;
-	return 0;
+	strscpy(racomp.type, "acomp", sizeof(racomp.type));
+	racomp.stat_compress_cnt = atomic64_read(&alg->stats.compress.compress_cnt);
+	racomp.stat_compress_tlen = atomic64_read(&alg->stats.compress.compress_tlen);
+	racomp.stat_decompress_cnt =  atomic64_read(&alg->stats.compress.decompress_cnt);
+	racomp.stat_decompress_tlen = atomic64_read(&alg->stats.compress.decompress_tlen);
+	racomp.stat_err_cnt = atomic64_read(&alg->stats.compress.err_cnt);
 
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_STAT_ACOMP, sizeof(racomp), &racomp);
 }
 
 static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rakcipher;
-	u64 v64;
-	u32 v32;
+	struct crypto_stat_akcipher rakcipher;
 
 	memset(&rakcipher, 0, sizeof(rakcipher));
 
-	strncpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
-	v32 = atomic_read(&alg->encrypt_cnt);
-	rakcipher.stat_encrypt_cnt = v32;
-	v64 = atomic64_read(&alg->encrypt_tlen);
-	rakcipher.stat_encrypt_tlen = v64;
-	v32 = atomic_read(&alg->decrypt_cnt);
-	rakcipher.stat_decrypt_cnt = v32;
-	v64 = atomic64_read(&alg->decrypt_tlen);
-	rakcipher.stat_decrypt_tlen = v64;
-	v32 = atomic_read(&alg->sign_cnt);
-	rakcipher.stat_sign_cnt = v32;
-	v32 = atomic_read(&alg->verify_cnt);
-	rakcipher.stat_verify_cnt = v32;
-	v32 = atomic_read(&alg->akcipher_err_cnt);
-	rakcipher.stat_akcipher_err_cnt = v32;
-
-	if (nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER,
-		    sizeof(struct crypto_stat), &rakcipher))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	strscpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
+	rakcipher.stat_encrypt_cnt = atomic64_read(&alg->stats.akcipher.encrypt_cnt);
+	rakcipher.stat_encrypt_tlen = atomic64_read(&alg->stats.akcipher.encrypt_tlen);
+	rakcipher.stat_decrypt_cnt = atomic64_read(&alg->stats.akcipher.decrypt_cnt);
+	rakcipher.stat_decrypt_tlen = atomic64_read(&alg->stats.akcipher.decrypt_tlen);
+	rakcipher.stat_sign_cnt = atomic64_read(&alg->stats.akcipher.sign_cnt);
+	rakcipher.stat_verify_cnt = atomic64_read(&alg->stats.akcipher.verify_cnt);
+	rakcipher.stat_err_cnt = atomic64_read(&alg->stats.akcipher.err_cnt);
+
+	return nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER,
+		       sizeof(rakcipher), &rakcipher);
 }
 
 static int crypto_report_kpp(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rkpp;
-	u32 v;
+	struct crypto_stat_kpp rkpp;
 
 	memset(&rkpp, 0, sizeof(rkpp));
 
-	strlcpy(rkpp.type, "kpp", sizeof(rkpp.type));
-
-	v = atomic_read(&alg->setsecret_cnt);
-	rkpp.stat_setsecret_cnt = v;
-	v = atomic_read(&alg->generate_public_key_cnt);
-	rkpp.stat_generate_public_key_cnt = v;
-	v = atomic_read(&alg->compute_shared_secret_cnt);
-	rkpp.stat_compute_shared_secret_cnt = v;
-	v = atomic_read(&alg->kpp_err_cnt);
-	rkpp.stat_kpp_err_cnt = v;
+	strscpy(rkpp.type, "kpp", sizeof(rkpp.type));
 
-	if (nla_put(skb, CRYPTOCFGA_STAT_KPP,
-		    sizeof(struct crypto_stat), &rkpp))
-		goto nla_put_failure;
-	return 0;
+	rkpp.stat_setsecret_cnt = atomic64_read(&alg->stats.kpp.setsecret_cnt);
+	rkpp.stat_generate_public_key_cnt = atomic64_read(&alg->stats.kpp.generate_public_key_cnt);
+	rkpp.stat_compute_shared_secret_cnt = atomic64_read(&alg->stats.kpp.compute_shared_secret_cnt);
+	rkpp.stat_err_cnt = atomic64_read(&alg->stats.kpp.err_cnt);
 
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_STAT_KPP, sizeof(rkpp), &rkpp);
 }
 
 static int crypto_report_ahash(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rhash;
-	u64 v64;
-	u32 v32;
+	struct crypto_stat_hash rhash;
 
 	memset(&rhash, 0, sizeof(rhash));
 
-	strncpy(rhash.type, "ahash", sizeof(rhash.type));
+	strscpy(rhash.type, "ahash", sizeof(rhash.type));
 
-	v32 = atomic_read(&alg->hash_cnt);
-	rhash.stat_hash_cnt = v32;
-	v64 = atomic64_read(&alg->hash_tlen);
-	rhash.stat_hash_tlen = v64;
-	v32 = atomic_read(&alg->hash_err_cnt);
-	rhash.stat_hash_err_cnt = v32;
+	rhash.stat_hash_cnt = atomic64_read(&alg->stats.hash.hash_cnt);
+	rhash.stat_hash_tlen = atomic64_read(&alg->stats.hash.hash_tlen);
+	rhash.stat_err_cnt = atomic64_read(&alg->stats.hash.err_cnt);
 
-	if (nla_put(skb, CRYPTOCFGA_STAT_HASH,
-		    sizeof(struct crypto_stat), &rhash))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
 }
 
 static int crypto_report_shash(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rhash;
-	u64 v64;
-	u32 v32;
+	struct crypto_stat_hash rhash;
 
 	memset(&rhash, 0, sizeof(rhash));
 
-	strncpy(rhash.type, "shash", sizeof(rhash.type));
-
-	v32 = atomic_read(&alg->hash_cnt);
-	rhash.stat_hash_cnt = v32;
-	v64 = atomic64_read(&alg->hash_tlen);
-	rhash.stat_hash_tlen = v64;
-	v32 = atomic_read(&alg->hash_err_cnt);
-	rhash.stat_hash_err_cnt = v32;
+	strscpy(rhash.type, "shash", sizeof(rhash.type));
 
-	if (nla_put(skb, CRYPTOCFGA_STAT_HASH,
-		    sizeof(struct crypto_stat), &rhash))
-		goto nla_put_failure;
-	return 0;
+	rhash.stat_hash_cnt =  atomic64_read(&alg->stats.hash.hash_cnt);
+	rhash.stat_hash_tlen = atomic64_read(&alg->stats.hash.hash_tlen);
+	rhash.stat_err_cnt = atomic64_read(&alg->stats.hash.err_cnt);
 
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
 }
 
 static int crypto_report_rng(struct sk_buff *skb, struct crypto_alg *alg)
 {
-	struct crypto_stat rrng;
-	u64 v64;
-	u32 v32;
+	struct crypto_stat_rng rrng;
 
 	memset(&rrng, 0, sizeof(rrng));
 
-	strncpy(rrng.type, "rng", sizeof(rrng.type));
+	strscpy(rrng.type, "rng", sizeof(rrng.type));
 
-	v32 = atomic_read(&alg->generate_cnt);
-	rrng.stat_generate_cnt = v32;
-	v64 = atomic64_read(&alg->generate_tlen);
-	rrng.stat_generate_tlen = v64;
-	v32 = atomic_read(&alg->seed_cnt);
-	rrng.stat_seed_cnt = v32;
-	v32 = atomic_read(&alg->hash_err_cnt);
-	rrng.stat_rng_err_cnt = v32;
-
-	if (nla_put(skb, CRYPTOCFGA_STAT_RNG,
-		    sizeof(struct crypto_stat), &rrng))
-		goto nla_put_failure;
-	return 0;
+	rrng.stat_generate_cnt = atomic64_read(&alg->stats.rng.generate_cnt);
+	rrng.stat_generate_tlen = atomic64_read(&alg->stats.rng.generate_tlen);
+	rrng.stat_seed_cnt = atomic64_read(&alg->stats.rng.seed_cnt);
+	rrng.stat_err_cnt = atomic64_read(&alg->stats.rng.err_cnt);
 
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_STAT_RNG, sizeof(rrng), &rrng);
 }
 
 static int crypto_reportstat_one(struct crypto_alg *alg,
@@ -295,10 +184,10 @@ static int crypto_reportstat_one(struct crypto_alg *alg,
 {
 	memset(ualg, 0, sizeof(*ualg));
 
-	strlcpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name));
-	strlcpy(ualg->cru_driver_name, alg->cra_driver_name,
+	strscpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name));
+	strscpy(ualg->cru_driver_name, alg->cra_driver_name,
 		sizeof(ualg->cru_driver_name));
-	strlcpy(ualg->cru_module_name, module_name(alg->cra_module),
+	strscpy(ualg->cru_module_name, module_name(alg->cra_module),
 		sizeof(ualg->cru_module_name));
 
 	ualg->cru_type = 0;
@@ -309,12 +198,11 @@ static int crypto_reportstat_one(struct crypto_alg *alg,
 	if (nla_put_u32(skb, CRYPTOCFGA_PRIORITY_VAL, alg->cra_priority))
 		goto nla_put_failure;
 	if (alg->cra_flags & CRYPTO_ALG_LARVAL) {
-		struct crypto_stat rl;
+		struct crypto_stat_larval rl;
 
 		memset(&rl, 0, sizeof(rl));
-		strlcpy(rl.type, "larval", sizeof(rl.type));
-		if (nla_put(skb, CRYPTOCFGA_STAT_LARVAL,
-			    sizeof(struct crypto_stat), &rl))
+		strscpy(rl.type, "larval", sizeof(rl.type));
+		if (nla_put(skb, CRYPTOCFGA_STAT_LARVAL, sizeof(rl), &rl))
 			goto nla_put_failure;
 		goto out;
 	}
@@ -448,37 +336,4 @@ int crypto_reportstat(struct sk_buff *in_skb, struct nlmsghdr *in_nlh,
 	return nlmsg_unicast(crypto_nlsk, skb, NETLINK_CB(in_skb).portid);
 }
 
-int crypto_dump_reportstat(struct sk_buff *skb, struct netlink_callback *cb)
-{
-	struct crypto_alg *alg;
-	struct crypto_dump_info info;
-	int err;
-
-	if (cb->args[0])
-		goto out;
-
-	cb->args[0] = 1;
-
-	info.in_skb = cb->skb;
-	info.out_skb = skb;
-	info.nlmsg_seq = cb->nlh->nlmsg_seq;
-	info.nlmsg_flags = NLM_F_MULTI;
-
-	list_for_each_entry(alg, &crypto_alg_list, cra_list) {
-		err = crypto_reportstat_alg(alg, &info);
-		if (err)
-			goto out_err;
-	}
-
-out:
-	return skb->len;
-out_err:
-	return err;
-}
-
-int crypto_dump_reportstat_done(struct netlink_callback *cb)
-{
-	return 0;
-}
-
 MODULE_LICENSE("GPL");
diff --git a/crypto/ctr.c b/crypto/ctr.c
index 435b75bd619e41140c173a3a7308957f691b584e..30f3946efc6da31b544463bbfab7582dc1cef578 100644
--- a/crypto/ctr.c
+++ b/crypto/ctr.c
@@ -233,8 +233,6 @@ static struct crypto_instance *crypto_ctr_alloc(struct rtattr **tb)
 	inst->alg.cra_blkcipher.encrypt = crypto_ctr_crypt;
 	inst->alg.cra_blkcipher.decrypt = crypto_ctr_crypt;
 
-	inst->alg.cra_blkcipher.geniv = "chainiv";
-
 out:
 	crypto_mod_put(alg);
 	return inst;
diff --git a/crypto/ecc.c b/crypto/ecc.c
index 8facafd678026ac9bfaf0cfe5c9767a77e875a1a..ed1237115066b48b368b15688e6b2343db70d6ac 100644
--- a/crypto/ecc.c
+++ b/crypto/ecc.c
@@ -842,15 +842,23 @@ static void xycz_add_c(u64 *x1, u64 *y1, u64 *x2, u64 *y2, u64 *curve_prime,
 
 static void ecc_point_mult(struct ecc_point *result,
 			   const struct ecc_point *point, const u64 *scalar,
-			   u64 *initial_z, u64 *curve_prime,
+			   u64 *initial_z, const struct ecc_curve *curve,
 			   unsigned int ndigits)
 {
 	/* R0 and R1 */
 	u64 rx[2][ECC_MAX_DIGITS];
 	u64 ry[2][ECC_MAX_DIGITS];
 	u64 z[ECC_MAX_DIGITS];
+	u64 sk[2][ECC_MAX_DIGITS];
+	u64 *curve_prime = curve->p;
 	int i, nb;
-	int num_bits = vli_num_bits(scalar, ndigits);
+	int num_bits;
+	int carry;
+
+	carry = vli_add(sk[0], scalar, curve->n, ndigits);
+	vli_add(sk[1], sk[0], curve->n, ndigits);
+	scalar = sk[!carry];
+	num_bits = sizeof(u64) * ndigits * 8 + 1;
 
 	vli_set(rx[1], point->x, ndigits);
 	vli_set(ry[1], point->y, ndigits);
@@ -904,30 +912,43 @@ static inline void ecc_swap_digits(const u64 *in, u64 *out,
 		out[i] = __swab64(in[ndigits - 1 - i]);
 }
 
-int ecc_is_key_valid(unsigned int curve_id, unsigned int ndigits,
-		     const u64 *private_key, unsigned int private_key_len)
+static int __ecc_is_key_valid(const struct ecc_curve *curve,
+			      const u64 *private_key, unsigned int ndigits)
 {
-	int nbytes;
-	const struct ecc_curve *curve = ecc_get_curve(curve_id);
+	u64 one[ECC_MAX_DIGITS] = { 1, };
+	u64 res[ECC_MAX_DIGITS];
 
 	if (!private_key)
 		return -EINVAL;
 
-	nbytes = ndigits << ECC_DIGITS_TO_BYTES_SHIFT;
-
-	if (private_key_len != nbytes)
+	if (curve->g.ndigits != ndigits)
 		return -EINVAL;
 
-	if (vli_is_zero(private_key, ndigits))
+	/* Make sure the private key is in the range [2, n-3]. */
+	if (vli_cmp(one, private_key, ndigits) != -1)
 		return -EINVAL;
-
-	/* Make sure the private key is in the range [1, n-1]. */
-	if (vli_cmp(curve->n, private_key, ndigits) != 1)
+	vli_sub(res, curve->n, one, ndigits);
+	vli_sub(res, res, one, ndigits);
+	if (vli_cmp(res, private_key, ndigits) != 1)
 		return -EINVAL;
 
 	return 0;
 }
 
+int ecc_is_key_valid(unsigned int curve_id, unsigned int ndigits,
+		     const u64 *private_key, unsigned int private_key_len)
+{
+	int nbytes;
+	const struct ecc_curve *curve = ecc_get_curve(curve_id);
+
+	nbytes = ndigits << ECC_DIGITS_TO_BYTES_SHIFT;
+
+	if (private_key_len != nbytes)
+		return -EINVAL;
+
+	return __ecc_is_key_valid(curve, private_key, ndigits);
+}
+
 /*
  * ECC private keys are generated using the method of extra random bits,
  * equivalent to that described in FIPS 186-4, Appendix B.4.1.
@@ -971,11 +992,8 @@ int ecc_gen_privkey(unsigned int curve_id, unsigned int ndigits, u64 *privkey)
 	if (err)
 		return err;
 
-	if (vli_is_zero(priv, ndigits))
-		return -EINVAL;
-
-	/* Make sure the private key is in the range [1, n-1]. */
-	if (vli_cmp(curve->n, priv, ndigits) != 1)
+	/* Make sure the private key is in the valid range. */
+	if (__ecc_is_key_valid(curve, priv, ndigits))
 		return -EINVAL;
 
 	ecc_swap_digits(priv, privkey, ndigits);
@@ -1004,7 +1022,7 @@ int ecc_make_pub_key(unsigned int curve_id, unsigned int ndigits,
 		goto out;
 	}
 
-	ecc_point_mult(pk, &curve->g, priv, NULL, curve->p, ndigits);
+	ecc_point_mult(pk, &curve->g, priv, NULL, curve, ndigits);
 	if (ecc_point_is_zero(pk)) {
 		ret = -EAGAIN;
 		goto err_free_point;
@@ -1090,7 +1108,7 @@ int crypto_ecdh_shared_secret(unsigned int curve_id, unsigned int ndigits,
 		goto err_alloc_product;
 	}
 
-	ecc_point_mult(product, pk, priv, rand_z, curve->p, ndigits);
+	ecc_point_mult(product, pk, priv, rand_z, curve, ndigits);
 
 	ecc_swap_digits(product->x, secret, ndigits);
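
Two hardening changes land in ecc.c. Private keys must now lie in the range [2, n-3], enforced by the shared __ecc_is_key_valid() helper for both freshly generated and caller-supplied keys. Separately, ecc_point_mult() no longer derives the ladder length from vli_num_bits() of the secret scalar; it adds the curve order n (and, when that does not carry out of the fixed-width words, n again), so the multiplier always has exactly 64 * ndigits + 1 significant bits and the number of ladder iterations no longer depends on the key. Annotated restatement of those lines:

	carry = vli_add(sk[0], scalar, curve->n, ndigits);	/* sk[0] = scalar + n  */
	vli_add(sk[1], sk[0], curve->n, ndigits);		/* sk[1] = scalar + 2n */
	scalar = sk[!carry];			/* equivalent multiplier modulo n */
	num_bits = sizeof(u64) * ndigits * 8 + 1;	/* fixed ladder length */
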
 
diff --git a/crypto/hash_info.c b/crypto/hash_info.c
index 7b1e0b188ce6a9caa32d4c903219c0b6fb0f9ee4..1dd095e4b45195f4acf47036ea6725ac1859604b 100644
--- a/crypto/hash_info.c
+++ b/crypto/hash_info.c
@@ -32,6 +32,8 @@ const char *const hash_algo_name[HASH_ALGO__LAST] = {
 	[HASH_ALGO_TGR_160]	= "tgr160",
 	[HASH_ALGO_TGR_192]	= "tgr192",
 	[HASH_ALGO_SM3_256]	= "sm3-256",
+	[HASH_ALGO_STREEBOG_256] = "streebog256",
+	[HASH_ALGO_STREEBOG_512] = "streebog512",
 };
 EXPORT_SYMBOL_GPL(hash_algo_name);
 
@@ -54,5 +56,7 @@ const int hash_digest_size[HASH_ALGO__LAST] = {
 	[HASH_ALGO_TGR_160]	= TGR160_DIGEST_SIZE,
 	[HASH_ALGO_TGR_192]	= TGR192_DIGEST_SIZE,
 	[HASH_ALGO_SM3_256]	= SM3256_DIGEST_SIZE,
+	[HASH_ALGO_STREEBOG_256] = STREEBOG256_DIGEST_SIZE,
+	[HASH_ALGO_STREEBOG_512] = STREEBOG512_DIGEST_SIZE,
 };
 EXPORT_SYMBOL_GPL(hash_digest_size);
diff --git a/crypto/kpp.c b/crypto/kpp.c
index a90edc27af77ed36bbbc5f72e126976a62b661a9..bc2f1006a2f77210a836dfe001338446a69a5f4d 100644
--- a/crypto/kpp.c
+++ b/crypto/kpp.c
@@ -30,15 +30,11 @@ static int crypto_kpp_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_kpp rkpp;
 
-	strncpy(rkpp.type, "kpp", sizeof(rkpp.type));
+	memset(&rkpp, 0, sizeof(rkpp));
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_KPP,
-		    sizeof(struct crypto_report_kpp), &rkpp))
-		goto nla_put_failure;
-	return 0;
+	strscpy(rkpp.type, "kpp", sizeof(rkpp.type));
 
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_KPP, sizeof(rkpp), &rkpp);
 }
 #else
 static int crypto_kpp_report(struct sk_buff *skb, struct crypto_alg *alg)
diff --git a/crypto/lz4.c b/crypto/lz4.c
index 2ce2660d3519ef3fab4a70f74ea1c5dd30f12e0e..c160dfdbf2e0767691a4b2d157bffc3396e7b0fb 100644
--- a/crypto/lz4.c
+++ b/crypto/lz4.c
@@ -122,7 +122,6 @@ static struct crypto_alg alg_lz4 = {
 	.cra_flags		= CRYPTO_ALG_TYPE_COMPRESS,
 	.cra_ctxsize		= sizeof(struct lz4_ctx),
 	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(alg_lz4.cra_list),
 	.cra_init		= lz4_init,
 	.cra_exit		= lz4_exit,
 	.cra_u			= { .compress = {
diff --git a/crypto/lz4hc.c b/crypto/lz4hc.c
index 2be14f054dafda9b7f46cb21e5a5175ee688dacc..583b5e013d7a59f63b5fab2e4ccf7cabc7b77c3a 100644
--- a/crypto/lz4hc.c
+++ b/crypto/lz4hc.c
@@ -123,7 +123,6 @@ static struct crypto_alg alg_lz4hc = {
 	.cra_flags		= CRYPTO_ALG_TYPE_COMPRESS,
 	.cra_ctxsize		= sizeof(struct lz4hc_ctx),
 	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(alg_lz4hc.cra_list),
 	.cra_init		= lz4hc_init,
 	.cra_exit		= lz4hc_exit,
 	.cra_u			= { .compress = {
diff --git a/crypto/nhpoly1305.c b/crypto/nhpoly1305.c
new file mode 100644
index 0000000000000000000000000000000000000000..ec831a5594d8fe46266d00de594fbbf66ea058f6
--- /dev/null
+++ b/crypto/nhpoly1305.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ *
+ * Copyright 2018 Google LLC
+ */
+
+/*
+ * "NHPoly1305" is the main component of Adiantum hashing.
+ * Specifically, it is the calculation
+ *
+ *	H_L ← Poly1305_{K_L}(NH_{K_N}(pad_{128}(L)))
+ *
+ * from the procedure in section 6.4 of the Adiantum paper [1].  It is an
+ * ε-almost-∆-universal (ε-∆U) hash function for equal-length inputs over
+ * Z/(2^{128}Z), where the "∆" operation is addition.  It hashes 1024-byte
+ * chunks of the input with the NH hash function [2], reducing the input length
+ * by 32x.  The resulting NH digests are evaluated as a polynomial in
+ * GF(2^{130}-5), like in the Poly1305 MAC [3].  Note that the polynomial
+ * evaluation by itself would suffice to achieve the ε-∆U property; NH is used
+ * for performance since it's over twice as fast as Poly1305.
+ *
+ * This is *not* a cryptographic hash function; do not use it as such!
+ *
+ * [1] Adiantum: length-preserving encryption for entry-level processors
+ *     (https://eprint.iacr.org/2018/720.pdf)
+ * [2] UMAC: Fast and Secure Message Authentication
+ *     (https://fastcrypto.org/umac/umac_proc.pdf)
+ * [3] The Poly1305-AES message-authentication code
+ *     (https://cr.yp.to/mac/poly1305-20050329.pdf)
+ */
+
+#include <asm/unaligned.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+static void nh_generic(const u32 *key, const u8 *message, size_t message_len,
+		       __le64 hash[NH_NUM_PASSES])
+{
+	u64 sums[4] = { 0, 0, 0, 0 };
+
+	BUILD_BUG_ON(NH_PAIR_STRIDE != 2);
+	BUILD_BUG_ON(NH_NUM_PASSES != 4);
+
+	while (message_len) {
+		u32 m0 = get_unaligned_le32(message + 0);
+		u32 m1 = get_unaligned_le32(message + 4);
+		u32 m2 = get_unaligned_le32(message + 8);
+		u32 m3 = get_unaligned_le32(message + 12);
+
+		sums[0] += (u64)(u32)(m0 + key[ 0]) * (u32)(m2 + key[ 2]);
+		sums[1] += (u64)(u32)(m0 + key[ 4]) * (u32)(m2 + key[ 6]);
+		sums[2] += (u64)(u32)(m0 + key[ 8]) * (u32)(m2 + key[10]);
+		sums[3] += (u64)(u32)(m0 + key[12]) * (u32)(m2 + key[14]);
+		sums[0] += (u64)(u32)(m1 + key[ 1]) * (u32)(m3 + key[ 3]);
+		sums[1] += (u64)(u32)(m1 + key[ 5]) * (u32)(m3 + key[ 7]);
+		sums[2] += (u64)(u32)(m1 + key[ 9]) * (u32)(m3 + key[11]);
+		sums[3] += (u64)(u32)(m1 + key[13]) * (u32)(m3 + key[15]);
+		key += NH_MESSAGE_UNIT / sizeof(key[0]);
+		message += NH_MESSAGE_UNIT;
+		message_len -= NH_MESSAGE_UNIT;
+	}
+
+	hash[0] = cpu_to_le64(sums[0]);
+	hash[1] = cpu_to_le64(sums[1]);
+	hash[2] = cpu_to_le64(sums[2]);
+	hash[3] = cpu_to_le64(sums[3]);
+}
+
+/* Pass the next NH hash value through Poly1305 */
+static void process_nh_hash_value(struct nhpoly1305_state *state,
+				  const struct nhpoly1305_key *key)
+{
+	BUILD_BUG_ON(NH_HASH_BYTES % POLY1305_BLOCK_SIZE != 0);
+
+	poly1305_core_blocks(&state->poly_state, &key->poly_key, state->nh_hash,
+			     NH_HASH_BYTES / POLY1305_BLOCK_SIZE);
+}
+
+/*
+ * Feed the next portion of the source data, as a whole number of 16-byte
+ * "NH message units", through NH and Poly1305.  Each NH hash is taken over
+ * 1024 bytes, except possibly the final one which is taken over a multiple of
+ * 16 bytes up to 1024.  Also, in the case where data is passed in misaligned
+ * chunks, we combine partial hashes; the end result is the same either way.
+ */
+static void nhpoly1305_units(struct nhpoly1305_state *state,
+			     const struct nhpoly1305_key *key,
+			     const u8 *src, unsigned int srclen, nh_t nh_fn)
+{
+	do {
+		unsigned int bytes;
+
+		if (state->nh_remaining == 0) {
+			/* Starting a new NH message */
+			bytes = min_t(unsigned int, srclen, NH_MESSAGE_BYTES);
+			nh_fn(key->nh_key, src, bytes, state->nh_hash);
+			state->nh_remaining = NH_MESSAGE_BYTES - bytes;
+		} else {
+			/* Continuing a previous NH message */
+			__le64 tmp_hash[NH_NUM_PASSES];
+			unsigned int pos;
+			int i;
+
+			pos = NH_MESSAGE_BYTES - state->nh_remaining;
+			bytes = min(srclen, state->nh_remaining);
+			nh_fn(&key->nh_key[pos / 4], src, bytes, tmp_hash);
+			for (i = 0; i < NH_NUM_PASSES; i++)
+				le64_add_cpu(&state->nh_hash[i],
+					     le64_to_cpu(tmp_hash[i]));
+			state->nh_remaining -= bytes;
+		}
+		if (state->nh_remaining == 0)
+			process_nh_hash_value(state, key);
+		src += bytes;
+		srclen -= bytes;
+	} while (srclen);
+}
+
+int crypto_nhpoly1305_setkey(struct crypto_shash *tfm,
+			     const u8 *key, unsigned int keylen)
+{
+	struct nhpoly1305_key *ctx = crypto_shash_ctx(tfm);
+	int i;
+
+	if (keylen != NHPOLY1305_KEY_SIZE)
+		return -EINVAL;
+
+	poly1305_core_setkey(&ctx->poly_key, key);
+	key += POLY1305_BLOCK_SIZE;
+
+	for (i = 0; i < NH_KEY_WORDS; i++)
+		ctx->nh_key[i] = get_unaligned_le32(key + i * sizeof(u32));
+
+	return 0;
+}
+EXPORT_SYMBOL(crypto_nhpoly1305_setkey);
+
+int crypto_nhpoly1305_init(struct shash_desc *desc)
+{
+	struct nhpoly1305_state *state = shash_desc_ctx(desc);
+
+	poly1305_core_init(&state->poly_state);
+	state->buflen = 0;
+	state->nh_remaining = 0;
+	return 0;
+}
+EXPORT_SYMBOL(crypto_nhpoly1305_init);
+
+int crypto_nhpoly1305_update_helper(struct shash_desc *desc,
+				    const u8 *src, unsigned int srclen,
+				    nh_t nh_fn)
+{
+	struct nhpoly1305_state *state = shash_desc_ctx(desc);
+	const struct nhpoly1305_key *key = crypto_shash_ctx(desc->tfm);
+	unsigned int bytes;
+
+	if (state->buflen) {
+		bytes = min(srclen, (int)NH_MESSAGE_UNIT - state->buflen);
+		memcpy(&state->buffer[state->buflen], src, bytes);
+		state->buflen += bytes;
+		if (state->buflen < NH_MESSAGE_UNIT)
+			return 0;
+		nhpoly1305_units(state, key, state->buffer, NH_MESSAGE_UNIT,
+				 nh_fn);
+		state->buflen = 0;
+		src += bytes;
+		srclen -= bytes;
+	}
+
+	if (srclen >= NH_MESSAGE_UNIT) {
+		bytes = round_down(srclen, NH_MESSAGE_UNIT);
+		nhpoly1305_units(state, key, src, bytes, nh_fn);
+		src += bytes;
+		srclen -= bytes;
+	}
+
+	if (srclen) {
+		memcpy(state->buffer, src, srclen);
+		state->buflen = srclen;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(crypto_nhpoly1305_update_helper);
+
+int crypto_nhpoly1305_update(struct shash_desc *desc,
+			     const u8 *src, unsigned int srclen)
+{
+	return crypto_nhpoly1305_update_helper(desc, src, srclen, nh_generic);
+}
+EXPORT_SYMBOL(crypto_nhpoly1305_update);
+
+int crypto_nhpoly1305_final_helper(struct shash_desc *desc, u8 *dst, nh_t nh_fn)
+{
+	struct nhpoly1305_state *state = shash_desc_ctx(desc);
+	const struct nhpoly1305_key *key = crypto_shash_ctx(desc->tfm);
+
+	if (state->buflen) {
+		memset(&state->buffer[state->buflen], 0,
+		       NH_MESSAGE_UNIT - state->buflen);
+		nhpoly1305_units(state, key, state->buffer, NH_MESSAGE_UNIT,
+				 nh_fn);
+	}
+
+	if (state->nh_remaining)
+		process_nh_hash_value(state, key);
+
+	poly1305_core_emit(&state->poly_state, dst);
+	return 0;
+}
+EXPORT_SYMBOL(crypto_nhpoly1305_final_helper);
+
+int crypto_nhpoly1305_final(struct shash_desc *desc, u8 *dst)
+{
+	return crypto_nhpoly1305_final_helper(desc, dst, nh_generic);
+}
+EXPORT_SYMBOL(crypto_nhpoly1305_final);
+
+static struct shash_alg nhpoly1305_alg = {
+	.base.cra_name		= "nhpoly1305",
+	.base.cra_driver_name	= "nhpoly1305-generic",
+	.base.cra_priority	= 100,
+	.base.cra_ctxsize	= sizeof(struct nhpoly1305_key),
+	.base.cra_module	= THIS_MODULE,
+	.digestsize		= POLY1305_DIGEST_SIZE,
+	.init			= crypto_nhpoly1305_init,
+	.update			= crypto_nhpoly1305_update,
+	.final			= crypto_nhpoly1305_final,
+	.setkey			= crypto_nhpoly1305_setkey,
+	.descsize		= sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+	return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+	crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-generic");
diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c
index 8eb3c4c9ff678453b26e1aacc3767be896d422d7..d47cfc47b1b1b359216deed02dcd0f96c8bba98b 100644
--- a/crypto/pcrypt.c
+++ b/crypto/pcrypt.c
@@ -394,7 +394,7 @@ static int pcrypt_sysfs_add(struct padata_instance *pinst, const char *name)
 	int ret;
 
 	pinst->kobj.kset = pcrypt_kset;
-	ret = kobject_add(&pinst->kobj, NULL, name);
+	ret = kobject_add(&pinst->kobj, NULL, "%s", name);
 	if (!ret)
 		kobject_uevent(&pinst->kobj, KOBJ_ADD);
 
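
The pcrypt_sysfs_add() fix above passes the caller-supplied name as the argument of a "%s" conversion instead of using it directly as kobject_add()'s format string. A tiny standalone illustration of why that distinction matters for any printf-style interface (the strings here are invented for the example):

#include <stdio.h>

int main(void)
{
	const char *name = "100%success";  /* data that happens to contain '%' */

	printf("%s\n", name);  /* safe: name is only ever treated as data */

	/*
	 * printf(name); would be broken: the "%s" inside name would be
	 * interpreted as a conversion and consume a nonexistent argument.
	 */
	return 0;
}
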
diff --git a/crypto/poly1305_generic.c b/crypto/poly1305_generic.c
index 47d3a6b83931ed20f9d64598d84b34c6e929f4e4..2a06874204e87fd52f0f401cd2f408e7716442d7 100644
--- a/crypto/poly1305_generic.c
+++ b/crypto/poly1305_generic.c
@@ -38,7 +38,7 @@ int crypto_poly1305_init(struct shash_desc *desc)
 {
 	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 
-	memset(dctx->h, 0, sizeof(dctx->h));
+	poly1305_core_init(&dctx->h);
 	dctx->buflen = 0;
 	dctx->rset = false;
 	dctx->sset = false;
@@ -47,23 +47,16 @@ int crypto_poly1305_init(struct shash_desc *desc)
 }
 EXPORT_SYMBOL_GPL(crypto_poly1305_init);
 
-static void poly1305_setrkey(struct poly1305_desc_ctx *dctx, const u8 *key)
+void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key)
 {
 	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
-	dctx->r[0] = (get_unaligned_le32(key +  0) >> 0) & 0x3ffffff;
-	dctx->r[1] = (get_unaligned_le32(key +  3) >> 2) & 0x3ffff03;
-	dctx->r[2] = (get_unaligned_le32(key +  6) >> 4) & 0x3ffc0ff;
-	dctx->r[3] = (get_unaligned_le32(key +  9) >> 6) & 0x3f03fff;
-	dctx->r[4] = (get_unaligned_le32(key + 12) >> 8) & 0x00fffff;
-}
-
-static void poly1305_setskey(struct poly1305_desc_ctx *dctx, const u8 *key)
-{
-	dctx->s[0] = get_unaligned_le32(key +  0);
-	dctx->s[1] = get_unaligned_le32(key +  4);
-	dctx->s[2] = get_unaligned_le32(key +  8);
-	dctx->s[3] = get_unaligned_le32(key + 12);
+	key->r[0] = (get_unaligned_le32(raw_key +  0) >> 0) & 0x3ffffff;
+	key->r[1] = (get_unaligned_le32(raw_key +  3) >> 2) & 0x3ffff03;
+	key->r[2] = (get_unaligned_le32(raw_key +  6) >> 4) & 0x3ffc0ff;
+	key->r[3] = (get_unaligned_le32(raw_key +  9) >> 6) & 0x3f03fff;
+	key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
 }
+EXPORT_SYMBOL_GPL(poly1305_core_setkey);
 
 /*
  * Poly1305 requires a unique key for each tag, which implies that we can't set
@@ -75,13 +68,16 @@ unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
 {
 	if (!dctx->sset) {
 		if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
-			poly1305_setrkey(dctx, src);
+			poly1305_core_setkey(&dctx->r, src);
 			src += POLY1305_BLOCK_SIZE;
 			srclen -= POLY1305_BLOCK_SIZE;
 			dctx->rset = true;
 		}
 		if (srclen >= POLY1305_BLOCK_SIZE) {
-			poly1305_setskey(dctx, src);
+			dctx->s[0] = get_unaligned_le32(src +  0);
+			dctx->s[1] = get_unaligned_le32(src +  4);
+			dctx->s[2] = get_unaligned_le32(src +  8);
+			dctx->s[3] = get_unaligned_le32(src + 12);
 			src += POLY1305_BLOCK_SIZE;
 			srclen -= POLY1305_BLOCK_SIZE;
 			dctx->sset = true;
@@ -91,41 +87,37 @@ unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
 }
 EXPORT_SYMBOL_GPL(crypto_poly1305_setdesckey);
 
-static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
-				    const u8 *src, unsigned int srclen,
-				    u32 hibit)
+static void poly1305_blocks_internal(struct poly1305_state *state,
+				     const struct poly1305_key *key,
+				     const void *src, unsigned int nblocks,
+				     u32 hibit)
 {
 	u32 r0, r1, r2, r3, r4;
 	u32 s1, s2, s3, s4;
 	u32 h0, h1, h2, h3, h4;
 	u64 d0, d1, d2, d3, d4;
-	unsigned int datalen;
 
-	if (unlikely(!dctx->sset)) {
-		datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
-		src += srclen - datalen;
-		srclen = datalen;
-	}
+	if (!nblocks)
+		return;
 
-	r0 = dctx->r[0];
-	r1 = dctx->r[1];
-	r2 = dctx->r[2];
-	r3 = dctx->r[3];
-	r4 = dctx->r[4];
+	r0 = key->r[0];
+	r1 = key->r[1];
+	r2 = key->r[2];
+	r3 = key->r[3];
+	r4 = key->r[4];
 
 	s1 = r1 * 5;
 	s2 = r2 * 5;
 	s3 = r3 * 5;
 	s4 = r4 * 5;
 
-	h0 = dctx->h[0];
-	h1 = dctx->h[1];
-	h2 = dctx->h[2];
-	h3 = dctx->h[3];
-	h4 = dctx->h[4];
-
-	while (likely(srclen >= POLY1305_BLOCK_SIZE)) {
+	h0 = state->h[0];
+	h1 = state->h[1];
+	h2 = state->h[2];
+	h3 = state->h[3];
+	h4 = state->h[4];
 
+	do {
 		/* h += m[i] */
 		h0 += (get_unaligned_le32(src +  0) >> 0) & 0x3ffffff;
 		h1 += (get_unaligned_le32(src +  3) >> 2) & 0x3ffffff;
@@ -154,16 +146,36 @@ static unsigned int poly1305_blocks(struct poly1305_desc_ctx *dctx,
 		h1 += h0 >> 26;       h0 = h0 & 0x3ffffff;
 
 		src += POLY1305_BLOCK_SIZE;
-		srclen -= POLY1305_BLOCK_SIZE;
-	}
+	} while (--nblocks);
 
-	dctx->h[0] = h0;
-	dctx->h[1] = h1;
-	dctx->h[2] = h2;
-	dctx->h[3] = h3;
-	dctx->h[4] = h4;
+	state->h[0] = h0;
+	state->h[1] = h1;
+	state->h[2] = h2;
+	state->h[3] = h3;
+	state->h[4] = h4;
+}
 
-	return srclen;
+void poly1305_core_blocks(struct poly1305_state *state,
+			  const struct poly1305_key *key,
+			  const void *src, unsigned int nblocks)
+{
+	poly1305_blocks_internal(state, key, src, nblocks, 1 << 24);
+}
+EXPORT_SYMBOL_GPL(poly1305_core_blocks);
+
+static void poly1305_blocks(struct poly1305_desc_ctx *dctx,
+			    const u8 *src, unsigned int srclen, u32 hibit)
+{
+	unsigned int datalen;
+
+	if (unlikely(!dctx->sset)) {
+		datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
+		src += srclen - datalen;
+		srclen = datalen;
+	}
+
+	poly1305_blocks_internal(&dctx->h, &dctx->r,
+				 src, srclen / POLY1305_BLOCK_SIZE, hibit);
 }
 
 int crypto_poly1305_update(struct shash_desc *desc,
@@ -187,9 +199,9 @@ int crypto_poly1305_update(struct shash_desc *desc,
 	}
 
 	if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
-		bytes = poly1305_blocks(dctx, src, srclen, 1 << 24);
-		src += srclen - bytes;
-		srclen = bytes;
+		poly1305_blocks(dctx, src, srclen, 1 << 24);
+		src += srclen - (srclen % POLY1305_BLOCK_SIZE);
+		srclen %= POLY1305_BLOCK_SIZE;
 	}
 
 	if (unlikely(srclen)) {
@@ -201,30 +213,18 @@ int crypto_poly1305_update(struct shash_desc *desc,
 }
 EXPORT_SYMBOL_GPL(crypto_poly1305_update);
 
-int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+void poly1305_core_emit(const struct poly1305_state *state, void *dst)
 {
-	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 	u32 h0, h1, h2, h3, h4;
 	u32 g0, g1, g2, g3, g4;
 	u32 mask;
-	u64 f = 0;
-
-	if (unlikely(!dctx->sset))
-		return -ENOKEY;
-
-	if (unlikely(dctx->buflen)) {
-		dctx->buf[dctx->buflen++] = 1;
-		memset(dctx->buf + dctx->buflen, 0,
-		       POLY1305_BLOCK_SIZE - dctx->buflen);
-		poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 0);
-	}
 
 	/* fully carry h */
-	h0 = dctx->h[0];
-	h1 = dctx->h[1];
-	h2 = dctx->h[2];
-	h3 = dctx->h[3];
-	h4 = dctx->h[4];
+	h0 = state->h[0];
+	h1 = state->h[1];
+	h2 = state->h[2];
+	h3 = state->h[3];
+	h4 = state->h[4];
 
 	h2 += (h1 >> 26);     h1 = h1 & 0x3ffffff;
 	h3 += (h2 >> 26);     h2 = h2 & 0x3ffffff;
@@ -254,16 +254,40 @@ int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
 	h4 = (h4 & mask) | g4;
 
 	/* h = h % (2^128) */
-	h0 = (h0 >>  0) | (h1 << 26);
-	h1 = (h1 >>  6) | (h2 << 20);
-	h2 = (h2 >> 12) | (h3 << 14);
-	h3 = (h3 >> 18) | (h4 <<  8);
+	put_unaligned_le32((h0 >>  0) | (h1 << 26), dst +  0);
+	put_unaligned_le32((h1 >>  6) | (h2 << 20), dst +  4);
+	put_unaligned_le32((h2 >> 12) | (h3 << 14), dst +  8);
+	put_unaligned_le32((h3 >> 18) | (h4 <<  8), dst + 12);
+}
+EXPORT_SYMBOL_GPL(poly1305_core_emit);
+
+int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+{
+	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+	__le32 digest[4];
+	u64 f = 0;
+
+	if (unlikely(!dctx->sset))
+		return -ENOKEY;
+
+	if (unlikely(dctx->buflen)) {
+		dctx->buf[dctx->buflen++] = 1;
+		memset(dctx->buf + dctx->buflen, 0,
+		       POLY1305_BLOCK_SIZE - dctx->buflen);
+		poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+	}
+
+	poly1305_core_emit(&dctx->h, digest);
 
 	/* mac = (h + s) % (2^128) */
-	f = (f >> 32) + h0 + dctx->s[0]; put_unaligned_le32(f, dst +  0);
-	f = (f >> 32) + h1 + dctx->s[1]; put_unaligned_le32(f, dst +  4);
-	f = (f >> 32) + h2 + dctx->s[2]; put_unaligned_le32(f, dst +  8);
-	f = (f >> 32) + h3 + dctx->s[3]; put_unaligned_le32(f, dst + 12);
+	f = (f >> 32) + le32_to_cpu(digest[0]) + dctx->s[0];
+	put_unaligned_le32(f, dst + 0);
+	f = (f >> 32) + le32_to_cpu(digest[1]) + dctx->s[1];
+	put_unaligned_le32(f, dst + 4);
+	f = (f >> 32) + le32_to_cpu(digest[2]) + dctx->s[2];
+	put_unaligned_le32(f, dst + 8);
+	f = (f >> 32) + le32_to_cpu(digest[3]) + dctx->s[3];
+	put_unaligned_le32(f, dst + 12);
 
 	return 0;
 }
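
poly1305_core_setkey() above folds the standard Poly1305 clamp, r &= 0x0ffffffc0ffffffc0ffffffc0fffffff, into the masks it applies while splitting r into five 26-bit limbs. The following userspace check of that equivalence is a sketch only; it assumes a compiler with the unsigned __int128 extension, and the key bytes and helper names are arbitrary:

#include <assert.h>
#include <stdint.h>

static uint32_t le32(const uint8_t *p)
{
	return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
	       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

int main(void)
{
	/* arbitrary 16-byte "r" half of a Poly1305 key */
	uint8_t key[16] = { 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
			    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf };
	uint32_t r[5];
	unsigned __int128 full = 0, limbs = 0, clamp;
	int i;

	/* same limb extraction, with the clamp folded in, as the kernel */
	r[0] = (le32(key +  0) >> 0) & 0x3ffffff;
	r[1] = (le32(key +  3) >> 2) & 0x3ffff03;
	r[2] = (le32(key +  6) >> 4) & 0x3ffc0ff;
	r[3] = (le32(key +  9) >> 6) & 0x3f03fff;
	r[4] = (le32(key + 12) >> 8) & 0x00fffff;

	for (i = 15; i >= 0; i--)
		full = (full << 8) | key[i];
	for (i = 4; i >= 0; i--)
		limbs = (limbs << 26) | r[i];

	clamp = ((unsigned __int128)0x0ffffffc0ffffffcULL << 64) |
		0x0ffffffc0fffffffULL;
	assert(limbs == (full & clamp));
	return 0;
}
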
diff --git a/crypto/rng.c b/crypto/rng.c
index 547f16ecbfb03fceda06ca0e2d8cf0873dd4a63b..33c38a72bff59fea8ff1c762957408e2d1f29434 100644
--- a/crypto/rng.c
+++ b/crypto/rng.c
@@ -35,9 +35,11 @@ static int crypto_default_rng_refcnt;
 
 int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen)
 {
+	struct crypto_alg *alg = tfm->base.__crt_alg;
 	u8 *buf = NULL;
 	int err;
 
+	crypto_stats_get(alg);
 	if (!seed && slen) {
 		buf = kmalloc(slen, GFP_KERNEL);
 		if (!buf)
@@ -50,7 +52,7 @@ int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen)
 	}
 
 	err = crypto_rng_alg(tfm)->seed(tfm, seed, slen);
-	crypto_stat_rng_seed(tfm, err);
+	crypto_stats_rng_seed(alg, err);
 out:
 	kzfree(buf);
 	return err;
@@ -74,17 +76,13 @@ static int crypto_rng_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_rng rrng;
 
-	strncpy(rrng.type, "rng", sizeof(rrng.type));
+	memset(&rrng, 0, sizeof(rrng));
 
-	rrng.seedsize = seedsize(alg);
+	strscpy(rrng.type, "rng", sizeof(rrng.type));
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_RNG,
-		    sizeof(struct crypto_report_rng), &rrng))
-		goto nla_put_failure;
-	return 0;
+	rrng.seedsize = seedsize(alg);
 
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_RNG, sizeof(rrng), &rrng);
 }
 #else
 static int crypto_rng_report(struct sk_buff *skb, struct crypto_alg *alg)
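
The crypto_rng_report() rework above (and the matching changes to crypto_scomp_report(), crypto_shash_report() and crypto_skcipher_report() further down) follows one pattern: zero the whole fixed-size report struct, fill the type string with a bounded, NUL-terminating copy, then hand the struct to nla_put() in one piece, so neither the unused tail of the string field nor any struct padding carries stale stack bytes to userspace. A userspace-shaped sketch of the same idea, with invented names and strncpy() standing in for the kernel's strscpy():

#include <string.h>

/* Fixed-size report that gets copied out verbatim (illustrative shape) */
struct demo_report {
	char type[64];
	unsigned int seedsize;
};

static void fill_demo_report(struct demo_report *r, const char *name,
			     unsigned int seedsize)
{
	memset(r, 0, sizeof(*r));                    /* no stale stack bytes */
	strncpy(r->type, name, sizeof(r->type) - 1); /* bounded, always NUL  */
	r->seedsize = seedsize;
	/* *r can now be copied wholesale to an untrusted consumer */
}
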
diff --git a/crypto/salsa20_generic.c b/crypto/salsa20_generic.c
index 8c77bc78a09f2693a1b8acf4e8748d10ced93da7..00fce32ae17af9fa630db24e2ff2ee4f8b7bf547 100644
--- a/crypto/salsa20_generic.c
+++ b/crypto/salsa20_generic.c
@@ -159,7 +159,7 @@ static int salsa20_crypt(struct skcipher_request *req)
 	u32 state[16];
 	int err;
 
-	err = skcipher_walk_virt(&walk, req, true);
+	err = skcipher_walk_virt(&walk, req, false);
 
 	salsa20_init(state, ctx, walk.iv);
 
diff --git a/crypto/scompress.c b/crypto/scompress.c
index 968bbcf65c9411679d74f737a2550bddb5eb0e51..6f8305f8c300407cc351cf935ed862bda5b4a0d2 100644
--- a/crypto/scompress.c
+++ b/crypto/scompress.c
@@ -40,15 +40,12 @@ static int crypto_scomp_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_comp rscomp;
 
-	strncpy(rscomp.type, "scomp", sizeof(rscomp.type));
+	memset(&rscomp, 0, sizeof(rscomp));
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS,
-		    sizeof(struct crypto_report_comp), &rscomp))
-		goto nla_put_failure;
-	return 0;
+	strscpy(rscomp.type, "scomp", sizeof(rscomp.type));
 
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS,
+		       sizeof(rscomp), &rscomp);
 }
 #else
 static int crypto_scomp_report(struct sk_buff *skb, struct crypto_alg *alg)
diff --git a/crypto/shash.c b/crypto/shash.c
index d21f04d70dce46eb4d1a4ef7a328667b476082af..44d297b82a8fb75368ed7b3d9227ff9472340ac0 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -408,18 +408,14 @@ static int crypto_shash_report(struct sk_buff *skb, struct crypto_alg *alg)
 	struct crypto_report_hash rhash;
 	struct shash_alg *salg = __crypto_shash_alg(alg);
 
-	strncpy(rhash.type, "shash", sizeof(rhash.type));
+	memset(&rhash, 0, sizeof(rhash));
+
+	strscpy(rhash.type, "shash", sizeof(rhash.type));
 
 	rhash.blocksize = alg->cra_blocksize;
 	rhash.digestsize = salg->digestsize;
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_HASH,
-		    sizeof(struct crypto_report_hash), &rhash))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(rhash), &rhash);
 }
 #else
 static int crypto_shash_report(struct sk_buff *skb, struct crypto_alg *alg)
diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 4caab81d2d02b23ad354de3ddacfd65d9e477c4d..2a969296bc248a6dc6ccecc290532d725724a7db 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -474,6 +474,8 @@ int skcipher_walk_virt(struct skcipher_walk *walk,
 {
 	int err;
 
+	might_sleep_if(req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP);
+
 	walk->flags &= ~SKCIPHER_WALK_PHYS;
 
 	err = skcipher_walk_skcipher(walk, req);
@@ -577,8 +579,7 @@ static unsigned int crypto_skcipher_extsize(struct crypto_alg *alg)
 	if (alg->cra_type == &crypto_blkcipher_type)
 		return sizeof(struct crypto_blkcipher *);
 
-	if (alg->cra_type == &crypto_ablkcipher_type ||
-	    alg->cra_type == &crypto_givcipher_type)
+	if (alg->cra_type == &crypto_ablkcipher_type)
 		return sizeof(struct crypto_ablkcipher *);
 
 	return crypto_alg_extsize(alg);
@@ -842,8 +843,7 @@ static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm)
 	if (tfm->__crt_alg->cra_type == &crypto_blkcipher_type)
 		return crypto_init_skcipher_ops_blkcipher(tfm);
 
-	if (tfm->__crt_alg->cra_type == &crypto_ablkcipher_type ||
-	    tfm->__crt_alg->cra_type == &crypto_givcipher_type)
+	if (tfm->__crt_alg->cra_type == &crypto_ablkcipher_type)
 		return crypto_init_skcipher_ops_ablkcipher(tfm);
 
 	skcipher->setkey = skcipher_setkey;
@@ -897,21 +897,18 @@ static int crypto_skcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 	struct skcipher_alg *skcipher = container_of(alg, struct skcipher_alg,
 						     base);
 
-	strncpy(rblkcipher.type, "skcipher", sizeof(rblkcipher.type));
-	strncpy(rblkcipher.geniv, "<none>", sizeof(rblkcipher.geniv));
+	memset(&rblkcipher, 0, sizeof(rblkcipher));
+
+	strscpy(rblkcipher.type, "skcipher", sizeof(rblkcipher.type));
+	strscpy(rblkcipher.geniv, "<none>", sizeof(rblkcipher.geniv));
 
 	rblkcipher.blocksize = alg->cra_blocksize;
 	rblkcipher.min_keysize = skcipher->min_keysize;
 	rblkcipher.max_keysize = skcipher->max_keysize;
 	rblkcipher.ivsize = skcipher->ivsize;
 
-	if (nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER,
-		    sizeof(struct crypto_report_blkcipher), &rblkcipher))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
+	return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER,
+		       sizeof(rblkcipher), &rblkcipher);
 }
 #else
 static int crypto_skcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
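
Two of the changes above work together: skcipher_walk_virt() now begins with might_sleep_if(req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP), so debug kernels flag callers that submit a MAY_SLEEP request from atomic context, and salsa20_crypt() stops hard-coding the walk's atomic argument to true. As I read the walk API, the effective "may sleep" behaviour is the request's MAY_SLEEP flag combined with that atomic argument, so a synchronous algorithm with no reason to forbid sleeping should pass false and let the request flags decide. A hedged fragment of that convention (not a complete function):

	struct skcipher_walk walk;
	int err;

	/*
	 * atomic=false: defer to the request.  If the caller set
	 * CRYPTO_TFM_REQ_MAY_SLEEP, the walk may use sleeping allocations;
	 * the might_sleep_if() check added to skcipher_walk_virt() will
	 * catch MAY_SLEEP requests issued from atomic context.
	 */
	err = skcipher_walk_virt(&walk, req, false);
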
diff --git a/crypto/streebog_generic.c b/crypto/streebog_generic.c
new file mode 100644
index 0000000000000000000000000000000000000000..03272a22afcec302d897d68577d9964d20a058e1
--- /dev/null
+++ b/crypto/streebog_generic.c
@@ -0,0 +1,1140 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-2-Clause
+/*
+ * Streebog hash function as specified by GOST R 34.11-2012 and
+ * described at https://tools.ietf.org/html/rfc6986
+ *
+ * Copyright (c) 2013 Alexey Degtyarev <alexey@renatasystems.org>
+ * Copyright (c) 2018 Vitaly Chikunov <vt@altlinux.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <crypto/streebog.h>
+
+static const struct streebog_uint512 buffer0 = { {
+	0, 0, 0, 0, 0, 0, 0, 0
+} };
+
+static const struct streebog_uint512 buffer512 = { {
+	cpu_to_le64(0x200), 0, 0, 0, 0, 0, 0, 0
+} };
+
+static const struct streebog_uint512 C[12] = {
+	{ {
+		 cpu_to_le64(0xdd806559f2a64507ULL),
+		 cpu_to_le64(0x05767436cc744d23ULL),
+		 cpu_to_le64(0xa2422a08a460d315ULL),
+		 cpu_to_le64(0x4b7ce09192676901ULL),
+		 cpu_to_le64(0x714eb88d7585c4fcULL),
+		 cpu_to_le64(0x2f6a76432e45d016ULL),
+		 cpu_to_le64(0xebcb2f81c0657c1fULL),
+		 cpu_to_le64(0xb1085bda1ecadae9ULL)
+	 } },
+	{ {
+		 cpu_to_le64(0xe679047021b19bb7ULL),
+		 cpu_to_le64(0x55dda21bd7cbcd56ULL),
+		 cpu_to_le64(0x5cb561c2db0aa7caULL),
+		 cpu_to_le64(0x9ab5176b12d69958ULL),
+		 cpu_to_le64(0x61d55e0f16b50131ULL),
+		 cpu_to_le64(0xf3feea720a232b98ULL),
+		 cpu_to_le64(0x4fe39d460f70b5d7ULL),
+		 cpu_to_le64(0x6fa3b58aa99d2f1aULL)
+	 } },
+	{ {
+		 cpu_to_le64(0x991e96f50aba0ab2ULL),
+		 cpu_to_le64(0xc2b6f443867adb31ULL),
+		 cpu_to_le64(0xc1c93a376062db09ULL),
+		 cpu_to_le64(0xd3e20fe490359eb1ULL),
+		 cpu_to_le64(0xf2ea7514b1297b7bULL),
+		 cpu_to_le64(0x06f15e5f529c1f8bULL),
+		 cpu_to_le64(0x0a39fc286a3d8435ULL),
+		 cpu_to_le64(0xf574dcac2bce2fc7ULL)
+	 } },
+	{ {
+		 cpu_to_le64(0x220cbebc84e3d12eULL),
+		 cpu_to_le64(0x3453eaa193e837f1ULL),
+		 cpu_to_le64(0xd8b71333935203beULL),
+		 cpu_to_le64(0xa9d72c82ed03d675ULL),
+		 cpu_to_le64(0x9d721cad685e353fULL),
+		 cpu_to_le64(0x488e857e335c3c7dULL),
+		 cpu_to_le64(0xf948e1a05d71e4ddULL),
+		 cpu_to_le64(0xef1fdfb3e81566d2ULL)
+	 } },
+	{ {
+		 cpu_to_le64(0x601758fd7c6cfe57ULL),
+		 cpu_to_le64(0x7a56a27ea9ea63f5ULL),
+		 cpu_to_le64(0xdfff00b723271a16ULL),
+		 cpu_to_le64(0xbfcd1747253af5a3ULL),
+		 cpu_to_le64(0x359e35d7800fffbdULL),
+		 cpu_to_le64(0x7f151c1f1686104aULL),
+		 cpu_to_le64(0x9a3f410c6ca92363ULL),
+		 cpu_to_le64(0x4bea6bacad474799ULL)
+	 } },
+	{ {
+		 cpu_to_le64(0xfa68407a46647d6eULL),
+		 cpu_to_le64(0xbf71c57236904f35ULL),
+		 cpu_to_le64(0x0af21f66c2bec6b6ULL),
+		 cpu_to_le64(0xcffaa6b71c9ab7b4ULL),
+		 cpu_to_le64(0x187f9ab49af08ec6ULL),
+		 cpu_to_le64(0x2d66c4f95142a46cULL),
+		 cpu_to_le64(0x6fa4c33b7a3039c0ULL),
+		 cpu_to_le64(0xae4faeae1d3ad3d9ULL)
+	 } },
+	{ {
+		 cpu_to_le64(0x8886564d3a14d493ULL),
+		 cpu_to_le64(0x3517454ca23c4af3ULL),
+		 cpu_to_le64(0x06476983284a0504ULL),
+		 cpu_to_le64(0x0992abc52d822c37ULL),
+		 cpu_to_le64(0xd3473e33197a93c9ULL),
+		 cpu_to_le64(0x399ec6c7e6bf87c9ULL),
+		 cpu_to_le64(0x51ac86febf240954ULL),
+		 cpu_to_le64(0xf4c70e16eeaac5ecULL)
+	 } },
+	{ {
+		 cpu_to_le64(0xa47f0dd4bf02e71eULL),
+		 cpu_to_le64(0x36acc2355951a8d9ULL),
+		 cpu_to_le64(0x69d18d2bd1a5c42fULL),
+		 cpu_to_le64(0xf4892bcb929b0690ULL),
+		 cpu_to_le64(0x89b4443b4ddbc49aULL),
+		 cpu_to_le64(0x4eb7f8719c36de1eULL),
+		 cpu_to_le64(0x03e7aa020c6e4141ULL),
+		 cpu_to_le64(0x9b1f5b424d93c9a7ULL)
+	 } },
+	{ {
+		 cpu_to_le64(0x7261445183235adbULL),
+		 cpu_to_le64(0x0e38dc92cb1f2a60ULL),
+		 cpu_to_le64(0x7b2b8a9aa6079c54ULL),
+		 cpu_to_le64(0x800a440bdbb2ceb1ULL),
+		 cpu_to_le64(0x3cd955b7e00d0984ULL),
+		 cpu_to_le64(0x3a7d3a1b25894224ULL),
+		 cpu_to_le64(0x944c9ad8ec165fdeULL),
+		 cpu_to_le64(0x378f5a541631229bULL)
+	 } },
+	{ {
+		 cpu_to_le64(0x74b4c7fb98459cedULL),
+		 cpu_to_le64(0x3698fad1153bb6c3ULL),
+		 cpu_to_le64(0x7a1e6c303b7652f4ULL),
+		 cpu_to_le64(0x9fe76702af69334bULL),
+		 cpu_to_le64(0x1fffe18a1b336103ULL),
+		 cpu_to_le64(0x8941e71cff8a78dbULL),
+		 cpu_to_le64(0x382ae548b2e4f3f3ULL),
+		 cpu_to_le64(0xabbedea680056f52ULL)
+	 } },
+	{ {
+		 cpu_to_le64(0x6bcaa4cd81f32d1bULL),
+		 cpu_to_le64(0xdea2594ac06fd85dULL),
+		 cpu_to_le64(0xefbacd1d7d476e98ULL),
+		 cpu_to_le64(0x8a1d71efea48b9caULL),
+		 cpu_to_le64(0x2001802114846679ULL),
+		 cpu_to_le64(0xd8fa6bbbebab0761ULL),
+		 cpu_to_le64(0x3002c6cd635afe94ULL),
+		 cpu_to_le64(0x7bcd9ed0efc889fbULL)
+	 } },
+	{ {
+		 cpu_to_le64(0x48bc924af11bd720ULL),
+		 cpu_to_le64(0xfaf417d5d9b21b99ULL),
+		 cpu_to_le64(0xe71da4aa88e12852ULL),
+		 cpu_to_le64(0x5d80ef9d1891cc86ULL),
+		 cpu_to_le64(0xf82012d430219f9bULL),
+		 cpu_to_le64(0xcda43c32bcdf1d77ULL),
+		 cpu_to_le64(0xd21380b00449b17aULL),
+		 cpu_to_le64(0x378ee767f11631baULL)
+	 } }
+};
+
+static const u8 Tau[64] = {
+	0,   8,  16,  24,  32,  40,  48,  56,
+	1,   9,  17,  25,  33,  41,  49,  57,
+	2,  10,  18,  26,  34,  42,  50,  58,
+	3,  11,  19,  27,  35,  43,  51,  59,
+	4,  12,  20,  28,  36,  44,  52,  60,
+	5,  13,  21,  29,  37,  45,  53,  61,
+	6,  14,  22,  30,  38,  46,  54,  62,
+	7,  15,  23,  31,  39,  47,  55,  63
+};
+
+static const u8 Pi[256] = {
+	252, 238, 221,  17, 207, 110,  49,  22,
+	251, 196, 250, 218,  35, 197,   4,  77,
+	233, 119, 240, 219, 147,  46, 153, 186,
+	 23,  54, 241, 187,  20, 205,  95, 193,
+	249,  24, 101,  90, 226,  92, 239,  33,
+	129,  28,  60,  66, 139,   1, 142,  79,
+	  5, 132,   2, 174, 227, 106, 143, 160,
+	  6,  11, 237, 152, 127, 212, 211,  31,
+	235,  52,  44,  81, 234, 200,  72, 171,
+	242,  42, 104, 162, 253,  58, 206, 204,
+	181, 112,  14,  86,   8,  12, 118,  18,
+	191, 114,  19,  71, 156, 183,  93, 135,
+	 21, 161, 150,  41,  16, 123, 154, 199,
+	243, 145, 120, 111, 157, 158, 178, 177,
+	 50, 117,  25,  61, 255,  53, 138, 126,
+	109,  84, 198, 128, 195, 189,  13,  87,
+	223, 245,  36, 169,  62, 168,  67, 201,
+	215, 121, 214, 246, 124,  34, 185,   3,
+	224,  15, 236, 222, 122, 148, 176, 188,
+	220, 232,  40,  80,  78,  51,  10,  74,
+	167, 151,  96, 115,  30,   0,  98,  68,
+	 26, 184,  56, 130, 100, 159,  38,  65,
+	173,  69,  70, 146,  39,  94,  85,  47,
+	140, 163, 165, 125, 105, 213, 149,  59,
+	  7,  88, 179,  64, 134, 172,  29, 247,
+	 48,  55, 107, 228, 136, 217, 231, 137,
+	225,  27, 131,  73,  76,  63, 248, 254,
+	141,  83, 170, 144, 202, 216, 133,  97,
+	 32, 113, 103, 164,  45,  43,   9,  91,
+	203, 155,  37, 208, 190, 229, 108,  82,
+	 89, 166, 116, 210, 230, 244, 180, 192,
+	209, 102, 175, 194,  57,  75,  99, 182
+};
+
+static const unsigned long long Ax[8][256] = {
+	{
+	0xd01f715b5c7ef8e6ULL, 0x16fa240980778325ULL, 0xa8a42e857ee049c8ULL,
+	0x6ac1068fa186465bULL, 0x6e417bd7a2e9320bULL, 0x665c8167a437daabULL,
+	0x7666681aa89617f6ULL, 0x4b959163700bdcf5ULL, 0xf14be6b78df36248ULL,
+	0xc585bd689a625cffULL, 0x9557d7fca67d82cbULL, 0x89f0b969af6dd366ULL,
+	0xb0833d48749f6c35ULL, 0xa1998c23b1ecbc7cULL, 0x8d70c431ac02a736ULL,
+	0xd6dfbc2fd0a8b69eULL, 0x37aeb3e551fa198bULL, 0x0b7d128a40b5cf9cULL,
+	0x5a8f2008b5780cbcULL, 0xedec882284e333e5ULL, 0xd25fc177d3c7c2ceULL,
+	0x5e0f5d50b61778ecULL, 0x1d873683c0c24cb9ULL, 0xad040bcbb45d208cULL,
+	0x2f89a0285b853c76ULL, 0x5732fff6791b8d58ULL, 0x3e9311439ef6ec3fULL,
+	0xc9183a809fd3c00fULL, 0x83adf3f5260a01eeULL, 0xa6791941f4e8ef10ULL,
+	0x103ae97d0ca1cd5dULL, 0x2ce948121dee1b4aULL, 0x39738421dbf2bf53ULL,
+	0x093da2a6cf0cf5b4ULL, 0xcd9847d89cbcb45fULL, 0xf9561c078b2d8ae8ULL,
+	0x9c6a755a6971777fULL, 0xbc1ebaa0712ef0c5ULL, 0x72e61542abf963a6ULL,
+	0x78bb5fde229eb12eULL, 0x14ba94250fceb90dULL, 0x844d6697630e5282ULL,
+	0x98ea08026a1e032fULL, 0xf06bbea144217f5cULL, 0xdb6263d11ccb377aULL,
+	0x641c314b2b8ee083ULL, 0x320e96ab9b4770cfULL, 0x1ee7deb986a96b85ULL,
+	0xe96cf57a878c47b5ULL, 0xfdd6615f8842feb8ULL, 0xc83862965601dd1bULL,
+	0x2ea9f83e92572162ULL, 0xf876441142ff97fcULL, 0xeb2c455608357d9dULL,
+	0x5612a7e0b0c9904cULL, 0x6c01cbfb2d500823ULL, 0x4548a6a7fa037a2dULL,
+	0xabc4c6bf388b6ef4ULL, 0xbade77d4fdf8bebdULL, 0x799b07c8eb4cac3aULL,
+	0x0c9d87e805b19cf0ULL, 0xcb588aac106afa27ULL, 0xea0c1d40c1e76089ULL,
+	0x2869354a1e816f1aULL, 0xff96d17307fbc490ULL, 0x9f0a9d602f1a5043ULL,
+	0x96373fc6e016a5f7ULL, 0x5292dab8b3a6e41cULL, 0x9b8ae0382c752413ULL,
+	0x4f15ec3b7364a8a5ULL, 0x3fb349555724f12bULL, 0xc7c50d4415db66d7ULL,
+	0x92b7429ee379d1a7ULL, 0xd37f99611a15dfdaULL, 0x231427c05e34a086ULL,
+	0xa439a96d7b51d538ULL, 0xb403401077f01865ULL, 0xdda2aea5901d7902ULL,
+	0x0a5d4a9c8967d288ULL, 0xc265280adf660f93ULL, 0x8bb0094520d4e94eULL,
+	0x2a29856691385532ULL, 0x42a833c5bf072941ULL, 0x73c64d54622b7eb2ULL,
+	0x07e095624504536cULL, 0x8a905153e906f45aULL, 0x6f6123c16b3b2f1fULL,
+	0xc6e55552dc097bc3ULL, 0x4468feb133d16739ULL, 0xe211e7f0c7398829ULL,
+	0xa2f96419f7879b40ULL, 0x19074bdbc3ad38e9ULL, 0xf4ebc3f9474e0b0cULL,
+	0x43886bd376d53455ULL, 0xd8028beb5aa01046ULL, 0x51f23282f5cdc320ULL,
+	0xe7b1c2be0d84e16dULL, 0x081dfab006dee8a0ULL, 0x3b33340d544b857bULL,
+	0x7f5bcabc679ae242ULL, 0x0edd37c48a08a6d8ULL, 0x81ed43d9a9b33bc6ULL,
+	0xb1a3655ebd4d7121ULL, 0x69a1eeb5e7ed6167ULL, 0xf6ab73d5c8f73124ULL,
+	0x1a67a3e185c61fd5ULL, 0x2dc91004d43c065eULL, 0x0240b02c8fb93a28ULL,
+	0x90f7f2b26cc0eb8fULL, 0x3cd3a16f114fd617ULL, 0xaae49ea9f15973e0ULL,
+	0x06c0cd748cd64e78ULL, 0xda423bc7d5192a6eULL, 0xc345701c16b41287ULL,
+	0x6d2193ede4821537ULL, 0xfcf639494190e3acULL, 0x7c3b228621f1c57eULL,
+	0xfb16ac2b0494b0c0ULL, 0xbf7e529a3745d7f9ULL, 0x6881b6a32e3f7c73ULL,
+	0xca78d2bad9b8e733ULL, 0xbbfe2fc2342aa3a9ULL, 0x0dbddffecc6381e4ULL,
+	0x70a6a56e2440598eULL, 0xe4d12a844befc651ULL, 0x8c509c2765d0ba22ULL,
+	0xee8c6018c28814d9ULL, 0x17da7c1f49a59e31ULL, 0x609c4c1328e194d3ULL,
+	0xb3e3d57232f44b09ULL, 0x91d7aaa4a512f69bULL, 0x0ffd6fd243dabbccULL,
+	0x50d26a943c1fde34ULL, 0x6be15e9968545b4fULL, 0x94778fea6faf9fdfULL,
+	0x2b09dd7058ea4826ULL, 0x677cd9716de5c7bfULL, 0x49d5214fffb2e6ddULL,
+	0x0360e83a466b273cULL, 0x1fc786af4f7b7691ULL, 0xa0b9d435783ea168ULL,
+	0xd49f0c035f118cb6ULL, 0x01205816c9d21d14ULL, 0xac2453dd7d8f3d98ULL,
+	0x545217cc3f70aa64ULL, 0x26b4028e9489c9c2ULL, 0xdec2469fd6765e3eULL,
+	0x04807d58036f7450ULL, 0xe5f17292823ddb45ULL, 0xf30b569b024a5860ULL,
+	0x62dcfc3fa758aefbULL, 0xe84cad6c4e5e5aa1ULL, 0xccb81fce556ea94bULL,
+	0x53b282ae7a74f908ULL, 0x1b47fbf74c1402c1ULL, 0x368eebf39828049fULL,
+	0x7afbeff2ad278b06ULL, 0xbe5e0a8cfe97caedULL, 0xcfd8f7f413058e77ULL,
+	0xf78b2bc301252c30ULL, 0x4d555c17fcdd928dULL, 0x5f2f05467fc565f8ULL,
+	0x24f4b2a21b30f3eaULL, 0x860dd6bbecb768aaULL, 0x4c750401350f8f99ULL,
+	0x0000000000000000ULL, 0xecccd0344d312ef1ULL, 0xb5231806be220571ULL,
+	0xc105c030990d28afULL, 0x653c695de25cfd97ULL, 0x159acc33c61ca419ULL,
+	0xb89ec7f872418495ULL, 0xa9847693b73254dcULL, 0x58cf90243ac13694ULL,
+	0x59efc832f3132b80ULL, 0x5c4fed7c39ae42c4ULL, 0x828dabe3efd81cfaULL,
+	0xd13f294d95ace5f2ULL, 0x7d1b7a90e823d86aULL, 0xb643f03cf849224dULL,
+	0x3df3f979d89dcb03ULL, 0x7426d836272f2ddeULL, 0xdfe21e891fa4432aULL,
+	0x3a136c1b9d99986fULL, 0xfa36f43dcd46add4ULL, 0xc025982650df35bbULL,
+	0x856d3e81aadc4f96ULL, 0xc4a5e57e53b041ebULL, 0x4708168b75ba4005ULL,
+	0xaf44bbe73be41aa4ULL, 0x971767d029c4b8e3ULL, 0xb9be9feebb939981ULL,
+	0x215497ecd18d9aaeULL, 0x316e7e91dd2c57f3ULL, 0xcef8afe2dad79363ULL,
+	0x3853dc371220a247ULL, 0x35ee03c9de4323a3ULL, 0xe6919aa8c456fc79ULL,
+	0xe05157dc4880b201ULL, 0x7bdbb7e464f59612ULL, 0x127a59518318f775ULL,
+	0x332ecebd52956ddbULL, 0x8f30741d23bb9d1eULL, 0xd922d3fd93720d52ULL,
+	0x7746300c61440ae2ULL, 0x25d4eab4d2e2eefeULL, 0x75068020eefd30caULL,
+	0x135a01474acaea61ULL, 0x304e268714fe4ae7ULL, 0xa519f17bb283c82cULL,
+	0xdc82f6b359cf6416ULL, 0x5baf781e7caa11a8ULL, 0xb2c38d64fb26561dULL,
+	0x34ce5bdf17913eb7ULL, 0x5d6fb56af07c5fd0ULL, 0x182713cd0a7f25fdULL,
+	0x9e2ac576e6c84d57ULL, 0x9aaab82ee5a73907ULL, 0xa3d93c0f3e558654ULL,
+	0x7e7b92aaae48ff56ULL, 0x872d8ead256575beULL, 0x41c8dbfff96c0e7dULL,
+	0x99ca5014a3cc1e3bULL, 0x40e883e930be1369ULL, 0x1ca76e95091051adULL,
+	0x4e35b42dbab6b5b1ULL, 0x05a0254ecabd6944ULL, 0xe1710fca8152af15ULL,
+	0xf22b0e8dcb984574ULL, 0xb763a82a319b3f59ULL, 0x63fca4296e8ab3efULL,
+	0x9d4a2d4ca0a36a6bULL, 0xe331bfe60eeb953dULL, 0xd5bf541596c391a2ULL,
+	0xf5cb9bef8e9c1618ULL, 0x46284e9dbc685d11ULL, 0x2074cffa185f87baULL,
+	0xbd3ee2b6b8fcedd1ULL, 0xae64e3f1f23607b0ULL, 0xfeb68965ce29d984ULL,
+	0x55724fdaf6a2b770ULL, 0x29496d5cd753720eULL, 0xa75941573d3af204ULL,
+	0x8e102c0bea69800aULL, 0x111ab16bc573d049ULL, 0xd7ffe439197aab8aULL,
+	0xefac380e0b5a09cdULL, 0x48f579593660fbc9ULL, 0x22347fd697e6bd92ULL,
+	0x61bc1405e13389c7ULL, 0x4ab5c975b9d9c1e1ULL, 0x80cd1bcf606126d2ULL,
+	0x7186fd78ed92449aULL, 0x93971a882aabccb3ULL, 0x88d0e17f66bfce72ULL,
+	0x27945a985d5bd4d6ULL
+	}, {
+	0xde553f8c05a811c8ULL, 0x1906b59631b4f565ULL, 0x436e70d6b1964ff7ULL,
+	0x36d343cb8b1e9d85ULL, 0x843dfacc858aab5aULL, 0xfdfc95c299bfc7f9ULL,
+	0x0f634bdea1d51fa2ULL, 0x6d458b3b76efb3cdULL, 0x85c3f77cf8593f80ULL,
+	0x3c91315fbe737cb2ULL, 0x2148b03366ace398ULL, 0x18f8b8264c6761bfULL,
+	0xc830c1c495c9fb0fULL, 0x981a76102086a0aaULL, 0xaa16012142f35760ULL,
+	0x35cc54060c763cf6ULL, 0x42907d66cc45db2dULL, 0x8203d44b965af4bcULL,
+	0x3d6f3cefc3a0e868ULL, 0xbc73ff69d292bda7ULL, 0x8722ed0102e20a29ULL,
+	0x8f8185e8cd34deb7ULL, 0x9b0561dda7ee01d9ULL, 0x5335a0193227fad6ULL,
+	0xc9cecc74e81a6fd5ULL, 0x54f5832e5c2431eaULL, 0x99e47ba05d553470ULL,
+	0xf7bee756acd226ceULL, 0x384e05a5571816fdULL, 0xd1367452a47d0e6aULL,
+	0xf29fde1c386ad85bULL, 0x320c77316275f7caULL, 0xd0c879e2d9ae9ab0ULL,
+	0xdb7406c69110ef5dULL, 0x45505e51a2461011ULL, 0xfc029872e46c5323ULL,
+	0xfa3cb6f5f7bc0cc5ULL, 0x031f17cd8768a173ULL, 0xbd8df2d9af41297dULL,
+	0x9d3b4f5ab43e5e3fULL, 0x4071671b36feee84ULL, 0x716207e7d3e3b83dULL,
+	0x48d20ff2f9283a1aULL, 0x27769eb4757cbc7eULL, 0x5c56ebc793f2e574ULL,
+	0xa48b474f9ef5dc18ULL, 0x52cbada94ff46e0cULL, 0x60c7da982d8199c6ULL,
+	0x0e9d466edc068b78ULL, 0x4eec2175eaf865fcULL, 0x550b8e9e21f7a530ULL,
+	0x6b7ba5bc653fec2bULL, 0x5eb7f1ba6949d0ddULL, 0x57ea94e3db4c9099ULL,
+	0xf640eae6d101b214ULL, 0xdd4a284182c0b0bbULL, 0xff1d8fbf6304f250ULL,
+	0xb8accb933bf9d7e8ULL, 0xe8867c478eb68c4dULL, 0x3f8e2692391bddc1ULL,
+	0xcb2fd60912a15a7cULL, 0xaec935dbab983d2fULL, 0xf55ffd2b56691367ULL,
+	0x80e2ce366ce1c115ULL, 0x179bf3f8edb27e1dULL, 0x01fe0db07dd394daULL,
+	0xda8a0b76ecc37b87ULL, 0x44ae53e1df9584cbULL, 0xb310b4b77347a205ULL,
+	0xdfab323c787b8512ULL, 0x3b511268d070b78eULL, 0x65e6e3d2b9396753ULL,
+	0x6864b271e2574d58ULL, 0x259784c98fc789d7ULL, 0x02e11a7dfabb35a9ULL,
+	0x8841a6dfa337158bULL, 0x7ade78c39b5dcdd0ULL, 0xb7cf804d9a2cc84aULL,
+	0x20b6bd831b7f7742ULL, 0x75bd331d3a88d272ULL, 0x418f6aab4b2d7a5eULL,
+	0xd9951cbb6babdaf4ULL, 0xb6318dfde7ff5c90ULL, 0x1f389b112264aa83ULL,
+	0x492c024284fbaec0ULL, 0xe33a0363c608f9a0ULL, 0x2688930408af28a4ULL,
+	0xc7538a1a341ce4adULL, 0x5da8e677ee2171aeULL, 0x8c9e92254a5c7fc4ULL,
+	0x63d8cd55aae938b5ULL, 0x29ebd8daa97a3706ULL, 0x959827b37be88aa1ULL,
+	0x1484e4356adadf6eULL, 0xa7945082199d7d6bULL, 0xbf6ce8a455fa1cd4ULL,
+	0x9cc542eac9edcae5ULL, 0x79c16f0e1c356ca3ULL, 0x89bfab6fdee48151ULL,
+	0xd4174d1830c5f0ffULL, 0x9258048415eb419dULL, 0x6139d72850520d1cULL,
+	0x6a85a80c18ec78f1ULL, 0xcd11f88e0171059aULL, 0xcceff53e7ca29140ULL,
+	0xd229639f2315af19ULL, 0x90b91ef9ef507434ULL, 0x5977d28d074a1be1ULL,
+	0x311360fce51d56b9ULL, 0xc093a92d5a1f2f91ULL, 0x1a19a25bb6dc5416ULL,
+	0xeb996b8a09de2d3eULL, 0xfee3820f1ed7668aULL, 0xd7085ad5b7ad518cULL,
+	0x7fff41890fe53345ULL, 0xec5948bd67dde602ULL, 0x2fd5f65dbaaa68e0ULL,
+	0xa5754affe32648c2ULL, 0xf8ddac880d07396cULL, 0x6fa491468c548664ULL,
+	0x0c7c5c1326bdbed1ULL, 0x4a33158f03930fb3ULL, 0x699abfc19f84d982ULL,
+	0xe4fa2054a80b329cULL, 0x6707f9af438252faULL, 0x08a368e9cfd6d49eULL,
+	0x47b1442c58fd25b8ULL, 0xbbb3dc5ebc91769bULL, 0x1665fe489061eac7ULL,
+	0x33f27a811fa66310ULL, 0x93a609346838d547ULL, 0x30ed6d4c98cec263ULL,
+	0x1dd9816cd8df9f2aULL, 0x94662a03063b1e7bULL, 0x83fdd9fbeb896066ULL,
+	0x7b207573e68e590aULL, 0x5f49fc0a149a4407ULL, 0x343259b671a5a82cULL,
+	0xfbc2bb458a6f981fULL, 0xc272b350a0a41a38ULL, 0x3aaf1fd8ada32354ULL,
+	0x6cbb868b0b3c2717ULL, 0xa2b569c88d2583feULL, 0xf180c9d1bf027928ULL,
+	0xaf37386bd64ba9f5ULL, 0x12bacab2790a8088ULL, 0x4c0d3b0810435055ULL,
+	0xb2eeb9070e9436dfULL, 0xc5b29067cea7d104ULL, 0xdcb425f1ff132461ULL,
+	0x4f122cc5972bf126ULL, 0xac282fa651230886ULL, 0xe7e537992f6393efULL,
+	0xe61b3a2952b00735ULL, 0x709c0a57ae302ce7ULL, 0xe02514ae416058d3ULL,
+	0xc44c9dd7b37445deULL, 0x5a68c5408022ba92ULL, 0x1c278cdca50c0bf0ULL,
+	0x6e5a9cf6f18712beULL, 0x86dce0b17f319ef3ULL, 0x2d34ec2040115d49ULL,
+	0x4bcd183f7e409b69ULL, 0x2815d56ad4a9a3dcULL, 0x24698979f2141d0dULL,
+	0x0000000000000000ULL, 0x1ec696a15fb73e59ULL, 0xd86b110b16784e2eULL,
+	0x8e7f8858b0e74a6dULL, 0x063e2e8713d05fe6ULL, 0xe2c40ed3bbdb6d7aULL,
+	0xb1f1aeca89fc97acULL, 0xe1db191e3cb3cc09ULL, 0x6418ee62c4eaf389ULL,
+	0xc6ad87aa49cf7077ULL, 0xd6f65765ca7ec556ULL, 0x9afb6c6dda3d9503ULL,
+	0x7ce05644888d9236ULL, 0x8d609f95378feb1eULL, 0x23a9aa4e9c17d631ULL,
+	0x6226c0e5d73aac6fULL, 0x56149953a69f0443ULL, 0xeeb852c09d66d3abULL,
+	0x2b0ac2a753c102afULL, 0x07c023376e03cb3cULL, 0x2ccae1903dc2c993ULL,
+	0xd3d76e2f5ec63bc3ULL, 0x9e2458973356ff4cULL, 0xa66a5d32644ee9b1ULL,
+	0x0a427294356de137ULL, 0x783f62be61e6f879ULL, 0x1344c70204d91452ULL,
+	0x5b96c8f0fdf12e48ULL, 0xa90916ecc59bf613ULL, 0xbe92e5142829880eULL,
+	0x727d102a548b194eULL, 0x1be7afebcb0fc0ccULL, 0x3e702b2244c8491bULL,
+	0xd5e940a84d166425ULL, 0x66f9f41f3e51c620ULL, 0xabe80c913f20c3baULL,
+	0xf07ec461c2d1edf2ULL, 0xf361d3ac45b94c81ULL, 0x0521394a94b8fe95ULL,
+	0xadd622162cf09c5cULL, 0xe97871f7f3651897ULL, 0xf4a1f09b2bba87bdULL,
+	0x095d6559b2054044ULL, 0x0bbc7f2448be75edULL, 0x2af4cf172e129675ULL,
+	0x157ae98517094bb4ULL, 0x9fda55274e856b96ULL, 0x914713499283e0eeULL,
+	0xb952c623462a4332ULL, 0x74433ead475b46a8ULL, 0x8b5eb112245fb4f8ULL,
+	0xa34b6478f0f61724ULL, 0x11a5dd7ffe6221fbULL, 0xc16da49d27ccbb4bULL,
+	0x76a224d0bde07301ULL, 0x8aa0bca2598c2022ULL, 0x4df336b86d90c48fULL,
+	0xea67663a740db9e4ULL, 0xef465f70e0b54771ULL, 0x39b008152acb8227ULL,
+	0x7d1e5bf4f55e06ecULL, 0x105bd0cf83b1b521ULL, 0x775c2960c033e7dbULL,
+	0x7e014c397236a79fULL, 0x811cc386113255cfULL, 0xeda7450d1a0e72d8ULL,
+	0x5889df3d7a998f3bULL, 0x2e2bfbedc779fc3aULL, 0xce0eef438619a4e9ULL,
+	0x372d4e7bf6cd095fULL, 0x04df34fae96b6a4fULL, 0xf923a13870d4adb6ULL,
+	0xa1aa7e050a4d228dULL, 0xa8f71b5cb84862c9ULL, 0xb52e9a306097fde3ULL,
+	0x0d8251a35b6e2a0bULL, 0x2257a7fee1c442ebULL, 0x73831d9a29588d94ULL,
+	0x51d4ba64c89ccf7fULL, 0x502ab7d4b54f5ba5ULL, 0x97793dce8153bf08ULL,
+	0xe5042de4d5d8a646ULL, 0x9687307efc802bd2ULL, 0xa05473b5779eb657ULL,
+	0xb4d097801d446939ULL, 0xcff0e2f3fbca3033ULL, 0xc38cbee0dd778ee2ULL,
+	0x464f499c252eb162ULL, 0xcad1dbb96f72cea6ULL, 0xba4dd1eec142e241ULL,
+	0xb00fa37af42f0376ULL
+	}, {
+	0xcce4cd3aa968b245ULL, 0x089d5484e80b7fafULL, 0x638246c1b3548304ULL,
+	0xd2fe0ec8c2355492ULL, 0xa7fbdf7ff2374eeeULL, 0x4df1600c92337a16ULL,
+	0x84e503ea523b12fbULL, 0x0790bbfd53ab0c4aULL, 0x198a780f38f6ea9dULL,
+	0x2ab30c8f55ec48cbULL, 0xe0f7fed6b2c49db5ULL, 0xb6ecf3f422cadbdcULL,
+	0x409c9a541358df11ULL, 0xd3ce8a56dfde3fe3ULL, 0xc3e9224312c8c1a0ULL,
+	0x0d6dfa58816ba507ULL, 0xddf3e1b179952777ULL, 0x04c02a42748bb1d9ULL,
+	0x94c2abff9f2decb8ULL, 0x4f91752da8f8acf4ULL, 0x78682befb169bf7bULL,
+	0xe1c77a48af2ff6c4ULL, 0x0c5d7ec69c80ce76ULL, 0x4cc1e4928fd81167ULL,
+	0xfeed3d24d9997b62ULL, 0x518bb6dfc3a54a23ULL, 0x6dbf2d26151f9b90ULL,
+	0xb5bc624b05ea664fULL, 0xe86aaa525acfe21aULL, 0x4801ced0fb53a0beULL,
+	0xc91463e6c00868edULL, 0x1027a815cd16fe43ULL, 0xf67069a0319204cdULL,
+	0xb04ccc976c8abce7ULL, 0xc0b9b3fc35e87c33ULL, 0xf380c77c58f2de65ULL,
+	0x50bb3241de4e2152ULL, 0xdf93f490435ef195ULL, 0xf1e0d25d62390887ULL,
+	0xaf668bfb1a3c3141ULL, 0xbc11b251f00a7291ULL, 0x73a5eed47e427d47ULL,
+	0x25bee3f6ee4c3b2eULL, 0x43cc0beb34786282ULL, 0xc824e778dde3039cULL,
+	0xf97d86d98a327728ULL, 0xf2b043e24519b514ULL, 0xe297ebf7880f4b57ULL,
+	0x3a94a49a98fab688ULL, 0x868516cb68f0c419ULL, 0xeffa11af0964ee50ULL,
+	0xa4ab4ec0d517f37dULL, 0xa9c6b498547c567aULL, 0x8e18424f80fbbbb6ULL,
+	0x0bcdc53bcf2bc23cULL, 0x137739aaea3643d0ULL, 0x2c1333ec1bac2ff0ULL,
+	0x8d48d3f0a7db0625ULL, 0x1e1ac3f26b5de6d7ULL, 0xf520f81f16b2b95eULL,
+	0x9f0f6ec450062e84ULL, 0x0130849e1deb6b71ULL, 0xd45e31ab8c7533a9ULL,
+	0x652279a2fd14e43fULL, 0x3209f01e70f1c927ULL, 0xbe71a770cac1a473ULL,
+	0x0e3d6be7a64b1894ULL, 0x7ec8148cff29d840ULL, 0xcb7476c7fac3be0fULL,
+	0x72956a4a63a91636ULL, 0x37f95ec21991138fULL, 0x9e3fea5a4ded45f5ULL,
+	0x7b38ba50964902e8ULL, 0x222e580bbde73764ULL, 0x61e253e0899f55e6ULL,
+	0xfc8d2805e352ad80ULL, 0x35994be3235ac56dULL, 0x09add01af5e014deULL,
+	0x5e8659a6780539c6ULL, 0xb17c48097161d796ULL, 0x026015213acbd6e2ULL,
+	0xd1ae9f77e515e901ULL, 0xb7dc776a3f21b0adULL, 0xaba6a1b96eb78098ULL,
+	0x9bcf4486248d9f5dULL, 0x582666c536455efdULL, 0xfdbdac9bfeb9c6f1ULL,
+	0xc47999be4163cdeaULL, 0x765540081722a7efULL, 0x3e548ed8ec710751ULL,
+	0x3d041f67cb51bac2ULL, 0x7958af71ac82d40aULL, 0x36c9da5c047a78feULL,
+	0xed9a048e33af38b2ULL, 0x26ee7249c96c86bdULL, 0x900281bdeba65d61ULL,
+	0x11172c8bd0fd9532ULL, 0xea0abf73600434f8ULL, 0x42fc8f75299309f3ULL,
+	0x34a9cf7d3eb1ae1cULL, 0x2b838811480723baULL, 0x5ce64c8742ceef24ULL,
+	0x1adae9b01fd6570eULL, 0x3c349bf9d6bad1b3ULL, 0x82453c891c7b75c0ULL,
+	0x97923a40b80d512bULL, 0x4a61dbf1c198765cULL, 0xb48ce6d518010d3eULL,
+	0xcfb45c858e480fd6ULL, 0xd933cbf30d1e96aeULL, 0xd70ea014ab558e3aULL,
+	0xc189376228031742ULL, 0x9262949cd16d8b83ULL, 0xeb3a3bed7def5f89ULL,
+	0x49314a4ee6b8cbcfULL, 0xdcc3652f647e4c06ULL, 0xda635a4c2a3e2b3dULL,
+	0x470c21a940f3d35bULL, 0x315961a157d174b4ULL, 0x6672e81dda3459acULL,
+	0x5b76f77a1165e36eULL, 0x445cb01667d36ec8ULL, 0xc5491d205c88a69bULL,
+	0x456c34887a3805b9ULL, 0xffddb9bac4721013ULL, 0x99af51a71e4649bfULL,
+	0xa15be01cbc7729d5ULL, 0x52db2760e485f7b0ULL, 0x8c78576eba306d54ULL,
+	0xae560f6507d75a30ULL, 0x95f22f6182c687c9ULL, 0x71c5fbf54489aba5ULL,
+	0xca44f259e728d57eULL, 0x88b87d2ccebbdc8dULL, 0xbab18d32be4a15aaULL,
+	0x8be8ec93e99b611eULL, 0x17b713e89ebdf209ULL, 0xb31c5d284baa0174ULL,
+	0xeeca9531148f8521ULL, 0xb8d198138481c348ULL, 0x8988f9b2d350b7fcULL,
+	0xb9e11c8d996aa839ULL, 0x5a4673e40c8e881fULL, 0x1687977683569978ULL,
+	0xbf4123eed72acf02ULL, 0x4ea1f1b3b513c785ULL, 0xe767452be16f91ffULL,
+	0x7505d1b730021a7cULL, 0xa59bca5ec8fc980cULL, 0xad069eda20f7e7a3ULL,
+	0x38f4b1bba231606aULL, 0x60d2d77e94743e97ULL, 0x9affc0183966f42cULL,
+	0x248e6768f3a7505fULL, 0xcdd449a4b483d934ULL, 0x87b59255751baf68ULL,
+	0x1bea6d2e023d3c7fULL, 0x6b1f12455b5ffcabULL, 0x743555292de9710dULL,
+	0xd8034f6d10f5fddfULL, 0xc6198c9f7ba81b08ULL, 0xbb8109aca3a17edbULL,
+	0xfa2d1766ad12cabbULL, 0xc729080166437079ULL, 0x9c5fff7b77269317ULL,
+	0x0000000000000000ULL, 0x15d706c9a47624ebULL, 0x6fdf38072fd44d72ULL,
+	0x5fb6dd3865ee52b7ULL, 0xa33bf53d86bcff37ULL, 0xe657c1b5fc84fa8eULL,
+	0xaa962527735cebe9ULL, 0x39c43525bfda0b1bULL, 0x204e4d2a872ce186ULL,
+	0x7a083ece8ba26999ULL, 0x554b9c9db72efbfaULL, 0xb22cd9b656416a05ULL,
+	0x96a2bedea5e63a5aULL, 0x802529a826b0a322ULL, 0x8115ad363b5bc853ULL,
+	0x8375b81701901eb1ULL, 0x3069e53f4a3a1fc5ULL, 0xbd2136cfede119e0ULL,
+	0x18bafc91251d81ecULL, 0x1d4a524d4c7d5b44ULL, 0x05f0aedc6960daa8ULL,
+	0x29e39d3072ccf558ULL, 0x70f57f6b5962c0d4ULL, 0x989fd53903ad22ceULL,
+	0xf84d024797d91c59ULL, 0x547b1803aac5908bULL, 0xf0d056c37fd263f6ULL,
+	0xd56eb535919e58d8ULL, 0x1c7ad6d351963035ULL, 0x2e7326cd2167f912ULL,
+	0xac361a443d1c8cd2ULL, 0x697f076461942a49ULL, 0x4b515f6fdc731d2dULL,
+	0x8ad8680df4700a6fULL, 0x41ac1eca0eb3b460ULL, 0x7d988533d80965d3ULL,
+	0xa8f6300649973d0bULL, 0x7765c4960ac9cc9eULL, 0x7ca801adc5e20ea2ULL,
+	0xdea3700e5eb59ae4ULL, 0xa06b6482a19c42a4ULL, 0x6a2f96db46b497daULL,
+	0x27def6d7d487edccULL, 0x463ca5375d18b82aULL, 0xa6cb5be1efdc259fULL,
+	0x53eba3fef96e9cc1ULL, 0xce84d81b93a364a7ULL, 0xf4107c810b59d22fULL,
+	0x333974806d1aa256ULL, 0x0f0def79bba073e5ULL, 0x231edc95a00c5c15ULL,
+	0xe437d494c64f2c6cULL, 0x91320523f64d3610ULL, 0x67426c83c7df32ddULL,
+	0x6eefbc99323f2603ULL, 0x9d6f7be56acdf866ULL, 0x5916e25b2bae358cULL,
+	0x7ff89012e2c2b331ULL, 0x035091bf2720bd93ULL, 0x561b0d22900e4669ULL,
+	0x28d319ae6f279e29ULL, 0x2f43a2533c8c9263ULL, 0xd09e1be9f8fe8270ULL,
+	0xf740ed3e2c796fbcULL, 0xdb53ded237d5404cULL, 0x62b2c25faebfe875ULL,
+	0x0afd41a5d2c0a94dULL, 0x6412fd3ce0ff8f4eULL, 0xe3a76f6995e42026ULL,
+	0x6c8fa9b808f4f0e1ULL, 0xc2d9a6dd0f23aad1ULL, 0x8f28c6d19d10d0c7ULL,
+	0x85d587744fd0798aULL, 0xa20b71a39b579446ULL, 0x684f83fa7c7f4138ULL,
+	0xe507500adba4471dULL, 0x3f640a46f19a6c20ULL, 0x1247bd34f7dd28a1ULL,
+	0x2d23b77206474481ULL, 0x93521002cc86e0f2ULL, 0x572b89bc8de52d18ULL,
+	0xfb1d93f8b0f9a1caULL, 0xe95a2ecc4724896bULL, 0x3ba420048511ddf9ULL,
+	0xd63e248ab6bee54bULL, 0x5dd6c8195f258455ULL, 0x06a03f634e40673bULL,
+	0x1f2a476c76b68da6ULL, 0x217ec9b49ac78af7ULL, 0xecaa80102e4453c3ULL,
+	0x14e78257b99d4f9aULL
+	}, {
+	0x20329b2cc87bba05ULL, 0x4f5eb6f86546a531ULL, 0xd4f44775f751b6b1ULL,
+	0x8266a47b850dfa8bULL, 0xbb986aa15a6ca985ULL, 0xc979eb08f9ae0f99ULL,
+	0x2da6f447a2375ea1ULL, 0x1e74275dcd7d8576ULL, 0xbc20180a800bc5f8ULL,
+	0xb4a2f701b2dc65beULL, 0xe726946f981b6d66ULL, 0x48e6c453bf21c94cULL,
+	0x42cad9930f0a4195ULL, 0xefa47b64aacccd20ULL, 0x71180a8960409a42ULL,
+	0x8bb3329bf6a44e0cULL, 0xd34c35de2d36daccULL, 0xa92f5b7cbc23dc96ULL,
+	0xb31a85aa68bb09c3ULL, 0x13e04836a73161d2ULL, 0xb24dfc4129c51d02ULL,
+	0x8ae44b70b7da5acdULL, 0xe671ed84d96579a7ULL, 0xa4bb3417d66f3832ULL,
+	0x4572ab38d56d2de8ULL, 0xb1b47761ea47215cULL, 0xe81c09cf70aba15dULL,
+	0xffbdb872ce7f90acULL, 0xa8782297fd5dc857ULL, 0x0d946f6b6a4ce4a4ULL,
+	0xe4df1f4f5b995138ULL, 0x9ebc71edca8c5762ULL, 0x0a2c1dc0b02b88d9ULL,
+	0x3b503c115d9d7b91ULL, 0xc64376a8111ec3a2ULL, 0xcec199a323c963e4ULL,
+	0xdc76a87ec58616f7ULL, 0x09d596e073a9b487ULL, 0x14583a9d7d560dafULL,
+	0xf4c6dc593f2a0cb4ULL, 0xdd21d19584f80236ULL, 0x4a4836983ddde1d3ULL,
+	0xe58866a41ae745f9ULL, 0xf591a5b27e541875ULL, 0x891dc05074586693ULL,
+	0x5b068c651810a89eULL, 0xa30346bc0c08544fULL, 0x3dbf3751c684032dULL,
+	0x2a1e86ec785032dcULL, 0xf73f5779fca830eaULL, 0xb60c05ca30204d21ULL,
+	0x0cc316802b32f065ULL, 0x8770241bdd96be69ULL, 0xb861e18199ee95dbULL,
+	0xf805cad91418fcd1ULL, 0x29e70dccbbd20e82ULL, 0xc7140f435060d763ULL,
+	0x0f3a9da0e8b0cc3bULL, 0xa2543f574d76408eULL, 0xbd7761e1c175d139ULL,
+	0x4b1f4f737ca3f512ULL, 0x6dc2df1f2fc137abULL, 0xf1d05c3967b14856ULL,
+	0xa742bf3715ed046cULL, 0x654030141d1697edULL, 0x07b872abda676c7dULL,
+	0x3ce84eba87fa17ecULL, 0xc1fb0403cb79afdfULL, 0x3e46bc7105063f73ULL,
+	0x278ae987121cd678ULL, 0xa1adb4778ef47cd0ULL, 0x26dd906c5362c2b9ULL,
+	0x05168060589b44e2ULL, 0xfbfc41f9d79ac08fULL, 0x0e6de44ba9ced8faULL,
+	0x9feb08068bf243a3ULL, 0x7b341749d06b129bULL, 0x229c69e74a87929aULL,
+	0xe09ee6c4427c011bULL, 0x5692e30e725c4c3aULL, 0xda99a33e5e9f6e4bULL,
+	0x353dd85af453a36bULL, 0x25241b4c90e0fee7ULL, 0x5de987258309d022ULL,
+	0xe230140fc0802984ULL, 0x93281e86a0c0b3c6ULL, 0xf229d719a4337408ULL,
+	0x6f6c2dd4ad3d1f34ULL, 0x8ea5b2fbae3f0aeeULL, 0x8331dd90c473ee4aULL,
+	0x346aa1b1b52db7aaULL, 0xdf8f235e06042aa9ULL, 0xcc6f6b68a1354b7bULL,
+	0x6c95a6f46ebf236aULL, 0x52d31a856bb91c19ULL, 0x1a35ded6d498d555ULL,
+	0xf37eaef2e54d60c9ULL, 0x72e181a9a3c2a61cULL, 0x98537aad51952fdeULL,
+	0x16f6c856ffaa2530ULL, 0xd960281e9d1d5215ULL, 0x3a0745fa1ce36f50ULL,
+	0x0b7b642bf1559c18ULL, 0x59a87eae9aec8001ULL, 0x5e100c05408bec7cULL,
+	0x0441f98b19e55023ULL, 0xd70dcc5534d38aefULL, 0x927f676de1bea707ULL,
+	0x9769e70db925e3e5ULL, 0x7a636ea29115065aULL, 0x468b201816ef11b6ULL,
+	0xab81a9b73edff409ULL, 0xc0ac7de88a07bb1eULL, 0x1f235eb68c0391b7ULL,
+	0x6056b074458dd30fULL, 0xbe8eeac102f7ed67ULL, 0xcd381283e04b5fbaULL,
+	0x5cbefecec277c4e3ULL, 0xd21b4c356c48ce0dULL, 0x1019c31664b35d8cULL,
+	0x247362a7d19eea26ULL, 0xebe582efb3299d03ULL, 0x02aef2cb82fc289fULL,
+	0x86275df09ce8aaa8ULL, 0x28b07427faac1a43ULL, 0x38a9b7319e1f47cfULL,
+	0xc82e92e3b8d01b58ULL, 0x06ef0b409b1978bcULL, 0x62f842bfc771fb90ULL,
+	0x9904034610eb3b1fULL, 0xded85ab5477a3e68ULL, 0x90d195a663428f98ULL,
+	0x5384636e2ac708d8ULL, 0xcbd719c37b522706ULL, 0xae9729d76644b0ebULL,
+	0x7c8c65e20a0c7ee6ULL, 0x80c856b007f1d214ULL, 0x8c0b40302cc32271ULL,
+	0xdbcedad51fe17a8aULL, 0x740e8ae938dbdea0ULL, 0xa615c6dc549310adULL,
+	0x19cc55f6171ae90bULL, 0x49b1bdb8fe5fdd8dULL, 0xed0a89af2830e5bfULL,
+	0x6a7aadb4f5a65bd6ULL, 0x7e22972988f05679ULL, 0xf952b3325566e810ULL,
+	0x39fecedadf61530eULL, 0x6101c99f04f3c7ceULL, 0x2e5f7f6761b562ffULL,
+	0xf08725d226cf5c97ULL, 0x63af3b54860fef51ULL, 0x8ff2cb10ef411e2fULL,
+	0x884ab9bb35267252ULL, 0x4df04433e7ba8daeULL, 0x9afd8866d3690741ULL,
+	0x66b9bb34de94abb3ULL, 0x9baaf18d92171380ULL, 0x543c11c5f0a064a5ULL,
+	0x17a1b1bdbed431f1ULL, 0xb5f58eeaf3a2717fULL, 0xc355f6c849858740ULL,
+	0xec5df044694ef17eULL, 0xd83751f5dc6346d4ULL, 0xfc4433520dfdacf2ULL,
+	0x0000000000000000ULL, 0x5a51f58e596ebc5fULL, 0x3285aaf12e34cf16ULL,
+	0x8d5c39db6dbd36b0ULL, 0x12b731dde64f7513ULL, 0x94906c2d7aa7dfbbULL,
+	0x302b583aacc8e789ULL, 0x9d45facd090e6b3cULL, 0x2165e2c78905aec4ULL,
+	0x68d45f7f775a7349ULL, 0x189b2c1d5664fdcaULL, 0xe1c99f2f030215daULL,
+	0x6983269436246788ULL, 0x8489af3b1e148237ULL, 0xe94b702431d5b59cULL,
+	0x33d2d31a6f4adbd7ULL, 0xbfd9932a4389f9a6ULL, 0xb0e30e8aab39359dULL,
+	0xd1e2c715afcaf253ULL, 0x150f43763c28196eULL, 0xc4ed846393e2eb3dULL,
+	0x03f98b20c3823c5eULL, 0xfd134ab94c83b833ULL, 0x556b682eb1de7064ULL,
+	0x36c4537a37d19f35ULL, 0x7559f30279a5ca61ULL, 0x799ae58252973a04ULL,
+	0x9c12832648707ffdULL, 0x78cd9c6913e92ec5ULL, 0x1d8dac7d0effb928ULL,
+	0x439da0784e745554ULL, 0x413352b3cc887dcbULL, 0xbacf134a1b12bd44ULL,
+	0x114ebafd25cd494dULL, 0x2f08068c20cb763eULL, 0x76a07822ba27f63fULL,
+	0xeab2fb04f25789c2ULL, 0xe3676de481fe3d45ULL, 0x1b62a73d95e6c194ULL,
+	0x641749ff5c68832cULL, 0xa5ec4dfc97112cf3ULL, 0xf6682e92bdd6242bULL,
+	0x3f11c59a44782bb2ULL, 0x317c21d1edb6f348ULL, 0xd65ab5be75ad9e2eULL,
+	0x6b2dd45fb4d84f17ULL, 0xfaab381296e4d44eULL, 0xd0b5befeeeb4e692ULL,
+	0x0882ef0b32d7a046ULL, 0x512a91a5a83b2047ULL, 0x963e9ee6f85bf724ULL,
+	0x4e09cf132438b1f0ULL, 0x77f701c9fb59e2feULL, 0x7ddb1c094b726a27ULL,
+	0x5f4775ee01f5f8bdULL, 0x9186ec4d223c9b59ULL, 0xfeeac1998f01846dULL,
+	0xac39db1ce4b89874ULL, 0xb75b7c21715e59e0ULL, 0xafc0503c273aa42aULL,
+	0x6e3b543fec430bf5ULL, 0x704f7362213e8e83ULL, 0x58ff0745db9294c0ULL,
+	0x67eec2df9feabf72ULL, 0xa0facd9ccf8a6811ULL, 0xb936986ad890811aULL,
+	0x95c715c63bd9cb7aULL, 0xca8060283a2c33c7ULL, 0x507de84ee9453486ULL,
+	0x85ded6d05f6a96f6ULL, 0x1cdad5964f81ade9ULL, 0xd5a33e9eb62fa270ULL,
+	0x40642b588df6690aULL, 0x7f75eec2c98e42b8ULL, 0x2cf18dace3494a60ULL,
+	0x23cb100c0bf9865bULL, 0xeef3028febb2d9e1ULL, 0x4425d2d394133929ULL,
+	0xaad6d05c7fa1e0c8ULL, 0xad6ea2f7a5c68cb5ULL, 0xc2028f2308fb9381ULL,
+	0x819f2f5b468fc6d5ULL, 0xc5bafd88d29cfffcULL, 0x47dc59f357910577ULL,
+	0x2b49ff07392e261dULL, 0x57c59ae5332258fbULL, 0x73b6f842e2bcb2ddULL,
+	0xcf96e04862b77725ULL, 0x4ca73dd8a6c4996fULL, 0x015779eb417e14c1ULL,
+	0x37932a9176af8bf4ULL
+	}, {
+	0x190a2c9b249df23eULL, 0x2f62f8b62263e1e9ULL, 0x7a7f754740993655ULL,
+	0x330b7ba4d5564d9fULL, 0x4c17a16a46672582ULL, 0xb22f08eb7d05f5b8ULL,
+	0x535f47f40bc148ccULL, 0x3aec5d27d4883037ULL, 0x10ed0a1825438f96ULL,
+	0x516101f72c233d17ULL, 0x13cc6f949fd04eaeULL, 0x739853c441474bfdULL,
+	0x653793d90d3f5b1bULL, 0x5240647b96b0fc2fULL, 0x0c84890ad27623e0ULL,
+	0xd7189b32703aaea3ULL, 0x2685de3523bd9c41ULL, 0x99317c5b11bffefaULL,
+	0x0d9baa854f079703ULL, 0x70b93648fbd48ac5ULL, 0xa80441fce30bc6beULL,
+	0x7287704bdc36ff1eULL, 0xb65384ed33dc1f13ULL, 0xd36417343ee34408ULL,
+	0x39cd38ab6e1bf10fULL, 0x5ab861770a1f3564ULL, 0x0ebacf09f594563bULL,
+	0xd04572b884708530ULL, 0x3cae9722bdb3af47ULL, 0x4a556b6f2f5cbaf2ULL,
+	0xe1704f1f76c4bd74ULL, 0x5ec4ed7144c6dfcfULL, 0x16afc01d4c7810e6ULL,
+	0x283f113cd629ca7aULL, 0xaf59a8761741ed2dULL, 0xeed5a3991e215facULL,
+	0x3bf37ea849f984d4ULL, 0xe413e096a56ce33cULL, 0x2c439d3a98f020d1ULL,
+	0x637559dc6404c46bULL, 0x9e6c95d1e5f5d569ULL, 0x24bb9836045fe99aULL,
+	0x44efa466dac8ecc9ULL, 0xc6eab2a5c80895d6ULL, 0x803b50c035220cc4ULL,
+	0x0321658cba93c138ULL, 0x8f9ebc465dc7ee1cULL, 0xd15a5137190131d3ULL,
+	0x0fa5ec8668e5e2d8ULL, 0x91c979578d1037b1ULL, 0x0642ca05693b9f70ULL,
+	0xefca80168350eb4fULL, 0x38d21b24f36a45ecULL, 0xbeab81e1af73d658ULL,
+	0x8cbfd9cae7542f24ULL, 0xfd19cc0d81f11102ULL, 0x0ac6430fbb4dbc90ULL,
+	0x1d76a09d6a441895ULL, 0x2a01573ff1cbbfa1ULL, 0xb572e161894fde2bULL,
+	0x8124734fa853b827ULL, 0x614b1fdf43e6b1b0ULL, 0x68ac395c4238cc18ULL,
+	0x21d837bfd7f7b7d2ULL, 0x20c714304a860331ULL, 0x5cfaab726324aa14ULL,
+	0x74c5ba4eb50d606eULL, 0xf3a3030474654739ULL, 0x23e671bcf015c209ULL,
+	0x45f087e947b9582aULL, 0xd8bd77b418df4c7bULL, 0xe06f6c90ebb50997ULL,
+	0x0bd96080263c0873ULL, 0x7e03f9410e40dcfeULL, 0xb8e94be4c6484928ULL,
+	0xfb5b0608e8ca8e72ULL, 0x1a2b49179e0e3306ULL, 0x4e29e76961855059ULL,
+	0x4f36c4e6fcf4e4baULL, 0x49740ee395cf7bcaULL, 0xc2963ea386d17f7dULL,
+	0x90d65ad810618352ULL, 0x12d34c1b02a1fa4dULL, 0xfa44258775bb3a91ULL,
+	0x18150f14b9ec46ddULL, 0x1491861e6b9a653dULL, 0x9a1019d7ab2c3fc2ULL,
+	0x3668d42d06fe13d7ULL, 0xdcc1fbb25606a6d0ULL, 0x969490dd795a1c22ULL,
+	0x3549b1a1bc6dd2efULL, 0xc94f5e23a0ed770eULL, 0xb9f6686b5b39fdcbULL,
+	0xc4d4f4a6efeae00dULL, 0xe732851a1fff2204ULL, 0x94aad6de5eb869f9ULL,
+	0x3f8ff2ae07206e7fULL, 0xfe38a9813b62d03aULL, 0xa7a1ad7a8bee2466ULL,
+	0x7b6056c8dde882b6ULL, 0x302a1e286fc58ca7ULL, 0x8da0fa457a259bc7ULL,
+	0xb3302b64e074415bULL, 0x5402ae7eff8b635fULL, 0x08f8050c9cafc94bULL,
+	0xae468bf98a3059ceULL, 0x88c355cca98dc58fULL, 0xb10e6d67c7963480ULL,
+	0xbad70de7e1aa3cf3ULL, 0xbfb4a26e320262bbULL, 0xcb711820870f02d5ULL,
+	0xce12b7a954a75c9dULL, 0x563ce87dd8691684ULL, 0x9f73b65e7884618aULL,
+	0x2b1e74b06cba0b42ULL, 0x47cec1ea605b2df1ULL, 0x1c698312f735ac76ULL,
+	0x5fdbcefed9b76b2cULL, 0x831a354c8fb1cdfcULL, 0x820516c312c0791fULL,
+	0xb74ca762aeadabf0ULL, 0xfc06ef821c80a5e1ULL, 0x5723cbf24518a267ULL,
+	0x9d4df05d5f661451ULL, 0x588627742dfd40bfULL, 0xda8331b73f3d39a0ULL,
+	0x17b0e392d109a405ULL, 0xf965400bcf28fba9ULL, 0x7c3dbf4229a2a925ULL,
+	0x023e460327e275dbULL, 0x6cd0b55a0ce126b3ULL, 0xe62da695828e96e7ULL,
+	0x42ad6e63b3f373b9ULL, 0xe50cc319381d57dfULL, 0xc5cbd729729b54eeULL,
+	0x46d1e265fd2a9912ULL, 0x6428b056904eeff8ULL, 0x8be23040131e04b7ULL,
+	0x6709d5da2add2ec0ULL, 0x075de98af44a2b93ULL, 0x8447dcc67bfbe66fULL,
+	0x6616f655b7ac9a23ULL, 0xd607b8bded4b1a40ULL, 0x0563af89d3a85e48ULL,
+	0x3db1b4ad20c21ba4ULL, 0x11f22997b8323b75ULL, 0x292032b34b587e99ULL,
+	0x7f1cdace9331681dULL, 0x8e819fc9c0b65affULL, 0xa1e3677fe2d5bb16ULL,
+	0xcd33d225ee349da5ULL, 0xd9a2543b85aef898ULL, 0x795e10cbfa0af76dULL,
+	0x25a4bbb9992e5d79ULL, 0x78413344677b438eULL, 0xf0826688cef68601ULL,
+	0xd27b34bba392f0ebULL, 0x551d8df162fad7bcULL, 0x1e57c511d0d7d9adULL,
+	0xdeffbdb171e4d30bULL, 0xf4feea8e802f6caaULL, 0xa480c8f6317de55eULL,
+	0xa0fc44f07fa40ff5ULL, 0x95b5f551c3c9dd1aULL, 0x22f952336d6476eaULL,
+	0x0000000000000000ULL, 0xa6be8ef5169f9085ULL, 0xcc2cf1aa73452946ULL,
+	0x2e7ddb39bf12550aULL, 0xd526dd3157d8db78ULL, 0x486b2d6c08becf29ULL,
+	0x9b0f3a58365d8b21ULL, 0xac78cdfaadd22c15ULL, 0xbc95c7e28891a383ULL,
+	0x6a927f5f65dab9c3ULL, 0xc3891d2c1ba0cb9eULL, 0xeaa92f9f50f8b507ULL,
+	0xcf0d9426c9d6e87eULL, 0xca6e3baf1a7eb636ULL, 0xab25247059980786ULL,
+	0x69b31ad3df4978fbULL, 0xe2512a93cc577c4cULL, 0xff278a0ea61364d9ULL,
+	0x71a615c766a53e26ULL, 0x89dc764334fc716cULL, 0xf87a638452594f4aULL,
+	0xf2bc208be914f3daULL, 0x8766b94ac1682757ULL, 0xbbc82e687cdb8810ULL,
+	0x626a7a53f9757088ULL, 0xa2c202f358467a2eULL, 0x4d0882e5db169161ULL,
+	0x09e7268301de7da8ULL, 0xe897699c771ac0dcULL, 0xc8507dac3d9cc3edULL,
+	0xc0a878a0a1330aa6ULL, 0x978bb352e42ba8c1ULL, 0xe9884a13ea6b743fULL,
+	0x279afdbabecc28a2ULL, 0x047c8c064ed9eaabULL, 0x507e2278b15289f4ULL,
+	0x599904fbb08cf45cULL, 0xbd8ae46d15e01760ULL, 0x31353da7f2b43844ULL,
+	0x8558ff49e68a528cULL, 0x76fbfc4d92ef15b5ULL, 0x3456922e211c660cULL,
+	0x86799ac55c1993b4ULL, 0x3e90d1219a51da9cULL, 0x2d5cbeb505819432ULL,
+	0x982e5fd48cce4a19ULL, 0xdb9c1238a24c8d43ULL, 0xd439febecaa96f9bULL,
+	0x418c0bef0960b281ULL, 0x158ea591f6ebd1deULL, 0x1f48e69e4da66d4eULL,
+	0x8afd13cf8e6fb054ULL, 0xf5e1c9011d5ed849ULL, 0xe34e091c5126c8afULL,
+	0xad67ee7530a398f6ULL, 0x43b24dec2e82c75aULL, 0x75da99c1287cd48dULL,
+	0x92e81cdb3783f689ULL, 0xa3dd217cc537cecdULL, 0x60543c50de970553ULL,
+	0x93f73f54aaf2426aULL, 0xa91b62737e7a725dULL, 0xf19d4507538732e2ULL,
+	0x77e4dfc20f9ea156ULL, 0x7d229ccdb4d31dc6ULL, 0x1b346a98037f87e5ULL,
+	0xedf4c615a4b29e94ULL, 0x4093286094110662ULL, 0xb0114ee85ae78063ULL,
+	0x6ff1d0d6b672e78bULL, 0x6dcf96d591909250ULL, 0xdfe09e3eec9567e8ULL,
+	0x3214582b4827f97cULL, 0xb46dc2ee143e6ac8ULL, 0xf6c0ac8da7cd1971ULL,
+	0xebb60c10cd8901e4ULL, 0xf7df8f023abcad92ULL, 0x9c52d3d2c217a0b2ULL,
+	0x6b8d5cd0f8ab0d20ULL, 0x3777f7a29b8fa734ULL, 0x011f238f9d71b4e3ULL,
+	0xc1b75b2f3c42be45ULL, 0x5de588fdfe551ef7ULL, 0x6eeef3592b035368ULL,
+	0xaa3a07ffc4e9b365ULL, 0xecebe59a39c32a77ULL, 0x5ba742f8976e8187ULL,
+	0x4b4a48e0b22d0e11ULL, 0xddded83dcb771233ULL, 0xa59feb79ac0c51bdULL,
+	0xc7f5912a55792135ULL
+	}, {
+	0x6d6ae04668a9b08aULL, 0x3ab3f04b0be8c743ULL, 0xe51e166b54b3c908ULL,
+	0xbe90a9eb35c2f139ULL, 0xb2c7066637f2bec1ULL, 0xaa6945613392202cULL,
+	0x9a28c36f3b5201ebULL, 0xddce5a93ab536994ULL, 0x0e34133ef6382827ULL,
+	0x52a02ba1ec55048bULL, 0xa2f88f97c4b2a177ULL, 0x8640e513ca2251a5ULL,
+	0xcdf1d36258137622ULL, 0xfe6cb708dedf8ddbULL, 0x8a174a9ec8121e5dULL,
+	0x679896036b81560eULL, 0x59ed033395795feeULL, 0x1dd778ab8b74edafULL,
+	0xee533ef92d9f926dULL, 0x2a8c79baf8a8d8f5ULL, 0x6bcf398e69b119f6ULL,
+	0xe20491742fafdd95ULL, 0x276488e0809c2aecULL, 0xea955b82d88f5cceULL,
+	0x7102c63a99d9e0c4ULL, 0xf9763017a5c39946ULL, 0x429fa2501f151b3dULL,
+	0x4659c72bea05d59eULL, 0x984b7fdccf5a6634ULL, 0xf742232953fbb161ULL,
+	0x3041860e08c021c7ULL, 0x747bfd9616cd9386ULL, 0x4bb1367192312787ULL,
+	0x1b72a1638a6c44d3ULL, 0x4a0e68a6e8359a66ULL, 0x169a5039f258b6caULL,
+	0xb98a2ef44edee5a4ULL, 0xd9083fe85e43a737ULL, 0x967f6ce239624e13ULL,
+	0x8874f62d3c1a7982ULL, 0x3c1629830af06e3fULL, 0x9165ebfd427e5a8eULL,
+	0xb5dd81794ceeaa5cULL, 0x0de8f15a7834f219ULL, 0x70bd98ede3dd5d25ULL,
+	0xaccc9ca9328a8950ULL, 0x56664eda1945ca28ULL, 0x221db34c0f8859aeULL,
+	0x26dbd637fa98970dULL, 0x1acdffb4f068f932ULL, 0x4585254f64090fa0ULL,
+	0x72de245e17d53afaULL, 0x1546b25d7c546cf4ULL, 0x207e0ffffb803e71ULL,
+	0xfaaad2732bcf4378ULL, 0xb462dfae36ea17bdULL, 0xcf926fd1ac1b11fdULL,
+	0xe0672dc7dba7ba4aULL, 0xd3fa49ad5d6b41b3ULL, 0x8ba81449b216a3bcULL,
+	0x14f9ec8a0650d115ULL, 0x40fc1ee3eb1d7ce2ULL, 0x23a2ed9b758ce44fULL,
+	0x782c521b14fddc7eULL, 0x1c68267cf170504eULL, 0xbcf31558c1ca96e6ULL,
+	0xa781b43b4ba6d235ULL, 0xf6fd7dfe29ff0c80ULL, 0xb0a4bad5c3fad91eULL,
+	0xd199f51ea963266cULL, 0x414340349119c103ULL, 0x5405f269ed4dadf7ULL,
+	0xabd61bb649969dcdULL, 0x6813dbeae7bdc3c8ULL, 0x65fb2ab09f8931d1ULL,
+	0xf1e7fae152e3181dULL, 0xc1a67cef5a2339daULL, 0x7a4feea8e0f5bba1ULL,
+	0x1e0b9acf05783791ULL, 0x5b8ebf8061713831ULL, 0x80e53cdbcb3af8d9ULL,
+	0x7e898bd315e57502ULL, 0xc6bcfbf0213f2d47ULL, 0x95a38e86b76e942dULL,
+	0x092e94218d243cbaULL, 0x8339debf453622e7ULL, 0xb11be402b9fe64ffULL,
+	0x57d9100d634177c9ULL, 0xcc4e8db52217cbc3ULL, 0x3b0cae9c71ec7aa2ULL,
+	0xfb158ca451cbfe99ULL, 0x2b33276d82ac6514ULL, 0x01bf5ed77a04bde1ULL,
+	0xc5601994af33f779ULL, 0x75c4a3416cc92e67ULL, 0xf3844652a6eb7fc2ULL,
+	0x3487e375fdd0ef64ULL, 0x18ae430704609eedULL, 0x4d14efb993298efbULL,
+	0x815a620cb13e4538ULL, 0x125c354207487869ULL, 0x9eeea614ce42cf48ULL,
+	0xce2d3106d61fac1cULL, 0xbbe99247bad6827bULL, 0x071a871f7b1c149dULL,
+	0x2e4a1cc10db81656ULL, 0x77a71ff298c149b8ULL, 0x06a5d9c80118a97cULL,
+	0xad73c27e488e34b1ULL, 0x443a7b981e0db241ULL, 0xe3bbcfa355ab6074ULL,
+	0x0af276450328e684ULL, 0x73617a896dd1871bULL, 0x58525de4ef7de20fULL,
+	0xb7be3dcab8e6cd83ULL, 0x19111dd07e64230cULL, 0x842359a03e2a367aULL,
+	0x103f89f1f3401fb6ULL, 0xdc710444d157d475ULL, 0xb835702334da5845ULL,
+	0x4320fc876511a6dcULL, 0xd026abc9d3679b8dULL, 0x17250eee885c0b2bULL,
+	0x90dab52a387ae76fULL, 0x31fed8d972c49c26ULL, 0x89cba8fa461ec463ULL,
+	0x2ff5421677bcabb7ULL, 0x396f122f85e41d7dULL, 0xa09b332430bac6a8ULL,
+	0xc888e8ced7070560ULL, 0xaeaf201ac682ee8fULL, 0x1180d7268944a257ULL,
+	0xf058a43628e7a5fcULL, 0xbd4c4b8fbbce2b07ULL, 0xa1246df34abe7b49ULL,
+	0x7d5569b79be9af3cULL, 0xa9b5a705bd9efa12ULL, 0xdb6b835baa4bc0e8ULL,
+	0x05793bac8f147342ULL, 0x21c1512881848390ULL, 0xfdb0556c50d357e5ULL,
+	0x613d4fcb6a99ff72ULL, 0x03dce2648e0cda3eULL, 0xe949b9e6568386f0ULL,
+	0xfc0f0bbb2ad7ea04ULL, 0x6a70675913b5a417ULL, 0x7f36d5046fe1c8e3ULL,
+	0x0c57af8d02304ff8ULL, 0x32223abdfcc84618ULL, 0x0891caf6f720815bULL,
+	0xa63eeaec31a26fd4ULL, 0x2507345374944d33ULL, 0x49d28ac266394058ULL,
+	0xf5219f9aa7f3d6beULL, 0x2d96fea583b4cc68ULL, 0x5a31e1571b7585d0ULL,
+	0x8ed12fe53d02d0feULL, 0xdfade6205f5b0e4bULL, 0x4cabb16ee92d331aULL,
+	0x04c6657bf510cea3ULL, 0xd73c2cd6a87b8f10ULL, 0xe1d87310a1a307abULL,
+	0x6cd5be9112ad0d6bULL, 0x97c032354366f3f2ULL, 0xd4e0ceb22677552eULL,
+	0x0000000000000000ULL, 0x29509bde76a402cbULL, 0xc27a9e8bd42fe3e4ULL,
+	0x5ef7842cee654b73ULL, 0xaf107ecdbc86536eULL, 0x3fcacbe784fcb401ULL,
+	0xd55f90655c73e8cfULL, 0xe6c2f40fdabf1336ULL, 0xe8f6e7312c873b11ULL,
+	0xeb2a0555a28be12fULL, 0xe4a148bc2eb774e9ULL, 0x9b979db84156bc0aULL,
+	0x6eb60222e6a56ab4ULL, 0x87ffbbc4b026ec44ULL, 0xc703a5275b3b90a6ULL,
+	0x47e699fc9001687fULL, 0x9c8d1aa73a4aa897ULL, 0x7cea3760e1ed12ddULL,
+	0x4ec80ddd1d2554c5ULL, 0x13e36b957d4cc588ULL, 0x5d2b66486069914dULL,
+	0x92b90999cc7280b0ULL, 0x517cc9c56259deb5ULL, 0xc937b619ad03b881ULL,
+	0xec30824ad997f5b2ULL, 0xa45d565fc5aa080bULL, 0xd6837201d27f32f1ULL,
+	0x635ef3789e9198adULL, 0x531f75769651b96aULL, 0x4f77530a6721e924ULL,
+	0x486dd4151c3dfdb9ULL, 0x5f48dafb9461f692ULL, 0x375b011173dc355aULL,
+	0x3da9775470f4d3deULL, 0x8d0dcd81b30e0ac0ULL, 0x36e45fc609d888bbULL,
+	0x55baacbe97491016ULL, 0x8cb29356c90ab721ULL, 0x76184125e2c5f459ULL,
+	0x99f4210bb55edbd5ULL, 0x6f095cf59ca1d755ULL, 0x9f51f8c3b44672a9ULL,
+	0x3538bda287d45285ULL, 0x50c39712185d6354ULL, 0xf23b1885dcefc223ULL,
+	0x79930ccc6ef9619fULL, 0xed8fdc9da3934853ULL, 0xcb540aaa590bdf5eULL,
+	0x5c94389f1a6d2cacULL, 0xe77daad8a0bbaed7ULL, 0x28efc5090ca0bf2aULL,
+	0xbf2ff73c4fc64cd8ULL, 0xb37858b14df60320ULL, 0xf8c96ec0dfc724a7ULL,
+	0x828680683f329f06ULL, 0x941cd051cd6a29ccULL, 0xc3c5c05cae2b5e05ULL,
+	0xb601631dc2e27062ULL, 0xc01922382027843bULL, 0x24b86a840e90f0d2ULL,
+	0xd245177a276ffc52ULL, 0x0f8b4de98c3c95c6ULL, 0x3e759530fef809e0ULL,
+	0x0b4d2892792c5b65ULL, 0xc4df4743d5374a98ULL, 0xa5e20888bfaeb5eaULL,
+	0xba56cc90c0d23f9aULL, 0x38d04cf8ffe0a09cULL, 0x62e1adafe495254cULL,
+	0x0263bcb3f40867dfULL, 0xcaeb547d230f62bfULL, 0x6082111c109d4293ULL,
+	0xdad4dd8cd04f7d09ULL, 0xefec602e579b2f8cULL, 0x1fb4c4187f7c8a70ULL,
+	0xffd3e9dfa4db303aULL, 0x7bf0b07f9af10640ULL, 0xf49ec14dddf76b5fULL,
+	0x8f6e713247066d1fULL, 0x339d646a86ccfbf9ULL, 0x64447467e58d8c30ULL,
+	0x2c29a072f9b07189ULL, 0xd8b7613f24471ad6ULL, 0x6627c8d41185ebefULL,
+	0xa347d140beb61c96ULL, 0xde12b8f7255fb3aaULL, 0x9d324470404e1576ULL,
+	0x9306574eb6763d51ULL, 0xa80af9d2c79a47f3ULL, 0x859c0777442e8b9bULL,
+	0x69ac853d9db97e29ULL
+	}, {
+	0xc3407dfc2de6377eULL, 0x5b9e93eea4256f77ULL, 0xadb58fdd50c845e0ULL,
+	0x5219ff11a75bed86ULL, 0x356b61cfd90b1de9ULL, 0xfb8f406e25abe037ULL,
+	0x7a5a0231c0f60796ULL, 0x9d3cd216e1f5020bULL, 0x0c6550fb6b48d8f3ULL,
+	0xf57508c427ff1c62ULL, 0x4ad35ffa71cb407dULL, 0x6290a2da1666aa6dULL,
+	0xe284ec2349355f9fULL, 0xb3c307c53d7c84ecULL, 0x05e23c0468365a02ULL,
+	0x190bac4d6c9ebfa8ULL, 0x94bbbee9e28b80faULL, 0xa34fc777529cb9b5ULL,
+	0xcc7b39f095bcd978ULL, 0x2426addb0ce532e3ULL, 0x7e79329312ce4fc7ULL,
+	0xab09a72eebec2917ULL, 0xf8d15499f6b9d6c2ULL, 0x1a55b8babf8c895dULL,
+	0xdb8add17fb769a85ULL, 0xb57f2f368658e81bULL, 0x8acd36f18f3f41f6ULL,
+	0x5ce3b7bba50f11d3ULL, 0x114dcc14d5ee2f0aULL, 0xb91a7fcded1030e8ULL,
+	0x81d5425fe55de7a1ULL, 0xb6213bc1554adeeeULL, 0x80144ef95f53f5f2ULL,
+	0x1e7688186db4c10cULL, 0x3b912965db5fe1bcULL, 0xc281715a97e8252dULL,
+	0x54a5d7e21c7f8171ULL, 0x4b12535ccbc5522eULL, 0x1d289cefbea6f7f9ULL,
+	0x6ef5f2217d2e729eULL, 0xe6a7dc819b0d17ceULL, 0x1b94b41c05829b0eULL,
+	0x33d7493c622f711eULL, 0xdcf7f942fa5ce421ULL, 0x600fba8b7f7a8ecbULL,
+	0x46b60f011a83988eULL, 0x235b898e0dcf4c47ULL, 0x957ab24f588592a9ULL,
+	0x4354330572b5c28cULL, 0xa5f3ef84e9b8d542ULL, 0x8c711e02341b2d01ULL,
+	0x0b1874ae6a62a657ULL, 0x1213d8e306fc19ffULL, 0xfe6d7c6a4d9dba35ULL,
+	0x65ed868f174cd4c9ULL, 0x88522ea0e6236550ULL, 0x899322065c2d7703ULL,
+	0xc01e690bfef4018bULL, 0x915982ed8abddaf8ULL, 0xbe675b98ec3a4e4cULL,
+	0xa996bf7f82f00db1ULL, 0xe1daf8d49a27696aULL, 0x2effd5d3dc8986e7ULL,
+	0xd153a51f2b1a2e81ULL, 0x18caa0ebd690adfbULL, 0x390e3134b243c51aULL,
+	0x2778b92cdff70416ULL, 0x029f1851691c24a6ULL, 0x5e7cafeacc133575ULL,
+	0xfa4e4cc89fa5f264ULL, 0x5a5f9f481e2b7d24ULL, 0x484c47ab18d764dbULL,
+	0x400a27f2a1a7f479ULL, 0xaeeb9b2a83da7315ULL, 0x721c626879869734ULL,
+	0x042330a2d2384851ULL, 0x85f672fd3765aff0ULL, 0xba446b3a3e02061dULL,
+	0x73dd6ecec3888567ULL, 0xffac70ccf793a866ULL, 0xdfa9edb5294ed2d4ULL,
+	0x6c6aea7014325638ULL, 0x834a5a0e8c41c307ULL, 0xcdba35562fb2cb2bULL,
+	0x0ad97808d06cb404ULL, 0x0f3b440cb85aee06ULL, 0xe5f9c876481f213bULL,
+	0x98deee1289c35809ULL, 0x59018bbfcd394bd1ULL, 0xe01bf47220297b39ULL,
+	0xde68e1139340c087ULL, 0x9fa3ca4788e926adULL, 0xbb85679c840c144eULL,
+	0x53d8f3b71d55ffd5ULL, 0x0da45c5dd146caa0ULL, 0x6f34fe87c72060cdULL,
+	0x57fbc315cf6db784ULL, 0xcee421a1fca0fddeULL, 0x3d2d0196607b8d4bULL,
+	0x642c8a29ad42c69aULL, 0x14aff010bdd87508ULL, 0xac74837beac657b3ULL,
+	0x3216459ad821634dULL, 0x3fb219c70967a9edULL, 0x06bc28f3bb246cf7ULL,
+	0xf2082c9126d562c6ULL, 0x66b39278c45ee23cULL, 0xbd394f6f3f2878b9ULL,
+	0xfd33689d9e8f8cc0ULL, 0x37f4799eb017394fULL, 0x108cc0b26fe03d59ULL,
+	0xda4bd1b1417888d6ULL, 0xb09d1332ee6eb219ULL, 0x2f3ed975668794b4ULL,
+	0x58c0871977375982ULL, 0x7561463d78ace990ULL, 0x09876cff037e82f1ULL,
+	0x7fb83e35a8c05d94ULL, 0x26b9b58a65f91645ULL, 0xef20b07e9873953fULL,
+	0x3148516d0b3355b8ULL, 0x41cb2b541ba9e62aULL, 0x790416c613e43163ULL,
+	0xa011d380818e8f40ULL, 0x3a5025c36151f3efULL, 0xd57095bdf92266d0ULL,
+	0x498d4b0da2d97688ULL, 0x8b0c3a57353153a5ULL, 0x21c491df64d368e1ULL,
+	0x8f2f0af5e7091bf4ULL, 0x2da1c1240f9bb012ULL, 0xc43d59a92ccc49daULL,
+	0xbfa6573e56345c1fULL, 0x828b56a8364fd154ULL, 0x9a41f643e0df7cafULL,
+	0xbcf843c985266aeaULL, 0x2b1de9d7b4bfdce5ULL, 0x20059d79dedd7ab2ULL,
+	0x6dabe6d6ae3c446bULL, 0x45e81bf6c991ae7bULL, 0x6351ae7cac68b83eULL,
+	0xa432e32253b6c711ULL, 0xd092a9b991143cd2ULL, 0xcac711032e98b58fULL,
+	0xd8d4c9e02864ac70ULL, 0xc5fc550f96c25b89ULL, 0xd7ef8dec903e4276ULL,
+	0x67729ede7e50f06fULL, 0xeac28c7af045cf3dULL, 0xb15c1f945460a04aULL,
+	0x9cfddeb05bfb1058ULL, 0x93c69abce3a1fe5eULL, 0xeb0380dc4a4bdd6eULL,
+	0xd20db1e8f8081874ULL, 0x229a8528b7c15e14ULL, 0x44291750739fbc28ULL,
+	0xd3ccbd4e42060a27ULL, 0xf62b1c33f4ed2a97ULL, 0x86a8660ae4779905ULL,
+	0xd62e814a2a305025ULL, 0x477703a7a08d8addULL, 0x7b9b0e977af815c5ULL,
+	0x78c51a60a9ea2330ULL, 0xa6adfb733aaae3b7ULL, 0x97e5aa1e3199b60fULL,
+	0x0000000000000000ULL, 0xf4b404629df10e31ULL, 0x5564db44a6719322ULL,
+	0x9207961a59afec0dULL, 0x9624a6b88b97a45cULL, 0x363575380a192b1cULL,
+	0x2c60cd82b595a241ULL, 0x7d272664c1dc7932ULL, 0x7142769faa94a1c1ULL,
+	0xa1d0df263b809d13ULL, 0x1630e841d4c451aeULL, 0xc1df65ad44fa13d8ULL,
+	0x13d2d445bcf20bacULL, 0xd915c546926abe23ULL, 0x38cf3d92084dd749ULL,
+	0xe766d0272103059dULL, 0xc7634d5effde7f2fULL, 0x077d2455012a7ea4ULL,
+	0xedbfa82ff16fb199ULL, 0xaf2a978c39d46146ULL, 0x42953fa3c8bbd0dfULL,
+	0xcb061da59496a7dcULL, 0x25e7a17db6eb20b0ULL, 0x34aa6d6963050fbaULL,
+	0xa76cf7d580a4f1e4ULL, 0xf7ea10954ee338c4ULL, 0xfcf2643b24819e93ULL,
+	0xcf252d0746aeef8dULL, 0x4ef06f58a3f3082cULL, 0x563acfb37563a5d7ULL,
+	0x5086e740ce47c920ULL, 0x2982f186dda3f843ULL, 0x87696aac5e798b56ULL,
+	0x5d22bb1d1f010380ULL, 0x035e14f7d31236f5ULL, 0x3cec0d30da759f18ULL,
+	0xf3c920379cdb7095ULL, 0xb8db736b571e22bbULL, 0xdd36f5e44052f672ULL,
+	0xaac8ab8851e23b44ULL, 0xa857b3d938fe1fe2ULL, 0x17f1e4e76eca43fdULL,
+	0xec7ea4894b61a3caULL, 0x9e62c6e132e734feULL, 0xd4b1991b432c7483ULL,
+	0x6ad6c283af163acfULL, 0x1ce9904904a8e5aaULL, 0x5fbda34c761d2726ULL,
+	0xf910583f4cb7c491ULL, 0xc6a241f845d06d7cULL, 0x4f3163fe19fd1a7fULL,
+	0xe99c988d2357f9c8ULL, 0x8eee06535d0709a7ULL, 0x0efa48aa0254fc55ULL,
+	0xb4be23903c56fa48ULL, 0x763f52caabbedf65ULL, 0xeee1bcd8227d876cULL,
+	0xe345e085f33b4dccULL, 0x3e731561b369bbbeULL, 0x2843fd2067adea10ULL,
+	0x2adce5710eb1ceb6ULL, 0xb7e03767ef44ccbdULL, 0x8db012a48e153f52ULL,
+	0x61ceb62dc5749c98ULL, 0xe85d942b9959eb9bULL, 0x4c6f7709caef2c8aULL,
+	0x84377e5b8d6bbda3ULL, 0x30895dcbb13d47ebULL, 0x74a04a9bc2a2fbc3ULL,
+	0x6b17ce251518289cULL, 0xe438c4d0f2113368ULL, 0x1fb784bed7bad35fULL,
+	0x9b80fae55ad16efcULL, 0x77fe5e6c11b0cd36ULL, 0xc858095247849129ULL,
+	0x08466059b97090a2ULL, 0x01c10ca6ba0e1253ULL, 0x6988d6747c040c3aULL,
+	0x6849dad2c60a1e69ULL, 0x5147ebe67449db73ULL, 0xc99905f4fd8a837aULL,
+	0x991fe2b433cd4a5aULL, 0xf09734c04fc94660ULL, 0xa28ecbd1e892abe6ULL,
+	0xf1563866f5c75433ULL, 0x4dae7baf70e13ed9ULL, 0x7ce62ac27bd26b61ULL,
+	0x70837a39109ab392ULL, 0x90988e4b30b3c8abULL, 0xb2020b63877296bfULL,
+	0x156efcb607d6675bULL
+	}, {
+	0xe63f55ce97c331d0ULL, 0x25b506b0015bba16ULL, 0xc8706e29e6ad9ba8ULL,
+	0x5b43d3775d521f6aULL, 0x0bfa3d577035106eULL, 0xab95fc172afb0e66ULL,
+	0xf64b63979e7a3276ULL, 0xf58b4562649dad4bULL, 0x48f7c3dbae0c83f1ULL,
+	0xff31916642f5c8c5ULL, 0xcbb048dc1c4a0495ULL, 0x66b8f83cdf622989ULL,
+	0x35c130e908e2b9b0ULL, 0x7c761a61f0b34fa1ULL, 0x3601161cf205268dULL,
+	0x9e54ccfe2219b7d6ULL, 0x8b7d90a538940837ULL, 0x9cd403588ea35d0bULL,
+	0xbc3c6fea9ccc5b5aULL, 0xe5ff733b6d24aeedULL, 0xceed22de0f7eb8d2ULL,
+	0xec8581cab1ab545eULL, 0xb96105e88ff8e71dULL, 0x8ca03501871a5eadULL,
+	0x76ccce65d6db2a2fULL, 0x5883f582a7b58057ULL, 0x3f7be4ed2e8adc3eULL,
+	0x0fe7be06355cd9c9ULL, 0xee054e6c1d11be83ULL, 0x1074365909b903a6ULL,
+	0x5dde9f80b4813c10ULL, 0x4a770c7d02b6692cULL, 0x5379c8d5d7809039ULL,
+	0xb4067448161ed409ULL, 0x5f5e5026183bd6cdULL, 0xe898029bf4c29df9ULL,
+	0x7fb63c940a54d09cULL, 0xc5171f897f4ba8bcULL, 0xa6f28db7b31d3d72ULL,
+	0x2e4f3be7716eaa78ULL, 0x0d6771a099e63314ULL, 0x82076254e41bf284ULL,
+	0x2f0fd2b42733df98ULL, 0x5c9e76d3e2dc49f0ULL, 0x7aeb569619606cdbULL,
+	0x83478b07b2468764ULL, 0xcfadcb8d5923cd32ULL, 0x85dac7f05b95a41eULL,
+	0xb5469d1b4043a1e9ULL, 0xb821ecbbd9a592fdULL, 0x1b8e0b0e798c13c8ULL,
+	0x62a57b6d9a0be02eULL, 0xfcf1b793b81257f8ULL, 0x9d94ea0bd8fe28ebULL,
+	0x4cea408aeb654a56ULL, 0x23284a47e888996cULL, 0x2d8f1d128b893545ULL,
+	0xf4cbac3132c0d8abULL, 0xbd7c86b9ca912ebaULL, 0x3a268eef3dbe6079ULL,
+	0xf0d62f6077a9110cULL, 0x2735c916ade150cbULL, 0x89fd5f03942ee2eaULL,
+	0x1acee25d2fd16628ULL, 0x90f39bab41181bffULL, 0x430dfe8cde39939fULL,
+	0xf70b8ac4c8274796ULL, 0x1c53aeaac6024552ULL, 0x13b410acf35e9c9bULL,
+	0xa532ab4249faa24fULL, 0x2b1251e5625a163fULL, 0xd7e3e676da4841c7ULL,
+	0xa7b264e4e5404892ULL, 0xda8497d643ae72d3ULL, 0x861ae105a1723b23ULL,
+	0x38a6414991048aa4ULL, 0x6578dec92585b6b4ULL, 0x0280cfa6acbaeaddULL,
+	0x88bdb650c273970aULL, 0x9333bd5ebbff84c2ULL, 0x4e6a8f2c47dfa08bULL,
+	0x321c954db76cef2aULL, 0x418d312a72837942ULL, 0xb29b38bfffcdf773ULL,
+	0x6c022c38f90a4c07ULL, 0x5a033a240b0f6a8aULL, 0x1f93885f3ce5da6fULL,
+	0xc38a537e96988bc6ULL, 0x39e6a81ac759ff44ULL, 0x29929e43cee0fce2ULL,
+	0x40cdd87924de0ca2ULL, 0xe9d8ebc8a29fe819ULL, 0x0c2798f3cfbb46f4ULL,
+	0x55e484223e53b343ULL, 0x4650948ecd0d2fd8ULL, 0x20e86cb2126f0651ULL,
+	0x6d42c56baf5739e7ULL, 0xa06fc1405ace1e08ULL, 0x7babbfc54f3d193bULL,
+	0x424d17df8864e67fULL, 0xd8045870ef14980eULL, 0xc6d7397c85ac3781ULL,
+	0x21a885e1443273b1ULL, 0x67f8116f893f5c69ULL, 0x24f5efe35706cff6ULL,
+	0xd56329d076f2ab1aULL, 0x5e1eb9754e66a32dULL, 0x28d2771098bd8902ULL,
+	0x8f6013f47dfdc190ULL, 0x17a993fdb637553cULL, 0xe0a219397e1012aaULL,
+	0x786b9930b5da8606ULL, 0x6e82e39e55b0a6daULL, 0x875a0856f72f4ec3ULL,
+	0x3741ff4fa458536dULL, 0xac4859b3957558fcULL, 0x7ef6d5c75c09a57cULL,
+	0xc04a758b6c7f14fbULL, 0xf9acdd91ab26ebbfULL, 0x7391a467c5ef9668ULL,
+	0x335c7c1ee1319acaULL, 0xa91533b18641e4bbULL, 0xe4bf9a683b79db0dULL,
+	0x8e20faa72ba0b470ULL, 0x51f907737b3a7ae4ULL, 0x2268a314bed5ec8cULL,
+	0xd944b123b949edeeULL, 0x31dcb3b84d8b7017ULL, 0xd3fe65279f218860ULL,
+	0x097af2f1dc8ffab3ULL, 0x9b09a6fc312d0b91ULL, 0xcc6ded78a3c4520fULL,
+	0x3481d9ba5ebfcc50ULL, 0x4f2a667f1182d56bULL, 0xdfd9fdd4509ace94ULL,
+	0x26752045fbbc252bULL, 0xbffc491f662bc467ULL, 0xdd593272fc202449ULL,
+	0x3cbbc218d46d4303ULL, 0x91b372f817456e1fULL, 0x681faf69bc6385a0ULL,
+	0xb686bbeebaa43ed4ULL, 0x1469b5084cd0ca01ULL, 0x98c98009cbca94acULL,
+	0x6438379a73d8c354ULL, 0xc2caba2dc0c5fe26ULL, 0x3e3b0dbe78d7a9deULL,
+	0x50b9ee202d670f04ULL, 0x4590b27b37eab0e5ULL, 0x6025b4cb36b10af3ULL,
+	0xfb2c1237079c0162ULL, 0xa12f28130c936be8ULL, 0x4b37e52e54eb1cccULL,
+	0x083a1ba28ad28f53ULL, 0xc10a9cd83a22611bULL, 0x9f1425ad7444c236ULL,
+	0x069d4cf7e9d3237aULL, 0xedc56899e7f621beULL, 0x778c273680865fcfULL,
+	0x309c5aeb1bd605f7ULL, 0x8de0dc52d1472b4dULL, 0xf8ec34c2fd7b9e5fULL,
+	0xea18cd3d58787724ULL, 0xaad515447ca67b86ULL, 0x9989695a9d97e14cULL,
+	0x0000000000000000ULL, 0xf196c63321f464ecULL, 0x71116bc169557cb5ULL,
+	0xaf887f466f92c7c1ULL, 0x972e3e0ffe964d65ULL, 0x190ec4a8d536f915ULL,
+	0x95aef1a9522ca7b8ULL, 0xdc19db21aa7d51a9ULL, 0x94ee18fa0471d258ULL,
+	0x8087adf248a11859ULL, 0xc457f6da2916dd5cULL, 0xfa6cfb6451c17482ULL,
+	0xf256e0c6db13fbd1ULL, 0x6a9f60cf10d96f7dULL, 0x4daaa9d9bd383fb6ULL,
+	0x03c026f5fae79f3dULL, 0xde99148706c7bb74ULL, 0x2a52b8b6340763dfULL,
+	0x6fc20acd03edd33aULL, 0xd423c08320afdefaULL, 0xbbe1ca4e23420dc0ULL,
+	0x966ed75ca8cb3885ULL, 0xeb58246e0e2502c4ULL, 0x055d6a021334bc47ULL,
+	0xa47242111fa7d7afULL, 0xe3623fcc84f78d97ULL, 0x81c744a11efc6db9ULL,
+	0xaec8961539cfb221ULL, 0xf31609958d4e8e31ULL, 0x63e5923ecc5695ceULL,
+	0x47107ddd9b505a38ULL, 0xa3afe7b5a0298135ULL, 0x792b7063e387f3e6ULL,
+	0x0140e953565d75e0ULL, 0x12f4f9ffa503e97bULL, 0x750ce8902c3cb512ULL,
+	0xdbc47e8515f30733ULL, 0x1ed3610c6ab8af8fULL, 0x5239218681dde5d9ULL,
+	0xe222d69fd2aaf877ULL, 0xfe71783514a8bd25ULL, 0xcaf0a18f4a177175ULL,
+	0x61655d9860ec7f13ULL, 0xe77fbc9dc19e4430ULL, 0x2ccff441ddd440a5ULL,
+	0x16e97aaee06a20dcULL, 0xa855dae2d01c915bULL, 0x1d1347f9905f30b2ULL,
+	0xb7c652bdecf94b34ULL, 0xd03e43d265c6175dULL, 0xfdb15ec0ee4f2218ULL,
+	0x57644b8492e9599eULL, 0x07dda5a4bf8e569aULL, 0x54a46d71680ec6a3ULL,
+	0x5624a2d7c4b42c7eULL, 0xbebca04c3076b187ULL, 0x7d36f332a6ee3a41ULL,
+	0x3b6667bc6be31599ULL, 0x695f463aea3ef040ULL, 0xad08b0e0c3282d1cULL,
+	0xb15b1e4a052a684eULL, 0x44d05b2861b7c505ULL, 0x15295c5b1a8dbfe1ULL,
+	0x744c01c37a61c0f2ULL, 0x59c31cd1f1e8f5b7ULL, 0xef45a73f4b4ccb63ULL,
+	0x6bdf899c46841a9dULL, 0x3dfb2b4b823036e3ULL, 0xa2ef0ee6f674f4d5ULL,
+	0x184e2dfb836b8cf5ULL, 0x1134df0a5fe47646ULL, 0xbaa1231d751f7820ULL,
+	0xd17eaa81339b62bdULL, 0xb01bf71953771daeULL, 0x849a2ea30dc8d1feULL,
+	0x705182923f080955ULL, 0x0ea757556301ac29ULL, 0x041d83514569c9a7ULL,
+	0x0abad4042668658eULL, 0x49b72a88f851f611ULL, 0x8a3d79f66ec97dd7ULL,
+	0xcd2d042bf59927efULL, 0xc930877ab0f0ee48ULL, 0x9273540deda2f122ULL,
+	0xc797d02fd3f14261ULL, 0xe1e2f06a284d674aULL, 0xd2be8c74c97cfd80ULL,
+	0x9a494faf67707e71ULL, 0xb3dbd1eca9908293ULL, 0x72d14d3493b2e388ULL,
+	0xd6a30f258c153427ULL
+	}
+}; /* Ax */
+
+static void streebog_xor(const struct streebog_uint512 *x,
+			 const struct streebog_uint512 *y,
+			 struct streebog_uint512 *z)
+{
+	z->qword[0] = x->qword[0] ^ y->qword[0];
+	z->qword[1] = x->qword[1] ^ y->qword[1];
+	z->qword[2] = x->qword[2] ^ y->qword[2];
+	z->qword[3] = x->qword[3] ^ y->qword[3];
+	z->qword[4] = x->qword[4] ^ y->qword[4];
+	z->qword[5] = x->qword[5] ^ y->qword[5];
+	z->qword[6] = x->qword[6] ^ y->qword[6];
+	z->qword[7] = x->qword[7] ^ y->qword[7];
+}
+
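+/*
+ * Combined X, S, P, L transform of GOST R 34.11-2012: XOR the two inputs,
+ * then apply the S-box, byte transposition and linear transform in a single
+ * pass through the precomputed Ax lookup tables.
+ */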
+static void streebog_xlps(const struct streebog_uint512 *x,
+			  const struct streebog_uint512 *y,
+			  struct streebog_uint512 *data)
+{
+	u64 r0, r1, r2, r3, r4, r5, r6, r7;
+	int i;
+
+	r0 = le64_to_cpu(x->qword[0] ^ y->qword[0]);
+	r1 = le64_to_cpu(x->qword[1] ^ y->qword[1]);
+	r2 = le64_to_cpu(x->qword[2] ^ y->qword[2]);
+	r3 = le64_to_cpu(x->qword[3] ^ y->qword[3]);
+	r4 = le64_to_cpu(x->qword[4] ^ y->qword[4]);
+	r5 = le64_to_cpu(x->qword[5] ^ y->qword[5]);
+	r6 = le64_to_cpu(x->qword[6] ^ y->qword[6]);
+	r7 = le64_to_cpu(x->qword[7] ^ y->qword[7]);
+
+	for (i = 0; i <= 7; i++) {
+		data->qword[i]  = cpu_to_le64(Ax[0][r0 & 0xFF]);
+		data->qword[i] ^= cpu_to_le64(Ax[1][r1 & 0xFF]);
+		data->qword[i] ^= cpu_to_le64(Ax[2][r2 & 0xFF]);
+		data->qword[i] ^= cpu_to_le64(Ax[3][r3 & 0xFF]);
+		data->qword[i] ^= cpu_to_le64(Ax[4][r4 & 0xFF]);
+		data->qword[i] ^= cpu_to_le64(Ax[5][r5 & 0xFF]);
+		data->qword[i] ^= cpu_to_le64(Ax[6][r6 & 0xFF]);
+		data->qword[i] ^= cpu_to_le64(Ax[7][r7 & 0xFF]);
+		r0 >>= 8;
+		r1 >>= 8;
+		r2 >>= 8;
+		r3 >>= 8;
+		r4 >>= 8;
+		r5 >>= 8;
+		r6 >>= 8;
+		r7 >>= 8;
+	}
+}
+
+static void streebog_round(int i, struct streebog_uint512 *Ki,
+			   struct streebog_uint512 *data)
+{
+	streebog_xlps(Ki, &C[i], Ki);
+	streebog_xlps(Ki, data, data);
+}
+
+static int streebog_init(struct shash_desc *desc)
+{
+	struct streebog_state *ctx = shash_desc_ctx(desc);
+	unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
+	unsigned int i;
+
+	memset(ctx, 0, sizeof(struct streebog_state));
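+	/* IV: all zeroes for Streebog-512, all 0x01 bytes for Streebog-256 */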
+	for (i = 0; i < 8; i++) {
+		if (digest_size == STREEBOG256_DIGEST_SIZE)
+			ctx->h.qword[i] = cpu_to_le64(0x0101010101010101ULL);
+	}
+	return 0;
+}
+
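+/*
+ * Pad the last partial block: a single 0x01 marker byte right after the
+ * message data, followed by zeroes up to the end of the block.
+ */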
+static void streebog_pad(struct streebog_state *ctx)
+{
+	if (ctx->fillsize >= STREEBOG_BLOCK_SIZE)
+		return;
+
+	memset(ctx->buffer + ctx->fillsize, 0,
+	       sizeof(ctx->buffer) - ctx->fillsize);
+
+	ctx->buffer[ctx->fillsize] = 1;
+}
+
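+/* r = x + y (mod 2^512) over little-endian 64-bit limbs */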
+static void streebog_add512(const struct streebog_uint512 *x,
+			    const struct streebog_uint512 *y,
+			    struct streebog_uint512 *r)
+{
+	u64 carry = 0;
+	int i;
+
+	for (i = 0; i < 8; i++) {
+		const u64 left = le64_to_cpu(x->qword[i]);
+		u64 sum;
+
+		sum = left + le64_to_cpu(y->qword[i]) + carry;
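+		/*
+		 * When sum == left, y plus the carry wrapped to exactly 0 or
+		 * 2^64; either way the carry out equals the carry in, so the
+		 * previous carry value is kept unchanged.
+		 */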
+		if (sum != left)
+			carry = (sum < left);
+		r->qword[i] = cpu_to_le64(sum);
+	}
+}
+
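+/* Compression function g_N(h, m) = E(LPS(h ^ N), m) ^ h ^ m */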
+static void streebog_g(struct streebog_uint512 *h,
+		       const struct streebog_uint512 *N,
+		       const u8 *m)
+{
+	struct streebog_uint512 Ki, data;
+	unsigned int i;
+
+	streebog_xlps(h, N, &data);
+
+	/* Starting E() */
+	Ki = data;
+	streebog_xlps(&Ki, (const struct streebog_uint512 *)&m[0], &data);
+
+	for (i = 0; i < 11; i++)
+		streebog_round(i, &Ki, &data);
+
+	streebog_xlps(&Ki, &C[11], &Ki);
+	streebog_xor(&Ki, &data, &data);
+	/* E() done */
+
+	streebog_xor(&data, h, &data);
+	streebog_xor(&data, (const struct streebog_uint512 *)&m[0], h);
+}
+
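+/*
+ * Process one full 64-byte block: update the chaining value h, advance the
+ * message length counter N by one block and add the block into the checksum
+ * Sigma.
+ */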
+static void streebog_stage2(struct streebog_state *ctx, const u8 *data)
+{
+	streebog_g(&ctx->h, &ctx->N, data);
+
+	streebog_add512(&ctx->N, &buffer512, &ctx->N);
+	streebog_add512(&ctx->Sigma, (const struct streebog_uint512 *)data,
+			&ctx->Sigma);
+}
+
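+/*
+ * Finalization: hash the padded last block, then hash the total message
+ * length N and the checksum Sigma with a zero counter.
+ */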
+static void streebog_stage3(struct streebog_state *ctx)
+{
+	struct streebog_uint512 buf = { { 0 } };
+
+	buf.qword[0] = cpu_to_le64(ctx->fillsize << 3);
+	streebog_pad(ctx);
+
+	streebog_g(&ctx->h, &ctx->N, (const u8 *)&ctx->buffer);
+	streebog_add512(&ctx->N, &buf, &ctx->N);
+	streebog_add512(&ctx->Sigma,
+			(const struct streebog_uint512 *)&ctx->buffer[0],
+			&ctx->Sigma);
+	streebog_g(&ctx->h, &buffer0, (const u8 *)&ctx->N);
+	streebog_g(&ctx->h, &buffer0, (const u8 *)&ctx->Sigma);
+	memcpy(&ctx->hash, &ctx->h, sizeof(struct streebog_uint512));
+}
+
+static int streebog_update(struct shash_desc *desc, const u8 *data,
+			   unsigned int len)
+{
+	struct streebog_state *ctx = shash_desc_ctx(desc);
+	size_t chunksize;
+
+	if (ctx->fillsize) {
+		chunksize = STREEBOG_BLOCK_SIZE - ctx->fillsize;
+		if (chunksize > len)
+			chunksize = len;
+		memcpy(&ctx->buffer[ctx->fillsize], data, chunksize);
+		ctx->fillsize += chunksize;
+		len  -= chunksize;
+		data += chunksize;
+
+		if (ctx->fillsize == STREEBOG_BLOCK_SIZE) {
+			streebog_stage2(ctx, ctx->buffer);
+			ctx->fillsize = 0;
+		}
+	}
+
+	while (len >= STREEBOG_BLOCK_SIZE) {
+		streebog_stage2(ctx, data);
+		data += STREEBOG_BLOCK_SIZE;
+		len  -= STREEBOG_BLOCK_SIZE;
+	}
+
+	if (len) {
+		memcpy(&ctx->buffer, data, len);
+		ctx->fillsize = len;
+	}
+	return 0;
+}
+
+static int streebog_final(struct shash_desc *desc, u8 *digest)
+{
+	struct streebog_state *ctx = shash_desc_ctx(desc);
+
+	streebog_stage3(ctx);
+	ctx->fillsize = 0;
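+	/* Streebog-256 is the most significant 256 bits of the 512-bit state */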
+	if (crypto_shash_digestsize(desc->tfm) == STREEBOG256_DIGEST_SIZE)
+		memcpy(digest, &ctx->hash.qword[4], STREEBOG256_DIGEST_SIZE);
+	else
+		memcpy(digest, &ctx->hash.qword[0], STREEBOG512_DIGEST_SIZE);
+	return 0;
+}
+
+static struct shash_alg algs[2] = { {
+	.digestsize	=	STREEBOG256_DIGEST_SIZE,
+	.init		=	streebog_init,
+	.update		=	streebog_update,
+	.final		=	streebog_final,
+	.descsize	=	sizeof(struct streebog_state),
+	.base		=	{
+		.cra_name	 =	"streebog256",
+		.cra_driver_name =	"streebog256-generic",
+		.cra_blocksize	 =	STREEBOG_BLOCK_SIZE,
+		.cra_module	 =	THIS_MODULE,
+	},
+}, {
+	.digestsize	=	STREEBOG512_DIGEST_SIZE,
+	.init		=	streebog_init,
+	.update		=	streebog_update,
+	.final		=	streebog_final,
+	.descsize	=	sizeof(struct streebog_state),
+	.base		=	{
+		.cra_name	 =	"streebog512",
+		.cra_driver_name =	"streebog512-generic",
+		.cra_blocksize	 =	STREEBOG_BLOCK_SIZE,
+		.cra_module	 =	THIS_MODULE,
+	}
+} };
+
+static int __init streebog_mod_init(void)
+{
+	return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit streebog_mod_fini(void)
+{
+	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+}
+
+module_init(streebog_mod_init);
+module_exit(streebog_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Vitaly Chikunov <vt@altlinux.org>");
+MODULE_DESCRIPTION("Streebog Hash Function");
+
+MODULE_ALIAS_CRYPTO("streebog256");
+MODULE_ALIAS_CRYPTO("streebog256-generic");
+MODULE_ALIAS_CRYPTO("streebog512");
+MODULE_ALIAS_CRYPTO("streebog512-generic");
diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index c20c9f5c18f229c3c878528b343e29bd4883d703..e7fb87e114a5b13eee7c12d245731284cf3ca9ba 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -76,10 +76,12 @@ static char *check[] = {
 	"cast6", "arc4", "michael_mic", "deflate", "crc32c", "tea", "xtea",
 	"khazad", "wp512", "wp384", "wp256", "tnepres", "xeta",  "fcrypt",
 	"camellia", "seed", "salsa20", "rmd128", "rmd160", "rmd256", "rmd320",
-	"lzo", "cts", "sha3-224", "sha3-256", "sha3-384", "sha3-512", NULL
+	"lzo", "cts", "sha3-224", "sha3-256", "sha3-384", "sha3-512",
+	"streebog256", "streebog512",
+	NULL
 };
 
-static u32 block_sizes[] = { 16, 64, 256, 1024, 8192, 0 };
+static u32 block_sizes[] = { 16, 64, 256, 1024, 1472, 8192, 0 };
 static u32 aead_sizes[] = { 16, 64, 256, 512, 1024, 2048, 4096, 8192, 0 };
 
 #define XBUFSIZE 8
@@ -1736,6 +1738,7 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
 		ret += tcrypt_test("ctr(aes)");
 		ret += tcrypt_test("rfc3686(ctr(aes))");
 		ret += tcrypt_test("ofb(aes)");
+		ret += tcrypt_test("cfb(aes)");
 		break;
 
 	case 11:
@@ -1913,6 +1916,14 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
 		ret += tcrypt_test("sm3");
 		break;
 
+	case 53:
+		ret += tcrypt_test("streebog256");
+		break;
+
+	case 54:
+		ret += tcrypt_test("streebog512");
+		break;
+
 	case 100:
 		ret += tcrypt_test("hmac(md5)");
 		break;
@@ -1969,6 +1980,14 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
 		ret += tcrypt_test("hmac(sha3-512)");
 		break;
 
+	case 115:
+		ret += tcrypt_test("hmac(streebog256)");
+		break;
+
+	case 116:
+		ret += tcrypt_test("hmac(streebog512)");
+		break;
+
 	case 150:
 		ret += tcrypt_test("ansi_cprng");
 		break;
@@ -2060,6 +2079,10 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
 				speed_template_16_24_32);
 		test_cipher_speed("ctr(aes)", DECRYPT, sec, NULL, 0,
 				speed_template_16_24_32);
+		test_cipher_speed("cfb(aes)", ENCRYPT, sec, NULL, 0,
+				speed_template_16_24_32);
+		test_cipher_speed("cfb(aes)", DECRYPT, sec, NULL, 0,
+				speed_template_16_24_32);
 		break;
 
 	case 201:
@@ -2297,6 +2320,18 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
 		test_cipher_speed("ctr(sm4)", DECRYPT, sec, NULL, 0,
 				speed_template_16);
 		break;
+
+	case 219:
+		test_cipher_speed("adiantum(xchacha12,aes)", ENCRYPT, sec, NULL,
+				  0, speed_template_32);
+		test_cipher_speed("adiantum(xchacha12,aes)", DECRYPT, sec, NULL,
+				  0, speed_template_32);
+		test_cipher_speed("adiantum(xchacha20,aes)", ENCRYPT, sec, NULL,
+				  0, speed_template_32);
+		test_cipher_speed("adiantum(xchacha20,aes)", DECRYPT, sec, NULL,
+				  0, speed_template_32);
+		break;
+
 	case 300:
 		if (alg) {
 			test_hash_speed(alg, sec, generic_hash_speed_template);
@@ -2407,6 +2442,16 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
 		test_hash_speed("sm3", sec, generic_hash_speed_template);
 		if (mode > 300 && mode < 400) break;
 		/* fall through */
+	case 327:
+		test_hash_speed("streebog256", sec,
+				generic_hash_speed_template);
+		if (mode > 300 && mode < 400) break;
+		/* fall through */
+	case 328:
+		test_hash_speed("streebog512", sec,
+				generic_hash_speed_template);
+		if (mode > 300 && mode < 400) break;
+		/* fall through */
 	case 399:
 		break;
 
@@ -2520,6 +2565,16 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
 				    num_mb);
 		if (mode > 400 && mode < 500) break;
 		/* fall through */
+	case 426:
+		test_mb_ahash_speed("streebog256", sec,
+				    generic_hash_speed_template, num_mb);
+		if (mode > 400 && mode < 500) break;
+		/* fall through */
+	case 427:
+		test_mb_ahash_speed("streebog512", sec,
+				    generic_hash_speed_template, num_mb);
+		if (mode > 400 && mode < 500) break;
+		/* fall through */
 	case 499:
 		break;
 
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index b1f79c6bf40965b938b69bd877182788556d6f9e..0f684a414acbedc1334de5c939c17863bbafc998 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -2404,6 +2404,18 @@ static int alg_test_null(const struct alg_test_desc *desc,
 /* Please keep this list sorted by algorithm name. */
 static const struct alg_test_desc alg_test_descs[] = {
 	{
+		.alg = "adiantum(xchacha12,aes)",
+		.test = alg_test_skcipher,
+		.suite = {
+			.cipher = __VECS(adiantum_xchacha12_aes_tv_template)
+		},
+	}, {
+		.alg = "adiantum(xchacha20,aes)",
+		.test = alg_test_skcipher,
+		.suite = {
+			.cipher = __VECS(adiantum_xchacha20_aes_tv_template)
+		},
+	}, {
 		.alg = "aegis128",
 		.test = alg_test_aead,
 		.suite = {
@@ -2690,6 +2702,13 @@ static const struct alg_test_desc alg_test_descs[] = {
 				.dec = __VECS(aes_ccm_dec_tv_template)
 			}
 		}
+	}, {
+		.alg = "cfb(aes)",
+		.test = alg_test_skcipher,
+		.fips_allowed = 1,
+		.suite = {
+			.cipher = __VECS(aes_cfb_tv_template)
+		},
 	}, {
 		.alg = "chacha20",
 		.test = alg_test_skcipher,
@@ -2805,6 +2824,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "cts(cbc(aes))",
 		.test = alg_test_skcipher,
+		.fips_allowed = 1,
 		.suite = {
 			.cipher = __VECS(cts_mode_tv_template)
 		}
@@ -3184,6 +3204,18 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.suite = {
 			.hash = __VECS(hmac_sha512_tv_template)
 		}
+	}, {
+		.alg = "hmac(streebog256)",
+		.test = alg_test_hash,
+		.suite = {
+			.hash = __VECS(hmac_streebog256_tv_template)
+		}
+	}, {
+		.alg = "hmac(streebog512)",
+		.test = alg_test_hash,
+		.suite = {
+			.hash = __VECS(hmac_streebog512_tv_template)
+		}
 	}, {
 		.alg = "jitterentropy_rng",
 		.fips_allowed = 1,
@@ -3291,6 +3323,12 @@ static const struct alg_test_desc alg_test_descs[] = {
 				.dec = __VECS(morus640_dec_tv_template),
 			}
 		}
+	}, {
+		.alg = "nhpoly1305",
+		.test = alg_test_hash,
+		.suite = {
+			.hash = __VECS(nhpoly1305_tv_template)
+		}
 	}, {
 		.alg = "ofb(aes)",
 		.test = alg_test_skcipher,
@@ -3496,6 +3534,18 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.suite = {
 			.hash = __VECS(sm3_tv_template)
 		}
+	}, {
+		.alg = "streebog256",
+		.test = alg_test_hash,
+		.suite = {
+			.hash = __VECS(streebog256_tv_template)
+		}
+	}, {
+		.alg = "streebog512",
+		.test = alg_test_hash,
+		.suite = {
+			.hash = __VECS(streebog512_tv_template)
+		}
 	}, {
 		.alg = "tgr128",
 		.test = alg_test_hash,
@@ -3544,6 +3594,18 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.suite = {
 			.hash = __VECS(aes_xcbc128_tv_template)
 		}
+	}, {
+		.alg = "xchacha12",
+		.test = alg_test_skcipher,
+		.suite = {
+			.cipher = __VECS(xchacha12_tv_template)
+		},
+	}, {
+		.alg = "xchacha20",
+		.test = alg_test_skcipher,
+		.suite = {
+			.cipher = __VECS(xchacha20_tv_template)
+		},
 	}, {
 		.alg = "xts(aes)",
 		.test = alg_test_skcipher,
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 1fe7b97ba03f906e81eb032614e567c74ac9b796..e8f47d7b92cdd933b693a074811ef63a5f5c61d1 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -27,7 +27,7 @@
 #define MAX_DIGEST_SIZE		64
 #define MAX_TAP			8
 
-#define MAX_KEYLEN		160
+#define MAX_KEYLEN		1088
 #define MAX_IVLEN		32
 
 struct hash_testvec {
@@ -35,10 +35,10 @@ struct hash_testvec {
 	const char *key;
 	const char *plaintext;
 	const char *digest;
-	unsigned char tap[MAX_TAP];
+	unsigned short tap[MAX_TAP];
+	unsigned short np;
 	unsigned short psize;
-	unsigned char np;
-	unsigned char ksize;
+	unsigned short ksize;
 };
 
 /*
@@ -2307,6 +2307,122 @@ static const struct hash_testvec crct10dif_tv_template[] = {
 	}
 };
 
+/*
+ * Streebog test vectors from RFC 6986 and GOST R 34.11-2012
+ */
+static const struct hash_testvec streebog256_tv_template[] = {
+	{ /* M1 */
+		.plaintext = "012345678901234567890123456789012345678901234567890123456789012",
+		.psize = 63,
+		.digest =
+			"\x9d\x15\x1e\xef\xd8\x59\x0b\x89"
+			"\xda\xa6\xba\x6c\xb7\x4a\xf9\x27"
+			"\x5d\xd0\x51\x02\x6b\xb1\x49\xa4"
+			"\x52\xfd\x84\xe5\xe5\x7b\x55\x00",
+	},
+	{ /* M2 */
+		.plaintext =
+			"\xd1\xe5\x20\xe2\xe5\xf2\xf0\xe8"
+			"\x2c\x20\xd1\xf2\xf0\xe8\xe1\xee"
+			"\xe6\xe8\x20\xe2\xed\xf3\xf6\xe8"
+			"\x2c\x20\xe2\xe5\xfe\xf2\xfa\x20"
+			"\xf1\x20\xec\xee\xf0\xff\x20\xf1"
+			"\xf2\xf0\xe5\xeb\xe0\xec\xe8\x20"
+			"\xed\xe0\x20\xf5\xf0\xe0\xe1\xf0"
+			"\xfb\xff\x20\xef\xeb\xfa\xea\xfb"
+			"\x20\xc8\xe3\xee\xf0\xe5\xe2\xfb",
+		.psize = 72,
+		.digest =
+			"\x9d\xd2\xfe\x4e\x90\x40\x9e\x5d"
+			"\xa8\x7f\x53\x97\x6d\x74\x05\xb0"
+			"\xc0\xca\xc6\x28\xfc\x66\x9a\x74"
+			"\x1d\x50\x06\x3c\x55\x7e\x8f\x50",
+	},
+};
+
+static const struct hash_testvec streebog512_tv_template[] = {
+	{ /* M1 */
+		.plaintext = "012345678901234567890123456789012345678901234567890123456789012",
+		.psize = 63,
+		.digest =
+			"\x1b\x54\xd0\x1a\x4a\xf5\xb9\xd5"
+			"\xcc\x3d\x86\xd6\x8d\x28\x54\x62"
+			"\xb1\x9a\xbc\x24\x75\x22\x2f\x35"
+			"\xc0\x85\x12\x2b\xe4\xba\x1f\xfa"
+			"\x00\xad\x30\xf8\x76\x7b\x3a\x82"
+			"\x38\x4c\x65\x74\xf0\x24\xc3\x11"
+			"\xe2\xa4\x81\x33\x2b\x08\xef\x7f"
+			"\x41\x79\x78\x91\xc1\x64\x6f\x48",
+	},
+	{ /* M2 */
+		.plaintext =
+			"\xd1\xe5\x20\xe2\xe5\xf2\xf0\xe8"
+			"\x2c\x20\xd1\xf2\xf0\xe8\xe1\xee"
+			"\xe6\xe8\x20\xe2\xed\xf3\xf6\xe8"
+			"\x2c\x20\xe2\xe5\xfe\xf2\xfa\x20"
+			"\xf1\x20\xec\xee\xf0\xff\x20\xf1"
+			"\xf2\xf0\xe5\xeb\xe0\xec\xe8\x20"
+			"\xed\xe0\x20\xf5\xf0\xe0\xe1\xf0"
+			"\xfb\xff\x20\xef\xeb\xfa\xea\xfb"
+			"\x20\xc8\xe3\xee\xf0\xe5\xe2\xfb",
+		.psize = 72,
+		.digest =
+			"\x1e\x88\xe6\x22\x26\xbf\xca\x6f"
+			"\x99\x94\xf1\xf2\xd5\x15\x69\xe0"
+			"\xda\xf8\x47\x5a\x3b\x0f\xe6\x1a"
+			"\x53\x00\xee\xe4\x6d\x96\x13\x76"
+			"\x03\x5f\xe8\x35\x49\xad\xa2\xb8"
+			"\x62\x0f\xcd\x7c\x49\x6c\xe5\xb3"
+			"\x3f\x0c\xb9\xdd\xdc\x2b\x64\x60"
+			"\x14\x3b\x03\xda\xba\xc9\xfb\x28",
+	},
+};
+
+/*
+ * Two HMAC-Streebog test vectors from RFC 7836 and R 50.1.113-2016 A
+ */
+static const struct hash_testvec hmac_streebog256_tv_template[] = {
+	{
+		.key =  "\x00\x01\x02\x03\x04\x05\x06\x07"
+			"\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+			"\x10\x11\x12\x13\x14\x15\x16\x17"
+			"\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f",
+		.ksize  = 32,
+		.plaintext =
+			"\x01\x26\xbd\xb8\x78\x00\xaf\x21"
+			"\x43\x41\x45\x65\x63\x78\x01\x00",
+		.psize  = 16,
+		.digest =
+			"\xa1\xaa\x5f\x7d\xe4\x02\xd7\xb3"
+			"\xd3\x23\xf2\x99\x1c\x8d\x45\x34"
+			"\x01\x31\x37\x01\x0a\x83\x75\x4f"
+			"\xd0\xaf\x6d\x7c\xd4\x92\x2e\xd9",
+	},
+};
+
+static const struct hash_testvec hmac_streebog512_tv_template[] = {
+	{
+		.key =  "\x00\x01\x02\x03\x04\x05\x06\x07"
+			"\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+			"\x10\x11\x12\x13\x14\x15\x16\x17"
+			"\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f",
+		.ksize  = 32,
+		.plaintext =
+			"\x01\x26\xbd\xb8\x78\x00\xaf\x21"
+			"\x43\x41\x45\x65\x63\x78\x01\x00",
+		.psize  = 16,
+		.digest =
+			"\xa5\x9b\xab\x22\xec\xae\x19\xc6"
+			"\x5f\xbd\xe6\xe5\xf4\xe9\xf5\xd8"
+			"\x54\x9d\x31\xf0\x37\xf9\xdf\x9b"
+			"\x90\x55\x00\xe1\x71\x92\x3a\x77"
+			"\x3d\x5f\x15\x30\xf2\xed\x7e\x96"
+			"\x4c\xb2\xee\xdc\x29\xe9\xad\x2f"
+			"\x3a\xfe\x93\xb2\x81\x4f\x79\xf5"
+			"\x00\x0f\xfc\x03\x66\xc2\x51\xe6",
+	},
+};
+
 /* Example vectors below taken from
  * http://www.oscca.gov.cn/UpFile/20101222141857786.pdf
  *
@@ -5593,6 +5709,1238 @@ static const struct hash_testvec poly1305_tv_template[] = {
 	},
 };
 
+/* NHPoly1305 test vectors from https://github.com/google/adiantum */
+static const struct hash_testvec nhpoly1305_tv_template[] = {
+	{
+		.key	= "\xd2\x5d\x4c\xdd\x8d\x2b\x7f\x7a"
+			  "\xd9\xbe\x71\xec\xd1\x83\x52\xe3"
+			  "\xe1\xad\xd7\x5c\x0a\x75\x9d\xec"
+			  "\x1d\x13\x7e\x5d\x71\x07\xc9\xe4"
+			  "\x57\x2d\x44\x68\xcf\xd8\xd6\xc5"
+			  "\x39\x69\x7d\x32\x75\x51\x4f\x7e"
+			  "\xb2\x4c\xc6\x90\x51\x6e\xd9\xd6"
+			  "\xa5\x8b\x2d\xf1\x94\xf9\xf7\x5e"
+			  "\x2c\x84\x7b\x41\x0f\x88\x50\x89"
+			  "\x30\xd9\xa1\x38\x46\x6c\xc0\x4f"
+			  "\xe8\xdf\xdc\x66\xab\x24\x43\x41"
+			  "\x91\x55\x29\x65\x86\x28\x5e\x45"
+			  "\xd5\x2d\xb7\x80\x08\x9a\xc3\xd4"
+			  "\x9a\x77\x0a\xd4\xef\x3e\xe6\x3f"
+			  "\x6f\x2f\x9b\x3a\x7d\x12\x1e\x80"
+			  "\x6c\x44\xa2\x25\xe1\xf6\x60\xe9"
+			  "\x0d\xaf\xc5\x3c\xa5\x79\xae\x64"
+			  "\xbc\xa0\x39\xa3\x4d\x10\xe5\x4d"
+			  "\xd5\xe7\x89\x7a\x13\xee\x06\x78"
+			  "\xdc\xa4\xdc\x14\x27\xe6\x49\x38"
+			  "\xd0\xe0\x45\x25\x36\xc5\xf4\x79"
+			  "\x2e\x9a\x98\x04\xe4\x2b\x46\x52"
+			  "\x7c\x33\xca\xe2\x56\x51\x50\xe2"
+			  "\xa5\x9a\xae\x18\x6a\x13\xf8\xd2"
+			  "\x21\x31\x66\x02\xe2\xda\x8d\x7e"
+			  "\x41\x19\xb2\x61\xee\x48\x8f\xf1"
+			  "\x65\x24\x2e\x1e\x68\xce\x05\xd9"
+			  "\x2a\xcf\xa5\x3a\x57\xdd\x35\x91"
+			  "\x93\x01\xca\x95\xfc\x2b\x36\x04"
+			  "\xe6\x96\x97\x28\xf6\x31\xfe\xa3"
+			  "\x9d\xf6\x6a\x1e\x80\x8d\xdc\xec"
+			  "\xaf\x66\x11\x13\x02\x88\xd5\x27"
+			  "\x33\xb4\x1a\xcd\xa3\xf6\xde\x31"
+			  "\x8e\xc0\x0e\x6c\xd8\x5a\x97\x5e"
+			  "\xdd\xfd\x60\x69\x38\x46\x3f\x90"
+			  "\x5e\x97\xd3\x32\x76\xc7\x82\x49"
+			  "\xfe\xba\x06\x5f\x2f\xa2\xfd\xff"
+			  "\x80\x05\x40\xe4\x33\x03\xfb\x10"
+			  "\xc0\xde\x65\x8c\xc9\x8d\x3a\x9d"
+			  "\xb5\x7b\x36\x4b\xb5\x0c\xcf\x00"
+			  "\x9c\x87\xe4\x49\xad\x90\xda\x4a"
+			  "\xdd\xbd\xff\xe2\x32\x57\xd6\x78"
+			  "\x36\x39\x6c\xd3\x5b\x9b\x88\x59"
+			  "\x2d\xf0\x46\xe4\x13\x0e\x2b\x35"
+			  "\x0d\x0f\x73\x8a\x4f\x26\x84\x75"
+			  "\x88\x3c\xc5\x58\x66\x18\x1a\xb4"
+			  "\x64\x51\x34\x27\x1b\xa4\x11\xc9"
+			  "\x6d\x91\x8a\xfa\x32\x60\x9d\xd7"
+			  "\x87\xe5\xaa\x43\x72\xf8\xda\xd1"
+			  "\x48\x44\x13\x61\xdc\x8c\x76\x17"
+			  "\x0c\x85\x4e\xf3\xdd\xa2\x42\xd2"
+			  "\x74\xc1\x30\x1b\xeb\x35\x31\x29"
+			  "\x5b\xd7\x4c\x94\x46\x35\xa1\x23"
+			  "\x50\xf2\xa2\x8e\x7e\x4f\x23\x4f"
+			  "\x51\xff\xe2\xc9\xa3\x7d\x56\x8b"
+			  "\x41\xf2\xd0\xc5\x57\x7e\x59\xac"
+			  "\xbb\x65\xf3\xfe\xf7\x17\xef\x63"
+			  "\x7c\x6f\x23\xdd\x22\x8e\xed\x84"
+			  "\x0e\x3b\x09\xb3\xf3\xf4\x8f\xcd"
+			  "\x37\xa8\xe1\xa7\x30\xdb\xb1\xa2"
+			  "\x9c\xa2\xdf\x34\x17\x3e\x68\x44"
+			  "\xd0\xde\x03\x50\xd1\x48\x6b\x20"
+			  "\xe2\x63\x45\xa5\xea\x87\xc2\x42"
+			  "\x95\x03\x49\x05\xed\xe0\x90\x29"
+			  "\x1a\xb8\xcf\x9b\x43\xcf\x29\x7a"
+			  "\x63\x17\x41\x9f\xe0\xc9\x10\xfd"
+			  "\x2c\x56\x8c\x08\x55\xb4\xa9\x27"
+			  "\x0f\x23\xb1\x05\x6a\x12\x46\xc7"
+			  "\xe1\xfe\x28\x93\x93\xd7\x2f\xdc"
+			  "\x98\x30\xdb\x75\x8a\xbe\x97\x7a"
+			  "\x02\xfb\x8c\xba\xbe\x25\x09\xbe"
+			  "\xce\xcb\xa2\xef\x79\x4d\x0e\x9d"
+			  "\x1b\x9d\xb6\x39\x34\x38\xfa\x07"
+			  "\xec\xe8\xfc\x32\x85\x1d\xf7\x85"
+			  "\x63\xc3\x3c\xc0\x02\x75\xd7\x3f"
+			  "\xb2\x68\x60\x66\x65\x81\xc6\xb1"
+			  "\x42\x65\x4b\x4b\x28\xd7\xc7\xaa"
+			  "\x9b\xd2\xdc\x1b\x01\xe0\x26\x39"
+			  "\x01\xc1\x52\x14\xd1\x3f\xb7\xe6"
+			  "\x61\x41\xc7\x93\xd2\xa2\x67\xc6"
+			  "\xf7\x11\xb5\xf5\xea\xdd\x19\xfb"
+			  "\x4d\x21\x12\xd6\x7d\xf1\x10\xb0"
+			  "\x89\x07\xc7\x5a\x52\x73\x70\x2f"
+			  "\x32\xef\x65\x2b\x12\xb2\xf0\xf5"
+			  "\x20\xe0\x90\x59\x7e\x64\xf1\x4c"
+			  "\x41\xb3\xa5\x91\x08\xe6\x5e\x5f"
+			  "\x05\x56\x76\xb4\xb0\xcd\x70\x53"
+			  "\x10\x48\x9c\xff\xc2\x69\x55\x24"
+			  "\x87\xef\x84\xea\xfb\xa7\xbf\xa0"
+			  "\x91\x04\xad\x4f\x8b\x57\x54\x4b"
+			  "\xb6\xe9\xd1\xac\x37\x2f\x1d\x2e"
+			  "\xab\xa5\xa4\xe8\xff\xfb\xd9\x39"
+			  "\x2f\xb7\xac\xd1\xfe\x0b\x9a\x80"
+			  "\x0f\xb6\xf4\x36\x39\x90\x51\xe3"
+			  "\x0a\x2f\xb6\x45\x76\x89\xcd\x61"
+			  "\xfe\x48\x5f\x75\x1d\x13\x00\x62"
+			  "\x80\x24\x47\xe7\xbc\x37\xd7\xe3"
+			  "\x15\xe8\x68\x22\xaf\x80\x6f\x4b"
+			  "\xa8\x9f\x01\x10\x48\x14\xc3\x02"
+			  "\x52\xd2\xc7\x75\x9b\x52\x6d\x30"
+			  "\xac\x13\x85\xc8\xf7\xa3\x58\x4b"
+			  "\x49\xf7\x1c\x45\x55\x8c\x39\x9a"
+			  "\x99\x6d\x97\x27\x27\xe6\xab\xdd"
+			  "\x2c\x42\x1b\x35\xdd\x9d\x73\xbb"
+			  "\x6c\xf3\x64\xf1\xfb\xb9\xf7\xe6"
+			  "\x4a\x3c\xc0\x92\xc0\x2e\xb7\x1a"
+			  "\xbe\xab\xb3\x5a\xe5\xea\xb1\x48"
+			  "\x58\x13\x53\x90\xfd\xc3\x8e\x54"
+			  "\xf9\x18\x16\x73\xe8\xcb\x6d\x39"
+			  "\x0e\xd7\xe0\xfe\xb6\x9f\x43\x97"
+			  "\xe8\xd0\x85\x56\x83\x3e\x98\x68"
+			  "\x7f\xbd\x95\xa8\x9a\x61\x21\x8f"
+			  "\x06\x98\x34\xa6\xc8\xd6\x1d\xf3"
+			  "\x3d\x43\xa4\x9a\x8c\xe5\xd3\x5a"
+			  "\x32\xa2\x04\x22\xa4\x19\x1a\x46"
+			  "\x42\x7e\x4d\xe5\xe0\xe6\x0e\xca"
+			  "\xd5\x58\x9d\x2c\xaf\xda\x33\x5c"
+			  "\xb0\x79\x9e\xc9\xfc\xca\xf0\x2f"
+			  "\xa8\xb2\x77\xeb\x7a\xa2\xdd\x37"
+			  "\x35\x83\x07\xd6\x02\x1a\xb6\x6c"
+			  "\x24\xe2\x59\x08\x0e\xfd\x3e\x46"
+			  "\xec\x40\x93\xf4\x00\x26\x4f\x2a"
+			  "\xff\x47\x2f\xeb\x02\x92\x26\x5b"
+			  "\x53\x17\xc2\x8d\x2a\xc7\xa3\x1b"
+			  "\xcd\xbc\xa7\xe8\xd1\x76\xe3\x80"
+			  "\x21\xca\x5d\x3b\xe4\x9c\x8f\xa9"
+			  "\x5b\x7f\x29\x7f\x7c\xd8\xed\x6d"
+			  "\x8c\xb2\x86\x85\xe7\x77\xf2\x85"
+			  "\xab\x38\xa9\x9d\xc1\x4e\xc5\x64"
+			  "\x33\x73\x8b\x59\x03\xad\x05\xdf"
+			  "\x25\x98\x31\xde\xef\x13\xf1\x9b"
+			  "\x3c\x91\x9d\x7b\xb1\xfa\xe6\xbf"
+			  "\x5b\xed\xa5\x55\xe6\xea\x6c\x74"
+			  "\xf4\xb9\xe4\x45\x64\x72\x81\xc2"
+			  "\x4c\x28\xd4\xcd\xac\xe2\xde\xf9"
+			  "\xeb\x5c\xeb\x61\x60\x5a\xe5\x28",
+		.ksize	= 1088,
+		.plaintext	= "",
+		.psize	= 0,
+		.digest	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+	}, {
+		.key	= "\x29\x21\x43\xcb\xcb\x13\x07\xde"
+			  "\xbf\x48\xdf\x8a\x7f\xa2\x84\xde"
+			  "\x72\x23\x9d\xf5\xf0\x07\xf2\x4c"
+			  "\x20\x3a\x93\xb9\xcd\x5d\xfe\xcb"
+			  "\x99\x2c\x2b\x58\xc6\x50\x5f\x94"
+			  "\x56\xc3\x7c\x0d\x02\x3f\xb8\x5e"
+			  "\x7b\xc0\x6c\x51\x34\x76\xc0\x0e"
+			  "\xc6\x22\xc8\x9e\x92\xa0\x21\xc9"
+			  "\x85\x5c\x7c\xf8\xe2\x64\x47\xc9"
+			  "\xe4\xa2\x57\x93\xf8\xa2\x69\xcd"
+			  "\x62\x98\x99\xf4\xd7\x7b\x14\xb1"
+			  "\xd8\x05\xff\x04\x15\xc9\xe1\x6e"
+			  "\x9b\xe6\x50\x6b\x0b\x3f\x22\x1f"
+			  "\x08\xde\x0c\x5b\x08\x7e\xc6\x2f"
+			  "\x6c\xed\xd6\xb2\x15\xa4\xb3\xf9"
+			  "\xa7\x46\x38\x2a\xea\x69\xa5\xde"
+			  "\x02\xc3\x96\x89\x4d\x55\x3b\xed"
+			  "\x3d\x3a\x85\x77\xbf\x97\x45\x5c"
+			  "\x9e\x02\x69\xe2\x1b\x68\xbe\x96"
+			  "\xfb\x64\x6f\x0f\xf6\x06\x40\x67"
+			  "\xfa\x04\xe3\x55\xfa\xbe\xa4\x60"
+			  "\xef\x21\x66\x97\xe6\x9d\x5c\x1f"
+			  "\x62\x37\xaa\x31\xde\xe4\x9c\x28"
+			  "\x95\xe0\x22\x86\xf4\x4d\xf3\x07"
+			  "\xfd\x5f\x3a\x54\x2c\x51\x80\x71"
+			  "\xba\x78\x69\x5b\x65\xab\x1f\x81"
+			  "\xed\x3b\xff\x34\xa3\xfb\xbc\x73"
+			  "\x66\x7d\x13\x7f\xdf\x6e\xe2\xe2"
+			  "\xeb\x4f\x6c\xda\x7d\x33\x57\xd0"
+			  "\xd3\x7c\x95\x4f\x33\x58\x21\xc7"
+			  "\xc0\xe5\x6f\x42\x26\xc6\x1f\x5e"
+			  "\x85\x1b\x98\x9a\xa2\x1e\x55\x77"
+			  "\x23\xdf\x81\x5e\x79\x55\x05\xfc"
+			  "\xfb\xda\xee\xba\x5a\xba\xf7\x77"
+			  "\x7f\x0e\xd3\xe1\x37\xfe\x8d\x2b"
+			  "\xd5\x3f\xfb\xd0\xc0\x3c\x0b\x3f"
+			  "\xcf\x3c\x14\xcf\xfb\x46\x72\x4c"
+			  "\x1f\x39\xe2\xda\x03\x71\x6d\x23"
+			  "\xef\x93\xcd\x39\xd9\x37\x80\x4d"
+			  "\x65\x61\xd1\x2c\x03\xa9\x47\x72"
+			  "\x4d\x1e\x0e\x16\x33\x0f\x21\x17"
+			  "\xec\x92\xea\x6f\x37\x22\xa4\xd8"
+			  "\x03\x33\x9e\xd8\x03\x69\x9a\xe8"
+			  "\xb2\x57\xaf\x78\x99\x05\x12\xab"
+			  "\x48\x90\x80\xf0\x12\x9b\x20\x64"
+			  "\x7a\x1d\x47\x5f\xba\x3c\xf9\xc3"
+			  "\x0a\x0d\x8d\xa1\xf9\x1b\x82\x13"
+			  "\x3e\x0d\xec\x0a\x83\xc0\x65\xe1"
+			  "\xe9\x95\xff\x97\xd6\xf2\xe4\xd5"
+			  "\x86\xc0\x1f\x29\x27\x63\xd7\xde"
+			  "\xb7\x0a\x07\x99\x04\x2d\xa3\x89"
+			  "\xa2\x43\xcf\xf3\xe1\x43\xac\x4a"
+			  "\x06\x97\xd0\x05\x4f\x87\xfa\xf9"
+			  "\x9b\xbf\x52\x70\xbd\xbc\x6c\xf3"
+			  "\x03\x13\x60\x41\x28\x09\xec\xcc"
+			  "\xb1\x1a\xec\xd6\xfb\x6f\x2a\x89"
+			  "\x5d\x0b\x53\x9c\x59\xc1\x84\x21"
+			  "\x33\x51\x47\x19\x31\x9c\xd4\x0a"
+			  "\x4d\x04\xec\x50\x90\x61\xbd\xbc"
+			  "\x7e\xc8\xd9\x6c\x98\x1d\x45\x41"
+			  "\x17\x5e\x97\x1c\xc5\xa8\xe8\xea"
+			  "\x46\x58\x53\xf7\x17\xd5\xad\x11"
+			  "\xc8\x54\xf5\x7a\x33\x90\xf5\x19"
+			  "\xba\x36\xb4\xfc\x52\xa5\x72\x3d"
+			  "\x14\xbb\x55\xa7\xe9\xe3\x12\xf7"
+			  "\x1c\x30\xa2\x82\x03\xbf\x53\x91"
+			  "\x2e\x60\x41\x9f\x5b\x69\x39\xf6"
+			  "\x4d\xc8\xf8\x46\x7a\x7f\xa4\x98"
+			  "\x36\xff\x06\xcb\xca\xe7\x33\xf2"
+			  "\xc0\x4a\xf4\x3c\x14\x44\x5f\x6b"
+			  "\x75\xef\x02\x36\x75\x08\x14\xfd"
+			  "\x10\x8e\xa5\x58\xd0\x30\x46\x49"
+			  "\xaf\x3a\xf8\x40\x3d\x35\xdb\x84"
+			  "\x11\x2e\x97\x6a\xb7\x87\x7f\xad"
+			  "\xf1\xfa\xa5\x63\x60\xd8\x5e\xbf"
+			  "\x41\x78\x49\xcf\x77\xbb\x56\xbb"
+			  "\x7d\x01\x67\x05\x22\xc8\x8f\x41"
+			  "\xba\x81\xd2\xca\x2c\x38\xac\x76"
+			  "\x06\xc1\x1a\xc2\xce\xac\x90\x67"
+			  "\x57\x3e\x20\x12\x5b\xd9\x97\x58"
+			  "\x65\x05\xb7\x04\x61\x7e\xd8\x3a"
+			  "\xbf\x55\x3b\x13\xe9\x34\x5a\x37"
+			  "\x36\xcb\x94\x45\xc5\x32\xb3\xa0"
+			  "\x0c\x3e\x49\xc5\xd3\xed\xa7\xf0"
+			  "\x1c\x69\xcc\xea\xcc\x83\xc9\x16"
+			  "\x95\x72\x4b\xf4\x89\xd5\xb9\x10"
+			  "\xf6\x2d\x60\x15\xea\x3c\x06\x66"
+			  "\x9f\x82\xad\x17\xce\xd2\xa4\x48"
+			  "\x7c\x65\xd9\xf8\x02\x4d\x9b\x4c"
+			  "\x89\x06\x3a\x34\x85\x48\x89\x86"
+			  "\xf9\x24\xa9\x54\x72\xdb\x44\x95"
+			  "\xc7\x44\x1c\x19\x11\x4c\x04\xdc"
+			  "\x13\xb9\x67\xc8\xc3\x3a\x6a\x50"
+			  "\xfa\xd1\xfb\xe1\x88\xb6\xf1\xa3"
+			  "\xc5\x3b\xdc\x38\x45\x16\x26\x02"
+			  "\x3b\xb8\x8f\x8b\x58\x7d\x23\x04"
+			  "\x50\x6b\x81\x9f\xae\x66\xac\x6f"
+			  "\xcf\x2a\x9d\xf1\xfd\x1d\x57\x07"
+			  "\xbe\x58\xeb\x77\x0c\xe3\xc2\x19"
+			  "\x14\x74\x1b\x51\x1c\x4f\x41\xf3"
+			  "\x32\x89\xb3\xe7\xde\x62\xf6\x5f"
+			  "\xc7\x6a\x4a\x2a\x5b\x0f\x5f\x87"
+			  "\x9c\x08\xb9\x02\x88\xc8\x29\xb7"
+			  "\x94\x52\xfa\x52\xfe\xaa\x50\x10"
+			  "\xba\x48\x75\x5e\x11\x1b\xe6\x39"
+			  "\xd7\x82\x2c\x87\xf1\x1e\xa4\x38"
+			  "\x72\x3e\x51\xe7\xd8\x3e\x5b\x7b"
+			  "\x31\x16\x89\xba\xd6\xad\x18\x5e"
+			  "\xba\xf8\x12\xb3\xf4\x6c\x47\x30"
+			  "\xc0\x38\x58\xb3\x10\x8d\x58\x5d"
+			  "\xb4\xfb\x19\x7e\x41\xc3\x66\xb8"
+			  "\xd6\x72\x84\xe1\x1a\xc2\x71\x4c"
+			  "\x0d\x4a\x21\x7a\xab\xa2\xc0\x36"
+			  "\x15\xc5\xe9\x46\xd7\x29\x17\x76"
+			  "\x5e\x47\x36\x7f\x72\x05\xa7\xcc"
+			  "\x36\x63\xf9\x47\x7d\xe6\x07\x3c"
+			  "\x8b\x79\x1d\x96\x61\x8d\x90\x65"
+			  "\x7c\xf5\xeb\x4e\x6e\x09\x59\x6d"
+			  "\x62\x50\x1b\x0f\xe0\xdc\x78\xf2"
+			  "\x5b\x83\x1a\xa1\x11\x75\xfd\x18"
+			  "\xd7\xe2\x8d\x65\x14\x21\xce\xbe"
+			  "\xb5\x87\xe3\x0a\xda\x24\x0a\x64"
+			  "\xa9\x9f\x03\x8d\x46\x5d\x24\x1a"
+			  "\x8a\x0c\x42\x01\xca\xb1\x5f\x7c"
+			  "\xa5\xac\x32\x4a\xb8\x07\x91\x18"
+			  "\x6f\xb0\x71\x3c\xc9\xb1\xa8\xf8"
+			  "\x5f\x69\xa5\xa1\xca\x9e\x7a\xaa"
+			  "\xac\xe9\xc7\x47\x41\x75\x25\xc3"
+			  "\x73\xe2\x0b\xdd\x6d\x52\x71\xbe"
+			  "\xc5\xdc\xb4\xe7\x01\x26\x53\x77"
+			  "\x86\x90\x85\x68\x6b\x7b\x03\x53"
+			  "\xda\x52\x52\x51\x68\xc8\xf3\xec"
+			  "\x6c\xd5\x03\x7a\xa3\x0e\xb4\x02"
+			  "\x5f\x1a\xab\xee\xca\x67\x29\x7b"
+			  "\xbd\x96\x59\xb3\x8b\x32\x7a\x92"
+			  "\x9f\xd8\x25\x2b\xdf\xc0\x4c\xda",
+		.ksize	= 1088,
+		.plaintext	= "\xbc\xda\x81\xa8\x78\x79\x1c\xbf"
+			  "\x77\x53\xba\x4c\x30\x5b\xb8\x33",
+		.psize	= 16,
+		.digest	= "\x04\xbf\x7f\x6a\xce\x72\xea\x6a"
+			  "\x79\xdb\xb0\xc9\x60\xf6\x12\xcc",
+		.np	= 6,
+		.tap	= { 4, 4, 1, 1, 1, 5 },
+	}, {
+		.key	= "\x65\x4d\xe3\xf8\xd2\x4c\xac\x28"
+			  "\x68\xf5\xb3\x81\x71\x4b\xa1\xfa"
+			  "\x04\x0e\xd3\x81\x36\xbe\x0c\x81"
+			  "\x5e\xaf\xbc\x3a\xa4\xc0\x8e\x8b"
+			  "\x55\x63\xd3\x52\x97\x88\xd6\x19"
+			  "\xbc\x96\xdf\x49\xff\x04\x63\xf5"
+			  "\x0c\x11\x13\xaa\x9e\x1f\x5a\xf7"
+			  "\xdd\xbd\x37\x80\xc3\xd0\xbe\xa7"
+			  "\x05\xc8\x3c\x98\x1e\x05\x3c\x84"
+			  "\x39\x61\xc4\xed\xed\x71\x1b\xc4"
+			  "\x74\x45\x2c\xa1\x56\x70\x97\xfd"
+			  "\x44\x18\x07\x7d\xca\x60\x1f\x73"
+			  "\x3b\x6d\x21\xcb\x61\x87\x70\x25"
+			  "\x46\x21\xf1\x1f\x21\x91\x31\x2d"
+			  "\x5d\xcc\xb7\xd1\x84\x3e\x3d\xdb"
+			  "\x03\x53\x2a\x82\xa6\x9a\x95\xbc"
+			  "\x1a\x1e\x0a\x5e\x07\x43\xab\x43"
+			  "\xaf\x92\x82\x06\x91\x04\x09\xf4"
+			  "\x17\x0a\x9a\x2c\x54\xdb\xb8\xf4"
+			  "\xd0\xf0\x10\x66\x24\x8d\xcd\xda"
+			  "\xfe\x0e\x45\x9d\x6f\xc4\x4e\xf4"
+			  "\x96\xaf\x13\xdc\xa9\xd4\x8c\xc4"
+			  "\xc8\x57\x39\x3c\xc2\xd3\x0a\x76"
+			  "\x4a\x1f\x75\x83\x44\xc7\xd1\x39"
+			  "\xd8\xb5\x41\xba\x73\x87\xfa\x96"
+			  "\xc7\x18\x53\xfb\x9b\xda\xa0\x97"
+			  "\x1d\xee\x60\x85\x9e\x14\xc3\xce"
+			  "\xc4\x05\x29\x3b\x95\x30\xa3\xd1"
+			  "\x9f\x82\x6a\x04\xf5\xa7\x75\x57"
+			  "\x82\x04\xfe\x71\x51\x71\xb1\x49"
+			  "\x50\xf8\xe0\x96\xf1\xfa\xa8\x88"
+			  "\x3f\xa0\x86\x20\xd4\x60\x79\x59"
+			  "\x17\x2d\xd1\x09\xf4\xec\x05\x57"
+			  "\xcf\x62\x7e\x0e\x7e\x60\x78\xe6"
+			  "\x08\x60\x29\xd8\xd5\x08\x1a\x24"
+			  "\xc4\x6c\x24\xe7\x92\x08\x3d\x8a"
+			  "\x98\x7a\xcf\x99\x0a\x65\x0e\xdc"
+			  "\x8c\x8a\xbe\x92\x82\x91\xcc\x62"
+			  "\x30\xb6\xf4\x3f\xc6\x8a\x7f\x12"
+			  "\x4a\x8a\x49\xfa\x3f\x5c\xd4\x5a"
+			  "\xa6\x82\xa3\xe6\xaa\x34\x76\xb2"
+			  "\xab\x0a\x30\xef\x6c\x77\x58\x3f"
+			  "\x05\x6b\xcc\x5c\xae\xdc\xd7\xb9"
+			  "\x51\x7e\x8d\x32\x5b\x24\x25\xbe"
+			  "\x2b\x24\x01\xcf\x80\xda\x16\xd8"
+			  "\x90\x72\x2c\xad\x34\x8d\x0c\x74"
+			  "\x02\xcb\xfd\xcf\x6e\xef\x97\xb5"
+			  "\x4c\xf2\x68\xca\xde\x43\x9e\x8a"
+			  "\xc5\x5f\x31\x7f\x14\x71\x38\xec"
+			  "\xbd\x98\xe5\x71\xc4\xb5\xdb\xef"
+			  "\x59\xd2\xca\xc0\xc1\x86\x75\x01"
+			  "\xd4\x15\x0d\x6f\xa4\xf7\x7b\x37"
+			  "\x47\xda\x18\x93\x63\xda\xbe\x9e"
+			  "\x07\xfb\xb2\x83\xd5\xc4\x34\x55"
+			  "\xee\x73\xa1\x42\x96\xf9\x66\x41"
+			  "\xa4\xcc\xd2\x93\x6e\xe1\x0a\xbb"
+			  "\xd2\xdd\x18\x23\xe6\x6b\x98\x0b"
+			  "\x8a\x83\x59\x2c\xc3\xa6\x59\x5b"
+			  "\x01\x22\x59\xf7\xdc\xb0\x87\x7e"
+			  "\xdb\x7d\xf4\x71\x41\xab\xbd\xee"
+			  "\x79\xbe\x3c\x01\x76\x0b\x2d\x0a"
+			  "\x42\xc9\x77\x8c\xbb\x54\x95\x60"
+			  "\x43\x2e\xe0\x17\x52\xbd\x90\xc9"
+			  "\xc2\x2c\xdd\x90\x24\x22\x76\x40"
+			  "\x5c\xb9\x41\xc9\xa1\xd5\xbd\xe3"
+			  "\x44\xe0\xa4\xab\xcc\xb8\xe2\x32"
+			  "\x02\x15\x04\x1f\x8c\xec\x5d\x14"
+			  "\xac\x18\xaa\xef\x6e\x33\x19\x6e"
+			  "\xde\xfe\x19\xdb\xeb\x61\xca\x18"
+			  "\xad\xd8\x3d\xbf\x09\x11\xc7\xa5"
+			  "\x86\x0b\x0f\xe5\x3e\xde\xe8\xd9"
+			  "\x0a\x69\x9e\x4c\x20\xff\xf9\xc5"
+			  "\xfa\xf8\xf3\x7f\xa5\x01\x4b\x5e"
+			  "\x0f\xf0\x3b\x68\xf0\x46\x8c\x2a"
+			  "\x7a\xc1\x8f\xa0\xfe\x6a\x5b\x44"
+			  "\x70\x5c\xcc\x92\x2c\x6f\x0f\xbd"
+			  "\x25\x3e\xb7\x8e\x73\x58\xda\xc9"
+			  "\xa5\xaa\x9e\xf3\x9b\xfd\x37\x3e"
+			  "\xe2\x88\xa4\x7b\xc8\x5c\xa8\x93"
+			  "\x0e\xe7\x9a\x9c\x2e\x95\x18\x9f"
+			  "\xc8\x45\x0c\x88\x9e\x53\x4f\x3a"
+			  "\x76\xc1\x35\xfa\x17\xd8\xac\xa0"
+			  "\x0c\x2d\x47\x2e\x4f\x69\x9b\xf7"
+			  "\xd0\xb6\x96\x0c\x19\xb3\x08\x01"
+			  "\x65\x7a\x1f\xc7\x31\x86\xdb\xc8"
+			  "\xc1\x99\x8f\xf8\x08\x4a\x9d\x23"
+			  "\x22\xa8\xcf\x27\x01\x01\x88\x93"
+			  "\x9c\x86\x45\xbd\xe0\x51\xca\x52"
+			  "\x84\xba\xfe\x03\xf7\xda\xc5\xce"
+			  "\x3e\x77\x75\x86\xaf\x84\xc8\x05"
+			  "\x44\x01\x0f\x02\xf3\x58\xb0\x06"
+			  "\x5a\xd7\x12\x30\x8d\xdf\x1f\x1f"
+			  "\x0a\xe6\xd2\xea\xf6\x3a\x7a\x99"
+			  "\x63\xe8\xd2\xc1\x4a\x45\x8b\x40"
+			  "\x4d\x0a\xa9\x76\x92\xb3\xda\x87"
+			  "\x36\x33\xf0\x78\xc3\x2f\x5f\x02"
+			  "\x1a\x6a\x2c\x32\xcd\x76\xbf\xbd"
+			  "\x5a\x26\x20\x28\x8c\x8c\xbc\x52"
+			  "\x3d\x0a\xc9\xcb\xab\xa4\x21\xb0"
+			  "\x54\x40\x81\x44\xc7\xd6\x1c\x11"
+			  "\x44\xc6\x02\x92\x14\x5a\xbf\x1a"
+			  "\x09\x8a\x18\xad\xcd\x64\x3d\x53"
+			  "\x4a\xb6\xa5\x1b\x57\x0e\xef\xe0"
+			  "\x8c\x44\x5f\x7d\xbd\x6c\xfd\x60"
+			  "\xae\x02\x24\xb6\x99\xdd\x8c\xaf"
+			  "\x59\x39\x75\x3c\xd1\x54\x7b\x86"
+			  "\xcc\x99\xd9\x28\x0c\xb0\x94\x62"
+			  "\xf9\x51\xd1\x19\x96\x2d\x66\xf5"
+			  "\x55\xcf\x9e\x59\xe2\x6b\x2c\x08"
+			  "\xc0\x54\x48\x24\x45\xc3\x8c\x73"
+			  "\xea\x27\x6e\x66\x7d\x1d\x0e\x6e"
+			  "\x13\xe8\x56\x65\x3a\xb0\x81\x5c"
+			  "\xf0\xe8\xd8\x00\x6b\xcd\x8f\xad"
+			  "\xdd\x53\xf3\xa4\x6c\x43\xd6\x31"
+			  "\xaf\xd2\x76\x1e\x91\x12\xdb\x3c"
+			  "\x8c\xc2\x81\xf0\x49\xdb\xe2\x6b"
+			  "\x76\x62\x0a\x04\xe4\xaa\x8a\x7c"
+			  "\x08\x0b\x5d\xd0\xee\x1d\xfb\xc4"
+			  "\x02\x75\x42\xd6\xba\xa7\x22\xa8"
+			  "\x47\x29\xb7\x85\x6d\x93\x3a\xdb"
+			  "\x00\x53\x0b\xa2\xeb\xf8\xfe\x01"
+			  "\x6f\x8a\x31\xd6\x17\x05\x6f\x67"
+			  "\x88\x95\x32\xfe\x4f\xa6\x4b\xf8"
+			  "\x03\xe4\xcd\x9a\x18\xe8\x4e\x2d"
+			  "\xf7\x97\x9a\x0c\x7d\x9f\x7e\x44"
+			  "\x69\x51\xe0\x32\x6b\x62\x86\x8f"
+			  "\xa6\x8e\x0b\x21\x96\xe5\xaf\x77"
+			  "\xc0\x83\xdf\xa5\x0e\xd0\xa1\x04"
+			  "\xaf\xc1\x10\xcb\x5a\x40\xe4\xe3"
+			  "\x38\x7e\x07\xe8\x4d\xfa\xed\xc5"
+			  "\xf0\x37\xdf\xbb\x8a\xcf\x3d\xdc"
+			  "\x61\xd2\xc6\x2b\xff\x07\xc9\x2f"
+			  "\x0c\x2d\x5c\x07\xa8\x35\x6a\xfc"
+			  "\xae\x09\x03\x45\x74\x51\x4d\xc4"
+			  "\xb8\x23\x87\x4a\x99\x27\x20\x87"
+			  "\x62\x44\x0a\x4a\xce\x78\x47\x22",
+		.ksize	= 1088,
+		.plaintext	= "\x8e\xb0\x4c\xde\x9c\x4a\x04\x5a"
+			  "\xf6\xa9\x7f\x45\x25\xa5\x7b\x3a"
+			  "\xbc\x4d\x73\x39\x81\xb5\xbd\x3d"
+			  "\x21\x6f\xd7\x37\x50\x3c\x7b\x28"
+			  "\xd1\x03\x3a\x17\xed\x7b\x7c\x2a"
+			  "\x16\xbc\xdf\x19\x89\x52\x71\x31"
+			  "\xb6\xc0\xfd\xb5\xd3\xba\x96\x99"
+			  "\xb6\x34\x0b\xd0\x99\x93\xfc\x1a"
+			  "\x01\x3c\x85\xc6\x9b\x78\x5c\x8b"
+			  "\xfe\xae\xd2\xbf\xb2\x6f\xf9\xed"
+			  "\xc8\x25\x17\xfe\x10\x3b\x7d\xda"
+			  "\xf4\x8d\x35\x4b\x7c\x7b\x82\xe7"
+			  "\xc2\xb3\xee\x60\x4a\x03\x86\xc9"
+			  "\x4e\xb5\xc4\xbe\xd2\xbd\x66\xf1"
+			  "\x13\xf1\x09\xab\x5d\xca\x63\x1f"
+			  "\xfc\xfb\x57\x2a\xfc\xca\x66\xd8"
+			  "\x77\x84\x38\x23\x1d\xac\xd3\xb3"
+			  "\x7a\xad\x4c\x70\xfa\x9c\xc9\x61"
+			  "\xa6\x1b\xba\x33\x4b\x4e\x33\xec"
+			  "\xa0\xa1\x64\x39\x40\x05\x1c\xc2"
+			  "\x3f\x49\x9d\xae\xf2\xc5\xf2\xc5"
+			  "\xfe\xe8\xf4\xc2\xf9\x96\x2d\x28"
+			  "\x92\x30\x44\xbc\xd2\x7f\xe1\x6e"
+			  "\x62\x02\x8f\x3d\x1c\x80\xda\x0e"
+			  "\x6a\x90\x7e\x75\xff\xec\x3e\xc4"
+			  "\xcd\x16\x34\x3b\x05\x6d\x4d\x20"
+			  "\x1c\x7b\xf5\x57\x4f\xfa\x3d\xac"
+			  "\xd0\x13\x55\xe8\xb3\xe1\x1b\x78"
+			  "\x30\xe6\x9f\x84\xd4\x69\xd1\x08"
+			  "\x12\x77\xa7\x4a\xbd\xc0\xf2\xd2"
+			  "\x78\xdd\xa3\x81\x12\xcb\x6c\x14"
+			  "\x90\x61\xe2\x84\xc6\x2b\x16\xcc"
+			  "\x40\x99\x50\x88\x01\x09\x64\x4f"
+			  "\x0a\x80\xbe\x61\xae\x46\xc9\x0a"
+			  "\x5d\xe0\xfb\x72\x7a\x1a\xdd\x61"
+			  "\x63\x20\x05\xa0\x4a\xf0\x60\x69"
+			  "\x7f\x92\xbc\xbf\x4e\x39\x4d\xdd"
+			  "\x74\xd1\xb7\xc0\x5a\x34\xb7\xae"
+			  "\x76\x65\x2e\xbc\x36\xb9\x04\x95"
+			  "\x42\xe9\x6f\xca\x78\xb3\x72\x07"
+			  "\xa3\xba\x02\x94\x67\x4c\xb1\xd7"
+			  "\xe9\x30\x0d\xf0\x3b\xb8\x10\x6d"
+			  "\xea\x2b\x21\xbf\x74\x59\x82\x97"
+			  "\x85\xaa\xf1\xd7\x54\x39\xeb\x05"
+			  "\xbd\xf3\x40\xa0\x97\xe6\x74\xfe"
+			  "\xb4\x82\x5b\xb1\x36\xcb\xe8\x0d"
+			  "\xce\x14\xd9\xdf\xf1\x94\x22\xcd"
+			  "\xd6\x00\xba\x04\x4c\x05\x0c\xc0"
+			  "\xd1\x5a\xeb\x52\xd5\xa8\x8e\xc8"
+			  "\x97\xa1\xaa\xc1\xea\xc1\xbe\x7c"
+			  "\x36\xb3\x36\xa0\xc6\x76\x66\xc5"
+			  "\xe2\xaf\xd6\x5c\xe2\xdb\x2c\xb3"
+			  "\x6c\xb9\x99\x7f\xff\x9f\x03\x24"
+			  "\xe1\x51\x44\x66\xd8\x0c\x5d\x7f"
+			  "\x5c\x85\x22\x2a\xcf\x6d\x79\x28"
+			  "\xab\x98\x01\x72\xfe\x80\x87\x5f"
+			  "\x46\xba\xef\x81\x24\xee\xbf\xb0"
+			  "\x24\x74\xa3\x65\x97\x12\xc4\xaf"
+			  "\x8b\xa0\x39\xda\x8a\x7e\x74\x6e"
+			  "\x1b\x42\xb4\x44\x37\xfc\x59\xfd"
+			  "\x86\xed\xfb\x8c\x66\x33\xda\x63"
+			  "\x75\xeb\xe1\xa4\x85\x4f\x50\x8f"
+			  "\x83\x66\x0d\xd3\x37\xfa\xe6\x9c"
+			  "\x4f\x30\x87\x35\x18\xe3\x0b\xb7"
+			  "\x6e\x64\x54\xcd\x70\xb3\xde\x54"
+			  "\xb7\x1d\xe6\x4c\x4d\x55\x12\x12"
+			  "\xaf\x5f\x7f\x5e\xee\x9d\xe8\x8e"
+			  "\x32\x9d\x4e\x75\xeb\xc6\xdd\xaa"
+			  "\x48\x82\xa4\x3f\x3c\xd7\xd3\xa8"
+			  "\x63\x9e\x64\xfe\xe3\x97\x00\x62"
+			  "\xe5\x40\x5d\xc3\xad\x72\xe1\x28"
+			  "\x18\x50\xb7\x75\xef\xcd\x23\xbf"
+			  "\x3f\xc0\x51\x36\xf8\x41\xc3\x08"
+			  "\xcb\xf1\x8d\x38\x34\xbd\x48\x45"
+			  "\x75\xed\xbc\x65\x7b\xb5\x0c\x9b"
+			  "\xd7\x67\x7d\x27\xb4\xc4\x80\xd7"
+			  "\xa9\xb9\xc7\x4a\x97\xaa\xda\xc8"
+			  "\x3c\x74\xcf\x36\x8f\xe4\x41\xe3"
+			  "\xd4\xd3\x26\xa7\xf3\x23\x9d\x8f"
+			  "\x6c\x20\x05\x32\x3e\xe0\xc3\xc8"
+			  "\x56\x3f\xa7\x09\xb7\xfb\xc7\xf7"
+			  "\xbe\x2a\xdd\x0f\x06\x7b\x0d\xdd"
+			  "\xb0\xb4\x86\x17\xfd\xb9\x04\xe5"
+			  "\xc0\x64\x5d\xad\x2a\x36\x38\xdb"
+			  "\x24\xaf\x5b\xff\xca\xf9\x41\xe8"
+			  "\xf9\x2f\x1e\x5e\xf9\xf5\xd5\xf2"
+			  "\xb2\x88\xca\xc9\xa1\x31\xe2\xe8"
+			  "\x10\x95\x65\xbf\xf1\x11\x61\x7a"
+			  "\x30\x1a\x54\x90\xea\xd2\x30\xf6"
+			  "\xa5\xad\x60\xf9\x4d\x84\x21\x1b"
+			  "\xe4\x42\x22\xc8\x12\x4b\xb0\x58"
+			  "\x3e\x9c\x2d\x32\x95\x0a\x8e\xb0"
+			  "\x0a\x7e\x77\x2f\xe8\x97\x31\x6a"
+			  "\xf5\x59\xb4\x26\xe6\x37\x12\xc9"
+			  "\xcb\xa0\x58\x33\x6f\xd5\x55\x55"
+			  "\x3c\xa1\x33\xb1\x0b\x7e\x2e\xb4"
+			  "\x43\x2a\x84\x39\xf0\x9c\xf4\x69"
+			  "\x4f\x1e\x79\xa6\x15\x1b\x87\xbb"
+			  "\xdb\x9b\xe0\xf1\x0b\xba\xe3\x6e"
+			  "\xcc\x2f\x49\x19\x22\x29\xfc\x71"
+			  "\xbb\x77\x38\x18\x61\xaf\x85\x76"
+			  "\xeb\xd1\x09\xcc\x86\x04\x20\x9a"
+			  "\x66\x53\x2f\x44\x8b\xc6\xa3\xd2"
+			  "\x5f\xc7\x79\x82\x66\xa8\x6e\x75"
+			  "\x7d\x94\xd1\x86\x75\x0f\xa5\x4f"
+			  "\x3c\x7a\x33\xce\xd1\x6e\x9d\x7b"
+			  "\x1f\x91\x37\xb8\x37\x80\xfb\xe0"
+			  "\x52\x26\xd0\x9a\xd4\x48\x02\x41"
+			  "\x05\xe3\x5a\x94\xf1\x65\x61\x19"
+			  "\xb8\x88\x4e\x2b\xea\xba\x8b\x58"
+			  "\x8b\x42\x01\x00\xa8\xfe\x00\x5c"
+			  "\xfe\x1c\xee\x31\x15\x69\xfa\xb3"
+			  "\x9b\x5f\x22\x8e\x0d\x2c\xe3\xa5"
+			  "\x21\xb9\x99\x8a\x8e\x94\x5a\xef"
+			  "\x13\x3e\x99\x96\x79\x6e\xd5\x42"
+			  "\x36\x03\xa9\xe2\xca\x65\x4e\x8a"
+			  "\x8a\x30\xd2\x7d\x74\xe7\xf0\xaa"
+			  "\x23\x26\xdd\xcb\x82\x39\xfc\x9d"
+			  "\x51\x76\x21\x80\xa2\xbe\x93\x03"
+			  "\x47\xb0\xc1\xb6\xdc\x63\xfd\x9f"
+			  "\xca\x9d\xa5\xca\x27\x85\xe2\xd8"
+			  "\x15\x5b\x7e\x14\x7a\xc4\x89\xcc"
+			  "\x74\x14\x4b\x46\xd2\xce\xac\x39"
+			  "\x6b\x6a\x5a\xa4\x0e\xe3\x7b\x15"
+			  "\x94\x4b\x0f\x74\xcb\x0c\x7f\xa9"
+			  "\xbe\x09\x39\xa3\xdd\x56\x5c\xc7"
+			  "\x99\x56\x65\x39\xf4\x0b\x7d\x87"
+			  "\xec\xaa\xe3\x4d\x22\x65\x39\x4e",
+		.psize	= 1024,
+		.digest	= "\x64\x3a\xbc\xc3\x3f\x74\x40\x51"
+			  "\x6e\x56\x01\x1a\x51\xec\x36\xde",
+		.np	= 8,
+		.tap	= { 64, 203, 267, 28, 263, 62, 54, 83 },
+	}, {
+		.key	= "\x1b\x82\x2e\x1b\x17\x23\xb9\x6d"
+			  "\xdc\x9c\xda\x99\x07\xe3\x5f\xd8"
+			  "\xd2\xf8\x43\x80\x8d\x86\x7d\x80"
+			  "\x1a\xd0\xcc\x13\xb9\x11\x05\x3f"
+			  "\x7e\xcf\x7e\x80\x0e\xd8\x25\x48"
+			  "\x8b\xaa\x63\x83\x92\xd0\x72\xf5"
+			  "\x4f\x67\x7e\x50\x18\x25\xa4\xd1"
+			  "\xe0\x7e\x1e\xba\xd8\xa7\x6e\xdb"
+			  "\x1a\xcc\x0d\xfe\x9f\x6d\x22\x35"
+			  "\xe1\xe6\xe0\xa8\x7b\x9c\xb1\x66"
+			  "\xa3\xf8\xff\x4d\x90\x84\x28\xbc"
+			  "\xdc\x19\xc7\x91\x49\xfc\xf6\x33"
+			  "\xc9\x6e\x65\x7f\x28\x6f\x68\x2e"
+			  "\xdf\x1a\x75\xe9\xc2\x0c\x96\xb9"
+			  "\x31\x22\xc4\x07\xc6\x0a\x2f\xfd"
+			  "\x36\x06\x5f\x5c\xc5\xb1\x3a\xf4"
+			  "\x5e\x48\xa4\x45\x2b\x88\xa7\xee"
+			  "\xa9\x8b\x52\xcc\x99\xd9\x2f\xb8"
+			  "\xa4\x58\x0a\x13\xeb\x71\x5a\xfa"
+			  "\xe5\x5e\xbe\xf2\x64\xad\x75\xbc"
+			  "\x0b\x5b\x34\x13\x3b\x23\x13\x9a"
+			  "\x69\x30\x1e\x9a\xb8\x03\xb8\x8b"
+			  "\x3e\x46\x18\x6d\x38\xd9\xb3\xd8"
+			  "\xbf\xf1\xd0\x28\xe6\x51\x57\x80"
+			  "\x5e\x99\xfb\xd0\xce\x1e\x83\xf7"
+			  "\xe9\x07\x5a\x63\xa9\xef\xce\xa5"
+			  "\xfb\x3f\x37\x17\xfc\x0b\x37\x0e"
+			  "\xbb\x4b\x21\x62\xb7\x83\x0e\xa9"
+			  "\x9e\xb0\xc4\xad\x47\xbe\x35\xe7"
+			  "\x51\xb2\xf2\xac\x2b\x65\x7b\x48"
+			  "\xe3\x3f\x5f\xb6\x09\x04\x0c\x58"
+			  "\xce\x99\xa9\x15\x2f\x4e\xc1\xf2"
+			  "\x24\x48\xc0\xd8\x6c\xd3\x76\x17"
+			  "\x83\x5d\xe6\xe3\xfd\x01\x8e\xf7"
+			  "\x42\xa5\x04\x29\x30\xdf\xf9\x00"
+			  "\x4a\xdc\x71\x22\x1a\x33\x15\xb6"
+			  "\xd7\x72\xfb\x9a\xb8\xeb\x2b\x38"
+			  "\xea\xa8\x61\xa8\x90\x11\x9d\x73"
+			  "\x2e\x6c\xce\x81\x54\x5a\x9f\xcd"
+			  "\xcf\xd5\xbd\x26\x5d\x66\xdb\xfb"
+			  "\xdc\x1e\x7c\x10\xfe\x58\x82\x10"
+			  "\x16\x24\x01\xce\x67\x55\x51\xd1"
+			  "\xdd\x6b\x44\xa3\x20\x8e\xa9\xa6"
+			  "\x06\xa8\x29\x77\x6e\x00\x38\x5b"
+			  "\xde\x4d\x58\xd8\x1f\x34\xdf\xf9"
+			  "\x2c\xac\x3e\xad\xfb\x92\x0d\x72"
+			  "\x39\xa4\xac\x44\x10\xc0\x43\xc4"
+			  "\xa4\x77\x3b\xfc\xc4\x0d\x37\xd3"
+			  "\x05\x84\xda\x53\x71\xf8\x80\xd3"
+			  "\x34\x44\xdb\x09\xb4\x2b\x8e\xe3"
+			  "\x00\x75\x50\x9e\x43\x22\x00\x0b"
+			  "\x7c\x70\xab\xd4\x41\xf1\x93\xcd"
+			  "\x25\x2d\x84\x74\xb5\xf2\x92\xcd"
+			  "\x0a\x28\xea\x9a\x49\x02\x96\xcb"
+			  "\x85\x9e\x2f\x33\x03\x86\x1d\xdc"
+			  "\x1d\x31\xd5\xfc\x9d\xaa\xc5\xe9"
+			  "\x9a\xc4\x57\xf5\x35\xed\xf4\x4b"
+			  "\x3d\x34\xc2\x29\x13\x86\x36\x42"
+			  "\x5d\xbf\x90\x86\x13\x77\xe5\xc3"
+			  "\x62\xb4\xfe\x0b\x70\x39\x35\x65"
+			  "\x02\xea\xf6\xce\x57\x0c\xbb\x74"
+			  "\x29\xe3\xfd\x60\x90\xfd\x10\x38"
+			  "\xd5\x4e\x86\xbd\x37\x70\xf0\x97"
+			  "\xa6\xab\x3b\x83\x64\x52\xca\x66"
+			  "\x2f\xf9\xa4\xca\x3a\x55\x6b\xb0"
+			  "\xe8\x3a\x34\xdb\x9e\x48\x50\x2f"
+			  "\x3b\xef\xfd\x08\x2d\x5f\xc1\x37"
+			  "\x5d\xbe\x73\xe4\xd8\xe9\xac\xca"
+			  "\x8a\xaa\x48\x7c\x5c\xf4\xa6\x96"
+			  "\x5f\xfa\x70\xa6\xb7\x8b\x50\xcb"
+			  "\xa6\xf5\xa9\xbd\x7b\x75\x4c\x22"
+			  "\x0b\x19\x40\x2e\xc9\x39\x39\x32"
+			  "\x83\x03\xa8\xa4\x98\xe6\x8e\x16"
+			  "\xb9\xde\x08\xc5\xfc\xbf\xad\x39"
+			  "\xa8\xc7\x93\x6c\x6f\x23\xaf\xc1"
+			  "\xab\xe1\xdf\xbb\x39\xae\x93\x29"
+			  "\x0e\x7d\x80\x8d\x3e\x65\xf3\xfd"
+			  "\x96\x06\x65\x90\xa1\x28\x64\x4b"
+			  "\x69\xf9\xa8\x84\x27\x50\xfc\x87"
+			  "\xf7\xbf\x55\x8e\x56\x13\x58\x7b"
+			  "\x85\xb4\x6a\x72\x0f\x40\xf1\x4f"
+			  "\x83\x81\x1f\x76\xde\x15\x64\x7a"
+			  "\x7a\x80\xe4\xc7\x5e\x63\x01\x91"
+			  "\xd7\x6b\xea\x0b\x9b\xa2\x99\x3b"
+			  "\x6c\x88\xd8\xfd\x59\x3c\x8d\x22"
+			  "\x86\x56\xbe\xab\xa1\x37\x08\x01"
+			  "\x50\x85\x69\x29\xee\x9f\xdf\x21"
+			  "\x3e\x20\x20\xf5\xb0\xbb\x6b\xd0"
+			  "\x9c\x41\x38\xec\x54\x6f\x2d\xbd"
+			  "\x0f\xe1\xbd\xf1\x2b\x6e\x60\x56"
+			  "\x29\xe5\x7a\x70\x1c\xe2\xfc\x97"
+			  "\x82\x68\x67\xd9\x3d\x1f\xfb\xd8"
+			  "\x07\x9f\xbf\x96\x74\xba\x6a\x0e"
+			  "\x10\x48\x20\xd8\x13\x1e\xb5\x44"
+			  "\xf2\xcc\xb1\x8b\xfb\xbb\xec\xd7"
+			  "\x37\x70\x1f\x7c\x55\xd2\x4b\xb9"
+			  "\xfd\x70\x5e\xa3\x91\x73\x63\x52"
+			  "\x13\x47\x5a\x06\xfb\x01\x67\xa5"
+			  "\xc0\xd0\x49\x19\x56\x66\x9a\x77"
+			  "\x64\xaf\x8c\x25\x91\x52\x87\x0e"
+			  "\x18\xf3\x5f\x97\xfd\x71\x13\xf8"
+			  "\x05\xa5\x39\xcc\x65\xd3\xcc\x63"
+			  "\x5b\xdb\x5f\x7e\x5f\x6e\xad\xc4"
+			  "\xf4\xa0\xc5\xc2\x2b\x4d\x97\x38"
+			  "\x4f\xbc\xfa\x33\x17\xb4\x47\xb9"
+			  "\x43\x24\x15\x8d\xd2\xed\x80\x68"
+			  "\x84\xdb\x04\x80\xca\x5e\x6a\x35"
+			  "\x2c\x2c\xe7\xc5\x03\x5f\x54\xb0"
+			  "\x5e\x4f\x1d\x40\x54\x3d\x78\x9a"
+			  "\xac\xda\x80\x27\x4d\x15\x4c\x1a"
+			  "\x6e\x80\xc9\xc4\x3b\x84\x0e\xd9"
+			  "\x2e\x93\x01\x8c\xc3\xc8\x91\x4b"
+			  "\xb3\xaa\x07\x04\x68\x5b\x93\xa5"
+			  "\xe7\xc4\x9d\xe7\x07\xee\xf5\x3b"
+			  "\x40\x89\xcc\x60\x34\x9d\xb4\x06"
+			  "\x1b\xef\x92\xe6\xc1\x2a\x7d\x0f"
+			  "\x81\xaa\x56\xe3\xd7\xed\xa7\xd4"
+			  "\xa7\x3a\x49\xc4\xad\x81\x5c\x83"
+			  "\x55\x8e\x91\x54\xb7\x7d\x65\xa5"
+			  "\x06\x16\xd5\x9a\x16\xc1\xb0\xa2"
+			  "\x06\xd8\x98\x47\x73\x7e\x73\xa0"
+			  "\xb8\x23\xb1\x52\xbf\x68\x74\x5d"
+			  "\x0b\xcb\xfa\x8c\x46\xe3\x24\xe6"
+			  "\xab\xd4\x69\x8d\x8c\xf2\x8a\x59"
+			  "\xbe\x48\x46\x50\x8c\x9a\xe8\xe3"
+			  "\x31\x55\x0a\x06\xed\x4f\xf8\xb7"
+			  "\x4f\xe3\x85\x17\x30\xbd\xd5\x20"
+			  "\xe7\x5b\xb2\x32\xcf\x6b\x16\x44"
+			  "\xd2\xf5\x7e\xd7\xd1\x2f\xee\x64"
+			  "\x3e\x9d\x10\xef\x27\x35\x43\x64"
+			  "\x67\xfb\x7a\x7b\xe0\x62\x31\x9a"
+			  "\x4d\xdf\xa5\xab\xc0\x20\xbb\x01"
+			  "\xe9\x7b\x54\xf1\xde\xb2\x79\x50"
+			  "\x6c\x4b\x91\xdb\x7f\xbb\x50\xc1"
+			  "\x55\x44\x38\x9a\xe0\x9f\xe8\x29"
+			  "\x6f\x15\xf8\x4e\xa6\xec\xa0\x60",
+		.ksize	= 1088,
+		.plaintext	= "\x15\x68\x9e\x2f\xad\x15\x52\xdf"
+			  "\xf0\x42\x62\x24\x2a\x2d\xea\xbf"
+			  "\xc7\xf3\xb4\x1a\xf5\xed\xb2\x08"
+			  "\x15\x60\x1c\x00\x77\xbf\x0b\x0e"
+			  "\xb7\x2c\xcf\x32\x3a\xc7\x01\x77"
+			  "\xef\xa6\x75\xd0\x29\xc7\x68\x20"
+			  "\xb2\x92\x25\xbf\x12\x34\xe9\xa4"
+			  "\xfd\x32\x7b\x3f\x7c\xbd\xa5\x02"
+			  "\x38\x41\xde\xc9\xc1\x09\xd9\xfc"
+			  "\x6e\x78\x22\x83\x18\xf7\x50\x8d"
+			  "\x8f\x9c\x2d\x02\xa5\x30\xac\xff"
+			  "\xea\x63\x2e\x80\x37\x83\xb0\x58"
+			  "\xda\x2f\xef\x21\x55\xba\x7b\xb1"
+			  "\xb6\xed\xf5\xd2\x4d\xaa\x8c\xa9"
+			  "\xdd\xdb\x0f\xb4\xce\xc1\x9a\xb1"
+			  "\xc1\xdc\xbd\xab\x86\xc2\xdf\x0b"
+			  "\xe1\x2c\xf9\xbe\xf6\xd8\xda\x62"
+			  "\x72\xdd\x98\x09\x52\xc0\xc4\xb6"
+			  "\x7b\x17\x5c\xf5\xd8\x4b\x88\xd6"
+			  "\x6b\xbf\x84\x4a\x3f\xf5\x4d\xd2"
+			  "\x94\xe2\x9c\xff\xc7\x3c\xd9\xc8"
+			  "\x37\x38\xbc\x8c\xf3\xe7\xb7\xd0"
+			  "\x1d\x78\xc4\x39\x07\xc8\x5e\x79"
+			  "\xb6\x5a\x90\x5b\x6e\x97\xc9\xd4"
+			  "\x82\x9c\xf3\x83\x7a\xe7\x97\xfc"
+			  "\x1d\xbb\xef\xdb\xce\xe0\x82\xad"
+			  "\xca\x07\x6c\x54\x62\x6f\x81\xe6"
+			  "\x7a\x5a\x96\x6e\x80\x3a\xa2\x37"
+			  "\x6f\xc6\xa4\x29\xc3\x9e\x19\x94"
+			  "\x9f\xb0\x3e\x38\xfb\x3c\x2b\x7d"
+			  "\xaa\xb8\x74\xda\x54\x23\x51\x12"
+			  "\x4b\x96\x36\x8f\x91\x4f\x19\x37"
+			  "\x83\xc9\xdd\xc7\x1a\x32\x2d\xab"
+			  "\xc7\x89\xe2\x07\x47\x6c\xe8\xa6"
+			  "\x70\x6b\x8e\x0c\xda\x5c\x6a\x59"
+			  "\x27\x33\x0e\xe1\xe1\x20\xe8\xc8"
+			  "\xae\xdc\xd0\xe3\x6d\xa8\xa6\x06"
+			  "\x41\xb4\xd4\xd4\xcf\x91\x3e\x06"
+			  "\xb0\x9a\xf7\xf1\xaa\xa6\x23\x92"
+			  "\x10\x86\xf0\x94\xd1\x7c\x2e\x07"
+			  "\x30\xfb\xc5\xd8\xf3\x12\xa9\xe8"
+			  "\x22\x1c\x97\x1a\xad\x96\xb0\xa1"
+			  "\x72\x6a\x6b\xb4\xfd\xf7\xe8\xfa"
+			  "\xe2\x74\xd8\x65\x8d\x35\x17\x4b"
+			  "\x00\x23\x5c\x8c\x70\xad\x71\xa2"
+			  "\xca\xc5\x6c\x59\xbf\xb4\xc0\x6d"
+			  "\x86\x98\x3e\x19\x5a\x90\x92\xb1"
+			  "\x66\x57\x6a\x91\x68\x7c\xbc\xf3"
+			  "\xf1\xdb\x94\xf8\x48\xf1\x36\xd8"
+			  "\x78\xac\x1c\xa9\xcc\xd6\x27\xba"
+			  "\x91\x54\x22\xf5\xe6\x05\x3f\xcc"
+			  "\xc2\x8f\x2c\x3b\x2b\xc3\x2b\x2b"
+			  "\x3b\xb8\xb6\x29\xb7\x2f\x94\xb6"
+			  "\x7b\xfc\x94\x3e\xd0\x7a\x41\x59"
+			  "\x7b\x1f\x9a\x09\xa6\xed\x4a\x82"
+			  "\x9d\x34\x1c\xbd\x4e\x1c\x3a\x66"
+			  "\x80\x74\x0e\x9a\x4f\x55\x54\x47"
+			  "\x16\xba\x2a\x0a\x03\x35\x99\xa3"
+			  "\x5c\x63\x8d\xa2\x72\x8b\x17\x15"
+			  "\x68\x39\x73\xeb\xec\xf2\xe8\xf5"
+			  "\x95\x32\x27\xd6\xc4\xfe\xb0\x51"
+			  "\xd5\x0c\x50\xc5\xcd\x6d\x16\xb3"
+			  "\xa3\x1e\x95\x69\xad\x78\x95\x06"
+			  "\xb9\x46\xf2\x6d\x24\x5a\x99\x76"
+			  "\x73\x6a\x91\xa6\xac\x12\xe1\x28"
+			  "\x79\xbc\x08\x4e\x97\x00\x98\x63"
+			  "\x07\x1c\x4e\xd1\x68\xf3\xb3\x81"
+			  "\xa8\xa6\x5f\xf1\x01\xc9\xc1\xaf"
+			  "\x3a\x96\xf9\x9d\xb5\x5a\x5f\x8f"
+			  "\x7e\xc1\x7e\x77\x0a\x40\xc8\x8e"
+			  "\xfc\x0e\xed\xe1\x0d\xb0\xe5\x5e"
+			  "\x5e\x6f\xf5\x7f\xab\x33\x7d\xcd"
+			  "\xf0\x09\x4b\xb2\x11\x37\xdc\x65"
+			  "\x97\x32\x62\x71\x3a\x29\x54\xb9"
+			  "\xc7\xa4\xbf\x75\x0f\xf9\x40\xa9"
+			  "\x8d\xd7\x8b\xa7\xe0\x9a\xbe\x15"
+			  "\xc6\xda\xd8\x00\x14\x69\x1a\xaf"
+			  "\x5f\x79\xc3\xf5\xbb\x6c\x2a\x9d"
+			  "\xdd\x3c\x5f\x97\x21\xe1\x3a\x03"
+			  "\x84\x6a\xe9\x76\x11\x1f\xd3\xd5"
+			  "\xf0\x54\x20\x4d\xc2\x91\xc3\xa4"
+			  "\x36\x25\xbe\x1b\x2a\x06\xb7\xf3"
+			  "\xd1\xd0\x55\x29\x81\x4c\x83\xa3"
+			  "\xa6\x84\x1e\x5c\xd1\xd0\x6c\x90"
+			  "\xa4\x11\xf0\xd7\x63\x6a\x48\x05"
+			  "\xbc\x48\x18\x53\xcd\xb0\x8d\xdb"
+			  "\xdc\xfe\x55\x11\x5c\x51\xb3\xab"
+			  "\xab\x63\x3e\x31\x5a\x8b\x93\x63"
+			  "\x34\xa9\xba\x2b\x69\x1a\xc0\xe3"
+			  "\xcb\x41\xbc\xd7\xf5\x7f\x82\x3e"
+			  "\x01\xa3\x3c\x72\xf4\xfe\xdf\xbe"
+			  "\xb1\x67\x17\x2b\x37\x60\x0d\xca"
+			  "\x6f\xc3\x94\x2c\xd2\x92\x6d\x9d"
+			  "\x75\x18\x77\xaa\x29\x38\x96\xed"
+			  "\x0e\x20\x70\x92\xd5\xd0\xb4\x00"
+			  "\xc0\x31\xf2\xc9\x43\x0e\x75\x1d"
+			  "\x4b\x64\xf2\x1f\xf2\x29\x6c\x7b"
+			  "\x7f\xec\x59\x7d\x8c\x0d\xd4\xd3"
+			  "\xac\x53\x4c\xa3\xde\x42\x92\x95"
+			  "\x6d\xa3\x4f\xd0\xe6\x3d\xe7\xec"
+			  "\x7a\x4d\x68\xf1\xfe\x67\x66\x09"
+			  "\x83\x22\xb1\x98\x43\x8c\xab\xb8"
+			  "\x45\xe6\x6d\xdf\x5e\x50\x71\xce"
+			  "\xf5\x4e\x40\x93\x2b\xfa\x86\x0e"
+			  "\xe8\x30\xbd\x82\xcc\x1c\x9c\x5f"
+			  "\xad\xfd\x08\x31\xbe\x52\xe7\xe6"
+			  "\xf2\x06\x01\x62\x25\x15\x99\x74"
+			  "\x33\x51\x52\x57\x3f\x57\x87\x61"
+			  "\xb9\x7f\x29\x3d\xcd\x92\x5e\xa6"
+			  "\x5c\x3b\xf1\xed\x5f\xeb\x82\xed"
+			  "\x56\x7b\x61\xe7\xfd\x02\x47\x0e"
+			  "\x2a\x15\xa4\xce\x43\x86\x9b\xe1"
+			  "\x2b\x4c\x2a\xd9\x42\x97\xf7\x9a"
+			  "\xe5\x47\x46\x48\xd3\x55\x6f\x4d"
+			  "\xd9\xeb\x4b\xdd\x7b\x21\x2f\xb3"
+			  "\xa8\x36\x28\xdf\xca\xf1\xf6\xd9"
+			  "\x10\xf6\x1c\xfd\x2e\x0c\x27\xe0"
+			  "\x01\xb3\xff\x6d\x47\x08\x4d\xd4"
+			  "\x00\x25\xee\x55\x4a\xe9\xe8\x5b"
+			  "\xd8\xf7\x56\x12\xd4\x50\xb2\xe5"
+			  "\x51\x6f\x34\x63\x69\xd2\x4e\x96"
+			  "\x4e\xbc\x79\xbf\x18\xae\xc6\x13"
+			  "\x80\x92\x77\xb0\xb4\x0f\x29\x94"
+			  "\x6f\x4c\xbb\x53\x11\x36\xc3\x9f"
+			  "\x42\x8e\x96\x8a\x91\xc8\xe9\xfc"
+			  "\xfe\xbf\x7c\x2d\x6f\xf9\xb8\x44"
+			  "\x89\x1b\x09\x53\x0a\x2a\x92\xc3"
+			  "\x54\x7a\x3a\xf9\xe2\xe4\x75\x87"
+			  "\xa0\x5e\x4b\x03\x7a\x0d\x8a\xf4"
+			  "\x55\x59\x94\x2b\x63\x96\x0e\xf5",
+		.psize	= 1040,
+		.digest	= "\xb5\xb9\x08\xb3\x24\x3e\x03\xf0"
+			  "\xd6\x0b\x57\xbc\x0a\x6d\x89\x59",
+	}, {
+		.key	= "\xf6\x34\x42\x71\x35\x52\x8b\x58"
+			  "\x02\x3a\x8e\x4a\x8d\x41\x13\xe9"
+			  "\x7f\xba\xb9\x55\x9d\x73\x4d\xf8"
+			  "\x3f\x5d\x73\x15\xff\xd3\x9e\x7f"
+			  "\x20\x2a\x6a\xa8\xd1\xf0\x8f\x12"
+			  "\x6b\x02\xd8\x6c\xde\xba\x80\x22"
+			  "\x19\x37\xc8\xd0\x4e\x89\x17\x7c"
+			  "\x7c\xdd\x88\xfd\x41\xc0\x04\xb7"
+			  "\x1d\xac\x19\xe3\x20\xc7\x16\xcf"
+			  "\x58\xee\x1d\x7a\x61\x69\xa9\x12"
+			  "\x4b\xef\x4f\xb6\x38\xdd\x78\xf8"
+			  "\x28\xee\x70\x08\xc7\x7c\xcc\xc8"
+			  "\x1e\x41\xf5\x80\x86\x70\xd0\xf0"
+			  "\xa3\x87\x6b\x0a\x00\xd2\x41\x28"
+			  "\x74\x26\xf1\x24\xf3\xd0\x28\x77"
+			  "\xd7\xcd\xf6\x2d\x61\xf4\xa2\x13"
+			  "\x77\xb4\x6f\xa0\xf4\xfb\xd6\xb5"
+			  "\x38\x9d\x5a\x0c\x51\xaf\xad\x63"
+			  "\x27\x67\x8c\x01\xea\x42\x1a\x66"
+			  "\xda\x16\x7c\x3c\x30\x0c\x66\x53"
+			  "\x1c\x88\xa4\x5c\xb2\xe3\x78\x0a"
+			  "\x13\x05\x6d\xe2\xaf\xb3\xe4\x75"
+			  "\x00\x99\x58\xee\x76\x09\x64\xaa"
+			  "\xbb\x2e\xb1\x81\xec\xd8\x0e\xd3"
+			  "\x0c\x33\x5d\xb7\x98\xef\x36\xb6"
+			  "\xd2\x65\x69\x41\x70\x12\xdc\x25"
+			  "\x41\x03\x99\x81\x41\x19\x62\x13"
+			  "\xd1\x0a\x29\xc5\x8c\xe0\x4c\xf3"
+			  "\xd6\xef\x4c\xf4\x1d\x83\x2e\x6d"
+			  "\x8e\x14\x87\xed\x80\xe0\xaa\xd3"
+			  "\x08\x04\x73\x1a\x84\x40\xf5\x64"
+			  "\xbd\x61\x32\x65\x40\x42\xfb\xb0"
+			  "\x40\xf6\x40\x8d\xc7\x7f\x14\xd0"
+			  "\x83\x99\xaa\x36\x7e\x60\xc6\xbf"
+			  "\x13\x8a\xf9\x21\xe4\x7e\x68\x87"
+			  "\xf3\x33\x86\xb4\xe0\x23\x7e\x0a"
+			  "\x21\xb1\xf5\xad\x67\x3c\x9c\x9d"
+			  "\x09\xab\xaf\x5f\xba\xe0\xd0\x82"
+			  "\x48\x22\x70\xb5\x6d\x53\xd6\x0e"
+			  "\xde\x64\x92\x41\xb0\xd3\xfb\xda"
+			  "\x21\xfe\xab\xea\x20\xc4\x03\x58"
+			  "\x18\x2e\x7d\x2f\x03\xa9\x47\x66"
+			  "\xdf\x7b\xa4\x6b\x34\x6b\x55\x9c"
+			  "\x4f\xd7\x9c\x47\xfb\xa9\x42\xec"
+			  "\x5a\x12\xfd\xfe\x76\xa0\x92\x9d"
+			  "\xfe\x1e\x16\xdd\x24\x2a\xe4\x27"
+			  "\xd5\xa9\xf2\x05\x4f\x83\xa2\xaf"
+			  "\xfe\xee\x83\x7a\xad\xde\xdf\x9a"
+			  "\x80\xd5\x81\x14\x93\x16\x7e\x46"
+			  "\x47\xc2\x14\xef\x49\x6e\xb9\xdb"
+			  "\x40\xe8\x06\x6f\x9c\x2a\xfd\x62"
+			  "\x06\x46\xfd\x15\x1d\x36\x61\x6f"
+			  "\x77\x77\x5e\x64\xce\x78\x1b\x85"
+			  "\xbf\x50\x9a\xfd\x67\xa6\x1a\x65"
+			  "\xad\x5b\x33\x30\xf1\x71\xaa\xd9"
+			  "\x23\x0d\x92\x24\x5f\xae\x57\xb0"
+			  "\x24\x37\x0a\x94\x12\xfb\xb5\xb1"
+			  "\xd3\xb8\x1d\x12\x29\xb0\x80\x24"
+			  "\x2d\x47\x9f\x96\x1f\x95\xf1\xb1"
+			  "\xda\x35\xf6\x29\xe0\xe1\x23\x96"
+			  "\xc7\xe8\x22\x9b\x7c\xac\xf9\x41"
+			  "\x39\x01\xe5\x73\x15\x5e\x99\xec"
+			  "\xb4\xc1\xf4\xe7\xa7\x97\x6a\xd5"
+			  "\x90\x9a\xa0\x1d\xf3\x5a\x8b\x5f"
+			  "\xdf\x01\x52\xa4\x93\x31\x97\xb0"
+			  "\x93\x24\xb5\xbc\xb2\x14\x24\x98"
+			  "\x4a\x8f\x19\x85\xc3\x2d\x0f\x74"
+			  "\x9d\x16\x13\x80\x5e\x59\x62\x62"
+			  "\x25\xe0\xd1\x2f\x64\xef\xba\xac"
+			  "\xcd\x09\x07\x15\x8a\xcf\x73\xb5"
+			  "\x8b\xc9\xd8\x24\xb0\x53\xd5\x6f"
+			  "\xe1\x2b\x77\xb1\xc5\xe4\xa7\x0e"
+			  "\x18\x45\xab\x36\x03\x59\xa8\xbd"
+			  "\x43\xf0\xd8\x2c\x1a\x69\x96\xbb"
+			  "\x13\xdf\x6c\x33\x77\xdf\x25\x34"
+			  "\x5b\xa5\x5b\x8c\xf9\x51\x05\xd4"
+			  "\x8b\x8b\x44\x87\x49\xfc\xa0\x8f"
+			  "\x45\x15\x5b\x40\x42\xc4\x09\x92"
+			  "\x98\x0c\x4d\xf4\x26\x37\x1b\x13"
+			  "\x76\x01\x93\x8d\x4f\xe6\xed\x18"
+			  "\xd0\x79\x7b\x3f\x44\x50\xcb\xee"
+			  "\xf7\x4a\xc9\x9e\xe0\x96\x74\xa7"
+			  "\xe6\x93\xb2\x53\xca\x55\xa8\xdc"
+			  "\x1e\x68\x07\x87\xb7\x2e\xc1\x08"
+			  "\xb2\xa4\x5b\xaf\xc6\xdb\x5c\x66"
+			  "\x41\x1c\x51\xd9\xb0\x07\x00\x0d"
+			  "\xf0\x4c\xdc\x93\xde\xa9\x1e\x8e"
+			  "\xd3\x22\x62\xd8\x8b\x88\x2c\xea"
+			  "\x5e\xf1\x6e\x14\x40\xc7\xbe\xaa"
+			  "\x42\x28\xd0\x26\x30\x78\x01\x9b"
+			  "\x83\x07\xbc\x94\xc7\x57\xa2\x9f"
+			  "\x03\x07\xff\x16\xff\x3c\x6e\x48"
+			  "\x0a\xd0\xdd\x4c\xf6\x64\x9a\xf1"
+			  "\xcd\x30\x12\x82\x2c\x38\xd3\x26"
+			  "\x83\xdb\xab\x3e\xc6\xf8\xe6\xfa"
+			  "\x77\x0a\x78\x82\x75\xf8\x63\x51"
+			  "\x59\xd0\x8d\x24\x9f\x25\xe6\xa3"
+			  "\x4c\xbc\x34\xfc\xe3\x10\xc7\x62"
+			  "\xd4\x23\xc8\x3d\xa7\xc6\xa6\x0a"
+			  "\x4f\x7e\x29\x9d\x6d\xbe\xb5\xf1"
+			  "\xdf\xa4\x53\xfa\xc0\x23\x0f\x37"
+			  "\x84\x68\xd0\xb5\xc8\xc6\xae\xf8"
+			  "\xb7\x8d\xb3\x16\xfe\x8f\x87\xad"
+			  "\xd0\xc1\x08\xee\x12\x1c\x9b\x1d"
+			  "\x90\xf8\xd1\x63\xa4\x92\x3c\xf0"
+			  "\xc7\x34\xd8\xf1\x14\xed\xa3\xbc"
+			  "\x17\x7e\xd4\x62\x42\x54\x57\x2c"
+			  "\x3e\x7a\x35\x35\x17\x0f\x0b\x7f"
+			  "\x81\xa1\x3f\xd0\xcd\xc8\x3b\x96"
+			  "\xe9\xe0\x4a\x04\xe1\xb6\x3c\xa1"
+			  "\xd6\xca\xc4\xbd\xb6\xb5\x95\x34"
+			  "\x12\x9d\xc5\x96\xf2\xdf\xba\x54"
+			  "\x76\xd1\xb2\x6b\x3b\x39\xe0\xb9"
+			  "\x18\x62\xfb\xf7\xfc\x12\xf1\x5f"
+			  "\x7e\xc7\xe3\x59\x4c\xa6\xc2\x3d"
+			  "\x40\x15\xf9\xa3\x95\x64\x4c\x74"
+			  "\x8b\x73\x77\x33\x07\xa7\x04\x1d"
+			  "\x33\x5a\x7e\x8f\xbd\x86\x01\x4f"
+			  "\x3e\xb9\x27\x6f\xe2\x41\xf7\x09"
+			  "\x67\xfd\x29\x28\xc5\xe4\xf6\x18"
+			  "\x4c\x1b\x49\xb2\x9c\x5b\xf6\x81"
+			  "\x4f\xbb\x5c\xcc\x0b\xdf\x84\x23"
+			  "\x58\xd6\x28\x34\x93\x3a\x25\x97"
+			  "\xdf\xb2\xc3\x9e\x97\x38\x0b\x7d"
+			  "\x10\xb3\x54\x35\x23\x8c\x64\xee"
+			  "\xf0\xd8\x66\xff\x8b\x22\xd2\x5b"
+			  "\x05\x16\x3c\x89\xf7\xb1\x75\xaf"
+			  "\xc0\xae\x6a\x4f\x3f\xaf\x9a\xf4"
+			  "\xf4\x9a\x24\xd9\x80\x82\xc0\x12"
+			  "\xde\x96\xd1\xbe\x15\x0b\x8d\x6a"
+			  "\xd7\x12\xe4\x85\x9f\x83\xc9\xc3"
+			  "\xff\x0b\xb5\xaf\x3b\xd8\x6d\x67"
+			  "\x81\x45\xe6\xac\xec\xc1\x7b\x16"
+			  "\x18\x0a\xce\x4b\xc0\x2e\x76\xbc"
+			  "\x1b\xfa\xb4\x34\xb8\xfc\x3e\xc8"
+			  "\x5d\x90\x71\x6d\x7a\x79\xef\x06",
+		.ksize	= 1088,
+		.plaintext	= "\xaa\x5d\x54\xcb\xea\x1e\x46\x0f"
+			  "\x45\x87\x70\x51\x8a\x66\x7a\x33"
+			  "\xb4\x18\xff\xa9\x82\xf9\x45\x4b"
+			  "\x93\xae\x2e\x7f\xab\x98\xfe\xbf"
+			  "\x01\xee\xe5\xa0\x37\x8f\x57\xa6"
+			  "\xb0\x76\x0d\xa4\xd6\x28\x2b\x5d"
+			  "\xe1\x03\xd6\x1c\x6f\x34\x0d\xe7"
+			  "\x61\x2d\x2e\xe5\xae\x5d\x47\xc7"
+			  "\x80\x4b\x18\x8f\xa8\x99\xbc\x28"
+			  "\xed\x1d\x9d\x86\x7d\xd7\x41\xd1"
+			  "\xe0\x2b\xe1\x8c\x93\x2a\xa7\x80"
+			  "\xe1\x07\xa0\xa9\x9f\x8c\x8d\x1a"
+			  "\x55\xfc\x6b\x24\x7a\xbd\x3e\x51"
+			  "\x68\x4b\x26\x59\xc8\xa7\x16\xd9"
+			  "\xb9\x61\x13\xde\x8b\x63\x1c\xf6"
+			  "\x60\x01\xfb\x08\xb3\x5b\x0a\xbf"
+			  "\x34\x73\xda\x87\x87\x3d\x6f\x97"
+			  "\x4a\x0c\xa3\x58\x20\xa2\xc0\x81"
+			  "\x5b\x8c\xef\xa9\xc2\x01\x1e\x64"
+			  "\x83\x8c\xbc\x03\xb6\xd0\x29\x9f"
+			  "\x54\xe2\xce\x8b\xc2\x07\x85\x78"
+			  "\x25\x38\x96\x4c\xb4\xbe\x17\x4a"
+			  "\x65\xa6\xfa\x52\x9d\x66\x9d\x65"
+			  "\x4a\xd1\x01\x01\xf0\xcb\x13\xcc"
+			  "\xa5\x82\xf3\xf2\x66\xcd\x3f\x9d"
+			  "\xd1\xaa\xe4\x67\xea\xf2\xad\x88"
+			  "\x56\x76\xa7\x9b\x59\x3c\xb1\x5d"
+			  "\x78\xfd\x69\x79\x74\x78\x43\x26"
+			  "\x7b\xde\x3f\xf1\xf5\x4e\x14\xd9"
+			  "\x15\xf5\x75\xb5\x2e\x19\xf3\x0c"
+			  "\x48\x72\xd6\x71\x6d\x03\x6e\xaa"
+			  "\xa7\x08\xf9\xaa\x70\xa3\x0f\x4d"
+			  "\x12\x8a\xdd\xe3\x39\x73\x7e\xa7"
+			  "\xea\x1f\x6d\x06\x26\x2a\xf2\xc5"
+			  "\x52\xb4\xbf\xfd\x52\x0c\x06\x60"
+			  "\x90\xd1\xb2\x7b\x56\xae\xac\x58"
+			  "\x5a\x6b\x50\x2a\xf5\xe0\x30\x3c"
+			  "\x2a\x98\x0f\x1b\x5b\x0a\x84\x6c"
+			  "\x31\xae\x92\xe2\xd4\xbb\x7f\x59"
+			  "\x26\x10\xb9\x89\x37\x68\x26\xbf"
+			  "\x41\xc8\x49\xc4\x70\x35\x7d\xff"
+			  "\x2d\x7f\xf6\x8a\x93\x68\x8c\x78"
+			  "\x0d\x53\xce\x7d\xff\x7d\xfb\xae"
+			  "\x13\x1b\x75\xc4\x78\xd7\x71\xd8"
+			  "\xea\xd3\xf4\x9d\x95\x64\x8e\xb4"
+			  "\xde\xb8\xe4\xa6\x68\xc8\xae\x73"
+			  "\x58\xaf\xa8\xb0\x5a\x20\xde\x87"
+			  "\x43\xb9\x0f\xe3\xad\x41\x4b\xd5"
+			  "\xb7\xad\x16\x00\xa6\xff\xf6\x74"
+			  "\xbf\x8c\x9f\xb3\x58\x1b\xb6\x55"
+			  "\xa9\x90\x56\x28\xf0\xb5\x13\x4e"
+			  "\x9e\xf7\x25\x86\xe0\x07\x7b\x98"
+			  "\xd8\x60\x5d\x38\x95\x3c\xe4\x22"
+			  "\x16\x2f\xb2\xa2\xaf\xe8\x90\x17"
+			  "\xec\x11\x83\x1a\xf4\xa9\x26\xda"
+			  "\x39\x72\xf5\x94\x61\x05\x51\xec"
+			  "\xa8\x30\x8b\x2c\x13\xd0\x72\xac"
+			  "\xb9\xd2\xa0\x4c\x4b\x78\xe8\x6e"
+			  "\x04\x85\xe9\x04\x49\x82\x91\xff"
+			  "\x89\xe5\xab\x4c\xaa\x37\x03\x12"
+			  "\xca\x8b\x74\x10\xfd\x9e\xd9\x7b"
+			  "\xcb\xdb\x82\x6e\xce\x2e\x33\x39"
+			  "\xce\xd2\x84\x6e\x34\x71\x51\x6e"
+			  "\x0d\xd6\x01\x87\xc7\xfa\x0a\xd3"
+			  "\xad\x36\xf3\x4c\x9f\x96\x5e\x62"
+			  "\x62\x54\xc3\x03\x78\xd6\xab\xdd"
+			  "\x89\x73\x55\x25\x30\xf8\xa7\xe6"
+			  "\x4f\x11\x0c\x7c\x0a\xa1\x2b\x7b"
+			  "\x3d\x0d\xde\x81\xd4\x9d\x0b\xae"
+			  "\xdf\x00\xf9\x4c\xb6\x90\x8e\x16"
+			  "\xcb\x11\xc8\xd1\x2e\x73\x13\x75"
+			  "\x75\x3e\xaa\xf5\xee\x02\xb3\x18"
+			  "\xa6\x2d\xf5\x3b\x51\xd1\x1f\x47"
+			  "\x6b\x2c\xdb\xc4\x10\xe0\xc8\xba"
+			  "\x9d\xac\xb1\x9d\x75\xd5\x41\x0e"
+			  "\x7e\xbe\x18\x5b\xa4\x1f\xf8\x22"
+			  "\x4c\xc1\x68\xda\x6d\x51\x34\x6c"
+			  "\x19\x59\xec\xb5\xb1\xec\xa7\x03"
+			  "\xca\x54\x99\x63\x05\x6c\xb1\xac"
+			  "\x9c\x31\xd6\xdb\xba\x7b\x14\x12"
+			  "\x7a\xc3\x2f\xbf\x8d\xdc\x37\x46"
+			  "\xdb\xd2\xbc\xd4\x2f\xab\x30\xd5"
+			  "\xed\x34\x99\x8e\x83\x3e\xbe\x4c"
+			  "\x86\x79\x58\xe0\x33\x8d\x9a\xb8"
+			  "\xa9\xa6\x90\x46\xa2\x02\xb8\xdd"
+			  "\xf5\xf9\x1a\x5c\x8c\x01\xaa\x6e"
+			  "\xb4\x22\x12\xf5\x0c\x1b\x9b\x7a"
+			  "\xc3\x80\xf3\x06\x00\x5f\x30\xd5"
+			  "\x06\xdb\x7d\x82\xc2\xd4\x0b\x4c"
+			  "\x5f\xe9\xc5\xf5\xdf\x97\x12\xbf"
+			  "\x56\xaf\x9b\x69\xcd\xee\x30\xb4"
+			  "\xa8\x71\xff\x3e\x7d\x73\x7a\xb4"
+			  "\x0d\xa5\x46\x7a\xf3\xf4\x15\x87"
+			  "\x5d\x93\x2b\x8c\x37\x64\xb5\xdd"
+			  "\x48\xd1\xe5\x8c\xae\xd4\xf1\x76"
+			  "\xda\xf4\xba\x9e\x25\x0e\xad\xa3"
+			  "\x0d\x08\x7c\xa8\x82\x16\x8d\x90"
+			  "\x56\x40\x16\x84\xe7\x22\x53\x3a"
+			  "\x58\xbc\xb9\x8f\x33\xc8\xc2\x84"
+			  "\x22\xe6\x0d\xe7\xb3\xdc\x5d\xdf"
+			  "\xd7\x2a\x36\xe4\x16\x06\x07\xd2"
+			  "\x97\x60\xb2\xf5\x5e\x14\xc9\xfd"
+			  "\x8b\x05\xd1\xce\xee\x9a\x65\x99"
+			  "\xb7\xae\x19\xb7\xc8\xbc\xd5\xa2"
+			  "\x7b\x95\xe1\xcc\xba\x0d\xdc\x8a"
+			  "\x1d\x59\x52\x50\xaa\x16\x02\x82"
+			  "\xdf\x61\x33\x2e\x44\xce\x49\xc7"
+			  "\xe5\xc6\x2e\x76\xcf\x80\x52\xf0"
+			  "\x3d\x17\x34\x47\x3f\xd3\x80\x48"
+			  "\xa2\xba\xd5\xc7\x7b\x02\x28\xdb"
+			  "\xac\x44\xc7\x6e\x05\x5c\xc2\x79"
+			  "\xb3\x7d\x6a\x47\x77\x66\xf1\x38"
+			  "\xf0\xf5\x4f\x27\x1a\x31\xca\x6c"
+			  "\x72\x95\x92\x8e\x3f\xb0\xec\x1d"
+			  "\xc7\x2a\xff\x73\xee\xdf\x55\x80"
+			  "\x93\xd2\xbd\x34\xd3\x9f\x00\x51"
+			  "\xfb\x2e\x41\xba\x6c\x5a\x7c\x17"
+			  "\x7f\xe6\x70\xac\x8d\x39\x3f\x77"
+			  "\xe2\x23\xac\x8f\x72\x4e\xe4\x53"
+			  "\xcc\xf1\x1b\xf1\x35\xfe\x52\xa4"
+			  "\xd6\xb8\x40\x6b\xc1\xfd\xa0\xa1"
+			  "\xf5\x46\x65\xc2\x50\xbb\x43\xe2"
+			  "\xd1\x43\x28\x34\x74\xf5\x87\xa0"
+			  "\xf2\x5e\x27\x3b\x59\x2b\x3e\x49"
+			  "\xdf\x46\xee\xaf\x71\xd7\x32\x36"
+			  "\xc7\x14\x0b\x58\x6e\x3e\x2d\x41"
+			  "\xfa\x75\x66\x3a\x54\xe0\xb2\xb9"
+			  "\xaf\xdd\x04\x80\x15\x19\x3f\x6f"
+			  "\xce\x12\xb4\xd8\xe8\x89\x3c\x05"
+			  "\x30\xeb\xf3\x3d\xcd\x27\xec\xdc"
+			  "\x56\x70\x12\xcf\x78\x2b\x77\xbf"
+			  "\x22\xf0\x1b\x17\x9c\xcc\xd6\x1b"
+			  "\x2d\x3d\xa0\x3b\xd8\xc9\x70\xa4"
+			  "\x7a\x3e\x07\xb9\x06\xc3\xfa\xb0"
+			  "\x33\xee\xc1\xd8\xf6\xe0\xf0\xb2"
+			  "\x61\x12\x69\xb0\x5f\x28\x99\xda"
+			  "\xc3\x61\x48\xfa\x07\x16\x03\xc4"
+			  "\xa8\xe1\x3c\xe8\x0e\x64\x15\x30"
+			  "\xc1\x9d\x84\x2f\x73\x98\x0e\x3a"
+			  "\xf2\x86\x21\xa4\x9e\x1d\xb5\x86"
+			  "\x16\xdb\x2b\x9a\x06\x64\x8e\x79"
+			  "\x8d\x76\x3e\xc3\xc2\x64\x44\xe3"
+			  "\xda\xbc\x1a\x52\xd7\x61\x03\x65"
+			  "\x54\x32\x77\x01\xed\x9d\x8a\x43"
+			  "\x25\x24\xe3\xc1\xbe\xb8\x2f\xcb"
+			  "\x89\x14\x64\xab\xf6\xa0\x6e\x02"
+			  "\x57\xe4\x7d\xa9\x4e\x9a\x03\x36"
+			  "\xad\xf1\xb1\xfc\x0b\xe6\x79\x51"
+			  "\x9f\x81\x77\xc4\x14\x78\x9d\xbf"
+			  "\xb6\xd6\xa3\x8c\xba\x0b\x26\xe7"
+			  "\xc8\xb9\x5c\xcc\xe1\x5f\xd5\xc6"
+			  "\xc4\xca\xc2\xa3\x45\xba\x94\x13"
+			  "\xb2\x8f\xc3\x54\x01\x09\xe7\x8b"
+			  "\xda\x2a\x0a\x11\x02\x43\xcb\x57"
+			  "\xc9\xcc\xb5\x5c\xab\xc4\xec\x54"
+			  "\x00\x06\x34\xe1\x6e\x03\x89\x7c"
+			  "\xc6\xfb\x6a\xc7\x60\x43\xd6\xc5"
+			  "\xb5\x68\x72\x89\x8f\x42\xc3\x74"
+			  "\xbd\x25\xaa\x9f\x67\xb5\xdf\x26"
+			  "\x20\xe8\xb7\x01\x3c\xe4\x77\xce"
+			  "\xc4\x65\xa7\x23\x79\xea\x33\xc7"
+			  "\x82\x14\x5c\x82\xf2\x4e\x3d\xf6"
+			  "\xc6\x4a\x0e\x29\xbb\xec\x44\xcd"
+			  "\x2f\xd1\x4f\x21\x71\xa9\xce\x0f"
+			  "\x5c\xf2\x72\x5c\x08\x2e\x21\xd2"
+			  "\xc3\x29\x13\xd8\xac\xc3\xda\x13"
+			  "\x1a\x9d\xa7\x71\x1d\x27\x1d\x27"
+			  "\x1d\xea\xab\x44\x79\xad\xe5\xeb"
+			  "\xef\x1f\x22\x0a\x44\x4f\xcb\x87"
+			  "\xa7\x58\x71\x0e\x66\xf8\x60\xbf"
+			  "\x60\x74\x4a\xb4\xec\x2e\xfe\xd3"
+			  "\xf5\xb8\xfe\x46\x08\x50\x99\x6c"
+			  "\x66\xa5\xa8\x34\x44\xb5\xe5\xf0"
+			  "\xdd\x2c\x67\x4e\x35\x96\x8e\x67"
+			  "\x48\x3f\x5f\x37\x44\x60\x51\x2e"
+			  "\x14\x91\x5e\x57\xc3\x0e\x79\x77"
+			  "\x2f\x03\xf4\xe2\x1c\x72\xbf\x85"
+			  "\x5d\xd3\x17\xdf\x6c\xc5\x70\x24"
+			  "\x42\xdf\x51\x4e\x2a\xb2\xd2\x5b"
+			  "\x9e\x69\x83\x41\x11\xfe\x73\x22"
+			  "\xde\x8a\x9e\xd8\x8a\xfb\x20\x38"
+			  "\xd8\x47\x6f\xd5\xed\x8f\x41\xfd"
+			  "\x13\x7a\x18\x03\x7d\x0f\xcd\x7d"
+			  "\xa6\x7d\x31\x9e\xf1\x8f\x30\xa3"
+			  "\x8b\x4c\x24\xb7\xf5\x48\xd7\xd9"
+			  "\x12\xe7\x84\x97\x5c\x31\x6d\xfb"
+			  "\xdf\xf3\xd3\xd1\xd5\x0c\x30\x06"
+			  "\x01\x6a\xbc\x6c\x78\x7b\xa6\x50"
+			  "\xfa\x0f\x3c\x42\x2d\xa5\xa3\x3b"
+			  "\xcf\x62\x50\xff\x71\x6d\xe7\xda"
+			  "\x27\xab\xc6\x67\x16\x65\x68\x64"
+			  "\xc7\xd5\x5f\x81\xa9\xf6\x65\xb3"
+			  "\x5e\x43\x91\x16\xcd\x3d\x55\x37"
+			  "\x55\xb3\xf0\x28\xc5\x54\x19\xc0"
+			  "\xe0\xd6\x2a\x61\xd4\xc8\x72\x51"
+			  "\xe9\xa1\x7b\x48\x21\xad\x44\x09"
+			  "\xe4\x01\x61\x3c\x8a\x5b\xf9\xa1"
+			  "\x6e\x1b\xdf\xc0\x04\xa8\x8b\xf2"
+			  "\x21\xbe\x34\x7b\xfc\xa1\xcd\xc9"
+			  "\xa9\x96\xf4\xa4\x4c\xf7\x4e\x8f"
+			  "\x84\xcc\xd3\xa8\x92\x77\x8f\x36"
+			  "\xe2\x2e\x8c\x33\xe8\x84\xa6\x0c"
+			  "\x6c\x8a\xda\x14\x32\xc2\x96\xff"
+			  "\xc6\x4a\xc2\x9b\x30\x7f\xd1\x29"
+			  "\xc0\xd5\x78\x41\x00\x80\x80\x03"
+			  "\x2a\xb1\xde\x26\x03\x48\x49\xee"
+			  "\x57\x14\x76\x51\x3c\x36\x5d\x0a"
+			  "\x5c\x9f\xe8\xd8\x53\xdb\x4f\xd4"
+			  "\x38\xbf\x66\xc9\x75\x12\x18\x75"
+			  "\x34\x2d\x93\x22\x96\x51\x24\x6e"
+			  "\x4e\xd9\x30\xea\x67\xff\x92\x1c"
+			  "\x16\x26\xe9\xb5\x33\xab\x8c\x22"
+			  "\x47\xdb\xa0\x2c\x08\xf0\x12\x69"
+			  "\x7e\x93\x52\xda\xa5\xe5\xca\xc1"
+			  "\x0f\x55\x2a\xbd\x09\x30\x88\x1b"
+			  "\x9c\xc6\x9f\xe6\xdb\xa6\x92\xeb"
+			  "\xf4\xbd\x5c\xc4\xdb\xc6\x71\x09"
+			  "\xab\x5e\x48\x0c\xed\x6f\xda\x8e"
+			  "\x8d\x0c\x98\x71\x7d\x10\xd0\x9c"
+			  "\x20\x9b\x79\x53\x26\x5d\xb9\x85"
+			  "\x8a\x31\xb8\xc5\x1c\x97\xde\x88"
+			  "\x61\x55\x7f\x7c\x21\x06\xea\xc4"
+			  "\x5f\xaf\xf2\xf0\xd5\x5e\x7d\xb4"
+			  "\x6e\xcf\xe9\xae\x1b\x0e\x11\x80"
+			  "\xc1\x9a\x74\x7e\x52\x6f\xa0\xb7"
+			  "\x24\xcd\x8d\x0a\x11\x40\x63\x72"
+			  "\xfa\xe2\xc5\xb3\x94\xef\x29\xa2"
+			  "\x1a\x23\x43\x04\x37\x55\x0d\xe9"
+			  "\x83\xb2\x29\x51\x49\x64\xa0\xbd"
+			  "\xde\x73\xfd\xa5\x7c\x95\x70\x62"
+			  "\x58\xdc\xe2\xd0\xbf\x98\xf5\x8a"
+			  "\x6a\xfd\xce\xa8\x0e\x42\x2a\xeb"
+			  "\xd2\xff\x83\x27\x53\x5c\xa0\x6e"
+			  "\x93\xef\xe2\xb9\x5d\x35\xd6\x98"
+			  "\xf6\x71\x19\x7a\x54\xa1\xa7\xe8"
+			  "\x09\xfe\xf6\x9e\xc7\xbd\x3e\x29"
+			  "\xbd\x6b\x17\xf4\xe7\x3e\x10\x5c"
+			  "\xc1\xd2\x59\x4f\x4b\x12\x1a\x5b"
+			  "\x50\x80\x59\xb9\xec\x13\x66\xa8"
+			  "\xd2\x31\x7b\x6a\x61\x22\xdd\x7d"
+			  "\x61\xee\x87\x16\x46\x9f\xf9\xc7"
+			  "\x41\xee\x74\xf8\xd0\x96\x2c\x76"
+			  "\x2a\xac\x7d\x6e\x9f\x0e\x7f\x95"
+			  "\xfe\x50\x16\xb2\x23\xca\x62\xd5"
+			  "\x68\xcf\x07\x3f\x3f\x97\x85\x2a"
+			  "\x0c\x25\x45\xba\xdb\x32\xcb\x83"
+			  "\x8c\x4f\xe0\x6d\x9a\x99\xf9\xc9"
+			  "\xda\xd4\x19\x31\xc1\x7c\x6d\xd9"
+			  "\x9c\x56\xd3\xec\xc1\x81\x4c\xed"
+			  "\x28\x9d\x87\xeb\x19\xd7\x1a\x4f"
+			  "\x04\x6a\xcb\x1f\xcf\x1f\xa2\x16"
+			  "\xfc\x2a\x0d\xa1\x14\x2d\xfa\xc5"
+			  "\x5a\xd2\xc5\xf9\x19\x7c\x20\x1f"
+			  "\x2d\x10\xc0\x66\x7c\xd9\x2d\xe5"
+			  "\x88\x70\x59\xa7\x85\xd5\x2e\x7c"
+			  "\x5c\xe3\xb7\x12\xd6\x97\x3f\x29",
+		.psize	= 2048,
+		.digest	= "\x37\x90\x92\xc2\xeb\x01\x87\xd9"
+			  "\x95\xc7\x91\xc3\x17\x8b\x38\x52",
+	}
+};
+
 /*
  * DES test vectors.
  */
@@ -11449,6 +12797,82 @@ static const struct cipher_testvec aes_cbc_tv_template[] = {
 	},
 };
 
+static const struct cipher_testvec aes_cfb_tv_template[] = {
+	{ /* From NIST SP800-38A */
+		.key	= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+			  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.klen	= 16,
+		.iv	= "\x00\x01\x02\x03\x04\x05\x06\x07"
+			  "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+		.ptext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+			  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+			  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+			  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+			  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+			  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.ctext	= "\x3b\x3f\xd9\x2e\xb7\x2d\xad\x20"
+			  "\x33\x34\x49\xf8\xe8\x3c\xfb\x4a"
+			  "\xc8\xa6\x45\x37\xa0\xb3\xa9\x3f"
+			  "\xcd\xe3\xcd\xad\x9f\x1c\xe5\x8b"
+			  "\x26\x75\x1f\x67\xa3\xcb\xb1\x40"
+			  "\xb1\x80\x8c\xf1\x87\xa4\xf4\xdf"
+			  "\xc0\x4b\x05\x35\x7c\x5d\x1c\x0e"
+			  "\xea\xc4\xc6\x6f\x9f\xf7\xf2\xe6",
+		.len	= 64,
+	}, {
+		.key	= "\x8e\x73\xb0\xf7\xda\x0e\x64\x52"
+			  "\xc8\x10\xf3\x2b\x80\x90\x79\xe5"
+			  "\x62\xf8\xea\xd2\x52\x2c\x6b\x7b",
+		.klen	= 24,
+		.iv	= "\x00\x01\x02\x03\x04\x05\x06\x07"
+			  "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+		.ptext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+			  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+			  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+			  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+			  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+			  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.ctext	= "\xcd\xc8\x0d\x6f\xdd\xf1\x8c\xab"
+			  "\x34\xc2\x59\x09\xc9\x9a\x41\x74"
+			  "\x67\xce\x7f\x7f\x81\x17\x36\x21"
+			  "\x96\x1a\x2b\x70\x17\x1d\x3d\x7a"
+			  "\x2e\x1e\x8a\x1d\xd5\x9b\x88\xb1"
+			  "\xc8\xe6\x0f\xed\x1e\xfa\xc4\xc9"
+			  "\xc0\x5f\x9f\x9c\xa9\x83\x4f\xa0"
+			  "\x42\xae\x8f\xba\x58\x4b\x09\xff",
+		.len	= 64,
+	}, {
+		.key	= "\x60\x3d\xeb\x10\x15\xca\x71\xbe"
+			  "\x2b\x73\xae\xf0\x85\x7d\x77\x81"
+			  "\x1f\x35\x2c\x07\x3b\x61\x08\xd7"
+			  "\x2d\x98\x10\xa3\x09\x14\xdf\xf4",
+		.klen	= 32,
+		.iv	= "\x00\x01\x02\x03\x04\x05\x06\x07"
+			  "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+		.ptext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+			  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+			  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+			  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+			  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+			  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.ctext	= "\xdc\x7e\x84\xbf\xda\x79\x16\x4b"
+			  "\x7e\xcd\x84\x86\x98\x5d\x38\x60"
+			  "\x39\xff\xed\x14\x3b\x28\xb1\xc8"
+			  "\x32\x11\x3c\x63\x31\xe5\x40\x7b"
+			  "\xdf\x10\x13\x24\x15\xe5\x4b\x92"
+			  "\xa1\x3e\xd0\xa8\x26\x7a\xe2\xf9"
+			  "\x75\xa3\x85\x74\x1a\xb9\xce\xf8"
+			  "\x20\x31\x62\x3d\x55\xb1\xe4\x71",
+		.len	= 64,
+	},
+};
+
 static const struct aead_testvec hmac_md5_ecb_cipher_null_enc_tv_template[] = {
 	{ /* Input data from RFC 2410 Case 1 */
 #ifdef __LITTLE_ENDIAN
@@ -30802,6 +32226,1794 @@ static const struct cipher_testvec chacha20_tv_template[] = {
 	},
 };
 
+static const struct cipher_testvec xchacha20_tv_template[] = {
+	{ /* from libsodium test/default/xchacha20.c */
+		.key	= "\x79\xc9\x97\x98\xac\x67\x30\x0b"
+			  "\xbb\x27\x04\xc9\x5c\x34\x1e\x32"
+			  "\x45\xf3\xdc\xb2\x17\x61\xb9\x8e"
+			  "\x52\xff\x45\xb2\x4f\x30\x4f\xc4",
+		.klen	= 32,
+		.iv	= "\xb3\x3f\xfd\x30\x96\x47\x9b\xcf"
+			  "\xbc\x9a\xee\x49\x41\x76\x88\xa0"
+			  "\xa2\x55\x4f\x8d\x95\x38\x94\x19"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00",
+		.ctext	= "\xc6\xe9\x75\x81\x60\x08\x3a\xc6"
+			  "\x04\xef\x90\xe7\x12\xce\x6e\x75"
+			  "\xd7\x79\x75\x90\x74\x4e\x0c\xf0"
+			  "\x60\xf0\x13\x73\x9c",
+		.len	= 29,
+	}, { /* from libsodium test/default/xchacha20.c */
+		.key	= "\x9d\x23\xbd\x41\x49\xcb\x97\x9c"
+			  "\xcf\x3c\x5c\x94\xdd\x21\x7e\x98"
+			  "\x08\xcb\x0e\x50\xcd\x0f\x67\x81"
+			  "\x22\x35\xea\xaf\x60\x1d\x62\x32",
+		.klen	= 32,
+		.iv	= "\xc0\x47\x54\x82\x66\xb7\xc3\x70"
+			  "\xd3\x35\x66\xa2\x42\x5c\xbf\x30"
+			  "\xd8\x2d\x1e\xaf\x52\x94\x10\x9e"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00",
+		.ctext	= "\xa2\x12\x09\x09\x65\x94\xde\x8c"
+			  "\x56\x67\xb1\xd1\x3a\xd9\x3f\x74"
+			  "\x41\x06\xd0\x54\xdf\x21\x0e\x47"
+			  "\x82\xcd\x39\x6f\xec\x69\x2d\x35"
+			  "\x15\xa2\x0b\xf3\x51\xee\xc0\x11"
+			  "\xa9\x2c\x36\x78\x88\xbc\x46\x4c"
+			  "\x32\xf0\x80\x7a\xcd\x6c\x20\x3a"
+			  "\x24\x7e\x0d\xb8\x54\x14\x84\x68"
+			  "\xe9\xf9\x6b\xee\x4c\xf7\x18\xd6"
+			  "\x8d\x5f\x63\x7c\xbd\x5a\x37\x64"
+			  "\x57\x78\x8e\x6f\xae\x90\xfc\x31"
+			  "\x09\x7c\xfc",
+		.len	= 91,
+	}, { /* Taken from the ChaCha20 test vectors, appended 12 random bytes
+		to the nonce, zero-padded the stream position from 4 to 8 bytes,
+		and recomputed the ciphertext using libsodium's XChaCha20 */
+		.key	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.klen	= 32,
+		.iv	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x67\xc6\x69\x73"
+			  "\x51\xff\x4a\xec\x29\xcd\xba\xab"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ctext	= "\x9c\x49\x2a\xe7\x8a\x2f\x93\xc7"
+			  "\xb3\x33\x6f\x82\x17\xd8\xc4\x1e"
+			  "\xad\x80\x11\x11\x1d\x4c\x16\x18"
+			  "\x07\x73\x9b\x4f\xdb\x7c\xcb\x47"
+			  "\xfd\xef\x59\x74\xfa\x3f\xe5\x4c"
+			  "\x9b\xd0\xea\xbc\xba\x56\xad\x32"
+			  "\x03\xdc\xf8\x2b\xc1\xe1\x75\x67"
+			  "\x23\x7b\xe6\xfc\xd4\x03\x86\x54",
+		.len	= 64,
+	}, { /* Derived from a ChaCha20 test vector, via the process above */
+		.key	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x01",
+		.klen	= 32,
+		.iv	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x02\xf2\xfb\xe3\x46"
+			  "\x7c\xc2\x54\xf8\x1b\xe8\xe7\x8d"
+			  "\x01\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x41\x6e\x79\x20\x73\x75\x62\x6d"
+			  "\x69\x73\x73\x69\x6f\x6e\x20\x74"
+			  "\x6f\x20\x74\x68\x65\x20\x49\x45"
+			  "\x54\x46\x20\x69\x6e\x74\x65\x6e"
+			  "\x64\x65\x64\x20\x62\x79\x20\x74"
+			  "\x68\x65\x20\x43\x6f\x6e\x74\x72"
+			  "\x69\x62\x75\x74\x6f\x72\x20\x66"
+			  "\x6f\x72\x20\x70\x75\x62\x6c\x69"
+			  "\x63\x61\x74\x69\x6f\x6e\x20\x61"
+			  "\x73\x20\x61\x6c\x6c\x20\x6f\x72"
+			  "\x20\x70\x61\x72\x74\x20\x6f\x66"
+			  "\x20\x61\x6e\x20\x49\x45\x54\x46"
+			  "\x20\x49\x6e\x74\x65\x72\x6e\x65"
+			  "\x74\x2d\x44\x72\x61\x66\x74\x20"
+			  "\x6f\x72\x20\x52\x46\x43\x20\x61"
+			  "\x6e\x64\x20\x61\x6e\x79\x20\x73"
+			  "\x74\x61\x74\x65\x6d\x65\x6e\x74"
+			  "\x20\x6d\x61\x64\x65\x20\x77\x69"
+			  "\x74\x68\x69\x6e\x20\x74\x68\x65"
+			  "\x20\x63\x6f\x6e\x74\x65\x78\x74"
+			  "\x20\x6f\x66\x20\x61\x6e\x20\x49"
+			  "\x45\x54\x46\x20\x61\x63\x74\x69"
+			  "\x76\x69\x74\x79\x20\x69\x73\x20"
+			  "\x63\x6f\x6e\x73\x69\x64\x65\x72"
+			  "\x65\x64\x20\x61\x6e\x20\x22\x49"
+			  "\x45\x54\x46\x20\x43\x6f\x6e\x74"
+			  "\x72\x69\x62\x75\x74\x69\x6f\x6e"
+			  "\x22\x2e\x20\x53\x75\x63\x68\x20"
+			  "\x73\x74\x61\x74\x65\x6d\x65\x6e"
+			  "\x74\x73\x20\x69\x6e\x63\x6c\x75"
+			  "\x64\x65\x20\x6f\x72\x61\x6c\x20"
+			  "\x73\x74\x61\x74\x65\x6d\x65\x6e"
+			  "\x74\x73\x20\x69\x6e\x20\x49\x45"
+			  "\x54\x46\x20\x73\x65\x73\x73\x69"
+			  "\x6f\x6e\x73\x2c\x20\x61\x73\x20"
+			  "\x77\x65\x6c\x6c\x20\x61\x73\x20"
+			  "\x77\x72\x69\x74\x74\x65\x6e\x20"
+			  "\x61\x6e\x64\x20\x65\x6c\x65\x63"
+			  "\x74\x72\x6f\x6e\x69\x63\x20\x63"
+			  "\x6f\x6d\x6d\x75\x6e\x69\x63\x61"
+			  "\x74\x69\x6f\x6e\x73\x20\x6d\x61"
+			  "\x64\x65\x20\x61\x74\x20\x61\x6e"
+			  "\x79\x20\x74\x69\x6d\x65\x20\x6f"
+			  "\x72\x20\x70\x6c\x61\x63\x65\x2c"
+			  "\x20\x77\x68\x69\x63\x68\x20\x61"
+			  "\x72\x65\x20\x61\x64\x64\x72\x65"
+			  "\x73\x73\x65\x64\x20\x74\x6f",
+		.ctext	= "\xf9\xab\x7a\x4a\x60\xb8\x5f\xa0"
+			  "\x50\xbb\x57\xce\xef\x8c\xc1\xd9"
+			  "\x24\x15\xb3\x67\x5e\x7f\x01\xf6"
+			  "\x1c\x22\xf6\xe5\x71\xb1\x43\x64"
+			  "\x63\x05\xd5\xfc\x5c\x3d\xc0\x0e"
+			  "\x23\xef\xd3\x3b\xd9\xdc\x7f\xa8"
+			  "\x58\x26\xb3\xd0\xc2\xd5\x04\x3f"
+			  "\x0a\x0e\x8f\x17\xe4\xcd\xf7\x2a"
+			  "\xb4\x2c\x09\xe4\x47\xec\x8b\xfb"
+			  "\x59\x37\x7a\xa1\xd0\x04\x7e\xaa"
+			  "\xf1\x98\x5f\x24\x3d\x72\x9a\x43"
+			  "\xa4\x36\x51\x92\x22\x87\xff\x26"
+			  "\xce\x9d\xeb\x59\x78\x84\x5e\x74"
+			  "\x97\x2e\x63\xc0\xef\x29\xf7\x8a"
+			  "\xb9\xee\x35\x08\x77\x6a\x35\x9a"
+			  "\x3e\xe6\x4f\x06\x03\x74\x1b\xc1"
+			  "\x5b\xb3\x0b\x89\x11\x07\xd3\xb7"
+			  "\x53\xd6\x25\x04\xd9\x35\xb4\x5d"
+			  "\x4c\x33\x5a\xc2\x42\x4c\xe6\xa4"
+			  "\x97\x6e\x0e\xd2\xb2\x8b\x2f\x7f"
+			  "\x28\xe5\x9f\xac\x4b\x2e\x02\xab"
+			  "\x85\xfa\xa9\x0d\x7c\x2d\x10\xe6"
+			  "\x91\xab\x55\x63\xf0\xde\x3a\x94"
+			  "\x25\x08\x10\x03\xc2\x68\xd1\xf4"
+			  "\xaf\x7d\x9c\x99\xf7\x86\x96\x30"
+			  "\x60\xfc\x0b\xe6\xa8\x80\x15\xb0"
+			  "\x81\xb1\x0c\xbe\xb9\x12\x18\x25"
+			  "\xe9\x0e\xb1\xe7\x23\xb2\xef\x4a"
+			  "\x22\x8f\xc5\x61\x89\xd4\xe7\x0c"
+			  "\x64\x36\x35\x61\xb6\x34\x60\xf7"
+			  "\x7b\x61\x37\x37\x12\x10\xa2\xf6"
+			  "\x7e\xdb\x7f\x39\x3f\xb6\x8e\x89"
+			  "\x9e\xf3\xfe\x13\x98\xbb\x66\x5a"
+			  "\xec\xea\xab\x3f\x9c\x87\xc4\x8c"
+			  "\x8a\x04\x18\x49\xfc\x77\x11\x50"
+			  "\x16\xe6\x71\x2b\xee\xc0\x9c\xb6"
+			  "\x87\xfd\x80\xff\x0b\x1d\x73\x38"
+			  "\xa4\x1d\x6f\xae\xe4\x12\xd7\x93"
+			  "\x9d\xcd\x38\x26\x09\x40\x52\xcd"
+			  "\x67\x01\x67\x26\xe0\x3e\x98\xa8"
+			  "\xe8\x1a\x13\x41\xbb\x90\x4d\x87"
+			  "\xbb\x42\x82\x39\xce\x3a\xd0\x18"
+			  "\x6d\x7b\x71\x8f\xbb\x2c\x6a\xd1"
+			  "\xbd\xf5\xc7\x8a\x7e\xe1\x1e\x0f"
+			  "\x0d\x0d\x13\x7c\xd9\xd8\x3c\x91"
+			  "\xab\xff\x1f\x12\xc3\xee\xe5\x65"
+			  "\x12\x8d\x7b\x61\xe5\x1f\x98",
+		.len	= 375,
+		.also_non_np = 1,
+		.np	= 3,
+		.tap	= { 375 - 20, 4, 16 },
+
+	}, { /* Derived from a ChaCha20 test vector, via the process above */
+		.key	= "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a"
+			  "\xf3\x33\x88\x86\x04\xf6\xb5\xf0"
+			  "\x47\x39\x17\xc1\x40\x2b\x80\x09"
+			  "\x9d\xca\x5c\xbc\x20\x70\x75\xc0",
+		.klen	= 32,
+		.iv	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x02\x76\x5a\x2e\x63"
+			  "\x33\x9f\xc9\x9a\x66\x32\x0d\xb7"
+			  "\x2a\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x27\x54\x77\x61\x73\x20\x62\x72"
+			  "\x69\x6c\x6c\x69\x67\x2c\x20\x61"
+			  "\x6e\x64\x20\x74\x68\x65\x20\x73"
+			  "\x6c\x69\x74\x68\x79\x20\x74\x6f"
+			  "\x76\x65\x73\x0a\x44\x69\x64\x20"
+			  "\x67\x79\x72\x65\x20\x61\x6e\x64"
+			  "\x20\x67\x69\x6d\x62\x6c\x65\x20"
+			  "\x69\x6e\x20\x74\x68\x65\x20\x77"
+			  "\x61\x62\x65\x3a\x0a\x41\x6c\x6c"
+			  "\x20\x6d\x69\x6d\x73\x79\x20\x77"
+			  "\x65\x72\x65\x20\x74\x68\x65\x20"
+			  "\x62\x6f\x72\x6f\x67\x6f\x76\x65"
+			  "\x73\x2c\x0a\x41\x6e\x64\x20\x74"
+			  "\x68\x65\x20\x6d\x6f\x6d\x65\x20"
+			  "\x72\x61\x74\x68\x73\x20\x6f\x75"
+			  "\x74\x67\x72\x61\x62\x65\x2e",
+		.ctext	= "\x95\xb9\x51\xe7\x8f\xb4\xa4\x03"
+			  "\xca\x37\xcc\xde\x60\x1d\x8c\xe2"
+			  "\xf1\xbb\x8a\x13\x7f\x61\x85\xcc"
+			  "\xad\xf4\xf0\xdc\x86\xa6\x1e\x10"
+			  "\xbc\x8e\xcb\x38\x2b\xa5\xc8\x8f"
+			  "\xaa\x03\x3d\x53\x4a\x42\xb1\x33"
+			  "\xfc\xd3\xef\xf0\x8e\x7e\x10\x9c"
+			  "\x6f\x12\x5e\xd4\x96\xfe\x5b\x08"
+			  "\xb6\x48\xf0\x14\x74\x51\x18\x7c"
+			  "\x07\x92\xfc\xac\x9d\xf1\x94\xc0"
+			  "\xc1\x9d\xc5\x19\x43\x1f\x1d\xbb"
+			  "\x07\xf0\x1b\x14\x25\x45\xbb\xcb"
+			  "\x5c\xe2\x8b\x28\xf3\xcf\x47\x29"
+			  "\x27\x79\x67\x24\xa6\x87\xc2\x11"
+			  "\x65\x03\xfa\x45\xf7\x9e\x53\x7a"
+			  "\x99\xf1\x82\x25\x4f\x8d\x07",
+		.len	= 127,
+	}, { /* Derived from a ChaCha20 test vector, via the process above */
+		.key	= "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a"
+			  "\xf3\x33\x88\x86\x04\xf6\xb5\xf0"
+			  "\x47\x39\x17\xc1\x40\x2b\x80\x09"
+			  "\x9d\xca\x5c\xbc\x20\x70\x75\xc0",
+		.klen	= 32,
+		.iv	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x01\x31\x58\xa3\x5a"
+			  "\x25\x5d\x05\x17\x58\xe9\x5e\xd4"
+			  "\x1c\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x49\xee\xe0\xdc\x24\x90\x40\xcd"
+			  "\xc5\x40\x8f\x47\x05\xbc\xdd\x81"
+			  "\x47\xc6\x8d\xe6\xb1\x8f\xd7\xcb"
+			  "\x09\x0e\x6e\x22\x48\x1f\xbf\xb8"
+			  "\x5c\xf7\x1e\x8a\xc1\x23\xf2\xd4"
+			  "\x19\x4b\x01\x0f\x4e\xa4\x43\xce"
+			  "\x01\xc6\x67\xda\x03\x91\x18\x90"
+			  "\xa5\xa4\x8e\x45\x03\xb3\x2d\xac"
+			  "\x74\x92\xd3\x53\x47\xc8\xdd\x25"
+			  "\x53\x6c\x02\x03\x87\x0d\x11\x0c"
+			  "\x58\xe3\x12\x18\xfd\x2a\x5b\x40"
+			  "\x0c\x30\xf0\xb8\x3f\x43\xce\xae"
+			  "\x65\x3a\x7d\x7c\xf4\x54\xaa\xcc"
+			  "\x33\x97\xc3\x77\xba\xc5\x70\xde"
+			  "\xd7\xd5\x13\xa5\x65\xc4\x5f\x0f"
+			  "\x46\x1a\x0d\x97\xb5\xf3\xbb\x3c"
+			  "\x84\x0f\x2b\xc5\xaa\xea\xf2\x6c"
+			  "\xc9\xb5\x0c\xee\x15\xf3\x7d\xbe"
+			  "\x9f\x7b\x5a\xa6\xae\x4f\x83\xb6"
+			  "\x79\x49\x41\xf4\x58\x18\xcb\x86"
+			  "\x7f\x30\x0e\xf8\x7d\x44\x36\xea"
+			  "\x75\xeb\x88\x84\x40\x3c\xad\x4f"
+			  "\x6f\x31\x6b\xaa\x5d\xe5\xa5\xc5"
+			  "\x21\x66\xe9\xa7\xe3\xb2\x15\x88"
+			  "\x78\xf6\x79\xa1\x59\x47\x12\x4e"
+			  "\x9f\x9f\x64\x1a\xa0\x22\x5b\x08"
+			  "\xbe\x7c\x36\xc2\x2b\x66\x33\x1b"
+			  "\xdd\x60\x71\xf7\x47\x8c\x61\xc3"
+			  "\xda\x8a\x78\x1e\x16\xfa\x1e\x86"
+			  "\x81\xa6\x17\x2a\xa7\xb5\xc2\xe7"
+			  "\xa4\xc7\x42\xf1\xcf\x6a\xca\xb4"
+			  "\x45\xcf\xf3\x93\xf0\xe7\xea\xf6"
+			  "\xf4\xe6\x33\x43\x84\x93\xa5\x67"
+			  "\x9b\x16\x58\x58\x80\x0f\x2b\x5c"
+			  "\x24\x74\x75\x7f\x95\x81\xb7\x30"
+			  "\x7a\x33\xa7\xf7\x94\x87\x32\x27"
+			  "\x10\x5d\x14\x4c\x43\x29\xdd\x26"
+			  "\xbd\x3e\x3c\x0e\xfe\x0e\xa5\x10"
+			  "\xea\x6b\x64\xfd\x73\xc6\xed\xec"
+			  "\xa8\xc9\xbf\xb3\xba\x0b\x4d\x07"
+			  "\x70\xfc\x16\xfd\x79\x1e\xd7\xc5"
+			  "\x49\x4e\x1c\x8b\x8d\x79\x1b\xb1"
+			  "\xec\xca\x60\x09\x4c\x6a\xd5\x09"
+			  "\x49\x46\x00\x88\x22\x8d\xce\xea"
+			  "\xb1\x17\x11\xde\x42\xd2\x23\xc1"
+			  "\x72\x11\xf5\x50\x73\x04\x40\x47"
+			  "\xf9\x5d\xe7\xa7\x26\xb1\x7e\xb0"
+			  "\x3f\x58\xc1\x52\xab\x12\x67\x9d"
+			  "\x3f\x43\x4b\x68\xd4\x9c\x68\x38"
+			  "\x07\x8a\x2d\x3e\xf3\xaf\x6a\x4b"
+			  "\xf9\xe5\x31\x69\x22\xf9\xa6\x69"
+			  "\xc6\x9c\x96\x9a\x12\x35\x95\x1d"
+			  "\x95\xd5\xdd\xbe\xbf\x93\x53\x24"
+			  "\xfd\xeb\xc2\x0a\x64\xb0\x77\x00"
+			  "\x6f\x88\xc4\x37\x18\x69\x7c\xd7"
+			  "\x41\x92\x55\x4c\x03\xa1\x9a\x4b"
+			  "\x15\xe5\xdf\x7f\x37\x33\x72\xc1"
+			  "\x8b\x10\x67\xa3\x01\x57\x94\x25"
+			  "\x7b\x38\x71\x7e\xdd\x1e\xcc\x73"
+			  "\x55\xd2\x8e\xeb\x07\xdd\xf1\xda"
+			  "\x58\xb1\x47\x90\xfe\x42\x21\x72"
+			  "\xa3\x54\x7a\xa0\x40\xec\x9f\xdd"
+			  "\xc6\x84\x6e\xca\xae\xe3\x68\xb4"
+			  "\x9d\xe4\x78\xff\x57\xf2\xf8\x1b"
+			  "\x03\xa1\x31\xd9\xde\x8d\xf5\x22"
+			  "\x9c\xdd\x20\xa4\x1e\x27\xb1\x76"
+			  "\x4f\x44\x55\xe2\x9b\xa1\x9c\xfe"
+			  "\x54\xf7\x27\x1b\xf4\xde\x02\xf5"
+			  "\x1b\x55\x48\x5c\xdc\x21\x4b\x9e"
+			  "\x4b\x6e\xed\x46\x23\xdc\x65\xb2"
+			  "\xcf\x79\x5f\x28\xe0\x9e\x8b\xe7"
+			  "\x4c\x9d\x8a\xff\xc1\xa6\x28\xb8"
+			  "\x65\x69\x8a\x45\x29\xef\x74\x85"
+			  "\xde\x79\xc7\x08\xae\x30\xb0\xf4"
+			  "\xa3\x1d\x51\x41\xab\xce\xcb\xf6"
+			  "\xb5\xd8\x6d\xe0\x85\xe1\x98\xb3"
+			  "\x43\xbb\x86\x83\x0a\xa0\xf5\xb7"
+			  "\x04\x0b\xfa\x71\x1f\xb0\xf6\xd9"
+			  "\x13\x00\x15\xf0\xc7\xeb\x0d\x5a"
+			  "\x9f\xd7\xb9\x6c\x65\x14\x22\x45"
+			  "\x6e\x45\x32\x3e\x7e\x60\x1a\x12"
+			  "\x97\x82\x14\xfb\xaa\x04\x22\xfa"
+			  "\xa0\xe5\x7e\x8c\x78\x02\x48\x5d"
+			  "\x78\x33\x5a\x7c\xad\xdb\x29\xce"
+			  "\xbb\x8b\x61\xa4\xb7\x42\xe2\xac"
+			  "\x8b\x1a\xd9\x2f\x0b\x8b\x62\x21"
+			  "\x83\x35\x7e\xad\x73\xc2\xb5\x6c"
+			  "\x10\x26\x38\x07\xe5\xc7\x36\x80"
+			  "\xe2\x23\x12\x61\xf5\x48\x4b\x2b"
+			  "\xc5\xdf\x15\xd9\x87\x01\xaa\xac"
+			  "\x1e\x7c\xad\x73\x78\x18\x63\xe0"
+			  "\x8b\x9f\x81\xd8\x12\x6a\x28\x10"
+			  "\xbe\x04\x68\x8a\x09\x7c\x1b\x1c"
+			  "\x83\x66\x80\x47\x80\xe8\xfd\x35"
+			  "\x1c\x97\x6f\xae\x49\x10\x66\xcc"
+			  "\xc6\xd8\xcc\x3a\x84\x91\x20\x77"
+			  "\x72\xe4\x24\xd2\x37\x9f\xc5\xc9"
+			  "\x25\x94\x10\x5f\x40\x00\x64\x99"
+			  "\xdc\xae\xd7\x21\x09\x78\x50\x15"
+			  "\xac\x5f\xc6\x2c\xa2\x0b\xa9\x39"
+			  "\x87\x6e\x6d\xab\xde\x08\x51\x16"
+			  "\xc7\x13\xe9\xea\xed\x06\x8e\x2c"
+			  "\xf8\x37\x8c\xf0\xa6\x96\x8d\x43"
+			  "\xb6\x98\x37\xb2\x43\xed\xde\xdf"
+			  "\x89\x1a\xe7\xeb\x9d\xa1\x7b\x0b"
+			  "\x77\xb0\xe2\x75\xc0\xf1\x98\xd9"
+			  "\x80\x55\xc9\x34\x91\xd1\x59\xe8"
+			  "\x4b\x0f\xc1\xa9\x4b\x7a\x84\x06"
+			  "\x20\xa8\x5d\xfa\xd1\xde\x70\x56"
+			  "\x2f\x9e\x91\x9c\x20\xb3\x24\xd8"
+			  "\x84\x3d\xe1\x8c\x7e\x62\x52\xe5"
+			  "\x44\x4b\x9f\xc2\x93\x03\xea\x2b"
+			  "\x59\xc5\xfa\x3f\x91\x2b\xbb\x23"
+			  "\xf5\xb2\x7b\xf5\x38\xaf\xb3\xee"
+			  "\x63\xdc\x7b\xd1\xff\xaa\x8b\xab"
+			  "\x82\x6b\x37\x04\xeb\x74\xbe\x79"
+			  "\xb9\x83\x90\xef\x20\x59\x46\xff"
+			  "\xe9\x97\x3e\x2f\xee\xb6\x64\x18"
+			  "\x38\x4c\x7a\x4a\xf9\x61\xe8\x9a"
+			  "\xa1\xb5\x01\xa6\x47\xd3\x11\xd4"
+			  "\xce\xd3\x91\x49\x88\xc7\xb8\x4d"
+			  "\xb1\xb9\x07\x6d\x16\x72\xae\x46"
+			  "\x5e\x03\xa1\x4b\xb6\x02\x30\xa8"
+			  "\x3d\xa9\x07\x2a\x7c\x19\xe7\x62"
+			  "\x87\xe3\x82\x2f\x6f\xe1\x09\xd9"
+			  "\x94\x97\xea\xdd\x58\x9e\xae\x76"
+			  "\x7e\x35\xe5\xb4\xda\x7e\xf4\xde"
+			  "\xf7\x32\x87\xcd\x93\xbf\x11\x56"
+			  "\x11\xbe\x08\x74\xe1\x69\xad\xe2"
+			  "\xd7\xf8\x86\x75\x8a\x3c\xa4\xbe"
+			  "\x70\xa7\x1b\xfc\x0b\x44\x2a\x76"
+			  "\x35\xea\x5d\x85\x81\xaf\x85\xeb"
+			  "\xa0\x1c\x61\xc2\xf7\x4f\xa5\xdc"
+			  "\x02\x7f\xf6\x95\x40\x6e\x8a\x9a"
+			  "\xf3\x5d\x25\x6e\x14\x3a\x22\xc9"
+			  "\x37\x1c\xeb\x46\x54\x3f\xa5\x91"
+			  "\xc2\xb5\x8c\xfe\x53\x08\x97\x32"
+			  "\x1b\xb2\x30\x27\xfe\x25\x5d\xdc"
+			  "\x08\x87\xd0\xe5\x94\x1a\xd4\xf1"
+			  "\xfe\xd6\xb4\xa3\xe6\x74\x81\x3c"
+			  "\x1b\xb7\x31\xa7\x22\xfd\xd4\xdd"
+			  "\x20\x4e\x7c\x51\xb0\x60\x73\xb8"
+			  "\x9c\xac\x91\x90\x7e\x01\xb0\xe1"
+			  "\x8a\x2f\x75\x1c\x53\x2a\x98\x2a"
+			  "\x06\x52\x95\x52\xb2\xe9\x25\x2e"
+			  "\x4c\xe2\x5a\x00\xb2\x13\x81\x03"
+			  "\x77\x66\x0d\xa5\x99\xda\x4e\x8c"
+			  "\xac\xf3\x13\x53\x27\x45\xaf\x64"
+			  "\x46\xdc\xea\x23\xda\x97\xd1\xab"
+			  "\x7d\x6c\x30\x96\x1f\xbc\x06\x34"
+			  "\x18\x0b\x5e\x21\x35\x11\x8d\x4c"
+			  "\xe0\x2d\xe9\x50\x16\x74\x81\xa8"
+			  "\xb4\x34\xb9\x72\x42\xa6\xcc\xbc"
+			  "\xca\x34\x83\x27\x10\x5b\x68\x45"
+			  "\x8f\x52\x22\x0c\x55\x3d\x29\x7c"
+			  "\xe3\xc0\x66\x05\x42\x91\x5f\x58"
+			  "\xfe\x4a\x62\xd9\x8c\xa9\x04\x19"
+			  "\x04\xa9\x08\x4b\x57\xfc\x67\x53"
+			  "\x08\x7c\xbc\x66\x8a\xb0\xb6\x9f"
+			  "\x92\xd6\x41\x7c\x5b\x2a\x00\x79"
+			  "\x72",
+		.ctext	= "\x3a\x92\xee\x53\x31\xaf\x2b\x60"
+			  "\x5f\x55\x8d\x00\x5d\xfc\x74\x97"
+			  "\x28\x54\xf4\xa5\x75\xf1\x9b\x25"
+			  "\x62\x1c\xc0\xe0\x13\xc8\x87\x53"
+			  "\xd0\xf3\xa7\x97\x1f\x3b\x1e\xea"
+			  "\xe0\xe5\x2a\xd1\xdd\xa4\x3b\x50"
+			  "\x45\xa3\x0d\x7e\x1b\xc9\xa0\xad"
+			  "\xb9\x2c\x54\xa6\xc7\x55\x16\xd0"
+			  "\xc5\x2e\x02\x44\x35\xd0\x7e\x67"
+			  "\xf2\xc4\x9b\xcd\x95\x10\xcc\x29"
+			  "\x4b\xfa\x86\x87\xbe\x40\x36\xbe"
+			  "\xe1\xa3\x52\x89\x55\x20\x9b\xc2"
+			  "\xab\xf2\x31\x34\x16\xad\xc8\x17"
+			  "\x65\x24\xc0\xff\x12\x37\xfe\x5a"
+			  "\x62\x3b\x59\x47\x6c\x5f\x3a\x8e"
+			  "\x3b\xd9\x30\xc8\x7f\x2f\x88\xda"
+			  "\x80\xfd\x02\xda\x7f\x9a\x7a\x73"
+			  "\x59\xc5\x34\x09\x9a\x11\xcb\xa7"
+			  "\xfc\xf6\xa1\xa0\x60\xfb\x43\xbb"
+			  "\xf1\xe9\xd7\xc6\x79\x27\x4e\xff"
+			  "\x22\xb4\x24\xbf\x76\xee\x47\xb9"
+			  "\x6d\x3f\x8b\xb0\x9c\x3c\x43\xdd"
+			  "\xff\x25\x2e\x6d\xa4\x2b\xfb\x5d"
+			  "\x1b\x97\x6c\x55\x0a\x82\x7a\x7b"
+			  "\x94\x34\xc2\xdb\x2f\x1f\xc1\xea"
+			  "\xd4\x4d\x17\x46\x3b\x51\x69\x09"
+			  "\xe4\x99\x32\x25\xfd\x94\xaf\xfb"
+			  "\x10\xf7\x4f\xdd\x0b\x3c\x8b\x41"
+			  "\xb3\x6a\xb7\xd1\x33\xa8\x0c\x2f"
+			  "\x62\x4c\x72\x11\xd7\x74\xe1\x3b"
+			  "\x38\x43\x66\x7b\x6c\x36\x48\xe7"
+			  "\xe3\xe7\x9d\xb9\x42\x73\x7a\x2a"
+			  "\x89\x20\x1a\x41\x80\x03\xf7\x8f"
+			  "\x61\x78\x13\xbf\xfe\x50\xf5\x04"
+			  "\x52\xf9\xac\x47\xf8\x62\x4b\xb2"
+			  "\x24\xa9\xbf\x64\xb0\x18\x69\xd2"
+			  "\xf5\xe4\xce\xc8\xb1\x87\x75\xd6"
+			  "\x2c\x24\x79\x00\x7d\x26\xfb\x44"
+			  "\xe7\x45\x7a\xee\x58\xa5\x83\xc1"
+			  "\xb4\x24\xab\x23\x2f\x4d\xd7\x4f"
+			  "\x1c\xc7\xaa\xa9\x50\xf4\xa3\x07"
+			  "\x12\x13\x89\x74\xdc\x31\x6a\xb2"
+			  "\xf5\x0f\x13\x8b\xb9\xdb\x85\x1f"
+			  "\xf5\xbc\x88\xd9\x95\xea\x31\x6c"
+			  "\x36\x60\xb6\x49\xdc\xc4\xf7\x55"
+			  "\x3f\x21\xc1\xb5\x92\x18\x5e\xbc"
+			  "\x9f\x87\x7f\xe7\x79\x25\x40\x33"
+			  "\xd6\xb9\x33\xd5\x50\xb3\xc7\x89"
+			  "\x1b\x12\xa0\x46\xdd\xa7\xd8\x3e"
+			  "\x71\xeb\x6f\x66\xa1\x26\x0c\x67"
+			  "\xab\xb2\x38\x58\x17\xd8\x44\x3b"
+			  "\x16\xf0\x8e\x62\x8d\x16\x10\x00"
+			  "\x32\x8b\xef\xb9\x28\xd3\xc5\xad"
+			  "\x0a\x19\xa2\xe4\x03\x27\x7d\x94"
+			  "\x06\x18\xcd\xd6\x27\x00\xf9\x1f"
+			  "\xb6\xb3\xfe\x96\x35\x5f\xc4\x1c"
+			  "\x07\x62\x10\x79\x68\x50\xf1\x7e"
+			  "\x29\xe7\xc4\xc4\xe7\xee\x54\xd6"
+			  "\x58\x76\x84\x6d\x8d\xe4\x59\x31"
+			  "\xe9\xf4\xdc\xa1\x1f\xe5\x1a\xd6"
+			  "\xe6\x64\x46\xf5\x77\x9c\x60\x7a"
+			  "\x5e\x62\xe3\x0a\xd4\x9f\x7a\x2d"
+			  "\x7a\xa5\x0a\x7b\x29\x86\x7a\x74"
+			  "\x74\x71\x6b\xca\x7d\x1d\xaa\xba"
+			  "\x39\x84\x43\x76\x35\xfe\x4f\x9b"
+			  "\xbb\xbb\xb5\x6a\x32\xb5\x5d\x41"
+			  "\x51\xf0\x5b\x68\x03\x47\x4b\x8a"
+			  "\xca\x88\xf6\x37\xbd\x73\x51\x70"
+			  "\x66\xfe\x9e\x5f\x21\x9c\xf3\xdd"
+			  "\xc3\xea\x27\xf9\x64\x94\xe1\x19"
+			  "\xa0\xa9\xab\x60\xe0\x0e\xf7\x78"
+			  "\x70\x86\xeb\xe0\xd1\x5c\x05\xd3"
+			  "\xd7\xca\xe0\xc0\x47\x47\x34\xee"
+			  "\x11\xa3\xa3\x54\x98\xb7\x49\x8e"
+			  "\x84\x28\x70\x2c\x9e\xfb\x55\x54"
+			  "\x4d\xf8\x86\xf7\x85\x7c\xbd\xf3"
+			  "\x17\xd8\x47\xcb\xac\xf4\x20\x85"
+			  "\x34\x66\xad\x37\x2d\x5e\x52\xda"
+			  "\x8a\xfe\x98\x55\x30\xe7\x2d\x2b"
+			  "\x19\x10\x8e\x7b\x66\x5e\xdc\xe0"
+			  "\x45\x1f\x7b\xb4\x08\xfb\x8f\xf6"
+			  "\x8c\x89\x21\x34\x55\x27\xb2\x76"
+			  "\xb2\x07\xd9\xd6\x68\x9b\xea\x6b"
+			  "\x2d\xb4\xc4\x35\xdd\xd2\x79\xae"
+			  "\xc7\xd6\x26\x7f\x12\x01\x8c\xa7"
+			  "\xe3\xdb\xa8\xf4\xf7\x2b\xec\x99"
+			  "\x11\x00\xf1\x35\x8c\xcf\xd5\xc9"
+			  "\xbd\x91\x36\x39\x70\xcf\x7d\x70"
+			  "\x47\x1a\xfc\x6b\x56\xe0\x3f\x9c"
+			  "\x60\x49\x01\x72\xa9\xaf\x2c\x9c"
+			  "\xe8\xab\xda\x8c\x14\x19\xf3\x75"
+			  "\x07\x17\x9d\x44\x67\x7a\x2e\xef"
+			  "\xb7\x83\x35\x4a\xd1\x3d\x1c\x84"
+			  "\x32\xdd\xaa\xea\xca\x1d\xdc\x72"
+			  "\x2c\xcc\x43\xcd\x5d\xe3\x21\xa4"
+			  "\xd0\x8a\x4b\x20\x12\xa3\xd5\x86"
+			  "\x76\x96\xff\x5f\x04\x57\x0f\xe6"
+			  "\xba\xe8\x76\x50\x0c\x64\x1d\x83"
+			  "\x9c\x9b\x9a\x9a\x58\x97\x9c\x5c"
+			  "\xb4\xa4\xa6\x3e\x19\xeb\x8f\x5a"
+			  "\x61\xb2\x03\x7b\x35\x19\xbe\xa7"
+			  "\x63\x0c\xfd\xdd\xf9\x90\x6c\x08"
+			  "\x19\x11\xd3\x65\x4a\xf5\x96\x92"
+			  "\x59\xaa\x9c\x61\x0c\x29\xa7\xf8"
+			  "\x14\x39\x37\xbf\x3c\xf2\x16\x72"
+			  "\x02\xfa\xa2\xf3\x18\x67\x5d\xcb"
+			  "\xdc\x4d\xbb\x96\xff\x70\x08\x2d"
+			  "\xc2\xa8\x52\xe1\x34\x5f\x72\xfe"
+			  "\x64\xbf\xca\xa7\x74\x38\xfb\x74"
+			  "\x55\x9c\xfa\x8a\xed\xfb\x98\xeb"
+			  "\x58\x2e\x6c\xe1\x52\x76\x86\xd7"
+			  "\xcf\xa1\xa4\xfc\xb2\x47\x41\x28"
+			  "\xa3\xc1\xe5\xfd\x53\x19\x28\x2b"
+			  "\x37\x04\x65\x96\x99\x7a\x28\x0f"
+			  "\x07\x68\x4b\xc7\x52\x0a\x55\x35"
+			  "\x40\x19\x95\x61\xe8\x59\x40\x1f"
+			  "\x9d\xbf\x78\x7d\x8f\x84\xff\x6f"
+			  "\xd0\xd5\x63\xd2\x22\xbd\xc8\x4e"
+			  "\xfb\xe7\x9f\x06\xe6\xe7\x39\x6d"
+			  "\x6a\x96\x9f\xf0\x74\x7e\xc9\x35"
+			  "\xb7\x26\xb8\x1c\x0a\xa6\x27\x2c"
+			  "\xa2\x2b\xfe\xbe\x0f\x07\x73\xae"
+			  "\x7f\x7f\x54\xf5\x7c\x6a\x0a\x56"
+			  "\x49\xd4\x81\xe5\x85\x53\x99\x1f"
+			  "\x95\x05\x13\x58\x8d\x0e\x1b\x90"
+			  "\xc3\x75\x48\x64\x58\x98\x67\x84"
+			  "\xae\xe2\x21\xa2\x8a\x04\x0a\x0b"
+			  "\x61\xaa\xb0\xd4\x28\x60\x7a\xf8"
+			  "\xbc\x52\xfb\x24\x7f\xed\x0d\x2a"
+			  "\x0a\xb2\xf9\xc6\x95\xb5\x11\xc9"
+			  "\xf4\x0f\x26\x11\xcf\x2a\x57\x87"
+			  "\x7a\xf3\xe7\x94\x65\xc2\xb5\xb3"
+			  "\xab\x98\xe3\xc1\x2b\x59\x19\x7c"
+			  "\xd6\xf3\xf9\xbf\xff\x6d\xc6\x82"
+			  "\x13\x2f\x4a\x2e\xcd\x26\xfe\x2d"
+			  "\x01\x70\xf4\xc2\x7f\x1f\x4c\xcb"
+			  "\x47\x77\x0c\xa0\xa3\x03\xec\xda"
+			  "\xa9\xbf\x0d\x2d\xae\xe4\xb8\x7b"
+			  "\xa9\xbc\x08\xb4\x68\x2e\xc5\x60"
+			  "\x8d\x87\x41\x2b\x0f\x69\xf0\xaf"
+			  "\x5f\xba\x72\x20\x0f\x33\xcd\x6d"
+			  "\x36\x7d\x7b\xd5\x05\xf1\x4b\x05"
+			  "\xc4\xfc\x7f\x80\xb9\x4d\xbd\xf7"
+			  "\x7c\x84\x07\x01\xc2\x40\x66\x5b"
+			  "\x98\xc7\x2c\xe3\x97\xfa\xdf\x87"
+			  "\xa0\x1f\xe9\x21\x42\x0f\x3b\xeb"
+			  "\x89\x1c\x3b\xca\x83\x61\x77\x68"
+			  "\x84\xbb\x60\x87\x38\x2e\x25\xd5"
+			  "\x9e\x04\x41\x70\xac\xda\xc0\x9c"
+			  "\x9c\x69\xea\x8d\x4e\x55\x2a\x29"
+			  "\xed\x05\x4b\x7b\x73\x71\x90\x59"
+			  "\x4d\xc8\xd8\x44\xf0\x4c\xe1\x5e"
+			  "\x84\x47\x55\xcc\x32\x3f\xe7\x97"
+			  "\x42\xc6\x32\xac\x40\xe5\xa5\xc7"
+			  "\x8b\xed\xdb\xf7\x83\xd6\xb1\xc2"
+			  "\x52\x5e\x34\xb7\xeb\x6e\xd9\xfc"
+			  "\xe5\x93\x9a\x97\x3e\xb0\xdc\xd9"
+			  "\xd7\x06\x10\xb6\x1d\x80\x59\xdd"
+			  "\x0d\xfe\x64\x35\xcd\x5d\xec\xf0"
+			  "\xba\xd0\x34\xc9\x2d\x91\xc5\x17"
+			  "\x11",
+		.len	= 1281,
+		.also_non_np = 1,
+		.np	= 3,
+		.tap	= { 1200, 1, 80 },
+	}, { /* test vector from https://tools.ietf.org/html/draft-arciszewski-xchacha-02#appendix-A.3.2 */
+		.key	= "\x80\x81\x82\x83\x84\x85\x86\x87"
+			  "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
+			  "\x90\x91\x92\x93\x94\x95\x96\x97"
+			  "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f",
+		.klen	= 32,
+		.iv	= "\x40\x41\x42\x43\x44\x45\x46\x47"
+			  "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f"
+			  "\x50\x51\x52\x53\x54\x55\x56\x58"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x54\x68\x65\x20\x64\x68\x6f\x6c"
+			  "\x65\x20\x28\x70\x72\x6f\x6e\x6f"
+			  "\x75\x6e\x63\x65\x64\x20\x22\x64"
+			  "\x6f\x6c\x65\x22\x29\x20\x69\x73"
+			  "\x20\x61\x6c\x73\x6f\x20\x6b\x6e"
+			  "\x6f\x77\x6e\x20\x61\x73\x20\x74"
+			  "\x68\x65\x20\x41\x73\x69\x61\x74"
+			  "\x69\x63\x20\x77\x69\x6c\x64\x20"
+			  "\x64\x6f\x67\x2c\x20\x72\x65\x64"
+			  "\x20\x64\x6f\x67\x2c\x20\x61\x6e"
+			  "\x64\x20\x77\x68\x69\x73\x74\x6c"
+			  "\x69\x6e\x67\x20\x64\x6f\x67\x2e"
+			  "\x20\x49\x74\x20\x69\x73\x20\x61"
+			  "\x62\x6f\x75\x74\x20\x74\x68\x65"
+			  "\x20\x73\x69\x7a\x65\x20\x6f\x66"
+			  "\x20\x61\x20\x47\x65\x72\x6d\x61"
+			  "\x6e\x20\x73\x68\x65\x70\x68\x65"
+			  "\x72\x64\x20\x62\x75\x74\x20\x6c"
+			  "\x6f\x6f\x6b\x73\x20\x6d\x6f\x72"
+			  "\x65\x20\x6c\x69\x6b\x65\x20\x61"
+			  "\x20\x6c\x6f\x6e\x67\x2d\x6c\x65"
+			  "\x67\x67\x65\x64\x20\x66\x6f\x78"
+			  "\x2e\x20\x54\x68\x69\x73\x20\x68"
+			  "\x69\x67\x68\x6c\x79\x20\x65\x6c"
+			  "\x75\x73\x69\x76\x65\x20\x61\x6e"
+			  "\x64\x20\x73\x6b\x69\x6c\x6c\x65"
+			  "\x64\x20\x6a\x75\x6d\x70\x65\x72"
+			  "\x20\x69\x73\x20\x63\x6c\x61\x73"
+			  "\x73\x69\x66\x69\x65\x64\x20\x77"
+			  "\x69\x74\x68\x20\x77\x6f\x6c\x76"
+			  "\x65\x73\x2c\x20\x63\x6f\x79\x6f"
+			  "\x74\x65\x73\x2c\x20\x6a\x61\x63"
+			  "\x6b\x61\x6c\x73\x2c\x20\x61\x6e"
+			  "\x64\x20\x66\x6f\x78\x65\x73\x20"
+			  "\x69\x6e\x20\x74\x68\x65\x20\x74"
+			  "\x61\x78\x6f\x6e\x6f\x6d\x69\x63"
+			  "\x20\x66\x61\x6d\x69\x6c\x79\x20"
+			  "\x43\x61\x6e\x69\x64\x61\x65\x2e",
+		.ctext	= "\x45\x59\xab\xba\x4e\x48\xc1\x61"
+			  "\x02\xe8\xbb\x2c\x05\xe6\x94\x7f"
+			  "\x50\xa7\x86\xde\x16\x2f\x9b\x0b"
+			  "\x7e\x59\x2a\x9b\x53\xd0\xd4\xe9"
+			  "\x8d\x8d\x64\x10\xd5\x40\xa1\xa6"
+			  "\x37\x5b\x26\xd8\x0d\xac\xe4\xfa"
+			  "\xb5\x23\x84\xc7\x31\xac\xbf\x16"
+			  "\xa5\x92\x3c\x0c\x48\xd3\x57\x5d"
+			  "\x4d\x0d\x2c\x67\x3b\x66\x6f\xaa"
+			  "\x73\x10\x61\x27\x77\x01\x09\x3a"
+			  "\x6b\xf7\xa1\x58\xa8\x86\x42\x92"
+			  "\xa4\x1c\x48\xe3\xa9\xb4\xc0\xda"
+			  "\xec\xe0\xf8\xd9\x8d\x0d\x7e\x05"
+			  "\xb3\x7a\x30\x7b\xbb\x66\x33\x31"
+			  "\x64\xec\x9e\x1b\x24\xea\x0d\x6c"
+			  "\x3f\xfd\xdc\xec\x4f\x68\xe7\x44"
+			  "\x30\x56\x19\x3a\x03\xc8\x10\xe1"
+			  "\x13\x44\xca\x06\xd8\xed\x8a\x2b"
+			  "\xfb\x1e\x8d\x48\xcf\xa6\xbc\x0e"
+			  "\xb4\xe2\x46\x4b\x74\x81\x42\x40"
+			  "\x7c\x9f\x43\x1a\xee\x76\x99\x60"
+			  "\xe1\x5b\xa8\xb9\x68\x90\x46\x6e"
+			  "\xf2\x45\x75\x99\x85\x23\x85\xc6"
+			  "\x61\xf7\x52\xce\x20\xf9\xda\x0c"
+			  "\x09\xab\x6b\x19\xdf\x74\xe7\x6a"
+			  "\x95\x96\x74\x46\xf8\xd0\xfd\x41"
+			  "\x5e\x7b\xee\x2a\x12\xa1\x14\xc2"
+			  "\x0e\xb5\x29\x2a\xe7\xa3\x49\xae"
+			  "\x57\x78\x20\xd5\x52\x0a\x1f\x3f"
+			  "\xb6\x2a\x17\xce\x6a\x7e\x68\xfa"
+			  "\x7c\x79\x11\x1d\x88\x60\x92\x0b"
+			  "\xc0\x48\xef\x43\xfe\x84\x48\x6c"
+			  "\xcb\x87\xc2\x5f\x0a\xe0\x45\xf0"
+			  "\xcc\xe1\xe7\x98\x9a\x9a\xa2\x20"
+			  "\xa2\x8b\xdd\x48\x27\xe7\x51\xa2"
+			  "\x4a\x6d\x5c\x62\xd7\x90\xa6\x63"
+			  "\x93\xb9\x31\x11\xc1\xa5\x5d\xd7"
+			  "\x42\x1a\x10\x18\x49\x74\xc7\xc5",
+		.len	= 304,
+	}
+};
+
+/*
+ * Same as the XChaCha20 test vectors above, but with the ciphertext
+ * recomputed using XChaCha12 (via a modified libsodium).
+ */
+static const struct cipher_testvec xchacha12_tv_template[] = {
+	{
+		.key	= "\x79\xc9\x97\x98\xac\x67\x30\x0b"
+			  "\xbb\x27\x04\xc9\x5c\x34\x1e\x32"
+			  "\x45\xf3\xdc\xb2\x17\x61\xb9\x8e"
+			  "\x52\xff\x45\xb2\x4f\x30\x4f\xc4",
+		.klen	= 32,
+		.iv	= "\xb3\x3f\xfd\x30\x96\x47\x9b\xcf"
+			  "\xbc\x9a\xee\x49\x41\x76\x88\xa0"
+			  "\xa2\x55\x4f\x8d\x95\x38\x94\x19"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00",
+		.ctext	= "\x1b\x78\x7f\xd7\xa1\x41\x68\xab"
+			  "\x3d\x3f\xd1\x7b\x69\x56\xb2\xd5"
+			  "\x43\xce\xeb\xaf\x36\xf0\x29\x9d"
+			  "\x3a\xfb\x18\xae\x1b",
+		.len	= 29,
+	}, {
+		.key	= "\x9d\x23\xbd\x41\x49\xcb\x97\x9c"
+			  "\xcf\x3c\x5c\x94\xdd\x21\x7e\x98"
+			  "\x08\xcb\x0e\x50\xcd\x0f\x67\x81"
+			  "\x22\x35\xea\xaf\x60\x1d\x62\x32",
+		.klen	= 32,
+		.iv	= "\xc0\x47\x54\x82\x66\xb7\xc3\x70"
+			  "\xd3\x35\x66\xa2\x42\x5c\xbf\x30"
+			  "\xd8\x2d\x1e\xaf\x52\x94\x10\x9e"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00",
+		.ctext	= "\xfb\x32\x09\x1d\x83\x05\xae\x4c"
+			  "\x13\x1f\x12\x71\xf2\xca\xb2\xeb"
+			  "\x5b\x83\x14\x7d\x83\xf6\x57\x77"
+			  "\x2e\x40\x1f\x92\x2c\xf9\xec\x35"
+			  "\x34\x1f\x93\xdf\xfb\x30\xd7\x35"
+			  "\x03\x05\x78\xc1\x20\x3b\x7a\xe3"
+			  "\x62\xa3\x89\xdc\x11\x11\x45\xa8"
+			  "\x82\x89\xa0\xf1\x4e\xc7\x0f\x11"
+			  "\x69\xdd\x0c\x84\x2b\x89\x5c\xdc"
+			  "\xf0\xde\x01\xef\xc5\x65\x79\x23"
+			  "\x87\x67\xd6\x50\xd9\x8d\xd9\x92"
+			  "\x54\x5b\x0e",
+		.len	= 91,
+	}, {
+		.key	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.klen	= 32,
+		.iv	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x67\xc6\x69\x73"
+			  "\x51\xff\x4a\xec\x29\xcd\xba\xab"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ctext	= "\xdf\x2d\xc6\x21\x2a\x9d\xa1\xbb"
+			  "\xc2\x77\x66\x0c\x5c\x46\xef\xa7"
+			  "\x79\x1b\xb9\xdf\x55\xe2\xf9\x61"
+			  "\x4c\x7b\xa4\x52\x24\xaf\xa2\xda"
+			  "\xd1\x8f\x8f\xa2\x9e\x53\x4d\xc4"
+			  "\xb8\x55\x98\x08\x7c\x08\xd4\x18"
+			  "\x67\x8f\xef\x50\xb1\x5f\xa5\x77"
+			  "\x4c\x25\xe7\x86\x26\x42\xca\x44",
+		.len	= 64,
+	}, {
+		.key	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x00\x00\x00\x00\x01",
+		.klen	= 32,
+		.iv	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x02\xf2\xfb\xe3\x46"
+			  "\x7c\xc2\x54\xf8\x1b\xe8\xe7\x8d"
+			  "\x01\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x41\x6e\x79\x20\x73\x75\x62\x6d"
+			  "\x69\x73\x73\x69\x6f\x6e\x20\x74"
+			  "\x6f\x20\x74\x68\x65\x20\x49\x45"
+			  "\x54\x46\x20\x69\x6e\x74\x65\x6e"
+			  "\x64\x65\x64\x20\x62\x79\x20\x74"
+			  "\x68\x65\x20\x43\x6f\x6e\x74\x72"
+			  "\x69\x62\x75\x74\x6f\x72\x20\x66"
+			  "\x6f\x72\x20\x70\x75\x62\x6c\x69"
+			  "\x63\x61\x74\x69\x6f\x6e\x20\x61"
+			  "\x73\x20\x61\x6c\x6c\x20\x6f\x72"
+			  "\x20\x70\x61\x72\x74\x20\x6f\x66"
+			  "\x20\x61\x6e\x20\x49\x45\x54\x46"
+			  "\x20\x49\x6e\x74\x65\x72\x6e\x65"
+			  "\x74\x2d\x44\x72\x61\x66\x74\x20"
+			  "\x6f\x72\x20\x52\x46\x43\x20\x61"
+			  "\x6e\x64\x20\x61\x6e\x79\x20\x73"
+			  "\x74\x61\x74\x65\x6d\x65\x6e\x74"
+			  "\x20\x6d\x61\x64\x65\x20\x77\x69"
+			  "\x74\x68\x69\x6e\x20\x74\x68\x65"
+			  "\x20\x63\x6f\x6e\x74\x65\x78\x74"
+			  "\x20\x6f\x66\x20\x61\x6e\x20\x49"
+			  "\x45\x54\x46\x20\x61\x63\x74\x69"
+			  "\x76\x69\x74\x79\x20\x69\x73\x20"
+			  "\x63\x6f\x6e\x73\x69\x64\x65\x72"
+			  "\x65\x64\x20\x61\x6e\x20\x22\x49"
+			  "\x45\x54\x46\x20\x43\x6f\x6e\x74"
+			  "\x72\x69\x62\x75\x74\x69\x6f\x6e"
+			  "\x22\x2e\x20\x53\x75\x63\x68\x20"
+			  "\x73\x74\x61\x74\x65\x6d\x65\x6e"
+			  "\x74\x73\x20\x69\x6e\x63\x6c\x75"
+			  "\x64\x65\x20\x6f\x72\x61\x6c\x20"
+			  "\x73\x74\x61\x74\x65\x6d\x65\x6e"
+			  "\x74\x73\x20\x69\x6e\x20\x49\x45"
+			  "\x54\x46\x20\x73\x65\x73\x73\x69"
+			  "\x6f\x6e\x73\x2c\x20\x61\x73\x20"
+			  "\x77\x65\x6c\x6c\x20\x61\x73\x20"
+			  "\x77\x72\x69\x74\x74\x65\x6e\x20"
+			  "\x61\x6e\x64\x20\x65\x6c\x65\x63"
+			  "\x74\x72\x6f\x6e\x69\x63\x20\x63"
+			  "\x6f\x6d\x6d\x75\x6e\x69\x63\x61"
+			  "\x74\x69\x6f\x6e\x73\x20\x6d\x61"
+			  "\x64\x65\x20\x61\x74\x20\x61\x6e"
+			  "\x79\x20\x74\x69\x6d\x65\x20\x6f"
+			  "\x72\x20\x70\x6c\x61\x63\x65\x2c"
+			  "\x20\x77\x68\x69\x63\x68\x20\x61"
+			  "\x72\x65\x20\x61\x64\x64\x72\x65"
+			  "\x73\x73\x65\x64\x20\x74\x6f",
+		.ctext	= "\xe4\xa6\xc8\x30\xc4\x23\x13\xd6"
+			  "\x08\x4d\xc9\xb7\xa5\x64\x7c\xb9"
+			  "\x71\xe2\xab\x3e\xa8\x30\x8a\x1c"
+			  "\x4a\x94\x6d\x9b\xe0\xb3\x6f\xf1"
+			  "\xdc\xe3\x1b\xb3\xa9\x6d\x0d\xd6"
+			  "\xd0\xca\x12\xef\xe7\x5f\xd8\x61"
+			  "\x3c\x82\xd3\x99\x86\x3c\x6f\x66"
+			  "\x02\x06\xdc\x55\xf9\xed\xdf\x38"
+			  "\xb4\xa6\x17\x00\x7f\xef\xbf\x4f"
+			  "\xf8\x36\xf1\x60\x7e\x47\xaf\xdb"
+			  "\x55\x9b\x12\xcb\x56\x44\xa7\x1f"
+			  "\xd3\x1a\x07\x3b\x00\xec\xe6\x4c"
+			  "\xa2\x43\x27\xdf\x86\x19\x4f\x16"
+			  "\xed\xf9\x4a\xf3\x63\x6f\xfa\x7f"
+			  "\x78\x11\xf6\x7d\x97\x6f\xec\x6f"
+			  "\x85\x0f\x5c\x36\x13\x8d\x87\xe0"
+			  "\x80\xb1\x69\x0b\x98\x89\x9c\x4e"
+			  "\xf8\xdd\xee\x5c\x0a\x85\xce\xd4"
+			  "\xea\x1b\x48\xbe\x08\xf8\xe2\xa8"
+			  "\xa5\xb0\x3c\x79\xb1\x15\xb4\xb9"
+			  "\x75\x10\x95\x35\x81\x7e\x26\xe6"
+			  "\x78\xa4\x88\xcf\xdb\x91\x34\x18"
+			  "\xad\xd7\x8e\x07\x7d\xab\x39\xf9"
+			  "\xa3\x9e\xa5\x1d\xbb\xed\x61\xfd"
+			  "\xdc\xb7\x5a\x27\xfc\xb5\xc9\x10"
+			  "\xa8\xcc\x52\x7f\x14\x76\x90\xe7"
+			  "\x1b\x29\x60\x74\xc0\x98\x77\xbb"
+			  "\xe0\x54\xbb\x27\x49\x59\x1e\x62"
+			  "\x3d\xaf\x74\x06\xa4\x42\x6f\xc6"
+			  "\x52\x97\xc4\x1d\xc4\x9f\xe2\xe5"
+			  "\x38\x57\x91\xd1\xa2\x28\xcc\x40"
+			  "\xcc\x70\x59\x37\xfc\x9f\x4b\xda"
+			  "\xa0\xeb\x97\x9a\x7d\xed\x14\x5c"
+			  "\x9c\xb7\x93\x26\x41\xa8\x66\xdd"
+			  "\x87\x6a\xc0\xd3\xc2\xa9\x3e\xae"
+			  "\xe9\x72\xfe\xd1\xb3\xac\x38\xea"
+			  "\x4d\x15\xa9\xd5\x36\x61\xe9\x96"
+			  "\x6c\x23\xf8\x43\xe4\x92\x29\xd9"
+			  "\x8b\x78\xf7\x0a\x52\xe0\x19\x5b"
+			  "\x59\x69\x5b\x5d\xa1\x53\xc4\x68"
+			  "\xe1\xbb\xac\x89\x14\xe2\xe2\x85"
+			  "\x41\x18\xf5\xb3\xd1\xfa\x68\x19"
+			  "\x44\x78\xdc\xcf\xe7\x88\x2d\x52"
+			  "\x5f\x40\xb5\x7e\xf8\x88\xa2\xae"
+			  "\x4a\xb2\x07\x35\x9d\x9b\x07\x88"
+			  "\xb7\x00\xd0\x0c\xb6\xa0\x47\x59"
+			  "\xda\x4e\xc9\xab\x9b\x8a\x7b",
+		.len	= 375,
+		.also_non_np = 1,
+		.np	= 3,
+		.tap	= { 375 - 20, 4, 16 },
+	}, {
+		.key	= "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a"
+			  "\xf3\x33\x88\x86\x04\xf6\xb5\xf0"
+			  "\x47\x39\x17\xc1\x40\x2b\x80\x09"
+			  "\x9d\xca\x5c\xbc\x20\x70\x75\xc0",
+		.klen	= 32,
+		.iv	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x02\x76\x5a\x2e\x63"
+			  "\x33\x9f\xc9\x9a\x66\x32\x0d\xb7"
+			  "\x2a\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x27\x54\x77\x61\x73\x20\x62\x72"
+			  "\x69\x6c\x6c\x69\x67\x2c\x20\x61"
+			  "\x6e\x64\x20\x74\x68\x65\x20\x73"
+			  "\x6c\x69\x74\x68\x79\x20\x74\x6f"
+			  "\x76\x65\x73\x0a\x44\x69\x64\x20"
+			  "\x67\x79\x72\x65\x20\x61\x6e\x64"
+			  "\x20\x67\x69\x6d\x62\x6c\x65\x20"
+			  "\x69\x6e\x20\x74\x68\x65\x20\x77"
+			  "\x61\x62\x65\x3a\x0a\x41\x6c\x6c"
+			  "\x20\x6d\x69\x6d\x73\x79\x20\x77"
+			  "\x65\x72\x65\x20\x74\x68\x65\x20"
+			  "\x62\x6f\x72\x6f\x67\x6f\x76\x65"
+			  "\x73\x2c\x0a\x41\x6e\x64\x20\x74"
+			  "\x68\x65\x20\x6d\x6f\x6d\x65\x20"
+			  "\x72\x61\x74\x68\x73\x20\x6f\x75"
+			  "\x74\x67\x72\x61\x62\x65\x2e",
+		.ctext	= "\xb9\x68\xbc\x6a\x24\xbc\xcc\xd8"
+			  "\x9b\x2a\x8d\x5b\x96\xaf\x56\xe3"
+			  "\x11\x61\xe7\xa7\x9b\xce\x4e\x7d"
+			  "\x60\x02\x48\xac\xeb\xd5\x3a\x26"
+			  "\x9d\x77\x3b\xb5\x32\x13\x86\x8e"
+			  "\x20\x82\x26\x72\xae\x64\x1b\x7e"
+			  "\x2e\x01\x68\xb4\x87\x45\xa1\x24"
+			  "\xe4\x48\x40\xf0\xaa\xac\xee\xa9"
+			  "\xfc\x31\xad\x9d\x89\xa3\xbb\xd2"
+			  "\xe4\x25\x13\xad\x0f\x5e\xdf\x3c"
+			  "\x27\xab\xb8\x62\x46\x22\x30\x48"
+			  "\x55\x2c\x4e\x84\x78\x1d\x0d\x34"
+			  "\x8d\x3c\x91\x0a\x7f\x5b\x19\x9f"
+			  "\x97\x05\x4c\xa7\x62\x47\x8b\xc5"
+			  "\x44\x2e\x20\x33\xdd\xa0\x82\xa9"
+			  "\x25\x76\x37\xe6\x3c\x67\x5b",
+		.len	= 127,
+	}, {
+		.key	= "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a"
+			  "\xf3\x33\x88\x86\x04\xf6\xb5\xf0"
+			  "\x47\x39\x17\xc1\x40\x2b\x80\x09"
+			  "\x9d\xca\x5c\xbc\x20\x70\x75\xc0",
+		.klen	= 32,
+		.iv	= "\x00\x00\x00\x00\x00\x00\x00\x00"
+			  "\x00\x00\x00\x01\x31\x58\xa3\x5a"
+			  "\x25\x5d\x05\x17\x58\xe9\x5e\xd4"
+			  "\x1c\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x49\xee\xe0\xdc\x24\x90\x40\xcd"
+			  "\xc5\x40\x8f\x47\x05\xbc\xdd\x81"
+			  "\x47\xc6\x8d\xe6\xb1\x8f\xd7\xcb"
+			  "\x09\x0e\x6e\x22\x48\x1f\xbf\xb8"
+			  "\x5c\xf7\x1e\x8a\xc1\x23\xf2\xd4"
+			  "\x19\x4b\x01\x0f\x4e\xa4\x43\xce"
+			  "\x01\xc6\x67\xda\x03\x91\x18\x90"
+			  "\xa5\xa4\x8e\x45\x03\xb3\x2d\xac"
+			  "\x74\x92\xd3\x53\x47\xc8\xdd\x25"
+			  "\x53\x6c\x02\x03\x87\x0d\x11\x0c"
+			  "\x58\xe3\x12\x18\xfd\x2a\x5b\x40"
+			  "\x0c\x30\xf0\xb8\x3f\x43\xce\xae"
+			  "\x65\x3a\x7d\x7c\xf4\x54\xaa\xcc"
+			  "\x33\x97\xc3\x77\xba\xc5\x70\xde"
+			  "\xd7\xd5\x13\xa5\x65\xc4\x5f\x0f"
+			  "\x46\x1a\x0d\x97\xb5\xf3\xbb\x3c"
+			  "\x84\x0f\x2b\xc5\xaa\xea\xf2\x6c"
+			  "\xc9\xb5\x0c\xee\x15\xf3\x7d\xbe"
+			  "\x9f\x7b\x5a\xa6\xae\x4f\x83\xb6"
+			  "\x79\x49\x41\xf4\x58\x18\xcb\x86"
+			  "\x7f\x30\x0e\xf8\x7d\x44\x36\xea"
+			  "\x75\xeb\x88\x84\x40\x3c\xad\x4f"
+			  "\x6f\x31\x6b\xaa\x5d\xe5\xa5\xc5"
+			  "\x21\x66\xe9\xa7\xe3\xb2\x15\x88"
+			  "\x78\xf6\x79\xa1\x59\x47\x12\x4e"
+			  "\x9f\x9f\x64\x1a\xa0\x22\x5b\x08"
+			  "\xbe\x7c\x36\xc2\x2b\x66\x33\x1b"
+			  "\xdd\x60\x71\xf7\x47\x8c\x61\xc3"
+			  "\xda\x8a\x78\x1e\x16\xfa\x1e\x86"
+			  "\x81\xa6\x17\x2a\xa7\xb5\xc2\xe7"
+			  "\xa4\xc7\x42\xf1\xcf\x6a\xca\xb4"
+			  "\x45\xcf\xf3\x93\xf0\xe7\xea\xf6"
+			  "\xf4\xe6\x33\x43\x84\x93\xa5\x67"
+			  "\x9b\x16\x58\x58\x80\x0f\x2b\x5c"
+			  "\x24\x74\x75\x7f\x95\x81\xb7\x30"
+			  "\x7a\x33\xa7\xf7\x94\x87\x32\x27"
+			  "\x10\x5d\x14\x4c\x43\x29\xdd\x26"
+			  "\xbd\x3e\x3c\x0e\xfe\x0e\xa5\x10"
+			  "\xea\x6b\x64\xfd\x73\xc6\xed\xec"
+			  "\xa8\xc9\xbf\xb3\xba\x0b\x4d\x07"
+			  "\x70\xfc\x16\xfd\x79\x1e\xd7\xc5"
+			  "\x49\x4e\x1c\x8b\x8d\x79\x1b\xb1"
+			  "\xec\xca\x60\x09\x4c\x6a\xd5\x09"
+			  "\x49\x46\x00\x88\x22\x8d\xce\xea"
+			  "\xb1\x17\x11\xde\x42\xd2\x23\xc1"
+			  "\x72\x11\xf5\x50\x73\x04\x40\x47"
+			  "\xf9\x5d\xe7\xa7\x26\xb1\x7e\xb0"
+			  "\x3f\x58\xc1\x52\xab\x12\x67\x9d"
+			  "\x3f\x43\x4b\x68\xd4\x9c\x68\x38"
+			  "\x07\x8a\x2d\x3e\xf3\xaf\x6a\x4b"
+			  "\xf9\xe5\x31\x69\x22\xf9\xa6\x69"
+			  "\xc6\x9c\x96\x9a\x12\x35\x95\x1d"
+			  "\x95\xd5\xdd\xbe\xbf\x93\x53\x24"
+			  "\xfd\xeb\xc2\x0a\x64\xb0\x77\x00"
+			  "\x6f\x88\xc4\x37\x18\x69\x7c\xd7"
+			  "\x41\x92\x55\x4c\x03\xa1\x9a\x4b"
+			  "\x15\xe5\xdf\x7f\x37\x33\x72\xc1"
+			  "\x8b\x10\x67\xa3\x01\x57\x94\x25"
+			  "\x7b\x38\x71\x7e\xdd\x1e\xcc\x73"
+			  "\x55\xd2\x8e\xeb\x07\xdd\xf1\xda"
+			  "\x58\xb1\x47\x90\xfe\x42\x21\x72"
+			  "\xa3\x54\x7a\xa0\x40\xec\x9f\xdd"
+			  "\xc6\x84\x6e\xca\xae\xe3\x68\xb4"
+			  "\x9d\xe4\x78\xff\x57\xf2\xf8\x1b"
+			  "\x03\xa1\x31\xd9\xde\x8d\xf5\x22"
+			  "\x9c\xdd\x20\xa4\x1e\x27\xb1\x76"
+			  "\x4f\x44\x55\xe2\x9b\xa1\x9c\xfe"
+			  "\x54\xf7\x27\x1b\xf4\xde\x02\xf5"
+			  "\x1b\x55\x48\x5c\xdc\x21\x4b\x9e"
+			  "\x4b\x6e\xed\x46\x23\xdc\x65\xb2"
+			  "\xcf\x79\x5f\x28\xe0\x9e\x8b\xe7"
+			  "\x4c\x9d\x8a\xff\xc1\xa6\x28\xb8"
+			  "\x65\x69\x8a\x45\x29\xef\x74\x85"
+			  "\xde\x79\xc7\x08\xae\x30\xb0\xf4"
+			  "\xa3\x1d\x51\x41\xab\xce\xcb\xf6"
+			  "\xb5\xd8\x6d\xe0\x85\xe1\x98\xb3"
+			  "\x43\xbb\x86\x83\x0a\xa0\xf5\xb7"
+			  "\x04\x0b\xfa\x71\x1f\xb0\xf6\xd9"
+			  "\x13\x00\x15\xf0\xc7\xeb\x0d\x5a"
+			  "\x9f\xd7\xb9\x6c\x65\x14\x22\x45"
+			  "\x6e\x45\x32\x3e\x7e\x60\x1a\x12"
+			  "\x97\x82\x14\xfb\xaa\x04\x22\xfa"
+			  "\xa0\xe5\x7e\x8c\x78\x02\x48\x5d"
+			  "\x78\x33\x5a\x7c\xad\xdb\x29\xce"
+			  "\xbb\x8b\x61\xa4\xb7\x42\xe2\xac"
+			  "\x8b\x1a\xd9\x2f\x0b\x8b\x62\x21"
+			  "\x83\x35\x7e\xad\x73\xc2\xb5\x6c"
+			  "\x10\x26\x38\x07\xe5\xc7\x36\x80"
+			  "\xe2\x23\x12\x61\xf5\x48\x4b\x2b"
+			  "\xc5\xdf\x15\xd9\x87\x01\xaa\xac"
+			  "\x1e\x7c\xad\x73\x78\x18\x63\xe0"
+			  "\x8b\x9f\x81\xd8\x12\x6a\x28\x10"
+			  "\xbe\x04\x68\x8a\x09\x7c\x1b\x1c"
+			  "\x83\x66\x80\x47\x80\xe8\xfd\x35"
+			  "\x1c\x97\x6f\xae\x49\x10\x66\xcc"
+			  "\xc6\xd8\xcc\x3a\x84\x91\x20\x77"
+			  "\x72\xe4\x24\xd2\x37\x9f\xc5\xc9"
+			  "\x25\x94\x10\x5f\x40\x00\x64\x99"
+			  "\xdc\xae\xd7\x21\x09\x78\x50\x15"
+			  "\xac\x5f\xc6\x2c\xa2\x0b\xa9\x39"
+			  "\x87\x6e\x6d\xab\xde\x08\x51\x16"
+			  "\xc7\x13\xe9\xea\xed\x06\x8e\x2c"
+			  "\xf8\x37\x8c\xf0\xa6\x96\x8d\x43"
+			  "\xb6\x98\x37\xb2\x43\xed\xde\xdf"
+			  "\x89\x1a\xe7\xeb\x9d\xa1\x7b\x0b"
+			  "\x77\xb0\xe2\x75\xc0\xf1\x98\xd9"
+			  "\x80\x55\xc9\x34\x91\xd1\x59\xe8"
+			  "\x4b\x0f\xc1\xa9\x4b\x7a\x84\x06"
+			  "\x20\xa8\x5d\xfa\xd1\xde\x70\x56"
+			  "\x2f\x9e\x91\x9c\x20\xb3\x24\xd8"
+			  "\x84\x3d\xe1\x8c\x7e\x62\x52\xe5"
+			  "\x44\x4b\x9f\xc2\x93\x03\xea\x2b"
+			  "\x59\xc5\xfa\x3f\x91\x2b\xbb\x23"
+			  "\xf5\xb2\x7b\xf5\x38\xaf\xb3\xee"
+			  "\x63\xdc\x7b\xd1\xff\xaa\x8b\xab"
+			  "\x82\x6b\x37\x04\xeb\x74\xbe\x79"
+			  "\xb9\x83\x90\xef\x20\x59\x46\xff"
+			  "\xe9\x97\x3e\x2f\xee\xb6\x64\x18"
+			  "\x38\x4c\x7a\x4a\xf9\x61\xe8\x9a"
+			  "\xa1\xb5\x01\xa6\x47\xd3\x11\xd4"
+			  "\xce\xd3\x91\x49\x88\xc7\xb8\x4d"
+			  "\xb1\xb9\x07\x6d\x16\x72\xae\x46"
+			  "\x5e\x03\xa1\x4b\xb6\x02\x30\xa8"
+			  "\x3d\xa9\x07\x2a\x7c\x19\xe7\x62"
+			  "\x87\xe3\x82\x2f\x6f\xe1\x09\xd9"
+			  "\x94\x97\xea\xdd\x58\x9e\xae\x76"
+			  "\x7e\x35\xe5\xb4\xda\x7e\xf4\xde"
+			  "\xf7\x32\x87\xcd\x93\xbf\x11\x56"
+			  "\x11\xbe\x08\x74\xe1\x69\xad\xe2"
+			  "\xd7\xf8\x86\x75\x8a\x3c\xa4\xbe"
+			  "\x70\xa7\x1b\xfc\x0b\x44\x2a\x76"
+			  "\x35\xea\x5d\x85\x81\xaf\x85\xeb"
+			  "\xa0\x1c\x61\xc2\xf7\x4f\xa5\xdc"
+			  "\x02\x7f\xf6\x95\x40\x6e\x8a\x9a"
+			  "\xf3\x5d\x25\x6e\x14\x3a\x22\xc9"
+			  "\x37\x1c\xeb\x46\x54\x3f\xa5\x91"
+			  "\xc2\xb5\x8c\xfe\x53\x08\x97\x32"
+			  "\x1b\xb2\x30\x27\xfe\x25\x5d\xdc"
+			  "\x08\x87\xd0\xe5\x94\x1a\xd4\xf1"
+			  "\xfe\xd6\xb4\xa3\xe6\x74\x81\x3c"
+			  "\x1b\xb7\x31\xa7\x22\xfd\xd4\xdd"
+			  "\x20\x4e\x7c\x51\xb0\x60\x73\xb8"
+			  "\x9c\xac\x91\x90\x7e\x01\xb0\xe1"
+			  "\x8a\x2f\x75\x1c\x53\x2a\x98\x2a"
+			  "\x06\x52\x95\x52\xb2\xe9\x25\x2e"
+			  "\x4c\xe2\x5a\x00\xb2\x13\x81\x03"
+			  "\x77\x66\x0d\xa5\x99\xda\x4e\x8c"
+			  "\xac\xf3\x13\x53\x27\x45\xaf\x64"
+			  "\x46\xdc\xea\x23\xda\x97\xd1\xab"
+			  "\x7d\x6c\x30\x96\x1f\xbc\x06\x34"
+			  "\x18\x0b\x5e\x21\x35\x11\x8d\x4c"
+			  "\xe0\x2d\xe9\x50\x16\x74\x81\xa8"
+			  "\xb4\x34\xb9\x72\x42\xa6\xcc\xbc"
+			  "\xca\x34\x83\x27\x10\x5b\x68\x45"
+			  "\x8f\x52\x22\x0c\x55\x3d\x29\x7c"
+			  "\xe3\xc0\x66\x05\x42\x91\x5f\x58"
+			  "\xfe\x4a\x62\xd9\x8c\xa9\x04\x19"
+			  "\x04\xa9\x08\x4b\x57\xfc\x67\x53"
+			  "\x08\x7c\xbc\x66\x8a\xb0\xb6\x9f"
+			  "\x92\xd6\x41\x7c\x5b\x2a\x00\x79"
+			  "\x72",
+		.ctext	= "\xe1\xb6\x8b\x5c\x80\xb8\xcc\x08"
+			  "\x1b\x84\xb2\xd1\xad\xa4\x70\xac"
+			  "\x67\xa9\x39\x27\xac\xb4\x5b\xb7"
+			  "\x4c\x26\x77\x23\x1d\xce\x0a\xbe"
+			  "\x18\x9e\x42\x8b\xbd\x7f\xd6\xf1"
+			  "\xf1\x6b\xe2\x6d\x7f\x92\x0e\xcb"
+			  "\xb8\x79\xba\xb4\xac\x7e\x2d\xc0"
+			  "\x9e\x83\x81\x91\xd5\xea\xc3\x12"
+			  "\x8d\xa4\x26\x70\xa4\xf9\x71\x0b"
+			  "\xbd\x2e\xe1\xb3\x80\x42\x25\xb3"
+			  "\x0b\x31\x99\xe1\x0d\xde\xa6\x90"
+			  "\xf2\xa3\x10\xf7\xe5\xf3\x83\x1e"
+			  "\x2c\xfb\x4d\xf0\x45\x3d\x28\x3c"
+			  "\xb8\xf1\xcb\xbf\x67\xd8\x43\x5a"
+			  "\x9d\x7b\x73\x29\x88\x0f\x13\x06"
+			  "\x37\x50\x0d\x7c\xe6\x9b\x07\xdd"
+			  "\x7e\x01\x1f\x81\x90\x10\x69\xdb"
+			  "\xa4\xad\x8a\x5e\xac\x30\x72\xf2"
+			  "\x36\xcd\xe3\x23\x49\x02\x93\xfa"
+			  "\x3d\xbb\xe2\x98\x83\xeb\xe9\x8d"
+			  "\xb3\x8f\x11\xaa\x53\xdb\xaf\x2e"
+			  "\x95\x13\x99\x3d\x71\xbd\x32\x92"
+			  "\xdd\xfc\x9d\x5e\x6f\x63\x2c\xee"
+			  "\x91\x1f\x4c\x64\x3d\x87\x55\x0f"
+			  "\xcc\x3d\x89\x61\x53\x02\x57\x8f"
+			  "\xe4\x77\x29\x32\xaf\xa6\x2f\x0a"
+			  "\xae\x3c\x3f\x3f\xf4\xfb\x65\x52"
+			  "\xc5\xc1\x78\x78\x53\x28\xad\xed"
+			  "\xd1\x67\x37\xc7\x59\x70\xcd\x0a"
+			  "\xb8\x0f\x80\x51\x9f\xc0\x12\x5e"
+			  "\x06\x0a\x7e\xec\x24\x5f\x73\x00"
+			  "\xb1\x0b\x31\x47\x4f\x73\x8d\xb4"
+			  "\xce\xf3\x55\x45\x6c\x84\x27\xba"
+			  "\xb9\x6f\x03\x4a\xeb\x98\x88\x6e"
+			  "\x53\xed\x25\x19\x0d\x8f\xfe\xca"
+			  "\x60\xe5\x00\x93\x6e\x3c\xff\x19"
+			  "\xae\x08\x3b\x8a\xa6\x84\x05\xfe"
+			  "\x9b\x59\xa0\x8c\xc8\x05\x45\xf5"
+			  "\x05\x37\xdc\x45\x6f\x8b\x95\x8c"
+			  "\x4e\x11\x45\x7a\xce\x21\xa5\xf7"
+			  "\x71\x67\xb9\xce\xd7\xf9\xe9\x5e"
+			  "\x60\xf5\x53\x7a\xa8\x85\x14\x03"
+			  "\xa0\x92\xec\xf3\x51\x80\x84\xc4"
+			  "\xdc\x11\x9e\x57\xce\x4b\x45\xcf"
+			  "\x90\x95\x85\x0b\x96\xe9\xee\x35"
+			  "\x10\xb8\x9b\xf2\x59\x4a\xc6\x7e"
+			  "\x85\xe5\x6f\x38\x51\x93\x40\x0c"
+			  "\x99\xd7\x7f\x32\xa8\x06\x27\xd1"
+			  "\x2b\xd5\xb5\x3a\x1a\xe1\x5e\xda"
+			  "\xcd\x5a\x50\x30\x3c\xc7\xe7\x65"
+			  "\xa6\x07\x0b\x98\x91\xc6\x20\x27"
+			  "\x2a\x03\x63\x1b\x1e\x3d\xaf\xc8"
+			  "\x71\x48\x46\x6a\x64\x28\xf9\x3d"
+			  "\xd1\x1d\xab\xc8\x40\x76\xc2\x39"
+			  "\x4e\x00\x75\xd2\x0e\x82\x58\x8c"
+			  "\xd3\x73\x5a\xea\x46\x89\xbe\xfd"
+			  "\x4e\x2c\x0d\x94\xaa\x9b\x68\xac"
+			  "\x86\x87\x30\x7e\xa9\x16\xcd\x59"
+			  "\xd2\xa6\xbe\x0a\xd8\xf5\xfd\x2d"
+			  "\x49\x69\xd2\x1a\x90\xd2\x1b\xed"
+			  "\xff\x71\x04\x87\x87\x21\xc4\xb8"
+			  "\x1f\x5b\x51\x33\xd0\xd6\x59\x9a"
+			  "\x03\x0e\xd3\x8b\xfb\x57\x73\xfd"
+			  "\x5a\x52\x63\x82\xc8\x85\x2f\xcb"
+			  "\x74\x6d\x4e\xd9\x68\x37\x85\x6a"
+			  "\xd4\xfb\x94\xed\x8d\xd1\x1a\xaf"
+			  "\x76\xa7\xb7\x88\xd0\x2b\x4e\xda"
+			  "\xec\x99\x94\x27\x6f\x87\x8c\xdf"
+			  "\x4b\x5e\xa6\x66\xdd\xcb\x33\x7b"
+			  "\x64\x94\x31\xa8\x37\xa6\x1d\xdb"
+			  "\x0d\x5c\x93\xa4\x40\xf9\x30\x53"
+			  "\x4b\x74\x8d\xdd\xf6\xde\x3c\xac"
+			  "\x5c\x80\x01\x3a\xef\xb1\x9a\x02"
+			  "\x0c\x22\x8e\xe7\x44\x09\x74\x4c"
+			  "\xf2\x9a\x27\x69\x7f\x12\x32\x36"
+			  "\xde\x92\xdf\xde\x8f\x5b\x31\xab"
+			  "\x4a\x01\x26\xe0\xb1\xda\xe8\x37"
+			  "\x21\x64\xe8\xff\x69\xfc\x9e\x41"
+			  "\xd2\x96\x2d\x18\x64\x98\x33\x78"
+			  "\x24\x61\x73\x9b\x47\x29\xf1\xa7"
+			  "\xcb\x27\x0f\xf0\x85\x6d\x8c\x9d"
+			  "\x2c\x95\x9e\xe5\xb2\x8e\x30\x29"
+			  "\x78\x8a\x9d\x65\xb4\x8e\xde\x7b"
+			  "\xd9\x00\x50\xf5\x7f\x81\xc3\x1b"
+			  "\x25\x85\xeb\xc2\x8c\x33\x22\x1e"
+			  "\x68\x38\x22\x30\xd8\x2e\x00\x98"
+			  "\x85\x16\x06\x56\xb4\x81\x74\x20"
+			  "\x95\xdb\x1c\x05\x19\xe8\x23\x4d"
+			  "\x65\x5d\xcc\xd8\x7f\xc4\x2d\x0f"
+			  "\x57\x26\x71\x07\xad\xaa\x71\x9f"
+			  "\x19\x76\x2f\x25\x51\x88\xe4\xc0"
+			  "\x82\x6e\x08\x05\x37\x04\xee\x25"
+			  "\x23\x90\xe9\x4e\xce\x9b\x16\xc1"
+			  "\x31\xe7\x6e\x2c\x1b\xe1\x85\x9a"
+			  "\x0c\x8c\xbb\x12\x1e\x68\x7b\x93"
+			  "\xa9\x3c\x39\x56\x23\x3e\x6e\xc7"
+			  "\x77\x84\xd3\xe0\x86\x59\xaa\xb9"
+			  "\xd5\x53\x58\xc9\x0a\x83\x5f\x85"
+			  "\xd8\x47\x14\x67\x8a\x3c\x17\xe0"
+			  "\xab\x02\x51\xea\xf1\xf0\x4f\x30"
+			  "\x7d\xe0\x92\xc2\x5f\xfb\x19\x5a"
+			  "\x3f\xbd\xf4\x39\xa4\x31\x0c\x39"
+			  "\xd1\xae\x4e\xf7\x65\x7f\x1f\xce"
+			  "\xc2\x39\xd1\x84\xd4\xe5\x02\xe0"
+			  "\x58\xaa\xf1\x5e\x81\xaf\x7f\x72"
+			  "\x0f\x08\x99\x43\xb9\xd8\xac\x41"
+			  "\x35\x55\xf2\xb2\xd4\x98\xb8\x3b"
+			  "\x2b\x3c\x3e\x16\x06\x31\xfc\x79"
+			  "\x47\x38\x63\x51\xc5\xd0\x26\xd7"
+			  "\x43\xb4\x2b\xd9\xc5\x05\xf2\x9d"
+			  "\x18\xc9\x26\x82\x56\xd2\x11\x05"
+			  "\xb6\x89\xb4\x43\x9c\xb5\x9d\x11"
+			  "\x6c\x83\x37\x71\x27\x1c\xae\xbf"
+			  "\xcd\x57\xd2\xee\x0d\x5a\x15\x26"
+			  "\x67\x88\x80\x80\x1b\xdc\xc1\x62"
+			  "\xdd\x4c\xff\x92\x5c\x6c\xe1\xa0"
+			  "\xe3\x79\xa9\x65\x8c\x8c\x14\x42"
+			  "\xe5\x11\xd2\x1a\xad\xa9\x56\x6f"
+			  "\x98\xfc\x8a\x7b\x56\x1f\xc6\xc1"
+			  "\x52\x12\x92\x9b\x41\x0f\x4b\xae"
+			  "\x1b\x4a\xbc\xfe\x23\xb6\x94\x70"
+			  "\x04\x30\x9e\x69\x47\xbe\xb8\x8f"
+			  "\xca\x45\xd7\x8a\xf4\x78\x3e\xaa"
+			  "\x71\x17\xd8\x1e\xb8\x11\x8f\xbc"
+			  "\xc8\x1a\x65\x7b\x41\x89\x72\xc7"
+			  "\x5f\xbe\xc5\x2a\xdb\x5c\x54\xf9"
+			  "\x25\xa3\x7a\x80\x56\x9c\x8c\xab"
+			  "\x26\x19\x10\x36\xa6\xf3\x14\x79"
+			  "\x40\x98\x70\x68\xb7\x35\xd9\xb9"
+			  "\x27\xd4\xe7\x74\x5b\x3d\x97\xb4"
+			  "\xd9\xaa\xd9\xf2\xb5\x14\x84\x1f"
+			  "\xa9\xde\x12\x44\x5b\x00\xc0\xbc"
+			  "\xc8\x11\x25\x1b\x67\x7a\x15\x72"
+			  "\xa6\x31\x6f\xf4\x68\x7a\x86\x9d"
+			  "\x43\x1c\x5f\x16\xd3\xad\x2e\x52"
+			  "\xf3\xb4\xc3\xfa\x27\x2e\x68\x6c"
+			  "\x06\xe7\x4c\x4f\xa2\xe0\xe4\x21"
+			  "\x5d\x9e\x33\x58\x8d\xbf\xd5\x70"
+			  "\xf8\x80\xa5\xdd\xe7\x18\x79\xfa"
+			  "\x7b\xfd\x09\x69\x2c\x37\x32\xa8"
+			  "\x65\xfa\x8d\x8b\x5c\xcc\xe8\xf3"
+			  "\x37\xf6\xa6\xc6\x5c\xa2\x66\x79"
+			  "\xfa\x8a\xa7\xd1\x0b\x2e\x1b\x5e"
+			  "\x95\x35\x00\x76\xae\x42\xf7\x50"
+			  "\x51\x78\xfb\xb4\x28\x24\xde\x1a"
+			  "\x70\x8b\xed\xca\x3c\x5e\xe4\xbd"
+			  "\x28\xb5\xf3\x76\x4f\x67\x5d\x81"
+			  "\xb2\x60\x87\xd9\x7b\x19\x1a\xa7"
+			  "\x79\xa2\xfa\x3f\x9e\xa9\xd7\x25"
+			  "\x61\xe1\x74\x31\xa2\x77\xa0\x1b"
+			  "\xf6\xf7\xcb\xc5\xaa\x9e\xce\xf9"
+			  "\x9b\x96\xef\x51\xc3\x1a\x44\x96"
+			  "\xae\x17\x50\xab\x29\x08\xda\xcc"
+			  "\x1a\xb3\x12\xd0\x24\xe4\xe2\xe0"
+			  "\xc6\xe3\xcc\x82\xd0\xba\x47\x4c"
+			  "\x3f\x49\xd7\xe8\xb6\x61\xaa\x65"
+			  "\x25\x18\x40\x2d\x62\x25\x02\x71"
+			  "\x61\xa2\xc1\xb2\x13\xd2\x71\x3f"
+			  "\x43\x1a\xc9\x09\x92\xff\xd5\x57"
+			  "\xf0\xfc\x5e\x1c\xf1\xf5\xf9\xf3"
+			  "\x5b",
+		.len	= 1281,
+		.also_non_np = 1,
+		.np	= 3,
+		.tap	= { 1200, 1, 80 },
+	}, {
+		.key	= "\x80\x81\x82\x83\x84\x85\x86\x87"
+			  "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
+			  "\x90\x91\x92\x93\x94\x95\x96\x97"
+			  "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f",
+		.klen	= 32,
+		.iv	= "\x40\x41\x42\x43\x44\x45\x46\x47"
+			  "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f"
+			  "\x50\x51\x52\x53\x54\x55\x56\x58"
+			  "\x00\x00\x00\x00\x00\x00\x00\x00",
+		.ptext	= "\x54\x68\x65\x20\x64\x68\x6f\x6c"
+			  "\x65\x20\x28\x70\x72\x6f\x6e\x6f"
+			  "\x75\x6e\x63\x65\x64\x20\x22\x64"
+			  "\x6f\x6c\x65\x22\x29\x20\x69\x73"
+			  "\x20\x61\x6c\x73\x6f\x20\x6b\x6e"
+			  "\x6f\x77\x6e\x20\x61\x73\x20\x74"
+			  "\x68\x65\x20\x41\x73\x69\x61\x74"
+			  "\x69\x63\x20\x77\x69\x6c\x64\x20"
+			  "\x64\x6f\x67\x2c\x20\x72\x65\x64"
+			  "\x20\x64\x6f\x67\x2c\x20\x61\x6e"
+			  "\x64\x20\x77\x68\x69\x73\x74\x6c"
+			  "\x69\x6e\x67\x20\x64\x6f\x67\x2e"
+			  "\x20\x49\x74\x20\x69\x73\x20\x61"
+			  "\x62\x6f\x75\x74\x20\x74\x68\x65"
+			  "\x20\x73\x69\x7a\x65\x20\x6f\x66"
+			  "\x20\x61\x20\x47\x65\x72\x6d\x61"
+			  "\x6e\x20\x73\x68\x65\x70\x68\x65"
+			  "\x72\x64\x20\x62\x75\x74\x20\x6c"
+			  "\x6f\x6f\x6b\x73\x20\x6d\x6f\x72"
+			  "\x65\x20\x6c\x69\x6b\x65\x20\x61"
+			  "\x20\x6c\x6f\x6e\x67\x2d\x6c\x65"
+			  "\x67\x67\x65\x64\x20\x66\x6f\x78"
+			  "\x2e\x20\x54\x68\x69\x73\x20\x68"
+			  "\x69\x67\x68\x6c\x79\x20\x65\x6c"
+			  "\x75\x73\x69\x76\x65\x20\x61\x6e"
+			  "\x64\x20\x73\x6b\x69\x6c\x6c\x65"
+			  "\x64\x20\x6a\x75\x6d\x70\x65\x72"
+			  "\x20\x69\x73\x20\x63\x6c\x61\x73"
+			  "\x73\x69\x66\x69\x65\x64\x20\x77"
+			  "\x69\x74\x68\x20\x77\x6f\x6c\x76"
+			  "\x65\x73\x2c\x20\x63\x6f\x79\x6f"
+			  "\x74\x65\x73\x2c\x20\x6a\x61\x63"
+			  "\x6b\x61\x6c\x73\x2c\x20\x61\x6e"
+			  "\x64\x20\x66\x6f\x78\x65\x73\x20"
+			  "\x69\x6e\x20\x74\x68\x65\x20\x74"
+			  "\x61\x78\x6f\x6e\x6f\x6d\x69\x63"
+			  "\x20\x66\x61\x6d\x69\x6c\x79\x20"
+			  "\x43\x61\x6e\x69\x64\x61\x65\x2e",
+		.ctext	= "\x9f\x1a\xab\x8a\x95\xf4\x7e\xcd"
+			  "\xee\x34\xc0\x39\xd6\x23\x43\x94"
+			  "\xf6\x01\xc1\x7f\x60\x91\xa5\x23"
+			  "\x4a\x8a\xe6\xb1\x14\x8b\xd7\x58"
+			  "\xee\x02\xad\xab\xce\x1e\x7d\xdf"
+			  "\xf9\x49\x27\x69\xd0\x8d\x0c\x20"
+			  "\x6e\x17\xc4\xae\x87\x7a\xc6\x61"
+			  "\x91\xe2\x8e\x0a\x1d\x61\xcc\x38"
+			  "\x02\x64\x43\x49\xc6\xb2\x59\x59"
+			  "\x42\xe7\x9d\x83\x00\x60\x90\xd2"
+			  "\xb9\xcd\x97\x6e\xc7\x95\x71\xbc"
+			  "\x23\x31\x58\x07\xb3\xb4\xac\x0b"
+			  "\x87\x64\x56\xe5\xe3\xec\x63\xa1"
+			  "\x71\x8c\x08\x48\x33\x20\x29\x81"
+			  "\xea\x01\x25\x20\xc3\xda\xe6\xee"
+			  "\x6a\x03\xf6\x68\x4d\x26\xa0\x91"
+			  "\x9e\x44\xb8\xc1\xc0\x8f\x5a\x6a"
+			  "\xc0\xcd\xbf\x24\x5e\x40\x66\xd2"
+			  "\x42\x24\xb5\xbf\xc1\xeb\x12\x60"
+			  "\x56\xbe\xb1\xa6\xc4\x0f\xfc\x49"
+			  "\x69\x9f\xcc\x06\x5c\xe3\x26\xd7"
+			  "\x52\xc0\x42\xe8\xb4\x76\xc3\xee"
+			  "\xb2\x97\xe3\x37\x61\x29\x5a\xb5"
+			  "\x8e\xe8\x8c\xc5\x38\xcc\xcb\xec"
+			  "\x64\x1a\xa9\x12\x5f\xf7\x79\xdf"
+			  "\x64\xca\x77\x4e\xbd\xf9\x83\xa0"
+			  "\x13\x27\x3f\x31\x03\x63\x30\x26"
+			  "\x27\x0b\x3e\xb3\x23\x13\x61\x0b"
+			  "\x70\x1d\xd4\xad\x85\x1e\xbf\xdf"
+			  "\xc6\x8e\x4d\x08\xcc\x7e\x77\xbd"
+			  "\x1e\x18\x77\x38\x3a\xfe\xc0\x5d"
+			  "\x16\xfc\xf0\xa9\x2f\xe9\x17\xc7"
+			  "\xd3\x23\x17\x18\xa3\xe6\x54\x77"
+			  "\x6f\x1b\xbe\x8a\x6e\x7e\xca\x97"
+			  "\x08\x05\x36\x76\xaf\x12\x7a\x42"
+			  "\xf7\x7a\xc2\x35\xc3\xb4\x93\x40"
+			  "\x54\x14\x90\xa0\x4d\x65\x1c\x37"
+			  "\x50\x70\x44\x29\x6d\x6e\x62\x68",
+		.len	= 304,
+	}
+};
+
+/* Adiantum test vectors from https://github.com/google/adiantum */
+static const struct cipher_testvec adiantum_xchacha12_aes_tv_template[] = {
+	{
+		.key	= "\x9e\xeb\xb2\x49\x3c\x1c\xf5\xf4"
+			  "\x6a\x99\xc2\xc4\xdf\xb1\xf4\xdd"
+			  "\x75\x20\x57\xea\x2c\x4f\xcd\xb2"
+			  "\xa5\x3d\x7b\x49\x1e\xab\xfd\x0f",
+		.klen	= 32,
+		.iv	= "\xdf\x63\xd4\xab\xd2\x49\xf3\xd8"
+			  "\x33\x81\x37\x60\x7d\xfa\x73\x08"
+			  "\xd8\x49\x6d\x80\xe8\x2f\x62\x54"
+			  "\xeb\x0e\xa9\x39\x5b\x45\x7f\x8a",
+		.ptext	= "\x67\xc9\xf2\x30\x84\x41\x8e\x43"
+			  "\xfb\xf3\xb3\x3e\x79\x36\x7f\xe8",
+		.ctext	= "\x6d\x32\x86\x18\x67\x86\x0f\x3f"
+			  "\x96\x7c\x9d\x28\x0d\x53\xec\x9f",
+		.len	= 16,
+		.also_non_np = 1,
+		.np	= 2,
+		.tap	= { 14, 2 },
+	}, {
+		.key	= "\x36\x2b\x57\x97\xf8\x5d\xcd\x99"
+			  "\x5f\x1a\x5a\x44\x1d\x92\x0f\x27"
+			  "\xcc\x16\xd7\x2b\x85\x63\x99\xd3"
+			  "\xba\x96\xa1\xdb\xd2\x60\x68\xda",
+		.klen	= 32,
+		.iv	= "\xef\x58\x69\xb1\x2c\x5e\x9a\x47"
+			  "\x24\xc1\xb1\x69\xe1\x12\x93\x8f"
+			  "\x43\x3d\x6d\x00\xdb\x5e\xd8\xd9"
+			  "\x12\x9a\xfe\xd9\xff\x2d\xaa\xc4",
+		.ptext	= "\x5e\xa8\x68\x19\x85\x98\x12\x23"
+			  "\x26\x0a\xcc\xdb\x0a\x04\xb9\xdf"
+			  "\x4d\xb3\x48\x7b\xb0\xe3\xc8\x19"
+			  "\x43\x5a\x46\x06\x94\x2d\xf2",
+		.ctext	= "\xc7\xc6\xf1\x73\x8f\xc4\xff\x4a"
+			  "\x39\xbe\x78\xbe\x8d\x28\xc8\x89"
+			  "\x46\x63\xe7\x0c\x7d\x87\xe8\x4e"
+			  "\xc9\x18\x7b\xbe\x18\x60\x50",
+		.len	= 31,
+	}, {
+		.key	= "\xa5\x28\x24\x34\x1a\x3c\xd8\xf7"
+			  "\x05\x91\x8f\xee\x85\x1f\x35\x7f"
+			  "\x80\x3d\xfc\x9b\x94\xf6\xfc\x9e"
+			  "\x19\x09\x00\xa9\x04\x31\x4f\x11",
+		.klen	= 32,
+		.iv	= "\xa1\xba\x49\x95\xff\x34\x6d\xb8"
+			  "\xcd\x87\x5d\x5e\xfd\xea\x85\xdb"
+			  "\x8a\x7b\x5e\xb2\x5d\x57\xdd\x62"
+			  "\xac\xa9\x8c\x41\x42\x94\x75\xb7",
+		.ptext	= "\x69\xb4\xe8\x8c\x37\xe8\x67\x82"
+			  "\xf1\xec\x5d\x04\xe5\x14\x91\x13"
+			  "\xdf\xf2\x87\x1b\x69\x81\x1d\x71"
+			  "\x70\x9e\x9c\x3b\xde\x49\x70\x11"
+			  "\xa0\xa3\xdb\x0d\x54\x4f\x66\x69"
+			  "\xd7\xdb\x80\xa7\x70\x92\x68\xce"
+			  "\x81\x04\x2c\xc6\xab\xae\xe5\x60"
+			  "\x15\xe9\x6f\xef\xaa\x8f\xa7\xa7"
+			  "\x63\x8f\xf2\xf0\x77\xf1\xa8\xea"
+			  "\xe1\xb7\x1f\x9e\xab\x9e\x4b\x3f"
+			  "\x07\x87\x5b\x6f\xcd\xa8\xaf\xb9"
+			  "\xfa\x70\x0b\x52\xb8\xa8\xa7\x9e"
+			  "\x07\x5f\xa6\x0e\xb3\x9b\x79\x13"
+			  "\x79\xc3\x3e\x8d\x1c\x2c\x68\xc8"
+			  "\x51\x1d\x3c\x7b\x7d\x79\x77\x2a"
+			  "\x56\x65\xc5\x54\x23\x28\xb0\x03",
+		.ctext	= "\x9e\x16\xab\xed\x4b\xa7\x42\x5a"
+			  "\xc6\xfb\x4e\x76\xff\xbe\x03\xa0"
+			  "\x0f\xe3\xad\xba\xe4\x98\x2b\x0e"
+			  "\x21\x48\xa0\xb8\x65\x48\x27\x48"
+			  "\x84\x54\x54\xb2\x9a\x94\x7b\xe6"
+			  "\x4b\x29\xe9\xcf\x05\x91\x80\x1a"
+			  "\x3a\xf3\x41\x96\x85\x1d\x9f\x74"
+			  "\x51\x56\x63\xfa\x7c\x28\x85\x49"
+			  "\xf7\x2f\xf9\xf2\x18\x46\xf5\x33"
+			  "\x80\xa3\x3c\xce\xb2\x57\x93\xf5"
+			  "\xae\xbd\xa9\xf5\x7b\x30\xc4\x93"
+			  "\x66\xe0\x30\x77\x16\xe4\xa0\x31"
+			  "\xba\x70\xbc\x68\x13\xf5\xb0\x9a"
+			  "\xc1\xfc\x7e\xfe\x55\x80\x5c\x48"
+			  "\x74\xa6\xaa\xa3\xac\xdc\xc2\xf5"
+			  "\x8d\xde\x34\x86\x78\x60\x75\x8d",
+		.len	= 128,
+		.also_non_np = 1,
+		.np	= 4,
+		.tap	= { 104, 16, 4, 4 },
+	}, {
+		.key	= "\xd3\x81\x72\x18\x23\xff\x6f\x4a"
+			  "\x25\x74\x29\x0d\x51\x8a\x0e\x13"
+			  "\xc1\x53\x5d\x30\x8d\xee\x75\x0d"
+			  "\x14\xd6\x69\xc9\x15\xa9\x0c\x60",
+		.klen	= 32,
+		.iv	= "\x65\x9b\xd4\xa8\x7d\x29\x1d\xf4"
+			  "\xc4\xd6\x9b\x6a\x28\xab\x64\xe2"
+			  "\x62\x81\x97\xc5\x81\xaa\xf9\x44"
+			  "\xc1\x72\x59\x82\xaf\x16\xc8\x2c",
+		.ptext	= "\xc7\x6b\x52\x6a\x10\xf0\xcc\x09"
+			  "\xc1\x12\x1d\x6d\x21\xa6\x78\xf5"
+			  "\x05\xa3\x69\x60\x91\x36\x98\x57"
+			  "\xba\x0c\x14\xcc\xf3\x2d\x73\x03"
+			  "\xc6\xb2\x5f\xc8\x16\x27\x37\x5d"
+			  "\xd0\x0b\x87\xb2\x50\x94\x7b\x58"
+			  "\x04\xf4\xe0\x7f\x6e\x57\x8e\xc9"
+			  "\x41\x84\xc1\xb1\x7e\x4b\x91\x12"
+			  "\x3a\x8b\x5d\x50\x82\x7b\xcb\xd9"
+			  "\x9a\xd9\x4e\x18\x06\x23\x9e\xd4"
+			  "\xa5\x20\x98\xef\xb5\xda\xe5\xc0"
+			  "\x8a\x6a\x83\x77\x15\x84\x1e\xae"
+			  "\x78\x94\x9d\xdf\xb7\xd1\xea\x67"
+			  "\xaa\xb0\x14\x15\xfa\x67\x21\x84"
+			  "\xd3\x41\x2a\xce\xba\x4b\x4a\xe8"
+			  "\x95\x62\xa9\x55\xf0\x80\xad\xbd"
+			  "\xab\xaf\xdd\x4f\xa5\x7c\x13\x36"
+			  "\xed\x5e\x4f\x72\xad\x4b\xf1\xd0"
+			  "\x88\x4e\xec\x2c\x88\x10\x5e\xea"
+			  "\x12\xc0\x16\x01\x29\xa3\xa0\x55"
+			  "\xaa\x68\xf3\xe9\x9d\x3b\x0d\x3b"
+			  "\x6d\xec\xf8\xa0\x2d\xf0\x90\x8d"
+			  "\x1c\xe2\x88\xd4\x24\x71\xf9\xb3"
+			  "\xc1\x9f\xc5\xd6\x76\x70\xc5\x2e"
+			  "\x9c\xac\xdb\x90\xbd\x83\x72\xba"
+			  "\x6e\xb5\xa5\x53\x83\xa9\xa5\xbf"
+			  "\x7d\x06\x0e\x3c\x2a\xd2\x04\xb5"
+			  "\x1e\x19\x38\x09\x16\xd2\x82\x1f"
+			  "\x75\x18\x56\xb8\x96\x0b\xa6\xf9"
+			  "\xcf\x62\xd9\x32\x5d\xa9\xd7\x1d"
+			  "\xec\xe4\xdf\x1b\xbe\xf1\x36\xee"
+			  "\xe3\x7b\xb5\x2f\xee\xf8\x53\x3d"
+			  "\x6a\xb7\x70\xa9\xfc\x9c\x57\x25"
+			  "\xf2\x89\x10\xd3\xb8\xa8\x8c\x30"
+			  "\xae\x23\x4f\x0e\x13\x66\x4f\xe1"
+			  "\xb6\xc0\xe4\xf8\xef\x93\xbd\x6e"
+			  "\x15\x85\x6b\xe3\x60\x81\x1d\x68"
+			  "\xd7\x31\x87\x89\x09\xab\xd5\x96"
+			  "\x1d\xf3\x6d\x67\x80\xca\x07\x31"
+			  "\x5d\xa7\xe4\xfb\x3e\xf2\x9b\x33"
+			  "\x52\x18\xc8\x30\xfe\x2d\xca\x1e"
+			  "\x79\x92\x7a\x60\x5c\xb6\x58\x87"
+			  "\xa4\x36\xa2\x67\x92\x8b\xa4\xb7"
+			  "\xf1\x86\xdf\xdc\xc0\x7e\x8f\x63"
+			  "\xd2\xa2\xdc\x78\xeb\x4f\xd8\x96"
+			  "\x47\xca\xb8\x91\xf9\xf7\x94\x21"
+			  "\x5f\x9a\x9f\x5b\xb8\x40\x41\x4b"
+			  "\x66\x69\x6a\x72\xd0\xcb\x70\xb7"
+			  "\x93\xb5\x37\x96\x05\x37\x4f\xe5"
+			  "\x8c\xa7\x5a\x4e\x8b\xb7\x84\xea"
+			  "\xc7\xfc\x19\x6e\x1f\x5a\xa1\xac"
+			  "\x18\x7d\x52\x3b\xb3\x34\x62\x99"
+			  "\xe4\x9e\x31\x04\x3f\xc0\x8d\x84"
+			  "\x17\x7c\x25\x48\x52\x67\x11\x27"
+			  "\x67\xbb\x5a\x85\xca\x56\xb2\x5c"
+			  "\xe6\xec\xd5\x96\x3d\x15\xfc\xfb"
+			  "\x22\x25\xf4\x13\xe5\x93\x4b\x9a"
+			  "\x77\xf1\x52\x18\xfa\x16\x5e\x49"
+			  "\x03\x45\xa8\x08\xfa\xb3\x41\x92"
+			  "\x79\x50\x33\xca\xd0\xd7\x42\x55"
+			  "\xc3\x9a\x0c\x4e\xd9\xa4\x3c\x86"
+			  "\x80\x9f\x53\xd1\xa4\x2e\xd1\xbc"
+			  "\xf1\x54\x6e\x93\xa4\x65\x99\x8e"
+			  "\xdf\x29\xc0\x64\x63\x07\xbb\xea",
+		.ctext	= "\x15\x97\xd0\x86\x18\x03\x9c\x51"
+			  "\xc5\x11\x36\x62\x13\x92\xe6\x73"
+			  "\x29\x79\xde\xa1\x00\x3e\x08\x64"
+			  "\x17\x1a\xbc\xd5\xfe\x33\x0e\x0c"
+			  "\x7c\x94\xa7\xc6\x3c\xbe\xac\xa2"
+			  "\x89\xe6\xbc\xdf\x0c\x33\x27\x42"
+			  "\x46\x73\x2f\xba\x4e\xa6\x46\x8f"
+			  "\xe4\xee\x39\x63\x42\x65\xa3\x88"
+			  "\x7a\xad\x33\x23\xa9\xa7\x20\x7f"
+			  "\x0b\xe6\x6a\xc3\x60\xda\x9e\xb4"
+			  "\xd6\x07\x8a\x77\x26\xd1\xab\x44"
+			  "\x99\x55\x03\x5e\xed\x8d\x7b\xbd"
+			  "\xc8\x21\xb7\x21\x30\x3f\xc0\xb5"
+			  "\xc8\xec\x6c\x23\xa6\xa3\x6d\xf1"
+			  "\x30\x0a\xd0\xa6\xa9\x28\x69\xae"
+			  "\x2a\xe6\x54\xac\x82\x9d\x6a\x95"
+			  "\x6f\x06\x44\xc5\x5a\x77\x6e\xec"
+			  "\xf8\xf8\x63\xb2\xe6\xaa\xbd\x8e"
+			  "\x0e\x8a\x62\x00\x03\xc8\x84\xdd"
+			  "\x47\x4a\xc3\x55\xba\xb7\xe7\xdf"
+			  "\x08\xbf\x62\xf5\xe8\xbc\xb6\x11"
+			  "\xe4\xcb\xd0\x66\x74\x32\xcf\xd4"
+			  "\xf8\x51\x80\x39\x14\x05\x12\xdb"
+			  "\x87\x93\xe2\x26\x30\x9c\x3a\x21"
+			  "\xe5\xd0\x38\x57\x80\x15\xe4\x08"
+			  "\x58\x05\x49\x7d\xe6\x92\x77\x70"
+			  "\xfb\x1e\x2d\x6a\x84\x00\xc8\x68"
+			  "\xf7\x1a\xdd\xf0\x7b\x38\x1e\xd8"
+			  "\x2c\x78\x78\x61\xcf\xe3\xde\x69"
+			  "\x1f\xd5\x03\xd5\x1a\xb4\xcf\x03"
+			  "\xc8\x7a\x70\x68\x35\xb4\xf6\xbe"
+			  "\x90\x62\xb2\x28\x99\x86\xf5\x44"
+			  "\x99\xeb\x31\xcf\xca\xdf\xd0\x21"
+			  "\xd6\x60\xf7\x0f\x40\xb4\x80\xb7"
+			  "\xab\xe1\x9b\x45\xba\x66\xda\xee"
+			  "\xdd\x04\x12\x40\x98\xe1\x69\xe5"
+			  "\x2b\x9c\x59\x80\xe7\x7b\xcc\x63"
+			  "\xa6\xc0\x3a\xa9\xfe\x8a\xf9\x62"
+			  "\x11\x34\x61\x94\x35\xfe\xf2\x99"
+			  "\xfd\xee\x19\xea\x95\xb6\x12\xbf"
+			  "\x1b\xdf\x02\x1a\xcc\x3e\x7e\x65"
+			  "\x78\x74\x10\x50\x29\x63\x28\xea"
+			  "\x6b\xab\xd4\x06\x4d\x15\x24\x31"
+			  "\xc7\x0a\xc9\x16\xb6\x48\xf0\xbf"
+			  "\x49\xdb\x68\x71\x31\x8f\x87\xe2"
+			  "\x13\x05\x64\xd6\x22\x0c\xf8\x36"
+			  "\x84\x24\x3e\x69\x5e\xb8\x9e\x16"
+			  "\x73\x6c\x83\x1e\xe0\x9f\x9e\xba"
+			  "\xe5\x59\x21\x33\x1b\xa9\x26\xc2"
+			  "\xc7\xd9\x30\x73\xb6\xa6\x73\x82"
+			  "\x19\xfa\x44\x4d\x40\x8b\x69\x04"
+			  "\x94\x74\xea\x6e\xb3\x09\x47\x01"
+			  "\x2a\xb9\x78\x34\x43\x11\xed\xd6"
+			  "\x8c\x95\x65\x1b\x85\x67\xa5\x40"
+			  "\xac\x9c\x05\x4b\x57\x4a\xa9\x96"
+			  "\x0f\xdd\x4f\xa1\xe0\xcf\x6e\xc7"
+			  "\x1b\xed\xa2\xb4\x56\x8c\x09\x6e"
+			  "\xa6\x65\xd7\x55\x81\xb7\xed\x11"
+			  "\x9b\x40\x75\xa8\x6b\x56\xaf\x16"
+			  "\x8b\x3d\xf4\xcb\xfe\xd5\x1d\x3d"
+			  "\x85\xc2\xc0\xde\x43\x39\x4a\x96"
+			  "\xba\x88\x97\xc0\xd6\x00\x0e\x27"
+			  "\x21\xb0\x21\x52\xba\xa7\x37\xaa"
+			  "\xcc\xbf\x95\xa8\xf4\xd0\x91\xf6",
+		.len	= 512,
+		.also_non_np = 1,
+		.np	= 2,
+		.tap	= { 144, 368 },
+	}
+};
+
+/* Adiantum with XChaCha20 instead of XChaCha12 */
+/* Test vectors from https://github.com/google/adiantum */
+static const struct cipher_testvec adiantum_xchacha20_aes_tv_template[] = {
+	{
+		.key	= "\x9e\xeb\xb2\x49\x3c\x1c\xf5\xf4"
+			  "\x6a\x99\xc2\xc4\xdf\xb1\xf4\xdd"
+			  "\x75\x20\x57\xea\x2c\x4f\xcd\xb2"
+			  "\xa5\x3d\x7b\x49\x1e\xab\xfd\x0f",
+		.klen	= 32,
+		.iv	= "\xdf\x63\xd4\xab\xd2\x49\xf3\xd8"
+			  "\x33\x81\x37\x60\x7d\xfa\x73\x08"
+			  "\xd8\x49\x6d\x80\xe8\x2f\x62\x54"
+			  "\xeb\x0e\xa9\x39\x5b\x45\x7f\x8a",
+		.ptext	= "\x67\xc9\xf2\x30\x84\x41\x8e\x43"
+			  "\xfb\xf3\xb3\x3e\x79\x36\x7f\xe8",
+		.ctext	= "\xf6\x78\x97\xd6\xaa\x94\x01\x27"
+			  "\x2e\x4d\x83\xe0\x6e\x64\x9a\xdf",
+		.len	= 16,
+		.also_non_np = 1,
+		.np	= 3,
+		.tap	= { 5, 2, 9 },
+	}, {
+		.key	= "\x36\x2b\x57\x97\xf8\x5d\xcd\x99"
+			  "\x5f\x1a\x5a\x44\x1d\x92\x0f\x27"
+			  "\xcc\x16\xd7\x2b\x85\x63\x99\xd3"
+			  "\xba\x96\xa1\xdb\xd2\x60\x68\xda",
+		.klen	= 32,
+		.iv	= "\xef\x58\x69\xb1\x2c\x5e\x9a\x47"
+			  "\x24\xc1\xb1\x69\xe1\x12\x93\x8f"
+			  "\x43\x3d\x6d\x00\xdb\x5e\xd8\xd9"
+			  "\x12\x9a\xfe\xd9\xff\x2d\xaa\xc4",
+		.ptext	= "\x5e\xa8\x68\x19\x85\x98\x12\x23"
+			  "\x26\x0a\xcc\xdb\x0a\x04\xb9\xdf"
+			  "\x4d\xb3\x48\x7b\xb0\xe3\xc8\x19"
+			  "\x43\x5a\x46\x06\x94\x2d\xf2",
+		.ctext	= "\x4b\xb8\x90\x10\xdf\x7f\x64\x08"
+			  "\x0e\x14\x42\x5f\x00\x74\x09\x36"
+			  "\x57\x72\xb5\xfd\xb5\x5d\xb8\x28"
+			  "\x0c\x04\x91\x14\x91\xe9\x37",
+		.len	= 31,
+		.also_non_np = 1,
+		.np	= 2,
+		.tap	= { 16, 15 },
+	}, {
+		.key	= "\xa5\x28\x24\x34\x1a\x3c\xd8\xf7"
+			  "\x05\x91\x8f\xee\x85\x1f\x35\x7f"
+			  "\x80\x3d\xfc\x9b\x94\xf6\xfc\x9e"
+			  "\x19\x09\x00\xa9\x04\x31\x4f\x11",
+		.klen	= 32,
+		.iv	= "\xa1\xba\x49\x95\xff\x34\x6d\xb8"
+			  "\xcd\x87\x5d\x5e\xfd\xea\x85\xdb"
+			  "\x8a\x7b\x5e\xb2\x5d\x57\xdd\x62"
+			  "\xac\xa9\x8c\x41\x42\x94\x75\xb7",
+		.ptext	= "\x69\xb4\xe8\x8c\x37\xe8\x67\x82"
+			  "\xf1\xec\x5d\x04\xe5\x14\x91\x13"
+			  "\xdf\xf2\x87\x1b\x69\x81\x1d\x71"
+			  "\x70\x9e\x9c\x3b\xde\x49\x70\x11"
+			  "\xa0\xa3\xdb\x0d\x54\x4f\x66\x69"
+			  "\xd7\xdb\x80\xa7\x70\x92\x68\xce"
+			  "\x81\x04\x2c\xc6\xab\xae\xe5\x60"
+			  "\x15\xe9\x6f\xef\xaa\x8f\xa7\xa7"
+			  "\x63\x8f\xf2\xf0\x77\xf1\xa8\xea"
+			  "\xe1\xb7\x1f\x9e\xab\x9e\x4b\x3f"
+			  "\x07\x87\x5b\x6f\xcd\xa8\xaf\xb9"
+			  "\xfa\x70\x0b\x52\xb8\xa8\xa7\x9e"
+			  "\x07\x5f\xa6\x0e\xb3\x9b\x79\x13"
+			  "\x79\xc3\x3e\x8d\x1c\x2c\x68\xc8"
+			  "\x51\x1d\x3c\x7b\x7d\x79\x77\x2a"
+			  "\x56\x65\xc5\x54\x23\x28\xb0\x03",
+		.ctext	= "\xb1\x8b\xa0\x05\x77\xa8\x4d\x59"
+			  "\x1b\x8e\x21\xfc\x3a\x49\xfa\xd4"
+			  "\xeb\x36\xf3\xc4\xdf\xdc\xae\x67"
+			  "\x07\x3f\x70\x0e\xe9\x66\xf5\x0c"
+			  "\x30\x4d\x66\xc9\xa4\x2f\x73\x9c"
+			  "\x13\xc8\x49\x44\xcc\x0a\x90\x9d"
+			  "\x7c\xdd\x19\x3f\xea\x72\x8d\x58"
+			  "\xab\xe7\x09\x2c\xec\xb5\x44\xd2"
+			  "\xca\xa6\x2d\x7a\x5c\x9c\x2b\x15"
+			  "\xec\x2a\xa6\x69\x91\xf9\xf3\x13"
+			  "\xf7\x72\xc1\xc1\x40\xd5\xe1\x94"
+			  "\xf4\x29\xa1\x3e\x25\x02\xa8\x3e"
+			  "\x94\xc1\x91\x14\xa1\x14\xcb\xbe"
+			  "\x67\x4c\xb9\x38\xfe\xa7\xaa\x32"
+			  "\x29\x62\x0d\xb2\xf6\x3c\x58\x57"
+			  "\xc1\xd5\x5a\xbb\xd6\xa6\x2a\xe5",
+		.len	= 128,
+		.also_non_np = 1,
+		.np	= 4,
+		.tap	= { 112, 7, 8, 1 },
+	}, {
+		.key	= "\xd3\x81\x72\x18\x23\xff\x6f\x4a"
+			  "\x25\x74\x29\x0d\x51\x8a\x0e\x13"
+			  "\xc1\x53\x5d\x30\x8d\xee\x75\x0d"
+			  "\x14\xd6\x69\xc9\x15\xa9\x0c\x60",
+		.klen	= 32,
+		.iv	= "\x65\x9b\xd4\xa8\x7d\x29\x1d\xf4"
+			  "\xc4\xd6\x9b\x6a\x28\xab\x64\xe2"
+			  "\x62\x81\x97\xc5\x81\xaa\xf9\x44"
+			  "\xc1\x72\x59\x82\xaf\x16\xc8\x2c",
+		.ptext	= "\xc7\x6b\x52\x6a\x10\xf0\xcc\x09"
+			  "\xc1\x12\x1d\x6d\x21\xa6\x78\xf5"
+			  "\x05\xa3\x69\x60\x91\x36\x98\x57"
+			  "\xba\x0c\x14\xcc\xf3\x2d\x73\x03"
+			  "\xc6\xb2\x5f\xc8\x16\x27\x37\x5d"
+			  "\xd0\x0b\x87\xb2\x50\x94\x7b\x58"
+			  "\x04\xf4\xe0\x7f\x6e\x57\x8e\xc9"
+			  "\x41\x84\xc1\xb1\x7e\x4b\x91\x12"
+			  "\x3a\x8b\x5d\x50\x82\x7b\xcb\xd9"
+			  "\x9a\xd9\x4e\x18\x06\x23\x9e\xd4"
+			  "\xa5\x20\x98\xef\xb5\xda\xe5\xc0"
+			  "\x8a\x6a\x83\x77\x15\x84\x1e\xae"
+			  "\x78\x94\x9d\xdf\xb7\xd1\xea\x67"
+			  "\xaa\xb0\x14\x15\xfa\x67\x21\x84"
+			  "\xd3\x41\x2a\xce\xba\x4b\x4a\xe8"
+			  "\x95\x62\xa9\x55\xf0\x80\xad\xbd"
+			  "\xab\xaf\xdd\x4f\xa5\x7c\x13\x36"
+			  "\xed\x5e\x4f\x72\xad\x4b\xf1\xd0"
+			  "\x88\x4e\xec\x2c\x88\x10\x5e\xea"
+			  "\x12\xc0\x16\x01\x29\xa3\xa0\x55"
+			  "\xaa\x68\xf3\xe9\x9d\x3b\x0d\x3b"
+			  "\x6d\xec\xf8\xa0\x2d\xf0\x90\x8d"
+			  "\x1c\xe2\x88\xd4\x24\x71\xf9\xb3"
+			  "\xc1\x9f\xc5\xd6\x76\x70\xc5\x2e"
+			  "\x9c\xac\xdb\x90\xbd\x83\x72\xba"
+			  "\x6e\xb5\xa5\x53\x83\xa9\xa5\xbf"
+			  "\x7d\x06\x0e\x3c\x2a\xd2\x04\xb5"
+			  "\x1e\x19\x38\x09\x16\xd2\x82\x1f"
+			  "\x75\x18\x56\xb8\x96\x0b\xa6\xf9"
+			  "\xcf\x62\xd9\x32\x5d\xa9\xd7\x1d"
+			  "\xec\xe4\xdf\x1b\xbe\xf1\x36\xee"
+			  "\xe3\x7b\xb5\x2f\xee\xf8\x53\x3d"
+			  "\x6a\xb7\x70\xa9\xfc\x9c\x57\x25"
+			  "\xf2\x89\x10\xd3\xb8\xa8\x8c\x30"
+			  "\xae\x23\x4f\x0e\x13\x66\x4f\xe1"
+			  "\xb6\xc0\xe4\xf8\xef\x93\xbd\x6e"
+			  "\x15\x85\x6b\xe3\x60\x81\x1d\x68"
+			  "\xd7\x31\x87\x89\x09\xab\xd5\x96"
+			  "\x1d\xf3\x6d\x67\x80\xca\x07\x31"
+			  "\x5d\xa7\xe4\xfb\x3e\xf2\x9b\x33"
+			  "\x52\x18\xc8\x30\xfe\x2d\xca\x1e"
+			  "\x79\x92\x7a\x60\x5c\xb6\x58\x87"
+			  "\xa4\x36\xa2\x67\x92\x8b\xa4\xb7"
+			  "\xf1\x86\xdf\xdc\xc0\x7e\x8f\x63"
+			  "\xd2\xa2\xdc\x78\xeb\x4f\xd8\x96"
+			  "\x47\xca\xb8\x91\xf9\xf7\x94\x21"
+			  "\x5f\x9a\x9f\x5b\xb8\x40\x41\x4b"
+			  "\x66\x69\x6a\x72\xd0\xcb\x70\xb7"
+			  "\x93\xb5\x37\x96\x05\x37\x4f\xe5"
+			  "\x8c\xa7\x5a\x4e\x8b\xb7\x84\xea"
+			  "\xc7\xfc\x19\x6e\x1f\x5a\xa1\xac"
+			  "\x18\x7d\x52\x3b\xb3\x34\x62\x99"
+			  "\xe4\x9e\x31\x04\x3f\xc0\x8d\x84"
+			  "\x17\x7c\x25\x48\x52\x67\x11\x27"
+			  "\x67\xbb\x5a\x85\xca\x56\xb2\x5c"
+			  "\xe6\xec\xd5\x96\x3d\x15\xfc\xfb"
+			  "\x22\x25\xf4\x13\xe5\x93\x4b\x9a"
+			  "\x77\xf1\x52\x18\xfa\x16\x5e\x49"
+			  "\x03\x45\xa8\x08\xfa\xb3\x41\x92"
+			  "\x79\x50\x33\xca\xd0\xd7\x42\x55"
+			  "\xc3\x9a\x0c\x4e\xd9\xa4\x3c\x86"
+			  "\x80\x9f\x53\xd1\xa4\x2e\xd1\xbc"
+			  "\xf1\x54\x6e\x93\xa4\x65\x99\x8e"
+			  "\xdf\x29\xc0\x64\x63\x07\xbb\xea",
+		.ctext	= "\xe0\x33\xf6\xe0\xb4\xa5\xdd\x2b"
+			  "\xdd\xce\xfc\x12\x1e\xfc\x2d\xf2"
+			  "\x8b\xc7\xeb\xc1\xc4\x2a\xe8\x44"
+			  "\x0f\x3d\x97\x19\x2e\x6d\xa2\x38"
+			  "\x9d\xa6\xaa\xe1\x96\xb9\x08\xe8"
+			  "\x0b\x70\x48\x5c\xed\xb5\x9b\xcb"
+			  "\x8b\x40\x88\x7e\x69\x73\xf7\x16"
+			  "\x71\xbb\x5b\xfc\xa3\x47\x5d\xa6"
+			  "\xae\x3a\x64\xc4\xe7\xb8\xa8\xe7"
+			  "\xb1\x32\x19\xdb\xe3\x01\xb8\xf0"
+			  "\xa4\x86\xb4\x4c\xc2\xde\x5c\xd2"
+			  "\x6c\x77\xd2\xe8\x18\xb7\x0a\xc9"
+			  "\x3d\x53\xb5\xc4\x5c\xf0\x8c\x06"
+			  "\xdc\x90\xe0\x74\x47\x1b\x0b\xf6"
+			  "\xd2\x71\x6b\xc4\xf1\x97\x00\x2d"
+			  "\x63\x57\x44\x1f\x8c\xf4\xe6\x9b"
+			  "\xe0\x7a\xdd\xec\x32\x73\x42\x32"
+			  "\x7f\x35\x67\x60\x0d\xcf\x10\x52"
+			  "\x61\x22\x53\x8d\x8e\xbb\x33\x76"
+			  "\x59\xd9\x10\xce\xdf\xef\xc0\x41"
+			  "\xd5\x33\x29\x6a\xda\x46\xa4\x51"
+			  "\xf0\x99\x3d\x96\x31\xdd\xb5\xcb"
+			  "\x3e\x2a\x1f\xc7\x5c\x79\xd3\xc5"
+			  "\x20\xa1\xb1\x39\x1b\xc6\x0a\x70"
+			  "\x26\x39\x95\x07\xad\x7a\xc9\x69"
+			  "\xfe\x81\xc7\x88\x08\x38\xaf\xad"
+			  "\x9e\x8d\xfb\xe8\x24\x0d\x22\xb8"
+			  "\x0e\xed\xbe\x37\x53\x7c\xa6\xc6"
+			  "\x78\x62\xec\xa3\x59\xd9\xc6\x9d"
+			  "\xb8\x0e\x69\x77\x84\x2d\x6a\x4c"
+			  "\xc5\xd9\xb2\xa0\x2b\xa8\x80\xcc"
+			  "\xe9\x1e\x9c\x5a\xc4\xa1\xb2\x37"
+			  "\x06\x9b\x30\x32\x67\xf7\xe7\xd2"
+			  "\x42\xc7\xdf\x4e\xd4\xcb\xa0\x12"
+			  "\x94\xa1\x34\x85\x93\x50\x4b\x0a"
+			  "\x3c\x7d\x49\x25\x01\x41\x6b\x96"
+			  "\xa9\x12\xbb\x0b\xc0\xd7\xd0\x93"
+			  "\x1f\x70\x38\xb8\x21\xee\xf6\xa7"
+			  "\xee\xeb\xe7\x81\xa4\x13\xb4\x87"
+			  "\xfa\xc1\xb0\xb5\x37\x8b\x74\xa2"
+			  "\x4e\xc7\xc2\xad\x3d\x62\x3f\xf8"
+			  "\x34\x42\xe5\xae\x45\x13\x63\xfe"
+			  "\xfc\x2a\x17\x46\x61\xa9\xd3\x1c"
+			  "\x4c\xaf\xf0\x09\x62\x26\x66\x1e"
+			  "\x74\xcf\xd6\x68\x3d\x7d\xd8\xb7"
+			  "\xe7\xe6\xf8\xf0\x08\x20\xf7\x47"
+			  "\x1c\x52\xaa\x0f\x3e\x21\xa3\xf2"
+			  "\xbf\x2f\x95\x16\xa8\xc8\xc8\x8c"
+			  "\x99\x0f\x5d\xfb\xfa\x2b\x58\x8a"
+			  "\x7e\xd6\x74\x02\x60\xf0\xd0\x5b"
+			  "\x65\xa8\xac\xea\x8d\x68\x46\x34"
+			  "\x26\x9d\x4f\xb1\x9a\x8e\xc0\x1a"
+			  "\xf1\xed\xc6\x7a\x83\xfd\x8a\x57"
+			  "\xf2\xe6\xe4\xba\xfc\xc6\x3c\xad"
+			  "\x5b\x19\x50\x2f\x3a\xcc\x06\x46"
+			  "\x04\x51\x3f\x91\x97\xf0\xd2\x07"
+			  "\xe7\x93\x89\x7e\xb5\x32\x0f\x03"
+			  "\xe5\x58\x9e\x74\x72\xeb\xc2\x38"
+			  "\x00\x0c\x91\x72\x69\xed\x7d\x6d"
+			  "\xc8\x71\xf0\xec\xff\x80\xd9\x1c"
+			  "\x9e\xd2\xfa\x15\xfc\x6c\x4e\xbc"
+			  "\xb1\xa6\xbd\xbd\x70\x40\xca\x20"
+			  "\xb8\x78\xd2\xa3\xc6\xf3\x79\x9c"
+			  "\xc7\x27\xe1\x6a\x29\xad\xa4\x03",
+		.len	= 512,
+	}
+};
+
 /*
  * CTS (Cipher Text Stealing) mode tests
  */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 61c392752fe4bbfeba5b1b404bf64cff7e9d8b00..ccfcf00f2798d3c7cbad526517fcdd86c2f02314 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3623,7 +3623,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in
 		 * change.
 		 */
 
-		peer_integrity_tfm = crypto_alloc_shash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
+		peer_integrity_tfm = crypto_alloc_shash(integrity_alg, 0, 0);
 		if (IS_ERR(peer_integrity_tfm)) {
 			peer_integrity_tfm = NULL;
 			drbd_err(connection, "peer data-integrity-alg %s not supported\n",
diff --git a/drivers/char/hw_random/bcm2835-rng.c b/drivers/char/hw_random/bcm2835-rng.c
index 6767d965c36c5254e052c84d1b9577af7c0ca5e1..256b0b1d0f26a58eaf2c412188b409854660a7d6 100644
--- a/drivers/char/hw_random/bcm2835-rng.c
+++ b/drivers/char/hw_random/bcm2835-rng.c
@@ -1,10 +1,7 @@
-/**
+// SPDX-License-Identifier: GPL-2.0
+/*
  * Copyright (c) 2010-2012 Broadcom. All rights reserved.
  * Copyright (c) 2013 Lubomir Rintel
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License ("GPL")
- * version 2, as published by the Free Software Foundation.
  */
 
 #include <linux/hw_random.h>
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 2eb70e76ed355cc677da414b2ad16436dc35545d..38c6d1af6d1c09c0a07bfbf8cb6a5bd42b3612ea 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -265,7 +265,7 @@
 #include <linux/syscalls.h>
 #include <linux/completion.h>
 #include <linux/uuid.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
 
 #include <asm/processor.h>
 #include <linux/uaccess.h>
@@ -431,11 +431,10 @@ static int crng_init = 0;
 #define crng_ready() (likely(crng_init > 1))
 static int crng_init_cnt = 0;
 static unsigned long crng_global_init_time = 0;
-#define CRNG_INIT_CNT_THRESH (2*CHACHA20_KEY_SIZE)
-static void _extract_crng(struct crng_state *crng,
-			  __u8 out[CHACHA20_BLOCK_SIZE]);
+#define CRNG_INIT_CNT_THRESH (2*CHACHA_KEY_SIZE)
+static void _extract_crng(struct crng_state *crng, __u8 out[CHACHA_BLOCK_SIZE]);
 static void _crng_backtrack_protect(struct crng_state *crng,
-				    __u8 tmp[CHACHA20_BLOCK_SIZE], int used);
+				    __u8 tmp[CHACHA_BLOCK_SIZE], int used);
 static void process_random_ready_list(void);
 static void _get_random_bytes(void *buf, int nbytes);
 
@@ -863,7 +862,7 @@ static int crng_fast_load(const char *cp, size_t len)
 	}
 	p = (unsigned char *) &primary_crng.state[4];
 	while (len > 0 && crng_init_cnt < CRNG_INIT_CNT_THRESH) {
-		p[crng_init_cnt % CHACHA20_KEY_SIZE] ^= *cp;
+		p[crng_init_cnt % CHACHA_KEY_SIZE] ^= *cp;
 		cp++; crng_init_cnt++; len--;
 	}
 	spin_unlock_irqrestore(&primary_crng.lock, flags);
@@ -895,7 +894,7 @@ static int crng_slow_load(const char *cp, size_t len)
 	unsigned long		flags;
 	static unsigned char	lfsr = 1;
 	unsigned char		tmp;
-	unsigned		i, max = CHACHA20_KEY_SIZE;
+	unsigned		i, max = CHACHA_KEY_SIZE;
 	const char *		src_buf = cp;
 	char *			dest_buf = (char *) &primary_crng.state[4];
 
@@ -913,8 +912,8 @@ static int crng_slow_load(const char *cp, size_t len)
 		lfsr >>= 1;
 		if (tmp & 1)
 			lfsr ^= 0xE1;
-		tmp = dest_buf[i % CHACHA20_KEY_SIZE];
-		dest_buf[i % CHACHA20_KEY_SIZE] ^= src_buf[i % len] ^ lfsr;
+		tmp = dest_buf[i % CHACHA_KEY_SIZE];
+		dest_buf[i % CHACHA_KEY_SIZE] ^= src_buf[i % len] ^ lfsr;
 		lfsr += (tmp << 3) | (tmp >> 5);
 	}
 	spin_unlock_irqrestore(&primary_crng.lock, flags);
@@ -926,7 +925,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
 	unsigned long	flags;
 	int		i, num;
 	union {
-		__u8	block[CHACHA20_BLOCK_SIZE];
+		__u8	block[CHACHA_BLOCK_SIZE];
 		__u32	key[8];
 	} buf;
 
@@ -937,7 +936,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
 	} else {
 		_extract_crng(&primary_crng, buf.block);
 		_crng_backtrack_protect(&primary_crng, buf.block,
-					CHACHA20_KEY_SIZE);
+					CHACHA_KEY_SIZE);
 	}
 	spin_lock_irqsave(&crng->lock, flags);
 	for (i = 0; i < 8; i++) {
@@ -973,7 +972,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
 }
 
 static void _extract_crng(struct crng_state *crng,
-			  __u8 out[CHACHA20_BLOCK_SIZE])
+			  __u8 out[CHACHA_BLOCK_SIZE])
 {
 	unsigned long v, flags;
 
@@ -990,7 +989,7 @@ static void _extract_crng(struct crng_state *crng,
 	spin_unlock_irqrestore(&crng->lock, flags);
 }
 
-static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
+static void extract_crng(__u8 out[CHACHA_BLOCK_SIZE])
 {
 	struct crng_state *crng = NULL;
 
@@ -1008,14 +1007,14 @@ static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
  * enough) to mutate the CRNG key to provide backtracking protection.
  */
 static void _crng_backtrack_protect(struct crng_state *crng,
-				    __u8 tmp[CHACHA20_BLOCK_SIZE], int used)
+				    __u8 tmp[CHACHA_BLOCK_SIZE], int used)
 {
 	unsigned long	flags;
 	__u32		*s, *d;
 	int		i;
 
 	used = round_up(used, sizeof(__u32));
-	if (used + CHACHA20_KEY_SIZE > CHACHA20_BLOCK_SIZE) {
+	if (used + CHACHA_KEY_SIZE > CHACHA_BLOCK_SIZE) {
 		extract_crng(tmp);
 		used = 0;
 	}
@@ -1027,7 +1026,7 @@ static void _crng_backtrack_protect(struct crng_state *crng,
 	spin_unlock_irqrestore(&crng->lock, flags);
 }
 
-static void crng_backtrack_protect(__u8 tmp[CHACHA20_BLOCK_SIZE], int used)
+static void crng_backtrack_protect(__u8 tmp[CHACHA_BLOCK_SIZE], int used)
 {
 	struct crng_state *crng = NULL;
 
@@ -1042,8 +1041,8 @@ static void crng_backtrack_protect(__u8 tmp[CHACHA20_BLOCK_SIZE], int used)
 
 static ssize_t extract_crng_user(void __user *buf, size_t nbytes)
 {
-	ssize_t ret = 0, i = CHACHA20_BLOCK_SIZE;
-	__u8 tmp[CHACHA20_BLOCK_SIZE] __aligned(4);
+	ssize_t ret = 0, i = CHACHA_BLOCK_SIZE;
+	__u8 tmp[CHACHA_BLOCK_SIZE] __aligned(4);
 	int large_request = (nbytes > 256);
 
 	while (nbytes) {
@@ -1057,7 +1056,7 @@ static ssize_t extract_crng_user(void __user *buf, size_t nbytes)
 		}
 
 		extract_crng(tmp);
-		i = min_t(int, nbytes, CHACHA20_BLOCK_SIZE);
+		i = min_t(int, nbytes, CHACHA_BLOCK_SIZE);
 		if (copy_to_user(buf, tmp, i)) {
 			ret = -EFAULT;
 			break;
@@ -1622,14 +1621,14 @@ static void _warn_unseeded_randomness(const char *func_name, void *caller,
  */
 static void _get_random_bytes(void *buf, int nbytes)
 {
-	__u8 tmp[CHACHA20_BLOCK_SIZE] __aligned(4);
+	__u8 tmp[CHACHA_BLOCK_SIZE] __aligned(4);
 
 	trace_get_random_bytes(nbytes, _RET_IP_);
 
-	while (nbytes >= CHACHA20_BLOCK_SIZE) {
+	while (nbytes >= CHACHA_BLOCK_SIZE) {
 		extract_crng(buf);
-		buf += CHACHA20_BLOCK_SIZE;
-		nbytes -= CHACHA20_BLOCK_SIZE;
+		buf += CHACHA_BLOCK_SIZE;
+		nbytes -= CHACHA_BLOCK_SIZE;
 	}
 
 	if (nbytes > 0) {
@@ -1637,7 +1636,7 @@ static void _get_random_bytes(void *buf, int nbytes)
 		memcpy(buf, tmp, nbytes);
 		crng_backtrack_protect(tmp, nbytes);
 	} else
-		crng_backtrack_protect(tmp, CHACHA20_BLOCK_SIZE);
+		crng_backtrack_protect(tmp, CHACHA_BLOCK_SIZE);
 	memzero_explicit(tmp, sizeof(tmp));
 }
 
@@ -2208,8 +2207,8 @@ struct ctl_table random_table[] = {
 
 struct batched_entropy {
 	union {
-		u64 entropy_u64[CHACHA20_BLOCK_SIZE / sizeof(u64)];
-		u32 entropy_u32[CHACHA20_BLOCK_SIZE / sizeof(u32)];
+		u64 entropy_u64[CHACHA_BLOCK_SIZE / sizeof(u64)];
+		u32 entropy_u32[CHACHA_BLOCK_SIZE / sizeof(u32)];
 	};
 	unsigned int position;
 };
diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index caa98a7fe3923bb752cec40f5459633627cc42fa..d80751d48cf1e57704d7f8a84d5ec369001566b2 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -762,10 +762,12 @@ config CRYPTO_DEV_CCREE
 	select CRYPTO_ECB
 	select CRYPTO_CTR
 	select CRYPTO_XTS
+	select CRYPTO_SM4
+	select CRYPTO_SM3
 	help
 	  Say 'Y' to enable a driver for the REE interface of the Arm
 	  TrustZone CryptoCell family of processors. Currently the
-	  CryptoCell 712, 710 and 630 are supported.
+	  CryptoCell 713, 703, 712, 710 and 630 are supported.
 	  Choose this if you wish to use hardware acceleration of
 	  cryptographic operations on the system REE.
 	  If unsure say Y.
diff --git a/drivers/crypto/amcc/crypto4xx_alg.c b/drivers/crypto/amcc/crypto4xx_alg.c
index f5c07498ea4f08541b47040f3356bb00cfbc35ff..4092c2aad8e2119038a426d899c055f506240e7f 100644
--- a/drivers/crypto/amcc/crypto4xx_alg.c
+++ b/drivers/crypto/amcc/crypto4xx_alg.c
@@ -520,8 +520,7 @@ static int crypto4xx_compute_gcm_hash_key_sw(__le32 *hash_start, const u8 *key,
 	uint8_t src[16] = { 0 };
 	int rc = 0;
 
-	aes_tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC |
-				      CRYPTO_ALG_NEED_FALLBACK);
+	aes_tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_NEED_FALLBACK);
 	if (IS_ERR(aes_tfm)) {
 		rc = PTR_ERR(aes_tfm);
 		pr_warn("could not load aes cipher driver: %d\n", rc);
diff --git a/drivers/crypto/bcm/cipher.c b/drivers/crypto/bcm/cipher.c
index 2d1f1db9f807448c45bc228d7c4f500d3ac6152c..c9393ffb70ed80c57e87aa200e8e69d7b42aeb9f 100644
--- a/drivers/crypto/bcm/cipher.c
+++ b/drivers/crypto/bcm/cipher.c
@@ -3868,7 +3868,6 @@ static struct iproc_alg_s driver_algs[] = {
 			.cra_driver_name = "ctr-aes-iproc",
 			.cra_blocksize = AES_BLOCK_SIZE,
 			.cra_ablkcipher = {
-					   /* .geniv = "chainiv", */
 					   .min_keysize = AES_MIN_KEY_SIZE,
 					   .max_keysize = AES_MAX_KEY_SIZE,
 					   .ivsize = AES_BLOCK_SIZE,
@@ -4605,7 +4604,6 @@ static int spu_register_ablkcipher(struct iproc_alg_s *driver_alg)
 	crypto->cra_priority = cipher_pri;
 	crypto->cra_alignmask = 0;
 	crypto->cra_ctxsize = sizeof(struct iproc_ctx_s);
-	INIT_LIST_HEAD(&crypto->cra_list);
 
 	crypto->cra_init = ablkcipher_cra_init;
 	crypto->cra_exit = generic_cra_exit;
@@ -4652,12 +4650,16 @@ static int spu_register_ahash(struct iproc_alg_s *driver_alg)
 	hash->halg.statesize = sizeof(struct spu_hash_export_s);
 
 	if (driver_alg->auth_info.mode != HASH_MODE_HMAC) {
-		hash->setkey = ahash_setkey;
 		hash->init = ahash_init;
 		hash->update = ahash_update;
 		hash->final = ahash_final;
 		hash->finup = ahash_finup;
 		hash->digest = ahash_digest;
+		if ((driver_alg->auth_info.alg == HASH_ALG_AES) &&
+		    ((driver_alg->auth_info.mode == HASH_MODE_XCBC) ||
+		    (driver_alg->auth_info.mode == HASH_MODE_CMAC))) {
+			hash->setkey = ahash_setkey;
+		}
 	} else {
 		hash->setkey = ahash_hmac_setkey;
 		hash->init = ahash_hmac_init;
@@ -4687,7 +4689,6 @@ static int spu_register_aead(struct iproc_alg_s *driver_alg)
 	aead->base.cra_priority = aead_pri;
 	aead->base.cra_alignmask = 0;
 	aead->base.cra_ctxsize = sizeof(struct iproc_ctx_s);
-	INIT_LIST_HEAD(&aead->base.cra_list);
 
 	aead->base.cra_flags |= CRYPTO_ALG_ASYNC;
 	/* setkey set in alg initialization */
diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index 869f092432de26145694e8c47af387af0b8f4866..92e593e2069ae1511a67e7addf17fffeb842947b 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -72,6 +72,8 @@
 #define AUTHENC_DESC_JOB_IO_LEN		(AEAD_DESC_JOB_IO_LEN + \
 					 CAAM_CMD_SZ * 5)
 
+#define CHACHAPOLY_DESC_JOB_IO_LEN	(AEAD_DESC_JOB_IO_LEN + CAAM_CMD_SZ * 6)
+
 #define DESC_MAX_USED_BYTES		(CAAM_DESC_BYTES_MAX - DESC_JOB_IO_LEN)
 #define DESC_MAX_USED_LEN		(DESC_MAX_USED_BYTES / CAAM_CMD_SZ)
 
@@ -513,6 +515,61 @@ static int rfc4543_setauthsize(struct crypto_aead *authenc,
 	return 0;
 }
 
+static int chachapoly_set_sh_desc(struct crypto_aead *aead)
+{
+	struct caam_ctx *ctx = crypto_aead_ctx(aead);
+	struct device *jrdev = ctx->jrdev;
+	unsigned int ivsize = crypto_aead_ivsize(aead);
+	u32 *desc;
+
+	if (!ctx->cdata.keylen || !ctx->authsize)
+		return 0;
+
+	desc = ctx->sh_desc_enc;
+	cnstr_shdsc_chachapoly(desc, &ctx->cdata, &ctx->adata, ivsize,
+			       ctx->authsize, true, false);
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_enc_dma,
+				   desc_bytes(desc), ctx->dir);
+
+	desc = ctx->sh_desc_dec;
+	cnstr_shdsc_chachapoly(desc, &ctx->cdata, &ctx->adata, ivsize,
+			       ctx->authsize, false, false);
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_dec_dma,
+				   desc_bytes(desc), ctx->dir);
+
+	return 0;
+}
+
+static int chachapoly_setauthsize(struct crypto_aead *aead,
+				  unsigned int authsize)
+{
+	struct caam_ctx *ctx = crypto_aead_ctx(aead);
+
+	if (authsize != POLY1305_DIGEST_SIZE)
+		return -EINVAL;
+
+	ctx->authsize = authsize;
+	return chachapoly_set_sh_desc(aead);
+}
+
+static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key,
+			     unsigned int keylen)
+{
+	struct caam_ctx *ctx = crypto_aead_ctx(aead);
+	unsigned int ivsize = crypto_aead_ivsize(aead);
+	unsigned int saltlen = CHACHAPOLY_IV_SIZE - ivsize;
+
+	if (keylen != CHACHA_KEY_SIZE + saltlen) {
+		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	ctx->cdata.key_virt = key;
+	ctx->cdata.keylen = keylen - saltlen;
+
+	return chachapoly_set_sh_desc(aead);
+}
+
 static int aead_setkey(struct crypto_aead *aead,
 			       const u8 *key, unsigned int keylen)
 {
@@ -1031,6 +1088,40 @@ static void init_gcm_job(struct aead_request *req,
 	/* End of blank commands */
 }
 
+static void init_chachapoly_job(struct aead_request *req,
+				struct aead_edesc *edesc, bool all_contig,
+				bool encrypt)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	unsigned int ivsize = crypto_aead_ivsize(aead);
+	unsigned int assoclen = req->assoclen;
+	u32 *desc = edesc->hw_desc;
+	u32 ctx_iv_off = 4;
+
+	init_aead_job(req, edesc, all_contig, encrypt);
+
+	if (ivsize != CHACHAPOLY_IV_SIZE) {
+		/* IPsec specific: CONTEXT1[223:128] = {NONCE, IV} */
+		ctx_iv_off += 4;
+
+		/*
+		 * The associated data already includes the IV, but the IV
+		 * must be skipped when authenticating or encrypting.
+		 */
+		assoclen -= ivsize;
+	}
+
+	append_math_add_imm_u32(desc, REG3, ZERO, IMM, assoclen);
+
+	/*
+	 * For IPsec, load the IV at a higher offset in the same register.
+	 * For RFC7539, simply load the 12-byte nonce in a single operation.
+	 */
+	append_load_as_imm(desc, req->iv, ivsize, LDST_CLASS_1_CCB |
+			   LDST_SRCDST_BYTE_CONTEXT |
+			   ctx_iv_off << LDST_OFFSET_SHIFT);
+}
+
 static void init_authenc_job(struct aead_request *req,
 			     struct aead_edesc *edesc,
 			     bool all_contig, bool encrypt)
@@ -1289,6 +1380,72 @@ static int gcm_encrypt(struct aead_request *req)
 	return ret;
 }
 
+static int chachapoly_encrypt(struct aead_request *req)
+{
+	struct aead_edesc *edesc;
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct caam_ctx *ctx = crypto_aead_ctx(aead);
+	struct device *jrdev = ctx->jrdev;
+	bool all_contig;
+	u32 *desc;
+	int ret;
+
+	edesc = aead_edesc_alloc(req, CHACHAPOLY_DESC_JOB_IO_LEN, &all_contig,
+				 true);
+	if (IS_ERR(edesc))
+		return PTR_ERR(edesc);
+
+	desc = edesc->hw_desc;
+
+	init_chachapoly_job(req, edesc, all_contig, true);
+	print_hex_dump_debug("chachapoly jobdesc@" __stringify(__LINE__)": ",
+			     DUMP_PREFIX_ADDRESS, 16, 4, desc, desc_bytes(desc),
+			     1);
+
+	ret = caam_jr_enqueue(jrdev, desc, aead_encrypt_done, req);
+	if (!ret) {
+		ret = -EINPROGRESS;
+	} else {
+		aead_unmap(jrdev, edesc, req);
+		kfree(edesc);
+	}
+
+	return ret;
+}
+
+static int chachapoly_decrypt(struct aead_request *req)
+{
+	struct aead_edesc *edesc;
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct caam_ctx *ctx = crypto_aead_ctx(aead);
+	struct device *jrdev = ctx->jrdev;
+	bool all_contig;
+	u32 *desc;
+	int ret;
+
+	edesc = aead_edesc_alloc(req, CHACHAPOLY_DESC_JOB_IO_LEN, &all_contig,
+				 false);
+	if (IS_ERR(edesc))
+		return PTR_ERR(edesc);
+
+	desc = edesc->hw_desc;
+
+	init_chachapoly_job(req, edesc, all_contig, false);
+	print_hex_dump_debug("chachapoly jobdesc@" __stringify(__LINE__)": ",
+			     DUMP_PREFIX_ADDRESS, 16, 4, desc, desc_bytes(desc),
+			     1);
+
+	ret = caam_jr_enqueue(jrdev, desc, aead_decrypt_done, req);
+	if (!ret) {
+		ret = -EINPROGRESS;
+	} else {
+		aead_unmap(jrdev, edesc, req);
+		kfree(edesc);
+	}
+
+	return ret;
+}
+
 static int ipsec_gcm_encrypt(struct aead_request *req)
 {
 	if (req->assoclen < 8)
@@ -3002,6 +3159,50 @@ static struct caam_aead_alg driver_aeads[] = {
 			.geniv = true,
 		},
 	},
+	{
+		.aead = {
+			.base = {
+				.cra_name = "rfc7539(chacha20,poly1305)",
+				.cra_driver_name = "rfc7539-chacha20-poly1305-"
+						   "caam",
+				.cra_blocksize = 1,
+			},
+			.setkey = chachapoly_setkey,
+			.setauthsize = chachapoly_setauthsize,
+			.encrypt = chachapoly_encrypt,
+			.decrypt = chachapoly_decrypt,
+			.ivsize = CHACHAPOLY_IV_SIZE,
+			.maxauthsize = POLY1305_DIGEST_SIZE,
+		},
+		.caam = {
+			.class1_alg_type = OP_ALG_ALGSEL_CHACHA20 |
+					   OP_ALG_AAI_AEAD,
+			.class2_alg_type = OP_ALG_ALGSEL_POLY1305 |
+					   OP_ALG_AAI_AEAD,
+		},
+	},
+	{
+		.aead = {
+			.base = {
+				.cra_name = "rfc7539esp(chacha20,poly1305)",
+				.cra_driver_name = "rfc7539esp-chacha20-"
+						   "poly1305-caam",
+				.cra_blocksize = 1,
+			},
+			.setkey = chachapoly_setkey,
+			.setauthsize = chachapoly_setauthsize,
+			.encrypt = chachapoly_encrypt,
+			.decrypt = chachapoly_decrypt,
+			.ivsize = 8,
+			.maxauthsize = POLY1305_DIGEST_SIZE,
+		},
+		.caam = {
+			.class1_alg_type = OP_ALG_ALGSEL_CHACHA20 |
+					   OP_ALG_AAI_AEAD,
+			.class2_alg_type = OP_ALG_ALGSEL_POLY1305 |
+					   OP_ALG_AAI_AEAD,
+		},
+	},
 };
 
 static int caam_init_common(struct caam_ctx *ctx, struct caam_alg_entry *caam,
@@ -3135,7 +3336,7 @@ static int __init caam_algapi_init(void)
 	struct device *ctrldev;
 	struct caam_drv_private *priv;
 	int i = 0, err = 0;
-	u32 cha_vid, cha_inst, des_inst, aes_inst, md_inst;
+	u32 aes_vid, aes_inst, des_inst, md_vid, md_inst, ccha_inst, ptha_inst;
 	unsigned int md_limit = SHA512_DIGEST_SIZE;
 	bool registered = false;
 
@@ -3168,14 +3369,38 @@ static int __init caam_algapi_init(void)
 	 * Register crypto algorithms the device supports.
 	 * First, detect presence and attributes of DES, AES, and MD blocks.
 	 */
-	cha_vid = rd_reg32(&priv->ctrl->perfmon.cha_id_ls);
-	cha_inst = rd_reg32(&priv->ctrl->perfmon.cha_num_ls);
-	des_inst = (cha_inst & CHA_ID_LS_DES_MASK) >> CHA_ID_LS_DES_SHIFT;
-	aes_inst = (cha_inst & CHA_ID_LS_AES_MASK) >> CHA_ID_LS_AES_SHIFT;
-	md_inst = (cha_inst & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT;
+	if (priv->era < 10) {
+		u32 cha_vid, cha_inst;
+
+		cha_vid = rd_reg32(&priv->ctrl->perfmon.cha_id_ls);
+		aes_vid = cha_vid & CHA_ID_LS_AES_MASK;
+		md_vid = (cha_vid & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT;
+
+		cha_inst = rd_reg32(&priv->ctrl->perfmon.cha_num_ls);
+		des_inst = (cha_inst & CHA_ID_LS_DES_MASK) >>
+			   CHA_ID_LS_DES_SHIFT;
+		aes_inst = cha_inst & CHA_ID_LS_AES_MASK;
+		md_inst = (cha_inst & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT;
+		ccha_inst = 0;
+		ptha_inst = 0;
+	} else {
+		u32 aesa, mdha;
+
+		aesa = rd_reg32(&priv->ctrl->vreg.aesa);
+		mdha = rd_reg32(&priv->ctrl->vreg.mdha);
+
+		aes_vid = (aesa & CHA_VER_VID_MASK) >> CHA_VER_VID_SHIFT;
+		md_vid = (mdha & CHA_VER_VID_MASK) >> CHA_VER_VID_SHIFT;
+
+		des_inst = rd_reg32(&priv->ctrl->vreg.desa) & CHA_VER_NUM_MASK;
+		aes_inst = aesa & CHA_VER_NUM_MASK;
+		md_inst = mdha & CHA_VER_NUM_MASK;
+		ccha_inst = rd_reg32(&priv->ctrl->vreg.ccha) & CHA_VER_NUM_MASK;
+		ptha_inst = rd_reg32(&priv->ctrl->vreg.ptha) & CHA_VER_NUM_MASK;
+	}
 
 	/* If MD is present, limit digest size based on LP256 */
-	if (md_inst && ((cha_vid & CHA_ID_LS_MD_MASK) == CHA_ID_LS_MD_LP256))
+	if (md_inst && md_vid == CHA_VER_VID_MD_LP256)
 		md_limit = SHA256_DIGEST_SIZE;
 
 	for (i = 0; i < ARRAY_SIZE(driver_algs); i++) {
@@ -3196,10 +3421,10 @@ static int __init caam_algapi_init(void)
 		 * Check support for AES modes not available
 		 * on LP devices.
 		 */
-		if ((cha_vid & CHA_ID_LS_AES_MASK) == CHA_ID_LS_AES_LP)
-			if ((t_alg->caam.class1_alg_type & OP_ALG_AAI_MASK) ==
-			     OP_ALG_AAI_XTS)
-				continue;
+		if (aes_vid == CHA_VER_VID_AES_LP &&
+		    (t_alg->caam.class1_alg_type & OP_ALG_AAI_MASK) ==
+		    OP_ALG_AAI_XTS)
+			continue;
 
 		caam_skcipher_alg_init(t_alg);
 
@@ -3232,21 +3457,28 @@ static int __init caam_algapi_init(void)
 		if (!aes_inst && (c1_alg_sel == OP_ALG_ALGSEL_AES))
 				continue;
 
+		/* Skip CHACHA20 algorithms if not supported by device */
+		if (c1_alg_sel == OP_ALG_ALGSEL_CHACHA20 && !ccha_inst)
+			continue;
+
+		/* Skip POLY1305 algorithms if not supported by device */
+		if (c2_alg_sel == OP_ALG_ALGSEL_POLY1305 && !ptha_inst)
+			continue;
+
 		/*
 		 * Check support for AES algorithms not available
 		 * on LP devices.
 		 */
-		if ((cha_vid & CHA_ID_LS_AES_MASK) == CHA_ID_LS_AES_LP)
-			if (alg_aai == OP_ALG_AAI_GCM)
-				continue;
+		if (aes_vid == CHA_VER_VID_AES_LP && alg_aai == OP_ALG_AAI_GCM)
+			continue;
 
 		/*
 		 * Skip algorithms requiring message digests
 		 * if MD or MD size is not supported by device.
 		 */
-		if (c2_alg_sel &&
-		    (!md_inst || (t_alg->aead.maxauthsize > md_limit)))
-				continue;
+		if ((c2_alg_sel & ~OP_ALG_ALGSEL_SUBMASK) ==
+		    (0x40 << OP_ALG_ALGSEL_SHIFT) &&
+		    (!md_inst || t_alg->aead.maxauthsize > md_limit))
+			continue;
 
 		caam_aead_alg_init(t_alg);
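For orientation, here is a rough caller-side sketch of how the rfc7539(chacha20,poly1305) template registered above can be exercised through the generic AEAD API (chachapoly_demo() and the all-zero key are invented for illustration; error paths are trimmed). The rfc7539esp variant differs only in taking 36 key bytes, the 32-byte ChaCha20 key followed by the 4-byte salt checked in chachapoly_setkey(), and an 8-byte IV.

#include <crypto/aead.h>
#include <crypto/poly1305.h>
#include <linux/err.h>

/* Illustrative only: allocate the AEAD, program a key and the tag size. */
static int chachapoly_demo(void)
{
	struct crypto_aead *tfm;
	static const u8 key[32];	/* demo key, all zeroes */
	int err;

	tfm = crypto_alloc_aead("rfc7539(chacha20,poly1305)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_aead_setkey(tfm, key, sizeof(key));
	if (!err)
		err = crypto_aead_setauthsize(tfm, POLY1305_DIGEST_SIZE);

	/*
	 * A real user would now build an aead_request with a 12-byte IV
	 * (8 bytes for rfc7539esp) and call crypto_aead_encrypt().
	 */
	crypto_free_aead(tfm);
	return err;
}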
 
diff --git a/drivers/crypto/caam/caamalg_desc.c b/drivers/crypto/caam/caamalg_desc.c
index 1a6f0da141069fe0be11906eec774e53bf0fed69..7db1640d3577d1c45ce2c65aed19e9414144c66c 100644
--- a/drivers/crypto/caam/caamalg_desc.c
+++ b/drivers/crypto/caam/caamalg_desc.c
@@ -1213,6 +1213,139 @@ void cnstr_shdsc_rfc4543_decap(u32 * const desc, struct alginfo *cdata,
 }
 EXPORT_SYMBOL(cnstr_shdsc_rfc4543_decap);
 
+/**
+ * cnstr_shdsc_chachapoly - Chacha20 + Poly1305 generic AEAD (rfc7539) and
+ *                          IPsec ESP (rfc7634, a.k.a. rfc7539esp) shared
+ *                          descriptor (non-protocol).
+ * @desc: pointer to buffer used for descriptor construction
+ * @cdata: pointer to block cipher transform definitions
+ *         Valid algorithm values - OP_ALG_ALGSEL_CHACHA20 ANDed with
+ *         OP_ALG_AAI_AEAD.
+ * @adata: pointer to authentication transform definitions
+ *         Valid algorithm values - OP_ALG_ALGSEL_POLY1305 ANDed with
+ *         OP_ALG_AAI_AEAD.
+ * @ivsize: initialization vector size
+ * @icvsize: integrity check value (ICV) size (truncated or full)
+ * @encap: true if encapsulation, false if decapsulation
+ * @is_qi: true when called from caam/qi
+ */
+void cnstr_shdsc_chachapoly(u32 * const desc, struct alginfo *cdata,
+			    struct alginfo *adata, unsigned int ivsize,
+			    unsigned int icvsize, const bool encap,
+			    const bool is_qi)
+{
+	u32 *key_jump_cmd, *wait_cmd;
+	u32 nfifo;
+	const bool is_ipsec = (ivsize != CHACHAPOLY_IV_SIZE);
+
+	/* Note: Context registers are saved. */
+	init_sh_desc(desc, HDR_SHARE_SERIAL | HDR_SAVECTX);
+
+	/* skip key loading if they are loaded due to sharing */
+	key_jump_cmd = append_jump(desc, JUMP_JSL | JUMP_TEST_ALL |
+				   JUMP_COND_SHRD);
+
+	append_key_as_imm(desc, cdata->key_virt, cdata->keylen, cdata->keylen,
+			  CLASS_1 | KEY_DEST_CLASS_REG);
+
+	/* For IPsec, load the salt from keymat into the context register */
+	if (is_ipsec)
+		append_load_as_imm(desc, cdata->key_virt + cdata->keylen, 4,
+				   LDST_CLASS_1_CCB | LDST_SRCDST_BYTE_CONTEXT |
+				   4 << LDST_OFFSET_SHIFT);
+
+	set_jump_tgt_here(desc, key_jump_cmd);
+
+	/* Class 2 and 1 operations: Poly & ChaCha */
+	if (encap) {
+		append_operation(desc, adata->algtype | OP_ALG_AS_INITFINAL |
+				 OP_ALG_ENCRYPT);
+		append_operation(desc, cdata->algtype | OP_ALG_AS_INITFINAL |
+				 OP_ALG_ENCRYPT);
+	} else {
+		append_operation(desc, adata->algtype | OP_ALG_AS_INITFINAL |
+				 OP_ALG_DECRYPT | OP_ALG_ICV_ON);
+		append_operation(desc, cdata->algtype | OP_ALG_AS_INITFINAL |
+				 OP_ALG_DECRYPT);
+	}
+
+	if (is_qi) {
+		u32 *wait_load_cmd;
+		u32 ctx1_iv_off = is_ipsec ? 8 : 4;
+
+		/* REG3 = assoclen */
+		append_seq_load(desc, 4, LDST_CLASS_DECO |
+				LDST_SRCDST_WORD_DECO_MATH3 |
+				4 << LDST_OFFSET_SHIFT);
+
+		wait_load_cmd = append_jump(desc, JUMP_JSL | JUMP_TEST_ALL |
+					    JUMP_COND_CALM | JUMP_COND_NCP |
+					    JUMP_COND_NOP | JUMP_COND_NIP |
+					    JUMP_COND_NIFP);
+		set_jump_tgt_here(desc, wait_load_cmd);
+
+		append_seq_load(desc, ivsize, LDST_CLASS_1_CCB |
+				LDST_SRCDST_BYTE_CONTEXT |
+				ctx1_iv_off << LDST_OFFSET_SHIFT);
+	}
+
+	/*
+	 * MAGIC with NFIFO
+	 * Read the associated data from the input and send it to the class1
+	 * and class2 alignment blocks. From class1, send the data to the
+	 * output FIFO and then write it to memory, since the AD is not
+	 * encrypted.
+	 */
+	nfifo = NFIFOENTRY_DEST_BOTH | NFIFOENTRY_FC1 | NFIFOENTRY_FC2 |
+		NFIFOENTRY_DTYPE_POLY | NFIFOENTRY_BND;
+	append_load_imm_u32(desc, nfifo, LDST_CLASS_IND_CCB |
+			    LDST_SRCDST_WORD_INFO_FIFO_SM | LDLEN_MATH3);
+
+	append_math_add(desc, VARSEQINLEN, ZERO, REG3, CAAM_CMD_SZ);
+	append_math_add(desc, VARSEQOUTLEN, ZERO, REG3, CAAM_CMD_SZ);
+	append_seq_fifo_load(desc, 0, FIFOLD_TYPE_NOINFOFIFO |
+			     FIFOLD_CLASS_CLASS1 | LDST_VLF);
+	append_move_len(desc, MOVE_AUX_LS | MOVE_SRC_AUX_ABLK |
+			MOVE_DEST_OUTFIFO | MOVELEN_MRSEL_MATH3);
+	append_seq_fifo_store(desc, 0, FIFOST_TYPE_MESSAGE_DATA | LDST_VLF);
+
+	/* IPsec - copy IV at the output */
+	if (is_ipsec)
+		append_seq_fifo_store(desc, ivsize, FIFOST_TYPE_METADATA |
+				      0x2 << 25);
+
+	wait_cmd = append_jump(desc, JUMP_JSL | JUMP_TYPE_LOCAL |
+			       JUMP_COND_NOP | JUMP_TEST_ALL);
+	set_jump_tgt_here(desc, wait_cmd);
+
+	if (encap) {
+		/* Read and write cryptlen bytes */
+		append_math_add(desc, VARSEQINLEN, SEQINLEN, REG0, CAAM_CMD_SZ);
+		append_math_add(desc, VARSEQOUTLEN, SEQINLEN, REG0,
+				CAAM_CMD_SZ);
+		aead_append_src_dst(desc, FIFOLD_TYPE_MSG1OUT2);
+
+		/* Write ICV */
+		append_seq_store(desc, icvsize, LDST_CLASS_2_CCB |
+				 LDST_SRCDST_BYTE_CONTEXT);
+	} else {
+		/* Read and write cryptlen bytes */
+		append_math_add(desc, VARSEQINLEN, SEQOUTLEN, REG0,
+				CAAM_CMD_SZ);
+		append_math_add(desc, VARSEQOUTLEN, SEQOUTLEN, REG0,
+				CAAM_CMD_SZ);
+		aead_append_src_dst(desc, FIFOLD_TYPE_MSG);
+
+		/* Load ICV for verification */
+		append_seq_fifo_load(desc, icvsize, FIFOLD_CLASS_CLASS2 |
+				     FIFOLD_TYPE_LAST2 | FIFOLD_TYPE_ICV);
+	}
+
+	print_hex_dump_debug("chachapoly shdesc@" __stringify(__LINE__)": ",
+			     DUMP_PREFIX_ADDRESS, 16, 4, desc, desc_bytes(desc),
+			     1);
+}
+EXPORT_SYMBOL(cnstr_shdsc_chachapoly);
+
 /* For skcipher encrypt and decrypt, read from req->src and write to req->dst */
 static inline void skcipher_append_src_dst(u32 *desc)
 {
@@ -1228,7 +1361,8 @@ static inline void skcipher_append_src_dst(u32 *desc)
  * @desc: pointer to buffer used for descriptor construction
  * @cdata: pointer to block cipher transform definitions
  *         Valid algorithm values - one of OP_ALG_ALGSEL_{AES, DES, 3DES} ANDed
- *         with OP_ALG_AAI_CBC or OP_ALG_AAI_CTR_MOD128.
+ *         with OP_ALG_AAI_CBC or OP_ALG_AAI_CTR_MOD128
+ *                                - OP_ALG_ALGSEL_CHACHA20
  * @ivsize: initialization vector size
  * @is_rfc3686: true when ctr(aes) is wrapped by rfc3686 template
  * @ctx1_iv_off: IV offset in CONTEXT1 register
@@ -1293,7 +1427,8 @@ EXPORT_SYMBOL(cnstr_shdsc_skcipher_encap);
  * @desc: pointer to buffer used for descriptor construction
  * @cdata: pointer to block cipher transform definitions
  *         Valid algorithm values - one of OP_ALG_ALGSEL_{AES, DES, 3DES} ANDed
- *         with OP_ALG_AAI_CBC or OP_ALG_AAI_CTR_MOD128.
+ *         with OP_ALG_AAI_CBC or OP_ALG_AAI_CTR_MOD128
+ *                                - OP_ALG_ALGSEL_CHACHA20
  * @ivsize: initialization vector size
  * @is_rfc3686: true when ctr(aes) is wrapped by rfc3686 template
  * @ctx1_iv_off: IV offset in CONTEXT1 register
diff --git a/drivers/crypto/caam/caamalg_desc.h b/drivers/crypto/caam/caamalg_desc.h
index 1315c8f6f9515304b1ac7d9521eb403f520e2a03..d5ca42ff961a6dd755ae6727e2a79db7d7463dc9 100644
--- a/drivers/crypto/caam/caamalg_desc.h
+++ b/drivers/crypto/caam/caamalg_desc.h
@@ -96,6 +96,11 @@ void cnstr_shdsc_rfc4543_decap(u32 * const desc, struct alginfo *cdata,
 			       unsigned int ivsize, unsigned int icvsize,
 			       const bool is_qi);
 
+void cnstr_shdsc_chachapoly(u32 * const desc, struct alginfo *cdata,
+			    struct alginfo *adata, unsigned int ivsize,
+			    unsigned int icvsize, const bool encap,
+			    const bool is_qi);
+
 void cnstr_shdsc_skcipher_encap(u32 * const desc, struct alginfo *cdata,
 				unsigned int ivsize, const bool is_rfc3686,
 				const u32 ctx1_iv_off);
diff --git a/drivers/crypto/caam/caamalg_qi.c b/drivers/crypto/caam/caamalg_qi.c
index 23c9fc4975f8a8a4584c000e041cd0b61a697cbb..c0d55310aade17387ad2e58c284df0d152b21b5d 100644
--- a/drivers/crypto/caam/caamalg_qi.c
+++ b/drivers/crypto/caam/caamalg_qi.c
@@ -2462,7 +2462,7 @@ static int __init caam_qi_algapi_init(void)
 	struct device *ctrldev;
 	struct caam_drv_private *priv;
 	int i = 0, err = 0;
-	u32 cha_vid, cha_inst, des_inst, aes_inst, md_inst;
+	u32 aes_vid, aes_inst, des_inst, md_vid, md_inst;
 	unsigned int md_limit = SHA512_DIGEST_SIZE;
 	bool registered = false;
 
@@ -2497,14 +2497,34 @@ static int __init caam_qi_algapi_init(void)
 	 * Register crypto algorithms the device supports.
 	 * First, detect presence and attributes of DES, AES, and MD blocks.
 	 */
-	cha_vid = rd_reg32(&priv->ctrl->perfmon.cha_id_ls);
-	cha_inst = rd_reg32(&priv->ctrl->perfmon.cha_num_ls);
-	des_inst = (cha_inst & CHA_ID_LS_DES_MASK) >> CHA_ID_LS_DES_SHIFT;
-	aes_inst = (cha_inst & CHA_ID_LS_AES_MASK) >> CHA_ID_LS_AES_SHIFT;
-	md_inst = (cha_inst & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT;
+	if (priv->era < 10) {
+		u32 cha_vid, cha_inst;
+
+		cha_vid = rd_reg32(&priv->ctrl->perfmon.cha_id_ls);
+		aes_vid = cha_vid & CHA_ID_LS_AES_MASK;
+		md_vid = (cha_vid & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT;
+
+		cha_inst = rd_reg32(&priv->ctrl->perfmon.cha_num_ls);
+		des_inst = (cha_inst & CHA_ID_LS_DES_MASK) >>
+			   CHA_ID_LS_DES_SHIFT;
+		aes_inst = cha_inst & CHA_ID_LS_AES_MASK;
+		md_inst = (cha_inst & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT;
+	} else {
+		u32 aesa, mdha;
+
+		aesa = rd_reg32(&priv->ctrl->vreg.aesa);
+		mdha = rd_reg32(&priv->ctrl->vreg.mdha);
+
+		aes_vid = (aesa & CHA_VER_VID_MASK) >> CHA_VER_VID_SHIFT;
+		md_vid = (mdha & CHA_VER_VID_MASK) >> CHA_VER_VID_SHIFT;
+
+		des_inst = rd_reg32(&priv->ctrl->vreg.desa) & CHA_VER_NUM_MASK;
+		aes_inst = aesa & CHA_VER_NUM_MASK;
+		md_inst = mdha & CHA_VER_NUM_MASK;
+	}
 
 	/* If MD is present, limit digest size based on LP256 */
-	if (md_inst && ((cha_vid & CHA_ID_LS_MD_MASK) == CHA_ID_LS_MD_LP256))
+	if (md_inst && md_vid == CHA_VER_VID_MD_LP256)
 		md_limit = SHA256_DIGEST_SIZE;
 
 	for (i = 0; i < ARRAY_SIZE(driver_algs); i++) {
@@ -2556,8 +2576,7 @@ static int __init caam_qi_algapi_init(void)
 		 * Check support for AES algorithms not available
 		 * on LP devices.
 		 */
-		if (((cha_vid & CHA_ID_LS_AES_MASK) == CHA_ID_LS_AES_LP) &&
-		    (alg_aai == OP_ALG_AAI_GCM))
+		if (aes_vid == CHA_VER_VID_AES_LP && alg_aai == OP_ALG_AAI_GCM)
 			continue;
 
 		/*
diff --git a/drivers/crypto/caam/caamalg_qi2.c b/drivers/crypto/caam/caamalg_qi2.c
index 7d8ac0222fa3b2e4a9ecab3d96b19f0f17a87033..425d5d97461311277d925585d7f2eec3ccbcd967 100644
--- a/drivers/crypto/caam/caamalg_qi2.c
+++ b/drivers/crypto/caam/caamalg_qi2.c
@@ -462,7 +462,15 @@ static struct aead_edesc *aead_edesc_alloc(struct aead_request *req,
 	edesc->dst_nents = dst_nents;
 	edesc->iv_dma = iv_dma;
 
-	edesc->assoclen = cpu_to_caam32(req->assoclen);
+	if ((alg->caam.class1_alg_type & OP_ALG_ALGSEL_MASK) ==
+	    OP_ALG_ALGSEL_CHACHA20 && ivsize != CHACHAPOLY_IV_SIZE)
+		/*
+		 * The associated data already includes the IV, but the IV
+		 * must be skipped when authenticating or encrypting.
+		 */
+		edesc->assoclen = cpu_to_caam32(req->assoclen - ivsize);
+	else
+		edesc->assoclen = cpu_to_caam32(req->assoclen);
 	edesc->assoclen_dma = dma_map_single(dev, &edesc->assoclen, 4,
 					     DMA_TO_DEVICE);
 	if (dma_mapping_error(dev, edesc->assoclen_dma)) {
@@ -532,6 +540,68 @@ static struct aead_edesc *aead_edesc_alloc(struct aead_request *req,
 	return edesc;
 }
 
+static int chachapoly_set_sh_desc(struct crypto_aead *aead)
+{
+	struct caam_ctx *ctx = crypto_aead_ctx(aead);
+	unsigned int ivsize = crypto_aead_ivsize(aead);
+	struct device *dev = ctx->dev;
+	struct caam_flc *flc;
+	u32 *desc;
+
+	if (!ctx->cdata.keylen || !ctx->authsize)
+		return 0;
+
+	flc = &ctx->flc[ENCRYPT];
+	desc = flc->sh_desc;
+	cnstr_shdsc_chachapoly(desc, &ctx->cdata, &ctx->adata, ivsize,
+			       ctx->authsize, true, true);
+	flc->flc[1] = cpu_to_caam32(desc_len(desc)); /* SDL */
+	dma_sync_single_for_device(dev, ctx->flc_dma[ENCRYPT],
+				   sizeof(flc->flc) + desc_bytes(desc),
+				   ctx->dir);
+
+	flc = &ctx->flc[DECRYPT];
+	desc = flc->sh_desc;
+	cnstr_shdsc_chachapoly(desc, &ctx->cdata, &ctx->adata, ivsize,
+			       ctx->authsize, false, true);
+	flc->flc[1] = cpu_to_caam32(desc_len(desc)); /* SDL */
+	dma_sync_single_for_device(dev, ctx->flc_dma[DECRYPT],
+				   sizeof(flc->flc) + desc_bytes(desc),
+				   ctx->dir);
+
+	return 0;
+}
+
+static int chachapoly_setauthsize(struct crypto_aead *aead,
+				  unsigned int authsize)
+{
+	struct caam_ctx *ctx = crypto_aead_ctx(aead);
+
+	if (authsize != POLY1305_DIGEST_SIZE)
+		return -EINVAL;
+
+	ctx->authsize = authsize;
+	return chachapoly_set_sh_desc(aead);
+}
+
+static int chachapoly_setkey(struct crypto_aead *aead, const u8 *key,
+			     unsigned int keylen)
+{
+	struct caam_ctx *ctx = crypto_aead_ctx(aead);
+	unsigned int ivsize = crypto_aead_ivsize(aead);
+	unsigned int saltlen = CHACHAPOLY_IV_SIZE - ivsize;
+
+	if (keylen != CHACHA_KEY_SIZE + saltlen) {
+		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	ctx->cdata.key_virt = key;
+	ctx->cdata.keylen = keylen - saltlen;
+
+	return chachapoly_set_sh_desc(aead);
+}
+
 static int gcm_set_sh_desc(struct crypto_aead *aead)
 {
 	struct caam_ctx *ctx = crypto_aead_ctx(aead);
@@ -816,7 +886,9 @@ static int skcipher_setkey(struct crypto_skcipher *skcipher, const u8 *key,
 	u32 *desc;
 	u32 ctx1_iv_off = 0;
 	const bool ctr_mode = ((ctx->cdata.algtype & OP_ALG_AAI_MASK) ==
-			       OP_ALG_AAI_CTR_MOD128);
+			       OP_ALG_AAI_CTR_MOD128) &&
+			       ((ctx->cdata.algtype & OP_ALG_ALGSEL_MASK) !=
+			       OP_ALG_ALGSEL_CHACHA20);
 	const bool is_rfc3686 = alg->caam.rfc3686;
 
 	print_hex_dump_debug("key in @" __stringify(__LINE__)": ",
@@ -1494,7 +1566,23 @@ static struct caam_skcipher_alg driver_algs[] = {
 			.ivsize = AES_BLOCK_SIZE,
 		},
 		.caam.class1_alg_type = OP_ALG_ALGSEL_AES | OP_ALG_AAI_XTS,
-	}
+	},
+	{
+		.skcipher = {
+			.base = {
+				.cra_name = "chacha20",
+				.cra_driver_name = "chacha20-caam-qi2",
+				.cra_blocksize = 1,
+			},
+			.setkey = skcipher_setkey,
+			.encrypt = skcipher_encrypt,
+			.decrypt = skcipher_decrypt,
+			.min_keysize = CHACHA_KEY_SIZE,
+			.max_keysize = CHACHA_KEY_SIZE,
+			.ivsize = CHACHA_IV_SIZE,
+		},
+		.caam.class1_alg_type = OP_ALG_ALGSEL_CHACHA20,
+	},
 };
 
 static struct caam_aead_alg driver_aeads[] = {
@@ -2608,6 +2696,50 @@ static struct caam_aead_alg driver_aeads[] = {
 			.geniv = true,
 		},
 	},
+	{
+		.aead = {
+			.base = {
+				.cra_name = "rfc7539(chacha20,poly1305)",
+				.cra_driver_name = "rfc7539-chacha20-poly1305-"
+						   "caam-qi2",
+				.cra_blocksize = 1,
+			},
+			.setkey = chachapoly_setkey,
+			.setauthsize = chachapoly_setauthsize,
+			.encrypt = aead_encrypt,
+			.decrypt = aead_decrypt,
+			.ivsize = CHACHAPOLY_IV_SIZE,
+			.maxauthsize = POLY1305_DIGEST_SIZE,
+		},
+		.caam = {
+			.class1_alg_type = OP_ALG_ALGSEL_CHACHA20 |
+					   OP_ALG_AAI_AEAD,
+			.class2_alg_type = OP_ALG_ALGSEL_POLY1305 |
+					   OP_ALG_AAI_AEAD,
+		},
+	},
+	{
+		.aead = {
+			.base = {
+				.cra_name = "rfc7539esp(chacha20,poly1305)",
+				.cra_driver_name = "rfc7539esp-chacha20-"
+						   "poly1305-caam-qi2",
+				.cra_blocksize = 1,
+			},
+			.setkey = chachapoly_setkey,
+			.setauthsize = chachapoly_setauthsize,
+			.encrypt = aead_encrypt,
+			.decrypt = aead_decrypt,
+			.ivsize = 8,
+			.maxauthsize = POLY1305_DIGEST_SIZE,
+		},
+		.caam = {
+			.class1_alg_type = OP_ALG_ALGSEL_CHACHA20 |
+					   OP_ALG_AAI_AEAD,
+			.class2_alg_type = OP_ALG_ALGSEL_POLY1305 |
+					   OP_ALG_AAI_AEAD,
+		},
+	},
 	{
 		.aead = {
 			.base = {
@@ -4908,6 +5040,11 @@ static int dpaa2_caam_probe(struct fsl_mc_device *dpseci_dev)
 		    alg_sel == OP_ALG_ALGSEL_AES)
 			continue;
 
+		/* Skip CHACHA20 algorithms if not supported by device */
+		if (alg_sel == OP_ALG_ALGSEL_CHACHA20 &&
+		    !priv->sec_attr.ccha_acc_num)
+			continue;
+
 		t_alg->caam.dev = dev;
 		caam_skcipher_alg_init(t_alg);
 
@@ -4940,11 +5077,22 @@ static int dpaa2_caam_probe(struct fsl_mc_device *dpseci_dev)
 		    c1_alg_sel == OP_ALG_ALGSEL_AES)
 			continue;
 
+		/* Skip CHACHA20 algorithms if not supported by device */
+		if (c1_alg_sel == OP_ALG_ALGSEL_CHACHA20 &&
+		    !priv->sec_attr.ccha_acc_num)
+			continue;
+
+		/* Skip POLY1305 algorithms if not supported by device */
+		if (c2_alg_sel == OP_ALG_ALGSEL_POLY1305 &&
+		    !priv->sec_attr.ptha_acc_num)
+			continue;
+
 		/*
 		 * Skip algorithms requiring message digests
 		 * if MD not supported by device.
 		 */
-		if (!priv->sec_attr.md_acc_num && c2_alg_sel)
+		if ((c2_alg_sel & ~OP_ALG_ALGSEL_SUBMASK) ==
+		    (0x40 << OP_ALG_ALGSEL_SHIFT) &&
+		    !priv->sec_attr.md_acc_num)
 			continue;
 
 		t_alg->caam.dev = dev;
diff --git a/drivers/crypto/caam/caamhash.c b/drivers/crypto/caam/caamhash.c
index 46924affa0bd6e8ccd7e45543a29756608edacba..81712aa5d0f2fae6062efe706e6811e98c79db2e 100644
--- a/drivers/crypto/caam/caamhash.c
+++ b/drivers/crypto/caam/caamhash.c
@@ -3,6 +3,7 @@
  * caam - Freescale FSL CAAM support for ahash functions of crypto API
  *
  * Copyright 2011 Freescale Semiconductor, Inc.
+ * Copyright 2018 NXP
  *
  * Based on caamalg.c crypto API driver.
  *
@@ -1801,7 +1802,7 @@ static int __init caam_algapi_hash_init(void)
 	int i = 0, err = 0;
 	struct caam_drv_private *priv;
 	unsigned int md_limit = SHA512_DIGEST_SIZE;
-	u32 cha_inst, cha_vid;
+	u32 md_inst, md_vid;
 
 	dev_node = of_find_compatible_node(NULL, NULL, "fsl,sec-v4.0");
 	if (!dev_node) {
@@ -1831,18 +1832,27 @@ static int __init caam_algapi_hash_init(void)
 	 * Register crypto algorithms the device supports.  First, identify
 	 * presence and attributes of MD block.
 	 */
-	cha_vid = rd_reg32(&priv->ctrl->perfmon.cha_id_ls);
-	cha_inst = rd_reg32(&priv->ctrl->perfmon.cha_num_ls);
+	if (priv->era < 10) {
+		md_vid = (rd_reg32(&priv->ctrl->perfmon.cha_id_ls) &
+			  CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT;
+		md_inst = (rd_reg32(&priv->ctrl->perfmon.cha_num_ls) &
+			   CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT;
+	} else {
+		u32 mdha = rd_reg32(&priv->ctrl->vreg.mdha);
+
+		md_vid = (mdha & CHA_VER_VID_MASK) >> CHA_VER_VID_SHIFT;
+		md_inst = mdha & CHA_VER_NUM_MASK;
+	}
 
 	/*
 	 * Skip registration of any hashing algorithms if MD block
 	 * is not present.
 	 */
-	if (!((cha_inst & CHA_ID_LS_MD_MASK) >> CHA_ID_LS_MD_SHIFT))
+	if (!md_inst)
 		return -ENODEV;
 
 	/* Limit digest size based on LP256 */
-	if ((cha_vid & CHA_ID_LS_MD_MASK) == CHA_ID_LS_MD_LP256)
+	if (md_vid == CHA_VER_VID_MD_LP256)
 		md_limit = SHA256_DIGEST_SIZE;
 
 	INIT_LIST_HEAD(&hash_list);
diff --git a/drivers/crypto/caam/caampkc.c b/drivers/crypto/caam/caampkc.c
index 4fc209cbbeabd8819eec55ce60a551f4780feb23..77ab28a2811a60aacd65329606e5fcc314ffd3d9 100644
--- a/drivers/crypto/caam/caampkc.c
+++ b/drivers/crypto/caam/caampkc.c
@@ -3,6 +3,7 @@
  * caam - Freescale FSL CAAM support for Public Key Cryptography
  *
  * Copyright 2016 Freescale Semiconductor, Inc.
+ * Copyright 2018 NXP
  *
  * There is no Shared Descriptor for PKC so that the Job Descriptor must carry
  * all the desired key parameters, input and output pointers.
@@ -1017,7 +1018,7 @@ static int __init caam_pkc_init(void)
 	struct platform_device *pdev;
 	struct device *ctrldev;
 	struct caam_drv_private *priv;
-	u32 cha_inst, pk_inst;
+	u32 pk_inst;
 	int err;
 
 	dev_node = of_find_compatible_node(NULL, NULL, "fsl,sec-v4.0");
@@ -1045,8 +1046,11 @@ static int __init caam_pkc_init(void)
 		return -ENODEV;
 
 	/* Determine public key hardware accelerator presence. */
-	cha_inst = rd_reg32(&priv->ctrl->perfmon.cha_num_ls);
-	pk_inst = (cha_inst & CHA_ID_LS_PK_MASK) >> CHA_ID_LS_PK_SHIFT;
+	if (priv->era < 10)
+		pk_inst = (rd_reg32(&priv->ctrl->perfmon.cha_num_ls) &
+			   CHA_ID_LS_PK_MASK) >> CHA_ID_LS_PK_SHIFT;
+	else
+		pk_inst = rd_reg32(&priv->ctrl->vreg.pkha) & CHA_VER_NUM_MASK;
 
 	/* Do not register algorithms if PKHA is not present. */
 	if (!pk_inst)
diff --git a/drivers/crypto/caam/caamrng.c b/drivers/crypto/caam/caamrng.c
index 4318b0aa6fb9e3556afc5aef8b478d2525a6ef2e..a387c8d49a62d775617f867465a53a0404f8bc87 100644
--- a/drivers/crypto/caam/caamrng.c
+++ b/drivers/crypto/caam/caamrng.c
@@ -3,6 +3,7 @@
  * caam - Freescale FSL CAAM support for hw_random
  *
  * Copyright 2011 Freescale Semiconductor, Inc.
+ * Copyright 2018 NXP
  *
  * Based on caamalg.c crypto API driver.
  *
@@ -309,6 +310,7 @@ static int __init caam_rng_init(void)
 	struct platform_device *pdev;
 	struct device *ctrldev;
 	struct caam_drv_private *priv;
+	u32 rng_inst;
 	int err;
 
 	dev_node = of_find_compatible_node(NULL, NULL, "fsl,sec-v4.0");
@@ -336,7 +338,13 @@ static int __init caam_rng_init(void)
 		return -ENODEV;
 
 	/* Check for an instantiated RNG before registration */
-	if (!(rd_reg32(&priv->ctrl->perfmon.cha_num_ls) & CHA_ID_LS_RNG_MASK))
+	if (priv->era < 10)
+		rng_inst = (rd_reg32(&priv->ctrl->perfmon.cha_num_ls) &
+			    CHA_ID_LS_RNG_MASK) >> CHA_ID_LS_RNG_SHIFT;
+	else
+		rng_inst = rd_reg32(&priv->ctrl->vreg.rng) & CHA_VER_NUM_MASK;
+
+	if (!rng_inst)
 		return -ENODEV;
 
 	dev = caam_jr_alloc();
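The same era split shows up in caamalg, caamalg_qi, caamhash, caampkc and here: before era 10 the per-CHA instantiation counts are packed into the perfmon CHA_NUM registers, while era 10 and later expose one version register per CHA with the count in the low byte. A hypothetical helper capturing the RNG case (not part of the patch, the name is invented) would look like this:

/*
 * Hypothetical helper, not in the patch: number of instantiated RNG
 * blocks, valid for both register layouts.
 */
static u32 caam_rng_instances(struct caam_drv_private *priv)
{
	if (priv->era < 10)	/* pre-era-10: packed CHA_NUM field */
		return (rd_reg32(&priv->ctrl->perfmon.cha_num_ls) &
			CHA_ID_LS_RNG_MASK) >> CHA_ID_LS_RNG_SHIFT;

	/* era 10+: dedicated version register, count in the low byte */
	return rd_reg32(&priv->ctrl->vreg.rng) & CHA_VER_NUM_MASK;
}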
diff --git a/drivers/crypto/caam/compat.h b/drivers/crypto/caam/compat.h
index 9604ff7a335e1fa8e9c8a0dedebe701d8ade7d2c..87d9efe4c7aa5fb0c18de140935cb9ac5525f8b9 100644
--- a/drivers/crypto/caam/compat.h
+++ b/drivers/crypto/caam/compat.h
@@ -36,6 +36,8 @@
 #include <crypto/gcm.h>
 #include <crypto/sha.h>
 #include <crypto/md5.h>
+#include <crypto/chacha.h>
+#include <crypto/poly1305.h>
 #include <crypto/internal/aead.h>
 #include <crypto/authenc.h>
 #include <crypto/akcipher.h>
diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c
index 3fc7931938215cb62b4c94c3b6c3f3368915ccd4..16bbc72f041a8b3a507ec6b49d83caeca47acf0d 100644
--- a/drivers/crypto/caam/ctrl.c
+++ b/drivers/crypto/caam/ctrl.c
@@ -3,6 +3,7 @@
  * Controller-level driver, kernel property detection, initialization
  *
  * Copyright 2008-2012 Freescale Semiconductor, Inc.
+ * Copyright 2018 NXP
  */
 
 #include <linux/device.h>
@@ -106,7 +107,7 @@ static inline int run_descriptor_deco0(struct device *ctrldev, u32 *desc,
 	struct caam_ctrl __iomem *ctrl = ctrlpriv->ctrl;
 	struct caam_deco __iomem *deco = ctrlpriv->deco;
 	unsigned int timeout = 100000;
-	u32 deco_dbg_reg, flags;
+	u32 deco_dbg_reg, deco_state, flags;
 	int i;
 
 
@@ -149,13 +150,22 @@ static inline int run_descriptor_deco0(struct device *ctrldev, u32 *desc,
 	timeout = 10000000;
 	do {
 		deco_dbg_reg = rd_reg32(&deco->desc_dbg);
+
+		if (ctrlpriv->era < 10)
+			deco_state = (deco_dbg_reg & DESC_DBG_DECO_STAT_MASK) >>
+				     DESC_DBG_DECO_STAT_SHIFT;
+		else
+			deco_state = (rd_reg32(&deco->dbg_exec) &
+				      DESC_DER_DECO_STAT_MASK) >>
+				     DESC_DER_DECO_STAT_SHIFT;
+
 		/*
 		 * If an error occurred in the descriptor, then
 		 * the DECO status field will be set to 0x0D
 		 */
-		if ((deco_dbg_reg & DESC_DBG_DECO_STAT_MASK) ==
-		    DESC_DBG_DECO_STAT_HOST_ERR)
+		if (deco_state == DECO_STAT_HOST_ERR)
 			break;
+
 		cpu_relax();
 	} while ((deco_dbg_reg & DESC_DBG_DECO_STAT_VALID) && --timeout);
 
@@ -491,7 +501,7 @@ static int caam_probe(struct platform_device *pdev)
 	struct caam_perfmon *perfmon;
 #endif
 	u32 scfgr, comp_params;
-	u32 cha_vid_ls;
+	u8 rng_vid;
 	int pg_size;
 	int BLOCK_OFFSET = 0;
 
@@ -733,15 +743,19 @@ static int caam_probe(struct platform_device *pdev)
 		goto caam_remove;
 	}
 
-	cha_vid_ls = rd_reg32(&ctrl->perfmon.cha_id_ls);
+	if (ctrlpriv->era < 10)
+		rng_vid = (rd_reg32(&ctrl->perfmon.cha_id_ls) &
+			   CHA_ID_LS_RNG_MASK) >> CHA_ID_LS_RNG_SHIFT;
+	else
+		rng_vid = (rd_reg32(&ctrl->vreg.rng) & CHA_VER_VID_MASK) >>
+			   CHA_VER_VID_SHIFT;
 
 	/*
 	 * If SEC has RNG version >= 4 and the RNG state handle has not
 	 * already been instantiated, do the RNG instantiation.
 	 * In case of SoCs with Management Complex, RNG is managed by MC f/w.
 	 */
-	if (!ctrlpriv->mc_en &&
-	    (cha_vid_ls & CHA_ID_LS_RNG_MASK) >> CHA_ID_LS_RNG_SHIFT >= 4) {
+	if (!ctrlpriv->mc_en && rng_vid >= 4) {
 		ctrlpriv->rng4_sh_init =
 			rd_reg32(&ctrl->r4tst[0].rdsta);
 		/*
diff --git a/drivers/crypto/caam/desc.h b/drivers/crypto/caam/desc.h
index f76ff160a02c466fed29a8dd76be77970e3f1c5f..ec10230178c5215a1f442be0cde4478629000a3c 100644
--- a/drivers/crypto/caam/desc.h
+++ b/drivers/crypto/caam/desc.h
@@ -4,6 +4,7 @@
  * Definitions to support CAAM descriptor instruction generation
  *
  * Copyright 2008-2011 Freescale Semiconductor, Inc.
+ * Copyright 2018 NXP
  */
 
 #ifndef DESC_H
@@ -242,6 +243,7 @@
 #define LDST_SRCDST_WORD_DESCBUF_SHARED	(0x42 << LDST_SRCDST_SHIFT)
 #define LDST_SRCDST_WORD_DESCBUF_JOB_WE	(0x45 << LDST_SRCDST_SHIFT)
 #define LDST_SRCDST_WORD_DESCBUF_SHARED_WE (0x46 << LDST_SRCDST_SHIFT)
+#define LDST_SRCDST_WORD_INFO_FIFO_SM	(0x71 << LDST_SRCDST_SHIFT)
 #define LDST_SRCDST_WORD_INFO_FIFO	(0x7a << LDST_SRCDST_SHIFT)
 
 /* Offset in source/destination */
@@ -284,6 +286,12 @@
 #define LDLEN_SET_OFIFO_OFFSET_SHIFT	0
 #define LDLEN_SET_OFIFO_OFFSET_MASK	(3 << LDLEN_SET_OFIFO_OFFSET_SHIFT)
 
+/* Special Length definitions when dst=sm, nfifo-{sm,m} */
+#define LDLEN_MATH0			0
+#define LDLEN_MATH1			1
+#define LDLEN_MATH2			2
+#define LDLEN_MATH3			3
+
 /*
  * FIFO_LOAD/FIFO_STORE/SEQ_FIFO_LOAD/SEQ_FIFO_STORE
  * Command Constructs
@@ -408,6 +416,7 @@
 #define FIFOST_TYPE_MESSAGE_DATA (0x30 << FIFOST_TYPE_SHIFT)
 #define FIFOST_TYPE_RNGSTORE	 (0x34 << FIFOST_TYPE_SHIFT)
 #define FIFOST_TYPE_RNGFIFO	 (0x35 << FIFOST_TYPE_SHIFT)
+#define FIFOST_TYPE_METADATA	 (0x3e << FIFOST_TYPE_SHIFT)
 #define FIFOST_TYPE_SKIP	 (0x3f << FIFOST_TYPE_SHIFT)
 
 /*
@@ -1133,6 +1142,12 @@
 #define OP_ALG_TYPE_CLASS1	(2 << OP_ALG_TYPE_SHIFT)
 #define OP_ALG_TYPE_CLASS2	(4 << OP_ALG_TYPE_SHIFT)
 
+/* version register fields */
+#define OP_VER_CCHA_NUM  0x000000ff /* Number CCHAs instantiated */
+#define OP_VER_CCHA_MISC 0x0000ff00 /* CCHA Miscellaneous Information */
+#define OP_VER_CCHA_REV  0x00ff0000 /* CCHA Revision Number */
+#define OP_VER_CCHA_VID  0xff000000 /* CCHA Version ID */
+
 #define OP_ALG_ALGSEL_SHIFT	16
 #define OP_ALG_ALGSEL_MASK	(0xff << OP_ALG_ALGSEL_SHIFT)
 #define OP_ALG_ALGSEL_SUBMASK	(0x0f << OP_ALG_ALGSEL_SHIFT)
@@ -1152,6 +1167,8 @@
 #define OP_ALG_ALGSEL_KASUMI	(0x70 << OP_ALG_ALGSEL_SHIFT)
 #define OP_ALG_ALGSEL_CRC	(0x90 << OP_ALG_ALGSEL_SHIFT)
 #define OP_ALG_ALGSEL_SNOW_F9	(0xA0 << OP_ALG_ALGSEL_SHIFT)
+#define OP_ALG_ALGSEL_CHACHA20	(0xD0 << OP_ALG_ALGSEL_SHIFT)
+#define OP_ALG_ALGSEL_POLY1305	(0xE0 << OP_ALG_ALGSEL_SHIFT)
 
 #define OP_ALG_AAI_SHIFT	4
 #define OP_ALG_AAI_MASK		(0x1ff << OP_ALG_AAI_SHIFT)
@@ -1199,6 +1216,11 @@
 #define OP_ALG_AAI_RNG4_AI	(0x80 << OP_ALG_AAI_SHIFT)
 #define OP_ALG_AAI_RNG4_SK	(0x100 << OP_ALG_AAI_SHIFT)
 
+/* Chacha20 AAI set */
+#define OP_ALG_AAI_AEAD	(0x002 << OP_ALG_AAI_SHIFT)
+#define OP_ALG_AAI_KEYSTREAM	(0x001 << OP_ALG_AAI_SHIFT)
+#define OP_ALG_AAI_BC8		(0x008 << OP_ALG_AAI_SHIFT)
+
 /* hmac/smac AAI set */
 #define OP_ALG_AAI_HASH		(0x00 << OP_ALG_AAI_SHIFT)
 #define OP_ALG_AAI_HMAC		(0x01 << OP_ALG_AAI_SHIFT)
@@ -1387,6 +1409,7 @@
 #define MOVE_SRC_MATH3		(0x07 << MOVE_SRC_SHIFT)
 #define MOVE_SRC_INFIFO		(0x08 << MOVE_SRC_SHIFT)
 #define MOVE_SRC_INFIFO_CL	(0x09 << MOVE_SRC_SHIFT)
+#define MOVE_SRC_AUX_ABLK	(0x0a << MOVE_SRC_SHIFT)
 
 #define MOVE_DEST_SHIFT		16
 #define MOVE_DEST_MASK		(0x0f << MOVE_DEST_SHIFT)
@@ -1413,6 +1436,10 @@
 
 #define MOVELEN_MRSEL_SHIFT	0
 #define MOVELEN_MRSEL_MASK	(0x3 << MOVE_LEN_SHIFT)
+#define MOVELEN_MRSEL_MATH0	(0 << MOVELEN_MRSEL_SHIFT)
+#define MOVELEN_MRSEL_MATH1	(1 << MOVELEN_MRSEL_SHIFT)
+#define MOVELEN_MRSEL_MATH2	(2 << MOVELEN_MRSEL_SHIFT)
+#define MOVELEN_MRSEL_MATH3	(3 << MOVELEN_MRSEL_SHIFT)
 
 /*
  * MATH Command Constructs
@@ -1589,6 +1616,7 @@
 #define NFIFOENTRY_DTYPE_IV	(0x2 << NFIFOENTRY_DTYPE_SHIFT)
 #define NFIFOENTRY_DTYPE_SAD	(0x3 << NFIFOENTRY_DTYPE_SHIFT)
 #define NFIFOENTRY_DTYPE_ICV	(0xA << NFIFOENTRY_DTYPE_SHIFT)
+#define NFIFOENTRY_DTYPE_POLY	(0xB << NFIFOENTRY_DTYPE_SHIFT)
 #define NFIFOENTRY_DTYPE_SKIP	(0xE << NFIFOENTRY_DTYPE_SHIFT)
 #define NFIFOENTRY_DTYPE_MSG	(0xF << NFIFOENTRY_DTYPE_SHIFT)
 
diff --git a/drivers/crypto/caam/desc_constr.h b/drivers/crypto/caam/desc_constr.h
index d4256fa4a1d65c42073b267095b6e124b6b6eaca..2980b8ef1fb1dccef83fa0c67fc556ea274355c8 100644
--- a/drivers/crypto/caam/desc_constr.h
+++ b/drivers/crypto/caam/desc_constr.h
@@ -189,6 +189,7 @@ static inline u32 *append_##cmd(u32 * const desc, u32 options) \
 }
 APPEND_CMD_RET(jump, JUMP)
 APPEND_CMD_RET(move, MOVE)
+APPEND_CMD_RET(move_len, MOVE_LEN)
 
 static inline void set_jump_tgt_here(u32 * const desc, u32 *jump_cmd)
 {
@@ -327,7 +328,11 @@ static inline void append_##cmd##_imm_##type(u32 * const desc, type immediate, \
 					     u32 options) \
 { \
 	PRINT_POS; \
-	append_cmd(desc, CMD_##op | IMMEDIATE | options | sizeof(type)); \
+	if (options & LDST_LEN_MASK) \
+		append_cmd(desc, CMD_##op | IMMEDIATE | options); \
+	else \
+		append_cmd(desc, CMD_##op | IMMEDIATE | options | \
+			   sizeof(type)); \
 	append_cmd(desc, immediate); \
 }
 APPEND_CMD_RAW_IMM(load, LOAD, u32);
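The change to APPEND_CMD_RAW_IMM above is what makes the new special-length loads possible: when the caller already encodes a length in the LDST_LEN_MASK bits, that value is kept instead of being combined with sizeof(type). A sketch of the two resulting uses, as they would appear inside a shared-descriptor construction function (illustrative calls, not new patch code):

/* Ordinary immediate load: no length bits set, so sizeof(u32) = 4 is
 * appended to the command word, exactly as before.
 */
append_load_imm_u32(desc, nfifo_entry, LDST_CLASS_IND_CCB |
		    LDST_SRCDST_WORD_INFO_FIFO);

/* Special-length load used by the ChaCha20-Poly1305 descriptor: the
 * LDLEN_MATH3 value occupies the LDST_LEN_MASK bits and is preserved,
 * telling the hardware to take the data length from MATH3 at run time.
 */
append_load_imm_u32(desc, nfifo, LDST_CLASS_IND_CCB |
		    LDST_SRCDST_WORD_INFO_FIFO_SM | LDLEN_MATH3);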
diff --git a/drivers/crypto/caam/regs.h b/drivers/crypto/caam/regs.h
index 457815f965c06f58b8502eab598fc00764d5c867..3cd0822ea819966abf8bc4a37e07ee242c41552a 100644
--- a/drivers/crypto/caam/regs.h
+++ b/drivers/crypto/caam/regs.h
@@ -3,6 +3,7 @@
  * CAAM hardware register-level view
  *
  * Copyright 2008-2011 Freescale Semiconductor, Inc.
+ * Copyright 2018 NXP
  */
 
 #ifndef REGS_H
@@ -211,6 +212,47 @@ struct jr_outentry {
 	u32 jrstatus;	/* Status for completed descriptor */
 } __packed;
 
+/* Version registers (Era 10+)	e80-eff */
+struct version_regs {
+	u32 crca;	/* CRCA_VERSION */
+	u32 afha;	/* AFHA_VERSION */
+	u32 kfha;	/* KFHA_VERSION */
+	u32 pkha;	/* PKHA_VERSION */
+	u32 aesa;	/* AESA_VERSION */
+	u32 mdha;	/* MDHA_VERSION */
+	u32 desa;	/* DESA_VERSION */
+	u32 snw8a;	/* SNW8A_VERSION */
+	u32 snw9a;	/* SNW9A_VERSION */
+	u32 zuce;	/* ZUCE_VERSION */
+	u32 zuca;	/* ZUCA_VERSION */
+	u32 ccha;	/* CCHA_VERSION */
+	u32 ptha;	/* PTHA_VERSION */
+	u32 rng;	/* RNG_VERSION */
+	u32 trng;	/* TRNG_VERSION */
+	u32 aaha;	/* AAHA_VERSION */
+	u32 rsvd[10];
+	u32 sr;		/* SR_VERSION */
+	u32 dma;	/* DMA_VERSION */
+	u32 ai;		/* AI_VERSION */
+	u32 qi;		/* QI_VERSION */
+	u32 jr;		/* JR_VERSION */
+	u32 deco;	/* DECO_VERSION */
+};
+
+/* Version registers bitfields */
+
+/* Number of CHAs instantiated */
+#define CHA_VER_NUM_MASK	0xffull
+/* CHA Miscellaneous Information */
+#define CHA_VER_MISC_SHIFT	8
+#define CHA_VER_MISC_MASK	(0xffull << CHA_VER_MISC_SHIFT)
+/* CHA Revision Number */
+#define CHA_VER_REV_SHIFT	16
+#define CHA_VER_REV_MASK	(0xffull << CHA_VER_REV_SHIFT)
+/* CHA Version ID */
+#define CHA_VER_VID_SHIFT	24
+#define CHA_VER_VID_MASK	(0xffull << CHA_VER_VID_SHIFT)
+
 /*
  * caam_perfmon - Performance Monitor/Secure Memory Status/
  *                CAAM Global Status/Component Version IDs
@@ -223,15 +265,13 @@ struct jr_outentry {
 #define CHA_NUM_MS_DECONUM_MASK	(0xfull << CHA_NUM_MS_DECONUM_SHIFT)
 
 /*
- * CHA version IDs / instantiation bitfields
+ * CHA version IDs / instantiation bitfields (< Era 10)
  * Defined for use with the cha_id fields in perfmon, but the same shift/mask
  * selectors can be used to pull out the number of instantiated blocks within
  * cha_num fields in perfmon because the locations are the same.
  */
 #define CHA_ID_LS_AES_SHIFT	0
 #define CHA_ID_LS_AES_MASK	(0xfull << CHA_ID_LS_AES_SHIFT)
-#define CHA_ID_LS_AES_LP	(0x3ull << CHA_ID_LS_AES_SHIFT)
-#define CHA_ID_LS_AES_HP	(0x4ull << CHA_ID_LS_AES_SHIFT)
 
 #define CHA_ID_LS_DES_SHIFT	4
 #define CHA_ID_LS_DES_MASK	(0xfull << CHA_ID_LS_DES_SHIFT)
@@ -241,9 +281,6 @@ struct jr_outentry {
 
 #define CHA_ID_LS_MD_SHIFT	12
 #define CHA_ID_LS_MD_MASK	(0xfull << CHA_ID_LS_MD_SHIFT)
-#define CHA_ID_LS_MD_LP256	(0x0ull << CHA_ID_LS_MD_SHIFT)
-#define CHA_ID_LS_MD_LP512	(0x1ull << CHA_ID_LS_MD_SHIFT)
-#define CHA_ID_LS_MD_HP		(0x2ull << CHA_ID_LS_MD_SHIFT)
 
 #define CHA_ID_LS_RNG_SHIFT	16
 #define CHA_ID_LS_RNG_MASK	(0xfull << CHA_ID_LS_RNG_SHIFT)
@@ -269,6 +306,13 @@ struct jr_outentry {
 #define CHA_ID_MS_JR_SHIFT	28
 #define CHA_ID_MS_JR_MASK	(0xfull << CHA_ID_MS_JR_SHIFT)
 
+/* Specific CHA version IDs */
+#define CHA_VER_VID_AES_LP	0x3ull
+#define CHA_VER_VID_AES_HP	0x4ull
+#define CHA_VER_VID_MD_LP256	0x0ull
+#define CHA_VER_VID_MD_LP512	0x1ull
+#define CHA_VER_VID_MD_HP	0x2ull
+
 struct sec_vid {
 	u16 ip_id;
 	u8 maj_rev;
@@ -479,8 +523,10 @@ struct caam_ctrl {
 		struct rng4tst r4tst[2];
 	};
 
-	u32 rsvd9[448];
+	u32 rsvd9[416];
 
+	/* Version registers - introduced with era 10		e80-eff */
+	struct version_regs vreg;
 	/* Performance Monitor                                  f00-fff */
 	struct caam_perfmon perfmon;
 };
@@ -570,8 +616,10 @@ struct caam_job_ring {
 	u32 rsvd11;
 	u32 jrcommand;	/* JRCRx - JobR command */
 
-	u32 rsvd12[932];
+	u32 rsvd12[900];
 
+	/* Version registers - introduced with era 10           e80-eff */
+	struct version_regs vreg;
 	/* Performance Monitor                                  f00-fff */
 	struct caam_perfmon perfmon;
 };
@@ -878,13 +926,19 @@ struct caam_deco {
 	u32 rsvd29[48];
 	u32 descbuf[64];	/* DxDESB - Descriptor buffer */
 	u32 rscvd30[193];
-#define DESC_DBG_DECO_STAT_HOST_ERR	0x00D00000
 #define DESC_DBG_DECO_STAT_VALID	0x80000000
 #define DESC_DBG_DECO_STAT_MASK		0x00F00000
+#define DESC_DBG_DECO_STAT_SHIFT	20
 	u32 desc_dbg;		/* DxDDR - DECO Debug Register */
-	u32 rsvd31[126];
+	u32 rsvd31[13];
+#define DESC_DER_DECO_STAT_MASK		0x000F0000
+#define DESC_DER_DECO_STAT_SHIFT	16
+	u32 dbg_exec;		/* DxDER - DECO Debug Exec Register */
+	u32 rsvd32[112];
 };
 
+#define DECO_STAT_HOST_ERR	0xD
+
 #define DECO_JQCR_WHL		0x20000000
 #define DECO_JQCR_FOUR		0x10000000
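The reserved-area arithmetic is what keeps the rest of the register map untouched: struct version_regs holds 16 named registers, rsvd[10] and 6 trailing registers, i.e. 32 32-bit words, which is exactly the 448 - 416 words removed from rsvd9 in caam_ctrl (and the 932 - 900 words in the job ring), so the perfmon block stays at offset 0xf00. A small compile-time sketch of that check (static_assert used for illustration; in-kernel code would use BUILD_BUG_ON):

#include <assert.h>
#include <stdint.h>

/* Stand-in mirroring the field counts of the new struct version_regs. */
struct version_regs_check {
	uint32_t named[16];	/* crca ... aaha */
	uint32_t rsvd[10];
	uint32_t trailing[6];	/* sr, dma, ai, qi, jr, deco */
};

static_assert(sizeof(struct version_regs_check) == 32 * sizeof(uint32_t),
	      "version_regs spans 32 words");
static_assert(448 - 416 == 32 && 932 - 900 == 32,
	      "freed reserved words match the new block");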
 
diff --git a/drivers/crypto/cavium/nitrox/Makefile b/drivers/crypto/cavium/nitrox/Makefile
index e12954791673a58d6f75d1df318ab0bee15d9d4e..f83991aaf82040a0c24b69c4b7b27ae6047e26fd 100644
--- a/drivers/crypto/cavium/nitrox/Makefile
+++ b/drivers/crypto/cavium/nitrox/Makefile
@@ -6,7 +6,10 @@ n5pf-objs := nitrox_main.o \
 	nitrox_lib.o \
 	nitrox_hal.o \
 	nitrox_reqmgr.o \
-	nitrox_algs.o
+	nitrox_algs.o	\
+	nitrox_mbx.o	\
+	nitrox_skcipher.o \
+	nitrox_aead.o
 
 n5pf-$(CONFIG_PCI_IOV) += nitrox_sriov.o
 n5pf-$(CONFIG_DEBUG_FS) += nitrox_debugfs.o
diff --git a/drivers/crypto/cavium/nitrox/nitrox_aead.c b/drivers/crypto/cavium/nitrox/nitrox_aead.c
new file mode 100644
index 0000000000000000000000000000000000000000..4f43eacd25572978204c0f952b7ccda8d68eaaf0
--- /dev/null
+++ b/drivers/crypto/cavium/nitrox/nitrox_aead.c
@@ -0,0 +1,364 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/crypto.h>
+#include <linux/rtnetlink.h>
+
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <crypto/des.h>
+#include <crypto/sha.h>
+#include <crypto/internal/aead.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/gcm.h>
+
+#include "nitrox_dev.h"
+#include "nitrox_common.h"
+#include "nitrox_req.h"
+
+#define GCM_AES_SALT_SIZE	4
+
+/**
+ * struct nitrox_crypt_params - Params to set nitrox crypto request.
+ * @cryptlen: Encryption/Decryption data length
+ * @authlen: Assoc data length + Cryptlen
+ * @srclen: Input buffer length
+ * @dstlen: Output buffer length
+ * @iv: IV data
+ * @ivsize: IV data length
+ * @ctrl_arg: Identifies the request type (ENCRYPT/DECRYPT)
+ */
+struct nitrox_crypt_params {
+	unsigned int cryptlen;
+	unsigned int authlen;
+	unsigned int srclen;
+	unsigned int dstlen;
+	u8 *iv;
+	int ivsize;
+	u8 ctrl_arg;
+};
+
+union gph_p3 {
+	struct {
+#ifdef __BIG_ENDIAN_BITFIELD
+		u16 iv_offset : 8;
+		u16 auth_offset	: 8;
+#else
+		u16 auth_offset	: 8;
+		u16 iv_offset : 8;
+#endif
+	};
+	u16 param;
+};
+
+static int nitrox_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key,
+				 unsigned int keylen)
+{
+	int aes_keylen;
+	struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead);
+	struct flexi_crypto_context *fctx;
+	union fc_ctx_flags flags;
+
+	aes_keylen = flexi_aes_keylen(keylen);
+	if (aes_keylen < 0) {
+		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	/* fill crypto context */
+	fctx = nctx->u.fctx;
+	flags.f = be64_to_cpu(fctx->flags.f);
+	flags.w0.aes_keylen = aes_keylen;
+	fctx->flags.f = cpu_to_be64(flags.f);
+
+	/* copy enc key to context */
+	memset(&fctx->crypto, 0, sizeof(fctx->crypto));
+	memcpy(fctx->crypto.u.key, key, keylen);
+
+	return 0;
+}
+
+static int nitrox_aead_setauthsize(struct crypto_aead *aead,
+				   unsigned int authsize)
+{
+	struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead);
+	struct flexi_crypto_context *fctx = nctx->u.fctx;
+	union fc_ctx_flags flags;
+
+	flags.f = be64_to_cpu(fctx->flags.f);
+	flags.w0.mac_len = authsize;
+	fctx->flags.f = cpu_to_be64(flags.f);
+
+	aead->authsize = authsize;
+
+	return 0;
+}
+
+static int alloc_src_sglist(struct aead_request *areq, char *iv, int ivsize,
+			    int buflen)
+{
+	struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq);
+	int nents = sg_nents_for_len(areq->src, buflen) + 1;
+	int ret;
+
+	if (nents < 0)
+		return nents;
+
+	/* Allocate buffer to hold IV and input scatterlist array */
+	ret = alloc_src_req_buf(nkreq, nents, ivsize);
+	if (ret)
+		return ret;
+
+	nitrox_creq_copy_iv(nkreq->src, iv, ivsize);
+	nitrox_creq_set_src_sg(nkreq, nents, ivsize, areq->src, buflen);
+
+	return 0;
+}
+
+static int alloc_dst_sglist(struct aead_request *areq, int ivsize, int buflen)
+{
+	struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq);
+	int nents = sg_nents_for_len(areq->dst, buflen) + 3;
+	int ret;
+
+	if (nents < 0)
+		return nents;
+
+	/* Allocate buffer to hold ORH, COMPLETION and output scatterlist
+	 * array
+	 */
+	ret = alloc_dst_req_buf(nkreq, nents);
+	if (ret)
+		return ret;
+
+	nitrox_creq_set_orh(nkreq);
+	nitrox_creq_set_comp(nkreq);
+	nitrox_creq_set_dst_sg(nkreq, nents, ivsize, areq->dst, buflen);
+
+	return 0;
+}
+
+static void free_src_sglist(struct aead_request *areq)
+{
+	struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq);
+
+	kfree(nkreq->src);
+}
+
+static void free_dst_sglist(struct aead_request *areq)
+{
+	struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq);
+
+	kfree(nkreq->dst);
+}
+
+static int nitrox_set_creq(struct aead_request *areq,
+			   struct nitrox_crypt_params *params)
+{
+	struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq);
+	struct se_crypto_request *creq = &nkreq->creq;
+	struct crypto_aead *aead = crypto_aead_reqtfm(areq);
+	union gph_p3 param3;
+	struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead);
+	int ret;
+
+	creq->flags = areq->base.flags;
+	creq->gfp = (areq->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ?
+		GFP_KERNEL : GFP_ATOMIC;
+
+	creq->ctrl.value = 0;
+	creq->opcode = FLEXI_CRYPTO_ENCRYPT_HMAC;
+	creq->ctrl.s.arg = params->ctrl_arg;
+
+	creq->gph.param0 = cpu_to_be16(params->cryptlen);
+	creq->gph.param1 = cpu_to_be16(params->authlen);
+	creq->gph.param2 = cpu_to_be16(params->ivsize + areq->assoclen);
+	param3.iv_offset = 0;
+	param3.auth_offset = params->ivsize;
+	creq->gph.param3 = cpu_to_be16(param3.param);
+
+	creq->ctx_handle = nctx->u.ctx_handle;
+	creq->ctrl.s.ctxl = sizeof(struct flexi_crypto_context);
+
+	ret = alloc_src_sglist(areq, params->iv, params->ivsize,
+			       params->srclen);
+	if (ret)
+		return ret;
+
+	ret = alloc_dst_sglist(areq, params->ivsize, params->dstlen);
+	if (ret) {
+		free_src_sglist(areq);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void nitrox_aead_callback(void *arg, int err)
+{
+	struct aead_request *areq = arg;
+
+	free_src_sglist(areq);
+	free_dst_sglist(areq);
+	if (err) {
+		pr_err_ratelimited("request failed status 0x%0x\n", err);
+		err = -EINVAL;
+	}
+
+	areq->base.complete(&areq->base, err);
+}
+
+static int nitrox_aes_gcm_enc(struct aead_request *areq)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(areq);
+	struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead);
+	struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq);
+	struct se_crypto_request *creq = &nkreq->creq;
+	struct flexi_crypto_context *fctx = nctx->u.fctx;
+	struct nitrox_crypt_params params;
+	int ret;
+
+	memcpy(fctx->crypto.iv, areq->iv, GCM_AES_SALT_SIZE);
+
+	memset(&params, 0, sizeof(params));
+	params.cryptlen = areq->cryptlen;
+	params.authlen = areq->assoclen + params.cryptlen;
+	params.srclen = params.authlen;
+	params.dstlen = params.srclen + aead->authsize;
+	params.iv = &areq->iv[GCM_AES_SALT_SIZE];
+	params.ivsize = GCM_AES_IV_SIZE - GCM_AES_SALT_SIZE;
+	params.ctrl_arg = ENCRYPT;
+	ret = nitrox_set_creq(areq, &params);
+	if (ret)
+		return ret;
+
+	/* send the crypto request */
+	return nitrox_process_se_request(nctx->ndev, creq, nitrox_aead_callback,
+					 areq);
+}
+
+static int nitrox_aes_gcm_dec(struct aead_request *areq)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(areq);
+	struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead);
+	struct nitrox_kcrypt_request *nkreq = aead_request_ctx(areq);
+	struct se_crypto_request *creq = &nkreq->creq;
+	struct flexi_crypto_context *fctx = nctx->u.fctx;
+	struct nitrox_crypt_params params;
+	int ret;
+
+	memcpy(fctx->crypto.iv, areq->iv, GCM_AES_SALT_SIZE);
+
+	memset(&params, 0, sizeof(params));
+	params.cryptlen = areq->cryptlen - aead->authsize;
+	params.authlen = areq->assoclen + params.cryptlen;
+	params.srclen = areq->cryptlen + areq->assoclen;
+	params.dstlen = params.srclen - aead->authsize;
+	params.iv = &areq->iv[GCM_AES_SALT_SIZE];
+	params.ivsize = GCM_AES_IV_SIZE - GCM_AES_SALT_SIZE;
+	params.ctrl_arg = DECRYPT;
+	ret = nitrox_set_creq(areq, &params);
+	if (ret)
+		return ret;
+
+	/* send the crypto request */
+	return nitrox_process_se_request(nctx->ndev, creq, nitrox_aead_callback,
+					 areq);
+}
+
+static int nitrox_aead_init(struct crypto_aead *aead)
+{
+	struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead);
+	struct crypto_ctx_hdr *chdr;
+
+	/* get the first device */
+	nctx->ndev = nitrox_get_first_device();
+	if (!nctx->ndev)
+		return -ENODEV;
+
+	/* allocate nitrox crypto context */
+	chdr = crypto_alloc_context(nctx->ndev);
+	if (!chdr) {
+		nitrox_put_device(nctx->ndev);
+		return -ENOMEM;
+	}
+	nctx->chdr = chdr;
+	nctx->u.ctx_handle = (uintptr_t)((u8 *)chdr->vaddr +
+					 sizeof(struct ctx_hdr));
+	nctx->u.fctx->flags.f = 0;
+
+	return 0;
+}
+
+static int nitrox_aes_gcm_init(struct crypto_aead *aead)
+{
+	int ret;
+	struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead);
+	union fc_ctx_flags *flags;
+
+	ret = nitrox_aead_init(aead);
+	if (ret)
+		return ret;
+
+	flags = &nctx->u.fctx->flags;
+	flags->w0.cipher_type = CIPHER_AES_GCM;
+	flags->w0.hash_type = AUTH_NULL;
+	flags->w0.iv_source = IV_FROM_DPTR;
+	/* ask microcode to calculate ipad/opad */
+	flags->w0.auth_input_type = 1;
+	flags->f = be64_to_cpu(flags->f);
+
+	crypto_aead_set_reqsize(aead, sizeof(struct aead_request) +
+				sizeof(struct nitrox_kcrypt_request));
+
+	return 0;
+}
+
+static void nitrox_aead_exit(struct crypto_aead *aead)
+{
+	struct nitrox_crypto_ctx *nctx = crypto_aead_ctx(aead);
+
+	/* free the nitrox crypto context */
+	if (nctx->u.ctx_handle) {
+		struct flexi_crypto_context *fctx = nctx->u.fctx;
+
+		memzero_explicit(&fctx->crypto, sizeof(struct crypto_keys));
+		memzero_explicit(&fctx->auth, sizeof(struct auth_keys));
+		crypto_free_context((void *)nctx->chdr);
+	}
+	nitrox_put_device(nctx->ndev);
+
+	nctx->u.ctx_handle = 0;
+	nctx->ndev = NULL;
+}
+
+static struct aead_alg nitrox_aeads[] = { {
+	.base = {
+		.cra_name = "gcm(aes)",
+		.cra_driver_name = "n5_aes_gcm",
+		.cra_priority = PRIO,
+		.cra_flags = CRYPTO_ALG_ASYNC,
+		.cra_blocksize = AES_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
+		.cra_alignmask = 0,
+		.cra_module = THIS_MODULE,
+	},
+	.setkey = nitrox_aes_gcm_setkey,
+	.setauthsize = nitrox_aead_setauthsize,
+	.encrypt = nitrox_aes_gcm_enc,
+	.decrypt = nitrox_aes_gcm_dec,
+	.init = nitrox_aes_gcm_init,
+	.exit = nitrox_aead_exit,
+	.ivsize = GCM_AES_IV_SIZE,
+	.maxauthsize = AES_BLOCK_SIZE,
+} };
+
+int nitrox_register_aeads(void)
+{
+	return crypto_register_aeads(nitrox_aeads, ARRAY_SIZE(nitrox_aeads));
+}
+
+void nitrox_unregister_aeads(void)
+{
+	crypto_unregister_aeads(nitrox_aeads, ARRAY_SIZE(nitrox_aeads));
+}
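To place the salt handling above in context: the driver registers plain gcm(aes), so callers pass the usual 12-byte GCM IV, and nitrox_aes_gcm_enc()/dec() split it themselves, copying the first GCM_AES_SALT_SIZE (4) bytes into the flexi context and sending only the remaining 8 bytes with the request. A rough caller-side sketch under those assumptions (demo_gcm(), the zero key and the in-place buffer are invented; a real user would size and fill the buffer appropriately):

#include <crypto/aead.h>
#include <crypto/gcm.h>
#include <linux/err.h>
#include <linux/scatterlist.h>

/* Illustrative only: one-shot gcm(aes) encryption of an in-place buffer. */
static int demo_gcm(u8 *buf, unsigned int textlen)
{
	DECLARE_CRYPTO_WAIT(wait);
	struct crypto_aead *tfm;
	struct aead_request *req;
	static const u8 key[16];		/* demo AES-128 key */
	u8 iv[GCM_AES_IV_SIZE] = { 0 };		/* salt (4) + per-request IV (8) */
	struct scatterlist sg;
	int err;

	tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_aead_setkey(tfm, key, sizeof(key));
	if (!err)
		err = crypto_aead_setauthsize(tfm, 16);
	if (err)
		goto out;

	req = aead_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out;
	}

	/* The buffer must leave room for the 16-byte tag on encryption. */
	sg_init_one(&sg, buf, textlen + 16);
	aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				  crypto_req_done, &wait);
	aead_request_set_ad(req, 0);
	aead_request_set_crypt(req, &sg, &sg, textlen, iv);

	/* crypto_wait_req() handles the -EINPROGRESS path of async drivers. */
	err = crypto_wait_req(crypto_aead_encrypt(req), &wait);
	aead_request_free(req);
out:
	crypto_free_aead(tfm);
	return err;
}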
diff --git a/drivers/crypto/cavium/nitrox/nitrox_algs.c b/drivers/crypto/cavium/nitrox/nitrox_algs.c
index 2ae6124e5da673e1d73fe0141beed7adaf5e5360..d646ae5f29b0158ed4592e1dd9f7670bfe73df01 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_algs.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_algs.c
@@ -1,458 +1,24 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/crypto.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/printk.h>
-
-#include <crypto/aes.h>
-#include <crypto/skcipher.h>
-#include <crypto/ctr.h>
-#include <crypto/des.h>
-#include <crypto/xts.h>
-
-#include "nitrox_dev.h"
 #include "nitrox_common.h"
-#include "nitrox_req.h"
-
-#define PRIO 4001
-
-struct nitrox_cipher {
-	const char *name;
-	enum flexi_cipher value;
-};
-
-/**
- * supported cipher list
- */
-static const struct nitrox_cipher flexi_cipher_table[] = {
-	{ "null",		CIPHER_NULL },
-	{ "cbc(des3_ede)",	CIPHER_3DES_CBC },
-	{ "ecb(des3_ede)",	CIPHER_3DES_ECB },
-	{ "cbc(aes)",		CIPHER_AES_CBC },
-	{ "ecb(aes)",		CIPHER_AES_ECB },
-	{ "cfb(aes)",		CIPHER_AES_CFB },
-	{ "rfc3686(ctr(aes))",	CIPHER_AES_CTR },
-	{ "xts(aes)",		CIPHER_AES_XTS },
-	{ "cts(cbc(aes))",	CIPHER_AES_CBC_CTS },
-	{ NULL,			CIPHER_INVALID }
-};
-
-static enum flexi_cipher flexi_cipher_type(const char *name)
-{
-	const struct nitrox_cipher *cipher = flexi_cipher_table;
-
-	while (cipher->name) {
-		if (!strcmp(cipher->name, name))
-			break;
-		cipher++;
-	}
-	return cipher->value;
-}
-
-static int flexi_aes_keylen(int keylen)
-{
-	int aes_keylen;
-
-	switch (keylen) {
-	case AES_KEYSIZE_128:
-		aes_keylen = 1;
-		break;
-	case AES_KEYSIZE_192:
-		aes_keylen = 2;
-		break;
-	case AES_KEYSIZE_256:
-		aes_keylen = 3;
-		break;
-	default:
-		aes_keylen = -EINVAL;
-		break;
-	}
-	return aes_keylen;
-}
-
-static int nitrox_skcipher_init(struct crypto_skcipher *tfm)
-{
-	struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(tfm);
-	void *fctx;
-
-	/* get the first device */
-	nctx->ndev = nitrox_get_first_device();
-	if (!nctx->ndev)
-		return -ENODEV;
-
-	/* allocate nitrox crypto context */
-	fctx = crypto_alloc_context(nctx->ndev);
-	if (!fctx) {
-		nitrox_put_device(nctx->ndev);
-		return -ENOMEM;
-	}
-	nctx->u.ctx_handle = (uintptr_t)fctx;
-	crypto_skcipher_set_reqsize(tfm, crypto_skcipher_reqsize(tfm) +
-				    sizeof(struct nitrox_kcrypt_request));
-	return 0;
-}
-
-static void nitrox_skcipher_exit(struct crypto_skcipher *tfm)
-{
-	struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(tfm);
-
-	/* free the nitrox crypto context */
-	if (nctx->u.ctx_handle) {
-		struct flexi_crypto_context *fctx = nctx->u.fctx;
-
-		memset(&fctx->crypto, 0, sizeof(struct crypto_keys));
-		memset(&fctx->auth, 0, sizeof(struct auth_keys));
-		crypto_free_context((void *)fctx);
-	}
-	nitrox_put_device(nctx->ndev);
-
-	nctx->u.ctx_handle = 0;
-	nctx->ndev = NULL;
-}
 
-static inline int nitrox_skcipher_setkey(struct crypto_skcipher *cipher,
-					 int aes_keylen, const u8 *key,
-					 unsigned int keylen)
-{
-	struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher);
-	struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm);
-	struct flexi_crypto_context *fctx;
-	enum flexi_cipher cipher_type;
-	const char *name;
-
-	name = crypto_tfm_alg_name(tfm);
-	cipher_type = flexi_cipher_type(name);
-	if (unlikely(cipher_type == CIPHER_INVALID)) {
-		pr_err("unsupported cipher: %s\n", name);
-		return -EINVAL;
-	}
-
-	/* fill crypto context */
-	fctx = nctx->u.fctx;
-	fctx->flags = 0;
-	fctx->w0.cipher_type = cipher_type;
-	fctx->w0.aes_keylen = aes_keylen;
-	fctx->w0.iv_source = IV_FROM_DPTR;
-	fctx->flags = cpu_to_be64(*(u64 *)&fctx->w0);
-	/* copy the key to context */
-	memcpy(fctx->crypto.u.key, key, keylen);
-
-	return 0;
-}
-
-static int nitrox_aes_setkey(struct crypto_skcipher *cipher, const u8 *key,
-			     unsigned int keylen)
+int nitrox_crypto_register(void)
 {
-	int aes_keylen;
+	int err;
 
-	aes_keylen = flexi_aes_keylen(keylen);
-	if (aes_keylen < 0) {
-		crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
-	}
-	return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen);
-}
+	err = nitrox_register_skciphers();
+	if (err)
+		return err;
 
-static void nitrox_skcipher_callback(struct skcipher_request *skreq,
-				     int err)
-{
+	err = nitrox_register_aeads();
 	if (err) {
-		pr_err_ratelimited("request failed status 0x%0x\n", err);
-		err = -EINVAL;
-	}
-	skcipher_request_complete(skreq, err);
-}
-
-static int nitrox_skcipher_crypt(struct skcipher_request *skreq, bool enc)
-{
-	struct crypto_skcipher *cipher = crypto_skcipher_reqtfm(skreq);
-	struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(cipher);
-	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
-	int ivsize = crypto_skcipher_ivsize(cipher);
-	struct se_crypto_request *creq;
-
-	creq = &nkreq->creq;
-	creq->flags = skreq->base.flags;
-	creq->gfp = (skreq->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ?
-		     GFP_KERNEL : GFP_ATOMIC;
-
-	/* fill the request */
-	creq->ctrl.value = 0;
-	creq->opcode = FLEXI_CRYPTO_ENCRYPT_HMAC;
-	creq->ctrl.s.arg = (enc ? ENCRYPT : DECRYPT);
-	/* param0: length of the data to be encrypted */
-	creq->gph.param0 = cpu_to_be16(skreq->cryptlen);
-	creq->gph.param1 = 0;
-	/* param2: encryption data offset */
-	creq->gph.param2 = cpu_to_be16(ivsize);
-	creq->gph.param3 = 0;
-
-	creq->ctx_handle = nctx->u.ctx_handle;
-	creq->ctrl.s.ctxl = sizeof(struct flexi_crypto_context);
-
-	/* copy the iv */
-	memcpy(creq->iv, skreq->iv, ivsize);
-	creq->ivsize = ivsize;
-	creq->src = skreq->src;
-	creq->dst = skreq->dst;
-
-	nkreq->nctx = nctx;
-	nkreq->skreq = skreq;
-
-	/* send the crypto request */
-	return nitrox_process_se_request(nctx->ndev, creq,
-					 nitrox_skcipher_callback, skreq);
-}
-
-static int nitrox_aes_encrypt(struct skcipher_request *skreq)
-{
-	return nitrox_skcipher_crypt(skreq, true);
-}
-
-static int nitrox_aes_decrypt(struct skcipher_request *skreq)
-{
-	return nitrox_skcipher_crypt(skreq, false);
-}
-
-static int nitrox_3des_setkey(struct crypto_skcipher *cipher,
-			      const u8 *key, unsigned int keylen)
-{
-	if (keylen != DES3_EDE_KEY_SIZE) {
-		crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
-	}
-
-	return nitrox_skcipher_setkey(cipher, 0, key, keylen);
-}
-
-static int nitrox_3des_encrypt(struct skcipher_request *skreq)
-{
-	return nitrox_skcipher_crypt(skreq, true);
-}
-
-static int nitrox_3des_decrypt(struct skcipher_request *skreq)
-{
-	return nitrox_skcipher_crypt(skreq, false);
-}
-
-static int nitrox_aes_xts_setkey(struct crypto_skcipher *cipher,
-				 const u8 *key, unsigned int keylen)
-{
-	struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher);
-	struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm);
-	struct flexi_crypto_context *fctx;
-	int aes_keylen, ret;
-
-	ret = xts_check_key(tfm, key, keylen);
-	if (ret)
-		return ret;
-
-	keylen /= 2;
-
-	aes_keylen = flexi_aes_keylen(keylen);
-	if (aes_keylen < 0) {
-		crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
+		nitrox_unregister_skciphers();
+		return err;
 	}
 
-	fctx = nctx->u.fctx;
-	/* copy KEY2 */
-	memcpy(fctx->auth.u.key2, (key + keylen), keylen);
-
-	return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen);
-}
-
-static int nitrox_aes_ctr_rfc3686_setkey(struct crypto_skcipher *cipher,
-					 const u8 *key, unsigned int keylen)
-{
-	struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher);
-	struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm);
-	struct flexi_crypto_context *fctx;
-	int aes_keylen;
-
-	if (keylen < CTR_RFC3686_NONCE_SIZE)
-		return -EINVAL;
-
-	fctx = nctx->u.fctx;
-
-	memcpy(fctx->crypto.iv, key + (keylen - CTR_RFC3686_NONCE_SIZE),
-	       CTR_RFC3686_NONCE_SIZE);
-
-	keylen -= CTR_RFC3686_NONCE_SIZE;
-
-	aes_keylen = flexi_aes_keylen(keylen);
-	if (aes_keylen < 0) {
-		crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
-	}
-	return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen);
-}
-
-static struct skcipher_alg nitrox_skciphers[] = { {
-	.base = {
-		.cra_name = "cbc(aes)",
-		.cra_driver_name = "n5_cbc(aes)",
-		.cra_priority = PRIO,
-		.cra_flags = CRYPTO_ALG_ASYNC,
-		.cra_blocksize = AES_BLOCK_SIZE,
-		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
-		.cra_alignmask = 0,
-		.cra_module = THIS_MODULE,
-	},
-	.min_keysize = AES_MIN_KEY_SIZE,
-	.max_keysize = AES_MAX_KEY_SIZE,
-	.ivsize = AES_BLOCK_SIZE,
-	.setkey = nitrox_aes_setkey,
-	.encrypt = nitrox_aes_encrypt,
-	.decrypt = nitrox_aes_decrypt,
-	.init = nitrox_skcipher_init,
-	.exit = nitrox_skcipher_exit,
-}, {
-	.base = {
-		.cra_name = "ecb(aes)",
-		.cra_driver_name = "n5_ecb(aes)",
-		.cra_priority = PRIO,
-		.cra_flags = CRYPTO_ALG_ASYNC,
-		.cra_blocksize = AES_BLOCK_SIZE,
-		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
-		.cra_alignmask = 0,
-		.cra_module = THIS_MODULE,
-	},
-	.min_keysize = AES_MIN_KEY_SIZE,
-	.max_keysize = AES_MAX_KEY_SIZE,
-	.ivsize = AES_BLOCK_SIZE,
-	.setkey = nitrox_aes_setkey,
-	.encrypt = nitrox_aes_encrypt,
-	.decrypt = nitrox_aes_decrypt,
-	.init = nitrox_skcipher_init,
-	.exit = nitrox_skcipher_exit,
-}, {
-	.base = {
-		.cra_name = "cfb(aes)",
-		.cra_driver_name = "n5_cfb(aes)",
-		.cra_priority = PRIO,
-		.cra_flags = CRYPTO_ALG_ASYNC,
-		.cra_blocksize = AES_BLOCK_SIZE,
-		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
-		.cra_alignmask = 0,
-		.cra_module = THIS_MODULE,
-	},
-	.min_keysize = AES_MIN_KEY_SIZE,
-	.max_keysize = AES_MAX_KEY_SIZE,
-	.ivsize = AES_BLOCK_SIZE,
-	.setkey = nitrox_aes_setkey,
-	.encrypt = nitrox_aes_encrypt,
-	.decrypt = nitrox_aes_decrypt,
-	.init = nitrox_skcipher_init,
-	.exit = nitrox_skcipher_exit,
-}, {
-	.base = {
-		.cra_name = "xts(aes)",
-		.cra_driver_name = "n5_xts(aes)",
-		.cra_priority = PRIO,
-		.cra_flags = CRYPTO_ALG_ASYNC,
-		.cra_blocksize = AES_BLOCK_SIZE,
-		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
-		.cra_alignmask = 0,
-		.cra_module = THIS_MODULE,
-	},
-	.min_keysize = 2 * AES_MIN_KEY_SIZE,
-	.max_keysize = 2 * AES_MAX_KEY_SIZE,
-	.ivsize = AES_BLOCK_SIZE,
-	.setkey = nitrox_aes_xts_setkey,
-	.encrypt = nitrox_aes_encrypt,
-	.decrypt = nitrox_aes_decrypt,
-	.init = nitrox_skcipher_init,
-	.exit = nitrox_skcipher_exit,
-}, {
-	.base = {
-		.cra_name = "rfc3686(ctr(aes))",
-		.cra_driver_name = "n5_rfc3686(ctr(aes))",
-		.cra_priority = PRIO,
-		.cra_flags = CRYPTO_ALG_ASYNC,
-		.cra_blocksize = 1,
-		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
-		.cra_alignmask = 0,
-		.cra_module = THIS_MODULE,
-	},
-	.min_keysize = AES_MIN_KEY_SIZE + CTR_RFC3686_NONCE_SIZE,
-	.max_keysize = AES_MAX_KEY_SIZE + CTR_RFC3686_NONCE_SIZE,
-	.ivsize = CTR_RFC3686_IV_SIZE,
-	.init = nitrox_skcipher_init,
-	.exit = nitrox_skcipher_exit,
-	.setkey = nitrox_aes_ctr_rfc3686_setkey,
-	.encrypt = nitrox_aes_encrypt,
-	.decrypt = nitrox_aes_decrypt,
-}, {
-	.base = {
-		.cra_name = "cts(cbc(aes))",
-		.cra_driver_name = "n5_cts(cbc(aes))",
-		.cra_priority = PRIO,
-		.cra_flags = CRYPTO_ALG_ASYNC,
-		.cra_blocksize = AES_BLOCK_SIZE,
-		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
-		.cra_alignmask = 0,
-		.cra_type = &crypto_ablkcipher_type,
-		.cra_module = THIS_MODULE,
-	},
-	.min_keysize = AES_MIN_KEY_SIZE,
-	.max_keysize = AES_MAX_KEY_SIZE,
-	.ivsize = AES_BLOCK_SIZE,
-	.setkey = nitrox_aes_setkey,
-	.encrypt = nitrox_aes_encrypt,
-	.decrypt = nitrox_aes_decrypt,
-	.init = nitrox_skcipher_init,
-	.exit = nitrox_skcipher_exit,
-}, {
-	.base = {
-		.cra_name = "cbc(des3_ede)",
-		.cra_driver_name = "n5_cbc(des3_ede)",
-		.cra_priority = PRIO,
-		.cra_flags = CRYPTO_ALG_ASYNC,
-		.cra_blocksize = DES3_EDE_BLOCK_SIZE,
-		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
-		.cra_alignmask = 0,
-		.cra_module = THIS_MODULE,
-	},
-	.min_keysize = DES3_EDE_KEY_SIZE,
-	.max_keysize = DES3_EDE_KEY_SIZE,
-	.ivsize = DES3_EDE_BLOCK_SIZE,
-	.setkey = nitrox_3des_setkey,
-	.encrypt = nitrox_3des_encrypt,
-	.decrypt = nitrox_3des_decrypt,
-	.init = nitrox_skcipher_init,
-	.exit = nitrox_skcipher_exit,
-}, {
-	.base = {
-		.cra_name = "ecb(des3_ede)",
-		.cra_driver_name = "n5_ecb(des3_ede)",
-		.cra_priority = PRIO,
-		.cra_flags = CRYPTO_ALG_ASYNC,
-		.cra_blocksize = DES3_EDE_BLOCK_SIZE,
-		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
-		.cra_alignmask = 0,
-		.cra_module = THIS_MODULE,
-	},
-	.min_keysize = DES3_EDE_KEY_SIZE,
-	.max_keysize = DES3_EDE_KEY_SIZE,
-	.ivsize = DES3_EDE_BLOCK_SIZE,
-	.setkey = nitrox_3des_setkey,
-	.encrypt = nitrox_3des_encrypt,
-	.decrypt = nitrox_3des_decrypt,
-	.init = nitrox_skcipher_init,
-	.exit = nitrox_skcipher_exit,
-}
-
-};
-
-int nitrox_crypto_register(void)
-{
-	return crypto_register_skciphers(nitrox_skciphers,
-					 ARRAY_SIZE(nitrox_skciphers));
+	return 0;
 }
 
 void nitrox_crypto_unregister(void)
 {
-	crypto_unregister_skciphers(nitrox_skciphers,
-				    ARRAY_SIZE(nitrox_skciphers));
+	nitrox_unregister_aeads();
+	nitrox_unregister_skciphers();
 }
diff --git a/drivers/crypto/cavium/nitrox/nitrox_common.h b/drivers/crypto/cavium/nitrox/nitrox_common.h
index 863143a8336bf8579f48ca82e40d12bcfdf939a5..e4be69d7e6e5ccb0351334ba8fb4940efb2eae8a 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_common.h
+++ b/drivers/crypto/cavium/nitrox/nitrox_common.h
@@ -7,6 +7,10 @@
 
 int nitrox_crypto_register(void);
 void nitrox_crypto_unregister(void);
+int nitrox_register_aeads(void);
+void nitrox_unregister_aeads(void);
+int nitrox_register_skciphers(void);
+void nitrox_unregister_skciphers(void);
 void *crypto_alloc_context(struct nitrox_device *ndev);
 void crypto_free_context(void *ctx);
 struct nitrox_device *nitrox_get_first_device(void);
@@ -19,7 +23,7 @@ void pkt_slc_resp_tasklet(unsigned long data);
 int nitrox_process_se_request(struct nitrox_device *ndev,
 			      struct se_crypto_request *req,
 			      completion_t cb,
-			      struct skcipher_request *skreq);
+			      void *cb_arg);
 void backlog_qflush_work(struct work_struct *work);
 
 
diff --git a/drivers/crypto/cavium/nitrox/nitrox_csr.h b/drivers/crypto/cavium/nitrox/nitrox_csr.h
index 1ad27b1a87c54fb5e61c2712fcbd000242488bf7..a2a452642b3865a93db67fe2c0f53a7985524d92 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_csr.h
+++ b/drivers/crypto/cavium/nitrox/nitrox_csr.h
@@ -54,7 +54,13 @@
 #define NPS_STATS_PKT_DMA_WR_CNT	0x1000190
 
 /* NPS packet registers */
-#define NPS_PKT_INT				0x1040018
+#define NPS_PKT_INT			0x1040018
+#define NPS_PKT_MBOX_INT_LO		0x1040020
+#define NPS_PKT_MBOX_INT_LO_ENA_W1C	0x1040030
+#define NPS_PKT_MBOX_INT_LO_ENA_W1S	0x1040038
+#define NPS_PKT_MBOX_INT_HI		0x1040040
+#define NPS_PKT_MBOX_INT_HI_ENA_W1C	0x1040050
+#define NPS_PKT_MBOX_INT_HI_ENA_W1S	0x1040058
 #define NPS_PKT_IN_RERR_HI		0x1040108
 #define NPS_PKT_IN_RERR_HI_ENA_W1S	0x1040120
 #define NPS_PKT_IN_RERR_LO		0x1040128
@@ -74,6 +80,10 @@
 #define NPS_PKT_SLC_RERR_LO_ENA_W1S	0x1040240
 #define NPS_PKT_SLC_ERR_TYPE		0x1040248
 #define NPS_PKT_SLC_ERR_TYPE_ENA_W1S	0x1040260
+/* Mailbox PF->VF PF Accessible Data registers */
+#define NPS_PKT_MBOX_PF_VF_PFDATAX(_i)	(0x1040800 + ((_i) * 0x8))
+#define NPS_PKT_MBOX_VF_PF_PFDATAX(_i)	(0x1040C00 + ((_i) * 0x8))
+
 #define NPS_PKT_SLC_CTLX(_i)		(0x10000 + ((_i) * 0x40000))
 #define NPS_PKT_SLC_CNTSX(_i)		(0x10008 + ((_i) * 0x40000))
 #define NPS_PKT_SLC_INT_LEVELSX(_i)	(0x10010 + ((_i) * 0x40000))
diff --git a/drivers/crypto/cavium/nitrox/nitrox_debugfs.c b/drivers/crypto/cavium/nitrox/nitrox_debugfs.c
index 5f3cd5fafe048b9f204ccf821db87c7ffffd4a9e..0196b992280f7eb7d82dfd6153f4d522dc814fc1 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_debugfs.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_debugfs.c
@@ -13,18 +13,7 @@ static int firmware_show(struct seq_file *s, void *v)
 	return 0;
 }
 
-static int firmware_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, firmware_show, inode->i_private);
-}
-
-static const struct file_operations firmware_fops = {
-	.owner = THIS_MODULE,
-	.open = firmware_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(firmware);
 
 static int device_show(struct seq_file *s, void *v)
 {
@@ -41,18 +30,7 @@ static int device_show(struct seq_file *s, void *v)
 	return 0;
 }
 
-static int nitrox_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, device_show, inode->i_private);
-}
-
-static const struct file_operations nitrox_fops = {
-	.owner = THIS_MODULE,
-	.open = nitrox_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(device);
 
 static int stats_show(struct seq_file *s, void *v)
 {
@@ -69,18 +47,7 @@ static int stats_show(struct seq_file *s, void *v)
 	return 0;
 }
 
-static int nitrox_stats_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, stats_show, inode->i_private);
-}
-
-static const struct file_operations nitrox_stats_fops = {
-	.owner = THIS_MODULE,
-	.open = nitrox_stats_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(stats);
 
 void nitrox_debugfs_exit(struct nitrox_device *ndev)
 {
@@ -97,13 +64,16 @@ int nitrox_debugfs_init(struct nitrox_device *ndev)
 		return -ENOMEM;
 
 	ndev->debugfs_dir = dir;
-	f = debugfs_create_file("firmware", 0400, dir, ndev, &firmware_fops);
+	f = debugfs_create_file("firmware", 0400, dir, ndev,
+				&firmware_fops);
 	if (!f)
 		goto err;
-	f = debugfs_create_file("device", 0400, dir, ndev, &nitrox_fops);
+	f = debugfs_create_file("device", 0400, dir, ndev,
+				&device_fops);
 	if (!f)
 		goto err;
-	f = debugfs_create_file("stats", 0400, dir, ndev, &nitrox_stats_fops);
+	f = debugfs_create_file("stats", 0400, dir, ndev,
+				&stats_fops);
 	if (!f)
 		goto err;
 
diff --git a/drivers/crypto/cavium/nitrox/nitrox_debugfs.h b/drivers/crypto/cavium/nitrox/nitrox_debugfs.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8d85ffa619ca4dd1a784cff2d282e5947342158
--- /dev/null
+++ b/drivers/crypto/cavium/nitrox/nitrox_debugfs.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NITROX_DEBUGFS_H
+#define __NITROX_DEBUGFS_H
+
+#include "nitrox_dev.h"
+
+#ifdef CONFIG_DEBUG_FS
+int nitrox_debugfs_init(struct nitrox_device *ndev);
+void nitrox_debugfs_exit(struct nitrox_device *ndev);
+#else
+static inline int nitrox_debugfs_init(struct nitrox_device *ndev)
+{
+	return 0;
+}
+
+static inline void nitrox_debugfs_exit(struct nitrox_device *ndev)
+{
+}
+#endif /* !CONFIG_DEBUG_FS */
+
+#endif /* __NITROX_DEBUGFS_H */
diff --git a/drivers/crypto/cavium/nitrox/nitrox_dev.h b/drivers/crypto/cavium/nitrox/nitrox_dev.h
index 283e252385fbc3b8a3fe38fbbfa34a796d87d4a9..0338877b828f9844336fd0ee260390787291b315 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_dev.h
+++ b/drivers/crypto/cavium/nitrox/nitrox_dev.h
@@ -8,6 +8,8 @@
 #include <linux/if.h>
 
 #define VERSION_LEN 32
+/* Maximum queues in PF mode */
+#define MAX_PF_QUEUES	64
 
 /**
  * struct nitrox_cmdq - NITROX command queue
@@ -103,6 +105,61 @@ struct nitrox_q_vector {
 	};
 };
 
+/**
+ * mbox_msg - Mailbox message data
+ * @type: message type
+ * @opcode: message opcode
+ * @data: message data
+ */
+union mbox_msg {
+	u64 value;
+	struct {
+		u64 type: 2;
+		u64 opcode: 6;
+		u64 data: 58;
+	};
+	struct {
+		u64 type: 2;
+		u64 opcode: 6;
+		u64 chipid: 8;
+		u64 vfid: 8;
+	} id;
+};
+
+/**
+ * nitrox_vfdev - NITROX VF device instance in PF
+ * @state: VF device state
+ * @vfno: VF number
+ * @nr_queues: number of queues enabled in VF
+ * @ring: ring to communicate with VF
+ * @msg: Mailbox message data from VF
+ * @mbx_resp: Mailbox response counter
+ */
+struct nitrox_vfdev {
+	atomic_t state;
+	int vfno;
+	int nr_queues;
+	int ring;
+	union mbox_msg msg;
+	atomic64_t mbx_resp;
+};
+
+/**
+ * struct nitrox_iov - SR-IOV information
+ * @num_vfs: number of VF(s) enabled
+ * @max_vf_queues: Maximum number of queues allowed for VF
+ * @vfdev: VF(s) devices
+ * @pf2vf_wq: workqueue for PF2VF communication
+ * @msix: MSI-X entry for PF in SR-IOV case
+ */
+struct nitrox_iov {
+	int num_vfs;
+	int max_vf_queues;
+	struct nitrox_vfdev *vfdev;
+	struct workqueue_struct *pf2vf_wq;
+	struct msix_entry msix;
+};
+
 /*
  * NITROX Device states
  */
@@ -150,6 +207,9 @@ enum vf_mode {
  * @ctx_pool: DMA pool for crypto context
  * @pkt_inq: Packet input rings
  * @qvec: MSI-X queue vectors information
+ * @iov: SR-IOV information
+ * @num_vecs: number of MSI-X vectors
+ * @stats: request statistics
  * @hw: hardware information
  * @debugfs_dir: debugfs directory
  */
@@ -168,13 +228,13 @@ struct nitrox_device {
 	int node;
 	u16 qlen;
 	u16 nr_queues;
-	int num_vfs;
 	enum vf_mode mode;
 
 	struct dma_pool *ctx_pool;
 	struct nitrox_cmdq *pkt_inq;
 
 	struct nitrox_q_vector *qvec;
+	struct nitrox_iov iov;
 	int num_vecs;
 
 	struct nitrox_stats stats;
@@ -213,17 +273,9 @@ static inline bool nitrox_ready(struct nitrox_device *ndev)
 	return atomic_read(&ndev->state) == __NDEV_READY;
 }
 
-#ifdef CONFIG_DEBUG_FS
-int nitrox_debugfs_init(struct nitrox_device *ndev);
-void nitrox_debugfs_exit(struct nitrox_device *ndev);
-#else
-static inline int nitrox_debugfs_init(struct nitrox_device *ndev)
+static inline bool nitrox_vfdev_ready(struct nitrox_vfdev *vfdev)
 {
-	return 0;
+	return atomic_read(&vfdev->state) == __NDEV_READY;
 }
 
-static inline void nitrox_debugfs_exit(struct nitrox_device *ndev)
-{ }
-#endif
-
 #endif /* __NITROX_DEV_H */
diff --git a/drivers/crypto/cavium/nitrox/nitrox_hal.c b/drivers/crypto/cavium/nitrox/nitrox_hal.c
index a9b82387cf532f5fd78dcdb9c5d3e93a7165ce3f..c08d9f33a3b154233b9ef805b5daebc7e2ae2cbd 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_hal.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_hal.c
@@ -5,10 +5,11 @@
 #include "nitrox_csr.h"
 
 #define PLL_REF_CLK 50
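+/* upper bound on CSR status polling loops; each retry waits ~50us */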
+#define MAX_CSR_RETRIES 10
 
 /**
  * emu_enable_cores - Enable EMU cluster cores.
- * @ndev: N5 device
+ * @ndev: NITROX device
  */
 static void emu_enable_cores(struct nitrox_device *ndev)
 {
@@ -33,7 +34,7 @@ static void emu_enable_cores(struct nitrox_device *ndev)
 
 /**
  * nitrox_config_emu_unit - configure EMU unit.
- * @ndev: N5 device
+ * @ndev: NITROX device
  */
 void nitrox_config_emu_unit(struct nitrox_device *ndev)
 {
@@ -63,29 +64,26 @@ void nitrox_config_emu_unit(struct nitrox_device *ndev)
 static void reset_pkt_input_ring(struct nitrox_device *ndev, int ring)
 {
 	union nps_pkt_in_instr_ctl pkt_in_ctl;
-	union nps_pkt_in_instr_baoff_dbell pkt_in_dbell;
 	union nps_pkt_in_done_cnts pkt_in_cnts;
+	int max_retries = MAX_CSR_RETRIES;
 	u64 offset;
 
+	/* step 1: disable the ring, clear enable bit */
 	offset = NPS_PKT_IN_INSTR_CTLX(ring);
-	/* disable the ring */
 	pkt_in_ctl.value = nitrox_read_csr(ndev, offset);
 	pkt_in_ctl.s.enb = 0;
 	nitrox_write_csr(ndev, offset, pkt_in_ctl.value);
-	usleep_range(100, 150);
 
-	/* wait to clear [ENB] */
+	/* step 2: wait to clear [ENB] */
+	usleep_range(100, 150);
 	do {
 		pkt_in_ctl.value = nitrox_read_csr(ndev, offset);
-	} while (pkt_in_ctl.s.enb);
-
-	/* clear off door bell counts */
-	offset = NPS_PKT_IN_INSTR_BAOFF_DBELLX(ring);
-	pkt_in_dbell.value = 0;
-	pkt_in_dbell.s.dbell = 0xffffffff;
-	nitrox_write_csr(ndev, offset, pkt_in_dbell.value);
+		if (!pkt_in_ctl.s.enb)
+			break;
+		udelay(50);
+	} while (max_retries--);
 
-	/* clear done counts */
+	/* step 3: clear done counts */
 	offset = NPS_PKT_IN_DONE_CNTSX(ring);
 	pkt_in_cnts.value = nitrox_read_csr(ndev, offset);
 	nitrox_write_csr(ndev, offset, pkt_in_cnts.value);
@@ -95,6 +93,7 @@ static void reset_pkt_input_ring(struct nitrox_device *ndev, int ring)
 void enable_pkt_input_ring(struct nitrox_device *ndev, int ring)
 {
 	union nps_pkt_in_instr_ctl pkt_in_ctl;
+	int max_retries = MAX_CSR_RETRIES;
 	u64 offset;
 
 	/* 64-byte instruction size */
@@ -107,12 +106,15 @@ void enable_pkt_input_ring(struct nitrox_device *ndev, int ring)
 	/* wait for set [ENB] */
 	do {
 		pkt_in_ctl.value = nitrox_read_csr(ndev, offset);
-	} while (!pkt_in_ctl.s.enb);
+		if (pkt_in_ctl.s.enb)
+			break;
+		udelay(50);
+	} while (max_retries--);
 }
 
 /**
  * nitrox_config_pkt_input_rings - configure Packet Input Rings
- * @ndev: N5 device
+ * @ndev: NITROX device
  */
 void nitrox_config_pkt_input_rings(struct nitrox_device *ndev)
 {
@@ -121,11 +123,14 @@ void nitrox_config_pkt_input_rings(struct nitrox_device *ndev)
 	for (i = 0; i < ndev->nr_queues; i++) {
 		struct nitrox_cmdq *cmdq = &ndev->pkt_inq[i];
 		union nps_pkt_in_instr_rsize pkt_in_rsize;
+		union nps_pkt_in_instr_baoff_dbell pkt_in_dbell;
 		u64 offset;
 
 		reset_pkt_input_ring(ndev, i);
 
-		/* configure ring base address 16-byte aligned,
+		/*
+		 * step 4:
+		 * configure ring base address 16-byte aligned,
 		 * size and interrupt threshold.
 		 */
 		offset = NPS_PKT_IN_INSTR_BADDRX(i);
@@ -141,6 +146,13 @@ void nitrox_config_pkt_input_rings(struct nitrox_device *ndev)
 		offset = NPS_PKT_IN_INT_LEVELSX(i);
 		nitrox_write_csr(ndev, offset, 0xffffffff);
 
+		/* step 5: clear off door bell counts */
+		offset = NPS_PKT_IN_INSTR_BAOFF_DBELLX(i);
+		pkt_in_dbell.value = 0;
+		pkt_in_dbell.s.dbell = 0xffffffff;
+		nitrox_write_csr(ndev, offset, pkt_in_dbell.value);
+
+		/* enable the ring */
 		enable_pkt_input_ring(ndev, i);
 	}
 }
@@ -149,21 +161,26 @@ static void reset_pkt_solicit_port(struct nitrox_device *ndev, int port)
 {
 	union nps_pkt_slc_ctl pkt_slc_ctl;
 	union nps_pkt_slc_cnts pkt_slc_cnts;
+	int max_retries = MAX_CSR_RETRIES;
 	u64 offset;
 
-	/* disable slc port */
+	/* step 1: disable slc port */
 	offset = NPS_PKT_SLC_CTLX(port);
 	pkt_slc_ctl.value = nitrox_read_csr(ndev, offset);
 	pkt_slc_ctl.s.enb = 0;
 	nitrox_write_csr(ndev, offset, pkt_slc_ctl.value);
-	usleep_range(100, 150);
 
+	/* step 2 */
+	usleep_range(100, 150);
 	/* wait to clear [ENB] */
 	do {
 		pkt_slc_ctl.value = nitrox_read_csr(ndev, offset);
-	} while (pkt_slc_ctl.s.enb);
+		if (!pkt_slc_ctl.s.enb)
+			break;
+		udelay(50);
+	} while (max_retries--);
 
-	/* clear slc counters */
+	/* step 3: clear slc counters */
 	offset = NPS_PKT_SLC_CNTSX(port);
 	pkt_slc_cnts.value = nitrox_read_csr(ndev, offset);
 	nitrox_write_csr(ndev, offset, pkt_slc_cnts.value);
@@ -173,12 +190,12 @@ static void reset_pkt_solicit_port(struct nitrox_device *ndev, int port)
 void enable_pkt_solicit_port(struct nitrox_device *ndev, int port)
 {
 	union nps_pkt_slc_ctl pkt_slc_ctl;
+	int max_retries = MAX_CSR_RETRIES;
 	u64 offset;
 
 	offset = NPS_PKT_SLC_CTLX(port);
 	pkt_slc_ctl.value = 0;
 	pkt_slc_ctl.s.enb = 1;
-
 	/*
 	 * 8 trailing 0x00 bytes will be added
 	 * to the end of the outgoing packet.
@@ -191,23 +208,27 @@ void enable_pkt_solicit_port(struct nitrox_device *ndev, int port)
 	/* wait to set [ENB] */
 	do {
 		pkt_slc_ctl.value = nitrox_read_csr(ndev, offset);
-	} while (!pkt_slc_ctl.s.enb);
+		if (pkt_slc_ctl.s.enb)
+			break;
+		udelay(50);
+	} while (max_retries--);
 }
 
-static void config_single_pkt_solicit_port(struct nitrox_device *ndev,
-					   int port)
+static void config_pkt_solicit_port(struct nitrox_device *ndev, int port)
 {
 	union nps_pkt_slc_int_levels pkt_slc_int;
 	u64 offset;
 
 	reset_pkt_solicit_port(ndev, port);
 
+	/* step 4: configure interrupt levels */
 	offset = NPS_PKT_SLC_INT_LEVELSX(port);
 	pkt_slc_int.value = 0;
 	/* time interrupt threshold */
 	pkt_slc_int.s.timet = 0x3fffff;
 	nitrox_write_csr(ndev, offset, pkt_slc_int.value);
 
+	/* enable the solicit port */
 	enable_pkt_solicit_port(ndev, port);
 }
 
@@ -216,12 +237,12 @@ void nitrox_config_pkt_solicit_ports(struct nitrox_device *ndev)
 	int i;
 
 	for (i = 0; i < ndev->nr_queues; i++)
-		config_single_pkt_solicit_port(ndev, i);
+		config_pkt_solicit_port(ndev, i);
 }
 
 /**
  * enable_nps_interrupts - enable NPS interrutps
- * @ndev: N5 device.
+ * @ndev: NITROX device.
  *
  * This includes NPS core, packet in and slc interrupts.
  */
@@ -284,8 +305,8 @@ void nitrox_config_pom_unit(struct nitrox_device *ndev)
 }
 
 /**
- * nitrox_config_rand_unit - enable N5 random number unit
- * @ndev: N5 device
+ * nitrox_config_rand_unit - enable NITROX random number unit
+ * @ndev: NITROX device
  */
 void nitrox_config_rand_unit(struct nitrox_device *ndev)
 {
@@ -361,6 +382,7 @@ void invalidate_lbc(struct nitrox_device *ndev)
 {
 	union lbc_inval_ctl lbc_ctl;
 	union lbc_inval_status lbc_stat;
+	int max_retries = MAX_CSR_RETRIES;
 	u64 offset;
 
 	/* invalidate LBC */
@@ -370,10 +392,12 @@ void invalidate_lbc(struct nitrox_device *ndev)
 	nitrox_write_csr(ndev, offset, lbc_ctl.value);
 
 	offset = LBC_INVAL_STATUS;
-
 	do {
 		lbc_stat.value = nitrox_read_csr(ndev, offset);
-	} while (!lbc_stat.s.done);
+		if (lbc_stat.s.done)
+			break;
+		udelay(50);
+	} while (max_retries--);
 }
 
 void nitrox_config_lbc_unit(struct nitrox_device *ndev)
@@ -467,3 +491,31 @@ void nitrox_get_hwinfo(struct nitrox_device *ndev)
 	/* copy partname */
 	strncpy(ndev->hw.partname, name, sizeof(ndev->hw.partname));
 }
+
+void enable_pf2vf_mbox_interrupts(struct nitrox_device *ndev)
+{
+	u64 value = ~0ULL;
+	u64 reg_addr;
+
+	/* Mailbox interrupt low enable set register */
+	reg_addr = NPS_PKT_MBOX_INT_LO_ENA_W1S;
+	nitrox_write_csr(ndev, reg_addr, value);
+
+	/* Mailbox interrupt high enable set register */
+	reg_addr = NPS_PKT_MBOX_INT_HI_ENA_W1S;
+	nitrox_write_csr(ndev, reg_addr, value);
+}
+
+void disable_pf2vf_mbox_interrupts(struct nitrox_device *ndev)
+{
+	u64 value = ~0ULL;
+	u64 reg_addr;
+
+	/* Mailbox interrupt low enable clear register */
+	reg_addr = NPS_PKT_MBOX_INT_LO_ENA_W1C;
+	nitrox_write_csr(ndev, reg_addr, value);
+
+	/* Mailbox interrupt high enable clear register */
+	reg_addr = NPS_PKT_MBOX_INT_HI_ENA_W1C;
+	nitrox_write_csr(ndev, reg_addr, value);
+}
diff --git a/drivers/crypto/cavium/nitrox/nitrox_hal.h b/drivers/crypto/cavium/nitrox/nitrox_hal.h
index 489ee64c119eddd3314eb34735b03a46759ba755..d6606418ba3822f93a7d6ab9057718997fd79f33 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_hal.h
+++ b/drivers/crypto/cavium/nitrox/nitrox_hal.h
@@ -19,5 +19,7 @@ void enable_pkt_input_ring(struct nitrox_device *ndev, int ring);
 void enable_pkt_solicit_port(struct nitrox_device *ndev, int port);
 void config_nps_core_vfcfg_mode(struct nitrox_device *ndev, enum vf_mode mode);
 void nitrox_get_hwinfo(struct nitrox_device *ndev);
+void enable_pf2vf_mbox_interrupts(struct nitrox_device *ndev);
+void disable_pf2vf_mbox_interrupts(struct nitrox_device *ndev);
 
 #endif /* __NITROX_HAL_H */
diff --git a/drivers/crypto/cavium/nitrox/nitrox_isr.c b/drivers/crypto/cavium/nitrox/nitrox_isr.c
index 88a77b8fb3fbd747e23438b054829a1bc10ef7db..3dec570a190ad5efca0696831a281307f1f185a3 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_isr.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_isr.c
@@ -7,12 +7,14 @@
 #include "nitrox_csr.h"
 #include "nitrox_common.h"
 #include "nitrox_hal.h"
+#include "nitrox_mbx.h"
 
 /**
  * One vector for each type of ring
  *  - NPS packet ring, AQMQ ring and ZQMQ ring
  */
 #define NR_RING_VECTORS 3
+#define NR_NON_RING_VECTORS 1
 /* base entry for packet ring/port */
 #define PKT_RING_MSIX_BASE 0
 #define NON_RING_MSIX_BASE 192
@@ -219,7 +221,8 @@ static void nps_core_int_tasklet(unsigned long data)
  */
 static irqreturn_t nps_core_int_isr(int irq, void *data)
 {
-	struct nitrox_device *ndev = data;
+	struct nitrox_q_vector *qvec = data;
+	struct nitrox_device *ndev = qvec->ndev;
 	union nps_core_int_active core_int;
 
 	core_int.value = nitrox_read_csr(ndev, NPS_CORE_INT_ACTIVE);
@@ -245,6 +248,10 @@ static irqreturn_t nps_core_int_isr(int irq, void *data)
 	if (core_int.s.bmi)
 		clear_bmi_err_intr(ndev);
 
+	/* Mailbox interrupt */
+	if (core_int.s.mbox)
+		nitrox_pf2vf_mbox_handler(ndev);
+
 	/* If more work callback the ISR, set resend */
 	core_int.s.resend = 1;
 	nitrox_write_csr(ndev, NPS_CORE_INT_ACTIVE, core_int.value);
@@ -275,6 +282,7 @@ void nitrox_unregister_interrupts(struct nitrox_device *ndev)
 		qvec->valid = false;
 	}
 	kfree(ndev->qvec);
+	ndev->qvec = NULL;
 	pci_free_irq_vectors(pdev);
 }
 
@@ -321,6 +329,7 @@ int nitrox_register_interrupts(struct nitrox_device *ndev)
 		if (qvec->ring >= ndev->nr_queues)
 			break;
 
+		qvec->cmdq = &ndev->pkt_inq[qvec->ring];
 		snprintf(qvec->name, IRQ_NAMESZ, "nitrox-pkt%d", qvec->ring);
 		/* get the vector number */
 		vec = pci_irq_vector(pdev, i);
@@ -335,13 +344,13 @@ int nitrox_register_interrupts(struct nitrox_device *ndev)
 
 		tasklet_init(&qvec->resp_tasklet, pkt_slc_resp_tasklet,
 			     (unsigned long)qvec);
-		qvec->cmdq = &ndev->pkt_inq[qvec->ring];
 		qvec->valid = true;
 	}
 
 	/* request irqs for non ring vectors */
 	i = NON_RING_MSIX_BASE;
 	qvec = &ndev->qvec[i];
+	qvec->ndev = ndev;
 
 	snprintf(qvec->name, IRQ_NAMESZ, "nitrox-core-int%d", i);
 	/* get the vector number */
@@ -356,7 +365,6 @@ int nitrox_register_interrupts(struct nitrox_device *ndev)
 
 	tasklet_init(&qvec->resp_tasklet, nps_core_int_tasklet,
 		     (unsigned long)qvec);
-	qvec->ndev = ndev;
 	qvec->valid = true;
 
 	return 0;
@@ -365,3 +373,81 @@ int nitrox_register_interrupts(struct nitrox_device *ndev)
 	nitrox_unregister_interrupts(ndev);
 	return ret;
 }
+
+void nitrox_sriov_unregister_interrupts(struct nitrox_device *ndev)
+{
+	struct pci_dev *pdev = ndev->pdev;
+	int i;
+
+	for (i = 0; i < ndev->num_vecs; i++) {
+		struct nitrox_q_vector *qvec;
+		int vec;
+
+		qvec = ndev->qvec + i;
+		if (!qvec->valid)
+			continue;
+
+		vec = ndev->iov.msix.vector;
+		irq_set_affinity_hint(vec, NULL);
+		free_irq(vec, qvec);
+
+		tasklet_disable(&qvec->resp_tasklet);
+		tasklet_kill(&qvec->resp_tasklet);
+		qvec->valid = false;
+	}
+	kfree(ndev->qvec);
+	ndev->qvec = NULL;
+	pci_disable_msix(pdev);
+}
+
+int nitrox_sriov_register_interupts(struct nitrox_device *ndev)
+{
+	struct pci_dev *pdev = ndev->pdev;
+	struct nitrox_q_vector *qvec;
+	int vec, cpu;
+	int ret;
+
+	/*
+	 * only the non-ring vector, i.e. entry 192, is available
+	 * to the PF in SR-IOV mode.
+	 */
+	ndev->iov.msix.entry = NON_RING_MSIX_BASE;
+	ret = pci_enable_msix_exact(pdev, &ndev->iov.msix, NR_NON_RING_VECTORS);
+	if (ret) {
+		dev_err(DEV(ndev), "failed to allocate nps-core-int%d\n",
+			NON_RING_MSIX_BASE);
+		return ret;
+	}
+
+	qvec = kcalloc(NR_NON_RING_VECTORS, sizeof(*qvec), GFP_KERNEL);
+	if (!qvec) {
+		pci_disable_msix(pdev);
+		return -ENOMEM;
+	}
+	qvec->ndev = ndev;
+
+	ndev->qvec = qvec;
+	ndev->num_vecs = NR_NON_RING_VECTORS;
+	snprintf(qvec->name, IRQ_NAMESZ, "nitrox-core-int%d",
+		 NON_RING_MSIX_BASE);
+
+	vec = ndev->iov.msix.vector;
+	ret = request_irq(vec, nps_core_int_isr, 0, qvec->name, qvec);
+	if (ret) {
+		dev_err(DEV(ndev), "irq failed for nitrox-core-int%d\n",
+			NON_RING_MSIX_BASE);
+		goto iov_irq_fail;
+	}
+	cpu = num_online_cpus();
+	irq_set_affinity_hint(vec, get_cpu_mask(cpu));
+
+	tasklet_init(&qvec->resp_tasklet, nps_core_int_tasklet,
+		     (unsigned long)qvec);
+	qvec->valid = true;
+
+	return 0;
+
+iov_irq_fail:
+	nitrox_sriov_unregister_interrupts(ndev);
+	return ret;
+}
diff --git a/drivers/crypto/cavium/nitrox/nitrox_isr.h b/drivers/crypto/cavium/nitrox/nitrox_isr.h
index 63418a6cc52c9f54b818e7503679122aad0f7881..1062c9336c1f4c9c40abc337e8180c60acee04b7 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_isr.h
+++ b/drivers/crypto/cavium/nitrox/nitrox_isr.h
@@ -6,5 +6,7 @@
 
 int nitrox_register_interrupts(struct nitrox_device *ndev);
 void nitrox_unregister_interrupts(struct nitrox_device *ndev);
+int nitrox_sriov_register_interupts(struct nitrox_device *ndev);
+void nitrox_sriov_unregister_interrupts(struct nitrox_device *ndev);
 
 #endif /* __NITROX_ISR_H */
diff --git a/drivers/crypto/cavium/nitrox/nitrox_lib.c b/drivers/crypto/cavium/nitrox/nitrox_lib.c
index 2260efa4230833ea0d4fcb8eecd26170c30816a7..9138bae12521206c980378d1cdf0c6985963d078 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_lib.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_lib.c
@@ -158,12 +158,19 @@ static void destroy_crypto_dma_pool(struct nitrox_device *ndev)
 void *crypto_alloc_context(struct nitrox_device *ndev)
 {
 	struct ctx_hdr *ctx;
+	struct crypto_ctx_hdr *chdr;
 	void *vaddr;
 	dma_addr_t dma;
 
+	chdr = kmalloc(sizeof(*chdr), GFP_KERNEL);
+	if (!chdr)
+		return NULL;
+
 	vaddr = dma_pool_zalloc(ndev->ctx_pool, GFP_KERNEL, &dma);
-	if (!vaddr)
+	if (!vaddr) {
+		kfree(chdr);
 		return NULL;
+	}
 
 	/* fill meta data */
 	ctx = vaddr;
@@ -171,7 +178,11 @@ void *crypto_alloc_context(struct nitrox_device *ndev)
 	ctx->dma = dma;
 	ctx->ctx_dma = dma + sizeof(struct ctx_hdr);
 
-	return ((u8 *)vaddr + sizeof(struct ctx_hdr));
+	chdr->pool = ndev->ctx_pool;
+	chdr->dma = dma;
+	chdr->vaddr = vaddr;
+
+	return chdr;
 }
 
 /**
@@ -180,13 +191,14 @@ void *crypto_alloc_context(struct nitrox_device *ndev)
  */
 void crypto_free_context(void *ctx)
 {
-	struct ctx_hdr *ctxp;
+	struct crypto_ctx_hdr *ctxp;
 
 	if (!ctx)
 		return;
 
-	ctxp = (struct ctx_hdr *)((u8 *)ctx - sizeof(struct ctx_hdr));
-	dma_pool_free(ctxp->pool, ctxp, ctxp->dma);
+	ctxp = ctx;
+	dma_pool_free(ctxp->pool, ctxp->vaddr, ctxp->dma);
+	kfree(ctxp);
 }
 
 /**
diff --git a/drivers/crypto/cavium/nitrox/nitrox_main.c b/drivers/crypto/cavium/nitrox/nitrox_main.c
index 6595c95af9f1911ad462e545c4f01044827f8388..014e9863c20e2d4143249c53a52ae6cf49e38729 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_main.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_main.c
@@ -1,6 +1,5 @@
 #include <linux/aer.h>
 #include <linux/delay.h>
-#include <linux/debugfs.h>
 #include <linux/firmware.h>
 #include <linux/list.h>
 #include <linux/module.h>
@@ -13,9 +12,9 @@
 #include "nitrox_csr.h"
 #include "nitrox_hal.h"
 #include "nitrox_isr.h"
+#include "nitrox_debugfs.h"
 
 #define CNN55XX_DEV_ID	0x12
-#define MAX_PF_QUEUES	64
 #define UCODE_HLEN 48
 #define SE_GROUP 0
 
diff --git a/drivers/crypto/cavium/nitrox/nitrox_mbx.c b/drivers/crypto/cavium/nitrox/nitrox_mbx.c
new file mode 100644
index 0000000000000000000000000000000000000000..02ee950648413ef8254ccf354b5f6f6458952238
--- /dev/null
+++ b/drivers/crypto/cavium/nitrox/nitrox_mbx.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/workqueue.h>
+
+#include "nitrox_csr.h"
+#include "nitrox_hal.h"
+#include "nitrox_dev.h"
+
+#define RING_TO_VFNO(_x, _y)	((_x) / (_y))
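+/* each VF owns a contiguous block of rings; ring / queues-per-VF gives the VF number */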
+
+/**
+ * mbx_msg_type - Mailbox message types
+ */
+enum mbx_msg_type {
+	MBX_MSG_TYPE_NOP,
+	MBX_MSG_TYPE_REQ,
+	MBX_MSG_TYPE_ACK,
+	MBX_MSG_TYPE_NACK,
+};
+
+/**
+ * mbx_msg_opcode - Mailbox message opcodes
+ */
+enum mbx_msg_opcode {
+	MSG_OP_VF_MODE = 1,
+	MSG_OP_VF_UP,
+	MSG_OP_VF_DOWN,
+	MSG_OP_CHIPID_VFID,
+};
+
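+/* deferred work item used to answer a VF mailbox request from process context */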
+struct pf2vf_work {
+	struct nitrox_vfdev *vfdev;
+	struct nitrox_device *ndev;
+	struct work_struct pf2vf_resp;
+};
+
+static inline u64 pf2vf_read_mbox(struct nitrox_device *ndev, int ring)
+{
+	u64 reg_addr;
+
+	reg_addr = NPS_PKT_MBOX_VF_PF_PFDATAX(ring);
+	return nitrox_read_csr(ndev, reg_addr);
+}
+
+static inline void pf2vf_write_mbox(struct nitrox_device *ndev, u64 value,
+				    int ring)
+{
+	u64 reg_addr;
+
+	reg_addr = NPS_PKT_MBOX_PF_VF_PFDATAX(ring);
+	nitrox_write_csr(ndev, reg_addr, value);
+}
+
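+/* build the PF's reply to a VF request and post it in the PF->VF mailbox */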
+static void pf2vf_send_response(struct nitrox_device *ndev,
+				struct nitrox_vfdev *vfdev)
+{
+	union mbox_msg msg;
+
+	msg.value = vfdev->msg.value;
+
+	switch (vfdev->msg.opcode) {
+	case MSG_OP_VF_MODE:
+		msg.data = ndev->mode;
+		break;
+	case MSG_OP_VF_UP:
+		vfdev->nr_queues = vfdev->msg.data;
+		atomic_set(&vfdev->state, __NDEV_READY);
+		break;
+	case MSG_OP_CHIPID_VFID:
+		msg.id.chipid = ndev->idx;
+		msg.id.vfid = vfdev->vfno;
+		break;
+	case MSG_OP_VF_DOWN:
+		vfdev->nr_queues = 0;
+		atomic_set(&vfdev->state, __NDEV_NOT_READY);
+		break;
+	default:
+		msg.type = MBX_MSG_TYPE_NOP;
+		break;
+	}
+
+	if (msg.type == MBX_MSG_TYPE_NOP)
+		return;
+
+	/* send ACK to VF */
+	msg.type = MBX_MSG_TYPE_ACK;
+	pf2vf_write_mbox(ndev, msg.value, vfdev->ring);
+
+	vfdev->msg.value = 0;
+	atomic64_inc(&vfdev->mbx_resp);
+}
+
+static void pf2vf_resp_handler(struct work_struct *work)
+{
+	struct pf2vf_work *pf2vf_resp = container_of(work, struct pf2vf_work,
+						     pf2vf_resp);
+	struct nitrox_vfdev *vfdev = pf2vf_resp->vfdev;
+	struct nitrox_device *ndev = pf2vf_resp->ndev;
+
+	switch (vfdev->msg.type) {
+	case MBX_MSG_TYPE_REQ:
+		/* process the request from VF */
+		pf2vf_send_response(ndev, vfdev);
+		break;
+	case MBX_MSG_TYPE_ACK:
+	case MBX_MSG_TYPE_NACK:
+		break;
+	}
+
+	kfree(pf2vf_resp);
+}
+
+void nitrox_pf2vf_mbox_handler(struct nitrox_device *ndev)
+{
+	struct nitrox_vfdev *vfdev;
+	struct pf2vf_work *pfwork;
+	u64 value, reg_addr;
+	u32 i;
+	int vfno;
+
+	/* loop for VF(0..63) */
+	reg_addr = NPS_PKT_MBOX_INT_LO;
+	value = nitrox_read_csr(ndev, reg_addr);
+	for_each_set_bit(i, (const unsigned long *)&value, BITS_PER_LONG) {
+		/* get the vfno from ring */
+		vfno = RING_TO_VFNO(i, ndev->iov.max_vf_queues);
+		vfdev = ndev->iov.vfdev + vfno;
+		vfdev->ring = i;
+		/* fill the vf mailbox data */
+		vfdev->msg.value = pf2vf_read_mbox(ndev, vfdev->ring);
+		pfwork = kzalloc(sizeof(*pfwork), GFP_ATOMIC);
+		if (!pfwork)
+			continue;
+
+		pfwork->vfdev = vfdev;
+		pfwork->ndev = ndev;
+		INIT_WORK(&pfwork->pf2vf_resp, pf2vf_resp_handler);
+		queue_work(ndev->iov.pf2vf_wq, &pfwork->pf2vf_resp);
+		/* clear the corresponding vf bit */
+		nitrox_write_csr(ndev, reg_addr, BIT_ULL(i));
+	}
+
+	/* loop for VF(64..127) */
+	reg_addr = NPS_PKT_MBOX_INT_HI;
+	value = nitrox_read_csr(ndev, reg_addr);
+	for_each_set_bit(i, (const unsigned long *)&value, BITS_PER_LONG) {
+		/* get the vfno from ring */
+		vfno = RING_TO_VFNO(i + 64, ndev->iov.max_vf_queues);
+		vfdev = ndev->iov.vfdev + vfno;
+		vfdev->ring = (i + 64);
+		/* fill the vf mailbox data */
+		vfdev->msg.value = pf2vf_read_mbox(ndev, vfdev->ring);
+
+		pfwork = kzalloc(sizeof(*pfwork), GFP_ATOMIC);
+		if (!pfwork)
+			continue;
+
+		pfwork->vfdev = vfdev;
+		pfwork->ndev = ndev;
+		INIT_WORK(&pfwork->pf2vf_resp, pf2vf_resp_handler);
+		queue_work(ndev->iov.pf2vf_wq, &pfwork->pf2vf_resp);
+		/* clear the corresponding vf bit */
+		nitrox_write_csr(ndev, reg_addr, BIT_ULL(i));
+	}
+}
+
+int nitrox_mbox_init(struct nitrox_device *ndev)
+{
+	struct nitrox_vfdev *vfdev;
+	int i;
+
+	ndev->iov.vfdev = kcalloc(ndev->iov.num_vfs,
+				  sizeof(struct nitrox_vfdev), GFP_KERNEL);
+	if (!ndev->iov.vfdev)
+		return -ENOMEM;
+
+	for (i = 0; i < ndev->iov.num_vfs; i++) {
+		vfdev = ndev->iov.vfdev + i;
+		vfdev->vfno = i;
+	}
+
+	/* allocate pf2vf response workqueue */
+	ndev->iov.pf2vf_wq = alloc_workqueue("nitrox_pf2vf", 0, 0);
+	if (!ndev->iov.pf2vf_wq) {
+		kfree(ndev->iov.vfdev);
+		return -ENOMEM;
+	}
+	/* enable pf2vf mailbox interrupts */
+	enable_pf2vf_mbox_interrupts(ndev);
+
+	return 0;
+}
+
+void nitrox_mbox_cleanup(struct nitrox_device *ndev)
+{
+	/* disable pf2vf mailbox interrupts */
+	disable_pf2vf_mbox_interrupts(ndev);
+	/* destroy workqueue */
+	if (ndev->iov.pf2vf_wq)
+		destroy_workqueue(ndev->iov.pf2vf_wq);
+
+	kfree(ndev->iov.vfdev);
+	ndev->iov.pf2vf_wq = NULL;
+	ndev->iov.vfdev = NULL;
+}
diff --git a/drivers/crypto/cavium/nitrox/nitrox_mbx.h b/drivers/crypto/cavium/nitrox/nitrox_mbx.h
new file mode 100644
index 0000000000000000000000000000000000000000..5008399775a938d6ae11ea5d0d6bca32a8d3b7c8
--- /dev/null
+++ b/drivers/crypto/cavium/nitrox/nitrox_mbx.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NITROX_MBX_H
+#define __NITROX_MBX_H
+
+int nitrox_mbox_init(struct nitrox_device *ndev);
+void nitrox_mbox_cleanup(struct nitrox_device *ndev);
+void nitrox_pf2vf_mbox_handler(struct nitrox_device *ndev);
+
+#endif /* __NITROX_MBX_H */
diff --git a/drivers/crypto/cavium/nitrox/nitrox_req.h b/drivers/crypto/cavium/nitrox/nitrox_req.h
index d091b6f5f5dd697d56ec01e1afd2449d5d03127e..76c0f0be7233693f173540dc0609d0067c74cd6d 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_req.h
+++ b/drivers/crypto/cavium/nitrox/nitrox_req.h
@@ -7,6 +7,9 @@
 
 #include "nitrox_dev.h"
 
+#define PENDING_SIG	0xFFFFFFFFFFFFFFFFUL
+#define PRIO 4001
+
 /**
  * struct gphdr - General purpose Header
  * @param0: first parameter.
@@ -46,13 +49,6 @@ union se_req_ctrl {
 	} s;
 };
 
-struct nitrox_sglist {
-	u16 len;
-	u16 raz0;
-	u32 raz1;
-	dma_addr_t dma;
-};
-
 #define MAX_IV_LEN 16
 
 /**
@@ -62,8 +58,10 @@ struct nitrox_sglist {
  * @ctx_handle: Crypto context handle.
  * @gph: GP Header
  * @ctrl: Request Information.
- * @in: Input sglist
- * @out: Output sglist
+ * @orh: ORH address
+ * @comp: completion address
+ * @src: Input sglist
+ * @dst: Output sglist
  */
 struct se_crypto_request {
 	u8 opcode;
@@ -73,9 +71,8 @@ struct se_crypto_request {
 
 	struct gphdr gph;
 	union se_req_ctrl ctrl;
-
-	u8 iv[MAX_IV_LEN];
-	u16 ivsize;
+	u64 *orh;
+	u64 *comp;
 
 	struct scatterlist *src;
 	struct scatterlist *dst;
@@ -110,6 +107,18 @@ enum flexi_cipher {
 	CIPHER_INVALID
 };
 
+enum flexi_auth {
+	AUTH_NULL = 0,
+	AUTH_MD5,
+	AUTH_SHA1,
+	AUTH_SHA2_SHA224,
+	AUTH_SHA2_SHA256,
+	AUTH_SHA2_SHA384,
+	AUTH_SHA2_SHA512,
+	AUTH_GMAC,
+	AUTH_INVALID
+};
+
 /**
  * struct crypto_keys - Crypto keys
  * @key: Encryption key or KEY1 for AES-XTS
@@ -136,6 +145,32 @@ struct auth_keys {
 	u8 opad[64];
 };
 
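+/* flexi crypto context flags; the leading 64-bit word of struct flexi_crypto_context */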
+union fc_ctx_flags {
+	__be64 f;
+	struct {
+#if defined(__BIG_ENDIAN_BITFIELD)
+		u64 cipher_type	: 4;
+		u64 reserved_59	: 1;
+		u64 aes_keylen : 2;
+		u64 iv_source : 1;
+		u64 hash_type : 4;
+		u64 reserved_49_51 : 3;
+		u64 auth_input_type: 1;
+		u64 mac_len : 8;
+		u64 reserved_0_39 : 40;
+#else
+		u64 reserved_0_39 : 40;
+		u64 mac_len : 8;
+		u64 auth_input_type: 1;
+		u64 reserved_49_51 : 3;
+		u64 hash_type : 4;
+		u64 iv_source : 1;
+		u64 aes_keylen : 2;
+		u64 reserved_59	: 1;
+		u64 cipher_type	: 4;
+#endif
+	} w0;
+};
+
 /**
  * struct flexi_crypto_context - Crypto context
  * @cipher_type: Encryption cipher type
@@ -150,49 +185,30 @@ struct auth_keys {
  * @auth: Authentication keys
  */
 struct flexi_crypto_context {
-	union {
-		__be64 flags;
-		struct {
-#if defined(__BIG_ENDIAN_BITFIELD)
-			u64 cipher_type	: 4;
-			u64 reserved_59	: 1;
-			u64 aes_keylen : 2;
-			u64 iv_source : 1;
-			u64 hash_type : 4;
-			u64 reserved_49_51 : 3;
-			u64 auth_input_type: 1;
-			u64 mac_len : 8;
-			u64 reserved_0_39 : 40;
-#else
-			u64 reserved_0_39 : 40;
-			u64 mac_len : 8;
-			u64 auth_input_type: 1;
-			u64 reserved_49_51 : 3;
-			u64 hash_type : 4;
-			u64 iv_source : 1;
-			u64 aes_keylen : 2;
-			u64 reserved_59	: 1;
-			u64 cipher_type	: 4;
-#endif
-		} w0;
-	};
-
+	union fc_ctx_flags flags;
 	struct crypto_keys crypto;
 	struct auth_keys auth;
 };
 
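+/* bookkeeping for a crypto context carved out of the device DMA pool */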
+struct crypto_ctx_hdr {
+	struct dma_pool *pool;
+	dma_addr_t dma;
+	void *vaddr;
+};
+
 struct nitrox_crypto_ctx {
 	struct nitrox_device *ndev;
 	union {
 		u64 ctx_handle;
 		struct flexi_crypto_context *fctx;
 	} u;
+	struct crypto_ctx_hdr *chdr;
 };
 
 struct nitrox_kcrypt_request {
 	struct se_crypto_request creq;
-	struct nitrox_crypto_ctx *nctx;
-	struct skcipher_request *skreq;
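+	/*
+	 * src: IV followed by the source SG table
+	 * dst: ORH and completion words followed by the destination SG table
+	 */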
+	u8 *src;
+	u8 *dst;
 };
 
 /**
@@ -369,26 +385,19 @@ struct nitrox_sgcomp {
 
 /*
  * strutct nitrox_sgtable - SG list information
- * @map_cnt: Number of buffers mapped
- * @nr_comp: Number of sglist components
+ * @sgmap_cnt: Number of buffers mapped
  * @total_bytes: Total bytes in sglist.
- * @len: Total sglist components length.
- * @dma: DMA address of sglist component.
- * @dir: DMA direction.
- * @buf: crypto request buffer.
- * @sglist: SG list of input/output buffers.
+ * @sgcomp_len: Total sglist components length.
+ * @sgcomp_dma: DMA address of sglist component.
+ * @sg: input/output scatterlist from the crypto request.
  * @sgcomp: sglist component for NITROX.
  */
 struct nitrox_sgtable {
-	u8 map_bufs_cnt;
-	u8 nr_sgcomp;
+	u8 sgmap_cnt;
 	u16 total_bytes;
-	u32 len;
-	dma_addr_t dma;
-	enum dma_data_direction dir;
-
-	struct scatterlist *buf;
-	struct nitrox_sglist *sglist;
+	u32 sgcomp_len;
+	dma_addr_t sgcomp_dma;
+	struct scatterlist *sg;
 	struct nitrox_sgcomp *sgcomp;
 };
 
@@ -398,13 +407,11 @@ struct nitrox_sgtable {
 #define COMP_HLEN	8
 
 struct resp_hdr {
-	u64 orh;
-	dma_addr_t orh_dma;
-	u64 completion;
-	dma_addr_t completion_dma;
+	u64 *orh;
+	u64 *completion;
 };
 
-typedef void (*completion_t)(struct skcipher_request *skreq, int err);
+typedef void (*completion_t)(void *arg, int err);
 
 /**
  * struct nitrox_softreq - Represents the NIROX Request.
@@ -427,7 +434,6 @@ struct nitrox_softreq {
 	u32 flags;
 	gfp_t gfp;
 	atomic_t status;
-	bool inplace;
 
 	struct nitrox_device *ndev;
 	struct nitrox_cmdq *cmdq;
@@ -440,7 +446,201 @@ struct nitrox_softreq {
 	unsigned long tstamp;
 
 	completion_t callback;
-	struct skcipher_request *skreq;
+	void *cb_arg;
 };
 
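+/* map an AES key length in bytes to the 2-bit aes_keylen encoding in fc_ctx_flags */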
+static inline int flexi_aes_keylen(int keylen)
+{
+	int aes_keylen;
+
+	switch (keylen) {
+	case AES_KEYSIZE_128:
+		aes_keylen = 1;
+		break;
+	case AES_KEYSIZE_192:
+		aes_keylen = 2;
+		break;
+	case AES_KEYSIZE_256:
+		aes_keylen = 3;
+		break;
+	default:
+		aes_keylen = -EINVAL;
+		break;
+	}
+	return aes_keylen;
+}
+
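+/* allocate a request buffer with room for @nents SG entries plus @extralen extra bytes */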
+static inline void *alloc_req_buf(int nents, int extralen, gfp_t gfp)
+{
+	size_t size;
+
+	size = sizeof(struct scatterlist) * nents;
+	size += extralen;
+
+	return kzalloc(size, gfp);
+}
+
+/**
+ * create_single_sg - Point SG entry to the data
+ * @sg:		Destination SG list
+ * @buf:	Data
+ * @buflen:	Data length
+ *
+ * Returns next free entry in the destination SG list
+ */
+static inline struct scatterlist *create_single_sg(struct scatterlist *sg,
+						   void *buf, int buflen)
+{
+	sg_set_buf(sg, buf, buflen);
+	sg++;
+	return sg;
+}
+
+/**
+ * create_multi_sg - Create multiple sg entries with buflen data length from
+ *		     source sglist
+ * @to_sg:	Destination SG list
+ * @from_sg:	Source SG list
+ * @buflen:	Data length
+ *
+ * Returns next free entry in the destination SG list
+ */
+static inline struct scatterlist *create_multi_sg(struct scatterlist *to_sg,
+						  struct scatterlist *from_sg,
+						  int buflen)
+{
+	struct scatterlist *sg = to_sg;
+	unsigned int sglen;
+
+	for (; buflen; buflen -= sglen) {
+		sglen = from_sg->length;
+		if (sglen > buflen)
+			sglen = buflen;
+
+		sg_set_buf(sg, sg_virt(from_sg), sglen);
+		from_sg = sg_next(from_sg);
+		sg++;
+	}
+
+	return sg;
+}
+
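+/* the ORH and completion words are seeded with PENDING_SIG before posting a request */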
+static inline void set_orh_value(u64 *orh)
+{
+	WRITE_ONCE(*orh, PENDING_SIG);
+}
+
+static inline void set_comp_value(u64 *comp)
+{
+	WRITE_ONCE(*comp, PENDING_SIG);
+}
+
+static inline int alloc_src_req_buf(struct nitrox_kcrypt_request *nkreq,
+				    int nents, int ivsize)
+{
+	struct se_crypto_request *creq = &nkreq->creq;
+
+	nkreq->src = alloc_req_buf(nents, ivsize, creq->gfp);
+	if (!nkreq->src)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static inline void nitrox_creq_copy_iv(char *dst, char *src, int size)
+{
+	memcpy(dst, src, size);
+}
+
+static inline struct scatterlist *nitrox_creq_src_sg(char *iv, int ivsize)
+{
+	return (struct scatterlist *)(iv + ivsize);
+}
+
+static inline void nitrox_creq_set_src_sg(struct nitrox_kcrypt_request *nkreq,
+					  int nents, int ivsize,
+					  struct scatterlist *src, int buflen)
+{
+	char *iv = nkreq->src;
+	struct scatterlist *sg;
+	struct se_crypto_request *creq = &nkreq->creq;
+
+	creq->src = nitrox_creq_src_sg(iv, ivsize);
+	sg = creq->src;
+	sg_init_table(sg, nents);
+
+	/* Input format:
+	 * +----+----------------+
+	 * | IV | SRC sg entries |
+	 * +----+----------------+
+	 */
+
+	/* IV */
+	sg = create_single_sg(sg, iv, ivsize);
+	/* SRC entries */
+	create_multi_sg(sg, src, buflen);
+}
+
+static inline int alloc_dst_req_buf(struct nitrox_kcrypt_request *nkreq,
+				    int nents)
+{
+	int extralen = ORH_HLEN + COMP_HLEN;
+	struct se_crypto_request *creq = &nkreq->creq;
+
+	nkreq->dst = alloc_req_buf(nents, extralen, creq->gfp);
+	if (!nkreq->dst)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static inline void nitrox_creq_set_orh(struct nitrox_kcrypt_request *nkreq)
+{
+	struct se_crypto_request *creq = &nkreq->creq;
+
+	creq->orh = (u64 *)(nkreq->dst);
+	set_orh_value(creq->orh);
+}
+
+static inline void nitrox_creq_set_comp(struct nitrox_kcrypt_request *nkreq)
+{
+	struct se_crypto_request *creq = &nkreq->creq;
+
+	creq->comp = (u64 *)(nkreq->dst + ORH_HLEN);
+	set_comp_value(creq->comp);
+}
+
+static inline struct scatterlist *nitrox_creq_dst_sg(char *dst)
+{
+	return (struct scatterlist *)(dst + ORH_HLEN + COMP_HLEN);
+}
+
+static inline void nitrox_creq_set_dst_sg(struct nitrox_kcrypt_request *nkreq,
+					  int nents, int ivsize,
+					  struct scatterlist *dst, int buflen)
+{
+	struct se_crypto_request *creq = &nkreq->creq;
+	struct scatterlist *sg;
+	char *iv = nkreq->src;
+
+	creq->dst = nitrox_creq_dst_sg(nkreq->dst);
+	sg = creq->dst;
+	sg_init_table(sg, nents);
+
+	/* Output format:
+	 * +-----+----+----------------+-----------------+
+	 * | ORH | IV | DST sg entries | COMPLETION Bytes|
+	 * +-----+----+----------------+-----------------+
+	 */
+
+	/* ORH */
+	sg = create_single_sg(sg, creq->orh, ORH_HLEN);
+	/* IV */
+	sg = create_single_sg(sg, iv, ivsize);
+	/* DST entries */
+	sg = create_multi_sg(sg, dst, buflen);
+	/* COMPLETION Bytes */
+	create_single_sg(sg, creq->comp, COMP_HLEN);
+}
+
 #endif /* __NITROX_REQ_H */
diff --git a/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c b/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c
index 3987cd84c0336d52e4444b86fe4e860642b0a838..e34e4df8fd246275201dabdc53dc7f8570a6434f 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_reqmgr.c
@@ -13,7 +13,6 @@
 #define FDATA_SIZE 32
 /* Base destination port for the solicited requests */
 #define SOLICIT_BASE_DPORT 256
-#define PENDING_SIG	0xFFFFFFFFFFFFFFFFUL
 
 #define REQ_NOT_POSTED 1
 #define REQ_BACKLOG    2
@@ -52,58 +51,26 @@ static inline int incr_index(int index, int count, int max)
 	return index;
 }
 
-/**
- * dma_free_sglist - unmap and free the sg lists.
- * @ndev: N5 device
- * @sgtbl: SG table
- */
 static void softreq_unmap_sgbufs(struct nitrox_softreq *sr)
 {
 	struct nitrox_device *ndev = sr->ndev;
 	struct device *dev = DEV(ndev);
-	struct nitrox_sglist *sglist;
-
-	/* unmap in sgbuf */
-	sglist = sr->in.sglist;
-	if (!sglist)
-		goto out_unmap;
-
-	/* unmap iv */
-	dma_unmap_single(dev, sglist->dma, sglist->len, DMA_BIDIRECTIONAL);
-	/* unmpa src sglist */
-	dma_unmap_sg(dev, sr->in.buf, (sr->in.map_bufs_cnt - 1), sr->in.dir);
-	/* unamp gather component */
-	dma_unmap_single(dev, sr->in.dma, sr->in.len, DMA_TO_DEVICE);
-	kfree(sr->in.sglist);
+
+	dma_unmap_sg(dev, sr->in.sg, sr->in.sgmap_cnt, DMA_BIDIRECTIONAL);
+	dma_unmap_single(dev, sr->in.sgcomp_dma, sr->in.sgcomp_len,
+			 DMA_TO_DEVICE);
 	kfree(sr->in.sgcomp);
-	sr->in.sglist = NULL;
-	sr->in.buf = NULL;
-	sr->in.map_bufs_cnt = 0;
-
-out_unmap:
-	/* unmap out sgbuf */
-	sglist = sr->out.sglist;
-	if (!sglist)
-		return;
-
-	/* unmap orh */
-	dma_unmap_single(dev, sr->resp.orh_dma, ORH_HLEN, sr->out.dir);
-
-	/* unmap dst sglist */
-	if (!sr->inplace) {
-		dma_unmap_sg(dev, sr->out.buf, (sr->out.map_bufs_cnt - 3),
-			     sr->out.dir);
-	}
-	/* unmap completion */
-	dma_unmap_single(dev, sr->resp.completion_dma, COMP_HLEN, sr->out.dir);
+	sr->in.sg = NULL;
+	sr->in.sgmap_cnt = 0;
 
-	/* unmap scatter component */
-	dma_unmap_single(dev, sr->out.dma, sr->out.len, DMA_TO_DEVICE);
-	kfree(sr->out.sglist);
+	dma_unmap_sg(dev, sr->out.sg, sr->out.sgmap_cnt,
+		     DMA_BIDIRECTIONAL);
+	dma_unmap_single(dev, sr->out.sgcomp_dma, sr->out.sgcomp_len,
+			 DMA_TO_DEVICE);
 	kfree(sr->out.sgcomp);
-	sr->out.sglist = NULL;
-	sr->out.buf = NULL;
-	sr->out.map_bufs_cnt = 0;
+	sr->out.sg = NULL;
+	sr->out.sgmap_cnt = 0;
 }
 
 static void softreq_destroy(struct nitrox_softreq *sr)
@@ -116,7 +83,7 @@ static void softreq_destroy(struct nitrox_softreq *sr)
  * create_sg_component - create SG componets for N5 device.
  * @sr: Request structure
  * @sgtbl: SG table
- * @nr_comp: total number of components required
+ * @map_nents: number of dma mapped entries
  *
  * Component structure
  *
@@ -140,7 +107,7 @@ static int create_sg_component(struct nitrox_softreq *sr,
 {
 	struct nitrox_device *ndev = sr->ndev;
 	struct nitrox_sgcomp *sgcomp;
-	struct nitrox_sglist *sglist;
+	struct scatterlist *sg;
 	dma_addr_t dma;
 	size_t sz_comp;
 	int i, j, nr_sgcomp;
@@ -154,17 +121,15 @@ static int create_sg_component(struct nitrox_softreq *sr,
 		return -ENOMEM;
 
 	sgtbl->sgcomp = sgcomp;
-	sgtbl->nr_sgcomp = nr_sgcomp;
 
-	sglist = sgtbl->sglist;
+	sg = sgtbl->sg;
 	/* populate device sg component */
 	for (i = 0; i < nr_sgcomp; i++) {
-		for (j = 0; j < 4; j++) {
-			sgcomp->len[j] = cpu_to_be16(sglist->len);
-			sgcomp->dma[j] = cpu_to_be64(sglist->dma);
-			sglist++;
+		for (j = 0; j < 4 && sg; j++) {
+			sgcomp[i].len[j] = cpu_to_be16(sg_dma_len(sg));
+			sgcomp[i].dma[j] = cpu_to_be64(sg_dma_address(sg));
+			sg = sg_next(sg);
 		}
-		sgcomp++;
 	}
 	/* map the device sg component */
 	dma = dma_map_single(DEV(ndev), sgtbl->sgcomp, sz_comp, DMA_TO_DEVICE);
@@ -174,8 +139,8 @@ static int create_sg_component(struct nitrox_softreq *sr,
 		return -ENOMEM;
 	}
 
-	sgtbl->dma = dma;
-	sgtbl->len = sz_comp;
+	sgtbl->sgcomp_dma = dma;
+	sgtbl->sgcomp_len = sz_comp;
 
 	return 0;
 }
@@ -193,66 +158,27 @@ static int dma_map_inbufs(struct nitrox_softreq *sr,
 {
 	struct device *dev = DEV(sr->ndev);
 	struct scatterlist *sg = req->src;
-	struct nitrox_sglist *glist;
 	int i, nents, ret = 0;
-	dma_addr_t dma;
-	size_t sz;
 
-	nents = sg_nents(req->src);
+	nents = dma_map_sg(dev, req->src, sg_nents(req->src),
+			   DMA_BIDIRECTIONAL);
+	if (!nents)
+		return -EINVAL;
 
-	/* creater gather list IV and src entries */
-	sz = roundup((1 + nents), 4) * sizeof(*glist);
-	glist = kzalloc(sz, sr->gfp);
-	if (!glist)
-		return -ENOMEM;
+	for_each_sg(req->src, sg, nents, i)
+		sr->in.total_bytes += sg_dma_len(sg);
 
-	sr->in.sglist = glist;
-	/* map IV */
-	dma = dma_map_single(dev, &req->iv, req->ivsize, DMA_BIDIRECTIONAL);
-	if (dma_mapping_error(dev, dma)) {
-		ret = -EINVAL;
-		goto iv_map_err;
-	}
-
-	sr->in.dir = (req->src == req->dst) ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE;
-	/* map src entries */
-	nents = dma_map_sg(dev, req->src, nents, sr->in.dir);
-	if (!nents) {
-		ret = -EINVAL;
-		goto src_map_err;
-	}
-	sr->in.buf = req->src;
-
-	/* store the mappings */
-	glist->len = req->ivsize;
-	glist->dma = dma;
-	glist++;
-	sr->in.total_bytes += req->ivsize;
-
-	for_each_sg(req->src, sg, nents, i) {
-		glist->len = sg_dma_len(sg);
-		glist->dma = sg_dma_address(sg);
-		sr->in.total_bytes += glist->len;
-		glist++;
-	}
-	/* roundup map count to align with entires in sg component */
-	sr->in.map_bufs_cnt = (1 + nents);
-
-	/* create NITROX gather component */
-	ret = create_sg_component(sr, &sr->in, sr->in.map_bufs_cnt);
+	sr->in.sg = req->src;
+	sr->in.sgmap_cnt = nents;
+	ret = create_sg_component(sr, &sr->in, sr->in.sgmap_cnt);
 	if (ret)
 		goto incomp_err;
 
 	return 0;
 
 incomp_err:
-	dma_unmap_sg(dev, req->src, nents, sr->in.dir);
-	sr->in.map_bufs_cnt = 0;
-src_map_err:
-	dma_unmap_single(dev, dma, req->ivsize, DMA_BIDIRECTIONAL);
-iv_map_err:
-	kfree(sr->in.sglist);
-	sr->in.sglist = NULL;
+	dma_unmap_sg(dev, req->src, nents, DMA_BIDIRECTIONAL);
+	sr->in.sgmap_cnt = 0;
 	return ret;
 }
 
@@ -260,104 +186,25 @@ static int dma_map_outbufs(struct nitrox_softreq *sr,
 			   struct se_crypto_request *req)
 {
 	struct device *dev = DEV(sr->ndev);
-	struct nitrox_sglist *glist = sr->in.sglist;
-	struct nitrox_sglist *slist;
-	struct scatterlist *sg;
-	int i, nents, map_bufs_cnt, ret = 0;
-	size_t sz;
-
-	nents = sg_nents(req->dst);
-
-	/* create scatter list ORH, IV, dst entries and Completion header */
-	sz = roundup((3 + nents), 4) * sizeof(*slist);
-	slist = kzalloc(sz, sr->gfp);
-	if (!slist)
-		return -ENOMEM;
-
-	sr->out.sglist = slist;
-	sr->out.dir = DMA_BIDIRECTIONAL;
-	/* map ORH */
-	sr->resp.orh_dma = dma_map_single(dev, &sr->resp.orh, ORH_HLEN,
-					  sr->out.dir);
-	if (dma_mapping_error(dev, sr->resp.orh_dma)) {
-		ret = -EINVAL;
-		goto orh_map_err;
-	}
+	int nents, ret = 0;
 
-	/* map completion */
-	sr->resp.completion_dma = dma_map_single(dev, &sr->resp.completion,
-						 COMP_HLEN, sr->out.dir);
-	if (dma_mapping_error(dev, sr->resp.completion_dma)) {
-		ret = -EINVAL;
-		goto compl_map_err;
-	}
+	nents = dma_map_sg(dev, req->dst, sg_nents(req->dst),
+			   DMA_BIDIRECTIONAL);
+	if (!nents)
+		return -EINVAL;
 
-	sr->inplace = (req->src == req->dst) ? true : false;
-	/* out place */
-	if (!sr->inplace) {
-		nents = dma_map_sg(dev, req->dst, nents, sr->out.dir);
-		if (!nents) {
-			ret = -EINVAL;
-			goto dst_map_err;
-		}
-	}
-	sr->out.buf = req->dst;
-
-	/* store the mappings */
-	/* orh */
-	slist->len = ORH_HLEN;
-	slist->dma = sr->resp.orh_dma;
-	slist++;
-
-	/* copy the glist mappings */
-	if (sr->inplace) {
-		nents = sr->in.map_bufs_cnt - 1;
-		map_bufs_cnt = sr->in.map_bufs_cnt;
-		while (map_bufs_cnt--) {
-			slist->len = glist->len;
-			slist->dma = glist->dma;
-			slist++;
-			glist++;
-		}
-	} else {
-		/* copy iv mapping */
-		slist->len = glist->len;
-		slist->dma = glist->dma;
-		slist++;
-		/* copy remaining maps */
-		for_each_sg(req->dst, sg, nents, i) {
-			slist->len = sg_dma_len(sg);
-			slist->dma = sg_dma_address(sg);
-			slist++;
-		}
-	}
-
-	/* completion */
-	slist->len = COMP_HLEN;
-	slist->dma = sr->resp.completion_dma;
-
-	sr->out.map_bufs_cnt = (3 + nents);
-
-	ret = create_sg_component(sr, &sr->out, sr->out.map_bufs_cnt);
+	sr->out.sg = req->dst;
+	sr->out.sgmap_cnt = nents;
+	ret = create_sg_component(sr, &sr->out, sr->out.sgmap_cnt);
 	if (ret)
 		goto outcomp_map_err;
 
 	return 0;
 
 outcomp_map_err:
-	if (!sr->inplace)
-		dma_unmap_sg(dev, req->dst, nents, sr->out.dir);
-	sr->out.map_bufs_cnt = 0;
-	sr->out.buf = NULL;
-dst_map_err:
-	dma_unmap_single(dev, sr->resp.completion_dma, COMP_HLEN, sr->out.dir);
-	sr->resp.completion_dma = 0;
-compl_map_err:
-	dma_unmap_single(dev, sr->resp.orh_dma, ORH_HLEN, sr->out.dir);
-	sr->resp.orh_dma = 0;
-orh_map_err:
-	kfree(sr->out.sglist);
-	sr->out.sglist = NULL;
+	dma_unmap_sg(dev, req->dst, nents, DMA_BIDIRECTIONAL);
+	sr->out.sgmap_cnt = 0;
+	sr->out.sg = NULL;
 	return ret;
 }
 
@@ -422,6 +269,8 @@ static inline bool cmdq_full(struct nitrox_cmdq *cmdq, int qlen)
 		smp_mb__after_atomic();
 		return true;
 	}
+	/* sync with other cpus */
+	smp_mb__after_atomic();
 	return false;
 }
 
@@ -477,8 +326,6 @@ static int post_backlog_cmds(struct nitrox_cmdq *cmdq)
 	spin_lock_bh(&cmdq->backlog_qlock);
 
 	list_for_each_entry_safe(sr, tmp, &cmdq->backlog_head, backlog) {
-		struct skcipher_request *skreq;
-
 		/* submit until space available */
 		if (unlikely(cmdq_full(cmdq, ndev->qlen))) {
 			ret = -ENOSPC;
@@ -490,12 +337,8 @@ static int post_backlog_cmds(struct nitrox_cmdq *cmdq)
 		/* sync with other cpus */
 		smp_mb__after_atomic();
 
-		skreq = sr->skreq;
 		/* post the command */
 		post_se_instr(sr, cmdq);
-
-		/* backlog requests are posted, wakeup with -EINPROGRESS */
-		skcipher_request_complete(skreq, -EINPROGRESS);
 	}
 	spin_unlock_bh(&cmdq->backlog_qlock);
 
@@ -518,7 +361,7 @@ static int nitrox_enqueue_request(struct nitrox_softreq *sr)
 		}
 		/* add to backlog list */
 		backlog_list_add(sr, cmdq);
-		return -EBUSY;
+		return -EINPROGRESS;
 	}
 	post_se_instr(sr, cmdq);
 
@@ -535,7 +378,7 @@ static int nitrox_enqueue_request(struct nitrox_softreq *sr)
 int nitrox_process_se_request(struct nitrox_device *ndev,
 			      struct se_crypto_request *req,
 			      completion_t callback,
-			      struct skcipher_request *skreq)
+			      void *cb_arg)
 {
 	struct nitrox_softreq *sr;
 	dma_addr_t ctx_handle = 0;
@@ -552,12 +395,12 @@ int nitrox_process_se_request(struct nitrox_device *ndev,
 	sr->flags = req->flags;
 	sr->gfp = req->gfp;
 	sr->callback = callback;
-	sr->skreq = skreq;
+	sr->cb_arg = cb_arg;
 
 	atomic_set(&sr->status, REQ_NOT_POSTED);
 
-	WRITE_ONCE(sr->resp.orh, PENDING_SIG);
-	WRITE_ONCE(sr->resp.completion, PENDING_SIG);
+	sr->resp.orh = req->orh;
+	sr->resp.completion = req->comp;
 
 	ret = softreq_map_iobuf(sr, req);
 	if (ret) {
@@ -598,13 +441,13 @@ int nitrox_process_se_request(struct nitrox_device *ndev,
 
 	/* fill the packet instruction */
 	/* word 0 */
-	sr->instr.dptr0 = cpu_to_be64(sr->in.dma);
+	sr->instr.dptr0 = cpu_to_be64(sr->in.sgcomp_dma);
 
 	/* word 1 */
 	sr->instr.ih.value = 0;
 	sr->instr.ih.s.g = 1;
-	sr->instr.ih.s.gsz = sr->in.map_bufs_cnt;
-	sr->instr.ih.s.ssz = sr->out.map_bufs_cnt;
+	sr->instr.ih.s.gsz = sr->in.sgmap_cnt;
+	sr->instr.ih.s.ssz = sr->out.sgmap_cnt;
 	sr->instr.ih.s.fsz = FDATA_SIZE + sizeof(struct gphdr);
 	sr->instr.ih.s.tlen = sr->instr.ih.s.fsz + sr->in.total_bytes;
 	sr->instr.ih.value = cpu_to_be64(sr->instr.ih.value);
@@ -626,11 +469,11 @@ int nitrox_process_se_request(struct nitrox_device *ndev,
 
 	/* word 4 */
 	sr->instr.slc.value[0] = 0;
-	sr->instr.slc.s.ssz = sr->out.map_bufs_cnt;
+	sr->instr.slc.s.ssz = sr->out.sgmap_cnt;
 	sr->instr.slc.value[0] = cpu_to_be64(sr->instr.slc.value[0]);
 
 	/* word 5 */
-	sr->instr.slc.s.rptr = cpu_to_be64(sr->out.dma);
+	sr->instr.slc.s.rptr = cpu_to_be64(sr->out.sgcomp_dma);
 
 	/*
 	 * No conversion for front data,
@@ -664,6 +507,24 @@ void backlog_qflush_work(struct work_struct *work)
 	post_backlog_cmds(cmdq);
 }
 
+static bool sr_completed(struct nitrox_softreq *sr)
+{
+	u64 orh = READ_ONCE(*sr->resp.orh);
+	unsigned long timeout = jiffies + msecs_to_jiffies(1);
+
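+	/* a non-zero error code in the ORH means the request already finished */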
+	if ((orh != PENDING_SIG) && (orh & 0xff))
+		return true;
+
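+	/* otherwise poll briefly (up to ~1 ms) for the completion word */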
+	while (READ_ONCE(*sr->resp.completion) == PENDING_SIG) {
+		if (time_after(jiffies, timeout)) {
+			pr_err("comp not done\n");
+			return false;
+		}
+	}
+
+	return true;
+}
+
 /**
 * process_response_list - process completed requests
  * @ndev: N5 device
@@ -675,8 +536,6 @@ static void process_response_list(struct nitrox_cmdq *cmdq)
 {
 	struct nitrox_device *ndev = cmdq->ndev;
 	struct nitrox_softreq *sr;
-	struct skcipher_request *skreq;
 	completion_t callback;
+	void *cb_arg;
 	int req_completed = 0, err = 0, budget;
 
 	/* check all pending requests */
@@ -691,13 +550,13 @@ static void process_response_list(struct nitrox_cmdq *cmdq)
 			break;
 
 		/* check orh and completion bytes updates */
-		if (READ_ONCE(sr->resp.orh) == READ_ONCE(sr->resp.completion)) {
+		if (!sr_completed(sr)) {
 			/* request not completed, check for timeout */
 			if (!cmd_timeout(sr->tstamp, ndev->timeout))
 				break;
 			dev_err_ratelimited(DEV(ndev),
 					    "Request timeout, orh 0x%016llx\n",
-					    READ_ONCE(sr->resp.orh));
+					    READ_ONCE(*sr->resp.orh));
 		}
 		atomic_dec(&cmdq->pending_count);
 		atomic64_inc(&ndev->stats.completed);
@@ -706,15 +565,12 @@ static void process_response_list(struct nitrox_cmdq *cmdq)
 		/* remove from response list */
 		response_list_del(sr, cmdq);
 
+		/* save the callback before softreq_destroy() frees the request */
 		callback = sr->callback;
-		skreq = sr->skreq;
+		cb_arg = sr->cb_arg;
 
 		/* ORH error code */
-		err = READ_ONCE(sr->resp.orh) & 0xff;
+		err = READ_ONCE(*sr->resp.orh) & 0xff;
 		softreq_destroy(sr);
 
 		if (callback)
-			callback(skreq, err);
+			callback(cb_arg, err);
 
 		req_completed++;
 	}
diff --git a/drivers/crypto/cavium/nitrox/nitrox_skcipher.c b/drivers/crypto/cavium/nitrox/nitrox_skcipher.c
new file mode 100644
index 0000000000000000000000000000000000000000..d4935d6cefdd351522e277d8c82ca9071f098471
--- /dev/null
+++ b/drivers/crypto/cavium/nitrox/nitrox_skcipher.c
@@ -0,0 +1,498 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+
+#include <crypto/aes.h>
+#include <crypto/skcipher.h>
+#include <crypto/ctr.h>
+#include <crypto/des.h>
+#include <crypto/xts.h>
+
+#include "nitrox_dev.h"
+#include "nitrox_common.h"
+#include "nitrox_req.h"
+
+struct nitrox_cipher {
+	const char *name;
+	enum flexi_cipher value;
+};
+
+/*
+ * supported cipher list
+ */
+static const struct nitrox_cipher flexi_cipher_table[] = {
+	{ "null",		CIPHER_NULL },
+	{ "cbc(des3_ede)",	CIPHER_3DES_CBC },
+	{ "ecb(des3_ede)",	CIPHER_3DES_ECB },
+	{ "cbc(aes)",		CIPHER_AES_CBC },
+	{ "ecb(aes)",		CIPHER_AES_ECB },
+	{ "cfb(aes)",		CIPHER_AES_CFB },
+	{ "rfc3686(ctr(aes))",	CIPHER_AES_CTR },
+	{ "xts(aes)",		CIPHER_AES_XTS },
+	{ "cts(cbc(aes))",	CIPHER_AES_CBC_CTS },
+	{ NULL,			CIPHER_INVALID }
+};
+
+static enum flexi_cipher flexi_cipher_type(const char *name)
+{
+	const struct nitrox_cipher *cipher = flexi_cipher_table;
+
+	while (cipher->name) {
+		if (!strcmp(cipher->name, name))
+			break;
+		cipher++;
+	}
+	return cipher->value;
+}
+
+static int nitrox_skcipher_init(struct crypto_skcipher *tfm)
+{
+	struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(tfm);
+	struct crypto_ctx_hdr *chdr;
+
+	/* get the first device */
+	nctx->ndev = nitrox_get_first_device();
+	if (!nctx->ndev)
+		return -ENODEV;
+
+	/* allocate nitrox crypto context */
+	chdr = crypto_alloc_context(nctx->ndev);
+	if (!chdr) {
+		nitrox_put_device(nctx->ndev);
+		return -ENOMEM;
+	}
+	nctx->chdr = chdr;
+	nctx->u.ctx_handle = (uintptr_t)((u8 *)chdr->vaddr +
+					 sizeof(struct ctx_hdr));
+	crypto_skcipher_set_reqsize(tfm, crypto_skcipher_reqsize(tfm) +
+				    sizeof(struct nitrox_kcrypt_request));
+	return 0;
+}
+
+static void nitrox_skcipher_exit(struct crypto_skcipher *tfm)
+{
+	struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(tfm);
+
+	/* free the nitrox crypto context */
+	if (nctx->u.ctx_handle) {
+		struct flexi_crypto_context *fctx = nctx->u.fctx;
+
+		memzero_explicit(&fctx->crypto, sizeof(struct crypto_keys));
+		memzero_explicit(&fctx->auth, sizeof(struct auth_keys));
+		crypto_free_context((void *)nctx->chdr);
+	}
+	nitrox_put_device(nctx->ndev);
+
+	nctx->u.ctx_handle = 0;
+	nctx->ndev = NULL;
+}
+
+static inline int nitrox_skcipher_setkey(struct crypto_skcipher *cipher,
+					 int aes_keylen, const u8 *key,
+					 unsigned int keylen)
+{
+	struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher);
+	struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm);
+	struct flexi_crypto_context *fctx;
+	union fc_ctx_flags *flags;
+	enum flexi_cipher cipher_type;
+	const char *name;
+
+	name = crypto_tfm_alg_name(tfm);
+	cipher_type = flexi_cipher_type(name);
+	if (unlikely(cipher_type == CIPHER_INVALID)) {
+		pr_err("unsupported cipher: %s\n", name);
+		return -EINVAL;
+	}
+
+	/* fill crypto context */
+	fctx = nctx->u.fctx;
+	flags = &fctx->flags;
+	flags->f = 0;
+	flags->w0.cipher_type = cipher_type;
+	flags->w0.aes_keylen = aes_keylen;
+	flags->w0.iv_source = IV_FROM_DPTR;
+	flags->f = cpu_to_be64(*(u64 *)&flags->w0);
+	/* copy the key to context */
+	memcpy(fctx->crypto.u.key, key, keylen);
+
+	return 0;
+}
+
+static int nitrox_aes_setkey(struct crypto_skcipher *cipher, const u8 *key,
+			     unsigned int keylen)
+{
+	int aes_keylen;
+
+	aes_keylen = flexi_aes_keylen(keylen);
+	if (aes_keylen < 0) {
+		crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen);
+}
+
+static int alloc_src_sglist(struct skcipher_request *skreq, int ivsize)
+{
+	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
+	int nents = sg_nents(skreq->src) + 1;
+	int ret;
+
+	/* Allocate buffer to hold IV and input scatterlist array */
+	ret = alloc_src_req_buf(nkreq, nents, ivsize);
+	if (ret)
+		return ret;
+
+	nitrox_creq_copy_iv(nkreq->src, skreq->iv, ivsize);
+	nitrox_creq_set_src_sg(nkreq, nents, ivsize, skreq->src,
+			       skreq->cryptlen);
+
+	return 0;
+}
+
+static int alloc_dst_sglist(struct skcipher_request *skreq, int ivsize)
+{
+	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
+	int nents = sg_nents(skreq->dst) + 3;
+	int ret;
+
+	/* Allocate buffer to hold ORH, COMPLETION and output scatterlist
+	 * array
+	 */
+	ret = alloc_dst_req_buf(nkreq, nents);
+	if (ret)
+		return ret;
+
+	nitrox_creq_set_orh(nkreq);
+	nitrox_creq_set_comp(nkreq);
+	nitrox_creq_set_dst_sg(nkreq, nents, ivsize, skreq->dst,
+			       skreq->cryptlen);
+
+	return 0;
+}
+
+static void free_src_sglist(struct skcipher_request *skreq)
+{
+	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
+
+	kfree(nkreq->src);
+}
+
+static void free_dst_sglist(struct skcipher_request *skreq)
+{
+	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
+
+	kfree(nkreq->dst);
+}
+
+static void nitrox_skcipher_callback(void *arg, int err)
+{
+	struct skcipher_request *skreq = arg;
+
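+	/* release the IV/ORH/completion buffers allocated at submit time */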
+	free_src_sglist(skreq);
+	free_dst_sglist(skreq);
+	if (err) {
+		pr_err_ratelimited("request failed, status 0x%x\n", err);
+		err = -EINVAL;
+	}
+
+	skcipher_request_complete(skreq, err);
+}
+
+static int nitrox_skcipher_crypt(struct skcipher_request *skreq, bool enc)
+{
+	struct crypto_skcipher *cipher = crypto_skcipher_reqtfm(skreq);
+	struct nitrox_crypto_ctx *nctx = crypto_skcipher_ctx(cipher);
+	struct nitrox_kcrypt_request *nkreq = skcipher_request_ctx(skreq);
+	int ivsize = crypto_skcipher_ivsize(cipher);
+	struct se_crypto_request *creq;
+	int ret;
+
+	creq = &nkreq->creq;
+	creq->flags = skreq->base.flags;
+	creq->gfp = (skreq->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ?
+		     GFP_KERNEL : GFP_ATOMIC;
+
+	/* fill the request */
+	creq->ctrl.value = 0;
+	creq->opcode = FLEXI_CRYPTO_ENCRYPT_HMAC;
+	creq->ctrl.s.arg = (enc ? ENCRYPT : DECRYPT);
+	/* param0: length of the data to be encrypted */
+	creq->gph.param0 = cpu_to_be16(skreq->cryptlen);
+	creq->gph.param1 = 0;
+	/* param2: encryption data offset */
+	creq->gph.param2 = cpu_to_be16(ivsize);
+	creq->gph.param3 = 0;
+
+	creq->ctx_handle = nctx->u.ctx_handle;
+	creq->ctrl.s.ctxl = sizeof(struct flexi_crypto_context);
+
+	ret = alloc_src_sglist(skreq, ivsize);
+	if (ret)
+		return ret;
+
+	ret = alloc_dst_sglist(skreq, ivsize);
+	if (ret) {
+		free_src_sglist(skreq);
+		return ret;
+	}
+
+	/* send the crypto request */
+	return nitrox_process_se_request(nctx->ndev, creq,
+					 nitrox_skcipher_callback, skreq);
+}
+
+static int nitrox_aes_encrypt(struct skcipher_request *skreq)
+{
+	return nitrox_skcipher_crypt(skreq, true);
+}
+
+static int nitrox_aes_decrypt(struct skcipher_request *skreq)
+{
+	return nitrox_skcipher_crypt(skreq, false);
+}
+
+static int nitrox_3des_setkey(struct crypto_skcipher *cipher,
+			      const u8 *key, unsigned int keylen)
+{
+	if (keylen != DES3_EDE_KEY_SIZE) {
+		crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	return nitrox_skcipher_setkey(cipher, 0, key, keylen);
+}
+
+static int nitrox_3des_encrypt(struct skcipher_request *skreq)
+{
+	return nitrox_skcipher_crypt(skreq, true);
+}
+
+static int nitrox_3des_decrypt(struct skcipher_request *skreq)
+{
+	return nitrox_skcipher_crypt(skreq, false);
+}
+
+static int nitrox_aes_xts_setkey(struct crypto_skcipher *cipher,
+				 const u8 *key, unsigned int keylen)
+{
+	struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher);
+	struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm);
+	struct flexi_crypto_context *fctx;
+	int aes_keylen, ret;
+
+	ret = xts_check_key(tfm, key, keylen);
+	if (ret)
+		return ret;
+
+	keylen /= 2;
+
+	aes_keylen = flexi_aes_keylen(keylen);
+	if (aes_keylen < 0) {
+		crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	fctx = nctx->u.fctx;
+	/* copy KEY2 */
+	memcpy(fctx->auth.u.key2, (key + keylen), keylen);
+
+	return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen);
+}
+
+static int nitrox_aes_ctr_rfc3686_setkey(struct crypto_skcipher *cipher,
+					 const u8 *key, unsigned int keylen)
+{
+	struct crypto_tfm *tfm = crypto_skcipher_tfm(cipher);
+	struct nitrox_crypto_ctx *nctx = crypto_tfm_ctx(tfm);
+	struct flexi_crypto_context *fctx;
+	int aes_keylen;
+
+	if (keylen < CTR_RFC3686_NONCE_SIZE)
+		return -EINVAL;
+
+	fctx = nctx->u.fctx;
+
+	memcpy(fctx->crypto.iv, key + (keylen - CTR_RFC3686_NONCE_SIZE),
+	       CTR_RFC3686_NONCE_SIZE);
+
+	keylen -= CTR_RFC3686_NONCE_SIZE;
+
+	aes_keylen = flexi_aes_keylen(keylen);
+	if (aes_keylen < 0) {
+		crypto_skcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	return nitrox_skcipher_setkey(cipher, aes_keylen, key, keylen);
+}
+
+static struct skcipher_alg nitrox_skciphers[] = { {
+	.base = {
+		.cra_name = "cbc(aes)",
+		.cra_driver_name = "n5_cbc(aes)",
+		.cra_priority = PRIO,
+		.cra_flags = CRYPTO_ALG_ASYNC,
+		.cra_blocksize = AES_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
+		.cra_alignmask = 0,
+		.cra_module = THIS_MODULE,
+	},
+	.min_keysize = AES_MIN_KEY_SIZE,
+	.max_keysize = AES_MAX_KEY_SIZE,
+	.ivsize = AES_BLOCK_SIZE,
+	.setkey = nitrox_aes_setkey,
+	.encrypt = nitrox_aes_encrypt,
+	.decrypt = nitrox_aes_decrypt,
+	.init = nitrox_skcipher_init,
+	.exit = nitrox_skcipher_exit,
+}, {
+	.base = {
+		.cra_name = "ecb(aes)",
+		.cra_driver_name = "n5_ecb(aes)",
+		.cra_priority = PRIO,
+		.cra_flags = CRYPTO_ALG_ASYNC,
+		.cra_blocksize = AES_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
+		.cra_alignmask = 0,
+		.cra_module = THIS_MODULE,
+	},
+	.min_keysize = AES_MIN_KEY_SIZE,
+	.max_keysize = AES_MAX_KEY_SIZE,
+	.ivsize = AES_BLOCK_SIZE,
+	.setkey = nitrox_aes_setkey,
+	.encrypt = nitrox_aes_encrypt,
+	.decrypt = nitrox_aes_decrypt,
+	.init = nitrox_skcipher_init,
+	.exit = nitrox_skcipher_exit,
+}, {
+	.base = {
+		.cra_name = "cfb(aes)",
+		.cra_driver_name = "n5_cfb(aes)",
+		.cra_priority = PRIO,
+		.cra_flags = CRYPTO_ALG_ASYNC,
+		.cra_blocksize = AES_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
+		.cra_alignmask = 0,
+		.cra_module = THIS_MODULE,
+	},
+	.min_keysize = AES_MIN_KEY_SIZE,
+	.max_keysize = AES_MAX_KEY_SIZE,
+	.ivsize = AES_BLOCK_SIZE,
+	.setkey = nitrox_aes_setkey,
+	.encrypt = nitrox_aes_encrypt,
+	.decrypt = nitrox_aes_decrypt,
+	.init = nitrox_skcipher_init,
+	.exit = nitrox_skcipher_exit,
+}, {
+	.base = {
+		.cra_name = "xts(aes)",
+		.cra_driver_name = "n5_xts(aes)",
+		.cra_priority = PRIO,
+		.cra_flags = CRYPTO_ALG_ASYNC,
+		.cra_blocksize = AES_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
+		.cra_alignmask = 0,
+		.cra_module = THIS_MODULE,
+	},
+	.min_keysize = 2 * AES_MIN_KEY_SIZE,
+	.max_keysize = 2 * AES_MAX_KEY_SIZE,
+	.ivsize = AES_BLOCK_SIZE,
+	.setkey = nitrox_aes_xts_setkey,
+	.encrypt = nitrox_aes_encrypt,
+	.decrypt = nitrox_aes_decrypt,
+	.init = nitrox_skcipher_init,
+	.exit = nitrox_skcipher_exit,
+}, {
+	.base = {
+		.cra_name = "rfc3686(ctr(aes))",
+		.cra_driver_name = "n5_rfc3686(ctr(aes))",
+		.cra_priority = PRIO,
+		.cra_flags = CRYPTO_ALG_ASYNC,
+		.cra_blocksize = 1,
+		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
+		.cra_alignmask = 0,
+		.cra_module = THIS_MODULE,
+	},
+	.min_keysize = AES_MIN_KEY_SIZE + CTR_RFC3686_NONCE_SIZE,
+	.max_keysize = AES_MAX_KEY_SIZE + CTR_RFC3686_NONCE_SIZE,
+	.ivsize = CTR_RFC3686_IV_SIZE,
+	.init = nitrox_skcipher_init,
+	.exit = nitrox_skcipher_exit,
+	.setkey = nitrox_aes_ctr_rfc3686_setkey,
+	.encrypt = nitrox_aes_encrypt,
+	.decrypt = nitrox_aes_decrypt,
+}, {
+	.base = {
+		.cra_name = "cts(cbc(aes))",
+		.cra_driver_name = "n5_cts(cbc(aes))",
+		.cra_priority = PRIO,
+		.cra_flags = CRYPTO_ALG_ASYNC,
+		.cra_blocksize = AES_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
+		.cra_alignmask = 0,
+		.cra_module = THIS_MODULE,
+	},
+	.min_keysize = AES_MIN_KEY_SIZE,
+	.max_keysize = AES_MAX_KEY_SIZE,
+	.ivsize = AES_BLOCK_SIZE,
+	.setkey = nitrox_aes_setkey,
+	.encrypt = nitrox_aes_encrypt,
+	.decrypt = nitrox_aes_decrypt,
+	.init = nitrox_skcipher_init,
+	.exit = nitrox_skcipher_exit,
+}, {
+	.base = {
+		.cra_name = "cbc(des3_ede)",
+		.cra_driver_name = "n5_cbc(des3_ede)",
+		.cra_priority = PRIO,
+		.cra_flags = CRYPTO_ALG_ASYNC,
+		.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
+		.cra_alignmask = 0,
+		.cra_module = THIS_MODULE,
+	},
+	.min_keysize = DES3_EDE_KEY_SIZE,
+	.max_keysize = DES3_EDE_KEY_SIZE,
+	.ivsize = DES3_EDE_BLOCK_SIZE,
+	.setkey = nitrox_3des_setkey,
+	.encrypt = nitrox_3des_encrypt,
+	.decrypt = nitrox_3des_decrypt,
+	.init = nitrox_skcipher_init,
+	.exit = nitrox_skcipher_exit,
+}, {
+	.base = {
+		.cra_name = "ecb(des3_ede)",
+		.cra_driver_name = "n5_ecb(des3_ede)",
+		.cra_priority = PRIO,
+		.cra_flags = CRYPTO_ALG_ASYNC,
+		.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+		.cra_ctxsize = sizeof(struct nitrox_crypto_ctx),
+		.cra_alignmask = 0,
+		.cra_module = THIS_MODULE,
+	},
+	.min_keysize = DES3_EDE_KEY_SIZE,
+	.max_keysize = DES3_EDE_KEY_SIZE,
+	.ivsize = DES3_EDE_BLOCK_SIZE,
+	.setkey = nitrox_3des_setkey,
+	.encrypt = nitrox_3des_encrypt,
+	.decrypt = nitrox_3des_decrypt,
+	.init = nitrox_skcipher_init,
+	.exit = nitrox_skcipher_exit,
+}
+
+};
+
+int nitrox_register_skciphers(void)
+{
+	return crypto_register_skciphers(nitrox_skciphers,
+					 ARRAY_SIZE(nitrox_skciphers));
+}
+
+void nitrox_unregister_skciphers(void)
+{
+	crypto_unregister_skciphers(nitrox_skciphers,
+				    ARRAY_SIZE(nitrox_skciphers));
+}
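
For orientation only, and not part of the patch: a minimal sketch of how a kernel user reaches the skciphers registered above through the generic crypto API. The demo_* names and the choice of "cbc(aes)" are illustrative; whichever implementation has the highest priority (n5_cbc(aes) once this driver is loaded) is what crypto_skcipher_encrypt() dispatches to, ending up in nitrox_skcipher_crypt().

/* Illustrative caller, assuming a kmalloc'ed (DMA-able) buffer whose length
 * is a multiple of AES_BLOCK_SIZE. Only standard skcipher APIs are used.
 */
#include <crypto/skcipher.h>
#include <linux/scatterlist.h>
#include <linux/err.h>

static int demo_cbc_aes_encrypt(u8 *buf, unsigned int len,
				const u8 *key, unsigned int keylen, u8 *iv)
{
	struct crypto_skcipher *tfm;
	struct skcipher_request *req;
	struct scatterlist sg;
	DECLARE_CRYPTO_WAIT(wait);
	int ret;

	tfm = crypto_alloc_skcipher("cbc(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	ret = crypto_skcipher_setkey(tfm, key, keylen);
	if (ret)
		goto free_tfm;

	req = skcipher_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		ret = -ENOMEM;
		goto free_tfm;
	}

	/* encrypt in place: src == dst, which the driver maps bidirectionally */
	sg_init_one(&sg, buf, len);
	skcipher_request_set_callback(req,
				      CRYPTO_TFM_REQ_MAY_BACKLOG |
				      CRYPTO_TFM_REQ_MAY_SLEEP,
				      crypto_req_done, &wait);
	skcipher_request_set_crypt(req, &sg, &sg, len, iv);

	/* an asynchronous return makes crypto_wait_req() sleep until
	 * nitrox_skcipher_callback() completes the request
	 */
	ret = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);

	skcipher_request_free(req);
free_tfm:
	crypto_free_skcipher(tfm);
	return ret;
}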
diff --git a/drivers/crypto/cavium/nitrox/nitrox_sriov.c b/drivers/crypto/cavium/nitrox/nitrox_sriov.c
index 30c0aa87458373fdfeb17fb08c6efc03b0c59ad5..bf439d8256ba0c841152096d939411506b23abd7 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_sriov.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_sriov.c
@@ -6,7 +6,12 @@
 #include "nitrox_hal.h"
 #include "nitrox_common.h"
 #include "nitrox_isr.h"
+#include "nitrox_mbx.h"
 
+/**
+ * num_vfs_valid - validate VF count
+ * @num_vfs: number of VF(s)
+ */
 static inline bool num_vfs_valid(int num_vfs)
 {
 	bool valid = false;
@@ -48,7 +53,32 @@ static inline enum vf_mode num_vfs_to_mode(int num_vfs)
 	return mode;
 }
 
-static void pf_sriov_cleanup(struct nitrox_device *ndev)
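+/**
+ * vf_mode_to_nr_queues - maximum number of queues per VF (or the PF) in @mode
+ * @mode: SR-IOV mode of the device
+ */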
+static inline int vf_mode_to_nr_queues(enum vf_mode mode)
+{
+	int nr_queues = 0;
+
+	switch (mode) {
+	case __NDEV_MODE_PF:
+		nr_queues = MAX_PF_QUEUES;
+		break;
+	case __NDEV_MODE_VF16:
+		nr_queues = 8;
+		break;
+	case __NDEV_MODE_VF32:
+		nr_queues = 4;
+		break;
+	case __NDEV_MODE_VF64:
+		nr_queues = 2;
+		break;
+	case __NDEV_MODE_VF128:
+		nr_queues = 1;
+		break;
+	}
+
+	return nr_queues;
+}
+
+static void nitrox_pf_cleanup(struct nitrox_device *ndev)
 {
 	 /* PF has no queues in SR-IOV mode */
 	atomic_set(&ndev->state, __NDEV_NOT_READY);
@@ -60,7 +90,11 @@ static void pf_sriov_cleanup(struct nitrox_device *ndev)
 	nitrox_common_sw_cleanup(ndev);
 }
 
-static int pf_sriov_init(struct nitrox_device *ndev)
+/**
+ * nitrox_pf_reinit - re-initialize PF resources once SR-IOV is disabled
+ * @ndev: NITROX device
+ */
+static int nitrox_pf_reinit(struct nitrox_device *ndev)
 {
 	int err;
 
@@ -86,6 +120,33 @@ static int pf_sriov_init(struct nitrox_device *ndev)
 	return nitrox_crypto_register();
 }
 
+static void nitrox_sriov_cleanup(struct nitrox_device *ndev)
+{
+	/* unregister interrupts for PF in SR-IOV */
+	nitrox_sriov_unregister_interrupts(ndev);
+	nitrox_mbox_cleanup(ndev);
+}
+
+static int nitrox_sriov_init(struct nitrox_device *ndev)
+{
+	int ret;
+
+	/* register interrupts for PF in SR-IOV */
+	ret = nitrox_sriov_register_interupts(ndev);
+	if (ret)
+		return ret;
+
+	ret = nitrox_mbox_init(ndev);
+	if (ret)
+		goto sriov_init_fail;
+
+	return 0;
+
+sriov_init_fail:
+	nitrox_sriov_cleanup(ndev);
+	return ret;
+}
+
 static int nitrox_sriov_enable(struct pci_dev *pdev, int num_vfs)
 {
 	struct nitrox_device *ndev = pci_get_drvdata(pdev);
@@ -106,17 +167,32 @@ static int nitrox_sriov_enable(struct pci_dev *pdev, int num_vfs)
 	}
 	dev_info(DEV(ndev), "Enabled VF(s) %d\n", num_vfs);
 
-	ndev->num_vfs = num_vfs;
 	ndev->mode = num_vfs_to_mode(num_vfs);
+	ndev->iov.num_vfs = num_vfs;
+	ndev->iov.max_vf_queues = vf_mode_to_nr_queues(ndev->mode);
 	/* set bit in flags */
 	set_bit(__NDEV_SRIOV_BIT, &ndev->flags);
 
 	/* cleanup PF resources */
-	pf_sriov_cleanup(ndev);
+	nitrox_pf_cleanup(ndev);
 
-	config_nps_core_vfcfg_mode(ndev, ndev->mode);
+	/* PF SR-IOV mode initialization */
+	err = nitrox_sriov_init(ndev);
+	if (err)
+		goto iov_fail;
 
+	config_nps_core_vfcfg_mode(ndev, ndev->mode);
 	return num_vfs;
+
+iov_fail:
+	pci_disable_sriov(pdev);
+	/* clear bit in flags */
+	clear_bit(__NDEV_SRIOV_BIT, &ndev->flags);
+	ndev->iov.num_vfs = 0;
+	ndev->mode = __NDEV_MODE_PF;
+	/* reset back to working mode in PF */
+	nitrox_pf_reinit(ndev);
+	return err;
 }
 
 static int nitrox_sriov_disable(struct pci_dev *pdev)
@@ -134,12 +210,16 @@ static int nitrox_sriov_disable(struct pci_dev *pdev)
 	/* clear bit in flags */
 	clear_bit(__NDEV_SRIOV_BIT, &ndev->flags);
 
-	ndev->num_vfs = 0;
+	ndev->iov.num_vfs = 0;
+	ndev->iov.max_vf_queues = 0;
 	ndev->mode = __NDEV_MODE_PF;
 
+	/* cleanup PF SR-IOV resources */
+	nitrox_sriov_cleanup(ndev);
+
 	config_nps_core_vfcfg_mode(ndev, ndev->mode);
 
-	return pf_sriov_init(ndev);
+	return nitrox_pf_reinit(ndev);
 }
 
 int nitrox_sriov_configure(struct pci_dev *pdev, int num_vfs)
diff --git a/drivers/crypto/ccp/ccp-crypto-aes-cmac.c b/drivers/crypto/ccp/ccp-crypto-aes-cmac.c
index 3c6fe57f91f8c0bb7cf21ce4d026c20ac50377d4..9108015e56cc55747468d0a67a29da7cb3d41a71 100644
--- a/drivers/crypto/ccp/ccp-crypto-aes-cmac.c
+++ b/drivers/crypto/ccp/ccp-crypto-aes-cmac.c
@@ -346,9 +346,7 @@ static int ccp_aes_cmac_cra_init(struct crypto_tfm *tfm)
 
 	crypto_ahash_set_reqsize(ahash, sizeof(struct ccp_aes_cmac_req_ctx));
 
-	cipher_tfm = crypto_alloc_cipher("aes", 0,
-					 CRYPTO_ALG_ASYNC |
-					 CRYPTO_ALG_NEED_FALLBACK);
+	cipher_tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_NEED_FALLBACK);
 	if (IS_ERR(cipher_tfm)) {
 		pr_warn("could not load aes cipher driver\n");
 		return PTR_ERR(cipher_tfm);
diff --git a/drivers/crypto/ccree/cc_aead.c b/drivers/crypto/ccree/cc_aead.c
index 01b82b82f8b8761fa775952ec4165b63b9522eec..f2643cda45db9eee4dfd06cf660e0aa4b878863a 100644
--- a/drivers/crypto/ccree/cc_aead.c
+++ b/drivers/crypto/ccree/cc_aead.c
@@ -58,6 +58,7 @@ struct cc_aead_ctx {
 	unsigned int enc_keylen;
 	unsigned int auth_keylen;
 	unsigned int authsize; /* Actual (reduced?) size of the MAC/ICv */
+	unsigned int hash_len;
 	enum drv_cipher_mode cipher_mode;
 	enum cc_flow_mode flow_mode;
 	enum drv_hash_mode auth_mode;
@@ -122,6 +123,13 @@ static void cc_aead_exit(struct crypto_aead *tfm)
 	}
 }
 
+static unsigned int cc_get_aead_hash_len(struct crypto_aead *tfm)
+{
+	struct cc_aead_ctx *ctx = crypto_aead_ctx(tfm);
+
+	return cc_get_default_hash_len(ctx->drvdata);
+}
+
 static int cc_aead_init(struct crypto_aead *tfm)
 {
 	struct aead_alg *alg = crypto_aead_alg(tfm);
@@ -196,6 +204,7 @@ static int cc_aead_init(struct crypto_aead *tfm)
 		ctx->auth_state.hmac.ipad_opad = NULL;
 		ctx->auth_state.hmac.padded_authkey = NULL;
 	}
+	ctx->hash_len = cc_get_aead_hash_len(tfm);
 
 	return 0;
 
@@ -327,7 +336,7 @@ static int hmac_setkey(struct cc_hw_desc *desc, struct cc_aead_ctx *ctx)
 		/* Load the hash current length*/
 		hw_desc_init(&desc[idx]);
 		set_cipher_mode(&desc[idx], hash_mode);
-		set_din_const(&desc[idx], 0, ctx->drvdata->hash_len_sz);
+		set_din_const(&desc[idx], 0, ctx->hash_len);
 		set_flow_mode(&desc[idx], S_DIN_to_HASH);
 		set_setup_mode(&desc[idx], SETUP_LOAD_KEY0);
 		idx++;
@@ -465,7 +474,7 @@ static int cc_get_plain_hmac_key(struct crypto_aead *tfm, const u8 *key,
 			/* Load the hash current length*/
 			hw_desc_init(&desc[idx]);
 			set_cipher_mode(&desc[idx], hashmode);
-			set_din_const(&desc[idx], 0, ctx->drvdata->hash_len_sz);
+			set_din_const(&desc[idx], 0, ctx->hash_len);
 			set_cipher_config1(&desc[idx], HASH_PADDING_ENABLED);
 			set_flow_mode(&desc[idx], S_DIN_to_HASH);
 			set_setup_mode(&desc[idx], SETUP_LOAD_KEY0);
@@ -1001,7 +1010,7 @@ static void cc_set_hmac_desc(struct aead_request *req, struct cc_hw_desc desc[],
 	hw_desc_init(&desc[idx]);
 	set_cipher_mode(&desc[idx], hash_mode);
 	set_din_sram(&desc[idx], cc_digest_len_addr(ctx->drvdata, hash_mode),
-		     ctx->drvdata->hash_len_sz);
+		     ctx->hash_len);
 	set_flow_mode(&desc[idx], S_DIN_to_HASH);
 	set_setup_mode(&desc[idx], SETUP_LOAD_KEY0);
 	idx++;
@@ -1098,7 +1107,7 @@ static void cc_proc_scheme_desc(struct aead_request *req,
 	hw_desc_init(&desc[idx]);
 	set_cipher_mode(&desc[idx], hash_mode);
 	set_dout_sram(&desc[idx], aead_handle->sram_workspace_addr,
-		      ctx->drvdata->hash_len_sz);
+		      ctx->hash_len);
 	set_flow_mode(&desc[idx], S_HASH_to_DOUT);
 	set_setup_mode(&desc[idx], SETUP_WRITE_STATE1);
 	set_cipher_do(&desc[idx], DO_PAD);
@@ -1128,7 +1137,7 @@ static void cc_proc_scheme_desc(struct aead_request *req,
 	hw_desc_init(&desc[idx]);
 	set_cipher_mode(&desc[idx], hash_mode);
 	set_din_sram(&desc[idx], cc_digest_len_addr(ctx->drvdata, hash_mode),
-		     ctx->drvdata->hash_len_sz);
+		     ctx->hash_len);
 	set_cipher_config1(&desc[idx], HASH_PADDING_ENABLED);
 	set_flow_mode(&desc[idx], S_DIN_to_HASH);
 	set_setup_mode(&desc[idx], SETUP_LOAD_KEY0);
@@ -2358,6 +2367,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_SHA1,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "authenc(hmac(sha1),cbc(des3_ede))",
@@ -2377,6 +2387,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_DES,
 		.auth_mode = DRV_HASH_SHA1,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "authenc(hmac(sha256),cbc(aes))",
@@ -2396,6 +2407,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_SHA256,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "authenc(hmac(sha256),cbc(des3_ede))",
@@ -2415,6 +2427,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_DES,
 		.auth_mode = DRV_HASH_SHA256,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "authenc(xcbc(aes),cbc(aes))",
@@ -2434,6 +2447,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_XCBC_MAC,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "authenc(hmac(sha1),rfc3686(ctr(aes)))",
@@ -2453,6 +2467,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_SHA1,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "authenc(hmac(sha256),rfc3686(ctr(aes)))",
@@ -2472,6 +2487,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_SHA256,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "authenc(xcbc(aes),rfc3686(ctr(aes)))",
@@ -2491,6 +2507,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_XCBC_MAC,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "ccm(aes)",
@@ -2510,6 +2527,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_NULL,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "rfc4309(ccm(aes))",
@@ -2529,6 +2547,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_NULL,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "gcm(aes)",
@@ -2548,6 +2567,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_NULL,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "rfc4106(gcm(aes))",
@@ -2567,6 +2587,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_NULL,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "rfc4543(gcm(aes))",
@@ -2586,6 +2607,7 @@ static struct cc_alg_template aead_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.auth_mode = DRV_HASH_NULL,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 };
 
@@ -2670,7 +2692,8 @@ int cc_aead_alloc(struct cc_drvdata *drvdata)
 
 	/* Linux crypto */
 	for (alg = 0; alg < ARRAY_SIZE(aead_algs); alg++) {
-		if (aead_algs[alg].min_hw_rev > drvdata->hw_rev)
+		if ((aead_algs[alg].min_hw_rev > drvdata->hw_rev) ||
+		    !(drvdata->std_bodies & aead_algs[alg].std_body))
 			continue;
 
 		t_alg = cc_create_aead_alg(&aead_algs[alg], dev);
diff --git a/drivers/crypto/ccree/cc_cipher.c b/drivers/crypto/ccree/cc_cipher.c
index 7623b29911af443ed62491acad2916f078e58a01..cc92b031fad118f2f6c546c72d0bd345619185d7 100644
--- a/drivers/crypto/ccree/cc_cipher.c
+++ b/drivers/crypto/ccree/cc_cipher.c
@@ -7,6 +7,7 @@
 #include <crypto/internal/skcipher.h>
 #include <crypto/des.h>
 #include <crypto/xts.h>
+#include <crypto/sm4.h>
 #include <crypto/scatterwalk.h>
 
 #include "cc_driver.h"
@@ -83,6 +84,9 @@ static int validate_keys_sizes(struct cc_cipher_ctx *ctx_p, u32 size)
 		if (size == DES3_EDE_KEY_SIZE || size == DES_KEY_SIZE)
 			return 0;
 		break;
+	case S_DIN_to_SM4:
+		if (size == SM4_KEY_SIZE)
+			return 0;
 	default:
 		break;
 	}
@@ -122,6 +126,17 @@ static int validate_data_size(struct cc_cipher_ctx *ctx_p,
 		if (IS_ALIGNED(size, DES_BLOCK_SIZE))
 			return 0;
 		break;
+	case S_DIN_to_SM4:
+		switch (ctx_p->cipher_mode) {
+		case DRV_CIPHER_CTR:
+			return 0;
+		case DRV_CIPHER_ECB:
+		case DRV_CIPHER_CBC:
+			if (IS_ALIGNED(size, SM4_BLOCK_SIZE))
+				return 0;
+		default:
+			break;
+		}
 	default:
 		break;
 	}
@@ -522,6 +537,9 @@ static void cc_setup_cipher_data(struct crypto_tfm *tfm,
 	case S_DIN_to_DES:
 		flow_mode = DIN_DES_DOUT;
 		break;
+	case S_DIN_to_SM4:
+		flow_mode = DIN_SM4_DOUT;
+		break;
 	default:
 		dev_err(dev, "invalid flow mode, flow_mode = %d\n", flow_mode);
 		return;
@@ -815,6 +833,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_XTS,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "xts512(paes)",
@@ -832,6 +851,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 512,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "xts4096(paes)",
@@ -849,6 +869,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 4096,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "essiv(paes)",
@@ -865,6 +886,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_ESSIV,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "essiv512(paes)",
@@ -882,6 +904,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 512,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "essiv4096(paes)",
@@ -899,6 +922,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 4096,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "bitlocker(paes)",
@@ -915,6 +939,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_BITLOCKER,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "bitlocker512(paes)",
@@ -932,6 +957,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 512,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "bitlocker4096(paes)",
@@ -949,6 +975,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 4096,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "ecb(paes)",
@@ -965,6 +992,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_ECB,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "cbc(paes)",
@@ -981,6 +1009,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_CBC,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "ofb(paes)",
@@ -997,6 +1026,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_OFB,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "cts(cbc(paes))",
@@ -1013,6 +1043,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_CBC_CTS,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "ctr(paes)",
@@ -1029,6 +1060,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_CTR,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "xts(aes)",
@@ -1045,6 +1077,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_XTS,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "xts512(aes)",
@@ -1062,6 +1095,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 512,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "xts4096(aes)",
@@ -1079,6 +1113,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 4096,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "essiv(aes)",
@@ -1095,6 +1130,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_ESSIV,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "essiv512(aes)",
@@ -1112,6 +1148,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 512,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "essiv4096(aes)",
@@ -1129,6 +1166,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 4096,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "bitlocker(aes)",
@@ -1145,6 +1183,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_BITLOCKER,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "bitlocker512(aes)",
@@ -1162,6 +1201,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 512,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "bitlocker4096(aes)",
@@ -1179,6 +1219,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.flow_mode = S_DIN_to_AES,
 		.data_unit = 4096,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "ecb(aes)",
@@ -1195,6 +1236,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_ECB,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "cbc(aes)",
@@ -1211,6 +1253,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_CBC,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "ofb(aes)",
@@ -1227,6 +1270,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_OFB,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "cts(cbc(aes))",
@@ -1243,6 +1287,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_CBC_CTS,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "ctr(aes)",
@@ -1259,6 +1304,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_CTR,
 		.flow_mode = S_DIN_to_AES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "cbc(des3_ede)",
@@ -1275,6 +1321,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_CBC,
 		.flow_mode = S_DIN_to_DES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "ecb(des3_ede)",
@@ -1291,6 +1338,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_ECB,
 		.flow_mode = S_DIN_to_DES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "cbc(des)",
@@ -1307,6 +1355,7 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_CBC,
 		.flow_mode = S_DIN_to_DES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "ecb(des)",
@@ -1323,6 +1372,58 @@ static const struct cc_alg_template skcipher_algs[] = {
 		.cipher_mode = DRV_CIPHER_ECB,
 		.flow_mode = S_DIN_to_DES,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
+	},
+	{
+		.name = "cbc(sm4)",
+		.driver_name = "cbc-sm4-ccree",
+		.blocksize = SM4_BLOCK_SIZE,
+		.template_skcipher = {
+			.setkey = cc_cipher_setkey,
+			.encrypt = cc_cipher_encrypt,
+			.decrypt = cc_cipher_decrypt,
+			.min_keysize = SM4_KEY_SIZE,
+			.max_keysize = SM4_KEY_SIZE,
+			.ivsize = SM4_BLOCK_SIZE,
+			},
+		.cipher_mode = DRV_CIPHER_CBC,
+		.flow_mode = S_DIN_to_SM4,
+		.min_hw_rev = CC_HW_REV_713,
+		.std_body = CC_STD_OSCCA,
+	},
+	{
+		.name = "ecb(sm4)",
+		.driver_name = "ecb-sm4-ccree",
+		.blocksize = SM4_BLOCK_SIZE,
+		.template_skcipher = {
+			.setkey = cc_cipher_setkey,
+			.encrypt = cc_cipher_encrypt,
+			.decrypt = cc_cipher_decrypt,
+			.min_keysize = SM4_KEY_SIZE,
+			.max_keysize = SM4_KEY_SIZE,
+			.ivsize = 0,
+			},
+		.cipher_mode = DRV_CIPHER_ECB,
+		.flow_mode = S_DIN_to_SM4,
+		.min_hw_rev = CC_HW_REV_713,
+		.std_body = CC_STD_OSCCA,
+	},
+	{
+		.name = "ctr(sm4)",
+		.driver_name = "ctr-sm4-ccree",
+		.blocksize = SM4_BLOCK_SIZE,
+		.template_skcipher = {
+			.setkey = cc_cipher_setkey,
+			.encrypt = cc_cipher_encrypt,
+			.decrypt = cc_cipher_decrypt,
+			.min_keysize = SM4_KEY_SIZE,
+			.max_keysize = SM4_KEY_SIZE,
+			.ivsize = SM4_BLOCK_SIZE,
+			},
+		.cipher_mode = DRV_CIPHER_CTR,
+		.flow_mode = S_DIN_to_SM4,
+		.min_hw_rev = CC_HW_REV_713,
+		.std_body = CC_STD_OSCCA,
 	},
 };
 
@@ -1398,7 +1499,8 @@ int cc_cipher_alloc(struct cc_drvdata *drvdata)
 	dev_dbg(dev, "Number of algorithms = %zu\n",
 		ARRAY_SIZE(skcipher_algs));
 	for (alg = 0; alg < ARRAY_SIZE(skcipher_algs); alg++) {
-		if (skcipher_algs[alg].min_hw_rev > drvdata->hw_rev)
+		if ((skcipher_algs[alg].min_hw_rev > drvdata->hw_rev) ||
+		    !(drvdata->std_bodies & skcipher_algs[alg].std_body))
 			continue;
 
 		dev_dbg(dev, "creating %s\n", skcipher_algs[alg].driver_name);
diff --git a/drivers/crypto/ccree/cc_crypto_ctx.h b/drivers/crypto/ccree/cc_crypto_ctx.h
index e032544f4e312403497d1878d11c2872d539a14f..c8dac273c5632abf04543652523a7de5e0529aff 100644
--- a/drivers/crypto/ccree/cc_crypto_ctx.h
+++ b/drivers/crypto/ccree/cc_crypto_ctx.h
@@ -115,7 +115,8 @@ enum drv_hash_mode {
 	DRV_HASH_CBC_MAC = 6,
 	DRV_HASH_XCBC_MAC = 7,
 	DRV_HASH_CMAC = 8,
-	DRV_HASH_MODE_NUM = 9,
+	DRV_HASH_SM3 = 9,
+	DRV_HASH_MODE_NUM = 10,
 	DRV_HASH_RESERVE32B = S32_MAX
 };
 
@@ -127,6 +128,7 @@ enum drv_hash_hw_mode {
 	DRV_HASH_HW_SHA512 = 4,
 	DRV_HASH_HW_SHA384 = 12,
 	DRV_HASH_HW_GHASH = 6,
+	DRV_HASH_HW_SM3 = 14,
 	DRV_HASH_HW_RESERVE32B = S32_MAX
 };
 
diff --git a/drivers/crypto/ccree/cc_driver.c b/drivers/crypto/ccree/cc_driver.c
index 1ff229c2aeab13a464ad573af163fb61783959cf..8ada308d72eea230ebfb75855324f811961472d9 100644
--- a/drivers/crypto/ccree/cc_driver.c
+++ b/drivers/crypto/ccree/cc_driver.c
@@ -39,23 +39,38 @@ struct cc_hw_data {
 	char *name;
 	enum cc_hw_rev rev;
 	u32 sig;
+	int std_bodies;
 };
 
 /* Hardware revisions defs. */
 
+/* The 703 is an OSCCA-only variant of the 713 */
+static const struct cc_hw_data cc703_hw = {
+	.name = "703", .rev = CC_HW_REV_713, .std_bodies = CC_STD_OSCCA
+};
+
+static const struct cc_hw_data cc713_hw = {
+	.name = "713", .rev = CC_HW_REV_713, .std_bodies = CC_STD_ALL
+};
+
 static const struct cc_hw_data cc712_hw = {
-	.name = "712", .rev = CC_HW_REV_712, .sig =  0xDCC71200U
+	.name = "712", .rev = CC_HW_REV_712, .sig =  0xDCC71200U,
+	.std_bodies = CC_STD_ALL
 };
 
 static const struct cc_hw_data cc710_hw = {
-	.name = "710", .rev = CC_HW_REV_710, .sig =  0xDCC63200U
+	.name = "710", .rev = CC_HW_REV_710, .sig =  0xDCC63200U,
+	.std_bodies = CC_STD_ALL
 };
 
 static const struct cc_hw_data cc630p_hw = {
-	.name = "630P", .rev = CC_HW_REV_630, .sig = 0xDCC63000U
+	.name = "630P", .rev = CC_HW_REV_630, .sig = 0xDCC63000U,
+	.std_bodies = CC_STD_ALL
 };
 
 static const struct of_device_id arm_ccree_dev_of_match[] = {
+	{ .compatible = "arm,cryptocell-703-ree", .data = &cc703_hw },
+	{ .compatible = "arm,cryptocell-713-ree", .data = &cc713_hw },
 	{ .compatible = "arm,cryptocell-712-ree", .data = &cc712_hw },
 	{ .compatible = "arm,cryptocell-710-ree", .data = &cc710_hw },
 	{ .compatible = "arm,cryptocell-630p-ree", .data = &cc630p_hw },
@@ -204,14 +219,13 @@ static int init_cc_resources(struct platform_device *plat_dev)
 	hw_rev = (struct cc_hw_data *)dev_id->data;
 	new_drvdata->hw_rev_name = hw_rev->name;
 	new_drvdata->hw_rev = hw_rev->rev;
+	new_drvdata->std_bodies = hw_rev->std_bodies;
 
 	if (hw_rev->rev >= CC_HW_REV_712) {
-		new_drvdata->hash_len_sz = HASH_LEN_SIZE_712;
 		new_drvdata->axim_mon_offset = CC_REG(AXIM_MON_COMP);
 		new_drvdata->sig_offset = CC_REG(HOST_SIGNATURE_712);
 		new_drvdata->ver_offset = CC_REG(HOST_VERSION_712);
 	} else {
-		new_drvdata->hash_len_sz = HASH_LEN_SIZE_630;
 		new_drvdata->axim_mon_offset = CC_REG(AXIM_MON_COMP8);
 		new_drvdata->sig_offset = CC_REG(HOST_SIGNATURE_630);
 		new_drvdata->ver_offset = CC_REG(HOST_VERSION_630);
@@ -297,15 +311,17 @@ static int init_cc_resources(struct platform_device *plat_dev)
 		return rc;
 	}
 
-	/* Verify correct mapping */
-	signature_val = cc_ioread(new_drvdata, new_drvdata->sig_offset);
-	if (signature_val != hw_rev->sig) {
-		dev_err(dev, "Invalid CC signature: SIGNATURE=0x%08X != expected=0x%08X\n",
-			signature_val, hw_rev->sig);
-		rc = -EINVAL;
-		goto post_clk_err;
+	if (hw_rev->rev <= CC_HW_REV_712) {
+		/* Verify correct mapping */
+		signature_val = cc_ioread(new_drvdata, new_drvdata->sig_offset);
+		if (signature_val != hw_rev->sig) {
+			dev_err(dev, "Invalid CC signature: SIGNATURE=0x%08X != expected=0x%08X\n",
+				signature_val, hw_rev->sig);
+			rc = -EINVAL;
+			goto post_clk_err;
+		}
+		dev_dbg(dev, "CC SIGNATURE=0x%08X\n", signature_val);
 	}
-	dev_dbg(dev, "CC SIGNATURE=0x%08X\n", signature_val);
 
 	/* Display HW versions */
 	dev_info(dev, "ARM CryptoCell %s Driver: HW version 0x%08X, Driver version %s\n",
@@ -461,6 +477,14 @@ int cc_clk_on(struct cc_drvdata *drvdata)
 	return 0;
 }
 
+unsigned int cc_get_default_hash_len(struct cc_drvdata *drvdata)
+{
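+	/* the size of the hash length counter field differs by HW revision */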
+	if (drvdata->hw_rev >= CC_HW_REV_712)
+		return HASH_LEN_SIZE_712;
+	else
+		return HASH_LEN_SIZE_630;
+}
+
 void cc_clk_off(struct cc_drvdata *drvdata)
 {
 	struct clk *clk = drvdata->clk;
diff --git a/drivers/crypto/ccree/cc_driver.h b/drivers/crypto/ccree/cc_driver.h
index d608a4faf662255260b23c6a508e11a12c34e3e8..5be7fd431b0525643d0abec2483e1fbfcf2a0762 100644
--- a/drivers/crypto/ccree/cc_driver.h
+++ b/drivers/crypto/ccree/cc_driver.h
@@ -36,12 +36,19 @@
 extern bool cc_dump_desc;
 extern bool cc_dump_bytes;
 
-#define DRV_MODULE_VERSION "4.0"
+#define DRV_MODULE_VERSION "5.0"
 
 enum cc_hw_rev {
 	CC_HW_REV_630 = 630,
 	CC_HW_REV_710 = 710,
-	CC_HW_REV_712 = 712
+	CC_HW_REV_712 = 712,
+	CC_HW_REV_713 = 713
+};
+
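+/* standards bodies whose algorithms a given HW variant may expose */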
+enum cc_std_body {
+	CC_STD_NIST = 0x1,
+	CC_STD_OSCCA = 0x2,
+	CC_STD_ALL = 0x3
 };
 
 #define CC_COHERENT_CACHE_PARAMS 0xEEE
@@ -127,10 +134,10 @@ struct cc_drvdata {
 	bool coherent;
 	char *hw_rev_name;
 	enum cc_hw_rev hw_rev;
-	u32 hash_len_sz;
 	u32 axim_mon_offset;
 	u32 sig_offset;
 	u32 ver_offset;
+	int std_bodies;
 };
 
 struct cc_crypto_alg {
@@ -156,6 +163,7 @@ struct cc_alg_template {
 	int flow_mode; /* Note: currently, refers to the cipher mode only. */
 	int auth_mode;
 	u32 min_hw_rev;
+	enum cc_std_body std_body;
 	unsigned int data_unit;
 	struct cc_drvdata *drvdata;
 };
@@ -182,6 +190,7 @@ int init_cc_regs(struct cc_drvdata *drvdata, bool is_probe);
 void fini_cc_regs(struct cc_drvdata *drvdata);
 int cc_clk_on(struct cc_drvdata *drvdata);
 void cc_clk_off(struct cc_drvdata *drvdata);
+unsigned int cc_get_default_hash_len(struct cc_drvdata *drvdata);
 
 static inline void cc_iowrite(struct cc_drvdata *drvdata, u32 reg, u32 val)
 {
diff --git a/drivers/crypto/ccree/cc_hash.c b/drivers/crypto/ccree/cc_hash.c
index b9313306c36feb729b2aa3f560d452d2bfa6577e..2c4ddc8fb76b24829e9e2c34053b7c0e39f2eee0 100644
--- a/drivers/crypto/ccree/cc_hash.c
+++ b/drivers/crypto/ccree/cc_hash.c
@@ -6,6 +6,7 @@
 #include <crypto/algapi.h>
 #include <crypto/hash.h>
 #include <crypto/md5.h>
+#include <crypto/sm3.h>
 #include <crypto/internal/hash.h>
 
 #include "cc_driver.h"
@@ -16,6 +17,7 @@
 
 #define CC_MAX_HASH_SEQ_LEN 12
 #define CC_MAX_OPAD_KEYS_SIZE CC_MAX_HASH_BLCK_SIZE
+#define CC_SM3_HASH_LEN_SIZE 8
 
 struct cc_hash_handle {
 	cc_sram_addr_t digest_len_sram_addr; /* const value in SRAM*/
@@ -43,6 +45,9 @@ static u64 sha384_init[] = {
 static u64 sha512_init[] = {
 	SHA512_H7, SHA512_H6, SHA512_H5, SHA512_H4,
 	SHA512_H3, SHA512_H2, SHA512_H1, SHA512_H0 };
+static const u32 sm3_init[] = {
+	SM3_IVH, SM3_IVG, SM3_IVF, SM3_IVE,
+	SM3_IVD, SM3_IVC, SM3_IVB, SM3_IVA };
 
 static void cc_setup_xcbc(struct ahash_request *areq, struct cc_hw_desc desc[],
 			  unsigned int *seq_size);
@@ -82,6 +87,7 @@ struct cc_hash_ctx {
 	int hash_mode;
 	int hw_mode;
 	int inter_digestsize;
+	unsigned int hash_len;
 	struct completion setkey_comp;
 	bool is_hmac;
 };
@@ -138,10 +144,10 @@ static void cc_init_req(struct device *dev, struct ahash_req_ctx *state,
 			    ctx->hash_mode == DRV_HASH_SHA384)
 				memcpy(state->digest_bytes_len,
 				       digest_len_sha512_init,
-				       ctx->drvdata->hash_len_sz);
+				       ctx->hash_len);
 			else
 				memcpy(state->digest_bytes_len, digest_len_init,
-				       ctx->drvdata->hash_len_sz);
+				       ctx->hash_len);
 		}
 
 		if (ctx->hash_mode != DRV_HASH_NULL) {
@@ -321,7 +327,7 @@ static int cc_fin_result(struct cc_hw_desc *desc, struct ahash_request *req,
 
 	/* Get final MAC result */
 	hw_desc_init(&desc[idx]);
-	set_cipher_mode(&desc[idx], ctx->hw_mode);
+	set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode);
 	/* TODO */
 	set_dout_dlli(&desc[idx], state->digest_result_dma_addr, digestsize,
 		      NS_BIT, 1);
@@ -367,7 +373,7 @@ static int cc_fin_hmac(struct cc_hw_desc *desc, struct ahash_request *req,
 	set_cipher_mode(&desc[idx], ctx->hw_mode);
 	set_din_sram(&desc[idx],
 		     cc_digest_len_addr(ctx->drvdata, ctx->hash_mode),
-		     ctx->drvdata->hash_len_sz);
+		     ctx->hash_len);
 	set_cipher_config1(&desc[idx], HASH_PADDING_ENABLED);
 	set_flow_mode(&desc[idx], S_DIN_to_HASH);
 	set_setup_mode(&desc[idx], SETUP_LOAD_KEY0);
@@ -440,7 +446,7 @@ static int cc_hash_digest(struct ahash_request *req)
 	 * digest
 	 */
 	hw_desc_init(&desc[idx]);
-	set_cipher_mode(&desc[idx], ctx->hw_mode);
+	set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode);
 	if (is_hmac) {
 		set_din_type(&desc[idx], DMA_DLLI, state->digest_buff_dma_addr,
 			     ctx->inter_digestsize, NS_BIT);
@@ -454,14 +460,14 @@ static int cc_hash_digest(struct ahash_request *req)
 
 	/* Load the hash current length */
 	hw_desc_init(&desc[idx]);
-	set_cipher_mode(&desc[idx], ctx->hw_mode);
+	set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode);
 
 	if (is_hmac) {
 		set_din_type(&desc[idx], DMA_DLLI,
 			     state->digest_bytes_len_dma_addr,
-			     ctx->drvdata->hash_len_sz, NS_BIT);
+			     ctx->hash_len, NS_BIT);
 	} else {
-		set_din_const(&desc[idx], 0, ctx->drvdata->hash_len_sz);
+		set_din_const(&desc[idx], 0, ctx->hash_len);
 		if (nbytes)
 			set_cipher_config1(&desc[idx], HASH_PADDING_ENABLED);
 		else
@@ -478,7 +484,7 @@ static int cc_hash_digest(struct ahash_request *req)
 		hw_desc_init(&desc[idx]);
 		set_cipher_mode(&desc[idx], ctx->hw_mode);
 		set_dout_dlli(&desc[idx], state->digest_buff_dma_addr,
-			      ctx->drvdata->hash_len_sz, NS_BIT, 0);
+			      ctx->hash_len, NS_BIT, 0);
 		set_flow_mode(&desc[idx], S_HASH_to_DOUT);
 		set_setup_mode(&desc[idx], SETUP_WRITE_STATE1);
 		set_cipher_do(&desc[idx], DO_PAD);
@@ -504,7 +510,7 @@ static int cc_restore_hash(struct cc_hw_desc *desc, struct cc_hash_ctx *ctx,
 {
 	/* Restore hash digest */
 	hw_desc_init(&desc[idx]);
-	set_cipher_mode(&desc[idx], ctx->hw_mode);
+	set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode);
 	set_din_type(&desc[idx], DMA_DLLI, state->digest_buff_dma_addr,
 		     ctx->inter_digestsize, NS_BIT);
 	set_flow_mode(&desc[idx], S_DIN_to_HASH);
@@ -513,10 +519,10 @@ static int cc_restore_hash(struct cc_hw_desc *desc, struct cc_hash_ctx *ctx,
 
 	/* Restore hash current length */
 	hw_desc_init(&desc[idx]);
-	set_cipher_mode(&desc[idx], ctx->hw_mode);
+	set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode);
 	set_cipher_config1(&desc[idx], HASH_PADDING_DISABLED);
 	set_din_type(&desc[idx], DMA_DLLI, state->digest_bytes_len_dma_addr,
-		     ctx->drvdata->hash_len_sz, NS_BIT);
+		     ctx->hash_len, NS_BIT);
 	set_flow_mode(&desc[idx], S_DIN_to_HASH);
 	set_setup_mode(&desc[idx], SETUP_LOAD_KEY0);
 	idx++;
@@ -576,7 +582,7 @@ static int cc_hash_update(struct ahash_request *req)
 
 	/* store the hash digest result in context */
 	hw_desc_init(&desc[idx]);
-	set_cipher_mode(&desc[idx], ctx->hw_mode);
+	set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode);
 	set_dout_dlli(&desc[idx], state->digest_buff_dma_addr,
 		      ctx->inter_digestsize, NS_BIT, 0);
 	set_flow_mode(&desc[idx], S_HASH_to_DOUT);
@@ -585,9 +591,9 @@ static int cc_hash_update(struct ahash_request *req)
 
 	/* store current hash length in context */
 	hw_desc_init(&desc[idx]);
-	set_cipher_mode(&desc[idx], ctx->hw_mode);
+	set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode);
 	set_dout_dlli(&desc[idx], state->digest_bytes_len_dma_addr,
-		      ctx->drvdata->hash_len_sz, NS_BIT, 1);
+		      ctx->hash_len, NS_BIT, 1);
 	set_queue_last_ind(ctx->drvdata, &desc[idx]);
 	set_flow_mode(&desc[idx], S_HASH_to_DOUT);
 	set_setup_mode(&desc[idx], SETUP_WRITE_STATE1);
@@ -649,9 +655,9 @@ static int cc_do_finup(struct ahash_request *req, bool update)
 	/* Pad the hash */
 	hw_desc_init(&desc[idx]);
 	set_cipher_do(&desc[idx], DO_PAD);
-	set_cipher_mode(&desc[idx], ctx->hw_mode);
+	set_hash_cipher_mode(&desc[idx], ctx->hw_mode, ctx->hash_mode);
 	set_dout_dlli(&desc[idx], state->digest_bytes_len_dma_addr,
-		      ctx->drvdata->hash_len_sz, NS_BIT, 0);
+		      ctx->hash_len, NS_BIT, 0);
 	set_setup_mode(&desc[idx], SETUP_WRITE_STATE1);
 	set_flow_mode(&desc[idx], S_HASH_to_DOUT);
 	idx++;
@@ -749,7 +755,7 @@ static int cc_hash_setkey(struct crypto_ahash *ahash, const u8 *key,
 			/* Load the hash current length*/
 			hw_desc_init(&desc[idx]);
 			set_cipher_mode(&desc[idx], ctx->hw_mode);
-			set_din_const(&desc[idx], 0, ctx->drvdata->hash_len_sz);
+			set_din_const(&desc[idx], 0, ctx->hash_len);
 			set_cipher_config1(&desc[idx], HASH_PADDING_ENABLED);
 			set_flow_mode(&desc[idx], S_DIN_to_HASH);
 			set_setup_mode(&desc[idx], SETUP_LOAD_KEY0);
@@ -831,7 +837,7 @@ static int cc_hash_setkey(struct crypto_ahash *ahash, const u8 *key,
 		/* Load the hash current length*/
 		hw_desc_init(&desc[idx]);
 		set_cipher_mode(&desc[idx], ctx->hw_mode);
-		set_din_const(&desc[idx], 0, ctx->drvdata->hash_len_sz);
+		set_din_const(&desc[idx], 0, ctx->hash_len);
 		set_flow_mode(&desc[idx], S_DIN_to_HASH);
 		set_setup_mode(&desc[idx], SETUP_LOAD_KEY0);
 		idx++;
@@ -1069,6 +1075,16 @@ static int cc_alloc_ctx(struct cc_hash_ctx *ctx)
 	return -ENOMEM;
 }
 
+static int cc_get_hash_len(struct crypto_tfm *tfm)
+{
+	struct cc_hash_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	if (ctx->hash_mode == DRV_HASH_SM3)
+		return CC_SM3_HASH_LEN_SIZE;
+	else
+		return cc_get_default_hash_len(ctx->drvdata);
+}
+
 static int cc_cra_init(struct crypto_tfm *tfm)
 {
 	struct cc_hash_ctx *ctx = crypto_tfm_ctx(tfm);
@@ -1086,7 +1102,7 @@ static int cc_cra_init(struct crypto_tfm *tfm)
 	ctx->hw_mode = cc_alg->hw_mode;
 	ctx->inter_digestsize = cc_alg->inter_digestsize;
 	ctx->drvdata = cc_alg->drvdata;
-
+	ctx->hash_len = cc_get_hash_len(tfm);
 	return cc_alloc_ctx(ctx);
 }
 
@@ -1465,8 +1481,8 @@ static int cc_hash_export(struct ahash_request *req, void *out)
 	memcpy(out, state->digest_buff, ctx->inter_digestsize);
 	out += ctx->inter_digestsize;
 
-	memcpy(out, state->digest_bytes_len, ctx->drvdata->hash_len_sz);
-	out += ctx->drvdata->hash_len_sz;
+	memcpy(out, state->digest_bytes_len, ctx->hash_len);
+	out += ctx->hash_len;
 
 	memcpy(out, &curr_buff_cnt, sizeof(u32));
 	out += sizeof(u32);
@@ -1494,8 +1510,8 @@ static int cc_hash_import(struct ahash_request *req, const void *in)
 	memcpy(state->digest_buff, in, ctx->inter_digestsize);
 	in += ctx->inter_digestsize;
 
-	memcpy(state->digest_bytes_len, in, ctx->drvdata->hash_len_sz);
-	in += ctx->drvdata->hash_len_sz;
+	memcpy(state->digest_bytes_len, in, ctx->hash_len);
+	in += ctx->hash_len;
 
 	/* Sanity check the data as much as possible */
 	memcpy(&tmp, in, sizeof(u32));
@@ -1515,6 +1531,7 @@ struct cc_hash_template {
 	char mac_name[CRYPTO_MAX_ALG_NAME];
 	char mac_driver_name[CRYPTO_MAX_ALG_NAME];
 	unsigned int blocksize;
+	bool is_mac;
 	bool synchronize;
 	struct ahash_alg template_ahash;
 	int hash_mode;
@@ -1522,6 +1539,7 @@ struct cc_hash_template {
 	int inter_digestsize;
 	struct cc_drvdata *drvdata;
 	u32 min_hw_rev;
+	enum cc_std_body std_body;
 };
 
 #define CC_STATE_SIZE(_x) \
@@ -1536,6 +1554,7 @@ static struct cc_hash_template driver_hash[] = {
 		.mac_name = "hmac(sha1)",
 		.mac_driver_name = "hmac-sha1-ccree",
 		.blocksize = SHA1_BLOCK_SIZE,
+		.is_mac = true,
 		.synchronize = false,
 		.template_ahash = {
 			.init = cc_hash_init,
@@ -1555,6 +1574,7 @@ static struct cc_hash_template driver_hash[] = {
 		.hw_mode = DRV_HASH_HW_SHA1,
 		.inter_digestsize = SHA1_DIGEST_SIZE,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "sha256",
@@ -1562,6 +1582,7 @@ static struct cc_hash_template driver_hash[] = {
 		.mac_name = "hmac(sha256)",
 		.mac_driver_name = "hmac-sha256-ccree",
 		.blocksize = SHA256_BLOCK_SIZE,
+		.is_mac = true,
 		.template_ahash = {
 			.init = cc_hash_init,
 			.update = cc_hash_update,
@@ -1580,6 +1601,7 @@ static struct cc_hash_template driver_hash[] = {
 		.hw_mode = DRV_HASH_HW_SHA256,
 		.inter_digestsize = SHA256_DIGEST_SIZE,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "sha224",
@@ -1587,6 +1609,7 @@ static struct cc_hash_template driver_hash[] = {
 		.mac_name = "hmac(sha224)",
 		.mac_driver_name = "hmac-sha224-ccree",
 		.blocksize = SHA224_BLOCK_SIZE,
+		.is_mac = true,
 		.template_ahash = {
 			.init = cc_hash_init,
 			.update = cc_hash_update,
@@ -1605,6 +1628,7 @@ static struct cc_hash_template driver_hash[] = {
 		.hw_mode = DRV_HASH_HW_SHA256,
 		.inter_digestsize = SHA256_DIGEST_SIZE,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "sha384",
@@ -1612,6 +1636,7 @@ static struct cc_hash_template driver_hash[] = {
 		.mac_name = "hmac(sha384)",
 		.mac_driver_name = "hmac-sha384-ccree",
 		.blocksize = SHA384_BLOCK_SIZE,
+		.is_mac = true,
 		.template_ahash = {
 			.init = cc_hash_init,
 			.update = cc_hash_update,
@@ -1630,6 +1655,7 @@ static struct cc_hash_template driver_hash[] = {
 		.hw_mode = DRV_HASH_HW_SHA512,
 		.inter_digestsize = SHA512_DIGEST_SIZE,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "sha512",
@@ -1637,6 +1663,7 @@ static struct cc_hash_template driver_hash[] = {
 		.mac_name = "hmac(sha512)",
 		.mac_driver_name = "hmac-sha512-ccree",
 		.blocksize = SHA512_BLOCK_SIZE,
+		.is_mac = true,
 		.template_ahash = {
 			.init = cc_hash_init,
 			.update = cc_hash_update,
@@ -1655,6 +1682,7 @@ static struct cc_hash_template driver_hash[] = {
 		.hw_mode = DRV_HASH_HW_SHA512,
 		.inter_digestsize = SHA512_DIGEST_SIZE,
 		.min_hw_rev = CC_HW_REV_712,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.name = "md5",
@@ -1662,6 +1690,7 @@ static struct cc_hash_template driver_hash[] = {
 		.mac_name = "hmac(md5)",
 		.mac_driver_name = "hmac-md5-ccree",
 		.blocksize = MD5_HMAC_BLOCK_SIZE,
+		.is_mac = true,
 		.template_ahash = {
 			.init = cc_hash_init,
 			.update = cc_hash_update,
@@ -1680,11 +1709,38 @@ static struct cc_hash_template driver_hash[] = {
 		.hw_mode = DRV_HASH_HW_MD5,
 		.inter_digestsize = MD5_DIGEST_SIZE,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
+	},
+	{
+		.name = "sm3",
+		.driver_name = "sm3-ccree",
+		.blocksize = SM3_BLOCK_SIZE,
+		.is_mac = false,
+		.template_ahash = {
+			.init = cc_hash_init,
+			.update = cc_hash_update,
+			.final = cc_hash_final,
+			.finup = cc_hash_finup,
+			.digest = cc_hash_digest,
+			.export = cc_hash_export,
+			.import = cc_hash_import,
+			.setkey = cc_hash_setkey,
+			.halg = {
+				.digestsize = SM3_DIGEST_SIZE,
+				.statesize = CC_STATE_SIZE(SM3_DIGEST_SIZE),
+			},
+		},
+		.hash_mode = DRV_HASH_SM3,
+		.hw_mode = DRV_HASH_HW_SM3,
+		.inter_digestsize = SM3_DIGEST_SIZE,
+		.min_hw_rev = CC_HW_REV_713,
+		.std_body = CC_STD_OSCCA,
 	},
 	{
 		.mac_name = "xcbc(aes)",
 		.mac_driver_name = "xcbc-aes-ccree",
 		.blocksize = AES_BLOCK_SIZE,
+		.is_mac = true,
 		.template_ahash = {
 			.init = cc_hash_init,
 			.update = cc_mac_update,
@@ -1703,11 +1759,13 @@ static struct cc_hash_template driver_hash[] = {
 		.hw_mode = DRV_CIPHER_XCBC_MAC,
 		.inter_digestsize = AES_BLOCK_SIZE,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 	{
 		.mac_name = "cmac(aes)",
 		.mac_driver_name = "cmac-aes-ccree",
 		.blocksize = AES_BLOCK_SIZE,
+		.is_mac = true,
 		.template_ahash = {
 			.init = cc_hash_init,
 			.update = cc_mac_update,
@@ -1726,6 +1784,7 @@ static struct cc_hash_template driver_hash[] = {
 		.hw_mode = DRV_CIPHER_CMAC,
 		.inter_digestsize = AES_BLOCK_SIZE,
 		.min_hw_rev = CC_HW_REV_630,
+		.std_body = CC_STD_NIST,
 	},
 };
 
@@ -1780,6 +1839,7 @@ int cc_init_hash_sram(struct cc_drvdata *drvdata)
 	unsigned int larval_seq_len = 0;
 	struct cc_hw_desc larval_seq[CC_DIGEST_SIZE_MAX / sizeof(u32)];
 	bool large_sha_supported = (drvdata->hw_rev >= CC_HW_REV_712);
+	bool sm3_supported = (drvdata->hw_rev >= CC_HW_REV_713);
 	int rc = 0;
 
 	/* Copy-to-sram digest-len */
@@ -1845,6 +1905,17 @@ int cc_init_hash_sram(struct cc_drvdata *drvdata)
 	sram_buff_ofs += sizeof(sha256_init);
 	larval_seq_len = 0;
 
+	if (sm3_supported) {
+		cc_set_sram_desc(sm3_init, sram_buff_ofs,
+				 ARRAY_SIZE(sm3_init), larval_seq,
+				 &larval_seq_len);
+		rc = send_request_init(drvdata, larval_seq, larval_seq_len);
+		if (rc)
+			goto init_digest_const_err;
+		sram_buff_ofs += sizeof(sm3_init);
+		larval_seq_len = 0;
+	}
+
 	if (large_sha_supported) {
 		cc_set_sram_desc((u32 *)sha384_init, sram_buff_ofs,
 				 (ARRAY_SIZE(sha384_init) * 2), larval_seq,
@@ -1911,6 +1982,9 @@ int cc_hash_alloc(struct cc_drvdata *drvdata)
 			sizeof(sha224_init) +
 			sizeof(sha256_init);
 
+	if (drvdata->hw_rev >= CC_HW_REV_713)
+		sram_size_to_alloc += sizeof(sm3_init);
+
 	if (drvdata->hw_rev >= CC_HW_REV_712)
 		sram_size_to_alloc += sizeof(digest_len_sha512_init) +
 			sizeof(sha384_init) + sizeof(sha512_init);
@@ -1937,30 +2011,33 @@ int cc_hash_alloc(struct cc_drvdata *drvdata)
 		struct cc_hash_alg *t_alg;
 		int hw_mode = driver_hash[alg].hw_mode;
 
-		/* We either support both HASH and MAC or none */
-		if (driver_hash[alg].min_hw_rev > drvdata->hw_rev)
+		/* Check that the HW revision and variants are suitable */
+		if ((driver_hash[alg].min_hw_rev > drvdata->hw_rev) ||
+		    !(drvdata->std_bodies & driver_hash[alg].std_body))
 			continue;
 
-		/* register hmac version */
-		t_alg = cc_alloc_hash_alg(&driver_hash[alg], dev, true);
-		if (IS_ERR(t_alg)) {
-			rc = PTR_ERR(t_alg);
-			dev_err(dev, "%s alg allocation failed\n",
-				driver_hash[alg].driver_name);
-			goto fail;
-		}
-		t_alg->drvdata = drvdata;
-
-		rc = crypto_register_ahash(&t_alg->ahash_alg);
-		if (rc) {
-			dev_err(dev, "%s alg registration failed\n",
-				driver_hash[alg].driver_name);
-			kfree(t_alg);
-			goto fail;
-		} else {
-			list_add_tail(&t_alg->entry, &hash_handle->hash_list);
+		if (driver_hash[alg].is_mac) {
+			/* register hmac version */
+			t_alg = cc_alloc_hash_alg(&driver_hash[alg], dev, true);
+			if (IS_ERR(t_alg)) {
+				rc = PTR_ERR(t_alg);
+				dev_err(dev, "%s alg allocation failed\n",
+					driver_hash[alg].driver_name);
+				goto fail;
+			}
+			t_alg->drvdata = drvdata;
+
+			rc = crypto_register_ahash(&t_alg->ahash_alg);
+			if (rc) {
+				dev_err(dev, "%s alg registration failed\n",
+					driver_hash[alg].driver_name);
+				kfree(t_alg);
+				goto fail;
+			} else {
+				list_add_tail(&t_alg->entry,
+					      &hash_handle->hash_list);
+			}
 		}
-
 		if (hw_mode == DRV_CIPHER_XCBC_MAC ||
 		    hw_mode == DRV_CIPHER_CMAC)
 			continue;
@@ -2027,7 +2104,7 @@ static void cc_setup_xcbc(struct ahash_request *areq, struct cc_hw_desc desc[],
 					    XCBC_MAC_K1_OFFSET),
 		     CC_AES_128_BIT_KEY_SIZE, NS_BIT);
 	set_setup_mode(&desc[idx], SETUP_LOAD_KEY0);
-	set_cipher_mode(&desc[idx], DRV_CIPHER_XCBC_MAC);
+	set_hash_cipher_mode(&desc[idx], DRV_CIPHER_XCBC_MAC, ctx->hash_mode);
 	set_cipher_config0(&desc[idx], DESC_DIRECTION_ENCRYPT_ENCRYPT);
 	set_key_size_aes(&desc[idx], CC_AES_128_BIT_KEY_SIZE);
 	set_flow_mode(&desc[idx], S_DIN_to_AES);
@@ -2162,6 +2239,8 @@ static const void *cc_larval_digest(struct device *dev, u32 mode)
 		return sha384_init;
 	case DRV_HASH_SHA512:
 		return sha512_init;
+	case DRV_HASH_SM3:
+		return sm3_init;
 	default:
 		dev_err(dev, "Invalid hash mode (%d)\n", mode);
 		return md5_init;
@@ -2182,6 +2261,8 @@ cc_sram_addr_t cc_larval_digest_addr(void *drvdata, u32 mode)
 	struct cc_drvdata *_drvdata = (struct cc_drvdata *)drvdata;
 	struct cc_hash_handle *hash_handle = _drvdata->hash_handle;
 	struct device *dev = drvdata_to_dev(_drvdata);
+	bool sm3_supported = (_drvdata->hw_rev >= CC_HW_REV_713);
+	cc_sram_addr_t addr;
 
 	switch (mode) {
 	case DRV_HASH_NULL:
@@ -2200,19 +2281,31 @@ cc_sram_addr_t cc_larval_digest_addr(void *drvdata, u32 mode)
 			sizeof(md5_init) +
 			sizeof(sha1_init) +
 			sizeof(sha224_init));
-	case DRV_HASH_SHA384:
+	case DRV_HASH_SM3:
 		return (hash_handle->larval_digest_sram_addr +
 			sizeof(md5_init) +
 			sizeof(sha1_init) +
 			sizeof(sha224_init) +
 			sizeof(sha256_init));
+	case DRV_HASH_SHA384:
+		addr = (hash_handle->larval_digest_sram_addr +
+			sizeof(md5_init) +
+			sizeof(sha1_init) +
+			sizeof(sha224_init) +
+			sizeof(sha256_init));
+		if (sm3_supported)
+			addr += sizeof(sm3_init);
+		return addr;
 	case DRV_HASH_SHA512:
-		return (hash_handle->larval_digest_sram_addr +
+		addr = (hash_handle->larval_digest_sram_addr +
 			sizeof(md5_init) +
 			sizeof(sha1_init) +
 			sizeof(sha224_init) +
 			sizeof(sha256_init) +
 			sizeof(sha384_init));
+		if (sm3_supported)
+			addr += sizeof(sm3_init);
+		return addr;
 	default:
 		dev_err(dev, "Invalid hash mode (%d)\n", mode);
 	}
diff --git a/drivers/crypto/ccree/cc_hw_queue_defs.h b/drivers/crypto/ccree/cc_hw_queue_defs.h
index 45985b955d2c899c1478ce43d2d279c6590d9e0a..7a9b90db7db7a7fdeb1f4a9157002fda9c3bc118 100644
--- a/drivers/crypto/ccree/cc_hw_queue_defs.h
+++ b/drivers/crypto/ccree/cc_hw_queue_defs.h
@@ -42,6 +42,7 @@
 #define WORD3_QUEUE_LAST_IND	CC_GENMASK(3, QUEUE_LAST_IND)
 #define WORD4_ACK_NEEDED	CC_GENMASK(4, ACK_NEEDED)
 #define WORD4_AES_SEL_N_HASH	CC_GENMASK(4, AES_SEL_N_HASH)
+#define WORD4_AES_XOR_CRYPTO_KEY CC_GENMASK(4, AES_XOR_CRYPTO_KEY)
 #define WORD4_BYTES_SWAP	CC_GENMASK(4, BYTES_SWAP)
 #define WORD4_CIPHER_CONF0	CC_GENMASK(4, CIPHER_CONF0)
 #define WORD4_CIPHER_CONF1	CC_GENMASK(4, CIPHER_CONF1)
@@ -107,6 +108,7 @@ enum cc_flow_mode {
 	AES_to_AES_to_HASH_and_DOUT	= 13,
 	AES_to_AES_to_HASH	= 14,
 	AES_to_HASH_and_AES	= 15,
+	DIN_SM4_DOUT		= 16,
 	DIN_AES_AESMAC		= 17,
 	HASH_to_DOUT		= 18,
 	/* setup flows */
@@ -114,9 +116,11 @@ enum cc_flow_mode {
 	S_DIN_to_AES2		= 33,
 	S_DIN_to_DES		= 34,
 	S_DIN_to_RC4		= 35,
+	S_DIN_to_SM4		= 36,
 	S_DIN_to_HASH		= 37,
 	S_AES_to_DOUT		= 38,
 	S_AES2_to_DOUT		= 39,
+	S_SM4_to_DOUT		= 40,
 	S_RC4_to_DOUT		= 41,
 	S_DES_to_DOUT		= 42,
 	S_HASH_to_DOUT		= 43,
@@ -393,6 +397,16 @@ static inline void set_aes_not_hash_mode(struct cc_hw_desc *pdesc)
 	pdesc->word[4] |= FIELD_PREP(WORD4_AES_SEL_N_HASH, 1);
 }
 
+/*
+ * Set AES XOR crypto key; in some scenarios this selects the SM3 engine
+ *
+ * @pdesc: pointer HW descriptor struct
+ */
+static inline void set_aes_xor_crypto_key(struct cc_hw_desc *pdesc)
+{
+	pdesc->word[4] |= FIELD_PREP(WORD4_AES_XOR_CRYPTO_KEY, 1);
+}
+
 /*
  * Set the DOUT field of a HW descriptors to SRAM mode
  * Note: No need to check SRAM alignment since host requests do not use SRAM and
@@ -454,6 +468,22 @@ static inline void set_cipher_mode(struct cc_hw_desc *pdesc, int mode)
 	pdesc->word[4] |= FIELD_PREP(WORD4_CIPHER_MODE, mode);
 }
 
+/*
+ * Set the cipher mode for hash algorithms.
+ *
+ * @pdesc: pointer HW descriptor struct
+ * @cipher_mode:  Any one of the modes defined in [CC7x-DESC]
+ * @hash_mode: specifies which hash is being handled
+ */
+static inline void set_hash_cipher_mode(struct cc_hw_desc *pdesc,
+					enum drv_cipher_mode cipher_mode,
+					enum drv_hash_mode hash_mode)
+{
+	set_cipher_mode(pdesc, cipher_mode);
+	if (hash_mode == DRV_HASH_SM3)
+		set_aes_xor_crypto_key(pdesc);
+}
+
 /*
  * Set the cipher configuration fields.
  *
diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c
index db203f8be4290de3784fe7f5ee7f4175e5426101..bcef76508dfaeeafaedca586b2be01cb7cb9598e 100644
--- a/drivers/crypto/chelsio/chcr_algo.c
+++ b/drivers/crypto/chelsio/chcr_algo.c
@@ -123,7 +123,7 @@ static inline struct chcr_authenc_ctx *AUTHENC_CTX(struct chcr_aead_ctx *gctx)
 
 static inline struct uld_ctx *ULD_CTX(struct chcr_context *ctx)
 {
-	return ctx->dev->u_ctx;
+	return container_of(ctx->dev, struct uld_ctx, dev);
 }
 
 static inline int is_ofld_imm(const struct sk_buff *skb)
@@ -198,18 +198,43 @@ void chcr_verify_tag(struct aead_request *req, u8 *input, int *err)
 		*err = 0;
 }
 
-static inline void chcr_handle_aead_resp(struct aead_request *req,
+static int chcr_inc_wrcount(struct chcr_dev *dev)
+{
+	int err = 0;
+
+	spin_lock_bh(&dev->lock_chcr_dev);
+	if (dev->state == CHCR_DETACH)
+		err = 1;
+	else
+		atomic_inc(&dev->inflight);
+
+	spin_unlock_bh(&dev->lock_chcr_dev);
+
+	return err;
+}
+
+static inline void chcr_dec_wrcount(struct chcr_dev *dev)
+{
+	atomic_dec(&dev->inflight);
+}
+
+static inline int chcr_handle_aead_resp(struct aead_request *req,
 					 unsigned char *input,
 					 int err)
 {
 	struct chcr_aead_reqctx *reqctx = aead_request_ctx(req);
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct chcr_dev *dev = a_ctx(tfm)->dev;
 
 	chcr_aead_common_exit(req);
 	if (reqctx->verify == VERIFY_SW) {
 		chcr_verify_tag(req, input, &err);
 		reqctx->verify = VERIFY_HW;
 	}
+	chcr_dec_wrcount(dev);
 	req->base.complete(&req->base, err);
+
+	return err;
 }
 
 static void get_aes_decrypt_key(unsigned char *dec_key,
@@ -391,7 +416,7 @@ static inline void dsgl_walk_end(struct dsgl_walk *walk, unsigned short qid,
 
 static inline void dsgl_walk_add_page(struct dsgl_walk *walk,
 					size_t size,
-					dma_addr_t *addr)
+					dma_addr_t addr)
 {
 	int j;
 
@@ -399,7 +424,7 @@ static inline void dsgl_walk_add_page(struct dsgl_walk *walk,
 		return;
 	j = walk->nents;
 	walk->to->len[j % 8] = htons(size);
-	walk->to->addr[j % 8] = cpu_to_be64(*addr);
+	walk->to->addr[j % 8] = cpu_to_be64(addr);
 	j++;
 	if ((j % 8) == 0)
 		walk->to++;
@@ -473,16 +498,16 @@ static inline void ulptx_walk_end(struct ulptx_walk *walk)
 
 static inline void ulptx_walk_add_page(struct ulptx_walk *walk,
 					size_t size,
-					dma_addr_t *addr)
+					dma_addr_t addr)
 {
 	if (!size)
 		return;
 
 	if (walk->nents == 0) {
 		walk->sgl->len0 = cpu_to_be32(size);
-		walk->sgl->addr0 = cpu_to_be64(*addr);
+		walk->sgl->addr0 = cpu_to_be64(addr);
 	} else {
-		walk->pair->addr[walk->pair_idx] = cpu_to_be64(*addr);
+		walk->pair->addr[walk->pair_idx] = cpu_to_be64(addr);
 		walk->pair->len[walk->pair_idx] = cpu_to_be32(size);
 		walk->pair_idx = !walk->pair_idx;
 		if (!walk->pair_idx)
@@ -717,7 +742,7 @@ static inline void create_wreq(struct chcr_context *ctx,
 		htonl(FW_CRYPTO_LOOKASIDE_WR_LEN16_V(DIV_ROUND_UP(len16, 16)));
 	chcr_req->wreq.cookie = cpu_to_be64((uintptr_t)req);
 	chcr_req->wreq.rx_chid_to_rx_q_id =
-		FILL_WR_RX_Q_ID(ctx->dev->rx_channel_id, qid,
+		FILL_WR_RX_Q_ID(ctx->tx_chan_id, qid,
 				!!lcb, ctx->tx_qidx);
 
 	chcr_req->ulptx.cmd_dest = FILL_ULPTX_CMD_DEST(ctx->tx_chan_id,
@@ -773,7 +798,7 @@ static struct sk_buff *create_cipher_wr(struct cipher_wr_param *wrparam)
 	}
 	chcr_req = __skb_put_zero(skb, transhdr_len);
 	chcr_req->sec_cpl.op_ivinsrtofst =
-		FILL_SEC_CPL_OP_IVINSR(c_ctx(tfm)->dev->rx_channel_id, 2, 1);
+		FILL_SEC_CPL_OP_IVINSR(c_ctx(tfm)->tx_chan_id, 2, 1);
 
 	chcr_req->sec_cpl.pldlen = htonl(IV + wrparam->bytes);
 	chcr_req->sec_cpl.aadstart_cipherstop_hi =
@@ -1100,6 +1125,7 @@ static int chcr_handle_cipher_resp(struct ablkcipher_request *req,
 	struct cpl_fw6_pld *fw6_pld = (struct cpl_fw6_pld *)input;
 	struct chcr_blkcipher_req_ctx *reqctx = ablkcipher_request_ctx(req);
 	struct  cipher_wr_param wrparam;
+	struct chcr_dev *dev = c_ctx(tfm)->dev;
 	int bytes;
 
 	if (err)
@@ -1161,6 +1187,7 @@ static int chcr_handle_cipher_resp(struct ablkcipher_request *req,
 unmap:
 	chcr_cipher_dma_unmap(&ULD_CTX(c_ctx(tfm))->lldi.pdev->dev, req);
 complete:
+	chcr_dec_wrcount(dev);
 	req->base.complete(&req->base, err);
 	return err;
 }
@@ -1187,7 +1214,10 @@ static int process_cipher(struct ablkcipher_request *req,
 		       ablkctx->enckey_len, req->nbytes, ivsize);
 		goto error;
 	}
-	chcr_cipher_dma_map(&ULD_CTX(c_ctx(tfm))->lldi.pdev->dev, req);
+
+	err = chcr_cipher_dma_map(&ULD_CTX(c_ctx(tfm))->lldi.pdev->dev, req);
+	if (err)
+		goto error;
 	if (req->nbytes < (SGE_MAX_WR_LEN - (sizeof(struct chcr_wr) +
 					    AES_MIN_KEY_SIZE +
 					    sizeof(struct cpl_rx_phys_dsgl) +
@@ -1276,15 +1306,21 @@ static int process_cipher(struct ablkcipher_request *req,
 static int chcr_aes_encrypt(struct ablkcipher_request *req)
 {
 	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct chcr_dev *dev = c_ctx(tfm)->dev;
 	struct sk_buff *skb = NULL;
 	int err, isfull = 0;
 	struct uld_ctx *u_ctx = ULD_CTX(c_ctx(tfm));
 
+	err = chcr_inc_wrcount(dev);
+	if (err)
+		return -ENXIO;
 	if (unlikely(cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0],
 					    c_ctx(tfm)->tx_qidx))) {
 		isfull = 1;
-		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG))
-			return -ENOSPC;
+		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) {
+			err = -ENOSPC;
+			goto error;
+		}
 	}
 
 	err = process_cipher(req, u_ctx->lldi.rxq_ids[c_ctx(tfm)->rx_qidx],
@@ -1295,15 +1331,23 @@ static int chcr_aes_encrypt(struct ablkcipher_request *req)
 	set_wr_txq(skb, CPL_PRIORITY_DATA, c_ctx(tfm)->tx_qidx);
 	chcr_send_wr(skb);
 	return isfull ? -EBUSY : -EINPROGRESS;
+error:
+	chcr_dec_wrcount(dev);
+	return err;
 }
 
 static int chcr_aes_decrypt(struct ablkcipher_request *req)
 {
 	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
 	struct uld_ctx *u_ctx = ULD_CTX(c_ctx(tfm));
+	struct chcr_dev *dev = c_ctx(tfm)->dev;
 	struct sk_buff *skb = NULL;
 	int err, isfull = 0;
 
+	err = chcr_inc_wrcount(dev);
+	if (err)
+		return -ENXIO;
+
 	if (unlikely(cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0],
 					    c_ctx(tfm)->tx_qidx))) {
 		isfull = 1;
@@ -1311,8 +1355,8 @@ static int chcr_aes_decrypt(struct ablkcipher_request *req)
 			return -ENOSPC;
 	}
 
-	 err = process_cipher(req, u_ctx->lldi.rxq_ids[c_ctx(tfm)->rx_qidx],
-			      &skb, CHCR_DECRYPT_OP);
+	err = process_cipher(req, u_ctx->lldi.rxq_ids[c_ctx(tfm)->rx_qidx],
+			     &skb, CHCR_DECRYPT_OP);
 	if (err || !skb)
 		return err;
 	skb->dev = u_ctx->lldi.ports[0];
@@ -1333,10 +1377,11 @@ static int chcr_device_init(struct chcr_context *ctx)
 	if (!ctx->dev) {
 		u_ctx = assign_chcr_device();
 		if (!u_ctx) {
+			err = -ENXIO;
 			pr_err("chcr device assignment fails\n");
 			goto out;
 		}
-		ctx->dev = u_ctx->dev;
+		ctx->dev = &u_ctx->dev;
 		adap = padap(ctx->dev);
 		ntxq = u_ctx->lldi.ntxq;
 		rxq_perchan = u_ctx->lldi.nrxq / u_ctx->lldi.nchan;
@@ -1344,7 +1389,6 @@ static int chcr_device_init(struct chcr_context *ctx)
 		spin_lock(&ctx->dev->lock_chcr_dev);
 		ctx->tx_chan_id = ctx->dev->tx_channel_id;
 		ctx->dev->tx_channel_id = !ctx->dev->tx_channel_id;
-		ctx->dev->rx_channel_id = 0;
 		spin_unlock(&ctx->dev->lock_chcr_dev);
 		rxq_idx = ctx->tx_chan_id * rxq_perchan;
 		rxq_idx += id % rxq_perchan;
@@ -1498,7 +1542,7 @@ static struct sk_buff *create_hash_wr(struct ahash_request *req,
 	chcr_req = __skb_put_zero(skb, transhdr_len);
 
 	chcr_req->sec_cpl.op_ivinsrtofst =
-		FILL_SEC_CPL_OP_IVINSR(h_ctx(tfm)->dev->rx_channel_id, 2, 0);
+		FILL_SEC_CPL_OP_IVINSR(h_ctx(tfm)->tx_chan_id, 2, 0);
 	chcr_req->sec_cpl.pldlen = htonl(param->bfr_len + param->sg_len);
 
 	chcr_req->sec_cpl.aadstart_cipherstop_hi =
@@ -1562,6 +1606,7 @@ static int chcr_ahash_update(struct ahash_request *req)
 	struct chcr_ahash_req_ctx *req_ctx = ahash_request_ctx(req);
 	struct crypto_ahash *rtfm = crypto_ahash_reqtfm(req);
 	struct uld_ctx *u_ctx = NULL;
+	struct chcr_dev *dev = h_ctx(rtfm)->dev;
 	struct sk_buff *skb;
 	u8 remainder = 0, bs;
 	unsigned int nbytes = req->nbytes;
@@ -1570,12 +1615,6 @@ static int chcr_ahash_update(struct ahash_request *req)
 
 	bs = crypto_tfm_alg_blocksize(crypto_ahash_tfm(rtfm));
 	u_ctx = ULD_CTX(h_ctx(rtfm));
-	if (unlikely(cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0],
-					    h_ctx(rtfm)->tx_qidx))) {
-		isfull = 1;
-		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG))
-			return -ENOSPC;
-	}
 
 	if (nbytes + req_ctx->reqlen >= bs) {
 		remainder = (nbytes + req_ctx->reqlen) % bs;
@@ -1586,10 +1625,27 @@ static int chcr_ahash_update(struct ahash_request *req)
 		req_ctx->reqlen += nbytes;
 		return 0;
 	}
+	error = chcr_inc_wrcount(dev);
+	if (error)
+		return -ENXIO;
+	/* Detach state for CHCR means lldi or padap is freed. Increasing
+	 * the inflight count for dev guarantees that lldi and padap are valid.
+	 */
+	if (unlikely(cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0],
+					    h_ctx(rtfm)->tx_qidx))) {
+		isfull = 1;
+		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) {
+			error = -ENOSPC;
+			goto err;
+		}
+	}
+
 	chcr_init_hctx_per_wr(req_ctx);
 	error = chcr_hash_dma_map(&u_ctx->lldi.pdev->dev, req);
-	if (error)
-		return -ENOMEM;
+	if (error) {
+		error = -ENOMEM;
+		goto err;
+	}
 	get_alg_config(&params.alg_prm, crypto_ahash_digestsize(rtfm));
 	params.kctx_len = roundup(params.alg_prm.result_size, 16);
 	params.sg_len = chcr_hash_ent_in_wr(req->src, !!req_ctx->reqlen,
@@ -1629,6 +1685,8 @@ static int chcr_ahash_update(struct ahash_request *req)
 	return isfull ? -EBUSY : -EINPROGRESS;
 unmap:
 	chcr_hash_dma_unmap(&u_ctx->lldi.pdev->dev, req);
+err:
+	chcr_dec_wrcount(dev);
 	return error;
 }
 
@@ -1646,10 +1704,16 @@ static int chcr_ahash_final(struct ahash_request *req)
 {
 	struct chcr_ahash_req_ctx *req_ctx = ahash_request_ctx(req);
 	struct crypto_ahash *rtfm = crypto_ahash_reqtfm(req);
+	struct chcr_dev *dev = h_ctx(rtfm)->dev;
 	struct hash_wr_param params;
 	struct sk_buff *skb;
 	struct uld_ctx *u_ctx = NULL;
 	u8 bs = crypto_tfm_alg_blocksize(crypto_ahash_tfm(rtfm));
+	int error = -EINVAL;
+
+	error = chcr_inc_wrcount(dev);
+	if (error)
+		return -ENXIO;
 
 	chcr_init_hctx_per_wr(req_ctx);
 	u_ctx = ULD_CTX(h_ctx(rtfm));
@@ -1686,19 +1750,25 @@ static int chcr_ahash_final(struct ahash_request *req)
 	}
 	params.hash_size = crypto_ahash_digestsize(rtfm);
 	skb = create_hash_wr(req, &params);
-	if (IS_ERR(skb))
-		return PTR_ERR(skb);
+	if (IS_ERR(skb)) {
+		error = PTR_ERR(skb);
+		goto err;
+	}
 	req_ctx->reqlen = 0;
 	skb->dev = u_ctx->lldi.ports[0];
 	set_wr_txq(skb, CPL_PRIORITY_DATA, h_ctx(rtfm)->tx_qidx);
 	chcr_send_wr(skb);
 	return -EINPROGRESS;
+err:
+	chcr_dec_wrcount(dev);
+	return error;
 }
 
 static int chcr_ahash_finup(struct ahash_request *req)
 {
 	struct chcr_ahash_req_ctx *req_ctx = ahash_request_ctx(req);
 	struct crypto_ahash *rtfm = crypto_ahash_reqtfm(req);
+	struct chcr_dev *dev = h_ctx(rtfm)->dev;
 	struct uld_ctx *u_ctx = NULL;
 	struct sk_buff *skb;
 	struct hash_wr_param params;
@@ -1707,17 +1777,24 @@ static int chcr_ahash_finup(struct ahash_request *req)
 
 	bs = crypto_tfm_alg_blocksize(crypto_ahash_tfm(rtfm));
 	u_ctx = ULD_CTX(h_ctx(rtfm));
+	error = chcr_inc_wrcount(dev);
+	if (error)
+		return -ENXIO;
 
 	if (unlikely(cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0],
 					    h_ctx(rtfm)->tx_qidx))) {
 		isfull = 1;
-		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG))
-			return -ENOSPC;
+		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) {
+			error = -ENOSPC;
+			goto err;
+		}
 	}
 	chcr_init_hctx_per_wr(req_ctx);
 	error = chcr_hash_dma_map(&u_ctx->lldi.pdev->dev, req);
-	if (error)
-		return -ENOMEM;
+	if (error) {
+		error = -ENOMEM;
+		goto err;
+	}
 
 	get_alg_config(&params.alg_prm, crypto_ahash_digestsize(rtfm));
 	params.kctx_len = roundup(params.alg_prm.result_size, 16);
@@ -1774,6 +1851,8 @@ static int chcr_ahash_finup(struct ahash_request *req)
 	return isfull ? -EBUSY : -EINPROGRESS;
 unmap:
 	chcr_hash_dma_unmap(&u_ctx->lldi.pdev->dev, req);
+err:
+	chcr_dec_wrcount(dev);
 	return error;
 }
 
@@ -1781,6 +1860,7 @@ static int chcr_ahash_digest(struct ahash_request *req)
 {
 	struct chcr_ahash_req_ctx *req_ctx = ahash_request_ctx(req);
 	struct crypto_ahash *rtfm = crypto_ahash_reqtfm(req);
+	struct chcr_dev *dev = h_ctx(rtfm)->dev;
 	struct uld_ctx *u_ctx = NULL;
 	struct sk_buff *skb;
 	struct hash_wr_param params;
@@ -1789,19 +1869,26 @@ static int chcr_ahash_digest(struct ahash_request *req)
 
 	rtfm->init(req);
 	bs = crypto_tfm_alg_blocksize(crypto_ahash_tfm(rtfm));
+	error = chcr_inc_wrcount(dev);
+	if (error)
+		return -ENXIO;
 
 	u_ctx = ULD_CTX(h_ctx(rtfm));
 	if (unlikely(cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0],
 					    h_ctx(rtfm)->tx_qidx))) {
 		isfull = 1;
-		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG))
-			return -ENOSPC;
+		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) {
+			error = -ENOSPC;
+			goto err;
+		}
 	}
 
 	chcr_init_hctx_per_wr(req_ctx);
 	error = chcr_hash_dma_map(&u_ctx->lldi.pdev->dev, req);
-	if (error)
-		return -ENOMEM;
+	if (error) {
+		error = -ENOMEM;
+		goto err;
+	}
 
 	get_alg_config(&params.alg_prm, crypto_ahash_digestsize(rtfm));
 	params.kctx_len = roundup(params.alg_prm.result_size, 16);
@@ -1854,6 +1941,8 @@ static int chcr_ahash_digest(struct ahash_request *req)
 	return isfull ? -EBUSY : -EINPROGRESS;
 unmap:
 	chcr_hash_dma_unmap(&u_ctx->lldi.pdev->dev, req);
+err:
+	chcr_dec_wrcount(dev);
 	return error;
 }
 
@@ -1925,6 +2014,7 @@ static inline void chcr_handle_ahash_resp(struct ahash_request *req,
 	int digestsize, updated_digestsize;
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
 	struct uld_ctx *u_ctx = ULD_CTX(h_ctx(tfm));
+	struct chcr_dev *dev = h_ctx(tfm)->dev;
 
 	if (input == NULL)
 		goto out;
@@ -1967,6 +2057,7 @@ static inline void chcr_handle_ahash_resp(struct ahash_request *req,
 
 
 out:
+	chcr_dec_wrcount(dev);
 	req->base.complete(&req->base, err);
 }
 
@@ -1983,14 +2074,13 @@ int chcr_handle_resp(struct crypto_async_request *req, unsigned char *input,
 
 	switch (tfm->__crt_alg->cra_flags & CRYPTO_ALG_TYPE_MASK) {
 	case CRYPTO_ALG_TYPE_AEAD:
-		chcr_handle_aead_resp(aead_request_cast(req), input, err);
+		err = chcr_handle_aead_resp(aead_request_cast(req), input, err);
 		break;
 
 	case CRYPTO_ALG_TYPE_ABLKCIPHER:
-		 err = chcr_handle_cipher_resp(ablkcipher_request_cast(req),
+		 chcr_handle_cipher_resp(ablkcipher_request_cast(req),
 					       input, err);
 		break;
-
 	case CRYPTO_ALG_TYPE_AHASH:
 		chcr_handle_ahash_resp(ahash_request_cast(req), input, err);
 		}
@@ -2008,7 +2098,7 @@ static int chcr_ahash_export(struct ahash_request *areq, void *out)
 	memcpy(state->partial_hash, req_ctx->partial_hash,
 	       CHCR_HASH_MAX_DIGEST_SIZE);
 	chcr_init_hctx_per_wr(state);
-		return 0;
+	return 0;
 }
 
 static int chcr_ahash_import(struct ahash_request *areq, const void *in)
@@ -2215,10 +2305,7 @@ static int chcr_aead_common_init(struct aead_request *req)
 		error = -ENOMEM;
 		goto err;
 	}
-	reqctx->aad_nents = sg_nents_xlen(req->src, req->assoclen,
-					  CHCR_SRC_SG_SIZE, 0);
-	reqctx->src_nents = sg_nents_xlen(req->src, req->cryptlen,
-					  CHCR_SRC_SG_SIZE, req->assoclen);
+
 	return 0;
 err:
 	return error;
@@ -2249,7 +2336,7 @@ static int chcr_aead_fallback(struct aead_request *req, unsigned short op_type)
 				  req->base.complete, req->base.data);
 	aead_request_set_crypt(subreq, req->src, req->dst, req->cryptlen,
 				 req->iv);
-	 aead_request_set_ad(subreq, req->assoclen);
+	aead_request_set_ad(subreq, req->assoclen);
 	return op_type ? crypto_aead_decrypt(subreq) :
 		crypto_aead_encrypt(subreq);
 }
@@ -2268,10 +2355,10 @@ static struct sk_buff *create_authenc_wr(struct aead_request *req,
 	struct ulptx_sgl *ulptx;
 	unsigned int transhdr_len;
 	unsigned int dst_size = 0, temp, subtype = get_aead_subtype(tfm);
-	unsigned int   kctx_len = 0, dnents;
-	unsigned int  assoclen = req->assoclen;
+	unsigned int   kctx_len = 0, dnents, snents;
 	unsigned int  authsize = crypto_aead_authsize(tfm);
 	int error = -EINVAL;
+	u8 *ivptr;
 	int null = 0;
 	gfp_t flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL :
 		GFP_ATOMIC;
@@ -2288,24 +2375,20 @@ static struct sk_buff *create_authenc_wr(struct aead_request *req,
 	if (subtype == CRYPTO_ALG_SUB_TYPE_CBC_NULL ||
 		subtype == CRYPTO_ALG_SUB_TYPE_CTR_NULL) {
 		null = 1;
-		assoclen = 0;
-		reqctx->aad_nents = 0;
 	}
-	dnents = sg_nents_xlen(req->dst, assoclen, CHCR_DST_SG_SIZE, 0);
-	dnents += sg_nents_xlen(req->dst, req->cryptlen +
-		(reqctx->op ? -authsize : authsize), CHCR_DST_SG_SIZE,
-		req->assoclen);
+	dnents = sg_nents_xlen(req->dst, req->assoclen + req->cryptlen +
+		(reqctx->op ? -authsize : authsize), CHCR_DST_SG_SIZE, 0);
 	dnents += MIN_AUTH_SG; // For IV
-
+	snents = sg_nents_xlen(req->src, req->assoclen + req->cryptlen,
+			       CHCR_SRC_SG_SIZE, 0);
 	dst_size = get_space_for_phys_dsgl(dnents);
 	kctx_len = (ntohl(KEY_CONTEXT_CTX_LEN_V(aeadctx->key_ctx_hdr)) << 4)
 		- sizeof(chcr_req->key_ctx);
 	transhdr_len = CIPHER_TRANSHDR_SIZE(kctx_len, dst_size);
-	reqctx->imm = (transhdr_len + assoclen + IV + req->cryptlen) <
+	reqctx->imm = (transhdr_len + req->assoclen + req->cryptlen) <
 			SGE_MAX_WR_LEN;
-	temp = reqctx->imm ? roundup(assoclen + IV + req->cryptlen, 16)
-			: (sgl_len(reqctx->src_nents + reqctx->aad_nents
-			+ MIN_GCM_SG) * 8);
+	temp = reqctx->imm ? roundup(req->assoclen + req->cryptlen, 16)
+			: (sgl_len(snents) * 8);
 	transhdr_len += temp;
 	transhdr_len = roundup(transhdr_len, 16);
 
@@ -2315,7 +2398,7 @@ static struct sk_buff *create_authenc_wr(struct aead_request *req,
 		chcr_aead_common_exit(req);
 		return ERR_PTR(chcr_aead_fallback(req, reqctx->op));
 	}
-	skb = alloc_skb(SGE_MAX_WR_LEN, flags);
+	skb = alloc_skb(transhdr_len, flags);
 	if (!skb) {
 		error = -ENOMEM;
 		goto err;
@@ -2331,16 +2414,16 @@ static struct sk_buff *create_authenc_wr(struct aead_request *req,
 	 * to the hardware spec
 	 */
 	chcr_req->sec_cpl.op_ivinsrtofst =
-		FILL_SEC_CPL_OP_IVINSR(a_ctx(tfm)->dev->rx_channel_id, 2,
-				       assoclen + 1);
-	chcr_req->sec_cpl.pldlen = htonl(assoclen + IV + req->cryptlen);
+		FILL_SEC_CPL_OP_IVINSR(a_ctx(tfm)->tx_chan_id, 2, 1);
+	chcr_req->sec_cpl.pldlen = htonl(req->assoclen + IV + req->cryptlen);
 	chcr_req->sec_cpl.aadstart_cipherstop_hi = FILL_SEC_CPL_CIPHERSTOP_HI(
-					assoclen ? 1 : 0, assoclen,
-					assoclen + IV + 1,
+					null ? 0 : 1 + IV,
+					null ? 0 : IV + req->assoclen,
+					req->assoclen + IV + 1,
 					(temp & 0x1F0) >> 4);
 	chcr_req->sec_cpl.cipherstop_lo_authinsert = FILL_SEC_CPL_AUTHINSERT(
 					temp & 0xF,
-					null ? 0 : assoclen + IV + 1,
+					null ? 0 : req->assoclen + IV + 1,
 					temp, temp);
 	if (subtype == CRYPTO_ALG_SUB_TYPE_CTR_NULL ||
 	    subtype == CRYPTO_ALG_SUB_TYPE_CTR_SHA)
@@ -2367,23 +2450,24 @@ static struct sk_buff *create_authenc_wr(struct aead_request *req,
 
 	memcpy(chcr_req->key_ctx.key + roundup(aeadctx->enckey_len, 16),
 	       actx->h_iopad, kctx_len - roundup(aeadctx->enckey_len, 16));
+	phys_cpl = (struct cpl_rx_phys_dsgl *)((u8 *)(chcr_req + 1) + kctx_len);
+	ivptr = (u8 *)(phys_cpl + 1) + dst_size;
+	ulptx = (struct ulptx_sgl *)(ivptr + IV);
 	if (subtype == CRYPTO_ALG_SUB_TYPE_CTR_SHA ||
 	    subtype == CRYPTO_ALG_SUB_TYPE_CTR_NULL) {
-		memcpy(reqctx->iv, aeadctx->nonce, CTR_RFC3686_NONCE_SIZE);
-		memcpy(reqctx->iv + CTR_RFC3686_NONCE_SIZE, req->iv,
+		memcpy(ivptr, aeadctx->nonce, CTR_RFC3686_NONCE_SIZE);
+		memcpy(ivptr + CTR_RFC3686_NONCE_SIZE, req->iv,
 				CTR_RFC3686_IV_SIZE);
-		*(__be32 *)(reqctx->iv + CTR_RFC3686_NONCE_SIZE +
+		*(__be32 *)(ivptr + CTR_RFC3686_NONCE_SIZE +
 			CTR_RFC3686_IV_SIZE) = cpu_to_be32(1);
 	} else {
-		memcpy(reqctx->iv, req->iv, IV);
+		memcpy(ivptr, req->iv, IV);
 	}
-	phys_cpl = (struct cpl_rx_phys_dsgl *)((u8 *)(chcr_req + 1) + kctx_len);
-	ulptx = (struct ulptx_sgl *)((u8 *)(phys_cpl + 1) + dst_size);
-	chcr_add_aead_dst_ent(req, phys_cpl, assoclen, qid);
-	chcr_add_aead_src_ent(req, ulptx, assoclen);
+	chcr_add_aead_dst_ent(req, phys_cpl, qid);
+	chcr_add_aead_src_ent(req, ulptx);
 	atomic_inc(&adap->chcr_stats.cipher_rqst);
-	temp = sizeof(struct cpl_rx_phys_dsgl) + dst_size +
-		kctx_len + (reqctx->imm ? (assoclen + IV + req->cryptlen) : 0);
+	temp = sizeof(struct cpl_rx_phys_dsgl) + dst_size + IV +
+		kctx_len + (reqctx->imm ? (req->assoclen + req->cryptlen) : 0);
 	create_wreq(a_ctx(tfm), chcr_req, &req->base, reqctx->imm, size,
 		   transhdr_len, temp, 0);
 	reqctx->skb = skb;
@@ -2470,8 +2554,7 @@ void chcr_aead_dma_unmap(struct device *dev,
 }
 
 void chcr_add_aead_src_ent(struct aead_request *req,
-			   struct ulptx_sgl *ulptx,
-			   unsigned int assoclen)
+			   struct ulptx_sgl *ulptx)
 {
 	struct ulptx_walk ulp_walk;
 	struct chcr_aead_reqctx  *reqctx = aead_request_ctx(req);
@@ -2484,28 +2567,20 @@ void chcr_add_aead_src_ent(struct aead_request *req,
 			buf += reqctx->b0_len;
 		}
 		sg_pcopy_to_buffer(req->src, sg_nents(req->src),
-				   buf, assoclen, 0);
-		buf += assoclen;
-		memcpy(buf, reqctx->iv, IV);
-		buf += IV;
-		sg_pcopy_to_buffer(req->src, sg_nents(req->src),
-				   buf, req->cryptlen, req->assoclen);
+				   buf, req->cryptlen + req->assoclen, 0);
 	} else {
 		ulptx_walk_init(&ulp_walk, ulptx);
 		if (reqctx->b0_len)
 			ulptx_walk_add_page(&ulp_walk, reqctx->b0_len,
-					    &reqctx->b0_dma);
-		ulptx_walk_add_sg(&ulp_walk, req->src, assoclen, 0);
-		ulptx_walk_add_page(&ulp_walk, IV, &reqctx->iv_dma);
-		ulptx_walk_add_sg(&ulp_walk, req->src, req->cryptlen,
-				  req->assoclen);
+					    reqctx->b0_dma);
+		ulptx_walk_add_sg(&ulp_walk, req->src, req->cryptlen +
+				  req->assoclen,  0);
 		ulptx_walk_end(&ulp_walk);
 	}
 }
 
 void chcr_add_aead_dst_ent(struct aead_request *req,
 			   struct cpl_rx_phys_dsgl *phys_cpl,
-			   unsigned int assoclen,
 			   unsigned short qid)
 {
 	struct chcr_aead_reqctx  *reqctx = aead_request_ctx(req);
@@ -2516,12 +2591,10 @@ void chcr_add_aead_dst_ent(struct aead_request *req,
 	u32 temp;
 
 	dsgl_walk_init(&dsgl_walk, phys_cpl);
-	if (reqctx->b0_len)
-		dsgl_walk_add_page(&dsgl_walk, reqctx->b0_len, &reqctx->b0_dma);
-	dsgl_walk_add_sg(&dsgl_walk, req->dst, assoclen, 0);
-	dsgl_walk_add_page(&dsgl_walk, IV, &reqctx->iv_dma);
-	temp = req->cryptlen + (reqctx->op ? -authsize : authsize);
-	dsgl_walk_add_sg(&dsgl_walk, req->dst, temp, req->assoclen);
+	dsgl_walk_add_page(&dsgl_walk, IV + reqctx->b0_len, reqctx->iv_dma);
+	temp = req->assoclen + req->cryptlen +
+		(reqctx->op ? -authsize : authsize);
+	dsgl_walk_add_sg(&dsgl_walk, req->dst, temp, 0);
 	dsgl_walk_end(&dsgl_walk, qid, ctx->pci_chan_id);
 }
 
@@ -2589,7 +2662,7 @@ void chcr_add_hash_src_ent(struct ahash_request *req,
 		ulptx_walk_init(&ulp_walk, ulptx);
 		if (param->bfr_len)
 			ulptx_walk_add_page(&ulp_walk, param->bfr_len,
-					    &reqctx->hctx_wr.dma_addr);
+					    reqctx->hctx_wr.dma_addr);
 		ulptx_walk_add_sg(&ulp_walk, reqctx->hctx_wr.srcsg,
 				  param->sg_len, reqctx->hctx_wr.src_ofst);
 		reqctx->hctx_wr.srcsg = ulp_walk.last_sg;
@@ -2689,8 +2762,7 @@ static int set_msg_len(u8 *block, unsigned int msglen, int csize)
 	return 0;
 }
 
-static void generate_b0(struct aead_request *req,
-			struct chcr_aead_ctx *aeadctx,
+static void generate_b0(struct aead_request *req, u8 *ivptr,
 			unsigned short op_type)
 {
 	unsigned int l, lp, m;
@@ -2701,7 +2773,7 @@ static void generate_b0(struct aead_request *req,
 
 	m = crypto_aead_authsize(aead);
 
-	memcpy(b0, reqctx->iv, 16);
+	memcpy(b0, ivptr, 16);
 
 	lp = b0[0];
 	l = lp + 1;
@@ -2727,29 +2799,31 @@ static inline int crypto_ccm_check_iv(const u8 *iv)
 }
 
 static int ccm_format_packet(struct aead_request *req,
-			     struct chcr_aead_ctx *aeadctx,
+			     u8 *ivptr,
 			     unsigned int sub_type,
 			     unsigned short op_type,
 			     unsigned int assoclen)
 {
 	struct chcr_aead_reqctx *reqctx = aead_request_ctx(req);
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct chcr_aead_ctx *aeadctx = AEAD_CTX(a_ctx(tfm));
 	int rc = 0;
 
 	if (sub_type == CRYPTO_ALG_SUB_TYPE_AEAD_RFC4309) {
-		reqctx->iv[0] = 3;
-		memcpy(reqctx->iv + 1, &aeadctx->salt[0], 3);
-		memcpy(reqctx->iv + 4, req->iv, 8);
-		memset(reqctx->iv + 12, 0, 4);
+		ivptr[0] = 3;
+		memcpy(ivptr + 1, &aeadctx->salt[0], 3);
+		memcpy(ivptr + 4, req->iv, 8);
+		memset(ivptr + 12, 0, 4);
 	} else {
-		memcpy(reqctx->iv, req->iv, 16);
+		memcpy(ivptr, req->iv, 16);
 	}
 	if (assoclen)
 		*((unsigned short *)(reqctx->scratch_pad + 16)) =
 				htons(assoclen);
 
-	generate_b0(req, aeadctx, op_type);
+	generate_b0(req, ivptr, op_type);
 	/* zero the ctr value */
-	memset(reqctx->iv + 15 - reqctx->iv[0], 0, reqctx->iv[0] + 1);
+	memset(ivptr + 15 - ivptr[0], 0, ivptr[0] + 1);
 	return rc;
 }
 
@@ -2762,7 +2836,7 @@ static void fill_sec_cpl_for_aead(struct cpl_tx_sec_pdu *sec_cpl,
 	struct chcr_aead_ctx *aeadctx = AEAD_CTX(a_ctx(tfm));
 	unsigned int cipher_mode = CHCR_SCMD_CIPHER_MODE_AES_CCM;
 	unsigned int mac_mode = CHCR_SCMD_AUTH_MODE_CBCMAC;
-	unsigned int c_id = a_ctx(tfm)->dev->rx_channel_id;
+	unsigned int c_id = a_ctx(tfm)->tx_chan_id;
 	unsigned int ccm_xtra;
 	unsigned char tag_offset = 0, auth_offset = 0;
 	unsigned int assoclen;
@@ -2775,7 +2849,7 @@ static void fill_sec_cpl_for_aead(struct cpl_tx_sec_pdu *sec_cpl,
 		((assoclen) ? CCM_AAD_FIELD_SIZE : 0);
 
 	auth_offset = req->cryptlen ?
-		(assoclen + IV + 1 + ccm_xtra) : 0;
+		(req->assoclen + IV + 1 + ccm_xtra) : 0;
 	if (op_type == CHCR_DECRYPT_OP) {
 		if (crypto_aead_authsize(tfm) != req->cryptlen)
 			tag_offset = crypto_aead_authsize(tfm);
@@ -2785,13 +2859,13 @@ static void fill_sec_cpl_for_aead(struct cpl_tx_sec_pdu *sec_cpl,
 
 
 	sec_cpl->op_ivinsrtofst = FILL_SEC_CPL_OP_IVINSR(c_id,
-					 2, assoclen + 1 + ccm_xtra);
+					 2, 1);
 	sec_cpl->pldlen =
-		htonl(assoclen + IV + req->cryptlen + ccm_xtra);
+		htonl(req->assoclen + IV + req->cryptlen + ccm_xtra);
 	/* For CCM there wil be b0 always. So AAD start will be 1 always */
 	sec_cpl->aadstart_cipherstop_hi = FILL_SEC_CPL_CIPHERSTOP_HI(
-					1, assoclen + ccm_xtra, assoclen
-					+ IV + 1 + ccm_xtra, 0);
+				1 + IV,	IV + assoclen + ccm_xtra,
+				req->assoclen + IV + 1 + ccm_xtra, 0);
 
 	sec_cpl->cipherstop_lo_authinsert = FILL_SEC_CPL_AUTHINSERT(0,
 					auth_offset, tag_offset,
@@ -2838,10 +2912,11 @@ static struct sk_buff *create_aead_ccm_wr(struct aead_request *req,
 	struct cpl_rx_phys_dsgl *phys_cpl;
 	struct ulptx_sgl *ulptx;
 	unsigned int transhdr_len;
-	unsigned int dst_size = 0, kctx_len, dnents, temp;
+	unsigned int dst_size = 0, kctx_len, dnents, temp, snents;
 	unsigned int sub_type, assoclen = req->assoclen;
 	unsigned int authsize = crypto_aead_authsize(tfm);
 	int error = -EINVAL;
+	u8 *ivptr;
 	gfp_t flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL :
 		GFP_ATOMIC;
 	struct adapter *adap = padap(a_ctx(tfm)->dev);
@@ -2857,37 +2932,38 @@ static struct sk_buff *create_aead_ccm_wr(struct aead_request *req,
 	error = aead_ccm_validate_input(reqctx->op, req, aeadctx, sub_type);
 	if (error)
 		goto err;
-	dnents = sg_nents_xlen(req->dst, assoclen, CHCR_DST_SG_SIZE, 0);
-	dnents += sg_nents_xlen(req->dst, req->cryptlen
+	dnents = sg_nents_xlen(req->dst, req->assoclen + req->cryptlen
 			+ (reqctx->op ? -authsize : authsize),
-			CHCR_DST_SG_SIZE, req->assoclen);
+			CHCR_DST_SG_SIZE, 0);
 	dnents += MIN_CCM_SG; // For IV and B0
 	dst_size = get_space_for_phys_dsgl(dnents);
+	snents = sg_nents_xlen(req->src, req->assoclen + req->cryptlen,
+			       CHCR_SRC_SG_SIZE, 0);
+	snents += MIN_CCM_SG; //For B0
 	kctx_len = roundup(aeadctx->enckey_len, 16) * 2;
 	transhdr_len = CIPHER_TRANSHDR_SIZE(kctx_len, dst_size);
-	reqctx->imm = (transhdr_len + assoclen + IV + req->cryptlen +
+	reqctx->imm = (transhdr_len + req->assoclen + req->cryptlen +
 		       reqctx->b0_len) <= SGE_MAX_WR_LEN;
-	temp = reqctx->imm ? roundup(assoclen + IV + req->cryptlen +
+	temp = reqctx->imm ? roundup(req->assoclen + req->cryptlen +
 				     reqctx->b0_len, 16) :
-		(sgl_len(reqctx->src_nents + reqctx->aad_nents +
-				    MIN_CCM_SG) *  8);
+		(sgl_len(snents) *  8);
 	transhdr_len += temp;
 	transhdr_len = roundup(transhdr_len, 16);
 
 	if (chcr_aead_need_fallback(req, dnents, T6_MAX_AAD_SIZE -
-				    reqctx->b0_len, transhdr_len, reqctx->op)) {
+				reqctx->b0_len, transhdr_len, reqctx->op)) {
 		atomic_inc(&adap->chcr_stats.fallback);
 		chcr_aead_common_exit(req);
 		return ERR_PTR(chcr_aead_fallback(req, reqctx->op));
 	}
-	skb = alloc_skb(SGE_MAX_WR_LEN,  flags);
+	skb = alloc_skb(transhdr_len,  flags);
 
 	if (!skb) {
 		error = -ENOMEM;
 		goto err;
 	}
 
-	chcr_req = (struct chcr_wr *) __skb_put_zero(skb, transhdr_len);
+	chcr_req = __skb_put_zero(skb, transhdr_len);
 
 	fill_sec_cpl_for_aead(&chcr_req->sec_cpl, dst_size, req, reqctx->op);
 
@@ -2897,16 +2973,17 @@ static struct sk_buff *create_aead_ccm_wr(struct aead_request *req,
 			aeadctx->key, aeadctx->enckey_len);
 
 	phys_cpl = (struct cpl_rx_phys_dsgl *)((u8 *)(chcr_req + 1) + kctx_len);
-	ulptx = (struct ulptx_sgl *)((u8 *)(phys_cpl + 1) + dst_size);
-	error = ccm_format_packet(req, aeadctx, sub_type, reqctx->op, assoclen);
+	ivptr = (u8 *)(phys_cpl + 1) + dst_size;
+	ulptx = (struct ulptx_sgl *)(ivptr + IV);
+	error = ccm_format_packet(req, ivptr, sub_type, reqctx->op, assoclen);
 	if (error)
 		goto dstmap_fail;
-	chcr_add_aead_dst_ent(req, phys_cpl, assoclen, qid);
-	chcr_add_aead_src_ent(req, ulptx, assoclen);
+	chcr_add_aead_dst_ent(req, phys_cpl, qid);
+	chcr_add_aead_src_ent(req, ulptx);
 
 	atomic_inc(&adap->chcr_stats.aead_rqst);
-	temp = sizeof(struct cpl_rx_phys_dsgl) + dst_size +
-		kctx_len + (reqctx->imm ? (assoclen + IV + req->cryptlen +
+	temp = sizeof(struct cpl_rx_phys_dsgl) + dst_size + IV +
+		kctx_len + (reqctx->imm ? (req->assoclen + req->cryptlen +
 		reqctx->b0_len) : 0);
 	create_wreq(a_ctx(tfm), chcr_req, &req->base, reqctx->imm, 0,
 		    transhdr_len, temp, 0);
@@ -2931,10 +3008,11 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req,
 	struct chcr_wr *chcr_req;
 	struct cpl_rx_phys_dsgl *phys_cpl;
 	struct ulptx_sgl *ulptx;
-	unsigned int transhdr_len, dnents = 0;
+	unsigned int transhdr_len, dnents = 0, snents;
 	unsigned int dst_size = 0, temp = 0, kctx_len, assoclen = req->assoclen;
 	unsigned int authsize = crypto_aead_authsize(tfm);
 	int error = -EINVAL;
+	u8 *ivptr;
 	gfp_t flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL :
 		GFP_ATOMIC;
 	struct adapter *adap = padap(a_ctx(tfm)->dev);
@@ -2946,19 +3024,19 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req,
 	error = chcr_aead_common_init(req);
 	if (error)
 		return ERR_PTR(error);
-	dnents = sg_nents_xlen(req->dst, assoclen, CHCR_DST_SG_SIZE, 0);
-	dnents += sg_nents_xlen(req->dst, req->cryptlen +
+	dnents = sg_nents_xlen(req->dst, req->assoclen + req->cryptlen +
 				(reqctx->op ? -authsize : authsize),
-				CHCR_DST_SG_SIZE, req->assoclen);
+				CHCR_DST_SG_SIZE, 0);
+	snents = sg_nents_xlen(req->src, req->assoclen + req->cryptlen,
+			       CHCR_SRC_SG_SIZE, 0);
 	dnents += MIN_GCM_SG; // For IV
 	dst_size = get_space_for_phys_dsgl(dnents);
 	kctx_len = roundup(aeadctx->enckey_len, 16) + AEAD_H_SIZE;
 	transhdr_len = CIPHER_TRANSHDR_SIZE(kctx_len, dst_size);
-	reqctx->imm = (transhdr_len + assoclen + IV + req->cryptlen) <=
+	reqctx->imm = (transhdr_len + req->assoclen + req->cryptlen) <=
 			SGE_MAX_WR_LEN;
-	temp = reqctx->imm ? roundup(assoclen + IV + req->cryptlen, 16) :
-		(sgl_len(reqctx->src_nents +
-		reqctx->aad_nents + MIN_GCM_SG) * 8);
+	temp = reqctx->imm ? roundup(req->assoclen + req->cryptlen, 16) :
+		(sgl_len(snents) * 8);
 	transhdr_len += temp;
 	transhdr_len = roundup(transhdr_len, 16);
 	if (chcr_aead_need_fallback(req, dnents, T6_MAX_AAD_SIZE,
@@ -2968,7 +3046,7 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req,
 		chcr_aead_common_exit(req);
 		return ERR_PTR(chcr_aead_fallback(req, reqctx->op));
 	}
-	skb = alloc_skb(SGE_MAX_WR_LEN, flags);
+	skb = alloc_skb(transhdr_len, flags);
 	if (!skb) {
 		error = -ENOMEM;
 		goto err;
@@ -2979,15 +3057,15 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req,
 	//Offset of tag from end
 	temp = (reqctx->op == CHCR_ENCRYPT_OP) ? 0 : authsize;
 	chcr_req->sec_cpl.op_ivinsrtofst = FILL_SEC_CPL_OP_IVINSR(
-					a_ctx(tfm)->dev->rx_channel_id, 2,
-					(assoclen + 1));
+					a_ctx(tfm)->tx_chan_id, 2, 1);
 	chcr_req->sec_cpl.pldlen =
-		htonl(assoclen + IV + req->cryptlen);
+		htonl(req->assoclen + IV + req->cryptlen);
 	chcr_req->sec_cpl.aadstart_cipherstop_hi = FILL_SEC_CPL_CIPHERSTOP_HI(
-					assoclen ? 1 : 0, assoclen,
-					assoclen + IV + 1, 0);
+					assoclen ? 1 + IV : 0,
+					assoclen ? IV + assoclen : 0,
+					req->assoclen + IV + 1, 0);
 	chcr_req->sec_cpl.cipherstop_lo_authinsert =
-			FILL_SEC_CPL_AUTHINSERT(0, assoclen + IV + 1,
+			FILL_SEC_CPL_AUTHINSERT(0, req->assoclen + IV + 1,
 						temp, temp);
 	chcr_req->sec_cpl.seqno_numivs =
 			FILL_SEC_CPL_SCMD0_SEQNO(reqctx->op, (reqctx->op ==
@@ -3002,25 +3080,26 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req,
 	memcpy(chcr_req->key_ctx.key + roundup(aeadctx->enckey_len, 16),
 	       GCM_CTX(aeadctx)->ghash_h, AEAD_H_SIZE);
 
+	phys_cpl = (struct cpl_rx_phys_dsgl *)((u8 *)(chcr_req + 1) + kctx_len);
+	ivptr = (u8 *)(phys_cpl + 1) + dst_size;
 	/* prepare a 16 byte iv */
 	/* S   A   L  T |  IV | 0x00000001 */
 	if (get_aead_subtype(tfm) ==
 	    CRYPTO_ALG_SUB_TYPE_AEAD_RFC4106) {
-		memcpy(reqctx->iv, aeadctx->salt, 4);
-		memcpy(reqctx->iv + 4, req->iv, GCM_RFC4106_IV_SIZE);
+		memcpy(ivptr, aeadctx->salt, 4);
+		memcpy(ivptr + 4, req->iv, GCM_RFC4106_IV_SIZE);
 	} else {
-		memcpy(reqctx->iv, req->iv, GCM_AES_IV_SIZE);
+		memcpy(ivptr, req->iv, GCM_AES_IV_SIZE);
 	}
-	*((unsigned int *)(reqctx->iv + 12)) = htonl(0x01);
+	*((unsigned int *)(ivptr + 12)) = htonl(0x01);
 
-	phys_cpl = (struct cpl_rx_phys_dsgl *)((u8 *)(chcr_req + 1) + kctx_len);
-	ulptx = (struct ulptx_sgl *)((u8 *)(phys_cpl + 1) + dst_size);
+	ulptx = (struct ulptx_sgl *)(ivptr + 16);
 
-	chcr_add_aead_dst_ent(req, phys_cpl, assoclen, qid);
-	chcr_add_aead_src_ent(req, ulptx, assoclen);
+	chcr_add_aead_dst_ent(req, phys_cpl, qid);
+	chcr_add_aead_src_ent(req, ulptx);
 	atomic_inc(&adap->chcr_stats.aead_rqst);
-	temp = sizeof(struct cpl_rx_phys_dsgl) + dst_size +
-		kctx_len + (reqctx->imm ? (assoclen + IV + req->cryptlen) : 0);
+	temp = sizeof(struct cpl_rx_phys_dsgl) + dst_size + IV +
+		kctx_len + (reqctx->imm ? (req->assoclen + req->cryptlen) : 0);
 	create_wreq(a_ctx(tfm), chcr_req, &req->base, reqctx->imm, size,
 		    transhdr_len, temp, reqctx->verify);
 	reqctx->skb = skb;
@@ -3118,12 +3197,12 @@ static int chcr_gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
 		aeadctx->mayverify = VERIFY_HW;
 		break;
 	case ICV_12:
-		 aeadctx->hmac_ctrl = CHCR_SCMD_HMAC_CTRL_IPSEC_96BIT;
-		 aeadctx->mayverify = VERIFY_HW;
+		aeadctx->hmac_ctrl = CHCR_SCMD_HMAC_CTRL_IPSEC_96BIT;
+		aeadctx->mayverify = VERIFY_HW;
 		break;
 	case ICV_14:
-		 aeadctx->hmac_ctrl = CHCR_SCMD_HMAC_CTRL_PL3;
-		 aeadctx->mayverify = VERIFY_HW;
+		aeadctx->hmac_ctrl = CHCR_SCMD_HMAC_CTRL_PL3;
+		aeadctx->mayverify = VERIFY_HW;
 		break;
 	case ICV_16:
 		aeadctx->hmac_ctrl = CHCR_SCMD_HMAC_CTRL_NO_TRUNC;
@@ -3565,27 +3644,42 @@ static int chcr_aead_op(struct aead_request *req,
 			create_wr_t create_wr_fn)
 {
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct chcr_aead_reqctx  *reqctx = aead_request_ctx(req);
 	struct uld_ctx *u_ctx;
 	struct sk_buff *skb;
 	int isfull = 0;
+	struct chcr_dev *cdev;
 
-	if (!a_ctx(tfm)->dev) {
+	cdev = a_ctx(tfm)->dev;
+	if (!cdev) {
 		pr_err("chcr : %s : No crypto device.\n", __func__);
 		return -ENXIO;
 	}
+
+	if (chcr_inc_wrcount(cdev)) {
+		/* Detach state for CHCR means lldi or padap is freed.
+		 * We cannot increment fallback here.
+		 */
+		return chcr_aead_fallback(req, reqctx->op);
+	}
+
 	u_ctx = ULD_CTX(a_ctx(tfm));
 	if (cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0],
 				   a_ctx(tfm)->tx_qidx)) {
 		isfull = 1;
-		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG))
+		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) {
+			chcr_dec_wrcount(cdev);
 			return -ENOSPC;
+		}
 	}
 
 	/* Form a WR from req */
 	skb = create_wr_fn(req, u_ctx->lldi.rxq_ids[a_ctx(tfm)->rx_qidx], size);
 
-	if (IS_ERR(skb) || !skb)
+	if (IS_ERR(skb) || !skb) {
+		chcr_dec_wrcount(cdev);
 		return PTR_ERR(skb);
+	}
 
 	skb->dev = u_ctx->lldi.ports[0];
 	set_wr_txq(skb, CPL_PRIORITY_DATA, a_ctx(tfm)->tx_qidx);
@@ -3722,7 +3816,6 @@ static struct chcr_alg_template driver_algs[] = {
 				.setkey		= chcr_aes_rfc3686_setkey,
 				.encrypt	= chcr_aes_encrypt,
 				.decrypt	= chcr_aes_decrypt,
-				.geniv          = "seqiv",
 			}
 		}
 	},
@@ -4178,7 +4271,6 @@ static struct chcr_alg_template driver_algs[] = {
 			.setauthsize = chcr_authenc_null_setauthsize,
 		}
 	},
-
 };
 
 /*
diff --git a/drivers/crypto/chelsio/chcr_algo.h b/drivers/crypto/chelsio/chcr_algo.h
index 1871500309e21d20d410ea338b6e524e4666f75b..ee20dd899e831883e6f38521050dc625b1ed793e 100644
--- a/drivers/crypto/chelsio/chcr_algo.h
+++ b/drivers/crypto/chelsio/chcr_algo.h
@@ -262,7 +262,7 @@
 #define MIN_AUTH_SG			1 /* IV */
 #define MIN_GCM_SG			1 /* IV */
 #define MIN_DIGEST_SG			1 /*Partial Buffer*/
-#define MIN_CCM_SG			2 /*IV+B0*/
+#define MIN_CCM_SG			1 /*IV+B0*/
 #define CIP_SPACE_LEFT(len) \
 	((SGE_MAX_WR_LEN - CIP_WR_MIN_LEN - (len)))
 #define HASH_SPACE_LEFT(len) \
diff --git a/drivers/crypto/chelsio/chcr_core.c b/drivers/crypto/chelsio/chcr_core.c
index 2c472e3c6aebf8ac13e0e62ec5c8fc1be05d48a2..239b933d6df6507d2e405ae8a370468c7dfb3036 100644
--- a/drivers/crypto/chelsio/chcr_core.c
+++ b/drivers/crypto/chelsio/chcr_core.c
@@ -26,10 +26,7 @@
 #include "chcr_core.h"
 #include "cxgb4_uld.h"
 
-static LIST_HEAD(uld_ctx_list);
-static DEFINE_MUTEX(dev_mutex);
-static atomic_t dev_count;
-static struct uld_ctx *ctx_rr;
+static struct chcr_driver_data drv_data;
 
 typedef int (*chcr_handler_func)(struct chcr_dev *dev, unsigned char *input);
 static int cpl_fw6_pld_handler(struct chcr_dev *dev, unsigned char *input);
@@ -53,6 +50,29 @@ static struct cxgb4_uld_info chcr_uld_info = {
 #endif /* CONFIG_CHELSIO_IPSEC_INLINE */
 };
 
+static void detach_work_fn(struct work_struct *work)
+{
+	struct chcr_dev *dev;
+
+	dev = container_of(work, struct chcr_dev, detach_work.work);
+
+	if (atomic_read(&dev->inflight)) {
+		dev->wqretry--;
+		if (dev->wqretry) {
+			pr_debug("Request Inflight Count %d\n",
+				atomic_read(&dev->inflight));
+
+			schedule_delayed_work(&dev->detach_work, WQ_DETACH_TM);
+		} else {
+			WARN(1, "CHCR: %d requests still pending\n",
+				atomic_read(&dev->inflight));
+			complete(&dev->detach_comp);
+		}
+	} else {
+		complete(&dev->detach_comp);
+	}
+}
+
 struct uld_ctx *assign_chcr_device(void)
 {
 	struct uld_ctx *u_ctx = NULL;
@@ -63,56 +83,74 @@ struct uld_ctx *assign_chcr_device(void)
 	 * Although One session must use the same device to
 	 * maintain request-response ordering.
 	 */
-	mutex_lock(&dev_mutex);
-	if (!list_empty(&uld_ctx_list)) {
-		u_ctx = ctx_rr;
-		if (list_is_last(&ctx_rr->entry, &uld_ctx_list))
-			ctx_rr = list_first_entry(&uld_ctx_list,
-						  struct uld_ctx,
-						  entry);
+	mutex_lock(&drv_data.drv_mutex);
+	if (!list_empty(&drv_data.act_dev)) {
+		u_ctx = drv_data.last_dev;
+		if (list_is_last(&drv_data.last_dev->entry, &drv_data.act_dev))
+			drv_data.last_dev = list_first_entry(&drv_data.act_dev,
+						  struct uld_ctx, entry);
 		else
-			ctx_rr = list_next_entry(ctx_rr, entry);
+			drv_data.last_dev =
+				list_next_entry(drv_data.last_dev, entry);
 	}
-	mutex_unlock(&dev_mutex);
+	mutex_unlock(&drv_data.drv_mutex);
 	return u_ctx;
 }
 
-static int chcr_dev_add(struct uld_ctx *u_ctx)
+static void chcr_dev_add(struct uld_ctx *u_ctx)
 {
 	struct chcr_dev *dev;
 
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-	if (!dev)
-		return -ENXIO;
+	dev = &u_ctx->dev;
+	dev->state = CHCR_ATTACH;
+	atomic_set(&dev->inflight, 0);
+	mutex_lock(&drv_data.drv_mutex);
+	list_move(&u_ctx->entry, &drv_data.act_dev);
+	if (!drv_data.last_dev)
+		drv_data.last_dev = u_ctx;
+	mutex_unlock(&drv_data.drv_mutex);
+}
+
+static void chcr_dev_init(struct uld_ctx *u_ctx)
+{
+	struct chcr_dev *dev;
 
+	dev = &u_ctx->dev;
 	spin_lock_init(&dev->lock_chcr_dev);
-	u_ctx->dev = dev;
-	dev->u_ctx = u_ctx;
-	atomic_inc(&dev_count);
-	mutex_lock(&dev_mutex);
-	list_add_tail(&u_ctx->entry, &uld_ctx_list);
-	if (!ctx_rr)
-		ctx_rr = u_ctx;
-	mutex_unlock(&dev_mutex);
-	return 0;
+	INIT_DELAYED_WORK(&dev->detach_work, detach_work_fn);
+	init_completion(&dev->detach_comp);
+	dev->state = CHCR_INIT;
+	dev->wqretry = WQ_RETRY;
+	atomic_inc(&drv_data.dev_count);
+	atomic_set(&dev->inflight, 0);
+	mutex_lock(&drv_data.drv_mutex);
+	list_add_tail(&u_ctx->entry, &drv_data.inact_dev);
+	if (!drv_data.last_dev)
+		drv_data.last_dev = u_ctx;
+	mutex_unlock(&drv_data.drv_mutex);
 }
 
-static int chcr_dev_remove(struct uld_ctx *u_ctx)
+static int chcr_dev_move(struct uld_ctx *u_ctx)
 {
-	if (ctx_rr == u_ctx) {
-		if (list_is_last(&ctx_rr->entry, &uld_ctx_list))
-			ctx_rr = list_first_entry(&uld_ctx_list,
-						  struct uld_ctx,
-						  entry);
+	struct adapter *adap;
+
+	mutex_lock(&drv_data.drv_mutex);
+	if (drv_data.last_dev == u_ctx) {
+		if (list_is_last(&drv_data.last_dev->entry, &drv_data.act_dev))
+			drv_data.last_dev = list_first_entry(&drv_data.act_dev,
+						  struct uld_ctx, entry);
 		else
-			ctx_rr = list_next_entry(ctx_rr, entry);
+			drv_data.last_dev =
+				list_next_entry(drv_data.last_dev, entry);
 	}
-	list_del(&u_ctx->entry);
-	if (list_empty(&uld_ctx_list))
-		ctx_rr = NULL;
-	kfree(u_ctx->dev);
-	u_ctx->dev = NULL;
-	atomic_dec(&dev_count);
+	list_move(&u_ctx->entry, &drv_data.inact_dev);
+	if (list_empty(&drv_data.act_dev))
+		drv_data.last_dev = NULL;
+	adap = padap(&u_ctx->dev);
+	memset(&adap->chcr_stats, 0, sizeof(adap->chcr_stats));
+	atomic_dec(&drv_data.dev_count);
+	mutex_unlock(&drv_data.drv_mutex);
+
 	return 0;
 }
 
@@ -131,12 +169,8 @@ static int cpl_fw6_pld_handler(struct chcr_dev *dev,
 
 	ack_err_status =
 		ntohl(*(__be32 *)((unsigned char *)&fw6_pld->data[0] + 4));
-	if (ack_err_status) {
-		if (CHK_MAC_ERR_BIT(ack_err_status) ||
-		    CHK_PAD_ERR_BIT(ack_err_status))
-			error_status = -EBADMSG;
-		atomic_inc(&adap->chcr_stats.error);
-	}
+	if (CHK_MAC_ERR_BIT(ack_err_status) || CHK_PAD_ERR_BIT(ack_err_status))
+		error_status = -EBADMSG;
 	/* call completion callback with failure status */
 	if (req) {
 		error_status = chcr_handle_resp(req, input, error_status);
@@ -144,6 +178,9 @@ static int cpl_fw6_pld_handler(struct chcr_dev *dev,
 		pr_err("Incorrect request address from the firmware\n");
 		return -EFAULT;
 	}
+	if (error_status)
+		atomic_inc(&adap->chcr_stats.error);
+
 	return 0;
 }
 
@@ -167,6 +204,7 @@ static void *chcr_uld_add(const struct cxgb4_lld_info *lld)
 		goto out;
 	}
 	u_ctx->lldi = *lld;
+	chcr_dev_init(u_ctx);
 #ifdef CONFIG_CHELSIO_IPSEC_INLINE
 	if (lld->crypto & ULP_CRYPTO_IPSEC_INLINE)
 		chcr_add_xfrmops(lld);
@@ -179,7 +217,7 @@ int chcr_uld_rx_handler(void *handle, const __be64 *rsp,
 			const struct pkt_gl *pgl)
 {
 	struct uld_ctx *u_ctx = (struct uld_ctx *)handle;
-	struct chcr_dev *dev = u_ctx->dev;
+	struct chcr_dev *dev = &u_ctx->dev;
 	const struct cpl_fw6_pld *rpl = (struct cpl_fw6_pld *)rsp;
 
 	if (rpl->opcode != CPL_FW6_PLD) {
@@ -201,6 +239,28 @@ int chcr_uld_tx_handler(struct sk_buff *skb, struct net_device *dev)
 }
 #endif /* CONFIG_CHELSIO_IPSEC_INLINE */
 
+static void chcr_detach_device(struct uld_ctx *u_ctx)
+{
+	struct chcr_dev *dev = &u_ctx->dev;
+
+	spin_lock_bh(&dev->lock_chcr_dev);
+	if (dev->state == CHCR_DETACH) {
+		spin_unlock_bh(&dev->lock_chcr_dev);
+		pr_debug("Detach event received for an already detached device\n");
+		return;
+	}
+	dev->state = CHCR_DETACH;
+	spin_unlock_bh(&dev->lock_chcr_dev);
+
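+	/*
+	 * Wait for outstanding requests to drain (detach_work_fn signals
+	 * detach_comp) before moving the device to the inactive list.
+	 */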
+	if (atomic_read(&dev->inflight) != 0) {
+		schedule_delayed_work(&dev->detach_work, WQ_DETACH_TM);
+		wait_for_completion(&dev->detach_comp);
+	}
+
+	/* Move u_ctx to the inactive device list */
+	chcr_dev_move(u_ctx);
+}
+
 static int chcr_uld_state_change(void *handle, enum cxgb4_state state)
 {
 	struct uld_ctx *u_ctx = handle;
@@ -208,23 +268,16 @@ static int chcr_uld_state_change(void *handle, enum cxgb4_state state)
 
 	switch (state) {
 	case CXGB4_STATE_UP:
-		if (!u_ctx->dev) {
-			ret = chcr_dev_add(u_ctx);
-			if (ret != 0)
-				return ret;
+		if (u_ctx->dev.state != CHCR_INIT) {
+			/* Already initialised */
+			return 0;
 		}
-		if (atomic_read(&dev_count) == 1)
-			ret = start_crypto();
+		chcr_dev_add(u_ctx);
+		ret = start_crypto();
 		break;
 
 	case CXGB4_STATE_DETACH:
-		if (u_ctx->dev) {
-			mutex_lock(&dev_mutex);
-			chcr_dev_remove(u_ctx);
-			mutex_unlock(&dev_mutex);
-		}
-		if (!atomic_read(&dev_count))
-			stop_crypto();
+		chcr_detach_device(u_ctx);
 		break;
 
 	case CXGB4_STATE_START_RECOVERY:
@@ -237,7 +290,13 @@ static int chcr_uld_state_change(void *handle, enum cxgb4_state state)
 
 static int __init chcr_crypto_init(void)
 {
+	INIT_LIST_HEAD(&drv_data.act_dev);
+	INIT_LIST_HEAD(&drv_data.inact_dev);
+	atomic_set(&drv_data.dev_count, 0);
+	mutex_init(&drv_data.drv_mutex);
+	drv_data.last_dev = NULL;
 	cxgb4_register_uld(CXGB4_ULD_CRYPTO, &chcr_uld_info);
+
 	return 0;
 }
 
@@ -245,18 +304,20 @@ static void __exit chcr_crypto_exit(void)
 {
 	struct uld_ctx *u_ctx, *tmp;
 
-	if (atomic_read(&dev_count))
-		stop_crypto();
+	stop_crypto();
 
+	cxgb4_unregister_uld(CXGB4_ULD_CRYPTO);
 	/* Remove all devices from list */
-	mutex_lock(&dev_mutex);
-	list_for_each_entry_safe(u_ctx, tmp, &uld_ctx_list, entry) {
-		if (u_ctx->dev)
-			chcr_dev_remove(u_ctx);
+	mutex_lock(&drv_data.drv_mutex);
+	list_for_each_entry_safe(u_ctx, tmp, &drv_data.act_dev, entry) {
+		list_del(&u_ctx->entry);
 		kfree(u_ctx);
 	}
-	mutex_unlock(&dev_mutex);
-	cxgb4_unregister_uld(CXGB4_ULD_CRYPTO);
+	list_for_each_entry_safe(u_ctx, tmp, &drv_data.inact_dev, entry) {
+		list_del(&u_ctx->entry);
+		kfree(u_ctx);
+	}
+	mutex_unlock(&drv_data.drv_mutex);
 }
 
 module_init(chcr_crypto_init);
diff --git a/drivers/crypto/chelsio/chcr_core.h b/drivers/crypto/chelsio/chcr_core.h
index de3a9c085daf42e44225b58d36bb4cdec17b141e..1159dee964ed0d405251284a481aacfe57ea2675 100644
--- a/drivers/crypto/chelsio/chcr_core.h
+++ b/drivers/crypto/chelsio/chcr_core.h
@@ -47,7 +47,7 @@
 
 #define MAX_PENDING_REQ_TO_HW 20
 #define CHCR_TEST_RESPONSE_TIMEOUT 1000
-
+#define WQ_DETACH_TM	(msecs_to_jiffies(50))
 #define PAD_ERROR_BIT		1
 #define CHK_PAD_ERR_BIT(x)	(((x) >> PAD_ERROR_BIT) & 1)
 
@@ -61,9 +61,6 @@
 #define HASH_WR_MIN_LEN (sizeof(struct chcr_wr) + \
 			DUMMY_BYTES + \
 		    sizeof(struct ulptx_sgl))
-
-#define padap(dev) pci_get_drvdata(dev->u_ctx->lldi.pdev)
-
 struct uld_ctx;
 
 struct _key_ctx {
@@ -121,6 +118,20 @@ struct _key_ctx {
 #define KEYCTX_TX_WR_AUTHIN_G(x) \
 	(((x) >> KEYCTX_TX_WR_AUTHIN_S) & KEYCTX_TX_WR_AUTHIN_M)
 
+#define WQ_RETRY	5
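+
+/*
+ * Global driver state: devices move between the active (act_dev) and
+ * inactive (inact_dev) lists as adapters attach and detach; last_dev is
+ * the round-robin cursor used by assign_chcr_device().
+ */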
+struct chcr_driver_data {
+	struct list_head act_dev;
+	struct list_head inact_dev;
+	atomic_t dev_count;
+	struct mutex drv_mutex;
+	struct uld_ctx *last_dev;
+};
+
+enum chcr_state {
+	CHCR_INIT = 0,
+	CHCR_ATTACH,
+	CHCR_DETACH,
+};
 struct chcr_wr {
 	struct fw_crypto_lookaside_wr wreq;
 	struct ulp_txpkt ulptx;
@@ -131,15 +142,18 @@ struct chcr_wr {
 
 struct chcr_dev {
 	spinlock_t lock_chcr_dev;
-	struct uld_ctx *u_ctx;
+	enum chcr_state state;
+	atomic_t inflight;
+	int wqretry;
+	struct delayed_work detach_work;
+	struct completion detach_comp;
 	unsigned char tx_channel_id;
-	unsigned char rx_channel_id;
 };
 
 struct uld_ctx {
 	struct list_head entry;
 	struct cxgb4_lld_info lldi;
-	struct chcr_dev *dev;
+	struct chcr_dev dev;
 };
 
 struct sge_opaque_hdr {
@@ -159,8 +173,17 @@ struct chcr_ipsec_wr {
 	struct chcr_ipsec_req req;
 };
 
+#define ESN_IV_INSERT_OFFSET 12
+struct chcr_ipsec_aadiv {
+	__be32 spi;
+	u8 seq_no[8];
+	u8 iv[8];
+};
+
 struct ipsec_sa_entry {
 	int hmac_ctrl;
+	u16 esn;
+	u16 imm;
 	unsigned int enckey_len;
 	unsigned int kctx_len;
 	unsigned int authsize;
@@ -181,6 +204,13 @@ static inline unsigned int sgl_len(unsigned int n)
 	return (3 * n) / 2 + (n & 1) + 2;
 }
 
+static inline void *padap(struct chcr_dev *dev)
+{
+	struct uld_ctx *u_ctx = container_of(dev, struct uld_ctx, dev);
+
+	return pci_get_drvdata(u_ctx->lldi.pdev);
+}
+
 struct uld_ctx *assign_chcr_device(void);
 int chcr_send_wr(struct sk_buff *skb);
 int start_crypto(void);
diff --git a/drivers/crypto/chelsio/chcr_crypto.h b/drivers/crypto/chelsio/chcr_crypto.h
index d37ef41f9ebe7ce7371fb09a338d3a093da118ca..655606f2e4d09f089d27ed607da95fad94fe6983 100644
--- a/drivers/crypto/chelsio/chcr_crypto.h
+++ b/drivers/crypto/chelsio/chcr_crypto.h
@@ -41,7 +41,8 @@
 
 #define CCM_B0_SIZE             16
 #define CCM_AAD_FIELD_SIZE      2
-#define T6_MAX_AAD_SIZE 511
+/* 511 - 16 (reserved for IV) */
+#define T6_MAX_AAD_SIZE 495
 
 
 /* Define following if h/w is not dropping the AAD and IV data before
@@ -185,9 +186,6 @@ struct chcr_aead_reqctx {
 	dma_addr_t b0_dma;
 	unsigned int b0_len;
 	unsigned int op;
-	short int aad_nents;
-	short int src_nents;
-	short int dst_nents;
 	u16 imm;
 	u16 verify;
 	u8 iv[CHCR_MAX_CRYPTO_IV_LEN + MAX_SCRATCH_PAD_SIZE];
@@ -322,10 +320,8 @@ void chcr_aead_dma_unmap(struct device *dev, struct aead_request *req,
 			 unsigned short op_type);
 void chcr_add_aead_dst_ent(struct aead_request *req,
 			   struct cpl_rx_phys_dsgl *phys_cpl,
-			   unsigned int assoclen,
 			   unsigned short qid);
-void chcr_add_aead_src_ent(struct aead_request *req, struct ulptx_sgl *ulptx,
-			   unsigned int assoclen);
+void chcr_add_aead_src_ent(struct aead_request *req, struct ulptx_sgl *ulptx);
 void chcr_add_cipher_src_ent(struct ablkcipher_request *req,
 			     void *ulptx,
 			     struct  cipher_wr_param *wrparam);
diff --git a/drivers/crypto/chelsio/chcr_ipsec.c b/drivers/crypto/chelsio/chcr_ipsec.c
index ceaa16b8f72efb933d934a416878041ef12c2af8..2fb48cce4462033d2601acfb24ba160369636aaf 100644
--- a/drivers/crypto/chelsio/chcr_ipsec.c
+++ b/drivers/crypto/chelsio/chcr_ipsec.c
@@ -76,12 +76,14 @@ static int chcr_xfrm_add_state(struct xfrm_state *x);
 static void chcr_xfrm_del_state(struct xfrm_state *x);
 static void chcr_xfrm_free_state(struct xfrm_state *x);
 static bool chcr_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *x);
+static void chcr_advance_esn_state(struct xfrm_state *x);
 
 static const struct xfrmdev_ops chcr_xfrmdev_ops = {
 	.xdo_dev_state_add      = chcr_xfrm_add_state,
 	.xdo_dev_state_delete   = chcr_xfrm_del_state,
 	.xdo_dev_state_free     = chcr_xfrm_free_state,
 	.xdo_dev_offload_ok     = chcr_ipsec_offload_ok,
+	.xdo_dev_state_advance_esn = chcr_advance_esn_state,
 };
 
 /* Add offload xfrms to Chelsio Interface */
@@ -210,10 +212,6 @@ static int chcr_xfrm_add_state(struct xfrm_state *x)
 		pr_debug("CHCR: Cannot offload compressed xfrm states\n");
 		return -EINVAL;
 	}
-	if (x->props.flags & XFRM_STATE_ESN) {
-		pr_debug("CHCR: Cannot offload ESN xfrm states\n");
-		return -EINVAL;
-	}
 	if (x->props.family != AF_INET &&
 	    x->props.family != AF_INET6) {
 		pr_debug("CHCR: Only IPv4/6 xfrm state offloaded\n");
@@ -266,6 +264,8 @@ static int chcr_xfrm_add_state(struct xfrm_state *x)
 	}
 
 	sa_entry->hmac_ctrl = chcr_ipsec_setauthsize(x, sa_entry);
+	if (x->props.flags & XFRM_STATE_ESN)
+		sa_entry->esn = 1;
 	chcr_ipsec_setkey(x, sa_entry);
 	x->xso.offload_handle = (unsigned long)sa_entry;
 	try_module_get(THIS_MODULE);
@@ -294,28 +294,57 @@ static void chcr_xfrm_free_state(struct xfrm_state *x)
 
 static bool chcr_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
 {
-	/* Offload with IP options is not supported yet */
-	if (ip_hdr(skb)->ihl > 5)
-		return false;
-
+	if (x->props.family == AF_INET) {
+		/* Offload with IP options is not supported yet */
+		if (ip_hdr(skb)->ihl > 5)
+			return false;
+	} else {
+		/* Offload with IPv6 extension headers is not supported yet */
+		if (ipv6_ext_hdr(ipv6_hdr(skb)->nexthdr))
+			return false;
+	}
 	return true;
 }
 
-static inline int is_eth_imm(const struct sk_buff *skb, unsigned int kctx_len)
+static void chcr_advance_esn_state(struct xfrm_state *x)
 {
-	int hdrlen = sizeof(struct chcr_ipsec_req) + kctx_len;
+	/* do nothing */
+	if (!x->xso.offload_handle)
+		return;
+}
+
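+/*
+ * Return the total header length if the packet is small enough to be sent
+ * as immediate data in the work request, 0 otherwise.
+ */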
+static inline int is_eth_imm(const struct sk_buff *skb,
+			     struct ipsec_sa_entry *sa_entry)
+{
+	unsigned int kctx_len;
+	int hdrlen;
+
+	kctx_len = sa_entry->kctx_len;
+	hdrlen = sizeof(struct fw_ulptx_wr) +
+		 sizeof(struct chcr_ipsec_req) + kctx_len;
 
 	hdrlen += sizeof(struct cpl_tx_pkt);
+	if (sa_entry->esn)
+		hdrlen += (DIV_ROUND_UP(sizeof(struct chcr_ipsec_aadiv), 16)
+			   << 4);
 	if (skb->len <= MAX_IMM_TX_PKT_LEN - hdrlen)
 		return hdrlen;
 	return 0;
 }
 
 static inline unsigned int calc_tx_sec_flits(const struct sk_buff *skb,
-					     unsigned int kctx_len)
+					     struct ipsec_sa_entry *sa_entry)
 {
+	unsigned int kctx_len;
 	unsigned int flits;
-	int hdrlen = is_eth_imm(skb, kctx_len);
+	int aadivlen;
+	int hdrlen;
+
+	kctx_len = sa_entry->kctx_len;
+	hdrlen = is_eth_imm(skb, sa_entry);
+	aadivlen = sa_entry->esn ? DIV_ROUND_UP(sizeof(struct chcr_ipsec_aadiv),
+						16) : 0;
+	aadivlen <<= 4;
 
 	/* If the skb is small enough, we can pump it out as a work request
 	 * with only immediate data.  In that case we just have to have the
@@ -338,13 +367,69 @@ static inline unsigned int calc_tx_sec_flits(const struct sk_buff *skb,
 	flits += (sizeof(struct fw_ulptx_wr) +
 		  sizeof(struct chcr_ipsec_req) +
 		  kctx_len +
-		  sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64);
+		  sizeof(struct cpl_tx_pkt_core) +
+		  aadivlen) / sizeof(__be64);
 	return flits;
 }
 
+inline void *copy_esn_pktxt(struct sk_buff *skb,
+			    struct net_device *dev,
+			    void *pos,
+			    struct ipsec_sa_entry *sa_entry)
+{
+	struct chcr_ipsec_aadiv *aadiv;
+	struct ulptx_idata *sc_imm;
+	struct ip_esp_hdr *esphdr;
+	struct xfrm_offload *xo;
+	struct sge_eth_txq *q;
+	struct adapter *adap;
+	struct port_info *pi;
+	__be64 seqno;
+	u32 qidx;
+	u32 seqlo;
+	u8 *iv;
+	int eoq;
+	int len;
+
+	pi = netdev_priv(dev);
+	adap = pi->adapter;
+	qidx = skb->queue_mapping;
+	q = &adap->sge.ethtxq[qidx + pi->first_qset];
+
+	/* end of queue, reset pos to start of queue */
+	eoq = (void *)q->q.stat - pos;
+	if (!eoq)
+		pos = q->q.desc;
+
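+	/*
+	 * Build the AAD/IV block for ESN: SPI, 64-bit extended sequence
+	 * number and the 8-byte IV copied from the ESP header.
+	 */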
+	len = DIV_ROUND_UP(sizeof(struct chcr_ipsec_aadiv), 16) << 4;
+	memset(pos, 0, len);
+	aadiv = (struct chcr_ipsec_aadiv *)pos;
+	esphdr = (struct ip_esp_hdr *)skb_transport_header(skb);
+	iv = skb_transport_header(skb) + sizeof(struct ip_esp_hdr);
+	xo = xfrm_offload(skb);
+
+	aadiv->spi = (esphdr->spi);
+	seqlo = htonl(esphdr->seq_no);
+	seqno = cpu_to_be64(seqlo + ((u64)xo->seq.hi << 32));
+	memcpy(aadiv->seq_no, &seqno, 8);
+	iv = skb_transport_header(skb) + sizeof(struct ip_esp_hdr);
+	memcpy(aadiv->iv, iv, 8);
+
+	if (sa_entry->imm) {
+		sc_imm = (struct ulptx_idata *)(pos +
+			  (DIV_ROUND_UP(sizeof(struct chcr_ipsec_aadiv),
+					sizeof(__be64)) << 3));
+		sc_imm->cmd_more = FILL_CMD_MORE(!sa_entry->imm);
+		sc_imm->len = cpu_to_be32(sa_entry->imm);
+	}
+	pos += len;
+	return pos;
+}
+
 inline void *copy_cpltx_pktxt(struct sk_buff *skb,
-				struct net_device *dev,
-				void *pos)
+			      struct net_device *dev,
+			      void *pos,
+			      struct ipsec_sa_entry *sa_entry)
 {
 	struct cpl_tx_pkt_core *cpl;
 	struct sge_eth_txq *q;
@@ -379,6 +464,9 @@ inline void *copy_cpltx_pktxt(struct sk_buff *skb,
 	cpl->ctrl1 = cpu_to_be64(cntrl);
 
 	pos += sizeof(struct cpl_tx_pkt_core);
+	/* Copy ESN info for HW */
+	if (sa_entry->esn)
+		pos = copy_esn_pktxt(skb, dev, pos, sa_entry);
 	return pos;
 }
 
@@ -425,7 +513,7 @@ inline void *copy_key_cpltx_pktxt(struct sk_buff *skb,
 		pos = (u8 *)q->q.desc + (key_len - left);
 	}
 	/* Copy CPL TX PKT XT */
-	pos = copy_cpltx_pktxt(skb, dev, pos);
+	pos = copy_cpltx_pktxt(skb, dev, pos, sa_entry);
 
 	return pos;
 }
@@ -438,10 +526,16 @@ inline void *chcr_crypto_wreq(struct sk_buff *skb,
 {
 	struct port_info *pi = netdev_priv(dev);
 	struct adapter *adap = pi->adapter;
-	unsigned int immdatalen = 0;
 	unsigned int ivsize = GCM_ESP_IV_SIZE;
 	struct chcr_ipsec_wr *wr;
+	u16 immdatalen = 0;
 	unsigned int flits;
+	u32 ivinoffset;
+	u32 aadstart;
+	u32 aadstop;
+	u32 ciphstart;
+	u32 ivdrop = 0;
+	u32 esnlen = 0;
 	u32 wr_mid;
 	int qidx = skb_get_queue_mapping(skb);
 	struct sge_eth_txq *q = &adap->sge.ethtxq[qidx + pi->first_qset];
@@ -450,10 +544,17 @@ inline void *chcr_crypto_wreq(struct sk_buff *skb,
 
 	atomic_inc(&adap->chcr_stats.ipsec_cnt);
 
-	flits = calc_tx_sec_flits(skb, kctx_len);
+	flits = calc_tx_sec_flits(skb, sa_entry);
+	if (sa_entry->esn)
+		ivdrop = 1;
 
-	if (is_eth_imm(skb, kctx_len))
+	if (is_eth_imm(skb, sa_entry)) {
 		immdatalen = skb->len;
+		sa_entry->imm = immdatalen;
+	}
+
+	if (sa_entry->esn)
+		esnlen = sizeof(struct chcr_ipsec_aadiv);
 
 	/* WR Header */
 	wr = (struct chcr_ipsec_wr *)pos;
@@ -478,33 +579,38 @@ inline void *chcr_crypto_wreq(struct sk_buff *skb,
 					 sizeof(wr->req.key_ctx) +
 					 kctx_len +
 					 sizeof(struct cpl_tx_pkt_core) +
-					 immdatalen);
+					 esnlen +
+					 (esnlen ? 0 : immdatalen));
 
 	/* CPL_SEC_PDU */
+	ivinoffset = sa_entry->esn ? (ESN_IV_INSERT_OFFSET + 1) :
+				     (skb_transport_offset(skb) +
+				      sizeof(struct ip_esp_hdr) + 1);
 	wr->req.sec_cpl.op_ivinsrtofst = htonl(
 				CPL_TX_SEC_PDU_OPCODE_V(CPL_TX_SEC_PDU) |
 				CPL_TX_SEC_PDU_CPLLEN_V(2) |
 				CPL_TX_SEC_PDU_PLACEHOLDER_V(1) |
 				CPL_TX_SEC_PDU_IVINSRTOFST_V(
-				(skb_transport_offset(skb) +
-				sizeof(struct ip_esp_hdr) + 1)));
+							     ivinoffset));
 
-	wr->req.sec_cpl.pldlen = htonl(skb->len);
+	wr->req.sec_cpl.pldlen = htonl(skb->len + esnlen);
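+	/*
+	 * With ESN the AAD (SPI + 64-bit sequence number) and the IV are
+	 * taken from the prepended chcr_ipsec_aadiv block instead of the
+	 * packet itself, so shift the cipher start accordingly.
+	 */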
+	aadstart = sa_entry->esn ? 1 : (skb_transport_offset(skb) + 1);
+	aadstop = sa_entry->esn ? ESN_IV_INSERT_OFFSET :
+				  (skb_transport_offset(skb) +
+				   sizeof(struct ip_esp_hdr));
+	ciphstart = skb_transport_offset(skb) + sizeof(struct ip_esp_hdr) +
+		    GCM_ESP_IV_SIZE + 1;
+	ciphstart += sa_entry->esn ? esnlen : 0;
 
 	wr->req.sec_cpl.aadstart_cipherstop_hi = FILL_SEC_CPL_CIPHERSTOP_HI(
-				(skb_transport_offset(skb) + 1),
-				(skb_transport_offset(skb) +
-				 sizeof(struct ip_esp_hdr)),
-				(skb_transport_offset(skb) +
-				 sizeof(struct ip_esp_hdr) +
-				 GCM_ESP_IV_SIZE + 1), 0);
+							aadstart,
+							aadstop,
+							ciphstart, 0);
 
 	wr->req.sec_cpl.cipherstop_lo_authinsert =
-		FILL_SEC_CPL_AUTHINSERT(0, skb_transport_offset(skb) +
-					   sizeof(struct ip_esp_hdr) +
-					   GCM_ESP_IV_SIZE + 1,
-					   sa_entry->authsize,
-					   sa_entry->authsize);
+		FILL_SEC_CPL_AUTHINSERT(0, ciphstart,
+					sa_entry->authsize,
+					sa_entry->authsize);
 	wr->req.sec_cpl.seqno_numivs =
 		FILL_SEC_CPL_SCMD0_SEQNO(CHCR_ENCRYPT_OP, 1,
 					 CHCR_SCMD_CIPHER_MODE_AES_GCM,
@@ -512,7 +618,7 @@ inline void *chcr_crypto_wreq(struct sk_buff *skb,
 					 sa_entry->hmac_ctrl,
 					 ivsize >> 1);
 	wr->req.sec_cpl.ivgen_hdrlen =  FILL_SEC_CPL_IVGEN_HDRLEN(0, 0, 1,
-								  0, 0, 0);
+								  0, ivdrop, 0);
 
 	pos += sizeof(struct fw_ulptx_wr) +
 	       sizeof(struct ulp_txpkt) +
@@ -565,7 +671,7 @@ int chcr_ipsec_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct ipsec_sa_entry *sa_entry;
 	u64 *pos, *end, *before, *sgl;
 	int qidx, left, credits;
-	unsigned int flits = 0, ndesc, kctx_len;
+	unsigned int flits = 0, ndesc;
 	struct adapter *adap;
 	struct sge_eth_txq *q;
 	struct port_info *pi;
@@ -577,7 +683,6 @@ int chcr_ipsec_xmit(struct sk_buff *skb, struct net_device *dev)
 		return NETDEV_TX_BUSY;
 
 	sa_entry = (struct ipsec_sa_entry *)x->xso.offload_handle;
-	kctx_len = sa_entry->kctx_len;
 
 	sp = skb_sec_path(skb);
 	if (sp->len != 1) {
@@ -592,7 +697,7 @@ out_free:       dev_kfree_skb_any(skb);
 
 	cxgb4_reclaim_completed_tx(adap, &q->q, true);
 
-	flits = calc_tx_sec_flits(skb, sa_entry->kctx_len);
+	flits = calc_tx_sec_flits(skb, sa_entry);
 	ndesc = flits_to_desc(flits);
 	credits = txq_avail(&q->q) - ndesc;
 
@@ -605,7 +710,7 @@ out_free:       dev_kfree_skb_any(skb);
 		return NETDEV_TX_BUSY;
 	}
 
-	if (is_eth_imm(skb, kctx_len))
+	if (is_eth_imm(skb, sa_entry))
 		immediate = true;
 
 	if (!immediate &&
diff --git a/drivers/crypto/geode-aes.c b/drivers/crypto/geode-aes.c
index eb2a0a73cbed17cd0c9bdc2f8ab847c4dcf4b9d3..b4c24a35b3d08a0b5b9c15c9effbea1e17d9c277 100644
--- a/drivers/crypto/geode-aes.c
+++ b/drivers/crypto/geode-aes.c
@@ -261,7 +261,7 @@ static int fallback_init_cip(struct crypto_tfm *tfm)
 	struct geode_aes_op *op = crypto_tfm_ctx(tfm);
 
 	op->fallback.cip = crypto_alloc_cipher(name, 0,
-				CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK);
+					       CRYPTO_ALG_NEED_FALLBACK);
 
 	if (IS_ERR(op->fallback.cip)) {
 		printk(KERN_ERR "Error allocating fallback algo %s\n", name);
diff --git a/drivers/crypto/inside-secure/safexcel_cipher.c b/drivers/crypto/inside-secure/safexcel_cipher.c
index 3aef1d43e43510130dc359d61c911e51209e9a7f..d531c14020dcb61d6b7002f44f64503b5e6a3c23 100644
--- a/drivers/crypto/inside-secure/safexcel_cipher.c
+++ b/drivers/crypto/inside-secure/safexcel_cipher.c
@@ -970,7 +970,7 @@ struct safexcel_alg_template safexcel_alg_cbc_des = {
 			.cra_name = "cbc(des)",
 			.cra_driver_name = "safexcel-cbc-des",
 			.cra_priority = 300,
-			.cra_flags = CRYPTO_ALG_TYPE_SKCIPHER | CRYPTO_ALG_ASYNC |
+			.cra_flags = CRYPTO_ALG_ASYNC |
 				     CRYPTO_ALG_KERN_DRIVER_ONLY,
 			.cra_blocksize = DES_BLOCK_SIZE,
 			.cra_ctxsize = sizeof(struct safexcel_cipher_ctx),
@@ -1010,7 +1010,7 @@ struct safexcel_alg_template safexcel_alg_ecb_des = {
 			.cra_name = "ecb(des)",
 			.cra_driver_name = "safexcel-ecb-des",
 			.cra_priority = 300,
-			.cra_flags = CRYPTO_ALG_TYPE_SKCIPHER | CRYPTO_ALG_ASYNC |
+			.cra_flags = CRYPTO_ALG_ASYNC |
 				     CRYPTO_ALG_KERN_DRIVER_ONLY,
 			.cra_blocksize = DES_BLOCK_SIZE,
 			.cra_ctxsize = sizeof(struct safexcel_cipher_ctx),
@@ -1074,7 +1074,7 @@ struct safexcel_alg_template safexcel_alg_cbc_des3_ede = {
 			.cra_name = "cbc(des3_ede)",
 			.cra_driver_name = "safexcel-cbc-des3_ede",
 			.cra_priority = 300,
-			.cra_flags = CRYPTO_ALG_TYPE_SKCIPHER | CRYPTO_ALG_ASYNC |
+			.cra_flags = CRYPTO_ALG_ASYNC |
 				     CRYPTO_ALG_KERN_DRIVER_ONLY,
 			.cra_blocksize = DES3_EDE_BLOCK_SIZE,
 			.cra_ctxsize = sizeof(struct safexcel_cipher_ctx),
@@ -1114,7 +1114,7 @@ struct safexcel_alg_template safexcel_alg_ecb_des3_ede = {
 			.cra_name = "ecb(des3_ede)",
 			.cra_driver_name = "safexcel-ecb-des3_ede",
 			.cra_priority = 300,
-			.cra_flags = CRYPTO_ALG_TYPE_SKCIPHER | CRYPTO_ALG_ASYNC |
+			.cra_flags = CRYPTO_ALG_ASYNC |
 				     CRYPTO_ALG_KERN_DRIVER_ONLY,
 			.cra_blocksize = DES3_EDE_BLOCK_SIZE,
 			.cra_ctxsize = sizeof(struct safexcel_cipher_ctx),
diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c
index 27f7dad2d45d9d92b3eae73525b0aaae73cd4523..19fba998b86b462fbbfc8f2ac872e18f2fa8e28c 100644
--- a/drivers/crypto/ixp4xx_crypto.c
+++ b/drivers/crypto/ixp4xx_crypto.c
@@ -1194,7 +1194,6 @@ static struct ixp_alg ixp4xx_algos[] = {
 			.min_keysize	= DES_KEY_SIZE,
 			.max_keysize	= DES_KEY_SIZE,
 			.ivsize		= DES_BLOCK_SIZE,
-			.geniv		= "eseqiv",
 			}
 		}
 	},
@@ -1221,7 +1220,6 @@ static struct ixp_alg ixp4xx_algos[] = {
 			.min_keysize	= DES3_EDE_KEY_SIZE,
 			.max_keysize	= DES3_EDE_KEY_SIZE,
 			.ivsize		= DES3_EDE_BLOCK_SIZE,
-			.geniv		= "eseqiv",
 			}
 		}
 	},
@@ -1247,7 +1245,6 @@ static struct ixp_alg ixp4xx_algos[] = {
 			.min_keysize	= AES_MIN_KEY_SIZE,
 			.max_keysize	= AES_MAX_KEY_SIZE,
 			.ivsize		= AES_BLOCK_SIZE,
-			.geniv		= "eseqiv",
 			}
 		}
 	},
@@ -1273,7 +1270,6 @@ static struct ixp_alg ixp4xx_algos[] = {
 			.min_keysize	= AES_MIN_KEY_SIZE,
 			.max_keysize	= AES_MAX_KEY_SIZE,
 			.ivsize		= AES_BLOCK_SIZE,
-			.geniv		= "eseqiv",
 			}
 		}
 	},
@@ -1287,7 +1283,6 @@ static struct ixp_alg ixp4xx_algos[] = {
 			.min_keysize	= AES_MIN_KEY_SIZE,
 			.max_keysize	= AES_MAX_KEY_SIZE,
 			.ivsize		= AES_BLOCK_SIZE,
-			.geniv		= "eseqiv",
 			.setkey		= ablk_rfc3686_setkey,
 			.encrypt	= ablk_rfc3686_crypt,
 			.decrypt	= ablk_rfc3686_crypt }
diff --git a/drivers/crypto/mxc-scc.c b/drivers/crypto/mxc-scc.c
index e01c46387df8d8823f4a9434ea0db3b9992583c4..519086730791b9ab53898e7f0464f0f3742cc552 100644
--- a/drivers/crypto/mxc-scc.c
+++ b/drivers/crypto/mxc-scc.c
@@ -178,12 +178,12 @@ static int mxc_scc_get_data(struct mxc_scc_ctx *ctx,
 	else
 		from = scc->black_memory;
 
-	dev_dbg(scc->dev, "pcopy: from 0x%p %d bytes\n", from,
+	dev_dbg(scc->dev, "pcopy: from 0x%p %zu bytes\n", from,
 		ctx->dst_nents * 8);
 	len = sg_pcopy_from_buffer(ablkreq->dst, ctx->dst_nents,
 				   from, ctx->size, ctx->offset);
 	if (!len) {
-		dev_err(scc->dev, "pcopy err from 0x%p (len=%d)\n", from, len);
+		dev_err(scc->dev, "pcopy err from 0x%p (len=%zu)\n", from, len);
 		return -EINVAL;
 	}
 
@@ -274,7 +274,7 @@ static int mxc_scc_put_data(struct mxc_scc_ctx *ctx,
 	len = sg_pcopy_to_buffer(req->src, ctx->src_nents,
 				 to, len, ctx->offset);
 	if (!len) {
-		dev_err(scc->dev, "pcopy err to 0x%p (len=%d)\n", to, len);
+		dev_err(scc->dev, "pcopy err to 0x%p (len=%zu)\n", to, len);
 		return -EINVAL;
 	}
 
@@ -335,9 +335,9 @@ static void mxc_scc_ablkcipher_next(struct mxc_scc_ctx *ctx,
 		return;
 	}
 
-	dev_dbg(scc->dev, "Start encryption (0x%p/0x%p)\n",
-		(void *)readl(scc->base + SCC_SCM_RED_START),
-		(void *)readl(scc->base + SCC_SCM_BLACK_START));
+	dev_dbg(scc->dev, "Start encryption (0x%x/0x%x)\n",
+		readl(scc->base + SCC_SCM_RED_START),
+		readl(scc->base + SCC_SCM_BLACK_START));
 
 	/* clear interrupt control registers */
 	writel(SCC_SCM_INTR_CTRL_CLR_INTR,
diff --git a/drivers/crypto/mxs-dcp.c b/drivers/crypto/mxs-dcp.c
index 4e6ff32f8a7e6f7f595780a4d7eb16ce209b3d07..a2105cf33abb42bbf0b53dd1becc54c7c80a6cc5 100644
--- a/drivers/crypto/mxs-dcp.c
+++ b/drivers/crypto/mxs-dcp.c
@@ -20,6 +20,7 @@
 #include <linux/of.h>
 #include <linux/platform_device.h>
 #include <linux/stmp_device.h>
+#include <linux/clk.h>
 
 #include <crypto/aes.h>
 #include <crypto/sha.h>
@@ -82,6 +83,7 @@ struct dcp {
 	spinlock_t			lock[DCP_MAX_CHANS];
 	struct task_struct		*thread[DCP_MAX_CHANS];
 	struct crypto_queue		queue[DCP_MAX_CHANS];
+	struct clk			*dcp_clk;
 };
 
 enum dcp_chan {
@@ -1053,11 +1055,24 @@ static int mxs_dcp_probe(struct platform_device *pdev)
 	/* Re-align the structure so it fits the DCP constraints. */
 	sdcp->coh = PTR_ALIGN(sdcp->coh, DCP_ALIGNMENT);
 
-	/* Restart the DCP block. */
-	ret = stmp_reset_block(sdcp->base);
+	/* DCP clock is optional, only used on some SoCs */
+	sdcp->dcp_clk = devm_clk_get(dev, "dcp");
+	if (IS_ERR(sdcp->dcp_clk)) {
+		if (sdcp->dcp_clk != ERR_PTR(-ENOENT))
+			return PTR_ERR(sdcp->dcp_clk);
+		sdcp->dcp_clk = NULL;
+	}
+	ret = clk_prepare_enable(sdcp->dcp_clk);
 	if (ret)
 		return ret;
 
+	/* Restart the DCP block. */
+	ret = stmp_reset_block(sdcp->base);
+	if (ret) {
+		dev_err(dev, "Failed reset\n");
+		goto err_disable_unprepare_clk;
+	}
+
 	/* Initialize control register. */
 	writel(MXS_DCP_CTRL_GATHER_RESIDUAL_WRITES |
 	       MXS_DCP_CTRL_ENABLE_CONTEXT_CACHING | 0xf,
@@ -1094,7 +1109,8 @@ static int mxs_dcp_probe(struct platform_device *pdev)
 						      NULL, "mxs_dcp_chan/sha");
 	if (IS_ERR(sdcp->thread[DCP_CHAN_HASH_SHA])) {
 		dev_err(dev, "Error starting SHA thread!\n");
-		return PTR_ERR(sdcp->thread[DCP_CHAN_HASH_SHA]);
+		ret = PTR_ERR(sdcp->thread[DCP_CHAN_HASH_SHA]);
+		goto err_disable_unprepare_clk;
 	}
 
 	sdcp->thread[DCP_CHAN_CRYPTO] = kthread_run(dcp_chan_thread_aes,
@@ -1151,6 +1167,10 @@ static int mxs_dcp_probe(struct platform_device *pdev)
 
 err_destroy_sha_thread:
 	kthread_stop(sdcp->thread[DCP_CHAN_HASH_SHA]);
+
+err_disable_unprepare_clk:
+	clk_disable_unprepare(sdcp->dcp_clk);
+
 	return ret;
 }
 
@@ -1170,6 +1190,8 @@ static int mxs_dcp_remove(struct platform_device *pdev)
 	kthread_stop(sdcp->thread[DCP_CHAN_HASH_SHA]);
 	kthread_stop(sdcp->thread[DCP_CHAN_CRYPTO]);
 
+	clk_disable_unprepare(sdcp->dcp_clk);
+
 	platform_set_drvdata(pdev, NULL);
 
 	global_sdcp = NULL;
diff --git a/drivers/crypto/nx/nx-aes-ctr.c b/drivers/crypto/nx/nx-aes-ctr.c
index 898c0a280511d5c49f8d6e076b4905773c1f3b8f..5a26fcd75d2df251067c9d093e684f2344be7bf4 100644
--- a/drivers/crypto/nx/nx-aes-ctr.c
+++ b/drivers/crypto/nx/nx-aes-ctr.c
@@ -159,7 +159,6 @@ struct crypto_alg nx_ctr3686_aes_alg = {
 		.min_keysize = AES_MIN_KEY_SIZE + CTR_RFC3686_NONCE_SIZE,
 		.max_keysize = AES_MAX_KEY_SIZE + CTR_RFC3686_NONCE_SIZE,
 		.ivsize      = CTR_RFC3686_IV_SIZE,
-		.geniv       = "seqiv",
 		.setkey      = ctr3686_aes_nx_set_key,
 		.encrypt     = ctr3686_aes_nx_crypt,
 		.decrypt     = ctr3686_aes_nx_crypt,
diff --git a/drivers/crypto/omap-aes.c b/drivers/crypto/omap-aes.c
index a553ffddb11beed2f5ed335e18cdd9ebfb6546e7..0120feb2d74627f4f1b1dfb86a79f4050f9de0dc 100644
--- a/drivers/crypto/omap-aes.c
+++ b/drivers/crypto/omap-aes.c
@@ -749,7 +749,6 @@ static struct crypto_alg algs_ctr[] = {
 	.cra_u.ablkcipher = {
 		.min_keysize	= AES_MIN_KEY_SIZE,
 		.max_keysize	= AES_MAX_KEY_SIZE,
-		.geniv		= "eseqiv",
 		.ivsize		= AES_BLOCK_SIZE,
 		.setkey		= omap_aes_setkey,
 		.encrypt	= omap_aes_ctr_encrypt,
@@ -1222,7 +1221,6 @@ static int omap_aes_probe(struct platform_device *pdev)
 				algp = &dd->pdata->algs_info[i].algs_list[j];
 
 				pr_debug("reg alg: %s\n", algp->cra_name);
-				INIT_LIST_HEAD(&algp->cra_list);
 
 				err = crypto_register_alg(algp);
 				if (err)
@@ -1240,7 +1238,6 @@ static int omap_aes_probe(struct platform_device *pdev)
 			algp = &aalg->base;
 
 			pr_debug("reg alg: %s\n", algp->cra_name);
-			INIT_LIST_HEAD(&algp->cra_list);
 
 			err = crypto_register_aead(aalg);
 			if (err)
diff --git a/drivers/crypto/omap-des.c b/drivers/crypto/omap-des.c
index eb95b0d7f1846f67418a2115c9dff185174843d8..6369019219d4431ec53131f1565af8f046fc49e1 100644
--- a/drivers/crypto/omap-des.c
+++ b/drivers/crypto/omap-des.c
@@ -1069,7 +1069,6 @@ static int omap_des_probe(struct platform_device *pdev)
 			algp = &dd->pdata->algs_info[i].algs_list[j];
 
 			pr_debug("reg alg: %s\n", algp->cra_name);
-			INIT_LIST_HEAD(&algp->cra_list);
 
 			err = crypto_register_alg(algp);
 			if (err)
diff --git a/drivers/crypto/picoxcell_crypto.c b/drivers/crypto/picoxcell_crypto.c
index a28f1d18fe01fda8626e20418e4b9bae2a6cb494..17068b55fea53ddf9e06d27725304e9296ea87e5 100644
--- a/drivers/crypto/picoxcell_crypto.c
+++ b/drivers/crypto/picoxcell_crypto.c
@@ -1585,8 +1585,7 @@ static struct spacc_alg l2_engine_algs[] = {
 			.cra_name = "f8(kasumi)",
 			.cra_driver_name = "f8-kasumi-picoxcell",
 			.cra_priority = SPACC_CRYPTO_ALG_PRIORITY,
-			.cra_flags = CRYPTO_ALG_TYPE_GIVCIPHER |
-					CRYPTO_ALG_ASYNC |
+			.cra_flags = CRYPTO_ALG_ASYNC |
 					CRYPTO_ALG_KERN_DRIVER_ONLY,
 			.cra_blocksize = 8,
 			.cra_ctxsize = sizeof(struct spacc_ablk_ctx),
diff --git a/drivers/crypto/qce/ablkcipher.c b/drivers/crypto/qce/ablkcipher.c
index 585e1cab9ae347ec8b637f9d02f824328d61dc1f..25c13e26d01287c0bfdce32d8707a72bb0a068c8 100644
--- a/drivers/crypto/qce/ablkcipher.c
+++ b/drivers/crypto/qce/ablkcipher.c
@@ -376,7 +376,6 @@ static int qce_ablkcipher_register_one(const struct qce_ablkcipher_def *def,
 	alg->cra_module = THIS_MODULE;
 	alg->cra_init = qce_ablkcipher_init;
 	alg->cra_exit = qce_ablkcipher_exit;
-	INIT_LIST_HEAD(&alg->cra_list);
 
 	INIT_LIST_HEAD(&tmpl->entry);
 	tmpl->crypto_alg_type = CRYPTO_ALG_TYPE_ABLKCIPHER;
diff --git a/drivers/crypto/qce/sha.c b/drivers/crypto/qce/sha.c
index d8a5db11b7ea1f3b3fec471b2cca1e11a84d6d1e..fc45f5ea6fdd272487009e6ce48c38379b63634d 100644
--- a/drivers/crypto/qce/sha.c
+++ b/drivers/crypto/qce/sha.c
@@ -508,7 +508,6 @@ static int qce_ahash_register_one(const struct qce_ahash_def *def,
 	base->cra_alignmask = 0;
 	base->cra_module = THIS_MODULE;
 	base->cra_init = qce_ahash_cra_init;
-	INIT_LIST_HEAD(&base->cra_list);
 
 	snprintf(base->cra_name, CRYPTO_MAX_ALG_NAME, "%s", def->name);
 	snprintf(base->cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s",
diff --git a/drivers/crypto/sahara.c b/drivers/crypto/sahara.c
index bbf166a97ad3a345aa33b30af1566cb38ce86ad8..8c32a3059b4a8cf87ec16687ca1b8ac6629d902e 100644
--- a/drivers/crypto/sahara.c
+++ b/drivers/crypto/sahara.c
@@ -1321,7 +1321,6 @@ static int sahara_register_algs(struct sahara_dev *dev)
 	unsigned int i, j, k, l;
 
 	for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
-		INIT_LIST_HEAD(&aes_algs[i].cra_list);
 		err = crypto_register_alg(&aes_algs[i]);
 		if (err)
 			goto err_aes_algs;
diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index 6988012deca4cdd8f0f2bc6dd21ca790e4c98f6a..45e20707cef8d708d2398a2e6d338bf1b053f7f6 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -3155,7 +3155,6 @@ static struct talitos_crypto_alg *talitos_alg_alloc(struct device *dev,
 		alg->cra_ablkcipher.setkey = ablkcipher_setkey;
 		alg->cra_ablkcipher.encrypt = ablkcipher_encrypt;
 		alg->cra_ablkcipher.decrypt = ablkcipher_decrypt;
-		alg->cra_ablkcipher.geniv = "eseqiv";
 		break;
 	case CRYPTO_ALG_TYPE_AEAD:
 		alg = &t_alg->algt.alg.aead.base;
diff --git a/drivers/crypto/ux500/cryp/cryp_core.c b/drivers/crypto/ux500/cryp/cryp_core.c
index d2663a4e1f5eb8050589c82e8c89406df4bd3df8..a92a66b1ff46ed689a71d02dc4ab41481284ed1c 100644
--- a/drivers/crypto/ux500/cryp/cryp_core.c
+++ b/drivers/crypto/ux500/cryp/cryp_core.c
@@ -556,7 +556,7 @@ static int cryp_set_dma_transfer(struct cryp_ctx *ctx,
 		desc = dmaengine_prep_slave_sg(channel,
 				ctx->device->dma.sg_src,
 				ctx->device->dma.sg_src_len,
-				direction, DMA_CTRL_ACK);
+				DMA_MEM_TO_DEV, DMA_CTRL_ACK);
 		break;
 
 	case DMA_FROM_DEVICE:
@@ -580,7 +580,7 @@ static int cryp_set_dma_transfer(struct cryp_ctx *ctx,
 		desc = dmaengine_prep_slave_sg(channel,
 				ctx->device->dma.sg_dst,
 				ctx->device->dma.sg_dst_len,
-				direction,
+				DMA_DEV_TO_MEM,
 				DMA_CTRL_ACK |
 				DMA_PREP_INTERRUPT);
 
diff --git a/drivers/crypto/ux500/hash/hash_core.c b/drivers/crypto/ux500/hash/hash_core.c
index 633321a8dd034390b7d809aaeef792e8d971a21b..a0bb8a6eec3fd954e16ad601d2a1d16ebbbd4f0b 100644
--- a/drivers/crypto/ux500/hash/hash_core.c
+++ b/drivers/crypto/ux500/hash/hash_core.c
@@ -166,7 +166,7 @@ static int hash_set_dma_transfer(struct hash_ctx *ctx, struct scatterlist *sg,
 		__func__);
 	desc = dmaengine_prep_slave_sg(channel,
 			ctx->device->dma.sg, ctx->device->dma.sg_len,
-			direction, DMA_CTRL_ACK | DMA_PREP_INTERRUPT);
+			DMA_MEM_TO_DEV, DMA_CTRL_ACK | DMA_PREP_INTERRUPT);
 	if (!desc) {
 		dev_err(ctx->device->dev,
 			"%s: dmaengine_prep_slave_sg() failed!\n", __func__);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index b8eec515a003cfc592d450eb8f78f1bec1898be7..a7195eb5b8d8949b87cfcedc5daf7c944abce5f2 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -377,7 +377,7 @@ static struct crypto_cipher *alloc_essiv_cipher(struct crypt_config *cc,
 	int err;
 
 	/* Setup the essiv_tfm with the given salt */
-	essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
+	essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, 0);
 	if (IS_ERR(essiv_tfm)) {
 		ti->error = "Error allocating crypto tfm for ESSIV";
 		return essiv_tfm;
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index bb3096bf2cc6b5bb79f074278e7b4825f4d3bece..d4ad0bfee2519aeef4a3f4e7c4baf100fa38bb16 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -2804,7 +2804,7 @@ static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error,
 	int r;
 
 	if (a->alg_string) {
-		*hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ASYNC);
+		*hash = crypto_alloc_shash(a->alg_string, 0, 0);
 		if (IS_ERR(*hash)) {
 			*error = error_alg;
 			r = PTR_ERR(*hash);
diff --git a/drivers/net/wireless/cisco/airo.c b/drivers/net/wireless/cisco/airo.c
index 5512c7f73fce89ec231f42357d7ee257c3a6f7d7..3f5a14112c6be75e1a47c3f1a6dfc08b5444176d 100644
--- a/drivers/net/wireless/cisco/airo.c
+++ b/drivers/net/wireless/cisco/airo.c
@@ -1359,7 +1359,7 @@ static int micsetup(struct airo_info *ai) {
 	int i;
 
 	if (ai->tfm == NULL)
-	        ai->tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+		ai->tfm = crypto_alloc_cipher("aes", 0, 0);
 
         if (IS_ERR(ai->tfm)) {
                 airo_print_err(ai->dev->name, "failed to load transform for AES");
diff --git a/drivers/net/wireless/intersil/orinoco/mic.c b/drivers/net/wireless/intersil/orinoco/mic.c
index 08bc7822f8209d0a9136357edb18683092c86c18..709d9ab3e7bcb694964cd60d9162059110faff4a 100644
--- a/drivers/net/wireless/intersil/orinoco/mic.c
+++ b/drivers/net/wireless/intersil/orinoco/mic.c
@@ -16,8 +16,7 @@
 /********************************************************************/
 int orinoco_mic_init(struct orinoco_private *priv)
 {
-	priv->tx_tfm_mic = crypto_alloc_shash("michael_mic", 0,
-					      CRYPTO_ALG_ASYNC);
+	priv->tx_tfm_mic = crypto_alloc_shash("michael_mic", 0, 0);
 	if (IS_ERR(priv->tx_tfm_mic)) {
 		printk(KERN_DEBUG "orinoco_mic_init: could not allocate "
 		       "crypto API michael_mic\n");
@@ -25,8 +24,7 @@ int orinoco_mic_init(struct orinoco_private *priv)
 		return -ENOMEM;
 	}
 
-	priv->rx_tfm_mic = crypto_alloc_shash("michael_mic", 0,
-					      CRYPTO_ALG_ASYNC);
+	priv->rx_tfm_mic = crypto_alloc_shash("michael_mic", 0, 0);
 	if (IS_ERR(priv->rx_tfm_mic)) {
 		printk(KERN_DEBUG "orinoco_mic_init: could not allocate "
 		       "crypto API michael_mic\n");
diff --git a/drivers/staging/rtl8192e/rtllib_crypt_ccmp.c b/drivers/staging/rtl8192e/rtllib_crypt_ccmp.c
index bc45cf098b04e26a5c8ba425d255f4381a11c2c0..91871503364d4c0c297f198d4409eab9c06210e2 100644
--- a/drivers/staging/rtl8192e/rtllib_crypt_ccmp.c
+++ b/drivers/staging/rtl8192e/rtllib_crypt_ccmp.c
@@ -67,7 +67,7 @@ static void *rtllib_ccmp_init(int key_idx)
 		goto fail;
 	priv->key_idx = key_idx;
 
-	priv->tfm = (void *)crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	priv->tfm = (void *)crypto_alloc_cipher("aes", 0, 0);
 	if (IS_ERR(priv->tfm)) {
 		pr_debug("Could not allocate crypto API aes\n");
 		priv->tfm = NULL;
diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c
index 041f1b123888bfa790d02a581c04c26c4975ee3f..3534ddb900d1e5146270bab8cd23caeb17ce0746 100644
--- a/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c
+++ b/drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c
@@ -71,7 +71,7 @@ static void *ieee80211_ccmp_init(int key_idx)
 		goto fail;
 	priv->key_idx = key_idx;
 
-	priv->tfm = (void *)crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	priv->tfm = (void *)crypto_alloc_cipher("aes", 0, 0);
 	if (IS_ERR(priv->tfm)) {
 		pr_debug("ieee80211_crypt_ccmp: could not allocate crypto API aes\n");
 		priv->tfm = NULL;
diff --git a/drivers/usb/wusbcore/crypto.c b/drivers/usb/wusbcore/crypto.c
index 68ddee86a8861cccbf033a995e90ca150edcfb66..edb7263bff403c05e02582837c45e4c30a32a803 100644
--- a/drivers/usb/wusbcore/crypto.c
+++ b/drivers/usb/wusbcore/crypto.c
@@ -316,7 +316,7 @@ ssize_t wusb_prf(void *out, size_t out_size,
 		goto error_setkey_cbc;
 	}
 
-	tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	tfm_aes = crypto_alloc_cipher("aes", 0, 0);
 	if (IS_ERR(tfm_aes)) {
 		result = PTR_ERR(tfm_aes);
 		printk(KERN_ERR "E: can't load AES: %d\n", (int)result);
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
index 124e965a28b30d985f5406e039431588d96e9f53..5bf5fd08879e6b2dd0570d179c3743b646184c5b 100644
--- a/fs/ubifs/auth.c
+++ b/fs/ubifs/auth.c
@@ -269,8 +269,7 @@ int ubifs_init_authentication(struct ubifs_info *c)
 		goto out;
 	}
 
-	c->hash_tfm = crypto_alloc_shash(c->auth_hash_name, 0,
-					 CRYPTO_ALG_ASYNC);
+	c->hash_tfm = crypto_alloc_shash(c->auth_hash_name, 0, 0);
 	if (IS_ERR(c->hash_tfm)) {
 		err = PTR_ERR(c->hash_tfm);
 		ubifs_err(c, "Can not allocate %s: %d",
@@ -286,7 +285,7 @@ int ubifs_init_authentication(struct ubifs_info *c)
 		goto out_free_hash;
 	}
 
-	c->hmac_tfm = crypto_alloc_shash(hmac_name, 0, CRYPTO_ALG_ASYNC);
+	c->hmac_tfm = crypto_alloc_shash(hmac_name, 0, 0);
 	if (IS_ERR(c->hmac_tfm)) {
 		err = PTR_ERR(c->hmac_tfm);
 		ubifs_err(c, "Can not allocate %s: %d", hmac_name, err);
diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h
index 22e6f412c595a117a3ea9247cf712ce374823460..a3e766dff917f0055a275b4f8e961a67da84a2a3 100644
--- a/include/crypto/acompress.h
+++ b/include/crypto/acompress.h
@@ -234,34 +234,6 @@ static inline void acomp_request_set_params(struct acomp_req *req,
 		req->flags |= CRYPTO_ACOMP_ALLOC_OUTPUT;
 }
 
-static inline void crypto_stat_compress(struct acomp_req *req, int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->compress_err_cnt);
-	} else {
-		atomic_inc(&tfm->base.__crt_alg->compress_cnt);
-		atomic64_add(req->slen, &tfm->base.__crt_alg->compress_tlen);
-	}
-#endif
-}
-
-static inline void crypto_stat_decompress(struct acomp_req *req, int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->compress_err_cnt);
-	} else {
-		atomic_inc(&tfm->base.__crt_alg->decompress_cnt);
-		atomic64_add(req->slen, &tfm->base.__crt_alg->decompress_tlen);
-	}
-#endif
-}
-
 /**
  * crypto_acomp_compress() -- Invoke asynchronous compress operation
  *
@@ -274,10 +246,13 @@ static inline void crypto_stat_decompress(struct acomp_req *req, int ret)
 static inline int crypto_acomp_compress(struct acomp_req *req)
 {
 	struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
+	struct crypto_alg *alg = tfm->base.__crt_alg;
+	unsigned int slen = req->slen;
 	int ret;
 
+	crypto_stats_get(alg);
 	ret = tfm->compress(req);
-	crypto_stat_compress(req, ret);
+	crypto_stats_compress(slen, ret, alg);
 	return ret;
 }
 
@@ -293,10 +268,13 @@ static inline int crypto_acomp_compress(struct acomp_req *req)
 static inline int crypto_acomp_decompress(struct acomp_req *req)
 {
 	struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
+	struct crypto_alg *alg = tfm->base.__crt_alg;
+	unsigned int slen = req->slen;
 	int ret;
 
+	crypto_stats_get(alg);
 	ret = tfm->decompress(req);
-	crypto_stat_decompress(req, ret);
+	crypto_stats_decompress(slen, ret, alg);
 	return ret;
 }
 
diff --git a/include/crypto/aead.h b/include/crypto/aead.h
index 0d765d7bfb82715d381eb3c093452f1cd41572ba..9ad595f97c65aaab9d6b2c77596428ff98dbe020 100644
--- a/include/crypto/aead.h
+++ b/include/crypto/aead.h
@@ -115,7 +115,6 @@ struct aead_request {
  * @setkey: see struct skcipher_alg
  * @encrypt: see struct skcipher_alg
  * @decrypt: see struct skcipher_alg
- * @geniv: see struct skcipher_alg
  * @ivsize: see struct skcipher_alg
  * @chunksize: see struct skcipher_alg
  * @init: Initialize the cryptographic transformation object. This function
@@ -142,8 +141,6 @@ struct aead_alg {
 	int (*init)(struct crypto_aead *tfm);
 	void (*exit)(struct crypto_aead *tfm);
 
-	const char *geniv;
-
 	unsigned int ivsize;
 	unsigned int maxauthsize;
 	unsigned int chunksize;
@@ -306,34 +303,6 @@ static inline struct crypto_aead *crypto_aead_reqtfm(struct aead_request *req)
 	return __crypto_aead_cast(req->base.tfm);
 }
 
-static inline void crypto_stat_aead_encrypt(struct aead_request *req, int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->aead_err_cnt);
-	} else {
-		atomic_inc(&tfm->base.__crt_alg->encrypt_cnt);
-		atomic64_add(req->cryptlen, &tfm->base.__crt_alg->encrypt_tlen);
-	}
-#endif
-}
-
-static inline void crypto_stat_aead_decrypt(struct aead_request *req, int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->aead_err_cnt);
-	} else {
-		atomic_inc(&tfm->base.__crt_alg->decrypt_cnt);
-		atomic64_add(req->cryptlen, &tfm->base.__crt_alg->decrypt_tlen);
-	}
-#endif
-}
-
 /**
  * crypto_aead_encrypt() - encrypt plaintext
  * @req: reference to the aead_request handle that holds all information
@@ -356,13 +325,16 @@ static inline void crypto_stat_aead_decrypt(struct aead_request *req, int ret)
 static inline int crypto_aead_encrypt(struct aead_request *req)
 {
 	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct crypto_alg *alg = aead->base.__crt_alg;
+	unsigned int cryptlen = req->cryptlen;
 	int ret;
 
+	crypto_stats_get(alg);
 	if (crypto_aead_get_flags(aead) & CRYPTO_TFM_NEED_KEY)
 		ret = -ENOKEY;
 	else
 		ret = crypto_aead_alg(aead)->encrypt(req);
-	crypto_stat_aead_encrypt(req, ret);
+	crypto_stats_aead_encrypt(cryptlen, alg, ret);
 	return ret;
 }
 
@@ -391,15 +363,18 @@ static inline int crypto_aead_encrypt(struct aead_request *req)
 static inline int crypto_aead_decrypt(struct aead_request *req)
 {
 	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct crypto_alg *alg = aead->base.__crt_alg;
+	unsigned int cryptlen = req->cryptlen;
 	int ret;
 
+	crypto_stats_get(alg);
 	if (crypto_aead_get_flags(aead) & CRYPTO_TFM_NEED_KEY)
 		ret = -ENOKEY;
 	else if (req->cryptlen < crypto_aead_authsize(aead))
 		ret = -EINVAL;
 	else
 		ret = crypto_aead_alg(aead)->decrypt(req);
-	crypto_stat_aead_decrypt(req, ret);
+	crypto_stats_aead_decrypt(cryptlen, alg, ret);
 	return ret;
 }
 
diff --git a/include/crypto/akcipher.h b/include/crypto/akcipher.h
index afac711193968f6072966f9757066b350d0d0fb3..2d690494568cf8b4e70fcc93a5d324d286cf6616 100644
--- a/include/crypto/akcipher.h
+++ b/include/crypto/akcipher.h
@@ -271,62 +271,6 @@ static inline unsigned int crypto_akcipher_maxsize(struct crypto_akcipher *tfm)
 	return alg->max_size(tfm);
 }
 
-static inline void crypto_stat_akcipher_encrypt(struct akcipher_request *req,
-						int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
-	} else {
-		atomic_inc(&tfm->base.__crt_alg->encrypt_cnt);
-		atomic64_add(req->src_len, &tfm->base.__crt_alg->encrypt_tlen);
-	}
-#endif
-}
-
-static inline void crypto_stat_akcipher_decrypt(struct akcipher_request *req,
-						int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
-	} else {
-		atomic_inc(&tfm->base.__crt_alg->decrypt_cnt);
-		atomic64_add(req->src_len, &tfm->base.__crt_alg->decrypt_tlen);
-	}
-#endif
-}
-
-static inline void crypto_stat_akcipher_sign(struct akcipher_request *req,
-					     int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
-	else
-		atomic_inc(&tfm->base.__crt_alg->sign_cnt);
-#endif
-}
-
-static inline void crypto_stat_akcipher_verify(struct akcipher_request *req,
-					       int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic_inc(&tfm->base.__crt_alg->akcipher_err_cnt);
-	else
-		atomic_inc(&tfm->base.__crt_alg->verify_cnt);
-#endif
-}
-
 /**
  * crypto_akcipher_encrypt() - Invoke public key encrypt operation
  *
@@ -341,10 +285,13 @@ static inline int crypto_akcipher_encrypt(struct akcipher_request *req)
 {
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
+	struct crypto_alg *calg = tfm->base.__crt_alg;
+	unsigned int src_len = req->src_len;
 	int ret;
 
+	crypto_stats_get(calg);
 	ret = alg->encrypt(req);
-	crypto_stat_akcipher_encrypt(req, ret);
+	crypto_stats_akcipher_encrypt(src_len, ret, calg);
 	return ret;
 }
 
@@ -362,10 +309,13 @@ static inline int crypto_akcipher_decrypt(struct akcipher_request *req)
 {
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
+	struct crypto_alg *calg = tfm->base.__crt_alg;
+	unsigned int src_len = req->src_len;
 	int ret;
 
+	crypto_stats_get(calg);
 	ret = alg->decrypt(req);
-	crypto_stat_akcipher_decrypt(req, ret);
+	crypto_stats_akcipher_decrypt(src_len, ret, calg);
 	return ret;
 }
 
@@ -383,10 +333,12 @@ static inline int crypto_akcipher_sign(struct akcipher_request *req)
 {
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
+	struct crypto_alg *calg = tfm->base.__crt_alg;
 	int ret;
 
+	crypto_stats_get(calg);
 	ret = alg->sign(req);
-	crypto_stat_akcipher_sign(req, ret);
+	crypto_stats_akcipher_sign(ret, calg);
 	return ret;
 }
 
@@ -404,10 +356,12 @@ static inline int crypto_akcipher_verify(struct akcipher_request *req)
 {
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
 	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
+	struct crypto_alg *calg = tfm->base.__crt_alg;
 	int ret;
 
+	crypto_stats_get(calg);
 	ret = alg->verify(req);
-	crypto_stat_akcipher_verify(req, ret);
+	crypto_stats_akcipher_verify(ret, calg);
 	return ret;
 }
 
diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h
new file mode 100644
index 0000000000000000000000000000000000000000..1fc70a69d550886ee23fa671f39c0690804692f3
--- /dev/null
+++ b/include/crypto/chacha.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common values and helper functions for the ChaCha and XChaCha stream ciphers.
+ *
+ * XChaCha extends ChaCha's nonce to 192 bits, while provably retaining ChaCha's
+ * security.  Here they share the same key size, tfm context, and setkey
+ * function; only their IV size and encrypt/decrypt function differ.
+ *
+ * The ChaCha paper specifies 20, 12, and 8-round variants.  In general, it is
+ * recommended to use the 20-round variant ChaCha20.  However, the other
+ * variants can be needed in some performance-sensitive scenarios.  The generic
+ * ChaCha code currently allows only the 20 and 12-round variants.
+ */
+
+#ifndef _CRYPTO_CHACHA_H
+#define _CRYPTO_CHACHA_H
+
+#include <crypto/skcipher.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+
+/* 32-bit stream position, then 96-bit nonce (RFC7539 convention) */
+#define CHACHA_IV_SIZE		16
+
+#define CHACHA_KEY_SIZE		32
+#define CHACHA_BLOCK_SIZE	64
+#define CHACHAPOLY_IV_SIZE	12
+
+/* 192-bit nonce, then 64-bit stream position */
+#define XCHACHA_IV_SIZE		32
+
+struct chacha_ctx {
+	u32 key[8];
+	int nrounds;
+};
+
+void chacha_block(u32 *state, u8 *stream, int nrounds);
+static inline void chacha20_block(u32 *state, u8 *stream)
+{
+	chacha_block(state, stream, 20);
+}
+void hchacha_block(const u32 *in, u32 *out, int nrounds);
+
+void crypto_chacha_init(u32 *state, struct chacha_ctx *ctx, u8 *iv);
+
+int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
+			   unsigned int keysize);
+int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
+			   unsigned int keysize);
+
+int crypto_chacha_crypt(struct skcipher_request *req);
+int crypto_xchacha_crypt(struct skcipher_request *req);
+
+#endif /* _CRYPTO_CHACHA_H */
diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h
deleted file mode 100644
index f76302d99e2bedbdb0be35c202cd8a8b24c3a6a6..0000000000000000000000000000000000000000
--- a/include/crypto/chacha20.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Common values for the ChaCha20 algorithm
- */
-
-#ifndef _CRYPTO_CHACHA20_H
-#define _CRYPTO_CHACHA20_H
-
-#include <crypto/skcipher.h>
-#include <linux/types.h>
-#include <linux/crypto.h>
-
-#define CHACHA20_IV_SIZE	16
-#define CHACHA20_KEY_SIZE	32
-#define CHACHA20_BLOCK_SIZE	64
-
-struct chacha20_ctx {
-	u32 key[8];
-};
-
-void chacha20_block(u32 *state, u8 *stream);
-void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv);
-int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
-			   unsigned int keysize);
-int crypto_chacha20_crypt(struct skcipher_request *req);
-
-#endif
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index bc7796600338518a96ecfe5912a5e41aa571eeeb..3b31c1b349ae60158f96d4605f0dc2952cb089a6 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -412,32 +412,6 @@ static inline void *ahash_request_ctx(struct ahash_request *req)
 int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key,
 			unsigned int keylen);
 
-static inline void crypto_stat_ahash_update(struct ahash_request *req, int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic_inc(&tfm->base.__crt_alg->hash_err_cnt);
-	else
-		atomic64_add(req->nbytes, &tfm->base.__crt_alg->hash_tlen);
-#endif
-}
-
-static inline void crypto_stat_ahash_final(struct ahash_request *req, int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->hash_err_cnt);
-	} else {
-		atomic_inc(&tfm->base.__crt_alg->hash_cnt);
-		atomic64_add(req->nbytes, &tfm->base.__crt_alg->hash_tlen);
-	}
-#endif
-}
-
 /**
  * crypto_ahash_finup() - update and finalize message digest
  * @req: reference to the ahash_request handle that holds all information
@@ -552,10 +526,14 @@ static inline int crypto_ahash_init(struct ahash_request *req)
  */
 static inline int crypto_ahash_update(struct ahash_request *req)
 {
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct crypto_alg *alg = tfm->base.__crt_alg;
+	unsigned int nbytes = req->nbytes;
 	int ret;
 
+	crypto_stats_get(alg);
 	ret = crypto_ahash_reqtfm(req)->update(req);
-	crypto_stat_ahash_update(req, ret);
+	crypto_stats_ahash_update(nbytes, ret, alg);
 	return ret;
 }
 
diff --git a/include/crypto/hash_info.h b/include/crypto/hash_info.h
index 56f217d41f12e376028ac67e4617633acd76dd21..91786b68dbdbaade4908249888995c074cb99c2d 100644
--- a/include/crypto/hash_info.h
+++ b/include/crypto/hash_info.h
@@ -15,6 +15,7 @@
 
 #include <crypto/sha.h>
 #include <crypto/md5.h>
+#include <crypto/streebog.h>
 
 #include <uapi/linux/hash_info.h>
 
diff --git a/include/crypto/internal/cryptouser.h b/include/crypto/internal/cryptouser.h
index 8db299c255666c937e843e51bfad9dedc5d579a3..40623f4457df66f7971ce1aa68fdc0ee224bc570 100644
--- a/include/crypto/internal/cryptouser.h
+++ b/include/crypto/internal/cryptouser.h
@@ -3,6 +3,11 @@
 
 struct crypto_alg *crypto_alg_match(struct crypto_user_alg *p, int exact);
 
-int crypto_dump_reportstat(struct sk_buff *skb, struct netlink_callback *cb);
+#ifdef CONFIG_CRYPTO_STATS
 int crypto_reportstat(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct nlattr **attrs);
-int crypto_dump_reportstat_done(struct netlink_callback *cb);
+#else
+static inline int crypto_reportstat(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct nlattr **attrs)
+{
+	return -ENOTSUPP;
+}
+#endif
diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h
index e42f7063f245b9609671dff95a7e1db267e23b0f..453e867b4bd9277b500e7583320fc89e2cc62c97 100644
--- a/include/crypto/internal/skcipher.h
+++ b/include/crypto/internal/skcipher.h
@@ -70,8 +70,6 @@ struct skcipher_walk {
 	unsigned int alignmask;
 };
 
-extern const struct crypto_type crypto_givcipher_type;
-
 static inline struct crypto_instance *skcipher_crypto_instance(
 	struct skcipher_instance *inst)
 {
diff --git a/include/crypto/kpp.h b/include/crypto/kpp.h
index f517ba6d3a27628b6cebc9a61b36654bd2f4247f..1a97e160142228913241e43f6da77938d5cfd636 100644
--- a/include/crypto/kpp.h
+++ b/include/crypto/kpp.h
@@ -268,42 +268,6 @@ struct kpp_secret {
 	unsigned short len;
 };
 
-static inline void crypto_stat_kpp_set_secret(struct crypto_kpp *tfm, int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	if (ret)
-		atomic_inc(&tfm->base.__crt_alg->kpp_err_cnt);
-	else
-		atomic_inc(&tfm->base.__crt_alg->setsecret_cnt);
-#endif
-}
-
-static inline void crypto_stat_kpp_generate_public_key(struct kpp_request *req,
-						       int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
-
-	if (ret)
-		atomic_inc(&tfm->base.__crt_alg->kpp_err_cnt);
-	else
-		atomic_inc(&tfm->base.__crt_alg->generate_public_key_cnt);
-#endif
-}
-
-static inline void crypto_stat_kpp_compute_shared_secret(struct kpp_request *req,
-							 int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
-
-	if (ret)
-		atomic_inc(&tfm->base.__crt_alg->kpp_err_cnt);
-	else
-		atomic_inc(&tfm->base.__crt_alg->compute_shared_secret_cnt);
-#endif
-}
-
 /**
  * crypto_kpp_set_secret() - Invoke kpp operation
  *
@@ -323,10 +287,12 @@ static inline int crypto_kpp_set_secret(struct crypto_kpp *tfm,
 					const void *buffer, unsigned int len)
 {
 	struct kpp_alg *alg = crypto_kpp_alg(tfm);
+	struct crypto_alg *calg = tfm->base.__crt_alg;
 	int ret;
 
+	crypto_stats_get(calg);
 	ret = alg->set_secret(tfm, buffer, len);
-	crypto_stat_kpp_set_secret(tfm, ret);
+	crypto_stats_kpp_set_secret(calg, ret);
 	return ret;
 }
 
@@ -347,10 +313,12 @@ static inline int crypto_kpp_generate_public_key(struct kpp_request *req)
 {
 	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
 	struct kpp_alg *alg = crypto_kpp_alg(tfm);
+	struct crypto_alg *calg = tfm->base.__crt_alg;
 	int ret;
 
+	crypto_stats_get(calg);
 	ret = alg->generate_public_key(req);
-	crypto_stat_kpp_generate_public_key(req, ret);
+	crypto_stats_kpp_generate_public_key(calg, ret);
 	return ret;
 }
 
@@ -368,10 +336,12 @@ static inline int crypto_kpp_compute_shared_secret(struct kpp_request *req)
 {
 	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
 	struct kpp_alg *alg = crypto_kpp_alg(tfm);
+	struct crypto_alg *calg = tfm->base.__crt_alg;
 	int ret;
 
+	crypto_stats_get(calg);
 	ret = alg->compute_shared_secret(req);
-	crypto_stat_kpp_compute_shared_secret(req, ret);
+	crypto_stats_kpp_compute_shared_secret(calg, ret);
 	return ret;
 }
 
diff --git a/include/crypto/nhpoly1305.h b/include/crypto/nhpoly1305.h
new file mode 100644
index 0000000000000000000000000000000000000000..53c04423c582e9fb82018c2bcd95cd756f812538
--- /dev/null
+++ b/include/crypto/nhpoly1305.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common values and helper functions for the NHPoly1305 hash function.
+ */
+
+#ifndef _NHPOLY1305_H
+#define _NHPOLY1305_H
+
+#include <crypto/hash.h>
+#include <crypto/poly1305.h>
+
+/* NH parameterization: */
+
+/* Endianness: little */
+/* Word size: 32 bits (works well on NEON, SSE2, AVX2) */
+
+/* Stride: 2 words (optimal on ARM32 NEON; works okay on other CPUs too) */
+#define NH_PAIR_STRIDE		2
+#define NH_MESSAGE_UNIT		(NH_PAIR_STRIDE * 2 * sizeof(u32))
+
+/* Num passes (Toeplitz iteration count): 4, to give ε = 2^{-128} */
+#define NH_NUM_PASSES		4
+#define NH_HASH_BYTES		(NH_NUM_PASSES * sizeof(u64))
+
+/* Max message size: 1024 bytes (32x compression factor) */
+#define NH_NUM_STRIDES		64
+#define NH_MESSAGE_WORDS	(NH_PAIR_STRIDE * 2 * NH_NUM_STRIDES)
+#define NH_MESSAGE_BYTES	(NH_MESSAGE_WORDS * sizeof(u32))
+#define NH_KEY_WORDS		(NH_MESSAGE_WORDS + \
+				 NH_PAIR_STRIDE * 2 * (NH_NUM_PASSES - 1))
+#define NH_KEY_BYTES		(NH_KEY_WORDS * sizeof(u32))
+
+#define NHPOLY1305_KEY_SIZE	(POLY1305_BLOCK_SIZE + NH_KEY_BYTES)
+
+struct nhpoly1305_key {
+	struct poly1305_key poly_key;
+	u32 nh_key[NH_KEY_WORDS];
+};
+
+struct nhpoly1305_state {
+
+	/* Running total of polynomial evaluation */
+	struct poly1305_state poly_state;
+
+	/* Partial block buffer */
+	u8 buffer[NH_MESSAGE_UNIT];
+	unsigned int buflen;
+
+	/*
+	 * Number of bytes remaining until the current NH message reaches
+	 * NH_MESSAGE_BYTES.  When nonzero, 'nh_hash' holds the partial NH hash.
+	 */
+	unsigned int nh_remaining;
+
+	__le64 nh_hash[NH_NUM_PASSES];
+};
+
+typedef void (*nh_t)(const u32 *key, const u8 *message, size_t message_len,
+		     __le64 hash[NH_NUM_PASSES]);
+
+int crypto_nhpoly1305_setkey(struct crypto_shash *tfm,
+			     const u8 *key, unsigned int keylen);
+
+int crypto_nhpoly1305_init(struct shash_desc *desc);
+int crypto_nhpoly1305_update(struct shash_desc *desc,
+			     const u8 *src, unsigned int srclen);
+int crypto_nhpoly1305_update_helper(struct shash_desc *desc,
+				    const u8 *src, unsigned int srclen,
+				    nh_t nh_fn);
+int crypto_nhpoly1305_final(struct shash_desc *desc, u8 *dst);
+int crypto_nhpoly1305_final_helper(struct shash_desc *desc, u8 *dst,
+				   nh_t nh_fn);
+
+#endif /* _NHPOLY1305_H */
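The header above only declares the NHPoly1305 building blocks. As a rough usage sketch (not part of the patch), a consumer could drive the resulting shash through the regular hash API; the algorithm name "nhpoly1305" and the helper name below are assumptions for illustration only:

#include <crypto/hash.h>
#include <crypto/nhpoly1305.h>
#include <linux/err.h>

/* Hypothetical helper (illustration only): digest one buffer with NHPoly1305. */
static int nhpoly1305_digest_example(const u8 key[NHPOLY1305_KEY_SIZE],
				     const u8 *data, unsigned int len,
				     u8 out[POLY1305_DIGEST_SIZE])
{
	struct crypto_shash *tfm;
	int err;

	tfm = crypto_alloc_shash("nhpoly1305", 0, 0);	/* assumed driver name */
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_shash_setkey(tfm, key, NHPOLY1305_KEY_SIZE);
	if (!err) {
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		desc->flags = 0;
		err = crypto_shash_digest(desc, data, len, out);
		shash_desc_zero(desc);
	}

	crypto_free_shash(tfm);
	return err;
}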
diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h
index f718a19da82f75d525b2da1f818b7230af197a3e..34317ed2071e6fadbfd04979ac796d329f2637fb 100644
--- a/include/crypto/poly1305.h
+++ b/include/crypto/poly1305.h
@@ -13,13 +13,21 @@
 #define POLY1305_KEY_SIZE	32
 #define POLY1305_DIGEST_SIZE	16
 
+struct poly1305_key {
+	u32 r[5];	/* key, base 2^26 */
+};
+
+struct poly1305_state {
+	u32 h[5];	/* accumulator, base 2^26 */
+};
+
 struct poly1305_desc_ctx {
 	/* key */
-	u32 r[5];
+	struct poly1305_key r;
 	/* finalize key */
 	u32 s[4];
 	/* accumulator */
-	u32 h[5];
+	struct poly1305_state h;
 	/* partial buffer */
 	u8 buf[POLY1305_BLOCK_SIZE];
 	/* bytes used in partial buffer */
@@ -30,6 +38,22 @@ struct poly1305_desc_ctx {
 	bool sset;
 };
 
+/*
+ * Poly1305 core functions.  These implement the ε-almost-∆-universal hash
+ * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce
+ * ("s key") at the end.  They also only support block-aligned inputs.
+ */
+void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key);
+static inline void poly1305_core_init(struct poly1305_state *state)
+{
+	memset(state->h, 0, sizeof(state->h));
+}
+void poly1305_core_blocks(struct poly1305_state *state,
+			  const struct poly1305_key *key,
+			  const void *src, unsigned int nblocks);
+void poly1305_core_emit(const struct poly1305_state *state, void *dst);
+
+/* Crypto API helper functions for the Poly1305 MAC */
 int crypto_poly1305_init(struct shash_desc *desc);
 unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
 					const u8 *src, unsigned int srclen);
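As a minimal sketch of how the new Poly1305 core functions compose (assuming block-aligned input; the helper name is invented for illustration), the polynomial part of Poly1305 can be evaluated as follows:

#include <crypto/poly1305.h>

/* Sketch only: evaluate the Poly1305 polynomial over nblocks 16-byte blocks. */
static void poly1305_core_digest_example(const u8 raw_r[POLY1305_BLOCK_SIZE],
					 const u8 *src, unsigned int nblocks,
					 u8 digest[POLY1305_DIGEST_SIZE])
{
	struct poly1305_key key;
	struct poly1305_state state;

	poly1305_core_setkey(&key, raw_r);		/* clamp r, convert to base 2^26 */
	poly1305_core_init(&state);			/* accumulator h = 0 */
	poly1305_core_blocks(&state, &key, src, nblocks);
	poly1305_core_emit(&state, digest);		/* no "s" added; the MAC step is up to the caller */
}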
diff --git a/include/crypto/rng.h b/include/crypto/rng.h
index 6d258f5b68f1f91caef13757f7b24129c8aa44d3..022a1b896b475c2430558345c033d18d2d191c32 100644
--- a/include/crypto/rng.h
+++ b/include/crypto/rng.h
@@ -122,29 +122,6 @@ static inline void crypto_free_rng(struct crypto_rng *tfm)
 	crypto_destroy_tfm(tfm, crypto_rng_tfm(tfm));
 }
 
-static inline void crypto_stat_rng_seed(struct crypto_rng *tfm, int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY)
-		atomic_inc(&tfm->base.__crt_alg->rng_err_cnt);
-	else
-		atomic_inc(&tfm->base.__crt_alg->seed_cnt);
-#endif
-}
-
-static inline void crypto_stat_rng_generate(struct crypto_rng *tfm,
-					    unsigned int dlen, int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&tfm->base.__crt_alg->rng_err_cnt);
-	} else {
-		atomic_inc(&tfm->base.__crt_alg->generate_cnt);
-		atomic64_add(dlen, &tfm->base.__crt_alg->generate_tlen);
-	}
-#endif
-}
-
 /**
  * crypto_rng_generate() - get random number
  * @tfm: cipher handle
@@ -163,10 +140,12 @@ static inline int crypto_rng_generate(struct crypto_rng *tfm,
 				      const u8 *src, unsigned int slen,
 				      u8 *dst, unsigned int dlen)
 {
+	struct crypto_alg *alg = tfm->base.__crt_alg;
 	int ret;
 
+	crypto_stats_get(alg);
 	ret = crypto_rng_alg(tfm)->generate(tfm, src, slen, dst, dlen);
-	crypto_stat_rng_generate(tfm, dlen, ret);
+	crypto_stats_rng_generate(alg, dlen, ret);
 	return ret;
 }
 
diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h
index 925f547cdcfa2c63bfa58dcbba614b989c97bbdc..e555294ed77fe2f6ac87fbbbbdf4771b35b59d4d 100644
--- a/include/crypto/skcipher.h
+++ b/include/crypto/skcipher.h
@@ -39,19 +39,6 @@ struct skcipher_request {
 	void *__ctx[] CRYPTO_MINALIGN_ATTR;
 };
 
-/**
- *	struct skcipher_givcrypt_request - Crypto request with IV generation
- *	@seq: Sequence number for IV generation
- *	@giv: Space for generated IV
- *	@creq: The crypto request itself
- */
-struct skcipher_givcrypt_request {
-	u64 seq;
-	u8 *giv;
-
-	struct ablkcipher_request creq;
-};
-
 struct crypto_skcipher {
 	int (*setkey)(struct crypto_skcipher *tfm, const u8 *key,
 	              unsigned int keylen);
@@ -486,32 +473,6 @@ static inline struct crypto_sync_skcipher *crypto_sync_skcipher_reqtfm(
 	return container_of(tfm, struct crypto_sync_skcipher, base);
 }
 
-static inline void crypto_stat_skcipher_encrypt(struct skcipher_request *req,
-						int ret, struct crypto_alg *alg)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&alg->cipher_err_cnt);
-	} else {
-		atomic_inc(&alg->encrypt_cnt);
-		atomic64_add(req->cryptlen, &alg->encrypt_tlen);
-	}
-#endif
-}
-
-static inline void crypto_stat_skcipher_decrypt(struct skcipher_request *req,
-						int ret, struct crypto_alg *alg)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&alg->cipher_err_cnt);
-	} else {
-		atomic_inc(&alg->decrypt_cnt);
-		atomic64_add(req->cryptlen, &alg->decrypt_tlen);
-	}
-#endif
-}
-
 /**
  * crypto_skcipher_encrypt() - encrypt plaintext
  * @req: reference to the skcipher_request handle that holds all information
@@ -526,13 +487,16 @@ static inline void crypto_stat_skcipher_decrypt(struct skcipher_request *req,
 static inline int crypto_skcipher_encrypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct crypto_alg *alg = tfm->base.__crt_alg;
+	unsigned int cryptlen = req->cryptlen;
 	int ret;
 
+	crypto_stats_get(alg);
 	if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
 		ret = -ENOKEY;
 	else
 		ret = tfm->encrypt(req);
-	crypto_stat_skcipher_encrypt(req, ret, tfm->base.__crt_alg);
+	crypto_stats_skcipher_encrypt(cryptlen, ret, alg);
 	return ret;
 }
 
@@ -550,13 +514,16 @@ static inline int crypto_skcipher_encrypt(struct skcipher_request *req)
 static inline int crypto_skcipher_decrypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct crypto_alg *alg = tfm->base.__crt_alg;
+	unsigned int cryptlen = req->cryptlen;
 	int ret;
 
+	crypto_stats_get(alg);
 	if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
 		ret = -ENOKEY;
 	else
 		ret = tfm->decrypt(req);
-	crypto_stat_skcipher_decrypt(req, ret, tfm->base.__crt_alg);
+	crypto_stats_skcipher_decrypt(cryptlen, ret, alg);
 	return ret;
 }
 
diff --git a/include/crypto/streebog.h b/include/crypto/streebog.h
new file mode 100644
index 0000000000000000000000000000000000000000..4af119f7e07b93594ab47a9c0602171a1bd7c17d
--- /dev/null
+++ b/include/crypto/streebog.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-2-Clause */
+/*
+ * Copyright (c) 2013 Alexey Degtyarev <alexey@renatasystems.org>
+ * Copyright (c) 2018 Vitaly Chikunov <vt@altlinux.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#ifndef _CRYPTO_STREEBOG_H_
+#define _CRYPTO_STREEBOG_H_
+
+#include <linux/types.h>
+
+#define STREEBOG256_DIGEST_SIZE	32
+#define STREEBOG512_DIGEST_SIZE	64
+#define STREEBOG_BLOCK_SIZE	64
+
+struct streebog_uint512 {
+	u64 qword[8];
+};
+
+struct streebog_state {
+	u8 buffer[STREEBOG_BLOCK_SIZE];
+	struct streebog_uint512 hash;
+	struct streebog_uint512 h;
+	struct streebog_uint512 N;
+	struct streebog_uint512 Sigma;
+	size_t fillsize;
+};
+
+#endif /* !_CRYPTO_STREEBOG_H_ */
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 3634ad6fe2021e81d6743b6056c95cb86a0925ef..902ec171fc6db55c3babc1eaea78c3b0c1d7dc30 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -49,7 +49,6 @@
 #define CRYPTO_ALG_TYPE_BLKCIPHER	0x00000004
 #define CRYPTO_ALG_TYPE_ABLKCIPHER	0x00000005
 #define CRYPTO_ALG_TYPE_SKCIPHER	0x00000005
-#define CRYPTO_ALG_TYPE_GIVCIPHER	0x00000006
 #define CRYPTO_ALG_TYPE_KPP		0x00000008
 #define CRYPTO_ALG_TYPE_ACOMPRESS	0x0000000a
 #define CRYPTO_ALG_TYPE_SCOMPRESS	0x0000000b
@@ -76,12 +75,6 @@
  */
 #define CRYPTO_ALG_NEED_FALLBACK	0x00000100
 
-/*
- * This bit is set for symmetric key ciphers that have already been wrapped
- * with a generic IV generator to prevent them from being wrapped again.
- */
-#define CRYPTO_ALG_GENIV		0x00000200
-
 /*
  * Set if the algorithm has passed automated run-time testing.  Note that
  * if there is no run-time testing for a given algorithm it is considered
@@ -157,7 +150,6 @@ struct crypto_async_request;
 struct crypto_blkcipher;
 struct crypto_tfm;
 struct crypto_type;
-struct skcipher_givcrypt_request;
 
 typedef void (*crypto_completion_t)(struct crypto_async_request *req, int err);
 
@@ -246,31 +238,16 @@ struct cipher_desc {
  *	     be called in parallel with the same transformation object.
  * @decrypt: Decrypt a single block. This is a reverse counterpart to @encrypt
  *	     and the conditions are exactly the same.
- * @givencrypt: Update the IV for encryption. With this function, a cipher
- *	        implementation may provide the function on how to update the IV
- *	        for encryption.
- * @givdecrypt: Update the IV for decryption. This is the reverse of
- *	        @givencrypt .
- * @geniv: The transformation implementation may use an "IV generator" provided
- *	   by the kernel crypto API. Several use cases have a predefined
- *	   approach how IVs are to be updated. For such use cases, the kernel
- *	   crypto API provides ready-to-use implementations that can be
- *	   referenced with this variable.
  * @ivsize: IV size applicable for transformation. The consumer must provide an
  *	    IV of exactly that size to perform the encrypt or decrypt operation.
  *
- * All fields except @givencrypt , @givdecrypt , @geniv and @ivsize are
- * mandatory and must be filled.
+ * All fields except @ivsize are mandatory and must be filled.
  */
 struct ablkcipher_alg {
 	int (*setkey)(struct crypto_ablkcipher *tfm, const u8 *key,
 	              unsigned int keylen);
 	int (*encrypt)(struct ablkcipher_request *req);
 	int (*decrypt)(struct ablkcipher_request *req);
-	int (*givencrypt)(struct skcipher_givcrypt_request *req);
-	int (*givdecrypt)(struct skcipher_givcrypt_request *req);
-
-	const char *geniv;
 
 	unsigned int min_keysize;
 	unsigned int max_keysize;
@@ -284,10 +261,9 @@ struct ablkcipher_alg {
  * @setkey: see struct ablkcipher_alg
  * @encrypt: see struct ablkcipher_alg
  * @decrypt: see struct ablkcipher_alg
- * @geniv: see struct ablkcipher_alg
  * @ivsize: see struct ablkcipher_alg
  *
- * All fields except @geniv and @ivsize are mandatory and must be filled.
+ * All fields except @ivsize are mandatory and must be filled.
  */
 struct blkcipher_alg {
 	int (*setkey)(struct crypto_tfm *tfm, const u8 *key,
@@ -299,8 +275,6 @@ struct blkcipher_alg {
 		       struct scatterlist *dst, struct scatterlist *src,
 		       unsigned int nbytes);
 
-	const char *geniv;
-
 	unsigned int min_keysize;
 	unsigned int max_keysize;
 	unsigned int ivsize;
@@ -369,6 +343,115 @@ struct compress_alg {
 			      unsigned int slen, u8 *dst, unsigned int *dlen);
 };
 
+#ifdef CONFIG_CRYPTO_STATS
+/*
+ * struct crypto_istat_aead - statistics for AEAD algorithm
+ * @encrypt_cnt:	number of encrypt requests
+ * @encrypt_tlen:	total data size handled by encrypt requests
+ * @decrypt_cnt:	number of decrypt requests
+ * @decrypt_tlen:	total data size handled by decrypt requests
+ * @err_cnt:		number of errors for AEAD requests
+ */
+struct crypto_istat_aead {
+	atomic64_t encrypt_cnt;
+	atomic64_t encrypt_tlen;
+	atomic64_t decrypt_cnt;
+	atomic64_t decrypt_tlen;
+	atomic64_t err_cnt;
+};
+
+/*
+ * struct crypto_istat_akcipher - statistics for akcipher algorithm
+ * @encrypt_cnt:	number of encrypt requests
+ * @encrypt_tlen:	total data size handled by encrypt requests
+ * @decrypt_cnt:	number of decrypt requests
+ * @decrypt_tlen:	total data size handled by decrypt requests
+ * @verify_cnt:		number of verify operations
+ * @sign_cnt:		number of sign requests
+ * @err_cnt:		number of errors for akcipher requests
+ */
+struct crypto_istat_akcipher {
+	atomic64_t encrypt_cnt;
+	atomic64_t encrypt_tlen;
+	atomic64_t decrypt_cnt;
+	atomic64_t decrypt_tlen;
+	atomic64_t verify_cnt;
+	atomic64_t sign_cnt;
+	atomic64_t err_cnt;
+};
+
+/*
+ * struct crypto_istat_cipher - statistics for cipher algorithm
+ * @encrypt_cnt:	number of encrypt requests
+ * @encrypt_tlen:	total data size handled by encrypt requests
+ * @decrypt_cnt:	number of decrypt requests
+ * @decrypt_tlen:	total data size handled by decrypt requests
+ * @err_cnt:		number of errors for cipher requests
+ */
+struct crypto_istat_cipher {
+	atomic64_t encrypt_cnt;
+	atomic64_t encrypt_tlen;
+	atomic64_t decrypt_cnt;
+	atomic64_t decrypt_tlen;
+	atomic64_t err_cnt;
+};
+
+/*
+ * struct crypto_istat_compress - statistics for compress algorithm
+ * @compress_cnt:	number of compress requests
+ * @compress_tlen:	total data size handled by compress requests
+ * @decompress_cnt:	number of decompress requests
+ * @decompress_tlen:	total data size handled by decompress requests
+ * @err_cnt:		number of errors for compress requests
+ */
+struct crypto_istat_compress {
+	atomic64_t compress_cnt;
+	atomic64_t compress_tlen;
+	atomic64_t decompress_cnt;
+	atomic64_t decompress_tlen;
+	atomic64_t err_cnt;
+};
+
+/*
+ * struct crypto_istat_hash - statistics for hash algorithm
+ * @hash_cnt:		number of hash requests
+ * @hash_tlen:		total data size hashed
+ * @err_cnt:		number of errors for hash requests
+ */
+struct crypto_istat_hash {
+	atomic64_t hash_cnt;
+	atomic64_t hash_tlen;
+	atomic64_t err_cnt;
+};
+
+/*
+ * struct crypto_istat_kpp - statistics for KPP algorithm
+ * @setsecret_cnt:		number of setsecret operations
+ * @generate_public_key_cnt:	number of generate_public_key operations
+ * @compute_shared_secret_cnt:	number of compute_shared_secret operations
+ * @err_cnt:			number of errors for KPP requests
+ */
+struct crypto_istat_kpp {
+	atomic64_t setsecret_cnt;
+	atomic64_t generate_public_key_cnt;
+	atomic64_t compute_shared_secret_cnt;
+	atomic64_t err_cnt;
+};
+
+/*
+ * struct crypto_istat_rng - statistics for RNG algorithm
+ * @generate_cnt:	number of RNG generate requests
+ * @generate_tlen:	total data size of generated data by the RNG
+ * @seed_cnt:		number of times the RNG was seeded
+ * @err_cnt:		number of errors for RNG requests
+ */
+struct crypto_istat_rng {
+	atomic64_t generate_cnt;
+	atomic64_t generate_tlen;
+	atomic64_t seed_cnt;
+	atomic64_t err_cnt;
+};
+#endif /* CONFIG_CRYPTO_STATS */
 
 #define cra_ablkcipher	cra_u.ablkcipher
 #define cra_blkcipher	cra_u.blkcipher
@@ -454,32 +537,14 @@ struct compress_alg {
  * @cra_refcnt: internally used
  * @cra_destroy: internally used
  *
- * All following statistics are for this crypto_alg
- * @encrypt_cnt:	number of encrypt requests
- * @decrypt_cnt:	number of decrypt requests
- * @compress_cnt:	number of compress requests
- * @decompress_cnt:	number of decompress requests
- * @generate_cnt:	number of RNG generate requests
- * @seed_cnt:		number of times the rng was seeded
- * @hash_cnt:		number of hash requests
- * @sign_cnt:		number of sign requests
- * @setsecret_cnt:	number of setsecrey operation
- * @generate_public_key_cnt:	number of generate_public_key operation
- * @verify_cnt:			number of verify operation
- * @compute_shared_secret_cnt:	number of compute_shared_secret operation
- * @encrypt_tlen:	total data size handled by encrypt requests
- * @decrypt_tlen:	total data size handled by decrypt requests
- * @compress_tlen:	total data size handled by compress requests
- * @decompress_tlen:	total data size handled by decompress requests
- * @generate_tlen:	total data size of generated data by the RNG
- * @hash_tlen:		total data size hashed
- * @akcipher_err_cnt:	number of error for akcipher requests
- * @cipher_err_cnt:	number of error for akcipher requests
- * @compress_err_cnt:	number of error for akcipher requests
- * @aead_err_cnt:	number of error for akcipher requests
- * @hash_err_cnt:	number of error for akcipher requests
- * @rng_err_cnt:	number of error for akcipher requests
- * @kpp_err_cnt:	number of error for akcipher requests
+ * @stats: union of all possible crypto_istat_xxx structures
+ * @stats.aead:		statistics for AEAD algorithm
+ * @stats.akcipher:	statistics for akcipher algorithm
+ * @stats.cipher:	statistics for cipher algorithm
+ * @stats.compress:	statistics for compress algorithm
+ * @stats.hash:		statistics for hash algorithm
+ * @stats.rng:		statistics for rng algorithm
+ * @stats.kpp:		statistics for KPP algorithm
  *
  * The struct crypto_alg describes a generic Crypto API algorithm and is common
  * for all of the transformations. Any variable not documented here shall not
@@ -515,46 +580,86 @@ struct crypto_alg {
 	
 	struct module *cra_module;
 
+#ifdef CONFIG_CRYPTO_STATS
 	union {
-		atomic_t encrypt_cnt;
-		atomic_t compress_cnt;
-		atomic_t generate_cnt;
-		atomic_t hash_cnt;
-		atomic_t setsecret_cnt;
-	};
-	union {
-		atomic64_t encrypt_tlen;
-		atomic64_t compress_tlen;
-		atomic64_t generate_tlen;
-		atomic64_t hash_tlen;
-	};
-	union {
-		atomic_t akcipher_err_cnt;
-		atomic_t cipher_err_cnt;
-		atomic_t compress_err_cnt;
-		atomic_t aead_err_cnt;
-		atomic_t hash_err_cnt;
-		atomic_t rng_err_cnt;
-		atomic_t kpp_err_cnt;
-	};
-	union {
-		atomic_t decrypt_cnt;
-		atomic_t decompress_cnt;
-		atomic_t seed_cnt;
-		atomic_t generate_public_key_cnt;
-	};
-	union {
-		atomic64_t decrypt_tlen;
-		atomic64_t decompress_tlen;
-	};
-	union {
-		atomic_t verify_cnt;
-		atomic_t compute_shared_secret_cnt;
-	};
-	atomic_t sign_cnt;
+		struct crypto_istat_aead aead;
+		struct crypto_istat_akcipher akcipher;
+		struct crypto_istat_cipher cipher;
+		struct crypto_istat_compress compress;
+		struct crypto_istat_hash hash;
+		struct crypto_istat_rng rng;
+		struct crypto_istat_kpp kpp;
+	} stats;
+#endif /* CONFIG_CRYPTO_STATS */
 
 } CRYPTO_MINALIGN_ATTR;
 
+#ifdef CONFIG_CRYPTO_STATS
+void crypto_stats_init(struct crypto_alg *alg);
+void crypto_stats_get(struct crypto_alg *alg);
+void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret, struct crypto_alg *alg);
+void crypto_stats_ablkcipher_decrypt(unsigned int nbytes, int ret, struct crypto_alg *alg);
+void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret);
+void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret);
+void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg);
+void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg);
+void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret, struct crypto_alg *alg);
+void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret, struct crypto_alg *alg);
+void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg);
+void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg);
+void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg);
+void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg);
+void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret);
+void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret);
+void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret);
+void crypto_stats_rng_seed(struct crypto_alg *alg, int ret);
+void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret);
+void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg);
+void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg);
+#else
+static inline void crypto_stats_init(struct crypto_alg *alg)
+{}
+static inline void crypto_stats_get(struct crypto_alg *alg)
+{}
+static inline void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_ablkcipher_decrypt(unsigned int nbytes, int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret)
+{}
+static inline void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret)
+{}
+static inline void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret)
+{}
+static inline void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret)
+{}
+static inline void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret)
+{}
+static inline void crypto_stats_rng_seed(struct crypto_alg *alg, int ret)
+{}
+static inline void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret)
+{}
+static inline void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg)
+{}
+static inline void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg)
+{}
+#endif
 /*
  * A helper struct for waiting for completion of async crypto ops
  */
@@ -800,14 +905,14 @@ static inline struct crypto_ablkcipher *__crypto_ablkcipher_cast(
 
 static inline u32 crypto_skcipher_type(u32 type)
 {
-	type &= ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV);
+	type &= ~CRYPTO_ALG_TYPE_MASK;
 	type |= CRYPTO_ALG_TYPE_BLKCIPHER;
 	return type;
 }
 
 static inline u32 crypto_skcipher_mask(u32 mask)
 {
-	mask &= ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV);
+	mask &= ~CRYPTO_ALG_TYPE_MASK;
 	mask |= CRYPTO_ALG_TYPE_BLKCIPHER_MASK;
 	return mask;
 }
@@ -973,38 +1078,6 @@ static inline struct crypto_ablkcipher *crypto_ablkcipher_reqtfm(
 	return __crypto_ablkcipher_cast(req->base.tfm);
 }
 
-static inline void crypto_stat_ablkcipher_encrypt(struct ablkcipher_request *req,
-						  int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct ablkcipher_tfm *crt =
-		crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req));
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&crt->base->base.__crt_alg->cipher_err_cnt);
-	} else {
-		atomic_inc(&crt->base->base.__crt_alg->encrypt_cnt);
-		atomic64_add(req->nbytes, &crt->base->base.__crt_alg->encrypt_tlen);
-	}
-#endif
-}
-
-static inline void crypto_stat_ablkcipher_decrypt(struct ablkcipher_request *req,
-						  int ret)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	struct ablkcipher_tfm *crt =
-		crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req));
-
-	if (ret && ret != -EINPROGRESS && ret != -EBUSY) {
-		atomic_inc(&crt->base->base.__crt_alg->cipher_err_cnt);
-	} else {
-		atomic_inc(&crt->base->base.__crt_alg->decrypt_cnt);
-		atomic64_add(req->nbytes, &crt->base->base.__crt_alg->decrypt_tlen);
-	}
-#endif
-}
-
 /**
  * crypto_ablkcipher_encrypt() - encrypt plaintext
  * @req: reference to the ablkcipher_request handle that holds all information
@@ -1020,10 +1093,13 @@ static inline int crypto_ablkcipher_encrypt(struct ablkcipher_request *req)
 {
 	struct ablkcipher_tfm *crt =
 		crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req));
+	struct crypto_alg *alg = crt->base->base.__crt_alg;
+	unsigned int nbytes = req->nbytes;
 	int ret;
 
+	crypto_stats_get(alg);
 	ret = crt->encrypt(req);
-	crypto_stat_ablkcipher_encrypt(req, ret);
+	crypto_stats_ablkcipher_encrypt(nbytes, ret, alg);
 	return ret;
 }
 
@@ -1042,10 +1118,13 @@ static inline int crypto_ablkcipher_decrypt(struct ablkcipher_request *req)
 {
 	struct ablkcipher_tfm *crt =
 		crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req));
+	struct crypto_alg *alg = crt->base->base.__crt_alg;
+	unsigned int nbytes = req->nbytes;
 	int ret;
 
+	crypto_stats_get(alg);
 	ret = crt->decrypt(req);
-	crypto_stat_ablkcipher_decrypt(req, ret);
+	crypto_stats_ablkcipher_decrypt(nbytes, ret, alg);
 	return ret;
 }
 
diff --git a/include/uapi/linux/cryptouser.h b/include/uapi/linux/cryptouser.h
index 6dafbc3e441461c9a07a5cd70cde9aa42ca9cf52..4dc1603919ce85ee1536bf7cdaa2769f9d45018a 100644
--- a/include/uapi/linux/cryptouser.h
+++ b/include/uapi/linux/cryptouser.h
@@ -76,45 +76,69 @@ struct crypto_user_alg {
 	__u32 cru_flags;
 };
 
-struct crypto_stat {
-	char type[CRYPTO_MAX_NAME];
-	union {
-		__u32 stat_encrypt_cnt;
-		__u32 stat_compress_cnt;
-		__u32 stat_generate_cnt;
-		__u32 stat_hash_cnt;
-		__u32 stat_setsecret_cnt;
-	};
-	union {
-		__u64 stat_encrypt_tlen;
-		__u64 stat_compress_tlen;
-		__u64 stat_generate_tlen;
-		__u64 stat_hash_tlen;
-	};
-	union {
-		__u32 stat_akcipher_err_cnt;
-		__u32 stat_cipher_err_cnt;
-		__u32 stat_compress_err_cnt;
-		__u32 stat_aead_err_cnt;
-		__u32 stat_hash_err_cnt;
-		__u32 stat_rng_err_cnt;
-		__u32 stat_kpp_err_cnt;
-	};
-	union {
-		__u32 stat_decrypt_cnt;
-		__u32 stat_decompress_cnt;
-		__u32 stat_seed_cnt;
-		__u32 stat_generate_public_key_cnt;
-	};
-	union {
-		__u64 stat_decrypt_tlen;
-		__u64 stat_decompress_tlen;
-	};
-	union {
-		__u32 stat_verify_cnt;
-		__u32 stat_compute_shared_secret_cnt;
-	};
-	__u32 stat_sign_cnt;
+struct crypto_stat_aead {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_encrypt_cnt;
+	__u64 stat_encrypt_tlen;
+	__u64 stat_decrypt_cnt;
+	__u64 stat_decrypt_tlen;
+	__u64 stat_err_cnt;
+};
+
+struct crypto_stat_akcipher {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_encrypt_cnt;
+	__u64 stat_encrypt_tlen;
+	__u64 stat_decrypt_cnt;
+	__u64 stat_decrypt_tlen;
+	__u64 stat_verify_cnt;
+	__u64 stat_sign_cnt;
+	__u64 stat_err_cnt;
+};
+
+struct crypto_stat_cipher {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_encrypt_cnt;
+	__u64 stat_encrypt_tlen;
+	__u64 stat_decrypt_cnt;
+	__u64 stat_decrypt_tlen;
+	__u64 stat_err_cnt;
+};
+
+struct crypto_stat_compress {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_compress_cnt;
+	__u64 stat_compress_tlen;
+	__u64 stat_decompress_cnt;
+	__u64 stat_decompress_tlen;
+	__u64 stat_err_cnt;
+};
+
+struct crypto_stat_hash {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_hash_cnt;
+	__u64 stat_hash_tlen;
+	__u64 stat_err_cnt;
+};
+
+struct crypto_stat_kpp {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_setsecret_cnt;
+	__u64 stat_generate_public_key_cnt;
+	__u64 stat_compute_shared_secret_cnt;
+	__u64 stat_err_cnt;
+};
+
+struct crypto_stat_rng {
+	char type[CRYPTO_MAX_NAME];
+	__u64 stat_generate_cnt;
+	__u64 stat_generate_tlen;
+	__u64 stat_seed_cnt;
+	__u64 stat_err_cnt;
+};
+
+struct crypto_stat_larval {
+	char type[CRYPTO_MAX_NAME];
 };
 
 struct crypto_report_larval {
diff --git a/include/uapi/linux/hash_info.h b/include/uapi/linux/hash_info.h
index eea5d02c58dead0966ff088ec9cfc02cd25165a7..74a8609fcb4d38e3ac23a1c389f69d6f903bdb27 100644
--- a/include/uapi/linux/hash_info.h
+++ b/include/uapi/linux/hash_info.h
@@ -33,6 +33,8 @@ enum hash_algo {
 	HASH_ALGO_TGR_160,
 	HASH_ALGO_TGR_192,
 	HASH_ALGO_SM3_256,
+	HASH_ALGO_STREEBOG_256,
+	HASH_ALGO_STREEBOG_512,
 	HASH_ALGO__LAST
 };
 
diff --git a/kernel/padata.c b/kernel/padata.c
index d568cc56405f8835d499df7b6555a406e99c9e96..3e2633ae3bca1ef1c3ec8f5739a94a2ee3b7ca6b 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -720,7 +720,7 @@ int padata_start(struct padata_instance *pinst)
 	if (pinst->flags & PADATA_INVALID)
 		err = -EINVAL;
 
-	 __padata_start(pinst);
+	__padata_start(pinst);
 
 	mutex_unlock(&pinst->lock);
 
diff --git a/lib/Makefile b/lib/Makefile
index f5262d30bfe66f94ca8cf0b4d7fa4c20184cfb2a..e1b59da714186e3be67706a8c010c6253ea9500c 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -20,7 +20,7 @@ KCOV_INSTRUMENT_dynamic_debug.o := n
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o timerqueue.o xarray.o \
 	 idr.o int_sqrt.o extable.o \
-	 sha1.o chacha20.o irq_regs.o argv_split.o \
+	 sha1.o chacha.o irq_regs.o argv_split.o \
 	 flex_proportions.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
 	 earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
diff --git a/lib/chacha20.c b/lib/chacha.c
similarity index 58%
rename from lib/chacha20.c
rename to lib/chacha.c
index d907fec6a9ed112769f6dc3ead1b33d1096dce87..a46d2832dbabd094091a2432a08f69a491f20ade 100644
--- a/lib/chacha20.c
+++ b/lib/chacha.c
@@ -1,5 +1,5 @@
 /*
- * ChaCha20 256-bit cipher algorithm, RFC7539
+ * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
  *
  * Copyright (C) 2015 Martin Willi
  *
@@ -14,17 +14,16 @@
 #include <linux/bitops.h>
 #include <linux/cryptohash.h>
 #include <asm/unaligned.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
 
-void chacha20_block(u32 *state, u8 *stream)
+static void chacha_permute(u32 *x, int nrounds)
 {
-	u32 x[16];
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(x); i++)
-		x[i] = state[i];
+	/* whitelist the allowed round counts */
+	WARN_ON_ONCE(nrounds != 20 && nrounds != 12);
 
-	for (i = 0; i < 20; i += 2) {
+	for (i = 0; i < nrounds; i += 2) {
 		x[0]  += x[4];    x[12] = rol32(x[12] ^ x[0],  16);
 		x[1]  += x[5];    x[13] = rol32(x[13] ^ x[1],  16);
 		x[2]  += x[6];    x[14] = rol32(x[14] ^ x[2],  16);
@@ -65,10 +64,54 @@ void chacha20_block(u32 *state, u8 *stream)
 		x[8]  += x[13];   x[7]  = rol32(x[7]  ^ x[8],   7);
 		x[9]  += x[14];   x[4]  = rol32(x[4]  ^ x[9],   7);
 	}
+}
+
+/**
+ * chacha_block - generate one keystream block and increment block counter
+ * @state: input state matrix (16 32-bit words)
+ * @stream: output keystream block (64 bytes)
+ * @nrounds: number of rounds (20 or 12; 20 is recommended)
+ *
+ * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
+ * The caller has already converted the endianness of the input.  This function
+ * also handles incrementing the block counter in the input matrix.
+ */
+void chacha_block(u32 *state, u8 *stream, int nrounds)
+{
+	u32 x[16];
+	int i;
+
+	memcpy(x, state, 64);
+
+	chacha_permute(x, nrounds);
 
 	for (i = 0; i < ARRAY_SIZE(x); i++)
 		put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]);
 
 	state[12]++;
 }
-EXPORT_SYMBOL(chacha20_block);
+EXPORT_SYMBOL(chacha_block);
+
+/**
+ * hchacha_block - abbreviated ChaCha core, for XChaCha
+ * @in: input state matrix (16 32-bit words)
+ * @out: output (8 32-bit words)
+ * @nrounds: number of rounds (20 or 12; 20 is recommended)
+ *
+ * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
+ * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf).  HChaCha
+ * skips the final addition of the initial state, and outputs only certain words
+ * of the state.  It should not be used for streaming directly.
+ */
+void hchacha_block(const u32 *in, u32 *out, int nrounds)
+{
+	u32 x[16];
+
+	memcpy(x, in, 64);
+
+	chacha_permute(x, nrounds);
+
+	memcpy(&out[0], &x[0], 16);
+	memcpy(&out[4], &x[12], 16);
+}
+EXPORT_SYMBOL(hchacha_block);
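For context, the caller is expected to prepare the 16-word state itself before invoking chacha_block(). A rough sketch following the RFC 7539 layout (constants, 256-bit key, 32-bit block counter, 96-bit nonce) could look like the following; the helper name is made up for illustration, and only chacha_block() comes from this patch:

#include <linux/types.h>
#include <asm/unaligned.h>
#include <crypto/chacha.h>

/* Sketch only: fill a ChaCha state per RFC 7539 and emit one keystream block. */
static void chacha_keystream_block_example(const u8 key[32], const u8 nonce[12],
					   u32 counter, u8 stream[64])
{
	u32 state[16];
	int i;

	/* "expand 32-byte k" constants */
	state[0] = 0x61707865;
	state[1] = 0x3320646e;
	state[2] = 0x79622d32;
	state[3] = 0x6b206574;

	for (i = 0; i < 8; i++)
		state[4 + i] = get_unaligned_le32(key + 4 * i);

	state[12] = counter;
	for (i = 0; i < 3; i++)
		state[13 + i] = get_unaligned_le32(nonce + 4 * i);

	chacha_block(state, stream, 20);	/* also increments state[12] */
}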
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index c822e626761bd0ecb51e1b1de1f8dfa5b98b3568..621146d04c0380de424c8b5ef866e2a8056a88d5 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -1390,7 +1390,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn)
 	if (!smp)
 		return NULL;
 
-	smp->tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	smp->tfm_aes = crypto_alloc_cipher("aes", 0, 0);
 	if (IS_ERR(smp->tfm_aes)) {
 		BT_ERR("Unable to create AES crypto context");
 		goto zfree_smp;
@@ -3233,7 +3233,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
 	if (!smp)
 		return ERR_PTR(-ENOMEM);
 
-	tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	tfm_aes = crypto_alloc_cipher("aes", 0, 0);
 	if (IS_ERR(tfm_aes)) {
 		BT_ERR("Unable to create AES crypto context");
 		kzfree(smp);
@@ -3906,13 +3906,13 @@ int __init bt_selftest_smp(void)
 	struct crypto_kpp *tfm_ecdh;
 	int err;
 
-	tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	tfm_aes = crypto_alloc_cipher("aes", 0, 0);
 	if (IS_ERR(tfm_aes)) {
 		BT_ERR("Unable to create AES crypto context");
 		return PTR_ERR(tfm_aes);
 	}
 
-	tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, CRYPTO_ALG_ASYNC);
+	tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
 	if (IS_ERR(tfm_cmac)) {
 		BT_ERR("Unable to create CMAC crypto context");
 		crypto_free_cipher(tfm_aes);
diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
index 73e8f347802ecadf5efb11fffa7b13d32a5705c1..bfe9ed9f4c48d6f4e52faf6f274474ebda2ac9b6 100644
--- a/net/mac80211/wep.c
+++ b/net/mac80211/wep.c
@@ -30,13 +30,13 @@ int ieee80211_wep_init(struct ieee80211_local *local)
 	/* start WEP IV from a random value */
 	get_random_bytes(&local->wep_iv, IEEE80211_WEP_IV_LEN);
 
-	local->wep_tx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+	local->wep_tx_tfm = crypto_alloc_cipher("arc4", 0, 0);
 	if (IS_ERR(local->wep_tx_tfm)) {
 		local->wep_rx_tfm = ERR_PTR(-EINVAL);
 		return PTR_ERR(local->wep_tx_tfm);
 	}
 
-	local->wep_rx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+	local->wep_rx_tfm = crypto_alloc_cipher("arc4", 0, 0);
 	if (IS_ERR(local->wep_rx_tfm)) {
 		crypto_free_cipher(local->wep_tx_tfm);
 		local->wep_tx_tfm = ERR_PTR(-EINVAL);
diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c
index 6beab0cfcb99e4fece6f8cbf9ca10f3c7c0e61eb..55214fe925b2accef15113863c273b6f0c8df5e7 100644
--- a/net/wireless/lib80211_crypt_ccmp.c
+++ b/net/wireless/lib80211_crypt_ccmp.c
@@ -75,7 +75,7 @@ static void *lib80211_ccmp_init(int key_idx)
 		goto fail;
 	priv->key_idx = key_idx;
 
-	priv->tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	priv->tfm = crypto_alloc_cipher("aes", 0, 0);
 	if (IS_ERR(priv->tfm)) {
 		priv->tfm = NULL;
 		goto fail;
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
index b5e235573c8a0777053e946bbcfde01f65ec7bcf..35f06563207ddd9a1c67ca0e48fcc01f40eb9f19 100644
--- a/net/wireless/lib80211_crypt_tkip.c
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -99,7 +99,7 @@ static void *lib80211_tkip_init(int key_idx)
 
 	priv->key_idx = key_idx;
 
-	priv->tx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+	priv->tx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, 0);
 	if (IS_ERR(priv->tx_tfm_arc4)) {
 		priv->tx_tfm_arc4 = NULL;
 		goto fail;
@@ -111,7 +111,7 @@ static void *lib80211_tkip_init(int key_idx)
 		goto fail;
 	}
 
-	priv->rx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+	priv->rx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, 0);
 	if (IS_ERR(priv->rx_tfm_arc4)) {
 		priv->rx_tfm_arc4 = NULL;
 		goto fail;
diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c
index 6015f6b542a678bf41b7929a6bc3b55815c691b9..20c1ad63ad4467770e250f658ba149083022affa 100644
--- a/net/wireless/lib80211_crypt_wep.c
+++ b/net/wireless/lib80211_crypt_wep.c
@@ -48,13 +48,13 @@ static void *lib80211_wep_init(int keyidx)
 		goto fail;
 	priv->key_idx = keyidx;
 
-	priv->tx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+	priv->tx_tfm = crypto_alloc_cipher("arc4", 0, 0);
 	if (IS_ERR(priv->tx_tfm)) {
 		priv->tx_tfm = NULL;
 		goto fail;
 	}
 
-	priv->rx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+	priv->rx_tfm = crypto_alloc_cipher("arc4", 0, 0);
 	if (IS_ERR(priv->rx_tfm)) {
 		priv->rx_tfm = NULL;
 		goto fail;
diff --git a/security/apparmor/crypto.c b/security/apparmor/crypto.c
index 136f2a04783670d5ea8ad0c93d063d5a866fc297..af03d98c75520f4906e07fae3200d948d115289f 100644
--- a/security/apparmor/crypto.c
+++ b/security/apparmor/crypto.c
@@ -112,7 +112,7 @@ static int __init init_profile_hash(void)
 	if (!apparmor_initialized)
 		return 0;
 
-	tfm = crypto_alloc_shash("sha1", 0, CRYPTO_ALG_ASYNC);
+	tfm = crypto_alloc_shash("sha1", 0, 0);
 	if (IS_ERR(tfm)) {
 		int error = PTR_ERR(tfm);
 		AA_ERROR("failed to setup profile sha1 hashing: %d\n", error);
diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c
index 77ef210a8a6b1e029ab87c0ac8db0b49ae47dba8..43e2dc3a60d008652553e61f4caf8050f19a7d9e 100644
--- a/security/integrity/evm/evm_crypto.c
+++ b/security/integrity/evm/evm_crypto.c
@@ -97,8 +97,7 @@ static struct shash_desc *init_desc(char type, uint8_t hash_algo)
 		mutex_lock(&mutex);
 		if (*tfm)
 			goto out;
-		*tfm = crypto_alloc_shash(algo, 0,
-					  CRYPTO_ALG_ASYNC | CRYPTO_NOLOAD);
+		*tfm = crypto_alloc_shash(algo, 0, CRYPTO_NOLOAD);
 		if (IS_ERR(*tfm)) {
 			rc = PTR_ERR(*tfm);
 			pr_err("Can not allocate %s (reason: %ld)\n", algo, rc);
diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c
index d92cbf9687c33f090865f6d0caa99d9936f49e3a..a3891ae9fa0fcb24e79161d2592ed16644cdad1a 100644
--- a/security/keys/encrypted-keys/encrypted.c
+++ b/security/keys/encrypted-keys/encrypted.c
@@ -342,7 +342,7 @@ static int calc_hmac(u8 *digest, const u8 *key, unsigned int keylen,
 	struct crypto_shash *tfm;
 	int err;
 
-	tfm = crypto_alloc_shash(hmac_alg, 0, CRYPTO_ALG_ASYNC);
+	tfm = crypto_alloc_shash(hmac_alg, 0, 0);
 	if (IS_ERR(tfm)) {
 		pr_err("encrypted_key: can't alloc %s transform: %ld\n",
 		       hmac_alg, PTR_ERR(tfm));
@@ -984,7 +984,7 @@ static int __init init_encrypted(void)
 {
 	int ret;
 
-	hash_tfm = crypto_alloc_shash(hash_alg, 0, CRYPTO_ALG_ASYNC);
+	hash_tfm = crypto_alloc_shash(hash_alg, 0, 0);
 	if (IS_ERR(hash_tfm)) {
 		pr_err("encrypted_key: can't allocate %s transform: %ld\n",
 		       hash_alg, PTR_ERR(hash_tfm));
diff --git a/security/keys/trusted.c b/security/keys/trusted.c
index 697bfc6c819236c39b828fc786cd6d08ff7bd592..4d98f4f87236c5091d8318485a1193c7c45c9273 100644
--- a/security/keys/trusted.c
+++ b/security/keys/trusted.c
@@ -1199,14 +1199,14 @@ static int __init trusted_shash_alloc(void)
 {
 	int ret;
 
-	hmacalg = crypto_alloc_shash(hmac_alg, 0, CRYPTO_ALG_ASYNC);
+	hmacalg = crypto_alloc_shash(hmac_alg, 0, 0);
 	if (IS_ERR(hmacalg)) {
 		pr_info("trusted_key: could not allocate crypto %s\n",
 			hmac_alg);
 		return PTR_ERR(hmacalg);
 	}
 
-	hashalg = crypto_alloc_shash(hash_alg, 0, CRYPTO_ALG_ASYNC);
+	hashalg = crypto_alloc_shash(hash_alg, 0, 0);
 	if (IS_ERR(hashalg)) {
 		pr_info("trusted_key: could not allocate crypto %s\n",
 			hash_alg);
diff --git a/tools/crypto/getstat.c b/tools/crypto/getstat.c
index 24115173a483fb0ec6390813d8d43d68bd63caed..9e8ff76420fa5636df917d10475f15d5d0769e3e 100644
--- a/tools/crypto/getstat.c
+++ b/tools/crypto/getstat.c
@@ -152,86 +152,86 @@ static int get_stat(const char *drivername)
 
 	if (tb[CRYPTOCFGA_STAT_HASH]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_HASH];
-		struct crypto_stat *rhash =
-			(struct crypto_stat *)RTA_DATA(rta);
-		printf("%s\tHash\n\tHash: %u bytes: %llu\n\tErrors: %u\n",
+		struct crypto_stat_hash *rhash =
+			(struct crypto_stat_hash *)RTA_DATA(rta);
+		printf("%s\tHash\n\tHash: %llu bytes: %llu\n\tErrors: %llu\n",
 			drivername,
 			rhash->stat_hash_cnt, rhash->stat_hash_tlen,
-			rhash->stat_hash_err_cnt);
+			rhash->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_COMPRESS]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_COMPRESS];
-		struct crypto_stat *rblk =
-			(struct crypto_stat *)RTA_DATA(rta);
-		printf("%s\tCompress\n\tCompress: %u bytes: %llu\n\tDecompress: %u bytes: %llu\n\tErrors: %u\n",
+		struct crypto_stat_compress *rblk =
+			(struct crypto_stat_compress *)RTA_DATA(rta);
+		printf("%s\tCompress\n\tCompress: %llu bytes: %llu\n\tDecompress: %llu bytes: %llu\n\tErrors: %llu\n",
 			drivername,
 			rblk->stat_compress_cnt, rblk->stat_compress_tlen,
 			rblk->stat_decompress_cnt, rblk->stat_decompress_tlen,
-			rblk->stat_compress_err_cnt);
+			rblk->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_ACOMP]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_ACOMP];
-		struct crypto_stat *rcomp =
-			(struct crypto_stat *)RTA_DATA(rta);
-		printf("%s\tACompress\n\tCompress: %u bytes: %llu\n\tDecompress: %u bytes: %llu\n\tErrors: %u\n",
+		struct crypto_stat_compress *rcomp =
+			(struct crypto_stat_compress *)RTA_DATA(rta);
+		printf("%s\tACompress\n\tCompress: %llu bytes: %llu\n\tDecompress: %llu bytes: %llu\n\tErrors: %llu\n",
 			drivername,
 			rcomp->stat_compress_cnt, rcomp->stat_compress_tlen,
 			rcomp->stat_decompress_cnt, rcomp->stat_decompress_tlen,
-			rcomp->stat_compress_err_cnt);
+			rcomp->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_AEAD]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_AEAD];
-		struct crypto_stat *raead =
-			(struct crypto_stat *)RTA_DATA(rta);
-		printf("%s\tAEAD\n\tEncrypt: %u bytes: %llu\n\tDecrypt: %u bytes: %llu\n\tErrors: %u\n",
+		struct crypto_stat_aead *raead =
+			(struct crypto_stat_aead *)RTA_DATA(rta);
+		printf("%s\tAEAD\n\tEncrypt: %llu bytes: %llu\n\tDecrypt: %llu bytes: %llu\n\tErrors: %llu\n",
 			drivername,
 			raead->stat_encrypt_cnt, raead->stat_encrypt_tlen,
 			raead->stat_decrypt_cnt, raead->stat_decrypt_tlen,
-			raead->stat_aead_err_cnt);
+			raead->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_BLKCIPHER]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_BLKCIPHER];
-		struct crypto_stat *rblk =
-			(struct crypto_stat *)RTA_DATA(rta);
-		printf("%s\tCipher\n\tEncrypt: %u bytes: %llu\n\tDecrypt: %u bytes: %llu\n\tErrors: %u\n",
+		struct crypto_stat_cipher *rblk =
+			(struct crypto_stat_cipher *)RTA_DATA(rta);
+		printf("%s\tCipher\n\tEncrypt: %llu bytes: %llu\n\tDecrypt: %llu bytes: %llu\n\tErrors: %llu\n",
 			drivername,
 			rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen,
 			rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen,
-			rblk->stat_cipher_err_cnt);
+			rblk->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_AKCIPHER]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_AKCIPHER];
-		struct crypto_stat *rblk =
-			(struct crypto_stat *)RTA_DATA(rta);
-		printf("%s\tAkcipher\n\tEncrypt: %u bytes: %llu\n\tDecrypt: %u bytes: %llu\n\tSign: %u\n\tVerify: %u\n\tErrors: %u\n",
+		struct crypto_stat_akcipher *rblk =
+			(struct crypto_stat_akcipher *)RTA_DATA(rta);
+		printf("%s\tAkcipher\n\tEncrypt: %llu bytes: %llu\n\tDecrypt: %llu bytes: %llu\n\tSign: %llu\n\tVerify: %llu\n\tErrors: %llu\n",
 			drivername,
 			rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen,
 			rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen,
 			rblk->stat_sign_cnt, rblk->stat_verify_cnt,
-			rblk->stat_akcipher_err_cnt);
+			rblk->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_CIPHER]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_CIPHER];
-		struct crypto_stat *rblk =
-			(struct crypto_stat *)RTA_DATA(rta);
-		printf("%s\tcipher\n\tEncrypt: %u bytes: %llu\n\tDecrypt: %u bytes: %llu\n\tErrors: %u\n",
+		struct crypto_stat_cipher *rblk =
+			(struct crypto_stat_cipher *)RTA_DATA(rta);
+		printf("%s\tcipher\n\tEncrypt: %llu bytes: %llu\n\tDecrypt: %llu bytes: %llu\n\tErrors: %llu\n",
 			drivername,
 			rblk->stat_encrypt_cnt, rblk->stat_encrypt_tlen,
 			rblk->stat_decrypt_cnt, rblk->stat_decrypt_tlen,
-			rblk->stat_cipher_err_cnt);
+			rblk->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_RNG]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_RNG];
-		struct crypto_stat *rrng =
-			(struct crypto_stat *)RTA_DATA(rta);
-		printf("%s\tRNG\n\tSeed: %u\n\tGenerate: %u bytes: %llu\n\tErrors: %u\n",
+		struct crypto_stat_rng *rrng =
+			(struct crypto_stat_rng *)RTA_DATA(rta);
+		printf("%s\tRNG\n\tSeed: %llu\n\tGenerate: %llu bytes: %llu\n\tErrors: %llu\n",
 			drivername,
 			rrng->stat_seed_cnt,
 			rrng->stat_generate_cnt, rrng->stat_generate_tlen,
-			rrng->stat_rng_err_cnt);
+			rrng->stat_err_cnt);
 	} else if (tb[CRYPTOCFGA_STAT_KPP]) {
 		struct rtattr *rta = tb[CRYPTOCFGA_STAT_KPP];
-		struct crypto_stat *rkpp =
-			(struct crypto_stat *)RTA_DATA(rta);
-		printf("%s\tKPP\n\tSetsecret: %u\n\tGenerate public key: %u\n\tCompute_shared_secret: %u\n\tErrors: %u\n",
+		struct crypto_stat_kpp *rkpp =
+			(struct crypto_stat_kpp *)RTA_DATA(rta);
+		printf("%s\tKPP\n\tSetsecret: %llu\n\tGenerate public key: %llu\n\tCompute_shared_secret: %llu\n\tErrors: %llu\n",
 			drivername,
 			rkpp->stat_setsecret_cnt,
 			rkpp->stat_generate_public_key_cnt,
 			rkpp->stat_compute_shared_secret_cnt,
-			rkpp->stat_kpp_err_cnt);
+			rkpp->stat_err_cnt);
 	} else {
 		fprintf(stderr, "%s is of an unknown algorithm\n", drivername);
 	}