diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S
index 021bb9e9784b271cb670f8c973312e9e2ec14ba4..706c4e10e9e294c7c5de49dbbe7a784ec7ca1458 100644
--- a/arch/arm64/crypto/chacha-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -158,8 +158,8 @@ ENTRY(hchacha_block_neon)
 	mov		w3, w2
 	bl		chacha_permute
 
-	st1		{v0.16b}, [x1], #16
-	st1		{v3.16b}, [x1]
+	st1		{v0.4s}, [x1], #16
+	st1		{v3.4s}, [x1]
 
 	ldp		x29, x30, [sp], #16
 	ret
@@ -532,6 +532,10 @@ ENTRY(chacha_4block_xor_neon)
 	add		v3.4s, v3.4s, v19.4s
 	  add		a2, a2, w8
 	  add		a3, a3, w9
+CPU_BE(	  rev		a0, a0		)
+CPU_BE(	  rev		a1, a1		)
+CPU_BE(	  rev		a2, a2		)
+CPU_BE(	  rev		a3, a3		)
 
 	ld4r		{v24.4s-v27.4s}, [x0], #16
 	ld4r		{v28.4s-v31.4s}, [x0]
@@ -552,6 +556,10 @@ ENTRY(chacha_4block_xor_neon)
 	add		v7.4s, v7.4s, v23.4s
 	  add		a6, a6, w8
 	  add		a7, a7, w9
+CPU_BE(	  rev		a4, a4		)
+CPU_BE(	  rev		a5, a5		)
+CPU_BE(	  rev		a6, a6		)
+CPU_BE(	  rev		a7, a7		)
 
 	// x8[0-3] += s2[0]
 	// x9[0-3] += s2[1]
@@ -569,6 +577,10 @@ ENTRY(chacha_4block_xor_neon)
 	add		v11.4s, v11.4s, v27.4s
 	  add		a10, a10, w8
 	  add		a11, a11, w9
+CPU_BE(	  rev		a8, a8		)
+CPU_BE(	  rev		a9, a9		)
+CPU_BE(	  rev		a10, a10	)
+CPU_BE(	  rev		a11, a11	)
 
 	// x12[0-3] += s3[0]
 	// x13[0-3] += s3[1]
@@ -586,6 +598,10 @@ ENTRY(chacha_4block_xor_neon)
 	add		v15.4s, v15.4s, v31.4s
 	  add		a14, a14, w8
 	  add		a15, a15, w9
+CPU_BE(	  rev		a12, a12	)
+CPU_BE(	  rev		a13, a13	)
+CPU_BE(	  rev		a14, a14	)
+CPU_BE(	  rev		a15, a15	)
 
 	// interleave 32-bit words in state n, n+1
 	  ldp		w6, w7, [x2], #64