From ea08eeccae9297f7aabd2ef7f0c2517ac4549acc Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:18:26 +0000
Subject: [PATCH] write in 30M

---
 kernel/arch/arm/crypto/aes-neonbs-core.S |   79 +++++++++++++++++++--------------------
 1 files changed, 39 insertions(+), 40 deletions(-)

diff --git a/kernel/arch/arm/crypto/aes-neonbs-core.S b/kernel/arch/arm/crypto/aes-neonbs-core.S
index 2b625c6..7d0cc7f 100644
--- a/kernel/arch/arm/crypto/aes-neonbs-core.S
+++ b/kernel/arch/arm/crypto/aes-neonbs-core.S
@@ -1,12 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Bit sliced AES using NEON instructions
  *
  * Copyright (C) 2017 Linaro Ltd.
  * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 /*
@@ -78,11 +75,6 @@
 	.macro		__ldr, out, sym
 	vldr		\out\()l, \sym
 	vldr		\out\()h, \sym + 8
-	.endm
-
-	.macro		__adr, reg, lbl
-	adr		\reg, \lbl
-THUMB(	orr		\reg, \reg, #1		)
 	.endm
 
 	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
@@ -632,11 +624,11 @@
 	push		{r4-r6, lr}
 	ldr		r5, [sp, #16]		// number of blocks
 
-99:	__adr		ip, 0f
+99:	adr		ip, 0f
 	and		lr, r5, #7
 	cmp		r5, #8
 	sub		ip, ip, lr, lsl #2
-	bxlt		ip			// computed goto if blocks < 8
+	movlt		pc, ip			// computed goto if blocks < 8
 
 	vld1.8		{q0}, [r1]!
 	vld1.8		{q1}, [r1]!
@@ -651,11 +643,11 @@
 	mov		rounds, r3
 	bl		\do8
 
-	__adr		ip, 1f
+	adr		ip, 1f
 	and		lr, r5, #7
 	cmp		r5, #8
 	sub		ip, ip, lr, lsl #2
-	bxlt		ip			// computed goto if blocks < 8
+	movlt		pc, ip			// computed goto if blocks < 8
 
 	vst1.8		{\o0}, [r0]!
 	vst1.8		{\o1}, [r0]!
@@ -692,12 +684,12 @@
 	push		{r4-r6, lr}
 	ldm		ip, {r5-r6}		// load args 4-5
 
-99:	__adr		ip, 0f
+99:	adr		ip, 0f
 	and		lr, r5, #7
 	cmp		r5, #8
 	sub		ip, ip, lr, lsl #2
 	mov		lr, r1
-	bxlt		ip			// computed goto if blocks < 8
+	movlt		pc, ip			// computed goto if blocks < 8
 
 	vld1.8		{q0}, [lr]!
 	vld1.8		{q1}, [lr]!
@@ -721,11 +713,11 @@
 	vmov		q14, q8
 	vmov		q15, q8
 
-	__adr		ip, 1f
+	adr		ip, 1f
 	and		lr, r5, #7
 	cmp		r5, #8
 	sub		ip, ip, lr, lsl #2
-	bxlt		ip			// computed goto if blocks < 8
+	movlt		pc, ip			// computed goto if blocks < 8
 
 	vld1.8		{q9}, [r1]!
 	vld1.8		{q10}, [r1]!
@@ -736,9 +728,9 @@
 	vld1.8		{q15}, [r1]!
 	W(nop)
 
-1:	__adr		ip, 2f
+1:	adr		ip, 2f
 	sub		ip, ip, lr, lsl #3
-	bxlt		ip			// computed goto if blocks < 8
+	movlt		pc, ip			// computed goto if blocks < 8
 
 	veor		q0, q0, q8
 	vst1.8		{q0}, [r0]!
@@ -807,13 +799,13 @@
 	vmov		q6, q0
 	vmov		q7, q0
 
-	__adr		ip, 0f
+	adr		ip, 0f
 	sub		lr, r5, #1
 	and		lr, lr, #7
 	cmp		r5, #8
 	sub		ip, ip, lr, lsl #5
 	sub		ip, ip, lr, lsl #2
-	bxlt		ip			// computed goto if blocks < 8
+	movlt		pc, ip			// computed goto if blocks < 8
 
 	next_ctr	q1
 	next_ctr	q2
@@ -827,13 +819,13 @@
 	mov		rounds, r3
 	bl		aesbs_encrypt8
 
-	__adr		ip, 1f
+	adr		ip, 1f
 	and		lr, r5, #7
 	cmp		r5, #8
 	movgt		r4, #0
 	ldrle		r4, [sp, #40]		// load final in the last round
 	sub		ip, ip, lr, lsl #2
-	bxlt		ip			// computed goto if blocks < 8
+	movlt		pc, ip			// computed goto if blocks < 8
 
 	vld1.8		{q8}, [r1]!
 	vld1.8		{q9}, [r1]!
@@ -846,10 +838,10 @@
 1:	bne		2f
 	vld1.8		{q15}, [r1]!
 
-2:	__adr		ip, 3f
+2:	adr		ip, 3f
 	cmp		r5, #8
 	sub		ip, ip, lr, lsl #3
-	bxlt		ip			// computed goto if blocks < 8
+	movlt		pc, ip			// computed goto if blocks < 8
 
 	veor		q0, q0, q8
 	vst1.8		{q0}, [r0]!
@@ -890,27 +882,25 @@
 	veor		\out, \out, \tmp
 	.endm
 
-	.align		4
-.Lxts_mul_x:
-	.quad		1, 0x87
-
 	/*
 	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
-	 *		     int blocks, u8 iv[])
+	 *		     int blocks, u8 iv[], int reorder_last_tweak)
 	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
-	 *		     int blocks, u8 iv[])
+	 *		     int blocks, u8 iv[], int reorder_last_tweak)
 	 */
 __xts_prepare8:
 	vld1.8		{q14}, [r7]		// load iv
-	__ldr		q15, .Lxts_mul_x	// load tweak mask
+	vmov.i32	d30, #0x87		// compose tweak mask vector
+	vmovl.u32	q15, d30
+	vshr.u64	d30, d31, #7
 	vmov		q12, q14
 
-	__adr		ip, 0f
+	adr		ip, 0f
 	and		r4, r6, #7
 	cmp		r6, #8
 	sub		ip, ip, r4, lsl #5
 	mov		r4, sp
-	bxlt		ip			// computed goto if blocks < 8
+	movlt		pc, ip			// computed goto if blocks < 8
 
 	vld1.8		{q0}, [r1]!
 	next_tweak	q12, q14, q15, q13
@@ -949,17 +939,24 @@
 
 	vld1.8		{q7}, [r1]!
 	next_tweak	q14, q12, q15, q13
-	veor		q7, q7, q12
+THUMB(	itt		le		)
+	W(cmple)	r8, #0
+	ble		1f
+0:	veor		q7, q7, q12
 	vst1.8		{q12}, [r4, :128]
 
-0:	vst1.8		{q14}, [r7]		// store next iv
+	vst1.8		{q14}, [r7]		// store next iv
 	bx		lr
+
+1:	vswp		q12, q14
+	b		0b
 ENDPROC(__xts_prepare8)
 
 	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
 	push		{r4-r8, lr}
 	mov		r5, sp			// preserve sp
 	ldrd		r6, r7, [sp, #24]	// get blocks and iv args
+	rsb		r8, ip, #1
 	sub		ip, sp, #128		// make room for 8x tweak
 	bic		ip, ip, #0xf		// align sp to 16 bytes
 	mov		sp, ip
@@ -970,12 +967,12 @@
 	mov		rounds, r3
 	bl		\do8
 
-	__adr		ip, 0f
+	adr		ip, 0f
 	and		lr, r6, #7
 	cmp		r6, #8
 	sub		ip, ip, lr, lsl #2
 	mov		r4, sp
-	bxlt		ip			// computed goto if blocks < 8
+	movlt		pc, ip			// computed goto if blocks < 8
 
 	vld1.8		{q8}, [r4, :128]!
 	vld1.8		{q9}, [r4, :128]!
@@ -986,9 +983,9 @@
 	vld1.8		{q14}, [r4, :128]!
 	vld1.8		{q15}, [r4, :128]
 
-0:	__adr		ip, 1f
+0:	adr		ip, 1f
 	sub		ip, ip, lr, lsl #3
-	bxlt		ip			// computed goto if blocks < 8
+	movlt		pc, ip			// computed goto if blocks < 8
 
 	veor		\o0, \o0, q8
 	vst1.8		{\o0}, [r0]!
@@ -1015,9 +1012,11 @@
 	.endm
 
 ENTRY(aesbs_xts_encrypt)
+	mov		ip, #0			// never reorder final tweak
 	__xts_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
 ENDPROC(aesbs_xts_encrypt)
 
 ENTRY(aesbs_xts_decrypt)
+	ldr		ip, [sp, #8]		// reorder final tweak?
 	__xts_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
 ENDPROC(aesbs_xts_decrypt)

--
Gitblit v1.6.2