hc
2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/arch/arm/crypto/ghash-ce-core.S
@@ -1,15 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
  *
  * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
  */
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+
+	.arch		armv8-a
+	.fpu		crypto-neon-fp-armv8
 
 	SHASH		.req	q0
 	T1		.req	q1
@@ -63,8 +63,34 @@
 	k48		.req	d31
 	SHASH2_p64	.req	d31
 
+	HH		.req	q10
+	HH3		.req	q11
+	HH4		.req	q12
+	HH34		.req	q13
+
+	HH_L		.req	d20
+	HH_H		.req	d21
+	HH3_L		.req	d22
+	HH3_H		.req	d23
+	HH4_L		.req	d24
+	HH4_H		.req	d25
+	HH34_L		.req	d26
+	HH34_H		.req	d27
+	SHASH2_H	.req	d29
+
+	XL2		.req	q5
+	XM2		.req	q6
+	XH2		.req	q7
+	T3		.req	q8
+
+	XL2_L		.req	d10
+	XL2_H		.req	d11
+	XM2_L		.req	d12
+	XM2_H		.req	d13
+	T3_L		.req	d16
+	T3_H		.req	d17
+
 	.text
-	.fpu		crypto-neon-fp-armv8
 
 	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
 	vmull.p64	\rd, \rn, \rm
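Note: the new q10-q13 aliases hold successive powers of the hash key (SHASH = H, HH = H^2, HH3 = H^3, HH4 = H^4; HH34 and SHASH2_H later receive folded 64-bit halves), which lets the update loop below fold four blocks per reduction. For reference, a minimal plain-Python sketch of the underlying GF(2^128) multiplication in the GCM bit-reflected convention; the key value is an arbitrary example, not kernel code:

# Reference GF(2^128) multiply as used by GHASH; bit-reflected GCM convention,
# reduction polynomial x^128 + x^7 + x^2 + x + 1. Illustration only.
R = 0xE1 << 120

def gf128_mul(x, y):
    z, v = 0, x
    for i in range(127, -1, -1):          # walk y from its leftmost bit
        if (y >> i) & 1:
            z ^= v
        v = (v >> 1) ^ R if v & 1 else v >> 1
    return z

# The powers the SHASH/HH/HH3/HH4 registers are expected to hold
# (H is an arbitrary example value here).
H  = 0x66e94bd4ef8a2c3b884cfa59ca342b2e
H2 = gf128_mul(H, H)
H3 = gf128_mul(H2, H)
H4 = gf128_mul(H3, H)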
@@ -175,12 +201,77 @@
 	beq		0f
 	vld1.64		{T1}, [ip]
 	teq		r0, #0
-	b		1f
+	b		3f
 
-0:	vld1.64		{T1}, [r2]!
+0:	.ifc		\pn, p64
+	tst		r0, #3			// skip until #blocks is a
+	bne		2f			// round multiple of 4
+
+	vld1.8		{XL2-XM2}, [r2]!
+1:	vld1.8		{T3-T2}, [r2]!
+	vrev64.8	XL2, XL2
+	vrev64.8	XM2, XM2
+
+	subs		r0, r0, #4
+
+	vext.8		T1, XL2, XL2, #8
+	veor		XL2_H, XL2_H, XL_L
+	veor		XL, XL, T1
+
+	vrev64.8	T3, T3
+	vrev64.8	T1, T2
+
+	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
+	veor		XL2_H, XL2_H, XL_H
+	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
+	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)
+
+	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
+	veor		XM2_L, XM2_L, XM2_H
+	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
+	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)
+
+	veor		XH, XH, XH2
+	veor		XL, XL, XL2
+	veor		XM, XM, XM2
+
+	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
+	veor		T3_L, T3_L, T3_H
+	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
+	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)
+
+	veor		XH, XH, XH2
+	veor		XL, XL, XL2
+	veor		XM, XM, XM2
+
+	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
+	veor		T1_L, T1_L, T1_H
+	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
+	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)
+
+	veor		XH, XH, XH2
+	veor		XL, XL, XL2
+	veor		XM, XM, XM2
+
+	beq		4f
+
+	vld1.8		{XL2-XM2}, [r2]!
+
+	veor		T1, XL, XH
+	veor		XM, XM, T1
+
+	__pmull_reduce_p64
+
+	veor		T1, T1, XH
+	veor		XL, XL, T1
+
+	b		1b
+	.endif
+
+2:	vld1.64		{T1}, [r2]!
 	subs		r0, r0, #1
 
-1:	/* multiply XL by SHASH in GF(2^128) */
+3:	/* multiply XL by SHASH in GF(2^128) */
 #ifndef CONFIG_CPU_BIG_ENDIAN
 	vrev64.8	T1, T1
 #endif
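Note: the 0:/1: block above relies on the usual GHASH aggregation identity: four steps of X = (X xor Ci) * H collapse into (X xor C1)*H^4 xor C2*H^3 xor C3*H^2 xor C4*H, so only one reduction is needed per four blocks. A small self-contained check of that identity with illustrative values only (same reference multiply as in the earlier sketch):

def gf128_mul(x, y, R=0xE1 << 120):       # reference GF(2^128) multiply, as above
    z, v = 0, x
    for i in range(127, -1, -1):
        if (y >> i) & 1:
            z ^= v
        v = (v >> 1) ^ R if v & 1 else v >> 1
    return z

def ghash_serial(x, blocks, h):
    for c in blocks:                      # one multiply + reduction per block
        x = gf128_mul(x ^ c, h)
    return x

def ghash_4way(x, c1, c2, c3, c4, h):
    # fold four blocks against precomputed powers, reduce once
    h2 = gf128_mul(h, h)
    h3 = gf128_mul(h2, h)
    h4 = gf128_mul(h3, h)
    return (gf128_mul(x ^ c1, h4) ^ gf128_mul(c2, h3) ^
            gf128_mul(c3, h2) ^ gf128_mul(c4, h))

# arbitrary test values
assert ghash_serial(5, [1, 2, 3, 4], 0x1234) == ghash_4way(5, 1, 2, 3, 4, 0x1234)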
@@ -193,7 +284,7 @@
 	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
 	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)
 
-	veor		T1, XL, XH
+4:	veor		T1, XL, XH
 	veor		XM, XM, T1
 
 	__pmull_reduce_\pn
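Note: the a1 * b1 / a0 * b0 / (a1 + a0)(b1 + b0) comments describe a Karatsuba split of each 128x128-bit carryless product into three 64x64-bit multiplies (the middle term is folded with XH and XL before the reduction step). A standalone sketch of that decomposition, illustration only:

def clmul(a, b):
    # carryless (polynomial) multiplication over GF(2)
    r = 0
    while b:
        if b & 1:
            r ^= a
        a <<= 1
        b >>= 1
    return r

def clmul_128_karatsuba(a, b):
    # a = a1*x^64 + a0, b = b1*x^64 + b0; three 64x64 products instead of four
    m = (1 << 64) - 1
    a1, a0 = a >> 64, a & m
    b1, b0 = b >> 64, b & m
    hi = clmul(a1, b1)                       # XH:  a1 * b1
    lo = clmul(a0, b0)                       # XL:  a0 * b0
    mid = clmul(a1 ^ a0, b1 ^ b0) ^ hi ^ lo  # XM after folding
    return (hi << 128) ^ (mid << 64) ^ lo

import random
a, b = random.getrandbits(128), random.getrandbits(128)
assert clmul_128_karatsuba(a, b) == clmul(a, b)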
@@ -212,8 +303,14 @@
 	 *			   struct ghash_key const *k, const char *head)
 	 */
 ENTRY(pmull_ghash_update_p64)
-	vld1.64		{SHASH}, [r3]
+	vld1.64		{SHASH}, [r3]!
+	vld1.64		{HH}, [r3]!
+	vld1.64		{HH3-HH4}, [r3]
+
 	veor		SHASH2_p64, SHASH_L, SHASH_H
+	veor		SHASH2_H, HH_L, HH_H
+	veor		HH34_L, HH3_L, HH3_H
+	veor		HH34_H, HH4_L, HH4_H
 
 	vmov.i8		MASK, #0xe1
 	vshl.u64	MASK, MASK, #57
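Note: the revised prologue reads four consecutive 128-bit values from r3 (H, H^2, H^3, H^4) and XORs each one's 64-bit halves to precompute the Karatsuba middle-term operands (SHASH2_p64, SHASH2_H, HH34_L, HH34_H). A rough sketch of that per-key precomputation, with illustrative names and placeholder values only:

def fold_halves(v128):
    # XOR of the high and low 64-bit halves, mirroring the veor of _L and _H
    return (v128 >> 64) ^ (v128 & ((1 << 64) - 1))

def load_key(key_powers):
    # key_powers is assumed to be [H, H^2, H^3, H^4] as 128-bit integers
    h, h2, h3, h4 = key_powers
    return {
        "SHASH2_p64": fold_halves(h),
        "SHASH2_H":   fold_halves(h2),
        "HH34_L":     fold_halves(h3),
        "HH34_H":     fold_halves(h4),
    }

example = load_key([0x3, 0x5, 0xf, 0x11])   # placeholder integers, not real powers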