2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/arch/arm/crypto/aes-ce-core.S
@@ -1,17 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
  *
  * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */

 #include <linux/linkage.h>
 #include <asm/assembler.h>

         .text
+        .arch           armv8-a
         .fpu            crypto-neon-fp-armv8
         .align          3

@@ -47,63 +45,73 @@
         veor            q0, q0, \key3
         .endm

-        .macro          enc_dround_3x, key1, key2
+        .macro          enc_dround_4x, key1, key2
         enc_round       q0, \key1
         enc_round       q1, \key1
         enc_round       q2, \key1
+        enc_round       q3, \key1
         enc_round       q0, \key2
         enc_round       q1, \key2
         enc_round       q2, \key2
+        enc_round       q3, \key2
         .endm

-        .macro          dec_dround_3x, key1, key2
+        .macro          dec_dround_4x, key1, key2
         dec_round       q0, \key1
         dec_round       q1, \key1
         dec_round       q2, \key1
+        dec_round       q3, \key1
         dec_round       q0, \key2
         dec_round       q1, \key2
         dec_round       q2, \key2
+        dec_round       q3, \key2
         .endm

-        .macro          enc_fround_3x, key1, key2, key3
+        .macro          enc_fround_4x, key1, key2, key3
         enc_round       q0, \key1
         enc_round       q1, \key1
         enc_round       q2, \key1
+        enc_round       q3, \key1
         aese.8          q0, \key2
         aese.8          q1, \key2
         aese.8          q2, \key2
+        aese.8          q3, \key2
         veor            q0, q0, \key3
         veor            q1, q1, \key3
         veor            q2, q2, \key3
+        veor            q3, q3, \key3
         .endm

-        .macro          dec_fround_3x, key1, key2, key3
+        .macro          dec_fround_4x, key1, key2, key3
         dec_round       q0, \key1
         dec_round       q1, \key1
         dec_round       q2, \key1
+        dec_round       q3, \key1
         aesd.8          q0, \key2
         aesd.8          q1, \key2
         aesd.8          q2, \key2
+        aesd.8          q3, \key2
         veor            q0, q0, \key3
         veor            q1, q1, \key3
         veor            q2, q2, \key3
+        veor            q3, q3, \key3
         .endm

         .macro          do_block, dround, fround
         cmp             r3, #12                 @ which key size?
-        vld1.8          {q10-q11}, [ip]!
+        vld1.32         {q10-q11}, [ip]!
         \dround         q8, q9
-        vld1.8          {q12-q13}, [ip]!
+        vld1.32         {q12-q13}, [ip]!
         \dround         q10, q11
-        vld1.8          {q10-q11}, [ip]!
+        vld1.32         {q10-q11}, [ip]!
         \dround         q12, q13
-        vld1.8          {q12-q13}, [ip]!
+        vld1.32         {q12-q13}, [ip]!
         \dround         q10, q11
         blo             0f                      @ AES-128: 10 rounds
-        vld1.8          {q10-q11}, [ip]!
+        vld1.32         {q10-q11}, [ip]!
         \dround         q12, q13
         beq             1f                      @ AES-192: 12 rounds
-        vld1.8          {q12-q13}, [ip]
+        vld1.32         {q12-q13}, [ip]
         \dround         q10, q11
 0:      \fround         q12, q13, q14
         bx              lr
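
Note: the cmp r3, #12 / blo / beq sequence above distinguishes the three AES key sizes (10, 12 or 14 rounds), and prepare_key further down locates the last round key at rk + rounds * 16 bytes. A small standalone C illustration of that arithmetic, not kernel code:

#include <stdio.h>

/* AES uses 10/12/14 rounds for 128/192/256-bit keys (rounds = nk + 6);
 * the last round key then sits at byte offset rounds * 16 in the schedule. */
static int aes_rounds(int key_bytes)
{
        return key_bytes / 4 + 6;
}

int main(void)
{
        for (int kb = 16; kb <= 32; kb += 8)
                printf("AES-%d: %d rounds, last round key at rk + %d\n",
                       kb * 8, aes_rounds(kb), aes_rounds(kb) * 16);
        return 0;
}
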
@@ -117,8 +125,9 @@
  * transforms. These should preserve all registers except q0 - q2 and ip
  * Arguments:
  * q0  : first in/output block
- * q1  : second in/output block (_3x version only)
- * q2  : third in/output block (_3x version only)
+ * q1  : second in/output block (_4x version only)
+ * q2  : third in/output block (_4x version only)
+ * q3  : fourth in/output block (_4x version only)
  * q8  : first round key
  * q9  : secound round key
  * q14 : final round key
@@ -139,44 +148,44 @@
 ENDPROC(aes_decrypt)

         .align          6
-aes_encrypt_3x:
+aes_encrypt_4x:
         add             ip, r2, #32             @ 3rd round key
-        do_block        enc_dround_3x, enc_fround_3x
-ENDPROC(aes_encrypt_3x)
+        do_block        enc_dround_4x, enc_fround_4x
+ENDPROC(aes_encrypt_4x)

         .align          6
-aes_decrypt_3x:
+aes_decrypt_4x:
         add             ip, r2, #32             @ 3rd round key
-        do_block        dec_dround_3x, dec_fround_3x
-ENDPROC(aes_decrypt_3x)
+        do_block        dec_dround_4x, dec_fround_4x
+ENDPROC(aes_decrypt_4x)

         .macro          prepare_key, rk, rounds
         add             ip, \rk, \rounds, lsl #4
-        vld1.8          {q8-q9}, [\rk]          @ load first 2 round keys
-        vld1.8          {q14}, [ip]             @ load last round key
+        vld1.32         {q8-q9}, [\rk]          @ load first 2 round keys
+        vld1.32         {q14}, [ip]             @ load last round key
         .endm

 /*
- * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
  *                 int blocks)
- * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
  *                 int blocks)
  */
 ENTRY(ce_aes_ecb_encrypt)
         push            {r4, lr}
         ldr             r4, [sp, #8]
         prepare_key     r2, r3
-.Lecbencloop3x:
-        subs            r4, r4, #3
+.Lecbencloop4x:
+        subs            r4, r4, #4
         bmi             .Lecbenc1x
         vld1.8          {q0-q1}, [r1]!
-        vld1.8          {q2}, [r1]!
-        bl              aes_encrypt_3x
+        vld1.8          {q2-q3}, [r1]!
+        bl              aes_encrypt_4x
         vst1.8          {q0-q1}, [r0]!
-        vst1.8          {q2}, [r0]!
-        b               .Lecbencloop3x
+        vst1.8          {q2-q3}, [r0]!
+        b               .Lecbencloop4x
 .Lecbenc1x:
-        adds            r4, r4, #3
+        adds            r4, r4, #4
         beq             .Lecbencout
 .Lecbencloop:
         vld1.8          {q0}, [r1]!
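
Note: the loop restructuring above is the same in every mode touched by this diff: take four blocks per iteration while at least four remain, then finish the remainder one block at a time. A minimal C sketch of that control flow; process4/process1 are stand-ins for aes_encrypt_4x/aes_encrypt, not kernel functions:

#include <stdint.h>

/* Mirror of the .Lecbencloop4x / .Lecbencloop structure:
 * subs r4, r4, #4 ; bmi -> remainder path ; adds r4, r4, #4. */
static void ecb_walk(uint8_t *out, const uint8_t *in, int blocks,
                     void (*process4)(uint8_t *, const uint8_t *),
                     void (*process1)(uint8_t *, const uint8_t *))
{
        while (blocks >= 4) {
                process4(out, in);      /* four 16-byte blocks per call */
                in += 64;
                out += 64;
                blocks -= 4;
        }
        while (blocks-- > 0) {
                process1(out, in);      /* one block at a time */
                in += 16;
                out += 16;
        }
}
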
@@ -192,17 +201,17 @@
         push            {r4, lr}
         ldr             r4, [sp, #8]
         prepare_key     r2, r3
-.Lecbdecloop3x:
-        subs            r4, r4, #3
+.Lecbdecloop4x:
+        subs            r4, r4, #4
         bmi             .Lecbdec1x
         vld1.8          {q0-q1}, [r1]!
-        vld1.8          {q2}, [r1]!
-        bl              aes_decrypt_3x
+        vld1.8          {q2-q3}, [r1]!
+        bl              aes_decrypt_4x
         vst1.8          {q0-q1}, [r0]!
-        vst1.8          {q2}, [r0]!
-        b               .Lecbdecloop3x
+        vst1.8          {q2-q3}, [r0]!
+        b               .Lecbdecloop4x
 .Lecbdec1x:
-        adds            r4, r4, #3
+        adds            r4, r4, #4
         beq             .Lecbdecout
 .Lecbdecloop:
         vld1.8          {q0}, [r1]!
@@ -215,9 +224,9 @@
 ENDPROC(ce_aes_ecb_decrypt)

 /*
- * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
  *                 int blocks, u8 iv[])
- * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
  *                 int blocks, u8 iv[])
  */
 ENTRY(ce_aes_cbc_encrypt)
@@ -239,123 +248,228 @@
 ENTRY(ce_aes_cbc_decrypt)
         push            {r4-r6, lr}
         ldrd            r4, r5, [sp, #16]
-        vld1.8          {q6}, [r5]              @ keep iv in q6
+        vld1.8          {q15}, [r5]             @ keep iv in q15
         prepare_key     r2, r3
-.Lcbcdecloop3x:
-        subs            r4, r4, #3
+.Lcbcdecloop4x:
+        subs            r4, r4, #4
         bmi             .Lcbcdec1x
         vld1.8          {q0-q1}, [r1]!
-        vld1.8          {q2}, [r1]!
-        vmov            q3, q0
-        vmov            q4, q1
-        vmov            q5, q2
-        bl              aes_decrypt_3x
-        veor            q0, q0, q6
-        veor            q1, q1, q3
-        veor            q2, q2, q4
-        vmov            q6, q5
+        vld1.8          {q2-q3}, [r1]!
+        vmov            q4, q0
+        vmov            q5, q1
+        vmov            q6, q2
+        vmov            q7, q3
+        bl              aes_decrypt_4x
+        veor            q0, q0, q15
+        veor            q1, q1, q4
+        veor            q2, q2, q5
+        veor            q3, q3, q6
+        vmov            q15, q7
         vst1.8          {q0-q1}, [r0]!
-        vst1.8          {q2}, [r0]!
-        b               .Lcbcdecloop3x
+        vst1.8          {q2-q3}, [r0]!
+        b               .Lcbcdecloop4x
 .Lcbcdec1x:
-        adds            r4, r4, #3
+        adds            r4, r4, #4
         beq             .Lcbcdecout
-        vmov            q15, q14                @ preserve last round key
+        vmov            q6, q14                 @ preserve last round key
 .Lcbcdecloop:
         vld1.8          {q0}, [r1]!             @ get next ct block
         veor            q14, q15, q6            @ combine prev ct with last key
-        vmov            q6, q0
+        vmov            q15, q0
         bl              aes_decrypt
         vst1.8          {q0}, [r0]!
         subs            r4, r4, #1
         bne             .Lcbcdecloop
 .Lcbcdecout:
-        vst1.8          {q6}, [r5]              @ keep iv in q6
+        vst1.8          {q15}, [r5]             @ keep iv in q15
         pop             {r4-r6, pc}
 ENDPROC(ce_aes_cbc_decrypt)

+
 /*
- * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
+ *                        int rounds, int bytes, u8 const iv[])
+ * ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
+ *                        int rounds, int bytes, u8 const iv[])
+ */
+
+ENTRY(ce_aes_cbc_cts_encrypt)
+        push            {r4-r6, lr}
+        ldrd            r4, r5, [sp, #16]
+
+        movw            ip, :lower16:.Lcts_permute_table
+        movt            ip, :upper16:.Lcts_permute_table
+        sub             r4, r4, #16
+        add             lr, ip, #32
+        add             ip, ip, r4
+        sub             lr, lr, r4
+        vld1.8          {q5}, [ip]
+        vld1.8          {q6}, [lr]
+
+        add             ip, r1, r4
+        vld1.8          {q0}, [r1]              @ overlapping loads
+        vld1.8          {q3}, [ip]
+
+        vld1.8          {q1}, [r5]              @ get iv
+        prepare_key     r2, r3
+
+        veor            q0, q0, q1              @ xor with iv
+        bl              aes_encrypt
+
+        vtbl.8          d4, {d0-d1}, d10
+        vtbl.8          d5, {d0-d1}, d11
+        vtbl.8          d2, {d6-d7}, d12
+        vtbl.8          d3, {d6-d7}, d13
+
+        veor            q0, q0, q1
+        bl              aes_encrypt
+
+        add             r4, r0, r4
+        vst1.8          {q2}, [r4]              @ overlapping stores
+        vst1.8          {q0}, [r0]
+
+        pop             {r4-r6, pc}
+ENDPROC(ce_aes_cbc_cts_encrypt)
+
+ENTRY(ce_aes_cbc_cts_decrypt)
+        push            {r4-r6, lr}
+        ldrd            r4, r5, [sp, #16]
+
+        movw            ip, :lower16:.Lcts_permute_table
+        movt            ip, :upper16:.Lcts_permute_table
+        sub             r4, r4, #16
+        add             lr, ip, #32
+        add             ip, ip, r4
+        sub             lr, lr, r4
+        vld1.8          {q5}, [ip]
+        vld1.8          {q6}, [lr]
+
+        add             ip, r1, r4
+        vld1.8          {q0}, [r1]              @ overlapping loads
+        vld1.8          {q1}, [ip]
+
+        vld1.8          {q3}, [r5]              @ get iv
+        prepare_key     r2, r3
+
+        bl              aes_decrypt
+
+        vtbl.8          d4, {d0-d1}, d10
+        vtbl.8          d5, {d0-d1}, d11
+        vtbx.8          d0, {d2-d3}, d12
+        vtbx.8          d1, {d2-d3}, d13
+
+        veor            q1, q1, q2
+        bl              aes_decrypt
+        veor            q0, q0, q3              @ xor with iv
+
+        add             r4, r0, r4
+        vst1.8          {q1}, [r4]              @ overlapping stores
+        vst1.8          {q0}, [r0]
+
+        pop             {r4-r6, pc}
+ENDPROC(ce_aes_cbc_cts_decrypt)
+
+
+/*
+ * aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
  *                 int blocks, u8 ctr[])
  */
 ENTRY(ce_aes_ctr_encrypt)
         push            {r4-r6, lr}
         ldrd            r4, r5, [sp, #16]
-        vld1.8          {q6}, [r5]              @ load ctr
+        vld1.8          {q7}, [r5]              @ load ctr
         prepare_key     r2, r3
-        vmov            r6, s27                 @ keep swabbed ctr in r6
+        vmov            r6, s31                 @ keep swabbed ctr in r6
         rev             r6, r6
         cmn             r6, r4                  @ 32 bit overflow?
         bcs             .Lctrloop
-.Lctrloop3x:
-        subs            r4, r4, #3
+.Lctrloop4x:
+        subs            r4, r4, #4
         bmi             .Lctr1x
-        add             r6, r6, #1
-        vmov            q0, q6
-        vmov            q1, q6
-        rev             ip, r6
-        add             r6, r6, #1
-        vmov            q2, q6
-        vmov            s7, ip
-        rev             ip, r6
-        add             r6, r6, #1
-        vmov            s11, ip
-        vld1.8          {q3-q4}, [r1]!
-        vld1.8          {q5}, [r1]!
-        bl              aes_encrypt_3x
-        veor            q0, q0, q3
-        veor            q1, q1, q4
-        veor            q2, q2, q5
+
+        /*
+         * NOTE: the sequence below has been carefully tweaked to avoid
+         * a silicon erratum that exists in Cortex-A57 (#1742098) and
+         * Cortex-A72 (#1655431) cores, where AESE/AESMC instruction pairs
+         * may produce an incorrect result if they take their input from a
+         * register of which a single 32-bit lane has been updated the last
+         * time it was modified. To work around this, the lanes of registers
+         * q0-q3 below are not manipulated individually, and the different
+         * counter values are prepared by successive manipulations of q7.
+         */
+        add             ip, r6, #1
+        vmov            q0, q7
+        rev             ip, ip
+        add             lr, r6, #2
+        vmov            s31, ip                 @ set lane 3 of q1 via q7
+        add             ip, r6, #3
+        rev             lr, lr
+        vmov            q1, q7
+        vmov            s31, lr                 @ set lane 3 of q2 via q7
+        rev             ip, ip
+        vmov            q2, q7
+        vmov            s31, ip                 @ set lane 3 of q3 via q7
+        add             r6, r6, #4
+        vmov            q3, q7
+
+        vld1.8          {q4-q5}, [r1]!
+        vld1.8          {q6}, [r1]!
+        vld1.8          {q15}, [r1]!
+        bl              aes_encrypt_4x
+        veor            q0, q0, q4
+        veor            q1, q1, q5
+        veor            q2, q2, q6
+        veor            q3, q3, q15
         rev             ip, r6
         vst1.8          {q0-q1}, [r0]!
-        vst1.8          {q2}, [r0]!
-        vmov            s27, ip
-        b               .Lctrloop3x
+        vst1.8          {q2-q3}, [r0]!
+        vmov            s31, ip
+        b               .Lctrloop4x
 .Lctr1x:
-        adds            r4, r4, #3
+        adds            r4, r4, #4
         beq             .Lctrout
 .Lctrloop:
-        vmov            q0, q6
+        vmov            q0, q7
         bl              aes_encrypt
+
+        adds            r6, r6, #1              @ increment BE ctr
+        rev             ip, r6
+        vmov            s31, ip
+        bcs             .Lctrcarry
+
+.Lctrcarrydone:
         subs            r4, r4, #1
         bmi             .Lctrtailblock          @ blocks < 0 means tail block
         vld1.8          {q3}, [r1]!
         veor            q3, q0, q3
         vst1.8          {q3}, [r0]!
-
-        adds            r6, r6, #1              @ increment BE ctr
-        rev             ip, r6
-        vmov            s27, ip
-        bcs             .Lctrcarry
-        teq             r4, #0
         bne             .Lctrloop
+
 .Lctrout:
-        vst1.8          {q6}, [r5]
+        vst1.8          {q7}, [r5]              @ return next CTR value
         pop             {r4-r6, pc}

 .Lctrtailblock:
-        vst1.8          {q0}, [r0, :64]         @ return just the key stream
-        pop             {r4-r6, pc}
+        vst1.8          {q0}, [r0, :64]         @ return the key stream
+        b               .Lctrout

 .Lctrcarry:
-        .irp            sreg, s26, s25, s24
+        .irp            sreg, s30, s29, s28
         vmov            ip, \sreg               @ load next word of ctr
         rev             ip, ip                  @ ... to handle the carry
         adds            ip, ip, #1
         rev             ip, ip
         vmov            \sreg, ip
-        bcc             0f
+        bcc             .Lctrcarrydone
         .endr
-0:      teq             r4, #0
-        beq             .Lctrout
-        b               .Lctrloop
+        b               .Lctrcarrydone
 ENDPROC(ce_aes_ctr_encrypt)

 /*
- * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
- *                 int blocks, u8 iv[], u8 const rk2[], int first)
- * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
- *                 int blocks, u8 iv[], u8 const rk2[], int first)
+ * aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
+ *                 int bytes, u8 iv[], u32 const rk2[], int first)
+ * aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
+ *                 int bytes, u8 iv[], u32 const rk2[], int first)
  */

         .macro          next_tweak, out, in, const, tmp
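
Note: the reworked CTR loop keeps the whole big-endian counter in q7 and mirrors its low 32-bit word, byte-swapped, in r6; when adds r6, r6, #1 wraps, .Lctrcarry ripples the carry into the higher words through the rev/adds/rev sequence. A byte-wise C equivalent of that big-endian increment, for reference only (the assembly does the same thing 32 bits at a time on s28-s31):

#include <stdint.h>

/* Increment a 128-bit big-endian counter, propagating the carry from the
 * least significant byte upward; stops as soon as no carry remains. */
static void ctr128_inc(uint8_t ctr[16])
{
        for (int i = 15; i >= 0; i--)
                if (++ctr[i] != 0)
                        break;
}
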
@@ -366,13 +480,10 @@
         veor            \out, \out, \tmp
         .endm

-        .align          3
-.Lxts_mul_x:
-        .quad           1, 0x87
-
 ce_aes_xts_init:
-        vldr            d14, .Lxts_mul_x
-        vldr            d15, .Lxts_mul_x + 8
+        vmov.i32        d30, #0x87              @ compose tweak mask vector
+        vmovl.u32       q15, d30
+        vshr.u64        d30, d31, #7

         ldrd            r4, r5, [sp, #16]       @ load args
         ldr             r6, [sp, #28]
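
Note: ce_aes_xts_init now composes the tweak mask {1, 0x87} in q15 with vmov/vmovl/vshr instead of loading it from the dropped .Lxts_mul_x literal, and next_tweak uses it to multiply the running tweak by x in GF(2^128). A plain C version of that tweak update (assuming the usual little-endian XTS byte order), shown only to spell out what the NEON macro computes:

#include <stdint.h>

/* Multiply the 128-bit XTS tweak by x modulo x^128 + x^7 + x^2 + x + 1:
 * shift left by one bit and, if a bit falls off the top, fold it back
 * in as 0x87 at the bottom byte. */
static void xts_next_tweak(uint8_t t[16])
{
        uint8_t carry = 0;

        for (int i = 0; i < 16; i++) {
                uint8_t msb = t[i] >> 7;

                t[i] = (uint8_t)(t[i] << 1) | carry;
                carry = msb;
        }
        if (carry)
                t[0] ^= 0x87;
}
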
@@ -393,49 +504,86 @@

         bl              ce_aes_xts_init         @ run shared prologue
         prepare_key     r2, r3
-        vmov            q3, q0
+        vmov            q4, q0

         teq             r6, #0                  @ start of a block?
-        bne             .Lxtsenc3x
+        bne             .Lxtsenc4x

-.Lxtsencloop3x:
-        next_tweak      q3, q3, q7, q6
-.Lxtsenc3x:
-        subs            r4, r4, #3
+.Lxtsencloop4x:
+        next_tweak      q4, q4, q15, q10
+.Lxtsenc4x:
+        subs            r4, r4, #64
         bmi             .Lxtsenc1x
-        vld1.8          {q0-q1}, [r1]!          @ get 3 pt blocks
-        vld1.8          {q2}, [r1]!
-        next_tweak      q4, q3, q7, q6
-        veor            q0, q0, q3
-        next_tweak      q5, q4, q7, q6
-        veor            q1, q1, q4
-        veor            q2, q2, q5
-        bl              aes_encrypt_3x
-        veor            q0, q0, q3
-        veor            q1, q1, q4
-        veor            q2, q2, q5
-        vst1.8          {q0-q1}, [r0]!          @ write 3 ct blocks
-        vst1.8          {q2}, [r0]!
-        vmov            q3, q5
+        vld1.8          {q0-q1}, [r1]!          @ get 4 pt blocks
+        vld1.8          {q2-q3}, [r1]!
+        next_tweak      q5, q4, q15, q10
+        veor            q0, q0, q4
+        next_tweak      q6, q5, q15, q10
+        veor            q1, q1, q5
+        next_tweak      q7, q6, q15, q10
+        veor            q2, q2, q6
+        veor            q3, q3, q7
+        bl              aes_encrypt_4x
+        veor            q0, q0, q4
+        veor            q1, q1, q5
+        veor            q2, q2, q6
+        veor            q3, q3, q7
+        vst1.8          {q0-q1}, [r0]!          @ write 4 ct blocks
+        vst1.8          {q2-q3}, [r0]!
+        vmov            q4, q7
         teq             r4, #0
-        beq             .Lxtsencout
-        b               .Lxtsencloop3x
+        beq             .Lxtsencret
+        b               .Lxtsencloop4x
 .Lxtsenc1x:
-        adds            r4, r4, #3
+        adds            r4, r4, #64
         beq             .Lxtsencout
+        subs            r4, r4, #16
+        bmi             .LxtsencctsNx
 .Lxtsencloop:
         vld1.8          {q0}, [r1]!
-        veor            q0, q0, q3
+.Lxtsencctsout:
+        veor            q0, q0, q4
         bl              aes_encrypt
-        veor            q0, q0, q3
-        vst1.8          {q0}, [r0]!
-        subs            r4, r4, #1
+        veor            q0, q0, q4
+        teq             r4, #0
         beq             .Lxtsencout
-        next_tweak      q3, q3, q7, q6
+        subs            r4, r4, #16
+        next_tweak      q4, q4, q15, q6
+        bmi             .Lxtsenccts
+        vst1.8          {q0}, [r0]!
         b               .Lxtsencloop
 .Lxtsencout:
-        vst1.8          {q3}, [r5]
+        vst1.8          {q0}, [r0]
+.Lxtsencret:
+        vst1.8          {q4}, [r5]
         pop             {r4-r6, pc}
+
+.LxtsencctsNx:
+        vmov            q0, q3
+        sub             r0, r0, #16
+.Lxtsenccts:
+        movw            ip, :lower16:.Lcts_permute_table
+        movt            ip, :upper16:.Lcts_permute_table
+
+        add             r1, r1, r4              @ rewind input pointer
+        add             r4, r4, #16             @ # bytes in final block
+        add             lr, ip, #32
+        add             ip, ip, r4
+        sub             lr, lr, r4
+        add             r4, r0, r4              @ output address of final block
+
+        vld1.8          {q1}, [r1]              @ load final partial block
+        vld1.8          {q2}, [ip]
+        vld1.8          {q3}, [lr]
+
+        vtbl.8          d4, {d0-d1}, d4
+        vtbl.8          d5, {d0-d1}, d5
+        vtbx.8          d0, {d2-d3}, d6
+        vtbx.8          d1, {d2-d3}, d7
+
+        vst1.8          {q2}, [r4]              @ overlapping stores
+        mov             r4, #0
+        b               .Lxtsencctsout
 ENDPROC(ce_aes_xts_encrypt)

@@ -444,50 +592,90 @@

         bl              ce_aes_xts_init         @ run shared prologue
         prepare_key     r2, r3
-        vmov            q3, q0
+        vmov            q4, q0
+
+        /* subtract 16 bytes if we are doing CTS */
+        tst             r4, #0xf
+        subne           r4, r4, #0x10

         teq             r6, #0                  @ start of a block?
-        bne             .Lxtsdec3x
+        bne             .Lxtsdec4x

-.Lxtsdecloop3x:
-        next_tweak      q3, q3, q7, q6
-.Lxtsdec3x:
-        subs            r4, r4, #3
+.Lxtsdecloop4x:
+        next_tweak      q4, q4, q15, q10
+.Lxtsdec4x:
+        subs            r4, r4, #64
         bmi             .Lxtsdec1x
-        vld1.8          {q0-q1}, [r1]!          @ get 3 ct blocks
-        vld1.8          {q2}, [r1]!
-        next_tweak      q4, q3, q7, q6
-        veor            q0, q0, q3
-        next_tweak      q5, q4, q7, q6
-        veor            q1, q1, q4
-        veor            q2, q2, q5
-        bl              aes_decrypt_3x
-        veor            q0, q0, q3
-        veor            q1, q1, q4
-        veor            q2, q2, q5
-        vst1.8          {q0-q1}, [r0]!          @ write 3 pt blocks
-        vst1.8          {q2}, [r0]!
-        vmov            q3, q5
+        vld1.8          {q0-q1}, [r1]!          @ get 4 ct blocks
+        vld1.8          {q2-q3}, [r1]!
+        next_tweak      q5, q4, q15, q10
+        veor            q0, q0, q4
+        next_tweak      q6, q5, q15, q10
+        veor            q1, q1, q5
+        next_tweak      q7, q6, q15, q10
+        veor            q2, q2, q6
+        veor            q3, q3, q7
+        bl              aes_decrypt_4x
+        veor            q0, q0, q4
+        veor            q1, q1, q5
+        veor            q2, q2, q6
+        veor            q3, q3, q7
+        vst1.8          {q0-q1}, [r0]!          @ write 4 pt blocks
+        vst1.8          {q2-q3}, [r0]!
+        vmov            q4, q7
         teq             r4, #0
         beq             .Lxtsdecout
-        b               .Lxtsdecloop3x
+        b               .Lxtsdecloop4x
 .Lxtsdec1x:
-        adds            r4, r4, #3
+        adds            r4, r4, #64
         beq             .Lxtsdecout
+        subs            r4, r4, #16
 .Lxtsdecloop:
         vld1.8          {q0}, [r1]!
-        veor            q0, q0, q3
-        add             ip, r2, #32             @ 3rd round key
+        bmi             .Lxtsdeccts
+.Lxtsdecctsout:
+        veor            q0, q0, q4
         bl              aes_decrypt
-        veor            q0, q0, q3
+        veor            q0, q0, q4
         vst1.8          {q0}, [r0]!
-        subs            r4, r4, #1
+        teq             r4, #0
         beq             .Lxtsdecout
-        next_tweak      q3, q3, q7, q6
+        subs            r4, r4, #16
+        next_tweak      q4, q4, q15, q6
         b               .Lxtsdecloop
 .Lxtsdecout:
-        vst1.8          {q3}, [r5]
+        vst1.8          {q4}, [r5]
         pop             {r4-r6, pc}
+
+.Lxtsdeccts:
+        movw            ip, :lower16:.Lcts_permute_table
+        movt            ip, :upper16:.Lcts_permute_table
+
+        add             r1, r1, r4              @ rewind input pointer
+        add             r4, r4, #16             @ # bytes in final block
+        add             lr, ip, #32
+        add             ip, ip, r4
+        sub             lr, lr, r4
+        add             r4, r0, r4              @ output address of final block
+
+        next_tweak      q5, q4, q15, q6
+
+        vld1.8          {q1}, [r1]              @ load final partial block
+        vld1.8          {q2}, [ip]
+        vld1.8          {q3}, [lr]
+
+        veor            q0, q0, q5
+        bl              aes_decrypt
+        veor            q0, q0, q5
+
+        vtbl.8          d4, {d0-d1}, d4
+        vtbl.8          d5, {d0-d1}, d5
+        vtbx.8          d0, {d2-d3}, d6
+        vtbx.8          d1, {d2-d3}, d7
+
+        vst1.8          {q2}, [r4]              @ overlapping stores
+        mov             r4, #0
+        b               .Lxtsdecctsout
 ENDPROC(ce_aes_xts_decrypt)

 /*
@@ -508,8 +696,18 @@
  * operation on round key *src
  */
 ENTRY(ce_aes_invert)
-        vld1.8          {q0}, [r1]
+        vld1.32         {q0}, [r1]
         aesimc.8        q0, q0
-        vst1.8          {q0}, [r0]
+        vst1.32         {q0}, [r0]
         bx              lr
 ENDPROC(ce_aes_invert)
+
+        .section        ".rodata", "a"
+        .align          6
+.Lcts_permute_table:
+        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
+        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
+        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
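
Note: this table drives the vtbl/vtbx shuffles in the CTS tails. A NEON table lookup returns zero for any index byte of 0xff, so loading 16 index bytes starting at .Lcts_permute_table + n (with n the number of bytes past the last full block) moves the first n bytes of a register into its top n lanes and zero-fills the rest, while loading at offset 32 - n pulls a register's last n bytes down to its bottom lanes. A self-contained C model of that lookup; vtbl16 is a stand-in for the vtbl.8 instruction, not a kernel helper:

#include <stdint.h>
#include <stdio.h>

static const uint8_t cts_permute_table[48] = {
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
};

/* C model of vtbl.8 over a 16-byte table: out-of-range indices yield 0. */
static void vtbl16(uint8_t dst[16], const uint8_t src[16], const uint8_t idx[16])
{
        for (int i = 0; i < 16; i++)
                dst[i] = idx[i] < 16 ? src[idx[i]] : 0;
}

int main(void)
{
        uint8_t block[16], shuffled[16];
        int n = 5;                      /* e.g. a 21-byte message: 16 + 5 */

        for (int i = 0; i < 16; i++)
                block[i] = (uint8_t)i;
        vtbl16(shuffled, block, &cts_permute_table[n]);
        for (int i = 0; i < 16; i++)
                printf("%02x ", shuffled[i]);   /* 11 zero bytes, then 00..04 */
        printf("\n");
        return 0;
}
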