2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/arch/arm64/crypto/aes-modes.S
@@ -1,11 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
  *
  * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 /* included by aes-ce.S and aes-neon.S */
@@ -13,15 +10,39 @@
 	.text
 	.align	4
 
-aes_encrypt_block4x:
-	encrypt_block4x	v0, v1, v2, v3, w22, x21, x8, w7
-	ret
-ENDPROC(aes_encrypt_block4x)
+#ifndef MAX_STRIDE
+#define MAX_STRIDE	4
+#endif
 
-aes_decrypt_block4x:
-	decrypt_block4x	v0, v1, v2, v3, w22, x21, x8, w7
+#if MAX_STRIDE == 4
+#define ST4(x...) x
+#define ST5(x...)
+#else
+#define ST4(x...)
+#define ST5(x...) x
+#endif
+
+SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
+	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
 	ret
-ENDPROC(aes_decrypt_block4x)
+SYM_FUNC_END(aes_encrypt_block4x)
+
+SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
+	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
+	ret
+SYM_FUNC_END(aes_decrypt_block4x)
+
+#if MAX_STRIDE == 5
+SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
+	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
+	ret
+SYM_FUNC_END(aes_encrypt_block5x)
+
+SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
+	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
+	ret
+SYM_FUNC_END(aes_decrypt_block5x)
+#endif
 
 /*
  * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
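
Note on the hunk above: the ST4()/ST5() helper macros let one copy of each mode's NEON loop carry both the 4-way and 5-way interleaved instruction sequences; only the variant matching MAX_STRIDE (default 4, which the including file may set to 5) survives preprocessing. A minimal standalone C sketch of the same compile-time dispatch, illustrative only:

    #include <stdio.h>

    #ifndef MAX_STRIDE
    #define MAX_STRIDE 4
    #endif

    /* Expand the argument list only for the configured stride. */
    #if MAX_STRIDE == 4
    #define ST4(x...) x
    #define ST5(x...)
    #else
    #define ST4(x...)
    #define ST5(x...) x
    #endif

    int main(void)
    {
    ST4(	puts("4-way path: process blocks v0..v3");	)
    ST5(	puts("5-way path: process blocks v0..v4");	)
    	return 0;
    }
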
@@ -30,74 +51,66 @@
  *		   int blocks)
  */
 
-AES_ENTRY(aes_ecb_encrypt)
-	frame_push	5
+AES_FUNC_START(aes_ecb_encrypt)
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
 
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-
-.Lecbencrestart:
-	enc_prepare	w22, x21, x5
+	enc_prepare	w3, x2, x5
 
 .LecbencloopNx:
-	subs	w23, w23, #4
+	subs	w4, w4, #MAX_STRIDE
 	bmi	.Lecbenc1x
-	ld1	{v0.16b-v3.16b}, [x20], #64	/* get 4 pt blocks */
-	bl	aes_encrypt_block4x
-	st1	{v0.16b-v3.16b}, [x19], #64
-	cond_yield_neon	.Lecbencrestart
+	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
+ST4(	bl	aes_encrypt_block4x		)
+ST5(	ld1	{v4.16b}, [x1], #16		)
+ST5(	bl	aes_encrypt_block5x		)
+	st1	{v0.16b-v3.16b}, [x0], #64
+ST5(	st1	{v4.16b}, [x0], #16		)
 	b	.LecbencloopNx
 .Lecbenc1x:
-	adds	w23, w23, #4
+	adds	w4, w4, #MAX_STRIDE
 	beq	.Lecbencout
 .Lecbencloop:
-	ld1	{v0.16b}, [x20], #16		/* get next pt block */
-	encrypt_block	v0, w22, x21, x5, w6
-	st1	{v0.16b}, [x19], #16
-	subs	w23, w23, #1
+	ld1	{v0.16b}, [x1], #16		/* get next pt block */
+	encrypt_block	v0, w3, x2, x5, w6
+	st1	{v0.16b}, [x0], #16
+	subs	w4, w4, #1
 	bne	.Lecbencloop
 .Lecbencout:
-	frame_pop
+	ldp	x29, x30, [sp], #16
 	ret
-AES_ENDPROC(aes_ecb_encrypt)
+AES_FUNC_END(aes_ecb_encrypt)
 
 
-AES_ENTRY(aes_ecb_decrypt)
-	frame_push	5
+AES_FUNC_START(aes_ecb_decrypt)
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
 
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-
-.Lecbdecrestart:
-	dec_prepare	w22, x21, x5
+	dec_prepare	w3, x2, x5
 
 .LecbdecloopNx:
-	subs	w23, w23, #4
+	subs	w4, w4, #MAX_STRIDE
 	bmi	.Lecbdec1x
-	ld1	{v0.16b-v3.16b}, [x20], #64	/* get 4 ct blocks */
-	bl	aes_decrypt_block4x
-	st1	{v0.16b-v3.16b}, [x19], #64
-	cond_yield_neon	.Lecbdecrestart
+	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
ST4(	bl	aes_decrypt_block4x		)
ST5(	ld1	{v4.16b}, [x1], #16		)
ST5(	bl	aes_decrypt_block5x		)
+	st1	{v0.16b-v3.16b}, [x0], #64
+ST5(	st1	{v4.16b}, [x0], #16		)
 	b	.LecbdecloopNx
 .Lecbdec1x:
-	adds	w23, w23, #4
+	adds	w4, w4, #MAX_STRIDE
 	beq	.Lecbdecout
 .Lecbdecloop:
-	ld1	{v0.16b}, [x20], #16		/* get next ct block */
-	decrypt_block	v0, w22, x21, x5, w6
-	st1	{v0.16b}, [x19], #16
-	subs	w23, w23, #1
+	ld1	{v0.16b}, [x1], #16		/* get next ct block */
+	decrypt_block	v0, w3, x2, x5, w6
+	st1	{v0.16b}, [x0], #16
+	subs	w4, w4, #1
 	bne	.Lecbdecloop
 .Lecbdecout:
-	frame_pop
+	ldp	x29, x30, [sp], #16
 	ret
-AES_ENDPROC(aes_ecb_decrypt)
+AES_FUNC_END(aes_ecb_decrypt)
 
 
 /*
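
Note: both ECB routines above now share one loop shape: subtract MAX_STRIDE from the block count, run the interleaved fast path while the result stays non-negative, then add MAX_STRIDE back and finish one block at a time. The frame_push/frame_pop pairs are gone along with cond_yield_neon, so a plain stp/ldp of x29/x30 is the only prologue left. A C sketch of the loop structure, where aes_encrypt_one() is a hypothetical one-block helper, not a kernel API:

    void aes_encrypt_one(unsigned char *out, const unsigned char *in);

    #define MAX_STRIDE 4	/* or 5, as configured above */

    void ecb_encrypt(unsigned char *out, const unsigned char *in, int blocks)
    {
    	while ((blocks -= MAX_STRIDE) >= 0) {	/* subs w4, w4, #MAX_STRIDE; bmi */
    		for (int i = 0; i < MAX_STRIDE; i++)
    			aes_encrypt_one(out + 16 * i, in + 16 * i);
    		in  += 16 * MAX_STRIDE;
    		out += 16 * MAX_STRIDE;
    	}
    	blocks += MAX_STRIDE;			/* adds w4, w4, #MAX_STRIDE */
    	while (blocks-- > 0) {			/* .Lecbencloop tail */
    		aes_encrypt_one(out, in);
    		in  += 16;
    		out += 16;
    	}
    }
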
@@ -105,105 +118,205 @@
  *		   int blocks, u8 iv[])
  * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
  *		   int blocks, u8 iv[])
+ * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
+ *			 int rounds, int blocks, u8 iv[],
+ *			 u32 const rk2[]);
+ * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
+ *			 int rounds, int blocks, u8 iv[],
+ *			 u32 const rk2[]);
  */
 
-AES_ENTRY(aes_cbc_encrypt)
-	frame_push	6
+AES_FUNC_START(aes_essiv_cbc_encrypt)
+	ld1	{v4.16b}, [x5]			/* get iv */
 
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x5
+	mov	w8, #14				/* AES-256: 14 rounds */
+	enc_prepare	w8, x6, x7
+	encrypt_block	v4, w8, x6, x7, w9
+	enc_switch_key	w3, x2, x6
+	b	.Lcbcencloop4x
 
-.Lcbcencrestart:
-	ld1	{v4.16b}, [x24]			/* get iv */
-	enc_prepare	w22, x21, x6
+AES_FUNC_START(aes_cbc_encrypt)
+	ld1	{v4.16b}, [x5]			/* get iv */
+	enc_prepare	w3, x2, x6
 
 .Lcbcencloop4x:
-	subs	w23, w23, #4
+	subs	w4, w4, #4
 	bmi	.Lcbcenc1x
-	ld1	{v0.16b-v3.16b}, [x20], #64	/* get 4 pt blocks */
+	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
 	eor	v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
-	encrypt_block	v0, w22, x21, x6, w7
+	encrypt_block	v0, w3, x2, x6, w7
 	eor	v1.16b, v1.16b, v0.16b
-	encrypt_block	v1, w22, x21, x6, w7
+	encrypt_block	v1, w3, x2, x6, w7
 	eor	v2.16b, v2.16b, v1.16b
-	encrypt_block	v2, w22, x21, x6, w7
+	encrypt_block	v2, w3, x2, x6, w7
 	eor	v3.16b, v3.16b, v2.16b
-	encrypt_block	v3, w22, x21, x6, w7
-	st1	{v0.16b-v3.16b}, [x19], #64
+	encrypt_block	v3, w3, x2, x6, w7
+	st1	{v0.16b-v3.16b}, [x0], #64
 	mov	v4.16b, v3.16b
-	st1	{v4.16b}, [x24]			/* return iv */
-	cond_yield_neon	.Lcbcencrestart
 	b	.Lcbcencloop4x
 .Lcbcenc1x:
-	adds	w23, w23, #4
+	adds	w4, w4, #4
 	beq	.Lcbcencout
 .Lcbcencloop:
-	ld1	{v0.16b}, [x20], #16		/* get next pt block */
+	ld1	{v0.16b}, [x1], #16		/* get next pt block */
 	eor	v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
-	encrypt_block	v4, w22, x21, x6, w7
-	st1	{v4.16b}, [x19], #16
-	subs	w23, w23, #1
+	encrypt_block	v4, w3, x2, x6, w7
+	st1	{v4.16b}, [x0], #16
+	subs	w4, w4, #1
 	bne	.Lcbcencloop
 .Lcbcencout:
-	st1	{v4.16b}, [x24]			/* return iv */
-	frame_pop
+	st1	{v4.16b}, [x5]			/* return iv */
 	ret
-AES_ENDPROC(aes_cbc_encrypt)
+AES_FUNC_END(aes_cbc_encrypt)
+AES_FUNC_END(aes_essiv_cbc_encrypt)
 
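
Note: aes_essiv_cbc_encrypt is a second entry point that falls through into the ordinary CBC loop. It first derives the per-request IV by encrypting the caller's IV with the secondary schedule rk2, which this code assumes is always AES-256 (hence the hard-coded 14 rounds), then switches to the data key. A C sketch of just the derivation step, with hypothetical helper names:

    #include <string.h>

    struct aes_key;	/* stands in for the expanded rk2 schedule */
    void aes_encrypt_rk(const struct aes_key *rk, int rounds, unsigned char blk[16]);

    void essiv_derive_iv(unsigned char iv[16], const unsigned char req_iv[16],
    		     const struct aes_key *rk2)
    {
    	memcpy(iv, req_iv, 16);		/* ld1 {v4.16b}, [x5] */
    	aes_encrypt_rk(rk2, 14, iv);	/* encrypt_block v4, w8, x6, x7, w9 */
    	/* execution then joins .Lcbcencloop4x with iv as the chain value */
    }
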
+AES_FUNC_START(aes_essiv_cbc_decrypt)
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
 
-AES_ENTRY(aes_cbc_decrypt)
-	frame_push	6
+	ld1	{cbciv.16b}, [x5]		/* get iv */
 
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x5
+	mov	w8, #14				/* AES-256: 14 rounds */
+	enc_prepare	w8, x6, x7
+	encrypt_block	cbciv, w8, x6, x7, w9
+	b	.Lessivcbcdecstart
 
-.Lcbcdecrestart:
-	ld1	{v7.16b}, [x24]			/* get iv */
-	dec_prepare	w22, x21, x6
+AES_FUNC_START(aes_cbc_decrypt)
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
+
+	ld1	{cbciv.16b}, [x5]		/* get iv */
+.Lessivcbcdecstart:
+	dec_prepare	w3, x2, x6
 
 .LcbcdecloopNx:
-	subs	w23, w23, #4
+	subs	w4, w4, #MAX_STRIDE
 	bmi	.Lcbcdec1x
-	ld1	{v0.16b-v3.16b}, [x20], #64	/* get 4 ct blocks */
+	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
+#if MAX_STRIDE == 5
+	ld1	{v4.16b}, [x1], #16		/* get 1 ct block */
+	mov	v5.16b, v0.16b
+	mov	v6.16b, v1.16b
+	mov	v7.16b, v2.16b
+	bl	aes_decrypt_block5x
+	sub	x1, x1, #32
+	eor	v0.16b, v0.16b, cbciv.16b
+	eor	v1.16b, v1.16b, v5.16b
+	ld1	{v5.16b}, [x1], #16		/* reload 1 ct block */
+	ld1	{cbciv.16b}, [x1], #16		/* reload 1 ct block */
+	eor	v2.16b, v2.16b, v6.16b
+	eor	v3.16b, v3.16b, v7.16b
+	eor	v4.16b, v4.16b, v5.16b
+#else
 	mov	v4.16b, v0.16b
 	mov	v5.16b, v1.16b
 	mov	v6.16b, v2.16b
 	bl	aes_decrypt_block4x
-	sub	x20, x20, #16
-	eor	v0.16b, v0.16b, v7.16b
+	sub	x1, x1, #16
+	eor	v0.16b, v0.16b, cbciv.16b
 	eor	v1.16b, v1.16b, v4.16b
-	ld1	{v7.16b}, [x20], #16		/* reload 1 ct block */
+	ld1	{cbciv.16b}, [x1], #16		/* reload 1 ct block */
 	eor	v2.16b, v2.16b, v5.16b
 	eor	v3.16b, v3.16b, v6.16b
-	st1	{v0.16b-v3.16b}, [x19], #64
-	st1	{v7.16b}, [x24]			/* return iv */
-	cond_yield_neon	.Lcbcdecrestart
+#endif
+	st1	{v0.16b-v3.16b}, [x0], #64
+ST5(	st1	{v4.16b}, [x0], #16		)
 	b	.LcbcdecloopNx
 .Lcbcdec1x:
-	adds	w23, w23, #4
+	adds	w4, w4, #MAX_STRIDE
 	beq	.Lcbcdecout
 .Lcbcdecloop:
-	ld1	{v1.16b}, [x20], #16		/* get next ct block */
+	ld1	{v1.16b}, [x1], #16		/* get next ct block */
 	mov	v0.16b, v1.16b			/* ...and copy to v0 */
-	decrypt_block	v0, w22, x21, x6, w7
-	eor	v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
-	mov	v7.16b, v1.16b			/* ct is next iv */
-	st1	{v0.16b}, [x19], #16
-	subs	w23, w23, #1
+	decrypt_block	v0, w3, x2, x6, w7
+	eor	v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
+	mov	cbciv.16b, v1.16b		/* ct is next iv */
+	st1	{v0.16b}, [x0], #16
+	subs	w4, w4, #1
 	bne	.Lcbcdecloop
 .Lcbcdecout:
-	st1	{v7.16b}, [x24]			/* return iv */
-	frame_pop
+	st1	{cbciv.16b}, [x5]		/* return iv */
+	ldp	x29, x30, [sp], #16
 	ret
-AES_ENDPROC(aes_cbc_decrypt)
+AES_FUNC_END(aes_cbc_decrypt)
+AES_FUNC_END(aes_essiv_cbc_decrypt)
+
+
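
Note: CBC decryption is parallelizable, but every output block still needs the previous ciphertext block. In the 5-way variant above all of v0-v7 are busy, so the code saves v0-v2 into v5-v7 before the call and reloads two ciphertext blocks from memory afterwards. The recurrence being computed, as a C sketch (aes_decrypt_one() is hypothetical):

    #include <string.h>

    void aes_decrypt_one(unsigned char *out, const unsigned char *in);

    void cbc_decrypt(unsigned char *out, const unsigned char *in,
    		 int blocks, unsigned char cbciv[16])
    {
    	while (blocks-- > 0) {
    		unsigned char ct[16];

    		memcpy(ct, in, 16);		/* ct is the next IV */
    		aes_decrypt_one(out, in);
    		for (int i = 0; i < 16; i++)
    			out[i] ^= cbciv[i];	/* eor with previous ct (or IV) */
    		memcpy(cbciv, ct, 16);		/* mov cbciv.16b, v1.16b */
    		in  += 16;
    		out += 16;
    	}
    }
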
+	/*
+	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
+	 *		       int rounds, int bytes, u8 const iv[])
+	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
+	 *		       int rounds, int bytes, u8 const iv[])
+	 */
+
+AES_FUNC_START(aes_cbc_cts_encrypt)
+	adr_l	x8, .Lcts_permute_table
+	sub	x4, x4, #16
+	add	x9, x8, #32
+	add	x8, x8, x4
+	sub	x9, x9, x4
+	ld1	{v3.16b}, [x8]
+	ld1	{v4.16b}, [x9]
+
+	ld1	{v0.16b}, [x1], x4		/* overlapping loads */
+	ld1	{v1.16b}, [x1]
+
+	ld1	{v5.16b}, [x5]			/* get iv */
+	enc_prepare	w3, x2, x6
+
+	eor	v0.16b, v0.16b, v5.16b		/* xor with iv */
+	tbl	v1.16b, {v1.16b}, v4.16b
+	encrypt_block	v0, w3, x2, x6, w7
+
+	eor	v1.16b, v1.16b, v0.16b
+	tbl	v0.16b, {v0.16b}, v3.16b
+	encrypt_block	v1, w3, x2, x6, w7
+
+	add	x4, x0, x4
+	st1	{v0.16b}, [x4]			/* overlapping stores */
+	st1	{v1.16b}, [x0]
+	ret
+AES_FUNC_END(aes_cbc_cts_encrypt)
+
+AES_FUNC_START(aes_cbc_cts_decrypt)
+	adr_l	x8, .Lcts_permute_table
+	sub	x4, x4, #16
+	add	x9, x8, #32
+	add	x8, x8, x4
+	sub	x9, x9, x4
+	ld1	{v3.16b}, [x8]
+	ld1	{v4.16b}, [x9]
+
+	ld1	{v0.16b}, [x1], x4		/* overlapping loads */
+	ld1	{v1.16b}, [x1]
+
+	ld1	{v5.16b}, [x5]			/* get iv */
+	dec_prepare	w3, x2, x6
+
+	decrypt_block	v0, w3, x2, x6, w7
+	tbl	v2.16b, {v0.16b}, v3.16b
+	eor	v2.16b, v2.16b, v1.16b
+
+	tbx	v0.16b, {v1.16b}, v4.16b
+	decrypt_block	v0, w3, x2, x6, w7
+	eor	v0.16b, v0.16b, v5.16b		/* xor with iv */
+
+	add	x4, x0, x4
+	st1	{v2.16b}, [x4]			/* overlapping stores */
+	st1	{v0.16b}, [x0]
+	ret
+AES_FUNC_END(aes_cbc_cts_decrypt)
+
+	.section	".rodata", "a"
+	.align	6
+.Lcts_permute_table:
+	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte	 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
+	.byte	 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
+	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.previous
 
 
 /*
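
Note: the CTS entry points handle a final 16 < bytes <= 32 span with ciphertext stealing, as used by cts(cbc(aes)). Indexing .Lcts_permute_table at bytes-16 yields tbl/tbx control vectors whose 0xff entries produce zero bytes, so the partial block is shuffled and zero-padded without byte loops, and the overlapping loads/stores avoid scalar copies. A C sketch of the encrypt-side tail under that reading (aes_encrypt_blk() is hypothetical):

    #include <string.h>

    void aes_encrypt_blk(unsigned char blk[16]);	/* E_k in place */

    /* in/out hold the last 16 + n bytes, 0 < n <= 16 */
    void cbc_cts_encrypt_tail(unsigned char *out, const unsigned char *in,
    			  int n, const unsigned char iv[16])
    {
    	unsigned char cprev[16], last[16] = { 0 };

    	for (int i = 0; i < 16; i++)
    		cprev[i] = in[i] ^ iv[i];	/* chain the last full block */
    	aes_encrypt_blk(cprev);			/* C' = E(P ^ iv) */

    	memcpy(last, in + 16, n);		/* partial block, zero padded */
    	for (int i = 0; i < 16; i++)
    		last[i] ^= cprev[i];		/* == (Pn || 0) ^ C', per CBC */
    	aes_encrypt_blk(last);

    	memcpy(out, last, 16);			/* full block goes first... */
    	memcpy(out + 16, cprev, n);		/* ...stolen head of C' goes last */
    }
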
@@ -211,310 +324,342 @@
  *		   int blocks, u8 ctr[])
  */
 
-AES_ENTRY(aes_ctr_encrypt)
-	frame_push	6
+AES_FUNC_START(aes_ctr_encrypt)
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
 
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x5
+	enc_prepare	w3, x2, x6
+	ld1	{vctr.16b}, [x5]
 
-.Lctrrestart:
-	enc_prepare	w22, x21, x6
-	ld1	{v4.16b}, [x24]
-
-	umov	x6, v4.d[1]		/* keep swabbed ctr in reg */
+	umov	x6, vctr.d[1]		/* keep swabbed ctr in reg */
 	rev	x6, x6
+	cmn	w6, w4			/* 32 bit overflow? */
+	bcs	.Lctrloop
 .LctrloopNx:
-	subs	w23, w23, #4
+	subs	w4, w4, #MAX_STRIDE
 	bmi	.Lctr1x
-	cmn	w6, #4			/* 32 bit overflow? */
-	bcs	.Lctr1x
 	add	w7, w6, #1
-	mov	v0.16b, v4.16b
+	mov	v0.16b, vctr.16b
 	add	w8, w6, #2
-	mov	v1.16b, v4.16b
+	mov	v1.16b, vctr.16b
 	add	w9, w6, #3
-	mov	v2.16b, v4.16b
+	mov	v2.16b, vctr.16b
+	add	w9, w6, #3
 	rev	w7, w7
-	mov	v3.16b, v4.16b
+	mov	v3.16b, vctr.16b
 	rev	w8, w8
+ST5(	mov	v4.16b, vctr.16b	)
 	mov	v1.s[3], w7
 	rev	w9, w9
+ST5(	add	w10, w6, #4		)
 	mov	v2.s[3], w8
+ST5(	rev	w10, w10		)
 	mov	v3.s[3], w9
-	ld1	{v5.16b-v7.16b}, [x20], #48	/* get 3 input blocks */
-	bl	aes_encrypt_block4x
+ST5(	mov	v4.s[3], w10		)
+	ld1	{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
+ST4(	bl	aes_encrypt_block4x	)
+ST5(	bl	aes_encrypt_block5x	)
 	eor	v0.16b, v5.16b, v0.16b
-	ld1	{v5.16b}, [x20], #16		/* get 1 input block */
+ST4(	ld1	{v5.16b}, [x1], #16	)
 	eor	v1.16b, v6.16b, v1.16b
+ST5(	ld1	{v5.16b-v6.16b}, [x1], #32	)
 	eor	v2.16b, v7.16b, v2.16b
 	eor	v3.16b, v5.16b, v3.16b
-	st1	{v0.16b-v3.16b}, [x19], #64
-	add	x6, x6, #4
+ST5(	eor	v4.16b, v6.16b, v4.16b	)
+	st1	{v0.16b-v3.16b}, [x0], #64
+ST5(	st1	{v4.16b}, [x0], #16	)
+	add	x6, x6, #MAX_STRIDE
 	rev	x7, x6
-	ins	v4.d[1], x7
-	cbz	w23, .Lctrout
-	st1	{v4.16b}, [x24]		/* return next CTR value */
-	cond_yield_neon	.Lctrrestart
+	ins	vctr.d[1], x7
+	cbz	w4, .Lctrout
 	b	.LctrloopNx
 .Lctr1x:
-	adds	w23, w23, #4
+	adds	w4, w4, #MAX_STRIDE
 	beq	.Lctrout
 .Lctrloop:
-	mov	v0.16b, v4.16b
-	encrypt_block	v0, w22, x21, x8, w7
+	mov	v0.16b, vctr.16b
+	encrypt_block	v0, w3, x2, x8, w7
 
 	adds	x6, x6, #1		/* increment BE ctr */
 	rev	x7, x6
-	ins	v4.d[1], x7
+	ins	vctr.d[1], x7
 	bcs	.Lctrcarry		/* overflow? */
 
 .Lctrcarrydone:
-	subs	w23, w23, #1
+	subs	w4, w4, #1
 	bmi	.Lctrtailblock		/* blocks <0 means tail block */
-	ld1	{v3.16b}, [x20], #16
+	ld1	{v3.16b}, [x1], #16
 	eor	v3.16b, v0.16b, v3.16b
-	st1	{v3.16b}, [x19], #16
+	st1	{v3.16b}, [x0], #16
 	bne	.Lctrloop
 
 .Lctrout:
-	st1	{v4.16b}, [x24]		/* return next CTR value */
-.Lctrret:
-	frame_pop
+	st1	{vctr.16b}, [x5]	/* return next CTR value */
+	ldp	x29, x30, [sp], #16
 	ret
 
 .Lctrtailblock:
-	st1	{v0.16b}, [x19]
-	b	.Lctrret
+	st1	{v0.16b}, [x0]
+	b	.Lctrout
 
 .Lctrcarry:
-	umov	x7, v4.d[0]		/* load upper word of ctr */
+	umov	x7, vctr.d[0]		/* load upper word of ctr */
 	rev	x7, x7			/* ... to handle the carry */
 	add	x7, x7, #1
 	rev	x7, x7
-	ins	v4.d[0], x7
+	ins	vctr.d[0], x7
 	b	.Lctrcarrydone
-AES_ENDPROC(aes_ctr_encrypt)
-	.ltorg
+AES_FUNC_END(aes_ctr_encrypt)
 
 
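
Note: the old code re-tested for 32-bit counter overflow on every iteration; the rewrite tests once up front. cmn w6, w4 adds the block count to the low 32 bits of the byte-swapped counter, and if that carries, the whole request takes the scalar .Lctrloop, which propagates the carry into the upper 64 bits via .Lctrcarry. The check, sketched in C:

    #include <stdint.h>

    /* cmn w6, w4: carry is set iff the low 32 counter bits would
     * wrap while processing 'blocks' blocks in this call.
     */
    int need_scalar_path(uint64_t ctr_lo64, uint32_t blocks)
    {
    	return (uint64_t)(uint32_t)ctr_lo64 + blocks > 0xffffffffu;
    }
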
 /*
+ * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
+ *		   int bytes, u8 const rk2[], u8 iv[], int first)
  * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
- *		   int blocks, u8 const rk2[], u8 iv[], int first)
- * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
- *		   int blocks, u8 const rk2[], u8 iv[], int first)
+ *		   int bytes, u8 const rk2[], u8 iv[], int first)
  */
 
-	.macro		next_tweak, out, in, const, tmp
+	.macro		next_tweak, out, in, tmp
 	sshr	\tmp\().2d,  \in\().2d,   #63
-	and	\tmp\().16b, \tmp\().16b, \const\().16b
+	and	\tmp\().16b, \tmp\().16b, xtsmask.16b
 	add	\out\().2d,  \in\().2d,   \in\().2d
 	ext	\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
 	eor	\out\().16b, \out\().16b, \tmp\().16b
 	.endm
 
-.Lxts_mul_x:
-CPU_LE(	.quad	1, 0x87 )
-CPU_BE(	.quad	0x87, 1 )
+	.macro		xts_load_mask, tmp
+	movi	xtsmask.2s, #0x1
+	movi	\tmp\().2s, #0x87
+	uzp1	xtsmask.4s, xtsmask.4s, \tmp\().4s
+	.endm
 
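
Note: next_tweak is the standard XTS tweak update, multiplication by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1. The add doubles both 64-bit lanes, sshr/and extract the two carry masks, and ext swaps them so the low lane's carry feeds the high lane while the high lane's carry becomes the 0x87 reduction; xts_load_mask just builds the {1, 0x87} lane constants with movi/uzp1 instead of the old literal pool load. Equivalent C, with t[0] holding the low 64 bits:

    #include <stdint.h>

    void next_tweak(uint64_t out[2], const uint64_t t[2])
    {
    	uint64_t carry_lo = (uint64_t)((int64_t)t[0] >> 63);	/* sshr #63 */
    	uint64_t carry_hi = (uint64_t)((int64_t)t[1] >> 63);

    	out[0] = (t[0] << 1) ^ (carry_hi & 0x87);	/* reduce overflow bit */
    	out[1] = (t[1] << 1) ^ (carry_lo & 0x1);	/* carry into high lane */
    }
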
-AES_ENTRY(aes_xts_encrypt)
-	frame_push	6
+AES_FUNC_START(aes_xts_encrypt)
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
 
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x6
-
-	ld1	{v4.16b}, [x24]
+	ld1	{v4.16b}, [x6]
+	xts_load_mask	v8
 	cbz	w7, .Lxtsencnotfirst
 
 	enc_prepare	w3, x5, x8
+	xts_cts_skip_tw	w7, .LxtsencNx
 	encrypt_block	v4, w3, x5, x8, w7	/* first tweak */
 	enc_switch_key	w3, x2, x8
-	ldr	q7, .Lxts_mul_x
 	b	.LxtsencNx
 
-.Lxtsencrestart:
-	ld1	{v4.16b}, [x24]
 .Lxtsencnotfirst:
-	enc_prepare	w22, x21, x8
+	enc_prepare	w3, x2, x8
 .LxtsencloopNx:
-	ldr	q7, .Lxts_mul_x
-	next_tweak	v4, v4, v7, v8
+	next_tweak	v4, v4, v8
 .LxtsencNx:
-	subs	w23, w23, #4
+	subs	w4, w4, #64
 	bmi	.Lxtsenc1x
-	ld1	{v0.16b-v3.16b}, [x20], #64	/* get 4 pt blocks */
-	next_tweak	v5, v4, v7, v8
+	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
+	next_tweak	v5, v4, v8
 	eor	v0.16b, v0.16b, v4.16b
-	next_tweak	v6, v5, v7, v8
+	next_tweak	v6, v5, v8
 	eor	v1.16b, v1.16b, v5.16b
 	eor	v2.16b, v2.16b, v6.16b
-	next_tweak	v7, v6, v7, v8
+	next_tweak	v7, v6, v8
 	eor	v3.16b, v3.16b, v7.16b
 	bl	aes_encrypt_block4x
 	eor	v3.16b, v3.16b, v7.16b
 	eor	v0.16b, v0.16b, v4.16b
 	eor	v1.16b, v1.16b, v5.16b
 	eor	v2.16b, v2.16b, v6.16b
-	st1	{v0.16b-v3.16b}, [x19], #64
+	st1	{v0.16b-v3.16b}, [x0], #64
 	mov	v4.16b, v7.16b
-	cbz	w23, .Lxtsencout
-	st1	{v4.16b}, [x24]
-	cond_yield_neon	.Lxtsencrestart
+	cbz	w4, .Lxtsencret
+	xts_reload_mask	v8
 	b	.LxtsencloopNx
 .Lxtsenc1x:
-	adds	w23, w23, #4
+	adds	w4, w4, #64
 	beq	.Lxtsencout
+	subs	w4, w4, #16
+	bmi	.LxtsencctsNx
 .Lxtsencloop:
-	ld1	{v1.16b}, [x20], #16
-	eor	v0.16b, v1.16b, v4.16b
-	encrypt_block	v0, w22, x21, x8, w7
+	ld1	{v0.16b}, [x1], #16
+.Lxtsencctsout:
 	eor	v0.16b, v0.16b, v4.16b
-	st1	{v0.16b}, [x19], #16
-	subs	w23, w23, #1
-	beq	.Lxtsencout
-	next_tweak	v4, v4, v7, v8
+	encrypt_block	v0, w3, x2, x8, w7
+	eor	v0.16b, v0.16b, v4.16b
+	cbz	w4, .Lxtsencout
+	subs	w4, w4, #16
+	next_tweak	v4, v4, v8
+	bmi	.Lxtsenccts
+	st1	{v0.16b}, [x0], #16
 	b	.Lxtsencloop
 .Lxtsencout:
-	st1	{v4.16b}, [x24]
-	frame_pop
+	st1	{v0.16b}, [x0]
+.Lxtsencret:
+	st1	{v4.16b}, [x6]
+	ldp	x29, x30, [sp], #16
 	ret
-AES_ENDPROC(aes_xts_encrypt)
 
+.LxtsencctsNx:
+	mov	v0.16b, v3.16b
+	sub	x0, x0, #16
+.Lxtsenccts:
+	adr_l	x8, .Lcts_permute_table
 
-AES_ENTRY(aes_xts_decrypt)
-	frame_push	6
+	add	x1, x1, w4, sxtw	/* rewind input pointer */
+	add	w4, w4, #16		/* # bytes in final block */
+	add	x9, x8, #32
+	add	x8, x8, x4
+	sub	x9, x9, x4
+	add	x4, x0, x4		/* output address of final block */
 
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x6
+	ld1	{v1.16b}, [x1]		/* load final block */
+	ld1	{v2.16b}, [x8]
+	ld1	{v3.16b}, [x9]
 
-	ld1	{v4.16b}, [x24]
+	tbl	v2.16b, {v0.16b}, v2.16b
+	tbx	v0.16b, {v1.16b}, v3.16b
+	st1	{v2.16b}, [x4]		/* overlapping stores */
+	mov	w4, wzr
+	b	.Lxtsencctsout
+AES_FUNC_END(aes_xts_encrypt)
+
+AES_FUNC_START(aes_xts_decrypt)
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
+
+	/* subtract 16 bytes if we are doing CTS */
+	sub	w8, w4, #0x10
+	tst	w4, #0xf
+	csel	w4, w4, w8, eq
+
+	ld1	{v4.16b}, [x6]
+	xts_load_mask	v8
+	xts_cts_skip_tw	w7, .Lxtsdecskiptw
 	cbz	w7, .Lxtsdecnotfirst
 
 	enc_prepare	w3, x5, x8
 	encrypt_block	v4, w3, x5, x8, w7	/* first tweak */
+.Lxtsdecskiptw:
 	dec_prepare	w3, x2, x8
-	ldr	q7, .Lxts_mul_x
 	b	.LxtsdecNx
 
-.Lxtsdecrestart:
-	ld1	{v4.16b}, [x24]
 .Lxtsdecnotfirst:
-	dec_prepare	w22, x21, x8
+	dec_prepare	w3, x2, x8
 .LxtsdecloopNx:
-	ldr	q7, .Lxts_mul_x
-	next_tweak	v4, v4, v7, v8
+	next_tweak	v4, v4, v8
 .LxtsdecNx:
-	subs	w23, w23, #4
+	subs	w4, w4, #64
 	bmi	.Lxtsdec1x
-	ld1	{v0.16b-v3.16b}, [x20], #64	/* get 4 ct blocks */
-	next_tweak	v5, v4, v7, v8
+	ld1	{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
+	next_tweak	v5, v4, v8
 	eor	v0.16b, v0.16b, v4.16b
-	next_tweak	v6, v5, v7, v8
+	next_tweak	v6, v5, v8
 	eor	v1.16b, v1.16b, v5.16b
 	eor	v2.16b, v2.16b, v6.16b
-	next_tweak	v7, v6, v7, v8
+	next_tweak	v7, v6, v8
 	eor	v3.16b, v3.16b, v7.16b
 	bl	aes_decrypt_block4x
 	eor	v3.16b, v3.16b, v7.16b
 	eor	v0.16b, v0.16b, v4.16b
 	eor	v1.16b, v1.16b, v5.16b
 	eor	v2.16b, v2.16b, v6.16b
-	st1	{v0.16b-v3.16b}, [x19], #64
+	st1	{v0.16b-v3.16b}, [x0], #64
 	mov	v4.16b, v7.16b
-	cbz	w23, .Lxtsdecout
-	st1	{v4.16b}, [x24]
-	cond_yield_neon	.Lxtsdecrestart
+	cbz	w4, .Lxtsdecout
+	xts_reload_mask	v8
 	b	.LxtsdecloopNx
 .Lxtsdec1x:
-	adds	w23, w23, #4
+	adds	w4, w4, #64
 	beq	.Lxtsdecout
+	subs	w4, w4, #16
 .Lxtsdecloop:
-	ld1	{v1.16b}, [x20], #16
-	eor	v0.16b, v1.16b, v4.16b
-	decrypt_block	v0, w22, x21, x8, w7
+	ld1	{v0.16b}, [x1], #16
+	bmi	.Lxtsdeccts
+.Lxtsdecctsout:
 	eor	v0.16b, v0.16b, v4.16b
-	st1	{v0.16b}, [x19], #16
-	subs	w23, w23, #1
-	beq	.Lxtsdecout
-	next_tweak	v4, v4, v7, v8
+	decrypt_block	v0, w3, x2, x8, w7
+	eor	v0.16b, v0.16b, v4.16b
+	st1	{v0.16b}, [x0], #16
+	cbz	w4, .Lxtsdecout
+	subs	w4, w4, #16
+	next_tweak	v4, v4, v8
 	b	.Lxtsdecloop
 .Lxtsdecout:
-	st1	{v4.16b}, [x24]
-	frame_pop
+	st1	{v4.16b}, [x6]
+	ldp	x29, x30, [sp], #16
 	ret
-AES_ENDPROC(aes_xts_decrypt)
+
+.Lxtsdeccts:
+	adr_l	x8, .Lcts_permute_table
+
+	add	x1, x1, w4, sxtw	/* rewind input pointer */
+	add	w4, w4, #16		/* # bytes in final block */
+	add	x9, x8, #32
+	add	x8, x8, x4
+	sub	x9, x9, x4
+	add	x4, x0, x4		/* output address of final block */
+
+	next_tweak	v5, v4, v8
+
+	ld1	{v1.16b}, [x1]		/* load final block */
+	ld1	{v2.16b}, [x8]
+	ld1	{v3.16b}, [x9]
+
+	eor	v0.16b, v0.16b, v5.16b
+	decrypt_block	v0, w3, x2, x8, w7
+	eor	v0.16b, v0.16b, v5.16b
+
+	tbl	v2.16b, {v0.16b}, v2.16b
+	tbx	v0.16b, {v1.16b}, v3.16b
+
+	st1	{v2.16b}, [x4]		/* overlapping stores */
+	mov	w4, wzr
+	b	.Lxtsdecctsout
+AES_FUNC_END(aes_xts_decrypt)
 
 /*
  * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
  *		   int blocks, u8 dg[], int enc_before, int enc_after)
  */
-AES_ENTRY(aes_mac_update)
-	frame_push	6
-
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x6
-
-	ld1	{v0.16b}, [x23]			/* get dg */
+AES_FUNC_START(aes_mac_update)
+	ld1	{v0.16b}, [x4]			/* get dg */
 	enc_prepare	w2, x1, x7
 	cbz	w5, .Lmacloop4x
 
 	encrypt_block	v0, w2, x1, x7, w8
 
 .Lmacloop4x:
-	subs	w22, w22, #4
+	subs	w3, w3, #4
 	bmi	.Lmac1x
-	ld1	{v1.16b-v4.16b}, [x19], #64	/* get next pt block */
+	ld1	{v1.16b-v4.16b}, [x0], #64	/* get next pt block */
 	eor	v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
-	encrypt_block	v0, w21, x20, x7, w8
+	encrypt_block	v0, w2, x1, x7, w8
 	eor	v0.16b, v0.16b, v2.16b
-	encrypt_block	v0, w21, x20, x7, w8
+	encrypt_block	v0, w2, x1, x7, w8
 	eor	v0.16b, v0.16b, v3.16b
-	encrypt_block	v0, w21, x20, x7, w8
+	encrypt_block	v0, w2, x1, x7, w8
 	eor	v0.16b, v0.16b, v4.16b
-	cmp	w22, wzr
-	csinv	x5, x24, xzr, eq
+	cmp	w3, wzr
+	csinv	x5, x6, xzr, eq
 	cbz	w5, .Lmacout
-	encrypt_block	v0, w21, x20, x7, w8
-	st1	{v0.16b}, [x23]			/* return dg */
-	cond_yield_neon	.Lmacrestart
+	encrypt_block	v0, w2, x1, x7, w8
+	st1	{v0.16b}, [x4]			/* return dg */
+	cond_yield	.Lmacout, x7, x8
 	b	.Lmacloop4x
 .Lmac1x:
-	add	w22, w22, #4
+	add	w3, w3, #4
 .Lmacloop:
-	cbz	w22, .Lmacout
-	ld1	{v1.16b}, [x19], #16		/* get next pt block */
+	cbz	w3, .Lmacout
+	ld1	{v1.16b}, [x0], #16		/* get next pt block */
 	eor	v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
 
-	subs	w22, w22, #1
-	csinv	x5, x24, xzr, eq
+	subs	w3, w3, #1
+	csinv	x5, x6, xzr, eq
 	cbz	w5, .Lmacout
 
 .Lmacenc:
-	encrypt_block	v0, w21, x20, x7, w8
+	encrypt_block	v0, w2, x1, x7, w8
 	b	.Lmacloop
 
 .Lmacout:
-	st1	{v0.16b}, [x23]			/* return dg */
-	frame_pop
+	st1	{v0.16b}, [x4]			/* return dg */
+	mov	w0, w3
 	ret
-
-.Lmacrestart:
-	ld1	{v0.16b}, [x23]			/* get dg */
-	enc_prepare	w21, x20, x0
-	b	.Lmacloop4x
-AES_ENDPROC(aes_mac_update)
+AES_FUNC_END(aes_mac_update)
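
Note: aes_mac_update now yields through the generic cond_yield macro: when a reschedule is due it stores the digest and returns, with w0 carrying the number of blocks left so the caller can re-invoke it, which is why the old .Lmacrestart re-entry path could go. The MAC computed by the loops, as a C sketch with a hypothetical one-block primitive (a straight-line version always returns 0):

    struct aes_key;
    void aes_encrypt_rk(const struct aes_key *rk, unsigned char dg[16]);

    int mac_update(const unsigned char *in, const struct aes_key *rk,
    	       int blocks, unsigned char dg[16],
    	       int enc_before, int enc_after)
    {
    	if (enc_before)
    		aes_encrypt_rk(rk, dg);
    	while (blocks--) {
    		for (int i = 0; i < 16; i++)
    			dg[i] ^= *in++;
    		if (blocks || enc_after)	/* csinv/cbz pair above */
    			aes_encrypt_rk(rk, dg);
    	}
    	return 0;			/* w0: blocks left unprocessed */
    }
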