.. | .. |
---|
| 1 | +/* SPDX-License-Identifier: GPL-2.0-only */ |
---|
1 | 2 | /* |
---|
2 | 3 | * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES |
---|
3 | 4 | * |
---|
4 | 5 | * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> |
---|
5 | | - * |
---|
6 | | - * This program is free software; you can redistribute it and/or modify |
---|
7 | | - * it under the terms of the GNU General Public License version 2 as |
---|
8 | | - * published by the Free Software Foundation. |
---|
9 | 6 | */ |
---|
10 | 7 | |
---|
11 | 8 | /* included by aes-ce.S and aes-neon.S */ |
---|
.. | .. |
---|
13 | 10 | .text |
---|
14 | 11 | .align 4 |
---|
15 | 12 | |
---|
16 | | -aes_encrypt_block4x: |
---|
17 | | - encrypt_block4x v0, v1, v2, v3, w22, x21, x8, w7 |
---|
18 | | - ret |
---|
19 | | -ENDPROC(aes_encrypt_block4x) |
---|
| 13 | +#ifndef MAX_STRIDE |
---|
| 14 | +#define MAX_STRIDE 4 |
---|
| 15 | +#endif |
---|
20 | 16 | |
---|
21 | | -aes_decrypt_block4x: |
---|
22 | | - decrypt_block4x v0, v1, v2, v3, w22, x21, x8, w7 |
---|
| 17 | +#if MAX_STRIDE == 4 |
---|
| 18 | +#define ST4(x...) x |
---|
| 19 | +#define ST5(x...) |
---|
| 20 | +#else |
---|
| 21 | +#define ST4(x...) |
---|
| 22 | +#define ST5(x...) x |
---|
| 23 | +#endif |
---|
| 24 | + |
---|
| 25 | +SYM_FUNC_START_LOCAL(aes_encrypt_block4x) |
---|
| 26 | + encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7 |
---|
23 | 27 | ret |
---|
24 | | -ENDPROC(aes_decrypt_block4x) |
---|
| 28 | +SYM_FUNC_END(aes_encrypt_block4x) |
---|
| 29 | + |
---|
| 30 | +SYM_FUNC_START_LOCAL(aes_decrypt_block4x) |
---|
| 31 | + decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7 |
---|
| 32 | + ret |
---|
| 33 | +SYM_FUNC_END(aes_decrypt_block4x) |
---|
| 34 | + |
---|
| 35 | +#if MAX_STRIDE == 5 |
---|
| 36 | +SYM_FUNC_START_LOCAL(aes_encrypt_block5x) |
---|
| 37 | + encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7 |
---|
| 38 | + ret |
---|
| 39 | +SYM_FUNC_END(aes_encrypt_block5x) |
---|
| 40 | + |
---|
| 41 | +SYM_FUNC_START_LOCAL(aes_decrypt_block5x) |
---|
| 42 | + decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7 |
---|
| 43 | + ret |
---|
| 44 | +SYM_FUNC_END(aes_decrypt_block5x) |
---|
| 45 | +#endif |
---|
25 | 46 | |
---|
26 | 47 | /* |
---|
27 | 48 | * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, |
---|
.. | .. |
---|
30 | 51 | * int blocks) |
---|
31 | 52 | */ |
---|
32 | 53 | |
---|
33 | | -AES_ENTRY(aes_ecb_encrypt) |
---|
34 | | - frame_push 5 |
---|
| 54 | +AES_FUNC_START(aes_ecb_encrypt) |
---|
| 55 | + stp x29, x30, [sp, #-16]! |
---|
| 56 | + mov x29, sp |
---|
35 | 57 | |
---|
36 | | - mov x19, x0 |
---|
37 | | - mov x20, x1 |
---|
38 | | - mov x21, x2 |
---|
39 | | - mov x22, x3 |
---|
40 | | - mov x23, x4 |
---|
41 | | - |
---|
42 | | -.Lecbencrestart: |
---|
43 | | - enc_prepare w22, x21, x5 |
---|
| 58 | + enc_prepare w3, x2, x5 |
---|
44 | 59 | |
---|
45 | 60 | .LecbencloopNx: |
---|
46 | | - subs w23, w23, #4 |
---|
| 61 | + subs w4, w4, #MAX_STRIDE |
---|
47 | 62 | bmi .Lecbenc1x |
---|
48 | | - ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 pt blocks */ |
---|
49 | | - bl aes_encrypt_block4x |
---|
50 | | - st1 {v0.16b-v3.16b}, [x19], #64 |
---|
51 | | - cond_yield_neon .Lecbencrestart |
---|
| 63 | + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ |
---|
| 64 | +ST4( bl aes_encrypt_block4x ) |
---|
| 65 | +ST5( ld1 {v4.16b}, [x1], #16 ) |
---|
| 66 | +ST5( bl aes_encrypt_block5x ) |
---|
| 67 | + st1 {v0.16b-v3.16b}, [x0], #64 |
---|
| 68 | +ST5( st1 {v4.16b}, [x0], #16 ) |
---|
52 | 69 | b .LecbencloopNx |
---|
53 | 70 | .Lecbenc1x: |
---|
54 | | - adds w23, w23, #4 |
---|
| 71 | + adds w4, w4, #MAX_STRIDE |
---|
55 | 72 | beq .Lecbencout |
---|
56 | 73 | .Lecbencloop: |
---|
57 | | - ld1 {v0.16b}, [x20], #16 /* get next pt block */ |
---|
58 | | - encrypt_block v0, w22, x21, x5, w6 |
---|
59 | | - st1 {v0.16b}, [x19], #16 |
---|
60 | | - subs w23, w23, #1 |
---|
| 74 | + ld1 {v0.16b}, [x1], #16 /* get next pt block */ |
---|
| 75 | + encrypt_block v0, w3, x2, x5, w6 |
---|
| 76 | + st1 {v0.16b}, [x0], #16 |
---|
| 77 | + subs w4, w4, #1 |
---|
61 | 78 | bne .Lecbencloop |
---|
62 | 79 | .Lecbencout: |
---|
63 | | - frame_pop |
---|
| 80 | + ldp x29, x30, [sp], #16 |
---|
64 | 81 | ret |
---|
65 | | -AES_ENDPROC(aes_ecb_encrypt) |
---|
| 82 | +AES_FUNC_END(aes_ecb_encrypt) |
---|
66 | 83 | |
---|
67 | 84 | |
---|
68 | | -AES_ENTRY(aes_ecb_decrypt) |
---|
69 | | - frame_push 5 |
---|
| 85 | +AES_FUNC_START(aes_ecb_decrypt) |
---|
| 86 | + stp x29, x30, [sp, #-16]! |
---|
| 87 | + mov x29, sp |
---|
70 | 88 | |
---|
71 | | - mov x19, x0 |
---|
72 | | - mov x20, x1 |
---|
73 | | - mov x21, x2 |
---|
74 | | - mov x22, x3 |
---|
75 | | - mov x23, x4 |
---|
76 | | - |
---|
77 | | -.Lecbdecrestart: |
---|
78 | | - dec_prepare w22, x21, x5 |
---|
| 89 | + dec_prepare w3, x2, x5 |
---|
79 | 90 | |
---|
80 | 91 | .LecbdecloopNx: |
---|
81 | | - subs w23, w23, #4 |
---|
| 92 | + subs w4, w4, #MAX_STRIDE |
---|
82 | 93 | bmi .Lecbdec1x |
---|
83 | | - ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 ct blocks */ |
---|
84 | | - bl aes_decrypt_block4x |
---|
85 | | - st1 {v0.16b-v3.16b}, [x19], #64 |
---|
86 | | - cond_yield_neon .Lecbdecrestart |
---|
| 94 | + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ |
---|
| 95 | +ST4( bl aes_decrypt_block4x ) |
---|
| 96 | +ST5( ld1 {v4.16b}, [x1], #16 ) |
---|
| 97 | +ST5( bl aes_decrypt_block5x ) |
---|
| 98 | + st1 {v0.16b-v3.16b}, [x0], #64 |
---|
| 99 | +ST5( st1 {v4.16b}, [x0], #16 ) |
---|
87 | 100 | b .LecbdecloopNx |
---|
88 | 101 | .Lecbdec1x: |
---|
89 | | - adds w23, w23, #4 |
---|
| 102 | + adds w4, w4, #MAX_STRIDE |
---|
90 | 103 | beq .Lecbdecout |
---|
91 | 104 | .Lecbdecloop: |
---|
92 | | - ld1 {v0.16b}, [x20], #16 /* get next ct block */ |
---|
93 | | - decrypt_block v0, w22, x21, x5, w6 |
---|
94 | | - st1 {v0.16b}, [x19], #16 |
---|
95 | | - subs w23, w23, #1 |
---|
| 105 | + ld1 {v0.16b}, [x1], #16 /* get next ct block */ |
---|
| 106 | + decrypt_block v0, w3, x2, x5, w6 |
---|
| 107 | + st1 {v0.16b}, [x0], #16 |
---|
| 108 | + subs w4, w4, #1 |
---|
96 | 109 | bne .Lecbdecloop |
---|
97 | 110 | .Lecbdecout: |
---|
98 | | - frame_pop |
---|
| 111 | + ldp x29, x30, [sp], #16 |
---|
99 | 112 | ret |
---|
100 | | -AES_ENDPROC(aes_ecb_decrypt) |
---|
| 113 | +AES_FUNC_END(aes_ecb_decrypt) |
---|
101 | 114 | |
---|
102 | 115 | |
---|
103 | 116 | /* |
---|
.. | .. |
---|
105 | 118 | * int blocks, u8 iv[]) |
---|
106 | 119 | * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, |
---|
107 | 120 | * int blocks, u8 iv[]) |
---|
| 121 | + * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[], |
---|
| 122 | + * int rounds, int blocks, u8 iv[], |
---|
| 123 | + * u32 const rk2[]); |
---|
| 124 | + * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[], |
---|
| 125 | + * int rounds, int blocks, u8 iv[], |
---|
| 126 | + * u32 const rk2[]); |
---|
108 | 127 | */ |
---|
109 | 128 | |
---|
110 | | -AES_ENTRY(aes_cbc_encrypt) |
---|
111 | | - frame_push 6 |
---|
| 129 | +AES_FUNC_START(aes_essiv_cbc_encrypt) |
---|
| 130 | + ld1 {v4.16b}, [x5] /* get iv */ |
---|
112 | 131 | |
---|
113 | | - mov x19, x0 |
---|
114 | | - mov x20, x1 |
---|
115 | | - mov x21, x2 |
---|
116 | | - mov x22, x3 |
---|
117 | | - mov x23, x4 |
---|
118 | | - mov x24, x5 |
---|
| 132 | + mov w8, #14 /* AES-256: 14 rounds */ |
---|
| 133 | + enc_prepare w8, x6, x7 |
---|
| 134 | + encrypt_block v4, w8, x6, x7, w9 |
---|
| 135 | + enc_switch_key w3, x2, x6 |
---|
| 136 | + b .Lcbcencloop4x |
---|
119 | 137 | |
---|
120 | | -.Lcbcencrestart: |
---|
121 | | - ld1 {v4.16b}, [x24] /* get iv */ |
---|
122 | | - enc_prepare w22, x21, x6 |
---|
| 138 | +AES_FUNC_START(aes_cbc_encrypt) |
---|
| 139 | + ld1 {v4.16b}, [x5] /* get iv */ |
---|
| 140 | + enc_prepare w3, x2, x6 |
---|
123 | 141 | |
---|
124 | 142 | .Lcbcencloop4x: |
---|
125 | | - subs w23, w23, #4 |
---|
| 143 | + subs w4, w4, #4 |
---|
126 | 144 | bmi .Lcbcenc1x |
---|
127 | | - ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 pt blocks */ |
---|
| 145 | + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ |
---|
128 | 146 | eor v0.16b, v0.16b, v4.16b /* ..and xor with iv */ |
---|
129 | | - encrypt_block v0, w22, x21, x6, w7 |
---|
| 147 | + encrypt_block v0, w3, x2, x6, w7 |
---|
130 | 148 | eor v1.16b, v1.16b, v0.16b |
---|
131 | | - encrypt_block v1, w22, x21, x6, w7 |
---|
| 149 | + encrypt_block v1, w3, x2, x6, w7 |
---|
132 | 150 | eor v2.16b, v2.16b, v1.16b |
---|
133 | | - encrypt_block v2, w22, x21, x6, w7 |
---|
| 151 | + encrypt_block v2, w3, x2, x6, w7 |
---|
134 | 152 | eor v3.16b, v3.16b, v2.16b |
---|
135 | | - encrypt_block v3, w22, x21, x6, w7 |
---|
136 | | - st1 {v0.16b-v3.16b}, [x19], #64 |
---|
| 153 | + encrypt_block v3, w3, x2, x6, w7 |
---|
| 154 | + st1 {v0.16b-v3.16b}, [x0], #64 |
---|
137 | 155 | mov v4.16b, v3.16b |
---|
138 | | - st1 {v4.16b}, [x24] /* return iv */ |
---|
139 | | - cond_yield_neon .Lcbcencrestart |
---|
140 | 156 | b .Lcbcencloop4x |
---|
141 | 157 | .Lcbcenc1x: |
---|
142 | | - adds w23, w23, #4 |
---|
| 158 | + adds w4, w4, #4 |
---|
143 | 159 | beq .Lcbcencout |
---|
144 | 160 | .Lcbcencloop: |
---|
145 | | - ld1 {v0.16b}, [x20], #16 /* get next pt block */ |
---|
| 161 | + ld1 {v0.16b}, [x1], #16 /* get next pt block */ |
---|
146 | 162 | eor v4.16b, v4.16b, v0.16b /* ..and xor with iv */ |
---|
147 | | - encrypt_block v4, w22, x21, x6, w7 |
---|
148 | | - st1 {v4.16b}, [x19], #16 |
---|
149 | | - subs w23, w23, #1 |
---|
| 163 | + encrypt_block v4, w3, x2, x6, w7 |
---|
| 164 | + st1 {v4.16b}, [x0], #16 |
---|
| 165 | + subs w4, w4, #1 |
---|
150 | 166 | bne .Lcbcencloop |
---|
151 | 167 | .Lcbcencout: |
---|
152 | | - st1 {v4.16b}, [x24] /* return iv */ |
---|
153 | | - frame_pop |
---|
| 168 | + st1 {v4.16b}, [x5] /* return iv */ |
---|
154 | 169 | ret |
---|
155 | | -AES_ENDPROC(aes_cbc_encrypt) |
---|
| 170 | +AES_FUNC_END(aes_cbc_encrypt) |
---|
| 171 | +AES_FUNC_END(aes_essiv_cbc_encrypt) |
---|
156 | 172 | |
---|
| 173 | +AES_FUNC_START(aes_essiv_cbc_decrypt) |
---|
| 174 | + stp x29, x30, [sp, #-16]! |
---|
| 175 | + mov x29, sp |
---|
157 | 176 | |
---|
158 | | -AES_ENTRY(aes_cbc_decrypt) |
---|
159 | | - frame_push 6 |
---|
| 177 | + ld1 {cbciv.16b}, [x5] /* get iv */ |
---|
160 | 178 | |
---|
161 | | - mov x19, x0 |
---|
162 | | - mov x20, x1 |
---|
163 | | - mov x21, x2 |
---|
164 | | - mov x22, x3 |
---|
165 | | - mov x23, x4 |
---|
166 | | - mov x24, x5 |
---|
| 179 | + mov w8, #14 /* AES-256: 14 rounds */ |
---|
| 180 | + enc_prepare w8, x6, x7 |
---|
| 181 | + encrypt_block cbciv, w8, x6, x7, w9 |
---|
| 182 | + b .Lessivcbcdecstart |
---|
167 | 183 | |
---|
168 | | -.Lcbcdecrestart: |
---|
169 | | - ld1 {v7.16b}, [x24] /* get iv */ |
---|
170 | | - dec_prepare w22, x21, x6 |
---|
| 184 | +AES_FUNC_START(aes_cbc_decrypt) |
---|
| 185 | + stp x29, x30, [sp, #-16]! |
---|
| 186 | + mov x29, sp |
---|
| 187 | + |
---|
| 188 | + ld1 {cbciv.16b}, [x5] /* get iv */ |
---|
| 189 | +.Lessivcbcdecstart: |
---|
| 190 | + dec_prepare w3, x2, x6 |
---|
171 | 191 | |
---|
172 | 192 | .LcbcdecloopNx: |
---|
173 | | - subs w23, w23, #4 |
---|
| 193 | + subs w4, w4, #MAX_STRIDE |
---|
174 | 194 | bmi .Lcbcdec1x |
---|
175 | | - ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 ct blocks */ |
---|
| 195 | + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ |
---|
| 196 | +#if MAX_STRIDE == 5 |
---|
| 197 | + ld1 {v4.16b}, [x1], #16 /* get 1 ct block */ |
---|
| 198 | + mov v5.16b, v0.16b |
---|
| 199 | + mov v6.16b, v1.16b |
---|
| 200 | + mov v7.16b, v2.16b |
---|
| 201 | + bl aes_decrypt_block5x |
---|
| 202 | + sub x1, x1, #32 |
---|
| 203 | + eor v0.16b, v0.16b, cbciv.16b |
---|
| 204 | + eor v1.16b, v1.16b, v5.16b |
---|
| 205 | + ld1 {v5.16b}, [x1], #16 /* reload 1 ct block */ |
---|
| 206 | + ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */ |
---|
| 207 | + eor v2.16b, v2.16b, v6.16b |
---|
| 208 | + eor v3.16b, v3.16b, v7.16b |
---|
| 209 | + eor v4.16b, v4.16b, v5.16b |
---|
| 210 | +#else |
---|
176 | 211 | mov v4.16b, v0.16b |
---|
177 | 212 | mov v5.16b, v1.16b |
---|
178 | 213 | mov v6.16b, v2.16b |
---|
179 | 214 | bl aes_decrypt_block4x |
---|
180 | | - sub x20, x20, #16 |
---|
181 | | - eor v0.16b, v0.16b, v7.16b |
---|
| 215 | + sub x1, x1, #16 |
---|
| 216 | + eor v0.16b, v0.16b, cbciv.16b |
---|
182 | 217 | eor v1.16b, v1.16b, v4.16b |
---|
183 | | - ld1 {v7.16b}, [x20], #16 /* reload 1 ct block */ |
---|
| 218 | + ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */ |
---|
184 | 219 | eor v2.16b, v2.16b, v5.16b |
---|
185 | 220 | eor v3.16b, v3.16b, v6.16b |
---|
186 | | - st1 {v0.16b-v3.16b}, [x19], #64 |
---|
187 | | - st1 {v7.16b}, [x24] /* return iv */ |
---|
188 | | - cond_yield_neon .Lcbcdecrestart |
---|
| 221 | +#endif |
---|
| 222 | + st1 {v0.16b-v3.16b}, [x0], #64 |
---|
| 223 | +ST5( st1 {v4.16b}, [x0], #16 ) |
---|
189 | 224 | b .LcbcdecloopNx |
---|
190 | 225 | .Lcbcdec1x: |
---|
191 | | - adds w23, w23, #4 |
---|
| 226 | + adds w4, w4, #MAX_STRIDE |
---|
192 | 227 | beq .Lcbcdecout |
---|
193 | 228 | .Lcbcdecloop: |
---|
194 | | - ld1 {v1.16b}, [x20], #16 /* get next ct block */ |
---|
| 229 | + ld1 {v1.16b}, [x1], #16 /* get next ct block */ |
---|
195 | 230 | mov v0.16b, v1.16b /* ...and copy to v0 */ |
---|
196 | | - decrypt_block v0, w22, x21, x6, w7 |
---|
197 | | - eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */ |
---|
198 | | - mov v7.16b, v1.16b /* ct is next iv */ |
---|
199 | | - st1 {v0.16b}, [x19], #16 |
---|
200 | | - subs w23, w23, #1 |
---|
| 231 | + decrypt_block v0, w3, x2, x6, w7 |
---|
| 232 | + eor v0.16b, v0.16b, cbciv.16b /* xor with iv => pt */ |
---|
| 233 | + mov cbciv.16b, v1.16b /* ct is next iv */ |
---|
| 234 | + st1 {v0.16b}, [x0], #16 |
---|
| 235 | + subs w4, w4, #1 |
---|
201 | 236 | bne .Lcbcdecloop |
---|
202 | 237 | .Lcbcdecout: |
---|
203 | | - st1 {v7.16b}, [x24] /* return iv */ |
---|
204 | | - frame_pop |
---|
| 238 | + st1 {cbciv.16b}, [x5] /* return iv */ |
---|
| 239 | + ldp x29, x30, [sp], #16 |
---|
205 | 240 | ret |
---|
206 | | -AES_ENDPROC(aes_cbc_decrypt) |
---|
| 241 | +AES_FUNC_END(aes_cbc_decrypt) |
---|
| 242 | +AES_FUNC_END(aes_essiv_cbc_decrypt) |
---|
| 243 | + |
---|
| 244 | + |
---|
| 245 | + /* |
---|
| 246 | + * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[], |
---|
| 247 | + * int rounds, int bytes, u8 const iv[]) |
---|
| 248 | + * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[], |
---|
| 249 | + * int rounds, int bytes, u8 const iv[]) |
---|
| 250 | + */ |
---|
| 251 | + |
---|
| 252 | +AES_FUNC_START(aes_cbc_cts_encrypt) |
---|
| 253 | + adr_l x8, .Lcts_permute_table |
---|
| 254 | + sub x4, x4, #16 |
---|
| 255 | + add x9, x8, #32 |
---|
| 256 | + add x8, x8, x4 |
---|
| 257 | + sub x9, x9, x4 |
---|
| 258 | + ld1 {v3.16b}, [x8] |
---|
| 259 | + ld1 {v4.16b}, [x9] |
---|
| 260 | + |
---|
| 261 | + ld1 {v0.16b}, [x1], x4 /* overlapping loads */ |
---|
| 262 | + ld1 {v1.16b}, [x1] |
---|
| 263 | + |
---|
| 264 | + ld1 {v5.16b}, [x5] /* get iv */ |
---|
| 265 | + enc_prepare w3, x2, x6 |
---|
| 266 | + |
---|
| 267 | + eor v0.16b, v0.16b, v5.16b /* xor with iv */ |
---|
| 268 | + tbl v1.16b, {v1.16b}, v4.16b |
---|
| 269 | + encrypt_block v0, w3, x2, x6, w7 |
---|
| 270 | + |
---|
| 271 | + eor v1.16b, v1.16b, v0.16b |
---|
| 272 | + tbl v0.16b, {v0.16b}, v3.16b |
---|
| 273 | + encrypt_block v1, w3, x2, x6, w7 |
---|
| 274 | + |
---|
| 275 | + add x4, x0, x4 |
---|
| 276 | + st1 {v0.16b}, [x4] /* overlapping stores */ |
---|
| 277 | + st1 {v1.16b}, [x0] |
---|
| 278 | + ret |
---|
| 279 | +AES_FUNC_END(aes_cbc_cts_encrypt) |
---|
| 280 | + |
---|
| 281 | +AES_FUNC_START(aes_cbc_cts_decrypt) |
---|
| 282 | + adr_l x8, .Lcts_permute_table |
---|
| 283 | + sub x4, x4, #16 |
---|
| 284 | + add x9, x8, #32 |
---|
| 285 | + add x8, x8, x4 |
---|
| 286 | + sub x9, x9, x4 |
---|
| 287 | + ld1 {v3.16b}, [x8] |
---|
| 288 | + ld1 {v4.16b}, [x9] |
---|
| 289 | + |
---|
| 290 | + ld1 {v0.16b}, [x1], x4 /* overlapping loads */ |
---|
| 291 | + ld1 {v1.16b}, [x1] |
---|
| 292 | + |
---|
| 293 | + ld1 {v5.16b}, [x5] /* get iv */ |
---|
| 294 | + dec_prepare w3, x2, x6 |
---|
| 295 | + |
---|
| 296 | + decrypt_block v0, w3, x2, x6, w7 |
---|
| 297 | + tbl v2.16b, {v0.16b}, v3.16b |
---|
| 298 | + eor v2.16b, v2.16b, v1.16b |
---|
| 299 | + |
---|
| 300 | + tbx v0.16b, {v1.16b}, v4.16b |
---|
| 301 | + decrypt_block v0, w3, x2, x6, w7 |
---|
| 302 | + eor v0.16b, v0.16b, v5.16b /* xor with iv */ |
---|
| 303 | + |
---|
| 304 | + add x4, x0, x4 |
---|
| 305 | + st1 {v2.16b}, [x4] /* overlapping stores */ |
---|
| 306 | + st1 {v0.16b}, [x0] |
---|
| 307 | + ret |
---|
| 308 | +AES_FUNC_END(aes_cbc_cts_decrypt) |
---|
| 309 | + |
---|
| 310 | + .section ".rodata", "a" |
---|
| 311 | + .align 6 |
---|
| 312 | +.Lcts_permute_table: |
---|
| 313 | + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
---|
| 314 | + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
---|
| 315 | + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 |
---|
| 316 | + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf |
---|
| 317 | + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
---|
| 318 | + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
---|
| 319 | + .previous |
---|
207 | 320 | |
---|
208 | 321 | |
---|
209 | 322 | /* |
---|
.. | .. |
---|
211 | 324 | * int blocks, u8 ctr[]) |
---|
212 | 325 | */ |
---|
213 | 326 | |
---|
214 | | -AES_ENTRY(aes_ctr_encrypt) |
---|
215 | | - frame_push 6 |
---|
| 327 | +AES_FUNC_START(aes_ctr_encrypt) |
---|
| 328 | + stp x29, x30, [sp, #-16]! |
---|
| 329 | + mov x29, sp |
---|
216 | 330 | |
---|
217 | | - mov x19, x0 |
---|
218 | | - mov x20, x1 |
---|
219 | | - mov x21, x2 |
---|
220 | | - mov x22, x3 |
---|
221 | | - mov x23, x4 |
---|
222 | | - mov x24, x5 |
---|
| 331 | + enc_prepare w3, x2, x6 |
---|
| 332 | + ld1 {vctr.16b}, [x5] |
---|
223 | 333 | |
---|
224 | | -.Lctrrestart: |
---|
225 | | - enc_prepare w22, x21, x6 |
---|
226 | | - ld1 {v4.16b}, [x24] |
---|
227 | | - |
---|
228 | | - umov x6, v4.d[1] /* keep swabbed ctr in reg */ |
---|
| 334 | + umov x6, vctr.d[1] /* keep swabbed ctr in reg */ |
---|
229 | 335 | rev x6, x6 |
---|
| 336 | + cmn w6, w4 /* 32 bit overflow? */ |
---|
| 337 | + bcs .Lctrloop |
---|
230 | 338 | .LctrloopNx: |
---|
231 | | - subs w23, w23, #4 |
---|
| 339 | + subs w4, w4, #MAX_STRIDE |
---|
232 | 340 | bmi .Lctr1x |
---|
233 | | - cmn w6, #4 /* 32 bit overflow? */ |
---|
234 | | - bcs .Lctr1x |
---|
235 | 341 | add w7, w6, #1 |
---|
236 | | - mov v0.16b, v4.16b |
---|
| 342 | + mov v0.16b, vctr.16b |
---|
237 | 343 | add w8, w6, #2 |
---|
238 | | - mov v1.16b, v4.16b |
---|
| 344 | + mov v1.16b, vctr.16b |
---|
239 | 345 | add w9, w6, #3 |
---|
240 | | - mov v2.16b, v4.16b |
---|
| 346 | + mov v2.16b, vctr.16b |
---|
| 347 | + add w9, w6, #3 |
---|
241 | 348 | rev w7, w7 |
---|
242 | | - mov v3.16b, v4.16b |
---|
| 349 | + mov v3.16b, vctr.16b |
---|
243 | 350 | rev w8, w8 |
---|
| 351 | +ST5( mov v4.16b, vctr.16b ) |
---|
244 | 352 | mov v1.s[3], w7 |
---|
245 | 353 | rev w9, w9 |
---|
| 354 | +ST5( add w10, w6, #4 ) |
---|
246 | 355 | mov v2.s[3], w8 |
---|
| 356 | +ST5( rev w10, w10 ) |
---|
247 | 357 | mov v3.s[3], w9 |
---|
248 | | - ld1 {v5.16b-v7.16b}, [x20], #48 /* get 3 input blocks */ |
---|
249 | | - bl aes_encrypt_block4x |
---|
| 358 | +ST5( mov v4.s[3], w10 ) |
---|
| 359 | + ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */ |
---|
| 360 | +ST4( bl aes_encrypt_block4x ) |
---|
| 361 | +ST5( bl aes_encrypt_block5x ) |
---|
250 | 362 | eor v0.16b, v5.16b, v0.16b |
---|
251 | | - ld1 {v5.16b}, [x20], #16 /* get 1 input block */ |
---|
| 363 | +ST4( ld1 {v5.16b}, [x1], #16 ) |
---|
252 | 364 | eor v1.16b, v6.16b, v1.16b |
---|
| 365 | +ST5( ld1 {v5.16b-v6.16b}, [x1], #32 ) |
---|
253 | 366 | eor v2.16b, v7.16b, v2.16b |
---|
254 | 367 | eor v3.16b, v5.16b, v3.16b |
---|
255 | | - st1 {v0.16b-v3.16b}, [x19], #64 |
---|
256 | | - add x6, x6, #4 |
---|
| 368 | +ST5( eor v4.16b, v6.16b, v4.16b ) |
---|
| 369 | + st1 {v0.16b-v3.16b}, [x0], #64 |
---|
| 370 | +ST5( st1 {v4.16b}, [x0], #16 ) |
---|
| 371 | + add x6, x6, #MAX_STRIDE |
---|
257 | 372 | rev x7, x6 |
---|
258 | | - ins v4.d[1], x7 |
---|
259 | | - cbz w23, .Lctrout |
---|
260 | | - st1 {v4.16b}, [x24] /* return next CTR value */ |
---|
261 | | - cond_yield_neon .Lctrrestart |
---|
| 373 | + ins vctr.d[1], x7 |
---|
| 374 | + cbz w4, .Lctrout |
---|
262 | 375 | b .LctrloopNx |
---|
263 | 376 | .Lctr1x: |
---|
264 | | - adds w23, w23, #4 |
---|
| 377 | + adds w4, w4, #MAX_STRIDE |
---|
265 | 378 | beq .Lctrout |
---|
266 | 379 | .Lctrloop: |
---|
267 | | - mov v0.16b, v4.16b |
---|
268 | | - encrypt_block v0, w22, x21, x8, w7 |
---|
| 380 | + mov v0.16b, vctr.16b |
---|
| 381 | + encrypt_block v0, w3, x2, x8, w7 |
---|
269 | 382 | |
---|
270 | 383 | adds x6, x6, #1 /* increment BE ctr */ |
---|
271 | 384 | rev x7, x6 |
---|
272 | | - ins v4.d[1], x7 |
---|
| 385 | + ins vctr.d[1], x7 |
---|
273 | 386 | bcs .Lctrcarry /* overflow? */ |
---|
274 | 387 | |
---|
275 | 388 | .Lctrcarrydone: |
---|
276 | | - subs w23, w23, #1 |
---|
| 389 | + subs w4, w4, #1 |
---|
277 | 390 | bmi .Lctrtailblock /* blocks <0 means tail block */ |
---|
278 | | - ld1 {v3.16b}, [x20], #16 |
---|
| 391 | + ld1 {v3.16b}, [x1], #16 |
---|
279 | 392 | eor v3.16b, v0.16b, v3.16b |
---|
280 | | - st1 {v3.16b}, [x19], #16 |
---|
| 393 | + st1 {v3.16b}, [x0], #16 |
---|
281 | 394 | bne .Lctrloop |
---|
282 | 395 | |
---|
283 | 396 | .Lctrout: |
---|
284 | | - st1 {v4.16b}, [x24] /* return next CTR value */ |
---|
285 | | -.Lctrret: |
---|
286 | | - frame_pop |
---|
| 397 | + st1 {vctr.16b}, [x5] /* return next CTR value */ |
---|
| 398 | + ldp x29, x30, [sp], #16 |
---|
287 | 399 | ret |
---|
288 | 400 | |
---|
289 | 401 | .Lctrtailblock: |
---|
290 | | - st1 {v0.16b}, [x19] |
---|
291 | | - b .Lctrret |
---|
| 402 | + st1 {v0.16b}, [x0] |
---|
| 403 | + b .Lctrout |
---|
292 | 404 | |
---|
293 | 405 | .Lctrcarry: |
---|
294 | | - umov x7, v4.d[0] /* load upper word of ctr */ |
---|
| 406 | + umov x7, vctr.d[0] /* load upper word of ctr */ |
---|
295 | 407 | rev x7, x7 /* ... to handle the carry */ |
---|
296 | 408 | add x7, x7, #1 |
---|
297 | 409 | rev x7, x7 |
---|
298 | | - ins v4.d[0], x7 |
---|
| 410 | + ins vctr.d[0], x7 |
---|
299 | 411 | b .Lctrcarrydone |
---|
300 | | -AES_ENDPROC(aes_ctr_encrypt) |
---|
301 | | - .ltorg |
---|
| 412 | +AES_FUNC_END(aes_ctr_encrypt) |
---|
302 | 413 | |
---|
303 | 414 | |
---|
304 | 415 | /* |
---|
| 416 | + * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, |
---|
| 417 | + * int bytes, u8 const rk2[], u8 iv[], int first) |
---|
305 | 418 | * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, |
---|
306 | | - * int blocks, u8 const rk2[], u8 iv[], int first) |
---|
307 | | - * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, |
---|
308 | | - * int blocks, u8 const rk2[], u8 iv[], int first) |
---|
| 419 | + * int bytes, u8 const rk2[], u8 iv[], int first) |
---|
309 | 420 | */ |
---|
310 | 421 | |
---|
311 | | - .macro next_tweak, out, in, const, tmp |
---|
| 422 | + .macro next_tweak, out, in, tmp |
---|
312 | 423 | sshr \tmp\().2d, \in\().2d, #63 |
---|
313 | | - and \tmp\().16b, \tmp\().16b, \const\().16b |
---|
| 424 | + and \tmp\().16b, \tmp\().16b, xtsmask.16b |
---|
314 | 425 | add \out\().2d, \in\().2d, \in\().2d |
---|
315 | 426 | ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 |
---|
316 | 427 | eor \out\().16b, \out\().16b, \tmp\().16b |
---|
317 | 428 | .endm |
---|
318 | 429 | |
---|
319 | | -.Lxts_mul_x: |
---|
320 | | -CPU_LE( .quad 1, 0x87 ) |
---|
321 | | -CPU_BE( .quad 0x87, 1 ) |
---|
| 430 | + .macro xts_load_mask, tmp |
---|
| 431 | + movi xtsmask.2s, #0x1 |
---|
| 432 | + movi \tmp\().2s, #0x87 |
---|
| 433 | + uzp1 xtsmask.4s, xtsmask.4s, \tmp\().4s |
---|
| 434 | + .endm |
---|
322 | 435 | |
---|
323 | | -AES_ENTRY(aes_xts_encrypt) |
---|
324 | | - frame_push 6 |
---|
| 436 | +AES_FUNC_START(aes_xts_encrypt) |
---|
| 437 | + stp x29, x30, [sp, #-16]! |
---|
| 438 | + mov x29, sp |
---|
325 | 439 | |
---|
326 | | - mov x19, x0 |
---|
327 | | - mov x20, x1 |
---|
328 | | - mov x21, x2 |
---|
329 | | - mov x22, x3 |
---|
330 | | - mov x23, x4 |
---|
331 | | - mov x24, x6 |
---|
332 | | - |
---|
333 | | - ld1 {v4.16b}, [x24] |
---|
| 440 | + ld1 {v4.16b}, [x6] |
---|
| 441 | + xts_load_mask v8 |
---|
334 | 442 | cbz w7, .Lxtsencnotfirst |
---|
335 | 443 | |
---|
336 | 444 | enc_prepare w3, x5, x8 |
---|
| 445 | + xts_cts_skip_tw w7, .LxtsencNx |
---|
337 | 446 | encrypt_block v4, w3, x5, x8, w7 /* first tweak */ |
---|
338 | 447 | enc_switch_key w3, x2, x8 |
---|
339 | | - ldr q7, .Lxts_mul_x |
---|
340 | 448 | b .LxtsencNx |
---|
341 | 449 | |
---|
342 | | -.Lxtsencrestart: |
---|
343 | | - ld1 {v4.16b}, [x24] |
---|
344 | 450 | .Lxtsencnotfirst: |
---|
345 | | - enc_prepare w22, x21, x8 |
---|
| 451 | + enc_prepare w3, x2, x8 |
---|
346 | 452 | .LxtsencloopNx: |
---|
347 | | - ldr q7, .Lxts_mul_x |
---|
348 | | - next_tweak v4, v4, v7, v8 |
---|
| 453 | + next_tweak v4, v4, v8 |
---|
349 | 454 | .LxtsencNx: |
---|
350 | | - subs w23, w23, #4 |
---|
| 455 | + subs w4, w4, #64 |
---|
351 | 456 | bmi .Lxtsenc1x |
---|
352 | | - ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 pt blocks */ |
---|
353 | | - next_tweak v5, v4, v7, v8 |
---|
| 457 | + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ |
---|
| 458 | + next_tweak v5, v4, v8 |
---|
354 | 459 | eor v0.16b, v0.16b, v4.16b |
---|
355 | | - next_tweak v6, v5, v7, v8 |
---|
| 460 | + next_tweak v6, v5, v8 |
---|
356 | 461 | eor v1.16b, v1.16b, v5.16b |
---|
357 | 462 | eor v2.16b, v2.16b, v6.16b |
---|
358 | | - next_tweak v7, v6, v7, v8 |
---|
| 463 | + next_tweak v7, v6, v8 |
---|
359 | 464 | eor v3.16b, v3.16b, v7.16b |
---|
360 | 465 | bl aes_encrypt_block4x |
---|
361 | 466 | eor v3.16b, v3.16b, v7.16b |
---|
362 | 467 | eor v0.16b, v0.16b, v4.16b |
---|
363 | 468 | eor v1.16b, v1.16b, v5.16b |
---|
364 | 469 | eor v2.16b, v2.16b, v6.16b |
---|
365 | | - st1 {v0.16b-v3.16b}, [x19], #64 |
---|
| 470 | + st1 {v0.16b-v3.16b}, [x0], #64 |
---|
366 | 471 | mov v4.16b, v7.16b |
---|
367 | | - cbz w23, .Lxtsencout |
---|
368 | | - st1 {v4.16b}, [x24] |
---|
369 | | - cond_yield_neon .Lxtsencrestart |
---|
| 472 | + cbz w4, .Lxtsencret |
---|
| 473 | + xts_reload_mask v8 |
---|
370 | 474 | b .LxtsencloopNx |
---|
371 | 475 | .Lxtsenc1x: |
---|
372 | | - adds w23, w23, #4 |
---|
| 476 | + adds w4, w4, #64 |
---|
373 | 477 | beq .Lxtsencout |
---|
| 478 | + subs w4, w4, #16 |
---|
| 479 | + bmi .LxtsencctsNx |
---|
374 | 480 | .Lxtsencloop: |
---|
375 | | - ld1 {v1.16b}, [x20], #16 |
---|
376 | | - eor v0.16b, v1.16b, v4.16b |
---|
377 | | - encrypt_block v0, w22, x21, x8, w7 |
---|
| 481 | + ld1 {v0.16b}, [x1], #16 |
---|
| 482 | +.Lxtsencctsout: |
---|
378 | 483 | eor v0.16b, v0.16b, v4.16b |
---|
379 | | - st1 {v0.16b}, [x19], #16 |
---|
380 | | - subs w23, w23, #1 |
---|
381 | | - beq .Lxtsencout |
---|
382 | | - next_tweak v4, v4, v7, v8 |
---|
| 484 | + encrypt_block v0, w3, x2, x8, w7 |
---|
| 485 | + eor v0.16b, v0.16b, v4.16b |
---|
| 486 | + cbz w4, .Lxtsencout |
---|
| 487 | + subs w4, w4, #16 |
---|
| 488 | + next_tweak v4, v4, v8 |
---|
| 489 | + bmi .Lxtsenccts |
---|
| 490 | + st1 {v0.16b}, [x0], #16 |
---|
383 | 491 | b .Lxtsencloop |
---|
384 | 492 | .Lxtsencout: |
---|
385 | | - st1 {v4.16b}, [x24] |
---|
386 | | - frame_pop |
---|
| 493 | + st1 {v0.16b}, [x0] |
---|
| 494 | +.Lxtsencret: |
---|
| 495 | + st1 {v4.16b}, [x6] |
---|
| 496 | + ldp x29, x30, [sp], #16 |
---|
387 | 497 | ret |
---|
388 | | -AES_ENDPROC(aes_xts_encrypt) |
---|
389 | 498 | |
---|
| 499 | +.LxtsencctsNx: |
---|
| 500 | + mov v0.16b, v3.16b |
---|
| 501 | + sub x0, x0, #16 |
---|
| 502 | +.Lxtsenccts: |
---|
| 503 | + adr_l x8, .Lcts_permute_table |
---|
390 | 504 | |
---|
391 | | -AES_ENTRY(aes_xts_decrypt) |
---|
392 | | - frame_push 6 |
---|
| 505 | + add x1, x1, w4, sxtw /* rewind input pointer */ |
---|
| 506 | + add w4, w4, #16 /* # bytes in final block */ |
---|
| 507 | + add x9, x8, #32 |
---|
| 508 | + add x8, x8, x4 |
---|
| 509 | + sub x9, x9, x4 |
---|
| 510 | + add x4, x0, x4 /* output address of final block */ |
---|
393 | 511 | |
---|
394 | | - mov x19, x0 |
---|
395 | | - mov x20, x1 |
---|
396 | | - mov x21, x2 |
---|
397 | | - mov x22, x3 |
---|
398 | | - mov x23, x4 |
---|
399 | | - mov x24, x6 |
---|
| 512 | + ld1 {v1.16b}, [x1] /* load final block */ |
---|
| 513 | + ld1 {v2.16b}, [x8] |
---|
| 514 | + ld1 {v3.16b}, [x9] |
---|
400 | 515 | |
---|
401 | | - ld1 {v4.16b}, [x24] |
---|
| 516 | + tbl v2.16b, {v0.16b}, v2.16b |
---|
| 517 | + tbx v0.16b, {v1.16b}, v3.16b |
---|
| 518 | + st1 {v2.16b}, [x4] /* overlapping stores */ |
---|
| 519 | + mov w4, wzr |
---|
| 520 | + b .Lxtsencctsout |
---|
| 521 | +AES_FUNC_END(aes_xts_encrypt) |
---|
| 522 | + |
---|
| 523 | +AES_FUNC_START(aes_xts_decrypt) |
---|
| 524 | + stp x29, x30, [sp, #-16]! |
---|
| 525 | + mov x29, sp |
---|
| 526 | + |
---|
| 527 | + /* subtract 16 bytes if we are doing CTS */ |
---|
| 528 | + sub w8, w4, #0x10 |
---|
| 529 | + tst w4, #0xf |
---|
| 530 | + csel w4, w4, w8, eq |
---|
| 531 | + |
---|
| 532 | + ld1 {v4.16b}, [x6] |
---|
| 533 | + xts_load_mask v8 |
---|
| 534 | + xts_cts_skip_tw w7, .Lxtsdecskiptw |
---|
402 | 535 | cbz w7, .Lxtsdecnotfirst |
---|
403 | 536 | |
---|
404 | 537 | enc_prepare w3, x5, x8 |
---|
405 | 538 | encrypt_block v4, w3, x5, x8, w7 /* first tweak */ |
---|
| 539 | +.Lxtsdecskiptw: |
---|
406 | 540 | dec_prepare w3, x2, x8 |
---|
407 | | - ldr q7, .Lxts_mul_x |
---|
408 | 541 | b .LxtsdecNx |
---|
409 | 542 | |
---|
410 | | -.Lxtsdecrestart: |
---|
411 | | - ld1 {v4.16b}, [x24] |
---|
412 | 543 | .Lxtsdecnotfirst: |
---|
413 | | - dec_prepare w22, x21, x8 |
---|
| 544 | + dec_prepare w3, x2, x8 |
---|
414 | 545 | .LxtsdecloopNx: |
---|
415 | | - ldr q7, .Lxts_mul_x |
---|
416 | | - next_tweak v4, v4, v7, v8 |
---|
| 546 | + next_tweak v4, v4, v8 |
---|
417 | 547 | .LxtsdecNx: |
---|
418 | | - subs w23, w23, #4 |
---|
| 548 | + subs w4, w4, #64 |
---|
419 | 549 | bmi .Lxtsdec1x |
---|
420 | | - ld1 {v0.16b-v3.16b}, [x20], #64 /* get 4 ct blocks */ |
---|
421 | | - next_tweak v5, v4, v7, v8 |
---|
| 550 | + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ |
---|
| 551 | + next_tweak v5, v4, v8 |
---|
422 | 552 | eor v0.16b, v0.16b, v4.16b |
---|
423 | | - next_tweak v6, v5, v7, v8 |
---|
| 553 | + next_tweak v6, v5, v8 |
---|
424 | 554 | eor v1.16b, v1.16b, v5.16b |
---|
425 | 555 | eor v2.16b, v2.16b, v6.16b |
---|
426 | | - next_tweak v7, v6, v7, v8 |
---|
| 556 | + next_tweak v7, v6, v8 |
---|
427 | 557 | eor v3.16b, v3.16b, v7.16b |
---|
428 | 558 | bl aes_decrypt_block4x |
---|
429 | 559 | eor v3.16b, v3.16b, v7.16b |
---|
430 | 560 | eor v0.16b, v0.16b, v4.16b |
---|
431 | 561 | eor v1.16b, v1.16b, v5.16b |
---|
432 | 562 | eor v2.16b, v2.16b, v6.16b |
---|
433 | | - st1 {v0.16b-v3.16b}, [x19], #64 |
---|
| 563 | + st1 {v0.16b-v3.16b}, [x0], #64 |
---|
434 | 564 | mov v4.16b, v7.16b |
---|
435 | | - cbz w23, .Lxtsdecout |
---|
436 | | - st1 {v4.16b}, [x24] |
---|
437 | | - cond_yield_neon .Lxtsdecrestart |
---|
| 565 | + cbz w4, .Lxtsdecout |
---|
| 566 | + xts_reload_mask v8 |
---|
438 | 567 | b .LxtsdecloopNx |
---|
439 | 568 | .Lxtsdec1x: |
---|
440 | | - adds w23, w23, #4 |
---|
| 569 | + adds w4, w4, #64 |
---|
441 | 570 | beq .Lxtsdecout |
---|
| 571 | + subs w4, w4, #16 |
---|
442 | 572 | .Lxtsdecloop: |
---|
443 | | - ld1 {v1.16b}, [x20], #16 |
---|
444 | | - eor v0.16b, v1.16b, v4.16b |
---|
445 | | - decrypt_block v0, w22, x21, x8, w7 |
---|
| 573 | + ld1 {v0.16b}, [x1], #16 |
---|
| 574 | + bmi .Lxtsdeccts |
---|
| 575 | +.Lxtsdecctsout: |
---|
446 | 576 | eor v0.16b, v0.16b, v4.16b |
---|
447 | | - st1 {v0.16b}, [x19], #16 |
---|
448 | | - subs w23, w23, #1 |
---|
449 | | - beq .Lxtsdecout |
---|
450 | | - next_tweak v4, v4, v7, v8 |
---|
| 577 | + decrypt_block v0, w3, x2, x8, w7 |
---|
| 578 | + eor v0.16b, v0.16b, v4.16b |
---|
| 579 | + st1 {v0.16b}, [x0], #16 |
---|
| 580 | + cbz w4, .Lxtsdecout |
---|
| 581 | + subs w4, w4, #16 |
---|
| 582 | + next_tweak v4, v4, v8 |
---|
451 | 583 | b .Lxtsdecloop |
---|
452 | 584 | .Lxtsdecout: |
---|
453 | | - st1 {v4.16b}, [x24] |
---|
454 | | - frame_pop |
---|
| 585 | + st1 {v4.16b}, [x6] |
---|
| 586 | + ldp x29, x30, [sp], #16 |
---|
455 | 587 | ret |
---|
456 | | -AES_ENDPROC(aes_xts_decrypt) |
---|
| 588 | + |
---|
| 589 | +.Lxtsdeccts: |
---|
| 590 | + adr_l x8, .Lcts_permute_table |
---|
| 591 | + |
---|
| 592 | + add x1, x1, w4, sxtw /* rewind input pointer */ |
---|
| 593 | + add w4, w4, #16 /* # bytes in final block */ |
---|
| 594 | + add x9, x8, #32 |
---|
| 595 | + add x8, x8, x4 |
---|
| 596 | + sub x9, x9, x4 |
---|
| 597 | + add x4, x0, x4 /* output address of final block */ |
---|
| 598 | + |
---|
| 599 | + next_tweak v5, v4, v8 |
---|
| 600 | + |
---|
| 601 | + ld1 {v1.16b}, [x1] /* load final block */ |
---|
| 602 | + ld1 {v2.16b}, [x8] |
---|
| 603 | + ld1 {v3.16b}, [x9] |
---|
| 604 | + |
---|
| 605 | + eor v0.16b, v0.16b, v5.16b |
---|
| 606 | + decrypt_block v0, w3, x2, x8, w7 |
---|
| 607 | + eor v0.16b, v0.16b, v5.16b |
---|
| 608 | + |
---|
| 609 | + tbl v2.16b, {v0.16b}, v2.16b |
---|
| 610 | + tbx v0.16b, {v1.16b}, v3.16b |
---|
| 611 | + |
---|
| 612 | + st1 {v2.16b}, [x4] /* overlapping stores */ |
---|
| 613 | + mov w4, wzr |
---|
| 614 | + b .Lxtsdecctsout |
---|
| 615 | +AES_FUNC_END(aes_xts_decrypt) |
---|
457 | 616 | |
---|
458 | 617 | /* |
---|
459 | 618 | * aes_mac_update(u8 const in[], u32 const rk[], int rounds, |
---|
460 | 619 | * int blocks, u8 dg[], int enc_before, int enc_after) |
---|
461 | 620 | */ |
---|
462 | | -AES_ENTRY(aes_mac_update) |
---|
463 | | - frame_push 6 |
---|
464 | | - |
---|
465 | | - mov x19, x0 |
---|
466 | | - mov x20, x1 |
---|
467 | | - mov x21, x2 |
---|
468 | | - mov x22, x3 |
---|
469 | | - mov x23, x4 |
---|
470 | | - mov x24, x6 |
---|
471 | | - |
---|
472 | | - ld1 {v0.16b}, [x23] /* get dg */ |
---|
| 621 | +AES_FUNC_START(aes_mac_update) /* in: x0=in, x1=rk, w2=rounds, w3=blocks, x4=dg, w5=enc_before, w6=enc_after */ |
---|
| 622 | + ld1 {v0.16b}, [x4] /* get dg */ |
---|
473 | 623 | enc_prepare w2, x1, x7 |
---|
474 | 624 | cbz w5, .Lmacloop4x |
---|
475 | 625 | |
---|
476 | 626 | encrypt_block v0, w2, x1, x7, w8 |
---|
477 | 627 | |
---|
478 | 628 | .Lmacloop4x: |
---|
479 | | - subs w22, w22, #4 |
---|
| 629 | + subs w3, w3, #4 /* 4 full blocks left? */ |
---|
480 | 630 | bmi .Lmac1x |
---|
481 | | - ld1 {v1.16b-v4.16b}, [x19], #64 /* get next pt block */ |
---|
| 631 | + ld1 {v1.16b-v4.16b}, [x0], #64 /* get next pt block */ |
---|
482 | 632 | eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ |
---|
483 | | - encrypt_block v0, w21, x20, x7, w8 |
---|
| 633 | + encrypt_block v0, w2, x1, x7, w8 |
---|
484 | 634 | eor v0.16b, v0.16b, v2.16b |
---|
485 | | - encrypt_block v0, w21, x20, x7, w8 |
---|
| 635 | + encrypt_block v0, w2, x1, x7, w8 |
---|
486 | 636 | eor v0.16b, v0.16b, v3.16b |
---|
487 | | - encrypt_block v0, w21, x20, x7, w8 |
---|
| 637 | + encrypt_block v0, w2, x1, x7, w8 |
---|
488 | 638 | eor v0.16b, v0.16b, v4.16b |
---|
489 | | - cmp w22, wzr |
---|
490 | | - csinv x5, x24, xzr, eq |
---|
| 639 | + cmp w3, wzr |
---|
| 640 | + csinv x5, x6, xzr, eq /* x5 = enc_after if no blocks left, else -1 */ |
---|
491 | 641 | cbz w5, .Lmacout |
---|
492 | | - encrypt_block v0, w21, x20, x7, w8 |
---|
493 | | - st1 {v0.16b}, [x23] /* return dg */ |
---|
494 | | - cond_yield_neon .Lmacrestart |
---|
| 642 | + encrypt_block v0, w2, x1, x7, w8 |
---|
| 643 | + st1 {v0.16b}, [x4] /* return dg */ |
---|
| 644 | + cond_yield .Lmacout, x7, x8 /* NOTE(review): presumably bails out to .Lmacout when a reschedule is due - confirm macro */ |
---|
495 | 645 | b .Lmacloop4x |
---|
496 | 646 | .Lmac1x: |
---|
497 | | - add w22, w22, #4 |
---|
| 647 | + add w3, w3, #4 /* undo the 4-block decrement */ |
---|
498 | 648 | .Lmacloop: |
---|
499 | | - cbz w3, .Lmacout |
---|
500 | | - ld1 {v1.16b}, [x19], #16 /* get next pt block */ |
---|
| 649 | + cbz w3, .Lmacout |
---|
| 650 | + ld1 {v1.16b}, [x0], #16 /* get next pt block */ |
---|
501 | 651 | eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ |
---|
502 | 652 | |
---|
503 | | - subs w22, w22, #1 |
---|
504 | | - csinv x5, x24, xzr, eq |
---|
| 653 | + subs w3, w3, #1 |
---|
| 654 | + csinv x5, x6, xzr, eq /* enc_after on final block, else -1 */ |
---|
505 | 655 | cbz w5, .Lmacout |
---|
506 | 656 | |
---|
507 | 657 | .Lmacenc: |
---|
508 | | - encrypt_block v0, w21, x20, x7, w8 |
---|
| 658 | + encrypt_block v0, w2, x1, x7, w8 |
---|
509 | 659 | b .Lmacloop |
---|
510 | 660 | |
---|
511 | 661 | .Lmacout: |
---|
512 | | - st1 {v0.16b}, [x23] /* return dg */ |
---|
513 | | - frame_pop |
---|
| 662 | + st1 {v0.16b}, [x4] /* return dg */ |
---|
| 663 | + mov w0, w3 /* return remaining block count */ |
---|
514 | 664 | ret |
---|
515 | | - |
---|
516 | | -.Lmacrestart: |
---|
517 | | - ld1 {v0.16b}, [x23] /* get dg */ |
---|
518 | | - enc_prepare w21, x20, x0 |
---|
519 | | - b .Lmacloop4x |
---|
520 | | -AES_ENDPROC(aes_mac_update) |
---|
| 665 | +AES_FUNC_END(aes_mac_update) |
---|