/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Scalar AES core transform
 *
 * Copyright (C) 2017 Linaro Ltd.
 * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

	.text
	.align		5

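	/*
	 * Register aliases.  The first four follow the AAPCS argument
	 * registers r0-r3, i.e. the assumed C-side prototype is along the
	 * lines of __aes_arm_encrypt(u32 *rk, int rounds, const u8 *in,
	 * u8 *out).  ttab (ip) holds the address of the lookup table in use.
	 */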
	rk		.req	r0
	rounds		.req	r1
	in		.req	r2
	out		.req	r3
	ttab		.req	ip

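	/*
	 * Scratch registers.  t1 and t2 deliberately overlap 'in' and 'out':
	 * 'in' is dead once the input block has been loaded, and 'out' is
	 * saved on the stack and reloaded just before the final store.
	 */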
	t0		.req	lr
	t1		.req	r2
	t2		.req	r3

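	/*
	 * Select byte \idx of \in and place it in \out: ARMv7+ uses ubfx to
	 * extract it to bit position 0, while earlier architectures leave it
	 * in place behind a mask (the shift is folded into __load instead).
	 */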
	.macro		__select, out, in, idx
	.if		__LINUX_ARM_ARCH__ < 7
	and		\out, \in, #0xff << (8 * \idx)
	.else
	ubfx		\out, \in, #(8 * \idx), #8
	.endif
	.endm

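	/*
	 * Load a (1 << \sz) byte wide table entry for the byte produced by
	 * __select above.  On pre-v7, that byte still sits at bit position
	 * 8 * \idx, so shift it down and scale it as part of the addressing
	 * mode.  \op may be 'b' for the byte-wide loads of the final round.
	 */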
	.macro		__load, out, in, idx, sz, op
	.if		__LINUX_ARM_ARCH__ < 7 && \idx > 0
	ldr\op		\out, [ttab, \in, lsr #(8 * \idx) - \sz]
	.else
	ldr\op		\out, [ttab, \in, lsl #\sz]
	.endif
	.endm

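	/*
	 * Half a round: compute two columns of the output state (\out0 and
	 * \out1).  Each column is the xor of four rotated table lookups, one
	 * per input column, plus a round key word fetched with 'ldm rk!'.
	 * \enc selects the forward or the inverse byte permutation, and
	 * \oldcpsr, when given, restores the interrupt state in the final
	 * round.
	 */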
	.macro		__hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op, oldcpsr
	__select	\out0, \in0, 0
	__select	t0, \in1, 1
	__load		\out0, \out0, 0, \sz, \op
	__load		t0, t0, 1, \sz, \op

	.if		\enc
	__select	\out1, \in1, 0
	__select	t1, \in2, 1
	.else
	__select	\out1, \in3, 0
	__select	t1, \in0, 1
	.endif
	__load		\out1, \out1, 0, \sz, \op
	__select	t2, \in2, 2
	__load		t1, t1, 1, \sz, \op
	__load		t2, t2, 2, \sz, \op

	eor		\out0, \out0, t0, ror #24

	__select	t0, \in3, 3
	.if		\enc
	__select	\t3, \in3, 2
	__select	\t4, \in0, 3
	.else
	__select	\t3, \in1, 2
	__select	\t4, \in2, 3
	.endif
	__load		\t3, \t3, 2, \sz, \op
	__load		t0, t0, 3, \sz, \op
	__load		\t4, \t4, 3, \sz, \op

	.ifnb		\oldcpsr
	/*
	 * This is the final round and we're done with all data-dependent table
	 * lookups, so we can safely re-enable interrupts.
	 */
	restore_irqs	\oldcpsr
	.endif

	eor		\out1, \out1, t1, ror #24
	eor		\out0, \out0, t2, ror #16
	ldm		rk!, {t1, t2}
	eor		\out1, \out1, \t3, ror #16
	eor		\out0, \out0, t0, ror #8
	eor		\out1, \out1, \t4, ror #8
	eor		\out0, \out0, t1
	eor		\out1, \out1, t2
	.endm

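	/*
	 * One full forward (fround) or inverse (iround) AES round: two
	 * __hround expansions produce all four output columns.  The final
	 * round passes \sz and \op to switch to byte-wide lookups, and
	 * \oldcpsr so the saved interrupt state can be restored.
	 */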
	.macro		fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr
	__hround	\out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op
	__hround	\out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op, \oldcpsr
	.endm

	.macro		iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr
	__hround	\out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op
	__hround	\out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op, \oldcpsr
	.endm

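	// Byte-swap a 32-bit word; open-coded on pre-v6, which lacks 'rev'.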
	.macro		__rev, out, in
	.if		__LINUX_ARM_ARCH__ < 6
	lsl		t0, \in, #24
	and		t1, \in, #0xff00
	and		t2, \in, #0xff0000
	orr		\out, t0, \in, lsr #24
	orr		\out, \out, t1, lsl #8
	orr		\out, \out, t2, lsr #8
	.else
	rev		\out, \in
	.endif
	.endm

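	/*
	 * Load the address of \sym, optionally with condition code \c: a
	 * literal pool load on pre-v7, a movw/movt pair otherwise.
	 */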
	.macro		__adrl, out, sym, c
	.if		__LINUX_ARM_ARCH__ < 7
	ldr\c		\out, =\sym
	.else
	movw\c		\out, #:lower16:\sym
	movt\c		\out, #:upper16:\sym
	.endif
	.endm

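	/*
	 * Shared body of the encryption and decryption routines.  \round is
	 * the round macro (fround or iround), \ttab the main lookup table and
	 * \ltab an optional separate table for the final round: the inverse
	 * S-box for decryption, or empty for encryption, in which case the
	 * final round does byte-wide lookups into \ttab itself at a one byte
	 * offset.  \bsz is log2 of the entry size of the final round table.
	 */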
	.macro		do_crypt, round, ttab, ltab, bsz
	push		{r3-r11, lr}

	// Load keys first, to reduce latency in case they're not cached yet.
	ldm		rk!, {r8-r11}

	ldr		r4, [in]
	ldr		r5, [in, #4]
	ldr		r6, [in, #8]
	ldr		r7, [in, #12]

#ifdef CONFIG_CPU_BIG_ENDIAN
	__rev		r4, r4
	__rev		r5, r5
	__rev		r6, r6
	__rev		r7, r7
#endif

	eor		r4, r4, r8
	eor		r5, r5, r9
	eor		r6, r6, r10
	eor		r7, r7, r11

	__adrl		ttab, \ttab
	/*
	 * Disable interrupts and prefetch the 1024-byte 'ft' or 'it' table into
	 * L1 cache, assuming cacheline size >= 32.  This is a hardening measure
	 * intended to make cache-timing attacks more difficult.  They may not
	 * be fully prevented, however; see the paper
	 * https://cr.yp.to/antiforgery/cachetiming-20050414.pdf
	 * ("Cache-timing attacks on AES") for a discussion of the many
	 * difficulties involved in writing truly constant-time AES software.
	 */
	save_and_disable_irqs	t0
	.set		i, 0
	.rept		1024 / 128
	ldr		r8, [ttab, #i + 0]
	ldr		r9, [ttab, #i + 32]
	ldr		r10, [ttab, #i + 64]
	ldr		r11, [ttab, #i + 96]
	.set		i, i + 128
	.endr
	push		{t0}		// oldcpsr

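	/*
	 * All rounds except the last are performed by the loop below.  The
	 * two entry points account for the different round counts: 10 and 14
	 * enter at 1:, 12 enters at 0:.
	 */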
	tst		rounds, #2
	bne		1f

0:	\round		r8, r9, r10, r11, r4, r5, r6, r7
	\round		r4, r5, r6, r7, r8, r9, r10, r11

1:	subs		rounds, rounds, #4
	\round		r8, r9, r10, r11, r4, r5, r6, r7
	bls		2f
	\round		r4, r5, r6, r7, r8, r9, r10, r11
	b		0b

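	/*
	 * Final round: the MixColumns step is omitted, so switch to byte-wide
	 * lookups, either into the separate \ltab or into \ttab at a one byte
	 * offset, and restore the interrupt state saved above.
	 */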
2:	.ifb		\ltab
	add		ttab, ttab, #1
	.else
	__adrl		ttab, \ltab
	// Prefetch inverse S-box for final round; see explanation above
	.set		i, 0
	.rept		256 / 64
	ldr		t0, [ttab, #i + 0]
	ldr		t1, [ttab, #i + 32]
	.set		i, i + 64
	.endr
	.endif

	pop		{rounds}	// oldcpsr
	\round		r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b, rounds

#ifdef CONFIG_CPU_BIG_ENDIAN
	__rev		r4, r4
	__rev		r5, r5
	__rev		r6, r6
	__rev		r7, r7
#endif

	ldr		out, [sp]

	str		r4, [out]
	str		r5, [out, #4]
	str		r6, [out, #8]
	str		r7, [out, #12]

	pop		{r3-r11, pc}

	.align		3
	.ltorg
	.endm

ENTRY(__aes_arm_encrypt)
	do_crypt	fround, crypto_ft_tab,, 2
ENDPROC(__aes_arm_encrypt)

	.align		5
ENTRY(__aes_arm_decrypt)
	do_crypt	iround, crypto_it_tab, crypto_aes_inv_sbox, 0
ENDPROC(__aes_arm_decrypt)