/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Scalar AES core transform
 *
 * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

	.text

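	/*
	 * Register aliases. The C glue code declares these routines roughly as
	 *
	 *   void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
	 *
	 * tt (the lookup table pointer) deliberately aliases in: the input
	 * block has been fully loaded by the time the table is first used.
	 */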
	rk	.req	x0
	out	.req	x1
	in	.req	x2
	rounds	.req	x3
	tt	.req	x2

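	/*
	 * __pair1: two table lookups for the encryption path. Extract the
	 * byte at bit position \shift from \in0 and \in1e and load the
	 * matching entries from the table at tt. When \op is 'b' (final
	 * round), the lookups are byte loads of the plain S-box values
	 * rather than 32-bit loads from the combined lookup table.
	 */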
	.macro	__pair1, sz, op, reg0, reg1, in0, in1e, in1d, shift
	.ifc	\op\shift, b0
	ubfiz	\reg0, \in0, #2, #8
	ubfiz	\reg1, \in1e, #2, #8
	.else
	ubfx	\reg0, \in0, #\shift, #8
	ubfx	\reg1, \in1e, #\shift, #8
	.endif

	/*
	 * AArch64 cannot do byte size indexed loads from a table containing
	 * 32-bit quantities, i.e., 'ldrb w12, [tt, w12, uxtw #2]' is not a
	 * valid instruction. So perform the shift explicitly first for the
	 * high bytes (the low byte is shifted implicitly by using ubfiz rather
	 * than ubfx above)
	 */
	.ifnc	\op, b
	ldr	\reg0, [tt, \reg0, uxtw #2]
	ldr	\reg1, [tt, \reg1, uxtw #2]
	.else
	.if	\shift > 0
	lsl	\reg0, \reg0, #2
	lsl	\reg1, \reg1, #2
	.endif
	ldrb	\reg0, [tt, \reg0, uxtw]
	ldrb	\reg1, [tt, \reg1, uxtw]
	.endif
	.endm

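	/*
	 * __pair0: decryption counterpart of __pair1. The second byte is
	 * taken from \in1d instead of \in1e, reflecting the opposite
	 * rotation used by InvShiftRows.
	 */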
	.macro	__pair0, sz, op, reg0, reg1, in0, in1e, in1d, shift
	ubfx	\reg0, \in0, #\shift, #8
	ubfx	\reg1, \in1d, #\shift, #8
	ldr\op	\reg0, [tt, \reg0, uxtw #\sz]
	ldr\op	\reg1, [tt, \reg1, uxtw #\sz]
	.endm

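	/*
	 * __hround: compute two output columns of one AES round from the
	 * four input columns \in0-\in3 and two round key words loaded from
	 * [rk]. Each output is the XOR of its round key word with four
	 * table entries, rotated by 0, 24, 16 and 8 bits respectively.
	 */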
	.macro	__hround, out0, out1, in0, in1, in2, in3, t0, t1, enc, sz, op
	ldp	\out0, \out1, [rk], #8

	__pair\enc	\sz, \op, w12, w13, \in0, \in1, \in3, 0
	__pair\enc	\sz, \op, w14, w15, \in1, \in2, \in0, 8
	__pair\enc	\sz, \op, w16, w17, \in2, \in3, \in1, 16
	__pair\enc	\sz, \op, \t0, \t1, \in3, \in0, \in2, 24

	eor	\out0, \out0, w12
	eor	\out1, \out1, w13
	eor	\out0, \out0, w14, ror #24
	eor	\out1, \out1, w15, ror #24
	eor	\out0, \out0, w16, ror #16
	eor	\out1, \out1, w17, ror #16
	eor	\out0, \out0, \t0, ror #8
	eor	\out1, \out1, \t1, ror #8
	.endm

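	/*
	 * fround/iround: one full forward/inverse round, producing the four
	 * output columns \out0-\out3 from the input columns \in0-\in3 via
	 * two __hround invocations, consuming four round key words.
	 */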
	.macro	fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
	__hround	\out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op
	__hround	\out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op
	.endm

	.macro	iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
	__hround	\out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op
	__hround	\out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op
	.endm

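	/*
	 * do_crypt: shared body of the encrypt and decrypt entry points.
	 * \round is the round macro (fround or iround), \ttab the table used
	 * for the full rounds, \ltab the table used for the final round, and
	 * \bsz the shift needed to scale byte indices into \ltab.
	 */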
	.macro	do_crypt, round, ttab, ltab, bsz
	ldp	w4, w5, [in]
	ldp	w6, w7, [in, #8]
	ldp	w8, w9, [rk], #16
	ldp	w10, w11, [rk, #-8]

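	/* on big-endian kernels, byte-swap the input words just loaded */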
CPU_BE(	rev	w4, w4		)
CPU_BE(	rev	w5, w5		)
CPU_BE(	rev	w6, w6		)
CPU_BE(	rev	w7, w7		)

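	/* initial AddRoundKey: XOR in the first four round key words */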
	eor	w4, w4, w8
	eor	w5, w5, w9
	eor	w6, w6, w10
	eor	w7, w7, w11

	adr_l	tt, \ttab

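	/*
	 * All but the last round are executed in the loop below, two rounds
	 * at a time, ping-ponging the state between w4-w7 and w8-w11. AES
	 * uses 10, 12 or 14 rounds; when bit 1 of the round count is set
	 * (10 or 14 rounds), enter at 1f so the count comes out right.
	 */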
	tbnz	rounds, #1, 1f

0:	\round	w8, w9, w10, w11, w4, w5, w6, w7
	\round	w4, w5, w6, w7, w8, w9, w10, w11

1:	subs	rounds, rounds, #4
	\round	w8, w9, w10, w11, w4, w5, w6, w7
	b.ls	3f
2:	\round	w4, w5, w6, w7, w8, w9, w10, w11
	b	0b
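	/* final round: switch to \ltab and finish with byte-sized S-box lookups */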
3:	adr_l	tt, \ltab
	\round	w4, w5, w6, w7, w8, w9, w10, w11, \bsz, b

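	/* on big-endian kernels, swap back to memory byte order before storing */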
CPU_BE(	rev	w4, w4		)
CPU_BE(	rev	w5, w5		)
CPU_BE(	rev	w6, w6		)
CPU_BE(	rev	w7, w7		)

	stp	w4, w5, [out]
	stp	w6, w7, [out, #8]
	ret
	.endm

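	/*
	 * Encryption uses crypto_ft_tab for the full rounds. For the final
	 * round, the plain S-box value sits at byte offset 1 of each 32-bit
	 * crypto_ft_tab entry, so byte loads from crypto_ft_tab + 1 with the
	 * index scaled by 4 (bsz = 2) yield the S-box directly. Decryption
	 * uses crypto_it_tab for the full rounds and the separate 256-byte
	 * crypto_aes_inv_sbox table (bsz = 0) for the final round.
	 */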
SYM_FUNC_START(__aes_arm64_encrypt)
	do_crypt	fround, crypto_ft_tab, crypto_ft_tab + 1, 2
SYM_FUNC_END(__aes_arm64_encrypt)

	.align	5
SYM_FUNC_START(__aes_arm64_decrypt)
	do_crypt	iround, crypto_it_tab, crypto_aes_inv_sbox, 0
SYM_FUNC_END(__aes_arm64_decrypt)