/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
 *
 * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        .text
        .arch           armv8-a+crypto

        k0              .req    v0
        k1              .req    v1
        k2              .req    v2
        k3              .req    v3

        t0              .req    v4
        t1              .req    v5

        dga             .req    q6
        dgav            .req    v6
        dgb             .req    s7
        dgbv            .req    v7

        dg0q            .req    q12
        dg0s            .req    s12
        dg0v            .req    v12
        dg1s            .req    s13
        dg1v            .req    v13
        dg2s            .req    s14

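/*
 * Each add_only invocation advances the digest by four rounds: sha1h
 * saves the rotated lane that is needed four rounds later, and
 * sha1c/sha1p/sha1m apply the choose/parity/majority round function to
 * dg0 (dg0q, dg0s and dg0v are just different views of v12). The
 * 'ev'/'od' alternation ping-pongs between t0/t1 and dg1s/dg2s, so the
 * round-constant add for one group of rounds can overlap with the
 * SHA-1 instructions of the previous group.
 */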
        .macro          add_only, op, ev, rc, s0, dg1
        .ifc            \ev, ev
        add             t1.4s, v\s0\().4s, \rc\().4s
        sha1h           dg2s, dg0s
        .ifnb           \dg1
        sha1\op         dg0q, \dg1, t0.4s
        .else
        sha1\op         dg0q, dg1s, t0.4s
        .endif
        .else
        .ifnb           \s0
        add             t0.4s, v\s0\().4s, \rc\().4s
        .endif
        sha1h           dg1s, dg0s
        sha1\op         dg0q, dg2s, t1.4s
        .endif
        .endm

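/*
 * add_update interleaves the message schedule update with the rounds:
 * sha1su0/sha1su1 fold four earlier schedule words into the next four,
 * while add_only consumes a group that is already complete.
 */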
        .macro          add_update, op, ev, rc, s0, s1, s2, s3, dg1
        sha1su0         v\s0\().4s, v\s1\().4s, v\s2\().4s
        add_only        \op, \ev, \rc, \s1, \dg1
        sha1su1         v\s0\().4s, v\s3\().4s
        .endm

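/*
 * loadrc materializes a 32-bit round constant sixteen bits at a time
 * with a movz/movk pair, then broadcasts it across all four lanes of \k.
 */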
        .macro          loadrc, k, val, tmp
        movz            \tmp, :abs_g0_nc:\val
        movk            \tmp, :abs_g1:\val
        dup             \k, \tmp
        .endm

/*
 * int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
 *                       int blocks)
 */
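/*
 * Per AAPCS64, sst is passed in x0, src in x1 and blocks in w2; the
 * return value in w0 is the number of blocks left unprocessed, which
 * is nonzero only when the routine yields early. A rough sketch of how
 * a C caller might drive it (illustrative only, not the exact glue
 * code, which lives in sha1-ce-glue.c):
 *
 *      while (blocks) {
 *              int rem;
 *
 *              kernel_neon_begin();
 *              rem = sha1_ce_transform(sst, src, blocks);
 *              kernel_neon_end();
 *              src += (blocks - rem) * SHA1_BLOCK_SIZE;
 *              blocks = rem;
 *      }
 */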
SYM_FUNC_START(sha1_ce_transform)
        /* load round constants */
        loadrc          k0.4s, 0x5a827999, w6
        loadrc          k1.4s, 0x6ed9eba1, w6
        loadrc          k2.4s, 0x8f1bbcdc, w6
        loadrc          k3.4s, 0xca62c1d6, w6

        /* load state */
        ld1             {dgav.4s}, [x0]
        ldr             dgb, [x0, #16]

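        /*
         * finalize is nonzero only on the last call, and only when the
         * total input length was an exact multiple of the block size;
         * in that case the padding block is generated here (at 2:
         * below) rather than by the C code.
         */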
        /* load sha1_ce_state::finalize */
        ldr_l           w4, sha1_ce_offsetof_finalize, x4
        ldr             w4, [x0, x4]

        /* load input */
0:      ld1             {v8.4s-v11.4s}, [x1], #64
        sub             w2, w2, #1

CPU_LE( rev32           v8.16b, v8.16b          )
CPU_LE( rev32           v9.16b, v9.16b          )
CPU_LE( rev32           v10.16b, v10.16b        )
CPU_LE( rev32           v11.16b, v11.16b        )

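        /*
         * The 80 rounds run in four groups of 20: choose (sha1c) with
         * constant k0, parity (sha1p) with k1, majority (sha1m) with
         * k2, and parity again with k3. Each macro invocation below
         * advances four rounds, with v8-v11 cycling through the
         * message schedule.
         */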
1:      add             t0.4s, v8.4s, k0.4s
        mov             dg0v.16b, dgav.16b

        add_update      c, ev, k0,  8,  9, 10, 11, dgb
        add_update      c, od, k0,  9, 10, 11,  8
        add_update      c, ev, k0, 10, 11,  8,  9
        add_update      c, od, k0, 11,  8,  9, 10
        add_update      c, ev, k1,  8,  9, 10, 11

        add_update      p, od, k1,  9, 10, 11,  8
        add_update      p, ev, k1, 10, 11,  8,  9
        add_update      p, od, k1, 11,  8,  9, 10
        add_update      p, ev, k1,  8,  9, 10, 11
        add_update      p, od, k2,  9, 10, 11,  8

        add_update      m, ev, k2, 10, 11,  8,  9
        add_update      m, od, k2, 11,  8,  9, 10
        add_update      m, ev, k2,  8,  9, 10, 11
        add_update      m, od, k2,  9, 10, 11,  8
        add_update      m, ev, k3, 10, 11,  8,  9

        add_update      p, od, k3, 11,  8,  9, 10
        add_only        p, ev, k3,  9
        add_only        p, od, k3, 10
        add_only        p, ev, k3, 11
        add_only        p, od

        /* update state */
        add             dgbv.2s, dgbv.2s, dg1v.2s
        add             dgav.4s, dgav.4s, dg0v.4s

        cbz             w2, 2f
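        /*
         * Between blocks, check whether a reschedule is pending; if
         * so, branch to 3f to save the partial state and return the
         * number of blocks still to be processed, so the caller can
         * yield and re-enter.
         */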
        cond_yield      3f, x5, x6
        b               0b

        /*
         * Final block: add padding and total bit count.
         * Skip if the input size was not a round multiple of the block
         * size; in that case, the padding is handled by the C code.
         */
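        /*
         * Synthesize the padding block directly in v8-v11: the 0x80
         * terminator bit in the first word, zeroes in the middle, and
         * the bit count (byte count shifted left by 3, halves swapped
         * by the ror so they land in the right order) in the last two
         * words. x4 is cleared so the block is generated only once
         * before jumping back into the round loop.
         */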
2:      cbz             x4, 3f
        ldr_l           w4, sha1_ce_offsetof_count, x4
        ldr             x4, [x0, x4]
        movi            v9.2d, #0
        mov             x8, #0x80000000
        movi            v10.2d, #0
        ror             x7, x4, #29             // ror(lsl(x4, 3), 32)
        fmov            d8, x8
        mov             x4, #0
        mov             v11.d[0], xzr
        mov             v11.d[1], x7
        b               1b


        /* store new state */
3:      st1             {dgav.4s}, [x0]
        str             dgb, [x0, #16]
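        /* return the number of unprocessed blocks (zero on completion) */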
        mov             w0, w2
        ret
SYM_FUNC_END(sha1_ce_transform)