1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
| /* SPDX-License-Identifier: GPL-2.0 */
| /*
| * NH - ε-almost-universal hash function, NEON accelerated version
| *
| * Copyright 2018 Google LLC
| *
| * Author: Eric Biggers <ebiggers@google.com>
| */
|
| #include <linux/linkage.h>
|
| .text
| .fpu neon
| /*
| * Core register aliases. Going by the C prototype given in the comment
| * above ENTRY(nh_neon) later in this file, r0-r3 hold the four arguments
| * in declaration order.
| */
| KEY .req r0 // key pointer; post-incremented by every key load
| MESSAGE .req r1 // message pointer; post-incremented by every message load
| MESSAGE_LEN .req r2 // message byte count; doubles as the loop counter
| HASH .req r3 // output buffer; written once, at the very end
| /*
| * NEON register aliases.
| *
| * PASSn_SUMS      - 128-bit accumulator for pass n (two 64-bit lanes).
| * PASSn_SUM_A/_B  - the two 64-bit d-register halves of PASSn_SUMS; they
| *                   are added together when the final hash is produced.
| * K0-K3           - the four 16-byte key strides currently in use. Which
| *                   alias holds which stride rotates from one _nh_stride
| *                   invocation to the next.
| * Tn              - 128-bit scratch for pass n; Tn_L and Tn_H are its low
| *                   and high d-register halves, used as the two multiply
| *                   operands.
| */
| PASS0_SUMS .req q0
| PASS0_SUM_A .req d0
| PASS0_SUM_B .req d1
| PASS1_SUMS .req q1
| PASS1_SUM_A .req d2
| PASS1_SUM_B .req d3
| PASS2_SUMS .req q2
| PASS2_SUM_A .req d4
| PASS2_SUM_B .req d5
| PASS3_SUMS .req q3
| PASS3_SUM_A .req d6
| PASS3_SUM_B .req d7
| K0 .req q4
| K1 .req q5
| K2 .req q6
| K3 .req q7
| T0 .req q8
| T0_L .req d16
| T0_H .req d17
| T1 .req q9
| T1_L .req d18
| T1_H .req d19
| T2 .req q10
| T2_L .req d20
| T2_H .req d21
| T3 .req q11 // also receives the raw message stride in _nh_stride
| T3_L .req d22
| T3_H .req d23
| /*
| * _nh_stride k0, k1, k2, k3
| *
| * Process one 16-byte message stride for all four passes.
| *
| * k0, k1, k2 - registers already holding the key strides for passes 0-2.
| * k3         - register that is overwritten with the next 16 bytes read
| *              from KEY; it then serves as the key stride for pass 3.
| *
| * For each pass n, the four 32-bit message words are added to the four
| * 32-bit key words (wrapping, per lane) into Tn. Tn_L holds words 0-1 and
| * Tn_H holds words 2-3, so the widening multiply-accumulate adds
| * word0 * word2 and word1 * word3, as 64-bit products, to the two lanes
| * of PASSn_SUMS.
| *
| * Side effects: MESSAGE and KEY each advance by 16 bytes; T0-T3 and the
| * register passed as k3 are clobbered; condition flags are not touched.
| */
| .macro _nh_stride k0, k1, k2, k3
|
| // Load next message stride (16 bytes, read as bytes) into T3
| vld1.8 {T3}, [MESSAGE]!
|
| // Load next key stride (16 bytes) into the k3 register
| vld1.32 {\k3}, [KEY]!
|
| // Add message words to key words, one sum register per pass
| vadd.u32 T0, T3, \k0
| vadd.u32 T1, T3, \k1
| vadd.u32 T2, T3, \k2
| vadd.u32 T3, T3, \k3 // overwrites the message copy, so this add must be last
|
| // Multiply 32x32 => 64 and accumulate (low half times high half of each sum)
| vmlal.u32 PASS0_SUMS, T0_L, T0_H
| vmlal.u32 PASS1_SUMS, T1_L, T1_H
| vmlal.u32 PASS2_SUMS, T2_L, T2_H
| vmlal.u32 PASS3_SUMS, T3_L, T3_H
| .endm
|
| /*
| * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
| * u8 hash[NH_HASH_BYTES])
| *
| * It's guaranteed that message_len % 16 == 0.
| *
| * Computes four NH passes over the message in a single sweep. The message
| * is consumed in 16-byte strides; for message stride j, pass n uses key
| * stride j + n, so the four passes share one sliding window of four key
| * strides (K0-K3).
| *
| * Input:
| *   key         - 48 bytes are read up front and 16 more per message
| *                 stride, i.e. message_len + 48 bytes in total.
| *   message     - message_len bytes are read.
| *   message_len - byte count; 0 is handled (the sums stay zero).
| *
| * Output:
| *   hash        - receives 32 bytes: the 64-bit total of pass 0, then of
| *                 passes 1, 2 and 3. (Presumably NH_HASH_BYTES == 32;
| *                 that constant is defined outside this file.)
| *
| * Clobbers: r0-r2, the condition flags, and q0-q11. r3 is left unchanged.
| *
| * NOTE(review): q4-q7 (d8-d15) are callee-saved under the 32-bit Arm
| * procedure call standard and are not saved or restored here. Presumably
| * every caller brackets this call so that the NEON register file is
| * preserved for it - confirm against the C glue code.
| *
| * NOTE(review): the loop bounds use signed conditions (blt/bge), which
| * assumes message_len < 2^31 - confirm the caller's maximum length.
| */
| ENTRY(nh_neon)
| // Load the first three key strides and zero the four pass accumulators
| vld1.32 {K0,K1}, [KEY]!
| vmov.u64 PASS0_SUMS, #0
| vmov.u64 PASS1_SUMS, #0
| vld1.32 {K2}, [KEY]!
| vmov.u64 PASS2_SUMS, #0
| vmov.u64 PASS3_SUMS, #0
| /*
| * Main loop, four strides (64 bytes) per iteration. MESSAGE_LEN is kept
| * biased by -64 so that the flags set by subs directly say whether
| * another full iteration fits. After four strides the key window has
| * rotated through K0-K3 exactly once, so every iteration starts with the
| * same alias order.
| */
| subs MESSAGE_LEN, MESSAGE_LEN, #64
| blt .Lloop4_done // fewer than 64 bytes: skip the unrolled loop
| .Lloop4:
| _nh_stride K0, K1, K2, K3
| _nh_stride K1, K2, K3, K0
| _nh_stride K2, K3, K0, K1
| _nh_stride K3, K0, K1, K2
| subs MESSAGE_LEN, MESSAGE_LEN, #64
| bge .Lloop4 // at least 64 more bytes remain
| /*
| * Tail. MESSAGE_LEN is (bytes left - 64) here with bytes left below 64,
| * so masking with 63 recovers the bytes left. Given message_len % 16 == 0
| * that is 0, 16, 32 or 48, i.e. at most three more strides, which
| * continue the same key rotation as the start of a loop iteration.
| */
| .Lloop4_done:
| ands MESSAGE_LEN, MESSAGE_LEN, #63
| beq .Ldone // nothing left
| _nh_stride K0, K1, K2, K3
|
| subs MESSAGE_LEN, MESSAGE_LEN, #16
| beq .Ldone // exactly one tail stride
| _nh_stride K1, K2, K3, K0
|
| subs MESSAGE_LEN, MESSAGE_LEN, #16
| beq .Ldone // exactly two tail strides
| _nh_stride K2, K3, K0, K1
|
| .Ldone:
| // Sum the two 64-bit lanes of each pass accumulator, then store the four totals to 'hash' in pass order
| vadd.u64 T0_L, PASS0_SUM_A, PASS0_SUM_B
| vadd.u64 T0_H, PASS1_SUM_A, PASS1_SUM_B
| vadd.u64 T1_L, PASS2_SUM_A, PASS2_SUM_B
| vadd.u64 T1_H, PASS3_SUM_A, PASS3_SUM_B
| vst1.8 {T0-T1}, [HASH] // 32 bytes, no writeback to HASH
| bx lr
| ENDPROC(nh_neon)
|
|