1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
| /* SPDX-License-Identifier: GPL-2.0 */
| /*
| * NH - ε-almost-universal hash function, ARM64 NEON accelerated version
| *
| * Copyright 2018 Google LLC
| *
| * Author: Eric Biggers <ebiggers@google.com>
| */
|
| #include <linux/linkage.h>
|
| // Function arguments, in the order given by the nh_neon() prototype
| // (AAPCS64 passes the first integer/pointer arguments in x0-x3).
| KEY .req x0
| MESSAGE .req x1
| MESSAGE_LEN .req x2
| HASH .req x3
|
| // One accumulator per NH pass, each holding two 64-bit partial sums.
| PASS0_SUMS .req v0
| PASS1_SUMS .req v1
| PASS2_SUMS .req v2
| PASS3_SUMS .req v3
|
| // The four most recently loaded 16-byte key strides. _nh_stride is invoked
| // with these in rotating order, so no register-to-register copies are needed.
| // K0 and K1 must stay consecutively numbered: they are loaded together by a
| // two-register ld1.
| K0 .req v4
| K1 .req v5
| K2 .req v6
| K3 .req v7
|
| // Scratch registers. These are taken from v16-v23, which AAPCS64 treats as
| // caller-saved, rather than from v8-v15, whose low 64 bits are callee-saved
| // and would have to be preserved by this code.
| // T0 and T1 must stay consecutively numbered: they are stored together by a
| // two-register st1.
| T0 .req v16
| T1 .req v17
| T2 .req v18
| T3 .req v19
| T4 .req v20
| T5 .req v21
| T6 .req v22
| T7 .req v23
|
| /*
| * _nh_stride k0, k1, k2, k3
| *
| * Process one 16-byte message stride for all four passes.
| *
| * \k0, \k1 and \k2 name registers that already hold key strides; \k3 names
| * the register that this macro fills with the next 16 bytes from KEY.
| *
| * For pass i, the four 32-bit message words are added (modulo 2^32) to the
| * four key words in \k<i>, giving s[0..3]. The 64-bit products s[0]*s[2] and
| * s[1]*s[3] are then accumulated into the two lanes of PASS<i>_SUMS.
| *
| * Post-increments MESSAGE and KEY by 16 and overwrites T0-T7 and \k3.
| */
| .macro _nh_stride k0, k1, k2, k3
|
| // Load next message stride
| ld1 {T3.16b}, [MESSAGE], #16
|
| // Load next key stride
| ld1 {\k3\().4s}, [KEY], #16
|
| // Add message words to key words
| // (T3 holds the message words, so it is overwritten last, after the
| // other three sums have been formed from it.)
| add T0.4s, T3.4s, \k0\().4s
| add T1.4s, T3.4s, \k1\().4s
| add T2.4s, T3.4s, \k2\().4s
| add T3.4s, T3.4s, \k3\().4s
|
| // Multiply 32x32 => 64 and accumulate
| // umlal with .2s operands reads only the low 64 bits of each source, so
| // the high half of each sum is first copied down into a scratch register.
| // Lane 0 then gets s[0]*s[2] and lane 1 gets s[1]*s[3].
| mov T4.d[0], T0.d[1]
| mov T5.d[0], T1.d[1]
| mov T6.d[0], T2.d[1]
| mov T7.d[0], T3.d[1]
| umlal PASS0_SUMS.2d, T0.2s, T4.2s
| umlal PASS1_SUMS.2d, T1.2s, T5.2s
| umlal PASS2_SUMS.2d, T2.2s, T6.2s
| umlal PASS3_SUMS.2d, T3.2s, T7.2s
| .endm
|
| /*
| * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
| * u8 hash[NH_HASH_BYTES])
| *
| * Runs four NH passes over the message in 16-byte strides and stores one
| * 64-bit sum per pass (32 bytes in total) to 'hash'. For message stride j,
| * pass i uses key stride j + i, where key strides are also 16 bytes each.
| * A zero-length message produces an all-zero hash.
| *
| * It's guaranteed that message_len % 16 == 0.
| *
| * Reads message_len + 48 bytes from 'key': three strides up front, then one
| * more per message stride.
| *
| * Clobbers KEY, MESSAGE, MESSAGE_LEN, the condition flags, and the vector
| * registers aliased as PASS0_SUMS-PASS3_SUMS, K0-K3 and T0-T7.
| */
| SYM_FUNC_START(nh_neon)
|
| // Preload the first three key strides (the fourth is loaded by the first
| // _nh_stride) and zero the four accumulators.
| ld1 {K0.4s,K1.4s}, [KEY], #32
| movi PASS0_SUMS.2d, #0
| movi PASS1_SUMS.2d, #0
| ld1 {K2.4s}, [KEY], #16
| movi PASS2_SUMS.2d, #0
| movi PASS3_SUMS.2d, #0
|
| // Bias MESSAGE_LEN by -64 so that its sign tells whether another full
| // 64-byte group (four strides) remains.
| subs MESSAGE_LEN, MESSAGE_LEN, #64
| blt .Lloop4_done
| .Lloop4:
| // Four strides per iteration. The key register roles rotate by one on
| // each stride and are back in K0, K1, K2, K3 order after the fourth.
| _nh_stride K0, K1, K2, K3
| _nh_stride K1, K2, K3, K0
| _nh_stride K2, K3, K0, K1
| _nh_stride K3, K0, K1, K2
| subs MESSAGE_LEN, MESSAGE_LEN, #64
| bge .Lloop4
|
| .Lloop4_done:
| // MESSAGE_LEN is in [-64, -1] here; masking with 63 recovers the number
| // of leftover bytes (0, 16, 32 or 48, given message_len % 16 == 0).
| ands MESSAGE_LEN, MESSAGE_LEN, #63
| beq .Ldone
| _nh_stride K0, K1, K2, K3
|
| subs MESSAGE_LEN, MESSAGE_LEN, #16
| beq .Ldone
| _nh_stride K1, K2, K3, K0
|
| subs MESSAGE_LEN, MESSAGE_LEN, #16
| beq .Ldone
| _nh_stride K2, K3, K0, K1
|
| .Ldone:
| // Sum the accumulators for each pass, then store the sums to 'hash'
| // Each addp folds the two 64-bit lanes of two accumulators, leaving
| // T0 = {pass 0 sum, pass 1 sum} and T1 = {pass 2 sum, pass 3 sum}.
| addp T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
| addp T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
| st1 {T0.16b,T1.16b}, [HASH]
| ret
| SYM_FUNC_END(nh_neon)
|
|