1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
| /*
| * Copyright 2012 The LibYuv Project Authors. All rights reserved.
| *
| * Use of this source code is governed by a BSD-style license
| * that can be found in the LICENSE file in the root of the source
| * tree. An additional intellectual property rights grant can be found
| * in the file PATENTS. All contributing project authors may
| * be found in the AUTHORS file in the root of the source tree.
| */
|
| #include "libyuv/basic_types.h"
|
| #include "libyuv/compare_row.h"
| #include "libyuv/row.h"
|
| #ifdef __cplusplus
| namespace libyuv {
| extern "C" {
| #endif
|
| // This module is for 32-bit x86 builds using Visual C++ or clang-cl.
| #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
| /* SumSquareError_SSE2: returns the sum of (src_a[i] - src_b[i])^2 over the
| * processed bytes, using SSE2 and handling 16 bytes per loop pass.
| *
| * Naked function: there is no compiler-generated prologue, so the arguments
| * are read straight from the stack ([esp + 4], [esp + 8], [esp + 12]) and the
| * result is returned in eax.
| *
| * The loop body runs before count is tested (sub ecx, 16 then jg), so at
| * least 16 bytes are always read from each buffer and the amount processed is
| * count rounded up to a multiple of 16. Loads are unaligned (movdqu).
| * The total is accumulated in four 32-bit lanes with paddd.
| */
| __declspec(naked) uint32
| SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
| __asm {
| mov eax, [esp + 4] // eax = src_a
| mov edx, [esp + 8] // edx = src_b
| mov ecx, [esp + 12] // ecx = count (bytes remaining)
| pxor xmm0, xmm0 // xmm0 = 4 x u32 running sums, start at 0
| pxor xmm5, xmm5 // xmm5 = 0, used to zero-extend bytes to words
| // Main loop: each pass consumes 16 bytes from both sources.
| wloop:
| movdqu xmm1, [eax] // a = 16 bytes of src_a
| lea eax, [eax + 16] // advance src_a
| movdqu xmm2, [edx] // b = 16 bytes of src_b
| lea edx, [edx + 16] // advance src_b
| movdqa xmm3, xmm1 // abs trick: keep a copy of a
| psubusb xmm1, xmm2 // saturating a - b (0 where a < b)
| psubusb xmm2, xmm3 // saturating b - a (0 where b < a)
| por xmm1, xmm2 // |a - b| per byte
| movdqa xmm2, xmm1
| punpcklbw xmm1, xmm5 // low 8 differences widened to u16
| punpckhbw xmm2, xmm5 // high 8 differences widened to u16
| pmaddwd xmm1, xmm1 // square and add adjacent pairs -> 4 x u32
| pmaddwd xmm2, xmm2
| paddd xmm0, xmm1 // accumulate both halves
| paddd xmm0, xmm2
| sub ecx, 16
| jg wloop // repeat while bytes remaining > 0
| // Horizontal add: fold the four u32 lanes of xmm0 into lane 0.
| pshufd xmm1, xmm0, 0xee // lanes 2,3 copied down to lanes 0,1
| paddd xmm0, xmm1
| pshufd xmm1, xmm0, 0x01 // lane 1 copied down to lane 0
| paddd xmm0, xmm1
| movd eax, xmm0 // return lane 0 = total
| ret
| }
| }
| /* SumSquareError_AVX2: same result as a byte-wise sum of
| * (src_a[i] - src_b[i])^2, computed with 256-bit ymm registers, 32 bytes per
| * loop pass. Only compiled when _MSC_VER >= 1700.
| *
| * Naked function: arguments are read from [esp + 4], [esp + 8], [esp + 12]
| * and the result is returned in eax. The loop body runs before count is
| * tested, so at least 32 bytes are always read from each buffer and the
| * amount processed is count rounded up to a multiple of 32.
| */
| // Visual C 2012 required for AVX2.
| #if _MSC_VER >= 1700
| // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
| #pragma warning(disable : 4752)
| __declspec(naked) uint32
| SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
| __asm {
| mov eax, [esp + 4] // eax = src_a
| mov edx, [esp + 8] // edx = src_b
| mov ecx, [esp + 12] // ecx = count (bytes remaining)
| vpxor ymm0, ymm0, ymm0 // ymm0 = 8 x u32 running sums, start at 0
| vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
| sub edx, eax // edx = src_b - src_a, so [eax + edx] addresses src_b
| // Main loop: each pass consumes 32 bytes from both sources.
| wloop:
| vmovdqu ymm1, [eax] // a = 32 bytes of src_a
| vmovdqu ymm2, [eax + edx] // b = 32 bytes of src_b
| lea eax, [eax + 32] // advances both streams (src_b is relative to eax)
| vpsubusb ymm3, ymm1, ymm2 // abs difference trick: saturating a - b
| vpsubusb ymm2, ymm2, ymm1 // saturating b - a
| vpor ymm1, ymm2, ymm3 // |a - b| per byte
| vpunpcklbw ymm2, ymm1, ymm5 // u16. per-lane unpack mutates order (harmless for a sum).
| vpunpckhbw ymm1, ymm1, ymm5
| vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
| vpmaddwd ymm1, ymm1, ymm1
| vpaddd ymm0, ymm0, ymm1 // accumulate both halves
| vpaddd ymm0, ymm0, ymm2
| sub ecx, 32
| jg wloop // repeat while bytes remaining > 0
| // Horizontal add: fold the eight u32 lanes of ymm0 into element 0.
| vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
| vpaddd ymm0, ymm0, ymm1
| vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
| vpaddd ymm0, ymm0, ymm1
| vpermq ymm1, ymm0, 0x02 // high + low lane: qword 2 copied down to qword 0.
| vpaddd ymm0, ymm0, ymm1
| vmovd eax, xmm0 // return element 0 = total
| vzeroupper // zero the upper ymm halves before returning to the caller
| ret
| }
| }
| #endif // _MSC_VER >= 1700
| /* Multiplier tables for the HashDjb2 kernels in this file. Every entry is a
| * power of 33 reduced modulo 2^32.
| *
| * kHash16x33 holds 33^16 in lane 0 and zero in lanes 1-3: multiplying the
| * hash register by it advances the running hash (lane 0) by 16 bytes and
| * clears the other lanes.
| * kHashMul0..kHashMul3 hold 33^15 down to 33^0, one weight for each of the 16
| * source bytes of a step, in source order.
| *
| * The SSE4.1 kernel loads these with movdqa, so uvec32 is presumably a
| * 16-byte aligned type; it is declared outside this file - TODO confirm.
| */
| uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
| uvec32 kHashMul0 = {
| 0x0c3525e1, // 33 ^ 15
| 0xa3476dc1, // 33 ^ 14
| 0x3b4039a1, // 33 ^ 13
| 0x4f5f0981, // 33 ^ 12
| };
| uvec32 kHashMul1 = {
| 0x30f35d61, // 33 ^ 11
| 0x855cb541, // 33 ^ 10
| 0x040a9121, // 33 ^ 9
| 0x747c7101, // 33 ^ 8
| };
| uvec32 kHashMul2 = {
| 0xec41d4e1, // 33 ^ 7
| 0x4cfa3cc1, // 33 ^ 6
| 0x025528a1, // 33 ^ 5
| 0x00121881, // 33 ^ 4
| };
| uvec32 kHashMul3 = {
| 0x00008c61, // 33 ^ 3
| 0x00000441, // 33 ^ 2
| 0x00000021, // 33 ^ 1
| 0x00000001, // 33 ^ 0
| };
| /* HashDjb2_SSE41: multiply-by-33 hash over src, starting from seed, computed
| * 16 bytes per loop pass with SSE4.1 (pmulld).
| *
| * Each pass computes, in 32-bit wrapping arithmetic,
| *   hash = hash * 33^16 + src[0] * 33^15 + src[1] * 33^14 + ... + src[15] * 33^0
| * which is the 16-times unrolled form of hash = hash * 33 + src[i].
| *
| * Naked function: arguments are read from [esp + 4], [esp + 8], [esp + 12]
| * and the result is returned in eax. The loop body runs before count is
| * tested, so at least 16 bytes are always read and the amount processed is
| * count rounded up to a multiple of 16. Uses xmm0-xmm7.
| */
| __declspec(naked) uint32
| HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
| __asm {
| mov eax, [esp + 4] // eax = src
| mov ecx, [esp + 8] // ecx = count (bytes remaining)
| movd xmm0, [esp + 12] // xmm0 lane 0 = seed (running hash), other lanes 0
| // Loop-invariant constants.
| pxor xmm7, xmm7 // constant 0 for unpck
| movdqa xmm6, xmmword ptr kHash16x33 // lane 0 = 33 ^ 16, lanes 1-3 = 0
| // Main loop: each pass folds 16 source bytes into the hash.
| wloop:
| movdqu xmm1, [eax] // src[0-15]
| lea eax, [eax + 16] // advance src
| pmulld xmm0, xmm6 // hash *= 33 ^ 16 (also zeroes lanes 1-3)
| movdqa xmm5, xmmword ptr kHashMul0 // weights 33 ^ 15..12
| movdqa xmm2, xmm1
| punpcklbw xmm2, xmm7 // src[0-7] as u16
| movdqa xmm3, xmm2
| punpcklwd xmm3, xmm7 // src[0-3] as u32
| pmulld xmm3, xmm5 // src[0-3] * 33 ^ 15..12
| movdqa xmm5, xmmword ptr kHashMul1 // weights 33 ^ 11..8
| movdqa xmm4, xmm2
| punpckhwd xmm4, xmm7 // src[4-7] as u32
| pmulld xmm4, xmm5 // src[4-7] * 33 ^ 11..8
| movdqa xmm5, xmmword ptr kHashMul2 // weights 33 ^ 7..4
| punpckhbw xmm1, xmm7 // src[8-15] as u16
| movdqa xmm2, xmm1
| punpcklwd xmm2, xmm7 // src[8-11] as u32
| pmulld xmm2, xmm5 // src[8-11] * 33 ^ 7..4
| movdqa xmm5, xmmword ptr kHashMul3 // weights 33 ^ 3..0
| punpckhwd xmm1, xmm7 // src[12-15] as u32
| pmulld xmm1, xmm5 // src[12-15] * 33 ^ 3..0
| paddd xmm3, xmm4 // add 16 results
| paddd xmm1, xmm2
| paddd xmm1, xmm3 // xmm1 = 4 partial sums
| // Horizontal add of the four partial sums into lane 0 of xmm1.
| pshufd xmm2, xmm1, 0x0e // upper 2 dwords
| paddd xmm1, xmm2
| pshufd xmm2, xmm1, 0x01 // lane 1 copied down to lane 0
| paddd xmm1, xmm2
| paddd xmm0, xmm1 // hash += weighted byte sum (only lane 0 is meaningful)
| sub ecx, 16
| jg wloop // repeat while bytes remaining > 0
| // Lanes 1-3 of xmm0 hold leftovers; only lane 0 is the hash.
| movd eax, xmm0 // return hash
| ret
| }
| }
| /* HashDjb2_AVX2: multiply-by-33 hash over src, starting from seed, computed
| * 16 bytes per loop pass with VEX-encoded instructions on 128-bit xmm
| * registers. Only compiled when _MSC_VER >= 1700.
| *
| * Each pass computes, in 32-bit wrapping arithmetic,
| *   hash = hash * 33^16 + src[0] * 33^15 + src[1] * 33^14 + ... + src[15] * 33^0
| * which is the 16-times unrolled form of hash = hash * 33 + src[i].
| *
| * Naked function: arguments are read from [esp + 4], [esp + 8], [esp + 12]
| * and the result is returned in eax. The loop body runs before count is
| * tested, so at least 16 bytes are always read and the amount processed is
| * count rounded up to a multiple of 16.
| */
| // Visual C 2012 required for AVX2.
| #if _MSC_VER >= 1700
| __declspec(naked) uint32
| HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
| __asm {
| mov eax, [esp + 4] // eax = src
| mov ecx, [esp + 8] // ecx = count (bytes remaining)
| vmovd xmm0, [esp + 12] // xmm0 lane 0 = seed (running hash), other lanes 0
| // Main loop: each pass folds 16 source bytes into the hash.
| wloop:
| vpmovzxbd xmm3, [eax] // src[0-3] zero-extended to u32
| vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
| vpmovzxbd xmm4, [eax + 4] // src[4-7]
| vpmulld xmm3, xmm3, xmmword ptr kHashMul0 // src[0-3] * 33 ^ 15..12
| vpmovzxbd xmm2, [eax + 8] // src[8-11]
| vpmulld xmm4, xmm4, xmmword ptr kHashMul1 // src[4-7] * 33 ^ 11..8
| vpmovzxbd xmm1, [eax + 12] // src[12-15]
| vpmulld xmm2, xmm2, xmmword ptr kHashMul2 // src[8-11] * 33 ^ 7..4
| lea eax, [eax + 16] // advance src
| vpmulld xmm1, xmm1, xmmword ptr kHashMul3 // src[12-15] * 33 ^ 3..0
| vpaddd xmm3, xmm3, xmm4 // add 16 results
| vpaddd xmm1, xmm1, xmm2
| vpaddd xmm1, xmm1, xmm3 // xmm1 = 4 partial sums
| vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
| vpaddd xmm1, xmm1,xmm2
| vpshufd xmm2, xmm1, 0x01 // lane 1 copied down to lane 0
| vpaddd xmm1, xmm1, xmm2
| vpaddd xmm0, xmm0, xmm1 // hash += weighted byte sum (only lane 0 is meaningful)
| sub ecx, 16
| jg wloop // repeat while bytes remaining > 0
| // Lanes 1-3 of xmm0 hold leftovers; only lane 0 is the hash.
| vmovd eax, xmm0 // return hash
| vzeroupper // zero the upper ymm halves before returning to the caller
| ret
| }
| }
| #endif // _MSC_VER >= 1700
|
| #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
|
| #ifdef __cplusplus
| } // extern "C"
| } // namespace libyuv
| #endif
|
|