/*
 *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"

#include "libyuv/compare_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

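// Computes the sum of squared differences between the first |count| bytes of
// src_a and src_b (ARMv7 NEON path). The loop consumes 16 bytes per
// iteration, so |count| is expected to be a multiple of 16.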
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
  volatile uint32 sse;
  asm volatile (
    // Zero the four 32-bit accumulators.
    "vmov.u8    q8, #0                         \n"
    "vmov.u8    q10, #0                        \n"
    "vmov.u8    q9, #0                         \n"
    "vmov.u8    q11, #0                        \n"

    // Per iteration: load 16 bytes from each source, widen the byte
    // differences to 16 bits, and accumulate their squares into q8..q11.
    "1:                                        \n"
    MEMACCESS(0)
    "vld1.8     {q0}, [%0]!                    \n"
    MEMACCESS(1)
    "vld1.8     {q1}, [%1]!                    \n"
    "subs       %2, %2, #16                    \n"
    "vsubl.u8   q2, d0, d2                     \n"
    "vsubl.u8   q3, d1, d3                     \n"
    "vmlal.s16  q8, d4, d4                     \n"
    "vmlal.s16  q9, d6, d6                     \n"
    "vmlal.s16  q10, d5, d5                    \n"
    "vmlal.s16  q11, d7, d7                    \n"
    "bgt        1b                             \n"

    // Reduce the four accumulators to a single 32-bit result.
    "vadd.u32   q8, q8, q9                     \n"
    "vadd.u32   q10, q10, q11                  \n"
    "vadd.u32   q11, q8, q10                   \n"
    "vpaddl.u32 q1, q11                        \n"
    "vadd.u64   d0, d2, d3                     \n"
    "vmov.32    %3, d0[0]                      \n"
    : "+r"(src_a),
      "+r"(src_b),
      "+r"(count),
      "=r"(sse)
    :
    : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
  return sse;
}
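
// Illustrative usage sketch (not part of the upstream file; the buffers and
// values below are hypothetical). With every byte of |a| equal to 0 and every
// byte of |b| equal to 2, each of the 256 positions contributes
// (0 - 2)^2 = 4, so the expected result is 1024:
//
//   #include <string.h>
//   uint8 a[256];
//   uint8 b[256];
//   memset(a, 0, sizeof(a));
//   memset(b, 2, sizeof(b));
//   uint32 sse = SumSquareError_NEON(a, b, 256);  // sse == 1024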

#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif