/*M///////////////////////////////////////////////////////////////////////////////////////
|
//
|
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
//
|
// By downloading, copying, installing or using the software you agree to this license.
|
// If you do not agree to this license, do not download, install,
|
// copy or use the software.
|
//
|
//
|
// License Agreement
|
// For Open Source Computer Vision Library
|
//
|
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
// Third party copyrights are property of their respective owners.
|
//
|
// Redistribution and use in source and binary forms, with or without modification,
|
// are permitted provided that the following conditions are met:
|
//
|
// * Redistribution's of source code must retain the above copyright notice,
|
// this list of conditions and the following disclaimer.
|
//
|
// * Redistribution's in binary form must reproduce the above copyright notice,
|
// this list of conditions and the following disclaimer in the documentation
|
// and/or other materials provided with the distribution.
|
//
|
// * The name of the copyright holders may not be used to endorse or promote products
|
// derived from this software without specific prior written permission.
|
//
|
// This software is provided by the copyright holders and contributors "as is" and
|
// any express or implied warranties, including, but not limited to, the implied
|
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
// indirect, incidental, special, exemplary, or consequential damages
|
// (including, but not limited to, procurement of substitute goods or services;
|
// loss of use, data, or profits; or business interruption) however caused
|
// and on any theory of liability, whether in contract, strict liability,
|
// or tort (including negligence or otherwise) arising in any way out of
|
// the use of this software, even if advised of the possibility of such damage.
|
//
|
//M*/
|
|
#ifndef OPENCV_HAL_INTRIN_CPP_HPP
|
#define OPENCV_HAL_INTRIN_CPP_HPP
|
|
#include <limits>
|
#include <cstring>
|
#include <algorithm>
|
#include "opencv2/core/saturate.hpp"
|
|
namespace cv
|
{
|
|
#ifndef CV_DOXYGEN
|
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
|
#endif
|
|
/** @addtogroup core_hal_intrin
|
|
"Universal intrinsics" is a types and functions set intended to simplify vectorization of code on
|
different platforms. Currently there are two supported SIMD extensions: __SSE/SSE2__ on x86
|
architectures and __NEON__ on ARM architectures, both allow working with 128 bit registers
|
containing packed values of different types. In case when there is no SIMD extension available
|
during compilation, fallback C++ implementation of intrinsics will be chosen and code will work as
|
expected although it could be slower.
|
|
### Types
|
|
There are several types representing 128-bit register as a vector of packed values, each type is
|
implemented as a structure based on a one SIMD register.
|
|
- cv::v_uint8x16 and cv::v_int8x16: sixteen 8-bit integer values (unsigned/signed) - char
|
- cv::v_uint16x8 and cv::v_int16x8: eight 16-bit integer values (unsigned/signed) - short
|
- cv::v_uint32x4 and cv::v_int32x4: four 32-bit integer values (unsigned/signed) - int
|
- cv::v_uint64x2 and cv::v_int64x2: two 64-bit integer values (unsigned/signed) - int64
|
- cv::v_float32x4: four 32-bit floating point values (signed) - float
|
- cv::v_float64x2: two 64-bit floating point values (signed) - double
|
|
@note
|
cv::v_float64x2 is not implemented in NEON variant, if you want to use this type, don't forget to
|
check the CV_SIMD128_64F preprocessor definition:
|
@code
|
#if CV_SIMD128_64F
|
//...
|
#endif
|
@endcode
|
|
### Load and store operations
|
|
These operations allow to set contents of the register explicitly or by loading it from some memory
|
block and to save contents of the register to memory block.
|
|
- Constructors:
|
@ref v_reg::v_reg(const _Tp *ptr) "from memory",
|
@ref v_reg::v_reg(_Tp s0, _Tp s1) "from two values", ...
|
- Other create methods:
|
@ref v_setall_s8, @ref v_setall_u8, ...,
|
@ref v_setzero_u8, @ref v_setzero_s8, ...
|
- Memory operations:
|
@ref v_load, @ref v_load_aligned, @ref v_load_low, @ref v_load_halves,
|
@ref v_store, @ref v_store_aligned,
|
@ref v_store_high, @ref v_store_low
|
|
### Value reordering
|
|
These operations allow to reorder or recombine elements in one or multiple vectors.
|
|
- Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
|
- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand, @ref v_expand_low, @ref v_expand_high
|
- Pack: @ref v_pack, @ref v_pack_u, @ref v_pack_b, @ref v_rshr_pack, @ref v_rshr_pack_u,
|
@ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
|
- Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
|
- Extract: @ref v_extract
|
|
|
### Arithmetic, bitwise and comparison operations
|
|
Element-wise binary and unary operations.
|
|
- Arithmetics:
|
@ref operator +(const v_reg &a, const v_reg &b) "+",
|
@ref operator -(const v_reg &a, const v_reg &b) "-",
|
@ref operator *(const v_reg &a, const v_reg &b) "*",
|
@ref operator /(const v_reg &a, const v_reg &b) "/",
|
@ref v_mul_expand
|
|
- Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap
|
|
- Bitwise shifts:
|
@ref operator <<(const v_reg &a, int s) "<<",
|
@ref operator >>(const v_reg &a, int s) ">>",
|
@ref v_shl, @ref v_shr
|
|
- Bitwise logic:
|
@ref operator&(const v_reg &a, const v_reg &b) "&",
|
@ref operator |(const v_reg &a, const v_reg &b) "|",
|
@ref operator ^(const v_reg &a, const v_reg &b) "^",
|
@ref operator ~(const v_reg &a) "~"
|
|
- Comparison:
|
@ref operator >(const v_reg &a, const v_reg &b) ">",
|
@ref operator >=(const v_reg &a, const v_reg &b) ">=",
|
@ref operator <(const v_reg &a, const v_reg &b) "<",
|
@ref operator <=(const v_reg &a, const v_reg &b) "<=",
|
@ref operator==(const v_reg &a, const v_reg &b) "==",
|
@ref operator !=(const v_reg &a, const v_reg &b) "!="
|
|
- min/max: @ref v_min, @ref v_max
|
|
### Reduce and mask
|
|
Most of these operations return only one value.
|
|
- Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum, @ref v_popcount
|
- Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select
|
|
### Other math
|
|
- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
|
- Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs
|
|
### Conversions
|
|
Different type conversions and casts:
|
|
- Rounding: @ref v_round, @ref v_floor, @ref v_ceil, @ref v_trunc,
|
- To float: @ref v_cvt_f32, @ref v_cvt_f64
|
- Reinterpret: @ref v_reinterpret_as_u8, @ref v_reinterpret_as_s8, ...
|
|
### Matrix operations
|
|
In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_matmul, @ref v_transpose4x4
|
|
### Usability
|
|
Most operations are implemented only for some subset of the available types, following matrices
|
shows the applicability of different operations to the types.
|
|
Regular integers:
|
|
| Operations\\Types | uint 8x16 | int 8x16 | uint 16x8 | int 16x8 | uint 32x4 | int 32x4 |
|
|-------------------|:-:|:-:|:-:|:-:|:-:|:-:|
|
|load, store | x | x | x | x | x | x |
|
|interleave | x | x | x | x | x | x |
|
|expand | x | x | x | x | x | x |
|
|expand_low | x | x | x | x | x | x |
|
|expand_high | x | x | x | x | x | x |
|
|expand_q | x | x | | | | |
|
|add, sub | x | x | x | x | x | x |
|
|add_wrap, sub_wrap | x | x | x | x | | |
|
|mul_wrap | x | x | x | x | | |
|
|mul | x | x | x | x | x | x |
|
|mul_expand | x | x | x | x | x | |
|
|compare | x | x | x | x | x | x |
|
|shift | | | x | x | x | x |
|
|dotprod | | | | x | | |
|
|logical | x | x | x | x | x | x |
|
|min, max | x | x | x | x | x | x |
|
|absdiff | x | x | x | x | x | x |
|
|absdiffs | | x | | x | | |
|
|reduce | | | | | x | x |
|
|mask | x | x | x | x | x | x |
|
|pack | x | x | x | x | x | x |
|
|pack_u | x | | x | | | |
|
|pack_b | x | | | | | |
|
|unpack | x | x | x | x | x | x |
|
|extract | x | x | x | x | x | x |
|
|rotate (lanes) | x | x | x | x | x | x |
|
|cvt_flt32 | | | | | | x |
|
|cvt_flt64 | | | | | | x |
|
|transpose4x4 | | | | | x | x |
|
|
Big integers:
|
|
| Operations\\Types | uint 64x2 | int 64x2 |
|
|-------------------|:-:|:-:|
|
|load, store | x | x |
|
|add, sub | x | x |
|
|shift | x | x |
|
|logical | x | x |
|
|extract | x | x |
|
|rotate (lanes) | x | x |
|
|
Floating point:
|
|
| Operations\\Types | float 32x4 | float 64x2 |
|
|-------------------|:-:|:-:|
|
|load, store | x | x |
|
|interleave | x | |
|
|add, sub | x | x |
|
|mul | x | x |
|
|div | x | x |
|
|compare | x | x |
|
|min, max | x | x |
|
|absdiff | x | x |
|
|reduce | x | |
|
|mask | x | x |
|
|unpack | x | x |
|
|cvt_flt32 | | x |
|
|cvt_flt64 | x | |
|
|sqrt, abs | x | x |
|
|float math | x | x |
|
|transpose4x4 | x | |
|
|extract | x | x |
|
|rotate (lanes) | x | x |
|
|
@{ */
|
|
/** @brief Plain C++ register: `n` packed lanes of type `_Tp`.

The lanes are stored in the public array `s`; lane 0 is `s[0]`. */
template<typename _Tp, int n> struct v_reg
{
//! @cond IGNORED
    typedef _Tp lane_type;
    enum { nlanes = n };
//! @endcond

    /** @brief Constructor

    Initializes register with data from memory
    @param ptr pointer to memory block with data for register; `n` consecutive elements are read */
    explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }

    /** @brief Constructor

    Initializes register with two 64-bit values (writes lanes 0 and 1 only) */
    v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }

    /** @brief Constructor

    Initializes register with four 32-bit values (writes lanes 0..3 only) */
    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }

    /** @brief Constructor

    Initializes register with eight 16-bit values (writes lanes 0..7 only) */
    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
          _Tp s4, _Tp s5, _Tp s6, _Tp s7)
    {
        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
    }

    /** @brief Constructor

    Initializes register with sixteen 8-bit values (writes lanes 0..15) */
    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
          _Tp s4, _Tp s5, _Tp s6, _Tp s7,
          _Tp s8, _Tp s9, _Tp s10, _Tp s11,
          _Tp s12, _Tp s13, _Tp s14, _Tp s15)
    {
        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
        s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
        s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
    }

    /** @brief Default constructor

    Does not initialize anything */
    v_reg() {}

    /** @brief Copy constructor */
    v_reg(const v_reg<_Tp, n> & r)
    {
        for( int i = 0; i < n; i++ )
            s[i] = r.s[i];
    }

    /** @brief Copy assignment

    Lane-wise copy. Declared explicitly because the copy constructor is user-provided
    (relying on the implicit assignment operator in that situation is deprecated). */
    v_reg<_Tp, n>& operator=(const v_reg<_Tp, n> & r)
    {
        for( int i = 0; i < n; i++ )
            s[i] = r.s[i];
        return *this;
    }

    /** @brief Access first value

    Returns value of the first lane according to register type, for example:
    @code{.cpp}
    v_int32x4 r(1, 2, 3, 4);
    int v = r.get0(); // returns 1
    v_uint64x2 q(1, 2);
    uint64_t w = q.get0(); // returns 1
    @endcode
    */
    _Tp get0() const { return s[0]; }

//! @cond IGNORED
    // Value of lane i; no bounds check is performed.
    _Tp get(const int i) const { return s[i]; }

    // Upper half of the lanes moved to the lower half; the upper half of the result is zeroed.
    v_reg<_Tp, n> high() const
    {
        v_reg<_Tp, n> c;
        int i;
        for( i = 0; i < n/2; i++ )
        {
            c.s[i] = s[i+(n/2)];
            c.s[i+(n/2)] = 0;
        }
        return c;
    }

    // Register with every lane set to zero.
    static v_reg<_Tp, n> zero()
    {
        v_reg<_Tp, n> c;
        for( int i = 0; i < n; i++ )
            c.s[i] = (_Tp)0;
        return c;
    }

    // Register with every lane set to s.
    static v_reg<_Tp, n> all(_Tp s)
    {
        v_reg<_Tp, n> c;
        for( int i = 0; i < n; i++ )
            c.s[i] = s;
        return c;
    }

    // Byte-wise copy into a register of another lane type/count. Only the smaller of the
    // two register sizes is copied; any remaining bytes of the result are left uninitialized.
    template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
    {
        size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
        v_reg<_Tp2, n2> c;
        std::memcpy(&c.s[0], &s[0], bytes);
        return c;
    }

    _Tp s[n];
//! @endcond
};
|
|
/** @brief Sixteen 8-bit unsigned integer values */
|
typedef v_reg<uchar, 16> v_uint8x16;
|
/** @brief Sixteen 8-bit signed integer values */
|
typedef v_reg<schar, 16> v_int8x16;
|
/** @brief Eight 16-bit unsigned integer values */
|
typedef v_reg<ushort, 8> v_uint16x8;
|
/** @brief Eight 16-bit signed integer values */
|
typedef v_reg<short, 8> v_int16x8;
|
/** @brief Four 32-bit unsigned integer values */
|
typedef v_reg<unsigned, 4> v_uint32x4;
|
/** @brief Four 32-bit signed integer values */
|
typedef v_reg<int, 4> v_int32x4;
|
/** @brief Four 32-bit floating point values (single precision) */
|
typedef v_reg<float, 4> v_float32x4;
|
/** @brief Two 64-bit floating point values (double precision) */
|
typedef v_reg<double, 2> v_float64x2;
|
/** @brief Two 64-bit unsigned integer values */
|
typedef v_reg<uint64, 2> v_uint64x2;
|
/** @brief Two 64-bit signed integer values */
|
typedef v_reg<int64, 2> v_int64x2;
|
|
//! @brief Helper macro: defines the compound (`op=`) and the plain (`op`) element-wise
//! operator for two registers of the same type. Each lane result is passed through saturate_cast.
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n>& \
    operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    for( int lane = 0; lane != n; ++lane ) \
        a.s[lane] = saturate_cast<_Tp>(a.s[lane] bin_op b.s[lane]); \
    return a; \
} \
template<typename _Tp, int n> inline v_reg<_Tp, n> \
    operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> res; \
    for( int lane = 0; lane != n; ++lane ) \
        res.s[lane] = saturate_cast<_Tp>(a.s[lane] bin_op b.s[lane]); \
    return res; \
}
|
|
/** @brief Add values
|
|
For all types. */
|
OPENCV_HAL_IMPL_BIN_OP(+)
|
|
/** @brief Subtract values
|
|
For all types. */
|
OPENCV_HAL_IMPL_BIN_OP(-)
|
|
/** @brief Multiply values
|
|
For 16- and 32-bit integer types and floating types. */
|
OPENCV_HAL_IMPL_BIN_OP(*)
|
|
/** @brief Divide values
|
|
For floating types only. */
|
OPENCV_HAL_IMPL_BIN_OP(/)
|
|
//! @brief Helper macro: defines the compound (`op=`) and the plain (`op`) bitwise operator.
//! Lanes are converted with V_TypeTraits::reinterpret_int, combined, and converted back
//! with V_TypeTraits::reinterpret_from_int.
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
    bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    typedef V_TypeTraits<_Tp> traits; \
    typedef typename traits::int_type int_t; \
    for( int lane = 0; lane != n; ++lane ) \
        a.s[lane] = traits::reinterpret_from_int((int_t)(traits::reinterpret_int(a.s[lane]) bit_op \
                                                         traits::reinterpret_int(b.s[lane]))); \
    return a; \
} \
template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
    (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    typedef V_TypeTraits<_Tp> traits; \
    typedef typename traits::int_type int_t; \
    v_reg<_Tp, n> res; \
    for( int lane = 0; lane != n; ++lane ) \
        res.s[lane] = traits::reinterpret_from_int((int_t)(traits::reinterpret_int(a.s[lane]) bit_op \
                                                           traits::reinterpret_int(b.s[lane]))); \
    return res; \
}
|
|
/** @brief Bitwise AND
|
|
Only for integer types. */
|
OPENCV_HAL_IMPL_BIT_OP(&)
|
|
/** @brief Bitwise OR
|
|
Only for integer types. */
|
OPENCV_HAL_IMPL_BIT_OP(|)
|
|
/** @brief Bitwise XOR
|
|
Only for integer types.*/
|
OPENCV_HAL_IMPL_BIT_OP(^)
|
|
/** @brief Bitwise NOT
|
|
Only for integer types.*/
|
template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a)
|
{
|
v_reg<_Tp, n> c;
|
for( int i = 0; i < n; i++ )
|
{
|
c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i]));
|
}
|
return c;
|
}
|
|
//! @brief Helper macro: defines `func(a)` that applies `cfunc` to every lane of `a`
//! and returns a register with lanes of type `_Tp2`.
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
{ \
    v_reg<_Tp2, n> res; \
    for( int lane = 0; lane != n; ++lane ) \
        res.s[lane] = cfunc(a.s[lane]); \
    return res; \
}
|
|
/** @brief Square root of elements
|
|
Only for floating point types.*/
|
OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
|
|
//! @cond IGNORED
|
OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
|
OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
|
OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
|
OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
|
//! @endcond
|
|
/** @brief Absolute value of elements
|
|
Only for floating point types.*/
|
OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
|
typename V_TypeTraits<_Tp>::abs_type)
|
|
/** @brief Round elements
|
|
Only for floating point types.*/
|
OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int)
|
|
/** @brief Floor elements
|
|
Only for floating point types.*/
|
OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int)
|
|
/** @brief Ceil elements
|
|
Only for floating point types.*/
|
OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int)
|
|
/** @brief Truncate elements
|
|
Only for floating point types.*/
|
OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int)
|
|
//! @brief Helper macro: defines `func(a, b)` whose lane k is `cfunc(a[k], b[k])`.
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> res; \
    for( int lane = 0; lane != n; ++lane ) \
        res.s[lane] = cfunc(a.s[lane], b.s[lane]); \
    return res; \
}
|
|
//! @brief Helper macro: defines `func(a)` that folds all lanes of `a` with `cfunc`,
//! starting from lane 0, and returns the single resulting value.
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \
template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
{ \
    _Tp best = a.s[0]; \
    for( int lane = 1; lane < n; ++lane ) \
        best = cfunc(best, a.s[lane]); \
    return best; \
}
|
|
/** @brief Choose min values for each pair
|
|
Scheme:
|
@code
|
{A1 A2 ...}
|
{B1 B2 ...}
|
--------------
|
{min(A1,B1) min(A2,B2) ...}
|
@endcode
|
For all types except 64-bit integer. */
|
OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min)
|
|
/** @brief Choose max values for each pair
|
|
Scheme:
|
@code
|
{A1 A2 ...}
|
{B1 B2 ...}
|
--------------
|
{max(A1,B1) max(A2,B2) ...}
|
@endcode
|
For all types except 64-bit integer. */
|
OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max)
|
|
/** @brief Find one min value
|
|
Scheme:
|
@code
|
{A1 A2 A3 ...} => min(A1,A2,A3,...)
|
@endcode
|
For 32-bit integer and 32-bit floating point types. */
|
OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min)
|
|
/** @brief Find one max value
|
|
Scheme:
|
@code
|
{A1 A2 A3 ...} => max(A1,A2,A3,...)
|
@endcode
|
For 32-bit integer and 32-bit floating point types. */
|
OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)
|
|
// Lookup table: popCountTable[v] is the number of set bits in the byte value v (0..255).
// Row r (16 entries per row) equals row 0 plus the number of set bits in r.
static const unsigned char popCountTable[] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
|
/** @brief Count the 1 bits in the vector and return 4 values
|
|
Scheme:
|
@code
|
{A1 A2 A3 ...} => popcount(A1)
|
@endcode
|
Any types but result will be in v_uint32x4*/
|
template<typename _Tp, int n> inline v_uint32x4 v_popcount(const v_reg<_Tp, n>& a)
|
{
|
v_uint8x16 b;
|
b = v_reinterpret_as_u8(a);
|
for( int i = 0; i < v_uint8x16::nlanes; i++ )
|
{
|
b.s[i] = popCountTable[b.s[i]];
|
}
|
v_uint32x4 c;
|
for( int i = 0; i < v_uint32x4::nlanes; i++ )
|
{
|
c.s[i] = b.s[i*4] + b.s[i*4+1] + b.s[i*4+2] + b.s[i*4+3];
|
}
|
return c;
|
}
|
|
|
//! @cond IGNORED
|
template<typename _Tp, int n>
|
inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
|
v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
|
{
|
for( int i = 0; i < n; i++ )
|
{
|
minval.s[i] = std::min(a.s[i], b.s[i]);
|
maxval.s[i] = std::max(a.s[i], b.s[i]);
|
}
|
}
|
//! @endcond
|
|
//! @brief Helper macro: defines a lane-wise comparison operator. A lane of the result has
//! all bits set (integer value -1 reinterpreted as `_Tp`) when the comparison holds, and
//! all bits clear otherwise.
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
template<typename _Tp, int n> \
inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    typedef typename V_TypeTraits<_Tp>::int_type mask_t; \
    v_reg<_Tp, n> res; \
    for( int lane = 0; lane != n; ++lane ) \
    { \
        const int hit = (int)(a.s[lane] cmp_op b.s[lane]); \
        res.s[lane] = V_TypeTraits<_Tp>::reinterpret_from_int((mask_t)-hit); \
    } \
    return res; \
}
|
|
/** @brief Less-than comparison
|
|
For all types except 64-bit integer values. */
|
OPENCV_HAL_IMPL_CMP_OP(<)
|
|
/** @brief Greater-than comparison
|
|
For all types except 64-bit integer values. */
|
OPENCV_HAL_IMPL_CMP_OP(>)
|
|
/** @brief Less-than or equal comparison
|
|
For all types except 64-bit integer values. */
|
OPENCV_HAL_IMPL_CMP_OP(<=)
|
|
/** @brief Greater-than or equal comparison
|
|
For all types except 64-bit integer values. */
|
OPENCV_HAL_IMPL_CMP_OP(>=)
|
|
/** @brief Equal comparison
|
|
For all types except 64-bit integer values. */
|
OPENCV_HAL_IMPL_CMP_OP(==)
|
|
/** @brief Not equal comparison
|
|
For all types except 64-bit integer values. */
|
OPENCV_HAL_IMPL_CMP_OP(!=)
|
|
template<int n>
|
inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
|
{
|
typedef typename V_TypeTraits<float>::int_type itype;
|
v_reg<float, n> c;
|
for (int i = 0; i < n; i++)
|
c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
|
return c;
|
}
|
template<int n>
|
inline v_reg<double, n> v_not_nan(const v_reg<double, n>& a)
|
{
|
typedef typename V_TypeTraits<double>::int_type itype;
|
v_reg<double, n> c;
|
for (int i = 0; i < n; i++)
|
c.s[i] = V_TypeTraits<double>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
|
return c;
|
}
|
|
//! @brief Helper macro: defines `func(a, b)` whose lane k is `cast_op(a[k] bin_op b[k])`;
//! the result register has lanes of type `_Tp2`.
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \
template<typename _Tp, int n> \
inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    typedef _Tp2 out_t; \
    v_reg<out_t, n> res; \
    for( int lane = 0; lane != n; ++lane ) \
        res.s[lane] = cast_op(a.s[lane] bin_op b.s[lane]); \
    return res; \
}
|
|
/** @brief Add values without saturation
|
|
For 8- and 16-bit integer values. */
|
OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp)
|
|
/** @brief Subtract values without saturation
|
|
For 8- and 16-bit integer values. */
|
OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp)
|
|
/** @brief Multiply values without saturation
|
|
For 8- and 16-bit integer values. */
|
OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp)
|
|
//! @cond IGNORED
// Returns the larger argument minus the smaller one, converted back to T.
// The subtraction is always "larger - smaller", so it is safe for unsigned T.
template<typename T> inline T _absdiff(T a, T b)
{
    return a > b ? a - b : b - a;
}
//! @endcond
|
|
/** @brief Absolute difference
|
|
Returns \f$ |a - b| \f$ converted to corresponding unsigned type.
|
Example:
|
@code{.cpp}
|
v_int32x4 a, b; // {1, 2, 3, 4} and {4, 3, 2, 1}
|
v_uint32x4 c = v_absdiff(a, b); // result is {3, 1, 1, 3}
|
@endcode
|
For 8-, 16-, 32-bit integer source types. */
|
template<typename _Tp, int n>
|
inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b)
|
{
|
typedef typename V_TypeTraits<_Tp>::abs_type rtype;
|
v_reg<rtype, n> c;
|
const rtype mask = (rtype)(std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0);
|
for( int i = 0; i < n; i++ )
|
{
|
rtype ua = a.s[i] ^ mask;
|
rtype ub = b.s[i] ^ mask;
|
c.s[i] = _absdiff(ua, ub);
|
}
|
return c;
|
}
|
|
/** @overload
|
|
For 32-bit floating point values */
|
inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
|
{
|
v_float32x4 c;
|
for( int i = 0; i < c.nlanes; i++ )
|
c.s[i] = _absdiff(a.s[i], b.s[i]);
|
return c;
|
}
|
|
/** @overload
|
|
For 64-bit floating point values */
|
inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
|
{
|
v_float64x2 c;
|
for( int i = 0; i < c.nlanes; i++ )
|
c.s[i] = _absdiff(a.s[i], b.s[i]);
|
return c;
|
}
|
|
/** @brief Saturating absolute difference
|
|
Returns \f$ saturate(|a - b|) \f$ .
|
For 8-, 16-bit signed integer source types. */
|
template<typename _Tp, int n>
|
inline v_reg<_Tp, n> v_absdiffs(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
|
{
|
v_reg<_Tp, n> c;
|
for( int i = 0; i < n; i++)
|
c.s[i] = saturate_cast<_Tp>(std::abs(a.s[i] - b.s[i]));
|
return c;
|
}
|
|
/** @brief Inversed square root
|
|
Returns \f$ 1/sqrt(a) \f$
|
For floating point types only. */
|
template<typename _Tp, int n>
|
inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
|
{
|
v_reg<_Tp, n> c;
|
for( int i = 0; i < n; i++ )
|
c.s[i] = 1.f/std::sqrt(a.s[i]);
|
return c;
|
}
|
|
/** @brief Magnitude
|
|
Returns \f$ sqrt(a^2 + b^2) \f$
|
For floating point types only. */
|
template<typename _Tp, int n>
|
inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
|
{
|
v_reg<_Tp, n> c;
|
for( int i = 0; i < n; i++ )
|
c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]);
|
return c;
|
}
|
|
/** @brief Square of the magnitude
|
|
Returns \f$ a^2 + b^2 \f$
|
For floating point types only. */
|
template<typename _Tp, int n>
|
inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
|
{
|
v_reg<_Tp, n> c;
|
for( int i = 0; i < n; i++ )
|
c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i];
|
return c;
|
}
|
|
/** @brief Multiply and add
|
|
Returns \f$ a*b + c \f$
|
For floating point types and signed 32bit int only. */
|
template<typename _Tp, int n>
|
inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
|
const v_reg<_Tp, n>& c)
|
{
|
v_reg<_Tp, n> d;
|
for( int i = 0; i < n; i++ )
|
d.s[i] = a.s[i]*b.s[i] + c.s[i];
|
return d;
|
}
|
|
/** @brief A synonym for v_fma */
|
template<typename _Tp, int n>
|
inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
|
const v_reg<_Tp, n>& c)
|
{
|
return v_fma(a, b, c);
|
}
|
|
/** @brief Dot product of elements
|
|
Multiply values in two registers and sum adjacent result pairs.
|
Scheme:
|
@code
|
{A1 A2 ...} // 16-bit
|
x {B1 B2 ...} // 16-bit
|
-------------
|
{A1B1+A2B2 ...} // 32-bit
|
@endcode
|
Implemented only for 16-bit signed source type (v_int16x8).
|
*/
|
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
|
v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
|
{
|
typedef typename V_TypeTraits<_Tp>::w_type w_type;
|
v_reg<w_type, n/2> c;
|
for( int i = 0; i < (n/2); i++ )
|
c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1];
|
return c;
|
}
|
|
/** @brief Dot product of elements
|
|
Same as cv::v_dotprod, but add a third element to the sum of adjacent pairs.
|
Scheme:
|
@code
|
{A1 A2 ...} // 16-bit
|
x {B1 B2 ...} // 16-bit
|
-------------
|
{A1B1+A2B2+C1 ...} // 32-bit
|
|
@endcode
|
Implemented only for 16-bit signed source type (v_int16x8).
|
*/
|
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
|
v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
|
{
|
typedef typename V_TypeTraits<_Tp>::w_type w_type;
|
v_reg<w_type, n/2> s;
|
for( int i = 0; i < (n/2); i++ )
|
s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i];
|
return s;
|
}
|
|
/** @brief Multiply and expand
|
|
Multiply values two registers and store results in two registers with wider pack type.
|
Scheme:
|
@code
|
{A B C D} // 32-bit
|
x {E F G H} // 32-bit
|
---------------
|
{AE BF} // 64-bit
|
{CG DH} // 64-bit
|
@endcode
|
Example:
|
@code{.cpp}
|
v_uint32x4 a, b; // {1,2,3,4} and {2,2,2,2}
|
v_uint64x2 c, d; // results
|
v_mul_expand(a, b, c, d); // c, d = {2,4}, {6, 8}
|
@endcode
|
Implemented only for 16- and unsigned 32-bit source types (v_int16x8, v_uint16x8, v_uint32x4).
|
*/
|
template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
|
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
|
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
|
{
|
typedef typename V_TypeTraits<_Tp>::w_type w_type;
|
for( int i = 0; i < (n/2); i++ )
|
{
|
c.s[i] = (w_type)a.s[i]*b.s[i];
|
d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
|
}
|
}
|
|
/** @brief Multiply and extract high part
|
|
Multiply values two registers and store high part of the results.
|
Implemented only for 16-bit source types (v_int16x8, v_uint16x8). Returns \f$ a*b >> 16 \f$
|
*/
|
template<typename _Tp, int n> inline v_reg<_Tp, n> v_mul_hi(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
|
{
|
typedef typename V_TypeTraits<_Tp>::w_type w_type;
|
v_reg<_Tp, n> c;
|
for (int i = 0; i < n; i++)
|
c.s[i] = (_Tp)(((w_type)a.s[i] * b.s[i]) >> sizeof(_Tp)*8);
|
return c;
|
}
|
|
//! @cond IGNORED
|
template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
|
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
|
{
|
typedef typename V_TypeTraits<_Tp>::w_type w_type;
|
for( int i = 0; i < (n/2); i++ )
|
{
|
c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
|
}
|
}
|
//! @endcond
|
|
//! @brief Helper macro: defines a shift operator that shifts every lane of `a` by `imm`
//! bits and casts the result back to the lane type.
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
{ \
    v_reg<_Tp, n> res; \
    for( int lane = 0; lane != n; ++lane ) \
        res.s[lane] = (_Tp)(a.s[lane] shift_op imm); \
    return res; \
}
|
|
/** @brief Bitwise shift left
|
|
For 16-, 32- and 64-bit integer values. */
|
OPENCV_HAL_IMPL_SHIFT_OP(<< )
|
|
/** @brief Bitwise shift right
|
|
For 16-, 32- and 64-bit integer values. */
|
OPENCV_HAL_IMPL_SHIFT_OP(>> )
|
|
/** @brief Element (lane) shift within a vector

Defines two `v_rotate_<suffix><imm>` overloads.
- One register: result lane i is taken from `a[i opA imm]`; lanes whose source index
  falls outside the register are set to 0.
- Two registers: result lane i is taken from `b[i opA imm opB n]` when that index is
  inside the register, otherwise from `a[i opA imm]` when that one is, otherwise 0.

For all types */
#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \
template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \
{ \
    v_reg<_Tp, n> res; \
    for (int dst = 0; dst < n; ++dst) \
    { \
        const int src = dst opA imm; \
        const bool inside = (src >= 0) && (src < n); \
        if (!inside) \
        { \
            res.s[dst] = 0; \
            continue; \
        } \
        res.s[dst] = a.s[src]; \
    } \
    return res; \
} \
template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> res; \
    for (int dst = 0; dst < n; ++dst) \
    { \
        const int fromA = dst opA imm; \
        const int fromB = dst opA imm opB n; \
        if (fromB >= 0 && fromB < n) \
            res.s[dst] = b.s[fromB]; \
        else if (fromA >= 0 && fromA < n) \
            res.s[dst] = a.s[fromA]; \
        else \
            res.s[dst] = 0; \
    } \
    return res; \
}
|
|
// v_rotate_left<imm>: source index is (i - imm); the two-register form uses (i - imm + n) for b.
OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left,  -, +)
// v_rotate_right<imm>: source index is (i + imm); the two-register form uses (i + imm - n) for b.
OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -)
|
|
/** @brief Sum packed values
|
|
Scheme:
|
@code
|
{A1 A2 A3 ...} => sum{A1,A2,A3,...}
|
@endcode
|
For 32-bit integer and 32-bit floating point types.*/
|
template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
|
{
|
typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
|
for( int i = 1; i < n; i++ )
|
c += a.s[i];
|
return c;
|
}
|
|
/** @brief Sums all elements of each input vector, returns the vector of sums
|
|
Scheme:
|
@code
|
result[0] = a[0] + a[1] + a[2] + a[3]
|
result[1] = b[0] + b[1] + b[2] + b[3]
|
result[2] = c[0] + c[1] + c[2] + c[3]
|
result[3] = d[0] + d[1] + d[2] + d[3]
|
@endcode
|
*/
|
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
|
const v_float32x4& c, const v_float32x4& d)
|
{
|
v_float32x4 r;
|
r.s[0] = a.s[0] + a.s[1] + a.s[2] + a.s[3];
|
r.s[1] = b.s[0] + b.s[1] + b.s[2] + b.s[3];
|
r.s[2] = c.s[0] + c.s[1] + c.s[2] + c.s[3];
|
r.s[3] = d.s[0] + d.s[1] + d.s[2] + d.s[3];
|
return r;
|
}
|
|
/** @brief Sum absolute differences of values
|
|
Scheme:
|
@code
|
{A1 A2 A3 ...} {B1 B2 B3 ...} => sum{ABS(A1-B1),abs(A2-B2),abs(A3-B3),...}
|
@endcode
|
For all types except 64-bit types.*/
|
template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
|
{
|
typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type c = _absdiff(a.s[0], b.s[0]);
|
for (int i = 1; i < n; i++)
|
c += _absdiff(a.s[i], b.s[i]);
|
return c;
|
}
|
|
/** @brief Get negative values mask
|
|
Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
|
Example:
|
@code{.cpp}
|
v_int32x4 r; // set to {-1, -1, 1, 1}
|
int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011
|
@endcode
|
For all types except 64-bit. */
|
template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
|
{
|
int mask = 0;
|
for( int i = 0; i < n; i++ )
|
mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i;
|
return mask;
|
}
|
|
/** @brief Check if all packed values are less than zero
|
|
Unsigned values will be casted to signed: `uchar 254 => char -2`.
|
For all types except 64-bit. */
|
template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
|
{
|
for( int i = 0; i < n; i++ )
|
if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 )
|
return false;
|
return true;
|
}
|
|
/** @brief Check if any of packed values is less than zero
|
|
Unsigned values will be casted to signed: `uchar 254 => char -2`.
|
For all types except 64-bit. */
|
template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
|
{
|
for( int i = 0; i < n; i++ )
|
if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 )
|
return true;
|
return false;
|
}
|
|
/** @brief Per-element select (blend operation)
|
|
Return value will be built by combining values _a_ and _b_ using the following scheme:
|
result[i] = mask[i] ? a[i] : b[i];
|
|
@note: _mask_ element values are restricted to these values:
|
- 0: select element from _b_
|
- 0xff/0xffff/etc: select element from _a_
|
(fully compatible with bitwise-based operator)
|
*/
|
template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
|
const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
|
{
|
typedef V_TypeTraits<_Tp> Traits;
|
typedef typename Traits::int_type int_type;
|
v_reg<_Tp, n> c;
|
for( int i = 0; i < n; i++ )
|
{
|
int_type m = Traits::reinterpret_int(mask.s[i]);
|
CV_DbgAssert(m == 0 || m == (~(int_type)0)); // restrict mask values: 0 or 0xff/0xffff/etc
|
c.s[i] = m ? a.s[i] : b.s[i];
|
}
|
return c;
|
}
|
|
/** @brief Expand values to the wider pack type
|
|
Copy contents of register to two registers with 2x wider pack type.
|
Scheme:
|
@code
|
int32x4 int64x2 int64x2
|
{A B C D} ==> {A B} , {C D}
|
@endcode */
|
template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
|
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
|
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
|
{
|
for( int i = 0; i < (n/2); i++ )
|
{
|
b0.s[i] = a.s[i];
|
b1.s[i] = a.s[i+(n/2)];
|
}
|
}
|
|
/** @brief Expand lower values to the wider pack type
|
|
Same as cv::v_expand, but return lower half of the vector.
|
|
Scheme:
|
@code
|
int32x4 int64x2
|
{A B C D} ==> {A B}
|
@endcode */
|
template<typename _Tp, int n>
|
inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
|
v_expand_low(const v_reg<_Tp, n>& a)
|
{
|
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
|
for( int i = 0; i < (n/2); i++ )
|
b.s[i] = a.s[i];
|
return b;
|
}
|
|
/** @brief Expand higher values to the wider pack type
|
|
Same as cv::v_expand_low, but expand higher half of the vector instead.
|
|
Scheme:
|
@code
|
int32x4 int64x2
|
{A B C D} ==> {C D}
|
@endcode */
|
template<typename _Tp, int n>
|
inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
|
v_expand_high(const v_reg<_Tp, n>& a)
|
{
|
v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
|
for( int i = 0; i < (n/2); i++ )
|
b.s[i] = a.s[i+(n/2)];
|
return b;
|
}
|
|
//! @cond IGNORED
|
// Applies V_TypeTraits<_Tp>::reinterpret_int to every lane and returns the
// results as a vector of the matching int_type with the same lane count.
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
    v_reinterpret_as_int(const v_reg<_Tp, n>& a)
{
    typedef V_TypeTraits<_Tp> Traits;
    v_reg<typename Traits::int_type, n> out;
    for( int lane = 0; lane < n; ++lane )
    {
        out.s[lane] = Traits::reinterpret_int(a.s[lane]);
    }
    return out;
}
|
|
// Applies V_TypeTraits<_Tp>::reinterpret_uint to every lane and returns the
// results as a vector of the matching uint_type with the same lane count.
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
    v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
{
    typedef V_TypeTraits<_Tp> Traits;
    v_reg<typename Traits::uint_type, n> out;
    for( int lane = 0; lane < n; ++lane )
    {
        out.s[lane] = Traits::reinterpret_uint(a.s[lane]);
    }
    return out;
}
|
//! @endcond
|
|
/** @brief Interleave two vectors
|
|
Scheme:
|
@code
|
{A1 A2 A3 A4}
|
{B1 B2 B3 B4}
|
---------------
|
{A1 B1 A2 B2} and {A3 B3 A4 B4}
|
@endcode
|
For all types except 64-bit.
|
*/
|
template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
|
v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
|
{
|
int i;
|
for( i = 0; i < n/2; i++ )
|
{
|
b0.s[i*2] = a0.s[i];
|
b0.s[i*2+1] = a1.s[i];
|
}
|
for( ; i < n; i++ )
|
{
|
b1.s[i*2-n] = a0.s[i];
|
b1.s[i*2-n+1] = a1.s[i];
|
}
|
}
|
|
/** @brief Load register contents from memory
|
|
@param ptr pointer to memory block with data
|
@return register object
|
|
@note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
|
*/
|
template<typename _Tp>
|
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load(const _Tp* ptr)
|
{
|
return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
|
}
|
|
/** @brief Load register contents from memory (aligned)
|
|
similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary)
|
*/
|
template<typename _Tp>
|
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_aligned(const _Tp* ptr)
|
{
|
return v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128>(ptr);
|
}
|
|
/** @brief Load 64-bits of data to lower part (high part is undefined).
|
|
@param ptr memory block containing data for first half (0..n/2)
|
|
@code{.cpp}
|
int lo[2] = { 1, 2 };
|
v_int32x4 r = v_load_low(lo);
|
@endcode
|
*/
|
template<typename _Tp>
|
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_low(const _Tp* ptr)
|
{
|
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
|
for( int i = 0; i < c.nlanes/2; i++ )
|
{
|
c.s[i] = ptr[i];
|
}
|
return c;
|
}
|
|
/** @brief Load register contents from two memory blocks
|
|
@param loptr memory block containing data for first half (0..n/2)
|
@param hiptr memory block containing data for second half (n/2..n)
|
|
@code{.cpp}
|
int lo[2] = { 1, 2 }, hi[2] = { 3, 4 };
|
v_int32x4 r = v_load_halves(lo, hi);
|
@endcode
|
*/
|
template<typename _Tp>
|
inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
|
{
|
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
|
for( int i = 0; i < c.nlanes/2; i++ )
|
{
|
c.s[i] = loptr[i];
|
c.s[i+c.nlanes/2] = hiptr[i];
|
}
|
return c;
|
}
|
|
/** @brief Load register contents from memory with double expand
|
|
Same as cv::v_load, but result pack type will be 2x wider than memory type.
|
|
@code{.cpp}
|
short buf[4] = {1, 2, 3, 4}; // type is int16
|
v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
|
@endcode
|
For 8-, 16-, 32-bit integer source types. */
|
template<typename _Tp>
|
inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_TypeTraits<_Tp>::nlanes128 / 2>
|
v_load_expand(const _Tp* ptr)
|
{
|
typedef typename V_TypeTraits<_Tp>::w_type w_type;
|
v_reg<w_type, V_TypeTraits<w_type>::nlanes128> c;
|
for( int i = 0; i < c.nlanes; i++ )
|
{
|
c.s[i] = ptr[i];
|
}
|
return c;
|
}
|
|
/** @brief Load register contents from memory with quad expand
|
|
Same as cv::v_load_expand, but result type is 4 times wider than source.
|
@code{.cpp}
|
char buf[4] = {1, 2, 3, 4}; // type is int8
|
v_int32x4 r = v_load_q(buf); // r = {1, 2, 3, 4} - type is int32
|
@endcode
|
For 8-bit integer source types. */
|
template<typename _Tp>
|
inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_TypeTraits<_Tp>::nlanes128 / 4>
|
v_load_expand_q(const _Tp* ptr)
|
{
|
typedef typename V_TypeTraits<_Tp>::q_type q_type;
|
v_reg<q_type, V_TypeTraits<q_type>::nlanes128> c;
|
for( int i = 0; i < c.nlanes; i++ )
|
{
|
c.s[i] = ptr[i];
|
}
|
return c;
|
}
|
|
/** @brief Load and deinterleave (2 channels)
|
|
Load data from memory deinterleave and store to 2 registers.
|
Scheme:
|
@code
|
{A1 B1 A2 B2 ...} ==> {A1 A2 ...}, {B1 B2 ...}
|
@endcode
|
For all types except 64-bit. */
|
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
|
v_reg<_Tp, n>& b)
|
{
|
int i, i2;
|
for( i = i2 = 0; i < n; i++, i2 += 2 )
|
{
|
a.s[i] = ptr[i2];
|
b.s[i] = ptr[i2+1];
|
}
|
}
|
|
/** @brief Load and deinterleave (3 channels)
|
|
Load data from memory deinterleave and store to 3 registers.
|
Scheme:
|
@code
|
{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
|
@endcode
|
For all types except 64-bit. */
|
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
|
v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
|
{
|
int i, i3;
|
for( i = i3 = 0; i < n; i++, i3 += 3 )
|
{
|
a.s[i] = ptr[i3];
|
b.s[i] = ptr[i3+1];
|
c.s[i] = ptr[i3+2];
|
}
|
}
|
|
/** @brief Load and deinterleave (4 channels)
|
|
Load data from memory deinterleave and store to 4 registers.
|
Scheme:
|
@code
|
{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
|
@endcode
|
For all types except 64-bit. */
|
template<typename _Tp, int n>
|
inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
|
v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
|
v_reg<_Tp, n>& d)
|
{
|
int i, i4;
|
for( i = i4 = 0; i < n; i++, i4 += 4 )
|
{
|
a.s[i] = ptr[i4];
|
b.s[i] = ptr[i4+1];
|
c.s[i] = ptr[i4+2];
|
d.s[i] = ptr[i4+3];
|
}
|
}
|
|
/** @brief Interleave and store (2 channels)
|
|
Interleave and store data from 2 registers to memory.
|
Scheme:
|
@code
|
{A1 A2 ...}, {B1 B2 ...} ==> {A1 B1 A2 B2 ...}
|
@endcode
|
For all types except 64-bit. */
|
template<typename _Tp, int n>
|
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
|
const v_reg<_Tp, n>& b,
|
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
|
{
|
int i, i2;
|
for( i = i2 = 0; i < n; i++, i2 += 2 )
|
{
|
ptr[i2] = a.s[i];
|
ptr[i2+1] = b.s[i];
|
}
|
}
|
|
/** @brief Interleave and store (3 channels)
|
|
Interleave and store data from 3 registers to memory.
|
Scheme:
|
@code
|
{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
|
@endcode
|
For all types except 64-bit. */
|
template<typename _Tp, int n>
|
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
|
const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
|
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
|
{
|
int i, i3;
|
for( i = i3 = 0; i < n; i++, i3 += 3 )
|
{
|
ptr[i3] = a.s[i];
|
ptr[i3+1] = b.s[i];
|
ptr[i3+2] = c.s[i];
|
}
|
}
|
|
/** @brief Interleave and store (4 channels)
|
|
Interleave and store data from 4 registers to memory.
|
Scheme:
|
@code
|
{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
|
@endcode
|
For all types except 64-bit. */
|
template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
|
const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
|
const v_reg<_Tp, n>& d,
|
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
|
{
|
int i, i4;
|
for( i = i4 = 0; i < n; i++, i4 += 4 )
|
{
|
ptr[i4] = a.s[i];
|
ptr[i4+1] = b.s[i];
|
ptr[i4+2] = c.s[i];
|
ptr[i4+3] = d.s[i];
|
}
|
}
|
|
/** @brief Store data to memory
|
|
Store register contents to memory.
|
Scheme:
|
@code
|
REG {A B C D} ==> MEM {A B C D}
|
@endcode
|
Pointer can be unaligned. */
|
template<typename _Tp, int n>
|
inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
|
{
|
for( int i = 0; i < n; i++ )
|
ptr[i] = a.s[i];
|
}
|
|
/** @brief Store data to memory (lower half)
|
|
Store lower half of register contents to memory.
|
Scheme:
|
@code
|
REG {A B C D} ==> MEM {A B}
|
@endcode */
|
template<typename _Tp, int n>
|
inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
|
{
|
for( int i = 0; i < (n/2); i++ )
|
ptr[i] = a.s[i];
|
}
|
|
/** @brief Store data to memory (higher half)
|
|
Store higher half of register contents to memory.
|
Scheme:
|
@code
|
REG {A B C D} ==> MEM {C D}
|
@endcode */
|
template<typename _Tp, int n>
|
inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
|
{
|
for( int i = 0; i < (n/2); i++ )
|
ptr[i] = a.s[i+(n/2)];
|
}
|
|
/** @brief Store data to memory (aligned)
|
|
Store register contents to memory.
|
Scheme:
|
@code
|
REG {A B C D} ==> MEM {A B C D}
|
@endcode
|
Pointer __should__ be aligned by 16-byte boundary. */
|
template<typename _Tp, int n>
|
inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
|
{
|
for( int i = 0; i < n; i++ )
|
ptr[i] = a.s[i];
|
}
|
|
template<typename _Tp, int n>
|
inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)
|
{
|
for( int i = 0; i < n; i++ )
|
ptr[i] = a.s[i];
|
}
|
|
template<typename _Tp, int n>
|
inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
|
{
|
for( int i = 0; i < n; i++ )
|
ptr[i] = a.s[i];
|
}
|
|
/** @brief Combine vector from first elements of two vectors
|
|
Scheme:
|
@code
|
{A1 A2 A3 A4}
|
{B1 B2 B3 B4}
|
---------------
|
{A1 A2 B1 B2}
|
@endcode
|
For all types except 64-bit. */
|
template<typename _Tp, int n>
|
inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
|
{
|
v_reg<_Tp, n> c;
|
for( int i = 0; i < (n/2); i++ )
|
{
|
c.s[i] = a.s[i];
|
c.s[i+(n/2)] = b.s[i];
|
}
|
return c;
|
}
|
|
/** @brief Combine vector from last elements of two vectors
|
|
Scheme:
|
@code
|
{A1 A2 A3 A4}
|
{B1 B2 B3 B4}
|
---------------
|
{A3 A4 B3 B4}
|
@endcode
|
For all types except 64-bit. */
|
template<typename _Tp, int n>
|
inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
|
{
|
v_reg<_Tp, n> c;
|
for( int i = 0; i < (n/2); i++ )
|
{
|
c.s[i] = a.s[i+(n/2)];
|
c.s[i+(n/2)] = b.s[i+(n/2)];
|
}
|
return c;
|
}
|
|
/** @brief Combine two vectors from lower and higher parts of two other vectors
|
|
@code{.cpp}
|
low = cv::v_combine_low(a, b);
|
high = cv::v_combine_high(a, b);
|
@endcode */
|
template<typename _Tp, int n>
|
inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
|
v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
|
{
|
for( int i = 0; i < (n/2); i++ )
|
{
|
low.s[i] = a.s[i];
|
low.s[i+(n/2)] = b.s[i];
|
high.s[i] = a.s[i+(n/2)];
|
high.s[i+(n/2)] = b.s[i+(n/2)];
|
}
|
}
|
|
/** @brief Vector extract
|
|
Scheme:
|
@code
|
{A1 A2 A3 A4}
|
{B1 B2 B3 B4}
|
========================
|
shift = 1 {A2 A3 A4 B1}
|
shift = 2 {A3 A4 B1 B2}
|
shift = 3 {A4 B1 B2 B3}
|
@endcode
|
Restriction: 0 <= shift < nlanes
|
|
Usage:
|
@code
|
v_int32x4 a, b, c;
|
c = v_extract<2>(a, b);
|
@endcode
|
For all types. */
|
template<int s, typename _Tp, int n>
|
inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
|
{
|
v_reg<_Tp, n> r;
|
const int shift = n - s;
|
int i = 0;
|
for (; i < shift; ++i)
|
r.s[i] = a.s[i+s];
|
for (; i < n; ++i)
|
r.s[i] = b.s[i-shift];
|
return r;
|
}
|
|
/** @brief Round
|
|
Rounds each value. Input type is float vector ==> output type is int vector.*/
|
template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
|
{
|
v_reg<int, n> c;
|
for( int i = 0; i < n; i++ )
|
c.s[i] = cvRound(a.s[i]);
|
return c;
|
}
|
|
/** @overload */
|
template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a, const v_reg<double, n>& b)
|
{
|
v_reg<int, n*2> c;
|
for( int i = 0; i < n; i++ )
|
{
|
c.s[i] = cvRound(a.s[i]);
|
c.s[i+n] = cvRound(b.s[i]);
|
}
|
return c;
|
}
|
|
/** @brief Floor
|
|
Floor each value. Input type is float vector ==> output type is int vector.*/
|
template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
|
{
|
v_reg<int, n> c;
|
for( int i = 0; i < n; i++ )
|
c.s[i] = cvFloor(a.s[i]);
|
return c;
|
}
|
|
/** @brief Ceil
|
|
Ceil each value. Input type is float vector ==> output type is int vector.*/
|
template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
|
{
|
v_reg<int, n> c;
|
for( int i = 0; i < n; i++ )
|
c.s[i] = cvCeil(a.s[i]);
|
return c;
|
}
|
|
/** @brief Trunc
|
|
Truncate each value. Input type is float vector ==> output type is int vector.*/
|
template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
|
{
|
v_reg<int, n> c;
|
for( int i = 0; i < n; i++ )
|
c.s[i] = (int)(a.s[i]);
|
return c;
|
}
|
|
/** @overload */
|
template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
|
{
|
v_reg<int, n*2> c;
|
for( int i = 0; i < n; i++ )
|
{
|
c.s[i] = cvRound(a.s[i]);
|
c.s[i+n] = 0;
|
}
|
return c;
|
}
|
|
/** @overload */
|
template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
|
{
|
v_reg<int, n> c;
|
for( int i = 0; i < n; i++ )
|
{
|
c.s[i] = cvFloor(a.s[i]);
|
c.s[i+n] = 0;
|
}
|
return c;
|
}
|
|
/** @overload */
|
template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
|
{
|
v_reg<int, n> c;
|
for( int i = 0; i < n; i++ )
|
{
|
c.s[i] = cvCeil(a.s[i]);
|
c.s[i+n] = 0;
|
}
|
return c;
|
}
|
|
/** @overload */
|
template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
|
{
|
v_reg<int, n> c;
|
for( int i = 0; i < n; i++ )
|
{
|
c.s[i] = cvCeil(a.s[i]);
|
c.s[i+n] = 0;
|
}
|
return c;
|
}
|
|
/** @brief Convert to float
|
|
Supported input type is cv::v_int32x4. */
|
template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
|
{
|
v_reg<float, n> c;
|
for( int i = 0; i < n; i++ )
|
c.s[i] = (float)a.s[i];
|
return c;
|
}
|
|
// Converts two double vectors to float and packs them into one vector:
// lanes of a first, then lanes of b.
template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, const v_reg<double, n>& b)
{
    v_reg<float, n*2> out;
    for( int k = 0; k < n; ++k )
        out.s[k] = (float)a.s[k];
    for( int k = 0; k < n; ++k )
        out.s[n + k] = (float)b.s[k];
    return out;
}
|
|
/** @brief Convert to double
|
|
Supported input type is cv::v_int32x4. */
|
template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a)
|
{
|
v_reg<double, n> c;
|
for( int i = 0; i < n; i++ )
|
c.s[i] = (double)a.s[i];
|
return c;
|
}
|
|
/** @brief Convert to double
|
|
Supported input type is cv::v_float32x4. */
|
template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
|
{
|
v_reg<double, n> c;
|
for( int i = 0; i < n; i++ )
|
c.s[i] = (double)a.s[i];
|
return c;
|
}
|
|
// Table lookup: result[i] = tab[idx[i]] for int tables.
template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>& idx)
{
    v_reg<int, n> out;
    int lane = 0;
    while( lane < n )
    {
        out.s[lane] = tab[idx.s[lane]];
        ++lane;
    }
    return out;
}
|
|
// Table lookup: result[i] = tab[idx[i]] for float tables.
template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int, n>& idx)
{
    v_reg<float, n> out;
    int lane = 0;
    while( lane < n )
    {
        out.s[lane] = tab[idx.s[lane]];
        ++lane;
    }
    return out;
}
|
|
// Table lookup for double tables: result[i] = tab[idx[i]], using only the
// first n lanes of the 2n-lane index vector.
template<int n> inline v_reg<double, n> v_lut(const double* tab, const v_reg<int, n*2>& idx)
{
    v_reg<double, n> out;
    int lane = 0;
    while( lane < n )
    {
        out.s[lane] = tab[idx.s[lane]];
        ++lane;
    }
    return out;
}
|
|
template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
|
v_reg<float, n>& x, v_reg<float, n>& y)
|
{
|
for( int i = 0; i < n; i++ )
|
{
|
int j = idx.s[i];
|
x.s[i] = tab[j];
|
y.s[i] = tab[j+1];
|
}
|
}
|
|
template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<int, n*2>& idx,
|
v_reg<double, n>& x, v_reg<double, n>& y)
|
{
|
for( int i = 0; i < n; i++ )
|
{
|
int j = idx.s[i];
|
x.s[i] = tab[j];
|
y.s[i] = tab[j+1];
|
}
|
}
|
|
/** @brief Transpose 4x4 matrix
|
|
Scheme:
|
@code
|
a0 {A1 A2 A3 A4}
|
a1 {B1 B2 B3 B4}
|
a2 {C1 C2 C3 C4}
|
a3 {D1 D2 D3 D4}
|
===============
|
b0 {A1 B1 C1 D1}
|
b1 {A2 B2 C2 D2}
|
b2 {A3 B3 C3 D3}
|
b3 {A4 B4 C4 D4}
|
@endcode
|
*/
|
template<typename _Tp>
|
inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
|
const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3,
|
v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1,
|
v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 )
|
{
|
b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]);
|
b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]);
|
b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]);
|
b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]);
|
}
|
|
//! @brief Helper macro
|
//! @ingroup core_hal_intrin_impl
|
#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \
|
inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); }
|
|
//! @name Init with zero
|
//! @{
|
//! @brief Create new vector with zero elements
|
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, uchar, u8)
|
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, schar, s8)
|
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, ushort, u16)
|
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, short, s16)
|
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, unsigned, u32)
|
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, int, s32)
|
OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, float, f32)
|
OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, double, f64)
|
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, uint64, u64)
|
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64)
|
//! @}
|
|
//! @brief Helper macro
|
//! @ingroup core_hal_intrin_impl
|
#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \
|
inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
|
|
//! @name Init with value
|
//! @{
|
//! @brief Create new vector with elements set to a specific value
|
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, u8)
|
OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, s8)
|
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, u16)
|
OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, s16)
|
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, u32)
|
OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, s32)
|
OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, f32)
|
OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, f64)
|
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, u64)
|
OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64)
|
//! @}
|
|
//! @brief Helper macro
|
//! @ingroup core_hal_intrin_impl
|
#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \
|
template<typename _Tp0, int n0> inline _Tpvec \
|
v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
|
{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); }
|
|
//! @name Reinterpret
|
//! @{
|
//! @brief Convert vector to different type without modifying underlying data.
|
OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8)
|
OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8)
|
OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16)
|
OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16)
|
OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32)
|
OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32)
|
OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32)
|
OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64)
|
OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64)
|
OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64)
|
//! @}
|
|
//! @brief Helper macro
|
//! @ingroup core_hal_intrin_impl
|
#define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \
|
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
|
{ return a << n; }
|
|
//! @name Left shift
|
//! @{
|
//! @brief Shift left
|
OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort)
|
OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short)
|
OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned)
|
OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int)
|
OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64)
|
OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64)
|
//! @}
|
|
//! @brief Helper macro
|
//! @ingroup core_hal_intrin_impl
|
#define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \
|
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
|
{ return a >> n; }
|
|
//! @name Right shift
|
//! @{
|
//! @brief Shift right
|
OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort)
|
OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short)
|
OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned)
|
OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int)
|
OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64)
|
OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64)
|
//! @}
|
|
//! @brief Helper macro
|
//! @ingroup core_hal_intrin_impl
|
#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \
|
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
|
{ \
|
_Tpvec c; \
|
for( int i = 0; i < _Tpvec::nlanes; i++ ) \
|
c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
|
return c; \
|
}
|
|
//! @name Rounding shift
|
//! @{
|
//! @brief Rounding shift right
|
OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort)
|
OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short)
|
OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned)
|
OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int)
|
OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64)
|
OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64)
|
//! @}
|
|
//! @brief Helper macro
|
//! @ingroup core_hal_intrin_impl
|
#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix, cast) \
|
inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
|
{ \
|
_Tpnvec c; \
|
for( int i = 0; i < _Tpvec::nlanes; i++ ) \
|
{ \
|
c.s[i] = cast<_Tpn>(a.s[i]); \
|
c.s[i+_Tpvec::nlanes] = cast<_Tpn>(b.s[i]); \
|
} \
|
return c; \
|
}
|
|
//! @name Pack
|
//! @{
|
//! @brief Pack values from two vectors to one
|
//!
|
//! Return vector type have twice more elements than input vector types. Variant with _u_ suffix also
|
//! converts to corresponding unsigned type.
|
//!
|
//! - pack: for 16-, 32- and 64-bit integer input types
|
//! - pack_u: for 16- and 32-bit signed integer input types
|
//!
|
//! @note All variants except 64-bit use saturation.
|
OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack, saturate_cast)
|
OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack, saturate_cast)
|
OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack, saturate_cast)
|
OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack, saturate_cast)
|
OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack, static_cast)
|
OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack, static_cast)
|
OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u, saturate_cast)
|
OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u, saturate_cast)
|
//! @}
|
|
//! @brief Helper macro
|
//! @ingroup core_hal_intrin_impl
|
#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
|
template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
|
{ \
|
_Tpnvec c; \
|
for( int i = 0; i < _Tpvec::nlanes; i++ ) \
|
{ \
|
c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
|
c.s[i+_Tpvec::nlanes] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
|
} \
|
return c; \
|
}
|
|
//! @name Pack with rounding shift
|
//! @{
|
//! @brief Pack values from two vectors to one with rounding shift
|
//!
|
//! Values from the input vectors will be shifted right by _n_ bits with rounding, converted to narrower
|
//! type and returned in the result vector. Variant with _u_ suffix converts to unsigned type.
|
//!
|
//! - pack: for 16-, 32- and 64-bit integer input types
|
//! - pack_u: for 16- and 32-bit signed integer input types
|
//!
|
//! @note All variants except 64-bit use saturation.
|
OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
|
OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
|
OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
|
OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
|
OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
|
OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack, static_cast)
|
OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
|
OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
|
//! @}
|
|
//! @brief Helper macro
|
//! @ingroup core_hal_intrin_impl
|
#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
|
inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
|
{ \
|
for( int i = 0; i < _Tpvec::nlanes; i++ ) \
|
ptr[i] = cast<_Tpn>(a.s[i]); \
|
}
|
|
//! @name Pack and store
|
//! @{
|
//! @brief Store values from the input vector into memory with pack
|
//!
|
//! Values will be stored into memory with conversion to narrower type.
|
//! Variant with _u_ suffix converts to corresponding unsigned type.
|
//!
|
//! - pack: for 16-, 32- and 64-bit integer input types
|
//! - pack_u: for 16- and 32-bit signed integer input types
|
//!
|
//! @note All variants except 64-bit use saturation.
|
OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
|
OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
|
OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
|
OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
|
OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
|
OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast)
|
OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
|
OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
|
//! @}
|
|
//! @brief Helper macro
|
//! @ingroup core_hal_intrin_impl
|
#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
|
template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
|
{ \
|
for( int i = 0; i < _Tpvec::nlanes; i++ ) \
|
ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
|
}
|
|
//! @name Pack and store with rounding shift
|
//! @{
|
//! @brief Store values from the input vector into memory with pack
|
//!
|
//! Values will be shifted _n_ bits right with rounding, converted to narrower type and stored into
|
//! memory. Variant with _u_ suffix converts to unsigned type.
|
//!
|
//! - pack: for 16-, 32- and 64-bit integer input types
|
//! - pack_u: for 16- and 32-bit signed integer input types
|
//!
|
//! @note All variants except 64-bit use saturation.
|
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack, saturate_cast)
|
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack, saturate_cast)
|
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack, saturate_cast)
|
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack, saturate_cast)
|
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack, static_cast)
|
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack, static_cast)
|
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u, saturate_cast)
|
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u, saturate_cast)
|
//! @}
|
|
//! @cond IGNORED
|
template<typename _Tpm, typename _Tp, int n>
|
inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
|
{
|
for (int i = 0; i < n; ++i)
|
{
|
mptr[i] = (_Tpm)a.s[i];
|
mptr[i + n] = (_Tpm)b.s[i];
|
}
|
}
|
//! @endcond
|
|
//! @name Pack boolean values
|
//! @{
|
//! @brief Pack boolean values from multiple vectors to one unsigned 8-bit integer vector
|
//!
|
//! @note Must provide valid boolean values to guarantee same result for all architectures.
|
|
/** @brief
|
//! For 16-bit boolean values
|
|
Scheme:
|
@code
|
a {0xFFFF 0 0 0xFFFF 0 0xFFFF 0xFFFF 0}
|
b {0xFFFF 0 0xFFFF 0 0 0xFFFF 0 0xFFFF}
|
===============
|
{
|
0xFF 0 0 0xFF 0 0xFF 0xFF 0
|
0xFF 0 0xFF 0 0 0xFF 0 0xFF
|
}
|
@endcode */
|
|
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
|
{
|
v_uint8x16 mask;
|
_pack_b(mask.s, a, b);
|
return mask;
|
}
|
|
/** @overload
|
For 32-bit boolean values
|
|
Scheme:
|
@code
|
a {0xFFFF.. 0 0 0xFFFF..}
|
b {0 0xFFFF.. 0xFFFF.. 0}
|
c {0xFFFF.. 0 0xFFFF.. 0}
|
d {0 0xFFFF.. 0 0xFFFF..}
|
===============
|
{
|
0xFF 0 0 0xFF 0 0xFF 0xFF 0
|
0xFF 0 0xFF 0 0 0xFF 0 0xFF
|
}
|
@endcode */
|
|
inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
|
const v_uint32x4& c, const v_uint32x4& d)
|
{
|
v_uint8x16 mask;
|
_pack_b(mask.s, a, b);
|
_pack_b(mask.s + 8, c, d);
|
return mask;
|
}
|
|
/** @overload
|
For 64-bit boolean values
|
|
Scheme:
|
@code
|
a {0xFFFF.. 0}
|
b {0 0xFFFF..}
|
c {0xFFFF.. 0}
|
d {0 0xFFFF..}
|
|
e {0xFFFF.. 0}
|
f {0xFFFF.. 0}
|
g {0 0xFFFF..}
|
h {0 0xFFFF..}
|
===============
|
{
|
0xFF 0 0 0xFF 0xFF 0 0 0xFF
|
0xFF 0 0xFF 0 0 0xFF 0 0xFF
|
}
|
@endcode */
|
inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
|
const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
|
const v_uint64x2& g, const v_uint64x2& h)
|
{
|
v_uint8x16 mask;
|
_pack_b(mask.s, a, b);
|
_pack_b(mask.s + 4, c, d);
|
_pack_b(mask.s + 8, e, f);
|
_pack_b(mask.s + 12, g, h);
|
return mask;
|
}
|
//! @}
|
|
/** @brief Matrix multiplication
|
|
Scheme:
|
@code
|
{A0 A1 A2 A3} |V0|
|
{B0 B1 B2 B3} |V1|
|
{C0 C1 C2 C3} |V2|
|
{D0 D1 D2 D3} x |V3|
|
====================
|
{R0 R1 R2 R3}, where:
|
R0 = A0V0 + A1V1 + A2V2 + A3V3,
|
R1 = B0V0 + B1V1 + B2V2 + B3V3
|
...
|
@endcode
|
*/
|
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
|
const v_float32x4& m1, const v_float32x4& m2,
|
const v_float32x4& m3)
|
{
|
return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0],
|
v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1],
|
v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2],
|
v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
|
}
|
|
/** @brief Matrix multiplication and add
|
|
Scheme:
|
@code
|
{A0 A1 A2 } |V0| |D0|
|
{B0 B1 B2 } |V1| |D1|
|
{C0 C1 C2 } x |V2| + |D2|
|
====================
|
{R0 R1 R2 R3}, where:
|
R0 = A0V0 + A1V1 + A2V2 + D0,
|
R1 = B0V0 + B1V1 + B2V2 + D1
|
...
|
@endcode
|
*/
|
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
|
const v_float32x4& m1, const v_float32x4& m2,
|
const v_float32x4& m3)
|
{
|
return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + m3.s[0],
|
v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + m3.s[1],
|
v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + m3.s[2],
|
v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]);
|
}
|
|
////// FP16 support ///////
|
|
/** @brief Load half-precision values and expand them to a float vector

Reads one float16_t element per lane of the returned register, starting at
@p ptr, and converts each element to float.

@param ptr pointer to the first half-precision value to load
@return register whose lane i holds ptr[i] converted to float
*/
inline v_reg<float, V_TypeTraits<float>::nlanes128>
v_load_expand(const float16_t* ptr)
{
    v_reg<float, V_TypeTraits<float>::nlanes128> widened;
    const float16_t* src = ptr;
    for( int lane = 0; lane < widened.nlanes; ++lane, ++src )
        widened.s[lane] = *src;
    return widened;
}
|
|
inline void
|
v_pack_store(float16_t* ptr, v_reg<float, V_TypeTraits<float>::nlanes128>& v)
|
{
|
for( int i = 0; i < v.nlanes; i++ )
|
{
|
ptr[i] = float16_t(v.s[i]);
|
}
|
}
|
|
/** @brief SIMD state cleanup hook

A no-op in this implementation: the body is intentionally empty.
NOTE(review): presumably provided so that callers can invoke it unconditionally
regardless of the backend in use - confirm against the other implementations. */
inline void v_cleanup()
{
}
|
|
//! @}
|
|
//! @name Check SIMD support
|
//! @{
|
//! @brief Check CPU capability of SIMD operation
//! @return always false in this implementation
static inline bool hasSIMD128()
{
    const bool simd128Available = false;
    return simd128Available;
}
|
|
//! @}
|
|
#ifndef CV_DOXYGEN
|
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
|
#endif
|
}
|
|
#endif
|