// opencv2/core/hal/intrin.hpp (page-scrape artifacts removed: size banner and line-number index)
  1. /*M///////////////////////////////////////////////////////////////////////////////////////
  2. //
  3. // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
  4. //
  5. // By downloading, copying, installing or using the software you agree to this license.
  6. // If you do not agree to this license, do not download, install,
  7. // copy or use the software.
  8. //
  9. //
  10. // License Agreement
  11. // For Open Source Computer Vision Library
  12. //
  13. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14. // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15. // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
  16. // Copyright (C) 2015, Itseez Inc., all rights reserved.
  17. // Third party copyrights are property of their respective owners.
  18. //
  19. // Redistribution and use in source and binary forms, with or without modification,
  20. // are permitted provided that the following conditions are met:
  21. //
  22. // * Redistribution's of source code must retain the above copyright notice,
  23. // this list of conditions and the following disclaimer.
  24. //
  25. // * Redistribution's in binary form must reproduce the above copyright notice,
  26. // this list of conditions and the following disclaimer in the documentation
  27. // and/or other materials provided with the distribution.
  28. //
  29. // * The name of the copyright holders may not be used to endorse or promote products
  30. // derived from this software without specific prior written permission.
  31. //
  32. // This software is provided by the copyright holders and contributors "as is" and
  33. // any express or implied warranties, including, but not limited to, the implied
  34. // warranties of merchantability and fitness for a particular purpose are disclaimed.
  35. // In no event shall the Intel Corporation or contributors be liable for any direct,
  36. // indirect, incidental, special, exemplary, or consequential damages
  37. // (including, but not limited to, procurement of substitute goods or services;
  38. // loss of use, data, or profits; or business interruption) however caused
  39. // and on any theory of liability, whether in contract, strict liability,
  40. // or tort (including negligence or otherwise) arising in any way out of
  41. // the use of this software, even if advised of the possibility of such damage.
  42. //
  43. //M*/
#ifndef OPENCV_HAL_INTRIN_HPP
#define OPENCV_HAL_INTRIN_HPP
#include <cmath>
#include <float.h>
#include <stdlib.h>
#include "opencv2/core/cvdef.h"
// Tiny expression helpers: add, bitwise-and, identity, and first-of-two.
// They are plain macros so they can be passed as "operation" arguments to
// other macros in the backend headers included below.
#define OPENCV_HAL_ADD(a, b) ((a) + (b))
#define OPENCV_HAL_AND(a, b) ((a) & (b))
#define OPENCV_HAL_NOP(a) (a)
#define OPENCV_HAL_1ST(a, b) (a)
  54. namespace {
  55. inline unsigned int trailingZeros32(unsigned int value) {
  56. #if defined(_MSC_VER)
  57. #if (_MSC_VER < 1700) || defined(_M_ARM) || defined(_M_ARM64)
  58. unsigned long index = 0;
  59. _BitScanForward(&index, value);
  60. return (unsigned int)index;
  61. #elif defined(__clang__)
  62. // clang-cl doesn't export _tzcnt_u32 for non BMI systems
  63. return value ? __builtin_ctz(value) : 32;
  64. #else
  65. return _tzcnt_u32(value);
  66. #endif
  67. #elif defined(__GNUC__) || defined(__GNUG__)
  68. return __builtin_ctz(value);
  69. #elif defined(__ICC) || defined(__INTEL_COMPILER)
  70. return _bit_scan_forward(value);
  71. #elif defined(__clang__)
  72. return llvm.cttz.i32(value, true);
  73. #else
  74. static const int MultiplyDeBruijnBitPosition[32] = {
  75. 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
  76. 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 };
  77. return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27];
  78. #endif
  79. }
  80. }
// unlike HAL API, which is in cv::hal,
// we put intrinsics into cv namespace to make its
// access from within opencv code more accessible
namespace cv {
namespace hal {
// Alignment/caching hint accepted by vector store operations.
enum StoreMode
{
    STORE_UNALIGNED = 0,
    STORE_ALIGNED = 1,
    STORE_ALIGNED_NOCACHE = 2
};
}
// TODO FIXIT: Don't use "God" traits. Split on separate cases.
// Primary template is intentionally empty: only the specializations generated
// by the CV_INTRIN_DEF_TYPE_TRAITS* macros below are usable, so instantiating
// V_TypeTraits for any other type fails at compile time.
template<typename _Tp> struct V_TypeTraits
{
};
// Generates a full V_TypeTraits<type> specialization with companion types:
//   int_type/uint_type - same-width signed/unsigned integer types
//   abs_type           - result type of absolute value
//   w_type             - double-width type (widening target)
//   q_type             - quad-width type
//   sum_type           - accumulator type for reductions
// plus union-based bit-reinterpretation helpers between `type` and int_type.
#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_) \
template<> struct V_TypeTraits<type> \
{ \
typedef type value_type; \
typedef int_type_ int_type; \
typedef abs_type_ abs_type; \
typedef uint_type_ uint_type; \
typedef w_type_ w_type; \
typedef q_type_ q_type; \
typedef sum_type_ sum_type; \
\
static inline int_type reinterpret_int(type x) \
{ \
union { type l; int_type i; } v; \
v.l = x; \
return v.i; \
} \
\
static inline type reinterpret_from_int(int_type x) \
{ \
union { type l; int_type i; } v; \
v.i = x; \
return v.l; \
} \
}
// Same as CV_INTRIN_DEF_TYPE_TRAITS but for types that have no quad-width
// counterpart (32-bit and wider), so no q_type member is declared.
#define CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(type, int_type_, uint_type_, abs_type_, w_type_, sum_type_) \
template<> struct V_TypeTraits<type> \
{ \
typedef type value_type; \
typedef int_type_ int_type; \
typedef abs_type_ abs_type; \
typedef uint_type_ uint_type; \
typedef w_type_ w_type; \
typedef sum_type_ sum_type; \
\
static inline int_type reinterpret_int(type x) \
{ \
union { type l; int_type i; } v; \
v.l = x; \
return v.i; \
} \
\
static inline type reinterpret_from_int(int_type x) \
{ \
union { type l; int_type i; } v; \
v.i = x; \
return v.l; \
} \
}
// Trait tables. Argument order:
//   (type, int_type, uint_type, abs_type, w_type[, q_type], sum_type)
// 64-bit and floating types use `void` where no wider type exists.
CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned);
CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int);
CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned);
CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(float, int, unsigned, float, double, float);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(uint64, int64, uint64, uint64, void, uint64);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int64, int64, uint64, uint64, void, int64);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double);
#ifndef CV_DOXYGEN
// Choose the per-ISA HAL namespace:
//   hal_EMULATOR_CPP  when the plain C++ emulation is forced,
//   hal_<DISPATCH>    when building a CPU-dispatched translation unit,
//   hal_baseline      otherwise.
#ifndef CV_CPU_OPTIMIZATION_HAL_NAMESPACE
#ifdef CV_FORCE_SIMD128_CPP
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_EMULATOR_CPP
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_EMULATOR_CPP {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#elif defined(CV_CPU_DISPATCH_MODE)
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE)
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#else
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#endif
#endif // CV_CPU_OPTIMIZATION_HAL_NAMESPACE
// Declare the namespace (empty here) and pull it into cv:: so intrinsics are
// reachable without qualification.
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#endif
} // namespace cv
// For documentation builds pretend no ISA is available so only the portable
// C++ reference implementation is documented.
#ifdef CV_DOXYGEN
# undef CV_AVX2
# undef CV_SSE2
# undef CV_NEON
# undef CV_VSX
# undef CV_FP16
# undef CV_MSA
#endif
// Pick exactly one 128-bit backend; fall back to the portable C++
// implementation when no ISA is enabled or the emulator is forced.
#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD) && !defined(CV_FORCE_SIMD128_CPP)
#define CV__SIMD_FORWARD 128
#include "opencv2/core/hal/intrin_forward.hpp"
#endif
#if CV_SSE2 && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_sse_em.hpp"
#include "opencv2/core/hal/intrin_sse.hpp"
#elif CV_NEON && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_neon.hpp"
#elif CV_VSX && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_vsx.hpp"
#elif CV_MSA && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_msa.hpp"
#elif CV_WASM_SIMD && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_wasm.hpp"
#else
#include "opencv2/core/hal/intrin_cpp.hpp"
#endif
// AVX2 can be used together with SSE2, so
// we define those two sets of intrinsics at once.
// Most of the intrinsics do not conflict (the proper overloaded variant is
// resolved by the argument types, e.g. v_float32x4 ~ SSE2, v_float32x8 ~ AVX2),
// but some of AVX2 intrinsics get v256_ prefix instead of v_, e.g. v256_load() vs v_load().
// Correspondingly, the wide intrinsics (which are mapped to the "widest"
// available instruction set) will get vx_ prefix
// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load())
#if CV_AVX2
#define CV__SIMD_FORWARD 256
#include "opencv2/core/hal/intrin_forward.hpp"
#include "opencv2/core/hal/intrin_avx.hpp"
#endif
// AVX512 can be used together with SSE2 and AVX2, so
// we define those sets of intrinsics at once.
// For some of AVX512 intrinsics get v512_ prefix instead of v_, e.g. v512_load() vs v_load().
// Wide intrinsics will be mapped to v512_ counterparts in this case(e.g. vx_load() => v512_load())
#if CV_AVX512_SKX
#define CV__SIMD_FORWARD 512
#include "opencv2/core/hal/intrin_forward.hpp"
#include "opencv2/core/hal/intrin_avx512.hpp"
#endif
//! @cond IGNORED
namespace cv {
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#endif
// Default every per-width capability macro to 0 so the width-selection logic
// below can test them with plain #if regardless of which backend headers (if
// any) defined them.
#ifndef CV_SIMD128
#define CV_SIMD128 0
#endif
#ifndef CV_SIMD128_CPP
#define CV_SIMD128_CPP 0
#endif
#ifndef CV_SIMD128_64F
#define CV_SIMD128_64F 0
#endif
#ifndef CV_SIMD256
#define CV_SIMD256 0
#endif
#ifndef CV_SIMD256_64F
#define CV_SIMD256_64F 0
#endif
#ifndef CV_SIMD512
#define CV_SIMD512 0
#endif
#ifndef CV_SIMD512_64F
#define CV_SIMD512_64F 0
#endif
#ifndef CV_SIMD128_FP16
#define CV_SIMD128_FP16 0
#endif
#ifndef CV_SIMD256_FP16
#define CV_SIMD256_FP16 0
#endif
#ifndef CV_SIMD512_FP16
#define CV_SIMD512_FP16 0
#endif
//==================================================================================================
// Maps a vector register type to its companion register types. The primary
// template is empty; only the CV_DEF_REG_TRAITS specializations below exist.
template<typename _Tp> struct V_RegTraits
{
};
// Companion-register table for one vector type:
//   u_reg     - unsigned counterpart of the same width
//   w_reg     - double-lane-width register (expand target)
//   q_reg     - quad-lane-width register
//   int_reg   - same-shape signed-integer register
//   round_reg - register produced by rounding (float -> int); `void` if N/A
#define CV_DEF_REG_TRAITS(prefix, _reg, lane_type, suffix, _u_reg, _w_reg, _q_reg, _int_reg, _round_reg) \
template<> struct V_RegTraits<_reg> \
{ \
typedef _reg reg; \
typedef _u_reg u_reg; \
typedef _w_reg w_reg; \
typedef _q_reg q_reg; \
typedef _int_reg int_reg; \
typedef _round_reg round_reg; \
}
// Register-trait tables for each available width (128/256/512-bit).
// `void` marks a companion type that does not exist for that register.
#if CV_SIMD128 || CV_SIMD128_CPP
CV_DEF_REG_TRAITS(v, v_uint8x16, uchar, u8, v_uint8x16, v_uint16x8, v_uint32x4, v_int8x16, void);
CV_DEF_REG_TRAITS(v, v_int8x16, schar, s8, v_uint8x16, v_int16x8, v_int32x4, v_int8x16, void);
CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void);
CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void);
CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void);
CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void);
// float32's w_reg (float64x2) exists only when 64-bit float support is on.
#if CV_SIMD128_64F || CV_SIMD128_CPP
CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, v_float64x2, void, v_int32x4, v_int32x4);
#else
CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, void, void, v_int32x4, v_int32x4);
#endif
CV_DEF_REG_TRAITS(v, v_uint64x2, uint64, u64, v_uint64x2, void, void, v_int64x2, void);
CV_DEF_REG_TRAITS(v, v_int64x2, int64, s64, v_uint64x2, void, void, v_int64x2, void);
#if CV_SIMD128_64F
CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4);
#endif
#endif
#if CV_SIMD256
CV_DEF_REG_TRAITS(v256, v_uint8x32, uchar, u8, v_uint8x32, v_uint16x16, v_uint32x8, v_int8x32, void);
CV_DEF_REG_TRAITS(v256, v_int8x32, schar, s8, v_uint8x32, v_int16x16, v_int32x8, v_int8x32, void);
CV_DEF_REG_TRAITS(v256, v_uint16x16, ushort, u16, v_uint16x16, v_uint32x8, v_uint64x4, v_int16x16, void);
CV_DEF_REG_TRAITS(v256, v_int16x16, short, s16, v_uint16x16, v_int32x8, v_int64x4, v_int16x16, void);
CV_DEF_REG_TRAITS(v256, v_uint32x8, unsigned, u32, v_uint32x8, v_uint64x4, void, v_int32x8, void);
CV_DEF_REG_TRAITS(v256, v_int32x8, int, s32, v_uint32x8, v_int64x4, void, v_int32x8, void);
CV_DEF_REG_TRAITS(v256, v_float32x8, float, f32, v_float32x8, v_float64x4, void, v_int32x8, v_int32x8);
CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void);
CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void);
CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8);
#endif
#if CV_SIMD512
CV_DEF_REG_TRAITS(v512, v_uint8x64, uchar, u8, v_uint8x64, v_uint16x32, v_uint32x16, v_int8x64, void);
CV_DEF_REG_TRAITS(v512, v_int8x64, schar, s8, v_uint8x64, v_int16x32, v_int32x16, v_int8x64, void);
CV_DEF_REG_TRAITS(v512, v_uint16x32, ushort, u16, v_uint16x32, v_uint32x16, v_uint64x8, v_int16x32, void);
CV_DEF_REG_TRAITS(v512, v_int16x32, short, s16, v_uint16x32, v_int32x16, v_int64x8, v_int16x32, void);
CV_DEF_REG_TRAITS(v512, v_uint32x16, unsigned, u32, v_uint32x16, v_uint64x8, void, v_int32x16, void);
CV_DEF_REG_TRAITS(v512, v_int32x16, int, s32, v_uint32x16, v_int64x8, void, v_int32x16, void);
CV_DEF_REG_TRAITS(v512, v_float32x16, float, f32, v_float32x16, v_float64x8, void, v_int32x16, v_int32x16);
CV_DEF_REG_TRAITS(v512, v_uint64x8, uint64, u64, v_uint64x8, void, void, v_int64x8, void);
CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void);
CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16);
#endif
  316. //! @endcond
// Width selection: prefer the widest available register unless the build pins
// a width via CV__SIMD_FORCE_WIDTH. Each branch defines the v_<type> aliases
// and the VXPREFIX used by the vx_* wrappers below.
#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
#define CV__SIMD_NAMESPACE simd512
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD 1
#define CV_SIMD_64F CV_SIMD512_64F
#define CV_SIMD_FP16 CV_SIMD512_FP16
#define CV_SIMD_WIDTH 64
//! @addtogroup core_hal_intrin
//! @{
//! @brief Maximum available vector register capacity 8-bit unsigned integer values
typedef v_uint8x64 v_uint8;
//! @brief Maximum available vector register capacity 8-bit signed integer values
typedef v_int8x64 v_int8;
//! @brief Maximum available vector register capacity 16-bit unsigned integer values
typedef v_uint16x32 v_uint16;
//! @brief Maximum available vector register capacity 16-bit signed integer values
typedef v_int16x32 v_int16;
//! @brief Maximum available vector register capacity 32-bit unsigned integer values
typedef v_uint32x16 v_uint32;
//! @brief Maximum available vector register capacity 32-bit signed integer values
typedef v_int32x16 v_int32;
//! @brief Maximum available vector register capacity 64-bit unsigned integer values
typedef v_uint64x8 v_uint64;
//! @brief Maximum available vector register capacity 64-bit signed integer values
typedef v_int64x8 v_int64;
//! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
typedef v_float32x16 v_float32;
#if CV_SIMD512_64F
//! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
typedef v_float64x8 v_float64;
#endif
//! @}
#define VXPREFIX(func) v512##func
} // namespace
using namespace CV__SIMD_NAMESPACE;
#elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256)
#define CV__SIMD_NAMESPACE simd256
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD 1
#define CV_SIMD_64F CV_SIMD256_64F
#define CV_SIMD_FP16 CV_SIMD256_FP16
#define CV_SIMD_WIDTH 32
//! @addtogroup core_hal_intrin
//! @{
//! @brief Maximum available vector register capacity 8-bit unsigned integer values
typedef v_uint8x32 v_uint8;
//! @brief Maximum available vector register capacity 8-bit signed integer values
typedef v_int8x32 v_int8;
//! @brief Maximum available vector register capacity 16-bit unsigned integer values
typedef v_uint16x16 v_uint16;
//! @brief Maximum available vector register capacity 16-bit signed integer values
typedef v_int16x16 v_int16;
//! @brief Maximum available vector register capacity 32-bit unsigned integer values
typedef v_uint32x8 v_uint32;
//! @brief Maximum available vector register capacity 32-bit signed integer values
typedef v_int32x8 v_int32;
//! @brief Maximum available vector register capacity 64-bit unsigned integer values
typedef v_uint64x4 v_uint64;
//! @brief Maximum available vector register capacity 64-bit signed integer values
typedef v_int64x4 v_int64;
//! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
typedef v_float32x8 v_float32;
#if CV_SIMD256_64F
//! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
typedef v_float64x4 v_float64;
#endif
//! @}
#define VXPREFIX(func) v256##func
} // namespace
using namespace CV__SIMD_NAMESPACE;
#elif (CV_SIMD128 || CV_SIMD128_CPP) && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128)
// NOTE(review): CV_SIMD128_CPP is unconditionally #defined above (possibly to
// 0), so `defined CV_SIMD128_CPP` is always true here and the namespace is
// always simd128_cpp — confirm whether testing the macro's *value* was meant.
#if defined CV_SIMD128_CPP
#define CV__SIMD_NAMESPACE simd128_cpp
#else
#define CV__SIMD_NAMESPACE simd128
#endif
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD CV_SIMD128
#define CV_SIMD_64F CV_SIMD128_64F
// NOTE(review): unlike the simd512/simd256 branches, CV_SIMD_FP16 is not
// defined here; it falls back to 0 at the end of this header — confirm intent.
#define CV_SIMD_WIDTH 16
//! @addtogroup core_hal_intrin
//! @{
//! @brief Maximum available vector register capacity 8-bit unsigned integer values
typedef v_uint8x16 v_uint8;
//! @brief Maximum available vector register capacity 8-bit signed integer values
typedef v_int8x16 v_int8;
//! @brief Maximum available vector register capacity 16-bit unsigned integer values
typedef v_uint16x8 v_uint16;
//! @brief Maximum available vector register capacity 16-bit signed integer values
typedef v_int16x8 v_int16;
//! @brief Maximum available vector register capacity 32-bit unsigned integer values
typedef v_uint32x4 v_uint32;
//! @brief Maximum available vector register capacity 32-bit signed integer values
typedef v_int32x4 v_int32;
//! @brief Maximum available vector register capacity 64-bit unsigned integer values
typedef v_uint64x2 v_uint64;
//! @brief Maximum available vector register capacity 64-bit signed integer values
typedef v_int64x2 v_int64;
//! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
typedef v_float32x4 v_float32;
#if CV_SIMD128_64F
//! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
typedef v_float64x2 v_float64;
#endif
//! @}
#define VXPREFIX(func) v##func
} // namespace
using namespace CV__SIMD_NAMESPACE;
#endif
// Width-agnostic vx_* wrappers: each forwards to the widest available backend
// (v_/v256_/v512_ prefix) selected by the VXPREFIX macro defined above.
namespace CV__SIMD_NAMESPACE {
//! @addtogroup core_hal_intrin
//! @{
//! @name Wide init with value
//! @{
//! @brief Create maximum available capacity vector with elements set to a specific value
inline v_uint8 vx_setall_u8(uchar v) { return VXPREFIX(_setall_u8)(v); }
inline v_int8 vx_setall_s8(schar v) { return VXPREFIX(_setall_s8)(v); }
inline v_uint16 vx_setall_u16(ushort v) { return VXPREFIX(_setall_u16)(v); }
inline v_int16 vx_setall_s16(short v) { return VXPREFIX(_setall_s16)(v); }
inline v_int32 vx_setall_s32(int v) { return VXPREFIX(_setall_s32)(v); }
inline v_uint32 vx_setall_u32(unsigned v) { return VXPREFIX(_setall_u32)(v); }
inline v_float32 vx_setall_f32(float v) { return VXPREFIX(_setall_f32)(v); }
inline v_int64 vx_setall_s64(int64 v) { return VXPREFIX(_setall_s64)(v); }
inline v_uint64 vx_setall_u64(uint64 v) { return VXPREFIX(_setall_u64)(v); }
#if CV_SIMD_64F
inline v_float64 vx_setall_f64(double v) { return VXPREFIX(_setall_f64)(v); }
#endif
//! @}
//! @name Wide init with zero
//! @{
//! @brief Create maximum available capacity vector with elements set to zero
inline v_uint8 vx_setzero_u8() { return VXPREFIX(_setzero_u8)(); }
inline v_int8 vx_setzero_s8() { return VXPREFIX(_setzero_s8)(); }
inline v_uint16 vx_setzero_u16() { return VXPREFIX(_setzero_u16)(); }
inline v_int16 vx_setzero_s16() { return VXPREFIX(_setzero_s16)(); }
inline v_int32 vx_setzero_s32() { return VXPREFIX(_setzero_s32)(); }
inline v_uint32 vx_setzero_u32() { return VXPREFIX(_setzero_u32)(); }
inline v_float32 vx_setzero_f32() { return VXPREFIX(_setzero_f32)(); }
inline v_int64 vx_setzero_s64() { return VXPREFIX(_setzero_s64)(); }
inline v_uint64 vx_setzero_u64() { return VXPREFIX(_setzero_u64)(); }
#if CV_SIMD_64F
inline v_float64 vx_setzero_f64() { return VXPREFIX(_setzero_f64)(); }
#endif
//! @}
//! @name Wide load from memory
//! @{
//! @brief Load maximum available capacity register contents from memory
inline v_uint8 vx_load(const uchar * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int8 vx_load(const schar * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint16 vx_load(const ushort * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int16 vx_load(const short * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int32 vx_load(const int * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint32 vx_load(const unsigned * ptr) { return VXPREFIX(_load)(ptr); }
inline v_float32 vx_load(const float * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int64 vx_load(const int64 * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint64 vx_load(const uint64 * ptr) { return VXPREFIX(_load)(ptr); }
#if CV_SIMD_64F
inline v_float64 vx_load(const double * ptr) { return VXPREFIX(_load)(ptr); }
#endif
//! @}
//! @name Wide load from memory(aligned)
//! @{
//! @brief Load maximum available capacity register contents from memory(aligned)
inline v_uint8 vx_load_aligned(const uchar * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int8 vx_load_aligned(const schar * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint16 vx_load_aligned(const ushort * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int16 vx_load_aligned(const short * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int32 vx_load_aligned(const int * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint32 vx_load_aligned(const unsigned * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_float32 vx_load_aligned(const float * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int64 vx_load_aligned(const int64 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint64 vx_load_aligned(const uint64 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
#if CV_SIMD_64F
inline v_float64 vx_load_aligned(const double * ptr) { return VXPREFIX(_load_aligned)(ptr); }
#endif
//! @}
//! @name Wide load lower half from memory
//! @{
//! @brief Load lower half of maximum available capacity register from memory
inline v_uint8 vx_load_low(const uchar * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int8 vx_load_low(const schar * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint16 vx_load_low(const ushort * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int16 vx_load_low(const short * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int32 vx_load_low(const int * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint32 vx_load_low(const unsigned * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_float32 vx_load_low(const float * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int64 vx_load_low(const int64 * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint64 vx_load_low(const uint64 * ptr) { return VXPREFIX(_load_low)(ptr); }
#if CV_SIMD_64F
inline v_float64 vx_load_low(const double * ptr) { return VXPREFIX(_load_low)(ptr); }
#endif
//! @}
//! @name Wide load halfs from memory
//! @{
//! @brief Load maximum available capacity register contents from two memory blocks
inline v_uint8 vx_load_halves(const uchar * ptr0, const uchar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int8 vx_load_halves(const schar * ptr0, const schar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint16 vx_load_halves(const ushort * ptr0, const ushort * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int16 vx_load_halves(const short * ptr0, const short * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int32 vx_load_halves(const int * ptr0, const int * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint32 vx_load_halves(const unsigned * ptr0, const unsigned * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_float32 vx_load_halves(const float * ptr0, const float * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int64 vx_load_halves(const int64 * ptr0, const int64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint64 vx_load_halves(const uint64 * ptr0, const uint64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
#if CV_SIMD_64F
inline v_float64 vx_load_halves(const double * ptr0, const double * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
#endif
//! @}
//! @name Wide LUT of elements
//! @{
//! @brief Load maximum available capacity register contents with array elements by provided indexes
inline v_uint8 vx_lut(const uchar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int8 vx_lut(const schar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint16 vx_lut(const ushort * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int16 vx_lut(const short* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int32 vx_lut(const int* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint32 vx_lut(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_float32 vx_lut(const float* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int64 vx_lut(const int64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint64 vx_lut(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
#if CV_SIMD_64F
inline v_float64 vx_lut(const double* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
#endif
//! @}
//! @name Wide LUT of element pairs
//! @{
//! @brief Load maximum available capacity register contents with array element pairs by provided indexes
inline v_uint8 vx_lut_pairs(const uchar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int8 vx_lut_pairs(const schar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint16 vx_lut_pairs(const ushort * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int16 vx_lut_pairs(const short* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int32 vx_lut_pairs(const int* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint32 vx_lut_pairs(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_float32 vx_lut_pairs(const float* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int64 vx_lut_pairs(const int64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint64 vx_lut_pairs(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
#if CV_SIMD_64F
inline v_float64 vx_lut_pairs(const double* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
#endif
//! @}
//! @name Wide LUT of element quads
//! @{
//! @brief Load maximum available capacity register contents with array element quads by provided indexes
inline v_uint8 vx_lut_quads(const uchar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_int8 vx_lut_quads(const schar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_uint16 vx_lut_quads(const ushort* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_int16 vx_lut_quads(const short* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_int32 vx_lut_quads(const int* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_uint32 vx_lut_quads(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_float32 vx_lut_quads(const float* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
// NOTE(review): no 64-bit/f64 quad-LUT overloads are declared here — presumably
// intentional (the backends do not provide them); confirm against intrin_*.hpp.
//! @}
//! @name Wide load with double expansion
//! @{
//! @brief Load maximum available capacity register contents from memory with double expand
inline v_uint16 vx_load_expand(const uchar * ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_int16 vx_load_expand(const schar * ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_uint32 vx_load_expand(const ushort * ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_int32 vx_load_expand(const short* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_int64 vx_load_expand(const int* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_uint64 vx_load_expand(const unsigned* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_float32 vx_load_expand(const float16_t * ptr) { return VXPREFIX(_load_expand)(ptr); }
//! @}
//! @name Wide load with quad expansion
//! @{
//! @brief Load maximum available capacity register contents from memory with quad expand
inline v_uint32 vx_load_expand_q(const uchar * ptr) { return VXPREFIX(_load_expand_q)(ptr); }
inline v_int32 vx_load_expand_q(const schar * ptr) { return VXPREFIX(_load_expand_q)(ptr); }
//! @}
/** @brief SIMD processing state cleanup call */
inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
//! @cond IGNORED
// backward compatibility
template<typename _Tp, typename _Tvec> static inline
void vx_store(_Tp* dst, const _Tvec& v) { return v_store(dst, v); }
// backward compatibility
template<typename _Tp, typename _Tvec> static inline
void vx_store_aligned(_Tp* dst, const _Tvec& v) { return v_store_aligned(dst, v); }
//! @endcond
//! @}
// VXPREFIX is an implementation detail of this header; keep it from leaking.
#undef VXPREFIX
} // namespace
//! @cond IGNORED
// Final fallbacks: if no width branch above fired, publish "no SIMD".
#ifndef CV_SIMD_64F
#define CV_SIMD_64F 0
#endif
#ifndef CV_SIMD_FP16
#define CV_SIMD_FP16 0 //!< Defined to 1 on native support of operations with float16x8_t / float16x16_t (SIMD256) types
#endif
#ifndef CV_SIMD
#define CV_SIMD 0
#endif
#include "simd_utils.impl.hpp"
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
#endif
} // cv::
//! @endcond
#endif