transform.hpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417
  1. /*M///////////////////////////////////////////////////////////////////////////////////////
  2. //
  3. // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
  4. //
  5. // By downloading, copying, installing or using the software you agree to this license.
  6. // If you do not agree to this license, do not download, install,
  7. // copy or use the software.
  8. //
  9. //
  10. // License Agreement
  11. // For Open Source Computer Vision Library
  12. //
  13. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14. // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15. // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
  16. // Third party copyrights are property of their respective owners.
  17. //
  18. // Redistribution and use in source and binary forms, with or without modification,
  19. // are permitted provided that the following conditions are met:
  20. //
  21. // * Redistribution's of source code must retain the above copyright notice,
  22. // this list of conditions and the following disclaimer.
  23. //
  24. // * Redistribution's in binary form must reproduce the above copyright notice,
  25. // this list of conditions and the following disclaimer in the documentation
  26. // and/or other materials provided with the distribution.
  27. //
  28. // * The name of the copyright holders may not be used to endorse or promote products
  29. // derived from this software without specific prior written permission.
  30. //
  31. // This software is provided by the copyright holders and contributors "as is" and
  32. // any express or implied warranties, including, but not limited to, the implied
  33. // warranties of merchantability and fitness for a particular purpose are disclaimed.
  34. // In no event shall the Intel Corporation or contributors be liable for any direct,
  35. // indirect, incidental, special, exemplary, or consequential damages
  36. // (including, but not limited to, procurement of substitute goods or services;
  37. // loss of use, data, or profits; or business interruption) however caused
  38. // and on any theory of liability, whether in contract, strict liability,
  39. // or tort (including negligence or otherwise) arising in any way out of
  40. // the use of this software, even if advised of the possibility of such damage.
  41. //
  42. //M*/
  43. #pragma once
  44. #ifndef OPENCV_CUDEV_GRID_TRANSFORM_DETAIL_HPP
  45. #define OPENCV_CUDEV_GRID_TRANSFORM_DETAIL_HPP
  46. #include "../../common.hpp"
  47. #include "../../util/tuple.hpp"
  48. #include "../../util/saturate_cast.hpp"
  49. #include "../../util/vec_traits.hpp"
  50. #include "../../ptr2d/glob.hpp"
  51. #include "../../ptr2d/traits.hpp"
  52. namespace cv { namespace cudev {
  53. namespace grid_transform_detail
  54. {
  55. // OpUnroller
  56. template <int cn> struct OpUnroller;
  57. template <> struct OpUnroller<1>
  58. {
  59. template <typename T, typename D, class UnOp, class MaskPtr>
  60. __device__ __forceinline__ static void unroll(const T& src, D& dst, const UnOp& op, const MaskPtr& mask, int x_shifted, int y)
  61. {
  62. if (mask(y, x_shifted))
  63. dst.x = op(src.x);
  64. }
  65. template <typename T1, typename T2, typename D, class BinOp, class MaskPtr>
  66. __device__ __forceinline__ static void unroll(const T1& src1, const T2& src2, D& dst, const BinOp& op, const MaskPtr& mask, int x_shifted, int y)
  67. {
  68. if (mask(y, x_shifted))
  69. dst.x = op(src1.x, src2.x);
  70. }
  71. };
  72. template <> struct OpUnroller<2>
  73. {
  74. template <typename T, typename D, class UnOp, class MaskPtr>
  75. __device__ __forceinline__ static void unroll(const T& src, D& dst, const UnOp& op, const MaskPtr& mask, int x_shifted, int y)
  76. {
  77. if (mask(y, x_shifted))
  78. dst.x = op(src.x);
  79. if (mask(y, x_shifted + 1))
  80. dst.y = op(src.y);
  81. }
  82. template <typename T1, typename T2, typename D, class BinOp, class MaskPtr>
  83. __device__ __forceinline__ static void unroll(const T1& src1, const T2& src2, D& dst, const BinOp& op, const MaskPtr& mask, int x_shifted, int y)
  84. {
  85. if (mask(y, x_shifted))
  86. dst.x = op(src1.x, src2.x);
  87. if (mask(y, x_shifted + 1))
  88. dst.y = op(src1.y, src2.y);
  89. }
  90. };
  91. template <> struct OpUnroller<3>
  92. {
  93. template <typename T, typename D, class UnOp, class MaskPtr>
  94. __device__ __forceinline__ static void unroll(const T& src, D& dst, const UnOp& op, const MaskPtr& mask, int x_shifted, int y)
  95. {
  96. if (mask(y, x_shifted))
  97. dst.x = op(src.x);
  98. if (mask(y, x_shifted + 1))
  99. dst.y = op(src.y);
  100. if (mask(y, x_shifted + 2))
  101. dst.z = op(src.z);
  102. }
  103. template <typename T1, typename T2, typename D, class BinOp, class MaskPtr>
  104. __device__ __forceinline__ static void unroll(const T1& src1, const T2& src2, D& dst, const BinOp& op, const MaskPtr& mask, int x_shifted, int y)
  105. {
  106. if (mask(y, x_shifted))
  107. dst.x = op(src1.x, src2.x);
  108. if (mask(y, x_shifted + 1))
  109. dst.y = op(src1.y, src2.y);
  110. if (mask(y, x_shifted + 2))
  111. dst.z = op(src1.z, src2.z);
  112. }
  113. };
  114. template <> struct OpUnroller<4>
  115. {
  116. template <typename T, typename D, class UnOp, class MaskPtr>
  117. __device__ __forceinline__ static void unroll(const T& src, D& dst, const UnOp& op, const MaskPtr& mask, int x_shifted, int y)
  118. {
  119. if (mask(y, x_shifted))
  120. dst.x = op(src.x);
  121. if (mask(y, x_shifted + 1))
  122. dst.y = op(src.y);
  123. if (mask(y, x_shifted + 2))
  124. dst.z = op(src.z);
  125. if (mask(y, x_shifted + 3))
  126. dst.w = op(src.w);
  127. }
  128. template <typename T1, typename T2, typename D, class BinOp, class MaskPtr>
  129. __device__ __forceinline__ static void unroll(const T1& src1, const T2& src2, D& dst, const BinOp& op, const MaskPtr& mask, int x_shifted, int y)
  130. {
  131. if (mask(y, x_shifted))
  132. dst.x = op(src1.x, src2.x);
  133. if (mask(y, x_shifted + 1))
  134. dst.y = op(src1.y, src2.y);
  135. if (mask(y, x_shifted + 2))
  136. dst.z = op(src1.z, src2.z);
  137. if (mask(y, x_shifted + 3))
  138. dst.w = op(src1.w, src2.w);
  139. }
  140. };
  141. // transformSimple
  142. template <class SrcPtr, typename DstType, class UnOp, class MaskPtr>
  143. __global__ void transformSimple(const SrcPtr src, GlobPtr<DstType> dst, const UnOp op, const MaskPtr mask, const int rows, const int cols)
  144. {
  145. const int x = blockIdx.x * blockDim.x + threadIdx.x;
  146. const int y = blockIdx.y * blockDim.y + threadIdx.y;
  147. if (x >= cols || y >= rows || !mask(y, x))
  148. return;
  149. dst(y, x) = saturate_cast<DstType>(op(src(y, x)));
  150. }
  151. template <class SrcPtr1, class SrcPtr2, typename DstType, class BinOp, class MaskPtr>
  152. __global__ void transformSimple(const SrcPtr1 src1, const SrcPtr2 src2, GlobPtr<DstType> dst, const BinOp op, const MaskPtr mask, const int rows, const int cols)
  153. {
  154. const int x = blockIdx.x * blockDim.x + threadIdx.x;
  155. const int y = blockIdx.y * blockDim.y + threadIdx.y;
  156. if (x >= cols || y >= rows || !mask(y, x))
  157. return;
  158. dst(y, x) = saturate_cast<DstType>(op(src1(y, x), src2(y, x)));
  159. }
// transformSmart (unary): vectorized kernel for single-channel images.
// Each thread processes SHIFT consecutive pixels of a row with one vector
// load and one vector store (MakeVec<_, SHIFT> types); a row tail narrower
// than SHIFT falls back to a scalar loop.
// Precondition (enforced by TransformDispatcher<true>): src_/dst_ data
// pointers and steps are aligned to SHIFT * sizeof(element).
template <int SHIFT, typename SrcType, typename DstType, class UnOp, class MaskPtr>
__global__ void transformSmart(const GlobPtr<SrcType> src_, GlobPtr<DstType> dst_, const UnOp op, const MaskPtr mask, const int rows, const int cols)
{
typedef typename MakeVec<SrcType, SHIFT>::type read_type;
typedef typename MakeVec<DstType, SHIFT>::type write_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
// First of the SHIFT pixels owned by this thread.
const int x_shifted = x * SHIFT;
if (y < rows)
{
const SrcType* src = src_.row(y);
DstType* dst = dst_.row(y);
if (x_shifted + SHIFT - 1 < cols)
{
// Whole chunk in range: one vector read, per-lane masked apply into the
// vector-typed destination slot (lanes with a false mask keep old data).
const read_type src_n_el = ((const read_type*)src)[x];
OpUnroller<SHIFT>::unroll(src_n_el, ((write_type*)dst)[x], op, mask, x_shifted, y);
}
else
{
// Row tail: scalar loop over the remaining in-range pixels.
// NOTE(review): unlike transformSimple, no saturate_cast is applied on
// this path (nor in OpUnroller) -- op is presumably expected to yield a
// value directly assignable to DstType; confirm before relying on
// saturation semantics here.
for (int real_x = x_shifted; real_x < cols; ++real_x)
{
if (mask(y, real_x))
dst[real_x] = op(src[real_x]);
}
}
}
}
// transformSmart (binary): vectorized kernel combining two single-channel
// sources. Each thread processes SHIFT consecutive pixels via vector
// loads/stores; a row tail narrower than SHIFT uses a scalar loop.
// Precondition (enforced by TransformDispatcher<true>): all three buffers'
// data pointers and steps are aligned to SHIFT * sizeof(element).
template <int SHIFT, typename SrcType1, typename SrcType2, typename DstType, class BinOp, class MaskPtr>
__global__ void transformSmart(const GlobPtr<SrcType1> src1_, const GlobPtr<SrcType2> src2_, GlobPtr<DstType> dst_, const BinOp op, const MaskPtr mask, const int rows, const int cols)
{
typedef typename MakeVec<SrcType1, SHIFT>::type read_type1;
typedef typename MakeVec<SrcType2, SHIFT>::type read_type2;
typedef typename MakeVec<DstType, SHIFT>::type write_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
// First of the SHIFT pixels owned by this thread.
const int x_shifted = x * SHIFT;
if (y < rows)
{
const SrcType1* src1 = src1_.row(y);
const SrcType2* src2 = src2_.row(y);
DstType* dst = dst_.row(y);
if (x_shifted + SHIFT - 1 < cols)
{
// Whole chunk in range: two vector reads, per-lane masked combine into
// the vector-typed destination slot.
const read_type1 src1_n_el = ((const read_type1*)src1)[x];
const read_type2 src2_n_el = ((const read_type2*)src2)[x];
OpUnroller<SHIFT>::unroll(src1_n_el, src2_n_el, ((write_type*)dst)[x], op, mask, x_shifted, y);
}
else
{
// Row tail: scalar loop. NOTE(review): no saturate_cast on this path,
// unlike transformSimple -- confirm op's result type is assignable to
// DstType without narrowing surprises.
for (int real_x = x_shifted; real_x < cols; ++real_x)
{
if (mask(y, real_x))
dst[real_x] = op(src1[real_x], src2[real_x]);
}
}
}
}
  218. // TransformDispatcher
  219. template <bool UseSmart, class Policy> struct TransformDispatcher;
  220. template <class Policy> struct TransformDispatcher<false, Policy>
  221. {
  222. template <class SrcPtr, typename DstType, class UnOp, class MaskPtr>
  223. __host__ static void call(const SrcPtr& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
  224. {
  225. const dim3 block(Policy::block_size_x, Policy::block_size_y);
  226. const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
  227. transformSimple<<<grid, block, 0, stream>>>(src, dst, op, mask, rows, cols);
  228. CV_CUDEV_SAFE_CALL( cudaGetLastError() );
  229. if (stream == 0)
  230. CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
  231. }
  232. template <class SrcPtr1, class SrcPtr2, typename DstType, class BinOp, class MaskPtr>
  233. __host__ static void call(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
  234. {
  235. const dim3 block(Policy::block_size_x, Policy::block_size_y);
  236. const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
  237. transformSimple<<<grid, block, 0, stream>>>(src1, src2, dst, op, mask, rows, cols);
  238. CV_CUDEV_SAFE_CALL( cudaGetLastError() );
  239. if (stream == 0)
  240. CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
  241. }
  242. };
  243. template <class Policy> struct TransformDispatcher<true, Policy>
  244. {
  245. template <typename T>
  246. __host__ static bool isAligned(const T* ptr, size_t size)
  247. {
  248. return reinterpret_cast<size_t>(ptr) % size == 0;
  249. }
  250. __host__ static bool isAligned(size_t step, size_t size)
  251. {
  252. return step % size == 0;
  253. }
  254. template <typename SrcType, typename DstType, class UnOp, class MaskPtr>
  255. __host__ static void call(const GlobPtr<SrcType>& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
  256. {
  257. if (Policy::shift == 1 ||
  258. !isAligned(src.data, Policy::shift * sizeof(SrcType)) || !isAligned(src.step, Policy::shift * sizeof(SrcType)) ||
  259. !isAligned(dst.data, Policy::shift * sizeof(DstType)) || !isAligned(dst.step, Policy::shift * sizeof(DstType)))
  260. {
  261. TransformDispatcher<false, Policy>::call(src, dst, op, mask, rows, cols, stream);
  262. return;
  263. }
  264. const dim3 block(Policy::block_size_x, Policy::block_size_y);
  265. const dim3 grid(divUp(cols, block.x * Policy::shift), divUp(rows, block.y));
  266. transformSmart<Policy::shift><<<grid, block, 0, stream>>>(src, dst, op, mask, rows, cols);
  267. CV_CUDEV_SAFE_CALL( cudaGetLastError() );
  268. if (stream == 0)
  269. CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
  270. }
  271. template <typename SrcType1, typename SrcType2, typename DstType, class BinOp, class MaskPtr>
  272. __host__ static void call(const GlobPtr<SrcType1>& src1, const GlobPtr<SrcType2>& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
  273. {
  274. if (Policy::shift == 1 ||
  275. !isAligned(src1.data, Policy::shift * sizeof(SrcType1)) || !isAligned(src1.step, Policy::shift * sizeof(SrcType1)) ||
  276. !isAligned(src2.data, Policy::shift * sizeof(SrcType2)) || !isAligned(src2.step, Policy::shift * sizeof(SrcType2)) ||
  277. !isAligned(dst.data, Policy::shift * sizeof(DstType)) || !isAligned(dst.step, Policy::shift * sizeof(DstType)))
  278. {
  279. TransformDispatcher<false, Policy>::call(src1, src2, dst, op, mask, rows, cols, stream);
  280. return;
  281. }
  282. const dim3 block(Policy::block_size_x, Policy::block_size_y);
  283. const dim3 grid(divUp(cols, block.x * Policy::shift), divUp(rows, block.y));
  284. transformSmart<Policy::shift><<<grid, block, 0, stream>>>(src1, src2, dst, op, mask, rows, cols);
  285. CV_CUDEV_SAFE_CALL( cudaGetLastError() );
  286. if (stream == 0)
  287. CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
  288. }
  289. };
  290. template <class Policy, class SrcPtr, typename DstType, class UnOp, class MaskPtr>
  291. __host__ void transform_unary(const SrcPtr& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
  292. {
  293. TransformDispatcher<false, Policy>::call(src, dst, op, mask, rows, cols, stream);
  294. }
  295. template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp, class MaskPtr>
  296. __host__ void transform_binary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
  297. {
  298. TransformDispatcher<false, Policy>::call(src1, src2, dst, op, mask, rows, cols, stream);
  299. }
  300. template <class Policy, typename SrcType, typename DstType, class UnOp, class MaskPtr>
  301. __host__ void transform_unary(const GlobPtr<SrcType>& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
  302. {
  303. TransformDispatcher<VecTraits<SrcType>::cn == 1 && VecTraits<DstType>::cn == 1 && Policy::shift != 1, Policy>::call(src, dst, op, mask, rows, cols, stream);
  304. }
  305. template <class Policy, typename SrcType1, typename SrcType2, typename DstType, class BinOp, class MaskPtr>
  306. __host__ void transform_binary(const GlobPtr<SrcType1>& src1, const GlobPtr<SrcType2>& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
  307. {
  308. TransformDispatcher<VecTraits<SrcType1>::cn == 1 && VecTraits<SrcType2>::cn == 1 && VecTraits<DstType>::cn == 1 && Policy::shift != 1, Policy>::call(src1, src2, dst, op, mask, rows, cols, stream);
  309. }
  310. // transform_tuple
  311. template <int count> struct Unroll
  312. {
  313. template <class SrcVal, class DstPtrTuple, class OpTuple>
  314. __device__ static void transform(const SrcVal& srcVal, DstPtrTuple& dst, const OpTuple& op, int y, int x)
  315. {
  316. typedef typename tuple_element<count - 1, DstPtrTuple>::type dst_ptr_type;
  317. typedef typename PtrTraits<dst_ptr_type>::value_type dst_type;
  318. get<count - 1>(dst)(y, x) = saturate_cast<dst_type>(get<count - 1>(op)(srcVal));
  319. Unroll<count - 1>::transform(srcVal, dst, op, y, x);
  320. }
  321. };
  322. template <> struct Unroll<0>
  323. {
  324. template <class SrcVal, class DstPtrTuple, class OpTuple>
  325. __device__ __forceinline__ static void transform(const SrcVal&, DstPtrTuple&, const OpTuple&, int, int)
  326. {
  327. }
  328. };
  329. template <class SrcPtr, class DstPtrTuple, class OpTuple, class MaskPtr>
  330. __global__ void transform_tuple(const SrcPtr src, DstPtrTuple dst, const OpTuple op, const MaskPtr mask, const int rows, const int cols)
  331. {
  332. const int x = blockIdx.x * blockDim.x + threadIdx.x;
  333. const int y = blockIdx.y * blockDim.y + threadIdx.y;
  334. if (x >= cols || y >= rows || !mask(y, x))
  335. return;
  336. typename PtrTraits<SrcPtr>::value_type srcVal = src(y, x);
  337. Unroll<tuple_size<DstPtrTuple>::value>::transform(srcVal, dst, op, y, x);
  338. }
  339. template <class Policy, class SrcPtrTuple, class DstPtrTuple, class OpTuple, class MaskPtr>
  340. __host__ void transform_tuple(const SrcPtrTuple& src, const DstPtrTuple& dst, const OpTuple& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
  341. {
  342. const dim3 block(Policy::block_size_x, Policy::block_size_y);
  343. const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
  344. transform_tuple<<<grid, block, 0, stream>>>(src, dst, op, mask, rows, cols);
  345. CV_CUDEV_SAFE_CALL( cudaGetLastError() );
  346. if (stream == 0)
  347. CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
  348. }
  349. }
  350. }}
  351. #endif