// split_merge.hpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#pragma once

#ifndef OPENCV_CUDEV_GRID_SPLIT_MERGE_DETAIL_HPP
#define OPENCV_CUDEV_GRID_SPLIT_MERGE_DETAIL_HPP

#include "../../common.hpp"
#include "../../util/saturate_cast.hpp"
#include "../../util/tuple.hpp"
#include "../../util/vec_traits.hpp"
#include "../../ptr2d/glob.hpp"
#include "../../ptr2d/traits.hpp"
namespace cv { namespace cudev {

namespace grid_split_merge_detail
{
    // merge
  56. template <class Src1Ptr, class Src2Ptr, typename DstType, class MaskPtr>
  57. __global__ void mergeC2(const Src1Ptr src1, const Src2Ptr src2, GlobPtr<DstType> dst, const MaskPtr mask, const int rows, const int cols)
  58. {
  59. typedef typename VecTraits<DstType>::elem_type dst_elem_type;
  60. const int x = blockIdx.x * blockDim.x + threadIdx.x;
  61. const int y = blockIdx.y * blockDim.y + threadIdx.y;
  62. if (x >= cols || y >= rows || !mask(y, x))
  63. return;
  64. dst(y, x) = VecTraits<DstType>::make(
  65. saturate_cast<dst_elem_type>(src1(y, x)),
  66. saturate_cast<dst_elem_type>(src2(y, x))
  67. );
  68. }
  69. template <class Policy, class Src1Ptr, class Src2Ptr, typename DstType, class MaskPtr>
  70. __host__ void mergeC2(const Src1Ptr& src1, const Src2Ptr& src2, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
  71. {
  72. const dim3 block(Policy::block_size_x, Policy::block_size_y);
  73. const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
  74. mergeC2<<<grid, block, 0, stream>>>(src1, src2, dst, mask, rows, cols);
  75. CV_CUDEV_SAFE_CALL( cudaGetLastError() );
  76. if (stream == 0)
  77. CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize());
  78. }
  79. template <class Src1Ptr, class Src2Ptr, class Src3Ptr, typename DstType, class MaskPtr>
  80. __global__ void mergeC3(const Src1Ptr src1, const Src2Ptr src2, const Src3Ptr src3, GlobPtr<DstType> dst, const MaskPtr mask, const int rows, const int cols)
  81. {
  82. typedef typename VecTraits<DstType>::elem_type dst_elem_type;
  83. const int x = blockIdx.x * blockDim.x + threadIdx.x;
  84. const int y = blockIdx.y * blockDim.y + threadIdx.y;
  85. if (x >= cols || y >= rows || !mask(y, x))
  86. return;
  87. dst(y, x) = VecTraits<DstType>::make(
  88. saturate_cast<dst_elem_type>(src1(y, x)),
  89. saturate_cast<dst_elem_type>(src2(y, x)),
  90. saturate_cast<dst_elem_type>(src3(y, x))
  91. );
  92. }
  93. template <class Policy, class Src1Ptr, class Src2Ptr, class Src3Ptr, typename DstType, class MaskPtr>
  94. __host__ void mergeC3(const Src1Ptr& src1, const Src2Ptr& src2, const Src3Ptr& src3, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
  95. {
  96. const dim3 block(Policy::block_size_x, Policy::block_size_y);
  97. const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
  98. mergeC3<<<grid, block, 0, stream>>>(src1, src2, src3, dst, mask, rows, cols);
  99. CV_CUDEV_SAFE_CALL( cudaGetLastError() );
  100. if (stream == 0)
  101. CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize());
  102. }
  103. template <class Src1Ptr, class Src2Ptr, class Src3Ptr, class Src4Ptr, typename DstType, class MaskPtr>
  104. __global__ void mergeC4(const Src1Ptr src1, const Src2Ptr src2, const Src3Ptr src3, const Src4Ptr src4, GlobPtr<DstType> dst, const MaskPtr mask, const int rows, const int cols)
  105. {
  106. typedef typename VecTraits<DstType>::elem_type dst_elem_type;
  107. const int x = blockIdx.x * blockDim.x + threadIdx.x;
  108. const int y = blockIdx.y * blockDim.y + threadIdx.y;
  109. if (x >= cols || y >= rows || !mask(y, x))
  110. return;
  111. dst(y, x) = VecTraits<DstType>::make(
  112. saturate_cast<dst_elem_type>(src1(y, x)),
  113. saturate_cast<dst_elem_type>(src2(y, x)),
  114. saturate_cast<dst_elem_type>(src3(y, x)),
  115. saturate_cast<dst_elem_type>(src4(y, x))
  116. );
  117. }
  118. template <class Policy, class Src1Ptr, class Src2Ptr, class Src3Ptr, class Src4Ptr, typename DstType, class MaskPtr>
  119. __host__ void mergeC4(const Src1Ptr& src1, const Src2Ptr& src2, const Src3Ptr& src3, const Src4Ptr& src4, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
  120. {
  121. const dim3 block(Policy::block_size_x, Policy::block_size_y);
  122. const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
  123. mergeC4<<<grid, block, 0, stream>>>(src1, src2, src3, src4, dst, mask, rows, cols);
  124. CV_CUDEV_SAFE_CALL( cudaGetLastError() );
  125. if (stream == 0)
  126. CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize());
  127. }
  128. template <int cn, class Policy> struct MergeImpl;
  129. template <class Policy> struct MergeImpl<2, Policy>
  130. {
  131. template <class SrcPtrTuple, typename DstType, class MaskPtr>
  132. __host__ static void merge(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
  133. {
  134. mergeC2<Policy>(get<0>(src), get<1>(src), dst, mask, rows, cols, stream);
  135. }
  136. };
  137. template <class Policy> struct MergeImpl<3, Policy>
  138. {
  139. template <class SrcPtrTuple, typename DstType, class MaskPtr>
  140. __host__ static void merge(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
  141. {
  142. mergeC3<Policy>(get<0>(src), get<1>(src), get<2>(src), dst, mask, rows, cols, stream);
  143. }
  144. };
  145. template <class Policy> struct MergeImpl<4, Policy>
  146. {
  147. template <class SrcPtrTuple, typename DstType, class MaskPtr>
  148. __host__ static void merge(const SrcPtrTuple& src, const GlobPtr<DstType>& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
  149. {
  150. mergeC4<Policy>(get<0>(src), get<1>(src), get<2>(src), get<3>(src), dst, mask, rows, cols, stream);
  151. }
  152. };
    // split
  154. template <class SrcPtr, typename DstType, class MaskPtr>
  155. __global__ void split(const SrcPtr src, GlobPtr<DstType> dst1, GlobPtr<DstType> dst2, const MaskPtr mask, const int rows, const int cols)
  156. {
  157. typedef typename PtrTraits<SrcPtr>::value_type src_type;
  158. const int x = blockIdx.x * blockDim.x + threadIdx.x;
  159. const int y = blockIdx.y * blockDim.y + threadIdx.y;
  160. if (x >= cols || y >= rows || !mask(y, x))
  161. return;
  162. const src_type src_value = src(y, x);
  163. dst1(y, x) = src_value.x;
  164. dst2(y, x) = src_value.y;
  165. }
  166. template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
  167. __host__ void split(const SrcPtr& src, const GlobPtr<DstType>& dst1, const GlobPtr<DstType>& dst2, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
  168. {
  169. const dim3 block(Policy::block_size_x, Policy::block_size_y);
  170. const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
  171. split<<<grid, block, 0, stream>>>(src, dst1, dst2, mask, rows, cols);
  172. CV_CUDEV_SAFE_CALL( cudaGetLastError() );
  173. if (stream == 0)
  174. CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize());
  175. }
  176. template <class SrcPtr, typename DstType, class MaskPtr>
  177. __global__ void split(const SrcPtr src, GlobPtr<DstType> dst1, GlobPtr<DstType> dst2, GlobPtr<DstType> dst3, const MaskPtr mask, const int rows, const int cols)
  178. {
  179. typedef typename PtrTraits<SrcPtr>::value_type src_type;
  180. const int x = blockIdx.x * blockDim.x + threadIdx.x;
  181. const int y = blockIdx.y * blockDim.y + threadIdx.y;
  182. if (x >= cols || y >= rows || !mask(y, x))
  183. return;
  184. const src_type src_value = src(y, x);
  185. dst1(y, x) = src_value.x;
  186. dst2(y, x) = src_value.y;
  187. dst3(y, x) = src_value.z;
  188. }
  189. template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
  190. __host__ void split(const SrcPtr& src, const GlobPtr<DstType>& dst1, const GlobPtr<DstType>& dst2, const GlobPtr<DstType>& dst3, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
  191. {
  192. const dim3 block(Policy::block_size_x, Policy::block_size_y);
  193. const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
  194. split<<<grid, block, 0, stream>>>(src, dst1, dst2, dst3, mask, rows, cols);
  195. CV_CUDEV_SAFE_CALL( cudaGetLastError() );
  196. if (stream == 0)
  197. CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize());
  198. }
  199. template <class SrcPtr, typename DstType, class MaskPtr>
  200. __global__ void split(const SrcPtr src, GlobPtr<DstType> dst1, GlobPtr<DstType> dst2, GlobPtr<DstType> dst3, GlobPtr<DstType> dst4, const MaskPtr mask, const int rows, const int cols)
  201. {
  202. typedef typename PtrTraits<SrcPtr>::value_type src_type;
  203. const int x = blockIdx.x * blockDim.x + threadIdx.x;
  204. const int y = blockIdx.y * blockDim.y + threadIdx.y;
  205. if (x >= cols || y >= rows || !mask(y, x))
  206. return;
  207. const src_type src_value = src(y, x);
  208. dst1(y, x) = src_value.x;
  209. dst2(y, x) = src_value.y;
  210. dst3(y, x) = src_value.z;
  211. dst4(y, x) = src_value.w;
  212. }
  213. template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
  214. __host__ void split(const SrcPtr& src, const GlobPtr<DstType>& dst1, const GlobPtr<DstType>& dst2, const GlobPtr<DstType>& dst3, const GlobPtr<DstType>& dst4, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
  215. {
  216. const dim3 block(Policy::block_size_x, Policy::block_size_y);
  217. const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
  218. split<<<grid, block, 0, stream>>>(src, dst1, dst2, dst3, dst4, mask, rows, cols);
  219. CV_CUDEV_SAFE_CALL( cudaGetLastError() );
  220. if (stream == 0)
  221. CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize());
  222. }
} // namespace grid_split_merge_detail

}} // namespace cv::cudev

#endif