reduce.hpp

/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#pragma once

#ifndef OPENCV_CUDEV_GRID_REDUCE_DETAIL_HPP
#define OPENCV_CUDEV_GRID_REDUCE_DETAIL_HPP

#include "../../common.hpp"
#include "../../util/tuple.hpp"
#include "../../util/saturate_cast.hpp"
#include "../../util/atomic.hpp"
#include "../../util/vec_traits.hpp"
#include "../../util/type_traits.hpp"
#include "../../util/limits.hpp"
#include "../../block/reduce.hpp"
#include "../../functional/functional.hpp"
#include "../../ptr2d/traits.hpp"

namespace cv { namespace cudev {

namespace grid_reduce_detail
{
    // Unroll

    template <int cn> struct Unroll;

    template <> struct Unroll<1>
    {
        template <int BLOCK_SIZE, typename R>
        __device__ __forceinline__ static volatile R* smem(R* ptr)
        {
            return ptr;
        }

        template <typename R>
        __device__ __forceinline__ static R& res(R& val)
        {
            return val;
        }

        template <class Op>
        __device__ __forceinline__ static const Op& op(const Op& aop)
        {
            return aop;
        }
    };

    template <> struct Unroll<2>
    {
        template <int BLOCK_SIZE, typename R>
        __device__ __forceinline__ static tuple<volatile R*, volatile R*> smem(R* ptr)
        {
            return smem_tuple(ptr, ptr + BLOCK_SIZE);
        }

        template <typename R>
        __device__ __forceinline__ static tuple<typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&> res(R& val)
        {
            return tie(val.x, val.y);
        }

        template <class Op>
        __device__ __forceinline__ static tuple<Op, Op> op(const Op& aop)
        {
            return make_tuple(aop, aop);
        }
    };

    template <> struct Unroll<3>
    {
        template <int BLOCK_SIZE, typename R>
        __device__ __forceinline__ static tuple<volatile R*, volatile R*, volatile R*> smem(R* ptr)
        {
            return smem_tuple(ptr, ptr + BLOCK_SIZE, ptr + 2 * BLOCK_SIZE);
        }

        template <typename R>
        __device__ __forceinline__ static tuple<typename VecTraits<R>::elem_type&,
                                                typename VecTraits<R>::elem_type&,
                                                typename VecTraits<R>::elem_type&> res(R& val)
        {
            return tie(val.x, val.y, val.z);
        }

        template <class Op>
        __device__ __forceinline__ static tuple<Op, Op, Op> op(const Op& aop)
        {
            return make_tuple(aop, aop, aop);
        }
    };

    template <> struct Unroll<4>
    {
        template <int BLOCK_SIZE, typename R>
        __device__ __forceinline__ static tuple<volatile R*, volatile R*, volatile R*, volatile R*> smem(R* ptr)
        {
            return smem_tuple(ptr, ptr + BLOCK_SIZE, ptr + 2 * BLOCK_SIZE, ptr + 3 * BLOCK_SIZE);
        }

        template <typename R>
        __device__ __forceinline__ static tuple<typename VecTraits<R>::elem_type&,
                                                typename VecTraits<R>::elem_type&,
                                                typename VecTraits<R>::elem_type&,
                                                typename VecTraits<R>::elem_type&> res(R& val)
        {
            return tie(val.x, val.y, val.z, val.w);
        }

        template <class Op>
        __device__ __forceinline__ static tuple<Op, Op, Op, Op> op(const Op& aop)
        {
            return make_tuple(aop, aop, aop, aop);
        }
    };
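
    // Note: Unroll<cn> adapts a cn-channel work type to the tuple-based blockReduce()
    // interface: smem() slices one shared-memory array into cn per-channel segments of
    // BLOCK_SIZE elements, res() exposes the .x/.y/.z/.w channels as a tuple of references,
    // and op() replicates the reduction operator once per channel. A rough sketch of what
    // the cn == 2 case expands to inside SumReductor (illustrative only, not part of this
    // header):
    //
    //     __shared__ float smem[BLOCK_SIZE * 2];
    //     float2 sum = ...;
    //     blockReduce<BLOCK_SIZE>(smem_tuple(smem, smem + BLOCK_SIZE),
    //                             tie(sum.x, sum.y), tid,
    //                             make_tuple(plus<float>(), plus<float>()));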

    // AtomicUnroll

    template <typename R, int cn> struct AtomicUnroll;

    template <typename R> struct AtomicUnroll<R, 1>
    {
        __device__ __forceinline__ static void add(R* ptr, R val)
        {
            atomicAdd(ptr, val);
        }

        __device__ __forceinline__ static void min(R* ptr, R val)
        {
            atomicMin(ptr, val);
        }

        __device__ __forceinline__ static void max(R* ptr, R val)
        {
            atomicMax(ptr, val);
        }
    };

    template <typename R> struct AtomicUnroll<R, 2>
    {
        typedef typename MakeVec<R, 2>::type val_type;

        __device__ __forceinline__ static void add(R* ptr, val_type val)
        {
            atomicAdd(ptr, val.x);
            atomicAdd(ptr + 1, val.y);
        }

        __device__ __forceinline__ static void min(R* ptr, val_type val)
        {
            atomicMin(ptr, val.x);
            atomicMin(ptr + 1, val.y);
        }

        __device__ __forceinline__ static void max(R* ptr, val_type val)
        {
            atomicMax(ptr, val.x);
            atomicMax(ptr + 1, val.y);
        }
    };

    template <typename R> struct AtomicUnroll<R, 3>
    {
        typedef typename MakeVec<R, 3>::type val_type;

        __device__ __forceinline__ static void add(R* ptr, val_type val)
        {
            atomicAdd(ptr, val.x);
            atomicAdd(ptr + 1, val.y);
            atomicAdd(ptr + 2, val.z);
        }

        __device__ __forceinline__ static void min(R* ptr, val_type val)
        {
            atomicMin(ptr, val.x);
            atomicMin(ptr + 1, val.y);
            atomicMin(ptr + 2, val.z);
        }

        __device__ __forceinline__ static void max(R* ptr, val_type val)
        {
            atomicMax(ptr, val.x);
            atomicMax(ptr + 1, val.y);
            atomicMax(ptr + 2, val.z);
        }
    };

    template <typename R> struct AtomicUnroll<R, 4>
    {
        typedef typename MakeVec<R, 4>::type val_type;

        __device__ __forceinline__ static void add(R* ptr, val_type val)
        {
            atomicAdd(ptr, val.x);
            atomicAdd(ptr + 1, val.y);
            atomicAdd(ptr + 2, val.z);
            atomicAdd(ptr + 3, val.w);
        }

        __device__ __forceinline__ static void min(R* ptr, val_type val)
        {
            atomicMin(ptr, val.x);
            atomicMin(ptr + 1, val.y);
            atomicMin(ptr + 2, val.z);
            atomicMin(ptr + 3, val.w);
        }

        __device__ __forceinline__ static void max(R* ptr, val_type val)
        {
            atomicMax(ptr, val.x);
            atomicMax(ptr + 1, val.y);
            atomicMax(ptr + 2, val.z);
            atomicMax(ptr + 3, val.w);
        }
    };
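
    // Note: AtomicUnroll<R, cn> merges one block's partial result into the global output,
    // one atomic per channel, treating the cn-channel value as cn consecutive scalars at
    // ptr[0..cn-1]. The unqualified atomicAdd/atomicMin/atomicMax calls are intended to
    // resolve to the cudev wrappers pulled in by util/atomic.hpp above, which cover the
    // integer and floating-point result types this header reduces over.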

    // SumReductor

    template <typename src_type, typename work_type> struct SumReductor
    {
        typedef typename VecTraits<work_type>::elem_type work_elem_type;
        enum { cn = VecTraits<src_type>::cn };

        work_type sum;

        __device__ __forceinline__ SumReductor()
        {
            sum = VecTraits<work_type>::all(0);
        }

        __device__ __forceinline__ void reduceVal(typename TypeTraits<src_type>::parameter_type srcVal)
        {
            sum = sum + saturate_cast<work_type>(srcVal);
        }

        template <int BLOCK_SIZE>
        __device__ void reduceGrid(work_elem_type* result, int tid)
        {
            __shared__ work_elem_type smem[BLOCK_SIZE * cn];

            blockReduce<BLOCK_SIZE>(Unroll<cn>::template smem<BLOCK_SIZE>(smem), Unroll<cn>::res(sum), tid, Unroll<cn>::op(plus<work_elem_type>()));

            if (tid == 0)
                AtomicUnroll<work_elem_type, cn>::add(result, sum);
        }
    };
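
    // Note: every reductor in this file follows the same three-phase protocol driven by the
    // reduce() kernel below: reduceVal() accumulates masked source values into per-thread
    // registers, reduceGrid() folds the block's threads together through shared memory with
    // blockReduce(), and thread 0 then merges the block total into the global result with an
    // atomic. For SumReductor the result buffer is accumulated into, so the caller is
    // expected to zero it before the kernel runs.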

    // MinMaxReductor

    template <typename T> struct minop : minimum<T>
    {
        __device__ __forceinline__ static T initial()
        {
            return numeric_limits<T>::max();
        }

        __device__ __forceinline__ static void atomic(T* result, T myval)
        {
            atomicMin(result, myval);
        }
    };

    template <typename T> struct maxop : maximum<T>
    {
        __device__ __forceinline__ static T initial()
        {
            return -numeric_limits<T>::max();
        }

        __device__ __forceinline__ static void atomic(T* result, T myval)
        {
            atomicMax(result, myval);
        }
    };

    struct both
    {
    };
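
    // Note: minop<T> and maxop<T> extend the comparison functors from functional.hpp with the
    // identity element of the reduction (numeric_limits<T>::max() and its negation) and the
    // matching global atomic, so one MinMaxReductor template below covers both the min-only
    // and max-only cases. The empty `both` tag selects the specialization that tracks the
    // minimum and maximum in a single pass.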

    template <class Op, typename src_type, typename work_type> struct MinMaxReductor
    {
        work_type myval;

        __device__ __forceinline__ MinMaxReductor()
        {
            myval = Op::initial();
        }

        __device__ __forceinline__ void reduceVal(typename TypeTraits<src_type>::parameter_type srcVal)
        {
            Op op;
            myval = op(myval, srcVal);
        }

        template <int BLOCK_SIZE>
        __device__ void reduceGrid(work_type* result, int tid)
        {
            __shared__ work_type smem[BLOCK_SIZE];

            Op op;
            blockReduce<BLOCK_SIZE>(smem, myval, tid, op);

            if (tid == 0)
                Op::atomic(result, myval);
        }
    };

    template <typename src_type, typename work_type> struct MinMaxReductor<both, src_type, work_type>
    {
        work_type mymin;
        work_type mymax;

        __device__ __forceinline__ MinMaxReductor()
        {
            mymin = numeric_limits<work_type>::max();
            mymax = -numeric_limits<work_type>::max();
        }

        __device__ __forceinline__ void reduceVal(typename TypeTraits<src_type>::parameter_type srcVal)
        {
            minimum<work_type> minOp;
            maximum<work_type> maxOp;

            mymin = minOp(mymin, srcVal);
            mymax = maxOp(mymax, srcVal);
        }

        template <int BLOCK_SIZE>
        __device__ void reduceGrid(work_type* result, int tid)
        {
            __shared__ work_type sminval[BLOCK_SIZE];
            __shared__ work_type smaxval[BLOCK_SIZE];

            minimum<work_type> minOp;
            maximum<work_type> maxOp;

            blockReduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), tie(mymin, mymax), tid, make_tuple(minOp, maxOp));

            if (tid == 0)
            {
                atomicMin(result, mymin);
                atomicMax(result + 1, mymax);
            }
        }
    };
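
    // Note: the `both` specialization reduces two values in lock-step through one
    // blockReduce() call and writes them to adjacent slots of the output buffer:
    // result[0] receives the minimum and result[1] the maximum. Before the kernel runs,
    // the caller is expected to seed result[0] with a large value and result[1] with a
    // small one (mirroring the initial values used above); otherwise the atomics have
    // nothing meaningful to compare against.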

    // glob_reduce

    template <class Reductor, int BLOCK_SIZE, int PATCH_X, int PATCH_Y, class SrcPtr, typename ResType, class MaskPtr>
    __global__ void reduce(const SrcPtr src, ResType* result, const MaskPtr mask, const int rows, const int cols)
    {
        const int x0 = blockIdx.x * blockDim.x * PATCH_X + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * PATCH_Y + threadIdx.y;

        Reductor reductor;

        for (int i = 0, y = y0; i < PATCH_Y && y < rows; ++i, y += blockDim.y)
        {
            for (int j = 0, x = x0; j < PATCH_X && x < cols; ++j, x += blockDim.x)
            {
                if (mask(y, x))
                {
                    reductor.reduceVal(src(y, x));
                }
            }
        }

        const int tid = threadIdx.y * blockDim.x + threadIdx.x;

        reductor.template reduceGrid<BLOCK_SIZE>(result, tid);
    }
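
    // Note: each block owns a patch of PATCH_X x PATCH_Y tiles, so every thread visits up to
    // PATCH_X * PATCH_Y pixels before the block-wide reduction runs. Consecutive threads in a
    // warp read consecutive columns (x advances by blockDim.x between tiles), which keeps the
    // source accesses coalesced. Threads whose coordinates fall outside the image, or whose
    // pixels are all masked out, simply contribute the reductor's initial value.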

    template <class Reductor, class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void reduce(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        const dim3 block(Policy::block_size_x, Policy::block_size_y);
        const dim3 grid(divUp(cols, block.x * Policy::patch_size_x), divUp(rows, block.y * Policy::patch_size_y));

        reduce<Reductor, Policy::block_size_x * Policy::block_size_y, Policy::patch_size_x, Policy::patch_size_y><<<grid, block, 0, stream>>>(src, result, mask, rows, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );

        if (stream == 0)
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }
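
    // Note: Policy only has to expose the four compile-time launch constants read above.
    // A minimal sketch of such a policy (illustrative name and values, not part of this
    // header):
    //
    //     struct MyReducePolicy
    //     {
    //         enum {
    //             block_size_x = 32,
    //             block_size_y = 8,
    //             patch_size_x = 4,
    //             patch_size_y = 4
    //         };
    //     };
    //
    // With the null stream the wrapper synchronizes the device, so the result is ready as
    // soon as the call returns; on a non-null stream the caller must synchronize before
    // reading it back.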

    // callers

    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void sum(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;
        typedef typename VecTraits<ResType>::elem_type res_elem_type;

        reduce<SumReductor<src_type, ResType>, Policy>(src, (res_elem_type*) result, mask, rows, cols, stream);
    }

    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void minVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;

        reduce<MinMaxReductor<minop<ResType>, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
    }

    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void maxVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;

        reduce<MinMaxReductor<maxop<ResType>, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
    }

    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void minMaxVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;

        reduce<MinMaxReductor<both, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
    }
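
    // Note: these entry points differ only in the Reductor they instantiate. sum() reinterprets
    // the (possibly multi-channel) ResType result as a flat array of its element type because
    // SumReductor accumulates channel by channel; minVal()/maxVal() write a single value; and
    // minMaxVal() expects room for two consecutive values (minimum, then maximum). A hedged
    // usage sketch, assuming a single-channel float source, the MyReducePolicy sketched above,
    // and the all-pass WithOutMask mask type from the surrounding cudev module:
    //
    //     // d_result points at a zero-initialized device float
    //     grid_reduce_detail::sum<MyReducePolicy>(src, d_result, WithOutMask(), rows, cols, stream);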
}

}}

#endif