/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/*
 * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * Neither the name of NVIDIA Corporation nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#pragma once

#ifndef OPENCV_CUDEV_UTIL_SIMD_FUNCTIONS_HPP
#define OPENCV_CUDEV_UTIL_SIMD_FUNCTIONS_HPP

#include "../common.hpp"

/*
This header file contains inline functions that implement intra-word SIMD
operations, which are hardware-accelerated on sm_3x (Kepler) GPUs. Efficient
emulation code paths are provided for earlier architectures (sm_1x, sm_2x)
to make the code portable across all GPUs supported by CUDA. The following
functions are currently implemented:

vadd2(a,b)     per-halfword unsigned addition, with wrap-around: a + b
vsub2(a,b)     per-halfword unsigned subtraction, with wrap-around: a - b
vabsdiff2(a,b) per-halfword unsigned absolute difference: |a - b|
vavg2(a,b)     per-halfword unsigned average: (a + b) / 2
vavrg2(a,b)    per-halfword unsigned rounded average: (a + b + 1) / 2
vseteq2(a,b)   per-halfword unsigned comparison: a == b ? 1 : 0
vcmpeq2(a,b)   per-halfword unsigned comparison: a == b ? 0xffff : 0
vsetge2(a,b)   per-halfword unsigned comparison: a >= b ? 1 : 0
vcmpge2(a,b)   per-halfword unsigned comparison: a >= b ? 0xffff : 0
vsetgt2(a,b)   per-halfword unsigned comparison: a > b ? 1 : 0
vcmpgt2(a,b)   per-halfword unsigned comparison: a > b ? 0xffff : 0
vsetle2(a,b)   per-halfword unsigned comparison: a <= b ? 1 : 0
vcmple2(a,b)   per-halfword unsigned comparison: a <= b ? 0xffff : 0
vsetlt2(a,b)   per-halfword unsigned comparison: a < b ? 1 : 0
vcmplt2(a,b)   per-halfword unsigned comparison: a < b ? 0xffff : 0
vsetne2(a,b)   per-halfword unsigned comparison: a != b ? 1 : 0
vcmpne2(a,b)   per-halfword unsigned comparison: a != b ? 0xffff : 0
vmax2(a,b)     per-halfword unsigned maximum: max(a, b)
vmin2(a,b)     per-halfword unsigned minimum: min(a, b)

vadd4(a,b)     per-byte unsigned addition, with wrap-around: a + b
vsub4(a,b)     per-byte unsigned subtraction, with wrap-around: a - b
vabsdiff4(a,b) per-byte unsigned absolute difference: |a - b|
vavg4(a,b)     per-byte unsigned average: (a + b) / 2
vavrg4(a,b)    per-byte unsigned rounded average: (a + b + 1) / 2
vseteq4(a,b)   per-byte unsigned comparison: a == b ? 1 : 0
vcmpeq4(a,b)   per-byte unsigned comparison: a == b ? 0xff : 0
vsetge4(a,b)   per-byte unsigned comparison: a >= b ? 1 : 0
vcmpge4(a,b)   per-byte unsigned comparison: a >= b ? 0xff : 0
vsetgt4(a,b)   per-byte unsigned comparison: a > b ? 1 : 0
vcmpgt4(a,b)   per-byte unsigned comparison: a > b ? 0xff : 0
vsetle4(a,b)   per-byte unsigned comparison: a <= b ? 1 : 0
vcmple4(a,b)   per-byte unsigned comparison: a <= b ? 0xff : 0
vsetlt4(a,b)   per-byte unsigned comparison: a < b ? 1 : 0
vcmplt4(a,b)   per-byte unsigned comparison: a < b ? 0xff : 0
vsetne4(a,b)   per-byte unsigned comparison: a != b ? 1 : 0
vcmpne4(a,b)   per-byte unsigned comparison: a != b ? 0xff : 0
vmax4(a,b)     per-byte unsigned maximum: max(a, b)
vmin4(a,b)     per-byte unsigned minimum: min(a, b)
*/
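
/*
Usage sketch (illustrative only; nothing below is part of this header's API).
A kernel can process four packed uchar pixels per thread by loading them as a
single 32-bit word; the kernel name and launch configuration are hypothetical:

    __global__ void absdiff8u(const uint* a, const uint* b, uint* dst, int n)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            dst[i] = cv::cudev::vabsdiff4(a[i], b[i]); // per-byte |a - b|
    }
*/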

namespace cv { namespace cudev {

//! @addtogroup cudev
//! @{

// 2

__device__ __forceinline__ uint vadd2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s;
    s = a ^ b;          // sum bits
    r = a + b;          // actual sum
    s = s ^ r;          // determine carry-ins for each bit position
    s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
    r = r - s;          // subtract out carry-out from low word
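    // Worked example (added for clarity): a = 0x0000ffff, b = 0x00000001.
    // r = a + b = 0x00010000, i.e. the low-halfword carry spilled into the
    // high halfword. s = (a ^ b) ^ r = 0x0001fffe, so s & 0x00010000 picks
    // out exactly that spurious carry, and r - s = 0x00000000 restores the
    // per-halfword wrap-around result (0xffff + 1 -> 0x0000, 0 + 0 -> 0).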
#endif

    return r;
}

__device__ __forceinline__ uint vsub2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s;
    s = a ^ b;          // difference bits
    r = a - b;          // actual difference
    s = s ^ r;          // determine borrow-ins for each bit position
    s = s & 0x00010000; // borrow-in to high word (= borrow-out from low word)
    r = r + s;          // compensate for borrow from low word
#endif

    return r;
}

__device__ __forceinline__ uint vabsdiff2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s, t, u, v;
    s = a & 0x0000ffff; // extract low halfword
    r = b & 0x0000ffff; // extract low halfword
    u = ::max(r, s);    // maximum of low halfwords
    v = ::min(r, s);    // minimum of low halfwords
    s = a & 0xffff0000; // extract high halfword
    r = b & 0xffff0000; // extract high halfword
    t = ::max(r, s);    // maximum of high halfwords
    s = ::min(r, s);    // minimum of high halfwords
    r = u | t;          // maximum of both halfwords
    s = v | s;          // minimum of both halfwords
    r = r - s;          // |a - b| = max(a,b) - min(a,b)
#endif

    return r;
}

__device__ __forceinline__ uint vavg2(uint a, uint b)
{
    uint r, s;

    // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
    // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
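    // Worked example (added for clarity): a = 0x00070003, b = 0x00050001.
    // a & b = 0x00050001 and ((a ^ b) & 0xfffefffe) >> 1 = 0x00010001, so the
    // result is 0x00060002: (7 + 5) / 2 = 6 in the high halfword and
    // (3 + 1) / 2 = 2 in the low halfword, with no carry crossing halfwords.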
    s = a ^ b;
    r = a & b;
    s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
    s = s >> 1;
    s = r + s;

    return s;
}

__device__ __forceinline__ uint vavrg2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
    // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
    uint s;
    s = a ^ b;
    r = a | b;
    s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
    s = s >> 1;
    r = r - s;
#endif

    return r;
}

__device__ __forceinline__ uint vseteq2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
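    // Per halfword (added for clarity): with x = a ^ b, c = x | 0x8000 sets
    // the msb so the per-halfword decrement below cannot borrow across the
    // halfword boundary; r = x ^ c is 0x8000 exactly when x < 0x8000; and the
    // msb of c - 1 is 0 only when x was 0x0000 or 0x8000. ANDing r with the
    // complement therefore leaves msb = 1 precisely when x == 0, i.e. when
    // the corresponding halfwords of a and b are equal.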
    uint c;
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r & ~c;         // msb = 1, if r was 0x0000
    r = c >> 15;        // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpeq2(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vseteq2(a, b);
    c = r << 16;        // convert bool
    r = c - r;          // into mask
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r & ~c;         // msb = 1, if r was 0x0000
    r = c >> 15;        // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetge2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
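    // Trick (noted for clarity): in two's complement a - b = a + ~b + 1, so
    // vavrg2(a, ~b) computes (a + ~b + 1) / 2 per halfword. The halving keeps
    // the 16-bit carry-out of a + ~b + 1 in the msb of each halfword, and
    // that carry-out is set exactly when a >= b.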
    uint c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpge2(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetge2(a, b);
    c = r << 16;        // convert bool
    r = c - r;          // into mask
#else
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetgt2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpgt2(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetgt2(a, b);
    c = r << 16;        // convert bool
    r = c - r;          // into mask
#else
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetle2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmple2(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetle2(a, b);
    c = r << 16;        // convert bool
    r = c - r;          // into mask
#else
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetlt2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmplt2(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetlt2(a, b);
    c = r << 16;        // convert bool
    r = c - r;          // into mask
#else
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetne2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    uint c;
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r | c;          // msb = 1, if r was not 0x0000
    c = c & 0x80008000; // extract msbs
    r = c >> 15;        // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpne2(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetne2(a, b);
    c = r << 16;        // convert bool
    r = c - r;          // into mask
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r | c;          // msb = 1, if r was not 0x0000
    c = c & 0x80008000; // extract msbs
    r = c >> 15;        // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}

__device__ __forceinline__ uint vmax2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s, t, u;
    r = a & 0x0000ffff; // extract low halfword
    s = b & 0x0000ffff; // extract low halfword
    t = ::max(r, s);    // maximum of low halfwords
    r = a & 0xffff0000; // extract high halfword
    s = b & 0xffff0000; // extract high halfword
    u = ::max(r, s);    // maximum of high halfwords
    r = t | u;          // combine halfword maximums
#endif

    return r;
}

__device__ __forceinline__ uint vmin2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s, t, u;
    r = a & 0x0000ffff; // extract low halfword
    s = b & 0x0000ffff; // extract low halfword
    t = ::min(r, s);    // minimum of low halfwords
    r = a & 0xffff0000; // extract high halfword
    s = b & 0xffff0000; // extract high halfword
    u = ::min(r, s);    // minimum of high halfwords
    r = t | u;          // combine halfword minimums
#endif

    return r;
}
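
// Usage note (illustrative): vmax2/vmin2 compose into a branch-free per-halfword
// clamp of packed unsigned shorts, assuming lo and hi hold the bound replicated
// in both halfwords:
//     uint clamped = vmin2(vmax2(x, lo), hi); // clamp each halfword of x to [lo, hi]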

// 4

__device__ __forceinline__ uint vadd4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s, t;
    s = a ^ b;          // sum bits
    r = a & 0x7f7f7f7f; // clear msbs
    t = b & 0x7f7f7f7f; // clear msbs
    s = s & 0x80808080; // msb sum bits
    r = r + t;          // add without msbs, record carry-out in msbs
    r = r ^ s;          // sum of msb sum and carry-in bits, w/o carry-out
#endif /* CV_CUDEV_ARCH >= 300 */

    return r;
}

__device__ __forceinline__ uint vsub4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s, t;
    s = a ^ ~b;         // inverted sum bits
    r = a | 0x80808080; // set msbs
    t = b & 0x7f7f7f7f; // clear msbs
    s = s & 0x80808080; // inverted msb sum bits
    r = r - t;          // subtract w/o msbs, record inverted borrows in msbs
    r = r ^ s;          // combine inverted msb sum bits and borrows
#endif

    return r;
}

__device__ __forceinline__ uint vavg4(uint a, uint b)
{
    uint r, s;

    // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
    // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
    s = a ^ b;
    r = a & b;
    s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
    s = s >> 1;
    s = r + s;

    return s;
}

__device__ __forceinline__ uint vavrg4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
    // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
    uint c;
    c = a ^ b;
    r = a | b;
    c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
    c = c >> 1;
    r = r - c;
#endif

    return r;
}

__device__ __forceinline__ uint vseteq4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    uint c;
    r = a ^ b;          // 0x00 if a == b
    c = r | 0x80808080; // set msbs, to catch carry out
    r = r ^ c;          // extract msbs, msb = 1 if r < 0x80
    c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
    c = r & ~c;         // msb = 1, if r was 0x00
    r = c >> 7;         // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpeq4(uint a, uint b)
{
    uint r, t;

#if CV_CUDEV_ARCH >= 300
    r = vseteq4(a, b);
    t = r << 8;         // convert bool
    r = t - r;          // to mask
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    t = a ^ b;          // 0x00 if a == b
    r = t | 0x80808080; // set msbs, to catch carry out
    t = t ^ r;          // extract msbs, msb = 1 if t < 0x80
    r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
    r = t & ~r;         // msb = 1, if t was 0x00
    t = r >> 7;         // build mask
    t = r - t;          // from
    r = t | r;          // msbs
#endif

    return r;
}
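
// Usage note (illustrative): combined with the __popc intrinsic, vcmpeq4 counts
// matching bytes without branches, since each equal byte contributes eight set bits:
//     uint nEqual = __popc(vcmpeq4(a, b)) >> 3; // number of equal bytes (0..4)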

__device__ __forceinline__ uint vsetle4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
    c = c & 0x80808080; // msbs = carry-outs
    r = c >> 7;         // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmple4(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetle4(a, b);
    c = r << 8;         // convert bool
    r = c - r;          // to mask
#else
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
    c = c & 0x80808080; // msbs = carry-outs
    r = c >> 7;         // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetlt4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
    c = c & 0x80808080; // msbs = carry-outs
    r = c >> 7;         // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmplt4(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetlt4(a, b);
    c = r << 8;         // convert bool
    r = c - r;          // to mask
#else
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
    c = c & 0x80808080; // msbs = carry-outs
    r = c >> 7;         // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetge4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
    c = c & 0x80808080; // msbs = carry-outs
    r = c >> 7;         // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpge4(uint a, uint b)
{
    uint r, s;

#if CV_CUDEV_ARCH >= 300
    r = vsetge4(a, b);
    s = r << 8;         // convert bool
    r = s - r;          // to mask
#else
    asm("not.b32 %0, %0;" : "+r"(b));
    r = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
    r = r & 0x80808080; // msbs = carry-outs
    s = r >> 7;         // build mask
    s = r - s;          // from
    r = s | r;          // msbs
#endif

    return r;
}

__device__ __forceinline__ uint vsetgt4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
    c = c & 0x80808080; // msbs = carry-outs
    r = c >> 7;         // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpgt4(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetgt4(a, b);
    c = r << 8;         // convert bool
    r = c - r;          // to mask
#else
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
    c = c & 0x80808080; // msbs = carry-outs
    r = c >> 7;         // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetne4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    uint c;
    r = a ^ b;          // 0x00 if a == b
    c = r | 0x80808080; // set msbs, to catch carry out
    c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
    c = r | c;          // msb = 1, if r was not 0x00
    c = c & 0x80808080; // extract msbs
    r = c >> 7;         // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpne4(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetne4(a, b);
    c = r << 8;         // convert bool
    r = c - r;          // to mask
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    r = a ^ b;          // 0x00 if a == b
    c = r | 0x80808080; // set msbs, to catch carry out
    c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
    c = r | c;          // msb = 1, if r was not 0x00
    c = c & 0x80808080; // extract msbs
    r = c >> 7;         // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}

__device__ __forceinline__ uint vabsdiff4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
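    // Trick (noted for clarity): with a per-byte mask m that is 0xff or 0x00,
    // b ^ ((a ^ b) & m) selects a where m = 0xff and b where m = 0x00, without
    // branching. Below, this yields max(a,b); XORing that result with a ^ b
    // flips each byte to the other operand, giving min(a,b), and the
    // difference of the two is |a - b|.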
    uint s;
    s = vcmpge4(a, b);  // mask = 0xff if a >= b
    r = a ^ b;          //
    s = (r & s) ^ b;    // select a when a >= b, else select b => max(a,b)
    r = s ^ r;          // select a when b >= a, else select b => min(a,b)
    r = s - r;          // |a - b| = max(a,b) - min(a,b)
#endif

    return r;
}

__device__ __forceinline__ uint vmax4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s;
    s = vcmpge4(a, b);  // mask = 0xff if a >= b
    r = a & s;          // select a when a >= b
    s = b & ~s;         // select b when a < b
    r = r | s;          // combine byte selections
#endif

    return r;           // byte-wise unsigned maximum
}

__device__ __forceinline__ uint vmin4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s;
    s = vcmpge4(b, a);  // mask = 0xff if b >= a
    r = a & s;          // select a when b >= a
    s = b & ~s;         // select b when b < a
    r = r | s;          // combine byte selections
#endif

    return r;           // byte-wise unsigned minimum
}

//! @}

}}

#endif