12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918 |
- /*M///////////////////////////////////////////////////////////////////////////////////////
- //
- // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
- //
- // By downloading, copying, installing or using the software you agree to this license.
- // If you do not agree to this license, do not download, install,
- // copy or use the software.
- //
- //
- // License Agreement
- // For Open Source Computer Vision Library
- //
- // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
- // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
- // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
- // Third party copyrights are property of their respective owners.
- //
- // Redistribution and use in source and binary forms, with or without modification,
- // are permitted provided that the following conditions are met:
- //
- // * Redistribution's of source code must retain the above copyright notice,
- // this list of conditions and the following disclaimer.
- //
- // * Redistribution's in binary form must reproduce the above copyright notice,
- // this list of conditions and the following disclaimer in the documentation
- // and/or other materials provided with the distribution.
- //
- // * The name of the copyright holders may not be used to endorse or promote products
- // derived from this software without specific prior written permission.
- //
- // This software is provided by the copyright holders and contributors "as is" and
- // any express or implied warranties, including, but not limited to, the implied
- // warranties of merchantability and fitness for a particular purpose are disclaimed.
- // In no event shall the Intel Corporation or contributors be liable for any direct,
- // indirect, incidental, special, exemplary, or consequential damages
- // (including, but not limited to, procurement of substitute goods or services;
- // loss of use, data, or profits; or business interruption) however caused
- // and on any theory of liability, whether in contract, strict liability,
- // or tort (including negligence or otherwise) arising in any way out of
- // the use of this software, even if advised of the possibility of such damage.
- //
- //M*/
- /*
- * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * Neither the name of NVIDIA Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
- #pragma once
- #ifndef OPENCV_CUDEV_UTIL_SIMD_FUNCTIONS_HPP
- #define OPENCV_CUDEV_UTIL_SIMD_FUNCTIONS_HPP
- #include "../common.hpp"
- /*
- This header file contains inline functions that implement intra-word SIMD
- operations, that are hardware accelerated on sm_3x (Kepler) GPUs. Efficient
- emulation code paths are provided for earlier architectures (sm_1x, sm_2x)
- to make the code portable across all GPUs supported by CUDA. The following
- functions are currently implemented:
- vadd2(a,b) per-halfword unsigned addition, with wrap-around: a + b
- vsub2(a,b) per-halfword unsigned subtraction, with wrap-around: a - b
- vabsdiff2(a,b) per-halfword unsigned absolute difference: |a - b|
- vavg2(a,b) per-halfword unsigned average: (a + b) / 2
- vavrg2(a,b) per-halfword unsigned rounded average: (a + b + 1) / 2
- vseteq2(a,b) per-halfword unsigned comparison: a == b ? 1 : 0
- vcmpeq2(a,b) per-halfword unsigned comparison: a == b ? 0xffff : 0
- vsetge2(a,b) per-halfword unsigned comparison: a >= b ? 1 : 0
- vcmpge2(a,b) per-halfword unsigned comparison: a >= b ? 0xffff : 0
- vsetgt2(a,b) per-halfword unsigned comparison: a > b ? 1 : 0
- vcmpgt2(a,b) per-halfword unsigned comparison: a > b ? 0xffff : 0
- vsetle2(a,b) per-halfword unsigned comparison: a <= b ? 1 : 0
- vcmple2(a,b) per-halfword unsigned comparison: a <= b ? 0xffff : 0
- vsetlt2(a,b) per-halfword unsigned comparison: a < b ? 1 : 0
- vcmplt2(a,b) per-halfword unsigned comparison: a < b ? 0xffff : 0
- vsetne2(a,b) per-halfword unsigned comparison: a != b ? 1 : 0
- vcmpne2(a,b) per-halfword unsigned comparison: a != b ? 0xffff : 0
- vmax2(a,b) per-halfword unsigned maximum: max(a, b)
- vmin2(a,b) per-halfword unsigned minimum: min(a, b)
- vadd4(a,b) per-byte unsigned addition, with wrap-around: a + b
- vsub4(a,b) per-byte unsigned subtraction, with wrap-around: a - b
- vabsdiff4(a,b) per-byte unsigned absolute difference: |a - b|
- vavg4(a,b) per-byte unsigned average: (a + b) / 2
- vavrg4(a,b) per-byte unsigned rounded average: (a + b + 1) / 2
- vseteq4(a,b) per-byte unsigned comparison: a == b ? 1 : 0
- vcmpeq4(a,b) per-byte unsigned comparison: a == b ? 0xff : 0
- vsetge4(a,b) per-byte unsigned comparison: a >= b ? 1 : 0
- vcmpge4(a,b) per-byte unsigned comparison: a >= b ? 0xff : 0
- vsetgt4(a,b) per-byte unsigned comparison: a > b ? 1 : 0
- vcmpgt4(a,b) per-byte unsigned comparison: a > b ? 0xff : 0
- vsetle4(a,b) per-byte unsigned comparison: a <= b ? 1 : 0
- vcmple4(a,b) per-byte unsigned comparison: a <= b ? 0xff : 0
- vsetlt4(a,b) per-byte unsigned comparison: a < b ? 1 : 0
- vcmplt4(a,b) per-byte unsigned comparison: a < b ? 0xff : 0
- vsetne4(a,b) per-byte unsigned comparison: a != b ? 1: 0
- vcmpne4(a,b) per-byte unsigned comparison: a != b ? 0xff: 0
- vmax4(a,b) per-byte unsigned maximum: max(a, b)
- vmin4(a,b) per-byte unsigned minimum: min(a, b)
- */
- namespace cv { namespace cudev {
- //! @addtogroup cudev
- //! @{
// ---- per-halfword (2 x 16-bit) operations ----

// Per-halfword unsigned addition with wrap-around: a + b computed
// independently on each 16-bit half of the 32-bit word (no carry between halves).
// NOTE(review): the `.sat` suffix on the PTX paths looks at odds with the
// documented wrap-around semantics — confirm against the PTX ISA.
__device__ __forceinline__ uint vadd2(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD add (trailing %3 is the accumulator operand).
    asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    // sm_2x: two scalar video adds, one per halfword sub-register (.h0/.h1).
    asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // sm_1x emulation: full 32-bit add, then remove the carry that leaked
    // from the low halfword into the high halfword.
    uint s;
    s = a ^ b;          // sum bits
    r = a + b;          // actual sum
    s = s ^ r;          // determine carry-ins for each bit position
    s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
    r = r - s;          // subtract out carry-out from low word
#endif
    return r;
}
// Per-halfword unsigned subtraction with wrap-around: a - b computed
// independently on each 16-bit half of the 32-bit word (no borrow between halves).
__device__ __forceinline__ uint vsub2(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD subtract.
    asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    // sm_2x: two scalar video subtracts, one per halfword sub-register.
    asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // sm_1x emulation: full 32-bit subtract, then compensate for the borrow
    // that propagated from the low halfword into the high halfword.
    uint s;
    s = a ^ b;          // per-bit difference without borrows
    r = a - b;          // actual difference
    s = s ^ r;          // determine borrow-ins for each bit position
    s = s & 0x00010000; // borrow to high word
    r = r + s;          // compensate for borrow from low word
#endif
    return r;
}
// Per-halfword unsigned absolute difference: |a - b| for each 16-bit lane.
__device__ __forceinline__ uint vabsdiff2(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD absolute-difference instruction.
    asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    // sm_2x: two scalar video absdiff instructions, one per halfword.
    asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // sm_1x emulation: since halfwords were masked apart, per-lane max/min can
    // be formed with 32-bit ::max/::min, and |a - b| = max(a,b) - min(a,b).
    uint s, t, u, v;
    s = a & 0x0000ffff; // extract low halfword
    r = b & 0x0000ffff; // extract low halfword
    u = ::max(r, s);    // maximum of low halfwords
    v = ::min(r, s);    // minimum of low halfwords
    s = a & 0xffff0000; // extract high halfword
    r = b & 0xffff0000; // extract high halfword
    t = ::max(r, s);    // maximum of high halfwords
    s = ::min(r, s);    // minimum of high halfwords
    r = u | t;          // maximum of both halfwords
    s = v | s;          // minimum of both halfwords
    r = r - s;          // |a - b| = max(a,b) - min(a,b);
#endif
    return r;
}
// Per-halfword unsigned truncating average: (a + b) / 2 for each 16-bit lane.
// HAKMEM #23: a + b == 2 * (a & b) + (a ^ b), hence
// (a + b) / 2 == (a & b) + ((a ^ b) >> 1); masking before the shift keeps
// bits from leaking across the halfword boundary. Pure bit ops — no
// architecture-specific path needed.
__device__ __forceinline__ uint vavg2(uint a, uint b)
{
    uint carry = a & b;                  // bit positions that would carry when adding
    uint halfsum = (a ^ b) & 0xfffefffe; // sum bits, lsb of each halfword cleared
    return carry + (halfsum >> 1);       // per-halfword (a + b) / 2
}
// Per-halfword unsigned rounded average: (a + b + 1) / 2 for each 16-bit lane.
__device__ __forceinline__ uint vavrg2(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD rounded-average instruction.
    asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
    // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
    uint s;
    s = a ^ b;
    r = a | b;
    s = s & 0xfffefffe; // ensure shift doesn't cross half-word boundaries
    s = s >> 1;
    r = r - s;
#endif
    return r;
}
// Per-halfword unsigned comparison: each 16-bit lane of the result is
// 1 where the lanes of a and b are equal, else 0.
__device__ __forceinline__ uint vseteq2(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD compare.
    asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    uint c;
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r & ~c;         // msb = 1, if r was 0x0000
    r = c >> 15;        // convert to bool
#endif
    return r;
}
// Per-halfword unsigned comparison: each 16-bit lane of the result is
// 0xffff where the lanes of a and b are equal, else 0x0000.
__device__ __forceinline__ uint vcmpeq2(uint a, uint b)
{
    uint r, c;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: widen the per-lane 0/1 result of vseteq2 into a 0/0xffff mask.
    r = vseteq2(a, b);
    c = r << 16; // convert bool
    r = c - r;   // into mask
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r & ~c;         // msb = 1, if r was 0x0000
    r = c >> 15;        // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif
    return r;
}
// Per-halfword unsigned comparison: each 16-bit lane is 1 where a >= b, else 0.
// Note: the emulation path clobbers the local copy of b (bitwise NOT via asm).
__device__ __forceinline__ uint vsetge2(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD compare.
    asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // a >= b iff (a - b) produces no borrow; the carry-out of a + ~b + 1
    // surfaces in the msb of the rounded average.
    uint c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert to bool
#endif
    return r;
}
// Per-halfword unsigned comparison: each 16-bit lane is 0xffff where a >= b,
// else 0x0000. The emulation path clobbers the local copy of b.
__device__ __forceinline__ uint vcmpge2(uint a, uint b)
{
    uint r, c;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: widen vsetge2's per-lane 0/1 into a 0/0xffff mask.
    r = vsetge2(a, b);
    c = r << 16; // convert bool
    r = c - r;   // into mask
#else
    // carry-out of a + ~b + 1 (i.e. a - b) lands in each lane's msb.
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif
    return r;
}
// Per-halfword unsigned comparison: each 16-bit lane is 1 where a > b, else 0.
// The emulation path clobbers the local copy of b.
__device__ __forceinline__ uint vsetgt2(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD compare.
    asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // a > b iff a + ~b (= a - b - 1) produces a carry-out; truncating
    // average exposes that carry in each lane's msb.
    uint c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert to bool
#endif
    return r;
}
// Per-halfword unsigned comparison: each 16-bit lane is 0xffff where a > b,
// else 0x0000. The emulation path clobbers the local copy of b.
__device__ __forceinline__ uint vcmpgt2(uint a, uint b)
{
    uint r, c;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: widen vsetgt2's per-lane 0/1 into a 0/0xffff mask.
    r = vsetgt2(a, b);
    c = r << 16; // convert bool
    r = c - r;   // into mask
#else
    // carry-out of a + ~b (= a - b - 1) lands in each lane's msb.
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif
    return r;
}
// Per-halfword unsigned comparison: each 16-bit lane is 1 where a <= b, else 0.
// The emulation path clobbers the local copy of a.
__device__ __forceinline__ uint vsetle2(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD compare.
    asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // a <= b iff b - a produces no borrow; carry-out of b + ~a + 1
    // surfaces in the msb of the rounded average.
    uint c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert to bool
#endif
    return r;
}
// Per-halfword unsigned comparison: each 16-bit lane is 0xffff where a <= b,
// else 0x0000. The emulation path clobbers the local copy of a.
__device__ __forceinline__ uint vcmple2(uint a, uint b)
{
    uint r, c;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: widen vsetle2's per-lane 0/1 into a 0/0xffff mask.
    r = vsetle2(a, b);
    c = r << 16; // convert bool
    r = c - r;   // into mask
#else
    // carry-out of b + ~a + 1 (i.e. b - a) lands in each lane's msb.
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif
    return r;
}
// Per-halfword unsigned comparison: each 16-bit lane is 1 where a < b, else 0.
// The emulation path clobbers the local copy of a.
__device__ __forceinline__ uint vsetlt2(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD compare.
    asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // a < b iff b + ~a (= b - a - 1) produces a carry-out.
    uint c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert to bool
#endif
    return r;
}
// Per-halfword unsigned comparison: each 16-bit lane is 0xffff where a < b,
// else 0x0000. The emulation path clobbers the local copy of a.
__device__ __forceinline__ uint vcmplt2(uint a, uint b)
{
    uint r, c;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: widen vsetlt2's per-lane 0/1 into a 0/0xffff mask.
    r = vsetlt2(a, b);
    c = r << 16; // convert bool
    r = c - r;   // into mask
#else
    // carry-out of b + ~a (= b - a - 1) lands in each lane's msb.
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif
    return r;
}
// Per-halfword unsigned comparison: each 16-bit lane is 1 where a != b, else 0.
__device__ __forceinline__ uint vsetne2(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD compare.
    asm ("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    uint c;
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r | c;          // msb = 1, if r was not 0x0000
    c = c & 0x80008000; // extract msbs
    r = c >> 15;        // convert to bool
#endif
    return r;
}
// Per-halfword unsigned comparison: each 16-bit lane is 0xffff where a != b,
// else 0x0000.
__device__ __forceinline__ uint vcmpne2(uint a, uint b)
{
    uint r, c;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: widen vsetne2's per-lane 0/1 into a 0/0xffff mask.
    r = vsetne2(a, b);
    c = r << 16; // convert bool
    r = c - r;   // into mask
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r | c;          // msb = 1, if r was not 0x0000
    c = c & 0x80008000; // extract msbs
    r = c >> 15;        // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif
    return r;
}
// Per-halfword unsigned maximum: max(a, b) for each 16-bit lane.
__device__ __forceinline__ uint vmax2(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD max.
    asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    // sm_2x: two scalar video max instructions, one per halfword.
    asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // sm_1x emulation: masked halfwords compare correctly as 32-bit values.
    uint s, t, u;
    r = a & 0x0000ffff; // extract low halfword
    s = b & 0x0000ffff; // extract low halfword
    t = ::max(r, s);    // maximum of low halfwords
    r = a & 0xffff0000; // extract high halfword
    s = b & 0xffff0000; // extract high halfword
    u = ::max(r, s);    // maximum of high halfwords
    r = t | u;          // combine halfword maximums
#endif
    return r;
}
// Per-halfword unsigned minimum: min(a, b) for each 16-bit lane.
__device__ __forceinline__ uint vmin2(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD min.
    asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    // sm_2x: two scalar video min instructions, one per halfword.
    asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // sm_1x emulation: masked halfwords compare correctly as 32-bit values.
    uint s, t, u;
    r = a & 0x0000ffff; // extract low halfword
    s = b & 0x0000ffff; // extract low halfword
    t = ::min(r, s);    // minimum of low halfwords
    r = a & 0xffff0000; // extract high halfword
    s = b & 0xffff0000; // extract high halfword
    u = ::min(r, s);    // minimum of high halfwords
    r = t | u;          // combine halfword minimums
#endif
    return r;
}
// ---- per-byte (4 x 8-bit) operations ----

// Per-byte unsigned addition with wrap-around: a + b computed independently
// on each 8-bit lane of the 32-bit word (no carry between bytes).
// NOTE(review): the `.sat` suffix on the PTX paths looks at odds with the
// documented wrap-around semantics — confirm against the PTX ISA.
__device__ __forceinline__ uint vadd4(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD add.
    asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    // sm_2x: four scalar video adds, one per byte sub-register (.b0-.b3).
    asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // sm_1x emulation: add the low 7 bits of each byte carry-free, then fold
    // the msb sum bits back in with xor (discarding inter-byte carries).
    uint s, t;
    s = a ^ b;          // sum bits
    r = a & 0x7f7f7f7f; // clear msbs
    t = b & 0x7f7f7f7f; // clear msbs
    s = s & 0x80808080; // msb sum bits
    r = r + t;          // add without msbs, record carry-out in msbs
    r = r ^ s;          // sum of msb sum and carry-in bits, w/o carry-out
#endif /* CV_CUDEV_ARCH >= 300 */
    return r;
}
// Per-byte unsigned subtraction with wrap-around: a - b computed independently
// on each 8-bit lane of the 32-bit word (no borrow between bytes).
__device__ __forceinline__ uint vsub4(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD subtract.
    asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    // sm_2x: four scalar video subtracts, one per byte sub-register.
    asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // sm_1x emulation: subtract low 7 bits of each byte borrow-free (msbs of
    // a forced to 1 to absorb borrows), then fold the inverted msb bits back.
    uint s, t;
    s = a ^ ~b;         // inverted sum bits
    r = a | 0x80808080; // set msbs
    t = b & 0x7f7f7f7f; // clear msbs
    s = s & 0x80808080; // inverted msb sum bits
    r = r - t;          // subtract w/o msbs, record inverted borrows in msb
    r = r ^ s;          // combine inverted msb sum bits and borrows
#endif
    return r;
}
// Per-byte unsigned truncating average: (a + b) / 2 for each 8-bit lane.
// HAKMEM #23: a + b == 2 * (a & b) + (a ^ b), hence
// (a + b) / 2 == (a & b) + ((a ^ b) >> 1); masking before the shift keeps
// bits from leaking across byte boundaries. Pure bit ops — no
// architecture-specific path needed.
__device__ __forceinline__ uint vavg4(uint a, uint b)
{
    uint carry = a & b;                  // bit positions that would carry when adding
    uint halfsum = (a ^ b) & 0xfefefefe; // sum bits, lsb of each byte cleared
    return carry + (halfsum >> 1);       // per-byte (a + b) / 2
}
// Per-byte unsigned rounded average: (a + b + 1) / 2 for each 8-bit lane.
__device__ __forceinline__ uint vavrg4(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD rounded-average instruction.
    asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
    // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
    uint c;
    c = a ^ b;
    r = a | b;
    c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
    c = c >> 1;
    r = r - c;
#endif
    return r;
}
// Per-byte unsigned comparison: each 8-bit lane of the result is
// 1 where the lanes of a and b are equal, else 0.
__device__ __forceinline__ uint vseteq4(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD compare.
    asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    uint c;
    r = a ^ b;          // 0x00 if a == b
    c = r | 0x80808080; // set msbs, to catch carry out
    r = r ^ c;          // extract msbs, msb = 1 if r < 0x80
    c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
    c = r & ~c;         // msb = 1, if r was 0x00
    r = c >> 7;         // convert to bool
#endif
    return r;
}
// Per-byte unsigned comparison: each 8-bit lane of the result is
// 0xff where the lanes of a and b are equal, else 0x00.
__device__ __forceinline__ uint vcmpeq4(uint a, uint b)
{
    uint r, t;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: widen vseteq4's per-lane 0/1 into a 0/0xff mask.
    r = vseteq4(a, b);
    t = r << 8; // convert bool
    r = t - r;  // to mask
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    t = a ^ b;          // 0x00 if a == b
    r = t | 0x80808080; // set msbs, to catch carry out
    t = t ^ r;          // extract msbs, msb = 1 if t < 0x80
    r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
    r = t & ~r;         // msb = 1, if t was 0x00
    t = r >> 7;         // build mask
    t = r - t;          // from
    r = t | r;          // msbs
#endif
    return r;
}
// Per-byte unsigned comparison: each 8-bit lane is 1 where a <= b, else 0.
// The emulation path clobbers the local copy of a.
__device__ __forceinline__ uint vsetle4(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD compare.
    asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // a <= b iff b - a produces no borrow; carry-out of b + ~a + 1
    // surfaces in the msb of the rounded average.
    uint c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
    c = c & 0x80808080; // msb = carry-outs
    r = c >> 7;         // convert to bool
#endif
    return r;
}
// Per-byte unsigned comparison: each 8-bit lane is 0xff where a <= b,
// else 0x00. The emulation path clobbers the local copy of a.
__device__ __forceinline__ uint vcmple4(uint a, uint b)
{
    uint r, c;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: widen vsetle4's per-lane 0/1 into a 0/0xff mask.
    r = vsetle4(a, b);
    c = r << 8; // convert bool
    r = c - r;  // to mask
#else
    // carry-out of b + ~a + 1 (i.e. b - a) lands in each lane's msb.
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
    c = c & 0x80808080; // msbs = carry-outs
    r = c >> 7;         // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif
    return r;
}
// Per-byte unsigned comparison: each 8-bit lane is 1 where a < b, else 0.
// The emulation path clobbers the local copy of a.
__device__ __forceinline__ uint vsetlt4(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD compare.
    asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // a < b iff b + ~a (= b - a - 1) produces a carry-out.
    uint c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
    c = c & 0x80808080; // msb = carry-outs
    r = c >> 7;         // convert to bool
#endif
    return r;
}
// Per-byte unsigned comparison: each 8-bit lane is 0xff where a < b,
// else 0x00. The emulation path clobbers the local copy of a.
__device__ __forceinline__ uint vcmplt4(uint a, uint b)
{
    uint r, c;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: widen vsetlt4's per-lane 0/1 into a 0/0xff mask.
    r = vsetlt4(a, b);
    c = r << 8; // convert bool
    r = c - r;  // to mask
#else
    // carry-out of b + ~a (= b - a - 1) lands in each lane's msb.
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
    c = c & 0x80808080; // msbs = carry-outs
    r = c >> 7;         // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif
    return r;
}
// Per-byte unsigned comparison: each 8-bit lane is 1 where a >= b, else 0.
// The emulation path clobbers the local copy of b.
__device__ __forceinline__ uint vsetge4(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD compare.
    asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // a >= b iff a - b produces no borrow; carry-out of a + ~b + 1
    // surfaces in the msb of the rounded average.
    uint c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
    c = c & 0x80808080; // msb = carry-outs
    r = c >> 7;         // convert to bool
#endif
    return r;
}
// Per-byte unsigned comparison: each 8-bit lane is 0xff where a >= b,
// else 0x00. The emulation path clobbers the local copy of b.
__device__ __forceinline__ uint vcmpge4(uint a, uint b)
{
    uint r, s;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: widen vsetge4's per-lane 0/1 into a 0/0xff mask.
    r = vsetge4(a, b);
    s = r << 8; // convert bool
    r = s - r;  // to mask
#else
    // carry-out of a + ~b + 1 (i.e. a - b) lands in each lane's msb.
    asm ("not.b32 %0,%0;" : "+r"(b));
    r = vavrg4 (a, b);  // (a + ~b + 1) / 2 = (a - b) / 2
    r = r & 0x80808080; // msb = carry-outs
    s = r >> 7;         // build mask
    s = r - s;          // from
    r = s | r;          // msbs
#endif
    return r;
}
// Per-byte unsigned comparison: each 8-bit lane is 1 where a > b, else 0.
// The emulation path clobbers the local copy of b.
__device__ __forceinline__ uint vsetgt4(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD compare.
    asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // a > b iff a + ~b (= a - b - 1) produces a carry-out.
    uint c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
    c = c & 0x80808080; // msb = carry-outs
    r = c >> 7;         // convert to bool
#endif
    return r;
}
// Per-byte unsigned comparison: each 8-bit lane is 0xff where a > b,
// else 0x00. The emulation path clobbers the local copy of b.
__device__ __forceinline__ uint vcmpgt4(uint a, uint b)
{
    uint r, c;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: widen vsetgt4's per-lane 0/1 into a 0/0xff mask.
    r = vsetgt4(a, b);
    c = r << 8; // convert bool
    r = c - r;  // to mask
#else
    // carry-out of a + ~b (= a - b - 1) lands in each lane's msb.
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
    c = c & 0x80808080; // msb = carry-outs
    r = c >> 7;         // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif
    return r;
}
// Per-byte unsigned comparison: each 8-bit lane is 1 where a != b, else 0.
__device__ __forceinline__ uint vsetne4(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD compare.
    asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    uint c;
    r = a ^ b;          // 0x00 if a == b
    c = r | 0x80808080; // set msbs, to catch carry out
    c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
    c = r | c;          // msb = 1, if r was not 0x00
    c = c & 0x80808080; // extract msbs
    r = c >> 7;         // convert to bool
#endif
    return r;
}
// Per-byte unsigned comparison: each 8-bit lane is 0xff where a != b,
// else 0x00.
__device__ __forceinline__ uint vcmpne4(uint a, uint b)
{
    uint r, c;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: widen vsetne4's per-lane 0/1 into a 0/0xff mask.
    r = vsetne4(a, b);
    c = r << 8; // convert bool
    r = c - r;  // to mask
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    r = a ^ b;          // 0x00 if a == b
    c = r | 0x80808080; // set msbs, to catch carry out
    c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
    c = r | c;          // msb = 1, if r was not 0x00
    c = c & 0x80808080; // extract msbs
    r = c >> 7;         // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif
    return r;
}
// Per-byte unsigned absolute difference: |a - b| for each 8-bit lane.
__device__ __forceinline__ uint vabsdiff4(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD absolute-difference instruction.
    asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    // sm_2x: four scalar video absdiff instructions, one per byte.
    asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // sm_1x emulation: build per-byte max and min via the vcmpge4 mask,
    // then |a - b| = max - min (note: vcmpge4 clobbers its copy of b only).
    uint s;
    s = vcmpge4(a, b); // mask = 0xff if a >= b
    r = a ^ b;         //
    s = (r & s) ^ b;   // select a when a >= b, else select b => max(a,b)
    r = s ^ r;         // select a when b >= a, else select b => min(a,b)
    r = s - r;         // |a - b| = max(a,b) - min(a,b);
#endif
    return r;
}
// Per-byte unsigned maximum: max(a, b) for each 8-bit lane.
__device__ __forceinline__ uint vmax4(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD max.
    asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    // sm_2x: four scalar video max instructions, one per byte.
    asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // sm_1x emulation: byte-wise select via the vcmpge4 comparison mask.
    uint s;
    s = vcmpge4(a, b); // mask = 0xff where a >= b (per byte)
    r = a & s;         // select a where a >= b
    s = b & ~s;        // select b where a < b
    r = r | s;         // combine byte selections
#endif
    return r; // byte-wise unsigned maximum
}
// Per-byte unsigned minimum: min(a, b) for each 8-bit lane.
__device__ __forceinline__ uint vmin4(uint a, uint b)
{
    uint r = 0;
#if CV_CUDEV_ARCH >= 300
    // sm_30+: single hardware SIMD min.
    asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    // sm_2x: four scalar video min instructions, one per byte.
    asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // sm_1x emulation: byte-wise select via the reversed vcmpge4 mask.
    uint s;
    s = vcmpge4(b, a); // mask = 0xff where b >= a (per byte)
    r = a & s;         // select a where a <= b
    s = b & ~s;        // select b where b < a
    r = r | s;         // combine byte selections
#endif
    return r; // byte-wise unsigned minimum
}
- //! @}
- }}
- #endif
|