FusionAndTailor.cu 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565
  1. #include "FusionAndTailor.cuh"
  2. #include <builtin_types.h>
  // Classic conversion approach: shared constants and aliases.
  // Mask that keeps the low 10 bits of a packed colour component.
  #define COLOR_COMPONENT_MASK 0x3FF
  // Width in bits of one packed colour component.
  #define COLOR_COMPONENT_BIT_SIZE 10
  // Plain multiplication, kept as a macro so call sites read like a matrix product.
  #define MUL(x,y) ((x)*(y))
  using uint32 = unsigned int;
  using int32 = int;
  // 3x3 colour-space matrix in row-major order, read by YUV2RGB2 and filled by setColorSpace2.
  // Original author's note: the symbol is allocated on GPU 0 by default; no way was found to
  // place it on a chosen GPU, and it is currently not used by the active code paths.
  __constant__ float constHueColorSpaceMat2[9];
  // Converts one YUV triple to RGB using the matrix stored in constHueColorSpaceMat2.
  //
  //   yuvi  - three components: [0] luma, [1] Cb, [2] Cr. 512 is subtracted from both
  //           chroma components before the matrix is applied.
  //   red, green, blue - outputs; each receives one row of the matrix (rows 0, 1, 2)
  //           multiplied by the (luma, Cb - 512, Cr - 512) vector. No clamping is done.
  __device__ void YUV2RGB2(unsigned int* yuvi, float* red, float* green, float* blue)
  {
      // Centre the chroma components; luma is used as-is.
      const float y  = static_cast<float>(yuvi[0]);
      const float cb = static_cast<float>(static_cast<int32>(yuvi[1]) - 512.0f);
      const float cr = static_cast<float>(static_cast<int32>(yuvi[2]) - 512.0f);

      const float* m = constHueColorSpaceMat2;

      // One matrix row per output channel.
      *red   = (y * m[0]) + (cb * m[1]) + (cr * m[2]);
      *green = (y * m[3]) + (cb * m[4]) + (cr * m[5]);
      *blue  = (y * m[6]) + (cb * m[7]) + (cr * m[8]);
  }
  // Clamps x into [min_val, max_val] and returns the result converted to unsigned char.
  // The upper bound is tested first, so it wins if the bounds are inverted.
  __device__ unsigned char clip_v(int x, int min_val, int max_val) {
      const int bounded = (x > max_val) ? max_val
                        : (x < min_val) ? min_val
                        : x;
      return static_cast<unsigned char>(bounded);
  }
  39. namespace YUVTailorAndBlender {
  // Launches TurnNV12ToI420ClassicsKernal on the default stream with one thread per
  // luma pixel (32x32 thread blocks, grid rounded up to cover nWidth x nHeight).
  //
  //   pNV12   - source buffer handed to the kernel (row stride nPitch).
  //   pI420   - destination buffer handed to the kernel.
  //   nWidth, nHeight - image size in pixels.
  // The launch is asynchronous and no launch error is reported to the caller.
  __host__ void TurnNV12ToI420Classics(unsigned char* pNV12, int nWidth, int nHeight,
      unsigned char* pI420, int nPitch)
  {
      const dim3 threads(32, 32);
      const dim3 blocks((nWidth + threads.x - 1) / threads.x,
                        (nHeight + threads.y - 1) / threads.y);

      TurnNV12ToI420ClassicsKernal<<<blocks, threads>>>(pNV12, nWidth, nHeight, pI420, nPitch);
  }
  // "Dissociative" NV12 -> I420 conversion entry point.
  //
  // Currently this only launches CopyPlaneKernal, which copies the nWidth x nHeight
  // region byte-for-byte from pNV12 to pI420 using nPitch as the row stride on both
  // sides. The chroma de-interleave step (TurnNV12ToI420DissociativeKernalUV) was
  // disabled by the original author, so no chroma is written here.
  //
  // The launch is asynchronous on the default stream; launch errors are not reported.
  __host__ void TurnNV12ToI420Dissociative(unsigned char* pNV12, int nWidth, int nHeight, unsigned char* pI420, int nPitch)
  {
      const dim3 Block(32, 32);
      const dim3 Grid((nWidth + Block.x - 1) / Block.x, (nHeight + Block.y - 1) / Block.y);
      CopyPlaneKernal<<<Grid, Block>>>(pNV12, nWidth, nHeight, pI420, nPitch);

      // Disabled chroma pass, kept for reference. It would use 32x32 blocks over a
      // (nWidth/2) x (nHeight/2) grid:
      //   TurnNV12ToI420DissociativeKernalUV<<<GridUV, BlockUV>>>(pNV12, nWidth/2, nHeight/2, pI420, nPitch/2);
      // NOTE(review): the pointers above are not offset to the chroma planes - confirm
      // the intended arguments before re-enabling.
  }
  // Launches TurnCUDAFormatToI420Kernal to convert separate luma / interleaved-chroma
  // planes into the packed buffer dstImage.
  //
  //   dataY, pitchY   - luma plane and its row stride in bytes.
  //   dataUV, pitchUV - interleaved chroma plane and its row stride in bytes.
  //   dstImage        - destination buffer.
  //   width, height   - image size in pixels.
  //   pStream         - optional stream; when null the default stream is used.
  //
  // Each thread handles two horizontally adjacent pixels, so the grid's x extent is
  // computed over 2 * block.x pixels per block.
  __host__ void TurnCUDAFormatToI420(unsigned char* dataY, unsigned char* dataUV, size_t pitchY, size_t pitchUV, unsigned char* dstImage, int width, int height, CUstream* pStream)
  {
      const dim3 block(32, 16, 1);
      const unsigned int pixelsPerBlockX = 2 * block.x;
      const dim3 grid((width + (pixelsPerBlockX - 1)) / pixelsPerBlockX,
                      (height + (block.y - 1)) / block.y,
                      1);

      // A null stream handle selects the default stream, exactly like a launch
      // without an explicit stream argument.
      CUstream stream = (pStream == nullptr) ? nullptr : *pStream;

      TurnCUDAFormatToI420Kernal<<<grid, block, 0, stream>>>(dataY, dataUV, pitchY, pitchUV, dstImage, width, height);
  }
  // Copies an nCopyWidth x nCopyHeight byte region from pSrc (starting at its origin)
  // into pDst at offset (nCopyStartX, nCopyStartY) by launching _CopyPlaneKernal.
  //
  //   pSrc, nSrcPitch - source plane and row stride in bytes.
  //   pDst, nDstPitch - destination plane and row stride in bytes.
  //   nSrcWidth/Height, nDstWidth/Height - forwarded to the kernel unchanged.
  //   nCopyWidth, nCopyHeight - size of the region to copy.
  //   nCopyStartX, nCopyStartY - top-left corner of the region inside pDst.
  //   Type    - currently unused; the kernel is always called with nChannel = 1.
  //   pStream - optional stream; when null the default stream is used.
  //
  // Changes from the previous version:
  //   * pStream is now honoured (it used to be ignored, so the copy always ran on the
  //     default stream, unlike TurnCUDAFormatToI420).
  //   * The grid covers ceil(nCopyWidth / 2) byte pairs, so the last column of an
  //     odd-width region is no longer skipped when nCopyWidth / 2 is a multiple of
  //     the block width.
  //   * An empty region returns immediately instead of launching a zero-sized grid.
  // NOTE(review): for odd nCopyWidth, whether the last thread of a row also touches
  // byte nCopyWidth depends on _CopyPlaneKernal's own bounds check.
  __host__ void CopyPlane(unsigned char* pSrc, int nSrcWidth, int nSrcHeight, int nSrcPitch,
      unsigned char* pDst, int nDstWidth, int nDstHeight, int nDstPitch,
      int nCopyWidth, int nCopyHeight,
      int nCopyStartX, int nCopyStartY,
      unsigned int Type,
      CUstream* pStream)
  {
      (void)Type;

      if (nCopyWidth <= 0 || nCopyHeight <= 0)
      {
          return; // nothing to copy; a zero-dimension grid is an invalid launch configuration
      }

      const dim3 block(BLOCK_SIZE, SHARED_MEMORY_SIZE_Y, 1);

      // Each thread copies two horizontally adjacent bytes.
      const unsigned int nPairsPerRow = (static_cast<unsigned int>(nCopyWidth) + 1u) / 2u;
      const dim3 grid((nPairsPerRow + (block.x - 1)) / block.x,
                      (nCopyHeight + (block.y - 1)) / block.y,
                      1);

      CUstream stream = (pStream != nullptr) ? *pStream : nullptr;

      _CopyPlaneKernal<<<grid, block, 0, stream>>>(
          pSrc, nSrcWidth, nSrcHeight, nSrcPitch,
          pDst, nDstWidth, nDstHeight, nDstPitch,
          nCopyWidth, nCopyHeight,
          nCopyStartX, nCopyStartY,
          1
      );
  }
  // Launches CropI420Kernel to cut an nCropWidth x nCropHeight window, whose top-left
  // corner is (nCropStartX, nCropStartY) in the source, out of three separate Y/U/V
  // planes and into three destination planes.
  //
  //   pSrcY/U/V + pitches - source planes and their row strides.
  //   pDstY/U/V + pitches - destination planes and their row strides.
  //
  // One thread is launched per luma pixel of the crop window (BLOCK_SIZE x BLOCK_SIZE
  // thread blocks). The crop size is also passed as the kernel's destination size.
  // The launch is asynchronous on the default stream; no synchronisation is done here.
  __host__ void CropI420(unsigned char* pSrcY, int nSrcYPitch,
      unsigned char* pSrcU, int nSrcUPitch,
      unsigned char* pSrcV, int nSrcVPitch,
      unsigned char* pDstY, int nDstYPitch,
      unsigned char* pDstU, int nDstUPitch,
      unsigned char* pDstV, int nDstVPitch,
      int nCropWidth, int nCropHeight,
      int nCropStartX, int nCropStartY)
  {
      const dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
      const dim3 blocks((nCropWidth + threads.x - 1) / threads.x,
                        (nCropHeight + threads.y - 1) / threads.y);

      // The destination image is exactly the crop window.
      const int nDstWidth = nCropWidth;
      const int nDstHeight = nCropHeight;

      CropI420Kernel<<<blocks, threads>>>(
          pSrcY, nSrcYPitch, pSrcU, nSrcUPitch, pSrcV, nSrcVPitch,
          pDstY, nDstYPitch, pDstU, nDstUPitch, pDstV, nDstVPitch,
          nCropWidth, nCropHeight,
          nCropStartX, nCropStartY,
          nDstWidth, nDstHeight);
  }
  // Launches GradientBlenderYUVKernal to blend two planar YUV images ("relate" and
  // "target") into a destination image using per-pixel float masks.
  //
  //   pRelateY/U/V, pTargetY/U/V + strides - the two input images.
  //   width, height - size of the blended region; one thread per luma pixel.
  //   pRelateMask, pTargetMask, MaskStride - blending masks, forwarded to the kernel.
  //   pDstY/U/V + strides - output planes.
  //   DstWidth, DstHeight - accepted for interface compatibility; not used.
  //
  // The launch is asynchronous on the default stream.
  __host__ void GradientBlenderYUV(unsigned char* pRelateY, int YRelateStride,
      unsigned char* pRelateU, int URelateStride,
      unsigned char* pRelateV, int VRelateStride,
      unsigned char* pTargetY, int YTargetStride,
      unsigned char* pTargetU, int UTargetStride,
      unsigned char* pTargetV, int VTargetStride,
      int width, int height,
      float* pRelateMask, float* pTargetMask, int MaskStride,
      unsigned char* pDstY, int DstYStride,
      unsigned char* pDstU, int DstUStride,
      unsigned char* pDstV, int DstVStride,
      int DstWidth, int DstHeight)
  {
      const dim3 threads(32, 32);
      const dim3 blocks((width + threads.x - 1) / threads.x,
                        (height + threads.y - 1) / threads.y);

      GradientBlenderYUVKernal<<<blocks, threads>>>(
          pRelateY, YRelateStride, pRelateU, URelateStride, pRelateV, VRelateStride,
          pTargetY, YTargetStride, pTargetU, UTargetStride, pTargetV, VTargetStride,
          width, height,
          pRelateMask, pTargetMask, MaskStride,
          pDstY, DstYStride, pDstU, DstUStride, pDstV, DstVStride);
  }
  // The most straightforward conversion: one thread per luma pixel.
  //
  // Every in-range thread copies its luma byte from pNV12 (row stride nPitch) to
  // pI420 (row stride nWidth). Threads at even (x, y) additionally copy the
  // interleaved byte pair found after the luma plane in pNV12 into the two separate
  // chroma planes that follow the luma plane in pI420.
  // NOTE(review): the chroma index arithmetic looks like it assumes even nWidth and
  // nHeight - confirm with callers.
  void __global__ TurnNV12ToI420ClassicsKernal(unsigned char* pNV12, int nWidth, int nHeight,
      unsigned char* pI420, int nPitch)
  {
      const int col = threadIdx.x + blockIdx.x * blockDim.x;
      const int row = threadIdx.y + blockIdx.y * blockDim.y;

      if (col < nWidth && row < nHeight)
      {
          // Luma: same pixel, different row stride.
          pI420[row * nWidth + col] = pNV12[row * nPitch + col];

          // Chroma: only the top-left pixel of each 2x2 quad moves a U/V pair.
          const bool isQuadOrigin = (((row | col) & 1) == 0);
          if (isQuadOrigin)
          {
              const int srcUV = nHeight * nPitch + row * nPitch / 2 + col;
              const int dstOffsetInPlane = row * nWidth / 4 + col / 2;
              const int dstU = nHeight * nWidth + dstOffsetInPlane;
              const int dstV = nHeight * nWidth * 5 / 4 + dstOffsetInPlane;

              pI420[dstU] = pNV12[srcUV];
              pI420[dstV] = pNV12[srcUV + 1];
          }
      }
  }
  // Plane copy used by the "dissociative" conversion path.
  //
  // One thread per byte: thread (x, y) copies pNV12[y * nPitch + x] to the same offset
  // in pI420 when x < nWidth and y < nHeight. Source and destination therefore share
  // the row stride nPitch.
  //
  // Changes from the previous version:
  //   * The __syncthreads() that sat inside the bounds-check branch is removed. A
  //     barrier that only part of a block reaches is undefined behaviour, and blocks
  //     on the right/bottom image edge hit exactly that case. No shared memory was
  //     actually used, so the barrier had no purpose.
  //   * The unused __shared__ staging array is removed.
  //   * The byte offset is computed in size_t to avoid 32-bit overflow on large planes.
  void __global__ CopyPlaneKernal(unsigned char* pNV12, int nWidth, int nHeight,
      unsigned char* pI420, int nPitch)
  {
      const int nInputX = threadIdx.x + blockIdx.x * blockDim.x;
      const int nInputY = threadIdx.y + blockIdx.y * blockDim.y;

      if (nInputX >= nWidth || nInputY >= nHeight)
      {
          return;
      }

      const size_t nPos = static_cast<size_t>(nInputY) * nPitch + nInputX;
      pI420[nPos] = pNV12[nPos];
  }
  // Copies an nCopyWidth x nCopyHeight byte region from the origin of pSrc into pDst
  // at offset (nCopyStartX, nCopyStartY). Each thread moves two horizontally adjacent
  // bytes, so thread (tx, ty) handles columns 2*tx and 2*tx + 1 of row ty.
  //
  //   pSrc, nSrcPitch - source plane and row stride in bytes.
  //   pDst, nDstPitch - destination plane and row stride in bytes.
  //   nSrcWidth, nSrcHeight, nDstWidth, nDstHeight, nChannel - not used by the kernel.
  //
  // Change from the previous version: the second byte of the pair is only copied when
  // it lies inside the region. Previously, for odd nCopyWidth, the last thread of
  // each row read and wrote one byte past the requested width.
  // Earlier shared-memory experiments that were left as commented-out code have been
  // dropped; the kernel has always copied global-to-global directly.
  void __global__ _CopyPlaneKernal(
      unsigned char* pSrc, int nSrcWidth, int nSrcHeight, int nSrcPitch,
      unsigned char* pDst, int nDstWidth, int nDstHeight, int nDstPitch,
      int nCopyWidth, int nCopyHeight,
      int nCopyStartX, int nCopyStartY,
      int nChannel)
  {
      const int nInputX = threadIdx.x + blockIdx.x * blockDim.x;
      const int nInputY = threadIdx.y + blockIdx.y * blockDim.y;
      const int nFirstCol = nInputX * 2;

      if (nFirstCol >= nCopyWidth || nInputY >= nCopyHeight)
      {
          return;
      }

      // 64-bit offsets so large planes cannot overflow the index arithmetic.
      const long long nSrcOffset = static_cast<long long>(nInputY) * nSrcPitch + nFirstCol;
      const long long nDstOffset = static_cast<long long>(nCopyStartY + nInputY) * nDstPitch
                                 + nCopyStartX + nFirstCol;

      const unsigned char* pSrcPair = pSrc + nSrcOffset;
      unsigned char* pDstPair = pDst + nDstOffset;

      pDstPair[0] = pSrcPair[0];
      if (nFirstCol + 1 < nCopyWidth)
      {
          pDstPair[1] = pSrcPair[1];
      }
  }
  // Chroma de-interleave step of the "dissociative" conversion.
  //
  // One thread per chroma sample pair. Thread (x, y) reads the two interleaved bytes
  // at pNV12UV[y * nPitch + 2x] and [.. + 1] and writes the first to
  // pI420UV[y * nPitch + x] and the second to the same position offset by
  // nPitch * nHeight (the second output plane).
  //
  //   nWidth, nHeight - size in chroma samples.
  //   nPitch - row stride used for BOTH the interleaved input and each output plane.
  //   NOTE(review): the only (disabled) call site passes nPitch/2 - confirm which
  //   stride the input really needs before re-enabling this kernel.
  //
  // Changes from the previous version:
  //   * The shared-memory staging buffer and its __syncthreads() are removed. Each
  //     thread only ever read back the two slots it had written itself, so the
  //     staging had no effect on the result. The barrier came after an early return
  //     taken by out-of-range threads, which is undefined behaviour for blocks on
  //     the image edge.
  //   * Dropping the buffer also removes a potential overflow when the block holds
  //     more than SHARED_MEMORY_SIZE * SHARED_MEMORY_SIZE threads.
  void __global__ TurnNV12ToI420DissociativeKernalUV(unsigned char* pNV12UV, int nWidth, int nHeight, unsigned char* pI420UV, int nPitch)
  {
      const int nInputX = threadIdx.x + blockIdx.x * blockDim.x;
      const int nInputY = threadIdx.y + blockIdx.y * blockDim.y;

      // Bounds check.
      if (nInputX >= nWidth || nInputY >= nHeight)
      {
          return;
      }

      // Position of the interleaved pair in the input.
      const int nLoadPosU = nInputX * 2 + nInputY * nPitch;
      const int nLoadPosV = nLoadPosU + 1;

      // Positions of the separated samples in the output.
      const int nStorePosU = nInputX + nInputY * nPitch;
      const int nStorePosV = nStorePosU + nPitch * nHeight;

      // Read both samples before writing, as the staged version did.
      const unsigned char u = pNV12UV[nLoadPosU];
      const unsigned char v = pNV12UV[nLoadPosV];

      pI420UV[nStorePosU] = u;
      pI420UV[nStorePosV] = v;
  }
  // Builds the hue-adjusted YUV -> RGB matrix and uploads it to the __constant__
  // array constHueColorSpaceMat2.
  //
  //   hue - hue rotation angle passed to sin()/cos() (radians).
  //
  // Returns the status of the cudaMemcpyToSymbol upload; on failure a message is also
  // written to stderr.
  //
  // The matrix is stored row-major: elements [0..2], [3..5] and [6..8] are the rows
  // that YUV2RGB2 uses for red, green and blue. The active coefficients are the set
  // labelled "CCIR 709" by the original author. The disabled "CCIR 601" alternative
  // used 1.1644 for the luma column and 1.5960 / 0.3918 / 0.8130 / 2.0172 for chroma.
  //
  // Changes from the previous version:
  //   * Removed a debug read-back (cudaMemcpyFromSymbol into a scratch array that was
  //     never inspected and whose return code was ignored).
  //   * hueCSC[3] is now the float literal 1.0f instead of a double literal.
  cudaError_t setColorSpace2( float hue)
  {
      const float hueSin = sin(hue);
      const float hueCos = cos(hue);

      const float hueCSC[9] = {
          1.0f, hueSin * 1.57480f,                            hueCos * 1.57480f,
          1.0f, (hueCos * -0.18732f) - (hueSin * 0.46812f),   (hueSin * 0.18732f) - (hueCos * 0.46812f),
          1.0f, hueCos * 1.85560f,                            hueSin * -1.85560f
      };

      const cudaError_t cudaStatus =
          cudaMemcpyToSymbol(constHueColorSpaceMat2, hueCSC, sizeof(hueCSC), 0, cudaMemcpyHostToDevice);

      // Kept from the original so the function still blocks until the device is idle;
      // its result was never part of the return value.
      (void)cudaDeviceSynchronize();

      if (cudaStatus != cudaSuccess) {
          fprintf(stderr, "cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(cudaStatus));
      }
      return cudaStatus;
  }
  // Converts separate luma / interleaved-chroma planes into a packed planar buffer.
  //
  // Each thread handles two horizontally adjacent luma pixels (x, x + 1) of row y,
  // where x = 2 * (global thread x). Output layout in I420:
  //   [0, width*height)            luma, row stride = width
  //   [width*height, ...)          first chroma plane
  //   [width*height*5/4, ...)      second chroma plane
  // On even rows the thread also writes one sample to each chroma plane, taken from
  // dataUV[(y/2) * pitchUV + x] and [.. + x + 1].
  //
  //   dataY, pitchY   - luma plane and row stride in bytes.
  //   dataUV, pitchUV - interleaved chroma plane and row stride in bytes.
  //   NOTE(review): the chroma plane offsets and the y*width/4 row term assume even
  //   width and height - confirm with callers.
  //
  // Changes from the previous version:
  //   * Bounds check is now x >= width || y >= height. It used '>', so threads with
  //     x == width or y == height accessed memory outside the image.
  //   * The second luma pixel is only copied when x + 1 < width.
  //   * The intermediate 10-bit packing was removed. Samples were shifted left by 2,
  //     packed, unpacked, rounded and divided by 4 again, which returns exactly the
  //     original 8-bit value for luma. For chroma, (4c + 4c + 1) / 2 rounds to
  //     4c + 1, and dividing by 4 then truncating gives c. The vertical chroma
  //     interpolation on odd rows never reached the output because chroma is only
  //     written on even rows. Output for in-range pixels is unchanged.
  //   * Removed the unused red/green/blue temporaries and the disabled RGB path.
  void __global__ TurnCUDAFormatToI420Kernal(unsigned char* dataY, unsigned char* dataUV, size_t pitchY, size_t pitchUV, unsigned char* I420, int width, int height)
  {
      // Two pixels per thread along x.
      const int x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
      const int y = blockIdx.y * blockDim.y + threadIdx.y;

      if (x >= width || y >= height)
      {
          return;
      }

      // Luma: straight copy with a different row stride.
      const unsigned char* pSrcLuma = dataY + y * pitchY + x;
      unsigned char* pDstLuma = I420 + y * width + x;

      pDstLuma[0] = pSrcLuma[0];
      if (x + 1 < width)
      {
          pDstLuma[1] = pSrcLuma[1];
      }

      // Chroma: one sample per plane for every 2x2 luma quad, written by even rows.
      if (!(y & 0x01))
      {
          const unsigned char* pSrcChroma = dataUV + (y >> 1) * pitchUV + x;
          const int nChromaOffset = y * width / 4 + x / 2;

          I420[width * height + nChromaOffset] = pSrcChroma[0];
          I420[width * height * 5 / 4 + nChromaOffset] = pSrcChroma[1];
      }
  }
  // Blends two planar YUV images with per-pixel float weights. One thread per luma
  // pixel (x, y) inside width x height.
  //
  // For every pixel:
  //   dst = (wRelate * relate + wTarget * target) / (wRelate + wTarget)
  // with wRelate = pRelateMask[y * width + x] and wTarget = pTargetMask[y * width + x].
  // Threads at even (x, y) also blend the chroma sample at (x/2, y/2) of the U and V
  // planes using the same two weights. Results are truncated to unsigned char.
  //
  // NOTE(review): MaskStride is accepted but not used; the masks are indexed with
  // 'width' as their row stride. Confirm that callers always pass tightly packed masks.
  // NOTE(review): there is no guard for wRelate + wTarget == 0.
  //
  // Change from the previous version: the chroma results were computed as
  // '(unsigned char) sum / weight'. The cast binds tighter than the division, so the
  // weighted sum was truncated to 8 bits BEFORE being normalised. The quotient is now
  // computed in float and converted afterwards, matching the luma path.
  void __global__ GradientBlenderYUVKernal(unsigned char* pRelateY, int YRelateStride,
      unsigned char* pRelateU, int URelateStride,
      unsigned char* pRelateV, int VRelateStride,
      unsigned char* pTargetY, int YTargetStride,
      unsigned char* pTargetU, int UTargetStride,
      unsigned char* pTargetV, int VTargetStride,
      int width, int height,
      float* pRelateMask, float* pTargetMask, int MaskStride,
      unsigned char* pDstY, int DstYStride,
      unsigned char* pDstU, int DstUStride,
      unsigned char* pDstV, int DstVStride)
  {
      const int x = blockIdx.x * blockDim.x + threadIdx.x;
      const int y = blockIdx.y * blockDim.y + threadIdx.y;
      if (x >= width || y >= height)
      {
          return;
      }

      const float dbWeightRelate = pRelateMask[y * width + x];
      const float dbWeightTarget = pTargetMask[y * width + x];
      const float dbWeight = dbWeightRelate + dbWeightTarget;

      // Luma.
      const float dbSumY = dbWeightRelate * pRelateY[y * YRelateStride + x]
                         + dbWeightTarget * pTargetY[y * YTargetStride + x];
      pDstY[y * DstYStride + x] = static_cast<unsigned char>(dbSumY / dbWeight);

      // Chroma: one sample per 2x2 luma quad, handled by the quad's top-left thread.
      if (!(x & 0x01) && !(y & 0x01))
      {
          const unsigned int X = x >> 1;
          const unsigned int Y = y >> 1;

          const float dbSumU = pRelateU[Y * URelateStride + X] * dbWeightRelate
                             + dbWeightTarget * pTargetU[Y * UTargetStride + X];
          pDstU[Y * DstUStride + X] = static_cast<unsigned char>(dbSumU / dbWeight);

          const float dbSumV = pRelateV[Y * VRelateStride + X] * dbWeightRelate
                             + dbWeightTarget * pTargetV[Y * VTargetStride + X];
          pDstV[Y * DstVStride + X] = static_cast<unsigned char>(dbSumV / dbWeight);
      }
  }
  // Copies a crop window out of three separate Y/U/V planes. One thread per luma
  // pixel (x, y) of the window.
  //
  //   pRelateY/U/V + strides - SOURCE planes (the host wrapper passes pSrc* here).
  //   pTargetY/U/V + strides - DESTINATION planes (the host wrapper passes pDst* here).
  //   nCropWidth, nCropHeight - window size in luma pixels.
  //   nCropX, nCropY - top-left corner of the window inside the source.
  //   DstWidth, DstHeight - not used by the kernel.
  //
  // Luma pixel (x, y) of the destination comes from (x + nCropX, y + nCropY) in the
  // source. Threads at even (x, y) also copy the chroma sample for their 2x2 quad,
  // from ((x + nCropX)/2, (y + nCropY)/2) in the source to (x/2, y/2) in the
  // destination.
  // NOTE(review): chroma alignment presumably requires even nCropX / nCropY - confirm.
  //
  // Changes from the previous version:
  //   * Bounds check is now x >= nCropWidth || y >= nCropHeight. It used '>', letting
  //     one extra column and row through.
  //   * The chroma destination index was written 'y * stride >> 1 + x >> 1'. Because
  //     '+' binds tighter than '>>', that parsed as ((y * stride) >> (1 + x)) >> 1.
  //     It is now (y >> 1) * stride + (x >> 1).
  //   * The shared-memory staging and __syncthreads() are removed. Every thread only
  //     read back the slots it had written itself, so the staging had no effect, and
  //     the barrier followed an early return taken by out-of-range threads, which is
  //     undefined behaviour.
  void __global__ CropI420Kernel(unsigned char* pRelateY, int YRelateStride,
      unsigned char* pRelateU, int URelateStride,
      unsigned char* pRelateV, int VRelateStride,
      unsigned char* pTargetY, int YTargetStride,
      unsigned char* pTargetU, int UTargetStride,
      unsigned char* pTargetV, int VTargetStride,
      int nCropWidth, int nCropHeight,
      int nCropX, int nCropY,
      int DstWidth, int DstHeight)
  {
      const int x = threadIdx.x + blockIdx.x * blockDim.x;
      const int y = threadIdx.y + blockIdx.y * blockDim.y;
      if (x >= nCropWidth || y >= nCropHeight)
      {
          return;
      }

      // Y
      pTargetY[y * YTargetStride + x] = pRelateY[(y + nCropY) * YRelateStride + x + nCropX];

      if (!(x & 0x01) && !(y & 0x01))
      {
          const int nSrcChromaRow = (y + nCropY) / 2;
          const int nSrcChromaCol = (x + nCropX) / 2;
          const int nDstChromaRow = y >> 1;
          const int nDstChromaCol = x >> 1;

          // U
          pTargetU[nDstChromaRow * UTargetStride + nDstChromaCol] =
              pRelateU[nSrcChromaRow * URelateStride + nSrcChromaCol];
          // V
          pTargetV[nDstChromaRow * VTargetStride + nDstChromaCol] =
              pRelateV[nSrcChromaRow * VRelateStride + nSrcChromaCol];
      }
  }
  446. }