123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565 |
- #include "FusionAndTailor.cuh"
- #include <builtin_types.h>
// Classic conversion path.

// Integer aliases used by the colour-conversion helpers in this file.
using uint32 = unsigned int;
using int32 = int;

// Width in bits of one packed colour component, and the mask that selects it.
#define COLOR_COMPONENT_BIT_SIZE 10
#define COLOR_COMPONENT_MASK 0x3FF

// Fully parenthesised product of two expressions.
#define MUL(x,y) ((x)*(y))

// 3x3 colour-space matrix stored as 9 floats: entries 0-2 produce red,
// 3-5 green and 6-8 blue (see YUV2RGB2). It is filled by setColorSpace2.
// Original author's note: the symbol is allocated on device 0 by default, no
// way of placing it on a chosen device was found, and it is currently unused.
__constant__ float constHueColorSpaceMat2[9];
// Converts one YUV triple to RGB using the matrix in constHueColorSpaceMat2.
//   yuvi  - three components: [0] luma, [1] Cb, [2] Cr
//   red, green, blue - receive the three results as unclamped floats
// Both chroma components are re-centred by subtracting 512 before the matrix
// is applied; luma is used unchanged.
__device__ void YUV2RGB2(unsigned int* yuvi, float* red, float* green, float* blue)
{
    const float y  = static_cast<float>(yuvi[0]);
    const float cb = static_cast<float>(static_cast<int32>(yuvi[1])) - 512.0f;
    const float cr = static_cast<float>(static_cast<int32>(yuvi[2])) - 512.0f;

    // One matrix row per output channel, columns ordered (luma, Cb, Cr).
    const float* m = constHueColorSpaceMat2;
    *red   = y * m[0] + cb * m[1] + cr * m[2];
    *green = y * m[3] + cb * m[4] + cr * m[5];
    *blue  = y * m[6] + cb * m[7] + cr * m[8];
}
// Clamps x to [min_val, max_val] and returns the result narrowed to
// unsigned char. The upper bound is tested first, so it wins if the bounds
// are given in the wrong order.
__device__ unsigned char clip_v(int x, int min_val, int max_val) {
    const int clamped = (x > max_val) ? max_val
                      : (x < min_val) ? min_val
                                      : x;
    return clamped;
}
- namespace YUVTailorAndBlender {
// Host launcher for TurnNV12ToI420ClassicsKernal: one thread per luma pixel.
//   pNV12   - source NV12 buffer passed to the kernel
//   nWidth  - frame width in pixels
//   nHeight - frame height in pixels
//   pI420   - destination I420 buffer passed to the kernel
//   nPitch  - row pitch of the NV12 source
// The kernel is launched without an explicit stream and this function neither
// synchronises nor checks for launch errors.
__host__ void TurnNV12ToI420Classics(unsigned char* pNV12, int nWidth, int nHeight,
                                     unsigned char* pI420, int nPitch)
{
    // 32x32 threads per block; round the grid up so partial tiles are covered.
    const dim3 threadsPerBlock(32, 32);
    const unsigned int tilesX = (nWidth + threadsPerBlock.x - 1) / threadsPerBlock.x;
    const unsigned int tilesY = (nHeight + threadsPerBlock.y - 1) / threadsPerBlock.y;
    const dim3 blocksPerGrid(tilesX, tilesY);

    TurnNV12ToI420ClassicsKernal<<<blocksPerGrid, threadsPerBlock>>>(pNV12, nWidth, nHeight, pI420, nPitch);
}
// Host launcher for the "split" NV12 -> I420 path.
//   pNV12   - source NV12 buffer passed to the kernel
//   nWidth  - frame width in pixels
//   nHeight - frame height in pixels
//   pI420   - destination buffer passed to the kernel
//   nPitch  - row pitch passed to the kernel
// Only the plane-copy step (CopyPlaneKernal, one thread per pixel of an
// nWidth x nHeight region) is launched. The launch uses no explicit stream and
// is neither synchronised nor error-checked here.
//
// NOTE(review): the chroma de-interleave step was already disabled in the
// original code, so this function does not produce the U and V planes. The
// disabled call was:
//   TurnNV12ToI420DissociativeKernalUV<<<GridUV, BlockUV>>>(pNV12, nWidth/2, nHeight/2, pI420, nPitch/2);
// with a 32x32 block and a grid rounded up over (nWidth/2, nHeight/2). The
// launch-configuration locals that existed only for that call were removed.
__host__ void TurnNV12ToI420Dissociative(unsigned char* pNV12, int nWidth, int nHeight, unsigned char* pI420, int nPitch)
{
    const dim3 Block(32, 32);
    // Round up so partially filled tiles at the right/bottom edges are covered.
    const dim3 Grid((nWidth + Block.x - 1) / Block.x, (nHeight + Block.y - 1) / Block.y);
    CopyPlaneKernal<<<Grid, Block>>>(pNV12, nWidth, nHeight, pI420, nPitch);
}
// Host launcher for TurnCUDAFormatToI420Kernal.
//   dataY, pitchY   - luma plane and its row pitch
//   dataUV, pitchUV - chroma plane and its row pitch
//   dstImage        - destination I420 buffer
//   width, height   - frame size in pixels
//   pStream         - optional stream; when null the launch uses the default stream
// The grid's x extent is sized for two pixels per thread. The launch is not
// synchronised or error-checked here.
__host__ void TurnCUDAFormatToI420(unsigned char* dataY, unsigned char* dataUV, size_t pitchY, size_t pitchUV, unsigned char* dstImage, int width, int height, CUstream* pStream)
{
    const dim3 threads(32, 16, 1);
    const unsigned int pixelsPerBlockX = 2 * threads.x;
    const dim3 blocks((width + (pixelsPerBlockX - 1)) / pixelsPerBlockX,
                      (height + (threads.y - 1)) / threads.y,
                      1);

    // A null stream handle selects the default stream, exactly as a launch
    // without a stream argument does.
    CUstream launchStream = (pStream == nullptr) ? nullptr : *pStream;

    TurnCUDAFormatToI420Kernal<<<blocks, threads, 0, launchStream>>>(dataY, dataUV, pitchY, pitchUV, dstImage, width, height);
}
// Host launcher for _CopyPlaneKernal: copies an nCopyWidth x nCopyHeight byte
// region from the start of pSrc into pDst at (nCopyStartX, nCopyStartY).
//   pSrc, nSrcWidth, nSrcHeight, nSrcPitch - source plane description
//   pDst, nDstWidth, nDstHeight, nDstPitch - destination plane description
//   nCopyWidth, nCopyHeight                - size of the copied region
//   nCopyStartX, nCopyStartY               - top-left corner inside the destination
//   Type                                   - not used by this function
//   pStream                                - optional stream; null selects the default stream
// The kernel always receives 1 as its channel argument. The launch is not
// synchronised or error-checked here.
//
// Changes from the previous version:
//   * the grid is sized from ceil(nCopyWidth / 2) byte pairs, so the last
//     column of an odd-width region always gets a thread;
//   * pStream is honoured instead of being silently ignored, matching
//     TurnCUDAFormatToI420;
//   * an empty region returns early instead of issuing a zero-sized launch.
__host__ void CopyPlane(unsigned char* pSrc, int nSrcWidth, int nSrcHeight, int nSrcPitch,
                        unsigned char* pDst, int nDstWidth, int nDstHeight, int nDstPitch,
                        int nCopyWidth, int nCopyHeight,
                        int nCopyStartX, int nCopyStartY,
                        unsigned int Type,
                        CUstream* pStream)
{
    (void)Type;

    // Nothing to copy; a grid with a zero dimension is an invalid configuration.
    if (nCopyWidth <= 0 || nCopyHeight <= 0)
        return;

    const dim3 block(BLOCK_SIZE, SHARED_MEMORY_SIZE_Y, 1);

    // Each thread copies two horizontally adjacent bytes.
    const unsigned int pairsPerRow = (static_cast<unsigned int>(nCopyWidth) + 1u) / 2u;
    const dim3 grid((pairsPerRow + (block.x - 1)) / block.x,
                    (nCopyHeight + (block.y - 1)) / block.y,
                    1);

    CUstream launchStream = (pStream != nullptr) ? *pStream : nullptr;

    _CopyPlaneKernal<<<grid, block, 0, launchStream>>>(
        pSrc, nSrcWidth, nSrcHeight, nSrcPitch,
        pDst, nDstWidth, nDstHeight, nDstPitch,
        nCopyWidth, nCopyHeight,
        nCopyStartX, nCopyStartY,
        1
    );
}
// Host launcher for CropI420Kernel: crops an nCropWidth x nCropHeight window
// starting at (nCropStartX, nCropStartY) out of a planar source.
//   pSrcY/U/V, nSrcY/U/VPitch - source planes and their row pitches
//   pDstY/U/V, nDstY/U/VPitch - destination planes and their row pitches
//   nCropWidth, nCropHeight   - crop size; also passed as the kernel's DstWidth/DstHeight
//   nCropStartX, nCropStartY  - top-left corner of the crop inside the source
// One thread is launched per pixel of the crop window. The launch uses no
// explicit stream and this function does not synchronise or check for errors.
__host__ void CropI420(unsigned char* pSrcY, int nSrcYPitch,
                       unsigned char* pSrcU, int nSrcUPitch,
                       unsigned char* pSrcV, int nSrcVPitch,
                       unsigned char* pDstY, int nDstYPitch,
                       unsigned char* pDstU, int nDstUPitch,
                       unsigned char* pDstV, int nDstVPitch,
                       int nCropWidth, int nCropHeight,
                       int nCropStartX, int nCropStartY)
{
    const dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

    // Ceil-divide so partial tiles on the right/bottom edges are covered.
    const unsigned int tilesX = (nCropWidth + threads.x - 1) / threads.x;
    const unsigned int tilesY = (nCropHeight + threads.y - 1) / threads.y;
    const dim3 blocks(tilesX, tilesY);

    CropI420Kernel<<<blocks, threads>>>(
        pSrcY, nSrcYPitch,
        pSrcU, nSrcUPitch,
        pSrcV, nSrcVPitch,
        pDstY, nDstYPitch,
        pDstU, nDstUPitch,
        pDstV, nDstVPitch,
        nCropWidth, nCropHeight,
        nCropStartX, nCropStartY,
        nCropWidth, nCropHeight);
}
// Host launcher for GradientBlenderYUVKernal: blends two planar YUV images
// ("relate" and "target") into a destination using two float weight masks.
//   pRelateY/U/V + strides - first input image
//   pTargetY/U/V + strides - second input image
//   width, height          - size of the blended region; one thread per pixel
//   pRelateMask, pTargetMask, MaskStride - weight masks and their stride
//   pDstY/U/V + strides    - output image
//   DstWidth, DstHeight    - accepted but not forwarded to the kernel
// The launch uses no explicit stream and is not synchronised or error-checked.
__host__ void GradientBlenderYUV(unsigned char* pRelateY, int YRelateStride,
                                 unsigned char* pRelateU, int URelateStride,
                                 unsigned char* pRelateV, int VRelateStride,
                                 unsigned char* pTargetY, int YTargetStride,
                                 unsigned char* pTargetU, int UTargetStride,
                                 unsigned char* pTargetV, int VTargetStride,
                                 int width, int height,
                                 float* pRelateMask, float* pTargetMask, int MaskStride,
                                 unsigned char* pDstY, int DstYStride,
                                 unsigned char* pDstU, int DstUStride,
                                 unsigned char* pDstV, int DstVStride,
                                 int DstWidth, int DstHeight)
{
    const dim3 threads(32, 32);

    // Round the grid up so the right and bottom edges are fully covered.
    const unsigned int tilesX = (width + threads.x - 1) / threads.x;
    const unsigned int tilesY = (height + threads.y - 1) / threads.y;
    const dim3 blocks(tilesX, tilesY);

    GradientBlenderYUVKernal<<<blocks, threads>>>(
        pRelateY, YRelateStride,
        pRelateU, URelateStride,
        pRelateV, VRelateStride,
        pTargetY, YTargetStride,
        pTargetU, UTargetStride,
        pTargetV, VTargetStride,
        width, height,
        pRelateMask, pTargetMask, MaskStride,
        pDstY, DstYStride,
        pDstU, DstUStride,
        pDstV, DstVStride);
}
// The most classic approach: every thread converts the pixel at its own
// coordinates.
//   pNV12   - source: luma rows of nPitch bytes, followed at offset
//             nHeight * nPitch by interleaved U,V bytes
//   nWidth, nHeight - frame size in pixels
//   pI420   - destination: tightly packed Y plane (row stride nWidth), then
//             the U plane at nHeight * nWidth and the V plane at
//             nHeight * nWidth * 5 / 4
//   nPitch  - row pitch of the source
// Expects a 2D launch covering at least nWidth x nHeight threads. Every
// in-range thread copies one luma byte; threads at even (x, y) additionally
// copy the U/V pair for their 2x2 block.
__global__ void TurnNV12ToI420ClassicsKernal(unsigned char* pNV12, int nWidth, int nHeight,
                                             unsigned char* pI420, int nPitch)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col < nWidth && row < nHeight)
    {
        // Luma: pitched source row -> packed destination row.
        pI420[row * nWidth + col] = pNV12[row * nPitch + col];

        const bool ownsChroma = ((row % 2) == 0) && ((col % 2) == 0);
        if (ownsChroma)
        {
            // Interleaved source pair and the matching planar destination offset.
            const int srcUV = nHeight * nPitch + row * nPitch / 2 + col;
            const int chromaOffset = row * nWidth / 4 + col / 2;

            pI420[nHeight * nWidth + chromaOffset] = pNV12[srcUV];
            pI420[nHeight * nWidth * 5 / 4 + chromaOffset] = pNV12[srcUV + 1];
        }
    }
}
// Split-style conversion, plane-copy step: copies an nWidth x nHeight region
// byte-for-byte from pNV12 to pI420.
//   pNV12   - source buffer
//   nWidth, nHeight - size of the region to copy
//   pI420   - destination buffer
//   nPitch  - row stride applied to BOTH source and destination
// Expects a 2D launch covering at least nWidth x nHeight threads; threads
// outside the region do nothing.
//
// The previous version declared a shared-memory tile that was never used and
// called __syncthreads() inside the bounds check, where threads of an edge
// block diverge. Both were removed: each thread only touches its own byte, so
// no barrier is needed.
//
// NOTE(review): the destination is addressed with nPitch rather than nWidth,
// so the output keeps the source's row stride - confirm that is what callers
// expect.
__global__ void CopyPlaneKernal(unsigned char* pNV12, int nWidth, int nHeight,
                                unsigned char* pI420, int nPitch)
{
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    const int row = threadIdx.y + blockIdx.y * blockDim.y;
    if (col >= nWidth || row >= nHeight)
        return;

    const unsigned int offset = col + row * nPitch;
    pI420[offset] = pNV12[offset];
}
// Copies an nCopyWidth x nCopyHeight byte region from the top-left corner of
// pSrc into pDst at (nCopyStartX, nCopyStartY). Each thread moves two
// horizontally adjacent bytes.
//   pSrc, nSrcPitch - source plane and its row stride; reads start at offset 0
//   pDst, nDstPitch - destination plane and its row stride
//   nCopyWidth, nCopyHeight  - size of the region in bytes / rows
//   nCopyStartX, nCopyStartY - top-left corner inside the destination
//   nSrcWidth, nSrcHeight, nDstWidth, nDstHeight, nChannel - not used
// Expects a 2D launch with at least ceil(nCopyWidth / 2) x nCopyHeight threads.
//
// Fix: the second byte of a pair is now copied only when it lies inside the
// region. Previously, with an odd nCopyWidth, the last thread of each row read
// and wrote one byte beyond the requested width.
__global__ void _CopyPlaneKernal(
    unsigned char* pSrc, int nSrcWidth, int nSrcHeight, int nSrcPitch,
    unsigned char* pDst, int nDstWidth, int nDstHeight, int nDstPitch,
    int nCopyWidth, int nCopyHeight,
    int nCopyStartX, int nCopyStartY,
    int nChannel)
{
    const int nInputX = threadIdx.x + blockIdx.x * blockDim.x;
    const int nInputY = threadIdx.y + blockIdx.y * blockDim.y;

    // First byte of the pair this thread is responsible for.
    const int nFirstCol = nInputX * 2;
    if (nFirstCol >= nCopyWidth || nInputY >= nCopyHeight)
        return;

    const unsigned int nSrcPos = nFirstCol + nInputY * nSrcPitch;
    unsigned char* pDstTarget = pDst + nCopyStartX + nFirstCol + (nCopyStartY + nInputY) * nDstPitch;

    pDstTarget[0] = pSrc[nSrcPos];
    if (nFirstCol + 1 < nCopyWidth)
        pDstTarget[1] = pSrc[nSrcPos + 1];
}
// Split-style conversion, chroma step: de-interleaves U,V byte pairs into two
// planes.
//   pNV12UV - source of interleaved U,V bytes
//   nWidth, nHeight - number of chroma samples per row / number of chroma rows
//   pI420UV - destination; U is written at col + row * nPitch and V at the
//             same offset plus nPitch * nHeight
//   nPitch  - row stride used for both the source and the destination
// Expects a 2D launch covering at least nWidth x nHeight threads.
//
// The previous version staged both bytes in shared memory and called
// __syncthreads() after the out-of-range threads had already returned. Each
// thread only ever read back the slots it had written itself, so the staging
// and the barrier are removed; this also drops the requirement that the block
// shape match SHARED_MEMORY_SIZE.
//
// NOTE(review): source and destination must not overlap. The same nPitch
// strides both the interleaved source rows and the planar output rows -
// confirm against the caller (the only call site in this file is disabled).
__global__ void TurnNV12ToI420DissociativeKernalUV(unsigned char* pNV12UV, int nWidth, int nHeight, unsigned char* pI420UV, int nPitch)
{
    const int nInputX = threadIdx.x + blockIdx.x * blockDim.x;
    const int nInputY = threadIdx.y + blockIdx.y * blockDim.y;

    // Bounds check.
    if (nInputX >= nWidth || nInputY >= nHeight)
        return;

    // Position of this thread's U byte in the interleaved source; V follows it.
    const int nLoadPosU = nInputX * 2 + nInputY * nPitch;

    // Positions of the two outputs in the planar destination.
    const int nStorePosU = nInputX + nInputY * nPitch;
    const int nStorePosV = nStorePosU + nPitch * nHeight;

    const unsigned char u = pNV12UV[nLoadPosU];
    const unsigned char v = pNV12UV[nLoadPosU + 1];

    pI420UV[nStorePosU] = u;
    pI420UV[nStorePosV] = v;
}
// Builds the hue-adjusted YUV -> RGB matrix and uploads it to the
// constHueColorSpaceMat2 constant-memory symbol.
//   hue - hue rotation angle in radians (it is fed directly to sinf/cosf)
// Returns the status of cudaMemcpyToSymbol; a failure is also reported on
// stderr.
//
// The "709" coefficient set (labelled CCIR 709 in the original) is always
// used. The original also carried a disabled "601" alternative:
//   { 1.1644, sin*1.5960,                    cos*1.5960,
//     1.1644, cos*-0.3918 - sin*0.8130,      sin*0.3918 - cos*0.8130,
//     1.1644, cos*2.0172,                    sin*-2.0172 }
//
// Changes from the previous version: single-precision sinf/cosf and float
// literals replace double-precision ones, and a leftover debug read-back of
// the symbol plus an unchecked cudaDeviceSynchronize() were removed (their
// results were never used).
cudaError_t setColorSpace2( float hue)
{
    const float hueSin = sinf(hue);
    const float hueCos = cosf(hue);

    // One row per output channel (R, G, B); columns are (luma, Cb, Cr).
    const float hueCSC[9] = {
        1.0f, hueSin * 1.57480f, hueCos * 1.57480f,
        1.0f, (hueCos * -0.18732f) - (hueSin * 0.46812f), (hueSin * 0.18732f) - (hueCos * 0.46812f),
        1.0f, hueCos * 1.85560f, hueSin * -1.85560f
    };

    const cudaError_t cudaStatus = cudaMemcpyToSymbol(constHueColorSpaceMat2, hueCSC, 9 * sizeof(float), 0, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(cudaStatus));
    }
    return cudaStatus;
}
// Converts a frame with a pitched luma plane and a pitched, interleaved U,V
// plane into a tightly packed I420 buffer. Each thread handles two
// horizontally adjacent pixels.
//   dataY, pitchY   - luma plane and its row pitch in bytes
//   dataUV, pitchUV - interleaved chroma plane and its row pitch in bytes
//   I420            - destination: Y plane with row stride `width`, U plane at
//                     width * height, V plane at width * height * 5 / 4
//   width, height   - frame size in pixels
// Expects a 2D launch with at least ceil(width / 2) x height threads.
//
// Changes from the previous version:
//   * the bounds check used `>` instead of `>=`, so threads at x == width or
//     y == height read and wrote outside the frame;
//   * the second pixel of a pair is now written only when x + 1 < width;
//   * every sample used to be packed into a 10-bit "yuv101010" word (shift
//     left by 2), unpacked again and divided by 4, which returns the original
//     8-bit value; chroma for odd rows was interpolated but never stored.
//     That round-trip and the unused red/green/blue locals are gone. Output
//     for in-range pixels is unchanged: Y is copied and, on even rows, the U
//     and V bytes at column x of chroma row y / 2 are stored.
__global__ void TurnCUDAFormatToI420Kernal(unsigned char* dataY, unsigned char* dataUV, size_t pitchY, size_t pitchUV, unsigned char* I420, int width, int height)
{
    // Two pixels per thread, so x is always even.
    const int x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height)
        return;

    // Luma: pitched source row -> packed destination row.
    const unsigned char* srcRowY = dataY + y * pitchY;
    unsigned char* dstRowY = I420 + y * width;
    dstRowY[x] = srcRowY[x];
    if (x + 1 < width)
        dstRowY[x + 1] = srcRowY[x + 1];

    // Chroma: one U,V pair per 2x2 block, emitted by the thread on the even row.
    if (!(y & 0x01))
    {
        const unsigned char* srcRowUV = dataUV + (y >> 1) * pitchUV;
        const int chromaOffset = y * width / 4 + x / 2;

        I420[width * height + chromaOffset] = srcRowUV[x];
        I420[width * height * 5 / 4 + chromaOffset] = srcRowUV[x + 1];
    }
}
// Blends two planar YUV images pixel by pixel with per-pixel float weights:
//   dst = (wRelate * relate + wTarget * target) / (wRelate + wTarget)
//   pRelateY/U/V + strides - first input image
//   pTargetY/U/V + strides - second input image
//   width, height          - size of the blended region in luma pixels
//   pRelateMask, pTargetMask - per-pixel weights, indexed as y * width + x
//   MaskStride             - not used
//   pDstY/U/V + strides    - output image
// Expects a 2D launch covering at least width x height threads. Every in-range
// thread writes one luma sample; threads at even (x, y) also write the U and V
// samples at (x / 2, y / 2), weighted with that pixel's mask values. Results
// are truncated to unsigned char.
//
// Fix: the chroma stores used `(unsigned char) sum / weight`, which narrows
// the weighted sum to 8 bits BEFORE dividing. The cast now applies to the
// quotient, matching the luma path.
//
// NOTE(review): a pixel whose two weights sum to zero still divides by zero;
// the intended output for that case is not evident from this file.
// NOTE(review): the masks are indexed with `width`, not MaskStride - confirm
// they are tightly packed.
__global__ void GradientBlenderYUVKernal(unsigned char* pRelateY, int YRelateStride,
                                         unsigned char* pRelateU, int URelateStride,
                                         unsigned char* pRelateV, int VRelateStride,
                                         unsigned char* pTargetY, int YTargetStride,
                                         unsigned char* pTargetU, int UTargetStride,
                                         unsigned char* pTargetV, int VTargetStride,
                                         int width, int height,
                                         float* pRelateMask, float* pTargetMask, int MaskStride,
                                         unsigned char* pDstY, int DstYStride,
                                         unsigned char* pDstU, int DstUStride,
                                         unsigned char* pDstV, int DstVStride)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height)
    {
        return;
    }

    const float dbWeightRelate = pRelateMask[y * width + x];
    const float dbWeightTarget = pTargetMask[y * width + x];
    const float dbWeight = dbWeightRelate + dbWeightTarget;

    // Luma.
    const float dbSum = dbWeightRelate * pRelateY[y * YRelateStride + x] + dbWeightTarget * pTargetY[y * YTargetStride + x];
    pDstY[y * DstYStride + x] = (unsigned char)(dbSum / dbWeight);

    // Chroma, once per 2x2 block.
    if (!(x & 0x01) && !(y & 0x01))
    {
        const unsigned int X = x >> 1;
        const unsigned int Y = y >> 1;

        const float dbSumU = pRelateU[Y * URelateStride + X] * dbWeightRelate +
                             dbWeightTarget * pTargetU[Y * UTargetStride + X];
        pDstU[Y * DstUStride + X] = (unsigned char)(dbSumU / dbWeight);

        const float dbSumV = pRelateV[Y * VRelateStride + X] * dbWeightRelate +
                             dbWeightTarget * pTargetV[Y * VTargetStride + X];
        pDstV[Y * DstVStride + X] = (unsigned char)(dbSumV / dbWeight);
    }
}
// Crops a window out of a planar source image into separate destination
// planes. Despite the parameter names, "Relate" is the source and "Target" is
// the destination.
//   pRelateY/U/V + strides - source planes
//   pTargetY/U/V + strides - destination planes
//   nCropWidth, nCropHeight - crop size in luma pixels
//   nCropX, nCropY          - top-left corner of the crop inside the source
//   DstWidth, DstHeight     - not used
// Expects a 2D launch covering at least nCropWidth x nCropHeight threads.
// Every in-range thread copies one luma sample; threads at even (x, y) also
// copy the U and V samples for their 2x2 block.
//
// Changes from the previous version:
//   * the bounds check used `>` instead of `>=`, letting one extra column and
//     row read and write outside the crop;
//   * the chroma store index `y * Stride >> 1 + x >> 1` parsed as
//     `((y * Stride) >> (1 + x)) >> 1` because `+` binds tighter than `>>`;
//     it is now (y / 2) * Stride + (x / 2);
//   * samples were staged in shared memory with a __syncthreads() placed after
//     the out-of-range threads had returned. Each thread only read back what
//     it had written itself, so the staging and the barrier were removed.
__global__ void CropI420Kernel(unsigned char* pRelateY, int YRelateStride,
                               unsigned char* pRelateU, int URelateStride,
                               unsigned char* pRelateV, int VRelateStride,
                               unsigned char* pTargetY, int YTargetStride,
                               unsigned char* pTargetU, int UTargetStride,
                               unsigned char* pTargetV, int VTargetStride,
                               int nCropWidth, int nCropHeight,
                               int nCropX, int nCropY,
                               int DstWidth, int DstHeight)
{
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= nCropWidth || y >= nCropHeight)
        return;

    // Y
    pTargetY[y * YTargetStride + x] = pRelateY[(y + nCropY) * YRelateStride + x + nCropX];

    if (!(x & 0x01) && !(y & 0x01))
    {
        // Chroma coordinates in the source and in the destination.
        const int nSrcChromaRow = (y + nCropY) / 2;
        const int nSrcChromaCol = (x + nCropX) / 2;
        const int nDstChromaRow = y >> 1;
        const int nDstChromaCol = x >> 1;

        // U
        pTargetU[nDstChromaRow * UTargetStride + nDstChromaCol] =
            pRelateU[nSrcChromaRow * URelateStride + nSrcChromaCol];
        // V
        pTargetV[nDstChromaRow * VTargetStride + nDstChromaCol] =
            pRelateV[nSrcChromaRow * VRelateStride + nSrcChromaCol];
    }
}
- }
|