#include "FusionAndTailor.cuh" #include //经典的转换方式 #define COLOR_COMPONENT_MASK 0x3FF #define COLOR_COMPONENT_BIT_SIZE 10 #define MUL(x,y) ((x)*(y)) typedef unsigned int uint32; typedef int int32; __constant__ float constHueColorSpaceMat2[9]; //默认分配到0卡上,未找到分配到指定卡上设置方法,当前也未用到,先注释 __device__ void YUV2RGB2(unsigned int* yuvi, float* red, float* green, float* blue) { float luma, chromaCb, chromaCr; // Prepare for hue adjustment luma = (float)yuvi[0]; chromaCb = (float)((int32)yuvi[1] - 512.0f); chromaCr = (float)((int32)yuvi[2] - 512.0f); // Convert YUV To RGB with hue adjustment *red = MUL(luma, constHueColorSpaceMat2[0]) + MUL(chromaCb, constHueColorSpaceMat2[1]) + MUL(chromaCr, constHueColorSpaceMat2[2]); *green = MUL(luma, constHueColorSpaceMat2[3]) + MUL(chromaCb, constHueColorSpaceMat2[4]) + MUL(chromaCr, constHueColorSpaceMat2[5]); *blue = MUL(luma, constHueColorSpaceMat2[6]) + MUL(chromaCb, constHueColorSpaceMat2[7]) + MUL(chromaCr, constHueColorSpaceMat2[8]); } __device__ unsigned char clip_v(int x, int min_val, int max_val) { if (x > max_val) { return max_val; } else if (x < min_val) { return min_val; } else { return x; } } namespace YUVTailorAndBlender { __host__ void TurnNV12ToI420Classics(unsigned char* pNV12, int nWidth, int nHeight, unsigned char* pI420, int nPitch) { //set the block and grid dim3 Block(32, 32); dim3 Grid((nWidth + Block.x - 1) / Block.x, (nHeight + Block.y - 1) / Block.y); //call the kernel TurnNV12ToI420ClassicsKernal << > > (pNV12, nWidth, nHeight, pI420, nPitch); } __host__ void TurnNV12ToI420Dissociative(unsigned char* pNV12, int nWidth, int nHeight, unsigned char* pI420, int nPitch) { dim3 Block(32, 32); dim3 Grid((nWidth + Block.x - 1) / Block.x, (nHeight + Block.y - 1) / Block.y); CopyPlaneKernal<<>>(pNV12,nWidth,nHeight,pI420,nPitch); dim3 BlockUV(32, 32); dim3 GridUV(((nWidth / 2) + Block.x - 1) / Block.x, ((nHeight / 2) + Block.y - 1) / Block.y); //TurnNV12ToI420DissociativeKernalUV<<>>(pNV12,nWidth/2,nHeight/2,pI420,nPitch/2); } __host__ void TurnCUDAFormatToI420(unsigned char* dataY, unsigned char* dataUV, size_t pitchY, size_t pitchUV, unsigned char* dstImage, int width, int height, CUstream* pStream) { dim3 block(32, 16, 1); dim3 grid((width + (2 * block.x - 1)) / (2 * block.x), (height + (block.y - 1)) / block.y, 1); if (pStream == nullptr) TurnCUDAFormatToI420Kernal << < grid, block >> > ((unsigned char*)dataY, (unsigned char*)dataUV, pitchY, pitchUV, dstImage, width, height); else TurnCUDAFormatToI420Kernal << < grid, block, 0 ,*pStream >> > ((unsigned char*)dataY, (unsigned char*)dataUV, pitchY, pitchUV, dstImage, width, height); } __host__ void CopyPlane(unsigned char* pSrc, int nSrcWidth, int nSrcHeight, int nSrcPitch, unsigned char* pDst, int nDstWidth, int nDstHeight, int nDstPitch, int nCopyWidth, int nCopyHeight, int nCopyStartX, int nCopyStartY, unsigned int Type, CUstream* pStream) { dim3 block(BLOCK_SIZE, SHARED_MEMORY_SIZE_Y, 1); dim3 grid((nCopyWidth/2 + (block.x - 1)) / block.x, ((nCopyHeight + (block.y - 1))) / block.y, 1); _CopyPlaneKernal << > > ( pSrc, nSrcWidth, nSrcHeight, nSrcPitch, pDst, nDstWidth, nDstHeight, nDstPitch, nCopyWidth, nCopyHeight, nCopyStartX, nCopyStartY, 1 ); } __host__ void CropI420(unsigned char* pSrcY, int nSrcYPitch, unsigned char* pSrcU, int nSrcUPitch, unsigned char* pSrcV, int nSrcVPitch, unsigned char* pDstY, int nDstYPitch, unsigned char* pDstU, int nDstUPitch, unsigned char* pDstV, int nDstVPitch, int nCropWidth, int nCropHeight, int nCropStartX, int nCropStartY) { dim3 block(BLOCK_SIZE, BLOCK_SIZE); dim3 grid((nCropWidth + block.x - 1) / block.x, (nCropHeight + block.y - 1) / block.y); CropI420Kernel << > > ( pSrcY, nSrcYPitch, pSrcU, nSrcUPitch, pSrcV, nSrcVPitch, pDstY, nDstYPitch, pDstU, nDstUPitch, pDstV, nDstVPitch, nCropWidth, nCropHeight, nCropStartX, nCropStartY, nCropWidth, nCropHeight ); //cudaError Error = cudaDeviceSynchronize(); } __host__ void GradientBlenderYUV(unsigned char* pRelateY, int YRelateStride, unsigned char* pRelateU, int URelateStride, unsigned char* pRelateV, int VRelateStride, unsigned char* pTargetY, int YTargetStride, unsigned char* pTargetU, int UTargetStride, unsigned char* pTargetV, int VTargetStride, int width, int height, float* pRelateMask, float* pTargetMask, int MaskStride, unsigned char* pDstY, int DstYStride, unsigned char* pDstU, int DstUStride, unsigned char* pDstV, int DstVStride, int DstWidth, int DstHeight) { dim3 block(32, 32); dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y); GradientBlenderYUVKernal << > > ( pRelateY, YRelateStride, pRelateU, URelateStride, pRelateV, VRelateStride, pTargetY, YTargetStride, pTargetU, UTargetStride, pTargetV, VTargetStride, width, height, pRelateMask, pTargetMask, MaskStride, pDstY, DstYStride, pDstU, DstUStride, pDstV, DstVStride ); } //最经典的方法,根据当前的像素数进行修改 void __global__ TurnNV12ToI420ClassicsKernal(unsigned char* pNV12, int nWidth, int nHeight, unsigned char* pI420, int nPitch) { //get the position int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; //check the position if (x >= nWidth || y >= nHeight) { return; } //get the position of the YUV int nYPos = y * nPitch + x; int nUVPos = nHeight * nPitch + y * nPitch / 2 + x; //get the position of the I420 int nYPosI420 = y * nWidth + x; int nUPosI420 = nHeight * nWidth + y * nWidth / 4 + x / 2; int nVPosI420 = nHeight * nWidth * 5 / 4 + y * nWidth / 4 + x / 2; //copy the YUV to I420 pI420[nYPosI420] = pNV12[nYPos]; if (y % 2 == 0 && x % 2 == 0) { pI420[nUPosI420] = pNV12[nUVPos]; pI420[nVPosI420] = pNV12[nUVPos + 1]; } } //分离式的转换方式,类似转置 void __global__ CopyPlaneKernal(unsigned char* pNV12, int nWidth, int nHeight, unsigned char* pI420, int nPitch) { __shared__ unsigned char sByteData[ SHARED_MEMORY_SIZE][SHARED_MEMORY_SIZE]; //获取当前位置,因为是 int nInputX = threadIdx.x + blockIdx.x * blockDim.x; int nInputY = threadIdx.y + blockIdx.y * blockDim.y; unsigned int nLoadSharedPos = nInputX + nInputY * nPitch; if (nInputX < nWidth && nInputY < nHeight) { //sByteData[threadIdx.y][threadIdx.x] = pNV12[nLoadSharedPos]; __syncthreads(); pI420[nLoadSharedPos] = pNV12[nLoadSharedPos];// sByteData[threadIdx.y][threadIdx.x]; } } void __global__ _CopyPlaneKernal( unsigned char* pSrc, int nSrcWidth, int nSrcHeight, int nSrcPitch, unsigned char* pDst, int nDstWidth, int nDstHeight, int nDstPitch, int nCopyWidth, int nCopyHeight, int nCopyStartX, int nCopyStartY, int nChannel) { //__shared__ unsigned char sByteData[SHARED_MEMORY_SIZE_Y][SHARED_MEMORY_SIZE*2]; int nInputX = threadIdx.x + blockIdx.x * blockDim.x; int nInputY = threadIdx.y + blockIdx.y * blockDim.y; /*if (nInputX + SHARED_MEMORY_SIZE < nCopyWidth && nInputY < nCopyHeight ) { unsigned int nLoadSharedPos = nInputX + nInputY * nSrcPitch; unsigned char* pDstTarget = pDst + nCopyStartX + nCopyStartY * nDstPitch; unsigned char* pDstCurTarget = pDstTarget + nInputX + nInputY * nDstPitch; sByteData[threadIdx.y][threadIdx.x] = pSrc[nLoadSharedPos]; sByteData[threadIdx.y][SHARED_MEMORY_SIZE + threadIdx.x] = pSrc[nLoadSharedPos + SHARED_MEMORY_SIZE]; __syncthreads(); *pDstCurTarget = sByteData[threadIdx.y][threadIdx.x]; *(pDstCurTarget + SHARED_MEMORY_SIZE) = sByteData[threadIdx.y][SHARED_MEMORY_SIZE + threadIdx.x]; }*/ if (nInputX*2 < nCopyWidth && nInputY < nCopyHeight) { unsigned int nLoadSharedPos = nInputX*2 + nInputY * nSrcPitch; unsigned char* pDstTarget = pDst + nCopyStartX + nInputX*2 + (nCopyStartY + nInputY) * nDstPitch; //sByteData[threadIdx.y][threadIdx.x * 2] = pSrc[nLoadSharedPos];; //sByteData[threadIdx.y][threadIdx.x * 2 + 1] = pSrc[nLoadSharedPos + 1];; *pDstTarget = pSrc[nLoadSharedPos];//sByteData[threadIdx.y][threadIdx.x * 2]; *(pDstTarget + 1) = pSrc[nLoadSharedPos + 1];//sByteData[threadIdx.y][threadIdx.x * 2 + 1]; //*(pDstTarget + 2) = pSrc[nLoadSharedPos + 2]; //*(pDstTarget + 3) = pSrc[nLoadSharedPos + 3]; /**(pDstTarget + 4) = pSrc[nLoadSharedPos + 4]; *(pDstTarget + 5) = pSrc[nLoadSharedPos + 5]; *(pDstTarget + 6) = pSrc[nLoadSharedPos + 6]; *(pDstTarget + 7) = pSrc[nLoadSharedPos + 7];*/ } } //分离式的转换方式,类似转置,本质上时从竖向的读取数据,转置到顺序读取中去 void __global__ TurnNV12ToI420DissociativeKernalUV(unsigned char* pNV12UV, int nWidth, int nHeight, unsigned char* pI420UV, int nPitch) { __shared__ unsigned char sByteData[SHARED_MEMORY_SIZE*2 * (SHARED_MEMORY_SIZE)]; //获取当前位置读取位置 int nInputX = threadIdx.x + blockIdx.x * blockDim.x; int nInputY = threadIdx.y + blockIdx.y * blockDim.y; //当前位置在共享内存的位置NV12 int nLoadPosU = nInputX*2 + nInputY * nPitch; int nLoadPosV = nInputX*2 + nInputY * nPitch + 1; //当前NV12的数据在I420中的位置 int nStorePosU = nInputX + nInputY * nPitch; int nStorePosV = nInputX + nInputY * nPitch + nPitch * nHeight; //越界检查 if (nInputX >= nWidth || nInputY >= nHeight) return; //计算当前的坐标应该在共享内存的哪个位置,共享内存块大小与线程块分配一致 int nLoadSharedPos = threadIdx.x + (threadIdx.y) * blockDim.x; //由此计算得出共享内存的X,Y坐标 sByteData[nLoadSharedPos] = pNV12UV[nLoadPosU]; sByteData[nLoadSharedPos + SHARED_MEMORY_SIZE * SHARED_MEMORY_SIZE] = pNV12UV[nLoadPosV]; //同步,这一步是设置给线程束的 __syncthreads(); //写入 pI420UV[nStorePosU] = sByteData[nLoadSharedPos]; pI420UV[nStorePosV] = sByteData[nLoadSharedPos + SHARED_MEMORY_SIZE * SHARED_MEMORY_SIZE]; return; } cudaError_t setColorSpace2( float hue) { float hueSin = sin(hue); float hueCos = cos(hue); float hueCSC[9]; //if (CSC == ITU601) //{ // //CCIR 601 // hueCSC[0] = 1.1644f; // hueCSC[1] = hueSin * 1.5960f; // hueCSC[2] = hueCos * 1.5960f; // hueCSC[3] = 1.1644f; // hueCSC[4] = (hueCos * -0.3918f) - (hueSin * 0.8130f); // hueCSC[5] = (hueSin * 0.3918f) - (hueCos * 0.8130f); // hueCSC[6] = 1.1644f; // hueCSC[7] = hueCos * 2.0172f; // hueCSC[8] = hueSin * -2.0172f; //} //else if (CSC == ITU709) { //CCIR 709 hueCSC[0] = 1.0f; hueCSC[1] = hueSin * 1.57480f; hueCSC[2] = hueCos * 1.57480f; hueCSC[3] = 1.0; hueCSC[4] = (hueCos * -0.18732f) - (hueSin * 0.46812f); hueCSC[5] = (hueSin * 0.18732f) - (hueCos * 0.46812f); hueCSC[6] = 1.0f; hueCSC[7] = hueCos * 1.85560f; hueCSC[8] = hueSin * -1.85560f; } cudaError_t cudaStatus = cudaMemcpyToSymbol(constHueColorSpaceMat2, hueCSC, 9 * sizeof(float), 0, cudaMemcpyHostToDevice); float tmpf[9]; memset(tmpf, 0, 9 * sizeof(float)); cudaMemcpyFromSymbol(tmpf, constHueColorSpaceMat2, 9 * sizeof(float), 0, ::cudaMemcpyDefault); cudaDeviceSynchronize(); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(cudaStatus)); } return cudaStatus; } void __global__ TurnCUDAFormatToI420Kernal(unsigned char* dataY, unsigned char* dataUV, size_t pitchY, size_t pitchUV, unsigned char* I420, int width, int height) { // Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread int x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1); int y = blockIdx.y * blockDim.y + threadIdx.y; if(x > width || y > height) return; unsigned int yuv101010Pel[2]; unsigned char* srcImageU8_Y = (unsigned char*)dataY; unsigned char* srcImageU8_UV = (unsigned char*)dataUV; unsigned char* dstImageY1 = I420 + y * width + x; unsigned char* dstImageY2 = I420 + y * width + x + 1; unsigned char* dstImageU = I420 + width * height + y * width / 4 + x/2 ; unsigned char* dstImageV = I420 + width * height * 5 / 4 + y * width / 4 + x/2; // Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way. // if we move to texture we could read 4 luminance values //右移两位 yuv101010Pel[0] = (srcImageU8_Y[y * pitchY + x]) << 2; yuv101010Pel[1] = (srcImageU8_Y[y * pitchY + x + 1]) << 2; int y_chroma = y >> 1; if (y & 1) // odd scanline ? { unsigned int chromaCb; unsigned int chromaCr; chromaCb = srcImageU8_UV[y_chroma * pitchUV + x]; chromaCr = srcImageU8_UV[y_chroma * pitchUV + x + 1]; if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically { chromaCb = (chromaCb + srcImageU8_UV[(y_chroma + 1) * pitchUV + x] + 1) >> 1; chromaCr = (chromaCr + srcImageU8_UV[(y_chroma + 1) * pitchUV + x + 1] + 1) >> 1; } yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2)); yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2)); yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); } else { yuv101010Pel[0] |= ((unsigned int)srcImageU8_UV[y_chroma * pitchUV + x] << (COLOR_COMPONENT_BIT_SIZE + 2)); yuv101010Pel[0] |= ((unsigned int)srcImageU8_UV[y_chroma * pitchUV + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); yuv101010Pel[1] |= ((unsigned int)srcImageU8_UV[y_chroma * pitchUV + x] << (COLOR_COMPONENT_BIT_SIZE + 2)); yuv101010Pel[1] |= ((unsigned int)srcImageU8_UV[y_chroma * pitchUV + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); } uint32 YuvOdd[3], YuvEven[3]; float red[2], green[2], blue[2]; YuvOdd[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK); YuvOdd[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK); YuvOdd[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK); YuvEven[0] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK); YuvEven[1] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK); YuvEven[2] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK); *dstImageY1 = (round((float)(YuvOdd[0]))) / 4; *dstImageY2 = (round((float)(YuvEven[0]))) / 4; if (!(y & 0x01)) { *dstImageU = (unsigned char)(round(((float)(YuvOdd[1] + YuvEven[1] + 1) / 2.f ))/4.f); *dstImageV = (unsigned char)(round((float)((float)(YuvOdd[2] + YuvEven[2] + 1) / 2.f))/4.f); } //RGB转化 /*YUV2RGB2(&YuvOdd[0], &red[0], &green[0], &blue[0]); YUV2RGB2(&YuvEven[0], &red[1], &green[1], &blue[1]); I420[y * width * 3 + x * 3] = clip_v(blue[0] * 0.25, 0, 255); I420[y * width * 3 + x * 3 + 3] = clip_v(blue[1] * 0.25, 0, 255); I420[width * y * 3 + x * 3 + 1] = clip_v(green[0] * 0.25, 0, 255); I420[width * y * 3 + x * 3 + 4] = clip_v(green[1] * 0.25, 0, 255); I420[width * y * 3 + x * 3 + 2] = clip_v(red[0] * 0.25, 0, 255); I420[width * y * 3 + x * 3 + 5] = clip_v(red[1] * 0.25, 0, 255);*/ } void __global__ GradientBlenderYUVKernal(unsigned char* pRelateY, int YRelateStride, unsigned char* pRelateU, int URelateStride, unsigned char* pRelateV, int VRelateStride, unsigned char* pTargetY, int YTargetStride, unsigned char* pTargetU, int UTargetStride, unsigned char* pTargetV, int VTargetStride, int width, int height, float* pRelateMask, float* pTargetMask, int MaskStride, unsigned char* pDstY, int DstYStride, unsigned char* pDstU, int DstUStride, unsigned char* pDstV, int DstVStride) { //共享内存设置上,加载快 /*__shared__ float sRelateMask[32][32]; __shared__ float sTargetMask[32][32]; __shared__ unsigned char sRelateData[32][32]; __shared__ unsigned char sTargetData[32][32]; __shared__ float sWeight[32][32]; __shared__ float sSum[32][32];*/ int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x >= width || y >= height) { return; } ////加载共享内存 /*sRelateMask[threadIdx.y][threadIdx.x] = pRelateMask[y * width + x]; sRelateData[threadIdx.y][threadIdx.x] = pRelateY[y * YRelateStride + x]; sTargetMask[threadIdx.y][threadIdx.x] = pTargetMask[y * width + x]; sTargetData[threadIdx.y][threadIdx.x] = pTargetY[y * YTargetStride + x];*/ //写入共享内存 //sSum[threadIdx.y][threadIdx.x] = sRelateData[threadIdx.y][threadIdx.x] * sRelateMask[threadIdx.y][threadIdx.x] // + sTargetData[threadIdx.y][threadIdx.x] * sTargetMask[threadIdx.y][threadIdx.x]; //sWeight[threadIdx.y][threadIdx.x] = sRelateMask[threadIdx.y][threadIdx.x] + sTargetMask[threadIdx.y][threadIdx.x]; float dbWeightRelate = pRelateMask[y * width + x]; float dbWeightTarget = pTargetMask[y * width + x]; float dbSum = dbWeightRelate * pRelateY[y * YRelateStride + x] + dbWeightTarget * pTargetY[y * YTargetStride + x]; float dbWeight = dbWeightRelate + dbWeightTarget; ////写入目标图像 pDstY[y * DstYStride + x] = dbSum / dbWeight;// sWeight[threadIdx.y][threadIdx.x]; //__syncthreads(); if (!(x & 0x01) && !(y & 0x01) ) { unsigned int X = x >> 1; unsigned int Y = y >> 1; float dbSumU = pRelateU[Y * URelateStride + X] * dbWeightRelate + dbWeightTarget * pTargetU[Y * UTargetStride + X]; pDstU[Y * DstUStride + X] = (unsigned char) dbSumU / dbWeight; float dbSumV = pRelateV[Y * VRelateStride + X] * dbWeightRelate + dbWeightTarget * pTargetV[Y * VTargetStride + X]; pDstV[Y * DstVStride + X] = (unsigned char)dbSumV / dbWeight; } } void __global__ CropI420Kernel(unsigned char* pRelateY, int YRelateStride, unsigned char* pRelateU, int URelateStride, unsigned char* pRelateV, int VRelateStride, unsigned char* pTargetY, int YTargetStride, unsigned char* pTargetU, int UTargetStride, unsigned char* pTargetV, int VTargetStride, int nCropWidth, int nCropHeight, int nCropX, int nCropY, int DstWidth, int DstHeight) { __shared__ unsigned char sSrcData[SHARED_MEMORY_SIZE * 3 / 2][SHARED_MEMORY_SIZE]; int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; if (x > nCropWidth || y > nCropHeight) return; //Y sSrcData[threadIdx.y][threadIdx.x] = pRelateY[(y + nCropY) * YRelateStride + x + nCropX]; if (!(x & 0x01) && !(y & 0x01)) { //U sSrcData[threadIdx.y / 2 + blockDim.y][threadIdx.x / 2] = pRelateU[(y + nCropY) / 2 * URelateStride + (x + nCropX) / 2]; //V sSrcData[threadIdx.y / 2 + blockDim.y][SHARED_MEMORY_SIZE / 2 + threadIdx.x / 2] = pRelateV[(y + nCropY) / 2 * VRelateStride + (x + nCropX) / 2]; } __syncthreads(); //写入 pTargetY[y * YTargetStride + x] = sSrcData[threadIdx.y][threadIdx.x]; if (!(x & 0x01) && !(y & 0x01)) { pTargetU[y * UTargetStride >> 1 + x >> 1] = sSrcData[threadIdx.y / 2 + blockDim.y][threadIdx.x / 2]; pTargetV[y * VTargetStride >> 1 + x >> 1] = sSrcData[threadIdx.y / 2 + blockDim.y][SHARED_MEMORY_SIZE / 2 + threadIdx.x / 2]; } } }