#include "FusionAndTailor.cuh"
#include <builtin_types.h>
//经典的转换方式

#define COLOR_COMPONENT_MASK     0x3FF
#define COLOR_COMPONENT_BIT_SIZE 10

#define MUL(x,y)    ((x)*(y))

typedef unsigned int uint32;
typedef int int32;

__constant__ float  constHueColorSpaceMat2[9];  //默认分配到0卡上，未找到分配到指定卡上设置方法，当前也未用到，先注释

__device__ void YUV2RGB2(unsigned int* yuvi, float* red, float* green, float* blue)
{
	float luma, chromaCb, chromaCr;

	// Prepare for hue adjustment
	luma = (float)yuvi[0];
	chromaCb = (float)((int32)yuvi[1] - 512.0f);
	chromaCr = (float)((int32)yuvi[2] - 512.0f);


	// Convert YUV To RGB with hue adjustment
	*red = MUL(luma, constHueColorSpaceMat2[0]) +
		MUL(chromaCb, constHueColorSpaceMat2[1]) +
		MUL(chromaCr, constHueColorSpaceMat2[2]);
	*green = MUL(luma, constHueColorSpaceMat2[3]) +
		MUL(chromaCb, constHueColorSpaceMat2[4]) +
		MUL(chromaCr, constHueColorSpaceMat2[5]);
	*blue = MUL(luma, constHueColorSpaceMat2[6]) +
		MUL(chromaCb, constHueColorSpaceMat2[7]) +
		MUL(chromaCr, constHueColorSpaceMat2[8]);

}

__device__ unsigned char clip_v(int x, int min_val, int  max_val) {
	if (x > max_val) {
		return max_val;
	}
	else if (x < min_val) {
		return min_val;
	}
	else {
		return x;
	}
}


namespace YUVTailorAndBlender {


	__host__ void TurnNV12ToI420Classics(unsigned char* pNV12, int nWidth, int nHeight,
		unsigned char* pI420, int nPitch)
	{
		//set the block and grid
		dim3 Block(32, 32);
		dim3 Grid((nWidth + Block.x - 1) / Block.x, (nHeight + Block.y - 1) / Block.y);
		//call the kernel
		TurnNV12ToI420ClassicsKernal << <Grid, Block >> > (pNV12, nWidth, nHeight, pI420, nPitch);
	}

	__host__ void TurnNV12ToI420Dissociative(unsigned char* pNV12, int nWidth, int nHeight, unsigned char* pI420, int nPitch)
	{
		dim3 Block(32, 32);
		dim3 Grid((nWidth + Block.x - 1) / Block.x, (nHeight + Block.y - 1) / Block.y);

		CopyPlaneKernal<<<Grid,Block>>>(pNV12,nWidth,nHeight,pI420,nPitch);

		dim3 BlockUV(32, 32);
		dim3 GridUV(((nWidth / 2) + Block.x - 1) / Block.x, ((nHeight / 2) + Block.y - 1) / Block.y);

		//TurnNV12ToI420DissociativeKernalUV<<<GridUV,BlockUV>>>(pNV12,nWidth/2,nHeight/2,pI420,nPitch/2);

	}

	__host__ void TurnCUDAFormatToI420(unsigned char* dataY, unsigned char* dataUV, size_t pitchY, size_t pitchUV, unsigned char* dstImage, int width, int height, CUstream* pStream)
	{
		dim3 block(32, 16, 1);
		dim3 grid((width + (2 * block.x - 1)) / (2 * block.x), (height + (block.y - 1)) / block.y, 1);
		if (pStream == nullptr)
			TurnCUDAFormatToI420Kernal << < grid, block >> > ((unsigned char*)dataY, (unsigned char*)dataUV, pitchY, pitchUV, dstImage, width, height);
		else
			TurnCUDAFormatToI420Kernal << < grid, block, 0 ,*pStream >> > ((unsigned char*)dataY, (unsigned char*)dataUV, pitchY, pitchUV, dstImage, width, height);
	}

	__host__ void CopyPlane(unsigned char* pSrc, int nSrcWidth, int nSrcHeight, int nSrcPitch,
		unsigned char* pDst, int nDstWidth, int nDstHeight, int nDstPitch,
		int nCopyWidth, int nCopyHeight,
		int nCopyStartX, int nCopyStartY,
		unsigned int Type,
		CUstream* pStream)
	{
		dim3 block(BLOCK_SIZE, SHARED_MEMORY_SIZE_Y, 1);
		dim3 grid((nCopyWidth/2 + (block.x - 1)) / block.x, ((nCopyHeight + (block.y - 1))) / block.y, 1);

		_CopyPlaneKernal << <grid, block >> > (
			pSrc, nSrcWidth, nSrcHeight, nSrcPitch,
			pDst, nDstWidth, nDstHeight, nDstPitch,
			nCopyWidth, nCopyHeight,
			nCopyStartX, nCopyStartY,
			1
			);
	}

	__host__ void CropI420(unsigned char* pSrcY, int nSrcYPitch,
		unsigned char* pSrcU, int nSrcUPitch, 
		unsigned char* pSrcV, int nSrcVPitch, 
		unsigned char* pDstY, int nDstYPitch, 
		unsigned char* pDstU, int nDstUPitch, 
		unsigned char* pDstV, int nDstVPitch, 
		int nCropWidth, int nCropHeight, 
		int nCropStartX, int nCropStartY)
	{
		
		dim3 block(BLOCK_SIZE, BLOCK_SIZE);
		dim3 grid((nCropWidth + block.x - 1) / block.x, (nCropHeight + block.y - 1) / block.y);

		CropI420Kernel << <grid, block >> > (
			pSrcY, nSrcYPitch,
			pSrcU, nSrcUPitch,
			pSrcV, nSrcVPitch,
			pDstY, nDstYPitch,
			pDstU, nDstUPitch,
			pDstV, nDstVPitch,
			nCropWidth, nCropHeight,
			nCropStartX, nCropStartY,
			nCropWidth, nCropHeight
			);

		//cudaError Error = cudaDeviceSynchronize();

	}

	__host__ void GradientBlenderYUV(unsigned char* pRelateY, int YRelateStride,
		unsigned char* pRelateU, int URelateStride,
		unsigned char* pRelateV, int VRelateStride,
		unsigned char* pTargetY, int YTargetStride,
		unsigned char* pTargetU, int UTargetStride,
		unsigned char* pTargetV, int VTargetStride,
		int width, int height,
		float* pRelateMask, float* pTargetMask, int MaskStride,
		unsigned char* pDstY, int DstYStride, 
		unsigned char* pDstU, int DstUStride, 
		unsigned char* pDstV, int DstVStride, 
		int DstWidth, int DstHeight)
	{
		dim3 block(32, 32);
		dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);

		GradientBlenderYUVKernal << <grid, block >> >
			(
				pRelateY, YRelateStride,
				pRelateU, URelateStride,
				pRelateV, VRelateStride,
				pTargetY, YTargetStride,
				pTargetU, UTargetStride,
				pTargetV, VTargetStride,
				width, height,
				pRelateMask, pTargetMask, MaskStride,
				pDstY, DstYStride,
				pDstU, DstUStride,
				pDstV, DstVStride
				);

	}


	//最经典的方法，根据当前的像素数进行修改
	void __global__ TurnNV12ToI420ClassicsKernal(unsigned char* pNV12, int nWidth, int nHeight,
		unsigned char* pI420, int nPitch)
	{
		//get the position
		int x = threadIdx.x + blockIdx.x * blockDim.x;
		int y = threadIdx.y + blockIdx.y * blockDim.y;
		//check the position
		if (x >= nWidth || y >= nHeight)
		{
			return;
		}
		//get the position of the YUV
		int nYPos = y * nPitch + x;
		int nUVPos = nHeight * nPitch + y * nPitch / 2 + x;
		//get the position of the I420
		int nYPosI420 = y * nWidth + x;
		int nUPosI420 = nHeight * nWidth + y * nWidth / 4 + x / 2;
		int nVPosI420 = nHeight * nWidth * 5 / 4 + y * nWidth / 4 + x / 2;
		//copy the YUV to I420
		pI420[nYPosI420] = pNV12[nYPos];
		if (y % 2 == 0 && x % 2 == 0)
		{
			pI420[nUPosI420] = pNV12[nUVPos];
			pI420[nVPosI420] = pNV12[nUVPos + 1];
		}
	}

	//分离式的转换方式，类似转置
	void __global__ CopyPlaneKernal(unsigned char* pNV12, int nWidth, int nHeight,
		unsigned char* pI420, int nPitch)
	{
		__shared__ unsigned char sByteData[ SHARED_MEMORY_SIZE][SHARED_MEMORY_SIZE];

		//获取当前位置，因为是
		int nInputX = threadIdx.x + blockIdx.x * blockDim.x;
		int nInputY = threadIdx.y + blockIdx.y * blockDim.y;

		unsigned int nLoadSharedPos = nInputX + nInputY * nPitch;

		if (nInputX < nWidth && nInputY < nHeight)
		{
			//sByteData[threadIdx.y][threadIdx.x] = pNV12[nLoadSharedPos];

			__syncthreads();

			pI420[nLoadSharedPos] = pNV12[nLoadSharedPos];// sByteData[threadIdx.y][threadIdx.x];

		}
	}

	void __global__ _CopyPlaneKernal(
		unsigned char* pSrc, int nSrcWidth, int nSrcHeight, int nSrcPitch,
		unsigned char* pDst, int nDstWidth, int nDstHeight, int nDstPitch,
		int nCopyWidth, int nCopyHeight,
		int nCopyStartX, int nCopyStartY,
		int nChannel)
	{
		//__shared__ unsigned char sByteData[SHARED_MEMORY_SIZE_Y][SHARED_MEMORY_SIZE*2];

		int nInputX = threadIdx.x + blockIdx.x * blockDim.x;
		int nInputY = threadIdx.y + blockIdx.y * blockDim.y;

		
		/*if (nInputX + SHARED_MEMORY_SIZE < nCopyWidth
			&& nInputY < nCopyHeight
			)
		{
			unsigned int nLoadSharedPos = nInputX + nInputY * nSrcPitch;
			unsigned char* pDstTarget = pDst + nCopyStartX + nCopyStartY * nDstPitch;
			unsigned char* pDstCurTarget = pDstTarget + nInputX + nInputY * nDstPitch;
			sByteData[threadIdx.y][threadIdx.x] = pSrc[nLoadSharedPos];
			sByteData[threadIdx.y][SHARED_MEMORY_SIZE + threadIdx.x] = pSrc[nLoadSharedPos + SHARED_MEMORY_SIZE];

			__syncthreads();
		 	
			*pDstCurTarget = sByteData[threadIdx.y][threadIdx.x];
			*(pDstCurTarget + SHARED_MEMORY_SIZE) = sByteData[threadIdx.y][SHARED_MEMORY_SIZE + threadIdx.x];
		}*/

		if (nInputX*2 < nCopyWidth
			&& nInputY < nCopyHeight)
		{
			unsigned int nLoadSharedPos = nInputX*2 + nInputY * nSrcPitch;
			unsigned char* pDstTarget = pDst + nCopyStartX + nInputX*2 + (nCopyStartY + nInputY) * nDstPitch;

			//sByteData[threadIdx.y][threadIdx.x * 2] = pSrc[nLoadSharedPos];;
			//sByteData[threadIdx.y][threadIdx.x * 2 + 1] = pSrc[nLoadSharedPos + 1];;

			*pDstTarget = pSrc[nLoadSharedPos];//sByteData[threadIdx.y][threadIdx.x * 2];
			*(pDstTarget + 1) = pSrc[nLoadSharedPos + 1];//sByteData[threadIdx.y][threadIdx.x * 2 + 1];
			//*(pDstTarget + 2) = pSrc[nLoadSharedPos + 2];
			//*(pDstTarget + 3) = pSrc[nLoadSharedPos + 3];
			/**(pDstTarget + 4) = pSrc[nLoadSharedPos + 4];
			*(pDstTarget + 5) = pSrc[nLoadSharedPos + 5];
			*(pDstTarget + 6) = pSrc[nLoadSharedPos + 6];
			*(pDstTarget + 7) = pSrc[nLoadSharedPos + 7];*/
		}

	}

	//分离式的转换方式，类似转置,本质上时从竖向的读取数据，转置到顺序读取中去
	void __global__ TurnNV12ToI420DissociativeKernalUV(unsigned char* pNV12UV, int nWidth, int nHeight, unsigned char* pI420UV, int nPitch)
	{
		__shared__ unsigned char sByteData[SHARED_MEMORY_SIZE*2 * (SHARED_MEMORY_SIZE)];

		//获取当前位置读取位置
		int nInputX = threadIdx.x + blockIdx.x * blockDim.x;
		int nInputY = threadIdx.y + blockIdx.y * blockDim.y;

		//当前位置在共享内存的位置NV12
		int nLoadPosU = nInputX*2 + nInputY * nPitch;
		int nLoadPosV = nInputX*2 + nInputY * nPitch + 1;

		//当前NV12的数据在I420中的位置
		int nStorePosU = nInputX + nInputY * nPitch;
		int nStorePosV = nInputX + nInputY * nPitch + nPitch * nHeight;


		//越界检查
		if (nInputX >= nWidth || nInputY >= nHeight)
			return;
		
		//计算当前的坐标应该在共享内存的哪个位置,共享内存块大小与线程块分配一致
		int nLoadSharedPos = threadIdx.x + (threadIdx.y) * blockDim.x;
		//由此计算得出共享内存的X，Y坐标
		sByteData[nLoadSharedPos] = pNV12UV[nLoadPosU];
		sByteData[nLoadSharedPos + SHARED_MEMORY_SIZE * SHARED_MEMORY_SIZE] = pNV12UV[nLoadPosV];

		//同步,这一步是设置给线程束的
		__syncthreads();

		//写入
		pI420UV[nStorePosU] = sByteData[nLoadSharedPos];
		pI420UV[nStorePosV] = sByteData[nLoadSharedPos + SHARED_MEMORY_SIZE * SHARED_MEMORY_SIZE];

		return;
	}

	cudaError_t setColorSpace2( float hue)
	{

		float hueSin = sin(hue);
		float hueCos = cos(hue);

		float hueCSC[9];
		//if (CSC == ITU601)
		//{
		//	//CCIR 601
		//	hueCSC[0] = 1.1644f;
		//	hueCSC[1] = hueSin * 1.5960f;
		//	hueCSC[2] = hueCos * 1.5960f;
		//	hueCSC[3] = 1.1644f;
		//	hueCSC[4] = (hueCos * -0.3918f) - (hueSin * 0.8130f);
		//	hueCSC[5] = (hueSin * 0.3918f) - (hueCos * 0.8130f);
		//	hueCSC[6] = 1.1644f;
		//	hueCSC[7] = hueCos * 2.0172f;
		//	hueCSC[8] = hueSin * -2.0172f;
		//}
		//else if (CSC == ITU709)
		{
			//CCIR 709
			hueCSC[0] = 1.0f;
			hueCSC[1] = hueSin * 1.57480f;
			hueCSC[2] = hueCos * 1.57480f;
			hueCSC[3] = 1.0;
			hueCSC[4] = (hueCos * -0.18732f) - (hueSin * 0.46812f);
			hueCSC[5] = (hueSin * 0.18732f) - (hueCos * 0.46812f);
			hueCSC[6] = 1.0f;
			hueCSC[7] = hueCos * 1.85560f;
			hueCSC[8] = hueSin * -1.85560f;
		}

		cudaError_t cudaStatus = cudaMemcpyToSymbol(constHueColorSpaceMat2, hueCSC, 9 * sizeof(float), 0, cudaMemcpyHostToDevice);
		float tmpf[9];
		memset(tmpf, 0, 9 * sizeof(float));
		cudaMemcpyFromSymbol(tmpf, constHueColorSpaceMat2, 9 * sizeof(float), 0, ::cudaMemcpyDefault);
		cudaDeviceSynchronize();

		if (cudaStatus != cudaSuccess) {
			fprintf(stderr, "cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(cudaStatus));
		}

		return cudaStatus;
	}

	void __global__ TurnCUDAFormatToI420Kernal(unsigned char* dataY, unsigned char* dataUV, size_t pitchY, size_t pitchUV, unsigned char* I420, int width, int height)
	{
		// Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread
		int x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
		int y = blockIdx.y * blockDim.y + threadIdx.y;

		if(x > width || y > height)
			return;

		unsigned int yuv101010Pel[2];
		unsigned char* srcImageU8_Y = (unsigned char*)dataY;
		unsigned char* srcImageU8_UV = (unsigned char*)dataUV;

		unsigned char* dstImageY1 = I420 + y * width + x;
		unsigned char* dstImageY2 = I420 + y * width + x + 1;
		unsigned char* dstImageU = I420 + width * height + y * width / 4 + x/2 ;
		unsigned char* dstImageV = I420 + width * height * 5 / 4 + y * width / 4 + x/2;

		// Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way.
	    // if we move to texture we could read 4 luminance values
		//右移两位
		yuv101010Pel[0] = (srcImageU8_Y[y * pitchY + x]) << 2;
		yuv101010Pel[1] = (srcImageU8_Y[y * pitchY + x + 1]) << 2;

		int y_chroma = y >> 1;

		if (y & 1)  // odd scanline ?
		{
			unsigned int chromaCb;
			unsigned int chromaCr;

			chromaCb = srcImageU8_UV[y_chroma * pitchUV + x];
			chromaCr = srcImageU8_UV[y_chroma * pitchUV + x + 1];

			if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically
			{
				chromaCb = (chromaCb + srcImageU8_UV[(y_chroma + 1) * pitchUV + x] + 1) >> 1;
				chromaCr = (chromaCr + srcImageU8_UV[(y_chroma + 1) * pitchUV + x + 1] + 1) >> 1;
			}

			yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
			yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));

			yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
			yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
		}
		else
		{
			yuv101010Pel[0] |= ((unsigned int)srcImageU8_UV[y_chroma * pitchUV + x] << (COLOR_COMPONENT_BIT_SIZE + 2));
			yuv101010Pel[0] |= ((unsigned int)srcImageU8_UV[y_chroma * pitchUV + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));

			yuv101010Pel[1] |= ((unsigned int)srcImageU8_UV[y_chroma * pitchUV + x] << (COLOR_COMPONENT_BIT_SIZE + 2));
			yuv101010Pel[1] |= ((unsigned int)srcImageU8_UV[y_chroma * pitchUV + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
		}

		uint32 YuvOdd[3], YuvEven[3];
		float red[2], green[2], blue[2];

		YuvOdd[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK);
		YuvOdd[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK);
		YuvOdd[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);

		YuvEven[0] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK);
		YuvEven[1] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK);
		YuvEven[2] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);

		*dstImageY1 = (round((float)(YuvOdd[0]))) / 4;
		*dstImageY2 = (round((float)(YuvEven[0]))) / 4;
		
		if (!(y & 0x01))
		{
			*dstImageU = (unsigned char)(round(((float)(YuvOdd[1] + YuvEven[1] + 1) / 2.f ))/4.f);
			*dstImageV = (unsigned char)(round((float)((float)(YuvOdd[2] + YuvEven[2] + 1) / 2.f))/4.f);
		}
		
		//RGB转化
		/*YUV2RGB2(&YuvOdd[0], &red[0], &green[0], &blue[0]);
		YUV2RGB2(&YuvEven[0], &red[1], &green[1], &blue[1]);

		I420[y * width * 3 + x * 3] = clip_v(blue[0] * 0.25, 0, 255);
		I420[y * width * 3 + x * 3 + 3] = clip_v(blue[1] * 0.25, 0, 255);

		I420[width * y * 3 + x * 3 + 1] = clip_v(green[0] * 0.25, 0, 255);
		I420[width * y * 3 + x * 3 + 4] = clip_v(green[1] * 0.25, 0, 255);

		I420[width * y * 3 + x * 3 + 2] = clip_v(red[0] * 0.25, 0, 255);
		I420[width * y * 3 + x * 3 + 5] = clip_v(red[1] * 0.25, 0, 255);*/

	}

	void __global__ GradientBlenderYUVKernal(unsigned char* pRelateY, int YRelateStride,
		unsigned char* pRelateU, int URelateStride,
		unsigned char* pRelateV, int VRelateStride, 
		unsigned char* pTargetY, int YTargetStride, 
		unsigned char* pTargetU, int UTargetStride, 
		unsigned char* pTargetV, int VTargetStride, 
		int width, int height, 
		float* pRelateMask, float* pTargetMask, int MaskStride, 
		unsigned char* pDstY, int DstYStride, 
		unsigned char* pDstU, int DstUStride, 
		unsigned char* pDstV, int DstVStride)
	{
		//共享内存设置上，加载快
		/*__shared__ float sRelateMask[32][32];
		__shared__ float sTargetMask[32][32];
		__shared__ unsigned char sRelateData[32][32];
		__shared__ unsigned char sTargetData[32][32];
		__shared__ float sWeight[32][32];
		__shared__ float sSum[32][32];*/

		int x = blockIdx.x * blockDim.x + threadIdx.x;
		int y = blockIdx.y * blockDim.y + threadIdx.y;

		if (x >= width || y >= height)
		{
			return;
		}

		////加载共享内存
		/*sRelateMask[threadIdx.y][threadIdx.x] = pRelateMask[y * width + x];
		sRelateData[threadIdx.y][threadIdx.x] = pRelateY[y * YRelateStride + x];
		sTargetMask[threadIdx.y][threadIdx.x] = pTargetMask[y * width + x];
		sTargetData[threadIdx.y][threadIdx.x] = pTargetY[y * YTargetStride + x];*/
		
		//写入共享内存
		//sSum[threadIdx.y][threadIdx.x] = sRelateData[threadIdx.y][threadIdx.x] * sRelateMask[threadIdx.y][threadIdx.x]
		//	+ sTargetData[threadIdx.y][threadIdx.x] * sTargetMask[threadIdx.y][threadIdx.x];
		
		//sWeight[threadIdx.y][threadIdx.x] = sRelateMask[threadIdx.y][threadIdx.x] + sTargetMask[threadIdx.y][threadIdx.x];

		float dbWeightRelate = pRelateMask[y * width + x];
		float dbWeightTarget = pTargetMask[y * width + x];
		float dbSum = dbWeightRelate * pRelateY[y * YRelateStride + x] + dbWeightTarget * pTargetY[y * YTargetStride + x];
		float dbWeight = dbWeightRelate + dbWeightTarget;

		////写入目标图像
		pDstY[y * DstYStride + x] = 
			dbSum / dbWeight;// sWeight[threadIdx.y][threadIdx.x];

		//__syncthreads();

		if (!(x & 0x01)  && !(y & 0x01) )
		{
			unsigned int X = x >> 1;
			unsigned int Y = y >> 1;

			float dbSumU = pRelateU[Y * URelateStride + X] * dbWeightRelate +
				dbWeightTarget * pTargetU[Y * UTargetStride + X];

			pDstU[Y * DstUStride + X] = (unsigned char) dbSumU / dbWeight;

			float dbSumV = pRelateV[Y * VRelateStride + X] * dbWeightRelate +
				dbWeightTarget * pTargetV[Y * VTargetStride + X];

			pDstV[Y * DstVStride + X] = (unsigned char)dbSumV / dbWeight;

		}

	}

	void __global__ CropI420Kernel(unsigned char* pRelateY, int YRelateStride,
		unsigned char* pRelateU, int URelateStride, 
		unsigned char* pRelateV, int VRelateStride, 
		unsigned char* pTargetY, int YTargetStride, 
		unsigned char* pTargetU, int UTargetStride, 
		unsigned char* pTargetV, int VTargetStride, 
		int nCropWidth, int nCropHeight, 
		int nCropX, int nCropY, 
		int DstWidth, int DstHeight)
	{
		__shared__ unsigned char sSrcData[SHARED_MEMORY_SIZE * 3 / 2][SHARED_MEMORY_SIZE];

		int x = threadIdx.x + blockIdx.x * blockDim.x;
		int y = threadIdx.y + blockIdx.y * blockDim.y;

		if (x > nCropWidth || y > nCropHeight)
			return;

		//Y
		sSrcData[threadIdx.y][threadIdx.x] = pRelateY[(y + nCropY) * YRelateStride + x + nCropX];
		
		if (!(x & 0x01) && !(y & 0x01))
		{
			//U
			sSrcData[threadIdx.y / 2 + blockDim.y][threadIdx.x / 2] = pRelateU[(y + nCropY) / 2 * URelateStride + (x + nCropX) / 2];
			//V
			sSrcData[threadIdx.y / 2 + blockDim.y][SHARED_MEMORY_SIZE / 2 + threadIdx.x / 2] =
				pRelateV[(y + nCropY) / 2 * VRelateStride + (x + nCropX) / 2];
		}
		__syncthreads();

		//写入
		pTargetY[y * YTargetStride + x] = sSrcData[threadIdx.y][threadIdx.x];
		
		if (!(x & 0x01) && !(y & 0x01))
		{
			pTargetU[y * UTargetStride >> 1 + x >> 1] = sSrcData[threadIdx.y / 2 + blockDim.y][threadIdx.x / 2];
			pTargetV[y * VTargetStride >> 1 + x >> 1] = sSrcData[threadIdx.y / 2 + blockDim.y][SHARED_MEMORY_SIZE / 2 + threadIdx.x / 2];
		}
		

	}


}