ImageFusion.cpp 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238
  1. #include "ImageFusion.h"
  2. #include <libyuv.h>
  3. #include <cmath>
  4. #include "Timer.h"
  5. ImageFusion::ImageFusion()
  6. {
  7. bIsModified = false;
  8. }
  9. /// <summary>
  10. /// Initializes the specified p image.
  11. /// the Pano Image must be aligned 4,and has been allocated memory
  12. /// </summary>
  13. /// <param name="pImage">The p image.</param>
  14. /// <param name="nPanoWidth">Width of the n pano.</param>
  15. /// <param name="nPanoHeight">Height of the n pano.</param>
  16. /// <param name="nPanoPitch">The n pano pitch.</param>
  17. void ImageFusion::Init(unsigned char* pImage, int nPanoWidth, int nPanoHeight, int nPanoPitch)
  18. {
  19. pPanoImageBuffer = pImage;
  20. this->nPanoWidth = nPanoWidth;
  21. this->nPanoHeight = nPanoHeight;
  22. this->nPanoPitch = nPanoPitch;
  23. }
  24. /// <summary>
  25. /// Gets the laplace pyramid.
  26. /// </summary>
  27. /// <param name="Image">The image to be laplace.</param>
  28. /// <param name="vLaplacePyramid">The vector of laplace pyramid.</param>
  29. /// <param name="nLayer">The Laplace total layer.</param>
  30. /// <param name="nSize">The size of the gauss kernel.</param>
  31. /// <param name="Sigma">The sigma.</param>
  32. void ImageFusion::GetLaplacePyramid(cv::Mat& Image, std::vector<cv::Mat>& vLaplacePyramid, int nLayer, int nSize, float Sigma)
  33. {
  34. //get the Image size
  35. //clear the vector
  36. std::vector<cv::Mat>().swap(vLaplacePyramid);
  37. //get the gauss pyramid
  38. std::vector<cv::Mat> vGaussPyramid;
  39. cv::Mat ImageCurrent = Image.clone();
  40. cv::Mat GaussImage, NextLayer, UpLayer;
  41. for (int i = 0; i < nLayer; i++)
  42. {
  43. cv::GaussianBlur(ImageCurrent, GaussImage, cv::Size(nSize, nSize), Sigma, Sigma);
  44. //vGaussPyramid.push_back(GaussImage.clone());
  45. cv::pyrDown(GaussImage, NextLayer);
  46. cv::pyrUp(NextLayer, UpLayer, cv::Size(GaussImage.cols, GaussImage.rows));
  47. //ImageCurrent = NextLayer;
  48. cv::Mat LaplaceImage = GaussImage - UpLayer;
  49. vLaplacePyramid.push_back(LaplaceImage.clone());
  50. ImageCurrent = NextLayer.clone();
  51. }
  52. vLaplacePyramid.push_back(ImageCurrent);
  53. }
  54. void ImageFusion::GetDogPyramid(cv::Mat& Mask, std::vector<cv::Mat>& DogPyr, int nLevel)
  55. {
  56. cv::Mat Down, Current;
  57. cv::cvtColor(Mask, Current, cv::COLOR_GRAY2RGB);
  58. DogPyr.push_back(Current);
  59. Current = Mask.clone();
  60. for (int i = 0; i < nLevel; i++)
  61. {
  62. cv::Mat _3ChMask;
  63. cv::pyrDown(Current, Down);
  64. cv::cvtColor(Down, _3ChMask, cv::COLOR_GRAY2RGB);
  65. DogPyr.push_back(_3ChMask);
  66. Current = Down;
  67. }
  68. }
  69. /// <summary>
  70. /// Gets the gauss pyramid.
  71. /// </summary>
  72. /// <param name="Image">The image.</param>
  73. /// <param name="vGaussPyramid">The v gauss pyramid.</param>
  74. /// <param name="nLayer">The n layer.</param>
  75. /// <param name="nSize">Size of the n.</param>
  76. /// <param name="Sigma">The sigma.</param>
  77. void ImageFusion::GetGaussPyramid(cv::Mat& Image, std::vector<cv::Mat>& vGaussPyramid, int nLayer, int nSize, float Sigma)
  78. {
  79. //get the Image size
  80. //clear the vector
  81. std::vector<cv::Mat>().swap(vGaussPyramid);
  82. //get the gauss pyramid
  83. cv::Mat ImageCopy = Image.clone();
  84. for (int i = 0; i < nLayer; i++)
  85. {
  86. cv::Mat GaussImage, NextLayer, UpLayer;
  87. cv::GaussianBlur(ImageCopy, GaussImage, cv::Size(nSize, nSize), Sigma, Sigma);
  88. cv::pyrDown(GaussImage, NextLayer);
  89. vGaussPyramid.push_back(NextLayer.clone());
  90. if (NextLayer.cols <= 1 || NextLayer.rows <= 1)
  91. {
  92. break;
  93. }
  94. ImageCopy = NextLayer.clone();
  95. }
  96. }
  97. /// <summary>
  98. /// Cals the laplace blend image.
  99. /// </summary>
  100. /// <param name="vInferPy">The vec infer py.</param>
  101. /// <param name="vTargetPy">The vec target py.</param>
  102. /// <param name="vMaskPy">The vec mask py.</param>
  103. /// <param name="vResultPy">The vec result py.</param>
  104. void ImageFusion::CalBlendImage(vMat& vInferPy, vMat& vTargetPy, vMat& vMaskPy, vMat& vResultPy)
  105. {
  106. cv::Mat AfterCountWeightInfer, AfterCountWeightTarget;
  107. cv::Mat Result;
  108. //get the current pyramid level
  109. int nLevel = vInferPy.size() - 1;
  110. for (int index = 0; index < nLevel; index++)
  111. {
  112. AfterCountWeightInfer = vInferPy[index].mul(vMaskPy[index]);
  113. AfterCountWeightTarget = vTargetPy[index].mul(cv::Scalar(1.0,1.0,1.0) - vMaskPy[index]);
  114. Result = AfterCountWeightInfer + AfterCountWeightTarget;
  115. vResultPy.push_back(Result.clone());
  116. }
  117. cv::Mat lastOne = vInferPy.back().mul(vMaskPy.back()) + vTargetPy.back().mul(cv::Scalar(1.f, 1.f, 1.f) - vMaskPy.back());
  118. vResultPy.push_back(lastOne);
  119. }
// NOTE(review): unimplemented stub overload — body is intentionally empty.
// Either implement it or remove it from the header so callers cannot
// silently invoke a no-op.
void ImageFusion::FusionImageByLaplacePyramid(cv::Mat* pLeft, cv::Mat* Right, int nLayer, int nSize, float Sigma)
{
}
/// <summary>
/// Fuses the overlap region of the panorama with a target image using
/// Laplacian-pyramid blending, writing the blended pixels back into the
/// panorama buffer in place.
/// </summary>
/// <param name="pPano">Panorama pixel buffer (BGR/RGB 8-bit, 3 channels).</param>
/// <param name="InferRc">Overlap rectangle inside the panorama.</param>
/// <param name="pCvTarget">Target image buffer (currently unused — see note below).</param>
/// <param name="SzTargetMat">Size of the target image (currently unused).</param>
/// <param name="nLayer">Number of pyramid levels.</param>
/// <param name="nSize">Gaussian kernel size.</param>
/// <param name="Sigma">Gaussian sigma.</param>
/// <param name="LeftTop">Unused placement hint.</param>
void ImageFusion::FusionImageByLaplacePyramid(unsigned char* pPano, cv::Rect InferRc, unsigned char* pCvTarget, cv::Size SzTargetMat, int nLayer, int nSize, float Sigma, cv::Point LeftTop)
{
    // Wrap the panorama buffer without copying.
    // NOTE(review): this assumes the buffer is tightly packed
    // (row step == nPanoWidth * 3); the write-back loop below uses
    // nPanoPitch instead — confirm the two agree, otherwise rows shear.
    cv::Mat PanoImage = cv::Mat(this->nPanoHeight, this->nPanoWidth, CV_8UC3, pPano);
    // View of the overlap region inside the panorama (shares pixels).
    cv::Mat CvInferMat = PanoImage(InferRc);
    cv::Mat Mask(InferRc.size(), CV_8UC1, cv::Scalar(0));
    // get the target image to cv mat
    // create a cv mat to store the target image
    // NOTE(review): this is a shallow header copy — CvTargetMat aliases
    // CvInferMat, so the blend currently mixes the panorama with itself.
    // The commented-out FastCopy_Any below suggests pCvTarget was meant
    // to be copied in here; verify before relying on this path.
    cv::Mat CvTargetMat = CvInferMat;
    // get the offset in target mat to store the target image
    //int nOffsetX = InferRc.width - SzTargetMat.width;
    // Copy the target image to cv mat
    for (int i = 0; i < InferRc.height; i++)
    {
        // FastCopy_Any(pCvTarget + ALIGN_4(SzTargetMat.width*3) * i,
        //              CvTargetMat.ptr<unsigned char>(i) + nOffsetX*3,
        //              SzTargetMat.width*3
        //              );
        // Mask = 255 on the LEFT half of each row, 0 on the right:
        // the infer side wins on the left, the target side on the right.
        memset(Mask.ptr<unsigned char>(i), 255, int(InferRc.width) / 2);
    }
    // Build the pyramids of both sources and of the blend mask.
    std::vector<cv::Mat> vInferLaplacePyramid;
    std::vector<cv::Mat> vTargetLaplacePyramid;
    std::vector<cv::Mat> vMaskLapLacePyramid;
    std::vector<cv::Mat> vResultLaplacePyramid;
    // Work in normalized float [0,1] to avoid 8-bit clipping mid-pyramid.
    cv::Mat CvInferMat_f, CvTargetMat_f, Mask_f;
    CvInferMat.convertTo(CvInferMat_f, CV_32FC3, 1.f / 255.f);
    CvTargetMat.convertTo(CvTargetMat_f, CV_32FC3, 1.f / 255.f);
    Mask.convertTo(Mask_f, CV_32F, 1.f / 255.f);
    GetLaplacePyramid(CvInferMat_f, vInferLaplacePyramid, nLayer, nSize, Sigma);
    GetLaplacePyramid(CvTargetMat_f, vTargetLaplacePyramid, nLayer, nSize, Sigma);
    GetDogPyramid(Mask_f, vMaskLapLacePyramid, nLayer);
    CalBlendImage(vInferLaplacePyramid, vTargetLaplacePyramid, vMaskLapLacePyramid, vResultLaplacePyramid);
    // Collapse the blended pyramid, coarse to fine.
    int nVecLength = vResultLaplacePyramid.size();
    cv::Mat UpLayer;
    cv::Mat CurrentMat = vResultLaplacePyramid.back();
    for (int level = nVecLength - 2; level >= 0; level--)
    {
        cv::pyrUp(CurrentMat, UpLayer, vResultLaplacePyramid[level].size());
        CurrentMat = vResultLaplacePyramid[level] + UpLayer;
    }
    // Back to 8-bit.
    CurrentMat.convertTo(CurrentMat, CV_8UC3, 255);
    // Fetch the result: copy the blended block back into the panorama,
    // honoring the panorama's row pitch.
    unsigned char* pModifyPart = pPano + InferRc.y * nPanoPitch + InferRc.x * 3;
    for (int i = 0; i < InferRc.height; i++)
    {
        memcpy(
            pModifyPart + i * nPanoPitch,
            CurrentMat.ptr<unsigned char>(i),
            InferRc.width * 3
        );
    }
}
/// <summary>
/// Returns the panorama buffer geometry through the reference parameters.
/// </summary>
/// <param name="pFusionImage">Intended to receive the buffer pointer — see note.</param>
/// <param name="nWidth">Receives the panorama width.</param>
/// <param name="nHeight">Receives the panorama height.</param>
/// <param name="nPitch">Receives the panorama row pitch in bytes.</param>
void ImageFusion::GetLaplaceFusionImage(unsigned char* pFusionImage, int& nWidth, int& nHeight, int& nPitch)
{
    // BUG(review): pFusionImage is passed BY VALUE, so this assignment is
    // invisible to the caller — only the three reference parameters are
    // actually returned. The parameter should be `unsigned char*&` (or the
    // pointer should be the return value); fixing it requires a matching
    // header change, so it is only flagged here.
    pFusionImage = this->pPanoImageBuffer;
    nWidth = this->nPanoWidth;
    nHeight = this->nPanoHeight;
    nPitch = this->nPanoPitch;
}
  202. void ImageFusion::FusionImageByThinRectWindowSlideMean(unsigned char* pPano, cv::Rect InferRc, int nSize, int nType, float Sigma)
  203. {
  204. cv::Mat Kernal;
  205. if (nType == 0)
  206. {
  207. Kernal = CreateThinRectWindow(1, nSize, nType, 1);
  208. }
  209. else if(nType == 1)
  210. {
  211. Kernal = CreateThinRectWindow(1, nSize, nType, 1);
  212. }
  213. //first get the fusion part in pano image
  214. cv::Mat PanoImage = cv::Mat(this->nPanoHeight, this->nPanoWidth, CV_8UC3, pPano);
  215. //get infer image from PanoImage
  216. cv::Rect WidenInferRect = InferRc;
  217. WidenInferRect.x -= nSize / 2;
  218. WidenInferRect.width += nSize;
  219. cv::Mat InferImage = PanoImage(WidenInferRect);
  220. cv::Mat _32FInferImage;
  221. InferImage.convertTo(_32FInferImage, CV_32F);
  222. std::vector<cv::Mat> vSingleChannelImage;
  223. cv::split(_32FInferImage, vSingleChannelImage);
  224. cv::Mat MergeImage;
  225. cv::Mat Afterfilter;
  226. for (int i = 0; i < 3; i++)
  227. {
  228. cv::filter2D(vSingleChannelImage[i], vSingleChannelImage[i], -1, Kernal);
  229. }
  230. cv::merge(vSingleChannelImage, MergeImage);
  231. MergeImage.convertTo(Afterfilter, CV_8U);
  232. Afterfilter.copyTo(InferImage);
  233. }
/// <summary>
/// Fusions the image by thin rect window slide mean: rebuilds the overlap
/// strip from the left half of the infer image and the right half of the
/// target image, then runs a 1-D horizontal filter over each row to hide
/// the seam. Writes the result back into the panorama in place.
/// </summary>
/// <param name="pPano">The panorama image to be modified.</param>
/// <param name="InferRc">The overlap rect in the panorama.</param>
/// <param name="InferImg">The overlap infer img.</param>
/// <param name="CurCoordinateRc">The overlap rc in the current image.</param>
/// <param name="TargetImg">The overlap img.</param>
/// <param name="TargetRc">The overlap rc in the target image.</param>
/// <param name="nSize">Size of the filter.</param>
/// <param name="nType">Type of the filter: 0 = mean, 1 = gauss.</param>
/// <param name="Sigma">The sigma. NOTE(review): unused — the gauss branch
/// hard-codes its width below; confirm whether it should be forwarded.</param>
void ImageFusion::FusionImageByThinRectWindowSlideMean(unsigned char* pPano, cv::Rect InferRc, cv::Mat& InferImg, cv::Rect CurCoordinateRc, cv::Mat& TargetImg, cv::Rect TargetRc, int nSize, int nType, float Sigma)
{
    // Nothing to blend if either overlap rect is degenerate.
    if (CurCoordinateRc.width == 0 || CurCoordinateRc.height == 0 ||
        TargetRc.width == 0 || TargetRc.height == 0)
        return;
    cv::Mat Kernal;
    if (nType == 0)
    {
        Kernal = CreateThinRectWindow(1, nSize, nType, 1);
    }
    else if (nType == 1)
    {
        // NOTE(review): the 4th argument of CreateThinRectWindow is
        // nChannels (an int) — 1.5 truncates to 1. It was presumably
        // meant to be the sigma; confirm against the declaration.
        Kernal = CreateThinRectWindow(1, nSize, nType, 1.5);
    }
    // Wrap the panorama buffer without copying.
    cv::Mat PanoImage = cv::Mat(this->nPanoHeight, this->nPanoWidth, CV_8UC3, pPano);
    // get infer image from PanoImage
    cv::Rect WidenPanoRect = InferRc;
    //WidenPanoRect.x -= nSize / 2;
    //WidenPanoRect.width += nSize;
    cv::Mat InferImage = PanoImage(WidenPanoRect);
    // Work on a copy; the filtered result is copied back at the end.
    cv::Mat InferImageCopy = InferImage.clone();
    // Keep the left/right halves the same width when the strip is odd.
    if (WidenPanoRect.width % 2 == 1)
        TargetRc.width -= 1;
    // Left half of the strip comes from the infer image...
    cv::Rect WidenInferRect = CurCoordinateRc;
    //WidenInferRect.x -= (nSize-1) / 2;
    WidenInferRect.width = WidenPanoRect.width / 2;
    if (WidenInferRect.width != 0)
        InferImg(WidenInferRect).copyTo(InferImageCopy(cv::Rect(0, 0, WidenInferRect.width, WidenInferRect.height)));
    // ...and the right half from the target image.
    cv::Rect WidenTargetRect = TargetRc;
    WidenTargetRect.x += WidenPanoRect.width / 2;
    WidenTargetRect.width = WidenPanoRect.width / 2;
    if (TargetRc.width != 0)
        TargetImg(WidenTargetRect).copyTo(InferImageCopy(cv::Rect(0 + WidenInferRect.width,
            0, WidenTargetRect.width, WidenTargetRect.height)));
    // Filter each channel row-by-row with the 1-D kernel, in float.
    cv::Mat _32FInferImage;
    InferImageCopy.convertTo(_32FInferImage, CV_32F);
    std::vector<cv::Mat> vSingleChannelImage;
    cv::split(_32FInferImage, vSingleChannelImage);
    cv::Mat MergeImage;
    cv::Mat Afterfilter;
    for (int i = 0; i < 3; i++)
    {
        //cv::filter2D(vSingleChannelImage[i], vSingleChannelImage[i], -1, Kernal);
        for (int ImageRows = 0; ImageRows < vSingleChannelImage[i].rows; ImageRows++)
        {
            cv::Mat CurRow = vSingleChannelImage[i](cv::Range(ImageRows, ImageRows + 1), cv::Range(0, _32FInferImage.cols));
            cv::Mat ResRow;
            Convolution1D(CurRow, ResRow, Kernal);
            ResRow.copyTo(vSingleChannelImage[i](cv::Range(ImageRows, ImageRows + 1), cv::Range(0, _32FInferImage.cols)));
        }
    }
    cv::merge(vSingleChannelImage, MergeImage);
    MergeImage.convertTo(Afterfilter, CV_8U);
    // Write the smoothed strip back into the panorama view.
    Afterfilter.copyTo(InferImage);
}
  304. cv::Mat ImageFusion::CreateThinRectWindow(int nHeight, int nWidth, int nType, int nChannels)
  305. {
  306. //window size must be odd
  307. if (nWidth % 2 == 0 || nHeight % 2 == 0)
  308. return cv::Mat();
  309. cv::Mat Window;
  310. if(nChannels == 1)
  311. Window = cv::Mat(nHeight, nWidth, CV_32F, cv::Scalar(0));
  312. else
  313. Window = cv::Mat(nHeight, nWidth, CV_32FC3, cv::Scalar(0,0,0));
  314. //according to type to set the value of the window
  315. //the center value in window
  316. if (nType == 0)
  317. {
  318. Window.setTo(cv::Scalar::all(1));
  319. Window = Window / nWidth;
  320. }
  321. else if(nType == 1)
  322. {
  323. //get center of the filter
  324. int nCenterX = nWidth / 2;
  325. //calcute the weight of the filter
  326. float sum = 0.0;
  327. for (int i = 0; i < nWidth; i++)
  328. {
  329. float x = i - nCenterX;
  330. float weight = CaculateGaussWeight(x, 1.5);
  331. sum += weight;
  332. Window.at<float>(i) = weight;
  333. }
  334. for (int i = 0; i < nWidth; i++)
  335. {
  336. Window.at<float>(i) /= sum;
  337. }
  338. }
  339. return Window;
  340. }
  341. /// <summary>
  342. /// Caculates the gauss weight.
  343. /// </summary>
  344. /// <param name="x">The x.</param>
  345. /// <param name="sigma">The sigma.</param>
  346. /// <returns></returns>
  347. double ImageFusion::CaculateGaussWeight(double x, double sigma)
  348. {
  349. return exp(-(x * x) / (2 * sigma * sigma)) / (sqrt(2 * M_PI) * sigma);
  350. }
  351. void ImageFusion::Convolution2D(cv::Mat& Src, cv::Mat& Dst, cv::Mat& Kernel)
  352. {
  353. int nChannels = Src.channels();
  354. int nRows = Src.rows;
  355. int nCols = Src.cols * nChannels;
  356. int nKernelRows = Kernel.rows;
  357. int nKernelCols = Kernel.cols * nChannels;
  358. int nKernelCenterX = nKernelCols / 2;
  359. int nKernelCenterY = nKernelRows / 2;
  360. Dst = cv::Mat(nRows, nCols, CV_32FC3, cv::Scalar(0, 0, 0));
  361. for (int i = 0; i < nRows; i++)
  362. {
  363. float* pDst = Dst.ptr<float>(i);
  364. for (int j = 0; j < nCols; j++)
  365. {
  366. float sum = 0.0;
  367. for (int m = 0; m < nKernelRows; m++)
  368. {
  369. int nSrcRow = i + m - nKernelCenterY;
  370. if (nSrcRow < 0)
  371. nSrcRow = -nSrcRow;
  372. else if (nSrcRow >= nRows)
  373. nSrcRow = nRows - (nSrcRow - nRows) - 1;
  374. float* pSrc = Src.ptr<float>(nSrcRow);
  375. float* pKernel = Kernel.ptr<float>(m);
  376. for (int n = 0; n < nKernelCols; n++)
  377. {
  378. int nSrcCol = j + n - nKernelCenterX;
  379. if (nSrcCol < 0)
  380. nSrcCol = -nSrcCol;
  381. else if (nSrcCol >= nCols)
  382. nSrcCol = nCols - (nSrcCol - nCols) - 1;
  383. sum += pSrc[nSrcCol] * pKernel[n];
  384. }
  385. }
  386. pDst[j] = sum;
  387. }
  388. }
  389. }
/// Convolves a single row (1 x nCols, CV_32F) with a 1-D kernel using
/// AVX, writing each result back into the widened source as it goes
/// (an in-place, feedback-style pass — each output sample sees already-
/// filtered neighbors to its left).
/// NOTE(review): only kernels up to 7 taps are handled; for wider kernels
/// the `if (nKernelCols <= 7)` block is skipped and Dst is silently left
/// all-zero. The `.m256_f32` member access is MSVC-specific and will not
/// compile with gcc/clang.
void ImageFusion::Convolution1D(cv::Mat& Src, cv::Mat& Dst, cv::Mat& Kernel)
{
    int nRows = Src.rows;
    int nCols = Src.cols;
    if (nRows == 0 || nCols == 0)
        return;
    // The kernel must be odd-width and single-channel.
    if (!(Kernel.cols % 2 != 0))
        cv::error((CV_StsInternal), "Convolution1D", "Assertion: " "Kernel.cols % 2 == 0" " failed", "D:\\kang\\360stitching\\QtCameraHardWareCopilot\\QtCameraHardWareCopilot\\ImageFusion.cpp", 428);
    if ((Kernel.channels() != 1))
        cv::error(cv::Error::StsAssert, "Kernel.channels() != 1", __FUNCTION__, "D:\\kang\\360stitching\\QtCameraHardWareCopilot\\QtCameraHardWareCopilot\\ImageFusion.cpp", 433);
    int nKernelRows = Kernel.rows; // NOTE(review): unused below
    int nKernelCols = Kernel.cols;
    // NOTE(review): the conventional center would be nKernelCols / 2; the
    // +1 shifts the in-place write-back one sample right — confirm intent.
    int nKernelCenterX = nKernelCols / 2 + 1;
    // Widen the row by half a kernel on each side for border handling.
    cv::Mat WidenSrc = cv::Mat(nRows, nCols + nKernelCols - 1, CV_32F, cv::Scalar(0));
    Dst = cv::Mat(nRows, nCols, CV_32F, cv::Scalar(0));
    Src.copyTo(WidenSrc(cv::Range(0, 1), cv::Range(nKernelCols / 2, nCols + nKernelCols / 2)));
    // Fill the left/right aprons: mirror when the row is long enough,
    // otherwise replicate a single border sample.
    if (nKernelCols < nRows)
        for (int Bordi = 0; Bordi < nKernelCols / 2; Bordi++)
        {
            WidenSrc.ptr<float>(0)[nKernelCols / 2 - 1 - Bordi] = WidenSrc.ptr<float>(0)[nKernelCols / 2 + 1 + Bordi];
            WidenSrc.ptr<float>(0)[nCols + nKernelCols / 2 + Bordi] = WidenSrc.ptr<float>(0)[nCols + nKernelCols / 2 - 2 - Bordi];
        }
    else if (nKernelCols != 1)
        for (int Bordi = 0; Bordi < nKernelCols / 2; Bordi++)
        {
            WidenSrc.ptr<float>(0)[nKernelCols / 2 - 1 - Bordi] = WidenSrc.ptr<float>(0)[nKernelCols / 2 + 1];
            WidenSrc.ptr<float>(0)[nCols + nKernelCols / 2 + Bordi] = WidenSrc.ptr<float>(0)[nCols + nKernelCols / 2 - 2];
        }
    else
        for (int Bordi = 0; Bordi < nKernelCols / 2; Bordi++)
        {
            WidenSrc.ptr<float>(0)[nKernelCols / 2 - 1 - Bordi] = WidenSrc.ptr<float>(0)[nKernelCols / 2];
            WidenSrc.ptr<float>(0)[nCols + nKernelCols / 2 + Bordi] = WidenSrc.ptr<float>(0)[nCols + nKernelCols / 2 - 1];
        }
    if (nKernelCols <= 7)
    {
        // Load the kernel taps into one AVX register (<= 7 floats used).
        __m256 Kernal;
        for (int i = 0; i < nKernelCols; i++)
        {
            Kernal.m256_f32[i] = Kernel.ptr<float>(0)[i];
        }
        // begin to convolution
        for (int i = 0; i < nCols; i++)
        {
            __m256 mmSum = _mm256_setzero_ps();
            __m256 mmSrc = _mm256_setzero_ps();
            memcpy(mmSrc.m256_f32, WidenSrc.ptr<float>(0) + i, sizeof(float) * nKernelCols);
            // NOTE(review): the parentheses form a comma expression — this
            // simply assigns the product; the zeroed mmSum is discarded.
            // Probably _mm256_add_ps was intended (same result here since
            // mmSum is zero).
            mmSum = (mmSum, _mm256_mul_ps(mmSrc, Kernal));
            // Horizontal sum of the used lanes.
            for (int j = 0; j < nKernelCols; j++)
            {
                Dst.ptr<float>(0)[i] += mmSum.m256_f32[j];
            }
            // Feed the filtered value back into the source window.
            WidenSrc.ptr<float>(0)[nKernelCenterX + i] = Dst.ptr<float>(0)[i];
        }
    }
}
  447. void ImageFusion::FusionImageByBlendingGradient(unsigned char* pPano, cv::Rect InferRc, cv::Mat& InferImg, cv::Rect CurCoordinateRc, cv::Mat& TargetImg, cv::Rect TargetRc)
  448. {
  449. if (CurCoordinateRc.width == 0 || CurCoordinateRc.height == 0 ||
  450. TargetRc.width == 0 || TargetRc.height == 0)
  451. return;
  452. if (CurCoordinateRc.width + CurCoordinateRc.x > InferImg.cols
  453. || CurCoordinateRc.height + CurCoordinateRc.y > InferImg.rows
  454. || InferRc.height + InferRc.y > nPanoHeight
  455. || TargetRc.width + TargetRc.x > TargetImg.cols
  456. || TargetRc.height + TargetRc.y > TargetImg.rows
  457. || CurCoordinateRc.x < 0
  458. || CurCoordinateRc.y < 0
  459. || TargetRc.x < 0
  460. || TargetRc.y < 0
  461. )
  462. return;
  463. //first get the fusion part in pano image
  464. cv::Mat PanoImage = cv::Mat(this->nPanoHeight, this->nPanoWidth, CV_8UC3, pPano);
  465. if (PanoImage.empty())
  466. return;
  467. //get infer image from PanoImage
  468. cv::Rect WidenPanoRect = InferRc;
  469. //WidenPanoRect.x -= nSize / 2;
  470. //WidenPanoRect.width += nSize;
  471. cv::Mat InferImage = PanoImage(WidenPanoRect);
  472. cv::Mat InferImageCopy = InferImage.clone();
  473. if (WidenPanoRect.width % 2 == 1)
  474. TargetRc.width -= 1;
  475. cv::Mat InferPart, TargetPart;
  476. //get infer image from inferImage and Taraget image
  477. cv::Rect WidenInferRect = CurCoordinateRc;
  478. //WidenInferRect.x -= (nSize-1) / 2;
  479. //WidenInferRect.width = WidenPanoRect.width / 2;
  480. if (WidenInferRect.width != 0)
  481. InferPart = InferImg(WidenInferRect).clone();
  482. else
  483. return;
  484. cv::Rect WidenTargetRect = TargetRc;
  485. //WidenTargetRect.x += WidenPanoRect.width / 2;
  486. //WidenTargetRect.width = WidenPanoRect.width / 2;
  487. if (TargetRc.width != 0)
  488. TargetPart = TargetImg(WidenTargetRect).clone();
  489. else
  490. return;
  491. if (InferPart.cols != TargetPart.cols)
  492. return;
  493. //create the weight for two blend image
  494. cv::Mat InferWeight, TargetWeight;
  495. std::vector<bool> Useless;
  496. GetGradientMask(CurCoordinateRc, TargetRc, InferWeight, TargetWeight, Useless);
  497. cv::Mat Res;
  498. cv::blendLinear(InferPart, TargetPart, InferWeight, TargetWeight, Res);
  499. Res.copyTo(PanoImage(WidenPanoRect));
  500. }
  501. void ImageFusion::FusionImageByBlendingGradientYUV(unsigned char* pPano, cv::Rect InferRc, unsigned char* pInfer, int nInferWidth, int nInferHeight, int nInferPitch, cv::Rect CurCoordinateRc, unsigned char* pTarget, int nTargetWidth, int nTargetHeight, int nTargetPitch, cv::Rect TargetRc, bool bUseSSE2)
  502. {
  503. if (CurCoordinateRc.width == 0 || CurCoordinateRc.height == 0 ||
  504. TargetRc.width == 0 || TargetRc.height == 0)
  505. return;
  506. if (CurCoordinateRc.width + CurCoordinateRc.x > nInferWidth
  507. || CurCoordinateRc.height + CurCoordinateRc.y > nInferHeight
  508. || InferRc.height + InferRc.y > nPanoHeight
  509. || TargetRc.width + TargetRc.x > nTargetWidth
  510. || TargetRc.height + TargetRc.y > nInferHeight
  511. || CurCoordinateRc.x < 0
  512. || CurCoordinateRc.y < 0
  513. || TargetRc.x < 0
  514. || TargetRc.y < 0
  515. )
  516. return;
  517. /***test***/
  518. cv::Mat matPanoImage = cv::Mat(nPanoHeight * 3 / 2, nPanoWidth, CV_8U, pPano);
  519. /***test***/
  520. //get infer image from PanoImage
  521. cv::Rect WidenPanoRect = InferRc;
  522. if (WidenPanoRect.width % 2 == 1)
  523. TargetRc.width -= 1;
  524. //get infer image from inferImage and Taraget image
  525. cv::Rect WidenInferRect = CurCoordinateRc;
  526. //申请一块内存,用于存放裁剪后的infer图像数据
  527. unsigned char* pInferCorp = new unsigned char[
  528. ALIGN_4(CurCoordinateRc.width) * CurCoordinateRc.height * 3 / 2
  529. ];
  530. if (WidenInferRect.width != 0)
  531. //InferPart = InferImg(WidenInferRect).clone();
  532. {
  533. libyuv::ConvertToI420(
  534. pInfer,
  535. nInferPitch * nInferHeight *3/2,
  536. pInferCorp,
  537. ALIGN_4(CurCoordinateRc.width),
  538. pInferCorp + ALIGN_4(CurCoordinateRc.width) * CurCoordinateRc.height,
  539. ALIGN_4(CurCoordinateRc.width) / 2,
  540. pInferCorp + ALIGN_4(CurCoordinateRc.width) * CurCoordinateRc.height * 5 / 4,
  541. ALIGN_4(CurCoordinateRc.width) / 2,
  542. CurCoordinateRc.x,
  543. CurCoordinateRc.y,
  544. nInferWidth,
  545. nInferHeight,
  546. ALIGN_4(CurCoordinateRc.width),
  547. CurCoordinateRc.height,
  548. libyuv::kRotate0,
  549. libyuv::FOURCC_I420
  550. );
  551. }
  552. else
  553. return;
  554. cv::Rect WidenTargetRect = TargetRc;
  555. //申请一块内存,用于存放裁剪后的Target图像数据
  556. unsigned char* pTargetCorp = new unsigned char[
  557. ALIGN_4(TargetRc.width) * TargetRc.height * 3 / 2
  558. ];
  559. if (TargetRc.width != 0)
  560. //TargetPart = TargetImg(WidenTargetRect).clone();
  561. {
  562. libyuv::ConvertToI420(
  563. pTarget,
  564. nTargetPitch * nTargetHeight *3/2,
  565. pTargetCorp,
  566. ALIGN_4(TargetRc.width),
  567. pTargetCorp + ALIGN_4(TargetRc.width) * TargetRc.height,
  568. ALIGN_4(TargetRc.width) / 2,
  569. pTargetCorp + ALIGN_4(TargetRc.width) * TargetRc.height * 5 / 4,
  570. ALIGN_4(TargetRc.width) / 2,
  571. TargetRc.x,
  572. TargetRc.y,
  573. nTargetWidth,
  574. nTargetHeight,
  575. ALIGN_4(TargetRc.width),
  576. TargetRc.height,
  577. libyuv::kRotate0,
  578. libyuv::FOURCC_I420
  579. );
  580. }
  581. else
  582. return;
  583. //create the weight for two blend image
  584. cv::Mat InferWeight, TargetWeight;
  585. std::vector<bool> Useless;
  586. GetGradientMask(CurCoordinateRc, TargetRc, InferWeight, TargetWeight, Useless);
  587. unsigned char* pBlenderPart = NULL;
  588. cv::Mat testInferImage = cv::Mat(nTargetHeight * 3 / 2, ALIGN_4(TargetRc.width), CV_8U, pInfer);
  589. cv::Mat testTargetImage = cv::Mat(nTargetHeight * 3 / 2, ALIGN_4(TargetRc.width), CV_8U, pTarget);
  590. TimerCounter Timer;
  591. Timer.Start();
  592. /*BlendingLinearYUV(
  593. pInferCorp, CurCoordinateRc.width, CurCoordinateRc.height, ALIGN_4(CurCoordinateRc.width),
  594. pTargetCorp, TargetRc.width, TargetRc.height, ALIGN_4(TargetRc.width),
  595. InferWeight, TargetWeight,
  596. pBlenderPart,
  597. true
  598. );*/
  599. BlendingLinearYUV(
  600. pInferCorp, CurCoordinateRc.width, CurCoordinateRc.height, ALIGN_4(CurCoordinateRc.width),
  601. pTargetCorp, TargetRc.width, TargetRc.height, ALIGN_4(TargetRc.width),
  602. InferWeight, TargetWeight,
  603. pBlenderPart,true
  604. );
  605. Timer.Stop();
  606. /***test***/
  607. cv::Mat matTargetImage = cv::Mat(TargetRc.height * 3 / 2, TargetRc.width+2, CV_8U, pBlenderPart);
  608. /***test***/
  609. PastePartInWholeImageYUV(
  610. pPano, nPanoWidth, nPanoHeight, nPanoPitch,
  611. pBlenderPart, WidenPanoRect.width, WidenPanoRect.height, ALIGN_4(WidenPanoRect.width),
  612. WidenPanoRect.x, WidenPanoRect.y,
  613. false
  614. );
  615. }
  616. void ImageFusion::FusionImageByBlendingGradientYUVByGpu(unsigned char* pPano, cv::Rect InferRc, unsigned char* pInfer, int nInferWidth, int nInferHeight, int nInferPitch, cv::Rect CurCoordinateRc, unsigned char* pTarget, int nTargetWidth, int nTargetHeight, int nTargetPitch, cv::Rect TargetRc)
  617. {
  618. if (CurCoordinateRc.width == 0 || CurCoordinateRc.height == 0 ||
  619. TargetRc.width == 0 || TargetRc.height == 0)
  620. return;
  621. if (CurCoordinateRc.width + CurCoordinateRc.x > nInferWidth
  622. || CurCoordinateRc.height + CurCoordinateRc.y > nInferHeight
  623. || InferRc.height + InferRc.y > nPanoHeight
  624. || TargetRc.width + TargetRc.x > nTargetWidth
  625. || TargetRc.height + TargetRc.y > nInferHeight
  626. || CurCoordinateRc.x < 0
  627. || CurCoordinateRc.y < 0
  628. || TargetRc.x < 0
  629. || TargetRc.y < 0
  630. )
  631. return;
  632. //get infer image from PanoImage
  633. cv::Rect WidenPanoRect = InferRc;
  634. if (WidenPanoRect.width % 2 == 1)
  635. TargetRc.width -= 1;
  636. //get infer image from inferImage and Taraget image
  637. cv::Rect WidenInferRect = CurCoordinateRc;
  638. //申请一块内存,用于存放裁剪后的infer图像数据
  639. unsigned char* pInferCorp = nullptr;
  640. cudaMalloc((void**)&pInferCorp, ALIGN_4(WidenInferRect.width) * WidenInferRect.height * 3 / 2);
  641. cv::Rect WidenTargetRect = TargetRc;
  642. //申请一块内存,用于存放裁剪后的Target图像数据
  643. unsigned char* pTargetCorp = nullptr;
  644. cudaMalloc((void**)&pTargetCorp, ALIGN_4(TargetRc.width) * TargetRc.height * 3 / 2);
  645. cudaError Err = cudaDeviceSynchronize();
  646. if (WidenInferRect.width != 0)
  647. //InferPart = InferImg(WidenInferRect).clone();
  648. {
  649. unsigned char* pSrcCorpY = pInfer + WidenInferRect.y * nInferPitch + WidenInferRect.x;
  650. unsigned char* pSrcCorpU = pSrcCorpY + nInferPitch * nInferHeight + WidenInferRect.y * nInferPitch / 4 + WidenInferRect.x / 2;
  651. YUVTailorAndBlender::CropI420(
  652. pInfer, nInferPitch,
  653. pInfer + nInferPitch * nInferHeight, nInferPitch / 2,
  654. pInfer + nInferPitch * nInferHeight * 5 / 4, nInferPitch / 2,
  655. pInferCorp, ALIGN_4(CurCoordinateRc.width),
  656. pInferCorp + ALIGN_4(CurCoordinateRc.width) * CurCoordinateRc.height, ALIGN_4(CurCoordinateRc.width) / 2,
  657. pInferCorp + ALIGN_4(CurCoordinateRc.width) * CurCoordinateRc.height * 5 / 4, ALIGN_4(CurCoordinateRc.width) / 2,
  658. CurCoordinateRc.width, CurCoordinateRc.height,
  659. CurCoordinateRc.x, CurCoordinateRc.y
  660. );
  661. }
  662. else
  663. return;
  664. if (TargetRc.width != 0)
  665. //TargetPart = TargetImg(WidenTargetRect).clone();
  666. {
  667. YUVTailorAndBlender::CropI420(
  668. pTarget, nTargetPitch,
  669. pTarget + nTargetPitch * nTargetHeight, nTargetPitch / 2,
  670. pTarget + nTargetPitch * nTargetHeight * 5 / 4, nTargetPitch / 2,
  671. pTargetCorp, ALIGN_4(WidenTargetRect.width),
  672. pTargetCorp + ALIGN_4(WidenTargetRect.width) * WidenTargetRect.height, ALIGN_4(WidenTargetRect.width) / 2,
  673. pTargetCorp + ALIGN_4(WidenTargetRect.width) * WidenTargetRect.height * 5 / 4, ALIGN_4(WidenTargetRect.width) / 2,
  674. WidenTargetRect.width, WidenTargetRect.height,
  675. WidenTargetRect.x, WidenTargetRect.y
  676. );
  677. }
  678. else
  679. return;
  680. Err = cudaDeviceSynchronize();
  681. //create the weight for two blend image
  682. cv::Mat InferWeight, TargetWeight;
  683. std::vector<bool> Useless;
  684. GetGradientMask(CurCoordinateRc, TargetRc, InferWeight, TargetWeight, Useless);
  685. unsigned char* pBlenderPart = NULL;
  686. TimerCounter Timer;
  687. Timer.Start();
  688. cudaMalloc((void**) & pBlenderPart, ALIGN_4(TargetRc.width) * TargetRc.height * 3 / 2);
  689. /*BlendingLinearYUV(
  690. pInferCorp, CurCoordinateRc.width, CurCoordinateRc.height, ALIGN_4(CurCoordinateRc.width),
  691. pTargetCorp, TargetRc.width, TargetRc.height, ALIGN_4(TargetRc.width),
  692. InferWeight, TargetWeight,
  693. pBlenderPart,
  694. true
  695. );*/
  696. BlendingLinearYUVByGpu(
  697. pInferCorp, CurCoordinateRc.width, CurCoordinateRc.height, ALIGN_4(CurCoordinateRc.width),
  698. pTargetCorp, TargetRc.width, TargetRc.height, ALIGN_4(TargetRc.width),
  699. InferWeight, TargetWeight,
  700. pBlenderPart
  701. );
  702. Timer.Stop();
  703. Err = cudaDeviceSynchronize();
  704. if(Err ==700)
  705. int a = 0;
  706. /***test***/
  707. //cv::Mat matTargetImage = cv::Mat(TargetRc.height * 3 / 2, TargetRc.width + 2, CV_8U, pBlenderPart);
  708. /***test***/
  709. PastePartInWholeImageYUVByGpu(
  710. pPano, nPanoWidth, nPanoHeight, nPanoPitch,
  711. pBlenderPart, WidenPanoRect.width, WidenPanoRect.height, ALIGN_4(WidenPanoRect.width),
  712. WidenPanoRect.x, WidenPanoRect.y,
  713. NULL
  714. );
  715. Err = cudaDeviceSynchronize();
  716. cudaFree(pBlenderPart);
  717. cudaFree(pTargetCorp);
  718. cudaFree(pInferCorp);
  719. }
  720. void ImageFusion::GetGradientMask(cv::Rect Infer, cv::Rect Target, cv::Mat& MaskTarget, cv::Mat& MaskInfer, std::vector<bool>& bAddOrSub)
  721. {
  722. bAddOrSub.resize(3);
  723. //according to intersect rect rgb avg val to get the light base in this image
  724. cv::Mat WeigthMapTarget = cv::Mat(cv::Size(Infer.width, Infer.height), CV_8U, cv::Scalar(255));
  725. cv::Mat WeightMapInfer = cv::Mat(cv::Size(Infer.width, Infer.height), CV_8U, cv::Scalar(255));
  726. cv::Mat DistanceMaskTarget, DistanceMaskInfer;
  727. //set a column 0 in Mask
  728. WeigthMapTarget(cv::Rect(Infer.width - 1, 0, 1, Infer.height)).setTo(cv::Scalar(0));
  729. WeightMapInfer(cv::Rect(0, 0, 1, Infer.height)).setTo(cv::Scalar(0));
  730. DistanceMaskTarget.create(WeigthMapTarget.size(), WeigthMapTarget.type());
  731. DistanceMaskInfer.create(WeightMapInfer.size(), WeightMapInfer.type());
  732. //calculate a Weightmap in DistanceMask
  733. distanceATS_L1_8u(WeigthMapTarget.data, DistanceMaskTarget.cols, DistanceMaskTarget.rows, DistanceMaskTarget.step, DistanceMaskTarget.data);
  734. distanceATS_L1_8u(WeightMapInfer.data, DistanceMaskInfer.cols, DistanceMaskInfer.rows, DistanceMaskInfer.step, DistanceMaskInfer.data);
  735. std::shared_ptr<float> pfLightBaseDif(new float[3] {0});
  736. cv::Mat vTargetChannels, vInferChannels;
  737. DistanceMaskTarget.convertTo(DistanceMaskTarget, CV_32F);
  738. DistanceMaskInfer.convertTo(DistanceMaskInfer, CV_32F);
  739. //step is according to inferRc`s width
  740. float fStep = abs(1.f / Infer.width);
  741. vTargetChannels = DistanceMaskTarget * fStep;
  742. vInferChannels = DistanceMaskInfer * fStep;
  743. MaskTarget = (vTargetChannels);
  744. MaskInfer = (vInferChannels);
  745. }
// CPU linear (gradient) blend of two I420 images of identical geometry into
// pResult. cvInferWeight / cvTargetWeight are per-pixel CV_32F weight maps;
// each output sample is (wI*I + wT*T) / (wI + wT). bUseAVX selects a
// hand-written AVX lane path, otherwise a scalar fallback is used.
// NOTE(review): pResult is released with delete[] and reallocated with new[]
// here, while the GPU path in this file allocates the result with cudaMalloc —
// the two must never be mixed. This appears to be the (currently disabled)
// CPU fallback of BlendingLinearYUVByGpu; verify ownership before re-enabling.
void ImageFusion::BlendingLinearYUV(unsigned char* pInfer, int nInferWidth, int nInferHeight, int nInferPitch, unsigned char* pTarget, int nTargetWidth, int nTargetHeight, int nTargetPitch, cv::Mat& cvInferWeight, cv::Mat& cvTargetWeight, unsigned char*& pResult, bool bUseAVX)
{
	// Pointers to the Y/U/V plane components of both inputs (I420 layout:
	// pitch*height bytes of Y, then U and V planes at a quarter size each).
	unsigned char* pInferY = pInfer;
	unsigned char* pInferU = pInfer + nInferPitch * nInferHeight;
	unsigned char* pInferV = pInfer + nInferPitch * nInferHeight + (nInferPitch * nInferHeight) / 4;
	unsigned char* pTargetY = pTarget;
	unsigned char* pTargetU = pTarget + nTargetPitch * nTargetHeight;
	unsigned char* pTargetV = pTarget + nTargetPitch * nTargetHeight * 5 / 4;
	// Assumes infer and target are the same size, evenly divisible for 4:2:0
	// chroma subsampling.
	// NOTE(review): these asserts join their conditions with ||, so they only
	// require ONE of the equalities to hold — presumably && was intended; confirm.
	assert(nInferHeight == nTargetHeight || nInferWidth == nTargetWidth);
	assert(nInferHeight == cvInferWeight.rows ||
		nInferWidth == cvInferWeight.cols ||
		nTargetHeight == cvTargetWeight.rows ||
		nTargetWidth == cvTargetWeight.cols
	);
	assert(nInferWidth % 2 == 0);
	assert(nInferHeight % 2 == 0);
	// (Re)allocate the destination buffer, sized like the infer image
	// (pitch * rows for Y plus half that again for U and V).
	if (pResult != NULL)
		delete[]pResult;
	pResult = new unsigned char[nInferPitch * cvInferWeight.rows * 3 / 2];
	// Plane pointers into the result buffer.
	unsigned char* pDstY = pResult;
	unsigned char* pDstU = pResult + nInferPitch * cvInferWeight.rows;
	unsigned char* pDstV = pResult + nInferPitch * cvInferWeight.rows * 5 / 4;
	int nDstPitch = nInferPitch;
	int nDstUVPitch = nInferPitch / 2;
	// Debug-only cv::Mat views over the raw buffers (not read below); the
	// "+2" width presumably accounts for pitch padding — TODO confirm.
	cv::Mat InferImage = cv::Mat(nInferHeight * 3 / 2, nInferWidth +2, CV_8U, pInferY);
	cv::Mat TargetImage = cv::Mat(nInferHeight * 3 / 2, nInferWidth +2, CV_8U, pTargetY);
	cv::Mat ResultImage = cv::Mat(nInferHeight * 3 / 2, nInferWidth+2, CV_8U, pDstY);
	if (bUseAVX)
	{
		// ---- Y plane, 8 pixels per iteration ----
		for (int i = 0; i < cvInferWeight.rows; i++)
		{
			for (int j = 0; j < cvInferWeight.cols; )
			{
				__m256 mInferWeight, mTargetWeight;
				int nOffsetMax = 0;
				if (j + 8 < cvInferWeight.cols)
				{
					// Full vector: load 8 contiguous weights per image.
					mInferWeight = _mm256_loadu_ps(cvInferWeight.ptr<float>(i) + j);
					mTargetWeight = _mm256_loadu_ps(cvTargetWeight.ptr<float>(i) + j);
					nOffsetMax = 8;
				}
				else
				{
					// Row tail: fill only the remaining lanes; the unused upper
					// lanes stay uninitialized but are never stored below.
					nOffsetMax = cvInferWeight.cols - j;
					for (int offset = 0; offset < nOffsetMax; offset++)
					{
						mInferWeight.m256_f32[offset] = cvInferWeight.ptr<float>(i)[j + offset];
						mTargetWeight.m256_f32[offset] = cvTargetWeight.ptr<float>(i)[j + offset];
					}
				}
				unsigned char* pInferPointer = pInferY + i * nInferPitch + j;
				unsigned char* pTargetPointer = pTarget + i * nTargetPitch + j;
				unsigned char* pResultPointer = pDstY + nDstPitch * i + j;
				// Widen the 8-bit luma samples to float, lane by lane.
				__m256 mTargetData, mInferData;
				for (int offset = 0; offset < nOffsetMax; offset++)
				{
					mTargetData.m256_f32[offset] = float(*(pTargetPointer + offset));
					mInferData.m256_f32[offset] = float(*( pInferPointer+ offset));
				}
				// Weighted sum, then normalize by the weight total per lane.
				__m256 Res,WeightAdd;
				Res = _mm256_add_ps(_mm256_mul_ps(mInferWeight, mInferData), _mm256_mul_ps(mTargetWeight, mTargetData));
				WeightAdd = _mm256_add_ps(mInferWeight, mTargetWeight);
				for (int offset = 0; offset < nOffsetMax; offset++)
				{
					*(pResultPointer + offset) = unsigned char(Res.m256_f32[offset]/WeightAdd.m256_f32[offset]);
				}
				j += 8;
			}
		}
		// ---- U/V planes: chroma is subsampled 2x, so step 2 rows and sample
		// every other weight column (16 weight columns -> 8 chroma lanes) ----
		for (int i = 0; i < cvInferWeight.rows; i+=2)
		{
			for (int j = 0; j < cvInferWeight.cols; )
			{
				// NOTE(review): dead guard — i is always even here, so this
				// never fires; if it ever did, `continue` without advancing j
				// would loop forever. Candidate for removal.
				if (i % 2 != 0 && j % 2 != 0)
					continue;
				// Chroma-plane coordinates for this (i, j).
				int X = j / 2;
				int Y = i / 2;
				__m256 mInferWeight, mTargetWeight;
				int nOffsetMax = 0;
				if (j + 16 < cvInferWeight.cols)
				{
					nOffsetMax = 16;
					// Gather every other weight into the low 8 lanes.
					for (int offset = 0; offset < nOffsetMax; offset += 2)
					{
						mInferWeight.m256_f32[offset / 2] = cvInferWeight.ptr<float>(i)[j + offset];
						mTargetWeight.m256_f32[offset / 2] = cvTargetWeight.ptr<float>(i)[j + offset];
					}
				}
				else
				{
					// Row tail: fewer than 16 weight columns remain.
					nOffsetMax = cvInferWeight.cols - j;
					for (int offset = 0; offset < nOffsetMax; offset += 2)
					{
						mInferWeight.m256_f32[offset / 2] = cvInferWeight.ptr<float>(i)[j + offset];
						mTargetWeight.m256_f32[offset / 2] = cvTargetWeight.ptr<float>(i)[j + offset];
					}
				}
				unsigned char* pInferPointerU = pInferU + Y * nInferPitch / 2 + X;
				unsigned char* pInferPointerV = pInferV + Y * nInferPitch / 2 + X;
				unsigned char* pTargetPointerU = pTargetU + Y * nTargetPitch / 2 + X;
				unsigned char* pTargetPointerV = pTargetV + Y * nTargetPitch / 2 + X;
				unsigned char* pResultPointerU = pDstU + Y * nDstUVPitch + X;
				unsigned char* pResultPointerV = pDstV + Y * nDstUVPitch + X;
				// NOTE(review): the loads below are crossed — mTargetData* reads
				// from the INFER pointers and mInferData* from the TARGET
				// pointers, so each image's chroma ends up multiplied by the
				// other image's weight. Suspected bug; confirm intent.
				__m256 mTargetDataU, mTargetDataV, mInferDataU, mInferDataV;
				for (int offset = 0; offset < nOffsetMax / 2; offset++)
				{
					mTargetDataU.m256_f32[offset] = float(*(pInferPointerU + offset));
					mTargetDataV.m256_f32[offset] = float(*(pInferPointerV + offset));
					mInferDataU.m256_f32[offset] = float(*(pTargetPointerU + offset));
					mInferDataV.m256_f32[offset] = float(*(pTargetPointerV + offset));
				}
				// Weighted sum and per-lane normalization, as in the Y loop.
				__m256 ResU, ResV, WeightAdd;
				ResU = _mm256_add_ps(_mm256_mul_ps(mInferWeight, mInferDataU), _mm256_mul_ps(mTargetWeight, mTargetDataU));
				ResV = _mm256_add_ps(_mm256_mul_ps(mInferWeight, mInferDataV), _mm256_mul_ps(mTargetWeight, mTargetDataV));
				WeightAdd = _mm256_add_ps(mInferWeight, mTargetWeight);
				for (int offset = 0; offset < nOffsetMax / 2; offset++)
				{
					*(pResultPointerU + offset) = unsigned char(ResU.m256_f32[offset]/ WeightAdd.m256_f32[offset]);
					*(pResultPointerV + offset) = unsigned char(ResV.m256_f32[offset] / WeightAdd.m256_f32[offset]);
				}
				j += 16;
			}
		}
	}
	else
	{
		// ---- Scalar fallback ----
		for (int i = 0; i < cvInferWeight.rows; i++)
		{
			for (int j = 0; j < cvInferWeight.cols; j++)
			{
				// Per-pixel weights for the Y sample.
				float fInferWeight = cvInferWeight.at<float>(i, j);
				float fTargetWeight = cvTargetWeight.at<float>(i, j);
				// Normalized weighted luma.
				float fY = fInferWeight * pInferY[i * nInferPitch + j] + fTargetWeight * pTargetY[i * nTargetPitch + j];
				float fWeight = fInferWeight + fTargetWeight;
				pDstY[i * nDstPitch + j] = unsigned char(fY / fWeight);
				// Chroma exists only at even (i, j): the 4:2:0 sampling grid.
				if (i % 2 == 0 && j % 2 == 0)
				{
					int X = j / 2;
					int Y = i / 2;
					// NOTE(review): the SOURCE chroma reads index both inputs
					// with nDstUVPitch (= nInferPitch / 2); that is only correct
					// for the target image when nTargetPitch == nInferPitch.
					float fU = fInferWeight * pInferU[Y * nDstUVPitch + X] +
						fTargetWeight * pTargetU[Y * nDstUVPitch + X];
					float fV = fInferWeight * pInferV[Y * nDstUVPitch + X] +
						fTargetWeight * pTargetV[Y * nDstUVPitch + X];
					// Map (X, Y) offsets back into real image coordinates.
					//int nTotal = Y * nDstUVPitch + X;
					//int nTrueY = nTotal / nDstPitch;
					//int nTrueX = nTotal % nDstPitch;
					pDstU[Y * nDstUVPitch + X] = unsigned char(fU / fWeight);
					pDstV[Y * nDstUVPitch + X] = unsigned char(fV / fWeight);
				}
			}
		}
	}
}
  913. void ImageFusion::BlendingLinearYUVByGpu(unsigned char* pInfer, int nInferWidth, int nInferHeight, int nInferPitch, unsigned char* pTarget, int nTargetWidth, int nTargetHeight, int nTargetPitch, cv::Mat& cvInferWeight, cv::Mat& cvTargetWeight, unsigned char*& pResult)
  914. {
  915. //默认infer 和 target的 大小一样的,且都能被整除,或者整采样
  916. assert(nInferHeight == nTargetHeight || nInferWidth == nTargetWidth);
  917. assert(nInferHeight == cvInferWeight.rows ||
  918. nInferWidth == cvInferWeight.cols ||
  919. nTargetHeight == cvTargetWeight.rows ||
  920. nTargetWidth == cvTargetWeight.cols
  921. );
  922. assert(nInferWidth % 2 == 0);
  923. //assert(nInferHeight % 2 == 0);
  924. if (nInferHeight % 2 != 0)
  925. {
  926. nInferHeight -= 1;
  927. nTargetHeight -= 1;
  928. }
  929. //首先将数据传入到GPU中
  930. float* pDevInferMask = NULL, * pDevTargetMask = NULL;
  931. cudaMalloc((void**)&pDevTargetMask,cvTargetWeight.step* cvTargetWeight.rows);
  932. cudaMalloc((void**)&pDevInferMask,cvInferWeight.step* cvInferWeight.rows);
  933. //传输
  934. cudaError Err = cudaMemcpy(pDevTargetMask, cvTargetWeight.data, cvTargetWeight.step * cvTargetWeight.rows, cudaMemcpyHostToDevice);
  935. Err = cudaMemcpy(pDevInferMask, cvInferWeight.data, cvInferWeight.step * cvInferWeight.rows, cudaMemcpyHostToDevice);
  936. if (Err != 0)
  937. int i = 10;
  938. //计算得出result的分区指针
  939. unsigned char* pDstY = pResult;
  940. unsigned char* pDstU = pResult + nInferPitch * cvInferWeight.rows;
  941. unsigned char* pDstV = pResult + nInferPitch * cvInferWeight.rows * 5 / 4;
  942. int nDstPitch = nInferPitch;
  943. int nDstUVPitch = nInferPitch / 2;
  944. //获取YUV三个通道的分量的指针
  945. unsigned char* pInferY = pInfer;
  946. unsigned char* pInferU = pInfer + nInferPitch * nInferHeight;
  947. unsigned char* pInferV = pInfer + nInferPitch * nInferHeight * 5 / 4;
  948. unsigned char* pTargetY = pTarget;
  949. unsigned char* pTargetU = pTarget + nTargetPitch * nTargetHeight;
  950. unsigned char* pTargetV = pTarget + nTargetPitch * nTargetHeight * 5 / 4;
  951. YUVTailorAndBlender::GradientBlenderYUV(
  952. pInferY, nInferPitch,
  953. pInferU, nInferPitch / 2,
  954. pInferV, nInferPitch / 2,
  955. pTargetY, nTargetPitch,
  956. pTargetU, nTargetPitch / 2,
  957. pTargetV, nTargetPitch / 2,
  958. cvInferWeight.cols, cvInferWeight.rows,
  959. pDevInferMask, pDevTargetMask, cvInferWeight.step,
  960. pDstY, nDstPitch,
  961. pDstU, nDstUVPitch,
  962. pDstV, nDstUVPitch,
  963. cvInferWeight.cols, cvInferWeight.rows
  964. );
  965. //释放内存
  966. cudaFree(pDevInferMask);
  967. cudaFree(pDevTargetMask);
  968. }
  969. void ImageFusion::PastePartInWholeImageYUV(unsigned char* pPanoImg, int nPanoWidth, int nPanoHeight, int nPanoPitch, unsigned char* pPartImg, int nPartWidth, int nPartHeight, int nPartPitch, int nLeft, int nTop, bool bUseAVX)
  970. {
  971. //chatgtp
  972. uint8_t* dst_y = pPanoImg; // 源Y平面地址
  973. uint8_t* dst_u = pPanoImg + nPanoHeight * nPanoPitch; // 源U平面地址
  974. uint8_t* dst_v = pPanoImg + nPanoHeight * nPanoPitch * 5 / 4; // 源V平面地址
  975. int dst_stride_y = nPanoPitch; // 源Y平面跨距
  976. int dst_stride_u = nPanoPitch >> 1; // 源U平面跨距
  977. int dst_stride_v = nPanoPitch >> 1; // 源V平面跨距
  978. uint8_t* src_y = pPartImg; // 目标Y平面地址
  979. uint8_t* src_u = pPartImg + nPartPitch * nPartHeight; // 目标U平面地址
  980. uint8_t* src_v = pPartImg + nPartPitch * nPartHeight * 5 / 4; // 目标V平面地址
  981. int src_stride_y = nPartPitch; // 目标Y平面跨距
  982. int src_stride_u = nPartPitch >> 1; // 目标U平面跨距
  983. int src_stride_v = nPartPitch >> 1; // 目标V平面跨距
  984. int width = nPartWidth; // 拷贝区域宽度
  985. int height = nPartHeight; // 拷贝区域高度
  986. //定位到目标的位置
  987. unsigned char* dst_target_y = dst_y + nTop * nPanoPitch + nLeft;
  988. unsigned char* dst_target_u = dst_u + nTop * nPanoPitch / 4 + nLeft / 2;
  989. unsigned char* dst_target_v = dst_v + nTop * nPanoPitch / 4 + nLeft / 2;
  990. // 拷贝Y平面
  991. libyuv::CopyPlane(
  992. src_y, src_stride_y,
  993. dst_target_y, dst_stride_y,
  994. width, height);
  995. // 拷贝U平面
  996. libyuv::CopyPlane(
  997. src_u, src_stride_u,
  998. dst_target_u, dst_stride_u,
  999. width / 2, height / 2);
  1000. // 拷贝V平面
  1001. libyuv::CopyPlane(
  1002. src_v, src_stride_v,
  1003. dst_target_v, dst_stride_v,
  1004. width / 2, height / 2);
  1005. }
  1006. void ImageFusion::PastePartInWholeImageYUVByGpu(unsigned char* pPanoImg, int nPanoWidth, int nPanoHeight,
  1007. int nPanoPitch, unsigned char* pPartImg, int nPartWidth,
  1008. int nPartHeight, int nPartPitch,
  1009. int nLeft, int nTop, CUstream* pStream)
  1010. {
  1011. unsigned char* pDstY = pPanoImg;
  1012. unsigned char* pDstU = pPanoImg + nPanoPitch * nPanoHeight;
  1013. unsigned char* pDstV = pPanoImg + nPanoPitch * nPanoHeight * 5 / 4;
  1014. int dst_stride_y = nPanoPitch; // 源Y平面跨距
  1015. int dst_stride_u = nPanoPitch >> 1; // 源U平面跨距
  1016. int dst_stride_v = nPanoPitch >> 1; // 源V平面跨距
  1017. uint8_t* src_y = pPartImg; // 目标Y平面地址
  1018. uint8_t* src_u = pPartImg + nPartPitch * nPartHeight; // 目标U平面地址
  1019. uint8_t* src_v = pPartImg + nPartPitch * nPartHeight * 5 / 4; // 目标V平面地址
  1020. int src_stride_y = nPartPitch; // 目标Y平面跨距
  1021. int src_stride_u = nPartPitch >> 1; // 目标U平面跨距
  1022. int src_stride_v = nPartPitch >> 1; // 目标V平面跨距
  1023. int width = nPartWidth; // 拷贝区域宽度
  1024. int height = nPartHeight; // 拷贝区域高度
  1025. YUVTailorAndBlender::CopyPlane(
  1026. pPartImg, nPartWidth, nPartHeight, nPartPitch,
  1027. pPanoImg, nPanoWidth, nPanoHeight, nPanoPitch,
  1028. nPartWidth, nPartHeight,
  1029. nLeft, nTop, 1, NULL
  1030. );
  1031. YUVTailorAndBlender::CopyPlane(
  1032. src_u, nPartWidth / 2, nPartHeight / 2, nPartPitch / 2,
  1033. pDstU, nPanoWidth / 2, nPanoHeight / 2, nPanoPitch / 2,
  1034. nPartWidth / 2, nPartHeight / 2,
  1035. nLeft / 2, nTop / 2, 1, NULL
  1036. );
  1037. YUVTailorAndBlender::CopyPlane(
  1038. src_v, nPartWidth / 2, nPartHeight / 2, nPartPitch / 2,
  1039. pDstV, nPanoWidth / 2, nPanoHeight / 2, nPanoPitch / 2,
  1040. nPartWidth / 2, nPartHeight / 2,
  1041. nLeft / 2, nTop / 2, 1, NULL
  1042. );
  1043. }
  1044. void ImageFusion::Init_Gpu(unsigned char* pImage, int nPanoWidth, int nPanoHeight, int nPanoPitch)
  1045. {
  1046. pPanoImageBufferGpu = pImage;
  1047. this->nPanoWidth = nPanoWidth;
  1048. this->nPanoHeight = nPanoHeight;
  1049. this->nPanoPitch = nPanoPitch;
  1050. }