HDR Tone Mapping with CUDA 5

In this example, for the sake of learning, we are going to butcher some great images. Let's describe the problem first.We are going to take some HDR images and modify their luminosity to lighten them. We could also darken the images, or apply changes only to parts of the images,

C++ (117.3 MB)
 
 
 
 
 
(0)
2,781 times
Add to favorites
3/8/2013
E-mail Twitter del.icio.us Digg Facebook

Solution explorer

C++
#include "stdafx.h"
#include "HdrTone.h"

using namespace Bisque;

using cv::Mat;
using cv::cvtColor;
using cv::imread;
using cv::imwrite;

using std::cout;
using std::endl;
using std::expf;
using std::exception;
using std::runtime_error;
using std::unique_ptr;

using std::chrono::duration_cast;
using std::chrono::milliseconds;
using std::chrono::microseconds;
using std::chrono::high_resolution_clock;
using std::chrono::time_point;

using concurrency::parallel_for;
using concurrency::parallel_for_each;

using thrust::pair;
using thrust::exclusive_scan;
using thrust::minmax_element;

// Forward declarations
void Run_LuminanceDistributionComputation(
		unsigned int* const			cdf,					// return value: luminance cumulative distribution
		unsigned int* const			histogram,				// return value: histogram of cumulative luminance distribution
		float*						lumMin,					// return value: luminance min 
		float*						lumMax,					// return value: luminance max
		float*						lumRange,				// return value: luminance range
		const float* const			luminance,				// log of luminance values for each pixel
		size_t						rows,					// image size: number of rows
		size_t						cols,					// image size: number of columns
		unsigned int				bins					// number of bins for the histogram
	);

void Run_PostProcessing(
		float* const				red,					// return value: red channel after tone mapping
		float* const				green,					// return value: green channel after tone mapping
		float* const				blue,					// return value: blue channel after tone mapping
		float* const				normalizedCdf,			// return value: luminance cumulative distribution
		const unsigned int* const	cdf,					// luminance cumulative distribution
		const float* const			x,						// x component of xyY
		const float* const			y,						// y component of xyY
		const float* const			luminance,				// Y component of xyY: luminance logarithm
		const float* const			lumMin,					// luminance min 
		const float* const			lumMax,					// luminance max
		const float* const			lumRange,				// luminance range
		unsigned int				numBins,				// number of bins for the histogram
		size_t						rows,					// image size: number of rows
		size_t						cols					// image size: number of columns
	);

void Run_rgb2xyY(
		float* const				x,						// return value: x component of xyY
		float* const				y,						// return value: y component of xyY
		float* const				luminance,				// return value: Y component of xyY: luminance logarithm
		const float* const			red,					// input for the conversion
		const float* const			green,					// input for the conversion
		const float* const			blue,					// input for the conversion
		size_t						rows,					// image size: number of rows
		size_t						cols					// image size: number of columns
	);

// Ctor
HdrTone::HdrTone(void)
{
}

HdrTone::~HdrTone(void)
{
}

// Applies gaussian blur to an image
void HdrTone::ApplyFilter(const string& imagePath, const string& outputPath)
{
    const char* func = "HdrTone::ApplyFilter";

	cudaError hr = cudaFree(nullptr);																			CHECK_CUDA_ERROR(hr, func, "Could not free CUDA memory.");

		GpuTimer gpuTimer;
		time_point<high_resolution_clock> start;
		time_point<high_resolution_clock> stop;

	// Load image
		start = high_resolution_clock::now();

	LoadHdrImage(imagePath, m_host);

		stop = high_resolution_clock::now();
		long long ms = duration_cast<milliseconds>(stop - start).count();
		long long us = duration_cast<microseconds>(stop - start).count();

		cout << "Loaded HDR image in " << us << "us (" << ms << "ms)" << endl;

	// Convert RGB to xyY (chrominance/luminosity)
		start = high_resolution_clock::now();
		gpuTimer.Start();

	rgb2xyY(m_host);

		gpuTimer.Stop();
		stop = high_resolution_clock::now();
		ms = duration_cast<milliseconds>(stop - start).count();
		us = duration_cast<microseconds>(stop - start).count();

		cout << "rgb to xyY in " << us << "us (" << ms << "ms); gpu time: " << gpuTimer.Elapsed() << "ms" << endl;


	// Compute cumulative luminance distribution
		start = high_resolution_clock::now();

		gpuTimer.Start();

	ComputeCDF();

		gpuTimer.Stop();

		stop = high_resolution_clock::now();
		ms = duration_cast<milliseconds>(stop - start).count();
		us = duration_cast<microseconds>(stop - start).count();

		cout << "GPU luminance CDF in " << us << "us (" << ms << "ms); gpu time: " << gpuTimer.Elapsed() << "ms" << endl;

	// Post-process image to apply tone map
		start = high_resolution_clock::now();

	PostProcess();

		stop = high_resolution_clock::now();
		ms = duration_cast<milliseconds>(stop - start).count();
		us = duration_cast<microseconds>(stop - start).count();

		cout << "Post-Processed in " << us << "us (" << ms << "ms)" << endl;

	// Save modifiedImage image to disk
		start = high_resolution_clock::now();

	SaveImage(outputPath);

		stop = high_resolution_clock::now();
		ms = duration_cast<milliseconds>(stop - start).count();
		us = duration_cast<microseconds>(stop - start).count();

		cout << "SaveImage in " << us << "us (" << ms << "ms)" << endl;


#if 0			// Change to 1 to enable
	// Validate GPU convertion against CPU result.
	// Only turn it when you want to run validation because CPU calculation will be slow.
		start = high_resolution_clock::now();

	VerifyGpuComputation();

		stop = high_resolution_clock::now();
		ms = duration_cast<milliseconds>(stop - start).count();
		us = duration_cast<microseconds>(stop - start).count();

		cout << "CPU luminance CDF (plus checks!!) in " << us << "us (" << ms << "ms)" << endl;
#endif

	// cleanup
	hr = cudaFree(m_device.luminance);
	hr = cudaFree(m_device.x);
	hr = cudaFree(m_device.y);
	hr = cudaFree(m_device.cdf);
	hr = cudaFree(m_device.normalizedCdf);
	hr = cudaFree(m_device.histogram);
	hr = cudaFree(m_device.lumMax);
	hr = cudaFree(m_device.lumMin);
	hr = cudaFree(m_device.lumRange);
}

// Prepares luminance cdf (cumulative distribution function) calculation and calls CUDA kernel.
//
// Example
//
// input : [2 4 3 3 1 7 4 5 7 0 9 4 3 2]
// min / max / range: 0 / 9 / 9
// 
// histogram with 3 bins: [4 7 3]
// 
// cdf : [4 11 14]
//
// Calculation steps are given in the kernel code
//
void HdrTone::ComputeCDF()
{
    const char* func = "HdrTone::ComputeCDF";

	cudaError hr = cudaSuccess;

	//
	// Calculate luminance min/max and range
	//
	// NOTE: We are using thrust library here 
	vector<float> luminance(m_host.numPixels);
	hr = cudaMemcpy(luminance.data(), m_device.luminance, m_host.numPixels * sizeof(float), cudaMemcpyDeviceToHost);				CHECK_CUDA_ERROR(hr, func, "Could not copy luminance log from devicee to host.");

	using thrust::pair;
	using thrust::device_ptr;
	using thrust::minmax_element;

	pair<float*, float*> lumMinMax = minmax_element(
		luminance.data(), 
		luminance.data() + m_host.numPixels
		);

	float lumMax   = *lumMinMax.second;
	float lumMin   = *lumMinMax.first;
	float lumRange = lumMax - lumMin;

	// Allocate memory for histogram and cdf
	size_t size = NumBins * sizeof(unsigned int);

	hr = cudaMalloc(&m_device.histogram,	size);																					CHECK_CUDA_ERROR(hr, func, "Could not allocate CUDA memory.");
	hr = cudaMalloc(&m_device.cdf,	        size);																					CHECK_CUDA_ERROR(hr, func, "Could not allocate CUDA memory.");

	hr = cudaMemset( m_device.histogram, 0, size);																					CHECK_CUDA_ERROR(hr, func, "Could not set CUDA memory.");
	hr = cudaMemset( m_device.cdf,       0, size);																					CHECK_CUDA_ERROR(hr, func, "Could not set CUDA memory.");

	// Luminance min / max / range
	hr = cudaMalloc(&m_device.lumMax,      sizeof(float));																			CHECK_CUDA_ERROR(hr, func, "Couild not allocate CUDA memory.");
	hr = cudaMalloc(&m_device.lumMin,      sizeof(float));																			CHECK_CUDA_ERROR(hr, func, "Couild not allocate CUDA memory.");
	hr = cudaMalloc(&m_device.lumRange,    sizeof(float));																			CHECK_CUDA_ERROR(hr, func, "Couild not allocate CUDA memory.");

	//hr = cudaMemset( m_device.lumMax,   0, sizeof(float));																		CHECK_CUDA_ERROR(hr, func, "Couild not set CUDA memory.");
	//hr = cudaMemset( m_device.lumMin,   0, sizeof(float));																		CHECK_CUDA_ERROR(hr, func, "Couild not set CUDA memory.");
	//hr = cudaMemset( m_device.lumRange, 0, sizeof(float));																		CHECK_CUDA_ERROR(hr, func, "Couild not set CUDA memory.");

	hr = cudaMemcpy(m_device.lumMax,   (void*)&lumMax,   sizeof(float), cudaMemcpyHostToDevice);									CHECK_CUDA_ERROR(hr, func, "Couild not copy from host to device.");
	hr = cudaMemcpy(m_device.lumMin,   (void*)&lumMin,   sizeof(float), cudaMemcpyHostToDevice);									CHECK_CUDA_ERROR(hr, func, "Couild not copy from host to device.");
	hr = cudaMemcpy(m_device.lumRange, (void*)&lumRange, sizeof(float), cudaMemcpyHostToDevice);									CHECK_CUDA_ERROR(hr, func, "Couild not copy from host to device.");

	//
	// Compute luminance cdf
	//
	Run_LuminanceDistributionComputation(
		m_device.cdf,							// return value: luminance cumulative distribution
		m_device.histogram,						// return value: histogram of cumulative luminance distribution
		m_device.lumMin,						// return value: luminance min 
		m_device.lumMax,						// return value: luminance max
		m_device.lumRange,						// return value: luminance range
		m_device.luminance,						// log of luminance values for each pixel
		m_host.rows,							// image size: number of rows
		m_host.cols,							// image size: number of columns
		NumBins									// number of bins for the histogram
	);
}

// Loads HDR image from disk
void HdrTone::LoadHdrImage(const string& imagePath, HostData& hdr)
{
    const char* func = "HdrTone::LoadHdrImage";

	Mat image = imread(imagePath.c_str(), CV_LOAD_IMAGE_COLOR | CV_LOAD_IMAGE_ANYDEPTH);

	if (image.empty() || (image.channels() != 3) || (!image.isContinuous()))
	{
		string msg = "Could not open image file: " + imagePath;
		throw runtime_error(msg);
	}

	// Image size
	hdr.rows		= image.rows;
	hdr.cols		= image.cols;
	hdr.numPixels	= image.rows * image.cols;

	// Copy image into a flat structure we could use
	hdr.red  .resize(hdr.numPixels);
	hdr.green.resize(hdr.numPixels);
	hdr.blue .resize(hdr.numPixels);

	float* tmp = image.ptr<float>(0);				// temporary pointer

	// The image is BGR
	parallel_for(size_t(0), hdr.numPixels, [&hdr, &tmp](size_t i)
	{
		hdr.blue [i] = tmp[i * 3 + 0];
		hdr.green[i] = tmp[i * 3 + 1];
		hdr.red  [i] = tmp[i * 3 + 2];
	});

#if 0
	for (int i = 0; i < 1000; ++i)
	{
		wchar_t text[MAX_PATH];
		swprintf_s(text, L"   [%d] r=%f g=%f b=%f\n", i, hdr.red[i], hdr.green[i], hdr.blue[i]);
		OutputDebugString(text);
	}
#endif
}

// Post-processes the image and applies tone map
void HdrTone::PostProcess()
{
    const char* func = "HdrTone::ComputeCDF";

	cudaError hr = cudaSuccess;

	// Normalized CDF: norm[idx] = cdf[idx] * k, where k = 1 / [biggest cdf value]
	hr = cudaMalloc(&m_device.normalizedCdf, NumBins * sizeof(float));														CHECK_CUDA_ERROR(hr, func, "Could not allocate normalized cdf memory.");

	// Allocate memory for the output rgb channels.
	// We'll reuse red, green, blue device pointers from the original image as
	// we have release them after conversion from rgb to xyY.

	size_t size = m_host.numPixels * sizeof(float);

	hr = cudaMalloc(&m_device.red,   size);																					CHECK_CUDA_ERROR(hr, func, "Could not allocate CUDA memory.");
	hr = cudaMalloc(&m_device.green, size);																					CHECK_CUDA_ERROR(hr, func, "Could not allocate CUDA memory.");
	hr = cudaMalloc(&m_device.blue,  size);																					CHECK_CUDA_ERROR(hr, func, "Could not allocate CUDA memory.");

	// Run normalization and tone map kernels
	Run_PostProcessing(
		m_device.red,
		m_device.green,
		m_device.blue,
		m_device.normalizedCdf, 
		m_device.cdf, 
		m_device.x,
		m_device.y,
		m_device.luminance,
		m_device.lumMin,
		m_device.lumMax,
		m_device.lumRange,
		NumBins,
		m_host.rows,
		m_host.cols
		);

	// Copy red, green, and blue channels to the host overwriting original image
	hr = cudaMemcpy(m_host.red.data(),   m_device.red,   size, cudaMemcpyDeviceToHost);										CHECK_CUDA_ERROR(hr, func, "Could not copy from device to the host.");
	hr = cudaMemcpy(m_host.green.data(), m_device.green, size, cudaMemcpyDeviceToHost);										CHECK_CUDA_ERROR(hr, func, "Could not copy from device to the host.");
	hr = cudaMemcpy(m_host.blue.data(),  m_device.blue,  size, cudaMemcpyDeviceToHost);										CHECK_CUDA_ERROR(hr, func, "Could not copy from device to the host.");
}

// Loads image from the disk and allocates memory for original and modified images on the host and the device.
void HdrTone::rgb2xyY(const HostData& hdr)
{
    const char* func = "HdrTone::rgb2xyY";

	cudaError hr = cudaSuccess;

	// Convert RGB to xyY (chromacity-luminance space)
	size_t size = hdr.numPixels * sizeof(float);			// channel size

	hr = cudaMalloc(&m_device.red,			size);																CHECK_CUDA_ERROR(hr, func, "Could not allocate device memory.");
	hr = cudaMalloc(&m_device.green,		size);																CHECK_CUDA_ERROR(hr, func, "Could not allocate device memory.");
	hr = cudaMalloc(&m_device.blue,			size);																CHECK_CUDA_ERROR(hr, func, "Could not allocate device memory.");
	hr = cudaMalloc(&m_device.x,			size);																CHECK_CUDA_ERROR(hr, func, "Could not allocate device memory.");
	hr = cudaMalloc(&m_device.y,			size);																CHECK_CUDA_ERROR(hr, func, "Could not allocate device memory.");
	hr = cudaMalloc(&m_device.luminance,	size);																CHECK_CUDA_ERROR(hr, func, "Could not allocate device memory.");

	hr = cudaMemcpy(m_device.blue,  &hdr.blue[0],  size, cudaMemcpyHostToDevice);								CHECK_CUDA_ERROR(hr, func, "Could not copy blue channel from host to device.");
	hr = cudaMemcpy(m_device.green, &hdr.green[0], size, cudaMemcpyHostToDevice);								CHECK_CUDA_ERROR(hr, func, "Could not copy green channel from host to device.");
	hr = cudaMemcpy(m_device.red,   &hdr.red[0],   size, cudaMemcpyHostToDevice);								CHECK_CUDA_ERROR(hr, func, "Could not copy red channel from host to device.");

	Run_rgb2xyY(
		m_device.x,
		m_device.y,
		m_device.luminance,
		m_device.red,
		m_device.green,
		m_device.blue,
		hdr.rows,
		hdr.cols
	);

	// clean up
	hr = cudaFree(m_device.red);																				CHECK_CUDA_ERROR(hr, func, "Failed to free CUDA resource.");
	hr = cudaFree(m_device.green);																				CHECK_CUDA_ERROR(hr, func, "Failed to free CUDA resource.");
	hr = cudaFree(m_device.blue);																				CHECK_CUDA_ERROR(hr, func, "Failed to free CUDA resource.");
}

/// Saves tone-mapped HDR image to the disk
void HdrTone::SaveImage(const string& imagePath)
{
	// Combine image from rgb channels
	vector<float> image(m_host.numPixels * 3);

	float* tmp   = image.data();
	float* red   = m_host.red.data();
	float* green = m_host.green.data();
	float* blue  = m_host.blue.data();

	// Image has to be saved in BGR format
	parallel_for(size_t(0), m_host.numPixels, [&tmp, &red, &green, &blue](size_t i)
	{
		tmp[i * 3 + 0] = blue [i];
		tmp[i * 3 + 1] = green[i];
		tmp[i * 3 + 2] = red  [i];
	});

	// Copy to OpenCV
	int size[] = {
		m_host.rows,
		m_host.cols
	};

	Mat hdr(2, size, CV_32FC3, reinterpret_cast<void*>(tmp));

	//hdr = hdr * 0.5f;

	imwrite(imagePath.c_str(), hdr);
}

// Validates GPU computation by running algorithm on CPU
void HdrTone::VerifyGpuComputation()
{
    const char* func = "HdrTone::VerifyGpuComputation";
	cudaError hr = cudaSuccess;
	wchar_t text[MAX_PATH];

	// luminance
	vector<float> luminance(m_host.numPixels);
	hr = cudaMemcpy(luminance.data(), m_device.luminance, m_host.numPixels * sizeof(float), cudaMemcpyDeviceToHost);							CHECK_CUDA_ERROR(hr, func, "Failed to copy from device to host.");

	// Copmute luminance min, max, and range.
	float lumMax;
	float lumMin;
	float lumRange;

	hr = cudaMemcpy(&lumMin,   m_device.lumMin,   sizeof(float), cudaMemcpyDeviceToHost);														CHECK_CUDA_ERROR(hr, func, "Couild not copy from device to host.");
	hr = cudaMemcpy(&lumMax,   m_device.lumMax,   sizeof(float), cudaMemcpyDeviceToHost);														CHECK_CUDA_ERROR(hr, func, "Couild not copy from device to host.");
	hr = cudaMemcpy(&lumRange, m_device.lumRange, sizeof(float), cudaMemcpyDeviceToHost);														CHECK_CUDA_ERROR(hr, func, "Couild not copy from device to host.");

	swprintf_s(text, L"   device min/max: min = %f --  max = %f -- range = %f\n", lumMin, lumMax, lumRange);
	OutputDebugString(text);

	// Cheating here by using thrust library
	pair<float*, float*> lumMinMax = minmax_element(luminance.data(), luminance.data() + m_host.numPixels);
	lumMax   = *lumMinMax.second;
	lumMin   = *lumMinMax.first;
	lumRange = *lumMinMax.second - *lumMinMax.first;

	swprintf_s(text, L"   thrust min/max: min = %f --  max = %f -- range = %f\n", lumMin, lumMax, lumRange);
	OutputDebugString(text);

	// Same computation on CPU
	lumMax = lumMin = luminance[0];

	for (int i = 0; i < m_host.numPixels; ++i)
	{
		lumMax = std::max(lumMax, luminance[i]);
		lumMin = std::min(lumMin, luminance[i]);
	}

	lumRange = lumMax - lumMin;

	swprintf_s(text, L"   cpu min/max:    min = %f --  max = %f -- range = %f\n", lumMin, lumMax, lumRange);
	OutputDebugString(text);

	// Device histogram
	vector<unsigned int> d_histogram(NumBins, 0);
	hr = cudaMemcpy(d_histogram.data(), m_device.histogram, NumBins * sizeof(unsigned int), cudaMemcpyDeviceToHost);							CHECK_CUDA_ERROR(hr, func, "Failed to copy from device to host.");

	// Host histogram
	vector<unsigned int> histogram(NumBins, 0);

	for (int i = 0; i < m_host.numPixels; ++i)
	{
		unsigned int bin = static_cast<unsigned int>((luminance[i] - lumMin) / lumRange * NumBins);
		bin = std::min(bin, NumBins - 1);
		histogram[bin]++;
	}

	for (int i = 0; i < NumBins; ++i)
	{
		//swprintf_s(text, L"   %d -- %d : %d\n", d_histogram[i], histogram[i], d_histogram[i] - histogram[i]);
		//OutputDebugString(text);

		if (d_histogram[i] != histogram[i])
		{
			throw runtime_error("Histogram on GPU has incorrect values.");
		}
	}

	swprintf_s(text, L"   histograms on GPU and CPU are identical!\n");
	OutputDebugString(text);

	//
	// Compute cdf using exclusive scan (prefix sum)
	//

	// On the host
	vector<unsigned int> cdf(NumBins, 0);

	for (int i = 1; i < NumBins; ++i)
	{
		cdf[i] = cdf[i - 1] + histogram[i - 1];
	}

	// With thrust
	vector<unsigned int> t_cdf(NumBins, 0);

	exclusive_scan(
		histogram.data(),
		histogram.data() + NumBins,
		t_cdf.data()
		);

	// On the device using Hillis & Steele algorithm
	vector<unsigned int> d_cdf(NumBins);
	hr = cudaMemcpy(d_cdf.data(), m_device.cdf, NumBins * sizeof(unsigned int), cudaMemcpyDeviceToHost);						CHECK_CUDA_ERROR(hr, func, "Failed to copy from device to host.");

	for (int i = 0; i < NumBins; ++i)
	{
		//swprintf_s(text, L"   %d\t%d\t%d\n", cdf[i], t_cdf[i], d_cdf[i]);
		//swprintf_s(text, L"   %d\t%d\n", histogram[i], d_cdf[i]);
		//OutputDebugString(text);

		if (d_cdf[i] != cdf[i])
		{
			throw runtime_error("CDF on GPU has incorrect values.");
		}
	}

	swprintf_s(text, L"   CDF's on GPU and CPU are identical!\n");
	OutputDebugString(text);

	// Normalize cdf: 1 / [biggest cdf number]
	vector<float> norm(NumBins);
	hr = cudaMemcpy(norm.data(), m_device.normalizedCdf, NumBins * sizeof(float), cudaMemcpyDeviceToHost);								CHECK_CUDA_ERROR(hr, func, "Failed to copy from device to host.");
	/*
	for (int i = 0; i < NumBins; ++i)
	{
		swprintf_s(text, L"   %d\t%f\n", cdf[i], norm[i]);
		OutputDebugString(text);
	}
	*/
}