// Professional Documents
// Culture Documents
// Cuda
// Cuda
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h> // time library
#include <unistd.h>

#include <chrono>
#include <fstream>
#include <iostream>
#include <thread>

#include <cuda_runtime.h>
/**
 * Load a binary PGM ("P5", 1 channel) or PPM ("P6", 3 channels) image.
 *
 * @param file      Path of the image file to read.
 * @param data      In/out pixel buffer. When *data is NULL a buffer of
 *                  width * height * channels bytes is allocated with malloc()
 *                  and the caller owns it (release with free()). When *data is
 *                  non-NULL it is reused, and *w / *h must already match the
 *                  dimensions stored in the file.
 * @param w         Receives the image width in pixels.
 * @param h         Receives the image height in pixels.
 * @param channels  Receives 1 for P5, 3 for P6, and 0 for any other magic number.
 * @return true when the header and all pixel bytes were read; false otherwise
 *         (a message is printed to stderr and the file handle is closed).
 *
 * Only one byte per sample is supported, so files with maxval > 255 are rejected.
 */
inline bool loadPPM(const char *file, unsigned char **data, unsigned int *w,
                    unsigned int *h, unsigned int *channels)
{
    // Size of the line buffer used while parsing the text header.
    enum { kHeaderSize = 0x40 };

    FILE *fp = fopen(file, "rb");
    if (!fp) {
        fprintf(stderr, "__LoadPPM() : unable to open file\n");
        return false;
    }

    // The first header line holds the magic number; it has to be read before
    // it can be compared.
    char header[kHeaderSize];
    if (fgets(header, kHeaderSize, fp) == NULL) {
        fprintf(stderr, "__LoadPPM() : reading PGM header returned NULL\n");
        fclose(fp);
        return false;
    }

    if (strncmp(header, "P5", 2) == 0) {
        *channels = 1;
    } else if (strncmp(header, "P6", 2) == 0) {
        *channels = 3;
    } else {
        fprintf(stderr, "__LoadPPM() : File is not a PPM or PGM image\n");
        *channels = 0;
        fclose(fp);
        return false;
    }

    // Parse width, height and maxval. They may share one line or be spread
    // over several lines, optionally separated by '#' comment lines.
    unsigned int width = 0;
    unsigned int height = 0;
    unsigned int maxval = 0;
    unsigned int i = 0; // number of header values parsed so far

    while (i < 3) {
        if (fgets(header, kHeaderSize, fp) == NULL) {
            fprintf(stderr, "__LoadPPM() : reading PGM header returned NULL\n");
            fclose(fp);
            return false;
        }
        if (header[0] == '#') {
            continue;
        }

        int parsed = 0;
        if (i == 0) {
            parsed = sscanf(header, "%u %u %u", &width, &height, &maxval);
        } else if (i == 1) {
            parsed = sscanf(header, "%u %u", &height, &maxval);
        } else {
            parsed = sscanf(header, "%u", &maxval);
        }
        // sscanf returns EOF (-1) on an empty line; adding that to the unsigned
        // counter would wrap around and end the loop with garbage values.
        if (parsed > 0) {
            i += (unsigned int)parsed;
        }
    }

    if (width == 0 || height == 0 || maxval == 0 || maxval > 255) {
        fprintf(stderr, "__LoadPPM() : Unsupported image header.\n");
        fclose(fp);
        return false;
    }

    const size_t count = (size_t)width * height * (*channels);

    // Reuse the caller's buffer when one is supplied, otherwise allocate.
    bool allocated = false;
    if (NULL != *data) {
        if (*w != width || *h != height) {
            fprintf(stderr, "__LoadPPM() : Invalid image dimensions.\n");
            fclose(fp);
            return false;
        }
    } else {
        *data = (unsigned char *)malloc(count);
        if (NULL == *data) {
            fprintf(stderr, "__LoadPPM() : Out of memory.\n");
            fclose(fp);
            return false;
        }
        allocated = true;
    }

    if (fread(*data, sizeof(unsigned char), count, fp) != count) {
        fprintf(stderr, "__LoadPPM() : read data returned error.\n");
        if (allocated) {
            free(*data);
            *data = NULL;
        }
        fclose(fp);
        return false;
    }

    *w = width;
    *h = height;

    fclose(fp);
    return true;
}
/**
 * Write an image as binary PGM ("P5", 1 channel) or PPM ("P6", 3 channels).
 *
 * @param file      Path of the file to create or overwrite.
 * @param data      Pixel bytes, w * h * channels of them; must not be NULL.
 * @param w         Image width in pixels; must be > 0.
 * @param h         Image height in pixels; must be > 0.
 * @param channels  1 selects the P5 header, 3 selects P6; anything else fails.
 * @return true when header and pixel data were written; false otherwise
 *         (a message is printed to stderr).
 */
inline bool savePPM(const char *file, unsigned char *data, unsigned int w,
                    unsigned int h, unsigned int channels)
{
    assert(NULL != data);
    assert(w > 0);
    assert(h > 0);

    // Reject an unsupported layout before touching the file system, so that a
    // failed call does not leave an empty file behind.
    if (channels != 1 && channels != 3) {
        fprintf(stderr, "__savePPM() : Invalid number of channels.\n");
        return false;
    }

    std::fstream fh(file, std::fstream::out | std::fstream::binary);
    // A failed open sets failbit, not badbit, so check is_open().
    if (!fh.is_open()) {
        fprintf(stderr, "__savePPM() : Opening file failed.\n");
        return false;
    }

    fh << (channels == 1 ? "P5\n" : "P6\n");
    // 0xff is streamed as the decimal maxval 255.
    fh << w << "\n" << h << "\n" << 0xff << "\n";

    const size_t count = (size_t)w * h * channels;
    fh.write(reinterpret_cast<const char *>(data), (std::streamsize)count);
    fh.flush();

    if (!fh.good()) {
        fprintf(stderr, "__savePPM() : Writing data failed.\n");
        return false;
    }

    fh.close();
    return true;
}
// Compile-time tiling and filter configuration shared by the box-filter kernels
// and by the launch setup in main().
#define TILE_W 16 // image columns per block; main() derives the grid width from it
#define TILE_H 16 // image rows per block; main() derives the grid height from it
#define Rx 2 // filter radius in x direction
#define Ry 2 // filter radius in y direction
#define FILTER_W (Rx*2+1) // filter diameter in x direction
#define FILTER_H (Ry*2+1) // filter diameter in y direction
#define S (FILTER_W*FILTER_H) // filter size: number of taps in the FILTER_W x FILTER_H window
#define BLOCK_W (TILE_W+(2*Rx)) // tile width plus a border of Rx on each side; used as block/shared-array width
#define BLOCK_H (TILE_H+(2*Ry)) // tile height plus a border of Ry on each side; used as block/shared-array height
/**
 * Box (mean) filter over a single-channel 8-bit image, tiled through shared memory.
 *
 * Launch layout: blockDim must be (BLOCK_W, BLOCK_H). Each block stages a
 * TILE_W x TILE_H tile plus a border of Rx / Ry pixels on every side, so the
 * grid needs ceil(w / TILE_W) x ceil(h / TILE_H) blocks.
 * Shared memory: BLOCK_W * BLOCK_H floats, statically allocated.
 *
 * Pixels outside the image contribute 0 to the sum, and the sum is always
 * divided by S, so values near the border are darkened.
 *
 * @param in   Device pointer to w * h input bytes, row-major.
 * @param out  Device pointer to w * h output bytes, row-major.
 * @param w    Image width in pixels.
 * @param h    Image height in pixels.
 */
__global__ void box_filter_GPU(const unsigned char *in, unsigned char *out,
                               const unsigned int w, const unsigned int h)
{
    // The shared tile below is sized for exactly this block shape.
    assert(blockDim.x == BLOCK_W && blockDim.y == BLOCK_H);

    const int tx = (int)threadIdx.x;
    const int ty = (int)threadIdx.y;

    // Image coordinates of this thread. Shifting by the radius makes the outer
    // ring of threads in the block load the border around the tile.
    const int x = (int)(blockIdx.x * TILE_W) + tx - Rx;
    const int y = (int)(blockIdx.y * TILE_H) + ty - Ry;
    const bool inImage = (x >= 0) && (y >= 0) && (x < (int)w) && (y < (int)h);

    // Data index; only meaningful (and only used) when inImage is true.
    const size_t d = inImage ? ((size_t)y * w + (size_t)x) : 0;

    // Stage the tile. Threads outside the picture store 0 but must NOT return
    // here: every thread of the block has to reach the barrier below.
    __shared__ float shMem[BLOCK_W][BLOCK_H];
    shMem[tx][ty] = inImage ? (float)in[d] : 0.0f;
    __syncthreads();

    // Only the interior threads (the tile itself) produce an output pixel;
    // the border threads existed solely to load their neighbours' inputs.
    const bool inTile = (tx >= Rx) && (tx < BLOCK_W - Rx) &&
                        (ty >= Ry) && (ty < BLOCK_H - Ry);
    if (inImage && inTile) {
        float sum = 0.0f;
        for (int dx = -Rx; dx <= Rx; ++dx) {
            for (int dy = -Ry; dy <= Ry; ++dy) {
                sum += shMem[tx + dx][ty + dy];
            }
        }
        out[d] = (unsigned char)(sum / S);
    }
}
/**
 * Terminate the program when a CUDA runtime call reported an error.
 *
 * @param err   Status returned by the CUDA runtime call.
 * @param file  Source file name to show in the diagnostic.
 * @param line  Source line number to show in the diagnostic.
 *
 * On any status other than cudaSuccess the numeric code and its description are
 * printed to stderr and the process exits with EXIT_FAILURE; otherwise the
 * function returns without side effects.
 */
inline void __checkCudaErrors(cudaError err, const char *file, const int line)
{
    if (err == cudaSuccess)
    {
        return; // nothing to report
    }

    const int code = (int)err;
    const char *description = cudaGetErrorString(err);
    fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
            file, line, code, description);
    exit(EXIT_FAILURE);
}
/**
 * Second box (mean) filter kernel, used as the comparison run.
 *
 * NOTE(review): despite the "CPU" in its name this is declared __global__ and
 * uses __shared__ memory, so it executes on the device. The intended difference
 * from the other kernel cannot be determined from this file; it implements the
 * same tiled algorithm. Confirm with the author whether a host-side reference
 * was meant instead.
 *
 * Launch layout: blockDim must be (BLOCK_W, BLOCK_H); the grid needs
 * ceil(w / TILE_W) x ceil(h / TILE_H) blocks.
 * Shared memory: BLOCK_W * BLOCK_H floats, statically allocated.
 *
 * Pixels outside the image contribute 0 to the sum, and the sum is always
 * divided by S.
 *
 * @param in   Device pointer to w * h input bytes, row-major.
 * @param out  Device pointer to w * h output bytes, row-major.
 * @param w    Image width in pixels.
 * @param h    Image height in pixels.
 */
__global__ void box_filter_CPU(const unsigned char *in, unsigned char *out,
                               const unsigned int w, const unsigned int h)
{
    // The shared tile below is sized for exactly this block shape.
    assert(blockDim.x == BLOCK_W && blockDim.y == BLOCK_H);

    const int col = (int)threadIdx.x;
    const int row = (int)threadIdx.y;

    // Image coordinates, shifted by the radius so the block also covers the
    // border around its tile.
    const int x = (int)(blockIdx.x * TILE_W) + col - Rx;
    const int y = (int)(blockIdx.y * TILE_H) + row - Ry;
    const bool inside = (x >= 0) && (y >= 0) && (x < (int)w) && (y < (int)h);

    // Data index; only used when the thread maps to a real pixel.
    const size_t d = inside ? ((size_t)y * w + (size_t)x) : 0;

    // Threads outside the picture write 0 and keep going: returning before the
    // barrier would leave __syncthreads() unreached by part of the block.
    __shared__ float shMem[BLOCK_W][BLOCK_H];
    shMem[col][row] = inside ? (float)in[d] : 0.0f;
    __syncthreads();

    // Border threads only loaded data; interior threads compute the mean.
    const bool interior = (col >= Rx) && (col < BLOCK_W - Rx) &&
                          (row >= Ry) && (row < BLOCK_H - Ry);
    if (inside && interior) {
        float sum = 0.0f;
        for (int dx = -Rx; dx <= Rx; ++dx) {
            for (int dy = -Ry; dy <= Ry; ++dy) {
                sum += shMem[col + dx][row + dy];
            }
        }
        out[d] = (unsigned char)(sum / S);
    }
}
// main() wraps its CUDA calls in checkCudaErrors(), which is not defined in the
// visible part of this file; provide it unless some header already did.
#ifndef checkCudaErrors
#define checkCudaErrors(call) __checkCudaErrors((call), __FILE__, __LINE__)
#endif

/**
 * Load an image, run both box-filter kernels on it, report their run times and
 * save the two results.
 *
 * Usage: <program> [input-image]
 * NOTE(review): the original input file name is not visible in this file;
 * "input.pgm" is an assumed default — confirm or always pass the path.
 *
 * Outputs: output_gpu.pgm / output_cpu.pgm for 1-channel input,
 *          output_gpu.ppm / output_cpu.ppm for 3-channel input.
 * The kernels index one byte per pixel, so 3-channel input is not filtered
 * correctly; a warning is printed in that case.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE on any error.
 */
int main(int argc, char **argv){
    unsigned char *data=NULL, *d_idata=NULL, *d_odata=NULL, *data1=NULL;
    unsigned char *d_odata1=NULL; // separate device output for the second kernel
    unsigned int w=0, h=0, channels=0;

    // Load Picture
    const char *inputPath = (argc > 1) ? argv[1] : "input.pgm";
    if (!loadPPM(inputPath, &data, &w, &h, &channels)) {
        fprintf(stderr, "Failed to open File\n");
        exit(EXIT_FAILURE);
    }
    if (channels != 1) {
        fprintf(stderr, "WARNING: the box filter kernels process one byte per pixel; "
                        "multi-channel input is not filtered correctly\n");
    }

    const size_t datasize = (size_t)w * h * channels * sizeof(unsigned char);

    // Host buffer for the second kernel's result (data is reused for the first).
    data1 = (unsigned char *)malloc(datasize);
    if (data1 == NULL) {
        fprintf(stderr, "Failed to allocate host memory\n");
        exit(EXIT_FAILURE);
    }

    // Allocate device memory
    checkCudaErrors(cudaMalloc((void **)&d_idata, datasize));
    checkCudaErrors(cudaMalloc((void **)&d_odata, datasize));
    checkCudaErrors(cudaMalloc((void **)&d_odata1, datasize));
    // Bytes the kernels never write (multi-channel input) stay well-defined.
    checkCudaErrors(cudaMemset(d_odata, 0, datasize));
    checkCudaErrors(cudaMemset(d_odata1, 0, datasize));

    // Copy to device
    printf("Copy idata from the host memory to the CUDA device\n");
    checkCudaErrors(cudaMemcpy(d_idata, data, datasize, cudaMemcpyHostToDevice));

    // Launch Kernel: one block per tile, rounded up so partial tiles are covered.
    const unsigned int GRID_W = (w + TILE_W - 1) / TILE_W;
    const unsigned int GRID_H = (h + TILE_H - 1) / TILE_H;
    dim3 threadsPerBlock(BLOCK_W, BLOCK_H);
    dim3 blocksPerGrid(GRID_W, GRID_H);
    printf("CUDA kernel launch with [%u %u] blocks of [%u %u] threads\n",
           blocksPerGrid.x, blocksPerGrid.y, threadsPerBlock.x, threadsPerBlock.y);

    using std::chrono::high_resolution_clock;
    using std::chrono::duration;

    // Kernel launches are asynchronous; synchronise before reading the clock so
    // the measurement covers the execution and not just the launch.
    auto t1 = high_resolution_clock::now();
    box_filter_GPU<<<blocksPerGrid, threadsPerBlock>>>(d_idata, d_odata, w, h);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
    auto t2 = high_resolution_clock::now();
    duration<double, std::milli> ms_double = t2 - t1;
    std::cout<<"\n box_filter run with GPU was: " << ms_double.count() << "ms\n";

    auto c1 = high_resolution_clock::now();
    box_filter_CPU<<<blocksPerGrid, threadsPerBlock>>>(d_idata, d_odata1, w, h);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
    auto c2 = high_resolution_clock::now();
    duration<double, std::milli> ms_doublec = c2 - c1;
    std::cout<<"\n box_filter run with CPU was: " << ms_doublec.count() << "ms\n";

    // Copy data from device to host
    printf("\nCopy odata from the CUDA device to the host memory\n");
    checkCudaErrors(cudaMemcpy(data, d_odata, datasize, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(data1, d_odata1, datasize, cudaMemcpyDeviceToHost));

    // Save Picture
    printf("Save Picture\n");
    bool savedGpu = false;
    bool savedCpu = false;
    if (channels==1) {
        savedGpu = savePPM("output_gpu.pgm", data, w, h, channels);
        savedCpu = savePPM("output_cpu.pgm", data1, w, h, channels);
    } else if (channels==3) {
        // Distinct names: writing both results to one file would lose the first.
        savedGpu = savePPM("output_gpu.ppm", data, w, h, channels);
        savedCpu = savePPM("output_cpu.ppm", data1, w, h, channels);
    } else {
        fprintf(stderr, "ERROR: Unable to save file - wrong channel!\n");
    }

    // Release resources before deciding on the exit status.
    checkCudaErrors(cudaFree(d_idata));
    checkCudaErrors(cudaFree(d_odata));
    checkCudaErrors(cudaFree(d_odata1));
    free(data);
    free(data1);

    if (!(savedGpu && savedCpu)){
        fprintf(stderr, "Failed to save File\n");
        exit(EXIT_FAILURE);
    }
    printf("Done\n");
    return 0;
}