// Download as txt, pdf, or txt
// Download as txt, pdf, or txt
// You are on page 1 of 7


#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>  // strncmp
#include <time.h>    // time library
#include <unistd.h>

#include <chrono>
#include <fstream>
#include <iostream>  // std::cout
#include <thread>

#include <cuda_runtime.h>

// Size of the line buffer used when reading PGM/PPM header lines.
#define PGMHeaderSize 0x40
inline bool loadPPM(const char *file, unsigned char **data, unsigned int *w,
unsigned int *h, unsigned int *channels)
FILE *fp = NULL;

fp = fopen(file, "rb");
if (!fp) {
fprintf(stderr, "__LoadPPM() : unable to open file\n" );
return false;

// check header
char header[PGMHeaderSize];

if (fgets(header, PGMHeaderSize, fp) == NULL)

fprintf(stderr,"__LoadPPM() : reading PGM header returned NULL\n" );
return false;

if (strncmp(header, "P5", 2) == 0)
*channels = 1;
else if (strncmp(header, "P6", 2) == 0)
*channels = 3;
fprintf(stderr,"__LoadPPM() : File is not a PPM or PGM image\n" );
*channels = 0;
return false;

// parse header, read maxval, width and height

unsigned int width = 0;
unsigned int height = 0;
unsigned int maxval = 0;
unsigned int i = 0;

while (i < 3)
if (fgets(header, PGMHeaderSize, fp) == NULL)
fprintf(stderr,"__LoadPPM() : reading PGM header returned NULL\n" );
return false;

if (header[0] == '#')

if (i == 0)
i += sscanf(header, "%u %u %u", &width, &height, &maxval);
else if (i == 1)
i += sscanf(header, "%u %u", &height, &maxval);
else if (i == 2)
i += sscanf(header, "%u", &maxval);

// check if given handle for the data is initialized

if (NULL != *data)
if (*w != width || *h != height)
fprintf(stderr, "__LoadPPM() : Invalid image dimensions.\n" );
*data = (unsigned char *) malloc(sizeof(unsigned char) * width * height *
if (!data) {
fprintf(stderr, "Unable to allocate hostmemory\n");
return false;
*w = width;
*h = height;

// read and close file

if (fread(*data, sizeof(unsigned char), width * height * *channels, fp) == 0)
fprintf(stderr, "__LoadPPM() : read data returned error.\n" );
return false;


return true;

inline bool savePPM(const char *file, unsigned char *data, unsigned int w, unsigned
int h, unsigned int channels)
assert(NULL != data);
assert(w > 0);
assert(h > 0);

std::fstream fh(file, std::fstream::out | std::fstream::binary);

if (fh.bad())
fprintf(stderr, "__savePPM() : Opening file failed.\n" );
return false;

if (channels == 1)
fh << "P5\n";
else if (channels == 3)
fh << "P6\n";
fprintf(stderr, "__savePPM() : Invalid number of channels.\n" );
return false;

fh << w << "\n" << h << "\n" << 0xff << std::endl;

for (unsigned int i = 0; (i < (w*h*channels)) && fh.good(); ++i)

fh << data[i];


if (fh.bad())
fprintf(stderr,"__savePPM() : Writing data failed.\n" );
return false;


return true;

// Tile and filter geometry used by the box-filter kernels and by main()'s
// launch configuration.
// TILE_W / TILE_H: image-space step between neighbouring thread blocks; used
// for the per-block coordinate offset in the kernels and for the grid size.
#define TILE_W 16
#define TILE_H 16
#define Rx 2 // filter radius in x direction
#define Ry 2 // filter radius in y direction
#define FILTER_W (Rx*2+1) // filter diameter in x direction
#define FILTER_H (Ry*2+1) // filter diameter in y direction
#define S (FILTER_W*FILTER_H) // filter size (number of taps; the divisor of the sum)
// BLOCK_W / BLOCK_H: a tile extended by the filter radius on each side; used
// as the thread-block dimensions and as the shared-memory array dimensions.
#define BLOCK_W (TILE_W+(2*Rx))
#define BLOCK_H (TILE_H+(2*Ry))

__global__ void box_filter_GPU(const unsigned char *in, unsigned char *out, const
unsigned int w, const unsigned int h){

const int x = blockIdx.x * TILE_W + threadIdx.x - Rx;

const int y = blockIdx.y * TILE_H + threadIdx.y - Ry;

const int d = y * w + x;

//shared mem
__shared__ float shMem[BLOCK_W][BLOCK_H];
if(x<0 || y<0 || x>=w || y>=h) { // Threads GPU which are not in the
picture just write 0 to the shared mem
shMem[threadIdx.x][threadIdx.y] = 0;
shMem[threadIdx.x][threadIdx.y] = in[d];

// box filter (only for threads inside the tile)

if ((threadIdx.x >= Rx) && (threadIdx.x < (BLOCK_W-Rx)) && (threadIdx.y >= Ry)
&& (threadIdx.y < (BLOCK_H-Ry))) {
float sum = 0;
for(int dx=-Rx; dx<=Rx; dx++) {
for(int dy=-Ry; dy<=Ry; dy++) {
sum += shMem[threadIdx.x+dx][threadIdx.y+dy];
out[d] = sum / S;

#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)

inline void __checkCudaErrors(cudaError err, const char *file, const int line)
if (cudaSuccess != err)
fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
file, line, (int)err, cudaGetErrorString(err));

__global__ void box_filter_CPU(const unsigned char *in, unsigned char *out, const
unsigned int w, const unsigned int h){

const int x = blockIdx.x * TILE_W + threadIdx.x - Rx; // x image index

const int y = blockIdx.y * TILE_H + threadIdx.y - Ry; // y image index
const int d = y * w + x; // data index

//shared mem
__shared__ float shMem[BLOCK_W][BLOCK_H];
if(x<0 || y<0 || x>=w || y>=h) { // Threads CPU which are not in the
picture just write 0 to the shared mem
shMem[threadIdx.x][threadIdx.y] = 0;
shMem[threadIdx.x][threadIdx.y] = in[d];

// box filter (only for threads inside the tile)

if ((threadIdx.x >= Rx) && (threadIdx.x < (BLOCK_W-Rx)) && (threadIdx.y >= Ry)
&& (threadIdx.y < (BLOCK_H-Ry))) {
float sum = 0;
for(int dx=-Rx; dx<=Rx; dx++) {
for(int dy=-Ry; dy<=Ry; dy++) {
sum += shMem[threadIdx.x+dx][threadIdx.y+dy];
out[d] = sum / S;

int main(){
unsigned char *data=NULL, *d_idata=NULL, *d_odata=NULL ,*data1=NULL;
unsigned int w,h,channels;

if(! loadPPM("./drive/MyDrive/pgmimage/sample_640×426.pgm", &data, &w, &h,

fprintf(stderr, "Failed to open File\n");

printf("Loaded file with w:%d h:%d channels:%d \n",w,h,channels);

unsigned int numElements = w*h*channels;

size_t datasize = numElements * sizeof(unsigned char);

// Allocate the Device Memory

printf("Allocate Devicememory for data\n");
checkCudaErrors(cudaMalloc((void **)&d_idata, datasize));
checkCudaErrors(cudaMalloc((void **)&d_odata, datasize));

// Copy to device
printf("Copy idata from the host memory to the CUDA device\n");
checkCudaErrors(cudaMemcpy(d_idata, data, datasize, cudaMemcpyHostToDevice));

// Launch Kernel
int GRID_W = w/TILE_W +1;
int GRID_H = h/TILE_H +1;
dim3 threadsPerBlock(BLOCK_W, BLOCK_H);
dim3 blocksPerGrid(GRID_W,GRID_H);
printf("CUDA kernel launch with [%d %d] blocks of [%d %d] threads\n",
blocksPerGrid.x, blocksPerGrid.y, threadsPerBlock.x, threadsPerBlock.y);

using std::chrono::high_resolution_clock;
using std::chrono::duration_cast;
using std::chrono::duration;
using std::chrono::milliseconds;

auto t1 = high_resolution_clock::now();

box_filter_GPU<<<blocksPerGrid, threadsPerBlock>>>(d_idata, d_odata, w,h);

auto t2 = high_resolution_clock::now();

/* Getting number of milliseconds as a double. */

duration<double, std::milli> ms_double = t2 - t1;

std::cout<<"\n box_filter run with GPU was: " << ms_double.count() << "ms\n";

auto c1 = high_resolution_clock::now();

box_filter_CPU<<<blocksPerGrid, threadsPerBlock>>>(d_idata, d_odata, w,h);

using namespace std::chrono_literals;

auto c2 = high_resolution_clock::now();

/* Getting number of milliseconds as a double. */

duration<double, std::milli> ms_doublec = c2 - c1;

std::cout<<"\n box_filter run with CPU was: " << ms_doublec.count() << "ms\n";

// Copy data from device to host
printf("\nCopy odata from the CUDA device to the host memory\n");
checkCudaErrors(cudaMemcpy(data, d_odata, datasize, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(data1, d_odata, datasize, cudaMemcpyDeviceToHost));

// Free Device memory

printf("Free Device memory\n");

// Save Picture
printf("Save Picture\n");
bool saved = false;
if (channels==1)
saved = savePPM("output_gpu.pgm", data, w, h, channels);
else if (channels==3)
saved = savePPM("output.ppm", data, w, h, channels);
else fprintf(stderr, "ERROR: Unable to save file - wrong channel!\n");

if (channels==1)
saved = savePPM("output_cpu.pgm", data1, w, h, channels);
else if (channels==3)
saved = savePPM("output.ppm", data1, w, h, channels);
else fprintf(stderr, "ERROR: Unable to save file - wrong channel!\n");

// Free Host memory

printf("Free Host memory\n");

if (!saved){
fprintf(stderr, "Failed to save File\n");

// You might also like