Pdsla 1

You might also like

Download as pdf or txt
Download as pdf or txt
You are on page 1 of 18

Dot product

#include<stdio.h>
#define SIZE 4
// Accumulates the dot product of `a` and `b` into `*c`.
//
// Each thread multiplies the pair of elements selected by threadIdx.x and
// adds the product to `*c`, so the kernel expects a single block with one
// thread per element, and `*c` must hold the starting value (normally 0)
// before the launch.
//
//   a, b : device arrays with at least blockDim.x elements each
//   c    : device pointer to the accumulator
__global__ void dotProduct(int *a, int *b, int *c)
{
    int i = threadIdx.x;
    // atomicAdd is the only update of *c. A plain `*c += ...` next to it is
    // an unsynchronised read-modify-write shared by every thread and would
    // also add each product a second time.
    atomicAdd(c, a[i] * b[i]);
}

// Host driver for the dot-product example: uploads two SIZE-element vectors
// and a zeroed accumulator, launches dotProduct with one thread per element,
// prints the result and releases the device buffers.
// Returns 0 on success, 1 if the kernel launch is rejected.
int main()
{
    int a[SIZE] = {1,2,3,4};
    int b[SIZE] = {1,2,3,4};
    int c = 0;

    int *da, *db, *dc, size = SIZE*sizeof(int);

    cudaMalloc((void **)&da, size);
    cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);
    cudaMalloc((void **)&db, size);
    cudaMemcpy(db, b, size, cudaMemcpyHostToDevice);
    cudaMalloc((void **)&dc, sizeof(int));
    // The kernel accumulates into *dc, so it has to start from the host's 0.
    cudaMemcpy(dc, &c, sizeof(int), cudaMemcpyHostToDevice);

    dotProduct<<<1,SIZE>>>(da, db, dc);

    // A launch does not return a status; a bad configuration only shows up here.
    cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess) {
        printf("Kernel launch failed: %s\n", cudaGetErrorString(launchStatus));
        cudaFree(da);
        cudaFree(db);
        cudaFree(dc);
        return 1;
    }

    // Blocking copy: also waits for the kernel to finish.
    cudaMemcpy(&c, dc, sizeof(int), cudaMemcpyDeviceToHost);
    printf("Dot Product :%d \n",c);

    // Release the device buffers (previously leaked).
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    return 0;
}
Matrix transpose

#include<stdio.h>
#include<sys/time.h>

#define N 4
#define M 4
#define BDIMX 2
#define BDIMY 2

// Transposes the matrix `a` (M rows of N ints, row-major) into `b`
// (N rows of M ints, row-major), staging each tile in shared memory.
//
// Launch requirements: a 2D block of exactly (BDIMX, BDIMY) threads, because
// the shared tile is sized from those macros, and a 2D grid that covers the
// N x M input.
__global__ void transpose(int *a, int *b) {
    __shared__ int temp[BDIMY][BDIMX];

    // Coordinates of the element this thread reads from the input.
    int ix = threadIdx.x + blockIdx.x * blockDim.x;
    int iy = threadIdx.y + blockIdx.y * blockDim.y;
    int ti = iy * N + ix;

    // Re-map the flat thread id onto the transposed tile.
    int bidx = threadIdx.x + threadIdx.y * blockDim.x;
    int irow = bidx / blockDim.y;
    int icol = bidx % blockDim.y;

    // Coordinates of the element this thread writes in the output.
    int ox = icol + blockIdx.y * blockDim.y;
    int oy = irow + blockIdx.x * blockDim.x;
    int to = oy * M + ox;

    if (ix < N && iy < M) {
        temp[threadIdx.y][threadIdx.x] = a[ti];
    }
    // The barrier must be reached by every thread of the block, so it lives
    // outside the bounds checks rather than inside a conditional.
    __syncthreads();
    // The output has M columns and N rows. temp[icol][irow] was filled by the
    // thread whose input coordinates are (oy, ox), so this guard also ensures
    // the tile entry being read was actually written.
    if (ox < M && oy < N) {
        b[to] = temp[icol][irow];
    }
}

// Returns the current gettimeofday() reading as seconds, with the
// microsecond field folded in as the fractional part.
double cpuSecond() {
    struct timeval now;
    gettimeofday(&now, NULL);
    const double wholeSeconds = (double)now.tv_sec;
    const double fraction = (double)now.tv_usec*1.e-6;
    return (wholeSeconds + fraction);
}

// Host driver for the transpose example: fills an array with 0..N*M-1,
// prints it, runs the transpose kernel on a 2x2 grid of BDIMX x BDIMY blocks,
// prints the result and the elapsed wall-clock time, then releases all
// buffers. Returns 0 on success, 1 on allocation or launch failure.
int main() {
    int *a, *b;
    int size = N * M * sizeof(int);
    a = (int* )malloc(size);
    b = (int* )malloc(size);
    if (a == NULL || b == NULL) {
        printf("Host allocation failed\n");
        free(a);
        free(b);
        return 1;
    }
    for(int i = 0; i < N * M; i++) {
        a[i] = i;
    }
    printf("Initial Array: \n");
    for(int i = 0; i < N; i++) {
        for(int j = 0; j < M; j++) {
            printf("%d ", a[i * M + j]);
        }
        printf("\n");
    }

    int *da, *db;

    cudaMalloc((void** )&da, size);
    cudaMalloc((void** )&db, size);
    cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);
    dim3 block(BDIMX, BDIMY);
    dim3 grid(2, 2);
    double istart = cpuSecond();
    transpose<<<grid, block>>>(da, db);
    cudaError_t launchStatus = cudaGetLastError();
    // The launch is asynchronous; wait for completion before stopping the timer.
    cudaDeviceSynchronize();
    double ielapsed = cpuSecond() - istart;
    if (launchStatus != cudaSuccess) {
        printf("Kernel launch failed: %s\n", cudaGetErrorString(launchStatus));
        cudaFree(da);
        cudaFree(db);
        free(a);
        free(b);
        return 1;
    }
    cudaMemcpy(b, db, size, cudaMemcpyDeviceToHost);
    printf("Final Array: \n");
    for(int i = 0; i < N; i++) {
        for(int j = 0; j < M; j++) {
            printf("%d ", b[i * M + j]);
        }
        printf("\n");
    }
    printf("Elapsed Time : %lf\n", ielapsed);

    // Release device and host buffers (previously leaked).
    cudaFree(da);
    cudaFree(db);
    free(a);
    free(b);
    return 0;
}
1D stencil

#include <cuda_runtime.h>
#include<stdio.h>
#include<sys/time.h>

#define RADIUS 4
#define BDIM 8

// constant memory
__constant__ float coef[RADIUS + 1];

/*
// FD coefficient
#define a0 0.00000f
#define a1 0.80000f
#define a2 -0.20000f
#define a3 0.03809f
#define a4 -0.00357f
*/

#define a0 0
#define a1 1
#define a2 2
#define a3 3
#define a4 4

// Wall-clock timestamp in seconds: gettimeofday()'s second count plus its
// microsecond count scaled by 1e-6.
double cpuSecond(){
    struct timeval stamp;
    gettimeofday(&stamp, NULL);
    double seconds = (double)stamp.tv_sec;
    double micros = (double)stamp.tv_usec;
    return (seconds + micros*1.e-6);
}

// Fills in[0..size-1] with the ramp 1, 2, 3, ... (element k receives k + 1).
// A deterministic ramp is used here in place of random data.
void initialData(float *in, const int size)
{
    int slot = 0;
    while (slot < size)
    {
        in[slot] = slot + 1;
        ++slot;
    }
}

// Prints in[RADIUS] .. in[size-1] on one line, each followed by a space,
// then a newline. The first RADIUS entries are skipped.
void printData(float *in, const int size)
{
    int pos = RADIUS;
    while (pos < size)
    {
        printf("%f ", in[pos]);
        ++pos;
    }

    printf("\n");
}

// Host reference for the 1D stencil.
//
//   in    : input of isize + 2*RADIUS floats; the interior points live at
//           indices [RADIUS, RADIUS + isize) with RADIUS halo cells on each
//           side (this is the layout the loop's in[i +/- 4] accesses require)
//   out   : output buffer with the same layout; only interior entries are written
//   isize : number of interior points
//
// For every interior point i:
//   out[i] = sum over k = 1..4 of a_k * (in[i + k] - in[i - k])
void cpu_stencil_1d (float *in, float *out, int isize)
{
    // The interior ends at RADIUS + isize. Stopping at `i <= isize` would
    // leave the last RADIUS - 1 interior points uncomputed.
    const int last = RADIUS + isize;
    for (int i = RADIUS; i < last; i++)
    {
        float tmp = a1 * (in[i + 1] - in[i - 1])
                  + a2 * (in[i + 2] - in[i - 2])
                  + a3 * (in[i + 3] - in[i - 3])
                  + a4 * (in[i + 4] - in[i - 4]);
        out[i] = tmp;
    }
}

// 1D stencil of radius RADIUS using a shared-memory tile.
//
//   in  : device pointer to the first interior element; the kernel also reads
//         RADIUS halo elements before it and after the N interior elements,
//         so valid indices are [-RADIUS, N + RADIUS)
//   out : device pointer to the first interior output element
//   N   : number of interior elements
//
// Launch with 1D blocks of exactly BDIM threads: the shared tile holds
// BDIM + 2*RADIUS floats. Coefficients come from the constant array `coef`.
// The printf calls are debugging traces.
__global__ void stencil_1d(float *in, float *out, int N)
{
    // shared memory
    __shared__ float smem[BDIM + 2 * RADIUS];

    // index to global memory
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // index to shared memory for stencil calculation
    int sidx = threadIdx.x + RADIUS;

    // One past the last readable element of `in` (interior plus right halo).
    const int readLimit = N + RADIUS;

    // Read data from global memory into shared memory. Out-of-range slots are
    // zero-filled so the tile never holds uninitialised values.
    float centre = 0.0f;
    if (idx < readLimit)
    {
        centre = in[idx];
        printf("\nsmem[%d]=in[%d] by %d, value is %f",sidx,idx,threadIdx.x,centre);
    }
    smem[sidx] = centre;

    // read halo part to shared memory
    if (threadIdx.x < RADIUS)
    {
        const int left = idx - RADIUS;
        const int right = idx + BDIM;
        float leftVal = 0.0f;
        float rightVal = 0.0f;
        if (left < readLimit)
        {
            leftVal = in[left];
            printf("\nsmem[%d]=in[%d] by %d, value is %f",sidx-RADIUS,left,threadIdx.x,leftVal);
        }
        if (right < readLimit)
        {
            rightVal = in[right];
            printf("\nsmem[%d]=in[%d] by %d,value is %f",sidx+BDIM,right,threadIdx.x,rightVal);
        }
        smem[sidx - RADIUS] = leftVal;
        smem[sidx + BDIM] = rightVal;
    }

    // Synchronize (ensure all the data is available). Each thread writes
    // distinct tile slots, so one barrier after all loads is sufficient, and
    // it is reached by every thread of the block.
    __syncthreads();

    // Apply the stencil and store the result for interior points only.
    if (idx < N)
    {
        float tmp = 0.0f;
        #pragma unroll
        for (int i = 1; i <= RADIUS; i++)
        {
            tmp += coef[i] * (smem[sidx + i] - smem[sidx - i]);
        }

        out[idx] = tmp;
        // Print the element that matches the label (in[idx], not in[threadIdx.x]).
        printf("\nin[%d] is %f",idx,in[idx]);
        printf("\nout[%d] = %f by %d", idx,tmp,threadIdx.x);
    }
}

// Host driver for the 1D stencil example.
//
// Allocates isize interior points plus RADIUS halo cells on each side,
// uploads a ramp and the stencil coefficients, runs stencil_1d on the GPU and
// cpu_stencil_1d on the host, prints the GPU result for the interior points
// and both timings. Returns EXIT_SUCCESS, or EXIT_FAILURE on allocation or
// launch failure.
int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    // The banner previously said "transpose"; this program runs the stencil.
    printf("%s starting 1D stencil at ", argv[0]);
    printf("device %d: %s ", dev, deviceProp.name);
    cudaSetDevice(dev);

    // set up data size
    int isize = 1 << 3;
    size_t nBytes = (isize + 2 * RADIUS) * sizeof(float);
    printf("array size: %d ", isize);

    bool iprint = 1;

    // allocate host memory
    float *h_in = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef = (float *)malloc(nBytes);
    if (h_in == NULL || hostRef == NULL || gpuRef == NULL)
    {
        printf("Host allocation failed\n");
        free(h_in);
        free(hostRef);
        free(gpuRef);
        return EXIT_FAILURE;
    }

    // allocate device memory
    float *d_in, *d_out;
    cudaMalloc((float**)&d_in, nBytes);
    cudaMalloc((float**)&d_out, nBytes);
    // The kernel writes only interior outputs; clear the halo cells so the
    // full-buffer copy back never returns uninitialised device memory.
    cudaMemset(d_out, 0, nBytes);

    // initialize host array
    initialData(h_in, isize + 2 * RADIUS);

    // Copy to device
    cudaMemcpy(d_in, h_in, nBytes, cudaMemcpyHostToDevice);

    // set up constant memory
    const float h_coef[] = {a0, a1, a2, a3, a4};
    cudaMemcpyToSymbol( coef, h_coef, (RADIUS + 1) * sizeof(float));

    // launch configuration (the properties of device 0 were queried above)
    dim3 block(BDIM, 1);
    dim3 grid(deviceProp.maxGridSize[0] < isize / block.x ? deviceProp.maxGridSize[0] :
              isize / block.x, 1);
    printf("(grid, block) %d,%d \n ", grid.x, block.x);

    double istart = cpuSecond();
    // Launch stencil_1d() kernel on GPU with the configuration computed above.
    stencil_1d<<<grid, block>>>(d_in + RADIUS, d_out + RADIUS, isize);
    cudaError_t launchStatus = cudaGetLastError();
    // The launch is asynchronous: without this wait the timer would measure
    // only the launch overhead.
    cudaDeviceSynchronize();
    double ielapsed = cpuSecond() - istart;
    if (launchStatus != cudaSuccess)
    {
        printf("Kernel launch failed: %s\n", cudaGetErrorString(launchStatus));
        cudaFree(d_in);
        cudaFree(d_out);
        free(h_in);
        free(hostRef);
        free(gpuRef);
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    // Copy result back to host
    cudaMemcpy(gpuRef, d_out, nBytes, cudaMemcpyDeviceToHost);

    // apply cpu stencil
    double cpustart = cpuSecond();
    cpu_stencil_1d(h_in, hostRef, isize);
    double cpuelapsed = cpuSecond() - cpustart;

    // print out results
    if(iprint)
    {
        printf("\nisize is %d\n",isize);
        // printData skips the first RADIUS entries and stops before `size`,
        // so isize + RADIUS covers every interior point.
        printData(gpuRef, isize + RADIUS);
        // printData(hostRef, isize + RADIUS);
    }
    printf("GPU Elapsed Time %lf\n",ielapsed);
    printf("CPU Elapsed Time %lf\n",cpuelapsed);

    // Cleanup
    cudaFree(d_in);
    cudaFree(d_out);
    free(h_in);
    free(hostRef);
    free(gpuRef);

    // reset device
    cudaDeviceReset();
    return EXIT_SUCCESS;
}
Odd even sort

#include<stdio.h>
#include<stdlib.h>
#define N 100

// In-place odd-even transposition sort of a[0..N-1] in ascending order.
//
// Launch as a single block with at least N threads: thread ix owns the pair
// (a[ix], a[ix + 1]). Each round runs an odd phase (odd ix) and an even phase
// (even ix), separated by block-wide barriers.
__global__ void oddevensort(int *a) {
    int ix = threadIdx.x;
    const bool hasRight = ix < N-1;
    const bool isOdd = (ix % 2 != 0);

    // ceil(N / 2) rounds of two phases give at least N phases, which the
    // algorithm needs; plain N / 2 falls one phase short when N is odd.
    const int rounds = (N + 1) / 2;
    for(int i = 0; i < rounds; i++) {
        if(hasRight && isOdd) {
            int t = a[ix + 1];
            a[ix + 1] = max(t, a[ix]);
            a[ix] = min(t, a[ix]);
        }
        // Barriers sit outside the conditionals so every thread reaches them.
        __syncthreads();
        if(hasRight && !isOdd) {
            int t = a[ix + 1];
            a[ix + 1] = max(t, a[ix]);
            a[ix] = min(t, a[ix]);
        }
        __syncthreads();
    }
}

// Host driver for the odd-even sort example: builds the descending sequence
// N..1, sorts it on the GPU with one block of N threads and prints the array
// before and after. Returns 0 on success, 1 on allocation failure.
int main()
{
    int *arr;
    int *dev_arr;
    arr = (int*)malloc(N*sizeof(int));
    if (arr == NULL) {
        printf("Host allocation failed\n");
        return 1;
    }
    cudaMalloc((void**)&dev_arr, N*sizeof(int));
    int ct = N;
    printf("Input array is :\n");
    for (int i=0; i<N; i++) {
        arr[i] = ct;
        ct--;
        printf("%d ",arr[i]);
    }
    printf("\n");
    cudaMemcpy(dev_arr, arr, N*sizeof(int), cudaMemcpyHostToDevice);
    oddevensort<<<1, N>>>(dev_arr);
    // Blocking copy: also waits for the kernel to finish.
    cudaMemcpy(arr, dev_arr, N*sizeof(int), cudaMemcpyDeviceToHost);

    // Free the buffer before resetting the device: cudaDeviceReset destroys
    // the allocation, so a cudaFree issued afterwards acts on a dead pointer.
    cudaFree(dev_arr);
    cudaDeviceReset();

    printf("Output array is :\n");
    for (int i=0; i<N; i++) {
        printf("%d ",arr[i]);
    }
    printf("\n");
    free(arr);
    return 0;
}

#include<stdio.h>
#include<stdlib.h>
#define N 100

// Sorts a[0..N-1] into ascending order in place by odd-even transposition.
//
// Expects one block of at least N threads. Thread ix compares a[ix] with
// a[ix + 1]: odd-numbered threads act in the first half of a round and
// even-numbered threads in the second half.
__global__ void oddevensort(int *a) {
    int ix = threadIdx.x;
    // 0 for even threads, 1 for odd threads.
    const int parity = ix % 2;
    const bool inRange = ix < N-1;

    // Two phases per round; rounding the round count up keeps the total at N
    // phases or more even when N is odd (N / 2 would give only N - 1).
    for(int i = 0; i < (N + 1) / 2; i++) {
        // phase 1 belongs to the odd threads, phase 0 to the even ones
        for(int phase = 1; phase >= 0; phase--) {
            if(inRange && parity == phase) {
                int t = a[ix + 1];
                a[ix + 1] = max(t, a[ix]);
                a[ix] = min(t, a[ix]);
            }
            // Reached by all threads: the barrier is outside the conditional.
            __syncthreads();
        }
    }
}

// Host driver: fills an array with N, N-1, ..., 1, runs oddevensort on it in
// one block of N threads, and prints the input and the sorted output.
// Returns 0 on success, 1 if the host buffer cannot be allocated.
int main()
{
    int *arr;
    int *dev_arr;
    arr = (int*)malloc(N*sizeof(int));
    if (arr == NULL) {
        printf("Host allocation failed\n");
        return 1;
    }
    cudaMalloc((void**)&dev_arr, N*sizeof(int));
    int ct = N;
    printf("Input array is :\n");
    for (int i=0; i<N; i++) {
        arr[i] = ct;
        ct--;
        printf("%d ",arr[i]);
    }
    printf("\n");

    cudaMemcpy(dev_arr, arr, N*sizeof(int), cudaMemcpyHostToDevice);

    oddevensort<<<1, N>>>(dev_arr);
    // The device-to-host copy blocks until the kernel has completed.
    cudaMemcpy(arr, dev_arr, N*sizeof(int), cudaMemcpyDeviceToHost);

    // Release the device buffer while the context still exists; after
    // cudaDeviceReset the pointer no longer refers to a live allocation.
    cudaFree(dev_arr);
    cudaDeviceReset();

    printf("Output array is :\n");
    for (int i=0; i<N; i++) {
        printf("%d ",arr[i]);
    }
    printf("\n");
    free(arr);
    return 0;
}
Parallel quicksort

#include<stdio.h>
#include<time.h>

#define N 20

// Fills a[0..N-1] with rand()-derived values in the inclusive range 10..100.
void initialize(int *a) {
    int *slot = a;
    for(int remaining = N; remaining > 0; --remaining) {
        *slot = rand() % (100 - 10 + 1) + 10;
        ++slot;
    }
}

__device__ int d_size;

// One level of parallel quicksort.
//
// Thread idx (< n) takes the sub-range [lstack[idx], hstack[idx]] of `arr`,
// partitions it around its last element, and pushes any resulting sub-range
// with more than one element back onto lstack/hstack. d_size counts the
// ranges pushed and is read by the host after the launch.
//
//   arr            : device array being sorted in place
//   lstack, hstack : device arrays of low / high bounds, reused as output
//   n              : number of ranges to process
//
// Must be launched as a single block: the reset of d_size and the reuse of
// the stacks are ordered only by a block-level barrier.
__global__ void partition(int *arr, int *lstack, int *hstack, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const bool active = idx < n;

    // Fetch this thread's range BEFORE the barrier. The stacks are
    // overwritten with new ranges further down, so reading them later could
    // pick up an entry another thread has already replaced.
    int l = 0, h = 0;
    if(active) {
        l = lstack[idx];
        h = hstack[idx];
    }
    if(idx == 0) {
        d_size = 0;
    }
    __syncthreads();

    if(active) {
        int x = arr[h], i = l - 1;
        int temp;
        for(int j = l; j < h; j++) {
            if(arr[j] <= x) {
                i++;
                temp = arr[i];
                arr[i] = arr[j];
                arr[j] = temp;
            }
        }
        // Move the pivot into its final slot.
        temp = arr[i + 1];
        arr[i + 1] = arr[h];
        arr[h] = temp;
        int p = i + 1;
        // Only ranges with at least two elements need another pass.
        if(p - 1 > l) {
            int ind = atomicAdd(&d_size, 1);
            lstack[ind] = l;
            hstack[ind] = p - 1;
        }
        if(p + 1 < h) {
            int ind = atomicAdd(&d_size, 1);
            lstack[ind] = p + 1;
            hstack[ind] = h;
        }
    }
}

// Sorts arr[0..N-1] in ascending order on the GPU.
//
// The host seeds the range stacks with the whole array, then launches
// `partition` repeatedly; each launch handles every pending range and reports
// through d_size how many new ranges it produced. The loop stops when no
// range is left, after which the sorted array is copied back into `arr`.
void quickSort(int *arr) {
    int low = 0, high = N - 1;
    // Zero-initialised so the upload below never copies indeterminate values.
    int lstack[N] = {0}, hstack[N] = {0};
    int top = -1, *da, *dl, *dh, size = (high - low + 1) * sizeof(int);
    lstack[++top] = low;
    hstack[top] = high;

    cudaMalloc(&da, size);
    cudaMemcpy(da, arr, size, cudaMemcpyHostToDevice);

    cudaMalloc(&dl, size);
    cudaMemcpy(dl, lstack, size, cudaMemcpyHostToDevice);

    cudaMalloc(&dh, size);
    cudaMemcpy(dh, hstack, size, cudaMemcpyHostToDevice);

    int nt, nb, ni;
    nt = nb = ni = 1;

    while(ni > 0) {
        partition<<<nb, nt>>>(da, dl, dh, ni);
        int ans;
        // Blocking read of the range counter; also waits for the kernel.
        cudaMemcpyFromSymbol(&ans, d_size, sizeof(int),0,
                             cudaMemcpyDeviceToHost);
        if(ans < N * nt) {
            nt = ans;
        }
        else {
            nt = N * nt;
            nb = ans / nt + (ans % nt == 0 ? 0 : 1);
        }
        ni = ans;
    }

    // One copy after the final pass is enough; the loop body always runs at
    // least once, so `da` holds the sorted data here.
    cudaMemcpy(arr, da, (high - low + 1) * sizeof(int),
               cudaMemcpyDeviceToHost);

    // Release the device buffers (previously leaked).
    cudaFree(da);
    cudaFree(dl);
    cudaFree(dh);
}

// Entry point of the quicksort example: allocates N ints, fills them via
// initialize(), sorts them via quickSort() and prints them on one line.
int main() {
    int *values = (int* )malloc(N * sizeof(int));
    initialize(values);
    quickSort(values);
    int k = 0;
    while(k < N) {
        printf("%d ", values[k]);
        k++;
    }
    printf("\n");
}
Preorder

#include<stdio.h>
#define N 8
// A directed edge, stored as the pair (x, y) of its endpoint indices.
// This is a plain type usable from host and device code alike: __device__ is
// a memory-space qualifier for variables and functions and has no meaning on
// a type declaration, so it is not applied here.
struct point {
    int x;
    int y;
};

__device__ struct point succ[N][N];


__device__ int position[N][N];

// Computes preorder numbers of the nodes of a rooted tree by ranking the
// edges of its traversal list with pointer jumping.
//
// Launch as a single N x N block: thread (i, j) = (threadIdx.x, threadIdx.y)
// represents the directed edge (i, j) and does work only when adj marks the
// pair as an edge.
//
//   parent, sibling, child : per-node links, -1 meaning "none"
//   adj                    : N*N adjacency matrix, 1 marks an edge
//   preo                   : output, preo[v] is the 1-based preorder number of v
//
// Uses the global arrays succ (next edge in the list) and position (edge
// weight, later the accumulated rank).
__global__ void preorder(int *parent, int *sibling, int *child, int *adj, int *preo) {
    int i = threadIdx.x;
    int j = threadIdx.y;
    int gind = j*N+i;
    // Threads that are not edges skip the work but still reach every barrier.
    const bool isEdge = (adj[gind]==1);

    if(isEdge) {
        printf("Edge (%d, %d)\n",i, j);
        struct point pt;
        if(parent[i] == j) {
            // Edge going up from i to its parent j.
            if(sibling[i]!=(-1)) {
                pt.x = j;
                pt.y = sibling[i];
            }
            else if(parent[j]!=(-1)) {
                pt.x = j;
                pt.y = parent[j];
            }
            else {
                // Last edge of the list: it points to itself, and j is the root.
                pt.x = i;
                pt.y = j;
                preo[j] = 1;
            }
        }
        else {
            // Edge going down from i to j.
            if(child[j]!=(-1)) {
                pt.x = j;
                pt.y = child[j];
            }
            else {
                pt.x = j;
                pt.y = i;
            }
        }
        succ[i][j] = pt;
        // Upward edges weigh 0, downward edges weigh 1.
        if(parent[i]==j) position[i][j] = 0;
        else position[i][j] = 1;
    }
    // Every succ/position entry must be in place before any thread follows a link.
    __syncthreads();

    if(isEdge) {
        printf("Successor of (%d, %d) = (%d, %d)\n",i, j, succ[i][j].x, succ[i][j].y);
    }

    int logval = (int)ceil(log2((double)(2*(N-1))));
    for(int k=1; k<=logval; k++) {
        // Read phase: fetch the successor's data into registers.
        struct point next = {0, 0};
        int gain = 0;
        if(isEdge) {
            struct point pt = succ[i][j];
            gain = position[pt.x][pt.y];
            next = succ[pt.x][pt.y];
        }
        // Separate the reads from the writes so that no thread observes a
        // value another thread has already advanced in this round.
        __syncthreads();
        if(isEdge) {
            position[i][j] = position[i][j]+gain;
            succ[i][j] = next;
        }
        __syncthreads();
    }
    if(isEdge && i==parent[j]) preo[j] = N+1-position[i][j];
}

// Host driver for the preorder example: describes an 8-node tree through
// parent / sibling / first-child arrays, builds its symmetric adjacency
// matrix, runs the preorder kernel in one N x N block and prints the nodes in
// preorder. Returns 0 on success, 1 on allocation failure.
int main() {
    int parents[] = {-1, 0, 0, 1, 1, 2, 4, 4};
    int sibling[] = {-1, 2, -1, 4, -1, -1, 7, -1};
    int children[] = {1, 3, 5, -1, 6, -1, -1, -1};
    int *parent, *sib, *child, *preo, *ordered, *adj;
    ordered = (int *)malloc(sizeof(int)*N);
    if(ordered == NULL) {
        printf("Host allocation failed\n");
        return 1;
    }
    cudaMalloc((int **)&parent, sizeof(int)*N);
    cudaMalloc((int **)&sib, sizeof(int)*N);
    cudaMalloc((int **)&child, sizeof(int)*N);
    cudaMalloc((int **)&preo, sizeof(int)*N);
    cudaMalloc((int **)&adj, sizeof(int)*N*N);
    int adjacency[N][N];
    memset(adjacency, 0, sizeof(adjacency));
    // Mark each parent-child pair in both directions.
    for(int i=0; i<N; i++) {
        for(int j=0; j<N; j++) {
            if(parents[j]!=-1 && parents[j]==i) {
                adjacency[i][j] = 1;
                adjacency[j][i] = 1;
            }
        }
    }
    cudaMemcpy(parent, parents, sizeof(int)*N, cudaMemcpyHostToDevice);
    cudaMemcpy(sib, sibling, sizeof(int)*N, cudaMemcpyHostToDevice);
    cudaMemcpy(child, children, sizeof(int)*N, cudaMemcpyHostToDevice);
    cudaMemcpy(adj, adjacency, sizeof(int)*N*N, cudaMemcpyHostToDevice);
    dim3 grid(1);
    dim3 block(N, N);
    preorder<<<grid, block>>>(parent, sib, child, adj, preo);
    // Blocking copy: also waits for the kernel to finish.
    cudaMemcpy(ordered, preo, sizeof(int)*N, cudaMemcpyDeviceToHost);

    // Invert "node -> preorder number" into "preorder slot -> node".
    int preordered[N];
    for(int i=0; i<N; i++) {
        preordered[i] = -1;
    }
    for(int i=0; i<N; i++) {
        // A failed kernel leaves arbitrary values behind; never index with them.
        if(ordered[i] >= 1 && ordered[i] <= N) {
            preordered[ordered[i]-1] = i;
        }
    }
    for(int i=0; i<N; i++) {
        printf("%d ", preordered[i]);
    }
    printf("\n");
    free(ordered);
    cudaFree(parent);
    cudaFree(sib);
    cudaFree(child);
    // preo and adj were previously leaked.
    cudaFree(preo);
    cudaFree(adj);
    cudaDeviceReset();
    return 0;
}

You might also like