new 2

11 March 2016 18:19

#include <iostream>
#include <cuda_runtime.h>

// Each thread computes one element of the result vector:
// the dot product of one matrix row with the input vector.
__global__ void mul(float *mat, float *vec1, float *vec2, int size)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;

    // Threads beyond the vector length do no work
    if (index >= size)
        return;

    float sum = 0;
    for (int k = 0; k < size; k++)
    {
        sum += mat[index * size + k] * vec1[k];
    }
    vec2[index] = sum;
}
int main()
{
    int size = 32;

    // Allocate memory on the host
    float *mat_h = new float[size * size];
    float *vec1_h = new float[size];
    float *vec2_h = new float[size];

    float *mat_d;
    float *vec1_d;
    float *vec2_d;

    // Allocate memory on the GPU
    cudaMalloc((void**)&mat_d, size * size * sizeof(float));
    cudaMalloc((void**)&vec1_d, size * sizeof(float));
    cudaMalloc((void**)&vec2_d, size * sizeof(float));

    // Initialize values: fill the matrix (row-major) and the vector with 1
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
        {
            mat_h[i * size + j] = 1;
        }
        vec1_h[i] = 1;
    }

    // Copy CPU -> GPU
    cudaMemcpy(mat_d, mat_h, size * size * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(vec1_d, vec1_h, size * sizeof(float), cudaMemcpyHostToDevice);

    // Launch the CUDA kernel
    dim3 threads_per_block(16, 1, 1);
    // 3 blocks of 16 threads give 48 threads for 32 rows;
    // the extra threads exit through the bounds check in the kernel
    dim3 blocks(3, 1, 1);

    mul<<<blocks, threads_per_block>>>(mat_d, vec1_d, vec2_d, size);

    // Copy memory GPU -> CPU
    cudaMemcpy(vec2_h, vec2_d, size * sizeof(float), cudaMemcpyDeviceToHost);

    // Display results
    for (int i = 0; i < size; i++)
    {
        std::cout << vec2_h[i] << std::endl;
    }

    // Free GPU and host memory
    cudaFree(mat_d);
    cudaFree(vec1_d);
    cudaFree(vec2_d);
    delete[] mat_h;
    delete[] vec1_h;
    delete[] vec2_h;

    return 0;
}
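
Since both the matrix and the input vector are filled with ones, each output element is the dot product of 32 ones with 32 ones, so the program should print the value 32 on 32 lines. The CUDA calls in the listing ignore their return codes; a minimal sketch of an error check that could be added right after the kernel launch (not part of the original listing) is shown below.

// Hypothetical error check, assumed addition, not in the original listing:
// cudaGetLastError() reports launch-configuration errors,
// cudaDeviceSynchronize() waits for the kernel and reports runtime errors.
cudaError_t err = cudaGetLastError();
if (err == cudaSuccess)
    err = cudaDeviceSynchronize();
if (err != cudaSuccess)
    std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;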
