Download as pdf or txt
Download as pdf or txt
You are on page 1 of 20

COMPUTER ARCHITECTURE AND ORGANIZATION

NAME : T. RAGHAVENDRA

REG NO. : 18BCE0698

SLOT : G1+TG1

COURSERA : FUNDAMENTALS OF PARALLELISM ON INTEL ARCHITECTURE

CODE:

#include <cstdio>

// Prints a greeting that names the compiler used to build the program.
// The Intel branch is tested first, so it wins whenever both macros are
// defined; if neither macro is defined the program prints nothing.
int main() {
#if defined(__INTEL_COMPILER)
  // Only compiled with Intel Compiler
  printf("Hello world from Intel compiler");
#elif defined(__GNUC__)
  // Only compiled with GNU Compiler
  printf("Hello world from GNU compiler");
#endif
  return 0;
}

QUIZ : 2

CODE :
#include <cstdio>

#include <cstdlib>

#include <cmath>

#include <omp.h>

#include <mkl.h>

#include "distribution.h"

// Declaration of the diffusion routine that main() times and checks against
// ref_diffusion(). Its definition is not part of this listing (a comment in
// main() says it lives in worker.cc). It takes the same arguments as
// ref_diffusion() and its int result is compared with the reference count.
int diffusion(const int n_particles,

const int n_steps,

const float x_threshold,

const float alpha,

VSLStreamStatePtr rnStream);

// DO NOT MODIFY THIS FUNCTION //

// Unoptimized reference function.
//
// For each of n_particles particles, starts at x = 0 and applies n_steps
// displacements. Every step draws one uniform random number rn in [-1, 1)
// from rnStream and adds delta_max*sinf(alpha*rn)*expf(-rn*rn) to x.
// A particle is counted once if its final x is greater than x_threshold.
//
// Returns the number of counted particles.
//
// NOTE(review): delta_max is not declared in this listing; presumably it
// comes from "distribution.h" - confirm.
int ref_diffusion(const int n_particles,
                  const int n_steps,
                  const float x_threshold,
                  const float alpha,
                  VSLStreamStatePtr rnStream) {
  int n_escaped = 0;

  for (int i = 0; i < n_particles; i++) {
    float x = 0.0f;

    for (int j = 0; j < n_steps; j++) {
      float rn;
      vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, rnStream, 1, &rn, -1.0, 1.0);
      x += delta_max*sinf(alpha*rn)*expf(-rn*rn);
    }

    // x only exists per particle, so the test runs once after its last step.
    if (x > x_threshold) n_escaped++;
  }

  return n_escaped;
}

int main(int argc, char** argv) {

float alpha = 1.0f;

float x_threshold = 3.0f;

if(argc>1) {

alpha = atof(argv[1]);

if(argc>2) {

x_threshold = atof(argv[2]);

const int n_particles = 1<<17;

const int n_steps = 500;

VSLStreamStatePtr rnStream;

//initialize random stream

vslNewStream( &rnStream, VSL_BRNG_MT19937, 0);

//compute refernce data

const int ref_escaped = ref_diffusion(n_particles, n_steps, x_threshold, alpha, rnStream);

const int n_trials = 10;


const int skip_trials = 2;

double tsum = 0.0;

bool err = false;

//compute diffusion data using function defined in worker.cc and get the timing

const double t0 = omp_get_wtime();

int n_escaped = diffusion(n_particles, n_steps, x_threshold, alpha, rnStream);

const double t1 = omp_get_wtime();

//verify the filter data with refernce data

if(n_escaped - ref_escaped > 5*sqrt(ref_escaped)) {

printf("Error: n_escaped %d, while reference is %d\n", n_escaped, ref_escaped);

} else {

// Printing verification and performance

printf("%d\t(ref: %d)\t%f\n", n_escaped, ref_escaped, t1-t0);

QUIZ : 3
#include <cstdlib>

#include <cstdio>

#include <omp.h>

#include <mkl.h>

#include <vector>

#include <algorithm>

// Declaration of the filter routine that main() times and checks against
// filter_ref(). Its definition is not part of this listing (a comment in
// main() says it lives in worker.cc). It takes the same arguments as
// filter_ref() and reports its result through result_row_ind.
void filter(const long n, const long m, float *data, const float threshold, std::vector<long>
&result_row_ind);

// Reference function used to verify the output of filter().
//
// data is read as n rows of m consecutive floats (element (i, j) is
// data[i*m + j]). The index of every row whose float sum is strictly
// greater than threshold is appended to result_row_ind, and the whole
// vector (including anything it already held) is then sorted ascending.
//
//   n, m            number of rows / columns
//   data            n*m input values; not modified
//   threshold       rows with sum > threshold are selected
//   result_row_ind  output: receives the selected row indices
void filter_ref(const long n, const long m, float *data, const float threshold, std::vector<long> &result_row_ind) {
  for (long i = 0; i < n; i++) {
    // Accumulate in float, restarting from zero for each row.
    float sum = 0.0f;

    for (long j = 0; j < m; j++) {
      sum += data[i*m+j];
    }

    if (sum > threshold) {
      result_row_ind.push_back(i);
    }
  }

  std::sort(result_row_ind.begin(), result_row_ind.end());
}

int main(int argc, char** argv) {

float threshold = 0.5;

if(argc < 2) {

threshold = 0.5;

} else {

threshold = atof(argv[1]);

const long n = 1<<15; //rows

const long m = 1<<18; //columns

float *data = (float *) malloc((long)sizeof(float)*n*m);

long random_seed = (long)(omp_get_wtime()*1000.0) % 1000L;

VSLStreamStatePtr rnStream;

vslNewStream( &rnStream, VSL_BRNG_MT19937, random_seed);

//initialize 2D data

#pragma omp parallel for

for(long i =0; i < n; i++)


vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, rnStream, m, &data[m*i], -1.0, 1.0);

std::vector<long> ref_result_row_ind;

//compute the refernce data using unoptimized refernce function defined above

filter_ref(n, m, data, threshold, ref_result_row_ind);

//compute actual data using the function defined in worker.cc and get the timing

std::vector<long> result_row_ind;

const double t0 = omp_get_wtime();

filter(n, m, data, threshold, result_row_ind);

const double t1 = omp_get_wtime();

//verify the actual data and the refernce data

if(ref_result_row_ind.size() != result_row_ind.size()) {

// Result sizes did not match

printf("Error: The reference and result vectors have different sizes: %ld
%ld",ref_result_row_ind.size(), result_row_ind.size());

} else {

bool passed = true;

for(long i = 0; i < ref_result_row_ind.size(); i++) {

passed &= (ref_result_row_ind[i] == result_row_ind[i]);

if(passed) {

// Printing perf

printf("Time: %f\n", t1-t0);

} else {

// Results did not match

printf("Error: The reference and result vectors did not match");


}

QUIZ : 4

CODE :

#include <cstdio>
#include <cstdlib>

#include <mkl.h>

#include <omp.h>

#include <hbwmalloc.h>

// Declaration of the FFT routine that main() times and checks against
// runFFTs_ref(). Its definition is not part of this listing (a comment in
// main() says it lives in worker.cc). It takes the same arguments as
// runFFTs_ref().
void runFFTs( const size_t fft_size, const size_t num_fft, MKL_Complex8 *data,
DFTI_DESCRIPTOR_HANDLE *fftHandle);

// Do not modify.

// Reference function.
//
// Calls DftiComputeForward once per transform, with the descriptor that
// fftHandle points to, on num_fft consecutive chunks of data, each
// fft_size elements long (chunk i starts at data[i*fft_size]).
void runFFTs_ref( const size_t fft_size, const size_t num_fft, MKL_Complex8 *data,
                  DFTI_DESCRIPTOR_HANDLE *fftHandle) {
  for (size_t i = 0; i < num_fft; i++) {
    DftiComputeForward(*fftHandle, &data[i*fft_size]);
  }
}

int main() {

const size_t fft_size = 1L<<27;

const size_t num_fft = 32L;

MKL_Complex8 *data = (MKL_Complex8 *) _mm_malloc(sizeof(MKL_Complex8)*num_fft*fft_size,


4096);

MKL_Complex8 *ref_data = (MKL_Complex8 *)


_mm_malloc(sizeof(MKL_Complex8)*num_fft*fft_size, 4096);

//iniitialize data array and copy it to ref_data array

#pragma omp parallel

long random_seed = (long)(omp_get_wtime()*1000.0*omp_get_thread_num()) % 1000L;


VSLStreamStatePtr rnStream;

//initialize random stream

vslNewStream( &rnStream, VSL_BRNG_MT19937, random_seed);

#pragma omp for

for(size_t i = 0; i < num_fft; i++) {

//Intel MKL Rnadom stream generation function

vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, rnStream, 2*fft_size, (float *)


&data[i*fft_size], -1.0, 1.0);

//copy data to ref_data

#pragma omp for

for(long i = 0; i < (fft_size+2)*num_fft; i++) {

ref_data[i].real = data[i].real;

ref_data[i].imag = data[i].imag;

DFTI_DESCRIPTOR_HANDLE* fftHandle = new DFTI_DESCRIPTOR_HANDLE;

DftiCreateDescriptor(fftHandle, DFTI_SINGLE, DFTI_COMPLEX, 1, (MKL_LONG) fft_size);

DftiCommitDescriptor (*fftHandle);

//compute FFT using refernce function

runFFTs_ref(fft_size, num_fft, ref_data, fftHandle);

//compute and time FFT using function defined in worker.cc

const double t0 = omp_get_wtime();

runFFTs(fft_size, num_fft, data, fftHandle);

const double t1 = omp_get_wtime();


//verify the comuted FFT data with the reference FFT data

bool within_tolerance = true;

#pragma omp parallel for reduction(&: within_tolerance)

for(long i = 0; i < num_fft; i++) {

for(long j = 0; j < fft_size; j++) {

within_tolerance &= ((data[i*fft_size+j].real-ref_data[i*fft_size+j].real)

*(data[i*fft_size+j].real-ref_data[i*fft_size+j].real)

+(data[i*fft_size+j].imag-ref_data[i*fft_size+j].imag)

*(data[i*fft_size+j].imag-ref_data[i*fft_size+j].imag))

< 1.0e-6;

if(within_tolerance) {

// Printing performance

printf("Time: %f\n", t1-t0);

} else {

// Verification failed

printf("Error: Verification failed\n");

DftiFreeDescriptor (fftHandle);

_mm_free(ref_data);

_mm_free(data);

}
QUIZ : 5

CODE :

#include <cstdlib>

#include <cstdio>

#include <math.h>

#include <mpi.h>
#include <omp.h>

#include <assert.h>

#include "L.h"

// Finite difference method for strings

// d_(x, t+1) = L(x)*(d_(x+dx, t) + d_(x-dx, t))

// + 2.0f*(1.0f-L(x))*(d_(x,t))

// - d_(x, t-1)

// Declaration of the simulation routine that main() times and checks against
// simulate_ref(). Its definition is not part of this listing (a comment in
// main() says it lives in worker.cc). It takes the same arguments as
// simulate_ref(); main() reads n_segments floats through the returned pointer.
float * simulate(const float alpha, const long n_segments, const int n_steps, float *d_buf1, float
*d_buf2, const int rank, const int world_size, const long segments_per_process);

// Do not modify

// Reference simulate function used to verify data.
//
// Advances the displacement array by n_steps time steps. In each step this
// rank updates the segments [start_segment, last_segment) of the t+1 buffer,
// MPI_Allgather then exchanges the updated ranges between all ranks, and the
// two buffers swap roles.
//
//   alpha                 passed through to L()
//   n_segments            number of segments; only used to derive dx
//   n_steps               number of time steps
//   d_buf1                initially d(*, t)
//   d_buf2                initially d(*, t+1); on entry to each step it
//                         still holds d(*, t-1)
//   rank                  this process's rank
//   world_size            not used by this function
//   segments_per_process  number of segments each rank updates
//
// Returns whichever of d_buf1 / d_buf2 holds the newest data after the last
// swap.
float * simulate_ref(const float alpha, const long n_segments, const int n_steps, float *d_buf1, float
*d_buf2, const int rank, const int world_size, const long segments_per_process) {
  float* d_t  = d_buf1; // buffer for d(*, t)
  float* d_t1 = d_buf2; // buffer for d(*, t+1)

  // Kept as long so the bounds are not narrowed; index 0 is skipped, hence
  // the +1 offset.
  const long start_segment = segments_per_process*static_cast<long>(rank) + 1L;
  const long last_segment  = segments_per_process*(static_cast<long>(rank) + 1L) + 1L;

  const float dx = 1.0f/(float)n_segments;
  const float phase = 0.5f;

  for (int t = 0; t < n_steps; t++) {
    #pragma omp parallel for simd
    for (long i = start_segment; i < last_segment; i++) {
      const float L_x = L(alpha,phase,i*dx);

      d_t1[i] = L_x*(d_t[i+1] + d_t[i-1])
                +2.0f*(1.0f-L_x)*(d_t[i])
                - d_t1[i]; // The algorithm calls for d(i, t-1) here, but that is currently contained in d_t1
    }

    // In-place gather: every rank contributes segments_per_process floats
    // into the shared layout that starts at d_t1[1].
    MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &d_t1[1],
                  static_cast<int>(segments_per_process), MPI_FLOAT, MPI_COMM_WORLD);

    float* temp = d_t1; d_t1 = d_t; d_t = temp; // swap buffers
  }

  return d_t;
}

// Fills the two time-level buffers with the initial state.
//
// d_buf1[i] is set to 100*sin(3.14159*i*dx) with dx = 1/n_segments, and
// d_buf2 is derived from d_buf1 on the interior points. The first and last
// elements of both buffers are set to zero.
//
//   alpha       passed through to L()
//   n_segments  number of elements in each buffer
//   d_buf1      output
//   d_buf2      output
//
// NOTE(review): phase is n_segments/2 here while simulate_ref() uses 0.5f;
// confirm against L.h which value L() expects.
void initialize_buffers(const float alpha, const long n_segments, float *d_buf1, float *d_buf2) {
  // Nothing to fill, and the end-point writes below would be out of bounds.
  if (n_segments <= 0) {
    return;
  }

  const float dx = 1.0f/(float)n_segments;
  const float phase = (float)n_segments/2.0f;

  #pragma omp parallel for
  for (long i = 0; i < n_segments; i++) {
    d_buf1[i] = 100.0*sinf(3.14159*(float)i*dx);
  }

  // Both end points are zero in both time levels.
  d_buf1[0] = d_buf1[n_segments-1] = d_buf2[0] = d_buf2[n_segments-1] = 0.0f;

  for (long i = 1; i < n_segments-1; i++) {
    // d_1 = d_0 + v_0*dt + 0.5*a*dt^2
    d_buf2[i] = L(alpha,phase,i*dx)/2.0f*(d_buf1[i+1] + d_buf1[i-1])
                + (1.0f-L(alpha,phase,i*dx))*(d_buf1[i]);
  }
}

int main(int argc, char** argv) {

int ret = MPI_Init(&argc,&argv);

if (ret != MPI_SUCCESS) {
printf("error: could not initialize MPI\n");

MPI_Abort(MPI_COMM_WORLD, ret);

float alpha;

if (argc < 2) {

alpha = 0.2;

} else {

alpha = atof(argv[1]);

int world_size, rank;

MPI_Status stat;

MPI_Comm_size(MPI_COMM_WORLD, &world_size);

MPI_Comm_rank(MPI_COMM_WORLD, &rank);

const int n_steps = 1<<6;

const long n_segments = (1L<<25)+2L;

assert((n_segments-2L)%world_size == 0); // This will make MPI gather much easier to work with

const long segments_per_process = (n_segments-2)/(long)world_size;

//two buffers to store current and next position

float *d_buf1 = (float *) _mm_malloc(sizeof(float)*n_segments, 4096);

float *d_buf2 = (float *) _mm_malloc(sizeof(float)*n_segments, 4096);

// Getting verificatiobn data

float *d_ref = (float *) _mm_malloc(sizeof(float)*n_segments, 4096);

if(rank == 0) {
initialize_buffers(alpha, n_segments, d_buf1, d_buf2);

MPI_Bcast(d_buf1, n_segments, MPI_FLOAT, 0, MPI_COMM_WORLD);

MPI_Bcast(d_buf2, n_segments, MPI_FLOAT, 0, MPI_COMM_WORLD);

//compute reference data

float *d_ref_temp = simulate_ref(alpha, n_segments, n_steps, d_buf1, d_buf2, rank, world_size,


segments_per_process);

if(rank == 0) {

#pragma omp parallel for

for(long i = 0; i < n_segments; i++)

d_ref[i] = d_ref_temp[i];

//initialize buffers in rank0 and broadcast them to all the processes

if(rank == 0) {

initialize_buffers(alpha, n_segments, d_buf1, d_buf2);

MPI_Bcast(d_buf1, n_segments, MPI_FLOAT, 0, MPI_COMM_WORLD);

MPI_Bcast(d_buf2, n_segments, MPI_FLOAT, 0, MPI_COMM_WORLD);

//compute using the function in worker.cc and get the timing

const double t0 = omp_get_wtime();

float *d_final = simulate(alpha, n_segments, n_steps, d_buf1, d_buf2, rank, world_size,


segments_per_process);
const double t1 = omp_get_wtime();

//verify computed data with the reference data in rank 0

if(rank == 0) {

bool within_tolerance = true;

#pragma omp parallel for reduction(&: within_tolerance)

for(long i = 0; i < n_segments; i++)

within_tolerance &= ((d_ref[i] - d_final[i])*(d_ref[i] - d_final[i])) < 1.0e-6;;

if(within_tolerance) {

// Printing performance as measured on node 1

printf("Time: %f\n", t1-t0);

} else {

// Verification failed

printf("Error: verification failed %f\n", t1-t0);

MPI_Finalize();

You might also like