Computer Architecture and Organization

COMPUTER ARCHITECTURE AND ORGANIZATION
NAME : T. RAGHAVENDRA
REG NO. : 18BCE0698
SLOT : G1+TG1
COURSERA : FUNDAMENTALS OF PARALLELISM ON INTEL ARCHITECTURE
CODE:
#include <cstdio>
// Prints a greeting that identifies which compiler built the program.
// The Intel check is tested first, as in the original listing.
// NOTE(review): presumably because the Intel compiler may also define
// __GNUC__ -- confirm against the compiler documentation.
int main() {
#if defined(__INTEL_COMPILER)
    // Only compiled with Intel Compiler
    printf("Hello world from Intel compiler");
#elif defined(__GNUC__)
    // Only compiled with GNU Compiler
    printf("Hello world from GNU compiler");
#endif
    return 0;
}
QUIZ : 2
CODE :
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <omp.h>
#include <mkl.h>
#include "distribution.h"
// Optimized counterpart of ref_diffusion(); takes the same arguments and
// returns an escaped-particle count that main() compares against the
// reference count. Only declared here -- the driver's comments say the
// definition lives in worker.cc.
int diffusion(const int n_particles,
const int n_steps,
const float x_threshold,
const float alpha,
VSLStreamStatePtr rnStream);
// DO NOT MODIFY THIS FUNCTION //
//unoptimized reference function
// Unoptimized reference implementation used to validate diffusion().
//
// Every particle starts at x = 0 and takes n_steps steps. Each step draws one
// random number rn from rnStream (vsRngUniform with bounds -1.0 and 1.0) and
// adds delta_max*sinf(alpha*rn)*expf(-rn*rn) to x.
//
// Returns the number of particles whose position after the last step exceeds
// x_threshold.
//
// NOTE(review): the loop and function closing braces were missing from this
// listing. They are restored so that the threshold test runs once per
// particle, after its final step; the arithmetic itself is unchanged.
int ref_diffusion(const int n_particles,
                  const int n_steps,
                  const float x_threshold,
                  const float alpha,
                  VSLStreamStatePtr rnStream) {
    int n_escaped = 0;
    for (int i = 0; i < n_particles; i++) {
        float x = 0.0f;
        for (int j = 0; j < n_steps; j++) {
            float rn;
            vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, rnStream, 1, &rn, -1.0, 1.0);
            x += delta_max*sinf(alpha*rn)*expf(-rn*rn);
        }
        if (x > x_threshold) n_escaped++;
    }
    return n_escaped;
}
int main(int argc, char** argv) {
float alpha = 1.0f;
float x_threshold = 3.0f;
if(argc>1) {
alpha = atof(argv[1]);
if(argc>2) {
x_threshold = atof(argv[2]);
const int n_particles = 1<<17;
const int n_steps = 500;
VSLStreamStatePtr rnStream;
//initialize random stream
vslNewStream( &rnStream, VSL_BRNG_MT19937, 0);
//compute refernce data
const int ref_escaped = ref_diffusion(n_particles, n_steps, x_threshold, alpha, rnStream);
const int n_trials = 10;

const int skip_trials = 2;
double tsum = 0.0;
bool err = false;
//compute diffusion data using function defined in worker.cc and get the timing
const double t0 = omp_get_wtime();
int n_escaped = diffusion(n_particles, n_steps, x_threshold, alpha, rnStream);
//verify the filter data with refernce data
if(n_escaped - ref_escaped > 5*sqrt(ref_escaped)) {
printf("Error: n_escaped %d, while reference is %d\n", n_escaped, ref_escaped);
} else {
// Printing verification and performance
printf("%d\t(ref: %d)\t%f\n", n_escaped, ref_escaped, t1-t0);
QUIZ : 3
#include <cstdlib>
#include <cstdio>
#include <omp.h>
#include <mkl.h>
#include <vector>
#include <algorithm>
// Optimized counterpart of filter_ref(); takes the same arguments and fills
// result_row_ind, which main() compares element by element against the
// reference result. Only declared here -- the driver's comments say the
// definition lives in worker.cc.
void filter(const long n, const long m, float *data, const float threshold, std::vector<long>
&result_row_ind);
//reference function to verify data
// Reference function used to verify filter().
//
// Treats data as an n-by-m row-major matrix, sums each row in single
// precision, and appends the index of every row whose sum is strictly
// greater than threshold to result_row_ind. The vector is sorted in
// ascending order before returning. Existing contents of result_row_ind are
// kept (and take part in the final sort).
//
//   n               number of rows
//   m               number of columns
//   data            n*m values, row i occupying data[i*m] .. data[i*m + m-1]
//   threshold       row-sum cutoff (exclusive)
//   result_row_ind  output: indices of the selected rows
void filter_ref(const long n, const long m, float *data, const float threshold,
                std::vector<long> &result_row_ind) {
    for (long i = 0; i < n; i++) {
        float sum = 0.0f;
        for (long j = 0; j < m; j++) {
            sum += data[i*m+j];
        }
        if (sum > threshold) {
            result_row_ind.push_back(i);
        }
    }
    std::sort(result_row_ind.begin(), result_row_ind.end());
}
float threshold = 0.5;
if(argc < 2) {
threshold = 0.5;
} else {
threshold = atof(argv[1]);
const long n = 1<<15; //rows
const long m = 1<<18; //columns
float *data = (float *) malloc((long)sizeof(float)*n*m);
long random_seed = (long)(omp_get_wtime()*1000.0) % 1000L;
vslNewStream( &rnStream, VSL_BRNG_MT19937, random_seed);
//initialize 2D data
#pragma omp parallel for
for(long i =0; i < n; i++)

vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, rnStream, m, &data[m*i], -1.0, 1.0);
std::vector<long> ref_result_row_ind;
//compute the refernce data using unoptimized refernce function defined above
filter_ref(n, m, data, threshold, ref_result_row_ind);
//compute actual data using the function defined in worker.cc and get the timing
std::vector<long> result_row_ind;
filter(n, m, data, threshold, result_row_ind);
//verify the actual data and the refernce data
if(ref_result_row_ind.size() != result_row_ind.size()) {
// Result sizes did not match
printf("Error: The reference and result vectors have different sizes: %ld
%ld",ref_result_row_ind.size(), result_row_ind.size());
} else {
bool passed = true;
for(long i = 0; i < ref_result_row_ind.size(); i++) {
passed &= (ref_result_row_ind[i] == result_row_ind[i]);
if(passed) {
// Printing perf
printf("Time: %f\n", t1-t0);
} else {
// Results did not match
printf("Error: The reference and result vectors did not match");

}
QUIZ : 4
CODE :
#include <cstdio>
#include <cstdlib>
#include <mkl.h>
#include <omp.h>
#include <hbwmalloc.h>
// Optimized counterpart of runFFTs_ref(); takes the same arguments, and its
// output in data is compared against the reference output by main(). Only
// declared here -- the driver's comments say the definition lives in
// worker.cc.
void runFFTs( const size_t fft_size, const size_t num_fft, MKL_Complex8 *data,
DFTI_DESCRIPTOR_HANDLE *fftHandle);
// Do not modify.
// Reference function
// Reference FFT runner used to verify runFFTs().
//
// Calls DftiComputeForward once per transform, passing the start of each
// consecutive fft_size-element segment of data.
//
//   fft_size   number of complex elements per transform
//   num_fft    number of transforms (segments) in data
//   data       num_fft*fft_size complex values
//   fftHandle  pointer to the descriptor handle to use for every transform
void runFFTs_ref( const size_t fft_size, const size_t num_fft, MKL_Complex8 *data,
                  DFTI_DESCRIPTOR_HANDLE *fftHandle) {
    for (size_t i = 0; i < num_fft; i++) {
        DftiComputeForward (*fftHandle, &data[i*fft_size]);
    }
}
int main() {
const size_t fft_size = 1L<<27;
const size_t num_fft = 32L;
MKL_Complex8 *data = (MKL_Complex8 *) _mm_malloc(sizeof(MKL_Complex8)*num_fft*fft_size,

4096);
MKL_Complex8 *ref_data = (MKL_Complex8 *)

_mm_malloc(sizeof(MKL_Complex8)*num_fft*fft_size, 4096);
//iniitialize data array and copy it to ref_data array
#pragma omp parallel
long random_seed = (long)(omp_get_wtime()*1000.0*omp_get_thread_num()) % 1000L;

//initialize random stream
vslNewStream( &rnStream, VSL_BRNG_MT19937, random_seed);
#pragma omp for
for(size_t i = 0; i < num_fft; i++) {
//Intel MKL Rnadom stream generation function
vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, rnStream, 2*fft_size, (float *)

&data[i*fft_size], -1.0, 1.0);
//copy data to ref_data
#pragma omp for
for(long i = 0; i < (fft_size+2)*num_fft; i++) {
ref_data[i].real = data[i].real;
ref_data[i].imag = data[i].imag;
DFTI_DESCRIPTOR_HANDLE* fftHandle = new DFTI_DESCRIPTOR_HANDLE;
DftiCreateDescriptor(fftHandle, DFTI_SINGLE, DFTI_COMPLEX, 1, (MKL_LONG) fft_size);
DftiCommitDescriptor (*fftHandle);
//compute FFT using refernce function
runFFTs_ref(fft_size, num_fft, ref_data, fftHandle);
//compute and time FFT using function defined in worker.cc
runFFTs(fft_size, num_fft, data, fftHandle);

//verify the comuted FFT data with the reference FFT data
bool within_tolerance = true;
#pragma omp parallel for reduction(&: within_tolerance)
for(long i = 0; i < num_fft; i++) {
for(long j = 0; j < fft_size; j++) {
within_tolerance &= ((data[i*fft_size+j].real-ref_data[i*fft_size+j].real)
*(data[i*fft_size+j].real-ref_data[i*fft_size+j].real)
+(data[i*fft_size+j].imag-ref_data[i*fft_size+j].imag)
*(data[i*fft_size+j].imag-ref_data[i*fft_size+j].imag))
< 1.0e-6;
if(within_tolerance) {
// Printing performance
} else {
// Verification failed
printf("Error: Verification failed\n");
DftiFreeDescriptor (fftHandle);
_mm_free(ref_data);
_mm_free(data);
}
QUIZ : 5
CODE :
#include <cstdlib>
#include <cstdio>
#include <math.h>
#include <mpi.h>
#include <omp.h>
#include <assert.h>
#include "L.h"
// Finite difference method for strings
// d_(x, t+1) = L(x)*(d_(x+dx, t) + d_(x-dx, t))
// + 2.0f*(1.0f-L(x))*(d_(x,t))
// - d_(x, t-1)
// Optimized counterpart of simulate_ref(); takes the same arguments and
// returns a pointer to the final displacement data, which main() compares
// against the reference on rank 0. Only declared here -- the driver's
// comments say the definition lives in worker.cc.
float * simulate(const float alpha, const long n_segments, const int n_steps, float *d_buf1, float
*d_buf2, const int rank, const int world_size, const long segments_per_process);
// Do not modify
//reference simulate function to verify data
// Reference simulate function used to verify simulate().
//
// Advances the displacement array by n_steps time steps. In every step this
// rank updates indices [start_segment, last_segment), i.e. its own block of
// segments_per_process interior points (index 0 is never written), then
// MPI_Allgather exchanges the updated blocks between ranks and the two
// buffers swap roles.
//
//   d_buf1  on entry: d(*, t)
//   d_buf2  on entry: the other time level; overwritten with d(*, t+1)
//
// Returns the buffer that holds the most recently computed time level (one
// of d_buf1 / d_buf2, depending on the number of swaps).
//
// NOTE(review): the closing braces were missing from this listing; they are
// restored so that the gather and the swap happen once per time step. The
// arithmetic is unchanged.
float * simulate_ref(const float alpha, const long n_segments, const int n_steps, float *d_buf1,
                     float *d_buf2, const int rank, const int world_size,
                     const long segments_per_process) {
    float* d_t = d_buf1;  // buffer for d(*, t)
    float* d_t1 = d_buf2; // buffer for d(*, t+1)
    const int start_segment = segments_per_process*((long)rank) +1L;
    const int last_segment = segments_per_process*((long)rank+1L)+1L;
    const float dx = 1.0f/(float)n_segments;
    const float phase = 0.5f;
    for (int t = 0; t < n_steps; t++) {
#pragma omp parallel for simd
        for (long i = start_segment; i < last_segment; i++) {
            const float L_x = L(alpha,phase,i*dx);
            d_t1[i] = L_x*(d_t[i+1] + d_t[i-1])
                      +2.0f*(1.0f-L_x)*(d_t[i])
                      - d_t1[i]; // The algorithm calls for d(i, t-1) here, but that is currently contained in d_t1
        }
        MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &d_t1[1], segments_per_process,
                      MPI_FLOAT, MPI_COMM_WORLD);
        float* temp = d_t1; d_t1 = d_t; d_t = temp; // swap buffers
    }
    return d_t;
}
// Fills the two displacement buffers with the starting state.
//
//   d_buf1  set to 100*sin(3.14159 * i * dx) for every index i
//   d_buf2  interior points set from d_buf1 and L() (see the formula below)
//
// The first and last element of both buffers are forced to 0. Both buffers
// must hold at least n_segments floats.
void initialize_buffers(const float alpha, const long n_segments, float *d_buf1, float *d_buf2) {
    const float dx = 1.0f/(float)n_segments;
    const float phase = (float)n_segments/2.0f;
    for (long i = 0; i < n_segments; i++) {
        d_buf1[i] = 100.0*sinf(3.14159*(float)i*dx);
    }
    // Pin both ends of both buffers to zero.
    d_buf1[0] = d_buf1[n_segments-1] = d_buf2[0] = d_buf2[n_segments-1] = 0.0f;
    for (long i = 1; i < n_segments-1; i++) {
        // d_1 = d_0 + v_0*dt + 0.5*a*dt^2
        d_buf2[i] = L(alpha,phase,i*dx)/2.0f*(d_buf1[i+1] + d_buf1[i-1])
                    + (1.0f-L(alpha,phase,i*dx))*(d_buf1[i]);
    }
}
int ret = MPI_Init(&argc,&argv);
if (ret != MPI_SUCCESS) {
printf("error: could not initialize MPI\n");
MPI_Abort(MPI_COMM_WORLD, ret);
float alpha;
if (argc < 2) {
alpha = 0.2;
} else {
alpha = atof(argv[1]);
int world_size, rank;
MPI_Status stat;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
const int n_steps = 1<<6;
const long n_segments = (1L<<25)+2L;
assert((n_segments-2L)%world_size == 0); // This will make MPI gather much easier to work with
const long segments_per_process = (n_segments-2)/(long)world_size;
//two buffers to store current and next position
float *d_buf1 = (float *) _mm_malloc(sizeof(float)*n_segments, 4096);
float *d_buf2 = (float *) _mm_malloc(sizeof(float)*n_segments, 4096);
// Getting verificatiobn data
float *d_ref = (float *) _mm_malloc(sizeof(float)*n_segments, 4096);
if(rank == 0) {
initialize_buffers(alpha, n_segments, d_buf1, d_buf2);
MPI_Bcast(d_buf1, n_segments, MPI_FLOAT, 0, MPI_COMM_WORLD);
//compute reference data
float *d_ref_temp = simulate_ref(alpha, n_segments, n_steps, d_buf1, d_buf2, rank, world_size,

segments_per_process);
if(rank == 0) {
for(long i = 0; i < n_segments; i++)
d_ref[i] = d_ref_temp[i];
//initialize buffers in rank0 and broadcast them to all the processes
if(rank == 0) {
initialize_buffers(alpha, n_segments, d_buf1, d_buf2);
//compute using the function in worker.cc and get the timing
float *d_final = simulate(alpha, n_segments, n_steps, d_buf1, d_buf2, rank, world_size,

segments_per_process);
//verify computed data with the reference data in rank 0
if(rank == 0) {
bool within_tolerance = true;
#pragma omp parallel for reduction(&: within_tolerance)
for(long i = 0; i < n_segments; i++)
within_tolerance &= ((d_ref[i] - d_final[i])*(d_ref[i] - d_final[i])) < 1.0e-6;;
if(within_tolerance) {
// Printing performance as measured on node 1
} else {
// Verification failed
printf("Error: verification failed %f\n", t1-t0);
MPI_Finalize();

Computer Architecture and Organization

Uploaded by

Document Information

Original Description:

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Computer Architecture and Organization

Uploaded by

Copyright:

Available Formats

COMPUTER ARCHITECTURE AND ORGANIZATION

REG NO. : 18BCE0698

COURSERA : FUNDAMENTALS OF PARALLELISM ON INTEL

// Only compiled with Intel Compiler

printf("Hello world from Intel compiler");

printf("Hello world from GNU compiler");

int diffusion(const int n_particles,

const int n_steps,

const float x_threshold,

const float alpha,

// DO NOT MODIFY THIS FUNCTION //

//unoptimized reference function

int ref_diffusion(const int n_particles,

const int n_steps,

const float x_threshold,

const float alpha,

for (int i = 0; i < n_particles; i++) {

for (int j = 0; j < n_steps; j++) {

vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, rnStream, 1, &rn, -1.0, 1.0);

if (x > x_threshold) n_escaped++;

int main(int argc, char** argv) {

float alpha = 1.0f;

float x_threshold = 3.0f;

const int n_particles = 1<<17;

const int n_steps = 500;

//initialize random stream

vslNewStream( &rnStream, VSL_BRNG_MT19937, 0);

//compute refernce data

const int ref_escaped = ref_diffusion(n_particles, n_steps, x_threshold, alpha, rnStream);

const int n_trials = 10;

double tsum = 0.0;

bool err = false;

const double t0 = omp_get_wtime();

int n_escaped = diffusion(n_particles, n_steps, x_threshold, alpha, rnStream);

const double t1 = omp_get_wtime();

//verify the filter data with refernce data

if(n_escaped - ref_escaped > 5*sqrt(ref_escaped)) {

printf("Error: n_escaped %d, while reference is %d\n", n_escaped, ref_escaped);

// Printing verification and performance

printf("%d\t(ref: %d)\t%f\n", n_escaped, ref_escaped, t1-t0);

//reference function to verify data

for(long i = 0; i < n; i++){

for(long j = 0; j < m; j++) {

if(sum > threshold)

int main(int argc, char** argv) {

float threshold = 0.5;

const long n = 1<<15; //rows

const long m = 1<<18; //columns

float *data = (float *) malloc((long)sizeof(float)*n*m);

long random_seed = (long)(omp_get_wtime()*1000.0) % 1000L;

vslNewStream( &rnStream, VSL_BRNG_MT19937, random_seed);

#pragma omp parallel for

for(long i =0; i < n; i++)

filter_ref(n, m, data, threshold, ref_result_row_ind);

const double t0 = omp_get_wtime();

filter(n, m, data, threshold, result_row_ind);

const double t1 = omp_get_wtime();

//verify the actual data and the refernce data

// Result sizes did not match

bool passed = true;

for(long i = 0; i < ref_result_row_ind.size(); i++) {

passed &= (ref_result_row_ind[i] == result_row_ind[i]);

float data = (float ) malloc((long)sizeof(float)nm);

DftiComputeForward (fftHandle, &data[ifft_size]);

MKL_Complex8 data = (MKL_Complex8 ) _mm_malloc(sizeof(MKL_Complex8)num_fftfft_size,

MKL_Complex8 ref_data = (MKL_Complex8 )

long random_seed = (long)(omp_get_wtime()1000.0omp_get_thread_num()) % 1000L;

vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, rnStream, 2fft_size, (float )

within_tolerance &= ((data[ifft_size+j].real-ref_data[ifft_size+j].real)

// d_1 = d_0 + v_0dt + 0.5a*dt^2

d_buf2[i] = L(alpha,phase,idx)/2.0f(d_buf1[i+1] + d_buf1[i-1]) + (1.0f-

float d_buf1 = (float ) _mm_malloc(sizeof(float)*n_segments, 4096);

float d_buf2 = (float ) _mm_malloc(sizeof(float)*n_segments, 4096);

float d_ref = (float ) _mm_malloc(sizeof(float)*n_segments, 4096);