HPC MPI LAB 2 Vector Addition
Hardware Configuration:
CPU: Intel Core i5-8250U @ 8x 3.4 GHz
Number of sockets: 1
Cores per socket: 4
Threads per core: 2
L1 cache size: 32 KB
L2 cache size: 256 KB
L3 cache size (shared): 6 MB
RAM: 8 GB
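For reference, the figures above are the kind of information reported by standard Linux tools on each node (these commands are not part of the original report):
lscpu      # CPU model, sockets, cores per socket, threads per core, cache sizes
free -h    # installed RAM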
Code (point-to-point communication):
#include <stdio.h>
#include <mpi.h>

#define n 100000

double a[n], b[n];
double c[n] = {0};

int main(int argc, char *argv[])
{
    int myid, np, elements_per_process, n_elements_recieved;
    MPI_Status status;
    double startwtime, endwtime, totalTime;

    MPI_Init(&argc, &argv);
    startwtime = MPI_Wtime();
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Comm_size(MPI_COMM_WORLD, &np);

    if (myid == 0)
    {
        /* Root: initialise the input vectors. */
        for (int i = 0; i < n; i += 1)
        {
            a[i] = (i + 1);
            b[i] = (i + 1);
        }

        int idx, i;
        if (np == 1)
        {
            /* No workers: compute the whole result locally. */
            elements_per_process = n;
            for (i = 0; i < n; i++)
                c[i] = a[i] * b[i];
        }
        else
            elements_per_process = n / (np - 1);

        if (np > 1)
        {
            /* Send equal-sized chunks of a and b to workers 1 .. np-2. */
            for (i = 1; i < np - 1; i++)
            {
                idx = (i - 1) * elements_per_process;
                MPI_Send(&elements_per_process, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
                MPI_Send(&a[idx], elements_per_process, MPI_DOUBLE, i, 0, MPI_COMM_WORLD);
                MPI_Send(&b[idx], elements_per_process, MPI_DOUBLE, i, 0, MPI_COMM_WORLD);
            }
            /* The last worker (rank np-1) receives whatever is left over. */
            idx = (i - 1) * elements_per_process;
            int elements_left = n - idx;
            MPI_Send(&elements_left, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
            MPI_Send(&a[idx], elements_left, MPI_DOUBLE, i, 0, MPI_COMM_WORLD);
            MPI_Send(&b[idx], elements_left, MPI_DOUBLE, i, 0, MPI_COMM_WORLD);
        }

        /* Collect the partial results from every worker. */
        for (i = 1; i < np; i++)
        {
            int n_elements_recieved;
            idx = (i - 1) * elements_per_process;
            MPI_Recv(&n_elements_recieved, 1, MPI_INT, i, 0, MPI_COMM_WORLD, &status);
            MPI_Recv(&c[idx], n_elements_recieved, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &status);
            int sender = status.MPI_SOURCE;   /* rank that produced this chunk (unused) */
            (void)sender;
        }

        endwtime = MPI_Wtime();
        totalTime = endwtime - startwtime;
        printf("%f\n", totalTime);
    }
    else
    {
        /* Worker: receive the chunk size, then the two input slices. */
        MPI_Recv(&n_elements_recieved, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);

        char processor_name[MPI_MAX_PROCESSOR_NAME];
        double a_recv[n + 1000], b_recv[n + 1000], c_recv[n + 1000];
        int name_len;
        MPI_Get_processor_name(processor_name, &name_len);

        MPI_Recv(a_recv, n_elements_recieved, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &status);
        MPI_Recv(b_recv, n_elements_recieved, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &status);

        /* Element-wise product of the received slices. */
        for (int i = 0; i < n_elements_recieved; i++)
            c_recv[i] = a_recv[i] * b_recv[i];

        /* Return the chunk size and the partial result to the root. */
        MPI_Send(&n_elements_recieved, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
        MPI_Send(c_recv, n_elements_recieved, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
    }

    MPI_Finalize();
    return 0;
}
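To build the program before launching it (the source filename vector_mul.c is assumed; it is not given in the report):
mpicc -o vector_mul vector_mul.c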
For execution:
mpirun -n 25 -f machinefile ./vector_mul
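The "Collective Communication" timings in the observations below were measured with a variant of this program that uses MPI collective operations instead of explicit sends and receives; that listing is not reproduced in this section. The following is a minimal sketch of such a variant, assuming MPI_Scatterv/MPI_Gatherv and the same element-wise operation; it is an illustration of the approach, not the original code.

/* Hypothetical sketch (not the original listing): the same element-wise
 * vector product implemented with MPI collectives (MPI_Scatterv /
 * MPI_Gatherv) instead of explicit point-to-point messages. */
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

#define N 100000

int main(int argc, char *argv[])
{
    int myid, np;
    static double a[N], b[N], c[N];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Comm_size(MPI_COMM_WORLD, &np);

    if (myid == 0)
        for (int i = 0; i < N; i++) { a[i] = i + 1; b[i] = i + 1; }

    /* Per-rank chunk sizes and offsets; handles N not divisible by np. */
    int *counts = malloc(np * sizeof(int));
    int *displs = malloc(np * sizeof(int));
    for (int r = 0, offset = 0; r < np; r++) {
        counts[r] = N / np + (r < N % np ? 1 : 0);
        displs[r] = offset;
        offset += counts[r];
    }

    int local_n = counts[myid];
    double *la = malloc(local_n * sizeof(double));
    double *lb = malloc(local_n * sizeof(double));
    double *lc = malloc(local_n * sizeof(double));

    double start = MPI_Wtime();

    /* Distribute slices of a and b to all ranks. */
    MPI_Scatterv(a, counts, displs, MPI_DOUBLE, la, local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    MPI_Scatterv(b, counts, displs, MPI_DOUBLE, lb, local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    /* Local element-wise computation on every rank, including rank 0. */
    for (int i = 0; i < local_n; i++)
        lc[i] = la[i] * lb[i];

    /* Gather the partial results back on rank 0. */
    MPI_Gatherv(lc, local_n, MPI_DOUBLE, c, counts, displs, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    if (myid == 0)
        printf("%f\n", MPI_Wtime() - start);

    free(counts); free(displs); free(la); free(lb); free(lc);
    MPI_Finalize();
    return 0;
}

A collective version along these lines also lets every rank, including rank 0, take a share of the work, whereas the point-to-point version above leaves rank 0 idle during the computation.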
Observations
Collective Communication:

No. of MPI Processes (P)    Execution Time T(P) [s]    Speedup S(P)       Parallel Fraction f(P)
1                           0.019139                   -                  -
2                           0.002196                   8.715391621        1.770520926
4                           0.003118                   6.138229634        1.116115436
6                           0.127387                   0.1502429604       -6.787063065
8                           0.149017                   0.1284350108       -7.755473118
12                          0.236928                   0.08077981496      -12.41381472
16                          0.289745                   0.06605463425      -15.08158211
20                          0.385727                   0.04961799407      -20.16208293
24                          0.396934                   0.04821708395      -20.59777781
32                          0.361012                   0.05301485823      -18.43885058
48                          1.45738                    0.0131324706       -76.74600932
64                          2.124361                   0.009009297384     -111.7424224
128                         11.849663                  0.001615151418     -623.0042182
Point-to-Point Communication:

No. of MPI Processes (P)    Execution Time T(P) [s]    Speedup S(P)       Parallel Fraction f(P)
1                           0.001067                   -                  -
2                           0.00229                    0.4659388646       -2.292408622
4                           0.00259                    0.411969112        -1.903155264
6                           0.049124                   0.02172054393      -54.04723524
8                           0.127308                   0.008381248625     -135.2159593
12                          0.453921                   0.002350629295     -463.0014484
16                          0.030938                   0.0344883315       -29.86166823
20                          0.054872                   0.01944525441      -53.08045183
24                          0.064523                   0.01653673884      -62.05712889
32                          0.337267                   0.003163665582     -325.2531971
48                          3.010907                   0.0003543782654    -2880.861433
64                          4.838193                   0.0002205368823    -4605.347496
128                         23.48159                   0.00004543985309   -22179.38989
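The derived columns in both tables are consistent with the standard definitions below: speedup relative to the single-process run, and the parallel fraction obtained by solving Amdahl's law for f (these formulas are inferred from the tabulated values, not stated in the original report):

S(P) = T(1) / T(P)
f(P) = (1 - 1/S(P)) / (1 - 1/P)

For example, for the collective run at P = 2: S(2) = 0.019139 / 0.002196 ≈ 8.715 and f(2) = (1 - 1/8.715) / (1 - 1/2) ≈ 1.771, matching the table.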
Inference
The program's performance worsens under cluster execution because the same physical resources are shared among the different virtual machines.
At any point during the program's run, four operating systems (one Windows 11 host and three Ubuntu 20.10 guests) are running and drawing on the laptop's shared resources. This creates a performance bottleneck.
In addition to the operating systems, numerous background and foreground processes also consume system resources.
The algorithm is also computationally cheap, with only O(n) work in total, so each worker performs roughly n/(P-1) multiplications and the communication cost quickly dominates.
By 128 processes the communication overhead far exceeds the computation cost, causing the drastic drop in performance.
At 256 processes the laptop stopped responding altogether: with only 8 hardware threads available, 256 MPI processes oversubscribe the CPU by a factor of 32, and the scheduling and communication overhead becomes overwhelming.
At larger process counts, collective communication performs noticeably better than point-to-point communication (for example, 11.85 s versus 23.48 s at P = 128).