OpenMP

Exercise - 1
Write a multithreaded program where each thread prints “hello world”.
1 #include <stdio.h>
2 #include <omp.h>
3
4 int main ()
5 {
6 int nthreads = 4;
7 omp_set_num_threads(nthreads);
8
9 #pragma omp parallel
10 {
11 int id = omp_get_thread_num();
12
13 printf("Hello World from thread = %d", id);
14 printf(" with %d threads\n",omp_get_num_threads());
15 }
16
17 printf("all done, with hopefully %d threads\n",nthreads);
18 }
这个例子可以很好地帮助我们理解什么是并发。
Exercise - 2
用多线程写下面这段代码：
1 static long num_steps = 100000;

2 double step;
3 int main(){
4 int i;
5 double x,pi,sum = 0.0;
6 step = 1.0/(double)num_steps;
7 for (int i = 0; i<num_steps; i++){
8 x = (i+0.5)*step;
9 sum+=4.0/(1.0+x*x);
10 }
11 pi = step*sum;
12 return 0;
13 }
我的第一个答案是这样的：
1 #include<stdio.h>
2 #include<omp.h>
3 double ans = 0;
4 const long num_steps = 100000;
5 double step = 1.0/(double) num_steps;
6
7 double solve(int id){
8 double sum=0;
9 for (int i = 10000*id; i<10000*(id+1); i++){
10 double x = (i+0.5)*step;
11 sum+=4.0/(1.0+x*x);
12 }
13 return sum*step;
14 }
15
16 int main(){
17
18 omp_set_num_threads(10);
20 {
22 double x = solve(id);
23 ans+=x;
24 }
25 printf("%f\n",ans);
26 return 0;
27 }
思路很简单：按照每个线程的 id 为其分配对应的循环区间，随后把各线程的结果累加到全局变量 ans 中。但是当我们多次运行这个程序时，会发现最终结果可能不是 3.141593，而是一些明显错误的值。这是数据竞争（data race）导致的：ans+=x 并不是原子操作，它包含“读取 ans、相加、写回 ans”三步，多个线程同时执行时，某些线程的更新会被其他线程覆盖。为了消除竞争，适当修改代码如下：
1 #include<stdio.h>
2 #include<omp.h>
3 double ans[10];
6
8 double sum=0;
9 for (int i = 10000*id; i<10000*(id+1); i++){
11 sum+=4.0/(1.0+x*x);
12 }
13 return sum*step;
14 }
15
16 int main(){
17
20 {
23 ans[id]+=x;
24 }
25 double sum=0;
26 for(int i =0; i<10; i++){
27 sum+=ans[i];
28 }
29 printf("%f\n",sum);
30 return 0;
31 }
这样上述的现象就不再出现，因为每个线程只写入属于自己的 ans[id]，我们人为地消除了竞争。下面我们测试使用不同的线程个数时，该程序的性能有什么区别。
2 #include <omp.h>
3
4 #define MAX_THREADS 4
5
7 double step;
8 int main ()
9 {
10 int i,j;
11 double pi, full_sum = 0.0;
12 double start_time, run_time;
13 double sum[MAX_THREADS];
14
15 step = 1.0/(double) num_steps;
16
17
18 for (j=1;j<=MAX_THREADS ;j++) {
19
20 omp_set_num_threads(j);
21 full_sum=0.0;
22 start_time = omp_get_wtime();
23
25 {
26 int i;
28 int numthreads = omp_get_num_threads();
29 double x;
30
31 sum[id] = 0.0;
32
33 if (id == 0)
34 printf(" num_threads = %d",numthreads);
35
36 for (i=id;i< num_steps; i+=numthreads){
37 x = (i+0.5)*step;
38 sum[id] = sum[id] + 4.0/(1.0+x*x);
39 }
40 }
41
42 for(full_sum = 0.0, i=0;i<j;i++)
43 full_sum += sum[i];
44
45 pi = step * full_sum;
46 run_time = omp_get_wtime() - start_time;
47 printf("\n pi is %f in %f seconds %d thrds \n",pi,run_time,j);
48 }
49 }
运行结果如下
1 num_threads = 1
2 pi is 3.141593 in 0.325053 seconds 1 thrds
3
4 num_threads = 2
6
7 num_threads = 3
9
10 num_threads = 4
12
13 num_threads = 5
15
16 num_threads = 6
18
19 num_threads = 7
21
22 num_threads = 8
24
25 num_threads = 9
27
28 num_threads = 10
这样的结果是出人意料的：为什么只用一个线程反而是最快的？下面我们引出 false sharing 的概念。
什么是false sharing
现代CPU使用缓存来加速内存访问，它们按cache line（通常是64字节）来存储数据。当一个core修改
了cache line中的数据，其他core中缓存了同一cache line的副本会失效。这意味着如果另一个core需
要访问同一cache line（即使是不同的变量），它需要等待原cache line被写回内存然后重新加载它，
导致延迟。这就是所谓的伪共享，伪共享可以显著降低并发程序的性能，尤其是在多核处理器系统
上。
由于数组在内存中是连续的，而一个 double 占 8 字节，所以我们定义的数组 sum（前面代码中的 ans 数组同理）中最多每八个相邻元素会落在同一个 64 字节的 cache line 上。每个线程虽然只修改自己的 sum[id]，却会使其他核心缓存的同一 cache line 失效，这导致每个线程需要反反复复地从它们共享的最底层的 cache 上面重新取出这些 cache line，这便导致了很差的性能。
解决办法：
Pad arrays so elements you use are on distinct cache lines
2 #include <omp.h>
3
4 #define MAX_THREADS 4
5
7 double step;
8 double sum[4][8];
9 int main ()
10 {
11 int i,j;
12 double pi, full_sum = 0.0;
13 double start_time, run_time;
14
15 step = 1.0/(double) num_steps;
16
17
18 for (j=1;j<=MAX_THREADS ;j++) {
19
20 omp_set_num_threads(j);
21 full_sum=0.0;
22 start_time = omp_get_wtime();
23
25 {
26 int i;
28 int numthreads = omp_get_num_threads();
29 double x;
30
31
32 if (id == 0)
33 printf(" num_threads = %d",numthreads);
34
35 for (i=id;i< num_steps; i+=numthreads){
36 x = (i+0.5)*step;
37 sum[id][0] += 4.0/(1.0+x*x);
38 }
39 }
40
41 for(full_sum = 0.0, i=0;i<4;i++)
42 full_sum += sum[i][0];
43
44 pi = step * full_sum;
45 run_time = omp_get_wtime() - start_time;
46 printf("\n pi is %f in %f seconds %d thrds \n",pi,run_time,j);
47 }
48 }
我们只能用padding 吗？如果想要用padding的话，我们必须知道这台机器的 cache line 大小是多少，并

且如果我们的代码放到另一台机器上就有可能失效。更好的办法见下面。
Synchronization
Synchronization: bringing one or more threads to a well-defined and known point in their execution.
Synchronization is used to impose order constraints and to protect access to shared data.
最常见的两种 synchronization：
Barrier: each thread waits at the barrier until all threads arrive.
Mutual exclusion: define a block of code that only one thread at a time can execute.
Exercise - 3
我们可以把第二题的代码改为下面这个样子，既避免了竞争，也不需要数组
1 #include<stdio.h>
2 #include<omp.h>
3 double ans = 0;
6
8 double sum=0;
9 for (int i = 10000*id; i<10000*(id+1); i++){
11 sum+=4.0/(1.0+x*x);
12 }
13 return sum*step;
14 }
15 int main(){
16
18 double start_time = omp_get_wtime();
20 {
23 #pragma omp critical
24 ans+=x;
25 }
26 double run_time = omp_get_wtime() - start_time;
27 printf("%f ",ans);
28 printf("%f\n", run_time);
29 return 0;
30 }
Worksharing
Loop construct
#pragma omp for：
schedule clause
reduction clause
Exercise -4
下面我们可以把第三题的代码再次简化：
1 #include<stdio.h>
2 #include<omp.h>
3 double ans = 0;
6
7 int main(){
8
10 double start_time = omp_get_wtime();
11 #pragma omp parallel for reduction(+:ans)
12
13 for (int i=0;i<num_steps;i++){
15 ans += 4.0/(1.0+x*x);
16 }
17
18 double run_time = omp_get_wtime() - start_time;
19 printf("%f ",ans*step);
20 printf("%f\n", run_time);
21 return 0;
22 }
Exercise -5
本作业要求我们找出 mandel.c 中的几处错误并修改它们，使得最终输出的答案正确。让我们先看看源代
码：
1 /*
2 ** PROGRAM: Mandelbrot area
3 **
4 ** PURPOSE: Program to compute the area of a Mandelbrot set.
5 ** Correct answer should be around 1.510659.
6 ** WARNING: this program may contain errors
7 **
8 ** USAGE: Program runs without input ... just run the executable
9 **
10 ** HISTORY: Written: (Mark Bull, August 2011).
11 ** Changed "comples" to "d_comples" to avoid collsion with
12 ** math.h complex type (Tim Mattson, September 2011)
13 */
14
16 #include <stdlib.h>
17 #include <math.h>
18 #include <omp.h>
19
20 # define NPOINTS 1000
21 # define MAXITER 1000
22
23 void testpoint(void);
24
25 struct d_complex{
26 double r;
27 double i;
28 };
29
30 struct d_complex c;
31 int numoutside = 0;
32
33 int main(){
34 int i, j;
35 double area, error, eps = 1.0e-5;
36
37 // Loop over grid of points in the complex plane which contains the Mandelbrot
set,
38 // testing each point to see whether it is inside or outside the set.
39
40 #pragma omp parallel for default(shared) private(c,eps)
41 for (i=0; i<NPOINTS; i++) {
42 for (j=0; j<NPOINTS; j++) {
43 c.r = -2.0+2.5*(double)(i)/(double)(NPOINTS)+eps;
44 c.i = 1.125*(double)(j)/(double)(NPOINTS)+eps;
45 testpoint();
46 }
47 }
48
49 // Calculate area of set and error estimate and output the results
50
51 area=2.0*2.5*1.125*(double)(NPOINTS*NPOINTS-numoutside)/(double)
(NPOINTS*NPOINTS);
52 error=area/(double)NPOINTS;
53
54 printf("Area of Mandlebrot set = %12.8f +/- %12.8f\n",area,error);
55 printf("Correct answer should be around 1.510659\n");
56
57 }
58
59 void testpoint(void){
60
61 // Does the iteration z=z*z+c, until |z| > 2 when point is known to be outside
set
62 // If loop count reaches MAXITER, point is considered to be inside the set
63
64 struct d_complex z;
65 int iter;
66 double temp;
67
68 z=c;
69 for (iter=0; iter<MAXITER; iter++){
70 temp = (z.r*z.r)-(z.i*z.i)+c.r;
71 z.i = z.r*z.i*2+c.i;
72 z.r = temp;
73 if ((z.r*z.r+z.i*z.i)>4.0) {
74 numoutside++;
75 break;
76 }
77 }
78
79 }
答案:
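第一个错误，也是最违背直觉的错误：内层循环变量 j 是 shared 的。在 C 语言中，parallel for 只会自动把与它关联的外层循环变量 i 设为 private；j 在并行区域之外声明，默认是 shared，多个线程会同时修改它，因此需要显式写上 private(j)。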
第二个错误：eps 虽然被设置为 private，但是每个线程的 private 副本并没有被初始化；应改用 firstprivate(eps)，让每个线程的副本以原变量的值初始化。
第三个错误：我们发现在函数 testpoint 内部有这么一句话 z=c，那么这个 c 是什么呢？是 parallel 构造里面的 private c，还是定义在最外边的全局 c？private 子句只对构造词法范围内的引用生效，而被调用函数中引用的文件作用域变量仍然是共享的（除非它被声明为 threadprivate）。经过测试也确实是最外边的 c，所以 testpoint 读到的并不是当前线程刚刚设置的值，那就错了。
第四个错误：在 numoutside++ 时可能会发生竞争
答案如下:
1 /*
2 ** PROGRAM: Mandelbrot area
3 **
4 ** PURPOSE: Program to compute the area of a Mandelbrot set.
5 ** Correct answer should be around 1.510659.
6 ** WARNING: this program may contain errors
7 **
8 ** USAGE: Program runs without input ... just run the executable
9 **
10 ** HISTORY: Written: (Mark Bull, August 2011).
11 ** Changed "comples" to "d_comples" to avoid collsion with
12 ** math.h complex type (Tim Mattson, September 2011)
13 */
14
17 #include <math.h>
18 #include <omp.h>
19
20 # define NPOINTS 1000
21 # define MAXITER 1000
22
23 void testpoint(void);
24
25 struct d_complex{
26 double r;
27 double i;
28 };
29
30 struct d_complex c;
31 int numoutside = 0;
32
33 int main(){
34 int i, j;
35 double area, error, eps = 1.0e-5;
36
37
38 // Loop over grid of points in the complex plane which contains the Mandelbrot
set,
39 // testing each point to see whether it is inside or outside the set.
40
41 #pragma omp parallel for default(shared) private(c,j) firstprivate(eps)
42 for (i=0; i<NPOINTS; i++) {
43 for (j=0; j<NPOINTS; j++) {
44 c.r = -2.0+2.5*(double)(i)/(double)(NPOINTS)+eps;
45 c.i = 1.125*(double)(j)/(double)(NPOINTS)+eps;
46 struct d_complex z;
47 int iter;
48 double temp;
49
50 z=c;
51 for (iter=0; iter<MAXITER; iter++){
52 temp = (z.r*z.r)-(z.i*z.i)+c.r;
53 z.i = z.r*z.i*2+c.i;
54 z.r = temp;
55 if ((z.r*z.r+z.i*z.i)>4.0) {
56 #pragma omp atomic
57 numoutside++;
58 break;
59 }
60 }
61 }
62 }
63 // Calculate area of set and error estimate and output the results
64
65 area=2.0*2.5*1.125*(double)(NPOINTS*NPOINTS-numoutside)/(double)
(NPOINTS*NPOINTS);
66 error=area/(double)NPOINTS;
67
68 printf("Area of Mandlebrot set = %12.8f +/- %12.8f\n",area,error);
69 printf("Correct answer should be around 1.510659\n");
70 }
Exercise -6
在本任务中，我们需要考虑如何并行处理下面这一段代码：
3 #include <omp.h>
4
5 #ifndef N
6 #define N 5
7 #endif
8 #ifndef FS
9 #define FS 38
10 #endif
11
12 struct node {
13 int data;
14 int fibdata;
15 struct node* next;
16 };
17
18 int fib(int n) {
19 int x, y;
20 if (n < 2) {
21 return (n);
22 } else {
23 x = fib(n - 1);
24 y = fib(n - 2);
25 return (x + y);
26 }
27 }
28
29 void processwork(struct node* p)
30 {
31 int n;
32 n = p->data;
33 p->fibdata = fib(n);
34 }
35
36
37 struct node* init_list(struct node* p) {
38 int i;
39 struct node* head = NULL;
40 struct node* temp = NULL;
41
42 head = malloc(sizeof(struct node));
43 p = head;
44 p->data = FS;
45 p->fibdata = 0;
46 for (i=0; i< N; i++) {
47 temp = malloc(sizeof(struct node));
48 p->next = temp;
49 p = temp;
50 p->data = FS + i + 1;
51 p->fibdata = i+1;
52 }
53 p->next = NULL;
54 return head;
55 }
56
57 int main(int argc, char *argv[]) {
58 double start, end;
59 struct node *p=NULL;
60 struct node *temp=NULL;
61 struct node *head=NULL;
62
63 printf("Process linked list\n");
64 printf(" Each linked list node will be processed by function
'processwork()'\n");
65 printf(" Each ll node will compute %d fibonacci numbers beginning with
%d\n",N,FS);
66
67 p = init_list(p);
68 head = p;
69
70 start = omp_get_wtime();
71 {
72 while (p != NULL) {
73 processwork(p);
74 p = p->next;
75 }
76 }
77
78 end = omp_get_wtime();
79 p = head;
81 printf("%d : %d\n",p->data, p->fibdata);
82 temp = p->next;
83 free (p);
84 p = temp;
85 }
86 free (p);
87
88 printf("Compute Time: %f seconds\n", end - start);
89
90 return 0;
91 }
简单来说，我们从头到尾遍历链表的每个节点，每走到一个节点执行一个函数，这个函数会修改这个节点的
属性。
并行处理这段代码的难点在于：while 循环的迭代次数事先未知，不能直接用 parallel for 并行；如果我们可以把 while 转化为 for 就好了。
解决办法是：我们先定义一个结构体指针数组，该数组的每个位置存放一个节点的指针。
3 #include "omp.h"
4
5 #define N 5
6 #define FS 38
7 #define NMAX 10
8
9 struct node {
10 int data;
11 int fibdata;
13 };
14
15 int fib(int n) {
16 int x, y;
17 if (n < 2) {
18 return (n);
19 } else {
20 x = fib(n - 1);
21 y = fib(n - 2);
22 return (x + y);
23 }
24 }
25
27 {
28 int n;
29 n = p->data;
30 p->fibdata = fib(n);
31 }
32
33 struct node* init_list(struct node* p) {
34 int i;
37
39 p = head;
40 p->data = FS;
41 p->fibdata = 0;
42 for (i=0; i< N; i++) {
44 p->next = temp;
45 p = temp;
46 p->data = FS + i + 1;
48 }
49 p->next = NULL;
50 return head;
51 }
52
53 int main(int argc, char *argv[]) {
58 struct node *parr[NMAX];
59 int i, count=0;
60
%d\n",N,FS);
64
66 head = p;
67
68
70 {
72 processwork(p);
73 p = p->next;
74 }
75 }
76
78
79 printf("serial Compute Time: %f seconds\n", end - start);
80
81
82 p = head;
83
85 {
86 // count number of items in the list. Strictly speaking this isn't
87 // needed since we know there are N elements in the list. But in
88 // most cases you don't know this and need to count nodes.
90 p = p->next;
91 count++;
92 }
93
94 // traverse the list and collect pointers into an array.
95 p = head;
96 for(i=0; i<count; i++) {
97 parr[i] = p;
98 p = p->next;
99 }
100
101 // do the work in parallel
103 {
104 #pragma omp single
105 printf(" %d threads \n",omp_get_num_threads());
106 #pragma omp for schedule(static,1)
107 for(i=0; i<count; i++)
108 processwork(parr[i]);
109 }
110 }
111
113 p = head;
116 temp = p->next;
117 free (p);
118 p = temp;
119 }
120 free (p);
121
123
124 return 0;
125 }
Exercise -7
有没有觉得上一题的做法太麻烦了？聪明的计算机科学家们早就意识到了，因此他们提出了一个解决的办
法： task
task 指令用于显式定义一个任务，常用在不规则循环（不适合用 parallel for 的循环）与递归函数中。
每个任务只会由一个线程执行。当线程遇到 task 指令时，既可能自己立即执行该任务，也可能将其放入任务池
等待别的线程取走。
taskwait 指令显式等待当前任务此前生成的子任务执行结束，用于同步。
1 #include <omp.h>
4
5
6 #ifndef N
7 #define N 5
8 #endif
9 #ifndef FS
10 #define FS 38
11 #endif
12
13 struct node {
14 int data;
15 int fibdata;
17 };
18
19 struct node* init_list(struct node* p);
20 void processwork(struct node* p);
21 int fib(int n);
22
23 int fib(int n)
24 {
25 int x, y;
26 if (n < 2) {
27 return (n);
28 } else {
29 x = fib(n - 1);
30 y = fib(n - 2);
31 return (x + y);
32 }
33 }
34
36 {
37 int n, temp;
38 n = p->data;
39 temp = fib(n);
40
41 p->fibdata = temp;
42
43 }
44
45 struct node* init_list(struct node* p)
46 {
47 int i;
50
52 p = head;
53 p->data = FS;
54 p->fibdata = 0;
55 for (i=0; i< N; i++) {
57 p->next = temp;
58 p = temp;
59 p->data = FS + i + 1;
61 }
62 p->next = NULL;
63 return head;
64 }
65
66 int main()
67 {
72
%d\n",N,FS);
76
78 head = p;
79
81
83 {
84 #pragma omp master
85 printf("Threads: %d\n", omp_get_num_threads());
86
88 {
89 p=head;
90 while (p) {
91 #pragma omp task firstprivate(p) //first private is required
92 {
93 processwork(p);
94 }
95 p = p->next;
96 }
97 }
98 }
99
101 p = head;
104 temp = p->next;
105 free (p);
106 p = temp;
107 }
108 free (p);
109
111
112 return 0;
113 }
使用 task 计算斐波那契数：
1 #include<stdio.h>
2 #include<omp.h>
3 int fib(int n) {
4 int x, y;
5 if (n < 2) {
6 return n;
7 }
8 #pragma omp task shared (x)
9 x = fib(n - 1);
10 #pragma omp task shared(y)
11 y = fib(n - 2);
12 #pragma omp taskwait
13 // 若不把 x、y 标记为 shared，它们在 task 中默认是 firstprivate，
14 // 任务只会修改自己的私有副本，taskwait 之后这里的 x、y 仍未被赋值
15 return x + y;
16 }
17
18 int main() {
19 int NN = 3;
20 int x;
22 {
24 x = fib(NN);
25 }
26 printf("%d\n",x);
27
28 return 0;
29 }
30

Openmp

Uploaded by

Copyright:

Available Formats

You might also like

Openmp

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Openmp

Uploaded by

Copyright:

Available Formats

Exercise - 1

Write a multithreaded program where each thread prints “hello world”.

1 static long num_steps = 100000;

思路很简单，按照每个线程的 id 为其分配对应的循环区间，随后累加所有的答案到全局变量 ans 中。但是

这样地结果是出人意料地，为什么只用一个线程反而是最快的? 下面我们引出 false sharing 的概念。

由于数组在内存中是连续的，所以我们定义的数组 ans 中的每四个元素都在一个 cache line 上，这导致每

Pad arrays so elements you use are on distinct cache lines

我们只能用padding 吗？如果想要用padding的话，我们必须知道这台机器的 cache line 大小是多少，并

第一个错误，也是最违背直觉的错误：我们的循环变量 j 是 shared 的

第二个错误： eps 虽然被设置为 private 但是其并没有被初始化。

第三个错误，我们发现在函数 testpoint 内部有这么一句话 z=c , 那么这个 c 是什么呢? 是定义在

第四个错误：在 numoutside++ 时可能会发生竞争

并行处理这段代码的难点在于 while 循环是不能被并行的，那如果我们可以考虑把 while 转化为 for 就好

You might also like

Openmp

Uploaded by

Copyright:

Available Formats

You might also like

Openmp

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Openmp

Uploaded by

Copyright:

Available Formats

Exercise - 1

Write a multithreaded program where each thread prints “hello world”.

1 static long num_steps = 100000;

思路很简单，按照每个线程的 id 为其分配对应的循环区间，随后累加所有的答案到全局变量 ans 中。但是

这样地结果是出人意料地，为什么只用一个线程反而是最快的? 下面我们引出 false sharing 的概念。

由于数组在内存中是连续的，所以我们定义的数组 ans 中的每四个元素都在一个 cache line 上，这导致每

Pad arrays so elements you use are on distinct cache lines

我们只能用padding 吗？如果想要用padding的话，我们必须知道这台机器的 cache line 大小是多少，并

第一个错误，也是最违背直觉的错误： 我们的循环变量 j 是 shared 的

第二个错误： eps 虽然被设置为 private 但是其并没有被初始化。

第三个错误，我们发现在函数 testpoint 内部有这么一句话 z=c , 那么这个 c 是什么呢? 是定义在

第四个错误：在 numoutside++ 时可能会发生竞争

并行处理这段代码的难点在于 while 循环是不能被并行的，那如果我们可以考虑把 while 转化为 for 就好

You might also like

第一个错误，也是最违背直觉的错误：我们的循环变量 j 是 shared 的