Professional Documents
Culture Documents
Openmp
Openmp
Openmp
1 #include <stdio.h>
2 #include <omp.h>
3
4 int main ()
5 {
6 int nthreads = 4;
7 omp_set_num_threads(nthreads);
8
9 #pragma omp parallel
10 {
11 int id = omp_get_thread_num();
12
13 printf("Hello World from thread = %d", id);
14 printf(" with %d threads\n",omp_get_num_threads());
15 }
16
17 printf("all done, with hopefully %d threads\n",nthreads);
18 }
这个例子可以很好地帮助我们理解什么是并发。
Exercise - 2
用多线程改写下面这段串行代码(它通过对 $4/(1+x^2)$ 在 $[0,1]$ 上做数值积分来计算 $\pi$):
我的第一个答案是这样的:
1 #include<stdio.h>
2 #include<omp.h>
3 double ans = 0;
4 const long num_steps = 100000;
5 double step = 1.0/(double) num_steps;
6
7 double solve(int id){
8 double sum=0;
9 for (int i = 10000*id; i<10000*(id+1); i++){
10 double x = (i+0.5)*step;
11 sum+=4.0/(1.0+x*x);
12 }
13 return sum*step;
14 }
15
16 int main(){
17
18 omp_set_num_threads(10);
19 #pragma omp parallel
20 {
21 int id = omp_get_thread_num();
22 double x = solve(id);
23 ans+=x;
24 }
25 printf("%f\n",ans);
26 return 0;
27 }
1 #include<stdio.h>
2 #include<omp.h>
3 double ans[10];
4 const long num_steps = 100000;
5 double step = 1.0/(double) num_steps;
6
7 double solve(int id){
8 double sum=0;
9 for (int i = 10000*id; i<10000*(id+1); i++){
10 double x = (i+0.5)*step;
11 sum+=4.0/(1.0+x*x);
12 }
13 return sum*step;
14 }
15
16 int main(){
17
18 omp_set_num_threads(10);
19 #pragma omp parallel
20 {
21 int id = omp_get_thread_num();
22 double x = solve(id);
23 ans[id]+=x;
24 }
25 double sum=0;
26 for(int i =0; i<10; i++){
27 sum+=ans[i];
28 }
29 printf("%f\n",sum);
30 return 0;
31 }
这样,第一个答案中多个线程同时执行 ans+=x 所造成的数据竞争(结果可能每次运行都不同)就不再出现,
因为每个线程只写数组中属于自己的那个元素,我们人为地消除了竞争。下面我们测试开启不同个数的线程时,
该程序的性能有什么区别。
1 #include <stdio.h>
2 #include <omp.h>
3
4 #define MAX_THREADS 4
5
6 static long num_steps = 100000000;
7 double step;
8 int main ()
9 {
10 int i,j;
11 double pi, full_sum = 0.0;
12 double start_time, run_time;
13 double sum[MAX_THREADS];
14
15 step = 1.0/(double) num_steps;
16
17
18 for (j=1;j<=MAX_THREADS ;j++) {
19
20 omp_set_num_threads(j);
21 full_sum=0.0;
22 start_time = omp_get_wtime();
23
24 #pragma omp parallel
25 {
26 int i;
27 int id = omp_get_thread_num();
28 int numthreads = omp_get_num_threads();
29 double x;
30
31 sum[id] = 0.0;
32
33 if (id == 0)
34 printf(" num_threads = %d",numthreads);
35
36 for (i=id;i< num_steps; i+=numthreads){
37 x = (i+0.5)*step;
38 sum[id] = sum[id] + 4.0/(1.0+x*x);
39 }
40 }
41
42 for(full_sum = 0.0, i=0;i<j;i++)
43 full_sum += sum[i];
44
45 pi = step * full_sum;
46 run_time = omp_get_wtime() - start_time;
47 printf("\n pi is %f in %f seconds %d thrds \n",pi,run_time,j);
48 }
49 }
运行结果如下(输出中线程数一直测到 10,应当是把 MAX_THREADS 改为 10 之后得到的)。可以看到,线程数增加后运行时间并没有缩短,反而比单线程更慢:
1 num_threads = 1
2 pi is 3.141593 in 0.325053 seconds 1 thrds
3
4 num_threads = 2
5 pi is 3.141593 in 1.315671 seconds 2 thrds
6
7 num_threads = 3
8 pi is 3.141593 in 0.948907 seconds 3 thrds
9
10 num_threads = 4
11 pi is 3.141593 in 0.877917 seconds 4 thrds
12
13 num_threads = 5
14 pi is 3.141593 in 0.966563 seconds 5 thrds
15
16 num_threads = 6
17 pi is 3.141593 in 0.868150 seconds 6 thrds
18
19 num_threads = 7
20 pi is 3.141593 in 0.957173 seconds 7 thrds
21
22 num_threads = 8
23 pi is 3.141593 in 0.923707 seconds 8 thrds
24
25 num_threads = 9
26 pi is 3.141593 in 1.000140 seconds 9 thrds
27
28 num_threads = 10
29 pi is 3.141593 in 1.020560 seconds 10 thrds
什么是false sharing
现代CPU使用缓存来加速内存访问,缓存按cache line(通常是64字节)为单位存储数据。当一个core修改
了某条cache line中的数据,其他core中缓存的同一条cache line的副本就会失效。这意味着如果另一个core
要访问这条cache line(即使访问的是其中另一个变量),它必须等待该cache line被写回并重新加载,
从而产生延迟。这就是所谓的伪共享(false sharing)。上面程序里的 sum[0]、sum[1]…… 是相邻的 double,
很可能落在同一条cache line上,所以各线程虽然只更新自己的元素,仍然会互相拖慢。伪共享可以显著降低
并发程序的性能,尤其是在多核处理器系统上。
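解决办法:给每个线程的累加变量做填充(padding),把 sum 声明成二维数组,每个线程只使用自己那一行的第 0 个元素;一行 8 个 double 正好 64 字节,因此不同线程的累加变量不会落在同一条cache line上: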
#include <stdio.h>
#include <omp.h>

#define MAX_THREADS 4
/*
 * Number of doubles per accumulator slot. With 8-byte doubles, consecutive
 * slots are 64 bytes apart, so (assuming 64-byte cache lines) two threads'
 * accumulators never fall in the same line.
 */
#define PAD 8

static long num_steps = 100000000;
double step;
double sum[MAX_THREADS][PAD];

/*
 * Approximates pi by midpoint-rule integration of 4/(1+x^2) over [0,1],
 * once for each requested team size 1..MAX_THREADS, and prints the result
 * and the wall-clock time of every run.
 *
 * Each thread accumulates into sum[id][0]; the rest of the row is padding
 * that keeps the per-thread accumulators away from each other.
 */
int main ()
{
    int i, j;
    double pi, full_sum = 0.0;
    double start_time, run_time;

    step = 1.0/(double) num_steps;

    for (j = 1; j <= MAX_THREADS; j++) {
        /* Team size actually delivered; the runtime may give fewer than j. */
        int nthreads_used = 1;

        omp_set_num_threads(j);
        start_time = omp_get_wtime();

        #pragma omp parallel
        {
            int i;
            int id = omp_get_thread_num();
            int numthreads = omp_get_num_threads();
            double x;

            /*
             * sum is global and survives from one value of j to the next,
             * so every run must start from zero; otherwise the second and
             * later runs add on top of the previous totals.
             */
            sum[id][0] = 0.0;

            if (id == 0) {
                /* Written by one thread only, read after the region ends. */
                nthreads_used = numthreads;
                printf(" num_threads = %d",numthreads);
            }

            /* Cyclic distribution: thread id takes steps id, id+numthreads, ... */
            for (i = id; i < num_steps; i += numthreads) {
                x = (i+0.5)*step;
                sum[id][0] += 4.0/(1.0+x*x);
            }
        }

        /* Combine only the slots that were written during this run. */
        for (full_sum = 0.0, i = 0; i < nthreads_used; i++)
            full_sum += sum[i][0];

        pi = step * full_sum;
        run_time = omp_get_wtime() - start_time;
        printf("\n pi is %f in %f seconds %d thrds \n",pi,run_time,j);
    }

    return 0;
}
Synchronization
Synchronization: bringing one or more threads to a well-defined and known point in their execution.
Synchronization is used to impose order constraints and to protect access to shared data.
最常见的两种 synchronization:
Barrier: each thread waits at the barrier until all threads arrive.
Mutual exclusion: define a block of code that only one thread at a time can execute.
Exercise - 3
我们可以把第二题的代码改为下面这个样子,既避免了竞争,也不需要数组
1 #include<stdio.h>
2 #include<omp.h>
3 double ans = 0;
4 const long num_steps = 100000;
5 double step = 1.0/(double) num_steps;
6
7 double solve(int id){
8 double sum=0;
9 for (int i = 10000*id; i<10000*(id+1); i++){
10 double x = (i+0.5)*step;
11 sum+=4.0/(1.0+x*x);
12 }
13 return sum*step;
14 }
15 int main(){
16
17 omp_set_num_threads(10);
18 double start_time = omp_get_wtime();
19 #pragma omp parallel
20 {
21 int id = omp_get_thread_num();
22 double x = solve(id);
23 #pragma omp critical
24 ans+=x;
25 }
26 double run_time = omp_get_wtime() - start_time;
27 printf("%f ",ans);
28 printf("%f\n", run_time);
29 return 0;
30 }
worksharing
Loop Construct
#pragma omp for:
schedule clause
reduction
Exercise -4
下面我们可以把第三题的代码再次简化:
1 #include<stdio.h>
2 #include<omp.h>
3 double ans = 0;
4 const long num_steps = 100000;
5 double step = 1.0/(double) num_steps;
6
7 int main(){
8
9 omp_set_num_threads(10);
10 double start_time = omp_get_wtime();
11 #pragma omp parallel for reduction(+:ans)
12
13 for (int i=0;i<num_steps;i++){
14 double x = (i+0.5)*step;
15 ans += 4.0/(1.0+x*x);
16 }
17
18 double run_time = omp_get_wtime() - start_time;
19 printf("%f ",ans*step);
20 printf("%f\n", run_time);
21 return 0;
22 }
Exercise -5
本作业要求我们找出 mandel.c 中的几处错误并修改它们,使得最终输出的答案正确。让我们先看看源代
码:
1 /*
2 ** PROGRAM: Mandelbrot area
3 **
4 ** PURPOSE: Program to compute the area of a Mandelbrot set.
5 ** Correct answer should be around 1.510659.
6 ** WARNING: this program may contain errors
7 **
8 ** USAGE: Program runs without input ... just run the executable
9 **
10 ** HISTORY: Written: (Mark Bull, August 2011).
11 ** Changed "comples" to "d_comples" to avoid collsion with
12 ** math.h complex type (Tim Mattson, September 2011)
13 */
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <math.h>
18 #include <omp.h>
19
20 # define NPOINTS 1000
21 # define MAXITER 1000
22
23 void testpoint(void);
24
25 struct d_complex{
26 double r;
27 double i;
28 };
29
30 struct d_complex c;
31 int numoutside = 0;
32
33 int main(){
34 int i, j;
35 double area, error, eps = 1.0e-5;
36
37 // Loop over grid of points in the complex plane which contains the Mandelbrot
set,
38 // testing each point to see whether it is inside or outside the set.
39
40 #pragma omp parallel for default(shared) private(c,eps)
41 for (i=0; i<NPOINTS; i++) {
42 for (j=0; j<NPOINTS; j++) {
43 c.r = -2.0+2.5*(double)(i)/(double)(NPOINTS)+eps;
44 c.i = 1.125*(double)(j)/(double)(NPOINTS)+eps;
45 testpoint();
46 }
47 }
48
49 // Calculate area of set and error estimate and output the results
50
51 area=2.0*2.5*1.125*(double)(NPOINTS*NPOINTS-numoutside)/(double)
(NPOINTS*NPOINTS);
52 error=area/(double)NPOINTS;
53
54 printf("Area of Mandlebrot set = %12.8f +/- %12.8f\n",area,error);
55 printf("Correct answer should be around 1.510659\n");
56
57 }
58
59 void testpoint(void){
60
61 // Does the iteration z=z*z+c, until |z| > 2 when point is known to be outside
set
62 // If loop count reaches MAXITER, point is considered to be inside the set
63
64 struct d_complex z;
65 int iter;
66 double temp;
67
68 z=c;
69 for (iter=0; iter<MAXITER; iter++){
70 temp = (z.r*z.r)-(z.i*z.i)+c.r;
71 z.i = z.r*z.i*2+c.i;
72 z.r = temp;
73 if ((z.r*z.r+z.i*z.i)>4.0) {
74 numoutside++;
75 break;
76 }
77 }
78
79 }
答案:
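答案如下。共有三处修改:(1)把内层循环变量 j 加入 private,否则各线程共享同一个 j;(2)把 eps 从 private 改为 firstprivate,否则线程内的 eps 没有初值;(3)把 testpoint 的函数体直接写进循环,使迭代用到的是线程私有的 c 而不是全局的 c,并用 atomic 保护 numoutside++: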
1 /*
2 ** PROGRAM: Mandelbrot area
3 **
4 ** PURPOSE: Program to compute the area of a Mandelbrot set.
5 ** Correct answer should be around 1.510659.
6 ** WARNING: this program may contain errors
7 **
8 ** USAGE: Program runs without input ... just run the executable
9 **
10 ** HISTORY: Written: (Mark Bull, August 2011).
11 ** Changed "comples" to "d_comples" to avoid collsion with
12 ** math.h complex type (Tim Mattson, September 2011)
13 */
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <math.h>
18 #include <omp.h>
19
20 # define NPOINTS 1000
21 # define MAXITER 1000
22
23 void testpoint(void);
24
25 struct d_complex{
26 double r;
27 double i;
28 };
29
30 struct d_complex c;
31 int numoutside = 0;
32
33 int main(){
34 int i, j;
35 double area, error, eps = 1.0e-5;
36
37
38 // Loop over grid of points in the complex plane which contains the Mandelbrot
set,
39 // testing each point to see whether it is inside or outside the set.
40
41 #pragma omp parallel for default(shared) private(c,j) firstprivate(eps)
42 for (i=0; i<NPOINTS; i++) {
43 for (j=0; j<NPOINTS; j++) {
44 c.r = -2.0+2.5*(double)(i)/(double)(NPOINTS)+eps;
45 c.i = 1.125*(double)(j)/(double)(NPOINTS)+eps;
46 struct d_complex z;
47 int iter;
48 double temp;
49
50 z=c;
51 for (iter=0; iter<MAXITER; iter++){
52 temp = (z.r*z.r)-(z.i*z.i)+c.r;
53 z.i = z.r*z.i*2+c.i;
54 z.r = temp;
55 if ((z.r*z.r+z.i*z.i)>4.0) {
56 #pragma omp atomic
57 numoutside++;
58 break;
59 }
60 }
61 }
62 }
63 // Calculate area of set and error estimate and output the results
64
65 area=2.0*2.5*1.125*(double)(NPOINTS*NPOINTS-numoutside)/(double)
(NPOINTS*NPOINTS);
66 error=area/(double)NPOINTS;
67
68 printf("Area of Mandlebrot set = %12.8f +/- %12.8f\n",area,error);
69 printf("Correct answer should be around 1.510659\n");
70 }
Exercise -6
在本任务中,我们需要考虑如何并行处理下面这一段代码:
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <omp.h>
4
5 #ifndef N
6 #define N 5
7 #endif
8 #ifndef FS
9 #define FS 38
10 #endif
11
12 struct node {
13 int data;
14 int fibdata;
15 struct node* next;
16 };
17
18 int fib(int n) {
19 int x, y;
20 if (n < 2) {
21 return (n);
22 } else {
23 x = fib(n - 1);
24 y = fib(n - 2);
25 return (x + y);
26 }
27 }
28
29 void processwork(struct node* p)
30 {
31 int n;
32 n = p->data;
33 p->fibdata = fib(n);
34 }
35
36
37 struct node* init_list(struct node* p) {
38 int i;
39 struct node* head = NULL;
40 struct node* temp = NULL;
41
42 head = malloc(sizeof(struct node));
43 p = head;
44 p->data = FS;
45 p->fibdata = 0;
46 for (i=0; i< N; i++) {
47 temp = malloc(sizeof(struct node));
48 p->next = temp;
49 p = temp;
50 p->data = FS + i + 1;
51 p->fibdata = i+1;
52 }
53 p->next = NULL;
54 return head;
55 }
56
57 int main(int argc, char *argv[]) {
58 double start, end;
59 struct node *p=NULL;
60 struct node *temp=NULL;
61 struct node *head=NULL;
62
63 printf("Process linked list\n");
64 printf(" Each linked list node will be processed by function
'processwork()'\n");
65 printf(" Each ll node will compute %d fibonacci numbers beginning with
%d\n",N,FS);
66
67 p = init_list(p);
68 head = p;
69
70 start = omp_get_wtime();
71 {
72 while (p != NULL) {
73 processwork(p);
74 p = p->next;
75 }
76 }
77
78 end = omp_get_wtime();
79 p = head;
80 while (p != NULL) {
81 printf("%d : %d\n",p->data, p->fibdata);
82 temp = p->next;
83 free (p);
84 p = temp;
85 }
86 free (p);
87
88 printf("Compute Time: %f seconds\n", end - start);
89
90 return 0;
91 }
简单来说,我们从头到尾遍历链表的每个节点,每走到一个节点执行一个函数,这个函数会修改这个节点的
属性。
解决办法是:我们先定义一个结构体数组,该数组的每个位置装一个节点。
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include "omp.h"
4
5 #define N 5
6 #define FS 38
7 #define NMAX 10
8
9 struct node {
10 int data;
11 int fibdata;
12 struct node* next;
13 };
14
15 int fib(int n) {
16 int x, y;
17 if (n < 2) {
18 return (n);
19 } else {
20 x = fib(n - 1);
21 y = fib(n - 2);
22 return (x + y);
23 }
24 }
25
26 void processwork(struct node* p)
27 {
28 int n;
29 n = p->data;
30 p->fibdata = fib(n);
31 }
32
33 struct node* init_list(struct node* p) {
34 int i;
35 struct node* head = NULL;
36 struct node* temp = NULL;
37
38 head = malloc(sizeof(struct node));
39 p = head;
40 p->data = FS;
41 p->fibdata = 0;
42 for (i=0; i< N; i++) {
43 temp = malloc(sizeof(struct node));
44 p->next = temp;
45 p = temp;
46 p->data = FS + i + 1;
47 p->fibdata = i+1;
48 }
49 p->next = NULL;
50 return head;
51 }
52
53 int main(int argc, char *argv[]) {
54 double start, end;
55 struct node *p=NULL;
56 struct node *temp=NULL;
57 struct node *head=NULL;
58 struct node *parr[NMAX];
59 int i, count=0;
60
61 printf("Process linked list\n");
62 printf(" Each linked list node will be processed by function
'processwork()'\n");
63 printf(" Each ll node will compute %d fibonacci numbers beginning with
%d\n",N,FS);
64
65 p = init_list(p);
66 head = p;
67
68
69 start = omp_get_wtime();
70 {
71 while (p != NULL) {
72 processwork(p);
73 p = p->next;
74 }
75 }
76
77 end = omp_get_wtime();
78
79 printf("serial Compute Time: %f seconds\n", end - start);
80
81
82 p = head;
83
84 start = omp_get_wtime();
85 {
86 // count number of items in the list. Strictly speaking this isn't
87 // needed since we know there are N elements in the list. But in
88 // most cases you don't know this and need to count nodes.
89 while (p != NULL) {
90 p = p->next;
91 count++;
92 }
93
94 // traverse the list and collect pointers into an array.
95 p = head;
96 for(i=0; i<count; i++) {
97 parr[i] = p;
98 p = p->next;
99 }
100
101 // do the work in parallel
102 #pragma omp parallel
103 {
104 #pragma omp single
105 printf(" %d threads \n",omp_get_num_threads());
106 #pragma omp for schedule(static,1)
107 for(i=0; i<count; i++)
108 processwork(parr[i]);
109 }
110 }
111
112 end = omp_get_wtime();
113 p = head;
114 while (p != NULL) {
115 printf("%d : %d\n",p->data, p->fibdata);
116 temp = p->next;
117 free (p);
118 p = temp;
119 }
120 free (p);
121
122 printf("Compute Time: %f seconds\n", end - start);
123
124 return 0;
125 }
Exercise -7
有没有觉得上一题的做法太麻烦了?聪明的计算机科学家们早就意识到了,因此他们提出了一个解决的办
法: task
task 指令相当于显式定义一个任务,常用在不规则循环(不适合用 parallel for 的循环,例如遍历链表的 while 循环)与递归函数中。
默认情况下一个任务只由一个线程执行。当线程遇到 task 指令时,既可能自己立即执行该任务,也可能将其放入任务池,
等待别的线程取走。
taskwait 指令显式等待当前任务此前生成的子任务执行结束,用于同步。
1 #include <omp.h>
2 #include <stdlib.h>
3 #include <stdio.h>
4
5
6 #ifndef N
7 #define N 5
8 #endif
9 #ifndef FS
10 #define FS 38
11 #endif
12
13 struct node {
14 int data;
15 int fibdata;
16 struct node* next;
17 };
18
19 struct node* init_list(struct node* p);
20 void processwork(struct node* p);
21 int fib(int n);
22
23 int fib(int n)
24 {
25 int x, y;
26 if (n < 2) {
27 return (n);
28 } else {
29 x = fib(n - 1);
30 y = fib(n - 2);
31 return (x + y);
32 }
33 }
34
35 void processwork(struct node* p)
36 {
37 int n, temp;
38 n = p->data;
39 temp = fib(n);
40
41 p->fibdata = temp;
42
43 }
44
45 struct node* init_list(struct node* p)
46 {
47 int i;
48 struct node* head = NULL;
49 struct node* temp = NULL;
50
51 head = malloc(sizeof(struct node));
52 p = head;
53 p->data = FS;
54 p->fibdata = 0;
55 for (i=0; i< N; i++) {
56 temp = malloc(sizeof(struct node));
57 p->next = temp;
58 p = temp;
59 p->data = FS + i + 1;
60 p->fibdata = i+1;
61 }
62 p->next = NULL;
63 return head;
64 }
65
66 int main()
67 {
68 double start, end;
69 struct node *p=NULL;
70 struct node *temp=NULL;
71 struct node *head=NULL;
72
73 printf("Process linked list\n");
74 printf(" Each linked list node will be processed by function
'processwork()'\n");
75 printf(" Each ll node will compute %d fibonacci numbers beginning with
%d\n",N,FS);
76
77 p = init_list(p);
78 head = p;
79
80 start = omp_get_wtime();
81
82 #pragma omp parallel
83 {
84 #pragma omp master
85 printf("Threads: %d\n", omp_get_num_threads());
86
87 #pragma omp single
88 {
89 p=head;
90 while (p) {
91 #pragma omp task firstprivate(p) //first private is required
92 {
93 processwork(p);
94 }
95 p = p->next;
96 }
97 }
98 }
99
100 end = omp_get_wtime();
101 p = head;
102 while (p != NULL) {
103 printf("%d : %d\n",p->data, p->fibdata);
104 temp = p->next;
105 free (p);
106 p = temp;
107 }
108 free (p);
109
110 printf("Compute Time: %f seconds\n", end - start);
111
112 return 0;
113 }
使用 task 计算斐波那契数:
1 #include<stdio.h>
2 #include<omp.h>
3 int fib(int n) {
4 int x, y;
5 if (n < 2) {
6 return n;
7 }
8 #pragma omp task shared (x)
9 x = fib(n - 1);
10 #pragma omp task shared(y)
11 y = fib(n - 2);
12 #pragma omp taskwait
13 // x and y must be marked shared: a local variable referenced in a task is firstprivate by default,
14 // so each task would assign to its own copy and the x + y below would read values that were never set.
15 return x + y;
16 }
17
18 int main() {
19 int NN = 3;
20 int x;
21 #pragma omp parallel
22 {
23 #pragma omp single
24 x = fib(NN);
25 }
26 printf("%d\n",x);
27
28 return 0;
29 }
30