1 nbabel: opencl + stdcl version
2
3 this opencl version of nbabel is based on the OpenCL nbody example
4 provided with stdcl ( http://www.browndeertechnology.com/opensource.html )
5
6 to compile:
7 cc -O3 -I/path/to/OpenCL/include -I/path/to/stdcl/include \
8 -o nbody nbody.c -L/path/to/OpenCL/lib -lOpenCL -lpthread -ldl \
9 -L/path/to/stdcl/lib -lstdcl
10
11 to run:
12 ./nbody < input1k
13
14 CLCONTEXT can be set to stdcpu or stdgpu to use CPU or GPU
15
16 tested with ATI Stream SDK OpenCL
17 it uses stdcl: A Simplified C Interface for OpenCL, which can be found at
18 http://www.browndeertechnology.com/opensource.html
19
20 results:
21
22 system: CPU - Intel(R) Core(TM)2 CPU T7200 @ 2Ghz
23 OpenSuse 11.2 Linux 2.6.31.14 SMP x86_64
24 ati-stream-sdk-v2.1, stdcl 1.0-RC2
25 JIT compilation
26
27 N time dE/E0
28 128 0:00.54s 3.576278e-07
29 256 0:01.10s 1.907349e-06
30 512 0:03.05s 3.934892e-02
31 1k 0:10.48s 1.423705e-02
32 2k 0:40.37s 8.257875e-03
33 4k 2:39.81s 2.035384e-03
34 8k 11:01.17s 2.739911e-03
35
36 system: GPU - nvidia 9800GTX (Host: Intel(R) Core(TM)2 Quad CPU Q9550 @ 2.83GHz)
37 Fedora Linux 2.6.35.9 SMP x86_64
38 ati-stream-sdk-v2.1, stdcl 1.0-RC2, nvidia driver 260.19.21, CUDA OpenCL 1.0
39 JIT compilation
40
41 N time dE/E0
42 128 0:00.87s 4.768371e-07
43 256 0:00.90s 1.668931e-06
44 512 0:00.99s 3.934951e-02
45 1k 0:01.11s 1.423682e-02
46 2k 0:01.85s 8.257935e-03
47 4k 0:02.31s 2.035384e-03
48 8k 0:07.10s 2.740031e-03
49 16k 0:25.07s 1.492861e-03
50
1 /* nbabel: opencl + stdcl version
2
3 this opencl version of nbabel is based on the OpenCL nbody example
4 provided with stdcl ( http://www.browndeertechnology.com/opensource.html )
5 it uses stdcl.
6
7 to compile:
8 cc -O3 -I/path/to/OpenCL/include -I/path/to/stdcl/include \
9 -o nbody nbody.c -L/path/to/OpenCL/lib -lOpenCL -lpthread -ldl \
10 -L/path/to/stdcl/lib -lstdcl
11
12 to run:
13 ./nbody < input1k
14
15 CLCONTEXT can be set to stdcpu or stdgpu to use CPU or GPU
16 */
17 #include <stdcl.h>
18 #include <math.h>
19
20 #define CLCONTEXT stdgpu
21 #define NMAX (1<<16)
22 #define LOCALMEMSIZE 256
23 #define NTHREAD 64
24
25
26 int readbodies( int n, cl_float4* pos0, cl_float4* pos1,cl_float4* vel,
27 cl_float4* acc)
28 {
29 int dummy;
30 float mass,x,y,z,vx,vy,vz;
31 int count=0.;
32
33 while(scanf("%d %f %f %f %f %f %f %f\n", &dummy, &mass,
34 &x, &y, &z, &vx, &vy, &vz) != EOF)
35 {
36 pos0[count]= (cl_float4) {x,y,z,mass};
37 pos1[count]= pos0[count];
38 vel[count]= (cl_float4) {vx,vy,vz,0.0f};
39 acc[count]= (cl_float4) {0.0f,0.0f,0.0f,0.0f};
40 count++;
41 if( count >= NMAX ) return -1;
42 }
43
44 return count;
45 }
46
47 void initialize( int n, float dt,cl_float4* pos, cl_float4* pos1,cl_float4* vel,
48 cl_float4* acc)
49 {
50 int i;
51 for(i=0;i<n;i++)
52 {
53 cl_float4_x(pos1[i]) = cl_float4_x(pos[i])+dt*cl_float4_x(vel[i])+0.
54 5f*dt*dt*cl_float4_x(acc[i]);
55 cl_float4_y(pos1[i]) = cl_float4_y(pos[i])+dt*cl_float4_y(vel[i])+0.
56 5f*dt*dt*cl_float4_y(acc[i]);
57 cl_float4_z(pos1[i]) = cl_float4_z(pos[i])+dt*cl_float4_z(vel[i])+0.
58 5f*dt*dt*cl_float4_z(acc[i]);
59 cl_float4_w(pos1[i]) = cl_float4_w(pos[i]);
60 }
61 }
62
63 void diagnostics( int n, cl_float4* pos, cl_float4* vel, cl_float4* acc )
64 {
65 int i;
66 float ek,ep,et,de;
67 static float e0=0;
68
69 cl_float4 center = (cl_float4){0.0f,0.0f,0.0f,0.0f};
70
71 for(i=0;i<n;i++) {
72 cl_float4_x(center) += cl_float4_w(pos[i])*cl_float4_x(pos[i]);
73 cl_float4_y(center) += cl_float4_w(pos[i])*cl_float4_y(pos[i]);
74 cl_float4_z(center) += cl_float4_w(pos[i])*cl_float4_z(pos[i]);
75 cl_float4_w(center) += cl_float4_w(pos[i]);
76 }
77
78 cl_float4_x(center) /= cl_float4_w(center);
79 cl_float4_y(center) /= cl_float4_w(center);
80 cl_float4_z(center) /= cl_float4_w(center);
81 cl_float4_w(center) /= n;
82
83 printf("center => {%e,%e,%e}, ",
84 cl_float4_x(center),cl_float4_y(center),cl_float4_z(center) );
85
86 ep=0;ek=0;
87 for(i=0;i<n;i++) {
88
89 ek+= 0.5f*cl_float4_w(pos[i])*(cl_float4_x(vel[i])*cl_float4_x(vel[i])+
90 cl_float4_y(vel[i])*cl_float4_y(vel[i])+
91 cl_float4_z(vel[i])*cl_float4_z(vel[i]));
92 ep+= 0.5f*cl_float4_w(pos[i])*cl_float4_w(acc[i]);
93 }
94 et=ek+ep;
95 if(e0==0.)
96 {
97 de=0.;
98 e0=et;
99 } else
100 {
101 de=fabs((et-e0)/e0);
102 }
103 printf("ek,ep,et,de/e0 => %e, %e, %e, %e\n", ek,ep,et,de);
104 }
105
106 int main(int argc, char** argv)
107 {
108 int step,burst;
109
110 int nparticle; /* MUST be a nice power of two for simplicity */
111 int nstep = 1000;
112 int nburst = 200; /* MUST divide the value of nstep without remainder */
113 int nthread = NTHREAD;
114 int localmemsize = LOCALMEMSIZE;
115
116 float dt0 = 0.0;
117 float dt = 0.001;
118 float eps = 0.0*0.001;
119
120 printf("nbabel opencl\n");
121
122 cl_float4* pos0 = (cl_float4*) clmalloc(CLCONTEXT,NMAX*sizeof(cl_float4),0);
123 cl_float4* pos1 = (cl_float4*) clmalloc(CLCONTEXT,NMAX*sizeof(cl_float4),0);
124 cl_float4* vel = (cl_float4*) clmalloc(CLCONTEXT,NMAX*sizeof(cl_float4),0);
125 cl_float4* acc = (cl_float4*) clmalloc(CLCONTEXT,NMAX*sizeof(cl_float4),0);
126
127
128 nparticle=readbodies(nparticle,pos0,pos1,vel,acc);
129
130 void* h = clopen(CLCONTEXT,"nbody_kern.cl",CLLD_NOW);
131 cl_kernel krn = clsym(CLCONTEXT,h,"nbody_kern",CLLD_NOW);
132
133 clndrange_t ndr = clndrange_init1d(0,nparticle,nthread);
134
135 clarg_set(krn,0, dt0);
136 clarg_set(krn,1,eps);
137 clarg_set_global(krn,2,pos0);
138 clarg_set_global(krn,3,pos1);
139 clarg_set_global(krn,4,vel);
140 clarg_set_global(krn,5,acc);
141 clarg_set_local(krn,6,nthread*sizeof(cl_float4));
142
143 clmsync(CLCONTEXT,0,pos0,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
144 clmsync(CLCONTEXT,0,pos1,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
145 clmsync(CLCONTEXT,0,vel,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
146 clmsync(CLCONTEXT,0,acc,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
147 clfork(CLCONTEXT,0,krn,&ndr,CL_EVENT_NOWAIT);
148
149 clmsync(CLCONTEXT,0,acc,CL_MEM_HOST|CL_EVENT_NOWAIT);
150 clwait(CLCONTEXT,0,CL_KERNEL_EVENT|CL_MEM_EVENT|CL_EVENT_RELEASE);
151
152 initialize(nparticle,dt,pos0,pos1,vel,acc);
153 diagnostics(nparticle,pos0,vel,acc);
154
155 clmsync(CLCONTEXT,0,pos1,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
156 clarg_set(krn,0,dt);
157
158 for(step=0; step<nstep; step+=nburst) {
159 for(burst=0; burst<nburst; burst+=2) {
160 clarg_set_global(krn,2,pos0);
161 clarg_set_global(krn,3,pos1);
162 clfork(CLCONTEXT,0,krn,&ndr,CL_EVENT_NOWAIT);
163
164 clarg_set_global(krn,2,pos1);
165 clarg_set_global(krn,3,pos0);
166 clfork(CLCONTEXT,0,krn,&ndr,CL_EVENT_NOWAIT);
167 }
168
169 clmsync(CLCONTEXT,0,pos0,CL_MEM_HOST|CL_EVENT_NOWAIT);
170 clmsync(CLCONTEXT,0,pos1,CL_MEM_HOST|CL_EVENT_NOWAIT);
171 clmsync(CLCONTEXT,0,vel,CL_MEM_HOST|CL_EVENT_NOWAIT);
172 clmsync(CLCONTEXT,0,acc,CL_MEM_HOST|CL_EVENT_NOWAIT);
173 clwait(CLCONTEXT,0,CL_KERNEL_EVENT|CL_MEM_EVENT|CL_EVENT_RELEASE);
174 diagnostics(nparticle,pos0,vel,acc);
175
176 }
177
178
179 clclose(CLCONTEXT,h);
180
181 clfree(pos0);
182 clfree(pos1);
183 clfree(vel);
184 clfree(acc);
185 }
186
1 /* nbody_kern.cl */
2
3 __kernel void nbody_kern(
4 float dt1, float eps,
5 __global float4* pos0,
6 __global float4* pos1,
7 __global float4* vel,
8 __global float4* acc,
9 __local float4* pblock
10 ) {
11 const float4 dt = (float4)(dt1,dt1,dt1,0.0f);
12
13 int gti = get_global_id(0);
14 int ti = get_local_id(0);
15
16 int n = get_global_size(0);
17 int nt = get_local_size(0);
18 int nb = n/nt;
19 float4 p = pos1[gti];
20
21 float4 a = (float4)(0.0f,0.0f,0.0f,0.0f);
22 for(int jb=0; jb < nb; jb++) { /* Foreach block ... */
23
24 pblock[ti] = pos1[jb*nt+ti]; /* Cache ONE particle position */
25 barrier(CLK_LOCAL_MEM_FENCE); /* Wait for others in the work-group */
26
27 for(int j=0; j<nt; j++) { /* For ALL cached particle positions ... */
28 float4 p2 = pblock[j]; /* Read a cached particle position */
29 float4 d = p2 - p;
30 float r2=d.x*d.x + d.y*d.y + d.z*d.z + eps;
31 if(r2 > 0)
32 {
33 float invr = rsqrt(r2);
34 float mr3 = p2.w*invr*invr*invr;
35 float4 fa = (float4) (mr3,mr3,mr3,0.0f);
36 float4 fp = (float4) (0.0f,0.0f,0.0f,-p2.w*invr);
37 a += fa*d; /* Accumulate acceleration */
38 a += fp;
39 }
40 }
41
42 barrier(CLK_LOCAL_MEM_FENCE); /* Wait for others in work-group */
43 }
44
45 vel[gti] += 0.5f*dt*(a+acc[gti]);
46 pos0[gti] = pos1[gti] + dt*vel[gti] + 0.5f*dt*dt*a;
47 acc[gti] = a;
48 }
49