diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu index 4086ffc645..ffe9da81ae 100644 --- a/lib/gpu/pppm_gpu_kernel.cu +++ b/lib/gpu/pppm_gpu_kernel.cu @@ -168,6 +168,8 @@ __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_, const numtyp delzinv, const int order, const int order2, const numtyp delvolinv) { __local numtyp rho_coeff[MAX_STENCIL*MAX_STENCIL]; + __local numtyp rho1d_0[MAX_STENCIL][BLOCK_1D]; + __local numtyp rho1d_1[MAX_STENCIL][BLOCK_1D]; int nx=THREAD_ID_X; int ny=THREAD_ID_Y; @@ -177,6 +179,7 @@ __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_, } __syncthreads(); + int tid=mul24(BLOCK_SIZE_X,ny)+nx; nx+=mul24(BLOCK_ID_X,BLOCK_SIZE_X); ny+=mul24(BLOCK_ID_Y,BLOCK_SIZE_Y); @@ -202,13 +205,12 @@ __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_, numtyp dy=ny-(p.y-b_lo_y)*delyinv; numtyp dz=nz-(p.z-b_lo_z)*delzinv; - numtyp rho1d[2][MAX_STENCIL]; for (int k=0; k=k; l-=order) { - rho1d[0][k]=rho_coeff[l]+rho1d[0][k]*dx; - rho1d[1][k]=rho_coeff[l]+rho1d[1][k]*dy; + rho1d_0[k][tid]=rho_coeff[l]+rho1d_0[k][tid]*dx; + rho1d_1[k][tid]=rho_coeff[l]+rho1d_1[k][tid]*dy; } } @@ -220,9 +222,9 @@ __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_, numtyp y0=z0*rho1d_2; int my=mz+mul24(ny,npts_x); for (int m=0; m=n; k-=order) rho1d_2=rho_coeff[k]+rho1d_2*dz; - ans[n]+=z0*rho1d_2; + ans[n][tx]+=z0*rho1d_2; } } } @@ -332,7 +334,7 @@ __kernel void make_rho2(__global numtyp4 *x_, __global numtyp *q_, front[tx]=(numtyp)0.0; for (int n=0; n=k; l-=order) { - rho1d[0][k]=rho_coeff[l]+rho1d[0][k]*dx; - rho1d[1][k]=rho_coeff[l]+rho1d[1][k]*dy; + rho1d_0[k][tid]=rho_coeff[l]+rho1d_0[k][tid]*dx; + rho1d_1[k][tid]=rho_coeff[l]+rho1d_1[k][tid]*dy; } } @@ -409,9 +413,9 @@ __kernel void make_rho3(__global numtyp4 *x_, __global numtyp *q_, numtyp y0=z0*rho1d_2; int my=mz+mul24(ny,npts_x); for (int m=0; m=k; l-=order) { - rho1d[0][k]=rho_coeff[l]+rho1d[0][k]*dx; - rho1d[1][k]=rho_coeff[l]+rho1d[1][k]*dy; + rho1d_0[k][tid]=rho_coeff[l]+rho1d_0[k][tid]*dx; + rho1d_1[k][tid]=rho_coeff[l]+rho1d_1[k][tid]*dy; } } @@ -478,9 +484,9 @@ __kernel void field_force(__global numtyp4 *x_, __global numtyp *q_, numtyp z0=qs*rho1d_2; int my=mz+mul24(ny,npts_x); for (int m=0; mdestroy_2d_int_array(part2grid); pppm_gpu_clear(); double total1, total2, total3; -int rank; +int rank,size; MPI_Comm_rank(MPI_COMM_WORLD,&rank); +MPI_Comm_size(MPI_COMM_WORLD,&size); MPI_Allreduce(&time1,&total1,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); MPI_Allreduce(&time2,&total2,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); MPI_Allreduce(&time3,&total3,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); -if (rank==0) -std::cout << "DEBUG_TIMES: " << total1 << " " << total2 << " " << total3 - << std::endl; +if (rank==0 && screen) +fprintf(screen,"DEBUG_TIMES: %f %f %f\n",total1/size,total2/size,total3/size); } /* ----------------------------------------------------------------------