/* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator Original Version: http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov See the README file in the top-level LAMMPS directory. ----------------------------------------------------------------------- USER-CUDA Package and associated modifications: https://sourceforge.net/projects/lammpscuda/ Christian Trott, christian.trott@tu-ilmenau.de Lars Winterfeld, lars.winterfeld@tu-ilmenau.de Theoretical Physics II, University of Technology Ilmenau, Germany See the README file in the USER-CUDA directory. This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ #define OFFSET 4096 __device__ int negativCUDA(float f) { return ((unsigned int)1<<31&(__float_as_int(f)))>>31; } __device__ void reduceBlock(float* data) { int p2=1; while(p2*2= nzlo_fft)&&(blockIdx.x <=nzhi_fft)&& (blockIdx.y >= nylo_fft)&&(blockIdx.y <=nyhi_fft)&& (threadIdx.x>= nxlo_fft)&&(threadIdx.x<=nxhi_fft)) { int n=((int(blockIdx.x)-nzlo_fft)*(nyhi_fft-nylo_fft+1)+int(blockIdx.y)-nylo_fft)*(nxhi_fft-nxlo_fft+1)+int(threadIdx.x)-nxlo_fft; PPPM_FLOAT sqk = my_fkx*my_fkx + my_fky*my_fky + my_fkz*my_fkz; PPPM_FLOAT vterm = (sqk==PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(-2.0) * (PPPM_F(1.0)/sqk + PPPM_F(0.25)/(g_ewald*g_ewald)); vg[6*n+0] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkx*my_fkx; vg[6*n+1] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fky*my_fky; vg[6*n+2] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkz*my_fkz; vg[6*n+3] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fkx*my_fky; vg[6*n+4] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fkx*my_fkz; vg[6*n+5] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fky*my_fkz; } } __device__ PPPM_FLOAT gf_denom(PPPM_FLOAT x, PPPM_FLOAT y, PPPM_FLOAT z) { PPPM_FLOAT sx,sy,sz; sz = sy = sx = PPPM_F(0.0); for (int l = order-1; l >= 0; l--) { sx = gf_b[l] + sx*x; sy = gf_b[l] + sy*y; sz = gf_b[l] + sz*z; } PPPM_FLOAT s = sx*sy*sz; return s*s; } __global__ void setup_greensfn(PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald, int nbx,int nby,int nbz, PPPM_FLOAT xprd,PPPM_FLOAT yprd,PPPM_FLOAT zprd_slab) { PPPM_FLOAT sqk; int nx,ny,nz,kper,lper,mper,k,l,m; PPPM_FLOAT snx,sny,snz,snx2,sny2,snz2; PPPM_FLOAT argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; PPPM_FLOAT sum1,dot1,dot2; PPPM_FLOAT numerator,denominator; PPPM_FLOAT form=PPPM_F(1.0); int n=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; m=blockIdx.x; l=blockIdx.y; k=threadIdx.x; mper = m - nz_pppm*(2*m/nz_pppm); snz = sin(PPPM_F(0.5)*unitkz*mper*zprd_slab/nz_pppm); snz2 = snz*snz; lper = l - ny_pppm*(2*l/ny_pppm); sny = sin(PPPM_F(0.5)*unitky*lper*yprd/ny_pppm); sny2 = sny*sny; kper = k - nx_pppm*(2*k/nx_pppm); snx = sin(PPPM_F(0.5)*unitkx*kper*xprd/nx_pppm); snx2 = snx*snx; sqk = pow(unitkx*kper,PPPM_F(2.0)) + pow(unitky*lper,PPPM_F(2.0)) + pow(unitkz*mper,PPPM_F(2.0)); if (sqk != PPPM_F(0.0)) { numerator = form*PPPM_F(12.5663706)/sqk; denominator = gf_denom(snx2,sny2,snz2); sum1 = PPPM_F(0.0); for (nx = -nbx; nx <= nbx; nx++) { qx = unitkx*(kper+nx_pppm*nx); sx = exp(PPPM_F(-.25)*pow(qx/g_ewald,PPPM_F(2.0))); wx = PPPM_F(1.0); argx = PPPM_F(0.5)*qx*xprd/nx_pppm; if (argx != PPPM_F(0.0)) wx = pow(sin(argx)/argx,order); for (ny = -nby; ny <= nby; ny++) { qy = unitky*(lper+ny_pppm*ny); sy = exp(PPPM_F(-.25)*pow(qy/g_ewald,PPPM_F(2.0))); wy = PPPM_F(1.0); argy = PPPM_F(0.5)*qy*yprd/ny_pppm; if (argy != PPPM_F(0.0)) wy = pow(sin(argy)/argy,order); for (nz = -nbz; nz <= nbz; nz++) { qz = unitkz*(mper+nz_pppm*nz); sz = exp(PPPM_F(-.25)*pow(qz/g_ewald,PPPM_F(2.0))); wz = PPPM_F(1.0); argz = PPPM_F(0.5)*qz*zprd_slab/nz_pppm; if (argz != PPPM_F(0.0)) wz = pow(sin(argz)/argz,order); dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; dot2 = qx*qx+qy*qy+qz*qz; sum1 += (dot1/dot2) * sx*sy*sz * pow(wx*wy*wz,PPPM_F(2.0)); } } } greensfn[n] = numerator*sum1/denominator; } else greensfn[n] = PPPM_F(0.0); } __global__ void poisson_scale_kernel() { int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; FFT_FLOAT scaleinv=FFT_F(1.0)/(gridDim.x*gridDim.y*blockDim.x); work1[2*i] *= scaleinv * greensfn[i]; work1[2*i+1] *= scaleinv * greensfn[i]; } __global__ void poisson_xgrad_kernel() { int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; work2[2*i] = fkx[threadIdx.x] * work1[2*i+1]; work2[2*i+1] = -fkx[threadIdx.x] * work1[2*i]; } __global__ void poisson_ygrad_kernel() { int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; work2[2*i] = fky[blockIdx.y] * work1[2*i+1]; work2[2*i+1] = -fky[blockIdx.y] * work1[2*i]; } __global__ void poisson_zgrad_kernel() { int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; work2[2*i] = fkz[blockIdx.x] * work1[2*i+1]; work2[2*i+1] = -fkz[blockIdx.x] * work1[2*i]; } __global__ void poisson_vdx_brick_kernel(int ilo,int jlo,int klo) { int k=blockIdx.x+klo; k+=nz_pppm*negativCUDA(CUDA_F(1.0)*k)-nz_pppm*negativCUDA(CUDA_F(1.0)*(nz_pppm-k-1)); int j=blockIdx.y+jlo; j+=ny_pppm*negativCUDA(CUDA_F(1.0)*j)-ny_pppm*negativCUDA(CUDA_F(1.0)*(ny_pppm-j-1)); int i=threadIdx.x+ilo; i+=nx_pppm*negativCUDA(CUDA_F(1.0)*i)-nx_pppm*negativCUDA(CUDA_F(1.0)*(nx_pppm-i-1)); vdx_brick[((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1)+threadIdx.x] = work3[2*(((k)*ny_pppm+(j))*nx_pppm+i)]; } __global__ void poisson_vdy_brick_kernel(int ilo,int jlo,int klo) { int k=blockIdx.x+klo; k+=nz_pppm*negativCUDA(CUDA_F(1.0)*k)-nz_pppm*negativCUDA(CUDA_F(1.0)*(nz_pppm-k-1)); int j=blockIdx.y+jlo; j+=ny_pppm*negativCUDA(CUDA_F(1.0)*j)-ny_pppm*negativCUDA(CUDA_F(1.0)*(ny_pppm-j-1)); int i=threadIdx.x+ilo; i+=nx_pppm*negativCUDA(CUDA_F(1.0)*i)-nx_pppm*negativCUDA(CUDA_F(1.0)*(nx_pppm-i-1)); vdy_brick[((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1)+threadIdx.x] = work3[2*(((k)*ny_pppm+(j))*nx_pppm+i)]; } __global__ void poisson_vdz_brick_kernel(int ilo,int jlo,int klo) { int k=blockIdx.x+klo; k+=nz_pppm*negativCUDA(CUDA_F(1.0)*k)-nz_pppm*negativCUDA(CUDA_F(1.0)*(nz_pppm-k-1)); int j=blockIdx.y+jlo; j+=ny_pppm*negativCUDA(CUDA_F(1.0)*j)-ny_pppm*negativCUDA(CUDA_F(1.0)*(ny_pppm-j-1)); int i=threadIdx.x+ilo; i+=nx_pppm*negativCUDA(CUDA_F(1.0)*i)-nx_pppm*negativCUDA(CUDA_F(1.0)*(nx_pppm-i-1)); vdz_brick[((blockIdx.x)*(nyhi_out-nylo_out+1)+blockIdx.y)*(nxhi_out-nxlo_out+1)+threadIdx.x] = work3[2*(((k)*ny_pppm+(j))*nx_pppm+i)]; } __global__ void poisson_energy_kernel(int nxlo_fft,int nylo_fft,int nzlo_fft,int vflag) { ENERGY_FLOAT scaleinv=FFT_F(1.0)/(nx_pppm*ny_pppm*nz_pppm); int i=(blockIdx.x+nzlo_fft)*ny_pppm*nx_pppm+(blockIdx.y+nylo_fft)*nx_pppm+threadIdx.x+nxlo_fft; ENERGY_FLOAT* s_energy=(ENERGY_FLOAT*) sharedmem; ENERGY_FLOAT myenergy= scaleinv*scaleinv * greensfn[i] * (work1[2*i]*work1[2*i] + work1[2*i+1]*work1[2*i+1]); s_energy[threadIdx.x]=myenergy; __syncthreads(); reduceBlock(s_energy); if(threadIdx.x==0) energy[blockIdx.x*ny_pppm+blockIdx.y]=s_energy[0]; if(vflag) { __syncthreads(); for (int j = 0; j < 6; j++) { s_energy[threadIdx.x]= myenergy*vg[((blockIdx.x*gridDim.y+blockIdx.y)*(blockDim.x)+threadIdx.x)*6+j]; __syncthreads(); reduceBlock(s_energy); if(threadIdx.x==0) virial[blockIdx.x*ny_pppm+blockIdx.y+j*nz_pppm*ny_pppm]=s_energy[0]; } } } __global__ void sum_energy_kernel1(int vflag) { ENERGY_FLOAT myenergy=energy[(blockIdx.x*ny_pppm+threadIdx.x)]; ENERGY_FLOAT* s_energy=(ENERGY_FLOAT*) sharedmem; s_energy[threadIdx.x]=myenergy; __syncthreads(); reduceBlock(s_energy); if(threadIdx.x==0) energy[blockIdx.x*ny_pppm]=s_energy[0]; if(vflag) { __syncthreads(); for (int j = 0; j < 6; j++) { myenergy=virial[blockIdx.x*ny_pppm+threadIdx.x+j*ny_pppm*nz_pppm]; s_energy[threadIdx.x]=myenergy; __syncthreads(); reduceBlock(s_energy); if(threadIdx.x==0) virial[blockIdx.x*ny_pppm+j*ny_pppm*nz_pppm]=s_energy[0]; } } } __global__ void sum_energy_kernel2(int vflag) { ENERGY_FLOAT myenergy=energy[threadIdx.x*ny_pppm]; ENERGY_FLOAT* s_energy=(ENERGY_FLOAT*) sharedmem; s_energy[threadIdx.x]=myenergy; __syncthreads(); reduceBlock(s_energy); if(threadIdx.x==0) energy[0]=s_energy[0]; if(vflag) { __syncthreads(); for (int j = 0; j < 6; j++) { myenergy=virial[threadIdx.x*ny_pppm+j*ny_pppm*nz_pppm]; s_energy[threadIdx.x]=myenergy; __syncthreads(); reduceBlock(s_energy); if(threadIdx.x==0) virial[j]=s_energy[0]; } } } __device__ PPPM_FLOAT rho1d(int k,PPPM_FLOAT d,PPPM_FLOAT* srho_coeff) { PPPM_FLOAT rho1d_tmp=PPPM_F(0.0); for (int l = order-1; l >= 0; l--) rho1d_tmp = srho_coeff[l*order+k-(1-order)/2] + rho1d_tmp*d; return rho1d_tmp; } __global__ void particle_map_kernel(int* flag) { int i=blockIdx.x*blockDim.x+threadIdx.x; if(i nxhi_out || ny+nlower < nylo_out || ny+nupper > nyhi_out || nz+nlower < nzlo_out || nz+nupper > nzhi_out) {flag[0]++; debugdata[0]=i; debugdata[1]=_boxlo[0]; debugdata[2]=_boxlo[1]; debugdata[3]=_boxlo[2]; debugdata[4]=nx; debugdata[5]=ny; debugdata[6]=nz; debugdata[7]=_x[i]; debugdata[8]=_x[i+_nmax]; debugdata[9]=_x[i+2*_nmax]; debugdata[10]=nlocal; } } } __global__ void make_rho_kernelA() { int i,l,m,n,nx,ny,nz,mx,my,mz; // clear 3d density array // loop over my charges, add their contribution to nearby grid points // (nx,ny,nz) = global coords of grid pt to "lower left" of charge // (dx,dy,dz) = distance to "lower left" grid pt // (mx,my,mz) = global coords of moving stencil pt i=blockIdx.x*blockDim.x+threadIdx.x; if(i < nlocal) { PPPM_FLOAT dx,dy,dz,x0,y0,z0; nx = part2grid[i]; ny = part2grid[i+nmax]; nz = part2grid[i+2*nmax]; dx = nx+shiftone - (_x[i]-_boxlo[0])*delxinv; dy = ny+shiftone - (_x[i+nmax]-_boxlo[1])*delyinv; dz = nz+shiftone - (_x[i+2*nmax]-_boxlo[2])*delzinv; z0 = delxinv*delyinv*delzinv * _q[i]; for (n = nlower; n <= nupper; n++) { mz = n+nz; y0 = z0*rho1d(n,dz,rho_coeff); for (m = nlower; m <= nupper; m++) { my = m+ny; x0 = y0*rho1d(m,dy,rho_coeff); for (l = nlower; l <= nupper; l++) { mx = l+nx; int mzyx=((mz-nzlo_out)*(nyhi_out-nylo_out+1)+my-nylo_out)*(nxhi_out-nxlo_out+1)+mx-nxlo_out; while(atomicAdd(&density_brick_int[mzyx],1)!=0) atomicAdd(&density_brick_int[mzyx],-1); density_brick[mzyx]+=x0*rho1d(l,dx,rho_coeff); __threadfence(); atomicAdd(&density_brick_int[mzyx],-1); __syncthreads(); } } } } } __global__ void make_rho_kernel(int* flag,int read_threads_at_same_time) { int i,l,m,n,nx,ny,nz,mx,my,mz,a,b; // clear 3d density array // loop over my charges, add their contribution to nearby grid points // (nx,ny,nz) = global coords of grid pt to "lower left" of charge // (dx,dy,dz) = distance to "lower left" grid pt // (mx,my,mz) = global coords of moving stencil pt // int nzxy=blockIdx.x*gridDim.y+blockIdx.y; int nelements=nupper-nlower+1; int* idx=(int*) sharedmem; int* sdensity_brick_int=&idx[blockDim.x]; PPPM_FLOAT* srho_coeff=(PPPM_FLOAT*) &sdensity_brick_int[nelements*blockDim.x]; if(threadIdx.x-1)) { a=sdensity_brick_int[ii*nelements+threadIdx.x]; //if(a*a>1e-100) b=(atomicAdd(&density_brick_int[idx[ii+kk]+threadIdx.x-kk*nelements],a)|a); //else //b=(density_brick_int[idx[ii+kk]+threadIdx.x-kk*nelements]|a); if(((b)&(0x7c000000))&&(not((b)&(0x80000000)))) { flag[1]++; if((b)&(0x60000000)) flag[0]++; } } } __syncthreads(); //*/ } } } } __global__ void scale_rho_kernel() { int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x; density_brick[i]=(1.0/density_intScale)*density_brick_int[i]; } __global__ void fieldforce_kernel(int elements_per_thread,int read_threads_at_same_time,int* flag) //20*x64 0.36 { int i; // loop over my charges, interpolate electric field from nearby grid points // (nx,ny,nz) = global coords of grid pt to "lower left" of charge // (dx,dy,dz) = distance to "lower left" grid pt // (mx,my,mz) = global coords of moving stencil pt // ek = 3 components of E-field on particle i=blockIdx.x*blockDim.x+threadIdx.x; int* idx=(int*) sharedmem; PPPM_FLOAT* tmp_brick=(PPPM_FLOAT*) &idx[blockDim.x]; PPPM_FLOAT* srho_coeff=(PPPM_FLOAT*) &tmp_brick[3*blockDim.x*elements_per_thread]; if(threadIdx.x-1)) { tmp_brick[ii*elements_per_thread+threadIdx.x]=vdx_brick[idx[ii+kk]+threadIdx.x-kk*elements_per_thread]; tmp_brick[(ii+blockDim.x)*elements_per_thread+threadIdx.x]=vdy_brick[idx[ii+kk]+threadIdx.x-kk*elements_per_thread]; tmp_brick[(ii+2*blockDim.x)*elements_per_thread+threadIdx.x]=vdz_brick[idx[ii+kk]+threadIdx.x-kk*elements_per_thread]; } } __syncthreads(); if(i