/* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator Original Version: http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov See the README file in the top-level LAMMPS directory. ----------------------------------------------------------------------- USER-CUDA Package and associated modifications: https://sourceforge.net/projects/lammpscuda/ Christian Trott, christian.trott@tu-ilmenau.de Lars Winterfeld, lars.winterfeld@tu-ilmenau.de Theoretical Physics II, University of Technology Ilmenau, Germany See the README file in the USER-CUDA directory. This software is distributed under the GNU General Public License. ------------------------------------------------------------------------- */ // load some variables from shared cuda data into device's constant memory: __device__ __constant__ X_FLOAT rez_bin_size[3]; __device__ __constant__ unsigned* bin_error_count; __device__ __constant__ int cuda_dummy_type; __device__ __constant__ unsigned binned_size_all; __device__ __constant__ X_FLOAT outside[3]; __global__ void PreBinning_Kernel() { const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y; if(bin < gridDim.x * gridDim.y) // TODO: suspected always to be true { _binned_type[blockDim.x * bin + threadIdx.x] = cuda_dummy_type; const int i = 3*blockDim.x * bin + threadIdx.x; X_FLOAT* binned_x = _binned_x + i; *binned_x = _subhi[0] + outside[0] * (1+i); binned_x += blockDim.x; *binned_x = _subhi[1] + outside[1] * (1+i); binned_x += blockDim.x; *binned_x = _subhi[2] + outside[2] * (1+i); _binned_tag[i]=-1; } } __global__ void Binning_Kernel(X_FLOAT* x, X_FLOAT* binned_x, int q_flag, int offset, int rmass_flag) { const unsigned i = blockDim.x * blockIdx.x + threadIdx.x+offset; int binatoms=_natoms; if(offset==0) binatoms=_nlocal ; if(i < binatoms) { // copy atom position from global device memory to local register // in this 3 steps to get as much coalesced access as possible X_FLOAT my_xX, my_xY, my_xZ; x += i; my_xX = *x; x += _nmax; my_xY = *x; x += _nmax; my_xZ = *x; //my_xX=x[i]; //my_xY=x[i+_nmax]; //my_xZ=x[i+2*_nmax]; // calculate flat bin index int bx=__float2int_rd(rez_bin_size[0] * (my_xX - _sublo[0]))+2; int by=__float2int_rd(rez_bin_size[1] * (my_xY - _sublo[1]))+2; int bz=__float2int_rd(rez_bin_size[2] * (my_xZ - _sublo[2]))+2; bx-=bx*negativCUDA(1.0f*bx); bx-=(bx-_bin_dim.x+1)*negativCUDA(1.0f*_bin_dim.x-1.0f-1.0f*bx); by-=by*negativCUDA(1.0f*by); by-=(by-_bin_dim.y+1)*negativCUDA(1.0f*_bin_dim.y-1.0f-1.0f*by); bz-=bz*negativCUDA(1.0f*bz); bz-=(bz-_bin_dim.z+1)*negativCUDA(1.0f*_bin_dim.z-1.0f-1.0f*bz); const unsigned j = _bin_dim.z * ( _bin_dim.y *bx+by)+bz; // add new atom to bin, get bin-array position const unsigned k = atomicAdd(& _bin_count_all[j], 1); if(offset==0) atomicAdd(& _bin_count_local[j], 1); if(k < _bin_nmax) { // copy register values back to global device memory unsigned pos = 3*_bin_nmax * j + k; _binpos[i]=pos; binned_x += pos; *binned_x = my_xX; binned_x += _bin_nmax; *binned_x = my_xY; binned_x += _bin_nmax; *binned_x = my_xZ; // also copy velocity and force accordingly binned_x = _binned_v + pos; x = _v + i; *binned_x = *x; binned_x += _bin_nmax; x += _nmax; *binned_x = *x; binned_x += _bin_nmax; x += _nmax; *binned_x = *x; binned_x = _binned_f + pos; x = _f + i; *binned_x = *x; binned_x += _bin_nmax; x += _nmax; *binned_x = *x; binned_x += _bin_nmax; x += _nmax; *binned_x = *x; pos = _bin_nmax * j + k; _binned_type [pos] = _type[i]; _binned_tag [pos] = _tag[i]; if(rmass_flag) _binned_rmass[pos] = _rmass[i]; if(q_flag) _binned_q [pos] = _q[i]; } else { // normally, this should not happen: int errorn=atomicAdd(bin_error_count, 1); MYEMUDBG( printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", i, j); ) } } } __global__ void ReverseBinning_Kernel(X_FLOAT* x, X_FLOAT* binned_x,int q_flag) { const unsigned i = blockDim.x * blockIdx.x + threadIdx.x; if(i < _nlocal) { unsigned bin_pos3 = _binpos[i]; unsigned bin_pos=bin_pos3/(3*_bin_nmax); bin_pos*=_bin_nmax; bin_pos+=bin_pos3-bin_pos*3; binned_x = _binned_x + bin_pos3; x = x + i; *x = *binned_x; binned_x += _bin_nmax; x += _nmax; *x = *binned_x; binned_x += _bin_nmax; x += _nmax; *x = *binned_x; binned_x = _binned_v + bin_pos3; x = _v + i; *x = *binned_x; binned_x += _bin_nmax; x += _nmax; *x = *binned_x; binned_x += _bin_nmax; x += _nmax; *x = *binned_x; binned_x = _binned_f + bin_pos3; x = _f + i; *x = *binned_x; binned_x += _bin_nmax; x += _nmax; *x = *binned_x; binned_x += _bin_nmax; x += _nmax; *x = *binned_x; _type[i] = _binned_type[bin_pos]; _tag[i] = _binned_tag[bin_pos]; if(q_flag) _q[i] = _binned_q[bin_pos]; } }