From f0263bf14ccbcbaecba8e8d37d97e92ab09723be Mon Sep 17 00:00:00 2001
From: "W. Michael Brown"
Date: Fri, 25 Mar 2011 11:49:09 -0400
Subject: [PATCH] pppm/gpu now using all gpu kernels.

---
 lib/gpu/Makefile.fermi        |   2 +-
 lib/gpu/Makefile.lens         |   2 +-
 lib/gpu/Makefile.lincoln      |   2 +-
 lib/gpu/Makefile.linux        |   2 +-
 lib/gpu/Makefile.linux_opencl |   2 +-
 lib/gpu/Makefile.longhorn     |   2 +-
 lib/gpu/Makefile.mac          |   2 +-
 lib/gpu/Makefile.mac_opencl   |   2 +-
 lib/gpu/pair_gpu_ans.cpp      |   4 +-
 lib/gpu/pair_gpu_device.cpp   |  15 +++-
 lib/gpu/pair_gpu_device.h     |  17 ++--
 lib/gpu/pppm_gpu_kernel.cu    |  18 ++--
 lib/gpu/pppm_gpu_memory.cpp   |  26 +++---
 lib/gpu/pppm_gpu_memory.h     |   5 +-
 lib/gpu/pppm_l_gpu.cpp        |  10 +--
 src/GPU/pppm_gpu.cpp          | 159 ++--------------------------------
 src/GPU/pppm_gpu.h            |  10 +--
 17 files changed, 71 insertions(+), 209 deletions(-)

diff --git a/lib/gpu/Makefile.fermi b/lib/gpu/Makefile.fermi
index d830c8924c..98c823cf40 100644
--- a/lib/gpu/Makefile.fermi
+++ b/lib/gpu/Makefile.fermi
@@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64 -Xlinker -rpath -Xlinker $(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON -I$(CUDA_HOME)/include
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include
 CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON
 
 BIN_DIR = ./
diff --git a/lib/gpu/Makefile.lens b/lib/gpu/Makefile.lens
index 3b6301277f..33d2b88801 100644
--- a/lib/gpu/Makefile.lens
+++ b/lib/gpu/Makefile.lens
@@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON -openmp
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -openmp
 CUDR_OPTS = -O2 -xSSE2 -ip -use-intel-optimized-headers -fno-alias
 
 BIN_DIR = ./
diff --git a/lib/gpu/Makefile.lincoln b/lib/gpu/Makefile.lincoln
index 97a7901811..bbaca61ef1 100644
--- a/lib/gpu/Makefile.lincoln
+++ b/lib/gpu/Makefile.lincoln
@@ -24,7 +24,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT
 CUDR_OPTS = -O3 -DMPI_GERYON -ffast-math -funroll-loops
 
 BIN_DIR = ./
diff --git a/lib/gpu/Makefile.linux b/lib/gpu/Makefile.linux
index c0001a54ab..d69a00a817 100644
--- a/lib/gpu/Makefile.linux
+++ b/lib/gpu/Makefile.linux
@@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
 
 BIN_DIR = ./
diff --git a/lib/gpu/Makefile.linux_opencl b/lib/gpu/Makefile.linux_opencl
index 69522298c5..3eac71cc88 100644
--- a/lib/gpu/Makefile.linux_opencl
+++ b/lib/gpu/Makefile.linux_opencl
@@ -17,7 +17,7 @@
 # Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */
 
-OCL_CPP = mpic++ -I./geryon/opencl -O3 -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
+OCL_CPP = mpic++ -I./geryon/opencl -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 OCL_LINK = -lOpenCL
 OCL_PREC = -D_SINGLE_SINGLE
 
diff --git a/lib/gpu/Makefile.longhorn b/lib/gpu/Makefile.longhorn
index ba921f0f68..cc41174332 100644
--- a/lib/gpu/Makefile.longhorn
+++ b/lib/gpu/Makefile.longhorn
@@ -23,7 +23,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(TACC_CUDA_LIB) -Wl,-rpath,$(TACC_CUDA_LIB)
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpicxx -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
+CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
 
 BIN_DIR = ./
diff --git a/lib/gpu/Makefile.mac b/lib/gpu/Makefile.mac
index f061a1a68a..5276ac10b2 100644
--- a/lib/gpu/Makefile.mac
+++ b/lib/gpu/Makefile.mac
@@ -24,7 +24,7 @@ CUDA_ARCH = -arch=sm_11
 CUDA_PRECISION = -D_SINGLE_SINGLE
 CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib
-CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -m32
+CUDA_OPTS = -DUNIX -DUCL_NO_EXIT -O3 -Xptxas -v --use_fast_math -m32
 
 CUDR_CPP = mpic++
 CUDR_OPTS = -O2 -m32 -g
diff --git a/lib/gpu/Makefile.mac_opencl b/lib/gpu/Makefile.mac_opencl
index 53d6d466e2..50ed67e9c3 100644
--- a/lib/gpu/Makefile.mac_opencl
+++ b/lib/gpu/Makefile.mac_opencl
@@ -17,7 +17,7 @@
 # Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */
 
-OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON
+OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON -DUCL_NO_EXIT
 OCL_LINK = -framework OpenCL
 OCL_PREC = -D_SINGLE_SINGLE
 
diff --git a/lib/gpu/pair_gpu_ans.cpp b/lib/gpu/pair_gpu_ans.cpp
index 155b1f6620..69980dd8ab 100644
--- a/lib/gpu/pair_gpu_ans.cpp
+++ b/lib/gpu/pair_gpu_ans.cpp
@@ -269,10 +269,8 @@ double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
 template
 double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
                                   double *virial, double &ecoul) {
-  if (_eflag==false && _vflag==false) {
-    ecoul=0.0;
+  if (_eflag==false && _vflag==false)
     return 0.0;
-  }
   if (_charge==false)
     return energy_virial(eatom,vatom,virial);
 
diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp
index 8cc7de31cb..87775291f5 100644
--- a/lib/gpu/pair_gpu_device.cpp
+++ b/lib/gpu/pair_gpu_device.cpp
@@ -267,8 +267,9 @@ void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in,
                                          UCL_Timer &time_rho,
                                          UCL_Timer &time_interp,
                                          PairGPUAns &ans,
-                                         const double max_bytes, FILE *screen) {
-  double single[5], times[5];
+                                         const double max_bytes,
+                                         const double cpu_time, FILE *screen) {
+  double single[7], times[7];
 
 //  single[0]=atom.transfer_time()+ans.transfer_time()+time_in.total_seconds()+
 //            time_out.total_seconds();
@@ -279,8 +280,10 @@ void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in,
   single[2]=time_map.total_seconds();
   single[3]=time_rho.total_seconds();
   single[4]=time_interp.total_seconds();
+  single[5]=ans.transfer_time()+ans.cast_time();
+  single[6]=cpu_time;
 
-  MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
+  MPI_Reduce(single,times,7,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
 
   double my_max_bytes=max_bytes+atom.max_gpu_bytes();
   double mpi_max_bytes;
@@ -303,7 +306,11 @@ void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in,
     fprintf(screen,"Force interp: %.4f s.\n",times[4]/_replica_size);
     fprintf(screen,"Total rho: %.4f s.\n",(times[0]+times[2]+times[3])/_replica_size);
     fprintf(screen,"Total interp: %.4f s.\n",(times[1]+times[4])/_replica_size);
-    fprintf(screen,"Total: %.4f s.\n",(times[0]+times[1]+times[2]+times[3]+times[4])/_replica_size);
+    fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size);
+    fprintf(screen,"Total: %.4f s.\n",
+            (times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/
+            _replica_size);
+    fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size);
   }
   fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h
index 664ce6c3be..9dbe2f50f6 100644
--- a/lib/gpu/pair_gpu_device.h
+++ b/lib/gpu/pair_gpu_device.h
@@ -77,7 +77,8 @@ class PairGPUDevice {
                             UCL_Timer & time_map, UCL_Timer & time_rho,
                             UCL_Timer &time_interp,
                             PairGPUAns &ans,
-                            const double max_bytes, FILE *screen);
+                            const double max_bytes, const double cpu_time,
+                            FILE *screen);
 
   /// Clear all memory on host and device associated with atom and nbor data
   void clear();
@@ -91,7 +92,7 @@ class PairGPUDevice {
 
   /// Add "answers" (force,energies,etc.) into LAMMPS structures
   inline double fix_gpu(double **f, double **tor, double *eatom,
-                       double **vatom, double *virial, double &ecoul) {
+                        double **vatom, double *virial, double &ecoul) {
     atom.data_unavail();
     if (ans_queue.empty()==false) {
       stop_host_timer();
@@ -106,10 +107,16 @@ class PairGPUDevice {
   }
 
   /// Start timer on host
-  inline void start_host_timer() { _cpu_full=MPI_Wtime(); }
+  inline void start_host_timer()
+    { _cpu_full=MPI_Wtime(); _host_timer_started=true; }
 
   /// Stop timer on host
-  inline void stop_host_timer() { _cpu_full=MPI_Wtime()-_cpu_full; }
+  inline void stop_host_timer() {
+    if (_host_timer_started) {
+      _cpu_full=MPI_Wtime()-_cpu_full;
+      _host_timer_started=false;
+    }
+  }
 
   /// Return host time
   inline double host_time() { return _cpu_full; }
@@ -170,7 +177,7 @@ class PairGPUDevice {
  private:
   std::queue *> ans_queue;
   int _init_count;
-  bool _device_init;
+  bool _device_init, _host_timer_started;
   MPI_Comm _comm_world, _comm_replica, _comm_gpu;
   int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
       _replica_size;
diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu
index eeb596ca9c..2df1291936 100644
--- a/lib/gpu/pppm_gpu_kernel.cu
+++ b/lib/gpu/pppm_gpu_kernel.cu
@@ -88,9 +88,9 @@ __inline float fetch_q(const int& i, const float *q)
 
 #endif
 
-// Maximum order for stencil
-#define MAX_STENCIL 8
-// Thread block size for all kernels (Must be >=MAX_STENCIL^2)
+// Maximum order for spline
+#define MAX_SPLINE 8
+// Thread block size for all kernels (Must be >=MAX_SPLINE^2)
 #define BLOCK_1D 64
 // Number of threads per pencil for charge spread
 //#define PENCIL_SIZE MEM_THREADS
@@ -158,9 +158,9 @@ __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_,
                        const int npts_y, const int npts_z, const int nlocal_x,
                        const int nlocal_y, const int nlocal_z,
                        const int order_m_1, const int order, const int order2) {
-  __local numtyp rho_coeff[MAX_STENCIL*MAX_STENCIL];
-  __local numtyp front[BLOCK_PENCILS][PENCIL_SIZE+MAX_STENCIL];
-  __local numtyp ans[MAX_STENCIL][BLOCK_1D];
+  __local numtyp rho_coeff[MAX_SPLINE*MAX_SPLINE];
+  __local numtyp front[BLOCK_PENCILS][PENCIL_SIZE+MAX_SPLINE];
+  __local numtyp ans[MAX_SPLINE][BLOCK_1D];
 
   int tid=THREAD_ID_X;
   if (tid
diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp
--- a/lib/gpu/pppm_gpu_memory.cpp
+++ b/lib/gpu/pppm_gpu_memory.cpp
-// Maximum order for stencil
-#define MAX_STENCIL 8
-// Thread block size for all kernels (Must be >=MAX_STENCIL^2)
+// Maximum order for spline
+#define MAX_SPLINE 8
+// Thread block size for all kernels (Must be >=MAX_SPLINE^2)
 #define BLOCK_1D 64
 // Number of threads per pencil for charge spread
 //#define PENCIL_SIZE MEM_THREADS
@@ -48,7 +48,7 @@ PPPMGPUMemoryT::PPPMGPUMemory() : _allocated(false), _compiled(false),
 
 template
 PPPMGPUMemoryT::~PPPMGPUMemory() {
-  clear();
+  clear(0.0);
   delete ans;
 }
 
@@ -64,8 +64,6 @@ numtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
                               const int nxhi_out, const int nyhi_out,
                               const int nzhi_out, double **rho_coeff,
                               numtyp **vd_brick, bool &success) {
-  clear();
-
   _max_bytes=10;
   screen=_screen;
 
@@ -78,7 +76,7 @@ numtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
 
   _block_size=BLOCK_1D;
   assert(BLOCK_PENCILS*PENCIL_SIZE==BLOCK_1D);
-  assert(MAX_STENCIL*MAX_STENCIL<=BLOCK_1D);
+  assert(MAX_SPLINE*MAX_SPLINE<=BLOCK_1D);
   if (static_cast(_block_size)>ucl_device->group_size())
     _block_size=ucl_device->group_size();
   compile_kernels(*ucl_device);
@@ -161,13 +159,11 @@ numtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
   d_error_flag.zero();
   _max_bytes+=1;
 
-//success=success && (force_temp.alloc(nall*4*2,*ucl_device)==UCL_SUCCESS);
-
   return h_brick.begin();
 }
 
 template
-void PPPMGPUMemoryT::clear() {
+void PPPMGPUMemoryT::clear(const double cpu_time) {
   if (!_allocated)
     return;
   _allocated=false;
@@ -182,7 +178,7 @@ void PPPMGPUMemoryT::clear() {
 
   acc_timers();
   device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp,
-                              *ans,_max_bytes+_max_an_bytes,screen);
+                              *ans,_max_bytes+_max_an_bytes,cpu_time,screen);
 
   if (_compiled) {
     k_particle_map.clear();
@@ -210,6 +206,8 @@ int PPPMGPUMemoryT::spread(const int ago, const int nlocal, const int nall,
                            double *host_q, double *boxlo,
                            const double delxinv, const double delyinv,
                            const double delzinv) {
+  device->stop_host_timer();
+
   acc_timers();
   if (nlocal==0) {
     zero_timers();
@@ -243,7 +241,7 @@ int PPPMGPUMemoryT::spread(const int ago, const int nlocal, const int nall,
   int _max_atoms=10;
   int ainum=this->ans->inum();
 
-  // Boxlo adjusted to be upper left brick and shift for even stencil order
+  // Boxlo adjusted to be upper left brick and shift for even spline order
   double shift=0.0;
   if (_order % 2)
     shift=0.5;
@@ -267,7 +265,7 @@ int PPPMGPUMemoryT::spread(const int ago, const int nlocal, const int nall,
   time_map.stop();
   time_rho.start();
 
-  BX=block_size();
   GX=static_cast(ceil(static_cast(_npts_y*_npts_z)/BLOCK_PENCILS));
   k_make_rho.set_size(GX,BX);
@@ -320,7 +317,8 @@ void PPPMGPUMemoryT::interp(const numtyp qqrd2e_scale) {
                 &_order, &_order2, &qqrd2e_scale, &ans->dev_ans.begin());
   time_interp.stop();
 
-//ucl_copy(force_temp,ans->dev_ans,ans->dev_ans.numel(),false);
+  ans->copy_answers(false,false,false,false);
+  device->add_ans_object(ans);
 }
 
diff --git a/lib/gpu/pppm_gpu_memory.h b/lib/gpu/pppm_gpu_memory.h
index a68a896ae7..6a9b86eef1 100644
--- a/lib/gpu/pppm_gpu_memory.h
+++ b/lib/gpu/pppm_gpu_memory.h
@@ -56,7 +56,7 @@ class PPPMGPUMemory {
 
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
-  void clear();
+  void clear(const double cpu_time);
 
   /// Returns memory usage on device per atom
   int bytes_per_atom() const;
@@ -133,7 +133,7 @@ class PPPMGPUMemory {
   // Number of local grid points in brick
   int _nlocal_x, _nlocal_y, _nlocal_z, _nlocal_yx, _atom_stride;
 
-  // -------------------------- STENCIL DATA -------------------------
+  // -------------------------- SPLINE DATA -------------------------
   UCL_D_Vec d_rho_coeff;
   int _order, _nlower, _nupper, _order_m_1, _order2;
   int _nxlo_out, _nylo_out, _nzlo_out, _nxhi_out, _nyhi_out, _nzhi_out;
@@ -141,7 +141,6 @@ class PPPMGPUMemory {
 
   // ------------------------ FORCE/ENERGY DATA -----------------------
   PairGPUAns *ans;
-//UCL_H_Vec force_temp;
 
   // ------------------------- DEVICE KERNELS -------------------------
   UCL_Program *pppm_program;
diff --git a/lib/gpu/pppm_l_gpu.cpp b/lib/gpu/pppm_l_gpu.cpp
index 550348c815..fc3a7ab4ca 100644
--- a/lib/gpu/pppm_l_gpu.cpp
+++ b/lib/gpu/pppm_l_gpu.cpp
@@ -34,7 +34,7 @@ PRECISION * pppm_gpu_init(const int nlocal, const int nall, FILE *screen,
                           const int nxhi_out, const int nyhi_out,
                           const int nzhi_out, double **rho_coeff,
                           PRECISION **vd_brick, bool &success) {
-  PPPMF.clear();
+  PPPMF.clear(0.0);
   int first_gpu=PPPMF.device->first_device();
   int last_gpu=PPPMF.device->last_device();
   int world_me=PPPMF.device->world_me();
@@ -91,8 +91,8 @@ PRECISION * pppm_gpu_init(const int nlocal, const int nall, FILE *screen,
   return host_brick;
 }
 
-void pppm_gpu_clear() {
-  PPPMF.clear();
+void pppm_gpu_clear(const double cpu_time) {
+  PPPMF.clear(cpu_time);
 }
 
 int pppm_gpu_spread(const int ago, const int nlocal, const int nall,
@@ -111,7 +111,3 @@ double pppm_gpu_bytes() {
   return PPPMF.host_memory_usage();
 }
 
-/*
-PRECISION * pppm_gpu_force() { return PPPMF.force_temp.begin(); }
-*/
-
diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp
index 0db889c64e..a5835777e0 100644
--- a/src/GPU/pppm_gpu.cpp
+++ b/src/GPU/pppm_gpu.cpp
@@ -43,7 +43,7 @@ numtyp* pppm_gpu_init(const int nlocal, const int nall, FILE *screen,
                       const int nyhi_out, const int nzhi_out,
                       double **rho_coeff, numtyp **_vd_brick,
                       bool &success);
-void pppm_gpu_clear();
+void pppm_gpu_clear(const double poisson_time);
 int pppm_gpu_spread(const int ago, const int nlocal, const int nall,
                     double **host_x, int *host_type, bool &success,
                     double *host_q, double *boxlo, const double delxinv,
@@ -98,7 +98,6 @@ PPPMGPU::PPPMGPU(LAMMPS *lmp, int narg, char **arg) : KSpace(lmp, narg, arg)
 
   remap = NULL;
 
   nmax = 0;
-  part2grid = NULL;
 }
 
 /* ----------------------------------------------------------------------
@@ -109,17 +108,7 @@ PPPMGPU::~PPPMGPU()
 {
   delete [] factors;
   deallocate();
-  memory->destroy_2d_int_array(part2grid);
-  pppm_gpu_clear();
-double total1, total2, total3;
-int rank,size;
-MPI_Comm_rank(MPI_COMM_WORLD,&rank);
-MPI_Comm_size(MPI_COMM_WORLD,&size);
-MPI_Allreduce(&time1,&total1,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
-MPI_Allreduce(&time2,&total2,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
-MPI_Allreduce(&time3,&total3,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
-if (rank==0 && screen)
-fprintf(screen,"DEBUG_TIMES: %f %f %f\n",total1/size,total2/size,total3/size);
+  pppm_gpu_clear(poisson_time);
 }
 
 /* ----------------------------------------------------------------------
@@ -510,6 +499,7 @@ void PPPMGPU::init()
 
   if (order>8)
     error->all("Cannot use order greater than 8 with pppm/gpu.");
+  pppm_gpu_clear(poisson_time);
 
   bool success;
   numtyp *data, *h_brick;
@@ -526,7 +516,8 @@ void PPPMGPU::init()
   if (!success)
     error->one("Insufficient memory on accelerator (or no fix gpu).\n");
 
-time1=0; time2=0; time3=0;
+
+  poisson_time=0;
 }
 
 /* ----------------------------------------------------------------------
@@ -707,21 +698,11 @@ void PPPMGPU::compute(int eflag, int vflag)
     domain->x2lamda(atom->nlocal);
   }
 
-  // extend size of per-atom arrays if necessary
-
-  if (atom->nlocal > nmax) {
-    memory->destroy_2d_int_array(part2grid);
-    nmax = atom->nmax;
-    part2grid = memory->create_2d_int_array(nmax,3,"pppm:part2grid");
-  }
   energy = 0.0;
   if (vflag) for (i = 0; i < 6; i++) virial[i] = 0.0;
 
-  // find grid points for all my particles
-  // map my particle charge onto my local 3d density grid
+  double t3=MPI_Wtime();
 
-  particle_map();
-
-double t3=MPI_Wtime();
   // all procs communicate density values from their ghost cells
   // to fully sum contribution in their 3d bricks
   // remap from 3d decomposition to FFT decomposition
@@ -738,16 +719,13 @@ double t3=MPI_Wtime();
   // surrounding their 3d bricks
 
   fillbrick();
 
-time3+=MPI_Wtime()-t3;
 
-  numtyp qqrd2e_scale=qqrd2e*scale;
-  pppm_gpu_interp(qqrd2e_scale);
+  poisson_time+=MPI_Wtime()-t3;
 
   // calculate the force on my particles
 
-double t2=MPI_Wtime();
-  fieldforce();
-time2+=MPI_Wtime()-t2;
+  numtyp qqrd2e_scale=qqrd2e*scale;
+  pppm_gpu_interp(qqrd2e_scale);
 
   // sum energy across procs and add in volume-dependent term
@@ -1504,46 +1482,6 @@ void PPPMGPU::fillbrick()
   }
 }
 
-/* ----------------------------------------------------------------------
-   find center grid pt for each of my particles
-   check that full stencil for the particle will fit in my 3d brick
-   store central grid pt indices in part2grid array
-------------------------------------------------------------------------- */
-
-void PPPMGPU::particle_map()
-{
-  int nx,ny,nz;
-
-  double **x = atom->x;
-  int nlocal = atom->nlocal;
-
-  int flag = 0;
-  for (int i = 0; i < nlocal; i++) {
-
-    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-    // current particle coord can be outside global and local box
-    // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
-
-    nx = static_cast ((x[i][0]-boxlo[0])*delxinv+shift) - OFFSET;
-    ny = static_cast ((x[i][1]-boxlo[1])*delyinv+shift) - OFFSET;
-    nz = static_cast ((x[i][2]-boxlo[2])*delzinv+shift) - OFFSET;
-
-    part2grid[i][0] = nx;
-    part2grid[i][1] = ny;
-    part2grid[i][2] = nz;
-
-    // check that entire stencil around nx,ny,nz will fit in my 3d brick
-
-    if (nx+nlower < nxlo_out || nx+nupper > nxhi_out ||
-        ny+nlower < nylo_out || ny+nupper > nyhi_out ||
-        nz+nlower < nzlo_out || nz+nupper > nzhi_out) flag++;
-  }
-
-  int flag_all;
-  MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
-  if (flag_all) error->all("Out of range atoms - cannot compute PPPMGPU");
-}
-
 /* ----------------------------------------------------------------------
    FFT-based Poisson solver
 ------------------------------------------------------------------------- */
@@ -1665,85 +1603,6 @@ void PPPMGPU::poisson(int eflag, int vflag)
   }
 }
 
-/* ----------------------------------------------------------------------
-   interpolate from grid to get electric field & force on my particles
-------------------------------------------------------------------------- */
-
-void PPPMGPU::fieldforce()
-{
-  int i,l,m,n,nx,ny,nz,mx,my,mz;
-  double dx,dy,dz,x0,y0,z0;
-  double ek[3];
-
-  // loop over my charges, interpolate electric field from nearby grid points
-  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-  // (dx,dy,dz) = distance to "lower left" grid pt
-  // (mx,my,mz) = global coords of moving stencil pt
-  // ek = 3 components of E-field on particle
-
-  double *q = atom->q;
-  double **x = atom->x;
-  double **f = atom->f;
-  int nlocal = atom->nlocal;
-
-
-
-//numtyp *gpu_force=pppm_gpu_force();
-//double max_error=0;
-
-
-  for (i = 0; i < nlocal; i++) {
-    nx = part2grid[i][0];
-    ny = part2grid[i][1];
-    nz = part2grid[i][2];
-    dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
-    dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
-    dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
-
-    compute_rho1d(dx,dy,dz);
-
-    ek[0] = ek[1] = ek[2] = 0.0;
-    for (n = nlower; n <= nupper; n++) {
-      mz = n+nz;
-      z0 = rho1d[2][n];
-      for (m = nlower; m <= nupper; m++) {
-        my = m+ny;
-        y0 = z0*rho1d[1][m];
-        for (l = nlower; l <= nupper; l++) {
-          mx = l+nx;
-          x0 = y0*rho1d[0][l];
-          ek[0] -= x0*vd_brick[mz][my][mx*4];;
-          ek[1] -= x0*vd_brick[mz][my][mx*4+1];;
-          ek[2] -= x0*vd_brick[mz][my][mx*4+2];;
-        }
-      }
-    }
-
-//for (int jj=0; jj<3; jj++) {
-// double f0= qqrd2e*scale * q[i]*ek[jj];
-// double error=0.0;
-// if (f0>1e-8)
-// error = fabs(((gpu_force[i*4+jj])-(f0))/(f0));
-// if (error>0.05)
-//// std::cout << "* ";
-// std::cout << i << " : CPU GPU: " << error << " " << f0 << " "
-//           << gpu_force[i*4+jj] << std::endl;
-// if (error>max_error)
-// max_error=error;
-//}
-
-    // convert E-field to force
-
-    f[i][0] += qqrd2e*scale * q[i]*ek[0];
-    f[i][1] += qqrd2e*scale * q[i]*ek[1];
-    f[i][2] += qqrd2e*scale * q[i]*ek[2];
-  }
-
-//std::cout << "Maximum relative error: " << max_error*100.0 << "%\n";
-
-
-}
-
 /* ----------------------------------------------------------------------
    map nprocs to NX by NY grid as PX by PY procs - return optimal px,py
 ------------------------------------------------------------------------- */
diff --git a/src/GPU/pppm_gpu.h b/src/GPU/pppm_gpu.h
index 34bb735512..c413712e28 100644
--- a/src/GPU/pppm_gpu.h
+++ b/src/GPU/pppm_gpu.h
@@ -72,7 +72,6 @@ class PPPMGPU : public KSpace {
 
   class FFT3d *fft1,*fft2;
   class Remap *remap;
-  int **part2grid;             // storage for particle -> grid mapping
   int nmax;
 
   int triclinic;               // domain settings, orthog or triclinic
@@ -90,19 +89,18 @@ class PPPMGPU : public KSpace {
   double diffpr(double, double, double, double, double **);
   void compute_gf_denom();
   double gf_denom(double, double, double);
-  virtual void particle_map();
   void brick2fft();
   void fillbrick();
   void poisson(int, int);
-  virtual void fieldforce();
   void procs2grid2d(int,int,int,int *, int*);
   void compute_rho1d(double, double, double);
   void compute_rho_coeff();
   void slabcorr(int);
-double time1,time2,time3;
-  numtyp ***create_3d_offset(int, int, int, int, int, int, const char *,
-                             numtyp *, int);
+  double poisson_time;
+
+  numtyp ***create_3d_offset(int, int, int, int, int, int, const char *,
+                             numtyp *, int);
   void destroy_3d_offset(numtyp ***, int, int);
 };
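
Not part of the patch above: a minimal, self-contained sketch of the guarded host-timer pattern that pair_gpu_device.h adopts in this commit. The HostTimer class below is hypothetical (it is not LAMMPS code); it only assumes MPI_Wtime() and shows why the added _host_timer_started flag lets the k-space path call stop_host_timer() safely even when no start is pending.

// Standalone illustration (hypothetical class, not PairGPUDevice itself).
// A stop() without a pending start() is ignored, mirroring the
// _host_timer_started guard added in pair_gpu_device.h.
#include <mpi.h>

class HostTimer {
 public:
  HostTimer() : _t0(0.0), _elapsed(0.0), _started(false) {}
  void start() { _t0 = MPI_Wtime(); _started = true; }
  void stop() {
    if (_started) {                 // extra stop() calls do nothing
      _elapsed = MPI_Wtime() - _t0; // record elapsed wall time
      _started = false;
    }
  }
  double seconds() const { return _elapsed; }
 private:
  double _t0, _elapsed;
  bool _started;
};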