diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index e93805df3b..da07cec1b7 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -29,6 +29,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \ $(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \ $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \ $(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \ + $(OBJ_DIR)/lal_base_dpd.o \ $(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \ $(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \ $(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \ @@ -65,7 +66,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \ $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \ $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \ $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \ - $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o + $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o \ + $(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \ $(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \ @@ -111,7 +113,8 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \ $(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \ $(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \ $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h \ - $(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs_cubin.h + $(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs_cubin.h \ + $(OBJ_DIR)/dpd.cubin $(OBJ_DIR)/dpd_cubin.h all: $(OBJ_DIR) $(GPU_LIB) $(EXECS) @@ -190,6 +193,9 @@ $(OBJ_DIR)/lal_base_dipole.o: $(ALL_H) lal_base_dipole.h lal_base_dipole.cpp $(OBJ_DIR)/lal_base_three.o: $(ALL_H) lal_base_three.h lal_base_three.cpp $(CUDR) -o $@ -c lal_base_three.cpp +$(OBJ_DIR)/lal_base_dpd.o: $(ALL_H) lal_base_dpd.h lal_base_dpd.cpp + $(CUDR) -o $@ -c lal_base_dpd.cpp + $(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h $(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu @@ -658,6 +664,18 @@ $(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp $(OBJ_ $(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h $(CUDR) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/dpd.cubin: lal_dpd.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_dpd.cu + +$(OBJ_DIR)/dpd_cubin.h: $(OBJ_DIR)/dpd.cubin $(OBJ_DIR)/dpd.cubin + $(BIN2C) -c -n dpd $(OBJ_DIR)/dpd.cubin > $(OBJ_DIR)/dpd_cubin.h + +$(OBJ_DIR)/lal_dpd.o: $(ALL_H) lal_dpd.h lal_dpd.cpp $(OBJ_DIR)/dpd_cubin.h $(OBJ_DIR)/lal_base_dpd.o + $(CUDR) -o $@ -c lal_dpd.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_dpd_ext.o: $(ALL_H) lal_dpd.h lal_dpd_ext.cpp lal_base_dpd.h + $(CUDR) -o $@ -c lal_dpd_ext.cpp -I$(OBJ_DIR) + $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H) $(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index bb528f1e2b..1d85c65354 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -18,6 +18,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \ $(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \ $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \ $(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \ + $(OBJ_DIR)/lal_base_dpd.o \ $(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \ $(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \ $(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \ @@ -54,7 +55,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \ $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \ $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \ $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \ - $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o + $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o \ + $(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \ $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \ @@ -77,7 +79,7 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \ $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \ $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \ $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h \ - $(OBJ_DIR)/lj_gromacs_cl.h + $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/dpd_cl.h OCL_EXECS = $(BIN_DIR)/ocl_get_devices @@ -129,6 +131,9 @@ $(OBJ_DIR)/lal_base_dipole.o: $(OCL_H) lal_base_dipole.h lal_base_dipole.cpp $(OBJ_DIR)/lal_base_three.o: $(OCL_H) lal_base_three.h lal_base_three.cpp $(OCL) -o $@ -c lal_base_three.cpp +$(OBJ_DIR)/lal_base_dpd.o: $(OCL_H) lal_base_dpd.h lal_base_dpd.cpp + $(OCL) -o $@ -c lal_base_dpd.cpp + $(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h $(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h; @@ -471,6 +476,15 @@ $(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp $(OBJ $(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h $(OCL) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/dpd_cl.h: lal_dpd.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh dpd $(PRE1_H) lal_dpd.cu $(OBJ_DIR)/dpd_cl.h; + +$(OBJ_DIR)/lal_dpd.o: $(ALL_H) lal_dpd.h lal_dpd.cpp $(OBJ_DIR)/dpd_cl.h $(OBJ_DIR)/dpd_cl.h $(OBJ_DIR)/lal_base_dpd.o + $(OCL) -o $@ -c lal_dpd.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_dpd_ext.o: $(ALL_H) lal_dpd.h lal_dpd_ext.cpp lal_base_dpd.h + $(OCL) -o $@ -c lal_dpd_ext.cpp -I$(OBJ_DIR) + $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp index 92cda1239e..a250584dfa 100644 --- a/lib/gpu/lal_atom.cpp +++ b/lib/gpu/lal_atom.cpp @@ -41,6 +41,8 @@ int AtomT::bytes_per_atom() const { bytes+=4*sizeof(numtyp); if (_charge) bytes+=sizeof(numtyp); + if (_vel) + bytes+=4*sizeof(numtyp); return bytes; } @@ -52,7 +54,7 @@ bool AtomT::alloc(const int nall) { // Ignore host/device transfers? _host_view=false; - if (dev->shared_memory()) { + if (dev->shared_memory() && sizeof(numtyp)==sizeof(double)) { _host_view=true; #ifdef GPU_CAST assert(0==1); @@ -90,6 +92,11 @@ bool AtomT::alloc(const int nall) { UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=quat.device.row_bytes(); } + if (_vel && _host_view==false) { + success=success && (v.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=v.device.row_bytes(); + } if (_gpu_nbor>0) { if (_bonds) { @@ -124,7 +131,7 @@ bool AtomT::alloc(const int nall) { template bool AtomT::add_fields(const bool charge, const bool rot, - const int gpu_nbor, const bool bonds) { + const int gpu_nbor, const bool bonds, const bool vel) { bool success=true; // Ignore host/device transfers? int gpu_bytes=0; @@ -149,6 +156,16 @@ bool AtomT::add_fields(const bool charge, const bool rot, } } + if (vel && _vel==false) { + _vel=true; + _other=true; + if (_host_view==false) { + success=success && (v.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=v.device.row_bytes(); + } + } + if (bonds && _bonds==false) { _bonds=true; if (_bonds && _gpu_nbor>0) { @@ -191,19 +208,21 @@ bool AtomT::add_fields(const bool charge, const bool rot, template bool AtomT::init(const int nall, const bool charge, const bool rot, - UCL_Device &devi, const int gpu_nbor, const bool bonds) { + UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel) { clear(); bool success=true; _x_avail=false; _q_avail=false; _quat_avail=false; + _v_avail=false; _resized=false; _gpu_nbor=gpu_nbor; _bonds=bonds; _charge=charge; _rot=rot; - _other=_charge || _rot; + _vel=vel; + _other=_charge || _rot || _vel; dev=&devi; _time_transfer=0; @@ -216,9 +235,11 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, time_pos.init(*dev); time_q.init(*dev); time_quat.init(*dev); + time_vel.init(*dev); time_pos.zero(); time_q.zero(); time_quat.zero(); + time_vel.zero(); _time_cast=0.0; #ifdef GPU_CAST @@ -239,6 +260,8 @@ void AtomT::clear_resize() { q.clear(); if (_rot) quat.clear(); + if (_vel) + v.clear(); dev_cell_id.clear(); dev_particle_id.clear(); @@ -267,6 +290,7 @@ void AtomT::clear() { time_pos.clear(); time_q.clear(); time_quat.clear(); + time_vel.clear(); clear_resize(); #ifdef GPU_CAST @@ -285,6 +309,8 @@ double AtomT::host_memory_usage() const { atom_bytes+=1; if (_rot) atom_bytes+=4; + if (_vel) + atom_bytes+=4; return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom); } diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index 4731b1b08d..23112fe712 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -67,7 +67,8 @@ class Atom { * gpu_nbor 1 if neighboring will be performed on device * gpu_nbor 2 if binning on host and neighboring on device **/ bool init(const int nall, const bool charge, const bool rot, - UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false); + UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false, + const bool vel=false); /// Check if we have enough device storage and realloc if not /** Returns true if resized with any call during this timestep **/ @@ -87,7 +88,7 @@ class Atom { * gpu_nbor 1 if neighboring will be performed on device * gpu_nbor 2 if binning on host and neighboring on device **/ bool add_fields(const bool charge, const bool rot, const int gpu_nbor, - const bool bonds); + const bool bonds, const bool vel=false); /// Returns true if GPU is using charges bool charge() { return _charge; } @@ -95,6 +96,9 @@ class Atom { /// Returns true if GPU is using quaternions bool quaternion() { return _rot; } + /// Returns true if GPU is using velocities + bool velocity() { return _vel; } + /// Only free matrices of length inum or nall for resizing void clear_resize(); @@ -114,6 +118,8 @@ class Atom { time_q.add_to_total(); if (_rot) time_quat.add_to_total(); + if (_vel) + time_vel.add_to_total(); } /// Add copy times to timers @@ -123,6 +129,8 @@ class Atom { time_q.zero(); if (_rot) time_quat.zero(); + if (_vel) + time_vel.zero(); } /// Return the total time for host/device data transfer @@ -135,9 +143,13 @@ class Atom { time_q.zero_total(); } if (_rot) { - total+=time_q.total_seconds(); + total+=time_quat.total_seconds(); time_quat.zero_total(); } + if (_vel) { + total+=time_vel.total_seconds(); + time_vel.zero_total(); + } return total+_time_transfer/1000.0; } @@ -242,7 +254,7 @@ class Atom { /// Signal that we need to transfer atom data for next timestep inline void data_unavail() - { _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; } + { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; } /// Cast positions and types to write buffer inline void cast_x_data(double **host_ptr, const int *host_type) { @@ -341,6 +353,53 @@ class Atom { } } + /// Cast velocities and tags to write buffer + inline void cast_v_data(double **host_ptr, const tagint *host_tag) { + if (_v_avail==false) { + double t=MPI_Wtime(); + #ifdef GPU_CAST + memcpy(host_v_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); + memcpy(host_tag_cast.begin(),host_tag,_nall*sizeof(int)); + #else + int wl=0; + for (int i=0; i<_nall; i++) { + v[wl]=host_ptr[i][0]; + v[wl+1]=host_ptr[i][1]; + v[wl+2]=host_ptr[i][2]; + v[wl+3]=host_tag[i]; + wl+=4; + } + #endif + _time_cast+=MPI_Wtime()-t; + } + } + + /// Copy velocities and tags to device asynchronously + /** Copies nall() elements **/ + inline void add_v_data(double **host_ptr, tagint *host_tag) { + time_vel.start(); + if (_v_avail==false) { + #ifdef GPU_CAST + v_cast.update_device(_nall*3,true); + tag_cast.update_device(_nall,true); + int block_size=64; + int GX=static_cast(ceil(static_cast(_nall)/block_size)); + k_cast_x.set_size(GX,block_size); + k_cast_x.run(&v, &v_cast, &tag_cast, &_nall); + #else + v.update_device(_nall*4,true); + #endif + _v_avail=true; + } + time_vel.stop(); + } + + /// Calls cast_v_data and add_v_data and times the routines + inline void cast_copy_v(double **host_ptr, tagint *host_tag) { + cast_v_data(host_ptr,host_tag); + add_v_data(host_ptr,host_tag); + } + /// Add in casting time from additional data (seconds) inline void add_cast_time(double t) { _time_cast+=t; } @@ -362,7 +421,9 @@ class Atom { UCL_Vector q; /// Quaterions UCL_Vector quat; - + /// Velocities + UCL_Vector v; + #ifdef GPU_CAST UCL_Vector x_cast; UCL_Vector type_cast; @@ -372,6 +433,7 @@ class Atom { UCL_D_Vec dev_cell_id; /// Cell list identifiers for device nbor builds UCL_D_Vec dev_particle_id; + /// Atom tag information for device nbor builds UCL_D_Vec dev_tag; @@ -381,7 +443,7 @@ class Atom { UCL_H_Vec host_particle_id; /// Device timers - UCL_Timer time_pos, time_q, time_quat; + UCL_Timer time_pos, time_q, time_quat, time_vel; /// Geryon device UCL_Device *dev; @@ -396,11 +458,11 @@ class Atom { bool _compiled; // True if data has been copied to device already - bool _x_avail, _q_avail, _quat_avail, _resized; + bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized; bool alloc(const int nall); - bool _allocated, _rot, _charge, _bonds, _other; + bool _allocated, _rot, _charge, _bonds, _vel, _other; int _max_atoms, _nall, _gpu_nbor; bool _host_view; double _time_cast, _time_transfer; diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 554a535909..0859062345 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -213,12 +213,12 @@ int DeviceT::set_ocl_params(char *ocl_vendor) { template int DeviceT::init(Answer &ans, const bool charge, - const bool rot, const int nlocal, - const int host_nlocal, const int nall, - Neighbor *nbor, const int maxspecial, - const int gpu_host, const int max_nbors, - const double cell_size, const bool pre_cut, - const int threads_per_atom) { + const bool rot, const int nlocal, + const int host_nlocal, const int nall, + Neighbor *nbor, const int maxspecial, + const int gpu_host, const int max_nbors, + const double cell_size, const bool pre_cut, + const int threads_per_atom, const bool vel) { if (!_device_init) return -1; if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) @@ -245,7 +245,7 @@ int DeviceT::init(Answer &ans, const bool charge, if (_init_count==0) { // Initialize atom and nbor data - if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0)) + if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel)) return -3; _data_in_estimate++; @@ -253,12 +253,16 @@ int DeviceT::init(Answer &ans, const bool charge, _data_in_estimate++; if (rot) _data_in_estimate++; + if (vel) + _data_in_estimate++; } else { if (atom.charge()==false && charge) _data_in_estimate++; if (atom.quaternion()==false && rot) _data_in_estimate++; - if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial)) + if (atom.velocity()==false && vel) + _data_in_estimate++; + if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel)) return -3; } @@ -318,7 +322,7 @@ void DeviceT::set_double_precompute template void DeviceT::init_message(FILE *screen, const char *name, - const int first_gpu, const int last_gpu) { + const int first_gpu, const int last_gpu) { #if defined(USE_OPENCL) std::string fs=""; #elif defined(USE_CUDART) @@ -330,7 +334,7 @@ void DeviceT::init_message(FILE *screen, const char *name, if (_replica_me == 0 && screen) { fprintf(screen,"\n-------------------------------------"); fprintf(screen,"-------------------------------------\n"); - fprintf(screen,"- Using GPGPU acceleration for %s:\n",name); + fprintf(screen,"- Using acceleration for %s:\n",name); fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu); #ifdef _OPENMP fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads); @@ -361,7 +365,7 @@ void DeviceT::init_message(FILE *screen, const char *name, } else sname+="Double Precision)"; - fprintf(screen,"GPU %d: %s\n",i,sname.c_str()); + fprintf(screen,"Device %d: %s\n",i,sname.c_str()); } fprintf(screen,"-------------------------------------"); @@ -371,8 +375,8 @@ void DeviceT::init_message(FILE *screen, const char *name, template void DeviceT::estimate_gpu_overhead(const int kernel_calls, - double &gpu_overhead, - double &gpu_driver_overhead) { + double &gpu_overhead, + double &gpu_driver_overhead) { UCL_H_Vec *host_data_in=NULL, *host_data_out=NULL; UCL_D_Vec *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL; UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL; @@ -506,16 +510,17 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, double mpi_max_bytes; MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); double max_mb=mpi_max_bytes/(1024.0*1024.0); + double t_time=times[0]+times[1]+times[2]+times[3]+times[4]; if (replica_me()==0) if (screen && times[5]>0.0) { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - fprintf(screen," GPU Time Info (average): "); + fprintf(screen," Device Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (time_device()) { + if (time_device() && t_time>0) { fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size); fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size); @@ -527,7 +532,8 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, } if (nbor.gpu_nbor()==2) fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[8]/_replica_size); - fprintf(screen,"GPU Overhead: %.4f s.\n",times[5]/_replica_size); + if (times[5]>0) + fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Threads / atom: %d.\n",threads_per_atom); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); @@ -541,14 +547,14 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, template void DeviceT::output_kspace_times(UCL_Timer &time_in, - UCL_Timer &time_out, - UCL_Timer &time_map, - UCL_Timer &time_rho, - UCL_Timer &time_interp, - Answer &ans, - const double max_bytes, - const double cpu_time, - const double idle_time, FILE *screen) { + UCL_Timer &time_out, + UCL_Timer &time_map, + UCL_Timer &time_rho, + UCL_Timer &time_interp, + Answer &ans, + const double max_bytes, + const double cpu_time, + const double idle_time, FILE *screen) { double single[8], times[8]; single[0]=time_out.total_seconds(); @@ -566,16 +572,17 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in, double mpi_max_bytes; MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); double max_mb=mpi_max_bytes/(1024.0*1024.0); + double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5]; if (replica_me()==0) if (screen && times[6]>0.0) { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - fprintf(screen," GPU Time Info (average): "); + fprintf(screen," Device Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (time_device()) { + if (time_device() && t_time>0) { fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size); fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size); fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size); diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index d7bc70edaf..9a767e74b3 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -77,7 +77,7 @@ class Device { const int nlocal, const int host_nlocal, const int nall, Neighbor *nbor, const int maxspecial, const int gpu_host, const int max_nbors, const double cell_size, const bool pre_cut, - const int threads_per_atom); + const int threads_per_atom, const bool vel=false); /// Initialize the device for Atom storage only /** \param nlocal Total number of local particles to allocate memory for