git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@11660 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp
2014-03-20 14:50:49 +00:00
parent e1cc86a12f
commit 67afd6fb68
6 changed files with 170 additions and 43 deletions

View File

@ -29,6 +29,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
$(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \ $(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
$(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \ $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
$(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \ $(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \
$(OBJ_DIR)/lal_base_dpd.o \
$(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \ $(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
$(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \ $(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
$(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \ $(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@ -65,7 +66,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
$(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \ $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \ $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \ $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
$(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o \
$(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o
CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
$(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \ $(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
@ -111,7 +113,8 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
$(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \ $(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \
$(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \ $(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \
$(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h \ $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h \
$(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs_cubin.h $(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs_cubin.h \
$(OBJ_DIR)/dpd.cubin $(OBJ_DIR)/dpd_cubin.h
all: $(OBJ_DIR) $(GPU_LIB) $(EXECS) all: $(OBJ_DIR) $(GPU_LIB) $(EXECS)
@ -190,6 +193,9 @@ $(OBJ_DIR)/lal_base_dipole.o: $(ALL_H) lal_base_dipole.h lal_base_dipole.cpp
$(OBJ_DIR)/lal_base_three.o: $(ALL_H) lal_base_three.h lal_base_three.cpp $(OBJ_DIR)/lal_base_three.o: $(ALL_H) lal_base_three.h lal_base_three.cpp
$(CUDR) -o $@ -c lal_base_three.cpp $(CUDR) -o $@ -c lal_base_three.cpp
$(OBJ_DIR)/lal_base_dpd.o: $(ALL_H) lal_base_dpd.h lal_base_dpd.cpp
$(CUDR) -o $@ -c lal_base_dpd.cpp
$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h $(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu $(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu
@ -658,6 +664,18 @@ $(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp $(OBJ_
$(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h $(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR) $(CUDR) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/dpd.cubin: lal_dpd.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_dpd.cu
$(OBJ_DIR)/dpd_cubin.h: $(OBJ_DIR)/dpd.cubin $(OBJ_DIR)/dpd.cubin
$(BIN2C) -c -n dpd $(OBJ_DIR)/dpd.cubin > $(OBJ_DIR)/dpd_cubin.h
$(OBJ_DIR)/lal_dpd.o: $(ALL_H) lal_dpd.h lal_dpd.cpp $(OBJ_DIR)/dpd_cubin.h $(OBJ_DIR)/lal_base_dpd.o
$(CUDR) -o $@ -c lal_dpd.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_dpd_ext.o: $(ALL_H) lal_dpd.h lal_dpd_ext.cpp lal_base_dpd.h
$(CUDR) -o $@ -c lal_dpd_ext.cpp -I$(OBJ_DIR)
$(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H) $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda $(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda

View File

@ -18,6 +18,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
$(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \ $(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
$(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \ $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
$(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \ $(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \
$(OBJ_DIR)/lal_base_dpd.o \
$(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \ $(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
$(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \ $(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
$(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \ $(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@ -54,7 +55,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
$(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \ $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \ $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \ $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
$(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o \
$(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o
KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
$(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \ $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
@ -77,7 +79,7 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
$(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \ $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \
$(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \ $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \
$(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h \ $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h \
$(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/dpd_cl.h
OCL_EXECS = $(BIN_DIR)/ocl_get_devices OCL_EXECS = $(BIN_DIR)/ocl_get_devices
@ -129,6 +131,9 @@ $(OBJ_DIR)/lal_base_dipole.o: $(OCL_H) lal_base_dipole.h lal_base_dipole.cpp
$(OBJ_DIR)/lal_base_three.o: $(OCL_H) lal_base_three.h lal_base_three.cpp $(OBJ_DIR)/lal_base_three.o: $(OCL_H) lal_base_three.h lal_base_three.cpp
$(OCL) -o $@ -c lal_base_three.cpp $(OCL) -o $@ -c lal_base_three.cpp
$(OBJ_DIR)/lal_base_dpd.o: $(OCL_H) lal_base_dpd.h lal_base_dpd.cpp
$(OCL) -o $@ -c lal_base_dpd.cpp
$(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h $(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h; $(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h;
@ -471,6 +476,15 @@ $(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp $(OBJ
$(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h $(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR) $(OCL) -o $@ -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/dpd_cl.h: lal_dpd.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh dpd $(PRE1_H) lal_dpd.cu $(OBJ_DIR)/dpd_cl.h;
$(OBJ_DIR)/lal_dpd.o: $(ALL_H) lal_dpd.h lal_dpd.cpp $(OBJ_DIR)/dpd_cl.h $(OBJ_DIR)/dpd_cl.h $(OBJ_DIR)/lal_base_dpd.o
$(OCL) -o $@ -c lal_dpd.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_dpd_ext.o: $(ALL_H) lal_dpd.h lal_dpd_ext.cpp lal_base_dpd.h
$(OCL) -o $@ -c lal_dpd_ext.cpp -I$(OBJ_DIR)
$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) $(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)

View File

@ -41,6 +41,8 @@ int AtomT::bytes_per_atom() const {
bytes+=4*sizeof(numtyp); bytes+=4*sizeof(numtyp);
if (_charge) if (_charge)
bytes+=sizeof(numtyp); bytes+=sizeof(numtyp);
if (_vel)
bytes+=4*sizeof(numtyp);
return bytes; return bytes;
} }
@ -52,7 +54,7 @@ bool AtomT::alloc(const int nall) {
// Ignore host/device transfers? // Ignore host/device transfers?
_host_view=false; _host_view=false;
if (dev->shared_memory()) { if (dev->shared_memory() && sizeof(numtyp)==sizeof(double)) {
_host_view=true; _host_view=true;
#ifdef GPU_CAST #ifdef GPU_CAST
assert(0==1); assert(0==1);
@ -90,6 +92,11 @@ bool AtomT::alloc(const int nall) {
UCL_READ_ONLY)==UCL_SUCCESS); UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=quat.device.row_bytes(); gpu_bytes+=quat.device.row_bytes();
} }
if (_vel && _host_view==false) {
success=success && (v.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=v.device.row_bytes();
}
if (_gpu_nbor>0) { if (_gpu_nbor>0) {
if (_bonds) { if (_bonds) {
@ -124,7 +131,7 @@ bool AtomT::alloc(const int nall) {
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
bool AtomT::add_fields(const bool charge, const bool rot, bool AtomT::add_fields(const bool charge, const bool rot,
const int gpu_nbor, const bool bonds) { const int gpu_nbor, const bool bonds, const bool vel) {
bool success=true; bool success=true;
// Ignore host/device transfers? // Ignore host/device transfers?
int gpu_bytes=0; int gpu_bytes=0;
@ -149,6 +156,16 @@ bool AtomT::add_fields(const bool charge, const bool rot,
} }
} }
if (vel && _vel==false) {
_vel=true;
_other=true;
if (_host_view==false) {
success=success && (v.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=v.device.row_bytes();
}
}
if (bonds && _bonds==false) { if (bonds && _bonds==false) {
_bonds=true; _bonds=true;
if (_bonds && _gpu_nbor>0) { if (_bonds && _gpu_nbor>0) {
@ -191,19 +208,21 @@ bool AtomT::add_fields(const bool charge, const bool rot,
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
bool AtomT::init(const int nall, const bool charge, const bool rot, bool AtomT::init(const int nall, const bool charge, const bool rot,
UCL_Device &devi, const int gpu_nbor, const bool bonds) { UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel) {
clear(); clear();
bool success=true; bool success=true;
_x_avail=false; _x_avail=false;
_q_avail=false; _q_avail=false;
_quat_avail=false; _quat_avail=false;
_v_avail=false;
_resized=false; _resized=false;
_gpu_nbor=gpu_nbor; _gpu_nbor=gpu_nbor;
_bonds=bonds; _bonds=bonds;
_charge=charge; _charge=charge;
_rot=rot; _rot=rot;
_other=_charge || _rot; _vel=vel;
_other=_charge || _rot || _vel;
dev=&devi; dev=&devi;
_time_transfer=0; _time_transfer=0;
@ -216,9 +235,11 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
time_pos.init(*dev); time_pos.init(*dev);
time_q.init(*dev); time_q.init(*dev);
time_quat.init(*dev); time_quat.init(*dev);
time_vel.init(*dev);
time_pos.zero(); time_pos.zero();
time_q.zero(); time_q.zero();
time_quat.zero(); time_quat.zero();
time_vel.zero();
_time_cast=0.0; _time_cast=0.0;
#ifdef GPU_CAST #ifdef GPU_CAST
@ -239,6 +260,8 @@ void AtomT::clear_resize() {
q.clear(); q.clear();
if (_rot) if (_rot)
quat.clear(); quat.clear();
if (_vel)
v.clear();
dev_cell_id.clear(); dev_cell_id.clear();
dev_particle_id.clear(); dev_particle_id.clear();
@ -267,6 +290,7 @@ void AtomT::clear() {
time_pos.clear(); time_pos.clear();
time_q.clear(); time_q.clear();
time_quat.clear(); time_quat.clear();
time_vel.clear();
clear_resize(); clear_resize();
#ifdef GPU_CAST #ifdef GPU_CAST
@ -285,6 +309,8 @@ double AtomT::host_memory_usage() const {
atom_bytes+=1; atom_bytes+=1;
if (_rot) if (_rot)
atom_bytes+=4; atom_bytes+=4;
if (_vel)
atom_bytes+=4;
return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>); return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>);
} }

View File

@ -67,7 +67,8 @@ class Atom {
* gpu_nbor 1 if neighboring will be performed on device * gpu_nbor 1 if neighboring will be performed on device
* gpu_nbor 2 if binning on host and neighboring on device **/ * gpu_nbor 2 if binning on host and neighboring on device **/
bool init(const int nall, const bool charge, const bool rot, bool init(const int nall, const bool charge, const bool rot,
UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false); UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false,
const bool vel=false);
/// Check if we have enough device storage and realloc if not /// Check if we have enough device storage and realloc if not
/** Returns true if resized with any call during this timestep **/ /** Returns true if resized with any call during this timestep **/
@ -87,7 +88,7 @@ class Atom {
* gpu_nbor 1 if neighboring will be performed on device * gpu_nbor 1 if neighboring will be performed on device
* gpu_nbor 2 if binning on host and neighboring on device **/ * gpu_nbor 2 if binning on host and neighboring on device **/
bool add_fields(const bool charge, const bool rot, const int gpu_nbor, bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
const bool bonds); const bool bonds, const bool vel=false);
/// Returns true if GPU is using charges /// Returns true if GPU is using charges
bool charge() { return _charge; } bool charge() { return _charge; }
@ -95,6 +96,9 @@ class Atom {
/// Returns true if GPU is using quaternions /// Returns true if GPU is using quaternions
bool quaternion() { return _rot; } bool quaternion() { return _rot; }
/// Returns true if GPU is using velocities
bool velocity() { return _vel; }
/// Only free matrices of length inum or nall for resizing /// Only free matrices of length inum or nall for resizing
void clear_resize(); void clear_resize();
@ -114,6 +118,8 @@ class Atom {
time_q.add_to_total(); time_q.add_to_total();
if (_rot) if (_rot)
time_quat.add_to_total(); time_quat.add_to_total();
if (_vel)
time_vel.add_to_total();
} }
/// Add copy times to timers /// Add copy times to timers
@ -123,6 +129,8 @@ class Atom {
time_q.zero(); time_q.zero();
if (_rot) if (_rot)
time_quat.zero(); time_quat.zero();
if (_vel)
time_vel.zero();
} }
/// Return the total time for host/device data transfer /// Return the total time for host/device data transfer
@ -135,9 +143,13 @@ class Atom {
time_q.zero_total(); time_q.zero_total();
} }
if (_rot) { if (_rot) {
total+=time_q.total_seconds(); total+=time_quat.total_seconds();
time_quat.zero_total(); time_quat.zero_total();
} }
if (_vel) {
total+=time_vel.total_seconds();
time_vel.zero_total();
}
return total+_time_transfer/1000.0; return total+_time_transfer/1000.0;
} }
@ -242,7 +254,7 @@ class Atom {
/// Signal that we need to transfer atom data for next timestep /// Signal that we need to transfer atom data for next timestep
inline void data_unavail() inline void data_unavail()
{ _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; } { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; }
/// Cast positions and types to write buffer /// Cast positions and types to write buffer
inline void cast_x_data(double **host_ptr, const int *host_type) { inline void cast_x_data(double **host_ptr, const int *host_type) {
@ -341,6 +353,53 @@ class Atom {
} }
} }
/// Cast velocities and tags to write buffer
inline void cast_v_data(double **host_ptr, const tagint *host_tag) {
if (_v_avail==false) {
double t=MPI_Wtime();
#ifdef GPU_CAST
memcpy(host_v_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
memcpy(host_tag_cast.begin(),host_tag,_nall*sizeof(int));
#else
int wl=0;
for (int i=0; i<_nall; i++) {
v[wl]=host_ptr[i][0];
v[wl+1]=host_ptr[i][1];
v[wl+2]=host_ptr[i][2];
v[wl+3]=host_tag[i];
wl+=4;
}
#endif
_time_cast+=MPI_Wtime()-t;
}
}
/// Copy velocities and tags to device asynchronously
/** Copies nall() elements **/
inline void add_v_data(double **host_ptr, tagint *host_tag) {
time_vel.start();
if (_v_avail==false) {
#ifdef GPU_CAST
v_cast.update_device(_nall*3,true);
tag_cast.update_device(_nall,true);
int block_size=64;
int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
k_cast_x.set_size(GX,block_size);
k_cast_x.run(&v, &v_cast, &tag_cast, &_nall);
#else
v.update_device(_nall*4,true);
#endif
_v_avail=true;
}
time_vel.stop();
}
/// Calls cast_v_data and add_v_data and times the routines
inline void cast_copy_v(double **host_ptr, tagint *host_tag) {
cast_v_data(host_ptr,host_tag);
add_v_data(host_ptr,host_tag);
}
/// Add in casting time from additional data (seconds) /// Add in casting time from additional data (seconds)
inline void add_cast_time(double t) { _time_cast+=t; } inline void add_cast_time(double t) { _time_cast+=t; }
@ -362,7 +421,9 @@ class Atom {
UCL_Vector<numtyp,numtyp> q; UCL_Vector<numtyp,numtyp> q;
/// Quaterions /// Quaterions
UCL_Vector<numtyp,numtyp> quat; UCL_Vector<numtyp,numtyp> quat;
/// Velocities
UCL_Vector<numtyp,numtyp> v;
#ifdef GPU_CAST #ifdef GPU_CAST
UCL_Vector<double,double> x_cast; UCL_Vector<double,double> x_cast;
UCL_Vector<int,int> type_cast; UCL_Vector<int,int> type_cast;
@ -372,6 +433,7 @@ class Atom {
UCL_D_Vec<unsigned> dev_cell_id; UCL_D_Vec<unsigned> dev_cell_id;
/// Cell list identifiers for device nbor builds /// Cell list identifiers for device nbor builds
UCL_D_Vec<int> dev_particle_id; UCL_D_Vec<int> dev_particle_id;
/// Atom tag information for device nbor builds /// Atom tag information for device nbor builds
UCL_D_Vec<tagint> dev_tag; UCL_D_Vec<tagint> dev_tag;
@ -381,7 +443,7 @@ class Atom {
UCL_H_Vec<int> host_particle_id; UCL_H_Vec<int> host_particle_id;
/// Device timers /// Device timers
UCL_Timer time_pos, time_q, time_quat; UCL_Timer time_pos, time_q, time_quat, time_vel;
/// Geryon device /// Geryon device
UCL_Device *dev; UCL_Device *dev;
@ -396,11 +458,11 @@ class Atom {
bool _compiled; bool _compiled;
// True if data has been copied to device already // True if data has been copied to device already
bool _x_avail, _q_avail, _quat_avail, _resized; bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized;
bool alloc(const int nall); bool alloc(const int nall);
bool _allocated, _rot, _charge, _bonds, _other; bool _allocated, _rot, _charge, _bonds, _vel, _other;
int _max_atoms, _nall, _gpu_nbor; int _max_atoms, _nall, _gpu_nbor;
bool _host_view; bool _host_view;
double _time_cast, _time_transfer; double _time_cast, _time_transfer;

View File

@ -213,12 +213,12 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge, int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
const bool rot, const int nlocal, const bool rot, const int nlocal,
const int host_nlocal, const int nall, const int host_nlocal, const int nall,
Neighbor *nbor, const int maxspecial, Neighbor *nbor, const int maxspecial,
const int gpu_host, const int max_nbors, const int gpu_host, const int max_nbors,
const double cell_size, const bool pre_cut, const double cell_size, const bool pre_cut,
const int threads_per_atom) { const int threads_per_atom, const bool vel) {
if (!_device_init) if (!_device_init)
return -1; return -1;
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
@ -245,7 +245,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
if (_init_count==0) { if (_init_count==0) {
// Initialize atom and nbor data // Initialize atom and nbor data
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0)) if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel))
return -3; return -3;
_data_in_estimate++; _data_in_estimate++;
@ -253,12 +253,16 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
_data_in_estimate++; _data_in_estimate++;
if (rot) if (rot)
_data_in_estimate++; _data_in_estimate++;
if (vel)
_data_in_estimate++;
} else { } else {
if (atom.charge()==false && charge) if (atom.charge()==false && charge)
_data_in_estimate++; _data_in_estimate++;
if (atom.quaternion()==false && rot) if (atom.quaternion()==false && rot)
_data_in_estimate++; _data_in_estimate++;
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial)) if (atom.velocity()==false && vel)
_data_in_estimate++;
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel))
return -3; return -3;
} }
@ -318,7 +322,7 @@ void DeviceT::set_double_precompute
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void DeviceT::init_message(FILE *screen, const char *name, void DeviceT::init_message(FILE *screen, const char *name,
const int first_gpu, const int last_gpu) { const int first_gpu, const int last_gpu) {
#if defined(USE_OPENCL) #if defined(USE_OPENCL)
std::string fs=""; std::string fs="";
#elif defined(USE_CUDART) #elif defined(USE_CUDART)
@ -330,7 +334,7 @@ void DeviceT::init_message(FILE *screen, const char *name,
if (_replica_me == 0 && screen) { if (_replica_me == 0 && screen) {
fprintf(screen,"\n-------------------------------------"); fprintf(screen,"\n-------------------------------------");
fprintf(screen,"-------------------------------------\n"); fprintf(screen,"-------------------------------------\n");
fprintf(screen,"- Using GPGPU acceleration for %s:\n",name); fprintf(screen,"- Using acceleration for %s:\n",name);
fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu); fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu);
#ifdef _OPENMP #ifdef _OPENMP
fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads); fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads);
@ -361,7 +365,7 @@ void DeviceT::init_message(FILE *screen, const char *name,
} else } else
sname+="Double Precision)"; sname+="Double Precision)";
fprintf(screen,"GPU %d: %s\n",i,sname.c_str()); fprintf(screen,"Device %d: %s\n",i,sname.c_str());
} }
fprintf(screen,"-------------------------------------"); fprintf(screen,"-------------------------------------");
@ -371,8 +375,8 @@ void DeviceT::init_message(FILE *screen, const char *name,
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void DeviceT::estimate_gpu_overhead(const int kernel_calls, void DeviceT::estimate_gpu_overhead(const int kernel_calls,
double &gpu_overhead, double &gpu_overhead,
double &gpu_driver_overhead) { double &gpu_driver_overhead) {
UCL_H_Vec<int> *host_data_in=NULL, *host_data_out=NULL; UCL_H_Vec<int> *host_data_in=NULL, *host_data_out=NULL;
UCL_D_Vec<int> *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL; UCL_D_Vec<int> *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL;
UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL; UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL;
@ -506,16 +510,17 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
double mpi_max_bytes; double mpi_max_bytes;
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
double max_mb=mpi_max_bytes/(1024.0*1024.0); double max_mb=mpi_max_bytes/(1024.0*1024.0);
double t_time=times[0]+times[1]+times[2]+times[3]+times[4];
if (replica_me()==0) if (replica_me()==0)
if (screen && times[5]>0.0) { if (screen && times[5]>0.0) {
fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n"); fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): "); fprintf(screen," Device Time Info (average): ");
fprintf(screen,"\n-------------------------------------"); fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n"); fprintf(screen,"--------------------------------\n");
if (time_device()) { if (time_device() && t_time>0) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size); fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size);
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size); fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size);
@ -527,7 +532,8 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
} }
if (nbor.gpu_nbor()==2) if (nbor.gpu_nbor()==2)
fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[8]/_replica_size); fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[8]/_replica_size);
fprintf(screen,"GPU Overhead: %.4f s.\n",times[5]/_replica_size); if (times[5]>0)
fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Threads / atom: %d.\n",threads_per_atom); fprintf(screen,"Threads / atom: %d.\n",threads_per_atom);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
@ -541,14 +547,14 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void DeviceT::output_kspace_times(UCL_Timer &time_in, void DeviceT::output_kspace_times(UCL_Timer &time_in,
UCL_Timer &time_out, UCL_Timer &time_out,
UCL_Timer &time_map, UCL_Timer &time_map,
UCL_Timer &time_rho, UCL_Timer &time_rho,
UCL_Timer &time_interp, UCL_Timer &time_interp,
Answer<numtyp,acctyp> &ans, Answer<numtyp,acctyp> &ans,
const double max_bytes, const double max_bytes,
const double cpu_time, const double cpu_time,
const double idle_time, FILE *screen) { const double idle_time, FILE *screen) {
double single[8], times[8]; double single[8], times[8];
single[0]=time_out.total_seconds(); single[0]=time_out.total_seconds();
@ -566,16 +572,17 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in,
double mpi_max_bytes; double mpi_max_bytes;
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
double max_mb=mpi_max_bytes/(1024.0*1024.0); double max_mb=mpi_max_bytes/(1024.0*1024.0);
double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5];
if (replica_me()==0) if (replica_me()==0)
if (screen && times[6]>0.0) { if (screen && times[6]>0.0) {
fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n"); fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): "); fprintf(screen," Device Time Info (average): ");
fprintf(screen,"\n-------------------------------------"); fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n"); fprintf(screen,"--------------------------------\n");
if (time_device()) { if (time_device() && t_time>0) {
fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size); fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size);
fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size); fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size);
fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size); fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size);

View File

@ -77,7 +77,7 @@ class Device {
const int nlocal, const int host_nlocal, const int nall, const int nlocal, const int host_nlocal, const int nall,
Neighbor *nbor, const int maxspecial, const int gpu_host, Neighbor *nbor, const int maxspecial, const int gpu_host,
const int max_nbors, const double cell_size, const bool pre_cut, const int max_nbors, const double cell_size, const bool pre_cut,
const int threads_per_atom); const int threads_per_atom, const bool vel=false);
/// Initialize the device for Atom storage only /// Initialize the device for Atom storage only
/** \param nlocal Total number of local particles to allocate memory for /** \param nlocal Total number of local particles to allocate memory for