git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@11660 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
@ -213,12 +213,12 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
||||
const bool rot, const int nlocal,
|
||||
const int host_nlocal, const int nall,
|
||||
Neighbor *nbor, const int maxspecial,
|
||||
const int gpu_host, const int max_nbors,
|
||||
const double cell_size, const bool pre_cut,
|
||||
const int threads_per_atom) {
|
||||
const bool rot, const int nlocal,
|
||||
const int host_nlocal, const int nall,
|
||||
Neighbor *nbor, const int maxspecial,
|
||||
const int gpu_host, const int max_nbors,
|
||||
const double cell_size, const bool pre_cut,
|
||||
const int threads_per_atom, const bool vel) {
|
||||
if (!_device_init)
|
||||
return -1;
|
||||
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
|
||||
@ -245,7 +245,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
||||
|
||||
if (_init_count==0) {
|
||||
// Initialize atom and nbor data
|
||||
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0))
|
||||
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel))
|
||||
return -3;
|
||||
|
||||
_data_in_estimate++;
|
||||
@ -253,12 +253,16 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
||||
_data_in_estimate++;
|
||||
if (rot)
|
||||
_data_in_estimate++;
|
||||
if (vel)
|
||||
_data_in_estimate++;
|
||||
} else {
|
||||
if (atom.charge()==false && charge)
|
||||
_data_in_estimate++;
|
||||
if (atom.quaternion()==false && rot)
|
||||
_data_in_estimate++;
|
||||
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial))
|
||||
if (atom.velocity()==false && vel)
|
||||
_data_in_estimate++;
|
||||
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel))
|
||||
return -3;
|
||||
}
|
||||
|
||||
@ -318,7 +322,7 @@ void DeviceT::set_double_precompute
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void DeviceT::init_message(FILE *screen, const char *name,
|
||||
const int first_gpu, const int last_gpu) {
|
||||
const int first_gpu, const int last_gpu) {
|
||||
#if defined(USE_OPENCL)
|
||||
std::string fs="";
|
||||
#elif defined(USE_CUDART)
|
||||
@ -330,7 +334,7 @@ void DeviceT::init_message(FILE *screen, const char *name,
|
||||
if (_replica_me == 0 && screen) {
|
||||
fprintf(screen,"\n-------------------------------------");
|
||||
fprintf(screen,"-------------------------------------\n");
|
||||
fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
|
||||
fprintf(screen,"- Using acceleration for %s:\n",name);
|
||||
fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu);
|
||||
#ifdef _OPENMP
|
||||
fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads);
|
||||
@ -361,7 +365,7 @@ void DeviceT::init_message(FILE *screen, const char *name,
|
||||
} else
|
||||
sname+="Double Precision)";
|
||||
|
||||
fprintf(screen,"GPU %d: %s\n",i,sname.c_str());
|
||||
fprintf(screen,"Device %d: %s\n",i,sname.c_str());
|
||||
}
|
||||
|
||||
fprintf(screen,"-------------------------------------");
|
||||
@ -371,8 +375,8 @@ void DeviceT::init_message(FILE *screen, const char *name,
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void DeviceT::estimate_gpu_overhead(const int kernel_calls,
|
||||
double &gpu_overhead,
|
||||
double &gpu_driver_overhead) {
|
||||
double &gpu_overhead,
|
||||
double &gpu_driver_overhead) {
|
||||
UCL_H_Vec<int> *host_data_in=NULL, *host_data_out=NULL;
|
||||
UCL_D_Vec<int> *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL;
|
||||
UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL;
|
||||
@ -506,16 +510,17 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
|
||||
double mpi_max_bytes;
|
||||
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
|
||||
double max_mb=mpi_max_bytes/(1024.0*1024.0);
|
||||
double t_time=times[0]+times[1]+times[2]+times[3]+times[4];
|
||||
|
||||
if (replica_me()==0)
|
||||
if (screen && times[5]>0.0) {
|
||||
fprintf(screen,"\n\n-------------------------------------");
|
||||
fprintf(screen,"--------------------------------\n");
|
||||
fprintf(screen," GPU Time Info (average): ");
|
||||
fprintf(screen," Device Time Info (average): ");
|
||||
fprintf(screen,"\n-------------------------------------");
|
||||
fprintf(screen,"--------------------------------\n");
|
||||
|
||||
if (time_device()) {
|
||||
if (time_device() && t_time>0) {
|
||||
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size);
|
||||
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size);
|
||||
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size);
|
||||
@ -527,7 +532,8 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
|
||||
}
|
||||
if (nbor.gpu_nbor()==2)
|
||||
fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[8]/_replica_size);
|
||||
fprintf(screen,"GPU Overhead: %.4f s.\n",times[5]/_replica_size);
|
||||
if (times[5]>0)
|
||||
fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size);
|
||||
fprintf(screen,"Average split: %.4f.\n",avg_split);
|
||||
fprintf(screen,"Threads / atom: %d.\n",threads_per_atom);
|
||||
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
|
||||
@ -541,14 +547,14 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void DeviceT::output_kspace_times(UCL_Timer &time_in,
|
||||
UCL_Timer &time_out,
|
||||
UCL_Timer &time_map,
|
||||
UCL_Timer &time_rho,
|
||||
UCL_Timer &time_interp,
|
||||
Answer<numtyp,acctyp> &ans,
|
||||
const double max_bytes,
|
||||
const double cpu_time,
|
||||
const double idle_time, FILE *screen) {
|
||||
UCL_Timer &time_out,
|
||||
UCL_Timer &time_map,
|
||||
UCL_Timer &time_rho,
|
||||
UCL_Timer &time_interp,
|
||||
Answer<numtyp,acctyp> &ans,
|
||||
const double max_bytes,
|
||||
const double cpu_time,
|
||||
const double idle_time, FILE *screen) {
|
||||
double single[8], times[8];
|
||||
|
||||
single[0]=time_out.total_seconds();
|
||||
@ -566,16 +572,17 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in,
|
||||
double mpi_max_bytes;
|
||||
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
|
||||
double max_mb=mpi_max_bytes/(1024.0*1024.0);
|
||||
double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5];
|
||||
|
||||
if (replica_me()==0)
|
||||
if (screen && times[6]>0.0) {
|
||||
fprintf(screen,"\n\n-------------------------------------");
|
||||
fprintf(screen,"--------------------------------\n");
|
||||
fprintf(screen," GPU Time Info (average): ");
|
||||
fprintf(screen," Device Time Info (average): ");
|
||||
fprintf(screen,"\n-------------------------------------");
|
||||
fprintf(screen,"--------------------------------\n");
|
||||
|
||||
if (time_device()) {
|
||||
if (time_device() && t_time>0) {
|
||||
fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size);
|
||||
fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size);
|
||||
fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size);
|
||||
|
||||
Reference in New Issue
Block a user