git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@11660 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp
2014-03-20 14:50:49 +00:00
parent e1cc86a12f
commit 67afd6fb68
6 changed files with 170 additions and 43 deletions

View File

@ -213,12 +213,12 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
template <class numtyp, class acctyp>
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
const bool rot, const int nlocal,
const int host_nlocal, const int nall,
Neighbor *nbor, const int maxspecial,
const int gpu_host, const int max_nbors,
const double cell_size, const bool pre_cut,
const int threads_per_atom) {
const bool rot, const int nlocal,
const int host_nlocal, const int nall,
Neighbor *nbor, const int maxspecial,
const int gpu_host, const int max_nbors,
const double cell_size, const bool pre_cut,
const int threads_per_atom, const bool vel) {
if (!_device_init)
return -1;
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
@ -245,7 +245,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
if (_init_count==0) {
// Initialize atom and nbor data
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0))
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel))
return -3;
_data_in_estimate++;
@ -253,12 +253,16 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
_data_in_estimate++;
if (rot)
_data_in_estimate++;
if (vel)
_data_in_estimate++;
} else {
if (atom.charge()==false && charge)
_data_in_estimate++;
if (atom.quaternion()==false && rot)
_data_in_estimate++;
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial))
if (atom.velocity()==false && vel)
_data_in_estimate++;
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel))
return -3;
}
@ -318,7 +322,7 @@ void DeviceT::set_double_precompute
template <class numtyp, class acctyp>
void DeviceT::init_message(FILE *screen, const char *name,
const int first_gpu, const int last_gpu) {
const int first_gpu, const int last_gpu) {
#if defined(USE_OPENCL)
std::string fs="";
#elif defined(USE_CUDART)
@ -330,7 +334,7 @@ void DeviceT::init_message(FILE *screen, const char *name,
if (_replica_me == 0 && screen) {
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"-------------------------------------\n");
fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
fprintf(screen,"- Using acceleration for %s:\n",name);
fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu);
#ifdef _OPENMP
fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads);
@ -361,7 +365,7 @@ void DeviceT::init_message(FILE *screen, const char *name,
} else
sname+="Double Precision)";
fprintf(screen,"GPU %d: %s\n",i,sname.c_str());
fprintf(screen,"Device %d: %s\n",i,sname.c_str());
}
fprintf(screen,"-------------------------------------");
@ -371,8 +375,8 @@ void DeviceT::init_message(FILE *screen, const char *name,
template <class numtyp, class acctyp>
void DeviceT::estimate_gpu_overhead(const int kernel_calls,
double &gpu_overhead,
double &gpu_driver_overhead) {
double &gpu_overhead,
double &gpu_driver_overhead) {
UCL_H_Vec<int> *host_data_in=NULL, *host_data_out=NULL;
UCL_D_Vec<int> *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL;
UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL;
@ -506,16 +510,17 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
double mpi_max_bytes;
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
double max_mb=mpi_max_bytes/(1024.0*1024.0);
double t_time=times[0]+times[1]+times[2]+times[3]+times[4];
if (replica_me()==0)
if (screen && times[5]>0.0) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen," Device Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (time_device()) {
if (time_device() && t_time>0) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size);
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size);
@ -527,7 +532,8 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
}
if (nbor.gpu_nbor()==2)
fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[8]/_replica_size);
fprintf(screen,"GPU Overhead: %.4f s.\n",times[5]/_replica_size);
if (times[5]>0)
fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Threads / atom: %d.\n",threads_per_atom);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
@ -541,14 +547,14 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
template <class numtyp, class acctyp>
void DeviceT::output_kspace_times(UCL_Timer &time_in,
UCL_Timer &time_out,
UCL_Timer &time_map,
UCL_Timer &time_rho,
UCL_Timer &time_interp,
Answer<numtyp,acctyp> &ans,
const double max_bytes,
const double cpu_time,
const double idle_time, FILE *screen) {
UCL_Timer &time_out,
UCL_Timer &time_map,
UCL_Timer &time_rho,
UCL_Timer &time_interp,
Answer<numtyp,acctyp> &ans,
const double max_bytes,
const double cpu_time,
const double idle_time, FILE *screen) {
double single[8], times[8];
single[0]=time_out.total_seconds();
@ -566,16 +572,17 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in,
double mpi_max_bytes;
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
double max_mb=mpi_max_bytes/(1024.0*1024.0);
double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5];
if (replica_me()==0)
if (screen && times[6]>0.0) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen," Device Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (time_device()) {
if (time_device() && t_time>0) {
fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size);
fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size);
fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size);