From a3d6cff836334746f233d9fa93b550c020fcd2a3 Mon Sep 17 00:00:00 2001 From: "W. Michael Brown" Date: Mon, 21 Feb 2011 11:35:26 -0500 Subject: [PATCH] CODE USED TO BENCHMARK CHARGE SPREADING FOR PAPER. --- lib/gpu/pair_gpu_device.cpp | 9 +- lib/gpu/pppm_gpu_memory.cpp | 224 ++++++++++++++++++------------------ src/GPU/pppm_gpu.cpp | 13 ++- src/lmptype.h | 4 +- 4 files changed, 130 insertions(+), 120 deletions(-) diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp index 550c1207b5..8e1651abba 100644 --- a/lib/gpu/pair_gpu_device.cpp +++ b/lib/gpu/pair_gpu_device.cpp @@ -266,9 +266,12 @@ void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in, const double max_bytes, FILE *screen) { double single[4], times[4]; - single[0]=atom.transfer_time()+ans.transfer_time()+time_in.total_seconds()+ - time_out.total_seconds(); - single[1]=atom.cast_time()+ans.cast_time(); +// single[0]=atom.transfer_time()+ans.transfer_time()+time_in.total_seconds()+ +// time_out.total_seconds(); +// single[1]=atom.cast_time()+ans.cast_time(); + + single[0]=time_out.total_seconds(); + single[1]=0.0; single[2]=time_map.total_seconds(); single[3]=time_rho.total_seconds(); diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp index 222bea40b6..395389ffa8 100644 --- a/lib/gpu/pppm_gpu_memory.cpp +++ b/lib/gpu/pppm_gpu_memory.cpp @@ -147,12 +147,6 @@ numtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen, d_error_flag.zero(); _max_bytes+=1; -std::cout << "LO: " << _nxlo_out << " " << _nylo_out << " " << _nzlo_out << " " << _nlower << std::endl; -std::cout << "HI: " << _nxhi_out << " " << _nyhi_out << " " << _nzhi_out << " " << _nupper << std::endl; -std::cout << "pts: " << _npts_x << " " << _npts_y << " " << _npts_z << std::endl; -std::cout << "local: " << _nlocal_x << " " << _nlocal_y << " " << _nlocal_z << std::endl; - - return h_brick.begin(); } @@ -188,7 +182,7 @@ void PPPMGPUMemoryT::clear() { device->clear(); } -/* + // --------------------------------------------------------------------------- // Copy nbor list from host if necessary and then calculate forces, virials,.. // --------------------------------------------------------------------------- @@ -296,6 +290,113 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, return h_error_flag[0]; } + + +/* +// --------------------------------------------------------------------------- +// Copy nbor list from host if necessary and then calculate forces, virials,.. +// --------------------------------------------------------------------------- +template +int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, + double **host_x, int *host_type, bool &success, + double *host_q, double *boxlo, + const double delxinv, const double delyinv, + const double delzinv) { + acc_timers(); + if (nlocal==0) { + zero_timers(); + return 0; + } + + ans->inum(nlocal); + + if (ago==0) { + resize_atom(nlocal,nall,success); + resize_local(nlocal,success); + if (!success) + return 0; + + double bytes=ans->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + } + + atom->cast_x_data(host_x,host_type); + atom->cast_q_data(host_q); + atom->add_x_data(host_x,host_type); + atom->add_q_data(); + + time_map.start(); + + // Compute the block size and grid size to keep all cores busy + int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + + int _max_atoms=10; + int ainum=this->ans->inum(); + + // Boxlo adjusted to be upper left brick and shift for even stencil order + double shift=0.0; + if (_order % 2) + shift=0.5; + numtyp f_brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv; + numtyp f_brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv; + numtyp f_brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv; + + numtyp f_delxinv=delxinv; + numtyp f_delyinv=delyinv; + numtyp f_delzinv=delzinv; + double delvolinv = delxinv*delyinv*delzinv; + numtyp f_delvolinv = delvolinv; + + d_brick_counts.zero(); + k_particle_map.set_size(GX,BX); + k_particle_map.run(&atom->dev_x.begin(), &ainum, &d_brick_counts.begin(), + &d_brick_atoms.begin(), &f_brick_x, &f_brick_y, + &f_brick_z, &f_delxinv, &f_delyinv, &f_delzinv, &_nlocal_x, + &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms, + &d_error_flag.begin()); + time_map.stop(); + + time_rho.start(); + if (_order % 2) + shift=0.0; + else + shift=0.5; + f_brick_x=boxlo[0]+(_nxlo_out-_nlower+shift)/delxinv; + f_brick_y=boxlo[1]+(_nylo_out-_nlower+shift)/delyinv; + f_brick_z=boxlo[2]+(_nzlo_out-_nlower+shift)/delzinv; + + BX=block_size(); + GX=_npts_x; + int GY=_npts_y; + k_make_rho.set_size(GX,GY,BX,1); + k_make_rho.run(&atom->dev_x.begin(), &atom->dev_q.begin(), + &d_brick_counts.begin(), &d_brick_atoms.begin(), + &d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride, &_npts_x, + &_npts_yx, &_npts_z, &_nlocal_x, &_nlocal_y, &_nlocal_z, + &_order_m_1, &f_brick_x, &f_brick_y, &f_brick_z, + &f_delxinv, &f_delyinv, &f_delzinv, &_order, &_order2, + &f_delvolinv); + time_rho.stop(); + + time_out.start(); + ucl_copy(h_brick,d_brick,true); + ucl_copy(h_error_flag,d_error_flag,false); + time_out.stop(); + + if (h_error_flag[0]==2) { + // Not enough storage for atoms on the brick + _max_brick_atoms*=2; + d_brick_atoms.clear(); + d_brick_atoms.alloc(_atom_stride*_max_atoms,*ucl_device); + _max_bytes+=d_brick_atoms.row_bytes(); + return compute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, + delxinv,delyinv,delzinv); + } + + return h_error_flag[0]; +} */ /* @@ -332,115 +433,14 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, atom->add_x_data(host_x,host_type); atom->add_q_data(); - // Compute the block size and grid size to keep all cores busy - int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); - - int _max_atoms=10; - int ainum=this->ans->inum(); - - // Boxlo adjusted to be upper left brick and shift for even stencil order - double shift=0.0; - if (_order % 2) - shift=0.5; - numtyp f_brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv; - numtyp f_brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv; - numtyp f_brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv; - - numtyp f_delxinv=delxinv; - numtyp f_delyinv=delyinv; - numtyp f_delzinv=delzinv; - double delvolinv = delxinv*delyinv*delzinv; - numtyp f_delvolinv = delvolinv; - time_map.start(); - d_brick_counts.zero(); - k_particle_map.set_size(GX,BX); - k_particle_map.run(&atom->dev_x.begin(), &ainum, &d_brick_counts.begin(), - &d_brick_atoms.begin(), &f_brick_x, &f_brick_y, - &f_brick_z, &f_delxinv, &f_delyinv, &f_delzinv, &_nlocal_x, - &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms, - &d_error_flag.begin()); time_map.stop(); - if (_order % 2) - shift=0.0; - else - shift=0.5; - f_brick_x=boxlo[0]+(_nxlo_out-_nlower+shift)/delxinv; - f_brick_y=boxlo[1]+(_nylo_out-_nlower+shift)/delyinv; - f_brick_z=boxlo[2]+(_nzlo_out-_nlower+shift)/delzinv; - time_rho.start(); - BX=block_size(); - GX=_npts_x; - int GY=_npts_y; - k_make_rho.set_size(GX,GY,BX,1); - k_make_rho.run(&atom->dev_x.begin(), &atom->dev_q.begin(), - &d_brick_counts.begin(), &d_brick_atoms.begin(), - &d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride, &_npts_x, - &_npts_yx, &_npts_z, &_nlocal_x, &_nlocal_y, &_nlocal_z, - &_order_m_1, &f_brick_x, &f_brick_y, &f_brick_z, - &f_delxinv, &f_delyinv, &f_delzinv, &_order, &_order2, - &f_delvolinv); - time_rho.stop(); - - time_out.start(); - ucl_copy(h_brick,d_brick,true); - ucl_copy(h_error_flag,d_error_flag,false); - time_out.stop(); - - if (h_error_flag[0]==2) { - // Not enough storage for atoms on the brick - _max_brick_atoms*=2; - d_brick_atoms.clear(); - d_brick_atoms.alloc(_atom_stride*_max_atoms,*ucl_device); - _max_bytes+=d_brick_atoms.row_bytes(); - return compute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, - delxinv,delyinv,delzinv); - } - - return h_error_flag[0]; -} -*/ - -// --------------------------------------------------------------------------- -// Copy nbor list from host if necessary and then calculate forces, virials,.. -// --------------------------------------------------------------------------- -template -int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, - double **host_x, int *host_type, bool &success, - double *host_q, double *boxlo, - const double delxinv, const double delyinv, - const double delzinv) { - acc_timers(); - if (nlocal==0) { - zero_timers(); - return 0; - } - - ans->inum(nlocal); - - if (ago==0) { - resize_atom(nlocal,nall,success); - resize_local(nlocal,success); - if (!success) - return 0; - - double bytes=ans->gpu_bytes(); - if (bytes>_max_an_bytes) - _max_an_bytes=bytes; - } - - atom->cast_x_data(host_x,host_type); - atom->cast_q_data(host_q); - atom->add_x_data(host_x,host_type); - atom->add_q_data(); // Compute the block size and grid size to keep all cores busy int BX=this->block_size(); int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); - int ainum=this->ans->inum(); // Boxlo adjusted to be upper left brick and shift for even stencil order @@ -457,10 +457,6 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, double delvolinv = delxinv*delyinv*delzinv; numtyp f_delvolinv = delvolinv; - time_map.start(); - time_map.stop(); - - time_rho.start(); d_brick.zero(); k_make_rho.set_size(GX,BX); k_make_rho.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum, @@ -477,7 +473,7 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, return h_error_flag[0]; } - +*/ template double PPPMGPUMemoryT::host_memory_usage() const { @@ -495,7 +491,7 @@ void PPPMGPUMemoryT::compile_kernels(UCL_Device &dev) { pppm_program=new UCL_Program(dev); pppm_program->load_string(pppm_gpu_kernel,flags.c_str()); k_particle_map.set_function(*pppm_program,"particle_map"); - k_make_rho.set_function(*pppm_program,"make_rho3"); + k_make_rho.set_function(*pppm_program,"make_rho"); pos_tex.get_texture(*pppm_program,"pos_tex"); q_tex.get_texture(*pppm_program,"q_tex"); diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp index 33c92e9ed3..5f3dc5e688 100644 --- a/src/GPU/pppm_gpu.cpp +++ b/src/GPU/pppm_gpu.cpp @@ -105,7 +105,14 @@ PPPMGPU::~PPPMGPU() deallocate(); memory->destroy_2d_int_array(part2grid); pppm_gpu_clear(); -std::cout << "DEBUG_TIMES: " << time1 << " " << time2 << " " << time3 +double total1, total2, total3; +int rank; +MPI_Comm_rank(MPI_COMM_WORLD,&rank); +MPI_Allreduce(&time1,&total1,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); +MPI_Allreduce(&time2,&total2,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); +MPI_Allreduce(&time3,&total3,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); +if (rank==0) +std::cout << "DEBUG_TIMES: " << total1 << " " << total2 << " " << total3 << std::endl; } @@ -702,6 +709,8 @@ double t1=MPI_Wtime(); make_rho(); time1+=MPI_Wtime()-t1; + +/* double max_error=0; int _npts_x=nxhi_out-nxlo_out+1; int _npts_y=nyhi_out-nylo_out+1; @@ -732,6 +741,8 @@ for (int i=0; i