diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index c44812cd87..6651a31ad8 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -57,8 +57,6 @@ OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \ $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \ $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \ $(OBJ_DIR)/cmmc_msm_gpu_memory.o $(OBJ_DIR)/cmmc_msm_gpu.o \ - $(OBJ_DIR)/charge_gpu_memory2.o \ - $(OBJ_DIR)/crml_gpu_memory2.o $(OBJ_DIR)/crml_gpu2.o \ $(CUDPP) PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \ $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \ @@ -77,8 +75,7 @@ PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \ $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h \ $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \ $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h \ - $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx $(OBJ_DIR)/cmmc_msm_gpu_ptx.h \ - $(OBJ_DIR)/crml_gpu_kernel2.ptx $(OBJ_DIR)/crml_gpu_ptx2.h + $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx $(OBJ_DIR)/cmmc_msm_gpu_ptx.h all: $(GPU_LIB) $(EXECS) @@ -301,28 +298,6 @@ $(OBJ_DIR)/cmmc_msm_gpu_memory.o: $(ALL_H) cmmc_msm_gpu_memory.h cmmc_msm_gpu_me $(OBJ_DIR)/cmmc_msm_gpu.o: $(ALL_H) cmmc_msm_gpu_memory.h cmmc_msm_gpu.cpp charge_gpu_memory.h $(CUDR) -o $@ -c cmmc_msm_gpu.cpp -I$(OBJ_DIR) - - -$(OBJ_DIR)/charge_gpu_memory2.o: $(ALL_H) charge_gpu_memory2.h charge_gpu_memory2.cpp - $(CUDR) -o $@ -c charge_gpu_memory2.cpp - -$(OBJ_DIR)/crml_gpu_kernel2.ptx: crml_gpu_kernel2.cu pair_gpu_precision.h - $(CUDA) --ptx -DNV_KERNEL -o $@ crml_gpu_kernel2.cu - -$(OBJ_DIR)/crml_gpu_ptx2.h: $(OBJ_DIR)/crml_gpu_kernel2.ptx $(OBJ_DIR)/crml_gpu_kernel2.ptx - $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/crml_gpu_kernel2.ptx $(OBJ_DIR)/crml_gpu_ptx2.h - -$(OBJ_DIR)/crml_gpu_memory2.o: $(ALL_H) crml_gpu_memory2.h crml_gpu_memory2.cpp $(OBJ_DIR)/crml_gpu_ptx2.h $(OBJ_DIR)/charge_gpu_memory2.o - $(CUDR) -o $@ -c crml_gpu_memory2.cpp -I$(OBJ_DIR) - -$(OBJ_DIR)/crml_gpu2.o: $(ALL_H) crml_gpu_memory2.h crml_gpu2.cpp charge_gpu_memory2.h - $(CUDR) -o $@ -c crml_gpu2.cpp -I$(OBJ_DIR) - - - - - - $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H) $(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDART $(CUDA_LINK) diff --git a/lib/gpu/atomic_gpu_memory.cpp b/lib/gpu/atomic_gpu_memory.cpp index 6d3eca0996..531ea4000d 100644 --- a/lib/gpu/atomic_gpu_memory.cpp +++ b/lib/gpu/atomic_gpu_memory.cpp @@ -102,7 +102,7 @@ void AtomicGPUMemoryT::clear_atomic() { _gpu_overhead*=hd_balancer.timestep(); _driver_overhead*=hd_balancer.timestep(); device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, - _gpu_overhead,_driver_overhead,screen); + _gpu_overhead,_driver_overhead,_threads_per_atom,screen); if (_compiled) { k_pair_fast.clear(); diff --git a/lib/gpu/charge_gpu_memory.cpp b/lib/gpu/charge_gpu_memory.cpp index 8c8f231bf8..412596f5f2 100644 --- a/lib/gpu/charge_gpu_memory.cpp +++ b/lib/gpu/charge_gpu_memory.cpp @@ -57,7 +57,7 @@ int ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall, if (host_nlocal>0) _gpu_host=1; - _threads_per_atom=device->threads_per_atom(); + _threads_per_atom=device->threads_per_charge(); if (_threads_per_atom>1 && gpu_nbor==false) { nbor->packing(true); _nbor_data=&(nbor->dev_packed); @@ -104,7 +104,7 @@ void ChargeGPUMemoryT::clear_atomic() { _gpu_overhead*=hd_balancer.timestep(); _driver_overhead*=hd_balancer.timestep(); device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, - _gpu_overhead,_driver_overhead,screen); + _gpu_overhead,_driver_overhead,_threads_per_atom,screen); if (_compiled) { k_pair_fast.clear(); diff --git a/lib/gpu/charge_gpu_memory2.cpp b/lib/gpu/charge_gpu_memory2.cpp deleted file mode 100644 index 76ef74288b..0000000000 --- a/lib/gpu/charge_gpu_memory2.cpp +++ /dev/null @@ -1,304 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Charge/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (ORNL), brownw@ornl.gov -------------------------------------------------------------------------- */ - -#include "charge_gpu_memory2.h" -#define ChargeGPUMemory2T ChargeGPUMemory2 - -extern PairGPUDevice pair_gpu_device; - -template -ChargeGPUMemory2T::ChargeGPUMemory2() : _compiled(false), _max_bytes(0) { - device=&pair_gpu_device; - ans=new PairGPUAns(); - nbor=new PairGPUNbor(); -} - -template -ChargeGPUMemory2T::~ChargeGPUMemory2() { - delete ans; - delete nbor; -} - -template -int ChargeGPUMemory2T::bytes_per_atom_atomic(const int max_nbors) const { - return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ - nbor->bytes_per_atom(max_nbors); -} - -template -int ChargeGPUMemory2T::init_atomic(const int nlocal, const int nall, - const int max_nbors, const int maxspecial, - const double cell_size, - const double gpu_split, FILE *_screen, - const char *pair_program) { - nbor_time_avail=false; - screen=_screen; - - bool gpu_nbor=false; - if (device->gpu_mode()==PairGPUDevice::GPU_NEIGH) - gpu_nbor=true; - - int _gpu_host=0; - int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); - if (host_nlocal>0) - _gpu_host=1; - - _threads_per_atom=device->threads_per_atom(); - if (_threads_per_atom>1 && gpu_nbor==false) { - nbor->packing(true); - _nbor_data=&(nbor->dev_packed); - } else - _nbor_data=&(nbor->dev_nbor); - - int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor, - maxspecial,_gpu_host,max_nbors,cell_size,false); - if (success!=0) - return success; - - ucl_device=device->gpu; - atom=&device->atom; - - _block_size=device->pair_block_size(); - _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program); - - // Initialize host-device load balancer - hd_balancer.init(device,gpu_nbor,gpu_split); - - // Initialize timers for the selected GPU - time_pair.init(*ucl_device); - time_pair.zero(); - - pos_tex.bind_float(atom->dev_x,4); - q_tex.bind_float(atom->dev_q,1); - - _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); - - return success; -} - -template -void ChargeGPUMemory2T::estimate_gpu_overhead() { - device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); -} - -template -void ChargeGPUMemory2T::clear_atomic() { - // Output any timing information - acc_timers(); - double avg_split=hd_balancer.all_avg_split(); - _gpu_overhead*=hd_balancer.timestep(); - _driver_overhead*=hd_balancer.timestep(); - device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, - _gpu_overhead,_driver_overhead,screen); - - if (_compiled) { - k_pair_fast.clear(); - k_pair.clear(); - delete pair_program; - _compiled=false; - } - - time_pair.clear(); - hd_balancer.clear(); - - device->clear(); -} - -// --------------------------------------------------------------------------- -// Copy neighbor list from host -// --------------------------------------------------------------------------- -template -int * ChargeGPUMemory2T::reset_nbors(const int nall, const int inum, int *ilist, - int *numj, int **firstneigh, bool &success) { - success=true; - - nbor_time_avail=true; - - int mn=nbor->max_nbor_loop(inum,numj,ilist); - resize_atom(inum,nall,success); - resize_local(inum,mn,success); - if (!success) - return false; - - nbor->get_host(inum,ilist,numj,firstneigh,block_size()); - - double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); - if (bytes>_max_an_bytes) - _max_an_bytes=bytes; - - return ilist; -} - -// --------------------------------------------------------------------------- -// Build neighbor list on device -// --------------------------------------------------------------------------- -template -inline void ChargeGPUMemory2T::build_nbor_list(const int inum, - const int host_inum, - const int nall, double **host_x, - int *host_type, double *sublo, - double *subhi, int *tag, - int **nspecial, int **special, - bool &success) { - nbor_time_avail=true; - - success=true; - resize_atom(inum,nall,success); - resize_local(inum,host_inum,nbor->max_nbors(),success); - if (!success) - return; - atom->cast_copy_x(host_x,host_type); - - int mn; - nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag, - nspecial, special, success, mn); - - double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); - if (bytes>_max_an_bytes) - _max_an_bytes=bytes; -} - -// --------------------------------------------------------------------------- -// Copy nbor list from host if necessary and then calculate forces, virials,.. -// --------------------------------------------------------------------------- -template -void ChargeGPUMemory2T::compute(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success, double *host_q, - const int nlocal, double *boxlo, double *prd) { - acc_timers(); - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - resize_atom(0,nall,success); - zero_timers(); - return; - } - - int ago=hd_balancer.ago_first(f_ago); - int inum=hd_balancer.balance(ago,inum_full,cpu_time); - ans->inum(inum); - host_start=inum; - - if (ago==0) { - reset_nbors(nall, inum, ilist, numj, firstneigh, success); - if (!success) - return; - } - - atom->cast_x_data(host_x,host_type); - atom->cast_q_data(host_q); - hd_balancer.start_timer(); - atom->add_x_data(host_x,host_type); - atom->add_q_data(); - - device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, - boxlo, prd); - - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom,ilist); - device->add_ans_object(ans); - hd_balancer.stop_timer(); -} - -// --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary and then compute forces, virials, energies -// --------------------------------------------------------------------------- -template -int** ChargeGPUMemory2T::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, int *tag, - int **nspecial, int **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success, - double *host_q, double *boxlo, double *prd) { - acc_timers(); - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - resize_atom(0,nall,success); - zero_timers(); - return NULL; - } - - hd_balancer.balance(cpu_time); - int inum=hd_balancer.get_gpu_count(ago,inum_full); - ans->inum(inum); - host_start=inum; - - // Build neighbor list on GPU if necessary - if (ago==0) { - build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, - sublo, subhi, tag, nspecial, special, success); - if (!success) - return NULL; - atom->cast_q_data(host_q); - hd_balancer.start_timer(); - } else { - atom->cast_x_data(host_x,host_type); - atom->cast_q_data(host_q); - hd_balancer.start_timer(); - atom->add_x_data(host_x,host_type); - } - atom->add_q_data(); - *ilist=nbor->host_ilist.begin(); - *jnum=nbor->host_acc.begin(); - - device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, - boxlo, prd); - - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom); - device->add_ans_object(ans); - hd_balancer.stop_timer(); - - return nbor->host_jlist.begin()-host_start; -} - -template -double ChargeGPUMemory2T::host_memory_usage_atomic() const { - return device->atom.host_memory_usage()+nbor->host_memory_usage()+ - 4*sizeof(numtyp)+sizeof(ChargeGPUMemory2); -} - -template -void ChargeGPUMemory2T::compile_kernels(UCL_Device &dev, const char *pair_str) { - if (_compiled) - return; - - std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ - std::string(OCL_PRECISION_COMPILE); - - pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,flags.c_str()); - k_pair_fast.set_function(*pair_program,"kernel_pair_fast"); - k_pair.set_function(*pair_program,"kernel_pair"); - pos_tex.get_texture(*pair_program,"pos_tex"); - q_tex.get_texture(*pair_program,"q_tex"); - - _compiled=true; -} - -template class ChargeGPUMemory2; - diff --git a/lib/gpu/charge_gpu_memory2.h b/lib/gpu/charge_gpu_memory2.h deleted file mode 100644 index 0edfc7e497..0000000000 --- a/lib/gpu/charge_gpu_memory2.h +++ /dev/null @@ -1,201 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Charge/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (ORNL), brownw@ornl.gov -------------------------------------------------------------------------- */ - -#ifndef CHARGE_GPU_MEMORY2_H -#define CHARGE_GPU_MEMORY2_H - -#include "pair_gpu_device.h" -#include "pair_gpu_balance.h" -#include "mpi.h" - -#ifdef USE_OPENCL -#include "geryon/ocl_texture.h" -#else -#include "geryon/nvd_texture.h" -#endif - -template -class ChargeGPUMemory2 { - public: - ChargeGPUMemory2(); - virtual ~ChargeGPUMemory2(); - - /// Clear any previous data and set up for a new LAMMPS run - /** \param max_nbors initial number of rows in the neighbor matrix - * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device - * - * Returns: - * - 0 if successfull - * - -1 if fix gpu not found - * - -3 if there is an out of memory error - * - -4 if the GPU library was not compiled for GPU - * - -5 Double precision is not supported on card **/ - int init_atomic(const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, - const char *pair_program); - - /// Estimate the overhead for GPU context changes and CPU driver - void estimate_gpu_overhead(); - - /// Check if there is enough storage for atom arrays and realloc if not - /** \param success set to false if insufficient memory **/ - inline void resize_atom(const int inum, const int nall, bool &success) { - if (atom->resize(nall, success)) { - pos_tex.bind_float(atom->dev_x,4); - q_tex.bind_float(atom->dev_q,1); - } - ans->resize(inum,success); - } - - /// Check if there is enough storage for neighbors and realloc if not - /** \param nlocal number of particles whose nbors must be stored on device - * \param host_inum number of particles whose nbors need to copied to host - * \param current maximum number of neighbors - * \note olist_size=total number of local particles **/ - inline void resize_local(const int inum, const int max_nbors, bool &success) { - nbor->resize(inum,max_nbors,success); - } - - /// Check if there is enough storage for neighbors and realloc if not - /** \param nlocal number of particles whose nbors must be stored on device - * \param host_inum number of particles whose nbors need to copied to host - * \param current maximum number of neighbors - * \note host_inum is 0 if the host is performing neighboring - * \note nlocal+host_inum=total number local particles - * \note olist_size=0 **/ - inline void resize_local(const int inum, const int host_inum, - const int max_nbors, bool &success) { - nbor->resize(inum,host_inum,max_nbors,success); - } - - /// Clear all host and device data - /** \note This is called at the beginning of the init() routine **/ - void clear_atomic(); - - /// Returns memory usage on device per atom - int bytes_per_atom_atomic(const int max_nbors) const; - - /// Total host memory used by library for pair style - double host_memory_usage_atomic() const; - - /// Accumulate timers - inline void acc_timers() { - if (nbor_time_avail) { - nbor->time_nbor.add_to_total(); - nbor->time_kernel.add_to_total(); - nbor_time_avail=false; - } - time_pair.add_to_total(); - atom->acc_timers(); - ans->acc_timers(); - } - - /// Zero timers - inline void zero_timers() { - nbor_time_avail=false; - time_pair.zero(); - atom->zero_timers(); - ans->zero_timers(); - } - - /// Copy neighbor list from host - int * reset_nbors(const int nall, const int inum, int *ilist, int *numj, - int **firstneigh, bool &success); - - /// Build neighbor list on device - void build_nbor_list(const int inum, const int host_inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, int *tag, int **nspecial, - int **special, bool &success); - - /// Pair loop with host neighboring - void compute(const int f_ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *charge, - const int nlocal, double *boxlo, double *prd); - - /// Pair loop with device neighboring - int** compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success, - double *charge, double *boxlo, double *prd); - - // -------------------------- DEVICE DATA ------------------------- - - /// Device Properties and Atom and Neighbor storage - PairGPUDevice *device; - - /// Geryon device - UCL_Device *ucl_device; - - /// Device Timers - UCL_Timer time_pair; - - /// Host device load balancer - PairGPUBalance hd_balancer; - - /// LAMMPS pointer for screen output - FILE *screen; - - // --------------------------- ATOM DATA -------------------------- - - /// Atom Data - PairGPUAtom *atom; - - - // ------------------------ FORCE/ENERGY DATA ----------------------- - - PairGPUAns *ans; - - // --------------------------- NBOR DATA ---------------------------- - - /// Neighbor data - PairGPUNbor *nbor; - - /// True if we need to accumulate time for neighboring - bool nbor_time_avail; - - // ------------------------- DEVICE KERNELS ------------------------- - UCL_Program *pair_program; - UCL_Kernel k_pair_fast, k_pair; - inline int block_size() { return _block_size; } - - // --------------------------- TEXTURES ----------------------------- - UCL_Texture pos_tex; - UCL_Texture q_tex; - - protected: - bool _compiled; - int _block_size, _block_bio_size, _threads_per_atom; - double _max_bytes, _max_an_bytes; - double _gpu_overhead, _driver_overhead; - UCL_D_Vec *_nbor_data; - - void compile_kernels(UCL_Device &dev, const char *pair_string); - - virtual void loop(const bool _eflag, const bool _vflag) = 0; -}; - -#endif - diff --git a/lib/gpu/cmm_cut_gpu_kernel.cu b/lib/gpu/cmm_cut_gpu_kernel.cu index b2b7796025..08cc31ed7f 100644 --- a/lib/gpu/cmm_cut_gpu_kernel.cu +++ b/lib/gpu/cmm_cut_gpu_kernel.cu @@ -184,7 +184,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; @@ -347,7 +347,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; diff --git a/lib/gpu/cmmc_long_gpu_kernel.cu b/lib/gpu/cmmc_long_gpu_kernel.cu index c99a2a68c2..5153cb5016 100644 --- a/lib/gpu/cmmc_long_gpu_kernel.cu +++ b/lib/gpu/cmmc_long_gpu_kernel.cu @@ -233,7 +233,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; @@ -428,7 +428,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; diff --git a/lib/gpu/cmmc_msm_gpu_kernel.cu b/lib/gpu/cmmc_msm_gpu_kernel.cu index 5ad765f8c4..d740b57feb 100644 --- a/lib/gpu/cmmc_msm_gpu_kernel.cu +++ b/lib/gpu/cmmc_msm_gpu_kernel.cu @@ -252,7 +252,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; @@ -472,7 +472,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; diff --git a/lib/gpu/crml_gpu2.cpp b/lib/gpu/crml_gpu2.cpp deleted file mode 100644 index a3402fe26b..0000000000 --- a/lib/gpu/crml_gpu2.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (ORNL), brownw@ornl.gov -------------------------------------------------------------------------- */ - -#include -#include -#include - -#include "crml_gpu_memory2.h" - -using namespace std; - -static CRML_GPU_Memory2 CRMLMF2; - -// --------------------------------------------------------------------------- -// Allocate memory on host and device and copy constants to device -// --------------------------------------------------------------------------- -int crml_gpu_init2(const int ntypes, double cut_bothsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald, const double cut_lj_innersq, - const double denom_lj, double **epsilon, - double **sigma, const bool mix_arithmetic) { - CRMLMF2.clear(); - gpu_mode=CRMLMF2.device->gpu_mode(); - double gpu_split=CRMLMF2.device->particle_split(); - int first_gpu=CRMLMF2.device->first_device(); - int last_gpu=CRMLMF2.device->last_device(); - int world_me=CRMLMF2.device->world_me(); - int gpu_rank=CRMLMF2.device->gpu_rank(); - int procs_per_gpu=CRMLMF2.device->procs_per_gpu(); - - CRMLMF2.device->init_message(screen,"lj/charmm/coul/long",first_gpu,last_gpu); - - bool message=false; - if (CRMLMF2.device->replica_me()==0 && screen) - message=true; - - if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); - fflush(screen); - } - - int init_ok=0; - if (world_me==0) - CRMLMF2.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_cut_ljsq, host_cut_coulsq, - host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj, - epsilon,sigma,mix_arithmetic); - - CRMLMF2.device->world_barrier(); - if (message) - fprintf(screen,"Done.\n"); - - for (int i=0; igpu_barrier(); - if (message) - fprintf(screen,"Done.\n"); - } - if (message) - fprintf(screen,"\n"); - - if (init_ok==0) - CRMLMF2.estimate_gpu_overhead(); - return init_ok; -} - -void crml_gpu_clear2() { - CRMLMF2.clear(); -} - -int** crml_gpu_compute_n2(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd) { - return CRMLMF2.compute(ago, inum_full, nall, host_x, host_type, sublo, - subhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, ilist, jnum, cpu_time, success, - host_q, boxlo, prd); -} - -void crml_gpu_compute2(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, const int nlocal, - double *boxlo, double *prd) { - CRMLMF2.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, - eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q, - nlocal,boxlo,prd); -} - -double crml_gpu_bytes2() { - return CRMLMF2.host_memory_usage(); -} - - diff --git a/lib/gpu/crml_gpu_kernel2.cu b/lib/gpu/crml_gpu_kernel2.cu deleted file mode 100644 index 293b565d44..0000000000 --- a/lib/gpu/crml_gpu_kernel2.cu +++ /dev/null @@ -1,508 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (ORNL), brownw@ornl.gov -------------------------------------------------------------------------- */ - -#ifndef CRML_GPU_KERNEL -#define CRML_GPU_KERNEL - -#ifdef _DOUBLE_DOUBLE -#define numtyp double -#define numtyp2 double2 -#define numtyp4 double4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifdef _SINGLE_DOUBLE -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifndef numtyp -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp float -#define acctyp4 float4 -#endif - -#define EWALD_F (numtyp)1.12837917 -#define EWALD_P (numtyp)0.3275911 -#define A1 (numtyp)0.254829592 -#define A2 (numtyp)-0.284496736 -#define A3 (numtyp)1.421413741 -#define A4 (numtyp)-1.453152027 -#define A5 (numtyp)1.061405429 - -#ifdef NV_KERNEL - -#include "nv_kernel_def.h" -texture pos_tex; -texture q_tex; - -#ifdef _DOUBLE_DOUBLE -__inline double4 fetch_pos(const int& i, const double4 *pos) -{ - return pos[i]; -} -__inline double fetch_q(const int& i, const double *q) -{ - return q[i]; -} -#else -__inline float4 fetch_pos(const int& i, const float4 *pos) -{ - return tex1Dfetch(pos_tex, i); -} -__inline float fetch_q(const int& i, const float *q) -{ - return tex1Dfetch(q_tex, i); -} -#endif - -#else - -#pragma OPENCL EXTENSION cl_khr_fp64: enable -#define GLOBAL_ID_X get_global_id(0) -#define THREAD_ID_X get_local_id(0) -#define BLOCK_ID_X get_group_id(0) -#define BLOCK_SIZE_X get_local_size(0) -#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) -#define __inline inline - -#define fetch_pos(i,y) x_[i] -#define fetch_q(i,y) q_[i] -#define BLOCK_BIO_PAIR 64 - -#endif - -#define MAX_BIO_SHARED_TYPES 128 - -#define SBBITS 30 -#define NEIGHMASK 0x3FFFFFFF -__inline int sbmask(int j) { return j >> SBBITS & 3; } - -__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, - const int lj_types, __global numtyp *sp_lj_in, - __global int *dev_nbor, __global int *dev_packed, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - __global numtyp *q_, const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald, - const numtyp denom_lj, const numtyp cut_bothsq, - const numtyp cut_ljsq, const numtyp cut_lj_innersq, - const int t_per_atom) { - __local numtyp sp_lj[8]; - sp_lj[0]=sp_lj_in[0]; - sp_lj[1]=sp_lj_in[1]; - sp_lj[2]=sp_lj_in[2]; - sp_lj[3]=sp_lj_in[3]; - sp_lj[4]=sp_lj_in[4]; - sp_lj[5]=sp_lj_in[5]; - sp_lj[6]=sp_lj_in[6]; - sp_lj[7]=sp_lj_in[7]; - - int tid=THREAD_ID_X; - - acctyp energy; - acctyp e_coul; - acctyp4 f; - acctyp virial[6]; - - int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); - ii+=tid/t_per_atom; - int offset=tid%t_per_atom; - - energy=(acctyp)0; - e_coul=(acctyp)0; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - for (int o=0; o<6; o++) - virial[o]=(acctyp)0; - - if (ii cut_lj_innersq) { - switch1 = (cut_ljsq-rsq); - numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ - denom_lj; - switch1 *= switch1; - switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/ - denom_lj; - switch2 *= r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w); - force_lj = force_lj*switch1+switch2; - } - } else - force_lj = (numtyp)0.0; - - if (rsq < cut_coulsq) { - numtyp r = sqrt(rsq); - numtyp grij = g_ewald * r; - numtyp expm2 = exp(-grij*grij); - numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij); - _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; - prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; - forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); - } else { - forcecoul = (numtyp)0.0; - prefactor = (numtyp)0.0; - } - - force = (force_lj + forcecoul) * r2inv; - - f.x+=delx*force; - f.y+=dely*force; - f.z+=delz*force; - - if (eflag>0) { - e_coul += prefactor*(_erfc-factor_coul); - if (rsq < cut_ljsq) { - numtyp e=r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w); - if (rsq > cut_lj_innersq) - e *= switch1; - energy+=factor_lj*e; - } - } - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } - } - - } // for nbor - } // if ii - - // Reduce answers - if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; - - red_acc[0][tid]=f.x; - red_acc[1][tid]=f.y; - red_acc[2][tid]=f.z; - red_acc[3][tid]=energy; - red_acc[4][tid]=e_coul; - - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { - if (offset < s) { - for (int r=0; r<5; r++) - red_acc[r][tid] += red_acc[r][tid+s]; - } - } - - f.x=red_acc[0][tid]; - f.y=red_acc[1][tid]; - f.z=red_acc[2][tid]; - energy=red_acc[3][tid]; - e_coul=red_acc[4][tid]; - - if (vflag>0) { - for (int r=0; r<6; r++) - red_acc[r][tid]=virial[r]; - - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { - if (offset < s) { - for (int r=0; r<6; r++) - red_acc[r][tid] += red_acc[r][tid+s]; - } - } - - for (int r=0; r<6; r++) - virial[r]=red_acc[r][tid]; - } - } - - // Store answers - __global acctyp *ap1=engv+ii; - if (ii0) { - *ap1=energy; - ap1+=inum; - *ap1=e_coul; - ap1+=inum; - } - if (vflag>0) { - for (int i=0; i<6; i++) { - *ap1=virial[i]; - ap1+=inum; - } - } - ans[ii]=f; - } -} - -__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, const int nall, - const int nbor_pitch, __global numtyp *q_, - const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp g_ewald, const numtyp denom_lj, - const numtyp cut_bothsq, const numtyp cut_ljsq, - const numtyp cut_lj_innersq, - const int t_per_atom) { - int tid=THREAD_ID_X; - __local numtyp2 ljd[MAX_BIO_SHARED_TYPES]; - __local numtyp sp_lj[8]; - - acctyp energy; - acctyp e_coul; - acctyp4 f; - acctyp virial[6]; - - if (tid<8) - sp_lj[tid]=sp_lj_in[tid]; - ljd[tid]=ljd_in[tid]; - if (tid+BLOCK_BIO_PAIR cut_lj_innersq) { - switch1 = (cut_ljsq-rsq); - numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ - denom_lj; - switch1 *= switch1; - switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/ - denom_lj; - switch2 *= lj3-lj4; - force_lj = force_lj*switch1+switch2; - } - } else - force_lj = (numtyp)0.0; - - if (rsq < cut_coulsq) { - numtyp r = sqrt(rsq); - numtyp grij = g_ewald * r; - numtyp expm2 = exp(-grij*grij); - numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij); - _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; - prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r; - forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul); - } else { - forcecoul = (numtyp)0.0; - prefactor = (numtyp)0.0; - } - - force = (force_lj + forcecoul) * r2inv; - - f.x+=delx*force; - f.y+=dely*force; - f.z+=delz*force; - - if (eflag>0) { - e_coul += prefactor*(_erfc-factor_coul); - if (rsq < cut_ljsq) { - numtyp e=lj3-lj4; - if (rsq > cut_lj_innersq) - e *= switch1; - energy+=factor_lj*e; - } - } - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } - } - - } // for nbor - } // if ii - - // Reduce answers - if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; - - red_acc[0][tid]=f.x; - red_acc[1][tid]=f.y; - red_acc[2][tid]=f.z; - red_acc[3][tid]=energy; - red_acc[4][tid]=e_coul; - - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { - if (offset < s) { - for (int r=0; r<5; r++) - red_acc[r][tid] += red_acc[r][tid+s]; - } - } - - f.x=red_acc[0][tid]; - f.y=red_acc[1][tid]; - f.z=red_acc[2][tid]; - energy=red_acc[3][tid]; - e_coul=red_acc[4][tid]; - - if (vflag>0) { - for (int r=0; r<6; r++) - red_acc[r][tid]=virial[r]; - - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { - if (offset < s) { - for (int r=0; r<6; r++) - red_acc[r][tid] += red_acc[r][tid+s]; - } - } - - for (int r=0; r<6; r++) - virial[r]=red_acc[r][tid]; - } - } - - // Store answers - __global acctyp *ap1=engv+ii; - if (ii0) { - *ap1=energy; - ap1+=inum; - *ap1=e_coul; - ap1+=inum; - } - if (vflag>0) { - for (int v=0; v<6; v++) { - *ap1=virial[v]; - ap1+=inum; - } - } - ans[ii]=f; - } -} - -#endif diff --git a/lib/gpu/crml_gpu_memory2.cpp b/lib/gpu/crml_gpu_memory2.cpp deleted file mode 100644 index 894f5bfdc0..0000000000 --- a/lib/gpu/crml_gpu_memory2.cpp +++ /dev/null @@ -1,177 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (ORNL), brownw@ornl.gov -------------------------------------------------------------------------- */ - -#ifdef USE_OPENCL -#include "crml_gpu_cl2.h" -#else -#include "crml_gpu_ptx2.h" -#endif - -#include "crml_gpu_memory2.h" -#include -#define CRML_GPU_Memory2T CRML_GPU_Memory2 - -extern PairGPUDevice pair_gpu_device; - -template -CRML_GPU_Memory2T::CRML_GPU_Memory2() : ChargeGPUMemory2(), - _allocated(false) { -} - -template -CRML_GPU_Memory2T::~CRML_GPU_Memory2() { - clear(); -} - -template -int CRML_GPU_Memory2T::bytes_per_atom(const int max_nbors) const { - return this->bytes_per_atom_atomic(max_nbors); -} - -template -int CRML_GPU_Memory2T::init(const int ntypes, - double host_cut_bothsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen, - double host_cut_ljsq, const double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald, const double cut_lj_innersq, - const double denom_lj, double **epsilon, - double **sigma, const bool mix_arithmetic) { - int success; - success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,crml_gpu_kernel2); - if (success!=0) - return success; - - // If atom type constants fit in shared memory use fast kernel - int lj_types=ntypes; - shared_types=false; - if (this->_block_bio_size>=64 && mix_arithmetic) - shared_types=true; - _lj_types=lj_types; - - // Allocate a host write buffer for data initialization - int h_size=lj_types*lj_types; - int max_bio_shared_types=this->device->max_bio_shared_types(); - if (h_size host_write(h_size*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); - for (int i=0; iucl_device),UCL_READ_ONLY); - this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, - host_lj3,host_lj4); - - ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY); - this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma); - - sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); - for (int i=0; i<4; i++) { - host_write[i]=host_special_lj[i]; - host_write[i+4]=host_special_coul[i]; - } - ucl_copy(sp_lj,host_write,8,false); - - _cut_bothsq = host_cut_bothsq; - _cut_coulsq = host_cut_coulsq; - _cut_ljsq = host_cut_ljsq; - _cut_lj_innersq = cut_lj_innersq; - _qqrd2e=qqrd2e; - _g_ewald=g_ewald; - _denom_lj=denom_lj; - - _allocated=true; - this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes(); - return 0; -} - -template -void CRML_GPU_Memory2T::clear() { - if (!_allocated) - return; - _allocated=false; - - lj1.clear(); - ljd.clear(); - sp_lj.clear(); - this->clear_atomic(); -} - -template -double CRML_GPU_Memory2T::host_memory_usage() const { - return this->host_memory_usage_atomic()+sizeof(CRML_GPU_Memory2); -} - -// --------------------------------------------------------------------------- -// Calculate energies, forces, and torques -// --------------------------------------------------------------------------- -template -void CRML_GPU_Memory2T::loop(const bool _eflag, const bool _vflag) { - // Compute the block size and grid size to keep all cores busy - const int BX=this->_block_bio_size; - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - - int GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - - int ainum=this->ans->inum(); - int anall=this->atom->nall(); - int nbor_pitch=this->nbor->nbor_pitch(); - this->time_pair.start(); - if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(), - &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &anall, &nbor_pitch, - &this->atom->dev_q.begin(), &_cut_coulsq, - &_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq, - &_cut_ljsq, &_cut_lj_innersq, - &this->_threads_per_atom); - } else { - this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), - &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &anall, &nbor_pitch, &this->atom->dev_q.begin(), - &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, - &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, - &this->_threads_per_atom); - } - this->time_pair.stop(); -} - -template class CRML_GPU_Memory2; diff --git a/lib/gpu/crml_gpu_memory2.h b/lib/gpu/crml_gpu_memory2.h deleted file mode 100644 index 9ae11ba6ac..0000000000 --- a/lib/gpu/crml_gpu_memory2.h +++ /dev/null @@ -1,86 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: Mike Brown (ORNL), brownw@ornl.gov -------------------------------------------------------------------------- */ - -#ifndef CRML_GPU_MEMORY2_H -#define CRML_GPU_MEMORY2_H - -#include "charge_gpu_memory2.h" - -template -class CRML_GPU_Memory2 : public ChargeGPUMemory2 { - public: - CRML_GPU_Memory2(); - ~CRML_GPU_Memory2(); - - /// Clear any previous data and set up for a new LAMMPS run - /** \param max_nbors initial number of rows in the neighbor matrix - * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device - * - * Returns: - * - 0 if successfull - * - -1 if fix gpu not found - * - -3 if there is an out of memory error - * - -4 if the GPU library was not compiled for GPU - * - -5 Double precision is not supported on card **/ - int init(const int ntypes, double host_cut_bothsq, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, double host_cut_ljsq, - const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double g_ewald, - const double cut_lj_innersq, const double denom_lj, - double **epsilon, double **sigma, const bool mix_arithmetic); - - /// Clear all host and device data - /** \note This is called at the beginning of the init() routine **/ - void clear(); - - /// Returns memory usage on device per atom - int bytes_per_atom(const int max_nbors) const; - - /// Total host memory used by library for pair style - double host_memory_usage() const; - - // --------------------------- TYPE DATA -------------------------- - - /// x = lj1, y = lj2, z = lj3, w = lj4 - UCL_D_Vec lj1; - /// x = epsilon, y = sigma - UCL_D_Vec ljd; - /// Special LJ values [0-3] and Special Coul values [4-7] - UCL_D_Vec sp_lj; - - /// If atom type constants fit in shared memory, use fast kernels - bool shared_types; - - /// Number of atom types - int _lj_types; - - numtyp _qqrd2e, _g_ewald, _denom_lj; - - numtyp _cut_coulsq, _cut_bothsq, _cut_ljsq, _cut_lj_innersq; - - private: - bool _allocated; - void loop(const bool _eflag, const bool _vflag); -}; - -#endif - diff --git a/lib/gpu/gb_gpu_kernel.cu b/lib/gpu/gb_gpu_kernel.cu index d1abd4338b..7bb320f5d0 100644 --- a/lib/gpu/gb_gpu_kernel.cu +++ b/lib/gpu/gb_gpu_kernel.cu @@ -368,7 +368,7 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[7][BLOCK_BIO_PAIR]; + __local acctyp red_acc[7][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; diff --git a/lib/gpu/gb_gpu_kernel_lj.cu b/lib/gpu/gb_gpu_kernel_lj.cu index 91c1355c64..657fc20cd5 100644 --- a/lib/gpu/gb_gpu_kernel_lj.cu +++ b/lib/gpu/gb_gpu_kernel_lj.cu @@ -248,7 +248,7 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; @@ -387,7 +387,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; @@ -535,7 +535,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; diff --git a/lib/gpu/lj96_cut_gpu_kernel.cu b/lib/gpu/lj96_cut_gpu_kernel.cu index 39a17d89ed..3fc6a2f308 100644 --- a/lib/gpu/lj96_cut_gpu_kernel.cu +++ b/lib/gpu/lj96_cut_gpu_kernel.cu @@ -176,7 +176,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; @@ -332,7 +332,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; diff --git a/lib/gpu/lj_cut_gpu_kernel.cu b/lib/gpu/lj_cut_gpu_kernel.cu index 1eadc7055d..75f36446f7 100644 --- a/lib/gpu/lj_cut_gpu_kernel.cu +++ b/lib/gpu/lj_cut_gpu_kernel.cu @@ -175,7 +175,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; @@ -330,7 +330,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; diff --git a/lib/gpu/lj_expand_gpu_kernel.cu b/lib/gpu/lj_expand_gpu_kernel.cu index df3f0ff9ae..2d09b4d941 100644 --- a/lib/gpu/lj_expand_gpu_kernel.cu +++ b/lib/gpu/lj_expand_gpu_kernel.cu @@ -178,7 +178,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; @@ -337,7 +337,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; diff --git a/lib/gpu/ljc_cut_gpu_kernel.cu b/lib/gpu/ljc_cut_gpu_kernel.cu index 2b2cccb284..44a607588a 100644 --- a/lib/gpu/ljc_cut_gpu_kernel.cu +++ b/lib/gpu/ljc_cut_gpu_kernel.cu @@ -208,7 +208,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; @@ -388,7 +388,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; diff --git a/lib/gpu/ljcl_cut_gpu_kernel.cu b/lib/gpu/ljcl_cut_gpu_kernel.cu index 000ecac616..7be7a86114 100644 --- a/lib/gpu/ljcl_cut_gpu_kernel.cu +++ b/lib/gpu/ljcl_cut_gpu_kernel.cu @@ -224,7 +224,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; @@ -411,7 +411,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; diff --git a/lib/gpu/morse_gpu_kernel.cu b/lib/gpu/morse_gpu_kernel.cu index 3c483efac2..0a89aae070 100644 --- a/lib/gpu/morse_gpu_kernel.cu +++ b/lib/gpu/morse_gpu_kernel.cu @@ -176,7 +176,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; @@ -333,7 +333,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in, // Reduce answers if (t_per_atom>1) { - __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=f.x; red_acc[1][tid]=f.y; diff --git a/lib/gpu/nv_kernel_def.h b/lib/gpu/nv_kernel_def.h index f3f6ea4c36..8ac66a2db7 100644 --- a/lib/gpu/nv_kernel_def.h +++ b/lib/gpu/nv_kernel_def.h @@ -32,7 +32,8 @@ #if (ARCH < 200) -#define THREADS_PER_ATOM 8 +#define THREADS_PER_ATOM 1 +#define THREADS_PER_CHARGE 8 #define BLOCK_NBOR_BUILD 64 #define BLOCK_PAIR 64 #define BLOCK_BIO_PAIR 64 @@ -40,7 +41,8 @@ #else -#define THREADS_PER_ATOM 8 +#define THREADS_PER_ATOM 1 +#define THREADS_PER_CHARGE 8 #define BLOCK_NBOR_BUILD 128 #define BLOCK_PAIR 128 #define BLOCK_BIO_PAIR 128 diff --git a/lib/gpu/pair_gpu_dev_kernel.cu b/lib/gpu/pair_gpu_dev_kernel.cu index 99f224b099..73411c99ff 100644 --- a/lib/gpu/pair_gpu_dev_kernel.cu +++ b/lib/gpu/pair_gpu_dev_kernel.cu @@ -30,6 +30,11 @@ Definition: Number of threads guaranteed to be on the same instruction THREADS_PER_ATOM Definition: Default number of threads assigned per atom for pair styles + Restructions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE + THREADS_PER_CHARGE + Definition: Default number of threads assigned per atom for pair styles + with charge + Restructions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE PPPM_MAX_SPLINE Definition: Maximum order for splines in PPPM PPPM_BLOCK_1D @@ -73,6 +78,7 @@ #define MEM_THREADS 16 #define WARP_SIZE 1 #define THREADS_PER_ATOM 1 +#define THREADS_PER_CHARGE 1 #define BLOCK_PAIR 64 #define MAX_SHARED_TYPES 8 #define BLOCK_NBOR_BUILD 64 @@ -107,6 +113,8 @@ __kernel void kernel_info(__global int *info) { info[10]=BLOCK_NBOR_BUILD; info[11]=BLOCK_BIO_PAIR; info[12]=MAX_BIO_SHARED_TYPES; + info[13]=THREADS_PER_CHARGE; } #endif + diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp index 056db9fb39..cadc8c59b0 100644 --- a/lib/gpu/pair_gpu_device.cpp +++ b/lib/gpu/pair_gpu_device.cpp @@ -52,6 +52,7 @@ int PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, omp_set_num_threads(nthreads); #endif _threads_per_atom=t_per_atom; + _threads_per_charge=t_per_atom; if (_device_init) return 0; @@ -381,7 +382,8 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, PairGPUNbor &nbor, const double avg_split, const double max_bytes, const double gpu_overhead, - const double driver_overhead, FILE *screen) { + const double driver_overhead, + const int threads_per_atom, FILE *screen) { double single[8], times[8]; single[0]=atom.transfer_time()+ans.transfer_time(); @@ -420,7 +422,7 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, } fprintf(screen,"GPU Overhead: %.4f s.\n",times[5]/_replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); - fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom); + fprintf(screen,"Threads / atom: %d.\n",threads_per_atom); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size); fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size); @@ -534,8 +536,8 @@ int PairGPUDeviceT::compile_kernels() { k_info.set_function(*dev_program,"kernel_info"); _compiled=true; - UCL_H_Vec h_gpu_lib_data(13,*gpu,UCL_NOT_PINNED); - UCL_D_Vec d_gpu_lib_data(13,*gpu); + UCL_H_Vec h_gpu_lib_data(14,*gpu,UCL_NOT_PINNED); + UCL_D_Vec d_gpu_lib_data(14,*gpu); k_info.set_size(1,1); k_info.run(&d_gpu_lib_data.begin()); ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false); @@ -549,6 +551,8 @@ int PairGPUDeviceT::compile_kernels() { _warp_size=h_gpu_lib_data[2]; if (_threads_per_atom<1) _threads_per_atom=h_gpu_lib_data[3]; + if (_threads_per_charge<1) + _threads_per_charge=h_gpu_lib_data[13]; _pppm_max_spline=h_gpu_lib_data[4]; _pppm_block=h_gpu_lib_data[5]; _block_pair=h_gpu_lib_data[6]; @@ -567,6 +571,10 @@ int PairGPUDeviceT::compile_kernels() { _threads_per_atom=_warp_size; if (_warp_size%_threads_per_atom!=0) _threads_per_atom=1; + if (_threads_per_charge>_warp_size) + _threads_per_charge=_warp_size; + if (_warp_size%_threads_per_charge!=0) + _threads_per_charge=1; return flag; } diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h index dc2e2e5ea8..8d1cefd823 100644 --- a/lib/gpu/pair_gpu_device.h +++ b/lib/gpu/pair_gpu_device.h @@ -112,7 +112,8 @@ class PairGPUDevice { void output_times(UCL_Timer &time_pair, PairGPUAns &ans, PairGPUNbor &nbor, const double avg_split, const double max_bytes, const double gpu_overhead, - const double driver_overhead, FILE *screen); + const double driver_overhead, + const int threads_per_atom, FILE *screen); /// Output a message with timing information void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out, @@ -203,6 +204,8 @@ class PairGPUDevice { inline int num_mem_threads() const { return _num_mem_threads; } /// Return the number of threads per atom for pair styles inline int threads_per_atom() const { return _threads_per_atom; } + /// Return the number of threads per atom for pair styles using charge + inline int threads_per_charge() const { return _threads_per_charge; } /// Return the min of the pair block size or the device max block size inline int pair_block_size() const { return _block_pair; } /// Return the maximum number of atom types that can be used with shared mem @@ -277,7 +280,7 @@ class PairGPUDevice { double _particle_split; double _cpu_full; - int _num_mem_threads, _warp_size, _threads_per_atom; + int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge; int _pppm_max_spline, _pppm_block; int _block_pair, _max_shared_types; int _block_cell_2d, _block_cell_id, _block_nbor_build; diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index 0f2bbfc095..83c1f61aa1 100644 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -36,8 +36,6 @@ if (test $1 = 1) then if (test -e ../pair_lj_charmm_coul_long.cpp) then cp pair_lj_charmm_coul_long_gpu.cpp .. cp pair_lj_charmm_coul_long_gpu.h .. - cp pair_lj_charmm_coul_long_gpu2.cpp .. - cp pair_lj_charmm_coul_long_gpu2.h .. fi if (test -e ../pair_cg_cmm.cpp) then @@ -92,7 +90,6 @@ elif (test $1 = 0) then rm ../pair_lj_cut_coul_cut_gpu.cpp rm ../pair_lj_cut_coul_long_gpu.cpp rm ../pair_lj_charmm_coul_long_gpu.cpp - rm ../pair_lj_charmm_coul_long_gpu2.cpp rm ../pair_lj_cut_tgpu.cpp rm ../pair_cg_cmm_gpu.cpp rm ../pair_cg_cmm_coul_long_gpu.cpp @@ -112,7 +109,6 @@ elif (test $1 = 0) then rm ../pair_lj_cut_coul_cut_gpu.h rm ../pair_lj_cut_coul_long_gpu.h rm ../pair_lj_charmm_coul_long_gpu.h - rm ../pair_lj_charmm_coul_long_gpu2.h rm ../pair_lj_cut_tgpu.cpp rm ../pair_cg_cmm_gpu.h rm ../pair_cg_cmm_coul_long_gpu.h diff --git a/src/GPU/pair_gayberne_gpu.cpp b/src/GPU/pair_gayberne_gpu.cpp index 6220346977..cbfb299db0 100644 --- a/src/GPU/pair_gayberne_gpu.cpp +++ b/src/GPU/pair_gayberne_gpu.cpp @@ -170,7 +170,7 @@ void PairGayBerneGPU::init_style() double cell_size = sqrt(maxcut) + neighbor->skin; int success = gb_gpu_init(atom->ntypes+1, gamma, upsilon, mu, - shape1, well, cutsq, sigma, epsilon, lshape, form, + shape2, well, cutsq, sigma, epsilon, lshape, form, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, atom->nlocal+atom->nghost, 300, cell_size, gpu_mode, screen); diff --git a/src/GPU/pair_lj_charmm_coul_long_gpu2.cpp b/src/GPU/pair_lj_charmm_coul_long_gpu2.cpp deleted file mode 100644 index fd5c55e7dd..0000000000 --- a/src/GPU/pair_lj_charmm_coul_long_gpu2.cpp +++ /dev/null @@ -1,344 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing author: Mike Brown (SNL) -------------------------------------------------------------------------- */ - -#include "lmptype.h" -#include "math.h" -#include "stdio.h" -#include "stdlib.h" -#include "pair_lj_charmm_coul_long_gpu2.h" -#include "atom.h" -#include "atom_vec.h" -#include "comm.h" -#include "force.h" -#include "neighbor.h" -#include "neigh_list.h" -#include "integrate.h" -#include "memory.h" -#include "error.h" -#include "neigh_request.h" -#include "universe.h" -#include "update.h" -#include "domain.h" -#include "string.h" -#include "kspace.h" -#include "gpu_extra.h" - -#define MIN(a,b) ((a) < (b) ? (a) : (b)) -#define MAX(a,b) ((a) > (b) ? (a) : (b)) - -#define EWALD_F 1.12837917 -#define EWALD_P 0.3275911 -#define A1 0.254829592 -#define A2 -0.284496736 -#define A3 1.421413741 -#define A4 -1.453152027 -#define A5 1.061405429 - -// External functions from cuda library for atom decomposition - -int crml_gpu_init2(const int ntypes, double cut_bothsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald, const double cut_lj_innersq, - const double denom_lj, double **epsilon, double **sigma, - const bool mix_arithmetic); -void crml_gpu_clear2(); -int ** crml_gpu_compute_n2(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, int *tag, - int **nspecial, int **special, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, double *host_q, - double *boxlo, double *prd); -void crml_gpu_compute2(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q, - const int nlocal, double *boxlo, double *prd); -double crml_gpu_bytes2(); - -using namespace LAMMPS_NS; - -/* ---------------------------------------------------------------------- */ - -PairLJCharmmCoulLongGPU2::PairLJCharmmCoulLongGPU2(LAMMPS *lmp) : - PairLJCharmmCoulLong(lmp), gpu_mode(GPU_PAIR) -{ - respa_enable = 0; - cpu_time = 0.0; -} - -/* ---------------------------------------------------------------------- - free all arrays -------------------------------------------------------------------------- */ - -PairLJCharmmCoulLongGPU2::~PairLJCharmmCoulLongGPU2() -{ - crml_gpu_clear2(); -} - -/* ---------------------------------------------------------------------- */ - -void PairLJCharmmCoulLongGPU2::compute(int eflag, int vflag) -{ - if (eflag || vflag) ev_setup(eflag,vflag); - else evflag = vflag_fdotr = 0; - - int nall = atom->nlocal + atom->nghost; - int inum, host_start; - - bool success = true; - int *ilist, *numneigh, **firstneigh; - if (gpu_mode == GPU_NEIGH) { - inum = atom->nlocal; - firstneigh = crml_gpu_compute_n2(neighbor->ago, inum, nall, atom->x, - atom->type, domain->sublo, domain->subhi, - atom->tag, atom->nspecial, atom->special, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, atom->q, domain->boxlo, - domain->prd); - } else { - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - crml_gpu_compute2(neighbor->ago, inum, nall, atom->x, atom->type, - ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, atom->q, - atom->nlocal, domain->boxlo, domain->prd); - } - if (!success) - error->one("Out of memory on GPGPU"); - - if (host_startq_flag) - error->all("Pair style lj/charmm/coul/long requires atom attribute q"); - if (force->newton_pair) - error->all("Cannot use newton pair with GPU CHARMM pair style"); - - // Repeat cutsq calculation because done after call to init_style - double cut; - for (int i = 1; i <= atom->ntypes; i++) { - for (int j = i; j <= atom->ntypes; j++) { - if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) - cut = init_one(i,j); - } - } - - cut_lj_innersq = cut_lj_inner * cut_lj_inner; - cut_ljsq = cut_lj * cut_lj; - cut_coulsq = cut_coul * cut_coul; - cut_bothsq = MAX(cut_ljsq,cut_coulsq); - - denom_lj = (cut_ljsq-cut_lj_innersq) * (cut_ljsq-cut_lj_innersq) * - (cut_ljsq-cut_lj_innersq); - - double cell_size = sqrt(cut_bothsq) + neighbor->skin; - - // insure use of KSpace long-range solver, set g_ewald - - if (force->kspace == NULL) - error->all("Pair style is incompatible with KSpace style"); - g_ewald = force->kspace->g_ewald; - - // setup force tables - - if (ncoultablebits) init_tables(); - - int maxspecial=0; - if (atom->molecular) - maxspecial=atom->maxspecial; - - bool arithmetic = true; - for (int i = 1; i < atom->ntypes + 1; i++) - for (int j = i + 1; j < atom->ntypes + 1; j++) { - if (epsilon[i][j] != sqrt(epsilon[i][i] * epsilon[j][j])) - arithmetic = false; - if (sigma[i][j] != 0.5 * (sigma[i][i] + sigma[j][j])) - arithmetic = false; - } - - int success = crml_gpu_init2(atom->ntypes+1, cut_bothsq, lj1, lj2, lj3, lj4, - offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen, cut_ljsq, - cut_coulsq, force->special_coul, force->qqrd2e, - g_ewald, cut_lj_innersq,denom_lj,epsilon,sigma, - arithmetic); - GPU_EXTRA::check_flag(success,error,world); - - if (gpu_mode != GPU_NEIGH) { - int irequest = neighbor->request(this); - neighbor->requests[irequest]->half = 0; - neighbor->requests[irequest]->full = 1; - } -} - -/* ---------------------------------------------------------------------- */ - -double PairLJCharmmCoulLongGPU2::memory_usage() -{ - double bytes = Pair::memory_usage(); - return bytes + crml_gpu_bytes2(); -} - -/* ---------------------------------------------------------------------- */ - -void PairLJCharmmCoulLongGPU2::cpu_compute(int start, int inum, int eflag, - int vflag, int *ilist, - int *numneigh, int **firstneigh) -{ - int i,j,ii,jj,jnum,itype,jtype,itable; - double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; - double fraction,table; - double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; - double grij,expm2,prefactor,t,erfc; - double philj,switch1,switch2; - int *jlist; - double rsq; - - evdwl = ecoul = 0.0; - - double **x = atom->x; - double **f = atom->f; - double *q = atom->q; - int *type = atom->type; - int nlocal = atom->nlocal; - double *special_coul = force->special_coul; - double *special_lj = force->special_lj; - double qqrd2e = force->qqrd2e; - - // loop over neighbors of my atoms - - for (ii = start; ii < inum; ii++) { - i = ilist[ii]; - qtmp = q[i]; - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - jlist = firstneigh[i]; - jnum = numneigh[i]; - - for (jj = 0; jj < jnum; jj++) { - j = jlist[jj]; - factor_lj = special_lj[sbmask(j)]; - factor_coul = special_coul[sbmask(j)]; - j &= NEIGHMASK; - - delx = xtmp - x[j][0]; - dely = ytmp - x[j][1]; - delz = ztmp - x[j][2]; - rsq = delx*delx + dely*dely + delz*delz; - - if (rsq < cut_bothsq) { - r2inv = 1.0/rsq; - - if (rsq < cut_coulsq) { - if (!ncoultablebits || rsq <= tabinnersq) { - r = sqrt(rsq); - grij = g_ewald * r; - expm2 = exp(-grij*grij); - t = 1.0 / (1.0 + EWALD_P*grij); - erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; - prefactor = qqrd2e * qtmp*q[j]/r; - forcecoul = prefactor * (erfc + EWALD_F*grij*expm2); - if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor; - } else { - union_int_float_t rsq_lookup; - rsq_lookup.f = rsq; - itable = rsq_lookup.i & ncoulmask; - itable >>= ncoulshiftbits; - fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable]; - table = ftable[itable] + fraction*dftable[itable]; - forcecoul = qtmp*q[j] * table; - if (factor_coul < 1.0) { - table = ctable[itable] + fraction*dctable[itable]; - prefactor = qtmp*q[j] * table; - forcecoul -= (1.0-factor_coul)*prefactor; - } - } - } else forcecoul = 0.0; - - if (rsq < cut_ljsq) { - r6inv = r2inv*r2inv*r2inv; - jtype = type[j]; - forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); - if (rsq > cut_lj_innersq) { - switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) * - (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) / denom_lj; - switch2 = 12.0*rsq * (cut_ljsq-rsq) * - (rsq-cut_lj_innersq) / denom_lj; - philj = r6inv * (lj3[itype][jtype]*r6inv - lj4[itype][jtype]); - forcelj = forcelj*switch1 + philj*switch2; - } - } else forcelj = 0.0; - - fpair = (forcecoul + factor_lj*forcelj) * r2inv; - - f[i][0] += delx*fpair; - f[i][1] += dely*fpair; - f[i][2] += delz*fpair; - - if (eflag) { - if (rsq < cut_coulsq) { - if (!ncoultablebits || rsq <= tabinnersq) - ecoul = prefactor*erfc; - else { - table = etable[itable] + fraction*detable[itable]; - ecoul = qtmp*q[j] * table; - } - if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; - } else ecoul = 0.0; - - if (rsq < cut_ljsq) { - evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]); - if (rsq > cut_lj_innersq) { - switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) * - (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) / denom_lj; - evdwl *= switch1; - } - evdwl *= factor_lj; - } else evdwl = 0.0; - } - - if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz); - } - } - } -} diff --git a/src/GPU/pair_lj_charmm_coul_long_gpu2.h b/src/GPU/pair_lj_charmm_coul_long_gpu2.h deleted file mode 100644 index b835e08da7..0000000000 --- a/src/GPU/pair_lj_charmm_coul_long_gpu2.h +++ /dev/null @@ -1,47 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -#ifdef PAIR_CLASS - -PairStyle(lj/charmm/coul/long/gpu2,PairLJCharmmCoulLongGPU2) - -#else - -#ifndef LMP_PAIR_LJ_CHARMM_COUL_LONG_GPU2_H -#define LMP_PAIR_LJ_CHARMM_COUL_LONG_GPU2_H - -#include "pair_lj_charmm_coul_long.h" - -namespace LAMMPS_NS { - -class PairLJCharmmCoulLongGPU2 : public PairLJCharmmCoulLong { - public: - PairLJCharmmCoulLongGPU2(LAMMPS *lmp); - ~PairLJCharmmCoulLongGPU2(); - void cpu_compute(int, int, int, int, int *, int *, int **); - void compute(int, int); - void init_style(); - double memory_usage(); - - enum { GPU_PAIR, GPU_NEIGH }; - - private: - int gpu_mode; - double cpu_time; - int *gpulist; -}; - -} -#endif -#endif -