git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6219 f3b2605a-c512-4ea7-a41b-209d697bcdaa

Author: sjplimp
Date: 2011-05-26 22:16:44 +00:00
Parent: cfa91c0611
Commit: 5a8719e38f

21 changed files with 7231 additions and 0 deletions

src/USER-CUDA/Install.sh (Executable file, 74 lines added)

@@ -0,0 +1,74 @@
# Install/unInstall package files in LAMMPS
# edit Makefile.package to include/exclude CUDA library
# do not copy child files if parent does not exist
if (test $1 = 1) then
if (test -e ../Makefile.package) then
sed -i -e '/include ..\/..\/lib\/cuda\/Makefile.common/d' ../Makefile.package
sed -i -e 's/-llammpscuda -lcuda -lcudart -lrt //' ../Makefile.package
sed -i -e 's/-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include //' ../Makefile.package
sed -i -e 's/-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) -DLMP_USER_CUDA //' ../Makefile.package
sed -i '1 i include ..\/..\/lib\/cuda\/Makefile.common' ../Makefile.package
sed -i -e 's|^PKG_INC =[ \t]*|&-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include |' ../Makefile.package
sed -i -e 's|^PKG_PATH =[ \t]*|&-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) |' ../Makefile.package
sed -i -e 's|^PKG_LIB =[ \t]*|&-llammpscuda -lcuda -lcudart -lrt |' ../Makefile.package
fi
cp comm_cuda.cpp ..
cp domain_cuda.cpp ..
cp modify_cuda.cpp ..
cp neighbor_cuda.cpp ..
cp neigh_full_cuda.cpp ..
cp verlet_cuda.cpp ..
cp cuda.cpp ..
cp cuda_neigh_list.cpp ..
cp comm_cuda.h ..
cp domain_cuda.h ..
cp modify_cuda.h ..
cp neighbor_cuda.h ..
cp verlet_cuda.h ..
cp cuda.h ..
cp cuda_common.h ..
cp cuda_data.h ..
cp cuda_modify_flags.h ..
cp cuda_neigh_list.h ..
cp cuda_precision.h ..
cp cuda_shared.h ..
elif (test $1 = 0) then
if (test -e ../Makefile.package) then
sed -i -e '/include ..\/..\/lib\/cuda\/Makefile.common/d' ../Makefile.package
sed -i -e 's/-llammpscuda -lcuda -lcudart -lrt //' ../Makefile.package
sed -i -e 's/-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include //' ../Makefile.package
sed -i -e 's/-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) -DLMP_USER_CUDA //' ../Makefile.package
fi
rm ../comm_cuda.cpp
rm ../domain_cuda.cpp
rm ../modify_cuda.cpp
rm ../neighbor_cuda.cpp
rm ../neigh_full_cuda.cpp
rm ../verlet_cuda.cpp
rm ../cuda.cpp
rm ../cuda_neigh_list.cpp
rm ../comm_cuda.h
rm ../domain_cuda.h
rm ../modify_cuda.h
rm ../neighbor_cuda.h
rm ../verlet_cuda.h
rm ../cuda.h
rm ../cuda_common.h
rm ../cuda_data.h
rm ../cuda_modify_flags.h
rm ../cuda_neigh_list.h
rm ../cuda_precision.h
rm ../cuda_shared.h
fi
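
For reference, a successful install leaves ../Makefile.package with the include line prepended and the three PKG_ settings appended by the sed commands above; a sketch of the resulting fragment, assuming a stock Makefile.package whose PKG_INC, PKG_PATH, and PKG_LIB assignments start out empty:

include ../../lib/cuda/Makefile.common
PKG_INC = -I../../lib/cuda -I$(CUDA_INSTALL_PATH)/include
PKG_PATH = -L../../lib/cuda -L$(CUDA_INSTALL_PATH)/lib64 -L$(CUDA_INSTALL_PATH)/lib $(USRLIB_CONDITIONAL)
PKG_LIB = -llammpscuda -lcuda -lcudart -lrt

Note that the uninstall branch also strips a -DLMP_USER_CUDA flag from PKG_PATH, which the install seds here never add.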

src/USER-CUDA/comm_cuda.cpp (Normal file, 1431 lines added)

(File diff suppressed because it is too large.)

src/USER-CUDA/comm_cuda.h (Normal file, 69 lines added)

@@ -0,0 +1,69 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifndef LMP_COMM_CUDA_H
#define LMP_COMM_CUDA_H
#include "pointers.h"
#include "cuda_data.h"
#include "comm.h"
namespace LAMMPS_NS {
class CommCuda : public Comm {
public:
CommCuda(class LAMMPS *);
~CommCuda();
virtual void init();
virtual void setup(); // setup 3d communication pattern
virtual void forward_comm(int mode=0); // forward communication of atom coords
virtual void forward_comm_cuda();
virtual void forward_comm_pack_cuda();
virtual void forward_comm_transfer_cuda();
virtual void forward_comm_unpack_cuda();
virtual void forward_comm_pair(Pair *pair);
virtual void reverse_comm(); // reverse communication of forces
virtual void exchange(); // move atoms to new procs
virtual void exchange_cuda(); // move atoms to new procs
virtual void borders(); // setup list of atoms to communicate
virtual void borders_cuda(); // setup list of atoms to communicate
virtual void borders_cuda_overlap_forward_comm();
virtual void forward_comm_fix(class Fix *); // forward comm from a Fix
protected:
class Cuda *cuda;
cCudaData<int, int, xy>* cu_pbc;
cCudaData<double, X_FLOAT, x>* cu_slablo;
cCudaData<double, X_FLOAT, x>* cu_slabhi;
cCudaData<double, X_FLOAT, xy>* cu_multilo;
cCudaData<double, X_FLOAT, xy>* cu_multihi;
cCudaData<int, int, xy>* cu_sendlist;
virtual void grow_send(int,int); // reallocate send buffer
virtual void grow_recv(int); // free/allocate recv buffer
virtual void grow_list(int, int); // reallocate one sendlist
virtual void grow_swap(int); // grow swap and multi arrays
virtual void allocate_swap(int); // allocate swap arrays
virtual void allocate_multi(int); // allocate multi arrays
virtual void free_swap(); // free swap arrays
virtual void free_multi(); // free multi arrays
};
}
#endif
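
The pack/transfer/unpack split declared above appears intended to let forward communication be staged and overlapped with other work (cf. the overlap_comm option handled in cuda.cpp below). A standalone mock illustrating only the stage ordering implied by the method names; the real methods drive GPU buffers and MPI:

#include <cstdio>

// Mock of the staged forward communication declared in CommCuda above;
// only the pack -> transfer -> unpack ordering is illustrated here.
struct MockCommCuda {
  void forward_comm_pack_cuda()     { std::printf("pack border-atom coords into a GPU buffer\n"); }
  void forward_comm_transfer_cuda() { std::printf("transfer packed buffers between procs\n"); }
  void forward_comm_unpack_cuda()   { std::printf("unpack received coords into ghost slots\n"); }
};

int main()
{
  MockCommCuda comm;
  comm.forward_comm_pack_cuda();      // stage 1
  comm.forward_comm_transfer_cuda();  // stage 2
  comm.forward_comm_unpack_cuda();    // stage 3
  return 0;
}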

src/USER-CUDA/cuda.cpp (Normal file, 837 lines added)

@@ -0,0 +1,837 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include "cuda.h"
#include "atom.h"
#include "domain.h"
#include "force.h"
#include "pair.h"
#include "update.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "universe.h"
#include "input.h"
#include "error.h"
#include "cuda_neigh_list.h"
//#include "pre_binning_cu.h"
#include "binning_cu.h"
//#include "reverse_binning_cu.h"
#include <ctime>
#include <cmath>
#include "cuda_pair_cu.h"
#include "cuda_cu.h"
using namespace LAMMPS_NS;
#define MAX(a,b) ((a) > (b) ? (a) : (b))
Cuda::Cuda(LAMMPS *lmp) : Pointers(lmp)
{
cuda_exists=true;
lmp->cuda=this;
if(universe->me==0)
printf("# Using LAMMPS_CUDA \n");
shared_data.me=universe->me;
device_set=false;
Cuda_Cuda_GetCompileSettings(&shared_data);
if(shared_data.compile_settings.prec_glob!=sizeof(CUDA_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: Global Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_glob, sizeof(CUDA_FLOAT)/4);
if(shared_data.compile_settings.prec_x!=sizeof(X_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: X Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_x, sizeof(X_FLOAT)/4);
if(shared_data.compile_settings.prec_v!=sizeof(V_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: V Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_v, sizeof(V_FLOAT)/4);
if(shared_data.compile_settings.prec_f!=sizeof(F_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: F Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_f, sizeof(F_FLOAT)/4);
if(shared_data.compile_settings.prec_pppm!=sizeof(PPPM_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: PPPM Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_pppm, sizeof(PPPM_FLOAT)/4);
if(shared_data.compile_settings.prec_fft!=sizeof(FFT_FLOAT)/4) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: FFT Precision: cuda %i cpp %i\n\n",shared_data.compile_settings.prec_fft, sizeof(FFT_FLOAT)/4);
#ifdef FFT_CUFFT
if(shared_data.compile_settings.cufft!=1) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: cufft: cuda %i cpp %i\n\n",shared_data.compile_settings.cufft, 1);
#else
if(shared_data.compile_settings.cufft!=0) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: cufft: cuda %i cpp %i\n\n",shared_data.compile_settings.cufft, 0);
#endif
if(shared_data.compile_settings.arch!=CUDA_ARCH) printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n # CUDA WARNING: arch: cuda %i cpp %i\n\n",shared_data.compile_settings.arch, CUDA_ARCH);
cu_x = 0;
cu_v = 0;
cu_f = 0;
cu_tag = 0;
cu_type = 0;
cu_mask = 0;
cu_image = 0;
cu_xhold = 0;
cu_q = 0;
cu_rmass = 0;
cu_mass = 0;
cu_virial = 0;
cu_eatom = 0;
cu_vatom = 0;
cu_radius = 0;
cu_density = 0;
cu_omega = 0;
cu_torque = 0;
cu_special = 0;
cu_nspecial = 0;
cu_molecule = 0;
cu_x_type = 0;
x_type = 0;
cu_v_radius = 0;
v_radius = 0;
cu_omega_rmass = 0;
omega_rmass = 0;
binned_id = 0;
cu_binned_id = 0;
binned_idnew = 0;
cu_binned_idnew = 0;
cu_map_array = 0;
copy_buffer=0;
copy_buffersize=0;
neighbor_decide_by_integrator=0;
pinned=true;
debugdata=0; // the debug buffer is allocated later in Cuda::allocate()
finished_setup = false;
begin_setup = false;
finished_run = false;
setSharedDataZero();
uploadtime=0;
downloadtime=0;
dotiming=false;
dotestatom = false;
testatom = 0;
oncpu = true;
self_comm = 0;
MYDBG( printf("# CUDA: Cuda::Cuda Done...\n");)
//cCudaData<double, float, yx >
}
Cuda::~Cuda()
{
print_timings();
if(universe->me==0) printf("# CUDA: Free memory...\n");
delete cu_q;
delete cu_x;
delete cu_v;
delete cu_f;
delete cu_tag;
delete cu_type;
delete cu_mask;
delete cu_image;
delete cu_xhold;
delete cu_mass;
delete cu_rmass;
delete cu_virial;
delete cu_eng_vdwl;
delete cu_eng_coul;
delete cu_eatom;
delete cu_vatom;
delete cu_radius;
delete cu_density;
delete cu_omega;
delete cu_torque;
delete cu_molecule;
delete cu_x_type;
delete [] x_type;
delete cu_v_radius;
delete [] v_radius;
delete cu_omega_rmass;
delete [] omega_rmass;
delete cu_map_array;
std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.begin();
while(p != neigh_lists.end())
{
delete p->second;
++p;
}
}
void Cuda::accelerator(int narg, char** arg)
{
if(device_set) return;
if(universe->me==0)
printf("# CUDA: Activate GPU \n");
int* devicelist=NULL;
int pppn=2;
for(int i=0;i<narg;i++)
{
if(strcmp(arg[i],"gpu/node")==0)
{
if(++i==narg)
error->all("Invalid Options for 'accelerator' command. Expecting a number or keyword 'special' after 'gpu/node' option.");
if(strcmp(arg[i],"special")==0)
{
if(++i==narg)
error->all("Invalid Options for 'accelerator' command. Expecting number of GPUs to be used per node after keyword 'gpu/node special'.");
pppn=atoi(arg[i]);
if(pppn<1) error->all("Invalid Options for 'accelerator' command. Expecting number of GPUs to be used per node after keyword 'gpu/node special'.");
if(i+pppn>=narg)
error->all("Invalid Options for 'accelerator' command. Expecting list of device ids after keyword 'gpu/node special'.");
devicelist=new int[pppn];
for(int k=0;k<pppn;k++)
{i++;devicelist[k]=atoi(arg[i]);}
}
else
pppn=atoi(arg[i]);
}
if(strcmp(arg[i],"pinned")==0)
{
if(++i==narg)
error->all("Invalid Options for 'accelerator' command. Expecting a number after 'pinned' option.");
pinned=atoi(arg[i])==0?false:true;
if((pinned==false)&&(universe->me==0)) printf(" #CUDA: Pinned memory is not used for communication\n");
}
if(strcmp(arg[i],"dotiming")==0)
{
dotiming=true;
}
if(strcmp(arg[i],"suffix")==0)
{
if(++i==narg)
error->all("Invalid Options for 'accelerator' command. Expecting a string after 'suffix' option.");
strcpy(lmp->asuffix,arg[i]);
}
if(strcmp(arg[i],"overlap_comm")==0)
{
shared_data.overlap_comm=1;
}
if(strcmp(arg[i],"dotest")==0)
{
if(++i==narg)
error->all("Invalid Options for 'accelerator' command. Expecting a number after 'dotest' option.");
testatom=atoi(arg[i]);
dotestatom=true;
}
if(strcmp(arg[i],"override_bpa")==0)
{
if(++i==narg)
error->all("Invalid Options for 'accelerator' command. Expecting a number after 'override_bpa' option.");
shared_data.pair.override_block_per_atom = atoi(arg[i]);
}
}
CudaWrapper_Init(0, (char**)0,universe->me,pppn,devicelist);
//if(shared_data.overlap_comm)
CudaWrapper_AddStreams(3);
cu_x = 0;
cu_v = 0;
cu_f = 0;
cu_tag = 0;
cu_type = 0;
cu_mask = 0;
cu_image = 0;
cu_xhold = 0;
cu_q = 0;
cu_rmass = 0;
cu_mass = 0;
cu_virial = 0;
cu_eatom = 0;
cu_vatom = 0;
cu_radius = 0;
cu_density = 0;
cu_omega = 0;
cu_torque = 0;
cu_special = 0;
cu_nspecial = 0;
cu_molecule = 0;
cu_x_type = 0;
cu_v_radius = 0;
cu_omega_rmass = 0;
cu_binned_id = 0;
cu_binned_idnew = 0;
device_set=true;
allocate();
delete [] devicelist;
}
void Cuda::setSharedDataZero()
{
MYDBG(printf("# CUDA: Cuda::setSharedDataZero ...\n");)
shared_data.atom.nlocal = 0;
shared_data.atom.nghost = 0;
shared_data.atom.nall = 0;
shared_data.atom.nmax = 0;
shared_data.atom.ntypes = 0;
shared_data.atom.q_flag = 0;
shared_data.atom.need_eatom = 0;
shared_data.atom.need_vatom = 0;
shared_data.pair.cudable_force = 0;
shared_data.pair.collect_forces_later = 0;
shared_data.pair.use_block_per_atom = 0;
shared_data.pair.override_block_per_atom = -1;
shared_data.pair.cut = 0;
shared_data.pair.cutsq = 0;
shared_data.pair.cut_inner = 0;
shared_data.pair.cut_coul = 0;
shared_data.pair.special_lj = 0;
shared_data.pair.special_coul = 0;
shared_data.pppm.cudable_force = 0;
shared_data.buffersize = 0;
shared_data.buffer_new = 1;
shared_data.buffer = NULL;
shared_data.comm.comm_phase=0;
shared_data.overlap_comm=0;
shared_data.comm.buffer = NULL;
shared_data.comm.buffer_size=0;
shared_data.comm.overlap_split_ratio=0;
// setTimingsZero();
}
void Cuda::allocate()
{
accelerator(0,NULL);
MYDBG(printf("# CUDA: Cuda::allocate ...\n");)
if(not cu_virial)
{
cu_virial = new cCudaData<double, ENERGY_FLOAT, x > (NULL, & shared_data.pair.virial , 6);
cu_eng_vdwl = new cCudaData<double, ENERGY_FLOAT, x > (NULL, & shared_data.pair.eng_vdwl ,1);
cu_eng_coul = new cCudaData<double, ENERGY_FLOAT, x > (NULL, & shared_data.pair.eng_coul ,1);
cu_extent = new cCudaData<double, double, x> (extent, 6);
shared_data.flag = CudaWrapper_AllocCudaData(sizeof(int));
int size=2*CUDA_MAX_DEBUG_SIZE;
debugdata = new int[size];
cu_debugdata = new cCudaData<int, int, x > (debugdata , size);
shared_data.debugdata=cu_debugdata->dev_data();
}
checkResize();
setSystemParams();
MYDBG(printf("# CUDA: Cuda::allocate done...\n");)
}
void Cuda::setSystemParams()
{
MYDBG(printf("# CUDA: Cuda::setSystemParams ...\n");)
shared_data.atom.nlocal = atom->nlocal;
shared_data.atom.nghost = atom->nghost;
shared_data.atom.nall = atom->nlocal + atom->nghost;
shared_data.atom.ntypes = atom->ntypes;
shared_data.atom.q_flag = atom->q_flag;
shared_data.atom.rmass_flag = atom->rmass_flag;
MYDBG(printf("# CUDA: Cuda::setSystemParams done ...\n");)
}
void Cuda::setDomainParams()
{
MYDBG(printf("# CUDA: Cuda::setDomainParams ...\n");)
cuda_shared_domain* cu_domain = &shared_data.domain;
cu_domain->triclinic = domain->triclinic;
for(short i=0; i<3; ++i)
{
cu_domain->periodicity[i] = domain->periodicity[i];
cu_domain->sublo[i] = domain->sublo[i];
cu_domain->subhi[i] = domain->subhi[i];
cu_domain->boxlo[i] = domain->boxlo[i];
cu_domain->boxhi[i] = domain->boxhi[i];
cu_domain->prd[i] = domain->prd[i];
}
if(domain->triclinic)
{
for(short i=0; i<3; ++i)
{
cu_domain->boxlo_lamda[i] = domain->boxlo_lamda[i];
cu_domain->boxhi_lamda[i] = domain->boxhi_lamda[i];
cu_domain->prd_lamda[i] = domain->prd_lamda[i];
}
cu_domain->xy = domain->xy;
cu_domain->xz = domain->xz;
cu_domain->yz = domain->yz;
}
for(int i=0;i<6;i++)
{
cu_domain->h[i]=domain->h[i];
cu_domain->h_inv[i]=domain->h_inv[i];
cu_domain->h_rate[i]=domain->h_rate[i];
}
cu_domain->update=2;
MYDBG(printf("# CUDA: Cuda::setDomainParams done ...\n");)
}
void Cuda::checkResize()
{
MYDBG(printf("# CUDA: Cuda::checkResize ...\n");)
accelerator(0,NULL);
cuda_shared_atom* cu_atom = & shared_data.atom;
cuda_shared_pair* cu_pair = & shared_data.pair;
cu_atom->q_flag = atom->q_flag;
cu_atom->rmass_flag = atom->rmass ? 1 : 0;
cu_atom->nall = atom->nlocal + atom->nghost;
cu_atom->nlocal = atom->nlocal;
cu_atom->nghost = atom->nghost;
// do we have more atoms to upload than currently allocated memory on device? (also true if nothing yet allocated)
if(atom->nmax > cu_atom->nmax || cu_tag == NULL)
{
delete cu_x; cu_x = new cCudaData<double, X_FLOAT, yx> ((double*)atom->x , & cu_atom->x , atom->nmax, 3,0,true); //cu_x->set_buffer(&(shared_data.buffer),&(shared_data.buffersize),true);
delete cu_v; cu_v = new cCudaData<double, V_FLOAT, yx> ((double*)atom->v, & cu_atom->v , atom->nmax, 3);
delete cu_f; cu_f = new cCudaData<double, F_FLOAT, yx> ((double*)atom->f, & cu_atom->f , atom->nmax, 3,0,true);
delete cu_tag; cu_tag = new cCudaData<int , int , x > (atom->tag , & cu_atom->tag , atom->nmax );
delete cu_type; cu_type = new cCudaData<int , int , x > (atom->type , & cu_atom->type , atom->nmax );
delete cu_mask; cu_mask = new cCudaData<int , int , x > (atom->mask , & cu_atom->mask , atom->nmax );
delete cu_image; cu_image = new cCudaData<int , int , x > (atom->image , & cu_atom->image , atom->nmax );
if(atom->rmass)
{delete cu_rmass; cu_rmass = new cCudaData<double, V_FLOAT, x > (atom->rmass , & cu_atom->rmass , atom->nmax );}
if(cu_atom->q_flag)
{delete cu_q; cu_q = new cCudaData<double, F_FLOAT, x > ((double*)atom->q, & cu_atom->q , atom->nmax );}// cu_q->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
/*
if(force->pair)
if(force->pair->eatom)
{delete cu_eatom; cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > (force->pair->eatom, & cu_atom->eatom , atom->nmax );}// cu_eatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
if(force->pair)
if(force->pair->vatom)
{delete cu_vatom; cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)force->pair->vatom, & cu_atom->vatom , atom->nmax,6 );}// cu_vatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
*/
if(atom->radius)
{
delete cu_radius; cu_radius = new cCudaData<double, X_FLOAT, x > (atom->radius , & cu_atom->radius , atom->nmax );
delete cu_v_radius; cu_v_radius = new cCudaData<V_FLOAT, V_FLOAT, x> (v_radius , & cu_atom->v_radius , atom->nmax*4);
delete cu_omega_rmass; cu_omega_rmass = new cCudaData<V_FLOAT, V_FLOAT, x> (omega_rmass , & cu_atom->omega_rmass , atom->nmax*4);
}
/*
if(atom->density)
{delete cu_density; cu_density = new cCudaData<double, F_FLOAT, x > (atom->density , & cu_atom->density , atom->nmax );}
*/
if(atom->omega)
{delete cu_omega; cu_omega = new cCudaData<double, V_FLOAT, yx > (((double*) atom->omega) , & cu_atom->omega , atom->nmax,3 );}
if(atom->torque)
{delete cu_torque; cu_torque = new cCudaData<double, F_FLOAT, yx > (((double*) atom->torque) , & cu_atom->torque , atom->nmax,3 );}
if(atom->special)
{delete cu_special; cu_special = new cCudaData<int, int, yx > (((int*) &(atom->special[0][0])) , & cu_atom->special , atom->nmax,atom->maxspecial ); shared_data.atom.maxspecial=atom->maxspecial;}
if(atom->nspecial)
{delete cu_nspecial; cu_nspecial = new cCudaData<int, int, yx > (((int*) atom->nspecial) , & cu_atom->nspecial , atom->nmax,3 );}
if(atom->molecule)
{delete cu_molecule; cu_molecule = new cCudaData<int, int, x > (((int*) atom->molecule) , & cu_atom->molecule , atom->nmax );}
shared_data.atom.special_flag = neighbor->special_flag;
shared_data.atom.molecular = atom->molecular;
cu_atom->update_nmax = 2;
cu_atom->nmax = atom->nmax;
//delete [] x_type; x_type = new X_FLOAT4[atom->nmax];
delete cu_x_type; cu_x_type = new cCudaData<X_FLOAT, X_FLOAT, x> (x_type , & cu_atom->x_type , atom->nmax*4);
// shared_data.buffer_new = 2;
}
if(((cu_xhold==NULL)||(cu_xhold->get_dim()[0]<neighbor->maxhold))&&neighbor->xhold)
{
delete cu_xhold; cu_xhold = new cCudaData<double, X_FLOAT, yx> ((double*)neighbor->xhold, & cu_atom->xhold , neighbor->maxhold, 3);
shared_data.atom.maxhold=neighbor->maxhold;
}
if(atom->mass && !cu_mass)
{cu_mass = new cCudaData<double, V_FLOAT, x > (atom->mass , & cu_atom->mass , atom->ntypes+1);}
cu_atom->mass_host = atom->mass;
if(atom->map_style==1)
{
if((cu_map_array==NULL))
{
cu_map_array = new cCudaData<int, int, x > (atom->get_map_array() , & cu_atom->map_array , atom->get_map_size() );
}
}
// if any of the host pointers have changed (e.g. re-allocated somewhere else), set to correct pointer
if(cu_x ->get_host_data() != atom->x) cu_x ->set_host_data((double*) (atom->x));
if(cu_v ->get_host_data() != atom->v) cu_v ->set_host_data((double*) (atom->v));
if(cu_f ->get_host_data() != atom->f) cu_f ->set_host_data((double*) (atom->f));
if(cu_tag ->get_host_data() != atom->tag) cu_tag ->set_host_data(atom->tag);
if(cu_type->get_host_data() != atom->type) cu_type->set_host_data(atom->type);
if(cu_mask->get_host_data() != atom->mask) cu_mask->set_host_data(atom->mask);
if(cu_image->get_host_data() != atom->image) cu_image->set_host_data(atom->image);
if(cu_xhold)
if(cu_xhold->get_host_data()!= neighbor->xhold) cu_xhold->set_host_data((double*)(neighbor->xhold));
if(atom->rmass)
if(cu_rmass->get_host_data() != atom->rmass) cu_rmass->set_host_data((double*) (atom->rmass));
if(cu_atom->q_flag)
if(cu_q->get_host_data() != atom->q) cu_q->set_host_data((double*) (atom->q));
if(atom->radius)
if(cu_radius->get_host_data() != atom->radius) cu_radius->set_host_data((double*) (atom->radius));
/*
if(atom->density)
if(cu_density->get_host_data() != atom->density) cu_density->set_host_data((double*) (atom->density));
*/
if(atom->omega)
if(cu_omega->get_host_data() != atom->omega) cu_omega->set_host_data((double*) (atom->omega));
if(atom->torque)
if(cu_torque->get_host_data() != atom->torque) cu_torque->set_host_data((double*) (atom->torque));
if(atom->special)
if(cu_special->get_host_data() != atom->special)
{delete cu_special; cu_special = new cCudaData<int, int, yx > (((int*) atom->special) , & cu_atom->special , atom->nmax,atom->maxspecial ); shared_data.atom.maxspecial=atom->maxspecial;}
if(atom->nspecial)
if(cu_nspecial->get_host_data() != atom->nspecial) cu_nspecial->set_host_data((int*) (atom->nspecial));
if(atom->molecule)
if(cu_molecule->get_host_data() != atom->molecule) cu_molecule->set_host_data((int*) (atom->molecule));
if(force->pair)
if(cu_virial ->get_host_data() != force->pair->virial) cu_virial ->set_host_data(force->pair->virial);
if(force->pair)
if(cu_eng_vdwl ->get_host_data() != &force->pair->eng_vdwl) cu_eng_vdwl ->set_host_data(&force->pair->eng_vdwl);
if(force->pair)
if(cu_eng_coul ->get_host_data() != &force->pair->eng_coul) cu_eng_coul ->set_host_data(&force->pair->eng_coul);
cu_atom->update_nlocal = 2;
MYDBG(printf("# CUDA: Cuda::checkResize done...\n");)
}
void Cuda::evsetup_eatom_vatom(int eflag_atom,int vflag_atom)
{
if(eflag_atom)
{
if(not cu_eatom)
cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > (force->pair->eatom, & (shared_data.atom.eatom) , atom->nmax );// cu_eatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
cu_eatom->set_host_data(force->pair->eatom);
cu_eatom->memset_device(0);
}
if(vflag_atom)
{
if(not cu_vatom)
cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)force->pair->vatom, & (shared_data.atom.vatom) , atom->nmax ,6 );// cu_vatom->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
cu_vatom->set_host_data((double*)force->pair->vatom);
cu_vatom->memset_device(0);
}
}
void Cuda::uploadAll()
{
MYDBG(printf("# CUDA: Cuda::uploadAll() ... start\n");)
timespec starttime;
timespec endtime;
if(atom->nmax!=shared_data.atom.nmax) checkResize();
clock_gettime(CLOCK_REALTIME,&starttime);
cu_x ->upload();
cu_v ->upload();
cu_f ->upload();
cu_tag ->upload();
cu_type->upload();
cu_mask->upload();
cu_image->upload();
if(shared_data.atom.q_flag) cu_q ->upload();
//printf("A3\n");
//if(shared_data.atom.need_eatom) cu_eatom->upload();
//printf("A4\n");
//if(shared_data.atom.need_vatom) cu_vatom->upload();
//printf("A5\n");
if(atom->rmass) cu_rmass->upload();
if(atom->radius) cu_radius->upload();
// if(atom->density) cu_density->upload();
if(atom->omega) cu_omega->upload();
if(atom->torque) cu_torque->upload();
if(atom->special) cu_special->upload();
if(atom->nspecial) cu_nspecial->upload();
if(atom->molecule) cu_molecule->upload();
if(cu_eatom) cu_eatom->upload();
if(cu_vatom) cu_vatom->upload();
clock_gettime(CLOCK_REALTIME,&endtime);
uploadtime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
CUDA_IF_BINNING(Cuda_PreBinning(& shared_data);)
CUDA_IF_BINNING(Cuda_Binning (& shared_data);)
shared_data.atom.triggerneighsq=neighbor->triggersq;
MYDBG(printf("# CUDA: Cuda::uploadAll() ... end\n");)
}
void Cuda::downloadAll()
{
MYDBG(printf("# CUDA: Cuda::downloadAll() ... start\n");)
timespec starttime;
timespec endtime;
if(atom->nmax!=shared_data.atom.nmax) checkResize();
CUDA_IF_BINNING( Cuda_ReverseBinning(& shared_data); )
clock_gettime(CLOCK_REALTIME,&starttime);
cu_x ->download();
cu_v ->download();
cu_f ->download();
cu_type->download();
cu_tag ->download();
cu_mask->download();
cu_image->download();
//if(shared_data.atom.need_eatom) cu_eatom->download();
//if(shared_data.atom.need_vatom) cu_vatom->download();
if(shared_data.atom.q_flag) cu_q ->download();
if(atom->rmass) cu_rmass->download();
if(atom->radius) cu_radius->download();
// if(atom->density) cu_density->download();
if(atom->omega) cu_omega->download();
if(atom->torque) cu_torque->download();
if(atom->special) cu_special->download();
if(atom->nspecial) cu_nspecial->download();
if(atom->molecule) cu_molecule->download();
if(cu_eatom) cu_eatom->download();
if(cu_vatom) cu_vatom->download();
clock_gettime(CLOCK_REALTIME,&endtime);
downloadtime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
MYDBG(printf("# CUDA: Cuda::downloadAll() ... end\n");)
}
void Cuda::downloadX()
{
Cuda_Pair_RevertXType(& this->shared_data);
cu_x->download();
}
CudaNeighList* Cuda::registerNeighborList(class NeighList* neigh_list)
{
MYDBG(printf("# CUDA: Cuda::registerNeighborList() ... start a\n");)
std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.find(neigh_list);
if(p != neigh_lists.end()) return p->second;
else
{
CudaNeighList* neigh_list_cuda = new CudaNeighList(lmp, neigh_list);
neigh_lists.insert(std::pair<NeighList*, CudaNeighList*>(neigh_list, neigh_list_cuda));
return neigh_list_cuda;
}
MYDBG(printf("# CUDA: Cuda::registerNeighborList() ... end b\n");)
}
void Cuda::uploadAllNeighborLists()
{
MYDBG(printf("# CUDA: Cuda::uploadAllNeighborList() ... start\n");)
std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.begin();
while(p != neigh_lists.end())
{
p->second->nl_upload();
if(not (p->second->neigh_list->cuda_list->build_cuda))
for(int i=0;i<atom->nlocal;i++)
p->second->sneighlist.maxneighbors=MAX(p->second->neigh_list->numneigh[i],p->second->sneighlist.maxneighbors) ;
++p;
}
MYDBG(printf("# CUDA: Cuda::uploadAllNeighborList() ... done\n");)
}
void Cuda::downloadAllNeighborLists()
{
MYDBG(printf("# CUDA: Cuda::downloadAllNeighborList() ... start\n");)
std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.begin();
while(p != neigh_lists.end())
{
p->second->nl_download();
++p;
}
}
void Cuda::update_xhold(int &maxhold,double* xhold)
{
if(this->shared_data.atom.maxhold<atom->nmax)
{
maxhold = atom->nmax;
delete this->cu_xhold; this->cu_xhold = new cCudaData<double, X_FLOAT, yx> ((double*)xhold, & this->shared_data.atom.xhold , maxhold, 3);
}
this->shared_data.atom.maxhold=maxhold;
CudaWrapper_CopyData(this->cu_xhold->dev_data(),this->cu_x->dev_data(),3*atom->nmax*sizeof(X_FLOAT));
}
void Cuda::setTimingsZero()
{
shared_data.cuda_timings.test1=0;
shared_data.cuda_timings.test2=0;
//communication
shared_data.cuda_timings.comm_forward_total = 0;
shared_data.cuda_timings.comm_forward_mpi_upper = 0;
shared_data.cuda_timings.comm_forward_mpi_lower = 0;
shared_data.cuda_timings.comm_forward_kernel_pack = 0;
shared_data.cuda_timings.comm_forward_kernel_unpack = 0;
shared_data.cuda_timings.comm_forward_upload = 0;
shared_data.cuda_timings.comm_forward_download = 0;
shared_data.cuda_timings.comm_exchange_total = 0;
shared_data.cuda_timings.comm_exchange_mpi = 0;
shared_data.cuda_timings.comm_exchange_kernel_pack = 0;
shared_data.cuda_timings.comm_exchange_kernel_unpack = 0;
shared_data.cuda_timings.comm_exchange_kernel_fill = 0;
shared_data.cuda_timings.comm_exchange_cpu_pack= 0;
shared_data.cuda_timings.comm_exchange_upload = 0;
shared_data.cuda_timings.comm_exchange_download = 0;
shared_data.cuda_timings.comm_border_total = 0;
shared_data.cuda_timings.comm_border_mpi = 0;
shared_data.cuda_timings.comm_border_kernel_pack = 0;
shared_data.cuda_timings.comm_border_kernel_unpack = 0;
shared_data.cuda_timings.comm_border_kernel_buildlist = 0;
shared_data.cuda_timings.comm_border_kernel_self = 0;
shared_data.cuda_timings.comm_border_upload = 0;
shared_data.cuda_timings.comm_border_download = 0;
//pair forces
shared_data.cuda_timings.pair_xtype_conversion = 0;
shared_data.cuda_timings.pair_kernel = 0;
shared_data.cuda_timings.pair_virial = 0;
shared_data.cuda_timings.pair_force_collection = 0;
//neighbor
shared_data.cuda_timings.neigh_bin = 0;
shared_data.cuda_timings.neigh_build = 0;
shared_data.cuda_timings.neigh_special = 0;
//PPPM
shared_data.cuda_timings.pppm_particle_map = 0;
shared_data.cuda_timings.pppm_make_rho = 0;
shared_data.cuda_timings.pppm_brick2fft = 0;
shared_data.cuda_timings.pppm_poisson = 0;
shared_data.cuda_timings.pppm_fillbrick = 0;
shared_data.cuda_timings.pppm_fieldforce = 0;
shared_data.cuda_timings.pppm_compute = 0;
CudaWrapper_CheckUploadTime(true);
CudaWrapper_CheckDownloadTime(true);
CudaWrapper_CheckCPUBufUploadTime(true);
CudaWrapper_CheckCPUBufDownloadTime(true);
}
void Cuda::print_timings()
{
if(universe->me!=0) return;
if(not dotiming) return;
printf("\n # CUDA: Special timings\n\n");
printf("\n Transfer Times\n");
printf(" PCIe Upload: \t %lf s\n",CudaWrapper_CheckUploadTime());
printf(" PCIe Download:\t %lf s\n",CudaWrapper_CheckDownloadTime());
printf(" CPU Tempbbuf Upload: \t %lf \n",CudaWrapper_CheckCPUBufUploadTime());
printf(" CPU Tempbbuf Download: \t %lf \n",CudaWrapper_CheckCPUBufDownloadTime());
printf("\n Communication \n");
printf(" Forward Total \t %lf \n",shared_data.cuda_timings.comm_forward_total);
printf(" Forward MPI Upper Bound \t %lf \n",shared_data.cuda_timings.comm_forward_mpi_upper);
printf(" Forward MPI Lower Bound \t %lf \n",shared_data.cuda_timings.comm_forward_mpi_lower);
printf(" Forward Kernel Pack \t %lf \n",shared_data.cuda_timings.comm_forward_kernel_pack);
printf(" Forward Kernel Unpack \t %lf \n",shared_data.cuda_timings.comm_forward_kernel_unpack);
printf(" Forward Kernel Self \t %lf \n",shared_data.cuda_timings.comm_forward_kernel_self);
printf(" Forward Upload \t %lf \n",shared_data.cuda_timings.comm_forward_upload);
printf(" Forward Download \t %lf \n",shared_data.cuda_timings.comm_forward_download);
printf(" Forward Overlap Split Ratio\t %lf \n",shared_data.comm.overlap_split_ratio);
printf("\n");
printf(" Exchange Total \t %lf \n",shared_data.cuda_timings.comm_exchange_total);
printf(" Exchange MPI \t %lf \n",shared_data.cuda_timings.comm_exchange_mpi);
printf(" Exchange Kernel Pack \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_pack);
printf(" Exchange Kernel Unpack \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_unpack);
printf(" Exchange Kernel Fill \t %lf \n",shared_data.cuda_timings.comm_exchange_kernel_fill);
printf(" Exchange CPU Pack \t %lf \n",shared_data.cuda_timings.comm_exchange_cpu_pack);
printf(" Exchange Upload \t %lf \n",shared_data.cuda_timings.comm_exchange_upload);
printf(" Exchange Download \t %lf \n",shared_data.cuda_timings.comm_exchange_download);
printf("\n");
printf(" Border Total \t %lf \n",shared_data.cuda_timings.comm_border_total);
printf(" Border MPI \t %lf \n",shared_data.cuda_timings.comm_border_mpi);
printf(" Border Kernel Pack \t %lf \n",shared_data.cuda_timings.comm_border_kernel_pack);
printf(" Border Kernel Unpack \t %lf \n",shared_data.cuda_timings.comm_border_kernel_unpack);
printf(" Border Kernel Self \t %lf \n",shared_data.cuda_timings.comm_border_kernel_self);
printf(" Border Kernel BuildList \t %lf \n",shared_data.cuda_timings.comm_border_kernel_buildlist);
printf(" Border Upload \t %lf \n",shared_data.cuda_timings.comm_border_upload);
printf(" Border Download \t %lf \n",shared_data.cuda_timings.comm_border_download);
printf("\n");
//pair forces
printf(" Pair XType Conversion \t %lf \n",shared_data.cuda_timings.pair_xtype_conversion );
printf(" Pair Kernel \t %lf \n",shared_data.cuda_timings.pair_kernel );
printf(" Pair Virial \t %lf \n",shared_data.cuda_timings.pair_virial );
printf(" Pair Force Collection \t %lf \n",shared_data.cuda_timings.pair_force_collection );
printf("\n");
//neighbor
printf(" Neighbor Binning \t %lf \n",shared_data.cuda_timings.neigh_bin );
printf(" Neighbor Build \t %lf \n",shared_data.cuda_timings.neigh_build );
printf(" Neighbor Special \t %lf \n",shared_data.cuda_timings.neigh_special );
printf("\n");
//pppm
if(force->kspace)
{
printf(" PPPM Total \t %lf \n",shared_data.cuda_timings.pppm_compute );
printf(" PPPM Particle Map \t %lf \n",shared_data.cuda_timings.pppm_particle_map );
printf(" PPPM Make Rho \t %lf \n",shared_data.cuda_timings.pppm_make_rho );
printf(" PPPM Brick2fft \t %lf \n",shared_data.cuda_timings.pppm_brick2fft );
printf(" PPPM Poisson \t %lf \n",shared_data.cuda_timings.pppm_poisson );
printf(" PPPM Fillbrick \t %lf \n",shared_data.cuda_timings.pppm_fillbrick );
printf(" PPPM Fieldforce \t %lf \n",shared_data.cuda_timings.pppm_fieldforce );
printf("\n");
}
printf(" Debug Test 1 \t %lf \n",shared_data.cuda_timings.test1);
printf(" Debug Test 2 \t %lf \n",shared_data.cuda_timings.test2);
printf("\n");
}
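
Cuda::registerNeighborList() above is a plain find-or-insert cache keyed on the host NeighList pointer, and Cuda::~Cuda() walks the same map to free the wrappers. A standalone sketch of that pattern with stand-in types:

#include <cstdio>
#include <map>

struct NeighList { };                        // stand-in for LAMMPS_NS::NeighList
struct CudaNeighList { NeighList *host; };   // stand-in for the device-side wrapper

static std::map<NeighList*, CudaNeighList*> neigh_lists;

// Return the cached wrapper for this list, or create and cache a new one,
// mirroring the control flow of Cuda::registerNeighborList() above.
CudaNeighList *register_list(NeighList *nl)
{
  std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.find(nl);
  if (p != neigh_lists.end()) return p->second;
  CudaNeighList *wrapper = new CudaNeighList;
  wrapper->host = nl;
  neigh_lists.insert(std::pair<NeighList*, CudaNeighList*>(nl, wrapper));
  return wrapper;
}

int main()
{
  NeighList nl;
  CudaNeighList *a = register_list(&nl);
  CudaNeighList *b = register_list(&nl);
  std::printf("same wrapper returned: %s\n", a == b ? "yes" : "no");
  // free wrappers the same way Cuda::~Cuda() walks the map
  for (std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.begin();
       p != neigh_lists.end(); ++p)
    delete p->second;
  return 0;
}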

src/USER-CUDA/cuda.h (Normal file, 153 lines added)

@@ -0,0 +1,153 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef CUDA_H
#define CUDA_H
#include "pointers.h"
#include "cuda_shared.h"
#include "cuda_data.h"
#include "cuda_precision.h"
#include <map>
#ifdef _DEBUG
#define MYDBG(a) a
#else
#define MYDBG(a)
#endif
namespace LAMMPS_NS
{
class Cuda : protected Pointers
{
public:
Cuda(class LAMMPS *);
~Cuda();
//static void setDevice(class LAMMPS*);
void allocate();
void accelerator(int, char **);
void setSharedDataZero();
void setSystemParams();
void setDomainParams();
void checkResize();
void evsetup_eatom_vatom(int eflag_atom,int vflag_atom);
void uploadAll();
void downloadAll();
void downloadX();
class CudaNeighList* registerNeighborList(class NeighList* neigh_list);
void uploadAllNeighborLists();
void downloadAllNeighborLists();
void set_neighinit(int dist_check, double triggerneighsq)
{
shared_data.atom.dist_check=dist_check;
shared_data.atom.triggerneighsq = triggerneighsq;
}
bool decide_by_integrator()
{
return neighbor_decide_by_integrator && cu_xhold && finished_setup;
}
void update_xhold(int &maxhold,double* xhold);
void setTimingsZero();
void print_timings();
void cu_x_download() {cu_x->download();}
bool device_set;
bool dotiming;
bool dotestatom;
int testatom;
double uploadtime,downloadtime;
bool finished_setup,begin_setup;
bool oncpu;
bool finished_run;
int self_comm;
int cuda_exists;
double extent[6];
int* debugdata;
// data shared between host code and device code
// (number of atoms, device pointers for up- & download)
cuda_shared_data shared_data;
cCudaData<double , F_FLOAT , x >* cu_q;
cCudaData<double , F_FLOAT , yx>* cu_f;
cCudaData<double , V_FLOAT , x >* cu_mass;
cCudaData<double , V_FLOAT , x >* cu_rmass;
cCudaData<double , V_FLOAT , yx>* cu_v;
cCudaData<double , X_FLOAT , yx>* cu_x;
cCudaData<double , X_FLOAT , yx>* cu_xhold;
cCudaData<int , int , x >* cu_mask;
cCudaData<int , int , x >* cu_tag;
cCudaData<int , int , x >* cu_type;
cCudaData<int , int , x >* cu_image;
cCudaData<double , ENERGY_FLOAT, x >* cu_eatom;
cCudaData<double , ENERGY_FLOAT, yx>* cu_vatom;
cCudaData<double , ENERGY_FLOAT, x >* cu_virial;
cCudaData<double , ENERGY_FLOAT, x >* cu_eng_vdwl;
cCudaData<double , ENERGY_FLOAT, x >* cu_eng_coul;
cCudaData<double , double , x >* cu_extent;
int* binned_id;
cCudaData<int , int , xx >* cu_binned_id;
int* binned_idnew;
cCudaData<int , int , xx >* cu_binned_idnew;
cCudaData<int , int , x >* cu_debugdata;
cCudaData<double , X_FLOAT , x>* cu_radius;
cCudaData<double , F_FLOAT , x>* cu_density;
cCudaData<double , V_FLOAT , yx>* cu_omega;
cCudaData<double , F_FLOAT , yx>* cu_torque;
cCudaData<int , int , yx >* cu_special;
cCudaData<int , int , yx >* cu_nspecial;
cCudaData<int , int , x >* cu_molecule;
cCudaData<X_FLOAT , X_FLOAT , x>* cu_x_type;
X_FLOAT* x_type;
cCudaData<V_FLOAT , V_FLOAT , x>* cu_v_radius;
V_FLOAT* v_radius;
cCudaData<V_FLOAT , V_FLOAT , x>* cu_omega_rmass;
V_FLOAT* omega_rmass;
cCudaData<int , int , x >* cu_map_array;
int neighbor_decide_by_integrator;
bool pinned;
void* copy_buffer;
int copy_buffersize;
private:
std::map<class NeighList*, class CudaNeighList*> neigh_lists;
};
}
#endif // CUDA_H
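
The precision codes checked by the constructor's compile-settings warnings in cuda.cpp above (prec_glob, prec_x, ...) are simply sizeof(type)/4, so 1 denotes single and 2 denotes double precision for the typedefs from cuda_precision.h. A standalone sketch of that encoding:

#include <cstdio>

// sizeof(float)/4 == 1 (single precision), sizeof(double)/4 == 2 (double),
// matching the "cuda %i cpp %i" codes printed by the constructor's warnings.
template <typename T> int prec_code() { return (int)(sizeof(T) / 4); }

int main()
{
  std::printf("float  -> %d\n", prec_code<float>());   // prints 1
  std::printf("double -> %d\n", prec_code<double>());  // prints 2
  return 0;
}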

src/USER-CUDA/cuda_common.h (Normal file, 344 lines added)

@@ -0,0 +1,344 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_COMMON_H_
#define _CUDA_COMMON_H_
//#include "cutil.h"
#include "cuda_precision.h"
#include "cuda_wrapper_cu.h"
#define CUDA_MAX_TYPES_PLUS_ONE 12 //for pair styles which use constant space for parameters, this needs to be one larger than the number of atom types
//this cannot be arbitrarily large, since constant space is limited.
//in principle one could alter potentials to use global memory for parameters; some do that already, since the first examples I encountered had a high number (20+) of atom types
//Christian
#define CUDA_MAX_TYPES2 (CUDA_MAX_TYPES_PLUS_ONE * CUDA_MAX_TYPES_PLUS_ONE)
#define CUDA_MAX_NSPECIAL 25
// define some easy-to-use debug and emulation macros
#ifdef _DEBUG
#define MYDBG(a) a
#else
#define MYDBG(a)
#endif
#if __DEVICE_EMULATION__
#define MYEMU(a) a
#else
#define MYEMU(a)
#endif
#define MYEMUDBG(a) MYEMU(MYDBG(a))
// Add prefix (needed as a workaround: identical constant names in different files would otherwise conflict)
#define MY_ADD_PREFIX(prefix, var) prefix##_##var
#define MY_ADD_PREFIX2(prefix, var) MY_ADD_PREFIX(prefix, var)
#define MY_AP(var) MY_ADD_PREFIX2(MY_PREFIX, var)
#define MY_VAR_TO_STR(var) #var
#define MY_VAR_TO_STR2(var) MY_VAR_TO_STR(var)
#define MY_CONST(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var))
#define CUDA_USE_TEXTURE
#define CUDA_USE_FLOAT4
//constants used by many classes
//domain
#define _boxhi MY_AP(boxhi)
#define _boxlo MY_AP(boxlo)
#define _subhi MY_AP(subhi)
#define _sublo MY_AP(sublo)
#define _box_size MY_AP(box_size)
#define _prd MY_AP(prd)
#define _periodicity MY_AP(periodicity)
#define _triclinic MY_AP(triclinic)
#define _boxhi_lamda MY_AP(boxhi_lamda)
#define _boxlo_lamda MY_AP(boxlo_lamda)
#define _prd_lamda MY_AP(prd_lamda)
#define _h MY_AP(h)
#define _h_inv MY_AP(h_inv)
#define _h_rate MY_AP(h_rate)
__device__ __constant__ X_FLOAT _boxhi[3];
__device__ __constant__ X_FLOAT _boxlo[3];
__device__ __constant__ X_FLOAT _subhi[3];
__device__ __constant__ X_FLOAT _sublo[3];
__device__ __constant__ X_FLOAT _box_size[3];
__device__ __constant__ X_FLOAT _prd[3];
__device__ __constant__ int _periodicity[3];
__device__ __constant__ int _triclinic;
__device__ __constant__ X_FLOAT _boxhi_lamda[3];
__device__ __constant__ X_FLOAT _boxlo_lamda[3];
__device__ __constant__ X_FLOAT _prd_lamda[3];
__device__ __constant__ X_FLOAT _h[6];
__device__ __constant__ X_FLOAT _h_inv[6];
__device__ __constant__ V_FLOAT _h_rate[6];
//atom properties
#define _x MY_AP(x)
#define _v MY_AP(v)
#define _f MY_AP(f)
#define _tag MY_AP(tag)
#define _type MY_AP(type)
#define _mask MY_AP(mask)
#define _image MY_AP(image)
#define _q MY_AP(q)
#define _mass MY_AP(mass)
#define _rmass MY_AP(rmass)
#define _rmass_flag MY_AP(rmass_flag)
#define _eatom MY_AP(eatom)
#define _vatom MY_AP(vatom)
#define _x_type MY_AP(x_type)
#define _radius MY_AP(radius)
#define _density MY_AP(density)
#define _omega MY_AP(omega)
#define _torque MY_AP(torque)
#define _special MY_AP(special)
#define _maxspecial MY_AP(maxspecial)
#define _nspecial MY_AP(nspecial)
#define _special_flag MY_AP(special_flag)
#define _molecule MY_AP(molecule)
#define _v_radius MY_AP(v_radius)
#define _omega_rmass MY_AP(omega_rmass)
#define _freeze_group_bit MY_AP(freeze_group_bit)
#define _map_array MY_AP(map_array)
__device__ __constant__ X_FLOAT* _x; //holds pointer to positions
__device__ __constant__ V_FLOAT* _v;
__device__ __constant__ F_FLOAT* _f;
__device__ __constant__ int* _tag;
__device__ __constant__ int* _type;
__device__ __constant__ int* _mask;
__device__ __constant__ int* _image;
__device__ __constant__ V_FLOAT* _mass;
__device__ __constant__ F_FLOAT* _q;
__device__ __constant__ V_FLOAT* _rmass;
__device__ __constant__ int _rmass_flag;
__device__ __constant__ ENERGY_FLOAT* _eatom;
__device__ __constant__ ENERGY_FLOAT* _vatom;
__device__ __constant__ X_FLOAT4* _x_type; //holds pointer to packed position+type array
__device__ __constant__ X_FLOAT* _radius;
__device__ __constant__ F_FLOAT* _density;
__device__ __constant__ V_FLOAT* _omega;
__device__ __constant__ F_FLOAT* _torque;
__device__ __constant__ int* _special;
__device__ __constant__ int _maxspecial;
__device__ __constant__ int* _nspecial;
__device__ __constant__ int _special_flag[4];
__device__ __constant__ int* _molecule;
__device__ __constant__ V_FLOAT4* _v_radius; //holds pointer to packed velocity+radius array
__device__ __constant__ V_FLOAT4* _omega_rmass; //holds pointer to packed omega+rmass array
__device__ __constant__ int _freeze_group_bit;
__device__ __constant__ int* _map_array;
#ifdef CUDA_USE_TEXTURE
#define _x_tex MY_AP(x_tex)
#if X_PRECISION == 1
texture<float> _x_tex;
#else
texture<int2,1> _x_tex;
#endif
#define _type_tex MY_AP(type_tex)
texture<int> _type_tex;
#define _x_type_tex MY_AP(x_type_tex)
#if X_PRECISION == 1
texture<float4,1> _x_type_tex;
#else
texture<int4,1> _x_type_tex;
#endif
#define _v_radius_tex MY_AP(v_radius_tex)
#if V_PRECISION == 1
texture<float4,1> _v_radius_tex;
#else
texture<int4,1> _v_radius_tex;
#endif
#define _omega_rmass_tex MY_AP(omega_rmass_tex)
#if V_PRECISION == 1
texture<float4,1> _omega_rmass_tex;
#else
texture<int4,1> _omega_rmass_tex;
#endif
#define _q_tex MY_AP(q_tex)
#if F_PRECISION == 1
texture<float> _q_tex;
#else
texture<int2,1> _q_tex;
#endif
#endif
//neighbor
#ifdef IncludeCommonNeigh
#define _inum MY_AP(inum)
#define _inum_border MY_AP(inum_border)
#define _ilist MY_AP(ilist)
#define _ilist_border MY_AP(ilist_border)
#define _numneigh MY_AP(numneigh)
#define _numneigh_border MY_AP(numneigh_border)
#define _numneigh_inner MY_AP(numneigh_inner)
#define _firstneigh MY_AP(firstneigh)
#define _neighbors MY_AP(neighbors)
#define _neighbors_border MY_AP(neighbors_border)
#define _neighbors_inner MY_AP(neighbors_inner)
#define _reneigh_flag MY_AP(reneigh_flag)
#define _triggerneighsq MY_AP(triggerneighsq)
#define _xhold MY_AP(xhold)
#define _maxhold MY_AP(maxhold)
#define _dist_check MY_AP(dist_check)
#define _neighbor_maxlocal MY_AP(neighbor_maxlocal)
#define _maxneighbors MY_AP(maxneighbors)
#define _overlap_comm MY_AP(overlap_comm)
__device__ __constant__ int _inum;
__device__ __constant__ int* _inum_border;
__device__ __constant__ int* _ilist;
__device__ __constant__ int* _ilist_border;
__device__ __constant__ int* _numneigh;
__device__ __constant__ int* _numneigh_border;
__device__ __constant__ int* _numneigh_inner;
__device__ __constant__ int** _firstneigh;
__device__ __constant__ int* _neighbors;
__device__ __constant__ int* _neighbors_border;
__device__ __constant__ int* _neighbors_inner;
__device__ __constant__ int* _reneigh_flag;
__device__ __constant__ X_FLOAT _triggerneighsq;
__device__ __constant__ X_FLOAT* _xhold; //holds pointer to positions
__device__ __constant__ int _maxhold;
__device__ __constant__ int _dist_check;
__device__ __constant__ int _neighbor_maxlocal;
__device__ __constant__ int _maxneighbors;
__device__ __constant__ int _overlap_comm;
#endif
//system properties
#define _nall MY_AP(nall)
#define _nghost MY_AP(nghost)
#define _nlocal MY_AP(nlocal)
#define _nmax MY_AP(nmax)
#define _cuda_ntypes MY_AP(cuda_ntypes)
#define _dtf MY_AP(dtf)
#define _dtv MY_AP(dtv)
#define _factor MY_AP(factor)
#define _virial MY_AP(virial)
#define _eng_vdwl MY_AP(eng_vdwl)
#define _eng_coul MY_AP(eng_coul)
#define _molecular MY_AP(molecular)
__device__ __constant__ unsigned _nall;
__device__ __constant__ unsigned _nghost;
__device__ __constant__ unsigned _nlocal;
__device__ __constant__ unsigned _nmax;
__device__ __constant__ unsigned _cuda_ntypes;
__device__ __constant__ V_FLOAT _dtf;
__device__ __constant__ X_FLOAT _dtv;
__device__ __constant__ V_FLOAT _factor;
__device__ __constant__ ENERGY_FLOAT* _virial;
__device__ __constant__ ENERGY_FLOAT* _eng_vdwl;
__device__ __constant__ ENERGY_FLOAT* _eng_coul;
__device__ __constant__ int _molecular;
//other general constants
#define _buffer MY_AP(buffer)
#define _flag MY_AP(flag)
#define _debugdata MY_AP(debugdata)
__device__ __constant__ void* _buffer;
__device__ __constant__ int* _flag;
__device__ __constant__ int* _debugdata;
// pointers to data fields on the GPU are held in constant space
// -> reduces register usage and number of parameters for kernel calls
// will be variables of file scope in cuda files
// may be used to output a cudaError_t
#define MY_OUTPUT_RESULT(result) \
switch(result) \
{ \
case cudaSuccess: printf(" => cudaSuccess\n"); break; \
case cudaErrorInvalidValue: printf(" => cudaErrorInvalidValue\n"); break; \
case cudaErrorInvalidSymbol: printf(" => cudaErrorInvalidSymbol\n"); break; \
case cudaErrorInvalidDevicePointer: printf(" => cudaErrorInvalidDevicePointer\n"); break; \
case cudaErrorInvalidMemcpyDirection: printf(" => cudaErrorInvalidMemcpyDirection\n"); break; \
default: printf(" => unknown\n"); break; \
}
#ifdef _DEBUG
# define CUT_CHECK_ERROR(errorMessage) { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
}
#else
# define CUT_CHECK_ERROR(errorMessage) { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
}
#endif
# define CUDA_SAFE_CALL_NO_SYNC( call) { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} }
# define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NO_SYNC(call);
#define X_MASK 1
#define V_MASK 2
#define F_MASK 4
#define TAG_MASK 8
#define TYPE_MASK 16
#define MASK_MASK 32
#define IMAGE_MASK 64
#define Q_MASK 128
#define MOLECULE_MASK 256
#define RMASS_MASK 512
#define RADIUS_MASK 1024
#define DENSITY_MASK 2048
#define OMEGA_MASK 4096
#define TORQUE_MASK 8192
#endif // _CUDA_COMMON_H_
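
As the "Add prefix" comment above notes, the MY_AP/MY_CONST machinery exists so that each compilation unit gets uniquely named copies of these constant symbols instead of conflicting definitions. A standalone sketch of the expansion, using a hypothetical MY_PREFIX:

#include <cstdio>

#define MY_ADD_PREFIX(prefix, var) prefix##_##var
#define MY_ADD_PREFIX2(prefix, var) MY_ADD_PREFIX(prefix, var)
#define MY_AP(var) MY_ADD_PREFIX2(MY_PREFIX, var)
#define MY_VAR_TO_STR(var) #var
#define MY_VAR_TO_STR2(var) MY_VAR_TO_STR(var)
#define MY_CONST(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var))

#define MY_PREFIX pair_lj_cut   // hypothetical prefix; each compilation unit sets its own

int MY_AP(boxhi);               // expands to: int pair_lj_cut_boxhi;

int main()
{
  pair_lj_cut_boxhi = 42;
  // MY_CONST(boxhi) expands to the string literal "pair_lj_cut_boxhi"
  std::printf("%s = %d\n", MY_CONST(boxhi), pair_lj_cut_boxhi);
  return 0;
}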

src/USER-CUDA/cuda_data.h (Normal file, 796 lines added)

@@ -0,0 +1,796 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_DATA_H_
#define _CUDA_DATA_H_
enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx not yet implemented since they were not needed yet
//xx==x; but in atom_vec, x is a member, so copy mode x would produce compile errors there (hence the xx alias)
#include "cuda_shared.h"
#include "cuda_wrapper_cu.h"
#include "cuda_data_cu.h"
#include <ctime>
#include <cstdio>
#include <typeinfo>
template <typename host_type, typename dev_type, copy_mode mode>
class cCudaData
{
protected:
void** buffer;
int* buf_size;
host_type* host_data;
dev_array* dev_data_array;
dev_type* temp_data;
unsigned nbytes;
bool owns_dev_array;
bool current_data_on_device; //this is not yet working as intended and therefore deactivated
bool current_data_on_host;
bool is_continues;
bool pinned;
public:
cCudaData(host_type* host_data, dev_array* dev_data_array, unsigned dim_x, unsigned dim_y=0, unsigned dim_z=0, bool is_pinned=false);
cCudaData(host_type* host_data, unsigned dim_x, unsigned dim_y=0, unsigned dim_z=0, bool is_pinned=false);
~cCudaData();
void* dev_data() {if(dev_data_array!=NULL) return dev_data_array->dev_data; else return NULL;};
void set_dev_data(void* adev_data) {dev_data_array->dev_data=adev_data;};
void set_dev_array(dev_array* adev_array) {dev_data_array=adev_array;};
void set_host_data(host_type* host_data);
void* get_host_data() { return host_data;};
void set_buffer(void** buffer,int* buf_size,bool ais_continues);
unsigned int* get_dim() {return dev_data_array->dim;};
// if you want to upload data to the gpu, which will not change there, then set will_be_changed=false
// if you want to upload data to the gpu and update it there, then set will_be_changed=true (default)
void upload(bool will_be_changed=true);
void uploadAsync(int stream, bool will_be_changed=true );
// if you want to download data just to have a look at it, then set will_be_changed=false
// if you are going to modify the downloaded data, then set will_be_changed=true (default)
void download(bool will_be_changed=true);
void downloadAsync(int stream);
void memset_device(int value);
void device_data_has_changed() {current_data_on_device=false;}
void host_data_has_changed() {current_data_on_host=false;}
int dev_size() {
int size = dev_data_array->dim[0]*sizeof(dev_type);
if(dev_data_array->dim[1]) size*=dev_data_array->dim[1];
if(dev_data_array->dim[2]) size*=dev_data_array->dim[2];
return size;}
};
template <typename host_type, typename dev_type, copy_mode mode>
cCudaData<host_type, dev_type, mode>
::cCudaData(host_type* host_data, dev_array* dev_data_array, unsigned dim_x, unsigned dim_y, unsigned dim_z, bool is_pinned)
{
pinned=is_pinned;
owns_dev_array = false;
current_data_on_device = false;
current_data_on_host = false;
is_continues = false;
this->host_data = host_data;
this->dev_data_array = dev_data_array;
unsigned ndev;
if((mode == x)||(mode==xx))
{
ndev = dim_x;
dev_data_array->dim[0] = dim_x;
dev_data_array->dim[1] = 0;
dev_data_array->dim[2] = 0;
}
else if(mode == xy || mode == yx )
{
ndev = dim_x * dim_y;
dev_data_array->dim[0] = dim_x;
dev_data_array->dim[1] = dim_y;
dev_data_array->dim[2] = 0;
}
else
{
ndev = dim_x * dim_y * dim_z;
dev_data_array->dim[0] = dim_x;
dev_data_array->dim[1] = dim_y;
dev_data_array->dim[2] = dim_z;
}
nbytes = ndev * sizeof(dev_type);
if(nbytes<=0)
{
host_data=NULL;
temp_data=NULL;
dev_data_array->dev_data=NULL;
return;
}
dev_data_array->dev_data = CudaWrapper_AllocCudaData(nbytes);
if(((mode!=x)&&(mode!=xx)) || typeid(host_type) != typeid(dev_type))
{
if(not pinned)
temp_data = new dev_type[ndev];
else
{
temp_data = (dev_type*) CudaWrapper_AllocPinnedHostData(ndev*sizeof(dev_type));
}
}
}
template <typename host_type, typename dev_type, copy_mode mode>
cCudaData<host_type, dev_type, mode>
::cCudaData(host_type* host_data, unsigned dim_x, unsigned dim_y, unsigned dim_z, bool is_pinned)
{
pinned=is_pinned;
this->dev_data_array = new dev_array;
this->owns_dev_array = true;
current_data_on_device = false;
current_data_on_host = false;
is_continues = false;
this->host_data = host_data;
unsigned ndev;
if((mode == x)||(mode==xx))
{
ndev = dim_x;
dev_data_array->dim[0] = dim_x;
dev_data_array->dim[1] = 0;
dev_data_array->dim[2] = 0;
}
else if(mode == xy || mode == yx )
{
ndev = dim_x * dim_y;
dev_data_array->dim[0] = dim_x;
dev_data_array->dim[1] = dim_y;
dev_data_array->dim[2] = 0;
}
else
{
ndev = dim_x * dim_y * dim_z;
dev_data_array->dim[0] = dim_x;
dev_data_array->dim[1] = dim_y;
dev_data_array->dim[2] = dim_z;
}
nbytes = ndev * sizeof(dev_type);
if(nbytes<=0)
{
host_data=NULL;
temp_data=NULL;
dev_data_array->dev_data=NULL;
return;
}
dev_data_array->dev_data = CudaWrapper_AllocCudaData(nbytes);
if(((mode!=x)&&(mode!=xx)) || (typeid(host_type) != typeid(dev_type)))
{
if(not pinned)
temp_data = new dev_type[ndev];
else
{
temp_data = (dev_type*) CudaWrapper_AllocPinnedHostData(ndev*sizeof(dev_type));
}
}
}
template <typename host_type, typename dev_type, copy_mode mode>
cCudaData<host_type, dev_type, mode>
::~cCudaData()
{
if(((mode!=x)&&(mode!=xx)) || typeid(host_type) != typeid(dev_type))
{
if(not pinned)
delete [] temp_data;
else
{
CudaWrapper_FreePinnedHostData((void*)temp_data);
}
}
if((dev_data_array->dev_data)&&(nbytes>0))
CudaWrapper_FreeCudaData(dev_data_array->dev_data,nbytes);
if(owns_dev_array) delete dev_data_array;
}
template <typename host_type, typename dev_type, copy_mode mode>
void cCudaData<host_type, dev_type, mode>
::set_host_data(host_type* host_data)
{
this->host_data = host_data;
}
template <typename host_type, typename dev_type, copy_mode mode>
void cCudaData<host_type, dev_type, mode>
::upload(bool will_be_changed)
{
// if current data is already up, do not re-upload it
// if(current_data_on_device) return;
if(buffer&&is_continues)
{
printf("Actual Buffer: %p %i\n",*buffer,*buf_size);
if(typeid(host_type)==typeid(double))
{
if(typeid(dev_type)==typeid(double))
{
CudaData_Upload_DoubleDouble((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
else if(typeid(dev_type)==typeid(float))
{
CudaData_Upload_DoubleFloat((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
}
else if(typeid(host_type)==typeid(float))
{
if(typeid(dev_type)==typeid(double))
{
CudaData_Upload_FloatDouble((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
else if(typeid(dev_type)==typeid(float))
{
CudaData_Upload_FloatFloat((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
}
else if(typeid(host_type)==typeid(int))
{
if(typeid(dev_type)==typeid(int))
{
CudaData_Upload_IntInt((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
}
}
switch(mode)
{
case x:
{
if(typeid(host_type) == typeid(dev_type))
CudaWrapper_UploadCudaData(host_data, dev_data_array->dev_data, nbytes);
else
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
}
break;
}
case xx:
{
if(typeid(host_type) == typeid(dev_type))
CudaWrapper_UploadCudaData(host_data, dev_data_array->dev_data, nbytes);
else
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
}
break;
}
case xy:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
{
dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
temp[j] = static_cast<dev_type>((reinterpret_cast<host_type**>(host_data))[i][j]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
break;
}
case yx:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
{
temp[i] = static_cast<dev_type>(reinterpret_cast<host_type**>(host_data)[i][j]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
break;
}
case xyz:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
dev_type* temp = &temp_data[(i*dev_data_array->dim[1]+j)*dev_data_array->dim[2]];
for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
{
temp[k] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
break;
}
case xzy:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
{
dev_type* temp = &temp_data[(i*dev_data_array->dim[2]+k)*dev_data_array->dim[1]];
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
temp[j] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
break;
}
}
// we have uploaded the data to the device, i.e.:
current_data_on_device = true;
// the data is going to change on the device, making the host data out-dated
if(will_be_changed) current_data_on_host = false;
}
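
/* Editor's note (illustrative, not part of the original commit): for the 2-D
   and 3-D modes the host data is a pointer array (e.g. a double** allocated
   by LAMMPS' Memory class), so it cannot be copied in one memcpy.  The loops
   above therefore cast and pack it into the contiguous temp_data staging
   buffer, timed as CPU buffer time, and a single transfer is issued after:

     host x[i][j]  ->  temp_data[i*dim[1]+j] = static_cast<dev_type>(x[i][j])
                   ->  CudaWrapper_UploadCudaData(temp_data, dev_data, nbytes)
*/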
template <typename host_type, typename dev_type, copy_mode mode>
void cCudaData<host_type, dev_type, mode>
::uploadAsync(int stream,bool will_be_changed)
{
// if current data is already up, do not re-upload it
// if(current_data_on_device) return;
if(buffer&&is_continues)
{
printf("Actual Buffer: %p %i\n",*buffer,*buf_size);
if(typeid(host_type)==typeid(double))
{
if(typeid(dev_type)==typeid(double))
{
CudaData_Upload_DoubleDouble((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
else if(typeid(dev_type)==typeid(float))
{
CudaData_Upload_DoubleFloat((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
}
else if(typeid(host_type)==typeid(float))
{
if(typeid(dev_type)==typeid(double))
{
CudaData_Upload_FloatDouble((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
else if(typeid(dev_type)==typeid(float))
{
CudaData_Upload_FloatFloat((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
}
else if(typeid(host_type)==typeid(int))
{
if(typeid(dev_type)==typeid(int))
{
CudaData_Upload_IntInt((void*) host_data,dev_data_array->dev_data,
dev_data_array->dim,mode,*buffer);
current_data_on_device = true;
if(will_be_changed) current_data_on_host = false;
return;
}
}
}
switch(mode)
{
case x:
{
if(typeid(host_type) == typeid(dev_type))
CudaWrapper_UploadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes,stream);
else
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
}
break;
}
case xx:
{
if(typeid(host_type) == typeid(dev_type))
CudaWrapper_UploadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes,stream);
else
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
}
break;
}
case xy:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
{
dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
temp[j] = static_cast<dev_type>((reinterpret_cast<host_type**>(host_data))[i][j]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
break;
}
case yx:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
{
temp[i] = static_cast<dev_type>(reinterpret_cast<host_type**>(host_data)[i][j]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
break;
}
case xyz:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
dev_type* temp = &temp_data[(i*dev_data_array->dim[1]+j)*dev_data_array->dim[2]];
for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
{
temp[k] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
break;
}
case xzy:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
{
dev_type* temp = &temp_data[(i*dev_data_array->dim[2]+k)*dev_data_array->dim[1]];
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
temp[j] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
break;
}
}
// we have uploaded the data to the device, i.e.:
current_data_on_device = true;
// the data is going to change on the device, making the host data out-dated
if(will_be_changed) current_data_on_host = false;
}
template <typename host_type, typename dev_type, copy_mode mode>
void cCudaData<host_type, dev_type, mode>
::download(bool will_be_changed)
{
// if current data is already down, do not re-download it
// if(current_data_on_host) return;
switch(mode)
{
case x:
{
if(typeid(host_type) == typeid(dev_type))
CudaWrapper_DownloadCudaData(host_data, dev_data_array->dev_data, nbytes);
else
{
CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufDownloadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
}
break;
}
case xx:
{
if(typeid(host_type) == typeid(dev_type))
CudaWrapper_DownloadCudaData(host_data, dev_data_array->dev_data, nbytes);
else
{
CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufDownloadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
}
break;
}
case xy:
{
CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
{
dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[j]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufDownloadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
break;
}
case yx:
{
CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
{
reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[i]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufDownloadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
break;
}
case xyz:
{
CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
dev_type* temp = &temp_data[(i * dev_data_array->dim[1]+j)*dev_data_array->dim[2]];
for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
{
reinterpret_cast<host_type***>(host_data)[i][j][k] = static_cast<host_type>(temp[k]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufDownloadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
break;
}
case xzy:
{
CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
{
dev_type* temp = &temp_data[(i * dev_data_array->dim[2]+k)*dev_data_array->dim[1]];
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
reinterpret_cast<host_type***>(host_data)[i][j][k] = static_cast<host_type>(temp[j]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufDownloadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
break;
}
}
// we have downloaded the data to the host, i.e.:
current_data_on_host = true;
// the data is going to change on the host, making the device data out-dated
if(will_be_changed) current_data_on_device = false;
}
template <typename host_type, typename dev_type, copy_mode mode>
void cCudaData<host_type, dev_type, mode>
::downloadAsync(int stream)
{
switch(mode)
{
case x:
{
if(typeid(host_type) == typeid(dev_type))
{
CudaWrapper_DownloadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes, stream);
CudaWrapper_SyncStream(stream);
}
else
{
CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
CudaWrapper_SyncStream(stream);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
}
break;
}
case xx:
{
if(typeid(host_type) == typeid(dev_type))
{
CudaWrapper_DownloadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes, stream);
CudaWrapper_SyncStream(stream);
}
else
{
CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
CudaWrapper_SyncStream(stream);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
}
break;
}
case xy:
{
CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
CudaWrapper_SyncStream(stream);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
{
dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[j]);
}
}
break;
}
case yx:
{
CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
CudaWrapper_SyncStream(stream);
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
{
reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[i]);
}
}
break;
}
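      // editor's note: unlike download(), the 3-D modes xyz and xzy are not
      // handled here; callers holding 3-D data must use the synchronous download()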
}
}
template <typename host_type, typename dev_type, copy_mode mode>
void cCudaData<host_type, dev_type, mode>
::memset_device(int value)
{
CudaWrapper_Memset(dev_data_array->dev_data,value, nbytes);
}
template <typename host_type, typename dev_type, copy_mode mode>
void cCudaData<host_type, dev_type, mode>
::set_buffer(void** abuffer,int* abuf_size,bool ais_continues)
{
buffer = abuffer;
buf_size = abuf_size;
unsigned nbytes_buf=(nbytes/sizeof(dev_type))*sizeof(host_type);
if(buffer!=NULL)
if(not((typeid(host_type) == typeid(dev_type))&&(mode == x || mode == xx)))
{
printf("Allocate Buffer: %p %i\n",*buffer,*buf_size);
if(((*buffer)!=NULL)&&(*buf_size<nbytes_buf))
CudaWrapper_FreeCudaData(*buffer,*buf_size);
if(*buf_size<nbytes_buf)
{
*buffer = CudaWrapper_AllocCudaData(nbytes_buf);
*buf_size = nbytes_buf;
}
printf("Allocate Buffer2: %p %i\n",*buffer,*buf_size);
}
is_continues=ais_continues;
}
#endif // _CUDA_DATA_H_
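
// ----------------------------------------------------------------------
// Editor's usage sketch (illustrative, not part of the original commit).
// Assuming the unused dimensions of the constructor above default to 0, a
// per-atom host array is mirrored on the device and kept coherent through
// the current_data_on_host / current_data_on_device flags:
//
//   double* q = new double[nmax];                 // host charges
//   cCudaData<double, F_FLOAT, x> cu_q(q, nmax);  // allocates device memory
//   cu_q.upload(true);   // host -> device; device copy is about to change
//   // ... kernels modify the device copy ...
//   cu_q.download();     // device -> host; both copies current again
// ----------------------------------------------------------------------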

39
src/USER-CUDA/cuda_modify_flags.h Normal file
View File

@ -0,0 +1,39 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef CUDA_MODIFY_FLAGS_H
#define CUDA_MODIFY_FLAGS_H
#define INITIAL_INTEGRATE_CUDA (1 << 16)
#define POST_INTEGRATE_CUDA (1 << 17)
#define PRE_EXCHANGE_CUDA (1 << 18)
#define PRE_NEIGHBOR_CUDA (1 << 19)
#define PRE_FORCE_CUDA (1 << 20)
#define POST_FORCE_CUDA (1 << 21)
#define FINAL_INTEGRATE_CUDA (1 << 22)
#define END_OF_STEP_CUDA (1 << 23)
#define THERMO_ENERGY_CUDA (1 << 24)
#define MIN_POST_FORCE_CUDA (1 << 25)
// remember not to shift over 31 bits
#endif // CUDA_MODIFY_FLAGS_H
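
// Editor's sketch (illustrative, not part of the original commit): a
// CUDA-capable fix advertises the device variants of its callbacks through
// these bits in its setmask(), e.g. for a hypothetical FixNVECuda:
//
//   int FixNVECuda::setmask()
//   {
//     return INITIAL_INTEGRATE_CUDA | FINAL_INTEGRATE_CUDA;
//   }
//
// ModifyCuda::init() (see modify_cuda.cpp below) then sorts such a fix into
// the *_cuda call lists instead of the host lists.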

180
src/USER-CUDA/cuda_neigh_list.cpp Normal file
View File

@ -0,0 +1,180 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_neigh_list.h"
#include "neigh_list.h"
#include <cstring>
#include <vector>
#include <map>
#include <algorithm>
#include "cuda.h"
#include "atom.h"
using namespace LAMMPS_NS;
CudaNeighList::CudaNeighList(LAMMPS *lmp, class NeighList* neigh_list) : Pointers(lmp)
{
cuda = lmp->cuda;
MYDBG(printf("# CUDA: CudaNeighList::cudaNeighList() ... start\n");)
this->neigh_list = neigh_list;
neigh_list->cuda_list=this;
sneighlist.maxlocal = neigh_list->get_maxlocal();
sneighlist.maxneighbors = 32;
sneighlist.maxcut = 0.0;
sneighlist.cutneighsq = NULL;
cu_neighbors = NULL;
cu_neighbors_border = NULL;
cu_neighbors_inner = NULL;
cu_numneigh_border = NULL;
cu_numneigh_inner = NULL;
cu_numneigh = NULL;
cu_ilist = NULL;
cu_ilist_border = NULL;
cu_inum_border = NULL;
inum_border = 0;
neighbors = NULL;
neighbors_inner = NULL;
neighbors_border = NULL;
numneigh_border = NULL;
numneigh_inner = NULL;
ilist_border = NULL;
build_cuda = false;
sneighlist.binned_id=NULL;
sneighlist.bin_dim=new int[3];
sneighlist.bin_dim[0]=0;
sneighlist.bin_dim[1]=0;
sneighlist.bin_dim[2]=0;
cu_ex_type = NULL;
cu_ex1_bit = NULL;
cu_ex2_bit = NULL;
cu_ex_mol_bit = NULL;
sneighlist.nex_type=0;
sneighlist.nex_group=0;
sneighlist.nex_mol=0;
sneighlist.bin_nmax=0;
sneighlist.bin_extraspace=0.05;
MYDBG(printf("# CUDA: CudaNeighList::cudaNeighList() ... end\n");)
}
CudaNeighList::~CudaNeighList()
{
dev_free();
}
void CudaNeighList::dev_alloc()
{
MYDBG( printf("# CUDA: CudaNeighList::dev_alloc() ... start\n"); )
cu_ilist = new cCudaData<int , int , x> (neigh_list->ilist , & sneighlist.ilist , sneighlist.maxlocal );
cu_numneigh = new cCudaData<int , int , x> (neigh_list->numneigh, & sneighlist.numneigh , sneighlist.maxlocal );
neighbors = new int[atom->nmax*sneighlist.maxneighbors];
cu_neighbors= new cCudaData<int, int, x> (neighbors , & sneighlist.neighbors, atom->nmax*sneighlist.maxneighbors );
if(cuda->shared_data.overlap_comm)
{
ilist_border = new int[sneighlist.maxlocal];
numneigh_border = new int[sneighlist.maxlocal];
numneigh_inner = new int[sneighlist.maxlocal];
cu_inum_border = new cCudaData<int , int , x> (&inum_border , & sneighlist.inum_border , 1 );
cu_ilist_border = new cCudaData<int , int , x> (ilist_border , & sneighlist.ilist_border , sneighlist.maxlocal );
cu_numneigh_border = new cCudaData<int , int , x> (numneigh_border , & sneighlist.numneigh_border , sneighlist.maxlocal );
cu_numneigh_inner = new cCudaData<int , int , x> (numneigh_inner , & sneighlist.numneigh_inner , sneighlist.maxlocal );
neighbors_border = new int[sneighlist.maxlocal*sneighlist.maxneighbors];
cu_neighbors_border= new cCudaData<int, int, x> (neighbors_border , & sneighlist.neighbors_border, sneighlist.maxlocal*sneighlist.maxneighbors );
neighbors_inner = new int[sneighlist.maxlocal*sneighlist.maxneighbors];
cu_neighbors_inner = new cCudaData<int, int, x> (neighbors_inner , & sneighlist.neighbors_inner , sneighlist.maxlocal*sneighlist.maxneighbors );
}
MYDBG( printf("# CUDA: CudaNeighList::dev_alloc() ... end\n"); )
}
void CudaNeighList::dev_free()
{
MYDBG( printf("# CUDA: CudaNeighList::dev_free() ... start\n"); )
delete cu_numneigh;
delete cu_ilist;
delete [] neighbors;
delete cu_neighbors;
if(cuda->shared_data.overlap_comm)
{
delete [] ilist_border;
delete [] numneigh_border;
delete [] numneigh_inner;
delete [] neighbors_border;
delete [] neighbors_inner;
delete cu_inum_border;
delete cu_neighbors_border;
delete cu_neighbors_inner;
delete cu_numneigh_border;
delete cu_numneigh_inner;
delete cu_ilist_border;
}
MYDBG( printf("# CUDA: CudaNeighList::dev_free() ... end\n"); )
}
void CudaNeighList::grow_device()
{
MYDBG(printf("# CUDA: CudaNeighList::grow_device() ... start\n");)
// if host has allocated more memory for atom arrays than device has, then allocate more memory on device
int new_maxlocal = neigh_list->get_maxlocal();
if(sneighlist.maxlocal < new_maxlocal)
{
sneighlist.maxlocal = new_maxlocal;
dev_free();
dev_alloc();
}
if(!cu_ilist || !cu_numneigh) dev_alloc();
// check if the host data has been reallocated somewhere else
if(cu_ilist ->get_host_data() != neigh_list->ilist) cu_ilist ->set_host_data(neigh_list->ilist);
if(cu_numneigh->get_host_data() != neigh_list->numneigh) cu_numneigh->set_host_data(neigh_list->numneigh);
MYDBG(printf("# CUDA: CudaNeighList::grow_device() ... end\n");)
}
void CudaNeighList::nl_upload(bool will_be_changed)
{
//return;
MYDBG(printf("# CUDA: CudaNeighList::nl_upload() ... start\n");)
if(cu_ilist)
cu_ilist->upload();
if(cu_numneigh)
cu_numneigh->upload();
MYDBG(printf("# CUDA: CudaNeighList::nl_upload() ... end\n");)
}
void CudaNeighList::nl_download(bool will_be_changed)
{
MYDBG(printf("# CUDA: CudaNeighList::nl_download() ... start\n");)
if(cu_ilist)
cu_ilist->download();
if(cu_numneigh)
cu_numneigh->download();
MYDBG(printf("# CUDA: CudaNeighList::nl_download() ... end\n");)
}
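
/* Editor's usage sketch (illustrative, not part of the original commit):
   after a host-side neighbor build, the wrapper attached to the list is
   typically refreshed and pushed to the device like this:

     CudaNeighList* cu_list = neigh_list->cuda_list; // set in the constructor
     cu_list->grow_device(); // re-allocate if maxlocal grew, re-bind host pointers
     cu_list->nl_upload();   // copy ilist and numneigh to the GPU
*/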

83
src/USER-CUDA/cuda_neigh_list.h Normal file
View File

@ -0,0 +1,83 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef LMP_NEIGH_LIST_CUDA_H
#define LMP_NEIGH_LIST_CUDA_H
#include "pointers.h"
#include "cuda_data.h"
#include "neigh_list.h"
namespace LAMMPS_NS
{
class CudaNeighList : protected Pointers
{
public:
cCudaData<int , int , x>* cu_ilist;
cCudaData<int , int , x>* cu_numneigh;
cCudaData<int , int , x>* cu_inum_border;
cCudaData<int , int , x>* cu_ilist_border;
cCudaData<int , int , x>* cu_numneigh_border;
cCudaData<int , int , x>* cu_numneigh_inner;
cCudaData<int , int , x>* cu_neighbors;
cCudaData<int , int , x>* cu_neighbors_border;
cCudaData<int , int , x>* cu_neighbors_inner;
cCudaData<int , int , x>* cu_ex_type;
cCudaData<int , int , x>* cu_ex1_bit;
cCudaData<int , int , x>* cu_ex2_bit;
cCudaData<int , int , x>* cu_ex_mol_bit;
cuda_shared_neighlist sneighlist;
int* neighbors;
int* neighbors_inner;
int* neighbors_border;
int inum_border;
int* ilist_border;
int* numneigh_border;
int* numneigh_inner;
int nex_type;
int nex_group;
int nex_mol;
bool build_cuda;
CudaNeighList(class LAMMPS *, class NeighList* neigh_list);
~CudaNeighList();
void grow_device(); // grows page memory on the device keeping old pages; grows list memory on the device deleting old lists
void nl_upload(bool will_be_changed=true);
void nl_download(bool will_be_changed=true);
NeighList* neigh_list;
void dev_alloc();
void dev_free();
private:
class Cuda *cuda;
};
}
#endif

269
src/USER-CUDA/cuda_precision.h Normal file
View File

@ -0,0 +1,269 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef CUDA_PRECISION_H_
#define CUDA_PRECISION_H_
/* This file gives type definitions for mixed-precision calculation in the cuda part of LAMMPS-CUDA.
 * Default behaviour is set by the global CUDA_PRECISION (can be overwritten during compilation).
 * ***_FLOAT: type definition of the given property
 * ***_F: literal suffix in code (1.0 is interpreted as double while 1.0f as float; write CUDA_F(1.0) to get the literal that matches the chosen precision)
 */
#ifdef CUDA_USE_BINNING
#define CUDA_IF_BINNING(a) a
#else
#define CUDA_IF_BINNING(a)
#endif
//GLOBAL
#ifdef CUDA_PRECISION
#if CUDA_PRECISION == 1
#define CUDA_FLOAT float
#define CUDA_F(x) x##f
#endif
#if CUDA_PRECISION == 2
#define CUDA_FLOAT double
#define CUDA_F(x) x
#endif
#endif
#ifndef CUDA_PRECISION
#define CUDA_FLOAT double
#define CUDA_F(x) x
#define CUDA_PRECISION 2
#endif
//--------------------------------
//-----------FFT-----------------
//--------------------------------
#ifdef FFT_PRECISION_CU
#if FFT_PRECISION_CU == 1
#define FFT_FLOAT float
#define FFT_F(x) x##f
#endif
#if FFT_PRECISION_CU == 2
#define FFT_FLOAT double
#define FFT_F(x) x
#endif
#endif
#ifndef FFT_PRECISION_CU
#define FFT_FLOAT CUDA_FLOAT
#define FFT_F(x) CUDA_F(x)
#define FFT_PRECISION_CU CUDA_PRECISION
#endif
//--------------------------------
//-----------PPPM-----------------
//--------------------------------
#ifdef PPPM_PRECISION
#if PPPM_PRECISION == 1
#define PPPM_FLOAT float
#define PPPM_F(x) x##f
#endif
#if PPPM_PRECISION == 2
#define PPPM_FLOAT double
#define PPPM_F(x) x
#endif
#endif
#ifndef PPPM_PRECISION
#define PPPM_FLOAT CUDA_FLOAT
#define PPPM_F(x) CUDA_F(x)
#define PPPM_PRECISION CUDA_PRECISION
#endif
//--------------------------------
//-----------FORCE-----------------
//--------------------------------
#ifdef F_PRECISION
#if F_PRECISION == 1
#define F_FLOAT float
#define F_F(x) x##f
#endif
#if F_PRECISION == 2
#define F_FLOAT double
#define F_F(x) x
#endif
#endif
#ifndef F_PRECISION
#define F_FLOAT CUDA_FLOAT
#define F_F(x) CUDA_F(x)
#define F_PRECISION CUDA_PRECISION
#endif
#if F_PRECISION == 1
#define _SQRT_ sqrtf
#define _RSQRT_ rsqrtf
#define _EXP_ expf
#else
#define _SQRT_ sqrt
#define _RSQRT_ rsqrt
#define _EXP_ exp
#endif
#if F_PRECISION == 2
struct F_FLOAT2
{
F_FLOAT x;
F_FLOAT y;
};
struct F_FLOAT3
{
F_FLOAT x;
F_FLOAT y;
F_FLOAT z;
};
struct F_FLOAT4
{
F_FLOAT x;
F_FLOAT y;
F_FLOAT z;
F_FLOAT w;
};
#else
#define F_FLOAT2 float2
#define F_FLOAT3 float3
#define F_FLOAT4 float4
#endif
//--------------------------------
//-----------ENERGY-----------------
//--------------------------------
#ifdef ENERGY_PRECISION
#if ENERGY_PRECISION == 1
#define ENERGY_FLOAT float
#define ENERGY_F(x) x##f
#endif
#if ENERGY_PRECISION == 2
#define ENERGY_FLOAT double
#define ENERGY_F(x) x
#endif
#endif
#ifndef ENERGY_PRECISION
#define ENERGY_FLOAT CUDA_FLOAT
#define ENERGY_F(x) CUDA_F(x)
#define ENERGY_PRECISION CUDA_PRECISION
#endif
//--------------------------------
//-----------POSITIONS------------
//--------------------------------
#ifdef X_PRECISION
#if X_PRECISION == 1
#define X_FLOAT float
#define X_F(x) x##f
#endif
#if X_PRECISION == 2
#define X_FLOAT double
#define X_F(x) x
#endif
#endif
#ifndef X_PRECISION
#define X_FLOAT CUDA_FLOAT
#define X_F(x) CUDA_F(x)
#define X_PRECISION CUDA_PRECISION
#endif
#if X_PRECISION == 2
struct X_FLOAT2
{
X_FLOAT x;
X_FLOAT y;
};
struct X_FLOAT3
{
X_FLOAT x;
X_FLOAT y;
X_FLOAT z;
};
struct X_FLOAT4
{
X_FLOAT x;
X_FLOAT y;
X_FLOAT z;
X_FLOAT w;
};
#else
#define X_FLOAT2 float2
#define X_FLOAT3 float3
#define X_FLOAT4 float4
#endif
//--------------------------------
//-----------velocities-----------
//--------------------------------
#ifdef V_PRECISION
#if V_PRECISION == 1
#define V_FLOAT float
#define V_F(x) x##f
#endif
#if V_PRECISION == 2
#define V_FLOAT double
#define V_F(x) x
#endif
#endif
#ifndef V_PRECISION
#define V_FLOAT CUDA_FLOAT
#define V_F(x) CUDA_F(x)
#define V_PRECISION CUDA_PRECISION
#endif
#if V_PRECISION == 2
struct V_FLOAT4
{
V_FLOAT x;
V_FLOAT y;
V_FLOAT z;
V_FLOAT w;
};
#else
#define V_FLOAT4 float4
#endif
#ifdef NO_PREC_TIMING
struct timespec_2
{
unsigned int tv_sec;
unsigned int tv_nsec;
};
#define timespec timespec_2
#define clock_gettime(a,b)
#endif
#endif /*CUDA_PRECISION_H_*/
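
// Editor's sketch (illustrative, not part of the original commit): kernel
// code is written once against these typedefs, so a single source compiles
// in either precision depending on F_PRECISION:
//
//   __device__ F_FLOAT lj_pair_force(F_FLOAT rsq, F_FLOAT lj1, F_FLOAT lj2)
//   {
//     F_FLOAT r2inv = F_F(1.0) / rsq;
//     F_FLOAT r6inv = r2inv * r2inv * r2inv;
//     return r6inv * (lj1 * r6inv - lj2) * r2inv;
//   }
//
// With F_PRECISION == 1 the literal F_F(1.0) expands to 1.0f and _SQRT_ maps
// to sqrtf, so no accidental double-precision promotion occurs on the GPU.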

378
src/USER-CUDA/cuda_shared.h Normal file
View File

@ -0,0 +1,378 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_SHARED_H_
#define _CUDA_SHARED_H_
#include "cuda_precision.h"
#define CUDA_MAX_DEBUG_SIZE 1000 // size of the debugdata array (holds that many doubles, or twice as many ints)
struct dev_array
{
void* dev_data; // pointer to memory address on cuda device
unsigned dim[3]; // array dimensions
};
struct cuda_shared_atom // relevant data from atom class
{
dev_array dx; // accumulated distance for binning settings
dev_array x; // position
dev_array v; // velocity
dev_array f; // force
dev_array tag; // global atom ID number
dev_array type; // atom type; a ghost type = ntypes exists (ntypescuda = ntypes+1)
dev_array mask;
dev_array image;
dev_array q; // charges
dev_array mass; // per-type masses
dev_array rmass; // per-atom masses
dev_array radius; // per-atom radius
dev_array density;
dev_array omega;
dev_array torque;
dev_array molecule;
dev_array special;
int maxspecial;
dev_array nspecial;
int* special_flag;
int molecular;
dev_array eatom; // per-atom energy
dev_array vatom; // per-atom virial
int need_eatom;
int need_vatom;
dev_array x_type; // position + type in X_FLOAT4 struct
dev_array v_radius; // velocity + radius in V_FLOAT4 struct, currently only used for granular atom_style
dev_array omega_rmass; // omega + rmass in V_FLOAT4 struct, currently only used for granular atom_style
double* mass_host; // host pointer to per-type masses
//int natoms; // total # of atoms in system, could be 0
int nghost; // # of ghost atoms on this proc
int nlocal; // # of owned atoms on this proc
int nall; // total # of owned + ghost atoms on this proc
int nmax; // max # of owned+ghost in arrays on this proc
int ntypes;
int q_flag; // do we have charges?
int rmass_flag; // do we have per-atom masses?
int firstgroup;
int nfirst;
int update_nlocal;
int update_nmax;
dev_array xhold; // position at last neighboring
X_FLOAT triggerneighsq; // maximum square movement before reneighboring
int reneigh_flag; // is reneighboring necessary
int maxhold; // size of xhold
int dist_check; //perform distance check for reneighboring
dev_array binned_id; //id of each binned atom (not tag!!)
dev_array binned_idnew; // new id of each binned atom for sorting; basically sets atom[binned_id[k]] to atom[binned_newid[k]]
float bin_extraspace;
int bin_dim[3];
int bin_nmax;
dev_array map_array;
};
struct cuda_shared_pair // relevant data from pair class
{
char cudable_force; // check for (cudable_force!=0)
X_FLOAT cut_global;
X_FLOAT cut_inner_global;
X_FLOAT cut_coul_global;
double** cut; // type-type cutoff
double** cutsq; // type-type cutoff
double** cut_inner; // type-type cutoff for coul
double** cut_coul; // type-type cutoff for coul
double** coeff1; // type-type pair parameters
double** coeff2;
double** coeff3;
double** coeff4;
double** coeff5;
double** coeff6;
double** coeff7;
double** coeff8;
double** coeff9;
double** coeff10;
double** offset;
double* special_lj;
double* special_coul;
dev_array virial; // ENERGY_FLOAT
dev_array eng_vdwl; // ENERGY_FLOAT
dev_array eng_coul; // ENERGY_FLOAT
X_FLOAT cut_coulsq_global;
F_FLOAT g_ewald,kappa;
int freeze_group_bit;
dev_array coeff1_gm;
dev_array coeff2_gm;
dev_array coeff3_gm;
dev_array coeff4_gm;
dev_array coeff5_gm;
dev_array coeff6_gm;
dev_array coeff7_gm;
dev_array coeff8_gm;
dev_array coeff9_gm;
dev_array coeff10_gm;
int lastgridsize;
int n_energy_virial;
int collect_forces_later;
int use_block_per_atom;
int override_block_per_atom;
};
struct cuda_shared_domain // relevant data from domain class
{
X_FLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc
X_FLOAT subhi[3];
X_FLOAT boxlo[3];
X_FLOAT boxhi[3];
X_FLOAT prd[3];
int periodicity[3]; // xyz periodicity as array
int triclinic;
X_FLOAT xy;
X_FLOAT xz;
X_FLOAT yz;
X_FLOAT boxlo_lamda[3];
X_FLOAT boxhi_lamda[3];
X_FLOAT prd_lamda[3];
X_FLOAT h[6];
X_FLOAT h_inv[6];
V_FLOAT h_rate[6];
int update;
};
struct cuda_shared_pppm
{
char cudable_force;
#ifdef FFT_CUFFT
FFT_FLOAT* work1;
FFT_FLOAT* work2;
FFT_FLOAT* work3;
PPPM_FLOAT* greensfn;
PPPM_FLOAT* fkx;
PPPM_FLOAT* fky;
PPPM_FLOAT* fkz;
PPPM_FLOAT* vg;
#endif
int* part2grid;
PPPM_FLOAT* density_brick;
int* density_brick_int;
PPPM_FLOAT density_intScale;
PPPM_FLOAT* vdx_brick;
PPPM_FLOAT* vdy_brick;
PPPM_FLOAT* vdz_brick;
PPPM_FLOAT* density_fft;
ENERGY_FLOAT* energy;
ENERGY_FLOAT* virial;
int nxlo_in;
int nxhi_in;
int nxlo_out;
int nxhi_out;
int nylo_in;
int nyhi_in;
int nylo_out;
int nyhi_out;
int nzlo_in;
int nzhi_in;
int nzlo_out;
int nzhi_out;
int nx_pppm;
int ny_pppm;
int nz_pppm;
PPPM_FLOAT qqrd2e;
int order;
// float3 sublo;
PPPM_FLOAT* rho_coeff;
int nmax;
int nlocal;
PPPM_FLOAT* debugdata;
PPPM_FLOAT delxinv;
PPPM_FLOAT delyinv;
PPPM_FLOAT delzinv;
int nlower;
int nupper;
PPPM_FLOAT shiftone;
};
struct cuda_shared_comm
{
int maxswap;
int maxlistlength;
dev_array pbc;
dev_array slablo;
dev_array slabhi;
dev_array multilo;
dev_array multihi;
dev_array sendlist;
int grow_flag;
int comm_phase;
int nsend;
int* nsend_swap;
int* send_size;
int* recv_size;
double** buf_send;
void** buf_send_dev;
double** buf_recv;
void** buf_recv_dev;
void* buffer;
int buffer_size;
double overlap_split_ratio;
};
struct cuda_shared_neighlist // member of CudaNeighList, has no instance in cuda_shared_data
{
int maxlocal;
int inum; // # of I atoms neighbors are stored for
int inum_border2;
dev_array inum_border; // # of atoms which interact with border atoms
dev_array ilist; // local indices of I atoms
dev_array ilist_border;
dev_array numneigh;
dev_array numneigh_inner;
dev_array numneigh_border;
dev_array firstneigh;
dev_array neighbors;
dev_array neighbors_border;
dev_array neighbors_inner;
int maxpage;
dev_array page_pointers;
dev_array* pages;
int maxneighbors;
int neigh_lists_per_page;
double** cutneighsq;
CUDA_FLOAT* cu_cutneighsq;
int* binned_id;
int* bin_dim;
int bin_nmax;
float bin_extraspace;
double maxcut;
dev_array ex_type;
int nex_type;
dev_array ex1_bit;
dev_array ex2_bit;
int nex_group;
dev_array ex_mol_bit;
int nex_mol;
};
struct cuda_compile_settings // this is used to compare compile settings (i.e. precision) of the cu files, and the cpp files
{
int prec_glob;
int prec_x;
int prec_v;
int prec_f;
int prec_pppm;
int prec_fft;
int cufft;
int arch;
};
struct cuda_timings_struct
{
//Debug:
double test1;
double test2;
//transfers
double transfer_upload_tmp_constr;
double transfer_download_tmp_deconstr;
//communication
double comm_forward_total;
double comm_forward_mpi_upper;
double comm_forward_mpi_lower;
double comm_forward_kernel_pack;
double comm_forward_kernel_unpack;
double comm_forward_kernel_self;
double comm_forward_upload;
double comm_forward_download;
double comm_exchange_total;
double comm_exchange_mpi;
double comm_exchange_kernel_pack;
double comm_exchange_kernel_unpack;
double comm_exchange_kernel_fill;
double comm_exchange_cpu_pack;
double comm_exchange_upload;
double comm_exchange_download;
double comm_border_total;
double comm_border_mpi;
double comm_border_kernel_pack;
double comm_border_kernel_unpack;
double comm_border_kernel_self;
double comm_border_kernel_buildlist;
double comm_border_upload;
double comm_border_download;
//pair forces
double pair_xtype_conversion;
double pair_kernel;
double pair_virial;
double pair_force_collection;
//neighbor
double neigh_bin;
double neigh_build;
double neigh_special;
//PPPM
double pppm_particle_map;
double pppm_make_rho;
double pppm_brick2fft;
double pppm_poisson;
double pppm_fillbrick;
double pppm_fieldforce;
double pppm_compute;
};
struct cuda_shared_data // holds space for all relevant data from the different classes
{
void* buffer; // holds temporary GPU data [used within subroutines; does not have to stay consistent outside of them]
int buffersize; // max size of buffer
int buffer_new; // should be 1 if the pointer to buffer has changed
void* flag;
void* debugdata; // array for easily collecting debug data from the device; class Cuda holds the corresponding cu_debugdata and the host array
cuda_shared_atom atom;
cuda_shared_pair pair;
cuda_shared_domain domain;
cuda_shared_pppm pppm;
cuda_shared_comm comm;
cuda_compile_settings compile_settings;
cuda_timings_struct cuda_timings;
int exchange_dim;
int me; //mpi rank
unsigned int datamask;
int overlap_comm;
};
#endif // #ifndef _CUDA_SHARED_H_
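
// Editor's sketch (illustrative, not part of the original commit): dev_array
// is the descriptor a cCudaData object fills in, so host classes can hand
// kernels a raw device pointer plus dimensions through cuda_shared_data:
//
//   // as in CudaNeighList::dev_alloc():
//   cu_ilist = new cCudaData<int, int, x>(neigh_list->ilist,
//                                         &sneighlist.ilist,
//                                         sneighlist.maxlocal);
//   // sneighlist.ilist.dev_data now points at the GPU copy and
//   // sneighlist.ilist.dim[0] == maxlocal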

270
src/USER-CUDA/domain_cuda.cpp Normal file
View File

@ -0,0 +1,270 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author (triclinic) : Pieter in 't Veld (SNL)
------------------------------------------------------------------------- */
#include "mpi.h"
#include "stdlib.h"
#include "string.h"
#include "stdio.h"
#include "math.h"
#include "domain_cuda.h"
#include "style_region.h"
#include "atom.h"
#include "force.h"
#include "update.h"
#include "modify.h"
#include "fix.h"
#include "fix_deform.h"
#include "region.h"
#include "lattice.h"
#include "comm.h"
#include "memory.h"
#include "error.h"
#include "cuda.h"
#include "domain_cu.h"
using namespace LAMMPS_NS;
#define BIG 1.0e20
#define SMALL 1.0e-4
#define DELTA 1
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
enum{NO_REMAP,X_REMAP,V_REMAP}; // same as fix_deform.cpp
/* ----------------------------------------------------------------------
default is periodic
------------------------------------------------------------------------- */
DomainCuda::DomainCuda(LAMMPS *lmp) : Domain(lmp)
{
cuda = lmp->cuda;
}
/* ---------------------------------------------------------------------- */
void DomainCuda::init()
{
Domain::init();
if(not cuda->finished_run)
{
cuda->setDomainParams();
Cuda_Domain_Init(&cuda->shared_data);
}
}
/* ----------------------------------------------------------------------
set global box params
assumes boxlo/hi and triclinic tilts are already set
------------------------------------------------------------------------- */
void DomainCuda::set_global_box()
{
Domain::set_global_box();
if(not cuda->finished_run)
{
cuda->setDomainParams();
}
}
/* ----------------------------------------------------------------------
set lamda box params, only need be done one time
assumes global box is defined and proc assignment has been made by comm
for uppermost proc, ensure subhi = 1.0 (in case round-off occurs)
------------------------------------------------------------------------- */
void DomainCuda::set_lamda_box()
{
Domain::set_lamda_box();
if(not cuda->finished_run)
{
cuda->setDomainParams();
}
}
/* ----------------------------------------------------------------------
set local subbox params
assumes global box is defined and proc assignment has been made
for uppermost proc, ensure subhi = boxhi (in case round-off occurs)
------------------------------------------------------------------------- */
void DomainCuda::set_local_box()
{
Domain::set_local_box();
if(not cuda->finished_run)
{
// cuda->setDomainParams();
//Cuda_Domain_Init(&cuda->shared_data);
}
}
/* ----------------------------------------------------------------------
reset global & local boxes due to global box boundary changes
if shrink-wrapped, determine atom extent and reset boxlo/hi
if shrink-wrapped and triclinic, perform shrink-wrap in box coords
------------------------------------------------------------------------- */
void DomainCuda::reset_box()
{
if (nonperiodic == 2) {
// convert back to box coords for shrink-wrap operation
if (triclinic) lamda2x(atom->nlocal);
// compute extent of atoms on this proc
double extent[3][2],all[3][2];
extent[2][0] = extent[1][0] = extent[0][0] = BIG;
extent[2][1] = extent[1][1] = extent[0][1] = -BIG;
double **x = atom->x;
int nlocal = atom->nlocal;
if (cuda->finished_setup&&(!cuda->oncpu))
{
extent[0][0]=cuda->extent[0];
extent[0][1]=cuda->extent[1];
extent[1][0]=cuda->extent[2];
extent[1][1]=cuda->extent[3];
extent[2][0]=cuda->extent[4];
extent[2][1]=cuda->extent[5];
}
else
for (int i = 0; i < nlocal; i++) {
extent[0][0] = MIN(extent[0][0],x[i][0]);
extent[0][1] = MAX(extent[0][1],x[i][0]);
extent[1][0] = MIN(extent[1][0],x[i][1]);
extent[1][1] = MAX(extent[1][1],x[i][1]);
extent[2][0] = MIN(extent[2][0],x[i][2]);
extent[2][1] = MAX(extent[2][1],x[i][2]);
}
// compute extent across all procs
// flip sign of MIN to do it in one Allreduce MAX
extent[0][0] = -extent[0][0];
extent[1][0] = -extent[1][0];
extent[2][0] = -extent[2][0];
MPI_Allreduce(extent,all,6,MPI_DOUBLE,MPI_MAX,world);
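  // (editor's note: this works because min_i a_i == -max_i(-a_i), so a single
  // MPI_MAX reduction yields both extrema; the minima are read back below
  // as -all[d][0])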
// in shrink-wrapped dims, set box by atom extent
// if minimum set, enforce min box size settings
if (xperiodic == 0) {
if (boundary[0][0] == 2) boxlo[0] = -all[0][0] - SMALL;
else if (boundary[0][0] == 3) boxlo[0] = MIN(-all[0][0]-SMALL,minxlo);
if (boundary[0][1] == 2) boxhi[0] = all[0][1] + SMALL;
else if (boundary[0][1] == 3) boxhi[0] = MAX(all[0][1]+SMALL,minxhi);
if (boxlo[0] > boxhi[0]) error->all("Illegal simulation box");
}
if (yperiodic == 0) {
if (boundary[1][0] == 2) boxlo[1] = -all[1][0] - SMALL;
else if (boundary[1][0] == 3) boxlo[1] = MIN(-all[1][0]-SMALL,minylo);
if (boundary[1][1] == 2) boxhi[1] = all[1][1] + SMALL;
else if (boundary[1][1] == 3) boxhi[1] = MAX(all[1][1]+SMALL,minyhi);
if (boxlo[1] > boxhi[1]) error->all("Illegal simulation box");
}
if (zperiodic == 0) {
if (boundary[2][0] == 2) boxlo[2] = -all[2][0] - SMALL;
else if (boundary[2][0] == 3) boxlo[2] = MIN(-all[2][0]-SMALL,minzlo);
if (boundary[2][1] == 2) boxhi[2] = all[2][1] + SMALL;
else if (boundary[2][1] == 3) boxhi[2] = MAX(all[2][1]+SMALL,minzhi);
if (boxlo[2] > boxhi[2]) error->all("Illegal simulation box");
}
}
set_global_box();
set_local_box();
if(not cuda->finished_run)
{
cuda->setDomainParams();
Cuda_Domain_Init(&cuda->shared_data);
}
// if shrink-wrapped, convert to lamda coords for new box
// must re-invoke pbc() b/c x2lamda result can be outside 0,1 due to roundoff
if (nonperiodic == 2 && triclinic) {
x2lamda(atom->nlocal);
pbc();
}
}
/* ----------------------------------------------------------------------
enforce PBC and modify box image flags for each atom
called every reneighboring and by other commands that change atoms
resulting coord must satisfy lo <= coord < hi
MAX is important since coord - prd < lo can happen when coord = hi
if fix deform, remap velocity of fix group atoms by box edge velocities
for triclinic, atoms must be in lamda coords (0-1) before pbc is called
image = 10 bits for each dimension
increment/decrement in wrap-around fashion
------------------------------------------------------------------------- */
void DomainCuda::pbc()
{
if(cuda->finished_setup&&(!cuda->oncpu))
{
cuda->setDomainParams();
Cuda_Domain_PBC(&cuda->shared_data, deform_vremap, deform_groupbit,cuda->extent);
return;
}
Domain::pbc();
}
/* ----------------------------------------------------------------------
convert triclinic 0-1 lamda coords to box coords for all N atoms
x = H lamda + x0;
------------------------------------------------------------------------- */
void DomainCuda::lamda2x(int n)
{
if(cuda->finished_setup&&(!cuda->oncpu))
{
Cuda_Domain_lamda2x(&cuda->shared_data,n);
return;
}
Domain::lamda2x(n);
}
/* ----------------------------------------------------------------------
convert box coords to triclinic 0-1 lamda coords for all N atoms
lamda = H^-1 (x - x0)
------------------------------------------------------------------------- */
void DomainCuda::x2lamda(int n)
{
if(cuda->finished_setup&&(!cuda->oncpu))
{
Cuda_Domain_x2lamda(&cuda->shared_data,n);
return;
}
Domain::x2lamda(n);
}
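
/* Editor's note (illustrative, not part of the original commit): every
   override in this class follows the same pattern -- if the device holds the
   current data (finished_setup && !oncpu), run the Cuda_Domain_* kernel and
   return; otherwise fall through to the base-class CPU implementation. */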

41
src/USER-CUDA/domain_cuda.h Normal file
View File

@ -0,0 +1,41 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifndef LMP_DOMAIN_CUDA_H
#define LMP_DOMAIN_CUDA_H
#include "pointers.h"
#include "domain.h"
namespace LAMMPS_NS {
class DomainCuda : public Domain {
public:
DomainCuda(class LAMMPS *);
void init();
void set_global_box();
void set_lamda_box();
void set_local_box();
void reset_box();
void pbc();
void lamda2x(int);
void x2lamda(int);
protected:
class Cuda *cuda;
};
}
#endif

442
src/USER-CUDA/modify_cuda.cpp Normal file
View File

@ -0,0 +1,442 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#include <cstdio>
#include <cstring>
#include "modify_cuda.h"
#include "style_compute.h"
#include "style_fix.h"
#include "atom.h"
#include "comm.h"
#include "fix.h"
#include "compute.h"
#include "group.h"
#include "update.h"
#include "domain.h"
#include "cuda.h"
#include "memory.h"
#include "error.h"
using namespace LAMMPS_NS;
#define DELTA 4
// mask settings - same as in fix.cpp
#define INITIAL_INTEGRATE 1
#define POST_INTEGRATE 2
#define PRE_EXCHANGE 4
#define PRE_NEIGHBOR 8
#define PRE_FORCE 16
#define POST_FORCE 32
#define FINAL_INTEGRATE 64
#define END_OF_STEP 128
#define THERMO_ENERGY 256
#define INITIAL_INTEGRATE_RESPA 512
#define POST_INTEGRATE_RESPA 1024
#define PRE_FORCE_RESPA 2048
#define POST_FORCE_RESPA 4096
#define FINAL_INTEGRATE_RESPA 8192
#define MIN_PRE_EXCHANGE 16384
#define MIN_POST_FORCE 32768
#define MIN_ENERGY 65536
#include "cuda_modify_flags.h"
#define MIN(A,B) (((A) < (B)) ? (A) : (B))
#define MAX(A,B) (((A) > (B)) ? (A) : (B))
#define BIG 1.0e20
/* ---------------------------------------------------------------------- */
ModifyCuda::ModifyCuda(LAMMPS *lmp) : Modify(lmp)
{
cuda = lmp->cuda;
n_initial_integrate_cuda = 0;
n_post_integrate_cuda = 0;
n_pre_exchange_cuda = 0;
n_pre_neighbor_cuda = 0;
n_pre_force_cuda = 0;
n_post_force_cuda = 0;
n_final_integrate_cuda = 0;
n_end_of_step_cuda = 0;
n_thermo_energy_cuda = 0;
n_initial_integrate_host = 0;
n_post_integrate_host = 0;
n_pre_exchange_host = 0;
n_pre_neighbor_host = 0;
n_pre_force_host = 0;
n_post_force_host = 0;
n_final_integrate_host = 0;
n_end_of_step_host = 0;
n_thermo_energy_host = 0;
list_initial_integrate_cuda = NULL;
list_post_integrate_cuda = NULL;
list_pre_exchange_cuda = NULL;
list_pre_neighbor_cuda = NULL;
list_pre_force_cuda = NULL;
list_post_force_cuda = NULL;
list_final_integrate_cuda = NULL;
list_end_of_step_cuda = NULL;
list_thermo_energy_cuda = NULL;
end_of_step_every_cuda = NULL;
}
/* ---------------------------------------------------------------------- */
ModifyCuda::~ModifyCuda()
{
delete [] list_initial_integrate_cuda;
delete [] list_post_integrate_cuda;
delete [] list_pre_exchange_cuda;
delete [] list_pre_neighbor_cuda;
delete [] list_pre_force_cuda;
delete [] list_post_force_cuda;
delete [] list_final_integrate_cuda;
delete [] list_end_of_step_cuda;
delete [] list_thermo_energy_cuda;
delete [] end_of_step_every_cuda;
}
/* ----------------------------------------------------------------------
initialize all fixes and computes
------------------------------------------------------------------------- */
void ModifyCuda::init()
{
int i,j;
// delete storage of restart info since it is not valid after 1st run
restart_deallocate();
// create lists of fixes to call at each stage of run
list_init(INITIAL_INTEGRATE,n_initial_integrate,list_initial_integrate);
list_init(POST_INTEGRATE,n_post_integrate,list_post_integrate);
list_init(PRE_EXCHANGE,n_pre_exchange,list_pre_exchange);
list_init(PRE_NEIGHBOR,n_pre_neighbor,list_pre_neighbor);
list_init(PRE_FORCE,n_pre_force,list_pre_force);
list_init(POST_FORCE,n_post_force,list_post_force);
list_init(FINAL_INTEGRATE,n_final_integrate,list_final_integrate);
list_init_end_of_step(END_OF_STEP,n_end_of_step,list_end_of_step);
list_init_thermo_energy(THERMO_ENERGY,n_thermo_energy,list_thermo_energy);
list_init(INITIAL_INTEGRATE_CUDA, n_initial_integrate_cuda, list_initial_integrate_cuda);
list_init(POST_INTEGRATE_CUDA, n_post_integrate_cuda, list_post_integrate_cuda);
list_init(PRE_EXCHANGE_CUDA, n_pre_exchange_cuda, list_pre_exchange_cuda);
list_init(PRE_NEIGHBOR_CUDA, n_pre_neighbor_cuda, list_pre_neighbor_cuda);
list_init(PRE_FORCE_CUDA, n_pre_force_cuda, list_pre_force_cuda);
list_init(POST_FORCE_CUDA, n_post_force_cuda, list_post_force_cuda);
list_init(FINAL_INTEGRATE_CUDA, n_final_integrate_cuda, list_final_integrate_cuda);
list_init_end_of_step_cuda(END_OF_STEP_CUDA, n_end_of_step_cuda, list_end_of_step_cuda);
list_init_thermo_energy(THERMO_ENERGY_CUDA, n_thermo_energy_cuda, list_thermo_energy_cuda);
n_initial_integrate_host = n_initial_integrate;
n_post_integrate_host = n_post_integrate;
n_pre_exchange_host = n_pre_exchange;
n_pre_neighbor_host = n_pre_neighbor;
n_pre_force_host = n_pre_force;
n_post_force_host = n_post_force;
n_final_integrate_host = n_final_integrate;
n_end_of_step_host = n_end_of_step;
n_thermo_energy_host = n_thermo_energy;
n_initial_integrate = n_initial_integrate_cuda+n_initial_integrate_host;
n_post_integrate = n_post_integrate_cuda+n_post_integrate_host;
n_pre_exchange = n_pre_exchange_cuda+n_pre_exchange_host;
n_pre_neighbor = n_pre_neighbor_cuda+n_pre_neighbor_host;
n_pre_force = n_pre_force_cuda+n_pre_force_host;
n_post_force = n_post_force_cuda+n_post_force_host;
n_final_integrate = n_final_integrate_cuda+n_final_integrate_host;
n_end_of_step = n_end_of_step_cuda+n_end_of_step_host;
n_thermo_energy = n_thermo_energy_cuda+n_thermo_energy_host;
list_init(INITIAL_INTEGRATE_RESPA,
n_initial_integrate_respa,list_initial_integrate_respa);
list_init(POST_INTEGRATE_RESPA,
n_post_integrate_respa,list_post_integrate_respa);
list_init(POST_FORCE_RESPA,
n_post_force_respa,list_post_force_respa);
list_init(PRE_FORCE_RESPA,
n_pre_force_respa,list_pre_force_respa);
list_init(FINAL_INTEGRATE_RESPA,
n_final_integrate_respa,list_final_integrate_respa);
list_init(MIN_PRE_EXCHANGE,n_min_pre_exchange,list_min_pre_exchange);
list_init(MIN_POST_FORCE,n_min_post_force,list_min_post_force);
list_init(MIN_ENERGY,n_min_energy,list_min_energy);
// init each fix
// needs to come before compute init
// this is b/c some computes call fix->dof()
// FixRigid::dof() depends on its own init having been called
comm->maxforward_fix = comm->maxreverse_fix = 0;
for (i = 0; i < nfix; i++) fix[i]->init();
// set global flag if any fix has its restart_pbc flag set
restart_pbc_any = 0;
for (i = 0; i < nfix; i++)
if (fix[i]->restart_pbc) restart_pbc_any = 1;
// create list of computes that store invocation times
list_init_compute();
// init each compute
// set invoked_scalar,vector,etc to -1 to force new run to re-compute them
// add initial timestep to all computes that store invocation times
// since any of them may be invoked by initial thermo
// do not clear out invocation times stored within a compute,
// b/c some may be holdovers from previous run, like for ave fixes
for (i = 0; i < ncompute; i++) {
compute[i]->init();
compute[i]->invoked_scalar = -1;
compute[i]->invoked_vector = -1;
compute[i]->invoked_array = -1;
compute[i]->invoked_peratom = -1;
compute[i]->invoked_local = -1;
}
addstep_compute_all(update->ntimestep);
// warn if any particle is time integrated more than once
int nlocal = atom->nlocal;
int *mask = atom->mask;
int *flag = new int[nlocal];
for (i = 0; i < nlocal; i++) flag[i] = 0;
int groupbit;
for (i = 0; i < nfix; i++) {
if (fix[i]->time_integrate == 0) continue;
groupbit = fix[i]->groupbit;
for (j = 0; j < nlocal; j++)
if (mask[j] & groupbit) flag[j]++;
}
int check = 0;
for (i = 0; i < nlocal; i++)
if (flag[i] > 1) check = 1;
delete [] flag;
int checkall;
MPI_Allreduce(&check,&checkall,1,MPI_INT,MPI_SUM,world);
if (comm->me == 0 && checkall)
error->warning("One or more atoms are time integrated more than once");
}
/* ----------------------------------------------------------------------
1st half of integrate call, only for relevant fixes
------------------------------------------------------------------------- */
void ModifyCuda::initial_integrate(int vflag)
{
for(int i = 0; i < n_initial_integrate_cuda; i++)
fix[list_initial_integrate_cuda[i]]->initial_integrate(vflag);
if(n_initial_integrate_host != 0)
{
cuda->downloadAll(); cuda->oncpu = true;
for (int i = 0; i < n_initial_integrate_host; i++)
fix[list_initial_integrate[i]]->initial_integrate(vflag);
cuda->uploadAll(); cuda->oncpu = false;
}
}
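/* Editor's note (illustrative, not part of the original commit): each hook
   below repeats this split -- the *_cuda fixes run directly on device data,
   and the full downloadAll()/uploadAll() round trip is paid only when
   host-side fixes are present:

     run device fixes;
     if (n_<hook>_host) { downloadAll; run host fixes; uploadAll; }
*/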
/* ----------------------------------------------------------------------
post_integrate call, only for relevant fixes
------------------------------------------------------------------------- */
void ModifyCuda::post_integrate()
{
for(int i = 0; i < n_post_integrate_cuda; i++)
fix[list_post_integrate_cuda[i]]->post_integrate();
if(n_post_integrate_host != 0)
{
cuda->downloadAll(); cuda->oncpu = true;
for (int i = 0; i < n_post_integrate_host; i++)
fix[list_post_integrate[i]]->post_integrate();
cuda->uploadAll(); cuda->oncpu = false;
}
}
/* ----------------------------------------------------------------------
pre_exchange call, only for relevant fixes
------------------------------------------------------------------------- */
void ModifyCuda::pre_exchange()
{
for(int i = 0; i < n_pre_exchange_cuda; i++)
fix[list_pre_exchange_cuda[i]]->pre_exchange();
if(n_pre_exchange_host != 0)
{
cuda->downloadAll(); cuda->oncpu = true;
for (int i = 0; i < n_pre_exchange_host; i++)
fix[list_pre_exchange[i]]->pre_exchange();
cuda->uploadAll(); cuda->oncpu = false;
}
}
/* ----------------------------------------------------------------------
pre_neighbor call, only for relevant fixes
------------------------------------------------------------------------- */
void ModifyCuda::pre_neighbor()
{
for(int i = 0; i < n_pre_neighbor_cuda; i++)
fix[list_pre_neighbor_cuda[i]]->pre_neighbor();
if(n_pre_neighbor_host != 0)
{
cuda->downloadAll(); cuda->oncpu = true;
for (int i = 0; i < n_pre_neighbor_host; i++)
fix[list_pre_neighbor[i]]->pre_neighbor();
cuda->uploadAll(); cuda->oncpu = false;
}
}
/* ----------------------------------------------------------------------
pre_force call, only for relevant fixes
------------------------------------------------------------------------- */
void ModifyCuda::pre_force(int vflag)
{
for(int i = 0; i < n_pre_force_cuda; i++)
fix[list_pre_force_cuda[i]]->pre_force(vflag);
if(n_pre_force_host != 0)
{
cuda->downloadAll(); cuda->oncpu = true;
for (int i = 0; i < n_pre_force_host; i++)
fix[list_pre_force[i]]->pre_force(vflag);
cuda->uploadAll(); cuda->oncpu = false;
}
}
/* ----------------------------------------------------------------------
post_force call, only for relevant fixes
------------------------------------------------------------------------- */
void ModifyCuda::post_force(int vflag)
{
for(int i = 0; i < n_post_force_cuda; i++)
fix[list_post_force_cuda[i]]->post_force(vflag);
if(n_post_force_host != 0)
{
cuda->downloadAll(); cuda->oncpu = true;
for (int i = 0; i < n_post_force_host; i++)
fix[list_post_force[i]]->post_force(vflag);
cuda->uploadAll(); cuda->oncpu = false;
}
}
/* ----------------------------------------------------------------------
2nd half of integrate call, only for relevant fixes
------------------------------------------------------------------------- */
void ModifyCuda::final_integrate()
{
for (int i = 0; i < n_final_integrate_cuda; i++)
fix[list_final_integrate_cuda[i]]->final_integrate();
if(n_final_integrate_host != 0)
{
cuda->downloadAll(); cuda->oncpu = true;
for (int i = 0; i < n_final_integrate_host; i++)
fix[list_final_integrate[i]]->final_integrate();
cuda->uploadAll(); cuda->oncpu = false;
}
}
/* ----------------------------------------------------------------------
end-of-timestep call, only for relevant fixes
only call fix->end_of_step() on timesteps that are multiples of nevery
------------------------------------------------------------------------- */
void ModifyCuda::end_of_step()
{
for (int i = 0; i < n_end_of_step_cuda; i++)
if (update->ntimestep % end_of_step_every_cuda[i] == 0)
fix[list_end_of_step_cuda[i]]->end_of_step();
if(n_end_of_step_host != 0)
{
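// check nevery values first so the costly download/upload pair happens
// only on steps where at least one host fix actually runs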
int do_thisstep=0;
for (int i = 0; i < n_end_of_step_host; i++)
if (update->ntimestep % end_of_step_every[i] == 0) do_thisstep=1;
if(do_thisstep)
{
cuda->downloadAll(); cuda->oncpu = true;
for (int i = 0; i < n_end_of_step_host; i++)
if (update->ntimestep % end_of_step_every[i] == 0)
fix[list_end_of_step[i]]->end_of_step();
cuda->uploadAll(); cuda->oncpu = false;
}
}
}
/* ----------------------------------------------------------------------
thermo energy call, only for relevant fixes
called by Thermo class
compute_scalar() is fix call to return energy
------------------------------------------------------------------------- */
double ModifyCuda::thermo_energy()
{
double energy = 0.0;
for (int i = 0; i < n_thermo_energy_cuda; i++)
energy += fix[list_thermo_energy_cuda[i]]->compute_scalar();
if(n_thermo_energy_host != 0)
{
cuda->downloadAll(); cuda->oncpu = true;
for (int i = 0; i < n_thermo_energy_host; i++)
energy += fix[list_thermo_energy[i]]->compute_scalar();
cuda->uploadAll(); cuda->oncpu = false;
}
return energy;
}
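/* ----------------------------------------------------------------------
   create list of end_of_step fixes with CUDA support
   also store each fix's nevery setting for use by end_of_step()
------------------------------------------------------------------------- */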
void ModifyCuda::list_init_end_of_step_cuda(int mask, int &n, int *&list)
{
delete [] list;
delete [] end_of_step_every_cuda;
n = 0;
for (int i = 0; i < nfix; i++) if (fmask[i] & mask) n++;
list = new int[n];
end_of_step_every_cuda = new int[n];
n = 0;
for (int i = 0; i < nfix; i++)
if (fmask[i] & mask) {
list[n] = i;
end_of_step_every_cuda[n++] = fix[i]->nevery;
}
}

82
src/USER-CUDA/modify_cuda.h Normal file
View File

@ -0,0 +1,82 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifndef LMP_MODIFY_CUDA_H
#define LMP_MODIFY_CUDA_H
#include <cstdio>
#include "modify.h"
namespace LAMMPS_NS {
class ModifyCuda : public Modify {
public:
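// per-stage counts: fixes with device (CUDA) implementations vs. host-only fixes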
int n_initial_integrate_cuda;
int n_post_integrate_cuda;
int n_pre_exchange_cuda;
int n_pre_neighbor_cuda;
int n_pre_force_cuda;
int n_post_force_cuda;
int n_final_integrate_cuda;
int n_end_of_step_cuda;
int n_thermo_energy_cuda;
int n_initial_integrate_host;
int n_post_integrate_host;
int n_pre_exchange_host;
int n_pre_neighbor_host;
int n_pre_force_host;
int n_post_force_host;
int n_final_integrate_host;
int n_end_of_step_host;
int n_thermo_energy_host;
ModifyCuda(class LAMMPS *);
~ModifyCuda();
void init();
void initial_integrate(int);
void post_integrate();
//void pre_decide();
void pre_exchange();
void pre_neighbor();
void pre_force(int);
void post_force(int);
void final_integrate();
void end_of_step();
double thermo_energy();
protected:
class Cuda *cuda;
// lists of fixes to apply at different stages of timestep
// list of cuda fixes
int *list_initial_integrate_cuda;
int *list_post_integrate_cuda;
int *list_pre_exchange_cuda;
int *list_pre_neighbor_cuda;
int *list_pre_force_cuda;
int *list_post_force_cuda;
int *list_final_integrate_cuda;
int *list_end_of_step_cuda;
int *list_thermo_energy_cuda;
int *end_of_step_every_cuda;
void list_init_end_of_step_cuda(int, int &, int *&);
};
}
#endif

317
src/USER-CUDA/neigh_full_cuda.cpp Normal file
View File

@ -0,0 +1,317 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifdef CUDA
#include "neighbor_cuda.h"
#include "neigh_list.h"
#include "atom.h"
#include "domain.h"
#include "group.h"
#include "error.h"
#include "cuda_neigh_list.h"
#include "cuda.h"
#include "neighbor_cu.h"
#include <cmath>
using namespace LAMMPS_NS;
/* ----------------------------------------------------------------------
   binned neighbor list construction on the GPU, for all neighbors
   every neighbor pair appears in list of both atoms i and j
------------------------------------------------------------------------- */
void NeighborCuda::full_bin_cuda(NeighList *list)
{
MYDBG(printf(" # CUDA::NeighFullBinCuda ... start\n");)
if(includegroup) error->warning("Using includegroup with a neighbor build is not yet supported by the CUDA neighbor build styles.\n");
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
if(nlocal==0) return;
if(not list->cuda_list) cuda->registerNeighborList(list);
CudaNeighList* clist=list->cuda_list;
cuda_shared_neighlist* slist=&clist->sneighlist;
clist->build_cuda=true;
if(slist->bin_extraspace<0.09)
{
for(int i=1;i<=atom->ntypes;i++)
for(int j=1;j<=atom->ntypes;j++)
{
if(slist->maxcut<cutneighsq[i][j]) slist->maxcut=cutneighsq[i][j];
}
slist->maxcut=sqrt(slist->maxcut);
}
int bin_dim_tmp[3];
int bin_nmax_tmp;
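// pick bin dimensions from the subdomain extent and the largest cutoff;
// if the average atom count per bin drops below 32, grow the effective
// cutoff by 20% and re-bin so each bin offers enough parallel work;
// the +4 per dimension presumably leaves room for ghost-atom bins;
// redo everything whenever Cuda_BinAtoms reports a bin overflow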
do
{
do
{
bin_dim_tmp[0]=static_cast <int> ((domain->subhi[0]-domain->sublo[0])/slist->maxcut);
bin_dim_tmp[1]=static_cast <int> ((domain->subhi[1]-domain->sublo[1])/slist->maxcut);
bin_dim_tmp[2]=static_cast <int> ((domain->subhi[2]-domain->sublo[2])/slist->maxcut);
if(bin_dim_tmp[0]==0) bin_dim_tmp[0]+=1;
if(bin_dim_tmp[1]==0) bin_dim_tmp[1]+=1;
if(bin_dim_tmp[2]==0) bin_dim_tmp[2]+=1;
bin_nmax_tmp=static_cast <int> ((1.0+slist->bin_extraspace)*nlocal/(bin_dim_tmp[0]*bin_dim_tmp[1]*bin_dim_tmp[2]));
bin_dim_tmp[0]+=4;
bin_dim_tmp[1]+=4;
bin_dim_tmp[2]+=4;
if(bin_nmax_tmp<32) slist->maxcut*=1.2;
// printf("slist->maxcut: %lf\n", slist->maxcut);
} while(bin_nmax_tmp<32);
if((slist->bin_dim[0]!=bin_dim_tmp[0])||(slist->bin_dim[1]!=bin_dim_tmp[1])||(slist->bin_dim[2]!=bin_dim_tmp[2])||(slist->bin_nmax!=bin_nmax_tmp))
{
if(slist->binned_id!=NULL)
CudaWrapper_FreeCudaData(slist->binned_id,slist->bin_dim[0]*slist->bin_dim[1]*slist->bin_dim[2]*slist->bin_nmax*sizeof(int));
slist->bin_dim[0] = bin_dim_tmp[0];
slist->bin_dim[1] = bin_dim_tmp[1];
slist->bin_dim[2] = bin_dim_tmp[2];
slist->bin_nmax = bin_nmax_tmp;
slist->binned_id=(int*) CudaWrapper_AllocCudaData(slist->bin_dim[0]*slist->bin_dim[1]*slist->bin_dim[2]*slist->bin_nmax*sizeof(int));
//printf("slist->bin: %i %i %i %i \n", bin_dim_tmp[0],bin_dim_tmp[1],bin_dim_tmp[2],bin_nmax_tmp);
}
//if(list->cuda_list->sneighlist.bin_nmax>512) error->all("Too many atoms per bin. Likely cause is a very long pair cutoff. This needs a major rewrite of the code and is not yet scheduled to be done.\n");
}while(Cuda_BinAtoms(&cuda->shared_data, &list->cuda_list->sneighlist));
int maxneighbors=slist->maxneighbors;
int *ilist = list->ilist;
int *numneigh = list->numneigh;
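// mirror the neighbor exclusion settings (by type, group, or molecule) into
// device buffers, but only when they changed since the previous build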
if((nex_type!=slist->nex_type)||
(nex_group!=slist->nex_group)||
(nex_mol!=slist->nex_mol))
{
slist->nex_type=nex_type;
slist->nex_group=nex_group;
slist->nex_mol=nex_mol;
//printf("%i %i %i\n",nex_type,nex_group,nex_mol);
if(nex_type)
{
delete clist->cu_ex_type;
clist->cu_ex_type=new cCudaData<int , int , x> (&ex_type[0][0] , & slist->ex_type , (atom->ntypes+1)*(atom->ntypes+1) );
clist->cu_ex_type->upload();
}
//printf("AA %i %i %i\n",nex_type,nex_group,nex_mol);
if(nex_group)
{
delete clist->cu_ex1_bit;
clist->cu_ex1_bit=new cCudaData<int , int , x> (ex1_bit , & slist->ex1_bit , nex_group );
clist->cu_ex1_bit->upload();
//printf("A %i %i %i\n",nex_type,nex_group,nex_mol);
delete clist->cu_ex2_bit;
clist->cu_ex2_bit=new cCudaData<int , int , x> (ex2_bit , & slist->ex2_bit , nex_group );
clist->cu_ex2_bit->upload();
}
//printf("B %i %i %i\n",nex_type,nex_group,nex_mol);
if(nex_mol)
{
delete clist->cu_ex_mol_bit;
clist->cu_ex_mol_bit=new cCudaData<int , int , x> (ex_mol_bit , & slist->ex_mol_bit , nex_mol );
clist->cu_ex_mol_bit->upload();
}
//printf("C %i %i %i\n",nex_type,nex_group,nex_mol);
}
int overflow = 0;
int inum = 0;
int npnt = 0;
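// device-side build with retry: a negative return from the build kernel
// signals overflow and encodes the per-atom neighbor capacity that was
// needed, so maxneighbors is grown in multiples of 32 and the device
// arrays are reallocated before trying again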
do
{
npnt=0;
inum=0;
overflow=0;
clist->grow_device();
slist->cutneighsq=cutneighsq;
slist->maxneighbors=maxneighbors;
slist->inum = list->inum = nlocal;
if(cuda->shared_data.overlap_comm)
{
list->cuda_list->inum_border=0;
list->cuda_list->cu_inum_border->upload();
}
cuda->shared_data.atom.nall=nall;
overflow= Cuda_NeighborBuildFullBin(&cuda->shared_data, &list->cuda_list->sneighlist);
if(overflow<0)
{
maxneighbors+=32;
if(-overflow>maxneighbors) maxneighbors=((-overflow+37)/32)*32;
delete list->cuda_list->cu_neighbors;
delete [] list->cuda_list->neighbors;
list->cuda_list->neighbors= new int[slist->maxlocal*maxneighbors];
list->cuda_list->sneighlist.maxneighbors=maxneighbors;
//printf("maxneighborsA1: %i %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax,slist->maxlocal);
list->cuda_list->cu_neighbors= new cCudaData<int, int, x> (list->cuda_list->neighbors , & list->cuda_list->sneighlist.neighbors, slist->maxlocal*maxneighbors );
//printf("maxneighborsA2: %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax);
if(cuda->shared_data.overlap_comm)
{
list->cuda_list->sneighlist.maxneighbors=maxneighbors;
list->cuda_list->dev_free();
list->cuda_list->dev_alloc();
}
//printf("maxneighborsA3: %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax);
}
//printf("maxneighborsB: %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax);
if(cuda->shared_data.overlap_comm)
{
list->cuda_list->cu_inum_border->download();
list->cuda_list->sneighlist.inum_border2=list->cuda_list->inum_border;
}
}
while(overflow<0);
list->cuda_list->cu_numneigh->download();
list->cuda_list->cu_ilist->download();
//printf("Done\n");
MYDBG(printf(" # CUDA::NeighFullBinCuda ... end\n");)
}
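/* ----------------------------------------------------------------------
   N^2 search for all neighbors: no longer supported on the GPU
------------------------------------------------------------------------- */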
void NeighborCuda::full_nsq_cuda(NeighList *list)
{
printf("Full_Nsq cuda neighbor list build is not implemented anymore.\n");
return;
/*
MYDBG(printf(" # CUDA::NeighFullNSQCuda ... start\n");)
int nlocal = atom->nlocal;
int nall = nlocal + atom->nghost;
if(cuda->cu_xhold) cuda->cu_xhold->upload();
if(not list->cuda_list) cuda->registerNeighborList(list);
list->cuda_list->build_cuda=true;
int maxneighbors=list->cuda_list->sneighlist.maxneighbors;
int neigh_lists_per_page=pgsize/maxneighbors;
int *ilist = list->ilist;
int *numneigh = list->numneigh;
int **firstneigh = list->firstneigh;
int **pages = list->pages;
int overflow = 0;
int inum = 0;
int npage = 0;
int npnt = 0;
do
{
npage=0;
npnt=0;
inum=0;
overflow=0;
neigh_lists_per_page=pgsize/maxneighbors;
npage=(2*nlocal*maxneighbors-1)/pgsize;
while(npage>list->maxpage) list->add_pages();
pages = list->pages;
npage=0;
list->cuda_list->sneighlist.neigh_lists_per_page=pgsize/maxneighbors;
list->cuda_list->grow_device();
list->cuda_list->sneighlist.cutneighsq=cutneighsq;
list->cuda_list->sneighlist.maxneighbors=maxneighbors;
list->cuda_list->sneighlist.inum = list->inum = nlocal;
cuda->shared_data.atom.nall=nall;
Cuda_NeighborReBuildFirstneigh(&cuda->shared_data, &list->cuda_list->sneighlist);
overflow= not Cuda_NeighborBuildFullNsq(&cuda->shared_data, &list->cuda_list->sneighlist);
if(overflow) maxneighbors+=32;
}
while(overflow);
if(not cudable) list->cuda_list->nl_download();
MYDBG(printf(" # CUDA::NeighFullNSQCuda ... end\n");)
*/
}
#endif

221
src/USER-CUDA/neighbor_cuda.cpp Normal file
View File

@ -0,0 +1,221 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#include "math.h"
#include "neighbor_cuda.h"
#include "cuda.h"
#include "atom.h"
#include "atom_vec.h"
#include "domain.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "force.h"
#include "group.h"
#include "memory.h"
#include "error.h"
using namespace LAMMPS_NS;
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
enum{NSQ,BIN,MULTI}; // also in neigh_list.cpp
/* ---------------------------------------------------------------------- */
NeighborCuda::NeighborCuda(LAMMPS *lmp) : Neighbor(lmp)
{
cuda = lmp->cuda;
}
/* ---------------------------------------------------------------------- */
void NeighborCuda::init()
{
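// hand the device the rebuild trigger: (skin/2)^2, the same half-skin
// displacement criterion used by the host-side check_distance()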
cuda->set_neighinit(dist_check,0.25*skin*skin);
cudable = 1;
Neighbor::init();
}
/* ----------------------------------------------------------------------
overwrite either full_nsq or full_bin with CUDA-equivalent methods
any other neighbor build method is unchanged
------------------------------------------------------------------------- */
void NeighborCuda::choose_build(int index, NeighRequest *rq)
{
Neighbor::choose_build(index,rq);
if (rq->full && style == NSQ && rq->ghost == 0 && rq->cudable)
pair_build[index] = (Neighbor::PairPtr) &NeighborCuda::full_nsq_cuda;
else if (rq->full && style == BIN && rq->ghost == 0 && rq->cudable)
pair_build[index] = (Neighbor::PairPtr) &NeighborCuda::full_bin_cuda;
}
/* ---------------------------------------------------------------------- */
int NeighborCuda::check_distance()
{
double delx,dely,delz,rsq;
double delta,deltasq,delta1,delta2;
if (boxcheck) {
if (triclinic == 0) {
delx = bboxlo[0] - boxlo_hold[0];
dely = bboxlo[1] - boxlo_hold[1];
delz = bboxlo[2] - boxlo_hold[2];
delta1 = sqrt(delx*delx + dely*dely + delz*delz);
delx = bboxhi[0] - boxhi_hold[0];
dely = bboxhi[1] - boxhi_hold[1];
delz = bboxhi[2] - boxhi_hold[2];
delta2 = sqrt(delx*delx + dely*dely + delz*delz);
delta = 0.5 * (skin - (delta1+delta2));
deltasq = delta*delta;
} else {
domain->box_corners();
delta1 = delta2 = 0.0;
for (int i = 0; i < 8; i++) {
delx = corners[i][0] - corners_hold[i][0];
dely = corners[i][1] - corners_hold[i][1];
delz = corners[i][2] - corners_hold[i][2];
delta = sqrt(delx*delx + dely*dely + delz*delz);
if (delta > delta1) delta1 = delta;
else if (delta > delta2) delta2 = delta;
}
delta = 0.5 * (skin - (delta1+delta2));
deltasq = delta*delta;
}
} else deltasq = triggersq;
double **x = atom->x;
int nlocal = atom->nlocal;
if (includegroup) nlocal = atom->nfirst;
int flag = 0;
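// two paths: download positions and check displacements on the host, or
// reuse the reneighbor flag the device-side integrator has already computed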
if (not cuda->neighbor_decide_by_integrator) {
cuda->cu_x_download();
for (int i = 0; i < nlocal; i++) {
delx = x[i][0] - xhold[i][0];
dely = x[i][1] - xhold[i][1];
delz = x[i][2] - xhold[i][2];
rsq = delx*delx + dely*dely + delz*delz;
if (rsq > deltasq) flag = 1;
}
}
else flag = cuda->shared_data.atom.reneigh_flag;
int flagall;
MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_MAX,world);
if (flagall && ago == MAX(every,delay)) ndanger++;
return flagall;
}
/* ---------------------------------------------------------------------- */
void NeighborCuda::build()
{
int i;
ago = 0;
ncalls++;
// store current atom positions and box size if needed
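// if the integrator decides when to reneighbor, the reference positions
// (xhold) are updated on the device; otherwise they are copied on the host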
if (dist_check) {
if (cuda->decide_by_integrator())
cuda->update_xhold(maxhold, &xhold[0][0]);
else {
if (cuda->finished_setup) cuda->cu_x_download();
double **x = atom->x;
int nlocal = atom->nlocal;
if (includegroup) nlocal = atom->nfirst;
if (nlocal > maxhold) {
maxhold = atom->nmax;
memory->destroy(xhold);
memory->create(xhold,maxhold,3,"neigh:xhold");
}
for (i = 0; i < nlocal; i++) {
xhold[i][0] = x[i][0];
xhold[i][1] = x[i][1];
xhold[i][2] = x[i][2];
}
if (boxcheck) {
if (triclinic == 0) {
boxlo_hold[0] = bboxlo[0];
boxlo_hold[1] = bboxlo[1];
boxlo_hold[2] = bboxlo[2];
boxhi_hold[0] = bboxhi[0];
boxhi_hold[1] = bboxhi[1];
boxhi_hold[2] = bboxhi[2];
} else {
domain->box_corners();
corners = domain->corners;
for (i = 0; i < 8; i++) {
corners_hold[i][0] = corners[i][0];
corners_hold[i][1] = corners[i][1];
corners_hold[i][2] = corners[i][2];
}
}
}
}
}
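// reconcile data residency before building: a non-CUDA build path needs
// current data on the host; a CUDA build before setup completes needs an upload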
if (not cudable && cuda->finished_setup && atom->avec->cudable)
cuda->downloadAll();
if (cudable && (not cuda->finished_setup)) {
cuda->checkResize();
cuda->uploadAll();
}
// if any lists store neighbors of ghosts:
// invoke grow() if nlocal+nghost exceeds previous list size
// else only invoke grow() if nlocal exceeds previous list size
// only done for lists with growflag set and which are perpetual
if (anyghostlist && atom->nlocal+atom->nghost > maxatom) {
maxatom = atom->nmax;
for (i = 0; i < nglist; i++) lists[glist[i]]->grow(maxatom);
} else if (atom->nlocal > maxatom) {
maxatom = atom->nmax;
for (i = 0; i < nglist; i++) lists[glist[i]]->grow(maxatom);
}
// extend atom bin list if necessary
if (style != NSQ && atom->nmax > maxbin) {
maxbin = atom->nmax;
memory->destroy(bins);
memory->create(bins,maxbin,"bins");
}
// check that neighbor list with special bond flags will not overflow
if (atom->nlocal+atom->nghost > NEIGHMASK)
error->one("Too many local+ghost atoms for neighbor list");
// invoke building of pair and molecular neighbor lists
// only for pairwise lists with buildflag set
for (i = 0; i < nblist; i++)
(this->*pair_build[blist[i]])(lists[blist[i]]);
if (atom->molecular) {
if (force->bond) (this->*bond_build)();
if (force->angle) (this->*angle_build)();
if (force->dihedral) (this->*dihedral_build)();
if (force->improper) (this->*improper_build)();
}
}

View File

@ -0,0 +1,39 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifndef LMP_NEIGHBOR_CUDA_H
#define LMP_NEIGHBOR_CUDA_H
#include "neighbor.h"
namespace LAMMPS_NS {
class NeighborCuda : public Neighbor {
public:
NeighborCuda(class LAMMPS *);
void init();
int check_distance();
void build();
private:
class Cuda *cuda;
void choose_build(int, class NeighRequest *);
typedef void (NeighborCuda::*PairPtr)(class NeighList *);
void full_nsq_cuda(class NeighList *);
void full_bin_cuda(class NeighList *);
};
}
#endif

src/USER-CUDA/verlet_cuda.cpp Normal file

File diff suppressed because it is too large Load Diff

63
src/USER-CUDA/verlet_cuda.h Normal file
View File

@ -0,0 +1,63 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifdef INTEGRATE_CLASS
IntegrateStyle(verlet/cuda,VerletCuda)
#else
#ifndef LMP_VERLET_CUDA_H
#define LMP_VERLET_CUDA_H
#include "verlet.h"
#include "modify_cuda.h"
namespace LAMMPS_NS {
class VerletCuda : public Verlet
{
public:
VerletCuda(class LAMMPS *, int, char **);
void setup();
void setup_minimal(int);
void run(int);
void test_atom(int atom, char* astring);   // debugging aid
int dotestatom;                            // debugging aid
protected:
class Cuda *cuda;
void force_clear();
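// accumulated timings for the major phases of a run (presumably updated in run())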
double time_pair;
double time_kspace;
double time_comm;
double time_modify;
double time_fulliterate;
ModifyCuda* modify_cuda;
};
}
#endif
#endif